From 809cecae0954f75a9aa4644c3c261f53ca9a15bc Mon Sep 17 00:00:00 2001 From: root Date: Thu, 5 Mar 2026 18:06:10 +0800 Subject: [PATCH] v1.0 --- __init__.py | 107 + __pycache__/__init__.cpython-312.pyc | Bin 0 -> 3434 bytes __pycache__/_aiter_ops.cpython-312.pyc | Bin 0 -> 36892 bytes __pycache__/_bc_linter.cpython-312.pyc | Bin 0 -> 1981 bytes __pycache__/_custom_ops.cpython-312.pyc | Bin 0 -> 148594 bytes __pycache__/_ipex_ops.cpython-312.pyc | Bin 0 -> 18566 bytes __pycache__/beam_search.cpython-312.pyc | Bin 0 -> 3723 bytes __pycache__/collect_env.cpython-312.pyc | Bin 0 -> 30083 bytes __pycache__/connections.cpython-312.pyc | Bin 0 -> 8813 bytes __pycache__/env_override.cpython-312.pyc | Bin 0 -> 12979 bytes __pycache__/envs.cpython-312.pyc | Bin 0 -> 71705 bytes __pycache__/forward_context.cpython-312.pyc | Bin 0 -> 13763 bytes __pycache__/logger.cpython-312.pyc | Bin 0 -> 9819 bytes __pycache__/logits_process.cpython-312.pyc | Bin 0 -> 4754 bytes __pycache__/logprobs.cpython-312.pyc | Bin 0 -> 9992 bytes __pycache__/outputs.cpython-312.pyc | Bin 0 -> 16434 bytes __pycache__/pooling_params.cpython-312.pyc | Bin 0 -> 8772 bytes __pycache__/sampling_params.cpython-312.pyc | Bin 0 -> 25874 bytes __pycache__/scalar_type.cpython-312.pyc | Bin 0 -> 14217 bytes __pycache__/scripts.cpython-312.pyc | Bin 0 -> 611 bytes __pycache__/sequence.cpython-312.pyc | Bin 0 -> 5252 bytes __pycache__/tasks.cpython-312.pyc | Bin 0 -> 503 bytes __pycache__/tracing.cpython-312.pyc | Bin 0 -> 6078 bytes __pycache__/version.cpython-312.pyc | Bin 0 -> 221 bytes _aiter_ops.py | 983 +++ _bc_linter.py | 54 + _custom_ops.py | 3512 ++++++++++ _ipex_ops.py | 457 ++ assets/__init__.py | 0 assets/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 156 bytes assets/__pycache__/audio.cpython-312.pyc | Bin 0 -> 2208 bytes assets/__pycache__/base.cpython-312.pyc | Bin 0 -> 1612 bytes assets/__pycache__/image.cpython-312.pyc | Bin 0 -> 2443 bytes assets/__pycache__/video.cpython-312.pyc | Bin 0 -> 6565 
bytes assets/audio.py | 43 + assets/base.py | 40 + assets/image.py | 59 + assets/video.py | 149 + attention/__init__.py | 18 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 467 bytes attention/__pycache__/layer.cpython-312.pyc | Bin 0 -> 40423 bytes .../__pycache__/selector.cpython-312.pyc | Bin 0 -> 6689 bytes attention/backends/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 168 bytes .../__pycache__/abstract.cpython-312.pyc | Bin 0 -> 16616 bytes .../__pycache__/registry.cpython-312.pyc | Bin 0 -> 9083 bytes .../__pycache__/utils.cpython-312.pyc | Bin 0 -> 1266 bytes attention/backends/abstract.py | 391 ++ attention/backends/registry.py | 195 + attention/backends/utils.py | 33 + attention/layer.py | 1051 +++ attention/layers/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 166 bytes .../chunked_local_attention.cpython-312.pyc | Bin 0 -> 4831 bytes .../cross_attention.cpython-312.pyc | Bin 0 -> 6697 bytes .../encoder_only_attention.cpython-312.pyc | Bin 0 -> 3740 bytes attention/layers/chunked_local_attention.py | 121 + attention/layers/cross_attention.py | 178 + attention/layers/encoder_only_attention.py | 103 + attention/ops/__init__.py | 0 .../ops/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 163 bytes ...unked_prefill_paged_decode.cpython-312.pyc | Bin 0 -> 13417 bytes .../ops/__pycache__/common.cpython-312.pyc | Bin 0 -> 15147 bytes .../ops/__pycache__/flashmla.cpython-312.pyc | Bin 0 -> 8343 bytes .../merge_attn_states.cpython-312.pyc | Bin 0 -> 1952 bytes .../__pycache__/paged_attn.cpython-312.pyc | Bin 0 -> 8261 bytes .../pallas_kv_cache_update.cpython-312.pyc | Bin 0 -> 4764 bytes .../prefix_prefill.cpython-312.pyc | Bin 0 -> 25870 bytes .../rocm_aiter_paged_attn.cpython-312.pyc | Bin 0 -> 4201 bytes .../triton_decode_attention.cpython-312.pyc | Bin 0 -> 18817 bytes .../triton_merge_attn_states.cpython-312.pyc | Bin 0 -> 3363 bytes ...on_reshape_and_cache_flash.cpython-312.pyc | Bin 0 -> 6317 bytes 
.../triton_unified_attention.cpython-312.pyc | Bin 0 -> 30039 bytes .../vit_attn_wrappers.cpython-312.pyc | Bin 0 -> 8158 bytes attention/ops/chunked_prefill_paged_decode.py | 401 ++ attention/ops/common.py | 414 ++ attention/ops/flashmla.py | 252 + attention/ops/merge_attn_states.py | 47 + attention/ops/paged_attn.py | 262 + attention/ops/pallas_kv_cache_update.py | 130 + attention/ops/prefix_prefill.py | 814 +++ attention/ops/rocm_aiter_paged_attn.py | 123 + attention/ops/triton_decode_attention.py | 712 ++ attention/ops/triton_merge_attn_states.py | 105 + .../ops/triton_reshape_and_cache_flash.py | 184 + attention/ops/triton_unified_attention.py | 941 +++ attention/ops/vit_attn_wrappers.py | 178 + attention/selector.py | 231 + attention/utils/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 165 bytes .../__pycache__/fa_utils.cpython-312.pyc | Bin 0 -> 3702 bytes .../kv_sharing_utils.cpython-312.pyc | Bin 0 -> 1285 bytes .../kv_transfer_utils.cpython-312.pyc | Bin 0 -> 2274 bytes attention/utils/fa_utils.py | 108 + attention/utils/kv_sharing_utils.py | 33 + attention/utils/kv_transfer_utils.py | 60 + beam_search.py | 88 + benchmarks/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 160 bytes .../__pycache__/datasets.cpython-312.pyc | Bin 0 -> 110697 bytes .../__pycache__/latency.cpython-312.pyc | Bin 0 -> 7622 bytes benchmarks/__pycache__/serve.cpython-312.pyc | Bin 0 -> 55361 bytes .../__pycache__/throughput.cpython-312.pyc | Bin 0 -> 30000 bytes benchmarks/datasets.py | 3222 +++++++++ benchmarks/latency.py | 172 + benchmarks/lib/__init__.py | 3 + .../lib/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 209 bytes .../endpoint_request_func.cpython-312.pyc | Bin 0 -> 27725 bytes .../__pycache__/ready_checker.cpython-312.pyc | Bin 0 -> 2862 bytes .../lib/__pycache__/utils.cpython-312.pyc | Bin 0 -> 3618 bytes benchmarks/lib/endpoint_request_func.py | 777 +++ benchmarks/lib/ready_checker.py | 72 + benchmarks/lib/utils.py | 79 + 
benchmarks/serve.py | 1531 ++++ benchmarks/sweep/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 166 bytes .../sweep/__pycache__/cli.cpython-312.pyc | Bin 0 -> 1681 bytes .../__pycache__/param_sweep.cpython-312.pyc | Bin 0 -> 5865 bytes .../sweep/__pycache__/plot.cpython-312.pyc | Bin 0 -> 23877 bytes .../sweep/__pycache__/serve.cpython-312.pyc | Bin 0 -> 13925 bytes .../__pycache__/serve_sla.cpython-312.pyc | Bin 0 -> 16061 bytes .../sweep/__pycache__/server.cpython-312.pyc | Bin 0 -> 5061 bytes .../__pycache__/sla_sweep.cpython-312.pyc | Bin 0 -> 7045 bytes .../sweep/__pycache__/utils.cpython-312.pyc | Bin 0 -> 535 bytes benchmarks/sweep/cli.py | 38 + benchmarks/sweep/param_sweep.py | 91 + benchmarks/sweep/plot.py | 580 ++ benchmarks/sweep/serve.py | 416 ++ benchmarks/sweep/serve_sla.py | 492 ++ benchmarks/sweep/server.py | 114 + benchmarks/sweep/sla_sweep.py | 132 + benchmarks/sweep/utils.py | 4 + benchmarks/throughput.py | 799 +++ collect_env.py | 857 +++ compilation/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 161 bytes .../activation_quant_fusion.cpython-312.pyc | Bin 0 -> 9973 bytes .../__pycache__/backends.cpython-312.pyc | Bin 0 -> 28531 bytes .../base_static_graph.cpython-312.pyc | Bin 0 -> 2518 bytes .../__pycache__/caching.cpython-312.pyc | Bin 0 -> 9492 bytes .../collective_fusion.cpython-312.pyc | Bin 0 -> 54484 bytes .../compiler_interface.cpython-312.pyc | Bin 0 -> 24254 bytes .../__pycache__/counter.cpython-312.pyc | Bin 0 -> 2076 bytes .../__pycache__/cuda_graph.cpython-312.pyc | Bin 0 -> 9053 bytes .../__pycache__/decorators.cpython-312.pyc | Bin 0 -> 24185 bytes .../fix_functionalization.cpython-312.pyc | Bin 0 -> 12149 bytes .../__pycache__/fusion.cpython-312.pyc | Bin 0 -> 17363 bytes .../__pycache__/fusion_attn.cpython-312.pyc | Bin 0 -> 17705 bytes .../__pycache__/fx_utils.cpython-312.pyc | Bin 0 -> 4507 bytes .../__pycache__/inductor_pass.cpython-312.pyc | Bin 0 -> 6564 bytes 
.../__pycache__/matcher_utils.cpython-312.pyc | Bin 0 -> 17044 bytes .../__pycache__/monitor.cpython-312.pyc | Bin 0 -> 2468 bytes .../noop_elimination.cpython-312.pyc | Bin 0 -> 6746 bytes .../partition_rules.cpython-312.pyc | Bin 0 -> 2943 bytes .../__pycache__/pass_manager.cpython-312.pyc | Bin 0 -> 6980 bytes .../piecewise_backend.cpython-312.pyc | Bin 0 -> 4737 bytes .../__pycache__/post_cleanup.cpython-312.pyc | Bin 0 -> 1210 bytes .../qk_norm_rope_fusion.cpython-312.pyc | Bin 0 -> 10897 bytes .../sequence_parallelism.cpython-312.pyc | Bin 0 -> 19511 bytes .../torch25_custom_graph_pass.cpython-312.pyc | Bin 0 -> 2170 bytes .../vllm_inductor_pass.cpython-312.pyc | Bin 0 -> 9703 bytes .../__pycache__/wrapper.cpython-312.pyc | Bin 0 -> 12449 bytes compilation/activation_quant_fusion.py | 209 + compilation/backends.py | 759 ++ compilation/base_static_graph.py | 57 + compilation/caching.py | 178 + compilation/collective_fusion.py | 1234 ++++ compilation/compiler_interface.py | 639 ++ compilation/counter.py | 48 + compilation/cuda_graph.py | 216 + compilation/decorators.py | 571 ++ compilation/fix_functionalization.py | 253 + compilation/fusion.py | 374 + compilation/fusion_attn.py | 359 + compilation/fx_utils.py | 91 + compilation/inductor_pass.py | 133 + compilation/matcher_utils.py | 317 + compilation/monitor.py | 62 + compilation/noop_elimination.py | 134 + compilation/partition_rules.py | 72 + compilation/pass_manager.py | 135 + compilation/piecewise_backend.py | 121 + compilation/post_cleanup.py | 21 + compilation/qk_norm_rope_fusion.py | 238 + compilation/sequence_parallelism.py | 363 + compilation/torch25_custom_graph_pass.py | 44 + compilation/vllm_inductor_pass.py | 173 + compilation/wrapper.py | 238 + config/__init__.py | 102 + config/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 1981 bytes config/__pycache__/cache.cpython-312.pyc | Bin 0 -> 6069 bytes .../__pycache__/compilation.cpython-312.pyc | Bin 0 -> 33761 bytes 
config/__pycache__/device.cpython-312.pyc | Bin 0 -> 2978 bytes .../__pycache__/ec_transfer.cpython-312.pyc | Bin 0 -> 4526 bytes config/__pycache__/kv_events.cpython-312.pyc | Bin 0 -> 1417 bytes .../__pycache__/kv_transfer.cpython-312.pyc | Bin 0 -> 4599 bytes config/__pycache__/load.cpython-312.pyc | Bin 0 -> 3396 bytes config/__pycache__/lora.cpython-312.pyc | Bin 0 -> 4632 bytes config/__pycache__/model.cpython-312.pyc | Bin 0 -> 74258 bytes config/__pycache__/multimodal.cpython-312.pyc | Bin 0 -> 8619 bytes .../__pycache__/observability.cpython-312.pyc | Bin 0 -> 5399 bytes config/__pycache__/parallel.cpython-312.pyc | Bin 0 -> 21952 bytes config/__pycache__/pooler.cpython-312.pyc | Bin 0 -> 3113 bytes config/__pycache__/scheduler.cpython-312.pyc | Bin 0 -> 8976 bytes .../__pycache__/speculative.cpython-312.pyc | Bin 0 -> 22183 bytes .../speech_to_text.cpython-312.pyc | Bin 0 -> 1110 bytes .../structured_outputs.cpython-312.pyc | Bin 0 -> 3591 bytes config/__pycache__/utils.cpython-312.pyc | Bin 0 -> 8245 bytes config/__pycache__/vllm.cpython-312.pyc | Bin 0 -> 48524 bytes config/cache.py | 207 + config/compilation.py | 978 +++ config/device.py | 75 + config/ec_transfer.py | 110 + config/kv_events.py | 56 + config/kv_transfer.py | 114 + config/load.py | 124 + config/lora.py | 112 + config/model.py | 2172 ++++++ config/multimodal.py | 248 + config/observability.py | 123 + config/parallel.py | 655 ++ config/pooler.py | 122 + config/scheduler.py | 298 + config/speculative.py | 654 ++ config/speech_to_text.py | 38 + config/structured_outputs.py | 92 + config/utils.py | 178 + config/vllm.py | 1166 ++++ connections.py | 189 + device_allocator/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 166 bytes .../__pycache__/cumem.cpython-312.pyc | Bin 0 -> 13260 bytes device_allocator/cumem.py | 327 + distributed/__init__.py | 6 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 250 bytes .../communication_op.cpython-312.pyc | Bin 0 -> 2312 bytes 
.../__pycache__/kv_events.cpython-312.pyc | Bin 0 -> 16650 bytes .../parallel_state.cpython-312.pyc | Bin 0 -> 71265 bytes .../tpu_distributed_utils.cpython-312.pyc | Bin 0 -> 9285 bytes distributed/__pycache__/utils.cpython-312.pyc | Bin 0 -> 23220 bytes distributed/communication_op.py | 43 + distributed/device_communicators/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 182 bytes .../__pycache__/all2all.cpython-312.pyc | Bin 0 -> 20137 bytes .../all_reduce_utils.cpython-312.pyc | Bin 0 -> 12373 bytes .../base_device_communicator.cpython-312.pyc | Bin 0 -> 14463 bytes .../cpu_communicator.cpython-312.pyc | Bin 0 -> 10909 bytes .../cuda_communicator.cpython-312.pyc | Bin 0 -> 14466 bytes .../__pycache__/cuda_wrapper.cpython-312.pyc | Bin 0 -> 9500 bytes .../custom_all_reduce.cpython-312.pyc | Bin 0 -> 14661 bytes .../__pycache__/mnnvl_compat.cpython-312.pyc | Bin 0 -> 1783 bytes .../__pycache__/pynccl.cpython-312.pyc | Bin 0 -> 17305 bytes .../pynccl_allocator.cpython-312.pyc | Bin 0 -> 7588 bytes .../pynccl_wrapper.cpython-312.pyc | Bin 0 -> 15911 bytes .../quick_all_reduce.cpython-312.pyc | Bin 0 -> 13622 bytes .../ray_communicator.cpython-312.pyc | Bin 0 -> 11299 bytes .../__pycache__/shm_broadcast.cpython-312.pyc | Bin 0 -> 31362 bytes .../shm_object_storage.cpython-312.pyc | Bin 0 -> 29775 bytes .../__pycache__/symm_mem.cpython-312.pyc | Bin 0 -> 6798 bytes .../tpu_communicator.cpython-312.pyc | Bin 0 -> 4209 bytes .../xpu_communicator.cpython-312.pyc | Bin 0 -> 4689 bytes distributed/device_communicators/all2all.py | 490 ++ .../device_communicators/all_reduce_utils.py | 344 + .../base_device_communicator.py | 311 + .../device_communicators/cpu_communicator.py | 209 + .../device_communicators/cuda_communicator.py | 333 + .../device_communicators/cuda_wrapper.py | 216 + .../device_communicators/custom_all_reduce.py | 326 + .../device_communicators/mnnvl_compat.py | 27 + distributed/device_communicators/pynccl.py | 386 ++ 
.../device_communicators/pynccl_allocator.py | 191 + .../device_communicators/pynccl_wrapper.py | 564 ++ .../device_communicators/quick_all_reduce.py | 290 + .../device_communicators/ray_communicator.py | 259 + .../device_communicators/shm_broadcast.py | 733 ++ .../shm_object_storage.py | 660 ++ distributed/device_communicators/symm_mem.py | 156 + .../device_communicators/tpu_communicator.py | 107 + .../device_communicators/xpu_communicator.py | 95 + distributed/ec_transfer/__init__.py | 14 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 377 bytes .../ec_transfer_state.cpython-312.pyc | Bin 0 -> 1493 bytes .../ec_transfer/ec_connector/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 186 bytes .../__pycache__/base.cpython-312.pyc | Bin 0 -> 9812 bytes .../__pycache__/factory.cpython-312.pyc | Bin 0 -> 3522 bytes .../shared_storage_connector.cpython-312.pyc | Bin 0 -> 9582 bytes distributed/ec_transfer/ec_connector/base.py | 247 + .../ec_transfer/ec_connector/factory.py | 88 + .../ec_connector/shared_storage_connector.py | 201 + distributed/ec_transfer/ec_transfer_state.py | 42 + distributed/eplb/__init__.py | 8 + .../eplb/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 291 bytes .../__pycache__/eplb_state.cpython-312.pyc | Bin 0 -> 27954 bytes .../rebalance_algo.cpython-312.pyc | Bin 0 -> 11139 bytes .../rebalance_execute.cpython-312.pyc | Bin 0 -> 13283 bytes distributed/eplb/eplb_state.py | 837 +++ distributed/eplb/rebalance_algo.py | 260 + distributed/eplb/rebalance_execute.py | 431 ++ distributed/kv_events.py | 371 + distributed/kv_transfer/README.md | 29 + distributed/kv_transfer/__init__.py | 20 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 506 bytes .../kv_transfer_state.cpython-312.pyc | Bin 0 -> 2680 bytes .../kv_transfer/disagg_prefill_workflow.jpg | Bin 0 -> 142656 bytes .../kv_transfer/kv_connector/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 186 bytes .../__pycache__/base.cpython-312.pyc | Bin 0 -> 
433 bytes .../__pycache__/factory.cpython-312.pyc | Bin 0 -> 7051 bytes .../__pycache__/utils.cpython-312.pyc | Bin 0 -> 11293 bytes distributed/kv_transfer/kv_connector/base.py | 10 + .../kv_transfer/kv_connector/factory.py | 192 + distributed/kv_transfer/kv_connector/utils.py | 268 + .../kv_transfer/kv_connector/v1/__init__.py | 19 + .../v1/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 528 bytes .../v1/__pycache__/base.cpython-312.pyc | Bin 0 -> 22740 bytes .../decode_bench_connector.cpython-312.pyc | Bin 0 -> 16090 bytes .../lmcache_connector.cpython-312.pyc | Bin 0 -> 9244 bytes .../lmcache_mp_connector.cpython-312.pyc | Bin 0 -> 36054 bytes .../v1/__pycache__/metrics.cpython-312.pyc | Bin 0 -> 8692 bytes .../multi_connector.cpython-312.pyc | Bin 0 -> 20049 bytes .../nixl_connector.cpython-312.pyc | Bin 0 -> 98101 bytes .../offloading_connector.cpython-312.pyc | Bin 0 -> 24140 bytes .../shared_storage_connector.cpython-312.pyc | Bin 0 -> 18463 bytes .../kv_transfer/kv_connector/v1/base.py | 546 ++ .../kv_connector/v1/decode_bench_connector.py | 419 ++ .../kv_connector/v1/lmcache_connector.py | 216 + .../v1/lmcache_integration/__init__.py | 18 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 448 bytes .../multi_process_adapter.cpython-312.pyc | Bin 0 -> 16467 bytes .../__pycache__/utils.cpython-312.pyc | Bin 0 -> 8840 bytes .../vllm_v1_adapter.cpython-312.pyc | Bin 0 -> 47697 bytes .../multi_process_adapter.py | 379 + .../v1/lmcache_integration/utils.py | 221 + .../v1/lmcache_integration/vllm_v1_adapter.py | 1411 ++++ .../kv_connector/v1/lmcache_mp_connector.py | 867 +++ .../kv_transfer/kv_connector/v1/metrics.py | 189 + .../kv_connector/v1/multi_connector.py | 454 ++ .../kv_connector/v1/nixl_connector.py | 2440 +++++++ .../kv_connector/v1/offloading_connector.py | 504 ++ .../kv_connector/v1/p2p/__init__.py | 0 .../p2p/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 193 bytes .../p2p_nccl_connector.cpython-312.pyc | Bin 0 -> 20406 bytes 
.../p2p_nccl_engine.cpython-312.pyc | Bin 0 -> 29262 bytes .../tensor_memory_pool.cpython-312.pyc | Bin 0 -> 11585 bytes .../kv_connector/v1/p2p/p2p_nccl_connector.py | 531 ++ .../kv_connector/v1/p2p/p2p_nccl_engine.py | 632 ++ .../kv_connector/v1/p2p/tensor_memory_pool.py | 273 + .../v1/shared_storage_connector.py | 450 ++ .../kv_transfer/kv_lookup_buffer/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 190 bytes .../__pycache__/base.cpython-312.pyc | Bin 0 -> 7381 bytes .../mooncake_store.cpython-312.pyc | Bin 0 -> 7413 bytes .../__pycache__/simple_buffer.cpython-312.pyc | Bin 0 -> 10955 bytes .../kv_transfer/kv_lookup_buffer/base.py | 179 + .../kv_lookup_buffer/mooncake_store.py | 164 + .../kv_lookup_buffer/simple_buffer.py | 242 + distributed/kv_transfer/kv_pipe/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 181 bytes .../kv_pipe/__pycache__/base.cpython-312.pyc | Bin 0 -> 2697 bytes .../__pycache__/mooncake_pipe.cpython-312.pyc | Bin 0 -> 17391 bytes .../__pycache__/pynccl_pipe.cpython-312.pyc | Bin 0 -> 13145 bytes distributed/kv_transfer/kv_pipe/base.py | 66 + .../kv_transfer/kv_pipe/mooncake_pipe.py | 295 + .../kv_transfer/kv_pipe/pynccl_pipe.py | 285 + distributed/kv_transfer/kv_transfer_state.py | 78 + distributed/parallel_state.py | 1794 +++++ distributed/tpu_distributed_utils.py | 188 + distributed/utils.py | 543 ++ engine/__init__.py | 0 engine/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 156 bytes engine/__pycache__/arg_utils.cpython-312.pyc | Bin 0 -> 83319 bytes .../async_llm_engine.cpython-312.pyc | Bin 0 -> 251 bytes engine/__pycache__/llm_engine.cpython-312.pyc | Bin 0 -> 244 bytes engine/__pycache__/protocol.cpython-312.pyc | Bin 0 -> 7586 bytes engine/arg_utils.py | 2144 ++++++ engine/async_llm_engine.py | 6 + engine/llm_engine.py | 6 + engine/protocol.py | 170 + entrypoints/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 161 bytes .../__pycache__/api_server.cpython-312.pyc | Bin 0 
-> 8173 bytes .../__pycache__/chat_utils.cpython-312.pyc | Bin 0 -> 63079 bytes .../__pycache__/constants.cpython-312.pyc | Bin 0 -> 320 bytes .../__pycache__/context.cpython-312.pyc | Bin 0 -> 27255 bytes .../__pycache__/dynamic_lora.cpython-312.pyc | Bin 0 -> 2943 bytes .../__pycache__/harmony_utils.cpython-312.pyc | Bin 0 -> 19789 bytes .../__pycache__/launcher.cpython-312.pyc | Bin 0 -> 7957 bytes entrypoints/__pycache__/llm.cpython-312.pyc | Bin 0 -> 67802 bytes .../__pycache__/logger.cpython-312.pyc | Bin 0 -> 2680 bytes .../__pycache__/renderer.cpython-312.pyc | Bin 0 -> 15816 bytes .../responses_utils.cpython-312.pyc | Bin 0 -> 2616 bytes .../__pycache__/score_utils.cpython-312.pyc | Bin 0 -> 8667 bytes entrypoints/__pycache__/ssl.cpython-312.pyc | Bin 0 -> 4121 bytes entrypoints/__pycache__/tool.cpython-312.pyc | Bin 0 -> 6851 bytes .../__pycache__/tool_server.cpython-312.pyc | Bin 0 -> 10188 bytes entrypoints/__pycache__/utils.cpython-312.pyc | Bin 0 -> 12679 bytes entrypoints/anthropic/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 171 bytes .../__pycache__/protocol.cpython-312.pyc | Bin 0 -> 7151 bytes .../serving_messages.cpython-312.pyc | Bin 0 -> 15848 bytes entrypoints/anthropic/protocol.py | 162 + entrypoints/anthropic/serving_messages.py | 460 ++ entrypoints/api_server.py | 184 + entrypoints/chat_utils.py | 1690 +++++ entrypoints/cli/__init__.py | 13 + .../cli/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 638 bytes .../__pycache__/collect_env.cpython-312.pyc | Bin 0 -> 1690 bytes .../cli/__pycache__/main.cpython-312.pyc | Bin 0 -> 3560 bytes .../cli/__pycache__/openai.cpython-312.pyc | Bin 0 -> 9687 bytes .../cli/__pycache__/run_batch.cpython-312.pyc | Bin 0 -> 3012 bytes .../cli/__pycache__/serve.cpython-312.pyc | Bin 0 -> 9913 bytes .../cli/__pycache__/types.cpython-312.pyc | Bin 0 -> 1431 bytes entrypoints/cli/benchmark/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 175 bytes 
.../__pycache__/base.cpython-312.pyc | Bin 0 -> 1239 bytes .../__pycache__/latency.cpython-312.pyc | Bin 0 -> 1264 bytes .../__pycache__/main.cpython-312.pyc | Bin 0 -> 2913 bytes .../__pycache__/serve.cpython-312.pyc | Bin 0 -> 1244 bytes .../__pycache__/sweep.cpython-312.pyc | Bin 0 -> 1234 bytes .../__pycache__/throughput.cpython-312.pyc | Bin 0 -> 1272 bytes entrypoints/cli/benchmark/base.py | 25 + entrypoints/cli/benchmark/latency.py | 21 + entrypoints/cli/benchmark/main.py | 56 + entrypoints/cli/benchmark/serve.py | 21 + entrypoints/cli/benchmark/sweep.py | 21 + entrypoints/cli/benchmark/throughput.py | 21 + entrypoints/cli/collect_env.py | 38 + entrypoints/cli/main.py | 79 + entrypoints/cli/openai.py | 256 + entrypoints/cli/run_batch.py | 68 + entrypoints/cli/serve.py | 249 + entrypoints/cli/types.py | 29 + entrypoints/constants.py | 10 + entrypoints/context.py | 572 ++ entrypoints/dynamic_lora.py | 57 + entrypoints/harmony_utils.py | 535 ++ entrypoints/launcher.py | 175 + entrypoints/llm.py | 1768 +++++ entrypoints/logger.py | 84 + entrypoints/openai/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 168 bytes .../__pycache__/api_server.cpython-312.pyc | Bin 0 -> 90943 bytes .../__pycache__/cli_args.cpython-312.pyc | Bin 0 -> 10785 bytes .../__pycache__/orca_metrics.cpython-312.pyc | Bin 0 -> 4734 bytes .../__pycache__/protocol.cpython-312.pyc | Bin 0 -> 108493 bytes .../__pycache__/run_batch.cpython-312.pyc | Bin 0 -> 21409 bytes .../__pycache__/serving_chat.cpython-312.pyc | Bin 0 -> 47992 bytes .../serving_classification.cpython-312.pyc | Bin 0 -> 9202 bytes .../serving_completion.cpython-312.pyc | Bin 0 -> 22040 bytes .../serving_embedding.cpython-312.pyc | Bin 0 -> 24167 bytes .../serving_engine.cpython-312.pyc | Bin 0 -> 49057 bytes .../serving_models.cpython-312.pyc | Bin 0 -> 13786 bytes .../serving_pooling.cpython-312.pyc | Bin 0 -> 13003 bytes .../serving_responses.cpython-312.pyc | Bin 0 -> 61788 bytes 
.../__pycache__/serving_score.cpython-312.pyc | Bin 0 -> 17783 bytes .../serving_tokenization.cpython-312.pyc | Bin 0 -> 9533 bytes .../serving_tokens.cpython-312.pyc | Bin 0 -> 10031 bytes .../serving_transcription.cpython-312.pyc | Bin 0 -> 5567 bytes .../speech_to_text.cpython-312.pyc | Bin 0 -> 15387 bytes entrypoints/openai/api_server.py | 2096 ++++++ entrypoints/openai/cli_args.py | 302 + entrypoints/openai/orca_metrics.py | 120 + entrypoints/openai/protocol.py | 3299 +++++++++ entrypoints/openai/run_batch.py | 547 ++ entrypoints/openai/serving_chat.py | 1772 +++++ entrypoints/openai/serving_classification.py | 235 + entrypoints/openai/serving_completion.py | 715 ++ entrypoints/openai/serving_embedding.py | 695 ++ entrypoints/openai/serving_engine.py | 1433 ++++ entrypoints/openai/serving_models.py | 304 + entrypoints/openai/serving_pooling.py | 346 + entrypoints/openai/serving_responses.py | 2021 ++++++ entrypoints/openai/serving_score.py | 503 ++ entrypoints/openai/serving_tokenization.py | 203 + entrypoints/openai/serving_tokens.py | 269 + entrypoints/openai/serving_transcription.py | 148 + entrypoints/openai/speech_to_text.py | 405 ++ entrypoints/openai/tool_parsers/__init__.py | 142 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 2762 bytes .../abstract_tool_parser.cpython-312.pyc | Bin 0 -> 11487 bytes .../deepseekv31_tool_parser.cpython-312.pyc | Bin 0 -> 12457 bytes .../deepseekv3_tool_parser.cpython-312.pyc | Bin 0 -> 12551 bytes .../ernie45_tool_parser.cpython-312.pyc | Bin 0 -> 9347 bytes .../glm4_moe_tool_parser.cpython-312.pyc | Bin 0 -> 9266 bytes ...granite_20b_fc_tool_parser.cpython-312.pyc | Bin 0 -> 9334 bytes .../granite_tool_parser.cpython-312.pyc | Bin 0 -> 8520 bytes .../hermes_tool_parser.cpython-312.pyc | Bin 0 -> 15493 bytes .../hunyuan_a13b_tool_parser.cpython-312.pyc | Bin 0 -> 15019 bytes .../internlm2_tool_parser.cpython-312.pyc | Bin 0 -> 8009 bytes .../jamba_tool_parser.cpython-312.pyc | Bin 0 -> 10133 bytes 
.../kimi_k2_tool_parser.cpython-312.pyc | Bin 0 -> 18882 bytes ...lama4_pythonic_tool_parser.cpython-312.pyc | Bin 0 -> 13678 bytes .../llama_tool_parser.cpython-312.pyc | Bin 0 -> 9198 bytes .../longcat_tool_parser.cpython-312.pyc | Bin 0 -> 2024 bytes .../minimax_m2_tool_parser.cpython-312.pyc | Bin 0 -> 19625 bytes .../minimax_tool_parser.cpython-312.pyc | Bin 0 -> 30942 bytes .../mistral_tool_parser.cpython-312.pyc | Bin 0 -> 12635 bytes .../olmo3_tool_parser.cpython-312.pyc | Bin 0 -> 14943 bytes .../openai_tool_parser.cpython-312.pyc | Bin 0 -> 3796 bytes .../phi4mini_tool_parser.cpython-312.pyc | Bin 0 -> 4282 bytes .../pythonic_tool_parser.cpython-312.pyc | Bin 0 -> 13216 bytes .../qwen3coder_tool_parser.cpython-312.pyc | Bin 0 -> 23599 bytes .../qwen3xml_tool_parser.cpython-312.pyc | Bin 0 -> 40883 bytes .../seed_oss_tool_parser.cpython-312.pyc | Bin 0 -> 23616 bytes .../step3_tool_parser.cpython-312.pyc | Bin 0 -> 11565 bytes .../__pycache__/utils.cpython-312.pyc | Bin 0 -> 9136 bytes .../xlam_tool_parser.cpython-312.pyc | Bin 0 -> 15705 bytes .../tool_parsers/abstract_tool_parser.py | 273 + .../tool_parsers/deepseekv31_tool_parser.py | 390 ++ .../tool_parsers/deepseekv3_tool_parser.py | 390 ++ .../tool_parsers/ernie45_tool_parser.py | 210 + .../tool_parsers/glm4_moe_tool_parser.py | 200 + .../granite_20b_fc_tool_parser.py | 273 + .../tool_parsers/granite_tool_parser.py | 253 + .../openai/tool_parsers/hermes_tool_parser.py | 494 ++ .../tool_parsers/hunyuan_a13b_tool_parser.py | 420 ++ .../tool_parsers/internlm2_tool_parser.py | 227 + .../openai/tool_parsers/jamba_tool_parser.py | 323 + .../tool_parsers/kimi_k2_tool_parser.py | 590 ++ .../llama4_pythonic_tool_parser.py | 341 + .../openai/tool_parsers/llama_tool_parser.py | 290 + .../tool_parsers/longcat_tool_parser.py | 37 + .../tool_parsers/minimax_m2_tool_parser.py | 643 ++ .../tool_parsers/minimax_tool_parser.py | 849 +++ .../tool_parsers/mistral_tool_parser.py | 390 ++ 
.../openai/tool_parsers/olmo3_tool_parser.py | 366 + .../openai/tool_parsers/openai_tool_parser.py | 97 + .../tool_parsers/phi4mini_tool_parser.py | 120 + .../tool_parsers/pythonic_tool_parser.py | 332 + .../tool_parsers/qwen3coder_tool_parser.py | 781 +++ .../tool_parsers/qwen3xml_tool_parser.py | 1316 ++++ .../tool_parsers/seed_oss_tool_parser.py | 744 ++ .../openai/tool_parsers/step3_tool_parser.py | 303 + entrypoints/openai/tool_parsers/utils.py | 229 + .../openai/tool_parsers/xlam_tool_parser.py | 556 ++ entrypoints/renderer.py | 409 ++ entrypoints/responses_utils.py | 77 + entrypoints/sagemaker/__init__.py | 4 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 228 bytes .../__pycache__/routes.cpython-312.pyc | Bin 0 -> 4235 bytes entrypoints/sagemaker/routes.py | 72 + entrypoints/score_utils.py | 242 + entrypoints/ssl.py | 78 + entrypoints/tool.py | 143 + entrypoints/tool_server.py | 209 + entrypoints/utils.py | 319 + env_override.py | 378 + envs.py | 1729 +++++ forward_context.py | 356 + inputs/__init__.py | 44 + inputs/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 794 bytes inputs/__pycache__/data.cpython-312.pyc | Bin 0 -> 8951 bytes inputs/__pycache__/parse.cpython-312.pyc | Bin 0 -> 5339 bytes inputs/__pycache__/preprocess.cpython-312.pyc | Bin 0 -> 23443 bytes inputs/data.py | 359 + inputs/parse.py | 137 + inputs/preprocess.py | 727 ++ logger.py | 267 + logging_utils/__init__.py | 10 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 339 bytes .../__pycache__/dump_input.cpython-312.pyc | Bin 0 -> 4113 bytes .../__pycache__/formatter.cpython-312.pyc | Bin 0 -> 3900 bytes .../__pycache__/log_time.cpython-312.pyc | Bin 0 -> 1376 bytes logging_utils/dump_input.py | 83 + logging_utils/formatter.py | 77 + logging_utils/log_time.py | 34 + logits_process.py | 121 + logprobs.py | 208 + lora/__init__.py | 0 lora/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 154 bytes lora/__pycache__/lora_weights.cpython-312.pyc | Bin 0 -> 7808 bytes 
lora/__pycache__/models.cpython-312.pyc | Bin 0 -> 39871 bytes lora/__pycache__/peft_helper.cpython-312.pyc | Bin 0 -> 6076 bytes lora/__pycache__/request.cpython-312.pyc | Bin 0 -> 4310 bytes lora/__pycache__/resolver.cpython-312.pyc | Bin 0 -> 3875 bytes lora/__pycache__/utils.cpython-312.pyc | Bin 0 -> 10927 bytes .../worker_manager.cpython-312.pyc | Bin 0 -> 12899 bytes lora/layers/__init__.py | 41 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 1279 bytes lora/layers/__pycache__/base.cpython-312.pyc | Bin 0 -> 2976 bytes .../__pycache__/base_linear.cpython-312.pyc | Bin 0 -> 7931 bytes .../column_parallel_linear.cpython-312.pyc | Bin 0 -> 25344 bytes .../__pycache__/fused_moe.cpython-312.pyc | Bin 0 -> 19112 bytes .../logits_processor.cpython-312.pyc | Bin 0 -> 10762 bytes .../replicated_linear.cpython-312.pyc | Bin 0 -> 3314 bytes .../row_parallel_linear.cpython-312.pyc | Bin 0 -> 7678 bytes lora/layers/__pycache__/utils.cpython-312.pyc | Bin 0 -> 2875 bytes .../vocal_parallel_embedding.cpython-312.pyc | Bin 0 -> 8478 bytes lora/layers/base.py | 67 + lora/layers/base_linear.py | 164 + lora/layers/column_parallel_linear.py | 578 ++ lora/layers/fused_moe.py | 472 ++ lora/layers/logits_processor.py | 252 + lora/layers/replicated_linear.py | 70 + lora/layers/row_parallel_linear.py | 181 + lora/layers/utils.py | 65 + lora/layers/vocal_parallel_embedding.py | 166 + lora/lora_weights.py | 198 + lora/models.py | 890 +++ lora/ops/__init__.py | 0 lora/ops/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 158 bytes lora/ops/ipex_ops/__init__.py | 6 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 315 bytes .../__pycache__/lora_ops.cpython-312.pyc | Bin 0 -> 2053 bytes lora/ops/ipex_ops/lora_ops.py | 57 + lora/ops/torch_ops/__init__.py | 20 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 392 bytes .../__pycache__/lora_ops.cpython-312.pyc | Bin 0 -> 5264 bytes lora/ops/torch_ops/lora_ops.py | 128 + lora/ops/triton_ops/README_TUNING.md | 60 + 
lora/ops/triton_ops/__init__.py | 21 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 620 bytes .../fused_moe_lora_op.cpython-312.pyc | Bin 0 -> 19306 bytes .../__pycache__/kernel_utils.cpython-312.pyc | Bin 0 -> 10301 bytes .../lora_expand_op.cpython-312.pyc | Bin 0 -> 10691 bytes .../lora_kernel_metadata.cpython-312.pyc | Bin 0 -> 5305 bytes .../lora_shrink_op.cpython-312.pyc | Bin 0 -> 8865 bytes .../__pycache__/utils.cpython-312.pyc | Bin 0 -> 13075 bytes lora/ops/triton_ops/fused_moe_lora_op.py | 640 ++ lora/ops/triton_ops/kernel_utils.py | 364 + lora/ops/triton_ops/lora_expand_op.py | 336 + lora/ops/triton_ops/lora_kernel_metadata.py | 154 + lora/ops/triton_ops/lora_shrink_op.py | 290 + lora/ops/triton_ops/utils.py | 362 + lora/ops/xla_ops/__init__.py | 6 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 313 bytes .../__pycache__/lora_ops.cpython-312.pyc | Bin 0 -> 5662 bytes lora/ops/xla_ops/lora_ops.py | 141 + lora/peft_helper.py | 128 + lora/punica_wrapper/__init__.py | 10 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 379 bytes .../__pycache__/punica_base.cpython-312.pyc | Bin 0 -> 18772 bytes .../__pycache__/punica_cpu.cpython-312.pyc | Bin 0 -> 13293 bytes .../__pycache__/punica_gpu.cpython-312.pyc | Bin 0 -> 14975 bytes .../punica_selector.cpython-312.pyc | Bin 0 -> 1076 bytes .../__pycache__/punica_tpu.cpython-312.pyc | Bin 0 -> 16045 bytes .../__pycache__/punica_xpu.cpython-312.pyc | Bin 0 -> 11244 bytes .../__pycache__/utils.cpython-312.pyc | Bin 0 -> 5789 bytes lora/punica_wrapper/punica_base.py | 492 ++ lora/punica_wrapper/punica_cpu.py | 351 + lora/punica_wrapper/punica_gpu.py | 422 ++ lora/punica_wrapper/punica_selector.py | 21 + lora/punica_wrapper/punica_tpu.py | 359 + lora/punica_wrapper/punica_xpu.py | 279 + lora/punica_wrapper/utils.py | 150 + lora/request.py | 100 + lora/resolver.py | 88 + lora/utils.py | 293 + lora/worker_manager.py | 279 + model_executor/__init__.py | 11 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 
-> 378 bytes .../__pycache__/custom_op.cpython-312.pyc | Bin 0 -> 7872 bytes .../__pycache__/parameter.cpython-312.pyc | Bin 0 -> 28677 bytes .../__pycache__/utils.cpython-312.pyc | Bin 0 -> 3798 bytes model_executor/custom_op.py | 194 + model_executor/layers/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 171 bytes .../__pycache__/activation.cpython-312.pyc | Bin 0 -> 33991 bytes .../attention_layer_base.cpython-312.pyc | Bin 0 -> 1564 bytes .../batch_invariant.cpython-312.pyc | Bin 0 -> 31725 bytes .../layers/__pycache__/conv.cpython-312.pyc | Bin 0 -> 10853 bytes .../layers/__pycache__/kda.cpython-312.pyc | Bin 0 -> 16543 bytes .../__pycache__/layernorm.cpython-312.pyc | Bin 0 -> 24933 bytes .../lightning_attn.cpython-312.pyc | Bin 0 -> 23160 bytes .../layers/__pycache__/linear.cpython-312.pyc | Bin 0 -> 57622 bytes .../logits_processor.cpython-312.pyc | Bin 0 -> 4444 bytes .../layers/__pycache__/mla.cpython-312.pyc | Bin 0 -> 8685 bytes .../layers/__pycache__/pooler.cpython-312.pyc | Bin 0 -> 38340 bytes .../__pycache__/resampler.cpython-312.pyc | Bin 0 -> 11454 bytes .../layers/__pycache__/utils.cpython-312.pyc | Bin 0 -> 12499 bytes .../vocab_parallel_embedding.cpython-312.pyc | Bin 0 -> 24473 bytes model_executor/layers/activation.py | 577 ++ model_executor/layers/attention_layer_base.py | 35 + model_executor/layers/batch_invariant.py | 854 +++ model_executor/layers/conv.py | 236 + model_executor/layers/fla/__init__.py | 8 + .../fla/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 175 bytes model_executor/layers/fla/ops/__init__.py | 17 + .../ops/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 403 bytes .../fla/ops/__pycache__/chunk.cpython-312.pyc | Bin 0 -> 9841 bytes .../__pycache__/chunk_delta_h.cpython-312.pyc | Bin 0 -> 16360 bytes .../ops/__pycache__/chunk_o.cpython-312.pyc | Bin 0 -> 8069 bytes .../chunk_scaled_dot_kkt.cpython-312.pyc | Bin 0 -> 6716 bytes .../ops/__pycache__/cumsum.cpython-312.pyc | Bin 0 -> 11751 bytes 
.../fused_recurrent.cpython-312.pyc | Bin 0 -> 16544 bytes .../fla/ops/__pycache__/index.cpython-312.pyc | Bin 0 -> 1941 bytes .../fla/ops/__pycache__/kda.cpython-312.pyc | Bin 0 -> 53731 bytes .../ops/__pycache__/l2norm.cpython-312.pyc | Bin 0 -> 6366 bytes .../layernorm_guard.cpython-312.pyc | Bin 0 -> 15289 bytes .../fla/ops/__pycache__/op.cpython-312.pyc | Bin 0 -> 1656 bytes .../__pycache__/solve_tril.cpython-312.pyc | Bin 0 -> 27843 bytes .../fla/ops/__pycache__/utils.cpython-312.pyc | Bin 0 -> 9355 bytes .../ops/__pycache__/wy_fast.cpython-312.pyc | Bin 0 -> 6858 bytes model_executor/layers/fla/ops/chunk.py | 240 + .../layers/fla/ops/chunk_delta_h.py | 344 + model_executor/layers/fla/ops/chunk_o.py | 183 + .../layers/fla/ops/chunk_scaled_dot_kkt.py | 154 + model_executor/layers/fla/ops/cumsum.py | 280 + .../layers/fla/ops/fused_recurrent.py | 390 ++ model_executor/layers/fla/ops/index.py | 41 + model_executor/layers/fla/ops/kda.py | 1351 ++++ model_executor/layers/fla/ops/l2norm.py | 146 + .../layers/fla/ops/layernorm_guard.py | 396 ++ model_executor/layers/fla/ops/op.py | 60 + model_executor/layers/fla/ops/solve_tril.py | 556 ++ model_executor/layers/fla/ops/utils.py | 194 + model_executor/layers/fla/ops/wy_fast.py | 158 + model_executor/layers/fused_moe/__init__.py | 106 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 3100 bytes .../__pycache__/all2all_utils.cpython-312.pyc | Bin 0 -> 5248 bytes .../batched_deep_gemm_moe.cpython-312.pyc | Bin 0 -> 17338 bytes ...ed_triton_or_deep_gemm_moe.cpython-312.pyc | Bin 0 -> 6732 bytes .../__pycache__/config.cpython-312.pyc | Bin 0 -> 37749 bytes .../__pycache__/cpu_fused_moe.cpython-312.pyc | Bin 0 -> 15307 bytes .../__pycache__/cutlass_moe.cpython-312.pyc | Bin 0 -> 41691 bytes .../__pycache__/deep_gemm_moe.cpython-312.pyc | Bin 0 -> 15603 bytes .../deep_gemm_utils.cpython-312.pyc | Bin 0 -> 14759 bytes ...deepep_ht_prepare_finalize.cpython-312.pyc | Bin 0 -> 13810 bytes 
...deepep_ll_prepare_finalize.cpython-312.pyc | Bin 0 -> 13692 bytes .../flashinfer_cutlass_moe.cpython-312.pyc | Bin 0 -> 11265 bytes ...r_cutlass_prepare_finalize.cpython-312.pyc | Bin 0 -> 13006 bytes .../flashinfer_trtllm_moe.cpython-312.pyc | Bin 0 -> 6560 bytes .../fused_batched_moe.cpython-312.pyc | Bin 0 -> 36161 bytes .../fused_marlin_moe.cpython-312.pyc | Bin 0 -> 30989 bytes .../__pycache__/fused_moe.cpython-312.pyc | Bin 0 -> 68741 bytes .../fused_moe_method_base.cpython-312.pyc | Bin 0 -> 5436 bytes .../fused_moe_modular_method.cpython-312.pyc | Bin 0 -> 6844 bytes ...gpt_oss_triton_kernels_moe.cpython-312.pyc | Bin 0 -> 12376 bytes .../__pycache__/layer.cpython-312.pyc | Bin 0 -> 72036 bytes .../modular_kernel.cpython-312.pyc | Bin 0 -> 45735 bytes .../moe_align_block_size.cpython-312.pyc | Bin 0 -> 8126 bytes .../__pycache__/moe_pallas.cpython-312.pyc | Bin 0 -> 4926 bytes .../moe_permute_unpermute.cpython-312.pyc | Bin 0 -> 9456 bytes .../moe_torch_iterative.cpython-312.pyc | Bin 0 -> 2729 bytes .../pplx_prepare_finalize.cpython-312.pyc | Bin 0 -> 12401 bytes .../prepare_finalize.cpython-312.pyc | Bin 0 -> 3837 bytes .../rocm_aiter_fused_moe.cpython-312.pyc | Bin 0 -> 8618 bytes .../routing_simulator.cpython-312.pyc | Bin 0 -> 12370 bytes .../shared_fused_moe.cpython-312.pyc | Bin 0 -> 3716 bytes .../topk_weight_and_reduce.cpython-312.pyc | Bin 0 -> 7970 bytes .../triton_deep_gemm_moe.cpython-312.pyc | Bin 0 -> 6188 bytes .../__pycache__/trtllm_moe.cpython-312.pyc | Bin 0 -> 6222 bytes ...quantized_fused_moe_method.cpython-312.pyc | Bin 0 -> 22021 bytes .../__pycache__/utils.cpython-312.pyc | Bin 0 -> 13285 bytes .../layers/fused_moe/all2all_utils.py | 160 + .../layers/fused_moe/batched_deep_gemm_moe.py | 406 ++ .../batched_triton_or_deep_gemm_moe.py | 180 + model_executor/layers/fused_moe/config.py | 916 +++ ...VIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 146 + ...336,device_name=NVIDIA_A100-SXM4-80GB.json | 146 + 
...VIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 218 + ...792,device_name=NVIDIA_A100-SXM4-80GB.json | 218 + ...VIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json | 146 + ...VIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 218 + ...VIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json | 146 + ...072,device_name=NVIDIA_H100_80GB_HBM3.json | 218 + ...ice_name=NVIDIA_H200,dtype=int8_w8a16.json | 146 + ...VIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 218 + ...584,device_name=NVIDIA_A100-SXM4-80GB.json | 218 + ...VIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json | 146 + ...VIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 218 + ...168,device_name=NVIDIA_A100-SXM4-80GB.json | 218 + ...VIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json | 146 + ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 + ...=1024,device_name=AMD_Instinct_MI300X.json | 200 + ...me=AMD_Instinct_MI325X,dtype=fp8_w8a8.json | 164 + ...evice_name=NVIDIA_H100,dtype=fp8_w8a8.json | 123 + ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 + .../E=128,N=1024,device_name=NVIDIA_H200.json | 146 + ...856,device_name=NVIDIA_H100_80GB_HBM3.json | 147 + .../E=128,N=1856,device_name=NVIDIA_L40S.json | 147 + ...192,device_name=NVIDIA_A100-SXM4-80GB.json | 146 + ...192,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + ...E=128,N=192,device_name=NVIDIA_H20-3e.json | 146 + .../E=128,N=192,device_name=NVIDIA_H20.json | 146 + .../E=128,N=192,device_name=NVIDIA_H200.json | 146 + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 122 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...E=128,N=384,device_name=NVIDIA_H20-3e.json | 146 + .../E=128,N=384,device_name=NVIDIA_H20.json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + .../E=128,N=384,device_name=NVIDIA_H200.json | 146 + ...512,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + 
...evice_name=NVIDIA_B200,dtype=fp8_w8a8.json | 146 + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 114 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...N=768,device_name=AMD_Instinct_MI308X.json | 213 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + .../E=128,N=768,device_name=NVIDIA_H20.json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + .../E=128,N=768,device_name=NVIDIA_H200.json | 146 + ...name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json | 82 + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 82 + ...928,device_name=NVIDIA_H100_80GB_HBM3.json | 147 + .../E=128,N=928,device_name=NVIDIA_L40S.json | 147 + .../E=128,N=96,device_name=NVIDIA_H20.json | 146 + ...=1024,device_name=AMD_Instinct_MI300X.json | 200 + ...evice_name=NVIDIA_B200,dtype=fp8_w8a8.json | 147 + .../E=16,N=1024,device_name=NVIDIA_B200.json | 146 + .../E=16,N=1024,device_name=NVIDIA_H100.json | 146 + ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 + .../E=16,N=1024,device_name=NVIDIA_H200.json | 146 + ...344,device_name=NVIDIA_A100-SXM4-40GB.json | 146 + ...344,device_name=NVIDIA_A100-SXM4-80GB.json | 146 + ...344,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + ...VIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 146 + ...336,device_name=NVIDIA_A100-SXM4-80GB.json | 146 + ...VIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json | 146 + ...VIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 218 + ...792,device_name=NVIDIA_A100-SXM4-80GB.json | 218 + ...VIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json | 146 + ...792,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 + .../E=16,N=2048,device_name=NVIDIA_H200.json | 146 + ...688,device_name=NVIDIA_A100-SXM4-80GB.json | 146 + ...688,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + ...VIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 146 + 
...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 146 + ...VIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json | 146 + ...ice_name=NVIDIA_H200,dtype=int8_w8a16.json | 146 + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 130 + ...VIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 146 + ...584,device_name=NVIDIA_A100-SXM4-80GB.json | 218 + ...VIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json | 146 + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 130 + ...VIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 146 + ...168,device_name=NVIDIA_A100-SXM4-80GB.json | 146 + ...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 146 + ...VIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json | 146 + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 130 + ...N=192,device_name=AMD_Instinct_MI300X.json | 201 + ...AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json | 164 + ...192,device_name=NVIDIA_A800-SXM4-80GB.json | 146 + ...E=160,N=192,device_name=NVIDIA_H20-3e.json | 146 + ...E=160,N=320,device_name=NVIDIA_H20-3e.json | 146 + ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 + ...AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json | 164 + ...AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...Instinct_MI325X,block_shape=[128,128].json | 200 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 200 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + 
...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 200 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 200 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 147 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 200 + ...512,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + ...=64,device_name=NVIDIA_A800-SXM4-80GB.json | 146 + .../E=32,N=1408,device_name=NVIDIA_B200.json | 147 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 147 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 147 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...evice_name=NVIDIA_B200,dtype=fp8_w8a8.json | 147 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...128,device_name=NVIDIA_A100-SXM4-80GB.json | 147 + .../E=512,N=128,device_name=NVIDIA_B200.json | 146 + ...vice_name=NVIDIA_GB200,dtype=fp8_w8a8.json | 147 + ...128,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + ...E=512,N=128,device_name=NVIDIA_H20-3e.json | 146 + .../E=512,N=128,device_name=NVIDIA_H200.json | 146 + .../E=512,N=256,device_name=NVIDIA_B200.json | 146 + ...vice_name=NVIDIA_GB200,dtype=fp8_w8a8.json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 147 + ...256,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + ...E=512,N=256,device_name=NVIDIA_H20-3e.json | 146 + .../E=512,N=256,device_name=NVIDIA_H200.json | 146 + 
.../E=512,N=512,device_name=NVIDIA_B200.json | 146 + ...vice_name=NVIDIA_GB200,dtype=fp8_w8a8.json | 146 + ...512,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + ...E=512,N=512,device_name=NVIDIA_H20-3e.json | 146 + .../E=512,N=512,device_name=NVIDIA_H200.json | 146 + ...=64,device_name=NVIDIA_A100-SXM4-80GB.json | 147 + .../E=512,N=64,device_name=NVIDIA_B200.json | 146 + .../E=512,N=64,device_name=NVIDIA_H20-3e.json | 146 + .../E=512,N=64,device_name=NVIDIA_H200.json | 146 + ...=1408,device_name=AMD_Instinct_MI300X.json | 200 + ...N=176,device_name=AMD_Instinct_MI300X.json | 200 + ...N=352,device_name=AMD_Instinct_MI300X.json | 200 + ...N=704,device_name=AMD_Instinct_MI300X.json | 200 + ...N=128,device_name=AMD_Instinct_MI300X.json | 200 + ...N=256,device_name=AMD_Instinct_MI300X.json | 200 + ...256,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + ...N=512,device_name=AMD_Instinct_MI300X.json | 200 + ...512,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + ...280,device_name=NVIDIA_A100-SXM4-80GB.json | 146 + ...280,device_name=NVIDIA_A800-SXM4-80GB.json | 146 + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 + ...280,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 + .../E=64,N=1280,device_name=NVIDIA_H200.json | 146 + .../E=64,N=1408,device_name=NVIDIA_B200.json | 147 + ...device_name=NVIDIA_H20,dtype=fp8_w8a8.json | 146 + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 + ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 + .../E=64,N=2560,device_name=NVIDIA_H200.json | 146 + ...device_name=NVIDIA_H20,dtype=fp8_w8a8.json | 146 + .../E=64,N=3072,device_name=NVIDIA_H20.json | 146 + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 + ...320,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 + .../E=64,N=320,device_name=NVIDIA_H200.json | 146 + ...device_name=NVIDIA_H20,dtype=fp8_w8a8.json | 146 + .../E=64,N=384,device_name=NVIDIA_H20.json | 146 + 
...640,device_name=NVIDIA_A100-SXM4-80GB.json | 146 + ...640,device_name=NVIDIA_A800-SXM4-80GB.json | 146 + ...VIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json | 146 + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 + ...640,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 + .../E=64,N=640,device_name=NVIDIA_H200.json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 147 + ...device_name=NVIDIA_H20,dtype=fp8_w8a8.json | 146 + .../E=64,N=768,device_name=NVIDIA_H20.json | 146 + .../E=64,N=896,device_name=NVIDIA_H20.json | 146 + ...name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json | 82 + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 82 + ...N=192,device_name=AMD_Instinct_MI300X.json | 200 + ...N=384,device_name=AMD_Instinct_MI300X.json | 200 + ...384,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + ...N=768,device_name=AMD_Instinct_MI300X.json | 200 + ...768,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 + ...14336,device_name=AMD_Instinct_MI300X.json | 200 + ...me=AMD_Instinct_MI325X,dtype=fp8_w8a8.json | 164 + ...14336,device_name=AMD_Instinct_MI325X.json | 200 + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 138 + ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 + .../E=8,N=14336,device_name=NVIDIA_H200.json | 146 + ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 + ...16384,device_name=AMD_Instinct_MI300X.json | 200 + ...me=AMD_Instinct_MI325X,dtype=fp8_w8a8.json | 164 + ...16384,device_name=AMD_Instinct_MI325X.json | 200 + ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 + ...=1792,device_name=AMD_Instinct_MI300X.json | 200 + ...me=AMD_Instinct_MI325X,dtype=fp8_w8a8.json | 164 + ...=1792,device_name=AMD_Instinct_MI325X.json | 200 + ...792,device_name=NVIDIA_A100-SXM4-40GB.json | 146 + ...792,device_name=NVIDIA_A100-SXM4-80GB.json | 146 + ...792,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 + 
.../E=8,N=1792,device_name=NVIDIA_H200.json | 146 + ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 + ...=2048,device_name=AMD_Instinct_MI300X.json | 200 + ...me=AMD_Instinct_MI325X,dtype=fp8_w8a8.json | 164 + ...=2048,device_name=AMD_Instinct_MI325X.json | 200 + ...048,device_name=NVIDIA_A100-SXM4-80GB.json | 146 + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 + ...048,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 154 + ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 + .../E=8,N=2048,device_name=NVIDIA_H200.json | 146 + ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 + ...=3584,device_name=AMD_Instinct_MI300X.json | 200 + ...me=AMD_Instinct_MI325X,dtype=fp8_w8a8.json | 164 + ...=3584,device_name=AMD_Instinct_MI325X.json | 200 + ...584,device_name=NVIDIA_A100-SXM4-40GB.json | 146 + ...584,device_name=NVIDIA_A100-SXM4-80GB.json | 146 + ...VIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json | 146 + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 + ...584,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 + .../E=8,N=3584,device_name=NVIDIA_H200.json | 146 + .../E=8,N=3584,device_name=NVIDIA_L40S.json | 173 + ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 + ...=4096,device_name=AMD_Instinct_MI300X.json | 200 + ...me=AMD_Instinct_MI325X,dtype=fp8_w8a8.json | 164 + ...=4096,device_name=AMD_Instinct_MI325X.json | 200 + ...096,device_name=NVIDIA_A100-SXM4-80GB.json | 146 + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 + ...096,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 + .../E=8,N=4096,device_name=NVIDIA_H200.json | 146 + ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 + ...=7168,device_name=AMD_Instinct_MI300X.json | 200 + ...me=AMD_Instinct_MI325X,dtype=fp8_w8a8.json | 164 + ...=7168,device_name=AMD_Instinct_MI325X.json | 200 + ...168,device_name=NVIDIA_A100-SXM4-80GB.json | 146 + 
...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 + ...168,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 + .../E=8,N=7168,device_name=NVIDIA_H200.json | 146 + ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 + ...=8192,device_name=AMD_Instinct_MI300X.json | 200 + ...me=AMD_Instinct_MI325X,dtype=fp8_w8a8.json | 164 + ...=8192,device_name=AMD_Instinct_MI325X.json | 200 + ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 + ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 + .../layers/fused_moe/configs/README | 12 + .../layers/fused_moe/cpu_fused_moe.py | 354 + .../layers/fused_moe/cutlass_moe.py | 1052 +++ .../layers/fused_moe/deep_gemm_moe.py | 387 ++ .../layers/fused_moe/deep_gemm_utils.py | 416 ++ .../fused_moe/deepep_ht_prepare_finalize.py | 420 ++ .../fused_moe/deepep_ll_prepare_finalize.py | 367 + .../fused_moe/flashinfer_cutlass_moe.py | 307 + .../flashinfer_cutlass_prepare_finalize.py | 362 + .../layers/fused_moe/flashinfer_trtllm_moe.py | 192 + .../layers/fused_moe/fused_batched_moe.py | 1012 +++ .../layers/fused_moe/fused_marlin_moe.py | 792 +++ model_executor/layers/fused_moe/fused_moe.py | 2306 +++++++ .../layers/fused_moe/fused_moe_method_base.py | 112 + .../fused_moe/fused_moe_modular_method.py | 164 + .../fused_moe/gpt_oss_triton_kernels_moe.py | 316 + model_executor/layers/fused_moe/layer.py | 2038 ++++++ .../layers/fused_moe/modular_kernel.py | 1222 ++++ .../layers/fused_moe/moe_align_block_size.py | 174 + model_executor/layers/fused_moe/moe_pallas.py | 83 + .../layers/fused_moe/moe_permute_unpermute.py | 229 + .../layers/fused_moe/moe_torch_iterative.py | 60 + .../layers/fused_moe/pplx_prepare_finalize.py | 362 + .../layers/fused_moe/prepare_finalize.py | 77 + .../layers/fused_moe/rocm_aiter_fused_moe.py | 265 + .../layers/fused_moe/routing_simulator.py | 310 + .../layers/fused_moe/shared_fused_moe.py | 97 + .../fused_moe/topk_weight_and_reduce.py | 171 + 
.../layers/fused_moe/triton_deep_gemm_moe.py | 163 + model_executor/layers/fused_moe/trtllm_moe.py | 143 + .../fused_moe/unquantized_fused_moe_method.py | 578 ++ model_executor/layers/fused_moe/utils.py | 332 + model_executor/layers/kda.py | 448 ++ model_executor/layers/layernorm.py | 578 ++ model_executor/layers/lightning_attn.py | 729 ++ model_executor/layers/linear.py | 1496 ++++ model_executor/layers/logits_processor.py | 109 + model_executor/layers/mamba/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 177 bytes .../__pycache__/abstract.cpython-312.pyc | Bin 0 -> 3273 bytes .../__pycache__/linear_attn.cpython-312.pyc | Bin 0 -> 20236 bytes .../__pycache__/mamba_mixer.cpython-312.pyc | Bin 0 -> 19928 bytes .../__pycache__/mamba_mixer2.cpython-312.pyc | Bin 0 -> 28575 bytes .../__pycache__/mamba_utils.cpython-312.pyc | Bin 0 -> 8024 bytes .../__pycache__/short_conv.cpython-312.pyc | Bin 0 -> 9617 bytes model_executor/layers/mamba/abstract.py | 71 + model_executor/layers/mamba/linear_attn.py | 402 ++ model_executor/layers/mamba/mamba_mixer.py | 535 ++ model_executor/layers/mamba/mamba_mixer2.py | 928 +++ model_executor/layers/mamba/mamba_utils.py | 225 + model_executor/layers/mamba/ops/__init__.py | 0 .../ops/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 181 bytes .../__pycache__/causal_conv1d.cpython-312.pyc | Bin 0 -> 38667 bytes .../layernorm_gated.cpython-312.pyc | Bin 0 -> 7212 bytes .../ops/__pycache__/mamba_ssm.cpython-312.pyc | Bin 0 -> 19389 bytes .../ops/__pycache__/ssd_bmm.cpython-312.pyc | Bin 0 -> 8508 bytes .../ssd_chunk_scan.cpython-312.pyc | Bin 0 -> 17232 bytes .../ssd_chunk_state.cpython-312.pyc | Bin 0 -> 27541 bytes .../__pycache__/ssd_combined.cpython-312.pyc | Bin 0 -> 5410 bytes .../ssd_state_passing.cpython-312.pyc | Bin 0 -> 6480 bytes .../layers/mamba/ops/causal_conv1d.py | 1240 ++++ .../layers/mamba/ops/layernorm_gated.py | 172 + model_executor/layers/mamba/ops/mamba_ssm.py | 478 ++ 
model_executor/layers/mamba/ops/ssd_bmm.py | 211 + .../layers/mamba/ops/ssd_chunk_scan.py | 456 ++ .../layers/mamba/ops/ssd_chunk_state.py | 700 ++ .../layers/mamba/ops/ssd_combined.py | 230 + .../layers/mamba/ops/ssd_state_passing.py | 157 + model_executor/layers/mamba/short_conv.py | 264 + model_executor/layers/mla.py | 159 + model_executor/layers/pooler.py | 817 +++ .../layers/quantization/__init__.py | 177 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 5204 bytes .../__pycache__/auto_round.cpython-312.pyc | Bin 0 -> 17005 bytes .../__pycache__/awq.cpython-312.pyc | Bin 0 -> 12290 bytes .../__pycache__/awq_marlin.cpython-312.pyc | Bin 0 -> 27763 bytes .../__pycache__/awq_triton.cpython-312.pyc | Bin 0 -> 13303 bytes .../__pycache__/base_config.cpython-312.pyc | Bin 0 -> 8171 bytes .../__pycache__/bitblas.cpython-312.pyc | Bin 0 -> 18008 bytes .../__pycache__/bitsandbytes.cpython-312.pyc | Bin 0 -> 25303 bytes .../__pycache__/deepspeedfp.cpython-312.pyc | Bin 0 -> 11053 bytes .../__pycache__/experts_int8.cpython-312.pyc | Bin 0 -> 10786 bytes .../__pycache__/fbgemm_fp8.cpython-312.pyc | Bin 0 -> 8582 bytes .../__pycache__/fp8.cpython-312.pyc | Bin 0 -> 48847 bytes .../__pycache__/fp_quant.cpython-312.pyc | Bin 0 -> 16355 bytes .../__pycache__/gguf.cpython-312.pyc | Bin 0 -> 27392 bytes .../__pycache__/gptq.cpython-312.pyc | Bin 0 -> 15138 bytes .../__pycache__/gptq_bitblas.cpython-312.pyc | Bin 0 -> 17169 bytes .../__pycache__/gptq_marlin.cpython-312.pyc | Bin 0 -> 36625 bytes .../gptq_marlin_24.cpython-312.pyc | Bin 0 -> 12232 bytes .../__pycache__/hqq_marlin.cpython-312.pyc | Bin 0 -> 16942 bytes .../__pycache__/inc.cpython-312.pyc | Bin 0 -> 2568 bytes .../input_quant_fp8.cpython-312.pyc | Bin 0 -> 8415 bytes .../__pycache__/ipex_quant.cpython-312.pyc | Bin 0 -> 22177 bytes .../__pycache__/kv_cache.cpython-312.pyc | Bin 0 -> 6894 bytes .../__pycache__/modelopt.cpython-312.pyc | Bin 0 -> 63209 bytes .../__pycache__/moe_wna16.cpython-312.pyc | Bin 0 -> 
21307 bytes .../__pycache__/mxfp4.cpython-312.pyc | Bin 0 -> 47430 bytes .../__pycache__/petit.cpython-312.pyc | Bin 0 -> 14343 bytes .../__pycache__/ptpc_fp8.cpython-312.pyc | Bin 0 -> 6947 bytes .../__pycache__/qutlass_utils.cpython-312.pyc | Bin 0 -> 6465 bytes .../__pycache__/rtn.cpython-312.pyc | Bin 0 -> 29542 bytes .../__pycache__/schema.cpython-312.pyc | Bin 0 -> 4340 bytes .../__pycache__/torchao.cpython-312.pyc | Bin 0 -> 15833 bytes .../__pycache__/tpu_int8.cpython-312.pyc | Bin 0 -> 7400 bytes .../__pycache__/w8a16.cpython-312.pyc | Bin 0 -> 5386 bytes .../layers/quantization/auto_round.py | 454 ++ model_executor/layers/quantization/awq.py | 278 + .../layers/quantization/awq_marlin.py | 869 +++ .../layers/quantization/awq_triton.py | 337 + .../layers/quantization/base_config.py | 170 + model_executor/layers/quantization/bitblas.py | 502 ++ .../layers/quantization/bitsandbytes.py | 658 ++ .../compressed_tensors/__init__.py | 3 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 203 bytes .../compressed_tensors.cpython-312.pyc | Bin 0 -> 34493 bytes .../compressed_tensors_moe.cpython-312.pyc | Bin 0 -> 127709 bytes .../triton_scaled_mm.cpython-312.pyc | Bin 0 -> 9368 bytes .../__pycache__/utils.cpython-312.pyc | Bin 0 -> 7738 bytes .../compressed_tensors/compressed_tensors.py | 914 +++ .../compressed_tensors_moe.py | 3534 ++++++++++ .../compressed_tensors/schemes/__init__.py | 35 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 1227 bytes .../compressed_tensors_24.cpython-312.pyc | Bin 0 -> 16077 bytes .../compressed_tensors_scheme.cpython-312.pyc | Bin 0 -> 2420 bytes ...ompressed_tensors_w4a16_24.cpython-312.pyc | Bin 0 -> 6711 bytes ...ressed_tensors_w4a16_nvfp4.cpython-312.pyc | Bin 0 -> 4861 bytes ...pressed_tensors_w4a4_nvfp4.cpython-312.pyc | Bin 0 -> 9136 bytes ...ompressed_tensors_w4a8_fp8.cpython-312.pyc | Bin 0 -> 6831 bytes ...ompressed_tensors_w4a8_int.cpython-312.pyc | Bin 0 -> 5695 bytes ...mpressed_tensors_w8a16_fp8.cpython-312.pyc 
| Bin 0 -> 5787 bytes ...ompressed_tensors_w8a8_fp8.cpython-312.pyc | Bin 0 -> 7849 bytes ...mpressed_tensors_w8a8_int8.cpython-312.pyc | Bin 0 -> 5749 bytes .../compressed_tensors_wNa16.cpython-312.pyc | Bin 0 -> 7662 bytes .../schemes/compressed_tensors_24.py | 392 ++ .../schemes/compressed_tensors_scheme.py | 55 + .../schemes/compressed_tensors_w4a16_24.py | 176 + .../schemes/compressed_tensors_w4a16_nvfp4.py | 124 + .../schemes/compressed_tensors_w4a4_nvfp4.py | 218 + .../schemes/compressed_tensors_w4a8_fp8.py | 183 + .../schemes/compressed_tensors_w4a8_int.py | 153 + .../schemes/compressed_tensors_w8a16_fp8.py | 138 + .../schemes/compressed_tensors_w8a8_fp8.py | 200 + .../schemes/compressed_tensors_w8a8_int8.py | 137 + .../schemes/compressed_tensors_wNa16.py | 219 + .../compressed_tensors/transform/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 213 bytes .../__pycache__/linear.cpython-312.pyc | Bin 0 -> 10625 bytes .../__pycache__/module.cpython-312.pyc | Bin 0 -> 7745 bytes .../__pycache__/utils.cpython-312.pyc | Bin 0 -> 661 bytes .../compressed_tensors/transform/linear.py | 260 + .../compressed_tensors/transform/module.py | 173 + .../transform/schemes/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 221 bytes .../linear_qutlass_nvfp4.cpython-312.pyc | Bin 0 -> 2667 bytes .../transform/schemes/linear_qutlass_nvfp4.py | 64 + .../compressed_tensors/transform/utils.py | 13 + .../compressed_tensors/triton_scaled_mm.py | 224 + .../quantization/compressed_tensors/utils.py | 216 + .../layers/quantization/deepspeedfp.py | 218 + .../layers/quantization/experts_int8.py | 240 + .../layers/quantization/fbgemm_fp8.py | 195 + model_executor/layers/quantization/fp8.py | 1333 ++++ .../layers/quantization/fp_quant.py | 420 ++ model_executor/layers/quantization/gguf.py | 651 ++ model_executor/layers/quantization/gptq.py | 393 ++ .../layers/quantization/gptq_bitblas.py | 482 ++ .../layers/quantization/gptq_marlin.py | 1099 +++ 
.../layers/quantization/gptq_marlin_24.py | 320 + .../layers/quantization/hqq_marlin.py | 371 + model_executor/layers/quantization/inc.py | 65 + .../layers/quantization/input_quant_fp8.py | 171 + .../layers/quantization/ipex_quant.py | 467 ++ .../layers/quantization/kernels/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 192 bytes .../kernels/mixed_precision/MPLinearKernel.py | 94 + .../kernels/mixed_precision/__init__.py | 105 + .../MPLinearKernel.cpython-312.pyc | Bin 0 -> 4534 bytes .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 3766 bytes .../__pycache__/allspark.cpython-312.pyc | Bin 0 -> 5581 bytes .../__pycache__/bitblas.cpython-312.pyc | Bin 0 -> 13151 bytes .../__pycache__/conch.cpython-312.pyc | Bin 0 -> 4606 bytes .../__pycache__/cutlass.cpython-312.pyc | Bin 0 -> 6374 bytes .../__pycache__/dynamic_4bit.cpython-312.pyc | Bin 0 -> 5547 bytes .../__pycache__/exllama.cpython-312.pyc | Bin 0 -> 7644 bytes .../__pycache__/machete.cpython-312.pyc | Bin 0 -> 7850 bytes .../__pycache__/marlin.cpython-312.pyc | Bin 0 -> 12333 bytes .../kernels/mixed_precision/allspark.py | 115 + .../kernels/mixed_precision/bitblas.py | 323 + .../kernels/mixed_precision/conch.py | 98 + .../kernels/mixed_precision/cutlass.py | 119 + .../kernels/mixed_precision/dynamic_4bit.py | 111 + .../kernels/mixed_precision/exllama.py | 161 + .../kernels/mixed_precision/machete.py | 154 + .../kernels/mixed_precision/marlin.py | 325 + .../kernels/scaled_mm/ScaledMMLinearKernel.py | 73 + .../kernels/scaled_mm/__init__.py | 97 + .../ScaledMMLinearKernel.cpython-312.pyc | Bin 0 -> 3478 bytes .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 3691 bytes .../__pycache__/aiter.cpython-312.pyc | Bin 0 -> 4849 bytes .../scaled_mm/__pycache__/cpu.cpython-312.pyc | Bin 0 -> 10421 bytes .../__pycache__/cutlass.cpython-312.pyc | Bin 0 -> 6915 bytes .../__pycache__/triton.cpython-312.pyc | Bin 0 -> 2223 bytes .../scaled_mm/__pycache__/xla.cpython-312.pyc | Bin 0 -> 4859 bytes 
.../quantization/kernels/scaled_mm/aiter.py | 120 + .../quantization/kernels/scaled_mm/cpu.py | 219 + .../quantization/kernels/scaled_mm/cutlass.py | 160 + .../quantization/kernels/scaled_mm/triton.py | 42 + .../quantization/kernels/scaled_mm/xla.py | 105 + .../layers/quantization/kv_cache.py | 146 + .../layers/quantization/modelopt.py | 1788 +++++ .../layers/quantization/moe_wna16.py | 541 ++ model_executor/layers/quantization/mxfp4.py | 1162 ++++ model_executor/layers/quantization/petit.py | 320 + .../layers/quantization/ptpc_fp8.py | 137 + .../layers/quantization/quark/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 190 bytes .../quark/__pycache__/quark.cpython-312.pyc | Bin 0 -> 22202 bytes .../__pycache__/quark_moe.cpython-312.pyc | Bin 0 -> 26404 bytes .../quark/__pycache__/utils.cpython-312.pyc | Bin 0 -> 3631 bytes .../layers/quantization/quark/quark.py | 528 ++ .../layers/quantization/quark/quark_moe.py | 683 ++ .../quantization/quark/schemes/__init__.py | 9 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 450 bytes .../__pycache__/quark_ocp_mx.cpython-312.pyc | Bin 0 -> 12598 bytes .../__pycache__/quark_scheme.cpython-312.pyc | Bin 0 -> 2321 bytes .../quark_w8a8_fp8.cpython-312.pyc | Bin 0 -> 7144 bytes .../quark_w8a8_int8.cpython-312.pyc | Bin 0 -> 5708 bytes .../quark/schemes/quark_ocp_mx.py | 306 + .../quark/schemes/quark_scheme.py | 55 + .../quark/schemes/quark_w8a8_fp8.py | 179 + .../quark/schemes/quark_w8a8_int8.py | 139 + .../layers/quantization/quark/utils.py | 105 + .../layers/quantization/qutlass_utils.py | 185 + model_executor/layers/quantization/rtn.py | 652 ++ model_executor/layers/quantization/schema.py | 90 + model_executor/layers/quantization/torchao.py | 380 + .../layers/quantization/tpu_int8.py | 139 + .../layers/quantization/utils/__init__.py | 6 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 319 bytes .../allspark_utils.cpython-312.pyc | Bin 0 -> 2201 bytes .../__pycache__/bitblas_utils.cpython-312.pyc | 
Bin 0 -> 8574 bytes .../flashinfer_fp4_moe.cpython-312.pyc | Bin 0 -> 4159 bytes .../flashinfer_utils.cpython-312.pyc | Bin 0 -> 11727 bytes .../__pycache__/fp8_utils.cpython-312.pyc | Bin 0 -> 46575 bytes .../__pycache__/gguf_utils.cpython-312.pyc | Bin 0 -> 26672 bytes .../__pycache__/gptq_utils.cpython-312.pyc | Bin 0 -> 5958 bytes .../__pycache__/int8_utils.cpython-312.pyc | Bin 0 -> 21359 bytes .../__pycache__/layer_utils.cpython-312.pyc | Bin 0 -> 2026 bytes .../__pycache__/machete_utils.cpython-312.pyc | Bin 0 -> 2322 bytes .../__pycache__/marlin_utils.cpython-312.pyc | Bin 0 -> 20344 bytes .../marlin_utils_fp4.cpython-312.pyc | Bin 0 -> 16876 bytes .../marlin_utils_fp8.cpython-312.pyc | Bin 0 -> 13583 bytes .../marlin_utils_test.cpython-312.pyc | Bin 0 -> 7010 bytes .../marlin_utils_test_24.cpython-312.pyc | Bin 0 -> 19041 bytes .../__pycache__/mxfp4_utils.cpython-312.pyc | Bin 0 -> 6661 bytes .../__pycache__/mxfp6_utils.cpython-312.pyc | Bin 0 -> 5059 bytes .../__pycache__/mxfp8_utils.cpython-312.pyc | Bin 0 -> 1144 bytes .../nvfp4_emulation_utils.cpython-312.pyc | Bin 0 -> 7531 bytes .../nvfp4_moe_support.cpython-312.pyc | Bin 0 -> 2252 bytes .../__pycache__/ocp_mx_utils.cpython-312.pyc | Bin 0 -> 1780 bytes .../__pycache__/petit_utils.cpython-312.pyc | Bin 0 -> 4228 bytes .../__pycache__/quant_utils.cpython-312.pyc | Bin 0 -> 30167 bytes .../__pycache__/w8a8_utils.cpython-312.pyc | Bin 0 -> 17323 bytes .../quantization/utils/allspark_utils.py | 67 + .../quantization/utils/bitblas_utils.py | 229 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + 
...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + 
...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + 
...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + 
...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 18 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + 
...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + 
...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...dtype=int8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 + .../quantization/utils/configs/README.md | 3 + .../quantization/utils/flashinfer_fp4_moe.py | 89 + .../quantization/utils/flashinfer_utils.py | 298 + .../layers/quantization/utils/fp8_utils.py | 1206 ++++ .../layers/quantization/utils/gguf_utils.py | 373 + .../layers/quantization/utils/gptq_utils.py | 158 + .../layers/quantization/utils/int8_utils.py | 489 ++ .../layers/quantization/utils/layer_utils.py | 41 + .../quantization/utils/machete_utils.py | 56 + .../layers/quantization/utils/marlin_utils.py | 575 ++ .../quantization/utils/marlin_utils_fp4.py | 397 ++ .../quantization/utils/marlin_utils_fp8.py | 351 + .../quantization/utils/marlin_utils_test.py | 161 + .../utils/marlin_utils_test_24.py | 467 ++ .../layers/quantization/utils/mxfp4_utils.py | 181 + .../layers/quantization/utils/mxfp6_utils.py | 142 + .../layers/quantization/utils/mxfp8_utils.py | 24 + .../utils/nvfp4_emulation_utils.py | 142 + .../quantization/utils/nvfp4_moe_support.py | 63 + .../layers/quantization/utils/ocp_mx_utils.py | 51 + .../layers/quantization/utils/petit_utils.py | 124 + 
.../layers/quantization/utils/quant_utils.py | 687 ++ .../layers/quantization/utils/w8a8_utils.py | 516 ++ model_executor/layers/quantization/w8a16.py | 114 + model_executor/layers/resampler.py | 283 + .../layers/rotary_embedding/__init__.py | 278 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 6299 bytes .../__pycache__/base.cpython-312.pyc | Bin 0 -> 9571 bytes .../__pycache__/common.cpython-312.pyc | Bin 0 -> 6616 bytes .../deepseek_scaling_rope.cpython-312.pyc | Bin 0 -> 4452 bytes .../dual_chunk_rope.cpython-312.pyc | Bin 0 -> 10549 bytes .../dynamic_ntk_alpha_rope.cpython-312.pyc | Bin 0 -> 2145 bytes .../dynamic_ntk_scaling_rope.cpython-312.pyc | Bin 0 -> 2300 bytes .../ernie45_vl_rope.cpython-312.pyc | Bin 0 -> 4306 bytes .../linear_scaling_rope.cpython-312.pyc | Bin 0 -> 4154 bytes .../__pycache__/llama3_rope.cpython-312.pyc | Bin 0 -> 2400 bytes .../llama4_vision_rope.cpython-312.pyc | Bin 0 -> 5263 bytes .../__pycache__/mrope.cpython-312.pyc | Bin 0 -> 16796 bytes .../ntk_scaling_rope.cpython-312.pyc | Bin 0 -> 2333 bytes ...phi3_long_rope_scaled_rope.cpython-312.pyc | Bin 0 -> 6041 bytes .../yarn_scaling_rope.cpython-312.pyc | Bin 0 -> 3760 bytes .../layers/rotary_embedding/base.py | 235 + .../layers/rotary_embedding/common.py | 188 + .../rotary_embedding/deepseek_scaling_rope.py | 106 + .../rotary_embedding/dual_chunk_rope.py | 215 + .../dynamic_ntk_alpha_rope.py | 43 + .../dynamic_ntk_scaling_rope.py | 68 + .../rotary_embedding/ernie45_vl_rope.py | 75 + .../rotary_embedding/linear_scaling_rope.py | 115 + .../layers/rotary_embedding/llama3_rope.py | 54 + .../rotary_embedding/llama4_vision_rope.py | 80 + .../layers/rotary_embedding/mrope.py | 403 ++ .../rotary_embedding/ntk_scaling_rope.py | 47 + .../phi3_long_rope_scaled_rope.py | 151 + .../rotary_embedding/yarn_scaling_rope.py | 81 + .../shared_fused_moe.cpython-312.pyc | Bin 0 -> 2514 bytes .../shared_fused_moe/shared_fused_moe.py | 56 + model_executor/layers/utils.py | 253 + 
.../layers/vocab_parallel_embedding.py | 558 ++ model_executor/model_loader/__init__.py | 152 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 4614 bytes .../__pycache__/base_loader.cpython-312.pyc | Bin 0 -> 2953 bytes .../bitsandbytes_loader.cpython-312.pyc | Bin 0 -> 33235 bytes .../default_loader.cpython-312.pyc | Bin 0 -> 11756 bytes .../__pycache__/dummy_loader.cpython-312.pyc | Bin 0 -> 1702 bytes .../__pycache__/gguf_loader.cpython-312.pyc | Bin 0 -> 8459 bytes .../online_quantization.cpython-312.pyc | Bin 0 -> 5077 bytes .../runai_streamer_loader.cpython-312.pyc | Bin 0 -> 5329 bytes .../sharded_state_loader.cpython-312.pyc | Bin 0 -> 9682 bytes .../__pycache__/tensorizer.cpython-312.pyc | Bin 0 -> 34493 bytes .../tensorizer_loader.cpython-312.pyc | Bin 0 -> 7756 bytes .../__pycache__/tpu.cpython-312.pyc | Bin 0 -> 5654 bytes .../__pycache__/utils.cpython-312.pyc | Bin 0 -> 12738 bytes .../__pycache__/weight_utils.cpython-312.pyc | Bin 0 -> 47920 bytes model_executor/model_loader/base_loader.py | 57 + .../model_loader/bitsandbytes_loader.py | 822 +++ model_executor/model_loader/default_loader.py | 329 + model_executor/model_loader/dummy_loader.py | 28 + model_executor/model_loader/gguf_loader.py | 176 + .../model_loader/online_quantization.py | 224 + .../model_loader/runai_streamer_loader.py | 116 + .../model_loader/sharded_state_loader.py | 206 + model_executor/model_loader/tensorizer.py | 790 +++ .../model_loader/tensorizer_loader.py | 151 + model_executor/model_loader/tpu.py | 118 + model_executor/model_loader/utils.py | 288 + model_executor/model_loader/weight_utils.py | 1106 +++ model_executor/models/__init__.py | 44 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 854 bytes .../__pycache__/adapters.cpython-312.pyc | Bin 0 -> 23022 bytes .../models/__pycache__/afmoe.cpython-312.pyc | Bin 0 -> 27636 bytes .../models/__pycache__/aimv2.cpython-312.pyc | Bin 0 -> 12433 bytes .../__pycache__/apertus.cpython-312.pyc | Bin 0 -> 22496 bytes 
.../models/__pycache__/arcee.cpython-312.pyc | Bin 0 -> 15832 bytes .../models/__pycache__/arctic.cpython-312.pyc | Bin 0 -> 25739 bytes .../models/__pycache__/aria.cpython-312.pyc | Bin 0 -> 29070 bytes .../__pycache__/aya_vision.cpython-312.pyc | Bin 0 -> 22096 bytes .../__pycache__/baichuan.cpython-312.pyc | Bin 0 -> 19811 bytes .../__pycache__/bailing_moe.cpython-312.pyc | Bin 0 -> 25387 bytes .../models/__pycache__/bamba.cpython-312.pyc | Bin 0 -> 20895 bytes .../models/__pycache__/bee.cpython-312.pyc | Bin 0 -> 7195 bytes .../models/__pycache__/bert.cpython-312.pyc | Bin 0 -> 40571 bytes .../bert_with_rope.cpython-312.pyc | Bin 0 -> 30603 bytes .../models/__pycache__/blip.cpython-312.pyc | Bin 0 -> 15226 bytes .../models/__pycache__/blip2.cpython-312.pyc | Bin 0 -> 30671 bytes .../models/__pycache__/bloom.cpython-312.pyc | Bin 0 -> 16589 bytes .../__pycache__/chameleon.cpython-312.pyc | Bin 0 -> 51176 bytes .../__pycache__/chatglm.cpython-312.pyc | Bin 0 -> 19890 bytes .../models/__pycache__/clip.cpython-312.pyc | Bin 0 -> 39577 bytes .../cohere2_vision.cpython-312.pyc | Bin 0 -> 21196 bytes .../__pycache__/commandr.cpython-312.pyc | Bin 0 -> 20204 bytes .../models/__pycache__/config.cpython-312.pyc | Bin 0 -> 19057 bytes .../models/__pycache__/dbrx.cpython-312.pyc | Bin 0 -> 21474 bytes .../__pycache__/deepencoder.cpython-312.pyc | Bin 0 -> 31074 bytes .../deepseek_eagle.cpython-312.pyc | Bin 0 -> 10569 bytes .../__pycache__/deepseek_mtp.cpython-312.pyc | Bin 0 -> 15043 bytes .../__pycache__/deepseek_ocr.cpython-312.pyc | Bin 0 -> 26166 bytes .../__pycache__/deepseek_v2.cpython-312.pyc | Bin 0 -> 68524 bytes .../__pycache__/deepseek_vl2.cpython-312.pyc | Bin 0 -> 27455 bytes .../models/__pycache__/dots1.cpython-312.pyc | Bin 0 -> 22344 bytes .../__pycache__/dots_ocr.cpython-312.pyc | Bin 0 -> 40858 bytes .../__pycache__/ernie45.cpython-312.pyc | Bin 0 -> 1632 bytes .../__pycache__/ernie45_moe.cpython-312.pyc | Bin 0 -> 27862 bytes 
.../__pycache__/ernie45_vl.cpython-312.pyc | Bin 0 -> 71702 bytes .../ernie45_vl_moe.cpython-312.pyc | Bin 0 -> 27968 bytes .../__pycache__/ernie_mtp.cpython-312.pyc | Bin 0 -> 10929 bytes .../models/__pycache__/exaone.cpython-312.pyc | Bin 0 -> 19637 bytes .../__pycache__/exaone4.cpython-312.pyc | Bin 0 -> 19564 bytes .../fairseq2_llama.cpython-312.pyc | Bin 0 -> 6728 bytes .../models/__pycache__/falcon.cpython-312.pyc | Bin 0 -> 21587 bytes .../__pycache__/falcon_h1.cpython-312.pyc | Bin 0 -> 27523 bytes .../__pycache__/flex_olmo.cpython-312.pyc | Bin 0 -> 6661 bytes .../models/__pycache__/fuyu.cpython-312.pyc | Bin 0 -> 16120 bytes .../models/__pycache__/gemma.cpython-312.pyc | Bin 0 -> 17051 bytes .../models/__pycache__/gemma2.cpython-312.pyc | Bin 0 -> 17800 bytes .../models/__pycache__/gemma3.cpython-312.pyc | Bin 0 -> 21915 bytes .../__pycache__/gemma3_mm.cpython-312.pyc | Bin 0 -> 30048 bytes .../__pycache__/gemma3n.cpython-312.pyc | Bin 0 -> 43914 bytes .../__pycache__/gemma3n_mm.cpython-312.pyc | Bin 0 -> 32596 bytes .../models/__pycache__/glm.cpython-312.pyc | Bin 0 -> 1516 bytes .../models/__pycache__/glm4.cpython-312.pyc | Bin 0 -> 12153 bytes .../__pycache__/glm4_1v.cpython-312.pyc | Bin 0 -> 73665 bytes .../__pycache__/glm4_moe.cpython-312.pyc | Bin 0 -> 29377 bytes .../__pycache__/glm4_moe_mtp.cpython-312.pyc | Bin 0 -> 14288 bytes .../models/__pycache__/glm4v.cpython-312.pyc | Bin 0 -> 33722 bytes .../models/__pycache__/gpt2.cpython-312.pyc | Bin 0 -> 17469 bytes .../__pycache__/gpt_bigcode.cpython-312.pyc | Bin 0 -> 14468 bytes .../models/__pycache__/gpt_j.cpython-312.pyc | Bin 0 -> 14677 bytes .../__pycache__/gpt_neox.cpython-312.pyc | Bin 0 -> 14815 bytes .../__pycache__/gpt_oss.cpython-312.pyc | Bin 0 -> 29609 bytes .../__pycache__/granite.cpython-312.pyc | Bin 0 -> 19471 bytes .../granite_speech.cpython-312.pyc | Bin 0 -> 40569 bytes .../__pycache__/granitemoe.cpython-312.pyc | Bin 0 -> 21202 bytes .../granitemoehybrid.cpython-312.pyc | Bin 0 
-> 26784 bytes .../granitemoeshared.cpython-312.pyc | Bin 0 -> 14924 bytes .../models/__pycache__/gritlm.cpython-312.pyc | Bin 0 -> 10482 bytes .../models/__pycache__/grok1.cpython-312.pyc | Bin 0 -> 20514 bytes .../models/__pycache__/h2ovl.cpython-312.pyc | Bin 0 -> 16115 bytes .../__pycache__/hunyuan_v1.cpython-312.pyc | Bin 0 -> 39860 bytes .../hyperclovax_vision.cpython-312.pyc | Bin 0 -> 45437 bytes .../idefics2_vision_model.cpython-312.pyc | Bin 0 -> 17722 bytes .../__pycache__/idefics3.cpython-312.pyc | Bin 0 -> 29966 bytes .../__pycache__/interfaces.cpython-312.pyc | Bin 0 -> 40067 bytes .../interfaces_base.cpython-312.pyc | Bin 0 -> 7934 bytes .../__pycache__/intern_vit.cpython-312.pyc | Bin 0 -> 19385 bytes .../__pycache__/internlm2.cpython-312.pyc | Bin 0 -> 20047 bytes .../__pycache__/internlm2_ve.cpython-312.pyc | Bin 0 -> 6544 bytes .../__pycache__/interns1.cpython-312.pyc | Bin 0 -> 35563 bytes .../__pycache__/interns1_vit.cpython-312.pyc | Bin 0 -> 19530 bytes .../__pycache__/internvl.cpython-312.pyc | Bin 0 -> 55357 bytes .../models/__pycache__/jais.cpython-312.pyc | Bin 0 -> 16818 bytes .../models/__pycache__/jamba.cpython-312.pyc | Bin 0 -> 24369 bytes .../__pycache__/jina_vl.cpython-312.pyc | Bin 0 -> 7014 bytes .../models/__pycache__/keye.cpython-312.pyc | Bin 0 -> 72080 bytes .../__pycache__/keye_vl1_5.cpython-312.pyc | Bin 0 -> 30496 bytes .../__pycache__/kimi_linear.cpython-312.pyc | Bin 0 -> 24934 bytes .../__pycache__/kimi_vl.cpython-312.pyc | Bin 0 -> 21995 bytes .../models/__pycache__/lfm2.cpython-312.pyc | Bin 0 -> 22592 bytes .../__pycache__/lfm2_moe.cpython-312.pyc | Bin 0 -> 30736 bytes .../__pycache__/lightonocr.cpython-312.pyc | Bin 0 -> 8682 bytes .../models/__pycache__/llama.cpython-312.pyc | Bin 0 -> 27826 bytes .../models/__pycache__/llama4.cpython-312.pyc | Bin 0 -> 30028 bytes .../__pycache__/llama4_eagle.cpython-312.pyc | Bin 0 -> 10746 bytes .../__pycache__/llama_eagle.cpython-312.pyc | Bin 0 -> 9989 bytes 
.../__pycache__/llama_eagle3.cpython-312.pyc | Bin 0 -> 15965 bytes .../models/__pycache__/llava.cpython-312.pyc | Bin 0 -> 34616 bytes .../__pycache__/llava_next.cpython-312.pyc | Bin 0 -> 23721 bytes .../llava_next_video.cpython-312.pyc | Bin 0 -> 21407 bytes .../llava_onevision.cpython-312.pyc | Bin 0 -> 35923 bytes .../__pycache__/longcat_flash.cpython-312.pyc | Bin 0 -> 26831 bytes .../longcat_flash_mtp.cpython-312.pyc | Bin 0 -> 16797 bytes .../models/__pycache__/mamba.cpython-312.pyc | Bin 0 -> 12866 bytes .../models/__pycache__/mamba2.cpython-312.pyc | Bin 0 -> 13159 bytes .../models/__pycache__/medusa.cpython-312.pyc | Bin 0 -> 8335 bytes .../__pycache__/midashenglm.cpython-312.pyc | Bin 0 -> 37463 bytes .../models/__pycache__/mimo.cpython-312.pyc | Bin 0 -> 6408 bytes .../__pycache__/mimo_mtp.cpython-312.pyc | Bin 0 -> 11829 bytes .../__pycache__/minicpm.cpython-312.pyc | Bin 0 -> 26755 bytes .../__pycache__/minicpm3.cpython-312.pyc | Bin 0 -> 10448 bytes .../__pycache__/minicpm_eagle.cpython-312.pyc | Bin 0 -> 16201 bytes .../__pycache__/minicpmo.cpython-312.pyc | Bin 0 -> 32959 bytes .../__pycache__/minicpmv.cpython-312.pyc | Bin 0 -> 73082 bytes .../__pycache__/minimax_m2.cpython-312.pyc | Bin 0 -> 21553 bytes .../minimax_text_01.cpython-312.pyc | Bin 0 -> 43778 bytes .../__pycache__/minimax_vl_01.cpython-312.pyc | Bin 0 -> 18368 bytes .../__pycache__/mistral3.cpython-312.pyc | Bin 0 -> 27521 bytes .../__pycache__/mixtral.cpython-312.pyc | Bin 0 -> 23628 bytes .../__pycache__/mllama4.cpython-312.pyc | Bin 0 -> 48299 bytes .../mlp_speculator.cpython-312.pyc | Bin 0 -> 8279 bytes .../__pycache__/modernbert.cpython-312.pyc | Bin 0 -> 24727 bytes .../module_mapping.cpython-312.pyc | Bin 0 -> 2456 bytes .../models/__pycache__/molmo.cpython-312.pyc | Bin 0 -> 65613 bytes .../__pycache__/moonvit.cpython-312.pyc | Bin 0 -> 32302 bytes .../models/__pycache__/mpt.cpython-312.pyc | Bin 0 -> 16329 bytes .../nano_nemotron_vl.cpython-312.pyc | Bin 0 -> 68874 bytes 
.../__pycache__/nemotron.cpython-312.pyc | Bin 0 -> 19889 bytes .../__pycache__/nemotron_h.cpython-312.pyc | Bin 0 -> 32010 bytes .../__pycache__/nemotron_nas.cpython-312.pyc | Bin 0 -> 17261 bytes .../__pycache__/nemotron_vl.cpython-312.pyc | Bin 0 -> 25825 bytes .../models/__pycache__/nvlm_d.cpython-312.pyc | Bin 0 -> 9100 bytes .../models/__pycache__/olmo.cpython-312.pyc | Bin 0 -> 15766 bytes .../models/__pycache__/olmo2.cpython-312.pyc | Bin 0 -> 19003 bytes .../models/__pycache__/olmoe.cpython-312.pyc | Bin 0 -> 19440 bytes .../__pycache__/openpangu.cpython-312.pyc | Bin 0 -> 39497 bytes .../__pycache__/openpangu_mtp.cpython-312.pyc | Bin 0 -> 9825 bytes .../models/__pycache__/opt.cpython-312.pyc | Bin 0 -> 17130 bytes .../models/__pycache__/orion.cpython-312.pyc | Bin 0 -> 15731 bytes .../models/__pycache__/ouro.cpython-312.pyc | Bin 0 -> 19104 bytes .../models/__pycache__/ovis.cpython-312.pyc | Bin 0 -> 25979 bytes .../__pycache__/ovis2_5.cpython-312.pyc | Bin 0 -> 29516 bytes .../__pycache__/paddleocr_vl.cpython-312.pyc | Bin 0 -> 61436 bytes .../__pycache__/paligemma.cpython-312.pyc | Bin 0 -> 17608 bytes .../__pycache__/persimmon.cpython-312.pyc | Bin 0 -> 16085 bytes .../models/__pycache__/phi.cpython-312.pyc | Bin 0 -> 14166 bytes .../models/__pycache__/phi3.cpython-312.pyc | Bin 0 -> 599 bytes .../models/__pycache__/phi3v.cpython-312.pyc | Bin 0 -> 28443 bytes .../phi4_multimodal.cpython-312.pyc | Bin 0 -> 60670 bytes .../models/__pycache__/phi4mm.cpython-312.pyc | Bin 0 -> 48134 bytes .../__pycache__/phi4mm_audio.cpython-312.pyc | Bin 0 -> 49487 bytes .../__pycache__/phi4mm_utils.cpython-312.pyc | Bin 0 -> 77869 bytes .../models/__pycache__/phimoe.cpython-312.pyc | Bin 0 -> 24029 bytes .../__pycache__/pixtral.cpython-312.pyc | Bin 0 -> 66710 bytes .../models/__pycache__/plamo2.cpython-312.pyc | Bin 0 -> 40287 bytes .../models/__pycache__/qwen.cpython-312.pyc | Bin 0 -> 15912 bytes .../models/__pycache__/qwen2.cpython-312.pyc | Bin 0 -> 20082 bytes 
.../qwen2_5_omni_thinker.cpython-312.pyc | Bin 0 -> 47069 bytes .../__pycache__/qwen2_5_vl.cpython-312.pyc | Bin 0 -> 62664 bytes .../__pycache__/qwen2_audio.cpython-312.pyc | Bin 0 -> 20529 bytes .../__pycache__/qwen2_moe.cpython-312.pyc | Bin 0 -> 22367 bytes .../__pycache__/qwen2_rm.cpython-312.pyc | Bin 0 -> 5524 bytes .../__pycache__/qwen2_vl.cpython-312.pyc | Bin 0 -> 66859 bytes .../models/__pycache__/qwen3.cpython-312.pyc | Bin 0 -> 14033 bytes .../__pycache__/qwen3_moe.cpython-312.pyc | Bin 0 -> 29684 bytes .../__pycache__/qwen3_next.cpython-312.pyc | Bin 0 -> 54763 bytes .../qwen3_next_mtp.cpython-312.pyc | Bin 0 -> 11917 bytes .../qwen3_omni_moe_thinker.cpython-312.pyc | Bin 0 -> 70411 bytes .../__pycache__/qwen3_vl.cpython-312.pyc | Bin 0 -> 68928 bytes .../__pycache__/qwen3_vl_moe.cpython-312.pyc | Bin 0 -> 13197 bytes .../__pycache__/qwen_vl.cpython-312.pyc | Bin 0 -> 33646 bytes .../models/__pycache__/radio.cpython-312.pyc | Bin 0 -> 24093 bytes .../__pycache__/registry.cpython-312.pyc | Bin 0 -> 44147 bytes .../__pycache__/roberta.cpython-312.pyc | Bin 0 -> 11941 bytes .../models/__pycache__/rvl.cpython-312.pyc | Bin 0 -> 5507 bytes .../__pycache__/seed_oss.cpython-312.pyc | Bin 0 -> 18362 bytes .../models/__pycache__/siglip.cpython-312.pyc | Bin 0 -> 48402 bytes .../__pycache__/siglip2navit.cpython-312.pyc | Bin 0 -> 32260 bytes .../__pycache__/skyworkr1v.cpython-312.pyc | Bin 0 -> 35337 bytes .../__pycache__/smolvlm.cpython-312.pyc | Bin 0 -> 2255 bytes .../models/__pycache__/solar.cpython-312.pyc | Bin 0 -> 18938 bytes .../__pycache__/stablelm.cpython-312.pyc | Bin 0 -> 16058 bytes .../__pycache__/starcoder2.cpython-312.pyc | Bin 0 -> 15422 bytes .../__pycache__/step3_text.cpython-312.pyc | Bin 0 -> 22603 bytes .../__pycache__/step3_vl.cpython-312.pyc | Bin 0 -> 51493 bytes .../models/__pycache__/swin.cpython-312.pyc | Bin 0 -> 20902 bytes .../__pycache__/tarsier.cpython-312.pyc | Bin 0 -> 27177 bytes .../__pycache__/telechat2.cpython-312.pyc | 
Bin 0 -> 5791 bytes .../__pycache__/teleflm.cpython-312.pyc | Bin 0 -> 2894 bytes .../__pycache__/terratorch.cpython-312.pyc | Bin 0 -> 14318 bytes .../__pycache__/ultravox.cpython-312.pyc | Bin 0 -> 30805 bytes .../models/__pycache__/utils.cpython-312.pyc | Bin 0 -> 38448 bytes .../models/__pycache__/vision.cpython-312.pyc | Bin 0 -> 20197 bytes .../__pycache__/voxtral.cpython-312.pyc | Bin 0 -> 37827 bytes .../__pycache__/whisper.cpython-312.pyc | Bin 0 -> 42285 bytes .../models/__pycache__/zamba2.cpython-312.pyc | Bin 0 -> 35673 bytes model_executor/models/adapters.py | 543 ++ model_executor/models/afmoe.py | 711 ++ model_executor/models/aimv2.py | 247 + model_executor/models/apertus.py | 587 ++ model_executor/models/arcee.py | 439 ++ model_executor/models/arctic.py | 635 ++ model_executor/models/aria.py | 655 ++ model_executor/models/aya_vision.py | 450 ++ model_executor/models/baichuan.py | 496 ++ model_executor/models/bailing_moe.py | 646 ++ model_executor/models/bamba.py | 522 ++ model_executor/models/bee.py | 157 + model_executor/models/bert.py | 925 +++ model_executor/models/bert_with_rope.py | 732 ++ model_executor/models/blip.py | 349 + model_executor/models/blip2.py | 695 ++ model_executor/models/bloom.py | 390 ++ model_executor/models/chameleon.py | 1120 +++ model_executor/models/chatglm.py | 498 ++ model_executor/models/clip.py | 965 +++ model_executor/models/cohere2_vision.py | 472 ++ model_executor/models/commandr.py | 473 ++ model_executor/models/config.py | 503 ++ model_executor/models/dbrx.py | 482 ++ model_executor/models/deepencoder.py | 673 ++ model_executor/models/deepseek_eagle.py | 260 + model_executor/models/deepseek_mtp.py | 360 + model_executor/models/deepseek_ocr.py | 593 ++ model_executor/models/deepseek_v2.py | 1758 +++++ model_executor/models/deepseek_vl2.py | 655 ++ model_executor/models/dots1.py | 574 ++ model_executor/models/dots_ocr.py | 900 +++ model_executor/models/ernie45.py | 53 + model_executor/models/ernie45_moe.py | 760 ++ 
model_executor/models/ernie45_vl.py | 1742 +++++ model_executor/models/ernie45_vl_moe.py | 803 +++ model_executor/models/ernie_mtp.py | 279 + model_executor/models/exaone.py | 545 ++ model_executor/models/exaone4.py | 531 ++ model_executor/models/fairseq2_llama.py | 154 + model_executor/models/falcon.py | 545 ++ model_executor/models/falcon_h1.py | 685 ++ model_executor/models/flex_olmo.py | 155 + model_executor/models/fuyu.py | 373 + model_executor/models/gemma.py | 426 ++ model_executor/models/gemma2.py | 439 ++ model_executor/models/gemma3.py | 571 ++ model_executor/models/gemma3_mm.py | 741 ++ model_executor/models/gemma3n.py | 1166 ++++ model_executor/models/gemma3n_mm.py | 811 +++ model_executor/models/glm.py | 23 + model_executor/models/glm4.py | 305 + model_executor/models/glm4_1v.py | 1821 +++++ model_executor/models/glm4_moe.py | 754 ++ model_executor/models/glm4_moe_mtp.py | 359 + model_executor/models/glm4v.py | 784 +++ model_executor/models/gpt2.py | 397 ++ model_executor/models/gpt_bigcode.py | 339 + model_executor/models/gpt_j.py | 346 + model_executor/models/gpt_neox.py | 344 + model_executor/models/gpt_oss.py | 725 ++ model_executor/models/granite.py | 516 ++ model_executor/models/granite_speech.py | 913 +++ model_executor/models/granitemoe.py | 569 ++ model_executor/models/granitemoehybrid.py | 709 ++ model_executor/models/granitemoeshared.py | 333 + model_executor/models/gritlm.py | 245 + model_executor/models/grok1.py | 558 ++ model_executor/models/h2ovl.py | 554 ++ model_executor/models/hunyuan_v1.py | 1053 +++ model_executor/models/hyperclovax_vision.py | 1166 ++++ .../models/idefics2_vision_model.py | 426 ++ model_executor/models/idefics3.py | 717 ++ model_executor/models/interfaces.py | 1092 +++ model_executor/models/interfaces_base.py | 214 + model_executor/models/intern_vit.py | 453 ++ model_executor/models/internlm2.py | 460 ++ model_executor/models/internlm2_ve.py | 142 + model_executor/models/interns1.py | 830 +++ 
model_executor/models/interns1_vit.py | 432 ++ model_executor/models/internvl.py | 1452 ++++ model_executor/models/jais.py | 397 ++ model_executor/models/jamba.py | 610 ++ model_executor/models/jina_vl.py | 147 + model_executor/models/keye.py | 1761 +++++ model_executor/models/keye_vl1_5.py | 726 ++ model_executor/models/kimi_linear.py | 663 ++ model_executor/models/kimi_vl.py | 578 ++ model_executor/models/lfm2.py | 532 ++ model_executor/models/lfm2_moe.py | 762 ++ model_executor/models/lightonocr.py | 195 + model_executor/models/llama.py | 732 ++ model_executor/models/llama4.py | 859 +++ model_executor/models/llama4_eagle.py | 223 + model_executor/models/llama_eagle.py | 218 + model_executor/models/llama_eagle3.py | 367 + model_executor/models/llava.py | 842 +++ model_executor/models/llava_next.py | 583 ++ model_executor/models/llava_next_video.py | 467 ++ model_executor/models/llava_onevision.py | 923 +++ model_executor/models/longcat_flash.py | 749 ++ model_executor/models/longcat_flash_mtp.py | 349 + model_executor/models/mamba.py | 276 + model_executor/models/mamba2.py | 289 + model_executor/models/medusa.py | 179 + model_executor/models/midashenglm.py | 827 +++ model_executor/models/mimo.py | 188 + model_executor/models/mimo_mtp.py | 294 + model_executor/models/minicpm.py | 664 ++ model_executor/models/minicpm3.py | 242 + model_executor/models/minicpm_eagle.py | 389 ++ model_executor/models/minicpmo.py | 768 +++ model_executor/models/minicpmv.py | 1745 +++++ model_executor/models/minimax_m2.py | 552 ++ model_executor/models/minimax_text_01.py | 1012 +++ model_executor/models/minimax_vl_01.py | 396 ++ model_executor/models/mistral3.py | 637 ++ model_executor/models/mixtral.py | 621 ++ model_executor/models/mllama4.py | 1147 +++ model_executor/models/mlp_speculator.py | 235 + model_executor/models/modernbert.py | 450 ++ model_executor/models/module_mapping.py | 74 + model_executor/models/molmo.py | 1555 +++++ model_executor/models/moonvit.py | 677 ++ 
model_executor/models/mpt.py | 335 + model_executor/models/nano_nemotron_vl.py | 1740 +++++ model_executor/models/nemotron.py | 518 ++ model_executor/models/nemotron_h.py | 852 +++ model_executor/models/nemotron_nas.py | 491 ++ model_executor/models/nemotron_vl.py | 653 ++ model_executor/models/nvlm_d.py | 216 + model_executor/models/olmo.py | 414 ++ model_executor/models/olmo2.py | 454 ++ model_executor/models/olmoe.py | 498 ++ model_executor/models/openpangu.py | 1062 +++ model_executor/models/openpangu_mtp.py | 265 + model_executor/models/opt.py | 426 ++ model_executor/models/orion.py | 372 + model_executor/models/ouro.py | 516 ++ model_executor/models/ovis.py | 559 ++ model_executor/models/ovis2_5.py | 673 ++ model_executor/models/paddleocr_vl.py | 1407 ++++ model_executor/models/paligemma.py | 412 ++ model_executor/models/persimmon.py | 377 + model_executor/models/phi.py | 374 + model_executor/models/phi3.py | 18 + model_executor/models/phi3v.py | 737 ++ model_executor/models/phi4_multimodal.py | 1447 ++++ model_executor/models/phi4mm.py | 1253 ++++ model_executor/models/phi4mm_audio.py | 1296 ++++ model_executor/models/phi4mm_utils.py | 1907 +++++ model_executor/models/phimoe.py | 676 ++ model_executor/models/pixtral.py | 1355 ++++ model_executor/models/plamo2.py | 981 +++ model_executor/models/qwen.py | 371 + model_executor/models/qwen2.py | 541 ++ model_executor/models/qwen2_5_omni_thinker.py | 1246 ++++ model_executor/models/qwen2_5_vl.py | 1613 +++++ model_executor/models/qwen2_audio.py | 473 ++ model_executor/models/qwen2_moe.py | 596 ++ model_executor/models/qwen2_rm.py | 123 + model_executor/models/qwen2_vl.py | 1670 +++++ model_executor/models/qwen3.py | 353 + model_executor/models/qwen3_moe.py | 762 ++ model_executor/models/qwen3_next.py | 1397 ++++ model_executor/models/qwen3_next_mtp.py | 296 + .../models/qwen3_omni_moe_thinker.py | 1721 +++++ model_executor/models/qwen3_vl.py | 1673 +++++ model_executor/models/qwen3_vl_moe.py | 415 ++ 
model_executor/models/qwen_vl.py | 804 +++ model_executor/models/radio.py | 555 ++ model_executor/models/registry.py | 1155 ++++ model_executor/models/roberta.py | 259 + model_executor/models/rvl.py | 107 + model_executor/models/seed_oss.py | 497 ++ model_executor/models/siglip.py | 1174 ++++ model_executor/models/siglip2navit.py | 724 ++ model_executor/models/skyworkr1v.py | 953 +++ model_executor/models/smolvlm.py | 38 + model_executor/models/solar.py | 502 ++ model_executor/models/stablelm.py | 359 + model_executor/models/starcoder2.py | 367 + model_executor/models/step3_text.py | 559 ++ model_executor/models/step3_vl.py | 1148 +++ model_executor/models/swin.py | 514 ++ model_executor/models/tarsier.py | 619 ++ model_executor/models/telechat2.py | 153 + model_executor/models/teleflm.py | 79 + model_executor/models/terratorch.py | 319 + .../models/transformers/__init__.py | 127 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 4347 bytes .../__pycache__/base.cpython-312.pyc | Bin 0 -> 20334 bytes .../__pycache__/causal.cpython-312.pyc | Bin 0 -> 2697 bytes .../__pycache__/legacy.cpython-312.pyc | Bin 0 -> 2531 bytes .../__pycache__/moe.cpython-312.pyc | Bin 0 -> 13223 bytes .../__pycache__/multimodal.cpython-312.pyc | Bin 0 -> 17102 bytes .../__pycache__/pooling.cpython-312.pyc | Bin 0 -> 4751 bytes .../__pycache__/utils.cpython-312.pyc | Bin 0 -> 8536 bytes model_executor/models/transformers/base.py | 464 ++ model_executor/models/transformers/causal.py | 65 + model_executor/models/transformers/legacy.py | 90 + model_executor/models/transformers/moe.py | 318 + .../models/transformers/multimodal.py | 411 ++ model_executor/models/transformers/pooling.py | 119 + model_executor/models/transformers/utils.py | 207 + model_executor/models/ultravox.py | 681 ++ model_executor/models/utils.py | 877 +++ model_executor/models/vision.py | 552 ++ model_executor/models/voxtral.py | 845 +++ model_executor/models/whisper.py | 959 +++ model_executor/models/zamba2.py | 986 +++ 
model_executor/parameter.py | 649 ++ model_executor/utils.py | 94 + model_executor/warmup/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 171 bytes .../deep_gemm_warmup.cpython-312.pyc | Bin 0 -> 14426 bytes .../__pycache__/kernel_warmup.cpython-312.pyc | Bin 0 -> 3870 bytes model_executor/warmup/deep_gemm_warmup.py | 314 + model_executor/warmup/kernel_warmup.py | 98 + multimodal/__init__.py | 40 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 699 bytes multimodal/__pycache__/audio.cpython-312.pyc | Bin 0 -> 5231 bytes multimodal/__pycache__/base.cpython-312.pyc | Bin 0 -> 1308 bytes multimodal/__pycache__/cache.cpython-312.pyc | Bin 0 -> 29736 bytes multimodal/__pycache__/evs.cpython-312.pyc | Bin 0 -> 10336 bytes multimodal/__pycache__/hasher.cpython-312.pyc | Bin 0 -> 5420 bytes multimodal/__pycache__/image.cpython-312.pyc | Bin 0 -> 7508 bytes multimodal/__pycache__/inputs.cpython-312.pyc | Bin 0 -> 39325 bytes multimodal/__pycache__/parse.cpython-312.pyc | Bin 0 -> 25551 bytes .../__pycache__/processing.cpython-312.pyc | Bin 0 -> 75627 bytes .../__pycache__/profiling.cpython-312.pyc | Bin 0 -> 14921 bytes .../__pycache__/registry.cpython-312.pyc | Bin 0 -> 13943 bytes multimodal/__pycache__/utils.cpython-312.pyc | Bin 0 -> 20131 bytes multimodal/__pycache__/video.cpython-312.pyc | Bin 0 -> 13385 bytes multimodal/audio.py | 118 + multimodal/base.py | 26 + multimodal/cache.py | 755 ++ multimodal/evs.py | 294 + multimodal/hasher.py | 106 + multimodal/image.py | 130 + multimodal/inputs.py | 1036 +++ multimodal/parse.py | 544 ++ multimodal/processing.py | 2186 ++++++ multimodal/profiling.py | 369 + multimodal/registry.py | 360 + multimodal/utils.py | 512 ++ multimodal/video.py | 306 + outputs.py | 345 + platforms/__init__.py | 277 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 10992 bytes platforms/__pycache__/cpu.cpython-312.pyc | Bin 0 -> 16371 bytes platforms/__pycache__/cuda.cpython-312.pyc | Bin 0 -> 26113 bytes 
.../__pycache__/interface.cpython-312.pyc | Bin 0 -> 27241 bytes platforms/__pycache__/rocm.cpython-312.pyc | Bin 0 -> 22319 bytes platforms/__pycache__/tpu.cpython-312.pyc | Bin 0 -> 12381 bytes platforms/__pycache__/xpu.cpython-312.pyc | Bin 0 -> 12738 bytes platforms/cpu.py | 414 ++ platforms/cuda.py | 656 ++ platforms/interface.py | 641 ++ platforms/rocm.py | 466 ++ platforms/tpu.py | 276 + platforms/xpu.py | 274 + plugins/__init__.py | 78 + plugins/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 2732 bytes plugins/io_processors/__init__.py | 68 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 2457 bytes .../__pycache__/interface.cpython-312.pyc | Bin 0 -> 3760 bytes plugins/io_processors/interface.py | 77 + plugins/lora_resolvers/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 172 bytes .../filesystem_resolver.cpython-312.pyc | Bin 0 -> 2738 bytes plugins/lora_resolvers/filesystem_resolver.py | 52 + pooling_params.py | 228 + profiler/__init__.py | 0 profiler/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 158 bytes .../__pycache__/gpu_profiler.cpython-312.pyc | Bin 0 -> 2114 bytes .../layerwise_profile.cpython-312.pyc | Bin 0 -> 19953 bytes profiler/__pycache__/utils.cpython-312.pyc | Bin 0 -> 8207 bytes profiler/gpu_profiler.py | 37 + profiler/layerwise_profile.py | 392 ++ profiler/utils.py | 151 + py.typed | 2 + ray/__init__.py | 0 ray/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 153 bytes ray/__pycache__/lazy_utils.cpython-312.pyc | Bin 0 -> 881 bytes ray/__pycache__/ray_env.cpython-312.pyc | Bin 0 -> 3096 bytes ray/lazy_utils.py | 26 + ray/ray_env.py | 79 + reasoning/__init__.py | 92 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 1804 bytes .../abs_reasoning_parsers.cpython-312.pyc | Bin 0 -> 12109 bytes .../__pycache__/basic_parsers.cpython-312.pyc | Bin 0 -> 6808 bytes ...epseek_r1_reasoning_parser.cpython-312.pyc | Bin 0 -> 2509 bytes ...epseek_v3_reasoning_parser.cpython-312.pyc | Bin 0 -> 3258 bytes 
.../ernie45_reasoning_parser.cpython-312.pyc | Bin 0 -> 7251 bytes .../glm4_moe_reasoning_parser.cpython-312.pyc | Bin 0 -> 6635 bytes .../gptoss_reasoning_parser.cpython-312.pyc | Bin 0 -> 7166 bytes .../granite_reasoning_parser.cpython-312.pyc | Bin 0 -> 13505 bytes ...yuan_a13b_reasoning_parser.cpython-312.pyc | Bin 0 -> 9658 bytes .../identity_reasoning_parser.cpython-312.pyc | Bin 0 -> 2671 bytes ...inimax_m2_reasoning_parser.cpython-312.pyc | Bin 0 -> 3820 bytes .../mistral_reasoning_parser.cpython-312.pyc | Bin 0 -> 2734 bytes .../olmo3_reasoning_parser.cpython-312.pyc | Bin 0 -> 11043 bytes .../qwen3_reasoning_parser.cpython-312.pyc | Bin 0 -> 2668 bytes .../seedoss_reasoning_parser.cpython-312.pyc | Bin 0 -> 1277 bytes .../step3_reasoning_parser.cpython-312.pyc | Bin 0 -> 4618 bytes reasoning/abs_reasoning_parsers.py | 290 + reasoning/basic_parsers.py | 162 + reasoning/deepseek_r1_reasoning_parser.py | 67 + reasoning/deepseek_v3_reasoning_parser.py | 62 + reasoning/ernie45_reasoning_parser.py | 165 + reasoning/glm4_moe_reasoning_parser.py | 171 + reasoning/gptoss_reasoning_parser.py | 173 + reasoning/granite_reasoning_parser.py | 363 + reasoning/hunyuan_a13b_reasoning_parser.py | 237 + reasoning/identity_reasoning_parser.py | 58 + reasoning/minimax_m2_reasoning_parser.py | 67 + reasoning/mistral_reasoning_parser.py | 55 + reasoning/olmo3_reasoning_parser.py | 302 + reasoning/qwen3_reasoning_parser.py | 67 + reasoning/seedoss_reasoning_parser.py | 27 + reasoning/step3_reasoning_parser.py | 107 + sampling_params.py | 669 ++ scalar_type.py | 355 + scripts.py | 17 + sequence.py | 98 + tasks.py | 13 + third_party/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 161 bytes .../__pycache__/pynvml.cpython-312.pyc | Bin 0 -> 250620 bytes third_party/pynvml.py | 6140 +++++++++++++++++ tracing.py | 135 + transformers_utils/__init__.py | 26 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 934 bytes .../__pycache__/config.cpython-312.pyc | Bin 0 
-> 42255 bytes .../config_parser_base.cpython-312.pyc | Bin 0 -> 961 bytes .../detokenizer_utils.cpython-312.pyc | Bin 0 -> 5861 bytes .../dynamic_module.cpython-312.pyc | Bin 0 -> 1832 bytes .../__pycache__/processor.cpython-312.pyc | Bin 0 -> 12491 bytes .../__pycache__/runai_utils.cpython-312.pyc | Bin 0 -> 4995 bytes .../__pycache__/s3_utils.cpython-312.pyc | Bin 0 -> 4136 bytes .../__pycache__/tokenizer.cpython-312.pyc | Bin 0 -> 10705 bytes .../tokenizer_base.cpython-312.pyc | Bin 0 -> 7013 bytes .../__pycache__/utils.cpython-312.pyc | Bin 0 -> 6004 bytes transformers_utils/chat_templates/__init__.py | 5 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 280 bytes .../__pycache__/registry.cpython-312.pyc | Bin 0 -> 2553 bytes transformers_utils/chat_templates/registry.py | 73 + .../chat_templates/template_basic.jinja | 3 + .../chat_templates/template_blip2.jinja | 11 + .../chat_templates/template_chatml.jinja | 10 + .../template_deepseek_ocr.jinja | 14 + .../template_deepseek_vl2.jinja | 23 + .../chat_templates/template_fuyu.jinja | 3 + .../chat_templates/template_minicpmv45.jinja | 93 + transformers_utils/config.py | 1203 ++++ transformers_utils/config_parser_base.py | 20 + transformers_utils/configs/__init__.py | 70 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 2553 bytes .../configs/__pycache__/afmoe.cpython-312.pyc | Bin 0 -> 3223 bytes .../__pycache__/arctic.cpython-312.pyc | Bin 0 -> 9285 bytes .../__pycache__/chatglm.cpython-312.pyc | Bin 0 -> 2241 bytes .../__pycache__/deepseek_vl2.cpython-312.pyc | Bin 0 -> 4846 bytes .../__pycache__/dotsocr.cpython-312.pyc | Bin 0 -> 2951 bytes .../configs/__pycache__/eagle.cpython-312.pyc | Bin 0 -> 3182 bytes .../__pycache__/falcon.cpython-312.pyc | Bin 0 -> 2401 bytes .../__pycache__/flex_olmo.cpython-312.pyc | Bin 0 -> 2330 bytes .../configs/__pycache__/jais.cpython-312.pyc | Bin 0 -> 9355 bytes .../__pycache__/kimi_linear.cpython-312.pyc | Bin 0 -> 5090 bytes .../__pycache__/kimi_vl.cpython-312.pyc | 
Bin 0 -> 1518 bytes .../__pycache__/lfm2_moe.cpython-312.pyc | Bin 0 -> 7333 bytes .../__pycache__/medusa.cpython-312.pyc | Bin 0 -> 2766 bytes .../__pycache__/midashenglm.cpython-312.pyc | Bin 0 -> 3253 bytes .../__pycache__/mistral.cpython-312.pyc | Bin 0 -> 6508 bytes .../mlp_speculator.cpython-312.pyc | Bin 0 -> 2721 bytes .../__pycache__/moonvit.cpython-312.pyc | Bin 0 -> 1349 bytes .../__pycache__/nemotron.cpython-312.pyc | Bin 0 -> 8256 bytes .../__pycache__/nemotron_h.cpython-312.pyc | Bin 0 -> 11584 bytes .../configs/__pycache__/olmo3.cpython-312.pyc | Bin 0 -> 2419 bytes .../configs/__pycache__/ovis.cpython-312.pyc | Bin 0 -> 7726 bytes .../__pycache__/qwen3_next.cpython-312.pyc | Bin 0 -> 13396 bytes .../configs/__pycache__/radio.cpython-312.pyc | Bin 0 -> 4110 bytes .../__pycache__/step3_vl.cpython-312.pyc | Bin 0 -> 4707 bytes .../__pycache__/ultravox.cpython-312.pyc | Bin 0 -> 4685 bytes transformers_utils/configs/afmoe.py | 84 + transformers_utils/configs/arctic.py | 206 + transformers_utils/configs/chatglm.py | 75 + transformers_utils/configs/deepseek_vl2.py | 126 + transformers_utils/configs/dotsocr.py | 71 + transformers_utils/configs/eagle.py | 84 + transformers_utils/configs/falcon.py | 89 + transformers_utils/configs/flex_olmo.py | 77 + transformers_utils/configs/jais.py | 243 + transformers_utils/configs/kimi_linear.py | 144 + transformers_utils/configs/kimi_vl.py | 38 + transformers_utils/configs/lfm2_moe.py | 159 + transformers_utils/configs/medusa.py | 65 + transformers_utils/configs/midashenglm.py | 103 + transformers_utils/configs/mistral.py | 174 + transformers_utils/configs/mlp_speculator.py | 69 + transformers_utils/configs/moonvit.py | 33 + transformers_utils/configs/nemotron.py | 212 + transformers_utils/configs/nemotron_h.py | 282 + transformers_utils/configs/olmo3.py | 79 + transformers_utils/configs/ovis.py | 182 + transformers_utils/configs/qwen3_next.py | 274 + transformers_utils/configs/radio.py | 89 + 
.../configs/speculators/__init__.py | 2 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 188 bytes .../__pycache__/algos.cpython-312.pyc | Bin 0 -> 1843 bytes .../__pycache__/base.cpython-312.pyc | Bin 0 -> 4451 bytes .../configs/speculators/algos.py | 38 + .../configs/speculators/base.py | 114 + transformers_utils/configs/step3_vl.py | 174 + transformers_utils/configs/ultravox.py | 118 + transformers_utils/detokenizer_utils.py | 198 + transformers_utils/dynamic_module.py | 59 + transformers_utils/processor.py | 402 ++ transformers_utils/processors/__init__.py | 15 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 727 bytes .../__pycache__/deepseek_ocr.cpython-312.pyc | Bin 0 -> 16200 bytes .../__pycache__/deepseek_vl2.cpython-312.pyc | Bin 0 -> 14878 bytes .../__pycache__/ovis.cpython-312.pyc | Bin 0 -> 20039 bytes .../__pycache__/ovis2_5.cpython-312.pyc | Bin 0 -> 19896 bytes transformers_utils/processors/deepseek_ocr.py | 438 ++ transformers_utils/processors/deepseek_vl2.py | 406 ++ transformers_utils/processors/ovis.py | 453 ++ transformers_utils/processors/ovis2_5.py | 468 ++ transformers_utils/runai_utils.py | 104 + transformers_utils/s3_utils.py | 95 + transformers_utils/tokenizer.py | 293 + transformers_utils/tokenizer_base.py | 155 + transformers_utils/tokenizers/__init__.py | 16 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 365 bytes .../__pycache__/mistral.cpython-312.pyc | Bin 0 -> 21236 bytes transformers_utils/tokenizers/mistral.py | 502 ++ transformers_utils/utils.py | 130 + triton_utils/__init__.py | 19 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 575 bytes .../__pycache__/importing.cpython-312.pyc | Bin 0 -> 4223 bytes triton_utils/importing.py | 103 + usage/__init__.py | 0 usage/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 155 bytes usage/__pycache__/usage_lib.cpython-312.pyc | Bin 0 -> 12486 bytes usage/usage_lib.py | 294 + utils/__init__.py | 82 + utils/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 3068 bytes 
.../argparse_utils.cpython-312.pyc | Bin 0 -> 21336 bytes utils/__pycache__/async_utils.cpython-312.pyc | Bin 0 -> 15951 bytes utils/__pycache__/cache.cpython-312.pyc | Bin 0 -> 10893 bytes .../collection_utils.cpython-312.pyc | Bin 0 -> 7211 bytes utils/__pycache__/counter.cpython-312.pyc | Bin 0 -> 2586 bytes utils/__pycache__/deep_gemm.cpython-312.pyc | Bin 0 -> 15767 bytes utils/__pycache__/flashinfer.cpython-312.pyc | Bin 0 -> 18119 bytes utils/__pycache__/func_utils.cpython-312.pyc | Bin 0 -> 8474 bytes utils/__pycache__/gc_utils.cpython-312.pyc | Bin 0 -> 6677 bytes utils/__pycache__/hashing.cpython-312.pyc | Bin 0 -> 2462 bytes .../__pycache__/import_utils.cpython-312.pyc | Bin 0 -> 19180 bytes utils/__pycache__/jsontree.cpython-312.pyc | Bin 0 -> 5946 bytes utils/__pycache__/math_utils.cpython-312.pyc | Bin 0 -> 1400 bytes .../__pycache__/mem_constants.cpython-312.pyc | Bin 0 -> 275 bytes utils/__pycache__/mem_utils.cpython-312.pyc | Bin 0 -> 10482 bytes utils/__pycache__/nccl.cpython-312.pyc | Bin 0 -> 2862 bytes .../__pycache__/network_utils.cpython-312.pyc | Bin 0 -> 14246 bytes .../platform_utils.cpython-312.pyc | Bin 0 -> 3106 bytes utils/__pycache__/profiling.cpython-312.pyc | Bin 0 -> 2301 bytes utils/__pycache__/registry.cpython-312.pyc | Bin 0 -> 2164 bytes .../__pycache__/serial_utils.cpython-312.pyc | Bin 0 -> 6124 bytes .../__pycache__/system_utils.cpython-312.pyc | Bin 0 -> 9100 bytes .../__pycache__/tensor_schema.cpython-312.pyc | Bin 0 -> 9848 bytes utils/__pycache__/torch_utils.cpython-312.pyc | Bin 0 -> 27124 bytes utils/argparse_utils.py | 487 ++ utils/async_utils.py | 303 + utils/cache.py | 214 + utils/collection_utils.py | 139 + utils/counter.py | 45 + utils/deep_gemm.py | 391 ++ utils/flashinfer.py | 490 ++ utils/func_utils.py | 236 + utils/gc_utils.py | 147 + utils/hashing.py | 63 + utils/import_utils.py | 411 ++ utils/jsontree.py | 165 + utils/math_utils.py | 32 + utils/mem_constants.py | 13 + utils/mem_utils.py | 232 + utils/nccl.py 
| 64 + utils/network_utils.py | 331 + utils/platform_utils.py | 59 + utils/profiling.py | 56 + utils/registry.py | 49 + utils/serial_utils.py | 169 + utils/system_utils.py | 229 + utils/tensor_schema.py | 255 + utils/torch_utils.py | 658 ++ v1/__init__.py | 0 v1/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 152 bytes .../cudagraph_dispatcher.cpython-312.pyc | Bin 0 -> 6234 bytes .../kv_cache_interface.cpython-312.pyc | Bin 0 -> 23029 bytes v1/__pycache__/outputs.cpython-312.pyc | Bin 0 -> 8571 bytes v1/__pycache__/request.cpython-312.pyc | Bin 0 -> 11233 bytes v1/__pycache__/serial_utils.cpython-312.pyc | Bin 0 -> 26108 bytes v1/__pycache__/utils.cpython-312.pyc | Bin 0 -> 18864 bytes v1/attention/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 162 bytes v1/attention/backends/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 171 bytes .../__pycache__/cpu_attn.cpython-312.pyc | Bin 0 -> 19526 bytes .../__pycache__/flash_attn.cpython-312.pyc | Bin 0 -> 40752 bytes .../__pycache__/flashinfer.cpython-312.pyc | Bin 0 -> 54410 bytes .../flex_attention.cpython-312.pyc | Bin 0 -> 39878 bytes .../__pycache__/gdn_attn.cpython-312.pyc | Bin 0 -> 14183 bytes .../__pycache__/linear_attn.cpython-312.pyc | Bin 0 -> 3100 bytes .../__pycache__/mamba1_attn.cpython-312.pyc | Bin 0 -> 5797 bytes .../__pycache__/mamba2_attn.cpython-312.pyc | Bin 0 -> 12457 bytes .../__pycache__/mamba_attn.cpython-312.pyc | Bin 0 -> 4703 bytes .../__pycache__/pallas.cpython-312.pyc | Bin 0 -> 16914 bytes .../__pycache__/rocm_aiter_fa.cpython-312.pyc | Bin 0 -> 29813 bytes .../rocm_aiter_unified_attn.cpython-312.pyc | Bin 0 -> 7641 bytes .../__pycache__/rocm_attn.cpython-312.pyc | Bin 0 -> 13999 bytes .../short_conv_attn.cpython-312.pyc | Bin 0 -> 3891 bytes .../__pycache__/tree_attn.cpython-312.pyc | Bin 0 -> 16892 bytes .../__pycache__/triton_attn.cpython-312.pyc | Bin 0 -> 14470 bytes .../__pycache__/utils.cpython-312.pyc | Bin 0 -> 38931 bytes 
.../__pycache__/xformers.cpython-312.pyc | Bin 0 -> 15355 bytes v1/attention/backends/cpu_attn.py | 496 ++ v1/attention/backends/flash_attn.py | 1215 ++++ v1/attention/backends/flashinfer.py | 1572 +++++ v1/attention/backends/flex_attention.py | 926 +++ v1/attention/backends/gdn_attn.py | 387 ++ v1/attention/backends/linear_attn.py | 74 + v1/attention/backends/mamba1_attn.py | 165 + v1/attention/backends/mamba2_attn.py | 354 + v1/attention/backends/mamba_attn.py | 115 + v1/attention/backends/mla/__init__.py | 0 .../mla/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 175 bytes .../mla/__pycache__/common.cpython-312.pyc | Bin 0 -> 83013 bytes .../__pycache__/cutlass_mla.cpython-312.pyc | Bin 0 -> 11604 bytes .../__pycache__/flashattn_mla.cpython-312.pyc | Bin 0 -> 13253 bytes .../flashinfer_mla.cpython-312.pyc | Bin 0 -> 7353 bytes .../mla/__pycache__/flashmla.cpython-312.pyc | Bin 0 -> 12446 bytes .../flashmla_sparse.cpython-312.pyc | Bin 0 -> 21861 bytes .../mla/__pycache__/indexer.cpython-312.pyc | Bin 0 -> 14370 bytes .../rocm_aiter_mla.cpython-312.pyc | Bin 0 -> 11605 bytes .../__pycache__/triton_mla.cpython-312.pyc | Bin 0 -> 8208 bytes v1/attention/backends/mla/common.py | 2200 ++++++ v1/attention/backends/mla/cutlass_mla.py | 275 + v1/attention/backends/mla/flashattn_mla.py | 337 + v1/attention/backends/mla/flashinfer_mla.py | 171 + v1/attention/backends/mla/flashmla.py | 314 + v1/attention/backends/mla/flashmla_sparse.py | 560 ++ v1/attention/backends/mla/indexer.py | 362 + v1/attention/backends/mla/rocm_aiter_mla.py | 294 + v1/attention/backends/mla/triton_mla.py | 206 + v1/attention/backends/pallas.py | 436 ++ v1/attention/backends/rocm_aiter_fa.py | 816 +++ .../backends/rocm_aiter_unified_attn.py | 196 + v1/attention/backends/rocm_attn.py | 362 + v1/attention/backends/short_conv_attn.py | 105 + v1/attention/backends/tree_attn.py | 425 ++ v1/attention/backends/triton_attn.py | 373 + v1/attention/backends/utils.py | 1117 +++ 
v1/attention/backends/xformers.py | 417 ++ v1/core/__init__.py | 0 v1/core/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 157 bytes .../__pycache__/block_pool.cpython-312.pyc | Bin 0 -> 16194 bytes .../encoder_cache_manager.cpython-312.pyc | Bin 0 -> 14635 bytes .../kv_cache_coordinator.cpython-312.pyc | Bin 0 -> 18956 bytes .../kv_cache_manager.cpython-312.pyc | Bin 0 -> 18258 bytes .../kv_cache_utils.cpython-312.pyc | Bin 0 -> 49762 bytes ...ngle_type_kv_cache_manager.cpython-312.pyc | Bin 0 -> 29272 bytes v1/core/block_pool.py | 428 ++ v1/core/encoder_cache_manager.py | 343 + v1/core/kv_cache_coordinator.py | 480 ++ v1/core/kv_cache_manager.py | 420 ++ v1/core/kv_cache_utils.py | 1356 ++++ v1/core/sched/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 163 bytes .../async_scheduler.cpython-312.pyc | Bin 0 -> 2774 bytes .../__pycache__/interface.cpython-312.pyc | Bin 0 -> 8735 bytes .../sched/__pycache__/output.cpython-312.pyc | Bin 0 -> 7276 bytes .../__pycache__/request_queue.cpython-312.pyc | Bin 0 -> 11565 bytes .../__pycache__/scheduler.cpython-312.pyc | Bin 0 -> 51758 bytes .../sched/__pycache__/utils.cpython-312.pyc | Bin 0 -> 2838 bytes v1/core/sched/async_scheduler.py | 62 + v1/core/sched/interface.py | 181 + v1/core/sched/output.py | 202 + v1/core/sched/request_queue.py | 221 + v1/core/sched/scheduler.py | 1617 +++++ v1/core/sched/utils.py | 72 + v1/core/single_type_kv_cache_manager.py | 736 ++ v1/cudagraph_dispatcher.py | 148 + v1/engine/__init__.py | 206 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 7557 bytes .../__pycache__/async_llm.cpython-312.pyc | Bin 0 -> 33097 bytes .../__pycache__/coordinator.cpython-312.pyc | Bin 0 -> 13737 bytes v1/engine/__pycache__/core.cpython-312.pyc | Bin 0 -> 55949 bytes .../__pycache__/core_client.cpython-312.pyc | Bin 0 -> 68308 bytes .../__pycache__/detokenizer.cpython-312.pyc | Bin 0 -> 13847 bytes .../__pycache__/exceptions.cpython-312.pyc | Bin 0 -> 1127 bytes 
.../__pycache__/llm_engine.cpython-312.pyc | Bin 0 -> 19373 bytes .../__pycache__/logprobs.cpython-312.pyc | Bin 0 -> 6010 bytes .../output_processor.cpython-312.pyc | Bin 0 -> 24927 bytes .../parallel_sampling.cpython-312.pyc | Bin 0 -> 5376 bytes .../__pycache__/processor.cpython-312.pyc | Bin 0 -> 22250 bytes v1/engine/__pycache__/utils.cpython-312.pyc | Bin 0 -> 38194 bytes v1/engine/async_llm.py | 797 +++ v1/engine/coordinator.py | 377 + v1/engine/core.py | 1420 ++++ v1/engine/core_client.py | 1400 ++++ v1/engine/detokenizer.py | 351 + v1/engine/exceptions.py | 18 + v1/engine/llm_engine.py | 408 ++ v1/engine/logprobs.py | 182 + v1/engine/output_processor.py | 642 ++ v1/engine/parallel_sampling.py | 145 + v1/engine/processor.py | 621 ++ v1/engine/utils.py | 1072 +++ v1/executor/__init__.py | 6 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 302 bytes .../__pycache__/abstract.cpython-312.pyc | Bin 0 -> 16251 bytes .../multiproc_executor.cpython-312.pyc | Bin 0 -> 36691 bytes .../ray_distributed_executor.cpython-312.pyc | Bin 0 -> 295 bytes .../__pycache__/ray_executor.cpython-312.pyc | Bin 0 -> 23654 bytes .../__pycache__/ray_utils.cpython-312.pyc | Bin 0 -> 20559 bytes .../uniproc_executor.cpython-312.pyc | Bin 0 -> 9603 bytes v1/executor/abstract.py | 352 + v1/executor/multiproc_executor.py | 877 +++ v1/executor/ray_distributed_executor.py | 8 + v1/executor/ray_executor.py | 626 ++ v1/executor/ray_utils.py | 498 ++ v1/executor/uniproc_executor.py | 183 + v1/kv_cache_interface.py | 443 ++ v1/kv_offload/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 163 bytes .../__pycache__/abstract.cpython-312.pyc | Bin 0 -> 6713 bytes .../__pycache__/arc_manager.cpython-312.pyc | Bin 0 -> 10966 bytes .../__pycache__/backend.cpython-312.pyc | Bin 0 -> 4102 bytes v1/kv_offload/__pycache__/cpu.cpython-312.pyc | Bin 0 -> 4414 bytes .../__pycache__/factory.cpython-312.pyc | Bin 0 -> 2858 bytes .../__pycache__/lru_manager.cpython-312.pyc | Bin 0 -> 6122 
bytes .../__pycache__/mediums.cpython-312.pyc | Bin 0 -> 1873 bytes .../__pycache__/spec.cpython-312.pyc | Bin 0 -> 2790 bytes v1/kv_offload/abstract.py | 161 + v1/kv_offload/arc_manager.py | 237 + v1/kv_offload/backend.py | 97 + v1/kv_offload/backends/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 172 bytes .../backends/__pycache__/cpu.cpython-312.pyc | Bin 0 -> 3682 bytes v1/kv_offload/backends/cpu.py | 62 + v1/kv_offload/cpu.py | 93 + v1/kv_offload/factory.py | 56 + v1/kv_offload/lru_manager.py | 139 + v1/kv_offload/mediums.py | 39 + v1/kv_offload/spec.py | 62 + v1/kv_offload/worker/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 170 bytes .../__pycache__/cpu_gpu.cpython-312.pyc | Bin 0 -> 7817 bytes .../worker/__pycache__/worker.cpython-312.pyc | Bin 0 -> 5297 bytes v1/kv_offload/worker/cpu_gpu.py | 185 + v1/kv_offload/worker/worker.py | 144 + v1/metrics/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 160 bytes .../__pycache__/loggers.cpython-312.pyc | Bin 0 -> 42175 bytes .../__pycache__/prometheus.cpython-312.pyc | Bin 0 -> 3416 bytes .../__pycache__/ray_wrappers.cpython-312.pyc | Bin 0 -> 7801 bytes v1/metrics/__pycache__/reader.cpython-312.pyc | Bin 0 -> 8674 bytes v1/metrics/__pycache__/stats.cpython-312.pyc | Bin 0 -> 17798 bytes v1/metrics/loggers.py | 1238 ++++ v1/metrics/prometheus.py | 82 + v1/metrics/ray_wrappers.py | 169 + v1/metrics/reader.py | 257 + v1/metrics/stats.py | 420 ++ v1/outputs.py | 249 + v1/pool/__init__.py | 0 v1/pool/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 157 bytes v1/pool/__pycache__/metadata.cpython-312.pyc | Bin 0 -> 3972 bytes v1/pool/metadata.py | 82 + v1/request.py | 259 + v1/sample/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 159 bytes .../__pycache__/metadata.cpython-312.pyc | Bin 0 -> 1529 bytes .../rejection_sampler.cpython-312.pyc | Bin 0 -> 26659 bytes v1/sample/__pycache__/sampler.cpython-312.pyc | Bin 0 -> 12009 bytes 
v1/sample/logits_processor/__init__.py | 352 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 12668 bytes .../__pycache__/builtin.cpython-312.pyc | Bin 0 -> 13159 bytes .../__pycache__/interface.cpython-312.pyc | Bin 0 -> 3583 bytes .../__pycache__/state.cpython-312.pyc | Bin 0 -> 7296 bytes v1/sample/logits_processor/builtin.py | 274 + v1/sample/logits_processor/interface.py | 106 + v1/sample/logits_processor/state.py | 165 + v1/sample/metadata.py | 44 + v1/sample/ops/__init__.py | 0 .../ops/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 163 bytes .../ops/__pycache__/bad_words.cpython-312.pyc | Bin 0 -> 2096 bytes .../ops/__pycache__/logprobs.cpython-312.pyc | Bin 0 -> 1306 bytes .../ops/__pycache__/penalties.cpython-312.pyc | Bin 0 -> 1925 bytes .../topk_topp_sampler.cpython-312.pyc | Bin 0 -> 12753 bytes v1/sample/ops/bad_words.py | 52 + v1/sample/ops/logprobs.py | 25 + v1/sample/ops/penalties.py | 57 + v1/sample/ops/topk_topp_sampler.py | 290 + v1/sample/rejection_sampler.py | 791 +++ v1/sample/sampler.py | 316 + v1/sample/tpu/__init__.py | 0 .../tpu/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 163 bytes .../tpu/__pycache__/metadata.cpython-312.pyc | Bin 0 -> 5134 bytes .../tpu/__pycache__/sampler.cpython-312.pyc | Bin 0 -> 9288 bytes v1/sample/tpu/metadata.py | 120 + v1/sample/tpu/sampler.py | 215 + v1/serial_utils.py | 532 ++ v1/spec_decode/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 164 bytes .../__pycache__/eagle.cpython-312.pyc | Bin 0 -> 44943 bytes .../__pycache__/medusa.cpython-312.pyc | Bin 0 -> 3886 bytes .../__pycache__/metadata.cpython-312.pyc | Bin 0 -> 2938 bytes .../__pycache__/metrics.cpython-312.pyc | Bin 0 -> 9531 bytes .../ngram_proposer.cpython-312.pyc | Bin 0 -> 8691 bytes .../suffix_decoding.cpython-312.pyc | Bin 0 -> 4670 bytes .../__pycache__/utils.cpython-312.pyc | Bin 0 -> 858 bytes v1/spec_decode/eagle.py | 1229 ++++ v1/spec_decode/medusa.py | 73 + v1/spec_decode/metadata.py | 66 + 
v1/spec_decode/metrics.py | 224 + v1/spec_decode/ngram_proposer.py | 291 + v1/spec_decode/suffix_decoding.py | 103 + v1/spec_decode/utils.py | 16 + v1/structured_output/__init__.py | 338 + .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 11672 bytes .../backend_guidance.cpython-312.pyc | Bin 0 -> 11526 bytes ...backend_lm_format_enforcer.cpython-312.pyc | Bin 0 -> 8551 bytes .../backend_outlines.cpython-312.pyc | Bin 0 -> 14526 bytes .../__pycache__/backend_types.cpython-312.pyc | Bin 0 -> 5691 bytes .../backend_xgrammar.cpython-312.pyc | Bin 0 -> 15035 bytes .../__pycache__/request.cpython-312.pyc | Bin 0 -> 4702 bytes .../__pycache__/utils.cpython-312.pyc | Bin 0 -> 18018 bytes v1/structured_output/backend_guidance.py | 265 + .../backend_lm_format_enforcer.py | 177 + v1/structured_output/backend_outlines.py | 324 + v1/structured_output/backend_types.py | 136 + v1/structured_output/backend_xgrammar.py | 362 + v1/structured_output/request.py | 94 + v1/structured_output/utils.py | 469 ++ v1/utils.py | 414 ++ v1/worker/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 159 bytes .../__pycache__/block_table.cpython-312.pyc | Bin 0 -> 14690 bytes .../cpu_model_runner.cpython-312.pyc | Bin 0 -> 7884 bytes .../__pycache__/cpu_worker.cpython-312.pyc | Bin 0 -> 10410 bytes .../__pycache__/dp_utils.cpython-312.pyc | Bin 0 -> 7631 bytes ...nnector_model_runner_mixin.cpython-312.pyc | Bin 0 -> 3934 bytes .../gpu_input_batch.cpython-312.pyc | Bin 0 -> 40325 bytes .../gpu_model_runner.cpython-312.pyc | Bin 0 -> 184645 bytes .../gpu_ubatch_wrapper.cpython-312.pyc | Bin 0 -> 19075 bytes .../__pycache__/gpu_worker.cpython-312.pyc | Bin 0 -> 41672 bytes ...nnector_model_runner_mixin.cpython-312.pyc | Bin 0 -> 6245 bytes .../lora_model_runner_mixin.cpython-312.pyc | Bin 0 -> 8743 bytes .../tpu_input_batch.cpython-312.pyc | Bin 0 -> 27062 bytes .../tpu_model_runner.cpython-312.pyc | Bin 0 -> 87151 bytes .../__pycache__/tpu_worker.cpython-312.pyc | Bin 0 -> 15458 
bytes .../__pycache__/ubatch_utils.cpython-312.pyc | Bin 0 -> 3318 bytes .../__pycache__/ubatching.cpython-312.pyc | Bin 0 -> 12408 bytes v1/worker/__pycache__/utils.cpython-312.pyc | Bin 0 -> 16353 bytes .../__pycache__/worker_base.cpython-312.pyc | Bin 0 -> 16809 bytes .../xpu_model_runner.cpython-312.pyc | Bin 0 -> 3324 bytes .../__pycache__/xpu_worker.cpython-312.pyc | Bin 0 -> 9256 bytes v1/worker/block_table.py | 327 + v1/worker/cpu_model_runner.py | 122 + v1/worker/cpu_worker.py | 206 + v1/worker/dp_utils.py | 230 + v1/worker/ec_connector_model_runner_mixin.py | 87 + v1/worker/gpu_input_batch.py | 975 +++ v1/worker/gpu_model_runner.py | 5143 ++++++++++++++ v1/worker/gpu_ubatch_wrapper.py | 466 ++ v1/worker/gpu_worker.py | 894 +++ v1/worker/kv_connector_model_runner_mixin.py | 144 + v1/worker/lora_model_runner_mixin.py | 213 + v1/worker/tpu_input_batch.py | 593 ++ v1/worker/tpu_model_runner.py | 2173 ++++++ v1/worker/tpu_worker.py | 355 + v1/worker/ubatch_utils.py | 73 + v1/worker/ubatching.py | 231 + v1/worker/utils.py | 415 ++ v1/worker/worker_base.py | 378 + v1/worker/xpu_model_runner.py | 55 + v1/worker/xpu_worker.py | 189 + version.py | 2 + vllm_flash_attn/.gitkeep | 0 2569 files changed, 478204 insertions(+) create mode 100644 __init__.py create mode 100644 __pycache__/__init__.cpython-312.pyc create mode 100644 __pycache__/_aiter_ops.cpython-312.pyc create mode 100644 __pycache__/_bc_linter.cpython-312.pyc create mode 100644 __pycache__/_custom_ops.cpython-312.pyc create mode 100644 __pycache__/_ipex_ops.cpython-312.pyc create mode 100644 __pycache__/beam_search.cpython-312.pyc create mode 100644 __pycache__/collect_env.cpython-312.pyc create mode 100644 __pycache__/connections.cpython-312.pyc create mode 100644 __pycache__/env_override.cpython-312.pyc create mode 100644 __pycache__/envs.cpython-312.pyc create mode 100644 __pycache__/forward_context.cpython-312.pyc create mode 100644 __pycache__/logger.cpython-312.pyc create mode 100644 
__pycache__/logits_process.cpython-312.pyc create mode 100644 __pycache__/logprobs.cpython-312.pyc create mode 100644 __pycache__/outputs.cpython-312.pyc create mode 100644 __pycache__/pooling_params.cpython-312.pyc create mode 100644 __pycache__/sampling_params.cpython-312.pyc create mode 100644 __pycache__/scalar_type.cpython-312.pyc create mode 100644 __pycache__/scripts.cpython-312.pyc create mode 100644 __pycache__/sequence.cpython-312.pyc create mode 100644 __pycache__/tasks.cpython-312.pyc create mode 100644 __pycache__/tracing.cpython-312.pyc create mode 100644 __pycache__/version.cpython-312.pyc create mode 100644 _aiter_ops.py create mode 100644 _bc_linter.py create mode 100644 _custom_ops.py create mode 100644 _ipex_ops.py create mode 100644 assets/__init__.py create mode 100644 assets/__pycache__/__init__.cpython-312.pyc create mode 100644 assets/__pycache__/audio.cpython-312.pyc create mode 100644 assets/__pycache__/base.cpython-312.pyc create mode 100644 assets/__pycache__/image.cpython-312.pyc create mode 100644 assets/__pycache__/video.cpython-312.pyc create mode 100644 assets/audio.py create mode 100644 assets/base.py create mode 100644 assets/image.py create mode 100644 assets/video.py create mode 100644 attention/__init__.py create mode 100644 attention/__pycache__/__init__.cpython-312.pyc create mode 100644 attention/__pycache__/layer.cpython-312.pyc create mode 100644 attention/__pycache__/selector.cpython-312.pyc create mode 100644 attention/backends/__init__.py create mode 100644 attention/backends/__pycache__/__init__.cpython-312.pyc create mode 100644 attention/backends/__pycache__/abstract.cpython-312.pyc create mode 100644 attention/backends/__pycache__/registry.cpython-312.pyc create mode 100644 attention/backends/__pycache__/utils.cpython-312.pyc create mode 100644 attention/backends/abstract.py create mode 100644 attention/backends/registry.py create mode 100644 attention/backends/utils.py create mode 100644 attention/layer.py create 
mode 100644 attention/layers/__init__.py create mode 100644 attention/layers/__pycache__/__init__.cpython-312.pyc create mode 100644 attention/layers/__pycache__/chunked_local_attention.cpython-312.pyc create mode 100644 attention/layers/__pycache__/cross_attention.cpython-312.pyc create mode 100644 attention/layers/__pycache__/encoder_only_attention.cpython-312.pyc create mode 100644 attention/layers/chunked_local_attention.py create mode 100644 attention/layers/cross_attention.py create mode 100644 attention/layers/encoder_only_attention.py create mode 100644 attention/ops/__init__.py create mode 100644 attention/ops/__pycache__/__init__.cpython-312.pyc create mode 100644 attention/ops/__pycache__/chunked_prefill_paged_decode.cpython-312.pyc create mode 100644 attention/ops/__pycache__/common.cpython-312.pyc create mode 100644 attention/ops/__pycache__/flashmla.cpython-312.pyc create mode 100644 attention/ops/__pycache__/merge_attn_states.cpython-312.pyc create mode 100644 attention/ops/__pycache__/paged_attn.cpython-312.pyc create mode 100644 attention/ops/__pycache__/pallas_kv_cache_update.cpython-312.pyc create mode 100644 attention/ops/__pycache__/prefix_prefill.cpython-312.pyc create mode 100644 attention/ops/__pycache__/rocm_aiter_paged_attn.cpython-312.pyc create mode 100644 attention/ops/__pycache__/triton_decode_attention.cpython-312.pyc create mode 100644 attention/ops/__pycache__/triton_merge_attn_states.cpython-312.pyc create mode 100644 attention/ops/__pycache__/triton_reshape_and_cache_flash.cpython-312.pyc create mode 100644 attention/ops/__pycache__/triton_unified_attention.cpython-312.pyc create mode 100644 attention/ops/__pycache__/vit_attn_wrappers.cpython-312.pyc create mode 100644 attention/ops/chunked_prefill_paged_decode.py create mode 100644 attention/ops/common.py create mode 100644 attention/ops/flashmla.py create mode 100644 attention/ops/merge_attn_states.py create mode 100644 attention/ops/paged_attn.py create mode 100644 
attention/ops/pallas_kv_cache_update.py create mode 100644 attention/ops/prefix_prefill.py create mode 100644 attention/ops/rocm_aiter_paged_attn.py create mode 100644 attention/ops/triton_decode_attention.py create mode 100644 attention/ops/triton_merge_attn_states.py create mode 100644 attention/ops/triton_reshape_and_cache_flash.py create mode 100644 attention/ops/triton_unified_attention.py create mode 100644 attention/ops/vit_attn_wrappers.py create mode 100644 attention/selector.py create mode 100644 attention/utils/__init__.py create mode 100644 attention/utils/__pycache__/__init__.cpython-312.pyc create mode 100644 attention/utils/__pycache__/fa_utils.cpython-312.pyc create mode 100644 attention/utils/__pycache__/kv_sharing_utils.cpython-312.pyc create mode 100644 attention/utils/__pycache__/kv_transfer_utils.cpython-312.pyc create mode 100644 attention/utils/fa_utils.py create mode 100644 attention/utils/kv_sharing_utils.py create mode 100644 attention/utils/kv_transfer_utils.py create mode 100644 beam_search.py create mode 100644 benchmarks/__init__.py create mode 100644 benchmarks/__pycache__/__init__.cpython-312.pyc create mode 100644 benchmarks/__pycache__/datasets.cpython-312.pyc create mode 100644 benchmarks/__pycache__/latency.cpython-312.pyc create mode 100644 benchmarks/__pycache__/serve.cpython-312.pyc create mode 100644 benchmarks/__pycache__/throughput.cpython-312.pyc create mode 100644 benchmarks/datasets.py create mode 100644 benchmarks/latency.py create mode 100644 benchmarks/lib/__init__.py create mode 100644 benchmarks/lib/__pycache__/__init__.cpython-312.pyc create mode 100644 benchmarks/lib/__pycache__/endpoint_request_func.cpython-312.pyc create mode 100644 benchmarks/lib/__pycache__/ready_checker.cpython-312.pyc create mode 100644 benchmarks/lib/__pycache__/utils.cpython-312.pyc create mode 100644 benchmarks/lib/endpoint_request_func.py create mode 100644 benchmarks/lib/ready_checker.py create mode 100644 benchmarks/lib/utils.py create 
mode 100644 benchmarks/serve.py create mode 100644 benchmarks/sweep/__init__.py create mode 100644 benchmarks/sweep/__pycache__/__init__.cpython-312.pyc create mode 100644 benchmarks/sweep/__pycache__/cli.cpython-312.pyc create mode 100644 benchmarks/sweep/__pycache__/param_sweep.cpython-312.pyc create mode 100644 benchmarks/sweep/__pycache__/plot.cpython-312.pyc create mode 100644 benchmarks/sweep/__pycache__/serve.cpython-312.pyc create mode 100644 benchmarks/sweep/__pycache__/serve_sla.cpython-312.pyc create mode 100644 benchmarks/sweep/__pycache__/server.cpython-312.pyc create mode 100644 benchmarks/sweep/__pycache__/sla_sweep.cpython-312.pyc create mode 100644 benchmarks/sweep/__pycache__/utils.cpython-312.pyc create mode 100644 benchmarks/sweep/cli.py create mode 100644 benchmarks/sweep/param_sweep.py create mode 100644 benchmarks/sweep/plot.py create mode 100644 benchmarks/sweep/serve.py create mode 100644 benchmarks/sweep/serve_sla.py create mode 100644 benchmarks/sweep/server.py create mode 100644 benchmarks/sweep/sla_sweep.py create mode 100644 benchmarks/sweep/utils.py create mode 100644 benchmarks/throughput.py create mode 100644 collect_env.py create mode 100644 compilation/__init__.py create mode 100644 compilation/__pycache__/__init__.cpython-312.pyc create mode 100644 compilation/__pycache__/activation_quant_fusion.cpython-312.pyc create mode 100644 compilation/__pycache__/backends.cpython-312.pyc create mode 100644 compilation/__pycache__/base_static_graph.cpython-312.pyc create mode 100644 compilation/__pycache__/caching.cpython-312.pyc create mode 100644 compilation/__pycache__/collective_fusion.cpython-312.pyc create mode 100644 compilation/__pycache__/compiler_interface.cpython-312.pyc create mode 100644 compilation/__pycache__/counter.cpython-312.pyc create mode 100644 compilation/__pycache__/cuda_graph.cpython-312.pyc create mode 100644 compilation/__pycache__/decorators.cpython-312.pyc create mode 100644 
compilation/__pycache__/fix_functionalization.cpython-312.pyc create mode 100644 compilation/__pycache__/fusion.cpython-312.pyc create mode 100644 compilation/__pycache__/fusion_attn.cpython-312.pyc create mode 100644 compilation/__pycache__/fx_utils.cpython-312.pyc create mode 100644 compilation/__pycache__/inductor_pass.cpython-312.pyc create mode 100644 compilation/__pycache__/matcher_utils.cpython-312.pyc create mode 100644 compilation/__pycache__/monitor.cpython-312.pyc create mode 100644 compilation/__pycache__/noop_elimination.cpython-312.pyc create mode 100644 compilation/__pycache__/partition_rules.cpython-312.pyc create mode 100644 compilation/__pycache__/pass_manager.cpython-312.pyc create mode 100644 compilation/__pycache__/piecewise_backend.cpython-312.pyc create mode 100644 compilation/__pycache__/post_cleanup.cpython-312.pyc create mode 100644 compilation/__pycache__/qk_norm_rope_fusion.cpython-312.pyc create mode 100644 compilation/__pycache__/sequence_parallelism.cpython-312.pyc create mode 100644 compilation/__pycache__/torch25_custom_graph_pass.cpython-312.pyc create mode 100644 compilation/__pycache__/vllm_inductor_pass.cpython-312.pyc create mode 100644 compilation/__pycache__/wrapper.cpython-312.pyc create mode 100644 compilation/activation_quant_fusion.py create mode 100644 compilation/backends.py create mode 100644 compilation/base_static_graph.py create mode 100644 compilation/caching.py create mode 100644 compilation/collective_fusion.py create mode 100644 compilation/compiler_interface.py create mode 100644 compilation/counter.py create mode 100644 compilation/cuda_graph.py create mode 100644 compilation/decorators.py create mode 100644 compilation/fix_functionalization.py create mode 100644 compilation/fusion.py create mode 100644 compilation/fusion_attn.py create mode 100644 compilation/fx_utils.py create mode 100644 compilation/inductor_pass.py create mode 100644 compilation/matcher_utils.py create mode 100644 compilation/monitor.py 
create mode 100644 compilation/noop_elimination.py create mode 100644 compilation/partition_rules.py create mode 100644 compilation/pass_manager.py create mode 100644 compilation/piecewise_backend.py create mode 100644 compilation/post_cleanup.py create mode 100644 compilation/qk_norm_rope_fusion.py create mode 100644 compilation/sequence_parallelism.py create mode 100644 compilation/torch25_custom_graph_pass.py create mode 100644 compilation/vllm_inductor_pass.py create mode 100644 compilation/wrapper.py create mode 100644 config/__init__.py create mode 100644 config/__pycache__/__init__.cpython-312.pyc create mode 100644 config/__pycache__/cache.cpython-312.pyc create mode 100644 config/__pycache__/compilation.cpython-312.pyc create mode 100644 config/__pycache__/device.cpython-312.pyc create mode 100644 config/__pycache__/ec_transfer.cpython-312.pyc create mode 100644 config/__pycache__/kv_events.cpython-312.pyc create mode 100644 config/__pycache__/kv_transfer.cpython-312.pyc create mode 100644 config/__pycache__/load.cpython-312.pyc create mode 100644 config/__pycache__/lora.cpython-312.pyc create mode 100644 config/__pycache__/model.cpython-312.pyc create mode 100644 config/__pycache__/multimodal.cpython-312.pyc create mode 100644 config/__pycache__/observability.cpython-312.pyc create mode 100644 config/__pycache__/parallel.cpython-312.pyc create mode 100644 config/__pycache__/pooler.cpython-312.pyc create mode 100644 config/__pycache__/scheduler.cpython-312.pyc create mode 100644 config/__pycache__/speculative.cpython-312.pyc create mode 100644 config/__pycache__/speech_to_text.cpython-312.pyc create mode 100644 config/__pycache__/structured_outputs.cpython-312.pyc create mode 100644 config/__pycache__/utils.cpython-312.pyc create mode 100644 config/__pycache__/vllm.cpython-312.pyc create mode 100644 config/cache.py create mode 100644 config/compilation.py create mode 100644 config/device.py create mode 100644 config/ec_transfer.py create mode 100644 
config/kv_events.py create mode 100644 config/kv_transfer.py create mode 100644 config/load.py create mode 100644 config/lora.py create mode 100644 config/model.py create mode 100644 config/multimodal.py create mode 100644 config/observability.py create mode 100644 config/parallel.py create mode 100644 config/pooler.py create mode 100644 config/scheduler.py create mode 100644 config/speculative.py create mode 100644 config/speech_to_text.py create mode 100644 config/structured_outputs.py create mode 100644 config/utils.py create mode 100644 config/vllm.py create mode 100644 connections.py create mode 100644 device_allocator/__init__.py create mode 100644 device_allocator/__pycache__/__init__.cpython-312.pyc create mode 100644 device_allocator/__pycache__/cumem.cpython-312.pyc create mode 100644 device_allocator/cumem.py create mode 100644 distributed/__init__.py create mode 100644 distributed/__pycache__/__init__.cpython-312.pyc create mode 100644 distributed/__pycache__/communication_op.cpython-312.pyc create mode 100644 distributed/__pycache__/kv_events.cpython-312.pyc create mode 100644 distributed/__pycache__/parallel_state.cpython-312.pyc create mode 100644 distributed/__pycache__/tpu_distributed_utils.cpython-312.pyc create mode 100644 distributed/__pycache__/utils.cpython-312.pyc create mode 100644 distributed/communication_op.py create mode 100644 distributed/device_communicators/__init__.py create mode 100644 distributed/device_communicators/__pycache__/__init__.cpython-312.pyc create mode 100644 distributed/device_communicators/__pycache__/all2all.cpython-312.pyc create mode 100644 distributed/device_communicators/__pycache__/all_reduce_utils.cpython-312.pyc create mode 100644 distributed/device_communicators/__pycache__/base_device_communicator.cpython-312.pyc create mode 100644 distributed/device_communicators/__pycache__/cpu_communicator.cpython-312.pyc create mode 100644 distributed/device_communicators/__pycache__/cuda_communicator.cpython-312.pyc 
create mode 100644 distributed/device_communicators/__pycache__/cuda_wrapper.cpython-312.pyc create mode 100644 distributed/device_communicators/__pycache__/custom_all_reduce.cpython-312.pyc create mode 100644 distributed/device_communicators/__pycache__/mnnvl_compat.cpython-312.pyc create mode 100644 distributed/device_communicators/__pycache__/pynccl.cpython-312.pyc create mode 100644 distributed/device_communicators/__pycache__/pynccl_allocator.cpython-312.pyc create mode 100644 distributed/device_communicators/__pycache__/pynccl_wrapper.cpython-312.pyc create mode 100644 distributed/device_communicators/__pycache__/quick_all_reduce.cpython-312.pyc create mode 100644 distributed/device_communicators/__pycache__/ray_communicator.cpython-312.pyc create mode 100644 distributed/device_communicators/__pycache__/shm_broadcast.cpython-312.pyc create mode 100644 distributed/device_communicators/__pycache__/shm_object_storage.cpython-312.pyc create mode 100644 distributed/device_communicators/__pycache__/symm_mem.cpython-312.pyc create mode 100644 distributed/device_communicators/__pycache__/tpu_communicator.cpython-312.pyc create mode 100644 distributed/device_communicators/__pycache__/xpu_communicator.cpython-312.pyc create mode 100644 distributed/device_communicators/all2all.py create mode 100644 distributed/device_communicators/all_reduce_utils.py create mode 100644 distributed/device_communicators/base_device_communicator.py create mode 100644 distributed/device_communicators/cpu_communicator.py create mode 100644 distributed/device_communicators/cuda_communicator.py create mode 100644 distributed/device_communicators/cuda_wrapper.py create mode 100644 distributed/device_communicators/custom_all_reduce.py create mode 100644 distributed/device_communicators/mnnvl_compat.py create mode 100644 distributed/device_communicators/pynccl.py create mode 100644 distributed/device_communicators/pynccl_allocator.py create mode 100644 
distributed/device_communicators/pynccl_wrapper.py create mode 100644 distributed/device_communicators/quick_all_reduce.py create mode 100644 distributed/device_communicators/ray_communicator.py create mode 100644 distributed/device_communicators/shm_broadcast.py create mode 100644 distributed/device_communicators/shm_object_storage.py create mode 100644 distributed/device_communicators/symm_mem.py create mode 100644 distributed/device_communicators/tpu_communicator.py create mode 100644 distributed/device_communicators/xpu_communicator.py create mode 100644 distributed/ec_transfer/__init__.py create mode 100644 distributed/ec_transfer/__pycache__/__init__.cpython-312.pyc create mode 100644 distributed/ec_transfer/__pycache__/ec_transfer_state.cpython-312.pyc create mode 100644 distributed/ec_transfer/ec_connector/__init__.py create mode 100644 distributed/ec_transfer/ec_connector/__pycache__/__init__.cpython-312.pyc create mode 100644 distributed/ec_transfer/ec_connector/__pycache__/base.cpython-312.pyc create mode 100644 distributed/ec_transfer/ec_connector/__pycache__/factory.cpython-312.pyc create mode 100644 distributed/ec_transfer/ec_connector/__pycache__/shared_storage_connector.cpython-312.pyc create mode 100644 distributed/ec_transfer/ec_connector/base.py create mode 100644 distributed/ec_transfer/ec_connector/factory.py create mode 100644 distributed/ec_transfer/ec_connector/shared_storage_connector.py create mode 100644 distributed/ec_transfer/ec_transfer_state.py create mode 100644 distributed/eplb/__init__.py create mode 100644 distributed/eplb/__pycache__/__init__.cpython-312.pyc create mode 100644 distributed/eplb/__pycache__/eplb_state.cpython-312.pyc create mode 100644 distributed/eplb/__pycache__/rebalance_algo.cpython-312.pyc create mode 100644 distributed/eplb/__pycache__/rebalance_execute.cpython-312.pyc create mode 100644 distributed/eplb/eplb_state.py create mode 100644 distributed/eplb/rebalance_algo.py create mode 100644 
distributed/eplb/rebalance_execute.py create mode 100644 distributed/kv_events.py create mode 100644 distributed/kv_transfer/README.md create mode 100644 distributed/kv_transfer/__init__.py create mode 100644 distributed/kv_transfer/__pycache__/__init__.cpython-312.pyc create mode 100644 distributed/kv_transfer/__pycache__/kv_transfer_state.cpython-312.pyc create mode 100644 distributed/kv_transfer/disagg_prefill_workflow.jpg create mode 100644 distributed/kv_transfer/kv_connector/__init__.py create mode 100644 distributed/kv_transfer/kv_connector/__pycache__/__init__.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_connector/__pycache__/base.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_connector/__pycache__/factory.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_connector/__pycache__/utils.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_connector/base.py create mode 100644 distributed/kv_transfer/kv_connector/factory.py create mode 100644 distributed/kv_transfer/kv_connector/utils.py create mode 100644 distributed/kv_transfer/kv_connector/v1/__init__.py create mode 100644 distributed/kv_transfer/kv_connector/v1/__pycache__/__init__.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_connector/v1/__pycache__/base.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_connector/v1/__pycache__/decode_bench_connector.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_connector/v1/__pycache__/lmcache_connector.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_connector/v1/__pycache__/lmcache_mp_connector.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_connector/v1/__pycache__/metrics.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_connector/v1/__pycache__/multi_connector.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_connector/v1/__pycache__/nixl_connector.cpython-312.pyc create mode 100644 
distributed/kv_transfer/kv_connector/v1/__pycache__/offloading_connector.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_connector/v1/__pycache__/shared_storage_connector.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_connector/v1/base.py create mode 100644 distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py create mode 100644 distributed/kv_transfer/kv_connector/v1/lmcache_connector.py create mode 100644 distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py create mode 100644 distributed/kv_transfer/kv_connector/v1/lmcache_integration/__pycache__/__init__.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_connector/v1/lmcache_integration/__pycache__/multi_process_adapter.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_connector/v1/lmcache_integration/__pycache__/utils.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_connector/v1/lmcache_integration/__pycache__/vllm_v1_adapter.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py create mode 100644 distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py create mode 100644 distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py create mode 100644 distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py create mode 100644 distributed/kv_transfer/kv_connector/v1/metrics.py create mode 100644 distributed/kv_transfer/kv_connector/v1/multi_connector.py create mode 100644 distributed/kv_transfer/kv_connector/v1/nixl_connector.py create mode 100644 distributed/kv_transfer/kv_connector/v1/offloading_connector.py create mode 100644 distributed/kv_transfer/kv_connector/v1/p2p/__init__.py create mode 100644 distributed/kv_transfer/kv_connector/v1/p2p/__pycache__/__init__.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_connector/v1/p2p/__pycache__/p2p_nccl_connector.cpython-312.pyc create mode 100644 
distributed/kv_transfer/kv_connector/v1/p2p/__pycache__/p2p_nccl_engine.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_connector/v1/p2p/__pycache__/tensor_memory_pool.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py create mode 100644 distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py create mode 100644 distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py create mode 100644 distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py create mode 100644 distributed/kv_transfer/kv_lookup_buffer/__init__.py create mode 100644 distributed/kv_transfer/kv_lookup_buffer/__pycache__/__init__.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_lookup_buffer/__pycache__/base.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_lookup_buffer/__pycache__/mooncake_store.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_lookup_buffer/__pycache__/simple_buffer.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_lookup_buffer/base.py create mode 100644 distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py create mode 100644 distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py create mode 100644 distributed/kv_transfer/kv_pipe/__init__.py create mode 100644 distributed/kv_transfer/kv_pipe/__pycache__/__init__.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_pipe/__pycache__/base.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_pipe/__pycache__/mooncake_pipe.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_pipe/__pycache__/pynccl_pipe.cpython-312.pyc create mode 100644 distributed/kv_transfer/kv_pipe/base.py create mode 100644 distributed/kv_transfer/kv_pipe/mooncake_pipe.py create mode 100644 distributed/kv_transfer/kv_pipe/pynccl_pipe.py create mode 100644 distributed/kv_transfer/kv_transfer_state.py create mode 100644 distributed/parallel_state.py create mode 100644 
distributed/tpu_distributed_utils.py create mode 100644 distributed/utils.py create mode 100644 engine/__init__.py create mode 100644 engine/__pycache__/__init__.cpython-312.pyc create mode 100644 engine/__pycache__/arg_utils.cpython-312.pyc create mode 100644 engine/__pycache__/async_llm_engine.cpython-312.pyc create mode 100644 engine/__pycache__/llm_engine.cpython-312.pyc create mode 100644 engine/__pycache__/protocol.cpython-312.pyc create mode 100644 engine/arg_utils.py create mode 100644 engine/async_llm_engine.py create mode 100644 engine/llm_engine.py create mode 100644 engine/protocol.py create mode 100644 entrypoints/__init__.py create mode 100644 entrypoints/__pycache__/__init__.cpython-312.pyc create mode 100644 entrypoints/__pycache__/api_server.cpython-312.pyc create mode 100644 entrypoints/__pycache__/chat_utils.cpython-312.pyc create mode 100644 entrypoints/__pycache__/constants.cpython-312.pyc create mode 100644 entrypoints/__pycache__/context.cpython-312.pyc create mode 100644 entrypoints/__pycache__/dynamic_lora.cpython-312.pyc create mode 100644 entrypoints/__pycache__/harmony_utils.cpython-312.pyc create mode 100644 entrypoints/__pycache__/launcher.cpython-312.pyc create mode 100644 entrypoints/__pycache__/llm.cpython-312.pyc create mode 100644 entrypoints/__pycache__/logger.cpython-312.pyc create mode 100644 entrypoints/__pycache__/renderer.cpython-312.pyc create mode 100644 entrypoints/__pycache__/responses_utils.cpython-312.pyc create mode 100644 entrypoints/__pycache__/score_utils.cpython-312.pyc create mode 100644 entrypoints/__pycache__/ssl.cpython-312.pyc create mode 100644 entrypoints/__pycache__/tool.cpython-312.pyc create mode 100644 entrypoints/__pycache__/tool_server.cpython-312.pyc create mode 100644 entrypoints/__pycache__/utils.cpython-312.pyc create mode 100644 entrypoints/anthropic/__init__.py create mode 100644 entrypoints/anthropic/__pycache__/__init__.cpython-312.pyc create mode 100644 
entrypoints/anthropic/__pycache__/protocol.cpython-312.pyc create mode 100644 entrypoints/anthropic/__pycache__/serving_messages.cpython-312.pyc create mode 100644 entrypoints/anthropic/protocol.py create mode 100644 entrypoints/anthropic/serving_messages.py create mode 100644 entrypoints/api_server.py create mode 100644 entrypoints/chat_utils.py create mode 100644 entrypoints/cli/__init__.py create mode 100644 entrypoints/cli/__pycache__/__init__.cpython-312.pyc create mode 100644 entrypoints/cli/__pycache__/collect_env.cpython-312.pyc create mode 100644 entrypoints/cli/__pycache__/main.cpython-312.pyc create mode 100644 entrypoints/cli/__pycache__/openai.cpython-312.pyc create mode 100644 entrypoints/cli/__pycache__/run_batch.cpython-312.pyc create mode 100644 entrypoints/cli/__pycache__/serve.cpython-312.pyc create mode 100644 entrypoints/cli/__pycache__/types.cpython-312.pyc create mode 100644 entrypoints/cli/benchmark/__init__.py create mode 100644 entrypoints/cli/benchmark/__pycache__/__init__.cpython-312.pyc create mode 100644 entrypoints/cli/benchmark/__pycache__/base.cpython-312.pyc create mode 100644 entrypoints/cli/benchmark/__pycache__/latency.cpython-312.pyc create mode 100644 entrypoints/cli/benchmark/__pycache__/main.cpython-312.pyc create mode 100644 entrypoints/cli/benchmark/__pycache__/serve.cpython-312.pyc create mode 100644 entrypoints/cli/benchmark/__pycache__/sweep.cpython-312.pyc create mode 100644 entrypoints/cli/benchmark/__pycache__/throughput.cpython-312.pyc create mode 100644 entrypoints/cli/benchmark/base.py create mode 100644 entrypoints/cli/benchmark/latency.py create mode 100644 entrypoints/cli/benchmark/main.py create mode 100644 entrypoints/cli/benchmark/serve.py create mode 100644 entrypoints/cli/benchmark/sweep.py create mode 100644 entrypoints/cli/benchmark/throughput.py create mode 100644 entrypoints/cli/collect_env.py create mode 100644 entrypoints/cli/main.py create mode 100644 entrypoints/cli/openai.py create mode 100644 
entrypoints/cli/run_batch.py create mode 100644 entrypoints/cli/serve.py create mode 100644 entrypoints/cli/types.py create mode 100644 entrypoints/constants.py create mode 100644 entrypoints/context.py create mode 100644 entrypoints/dynamic_lora.py create mode 100644 entrypoints/harmony_utils.py create mode 100644 entrypoints/launcher.py create mode 100644 entrypoints/llm.py create mode 100644 entrypoints/logger.py create mode 100644 entrypoints/openai/__init__.py create mode 100644 entrypoints/openai/__pycache__/__init__.cpython-312.pyc create mode 100644 entrypoints/openai/__pycache__/api_server.cpython-312.pyc create mode 100644 entrypoints/openai/__pycache__/cli_args.cpython-312.pyc create mode 100644 entrypoints/openai/__pycache__/orca_metrics.cpython-312.pyc create mode 100644 entrypoints/openai/__pycache__/protocol.cpython-312.pyc create mode 100644 entrypoints/openai/__pycache__/run_batch.cpython-312.pyc create mode 100644 entrypoints/openai/__pycache__/serving_chat.cpython-312.pyc create mode 100644 entrypoints/openai/__pycache__/serving_classification.cpython-312.pyc create mode 100644 entrypoints/openai/__pycache__/serving_completion.cpython-312.pyc create mode 100644 entrypoints/openai/__pycache__/serving_embedding.cpython-312.pyc create mode 100644 entrypoints/openai/__pycache__/serving_engine.cpython-312.pyc create mode 100644 entrypoints/openai/__pycache__/serving_models.cpython-312.pyc create mode 100644 entrypoints/openai/__pycache__/serving_pooling.cpython-312.pyc create mode 100644 entrypoints/openai/__pycache__/serving_responses.cpython-312.pyc create mode 100644 entrypoints/openai/__pycache__/serving_score.cpython-312.pyc create mode 100644 entrypoints/openai/__pycache__/serving_tokenization.cpython-312.pyc create mode 100644 entrypoints/openai/__pycache__/serving_tokens.cpython-312.pyc create mode 100644 entrypoints/openai/__pycache__/serving_transcription.cpython-312.pyc create mode 100644 
entrypoints/openai/__pycache__/speech_to_text.cpython-312.pyc create mode 100644 entrypoints/openai/api_server.py create mode 100644 entrypoints/openai/cli_args.py create mode 100644 entrypoints/openai/orca_metrics.py create mode 100644 entrypoints/openai/protocol.py create mode 100644 entrypoints/openai/run_batch.py create mode 100644 entrypoints/openai/serving_chat.py create mode 100644 entrypoints/openai/serving_classification.py create mode 100644 entrypoints/openai/serving_completion.py create mode 100644 entrypoints/openai/serving_embedding.py create mode 100644 entrypoints/openai/serving_engine.py create mode 100644 entrypoints/openai/serving_models.py create mode 100644 entrypoints/openai/serving_pooling.py create mode 100644 entrypoints/openai/serving_responses.py create mode 100644 entrypoints/openai/serving_score.py create mode 100644 entrypoints/openai/serving_tokenization.py create mode 100644 entrypoints/openai/serving_tokens.py create mode 100644 entrypoints/openai/serving_transcription.py create mode 100644 entrypoints/openai/speech_to_text.py create mode 100644 entrypoints/openai/tool_parsers/__init__.py create mode 100644 entrypoints/openai/tool_parsers/__pycache__/__init__.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/abstract_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/deepseekv31_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/deepseekv3_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/ernie45_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/glm4_moe_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/granite_20b_fc_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/granite_tool_parser.cpython-312.pyc create mode 100644 
entrypoints/openai/tool_parsers/__pycache__/hermes_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/hunyuan_a13b_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/internlm2_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/jamba_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/kimi_k2_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/llama4_pythonic_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/llama_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/longcat_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/minimax_m2_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/minimax_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/mistral_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/olmo3_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/openai_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/phi4mini_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/pythonic_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/qwen3coder_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/qwen3xml_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/seed_oss_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/step3_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/__pycache__/utils.cpython-312.pyc create mode 100644 
entrypoints/openai/tool_parsers/__pycache__/xlam_tool_parser.cpython-312.pyc create mode 100644 entrypoints/openai/tool_parsers/abstract_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/ernie45_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/granite_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/hermes_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/internlm2_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/jamba_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/llama_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/longcat_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/minimax_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/mistral_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/olmo3_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/openai_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/phi4mini_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/pythonic_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/seed_oss_tool_parser.py create mode 100644 
entrypoints/openai/tool_parsers/step3_tool_parser.py create mode 100644 entrypoints/openai/tool_parsers/utils.py create mode 100644 entrypoints/openai/tool_parsers/xlam_tool_parser.py create mode 100644 entrypoints/renderer.py create mode 100644 entrypoints/responses_utils.py create mode 100644 entrypoints/sagemaker/__init__.py create mode 100644 entrypoints/sagemaker/__pycache__/__init__.cpython-312.pyc create mode 100644 entrypoints/sagemaker/__pycache__/routes.cpython-312.pyc create mode 100644 entrypoints/sagemaker/routes.py create mode 100644 entrypoints/score_utils.py create mode 100644 entrypoints/ssl.py create mode 100644 entrypoints/tool.py create mode 100644 entrypoints/tool_server.py create mode 100644 entrypoints/utils.py create mode 100644 env_override.py create mode 100644 envs.py create mode 100644 forward_context.py create mode 100644 inputs/__init__.py create mode 100644 inputs/__pycache__/__init__.cpython-312.pyc create mode 100644 inputs/__pycache__/data.cpython-312.pyc create mode 100644 inputs/__pycache__/parse.cpython-312.pyc create mode 100644 inputs/__pycache__/preprocess.cpython-312.pyc create mode 100644 inputs/data.py create mode 100644 inputs/parse.py create mode 100644 inputs/preprocess.py create mode 100644 logger.py create mode 100644 logging_utils/__init__.py create mode 100644 logging_utils/__pycache__/__init__.cpython-312.pyc create mode 100644 logging_utils/__pycache__/dump_input.cpython-312.pyc create mode 100644 logging_utils/__pycache__/formatter.cpython-312.pyc create mode 100644 logging_utils/__pycache__/log_time.cpython-312.pyc create mode 100644 logging_utils/dump_input.py create mode 100644 logging_utils/formatter.py create mode 100644 logging_utils/log_time.py create mode 100644 logits_process.py create mode 100644 logprobs.py create mode 100644 lora/__init__.py create mode 100644 lora/__pycache__/__init__.cpython-312.pyc create mode 100644 lora/__pycache__/lora_weights.cpython-312.pyc create mode 100644 
lora/__pycache__/models.cpython-312.pyc create mode 100644 lora/__pycache__/peft_helper.cpython-312.pyc create mode 100644 lora/__pycache__/request.cpython-312.pyc create mode 100644 lora/__pycache__/resolver.cpython-312.pyc create mode 100644 lora/__pycache__/utils.cpython-312.pyc create mode 100644 lora/__pycache__/worker_manager.cpython-312.pyc create mode 100644 lora/layers/__init__.py create mode 100644 lora/layers/__pycache__/__init__.cpython-312.pyc create mode 100644 lora/layers/__pycache__/base.cpython-312.pyc create mode 100644 lora/layers/__pycache__/base_linear.cpython-312.pyc create mode 100644 lora/layers/__pycache__/column_parallel_linear.cpython-312.pyc create mode 100644 lora/layers/__pycache__/fused_moe.cpython-312.pyc create mode 100644 lora/layers/__pycache__/logits_processor.cpython-312.pyc create mode 100644 lora/layers/__pycache__/replicated_linear.cpython-312.pyc create mode 100644 lora/layers/__pycache__/row_parallel_linear.cpython-312.pyc create mode 100644 lora/layers/__pycache__/utils.cpython-312.pyc create mode 100644 lora/layers/__pycache__/vocal_parallel_embedding.cpython-312.pyc create mode 100644 lora/layers/base.py create mode 100644 lora/layers/base_linear.py create mode 100644 lora/layers/column_parallel_linear.py create mode 100644 lora/layers/fused_moe.py create mode 100644 lora/layers/logits_processor.py create mode 100644 lora/layers/replicated_linear.py create mode 100644 lora/layers/row_parallel_linear.py create mode 100644 lora/layers/utils.py create mode 100644 lora/layers/vocal_parallel_embedding.py create mode 100644 lora/lora_weights.py create mode 100644 lora/models.py create mode 100644 lora/ops/__init__.py create mode 100644 lora/ops/__pycache__/__init__.cpython-312.pyc create mode 100644 lora/ops/ipex_ops/__init__.py create mode 100644 lora/ops/ipex_ops/__pycache__/__init__.cpython-312.pyc create mode 100644 lora/ops/ipex_ops/__pycache__/lora_ops.cpython-312.pyc create mode 100644 lora/ops/ipex_ops/lora_ops.py 
create mode 100644 lora/ops/torch_ops/__init__.py create mode 100644 lora/ops/torch_ops/__pycache__/__init__.cpython-312.pyc create mode 100644 lora/ops/torch_ops/__pycache__/lora_ops.cpython-312.pyc create mode 100644 lora/ops/torch_ops/lora_ops.py create mode 100644 lora/ops/triton_ops/README_TUNING.md create mode 100644 lora/ops/triton_ops/__init__.py create mode 100644 lora/ops/triton_ops/__pycache__/__init__.cpython-312.pyc create mode 100644 lora/ops/triton_ops/__pycache__/fused_moe_lora_op.cpython-312.pyc create mode 100644 lora/ops/triton_ops/__pycache__/kernel_utils.cpython-312.pyc create mode 100644 lora/ops/triton_ops/__pycache__/lora_expand_op.cpython-312.pyc create mode 100644 lora/ops/triton_ops/__pycache__/lora_kernel_metadata.cpython-312.pyc create mode 100644 lora/ops/triton_ops/__pycache__/lora_shrink_op.cpython-312.pyc create mode 100644 lora/ops/triton_ops/__pycache__/utils.cpython-312.pyc create mode 100644 lora/ops/triton_ops/fused_moe_lora_op.py create mode 100644 lora/ops/triton_ops/kernel_utils.py create mode 100644 lora/ops/triton_ops/lora_expand_op.py create mode 100644 lora/ops/triton_ops/lora_kernel_metadata.py create mode 100644 lora/ops/triton_ops/lora_shrink_op.py create mode 100644 lora/ops/triton_ops/utils.py create mode 100644 lora/ops/xla_ops/__init__.py create mode 100644 lora/ops/xla_ops/__pycache__/__init__.cpython-312.pyc create mode 100644 lora/ops/xla_ops/__pycache__/lora_ops.cpython-312.pyc create mode 100644 lora/ops/xla_ops/lora_ops.py create mode 100644 lora/peft_helper.py create mode 100644 lora/punica_wrapper/__init__.py create mode 100644 lora/punica_wrapper/__pycache__/__init__.cpython-312.pyc create mode 100644 lora/punica_wrapper/__pycache__/punica_base.cpython-312.pyc create mode 100644 lora/punica_wrapper/__pycache__/punica_cpu.cpython-312.pyc create mode 100644 lora/punica_wrapper/__pycache__/punica_gpu.cpython-312.pyc create mode 100644 lora/punica_wrapper/__pycache__/punica_selector.cpython-312.pyc create 
mode 100644 lora/punica_wrapper/__pycache__/punica_tpu.cpython-312.pyc create mode 100644 lora/punica_wrapper/__pycache__/punica_xpu.cpython-312.pyc create mode 100644 lora/punica_wrapper/__pycache__/utils.cpython-312.pyc create mode 100644 lora/punica_wrapper/punica_base.py create mode 100644 lora/punica_wrapper/punica_cpu.py create mode 100644 lora/punica_wrapper/punica_gpu.py create mode 100644 lora/punica_wrapper/punica_selector.py create mode 100644 lora/punica_wrapper/punica_tpu.py create mode 100644 lora/punica_wrapper/punica_xpu.py create mode 100644 lora/punica_wrapper/utils.py create mode 100644 lora/request.py create mode 100644 lora/resolver.py create mode 100644 lora/utils.py create mode 100644 lora/worker_manager.py create mode 100644 model_executor/__init__.py create mode 100644 model_executor/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/__pycache__/custom_op.cpython-312.pyc create mode 100644 model_executor/__pycache__/parameter.cpython-312.pyc create mode 100644 model_executor/__pycache__/utils.cpython-312.pyc create mode 100644 model_executor/custom_op.py create mode 100644 model_executor/layers/__init__.py create mode 100644 model_executor/layers/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/layers/__pycache__/activation.cpython-312.pyc create mode 100644 model_executor/layers/__pycache__/attention_layer_base.cpython-312.pyc create mode 100644 model_executor/layers/__pycache__/batch_invariant.cpython-312.pyc create mode 100644 model_executor/layers/__pycache__/conv.cpython-312.pyc create mode 100644 model_executor/layers/__pycache__/kda.cpython-312.pyc create mode 100644 model_executor/layers/__pycache__/layernorm.cpython-312.pyc create mode 100644 model_executor/layers/__pycache__/lightning_attn.cpython-312.pyc create mode 100644 model_executor/layers/__pycache__/linear.cpython-312.pyc create mode 100644 model_executor/layers/__pycache__/logits_processor.cpython-312.pyc create mode 100644 
model_executor/layers/__pycache__/mla.cpython-312.pyc create mode 100644 model_executor/layers/__pycache__/pooler.cpython-312.pyc create mode 100644 model_executor/layers/__pycache__/resampler.cpython-312.pyc create mode 100644 model_executor/layers/__pycache__/utils.cpython-312.pyc create mode 100644 model_executor/layers/__pycache__/vocab_parallel_embedding.cpython-312.pyc create mode 100644 model_executor/layers/activation.py create mode 100644 model_executor/layers/attention_layer_base.py create mode 100644 model_executor/layers/batch_invariant.py create mode 100644 model_executor/layers/conv.py create mode 100644 model_executor/layers/fla/__init__.py create mode 100644 model_executor/layers/fla/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/layers/fla/ops/__init__.py create mode 100644 model_executor/layers/fla/ops/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/layers/fla/ops/__pycache__/chunk.cpython-312.pyc create mode 100644 model_executor/layers/fla/ops/__pycache__/chunk_delta_h.cpython-312.pyc create mode 100644 model_executor/layers/fla/ops/__pycache__/chunk_o.cpython-312.pyc create mode 100644 model_executor/layers/fla/ops/__pycache__/chunk_scaled_dot_kkt.cpython-312.pyc create mode 100644 model_executor/layers/fla/ops/__pycache__/cumsum.cpython-312.pyc create mode 100644 model_executor/layers/fla/ops/__pycache__/fused_recurrent.cpython-312.pyc create mode 100644 model_executor/layers/fla/ops/__pycache__/index.cpython-312.pyc create mode 100644 model_executor/layers/fla/ops/__pycache__/kda.cpython-312.pyc create mode 100644 model_executor/layers/fla/ops/__pycache__/l2norm.cpython-312.pyc create mode 100644 model_executor/layers/fla/ops/__pycache__/layernorm_guard.cpython-312.pyc create mode 100644 model_executor/layers/fla/ops/__pycache__/op.cpython-312.pyc create mode 100644 model_executor/layers/fla/ops/__pycache__/solve_tril.cpython-312.pyc create mode 100644 
model_executor/layers/fla/ops/__pycache__/utils.cpython-312.pyc create mode 100644 model_executor/layers/fla/ops/__pycache__/wy_fast.cpython-312.pyc create mode 100644 model_executor/layers/fla/ops/chunk.py create mode 100644 model_executor/layers/fla/ops/chunk_delta_h.py create mode 100644 model_executor/layers/fla/ops/chunk_o.py create mode 100644 model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py create mode 100644 model_executor/layers/fla/ops/cumsum.py create mode 100644 model_executor/layers/fla/ops/fused_recurrent.py create mode 100644 model_executor/layers/fla/ops/index.py create mode 100644 model_executor/layers/fla/ops/kda.py create mode 100644 model_executor/layers/fla/ops/l2norm.py create mode 100644 model_executor/layers/fla/ops/layernorm_guard.py create mode 100644 model_executor/layers/fla/ops/op.py create mode 100644 model_executor/layers/fla/ops/solve_tril.py create mode 100644 model_executor/layers/fla/ops/utils.py create mode 100644 model_executor/layers/fla/ops/wy_fast.py create mode 100644 model_executor/layers/fused_moe/__init__.py create mode 100644 model_executor/layers/fused_moe/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/all2all_utils.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/batched_deep_gemm_moe.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/batched_triton_or_deep_gemm_moe.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/config.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/cpu_fused_moe.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/cutlass_moe.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/deep_gemm_moe.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/deep_gemm_utils.cpython-312.pyc create mode 100644 
model_executor/layers/fused_moe/__pycache__/deepep_ht_prepare_finalize.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/deepep_ll_prepare_finalize.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/flashinfer_cutlass_moe.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/flashinfer_cutlass_prepare_finalize.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/flashinfer_trtllm_moe.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/fused_batched_moe.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/fused_marlin_moe.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/fused_moe.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/fused_moe_method_base.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/fused_moe_modular_method.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/gpt_oss_triton_kernels_moe.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/layer.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/modular_kernel.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/moe_align_block_size.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/moe_pallas.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/moe_permute_unpermute.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/moe_torch_iterative.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/pplx_prepare_finalize.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/prepare_finalize.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/rocm_aiter_fused_moe.cpython-312.pyc create mode 100644 
model_executor/layers/fused_moe/__pycache__/routing_simulator.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/shared_fused_moe.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/topk_weight_and_reduce.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/triton_deep_gemm_moe.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/trtllm_moe.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/unquantized_fused_moe_method.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/__pycache__/utils.cpython-312.pyc create mode 100644 model_executor/layers/fused_moe/all2all_utils.py create mode 100644 model_executor/layers/fused_moe/batched_deep_gemm_moe.py create mode 100644 model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py create mode 100644 model_executor/layers/fused_moe/config.py create mode 100644 model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json create mode 100644 model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json create mode 100644 model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json create mode 100644 model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json create mode 100644 model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json create mode 100644 model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 
model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json create mode 100644 model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json create mode 100644 model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json create mode 100644 model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json create mode 100644 model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 
model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 
model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100755 model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json create mode 100644 model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json 
create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 
model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json create mode 100644 model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json create mode 100644 model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json create mode 100644 
model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json create mode 100644 model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json create mode 100644 model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 
model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 
model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json create mode 100644 model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json create mode 100644 
model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json create mode 100644 model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json create mode 100644 model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json create mode 100644 model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json create mode 100644 model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json create mode 100644 model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json create mode 100644 model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json create mode 100644 model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json create mode 100644 
model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json create mode 100644 model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json create mode 100644 model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json create mode 100644 model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json create mode 100644 model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json create mode 100644 model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json create mode 100644 model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json create mode 100644 model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json create mode 100644 model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json create mode 100644 model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json create mode 100644 model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json create mode 100644 model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json create mode 100644 model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json create mode 100644 
model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 
model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json create mode 100644 model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json create mode 100644 model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json create mode 100644 model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json create mode 100644 model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 
model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 
model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json create mode 100644 
model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json create mode 100644 
model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 model_executor/layers/fused_moe/configs/README create mode 100644 model_executor/layers/fused_moe/cpu_fused_moe.py create mode 100644 model_executor/layers/fused_moe/cutlass_moe.py create mode 100644 model_executor/layers/fused_moe/deep_gemm_moe.py create mode 100644 
model_executor/layers/fused_moe/deep_gemm_utils.py create mode 100644 model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py create mode 100644 model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py create mode 100644 model_executor/layers/fused_moe/flashinfer_cutlass_moe.py create mode 100644 model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py create mode 100644 model_executor/layers/fused_moe/flashinfer_trtllm_moe.py create mode 100644 model_executor/layers/fused_moe/fused_batched_moe.py create mode 100644 model_executor/layers/fused_moe/fused_marlin_moe.py create mode 100644 model_executor/layers/fused_moe/fused_moe.py create mode 100644 model_executor/layers/fused_moe/fused_moe_method_base.py create mode 100644 model_executor/layers/fused_moe/fused_moe_modular_method.py create mode 100644 model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py create mode 100644 model_executor/layers/fused_moe/layer.py create mode 100644 model_executor/layers/fused_moe/modular_kernel.py create mode 100644 model_executor/layers/fused_moe/moe_align_block_size.py create mode 100644 model_executor/layers/fused_moe/moe_pallas.py create mode 100644 model_executor/layers/fused_moe/moe_permute_unpermute.py create mode 100644 model_executor/layers/fused_moe/moe_torch_iterative.py create mode 100644 model_executor/layers/fused_moe/pplx_prepare_finalize.py create mode 100644 model_executor/layers/fused_moe/prepare_finalize.py create mode 100644 model_executor/layers/fused_moe/rocm_aiter_fused_moe.py create mode 100644 model_executor/layers/fused_moe/routing_simulator.py create mode 100644 model_executor/layers/fused_moe/shared_fused_moe.py create mode 100644 model_executor/layers/fused_moe/topk_weight_and_reduce.py create mode 100644 model_executor/layers/fused_moe/triton_deep_gemm_moe.py create mode 100644 model_executor/layers/fused_moe/trtllm_moe.py create mode 100644 model_executor/layers/fused_moe/unquantized_fused_moe_method.py create mode 
100644 model_executor/layers/fused_moe/utils.py create mode 100644 model_executor/layers/kda.py create mode 100644 model_executor/layers/layernorm.py create mode 100644 model_executor/layers/lightning_attn.py create mode 100644 model_executor/layers/linear.py create mode 100644 model_executor/layers/logits_processor.py create mode 100644 model_executor/layers/mamba/__init__.py create mode 100644 model_executor/layers/mamba/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/layers/mamba/__pycache__/abstract.cpython-312.pyc create mode 100644 model_executor/layers/mamba/__pycache__/linear_attn.cpython-312.pyc create mode 100644 model_executor/layers/mamba/__pycache__/mamba_mixer.cpython-312.pyc create mode 100644 model_executor/layers/mamba/__pycache__/mamba_mixer2.cpython-312.pyc create mode 100644 model_executor/layers/mamba/__pycache__/mamba_utils.cpython-312.pyc create mode 100644 model_executor/layers/mamba/__pycache__/short_conv.cpython-312.pyc create mode 100644 model_executor/layers/mamba/abstract.py create mode 100644 model_executor/layers/mamba/linear_attn.py create mode 100644 model_executor/layers/mamba/mamba_mixer.py create mode 100644 model_executor/layers/mamba/mamba_mixer2.py create mode 100644 model_executor/layers/mamba/mamba_utils.py create mode 100644 model_executor/layers/mamba/ops/__init__.py create mode 100644 model_executor/layers/mamba/ops/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/layers/mamba/ops/__pycache__/causal_conv1d.cpython-312.pyc create mode 100644 model_executor/layers/mamba/ops/__pycache__/layernorm_gated.cpython-312.pyc create mode 100644 model_executor/layers/mamba/ops/__pycache__/mamba_ssm.cpython-312.pyc create mode 100644 model_executor/layers/mamba/ops/__pycache__/ssd_bmm.cpython-312.pyc create mode 100644 model_executor/layers/mamba/ops/__pycache__/ssd_chunk_scan.cpython-312.pyc create mode 100644 model_executor/layers/mamba/ops/__pycache__/ssd_chunk_state.cpython-312.pyc 
create mode 100644 model_executor/layers/mamba/ops/__pycache__/ssd_combined.cpython-312.pyc create mode 100644 model_executor/layers/mamba/ops/__pycache__/ssd_state_passing.cpython-312.pyc create mode 100644 model_executor/layers/mamba/ops/causal_conv1d.py create mode 100644 model_executor/layers/mamba/ops/layernorm_gated.py create mode 100644 model_executor/layers/mamba/ops/mamba_ssm.py create mode 100644 model_executor/layers/mamba/ops/ssd_bmm.py create mode 100644 model_executor/layers/mamba/ops/ssd_chunk_scan.py create mode 100644 model_executor/layers/mamba/ops/ssd_chunk_state.py create mode 100644 model_executor/layers/mamba/ops/ssd_combined.py create mode 100644 model_executor/layers/mamba/ops/ssd_state_passing.py create mode 100644 model_executor/layers/mamba/short_conv.py create mode 100644 model_executor/layers/mla.py create mode 100644 model_executor/layers/pooler.py create mode 100644 model_executor/layers/quantization/__init__.py create mode 100644 model_executor/layers/quantization/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/auto_round.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/awq.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/awq_marlin.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/awq_triton.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/base_config.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/bitblas.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/bitsandbytes.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/deepspeedfp.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/experts_int8.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/fbgemm_fp8.cpython-312.pyc create mode 100644 
model_executor/layers/quantization/__pycache__/fp8.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/fp_quant.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/gguf.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/gptq.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/gptq_bitblas.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/gptq_marlin.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/gptq_marlin_24.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/hqq_marlin.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/inc.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/input_quant_fp8.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/ipex_quant.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/kv_cache.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/modelopt.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/moe_wna16.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/mxfp4.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/petit.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/ptpc_fp8.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/qutlass_utils.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/rtn.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/schema.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/torchao.cpython-312.pyc create mode 100644 model_executor/layers/quantization/__pycache__/tpu_int8.cpython-312.pyc create mode 100644 
model_executor/layers/quantization/__pycache__/w8a16.cpython-312.pyc create mode 100644 model_executor/layers/quantization/auto_round.py create mode 100644 model_executor/layers/quantization/awq.py create mode 100644 model_executor/layers/quantization/awq_marlin.py create mode 100644 model_executor/layers/quantization/awq_triton.py create mode 100644 model_executor/layers/quantization/base_config.py create mode 100644 model_executor/layers/quantization/bitblas.py create mode 100644 model_executor/layers/quantization/bitsandbytes.py create mode 100644 model_executor/layers/quantization/compressed_tensors/__init__.py create mode 100644 model_executor/layers/quantization/compressed_tensors/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors.cpython-312.pyc create mode 100644 model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors_moe.cpython-312.pyc create mode 100644 model_executor/layers/quantization/compressed_tensors/__pycache__/triton_scaled_mm.cpython-312.pyc create mode 100644 model_executor/layers/quantization/compressed_tensors/__pycache__/utils.cpython-312.pyc create mode 100644 model_executor/layers/quantization/compressed_tensors/compressed_tensors.py create mode 100644 model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/__init__.py create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_24.cpython-312.pyc create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_scheme.cpython-312.pyc create mode 100644 
model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a16_24.cpython-312.pyc create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a16_nvfp4.cpython-312.pyc create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a4_nvfp4.cpython-312.pyc create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a8_fp8.cpython-312.pyc create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a8_int.cpython-312.pyc create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a16_fp8.cpython-312.pyc create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_fp8.cpython-312.pyc create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_int8.cpython-312.pyc create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_wNa16.cpython-312.pyc create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py 
create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py create mode 100644 model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py create mode 100644 model_executor/layers/quantization/compressed_tensors/transform/__init__.py create mode 100644 model_executor/layers/quantization/compressed_tensors/transform/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/layers/quantization/compressed_tensors/transform/__pycache__/linear.cpython-312.pyc create mode 100644 model_executor/layers/quantization/compressed_tensors/transform/__pycache__/module.cpython-312.pyc create mode 100644 model_executor/layers/quantization/compressed_tensors/transform/__pycache__/utils.cpython-312.pyc create mode 100644 model_executor/layers/quantization/compressed_tensors/transform/linear.py create mode 100644 model_executor/layers/quantization/compressed_tensors/transform/module.py create mode 100644 model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py create mode 100644 model_executor/layers/quantization/compressed_tensors/transform/schemes/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/layers/quantization/compressed_tensors/transform/schemes/__pycache__/linear_qutlass_nvfp4.cpython-312.pyc create mode 100644 model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py create mode 100644 model_executor/layers/quantization/compressed_tensors/transform/utils.py create mode 100644 model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py create mode 100644 model_executor/layers/quantization/compressed_tensors/utils.py create mode 100644 
model_executor/layers/quantization/deepspeedfp.py create mode 100644 model_executor/layers/quantization/experts_int8.py create mode 100644 model_executor/layers/quantization/fbgemm_fp8.py create mode 100644 model_executor/layers/quantization/fp8.py create mode 100644 model_executor/layers/quantization/fp_quant.py create mode 100644 model_executor/layers/quantization/gguf.py create mode 100644 model_executor/layers/quantization/gptq.py create mode 100644 model_executor/layers/quantization/gptq_bitblas.py create mode 100644 model_executor/layers/quantization/gptq_marlin.py create mode 100644 model_executor/layers/quantization/gptq_marlin_24.py create mode 100644 model_executor/layers/quantization/hqq_marlin.py create mode 100644 model_executor/layers/quantization/inc.py create mode 100644 model_executor/layers/quantization/input_quant_fp8.py create mode 100644 model_executor/layers/quantization/ipex_quant.py create mode 100644 model_executor/layers/quantization/kernels/__init__.py create mode 100644 model_executor/layers/quantization/kernels/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py create mode 100644 model_executor/layers/quantization/kernels/mixed_precision/__init__.py create mode 100644 model_executor/layers/quantization/kernels/mixed_precision/__pycache__/MPLinearKernel.cpython-312.pyc create mode 100644 model_executor/layers/quantization/kernels/mixed_precision/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/layers/quantization/kernels/mixed_precision/__pycache__/allspark.cpython-312.pyc create mode 100644 model_executor/layers/quantization/kernels/mixed_precision/__pycache__/bitblas.cpython-312.pyc create mode 100644 model_executor/layers/quantization/kernels/mixed_precision/__pycache__/conch.cpython-312.pyc create mode 100644 model_executor/layers/quantization/kernels/mixed_precision/__pycache__/cutlass.cpython-312.pyc create mode 100644 
model_executor/layers/quantization/kernels/mixed_precision/__pycache__/dynamic_4bit.cpython-312.pyc create mode 100644 model_executor/layers/quantization/kernels/mixed_precision/__pycache__/exllama.cpython-312.pyc create mode 100644 model_executor/layers/quantization/kernels/mixed_precision/__pycache__/machete.cpython-312.pyc create mode 100644 model_executor/layers/quantization/kernels/mixed_precision/__pycache__/marlin.cpython-312.pyc create mode 100644 model_executor/layers/quantization/kernels/mixed_precision/allspark.py create mode 100644 model_executor/layers/quantization/kernels/mixed_precision/bitblas.py create mode 100644 model_executor/layers/quantization/kernels/mixed_precision/conch.py create mode 100644 model_executor/layers/quantization/kernels/mixed_precision/cutlass.py create mode 100644 model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py create mode 100644 model_executor/layers/quantization/kernels/mixed_precision/exllama.py create mode 100644 model_executor/layers/quantization/kernels/mixed_precision/machete.py create mode 100644 model_executor/layers/quantization/kernels/mixed_precision/marlin.py create mode 100644 model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py create mode 100644 model_executor/layers/quantization/kernels/scaled_mm/__init__.py create mode 100644 model_executor/layers/quantization/kernels/scaled_mm/__pycache__/ScaledMMLinearKernel.cpython-312.pyc create mode 100644 model_executor/layers/quantization/kernels/scaled_mm/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/layers/quantization/kernels/scaled_mm/__pycache__/aiter.cpython-312.pyc create mode 100644 model_executor/layers/quantization/kernels/scaled_mm/__pycache__/cpu.cpython-312.pyc create mode 100644 model_executor/layers/quantization/kernels/scaled_mm/__pycache__/cutlass.cpython-312.pyc create mode 100644 model_executor/layers/quantization/kernels/scaled_mm/__pycache__/triton.cpython-312.pyc create 
mode 100644 model_executor/layers/quantization/kernels/scaled_mm/__pycache__/xla.cpython-312.pyc create mode 100644 model_executor/layers/quantization/kernels/scaled_mm/aiter.py create mode 100644 model_executor/layers/quantization/kernels/scaled_mm/cpu.py create mode 100644 model_executor/layers/quantization/kernels/scaled_mm/cutlass.py create mode 100644 model_executor/layers/quantization/kernels/scaled_mm/triton.py create mode 100644 model_executor/layers/quantization/kernels/scaled_mm/xla.py create mode 100644 model_executor/layers/quantization/kv_cache.py create mode 100644 model_executor/layers/quantization/modelopt.py create mode 100644 model_executor/layers/quantization/moe_wna16.py create mode 100644 model_executor/layers/quantization/mxfp4.py create mode 100644 model_executor/layers/quantization/petit.py create mode 100644 model_executor/layers/quantization/ptpc_fp8.py create mode 100644 model_executor/layers/quantization/quark/__init__.py create mode 100644 model_executor/layers/quantization/quark/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/layers/quantization/quark/__pycache__/quark.cpython-312.pyc create mode 100644 model_executor/layers/quantization/quark/__pycache__/quark_moe.cpython-312.pyc create mode 100644 model_executor/layers/quantization/quark/__pycache__/utils.cpython-312.pyc create mode 100644 model_executor/layers/quantization/quark/quark.py create mode 100644 model_executor/layers/quantization/quark/quark_moe.py create mode 100644 model_executor/layers/quantization/quark/schemes/__init__.py create mode 100644 model_executor/layers/quantization/quark/schemes/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/layers/quantization/quark/schemes/__pycache__/quark_ocp_mx.cpython-312.pyc create mode 100644 model_executor/layers/quantization/quark/schemes/__pycache__/quark_scheme.cpython-312.pyc create mode 100644 
model_executor/layers/quantization/quark/schemes/__pycache__/quark_w8a8_fp8.cpython-312.pyc create mode 100644 model_executor/layers/quantization/quark/schemes/__pycache__/quark_w8a8_int8.cpython-312.pyc create mode 100644 model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py create mode 100644 model_executor/layers/quantization/quark/schemes/quark_scheme.py create mode 100644 model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py create mode 100644 model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py create mode 100644 model_executor/layers/quantization/quark/utils.py create mode 100644 model_executor/layers/quantization/qutlass_utils.py create mode 100644 model_executor/layers/quantization/rtn.py create mode 100644 model_executor/layers/quantization/schema.py create mode 100644 model_executor/layers/quantization/torchao.py create mode 100644 model_executor/layers/quantization/tpu_int8.py create mode 100644 model_executor/layers/quantization/utils/__init__.py create mode 100644 model_executor/layers/quantization/utils/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/allspark_utils.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/bitblas_utils.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/flashinfer_fp4_moe.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/flashinfer_utils.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/fp8_utils.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/gguf_utils.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/gptq_utils.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/int8_utils.cpython-312.pyc create mode 100644 
model_executor/layers/quantization/utils/__pycache__/layer_utils.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/machete_utils.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/marlin_utils.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/marlin_utils_fp4.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/marlin_utils_fp8.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/marlin_utils_test.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_24.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/mxfp4_utils.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/mxfp6_utils.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/mxfp8_utils.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/nvfp4_emulation_utils.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/nvfp4_moe_support.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/ocp_mx_utils.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/petit_utils.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/quant_utils.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/__pycache__/w8a8_utils.cpython-312.pyc create mode 100644 model_executor/layers/quantization/utils/allspark_utils.py create mode 100644 model_executor/layers/quantization/utils/bitblas_utils.py create mode 100644 model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 
model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 
model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 
100644 model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 
model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 
100644 model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create 
mode 100644 model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 
model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 
model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 
model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 
model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 
model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create 
mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json 
create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create 
mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json 
create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create 
mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 
model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 model_executor/layers/quantization/utils/configs/README.md create mode 100644 model_executor/layers/quantization/utils/flashinfer_fp4_moe.py create mode 100644 model_executor/layers/quantization/utils/flashinfer_utils.py create mode 100644 model_executor/layers/quantization/utils/fp8_utils.py create mode 100644 model_executor/layers/quantization/utils/gguf_utils.py create mode 100644 model_executor/layers/quantization/utils/gptq_utils.py create mode 100644 model_executor/layers/quantization/utils/int8_utils.py create mode 100644 model_executor/layers/quantization/utils/layer_utils.py create mode 100644 model_executor/layers/quantization/utils/machete_utils.py create mode 100644 model_executor/layers/quantization/utils/marlin_utils.py create mode 100644 model_executor/layers/quantization/utils/marlin_utils_fp4.py create mode 100644 model_executor/layers/quantization/utils/marlin_utils_fp8.py create mode 100644 model_executor/layers/quantization/utils/marlin_utils_test.py create mode 100644 model_executor/layers/quantization/utils/marlin_utils_test_24.py create mode 100644 model_executor/layers/quantization/utils/mxfp4_utils.py create mode 100644 model_executor/layers/quantization/utils/mxfp6_utils.py create mode 100644 model_executor/layers/quantization/utils/mxfp8_utils.py create mode 100644 model_executor/layers/quantization/utils/nvfp4_emulation_utils.py create mode 100644 model_executor/layers/quantization/utils/nvfp4_moe_support.py create mode 100644 model_executor/layers/quantization/utils/ocp_mx_utils.py create mode 100644 model_executor/layers/quantization/utils/petit_utils.py create mode 100644 
model_executor/layers/quantization/utils/quant_utils.py create mode 100644 model_executor/layers/quantization/utils/w8a8_utils.py create mode 100644 model_executor/layers/quantization/w8a16.py create mode 100644 model_executor/layers/resampler.py create mode 100644 model_executor/layers/rotary_embedding/__init__.py create mode 100644 model_executor/layers/rotary_embedding/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/layers/rotary_embedding/__pycache__/base.cpython-312.pyc create mode 100644 model_executor/layers/rotary_embedding/__pycache__/common.cpython-312.pyc create mode 100644 model_executor/layers/rotary_embedding/__pycache__/deepseek_scaling_rope.cpython-312.pyc create mode 100644 model_executor/layers/rotary_embedding/__pycache__/dual_chunk_rope.cpython-312.pyc create mode 100644 model_executor/layers/rotary_embedding/__pycache__/dynamic_ntk_alpha_rope.cpython-312.pyc create mode 100644 model_executor/layers/rotary_embedding/__pycache__/dynamic_ntk_scaling_rope.cpython-312.pyc create mode 100644 model_executor/layers/rotary_embedding/__pycache__/ernie45_vl_rope.cpython-312.pyc create mode 100644 model_executor/layers/rotary_embedding/__pycache__/linear_scaling_rope.cpython-312.pyc create mode 100644 model_executor/layers/rotary_embedding/__pycache__/llama3_rope.cpython-312.pyc create mode 100644 model_executor/layers/rotary_embedding/__pycache__/llama4_vision_rope.cpython-312.pyc create mode 100644 model_executor/layers/rotary_embedding/__pycache__/mrope.cpython-312.pyc create mode 100644 model_executor/layers/rotary_embedding/__pycache__/ntk_scaling_rope.cpython-312.pyc create mode 100644 model_executor/layers/rotary_embedding/__pycache__/phi3_long_rope_scaled_rope.cpython-312.pyc create mode 100644 model_executor/layers/rotary_embedding/__pycache__/yarn_scaling_rope.cpython-312.pyc create mode 100644 model_executor/layers/rotary_embedding/base.py create mode 100644 model_executor/layers/rotary_embedding/common.py create mode 
100644 model_executor/layers/rotary_embedding/deepseek_scaling_rope.py create mode 100644 model_executor/layers/rotary_embedding/dual_chunk_rope.py create mode 100644 model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py create mode 100644 model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py create mode 100644 model_executor/layers/rotary_embedding/ernie45_vl_rope.py create mode 100644 model_executor/layers/rotary_embedding/linear_scaling_rope.py create mode 100644 model_executor/layers/rotary_embedding/llama3_rope.py create mode 100644 model_executor/layers/rotary_embedding/llama4_vision_rope.py create mode 100644 model_executor/layers/rotary_embedding/mrope.py create mode 100644 model_executor/layers/rotary_embedding/ntk_scaling_rope.py create mode 100644 model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py create mode 100644 model_executor/layers/rotary_embedding/yarn_scaling_rope.py create mode 100644 model_executor/layers/shared_fused_moe/__pycache__/shared_fused_moe.cpython-312.pyc create mode 100644 model_executor/layers/shared_fused_moe/shared_fused_moe.py create mode 100644 model_executor/layers/utils.py create mode 100644 model_executor/layers/vocab_parallel_embedding.py create mode 100644 model_executor/model_loader/__init__.py create mode 100644 model_executor/model_loader/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/model_loader/__pycache__/base_loader.cpython-312.pyc create mode 100644 model_executor/model_loader/__pycache__/bitsandbytes_loader.cpython-312.pyc create mode 100644 model_executor/model_loader/__pycache__/default_loader.cpython-312.pyc create mode 100644 model_executor/model_loader/__pycache__/dummy_loader.cpython-312.pyc create mode 100644 model_executor/model_loader/__pycache__/gguf_loader.cpython-312.pyc create mode 100644 model_executor/model_loader/__pycache__/online_quantization.cpython-312.pyc create mode 100644 
model_executor/model_loader/__pycache__/runai_streamer_loader.cpython-312.pyc create mode 100644 model_executor/model_loader/__pycache__/sharded_state_loader.cpython-312.pyc create mode 100644 model_executor/model_loader/__pycache__/tensorizer.cpython-312.pyc create mode 100644 model_executor/model_loader/__pycache__/tensorizer_loader.cpython-312.pyc create mode 100644 model_executor/model_loader/__pycache__/tpu.cpython-312.pyc create mode 100644 model_executor/model_loader/__pycache__/utils.cpython-312.pyc create mode 100644 model_executor/model_loader/__pycache__/weight_utils.cpython-312.pyc create mode 100644 model_executor/model_loader/base_loader.py create mode 100644 model_executor/model_loader/bitsandbytes_loader.py create mode 100644 model_executor/model_loader/default_loader.py create mode 100644 model_executor/model_loader/dummy_loader.py create mode 100644 model_executor/model_loader/gguf_loader.py create mode 100644 model_executor/model_loader/online_quantization.py create mode 100644 model_executor/model_loader/runai_streamer_loader.py create mode 100644 model_executor/model_loader/sharded_state_loader.py create mode 100644 model_executor/model_loader/tensorizer.py create mode 100644 model_executor/model_loader/tensorizer_loader.py create mode 100644 model_executor/model_loader/tpu.py create mode 100644 model_executor/model_loader/utils.py create mode 100644 model_executor/model_loader/weight_utils.py create mode 100644 model_executor/models/__init__.py create mode 100644 model_executor/models/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/adapters.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/afmoe.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/aimv2.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/apertus.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/arcee.cpython-312.pyc create mode 100644 
model_executor/models/__pycache__/arctic.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/aria.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/aya_vision.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/baichuan.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/bailing_moe.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/bamba.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/bee.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/bert.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/bert_with_rope.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/blip.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/blip2.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/bloom.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/chameleon.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/chatglm.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/clip.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/cohere2_vision.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/commandr.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/config.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/dbrx.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/deepencoder.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/deepseek_eagle.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/deepseek_mtp.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/deepseek_ocr.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/deepseek_v2.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/deepseek_vl2.cpython-312.pyc create mode 100644 
model_executor/models/__pycache__/dots1.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/dots_ocr.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/ernie45.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/ernie45_moe.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/ernie45_vl.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/ernie45_vl_moe.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/ernie_mtp.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/exaone.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/exaone4.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/fairseq2_llama.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/falcon.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/falcon_h1.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/flex_olmo.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/fuyu.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/gemma.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/gemma2.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/gemma3.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/gemma3_mm.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/gemma3n.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/gemma3n_mm.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/glm.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/glm4.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/glm4_1v.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/glm4_moe.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/glm4_moe_mtp.cpython-312.pyc create mode 100644 
model_executor/models/__pycache__/glm4v.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/gpt2.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/gpt_bigcode.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/gpt_j.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/gpt_neox.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/gpt_oss.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/granite.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/granite_speech.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/granitemoe.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/granitemoehybrid.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/granitemoeshared.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/gritlm.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/grok1.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/h2ovl.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/hunyuan_v1.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/hyperclovax_vision.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/idefics2_vision_model.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/idefics3.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/interfaces.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/interfaces_base.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/intern_vit.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/internlm2.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/internlm2_ve.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/interns1.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/interns1_vit.cpython-312.pyc create mode 
100644 model_executor/models/__pycache__/internvl.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/jais.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/jamba.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/jina_vl.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/keye.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/keye_vl1_5.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/kimi_linear.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/kimi_vl.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/lfm2.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/lfm2_moe.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/lightonocr.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/llama.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/llama4.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/llama4_eagle.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/llama_eagle.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/llama_eagle3.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/llava.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/llava_next.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/llava_next_video.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/llava_onevision.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/longcat_flash.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/longcat_flash_mtp.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/mamba.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/mamba2.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/medusa.cpython-312.pyc create mode 100644 
model_executor/models/__pycache__/midashenglm.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/mimo.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/mimo_mtp.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/minicpm.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/minicpm3.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/minicpm_eagle.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/minicpmo.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/minicpmv.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/minimax_m2.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/minimax_text_01.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/minimax_vl_01.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/mistral3.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/mixtral.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/mllama4.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/mlp_speculator.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/modernbert.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/module_mapping.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/molmo.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/moonvit.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/mpt.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/nano_nemotron_vl.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/nemotron.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/nemotron_h.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/nemotron_nas.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/nemotron_vl.cpython-312.pyc create mode 100644 
model_executor/models/__pycache__/nvlm_d.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/olmo.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/olmo2.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/olmoe.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/openpangu.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/openpangu_mtp.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/opt.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/orion.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/ouro.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/ovis.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/ovis2_5.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/paddleocr_vl.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/paligemma.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/persimmon.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/phi.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/phi3.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/phi3v.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/phi4_multimodal.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/phi4mm.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/phi4mm_audio.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/phi4mm_utils.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/phimoe.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/pixtral.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/plamo2.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/qwen.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/qwen2.cpython-312.pyc create mode 100644 
model_executor/models/__pycache__/qwen2_5_omni_thinker.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/qwen2_5_vl.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/qwen2_audio.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/qwen2_moe.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/qwen2_rm.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/qwen2_vl.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/qwen3.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/qwen3_moe.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/qwen3_next.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/qwen3_next_mtp.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/qwen3_omni_moe_thinker.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/qwen3_vl.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/qwen3_vl_moe.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/qwen_vl.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/radio.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/registry.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/roberta.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/rvl.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/seed_oss.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/siglip.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/siglip2navit.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/skyworkr1v.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/smolvlm.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/solar.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/stablelm.cpython-312.pyc create mode 100644 
model_executor/models/__pycache__/starcoder2.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/step3_text.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/step3_vl.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/swin.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/tarsier.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/telechat2.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/teleflm.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/terratorch.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/ultravox.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/utils.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/vision.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/voxtral.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/whisper.cpython-312.pyc create mode 100644 model_executor/models/__pycache__/zamba2.cpython-312.pyc create mode 100644 model_executor/models/adapters.py create mode 100644 model_executor/models/afmoe.py create mode 100644 model_executor/models/aimv2.py create mode 100644 model_executor/models/apertus.py create mode 100644 model_executor/models/arcee.py create mode 100644 model_executor/models/arctic.py create mode 100644 model_executor/models/aria.py create mode 100644 model_executor/models/aya_vision.py create mode 100644 model_executor/models/baichuan.py create mode 100644 model_executor/models/bailing_moe.py create mode 100644 model_executor/models/bamba.py create mode 100644 model_executor/models/bee.py create mode 100644 model_executor/models/bert.py create mode 100644 model_executor/models/bert_with_rope.py create mode 100644 model_executor/models/blip.py create mode 100644 model_executor/models/blip2.py create mode 100644 model_executor/models/bloom.py create mode 100644 model_executor/models/chameleon.py create 
mode 100644 model_executor/models/chatglm.py create mode 100644 model_executor/models/clip.py create mode 100644 model_executor/models/cohere2_vision.py create mode 100644 model_executor/models/commandr.py create mode 100644 model_executor/models/config.py create mode 100644 model_executor/models/dbrx.py create mode 100644 model_executor/models/deepencoder.py create mode 100644 model_executor/models/deepseek_eagle.py create mode 100644 model_executor/models/deepseek_mtp.py create mode 100644 model_executor/models/deepseek_ocr.py create mode 100644 model_executor/models/deepseek_v2.py create mode 100644 model_executor/models/deepseek_vl2.py create mode 100644 model_executor/models/dots1.py create mode 100644 model_executor/models/dots_ocr.py create mode 100644 model_executor/models/ernie45.py create mode 100644 model_executor/models/ernie45_moe.py create mode 100644 model_executor/models/ernie45_vl.py create mode 100644 model_executor/models/ernie45_vl_moe.py create mode 100644 model_executor/models/ernie_mtp.py create mode 100644 model_executor/models/exaone.py create mode 100644 model_executor/models/exaone4.py create mode 100644 model_executor/models/fairseq2_llama.py create mode 100644 model_executor/models/falcon.py create mode 100644 model_executor/models/falcon_h1.py create mode 100644 model_executor/models/flex_olmo.py create mode 100644 model_executor/models/fuyu.py create mode 100644 model_executor/models/gemma.py create mode 100644 model_executor/models/gemma2.py create mode 100644 model_executor/models/gemma3.py create mode 100644 model_executor/models/gemma3_mm.py create mode 100644 model_executor/models/gemma3n.py create mode 100644 model_executor/models/gemma3n_mm.py create mode 100644 model_executor/models/glm.py create mode 100644 model_executor/models/glm4.py create mode 100644 model_executor/models/glm4_1v.py create mode 100644 model_executor/models/glm4_moe.py create mode 100644 model_executor/models/glm4_moe_mtp.py create mode 100644 
model_executor/models/glm4v.py create mode 100644 model_executor/models/gpt2.py create mode 100644 model_executor/models/gpt_bigcode.py create mode 100644 model_executor/models/gpt_j.py create mode 100644 model_executor/models/gpt_neox.py create mode 100644 model_executor/models/gpt_oss.py create mode 100644 model_executor/models/granite.py create mode 100644 model_executor/models/granite_speech.py create mode 100644 model_executor/models/granitemoe.py create mode 100644 model_executor/models/granitemoehybrid.py create mode 100644 model_executor/models/granitemoeshared.py create mode 100644 model_executor/models/gritlm.py create mode 100644 model_executor/models/grok1.py create mode 100644 model_executor/models/h2ovl.py create mode 100644 model_executor/models/hunyuan_v1.py create mode 100644 model_executor/models/hyperclovax_vision.py create mode 100644 model_executor/models/idefics2_vision_model.py create mode 100644 model_executor/models/idefics3.py create mode 100644 model_executor/models/interfaces.py create mode 100644 model_executor/models/interfaces_base.py create mode 100644 model_executor/models/intern_vit.py create mode 100644 model_executor/models/internlm2.py create mode 100644 model_executor/models/internlm2_ve.py create mode 100644 model_executor/models/interns1.py create mode 100644 model_executor/models/interns1_vit.py create mode 100644 model_executor/models/internvl.py create mode 100644 model_executor/models/jais.py create mode 100644 model_executor/models/jamba.py create mode 100644 model_executor/models/jina_vl.py create mode 100644 model_executor/models/keye.py create mode 100644 model_executor/models/keye_vl1_5.py create mode 100644 model_executor/models/kimi_linear.py create mode 100644 model_executor/models/kimi_vl.py create mode 100644 model_executor/models/lfm2.py create mode 100644 model_executor/models/lfm2_moe.py create mode 100644 model_executor/models/lightonocr.py create mode 100644 model_executor/models/llama.py create mode 100644 
model_executor/models/llama4.py create mode 100644 model_executor/models/llama4_eagle.py create mode 100644 model_executor/models/llama_eagle.py create mode 100644 model_executor/models/llama_eagle3.py create mode 100644 model_executor/models/llava.py create mode 100644 model_executor/models/llava_next.py create mode 100644 model_executor/models/llava_next_video.py create mode 100644 model_executor/models/llava_onevision.py create mode 100644 model_executor/models/longcat_flash.py create mode 100644 model_executor/models/longcat_flash_mtp.py create mode 100644 model_executor/models/mamba.py create mode 100644 model_executor/models/mamba2.py create mode 100644 model_executor/models/medusa.py create mode 100644 model_executor/models/midashenglm.py create mode 100644 model_executor/models/mimo.py create mode 100644 model_executor/models/mimo_mtp.py create mode 100644 model_executor/models/minicpm.py create mode 100644 model_executor/models/minicpm3.py create mode 100644 model_executor/models/minicpm_eagle.py create mode 100644 model_executor/models/minicpmo.py create mode 100644 model_executor/models/minicpmv.py create mode 100644 model_executor/models/minimax_m2.py create mode 100644 model_executor/models/minimax_text_01.py create mode 100644 model_executor/models/minimax_vl_01.py create mode 100644 model_executor/models/mistral3.py create mode 100644 model_executor/models/mixtral.py create mode 100644 model_executor/models/mllama4.py create mode 100644 model_executor/models/mlp_speculator.py create mode 100644 model_executor/models/modernbert.py create mode 100644 model_executor/models/module_mapping.py create mode 100644 model_executor/models/molmo.py create mode 100644 model_executor/models/moonvit.py create mode 100644 model_executor/models/mpt.py create mode 100644 model_executor/models/nano_nemotron_vl.py create mode 100644 model_executor/models/nemotron.py create mode 100644 model_executor/models/nemotron_h.py create mode 100644 
model_executor/models/nemotron_nas.py create mode 100644 model_executor/models/nemotron_vl.py create mode 100644 model_executor/models/nvlm_d.py create mode 100644 model_executor/models/olmo.py create mode 100644 model_executor/models/olmo2.py create mode 100644 model_executor/models/olmoe.py create mode 100644 model_executor/models/openpangu.py create mode 100644 model_executor/models/openpangu_mtp.py create mode 100644 model_executor/models/opt.py create mode 100644 model_executor/models/orion.py create mode 100644 model_executor/models/ouro.py create mode 100644 model_executor/models/ovis.py create mode 100644 model_executor/models/ovis2_5.py create mode 100644 model_executor/models/paddleocr_vl.py create mode 100644 model_executor/models/paligemma.py create mode 100644 model_executor/models/persimmon.py create mode 100644 model_executor/models/phi.py create mode 100644 model_executor/models/phi3.py create mode 100644 model_executor/models/phi3v.py create mode 100644 model_executor/models/phi4_multimodal.py create mode 100644 model_executor/models/phi4mm.py create mode 100644 model_executor/models/phi4mm_audio.py create mode 100644 model_executor/models/phi4mm_utils.py create mode 100644 model_executor/models/phimoe.py create mode 100644 model_executor/models/pixtral.py create mode 100644 model_executor/models/plamo2.py create mode 100644 model_executor/models/qwen.py create mode 100644 model_executor/models/qwen2.py create mode 100644 model_executor/models/qwen2_5_omni_thinker.py create mode 100644 model_executor/models/qwen2_5_vl.py create mode 100644 model_executor/models/qwen2_audio.py create mode 100644 model_executor/models/qwen2_moe.py create mode 100644 model_executor/models/qwen2_rm.py create mode 100644 model_executor/models/qwen2_vl.py create mode 100644 model_executor/models/qwen3.py create mode 100644 model_executor/models/qwen3_moe.py create mode 100644 model_executor/models/qwen3_next.py create mode 100644 model_executor/models/qwen3_next_mtp.py 
create mode 100644 model_executor/models/qwen3_omni_moe_thinker.py create mode 100644 model_executor/models/qwen3_vl.py create mode 100644 model_executor/models/qwen3_vl_moe.py create mode 100644 model_executor/models/qwen_vl.py create mode 100644 model_executor/models/radio.py create mode 100644 model_executor/models/registry.py create mode 100644 model_executor/models/roberta.py create mode 100644 model_executor/models/rvl.py create mode 100644 model_executor/models/seed_oss.py create mode 100644 model_executor/models/siglip.py create mode 100644 model_executor/models/siglip2navit.py create mode 100644 model_executor/models/skyworkr1v.py create mode 100644 model_executor/models/smolvlm.py create mode 100644 model_executor/models/solar.py create mode 100644 model_executor/models/stablelm.py create mode 100644 model_executor/models/starcoder2.py create mode 100644 model_executor/models/step3_text.py create mode 100644 model_executor/models/step3_vl.py create mode 100644 model_executor/models/swin.py create mode 100644 model_executor/models/tarsier.py create mode 100644 model_executor/models/telechat2.py create mode 100644 model_executor/models/teleflm.py create mode 100644 model_executor/models/terratorch.py create mode 100644 model_executor/models/transformers/__init__.py create mode 100644 model_executor/models/transformers/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/models/transformers/__pycache__/base.cpython-312.pyc create mode 100644 model_executor/models/transformers/__pycache__/causal.cpython-312.pyc create mode 100644 model_executor/models/transformers/__pycache__/legacy.cpython-312.pyc create mode 100644 model_executor/models/transformers/__pycache__/moe.cpython-312.pyc create mode 100644 model_executor/models/transformers/__pycache__/multimodal.cpython-312.pyc create mode 100644 model_executor/models/transformers/__pycache__/pooling.cpython-312.pyc create mode 100644 
model_executor/models/transformers/__pycache__/utils.cpython-312.pyc create mode 100644 model_executor/models/transformers/base.py create mode 100644 model_executor/models/transformers/causal.py create mode 100644 model_executor/models/transformers/legacy.py create mode 100644 model_executor/models/transformers/moe.py create mode 100644 model_executor/models/transformers/multimodal.py create mode 100644 model_executor/models/transformers/pooling.py create mode 100644 model_executor/models/transformers/utils.py create mode 100644 model_executor/models/ultravox.py create mode 100644 model_executor/models/utils.py create mode 100644 model_executor/models/vision.py create mode 100644 model_executor/models/voxtral.py create mode 100644 model_executor/models/whisper.py create mode 100644 model_executor/models/zamba2.py create mode 100644 model_executor/parameter.py create mode 100644 model_executor/utils.py create mode 100644 model_executor/warmup/__init__.py create mode 100644 model_executor/warmup/__pycache__/__init__.cpython-312.pyc create mode 100644 model_executor/warmup/__pycache__/deep_gemm_warmup.cpython-312.pyc create mode 100644 model_executor/warmup/__pycache__/kernel_warmup.cpython-312.pyc create mode 100644 model_executor/warmup/deep_gemm_warmup.py create mode 100644 model_executor/warmup/kernel_warmup.py create mode 100644 multimodal/__init__.py create mode 100644 multimodal/__pycache__/__init__.cpython-312.pyc create mode 100644 multimodal/__pycache__/audio.cpython-312.pyc create mode 100644 multimodal/__pycache__/base.cpython-312.pyc create mode 100644 multimodal/__pycache__/cache.cpython-312.pyc create mode 100644 multimodal/__pycache__/evs.cpython-312.pyc create mode 100644 multimodal/__pycache__/hasher.cpython-312.pyc create mode 100644 multimodal/__pycache__/image.cpython-312.pyc create mode 100644 multimodal/__pycache__/inputs.cpython-312.pyc create mode 100644 multimodal/__pycache__/parse.cpython-312.pyc create mode 100644 
multimodal/__pycache__/processing.cpython-312.pyc create mode 100644 multimodal/__pycache__/profiling.cpython-312.pyc create mode 100644 multimodal/__pycache__/registry.cpython-312.pyc create mode 100644 multimodal/__pycache__/utils.cpython-312.pyc create mode 100644 multimodal/__pycache__/video.cpython-312.pyc create mode 100644 multimodal/audio.py create mode 100644 multimodal/base.py create mode 100644 multimodal/cache.py create mode 100644 multimodal/evs.py create mode 100644 multimodal/hasher.py create mode 100644 multimodal/image.py create mode 100644 multimodal/inputs.py create mode 100644 multimodal/parse.py create mode 100644 multimodal/processing.py create mode 100644 multimodal/profiling.py create mode 100644 multimodal/registry.py create mode 100644 multimodal/utils.py create mode 100644 multimodal/video.py create mode 100644 outputs.py create mode 100644 platforms/__init__.py create mode 100644 platforms/__pycache__/__init__.cpython-312.pyc create mode 100644 platforms/__pycache__/cpu.cpython-312.pyc create mode 100644 platforms/__pycache__/cuda.cpython-312.pyc create mode 100644 platforms/__pycache__/interface.cpython-312.pyc create mode 100644 platforms/__pycache__/rocm.cpython-312.pyc create mode 100644 platforms/__pycache__/tpu.cpython-312.pyc create mode 100644 platforms/__pycache__/xpu.cpython-312.pyc create mode 100644 platforms/cpu.py create mode 100644 platforms/cuda.py create mode 100644 platforms/interface.py create mode 100644 platforms/rocm.py create mode 100644 platforms/tpu.py create mode 100644 platforms/xpu.py create mode 100644 plugins/__init__.py create mode 100644 plugins/__pycache__/__init__.cpython-312.pyc create mode 100644 plugins/io_processors/__init__.py create mode 100644 plugins/io_processors/__pycache__/__init__.cpython-312.pyc create mode 100644 plugins/io_processors/__pycache__/interface.cpython-312.pyc create mode 100644 plugins/io_processors/interface.py create mode 100644 plugins/lora_resolvers/__init__.py create mode 
100644 plugins/lora_resolvers/__pycache__/__init__.cpython-312.pyc create mode 100644 plugins/lora_resolvers/__pycache__/filesystem_resolver.cpython-312.pyc create mode 100644 plugins/lora_resolvers/filesystem_resolver.py create mode 100644 pooling_params.py create mode 100644 profiler/__init__.py create mode 100644 profiler/__pycache__/__init__.cpython-312.pyc create mode 100644 profiler/__pycache__/gpu_profiler.cpython-312.pyc create mode 100644 profiler/__pycache__/layerwise_profile.cpython-312.pyc create mode 100644 profiler/__pycache__/utils.cpython-312.pyc create mode 100644 profiler/gpu_profiler.py create mode 100644 profiler/layerwise_profile.py create mode 100644 profiler/utils.py create mode 100644 py.typed create mode 100644 ray/__init__.py create mode 100644 ray/__pycache__/__init__.cpython-312.pyc create mode 100644 ray/__pycache__/lazy_utils.cpython-312.pyc create mode 100644 ray/__pycache__/ray_env.cpython-312.pyc create mode 100644 ray/lazy_utils.py create mode 100644 ray/ray_env.py create mode 100644 reasoning/__init__.py create mode 100644 reasoning/__pycache__/__init__.cpython-312.pyc create mode 100644 reasoning/__pycache__/abs_reasoning_parsers.cpython-312.pyc create mode 100644 reasoning/__pycache__/basic_parsers.cpython-312.pyc create mode 100644 reasoning/__pycache__/deepseek_r1_reasoning_parser.cpython-312.pyc create mode 100644 reasoning/__pycache__/deepseek_v3_reasoning_parser.cpython-312.pyc create mode 100644 reasoning/__pycache__/ernie45_reasoning_parser.cpython-312.pyc create mode 100644 reasoning/__pycache__/glm4_moe_reasoning_parser.cpython-312.pyc create mode 100644 reasoning/__pycache__/gptoss_reasoning_parser.cpython-312.pyc create mode 100644 reasoning/__pycache__/granite_reasoning_parser.cpython-312.pyc create mode 100644 reasoning/__pycache__/hunyuan_a13b_reasoning_parser.cpython-312.pyc create mode 100644 reasoning/__pycache__/identity_reasoning_parser.cpython-312.pyc create mode 100644 
reasoning/__pycache__/minimax_m2_reasoning_parser.cpython-312.pyc create mode 100644 reasoning/__pycache__/mistral_reasoning_parser.cpython-312.pyc create mode 100644 reasoning/__pycache__/olmo3_reasoning_parser.cpython-312.pyc create mode 100644 reasoning/__pycache__/qwen3_reasoning_parser.cpython-312.pyc create mode 100644 reasoning/__pycache__/seedoss_reasoning_parser.cpython-312.pyc create mode 100644 reasoning/__pycache__/step3_reasoning_parser.cpython-312.pyc create mode 100644 reasoning/abs_reasoning_parsers.py create mode 100644 reasoning/basic_parsers.py create mode 100644 reasoning/deepseek_r1_reasoning_parser.py create mode 100644 reasoning/deepseek_v3_reasoning_parser.py create mode 100644 reasoning/ernie45_reasoning_parser.py create mode 100644 reasoning/glm4_moe_reasoning_parser.py create mode 100644 reasoning/gptoss_reasoning_parser.py create mode 100644 reasoning/granite_reasoning_parser.py create mode 100644 reasoning/hunyuan_a13b_reasoning_parser.py create mode 100644 reasoning/identity_reasoning_parser.py create mode 100644 reasoning/minimax_m2_reasoning_parser.py create mode 100644 reasoning/mistral_reasoning_parser.py create mode 100644 reasoning/olmo3_reasoning_parser.py create mode 100644 reasoning/qwen3_reasoning_parser.py create mode 100644 reasoning/seedoss_reasoning_parser.py create mode 100644 reasoning/step3_reasoning_parser.py create mode 100644 sampling_params.py create mode 100644 scalar_type.py create mode 100644 scripts.py create mode 100644 sequence.py create mode 100644 tasks.py create mode 100644 third_party/__init__.py create mode 100644 third_party/__pycache__/__init__.cpython-312.pyc create mode 100644 third_party/__pycache__/pynvml.cpython-312.pyc create mode 100644 third_party/pynvml.py create mode 100644 tracing.py create mode 100644 transformers_utils/__init__.py create mode 100644 transformers_utils/__pycache__/__init__.cpython-312.pyc create mode 100644 transformers_utils/__pycache__/config.cpython-312.pyc create mode 
100644 transformers_utils/__pycache__/config_parser_base.cpython-312.pyc create mode 100644 transformers_utils/__pycache__/detokenizer_utils.cpython-312.pyc create mode 100644 transformers_utils/__pycache__/dynamic_module.cpython-312.pyc create mode 100644 transformers_utils/__pycache__/processor.cpython-312.pyc create mode 100644 transformers_utils/__pycache__/runai_utils.cpython-312.pyc create mode 100644 transformers_utils/__pycache__/s3_utils.cpython-312.pyc create mode 100644 transformers_utils/__pycache__/tokenizer.cpython-312.pyc create mode 100644 transformers_utils/__pycache__/tokenizer_base.cpython-312.pyc create mode 100644 transformers_utils/__pycache__/utils.cpython-312.pyc create mode 100644 transformers_utils/chat_templates/__init__.py create mode 100644 transformers_utils/chat_templates/__pycache__/__init__.cpython-312.pyc create mode 100644 transformers_utils/chat_templates/__pycache__/registry.cpython-312.pyc create mode 100644 transformers_utils/chat_templates/registry.py create mode 100644 transformers_utils/chat_templates/template_basic.jinja create mode 100644 transformers_utils/chat_templates/template_blip2.jinja create mode 100644 transformers_utils/chat_templates/template_chatml.jinja create mode 100644 transformers_utils/chat_templates/template_deepseek_ocr.jinja create mode 100644 transformers_utils/chat_templates/template_deepseek_vl2.jinja create mode 100644 transformers_utils/chat_templates/template_fuyu.jinja create mode 100644 transformers_utils/chat_templates/template_minicpmv45.jinja create mode 100644 transformers_utils/config.py create mode 100644 transformers_utils/config_parser_base.py create mode 100644 transformers_utils/configs/__init__.py create mode 100644 transformers_utils/configs/__pycache__/__init__.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/afmoe.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/arctic.cpython-312.pyc create mode 100644 
transformers_utils/configs/__pycache__/chatglm.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/deepseek_vl2.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/dotsocr.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/eagle.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/falcon.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/flex_olmo.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/jais.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/kimi_linear.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/kimi_vl.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/lfm2_moe.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/medusa.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/midashenglm.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/mistral.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/mlp_speculator.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/moonvit.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/nemotron.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/nemotron_h.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/olmo3.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/ovis.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/qwen3_next.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/radio.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/step3_vl.cpython-312.pyc create mode 100644 transformers_utils/configs/__pycache__/ultravox.cpython-312.pyc create mode 100644 transformers_utils/configs/afmoe.py create mode 100644 transformers_utils/configs/arctic.py 
create mode 100644 transformers_utils/configs/chatglm.py create mode 100644 transformers_utils/configs/deepseek_vl2.py create mode 100644 transformers_utils/configs/dotsocr.py create mode 100644 transformers_utils/configs/eagle.py create mode 100644 transformers_utils/configs/falcon.py create mode 100644 transformers_utils/configs/flex_olmo.py create mode 100644 transformers_utils/configs/jais.py create mode 100644 transformers_utils/configs/kimi_linear.py create mode 100644 transformers_utils/configs/kimi_vl.py create mode 100644 transformers_utils/configs/lfm2_moe.py create mode 100644 transformers_utils/configs/medusa.py create mode 100644 transformers_utils/configs/midashenglm.py create mode 100644 transformers_utils/configs/mistral.py create mode 100644 transformers_utils/configs/mlp_speculator.py create mode 100644 transformers_utils/configs/moonvit.py create mode 100644 transformers_utils/configs/nemotron.py create mode 100644 transformers_utils/configs/nemotron_h.py create mode 100644 transformers_utils/configs/olmo3.py create mode 100644 transformers_utils/configs/ovis.py create mode 100644 transformers_utils/configs/qwen3_next.py create mode 100644 transformers_utils/configs/radio.py create mode 100644 transformers_utils/configs/speculators/__init__.py create mode 100644 transformers_utils/configs/speculators/__pycache__/__init__.cpython-312.pyc create mode 100644 transformers_utils/configs/speculators/__pycache__/algos.cpython-312.pyc create mode 100644 transformers_utils/configs/speculators/__pycache__/base.cpython-312.pyc create mode 100644 transformers_utils/configs/speculators/algos.py create mode 100644 transformers_utils/configs/speculators/base.py create mode 100644 transformers_utils/configs/step3_vl.py create mode 100644 transformers_utils/configs/ultravox.py create mode 100644 transformers_utils/detokenizer_utils.py create mode 100644 transformers_utils/dynamic_module.py create mode 100644 transformers_utils/processor.py create mode 100644 
transformers_utils/processors/__init__.py create mode 100644 transformers_utils/processors/__pycache__/__init__.cpython-312.pyc create mode 100644 transformers_utils/processors/__pycache__/deepseek_ocr.cpython-312.pyc create mode 100644 transformers_utils/processors/__pycache__/deepseek_vl2.cpython-312.pyc create mode 100644 transformers_utils/processors/__pycache__/ovis.cpython-312.pyc create mode 100644 transformers_utils/processors/__pycache__/ovis2_5.cpython-312.pyc create mode 100644 transformers_utils/processors/deepseek_ocr.py create mode 100644 transformers_utils/processors/deepseek_vl2.py create mode 100644 transformers_utils/processors/ovis.py create mode 100644 transformers_utils/processors/ovis2_5.py create mode 100644 transformers_utils/runai_utils.py create mode 100644 transformers_utils/s3_utils.py create mode 100644 transformers_utils/tokenizer.py create mode 100644 transformers_utils/tokenizer_base.py create mode 100644 transformers_utils/tokenizers/__init__.py create mode 100644 transformers_utils/tokenizers/__pycache__/__init__.cpython-312.pyc create mode 100644 transformers_utils/tokenizers/__pycache__/mistral.cpython-312.pyc create mode 100644 transformers_utils/tokenizers/mistral.py create mode 100644 transformers_utils/utils.py create mode 100644 triton_utils/__init__.py create mode 100644 triton_utils/__pycache__/__init__.cpython-312.pyc create mode 100644 triton_utils/__pycache__/importing.cpython-312.pyc create mode 100644 triton_utils/importing.py create mode 100644 usage/__init__.py create mode 100644 usage/__pycache__/__init__.cpython-312.pyc create mode 100644 usage/__pycache__/usage_lib.cpython-312.pyc create mode 100644 usage/usage_lib.py create mode 100644 utils/__init__.py create mode 100644 utils/__pycache__/__init__.cpython-312.pyc create mode 100644 utils/__pycache__/argparse_utils.cpython-312.pyc create mode 100644 utils/__pycache__/async_utils.cpython-312.pyc create mode 100644 utils/__pycache__/cache.cpython-312.pyc create 
mode 100644 utils/__pycache__/collection_utils.cpython-312.pyc create mode 100644 utils/__pycache__/counter.cpython-312.pyc create mode 100644 utils/__pycache__/deep_gemm.cpython-312.pyc create mode 100644 utils/__pycache__/flashinfer.cpython-312.pyc create mode 100644 utils/__pycache__/func_utils.cpython-312.pyc create mode 100644 utils/__pycache__/gc_utils.cpython-312.pyc create mode 100644 utils/__pycache__/hashing.cpython-312.pyc create mode 100644 utils/__pycache__/import_utils.cpython-312.pyc create mode 100644 utils/__pycache__/jsontree.cpython-312.pyc create mode 100644 utils/__pycache__/math_utils.cpython-312.pyc create mode 100644 utils/__pycache__/mem_constants.cpython-312.pyc create mode 100644 utils/__pycache__/mem_utils.cpython-312.pyc create mode 100644 utils/__pycache__/nccl.cpython-312.pyc create mode 100644 utils/__pycache__/network_utils.cpython-312.pyc create mode 100644 utils/__pycache__/platform_utils.cpython-312.pyc create mode 100644 utils/__pycache__/profiling.cpython-312.pyc create mode 100644 utils/__pycache__/registry.cpython-312.pyc create mode 100644 utils/__pycache__/serial_utils.cpython-312.pyc create mode 100644 utils/__pycache__/system_utils.cpython-312.pyc create mode 100644 utils/__pycache__/tensor_schema.cpython-312.pyc create mode 100644 utils/__pycache__/torch_utils.cpython-312.pyc create mode 100644 utils/argparse_utils.py create mode 100644 utils/async_utils.py create mode 100644 utils/cache.py create mode 100644 utils/collection_utils.py create mode 100644 utils/counter.py create mode 100644 utils/deep_gemm.py create mode 100644 utils/flashinfer.py create mode 100644 utils/func_utils.py create mode 100644 utils/gc_utils.py create mode 100644 utils/hashing.py create mode 100644 utils/import_utils.py create mode 100644 utils/jsontree.py create mode 100644 utils/math_utils.py create mode 100644 utils/mem_constants.py create mode 100644 utils/mem_utils.py create mode 100644 utils/nccl.py create mode 100644 
utils/network_utils.py create mode 100644 utils/platform_utils.py create mode 100644 utils/profiling.py create mode 100644 utils/registry.py create mode 100644 utils/serial_utils.py create mode 100644 utils/system_utils.py create mode 100644 utils/tensor_schema.py create mode 100644 utils/torch_utils.py create mode 100644 v1/__init__.py create mode 100644 v1/__pycache__/__init__.cpython-312.pyc create mode 100644 v1/__pycache__/cudagraph_dispatcher.cpython-312.pyc create mode 100644 v1/__pycache__/kv_cache_interface.cpython-312.pyc create mode 100644 v1/__pycache__/outputs.cpython-312.pyc create mode 100644 v1/__pycache__/request.cpython-312.pyc create mode 100644 v1/__pycache__/serial_utils.cpython-312.pyc create mode 100644 v1/__pycache__/utils.cpython-312.pyc create mode 100644 v1/attention/__init__.py create mode 100644 v1/attention/__pycache__/__init__.cpython-312.pyc create mode 100644 v1/attention/backends/__init__.py create mode 100644 v1/attention/backends/__pycache__/__init__.cpython-312.pyc create mode 100644 v1/attention/backends/__pycache__/cpu_attn.cpython-312.pyc create mode 100644 v1/attention/backends/__pycache__/flash_attn.cpython-312.pyc create mode 100644 v1/attention/backends/__pycache__/flashinfer.cpython-312.pyc create mode 100644 v1/attention/backends/__pycache__/flex_attention.cpython-312.pyc create mode 100644 v1/attention/backends/__pycache__/gdn_attn.cpython-312.pyc create mode 100644 v1/attention/backends/__pycache__/linear_attn.cpython-312.pyc create mode 100644 v1/attention/backends/__pycache__/mamba1_attn.cpython-312.pyc create mode 100644 v1/attention/backends/__pycache__/mamba2_attn.cpython-312.pyc create mode 100644 v1/attention/backends/__pycache__/mamba_attn.cpython-312.pyc create mode 100644 v1/attention/backends/__pycache__/pallas.cpython-312.pyc create mode 100644 v1/attention/backends/__pycache__/rocm_aiter_fa.cpython-312.pyc create mode 100644 v1/attention/backends/__pycache__/rocm_aiter_unified_attn.cpython-312.pyc create 
mode 100644 v1/attention/backends/__pycache__/rocm_attn.cpython-312.pyc create mode 100644 v1/attention/backends/__pycache__/short_conv_attn.cpython-312.pyc create mode 100644 v1/attention/backends/__pycache__/tree_attn.cpython-312.pyc create mode 100644 v1/attention/backends/__pycache__/triton_attn.cpython-312.pyc create mode 100644 v1/attention/backends/__pycache__/utils.cpython-312.pyc create mode 100644 v1/attention/backends/__pycache__/xformers.cpython-312.pyc create mode 100644 v1/attention/backends/cpu_attn.py create mode 100644 v1/attention/backends/flash_attn.py create mode 100644 v1/attention/backends/flashinfer.py create mode 100644 v1/attention/backends/flex_attention.py create mode 100644 v1/attention/backends/gdn_attn.py create mode 100644 v1/attention/backends/linear_attn.py create mode 100644 v1/attention/backends/mamba1_attn.py create mode 100644 v1/attention/backends/mamba2_attn.py create mode 100644 v1/attention/backends/mamba_attn.py create mode 100644 v1/attention/backends/mla/__init__.py create mode 100644 v1/attention/backends/mla/__pycache__/__init__.cpython-312.pyc create mode 100644 v1/attention/backends/mla/__pycache__/common.cpython-312.pyc create mode 100644 v1/attention/backends/mla/__pycache__/cutlass_mla.cpython-312.pyc create mode 100644 v1/attention/backends/mla/__pycache__/flashattn_mla.cpython-312.pyc create mode 100644 v1/attention/backends/mla/__pycache__/flashinfer_mla.cpython-312.pyc create mode 100644 v1/attention/backends/mla/__pycache__/flashmla.cpython-312.pyc create mode 100644 v1/attention/backends/mla/__pycache__/flashmla_sparse.cpython-312.pyc create mode 100644 v1/attention/backends/mla/__pycache__/indexer.cpython-312.pyc create mode 100644 v1/attention/backends/mla/__pycache__/rocm_aiter_mla.cpython-312.pyc create mode 100644 v1/attention/backends/mla/__pycache__/triton_mla.cpython-312.pyc create mode 100644 v1/attention/backends/mla/common.py create mode 100644 v1/attention/backends/mla/cutlass_mla.py create mode 
100644 v1/attention/backends/mla/flashattn_mla.py create mode 100644 v1/attention/backends/mla/flashinfer_mla.py create mode 100644 v1/attention/backends/mla/flashmla.py create mode 100644 v1/attention/backends/mla/flashmla_sparse.py create mode 100644 v1/attention/backends/mla/indexer.py create mode 100644 v1/attention/backends/mla/rocm_aiter_mla.py create mode 100644 v1/attention/backends/mla/triton_mla.py create mode 100644 v1/attention/backends/pallas.py create mode 100644 v1/attention/backends/rocm_aiter_fa.py create mode 100644 v1/attention/backends/rocm_aiter_unified_attn.py create mode 100644 v1/attention/backends/rocm_attn.py create mode 100644 v1/attention/backends/short_conv_attn.py create mode 100644 v1/attention/backends/tree_attn.py create mode 100644 v1/attention/backends/triton_attn.py create mode 100644 v1/attention/backends/utils.py create mode 100644 v1/attention/backends/xformers.py create mode 100644 v1/core/__init__.py create mode 100644 v1/core/__pycache__/__init__.cpython-312.pyc create mode 100644 v1/core/__pycache__/block_pool.cpython-312.pyc create mode 100644 v1/core/__pycache__/encoder_cache_manager.cpython-312.pyc create mode 100644 v1/core/__pycache__/kv_cache_coordinator.cpython-312.pyc create mode 100644 v1/core/__pycache__/kv_cache_manager.cpython-312.pyc create mode 100644 v1/core/__pycache__/kv_cache_utils.cpython-312.pyc create mode 100644 v1/core/__pycache__/single_type_kv_cache_manager.cpython-312.pyc create mode 100644 v1/core/block_pool.py create mode 100644 v1/core/encoder_cache_manager.py create mode 100644 v1/core/kv_cache_coordinator.py create mode 100644 v1/core/kv_cache_manager.py create mode 100644 v1/core/kv_cache_utils.py create mode 100644 v1/core/sched/__init__.py create mode 100644 v1/core/sched/__pycache__/__init__.cpython-312.pyc create mode 100644 v1/core/sched/__pycache__/async_scheduler.cpython-312.pyc create mode 100644 v1/core/sched/__pycache__/interface.cpython-312.pyc create mode 100644 
v1/core/sched/__pycache__/output.cpython-312.pyc create mode 100644 v1/core/sched/__pycache__/request_queue.cpython-312.pyc create mode 100644 v1/core/sched/__pycache__/scheduler.cpython-312.pyc create mode 100644 v1/core/sched/__pycache__/utils.cpython-312.pyc create mode 100644 v1/core/sched/async_scheduler.py create mode 100644 v1/core/sched/interface.py create mode 100644 v1/core/sched/output.py create mode 100644 v1/core/sched/request_queue.py create mode 100644 v1/core/sched/scheduler.py create mode 100644 v1/core/sched/utils.py create mode 100644 v1/core/single_type_kv_cache_manager.py create mode 100644 v1/cudagraph_dispatcher.py create mode 100644 v1/engine/__init__.py create mode 100644 v1/engine/__pycache__/__init__.cpython-312.pyc create mode 100644 v1/engine/__pycache__/async_llm.cpython-312.pyc create mode 100644 v1/engine/__pycache__/coordinator.cpython-312.pyc create mode 100644 v1/engine/__pycache__/core.cpython-312.pyc create mode 100644 v1/engine/__pycache__/core_client.cpython-312.pyc create mode 100644 v1/engine/__pycache__/detokenizer.cpython-312.pyc create mode 100644 v1/engine/__pycache__/exceptions.cpython-312.pyc create mode 100644 v1/engine/__pycache__/llm_engine.cpython-312.pyc create mode 100644 v1/engine/__pycache__/logprobs.cpython-312.pyc create mode 100644 v1/engine/__pycache__/output_processor.cpython-312.pyc create mode 100644 v1/engine/__pycache__/parallel_sampling.cpython-312.pyc create mode 100644 v1/engine/__pycache__/processor.cpython-312.pyc create mode 100644 v1/engine/__pycache__/utils.cpython-312.pyc create mode 100644 v1/engine/async_llm.py create mode 100644 v1/engine/coordinator.py create mode 100644 v1/engine/core.py create mode 100644 v1/engine/core_client.py create mode 100644 v1/engine/detokenizer.py create mode 100644 v1/engine/exceptions.py create mode 100644 v1/engine/llm_engine.py create mode 100644 v1/engine/logprobs.py create mode 100644 v1/engine/output_processor.py create mode 100644 
v1/engine/parallel_sampling.py create mode 100644 v1/engine/processor.py create mode 100644 v1/engine/utils.py create mode 100644 v1/executor/__init__.py create mode 100644 v1/executor/__pycache__/__init__.cpython-312.pyc create mode 100644 v1/executor/__pycache__/abstract.cpython-312.pyc create mode 100644 v1/executor/__pycache__/multiproc_executor.cpython-312.pyc create mode 100644 v1/executor/__pycache__/ray_distributed_executor.cpython-312.pyc create mode 100644 v1/executor/__pycache__/ray_executor.cpython-312.pyc create mode 100644 v1/executor/__pycache__/ray_utils.cpython-312.pyc create mode 100644 v1/executor/__pycache__/uniproc_executor.cpython-312.pyc create mode 100644 v1/executor/abstract.py create mode 100644 v1/executor/multiproc_executor.py create mode 100644 v1/executor/ray_distributed_executor.py create mode 100644 v1/executor/ray_executor.py create mode 100644 v1/executor/ray_utils.py create mode 100644 v1/executor/uniproc_executor.py create mode 100644 v1/kv_cache_interface.py create mode 100644 v1/kv_offload/__init__.py create mode 100644 v1/kv_offload/__pycache__/__init__.cpython-312.pyc create mode 100644 v1/kv_offload/__pycache__/abstract.cpython-312.pyc create mode 100644 v1/kv_offload/__pycache__/arc_manager.cpython-312.pyc create mode 100644 v1/kv_offload/__pycache__/backend.cpython-312.pyc create mode 100644 v1/kv_offload/__pycache__/cpu.cpython-312.pyc create mode 100644 v1/kv_offload/__pycache__/factory.cpython-312.pyc create mode 100644 v1/kv_offload/__pycache__/lru_manager.cpython-312.pyc create mode 100644 v1/kv_offload/__pycache__/mediums.cpython-312.pyc create mode 100644 v1/kv_offload/__pycache__/spec.cpython-312.pyc create mode 100644 v1/kv_offload/abstract.py create mode 100644 v1/kv_offload/arc_manager.py create mode 100644 v1/kv_offload/backend.py create mode 100644 v1/kv_offload/backends/__init__.py create mode 100644 v1/kv_offload/backends/__pycache__/__init__.cpython-312.pyc create mode 100644 
v1/kv_offload/backends/__pycache__/cpu.cpython-312.pyc create mode 100644 v1/kv_offload/backends/cpu.py create mode 100644 v1/kv_offload/cpu.py create mode 100644 v1/kv_offload/factory.py create mode 100644 v1/kv_offload/lru_manager.py create mode 100644 v1/kv_offload/mediums.py create mode 100644 v1/kv_offload/spec.py create mode 100644 v1/kv_offload/worker/__init__.py create mode 100644 v1/kv_offload/worker/__pycache__/__init__.cpython-312.pyc create mode 100644 v1/kv_offload/worker/__pycache__/cpu_gpu.cpython-312.pyc create mode 100644 v1/kv_offload/worker/__pycache__/worker.cpython-312.pyc create mode 100644 v1/kv_offload/worker/cpu_gpu.py create mode 100644 v1/kv_offload/worker/worker.py create mode 100644 v1/metrics/__init__.py create mode 100644 v1/metrics/__pycache__/__init__.cpython-312.pyc create mode 100644 v1/metrics/__pycache__/loggers.cpython-312.pyc create mode 100644 v1/metrics/__pycache__/prometheus.cpython-312.pyc create mode 100644 v1/metrics/__pycache__/ray_wrappers.cpython-312.pyc create mode 100644 v1/metrics/__pycache__/reader.cpython-312.pyc create mode 100644 v1/metrics/__pycache__/stats.cpython-312.pyc create mode 100644 v1/metrics/loggers.py create mode 100644 v1/metrics/prometheus.py create mode 100644 v1/metrics/ray_wrappers.py create mode 100644 v1/metrics/reader.py create mode 100644 v1/metrics/stats.py create mode 100644 v1/outputs.py create mode 100644 v1/pool/__init__.py create mode 100644 v1/pool/__pycache__/__init__.cpython-312.pyc create mode 100644 v1/pool/__pycache__/metadata.cpython-312.pyc create mode 100644 v1/pool/metadata.py create mode 100644 v1/request.py create mode 100644 v1/sample/__init__.py create mode 100644 v1/sample/__pycache__/__init__.cpython-312.pyc create mode 100644 v1/sample/__pycache__/metadata.cpython-312.pyc create mode 100644 v1/sample/__pycache__/rejection_sampler.cpython-312.pyc create mode 100644 v1/sample/__pycache__/sampler.cpython-312.pyc create mode 100644 v1/sample/logits_processor/__init__.py 
create mode 100644 v1/sample/logits_processor/__pycache__/__init__.cpython-312.pyc create mode 100644 v1/sample/logits_processor/__pycache__/builtin.cpython-312.pyc create mode 100644 v1/sample/logits_processor/__pycache__/interface.cpython-312.pyc create mode 100644 v1/sample/logits_processor/__pycache__/state.cpython-312.pyc create mode 100644 v1/sample/logits_processor/builtin.py create mode 100644 v1/sample/logits_processor/interface.py create mode 100644 v1/sample/logits_processor/state.py create mode 100644 v1/sample/metadata.py create mode 100644 v1/sample/ops/__init__.py create mode 100644 v1/sample/ops/__pycache__/__init__.cpython-312.pyc create mode 100644 v1/sample/ops/__pycache__/bad_words.cpython-312.pyc create mode 100644 v1/sample/ops/__pycache__/logprobs.cpython-312.pyc create mode 100644 v1/sample/ops/__pycache__/penalties.cpython-312.pyc create mode 100644 v1/sample/ops/__pycache__/topk_topp_sampler.cpython-312.pyc create mode 100644 v1/sample/ops/bad_words.py create mode 100644 v1/sample/ops/logprobs.py create mode 100644 v1/sample/ops/penalties.py create mode 100644 v1/sample/ops/topk_topp_sampler.py create mode 100644 v1/sample/rejection_sampler.py create mode 100644 v1/sample/sampler.py create mode 100644 v1/sample/tpu/__init__.py create mode 100644 v1/sample/tpu/__pycache__/__init__.cpython-312.pyc create mode 100644 v1/sample/tpu/__pycache__/metadata.cpython-312.pyc create mode 100644 v1/sample/tpu/__pycache__/sampler.cpython-312.pyc create mode 100644 v1/sample/tpu/metadata.py create mode 100644 v1/sample/tpu/sampler.py create mode 100644 v1/serial_utils.py create mode 100644 v1/spec_decode/__init__.py create mode 100644 v1/spec_decode/__pycache__/__init__.cpython-312.pyc create mode 100644 v1/spec_decode/__pycache__/eagle.cpython-312.pyc create mode 100644 v1/spec_decode/__pycache__/medusa.cpython-312.pyc create mode 100644 v1/spec_decode/__pycache__/metadata.cpython-312.pyc create mode 100644 
v1/spec_decode/__pycache__/metrics.cpython-312.pyc create mode 100644 v1/spec_decode/__pycache__/ngram_proposer.cpython-312.pyc create mode 100644 v1/spec_decode/__pycache__/suffix_decoding.cpython-312.pyc create mode 100644 v1/spec_decode/__pycache__/utils.cpython-312.pyc create mode 100644 v1/spec_decode/eagle.py create mode 100644 v1/spec_decode/medusa.py create mode 100644 v1/spec_decode/metadata.py create mode 100644 v1/spec_decode/metrics.py create mode 100644 v1/spec_decode/ngram_proposer.py create mode 100644 v1/spec_decode/suffix_decoding.py create mode 100644 v1/spec_decode/utils.py create mode 100644 v1/structured_output/__init__.py create mode 100644 v1/structured_output/__pycache__/__init__.cpython-312.pyc create mode 100644 v1/structured_output/__pycache__/backend_guidance.cpython-312.pyc create mode 100644 v1/structured_output/__pycache__/backend_lm_format_enforcer.cpython-312.pyc create mode 100644 v1/structured_output/__pycache__/backend_outlines.cpython-312.pyc create mode 100644 v1/structured_output/__pycache__/backend_types.cpython-312.pyc create mode 100644 v1/structured_output/__pycache__/backend_xgrammar.cpython-312.pyc create mode 100644 v1/structured_output/__pycache__/request.cpython-312.pyc create mode 100644 v1/structured_output/__pycache__/utils.cpython-312.pyc create mode 100644 v1/structured_output/backend_guidance.py create mode 100644 v1/structured_output/backend_lm_format_enforcer.py create mode 100644 v1/structured_output/backend_outlines.py create mode 100644 v1/structured_output/backend_types.py create mode 100644 v1/structured_output/backend_xgrammar.py create mode 100644 v1/structured_output/request.py create mode 100644 v1/structured_output/utils.py create mode 100644 v1/utils.py create mode 100644 v1/worker/__init__.py create mode 100644 v1/worker/__pycache__/__init__.cpython-312.pyc create mode 100644 v1/worker/__pycache__/block_table.cpython-312.pyc create mode 100644 v1/worker/__pycache__/cpu_model_runner.cpython-312.pyc 
create mode 100644 v1/worker/__pycache__/cpu_worker.cpython-312.pyc create mode 100644 v1/worker/__pycache__/dp_utils.cpython-312.pyc create mode 100644 v1/worker/__pycache__/ec_connector_model_runner_mixin.cpython-312.pyc create mode 100644 v1/worker/__pycache__/gpu_input_batch.cpython-312.pyc create mode 100644 v1/worker/__pycache__/gpu_model_runner.cpython-312.pyc create mode 100644 v1/worker/__pycache__/gpu_ubatch_wrapper.cpython-312.pyc create mode 100644 v1/worker/__pycache__/gpu_worker.cpython-312.pyc create mode 100644 v1/worker/__pycache__/kv_connector_model_runner_mixin.cpython-312.pyc create mode 100644 v1/worker/__pycache__/lora_model_runner_mixin.cpython-312.pyc create mode 100644 v1/worker/__pycache__/tpu_input_batch.cpython-312.pyc create mode 100644 v1/worker/__pycache__/tpu_model_runner.cpython-312.pyc create mode 100644 v1/worker/__pycache__/tpu_worker.cpython-312.pyc create mode 100644 v1/worker/__pycache__/ubatch_utils.cpython-312.pyc create mode 100644 v1/worker/__pycache__/ubatching.cpython-312.pyc create mode 100644 v1/worker/__pycache__/utils.cpython-312.pyc create mode 100644 v1/worker/__pycache__/worker_base.cpython-312.pyc create mode 100644 v1/worker/__pycache__/xpu_model_runner.cpython-312.pyc create mode 100644 v1/worker/__pycache__/xpu_worker.cpython-312.pyc create mode 100644 v1/worker/block_table.py create mode 100644 v1/worker/cpu_model_runner.py create mode 100644 v1/worker/cpu_worker.py create mode 100644 v1/worker/dp_utils.py create mode 100644 v1/worker/ec_connector_model_runner_mixin.py create mode 100644 v1/worker/gpu_input_batch.py create mode 100644 v1/worker/gpu_model_runner.py create mode 100644 v1/worker/gpu_ubatch_wrapper.py create mode 100644 v1/worker/gpu_worker.py create mode 100644 v1/worker/kv_connector_model_runner_mixin.py create mode 100644 v1/worker/lora_model_runner_mixin.py create mode 100644 v1/worker/tpu_input_batch.py create mode 100644 v1/worker/tpu_model_runner.py create mode 100644 
v1/worker/tpu_worker.py create mode 100644 v1/worker/ubatch_utils.py create mode 100644 v1/worker/ubatching.py create mode 100644 v1/worker/utils.py create mode 100644 v1/worker/worker_base.py create mode 100644 v1/worker/xpu_model_runner.py create mode 100644 v1/worker/xpu_worker.py create mode 100644 version.py create mode 100644 vllm_flash_attn/.gitkeep diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..19b2cdc --- /dev/null +++ b/__init__.py @@ -0,0 +1,107 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""vLLM: a high-throughput and memory-efficient inference engine for LLMs""" + +# The version.py should be independent library, and we always import the +# version library first. Such assumption is critical for some customization. +from .version import __version__, __version_tuple__ # isort:skip + +import typing + +# The environment variables override should be imported before any other +# modules to ensure that the environment variables are set before any +# other modules are imported. 
+import vllm.env_override # noqa: F401 + +MODULE_ATTRS = { + "bc_linter_skip": "._bc_linter:bc_linter_skip", + "bc_linter_include": "._bc_linter:bc_linter_include", + "AsyncEngineArgs": ".engine.arg_utils:AsyncEngineArgs", + "EngineArgs": ".engine.arg_utils:EngineArgs", + "AsyncLLMEngine": ".engine.async_llm_engine:AsyncLLMEngine", + "LLMEngine": ".engine.llm_engine:LLMEngine", + "LLM": ".entrypoints.llm:LLM", + "initialize_ray_cluster": ".v1.executor.ray_utils:initialize_ray_cluster", + "PromptType": ".inputs:PromptType", + "TextPrompt": ".inputs:TextPrompt", + "TokensPrompt": ".inputs:TokensPrompt", + "ModelRegistry": ".model_executor.models:ModelRegistry", + "SamplingParams": ".sampling_params:SamplingParams", + "PoolingParams": ".pooling_params:PoolingParams", + "ClassificationOutput": ".outputs:ClassificationOutput", + "ClassificationRequestOutput": ".outputs:ClassificationRequestOutput", + "CompletionOutput": ".outputs:CompletionOutput", + "EmbeddingOutput": ".outputs:EmbeddingOutput", + "EmbeddingRequestOutput": ".outputs:EmbeddingRequestOutput", + "PoolingOutput": ".outputs:PoolingOutput", + "PoolingRequestOutput": ".outputs:PoolingRequestOutput", + "RequestOutput": ".outputs:RequestOutput", + "ScoringOutput": ".outputs:ScoringOutput", + "ScoringRequestOutput": ".outputs:ScoringRequestOutput", +} + +if typing.TYPE_CHECKING: + from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs + from vllm.engine.async_llm_engine import AsyncLLMEngine + from vllm.engine.llm_engine import LLMEngine + from vllm.entrypoints.llm import LLM + from vllm.inputs import PromptType, TextPrompt, TokensPrompt + from vllm.model_executor.models import ModelRegistry + from vllm.outputs import ( + ClassificationOutput, + ClassificationRequestOutput, + CompletionOutput, + EmbeddingOutput, + EmbeddingRequestOutput, + PoolingOutput, + PoolingRequestOutput, + RequestOutput, + ScoringOutput, + ScoringRequestOutput, + ) + from vllm.pooling_params import PoolingParams + from 
vllm.sampling_params import SamplingParams + from vllm.v1.executor.ray_utils import initialize_ray_cluster + + from ._bc_linter import bc_linter_include, bc_linter_skip +else: + + def __getattr__(name: str) -> typing.Any: + from importlib import import_module + + if name in MODULE_ATTRS: + module_name, attr_name = MODULE_ATTRS[name].split(":") + module = import_module(module_name, __package__) + return getattr(module, attr_name) + else: + raise AttributeError(f"module {__package__} has no attribute {name}") + + +__all__ = [ + "__version__", + "bc_linter_skip", + "bc_linter_include", + "__version_tuple__", + "LLM", + "ModelRegistry", + "PromptType", + "TextPrompt", + "TokensPrompt", + "SamplingParams", + "RequestOutput", + "CompletionOutput", + "PoolingOutput", + "PoolingRequestOutput", + "EmbeddingOutput", + "EmbeddingRequestOutput", + "ClassificationOutput", + "ClassificationRequestOutput", + "ScoringOutput", + "ScoringRequestOutput", + "LLMEngine", + "EngineArgs", + "AsyncLLMEngine", + "AsyncEngineArgs", + "initialize_ray_cluster", + "PoolingParams", +] diff --git a/__pycache__/__init__.cpython-312.pyc b/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ae34cc3151acb0b33ccb04faa6ac7f25a4f0110 GIT binary patch literal 3434 zcmb7G%WoUU8J{J0Nr?|hr1%i^q$El%`6|HEFU@m zfAhWOn{U4Pc0T?s8Vw;By7+S~Ka9}VjOmQQHgWtr0ij=zLB zV^J4nkc8;sL-x90CmE2w)Ts0S@C4z!;7Lj^YGh5~l#ucnmOu#{nns zB;XXD2AsiJz*#&8IFECH3wROm96pcpcnM#?7jXd>KMww0kb|oD{zxbI-b{W3(JkwP zMo11BVL8mBzNANts2t_=WqrsPmWMfgMISL@a*Weg^|&!Aj~WR%VI<`wm#ye2BQ2*n z{T+SG$jBLETpl+jI=rA zyvXV6`Z?pgd>&{4-_Y~MlDuSGkS_?xjN_7e5wF3y68f# zoiLw#*}SKvLwqTm*^@QTb*VfLn(&Y8Ir) zE`nVNq%@_OnxmC8mPNSwa z<3-Jc`?gCx84c~Ij~tKh>3cF7*|8p~rrowkbUd7t(&(0jRsEh?)oeINGgmYiuXKat ztX=A}qC@w~Mhz;Xx?Lt^!*0$L?Y2OvaY?D)fJV11>xBJOvF7m|^FB3A+|tXotwA-H z9VpRv>JFS7%E{t{T{=rmC;FxL)F0PX+i9=Oy<%;j6&<++>DJW~duDp;POHYcs!ij8 ziZNuPz1hsmn!bISsjfl$+%KK5Eq~RrSIO^JEYgdkm$0_HZ&^M_hfhULQ(ZxyY4`fP 
zIBDXn@M!eZn!aFY;>;L)WB^_o!a7o3AgpD<6!xYFoBU)5CIT%C5!M4h9b;37B$z14 zAjKffV2nYA!8n5n29pe?0AT5;H^sdv$|fHP!bAc9R5Z(mN&`f7>cp(Dr?NAc+7G|u zPXVB1vUZ6K;jti~1LTMuvwcKXdfWkGp}f!hpBmWzN!$AmeF#1FJ@)(1fpFk+ST^|h z1mtWy79ExjJ`ThKe@A{FIV^lWu-IGo5j5%p^bz{C*xBKyLOyVpNMM;4XjC(57I75# zs_VK+g;F! z0M0hyXTJyV0XiDE@Kt#Dmp2c?GhZxR{xo;Eu=@GJ`r*R*Un|cRzIzy%`|#~2nSUhb zp052R{49C%!?(T|9e;A|>5XTjg;u2S&v^Ff>YvyCSZJ-@{w()dsx`a)EdFjQ^6oJ> zNBMEW%7mAbS6Hkz2rFe?x2LL^uuAmGloxTYbbIC8tMhIp^Dl&dH2jnCz9Ci)UV&et zK}EqGRD+H>#YrIURIr&=tJlBQ0Yz((!<%MU{nZDn6C<$0rFuJXFysr=D#u7jd^ zxn=Qu(+D@PyvFhWzZrCtudnrWE1u^h+97X;zo0VHe*F9wjX@->PchmeqQ z#mtEBxtM6BuDE_6zY^1}%ym}+(hVRf)k>G#AY%rQ6l=vVxr2-eAt~8Pz3zq?6G75Q zE4J)L88d{WP%Av^4l`y1NmH%qEjPxPICrqh8ADQ{m0WW( zj2Y+qthy77ndF)kcZxC7NSbI(uDLUe$s%d4HGkWkWy~Cs(yg(p?mT01Xza@K>DQkp z3(qIl7;@s<%b@!p;$A~y=yQMi@BZ|c;>MA4rzHunNaU@=U;R{QF7pMdO wfzYlTiC2%rjem*jEpgoq3`qBcC-W|1^V7vYEd6HbnA6{;Cc?g-1o^f64}U}bYybcN literal 0 HcmV?d00001 diff --git a/__pycache__/_aiter_ops.cpython-312.pyc b/__pycache__/_aiter_ops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1418a0e1a573f5902bed232e0f7113902c37b8c2 GIT binary patch literal 36892 zcmeHw33MD+c3o9(=)D_g>}vxgK>#E{f;&Nq6etoD7l0xOYJsHbX`mWllh|0@;3mL^ zW;hmLPfT#)F`+n#pvl+*E!p9mI2qZA&x~_?9Q!zNx&`{MRiSJ!mQSoWCvJ+;%-Bxi z-1n=hySh;YQrmbrE`qP#e}8@NzyJPH|Ns7dkH^X38M6FLv@gbSe?kuRsZo#zKk#wf z4Nl+$eu(R(w`qtUHuai#nP%=aGe66)wb#mM<{{f~L2m)WEkpKUN3Ubp+3SSc8gUJn z1lv_huiL~$+;1vb^m>GXhzY)?0gGUVuUBwH%x7!WqGaEKGvY_MLVaI->VT;N2@Kc- z7g8?zLQ-}k<$#0B{U@9jy!s^)dQ?j(uIO98qHhBaG61l=mpgyVh{lbWC5*gf6lU`hEj;p&NZ&)90M#airO7nC4UZWf4{`zrnmqKP_dR z=@d?&BrSg@N^%krLDtPB?Xy6MEh>$yTCwLuA9*7qb>a#nd@D&t2x5dp{s@ zcQY^9p9l{Ph5Lpg!2SK>q8J&8hsK7&@qtlsxY;CC2vIT89}kIq(TYpMN@8wm#uszvK&uSZhNiLYb+KMn{KYl9f`7HJezGL|V#< zkz#x~D3_{ed11tA_*YWEwE$EACuj4$5_&l_+p}n`d|+}|iyxRdTUDB{bSdW!Om(xI zCH|7+927^#$3n5_l}Nu<14$UvIv@NyfE#FwXYtN|1fB{w!C$eVg__Pap$r8_sMRU~ zzk>EGSZMul(ll(Ir24k7`nF7%&a$=0+!R%9pR__JtjLpUBJWYto-m)Kc9{3PWmPR6 
zT8FLsgTKMUOvaUUheyW7{{G7pUHY_^HC_A1iyXMVI|*$g2viz{awgRn_Y@yI$6WKBTP zQMo#it{gZ7lQ<-{AO>k?>=eKhmxhLVuI-rFF}rzY_gp0ub^iQ2uBL?LnMBJoi-4}C zKelX7TDIS-YFenc)4V6yyl1iMv4o}kU02;a|AzZ@_rLPK-TA#EzkcMl>rldSh-p)f z`jC-e>SKd}(-?L%cT_7WSKjjilyA~JX^CUWQhv_bRV@3SFi%(pO!~@$blusID)yH7 zO)3YTN$bPYhAJbZV7UV21dcKdnBpXF_UI|xLvyu3k!=GesH0UiV@xq_nj|aJ9Ozf% zLbN~f0d)nZ}1lva-%pvPOo-;+I7kiNu^5S!>C7W(%|Q5EMBITWyYN3Y4lu=iZ0D>sOD;Dl&dUd;I;PIw zbr)VcI&*ZcaIRy{Hb3yj+1Jm$UHHB7UoXGy-amEluBYhQ^E1ycxjiY5ceZloOLO6* zr*8g8(%qW2njKEqNni1`;hEu-&wuUG%%zV!1zy`!XWGxX3uharN!^%UbxNKyQ9+1| zgkte9IyK33u?_z>ORo6n*tyWfNObT_T(UDqRESBg$R#wGcxX5rJ0}%}(f3^lQ~wtl zj-Z1VB)8mZ6_-yKl~bV92P18g{bE}v2IC$9wpqo(8CqDO^=nNNnQ^A=13)y|?t$U+ z;zI~8n1{&t`IzzZOxQAp&JRQ9ohZn-3*q)nK=&q0r0dWf|3u-0eZn!}oN!IJ-&AtV z4uxpg(~;qk>=iR2uukYyVi6%UJQ_i7$&6AbO2>R9A2TeinGoV3cr_P^J1GR!x!6u% zH-S9_9wV@qz&-+x6X+nYpTH9Y4iLy>!BUi*S@BCwS?g`202yjxf~w`Z}iY3c~ZWGQ<|%I;);ft1~! zvI7>ezcS_oh5jB?dEK@&=e62oL#C}-s#aFD>w!FMtTT}rubsjewfm+?+=N(?leOK@ zQ1o2nrsp%uI7Y?ONKI|f26AIf`2?bf`2Rp2v3^vGHg?BVm2_3jbuGGD5|$QuK;YIi zsTeehS1j{EX5(K!K;0Vh*=$mrAMAi$_Lcm;%Uk&)SuK%jrSsv@J(z zH8gBM3qq^#K>Nu;;+gcZR-Cd`sdG5w=2D|=E?U;)7DIEstCRvI10#+Wv*ta^HcgAU=R#x1v=g)$f2yd8 z{wkX3&+SXqZA%r`r2;h{eG%_WlBZh+-&;}eU`sBIVvi8YTeQ|?rZlLW|`OL9xV z91}frOe&6#oIeW>qcHiUX_jgsU0 zITRV=T-Dsasl>u;Ts!NL%Mu$Mh!2M^VQRpP`2fo{yiaBy% zZCy!KR+d4;`S%pB3INLD<*cjjSzscIXNMB2cfP&hcdI@&n^ZHYWf9?J7A-(-p!lp> zZ*n)Sh9!(qe5TVnikwq6R!8R$`uFgkjbt!}McEv3RrxRhrqu}Q0xpY;^}EC593e1DV2l6-k2wIC@Ce8PT&=YP zI@!r}97DuKN?|Pk>bRJ*w%oJGCIFDNe|tIWB+*;ed|c>I>su=n#MP;9F$(fc`=fFa zqZCyp9+3y83l|XY&+-2OwJhiYcu;h~9fLBkiln6%ZHY70NMjsKXl(*d6V^#Prl@AN z=;UA(>d2{3dtQZFx}^f8$*^f9^(T5&`Vul}cFKatTCu#^yo5lkN+l=OG1O!!YK%&& zN=B{UoW`roRIb=bMEMcLZ3AFJDXN{XNEWtC9eK~vz?w0vd|v%xH1rv&_~D%CRIxD| ztZt`J`a*Q{;zb~G1Lx06j&m16ed7ZISV!bXCCB;E5RC9xT$DU8iJ@5J{1D9ZdH5il z;^UN2X1@!84~1h`!^9Xrgs?Hmj!}R-&UXxr4o2gl{_t2oiCukYGRfLKwKq+>$Id2L zZ6~S8*?bd-kb+sT2o_8WZE8Z{PsI@N9TLA^GIe0Ox<4jaSij_!2MY3nQNLC{$WSp| 
zgGJKn8dguBYc{P(dQ*vqDPVS8)IAs(9u9?fUfdaa;;7oK{4=D5)q#JCm-ur8zDeNc z3EUtsM}Pt?UmVDpOJvh#i%GwL7>9{xTCqai`Y}kj@8;a)b3E#_Z@yt(SU9%m+Lf^E zV$C;)Q7SxRlyq(7WyMJw8W=+5Kk}mEWI>$LVUON=5_QEiw9;#aZ<*94IBB1-W2&YI zQlvTFO$%!=Z007CDP_wo=$==Uqr?wO{D{Oqr`ItXNRw&Mw~#|GNpm+4SZzP&a9%5z zDVXJFoKqcW6IsVyyXTr~#x>h9dw$-1>%@1T{m!$u?Ryc(<(_&{7PZzMkxJ_6)Ur`0 zmdKQN5QXxE1^rraw%+@&++aF|<9yqE-$KK!W4B!$2}{Rv#y^RUSy$9C+e37WjJJWQ*XUTmM8PRe z{1@Ot{FeZja_XrPlh`Y5}q7(~_!H-p`^{l1*7-^Ir&y(eOxr zBorBoMTbU5`ZG-s`H>DWefvHTO@C=-OTLPR!7KB;sbGTn+yQXBN58R7l7MNnoA;oP zML)~>QcZ5V2yl>gN?J+Efv^b8Gm{Fs~2Yrzw%P5s`|AZukJ`-=#I7h!!w6x`x2Ec3!9Ubt&8rB z)BL;c;@J~(wnT8#qPuOHr#08hGnW$;&5OR4guUgSD*(H4AyKz+(X}aI*(A3rZS@k) zs+Wx9Cs_#7Oz_Qla*0ub7+uI?e|`3uKK~lRT_g6D^($L;{iP z_9-CSbC;r?Nx3!)G_(s{jAB(SEC}jq(|1u~xhpyjOkL~$w?xfq7X^~sD(kcy!Iu58 zCb7I{Uz?|6I(f?)_Zvv`nRsEvbJqcG`!cr$Im$rroXUauOaF4EBD}O!D z)*i`RqlOk)4RfWae*RUI@loW&RP#4c;$J2?tzmNd2qPhNX_Zv-+txe5P08Sd>n|->E~diPT2wu;nd??RuuG7PXsE-@Wx3f3uE%gT~*Y$x_vrYq*Z6yN{5NR)eLDWFk&URoaiLJhZ&PF=jvKU{6_(e#5HO$@8_-K4Tc9Od z^?miJnN44%*|PO+TcJ!V=dH^JHq4c2Qs0i${U{AB0_Hu??{fWe3Ka(QN&|Y80lnIQ z?q^4fYT!#tl6eoLs?V8MsJ&{<=+Q3Jvp!$^7tU8*q&>2m29MbC|4DfWYIxWa*_ho5 zEn~t39F5TBABR%(>8lOsYYga(2K2QCbh);gBD;lV{BOa&mvt+KT)(34h82BVSM)W~ zjA~({MyikLNcH09Ce=*_B{Y(1n?ZP4s+(8z-Lj&uf$b6Gl((Utwyzlf;q`4S?;RR> z?}d#qlFCkl66`Xdw;RxRgBwF@E9_A$t6O+XWBc}z9tf8Nqcry#`eE>R z#K7+;6@9x`^gSNgGEgi$rQz@i)(cGMbs=P#{pjC}`g)^&^@Kt0CqdWi zsbr2$8N`x(joOe~=+W?T5N+tRK{`(x(4R4&KWjjL4t3Lu|IcG=>lI$mw3JTd{vsPE z=t@%X3-yp^f$$}RltTveFB{OqD8&-WqGYdLNe5uvk(09RzANi@%GPd>p|AwrK11z{vQOWjq&Wd8RO^c&hhRhHPg8y+fQg zAeRj1tCPOfi@wHpe1~tZ+kI!ee4R*>$S>_{lEA$}Iw; z%Bp6jcI%h3nHMAlCLicjZ#z|Qd$O@TVOLhpv=#U{XnYQLo=|7k@q>n}Q5|Ntq^fOK zrw*3|`Xj1C9rkt*dpnYiSh?S^Lgmq(z(>&D6&f~M|HOB@caV})6(TNAJ=t;c;DOM= zXP-KFvgec`)2n5s*%oSSo5ZIJ5gVA~y>M9rwqD*5B^KO6s&{Ky(9X&nWhLu6qWh@*D z$K%+KsdSz?)+H~<$Hc;a{jso69qKuGxaW9x=ydnt&ckS5E3#7ycH&|*j=hWHBQ)p` za#`E3T-R0z{@2u?bgWD#>0W2Du`^-s{9-DRcJacx{1#s5{yU%AxxdS}hoJ5s-CG~_ 
ztf*C})BWd$;Gt?NYqG7NVbCLY|DVB7Xj!WQYgg=sU0%fxkA<&{y5cxUHGMGIcral<_{G6=F z_QZ=+Oz+cUcE=G$+u5`%@lrJw(#m|A{8VXK<221<57Nqf&iqtq+2ZO4T5gaN3ckh1 zgG$oer%KNrZ&2e44#7ESimz7PpDI09yitv>q?h^Z`l-@$$6GS#c?S7-=40&y_m)@P zE0$ApzIdw|1Jd54;-4yae|)nVA8BPi{C=vm3gcw=*@Lt)AB#U#T1D}8OnV&|yRvLPx|U_wYD70~umR z*d4S1f$i3i+(I-q7LMcAg>#Y1Qkf=M?U5jb5FLJ7(11gVWPohoQ{zr%r`V9z1mT6fR6a-$wQlXL85Kh9YcVj%4l$4@%{9 zrUYY}=p}S(F_?1f)E*Uuh$zyoA$I1mX{f_9B5V6F5rX1c5;U=Lp0ITp%z> zV2Z#jfo~A_1p?n9@Jj?12+$5O@z)5@&K;4qrC_t>|HUh3;U#lp2ky2AVFzF2O8&@e zS%?yJLowP4G{yY@3s7|**|_3$3p;KHwtQsf3QK>k;Qi978`100JEc35r8^TP?dbx} zQ}~gW^E9T)x85z^nX0L~TiJfMX2YGD_GC@_$1a=4`;nh{x7{t@#=Ljvdbi#!-^{$X z>UvXbspj1?@Mz9UszKZlHVrAjKa<$p*4 zcik<2jFo?%Zsr^AmbWqQExO*j@0Ral-W|E#*i2QoYUX&l!;IQsLmj;hO*vcONNw55 zU@s8ua6#|d5xr%vvozqyd(>^|>TYUflNI)^Q4b)Qp<+0LfQvXp9@-=R2N3giSq0SQ zi0yCc*0(DSn&wJ3u;_$&iBz%~>EX5uFXu0zy^Pm>V&*6A`1U1z`>+FY`qEr!qQ32c z+2nVoIg`UFj|ej>dx%OclZoV6AfK;D)ZM4K0ugay4ENCx;0XL1cqBY)H z_J4|aZa^G%#fX;YwxqA^R{gD~6OVN*`nnT#`F0Q8UB;P>|2ezz?W!og^tp>dKEuKwY9R;igGk%bJDl@R@<%c+eL4m{C@HG zCl-A@340II6UoVX5BXLUYC7tZ8Fjw^%IB*y5R_FNTN#S!Z;-8lm~cv3KGKmdCh->$ zY1uk+MnIa5@&?I@hJMhnTvilfOY~S4ayk^yzf82RD()dL82I!v&A3`#p zyO8Nl1?_2Z&7BJJ5~*yxG>!E~#LZIx2B(YJRR_y7Ff+XRHc7P;(IM5~G7U_*D^hMR z`zuSii&JjEnv@&6xT}~G6#Byz1!Ty)iXoE+T0K;-(*&L-@C*SmX6)qp-)WKlkV+W> zFcRtW@Vio#*Qm8a+21yl>b9hBn?|aTBm`K({;E@M{P|Mu23gMeQVmnF`UvzB5CB^B zPjZvj=dYS-`=`%y6MSaZveKbyRPryWcyu@_?+5`q^}S4P+A;n6B)2Vy32nq#BkwiG zjDrs3-LbbO?X6I}x%G*rJ#P>H?(-j+(QVSQeD&N5314%<-YgF_s8x!?1ZYcQ_FS_5 zY1=PGP|J^68?@cOq9Tq1EI&)b&QUKLZ0O8t?vu!9P70Bg=7?-t(k`nS+N!K)P_I&^ zTlF|Ntv1$6i2HX^I)6>l`AL20yae(xP14Tf3NU!m7mKV(uxoiTfm17%mnrwJ5cn#A z83I=ce2u_Q5qO2bG=ZN6&_A9_v!BniDe4^;qU-7e6OIx&ky+j&pGQ>3z&P-c?P1I~ z)N9T;)QjVFLBwaLcg24K481uGmNjkr|B&+j6>@4fU<)n9(m7V6jlK(KjXSo6!Hmh+gZS>7Ls>cYeNp{=^&4y#CC>$=9F9i6YsPnO&G#G=Z8_ zNlmJ>E?vSo+CJu7N>h;B>Sasph9w7%s>r8dB|A=W6D^iwZ!-ha|A|s*PVUezPtc`G zbUh5t$WB$u6n6!`J^w?yh{^?th*{0ek0 z1vBFc8jGCUqyr_h;xH0~#3lIn)6%cyZ*MKEtJRbg~zp5H8|Bwv`0bj|jt; 
z=(~cvR%Bv_ROO?Y4yjf?2Tqd*c~!YJClRp{Lq(}{Ss<2_SViMVrjl)7fa#!o;TYco zU92J6O#-D9u8crA0ZNm_mCTny=Ly6l{<5AXN;S$-_wvso;qj69mGybO{)(Y@kjkD% zdDuZ@M+g6&eeaShkZ@JKTh_ABwpg}d=E>48s6+N-jKe zY_9RO^{=jbaZdC1?VO`@wtsH>{FVjxzxLkrzU}ya+im+n z9G;uD%M%61*k~+DH;>@v#`EJ5@v>y51vpRtC@yq}j^I*({xcEDaV8SRx)AP#k(|mW z4ICPjJoq{=5*dZ4;+Kaa{n{Be*%`3LJ@_`{s!@FW%nGa!O&5bbHYvsGt~(&t|>J=LMYP& zFBAx9CSNnjJIf}$6Q(>0-T9;Y5ZcYyX+?k#llWrj7;oQ>~)rx|+Zm0*wUL5@;gOOn{}}q^oq;fPr1nPWdT6nvaoU**1n&eIlYz$h^9y~6K;vSdZN@%ro<5brV&%E( z=cZk$;<6iuuOFUvyj$IT{rtlDY4?)9Y!TJW;$AnNdfIE)tUv{1v6#ei|mNTAQk@VaERRj$1cy3zi*5| zq3Qb6&_(8@M$ujqhf}Wfy-mKPf;bZQl0NPu@Hl}E05p+jmubG5+Gi|)_ylF(5_eN( zdk8#6;NgXmlnYxkUDi%WeOUhv6>n~!Wka)EZ(1jeZG|kL>(!W-p=+0MK=pMx7b<`mG#0j)5Z2XT)}+~5@VMfXRfUN3Ll5B{Cw8cbj{_( zI3i9r%UmfvL>Dx(Yl$=07ULe_Q3UM3Iijj6yEj@rTkn%wAMGh+&F_l#OCAjr z^ZZ{U$;rzEX(#8eoICZ7FNhtOiH7z?Kwt1K?rzO2y6;T-cD`Ny{dVo@^E`?p@#k{n ziHWoZbPUR;Eff zO&?H~DL#!NNH*C%O17c!Wz3m!?%l{JX#;|53^?xHz>vvIw_%_alj!ke4L0_pdt+yC zrPfeHxkL9RFFwnJfm?IwM*UXO3nhs7Jo(`6^jKtQKzs%c7M787zv(d2GHaHb(|1V% z#V8J}8OGS#?)hrw1{R*a?c18LZULdv$WVYF*7p(Xol7et#zaH!Xc!hh#l;!!S zalLeQm&-Iajp3wVzG|P~4`lXutHU~yMEHkD(5VhdI>yj%V$*+aS+n=&V-%)doF5+z zMliC^L}k_^D3_L<6Oe!FfL*{p-0IAPJBc0d=#U4IOOujKteM@?uT5cNEE6F*f~>;q z00P?_7-R-ED62sbzs3+89g9bYF|!bY{iDOMVeGDBO_m+FA9J=hT(3rATT6f!ax5g|F8lZXyn z#{GMYC%LRN?TH2HE@C9kZd?~4Ipv{*>4yrGVg-e9y1`GW24Jnh1CfF7_)wgFr=Y2A z`}$zp_GS%-C)w{xXqfB>_R#lvVn-1&vSu7PHj&=&)?vA?(2RFem~<^%olkW4$|Ci*@{>sQ|l)E)-p9E z4}Kkc1|uWbOD*ob(w1jHa`iBl{n3F;l-tyfU>~_3{TcV8DK1fb;`hqlzHt3Z^PRW0 z-zjOwr}*D1`%Bh5zgk?zV>`!-C>v{_BCG?QU7!XphEyv18u{hm<8sk_c?DS3XXN>R zk4i&ls{azrx)II#eP8iyD$-){iACRug#83-+7J;n-|n0EPDxgYfzEC`px^<@NH_Wa z0be}neK$?w6(AyA2ZaG=cemmL7teZ0yiSu}w`32oEi<8z=1^LZ3YT)IpJ^YErXasLs=;P@KH;2;%LBrOLKk z{%;E9>c{PlG5Jb&_Int(aSuPIA^tWzO5o23 z{5gTY0Fc~t@qDyjNgiWld4Wu_$!1crva9%6lM*p)Qm%WFFX(UE&Q|3_59pF*2)~vi znPD&4z?XeEWH&~$?>6j%1SZ&VQKZFp`3^^WA4?p`G*HUBSTeYjI4*XQ`!IoH1bPTi zFD(iLXcQy&q#nY)r-JXoGd7Qfo2wkQvfL@zkSy6S)txG>yi>X@S-NfNc&e=GPT7uR 
z*^Y#zI8|JJr+8Dcc+*r@Do}POurV3fICYGLXit{mE}LR4dDYrGtGbe_x)PS4Hob<% zI}L}D4fvgix>Qx&ovMAws(rZIKSlJXk_}Jg(U+|Ll(j5vGFcnuwtmRr^#DI!Qcj>S zZGjVq;B19TkoNi26r?>jNV_)35q`dh!W`jq10CVDp?1%+ymse?*{uz-Q%+}RZjhbX zAiTAkzlFT$tqnwOd`HU^ddFOx3baET2F`3ka6CN7Bb4|Mzrb=jlv5m&^^k^j-pXW% zm$o=?W=qUlJNa7z@zt4=5V^IX_EB!Ubzg2m`*MQJ8wM#tm40aIa>|TIyz-!2_H?XXR1H6 z5>-irP~?SF+}xVx)avFpx?bDU; z&tnYD49)ckM_z7Ci(C+=Ms&b4#Q?4; z1-Y_?AeWOXshA5~Kb&@vE1NsHvYErBwdnw1#Y*(~;$+d9bP174b4X=GD$gNR5UDbUM8!{6t8%Y_OJSYp zdu7&JT2sv%Qk%BYUvWihdwXi#MhN4Ru?`UpT_4hCy}07Wp6h$&S7UioWnD{BWKb2P z?-~om$;u6?Ftj8^D&8Vh7|?e7vd=<18QhXqgdxjcma!rX$oSzhFgdL)w9rZ%n1L@T zxD_}mxo-7qL$3~{3&>^X%4?W1ILPJXN-Ig(T2R56mU}AU3nm@)A9_H~C|ZlGXhmGp zy8DFzjZOhys;GUf`qk>R6*#g}L|H&CyHcBJ2f3VFRc+cun42ppPkRXSQW1QF`MKbl zG^GtSa(X`u08_FjS_@USpQ~w1t!-Ie0+nG3X;br5N`O3-+>xh}JMvU=hw46h?oc)S zwQWmP)!r{D)3kEnOK2}ple85$%7Au8rn7)tB!#qtFefK#PTEB-H&I$7144iKpr5kI7&D54_TW)hW?Yi^~F zXfOr$T_x7SjHv}qEAM2zt-v)EMlL%QWcT&mXt#kiX$O&1OCUF8N{eX=vxFYFl$?p) zN;#YAtqmzf7-ry0Fs$ei(pKQe%^SG(!psZl0&-=wkz5c?S=vRItR94Ux$^1{eZZ7l zh=!pUW3Z|=wYrI{fA{O%shWBWao|YRHry|%)>IAfjL`#*5}+rmzFwU!AeWsK(Lq>V z2S6k@?H_w-<|XtAzWTI>NM5e6cviUXO#8@%UDd2hncJFl*QE=IRK(E_&!NusGhd{j3`DF*xzlbThGFfr3UqqytuQ-Ajk~jAk66FgFN*R%U zJQG_7*pGNJTt3A>8)3ys%956E#S#Lv_#ply(Y{K6Ruwd-9gdRk2?8$?_%eY(0_O-! 
zQQ#P1?4ZL{a!xW20<#3ZLEvTbVBdj$ft=qW@Jj?12z-aYuMzkS0>4QB%Ye$1hfUUU z=R%TwkGw9hSNs`-!ygy>5x%#l%{%kDo=xM&3IYpF59pO{vhz>!sRH*a6E9CtbvuJo5#v;GX9L7~OwW;!oRCPTwJg`0VcfV>C zZ_5}w;7hnU+&u&H3LHsd086d{a@kem;~8OR{iv+CwB*zpAv4Fh3V4 zO&1bYL>*>;u;QE{YYCA`b4X=GD$gNR5UDbUM8(G#%jSCJg_mD=P(v=PQyDVr*QK9m9JzYR9HV~rmoP%6WuBbHaBFxR%oDV#NRTuD%Ok%(@ zi2>InMy_mP<6Xh^MUykA(V;SqR7YQT}DJPA)1kjqY7IS6wSS1!U}H%ikU!n|Y= zeT2!`5hiCx82D52L*_Xws#Q*f%H~HdIX}YW{0NitBTUYZFg^AfsX5_qLtCn}B2`v- zzoY_us(lmiC3O@G9QoQktk)_a7d97^uo1F@T=FQGu)G#TBn&^0g!h5y`L2 z=;noFSxdT*NJS81IzU)4l1@`m&~Q-oWrQ7IO+TlsZpcV2EAUKNfosZ2u54M!C6|@3 zys{E0udGDMD=U%YvJxhjl`y%igf(&XYp|s8hUaz9eUF!K&y*7Ql2T$TaFkN*X32%G zoN}k@AeWQFI?(!=^>f8ZXHD8gBsVptb=TLWDN&SE>HK`;TCdI1#d|V&0?*_LT$3lc zvU!p#FHa(6^CVX`PjWSKSPmd2T@HSKc1%N#OqS%(7%Q6rS;}*FX7nqKXm-+KQCapsX8Jw56&lN=jwPRir9`gD=*4ckFI6_AayQ zP)9)uB8NyE;DA&kaWFTcUc$}C+_^Y8RePxrCoYv$s~1kZH+EvXP|=E!_RX7pZ@xG0 zy>I5XiHRhE!PRe_yDA}n;NoP&`$~7367rDP#HO57Xr5BkBRo=3@=8U`tCeUzN=bw; z)m9#XFOIy%qPEI3JIb_@YHPddqn?4}V_=Ed@t>%jU~xOS6(a<^DAUUDl_GC~AM6<0 z)doB489OO=s7x8`0mK{u6k*)mWP-b<9n$qsaf|muO|YOQs#d>9{fZExfYycj zLz(MjpRVqM5Lka7J@vmXXM&2C@f>d{Fh!XKOAl&SKDxSbg=M%?%y{+So?Fdqa$d<4 zidKO;)qn}ZtA~@PtAfAbcw!3Bu;G3b#146UZlMbbG4?4Q&Gg->n1S`Gg@19J9CYM> zVhk)|97PI5?~Gy+*BC$wT%APuG%KcId73qXVzLs{00qcJ1DJcmjtq2Yv9hav71<^` zk?qJD*(Pfwt!{mn)M2c;ORlHe%yNYpxIzzHy<&;jif~R%s0#Inp6?=} zikJzeISXG`1MyHEeNZ{*?-}?#RJ$-*_)?hO*8J!JjO*OBOzz(>r1>obgd2n1o0{H> z%gO>6{@Pb0XJiY-SwzPQLryRB23zdE2V#eGW@t3^L#jh?`l5?dMexKX>u&rKb zu7MbB9&D52WsH~r>u^@7K{`&=;x(Izd2l(0ZhaOmEcQZ3ToA8II>eKp=Z8SExzTg~ zf70Fg54!Zz8)1xBD{{yB7skl$zb0qI8<-y^((fqtcuW6BfnT7^Meq{%s>*_hInVj4|-c1*q^J9_tCLg3qw9t)=|mvd$XaL;_hELsBj5^6!+bE;*5w_jlQ1>Oxw zUI`H@ejsEk!&spJ?`omYduQDH}h1g3`4!RZ(OV10k9i)!Z`iMZHUZj;4k>X`bZTW{cTboChZ-b%lo9k#vh!H#^Ff}f9a5{LmrT+SU7uC)j8GpMyf3clh J{yHW*`8TffrM3V7 literal 0 HcmV?d00001 diff --git a/__pycache__/_custom_ops.cpython-312.pyc b/__pycache__/_custom_ops.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..043eeac2d3c95f13e10f359e230949c14c8b5d19 GIT binary patch literal 148594 zcmeEv31Cy#m9`{HvTS)@*cY*5z;+A-j9C)16SlA?5GrIp8Ei|k{Uif+?bJ!r7AI{n z-7wwaw4K%r>EO(C8fT`R&}pY3(@rB{QoFBdo3`on&#Y-7)7k#N|99?v@99Z?SH^@S zGygvVUp>9MpL^~(=bn4+x!=pq&a~hYus>1%f1b8len~&H&kFV7#sdu&%Xy1rk*onr z(0bS!v>mnu(+;Nv?T76_$6-g%dDv;Ccv~R-a60>T1v6@GR!cDRu$#rC1+s$KhqKwU zJ&+U3J)Fy)9f7=H{^5KzcR{d_DPf}wi2d5mK!k*IuQ-jkEPgCDd zKRn$(tznJU?D+TLnKp|*x8dIS8|~?m`e3#At+HKmJ!UyPE0mr=eaHyR4$e9}3uU|g z=?(X3IpW_^mf(H1WUF&ZIgi;7&%e#QxyZZVHuL5o@4^gAk4*}XyM%myiBy0;>OrPd z_?YAHqH&kOdtZcBEWXXW#mKwlHuFwE-qH+<_76~q2uqqM7{5t^_mc(hr})dHsesxv zLHu;V`x(eL6Mr5-ysbUmVv%O~%ca?Z6mtac=c3ek_?wTv1^8QtKN^>XQVGWNE)6;i zOT_S5Bq)8c;QbQ8`~UyoTqczQ&J~i)Kczv6Qw;4%wn=3nt5jZR6XNFoE3m&yfD;_I z-@X}8k+R;Um{T+#7D>x6PRoR&Ai~_`g3@n?3Q#+8q`N`4mb3ZTDy_peKn}odi-s`-$wrmDcdiAfO-D$ zysfiO#^kIkkT#)zmG4mhHk-Fl+9G)W&ckDZv=#7JC2bS5NHjn06Qt*uv>oHQLy%%8 zjWe4OyD%fFZlmTj^O9+NrLk(>jh5YSo}aa@C0Ww@(EBZ{CGGEWOS;gKt4{5L{`e!PCf-Z{5W#ni4pn&Xx10;_a*%O z0scfIHedSkV-9^j?_neM6>Y@sAonxx)z}=d-+)hhS?ez9trN-8A`U6ygd$$vCKwA5 zEVHCPG_-afYwabiwHo&GZ_I%X;T(8cfIVLY&H5w6nMp`|eq0vhYsa5y z?a-xIk=!8~Bhd&DbVYC#K{WMP|M{4ZE|9)qfb>Du!*6OmltfrB8aKQ27Y2A8Vx>K= zmv%*f72?vqWhm_dlr|pO=`S_@Du0x<WMC0`xLHk6!;jaYgIZk~Sv1U~IVZr+s z1n;j3-oGe#|JR`8-%CNwCrRHoK=r|KLiKMnsPtMr%ZPmLvT$+UqmG`wjfAG5r1w ze%Go}Lg%>q^;ZOvMO~isFVL$zCq< z=RHf^Z&9BMq<=%7{f1HecZ_14;rD;w_i^E9j&s*c)g;=b|3s-LR1PYdng3hR-~S_c zf2ZJodyveNeh*02t9=qtLjg7Du}T*6ULnRRjIjx01p8Dd%?^6`U4I##94vI2m%&#o zbwx;*j(#jse=|KCC{ zjNX(3g6|!_4j4J_0vtM|E5L5E`72s`kwjX7oSoz5LQ8cnbhpZdmg!tbGzubaAZncx zty^y3GY<>bfVWdJS){&CWJQ6RR|sLb(mclet-9W%zN|!zUHBHS97J^rBxRH(SUVx6 zl1XT{(^lfUtI*a*;?xbZO9(5R4KT+#jV4#2C0s)g86)LtXwgM5;3eGNYM<6^WZC(Ju&3vvEV@12w}+oKmSR#)-&BL2qQ z*FYQZ0iHajbH7JK^C>lsq}D5;mV56MbXA=~9LCjv@dt!E6nqDTX)B~_7%#EpAfjP3 zm(~MD9~9Oh+8N*)Yq4D#VaNuQ{Gqp{S0euY90&T&YCp78`+WII`Z8fI{h}kBLxlb6gT^O@N7_-wn zhR+$aW7in*yT^#XAG7$bG1BZABYy7~@%zl<(fa+sqZv$halkx{-p_+$#2-o(UpE(Z 
zJpiimVIk!fNNLFTUeR3v4IOFx3rZwlHjnqfp`9zt(EdV94>qCJvv zYRb`y+i5+m(~hF`XN7!8AZ@T&`W`^LM-os9luWu~V3Z`-t3?WdRX$>liN*m|m?W)` zRsK1A*YG;k63VR?wxGd0 zhMjMjf}C`w;N#=P2kDMVTWCc<&`K`l-bsuCM?-x<7KT{ZgwaT4IXwgE=`_SAQnXV% zD_YEZhO%Vy7&bCQXTqrSlZ?*1AHM`^1;n(nUYukhRylZcifQss<7|q6z99{4{n_3} zJLtiu#y$RgTtry#c38Tmwsm?tEL{ayCyD9Ed(B4_&=%k~Vmg3*949;3s67m+;CA(k zw14wHV9j~4{sJkwG+hNOsJ@#jWy4*X?RESc`5!Ul|FWv>cyugn2lBmN2ps{%Mjip& zM6_76w}z4)Gp}6|VU9?La!ybVA2ZXvlyO_GL!XQmgLfe0Ga6We=MS*=blO8CpDaSE z4;ges7u4}NqmR#iLHh-qU)5BIX1ZuE^M7XR*jDXJF;+N}quaHAS`x2c9v6(-B*A-u z?+VoTgdoT5_?@7Wg0y>z(>8}c^WV1hA3^IsJO0+A-yapga{Qxm*1V+DUk31y>Fo$(n4QAIeqy4w0V7q`HV1TtTuiYWk1CvxW|NgToFBnZwb`>IkJueWJw%H)~-R|4QWS-LOlnl{UN@!ceG3SXLxrB-+w*^ z_4|gfUEgH&`~~YjZ^82@^J$cMg~fc!Jg3O>1oHhQ%lB<+xr*)IL5W}GC9-$a^1s5n zKVp5e;n#QZ>oQM)ka|-_{DGyWzQ2I@0UnP$E46s-TeDt8imOQZ3{Q#hMWj&6Qu)na zC%)0=&G(S+kCEg14B{*OW#;`h)buyV@h8af15rxEy@WJh!#C|(5HzUgF}(XzXt!JO z%%OjcL2L+VwB5Y2I^u2DfeT;9H%3Cw{W5Yt%P{eSJO<(T$LNQw#$g(Db&N((*PvFH zphwjDAEEx|^qL92LhjDWrqNT7=g;)9{xN?2Io^ub&p$zbCF=8Sc|#z`{SN6aO~Eq|54_2x3E-dSH^B9rb3VDO%pTA?+V2WvV6A(`)HvWRV>f!kxkQV#3dM-}Y{t+pj z$10fCR=4Q&044mBrsI&wV**(f{7`%wt4)I6m^o_uCYFh`7TI!#eTG zxO;O`@69u6Z{9TY=3mU)j9Pyuj1ioCK%Q}GY>?yMm{#N8S@<6;{7>_Gb!dssA@F%L zL!uHo0JLa_+5Ov{ofYt9HpM4;7R6G$c0j~Kx1AQvps?mkUvKN zd7G4r`Cx+$K80F;2;%wZ&((Lr&IIy^O)5ZbFXB5D^aT1tn&QUYqDF z1CPJWQOiTS$am~t<9i}_h7IQ)%5c_zd-W1P7CyaXaicxBsjve>1faQ0U^^;##^gU2O@%>XdKAzCP_;k zLAx}}ngKbo467*SEem}XNp?0!5GClXpghFPN>vks7V!Z(!SFX?8rNfO_-uS9MSDWN z_!)-XDhG(4VR%GP^A+>=C~clFX1;lhlyCkX>r$Jvz(Cty8n+F{9c+z|0%xI>3jz7) zZ9u+6<1exX`9&J1c{lsYAivlE`PjSJnv{@V0?1#(2$X^+58cKHl<6a|LB;*@w=e<% zN`m%P2ymSE+foChuir)=mW}Ge@^`!su;12ql3qrii1V!cEOu+n`z?v-N2T9}ysNk# z<~CR2HmT|{=iwjT1}sNzVPn?D zBN(fbBAFIE534KT{3XfwUdVPo5|5LaS3u;d)<9C*p>Jo=eu+)mDX2v#m8RN(V$xR;T1XzGf2!*KP6HyM zzF38Qv5NYF(s#p;u_ z$JjXeSO{BJSXbDjTJ+5@aG3!{7R*!HrU6S&Mu@o zE{r)LjHwsKGzeoF&0`ql51`$D%i8)uwCK*ZHE3S1uKx&`$D>*#!uh{BBVJgwn_3taoBf;kK=KAK+h)+K5 zkCdKnUAd~VaizaJP+wEt+!i^}6e>R%2n5T$wXNYuQ_$Pg94>2aQ(S8TzF>{yTh~5Q 
z_8&(F{Ia*s*XS=>6KF#3!t3zB6HG!)VF!YL?Xp~0e*+H|IUhlZRi<`^slmtOB7{nI zLQ(QWyraKx1_I>%aqf($Ylb`lzmSdBm`Y9EVePPJOv7qvv&eaf00`3n03|D2A87UZLXtPw8n|ea zi8skp5OBD9Sa;+Z^p*k-{%#Z?=(4$pGO zN_i$;B{Xj~UdR;8r3ViM1lqT$G5M@95nt$p2Di~IVhy2SA^yS!fK86sC-=_k-S@J6 z7Hgy8@CBMr_!LK=K3E^AwIjLqapHju@^U=f${=^xI?^twB^U;pgD^jQs{VMOwaHhn zwYtO}A8mOa>X7Fn2vY$beE7&y`(+9iQyf9H%kmnHyw;JvdHuyN+sj7-oMtU+{-b!n zHFwqyYda>7WE%@xr8J&Svhy4~Y)2{dURwrfzdM254Kyi}J{?vqrPb150d;Zg`ZZ7T z>OJbi8}}UViVZE>bmsMYe*HV^*5EH*`|>>{X^JD#B-fr$>@>oPqqeEJ%{yv8tL~ip^aW<-Zf1EEvW zM@oyR=wh)n(0C<%+EJ?p#SR!u8%8cODJgy;g+rq=G*a3KX(edC*@$JE<=0ZGBj(&C z(~h!CsCQgVf3LhG{>yc5NxEE0Ny-o?_Mk6Pg4IQo!x zX!o($#$&O2kHzNth9=j<3TtAn+FKb?8W1+5C-HC|NZO#2alBAJiD=wF>61RGhZuvx zrkw^_?OO%0ZP%||Pft56t^SXDKn@62fMRRF%b=>HhwG0Ao9d;}@XKWIYiX^oZIo9d z7u^yWrg7@BylTK@JZ^nt>xU1VJ=%S=x1zW8!m6R{dDrdp<4CSJT3Y>bo08G!Z}Zmr zYESqT_eo!%)vx}@3bh8kjVHY){5~nHIKtq_ekH30thvz}@zn(UVZ{~pw|E2oP*}<2 z@5A-&C?)7S?NwuxEFXk`dT%(;)Qk){D34X;l_HQ_l=Ma~FE{<9_7G|I%DzyeGKrNK zZuZGxKSUwa?&WWkiN=_demUYL(MR$t6O0k;*~=O<-T0omgXYvr^`Ya)=x?qi2}1i6 z891!LaSN4c2f5cTd;woa#2=hi<1=^blBd%QSzh}fX zbHp`c#5M2D1w{@g=ho@|)6L#+Yf#A~^nz?$9|3+jj^aydYfMK{-i=n74CT_234AE`AYR@}0l`y#`$m>pnKY{EPd>N@sCo*T@E3jv z0S5Oj^TEZ~VJH#a)+m=w)J8e2uDMc4tE*U{rr9*0 zIWZxcv0BY;mi=}0r)hrB&rJ1aAnaGN!>x73DD@{rW%A@+JijAAcW8VtCD1vA%n$mZ zOdw_#A|ep+hXHU25R&~A)KNg9seFQhdIZS;mm4XDf}c?A0t5!=a_%%cW}iLc%IV4M z-PvC?(0Qf70Nacfl47`c1KiY}^GM{VCwZhkB>7K6s>u!qeBl%N&nzfkBL34LR3r_l z5)(apcP9MW1vQPYHS7!28W#YhonZv*jExY!Bh5@Ug=L*8v39xyDRfJ_&J00{G`)<> z4x4E$@f2>7T}UQ^&I}6>Jfn4;SrKX}`OwP%IbZwl_YI?ji_F8@WwEdF0Z0gLu zrLr@}S-RSi&YW8+{q|a(drM`zZ?olj5kQ`Q8cddg4Ov=j{JSAH{;WOel4p`^dP&WX zRf;N`*4d%^bL@iuQm8cv>t3g)i#Eluxuphr_n*y!DkXMaUf3c|^CY9qT z4$6h4PHGhee}}*DCcvr7@<;chYd~ z%vkQso7w3_&aNF`H+ovr0$C!H0*gci79@y>7X$$Q24T+4 z$=MF)8`CUV6MM7z0s}ju?t7#5d%5gCK^sD%vB0&kIy!|q=EPst8Ye{_27?h((#hWf zG0hZ^$~ZA~3sS^MW!9L)G@BNI24b4di0N1)i584G<0KjyF&&E}@v_HDsx@MoHwM9b zD=p8zrLwcf2^X#91$T&;s!MpYh5jXC==gx3s;P*e%B^@+;*#HsFO_2YgUyjPQ;NAT 
zHO2HYNr`A5v;k(c^Fvh08466OsQfTRypMt|3Lc^0Q3~Eq!DAGBfPxPqNUaTcf|5}% z8{{v%ozuW6|3d@qt77(5Vj2iV$J7K$I6W7X&|^voGn$*i^`!d=D{0W%DH&XyMS^B_ zZBrP$5S)wY^3v2$5Jb8!sgi4a$*2Nj zIs*oiN}#7p?h>YZN}$M}KwIQbA_!9jnxMfKkCQ~D%DqV2NPwD$2LLM5;sBs%Vcu6i zP>kJ=2S*mI9GrM%!N{U}-gMX;)m9F!oN!QvQWoo)T0*GWbY#~)v{}l)Q-z1|f#;H1 z&M4qaWWX`Om2@#!E}(jjFjT4v5e|@o(Iv^BMIzb@4$sB|KmkbvpqLr6&+NY*Y~W=iWtFH? zAYZRn?_|2n`u7E-ig!}}BK?#QD7ktTu8s(g9_zZ+Fe$zzOez(0@>h`c0|XJ;g~C`n zEi-2i&sY|lv20+|;EF3NukMOIBn{2*$Lv$PHlNu&;?D26C+eOZwa->TrRqLf6qowc zsH8Jg@V00Vjf#t71zu@Sjc;M1#PJm;Ax7J5*OCxsK`Uy}MDfmMQE>Z9_1z)9r zAxtvBNMJoZ@xVIGfGH7FAWIo+hIVaBrKHljS_5;@c&q~Vc!HH{$s5K6>HI(6SH*HpSlO) zZkC%MI(mb?Fh)qlW%i>;_BZ<@OkD6b`(bPpsrQG?>Do`xBJuj@X?^`;whapV0#iGH zxROfSo9;}D5UR3|nfiy+gY8JsktU^GBJ{wg(>_KXi)3%y2*nh(*J#(g!>;FaK!S0! z7oZH--oOe$K8f-!T9q^)E!)~=I*k&ClF=FpW4pl*DGD|}&3<3RD43r*;g|i8X#O0z z&CLN=GMEg8ml-_BFQBNe5n_=@kT{Y;7PF*!L_1@vBTVhkUjJPt+tT?;6dv#CC6YFlI*`_(8_DMHgmi!6l-RIoB z+xr*CrrjM~x&7+Cp}aj^+g{5lJloja*gL zBbym9G{=dD(3m?tHAnNrQIm0M3UFPtn~g#u!&GLbD4@1g^2y&r$G$`0p3A^}Gb>>p zE{WNfkV@FydnCGOU0fe*4kQl;5=!GAO=vcB)LLTmokD_YLIY164LmyTcp^<6KAS!D zp;B1+)%wdaGl_^)%g4i2?1_HyG*zDcC;VD^Yz2p#WKUC_Cw#)!?DrfY;tf@Gc!}pE zHqC38cI;@GUW9>IRC%gBHBC(czc1ubi(%>&Uj$g}Id!7G_5>Q%hzbaU{XQ9u^F&ZJ zFC$*2f#%$HnY@g6nQ<`1Nf+;UJjMV(z5fg2{V(CA!Jv{&F!`@BLP@AHMT{!)-yr!* zgeA142x>je;+Qv*RnW7&Z)&t`J>f+rmn7URtv(oMa&4%K*)w=sYZBI7x;(uC1>SI!S4DHl(5DP}0Z1lCzSc z@Dqx9jXtLvQmVQRWBLY?4wF6&g-I{)6ALR+MW;jB#8yLGIHCm?NtvME7p+V^^7`-b z;k4YwmCK}NF3IG2luV{a@doM}{qo~@uQ-4a{s0VEXnwHtN_t(O$ro9+)S#92=t7s& z1_RgnTA-Z_@(lF%vhl_0Em2IDe}K~EmndMOtwfbM+3<%)FW}@6N=U&k2&YKt0-VaT zJ!s@=RLXj!s5P~3+1tllKzUQ zt8~EsY~Y!|_vgJ-`eG>*zB$gW&ENq>n~RE=y@E)XqvvGr$-YwquEB*!2&ID40t8-h zW?j@ind5GH3m4ktA5rr<5MVc}IRNY9MlYEvL(LF5ahig*1cWA}V)7fUTE=1$a-L+B zYyn#^t*tDUxX(+VM648;=3>9yAPs6a6p(!kYIB9(E0(QQ|zEdsO#bx73`*AQE ziZq32=fWR48CD7p?%1)@dtl!t@BY2p_V3>1tv<9zSxU>+vLH5o177U*)wY6Vmj!%n zfK8cN5C~>dsEo?y1rH!e%wyuL&Ibp!sP+%-|xnpfO;XIFKvI=iNO&9HlZ 
z%sn3{;T&`fc3!QG?mIZ-J`}Yd;^^RPVf!{p=5e{H73MYoQo^w7bgU9J-vS+DD`5#z z7-T_}N?|j}wo9??+NG@+(@(Y5FZWfffP}>+SR5>`p%TU2?1RBs9qh83WcgQkGfKX4 z5oC}1gF*T45$yt13s1)bR>|%hCM>n~9`8HZe{vu)c=&43kb76uzKe@JM#8CgjV7}Z zqvdjglGu*Oq>tI|Cay3Y_0c2)5VQnv7_12OSrE@G$|e6AX&}1haD3gpcf(fi?)&$} zu?}n4({*HNFTYMH)kSzBaOJ;46a%Vauc`NiN8y_?QMK4V-W+Mc3PXl~%R&7Ha^?`I zx8kA8@~iaXS8|Kawsp4+=Xzqfp5feuvD}5w)1?nixH92t(be|o5${m$v8e0VYqdVRO8ZWC|MNRD|C$vgBVzIhB zt+=5gzyuB8#DGu9rXMGldg-u)4+i};Fz_}4S0C)XXg$_FPT3vjvhj7xqt-EY9}ENs zYC#>tah{@mE^9QQrNy|y>M_rCt@fzFZHCH<=h({7|$v(~<0u{zeWkzu8@XL~nPRq0Jj z0IRMh`!s49n>?5qkH>-EVKt4{m!eE?Q8#i>o>wd7jo6l;8-M|A2*s$q@2sJY96~ zi9-SdU;2?q!G(+e#M5Z7D!CjhX?WP775Vo_H-|u6g$GdO36{KKLiCaBoU;eJ5B5|) z@nCP}lergi`>jKh7Dclb4^%w6@|l%aDqg7kRwdYTUgjGXP;ZS8ll&2^RfQ~+Xt9tA zkMZ24co-YHw3ya7Y+U!Dsp*(9mW^Wg1nh$X{*pAV(y)NBa8#nYu;}3G)+}RMISq-H zQD2LY#&|4YVc+Kd4KLg8QhQN@y{U*#abX!k=EGqn<5ZK}2&3LwKfMIQBr9MiGmZ(W z-y!w85%;^tKUmrG&eL!i$+Owcg_$vmnUt&(S4f;@ax%%D9j;4iJ2XwFiYqinx%#M( z!A7NyOP8}LtJvzLiy4M#YbUrmD1Z!Pk<%$~QIJ6aPL%N@O0>*lAFembqDTr>5wusL zN`_fk`EiE6IOc{iK*pIW2z#gdCl73WcKb8iL31ZW_Z)(bAj=7~QgueGr1pvW1Ws&n z0=#u(dk=>Ucl5_`m9s6bN<48IsM6YJs9l7n*~JIz>_mZ)eAuB2pO6Ar>lYXAy) zBK$s_e#6oNifkp9e%CcGTMEUH)Qb7wM%wmfFI$?_LTROc#y1qODrc1OwuVB^1LWj< z1SJI=cMItS<2yz4#1Occo+eN*k%CDSFr=PLPl-q^PoZ~HDVRpVbOh{FmWJo?Z%$|? 
zBn)euyv{_%2MuGo=+?%xfW|aY^iGgyb>Nap9g=|QQII%O!mfvT3Y{%=rgxvVHBR8z*tto4jQ9{0c@qK3C*1-JP%{nCX4DTQW`QMd;@OVwj^64g-gm}6l0WJEv~$yX zeIK8B#yOHVp*R0r<{1acj3f(g?3>rScEI^;)-zdG4!&^oTSu?w?u)wi@oAZ*4Iv?e z`9{`vP~|NQo@0yw0UH3|(U0U$+;1Tt+PXAzvWB-QhpmCwSkfnsTwpM>P&pPtq9Ih$ zZ~`J>q4XL`!8C9k6J0^uz`!usl}-6VH&c%$crJ%vrL+PzWrQ# z->i>!p2<=NXC!+{?~eWjmzP{zGSG6Ve6TvUXl*oW-H2+2RygVWnsaMzrrYx}f9JBe zbGql8@jWsBjSPE6J|Dgu8onUeBt-mP(9<*!NTR5W_0!%Pf69R%&ljCsQXn~>{pK__ zOZ!Q(f|6I`tv%rjg|H@91&bV<1z^e=^*8izCqP#N{sen4eYSrA9liyG$s7wEI^AQS z6l|YvUr@c6AgD4P=}5*6ol%x#af-bJgDRI&P)0#H1$R+ULBUcAmQk=Afno)J3x!F+ zr_XUb@M)eKA2dG0tB~U(gf442x=gb`wB6jjxu@cZZD*__S-EGoc5m(3^ThTuR&8Qe z_qOz|=%00Y;l+i6vz}k@+=A=5>!Ysq95afIm;sA*w$fj(IQg*-CEth-{G^8+6>A@Q zX5#P;Jd7<48?=q<)&7RbY>c6 zhlvzO2I6dX7+b53NJpk|$tIKHyu^U~SUxaGsbU4w^0K8G_H5ZQnau!(_em(vCd5V> zqe==^Ay6`i0x{ja>CUZElmux$W?PZ9pT=w(nhVM~Njw!sJPV_q<>$73W8U*+&y}6Y zFq;}B*K;>UT^sqdm|`4XHYXTOgRoec;HBYhn*j1Ug3i4Npm8S!CG)SKQiZ7+JQ7>R z!Ll8Bo+GeKWng(Nw@5`~=hythD|W_K?7W`4E9&CvxAwWlj`~CFa1HrGko`@v1nql$ zD6+gpvDUl`z21S27<)ZW>$Rwvy#KqA?VD!(w|=d7xMFLpV(ax>Qn#>yRtkNA03C3J zy+kDxplsd9S7#;>N(T6*W?_LxZeY()*cA?{2GN;#f~|BI`e>zK!-4fXs`q(!diQVL zyJ_Fn-8(ia&J#XbWGFeIWlI}TqYCK+(+r(jbYTo{fm)2k0daI>oE%VBTf~+3Y=J1R z1b8(zw+GLBIkK0Y2oOw@Det4F{S+La;2;HuD4;nnV0DvWB{ga&_>ew`T;HJ~y^EnV zEYyum&ulU$^EoB=`0-%CulUleNmzApn>?3_yO(N5d!c-8kG5HP~(}?7A9J7nWEU6)NKxA zA99Wmkmn!|+tJ{`L+e-Ri$M-D& zRrV8<<{$t>O~9r&nC+|p6*bg>7)8E^WLp0HyF58WiIhG4!$Pnd@7YZm}o$^udYyF3#Vk_G&^}Npyvs+X5)qyUKq^5%@g!P zeu=!HG#)xRM+^{#I?p3iOAy6EB{)o_i?jQa3<3sSfgzK}`&#I{s2| zVPvpB4x2{f4x&zBK6q<`;v8dAI6Ig-o(_kQecsw&u&Hq|-2irC8yv)Oh%;bQcNn$S z)_^L#j`y8|`xFXD$Kp!^Hx9a`1oxoz`Sj<~uUcPnz3950yC>?}!zG{zWRC^TNCyaU zZlT#%QxDsNwgiEX)_824x1WL4z|Jt<$saSaZLayhgf7rW)9R11Lz|(Ad`$X=moC^< z@jX1O`kI$RG(L?Y3__yZPGF^A0)b{43Wu*Qm$6&-ceIv65wY?qNA{|a`TQk0gGzwI6G*Gc)m8a^V*;awZ zmhhl_!60^^CA5*R&`7+NJ1FR+;9&~hhhWqOwEPG~Q81ZcTEbvTcKrqqBg|-iyFuN!yEX{=ueY+D{Xr1G}CF2@b ze78(9uECi$@*W5+)hV=-@2N6T5f(K#s2Su^O}>ukNjQa>a!eeS1E= 
zI%=OEb;B!C*4dTaD|;%st9n|_pE`G{x8+=WS4&s*E19`x7k4l2Y3ZHSQ`fg`!1wHl zXHHztyr*l!EB1`IlbiW5`~1Fee|3M$K~# z+02|7%I`&s4fI4wO@S)qZlqsu{JIv-thn-^e8EvAi;xih1KPhT}s|Kf( z#_GvKx@@ie@o%l2omRub&cvpYcBLT{*fIA_tSjTYM4cItLap?MBJC;utJ=UEY9C;b%yO}u(6Hfzm1ixi ziQ(R_EErt4kj;3n&$GzmD_P=Ef7akD}X7U`KQ}q#;W5fEot|<^`Iz?@wqTzKo#ZVGs1|IGxxUsXyrt zw9RG(`cL}^={)ExDnH>nX@GzSJzncMa-?F3XX();3`2}1BW~f+C7$Iaddmnq@j0># zZ;u{juL0o@m3*qHHGo@lPB#;t`G6a7KpU!|npkCKMN+Bpf>j^BI01mtn?m^3kc9>n zC~4@3isxnAq<@izdQ`Er@~CI+T91#v)hpy_YYb_ZR>o7S?^$YY?nEMoV%xuqt02mp@r`q3Sbh`YSH4yts0p;$qdcsdo>~eSXn%i>_u|tJ)K_ z?}@tiz#KXw^K3dihFZHb;WyHiVfc}r-Ba}F!y}LxHgs=zViUHsgrWevvXZmKGkYL{ zuI$~?w|`*%mBQ%#558?jfZ1*vC`hy*UdOK_#|2}XVEL$!fc$ud zP%%omOTI>rY(0}@Sji-jM$&RC_LKE+X!|&&A7hOJnHpV5MJ_){g!5ArFd;h2$ijql zQ~oq%pddh$@@_nUQqHh|e5vc2UR(D)y}oPinSIq?+49tu{_3ah8z^{c_dv_Fxs_3S zWz=0c;x4*VWVNWgXV3XV=MMGm>#gZ|C|X$7SMimVPp#~)c&aK^SO&jFAhphJ=NY-D z;{3{UD|?roTRr5O2@>qiIkUQV&XXky*Fy-*7gZ~C8j|Ay+Mnz z^OlS|b}su8+C-mymN|yAKDbFCjc!k9vubxIu*IpiFA`T2Ow(Y&Ekc+9lPWzyPo7g9 zQLjGW#`8(bX!%Kk3I!h~sBi=LH*9u?c4C2y)P{2_W4V>lRlA3B?~l6f|JPUClYa}d z+2Wt)7QF$V8O8t3HVz+MxY%+jV`bXctt->d$u2;oH8N0Q6Ygt>HPO2pVog?` zhS*FFXk%DsKN(-UrdV}yDM7L_FX1M9im!2T&YHQkW^#yOU2JHB$(ppmjQ129Tgjhs zUxJP*B@W+T0Z@#6Y`_tBEJ?yx6!<*cDYOS0escY3tcPSeRKXWW8@O4FmIMd+Wp+9% ztSHBh_8&D)76WTXsLugAUkW42Q8sU{D88T^^&delDdE`DVVuw?N2-p z?DH(-OX~fs=_STBiig$C99n2SLoaHu)X@4Sm3emJJ`T2s!iof|625?pFEFVW9Umjdm7ZRk=On3y*s)5fjeiV8yyNtuol1M8S zf=7%1fcU^P3>+Ls6TwmKVHwbW3;-@z2^~+2N6UTQP@E0rY-o3IT346K0Ye z>k~G4sF>v@a9miAi*1*9j#1t)B(e#~I z5>F#37M7MATY{79@ZaD$7Cgpg=dnh7<2QjXU_%{I_B&d@0y{|Vo(=}fVvZa`D zF)WXF+3YZ|HWS9pRMf^aChS{ZHaj1=3kynsIQgU6mZa!|x@dMbmPaWrgDrlk(?RfHVY}Vkf&dwCR)3IHog}S zbznVMDy*}{SZCuBPbfv5HOY0d^rAYKVI3^Dg6(VFLsgIvmR2q2;3E>v6uBzZ_Se?e z)dRLV(Nud%wKh{huwn5o^)6pngBt6DYwyx~w;a7wDH1CSw`_%=U4~1qq=7nCQx;iR za!f0j&HW`Fb?&1Y0hwFKvgy^`%@+8C5lO|H;sNmijb5Tb92iU3s4(XsqyeY37k%?n z=nxw0EoqEQTJaAa$ohXO(JC@Y>jVhNE7UR%18B$NC65r}uSL%syEYut>l8B;>Rp!$$|i4mN{wB|~WOjD%?V_9aJj!F9HB=t{IcJx#6 
z7b!4P5Ra`JmM>9W3ciV%AO0yC3PuGp<*Wj7ZP06lf@5zC3XY0v?z#OFE>F8SZNPVN z=HSv;$=WOV*Gkq$?dzlN^|z|#=<{7z+rRnpj*B}6^DpkcHg)wC`wN-h%Dh^C?Ot-2 z5Op7VH9=wQ$~l|Woz=6aJ2%y~@oUCy2da^_f70yFtD8a>2O)Ha@D)2 zl$-=>kcy<=OO<@?v<@qsU@Ko)CumEO$0-Ql}Hicu7k`+VfIPy=YYcXE3E4 zgs82#86=^EO?{Lf_F$nH;k$rDPH+(;pZuk?iA1J;Dv(L_$$VIYU|?Si$5$IY_%^s( z!g>?;3aC43EH9RA$LmA9FWS-~{*8-p7zI;D+T*K(;}%AaxHJnXmD?Y(B{Bm@JhS#R z3W@mY6&sT^C=6_mTOq0y0eZAn6_lldXcMh)6FYxolV`D~;wTo7$?y5@TPgXLikm)R z+biMs(kjo+)4NV@_b|gKY8BshWI$RVf{)fk?K=@en2s20dku|t?hKPUf$&ApJ*^z= zn+QMO{01(?*IAshmZBUWgK;WWk^Whmt11@IIvi_rRxxl~J(C|FL~&DU{Fn{2nanmy z!KA1;=!91NXi_lF`gsB32B`QZY`|zb0|mW$`zcmp$KjnR6!A3UFcZy5a8{TBd0=o@ zmM~FL)SozoSaqCzj&~x_fF@0*_uSgt&$km#y+eAsKIC+kq%4STVExBwIWAX$rnd!5>j@nSucd*jb8a=;@CUgo!?D zxUTuL{2Ikl@DId_*C7S0%&}ZGIwqjy9P?d?yS@1T$s?{Q%(-O-HURK^qu5RE3EZlC z0`t|~vA2!?XqJt{d0I&@VbCX071W#W(bHrC_137qN{v+d6Y~4X#-j#o3CT!7s1Xq^ zMY7@&sf|usJa>Q>{o;ckQ%P+CH&PK#UC!JBN&WWuUEeS1-KqjaZBOIQZj?VxYg=QWt6S&BfP665;hJZPzsHQMwmE(ov`eL4kjKKlK-6YQm~vbIe==gddjp+oj*Kfacs)s z{_5D2vZ#IHE4U42`?>8S8Pj?L!!z!R&A4kIA5OKSuH_@SQ-^bB$8u+n!MeO|sQ_Qe~o;sMmg6QP;#8`j!PX_L?bxVb zZff8g4bgPs=ycdaH;QPMbtEN0Z^cyL>Im??G<}PMiTbwY!3c}PcuTHNp~)Gh{7P){RPeW=P9^CfiBpM_Gc#lB_*I>1CfW7 zj6B>dOJH=%V{RO@FiLX}=-`{2q`4dU86?VOXx_=m5xJKWT6q0}M+dfphDSL5P#;}B z&3Omn`4zq_UVns{D0X-yx?vBf>xBzQeoPM68{#KbP588OTXo+u=<7w`uy4Q*-&RDEKY~%vl#(>ahb>FVMTI6ud~m zUsLcs1meu1!+c)E^&03M!M?I|)AF6mp!owZ=QJ?g1ex0ldo(tH}^Jm7Smfau|gJoh!Iem`Q{!QzLBbhEX^h4ktM6Af*B$XAiwC137qsMH^B3ZqpOiJ)}$>G9}*#8(1|29ERB= zlT@DuBiV(~i3?)c3;HKSm#&YMtiRG2E7=px+B;G(ZMa}|tYG#?;q>9cxv|2z%tzlF z7JCNwF`VGggFwPi^e`Uo^y2!GIt)St%_o17DzH@iR2`}%p(YhXsL3(&Cxn?4yi9PK z!{CH_z>Jt#5_6aIFB+&Dth;hz$h|vi-_7xoaugB?5;#b5{^Q%J1Vu=JzoHz#PebhSWbH z;HgNKbH|a40H}S)aET$=a#Sm_l2%QT933{J(H6H&9z)~F^KeRFhoeqB5#VpvStsZY z38L(S8%fY+`fRiut;CH|=>|QhaS6&3SsKjDp6JkZ(g%9b^vQTfH=nR?c14^<(aEp! 
z8pM9M%8s*02A{9YQiB;1GIe5xP@SkaU~6;SAHRo!QDpd&7g8>!6_-${V)c(!6DSjm zyo8jLe7?%R0(kzBFzHLk3UZZW0oh^nr>`PbFs}=0!2F)|-K%?H`!_XepBi;f)o(f} z!qIx$xd2>;pBS8VF&LYRODoonZrk}C=XMMiEsGT`8_Al`)6%`OcX^-n!b%jld0;DiU>9eeah_>;JmZZtOOEFa z7*`Z@aZg7HEJ$b5Oa6PK=km7!JvmxpeCea3i#%<<}_{1rurb9|Qy;elAR4@(W|`gSzFn^iP)`99u zmV2iq0-}9%<79r6ibDP|6BTaVYY}z?&CGxGi$&vG+Wc15QKLImb3&`1pF$V7r{pjn z=yZ8CfxMK0H5B|E1%FS$KTuGHV6>l}f1)S~*3&S7L((u6Bn;ECn0pxq|JjUg=!~H} zp2V)cCN1)fTo;XkOC5z;r-N&-k{$K10S<`ZgjsOlOxOaOovNS>E$W}%XChL5`7A9{yi zo4gnpg`xQxJfQ&M9-$dt%Ig7%q%Fm_}YFv+_`zFN!l8R8U-JCjpLS0i7trc`_MRp z|0MZesOuDbg1XLK5-0d$E5l)w{*yy+%4T1|)`kBix~H4Di+l~syOKKkuhdZrdIcRN z6P3aIAvfGzt>!BDENr2Xg#*qC1;Xulq@gjsNw7-M$9V}KXH>*LX1vUJf|KF*MH`ne zt|Jeg875Dn4v;EG^D|hW8c6c?@Wtu>MxxJChscK!E*a9DNn3|`z*yMx@oka2cV>TL^Hx60zMn@n@`oNnA6qDiN``|MK+A)(9pTRw$pG(uq zVqWb`QmI}^Y?3u(mF%5VmQf;7Yh`vxCNhjKGG%kW0p=_f{67%nv|u(6W+bJI;u01 z#brv_dYpTVmSl9erEHUyOfobn~I!ijmCW{SfbMtg$%)DkNXyN5Wwh6|@?gPwjQ z9BiV^MXk-612t+RZ6GH+Yw#vsR1Y%GU&~urjmW_Njmn8En8Gi z;t(4L^vLI7XR{gh8V(gFbs%x`kZd5(8SGRTMv(6qvtwcH^B8Q(U@sQ*knuARo4FKN zx7Gt|D{A9$+;pi}%ed-<`ff0rioZ7 z);6PjB9p6%_Sp>oB=Y{t?}RQ}!weV0>+s+U)=0i}*ND**h3>NS?&-_FaG-C`g$Mg9 zV$+uNw_KaXuSI;l3-R(7z_+1PAr6MCTEnpLtPRQ^z)K|^2CM9@>K{@%YIXdaIQPj0 zLwmU2q?`^m%8CQ0#}lrz&P#5pe1YdI`wh|69KXxQLab%S29M3uN; z{(x&aTAP&&_C)@CHCLaZ_`8sj#c@|vijb=j!czUicWuCbaNi+E(j`#+DoEXwc%)ZRd-cy{^R$MyS5n{u55bYzHi-kWzE%==(IgU_C06r8JRHk ze8ahhzUq%Poy+fE+I8Ou_I`Fk{ky2o|CPq48V6=S)iji~qH7bCwYq!tBOAM_yIMx_ z3y~7nY(LcfP}k;H-C1YayVv#Zx#pe$r#R8sJFoiS9_ao(QP18X{C3Y!Gk^B(C)ZwB z`}DlaWf#k$Q&z>?tJorPBy;7U<9YXU?t#uL)lt`GEH53-sCyNRVsi@VChy)^z=T;B z4)#v%KlqKipI`kP?)l!*wfQ&j$zRd6^+$P=aOvuXp*&C5wvn9No~rJqt}XO(V&ANx zyt!T5Ucm)htIw?-$tdXA)|Y<{x2MeM-;BFt7C*NbX9bI#T|1I4|7OOw^ZVxwOpD&L z=em9GNa0Mr|M#T>!}BU*^D2i5SHS}l)bVSg?wL{hOuitn(?PcceK?6|?g6L#kI07374_XX5xKq6>lzCL$xmB~_5dRY|GqQG3#n zVJNXkYgzo;ycJKo^?KZf{3fDggNabrKuhPOPX{hGV@CW?JsR9WyBlbglk_pvNYps# za|slHKAmnVlZj?oKHBDFtnRIgW=qKSiIt`DM>2=nMuMJdlVl<_sJ2n>itwP3%S1CE 
z-W1iON;M-=WdvO$h+9%=R&~i1RyZn^FDYT?kThKpZ6<+n;~afW*e{=ioD?r}t*3o;ndbm} zKpot%W2e`&Krhw;-krPI=@5VrRYUS+!dDqH6nULUCFAP+uABDWZHU?@GcGqTRygmG9bMbbw7lvr zicTsWu*W7Xzvf>4hQ;A!?W*>&A$5^2cAFj+!63& zwn6Q*6i;DiXk63#>@n9|sNi2g#>VbO{Rqv?3h84=niV;`m$GAWOH zozrv`u_|P5SVf+nd%$z`#^npl3)+TA1 zKmdITF1;nTo4O~G0G5VUnF%;=t^H<#2W!6um3Zs1GioAwXxPU6?uQHRL*LExq@S^9 zDIdKZIGgFM1nn$I_k9-=j4{F#!cFB{&^xJHyYTLg)g81mur#k_#i`R?(|s@tbmlA$ zsv6cHn4Lp2!;d{OlD>r8TAZp61n9(yRusjMVu`7L>O9Eo!L{b_lNjNCbC?o=oWSs$ zP_{_8`&SOuhro3Wsw#d;L|2!Q7kzjff^&PWmT1zP1gLaO^J>Fo4poHVX7y6jsSu7< z_~Bt#^_NJZL4@^*wZ6t5^4Fn>b#hY>dJ;w#-@4MMl2t$g8v~ZXuYm?4uB*}yM8mi} z0r-jey{$EdB5>Cbj`P*9D+fr=lvE^yDixG^A=D7c!>B)Vb*x9}2QJy=Vi-I&?D}~F z_}TDr4@PzMvQbT`7eHmJDoYLi7_5jV;3u`FK^W3=h?q{5=Mb)fhSq_UM-ubrqKez3 z+tJySUJli)YNpKam%DVnZ5NRul^dIX32jJ(zp56~o%wOUNu-8qFVXNd1Z9|RW=46+#AX z&oGm8R+U^XR3>R@l4E3Y^{-6i^n&Yf4EeAL{W+@aUJ9m?ME4RsO`|6g1$o2C;s5AO zXk36tG1TGvFR&^rmyPS z_M`MkujCg+i|6+{F1s(f2dXdT4CSvJEFNrmZc;RF{TatddUn^tujEfSKlR+y-UI!2 zUtV)@&0yR0{B7ONS93kl+=Wrsf#{+GLkM#hj^q}zJL=o6(ffsX^J;olSLcu1bJTOt zeZ_q(PfhBt=*u4PKO1-^@cnr&mA+Vdz3726n_hvkXy5Wv!-b?H#?X zeJ3CYX6L+71Uti?yU(n}>F1l-sz})9>~Fo?ak1k&cfYXqTWhZu?MW)8B9>J#Fn@5~ zmE!AJTW~sCFX4@J^$N!)Iw2kBLJ(fYbzFT%hg{2|xwBqzWs?C2`JfthEs437^dB5J zFu3WuYyGPb5qUGXgqTUDO}J?gI>Q88XVO$KeGhG!iv>WEDLU!u8GI*y)N61Y9S)e^ zA`&LNKxJ-J#6zlseayBRA+nA4y4bjgk)FD2Y;WR%LkIx6R!M6KDTB>4Oa?L0 zg)PU7jx>2?hpmf*MW;bvNZBK`R-0+9wlQioy6-2Vkn}MVdK>g@#du;(Q=^&*GV>XQyD)N@^U@AV)+7a z^?KS9QfuRMnRo^xIaN3=Td|^??HH)p*?YZ^3e&vt-kB5-j~t^EY6bJK3<*e;thk4x zIJgp|fq2{bHO*DTGnm|^K{p}!6533`ZO%CVyBV^RKc1Z`R?#U^^5A)HZ9`5{%8bTjPW*nDrG>#nyOu!{g(8K0s9JVTh1vaWa2}g)K z5`*2w|6yz+*Yykm($RGDmN&w=v}94fX=m>ZRH2wkUW~QmAS7QZSd8;6b!c zv2ItayOfMw-epT6BOebbxhf88S8XfV4FB|>1@W$E@63z=l?nO<)v8q-ZF-B5gRjYeO7-?qM-f!1d`p6R%H_e-l^Tz$Rh z5R?#;r$_A*ew3aKqy1>^JZNq*GDos#D%W(R7Skr{=e!7;rBXnBi;gLI0|!J94Cfaggg#F33Ia;uA|<7N#xYt zozcZmgMinrWlS#NMsrfEGsboT4|mcm2s(=dThWfpOR6?NXee|)h6#?JnHAI?!e|;d zc`pU4DEJlO(>{9IPXQ^*;#Zi5sdP4>c)&N*s7RTDj}ac+izHC5CEAHDj=2|ekTKh6 
z?y~_f#REbB6gsB~HzsD0VPuwO2%Vu5^#m~iOq@gW2dHwaZJ`e16%uyjO#0xWahl=y zK%}rfvIy|Rxt~@G)3XqpHCP=*bZa6rrV0BNFmZ!k0OfVqh^ZMYq)b>S1Q~tfg|~!4 zjk%laY`CowZTd+3a$d8IAH6VBy-xgw9tVU9^f@%w&`uR5dJ%Z&a76T_y1=tta^=QU5N%kynK{W8}<`+V;G2weesysk8O|U642m}mY zXoTb#(0|vzub`H(Js@_6-mOYj+#rQ5wn>D5!;DgV$z+o%f05`Xt)Z1kiECeKpSqdy zI3=X@F<<7=+D2w;Z`zJgCMPYa;~8;AIKFEB6jfA?41DiI^$EuY5sq&d+I!KxQHLv( ze3nM?#>?`ahddcP1`6T{ilW7*St{ZHS0x$0uo_3Ra0n@7^KqUn=f&7IV{ zuWw`Tk!!gnDs!f76k^ULV9ps>G0w$4Mc={x0|QtwR$*1#)7o2$Rb|vQ3kh(_j+r3j zoSoV|^@-`d^PXIMVR2v0h4NTNNqCP&?iqV`3cgAlaBS zO5yS2!GISQSvZ9q@#5ZludhN5rZNPVv2gPV9#WKPUIN0YvBxCpZ2 z3>M<}5l-6WB)P+Q_x!nWaKRDzjSMYq?kyroFn6q);kB2)q?w8a$g!6c!KF;JPj@g4S#}g%1`622y@jBb-iNUpx8Eg6oTgH*LTXR5-{V!-^Z~b8k4%M6L>z znY<5hTIgjv6Z&CANp%G`Bc6b4Oh?7TJHDYJ@_txr+@>U|#s-SaHSXL%(b6M%X$X5M zlViA6OPj?UTa|(^EWL42*-1DLsr7}tb*Cg4jIO6~+d#oa3N}%&nSw18Y^7iu1@}?# z0D2{Fr>7kh?4)291sf=0H$B}?!5#!DPvP#PR1_2ww%m{0Oy=NjKR>kNksZVKIWaq@ z4WI;keU31b{fGY?e%KbPcO3Qv*}xlwICTd9VUypSryst}A*b4G_kxXUEsc%#(HEZ7 zt2;YZ+G5CTVv)$9IA9mFv!W!8FWh%v9?AO&X4F2Nh8!dyQE)E-X%?dN<38N3W!Oyz zHv202eE*lZHvx?5xYC8Y)!kCJx?8Q*z9F@Q07(de*vx91eaCEGViU_kEkGb7xRSug zBEg9rOAawooCM)4h$QnYCX3~X-&nDe3C{oCvuEbbKivphJw49IC0f8IaS0%z9u z{`tSJ>fYP8TUX%xLndkCqf>XOTXm}H)TvXa&KWKpb2IFwU>#9sQy0!*!GmYQLi;x- z5@3;iJCQ)t*m>gkZp6&#+yeJZLR%p!{V=Qe_o!KtU_k$e3c*@dJxLDIDnBFao`I_S zhu4l&8Tg9%G;s|dK_io6iJ#f7J(Tu#?SA~P8cNCZB$K>{O@Jx30luH2bp?3%(6cnt zH<+D0GSego)jMZtp()bNXH)5p&en|TVTmBtB4Ua09=>S^AuC$KrM1LP?I5UV3U0L- zBB|&tBV)wUco!X0{NxPx7@LS-Gn6lyFGL@?U6YfeFG$lrqS}Rqw5GZTg6f= zh_WQu_J5j$A@~#tQz8iSz6{f%IW#T_+e-!?99sXbeYPC2Zq3%NCD@VZi@6<>5N5Ch z(<VWW2dHhjyik9Eo-$|B1NuL3y-(99udA@g__wo)u&27=OgSMg3TW$_Bboq z05HipP!8#noQtv#kA9l;2nc$>*v{cp*KAe8TSki_GZ&A#BQux&3ei#3Bc(amq?WU? 
z%E22un+{@XZ&y08^$S9o)8q+B6F~Vh8|Y@k{18f){L}ZjAR14mo$(;C=o%&aSVSIY zFd+fFEU^zu8VX0cK_**CO*29&`+1vyCF`g|*)YIBw*=SN^O&0Dqp-SZ zXl`pnM3rFFDY9fU=qBkXBF{UDk=Y95?oO(c1t863?rlbEA)7*!G`UMcXT#Y*IY2VU zd-p*mTL{>3l*G=i9OgBiUzox)wT<@)de~<7g9d1`lB71P3)}02u3QGMK!d@1$TZc?*p`smo{-woq#r*;NvKp7 zP4hz_cj6kF!n(uVsVh!`R&Bumqc%zt;`i^K%%+fRzo(X+ja!|vy?etRD21uXJ0d)h zvRzNKHP$a~5dKQ#rY}{OLXX#Y96?FhPZ?29MAJLk+Sthr&h1@`&d_XxV(}LgfanP$DOg@-mmr(m{>rVbb@4>{7Ue zr2Oap)ij^RA#x;#P|pWwt19mSsI}V;&@DEOHkb>N3sKHHbtH|2Y64<>tg2H_u0HjM zKF(Dhm=B~xL0BB-|u6m;Bq977?E6vba9|7~|ZjR>SxEkjks!vH9X^n@d z3dl*M5eVAc|HsfLpww??$QdFOq%ii(M&qjFWOe}Io0A5<&HT`Kux+$2(e$IwR-c1H}?VbTa% z$$biyjmKX%>s8AFyC?5FX$_E^1N9`!0;>VFO|>d6^3lJs4piXyxD$Slg*K(SF0iWg zMB_==xnl1|9I}?I)3DQHXq07vc`FgOp*c|12J=iR55#6yb#0)2Wk<__Bke6gQH}0Z zJ;sc&ba{#o_7E_zhB$wse1_O(3Am_KP6L~4KB#<- z4-otb*~Pd7*j>3gz! za(Z%m@_O=n3VI5^rL&phev$&PsaVB~AN7iGwc6uN)NE=$U32yXuG~=o zaQ9@wMTQNg4|Xks_+MPrCCT_z7$2~Y9qQ_y*TBY50dyf_ET@{XB!Sum5# z+_O*hJvF#~XzlRKP;vdJ|7ZC>&VR@M=LLUUaMiaf;}u?lv&2E6e=p?=lE-G;%qWT8cY8#V0%onjYqCX(OnHj*`80rbS@upu-Tj`d=gNO;^oP?yi< z5}w*9YFlB^jxQ(w%m%=n)DGN{(`2-`>}*gyL_G~_8sv^h`_Yc5U-kCT0K*4)`I1xl z5=x?c86{Fy61$h+MJCr1+d!}pz+|fY8lNKAPbyO22NiSlrAJZNUNmm64cluGcUUkl zEM+khN{$9wJKwMTS_~g^x~)K#ckIjt6;{xSWhlzRpSmd zLFF7OVN${KtRO)TD_DsvWCgKw5E@C_(1XL9B7xb%dn19m5!=XaYN(dU-r+jjd{Qh1 znxj4)l|;Wgl{IgCwWh^W)U*MjImDxQo4>>=#vA_YOzN#~LXvOt0fH}*B&zZqZ4Q!f zkvX(xICsPqsh&Tw<#Ji1deO*dNRkcevdVt=ctcavei)lB=K8t2th0nCqbArG=)Jw+ zNNXchZQo)YE)X=JN#oTO2DX!RmS(vq-^S}NkpuzRD-1fz%3=HP!AM2jNJgY$7OP(^ zdqg_Bff$U?(g@z6rMyzm(|CwFy&+>ml*UJqxVs{qHjSEc4u!^fQFl}O(UYo{4;xb7 zA8&WUxGQqHcD)C@_k{Qel+h-1SkF2;n?OoM7e$>T2!9ClK-vuqcsI|{f zYtdy=lwrNnKrrV>Y47)Wm0*N*PSWPf8rOvFHN#6sj=XDMD|>+ru{fhPM36M!Z=5fi z84`{BnN1Z^Z8Ev})Ulr_lqMY~8jr?Xj(Zo%_X#GoT-0rNRCxu@yvEwlm=oL%nj{SE zg0Q_{Fnh3RXyv>1dRY%A+D*7kGFSiX#p`#syBd!XYdaK%sZQ^(ZSZ5($unqB69rKU zDV^L;Yy_1{R1+zGK$;M|!Ae))DOqV%k#6X=$A`N{nnxdb*S=n5@C>RddM;#V25@TV zq*4)MrmLZ&`B+0MoLDHIL5A3tKoZ6W3Szb3EBcSBMzLIDG4D39G>aYMd^maYU-b7< 
zsPpKViF{I6&Nc*_h1IF79_EW5kxHWbyhiLl6R>JhM!28YAL5$Hd@;&L2`-bqQ^*KQ z!}g^@F$pmN0s{U)?=4q7Sq2%wxELTE1?}$Il#wHbP1Df^l^UXHFQrahcTr=OiQ=Xv zaQc!mP=12CzsI`s+&21_oG7@-hZ^@U7<3OkHc~Ts=vQ>*Bs*57qq=Si z$AR4aPNRi`xXq+c2sJRA<|z$_rNL@^enq4oy6`Yet3PG)Q;*qupgBvBPs{D^eJb2^TTYJifvUVk1xV2gVnIp?}}fi-_TR(VCM2UYmciZ zJ!lD9p2bN5O1@_hJQv#=w(U^|yy~?p)z+w2_(Y>R`AB=P84OPB;fs(5T*P2rX*rG) zABl|+O^@rCl-Kx}wL$DOMm?gBtC26G))qv}6pRd(OUg5N(DV#Q)E94m@xJoMAno6= z8ENx;+7c-)xp2?EdU%grV z%ii;r>ftq@j;}x8KHU9IJ`b$ABKgyYRtzpw|ZQfq8hq~Y4=eB$FD-`ryLvee}w=Klst`2hW_n z@%-22ZpBAGy!hb{&wcpPGdoW1X;+#Ke)xmF>uk+@P!faqQXuQwY9YEm;RbK zPL#_4@Q)})a5pcA5p3Vz)!Hfsd}F7y`se75rovVl8sMmPPt>|T3Rf_#UCPh-QZ?>Y z=PG}KFUVp4r^Nm{!Jh#{t=pp3jZy1brHYrtci!ZCZ}X;07fj${5i&}1TQF)n*3c4l zFm*VZ+1QCA6Jmcr?!H8GlCL!EUob!FQU@L0^hs1~go!OQm;!J|r=g`I7vCirC;gUZJ1~AWu86Ici`y_bgbB04$Hr@X?jzrv%68_y3o5Z*O~R+g0zb)9cA{>R))I z*BZ$Qz{%kQU@}>ygU1Hu_NKv;(_q>acUi=f6HCSCkaw`xb`36?ss}fOGAnx>kcUgk zFRVDfV%RqP;42SbdU&jO5d)-wexuM-(m&0`=T@#U=P zA>Z^X&dS&VOGaJ{Q3JJ@Lf~udZJo_eb_!`T>ZN4{XNFd$aT6Xh+EjWbSib4U@ArW9 zor9;+PPC2unK7_n${p_8;!G-NH{cEfo_hN{_IrY9&)83;1)aBlF6g?${q#HBH;kZ?GuCH=$`8K-=w zGQmzw{hgo6GFU)zZIbjOixg^OoI{K|kPP}Y_KV3X%TRZdDdYQD`ec>e>DRKyN!GYz zwF~vg!DN-g$?A|kfoRXhPfMQ4?a2*hABN8k|Omw4k;9$2|3<*EFhoL~+ao|-Pe zB!Z17Ol*blN}h&k&Mx1<7L~WhrwP8NiiFSdyq=<-Lfmoa*Nc0Kv7v<=iY;gMFc_8M z#*;2w!_!Z(KAN}G7UwiJCU;WvQH~tffuD^~mXNK2gnZ%+R8`w9#dG3S3p&KL5 zd~|VGRF}L?7JrKOwOB(9@D>3Bwa&(6fsaN$k5Ars_VXWnb>xFD{ZKDq0^dYczj6A; zcVE8o#@QRa=VU4W{wHV9kk{XM;lqnRyz#@AKKi2(1bJyM$nzAFL|TrVTON!nh_oqiOpB!Ik6Mj&+GB;Y5C=vB zeXZec@((u26pO{rY;OX&+AxaDx9H+RsT0u|Ts^5+GeOCW)lPcQei(I8Y_Ljp;=Hsj z_F*)p^tkB4MHkE5hy`G$J6-K*jPi6&$%^ulQQEqpJkBuD|($91C(lULXAdI`)TeKC|V<>7NMyNZX~DhH~N+DKt?y zQ+fza0>DWUJSutPZKHh1O1;MWYj|IHMwGp?f!@opqfeD@OHl2yBGYL4VQPxH5PPMW z_DYAbR*_Cqm3P?QK5>cXSL9pKbU7s0mpf1~3kR3}lTy?)_><`9Xg&f>6xQc-K?bwU z0}#i=QmuPAg)Dg?E^5VHi{JL;L%u#!5a5S7 zVO^-(SYoPm#>EKJb)kOh0>jU0c31%J4%BJa%!?1aSEpiSrLq93DGuoqHtMeag7Cd$ z7_RAee8yu~mt|?W#Q$;)W}1f$K9E3Z{MF&nQV5tz0#N=O 
zbu}y%uiy=_hLiyaE+e^p($Y~0@ue_=6Qc#O)pAIU& zV2+vEYEgSdxh5{F3q{>4Uk5v!&fYXcvbijT^MD~rvthy5J@59sZSotRB17XGDHvKo zA&;f>d3sa(t^M^8f7ZEzfr8#ttj!9ho$DUxMi|bOp^BwHS$BEHn_EX$MrO@&aD}qKYVPYVk~!2Z#uXWKJe7QQ#_G1mR|+-m|s4g zKRcW+PB4$<*Y|p^7Y8mZJ->8l@5qYr1)IVPHoa5+&d#v~4~!M>Fj-?>E3OzXt_>I0 z4(}YXjup=zFJ2KYUNO3JtoS}W9Vw~_%~=^PTG{K16xWOw*M*DgM(RVwb#lRq;#4o~ z^?Z5fpl7gn(9_=#cFwrwD-Jn}CI1sTg1=#F{|aEz7LQ6hlJ&=_;R8Y;W6Cf{w`om`-*Ck1)kjJn8^h+Q9 z^oJk)@Q<#KzVXp(7d|*Q^x>t~r9VXu)$P^xs0}M_Md)hmEuGCr#6U!x+C)Ms98%&O zB~-0E55hyDLUzC&l1PhfBJC#hF(!wnquIuFz3R`hyZ#Qn-=GT6USap=V8*UQhQ*uN z?-}z<>$P4lgf`?`1(=kJW)7VU7uAm!Ee;ng9xGbXyCvc)8a#5vHyh5X(SwRBhxUhy zYH_)+0Xf;D@Nz^uJ2zycruh-H9Y%LZ76&370*4fT+G>= z=JR^Jva`jx9jK#vs7#mYu+%-m6+Crkod#7|5~~vm@PEV0)g_XViGBS)C9OptUd3z$ z>2L5_jNpF}*f77zW3BAwaULgC=2Cu>LiT^<{r^TlipgfB-dHwXnrysd(QJum98R{T zKk*{9Y~F-RQT6@9%0J)>Vk22&cV>Sj3(PTVaK(c%RkC4kNxoKB;wXAQEB{>WKyAzh zoU+*7fxR&Ya5Pk2VJwZ9(~_MRa}i6oxF z)|dl0a`WfJ(ug_vzKdA8Rv0%@u{ukB!MOti2cCWay1?Vml9fm57QfPVsV!`ujZKGi zDF1qNl_2tiM|7zaeCT}9(5FKM3tnp;UH7BbkxxfTE5E<)m2H={eQdL4EI+-TWiKAA z|Jox^&*fDNEe+?*8PA&^&YM4$x3Je8al@Y(Vv0-sV0KRbrh(etG>n9@D)jx*d0?NB zX)}lY=RZBXKU}hq5^+IUD6k;3a9yZieXkdOO0wtS<^EQY^YBxnt46zFNSvMCn>K;E z)Tbwz6<);^wy`zgS zuN?F5>`jI5O*PbbH0AJdoMBPVz&m-~ojy^qJ=AbKL5!uA2OY+8>krWH?us4|naQ82 z9x+ijs6A>b9PdvLrb6Gv$a`kmER>qNbqJwiuI=J|7d2uK{ib@~O??^ihmtw@2a(Y{ z;vcq0PWiBQMhF)OcPi|$Yb#J}dg$|^nZr!sL&HvC^7~F*v*eG*#Ou*TPRy2PP=_q6 zd9%c9X(oQVbdfU~tLLnsYrpUS4-HyQ(8bvk+A!B)v~4#^1&tZZv6&aNWWJs}k*kS@ zEtrllGw~>Y`4DD5>9YmUh&iN|EN-rGs<5Zv=!sa&K1%6-y_R{BZd6hhICZ@67!2+t1v%_`Ms?K6~Q}&wem8dMBOQ zoB%Xxk0G1|oK+G88;%`@Qf8k}Wz{bhUzh`3nt15&aXv+8{+K%O;u-Q$+U_Gqq2j6w zK!W(-tGyq-_U#)#`KlBv=G1@C`?Y&lb|7E%Yud1dThP{a)wk&!d=iC#qT)vX02F_y z*N1Pu2AviZR5#B4{zsqxDI}Cd_47^CQF8)S@{CDsfVw9s`qT#C%u}AAnWG*-i5FYY z;rL`S#9uc&i^q(`M7Ab3YXjHc{Jr@o;*B@HilX0m<;PUSz52NuUq17}ndg9usw>=p zGuSBlFqjpg8L!URdD6H3PXnez1E; zGy7J2BiZz>F5B~c>ese+dyX9p)*e0>oOAENV~3?Hw9EKFw+Bgi9S`7~pj^XZLsC8L=QQmO`s-0Uwj|r8a6}LlLei1O4#X;K(6=tT=XD?mB;%Ao 
zghIRldOI=dz_1OO^hSsh)mKqstQ0*Kf8L-ooEd=oZ%8oi0XGo10EhU~oBF;lNBH>O zKD=&Z!IP_4owV$@Df7PXU4DnfYnqg7>ti?7i}C2Bn`P4u`Mt^6gM zJwt68O?XL{T2nSi)GKlAR+>y6j;hFU*$IkJS~ICxUQ^WvlRJP?iCX-a?ffK4hIX#= zIkMH}jM;#5XBkoaVh-RCDRG9Qr>`fL#uTLIf|!d~TuenO7NADp;em&r-hO&bZ+34m zl9P9CCe|c z-`JniKfOPvcUfP?=~YsuWyC*PWiA*#KKkHQ?;5fF-M`?4M|-W;5Z@2G*Js=M+6H%y zWt3qXyAa#mL#u{z&QI&jxRzHrYz^mC54VkWhU?aZbJs#0TL9bKaKZdu@3pM#{%zr` zit()Ka8~tj`S9+M^0BOiy=jrDZu~_?Ma1KS_$vP8+;Ta5>Ag0L6<^lbgMA14I}mm- zBY^(u%;-G^&+)!K-=J-{ve);nb8c*gC8HoV)3B@U!cbLO5OuRzbQp8P;D^x8chmk& zW4qe%#<}A62|cV1u}= zngObz;iZ$FarZr8_dTPo%Zta{n?v@^Vm$x0!^1ORadaLl%IRK4^2JWtk#^-|+-K5c zB3NbpIpL{B(lx9;K^m)Hfg6agj>Y5BzNK3AcZc1(>C(-X!V_$|A*8$IXp^d|fX_;3 z5f3+T>jUO5K%+id%D|w$y{!#rSet{BlvMbn<<_GBce9sp+e_Q(HR~zjzzxF2Vc(@u ziDu9v7K zhUUL(pDC+^NAw?NRJp!~S;bu&2AYo6r28wPIxcRw3txaD%&{nzn0J*GtPMdGYeSLV z;I&NlZ2I&ZbI%Fc=g8`?3fj*O1TR}|-Q5;~c&~nC$|~Um?17dJy10?U3Cv#fPY&k< zW`8ehNKnffN}F$;*~=R8`0q%?$YG<$Qu0gf#~SkgTYCWR9`Dln;goeOB(p!DG#)*u zRXs2Rc!Mn?|w9$N0SGL zwJA@uE3I&JL$5dPV~UgykuV?5kD&M5Jtl^|LtSSvB_m6ax{fIr#6s5E#+q(t_3=NK zh;mNa!OH~eNq}R^=|AS{~R-6;HRLmQ%*u3qtk5XX$8a{r6iD zj0N*a7KzD_#EmCd8B>vXPS`zX%sn?`pDP)#>f;)wgFIsz&7mRh!RE%GbcIQ*;qZSv zGp@Eaa)6{$&jw@Vt|K@(VhHc09yOC?UKXg58{nKo;4n&94`Yjj*28j#L}bIP%qq7m zunv5xRdvxNj@mXHqd~Gxh+~1da}CMEaSvAUXW+`qkVpti7LO`nj!75di*(Ebh%*5k%r^Z{3!?~Eh=PH5tmHr_T{#86O^v{Pd}OR4P3&mPTFbg zfF9Cs#}O^obvJfzVXLgOFf&_avmId5{S55%V86!>%jOj9PC6O$KnqSRd!0NH&b`HWr{iuTM=MB&$T#fm{HbI`27FQ)p)hFffo7Qy zl9w@?!=~9tTVY{7pHvnq^x|0yZ61n;{$@G=Z#!^J-_YvBoyv`B{TR^qc)X5N^b)ns zpKMqezFmu`e+(xW*mp7@RF;N85dWwcu~QDVUpu>mPONUy$}rQa4@lN|=a$F!1okz< zyC~vO9X;B5l0FjEH|qjxoA)<%wIZf*d!RlrH_&lR>8#SIXXp`4ja?m$t;-OgxxKZ{ z7}8keh^WN8jR?u=VLj~cc7WrF4kaTg#rYYwfRlQ|oqA@&aYIS8I*aYygbtq2tMxqf zP9^q+7tEK9v0dy79*~3i+Rvcudk=UrQ_GDrG;GjeA4&8?9B&qf9iJfc;;{(H;u@LU zN5Gj+2@-TLQz{SP;zDIOD7X2c=DhgFPPBA#*Gha%If#r-kUa*G4|atXTfhy|N+Jb? 
z7t+qB4W*oS58J~9b9%k+=a-y2IdJk^&p^+3{?c$h92y`d0lXZ+pBuaxAQ}PO8UVuE zTN$q0%5@i{hV41n7%GM_oX6Mi4!Mg%c4>bJYeHD+rXFi(gFqCuAHx`pX7ePESees* zj>dWgTab%vizZCLU-{m%82-w%#Fv!vw7atC5v*gqU#dcR!PDNSZ~kRF3jFg72{D z?!&W~*$Px4UxtvV4%@4RUn=;VK%1bI;8|J5D1@6Tc-qUA5g*w$s+;+!TX0R`$GhWq zV!J`k{*{yINhXM8`eE#IVvkBlS$1VDX?BdH*$qKWQqPWSoOI+`nDT)_@JFN@&eoA` zi6N#|gxxE+^YGYE)$q1tkswDJK0+vJ;*L9f;UO4ncR>UE4VvAF+HtJ8UcmXn=koEk zwia{3zJ#)%6Gpyo_>eKUy96xVXkx*81eegeyZahhRzf7HLOv;)~H@f za5#=?N_B57kUKZ*_tb`0d5ZF{NW<~=Q>1ba!6|}vP6zBG%4eATEWv4lrwKksum+z~ zK2Pi!0?{~AINpxtO(K+Rqc7mi_t-`>YC{`Uz!+fMR~Gh_4SC0Wbs=Y+T&KZN3MeK_ zFX33{NQ0PIgn|XG9nlU`-iGlD?KL~*CwNKYP{o%mx1IzLXO2{&AgTZ2y&dL5)}Uw|)RD$cTCOl<1g)Zh5UtvJo^$&4pxw?Pk^s|Jq^E*>tt z>cgRYRo8)c^`BZR%Pp!s&smx+k6MG4N5!GSXY7yKg0^QYkJ?XKs~yp-H9H<$yM6oC z4ePdT+bFg%l~UYf{U zvci(4xplumF&%X_h>%td4c*0)i&dxH{8y5S#+2~LGJRIB{R|G|W#dn(`12*{Y$%}) zEWfD3GeD*~)b{kkyS3IjB#Dy%wQ)|hw&vu^p}c@nB}<3!aEK!fU`xTaHW5mY4w6-+ zm~s{`7%7(403B&72v%fDFmcb~#yhi78qmw_qDBuxtya4 z!aIl%W$KcYdk*?ILI!sHj|^eW4q%*9v>oeHDG(9(MAA>Y&ozEeN`sy!aQ*2{T?8;Y zZfR!gZFa0u*pjOGBX>Q|9%vM#wCoa1EcvSN<0L|LC+d5YK@>}j3UJ_Wp)$(XwF?Ov zo@i!rM^V0s2R)?t2E2y3%W3hIk7ZPx-ViA+zp(uL@~^L?-=+=EZ0bF6=H&Z0mC-WL z!jl=}dCS9j%ONVm4GN^b{8T*i0PNE|*{8S5+0G*dqSg_0y5TdrWk~=G&DbtxLfir9 zV9azl@a>vXWcfBYshKus!1@DDcRD@e4Q#NdwHdrqV2CCEK5-k`@Fg>AeDl>Y=-+In9;2NwLY`xHSzHKaX&T!|*-tk47 z!izRRSaqc0%w*h`jaQY5rR)=ROKbIp&XZXDdbOy>LVSv*Q65G3;sp7KjgJ~wZoL6u z>}J~V7OoEXMAA>YFNS}rVfd&1CWb#P$hc(IVn+3(8rVSUufqnSWr@Q(E@^7>plJUz z$s#VYiN8(o9fI!?#MuVNDA$4biPQs=xF#- zJOm?n3_KwAz+VP@Z|D-K8P~!|Tlo1g=U9fa<>|%Lk?*H!1~(XRHU%|ot2BH|o&EI3 zf%F)CLz%d1GqjsAQX;ilNxws8bfI*X^b>TU_jI+F2)b~&Lk^s#)RTj9g(C#>?aF+n zk25_6fQ6@E$X9vcJD{Q=TDvfq*SWH?fG-Nh-OY!^!F(65Ed;=7>6^TIKS5H!c9(Pq z0)>;Zh|ezuFj+Gds9awi`eFqoSfuj%7^aIjO#e4>$1t5LXuCYZ@S=Ap_vPY?#lyQU zmW4Cto!%6I;JdkR^Pp>J(aXy(E+1Ad-WSSWFy>i!dfg|B^PX|m1v_s~AL6~w2|MQu z?}zi;2QRO=>fC4^@qNfOTtE03V{rGY*^ivw`WFS|QwDZ-J-6xUO>({wZk891EE=tS z$92`dYjTM9s9Qv?(6kdW(S0=P(XfWb;GqN`Y21|p4YlRAq0gf!l7D6riy40b^SA06 
zQ&Uc1h}x;kgvL-QLdeccY<4(w&^SH0x|r#MD5AoFsC=K`2LxPu1?n2>CL=rL>IWsV+@dG~wht(OW^Dfx6JM}Go&4P3!)D(1 z)bHY`s9T$T%{N#lZv*2@%?PQ-swz}7OBX<^i3$0P)=zRD;#EHP2LwL?i26H{x0KDJ z24aps+QmZ#j4BcjIUV1LR$fPThuL!L@h!Amk)^Qog6q8Ng6F(vykL2_08%^j+E8jE z@B$G0gLXR~%_&sgP=Dv&%J&qehU~eqB8z8Q$X+4@dUM(834G)?(H@#KWNyzUI(?B& ziAli?;MD(1r1Y1`DpwM_m*7PvN%Lf<{>qQp5CpxX9CeUFl95(GlBLYXS@KDNE8K?Y znHh;)Tb@{JYCMWy+L|GNz7r3RnhlLzPbR29xr9IoEVn)b@SC8iIBbkQt_CS~N%2MV zboxCXK8qhIu50x;$4%a_DOM@3`T|uq$W-s(8S;Eyoa%z}h>kDD!Trcm4uN&V-XP`0 zzTKO|enRk5fGNWt{)|r$yu_y3i9FC$fduv>V~n6z%K#(j*y!G}p95#Sk^W<0Mg!OW zVk-_ib6BBxe^r*L`Q6TzK2{r!Jh^`%tlkhkva_wb45}NsO(kL`(Ftq912}%z(6K*i z-v^jrBFB9M6xwp@G{89v4n~`rF<`lCdi))ZpsixLYc=C|vL(6b4I128nk|f5tS?x1 zTP|TyDb|x15Rx-pvPWHQ%_ka~j~wkhY1n>UPo1N~5QY0KlK@FL3F9^FAjI0_tHd=Yvez|$fy)|UtDzzy2Dk(1D5?T(ZY&pp22iR{;{$R$9N2`*dR6gCb(wpl~#a9>*IKsNyY7bznhGOJ@FH zTF6%xa;hHVbCZg`YG22;#wWYWZ&N$@$gfu!<-N&eFO_|Ma@iaAb-+SP`AfX=FYNRY zAd)#aM740RI~8__2ixIK4Y$zL@8eH24e!wwUss89d9TkxSMbPGdxqb`gZu@pR75E5 zPCP4rai?0JqvF-bhqWhkFM>fk0~#QYdyq$|ArGoVZu{ofEy2`p>iOVHrY>EO3u}|~ z6JIlySogvQ6^d?HXr{6kGp~sX0W+^nRW#qW)Ilc)9p_-f)_{rn!Mxl-*EdpLu%2S% zL&||kKdnXXL^+n=PS2oS@Dzeu*gNSiL;QN|aPYz*h##C%#IHMoZvA>1+)<@dS0RkLX7! z!3@!(Yj2;^p^~Jmkvvp3`I=Cr*|1`?%X_=Qwt~L@-(M*IBwsL$qs*R^ucrk4#*tE; zB{vJYr$5p7MBoU{83ztF9*3o2XMmfdov683*`@IBR_(0FTwOa174ghEKuK4SeO+1-2n@W}t&Hq))bY zDFJzc8WG+Dtu6bMM&%@*Z#g1&7B%xqm`pJ8wv03!pRei!dsO}}Ftu)kE)h;60+N3Q zx=Y~2N z_7fdZds90+5O~@ekW!6HtmH|(b_eIj2&mDjIc<;W? 
zclWP;`m=D5mR8LB#TyaPH1Ch|LOXW-Y0*2L(4GfFg%6EoJREX9%vYX#`pN$Kb4v!6 zsE-$}9?Mu0a;~|KJ*UHchv5dmml|=UpUv;f@6Yb9=q-sMynQNM2A?}VaC~szi`_#z z2R=K&Ywy=Bc;)b=!f!jTiQx-#@hL>xaVrDl9Z+ zUYa?Q^}U+WlyJ?m-pbQ!-_I@{Tst&1p6J@o86D_*PpM}ERX-MP-ptvK4X~aRoxDx1A8zY=O)$QXwDJ$s0`;# zsX>Q6)29l|b%!k*&h&a|&Ef!=QzIc>QT%&LR7LKa#IDttCnA0_4dZMHX0bA zU;22FDi3k$&^l7jgYfTo#@r8v><>;}=5myR8ox;o4>+gB9XN41Vn?%rlNm<>8=-PQ zXzgG?_^RfmjzCujg^9;_s|lw!K3#RNv-4=jvbl2`l_y({*R?AL=0Z%EJAYyQ{JMEd zmMoYr-RL3UcUyC7M_pihduKC*BZy*J7*|M_b!l#8sSHY3fosS|G6)SdH>PNw8fAP1tSeYJsSv7d{{@DFXDCBaeZ^ z3)7)zd(foPZ~_p(!MgT9$H^m*k(8FE+Q5l}Elmd@F~NbjsE;UJ=Rrtd`@7nj7#)hz zn0SllQCJ1agjKrV@4$+c_rmcZ;&Q^*V7!#-36*6yLDzUB@Z_=pM6)i`qMg#1UYF(| zUKLiz;=Ah8>ZN4?7}>S#KdHXj(iSj0DUSqc**5McDfMg{S#FWzWOa8IDZ@6GuFKgS zE~Us_HQE&$^(o2($1cGJj$JMq#1@QH_en#xny|ZOxPBOCQ5T2oi$$LoZ*=F;2zc_h z^aVUJ6N5nvh=oXC0dMWdy#r&md=CtZKP*qQGZ?~NTDh52_-Fy_29s+l=4a(&Jo+it zhd zM{`xTh&?z-)7q{yHqhCWdDzw!zh)J=A2T@|t6I&xN!1hv_ge7Pe?onfeW zkHRa4h}2Ku>YZBH&{xUYlYTvVU!`>)L<1C#KXKO53TsWP*&ezkGC|zIS_Uc*17jYp0~9U_RRlEcnK>lH}vUf@v&robl~gtNwI+I~3$uhiCXj-vxuQ<@C z+_&!d0v?0?u3XT-lV#!g!lC(JpEkUD)PB{s^68DI*Y(y%q|ETeBZJN1jOnmA_2$r_ z;$X#vs`FJt3x-w?&KdJoonDW%X1b^MKwrt})$hA9&KCC-f8{|DS0U@ghN0l_nlabB zao2*dYr$*lUf=xc<}ufOr&mYZ-rnQA%dm$nhqH((E_ys-Q}EoJFpxp8aUPwcl?+2x z0F{?B_y}W=%XGqlWONS6(72;~r|xcKXma=Ud#Vm{5?d>k;f75j)XQ=2Qc^)&C5^yI z;38;1A(V7tVsCprllKFdg$XExYdrY&iHGMEC+%2y#fuz1!*+wd*8=XCC!&0d=>Bmz^$zSpOD}U5WYi6BKMt;_V_DneK8mpcCpdqP ztTCcEH$+QBLqI%1{G|q|jX%Nwff1g8_Ns8PZh622%}52eGqI6RZGeUG&LPd=1kKhfCI+3h!GRmV#|Vx_p0 z!$AyRwiwR{u%Ubv*(zTnI7d*74ok}CgM)Qz6}KzQP}OG zum&cNmQMc3u`Pl*;|u)JU|UI#yIIyIrrpw_f*yFCje=0^M?o}46WsiDBDRMDB`00$ zhyo#WNGg?}&@58(9(5eRP&%qi!#5H)yOc84j^K6Ht^x%B$A!`L15ZEDzkYDRVApW) zUHgK+Nb|k#EB@kVLe6Q|oE|xS8}~6t6;bEKKt9>7KqPQcb5b=z7I(ahj$U~y?D4|k`>Ro2_s`Fsgt*9D!13u|8 zIQOY<9Oyh4P1o%&I}%P(>+)>`C4J5>w$xC`9^l@94nB` z7O_p1bdO2H+bCi3&v4h|%llLGye)?~@g@DV4z7OR)S$)i1+-)GPi&SHLp7u;sNzpR z+KVT2%pen4&D2clre!tW*0-_rmo4!P%wP(?ty7Mh-bp_pfvUXkl(o%|9C>~{>4%7f 
z#(eHfC0Ed4c*9|)!qVt!7FbNX_dLtPL6!6irgEr=$cmzd%H~c^YvR}I-;YQY&p7-1 zYb{zn8a*{6uUn8LG7 zP%z4`JkmK`d%AOmhQ*^^^!d3O)Hc`<0|RR;5xEv|ZBHS3ZI4&x(_uZ6fl_2J2zfWM zK^)6WSk`(4El~dWohNPlugHQZ7Pl>PH%1G~+hlLDHKicN>%E9E!IE2UL-o`<)kfP) zMNM|9>l#Mrjn@M4M|wOg^`TiBHR6A*} zxea(6A5kt7WTT7hQP@us#0Rph1^E9~9l%$uTQJ9x#d0No3U&c~DM1T~#R2s->p3ez zfj}NRY5jrig!NL2$Zor|9gI__DC#jX{tnK4fxu}gT)9>pXhwXyqsqPAWgYtt z967Frv5Q9o&>w&=bIYw_y!=leS^nv?B~-HU$16q}2X_wD4j;QX_uXmrKVI?Aw^;no zXXRSOR%>;Rv;h{2G9Ld`Xj~`H0Q?>?;e=l-!=7c58fX}iVjHpJLY7UrfM{Q_f85urJ(kuwORk(lcM6u*#8clh!MX5&y<}&O3lB^yhEtOSx z{J(Kd--NP*bg@bc9Ngd2cMoDBoEkXw-R76uF1Eea{Q8kskA(8phTUsJ_O&7R+K4;r z?8?5C?>oI`y?tK($>__7Ic(EXCoDGl3Af?=n84DR-+g`eU3Hhhcsb-QxrY746HlM` z%8YZf2WDSMn?{HFYx~yr=hNN(n5PWx@zXM%>v_7T{}{I6BECEkvtiX2KP%#khMWwy z9?2>^H+Nv}P-Fkxv8<}Tv}@{9$FBGSA!i`u3!s4G?m*ZbxazJ5*(*Zs3dWDOU(rjE zJ###}CY)U}mR&oZy&#;uU@UtP_7#y4!$0hVzkt&lxv`qrU;nvN-^dfTlKp6^9wW!Qb} z-}n5=_v`9kn=zX9qq@trp&bpO#`gE>j)pU9V84IQ;JTrEBH2aQL0uUsEE-%iG-EjJ z;+#<7y#CdZqVl0x;i8)UHIba${$@Nm>-@H{oSI=}WX-SB>C`FD}$cj`uJ-z&XuAe|pIe6ap}&V|zRrNh>-!kXky zHomxaaM#fEmuFm@@ts-c_l}p$50}gzsUIs@6wX=nm9_Y4+5J`wM1ECDw#NN7U7Dv5 zIkjkJE@l{}=sSru$UXUI5PI$;R_QiL@jSF?Qx#53gT5p^V>MN7N&78|LR@$z{iYO! 
zOhhpgQOHPXHRmKz=*~4W7pO!qNC+?vGXYMkx7j47-GgY9F2y?tKf3jh5Pni`Bm9W( z2GexmN0!52OLoY_>PUz_oCcD9T3NISAEB(VUV;i!3P18IyW*l*r*2<#8{r2_v?Sq2 zeY@;o^D(G%8*=(uIi0_;}-&&14I75B;ah z7V~lGlWEN+$B9uD&H>GrfnW~^mPONgxRPye?>s1QC_9BL&x5~Fr_h;0yDqn4SST_d zDQK6QW~E&es{W05V-0&b*G+V>k}>X9Z826{E_eqahjxYRyM#0q&D2~jKo#DG(8L`G zHQXg!g$5PupEn4dSVz=nNQtMsSR?RsA|TXgP{p;kb#=(6#eEli!W@7z`ryfgLxnm~ ze*&gIm2rC91WLGX07;U6Fkf<0$g+wN6I526c1%!=O0cJAxDQqNE~UkIeVXlgI})2Gyxu zT-l46?#2(#+<12Q!yoirfAjV0Z@d5#mqQ581Y--e2Q3vEK5P&N~6A=pZ=jQ|1ye8_2;AdHjN ze{sM9f#}j;af6QiWYi_q%|z1>)>&N3+yxfIa`Vp&`)3Xv!;Y;b?t`_d-%eYopaiQdV$!a2Pe*Ru--i{8tw zL?Afdw7~;cd^2SvWUL!ky;L%=Dm43E7Du((l0(HvisG zW?(3Pc=}LDII|8Ha>|Bk$Fl1pWzz>X#_}yWvu~DIG7CZ_wV}E9jn7>hp1bz)&at^0 z#!5CsmTh`xR(RR2kf&tu;n2**p=lxl`2tJM40v+&KwY5vTUFZuR#^LBT&=c)Od)Vq zQ#)*zu)8Vu1jrY?U2QG23qlAz$;7nfdL7|Zazj8t6 znaBsdFss%0AfhU(x?3M26K4oMm( zY~XXbZQzKf=~pFHTb^Op#D@69Icl8bTyV8`Di3?eGKc)0T$FY#m;hoQ_vF$-q$i*8 zlocmpD1&$QWT_l*eK75mAG^=uy@Wf|rLu%yppI7MqG#YAMo>gNnW^S+PUWIb)$Co@5%hq>+~V-qSz!2%1C zS>UI6|8WAr2OEezK_EC{BQdhYD9%b_y9_(`4G2NdaI_WEeZ6G3s5Sy`5Ijim5W&L) zj}UwcAev!ddSIE+^myf>Zo}Av1%YvtaWziS$LuG?Ap={4)m9KO_|(Zpvc-e=_(`(G z&rQV^i)zM;=7x*rj#$Tv=7;S0W_&WI@Z6SxEko(A&3L`$)tbu>UCr6r=em|N?cCOZ ztwVWZIaR$b1D|x~^gn#X4Gm-t7}wq6ljZY9io)e9`ZEwvKBpp5UOBM&TJg+K@%&Iu zb7*1n7+~@INI7`uXlZE9hH&}D{>}J&+3ewEBcBP)UKcK1pTuKxN{6gpzIW)Ukw?R` z?z_D8%FG?t7Oe^8R1Q5n+%&ZJO3r+|9;oc!7+JLOoucrfJqfqwk_9i+Sn%qxlGRi2 z-7Qu!-tATc=f%D;Ed`}HlcnT->>P{vTnE3rAGhT-swJwe*j5*=cwktt9Vkt@kqQ66 zN?_0W_3L5$Eq{1I?o->DMsVWo+t+VbdO7#owr}54ZEvxlNAQ+ad;r;SyeSWf@3i2| zK0db|F7{ZM?dx%yi(;82zaf+28Ue=KRJn1e_rh z?#nBbOvP$Dnl0VYx3$%2Mz)Qu(Ui8fXnH)YTt&g6ZrZQZA)NPtW<{Y6E}9{(z!J7o zSTZYA-9=4&@p@MX`CJphVFL2hsH3wBQmt}|7rsdFBEcZRHwZ2gyiD*4!5ai`6Z|Q`pAnoS zh!Fe@!QT?Z2>zAeuUV%Di2WOZ4ZT?L07O%FpFFauty9V36$*UHEP~kt%LvvGY$U+= zLmmX5B4{KyK)@w{q7a-Q=q7lY;0pwO1ZN1oLhus7Hwi8f{3im=&B|*8JTIoaN$_)m zKO?w8@Gik$68wVTe-PXt_>kb208zWx?D{*VV+1z|J|_4Ff`20T-vs|mkdEH3HK@~v_K|R4e1WO54 z5Zp^}Kfzi85r&W9=|uE45y4Ev`VzsdL#E6jK}fC 
zv37B~P@MRZXJ2SPFHFuAnr#akWno$>>=cEen=qA<7Es*X5qpY4fu~TvAr?Fer}wDu zM04X|_@COZTKFWHh((<%q+8y5mCxaEw4|lui|Z|xn9XXn{-e`kU1uG)to<8{=YLqr zLYA^$Sf>BNGW!>nSsz+zLYA6eSQh`nQt%5);4dxih(CYaUl;b*ozA$%XEVnwWnoL% zRZIC_*)nN#Ru)UOIaXPJfdJR(R_nC>X1Y<0S%408z1(xLN1&tjTf7o;ev&6D1{Yme zaef6~tQcPR%GOI;1vqX=YbObr$Pt3z7W}K@XsFhDFdU?e~`S8eE^$9zkSVs{k zW&t|%;LCe2?iJ`r@D{Jc>MS|Ak;)m7>6MX+%1Cw1%~{n}9PrFIThfQ4oxralCN!Kb z#~i>RZD{e;nZjYtd~STkTuh}~vh!kYV%Q1v$GpTc5P@Bk&&L#8Qs$g(?Q4zsnaZ;G zb7I-Vax7W7v0P$#>hnW6;j9_4d|oQB&;a-~Eyg@>(7>N} z&NJYNr7`8yis51kTi$*_EjLphjapu&GBj%Wn93xz{KT?It!!dB8ntqn%G0Qo&r|`a zRY2L2M?eHH%mksZ~v^hSZu(td`Uw zG0;swt+~WDr&wz0BBd4MrT1OIKJtT?ADpP3Z6FtLom{{TU$&d?fE3*gLc~oLdzu%mJ^H9 zj49Ae_P!ZR&9q>>fuT868P1*=o5f32mdxxJNdW?aShI=kwWb?L16&_Xz(E>yJfty& zBg+1mix@_d95rsHJla!UrZSkfk64vu*6f?SMxT&ygdVd2zlJx-F&um%-cy-!szuhm zq4r|+8!|Q68P1#@OZrfaC8yw~f01>SHG=Z?^!3DSOk9JDN)7`D6ZjSa?U0kEiDdj^ zA}M);Kt1gEUV(x;9I_!xUlv7)|gi$Gb}}A{ASE2lCswcG>c`;7AX2% zUMyFjc`UXl0eajWi(M$*>9M-3%XCTt*ZTx;6gS;R-aBXyyGmkdyae|+*)bQfbgc>9 zOnEFBSurm$*(r$SrJ>Hlj#oUFJTVLL(FcFF_s3xQ#3L0o!;4;7acM=k0&|VHvTmb9 zuGQ9yiOa#O*Vb%^2?V8?RTE3&HI!n}ve&o1x;5rflk2UUt+!+%Gaa~IB_{MbF~Le@ z$%!iLDvP1F4VJ|TFNEv%WyJ`T{K~G%yn2*TBpi_=``0nWYiKbetP=5yJ9-1rE%93Obgl#Z8n_4L00(duNJ<51OgSyZfeW?g zYhx~^(wT*uSiosuT)_1#fE%)4DxL*XG7DmPnbvf@6u|W?fE%)4DxL*XG7Dn!EHkPj zl~p&rnbx9+D(KjNUsGAC130omgQO7?0#ETkam>Y(){KLD!Wrc;H!o>TDK9Zs&^?ha zgxoX2j+wW7yjPQ9eZ(43g$o-K`iGgomsN2hO(em)@?$Q6rd!gzF}Fa`#NL=!p#0Wz zd!OEW%O{eRmO1k%c|Ks(WI*DQ1;CL9@XNVxm%LRX@zBG2#~*GAKinjfk(t#TI}cbR z3zu@-a{xWS-nn$;O`F5I+X@fFLfj-iI^$=xKZX=aJhbD%@f~}^JHT{Aa{uA+{YrR0 z*33+bYz4AvJ->ZRtg1V7W zt-LmWvHZJXi}(w|fl}aGD}j}iQ#!xJ)I^TO&bb9=7sUKheAPDNuN+zZ_l-HG$5K3w zbiEG%zlIr}g0llSvMHD*K9|OnQ%kv+O4mBPn<hQ*s1^AW==oEh^ILlB?LST?a7txx45 zH4#W4V5qR}UE5qxFqB<8=9m*paXYqIbs93E6Ojp&P7|F)(y7TgE|H8+10o4QVOq>9 z&~{-NSwTF zXXF^m&4fCC4OWKTr7;Jupz!LnlqQl+^{Y~3a*1nk!A&GRs!TMrC!ALu^NMSlSUY47 z`zvEUaSc9Ia)qgDFzl;{`Ng#?wdo`$$rjggG&Y?pl6e+%jeLP(bs$MlD3TC$WwKZ# 
zOSI%Pkt~JS6e|;Gz*0CZRxZ#AOHpZTxHvOCEdoZY-_X~X(poEq5$nfDP)9RHtY6HL6uE9<)cRRP!3+EoC8CDY z#~hV-q^dSjH$O6`9xn|?Fj=U5Z(?@Zvc+RQJK_%3D(fr`!n!)LU7R%OgB zu6gJb8V;d{I>QCCV_tC$Y(ub`pT?=K{1Nw#9ZPUCMF@j-gGm`Z`8^xCm_sV^i_>Hs=-}A&KcD=vojp) z;`C#}KtGXmsn-*T@JN5FBiB$2<(lN&}Uf68a zI5_dkGv2O!tCsITV#YhmTy6^)WlRK^z4~H_NuH6Aq7dN)}Y{dNTyrLXTI#Y z=!t>pA&JgVKsf~{R~$mI24MwBZ91aFf@FMp70LLpVUi`CDGN4Hn%v}9jZL1C?${MC zJQI51nLy#S?ktk(@{o7ZSyH#SiU~rN)GLy4xj-Z{wJzXiQYE6eQ(}eeI=X;*SqQxg zshEWaY>3PP4dUIg_K8|mc2}6s8qk@W2(es_36SI zaQzd&@d>F!ils3X*Gw=KAG%Bl)fhTF8Vsqpn9o$H)#{g3^UIoHw!@{%*{rSqog7xy zkMUbRBa)SO(>L2tAmDm|fTKX#pO>hOSe;T;q(@e!b%~br;OvU>EQ<|J0^(8)D%1kI5zluJr zmMEL4B~~t3I!K@0*2|qF5GFRK!zsVlg=?(MB_ zZ4E8n5uUg67H`C!u!6ideYFPK0@oLFzzr2-DqcaRL_{xYw%ts{8=I+&gxc^E2Kq9E z{lU2lZsyjZFQfH@LISudcuzQhkW6@)g0Pw!^AXEL zJirM*u`FWQ#1bTqk6D1&QBf9Oj$W1t8*sfWzzt<#O6n|#!Tq9p!cELW%u7t0)P;ET zF|(bh^H>XIlT28E%7y30L}R4^d{zoB6E@)bXMr0&%akU9PPmu?35qA&#ISycN{v_s z3*jTis(UHwUH3TSu+T~*h zZ%=p<=D~1b?N!?xQD!M$W8trt891eDp+={AKr8cvs%C(qSyeGe;pP@q3@jz;&gVf$PKuuF{&ABsDQfX<~6gGgSj{S2BV@Lw<3jIDln!q+r_38MDwIHJ%S#<@x== za9UB!feU(-V`)sGq(Xl&h@h@zF&8gsJu~KJ3R(fNHzLJcFE450%i!9uHxTpjQl=^| z%JsXSm$DN0Xf`it<0+QQ6gH5soF-PlPFhH;h*&YP5@OScm0B`$Vr9gzRa6-J|6042 z=C+PAi`Mle2!Id>0t7)4Bt?QxiJ~YH;zOh*C6dN^*s&Z@nK*G}lUbxDRhfnDs+oyv zrm~B7)DGIoJ8ETW6>4R7$y;qDqRjEvsN;szh-0s`A zAK!Prufa*8DSFIxqUqE?Mf0xVI@>*`J4f~tm0Db;+lc?`b0>u&`UN)a{ZH@f0Qk7C zqC>o*JK+8O^6@XB84N8*nnk^o5q|#o=ZIxh<|V_TEaMcTPe!pY4quQQi~5pZ)Oi*e z$rnEdiH z(|^`%9>r*5V|0hf)}RMR!xxx4FS|CRPK-4NVjz)O_I>B-42z%z+w0`W=Gh_G$xGO3 zqG8LODDD9=zk})wnO#hXyx;?^2h3Z7P{>ENwxra$ zS6Bh}Z&QQq!ctkMb+^6eLOg&6gJ0BM)cDr+DUb5W_^V*vtUNPEA*^&2MsW|o{h8k^ zK8I!t`0~o{w!hql9Te8NxXhW4vYsU`90SHxF2z+Y>1;#c$hFB4kbwB&%I6KK&}9Rj1fy}?4}MF7c=WmCDcI!DAM>>IA1 z=X>8hlbV`^w9kJt=i5`WC;3e^963+G$F|XCiI5NlsI@xU+)DP9_n`x{$R;CnaDAXwWjRZlVvbuAM7fn6g6eR2);+a1eFifI;Mt|2H) z#DSW;SsU>>#NVt58{Hf~@e766Of+hN-RAG(L_XNe2+nIa=YvMQIUh7aaLEUa5PRbn zpt+WHk%%&ic7Y-^&IO9F68a#Ko0t6TH9iC^EtQdzOkG`wx*s^%{W?TJG{=l7FKHIR 
z3oL7X^IL9z|DRfOR}XVnTj5qW$B%ta1?rZHI>C_1Y3|Y}U08JBa>Ic)zx&0HU;Oy@ zjn}3=tV_>|b7~0>X@MG*wuLn+C;(Bz2 z>(K?Sn_D&JL=KjDvW$$~Bt zS-GDmUWLe(GYnAq#*;+Vlc0V-s0@WnQ(&YT?_^mY!qkA?#A2)(6n>%&Q4|dI&SJM{ zXqAgwK-X-0KQN0LHd0s=Ti4rHuR(w9BrFOEu(L)ph`X$J+Qk_Y)`b^mM>yu}2;?~7 z&tUF;)ao4-XGbK9oE>3fnMSP_XGdVDJM$rtVckln2{~7sRbl73ILm_B zC~W;W`b?ZvJDey zWc&nLM=WyyN23H74+V}9#S17YNza$ed%glCg`O|nFCy>tYF-%w)oqPMG6)`vJ~0k( zZ}9WlGqe9K>fWr{a$c|?EWdS(L5PU_7TUm>N@T1lB&rBfZMIOqeZYcH{1)nW=^w~E z*y4ho?IIArB_jpQ!I3c?;Ptq_Bq3gLN!$C9CX44D8AiVez(|fPzATx&EYb8RE^A#D z34q-eC?PN|&$vDjE?t%k5rmME6&WOAKtqOzszk#?BS3nTsK%9TjA+~sm8(t~a zacgWakc@TQbvwLLvyNNgRN$GKwcP1)@J>OQF3^Q{3OJn_9_bXP(JwGC5oVPD__#7G zAzsZOlQdB;Y!t~5VSrU6M}$Wau_*SI$SQv486xxXM;Rs?aXXKUlKCt}86&d^ zibUu5R40k1u)s^52tPs8uV;v6>5a`1&C~N*AUaQUfoPHFBGD2pgiA!r^u{g|tfDMfUz=kpaEE5KvLkw9a4V=Q_D|G`g%xf}Z;4GGRnKN*n`3wsN zo_7vm=K?8x2E=^`Wf{SiR%zuZT-9f4P&%warUPK7bXe;)L;*U^O|&$bk6`HxneR61 z=;v7E6GWRPPM&AcsnN`$et*#-nGcof5}9`^Wst1mA|f(GR`u)AFd6<_Z%vO9;Sg&{ z#)!s|G$0d1=Q!#niKh4r>O|ArVrGbDiROsri57^MM5!+jEi!y|k!Xn__e(^}9I=;) zR(#K{g0W*8rXTgS5|##=mMsCWGkbIhoLCjed|I|-$b3KQ9GQ*!5P8)^7s!yIVgoBh zGRWoGZi#5I54noITHvT!;6$}#akXS|wPY^X1Qb`x^0-V3-xizMVb zuhk3+H^C%OKrG|C@uqGlGIH#0hu{g`jl-UXLe8vqi-aUf~C`!Vy1kP;vv zOHhxW6NdKk5*b42S2zaI{Ky^zXiZ$ERd?G3mucGFR)-J&k%k>Y*u8;TiTaNik$R-L z(xh8yZ(gHfKvQnzILuJBTtQQJ)ceL1M916BjUNC()UfuM`83Gd7B2zt&I9|ly8!yN z1{b%(;p67K9j0#VxMAvytu%*LJYLAGzd)26qP8=|4bh}`d}^F-uyW=tN0l=f+E;Fq zp?&2x8QL+Ia19IV|4M|W8Q3Eu8C*YhhI%3+0_Ava*20lA8TNn?dP2N0MlXpVS7f{J zfqDd4x()l>neEot`}p%uo_!*Ozn}T<#s7jR19u0~esp+G-^LcE+`Yz^K+v zTuGDpB~&tG@f;+J1z)nCDJ(vn!oxGswRJ!e-4jXT>W2Eu>x`lP`jRsganK>j8;ChJ zq|L=K#l7ur-oZKd{!%A*M@98YKyr*95`;S@AxWZML#b9s#y|wh^Ih|G**T+-B>VIx z=fQooZ{nTz)j^3<;6A3n8>;8vA@h2BUOw#dMf#!fwTt)-J!Y!6iggK4aBR0Iaath^ zH>VY1N%j(9rb~H{&+U=IA0L;1zzr`(HXd=_T8)%8B&apo4TM-b#^?r0?GPZ8vQl%q z#?hT?zQHEXS;1|Ucu9Jdx~ZB0@9n1Ao8J5I*#hVIO?6;Cj0_8W;)Z%!XXy>KH?#l7 z{=?^)gT~9?#pdDit=9S5ywDA^6x^|m4irFTq}g~HJlTBwFPzCJSZ?g^9Mqq`|GdB% 
zeMh+&eMbq-N8M9<#%)AG%k6{u!N*^G^5T=jtM^+Cqeb2`t+@6yWL^QQyT(?nHqBR7 zpWb58Y6q8jpqeG#$^*4mx332`UrxWc$HqTUhu?q1PhyF~x6kjcvh;kj!Lsw%6huIp+>$bnI zAHmd7bx?fZ5ocw77 z{IwMF$V$}?&OgYBq2l1ShU)=t}5ytPyt5H#QAtuxk2;stRY`@A<&dG75|~S@lK`3ZOF& zs~FKeK{$ZUymOxs2{6vQIqf9~0??ti?b6Wy>Cn3k8~T3%?dmHEWd8!%g$y95{RP-& zZrOm^U(jnICvKUWW>335tIg}m70dtOuHnaFZx>rAW_e`Wb0=;MUwRcR8)N?BQMe>$ zdSR7%ryJNUcnB51`tqB+^<2{|!gxXuj>zZebaG%c%`RzPOe&}9doA84XZpiSZFsua z_btUVgADJg&TCo&}`iY7}IGqwl#iYa{e>%$A&x0(gz(Z=5 z9^ZwURi2h17Qu~9d~ujX@TU_+qb$OKsfr#W8u#Yu1Q`1Ofk+^kJ=F(|SHTt2=kx>w zxj&7l&w-;&1{i%{M>*sfuv5Y`i%eed*5fVg&|vHcoKOPtsUJ8F6*ioy4D_dwUM!QH z6#vJwDOaDQu|Q=$%W#ouE!{b)-aRVaJ52R=dK3(wzSQLONtjWk$fXB9=4{3*8rM_p zVn;7YHH#e!iJLGC{SHiNiF=iu12z=~)*-iHH+W%P48R>g4al^PYXClWYX^=@uoiRy z&J{?S3=_SIFf|a=u>nE0b_uN<9cQ+733rVaAzQlyhvV-T*Si$gA&}(V&EYlD?tsxU zKi$c^$!6tHG|>c<%`A6X4~x8q>q4|F@`{J;BeZ@XVHbeOxe+ziK3kLkug4GpL%gDL zC_8B)jEVuBA&T`4GGlIUeY&OdWM`eBuaO2ch6?gULH5rWw~#SG=Fb==MK&*kM3BHn z^bk=TNhG@pVJ=$yfFp;8f}KzC&`$F6HYDGRrz@E~ltGk``;bEt>1n{u@X#T%OyXHU z=7k;Y@>8q>qkWEwJw@yCRB&q0<*8tGFY?=`Rs$B%@om7i&P{xkn`lk0y$aUNyl)(ZS2Xe= z-}b@@bT<&!oq{aSw`ntZpUO}__i@CG@0zqPNQ~J1K{>(K>=ku6t#Ho3TjC^9@q6`4-l^zZ$$NuYy(6;N_zbdEMzme*h-P5RMb5?+tcr$bZ8p}L}!Z0xB4h1p*-i5(g7P#$z{=87v)YYl>s)|(d(0ZKb7XUkP zQm`LW>+z|4WN~PR%x^&4m#OzS=@)x5^&V?F@if%}s)gr|(^Nd~;8Y=;s9L~Uj0F^@ zsTL3($2o}lTxVtyJA?P|fuq$77>#jp%QP7hM=Q>M&DSNvB3KEDq8y9-7OeARUL@58 zG9M<=MKYhir%PmIuiOukRVerl5ml+x4-+9Hs)#@Q{nOp*r{9xNmf#HikW3IUA4(>P zrcwhV&FSWcPZoH}sm`M5RIX&BcC9RajDbbSkji%Ffc|t3VSD7<;=e!kg({DBtX<{F z#di`X7vD*oTzp5p41d@6Wgm|!Vkf}w6_^jVFReI$z&?&WYp?D>*~^ zXm8{VJPX@T3I_J$XBE0d1K&RL4K13OIw@_$4FZhX0~|F6IMEuixG`jLTgYrvh-^!U z*brSL8bAoM%S!+F5#q*(WR#KM(#bEwCxykqAZY-6?7AfcM%7MCO_RkwDOuLPmm~9P zeV5nM|EduF>I}Y_xHNEdY2d`A$^0tfY)ux$CKg#N>yr7C`%)nDF#su&`7Bx!H;@vG zU=Iy<*&6*H3Jc8F12bO@%>1hGXZ>&ZdvSe3Jgf5^^*_Bg4tsClH)w$W@*#iY>dV!h Wn>!<^-|md$Ztg4uzulQveEnb2Jb#t| literal 0 HcmV?d00001 diff --git a/__pycache__/_ipex_ops.cpython-312.pyc b/__pycache__/_ipex_ops.cpython-312.pyc new file mode 100644 
index 0000000000000000000000000000000000000000..91bc25c8e21da3b7d974ad0f9568b49054186b6b GIT binary patch literal 18566 zcmdr!TW}lKb&K}`SiA^;@5k~@kQ6D&l5IU4Tec-jelV6TS+Xn#fv`&wBnUve04cIi zBPZ^JjMGSLG9{I1DtabUlcP+md`0f0wbI8-W;)X$OnbQ5sxxZqPRBF-SWEUKlYX@4 z+}#D1BtSo!#4}kD7w5jtJ@?#m&pqedh5pIoaZ&Ijt*<3c?Wd^!z>NObG~nKk0a&0o ziZdmtGx%*znkLL=%qD~_Ny~)wjMYS$Dc;Q6uA9_eXY5Y3hvN)wrk!FBAb|+GaY&4l38Q}%U`r|o4;8WRXCK<~Pr-g}`(6Nb3TFA1zAfyF$LLAx8 zc8YQ1xI4AXz5jM2BY5hJiKEV#c?)Oatz(3N#^w|-h_{!oDPKBlt^)+aR?hOO^}2@Y zj05mn%eSvQE@vCIb9Mt3`g3D(7_c}$Hx}A}#kDQR%5oBT;xu4ze=ZCz0|w@E&%|xO z;^8e$O8>|471zmAVg0-n%6%2e{k)e8@IJ1pLVd78c?kN2;Zt3q-ZtX^X|3V?TrFU) zgHJuQZh%iCe460X3?DN{RS(zls;wxSfloPqw!*lr70TNxl>g!Iae!+#@UaS{vEwt& zxwAt1E?BAV3UxhvkgTGYix{Lh1f%y>=+Rf9ydPx5T)u9Y=5`pchqu8#P#y!f?WiDy zE^g2;LN$y~Nir&NX=jCzD{0zY725BvP`>?6yARfA4}A6l7w?D91F$X+!e<|R_5;R& zjpqpFxC7AcAbcKz5AMV+?n?$9)B+Da>s&vp_IbF%T#b4IDXZchF;G@l(h=P77ffT> zvxQT&UtNXSskMiI>yK)jujU>z?A`j({PdGx>o+&T9j<`?NQLqzDwH3sP_D}q%9Bd- zF_5N8@~pOavH~u(c6%9qiih(7?G_Se}KQaeTX0!yVjnh81Y! 
zEzlaw*3A!REB<4+$GPX>!$1i$?BPySz+Z{GDo0LM=%v=aux;%rn3dWN_XxwCHq4+I zcJGV8&$9AKczkBt+Ly}rBzl78&Kh=O3ykz~h1OrGP`*9(r~x}`+zZ@j#Zx6vVI-A5 zUoCPfwr#s}Am8zATYNtI+sko=wP~HUMG`Wd$nY1U>5M3wF35I)&*p?wT=Q{=*rAtX zfzQ2PL$W~SsWD?MX#!itr29n|O%+<0^%k6E2E3$;W@v|aH$8b1XUUtdYc&_CtnQ^z zWA;r=A)5#Pj5Ghps<%Ijm@OBrc}scRfLZgFj7aH8`0}P7Fr% z6&uJ*%k@!hyl5<*O-#nJiF7KOm}xIItlTIF1c)Lemnem%eE!SBbHi7Tzm}M_ zNQ~$5k+~yRcD;6NF)%kYYr=wKbH}d47k4ej));oybURSJ&@kVyczo&na_rr}&N=(6 zdG=AsVGdA;e$gIwxsZ=$BD71I!|8L~*W09$3(a4rQbOs8T=nGqAoTr`&A zq7%8~idjHSMHWy*3RqUC#)1||X2L}#8~b3@y+%kbQSZ~tWqOWYryB}%!#dqlpnH}B z%a6TB?-bDa|NYNgXn%i;(8vN@Gf zSH+4nu9#HeLwZ!%lbwY2;h316soFRnEbD`Y_i$-J{CuIy7v?Uk``QY=wsl{3!PmWP zTJ!a;(!J_({r)E@^Z;c7(kL2cM3Wh%_^Fw|#tA{uAa>Y+$rmVLku6)sMMyrMOT@>w zZkSy_?o4pwR7lhNaN=ckLsx+@)saV2nU1gQtQZR<(r6I zh#G2TCI-h%A}S`+8R+N*yky8Imz{<)%`qNT@jE7UAe)H_vDCO+OR$QWm>}{HjRY#9 zYKdH9teND6Y*fq&367VmjTHor5~7-nrkG`o5qmNO^+UH z%A3c0o2nq2x9FMyOps9Cmbd2}c{=aRyRK_vkVDKeBwJ%Surx48yFv?I_$n!O*)fsk zawq}bXJaEgcPI;%0UmL3oe1nq@=+pMC_S-al?%loF_KH?MAioHCg`#V2QWE^$wQcY36qC0c?6R~m^_NfW01&Blwwtcs}V%P z61*7A@Io}39tXJ*r7aIRjpeEgPO&tR=;W@MhT`_yT8Si!7a;-5r=RjSFIB(e z>t3b1SAE@5sAl2l{L#hZ-+gM4`cYn6pljFZ?gHJtP9G}Jha|=) zF@bfavA{I0G0hSaUT2yLOw;1{8q+H=H4@VX^%7G{{;DOW0f3Kd7^jQubJ@XP$V5dj z4PCe&A(o%W0`tKjjDm3#M`J^OHVeG>GYNckzB=z90$=kx6?p5m0&goT@b;p>+w(Q5 zsOQ~zChsu{yl@y8Av^&|q*fKlXR+WoCeLB=JCji18ALw>iHK*=79rFMD#_yR?PPKH zR}ifxi@U#IS#)O7Vgi)}I5u$P$WB!Qqi^Ghr$tcFDedHOB*bWnPhW@|wWAZJLNxNL z0DwleeuBGb{~UsdV{Cd*%jV7KuA#2Ht~mgxBO~S^q8-5n+o&B?1FKd@gTpl)IH;OVR9Oi7a;*pFP)8o>nGx18- zlgrBXDLyeWngyMb0goh|isNeN53UqhogV@KR&3GuE9Y8)!=%+;v>2Q)tB$PnUCuQh zE4NA(aQe%fk;StR*6_N9-vno%W)(;50vV5{?p?m`hHZW4BZZxhtnGaC){$R4@h*Kr@>R`h zYMusPJ;CK-$+(e+#Wgt#1O4_q3w1w~xH+wQ7{C+cr4Yx2gG8d|N;wysQz z{CTimqvwmzxXj>TSu8i^#$`Ig3F!=+p0Y#ltY|<6bW#UyZLH!8SI zSN-Kqa@fRWOxDsS-nO=^xm)auQMo{A?mASgw{2UinaHZ$koR6kg%1xN1ApK%FS-Z% z-1EM?KOeZRNg_E2Z9}ppp2_`}$`tTUBgdXSe42gi*^?}n=0!G@&az@ILtYlyB+ 
z@@5L(!2Z2(^ns4*r5OLZyP@E2Sa)|6+#O3N-*xv%zUHO&cYK|L9nOEJ)+P#_$GL=F@q>Ja}W(T@l%5Cf3NXvzJ31-oo? za7CQ7E^?}{eFI7iF~=>)aXab;VD(1Zzi9p${mp#^dLQ_kSHg?Vr2{t@g8}H#qBGIq zWI6^1pBAH0vvH$tN0=-JZk10wlZ?g+w8!UD8+^qD0J9vz7uPkRz-1r-&kAc$vxQAC zifrC=fK#*0kI1I;vT0m4O+qX#r`aqb7&CUw_;9P? zFA}el-3ALocAz~1Hihg2ONSV0VE7ntjLS~V(uro^4kM!DZdWnpAz(t&@~@>pli& zF+}d%oE1%Vi09exdl-7 zkLLk`ZbY5dowtQQ0_X^8QFxSxBX+!qO^pkeA!{%SiZRHDS9KO`g15B-dV0bF zBybtXrv#L50nG$rt&}|`s)>d`yCxOYNW>dNh)d04zd204g~@A}l&lzHz1R6=c6322IQUNC2+C`x-gU~ zW|9z26uyT&`~nYwd4qoWPOjS~+#|_}i!7I{j|dk=e)YxT{y)%~DG@L`>fgR$uv}HL#h^zZ2|PWqMYF zJrdiueCT!W4foAMKk=@%JiN*@YCX`*>QgAQ2h5?T?pfd)^LEPLwCd}S*w!~3uRE5_ zue)zp3+&!2p4&`lo#`wvosvJGTTxqqAhDvr$_eQJlz?d!mi*Nug>rT&Q1G{xzN^=UWL-lEnyK?)T16(sv9ig>)*xW zWgMb&Q+fmRT{AIGc48!)xU+GiV!>m991Q~iY_JHhbkkFM+e0&_W}bjI7T`u`vzW38 z)vHKMWRm;@+&%Cd3-6>C&)t4jb-Hnz(37v_-Mcc^v7#z?OVEYOqiIj+(Em<5TNR2pEmuHq? zkCksQjC-seP=hrxKskNtD+LmB@*@la;vNX8cDSo}RS0)~hPy&h_NaD-ZlW9477b(~ z0caX;kbrVVd6_tc8iLi2`;vucQT46mLMA7>yjf_@79BV+K+59d$j?ahw|KyqvfvR4#-TlLyVPDXD#wb( zUWsP7K&8gKrMwR19i_L&F4|J{dGjl1Vitc204Anw3?rtSp1i|gu9`?6qgiSd5Q{^f z;nz(D>WZda2_?MqR+hRf`Yz-tX9gN5HyK*~e~d#L?&8uO2CvOT-hZQGFN2Nk(pA~< zTnsLWXLy#}Ik4&rXKXgj9(ndb7T%lE^g4T9YscwLu#O1jfUAh?#3&pS`rB7{WPnkV z7)fbo#kq8Llohk^W?P(K*5p=4P2PChU&M86n4M1N*o=^#gjWpM>?p4e*Uu)h>{KF| zWML!d=#UURdXT7u+A%$m$QF?$vi&6tYHzSmIX=q^6A7eGqXi7r=>hgJ^@mr<>3;S! 
zgw*ukUlp;XQ^{#Pz6Nttx8H_o46rAF-XbSrNiod=?(~$%rZeiRX+><>7H8ALghX`{ zYg`6y!XXP`>lD0c!N(@pab8IANfBDcG%lRuQ~a=Ds>4Ehg4J*umgbN!B9@qot4|hu zKOuDU+5>M4>rG^#NG^6I>%O1;vNjcxNA?fHtBXJ{(v0hcBBbFVi@OyEC^Q2ah*k0?y=HAH?x^%h;ap9RqpGkiQToZz{B*nkTu00tmi%LyErw_&-TBd`B%TJdZSjJKD_P20c;@@6UQ3VgR!%e|~h!v+b& zK{Zvdg`SP;Q=0oG2QgwM{uuGhW+J+{8)DS3|6tQ9ZZqY!oJa z147$$3is*E10^Y_*lFXaYR8CIwx-a$`~m1^0hA-ym@t}VV_Lw7R4>~80;|!pBr}n1 zc>e@1caRv}eY*|!aJAa->YG(u6p@jA0O3b+jd?#J`8!Ph9+Q8-Bmi!KYV#+Zf5cLx4s5%z|4M48o2Mx@s;se+wIm} z>#YY0tq0axADVUk(%pDFT)SG=yL|Y~r&gZ2X<8Xt3m>?7^46}mPOpZJ%pR89f!QzJ z4%aWdH2>04csciGekFfv-@D;w=8oJ6cB}^bHJ|Q)=F|0qJsYZDI5>ZBY2Q1+{#B+Q z+DN|8?8Q}jXtjB0jUJl4_)FpnI$aWT7`i!KtG>RE>=e9}?4EZo9$MnxNWPx@S?Ak3 z-rDi5|0s~W@y2#sE{H3e}=G-yi62ao94h;?X4_&(+^ zi3Em@48cRf5c(GpWh_J-L-6wW5HyOk$#gUdMi&p)Ca!2yeK!kWCW`Bv)M^;@tOHMf zG|j1|vt8YG@JcyuMFB&4AWEUovAdziq9M}SffbeB3J%OdxbLn(a zcp3Aa=4h(AfspUSoQtZdSKJ6Ou;&#ILSCw-aUnUM zRD788>$w2ts;F?S5=1CO)i)_&gsQa`OD77|5v2x;YN@(Lr4FHbs;Wk5K&TO+CWM-) zs<6_65KFapDXj>#QEi<{J3<{)YlqT_P#4wHr{L5e-s1A!jR2%^0?VE@A(y*0vtmcL zffWGX0maZb->BFC23CMB!WD5~js}WIofC5|RC>rXH|7|fX&%gZb*A|+2P~;q0ti(h z(}D(YrVRq5F%8R}BTO^9fk(kwA`}*Yi{b(>@W>k^g5Huajx;M4eGW200BBMng}>BN9&AmV$MYY!*Fmn<`_Na!JNKh)E(f%B2`8Z zszPZAA{2s7N*JMP;H6T7P%Tx{pwuB$Plalg280@^x+bLwp=Nmf=yG~4t+Zf{r5aeJ z6`?k&u~lhDsDr9+Q92Ro!bx`nq|F)2{#adZk6}vyTwHE|Va~dCz+6c?U=Fl{stMef zV^BUl2zim!K7{-T1rVyj;e!Ba!(&-9a@C_XGgqDn;pEUH3=2N4P(gjB#H0L3)`X%u7GfJ!ltThl1D zcT0VPQvXf~Ccw5!ksScvX=nyA8(FEoL8_|O{(>Q?GxFiy6DE)?{h-6l9W7(S40K}x z0Erg;9->@DGO<8YU%Mp zxJL<-GF=sE+L@HqQq}c~?ei}#g$m(LrH+)LqFxMvqHG}9Molc1ItsxqrHPa^Z_rsS zq^u3vDeVO8MK#w)z&+YVTIAsOCwB>BG$xbkq#4-^FFw{T49pK86M@M)_eeW-NoV6&jmYD76OMd10RqtnZrQ1>UKB+5iwsCzy&rn^*qVx(3}9_15tb`9C_p)FSqfdXefkZX8(+j6P{a8G z#&%^dDbXV%BJrApRWuecP$oCPOF?`x3cplQo#`mJ^HKQeBRZSft4HepWkX&vl1=a< zOO8L68<8#WzuKsGcDE5(EhZQRlj(!1mHj2*eaM1iLy0dyqF78O(>)hu^86>&{;yQu zzd1u!POmweFWD8V#kBKZtOLJtP1#LNcPUJNtvD$<2? 
lgx9Tu1?%AdAikZyAz}aTTO-tp?NG?_7(25q|r_|6h@kWyyBDvQ=4(ZPJ!xTe1xYjvd%WY^AW>7A1mimb;G>qvbBM zyNpeN0a!pqB&0udfx-;XgAN7az?T5cp~v2P5hEXnMS{90dhm^c+@|fNo!MPdl3Qc} z&d!^k_h#qKeDCp3u~?YEqw=4WCxH7a4*E~TV|Km*<}p!-!l1AJnL6!7z2Frds7Yx+1C9VZ!TW*^<5Qa$dYQ@bXDXo>54i{Q=}P z&rzQGFn;P+*vt6lX@L4@u*jm+({QL!_V{AN6Of=yHY~t%%Mz9;t(t;mNb<6v7X@pH3YNZ1HB*o@#Tlx4Nx-a2 zg|b?<>V2#J%3o(E-B6?9FlLkwV|s#FmkEK$u_pg$9M zIZ;Uky=FlaXlqcbW@umyLyWCvQo~)*_PcS~K?o4zGZ*cssvDAMxMfj-)HshFk!uyv zv$A;%(~cF(TG?CzTglWlJ7QXT)f*Tg@UoK?tSGU9B_(2!+UZI~gmq+Unz|t_Kah-) znHjNzqNqs~DvEYkMDv=8d{h)as7b1)35cSi%OK{}vT4~|S+nefC`y{9TN0MHi8=+U z)_{5-MjS~HNdjZ>6;)lbj3n}$X&J^4s!~YONc>2Kk@$doPIiEje_zO1m1?e9uAa6a zRBD}mSi7(=zkGq_)N&zLty@dFmitguD>&{f&7eY^Sm$Ei<8(; zE4_Yeiy!I6Q?0o+*ZxsE+DU(}6QA1Rr@ALjHgC61Zt$Jd(OCO_XJ}%JpYC3`w8;;*mNt*h zb`H#K@t3-1&u#Lf?cuh(G234H^tC6Gow4jQ{+uE7L*3Wy97yrmIR@l0`R;D7a~a-F z?sm^RHLF^)R(_2Y4Rr`SR{Z~K9a+~bsSKThg^3&6Z;b+g(q4~27n-}xqEfA7E%iMd zZ)h5*TuJTMy0hkkvSNANYKQurVlHL^#t3|zaRA9El7mPPG~E3=gxpai0+Iw(7mvg({xAl$hLWH+lJ0;d2VCj}hE?^i-DK(C~Sjia80&Ew6N8$2GZc3dKL{uyykPI=a)S^npXY(dii^hq86S~`e7dA}nQv_ha z*%-tSfFny3R}SqKVm}$#zVb9UnOA`{$;%{(9qkccEZ!ZR>>fR_c{KNF>dX6onEw4w zx<@kG0Z{aUBt5nrLXHcZ39s=#5BHK}?))16i@3AH8Fq0o6MD6boC+F;MO^$jjGTi> zxcaLFHH9)Hr{HJa0J2J8-BV|}bMxDwgzxh&l0JE^lT(a$YGXL`)NodN`}sQ{H9(gBc1scyRUA(j<8?lzQI zrEpiU>bBoKIrh~H?koOw-UPC@UK*rd86WWtxD#L!8tn2ke4VesE9?q%7-*OV-{4jN zmsg)$eD(`-@QaNYB(p)E8u_p_QLGE6W@#f!1Dqd2JTlJ zd_u ze9XXgnX!O(ABD9B)OxQiu=W4v8Ks*jmTbBby-r)E7|=9_h3$Q0?0p`2c*z zW#ip1uIzAZGTb9!H>UiF*u z<$U31>2VCh{5?pRV}B;O4$1wEOngH!f8mnf@H2pfY~))u$(-*I zAUhlkd(LpmQTUnZ9s%MhfcrKDcH3y&%aS9)b|}eQ>_(G#7wd7rKR-Oz#pd~rpy%Kfn)~%{r^;gwj|Fil(WMx@6T*JC=d4FSsO%891$^A{bsn{juFmN&VJF42lGPqZP-7qbw$ z#hqgIoBYKB(T=ArVh+MWF&ANx=s;L3IuVwLE`+6G9>Ov)AE6++5tfSu2rI-b2rI=x zgjHe@!fLS?;a0H(VU1XduvRQXSSJby>&0?}+r$cl+r>(R4Pq6-Mzp0GZD|s>;%SFi zgRohwMYvO}L%2(Sm0V&YYHbml5bhIq zAl&b16`QZsa-3uWZ1<}*^V|eK!B=vgEKjz_?#UU@RdJFYBc#7*yTHBur#HBp+6&wb zzHZlFQ(L<0c-i9X8}W#N@v&hKp7K1t>t1QpH{$UHdaw6M-o7it9{;(cvbkr%AMlJE 
z@m;Sgm9qk)QvcQ7>mJGP9relCUVpFXxiUW3du7}^EXsNP<6>WL|LDk=ci1B)vsnlG z`;!k@{lk5}!L-MbzH6T3qh{1E+lIYYQu(vTCIVMSeaRg5#Pis2UtnNV8j*9+2=?8+ z>wVr~YPRfR&(gRr;2lZT?PRGVqvH6mr+0X?PxSf*PN%9zJXEMCXM!wq#AmC zcsTWx!=BjK^$ra84f!?HFw@-+c_y^&@?Q zBYm>oH$E~pA?w+^kPQmdWW(rKK=OEG!`OuH`pB?s2uR)l=8MrkIzHkJT$Qaf6oJvv z;cMQ2Yzs(zK0l2mrq$=UXow$O9q>pE%1|G|df}vMYK3lOVr)X%%A%t_-%SY%fs^X! z*?@A_Jo}`2O4&xib_yCO*g-)vf_|X71r%wP~j9d zhB(o{YR(NAhumt8)IC|3(U*f9TT*Y-SSm+I_l&QZLi&&~WO`aH1KJ+0k_!~7wF4bi z@LZroea09i$5!2#Bu$9(>Ch7+H7{%yO`=(}gv{UNep@r3?blr3dblJawFaxtdIIB; zPpFmHy`dj?V!PlEh@;~HcK1k9UE{fq_IjC}4; z)=q;1OHWVHAd4V@PvPEevHCDC_dIG&kL~0<+Lf8|jW0D68 zQ1XZ#A950+G>9Hn#vtKa-Z6B6-Exlt)pw5uj#7(`NYW_U;q`lc{y?9v-&1Fijr4J} zTiQqc;K$H_z8k#&(b#xEHd34bqLKP#ExrZRp}f&js+_bTz{pq= zaBf4OPeN-OZjQI?-*>IW(}b;~Nx>RTgxi|>M~8={u)?eDV)Y|LFA3 z_I#?}5-W$0y^4-yxsVHKZKDLsTef!eXrHE$L>i*WDxtH!)t5zKDsrHAC zf@McT%+U~cG)3B1a*LL8D`L47^Sk1?H4(##)e+6Fzh|vSh55y^muD`&(>rzaL0-}9 zrI}0byb?LOlAU+kH|3k_iD#Ebv@7=9>AiEi-rIj?|91~W?OUU!t-pF`Eq=sl(B_hw z1>=vMOC@_|YVNlDr1=k9BgPfGWBPEkuwh|v;X<^aC2DWEXKGo~BlqWiTJy8*j`n)) zPwHzs^0a^I;t}>I7Y1Gvf@DaN@ain50dfoR>axFtH(`x}fJFXU3$QFdp$TE>)oFt# z&W`(p#NZ3#m=c2mpx-;(H*!VnJ0M7&KG84GgjOb`UjXIdl{|tn0R-=WAoUAgzp$f` zt%L3lxexW2Jjp7gF0@E`0Ra{e<`UoQC#=k7iS!~R=TK&Rp_jfzJE!ytk|*)!FK3aT z=N8Q!i|1Coesc0~L>qZw#b}Pa@QoWQcIS#Ud-CLCWa9d%_v%GxnL#6nNJeM?8He>D zZ3-HxZDlAOu8A5$nvhOKKR~mFNrInYD7n3;AI6N;s!#`Xbx()x_9RRLS3Saj7fTi? zgPxnhNME4;D#6wp5Ii@%{(wIL-M^+C512|LVcYh5`atH%`ax-Qe5}qWy^MDZ+_Iik z!=Nr*qSVU>h!{Y%ek0EH<4wk)$}n+K2@#@U<% zrm|iV(4>l`Ysmf%6=_27G+<&{-`+p9fBM|Fo}X)uSxch2lFg8#$;T%7&k%WwpXH|b zNkaf}<@zR{+Bt`4L2S5&Xc}y|M9@viX|4&F)z?5K8c{o#dZ!I(Cm8|}b(!d%2tUA! 
z`ZrCWM~EogaH$kT2dBo;mW-iWHn*Sd?r7^h^3@}jEc{F|mJ&E{4~x+(rU{4CFNZ#U?; z3C-_o!JuhnzH(h-s4CVNB71)`)f5nxVLrr9X|8jLN-U^^qe?#HyQK@r{UO$K;|@^l z;Pj}kK98Sj6NE`+eVU&e^!PkC$D~8SCL#sa^DT|fF@nH75hEZ@iJ@H?nj2?m7(!6KvA~{|%!2wSB&c53vp7wR8i4 zY#Qqe1U!<@uQcr|O4Fp9co(exLXG-9vif%-_>47Wjnm~Be(tJ@bO1}-{`%C{=lo0d z${%Xu)Ix{0JrWVt zvN6FP%UU#vZ6mU7Xw>UN-)O-y(AMb`L>Wc_iDnMNkn9t}tAe0_+_AY!(i_P8UHtj` z5CE7fMtjugUUk+*OoaVTPMy5nJ=HyTV8Ix-?U+2WYAKv+UC=LFpZ|K?(lXh;Vzf-& z!ZshVJh0knc|RryJaD;ZyJxzWTvd8f`=^wNxQ|J0)mk-w zak1)E_p)Hv>Qn0U;y|N}e&H!7GqdAXo;P@#V(L3Oj^I`}vMG;pN)PfXDN7FV|fT z=65hzjqsu(g=(cCeZEep(@ICs7d8v!Y#Pgi3*crY zF9?!mQ0y-L{9i!;z#5%zyf#;G&se^aT@%r++G|&A`E%N6$@7>GITp+Zv*l9*XWbIF z6i&7)Gus*6dpd4C6V;u0$}miVIDGOj_*+wZuJrR1&;tgvid~uH45>X=`gc#yIYbQB zhU@j;Py6QYrIpVhX&6e%8B&#~6GDolmBpsD@yht^lWAo#eUC85hATyeI?)Ij zPI58VS-PcvjC5HG(IjKKWQcBkZQupEK|uudLhY?^oxG)c^jLehntIxIdRRO&agGGT z(m|9CW(${U&!0ch(JC}hm8p=EfF;<>j)pnHV8OVg3UZys-ow*x8KtXVaJ$93V0!MHIVqQFCcxJ%CQY6 zk%veh>DQ4eU11*pav|X~kihtrI+KDGbX1XSOmtP6ri_;HaW65BV45~?JARtOLgG?3 zbtXq4$tg)(rSIeYALGxzh9Dhf{JF`!YIA=3$ZXe4*ZiI(TivQH_jcD**Rri7W-E!> z>V9D@e0Ti4;GN)d+1^;$-lejAt9IAz@Kku&F2w9Y)V_VyT|66{2`;-^V(yl^eB9kS zty^^#&h4HrS#mb5lvcmzyW?9px7=_r)^IRhdT8d@gVOy73X7+WtImSi)|u92XG6@{ zupq{ryA}sy&Vy0gK@du-mI4K3fNA-B(0-#Hb}t=-HeKeT4u z-aEB-x_<7;d}Z9aHLBbC6cqPGsEcgh^>yS$j^r8k^!YuQO2Ugz-VqCER)E>`k%*jT zjVkb#eivh~N`pb`KDjZrMcZGDTVIOmUV2J8_4=7bR064n7NTEhxT z(rrdYDFJj!SS;G8wBG_mmBkR12QeFqy_(i1-G;{FF(Yf4t$sCZ3YlI%lvYC~sASkU zLP^vrl|7?0O`;7~sx%P0J&+WWtFQ{2AG8>7isEFsbKsT*&O_(aYpk6*4DVU`-oqtD1@t##l@GkX5uP!V@tk6JsAr zmI&L#+>Eo*rq&5Q#1TAxRuR7U3BF_|Fb;-etL2U{)J(z6xHnFtz* z)D6-k)usbyepPy%p5LWs?JzV72CsjB_D>UJs>754J1IEgA(o*}6d^nul>#2IE=$>4 zt$wCH?v+MI1G1T+W>(6~z}8FjIHf|dis*)&R_2FgGkwk9i*Lx90iUdAkFr7X_{WC> zei8zyTQR#U`?12wrK@YDKSIt2`15X%=y>-%qp+G^Fgr9eH0zu3&0ktP7|(Bu zSXW*7vnOXx68}7ZU@FoaVaoe^CzhbpT4!(Q#y_fF1^xi9XURh{cDrsDG=FPUuv@AO-W6sL? 
zwzzX^)V7uO@Eqqm`pFY3*}1p9Q{Fkd;)pN7j-RTON@ih}%lmjGVpn6BB34Lxm4*j_LhzTM1+iXz!jkg3~X2_ zU~$5S)0D0%z7#bKWBfD?8-CH9K${}=JvhOBkL}X9lroYUc%>uM{nrn6*@%FvmmC*~00s`4~=IrTXXWP1tp6EQ%6U+hP1hN^F`bGp&3;KhO?iWvV zoM;o;N2Re*sV_iABl_+ad%8|YBmfHPfG;aRhBL-}J^GP;pT0uaM>Z&$3WkNCjUf75 zB4LN@0O3wwgDjko1j8}bVb5S+{{*ZrfSXJ{&kZG}Bc>lV98d>&VS3={rDu{% zX;j#~IC$3&F@f6>v3_dcfKRZ9XP=vSZr-|Z@vbxOJQy*pE8{?2LbN=_crQ>~e~(uy^#S^MP4r_i&{roV$N z%C72Xd&Jq(?IVIJHiPaW$%JheMzM!LiUYd5rn{-FrZH#-$GDDgcN{-)MlB2p&841g zPY`D1Fw#BEMn(D=!0smaD?+!2(4sy`=!TrhD~Vk+3G4zpHEskrW&{!}n$uv5i5B4} zbfAOvM2&!*rzya$kS35^qbe4VUe0WO0X%_k&?TJ&3k?apv>!P$tUXZMiJXe^2qpzw zAxt)&nVYpQksBrn--V@INQ=-C(jm0|wxNeqCu(h}_txLm;SH@^TQ{Q&Ab+>^s_am2 zubi8DV76aK(t=kGnlX?0~eFZ5sSYZZc~?Eyuzpp$~xfja_ym5?V-E= zrP}rf*6itRP9Vsb=B%3Ui972S zDu`3=TeuQ+Hbb3|Up{?#ej6#$7q8w8+#UEyThw_pYC8(Wy}M$*e7<6O6g8yf>Wez} zMQ!_5toc!0{!{kDAEIBhA7)Z3rQ!Q8x(}vtnO~xGl8ciJDQ>d|<~{1Fz}zx+2Md^7 zmZU)bR^e9ofI#SiVzNt-@i1z+jif!DCwk5WjW7}sdO*gRk+G}y_~|YPe~$RbUQd)w z9bP|+L5D)LCR4hkNKvNDV(?SwyF&jsY)T|=5XOCE&z5GvCVh5{TD@%D z9deyrD$P+zMzJlOk{ z)Wc?^N<+a&nXe*3#!Dg2 zzbPF+bwMYY@-YLUh6exD(HmqYRHsokPqJ)4{AJvv9t3_OJyfMrQa(i`r2m39tpp*^ z6MzuNxXGhw^l)zZe0{X>a6Gr2Y*TV0hgS2d=ZB&@kHzzjM~*(Q=1z8Ph6lld!QM^lTFv=GNK6t z7M~Diz=n>c4jENuSm5de&e)q2LD6jnz`Ttd^!+7mRFhgOd?9fSC9QDO6ljE?+!aa4O#2%y z5XeddVJ1uGC)SbFDgpl|*<8ZHnTTWe1ki3oh8TxJS8zUAIaxs#|T^AD1;2EF9TQXA)g|SRnZ~Pctl}+>sCY-V+K0^-;~M z^+EQq{|eMxijt+FFX-s-`1@hsOK@zx6nMsLr^2er?h|zQUs2E3+e%bLw;uQhYpnw& z%tDhV(BD+wkZ5$!apvribLTGfcDHpMIaomruE=aQ;lQ2=Cy7KkCNq#8qtSo5xzV-J zdQ#{=+rVR(EmX3SVZ0{z%#`X`B?HNbU*SFddc+h$#+t@?CY891r_(bowL0BMPTeZN zH)>%a3R*m)3|$LChgZLvT1fb&k=5B7NOGtt7Y8DCUUx$ilw76&N#Y8TUMEA1X_EeA z+rLabPxz`P^H)O}Y((Y+SIf&rqGrhYkp%eCLkj*ef?yWXPxlZ8AZ<0B2uA(VFOlQVXfpj9L?*dE z*9i~pMc;aD^7sRzeY)kIv2gC{LhZfMovZeW`PR6-F=}e0H~YZ9t~x+<#~lr@*S0z$ z&#yS$EB4Z5dv(lSJwN#2jUP_{Y z(dSwSQDnXWTrBG)n_;ixg=`{C4xvhEW6znCLw!rcyF{D+-%z&+#H`yX?hL|rvfmTbK<4mu)t}?I#ac{8RBIPX;=MO0F66^9DqT7cd;~ zt~4b4Cym{*=KPE2f|jv~>*Q3@KL9zIw9a6c=HM>;JwAjWX*rw+Y6={Gre%_{5K0*G 
zx?#x6GizW?P(OZMI4HnjlM&Wn@yAgsGo4Lm@fCa75}3-2!)uQkw%K_1?`TdHqO@YM zn}6sJ+ZH>f1G8VB`TG2gWp~pbwkh_o-zX~KkJee9Of&rgb-=d|E<)a$(og9@N8U!# zEkq%1goh@zpckM~WbgbydQ-_iNNuHLMw@;~?Yf4t#1oj`3cenE>$TTkn--TytmjOV zqQAfEgVyi0E`;Lw&rP1DtT$f2!Lp7n8Mm%xLS!iB$=Etae@%%$^=b>L} zx$N3lR?P!z&M)fl4D57SnyQLzcOPHjjxK--%kcrl^aeiiObOt<}N!fBvU zfCLS4*Bf4Fv-^hqqj2ni_YAYyZeR{qaCH&|AW<^KVm=7WIa5&mP#gObtzS)k+{}DS zlG>WOr}#H?qfP7l8;Z!kAtgy;-V0o2?*%9m3zGf|B`J)x4j0n)!*JnN_N62?gsSC9 z&I^(sSrv&dF`w%^78EqsPV^Q!4eX+LT&Iv7IUJFn0liM8X7FDq!VFLMU)cT9zrXqu zaq-;Ub3eZH@2`F)P2kgnbP1^>O#!$nllps)ded@s3CrnJsJq?w>t0b7;P0;b?UCk*MS7RQqa{`|V4x zwVd9zWGj^&t>lqex{`Aa9G){9CfixrkTBZI=7I~m;@M5)V(Gj+F*Wg+!>Fv32&C3s zDrrFOEf%CDth|5qX$hB8^oZll6)VN%-|Sk`BKA3RX?UaZa6b3P`PRc#+CQ%35i-fD z%YMb111r#p>iM$u0@0u$lXTqDIw(j>Z?gf1fwBo zT+{_2l_5V*06d7CpFs_ha4D&YnEqCmueW@GpJN+kqeIdeaqH9?0Uq;nEQG`kX~C2Tw_L8=5;DhVtls0`#rj$_s`W0}pK$&Qxo zyIT>@ISBI^t1GH=ZKkG^-Tsdox-Gr(bOPXMjD+-+XOzt#q*FI1$QqtXUuUA4nLiHz zKo@aspmnvRW^yuZUDxT3^PNX}J5RTDoajELXgvN7Il(r62RCM;E#0BmZy}I%ULSlF zH;s#7^PrTnK_R=`^~Y}{#!srnxYW|PkmeYivoZhOJ@2*NXWUL@;*J;TV+WMva}2$A#&D>X2ZhwPAOMX?Kv zUvhg_(;)mz+6JVhC83{6Q^_H~RReD+MSa9TXkZ|e^BHy4cm+F`2lC6t#L*p0WiFf4 zWZ9TVC&x2IG0HROr_FSU<}eGI339Z-GRS>ws)iVRdJkd2WoKQ?S-0$Ljyap-_MK7F z&Wr$}4l@9ea{VnG@y%fz^AmzAe1z?hZOQot46edd|$I%%T-vq6f5_m;) zpuKdUy)v+4690D5lrBX-!nC3QOr1WHcxnoCvr6X$@p;ieVVIlJWtK9lg9FN?>;JdT zw+9jiQO0fsA?e9Yx*xu{bodFz5}u7?UfIys&(2?fSEJv5V^k9B%(9-LHR#yd+dE}r z0?WxdT5+6x*qFdq#D4ks?!7e)+2K-p+w3%sM%1-lN%7D_+wKZuu z3B_-?hO###;89&)VO>h+lU7U4<`S8_vDE3BtWBR*N2Zo!s266ENuKZNrN!j2WK!YN za}E*twc(mHikh2z*oZ$<*z6k-wQrIke{usEB3;OaYY05n#!Kz<5UrgJ7X(_EsdR;{ zkQ!KEN}qB9!69HQC25?4Q)@KneX0eSAeNygVWTgb^+k)m=MtC|O? 
z*&#hbd&qz=CjiN_a;4gh^xObxGT4?gu?g|hM#jbVv^Fl^^BR^z9 z=nk0?7KC_&TS6Lyg`q5jMS-@AWs3vtYI<7BN&-jJ)OBAf4Up*-yVj+b1x{p47Xqi$ z^t9T`16>(XD*~t0)OB@NhO*KAs*oLFbtnhn)=)0OngCg_uq##fuaj+6T{m+wl>QDc z*9ref?C-?i<6#7J=tBp0Thp%@M6S!lScIAoPZpmbB22L2Oe{@gOs0hm(~2vaWX+2e{Dw2u?7elxD6Es*NR1iSzXZYt(Izyc1KKHl?o1c!f4Y8t6&x zeoJfIeey^C#qzrYKjZ)9wK@Jh(;d^iYyNBT0$4@H3mO*v)4HhpU>K{*8#6c7$$lBv2nc8U~Zv(s3kKcn}w{1^i- z>)F|eDTYa!rC0ADkTt#SvJIxMqc>C*Xo%kZg5DWH%nbMSd)T=U($6UUBMLsI;J;9S z{XgN8110E9KkUF{3*glY2c1E2AJsV&GcejPAlHY4fxN8}r*}#BC^cI-0Fn+#WH!AH=HrtT5V5ozmZnZN z7;S7EqfTn)2?R3v3#rpm8L$ik!3=6>&ab~VPrU4E_)<>8kMsVZeF&m8`=ifwu5|ZAP556-VcE{;o>LDUd9&MQwk-MeD$T`M;CyUzED?i4L=X^L%W0yr1jVq548wF1kU5qUqg zaa*=86*R16;o*~PuBc(nPO%)WtZ6NmGB~)>y^BLjC2ea?dUSCm2bPMTTg#(|e6FZw zKDbo4bInbU1sudUmZj46wJr2m$hm75EKB*j){5}>NikQtbFGA8rCiC5wK9qcoUnJT zoMIJRp0HL)u_`XVdaasbTe*U*Yc+^{TFb!~s(#5@vQ|fr^<4herM#N8ZS=66%d46{ zx#VhEYoN!*h3!c_MGvLN@qMD@JG2 zZZ7r3z;lKj*ee`?+gRd2;b)agl*kAo>O{gYB=J6MN#s{ePS-;zm0x}Tg~ycvvq?C1 z=KL2LT8tGdk=*Rk8={QW`pqygTnVyf)3j8FEAc)xDd;P`>xHwVF{B|+4f1-)Di$Gn zLAx>>kLi>}MJPM2IN=dc6Q|>(PB;!*y4hLCf^r1T#^$AptUn?rZ0R3|>}q3n6&6aT zQU@o8Ekp-C2?W$IntPi-pwNp?D2GCs&rN3Q>8st{XWH6NrTo{_rqRc+o;IDH{DtQW zDr*0NbLEz5A^+^kUsTRD@{?0L87p4z^99JeU4qAp$N#LgbSqsC6t>)A_5n%7P}maG zSS)?#K@~l#q9l~AcXoDtq4mI!N}~12(rN^-PCtn{Y+=tf6U0OL$Q|1^ro&x}{1)u@nH<w``t2;#v?i$erfe}m^#5{<@A$B1fCHVS2)t2s?7KnT(59-`_se+;@ zi)eTbFO_rm>0D9KG{}>nB5X`)IwBT`;mr`k8#bFcgiR?Cl5{X7j?ZMykxAIDGTZ*L zkVz^D89TVkVDTWFf(us2Ol^DHOak>xV#G0S1QciHq#Eh9ne6w<0sqJ4Nz6&h z$6A0|iErI-p_W{#B|r=-yM{7|H^qD}dhkBa1R3t`b>a&W2M{m(_$UGrUt|(dsKbL{ z0V@o#1i(ySdYymU4Dkk@s1a}56`coD6MV?ax@(>ZKRJAe{Q=pU;8N)a17t0|p{b5V zq;O#*ktZg$*9Xrcf3O^q`9NTTxWSYJVqKPB5R*p2IdU4xpWQdJ?>ntHEE$@tT~jxs z_DXn0ued8B?bF(C9>*aCu2MKW=Qtx4MNUrNR<9%kM5KHH2owVXTJ)I-Wken)y&D(| zv?6< zp>qXi;6DR8EX)5O=(M9hnV>Vb8`irkMP<|4>Gt0+t>n6eM7W3p zVwNcYatA;>rac_@8U z@QyAVcbuptDf-ev^ab-^FW$nuk;y?`r?{RpQ8zKU0)2zb57OgTjttEjA0vNZ5yA+} zt)!MGG4qU=Y7i}n!JMhv%rR6y1whm6enj9c0ES2>4@N89;<}r7^uLnZ`FYOVn|OrF94Tk}GO6 
z>55YTNw}@!08p9{NPb+}wG(l>F1WKcM3wiiDb zf`e@#5yK1#ZqEdxC*}S>1gmCzg?xu|qt=r5E9VD382;Yy;_mp?eesI@ciZk&99(f0 z%|1W#{QTayvp#BrKN@Tg82p$p5-n!Z$@PNCb+iG^l3nIivlJqrdgv1f=P`f+lXKz@nG-3NTANDobF3U_g$e?H$VY7_DDVo zzJ@?{iW4|VkbJ_(h=l>%FWd3WbqQsZkRfL;&KjWu@FkkB5-lM=iJY-nCa#2n+L9U~ z={4kfNOLiW$gd{3#~Quf`8eBvAV;gOeQeR{U5_j}eg2w_bFs-7cUDbyeq!M2+V~Zd zec4nVGnK<=t8g>v$Pm@Ju@AWG7kZ=H561Ej!jgJx9n9L;a3g!Y0S`6E zR{6Ik&RnpXlc%V_YhpPyOF4D-ZQK4buPSn6rD9v8b1v_mtqhK~`PC1sj%90Y42}ra z9V@$ABc0JLJ7TtGw8m9FUmSNfvX-UCt9CDT{-iXv`%Jv@g~`t8QV3)3SMOOIj<)y2 z_MVGZpQl!r$8>bCiODtHea~2Vzov1;Rxno@v(+xt{PV^S8}GW8_jktjcgE|x(DGUf z=ISdkQx(LICRfx|bD-{LyaEhgchK;I3GU<&+ z8Ym7h3X!DFPH7SuTWDdD>I1@SYu>kq-|@w*)lpsb=7y0Dg*9vk!t{ova57BBC;{HX);lR?AZ5je9AC|pBNMxja{EB3=Tfz0%9~NwoWb${jvh%e zITNw2INjlU+CY-%&ZN)Jcm!UO?veg7WmPal3Z@NL>W4AXRE8BqUq9ok8MOYOr;>*- zf~Klc)Q165hOjI2teyyojA^$YrtAiE&uCG3TCqelMY-X3h0rVI8cN306xf7Sr{ZcX zlsvOxs76go-BSnZ!0Ny_L@0d2g@dq?HV&dG!OA9=UdWg>{zP|exPE9>*)O>Ar2R`% zzFNA2)$Pi!#Q;;|$4HXWOMxkpaJnpY@!5w76g-y66^A%d^C^Y6=kzz8cDWeeuG}Tfgoj<+;yV+M8|npq;;G) z%T$w!R~uQBD5rD%6fxOIDnZdJDQ9(Qp^(I{h`~jdG`F&e2z3_A8jwauCOY@R$=W3P^&#IU_iiF74H@p6)fi^ulrFX97BU`0% zUhk=yQ_=F~k8C?1;`rTLGq;xA%`tcLvioSveKcZ&Js7j?YKdDb-j_bO@x2=#y!O4< z7KfK=+8-W#{$F4G!;3$&EgdA6?Y!*!2hT5_`?hA*IAff(&DiEHE$sd%XGiqldF-ND z4p3cr*`MNPfVRx#y?^+FW8XWrT(vt^wRA=j<(8N2X;T_ ztNbTMZSJc)eLDwwxGcwX<-6_goxF4MKig{7%DCDNK5DH1vz@hzvQ!~37ku`}hDQ2H zw)H2PnxjsgZ0f~tx_EuPy~-wcK$-%CB@#a<%J#2N>=y{=hp5=kF=52~5^Y41hk`5g z&P>9*Ml3tOL>rGn#xa7qcC3nq z{uO#0q~IC_Z&2_S1ydA!i-H*nzE8pLQt(F<{5l1hRPTR5FOE>~R}}mW1)osxn1WwY zz|`*?wSc_a6dQp`iZN|G;d-W9SIB^q^f*CpiQrRKCQVg>DxF;os1m)a@D?v3CYzp9 zB>jh^?;;=mbN<&6!vBEhA7^uX&c|HV$DH|NF85>3^0yX_ul!5S@e6L-noiFbf2_-S zkY6}?`V$M6LgT0LT@%JImj$>Cj(wp#e&M}<6p9{CM;K6aHM zcL%TG^B!3Y2plZPN5-8;dET&gR@0QD`{gSp7zILq@KbH{+-qUWmuQUDHQA~VY z;jh-hTR5|8`oetnqA_aR2iFh2cqPB+8#rnPF~#-c_YCco+@Dlh+ncq2vV%wX{{X6_ Bc{Kn4 literal 0 HcmV?d00001 diff --git a/__pycache__/connections.cpython-312.pyc b/__pycache__/connections.cpython-312.pyc new file mode 100644 
index 0000000000000000000000000000000000000000..8ada7a81ac8b4fe4e62567f290ecae10fdf21410 GIT binary patch literal 8813 zcmeHNYit|WmA-dge3N=mmMBpYsYfg(wq!f9Y&n%(%C?-Qw)(JRv})r4%^BHrDN>oC z{Ls>X*LHUmG@FglBvKj|Rok?Ykt~p3g|Q1PO8+X*Kcej}wX-Q8pxDLsA7k!9S|eEO zIWxnDC`NX>`LRECWS)EOJ@?+3d(QdJJ$J^xb-C;Wt~mFD*uQxR`2$vp7iThs3&6}1 ziAXd~rsxEnVkQ`hbtcYEaTA=*vvGb(m=JWHi(4ivAm`%N2`iL*+%{#Oa8SaKV?+{e z63Ox*W6VC`WQgqi(7e@z>k1dL{t;^-MzfrVOi#xW=QQ_;nN;L#TsG?=s&QYBq|QTW zol)Y`5k-~9q2dt5i?X7|k_l0anltK!!>kng;jVuTYmvzWmB<7m(Fs;!B=$CyIGK}p zNtop3$Rw34H{BDw#LL13cjZI|Kgmeen;fi+mu(k7Yt7BGnEzqm!iM@Q#Ya8@x1r6e}m+#NRIt8f> z=g{h}P7mX$y>9#PdmDj87agcQI|0Y!dwMpjd5|0!@s zry|d&vKWoW(^pdG zlZk7kj zWzhtn@KR&#JDenn3%Z0jr49;1iPoZLXqJ<*iXrL_RT@C7q6Y*Nj3Sx`R9R`3*^Vkr zRmm}msLunrMphi2oWq}W_?I0$OGk1$M(*zz`6SSn3+&1Uc0CB}nLF~(*}m#&%y|M? zPhi;^P?})e2JNBdZoscF2jr`S{^=TQ+ggJX{fPO{bo{5$;@`2CXoww0DA`iVjG7YV z#6{35`iY@cOC**&pGr+@uERRWOw}s7Vk#6{hl;E8@Y&89nyu^*gnOvNK>i)QQP*yJ z*3rJ~=v_LM+j;Q*&V!!>`*OjdY;fq;-iN_4)NhBb-(c1gT($=POGO{L9wuRiYAozh z(Mq2~UF)8TkO#n!R8f)&h|ral0Ltz9b}WgoWHVlv-rXSTH;trZ5huqEv%xAeZ0@oh=2QoOKuljQsl^#W=X}S7qQIVyX0^5|T?0{*Nok;qT;H^p?h+Bje ze!2(V21bvSB_D(a^D28IY&_%w_OH5%QSqvf<}One(!vGpxOF1y-Lr^frO~(I zZCmj_mv@jl*xQ6Ud!7h3yY2x1u+K3ingmUG1Ac{H0VxIU|CKp!KxXk5F!wvmJo{HX zPpm=DnULP>SJgjjyv>6LU+bXEwqIo;90k=BCyh4R} zH)=*1gid^ffwmSA~?(!l|H3>xD1oCxNMs9A)HSfqa?|9JMpR4bGT00v? 
zw&_OZ(a5~bp0|*Ofyb%`Zgssk?4uvha5J~G3%EN>Ss^9-Vw6D?q45iE0Qr3UBA!6u z*b}b3h)r%Q90Sg|3#NIDF$Rq?aEyb$;<1f_@w&-wNM9}i=D!Wu0)>S0GW%_E37&ae z*$>0OqI;px>Xb+f^5Ds&BC4rKYDU#8kQSyOq~77H?TCigsG=hLD^O`ep^fw&uTu8yp$1F*Kx{P6(n3GUASAdK6->T<6*T{&k**4gpU8CY%f z&JW)jUUPtG4dsuJJXUQm(cAtP*ni-|oMnUNF=Xg)gC#!!ZX2~<4XQzM0}i1ZVD>cD zsS#O=QMhym)_`rb)oy?lEANL^1PG47?-2Za@N=M5^?u+5@96XFBYaIeBXE@ipbaqe zJ}8tIfM^`-Q<5?QY_$Vwcv#h`+SHnv$SdI9#{lpGZWn3JhPKp;7C)f-@5P?wG^yE-1a2Jg~p z5D(oNTC))M-kfXjzH86`@LT=^?BDZYwhUm@X$ipp4VA6;%QREgYYANDo>&9EPoww$ zTkt&!DyW37G6oIGF(6e2bCltQO)uMxoRF#<6wixDH_*ajG}38{@N$W1kf;P+V^gkI;8%IKchL3f#|V zxJ&ps3*0Yz!hLMUPQyJy#=#=*c0zNem4=@YGHn9#T@+~#Md>p--zS6*STg-IyxW-> zupmEVfj+gd`4-&&3&rLZK2jahx7vsm)4I3|n*hC_CKI*HtK$)cHSvgo@53Z2ZtG8v z#~<$BjL5)2_8GB#0{l)RLPCnJ(L8H>xa{;H6?LHZ7i zR%d}gc3fA=jz4!pt#6(0xYe=9J*@9ts$XgE_{o`f&n&(4Pwo5OJi6j-nm=~y*uv4b z$3JCB$Nn{zG(4jNJ?|!UzGZ96nhjc(>w6z>Xnm}9!R_Db3XibA>I?7YGo28q+01S} ze88qhS2i=kk3wW9dJFmxTKxw63U53)*{<-rr`W5%D8IxEt?H`jgC$?xL!K|5u6grx zjehH>ngpsF{I(5}a~XbT;MY1-!-o^jYR^othF@;$A&9?o_T-+lIB_t7;%>$?|vE8HAj$v`4vq##xhu4kbadcE7%2pJriP)-_eNehbVNsM|H|lXuro(XuoH%!Pt9s{HV`%ugQYE-+?{1)6r+Sdwmw< z2ZY0n<=$R8y5D(k$b$R`K0qjd--Z&ADOnUXyC_a2rI|SLPEmYyCK5L#7EzRvQE2AQ zCX;dfOODbE%TZd8;OjS~9Z3fgd|jil@XpeBImcn!bJ)gDLcB^T2yr7m;=xhl^#iKBb+(Y6Pn;^V?KZj>knYL>1Ttj87_G)Sex;5Du46!IrKiORjGu z+c$zOU3sU#yBDMIl8$%h>nMy~Y|IlZmpbmUcTe86W<%kE-T>~<(!LVtS=loL^pVHQ z!|Ajx&qJBFlKQ5+4LLh$XkPH%^5-2`bCQqB_Jo#>{PeiN zzcj9wt2TFmhdy~f-Wb;5aPnRHI-sn$E7t5VdmiO*AG@dELe5V{!t@cSu65yX9*d=y zKR$KmyE^;plc>|yycec^WV4y`LRm2{*6hUYDmakaR-Dam%;qo8<~L^Z{|U1Na7nH0 zk6a$!r!NUgJZQA!Ey$HkgEc2OLBBYo2H_9yh)- ztNoF)Q3EN6AI*YU8GK(gfZ&ABCs66vzqaUKW(@ctBsC{~d+j&>FNA`7sPZflM1!{N zTs(O;5;s1=l)umFuUn00@nSdYmyAGgP!VId*BN#lWlp1rJO$R{{X&MWgY+k literal 0 HcmV?d00001 diff --git a/__pycache__/env_override.cpython-312.pyc b/__pycache__/env_override.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b185770a0411e3dac7f08bc2f58f73dcf5a9c57 GIT binary patch literal 12979 
zcmcgSTW}lKb-Tdgy?7Adn*_Cdh$JKtvP4QEC0Q?0Pg|m5N=_r!K|t=31dIp0yOd1M zqNWpPM5<~y1q(}YMAW0z3Uk)9AF5C6HLJZA zj*NN>XKCeV`Np=8LvLfPuM-z&*2XzGR}C}{8(916)CD(N!8$k>OLMMaigmtjy5M0e zSr_z_0nR4Yt>erOzY&q_qqd8xwR&V9XxQbAnUYj5%$UxOEuzDsyDvn4ZB8Fngkr9rE zsw*Od#3Vm78sc6a562X8hvGl`g>(JS^_~cwdH(dXp(BR}j`oFm&-Wb;^`1D|`;6jO zJ5QeJJKqbXP;dX~vnQWD8aj94xub{s27ZC(Ig~wA6$NAuBm7_c9z_tB>B~-F5nzfK z5hsTXBoQ}UaRR|eU=m}TN|A9>$`CiFjB!hfid$2_D9~q&+2ZyqG?Xc^QiTecSgn_| z!<(k3op6Mv*Xfj5%x!;$j@Ikr^1soBd|59^rne1Rng87HJ}b)T^Yg#2?Jy9M`458; zZOX&bOFGM>EQ#7DK;Nv7&HuigEv76I5pB)4>m~l%d1{p&!J3i6k}*f0E5B)6v|Wds z|BY_Rm-Uilxk&O{X-LryeN_He-v>#Qp2Hvfz4y^t8j`l(-)kS_RSwRa(s3LiQ`Qk8 zWfQmS_=`LAQt37Iv(zv1i)AkR|79|ob| zM65bomV61gS1_C+zD!OK0m~U4MMA)&lpRfo96vBVIKqd=MxTu&IK^}Hm58{m#e15I zC;5r9v2Y?0NsOqYeCyi=#IVRI_4>546XIwx@p*tb#__#LmK)&`w@HP3UNQ5WIL;^5 ziBc=z;)DNd{{{l~4TCO7QzWpOK_a6lW{VO_>RU@0f9nWJB7p7w4j9WM>aRK>nPx0e za}ijm>PN|UNrtFxWB-gfs_G2ud!qFEI@Z1k8iMH7+u;&rpO0@6e`VCmz)Gi!a4MlI z{84c-7T}b_qX?%PrS~qGqIgb4Nf!RL zz;sKr!va>tsm=-|$HlR6QQ#|J6N(A@1jVUh6;knlb!fyjSECZZ zzC0d^Vusr){m1Wl zPCO{9o9mlz$~5hjoAxf1?JG6u`FH#7`WL&7r#nxi{U`5vPOWH==gz(P;>{Po8M^1` zSalK=O{*@#Q!!1i0p7$fehn#5r+8ImEf|U<*zqAK2P3><%hmW67^-Y49y6o~mf$5# z&hU6Ic?V{A8+j*YF36O+;+YpCya;MxWRhdGqoraIK~#)_$n#Y=4V6xX&H@HrV1X&e z!n_y}BgsTah>Rq{AjW`qX#Egjh`^&7qL^YJMg_hZ;h?^z7{`)hiir;=MmWV7;}Qym zhs@(s7jP+*z{Q52h`R>T2CHa%qPTG`is2Ak77;K2yaZ8Yo5EHyqQIUK8t1C?6Oa@d z>uUL-Lp-WgO3f2FkAE8aQDGC7Kz2QzB7R0Sr>W*o4Q7-1zX;4+geka6IZYMsbiHf5 z-SgiB6hC1In{Rf^e&m+m8K6rEN>~% zk)e;s^pQ2Ap~606TeT7ul{b!GKb|SyCYNtpEDuiGR?URXdCh*s{(!EW6>m&lpPX;H z{%7g7o+WzkGVQr`YUb1|KXWESZ<6Uv8M;NLTNX~jirINH_G_*g*OIk*)$tkNd_eDd zL=)9p7pu0VD?6t8m+Sm#s``P``}Ix@m#-cCcW3i7x$O2`o1B@PYn(|ADQvB*S0x?#1Tr zdvs5>GkDLreHlsECev*T?o9hWxqaVa`_uR61FLSr)w1d#Of?y5lT2-zcW0PRndw|) zb}do6A2?jouP!<^&F}cY(UPWGWJe3%h8*A)UW$x`hQ`^jDkO!W!WS3cK-JJ$fVcB9 zak>7o|8j%+MjO!BNum9VrVQA^VA@bu>f@#>ezlE?nZexAOdV6q8n>m)Bmr-_17=B`K#{6to%i*G+$m`-oY?i?GU{f+m%+ryRJa90s&x2aRJZ^f)CtijtrxNzN$B zw4xNn(aagwc7LEPq-4wZQ>F%;p1fKjPBjt1yiUBw#EH7vz-dK~l| 
z(7O6n&0VphZSZV(0ubZTxB_RNy`Kjaonr?$QLzq=538CIUk07)%uMjpX=djCLg5YS z1_7oVl!{X+b5nvRF2^RML{5;jdqu2KISe=-Uircrmcs32?2|cfswXqC)~b_ zt3h@(WL!=n1?X~7N+i$kd zpILHuF8doZ{w~?ymGO7W{_eZg?{zKuPozC3W^Ir?tlB&uS*&_0?Rjd&SGD3USM|Gf zw*Va==tEn39O;T$PnF7q zVXOWAq4RgC>AnozEYr7GjykX==|d~6b4^u4UqrewBh?QX7{y&={PKrz2vUnbj9KnV@Y}GedOE7LIpbm^CImxCO}vH# zxP5&%29;~gFfGF))32!dWbmjtD%2qOb`;}eWK6O095_A%j#bR#V=TG}-PyYy8k|t) zD%4Bd1enqh^hFBX5K4Q+{u~!(KhJ}oNKou?4%{mt)kb%~WoAQ{v+P<*GXdRFssll((EOzwyUtt055eLJ z`HE7J1Ey_4r6>3VKv9OgMrBTpIl%lPoR(GWX>{ev`w7hmyemHf*NXW`Z6 zXvd}j<~bV3;5QVm0vcdn0-5`ybVzqoY%Y0#x}Km*VO|1w z!wC4xaJBwtia-}d$IqzeW$O74sg4h+&XqFX)TxIx&GUzE9e?Y1re(j}vVXDVz+%n8 zG*!9mtIGHSvM;dc+dB2^D&;j*e8;y+z|TA7pW=^N{-m8~4Lq=VW@{I%_1_wue|528 z&!Aq#UTJuB9QyHd9X1X#=kIeKeGW+h)`&W%b zMFV&x8auSs^xkI|nbT>jAG{t_&6&y$xw0cu*)3Og-`~=?NO#>1%(qOR%Q*duPX9tc zrn|sNQe~fZJaQ4vDuB}v$oPY@KX|7uv!hqu(ffhFcg;kVc^~?F=g9kYfrZBJx881D zs@t(@La)lPuN_=ja%2U;uKceYKKar9ngDZ zC3fulTRLs6|Eab|-5)`H-+kMCkM7ns<{@f>cjyml_5%`r`@@?3vlMtmW+mC({IQ8} zmMz!p2SjYOs|0kz>u77pSX*RkOM3I}CF>qoq$UV6tLzV}JMuGOhO=yT+ngwS(Xq2! 
z-GMO50Y;S_=%7w%C^e-eT^me$gEAe=&Z&br4faRXCFd+q>8=xU`w0zEZ4qt4DqVo+ zo+WGdLuN~9D$yF`Y7LwFzQ*~MC0`qEpk}Uok#5N10c=H8Bkm2IK;>HdegDqcb9dOg z;UA6sU?g*(Up~)`EwDV!Zu4TGr znXUq+TL$oSE&Q!HEt}xX>Q=qrn9EpeWNS^jwqwZ}T=DPJ)Cd@wZrm-?yJ7VP`v=zA z)m~)h)l)`-uDs698D?48QJ1Fb9)J9@fpG46{ILr-=JChvaB5htg4_<2eEe~_npeYa z9)C;&k^-8|{}}Y0+GhW!&6`i{u>ZKid1|-$$311I_L+a;@jOGAf6`j^jLH1|rt)X( z=J$L2&$!M1Ttz_+a#htCF{-a=^Y~cm2~9Jsue}RT`DOe-17q#E=zGpBh0%BTG$ZSV z(p*wzh`?LG9c2YGn=Ivqf(U2M5hdC35&0-Mdx}#ORf(b$&AFfQ9lEx}zYp=foJ%U- z58Aq&|Dj}$qC_l8IY)(~055w1UQE#hs79MMw&idrlN=D9GQnys8%JR)K0vCE?E&fwXz5N7%U0J$BN#=tY6KsGLV*zsQDM~x&K@3g z=2AW?kTrL!baPkT0l1+&#rOq`{Btm0F$t(zZ?t_idv9y;w2j-!8M+5?U*HVQ8oFuk z$sfFM=jd!##@8sM9*o9N z*m(>rDAmS9Ny+0Qqtp}`qCr9DR{m8SG=+mIiU+ASDE}G`_%dc+!3?=Xbp$CUb-dy_ z22Lo%`WcQ-a52RJJcIG}STf9Nb`J%-3zlU6Sbw%zapp##eWsYfmKlzWD0SK<;8O*3 zSQT6hoNpGv@I>o0cQXXVqQ{?A%hwoAOekjc#9=txAt1kE&(mCS6wq6-fKd|`MLytG z*+bh{3U`Mj%hRQZ0E#67#vR^q1Kh}}g3SWHaCDkYu6PPNaFhHrKvl}|?&fb1*w6x) z$>>b|(t0aQyMfgkD3Sdzf&Xim0y-2S{2DTFZ+!~376cX{Kx{ObKebXO^CJQ>2uMOC zG%^#JI{v^?Hrw{CU2h(|c`#GgA=hh{ZZ`KVo)qtArY>cO zFTFO;@maPxm-bL@pyXSuN!Vh17!lL_R+In)u;l9>0)01{EX^Mdy>-%Q*U48n# zwSMl!JLUJRyJ1Zg%8kJFz}$2A$prv|?zF$-PTxIi57=FW{a?KK#U;x&wLf_0+&$}F zwcj-@UYnSi0P|o$T(Wej-JN$^?pY5)_d|<+j(sz7GqPl9S9|up2MZsAo)w2@qma#M z=>Gz2vOzy5vteV@&ms6B+F_8IBtr;is!4=!q?Vx7nk$o~K0{(DHut6LF7Rr?+iZX@ z0ZRm#^|x6f3O<@Pu<%dU89Aj5oql*7PCb{qyu@JY`N+y*XjruY1jlV?<*!;u&V0UOO( zKFBfX9Oc1j#xm_5yu8n|%* zyjc*Z8G_HN;F1RlqA>W`v@NF{OnNoZ76g@mN7Vo%{jmV8R2BSyZHm0K2$ z-m~qzU(+<-g#ux*rgNIQ>iX13*gG?}ow9A`uLOvx5_hW)*BJky#&o#JGHKcw>+;Dh2Ci35aH3H-@P=!!! 
zIuob|=`QfspsYCXE3zQSZTP&MM@lG_YK!KfRgWm7y_ER&E7p+ekPC$%MxxG9Ebs+d zgij{&=cc%alJPNMw~#ohx>41yefSQT&tJd{&rmy3Bq_fYv+qH+@r_nvBmDQVuU2E1 ztmjzcl~6pXDy4T2!r$O13w% zWy*Bf{hB*NRLVr<64Cl`c{jOv^@QP+fvjI{+q%&A*5sNIE06k2XGyZ`&bd_rf8PU% zj=yWe1`pZwsFk3+vu#Vp+7GEd;JTkthro|@l>8id-|C(-ELrQo$#Rr5PaVD7|5|^B ts+FnQ4Amr4P3h)CGIbc-I>F;4ya`LxmgUxiQ>X7y&8sy8Ri{$m{{RFl%sv1B literal 0 HcmV?d00001 diff --git a/__pycache__/envs.cpython-312.pyc b/__pycache__/envs.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9515e07d12f9f6580bb4980d3ca1d27747a5fd39 GIT binary patch literal 71705 zcmdqK3w%^pwLg9)Gf5^765emZkc5OlNJxP2eoT@f8Av7}GvP699f!;SLqkH(On4P4 z)@s343qPxXRteflxJ98_E!C>jXZ2onhKXi6)@#-J;Q9(st=6mlziXX6GbhPp61X(I z|2rSpS!l410z~ za;F_nk^kw^*)!dv+*z*VmAIixW6ySvc4-c#N)qx?C@)EJC-2IQkQMv;v;gg6xqd3w z*W1Ur$D?K%W9jw@?$N+9MB5YHqoQqRio8jxyiuH&WuMHovKiCcr*Mm-Mfs^J`o?gc zWS_?6$BGiuRV|L=ymb2vE-_w|$WfJ;z|Rrv)Xufjb(Dm!H&Z;7g0xoByrNuqGL$zIRbtc=<07RH(x+hlLx z`)y`yi@lMtt&G|28@c3NjNNT-;%oOX_71z1F$ZHByGsyrv>>Ft}Cu5AL_ z%vg)SwlL-r*jC2ejA`t4aCbX#H!6oaM7g`T+)jbr&6tO=UG{sJ{(BkQV1EZ=t&C~x z4);6Vqx!VCTU>Ff=x*aYy?q;c9e5!kyJdsJXY82f<0-p$wt8GFqBAZx$ljOp#~;cF)t`+xR_ zSVlf1u=g_dIAb5SzmKtxh`$SsuZXt3%6VV2f0`-3!kA<~$JmREW!bwJ`#NKK z`+08b8;pI^{%_3fZ!z|5`_o+VJB+<#zkvIESKQ}&ocHhc&v1$VU~IPivy6S8F}?jF zxBLUferUhM*ZhnH?9VWEl`)O|b7(n8El0Itw`k+@+{TXt_AFyRW~|5l1=I=&dS2$d z3i}tiM3^y+{W;Z${e<&gv0vsp|CBMk{dq3`p91?5V?Ptvml=DNF`fMd=FHE3>b+T~X!zO4NFhYyDbaUuWz$0{aGIzZKXw8M`L1Z!z{?0{b>& zzhkVz{vF1C&zQ#klIpI1pgj2D-;Mg*-*bOgu^iB@e-y3#JJ<24>;dNZ8=~G-F8No11sVGrV{h8Kx#Zs&)7XEcqW-@{ z-j7vz|KPj|dk?qqPl1IP`xj$+`^$tK$2v#U569{wS0hUOM3hQmTxWlU=*M*};}!Ow zit9SYv+Vy#bmDq4;rN?k|Cu`je^V7{1f=JZdi$%QW}3i%F7R|wGsFH1aj#5~Gs^x; z(Rvow(b-=UC9)Z>u>VT5F`99`{nr8?Bk2x<;7b{8S`u23>&weP`{`!FSg@<*^?TdP<_6yMdpYK@vV*aYyFGTx);r@vgICF6h zxb}uea@~U~MGr|I(zqOe<}&n7K=T-K0xD)`8=(0NwE!w%$OWjBp&ig7ue)}*G&>~M z_D^b_)`GeCq1Bxzvw+Lw04if>I-qifk^wDbD8>B-N~Z!}#CRH@#SGCbsvrn|m*D?W zfv4Ga|kYGkMw&_;&l0BT}r9v~}2MSyG!l>lmHs1(p9h8FNhEpQPxH$Nn) 
zMyd=sTewU)psfTk^$UU7Ie!tLyBJyw=x&B+MDAf|DWG>SWaN7oU27l>Yx_t;CCWIs zOckJaGE~E*Yg{JeI9;Y4Nh+Gm$lJ!H)&pu`s1A^eAqyZkLyds8Gqe%V4u-6Nb~4lq z$U_kKXEU%}oWB*&y$sz2sFk7RfOa#q0#F-6)qvU=S_NniL#qMZ$B-M4%#a6=mmvHl zIqDGbUcmdf#Jzy_Gt>&`07JV09b~8t(ESXx13E+yQ@jV*Va~q~&;ty~fZoLrwSI)5 zeSqH0(0)JEHf{=dLUt;J`xs2nNwWmCxA{f^dUeWW$1B0A7khvfSzFJgMdCx z5L15~*e5vug!=~c!Aam}7(WfDlcA3SI?K=#fIi94Cjfnlp)-JfahQA!}H51 z^Cd3xJfJT#^d&$qF!W_WUt#FCfWAr)bNY9{zQ+0209|3|ML;hy^mRaAXXqP%zQNG9 z0DY68?*RH1L*HYnzUO)gRDHJ(RbS;RUv<5Jvfp;S&_}OaLEd+`{(k^^i6H#_F8+T{ z!2b^TKe)vA0ezpL9{~CRLw-O%Waug&KSA85ZoX$XYu5m-TgRxdN)Z0i zJ$}ZyzX0?qL%#&{bB6v4&@UMJJ)mC_#J&0hu-7>MkGSJsTz}kwNDT6R^+~jO;nymC z{U_x9hD-JW`Yl7(0bOJ02B809=q8}wG4y9Zzh~$TK!0H9uYmr@(BA<4iJ`v(>SgGk zfUYz2FF-d4!e652rho~5UBF3z|4gu$Gg{z(5!ZBp-w<#z;J*r3PkaTg$Nv=Z|8MG% zL|z*HzbVR-m-u%9j{*GO0v-?e9|Fz<{7(UA0sfbOv*EW&0v-)mBj9m>lLR~guvWkm z0qX=j32?H2Cj(9)_$iJ3ukOFWKPPWCUHF0Q?P7hBymgn!5140!GTjlYpHnaC$U2|FNMXB6;%9?;Agjj;4J^7AZ3ny|jg+1l#d*6J?O zhDX`9Hkuq&wWg{KW=l<2Yiv6ZPObC!+_JMZEQK>S)z#HIY8$LJhq*DF##m!RlPx@! z!KTJ4N2Sqfax@xkwd=y;II|geeM7aW&RW&bXySEm*q{{0Eh?MWtur+_HXEBPz#L|a z&D6BXSQj3{B`j4{bq;F-w-?T?t9I0xE1Qf>TRG#JgsqWqj?z>^RlUPnXEHT9>P_|N zK(l3o!)msh!ufoCQ<;P4XlydAGuPEQs!dg($YHeEOctBD!4jUtrPkFMt+kYEacnX* zS?NN0T|xDgwwN4dkZ+_G9X4~l zsiE2CunMLb8_kXlrmf*zE?#Y_Y_4(CHPkpL--W`s4>^^Oz}vV$AZpv=yo-< z1*xFu3@&0qOcq6Cq9u%A<0fUUabX^MXNRx7NGq?xfP{5zEiJ8Kt#{|{Z~-?#JZ&;= zbtpH-jjD|`xRKFuKt(Guf(YBMRwSPVDj~bG50V-(@<0AZi<% ztIho`3w2u$-GbgV)j5oHbq$-Ny2g47}Gz^=$-(UDRTqN>~vGt&aC*U)rLM>kOv zdwo4D3xrw-WsSKqtZV6TIfa(68JlXLSrv``rmn2KWU0_D^+vPBp{RXf5cKsRczr`9 z^txqJRaIn42^wg?p;sajCP=Stz|;oO&1RS|5NI=Pv4snH6m2Hh(tfoZ7IRIlt!}Gh zbFB%pOX21QvTepX^Cpu6)LvkaZVUO~`Jo%?6z! 
z3|)*sVeuO4nxT)aqHJA5laVw`LminqMLEDgi2=fRK}4C+m;@vQeJ+T+CI{7+r7(_b zL@yA}o2nhwT4N(@vfypsBAd;&S_}@iPF_Mo(^l1}8DVe4u!*s!QK{T&Q(T4GhK7jp zTZd-hJD@xXqnHs&2cqQgy!86zc}enZ|M`9#X8~3CtQ{UaN(Aj@%+*bsM*mTO}HR#;IrdGm`6~ z_NIs;&V-ezfp#OeC_I&K(6vy54-fC%Rsu#oD?_zzOagYnO z#)$LShj7Qb#zhXJwcb%K$H z=JK_dNZXFy=&-_FhWUmcYcpDG;n8rz`{Z16aluXfvc*gko-i~q>!52#sIWpfD26{x zN*uGSO6?#h>IpIf^}vqhH3L{kHVvZlNuG$D~OpGK)$rSO6vAi)_F;QvxrQ8&?MHVl_6t9wG0vA+HlvpOA-G+KH zuee&>TwjmH1=&SwIK}IeJuN=?PF>&v{1s1YYgh+IrA^o{;s+{`6A1SfiyTwURxyw2 zjZIJiiWY&)nX4QSMTK}Dq=l`0VU-TacmS0V2r_ESWH^Fu(&&FjExa{ zfQ$9C{*0O^)dkEx7%*}k2QSCjGNsTSi;-+aYS=#%F#Ev;V=PL|!R&i$~l zI`6%nJzz{nYpeVP;*n6ck)~-4Ti@I$ln=EO^JZ(B8k)&Nv^6wtP&tn(2cJf4U1M2A zeD`0ux-`+nMd8#|rw{(pfuf{vx^Ji4-mzono(`Y<6{1z)Y`^;nCklap&dEaUsY;OG z^BMw!!E1CDQiXq8FlFj7pu&#BS5sE>_0}2|5WA_V0ixTumE9w2Z9`qPFx!ergIi## zqBR@lyBSW6o&ET5T5EfYvvp^?*Cz;}^Xi0Xmj;&LX)p;PjU7(k zPPgok-L8%nx3|UVLlmPZC7j&m^z3zqb$j--?hj9Nx!rr*dmKAGJ9au^Obd?{`K|5y z9D=PfYq)XSTbcIf`i<LfEJkz( z^B%?m>Qp7xmQ6J05qU7d#PG;k!#WGJSy6I0XAQ#9Hvg>#SHJpW-~7n ztDE6zZES`iiHOLYK?+qtEjlXKl~sVx%0wEhB#i}|wJ3*PB@QV*1B;Wf5+N-N!p3HD z&Lb|bBKb70q311@Xv9MCH7VcLWTWs{ld*>UCDJ)olW9Y=pt2+r_Lpo0+Z{(umGH0Y z8X6jDN|J6b=WZ&Kg~t;ODv>CTPO*DgyS1{(3@1+vlIVeGKN3r_Dgh?ApOILAiqdHC zLFiCr;7CtKgjlSIRcaw({Wta<7sZ9CHNvb?5KRf{R5c^eAjXvyV(b!WG{rVhx>N}0 zpoM5;g@i?Yq55_8MobtiaM?AT$1EGP1-=c2&ITH+LXyE>_-Q0#RL!~-{l>aLH~_Y0 z3z&|nM-i}f&|q+n6%nmzv^g3Oo3n0(DXxdWVdBX;c#%SI1?wB&&56R$ ze72@$jsRIjY?AA*x6-PvsR=%bu^C1b)20d{Xu}#sQLVzC;~=WyyC{nd@&*;-ZrNmm zWK)o}|8<3I2nwQL7I;3?TM8LgLZmi0Fle<{GbxQURIaC(A$&)OF?do7QDF5i`abA& zsDpZwN5Z08Xd{w;SY4Izpf!ClkCM1ni*+tph=%5xT5{}^l@%z%LdSvtt*{xQGzA)< zPE3}nt%^fov23clHl9?65^<@nB%^~twANe2Ai@`bw`74{q!0*4U4?E_BCzm*$frS! 
zN?BEk*#o~0V}!9FWkKUX!VoTD+WQx&YQVzNYyn|t-&zF*J5ZX2C8jZyICez7lzh%J zTdL}sVcwVoB`C`hT*=PLD86A$l^|S%jz#da?;?YIG73}9+uM#WLHHaC)M;4p7d78$~SW_{X#8Lfnf#ZAzISe{Z}S>>v-6_=5UnYJjS z$U)hNp(YA=$*wF4ThQa^PY;Mr+T45+$w|pa@ofI>63+`naQf@g_@d}83TpNH?ZR28 z{=Xq=r1dY4yCg-P6me-I=l&yqtx}QB63*IEjc_S%8Pqn^o5Gq04yGu33g=F)hlL%2G;Zuz`)lWzOwf>Ph^J*9g*d*=I`GOEnq-?3!b z(tDSk)W6MJN$*<_sA4@%!`dH?%%*UHfZa%TOXF_3MLSdISIH9;}XHdW7X0kRj zrB~7>r^tn9U^#=jU{G5X{MXbSGLZd-eMtBmLNB)Ob! zPd5-$j_pe&7p%bb3@S&_a5b6Pk86*cK9nBJnC90_OCX<3sHavQF)Dr3AO7u1i9oTx zE)6ETP&AQVME+u`Yc28#6b@?^oY%>VL7>v5Sp?-$(oc>ixIA9kKE@dA-s7Raw9hCn zN6|6(@1+5z$%$?~UU|}d!tBpkez__*er<5HF_=;5*HsSeqYia=JURiKgbKu?i_&h* zVF`~e;wW!x(r)cxjRr@c4{2Pw2<>vFFRIug2cx~%Xm7(|fWt{BD@AfMrTrQ?Pb2yG zaZB7n+LcLZ-`}q6zBpm7)I(9;xsvalBT3+x-Xaq_A*7RNTh!t4I<`4|EjuBNd!4e! z+2*55$n~|$$aUY>;cRuZ%Z@hpKDT@@y}YDsLCJy=mwRtmQzkDZ7JI2VeRwYKvhHqo zxmz9X{q7d*ZpkIB&I4}QThi|i@=DYTPvi<6K2NK+M7bHm@+xYSMntAz45vb~dgS)D zuogN1F(0bSWx}H(YL;iNm&U+g5JH>{E$(o(Lj@}XEmx!31pN1IMzJH(pOcc3Q~sPH zC9k=zmooD@S9WJC=w4>&UesvKCJ9pNOmH>=)5jR&co8fnun7f){;EM45TZG4ij%rI;uUIbX0Rx+OETqjm*Qk zLpm~h&QA0%9V{?Z$!^3~3{FFkxPTUIvMF87&7~1#vJnd~>L%VE1 zFoq9?D1B9wxYy8%TCukJ@P4^r^G-KKT_TJ@O9t=G_KsE;kwF6my2L;!Txep`pkhRw z!rtv2Z7oEI0XwH^HaH^8^%`6VSTZ4U2?oj15*c**ONw9|{@+Z&r4Q@WYBHXWbxi*> z%F*vInWCg)sa7ebM31FJuF9!N5-_Ex)TTL>a#(XL*;{f*isywz${kKRq^Xu3)3zk- zkXn-N8H>AU4`~zCIFa<|c%5`ucUaq|J0wwk2xQVB-Cm76A*SS9nnTjZ;JF>sBh?<# zAk}?5HA3ryxN61yH28m5yH6@gw!E2bhDL(O3Q;LMm{Q0LesfHTVG~~v!@t}R)|3<_ zv$|ldAJ#{tA)HLYEQixkp~G#G<#xF!jqP$c%h}>1-4>ae888Gb_hQJ{3s7>yqm)ch zMy>_pkrO~5FVK{=HV&}&?$X#LLG2dJc%$I}3G->u>pEGj9l9e1m^sX)$ zSemq-w-XKj6~<`v*pq1|(oW3@j?O!p9LgAdbX92F)T5TyQ>4Puu6^AFOHNOEV#?*s zN3(vOJtH)A=9zh?=Y4Gc(G4#zuMJK&KVNbw<Q@aBh^IjP{Jv4S!=j_f&T`67ryLRmDQUrjh=8c^KUmMNi)`KykxRpOrB(1ts}Pw(-PxAY*84*tYW4$Q;$f)y%Rs7 zqeu5=iM$!5$m#PMQ1b6bq~3xF$rGN`cDl}{zaata<)lQ)%<=1HD3&;`23?No@inL~ zx@aL!rjI5~ifU4=^VcAPr|WHKQmzLd&j)=F~xfT4}*{f)t*!>-O~UXbjS2C4Z2+!F?u6Vq{iXoLt0RtqB0FBhf)&N5T+sZa0*O= 
zj_MyyJ(PSXb*~0}TIm}Vxq}Plg3Q9P3^EIcv`CZH#z7|?N{X5X&Ay1CNP(e9v4l0n z@&*jS!Q?`Nr_CVO0ha5L6s4+60uC~@dVDy~1kYiQY@sp@lr&PoOvNN<_J&hf|9j<) z$PR0P$W2trN{Nk( z!Mt;KoxSVQxbE4DgJY@!vln-2E|2-zq~|AnWy1o+H91otW-?rT`@O`+Ai1REAg274Dz(p@nu95@WM`1?& zS333GS79bev4#FI?Xjd|nq$(_!emF(%F}3%epV?GoY3NCJT67Lxbuw~h|RN!apoBs zIJk($8^G{q8490_NCHi=?6LlFw(fCrx+Y&*a=GTpJA&hE!Eu{{*_-|P%`6EBN4rrW zktA-X2K!0k_;{4bJ5cfwDh-ps&d+2`@)pgLRTs2pYu}Iv#w?;VMast+p%M36J4 z#s8b|qd5{0TRFyCNWekY97s|Hbk)8TnxP*>Fe34n!&s^Kq5<4kbjd2^Nj?e_^DkeL z*BnNq!g%=B{AJ;~N!)5>wYet$AZ7?@J_YPq2rg$%doAr2UvJ~?|*3eEX%zYlOJJPTduxbUlDY`CM zZZP+2#c616G8Y@X9Xoco;k@0ulp2=zB9MUPq_?FVLBsSr5+X>wam}JAlndmBl@+I6|nT#b43W2KK?jjw+{22eR z)-B883YT(uN#dDx9AzHG%QlLNQE|B)ZG0U6y&pj-EJnsk`f*R>o|$`kZXh)uLE6`p z82<6QI`e`V1xKoTb(zT*oub=$aS28BtkkXO&Q+?6T$4ohu|Dl|tDWQxp zkF9$DDqeHVIORIEsaHzMzXx+WY0N#En_3;#S$b*wFQyq+YrnSKxJ>)vvSffQAUXP@Ub0?T4`8V9 zf;!?F&x?(yZ6*48H(HeMLULXkbG8)EssOBG9i=Ra^(&QN)M{DAg>T^hO(mH8WbV15 zvqk>mwO0y)x#phShCpsZckaf4>Mh!hraRb;{TeS+luJtiG>iNW>JF(m`5sE{qb~IA z2UE|I&8#69G_iFoz{ z6)BZx%%{zrZI@i#(^m{6g-j;P>{S3@4pQY8&yvNH(>`(>wOWx@q-cD77irCYzle;x zw0jYK_d+@NG9pE8gOX^}KVr==fXNowi6&XtFp8_en6S<#V+Tq(-J?30ubi#tGd1-5 zOi_wT304xf@1}%TGrR>dm>l>~%v*<>LSQS;tv|c|oaL;=ztH?rc`)Ddv*h%5H@|1I z_!CMU-iMY`bI#}Zms_-kgE5N2F(cujn%{wB}=G|HL&{#sxD? 
zLH)YdwMk18Z8pDkJ@5>#2fv{aQLM+~z9*`lOgfi#Hm$4Zk~L^p7M#Akdr};QLercT z%Ci7&M=Vv2a@>u|ArwO#H4{@RQH2xTu#4z=mCC?oC+$y$b+mC7(E`{Rl;qiz)Pa5aRlT41Z!zR$Pa=!=Y@Wa$5}eXF(_yM665R>@|k_7_w~#y3(PEw za&|@^&XxwJlwGO_W-gC#))0rYQC&Jr7a=M>(0vdW(m9Gdlh7(-Z#wEAM6B|*?&-@0 z>h`Er?ymykS!L{8QhENxQ)m&PkBGYB)=>Br>;759X%gwM-sj%E2WR}<;dJ=tgt$EN zpo4(N>Lu?qv;pyE$109noU%YpSukfIR9jvmKD$u!c9g?#y$zRE?8C#^B$pzb zyw2H*Wza$N6h|1i*D>3~HhnNPK8kueTj^=}x#eeB9ytAgzhupo@!d1m z4Ww~Igqp(AVQu;13i*SH*gKQhs|fLhmOetfzX^ike3F<)9;qjN(yfiMDSMT3wyP9VkI8q~G5J+&8UL-UK8)yV3a|fQtdt%#} zU8i@QX*=E4b@%0k!JIYSQ`Zh8pM;24t(5@oj2nXo5{F=PJ?NA>X*>o@&JkfJX$$!S zbpM3xY#??D`|9nZz?WWz0w?>rAdxKJIhIZulBHInRd|DyXV#xyf5vj!(zW{XqVDO5 zM%{J$qaIxfCOTT8UXGApvSXmQ1&}T*)&fA$@Vz4j7?@ zeb*ifA9Qec|5KvxD)h5g;cR`2BK%?kVL(M29o9+41s*^;Ms%c9yt3zNI%P; zNA~NjciL3;YZehZO#9^|UVbVuFYmN{&AkQswh>zm%+?F?#e?S$UVPv@*jm)J5R-aA z)U#ZW$g_ML)&Hkv3Og5hxp1cXeQm=Xx$VT6 zm8h;PfTF>Oxn0#kec25uDJ`2G=vsbk`D3e(tv*%Kl^@I~y`e?+s~K6Zrl#M>NX|{f zwMsO7n_TNRgYv15>3igMyhwnKq$i%;t<*1}5&HxH?(Uw|G*BZ(jsIGOUWl^ti+R2A zB1ZB2T4lZ$pgQ}r2&=@xLmbE+%<|kvB$Eq+x!-aj_hQlcBLAYgms)~zo4O0E18FAb zi5;k70BTP?o>aw7Xo5a0E2&Q64NZEO#!HbFUfrafPxF`8bm@ zBEi!!9AD$GX>tcncnlsVr68Y7k9{Q|_4b0!^N5_elq3u~?%BG*dWgbbRKU|+uqvl^u>l6PZ~Nfkg8|+Vuuv&r#_W) zpI*rxj|WV*p4jT2x%Bev;DoioY@=UqJgU87y_#CvtCi-=!y%M8S+Ayz>P^Pq1ifC1 z2Gw)v)_bkY|5CK0=-_8b5(k^}??xwXiaEa!7XQH61OBpgFKL5?b=`S!=2~=g$cDtH zlXTJ!&4uXMws?ALxa;;vj7_yx*I5o4`0+M+lS>;ZVmt$f=hfR>-V)ipgHEp=2#+^L zE>lhpYY z+Ees(HMw-(xO1-;-J1liJ(ckVmgDBo`nMr#c`&=t!ksFQ=;^P zW@BR2_;DgzC?X;4>tCwwE{t;~Dk|zb(Q4m^1qT_KnW~Y2EGfTuq8{ID^JpMixeyh3 z>h{^SVk4f=es@yeXej9^;+aYuCy+Y-b*)r5x34)57FeN=3WYu@yaNN#&sSPRU06j) zLPPZwQLGp$9RHYdeu{sg#oxH8yC{yKdfSF$ab!5;?AWp8@cXU2rK+nGN&t;sM?K=xmrDXAtzbGdoy=o5GmH4Oh-$v!AxHE2wiEA53_z z)nNJn1zM>6gVwq%u!C4X~95(_z zs2raBd@I(23)QM~QHw6M;Fv~GzXCm&uk=7Rp~*fjDji$kY4Ak=x3iX0?emcq>GvDT zFW{353=R8tLS5^8aUPr=AF1U-~Unfw6!^3tQ0@R!G)n{;+k&#Z-k zSqp_031!ZVbKd3hQV2uj#7pkmBMxTj8OLlA&c`29_Y{G?U{i_1;!9qfhYIRfLL5rs 
z^dM>j*5W?#z0Y{wW_)u?>Td(u32PS134P?B5Vc~PEvh>j{Y}lm;!WLi5}B$yMP@yb znY3_0VmkkmCYiDaa)D2F;Lw?WY;l(>sHej@@iqkSI79KgBPcB!g3@1tQaMZ$V8AZO z21)jz`WN^oMO4dKDS(~{`%9{?Z12ubv@&0dR_}yvw4I3X%OC<3-T0&zhbEStU*=y7AKn_AXX`GE;}rfMG3Lu?=pG~uMlOG1W1dAk62`pz z+>*0Pdh!+r@)k#p`K)-xd_^#`CSuI<7Q}Gni z2nYmi(+tCK8i;tsaK12J<=q*tEQC-8^|er`%ikW6b`4KjW}^oY)31JvW65Pa#<41> zUpMETra_~iald^y@^-wBZ6z_WFub3!rZ>6skD6e{NL7UOv|1t($X9Z5l&e#*eKw@X7g1RG|9X8u^&T${) zx;W0vyyJ@SUPweZwQK?Z5_=dch)}UZp~b^>>;$d7QWVUu>oz3vgUB20;|IaLQ-1N* zi1Ci{48Cq`MME+@yq5SF)heQ09HXitH6GpGrj@Qb>d{9AP)iATv;i1K>TydTEw3l7D3Dgvl^0Aa!EV@$(t(CIswRdx38E1fWgCFl zmRS1DM6Zy34hDww1zkEwzX(G08IGv*uS2_iB#H)#fAJ(rEj>7lj8LU29)#FKJ8CLn zeywuiG`hh&D4Q)rwqjYIgk2IL<#}CLgvzs)ol_o2Zf(re=;oLqKdnSVw_aNH7q=8~gT{*1K{er)$jDQW50@ftnuu?LPl zaB4?)c0t#|uKT*=^GhyyFPA;D_sY1>9Y#rdDJDwdtpsDopPY1J(#e@8W~w89m#TuZ zmIlWx>&-x^ffME}5>e`%vI6)=qS^0^iRH6!_=H$0D(3u}x4@uewgeId_>3@fJe<0eS zp`F(QBdcuI*dx&b``jwwk&M{r73g*?mRCP5k&UziiZ`E(waR>dT`>9vtH_2}%(w0Z5 zSG1{2zETd}Z?fy`u0ZMn7=eO0@RbUd3SY@TeqJE0xF@YVkXC+aVK8k8oTY-L3B=NZ zHg9D_;`xG+12?gU+73~#=%F_G6eag1>e{2|+D$rSta_TQuxk>$lOFw&fPM*L(U03|^~?C|8euRX~qW%bF|r8|N{xx40KaXej9W=)>s5**Kk;*R?6A zU(};t9?)ZZd*u}jMVz(jTil6v-$EyziqnbGOB3Nx^ypUy^sDh?_!St3IOn9^;!Zqx z3!V5kbV45Yc1+#(+ya#s-!>{AA}WWe%bx|6{T0ljOFIz!?9o>T^f-31?xpzw@wUbm zG#vfC#RBnO+-SI+cs@=i7GBzn&D$P5{ABpcrk7>}^qWT9Aia+|F--mPRg#x6Z--Pr zd<(4pIac(ANd^ zmba}R$8Mn?KZq;(uu5;YgYf=a=)~1HotWQ+!KmobuMFsM3S;#ZD4{rOfe~0yK0+NC z`t0P7(TAJ#I3Ttj@$Tab+b(*}d;Ciq@K|-Qq&YZuQ}^sRXNX3C{zr#Le+cx;hPOlg zeBc%&B}`OKcsrrXb z;axEchB$2Y2AYpN;-cC>njOrq?KYSPTK*`T*P{JCY`#r8A4fZK0f}b9V}^f2^KxNa z3BKbbeS}$u{kOy<{e5tfMnFG(koY#Ve*Fg#`JW`Q10r^JoNvcL(!r$pW%8F3%@7G5 zaOCtx`C#@k^X0d}4wb`E_uHxs)Le2CL9*(!eW+n)Gr zWBDb7Ea|EAc#-ZA;O9q&$Io#@yf}HJd(s+t&c$@a8!XxooIS)Q_FCNJPUxT~h|@_6 z%M;J{i7`gMV2$o8M78?rkv8vt&WyIW|2;>V@Pesl{Dxgk2yL@eVtqh7ukw z>BrRgyP&bqwX!Y#3Dg?KqprC`SpO%?D!U8g_^7u^P}F2A+uUb}po9T*e7Pn{OEiF9 z-JLfQRJ@^-$V=!VZ<{T-oEJ2#?w%g!4alizmCW!>u?GV^rTm6^PS!`Z 
zfPcG4H}w0(z5q!(*YUmr8RwY}cuQwd8sEa{`+z+@)^)%ePQ~{cJ6iE!LcN1OS%?Qr zJ5hSjz5Tf~R)r{mWkOz5pbIW&FJ_$2c(yh$XI0Of^?^A!d$j@b9q&z>cih^2lIUdv zoNx$BK)JjmuH%9$|C1^AB&r`wD=S}u?_w?}DN8Ku3vjc@#2b5V%GoLYd8@BvbkC|A z0l(lW;?&T>UQV3)+Q`_}PZLQWp?f79yrm#1;vK7A`rr=EwRIOH+C8X6`>;$%m0vt1 z8gGZG&#@TQ>t4>G8HI%`aVLoK!Hyd(r=d2W0#530ySVH8u8VEw+d}#|U3qxSxHS4Y zmC{5n2Dv>WK>K5I7bOW5ubBq=wa7q+v-k@kRc8KRy>(#=>PLSlsZ5bO&Cn{Ekt~x# zGqj!WcSjPNFTlO4PeeN$6+M=%`%c*Q(fvv7d-Ac4QcN zfv6fLBe#IQn+8;XWaeI+dw%Z4lJj_bSAJ&;By(O=GUvsS%&7W)E0_150cm*Bzk)c( z=gs(W|H?#@?H+U~vfiJ)q|bW4woa_~`o3=g?^(bn_ULiLi$pnyiM~#TxzFCiV5-hT+^;|td?jbVdlpK? zX!yr12SS1e@ED;C}^F}p+^O&CRLqk;KY+=(3Z@!pA6)QcZHsY|{e|Kf&< zl44)fig%tja=2ly`Xr{#5>CcTo%SCHk6Lfs;%KZh+HgX<-pZ~%rV76bBX%@1;d{en z8m!r%4?dhPq|bi5oQ?&aa&&DAjw|aKw&%Gf}can!w9PEMNc9R{O8A>>@vT!FgULvSlAf%D?%ec{^y5B{(d4~ z9F$c*yIFDx&u$_{NjfXu%UDOC2hR?#2Zu)J z(N0*qxNL}s>2oC5YvT&`9Vh7<#KNIh^j{+(P@WxCKJdZO@r8Kw7Xh>v^pSvrk? z1b>r=A6g%OOEv!ERpUP1|UuONYA-8X2;ANn0M;L}T8%0WGRv?jJOvFwF~{xyxk6&r*4CUmS~B=o`$ zK}0yEtYYEPMMG?!z7ZRSXJY}~ip%;wS}a_`v7vQ!Mr*Cvvd+}xFxJ&InW~$sOpbNU zR(zDuVXJL2S!)~Ws;!Rt%043&6|w#ygLuKXTVFXSD+Uj=+rgy!57W0^cX`{}!pV3? 
zrpp_9N#>xVCM9FkD5i5ndcpOzUC-}|hP5lN;QXK&W41;avv59hp}xW7XliJ-nJqOA ztGT`zYcXPg)z)ORnQFH7={vZq%=mWzxXtiWRQD%ZEh=2tO*)=drt%}?(hjyz}hcV+rX?E+t`gzBuyscAVs29Or#+(cxUHbV*DlN%f`g_!Dq2BjzW1 zhYsM6BBF)GP|YrWHe`@_H3AkRK*SGiF~)-Uk=m~gf+W1%s3LJlePWEA7h)4X8Y`PO z5^AcODAg_;%2Z5?k=jQ`B`fbZ5T2I&BN~byBk3~(M?eNbbmO6AU>?c9+qS%Td3Z#Y z5s~qins*rS|ajYWHAO13Ca}%wgsB==9Mf2LI?8ofK?Ec9M1EUu8j9MNTwLCZq&%gLb)x+wc zz(7Wfz-axBx;nHTsiV;f#2u{>Ad()=9|n;dX$+Le;05&(sSFPZX5zd(`S{`mK}EWQ zqqny@y*oW^+uic~mJU~2+rcra9G~pN2X7CKRb{nw_)r7q7X&cUD*so*DBIG9o;GMqSh0m6)*8tzO(Ss_U<`xHvdOp68vU( z!+#$Utt<$?sg4t^e+hA-#-M&9CP9@Zj;A#OUHI+rx^Ng>c>Chxe;OW%m83J$SZSWj$72ZV6`MJ!e(u z!Q!_^d;Dd1J@|yG2kJ3cinEiMcv_cU0rED5dfymc4?c+=403LN1O(-;MC8z_vYHd(`1=>IyxLx)3 zXoG(YuLoZyF{O{p4HVNmE)xGDkw_{ZBD8RYcq-oAGVaNyb6d}D^_NoEceXk17tlvQ z8j{GY4n5_+IXEvzfJp7|i2SZfWL+u)MxSoV#52nEm~WSu|r~gk#F7Jt}N(UzrO)J0HLA~Mb)I>H+6|W$Th@|C2;w^?>5MzbK zg1OCjwDK|jB1|P-kCj=gY^|+OES8*sCP!KajG}G~tpjcluN&j?Ix3*sz>M3}0a1@D z(e_^KO;=e_qWOht3HSNlpaGZ-(vC>reZP;r_hhaK;IKlbF{B%FU8@;2cA&h!4c(h? 
z3*Gxi{O&2oCwnrN2k_CJ%$5JEx;Ob2x~GLx6le65*kMoR$^gFak-2&p-5Z&`=h4)) zY$Sa?2*-!^pQB5IZAR;>$3U2E~1JPO7SJRCJ zE;uk1qQV*2pQXZ{(Jn3ZmiS(0MC$juA>0Otjmv(0^8%TY0YK_d`jH3}?Q%GO4 z8+0s2#~+I&m&alv^<-8CGAp|?s}uBQJb~tq4nT+>efI#sGL&n$y^b@=bSQE z4Pz#aT=ynX_t-WK-My*tyGPQOxwI#9bpWB3%(a7e57m`DzO{-K!>yIyt@{aze=sFV ze8Q*ta)@qa^~Lih?L?Y?DrxW0i@Gxx50tVyPUqC&(K#y~owSiPdP#R?qIX@6%-o(v zy%>6K=Tk2R>e)q?wgoenDb7H5rn2?Z#}SKas*w}^>Ctf+TKEg8TLb5|vd`F)xhjB9 zVPvl9BmChB?zX)ixxH;SKF#ac>y$mtZS)MZuFbjI-GX~Ze^H6sgshX&A=Q(l_{Qxh zU$pEYiS&PD_Zn}x*GAgF<3Pkg{(UUTg-<+xEa~IAh?RLzQ=~okrE~*+HEy5Rpd!o= zCCt$3Jm_gXP?FB&EDbi(a)WU@BJ(Itlsiz%>u7Pd>~zm5GI-kdwzoLxlf#BN`*wO- zb{c%LXU7h=>~F?9R>)q~^bpo!@B-WX}y|7yI?aucS{t zwdBdAoyGo=wO7Vp(fg;EgX!!2x)zUqqB)q=Y&$((1O9b*-P=1_4cpsg z1HR&pKf9f6V#s8-69*i<7%G(X8Jv6DJua`ox6^HCb7PEs?FQLB-@Vt_+Tp|?#Fwb{ zHm1wH*DW6aRjsXkgsZAUmN7M24;14Dg}4hu>fZk1gIu-fDED=BM2PIbSFd-w4ehwY zPPfw~dgg5Pwi|Z4ooz(0(>Dlp-kt3otu6y3ovDP(b?kAWGZ6u%z6v2GUPTFU3#qi= z6lmJ^j7)chs#6L=PMw-o`|1>rtz-BjO^~wvwKF*4UC=}99M1H8CI0EkkFhJdnh5AFY$}e8vfR5~_PFZVw6pG;AAc}8c)lPTIy#~*AgR?EJ zDA4G-_wQ--w0Izm7;|5V{5jAde;x_Gj!x=D^@Q-VgpziVzkozNz9}9rb})T1eWZ3H z@ns?C!ttQg3`MDgvd29xpFDWtpg((-Uq9;>v`RGGvK^c}DeaJ4nor`36o(|}h%wL+ znnN0&;DRKBP0BvM#p6z)3hY*kXn-NK_Kd|MrBx(kD`kfP9!}#MkgIk zI;_Ecw78GRw(SeS`(HxEBON+F; zi$09k@8^RiRDOq|=jwM+IIZ6=(qotILUQEX{Vs5*|1VPRaPpyKD=5(rHJW`9NViWa zN^!mjX0vYCY;3aNAog;@=AG^~QfCGy)Zjh?<~IqC)8$fBqL1{dN*8kFsD&-YC2xo9 zR+JKyDr=RN)()!eX=CNO-3fIfdkw7~FRpLr21xyG-|lXKp%veYgp}7YqV)>(I25CT6oh4u+zJxe=u!NNTK0 zFwy7(b#j}tB$$m;YDfuKNpQ2<1 zC10b2o@tR6P*O(8{gix;5`NzHS%RLY62y;py-3h6DS3?&K417d zf(}#i044NxaQO%&@22EIO5Q`sLzKLilJ`;aFeUuDQ+{3OBZTt%DnCHb&nS77lAlw; zXIXf6hj&l@i}E;l&ijA=0D7>Zu@$yGV(rx88uL;ezIF#Zt(_#6M(6-G&Py)O%|K6&S7b=nrx1$=4zv3li6yntTQ>PO`FU(%Qa|Wob+lMP&up(obxV1zVSOvXmDV4cHisxwvD8kz)>)^!=-b^$ITYljh}yy1?{;?3jPjSdI6+Po- zd6R#H=C+XOq?vU@x{*gMfGCct%2;KA7#(;FR>k|@n<#P(2bvp1}FB;m69_bw!GUC zC|MgEXGGfReZ2Cd`GomY%ZDv+^8@3I-MY%xvL`-v|FQd@Sa@pR$5!@CD+x?12~JyZ zWp;OVZBKSnAiF7;Z97u?YR0%D4S&0_M9SWz`P+>WX?&%|n*sXIjIS)xy{MmFS)wn} 
z*l?Pu1O~-Y^e_1`dMW=0CHGP?3)%8Af*uAG&TeUMZH2mr?(>#7x3$Pxv=vVA9oPe} zp5Ch<$>hz+#7UVv17-BG9S&z(Tf1Tzyl~IRpem1+9is+6FZ3TE|9u9Qs6G z$y!Q`lvGkuMaeo!YAG>OvVoF%N*X9>q@;-w8zr z(n5)glAV-zDA`5Hy_B?4vYQeok=sU4J0*K4xsMW=5-%k_N;)XnOUXV;*k&Cd=pZHc zQ*wxs+q5S~sqQgK;u)0>(uL!coS@`Glsrz!M<_W-$!SVHO34$He4LU`P;!QnPD;*F z@<~cQMah$tJVnW;DLF^U)0AAG>QSve+{S50bDDRh)yhh2dDET!d zzoF!}lw6~PE$;6K`aLCopyZE~{E3oYO0H9KgAz9h!%c!-r{vF+{DqP?DETWTZ&LDi zO8%P?vf%Q+D3PeHhLV`M=ko@yb zM1nXp%o|A@XXPL!#{fBORzyXLDOpHK1trTVsitHzCEF<3K}kC$9HY^>JS{$DiA)P$ zUO@6{iS>T}b*6r$^4`GJau|ith~$-~OH!{^qtU#smo#I3B4zzV%J_+t_7f@nCsOuL zrTm{sB>}1A7uwOG$y1Ni9nTBuriP{)j+l>nkK2wN4C)M_oU&6pdveMGIc33|g)juC zv_aj%kYTo8H|f;)pl)`kxWuo+JF0@Zl2HB}ziz^D?BB#!yxWlAS;XJ=?@~ON| z-RZeO-Mr8|)XwX)Qtg>}MCmEr30%kvl`it@@cqT0Zc!+&(61YRyyaB&iCsZmVgDBT zQZujAubY0VrL+1p%9e(v%{a5<^pc)w^8?f7ceMwnttQHYy49iSIcI84*Yr#;2~02X zm#zs;U(5Fh>ehzxmNV%=-SR%{C?i5`L0wrWCy&_SBPwUiWX=S2Gefz^oq7s<$P4A< z3x?*0X3ippjtlB$g=V4!2FpVIsGMCycWe&oib69DqL3j}1lFLz&ivEsi9MwlrJ0@M zyL4x#(Ma`UaM4^<;klvNVA+gQUSb#3Gj!&m9{9R2Gz%j<`4q-zwz_vSXPw!1df%Bt zrw?^CT)rzf(=4bkhl((ehE9|(3C#w`jh2Kez$4l-1dr;ie%;)z8vi15pcsT&LkpKu zMs-lPl)J!{=c}2vs*0K{>?#f9E)VLez{E3~Pj5bR_vyR4N-r-B&ZrhVtq#q_XwK?f zL?f9Gx-qascYc8Y3)CR-G`ADCEe*{p7D835map01CzC%vWIziLLK5FRj1>emsLNLm zTP`G?2Xjtn+RP)hM_Z0pAA>;6>?fqNxE(O3AXEUko^c8snHO5L%&(i*Wee(-g^FWR zQ@@cUY==LmGBB~~iYqV?%-R?lKjlcxQ52n`?(1|24FsNqW)4IHqer4Ky9o0m|G4)= z(J5D8Y%XQ>8{2%ABeYi3Upz`61hYFq-9j~WD)RHV!{FW=wOFWmx4cn49r2D-&9qbK!ko|?j3NX+s4G_IstTxX zuHv(*vGOH;T~XJfpl*p;V2g2Mp2-VB1(-EFXmg>|m}VME6{w;?#fu`)JF{~F(OU>2 zne^h&bm#&ewV8u}Dt4=s=8Rmb1{UUqmaXvXXe&jxB2)ktPCw-)_T@qHSwd$sy-*>i ze5a=db#p_LrzpC8N+<_vgH^aem8l|LrKeOHCKoNUl+O;SMx$y2F=d~>a7|$5+MsTO zx{sujXk_zGuOOYI>U|*wZe}MG-JJfir;^H<(2)~QVW9I^sZTvsO&TgTq`m1<+UVXi zDS7ItCA~>WX*epQo1Su0(j||^rTLM`v_P$u=71y?lldVPSY1!Kll(<%1G9`lU45vm zl4JqAtyF93VkiwZa-|{F6e)yMb6?8Y*f2P+ceIq2+nX&VPdQcCo0Ob3^SU;58fZ&R z&c&tD#bmCuWUk6ko9|Q}QjK^4_>j3z7bfvywMCD0E3_0Gpx5A2zhe`=uYr7d(UQ-bsGV_L*7K98n~NhoUaodJR0=^Q 
z&kNv5FbRu7r48IlP}jgyiN|AZXxVDNZo#E|atl;Kpqes$sp;>4lwiDBx0W;a%P&JmjxdPS$+w$GIWiMCPkQess^{*3nJyWX@-Wgq*-erTJ-bI(DU(eKNsx+s3uS@jzx;|Tz za%%kR68`iSOVbTE)8=Y&LRnA<`1|Up(Z?4ZvtLgJ4BE5C9?v_r_0;6f+)h{5!n2;v z`GF}50;3jOPo?X6X~LB2R2v#3CF|%m#l<|@7D%4)dIn{~L8tZ?Y4W&z{C#!Q7;5o) zGT?;m({-hNf<*glqL1zlBu{@mgR=1~2eqHA$>sL(H`+engzeLHrG0`T^dE0IG3RJ& zAbHyB8OXk#KUK3)qj=}9O9b~;Yj%xUl7yPcQ?5?k*qg>!mXti>hNPL9)tk-uL@7D@ z>ga{NlNig9lE+;gzoK_0W4TiDv>TFc#+u$d#urG*h8vPLcS3I&<13})k{eRef|TA> zjMr+U)+;E^h$l}00n<`?txC?_$e9Wfr)TxvqvUKyP9e$^ zrS$ILoa=k2k*i~tU%yW&u^0H{t5eqZ?o;qXz$advw66ECg0Iy|$+OWyVM?!2%Vexa z&OF>@eoF5KC1(?Ia!|&Q(z{v7*@m24l*!NNZBcUCkTVlyW}$mZ&VJ-fyE?tG_ke=0 z)d4@%&}-E3ZPo$BSS8I&>8(+6nvgT~>NH!gRl)I2;p8z_$FA+&s^E811JvL>3jR*u z1!!STO0QGN*@+yC@Y-IFg0})M0-5u&dUq>1c)w@zq^px_Z|qZWyf9M`so)2JGkfk= z@CSg4uD?sksfKxCDox2sKY)V@?VOC>8YRbqoXHsJsTsWuN{*d!u1;y{y-UGez-L^| zvG%$Zd^_-&R}J;OI}{x64_Bo6UIlLhj(cwIZC7xF1PQO{^(y#&;7ny!?*S#JI)zKr z_nK0e%DaKj2;|_E%!ZIY^G1>`E&XPy9=yLtxxNS2F`&Bqtls;SoI}XT`TvSKldYzb zI1VQeAT7$l0RastAn36X6&zbnD`Ai+G6s+#DD$9Ent9S!=tr=3x@&jcyM6@1ipE_( zK_%+GzgxdbSmk`*tyI;m`q#fIUK0*3^``d3;XF5A1*R#_*4c%*d{8lDBH}){iaM$b zOux(|h8c^RDe){!zx&OccpbixsQdLWcRc0%8T3rz%ALbt&8OTi#edc8-)f{13Jj@0i~@Epa?ONH9CL+AY6w)BO$?(yWVXcb zJ*EVq{6VIT)0#8wN;G{Kw`}@lVi*^V6Ece#lUkB_k0HloRZ_F7=Y$=hi0MCJZ*drA z)T8EwI0m!ib3*iA<~7C*!W$X>h=Uhh-DXX^4!c%1WWxEroLjV8pKF@(HIx|F+lb5v zhH=SeTt=-)&SEMsRU{XgIEE<6y29A$U7HxzQ^ahEcVXANJ(;$XtV!Nebknq-v~3Sy zT$!j$45L++nvi)d6C~xlk!e1~v5U{X zn1{_?3e6^kuF15qi%_E5v=`WR`rr^Pmp?H5G9z9I46`ySGkt7kWa1dtm$x$87{-&o zVPSS;nmK8^?9!qQs|({sI3hE6TxLk-WyYjl$?$MIy$vkMtYX~!uF1T|xYj?&v=;hu zYPs_|GH{{p6hoY36DR#LF-$tDWo9uhy5?l!n9xBP*-6azIK(UBb=ck0hD`XhFXwvV z=SO^}^DtO&6s-G}7pNsRPb)sSdXrw<&NWe)c^SUgYZheQVTikSvn*as^O|@w&0FGK zShJDZlL?>6^s4EMZMO|mv0r)Lw2OOSo4tC)FJU*MuVm&iHu@LDarh21e2?`ZvyE|? 
zV@Ia-tX<*V=Gj@>Y8OnjwVQ~z2WAg@VtU1cusa3}$;8qnCd894>%YrPiDzMIWH%B2 z2D95l&5C#(=038o=9>+ffph6OO-0Xny-JQ@iVn_qm~qQy5q9^nB*Rl9WruvTBD0IJ z>m=9g$+Q(^%Cr}0ZZUSB$v07%7{;xE37OerGba^@pCx<2dslJ*!R#um1wtIcJZ zMbuJeTj|1(KpIWNa&o{cad0_mqPji}KV)1V(Spn_#)a*kOfw&A+#R=+t5%FgB^8kw zz))|$iHc(|3HXVZfD6xjEqwOFCg2HOUF0eW*h{xdGV2&Sznsow-ecS+j}J1fmon+` znLc6lV6-$$ugu7?8I_sDxG0#CiDMdQ{VqvVW)tI_kWYt6ooCq_KXDfv|SY3kF}L`D66s6J8L z@ZGm~+Mkt-RsH1e^#4Bf-RHgjC@J)peyj2O>yp8M=g)rkd9TTW1;3wv>G|RFUhV%V F{U5_dP22zg literal 0 HcmV?d00001 diff --git a/__pycache__/forward_context.cpython-312.pyc b/__pycache__/forward_context.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30e6622fcb5c30f5394264dc3476088704ba87b0 GIT binary patch literal 13763 zcmeHNYiu0Xb-uIj7r9GvMN)hYAELMtMOl_)Nwy@L67{kqO0{I!UL;ssS+;h)8_ndR@ztq+F7(&wV<;1slGWJVKSfQx|th(%sy~Cbo zGLwZQi_+JY6jHXRO(58ww5J?V2ZtR=XUY|IaoCx3r#w+l${Y2jd{H0I6O#U@pZ60; z)kW)go-0|O3PyveP&AZkh&J#%cd{|n6m8L&JD6RCVXw6M$uTB}UT1QH;=2R-QHT!6jf#I_%uH3sa+5MBhn0S1 za3iwEUGj$49nr{V)^0|NmNl(>aw}SGQCv2Lu{UDuwmUX+lsl!tqFYO)?I`W|ROyf` zNZS;|vu@13TkhmE2`fY6Hn|IZ4+F<8z}XN>JXU43ts>N{b1N*UOto;8FdRr z=5vv^K7#IM>7#H8$wRL@%j55y5hB}uP4gU!2!>x)#--_`E+^tTlJ)UST34>?DJd;o zR#fD8Wl5LfNlDWpcBAh6OJ|S74j(^q_~ePPV}^YsJ!AOBq?96`pPou8RO9gTqa(*u zY3j=9jI0>-XC+liCY9vjOnN+V+3;ORCQ~L%gD27nJ(kQ|Hiv49XELgsNK3jByCmuH zD={gqW;87(PoZ+)`9qXCq7GF?A?Mh}kDT|YeG9~I) z6fr%W5;Nnxr-ZJgG*QyT6xJgpB?+7Y-4mCRil}GADK#^tsL2^oO3UJOI`PW1f`mEp zCDcG~Q<5wzGVhJkYb`Rvyuk2b%$S~;RMOf}qaGar<5Wym;>0PQE0PB4V*yLTj}zF5 zn5esY6wy0OFPCN6L`f*{dZkDfkngC@G2y#5OL(1Q5&P^(jt>#>IYpmV)1oA%GwE$5 zjKr&os)4OQ9?(-&!DOO2sS2^p8Bq^MBQ~`aqo^eEB6hWz;1(<6J;88jN^)Ey#Kr%7 zYP+7A+CG(-+6LyktmxaWPw&~ccXE%iJ(;+)eQHL(l1XpBiY>C;;rHm`3Lbsz<7 zhD+si;K)b(h9?$FV^hashA$RNW#s83!GT!pm1!wiDsjhRawd*!JGP1ujK!pMI-^Up zAGKJFlf-ab%4Cw9FowruB7H{f!kF(dwHqI`lOj$ril{_8gy;?SvCZN16a%aw{HJFM z?c)5{!swDiEZXeI^|0pl`Qta_B}ZG)X5$&I`cG^wr*GBHT=gFlWOh%T8m$$stH_gB zd}`Wg))cUEIVCDGC&p4EMCoB#DSa(qX6DvS(X)M0YNB3?Es9rbGEE)qRgEr 
zEQPFarnr9%TwQmD7ssRjd{K|BC zGM139$E&v?&S8^)cLA*3#vo9M)N>BKOrETxOg5)1To$sI+!oo53 z>;Fbj2}gApQN+QiXxNiV+Hk5;`m$o!AvhzR%DSk~L7BRXq8*3~H&%h4N4{hxE+tJ^ zNN64uV$FCk7p|wgaGa2e43S7Js@CuT8Xdt;%OJuI$0pgF+x&EHG&ANhr!-DkYftw>(;X+5}jb|5L`O@j4m(_O^eJs$Nb8O&?9>=)l;4ML`P>k0c zeTsd8rdJbi_z6V&N0wF}lpel1cCN*h4ow`9PQ@p?6pw6MzYp0hvvH5?sX9jeNqLHOd^(KB_^vVpG2%2DNsC)6jBE3-;U*0yw4nGEY&+?? z`ZNZMxYR@Vn1mwfXt?F67_aK(CpQ=C&!9jfJgYaHBPC5Lv%$(*4O2#ps=k2;GBUtI zgSp_&oM$KELa=K&xFa9ju{iQUaCgqLI~UwtXzW>T+?8+Kb;Dfrj@n+ZjxI%Efk%5^0^O=`=quiVS|D+x`+e_Da9p-Jit;RUsIT2(|v zieC{&&x(V@tWbPS`z9iCl%VBM3!sqDwQ_gR8xxnwtdXHtt27qrmZm%q6KR^rq^3hL zzn0Lih#4r6eXK%?oj4xJtNOs9E8DF0d}rEoMf@d`8sPROJ} zqoRX^ijkla0Dq|RRdYQ`W&@qts8^=Nka$fIFR6+&DNdXAf)3>_Y2l0sa5C^YL;uT^ z(M!RWOu!U0yVS%%OjN@1W;9*A_z32`7#ZS=jTI3k@p1z5F86oQWOgVN7l$wntVu~D zE^`dnLK82Nl094{MK98dk!6CD^OB;L`>W91G;Cd95nX`LZ}O(qS_ zgq@g>(TkRlO>@Z7+AAD2qF&aXEMr03zrOPMr1%1@IBeTAG2r>LoHC~~N5rHw1GeBx ze&p=Xny#$PRQtsmPHT#I?yPu8(XS~cCs`d~{?gSe9u-aDjNF{obsf29_>(ATGpyg0vJ|THF$Tq*R5w#g903GPvV$^6 zu&bviB8^ca@k@A`3SnJ|NL**sW?QQcsx>7lH9d~#SL}CC3iq;(12FrR1Ce|nQV8@G z0)2(RhE=b#!*|1R-_u$Q*uDLQ#^&Y5-h5;42Z7#|?!FtNKM8dF?xxPazt`x{f30i8 zF1%a6hkefj;0N9jX8&hq2e`&OB8>S_^Jf75w?~bo15_U!|DVUqY_`mgl`|$qPAz^q zKx*i88q~^-VeuR+U-E$Pk~K6gcX$b6ltgs~UnxCfvI{v{$~9=U$=Iq|ZChovH9AARkG`B|DL%vu4CMwK{TZAWCfLQL-Nf)Auhozbe4 zV>()9BUQBc?5f|Po<&6StPpHk4tD2*-MQe_oM$WIN{*-7RHn!X{uB@Md1g`asCkwR ze$=Tl`>9<`s5blmkd`dl?%J*0z%5R$5p#SJ`UaZf(>11uHZ|K-Md)?PzlZ)cGCJqkiX~NUkK_Z}7Tx*4oc>JssofmJYzV>>!EHUFaV#eo`|OuY&}`^QI(SvkpNQ3_o|+y=p02aQWT-6 z;%OpVfg21j0MLGp=nb~g)%UIBH5{jFnmg7fQoH4r?}`?JokO?z0v$*&#W?}iy^1; z!0nwyhR@>eAMF4B{)YsbbEj#?@hb;s30nO}0M-6#g?p>VZ6x~@muaedVXWiI!dA+K z>03!rJ#L=6Wcz+<<3TI#-IkqEFAQ~Osno?w{S)LVtqIXO#gA6pC4*M=xLC1Td8ifg zLb!gh^1M9H_d1I<&N$(#tcU(edVER98x83U_n{=%;PjW94AbRlKfvGY}O>zVQ(N)Sl`RicP* zXL2kUju;gf2 z;ew;>rkro+U2;UwiW%sIjfncI$h^U#uG=SpPT{ON`L zxBBx<151u=s}8&6z9#I9Pz+`fzpyG96*R$eAVq@n7`!#$F}++c?4rg?zMf$LK7A|QR7GzvpL`E%&-1tI1H^6Fpeos&* 
zfo9Em7)^A(*GyGSqvA4tS|6f$_L09m=igjpcBiXo6THVDUI8NOp@#)Ja*lS+OzRV< z5j|8BNM_T*Ii@~`Y&AvETK+IP5?U=fG3lz5NJ_XBQ>V~k1wXA0(L8(LZ@;w*c;pgRPCCri_{oL!95(B@WZ=SKSv&mS z&aCUMaHrHeE&5o*t-g$Y)r%AnhU?)2m4fqKBiS;TEqNaSuxMqo-+uG0FEmu-#R(P|C#sRfup!{6bjZAN6;8jykg^QQ_e z0}x$w3pe;PPu)uU=Der%cQ^MV{=F7P-PgQr!ds3ewqS1)zUFALzk@G#${OLy1x7{3 zk5^w15bN_bc@3V++BLZ@sCbx@#hOeDbDpe64Ogy;*4qS@+>`9GFz3w*>s>`mkp5T` za1}I)ZE-Hk=HUr_^-R_=KbUpDYP!wsrkSJDmgfQ6Y~7w^yKKMfxI zj#7I%l6aVCu%OC!{T{RS>w-qll$^L2BLRVcc{z2GYCKF3{cFPH9?Yc?^9W^KXde)m z-eOhB+V8sV2xbb`YNW{VKzExEA?TZ~!YuJI=l3$5j5o86-)gxVer4b9|Ly|r?{wKPw*uIJgRHy6}< zt&Xw*n#Kev2sI%a)X5*t16>8m9=a5?W&x~%IvE`0a-EJ-9ymYzFdLMbF0;8%Wvq#< zR*lNn!U-3WkYRk?8|E7IZB~tJqctMSy4b4}UMx^!z;5BqG>xZRqGz}~ustJ2WKKC9o#7uRe| zuDe@rjRHD{)E1qLRURe|SWx9F2eY9{{Tadh>tj3MH@Jo!@DeyqU<<-qkC}H0kjsDM zGdA}}KKnoM*)^Q(^jCFH6`K9)N1W~&ZXZAUXg9s{aVZVIJ8nNp&(uWAyu{Ux9ybr` z;|gwY;HIa#iS&dL$0bB2mBJMSUd7?&3dcL$1K|RSEVyL((yIEnAI9!E8gUp+RcGvU z+I*44??sI!?&^-E5_nF>%~kj+jgXlMvzMwOr{r`zLmOR{uucGEh> z#PELZ9n}s3FspqI=N5lEG0$+By5n!dT;)5DXEXLmWyWwr9?=zDzeV z-}U`mTbo)kA}k#02v%nb*BJs7}+>p0XYRELU3 zr>}YQe1srU3{0c+BtfLnsARjF<|wJP>I@>Ip6r5Qi>#XZMauMRGwJvhHIt?nNOYJR zE@^5?Ny{o}2`bUlaPpUWDv5B@Tjs%oEVw6<^g4+9%?t-s!MhY-DC6lFZW|4k`M?rI z>G4b?T&t^-N}$eCbcv!kMKU6zg|2wuDvZ^T3Yu1lIAHw9`U(=upUhYi$kqKTVU{w&zg)JjFxJ^Vq}$;E9y z(sPfWg#GtZch`f?p5@NreCO~I40cZ&x5o?a#$|VJ-rc+G-ko>vz9)R(-dFIzU)ggj zn%ndgy>1ah+K-p&*PEz5z;`M~DeZGY3f_)6}{G59=Md^h~? 
zD0sTQ-MQSeGvBlG1NY8*zTDoiLi^Sa+MmDs#rL<|-~N2g)A~{K5Kt_5nhKs~9(NTy zt;ksodRkobXNuu|XY+%)O}8>Xse9sQfrj7OS^b_*UG6v?OkcxvfTDW zzU_${Ckw&0TyWs0q4uxWJ)m2kzIM4l53v~~A_AK`t$oCxh_nxQUKCy7< zrt?8d+s)lyo4xfXU;W}rtN5U^_ggQ2|3o?7Z0 zTR4gux1PAYW2tl7!qGywXE{8W4-ek%y>LbZ6q;KrS-!lklNp9qWBY0L2=*mP5Vy zQ1826xTohrz4t?-KW!2pgxlUZ`S!`8L+Ct8Uf7KTzwx^^G#BkG+*52}p;4jO4wC#9 zZz??pKN4tt0{vbiOZc_J?o&?-Klb^LZ(u*(;5_cJ|G3RVaPQ~<3f}iPk2kvC zucNH@H;isRwcq})`|T)*>{9=n_P#%!!Bg)ze*&tK_HOt}h9_RPawk2!tk!FCMN^@I zL(1thQ*`){er~$@$SO3&o0khpC4E&h96aV8mEn>{h09kc^Gx+qM`oER>r^^@j27Pf z>W<-S8Ff-ohjG;5p+{2Vg4$@WDJ?m2%^GB*86vMyKEH=A;c+)f+! zdM>|84caf9I(0gB>ddj&p^@{4kDogcJrX-HcK*n77e-F;Q$!_;+Vo|TR5zUC$&93% z=Lku7m3ufzIGSh1O9YWUV_qhbzHA;avp`8nH>XFGn$0{_8Uf6hkoY=jpb{E+SXg+274 z;e{8Myj}CImDY~s*1h@Gy?8^~dSE`d5ME*j3hf;^*08X9iFM#9VU9H}99d$W zh5o_i{?UB@XpXhrM8@b(2X-v>FAeO;vF4lpCAR0GYop*>n0UzWDemAlhS>PxFmz{TdE&|Izrc%`Ge=)@P&U_)!sO%UE=H5I)C`IyVcoBAo$&usqR z+CC=;c%wxz?pSR8qL~4*P$8i~xRi=GR&49`91ATB-@==(%}Z>!=yC}iH(y?5_!Juj zVepo`%J6~EYVRm)iWG+SnI2$xehZOp9|s!*SD^_)3g4B6_M#Kt61D_+SYrow9~Zrp z^s&w@ML$6S)-u4I#>F~H!X+ber|~LvSR7=wP|@ZW25+bD>G|QqxHS?632igC*g=qo zFzW0nJhrPAsRj_BnPWu&M~i?GUL3@e;P8U{wy)^rNgrz$zcuuYq1)kn+qR;g=LHDa bIu6#Cu?_Nc$U^rMJ`~67W1@94*WUgIiV+B$ literal 0 HcmV?d00001 diff --git a/__pycache__/logger.cpython-312.pyc b/__pycache__/logger.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94c9a5fce5cae8349750307170c0f8cdbaac25a9 GIT binary patch literal 9819 zcmcIJZEPDycC*|ixg;f$5=B{Gme!Uii~6AL#EBiB9ow-b+c{ZwZN*8L^U)M{DN*5v zGP|-YMY=kH(JHtgCl>*vLw!A^y+aiES_nmfra*uAip#I|2WdGcVk6@m+~r69(UA`U z(m#D~mP<;uoEAkgz|PFhym|ZP&71eN|5;h-Bv6vVPZK|CCFGCzVirfSv;Llika;4L zFp)XglH|f1Yb{|5YxywGS|Kd3)*7}#%O`Cqd)Ut58X+mB9AQVw8FsRFYm!g7!Yu;Y!xGCxw(d>}Gv2=}CFRUeXa|+OVxyH*q9~hPt}HNIil2#hD?&n zZFn1u?K0jKV`m^+i*RM?<>EXy>J7JKWn~~+i;`O@V7*E{=yP&l3n5La)n%} zY*(6O_f0;*!J9{}Q0kRN*?Uu9?-j$GTqRe_J~m$iJ^xK>xLI+?wTesLrnJa)ie0Xk zEpbt9xGvl(ekv?&3DLL#V#k&cO&cI~Dn5C8JzxyC%FRlvT(7jn1-a$A5Dr+#Ng_+v 
ziM&H`-QtaphTE0)(Sh<_%^r-NGyCNhM$ebu%$~gSCLivUJLOjRN|)RQ`>R*l;j007 z@3s)7%iJMH;H_Kfxy2c4!y&m#ZokfjcSE}i)^)&RnX(fCtAp3A;XRX`gz60r>gIbA#zRY#&jNhN5}9Z5PKiA7^0if$i^ zQY{fp0)!1wDHM86IT6#2GlcLaj77B(96h6GBN=&ca!d*GI)5xZsoVP#nnI&VU5G_h zZR$v%BdJ_blIdtl393?{BdW%<1na*R=!hqhN+KQ41l5B8mq;t=Oi;cAP$@-KqrR{l^!N3JwxROk!zG=lelr%OP8P^g?H8h}1^aF~%3{6EfpaCed#*$H0 z)$MXrQ{pL2w*eDia>`#^8M>fRC7L=FP0LAGAW#pmMt2xcfI_!2;&q$K;B@PW-WOgv z*$0@#5o3)?eQe1X&5T8d;WJ<2VFj(avxG#YwFqLzZz>e1xdJtuh+ZcrRAhfdxtdTl zII)NcVlz5ApHhc)Axei;-9{B{oTg)-aZ1GsJve=`j@Nk}q-C0nmL|)cX>ORCw){Q! zk2cgo3GO|PBa`GKen5AsS~T`L+vArg;&7x^Ss5B1{)ngB;hi>P)dH1@(~=~8^VDuF zHMV;!G1j92U5eIob$tJG2VUQ=>`o?zc8^VhJfwHy>FhT45E`4*t0H4Gk=7y%btIFH zDYOI79YVZKP+cPh8?jbh>-+2exBBONS)p@IyCZZiaVtXSI?R*rZr5}tphF2pv7_zq zV(yIYf*0LeBo(I(O1okB1%xY8>IF)F_|^~S_^c3|lkW(@#Wz-jAPoZ4cTcH(B1+RB z99t0UD-v6TFzDS4RGivOuMg-Qcxzzp>8#MbP;*DraWf!VAcmZ&!Y&|; zP=c`njAh2cU@T>rNn6l9)pkOOC1gdK7y+H4P)W;3AVJblR8?dN^cSOs?u8FiBm>T{ zCZR?Qk|Xg1Rka9>reCLM!eG(|lWZRw<_2N6ElhL3&C_UNXk1g|5U9IoGN~jZa2epv zKZ9RDyB6f>)7UQ!Y?hSTh&qxPPs(^{BNGX2Bw|o_0M`5kJZcYA*T{W)#hiG@UUyGC z^|_~Y#S_Sifjo@a@7TB96Hk8b*}39r%ZhFHT^>V#U{$G<91XnI^F$%wU=VaJz_9?1 z-vkFj5oD`km2HYmw#y=zNXKC(MdZoOQvhV^R!O=&r~`xWoNSWkta^Ax^mb z(m+O24zeD;4UQyKiD_5~I-}#NhKB~ybOp~*1EV0ZTLu~z)Lp2w?qx-a#oWsP$j0p6r4GZzjhfF30#1XLCxh$uU#skN~Gb?K%d};qgOHsnHjw# z*eW!Dcq&qcNfRh18l?$EmDF+gU{o!kxD1*J4qv&dDQQ&85zq?}&|?`I8wnYfrKp!L z8(W~LZG)U493=&5yh<=NP|^W-bp&{rjz*E1CtB$`ODkyb(WR= zRP6>~GKgfVY6KRj$iQplp__Qy7nQr7?tDe{npnTkw#aA2u0{E-*tJ=&es?^bE^UwT z6h-xMmg4C5%26~glv%>f{Bx8n(3F28`%1&=2JSgGaj$NnW2vW7tclE=D5g z%|;@+GZIN<w6@&>51;*32KkuxzE=E2TTyxY16>gxhpRQf!UdlW15D?TC& zF}pSNAdX@cfk*u-sIHR-7MoQpRFgx!+*iEA+MV~+7FgS|%V_r;VC{WJ<+>H-3$>(= zJIj4V?AES@3k7VJ>R9t}2WzqyUS7x1LXC+tRKyv|8#qIG17`?uJ_MX0gG3|@tm0<+ zV=!1MHs~=2K?WPWQQK)~vsuwl&RL0DF!D0pfEl9UYF5-LO*H$|rMb+$%-t&L%(uBR zG_`3JCInz9ubkJ=pJ!#%Yxc@*8Pde6dN+K_vSGeRJ$%(?t}oBZ{BML?MTlp}45#@` zm{IinA1Tw^C~o_aVsgQxWX2*xUO^@%UXEyu=90g(XlA!GueNCDu(KjtHB`u~l&#FD 
zR2^~#jb2il)f?=czoWZ`~aEfGD7bD2gYGgr~M|46`2#u&EmWgEhK<%jrp6g*h|5c5B|~bx&fdUU7ar zjah+SN;6ciJ<;2D?4|xe^B{|7MDb$lHnvq&7eJGybzxM6KpFy;s0>38%geemp_*Qy z?z{*RP{b;rLrRY$4j$rG-Gw#-)z09E_B@Z@rpliZQu*nj!)a4UvoDU zEPUlDZk@M!oCRWaIRES>V)I)4?pgO2_Len|?`Lnm^XA>gr|)|9tyMSYy{Fc^9qU%! z=PnSw()~~*?&`bMoy#wOT6y?(&03T6%NKwC;-dE9)3ay3=860Hhc@DE zSm^mp&0SAut-2}iJ-X(tnY*;&ZC&+t=e*rzj0KT2?3#7o_qVS4cjf%MmMWHqR{YPc z`j6)PM}O~sVb*cqRr9mM?;Kv(w&L264V+kUoyhO#n>{@@e5azhU?;W5xd%=n)~<=w zbI*S&?kw=|@^uRkk*z%Zjf&d(w?Ak*(a!&_-FjlTT^G=7Zi+N9@?nPLpP{qSkc^fL z2}~4ASy_PcEt!{5JlRLeW@Yswz%^r;;WabP0@r)mQZ@n8JVdg^@0lq;P(7fmv`Q1P zhL?bqGVBEi52k)Sh(S#>k(S2B=~xDILM9FY4Ma-u@ifFI5Z*u^G!uj!AUrS?#jDP8 zMCgt?nJ=wCmkUS3a}+PGZBL< z*`9!-7z$l);BgMRzqb2vM3?N%gj?7fTY1vpd zY1eecXvxhg!%kPiuLt|I9cppf3bkX}hCd^wU2Md`Mx0Q)e&IH;awOgUE!Zj^!&dq2 z*dBA-O75Q4U{Z=fzNt|zJnq~y7|u;A(Mk?v8=*WpKks%-kfQ73hOivd?8zjR0I_Z) z#&j+^RTFr%XLH^>6o^w4mMMpH92TJ7nG#T`=-k06n-r&+6m*?amC|{ZXEsIv1OS6> zNSuS2lNH?+%RoL2(gdKdbuk_>a`w6%hS?A%l&Pj-;sw_XL3WwqKM-BS6g--Eolj>b zbP>X$ID55DP*5zoV*oHR^H1@zLO7+-XiR}$5|Fz90z}Y^5&Y|*i_E}6mPoh8DUx9Y z{g~Et9@5c3l1ZGcNSXt+ZIL+m8K6asMU_V1gm$sl;u5zpS6au#b&IB|D9Wgl%f*P< zreK_6^^m{oh$zCRsS_&{iN^OH;lnu<(OLdFjAX=kl@T`0^|H25GT5EA3epmM<-zy}dWz*uK!SNDFqp zzU$At)n8k16MyahillkrC)wuElDvF)dH3zAjYu9MlCS-wI`FWaRQTRMv{<)vV8yjB zEA0Dv7mQ}B+Wz~YM7;H15sRbh^9pal3hg&)H=M<9tB%$8ddPotzCe0;{&$@synMo2 zdn@dph<5CI9KDVFCk-6bdL;@NN)MAViaCK&9uN8~hsVrDI~*`LBsj*(D2_f-%5f^h zZKb)^!DUvOrAfv8w-|YQAR-oNx#Z-c33l7w`$IK-K zt<{(~HYzg{4@7NzP>B8-nB~L^(`&>-c&RZn3$75{S~^+wh47-IK0-AE!I2Dq04|$1 zy_k4q`DXB$##JRgo@75XS(1^(MesjvSjE69M&Ylfq8iD|%bQO1GV{2=Eki|unhn#S z03qSI7Z;SiX~Hs>%1BCQfMDjtf#Zc_A_mk_A?b7+a^q=;FB4jE9oseY#5N-cIoS+m z(hV<;xrD~Lk?WSCY4r4P1oG^_ALQI750aspNC4+XqE`?fJV;P6{FFn| zSr~(VDMcg<6yrtk8Kdzy3;v9KPyG`M&IZpPJKh`VduibK;OVmixD_Li1x%+kNFl?Q zAUmI-@}tM6TvcNDH$@tnl+LlAfFa}m09&IbGZPT3P8b<|BY=+$v>c-3Nz4phO} z%3GZ5hJ%Iw0?EnHRP!L)DAVTb3=tVHfd%^kbs*?6{$PecN@*8Vx+S9;A*3Bn(=cEW z|-W~sdUJDzk>)^iV&}9r#{^O 
z!Tt}Ad~oDn4y?2uTJas871sni{vmyT+s&rMndOm{sw1BYM~$?kg}DImSAQV5m-fMT z={QywutGBEPqBIrD%}&y!0#>=fvBPAP|QeAGCQedGD(%9pJ=cEy}(jr95Nzf7BCx( zv|>oDvPeL;K_BFkWdJBDDHa>(4wgYs(pMln$C{qOsR6`Wy(t~g-+QsQU-xYqJ9oaf z@ANB=3|<&K-+SzgZjbbxJ%8rdAb1KVjtzp$vNw7T(X&(pB$)_BGeB8-_!NFp7=jHN ze=;za`A8;{#&8T{c8U=R%M}5UX8K>EaQqp;^+_v3c0k&V}aM9};K^7dhfvl h+170x+%pd@ay8sBZV6zq{rJM|eg77I$ofoj{s(BjXj=dP literal 0 HcmV?d00001 diff --git a/__pycache__/logits_process.cpython-312.pyc b/__pycache__/logits_process.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..62c9293800f7261a7a8b42ee3ab1ad15cbf91d9c GIT binary patch literal 4754 zcmai1-A^3X6~FVz?Dv`vW5WW24TJ@=Yda2(2{;05Cr%Ac3U*s$ceLyburTayW@c?e z2h~mUuvV#p+XrJ+m9ZKrYuReaOXL5cFS}kxY^JG%6!#%gAJ)ZEq`tK0-es0GCC**# zz4x5C_uM=8{?6~5`J2b%L{L=rH!*e}Lf>G)E>+u|Ekq%72VFukk|`CAm$+G?}wA`_)#MS6Nwv zx0rA#tm2lrepuRpWFB_q!7IpBN|h{t7X_M4RXg zx|%|0iIFMj#i7TVtVYXpoRXO(&g4vH7TefWYzvE^U*b(_mfJM|M3a}|^Z3+ETCO5zNu;@x2*J}~!^RFWs~&Fexk9%+$9^)`NoM`rq-Py_?4fLd zLd!`OEnjDm;j+h>^drHxGfj=5pkp*hpBuL*(L(3owpc^CVOZ4>S(fyq5{*SviJZ8u z;bZ>xzfj=5Jmm|KVzM6OEmn=`hQ+J8p~aFG50{*f6^l`oxWz}3NhK~@2S*b_5&0+h z7I)omP|yT0uHl!_z^jlY*!C?y-bqqZreZf>Ly@Rr@g)hjiTR{q`AcnSn|M1qHYRSD z|MZ28`D8d5OZFHMZCWvUZlq40dUNii5>{iA;pBoblZc0}sp@=KC8&eN2<$@11*-+d zOZHZD>E!_lP1l-Xr8@ZQ51_hra<&j(k(wloqxG)EdgV178J>+#Xv$ls;Fe zYh0bVm$;kAH-~d|;q z6a!=L@S926A*c&e-W}X(JUx-mz&j~ zubj0#0EnP`vlKGQWHxJKp~8?!+Zb44z#8_iT?L5r9wyQXqG9bhGK~pKqn4}pdRn0? z?? 
zeMA`^`_(F_ct`irj3tN3nEVa;1_{8XLvzFFcR{p-mb{n}wjMb%6F;3|bAF4mz|$&{)0_6w9c!I3uA&XDlZ~)&y`^ zY)&Bn)-Z3u!&DL?vrifu8N6`e?AW+;;o>{z$1SfkdiK)zj|VT5}wkaC|z_c-Ac`8Q7WXltY8Yze@+f43LZa}Cp6r1QRA*M_75hFUI zW6axBUg2#~NGX2E+e%~J1Oqw;r*sn)oyhI~VIk%YMvu^RJJN&Gn zWuu`h*U**e%{LrR`!)mOMj-Sw5K3S8+E=~OpZAGtgHL?z1^QEK9~^j6Gmv!;WNQYV)dyCu=Igs3rPif<_-wBJZ2H}TyK!|S@9z4l z^XLi-RR6@^wk~h zdkDjfzQl`V5($+^gxU*$GDVyW!*g#cDHFsGQ^^!%uy84)>_sJzS>apXRUR=9w^s(H zE>qOr?gnHO21;NI%(A4IOqq;W;_5&~CWJQfRxDVlwny%?pmpy^cF$yqvmRp+Lw;kxN8Hrnrqo2l(E30{+=TcQ{R5>rz~W-RXi=dcTf!br=t?oec^TU}t7!?%#vz8FwJ#Qj zRbuiEvP-BG$X0P!*ab=KNsGr`0m;1`1z)oMYp5WB3N&YhgB4u;_dw^0P^f8Kjr^{E z?aj<${_v?nZTm*;kzDPO%um0p?S0OnreKi)T)lvB;LG~NysH(M-PQheb)ZlkUUz29 zBBJ}dH@o|@T?5cP-L}~lTI9HDmAnFHpCtVy73KhPFdw6@_BXDc`~CdQi-o!aH%E&s%^h2lGbh%W2dC~Qwh)L# zp@rMOd9Zcu@ZF{&2mKW}$@yvT;@*3>!e7vk$4HTQJYfK&VJWx) z20lp1S6b|4IlXIlQ WIj&)g!|LyP&r_85-w4;_Mg9ls*Hn=J literal 0 HcmV?d00001 diff --git a/__pycache__/logprobs.cpython-312.pyc b/__pycache__/logprobs.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47f52e4a545bcb2c27542d9411f51730d40dc0ae GIT binary patch literal 9992 zcmcIKTWlLwc6Z3(o1!RD@8{@g(N^Txk!4GXAF}26B}-1^>>^4HF2xy1wE2){Mz$qL zyK0)EHe%Egwy0?&Md%_$6gUg0j}Px(+U-Yx{%|a1$qZ7gf!zWP+7Ei=-8TJc&$+`H zk}{pg0=)q5+`0GMbI(2ZyzUwPOIewdK#8#LhX135kYD4AUOc(Rb^}AmUBVNdiI7Qr znj%cpG--;OC(TiIl4Y>Z9I-^Llh&wh(njCeh&}3Y-<{JO^t7Si8Uq<=Fecykm}6@Cp?|InONG2tf#ZPF8BY z#=DPq2$j<&-uVtYSp{t$v|Z4y=G{W2;KMn58Q}5Y4Bi8+8rmusOhRoQH}4fZJV5ad zo|*RWm4Kmcf${skLZZg}X4QU17R2CGM98(|xTuz&pO>*~Lby6F#6p7Kq&oPZ91KN* zlB8Ou!$O4jGpap)O%NmTARj_xddY2zTx2=uYJKD*#B`YhPDVD?*8+h}}dPK-}2{^yz4bX?-5<02=`&&|a}IUJke#^;3C z@EI-?kIn_r$!`kYP<%CIbp;i-99kcQ|x_T{DwaTjNe z`CNRO3(dwQm`;%xz=%wPycQ1yrzrBBz+BIf!~6Wk6kZ6$d4Ug5MB3b3SJ4t&IK~S% z=q%NiqiIZKaY41L2#4RP+5>@DFe(HBsxuIX#`*aOw%vii)%jo~H)0C}_;?7qEz@vT zvRWPp1Y@zd44jI`q(DG)1D92EI3|l_v?a;n$7CCtTO zKR9^k%0XdoBs{fuZb6=n$M#-}M523jhWE`ah^U^lLT!|GL-nWR32U|3vtHuy{mEFS 
zwsB?TX2&XfYxgGGnBh8C&ab|ds9GD^VmrlJeR7ddQH_U@?MGCvi-pQWP^>;NJ%BMR zAXpZ`hT0(5c{2*pGZ2mf2r7EdKsXB^s31NA;Q|PEZWU|ERmuQhYtq1b z;90}F1RGeGa%k5Ibzn7Y(Bp+QE8bxZYgZ2=l|lo|CotkG9O0`3o6vwe1uWHorJir# zYv9=k&sumk!LtsY&49N>@FL#;vL1R`7i|7U)%|KDD39jUHaGxQgz5nH>Z&QBh6Onh zt&${zqR4SBIv!`B2(UYG*~4okAG?I+MKhyoe{+W!_G4#{u9^j>`-DDRAubq=&%*-_FFtiy2pQ)BY!iZ^ zS%V#1G(0ma<2*SS2IlbL>1ja}iZD=SX*2+GQ4WB)42J|M0NW1ee9cibC2;eScI?3r zz({b$dD9MR15j8>YGG9iY zx}y}(OD^kiY5~l1FnJ3=a?K)4mCuep2UFFvkaQriEtAj zq6;eyD?9+P4J!<4#Ac}ctjbD4WLj*87uBxIJ)I=6(1%f=Scl+UP%ZijJyM^x?l25U ze+ShHd2Dy5?G1{(A#HC{>}?-*B>9h`_oJKb{af|}U%0qT*HF6an9_CZmtDsdZ`-Op zu`A=Py6aiQqzO#bkr!2Q6M_W<@f*PLl@8%#{@`l$DQ zuLhg;b|~JCw0F1S-MxM=z2}&+=h)}o;jDvnpJ1|1;wVeo>lAxk+TNM?N}# z|NJJm|8x6GG_ndA$_0Bs<)SM3&!AHOzbYyl%DKcWnU>5;>;vRD6i`hGR1@BoEz8zr z+p_(8sHY__z~4hXtwpm+sA?&ZU$Pa=`mPFW&(FF&PgSghLYG?a4^**hPLyISx~_e* zVo~Lb&x4y01#oDw8Dl=`Ah>fuj0$`>DCZp~Mo$pD1jPj{n83xx{0^$0RC7d(L^b6! zFNWtd)#R{`DkRlKhuDK1RBKUc@b!eeFs=73d1ScoSi*+S* zm5z+!?4LokLb5efKc1(uUk(gr+|{c$lGW>;-{tm;cbp`=YYJh5m z{JQIK%H4*{Ja_9{s_uo)-F;8Yr2B|wzzR2u*eO$bp>dbU#uzZAka;kQMOVcM;%63L zJSE5y7n_exfd_)n)QI{Z{Ey>h0RwVS0mDuCO`2Sz8v-Vv_)557ZgX(W2A32FELIhX zVy<@@q2at>d%r2KT=+I$pQ439X6zv&fGrI6U0qUYtWn_amJrgI+f19CPV(_jB5@CnO zj-rj}xJJkwQ^*u1A=9OkIn^o=Coa)kpNwXS7QkG>8E8;2R>IsHV*J zbo&$w6fK)33gw%Tjc87%Ui=$atwg-6?UAQc)7ba9eBjNG-p+f_nW+|TQW2JbmkCoy%9s7iRkmDY38W}+g zZ;>1I_7bC8koU-688?HsnUIO0Cv+X^XIp;jVjmb@5$! 
zm)VcaJKUd$V*l0uLh)NQH;X>Bl=EDyIupAVjD*pAK^l?fLWVfMM|FlJxKhJ)JS2!H zI`oD&BkaV69H*W;?~hdrB}zo&tU4}2a^<8bLXw!pJd59=nV~~C$b?l#Gz`fITpF6e z8(NdYCDZyOp}#>j#v&eH(2`UH{E9w6jO4>shZ>>h^A!l)9I0p83+(3UR~1m(v5Mm4VZr zRX-XS-)y;{42*A7B(5e0-&;-%j4Le{GX00r{Ub{M$fs{S>OZ&HG^+HUTfdrkEm{BG zrBweprD-(N(DFgwy}tGOjlopIVWhwLUUPC`qdryta;B!??&Y=1$@=xdRLy}*Ye%|u zztXyYWANddsn%0ZD;&552W+8+_?l8J`~RW;q2=#i-gqI^bZoP7IOQJx`nNB@HoyLD z6KQ>o`C4rnl~6-|KI}YM$^OFYK3Qx3#a{QxCT9puvR<%FO*Gz?3QaU`sL0!J;Tv1* zI6-sGm^#YKK6utbANesO9tAXdM8Tr(QY|3GkmbWgjQ9au8{m14YcodPg((tRhZXXr zr#4l0c++zvWzR(qhW~)^@4Shg+$MM-BIM&>_zEG$qeVLr!EKO_bDsHCqyyJ0@XUbW zIpiB7e~!}%$Una6IgzrT(1M>oi2Uvx`Jn%TSa1xGZY(?oO1yY>C)HMM+K!-2>s;GG z>#IH zB$c(RSKseQ@adKpm6jJb8}|LOV*kzIOl3{FvR$ccPwv~S?Amm9LF8Qd($_S-@%G@$ zppzn=!>&MTl@3MD>m4viAl2 zqIU(=fm@T~@d$i849!ArSHuGsF=`k0Vzmz|j1?%8#3R@`jnz4<&SQmQA&z5(;V%_2 zEmwnni8?0^8qzF?9e!=9u-KpU5Uy+E;#TVrd|&EMnAZk3*`Y6*+c)~Rnh&jR^!0RnVa(JiCYVs?7$b$)w$JlaOLz(e)YhuE1T@WFIqY_Mz&f8 zS5DsSUp24tYfi=0xXBKFv8(st#jRb(SH@OH6263duR*Em++>euS%;-R)qC{QPNnx% zFv6Dp&-%CVk*)Dr`XBphk)*5z+TbR9wX5@Md$KlsvlDOqouAzLNkUS}JF*V!a+3PC zq%SGoZ&2!bvo7p%6Q^gp3|iZ}&6d4cr^&J_;oBzg00dS~h8;u)9()W{eXT3(JpiFaR@#E6_`Fu_y_e5Nua7KE5xozYUK|`N6!ad z9UZGqEwPOY4N%)0+<{Ex?1N4D7djLWw&u31a~HVRsL%D*9K{2~0CAT!7r=36+U zCHV8cxrj61lHY*k#UEfr&$(FlRGk4*Fbdz`@(g$rCVT=v32zd>fabPzbN{2}{@bs- zdnCoa@~|B$x@Xm!`7(ya`}Ui!fdHD$CZq6qdY5vD3_w@fVO~TR~2f$i`Jg$0k;T>>8IuSSD9! z{?oQ(*7KhXSzrMY4e^d5qe-S^=Nz~^+3z^+uy>px3%#zwvCs^vec7?hE<2a(G8%1d zu49Q^a(-eixKP%?^~|#D1W3nnL4I<9rPmi@17W)d0fQTrtV}7UR z_^1nf9lIy6!mrwzAW_@jg|BO14N#uFD4r;?MK?iW{1{>E03s%-yobj!9cgGu;K z#on2GbHo0y>(iFc?H3;FiFvrq930A2H79PYS8sTpLb9>)_hk^IKXF4#soKAKIx&-! 
z*WbWvO#S=2GEE&2^k>?4!40KRzo7u+Q=IcVv&-T9`gbO1N~jqBIMjHY^Z4sUw$rV2~_XVdyn&V|*NW?^p>N5V;BS<1n*Fpd*-0dJ2OkJ|-z(08Op@Gz_0iKo0 zViuU*B*oWo8nq4=aKv+gT0=b;P(2gWIOy-xIErW>L>fe3=s@|u!>_CMSF}I091&*$ z1Vu}Ne{GShnPI>q5vJl-SCpEb6!C$(HMyRe78s^`^(gu?-@$FjkTTu5IoQlDLEzMqWK&&-9`-9;M-}dv zQgLj>^SGj!b}HPUQZcyV$yzN;RnoIf;F0y3n8VnHN7mWOI5Rb!$sa2<2eM{(d0g(p z*dl9zHc$X>5LRSu_+}^7&54O*|Gi12sweBfE+=t$vMy}7iLWMGhAj`VI-iz9i<-{= E1H@41C;$Ke literal 0 HcmV?d00001 diff --git a/__pycache__/outputs.cpython-312.pyc b/__pycache__/outputs.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..738ff75060fcb51e846d83b7b3b52c76cb1c6bd8 GIT binary patch literal 16434 zcmdrzYiwKBdFS%|6!oAb%A&+8QHn&1k}OMpDV7pHQzx?Q)^5A7?O2-kN}^1W%Dq%< zG0NgL*^sO$DsMk5uMML7s*Mz=2NYO<4MTo(zX}Yzm0NNb$(o_ruws9h$VGwd$M${a z+r z%Uq1(Fm8glDQ2c|bK0_Ojag~jlC~||WA zVTkP)ttRpm5xlPuq3LZC!@kWvB-W+n`XIM?S8kBjY=OL1JulV`^xZ<682o|X!1d5_ z`=DIA=n#84f%@BK)}pvyay7hP)oA z*;i9Dtkn!bF2Ad0Kdspdd3{i`zozEEnl%#s6?Q$sslE%Vxy049cv*aIRm>zsKsl=7 zP9c#?B-065j+j*Q$;_H+e@e`VQYtwICGA(%R>UtQB#63GnN%*G&Mqv75)^H^BxRRZ za@0=6pWAlZ3gHJwXc$O?({rF0@GE@sn$D4j_q zF|$c4cR|cS%Q9eEG|c5(A}1n{lWZ**)E0*9+=ujSB9a&R$+04b&+zKpFFAV9*qR3}ylE-D?v-5m5 zBl1b3tey)Q30`7zJL+=r5Kc-9@)V6>DwPq$8&fjn{Z67ttS{G~Sr9`8vx$H_!L)WvCAoGbFFD8_n_&ZFasK+^8eITcJCuA%=2_|L`Z9E_rZysY z8IDM1fzL`joAs0cq@cUW#HtL-g{xfIFg5+pWubC*CCbm?uF?ikDQonAp9kV{b{RG! 
zP?KP2IDi^bsZ27xDquf=r({<%Ip7$Oz{byByv%x5nTT{&O29xUD`4}0yg{Eb*8S@V znP&sY^7`(f^w`=mVyczyX4Qf#tvZceIH$H&wvN7~RENGms<&n-RCm?Ps-EiJsWwT> ztxB1sCV7Ap5D-R$Aj8?|0dSkFlO_1p2Z5||m&rX2*rwD0nfEx=B8%yH87c9fOpN80 zSH@OSD@h2Zmb?Xe(c&&aV(v>I<~TwTg+z0o=>Nj$Jlt~(UmpTq3?-&XFb0d z)w2!(dASY1O;Yl;6@9&mulKIM;2SA8Ms@&AYKGzi1RcuXLIgxV*LX1B1vLP2G(n40 z^zdDiV1^uvp00eMjs7SA%DpqxPAWC`Hg8v^>&szyWi}&(^CixK{C=;>U*r%cUJU5-tV0w=8!&+m`-PQ*bkD{&rl?I1z&Tb5ES(MiPZOd@k!dXW*`*#R@$~LFf zwHba$;KTZby-C8K5%iO>GYhEQnn|O*4tfUFRs^$X2Q9$@S^{V%q7%}td5d6!w0q4S zaj4!)FgL(wFH#*LpSeW>cH3iaI`w)AgZ`TnzSJ-hYH-RBCv@q%NVE|Y4dk}O%x&~Tw>xBmlCv@_L3z-C>8*2(u-AqNS#s(%D_`A~cw6hmWg-P3H)2&?DLIva zWdPM(b;jdVRfmy}LN1*u)tpGDBPI!DlQaoH#;)@`V>z~XTzn1~j=z5D5IcvZrvTg} z@A*162e$m*?|G|d+jsC|Vrq8%=t{}k@||a2dv#E1DBk{}C!%;F+n(Vav+0Pd zOiUWPn2UI%0U#qihF}l@43ntVTvkdhsXiOg2kG}CNMjE^QNjcTSTAGDwyVb zgtk}gv1Y+95Mdv*i8!0k4lJWZXcaBMPC6jox@L-Ws$R{eV(cSv4NgD<4w9%+A1f+a zmg-`&vlkkg8=7th1_9$EtCtsHiL*H*^KoGp1oz*){dm zkZ^7IUAtGI(x{_`uCMW8EUk4}*IZbSAys9~pqT?&KhtUX70~5brFsQj(>WOlmf&^U z#4wb0t%$CIPVrw}6k*E2A_0MP9V~mp9BORXEHg8Iir4IY9^4PvWQv-b*HgJgN@rJd z)p|PX>F;Po;N)pLD9+}Ck<)Pc9O;JEXY5C$hCw%~wB;MgX3}ew{Z-rEXYRAwu2jgd zrW(OdZ$7iS98ZEhD+;V>%_s)+RaiE34xo}O^K|xCBym1<0}M(i4~jt5DY^E192A6% zJTFS|m4uX7*66S}wHqaIIhzytXU4w7>qS}j=fG@eCXS>)A&DCb$vW6%aqw4ZHNXyw8+O<_*j{l%8CeH= zD{fd~>kdfaNN8AhLP{SeS$9FoYvj2h)nuf+kU}eyhBXo~&EqG$~;$iE8hy=ZySx^8Xc zk61T1!lKeHbFcTVo1}kRx59W`m7eG^sM!FUhf@OMHO^?Ra%!wg#+lJ+5Rz-w<2nbB z_vg^bKm+u-K^=XB>(=UX-l*gf8cB5l`u!<1f&ZA7(AcgE(ggIS0sf}xY4T03ao!%S zH=v@Ah_;TNl)3;Tt?(lA5*nm+0ZLS6+Ps9OR9%1)#<(*+q5qy8Vyn2->S2@0hJ%Fnolr^fuf`fK0(#$y8 zlR%I%IMpeFP@-=6D)-Y7%!YGCBs`X?C7qIUstpdZm=Tyqiu$Gux0Gs2tgJv@#Kw5x zMXY)WtNQ5iuQ-HNE~UCENg0nm<9CeBnG=GL*Dq=qa~}m1+SBrraDs0VU|>$~cf5b< z-RvrMjwtZoG4gG1sc)p%_oUKCSyAYN(1Bv;(9c4LUcLBU?}59MKRj{oM6vhSE$;@m zF;!~oF1Cf0w(!oVcV+T-tO!2plYpKUo}~RmNvGT|Wze_lOKnZCc*+ z-0^IE{*F)eca{f9-|_MR(z5Ti=a#3~99EjcTNi)cJo(WG;U_3lu5q=XT%*hvr}ux% zeQWxCyKmtj8HW!>qy=#7f|{{pus|J8UA~}lEVWiJdJO7-`mu!Dzb-(I2XY#)7s&Cx 
z(Fu;Irn!7)%>f*en4$s-=XBl~)eM2@PulcoML(R*w?ylVi0OP+J*v8Vt3gE9RHr|2 zMKvd#JpBncp>>05Iu99|-a4K4p?A7+finGxKg#b?PHE$NY1f)>%_G(6Jrxb<49s3b zC5oy}RPU+tW@|3Fn)x$aa`%Cp{1X7^#U8wC`JwNguP}HFJkQ75OYQq_U%Pc}BMzwn zzQp$z`NIl-xDYx5h@(BFp1xwwxY9FT=$-_`;i1w{v^aED89G}S1lJDt6c;Lm!o|>p z5}GLV&>|hBj;`D5x7If@NV3-RU7Hsy1^HrdLWH zdpA+;aC^^lWwTfJj9ce0MA{lTyqJ!98K*LW02Kn}vmV75Zg~lHM(XI6P(Y~`0WQEL zWUSI0f@cx*BN#)_ivZ<|G@}F1dWbv$$Nv@bk!c?|!)D0LDXug;RvL(u4j+fvDtGo< z50``F9QQO=nmqP#2$CQ8JIYo>+enLlL%8KC+cD)JU7=0+PGn0^_8%xa5#=KLhwk<( zoug$pCOxFJz0m%c(lS`~V!Da6b(DP=YbLFHp>INInYb?~EmP$dM7ENSzO6G#`%t+J zlYY_}+!XG(%KI?YPQrs*@>`L+g3@=W+<~Y735K@(cc%0{hfll%nWxI#nAt2+RFnNdyKf6b_Ovv%LT26H{q@WKJUP$fX};UcJNa^(ZpQ# z@L}_bQ>mKb7vYtGT@dFJV3H;QWdvQ1+c^fPTIZl(0>_;h-w?pd0y}bJa0}W6o9LK# z33hO_IR(cnB<5OkMqH}ybFDcdiFaV>Tqs2wy54lxy~c>pd4oQc&-T0VL*TB6Oz|gq z&G(i9_kAukp91q9uXB+~9r21+K{J^7p&BnSP{Te}YRNEDa8IpKTY%epxC%#aT7^=* zb=L@l^?>pCEAbqbYcNt zMx>H(T?H4{*Q*>@`hdZYs3|}Cr99)G)Yd!=YtQUT9%4y6!FoW8h6?GYAp#Z>TJU-v zXofY`4O-`h1+ zS5LWCLc5$c92}L3Kn*Xa^`l+M&@ZPAgQAiJb!=~VKu@sGms@)+1iLZ*N1!gJ0Nf-W`r09opf_Q56m+ctSMN`>{DHq6zvKwO_5I{R#i0uUJ zI!8yk_-6#O7TbwIq{6~0&%=;)>({Qpz|FwG>BZ~YTE&=CbOCF30~W4d4Xg)nUce!% zZ(4IioT~3E@FD?U!Y~6U|Mv~}IzG1%^>UAVEo;=!ywe7O=i%BcD@B26rIL&2VNdWn zU!|ApRd)m^XwC!EPK%iZ@T&_VT7KZsXYaqXOI{%b?l&19HfgWK=yeKX!(emj?W#>0 zXVg4IHI@mUQ=Ijkn*jEDBl9BH*x@5^*N(#Wa93QRf=TBM-{h)XJlvz*ox&CKuRzR_ zd=z^OK5u$EE-z+R(*l20g!vjDIa9SFY(gVecA;tkv9PSVzLZF>if1JWZY~#rmTJqv z-MgH-XOfmM>8>JObLHxshpVOd)r2hOJL@*bsFAe;B;>~dfP&oKUu=&k?U7gAC3kz# z-KDs@HlO>cn}4r0Sm=IcyLGnUnf(wRHM~#^jVqz??a;)wZ?fQ+WZKN$*YQyxU(0VB z@mYPI^&4k+1qz!*LS9qPs8l!GFmXAR7xP{9gR9E?cN`u*jX-wQx-ofs=GM$Ma4SdO zZux+_hVGm`hcO`YsU(}xMDvlyamWixmj%NP zTN?e9t!NfFdbMn-VvW`{bHt*y8&g}OP^e9sLfeRSpa++3fFD5Qvo~zcf?dL}WcYqr z8X*XIM6_L!naH^vMly)gYJ|(-*_@BTm&zfWHjROm<^OV28taAm5zh&`NKv3h~gi) zn<f`_GoThl|}Ol-d`@+S2QDtBBq1)CBuQ?2vf6rFGz41L7 z72dR8^?Q_#5`h`IvxC0YS4auBB*^DN-;s|AeCeM22r+D{Ve!UWEe1LNtrFXR8ehAE 
zWrV7~{`@-scYBC!`rgJU{K&E!&iGaRZnWwV^Y2Y^S4<$Hwca4CSmZ7ecLGVokSp+Kps>LunK|PJux^jN@-nRf#Z;61}RrFrK7%1TFbr)< zH?c%70z@_57wFP#BsBG3K~-KB8>TFMVa&{1@K@5;#g~)xS%zlFeq#@bD6>+JEDEu_ z$V^x{wVX;PB>XuK#nzaxaxwvV{PSpbX||`8miiQF_B}do(A3~t9~!;3(ML}WQA3Zc z)HDoHL$j)IxL>nxd|K)!!3NvDmP4ir3ihoG=(m*MfGm=~)A z-Rp?Aq;lf&9+Ix(a`oa>w(UvWw&$QWh&UI~MO?9cTxlQQZl7pqbDrLAJyY?+tA^9smDY`drdV%M$F>vDFEPR30XKT3()| za=>97M>zl=O%6b;?B}eL%;wUpccjwzNIx>G*C+D-JCFZ!ob{6k==}e7JibTUHZ_w} zIVGRRh18jQxQe;=7X8DDfB5cPF?vdgp4#@G{;x83It&Dtpv@}gj;P3EsxO&Mr@_C6 zzo8jTTustP9TKuCDGES!qbn=PR6hwVQSeKxt$_QHT_PgilBN*d!q9R05ColjvV?}Q z>c$_Ljj|_p5;8Q^t7Dk$fIf2i^BlEB%a9VIn)%LtqSH?C^s<@0 z!}QWq-A5}tKR&A6X^*07L}quP)RQucZKA)3`AJZmNnb~RXM_4bm@Q4EJBmTJ2k2bZ z9!}M90{R5MR+^FCgmN&)L=FNhn>mjAwS#cc_sP-sN$)R6=$FLvJ{kCD68;4__CATc zPfol~hTbO=|K#*<{K2-f=cWzZ(I)qAOoJSl#{l3Uo7`oZyaGQ_ha4Z~0MK$E_M84C zj&prPD2#!-#rx`&&FKf$$W7BjTQBF@Ji2x0&f^aWM9ZIJL^`-rT$v!$$YAv0q?21+=c?s3PbGSK`h%f PcHgUX&HNfEQ>Ol(1`NT1 literal 0 HcmV?d00001 diff --git a/__pycache__/pooling_params.cpython-312.pyc b/__pycache__/pooling_params.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b2a547abdad285bbeff7662ce337cc826a62024 GIT binary patch literal 8772 zcmbt4TWlLwb~EIV8u1}g6h*zMAw|lf^;Z0fY}rw)*p{6r-q_hR+y(^A8OgNx%FM`# zxJ;r1UercXL`Dl(MrwrHZUF^eV7*^)|AO|XK!1jo4GpH=)@=e5=@-39`{AGVoI8An zE&_C5-MROid(S=h&Uw@SWVc%=coNK~@qh49)VDY&KP{!s-X?TzQ9Q-d2`Wj4X&U>6 zgdu4R8rwCj?O+g=o#SBFK?7Nr*&a za{Ou(R{^yB4dLB2L6TozlQV1bxp;~PpcR+KQ!5vv(rT>ABSlp%j5Xke2~?Qosj%Tq zxVW&9H-s6%B$y49&~n9DyG-x%#@{k`X}!N}=z$Jz$O1*7l{XzDqp(FVS77E6%qrL_ zu$B_cUISxGu(ldlYYFD4fmup0XU!hg60BWt3FbPEY$Z&G;I8a$C$P3~r(gq~9i8(0ApCeh0N}ctGd_nl3WK%Xa`jJ;=|pfp=eH?v|#82L&(R zd5x-!Iev)uTw}t%R*DbscHp&3@blf|BptkA+06G`Glho;wjRUqy?_bueSAMZP}z;) zy+AEkr2_j80%Vxyu2JEAz*i4Id;l5YgS;Q!p>;!OShJS|=6qC)CZ*_qf^^KEU*s}4 z6l6h^xaG9SWpt_Hk|?FpgvG+aGcU_x{L-2%NV8;&Fj-tn#X!a*88MyA$PqccDx{=Y z4rEyt(+P>w;pKTC$rF{exc-75b8?y+pT-tMo|7*N`mXyVju)1rYYCX3Zvu-sDV`OY 
zQdmU-tDt>cI+a)_WD-$H<`9l7aU%&~xt3I@DVe2o3V29!d|ZlNNW=eJEjW6q`P0Ot2E11`~P36^}E&`HJZu5C#~6P8H5h+ z2=1YBN<1kFPYgc06km?Rxo}3^Z7hBNKk$IrG^_}kr@8EEmgOQ42Jp?qTXGL!`H4s^+>DnOhkmM2B`sk z5|-odM`F>~WkItE@5^E|vie?BT#+=Zex}GOxB$(pZxSn6Yhr#trzs=+?V&%tMXi9% z|7-dJ6*4S^jG_x*Vh1)JXud+P#YjS8Swafjv)M^GnVHPQGvhLBCdlLOuN{8z$m(HX zG7-NtnOT=Fr&E(x6N%(x$wowo>5wKe>snhhkd-(e`SKxnU)&P7BXojq8A!l5{h-rB*oIA zpjnAgi0IfheNdhw;%*`YAe4S3;t+H-=T%TazV>w9`qMSy7m&adG#gaWOmz+A&ncZ# zYEMY%-miLwmCk)?|H$^J(sxK5oK(D1PpxLBrAV1uEn*NB)!hT$0l9~6=-i^@%D2u@ zcT0UZKb@UK089;1bN2 zY|v~_-#NOrm`^`c*a5<_W+UDr5+gy4h@Uow)XNVBOym^jf#+LPj=ECD)Vw#?9G#;g z6-a8l`MeyxdKeh@@-Vv@oi7Z$AxA@Nykq*nC|CH%8Ot~J2`!gGI0|?vLt*dg+@_Ry zW_1L1cEi4S#D-_1g*O2WrVL)kb6U%z>N{sB??~eedlggeHOz37;njCJwxFqXDr8>D z4!)gU<6=>aI=S>!L3~e)%L0eGJ%LtdE^7rFEnnuyK%;${Ig?mhiKi}xSP|n8&GPeT zVoi8O6w{)Jrh`}ru@6x_k)#0TAX0M4HF}_RMI;DUn;^O)z(jm`J<_NeAx88gdE8W- zMDoU@v?Af9p%vE^&a5xay6c5dhrR(*rC&osl*{J){KXqDD%O6ryZ`#apD(KR_RrtG z@%C2Wf!(XR4{Qg17y2r+?Y)<~&)8G_L}{bt zEZD@d6nui3P?5XxW+>au*5cKFV9Xg8sCN!RkCDsy0+lo57@oRY(t-{8v-TPsRg3$W z{%laj(~u_zxSE+j@b7;plmff8w1Vx!n`Qx$DZweU>s7*;mb zAJFv@2Sh_mOV6Ar`do4i;z9_2Kx{(>aX)Y(jzV+sRH#L>XGkhmHey6V#{j9dfG>-R_w}_sn&>Y6TdFFh|Gd>o?XFM?hhZD8ol~pmhY)zQNtTsY2h> zJ@?l=zwasZ9aY%wZ`*y>=N~&fn;+z(+wL95gyJ})I=jCEh`jZ)JhS5ng780ff~DVz z{-$et;6A0no=hJiYV!nj z#8+T_`Eg}>{t^2!Q4RfmP>RSU)p?AHrpypc(fvP2ZBt$0jl?6IT>)t(a_s}THq}^u zDrc&f>l|~XB2%z`>B*U@mielye2c1i4lQ;8ThXv~F+ubh8ox`-O^%%NvVr6fe+ zo_T9Bz%hO)c?HadAaq0@KRODaC(UVFy7` zcCEpJHMs40V4YNbLyFD&&z)nN^y7}+tpoY?{K8I0sL*jxbr0{l#|!T9?dT)-RMA9v z#-D>6lSEW>t6pcXj(tKXwE0yze(}ZNGDWekXYRk^96`D=hu7 zFTn_e_>zCoeM$3(g$5h?x6SP<#484{rLbCmU{(_!^6{^7^v9oo-r=Wcc%+`H4*)w& zH?!`b5zL*M$O~p+2-)zUxBK{o<{abPDQhy=PUV^Rqz<1 zwPqi-R*%;q(jpOE1Lrv8Tj7>Ff%_))v<;TrawT3wDm8mpU{w=}Ur=`{QQ{|b$O?5s zaUaYhF6&~pg_ICEe>Ud-vBEyLtT4vKE=N-k^{tdrV<>f$%NcqK4}}083k;y*BGe=# zsK7`QS*O3jWMr3WYZp}_W2Asux2nmDVl^;#Ek|_1q-| zMKGb{{HrQaEnuo}L15+gq4!%LDRtE`U zyQd1)DTq2h|M13#iX*7{f}6}=wxLJ==*CBiWB+zenLMp_4efSKKj@l%+#67PefhCn 
z|4Rk`ONwv4NEu$H&(rYg`56u7r_TfCr_)okK?b_(Ia2T(*)*yhp3M_#SKqCVZho{K zD*zSmi0TdI6T8C;h2aGy_zIY^Q}p=b@u_9NaUS278N?sizkx zdU|g~ZbtI&KIj|)K9t~@o!&*oy$Ghx6Dm5W?wMjIHFbstri&v~n+rp7rFXtypC>`J zzu@o#N*&(6$vkonp@AHQP`qP689aE8|2qEr_`P2!1M@qsmlc~HD%9n~mDs@ce^|aI z5ppRagp02{&9#q2$TUO~U>}>=M2MV+^%y#b_LcsOtwsspqm6a3h9_sZf*dzI)oc`E z;}M883=nHnJU!Zs=5_(?EZS5OahU334afw=S=dK>8Jm;XEMW5rG~kUZ5v>?ND7tR( zG&bnB%ZV9=tKuRy!~vs6KnF|`Gw~3NvcnDX{hUni?_sX=6*M5=KRqZ_2L=_kS8eay zZ4W$b4_pUTtW2K-!Cs)Jo^%8i*KmFne02K@pd=k_pd|J-5J8()_pud!2&}imAMvp&bg&Bb z+^>S)tBHKUmxBDInGBI{NHz1SfPp%~b0SSMV4CDmu%-ntO*|`LSirQw0U92_wBBil z>XBusq1-b<$^=>4ShdnTm$gG|STWFZ*>+;=YksDh%eIYiH8s$=Z08vGkj64-3EWr$ zoy$7LxY|}p;w8O2syKgPtT--bgax&PX_6&UKz3o-HM_?H0ReQZ(PTrnWdiKL`)T?^^yWZ)7 zcUl>k0l*7=B)=Lh^o=ULV*nWUtN!6#|BD6xi;8a+07riW=vVu>-Tv`H|G3gOu^lb8 zv-att(bB90h+;*A@KxwMuXrX`gs+*fVcAM*C>cPKU|7nIovQiCr>3~fU5-i^2|%uV zYH?}))yV5huf7f5Zvc1@(M)NUD*Q*cZ;wPWX-SU6Q*jwSL0{u1s6qZ4(2_8;A!cIN zF<5X6J{&r*J9MHjbmD>I1UR5pi)x$QGs3&zI6;&Yq9HFbf>Ir#F*w5EQfR*h-yI!! zB%)a&=-Ag1*tbO@@2*7?B}_{s!lz>(AgJCV=7E|qo|3f|tVCsmn8v&YdA{a|L`Xr6 z{ForY^h)_$V=h6qFS>DgCpPtBsWFK-1PsL7qDJasinTN%s~W?{W3u=@Yrcz_+n@UEbtx{1Qfh?dLX5U%$AyaN`#{%phn7%w;y^o7No$ z1%G2{b7AYi&2u}uoyY`nS|*p#+TZoYqWB7dOZ+PA|@sKGnE+WlL>)f*)Hq-|Nvr*V{5ysI7m8#=CLp+Y z++%u)RtH9lM$$QP@|%-?x4uUP#VMR|@67#o3J1>ZQ8@VHai^)R7_gcwTSI#kyng64 zn)F`>$d|kb|Bb+&PJ(ikYOfOfG$o4B^++PVDui4bgY|zAowsI&JRL%QBImli5jlDX zaaQ~_XQHNRFJh#kxoW@@8B!G^MPbcN5K=Um!CIG|!Am7`Kv3v7BoT#PGhcuo*+9;T z_;CQKiloLK(^d1hm;?k!BqgC-7mYMc{})Tqz5hy$7O2s0skZ;1rvHHdX8u6=e=xsk Yq%A6A{j;TymsFPr!ixWHK@g$*zrMX*)&Kwi literal 0 HcmV?d00001 diff --git a/__pycache__/sampling_params.cpython-312.pyc b/__pycache__/sampling_params.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ce8a64a99a9488a706efdc69810e576a33a7ebf GIT binary patch literal 25874 zcmchAdvIIlb>{_eLE=RMB*6FU@=fqfiF%WgC{eQYqHL446XbzFyq6MW5MVAqOGKfB za-50ex;5D}HQ6L%X2!dv*WOy)$xf7iHu9#s@y>Q;0Z}PYRM{$%bawO44s5DhCo?;{ 
zzw_OD@e=5!?e1Pl2j~0F`QEQ{&iBxNE-SNe_yYXz`VYUvalask_L)><=emyL-sD8i z!-+akAJBPp>{sv6zM>IZQoG57gV`qM*fMdSGQ^9a^pmN^naWdQzaLreFsu*q!RL|FVYIK~QlPaXz zcab;xJawYYQ!kcz8uZ*#oLK%UC)%abcMU4FQL;0C2mCAE)hoX5DnFd3Nu^eT>Xe$a zP#JnN3+n>4N-EK)o)+d?4c{86Mk>2{T#YX0N30cV->ZAKMDg;pvJmwM(eSDKMwP#9 zN$+lq_DuWc7Xtp^WnsZ5`{t#HB!`8$kSs)`>k;9y6qICN#2*R{{)&Q4xpgVyoL>rv zZheYB6zlnIiQWY{v>?fmCGeOhg27P47m>u2VIsH$CbQ^^_+|sX za2O7&Kj@Em1EI^8C7B{PAG++1grAW^vr;%5A}w*JvOTpQh{nKayzwfzsX4)`WQ$3%VQAr z9$qwfj7z-Rn6hbgF|&mHQ#<~Og8quIDR)WAP|Bn<{y3Vruf@g_KqK^-jb09c~9Palbh-6q#QbTJSFnM11mPDKc<<@xY;j zFCCDE0{%-w3rmqJq2SQVfx!GwSZy3HYn<@l!jf!93=Gr*30DBT$|cLnmrpC8(s}co zT&MBmRRa%S`gTUykkhK+c?>R(LEQ|BDXCprh*Gd}h@0Dr8 z&*RMDDJLGY!d05bX;RPQeXv+ADqMyd0FQ5%CzZ6bYW?9iiA{e~E zRC!pU1nsR|AAUh{>$zQssrRvRm*Fw_{w(Hysv z26&T;WN0z&syfSokoiR2q7D?Tn`Qi|n=pzxQ{@meinbc$kkl}21_*nD*41z*6A=}HnJS9}p6 z9GaH|Hu;4GpDYRUixG_bz>*+cpA9UA{Vz*{Q!>RIeetuQMPb$#6hc9e7sC<|NqEU0 z6ot^7ptX+#m(quX*-#L^K7TMGTw07E=0L>15Rh0hG|Tystaw;(n^T7H;=J61MC4`y zEd&GttpvINQWj);Q932dA(^Hu(LWpUdTG86OM$tRaW=FVjD)HE1VN@rBvt11E`-7n zFPn_L(av4^_+TE}K7IsQ zCfv4a+n9_ydVp^`goH!bgLz$AMedJuZ|Zd1>n@&K z;{LVan(iIl)H^y%ITF^}i2RNrr5_xYsnGHeK$tK=SXP?cJVs0V^#%E8^w@6Ajna9u z9W(3=QVGTap;=!bd~`4q!8wZD4e-f-=RR5H(wxO<`G_yAFJCb}HgILGmC3cvEt{}0 z`M}omkskiP-l5j`T5W^QUGnK&8O1Rm7w_nl643nfB_;o^VwsEf|6_{s6y=*n4Kn@M z#5tThnVjz!Q3g027L>+pVcjaP#r_%HNghR#d6E5JYJPb?B9;#jI0TT_-trOnM4$V# z>Ot#z!+9&mZ`1gnqm*(R-%O2PvPa3U{rHQOe0q0xkkHH)EEL$SIy?-jv~zQZ>NUffxNh0v-O1L@<*AjEYaOd+xA@MVHwc@qJEtBr9A19<*6hmU z?W@<{ifJbVs?-vLWtp_(=<8M$28-V-7z` z!Kuu%_?P)KA!UcwN}BaGnOB?hJ%(G{b)ARDJg-l=o?7&ak~k^NhD7KTieUG|NTq3X z=Q9A`(&dUo#RZ}WO%&`C_3s%p_tNfi>I3r8fMI4F~*f%(`HhW7SS08s^?F`2IUd`; z(Bycd>~>jNuP+;VT*8&LBx`E_&RSx&JTB)PRkyDt>>XS7uDGd-brH-v1;dp_5eYyn zKeG~M8LwL>KSwFh^mCjoH}YAa3|uBc8Nes6a(~a8jNNGtKC1~~5JjUn2QiT;pG=ly^$z8;Am1s=$>%RTe|}=-%nPSd#>rFXXC_jX(`Tk8&U-IT zo&Rj%Z16mCArpw;pQ>*`cN zAUHieQWt8YgLu#$ZdU3}a6RSWN7R=gCJaBi7Ln6iv%pU3}7K@G#*fI0QggB62Kcav#SK(hxCLS6Y 
z7zb)PnGqnOe~akTBKon|f>;h^>ria}r}R>>L+TQHw0OM?_T_l;dd0jmdvM+#3NCMRe|fj*MaGVGC1hFG9M2ln!A711S^; zgs#DCA_*%NHpRAUOQ9qoj_XlAx6D8=I$S_nOSf0F!@#I(k8=qU3(qEQR)zC(R zBftie1-zoF45Tcq7~V^MUpQ5PNJ7`J(ZMoAb)B*xdD-WMIX|2-U-F6G zYatolE*MxH!OzX=zX0X&^XN4Hj`nzc1ILZ%1yiMmK>2G_@0 zG&_^_`%V4p-8^@rfnti)-xMNdjr*!a!?M4kA(FAqiu`-Vtl|@^zfmW0H|lfjv0Z*n zEQxW;da*PJW$2qa(G=4GnqzwSS>UIK%NjGNE(2V)7!Ow&XgplyF{A1-!ex(@sIC&Y z9I;ZsikJzoGG+#J#w>uYm=&-pW&^B_l>yeo$^mO*cEGxr1F$|;0oV|$1Z<2s0h?kj zz~)#LU`wnTP>9t4w#I4!+hTQq?Xd>Hj#wjLXRPU)y5HALqb+pYtl>t>jpl2dyL*b% z2z;MCt|9k+^NoTY_y1Jed$PypD|+&IP`Eber>v_t6Sq;EwG(4Nq7w(7XGHErQK z2DjVyCffHtXy5nPXs9p$dHcTAb1SFTMq#?GaIKvBrsX>)H;iwGH}~He-ReHL)p#i3 zIFxMPmo{=`H6L*Zkv4I*hPbusq0RNih1(aPh*u0k5wB=ZHZ(#Jhh@H;Ya5B%nv>0K zw=VwD-t;jiI4nu0Yvs^w-$v^vX#<=;cQvGq_=Sb4b6>n;Y<)U@;<-=KrQ~Jex(>uU zkFL+gC!hZ$Z6+@p=V*)D-5(h^q5V-s`zL8T`8qhEFW$0e-Ei0XNxFi3D!KZWwWd{A zlRim1;qj4+bJnJ-2&*QnhOk<$vN~NySUq74gf()Fz3C>xnz%h@3f#%!Zfk} zQSZs7l05qrP4R3oX$GCP7-%pSe)4>AfB2FBQt-V5Ke;P+At}!HZBEn|j~5ifca7ST z^a_@Ms~PK9HnDJ9@pxmLH@mPEEe|-*?UhgD5lfNH###M!Ju8Vhl3C7}`+N`+ckoPh zIlG$KNYvtll4mR)Yc{Sd_O5CXgblDli{T#%tu4YRNyeUtFqK;P?5|kzQV-grR7Bf) zGfOmh$uwQ?Et@VASEjtd{z#3+QanV2CS3MuIg%@jb6!R(W@82q*`y1;d{;FK%nh&^|H9Y$(mM&7MO-MDvO62EhG~MFj$$s-S0iX)`@gw z?8k>mR}?NYof2oGuF&4TEI_041%yaQ2=)t?p)>}^e^Vy-u*eMTr(2B=vu=u`WO0Gcn#fE*ERbGfl##Cy$ZU)X z%FYB0=qDt>W7rFP>*e*A zSNK1)CF`2rYFuvwTJp$hyA|2Cw53GG}yMJo!duXeCW9s%49pE^x!%-3C=*L4x z9sWKh84;T`Li@(3B8z_ivs*34ZsE8<<0Gs6jYGE&t?6GohHc7Qldqpqed5-}WNrPe z$=^Hku&ic{f6KgXetYkOvTn*o-Fn@cRps5bHYcpjkWQ0r9TZ~=g+F@xD1_f+Qwx}; z$YcEW_}XN`DkK}5;B^6B+t!wZwPj;8VeL+~vycKxjk-JZ+M%_X?WVE&O=AxSh7-f* z?zw(a_wVbrhRjQ{4ko1BAZ zPd|#w=GeCNBy2qh3K7^`wrvs;CSlvuoiKH8^4mQ}5?zGlCCBkow*HUiP#N;wi%M50uWBSGpqMwH!QAx+1)4>1uOI2RlizT>Un&6|Wao ze_c5#losbh{Xw`Yo;GSWM|_9fhP#@v{%AD|s}_ma5E^8oXrbS5Lr@1Hn0gocVBskm zuCV1;{26gj3;kQfZ$tb=!WG63u8d%qja@8bdC%=><;6LR#Fr8H^5QG(xpu@?gtIU{ zu?&1g1kQ{KgB_fr^;=OqTw(m+%7~9S`K^3nq&bUo7RHx1qKd>^WJL8vXcLHiqAPDi z6$!76sA2+X{6yiAt_meB60WeM;3}e;-0UgtUu5p8F3wpb9YL+0oFB2~ 
zi599wd__2m#8)ie$BKt8o=0}dHg_JWE6#_s>x;+Ba+NcWfU7O1cq)4HKyiPu@}&cq zTd1y!gep=u4aGT&)J?}OxhUeFlZ#^1CtIiyxhTR}Bo}l7tl+!3AEAr*qqI;F!!(~1 zqbeggcU5UB&Y3@Y^2TvS1ucePT0HmWzaaQdth?&a^Ga zUy)FSwUn2`K`qtn@2)w_D8-_{a#$qnlR1>FIcry|HVZMyqDXkT24yIsT!(XG$(*{_ zO+G;!quKIk80-0N(2A(Yk%Jo7)glTvGZy9Mpgmg-rUZ9PMO%Xz^%oi_Z0!B)h&mKl z9A%H-qKv>a%xHZMUj)jn4=VGBq87_5$V60AiVLE{eF<4%E5cD2(eEQ7`3(Tl*XX;a zA;+Wc!adhaLeQ5#H?XH)*z*N7q24DpV|V3c8y?X;PwvoWLgX0*a7ihoCeoFw@@p0Z zoIo82N|$k^>Sal-93P4iZDKpTnK+T-9vs^nQ8Uam`RE|=yp-V?7JAgnA?|)MM&O1* zUad!)^ZAw5%c<@rwuPLDk7_9=&1idevpB<3n1*89*qhx-&T!_{O0KB{Nsy@|I$9)9 z)+CY}BAbu#Xo(uyt?gK663nexs*VA*^F@c%-Q(Xr#Npx_&zYpMH7Oh&7e;bZ%hR05 z2+s^HV?Z*77RYY-hL(srUt$ke#Z;3^b-u|(FTaU+1q628N{J2tb6#YLE`MMPe=z!4#-+HF%R%hsPxcVW zwJTB(hj-aNA5Kz`{ci@BNCt&zDU#zqPM&1GBNHISIT#(tGeRmBN7(-qmYkCiWp++pWDu1ZEn0 zim;;ut^nY|MsP{l%Vp*_wt4$JQC}obXuev5XSCr{dh0vXN`JwV_3EL$Zmr|3uJx|B z+aK7wa3Lt(a{SL-_r#yfeK7aq#(U-Qri)vZ&%|xdBw^Fpe|!JR#A}Bc*P*)|KkE8n z*WKEC74fE%Ta}Y>+a&Rf-5!fu+aEfcUFe{}wb=eG`= ziSIufcb(fY=*jw}hxMz3bGVrC%y-}30`_?O$$OJOIsd`=dq?A~&uvvt#~suET>F~C zwG!E`=}*-3Kd|q;V*o%wl9fm9PHvAqn;3gGe&o6Mb4X|g`JxL@dTg^&z9`f{qGsTM zee{kKfQ1^nJF`7Lofw~vkIlqqo{u|T$O{GAleH~jZQGc=Z*@}u$7Ir3v+e9nID6xj zeVZe6(`qp1rWF~&wyk8*T0@Xlmd3%muI)o-5{J&j51#$7HSRc{mxRN)GV`Wq)q`f+ z*sHZ2vh^d+4?TC!f2fN$o!zQD7q^{*jp~iVw-1xSk41YN=C%3{>hIY;oQON7vTZ1FTO$+OBWDvMXJJGxk`?jItE*QZ*t<7Q z0w9@qOR=m_2RU~87;J}|qwgPk=h)q6@An;L+$Zjh{^ZyP$KofZD%1c|V1o8Tud;KlMWVg%{(F&t!9trh+LkFY=imd4K4=-*+A%VI_6aX8Bv8 z^-#QNaC7d?%n!VO=8cb@y7w$K#nWVp)o#5pqa^H?acjq>>;0y8n(nsUcaNdgnm%ZX zKl|MMiRV~e&~bir;fELQ9xH4c6IWKTZ4(kUA#PKy;dLY&9h>_5j$U>tk6asHzVGPy zCH21(tYdU1oZa!to=qL)rmr9~+t#jxwJUDjlO&sFcf!`aIdb3De}~^5J_mE@hnMaT zKZCfakTuDghV7a?iJCp}>b(@#zHD4{Aqx$>T|bbhA9yctmTRF{hO2TpZQm3 zw#xTEu?+5K-Naz8e1Q`Z|&Zk-0mMs^p8EXHErA4 z6Snq`4f+8Kl6E%5o6c`lUWnT+q)Xuyw<`Aq^43KX6=)xOX9i%moq18m)a<(KQLazj zQ_nGXVn2&l@22)=<@6EU3&FC4tE;&Fuk1-=j%%vIDM{`SHnMvvg%kK6gCW|r%LHDq z&tb;u&+ct3bA?m%xeq7*%jy4g`on{{LyGQnRx^Wisv`Rm0bNU*^IuL?;s(AHpu>%s 
zqjj@Xh1!QTL#a~I3sI%|o22t=)STa|xuI15SmrJ;HZyfssd)-MF~f0Pcf)Y=R#qE~ z86qZ)S(E5wL(FhR7qmpI8YPx#o*M?_ts-W)n%N51c(7fZNd@^Z$TPc?AhYS66EYJ& z+m0<_(+u|`&0-;o+95T~5H?%dq+?&U4A?%!?tti6)*(*>YRgZB;hP3^>Z0GT@qPRvyNMJ1K&D?*8zsPZs5G zBO}T|vy6n|=GoOF`Cky}Ujn2`DQ*bQY-f>9S}BLBnv`ugFFQT;M3!*HRDF^`J!JK% z+-20^?;tT{>kS)GZp{5O<8~x$9m#4Z)%rDyq zHZOla{Gk2F%C%MhpF8ip_~XV~PbKZGar*$~po-cz?W^{!7S1kcY zO}c`!oz*FW&a8>6@+=CH`KY1UTt!Dqcbluix+N&-=oT1!%2&MWPI6!PCZvBn#sTbg zU>GQ}UepKi_{$AG#>e!<2A_6jA&RJP7%@PhK@@&5TsFYbQ#I!|N)Ycv%n;VSJ{l_# zOUS~E9OafvjX*3Vn!-V*6@<8uV}#B;FOcK+De-8-hMDAauj3qJu{5t{riE-NVwse1 z$@}+t?)7i+yW|*0eoNml%^EJ#k>3{7NNKDTwTaU@H%u`T&IF;9FbCpTZLy6Rwx;M~ zrd7l1^+=@^$9&BinP#g)P@9iFEv z3E4yRH1DOHTIlQoimL#ugrKJ?{6Wml{+v_pVcMjmnNs&`ouR!Wj} zEgL<5GW5Nnt+u_1y1npd?^livfO6*)#Drb-*@V#uG1eiAye2fuj@|tb3Ogr-N)~DAAf8#Hd%h&eLU`H|Lew~ z6;sk)v)1}rZ0-7{d%O2|qWAdymg66na%EMI8i(ivZB?@SIDXCbA8`oy*v8rHal5c( zZQYo-Z|!7q?a&?b_VDS%@aYdL62oWXL+7CEo!9j}>Klvqjsrj1`KWVnyYon*^9at( z-knIBd3qM3gcJlid(@JFh(26`*86cu$7E0bp-`Fv*>t`EWVh)&YonscM0R9% z?TQ`NoZjW=+RfZ>MRvTWxtw^5>>*Ec%Hg9@#cZ*T3@h&;kwl`L;i(~vwh0SO4XLuQ z_LveGL!LM@lSjQs9g}Pt%Uu5Pjfc*bZD&`)*%h~SkqnE|DRHa7eESm4zPPRLp}k?- z-j=Yp#Z7Hb9;TqxkX3d)kliXFEH`5a}6L{kVzWWU(9_ay8+aZ}HeHAf;ftGT}f zvRlpJ;)gCs8<*fI{IE*uX7%h^OfObOT~>?Ao^{KfW2UErG}%FR6qhhGDFoM0bhseX z&tvtvp}V2auAIdVOvE76<1GM`oW5yg#X~r(cf9X&sQpPvg2geku%s+gI1`BFh0Kq% zV4cr@Ax%)8lnM_jckpOp-h~4v1aY=dHPga?>60({ab-zHDiFxA1ikFXVME_My$*z} z9g8~23-nwY9aD^u%@Qxh;NT(lF312$SsWCe#!V<#m-|V^S8SvUc&n(NWfn`0a_1`( zTM!4|aOyFn-LGS}0s{dl03)dK>;ijyF7p6VSh+MOg)=csOE~eUUJJyGJ&V?{80dpAK>^RKaUp^aIZd$Mo8(g-j!bQVd;*@ z@`vQZQ|VGB5p6-3+n~H$pbsphT-Zs#t96;e;r8ELSbc6qJ(WjML|p~_pQZdI>hTL8 zqsLdX9^aX;cS4%SWrnf}i1ZCD+x5fu>xXxYhRX7v*AK6ppsN_pl|A1qdHeF_r90gZ zx?mrB%d~F#j^&;{;W&}3AO47V)1{ob>9GTGzH#IBjkTG0&)5U|c-%Dp>yNDvciT{3K_#YjJjfX%P!;a{wf{JS351jWWH~6s}R*0mEtP+7?o#KT7C*X zMl~{OMq|#hFlq^-7FjD8wUkkdtQ3rDVpN**3O+_PGinjNm{Bc^TJ$^)%4=oRBByXz zSR11jIjh2`WsC~pyzpbxaz>@qq2OawJEImoi-KG^7`4b*4i>h8QH6qN;bhcGMs3@L 
z>SWZ8U8pWb?b?M}#i%{IP^%dg6LR6la#h2q{e?6&&$WzNWVfA#ty|3;VaB5>^;7b{ zK{U~d{6X0h9FIEtg~F}l@u;gGMu~zOyW`QSemt~Ncvp8kT8`;1d%k--YJ!_?>}If+ z7{>Nn$D^hFxNsO83JbBM|JSxDS9wprEs_%HT4&^R-XP4C+^qhig zwk&Vio3l7B?{?SXCB=|GsF)l-b+hW9deQL9=m*Lj!)t`a2rLsQux--gd1!6f>y6gr z_ZmD$qY!8lGQ0w?jMrdl$_92gfOvUJntnI#I#OOyF>QyllN}<*Vsdq6oU~76ohL z&}c1t6D@n=&7wnBJ zL!I_GnV#JzpHh@2#i3k;JyZCcP$}Mt49%ijXE%;d(~$T)0k)A#LlJNP2134w{3Y_e zL|~SHNI)VmN8mDnD+DG9u>E0Xhb5_8ewsjlz&wE(v4dYGZDh8s-s-y@Q{-L-2n+35Y^YYkIIck6E1F)+m|Pn!(Tg_)|VIs zxEoq>US&(`4aPwxJmz>Nn_KB|Hff7*#jRW;<_lc1pcmB^@m!nc)7nArvl;Q08~Gds zPkj>}c&ox&aOBYp4_6Hyh}*ZtS7V|w?x$y?@Lt^4=r)J3@sYDSL)05v!HwtS)EDNq z_=Y4ujOz!A$IA4o`SuVlT3CcFzMDdiJnHVlb8wSu&eby;e8SbS#rIL7c8aEH`RoeM za-lO0KWb`$)3740;uX1;B!4nlT|*vQd<{k9euUb{JR#RYRdim5IzrMHSEkoSSMhRP z7ct@juZ42RqDJoeFd4`Bnl;xJ-;v}GBzp(dhwBED{3+yuUazZK39rqpM)9s4#jx*D zcYmC3-ayv+ll*w{%msS&?##yQ=Gf+e_s8BDySwK>-_dy2(M031dmV|!(_1{Azl7pa_@oVi}5P==47I3cng{5_aOgNG!(gW#e>IHae>9r zy~Phw#v2|{;qaW^+R4==5Nb5S(4(GyHB|e`XI9HMTnR@fN=Lb_MRlPP&)kZvoV>lX zHi|nvtS0rwJ;>DP!8l*L*0C1ZIJv&GK7@?aQ$`*Qj8K^(8{+!!Z1E$Hj1`Y+>#(7n zboVEF2a2Buip(xx8?d6)Wx-gW?mgxb84nlm`uqPUZ?~EoI4y1>e=P=jWyV<@O+r0LE z?48(M`N7bMc<+gXaPpp*5YDDY7}FlEx;wp>p`%>W=p8Z9i0kXjy`QTeyu&BzM$-qF z`yg_GE-kIqW2A3HHc#HE+B}=+K9F!7xZ9C%9Zer%Y=(uMruCy~Bi!g6P1HNiriqF%X(`_^6Slw8SiTucbNIQ(@6b999BDr^J^EA4AiL0 zMk6lmsVii%u?6ebDO!@$Rp9~Oue29M=5ALq^%oTT6hO+F`OgoeFq2YJyo8&+GEW>v z;xNzQ&7%n3fekZBB2@xoHMWvu5`APQ4*eyuxfay_24OHaM%tN8S;-VO2z3reFl7t* zI5nS~cbJ$wlWJuYm;r-fdR_}>Hoc1xe;|yb#(0o~{>ud>*`!>I53h-aaoH75KQShm z&MPRb@GZiaIsOj`qeBJC3$#qudLGVH$uv{{n0Ujgx}XhGrZMHeR5&XC3jDBxa^ar< z;9!tW_fZL_8~O!j{cFztGp^}xxaOa6oqxl%{(>w28Q1%_T>H*lyYq<=Eke1DlOAI>K%7MomelSS&*1y-E(CP4820;uwy)~_PoYez=tT*er z;|+W8(66prr2>0gTcUHW@sBzDJg$7kq_ZUX+9YrN#?;NJWM%aubJ^cn$@lS`qf9rD NNBEy+_^@8_e*oXVqksSa literal 0 HcmV?d00001 diff --git a/__pycache__/scalar_type.cpython-312.pyc b/__pycache__/scalar_type.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..c881d72c1f5b9817396a956b31ce6a8c72802be2 GIT binary patch literal 14217 zcmcgzX>1%vcJ7{w&2z{Z-j|vpC2_<{Jakx=Dan#7$&sjwYuWTVqv3Ru9BB@^duWSe zQf9nPmh=XY6d=$D-i=3z5mV6uD3*#zNTJ|`w#j^Jys*L_9w{P;v`Pu z{oIKD=0|vZoBXDLdBn_9nb~g%SVycp=i^3fChj;VSw7?>t8Bez*71xKm^j%i*?w4X zkJn2WlmMv0dnUc)p8n!S1Xfpsy5dybNZ~Z^DtST>LoQw`lssXN*Y8nPd{~YJrvhH1 zF?$(ZaBH>ryv4~J4S2*P@gru*G-8p=Bi3n)%c>O&d4lKVNkyR%JfT)kC^zJ?X@c7w z^aNzLTPt$A10iY3Px%tJ`|T-@->9(+of$f&2`7&oJLcZIdq8s>dwuw*`~2ad#}GG>yBKD3M!ONDX?^2NxPB5#pPA_W|Wk`k1Ztt%--iCr$gyfp_o zJ_!zO%09HD`NknTjZ%A7Y2`YM<-l0AuA~Ab)hMw6n*;e8Bkx4M*2q^PUuWd2kgqrL z)yOv(`5NRKjeITgO~^L`XC3lejC?)vBJwoy2IO1htx}_G!A`MAO+*E$P~ygHQghb$ zTQ`B(k_FQS7!z<{Ee#A}R_(T|+7{U+39`VTilnVsP}>33%0||fRk~xk$+b-@y1=+{ z7)Iw*;F z%j2@b@DQu3ohT9~JxbW;odW-hzA!^NDvML9EQ#Sz7JhYVv}<%aEQ=%sMAn=kYF9%6 zIXpoT{hy zNBF~}#-{w@SV$2eP=aA!geJ+*x69q*-J%)>nLLUl9zQ#LP8=8oJg_rU~-Z$xskO(u4p_P*pve zjqu5mW+B?WqB-4TzMwBG>jYEXp`d?S6M~+gn|MjHDKfeadQ;MzWPk<#X8f-WFhMrM zU8d5UN8+#4Lu%Sl3#i;jGEuEs`zq8HAX17h3oIp2KIVR4O65P`B}@L^A{dWki9nUk z@_-czTm~ zy}-E&hMo)n`U$DTNcRy>kVHX7b_G0Fdjeo?4{bFmG(~cNb}17g3Q%wgt)}n)fYDW; zR8v6}60N}PR>O+ht$?|7jjfbXj_3({!lKIlF|7pX-RbSXzS7&IG?jDH?$9dNeY*9b zXih?quF(rsVmT7XqNr3LtJSzu64%WnE_%G`hQy6*&4}#oO<;}#fw~P_ZjM{AJFm|! 
z%*ITQ>|0~u&t~t;J}^Dnu|LU~3X4|C4m=qC>9>FK?Z5lO_<>XLlGAHul*aA*lLA*) zA07VWt=n%UYIiPr5;Z;ZB`d}D>n|<5^!?+@je8P}dma_y?`bpO>_WtXEe?IVN|HTz&&8?Siz7*?w5L1FtIazX-%2#^ zkH2&-(R?m0G+-_jO)*DI`YdoKkZ9>oln=yTI2Av8KJmi&g#AKXxWFv0bpnbw1gycW z%^^OrkC4k+SRa2o}jS!>26d)LSyw0Sg%2GV~i$i-iAk z%)E73l-Nv+zR5Ps=DQ>-b7XJQdR$a(U__9m z(T!B4u`sH}k$E0fm2yx$O;O}J@|bDk7#wE$`*TWOGr zNsIqXBPQ0EAY&`_n3(i|*HYnAbCyHtMsuxDg%tte5mvQAfatoG zNvOL#TV9n|(U1`WtuYCPh%eA;KPBR4}LND_{ar)YnJaaHsb|s#;WRQtDW9Mehm1ztNkV@{EXFRE5fJN2Ws5J)bumd{;UZ~h9`9~YYwN^ zI^QZv+4Xr;J5-xFwJ%d|$*S+qXOCK`MtvStEZLaxLR?mmnbUJ{IVE8eY(h#&7DHl8 zTU~`ik-ZnQjb(V)Qvn&KF>KZKR=jSkdK8%%iL&I+rl`Ov*esmlqJ%cDvI|M1Wg~;9 zR4w>>&w$(D@dAb9hSE<72`yzOB^{J>QqqS6@=U}_vlE{a*QZou0ckt%u~q;*&LkP_ z^^A5jINuk{!}%BmVAa1xvVrr7)gL=s4W>S6Fm)@Kddv3tGpp70%hi2}>b|Awf%)Sr z)lD&bqPl1P_?ON)rm8AGedn|H?!0$@;6csOj>AjNmml^foX6rN$5v|^Ze6>1?QzYH z#ep9U-W_~!{!zz4qmzE4lY`Ji<(1d3EnItS-vXZ>O6=P0YxnuYmfl2NU%aM2Uf#c2 zQ@e2|(af3LJb|6`UDJ2Xh~)4enelI$H_e;jg_=GTW_c4gvxyNUrTLfV3@BR8oF@+y z^_=ramed|d~u^6bC7J1A>L3npVf71MrNFn!EP!Vk>Z`zlN}9D5d6 zRAj*nd0iO!8S`<7r;dbz@5l;#O?Y7$K}!BATw<`3`0j+yJCSn1Cb6G%Zx$gR@=oEG z!f6bo(S;^e^622Y`90I2sW7?Egqkn~0GjK6>CPxY9O>vtW62C;hbkf*6ma__kzC`D zqyWq4eHLC7A)bkH8aEAGx9U02yzSNl#%_B zWx)E1E4dHm1s9%?#&R zC6R)m&dup|ZCOZh!z*@sq<1_)uN>KynP<08>OSBPAxfbRcCQEfGaylSBl+(+4x1Ib z=Z)KCP(C#s^CwmuRo{0;U;DTrb~RDgbN`h@?f!&g|NN1as)nd9QMF^?6}G3I5C^hZ zJP|3!gjs#2cX7-o`y~~t9)f#5E=H<|0mk67W5y{?*fZb~n5&>W4K;*_;*$#L1Lieo zRu(bPD%}^396ojUynFbKv&SyDPaZ`y@vI^5lp_Rvn39)~sI+FHc<7K$A(KyvlxD^S zs>~D6Um*D(%)X4Xm(QD5Djf5+|FT#M+n<(kg*7+Aw`On7#ypRO_FoFs8@}GV`Cjbs zV_~~qH+^$Dw)L^FjeRZG^uWBoU-4MzPq$pI*_o)>xri3stM;m>|mc zJYpuryaaoJyUP!OzU0U0i}MoY*b0%am}NIH)f1`AB5~U2`fGx0MKZ^&>YkJCYb5NQ zKUD7z{P@MsUtH=rn6Mv=3kNq&m!`oaHN{JxXY*__ZR}AY3VU3 z=FyUrQCVD%dAAwK(C+bi5i4RLJAx#CRP-@H1a*&Ai0io0!Un#>evDb?^1uLJM;_1B z&k6c8B^1_Q=UkG&WlU0!%w~InvA`GUNd-SeaUvX^RQLDLX$A|thLrIhDdbgq z5X(durDt3Y%8Dl}OFer1upCrFN_W~->7ED&{OKNDeEtncM&Y^*m~Q_7H7e~5kbVgY(JSIZ81P5ylA>Mxc?Z^}{!c z|B;$qpkxELep51U$sVT8n!!-8DeQWH 
zn2sJ^xE2?hHjX;I#~J~?8FQ5Ag0??<#MoRpE)pGd7>5jypFtMvo(5ktg`Z1mHUR*s z8zw-7zn}>i?2&T9Fb%d8x`P zl;oWf!ucDgM1^rQQ5%q?BaNRHd{%O&WLfM>;LqL{7y33XPkM4ewA^fYeuv3nuUzGo zQl4T%0#N5X#)4-AV$RCv6$$z&)L6*x55233bWAMbm^dxrAW7fB`3Med)O4B4nnhVU z^3^Gumt%@)bIP(zXU~!9oZV@ZF9C(xg(OSYep;|B?n;Qemc=~@{Mq+BvR`}=}6ioB{C-w&;JMEL4C*?RP*RuncASkuYdF~nQ$^Mc|lx9Mq&$rkl z^M)3A7A?CE7W66383aC4V^;goYo36MaWVr)h9T_^W-x_GuCV8Fo?~#@H4KN9MhqN0 z3qO*U$azvDQQ(vYd8uF>J)T2BHa4T#{ucoXty^M=(+5<+7;Ouj-w^$tJAMjG=LfH9U~Eya(O#f%KUO3>kGWa3FQa!s4NWZ;tsrM_T(F3GX z2eU{*qj%=+<|#dS#H)_0L-|*S?Yw0Jmn%K#G^Rh3>mJXarIL;slp?i^k z`u<9FUG&IJ->0oOiSpg?{Z)6Ode;r}3SrnNsYyyRNezt#NzG12pCPMBCSZ@7h9bpT zbcB&YwB4xDalVP8dAEC=k&-I>Uo=sjNJyhM!uy@MJ@fIkguN>+bZs0c4T=qvWa=$0 zj8uO<;B_Md*Sv;tBjT8%gp>^(;Gt8vQW5i4%C=wP3M?Y`7+xFn_QrRHlK|=9$ z+efyQOdKXJk@UbshDhI-Jm$!b6nTQ;6ud~f3z8OLSrpP~@QOkvo)X$HS4Be6x~Y<= zsi3L0UJlYpyUy zq_nt9%|CO!Cyx$HL+eiZvy_l46SQG`Gt6iP^L!Gs$P%=97^al<>?W&}?8n+H&RJJe zOLQf9tb5h9(Uy;9V!~qYUzWs$&bYnvmt_?<+M_38(jvI2I}e#P$HrSe-gI;hh{?tUJ9RmdcXm zGkk_!-r*H{#+;^84V?6taMFWu)AK z)8sh7wW8PP&L2}%C}O+sD$2916|KNHiCL$zjIsfuU?*B2Kv7)xxMg>uc=wl%#_0Qt z*W%rS@xh^Z)0rj5*?7s>FYT?dp@h9ZF7z|K=4w|8(N2NN;uLal=q3`vZcxh14zs&) zW`a9T>}H^fJIt2RP{^-aU9`dqV~BdnZKuRdNjD`dY`Gsft#E8A=*8iNU)Ahh4-9h2 zJ*k8yaqczEG?XT!Kj~;zAsMQip@bQ7>@fFR^vO)RHz@ZeCCvUCqa3p~e3bJe(Tdr{ zMg7D|d7FwQN@zuucPV*~5<4a2eKG!2Dk(<^UCL5V!)N(p?y057DkN*cUf(&LG?|ev zGV&JWOSsaC@0?n(^sW|_e&=M;WJOsS*RnlssfkuDS+*rjCY0btlek>lo~Uh~J25}L zWN9}LHCu=IMdunv@8lugWIe{mzO}~DJ1G=cJEN^J>+Oy;4%wu`VXaGjBb#jJZ1!bq zbHdu3G&NYAH;#P8;k9PwY+G1CazAG)!yd67;-mdvarCB2qbO;!T03rBi0;1m<{F1= zvMl%8W^ITLtZ{fH%T3m;G34<|3P_?y)A{CT8|9m+yi?D2CI#NwPkFqkpV#^5Fm>`e zpDf#It%+j7cqLEpytO;pN?_f|BHmg{6?lPuwu+?5Vs%Dw{{yc@^P+UOh-L53d_^TR z5gh~z@Qr~5t1|}7c-=q3UJp)vMPCM1GYX&O(x zTrtuhJ4PC(O|CMnBunF|PlyACl%q(WkY*izLOPb+e`VI@D^CijifJB}o5QE8DMuga za+!pvEYc9 zjrQr8{?P$FGcdYOE2OG@qrH9maBsor63U&fNA>Q>RZ1VybSODa8h!`~{eGwF`oV>R z(cY-ERNRW6@|09xKeKQqdN>+hDruWL_T2i)=H<$5iOOv;&tk>m(fd1=`%WeLPAyfQ 
zo;$tbXk2#eOgMHf_AYvs9J}WJ;FncfVtlMOCNG}5Z@E9dyyt9U&)KD_b8|y0&L;gj z-{RrLx0jr~bEj4ab7!Km6R@VG%Ko|2(0(X$wk4cxv2%<3;=uiyA2)v9xa1t1JGDyf z^VZ;=#ip%LXRo^+S_nlwF?eRJbH`W8YM=dGUVqDe(~ep6FO~0@I|&Tsjrj3XS;HS4 z*QGKM*j1#wPsHAdL@Q4JKulH@;$6)m<5yyeg&@dc(~V(fGO#6DZ%b&x0sS{$gUTlW zCNohzha_p{d45gc_=MXQYaErZITuI`ch#d-}n`Wgbn}y0rQDx&Hw-a literal 0 HcmV?d00001 diff --git a/__pycache__/scripts.cpython-312.pyc b/__pycache__/scripts.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b64dd55ace37b0e35b06f60d761500d86886e39 GIT binary patch literal 611 zcmZ`$J&W5w5S`VREj#y7qzM;-AWc-*f?Qz-LI`OC;>zGgDAwXzXSEWm_1)=nDxJX&gjCs)Eq6Dd(af8DZ{D)s4-Pt@gIizNUjpz8n(}XlE{YSK=5T9uK8GK}l|kIsWMi7PvuT5Uc=J}`Q-*%4dUUV#tzcW9tGe}}t#4@| z(vBh#r!LP^=Jr!UhinQ{6`9_bceyLhbA$|QvRX#*MF!PB4a!>{jtxKHNJ Kx?_{$n*ATG$&}Lo literal 0 HcmV?d00001 diff --git a/__pycache__/sequence.cpython-312.pyc b/__pycache__/sequence.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b3f4026b48fdf7ab66cb4c1297aaa38f34c582a3 GIT binary patch literal 5252 zcmb6dTTC3+_0DTqc3IYd9~e`64EV9JS?mVeAz9PV*hwvHoW@S-teQ@SnTr`1W;b_c zjd5{{RJSo|B!l_@tEqy28e35t=d0;g+rOs$=#r%}nntOss8aJ&txKgQ{b|pcnO#`C zB zD>x_}4OSzt8hvQBZbD4dUDF>aUO+*)0A zRF9b8-PrjTpFOEO^_7!P^`9De#ufUki2>kg=zrmH$*{9IC!2Fz&iwelFtk7^ z??(tGD?bBdj!=^3z#RPZV1%?lMJnlh3RXN!@G)JyQUer8m)6Zg>*h5$*F`j3StcCN zYSz$d16p_uEfUab*U+L!LsPl+&!MSo=L~h2q1kED=1eti6bx zrIsTb{_Q-*Rrw*qGHkfehB}n2;a|pQUKKB0jIEciIJ3-EGO~g0?!6PR`TYi3TTa>Z2+%7bP` z8||_uEKOE5ma#u)rwkjV%swb<*<9zvY)iEWUFIY6U~nl|B7zi+R$<1kf{>1F*g1B#a-+{cibzWK!Z`KvW18b*C;sLg!bbduGfUNF~mHY5h ztrpPGNWIUp77p>-VM@b6rKr^fuRerz5%YB|(KqQl){oDU)5?uCh4j!hDIL*k zKq`v64e?M|#dIA~^x6$}67>l1#PsMjlCIb50LJwg-2}4rKyT0`Dghn`ym3NEG`LNt zEQhi&s$;b{Pc1vcY_*9azb6dSj~oZAY&7}rpy97Va(TICg!rPVJ1uZ{6KMT(82dDnIEwYuf@ zqu0ehg=H@aTw#<>XhFLMeCV)&*8c!xjyPo4uY;yn`Ktn0S@3oF0smS|B${wl zxXQm3-02*d0Ddl@juX}nTP8wo#IV8992I&|R{$^c9>JAt(*Q!PqIj(*=1aq&V|GGh z9U#Z{0&uxem%C)6Mc}D|w~8tXWJctL=h{{MxylUvAZiZ*m?9rUx6Ej-Ke`m%wcNCA 
zE;XB45Eh#{@^u{_#J0@~uRp&OYsZD&+1__I|GM>8t@nER7km1bdd@7h4J`JYd1v$8 zhP!N^~9C1?S2w8Dka z=R6l~0|Z<~y)IxrZ;{yd|U8NQ0*ScRs@a_jEv z;*GlLx|wtDhqrq^td(69IM78sjgJBX8dueK1Uc4?k}I7H+#m?XgTMwi!WXvj9RE!j zlCNy9_LiPcoJaO806^h*(~ZgL$$RnLi}BrsczZtF{;<|%JLA3;K}SZ)K?h60Sao3E zWqDUHspaR2BfzspQx~c`tbyn4sHSbO_%#&53J+cvZ=C5yZ!5&*d?`yv8OXY`leI(Hqb zDSs_o{rnTJPLm+-vaO>zMpWOtG(0>S%+KAnAM(cTBvphKSxdT z8?Tje=8w5Kjw7!f6(M*1To~tk2M}BdzH_x=* z`DLUwg&$Ctv#j?@XLa|WuP90~1#L9c&E9009SfPA1n|k<$tP1J-+1A@)Nj-u9+`V$ z_KA7tjmevnxBG5>`>yoM*gLUz)x|?k-fQZ6FZHQ+{C6Hd&Lt$aA9Xr_0IQxWLLstU z0s0Q8gQ%*fw2cn)^2@%qAw@xTuC#2dY8u$>0RW$(CvTtMv#|Niows%_#ScN_xEzh& zNKL2iMOzo6t%c~$g*|U}-s-&9*1g!)U1&S{e)K3Xf&VWxUie-2A5*_i6&f!T;urGa z3!f3OZEzU;4+V;QwaoYvqVqo$&OtpNs9LG_2W`c1mrJlO!d7g3{6eK?G|M)w^!}=f z7Y{N{L+5S%ZS#(K`y1~b&c~1E!^b`SeOGB>*mN^wWm^$!1K^6fp*bvpb4=xK$nOUk zmT`ve&h{X91VI-9bbl`kvc~{(Lz%%5_+JbAGLlasXhGmL+SstLoh5*R_AdZn%P&F8 zUr*{AXWD0SbCa`^`NsCeSVuwZSl<4~?emM<4;RG4#R%y+{-7oz9lPHE1#$k!8>ySA z{MLPo4f~4{P>Uh5v+M5W#V;HwB2zI;BK0e^h&7ASv)sJB@chkht`NkFb)3{TZx;#t z+&=Pl>P~6}(SL?GsRP08Vn~u+#EL#3A6tx*< zLO0}0Wa0aeVOnGOPd49{N&NmvUOJo{_r5Qa75$_CCC75mbTVbkxNKOSx!vX$PML{eL67|3dbBMC$%3Z2C}q_I~q@ytw85j-7eZe80IRPqu)C`RKp-7Vcn? b09X-VTJ)wVZz0L`76|}v;Q>-FQr4}nlA%_q}*@y`qatUkpnGTuV9cCxRy`^XW z0P`nUJo{I8iFj$}6zHip#e?9<*+hMZ$9wNH-|yqijctEGDyVTzt}#OY^5!k{*W&S5 z6&EN(A*Lw8Z5(NB4I{vzR#v2~YX~&ZuarKIwxNhVCFYcv@5BNOI+&+3-CSl=%>TKh zqRF`fB7g@}7J48$0q>=1sOne^1xQ=)AWplSFhP>IVo4LC69{FYOM^5erTtTlzY5 z{w}PH{JVv)u)(jR00LD%NBVe!F2rSfRy&Rrea6Zg}|xh#&CzudZ;zihs!%w7Ni C4v|p+ literal 0 HcmV?d00001 diff --git a/__pycache__/tracing.cpython-312.pyc b/__pycache__/tracing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..647c3095c7f384676b6eec379708d1ee37465a54 GIT binary patch literal 6078 zcma(#ZEPFIm9xtgMUkSYPfM~anUQ2EbYnV=9ovmR5?ZEhH4>$eRL)t{-LAPKdF|zr zo?Y1zAs0%{huSy;F|N3ai=u$?qd;=G1AOhD{B(PN4!A#5{iC>X zGtjUWI0lp^l+RN47h|k&zv$oY{jc{B$;5ur! 
z-5l2mxUL$mo8xu>uDgcod4t?uxkeUzZ;)$3Y~&9(Zz?7@jgxAzsGD;DhICW66{9dW zM;Uwu*u1F}%p7$>HdAx7s4{BWPLO|CwrbPUPG7;MhN9<-1!gOhu>w;#`GT6ejKaqX zrcD=Y00*bC$q6NSZh9)6O{SHp?8LN^O(({Z8D%;>m7N-!ngFhrEY8HHS>dX#@l~{+ zRPEfwOi?ufnWL6fU@@N)K*X!MsX2akgwtVB+nt*!#m(XUoLpT-7M23r1wlo1PPGdx zo7<}Xs!@}TX28BfD3Jw?$l{_96Im0g>wTD5QA{;Y6~zfEO1_}Y8wfWm%9VN5a4GMT z4V3zq1N-fKaeq-S?z2@kN9}zJ^M}88?F zyc+=#8x7FiAeBHf>jsG9zwL@0P@F|kO$pm(`h|I$S}E3!Vs>E11@<7+=XEgz@Upl> z#DCk_SQwa}EFLMG{2T(C;t~I?#kV|FJW?2T`w`d_k0{5_s6EfjuT0|u;s$cOLu}~X zmLcy8cq7^r-BPyLHPz}g=CpYX>Qz-Y)C&e>gCOGoJXQd@W%4kzt1Ru}dUSeq)3Vuo z9xe|YvS&1CD^6#@Hj1v*S_(C_VnH|UT-C|&RiOp&)jj|KR_oa)@f?NKZX=yHO|T^o z{yF)D=(*{)1lx1|wQ<))8*Q4mCBfso?pqQsg*^P`+s4!4ZOP~HdT)W)ks7r&r$9hb zn}^?gFA;Ctb!kb;keHY%Elm}vnWYBJQ=2VDbt`HX>}XB7@#u_+N{-rvD7q+JHS`jV zs!_Z*l*OY-)0$^Ax?M^UT#!0PtthxH7zPf@FsK%fPBDE>H&r9b-PKFMSQ9@yC-}N* z%+n{q7J8&pV2Z;oYpdWS7SF0p>6~mI>W5`P@&M`yYe`9=7zXi()wXN%UxW z_p$W`F??dfCq;tmL~06t9wa-4*ShvBr#|W2QyzI~t@G&eWJPNKSlY8H?RhYA;t^C1@EuoBb%nDH7p4~ zCz}5kH3NN15WxtS$iFpuCI{vgNIB9RE9Mw36K;DeMj~{R$?Ex*!WV5D{5C7tg$whu zrM?-kt|C8sEsD%hPRbRG=x8Z2iq@Jdn6vsEn4ks%1KvH31st(pIgQl3s~DH5rf0p8u9(asBmUF`UGF?7Ju;;BA0Qw9tKc z2FFI>gLmHtz?y*$oLQvp?b5B%+dsbb;|Bw;+#gwsj4z+9G)C?m`lzw@PomfqT=$b; z`yJ(@reXg1d4NQE%YBFLE!?|wKT__UUTb-+9C+<_p}v)K4?^*>6n8yi<3*}Ml()7S z7A^;$jn|=jlE-7am&k%(fm!I}I@$Doo=1F=v2A*t3|S;B3Zg+9=(JfYbZ|%usBHUv zfgOQw3oX5R1-q&w_fjix)|k;S%-v*U1!F}WHTFDc+*xVw{il6*(>EI*MxI-X#6ON4 zT8$jKcWo{5ayjtw<4DI>uMu7XzT49PY`sRO$lF3r)JaY}555;K#uB*-3%eaaXi30G z_fzq9L;{ZMLI!$)L)WUi6Pz$luavXN#HnOD133)t%n>eHX!)LlsoP!kK-eq5asi&| z?j8+2zZOcod;G3?FY#gbuO@CLZeF`{b{Ymgf?zZ`&~qDc!ty5A&+w4mIqh2*f-EVpQ3 zIij`5txyWIX{~a*)+Tpo?Q-X0N37Fn*`Di^2Ii=#sCsliZx;00GsBEl&O$>&{9`zgjFn< z7L7yNNR1Li7QQ}jh_01d>riG*uDc0yKCf7~BuL7w(yl7E0hzd&Ta4Q}V{Do`q~*angfNFM!uFwH)ko7FFBfubup-n3G2kJ3p!`uOfsb;#+8{&;!IK* zo0^=SNM^^UQc8B})nqE;?A%6&YGrad>(P3vv~=>dnPeubOeW5Gq|Q20c4}Igt_kx7 zWRsKA$#fz+lTJEawGo-=sZ=JZOirCjPB?8fv^&T6DW|m#m8!{gH<6l|R5G}v6ckv_ 
zt}6FLA`6H&lt`p?V6dBo$HKbd27$mga#|=$gOz8Fm!*LUq+n8irSH3Csi%tBb)YQuKJNKW zS?a2E$I4P?6+Z-CGg^sFmZgzO@4m9MqcS>PmWCgH=e4r*EQozB4Pu|I@XyNLsj@U$ z**CK)?d2haFJoKOlwdhLjvNHwC)?uYO)?8f)s29SexJ8fwJKmsVkD-v2}EI7&}?Al z@Xx}lL;#^nsU-SINH!?+6099h4xX{=4s*z{4un8dT_F{CV~=40YovN{dF9 z-K4hZ7+h7zYS@WV48y4k*?Bd(R^w0nm{G{7hIJxdV|@!GSXjA!`Frx^GAXwlc=!B@ zaP!KYGq;!CJ^yD574Y{hZBVN_KGj@gC*YeM#X%T+e&o))8*!2up{12+(B4R^#99tU7+;J$>|x&|3;H3SNvD=c}Ar)iGrSE&|Nqtyry zy{6k2-I@k7O3X5%P_}an6pol^YPyAmkOrIdyZ>pNM)0;Kkn60~*#&g2RIE9@af-+M zLs;NHz++*}vP?daf^R{kS@!5GZr*M#A{_h?M_q{*B&u6g1O3U#A}*a!nC0f(-CyPADYjec4LpTf&Chv_ z=X7jma))Da%>O~SCE#;U;K8(>|8xQcy1BWx51xO9#`!NaUVS=ETq^!TbAk{~^D6vW zTgdVMG>&wkP%zxY70W?4U&maOr);i|LThGt!u2*0IOhJbI>E4lM2obpK)>!21Oak( zA^3<4{XZFcMD{*z8(3*Bx9(dG|E{U?&bhUwfo1>W&fYRYb3rd zMuqSr>BT>aVuSyWe$sIGk(BsfX?#^0=cx4n>Fo#Kw;mUS{<{Y^2)xz bool: + from importlib.util import find_spec + + return find_spec("aiter") is not None + + +# `find_spec` is not torch.compile compatible. +# In cases where aiter availability might have +# been checked in forward passes that are torch compiled. +# we keep this global outside to not cause torch compile breaks. +IS_AITER_FOUND = is_aiter_found() + + +def if_aiter_supported(func: Callable) -> Callable: + """Decorator that only executes the function if + ROCm AITER package is supported on gfx9 archs. + """ + + @functools.wraps(func) + def wrapper(*args, **kwargs): + # checks the platform, device arch and aiter library existence. 
+ + if current_platform.is_rocm() and IS_AITER_FOUND: + from vllm.platforms.rocm import on_gfx9 + + if on_gfx9(): + return func(*args, **kwargs) + + return None + + return wrapper + + +def _rocm_aiter_group_fp8_quant_impl( + x: torch.Tensor, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + assert x.shape[-1] % group_size == 0, "Input shape must be divisible by group size" + from aiter import QuantType, dtypes, get_hip_quant + + aiter_per1x128_quant = get_hip_quant(QuantType.per_1x128) + return aiter_per1x128_quant(x.contiguous(), quant_dtype=dtypes.fp8) + + +def _rocm_aiter_group_fp8_quant_fake( + x: torch.Tensor, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + from aiter import dtypes + + M, N = x.shape + x_fp8 = torch.empty((M, N), dtype=dtypes.fp8, device=x.device) + out_bs = torch.empty( + ( + M, + (N + group_size - 1) // group_size, + ), + dtype=torch.float32, + device=x.device, + ) + return x_fp8, out_bs + + +def _rocm_aiter_fused_moe_impl( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, + expert_mask: torch.Tensor | None = None, + activation_method: int = 0, + quant_method: int = 0, + doweight_stage1: bool = False, + w1_scale: torch.Tensor | None = None, + w2_scale: torch.Tensor | None = None, + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, +) -> torch.Tensor: + from aiter import ActivationType, QuantType + from aiter.fused_moe import fused_moe + + activation = ActivationType(activation_method) + quant_type = QuantType(quant_method) + + return fused_moe( + hidden_states, + w1, + w2, + topk_weight, + topk_ids, + expert_mask, + activation, + quant_type, + doweight_stage1, + w1_scale, + w2_scale, + a1_scale, + a2_scale, + ) + + +def _rocm_aiter_fused_moe_fake( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, + expert_mask: torch.Tensor | None = None, 
+ activation_method: int = 0, + quant_method: int = 0, + doweight_stage1: bool = False, + w1_scale: torch.Tensor | None = None, + w2_scale: torch.Tensor | None = None, + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, +) -> torch.Tensor: + return torch.empty_like(hidden_states) + + +def _rocm_aiter_asm_moe_tkw1_impl( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + fc1_scale: torch.Tensor | None = None, + fc2_scale: torch.Tensor | None = None, + fc1_smooth_scale: torch.Tensor | None = None, + fc2_smooth_scale: torch.Tensor | None = None, + a16: bool = False, + per_tensor_quant_scale: torch.Tensor | None = None, + expert_mask: torch.Tensor | None = None, + activation_method: int = 0, +) -> torch.Tensor: + from aiter import ActivationType + from aiter.fused_moe_bf16_asm import asm_moe_tkw1 + + activation = ActivationType(activation_method) + + return asm_moe_tkw1( + hidden_states, + w1, + w2, + topk_weights, + topk_ids, + fc1_scale=fc1_scale, + fc2_scale=fc2_scale, + fc1_smooth_scale=fc1_smooth_scale, + fc2_smooth_scale=fc2_smooth_scale, + a16=a16, + per_tensor_quant_scale=per_tensor_quant_scale, + expert_mask=expert_mask, + activation=activation, + ) + + +def _rocm_aiter_asm_moe_tkw1_fake( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + fc1_scale: torch.Tensor | None = None, + fc2_scale: torch.Tensor | None = None, + fc1_smooth_scale: torch.Tensor | None = None, + fc2_smooth_scale: torch.Tensor | None = None, + a16: bool = False, + per_tensor_quant_scale: torch.Tensor | None = None, + expert_mask: torch.Tensor | None = None, + activation_method: int = 0, +) -> torch.Tensor: + return torch.empty_like(hidden_states) + + +def _rocm_aiter_topk_softmax_impl( + topk_weights: torch.Tensor, + topk_indices: torch.Tensor, + token_expert_indices: torch.Tensor, + gating_output: 
torch.Tensor, + renormalize: bool, +) -> None: + from aiter import topk_softmax + + topk_softmax( + topk_weights, topk_indices, token_expert_indices, gating_output, renormalize + ) + + +def _rocm_aiter_topk_softmax_fake( + topk_weights: torch.Tensor, + topk_indices: torch.Tensor, + token_expert_indices: torch.Tensor, + gating_output: torch.Tensor, + renormalize: bool, +) -> None: + pass + + +def _rocm_aiter_biased_grouped_topk_impl( + gating_output: torch.Tensor, + correction_bias: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_expert_group: int, + topk_group: int, + need_renorm: bool, + routed_scaling_factor: float = 1.0, # mul to topk_weights +) -> None: + from aiter import biased_grouped_topk + + biased_grouped_topk( + gating_output, + correction_bias, + topk_weights, + topk_ids, + num_expert_group, + topk_group, + need_renorm, + routed_scaling_factor, + ) + + +def _rocm_aiter_biased_grouped_topk_fake( + gating_output: torch.Tensor, + correction_bias: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_expert_group: int, + topk_group: int, + need_renorm: bool, + routed_scaling_factor: float = 1.0, # mul to topk_weights +) -> None: + pass + + +def _rocm_aiter_grouped_topk_impl( + gating_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_expert_group: int, + topk_group: int, + need_renorm: bool, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, # mul to topk_weights +) -> None: + is_softmax = scoring_func == "softmax" + from aiter import grouped_topk + + grouped_topk( + gating_output, + topk_weights, + topk_ids, + num_expert_group, + topk_group, + need_renorm, + is_softmax, + routed_scaling_factor, + ) + + +def _rocm_aiter_grouped_topk_fake( + gating_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_expert_group: int, + topk_group: int, + need_renorm: bool, + scoring_func: str = "softmax", + routed_scaling_factor: float = 
1.0, # mul to topk_weights +) -> None: + pass + + +def _rocm_aiter_mla_decode_fwd_impl( + q: torch.Tensor, + kv_buffer: torch.Tensor, + o: torch.Tensor, + qo_indptr: torch.Tensor, + max_seqlen_qo: int, + kv_indptr: torch.Tensor | None = None, + kv_indices: torch.Tensor | None = None, + kv_last_page_lens: torch.Tensor | None = None, + sm_scale: float = 1.0, + logit_cap: float = 0.0, +) -> None: + from aiter.mla import mla_decode_fwd + + mla_decode_fwd( + q, + kv_buffer.view(-1, 1, 1, q.shape[-1]), + o, + qo_indptr, + kv_indptr, + kv_indices, + kv_last_page_lens, + max_seqlen_qo, + sm_scale=sm_scale, + logit_cap=logit_cap, + ) + + +def _rocm_aiter_mla_decode_fwd_fake( + q: torch.Tensor, + kv_buffer: torch.Tensor, + o: torch.Tensor, + qo_indptr: torch.Tensor, + max_seqlen_qo: int, + kv_indptr: torch.Tensor | None = None, + kv_indices: torch.Tensor | None = None, + kv_last_page_lens: torch.Tensor | None = None, + sm_scale: float = 1.0, + logit_cap: float = 0.0, +) -> None: + pass + + +def _rocm_aiter_gemm_a8w8_impl( + A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + bias: torch.Tensor | None = None, + output_dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + from aiter import gemm_a8w8_CK + + # gemm_a8w8_CK(a, b, scale_a, scale_b, bias) expects + # a to be [M, K] + # b to be [N, K] + # CutlassScaledMMLinearKernel prepare weight `w_q` in [K, N] format + return gemm_a8w8_CK(A, B, As, Bs, bias, output_dtype) + + +def _rocm_aiter_gemm_a8w8_fake( + A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + bias: torch.Tensor | None = None, + output_dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + m = A.shape[0] + n = B.shape[0] + Y = torch.empty(m, n, dtype=output_dtype, device=A.device) + return Y + + +def _rocm_aiter_gemm_a8w8_blockscale_impl( + A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + output_dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + from aiter import 
gemm_a8w8_blockscale + + return gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype) + + +def _rocm_aiter_gemm_a8w8_blockscale_fake( + A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + output_dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + m = A.shape[0] + n = B.shape[0] + Y = torch.empty(m, n, dtype=output_dtype, device=A.device) + return Y + + +def _rocm_aiter_rms_norm_impl( + x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float +) -> torch.Tensor: + from aiter import rms_norm + + if x.dim() > 2: + x_original_shape = x.shape + x = x.reshape(-1, x_original_shape[-1]) + x = rms_norm(x, weight, variance_epsilon) + return x.reshape(x_original_shape) + + return rms_norm(x, weight, variance_epsilon) + + +def _rocm_aiter_rms_norm_fake( + x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float +) -> torch.Tensor: + return torch.empty_like(x) + + +def _rocm_aiter_rmsnorm2d_fwd_with_add_impl( + x: torch.Tensor, + residual: torch.Tensor, + weight: torch.Tensor, + variance_epsilon: float, +) -> tuple[torch.Tensor, torch.Tensor]: + from aiter import rmsnorm2d_fwd_with_add + + residual_out = torch.empty_like(residual) + output = torch.empty_like(x) + rmsnorm2d_fwd_with_add( + output, # output + x, # input + residual, # residual input + residual_out, # residual output + weight, + variance_epsilon, + ) + return output, residual_out + + +def _rocm_aiter_rmsnorm2d_fwd_with_add_fake( + x: torch.Tensor, + residual: torch.Tensor, + weight: torch.Tensor, + variance_epsilon: float, +) -> tuple[torch.Tensor, torch.Tensor]: + return torch.empty_like(x), torch.empty_like(residual) + + +# Global flag to ensure ops are registered only once +_OPS_REGISTERED = False + + +class rocm_aiter_ops: + _AITER_ENABLED = envs.VLLM_ROCM_USE_AITER + _LINEAR_ENABLED = envs.VLLM_ROCM_USE_AITER_LINEAR + _RMSNORM_ENABLED = envs.VLLM_ROCM_USE_AITER_RMSNORM + _FMOE_ENABLED = envs.VLLM_ROCM_USE_AITER_MOE + _MLA_ENABLED = envs.VLLM_ROCM_USE_AITER_MLA + 
_PG_ATTN_ENABLED = envs.VLLM_ROCM_USE_AITER_PAGED_ATTN + _MHA_ENABLED = envs.VLLM_ROCM_USE_AITER_MHA + _TRITON_UNIFIED_ATTN_ENABLED = envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION + _FP8BMM_ENABLED = envs.VLLM_ROCM_USE_AITER_FP8BMM + _FP4_GEMM_DYNAMIC_QUANT_ASM = envs.VLLM_ROCM_USE_AITER_FP4_ASM_GEMM + _TRITON_ROTARY_EMBED = envs.VLLM_ROCM_USE_AITER_TRITON_ROPE + _MOE_SHARED_EXPERTS_ENABLED = envs.VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS + _TRITON_UNQUANT_GEMM = envs.VLLM_ROCM_USE_AITER_TRITON_GEMM + + @classmethod + @if_aiter_supported + def is_enabled(cls) -> bool: + """Verifies device specs and availability of aiter main env variable.""" + return cls._AITER_ENABLED + + @classmethod + @if_aiter_supported + def is_linear_enabled(cls) -> bool: + """ "Verifies device specs and availability of env variable.""" + return cls._AITER_ENABLED and cls._LINEAR_ENABLED + + @classmethod + @if_aiter_supported + def is_linear_fp8_enaled(cls) -> bool: + """ "Verifies device specs and availability of env variable.""" + return cls.is_linear_enabled() and current_platform.is_fp8_fnuz() + + @classmethod + @if_aiter_supported + def is_rmsnorm_enabled(cls) -> bool: + """ "Verifies device specs and availability of env variable.""" + return cls._AITER_ENABLED and cls._RMSNORM_ENABLED + + @classmethod + @if_aiter_supported + def is_fused_moe_enabled(cls) -> bool: + """ "Verifies device specs and availability of env variable.""" + return cls._AITER_ENABLED and cls._FMOE_ENABLED + + @classmethod + @if_aiter_supported + def is_fusion_moe_shared_experts_enabled(cls) -> bool: + return cls.is_fused_moe_enabled() and cls._MOE_SHARED_EXPERTS_ENABLED + + @classmethod + @if_aiter_supported + def is_mla_enabled(cls) -> bool: + """ "Verifies device specs and availability of env variable.""" + return cls._AITER_ENABLED and cls._MLA_ENABLED + + @classmethod + @if_aiter_supported + def is_mha_enabled(cls) -> bool: + """ "Verifies device specs and availability of env variable.""" + return 
cls._AITER_ENABLED and cls._MHA_ENABLED + + @classmethod + @if_aiter_supported + def is_pa_attn_enabled(cls) -> bool: + """ "Verifies device specs and availability of env variable.""" + return cls._AITER_ENABLED and cls._PG_ATTN_ENABLED + + @classmethod + @if_aiter_supported + def is_triton_unified_attn_enabled(cls) -> bool: + """ "Verifies device specs and availability of env variable.""" + return cls._AITER_ENABLED and cls._TRITON_UNIFIED_ATTN_ENABLED + + @classmethod + @if_aiter_supported + def is_fp8bmm_enabled(cls) -> bool: + return cls._AITER_ENABLED and cls._FP8BMM_ENABLED + + @classmethod + @if_aiter_supported + def is_asm_fp4_gemm_dynamic_quant_enabled(cls) -> bool: + return cls._AITER_ENABLED and cls._FP4_GEMM_DYNAMIC_QUANT_ASM + + @classmethod + @if_aiter_supported + def is_triton_rotary_embed_enabled(cls) -> bool: + return cls._AITER_ENABLED and cls._TRITON_ROTARY_EMBED + + @classmethod + @if_aiter_supported + def is_triton_gemm_enabled(cls) -> bool: + return cls._AITER_ENABLED and cls._TRITON_UNQUANT_GEMM + + @staticmethod + @if_aiter_supported + def register_ops_once() -> None: + global _OPS_REGISTERED + if not _OPS_REGISTERED: + tags = ( + tuple() + if is_torch_equal_or_newer("2.7.0") + else (torch.Tag.needs_fixed_stride_order,) + ) + + # register all the custom ops here + direct_register_custom_op( + op_name="rocm_aiter_group_fp8_quant", + op_func=_rocm_aiter_group_fp8_quant_impl, + mutates_args=[], + fake_impl=_rocm_aiter_group_fp8_quant_fake, + dispatch_key=current_platform.dispatch_key, + ) + + direct_register_custom_op( + op_name="rocm_aiter_asm_moe_tkw1", + op_func=_rocm_aiter_asm_moe_tkw1_impl, + mutates_args=[], + fake_impl=_rocm_aiter_asm_moe_tkw1_fake, + dispatch_key=current_platform.dispatch_key, + ) + + direct_register_custom_op( + op_name="rocm_aiter_fused_moe", + op_func=_rocm_aiter_fused_moe_impl, + mutates_args=[], + fake_impl=_rocm_aiter_fused_moe_fake, + dispatch_key=current_platform.dispatch_key, + ) + + direct_register_custom_op( 
+ op_name="rocm_aiter_topk_softmax", + op_func=_rocm_aiter_topk_softmax_impl, + mutates_args=["topk_weights", "topk_indices", "token_expert_indices"], + fake_impl=_rocm_aiter_topk_softmax_fake, + dispatch_key=current_platform.dispatch_key, + ) + + direct_register_custom_op( + op_name="rocm_aiter_biased_grouped_topk", + op_func=_rocm_aiter_biased_grouped_topk_impl, + mutates_args=["topk_weights", "topk_ids"], + fake_impl=_rocm_aiter_biased_grouped_topk_fake, + dispatch_key=current_platform.dispatch_key, + ) + + direct_register_custom_op( + op_name="rocm_aiter_grouped_topk", + op_func=_rocm_aiter_grouped_topk_impl, + mutates_args=["topk_weights", "topk_ids"], + fake_impl=_rocm_aiter_grouped_topk_fake, + dispatch_key=current_platform.dispatch_key, + ) + + direct_register_custom_op( + op_name="rocm_aiter_mla_decode_fwd", + op_func=_rocm_aiter_mla_decode_fwd_impl, + mutates_args=["o"], + fake_impl=_rocm_aiter_mla_decode_fwd_fake, + tags=tags, + ) + + direct_register_custom_op( + op_name="rocm_aiter_gemm_a8w8", + op_func=_rocm_aiter_gemm_a8w8_impl, + mutates_args=[], + fake_impl=_rocm_aiter_gemm_a8w8_fake, + dispatch_key=current_platform.dispatch_key, + ) + + direct_register_custom_op( + op_name="rocm_aiter_gemm_a8w8_blockscale", + op_func=_rocm_aiter_gemm_a8w8_blockscale_impl, + mutates_args=[], + fake_impl=_rocm_aiter_gemm_a8w8_blockscale_fake, + dispatch_key=current_platform.dispatch_key, + ) + + direct_register_custom_op( + op_name="rocm_aiter_rms_norm", + op_func=_rocm_aiter_rms_norm_impl, + mutates_args=[], + fake_impl=_rocm_aiter_rms_norm_fake, + dispatch_key=current_platform.dispatch_key, + ) + + direct_register_custom_op( + op_name="rocm_aiter_rmsnorm2d_fwd_with_add", + op_func=_rocm_aiter_rmsnorm2d_fwd_with_add_impl, + mutates_args=[], + fake_impl=_rocm_aiter_rmsnorm2d_fwd_with_add_fake, + dispatch_key=current_platform.dispatch_key, + ) + + _OPS_REGISTERED = True + + @staticmethod + def rms_norm2d_with_add( + x: torch.Tensor, + residual: torch.Tensor, + weight: 
torch.Tensor, + variance_epsilon: float, + ) -> tuple[torch.Tensor, torch.Tensor]: + return torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add( + x, residual, weight, variance_epsilon + ) + + @staticmethod + def rms_norm( + x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float + ) -> torch.Tensor: + return torch.ops.vllm.rocm_aiter_rms_norm(x, weight, variance_epsilon) + + @staticmethod + def gemm_a8w8( + A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + bias: torch.Tensor | None = None, + output_dtype: torch.dtype = torch.float16, + ) -> torch.Tensor: + return torch.ops.vllm.rocm_aiter_gemm_a8w8(A, B, As, Bs, bias, output_dtype) + + @staticmethod + def gemm_a8w8_blockscale( + A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + block_size: list[int], + output_dtype: torch.dtype = torch.float16, + ) -> torch.Tensor: + return torch.ops.vllm.rocm_aiter_gemm_a8w8_blockscale( + A, B, As, Bs, output_dtype + ) + + @staticmethod + def fused_moe( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, + expert_mask: torch.Tensor | None = None, + activation_method: int = 0, + quant_method: int = 0, + doweight_stage1: bool = False, + w1_scale: torch.Tensor | None = None, + w2_scale: torch.Tensor | None = None, + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, + ) -> torch.Tensor: + return torch.ops.vllm.rocm_aiter_fused_moe( + hidden_states, + w1, + w2, + topk_weight, + topk_ids, + expert_mask, + activation_method, + quant_method, + doweight_stage1, + w1_scale, + w2_scale, + a1_scale, + a2_scale, + ) + + @staticmethod + def asm_moe_tkw1( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + fc1_scale: torch.Tensor | None = None, + fc2_scale: torch.Tensor | None = None, + fc1_smooth_scale: torch.Tensor | None = None, + fc2_smooth_scale: 
torch.Tensor | None = None, + a16: bool = False, + per_tensor_quant_scale: torch.Tensor | None = None, + expert_mask: torch.Tensor | None = None, + activation_method: int = 0, + ) -> torch.Tensor: + return torch.ops.vllm.rocm_aiter_asm_moe_tkw1( + hidden_states, + w1, + w2, + topk_weights, + topk_ids, + fc1_scale, + fc2_scale, + fc1_smooth_scale, + fc2_smooth_scale, + a16, + per_tensor_quant_scale, + expert_mask, + activation_method, + ) + + @staticmethod + def topk_softmax( + topk_weights: torch.Tensor, + topk_indices: torch.Tensor, + token_expert_indices: torch.Tensor, + gating_output: torch.Tensor, + renormalize: bool, + ) -> tuple[torch.Tensor, ...]: + torch.ops.vllm.rocm_aiter_topk_softmax( + topk_weights, topk_indices, token_expert_indices, gating_output, renormalize + ) + return topk_weights, topk_indices + + @staticmethod + def biased_grouped_topk( + gating_output: torch.Tensor, + correction_bias: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_expert_group: int, + topk_group: int, + need_renorm: bool, + routed_scaling_factor: float = 1.0, + ) -> None: + torch.ops.vllm.rocm_aiter_biased_grouped_topk( + gating_output, + correction_bias, + topk_weights, + topk_ids, + num_expert_group, + topk_group, + need_renorm, + routed_scaling_factor, + ) + + @staticmethod + def grouped_topk( + gating_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_expert_group: int, + topk_group: int, + need_renorm: bool, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + ) -> None: + torch.ops.vllm.rocm_aiter_grouped_topk( + gating_output, + topk_weights, + topk_ids, + num_expert_group, + topk_group, + need_renorm, + scoring_func, + routed_scaling_factor, + ) + + @staticmethod + def mla_decode_fwd( + q: torch.Tensor, + kv_buffer: torch.Tensor, + o: torch.Tensor, + sm_scale: float, + qo_indptr: torch.Tensor, + max_seqlen_qo: int, + kv_indptr: torch.Tensor | None = None, + kv_indices: torch.Tensor | None 
= None, + kv_last_page_lens: torch.Tensor | None = None, + logit_cap: float = 0.0, + ): + torch.ops.vllm.rocm_aiter_mla_decode_fwd( + q, + kv_buffer.view(-1, 1, 1, q.shape[-1]), + o, + qo_indptr, + max_seqlen_qo, + kv_indptr, + kv_indices, + kv_last_page_lens, + sm_scale=sm_scale, + logit_cap=logit_cap, + ) + + @staticmethod + def triton_fp4_gemm_dynamic_qaunt( + x: torch.Tensor, + weight: torch.Tensor, + weight_scale: torch.Tensor, + out_dtype: torch.dtype | None = torch.bfloat16, + x_scales: torch.Tensor | None = None, + ) -> torch.Tensor: + from aiter.ops.triton.gemm_afp4wfp4 import gemm_afp4wfp4 + from aiter.ops.triton.quant import dynamic_mxfp4_quant + + if x_scales is None: + x_q, x_s = dynamic_mxfp4_quant(x) + else: + x_q = x + x_s = x_scales + + y = torch.empty( + x_q.shape[0], weight.shape[0], device=x_q.device, dtype=out_dtype + ) + + gemm_afp4wfp4(x_q, weight, x_s, weight_scale.T, out_dtype, y) + return y + + @staticmethod + def triton_rotary_embed( + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + cos_sin_cache: torch.Tensor, + head_size: int, + rotary_dim: int, + is_neox_style: bool, + ): + from aiter.ops.triton.rope import rope_cached_thd_positions_2c_fwd_inplace + + num_tokens = positions.numel() + cos, sin = cos_sin_cache.chunk(2, dim=-1) + query_shape = query.shape + key_shape = key.shape + rotate_style = 0 if is_neox_style else 1 + + query = query.view(num_tokens, -1, head_size) + key = key.view(num_tokens, -1, head_size) + query_ = query[..., :rotary_dim] + key_ = key[..., :rotary_dim] + positions = positions.view(*query.shape[:1]) + rope_cached_thd_positions_2c_fwd_inplace( + positions, + sin, + cos, + query_, + key_, + rotate_style, + reuse_freqs_front_part=True, + is_nope_first=False, + ) + query = query.view(query_shape) + key = key.view(key_shape) + + @staticmethod + def triton_fp8_bmm( + X: torch.Tensor, + WQ: torch.Tensor, + w_scale: torch.Tensor, + group_size: int = 128, + bias: torch.Tensor | None = None, + dtype: 
torch.dtype | None = torch.bfloat16, + splitK: int | None = None, + YQ: torch.Tensor | None = None, + transpose_bm: bool | None = False, + config: dict | None = None, + ) -> torch.Tensor: + # ruff: noqa: E501 # isort: skip + from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import ( + batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant as aiter_triton_fp8_bmm, + ) + + return aiter_triton_fp8_bmm( + X, + WQ, + w_scale, + group_size=group_size, + bias=bias, + dtype=dtype, + splitK=splitK, + YQ=YQ, + transpose_bm=transpose_bm, + config=config, + ) + + @staticmethod + def triton_gemm_a8w8_blockscale( + A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + block_size: list[int], + output_dtype: torch.dtype = torch.float16, + ) -> torch.Tensor: + from aiter.ops.triton.gemm_a8w8_blockscale import gemm_a8w8_blockscale + + return gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype) + + @staticmethod + def group_fp8_quant( + input_2d: torch.Tensor, + group_size: int = 128, + ) -> tuple[torch.Tensor, ...]: + assert group_size == 128, "Group size must be 128" + return torch.ops.vllm.rocm_aiter_group_fp8_quant(input_2d, group_size) + + @staticmethod + def is_triton_gemm_w8a8_tuned(n: int, k: int) -> bool: + return (n, k) in [ + (1024, 8192), + (2112, 7168), + (3072, 1536), + (32768, 8192), + (4096, 7168), + (4608, 7168), + (512, 7168), + (7168, 2048), + (7168, 256), + (8192, 1024), + (8192, 32768), + ] + + @staticmethod + def shuffle_weight( + self, tensor: torch.Tensor, layout: tuple[int, int] = (16, 16) + ) -> torch.Tensor: + from aiter.ops.shuffle import shuffle_weight + + return shuffle_weight(tensor, layout=layout) + + @staticmethod + def shuffle_weights( + *tensors: torch.Tensor, layout: tuple[int, int] = (16, 16) + ) -> tuple[torch.Tensor, ...]: + """ + Applies shuffle_weight function from AITER to each + input tensor and returns them. 
+ + Rearranges (shuffles) the input tensor/s + into a specified block layout for optimized computation. + + Args: + *tensors: Variable number of torch.Tensor objects. + layout: A pair of integers specifying the block sizes used to divide + the tensors during shuffling. Default is (16, 16). + + Returns: + A Tuple of shuffled tensors. + """ + from aiter.ops.shuffle import shuffle_weight + + return tuple(shuffle_weight(tensor, layout=layout) for tensor in tensors) + + +rocm_aiter_ops.register_ops_once() diff --git a/_bc_linter.py b/_bc_linter.py new file mode 100644 index 0000000..2929a8b --- /dev/null +++ b/_bc_linter.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# vllm/_bc_linter.py +from collections.abc import Callable +from typing import Any, TypeVar, overload + +T = TypeVar("T") + + +@overload +def bc_linter_skip(obj: T) -> T: ... + + +@overload +def bc_linter_skip(*, reason: str | None = ...) -> Callable[[T], T]: ... + + +def bc_linter_skip(obj: Any = None, *, reason: str | None = None): + """ + No-op decorator to mark symbols/files for BC-linter suppression. + + Usage: + @bc_linter_skip + def legacy_api(...): ... + """ + + def _wrap(x: T) -> T: + return x + + return _wrap if obj is None else obj + + +@overload +def bc_linter_include(obj: T) -> T: ... + + +@overload +def bc_linter_include(*, reason: str | None = ...) -> Callable[[T], T]: ... + + +def bc_linter_include(obj: Any = None, *, reason: str | None = None): + """ + Usage: + @bc_linter_include + def public_api(...): ... 
+ """ + + def _wrap(x: T) -> T: + return x + + return _wrap if obj is None else obj + + +__all__ = ["bc_linter_skip", "bc_linter_include"] diff --git a/_custom_ops.py b/_custom_ops.py new file mode 100644 index 0000000..54243aa --- /dev/null +++ b/_custom_ops.py @@ -0,0 +1,3512 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import TYPE_CHECKING, Literal, Optional, List, Dict, Any + +import torch +import torch.nn.functional as F +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.scalar_type import ScalarType + +import ixformer.inference.functions as ops +from ixformer.distributed import _distributed as cdist +import vllm.envs as envs +from ixformer.core import config +import math +_USE_TORCH_OPS = config.IXFORMER_USE_TORCH_OPS + +current_platform.import_kernels() + +if TYPE_CHECKING: + + def register_fake(fn): + return lambda name: fn +else: + try: + from torch.library import register_fake + except ImportError: + from torch.library import impl_abstract as register_fake + +# activation ops +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(x, out) + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(x, out) + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(x, out) + +def swigluoai_and_mul(out: torch.Tensor, x: torch.Tensor, + alpha: float = 1.702, limit: float = 7.0) -> None: + ops.swigluoai_and_mul(x, out, alpha, limit) + +#https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + x = 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) + out.copy_(x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + x = 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0)))) + 
out.copy_(x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + #inplace + out.copy_(x) + out.mul_(torch.sigmoid(x * 1.702)) + return out + + +# page attention ops +def paged_attention_v1( + out: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + num_kv_heads: int, + scale: float, + block_tables: torch.Tensor, + seq_lens: torch.Tensor, + block_size: int, + max_seq_len: int, + alibi_slopes: torch.Tensor | None, + kv_cache_dtype: str, + k_scale: torch.Tensor, + v_scale: torch.Tensor, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, +) -> None: + torch.ops._C.paged_attention_v1( + out, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + seq_lens, + block_size, + max_seq_len, + alibi_slopes, + kv_cache_dtype, + k_scale, + v_scale, + tp_rank, + blocksparse_local_blocks, + blocksparse_vert_stride, + blocksparse_block_size, + blocksparse_head_sliding_step, + ) + +def paged_attention_v2( + out: torch.Tensor, + exp_sum: torch.Tensor, + max_logits: torch.Tensor, + tmp_out: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + num_kv_heads: int, + scale: float, + block_tables: torch.Tensor, + seq_lens: torch.Tensor, + block_size: int, + max_seq_len: int, + alibi_slopes: torch.Tensor | None, + kv_cache_dtype: str, + k_scale: torch.Tensor, + v_scale: torch.Tensor, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, +) -> None: + torch.ops._C.paged_attention_v2( + out, + exp_sum, + max_logits, + tmp_out, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + seq_lens, + block_size, + max_seq_len, + alibi_slopes, + kv_cache_dtype, + k_scale, + v_scale, + tp_rank, + 
def convert_vertical_slash_indexes(
    q_seqlens: torch.Tensor,  # [BATCH, ]
    kv_seqlens: torch.Tensor,  # [BATCH, ]
    vertical_indexes: torch.Tensor,  # [BATCH, N_HEADS, NNZ_V]
    slash_indexes: torch.Tensor,  # [BATCH, N_HEADS, NNZ_S]
    context_size: int,
    block_size_M: int,
    block_size_N: int,
    causal: bool = True,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Allocate the four sparse-index buffers and fill them via the ``_C`` kernel.

    The buffers are zero-initialised, share the dtype and device of
    ``q_seqlens``, and have one row per ``block_size_M``-sized slice of
    ``context_size`` (rounded up).

    Returns:
        ``(block_count, block_offset, column_count, column_index)`` with shapes
        ``[B, H, rows]``, ``[B, H, rows, NNZ_S]``, ``[B, H, rows]`` and
        ``[B, H, rows, NNZ_V]``.
    """
    batch_size = slash_indexes.size(0)
    num_heads = slash_indexes.size(1)
    nnz_slash = slash_indexes.size(2)
    nnz_vertical = vertical_indexes.size(2)
    num_rows = (context_size + block_size_M - 1) // block_size_M

    # Every output buffer mirrors q_seqlens' dtype and device.
    like_q = {"dtype": q_seqlens.dtype, "device": q_seqlens.device}
    per_row = (batch_size, num_heads, num_rows)

    block_count = torch.zeros(*per_row, **like_q)
    block_offset = torch.zeros(*per_row, nnz_slash, **like_q)
    column_count = torch.zeros(*per_row, **like_q)
    column_index = torch.zeros(*per_row, nnz_vertical, **like_q)
    outputs = (block_count, block_offset, column_count, column_index)

    # The kernel writes its results into the pre-allocated buffers.
    torch.ops._C.convert_vertical_slash_indexes(
        *outputs,
        q_seqlens,
        kv_seqlens,
        vertical_indexes,
        slash_indexes,
        context_size,
        block_size_M,
        block_size_N,
        causal,
    )
    return outputs
def fused_add_rms_norm(
    input: torch.Tensor,
    residual: torch.Tensor,
    weight: torch.Tensor,
    epsilon: float,
    residual_alpha: Optional[float] = 1,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Run the fused residual + RMS-norm kernel and return both of its results.

    This is a thin wrapper over ``ops.residual_rms_norm``. Unlike the other
    norm wrappers in this file it *returns* its results, so the previous
    ``-> None`` annotation was wrong and hid the values from type checkers.

    Args:
        input: Tensor to be normalised.
        residual: Residual tensor forwarded to the kernel.
        weight: RMS-norm weight.
        epsilon: Numerical-stability constant forwarded to the kernel.
        residual_alpha: Scalar forwarded to the kernel.
            NOTE(review): presumably a scale applied to the residual term;
            confirm against ``ops.residual_rms_norm``.

    Returns:
        ``(output, residual_output)`` exactly as produced by the kernel.
    """
    # Note the kernel's argument order: residual goes last.
    output, residual_output = ops.residual_rms_norm(
        input, weight, epsilon, residual_alpha, residual
    )
    return output, residual_output
def apply_repetition_penalties(
    logits: torch.Tensor,
    prompt_mask: torch.Tensor,
    output_mask: torch.Tensor,
    repetition_penalties: torch.Tensor,
) -> None:
    """Apply repetition penalties to ``logits`` in place.

    This entry point always dispatches to the pure-PyTorch implementation,
    ``apply_repetition_penalties_torch``.

    Args:
        logits: The logits tensor of shape [num_seqs, vocab_size].
        prompt_mask: A boolean tensor indicating which tokens appear in the prompt.
        output_mask: A boolean tensor indicating which tokens appear in the output.
        repetition_penalties: The repetition penalties of shape (num_seqs, ).
    """
    apply_repetition_penalties_torch(
        logits=logits,
        prompt_mask=prompt_mask,
        output_mask=output_mask,
        repetition_penalties=repetition_penalties,
    )
def gptq_gemm(
    a: torch.Tensor,
    b_q_weight: torch.Tensor,
    b_gptq_qzeros: torch.Tensor,
    b_gptq_scales: torch.Tensor,
    b_g_idx: torch.Tensor,
    use_exllama: bool,
    use_v2_format: bool,
    bit: int,
) -> torch.Tensor:
    """Run the GPTQ GEMM through ``ops.gptq_gemm``.

    ``use_v2_format`` is accepted for signature compatibility only; it is not
    forwarded to the kernel.

    Raises:
        NotImplementedError: if ``use_v2_format`` is truthy.
    """
    if not use_v2_format:
        return ops.gptq_gemm(
            a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, use_exllama, bit
        )
    raise NotImplementedError("gptq_gemm not support use_v2_format")
@register_fake("_C::awq_gemm")
def _awq_gemm_fake(
    input: torch.Tensor,
    qweight: torch.Tensor,
    qzeros: torch.Tensor,
    scales: torch.Tensor,
    split_k_iters: torch.SymInt,
) -> torch.Tensor:
    """Fake (shape-only) implementation registered for ``_C::awq_gemm``.

    Builds an uninitialised ``(split_k_iters, rows, qweight.size(1) * 8)``
    tensor with ``input``'s dtype and device and reduces it over dim 0, so the
    result has shape ``(rows, qweight.size(1) * 8)``.
    """
    partial_shape = (split_k_iters, input.size(0), qweight.size(1) * 8)
    partials = torch.empty(partial_shape, dtype=input.dtype, device=input.device)
    return partials.sum(0)
@register_fake("_C::cutlass_w4a8_mm")
def cutlass_w4a8_mm_fake(
    a: torch.Tensor,
    # b_q should be the tensor returned by cutlass_encode_and_reorder_int4b
    b_q: torch.Tensor,
    b_group_scales: torch.Tensor,
    b_group_size: int,
    b_channel_scales: torch.Tensor,
    a_token_scales: torch.Tensor,
    out_type: torch.dtype | None = None,
    maybe_schedule: str | None = None,
) -> torch.Tensor:
    """Fake (shape-only) implementation registered for ``_C::cutlass_w4a8_mm``.

    Returns an uninitialised ``(a.size(0), b_q.size(1))`` tensor on ``a``'s
    device. Its dtype is ``out_type`` when given, otherwise bfloat16.
    """
    result_shape = (a.size(0), b_q.size(1))
    if out_type is None:
        result_dtype = torch.bfloat16
    else:
        result_dtype = out_type
    return torch.empty(result_shape, device=a.device, dtype=result_dtype)
@register_fake("_C::ggml_dequantize")
def _ggml_dequantize_fake(
    W: torch.Tensor,
    quant_type: int,
    m: torch.SymInt,
    n: torch.SymInt,
    dtype: torch.dtype | None = None,
) -> torch.Tensor:
    """Fake (shape-only) implementation registered for ``_C::ggml_dequantize``.

    Returns an uninitialised ``(m, n)`` tensor on ``W``'s device. The dtype is
    ``dtype`` when provided and float16 otherwise. Previously ``dtype`` was
    accepted but ignored, so the fake always reported float16.

    NOTE(review): assumes the real kernel honours ``dtype`` and falls back to
    float16 when it is omitted; confirm against the kernel implementation.
    """
    out_dtype = torch.float16 if dtype is None else dtype
    return torch.empty((m, n), dtype=out_dtype, device=W.device)
torch.ops._C.cutlass_scaled_mm_supports_fp4(cuda_device_capability) + +def cutlass_blockwise_scaled_grouped_mm( + output: torch.Tensor, + a: torch.Tensor, + b: torch.Tensor, + scales_a: torch.Tensor, + scales_b: torch.Tensor, + problem_sizes: torch.Tensor, + expert_offsets: torch.Tensor, +): + torch.ops._C.cutlass_blockwise_scaled_grouped_mm( + output, a, b, scales_a, scales_b, problem_sizes, expert_offsets + ) + + +def cutlass_scaled_fp4_mm( + a: torch.Tensor, + b: torch.Tensor, + block_scale_a: torch.Tensor, + block_scale_b: torch.Tensor, + alpha: torch.Tensor, + out_dtype: torch.dtype, +) -> torch.Tensor: + assert a.ndim == 2 and b.ndim == 2 + m, n = a.shape[0], b.shape[0] + out = torch.empty((m, n), dtype=out_dtype, device=a.device) + torch.ops._C.cutlass_scaled_fp4_mm(out, a, b, block_scale_a, block_scale_b, alpha) + return out + + +def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool: + return False + + +def cutlass_scaled_mm_supports_block_fp8(cuda_device_capability: int) -> bool: + return False + + +def cutlass_scaled_mm( + a: torch.Tensor, + b: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: torch.dtype, + bias: torch.Tensor | None = None, + format: str = "TN" +) -> torch.Tensor: + """ + `cutlass_scaled_mm` implements a fused version of + `output = torch.mm((scale_a * a), (scale_b * b)).to(out_dtype)` + where scale_a * a and scale_b * b are implemented using numpy-style + broadcasting. + + In order to support blockwise scaling like found in DeepSeek V3 we also + support extended "group" broadcast rules. 
We extend the numpy-style + broadcasting rules with the following rule: + "if the extent of a dimension in the source shape is between 1 and + corresponding extent in the target shape we repeat each element along + that dimension src_shape[dim] // target_shape[dim] times consecutively" + example if we have: + a = [[1, 2], and target_shape = (2, 4) + [3, 4]] + then we would expand a to: + a = [[1, 1, 2, 2], + [3, 3, 4, 4]] + currently we only support the case: + scale_a.shape * [1, 128] == a.shape + scale_b.shape * [128, 128] == b.shape + """ + assert out_dtype is torch.bfloat16 or out_dtype is torch.float16 + assert bias is None or bias.numel() == b.shape[1] and bias.dtype == out_dtype + + + m = a.shape[0] + n = b.shape[1] + if format == "TN": + b = b.t() + out = torch.empty((m, n), dtype=out_dtype, device=a.device) + + ops.w8a8(a, b, scale_a, scale_b, bias, format=format, output=out, out_dtype=out_dtype) + + return out + + +def cutlass_scaled_mm_azp( + a: torch.Tensor, + b: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: torch.dtype, + azp_adj: torch.Tensor, + azp: torch.Tensor | None = None, + bias: torch.Tensor | None = None, +) -> torch.Tensor: + """ + :param azp_adj: In the per-tensor case, this should include the azp. + Always per-channel. + :param azp: Only set in the per-token case. Per-token if set. 
def cutlass_group_gemm_supported(cuda_device_capability: int) -> bool:
    """Report whether the ``_C`` extension supports CUTLASS grouped GEMM.

    Returns whatever ``torch.ops._C.cutlass_group_gemm_supported`` reports for
    the given capability, or ``False`` when looking up or calling the op raises
    ``AttributeError`` (e.g. on non-CUDA platforms where it is not available).
    """
    try:
        supported = torch.ops._C.cutlass_group_gemm_supported(cuda_device_capability)
    except AttributeError:
        # The op is not registered on this platform.
        return False
    return supported
+ - Each metadata element encodes the sparsity of 4 non-zero elements (i.e., `elemsPerMetaElem = 4`). + - The shape of `a_nzs` is `(m, k // 2)`, where `m` and `k` are the dimensions of the input tensor. + - The shape of `a_meta` is `(m, k // 2 // elemsPerMetaElem)`. + """ + assert a.dtype in [torch.int8, torch.float8_e4m3fn, torch.bfloat16, torch.float16] + assert a.is_contiguous() + + # a_meta.dtype: torch.uint8 so elemsPerMetaElem = 8b / 2b_per_nz = 4 + elemsPerMetaElem = 4 + assert a.shape[1] % (2 * elemsPerMetaElem) == 0 + + return torch.ops._C.cutlass_sparse_compress(a) + + +def cutlass_scaled_sparse_mm( + a: torch.Tensor, + bt_nzs: torch.Tensor, + bt_meta: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: torch.dtype, + bias: torch.Tensor | None = None, +) -> torch.Tensor: + """ + Performs a scaled sparse matrix multiplication using Cutlass. + + Steps: + 1. Create a dense matrix `a` of shape (m, k) on the CUDA device: + `a = torch.randn((m, k), device='cuda')`. + + 2. Create a dense matrix `b` of shape (k, n) on the CUDA device: + `b = torch.randn((k, n), device='cuda')`. + + 3. Prune matrix `b` to 2:4 sparsity along the specified dimension: + `b = prune_to_2_4(b, dim=0)`. + + 4. Compress the transposed sparse matrix `b.t()`: + `bt_nzs, bt_meta = cutlass_sparse_compress(b.t())`. + + 5. Perform sparse matrix multiplication using the compressed matrix, + applying scaling factors for `a` and `b`, and the output data type: + `out = cutlass_scaled_sparse_mm(a, bt_nzs, bt_meta, scale_a, scale_b, out_dtype)`. + + Returns: + - The result of the scaled sparse matrix multiplication. 
+ """ + assert bt_nzs.shape[0] % 16 == 0 and bt_nzs.shape[1] % 16 == 0 + assert out_dtype is torch.bfloat16 or out_dtype is torch.float16 + assert bias is None or bias.shape[0] == bt_nzs.shape[0] and bias.dtype == out_dtype + + m = a.shape[0] + n = bt_nzs.shape[0] + out = torch.empty((m, n), dtype=out_dtype, device=a.device) + + torch.ops._C.cutlass_scaled_sparse_mm( + out, a, bt_nzs, bt_meta, scale_a, scale_b, bias + ) + + return out + + +def get_cutlass_moe_mm_data( + topk_ids: torch.Tensor, + expert_offsets: torch.Tensor, + problem_sizes1: torch.Tensor, + problem_sizes2: torch.Tensor, + input_permutation: torch.Tensor, + output_permutation: torch.Tensor, + num_experts: int, + n: int, + k: int, + blockscale_offsets: torch.Tensor | None = None, +): + """ + Prepare data necessary to perform CUTLASS grouped matrix multiplications + used in CUTLASS-based fused MoE. + + The function takes in topk_ids (token-expert mapping) and uses it to + compute: + - expert_offsets: Indices that mark at which token index each expert begins + its computation after the input is sorted with + input_permutation. The number of tokens computed with + expert E is expert_offsets[E + 1] - expert_offsets[E] + - problem_sizes1, problem_sizes2: MxNxK sizes of each expert's + multiplication in two grouped MMs used in + the fused MoE operation. + - input_permutation: Permutation that must be used to shuffle the input + before executing the MMs. + - output_permutation: Permutation that must be used to shuffle the output + after executing the MMs. + - blockscale_offsets: Optional argument passed for fp4 moe. Indices that + mark at which block scale index each expert begins + its computation. 
def shuffle_rows(input_tensor: torch.Tensor, dst2src_map: torch.Tensor):
    """
    Shuffle and expand the rows of input_tensor according to dst2src_map.

    A new tensor with one row per entry of dst2src_map (and the same column
    count, dtype and device as input_tensor) is allocated, filled by the
    `_moe_C.shuffle_rows` kernel, and returned.
    This is used in MoE to permute the input tensor before performing grouped
    matrix multiplications.
    """
    permuted_shape = (dst2src_map.shape[0], input_tensor.shape[1])
    permuted = torch.empty(
        permuted_shape,
        device=input_tensor.device,
        dtype=input_tensor.dtype,
    )
    torch.ops._moe_C.shuffle_rows(input_tensor, dst2src_map, permuted)
    return permuted
def cutlass_moe_mm(
    out_tensors: torch.Tensor,
    a_tensors: torch.Tensor,
    b_tensors: torch.Tensor,
    a_scales: torch.Tensor,
    b_scales: torch.Tensor,
    expert_offsets: torch.Tensor,
    problem_sizes: torch.Tensor,
    a_strides: torch.Tensor,
    b_strides: torch.Tensor,
    c_strides: torch.Tensor,
    per_act_token: bool,
    per_out_ch: bool,
):
    """
    Run one grouped matrix multiplication of the CUTLASS-based fused MoE.
    The kernel performs the fp8-quantized OUT = AB matrix multiplication.

    - expert_offsets: Indices marking the token index at which each expert
                      begins its computation. Expert E processes
                      expert_offsets[E + 1] - expert_offsets[E] tokens.
    - problem_sizes: MxNxK sizes of each expert's multiplication in two grouped
                     MMs used in the fused MoE operation.
    - a/b/c_strides: The data strides passed to grouped matrix multiplication.
    """
    # Arguments are forwarded positionally, in the order the kernel expects.
    kernel_args = (
        out_tensors,
        a_tensors,
        b_tensors,
        a_scales,
        b_scales,
        expert_offsets,
        problem_sizes,
        a_strides,
        b_strides,
        c_strides,
        per_act_token,
        per_out_ch,
    )
    return torch.ops._C.cutlass_moe_mm(*kernel_args)
+ """ + return torch.ops._C.cutlass_fp4_group_mm( + out_tensors, + a_tensors, + b_tensors, + a_scales, + b_scales, + alphas, + problem_sizes, + expert_offsets, + sf_offsets, + ) + +# gptq_marlin +def gptq_marlin_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + return torch.ops._C.gptq_marlin_repack(b_q_weight, perm, size_k, size_n, num_bits) + + +if hasattr(torch.ops._C, "gptq_marlin_repack"): + + @register_fake("_C::gptq_marlin_repack") + def _gptq_marlin_repack_fake( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: torch.SymInt, + size_n: torch.SymInt, + num_bits: int, + ) -> torch.Tensor: + pack_factor = 32 // num_bits + marlin_tile_size = 16 + return torch.empty( + (size_k // marlin_tile_size, size_n * marlin_tile_size // pack_factor), + dtype=b_q_weight.dtype, + device=b_q_weight.device, + ) + + +# awq_marlin +def awq_marlin_repack( + b_q_weight: torch.Tensor, size_k: int, size_n: int, num_bits: int +) -> torch.Tensor: + return torch.ops._C.awq_marlin_repack(b_q_weight, size_k, size_n, num_bits) + + +if hasattr(torch.ops._C, "awq_marlin_repack"): + + @register_fake("_C::awq_marlin_repack") + def _awq_marlin_repack_fake( + b_q_weight: torch.Tensor, + size_k: torch.SymInt, + size_n: torch.SymInt, + num_bits: int, + ) -> torch.Tensor: + pack_factor = 32 // num_bits + marlin_tile_size = 16 + return torch.empty( + (size_k // marlin_tile_size, size_n * marlin_tile_size // pack_factor), + dtype=b_q_weight.dtype, + device=b_q_weight.device, + ) + + +def gptq_marlin_moe_repack( + b_q_weight: torch.Tensor, + perm: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + num_experts = b_q_weight.shape[0] + assert size_k % 16 == 0 + output = torch.empty( + (num_experts, size_k // 16, size_n * (num_bits // 2)), + device=b_q_weight.device, + dtype=b_q_weight.dtype, + ) + for e in range(num_experts): + output[e] = torch.ops._C.gptq_marlin_repack( + 
def awq_marlin_moe_repack(
    b_q_weight: torch.Tensor,
    perm: torch.Tensor,
    size_k: int,
    size_n: int,
    num_bits: int,
) -> torch.Tensor:
    """
    Repack the AWQ weight of every expert with ``awq_marlin_repack``.

    Args:
        b_q_weight: Stacked expert weights; dimension 0 indexes the expert.
        perm: Accepted for signature parity with the GPTQ variant; it is not
            read by this function.
        size_k: K dimension, must be a multiple of 16.
        size_n: N dimension.
        num_bits: Bit width of the quantized weights.

    Returns:
        A tensor of shape
        ``(num_experts, size_k // 16, size_n * (num_bits // 2))`` with the
        dtype and device of ``b_q_weight``.
    """
    num_experts = b_q_weight.shape[0]
    assert size_k % 16 == 0

    packed_shape = (num_experts, size_k // 16, size_n * (num_bits // 2))
    output = torch.empty(
        packed_shape, device=b_q_weight.device, dtype=b_q_weight.dtype
    )

    # One kernel call per expert; each result is copied into its output slot.
    for expert_id in range(num_experts):
        expert_weight = b_q_weight[expert_id]
        output[expert_id] = torch.ops._C.awq_marlin_repack(
            expert_weight, size_k, size_n, num_bits
        )
    return output
def scaled_fp4_quant(
    input: torch.Tensor, input_global_scale: torch.Tensor
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Quantize ``input`` to FP4 and return the packed values with their scales.

    Quantization runs along the last dimension. Each run of 16 consecutive
    elements shares one dynamically computed scaling factor, which is itself
    quantized with ``input_global_scale`` and stored in a swizzled layout (see
    https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-scale-factor-b-layout-4x).

    Args:
        input: Tensor to quantize; fp16 or bf16, last dimension a multiple
            of 16. All leading dimensions are flattened into one.
        input_global_scale: A scalar scaling factor for the entire tensor.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: the FP4 output with two values
        packed per uint8, and the float8_e4m3 scaling factors in the
        swizzled layout.
    """
    assert not current_platform.is_rocm()
    assert input.ndim >= 1, f"input.ndim needs to be >= 1, but got {input.ndim}."

    # Flatten every leading dimension so the kernel works on a 2-D matrix.
    leading = 1 if input.ndim == 1 else -1
    input = input.reshape(leading, input.shape[-1])
    m, n = input.shape
    block_size = 16
    device = input.device

    assert n % block_size == 0, f"last dim has to be multiple of 16, but got {n}."
    assert input.dtype in (torch.float16, torch.bfloat16), (
        f"input.dtype needs to be fp16 or bf16 but got {input.dtype}."
    )

    # Two fp4 values are packed into each uint8.
    output = torch.empty((m, n // 2), device=device, dtype=torch.uint8)

    def _ceil_to_multiple(value, multiple):
        return (value + multiple - 1) // multiple * multiple

    # The Tensor Core needs scale tiles of at least 128x4, so the scale grid
    # is padded to multiples of 128 rows and 4 columns. Four float8_e4m3fn
    # scales are packed into one int32. More:
    # https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-scale-factor-b-layout-4x
    rounded_m = _ceil_to_multiple(m, 128)
    scale_n = n // block_size
    rounded_n = _ceil_to_multiple(scale_n, 4)
    output_scale = torch.empty(
        (rounded_m, rounded_n // 4), device=device, dtype=torch.int32
    )

    torch.ops._C.scaled_fp4_quant(output, input, output_scale, input_global_scale)
    return output, output_scale.view(torch.float8_e4m3fn)
def scaled_fp8_quant(
    input: torch.Tensor,
    scale: torch.Tensor | None = None,
    num_token_padding: int | None = None,
    scale_ub: torch.Tensor | None = None,
    use_per_token_if_dynamic: bool = False,
    output: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Quantize input tensor to FP8 and return quantized tensor and scale.

    This function supports both static and dynamic quantization: If you
    provide the scale, it will use static scaling and if you omit it,
    the scale will be determined dynamically. The function also allows
    optional padding of the output tensors for downstream kernels that
    will benefit from padding.

    Args:
        input: The 2-D input tensor to be quantized to FP8.
        scale: Optional scaling factor for the FP8 quantization. Must hold
            exactly one element when given.
        num_token_padding: If specified (and non-zero), pad the first
            dimension of the output to at least this value. Not supported
            together with ``output``.
        scale_ub: Optional upper bound for scaling factor in dynamic
            per token case.
        use_per_token_if_dynamic: Whether to do per_tensor or per_token
            in the dynamic quantization case.
        output: Optional preallocated destination tensor; its dtype must be
            the platform FP8 dtype.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
            scaling factor.
    """
    # This code assumes batch_dim and num_tokens are flattened
    assert input.ndim == 2
    shape: tuple[int, int] | torch.Size = input.shape
    # For ROCm on MI300, the output fp8 dtype is torch.float8_e4m3fnuz
    out_dtype: torch.dtype = current_platform.fp8_dtype()
    if num_token_padding:
        shape = (max(num_token_padding, input.shape[0]), shape[1])
    if output is None:
        output = torch.empty(shape, device=input.device, dtype=out_dtype)
    else:
        # Use the same truthiness test as the sizing branch above, so that
        # `num_token_padding=0` (no padding requested) is accepted here too.
        assert not num_token_padding, "padding not supported if output passed in"
        assert output.dtype == out_dtype

    if scale is None:
        if use_per_token_if_dynamic:
            # One scale per (possibly padded) output row.
            scale = torch.empty((shape[0], 1), device=input.device, dtype=torch.float32)
            torch.ops._C.dynamic_per_token_scaled_fp8_quant(
                output, input, scale, scale_ub
            )
        else:
            scale = torch.empty((1, 1), device=input.device, dtype=torch.float32)
            torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
    else:
        assert scale.numel() == 1, f"{scale.shape}"
        torch.ops._C.static_scaled_fp8_quant(output, input, scale)

    return output, scale
def scaled_int8_quant(
    input: torch.Tensor,
    scale: torch.Tensor | None = None,
    azp: torch.Tensor | None = None,
    symmetric: bool = True,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
    """
    Quantize the input tensor to int8 and return the quantized tensor and
    scale, and maybe azp.

    Args:
        input: The input tensor to be quantized to int8.
        scale: Optional scaling factor for the int8 quantization.
            When not provided, we invoke dynamic-per-token quantization.
        azp: Optional zero-point for the int8 quantization.
            Must be provided for asymmetric quantization if `scale` is
            provided.
        symmetric: Whether to use symmetric quantization (scale only, azp
            ignored).

    Returns:
        tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: Output
        int8 tensor, scales, and optionally azp.
    """
    output = torch.empty_like(input, dtype=torch.int8)
    if scale is not None:
        # static-per-tensor quantization.
        assert symmetric == (azp is None), (
            "azp must only be provided for asymmetric quantization."
        )
        # NOTE(review): `azp` is validated and returned but is not passed to
        # the kernel call below -- confirm the backend supports asymmetric
        # static quantization before relying on it.
        ops.static_scaled_int8_quant(output, input, scale)
        return output, scale, azp

    # dynamic-per-token quantization: one scale per row of the flattened input.
    input_scales = torch.empty(
        (input.numel() // input.shape[-1], 1), device=input.device, dtype=torch.float32
    )
    # The kernel call below only receives the scale buffer, so nothing ever
    # writes the zero-points. Zero-initialise them instead of returning
    # uninitialised memory; a zero zero-point is consistent with a
    # scale-only quantization.
    # NOTE(review): confirm the backend kernel quantizes symmetrically.
    input_azp = (
        None if symmetric else torch.zeros_like(input_scales, dtype=torch.int32)
    )
    ops.dynamic_scaled_int8_quant(output, input, input_scales)
    return output, input_scales, input_azp
def wvSplitKQ(
    a: torch.Tensor,
    b: torch.Tensor,
    out_dtype: torch.dtype,
    scale_a: torch.Tensor,
    scale_b: torch.Tensor,
    cu_count: int,
    bias: torch.Tensor = None,
) -> torch.Tensor:
    """
    Call the ROCm ``wvSplitKQ`` kernel and return its freshly allocated output.

    Args:
        a: First operand; its first dimension gives the output's column count.
        b: Second operand; its first dimension gives the output's row count
            and its device is used for the output.
        out_dtype: dtype of the output tensor.
        scale_a: Scale forwarded for ``a``.
        scale_b: Scale forwarded for ``b``.
        cu_count: Forwarded to the kernel as-is.
        bias: Optional bias forwarded to the kernel.

    Returns:
        Tensor of shape ``(b.shape[0], a.shape[0])`` filled in by the kernel.
    """
    out_rows = b.shape[0]
    out_cols = a.shape[0]
    result = torch.empty((out_rows, out_cols), dtype=out_dtype, device=b.device)
    # The kernel writes into `result` in place.
    torch.ops._rocm_C.wvSplitKQ(a, b, bias, result, scale_a, scale_b, cu_count)
    return result
def moe_wna16_gemm(
    input: torch.Tensor,
    output: torch.Tensor,
    b_qweight: torch.Tensor,
    b_scales: torch.Tensor,
    b_qzeros: torch.Tensor | None,
    topk_weights: torch.Tensor | None,
    sorted_token_ids: torch.Tensor,
    experts_ids: torch.Tensor,
    num_tokens_post_pad: torch.Tensor,
    top_k: int,
    BLOCK_SIZE_M: int,
    BLOCK_SIZE_N: int,
    BLOCK_SIZE_K: int,
    bit: int,
) -> torch.Tensor:
    """
    Run the ``_moe_C.moe_wna16_gemm`` kernel, which is CUDA-only.

    All arguments are forwarded to the kernel unchanged and in order.

    Args:
        input: Activations passed to the kernel.
        output: Tensor handed to the kernel as its output argument.
        b_qweight: Quantized expert weights.
        b_scales: Weight scales.
        b_qzeros: Optional weight zero-points.
        topk_weights: Optional routing weights.
        sorted_token_ids: Token ids sorted for block alignment.
        experts_ids: Expert id per block.
        num_tokens_post_pad: Token count after padding.
        top_k: Number of experts selected per token.
        BLOCK_SIZE_M: Tile size along M.
        BLOCK_SIZE_N: Tile size along N.
        BLOCK_SIZE_K: Tile size along K.
        bit: Weight bit width.

    Returns:
        The ``output`` tensor that was passed in.

    Raises:
        NotImplementedError: If the current platform is not CUDA.
    """
    if not current_platform.is_cuda():
        raise NotImplementedError(
            "The optimized moe_wna16_gemm kernel is only available on CUDA platforms"
        )
    torch.ops._moe_C.moe_wna16_gemm(
        input,
        output,
        b_qweight,
        b_scales,
        b_qzeros,
        topk_weights,
        sorted_token_ids,
        experts_ids,
        num_tokens_post_pad,
        top_k,
        BLOCK_SIZE_M,
        BLOCK_SIZE_N,
        BLOCK_SIZE_K,
        bit,
    )
    # The signature promises a tensor, but the function used to fall off the
    # end and return None. Hand back the caller's output buffer.
    return output
def grouped_topk(
    scores: torch.Tensor,
    num_expert_group: int,
    topk_group: int,
    topk: int,
    renormalize: bool,
    routed_scaling_factor: float,
    bias: torch.Tensor,
    scoring_func: int = 0,
):
    """
    Fused grouped top-k expert routing for mixture-of-experts (CUDA only).

    Args:
        scores: Raw inputs (logits if scoring_func=1, scores if scoring_func=0)
        num_expert_group: Number of expert groups
        topk_group: Number of groups to select
        topk: Number of experts to select per token
        renormalize: Whether to renormalize the output weights
        routed_scaling_factor: Scaling factor for routing weights
        bias: Bias tensor (e_score_correction_bias). Always fused in kernel.
        scoring_func: 0=none (no activation), 1=sigmoid

    Returns:
        The result of ``torch.ops._moe_C.grouped_topk``.

    Raises:
        NotImplementedError: If the current platform is not CUDA.
    """
    if current_platform.is_cuda():
        kernel_args = (
            scores,
            num_expert_group,
            topk_group,
            topk,
            renormalize,
            routed_scaling_factor,
            bias,
            scoring_func,
        )
        return torch.ops._moe_C.grouped_topk(*kernel_args)

    raise NotImplementedError(
        "The fused grouped_topk kernel is only available on CUDA platforms"
    )
# Fake (meta) implementations: they only allocate an output of the right
# shape, dtype and device, and never touch tensor data.
if hasattr(torch.ops, "_moe_C") and hasattr(torch.ops._moe_C, "marlin_gemm_moe"):

    @register_fake("_moe_C::marlin_gemm_moe")
    def marlin_gemm_moe_fake(
        a: torch.Tensor,
        b_q_weights: torch.Tensor,
        sorted_ids: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        b_scales: torch.Tensor,
        b_zero_points: torch.Tensor,
        g_idx: torch.Tensor,
        perm: torch.Tensor,
        workspace: torch.Tensor,
        b_q_type: ScalarType,
        size_m: torch.SymInt,
        size_n: torch.SymInt,
        size_k: torch.SymInt,
        is_k_full: bool,
        num_experts: int,
        topk: int,
        moe_block_size: int,
        replicate_input: bool,
        apply_weights: bool,
    ) -> torch.Tensor:
        """Return an empty ``(size_m, topk, size_n)`` tensor shaped like the op's output."""
        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)

    @register_fake("_moe_C::moe_wna16_marlin_gemm")
    def moe_wna16_marlin_gemm_fake(
        input: torch.Tensor,
        output: torch.Tensor | None,
        b_qweight: torch.Tensor,
        b_bias: torch.Tensor | None,
        b_scales: torch.Tensor,
        global_scale: torch.Tensor | None,
        b_qzeros: torch.Tensor | None,
        g_idx: torch.Tensor | None,
        perm: torch.Tensor | None,
        workspace: torch.Tensor,
        sorted_token_ids: torch.Tensor,
        expert_ids: torch.Tensor,
        num_tokens_past_padded: torch.Tensor,
        topk_weights: torch.Tensor,
        moe_block_size: int,
        top_k: int,
        mul_topk_weights: bool,
        is_ep: bool,
        b_q_type: ScalarType,
        size_m: int,
        size_n: int,
        size_k: int,
        is_k_full: bool,
        use_atomic_add: bool,
        use_fp32_reduce: bool,
        is_zp_float: bool,
    ) -> torch.Tensor:
        """Return an empty ``(size_m * top_k, size_n)`` tensor shaped like the op's output.

        The parameter list mirrors, position for position, the arguments the
        ``moe_wna16_marlin_gemm`` wrapper passes to the op. That includes
        ``b_bias`` and ``global_scale``; without them the positional
        arguments would be shifted or the call would fail on arity.
        """
        return torch.empty(
            (size_m * top_k, size_n), dtype=input.dtype, device=input.device
        )
def reshape_and_cache_flash(
    key: torch.Tensor,
    value: torch.Tensor,
    key_cache: torch.Tensor,
    value_cache: torch.Tensor,
    slot_mapping: torch.Tensor,
    kv_cache_dtype: str,
    k_scale: torch.Tensor,
    v_scale: torch.Tensor,
) -> None:
    """
    Forward key/value tensors to the backend ``reshape_and_cache_flash`` op.

    Args:
        key: Key tensor to store.
        value: Value tensor to store.
        key_cache: Destination key cache.
        value_cache: Destination value cache.
        slot_mapping: Cache slot for each token.
        kv_cache_dtype: Cache dtype identifier passed to the backend.
        k_scale: Accepted but not used; the backend always receives 1.0.
        v_scale: Accepted but not used; the backend always receives 1.0.
    """
    # NOTE(review): the caller-provided k_scale / v_scale are ignored and a
    # unit scale is hard-coded -- confirm this is intended for this backend.
    unit_k_scale = 1.0
    unit_v_scale = 1.0
    ops.reshape_and_cache_flash(
        key,
        value,
        key_cache,
        value_cache,
        slot_mapping,
        kv_cache_dtype,
        unit_k_scale,
        unit_v_scale,
    )
def indexer_k_cache(
    k: torch.Tensor, kv_cache: torch.Tensor, slot_mapping: torch.Tensor
) -> None:
    """
    Scatter per-token key vectors into a paged cache (pure-PyTorch path).

    Slot ``s`` addresses ``kv_cache[s // block_size, s % block_size]``.

    Args:
        k: Keys of shape ``[num_tokens, head_dim]``.
        kv_cache: Cache of shape ``[num_blocks, block_size, head_dim]``,
            updated in place. Values are cast to the cache dtype.
        slot_mapping: Flat cache slot for each token. Negative entries mark
            padding tokens and are skipped. Slots are expected to be
            unique; with duplicates, which token wins is unspecified.

    Raises:
        AssertionError: If ``head_dim`` differs from the cache's last dim.
    """
    num_tokens, head_dim = k.shape
    _, block_size, cache_stride = kv_cache.shape
    assert head_dim == cache_stride

    slots = slot_mapping[:num_tokens].reshape(-1)
    # Skip padding slots: a negative slot would otherwise wrap around under
    # Python indexing rules and overwrite the last entries of the cache.
    valid = slots >= 0
    slots = slots[valid].to(device=kv_cache.device, dtype=torch.long)

    block_idx = torch.div(slots, block_size, rounding_mode="floor")
    block_offset = slots % block_size

    # Single vectorised scatter instead of one Python-level write per token.
    rows = k[valid.to(k.device)].to(device=kv_cache.device, dtype=kv_cache.dtype)
    kv_cache[block_idx, block_offset] = rows
def ref_paged_mqa_logits(
    q: torch.Tensor,
    kv_cache: torch.Tensor,
    weights: torch.Tensor,
    context_lens: torch.Tensor,
    block_tables: torch.Tensor,
    max_model_len: int,
    clean_logits: bool = True
) -> torch.Tensor:
    """PyTorch reference for multi-query-attention logits over a paged KV cache.

    For every sequence the keys are gathered from the paged cache, and each
    query token's logit at position ``p`` is
    ``sum_h weights[token, h] * relu(q[token, h] . k[p])``.

    Args:
        q: Query tensor ``[B, next_n, H, D]``.
        kv_cache: Paged cache ``[num_blocks, block_size, 1, D]``; must have
            the same dtype as ``q``.
        weights: Per-head weights ``[B * next_n, H]``.
        context_lens: Context length of each sequence, ``[B]``. Lengths
            above ``max_model_len`` are clamped to it.
        block_tables: Block ids of each sequence, ``[B, max_blocks]``. A
            negative id ends the sequence's block list; keys that were not
            gathered stay zero.
        max_model_len: Width of the returned logits.
        clean_logits: Kept for interface compatibility. Positions at or
            beyond the context length are never written, so they are
            always ``-inf`` regardless of this flag.

    Returns:
        Logits ``[B * next_n, max_model_len]`` in float32; positions
        outside a sequence's context are ``-inf``.
    """
    batch_size, next_n, _, head_dim = q.shape
    block_size = kv_cache.shape[1]

    logits = torch.full(
        (batch_size * next_n, max_model_len),
        -float("inf"),
        device=q.device,
        dtype=torch.float32,
    )

    for batch_idx in range(batch_size):
        # Clamp: the output has only max_model_len columns, so a longer
        # context cannot be represented (it used to raise a shape mismatch).
        context_len = min(int(context_lens[batch_idx].item()), max_model_len)
        if context_len <= 0:
            continue

        # Rebuild the contiguous key sequence [context_len, D] from its blocks.
        k_sequence = torch.zeros(
            context_len, head_dim, device=kv_cache.device, dtype=kv_cache.dtype
        )
        num_blocks_needed = (context_len + block_size - 1) // block_size
        filled = 0
        for block_id in block_tables[batch_idx, :num_blocks_needed].tolist():
            if block_id < 0:
                break
            take = min(block_size, context_len - filled)
            k_sequence[filled:filled + take] = kv_cache[block_id, :take, 0, :head_dim]
            filled += take

        rows = slice(batch_idx * next_n, (batch_idx + 1) * next_n)

        # [H, next_n, D] @ [D, context_len] -> [H, next_n, context_len]
        scores = torch.relu(torch.matmul(q[batch_idx].transpose(0, 1), k_sequence.T))
        # [next_n, H] -> [H, next_n, 1] so that each head is scaled separately.
        head_weights = weights[rows].transpose(0, 1).unsqueeze(2)
        # Weighted sum over heads -> [next_n, context_len].
        logits[rows, :context_len] = (scores * head_weights).sum(dim=0)

    return logits
无效索引设为-1或>=s_kv + - sm_scale: float + - d_v: 值向量的维度,只能为512 + + Returns: + - (output, max_logits, lse) + - output: [s_q, h_q, d_v], bfloat16 + - max_logits: [s_q, h_q], float + - lse: [s_q, h_q], float, 以2为底的对数求和指数 + """ + def ref_masked_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + sm_scale: float, + ) -> torch.Tensor: + query = query * sm_scale + dtype = query.dtype + device = query.device + attn = torch.einsum("qhd,khd->hqk", query, key) + attn = attn.to(torch.float) + attn = torch.softmax(attn, dim=-1) + value = value.to(torch.float) + out = torch.einsum("hqk,khd->qhd", attn, value) + out = out.to(device).to(dtype) + return out + s_q, h_q, d_qk = q.shape + s_kv, h_kv, _ = kv.shape + _, _, topk = indices.shape + + device = q.device + dtype = q.dtype + + # 分离K和V + k = kv # [s_kv, h_kv, d_qk] + v = kv[:, :, :d_v] # [s_kv, h_kv, d_v] + + # 初始化输出 + output = torch.zeros(s_q, h_q, d_v, device=device, dtype=dtype) + # 处理每个查询位置 + for i in range(s_q): + # 当前查询 [h_q, d_qk] + q_i = q[i].unsqueeze(0) # [1, h_q, d_qk] + # 获取当前查询位置的稀疏索引 [topk] + sparse_indices = indices[i, 0] # [topk] + # 过滤有效索引 (>=0 且 < s_kv) + valid_mask = (sparse_indices >= 0) & (sparse_indices < s_kv) + valid_indices = sparse_indices[valid_mask] + # 获取有效的K和V + valid_k = k[valid_indices].repeat(1, h_q, 1) # [valid_len, h_q, d_qk] + valid_v = v[valid_indices].repeat(1, h_q, 1) # [valid_len, h_q, d_v] + out = ref_masked_attention( + q_i, + valid_k, + valid_v, + sm_scale + ) + out = out.view(h_q, d_v) + output[i].copy_(out, non_blocking=True) + return output + +def get_device_attribute(attribute: int, device: int) -> int: + return torch.ops._C_cuda_utils.get_device_attribute(attribute, device) + + +def get_max_shared_memory_per_block_device_attribute(device: int) -> int: + # ruff: noqa: E501 + return torch.ops._C_cuda_utils.get_max_shared_memory_per_block_device_attribute( + device + ) + + +# custom ar +def init_custom_ar( + ipc_tensors: list[torch.Tensor], + rank_data: 
def qr_all_reduce(
    fa: int,
    inp: torch.Tensor,
    out: torch.Tensor,
    quant_level: int,
    cast_bf2half: bool = False,
) -> None:
    """Invoke the ``_C_custom_ar.qr_all_reduce`` custom op.

    All arguments are forwarded positionally and unchanged; nothing is
    returned.

    NOTE(review): ``fa`` is presumably the integer handle produced by
    ``init_custom_qr`` and ``out`` the destination buffer -- confirm against
    the native op's schema.
    """
    torch.ops._C_custom_ar.qr_all_reduce(fa, inp, out, quant_level, cast_bf2half)
def flash_mla_with_kvcache(
    q: torch.Tensor,
    k_cache: torch.Tensor,
    block_table: torch.Tensor,
    cache_seqlens: torch.Tensor,
    head_dim_v: int,
    tile_scheduler_metadata: torch.Tensor,
    num_splits: torch.Tensor,
    softmax_scale: float | None = None,
    causal: bool = False,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Run the ``_C.flash_mla_fwd_kvcache`` op against a paged KV cache.

    Arguments:
        q: (batch_size, seq_len_q, num_heads_q, head_dim).
        k_cache: (num_blocks, page_block_size, num_heads_k, head_dim).
        block_table: (batch_size, max_num_blocks_per_seq), torch.int32.
        cache_seqlens: (batch_size), torch.int32.
        head_dim_v: Head dimension of V.
        tile_scheduler_metadata: (num_sm_parts, TileSchedulerMetaDataSize), torch.int32,
            as returned by get_flash_mla_metadata.
        num_splits: (batch_size + 1), torch.int32, as returned by get_flash_mla_metadata.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Defaults to 1 / sqrt(head_dim) when None.
        causal: bool. Whether to apply causal attention mask.

    Return:
        out: (batch_size, seq_len_q, num_heads_q, head_dim_v).
        softmax_lse: (batch_size, num_heads_q, seq_len_q), torch.float32.
    """
    if softmax_scale is None:
        # head_dim is the last dimension of q.
        softmax_scale = q.shape[-1] ** (-0.5)
    out, softmax_lse = torch.ops._C.flash_mla_fwd_kvcache(
        q,
        k_cache,
        # Third positional argument is always None here.
        # NOTE(review): presumably an optional separate V cache -- confirm
        # against the op schema.
        None,
        head_dim_v,
        cache_seqlens,
        block_table,
        softmax_scale,
        causal,
        tile_scheduler_metadata,
        num_splits,
    )
    return out, softmax_lse
class CPUDNNLGEMMHandler:
    """Owner of a native matmul handle plus the weight dimensions it was built for.

    A fresh instance holds no handle (``handler is None``) and uses ``-1`` as
    a placeholder for both ``n`` and ``k``. The factory functions in this
    module fill those fields in.
    """

    def __init__(self) -> None:
        self.k = -1
        self.n = -1
        self.handler: int | None = None

    def __del__(self):
        # Nothing to release when no native handle was ever attached.
        if self.handler is None:
            return
        torch.ops._C.release_dnnl_matmul_handler(self.handler)
input tensor to int8 and return the quantized tensor and scale, and maybe azp. + + Args: + input: The input tensor to be quantized to int8. + scale: Optional scaling factor for the int8 quantization. + When not provided, we invoke dynamic-per-token quantization. + azp: Optional zero-point for the int8 quantization. + Must be provided for asymmetric quantization if `scale` is provided. + symmetric: Whether to use symmetric quantization (scale only, azp ignored). + + Returns: + tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] : Output int8 tensor, scales, and optionally azp. + """ + output = torch.empty_like(input, dtype=torch.int8) + token_num = input.numel() // input.shape[-1] + input = input.view((token_num, input.shape[-1])) + if scale is not None: + # static-per-tensor quantization. + assert symmetric == (azp is None), ( + "azp must only be provided for asymmetric quantization." + ) + torch.ops._C.static_scaled_int8_quant(output, input, scale, azp) + return output, scale, azp + + # dynamic-per-token quantization. 
def cpu_attn_get_scheduler_metadata(
    num_reqs: int,
    num_heads: int,
    num_kv_heads: int,
    head_dim: int,
    seq_lens: torch.Tensor,
    dtype: torch.dtype,
    query_start_loc: torch.Tensor,
    causal: bool,
    sliding_window_size: int,
    isa: str,
    enable_kv_split: bool,
) -> torch.Tensor:
    """Return the result of the ``_C.get_scheduler_metadata`` op.

    Every argument is passed through positionally, in declaration order,
    without modification.
    """
    native_op = torch.ops._C.get_scheduler_metadata
    return native_op(
        num_reqs,
        num_heads,
        num_kv_heads,
        head_dim,
        seq_lens,
        dtype,
        query_start_loc,
        causal,
        sliding_window_size,
        isa,
        enable_kv_split,
    )
def fusedQuantizeMx(
    a: torch.Tensor, b: torch.Tensor, *, method: Literal["quest", "abs_max"] = "quest"
) -> tuple[torch.Tensor, torch.Tensor]:
    """Quantize ``a`` with one of the ``_qutlass_C`` fused MX quantization ops.

    Args:
        a: Tensor to quantize; needs at least one dimension and a last
            dimension divisible by 32.
        b: Second operand handed to the native op; must be on ``a``'s device.
        method: ``"quest"`` selects ``fusedQuantizeMxQuest``, ``"abs_max"``
            selects ``fusedQuantizeMxAbsMax``.

    Returns:
        Whatever the selected op returns for ``(a, b, xh_e2m1, xh_e8m0)``.
        ``xh_e2m1`` is a uint8 buffer shaped like ``a`` with the last
        dimension halved. ``xh_e8m0`` is a ``float8_e8m0fnu`` buffer of
        shape (rows padded to a multiple of 128, ``last_dim / 32`` padded to
        a multiple of 4).

    Raises:
        ValueError: on a 0-dim ``a``, a last dimension not divisible by 32,
            mismatched devices, or an unknown ``method``.
        RuntimeError: if the requested op is not registered, i.e. the
            ``_qutlass_C`` extension has not been loaded.
    """
    if a.dim() == 0:
        raise ValueError("`a` must have at least 1 dimension.")
    if a.size(-1) % 32 != 0:
        raise ValueError(f"last dim of `a` must be divisible by 32, got {a.size(-1)}.")
    if b.device != a.device:
        raise ValueError("`a` and `b` must be on the same device.")

    # Reject a bad method before allocating any output buffers.
    if method == "quest":
        op_name = "fusedQuantizeMxQuest"
    elif method == "abs_max":
        op_name = "fusedQuantizeMxAbsMax"
    else:
        raise ValueError(f"invalid method {method!r}, must be 'quest' or 'abs_max'")

    # `torch.ops.<namespace>` is created lazily for any name, so testing
    # `hasattr(torch.ops, "_qutlass_C")` always succeeds. Looking the op up
    # on the namespace is what actually detects a missing extension.
    quantize_op = getattr(torch.ops._qutlass_C, op_name, None)
    if quantize_op is None:
        raise RuntimeError(
            "The `_qutlass_C` extension is not loaded. "
            "Make sure your custom op library is imported before calling fusedQuantizeMx."
        )

    xh_e2m1 = torch.empty(
        *a.shape[:-1], a.size(-1) // 2, dtype=torch.uint8, device=a.device
    )

    # One scale per group of 32 elements; the scale matrix is padded to
    # whole 128 x 4 tiles.
    rows, cols = a.numel() // a.size(-1), a.size(-1) // 32
    n_row_blocks = ceil_div(rows, 128)
    n_col_blocks = ceil_div(cols, 4)
    padded_rows = n_row_blocks * 128
    padded_cols = n_col_blocks * 4

    xh_e8m0 = torch.empty(
        padded_rows, padded_cols, dtype=torch.float8_e8m0fnu, device=a.device
    )

    return quantize_op(a, b, xh_e2m1, xh_e8m0)
def gather_cache(
    src_cache: torch.Tensor,  # [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
    dst: torch.Tensor,  # [TOT_TOKENS, ENTRIES...]
    block_table: torch.Tensor,  # [BATCH, BLOCK_INDICES]
    cu_seq_lens: torch.Tensor,  # [BATCH+1]
    batch_size: int,
    seq_starts: torch.Tensor | None = None
) -> None:
    """Forward to ``ops.vllm_gather_cache``; returns nothing.

    All six arguments are passed through positionally and unchanged.

    NOTE(review): presumably the op copies the cache entries addressed by
    ``block_table`` into ``dst`` in place, and ``seq_starts`` holds optional
    per-sequence start offsets -- confirm against the kernel.
    """
    ops.vllm_gather_cache(src_cache, dst, block_table, cu_seq_lens, batch_size, seq_starts)
def quant_kv(kv):
    """Symmetric int8 quantization with one scale per last-dimension vector.

    The scale of each vector is ``max(|v|) / 127``.

    Args:
        kv: Floating-point tensor; the last dimension is quantized as a group.

    Returns:
        ``(quantized, scales)``. ``quantized`` is an int8 tensor shaped like
        ``kv``; ``scales`` is float32 with shape ``kv.shape[:-1]``. An
        all-zero vector yields zeros with a reported scale of 0.
    """
    amax_, _ = torch.max(torch.abs(kv), dim=-1, keepdim=True)
    f_scale = amax_.float() / 127.0
    scales = f_scale.view(kv.shape[:-1])

    # An all-zero vector has scale 0. Dividing by it gives 0/0 = NaN, and
    # casting NaN to int8 is undefined. Divide by 1 there instead: the
    # quantized values are 0 anyway, and the reported scale stays 0.
    divisor = torch.where(f_scale == 0, torch.ones_like(f_scale), f_scale)

    quantized = torch.clamp(torch.round(kv / divisor), -127, 127).to(torch.int8)
    return quantized, scales
def linear_w8a16(x: torch.Tensor, qweight: torch.Tensor, scales:torch.Tensor,
                 group_size: int = -1, format: str = "TN")-> torch.Tensor:
    """Return ``ops.w8a16(x, qweight, scales, ...)`` for the given inputs.

    ``group_size`` is forwarded as given.

    NOTE(review): the ``format`` argument is accepted but never forwarded --
    the call below always passes the literal "TN". Confirm whether other
    layouts are meant to be supported before relying on this parameter.
    """
    return ops.w8a16(x, qweight, scales, format="TN", group_size=group_size)
def sbgmv_shrink(x: torch.Tensor,
                 w_t_all: torch.Tensor,
                 y: torch.Tensor,
                 b_seq_start_loc: torch.Tensor = None,
                 seq_len_tensor: torch.Tensor = None,
                 lora_indices_tensor: torch.Tensor = None,
                 batches: int = -1,
                 max_seq_length: int = -1,
                 token_nums: int = -1,
                 scale: float = 1.0,):
    """LoRA "shrink" projection written into ``y``: ``y = (x @ w.T) * scale``.

    Args:
        x: Contiguous fp16/bf16 input, one row per token.
        w_t_all: Stacked LoRA weights with the same dtype as ``x``, either
            3-D ``(lora_num, size, rank)`` or 4-D ``(lora_num, 1, size, rank)``.
        y: Contiguous output buffer, updated in place.
        b_seq_start_loc: Start row of each sequence (prefill path only).
        seq_len_tensor: Row count of each sequence (prefill path only).
        lora_indices_tensor: LoRA id per sequence (prefill) or per token
            (decode); ``-1`` means "no LoRA" and leaves those rows untouched.
        batches: Number of sequences for the prefill path; ``-1`` selects the
            per-token decode path.
        max_seq_length: Unused.
        token_nums: Unused.
        scale: Multiplier applied to the projection.

    Returns:
        ``y``, the buffer that was passed in.
    """
    assert x.dtype == w_t_all.dtype
    assert x.dtype in [torch.float16, torch.bfloat16]
    assert x.is_contiguous()
    assert y.is_contiguous()

    if w_t_all.ndim == 4:  # shape:(lora_num,1,size,rank)
        assert w_t_all.size(1) == 1
        w_t_all = w_t_all.squeeze(dim=1)
    else:
        assert w_t_all.ndim == 3  # shape:(lora_num,size,rank)
    assert w_t_all.is_contiguous()

    lora_num = w_t_all.shape[0]
    lora_indices = lora_indices_tensor.cpu().tolist()
    if not lora_indices:
        # No requests at all: nothing to project.
        return y

    first_id = lora_indices[0]
    # Single LoRA model, and every request maps to the same id.
    if lora_num == 1 and all(idx == first_id for idx in lora_indices):
        if first_id != -1:
            # Write into the caller's buffer. The previous `y = ...` rebound
            # the local name, so callers that ignore the return value (as
            # the other branches allow) saw no update.
            y[: x.shape[0]] = torch.matmul(x, w_t_all[0].t()) * scale
    # Prefill with several LoRA models: one contiguous row slice per sequence.
    elif batches != -1:
        for _, lora_id, start, seq_len in zip(
            range(batches), lora_indices, b_seq_start_loc, seq_len_tensor
        ):
            if lora_id != -1:
                xi = x[start: start + seq_len]
                y[start:start + seq_len] = (xi @ w_t_all[lora_id].t()) * scale
    # Decode with several LoRA models: one row per request.
    else:
        for i, lora_id in zip(range(x.shape[0]), lora_indices):
            if lora_id != -1:
                xi = x[i].unsqueeze(0)
                y[i] = (xi @ w_t_all[lora_id].t()).squeeze(0) * scale

    return y
draft_token_ids: torch.Tensor, # [num_tokens] + target_argmax: torch.Tensor, # [num_tokens] + bonus_token_ids: torch.Tensor, # [batch_size] + is_greedy: torch.Tensor = None, # [batch_size] 或 None +): + """ + 完全等价于 rejection_greedy_sample_kernel 的 PyTorch 实现 + 接口参数与 Triton 核完全一致 + """ + batch_size = output_token_ids.size(0) + device = output_token_ids.device + + # 处理 is_greedy 为 None 的情况(保持与 Triton 核相同行为) + if is_greedy is None: + is_greedy_mask = torch.ones(batch_size, dtype=torch.bool, device=device) + else: + is_greedy_mask = is_greedy.to(device) + + for req_idx in range(batch_size): + if not is_greedy_mask[req_idx]: + continue # 非贪婪请求直接跳过 + + # 计算当前请求的token范围(前缀和转实际数量) + start_idx = 0 if req_idx == 0 else cu_num_draft_tokens[req_idx - 1] + end_idx = cu_num_draft_tokens[req_idx] + num_draft_tokens = end_idx - start_idx + + rejected = False + for pos in range(num_draft_tokens): + if not rejected: + global_pos = start_idx + pos + draft_token = draft_token_ids[global_pos] + target_token = target_argmax[global_pos] + + # 存储目标token(与Triton核完全一致的行为) + output_token_ids[req_idx, pos] = target_token + + # 检查是否拒绝 + if draft_token != target_token: + rejected = True + + # 全部接受时追加bonus token + if not rejected and num_draft_tokens < output_token_ids.size(1): + output_token_ids[req_idx, num_draft_tokens] = bonus_token_ids[req_idx] + + return output_token_ids # 原位修改 + +def rejection_random_sample_torch( + output_token_ids: torch.Tensor, # [batch_size, max_spec_len + 1] + cu_num_draft_tokens: torch.Tensor, # [batch_size] (前缀和形式) + draft_token_ids: torch.Tensor, # [num_tokens] + draft_probs: torch.Tensor | None, # [num_tokens, vocab_size] 或 None + target_probs: torch.Tensor, # [num_tokens, vocab_size] + bonus_token_ids: torch.Tensor, # [batch_size] + recovered_token_ids: torch.Tensor, # [num_tokens] + uniform_probs: torch.Tensor, # [num_tokens] (0~1均匀分布) + is_greedy: torch.Tensor | None, # [batch_size] 或 None + NO_DRAFT_PROBS: bool = False, # 是否忽略draft_probs +): + batch_size = 
output_token_ids.size(0) + max_spec_len_plus_1 = output_token_ids.size(1) + device = output_token_ids.device + + # 处理 is_greedy 为 None 的情况 + if is_greedy is None: + is_greedy = torch.zeros(batch_size, dtype=torch.bool, device=device) + else: + is_greedy = is_greedy.to(device) + + for req_idx in range(batch_size): + if is_greedy[req_idx]: + continue # 贪婪采样请求直接跳过 + + # 计算当前请求的token范围 + start_idx = 0 if req_idx == 0 else cu_num_draft_tokens[req_idx - 1] + end_idx = cu_num_draft_tokens[req_idx] + num_draft_tokens = end_idx - start_idx + + rejected = False + for pos in range(num_draft_tokens): + if not rejected: + global_pos = start_idx + pos + draft_token_id = draft_token_ids[global_pos] + + # 获取draft概率 (处理NO_DRAFT_PROBS情况) + if NO_DRAFT_PROBS: + draft_prob = 1.0 + else: + assert draft_probs is not None, "draft_probs不能为None当NO_DRAFT_PROBS=False" + draft_prob = draft_probs[global_pos, draft_token_id] + + # 获取target概率和均匀随机数 + target_prob = target_probs[global_pos, draft_token_id] + uniform_prob = uniform_probs[global_pos] + + # 拒绝采样逻辑 + if draft_prob > 0 and (target_prob / draft_prob) >= uniform_prob: + # 接受draft token + output_token_ids[req_idx, pos] = draft_token_id + else: + # 拒绝并使用恢复的token + rejected = True + output_token_ids[req_idx, pos] = recovered_token_ids[global_pos] + + # 如果全部接受则追加bonus token + if not rejected and num_draft_tokens < max_spec_len_plus_1: + output_token_ids[req_idx, num_draft_tokens] = bonus_token_ids[req_idx] + + return output_token_ids + +weak_ref_tensor = ops.weak_ref_tensor diff --git a/_ipex_ops.py b/_ipex_ops.py new file mode 100644 index 0000000..95c17cb --- /dev/null +++ b/_ipex_ops.py @@ -0,0 +1,457 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import torch + +from vllm.logger import init_logger +from vllm.platforms import current_platform + +logger = init_logger(__name__) + +try: + import intel_extension_for_pytorch as ipex +except ImportError as e: + 
logger.debug("Import error msg: %s", e.msg) + + +class ipex_ops: + @staticmethod + def _reshape_activation_tensor( + x: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + num = x.size(0) + d = x.size(1) // 2 + x = x.reshape(num, 2, d) + x1, x2 = torch.chunk(x, chunks=2, dim=1) + x1 = x1.reshape(num, d) + x2 = x2.reshape(num, d) + return x1, x2 + + @staticmethod + def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ipex.llm.functional.silu_and_mul(x, out) + + @staticmethod + def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ipex.llm.functional.gelu_and_mul(x, out) + + @staticmethod + def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ipex.llm.functional.gelu_and_mul(x, out) + + @staticmethod + def gelu_fast(x: torch.Tensor) -> torch.Tensor: + return torch.nn.functional.gelu(x) + + @staticmethod + def gelu_new(x: torch.Tensor) -> torch.Tensor: + return torch.nn.functional.gelu(x) + + @staticmethod + def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ipex.llm.functional.gelu_quick(x, out) + + @staticmethod + def paged_attention_v1( + out: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + num_kv_heads: int, + scale: float, + block_tables: torch.Tensor, + context_lens: torch.Tensor, + block_size: int, + max_context_len: int, + alibi_slopes: torch.Tensor | None, + kv_cache_dtype: str, + k_scale: float, + v_scale: float, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, + ) -> None: + assert kv_cache_dtype == "auto" + num_heads = out.size(1) + num_queries_per_tokens = num_heads // num_kv_heads + ipex.llm.modules.PagedAttention.single_query_kv_attention( + out, + query.contiguous(), + key_cache.view_as(value_cache), + value_cache, + num_queries_per_tokens, + scale, + block_tables, + context_lens, + block_size, + max_context_len, + alibi_slopes, + ) 
    @staticmethod
    def paged_attention_v2(
        out: torch.Tensor,
        exp_sum: torch.Tensor,
        max_logits: torch.Tensor,
        tmp_out: torch.Tensor,
        query: torch.Tensor,
        key_cache: torch.Tensor,
        value_cache: torch.Tensor,
        num_kv_heads: int,
        scale: float,
        block_tables: torch.Tensor,
        context_lens: torch.Tensor,
        block_size: int,
        max_context_len: int,
        alibi_slopes: torch.Tensor | None,
        kv_cache_dtype: str,
        k_scale: float,
        v_scale: float,
        tp_rank: int = 0,
        blocksparse_local_blocks: int = 0,
        blocksparse_vert_stride: int = 0,
        blocksparse_block_size: int = 64,
        blocksparse_head_sliding_step: int = 0,
    ) -> None:
        """Paged attention through IPEX's ``single_query_kv_attention``.

        Only ``kv_cache_dtype == "auto"`` is accepted. The body does not read
        ``exp_sum``, ``max_logits``, ``tmp_out``, ``k_scale``, ``v_scale``,
        ``tp_rank`` or any of the ``blocksparse_*`` arguments.

        NOTE(review): the result is presumably written into ``out`` by the
        IPEX kernel -- confirm against the IPEX API.
        """
        assert kv_cache_dtype == "auto"
        # Head count is read from the output tensor's second dimension.
        num_heads = out.size(1)
        # Number of query heads that share one KV head.
        num_queries_per_tokens = num_heads // num_kv_heads
        ipex.llm.modules.PagedAttention.single_query_kv_attention(
            out,
            query.contiguous(),
            # The key cache is handed over viewed with the value cache's shape.
            key_cache.view_as(value_cache),
            value_cache,
            num_queries_per_tokens,
            scale,
            block_tables,
            context_lens,
            block_size,
            max_context_len,
            alibi_slopes,
        )
    @staticmethod
    def reshape_and_cache(
        key: torch.Tensor,
        value: torch.Tensor,
        key_cache: torch.Tensor,
        value_cache: torch.Tensor,
        slot_mapping: torch.Tensor,
        kv_cache_dtype: str,
        k_scale: float,
        v_scale: float,
    ) -> None:
        """Forward to IPEX ``PagedAttention.reshape_and_cache``.

        Only ``kv_cache_dtype == "auto"`` is accepted. ``k_scale`` and
        ``v_scale`` are part of the signature but are not passed on.

        NOTE(review): presumably stores ``key``/``value`` into the caches at
        the positions given by ``slot_mapping`` -- confirm against IPEX.
        """
        assert kv_cache_dtype == "auto"
        ipex.llm.modules.PagedAttention.reshape_and_cache(
            key, value, key_cache, value_cache, slot_mapping
        )
float = 1.0, + v_scale_float: float = 1.0, + ) -> None: + ipex.llm.modules.PagedAttention.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + slot_mapping, + kv_cache_dtype, + k_scale_float, + v_scale_float, + ) + + @staticmethod + def flash_attn_varlen_func( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + cu_seqlens_q: torch.Tensor, + max_seqlen_q: int, + max_seqlen_k: int, + softmax_scale: float | None = None, + causal: bool = False, + out: torch.Tensor | None = None, + block_table: torch.Tensor | None = None, + alibi_slopes: torch.Tensor | None = None, + window_size: list[int] | None = None, + softcap: float | None = 0.0, + seqused_k: torch.Tensor | None = None, + cu_seqlens_k: torch.Tensor | None = None, + # passed in qwen vl + dropout_p: float = 0.0, + # The following parameters are not used in ipex kernel currently, + # we keep API compatible to CUDA's. + scheduler_metadata=None, + fa_version: int = 2, + q_descale=None, + k_descale=None, + v_descale=None, + num_splits=0, + s_aux: torch.Tensor | None = None, + ): + if out is None: + out = torch.empty(q.shape, dtype=q.dtype, device=q.device) + real_window_size: tuple[int, int] + if window_size is None: + real_window_size = (-1, -1) + else: + assert len(window_size) == 2 + real_window_size = (window_size[0], window_size[1]) + + if block_table is None: + assert cu_seqlens_k is not None, ( + "cu_seqlens_k can't be None when calling varlen_attention." 
+ ) + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + ipex_ops.varlen_attention( + q.contiguous(), + k.contiguous(), + v.contiguous(), + out, + cu_seqlens_q, + cu_seqlens_k, + None, + max_seqlen_q, + max_seqlen_k, + 0.0, + softmax_scale, + False, + causal, + False, + None, + real_window_size[0], + real_window_size[1], + -1, + ) + return out + else: + return ipex.llm.modules.PagedAttention.flash_attn_varlen_func( + out, + q.contiguous(), + k, + v, + cu_seqlens_q, + seqused_k, + max_seqlen_q, + max_seqlen_k, + softmax_scale, + causal, + block_table, + alibi_slopes, + sink=s_aux, + softcap=softcap, + window_size_left=real_window_size[0], + window_size_right=real_window_size[1], + k_scale=1.0, + v_scale=1.0, + ) + + @staticmethod + def get_scheduler_metadata( + batch_size, + max_seqlen_q, + max_seqlen_k, + num_heads_q, + num_heads_kv, + headdim, + cache_seqlens: torch.Tensor, + qkv_dtype=torch.bfloat16, + headdim_v=None, + cu_seqlens_q: torch.Tensor | None = None, + cu_seqlens_k_new: torch.Tensor | None = None, + cache_leftpad: torch.Tensor | None = None, + page_size: int | None = None, + max_seqlen_k_new=0, + causal=False, + window_size=(-1, -1), # -1 means infinite context window + has_softcap=False, + num_splits=0, # Can be tuned for speed + pack_gqa=None, # Can be tuned for speed + sm_margin=0, # Can be tuned if some SMs are used for communication + ) -> None: + logger.warning_once( + "get_scheduler_metadata is not implemented for ipex_ops, returning None." 
+ ) + return None + + @staticmethod + def copy_blocks( + key_caches: list[torch.Tensor], + value_caches: list[torch.Tensor], + block_mapping: torch.Tensor, + ) -> None: + torch.xpu.copy_blocks( # type: ignore + key_caches, + value_caches, + block_mapping, + ) + + @staticmethod + def swap_blocks( + src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor + ) -> None: + torch.xpu.swap_blocks(src, dst, block_mapping) # type: ignore + + @staticmethod + def scaled_fp8_quant( + input: torch.Tensor, + scale: torch.Tensor | None = None, + num_token_padding: int | None = None, + scale_ub: torch.Tensor | None = None, + use_per_token_if_dynamic: bool = False, + output: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Quantize input tensor to FP8 and return quantized tensor and scale. + + This function is designed for both static and dynamic quantization: + If you provide the scale, it will use static scaling and if you omit + it, the scale will be determined dynamically. Currently, XPU platform + only supports dynamic quantization. The function also allows optional + padding of the output tensors for downstream kernels that will benefit + from padding. + + Args: + input: The input tensor to be quantized to FP8 + scale: Optional scaling factor for the FP8 quantization + scale_ub: Optional upper bound for scaling factor in dynamic + per token case + num_token_padding: If specified, pad the first dimension + of the output to at least this value. + use_per_token_if_dynamic: Whether to do per_tensor or per_token + in the dynamic quantization case. + + Returns: + tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and + scaling factor. 
+ """ + # This code assumes batch_dim and num_tokens are flattened + assert input.ndim == 2 + shape: tuple[int, int] | torch.Size = input.shape + out_dtype: torch.dtype = current_platform.fp8_dtype() + if num_token_padding: + shape = (max(num_token_padding, input.shape[0]), shape[1]) + if output is None: + output = torch.empty(shape, device=input.device, dtype=out_dtype) + else: + assert num_token_padding is None, ( + "padding not supported if output passed in" + ) + assert output.dtype == out_dtype + assert scale is None, "only dynamic fp8 quantization supported on XPU" + assert not use_per_token_if_dynamic, ( + "per token dynamic fp8 quantization not supported on XPU" + ) + scale = torch.zeros(1, device=input.device, dtype=torch.float32) + torch.ops.torch_ipex.dynamic_scaled_fp8_quant(output, input, scale) + + return output, scale diff --git a/assets/__init__.py b/assets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/assets/__pycache__/__init__.cpython-312.pyc b/assets/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5570056bb103f22c88983aace9884fc66ec3976b GIT binary patch literal 156 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJVS?ibN7U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?&M4u=4F<|$LkeT-r}&y d%}*)KNwq6t1)9YO#Kj=SM`lJw#v*1Q3jjE|B?kZi literal 0 HcmV?d00001 diff --git a/assets/__pycache__/audio.cpython-312.pyc b/assets/__pycache__/audio.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c93871dc52fbd1e0e969214349c685edbf8e73a8 GIT binary patch literal 2208 zcmbVNU1$_n6ux(Uc4zCB^b1QYWmdr(wF+MVqQ8`2((gsTMebfr=B~ro2)-iy)fsVd+s^+ zo}cgB`8AVCA{a#Y!u(c1=nuAN46)8^{sPPr(vgk{vao_N<2k}vyuyb(PXtR;#E=(A z%!(`VkQa$$2}&a5V~X@)e!a6PPk8LHiREvky_s+aOdhE1%W@0SpoHl&*0wI>_mM4ihrNSMC?*(yUPEyCyf)=Yx zeA9AtjYNBJQ2L^2+onCOPHBV$8B3!Tbw<-wqFIxt!G&0fI&+3SRgXBFw(7G8%OzwW z1?x!R&LD#~1RWm*DP)r~VkJ)J7lo~C#Y3rNBxZLs9~bdPs>HGUgN@K+lZJCLAQKk_ 
zCA)QMtVttRiOW)u94hOkGsGee1Y5I=AVv+pOzo+96~kLt{do8`^FWr+Jer01`SA_l z?HX5vG><8PhgUT7i0b1}S;$#Qf92f^WG6*L3@RjY8`CFO?4;8T{z5QGk3G z+=@Tr*acROLR5!>#ylKwD}hupiNSoFYv1;Ipdow}PWRe@ETE0H&L#P4`O5CA!`0lL zseRHT-1ZWPQn4l@9ED%T2WsY{SMKa8W>H_{f z=4U=nyP3Qrl54iEdYa`DL#3K;%;lRB^-_&JVC?M!vVi{V==<*U59hx>f5%+!`0%sj z{aoi)^PkUO9a(;JJ$JAw9i+QJ@8ww>gq{DF#Sky_6|k52?Ih415YSE_5o)3!v#e8z z11%ROU~XqqbkA*Mw&Z9^eA)05w^l%z!2s31efO@j`$+JYsX%rl?>ogQCaE1Qp74u6N0+ z=JL5Uyw>}Z{G+_K`{&_m--&yXJ(x*PwnS;60y4&=mbLKr7&F*AL?3!$Xv6l|U>3Kz+KG z1;pBf_A+t+$O;Nw3X4ZunUoRM0p0~5pQ4Rq_dTI|L(J4TEbhI!?+JowlSi>^D5%9k zjuYRwUS2NVI=6CeZGW}@ox2C#eS$z(R|KvmVX=s>3!7kH#C0<)j^ZmtcH}4yRUgA8 zqd0PjIU?@}Qd@r&!=nrWL7cS~d-+)?LCmkX@XAy6bO&ju2J8&D8ug5@o@Iy6E@ALvFIJH$BF zcu@;TN27YaP1)~)y;>dxQsXhke@h7OeSo?jpj{78{vq1+5WV#mI$9HW-1ocCR+CVs zXCblC+Fji_c(?V)LiTYCaj7f4>$vM5t`(2s8e$Z3#dSXp1^?FCk<<<;#C^YId#)$$ eWncf~IJ>LwsmC&SQy6MNw_7qpdH!|}2Kpa#UInuN literal 0 HcmV?d00001 diff --git a/assets/__pycache__/base.cpython-312.pyc b/assets/__pycache__/base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..39e3a8e93b5e2dabab3d743b4436f4ca100d4581 GIT binary patch literal 1612 zcmZuxO=ufO6rTN&w9?A4?AWqhg4vWbsYoQ@Dy^Mh3a+EZaqOf-at}Imv(~OG$@?Q_ zN3ktrqClX9UQ$}fp%ip%a1P0>w_XdqNYr4lIAHp7=*_L0OHX|>k`m+e!Mu4h`^}qq z?|pB79vBEA5L5iJ^rMK-Z*+xE#k(n>Tq&$O+6AOwsb?e9RV2 zzZJ*@!1m=N-ERm5UJpDJb3udCWnD5tdJu+C3n0Y4`t{I5-%jtUoV+E(!hh0LjPrsf zuBv%0Uo>DLexi}$Eij^Mrn9PQ*` z8pKVwsYKO{Zx}9#zn9P~t>)O;hMUMcmgmO?sbV|ddjXpu6G(t@^ckqfh;VEd6yf_; znPN`JP7k+@>ZlCyofMtZxu^V2&j$yn42lW0nd=5oNYPL#5TQaGX0x!Bf`b*8IJji5 zDY~;^n~tU%y3&g(`-Jxb6%89UaXrb{EV)E=)?*^3Z}UXMzUO*FpDryet8OPHpVUg*+CEB{U>iB(BfN|iX#@R~v5o7L$nGwahv(k!heD_f-K*etWj z?&Zl<%{3C0EpGt8-6E;r6-U6^01Wp6s0R979{fh!i=1yq&i^7`Xk6=v(&vj079WwO z-dNlfFCNH)jhjaRMV?eBnTD-dh8J{GY6TmG(q{fuz5(Hb!K(5r7{{y9XOGU+WQ4Z) zI$x%^6Qc7v|5P}ZeZGzuUZ@LY>idNBy}DQG+ykl3>0%8Ka0c`?BCqOO?)S;D^MrL3 z-S`&iVjc8%a0?0dA>~tj>-GH9x6^wyFhc-l_-jr`0lC&IyrS9N$}2QeN&!2Ta@S4$ 
zFXQjBlbWq7U3$O4uVZYY-VzM&4b$FHyS<}!3!TL@K95TRVFILN8BUeNBvxlk3+?$C z@mx$(=neG!s?@7D%-EMnsUNM(-%MxKl{-rqN`$;|_Tk3z-1W5jaXLG{pk{9@r*E%h zfxW{3z^5r{kUfF&3VPhZTVANU!FWqaPSb-2DKtuH_~z^JBzzo&;ob+;K>vi%*txHr z*1NOq=xpP5M?AeJPPD~|=DBAhEpcL3{NP|nY0k8VE;bfB^5~v?rY)b@l_xv$NJ~~a zV@fmi)&1|s+hbE-EbL2@d(x%0bZLL^Z0pq9&9lvg=G9g>(FsTQN5=O?rrIM@F9TvQ zbQnhAXiFT$L-2+ecPke5f`zILAvvatY0+sLlOl9lw{?RwkOb}DZnh$9{rX?mO*VcD zPO?IHf=e(9^$B8^baNujT8>^djSulVfTVry-T`$ea2)qiLY(pfjsJnhU!c^%=u!&} uANqY<Dm6a#} literal 0 HcmV?d00001 diff --git a/assets/__pycache__/image.cpython-312.pyc b/assets/__pycache__/image.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c4948bb29ac608a3fd0437967fe48ed6cb6a741b GIT binary patch literal 2443 zcmai0&2JM&6rcTa)*Hu3Ac2q)YG|Ms;UrE72@*o$ql%)22qh^8OLVpNj_pnMt23Jq zyP=AzR01ItI3);Gq^AmnLl6A}dgl@;9MB4+NT>&H4xy^lQ{U{`j#H`XNS=A`&6|(+ zd%t<>mn|&{f@X>jjF%EZ@0e0+pa*U5dl=S`j&#mM7MI~TCi5n52^k@f1yi)7j1Ct1bVmEhRA-f6ufe=SOwT({iRpDJbGC%`3v#G+}s{Xyzoy(zvA+@O!o`Xh#azQx{Cr zQr-EiY2;K`h&}2bUNETP*sA5|*i;R+geF`3DD@myrAEQ_G##iHgp z<@Dh2#Q}FIHI{c%iw4a(HZ|;&rRfHB?37o;DO>aA2~K4ljp&P-Ip_C|5v@dXnu*oX ztl?4B1<~`cN;S&`V{mx+XnJszZG0QswyGOifoPUW3RyooHFoW)nsY3-+&nlmlDaT3 zurxFD(AaCcY^a*xkG#22ucMZBV?l+BWzgzN(j@{4L-XGGuKfH=t@@4i8*geaK_tERC8GG3or=eRxA7#{b=5^Qc zXVE9oN8fIo+_=6myw#cf?dy*M-=*uyxPjAVmNe}qs8Y8OH zY6fQhD_(s+UhaS5<#b*bUWl7jRDftwd8$sJ;O+@*WwEXsc7dKxIJQ|z%u}oftn)DS zz@BjDiR(~I6N?7$(KNLMEs^dUZeSwsdcKfz=VyRjF-vn*)5&Qb)ImSCh>b!KiWA$q z3b-$M4#^dL(R4ID$%l}iWT1s~3{j4S@CVnU<)aNiH%_G4vXEgF1s&-Q(7v$UdA0%V zUx$D9UbRka$rB;sjSsYr`0W7k0}r{*C77zVeXiA*beCV|XW1_QYPDK3jR>Oxgw6;V z5YAT!)^s&n0)P)Nj&lzXjW8mWI~yY3Uv~+nD8p<8y@|Cw>R9V}+_O%%+dtcm_1@!N zx3)c8zQ6q7yDhmVh?#xBgju(cZ1ZJRW#vLueMM!Tsd@NRsD4aUznRy}>P&q9%t=-U zTUAxFZO038O2J*A9%%?jb_qNm;+2B8L{70t&qC)%X0S~iLe4Qo3L*}odX9o?1(_!8 z%$;=!_)!PCAJC4{wJmm4#E7J(q0)vT@m-#m&TQac=6;_0fI!?6z*u3% ziX2Fibdy`h&&((02gKy60~0H8UOKU!gNh@a*uWpz5Z;B|AQ|_Y>wg3=C5!@o6RU4f z2!nwAh*xsqRS=f*Fi7Fn_(ZjcWHpMz+ROfpkU`c3;f*qhz4wsZ^5qcuE945;SP7tC 
zLRS$uj{8$a+^M&y>n-YjhhlG0=O5_oPWzda_)dFQg^x*XyF&aBzY^a)h4|)2y|1{V UfAQVii3)-a;^ez_nHn7S4{caTYybcN literal 0 HcmV?d00001 diff --git a/assets/__pycache__/video.cpython-312.pyc b/assets/__pycache__/video.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c6271a763db3e1392b73fa7bf919ce28bf9e3134 GIT binary patch literal 6565 zcmdT|Yit|WmA>=fYxoi+>SLt>c*iEmZAx&qt0rSp?LdHh2;@6A$!}L$6vq@>sc(e(F2yA` zk^gvvMG7N_-*0{oc8Mewy@ceJw~^fX9&2%`^Nr5)JxFQpMHu!2+J$7nZWRx|j;`h| zpo`q==mHaK|CBV1F{VeBbSa@qnnr$9oD&mLVn&H^CO4X%H{D~T>WqX<*RiCoU`d?- zx!}x{IFmak%9#skH6zKuavV)b(+Uuw6RMO@W-_X*;NuxNrz)S4i9?NlTG6fEML7u_ zs>yRWqe-TFDyb@IDW#ZBtmrwMPE=M(SLcMBgKy(sAiII|YMs4mnOX26w8$9DERoxi zWL9S1=j@$gkii&;%Cow|7ByIVmlipLopss#T4|4^1$A3%L3a!!I}GN1=X<;*39Y$P zDCS!DmJo$BUfG?fl$N8Jtda&Ink3s4ox+(^bS5`FolH+p!FEx3tYkAUM6s09W~6k# zu4p>c_RmZWo=?h3Mxz`mxX?&B{mMltl~t8~a)hU2tm(>1SV`-e=~gZ#HC@ciEp&Z( zcczQ9)f7F(n=UfGn3QneOb^uva%M3H_K`|QYL1*Br|H;q(y=t`hKd+( zH4f7Gkrqlq&)|7euoziMpD_b9T7(WXvUy~?zSuvgr?P|DWVT)}1?&D+;oBI_+reeQ8|2$ut8AM$lT`wbWbnLJ&hkj|yVDGVyDB`SB6)FOg~ zRG0r2xPW+!j8wG9g3diba}89dmE3GqTWL=;t7>IHuj;JK8tD7zhgFS!i>Ya>!Og-= z*_vQoypjXb4U3 z^VJ*U)SGOs!P{evqc_gNjdsC&o9)_KX|HQui-O?+*&)L*O8|CDg5B7*g$2Kc0Q}Csb&dW)9|wyXwx=LC#cm8VuyFC$#~-`-d*O-?tdU0XXEDz(o1J=ori7R5_Z?=oSjm zI}#U?Z$=c7Yf>g$Z#n#yw#r1<;>b#Osb}{}{Ab$z@GJS_4};;Qv%d=NxXi8l8?L=}<+bm|FS8#9Lm&G>F!WOY zJ#S>az4M*Uo1Ne9y7Kbnv884+*jRR=Q0ooz3# zZT*8JP^hab9x%lrQR(o+1MD*|g-4}A5`_?><0f%d{Ph0fv||F(PO z{J(y6rT@;GWsdP5WHvZ1D3lTB5z21lYs??V5!k0q_*wM@W}&9u7cc-{?7K+tbR1pd z66`dzegp7ek<+XAI!nyWmSk}D&AHBAZRerYBG~Z+U<;uxpm&*x7)MVI6YFn!i20`H z2?j3;97~O!C)5GRswU0Sg0`v_WRt2$9RN*K!9e3N5I#m43__Mi5B*&Qy%=BfKbt@N z@w5BZyu%+ji@B?Zu1#Kh=H4 z!u(bHL_Cr$$v!c#Nsfw5au~efAiiB1tkfc)rOeIkgv|x3W$+cN1r`hbppM8HPMOys zDzQ?XC1UK}?UMEYeaVg=IKeKt09~CHy5b&a{pp_|nNH&4CCttuG3Nic6gX%`$=|$WJe)0-6=(vs5|I}-_1Z~Ih-PK;B6^<0;IkU zpY~(O{s)mv@LzlS%F|0d4;pur8h5M-JphSM?OcBDpTBY0C?xI$yH~pNM;^ICODFHS 
zS_>~M`|fQ6r>y(302sCX>Gfc1;h9pfcRk#>-roJNqw5DbxKWX3*E*h~^~<|To%1XvYoJv4tl*56bBbB&c2Ei zPT^%{1O#O^?ogcY$I(Atb|jg&OK~e6kP)mPkN6!R;Up4@S9TG;?5+zuA=s>NJ#}0^ z0uAU!YchlsM`pPl7vq2nM%+l4#9BsG{JqlFEqB4WzPrOuslmO#bOf@$k`O zhcNLhH-&=4o1x&g2Z$TUK*S7eO~55`TN2svQwWbj5o7TgDDbP0y~~)qrl?aI94CrW zxTX=zLQ`$}19o_N5o9#tUgpuGVAF$OcPZFi?7bbk6)OdIt-5zv_KNgmCo)+&VRAh; z+p8x{(ng!wM2Y)ICxl-Es(zNlNnL2Jo27*)B>XUlD1o&+`dzT6IDY&1t>bIKp;hYVJgv})_wxp`%1#GqF&Qarr$u#Mqzsus+3H>5u8abe-wcVyrGm4wm zQ)A~z6Vef^C-9Y`e*4?EzI})PDEMJ;EpP<9m|){&;eT~y#D}Y`44)ykE2zF4}?Db4q@0}0M2VfV0vdfriPF0hrLP9NqB#x1nPfWp^$SXRXkc&uWGbDER-Rh4 z0ioL4DMg7&ss>LscI9AZY6^lB1?>2g0xvNMW$;bXiMUtHKqrHlv^p>9Gs*Ovg5lYs z+7lQXR>L${13%Lb8vw=|p4I4a09?_FBav;c*D?>W#&k6c;W{0wa<7pF1VHH1M?>p1 zpeRad2rj8-qlu!0@zgb;I7^&%lZVhr$8CgkXE8WRSf96mjUvPZR8Zd8oPjdSIQk1cpCKqV zIGX|Ot*YIHF;ca=TD7}U#X6dx&*t0PZfz@d_J0QbHC1GQs`N}`PQCNZo8SEF)x8C+ zI99lLb6|_eW4F_BkSXSt^;H^@fB7;0~0-KLkQI(^_(G)$l# z!k1hcp5-#ChRLlnoe=xOowT-?oE9dpNu~qb*X%s*C$x{!0~Ns(VFl}+FUeyteE>AM z6Gx9(K~gJusT;`YNmUz2rm`8Vi}VKzq&0sIULEOUqNVTNqf~^x22eGjuXmP%n}!Bf z^hMQVjQq}#j6znDQ1tAxgd#Dc6?VBtECoJ*F9DC3W$llUL9od%|KUc=j!#hJHz-y@ zvEQP`Pf*7vsP7Zh{Cl+b_b5{4Ii~N|d}CQathdaDn2zG^GJ;}x_h+PlBdzcs@7&Hb z7RJg5ise(ZxYP4d--mr0g!A}s7?#;j+Cs4cQc&Ct-&KA-_mjEO=t)pFX8&WMjR_Z^ zDHO{ITHM)lm%DrFuKU9S8-!clhZ@@pU8T^@eBiMYMqcW@&$Rw0`xJAKDI=1T`LsQz xjQ=WMVv)NEw3F%kI1nj%)&kGG^%7Zi-xt&?_;>zbFZyY(|KJ|(r@I-*{|qq~+N%Hn literal 0 HcmV?d00001 diff --git a/assets/audio.py b/assets/audio.py new file mode 100644 index 0000000..b527ffc --- /dev/null +++ b/assets/audio.py @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import dataclass +from pathlib import Path +from typing import Literal +from urllib.parse import urljoin + +import numpy.typing as npt + +from vllm.utils.import_utils import PlaceholderModule + +from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets + +try: + import librosa +except ImportError: + librosa = 
PlaceholderModule("librosa") # type: ignore[assignment] + +ASSET_DIR = "multimodal_asset" + +AudioAssetName = Literal["winning_call", "mary_had_lamb"] + + +@dataclass(frozen=True) +class AudioAsset: + name: AudioAssetName + + @property + def filename(self) -> str: + return f"{self.name}.ogg" + + @property + def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]: + audio_path = get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR) + return librosa.load(audio_path, sr=None) + + def get_local_path(self) -> Path: + return get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR) + + @property + def url(self) -> str: + return urljoin(VLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg") diff --git a/assets/base.py b/assets/base.py new file mode 100644 index 0000000..5ca9de4 --- /dev/null +++ b/assets/base.py @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from functools import lru_cache +from pathlib import Path + +import vllm.envs as envs +from vllm.connections import global_http_connection + +VLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com" + + +def get_cache_dir() -> Path: + """Get the path to the cache for storing downloaded assets.""" + path = Path(envs.VLLM_ASSETS_CACHE) + path.mkdir(parents=True, exist_ok=True) + + return path + + +@lru_cache +def get_vllm_public_assets(filename: str, s3_prefix: str | None = None) -> Path: + """ + Download an asset file from `s3://vllm-public-assets` + and return the path to the downloaded file. 
+ """ + asset_directory = get_cache_dir() / "vllm_public_assets" + asset_directory.mkdir(parents=True, exist_ok=True) + + asset_path = asset_directory / filename + if not asset_path.exists(): + if s3_prefix is not None: + filename = s3_prefix + "/" + filename + global_http_connection.download_file( + f"{VLLM_S3_BUCKET_URL}/{filename}", + asset_path, + timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, + ) + + return asset_path diff --git a/assets/image.py b/assets/image.py new file mode 100644 index 0000000..c1a0f2b --- /dev/null +++ b/assets/image.py @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import dataclass +from pathlib import Path +from typing import Literal + +import torch +from PIL import Image + +from .base import get_vllm_public_assets + +VLM_IMAGES_DIR = "vision_model_images" + +ImageAssetName = Literal[ + "stop_sign", + "cherry_blossom", + "hato", + "2560px-Gfp-wisconsin-madison-the-nature-boardwalk", + "Grayscale_8bits_palette_sample_image", + "1280px-Venn_diagram_rgb", + "RGBA_comp", + "237-400x300", + "231-200x300", + "27-500x500", + "17-150x600", + "handelsblatt-preview", + "paper-11", +] + + +@dataclass(frozen=True) +class ImageAsset: + name: ImageAssetName + + def get_path(self, ext: str) -> Path: + """ + Return s3 path for given image. + """ + return get_vllm_public_assets( + filename=f"{self.name}.{ext}", s3_prefix=VLM_IMAGES_DIR + ) + + @property + def pil_image(self, ext="jpg") -> Image.Image: + image_path = self.get_path(ext) + return Image.open(image_path) + + @property + def image_embeds(self) -> torch.Tensor: + """ + Image embeddings, only used for testing purposes with llava 1.5. 
+ """ + image_path = self.get_path("pt") + return torch.load(image_path, map_location="cpu", weights_only=True) + + def read_bytes(self, ext: str) -> bytes: + p = Path(self.get_path(ext)) + return p.read_bytes() diff --git a/assets/video.py b/assets/video.py new file mode 100644 index 0000000..d025368 --- /dev/null +++ b/assets/video.py @@ -0,0 +1,149 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import dataclass +from functools import lru_cache +from typing import Any, ClassVar, Literal + +import numpy as np +import numpy.typing as npt +from huggingface_hub import hf_hub_download +from PIL import Image + +from vllm.utils.import_utils import PlaceholderModule + +from .base import get_cache_dir + +try: + import librosa +except ImportError: + librosa = PlaceholderModule("librosa") # type: ignore[assignment] + + +@lru_cache +def download_video_asset(filename: str) -> str: + """ + Download and open an image from huggingface + repo: raushan-testing-hf/videos-test + """ + video_directory = get_cache_dir() / "video-example-data" + video_directory.mkdir(parents=True, exist_ok=True) + + video_path = video_directory / filename + video_path_str = str(video_path) + if not video_path.exists(): + video_path_str = hf_hub_download( + repo_id="raushan-testing-hf/videos-test", + filename=filename, + repo_type="dataset", + cache_dir=video_directory, + ) + return video_path_str + + +def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray: + import cv2 + + cap = cv2.VideoCapture(path) + if not cap.isOpened(): + raise ValueError(f"Could not open video file {path}") + + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + frames = [] + + num_frames = num_frames if num_frames > 0 else total_frames + frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int) + for idx in range(total_frames): + ok = cap.grab() # next img + if not ok: + break + if idx in frame_indices: # only 
decompress needed + ret, frame = cap.retrieve() + if ret: + # OpenCV uses BGR format, we need to convert it to RGB + # for PIL and transformers compatibility + frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) + + frames = np.stack(frames) + if len(frames) < num_frames: + raise ValueError( + f"Could not read enough frames from video file {path}" + f" (expected {num_frames} frames, got {len(frames)})" + ) + return frames + + +def video_to_pil_images_list(path: str, num_frames: int = -1) -> list[Image.Image]: + frames = video_to_ndarrays(path, num_frames) + return [Image.fromarray(frame) for frame in frames] + + +def video_get_metadata(path: str, num_frames: int = -1) -> dict[str, Any]: + import cv2 + + cap = cv2.VideoCapture(path) + if not cap.isOpened(): + raise ValueError(f"Could not open video file {path}") + + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + fps = cap.get(cv2.CAP_PROP_FPS) + duration = total_frames / fps if fps > 0 else 0 + + if num_frames == -1 or num_frames > total_frames: + num_frames = total_frames + + metadata = { + "total_num_frames": num_frames, + "fps": duration / num_frames, + "duration": duration, + "video_backend": "opencv", + "frames_indices": list(range(num_frames)), + # extra field used to control hf processor's video + # sampling behavior + "do_sample_frames": num_frames == total_frames, + } + return metadata + + +VideoAssetName = Literal["baby_reading"] + + +@dataclass(frozen=True) +class VideoAsset: + name: VideoAssetName + num_frames: int = -1 + + _NAME_TO_FILE: ClassVar[dict[VideoAssetName, str]] = { + "baby_reading": "sample_demo_1.mp4", + } + + @property + def filename(self) -> str: + return self._NAME_TO_FILE[self.name] + + @property + def video_path(self) -> str: + return download_video_asset(self.filename) + + @property + def pil_images(self) -> list[Image.Image]: + ret = video_to_pil_images_list(self.video_path, self.num_frames) + return ret + + @property + def np_ndarrays(self) -> npt.NDArray: + ret = 
video_to_ndarrays(self.video_path, self.num_frames) + return ret + + @property + def metadata(self) -> dict[str, Any]: + ret = video_get_metadata(self.video_path, self.num_frames) + return ret + + def get_audio(self, sampling_rate: float | None = None) -> npt.NDArray: + """ + Read audio data from the video asset, used in Qwen2.5-Omni examples. + + See also: examples/offline_inference/qwen2_5_omni/only_thinker.py + """ + return librosa.load(self.video_path, sr=sampling_rate)[0] diff --git a/attention/__init__.py b/attention/__init__.py new file mode 100644 index 0000000..dd35165 --- /dev/null +++ b/attention/__init__.py @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm.attention.backends.abstract import ( + AttentionBackend, + AttentionMetadata, + AttentionType, +) +from vllm.attention.layer import Attention +from vllm.attention.selector import get_attn_backend + +__all__ = [ + "Attention", + "AttentionBackend", + "AttentionMetadata", + "AttentionType", + "get_attn_backend", +] diff --git a/attention/__pycache__/__init__.cpython-312.pyc b/attention/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9a3e51323d7e8a00f81c5bc5dd63bf1aa994e739 GIT binary patch literal 467 zcmZ8eJxc>Y5Z%4cByt!*uoK%ua@z=rAcFQuyV!p+Dt789+hDD~s4yfpJ%^1Rc^9Y)d?Hd zpx6X9gnD;FAXvlIGD7cdJk?zA36yFXrb9}Z5R^Wlg~>+0rvzk0s=OKiQ)cVI?RG7mbhqVnPb?=r<9Y-%Rb3%tI?YbnUncpAA~jy7 z=bQP@eOm#1pq5rA-9_TQbJusbckVgoo_p@UbGhssehK4Oha11iaeql3`By3dcVjk= zyTrw~0WPMC=@Yt9{eWIaupwa>H4YdVY)qI2Ob9b3%%he8ixg%ZwG1NdsBOS5g*Zl? 
z15O5;60TAAfSbYQglE({;2kX)C}Dm}qI9%upp3!RMER(1z{g-)qGGgippwD%MAc~Z zKsAFMiJDRWfPXYF5MX|1qIR@yppL<=MEz)RAjn{MqG7agpmB87z$)hVB$`H>2bvk| zO|*=*4zx13B+)h+8VHTH541CXX`*AabD)#KWr?oQ@IaWs<%#amo`D_)`x2{1*9@#- za7AM6=(>S*46aP9AKfspfx%UYjiZ|eHjQo`*v$OZi7lgB2eyuG8`w6wePH|Oj)5I4 zye6@8bl1Qx9T(p<@)dbN)aSr%Js00F@{iOI8G2g+cnikDpV&LvH_)f!#tgB*E8IXI z%k?!mt@@O64LpPt&GG$j>%_Q$12JRt;aDy5KElTK;OHSXp8W%dQO~-#X=Gk4MTUm- zvHDkx1CQFc*x^_(UKVSJKNf4m(>P_}xY(-r(Rf*0FZzb`)yOtvjWxYu8aNhfj$1~) zE0?1_V=eJcDR*o9c&rV7A(Xi)W*l62V+)9>lx&s^Os_Yg*w)dA!5Y&^oojE!9U$@l>IwCg=&U?XXaZ9(bJ z#jC&Adav+Kdu;0~MrHO!|2w^N8(MgR_22e*G=Adh4td7ld#$!{nD@HLC-w1I@f^ks>|3D+_N@^9 z&GjNEt{^5B1Bo`mTYVd!^7)O0xedKME=qs9;UsSLB z-OkK|MKkl1G&BEU)cg=;<}e$Z{`e_rW)6Sy_?$C_9-Z>`rc&{-)bRLNFcCc$=fl6C z;S1^V);-ZgB6=bb2W%XSCR6qhraTI8SLjknN;dmL{P^U=(Nj?j3p$)?d&Em!!9z=jP*g@#L0N zjADpzyZb9Rm$(#M@%OS>g^zM6iMZpq5z4XbCohJ4RZ%+gvPAI4bTWy|8)Et>H&%j0 z-vH*vIBJ^abex_`b1{>g1_YHkT2!N?FXfiY94V0@^%=9MIg9JQic2}=R81UjLb-VN zQ{30WFK}lKPjN5kLe~CY#E@hkPyPRvE^iyc*hZ2V+9~H*nqqN286KZV9-C@=Qt}3e zlfkj^R4{rbI?PZ^XK)B#2j)eP@IW|Z%X=R`+823p-{Hp&^*wepa`?zm-jCw*mf<9( zXDpicor)$SvG|$c!8m5F{J~IKg*QCI_$(0{eJ=oX?xZ z?=^41q#45-`7n|siTeohAxiDXbRRq&k*gk&D#^;{HzLJW%8-O}p1W;OJt@qqqD)!CF-?nr+QDojpHy`YwzuI-7I#t1`~O z$9B#cxM6j_=ALzDDti~LyKg)E8KeJK_dJ}}{}HFNAz{;XXP{sJ_?sk+-5+{;JGdWp zIC?i5ezZvkH*aE}=b*$xA&EFK+2s5icrIabsKYu!Bem>T6ss{whxBJSenSeBxcD2f z%8>e0CURp@I>zg(X$UHs$gY?k96h-AX&sxuI=KvdH0ry&OY@UYPRDyuf-zwmwNHcR z$0wjyKYdARAUMaC8EhwB5qtG4{!o&ACl$mYUI9ifWhl6-P(L&%}9tI2O;F`FLuQ9~)Hg zABv*M#inHoT;NP)%4$tYWteAE%A_38z>R!IQM|^~X+O8@pPRs!EPipD}Jo zj}Jucxf(T2hU6OYd+^~a7<@$Iou`4)_~hqq>G21bF12V+QEdZq9o45&+uBd8Z5yV& zoz6aBKWt=9L&`<&K|y9)|xF_Bb2SVwl`C@DdXI9+g6>mH3+tb zMcb-d&Z@;gPd2bs2yDIX+a6v97yiDZ60}WUZ}& zwUq(wg0;P1GE|xGa0aWnVC7IJxv|$B+iyD86-p77sqW4=x--V^JD^|cZ+f>EoW-FZ z+qjZPbu71%vS8vGTQb(VTaHR4Ok8$Mj4sbR8LK2QaC!3tA0HY%n|CkeP4Z=f(Zt{+ z*cy>>gfE^Plw=VY!=k0MT8HG7Etk*Ho~W%)RdA>Yq^Q(9=zOK)xRxKSSV8| z=?g4VSpvVnG8KC33oMgx0sF;t=n3rU6!{d&ki|KaI2sjY!2-%GOY266^k4P^a@1cn 
z%Kbb>i-0L@dii*&R!*2UrRro@PM7Rk5+A%TUjH!n61Z$gN)-2CNYDF zBN~bhf&@(lqhqn4Br{;5f#4{U7=-P1LTSfNfu}kZPfW!5AR%)m&`HLzI!sOkQ>Wr; zxl76b0_BHKOs4SS)Tt;qGCt0VBe_dF7CaG8y%3L&1$$*qnbo3Lz#|HclHuTiRFK3W zaxF+dauO!BN+VlhjqEjB$k zJQR#hOeBT}NlYbmL@z`ETUZ;YTNnb_g5HT2Er=`-wN0j?V}tQvYP_iCVI^xZ(%BL` zJV8AgO)MP(xt?r5)T|LF!kFJ^8BtyZ|XS&4cDzWquQG&Vi!f@(T5b1)9o0%9N9!n*| zp>qBS6vaPD4xpthH5Icb<9cnML2$^L?-Mf2Po?+Si?zo%2V(XZpVDUDsmO4l3AJ zuycWW@m2o1B~Pr`fglUHW9D}04(eu{S!=UkZJv)`+n2F6FIu-|9(}xE;7lc;DXirg zOZ`=Q&RV`$(RI(D51By@d28l67uz;3dbVV&TQZ(4Naw7X8@TRhy-^y-)IPCL`eerW zB&tx?K4V(6*5s_NthGk4)?}?A!5X^K{Gqj{;73^~n-K_u=p*GPBmTSd=(JbSe{pU= z2NeQ+Og8|DSUK{a6_tBA1jM{MuKY1 zWGGg?Lbz{*@Ct-C%DL62RFe|zTME>IX;BFmgq#j3e<E4c%j}}XjciV^AHCW7G6IDob;@xY&qQ9dn7P!w=rMce*r7a5uz;xSQHl%IS zp0Sd&`DItytR;F!ERe&rN0l|o`KP_e)r?#U1uIW=2!>Fi7W$YjMYuw_7Nsv60}`1o z0fkx`GftPKXfK2P(j{r{h?aCyVCccSFTheyCuT0&YMM+$Q`cq7yUqW{iH{?>{Jn%_%;<&VXL`Jo8 z0z6~3wb z^@&`{=wbGeEM?hmx&kw$Vr0j%V0a<^a;JLb@R&1g{}%Ucy*%#Vi#Y9kX*<0JrE(Ev zE}%cTh_W=LoiA6W9+FE&N~d;7s@4lqa1n*?N^qh|Vk>(=J zkQca+cj}~Fr2L6lR~LWU3&nOjaogZLux*0X8$lweK@<)=(ZRuZGKmythNDU$&n4rD zp|H$3JSQ&k^oD-yi%+J;e?cpYDD(Y|Ls|8MZNF&>$@{OdVG_iSIl6Nm78EE;*JBqXq^7a#naqP?z#Ue70gvs*G z(eYS3Axj6# za3YyE5+sVYtrS62O5P%ii3!bu_aq6qD8I&bwYW(pX;u((oiC?RoJd7P!Yp!PatQh> zc{_Ej7?dv^1SyIQagsF1?WquZZ@KmfPg6c`XOaw78#W+3aW(uoiZVe#=43qYnUM5O zl=7e$5r#vp2Ph|kM;H+1d4k6giXRJWkFTa$X>Z@#Zv#S#OC$?txYh9~B3k z6iWD6#OEDF%0nVI|5eIH2%1Sp@>Y44@RumPE|RxJFloTDL?TJ5t^Cz5f1YRh2K)?? zOsTpu_ku=CK)+1#@hXb})w>{ta6`>opbv;8?&b;i|vt2UIa-6Yg* zn(e<;+c49AyP|8MqI<@XgFxM}Do3&Fh1&Hq{kiJqY;})N-81uWt}&c#+%4e0v;J1? 
zDvI+{Xluquns!~&Gs4NqP2ne^{R}cKId=3t_Ajrw+WusE32=2 zI>lWDclBIo&Yx-8|3TmbcjovLVCJQ5hE8mq*lKf5?}g5J^A&ey(<6`x*zAxg)U28@ ze$Ab$Ze))@8@;GHSJz2Tf3B)ETh$>{bzFI9p=#ZGdoxwLkj!<@%9T`Py{&?`b-^2& zF|l^m-w5{Pywy3cFXyexd24c2wYlor*K04==4w{m^H>|)GkZUD))vaShOQZG#?;;_ ztIU=)31v;$veiP_>V>kkGuFQ-59FLa_P-?OtYZIb&=)zUp9Qwv@i~3=LIvlkyHg1W zwXxl+;i@{X?9Fy=7dp3RJNF5l`xZL)3zY{D>CE|R@UVZ3O^}-0sy3=Z@YbqUszC5o zOt5CZ&L&IE>@wH}sI|Tf9?6?6DN&Wh3L)V_n)b3a) z-I;Oj%x&9+bXn_a!Mgfdi(uWHs|m7F#NNE+Xw5j*-f*_$ycIcbXU^N0^==lto9|hS zE;pnsT?cg3%b|O2uEIaJHS6CX_%~$zhXntjMPL72lcC!Fb6@`j-8}>6Dt+zX?7_Lx z51oOWum283;YF@{POhr0JA1QDn}w#$f9Csf?T=~~D-R>qd80D$x#BR*0%*sg zwd$7Bn|0P>hGw1Zg0ua~`bFpJoVWav?V>H~T_sB?{T5`_+tGN z)Wrd8DV1!5EXuX!*B|*&StfV;pe=By%$PSZl#(}qHP4$EU!HeLIGL?YOZJS2!@wuV z-TeUE<*VGQx>xnD8ZL3O+z0}f|DYkGa7kMHrq*842rW6wezUq+{j34+AauU$_qI_C z&~dXmO>WbKe8pU|l>g-e2q%hQ*)OI$!zJ0S2jZ1I4K>k-PlaVjp_1y6N8$72SPnHq zc!lbhA(>8k8ol&ex@j{~RmpMc^Ma0lBW-+HpEiy;&_)gAhG}Dws$4+LDMQ4yA*Tk{ zh@CkaP5h?8FUQ#VfoUt!Dtw6;OKyxtpi%Y95JDBcLk@%FK`QsRbZN^MSK72?R{wJA ztbWB3HF(=ks2^q)N;G_dB~m}NsXZY>{IfA*N};=BrlM~{;h5#LIFDnT(a-2+xETXP zh2~eB)3#JtjveWdA@#`z^jvDK>`yDw2h>I5t4Ukrxp$Pb#B86Os;IWo7bm}qo|<3s z`r;HKEPlypi}w49WHDdBoS7o|mxO#BHTcfnqZ*m{~Y7Uuv;6P z1aoyV7-iCFkfTsh8H+-@9IT;I97&ED%M;`$#~`Y1O$MKfpN$V<6F0)bpObfn*`DlD z?vXYiNwZxkjl|&eBKBe>^f{(oBc&zQfzpJD^H~EhsNR)B3btOLK7A zpsF#X4Jimr#h?6v@Se0`)}kfeC5nBT;Q=!-0hjk6bvMYg(Ia>J^%LFC4m2 z*)ZSrz4FD%4Y{`U*J9rve|KDH+c)RRHMPBY=#4|yy0dGJ2>9Rh=!HYM+J@Jky8KkO zHZ0VJb5()4`25i;&DUz*cV&)04Qj8h`!1)itGKX_D9FYOhr~TX2@RnJ9B0KJn^Cs_ ztT3Z63Ys%YUo@linlnm2tGQvo<_E=T%_vPGPMU+}Pd%e(K4Au8KIz|<=K4|S;`j4E zKs!TLarplPFdJ?DhXiSjG4&6Pu`;K>iIVeA;6Nxv(y6>DjxE7+{Qp8h#xuk57m9=nOs?=viZMX@B5yv%)LTS> ziH~VU9)0}Ck;9K2iyV0JXy0SU`yS&Rl)83uYWu>aF)DwVD`Xk{j>d9XSd+&zGk|Xe6R2QbwA$uqn(S+ zqdBm7U%vR|8E1|ZBK-o6FF0DE^;ce%Eo%|VS~Bh1g|h7#=k`q5c6^xL^4HGIp38dH z3ZAu#p7l9@eb(P0_&cujU0e74?eA`1@b_M{UNBvF{+7e@+Rm@;%yn$cbnLmj?E~LW z{Xg-~93&-&FJ1i7eBj#CA2k28^Cz8)z9(k()zb) z#tsy_p3%D0wjlrg(% 
zov>;h@X^@2V+$317YsR1dBzijP~mm^W&8Y^`Bb)blhC^9y`~?Ae-K`%I&i_1^OR*h ztFC)i74)`}#_#MYaCm&r_x-@TfnO5H{P!wYzjR$M>&p2zF8bG8Tf5-j_@42Ho*#HV zs95kH$&?@Yxxa2M_4-SfU%F;ST&8^EZLg15-b?O_?yPs4;N6xhubx{wZ<;@ICH2lr zZ@mOrN=YfYwyKT>f-3jsi*LM$Lft=b3$6PWDj&*tAG)Ph&stLPojt$g@Fz@3Y8n>`lQ}`>5Z50p;ndRLoF#4@ozEt z3=&mjr@w+}Mu$>H5U~6=BC%4%;VSOcBE8ef*(fb`TBa57HVq6sg|G8%X^kLVZLJml zbJR%L7W;%6Y0wP!)yP}i6C(UMYGjaG`3W`B5CYs+qx&nwJxEZ14J*Qd?6*P>KS0cY z+|HuhTD|e$+_aOxerz*I>G5+MPutBQ4_g58ror)va}iM~%@942%o|3dXZhEWf+sa$ z{ta@TBj@+wU`rqsALR$hA0sDD4oQWfd(McarFeUp?zwyE0yR~oV&kPT|AvtAPf_f7 zE@ySV=9+~{lwYv=f4A@T1D6lX#}@*f*+7pF=vfG?0X}>B-KT{Bog&jpJov5tY#=NI z!V7^OV8w0kZW97K7p=Rr(ym-^UE}Mcmq)X8Jwjd2LfslsQ2VI^Z$SuZ{LmV_pEjef zp8oE_<_T~Ejg=;LDCQ85oPM+oP1JJ?N(`ELigA#bh2@#T%wYMTbgY?A;Txn((z}#= z*p%1IM|9M(AJtLG2L)oyeD-CT;MKxM#btqGF!xW{6dV#H>Q14$GgH--afWAfIje0ZJ-3(UbvDo~ z1iCZ+o{Wox+hW{HSQXD*K9{XuFVwHk)NRPPHVW2_V%*s|#tRTh`hwSi7&;KD2JR|La3)@#`ab>q^Z$DmDXp!F!Mj52|O; z*;%%KI#r|AkR%4tqT~Jl9T?=#4TI_WgO84Y{@ zsetBLYn5x$;`~pyQGc@o0GgD-%a5Z5%1VUFN`hr=r7!K`URc z+z86p<<5CGflPL}Gi4j07hT?*^EX4dAqh5W>Ru0D4(BRrvK8GzMK{igXY1Dp^=s}q z%;oL^XLh;oc|Re~J)4P=np|%7rHjZXz-Q5lygfokgXvMESf~!?)^5yg*m9@#fZp_^?p9ev!NeeN zP34sZiwIjeudiSeVLMk@Gq?BRNWmfcom{1VZvDm6^LvGgwt`CxbaR!BVt_{s@N$)* zE9)?5g%UBKl&h(qFTWfpl!^XwZdJ<_;~VV-pXjgPO3DkBB3wmPsTScHuC@Kj-Zw@H ze$gMm>oee!D>|>Fgo?EVu^ni%-TiTcm}JPvb#&h;c|vC@z2zx?Ej^nqn8<@By31aB zdiLppg?Xu!EY`-nsB+-a$ib^~ye9^geJJMRP;J}<#eAcrz)tGEl8>n&k3k6;X{6N#s-1Ko zPN9{tIw|Spe5B^8wPrC@W*u938|hTHc3vnp6NRp^<{`#dtn#d#Upwy&_VI*hV=_eV#Hw!B?NQjd_aF5jnW*hdxte1gTaz5`Ekx!|D^N zywUJ-!4=yL*%A!- z`B$m*8FCog$zGu1yUU2kmy&8KY3&ThlMy;%a{7#_vk8k}quBKtgc2yfgxBC{>N1^8 z{xFqdm8}Wo%}06+`nZv7zzEIL?-aoFXNv)igro{2g)_QAGX7f1RAYDt?K?cr?uR-t`sCbaFnURWw}?YwJ1LQujG$FFLAGyF#Q z+TQHCJ^}w5`{+IO%a!;*rZhBXthG5uAmeDuHLcG!9TJ)jWt~-HUSjgdA z<@ZXt(v~a?=9Kgle4NdN_TLL|t}ZBra`p=q$M|q8Bgkp4udHe7za{d!kQgOLEk`8&4i;S zW_HxXGKJ=#N4;{Z)MrYej>gx0(W)VXwiD45jx*54Zt!d{8Gk+kvps-{Hhe=ky{c&Ntdw3o2!g!w`OlD&t^XJJ 
zu(@B*H!a#r)3aT0wnX79haomR%HVzQxaiy`_DH0|_~yarWHOrI{}u9v+)&CV_WuJa zNL~FH53`-Zo+0O-l0%}Eya{p!oc%WsPL3ugNBKXci2dXcDG+jqtLRb`mN%S8PC=WO zIs!HzVFxrm#HkTikF6xXHP6_>YNDbI$s2%#IFwd36_SE9eycWsNXYKIWm@&4qG(?3ct ze(AaFmxhEd4J~?3W<6tqXDpLU-J^Yu-TGhBW4|6m2R$;LvAZ1SJ)oa4-oVS97oBs_ zxntjZ{Jpi`8(3)C_3pqm-R~X0HXto-3=m zS7!I1R|{2~x9qjEvuEcFbC1vOUhuSLtgXNPsFrKm`Rjbst|Spfe`>VtTW|lfux;NK zd&tnwk0Y!f(nAt(@qb0meBEplT3+$qDYB>szix-y(IZ5 zVPGj2=)4jW{A&uVAWSw$b$l#pLSJWUQ^YxdQMfG4A)hMDA)j56+x*f2pSbK(-3Rff z{A4!j?(d>S+VU1AbE%bNE+)uapplC|3;wK%?8OGi4*5&5(F3a_iGd^qMi1<&(Ssoz z{2$WAypYQ{ABS;GhBq4!p5S#5Web zUIJmfpbCIEJmdNWui;G9*eWGR?ydgB6$ua74q)UZmNUoJaFqh%HL0QZ{ zOm-SHnVoOwV7o!H-iWaxMuQbXj4ft|>xgl1oy>*ZgORXY5A}(7H;ebAdgRcNH8P|= zk!y{ow*d`>;E=4hkxT{F88A-SCo?ysU!kI6aIr)phL%a+5{8W{uN?VWP2PyN0D zIfwfE4DVC0z^8uSv+8?f=rg=e!7JxvR@SF5wSYo_e*ViSmQTakcMMuEFOv`VZWZ%} zv_gx!`F}%Dikw+;o*?H8IcLc^M-J}r;&=}^4svk47fxzpb!RLsF{v&J_L5Ud&i_Q2 z$SMKf0w=V3x!9S#Tq#P-t-gZ^r7u(N-y-L$Q#dNIhG2C$tvyVIYbY=4viD< z%cb5o0-ZP&PrCJtA;v-Tid=bBuCn2t%Tmi^!quegQT4mV#o8{R3g&r5VQ;plU+C#y z=sB`b_9#2H{t@T%*`YIXY3%D`1q0w)5^Mqtrm?K@Qv2823l{k9;MBdpU?a%R)zlRn z1UYf$y5OR;Za`Uo__{w_@KB7Gt8FNh0D`c*vMF29alN9WP)0H3Tzk0SqdZkyb#1n) z?Rr&Pp_=?PC^uW(e!aTA;3t28t82{GuDM>jrcg`%I@MoK{vgYb26j;c`-Gmp>s5V) z28wFrssfi@`uayS zPoukJnRrqf(>jL7D`KxZPfetTm!yNVtH1%G;sDty8 zq*jyQe49`*ekcKExHw-(M7_d7$?+UV z>;z=`+IoFvEoO@4AVxqTXVud90Kx;RUxrZYDj8OvR8ksOW^SXWrdhR-pqg5S_!IJR zeC#}Eh)b3n$8VBzsZZ?n=(LQmAygsD)YL^s^lf0Bv`sfD1eKZ(#>YgHZPv79$P9fE z+b|3@ld-mGTiOQe``RUGuyDmC$?PJ_uOVpC?5}u{c>;wLE2`<|?vKxLNP4yUUKx`5 zi8KN<=BLVjQ?GrFH?Ec6qdq^)cgZwM`3fS|zK5LbJX@$nB(oTW^`{j2XXN~E>lza{djRyb0SA<4JMzo%SF^g6xk8`Z2{f z$S3kZK&^0S&!?G0KDbv!FU4iC9A85Y;mEveT(Tn_VHSq|gx=*R{(3xUluhi-VP=en+zFM7hcVAqw{JL7MS3&9;1T)9B-^+T5r zUFpuY?-%etaA4*T)8wwd?x~03Wp+)Uu%_?*@$9~sfd6aa7hIXTFgD)V@iS8Q&U)5g z$FaYjZJD0^Z#X}Q{`BNePF`@{sIGh6f7w6(bgrfK&697O%(b??dG?L7_e}bl)fdd5 zUurt!BLnS1bvw~W<-S~bJp&qYf}qyebf*Lma+l*|+y&RINur55wa_#w=L=gZjA?$`%QY6eEhXP|dsa)B;q2^!@w|M3WUb7=MFK4=;Ww 
z6v*F%bN>;A_9YYbfu|Je(V!%)0ysJBD7~=m(&mesGj-b+9Xp6D#+v)Gv_xg z$zN=!hK6t1PviY0Bs4Ee2Cvqcy`s8l;Bj>zM~J^%_LJ!wh<+W#=N~_zOr^gbR3@Wt zGz5rRZ2*;IT3Q3NUqi&WLSFNIdA0hkY+2*sRoc%*l)l58wQBv~+~_^=w?e%ioZBhq zj*$kfGEQ`9+0=RE$^pjEE4KUTavxF>#zzLTAW#)ZN_#%EXdD`*OYV~Du26XCal&x0 zpYGjDBv^Jye=Qm0BnYrS5OTu&L8t!floKHCM#s3Ux_*-!9JMC4wr0bU- z_}Yv&@IR-_9pro$&XP4vLEc1?Ek)j1YZ_Pv4KymRbJKd&k~dP)Tv_(hB6!(BQ{_Wc zGeCJ;y9^B^QGE1@^b!6B#b}>?%6p0|n$mr$_y%f`T9lT3ji4YoljMAroaf2;EjUY# zHIZ;_$+0GOf@Z4jzR@6No!#I{H*#*HZ-KI6xvgG@B~~=AH}B6jY!(_egAl5*&$uC1 z@CJ*HEo~7hx8$m8m~`plSgtgf3$%*1A+v#9LSPrj950j@&628N$kZ=tWbEq~4<7kH zK`(>2=ljkTofscaz$(y*5nN5XTwmk=q6~jb4ihReQiSk;C~>7FKpcMU|oxL6jBNJfs6bF7lnx=}kN3<39ik)^bxQS5k2)a1qy(65P4^ zor7;36yaTcKX(4e$>3Wy*Ig6h3!V3$7qXk|bY2J#IxmEWh%wmk`~eM|qPxmiza=Cf z(XkAb(JnzT7Lk{E*v&U;;u8ZCuHfb zgE?f&H!falkQDh?nxnWPMx!}A;{<6jTz*B2$n6r#dw@RPZ2+*4Il0y?u(NaiaL!Sa zb#w}j&MO0gW7GM)7)#l3OU~-PWwd6Em4dNy&NP>t@B7f$en0(;0>#cF%!=!k#WTKa zpX4p0R$Z5wGN{(FxPv=OPvbsSd`$yXYwAKUs7}u05Qml81AMl$Cvc;*Wqd+1t_*|WiOJU4h~A3^PY$1f&gXq< zKFe?YVrT9Laq|~hic`yyO|;>LEv0-`79;rNTaU#1x8$2ir0YW%y%`PC3IZEOgG;K= zd;fh~Yy`zStLS8DH;kI&rgIoS;)CNu%W75J8u9j$qT9QAgDKb-hYFH-R|~soMW+6` z#PcO`tG1{lFPXm+&FoRPF>C5eYu-uMy!4Nc;gXgih1#WZ$HvvX%H7mT3h9lk#-DJd zSX-~s#UiDy_*GTgF5I0ZF3roX_kvm%E=*&6F5T7+yYI}pTTry_Cf+tf6$UXM-?FqL z=vGVI+s4`l`wn!%kj~+-TC=NYn-OdDv zRAbhua=-HbJ0&1zmU@wBCiJ44Yu@r+`$AKnU~D*l=)$_Ws+_CjwWnsE%DUKpid~$MpGRb#gS}a?ODrA8yCYm&f*g~`oN&gA8y$%Jk<Yk}QcDy~n|Sy-RCX+rmqL&}$;jWyZ!nz+qgDw&R!t|kEM8DDY^*MWjq z_ly82?j8&~iliT#K9*@K_9?7dou<`QxR6GiJ~l{D40J95b|h_Lc|?x91uK#UR$qQQ z)qDpz8{t4Wy=)%QoDnCJnsVv1(iBPmIb|c~pHaU)1OWY7e*c^@T35C8% zP2>IZq{tYbNWAPuZ~X~gdsiY}N)_UTHGQefVhK6=KL%xp!)skGOkpT^>3E9!4ELjn zI+`TpT&E%Z)M(<-D~@Z&-rMy4GYhVx{{zqjznP}BQLEj{utSYJ>PvTY7>3w~LViZg zz8^#ME;#D4j*#F8EjT(d#tsofP(NXXQ!&J91TZv_cgBXH|4YYppkMv>0UPkUsBY5d zzjWe?ID$=*&-D4%QicZ_%Cz_o0A?(LNiH4!?(5 z+d&wQwfkQK!p2B{XoMXxflLQC0YXjz(QGmtJu%3mPhip*rcpq$in?HIC9|?SX~JV; zc|HEhWFMKyWtpRiyuAq4?aSMkA!fRZT~z#XGrzLBiS}`VO9R8QSQk}+fC(IhXrG1f 
zSboK{eEcMtcR45Ca#OjiNSFsk`^I@DSIIXoi)DAN(^W(`v_==wikGmn+i3DFE2Txv z#*(-KM@mc!rFhv&7;F5{@JUf3<74b#J}g#asKOJt?FeHun&kbISsZAwWbytc2J-<# zGBIhshGjdmTDka&lpJwwW^^#lY(I(Cgo2d5fl71@pQZ94ix|WmTYMvhRUD8^0W7_; ziT?)W>ZKHAODh+KxwcV$6~)--=3{1dH*ec_WK(4Cv8RsoiSo_$l$}&zMOy*EqBoDx zE!$x#w8*?b-a^DIBJ!SPG5nuWDNMGxo1iwJpS+3j!lIP|^C4!zNj%Y(pd@4D6v$x` zvwuqv30as7M%0N25fUR_O5_KKCT0X2Pm~T@_jy_c`G?41c#iNN69kF4kM>*g)@>L} zSdZGl^N7WNXe>bDisR-w9c~8JRsI#%bd#(6OU`+d^Zksg&v5mB#Z}$p+J43zzRC68 zaUEr5tPS>5`!UdyCS8}EKosPFU?s5QO z19{7iJC+_@Pp-73V8HVhhCY^u*VdIZzzyEcj# z(rweN&N*DKZJym+FaW+)+BBaMN;?WBc&LKjf`uR$Ev_oq2ohB?3375~`&}2HdoSvq zHR{|K^rUc*@ps+h$a`xtl(=7|}F0mG9<4UAgd@JDzf# zS$PFuXu!-IMx}GMvtS|Lry9skr7*Lu zQz;B^QDNkh2eNa?Kz2$4xhm%bYAMU%v~o#rOtk tKgsun52{=Vbh%Oq8VI!#8byldWx+T|37w=WuX87 literal 0 HcmV?d00001 diff --git a/attention/__pycache__/selector.cpython-312.pyc b/attention/__pycache__/selector.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..312fb0b6cbbd2e01537c41b416675cebb9b4b60f GIT binary patch literal 6689 zcmbt2OKcp;b=C9j>FN1!hQp6Iq?(c_a%gf$nOteZl@wD;Nm*$_tyfyXnM2ym=_1)w zPd8oNLy5$2!CmaZivafO5^Z1`)***Alw5N1CBXV%!$2-GpbQS$5^Tp#klbdM46tx= z$gAq^8Isb%i=@b@e)W6x-m6#dQ~#1k2ndQE`Wx+W9--gUhF@|Xu>U!S&?EE?QjyB( zsN{aR5(n*o9;gIML5}tX^-v{T3NtvQM=H@$l)+)0uf$5Rick_*o73Z^IKw4Y5~T#| zi|ENps+3}IR8LpBN?i=*^-LvO$}%{ncUN+y9D@bDr_x*MWpG^YtMr%pITS!Rhfh2K zYH2AAs3PuCxveNd%K;ZSKK|ax9_|yUzvaX=GDao`Rd|b`R@GNFsqkf%hWfp zY%ZmR_KIkji zHmaeWfXdxQHt&zs5EbZcR0wQ2f&}F{wk3NNOG~Z@By$53qN!N2)^S-|*03rqZL%2~ zZ@&4LlH!YuE%p~-V?!gRQ2}X68w$~sB^^&l+OlLCSkkNxuA;!!z0LTy=G@k}^fI9X zBE1T$mnFsMP<0p?Lp3ff(oIa@f!xe@4VT} zo~&m_zdKtG-FiAX|LH_y@)m&mumCyPWs{V#dIYH>9^RttSr zP-BzrRd7|{CmiVo&I=r2LT5)zx}nGEU(wAaMVFa8kBE~D!1y%$Eb4i7P^%9Oj5b2O zqz9U1__a%0x=H9dd|lycaa$H6!M|GNDj{$)^vy~41f}w-se$xOP;LJ@`Rq-wl^{*V z8&!Ug-eiYo(Em0%$(68LWNXL?pyVfrk?Qg}MD_77uSa!<48ejwho8snfq=x5_0SN* z;KWw3qRN)Gg`Kc!gV}SE_cml^Ol7w%Ea_(Xo;xK9+9FqU#o<>K3%ZPZjzGJtb%j{i 
ziLs-kSjufn1RGN!GZpeK4-TrV-3|f8{Df~I@nz2R0}-u@YjO1P%T`G)S~|odg4(BhZx?vber96OS-?R zl~-Rps`e!@R7qF1G<{RjD(fb(ElIPBi(7qf7+}IQRXQrSIcclADBWcv%*4SBA$U$I z@Z==WN+enh)9MthPSffPtwv~d7OFysP`(HipVQ}_i7L~>K73;c^#_I6S4HY%7_o;> z*s`dxq)vhrSE@yCNuEWpTfQsE2zNa1?el|Gr_iMX@=j!~4W@@lLJkvddiH6W6Hz94kJ}+X*#VORQ17I#xCi znLfl%;n*-VRo=$}T!?E_L64OhPp=_d+n^fruh|o-w@|i|M*p8$)m}q2k*%pEtv=h2 z{*fvG2C9I^3K)fePa7Bo>Ty}ACH>y*R4uhO<@a~KZQV%M((lcHlvxD74H16;)w*hF ze?3UOt#50ubk_TjD%R4l!v(+H`T9FNq5260WS6>)P9r6S?;VvigL(azg@4irm_bW2Oi4El2$LlPm$79jWK&Ws>F%`e z@GNOn*?{m9V?$E$I>BYd#%fV|OUDXCAOu$+B86RO;OFkYOw5Y3X;umNzZb(!h!1QV zB9r1)-nYLmIN3RAbgOsNH^N>-;+bJW-2w!NY0u5d>$+lt->x_j>d-dMZT;B&O5oZc zcC>*t8vPtMi*i|K0_gArajYuc*C6Ct^7&G!1UIIO=Vpr25+K<)SDY?RPq-e31~n{r zHX+2a?BgQQY^fuOb-lU*sLG~_i+8q$zLaxDDsYztDamcWq$?m!du!|+7Sh4N-MlL? zPY-^Lj<8kG?MbRw$g|kN3BdtaP7hO(`8#*6F5Y?L)}o&XIXq-k)_u)hbs}!2teXX~CSi_0DUCf>YCh)Em)9Ap){a7fo6bBoJz1y`0Zv1Hi^-NT!flcd3T z(LW%I&};5QHG`%uwiC0o6+?kU3p?E}I~_-8pP=Q49%-^^l(7?6@diX_EIA+viZxF$ z5d590Mj!+~COJo9guv;wRjuf2Rk3Pox<#m^aYCB0Y`Q5ZO*;yiuk4)DZUhKhdw89G z51@&q7vp)F?_>@QdFhD5VOKgq5Q>v%lS#GIiTXHGZ3f+QXH;9%EO{Zop%04&$QX>V zY8e3QU%{S003e~hd_C0t2mb6XKh_GO@K7s)_~CkVqB$_z;QJ0DXuP;rSZEX$XmAi8 ztj7z@)bL(vypbCJWbSF|{Ihh=qvXS6D-g`h9`GnPb`XzrWm-ve_SL2WM4BmQJ=iDH~B_m@{Rh$jkxbKk)Ibr~Cvk@csCQ z@uqmbDGmU2dt$yJ=6A)RgDB-iL}GSl@r#2hhr08hqYy}`M{4$by*_ib{`G4RdR^y6 zo{bcqj$hszztI@Ku{VCJF@9@z{F}QYZ`aSu*JaUH$*v3Id)# zTGZqGY&do`gMOZgUF!}0eBi>hOz;<(Fu;Z2;^N|wJd_6HG>F~*1ynb|+d;PE`5lCO zue8nCo;O60sdjv93|{sAHt>i8Z`&Df$6L_bj@4cknfcf`$b7-S{=YEze(@iJ;2Q(V z8DK%6*Wo(LB_Z}_xt*j)N1sW2o7s86pcWBKwZc|=&sCE5UNzyRvNW@%k23DXelhi6 zYuhh1koRhkbgUB%T!>|Z|CFlhrXk@68uf$>leZCD=xyd@@_#DtZrh4BE3hA9_T_Im z?dLeQ*CHe+)X0O;NVFiz_N2EX%Y<_7$oIxN<^KoB)J(!;cyCjB*l?E zu4IoR@=aLwOZZv0p!yH=1-!D1p`O{@+$%eaaJ#}oAL~y;Uu#9e;i+Hq3FwKPdR%+AghuO5s%+}?afm*s6qThiq59_t9?kHiM5yvQvE`}(FGwah33^qW2&Bo>N7D3- z<#|pR-j~X&PK-Gj>T(HvqVSwf`*=DYM_ICLqLghy1Jo~{M(_&wRkG>2-Y$k6h+EcH zitL2k{4&n^T?bC$6rsR+T(?Hk8K<|?QFJq4*&St`lb4wrm1Qymli4d6dzf)jvaA?} 
z3HnMU1miSnbpwl_X4otdc7i~m6IufCI3fCeKxnYP#D7D5&(WplX#P1GdyY;&N8``YYrl(i zfAp>0*w9X-6^V0iarM#HeyTP`uk0h*YK^1WSL-NS&lNsdXym4M(S=q-$?xpz+Ivefnt)g}IaUv5P;o8)L8UBWQr%$en1ye{a5-mp<#tbCG5$(+a}(xAFA% zU;FU2Rv2KgyScuP&pf=*iqaO3y1GA79}2A)Z3!ruX~iidqExn(pimNZFHgCas*IME0X>}P5FG73at*%2E4n&YyJ2nw{RpbbjK$;aJ|?vYlMwL65c zrb`I}T~Zk6(!#(FF|664=JPD=*bh>20&MIG!0A?Ch?{C=&bET^{cSi|Pn~RprO(6A NqI1)q(Q$U<{|k5s#Uua# literal 0 HcmV?d00001 diff --git a/attention/backends/__init__.py b/attention/backends/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/attention/backends/__pycache__/__init__.cpython-312.pyc b/attention/backends/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb6e80d57e369df6fe696ae2569c09b3b28ac386 GIT binary patch literal 168 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJVx$BqY7U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?1b9voEnBi13_^dyW&`Z*ijMJ)6c+ zqMem_QRXwtI#`(>WdVtQ&#q&3veF<*LndaIjgwqrtJL*_HnS|s@WQ~0NKW(XZdTfk z(hiB2TBK+cWZx5X+C8kc6SZAZk5=~(POli1`owOje^wBC-Vzc6%l3HhKM;YqO|>6C zF{uVpGfGiTrHeVKI9CwkEvo;*tLIN9C!aq#dHQqH&#A7-Y)VmHPRXkCIVms6nY8LU zFBgi1bRnxcFDx%gROh-V6_Y9XqB0%l)yQNayOhhHPsyolR?42qseJLY zw2ZW8GL@c_rs#X*LQ~SEOj?>uEv9BN*-VizT2H^saL%Nb3rj^pOlv*YAQlCCwny-~ z$w^#-7rBH@IUVm^ZkvYbpP!p3<`yRwGmHC*m?o*X?~SF; z9(iKnv(iL1Gc&QcjLFYWT*_v16DebaCT3FU1t~8o6UG`DUtE@3L5+bnB$PP7TiiXH z-QlWwxM0`S=PJUHN}!Y8XDi`dSIZ z{}f6^Ub4^HL>rc{14~#?J!hA)#mr(>Iyb8dGbu$%>zfnE9QG{75!G$t@g}#*uX0JG zv}c)nm&YoVhmgfC5ELmp%Q6am7K_O-yl7G~`AjjHTnX2j8aE1e14VHIT;(c}t~KGC zZq~Qyc#GVDQn`~r7l3Ndn}av@Y3)JqizVPQF^vl(QWGx8rwl~NG+nBhEj6QhZt@hKq-w?!*kJ= z@JL?ChuZFjDrD(2qCZd;h%(N#farKn-jJ8i~f6HNfX}lBNQd|U`?uj#kc>_QFodVuCdua8;k2T@P0 z+r(X#(jl{S&{8^VmJV4;ccXNejV1MAx20wTH6!94z12}m%^uWHznk@t`cmUME;+=< z@V6I#V~k!z>(6YPxX(g$^p~Ql^}#1a*gQqVaSP=*C?60fEG3UE55y0uAyb^3(9X}5 zk>k3IjipJJ#%2q0%sgdd#wj~~N{yt_X=$;TguGi^Dkf)^W@jNvLdw$OVnHrSVp1dl zrThbJF}1f=ydcSWDVv&_A) zj#Gr&4glQbih$blYrIV7-n+aSo-PzWm&56sgDjQAld@cp8;&zgBOvLu5^WriaZ(nj 
z>JZ3nlLmN|d(grK2TQJ9?0~a~X=>&dVoxTbjm-F{#SBO#(^+MuZ%2lV>Lc|1X9>{R z{L8l;q_$_4GFhkvtofS0p)U|N9u|yabsVUk{8CPvTSfKM=2-Pn`NE~-oRkt3)lDxp z`D)NI$4MwL<7o#d8qX)qFB`zPLw*_`$c&X|DD^CX;{XZ~BehAFlx@^Mu%R`o%%z}L z4edC>^+=OMOP~BU%jEy^BOa~i2wI#dB?~gff90`HV9bI$P5ik4u5v#M>@t>SnugDK zNMW$!Z~Gts^bBs^sIk0JN_Y*=tnDR^ZKt@crY&G=*|9(sYC|7g|Ljthty;EWR{9^} zzpnm8e5jD}ah1Cp3f~Oe2yEE5V*AUn{iWc6lIwuBq^(_`H0weO5;wVvoX(cEgPtRL z7H+y|WNXKXx#X0U_(O~}Lw%WOLhule!PYfRbJ|&bqpi$_)ygr-=rQEn72B%K$GyQT z0#Ayb?UK<0sCcJ=7q`pLJdm!X38hcPGsG_)iqY`8Y>0IO0F@D>43S7HHxjw zN@JE>Y&AAr$T!i&Pk=IY=qkSqeSD`*v1x>9PWt*z_0;KiZPOPpvj;%T=rddN);@Qd zU20%;w0Y<2G3g9aCIB!B-e}3Ys}k<4L^`)32g;EHrO<)ADG}(~@nl2S~t;!NY95{Sq(p%L2xN z_GZJQ@XPj{Yo@p_O>*27p-A=#dsc;_$4DD4hc|c`jt;aU%=?VG+Is~piu}ST=s4== zkWRoJ3mu0>hhev@Iu!eAi^6|nlIND~0=F#09n&(+4orIY3DQtrf`W446b#>1vNvHt z=Vvn)$C>z$zX(K`n3ZYtRZm9A2iuX!up_Vr&Wdq*qbuDijut>DB5!3iMT?ccT4e{8e= z*xK~ZLfz}fesJ_o=<)j;-+HtX>EGzz+I6Um|B*xQ^X15qs)O_QJn(R>JwW`-ozTPw zq4T#t|D*Bpq4QNc=a1fN;X?i2@BOf|ryAx4o~cH-Q2WD2KbX{3rhvpvZkfm4I<@n; z-$KKeO|5i3Gz413S3zN$)-gLZK1EaVmw~)^`ETPz37TgBW}v~*7_$3$s1g;hQlBWF|XQ{CBA z`&9E*v5#~gPfVFBEM>)5;gTeiz7#9YWfXFK>Fvh~#W~4pwlGViMn~V`%rg|Xs{yvP zF+Q5JP4gO&55W+}7m;SA&7`k6wvUp&CejU%Q^?_RmHVdv=RR;Z*tLFktM@<||7)6$ z>Ped(MwyPXx_Lvsem}?;P{J0eA!^v}wu+h_V|tS02wW;kP=gI0=E~S{qyL7Y{|M}V zg|^Hs6lSYgbuK9q?K{;qm%{qQc2GT3q^(A^Rb*aMOOt8y%%{%av95nj-_-i!t|>(_ z=?pRnQBB(~P->CD>jY!~=IWx=!XE92G*l^tdOnlMGvC(A?nidnK-(ainhIF$;zAK^ z@0v0SwUlPOj{uTddc^~2*SNcZw8|SYQ)ylG%+ogwk7!~*PI2?5w5_4l7&k<4l3xs_=<4ObpO)pf}zIHjsT!FIs>NEAL))$Zbu=#dOv2T)x@Z zwHO?sy%u>FS|pc3!(+7Abo87SDo)S{HV{D8prLc^Y|6E(Myn~;?ivnSYkj!2Rg+3n zo;`Im$Q|48teUMT?M+75yXyWnE{VQVKlAMkgpeh}W!1haU`|_Bo#4);;au^qdVjUa z+^l*n9C{!<{L?F)rbQNmyV*FqC2=LpboGY9-g>3Cp|-20)HKwzo;`ECp{PAy$nSf& zxSP(bm?9O6bdgfeQbaq&8cMr}(2h0i1_mc=taidSd>e!~(Buuf$ce24>Jj2znT&e1 zCAGMSdky&oWaKXp_!k6H1VjR}1abuC2^0b1zF+9Xlu5p+!AxFLN@Xdf!1J%RS z$2LRHeCQ3YPj2<@EBEf(?47XCdS=u2EGf?6@AkdZ_w9iXyrUlmqZV3E|1@;WLhJZu zNaJg~92_rAJi8e@ZXtGZ(|4-LU~D9Bzk2)h`~R{uG`;CNSLusw^&Kwv9j^G=E526P 
zdlg?t#TTylA}rCj-Rkc5lw93Vuzmd{Z`{(z;Z5J;m0*9#)&I$gsYNi$&(XY*oYO6l zUUH+zFjF!zY#DRIX4$3a%D#tI!=upuH^|C)0;DU$ir!Bzn|u(jm9B@bTXkzbzPI5{ zOD1QyOv){;-U@}R9H*p%-G{2q*=!+IJn*>cn$c4-`6pyDVO1fUfjg4v8-r9hL4d3| zX3wiWSbD`wTDyssNljMmz?UDVlt2one3X#siU{p3!2vB#Qo%_o@G|GR{_*Fi_%s39 z@t?5M*(PV(op!psOyErdj#Z(@7FJ%c(hn1f~*Z3;;p|BqppM!f!Lf8869bu$$ z_~@U0>4#tX(aFD_{_%9_&`TvDa(!;SxN-9LPk--pd0>CJx^r4LV?FTL>MU*>+4 zED5oV(OZXZ+i%a8_di?Sd%Qe&;*M~k^32Ki=So7~`Wv@|Td$QzkCHb@ICa<2T5*J{ zHr_FKJq_#7F}Q8#S|SA6=$de_+JcmUvOc{2m2&sQqwww#sb(~MTx{f?*XbC9>!WJN z`!3!NywOT$chyNL7l$%8r951BUzM<{KF$+dQ?5_0y>V^ez8@vl1_+^z*rk>?mTtXt zd+63H-}{Qmq*g)wXeE?`>(a*Lt?B0a-v)0TiIU)7Be!qk@b5qQy(g<6vXAiaGHLD8X*nF8(&6W;9Ns^d9jWYVt}6QQSembgaRh!b;VIb&>(I| zSrA3jS{u7_Z}zPgHP8Yo?3N!fMh?-zzMvGl%Wq2ANqC}b=X%lIq}@f$C8Tg~s;4gLsen&eY^BH~e$$OEUI-U%n?xWSa~Ayo z0VX3Tlz*GR?-2MFf!`%?6F?z4nl`DK=}ufWUrx#5N=JR?$My1$=_4lq^uz$y)x(zG zbllyzc&`9OZMh)j6sBK)|U#Btx|A;^3Z2-ix1j27;uVwED-8-ft z(K|x4Vuh_8H+ygN-Vyq$g45B18;Q49udR{{>cLR~Vy|6$jpR@djtU%DR0mnqgX1FH zb#v^-7|Ej^92Z^jD&$ZPdguwh-FvO~K@d4@rxAyByNkR?^laPU==oaR{-S6aNx-US zI|jKHmeY2K)7&AHVmoAwo8`rpw-A#kxkV@Y=MtTgXP!=mhDUVHIUQG6ct0O z+Q}j`{E}Z{kuG{ZVAcc?aT5ZrhS!Q*8~(Ih7!*6iNG*B=@i=ZbNB_iOu^o{X9a8K3 za0462ik(f$yPA|oo0NAWil?VZoyKGDBWwE}SsUKDb~YmRTl$Qs5UCxJEQoJObRfP* z(4txro#G(ACk43SK|LZ_645o>DJHsMR_~Ud0Lm#fnoi}RiBr5pG7BdhHc(P?tg3x^ z3Cek~(4;ybrWcc!3vyP!jBrt2PURQsE|Y1qlVfDtgH!!myfyOT8eBc!gyU+JGfe&~ zcFZDOCE0TJE4cZzk@HCJ$xbq0)t)pRUv;>KEYZKE7jWr7V9%<3-s+aqOGW#_6o>JV z5mO6{Hoe&L0uA%KRr^ca5LXNt)x&W7BF5HkcS1Vx4Jr#9R}@L6pgk?bp|1|)P;pYiKppaL!WdSGbMMXQA z8FhCOksb*3NhWIt@>R`<__z3OrQJM?DZT|^g^YDSCSLh9)c+hI2f@g-%h$fLuG|R@ ztqE0|%iaHx@8sVOPW+pRt-+_tgHNqZZTbd2?2Ui>+*a?Aa_^C~sc)RcN#}3f@(q-I z0~@=_zM&6%r)~*bqo>NFr*JRi_HVi#BMmFE=KcADAg+k+b9Q(C-N7M7tJ2%IHuall zt4@^tvqDa@-#U1Fl>6~$=LwtrFL)ksnpuphn*vW*)QS8o&@cleI^}05?ZYM9OeU#h z3kX!gH5{(w;QC2k&g8|yW%)%y31$lyGe!8b3$sP?U2DD)az-I|gd%Q}T8L^AMlM2T zC{Rd~kzRalJfw{=85uyrx>xilB`u1@tIf#|fWugo{~e`>QTY!D{Cfh#lzf}OMFKwr zP^c+Qj54TA-5wNLqS$6t+r^vUaC(I`qI>pHX{T~ 
z3+bZqO`97Et$e!fd$jKVAwFozfc!^P`^Ntgb(dgA*f!tJ~Ud zH*QvO!3qWF(U&Z7TWSV0JVu*S1qt^~bom4=N%!VaXcmh#V?Hp$tr?reT?lThSg>4- zHJ4do7`1Du3q%P28RpC~2ul=xl|QwlU{{}~Xgu6;vD-4Dwi7XvvqB!I^{f!5Dbvpy z%M`~wCi+o2c5)hDGz>>$FX-X#vBJ!}lrC!DijtDXLB%>P)%7A)SgaA#1kX|G3m66O z#wcI}FcsQ}k&7f&n60B?lF+Vp;qGAyk zABFog?F3z;Z2NQ<&sYP~;KgLsyafGJ>p+Pu%F;f4{xKX(^^RroxQ??-#~0^tcX~ly zlE&B~s4XtGKM~WP%^KrI(|{j3U`l2rjI9(~#5!TW1elX@CX-kUj6FSmb1ViKQ@}Y# zKNzt5GJw_>TUxaAtc#TWjKHLZE?mkKmK037q`7*P@tCoCF|QdM0ktb#^zt(G2P*;X zojU(S?DWgNgxZi5qYhCA-6{W6Em1(APW9*1OfE+HKy# zYO9>B7f61rmz7$C5L@TXF`u`r@?K+=BO)LT4TZMhBM&{RL8tG>|Jb-6`!IxQ2`a>8 zDRXfS;xaW0-6aX{Dhbq;y*S0R(~&XyNaRCdmJ&jf#Q*c&+ z$xJ+^>S+F1(ps2Dwsw;*f1Ap#5+I#LCh5rRJ(+Ap`S%E{5n#q4>6!AM6Zj7V&Jp+# zfa=#(rL=@N$0VH6q{g!8dR7xYqFgo6}{(eP+x??h#HywX2h86B%cN4BEl<>+|jGoQWJ z>4$5RJYRT&FnmT%N|8xfbyLa%cjx-ljrppVaz3uJXZ`Svg^j6l$7t10MFB3_yDq-t zsRk(*Lay3MsWz^AU?cKQZ#7Ig%^gLl4sOq5x2C=~U!|sT1?~ylk0M2msYumEW&Mpx{8m6KM*EjgxGw+{|t)?#cA)s+atY7+E`qD&~5pBB1$)w@@@h@28f3mC6_=|bc{5LC9>pTWHF8V z6J%7wb>CkRH3F~E;C~+g4tm_aJ;n`=+>3YUG6Qd2W>89#8I&TKQT0-)L0(W%OfKo9^?V{8maH~U<8phR4 zbvLC(xX5UA52Z%AHoDH~t5SE+G0364Na^P~mF<7D9O@tX?rZPB1xq5p#dY-DeC@_- zRb39jSqoop_5YU~x@Q9u+B#t~$INuqNa^uUc}3rK>H0^AATgDUwNbJFPFVTC1HrM5|63&)J$5YCbb*w7E`bF0`Ggp-HEc;Ql%vSf2N8nj4 zj`5{pCaa8VS}#k>Of7qbS|t9}E=FlFn<`=h=9F=SNK5i;3enf(W>I6T`qF`M)0h}H zOahIzOb$#bI+~L+pX`S0wvW8U~Tx;P!%+XZx7=KyZkU)1^*Y| C0A$ku literal 0 HcmV?d00001 diff --git a/attention/backends/__pycache__/registry.cpython-312.pyc b/attention/backends/__pycache__/registry.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a9af6f17dc425e40d74dec0324609a065db6d594 GIT binary patch literal 9083 zcmb_iZ)_V!cAq7e6iJDqME$k=Co9X5OG1Z;w)w^(A~f?^XQSKq4Rz=Fz-BTcgxw z>k@SWaS`TY!KctVK8bo4IZLSTHlcndT;F4rR;Ewg)>oo|QwD&t7AW^t(pY)c@K<$D zMVmUDA&Dl|%)+cC?b-o_td#~&!P`%0E3Ic?8e%QfRjeU|HZV79n-o}^MVa=}#@n7m zJMS+H{WY<+(kOQD(q<@aVLPBdXLKF2u1H(4qt7r{&d8`aX+lojU^yyjY%-%8+U&1z zV6li``ub%>ktY-uahbvRwabIa{&R!<7lvbJO)(|w29VTdav3A3sFRaSgU>dN>8di# 
zlIq0uU4%OX@Y@3 zG>K^%5NDaL%acrxnj#KA<#NUeZ&(L{!(B5W`H^rB*2#dJ_1qEX$e(!#nG+nn86#c9 zu-BT9eMB1(h=FbJhYEA#y3hG4zn|JYUau{eIk~5H=9Y`Pt_Pjc@*9Li=3E2Bp>sxZ z%WZ@lDk{zipAhPqbALiU6&(ud{m?Z-Zo03L86i?rJaU#9CGD7)l~5GYOvad!rkIjX z7nIG?D!xK9DdXFqL_JmI{ch3KBc<#O#fO5^{1*JyjzMyP2(a>VU}BARo52 z3&maIxf?lkCMT7~udA7y8Gu#`jMGLuraPtRT&8A?&0y98d4?t( z`I!D>lJBi#QpdI>c;>Z6$Z5@x6kq+V8vMT!g?a!LpzkhtGsr!Mk+|;vJ z*Ynl0P}{GG7b;pi7I&WdS?V9vzf+f6V?S>lUuqp+ZvBIWvn#the{${PYfC86^l`u>h38|+BKB#aepkbHHUd?ChG#ek>%zWzT1UN10NgDQ)1gPBB8Pgg4Z^B1w5`OAS1gJkzO9P2ITAK*cx)C`-!)#gHr$pO=lP9%-UrKphaIWbkWDQm4V5XDDSk-jHlaS^=CMn6ZMM$xYfW zk77J%(C+R4IJ!Jb8Q6&33DYAz&@4N_yTWnXnX4P$!u{qp+?txoT3wGqZVT&nT46UU zxQF0#3;%YT%+MHf_BUf&SjV>Mx?+PBa+!37(Td%Qn`hiA-O}iG%9F|$tXBFjk8fr) z8#B3-QlQMTe&|t6uf~jmmaC4a|A~9GGMc{b91GCr+0PsR#OozI?UWMYAm&@HIhWr3 za)b@&*Hs;RLyh}W&r=(weh5HRZ<%vdL*i7V9;aXG#j_js4hNH(;=iA@-ACCy@nZ=0 zPPVa^9513*_z>Fxq5l}=qSDI_1q;y`0`yj9B|Kd5Wvp<45?~SVNtWN_ZO9p&ZEB*7 z^-f5`Y02KC(f;wV_{b%6ui(Wf2l^`b(+O7TDH$!fx1!JHl!|Xj*@BLwC65XXOfYaK z=xJG08rREJC1>k&T3Jn%MBJvRGbOFziP97iALyt~AQXx(Vk@Qf-fHKKX%03bFu9j#4@bXesCKs)id{*+d?IMC!HeT_TPOUe6I>|8 zJ>RGly#77p#PbViMKfvG|G;>3>*1|G|ITvg_@eLl))hqcqGWeTv`d zpJJ&SQYLL7SVuZjvJN|m~`q#e9k@$HX~4h~u~ zXsa0Ucx-rRcyPd~@B!Yia$d`Cnc{@1D%xHY74B~B_b_<_lar7V|X`~VuIBA z-$Qbnyl~Zce5+v+9uelxuY`Yp{=!P=()_uVh70qyz(Txl7HGHiQQ z6#R#fbk{0)g<%0$gQ5AM7hJ5}JK%X^wcYF4d*4_i@LF$hdAh7= z;bb7l>d!hcPfM+9Y%5|0xY6S7j;r4W{8jQU=b#k~zEYvQ5vi-3Pm4|v7I2R6-MTr! 
zK)c4DI|O6aRJ;w=jAI~=yc-POC=V(_U==o{V{*U{Io1tqy~45prFl%iHf(*`WCdce zNl$R(vw6Rur-o=P2DLDlYenstRx)bkX`hfrc*qQn^1M(&-yTK$gKo!;0z(i20^uC~ z$a<*}^gooPR0v$C5ZRo`K-dDHj$AO*!x$Naa)3K&P0e!S!kf3JvIW8oIPCz4%R&4A z;_qPb%fm$Y6;pGpM^eF1X)}P{KwLTXu$s3#?BGoqWmX}eE@{LHa{y&dVmBd(kBH5d zTWJ>WIKnC6uWG`K@{P=QmKChP$y)-uG}{j#PtEcpBJk!m0@G-;!Y_3Jc*WZUSQc4I zaW=Hc8a)i>LSy!K9h|k*ziD7+tWd$W!y3qF5Vo-u#fn6gB(z09r5Y4MRcToR>e)cJ zH#_S9Sx02{$o7`#tSM+6uw|_MSVDAA0^5c(!Efy%BzVIBC=YU*A2_Gr?Llb=mg^y= z?=Igt+!1hR&LPTh#k50^kGL!R3i?0rI(#VOkhj9;RxsD}m-N2awOKsnD0hkI6XZwt z1_9lb&V7&eJlwM=9r;Z7tn24T&nz81v-n#7;+}!;1@0bS^6gv+g%<+cd~JXPC<#7$ zFJy4;V??WmtTqM-*f`;4B|7n)QsnFel}{J!Kei61g!i(2naJwTyl$T$LfwhHb@2m&TMRiGA z$N1gXneyFN#T6H2c~E{U`Z^UwIcHe7o-;cH^x+VTf3%IqnFdb(q<1Q;{N;Cfb5lSvMOzW&r1l~h8* z8pCh`MWbJJV@j6+yMgU2DP?pN4dwV}nflS{QHpVfswirtB=x?GL#2C@ zcIa_@IdpWq|!vrU- zO+r#RTkB$|ed`1v^aOWie}m)_B=h8%uO5LjOeuH@X5(o-*R3dbHn<`3xq&{O0?tW_&fwH&w^840PE_RV^2NW>i8`@Q$(&CGl6H@{C! z6#$oK{Ng^)0sdCTcx6&B3=kXu0~;7&nn!%i)`-Hor~8I&B-rpWzG<5TG~g!BcF8E) z&asT0=b8pLfSLQivR!Qy>88iq1(xFm%k#pT!3z6^ZS7dK;-fE?q~w8g!(eXJX|{O4 z<~q{#qPHroM&dkkq|@}AC_=+p2^sgk4}&##U72p+O6rB{>s-A34_7n2D^XZyvWC}D z9RLTLU~Sqx;5s9ZvHP-+tm7$Mq;#v}cxgm; z0~w#9)Cq!6I;v$6r7vNqs{D7kF8y}B?Y1vSN33&s;bG^>w~MVSyzaTH_4bb32!r~z z=lOMK-0*tZ@Ti`2{9=1YA$at@ey+DrZXj zw?%oJ6&La8+@sB8La`~h?1-S5as+n)VBbMaX80?L1K5R4)F;;yBFEDMIp!zq667Lu z)5tE_G*9LM?%7l|oLJ~h_zFH str: + raise NotImplementedError + + @staticmethod + @abstractmethod + def get_impl_cls() -> type["AttentionImpl"]: + raise NotImplementedError + + @staticmethod + @abstractmethod + def get_builder_cls(): # -> Type["AttentionMetadataBuilder"]: + raise NotImplementedError + + @staticmethod + @abstractmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + cache_dtype_str: str = "auto", + ) -> tuple[int, ...]: + raise NotImplementedError + + @staticmethod + def get_kv_cache_stride_order() -> tuple[int, ...]: + raise NotImplementedError + + @classmethod + def 
full_cls_name(cls) -> tuple[str, str]: + return (cls.__module__, cls.__qualname__) + + @classmethod + def get_supported_head_sizes(cls) -> list[int]: + return [] + + @classmethod + def supports_head_size(cls, head_size: int) -> bool: + supported_head_sizes = cls.get_supported_head_sizes() + return (not supported_head_sizes) or head_size in supported_head_sizes + + @classmethod + def supports_dtype(cls, dtype: torch.dtype) -> bool: + return dtype in cls.supported_dtypes + + @classmethod + def supports_kv_cache_dtype(cls, kv_cache_dtype: "CacheDType | None") -> bool: + if kv_cache_dtype is None: + return True + return (not cls.supported_kv_cache_dtypes) or ( + kv_cache_dtype in cls.supported_kv_cache_dtypes + ) + + @classmethod + def supports_block_size(cls, block_size: int | None) -> bool: + from vllm.config.cache import BlockSize + + if block_size is None: + return True + + valid_sizes = get_args(BlockSize) + if block_size not in valid_sizes: + return False + + if not cls.supported_kernel_block_sizes: + return True + + for supported_size in cls.supported_kernel_block_sizes: + is_multiple_of = ( + isinstance(supported_size, MultipleOf) + and block_size % supported_size.base == 0 + ) + is_int_equal = ( + isinstance(supported_size, int) and block_size == supported_size + ) + if is_multiple_of or is_int_equal: + return True + return False + + @classmethod + def is_mla(cls) -> bool: + return False + + @classmethod + def supports_sink(cls) -> bool: + return False + + @classmethod + def is_sparse(cls) -> bool: + return False + + @classmethod + def supports_attn_type(cls, attn_type: str) -> bool: + """Check if backend supports a given attention type. + + By default, only supports decoder attention. + Backends should override this to support other attention types. 
+ """ + from vllm.attention import AttentionType + + return attn_type == AttentionType.DECODER + + @classmethod + def supports_compute_capability(cls, capability: "DeviceCapability") -> bool: + return True + + @classmethod + def supports_combination( + cls, + head_size: int, + dtype: torch.dtype, + kv_cache_dtype: "CacheDType | None", + block_size: int | None, + use_mla: bool, + has_sink: bool, + use_sparse: bool, + device_capability: "DeviceCapability", + ) -> str | None: + return None + + @classmethod + def validate_configuration( + cls, + head_size: int, + dtype: torch.dtype, + kv_cache_dtype: "CacheDType | None", + block_size: int | None, + use_mla: bool, + has_sink: bool, + use_sparse: bool, + device_capability: "DeviceCapability", + attn_type: str, + ) -> list[str]: + invalid_reasons = [] + if not cls.supports_head_size(head_size): + invalid_reasons.append("head_size not supported") + if not cls.supports_dtype(dtype): + invalid_reasons.append("dtype not supported") + if not cls.supports_kv_cache_dtype(kv_cache_dtype): + invalid_reasons.append("kv_cache_dtype not supported") + if not cls.supports_block_size(block_size): + invalid_reasons.append("block_size not supported") + if use_mla != cls.is_mla(): + if use_mla: + invalid_reasons.append("MLA not supported") + else: + invalid_reasons.append("non-MLA not supported") + if has_sink and not cls.supports_sink(): + invalid_reasons.append("sink setting not supported") + if use_sparse != cls.is_sparse(): + if use_sparse: + invalid_reasons.append("sparse not supported") + else: + invalid_reasons.append("non-sparse not supported") + if not cls.supports_compute_capability(device_capability): + invalid_reasons.append("compute capability not supported") + if not cls.supports_attn_type(attn_type): + invalid_reasons.append(f"attention type {attn_type} not supported") + combination_reason = cls.supports_combination( + head_size, + dtype, + kv_cache_dtype, + block_size, + use_mla, + has_sink, + use_sparse, + 
device_capability, + ) + if combination_reason is not None: + invalid_reasons.append(combination_reason) + return invalid_reasons + + @classmethod + def get_required_kv_cache_layout(cls) -> "KVCacheLayoutType | None": + return None + + +class AttentionMetadata: + pass + + +T = TypeVar("T", bound=AttentionMetadata) + + +class AttentionLayer(Protocol): + _q_scale: torch.Tensor + _k_scale: torch.Tensor + _v_scale: torch.Tensor + _q_scale_float: float + _k_scale_float: float + _v_scale_float: float + _prob_scale: torch.Tensor + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: ... + + +class AttentionImpl(ABC, Generic[T]): + # Whether the attention impl can return the softmax lse for decode. + # Some features like decode context parallelism require the softmax lse. + can_return_lse_for_decode: bool = False + + # some attention backends might not always want to return lse + # even if they can return lse (for efficiency reasons) + need_to_return_lse_for_decode: bool = False + + dcp_world_size: int + dcp_rank: int + + def __new__(cls, *args, **kwargs): + # use __new__ so that all subclasses will call this + self = super().__new__(cls) + try: + from vllm.distributed.parallel_state import get_dcp_group + + self.dcp_world_size = get_dcp_group().world_size + self.dcp_rank = get_dcp_group().rank_in_group + except AssertionError: + # DCP might not be initialized in testing + self.dcp_world_size = 1 + self.dcp_rank = 0 + self.need_to_return_lse_for_decode = ( + self.dcp_world_size > 1 and self.can_return_lse_for_decode + ) + return self + + @abstractmethod + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int | None = None, + alibi_slopes: list[float] | None = None, + sliding_window: int | None = None, + kv_cache_dtype: str = "auto", + logits_soft_cap: float | None = None, + attn_type: str = AttentionType.DECODER, + 
kv_sharing_target_layer_name: str | None = None, + ) -> None: + raise NotImplementedError + + @abstractmethod + def forward( + self, + layer: AttentionLayer, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: T, + output: torch.Tensor | None = None, + output_scale: torch.Tensor | None = None, + output_block_scale: torch.Tensor | None = None, + ) -> torch.Tensor: + raise NotImplementedError + + def fused_output_quant_supported(self, quant_key: QuantKey): + """ + Does this attention implementation support fused output quantization. + This is used by the AttnFusionPass to only fuse output quantization + onto implementations that support it. + + :param quant_key: QuantKey object that describes the quantization op + :return: is fusion supported for this type of quantization + """ + return False + + def supports_quant_query_input(self) -> bool: + """ + Check if this attention implementation supports pre-quantized query input. + + When True, the attention layer will quantize queries before passing them + to this backend, allowing torch.compile to fuse the quantization with + previous operations. This is typically supported when using FP8 KV cache + with compatible attention kernels (e.g., TRT-LLM). + TODO add support to more backends: + https://github.com/vllm-project/vllm/issues/25584 + + Returns: + bool: True if the implementation can accept pre-quantized queries. 
+ """ + return False + + def process_weights_after_loading(self, act_dtype: torch.dtype): + pass + + +class MLAAttentionImpl(AttentionImpl[T], Generic[T]): + @abstractmethod + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: list[float] | None, + sliding_window: int | None, + kv_cache_dtype: str, + logits_soft_cap: float | None, + attn_type: str, + kv_sharing_target_layer_name: str | None, + # MLA Specific Arguments + q_lora_rank: int | None, + kv_lora_rank: int, + qk_nope_head_dim: int, + qk_rope_head_dim: int, + qk_head_dim: int, + v_head_dim: int, + kv_b_proj: ColumnParallelLinear, + indexer: object | None = None, + ) -> None: + raise NotImplementedError + + @abstractmethod + def forward( + self, + layer: AttentionLayer, + hidden_states_or_cq: torch.Tensor, + kv_c_normed: torch.Tensor, + k_pe: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: T, + output: torch.Tensor | None = None, + output_scale: torch.Tensor | None = None, + output_block_scale: torch.Tensor | None = None, + ) -> torch.Tensor: + raise NotImplementedError + + +def is_quantized_kv_cache(kv_cache_dtype: str) -> bool: + return kv_cache_dtype != "auto" diff --git a/attention/backends/registry.py b/attention/backends/registry.py new file mode 100644 index 0000000..f07a605 --- /dev/null +++ b/attention/backends/registry.py @@ -0,0 +1,195 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Attention backend registry""" + +import enum +from collections.abc import Callable +from typing import TYPE_CHECKING, cast + +from vllm.logger import init_logger +from vllm.utils.import_utils import resolve_obj_by_qualname + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend + +logger = init_logger(__name__) + + +class _AttentionBackendEnumMeta(enum.EnumMeta): + """Metaclass for AttentionBackendEnum to provide better error messages.""" + + def 
__getitem__(cls, name: str): + """Get backend by name with helpful error messages.""" + try: + return super().__getitem__(name) + except KeyError: + members = cast("dict[str, AttentionBackendEnum]", cls.__members__).values() + valid_backends = ", ".join(m.name for m in members) + raise ValueError( + f"Unknown attention backend: '{name}'. " + f"Valid options are: {valid_backends}" + ) from None + + +class AttentionBackendEnum(enum.Enum, metaclass=_AttentionBackendEnumMeta): + """Enumeration of all supported attention backends. + + The enum value is the default class path, but this can be overridden + at runtime using register_backend(). + + To get the actual backend class (respecting overrides), use: + backend.get_class() + """ + + FLASH_ATTN = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" + TRITON_ATTN = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend" + XFORMERS = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend" + ROCM_ATTN = "vllm.v1.attention.backends.rocm_attn.RocmAttentionBackend" + ROCM_AITER_MLA = "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend" + ROCM_AITER_FA = ( + "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend" + ) + TORCH_SDPA = "" # this tag is only used for ViT + FLASHINFER = "vllm.v1.attention.backends.flashinfer.FlashInferBackend" + FLASHINFER_MLA = ( + "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend" + ) + TRITON_MLA = "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend" + CUTLASS_MLA = "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend" + FLASHMLA = "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend" + FLASHMLA_SPARSE = ( + "vllm.v1.attention.backends.mla.flashmla_sparse.FlashMLASparseBackend" + ) + FLASH_ATTN_MLA = "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend" + PALLAS = "vllm.v1.attention.backends.pallas.PallasAttentionBackend" + IPEX = "vllm.v1.attention.backends.ipex.IpexAttentionBackend" + 
NO_ATTENTION = "vllm.v1.attention.backends.no_attention.NoAttentionBackend" + FLEX_ATTENTION = "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend" + TREE_ATTN = "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend" + ROCM_AITER_UNIFIED_ATTN = ( + "vllm.v1.attention.backends.rocm_aiter_unified_attn." + "RocmAiterUnifiedAttentionBackend" + ) + CPU_ATTN = "vllm.v1.attention.backends.cpu_attn.CPUAttentionBackend" + # Placeholder for third-party/custom backends - must be registered before use + CUSTOM = "" + + def get_path(self, include_classname: bool = True) -> str: + """Get the class path for this backend (respects overrides). + + Returns: + The fully qualified class path string + + Raises: + ValueError: If Backend.CUSTOM is used without being registered + """ + path = _OVERRIDES.get(self, self.value) + if not path: + raise ValueError( + f"Backend {self.name} must be registered before use. " + f"Use register_backend(Backend.{self.name}, 'your.module.YourClass')" + ) + if not include_classname: + path = path.rsplit(".", 1)[0] + return path + + def get_class(self) -> "type[AttentionBackend]": + """Get the backend class (respects overrides). + + Returns: + The backend class + + Raises: + ImportError: If the backend class cannot be imported + ValueError: If Backend.CUSTOM is used without being registered + """ + return resolve_obj_by_qualname(self.get_path()) + + def is_overridden(self) -> bool: + """Check if this backend has been overridden. + + Returns: + True if the backend has a registered override + """ + return self in _OVERRIDES + + def clear_override(self) -> None: + """Clear any override for this backend, reverting to the default.""" + _OVERRIDES.pop(self, None) + + +_OVERRIDES: dict[AttentionBackendEnum, str] = {} + + +def register_backend( + backend: AttentionBackendEnum, class_path: str | None = None +) -> Callable[[type], type]: + """Register or override a backend implementation. 
+ + Args: + backend: The AttentionBackendEnum member to register + class_path: Optional class path. If not provided and used as + decorator, will be auto-generated from the class. + + Returns: + Decorator function if class_path is None, otherwise a no-op + + Examples: + # Override an existing backend + @register_backend(AttentionBackendEnum.FLASH_ATTN) + class MyCustomFlashAttn: + ... + + # Register a custom third-party backend + @register_backend(AttentionBackendEnum.CUSTOM) + class MyCustomBackend: + ... + + # Direct registration + register_backend( + AttentionBackendEnum.CUSTOM, + "my.module.MyCustomBackend" + ) + """ + + def decorator(cls: type) -> type: + _OVERRIDES[backend] = f"{cls.__module__}.{cls.__qualname__}" + return cls + + if class_path is not None: + _OVERRIDES[backend] = class_path + return lambda x: x + + return decorator + + +# Backwards compatibility alias for plugins +class _BackendMeta(type): + """Metaclass to provide deprecation warnings when accessing _Backend.""" + + def __getattribute__(cls, name: str): + if name not in ("__class__", "__mro__", "__name__"): + logger.warning( + "_Backend has been renamed to AttentionBackendEnum. " + "Please update your code to use AttentionBackendEnum instead. " + "_Backend will be removed in a future release." + ) + return getattr(AttentionBackendEnum, name) + + def __getitem__(cls, name: str): + logger.warning( + "_Backend has been renamed to AttentionBackendEnum. " + "Please update your code to use AttentionBackendEnum instead. " + "_Backend will be removed in a future release." + ) + return AttentionBackendEnum[name] + + +class _Backend(metaclass=_BackendMeta): + """Deprecated: Use AttentionBackendEnum instead. + + This class is provided for backwards compatibility with plugins + and will be removed in a future release. 
def get_mla_dims(model_config: ModelConfig) -> MLADims:
    """Build an ``MLADims`` from the model's ``hf_text_config``.

    ``q_lora_rank`` is optional and becomes ``None`` when the config has no
    such attribute. The other four fields are read directly, so a config
    missing any of them raises ``AttributeError``.
    """
    text_cfg = model_config.hf_text_config
    dims = {"q_lora_rank": getattr(text_cfg, "q_lora_rank", None)}
    for field_name in (
        "kv_lora_rank",
        "qk_nope_head_dim",
        "qk_rope_head_dim",
        "v_head_dim",
    ):
        dims[field_name] = getattr(text_cfg, field_name)
    return MLADims(**dims)
def check_xformers_availability():
    """Report whether ``xformers.ops`` can be used, caching the answer.

    The result is stored in the module-level ``USE_XFORMERS_OPS`` so that the
    probe, and the warning on failure, run at most once per process.

    Returns:
        ``True`` when the platform is not excluded and ``xformers.ops`` can
        be located, ``False`` otherwise.
    """
    global USE_XFORMERS_OPS
    if USE_XFORMERS_OPS is not None:
        return USE_XFORMERS_OPS

    if current_platform.is_cuda() and current_platform.has_device_capability(100):
        # Xformers FA is not compatible with B200
        USE_XFORMERS_OPS = False
    else:
        from importlib.util import find_spec

        try:
            # find_spec() returns None instead of raising when the parent
            # package exists but the submodule does not, so the result has
            # to be inspected rather than discarded.
            USE_XFORMERS_OPS = find_spec("xformers.ops") is not None
        except ImportError:
            # Resolving a dotted name imports the parent package first; a
            # missing ``xformers`` shows up here as ModuleNotFoundError.
            USE_XFORMERS_OPS = False

    # the warning only needs to be shown once
    if not USE_XFORMERS_OPS:
        logger.warning("Xformers is not available, falling back.")

    return USE_XFORMERS_OPS
def maybe_get_vit_flash_attn_backend(
    attn_backend: AttentionBackendEnum,
    use_upstream_fa: bool,
    attn_backend_override: AttentionBackendEnum | None = None,
) -> tuple[AttentionBackendEnum, Callable | None]:
    """Choose the ViT attention backend and its varlen flash-attention kernel.

    Platform rules, as implemented below:
      * ROCm: AITER FA when both AITER env flags are set on gfx9; otherwise
        upstream flash-attn when it is available on gfx9 and no override was
        given; otherwise TORCH_SDPA with no kernel.
      * CUDA: switch to upstream flash-attn when the requested backend is
        not FLASH_ATTN and upstream flash-attn is available.
      * XPU: the requested backend must already be FLASH_ATTN.
      * Any other platform: TORCH_SDPA with no kernel.

    Returns:
        ``(backend, flash_attn_varlen_func)``; the callable is ``None``
        unless the backend is FLASH_ATTN or ROCM_AITER_FA.
    """
    flash = AttentionBackendEnum.FLASH_ATTN
    aiter_fa = AttentionBackendEnum.ROCM_AITER_FA
    sdpa_only = (AttentionBackendEnum.TORCH_SDPA, None)

    if current_platform.is_rocm():
        if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA and on_gfx9():
            attn_backend = aiter_fa
        elif (
            check_upstream_fa_availability(torch.get_default_dtype())
            and on_gfx9()
            and attn_backend_override is None
        ):
            attn_backend, use_upstream_fa = flash, True
        else:
            return sdpa_only
    elif current_platform.is_cuda():
        if attn_backend != flash and check_upstream_fa_availability(
            torch.get_default_dtype()
        ):
            attn_backend, use_upstream_fa = flash, True
    elif current_platform.is_xpu():
        assert attn_backend == flash, (
            "XPU platform only supports FLASH_ATTN as vision attention backend."
        )
        use_upstream_fa = False
    else:
        return sdpa_only

    # Resolve the kernel lazily so only the selected package is imported.
    if attn_backend == aiter_fa:
        from aiter import flash_attn_varlen_func
    elif attn_backend != flash:
        return attn_backend, None
    elif use_upstream_fa:
        from flash_attn import flash_attn_varlen_func
    else:
        from vllm.attention.utils.fa_utils import flash_attn_varlen_func

    return attn_backend, flash_attn_varlen_func
+ layer.kv_cache_dtype = kv_cache_dtype + layer.calculate_kv_scales = calculate_kv_scales + layer._k_scale = torch.tensor(1.0, dtype=torch.float32) + layer._v_scale = torch.tensor(1.0, dtype=torch.float32) + layer._q_scale = torch.tensor(1.0, dtype=torch.float32) + layer._prob_scale = torch.tensor(1.0, dtype=torch.float32) + + # We also keep q/k/v_scale on host (cpu) memory for attention + # backends that require the scales to be on host instead of on device. + # e.g. Flashinfer + layer._q_scale_float = 1.0 + layer._k_scale_float = 1.0 + layer._v_scale_float = 1.0 + + # The output scale on host memory. This should be the input scale of + # the quant op after this attention layer. + layer._o_scale_float = None + + quant_method = ( + quant_config.get_quant_method(layer, prefix=prefix) if quant_config else None + ) + if quant_method is not None and not isinstance( + quant_method, UnquantizedLinearMethod + ): + assert isinstance(quant_method, BaseKVCacheMethod) + # TODO (mgoin): kv cache dtype should be specified in the FP8 + # checkpoint config and become the "auto" behavior + if kv_cache_dtype == "fp8_e5m2": + raise ValueError("fp8_e5m2 kv-cache is not supported with fp8 checkpoints.") + # If quantization is enabled, we make "k_scale" and "v_scale" + # parameters so that it can be loaded from the model checkpoint. + # The k/v_scale will then be converted back to native float32 + # values after weight loading. + layer.quant_method = quant_method + layer.quant_method.create_weights(layer) + + +class Attention(nn.Module, AttentionLayerBase): + """Attention layer. + + This class takes query, key, and value tensors as input. The input tensors + can either contain prompt tokens or generation tokens. + The class does the following: + + 1. Store the input key and value tensors in the KV cache. + 2. Perform (multi-head/multi-query/grouped-query) attention. + 3. Return the output tensor. 
+ """ + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int | None = None, + alibi_slopes: list[float] | None = None, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + logits_soft_cap: float | None = None, + per_layer_sliding_window: int | None = None, + prefix: str = "", + attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: str | None = None, + attn_backend: type[AttentionBackend] | None = None, + **extra_impl_args, + ) -> None: + """ + The KV cache is stored inside this class and is accessed via + `self.kv_cache`. + """ + super().__init__() + if per_layer_sliding_window is not None: + # per-layer sliding window + sliding_window = per_layer_sliding_window + elif cache_config is not None: + # model-level sliding window + sliding_window = cache_config.sliding_window + else: + sliding_window = None + + vllm_config = get_current_vllm_config() + if cache_config is not None: + kv_cache_dtype = cache_config.cache_dtype + block_size = cache_config.block_size + calculate_kv_scales = cache_config.calculate_kv_scales + else: + kv_cache_dtype = "auto" + block_size = 16 + calculate_kv_scales = False + self.kv_cache_torch_dtype = kv_cache_dtype_str_to_dtype( + kv_cache_dtype, vllm_config.model_config + ) + if num_kv_heads is None: + num_kv_heads = num_heads + assert num_heads % num_kv_heads == 0, ( + f"num_heads ({num_heads}) is not divisible by num_kv_heads ({num_kv_heads})" + ) + + # Initialize KV cache quantization attributes + _init_kv_cache_quant( + self, quant_config, prefix, kv_cache_dtype, calculate_kv_scales + ) + + self.num_heads = num_heads + self.head_size = head_size + self.num_kv_heads = num_kv_heads + self.sliding_window = sliding_window + self.has_sink = extra_impl_args.get("sinks") is not None + + # During model initialization, the default dtype is set as the model + # weight and activation dtype. 
+ dtype = torch.get_default_dtype() + if attn_backend is None: + self.attn_backend = get_attn_backend( + head_size, + dtype, + kv_cache_dtype, + block_size, + use_mla=False, + has_sink=self.has_sink, + attn_type=attn_type, + ) + else: + self.attn_backend = attn_backend + + impl_cls = self.attn_backend.get_impl_cls() + self.impl = impl_cls( + num_heads, + head_size, + scale, + num_kv_heads, + alibi_slopes, + sliding_window, + kv_cache_dtype, + logits_soft_cap, + attn_type, + kv_sharing_target_layer_name, + **extra_impl_args, + ) + self.backend = AttentionBackendEnum[self.attn_backend.get_name()] + self.dtype = dtype + + # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how + # torch.compile works by registering the attention as one giant + # opaque custom op. For other platforms, we directly call them + # and let torch.compile handle them. + if _USE_TORCH_OPS: + self.use_direct_call = False + else: + self.use_direct_call = True + + self.use_output = self.attn_backend.accept_output_buffer + compilation_config = vllm_config.compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + self.layer_name = prefix + self.attn_type = attn_type + + if kv_sharing_target_layer_name is not None: + validate_kv_sharing_target( + prefix, + kv_sharing_target_layer_name, + compilation_config.static_forward_context, + ) + self.kv_sharing_target_layer_name = kv_sharing_target_layer_name + + # use a placeholder kv cache tensor during init, which will be replaced + # by bind_kv_cache + # this variable will not be accessed if use_direct_call is True + self.kv_cache = [ + torch.tensor([]) + for _ in range(vllm_config.parallel_config.pipeline_parallel_size) + ] + + # Initialize q/k/v range constants. 
+ self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32) + self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32) + self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32) + + # for attn backends supporting query quantization + self.query_quant = None + if ( + self.kv_cache_dtype.startswith("fp8") + and self.impl.supports_quant_query_input() + ): + self.query_quant = QuantFP8(static=True, group_shape=GroupShape.PER_TENSOR) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + # For some alternate attention backends like MLA the attention output + # shape does not match the query shape, so we optionally let the model + # definition specify the output tensor shape. + output_shape: torch.Size | None = None, + ) -> torch.Tensor: + """ + The KV cache is stored inside this class and is accessed via + `self.kv_cache`. + + Attention metadata (`attn_metadata`) is set using a context manager in + the model runner's `execute_model` method. It is accessed via forward + context using + `vllm.forward_context.get_forward_context().attn_metadata`. + """ + if self.calculate_kv_scales: + torch.ops.vllm.maybe_calc_kv_scales(query, key, value, self.layer_name) + output_dtype = query.dtype + if self.query_quant is not None: + # quantizing with a simple torch operation enables + # torch.compile to fuse this into previous ops + # which reduces overheads during decoding. 
+ # Otherwise queries are quantized using custom ops + # which causes decoding overheads + assert self.kv_cache_dtype in {"fp8", "fp8_e4m3"} + + # check if query quantization is supported + if self.impl.supports_quant_query_input(): + query, _ = self.query_quant(query, self._q_scale) + + if self.use_output: + output_shape = output_shape if output_shape is not None else query.shape + output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + hidden_size = output_shape[-1] + # Reshape the query, key, and value tensors. + # NOTE(woosuk): We do this outside the custom op to minimize the + # CPU overheads from the non-CUDA-graph regions. + query = query.view(-1, self.num_heads, self.head_size) + output = output.view(-1, self.num_heads, self.head_size) + if key is not None: + key = key.view(-1, self.num_kv_heads, self.head_size) + if value is not None: + value = value.view(-1, self.num_kv_heads, self.head_size) + if self.use_direct_call: + def fun(layer_name: str, output: torch.Tensor): + forward_context: ForwardContext = get_forward_context() + attn_metadata = forward_context.attn_metadata + if isinstance(attn_metadata, dict): + attn_metadata = attn_metadata[self.layer_name] + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + output = self.impl.forward( + self, query, key, value, self_kv_cache, attn_metadata, output=output + ) + return output + + if envs.VLLM_SUPPORT_IXSERVER: + return maybe_transfer_kv_layer(fun)(self.layer_name, output) + else: + return fun(self.layer_name, output) + else: + torch.ops.vllm.unified_attention_with_output( + query, key, value, output, self.layer_name + ) + return output.view(-1, self.num_heads * self.head_size) + else: + if self.use_direct_call: + def fun(layer_name: str): + forward_context = get_forward_context() + attn_metadata = forward_context.attn_metadata + if isinstance(attn_metadata, dict): + attn_metadata = attn_metadata[self.layer_name] + self_kv_cache = 
def calc_kv_scales(self, query, key, value):
    """Derive the q/k/v scales from the current inputs, once.

    Each scale tensor is overwritten in place with ``max(|x|) / range``
    using the matching ``q_range``/``k_range``/``v_range``. The Python float
    copies are then refreshed and ``calculate_kv_scales`` is cleared so the
    calculation is not repeated.
    """
    inputs = {"q": query, "k": key, "v": value}

    # First update all three scale tensors in place ...
    for tag, tensor in inputs.items():
        scale = getattr(self, f"_{tag}_scale")
        scale.copy_(tensor.abs().max() / getattr(self, f"{tag}_range"))

    # ... then mirror them into the host-side float attributes.
    for tag in inputs:
        setattr(self, f"_{tag}_scale_float", getattr(self, f"_{tag}_scale").item())

    # We only calculate the scales once
    self.calculate_kv_scales = False
+ assert self.attn_type == AttentionType.DECODER + if self.sliding_window is not None: + assert not vllm_config.model_config.use_mla, ( + "MLA is not supported for slidingwindow" + ) + return SlidingWindowSpec( + block_size=block_size, + num_kv_heads=self.num_kv_heads, + head_size=self.head_size, + dtype=self.kv_cache_torch_dtype, + sliding_window=self.sliding_window, + ) + else: + return FullAttentionSpec( + block_size=block_size, + num_kv_heads=self.num_kv_heads, + head_size=self.head_size, + dtype=self.kv_cache_torch_dtype, + ) + + +class MultiHeadAttention(nn.Module): + """Multi-headed attention without any cache, used for ViT.""" + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int | None = None, + # This has no effect, it is only here to make it easier to swap + # between Attention and MultiHeadAttention + prefix: str = "", + multimodal_config: MultiModalConfig | None = None, + ) -> None: + super().__init__() + self.num_heads = num_heads + self.head_size = head_size + self.scale = scale + self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + self.layer_name = prefix + + assert self.num_heads % self.num_kv_heads == 0, ( + f"num_heads ({self.num_heads}) is not " + f"divisible by num_kv_heads ({self.num_kv_heads})" + ) + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + + # During model initialization, the default dtype is set as the model + # weight and activation dtype. + dtype = torch.get_default_dtype() + + # Determine the attention backend + attn_backend_override = None + if multimodal_config is not None: + attn_backend_override = multimodal_config.mm_encoder_attn_backend + backend = get_vit_attn_backend( + head_size=head_size, + dtype=dtype, + attn_backend_override=attn_backend_override, + ) + + # Some auto-selected backends can be upgraded + # to upstream flash attention if available. + # If vllm native fa is selected, we use it directly. 
def forward(
    self,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
) -> torch.Tensor:
    """Run non-causal attention over a batch of equal-length sequences.

    Input shape:
    (batch_size x seq_len x hidden_size) or
    (batch_size x seq_len x num_heads x head_size)

    The inputs are flattened to ``(batch * seq_len, heads, head_size)`` and
    passed to ``ops.flash_attn_varlen_func``. The kernel always comes from
    the module-level ``ops`` import; ``self.attn_backend`` is not consulted
    in this method.

    Returns:
        Tensor viewed as ``(batch_size, q_len, -1)``.
    """
    bsz, q_len = query.size()[:2]
    kv_len = key.size(1)

    query = query.view(bsz * q_len, self.num_heads, self.head_size)
    key = key.view(bsz * kv_len, self.num_kv_heads, self.head_size)
    value = value.view(bsz * kv_len, self.num_kv_heads, self.head_size)

    # All sequences share one length, so the cumulative offsets are simply
    # [0, L, 2L, ..., bsz * L]; build the step index once and scale it
    # instead of materialising a Python list and running cumsum twice.
    batch_steps = torch.arange(bsz + 1, device=query.device, dtype=torch.int32)
    cu_q = batch_steps * q_len
    cu_kv = batch_steps * kv_len

    out = ops.flash_attn_varlen_func(
        query,
        key,
        value,
        cu_q,
        cu_kv,
        q_len,
        kv_len,
        softmax_scale=self.scale,
        causal=False,
    )

    return out.view(bsz, q_len, -1)
sliding_window=None, + kv_cache_dtype=self.kv_cache_dtype, + logits_soft_cap=None, + attn_type=AttentionType.DECODER, + kv_sharing_target_layer_name=None, + # MLA Args + q_lora_rank=self.q_lora_rank, + kv_lora_rank=self.kv_lora_rank, + qk_nope_head_dim=self.qk_nope_head_dim, + qk_rope_head_dim=self.qk_rope_head_dim, + qk_head_dim=self.qk_nope_head_dim + self.qk_rope_head_dim, + v_head_dim=self.v_head_dim, + kv_b_proj=kv_b_proj, + indexer=indexer, + **extra_impl_args, + ) + + self.use_direct_call = True + + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + + self.kv_cache = [ + torch.tensor([]) + for _ in range( + get_current_vllm_config().parallel_config.pipeline_parallel_size + ) + ] + if envs.VLLM_USE_INT8_MLA: + self.kv_cache_scale = [ + torch.tensor([]) for _ in range(get_current_vllm_config( + ).parallel_config.pipeline_parallel_size) + ] + self.is_int8_mla = envs.VLLM_USE_INT8_MLA + + self.use_sparse = use_sparse + + # Initialize q/k/v range constants. 
def forward(
    self,
    q: torch.Tensor,
    kv_c_normed: torch.Tensor,
    k_pe: torch.Tensor,
    output_shape: torch.Size | None = None,
) -> torch.Tensor:
    """Run MLA attention through the backend implementation.

    Args:
        q: Query tensor.
        kv_c_normed: Compressed KV tensor passed on to the implementation.
        k_pe: Positional key component passed on to the implementation.
        output_shape: Shape of the output buffer for backends that accept
            one; defaults to ``q.shape`` when omitted.

    Returns:
        The result of ``impl.forward`` (direct call) or of the registered
        ``unified_mla_attention*`` custom op.
    """
    if self.calculate_kv_scales:
        torch.ops.vllm.maybe_calc_kv_scales(q, kv_c_normed, k_pe, self.layer_name)

    if self.use_direct_call:
        forward_context: ForwardContext = get_forward_context()
        attn_metadata = forward_context.attn_metadata
        if isinstance(attn_metadata, dict):
            # Per-layer metadata is keyed by layer name.
            attn_metadata = attn_metadata[self.layer_name]
        self_kv_cache = self.kv_cache[forward_context.virtual_engine]

        optional_args = {}
        if self.is_int8_mla:
            optional_args["kv_cache_scale"] = self.kv_cache_scale[
                forward_context.virtual_engine
            ]

        if self.attn_backend.accept_output_buffer:
            if output_shape is None:
                output_shape = q.shape
            output = torch.zeros(output_shape, dtype=q.dtype, device=q.device)
            return self.impl.forward(
                self,
                q,
                kv_c_normed,
                k_pe,
                self_kv_cache,
                attn_metadata,
                output=output,
                **optional_args,
            )
        # Forward the int8 cache scale here as well; previously it was
        # passed only when the backend took an output buffer.
        return self.impl.forward(
            self,
            q,
            kv_c_normed,
            k_pe,
            self_kv_cache,
            attn_metadata,
            **optional_args,
        )

    if self.attn_backend.accept_output_buffer:
        # Same default as the direct-call path; torch.empty(None) would
        # raise when the caller omits output_shape.
        if output_shape is None:
            output_shape = q.shape
        output = torch.empty(output_shape, dtype=q.dtype, device=q.device)
        torch.ops.vllm.unified_mla_attention_with_output(
            q,
            kv_c_normed,
            k_pe,
            output,
            self.layer_name,
        )
        return output

    return torch.ops.vllm.unified_mla_attention(
        q,
        kv_c_normed,
        k_pe,
        self.layer_name,
    )
def maybe_calc_kv_scales(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    layer_name: str,
) -> None:
    """Compute KV scales for the named layer if it still requests them.

    The layer is looked up in the forward context's ``no_compile_layers``.
    Nothing happens when its ``calculate_kv_scales`` flag is already False;
    the layer's own ``calc_kv_scales`` clears the flag after running.
    """
    layer = get_forward_context().no_compile_layers[layer_name]
    if layer.calculate_kv_scales:
        layer.calc_kv_scales(query, key, value)
def get_attention_context(
    layer_name: str,
) -> tuple[dict | object | None, Attention | MLAAttention, torch.Tensor]:
    """Fetch metadata, layer and KV cache for one attention layer.

    Args:
        layer_name: Key of the layer in the current forward context.

    Returns:
        A ``(attn_metadata, attn_layer, kv_cache)`` tuple:
        - attn_metadata: the context's metadata, narrowed to this layer's
          entry when the context stores a dict.
        - attn_layer: the object registered under ``layer_name`` in
          ``no_compile_layers``.
        - kv_cache: that layer's cache entry for the current virtual engine.
    """
    ctx = get_forward_context()
    metadata = ctx.attn_metadata
    per_layer = metadata[layer_name] if isinstance(metadata, dict) else metadata
    layer = ctx.no_compile_layers[layer_name]
    return per_layer, layer, layer.kv_cache[ctx.virtual_engine]
@maybe_transfer_kv_layer
def unified_mla_attention(
    q: torch.Tensor,
    kv_c_normed: torch.Tensor,
    k_pe: torch.Tensor,
    layer_name: str,
) -> torch.Tensor:
    """Custom-op body: run the named layer's MLA implementation.

    The layer, its KV cache and its attention metadata are resolved from the
    forward context via ``get_attention_context``; the implementation's
    result is returned unchanged.
    """
    attn_metadata, layer, kv_cache = get_attention_context(layer_name)
    return layer.impl.forward(layer, q, kv_c_normed, k_pe, kv_cache, attn_metadata)
attn_metadata, + output=output, + output_scale=output_scale, + output_block_scale=output_block_scale, + ) + + +def unified_mla_attention_with_output_fake( + q: torch.Tensor, + kv_c_normed: torch.Tensor, + k_pe: torch.Tensor, + output: torch.Tensor, + layer_name: str, + output_scale: torch.Tensor | None = None, + output_block_scale: torch.Tensor | None = None, +) -> None: + return + + +direct_register_custom_op( + op_name="unified_mla_attention_with_output", + op_func=unified_mla_attention_with_output, + mutates_args=["output", "output_block_scale"], + fake_impl=unified_mla_attention_with_output_fake, + dispatch_key=current_platform.dispatch_key, +) diff --git a/attention/layers/__init__.py b/attention/layers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/attention/layers/__pycache__/__init__.cpython-312.pyc b/attention/layers/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed7d72c36c5c7d2904a175a0df40211536543562 GIT binary patch literal 166 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJVx$2kX7U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?)V62q}hF<=LZDgV?<*h13Hb_zUgI+$Z2l9-Jz@|;ut?GCg`xnm&+=9M zMSoQ+iUN-NEvXtP1~@NT!D^@&;=E+ZRi&tKK468bYEk8U(27)}#b`BFj8)^scr{T> zR0oO!d@f`SR+GhKHC0Si)5SE5$%bNOszb#gftY0BL{H7_-Uyq1Lp6tu2z;aF9@C$N z$#Teu-4-8qmlgN=$Q#6n-zG-F9D3;YhKnOhLT=z|6lQZi7JuDw%$ifF*Usyu8)nU5 ziT=nhO-DC$N6!f?e%*95-EnH#oVN_d!ZUhl-khn|%9ZOND%UKl+Ls1?-Oyp%WgWNf zj!JBxzxT`=R~w6qb?UI8{#ft4^NosSn3QE^>eXt!_M>@rsH)#EHLG6IEmX?WU%OeM zP6I~fbO-dZ*|6Q1D_Od2?^x5*-p?ZaZLcnxC6=9;Z`5v>#ueQ3`Kj<**Z7;_y03j; zH+WgOzl3EPHBi=Cs?@GOXCao{R;`4aYj$PXWXc@q-Q@$+bQ-i)>Rt`MAPFeSoA-fy zBz)upI11~+#{j`~AJRWbEB+=~5culnNbllTM5n)FQ*^}MqKBwFG~0KX{6;`_KEja)RDULYi<^Q~Ffu&jp%+t0WhgW$9flrZGV)^&ZAv?h2c8z})qH z1ny-Lbg7~bAS{@o;X~*eep50;Ghj$&5MUhG0ppNNjIa?jWg}!NhHQpmjam+^dD{C8 z<#wbPStGZEViYt}f4EYjrtX*;H@a#32q#?(pCIDP3 zalEt0jhk8t@vqqc3z~)n&s`&4&mep^4+6PI9H3rX=w3ok#{Me#L;z=`Xe%boUc7ek 
zot&TU1&NLz*$2eND;*sj{o&n$Q(Y`9Ru+#sIt8~pdaH5z%+GI}HVal|uCTb|%-3rL zRJG8@U%}FsOllYYi$(H_ODu&BUTPTnb*eAUYqsZ{%X2@aE%UpcF^Mk+0hz3ltvE>@ z-WZ%}4^FkzsZAIoEK%Y?KwG3{xpGUh%ogp;*fY2jAmf!hFjm@>f)L|b3 z@*P-LC5PS=I`KoT!_$v0Z~A?QqAhiFD?pOjJE!lw);e&ab@I*6j(k4&UQ2zC9)gE* z!lhh{g*2_ES4~Z0il$ZTM#Dm0)wB;F+IDBc{2k#g=IaI%oh7gfz9WqyL0Ho zIW&x%I9IP*6z_?@JBxJ9R5C3~)4;+n;ncRp5dG)6)?$1Am&ejQuoO*iF9Z4hHtEPo z*B6vu>LgOF^s#p0xa$Y$Np{SYpmzggZ_W)O7b4NoZ)M;%{X|LeXWU^jI{u9;%h!Y_ z`wzMjGVsLMv>W7Xh-61xKJ6+by8l@iM>h#lZb5nqbjbNxnA^$2FiZ6vvzJW)A~p;2 z8#5^iIg!QCTXqaFrDFFo8V^V`f_n!kf_vLiEYLk=U`Jl+Zo{={s+V=TX-EYhtU=)RlG0vy#-7(H zmLgtq(0JtM{!3p01C`MEdJTgZ`W3Hw?+W)w6QZ8hnVHXI4}g5B0@9vl6L^hOnX)vpZ_Oy^7U{*|m6(Hp zg}3!MMCt>*NZpj0fo8B7YAQ{+8T-`#uv^&+WQpX&StjZYr~Wl&o18*lhmM{{GK1s- zl8Z<#A$bD{+Kj%5Q;!z(e+?EG}KA&$In0p_S^mpW;?lyAi60cXP=U=taxFTmr1+`>!)x~8@$^C_}_p5 zc#{R_Uu<4AybboIup8WWgE#EA3%Fsl3%Z>r!6>@h3vG9eJ{2A!YM?d=%LYUv{x_mb3pf>z`}NUVt0ff z@}FDy0q%vES|y#>;6`lnAF;_+XtEWX{4zG#$sF9s6xx}>MrOL5neJpqHnK6t zt~*S$Dz-y04pF=p<_C$QPF^j7UQgdeauP^R@LU9o!E;^_b`Ix|pe60U0{QMXaiiqW zWM`^?pA$dpynMQo9{MITDJf3|)2;+Plmp3(8$>Qd;we`~P9ab&gppH8X88VvI}2_E zr6|elxqs@;jXz&#rzhPQ4#i34peH6!93Yw8gOlxa!5u^~NhXdwxbT++H-%D~B$7|@ zaQ9syBqC5x8iAIfXV@$bfw6URSEYs=a;*``|}-NA@6v?&dL za%bpR@(55CMkAv?8v!lk0tM;?0&D^Ys0%F6edt3U`cR-R?Bpj0j(^Ar*WXU%{X87vzl0c|;>n2xfwumW)s#oC$Nc{yk#3PYKpLNb#Sh==IC`tV~| zBc{v&Eu@9Dh(3}NwVwN8W>kyn-o;s`wKLaZy14jyXN^N^z50;W2W^+Sv_-o2$py83 znCp;E7O!^c71w>Rq6yuXghp7`zf;7Scn%#KL@9 zU}CmZUQT;i>`lwki&nl=ysT!I^rFUk+ch_JOVw0MWs!E(_m|5$^bWKupb)#ITZ(E~ zMP;9E2ToARBR92 zo_yG>EMyJUG;t1RsT~vbU7;n@Z1;M%tY=y1#(R8vr*H=*SF(I16!nk!sBTqgG20R^ zKOPZsJgDuvkZls1Eb{8T*{Wa*bl7SgC?U46i2e2@%jZ@+_BfR3zYdD-iswf#O1FOR zlC)=K_HErVQ`T)gRZ#Dyij~5GPE(~^N-t(h8j$&7xniZP5=hESkCi%~*x|`B%TYbY7n^@(VNN zW$SjSID-a%@U$saBQ<+q9L$GSy7Wc!kOAK7FR$Q zRwJB(6It!CLw1DkB5FsWRf3DuvjW{$W45%23bZfTF?aU@`ICDhu_ZdtvBtiA>`=5N zx5vArgERh{oSN{z-wj+)ab+AXXo{=miMGb3OrvDMg)Nu!#areyMctqO+`}Zz0=T4O zP)vtd&`}~YzoIiwvCRC%QgKD6B_`%Ay+HA(nPin{_BQjWR4v}pnFN>W+!-jA%ZSY0 
ztZFeyFO;q2bT37fr=yr1#0+Pl6Oge8E(dOjvmDl=X07}k9k4(H1vOcKf?7jCzssWg zG@bP#c<^SG}U1Jescb1;lOcryhJ8I;-az`~PFmj~D)|#kQEah@0fSTLtV4BIo zdsc~JPwh_>wS*=3rFYPvvk!e0S`Wj|d;!@SX~sz3SiLu0>rJm+Z^Q@JZteuh;Lv7f zBU4SC+!}c0$)Vc7nYH<^1}4@6jp3=!UabwEtq)(Q4PV$Ao?Vab_{iv?M`KT>wni>i zgp4^<>n0*xeS>^BU{q4PfI`?J&JL_^| zDD~)iZD@Kug4BiD&ZeP2f3sLF591N?9* zAV_K~gW~Nm!F!Aw+%N}+-UkA_E&&V~S?n}_P>-pN7as0Gc))}3+Mf1p_U@t-;0A%4 zF1Fi5TZH=|&9mrUgREqop`F}Cq3ttJ{utJxlZ>E|j3+=epC;&D&7+H&SC=$V_vPeu zk`pxPek9{xCwGO63^VwDeKAXQ)zX!H_QCZDre^`GRl|IJdhbFmSMmlN73{hx4*e<5rVt9_hsU@dx8)C5~p z+AS6ef+slswwK`6!STEOBwK=hZBx<#Vch^(>;}U*!InCpEw($0OTiktK<5{p$E|r` z1#O`J|5<_N-7{O;>{xj3d9uYtwC?*7)FSxpORygV!hNaxE<5(vJ{fD=?fSrHlRy1! z^zUrZZW}X%|M|D*A>aZeE?3h16pcIcWz~wQ8r%(W{GSkdC@trK9z`B8S1}AGBf0{d znjF|qpW>L!XBdKcY%+`Uk z>vN6qqw8~DhR2>BJoV&E{q&XE=_^|YudaWuar7M4Pkud_{^|9fTyOM_)_afCdXH3N z#~QJ*r-KJJmo}Cfi36MWHty{Nq{BVPeF%n5PybEu^Wgu)KLWgI&U+YnFA0sMKPQ%~mpVQQ=YW;J}1qiDFt|%<1DBw$j{|t0kQ9h`shSNeJ4kk1L850*uC4(Xm@N;I7mLRgaVJOPy0{tCm zG7$(aIp%MmD~wT$GJhXgk@(ep(hQQ+;TM6?z`XE$Y`iHU0cLCBLetMlnIuP>ycsxq z#$JZ7_65P5?}g%E5D$j}DK02#Kn_EsZ#vYxzDOJ3Sa zUapFRJK{lUUZ|!{?GPk(E(s()-G~o24xVkq(>s1K?pvE{hDms+Dkk`0?6FxcMvsctBb`Xz(S9Um?t;rbtsPh{>C5JfRn|PKfsSCu zWBVcC3AlK_8r-#HcCc-4L$>tMFxX31)LjFcrDJg0Fp~GW>->>mwZoMMVXzsKAan_= zhR61T&IGf}NrrC*{!TDh=S6;h?5v77?jFPA9154`C< zgy3_`^tJgb-@AJ49p&mZ{!RPn0t`V>iK&K}j@b_}%R%-zNe3OP)QcBa({pOYfG!W9Dqi5IW8_C1<Dk)B3u4mw17s{I#tWD@9k;IGL; zO(F+^jlnVe>r23l@b`3ZqCR-KHhB6)1R9$?Bs#Di1@R#kU<@z@Ntxo zYZS#IYA1`g<++dc5rWyJx<_H~#>h2F#Q1rBl^vNiHF zdZ>{YuP0_|iJ5xhLM?HjksPfjr)tTm#^CT~aiiGi8?5&ouk{@Ve<2oZ5~(E&*yKDY z_V2Dd&oNS~fUqmLQ;Pd{$rUYLw>hVO=kCaSe1I++ZL6R7MbhS2k ztQo>$7+4<0H!ggG9h-B4PYQmPX%hJS{KiZCI68)trjM{du2Rfe;Fg$Flyx2O`&XX}kgTe+lW5voF=5#@|ZY!L09#Nd--g%|liXAq@6z-+4VLlG$I5QJ?X5qh7Ifv-s4zmf1uQ4~Bc2xQ*`iE#WG zIrHyi<{6oIMvlDncm>Z(nFwzS{}G6P@`J6w!8Kpg=M#=UGC=Oa@h9dsewuy91aN=h zRb+8vvFXLa^P$nd4*zAiDPaMX>qyhjNts0YemwVJt{LFPAn}Dhe*dHQw?n-69{iUi zR2SkkA>QRqFe)0$35cu%z|2Js< BY6buR literal 0 HcmV?d00001 diff --git 
a/attention/layers/__pycache__/encoder_only_attention.cpython-312.pyc b/attention/layers/__pycache__/encoder_only_attention.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b1106eb8fa0fb5c853c0f724b5f29adbfc978f74 GIT binary patch literal 3740 zcmb_e&2JmW6`x)1lDo?#MOhRj%2m9vY|BJ!O0nbGQDVd21?*HVYS%y)1s1E_p-pSK zRA-ltB~XA;7^sY*1$-!+Lk~UV(7^cva_qUiP;~(jix?1)UUH*K84hyld$S}hyMoa} z7v!6nH#2YEypP|Tzh^QUfzOkER~2+ZzQsZROZGb5zd`2-afvH<#KczggqmOqLd+9Q z5qJqNQIkxmCYy3CX(sucq4PGjp|J zbC{1+Z(nW191#d5_m1|}deyobmE1HPrBV@?70J~fN>6*!W`VI&4B_WcW`qFYK5#=s~A@##*-M?r%i_~|c;Q{BTG_+kiw4=;`b!&Nv!ftiI zfI;pK4J|tieQO~W0lf4j+qp|G)%{BK4h*&1o>%LQ#AxhNy;iIH0~v#JCW6L-7ooKA#yOep$RsYUwXGcj!!v^ymp=S+jWcZ^Qy1xPf?i5yqCb_qJ z1@Q6fn{ZX+CKSPhiKf^$;c6o68IzDENN6U$k_K~9Q@Tk?@_aOU#dqqU+YR4a9$e9K z;sau;_6FS#pz~YMh!Rt9i7BeYO}YY2xFWbWL1kB>$%?X0DuOFN%$TWl@<1>ZI4AYP zHHT3l;lRRR!(M+?a)qX34Q9fDghz6d{651i zD>V~+yTVbfZi{zlO1bWL{lTtF00_s&*9+t3CBn4bN?$b}LJ+Gl! zRvczlfSGRKxAw!$nXtArvs7I=9@-3I`uKy!=?~8=o~AQibzx>{IlNoK#vzrj}*$xpi*)D7&CE;pGjTFoB?HaYLNVBY3-EDZ-*DdSU z4cqI@B&vSM(y%3x7V346p_losqD;?<4)r|C`chz%FcqL@&$D-M!~VC#9K%RCfE&&L z`Tim4Xv8?Yt!ip^>%hT|3~eV#rsg^+>?x!;*}*wjl|Aq>4ZUpwgrNjB4FsSp%{7i= zI2s-JsE;7v@*qlc^HGL_k{AskbnRj>Cd zZs#Z3@0@DqXLeFjKDqWuM<@DtOB&+_c@1muM)-A2AiJ@~70iT60C99c8VQg_a%DiA zBu%>FN{SFSMN@IFqHzRL0D%-&r79F34N6Y>p=72xBI%H&nJB-PyH~DjXq4WRG4P5} z4Gh}|0LrVjXE=61jqt7=8g88i{&Z;gl)A9s`fY<_ewj;|k5s==v+h#c4I&jAE2yqe zfCIod8tD!e9Y}esI+4n~6GFjq`sEz$fJGm}xqA*G^>viGC*p$Fzd=PvHKosrP3hhM zuU@TNRe-I2%XDz3PyhJDN@O|hA3Dor)y&ktOBukWK*BFR^Yrx;N1t!g1c zE&rO@07uhJ`Ll_p{B|je@N@`L*N8iK9Q;NIhx^hV>43P0z|%>nW@0nZl$z zxP0a3){Xg3epyPg6Ci~lxG^k`>=cqR5)}U>DQS@$)|qoR8piB(X~k}M(4wcKbQ}Wl zKwE&~x5z_;DJTGa6zRK<^{b&}l`?EUuHH)zR3S<(ehLL5h~)T~?BacE z*RkvzZbk%&wdSY@b&X&i^uIL@66`#%S0?u|Sh=sw94ra`3Iw5k9~mxe=B77t)6a6H zb$Kg2)=E#cCrg`?r#2=}t_%aBC!L(qTFQZ)92q~`I(zHc*e}-gt#rPXHrgXU`XaMw%xxHR{}`F$Wuu+D z0W~C(ZH>(BC`8RZN^L0P9ipgOyKn%1`^MmO_zmQ5XJE3EB}1d#A?R^viqcp>dnp<} 
zA>K?nEc9K>vL_f~!aPF@l*HI4m!Tiv4#S9HmdC)r0f;9nNA|QQ#>#8!D2l=yTO(UT zBV0RpYRrT84Pdf&ksJgPr7X*>JC?}`vptaSE~W3z4J&~~OIYg;3Qjtni7zx-Gyg*}DjMms9@G%|MZ z&zCm}M>}a8>agVV{NuS7xUq9v5as+|OmLT+|N8n%Y~limhnB`8wbJmN5Q=!fUsLQO zU`GZzq&&doa=)CF?FG1TiKM&UrlvoCgiIY#H$ zemTBA=9IC|?buW4{`-4r)d;JgJ_0ze)zJhO0c!`oh4%_&72BcQACc;@Mx36Jgb)#D zJog0#R%{^tTv0xbpFro>EttS}OyC3QBm_a|CW$cooQ!@;^p}z(h}#6n3ylcxJtrSL zCsWVK^h+@TEQJUs!E<7IQ^;=!`Hol=#@my}I|*pFWVNMDY{>inEyJjrAmS*W-#K$a N$bbIf3jz(-=RfW_&}skx literal 0 HcmV?d00001 diff --git a/attention/layers/chunked_local_attention.py b/attention/layers/chunked_local_attention.py new file mode 100644 index 0000000..48fcc6f --- /dev/null +++ b/attention/layers/chunked_local_attention.py @@ -0,0 +1,121 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import functools + +import torch + +from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata +from vllm.attention.selector import get_attn_backend +from vllm.config import CacheConfig +from vllm.config.vllm import VllmConfig +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.v1.attention.backends.utils import ( + AttentionCGSupport, + AttentionMetadataBuilder, + CommonAttentionMetadata, + make_local_attention_virtual_batches, + subclass_attention_backend, +) +from vllm.v1.kv_cache_interface import ( + AttentionSpec, + ChunkedLocalAttentionSpec, + KVCacheSpec, +) + +from ..layer import Attention + + +@functools.lru_cache +def create_chunked_local_attention_backend( + underlying_attn_backend: AttentionBackend, + attention_chunk_size: int, + block_size: int, +) -> type[AttentionBackend]: + prefix = f"ChunkedLocalAttention_{attention_chunk_size}_{block_size}_" + + underlying_builder = underlying_attn_backend.get_builder_cls() + assert issubclass(underlying_builder, AttentionMetadataBuilder) + + class ChunkedLocalAttentionBuilder(underlying_builder): # type: ignore + @classmethod + def 
get_cudagraph_support( + cls: type["AttentionMetadataBuilder"], + vllm_config: VllmConfig, + kv_cache_spec: AttentionSpec, + ) -> AttentionCGSupport: + # Explicit override in case the underlying builder specialized this getter. + # @override omitted only because of mypy limitation due to type variable. + return AttentionCGSupport.NEVER + + def build( + self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False, + ) -> AttentionMetadata: + common_attn_metadata = make_local_attention_virtual_batches( + attention_chunk_size, common_attn_metadata, block_size + ) + return super().build(common_prefix_len, common_attn_metadata, fast_build) + + attn_backend = subclass_attention_backend( + name_prefix=prefix, + attention_backend_cls=underlying_attn_backend, + builder_cls=ChunkedLocalAttentionBuilder, + ) + + return attn_backend + + +class ChunkedLocalAttention(Attention): + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + attention_chunk_size: int, + num_kv_heads: int | None = None, + alibi_slopes: list[float] | None = None, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + kv_sharing_target_layer_name: str | None = None, + prefix: str = "", + ): + self.attention_chunk_size = attention_chunk_size + dtype = torch.get_default_dtype() + if cache_config is not None: + kv_cache_dtype = cache_config.cache_dtype + block_size = cache_config.block_size + else: + kv_cache_dtype = "auto" + block_size = 16 + + underlying_attn_backend = get_attn_backend( + head_size, dtype, kv_cache_dtype, block_size + ) + attn_backend = create_chunked_local_attention_backend( + underlying_attn_backend, attention_chunk_size, block_size + ) + + super().__init__( + num_heads=num_heads, + head_size=head_size, + scale=scale, + num_kv_heads=num_kv_heads, + alibi_slopes=alibi_slopes, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + 
kv_sharing_target_layer_name=kv_sharing_target_layer_name, + attn_backend=attn_backend, + ) + + def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec: + assert self.attention_chunk_size + return ChunkedLocalAttentionSpec( + block_size=vllm_config.cache_config.block_size, + num_kv_heads=self.num_kv_heads, + head_size=self.head_size, + dtype=self.kv_cache_torch_dtype, + attention_chunk_size=self.attention_chunk_size, + ) diff --git a/attention/layers/cross_attention.py b/attention/layers/cross_attention.py new file mode 100644 index 0000000..5b44c7e --- /dev/null +++ b/attention/layers/cross_attention.py @@ -0,0 +1,178 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import functools +from copy import copy + +import numpy as np +import torch + +from vllm.attention.backends.abstract import ( + AttentionBackend, + AttentionMetadata, + AttentionType, +) +from vllm.attention.layer import Attention +from vllm.attention.selector import get_attn_backend +from vllm.config import CacheConfig, VllmConfig +from vllm.logger import init_logger +from vllm.utils.math_utils import cdiv +from vllm.v1.attention.backends.utils import ( + CommonAttentionMetadata, + subclass_attention_backend, +) +from vllm.v1.kv_cache_interface import CrossAttentionSpec, KVCacheSpec + +logger = init_logger(__name__) + + +def _get_max_encoder_len(vllm_config: "VllmConfig") -> int: + """Gets the max number of encoder input tokens from the config.""" + sc = vllm_config.scheduler_config + assert sc and isinstance(sc.max_num_encoder_input_tokens, int), ( + "max_num_encoder_input_tokens must be int for enc-dec models" + ) + return sc.max_num_encoder_input_tokens + + +def _get_cross_slot_mapping( + encoder_seq_lens: np.ndarray, + block_table_tensor: torch.Tensor, + kv_cache_spec: CrossAttentionSpec, + device: torch.device, +) -> torch.Tensor: + """Get cross-attention slot mappings.""" + + block_size = kv_cache_spec.block_size + 
slot_mappings = [] + + # Find indices with non-zero encoder sequence lengths + # The majority of parallel requests will be running the + # decoder, so this list should be relatively small. + active_indices = np.nonzero(encoder_seq_lens)[0] + + for req_index in active_indices: + encoder_seq_len = encoder_seq_lens[req_index].item() + + # Calculate the number of blocks needed for this request + num_blocks_needed = cdiv(encoder_seq_len, block_size) + + # Get the block IDs for this request from the tensor + req_block_ids = block_table_tensor[req_index] + + # Get only the blocks we need (first num_blocks_needed blocks) + needed_block_ids = req_block_ids[:num_blocks_needed] + + # All needed blocks are allocated + i_values = torch.arange(encoder_seq_len, dtype=torch.int64, device=device) + block_indices = i_values // block_size + block_offsets = i_values % block_size + block_numbers = needed_block_ids[block_indices] + slot_mapping = block_numbers * block_size + block_offsets + + slot_mappings.append(slot_mapping) + + if slot_mappings: + return torch.cat(slot_mappings) + else: + return torch.empty(0, dtype=torch.int64, device=device) + + +@functools.lru_cache +def create_cross_attention_backend( + underlying_attn_backend: AttentionBackend, +) -> type[AttentionBackend]: + prefix = "CrossAttention_" + underlying_builder = underlying_attn_backend.get_builder_cls() + + class CrossAttentionBuilder(underlying_builder): # type: ignore + def build( + self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False, + ) -> AttentionMetadata: + new_metadata = copy(common_attn_metadata) + new_metadata.causal = False + max_encoder_len = _get_max_encoder_len(self.vllm_config) + new_metadata.max_seq_len = max_encoder_len + + new_metadata.seq_lens = torch.full( + (new_metadata.num_reqs,), + max_encoder_len, + dtype=torch.int32, + device=self.device, + ) + new_metadata.seq_lens_cpu = torch.full( + (new_metadata.num_reqs,), + max_encoder_len, + 
dtype=torch.int32, + device="cpu", + ) + new_metadata.slot_mapping = _get_cross_slot_mapping( + new_metadata.encoder_seq_lens, + new_metadata.block_table_tensor, + self.kv_cache_spec, + self.device, + ) + return super().build(common_prefix_len, new_metadata, fast_build) + + attn_backend = subclass_attention_backend( + name_prefix=prefix, + attention_backend_cls=underlying_attn_backend, + builder_cls=CrossAttentionBuilder, + ) + + return attn_backend + + +class CrossAttention(Attention): + """ + Cross-attention for encoder-decoder models. + Handles attention between decoder queries and encoder keys/values. + """ + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + cache_config: CacheConfig | None = None, + attn_type: str | None = None, + **kwargs, + ): + dtype = torch.get_default_dtype() + + if cache_config is not None: + kv_cache_dtype = cache_config.cache_dtype + block_size = cache_config.block_size + else: + kv_cache_dtype = "auto" + block_size = 16 + + underlying_attn_backend = get_attn_backend( + head_size, dtype, kv_cache_dtype, block_size + ) + attn_backend = create_cross_attention_backend(underlying_attn_backend) + + if attn_type is not None: + assert attn_type == AttentionType.ENCODER_DECODER, ( + "CrossAttention only supports AttentionType.ENCODER_DECODER" + ) + + super().__init__( + num_heads=num_heads, + head_size=head_size, + scale=scale, + cache_config=cache_config, + attn_backend=attn_backend, + attn_type=AttentionType.ENCODER_DECODER, + **kwargs, + ) + + def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec: + return CrossAttentionSpec( + block_size=vllm_config.cache_config.block_size, + num_kv_heads=self.num_kv_heads, + head_size=self.head_size, + dtype=self.kv_cache_torch_dtype, + ) diff --git a/attention/layers/encoder_only_attention.py b/attention/layers/encoder_only_attention.py new file mode 100644 index 0000000..5e99c99 --- /dev/null +++ b/attention/layers/encoder_only_attention.py @@ -0,0 +1,103 @@ +# 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import functools +from copy import copy + +import torch + +from vllm.attention.backends.abstract import ( + AttentionBackend, + AttentionMetadata, + AttentionType, +) +from vllm.attention.layer import Attention +from vllm.attention.selector import get_attn_backend +from vllm.config import CacheConfig +from vllm.config.vllm import VllmConfig +from vllm.v1.attention.backends.utils import ( + CommonAttentionMetadata, + subclass_attention_backend, +) +from vllm.v1.kv_cache_interface import KVCacheSpec + + +@functools.lru_cache +def create_encoder_only_attention_backend( + underlying_attn_backend: AttentionBackend, +) -> type[AttentionBackend]: + prefix = "EncoderOnlyAttention_" + underlying_builder = underlying_attn_backend.get_builder_cls() + + class EncoderOnlyAttentionBuilder(underlying_builder): # type: ignore + def build( + self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False, + ) -> AttentionMetadata: + new_common_attn_metadata = copy(common_attn_metadata) + new_common_attn_metadata.causal = False + return super().build( + common_prefix_len, new_common_attn_metadata, fast_build + ) + + attn_backend = subclass_attention_backend( + name_prefix=prefix, + attention_backend_cls=underlying_attn_backend, + builder_cls=EncoderOnlyAttentionBuilder, + ) + + return attn_backend + + +class EncoderOnlyAttention(Attention): + """ + Encoder attention is a special case that doesn't need a KV Cache. 
+ """ + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + cache_config: CacheConfig | None = None, + attn_type: str | None = None, + **kwargs, + ): + dtype = torch.get_default_dtype() + + if cache_config is not None: + kv_cache_dtype = cache_config.cache_dtype + block_size = cache_config.block_size + else: + kv_cache_dtype = "auto" + block_size = 16 + + underlying_attn_backend = get_attn_backend( + head_size, + dtype, + kv_cache_dtype, + block_size, + attn_type=AttentionType.ENCODER_ONLY, + ) + + attn_backend = create_encoder_only_attention_backend(underlying_attn_backend) + + if attn_type is not None: + assert attn_type == AttentionType.ENCODER_ONLY, ( + "EncoderOnlyAttention only supports AttentionType.ENCODER_ONLY" + ) + + super().__init__( + num_heads=num_heads, + head_size=head_size, + scale=scale, + cache_config=cache_config, + attn_backend=attn_backend, + attn_type=AttentionType.ENCODER_ONLY, + **kwargs, + ) + + def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec: + # Does not need KV cache + return None diff --git a/attention/ops/__init__.py b/attention/ops/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/attention/ops/__pycache__/__init__.cpython-312.pyc b/attention/ops/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7783abe34b3d2a006162bb7adff0d5546f62168d GIT binary patch literal 163 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJVIq8?=7U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?FIW ziIT}EJ@nS{?Du{1%{Sl7d^0=qU8jJT(*;50tm?XoBqW5Nn6`?B_ue3v zPZ#a+BK2X0_>fE!*R_6Xx>(HJd((>V5TfpNeQq7UcG|sHLZm=iF4BANNeNB)71Nb_ z5|N^ndE}}*ay7`S#nO9kT9Zev%_G<4k?Zrw4It0uzcG*8lt*5eM{dp|x8#vqL7uB0 zY z{-dkus_&53Eq6#y%@;V}Mm!{^_D(E>BcWMuXg&x)D{I}n=lPTry?Bzs{DGnyb^Ss=c|vv-X^1S0PO zMU!`7K>-@_#|e*Qgi(O>1E4taHb>qG<8p5I|6FZ~%+5E>2j&|hK4!`vX}G=6dHmQ+ zr@tu}xY0C!Cvr11*R&W6&Nl73&;(6rnz*?zH{++h^NfEo5Da?feN({E{)rInZ=Aow 
zDJJN^qIYtR(Lp^H_#ei}60xJvu97Q#tKBOD@iJEJOvs#HLI%O<=N4wYuPykQJKmdq zA06gQq9EW8d*}U(cV>~ZzbJ#6Y1&V7rW?V~#EduMyAkwz!_bYipVOu7r8#f&0ltOP zhZZ993z3XEC>#*9aH^i6(Q_BQ;{(_GIOT=D?%p(M*d@J}x_f*3dO5?D@jh?&&_K_C zcYJ8{Qr|eI8y_0z9T@5NUK<$c9lge=aVrfOUuL`$zKNTDsPdqsnxcL?N()}dOYy(=fO+gGqxzev#}(tVg| zH?|yYM0Lo3%b`a2aueT`(8Q#~4mrq(rGicqYDBeZPttYFfQ4u?Y6iIm(|4gyqs>_@ z!hL?=$;!>`msRu&S+Gr70%L*44W4Nnu?L>WEbT8qusAazlHSh09+&i@NkxXn&v=w=f9`|mv4ONA+J;Y1Yfd}cP$cRxC(sHq7 zS?xE8xqI(N1-rGi&-G-rE>|DOP@~X}=3Mpc`F+SJd3NY z-fOi;Asw<#Ne;D^nFrCMS=Ms(Pr-q?+lg$9^N=zMu}}An{IQ5_*T+s|79&1-Jc6g3 z@W2c`F3Nj}Rp~cF5GP%S)Fwx5!U%TZRE&`5iL6q&qk7MSlZWc=&d=y6bOMzMYep?{ zi>m}Wbzl~h?dF`^KOa!Jm=iq>mK+&NPEne+l(iDs@zwzq@8%2k_U0gR?dAwR>?;ZF zMkQi?^h{PObB(kFo!Ohwdv^AX;aPNc+L0Notk<+Fi^-7Uo{Sm2clJ-8J>obX;!C!3u?{fuUIPDJtw($HrjniEmCxPuV?o9eqYb@pq^Yc9nv#BxqSZWxhwj9h|elf z@Bi2{=b&eLqUTT-GGhx_wH-2>a8E^hv-ESVc;+b*b!N4*TsnNl#(noTX9?Y&YdNE5oS6%IB zV4w9NbQBF8s2%S^j-n1Ug!=_u6k@jn4Mf{f_IJ^p%a>e{a_?NOH9%}1Iw$tiT|Dkb zSEWB-WCVECalk9s@qt&>Ow&Fua;@2F(Khyf-?zLrhtEe*4LY(LDMw-VYZBVshzfM?-nAFbr?_^GynAI=(f(!mE0Tm9+&3JK^A?r{M zhj@@^!v5F1aHLFg1|Ogd0dF`Mn#YHH$d`vFd_g~_nDJ(a#SEbZU(%#EFt^AUpluA^ zpfh*}%oO4T?@1X9GcXw8Wbmes!Kes>w|R^MCr+GT{8BiMGkEvF;7cnM;vaDt~gS13XXJ|=?Oft_Y*a5i2y7<>d~@C@J-xS;|1Hm9Ch6c7!8DF)of zd&3u*xCyQB`T#fy1MU(5hW6j)oH=qs^O?-Tz09-^_+XwvI51)&*w>`9LX(qWe}q#C zav?G)Ixy=MC~|B+R+7;rhdFYHllmqm(yn9c>a5}6R2X{^#xu;3)0}Dk=ACc=1}iNS zYA?XHymMUoO>@+uH)EZe5vhWUULm~6i(J9g40)AP;ocH;Z-f@+XdiQjqp&pX^f;#$ zu`ukJbk;k~k(W4W5Z-cMaQ>!`!x4hfoV1zn2#kXJ&-yU1coLj_#?Q?8MZ{|t74x>z z%q_^rVHO^Q#b}9m{xV7EjrV6)X5-ySUCC1aw#mHK5NlW`lcw^ei=QbR+clo|Z@qgf z(bV(!8rw9KtQk&FmD|-v-k*ARD$&^exSwsj_=%5g98FeVN>CN4lH-Y!FS8{tCn#r1 zU9xU`qApKqOglA=P;AHIPE>TTmX4oDeyaRfnWB^lb>){*$?<|`gv45~G{UP0W!3t( zS*jzYDtg-(?~U77RoVIot7?5FC6#Twl#m-i$ymBxw@JR+l<4SZ%lcVE|I$FpV2^)m zBgDE+v4+!21D`{yZm-;aNAghlK)GI)sOm@-9sS787WF*te*9WeKd_}AV)aA6dg<5K zKE0OIUs<~Fyns-u?<-amDayDy#8U3}THinZ?(vPWMDy8XMfc;%UyzS$*or}x8cb;| ztMuF5t5dADWZnJ`EgSSkCtKFWYTHse<8y^fS+La2YY0afZmqo|A+vs_Hg9Wm_xo4+ 
z6PCt=x#^!Qn~VSG`nMZDpZ?kOQ_a{4`FT1wr7^F5o7GgMjE-%SEne_u1DMmQ{)-~i z+LiLRTGzT_UCZT3RS6GJuIjla<8s1Sn=sUEj6=VwK2|-Y&MkFst2FnItQ<+08xl=t z5`|}fCHb}LQ`J+|_|iFODfG_N%G7Fb{TgenPwE<$`kxgLRMEDzXl*JsmC_W)!Hb5? zfyd*|r4pNJS^b(kyOfJ#3fV$rD!?Cz5ORkWDTuLa7ydj@0DC3KpSEp9) zt(UR-ibPE(tLwxDj|m13R3sRr2!nOEkdf7wC#sLKx}(pOGHs_&+jA2EbHx_3#a$ak zti9zU>R*kIdmr1_V*`n?%TJA0mJ4>&#``B%PHw47Sar!$wL4|B;of+s^P$7I8^>6C+eb$pxBtTU zxRdR=m>9qM)OhXBQ_YUW8h6CL_1=y51Mdbl$<3~0^=Z~}dRetyXj$uwb;bwdCpT;m zU)0Cc)57j$Dplx;(?9BtPqBs7(Eh@fWeQx?>hE7zxv=_;w_jVYjNf7{H5*4*OY7#j zpIAOx{AX8E)4!z|WHp1oYW;QBr(H?S)nyVEf#oZ2y|i{Ub~WA|f9=gzS#23hK~(Kn zUF&3QW?8+Xw#U1ls9oCz(;5|{;&mG;*3r5-$J%=z`#!NI4VRX?KT|tX0NXp?9$S~i zud&9e4aepT>*;2VJ*=h&dXd5*KfN*?H@?&J@ce`G>n$5CPm3Fpy2fQ`+VAr#=OLoQ zZ@0whcxMc;g%wFnS~vz z+d9Mjn=3b0!}0$0YcQ3Cnqd{a$5KaO6^)-^Rke8Wtiy|^>A8xKE8e*9`h^r_TK(pa zTGr|HPS#S#Qgx4R{5bGI;3G0|;$pIK=#$G&#xH+zjcvTbQdd%xI{%XQ2Zc;t!06$d zGJ&@^;)6Fi0=j;l5`OeCELi}-BBsUstT!#-%Cz(@Xx_#9djrzZ!~z*DMEcj|NKeaV z;FHBjXjxQ`)ERWUFOtE40YkkQfM1Xi@g_+ZAQDmql0Zr!DWpmygH(m&kg8Duq#C4v zREsD`bw~-R9;qNTAT^{$q=D3gw2&4e9i(QYhtz@$kXmUQq(#UGsU4Z#B>#pa2~?1? 
zRSvwTi<%>N^DI1Q7mX|$MW@ji#q1Y zk&G^xIRf!!%xW#(%O7|^R}~z{VV$cM z*1sBIojW3|a~`}t(Y3;AS0}7<^}>4AAgpBB>s%u;zAlH=*@#!)Ig-X}XcMwR+Kh@I zZ9z^*TajYAN%UdwP0PeqgH9Wa-(foKd31JvHw8N#Fcyc|Ihsc&+YbDe0@jWpJEU2^ zrR1DP_N;bvp+YEeLRk4v!ff7a?k+{2!jTPHry>6^U(e*x$@cYZ9vx?vU)_0hc2^eg zrzeljsVqC^VEsAF=iWRz**^E>(aHAtd>)-_pZkUH>ILDOJb+C2>klm%M8-6Qzp{XA z4UI6^>1Q)J|v{01EBCph(u47Pz8 zB`|5=hf@RQw99BQ6PUxtb~x5E_{`3XLc(dXpocK03kL&qU~bBL3xJZ)EdT~)Gq8|4 zJQJAD+&j67@VH{5S<3vKNB0)L?ZxGP-<`@KwPVphD&lfLz`ivfH=zz?q;DJKgD0F6t- zZ?u0gFyZ&8nPFV1CL0h-;~UWKFsI)KD?x{;(u#tizl4&^cX1NM$!;exE12!W$$6ag zTnlZ!YR!U?XQxrCEH3wHJeMuq=$0F4>L#r6O*6|kNn7^#~+xZ{I&K!E8$ zBnX`+2OM(ZCMRD2?D7~O){v>hqn3_5wE%s>7{bVLs;d};>|>Y^!^su{{#(phP-pOG z8Lk%LD^&BLTX3xS3h{-eM)B%ZxDgL26T_-<|LlC^4)Yc+dmIZ?g1{RL%=kGe zTzulp2gY282Lbuv3euN*Je8bW7*0+KKq)5|;8DQD1awLmLQc7hpmORg^c3dI-X$hx z4Lk9-IKctH2~%Yk`OAs|Au|37>%D>XG#MC7z)zX)VUYpHZT5s>;#f>!L=UEAScnKt zgV_vN85XeB41kD(kkh2^ho$>I4Y~?Froy(y&Xq$5Vp0FrYztI))Qh0x+GxBnH z1Ms6a^9oGCA=$z!F{L7kYk4)MG=!yu*J4UXSlqlGQwE~2gg0W!MA(n;g_trEwo=}L zDJx;GeOsu)u)OqCFhYQ7XxZd_X#rpgIx315M!O2Y2qt1wkf zIBWPCOdTN#i=KIa;u{H*Ys*;98q3#P;IzLqxDDs;W8aT1T}WA-YcsK#^|IL9(kS%T z+VR-&_*m@Z(oo7?^3eIfxjy!wWTWlJ$3Hl}IrhOx*4_#5T*_csQ^(Zt?wD?AfTsvb zyQ*UqB`JN;{d+6--s^b(#JeY+HlN*U9%P#blg&ek@?lmFK(wF%G1`Kt=1v7btu_7lBgVI^#Fs7CiJ6#VkBJ6th#yIS-0ivV4WRFXXnaLO6!c@ zeWLYjJKbB(de&LLGW5BrIN|Ep{C3iGKA}0kV|J~L#75RnCe6*u%9OQi%UaD^tJhJ| z+Pwz4a6VmMhjlDPCz(mkG#xqc@*_nEOdMQQJ=SJVJ$!g)VMIZ{+% zin0oSX2|=pP-QAu8iXN(+|u#{RlK9IB^-^crg78AYFbm;GC;{nAj1LBg`skNn$#MR z(7tvmb_(E3t7=&VFofB@b|Q8{R81KB0Pw`n^PZwv1(=eGusXLaRjj3I{bthAw5I(Rt7$%>{ThJCuMfxKS%W+ zYRrJ0))UI|9YaydP|VAvTHQ0LT&em!(EwqlA&Ma6F{LAH#k?L<2Etj%8!=@fToCe@ zGJ|Wp1yfd{#KYS#RYW+-c{`>YL`g00#8fd+=;U3PD#5{DiYYgy%7Eg_iI!7|<_qzP zb%mo>$XZ;*wU8GvZtl(d>t0n z6UAi@Pd+%gLB4Z_Z@`kqY)KQAG!w3hhrRC1o0qhAC^ktx%`746@RT2Vby`8nz@Ye(Z z{Jdf*+mNy5`rm&83k00ZBOk$!$I85ezi~4Bsb}yGkokMe>-K;8)0+yTAWUDq_AW#M z!Lab7YMI_bBp7_=Pvb{2ewa?nAJIlTHUB=^tUGvw=Bb6T&Ism#Z*b zMSmpF#${#ux5`ohvUTN%BiGL27?T 
zRQx+p_gmt~?}(NioqgGVe`aMS;b>1dYPTHitfM{YIGWUTE~#Njkx*|qUUxi`kYw?f zQbMBPF$D{7TFyg1yN)~@eK49TuEDQo<7Bs#EZ`jk{7~^~>{Z?=u*HPk`B3vf!@C5w zgm9NXL=O;ODzI*xOFf|YGJ!28EVi`^u?u{Kz*dTEmB3bG%P+@X=4%A@h{$>bww3_o zR}<6lbpl&Y7)7KWd!Ha0+?SJL0?(` zGbp_*KK2Dzv4v#+1>rqtZS1FcLC7J^mMEilbBj literal 0 HcmV?d00001 diff --git a/attention/ops/__pycache__/common.cpython-312.pyc b/attention/ops/__pycache__/common.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f626ecdc8d3f61a650a53eb8a45cc365d353f9af GIT binary patch literal 15147 zcmdT~Yj7Lab>77ri#NcB_>i;|C6TZw%GASpMlnT7q9mL0gOX!Cf`PaI2@(XT1t^K~ zqJ>AEY?hpT?EpJFcdsd!>!2q=PBynv7kN1fRBqxvz!Im4LooROwN#u2;Pz4FT$LWU7%rL5*Sqjr7Rz?t4L z+*O*Lv*@Vf6lZ>i;w*P{a^*QIXAMy!H7cdbFUPD>ZwuKtd&tT;LU!#I+O=U_0}u~0 zobw&SIfs?9QruR~rQMcuuTt*`>2U0@h4a2+Jm*}6OFnIGa5byc*RE1uw@Q8eD)sBM z>%(@gp=uXgIx6Jm8t>9FuWc09#5INWT=ObxUcXBH2B5ML_JW0fhjtqnxdr&-S%puV zwCBsM#*fwb_nY#A;aaQs;U#CGjoYlf25z89^s_uuqvE z-m3N*LQ?Pw-qF1uJ0CrL9oelcj#odJ`RXvJZuB%Q3FWFsX*HgYV z=H$~J4`&0M^_55U!R&q4>sO5LtsYs&Vo}gs{1dv1e2Oi)d|cRr=`AX{@SOde%WbR zC#(~^*P&&qp{l3WxoHp@$hm3I+N(9rO+$GHt4Gn=w}vu*8>{A{wGv9GTegxWp-E^& zPNkZvdTMc=mU{|K70yo@{ifg%*a(J&r+a)nI>{!7Lu@b_-98kA93SG?Xd;wglW}%1 z&hw$cBpV724ztntU@*$^!Psb9f(<5KD* zM`GcAzk>&*NJ-2%A0Og_V}S?<$~7(-VS8TKEt!IF5QajMAsP>IlJP1YImsA?Qc}{7 z1+Pi^(6w<%pO_ev^wIc`WK4j54*8oUTh*pZ&gJZrEb4Yi7InpvTkVj56T*c8N1mza zd#0vWau7;Nm;U9&Bg>0@k{P)h;jZzhxOqeclJ!zBIXH}!l4Up))gwB9u%> zj@ZOlAXYBgkb?S74+WVfGJ+6CW{^y9a8Pn82S#nrR65Izl4Ii%p-8@^=%i!{2M2+F zl1br3GfWXbiPfI}esKrT*fAa%-<}NeL!spMYZH6-?;G74+7XRh+A%(v9FE6!T!}`< zc9dnnj`(gi$1 zy?&;pQ+_4m0 zUFp6@ZtskB+L~|4f2ru+3XN;(W=>3>$Oj7z#hPvDmzOM*wJz@vEn7;AFE=JKt)J>? 
z>*mLdH+NBFHhpT;SzDL%l+pdvN!c7zZP~V5d;YxWZ7(wZbk8HkKGmLW&-LVYO`k06 zy0>M{c(+4reL?i?{o5@c8vmwae*JxYk$DMPI^9#@Y&iFN{_;a-OZqsLhqJ@Et3TW` zTl<4Uh3#VfjyccV@uKrk`gqA;&g{L>U)Af@sN4lfya8-*ZhcotzQ0~Pd{(ra#U`$& zCJJ_8C_9uJE;NdD+ltN(ScbPD-!r`(#`o6Dv`@E}tTiwJYwN7-rxWvM{z?$rdqhj` zBdaqrajQAwC|P_@%m$w+ed;Mg0ejA7&*n}Q9c=p8BZq71jqDq_^M%7=ZCBB;GkvV& z@a4kUK>FCn%G$o1kBN?s+3-h(Jx}21P=lDLkBYpX;K`n$Lex2$qt59#`kbEAy+fTd zaC%6MlLo(0a`n72KmdME95ltX&Y`o-J$#M(2_k8i-maQn?Mno2t~sQ zB*XGMA(bqFKqMAP1_IaDt(ab?`sT~fE`fLtmEJdNUf#~p^0da4}fS%NX%e7mPnF(0w+pjlDvK!V2!B4pX*I4Sy1&d2>lW` z5^%xhTC_Ebw&n%f<`UacFcp&T*=M5*?1}jikv);={m8!lvAs3ZyJ%l8+Slhf(cW6P z@W8&4u&v*3LaxVxW1&DmVgiA&I5!c+ygd-OJQ0j4Z_I%J7at4+_(L!#e*}^{6d^z+ z&cA@Qn4E|I1hW1e>R$|}KQqwAmM12IahKc{TS$74i=YMpXGms{l8BEa{b2H%Wa2}~ z2|hN6YE*q}kRj*lHArD6^=YPp{#4gaN)Nk)Cve5X(TH35R(M-YQ)$JKo#%9CSu9EHF)(V#tU#QiR@*)f2s;`LMcE*$=quJ4X%dJ>OU$Qx;_GS0wE`Rk43x=}`%{_mA z^nU9<`G4gv_6{ucoLOi*TQr=_eBrU#c60K^WUl+|-^*`)_qp586<)jDIoti9aeK+- zn-a1@{z%caF=Hq_)9%&V+h@BTG8~DMqVz`7l8&~y9=U3A z12gBQ&lO$lg0p1_;BM;^J-p2vUScShcPf&N+&G@@PG8Qnlq~isCd=fyvW|2&yj|Bc zbA9^yHw4j8lkUxI&KXMeO*3yzzx9n&x#p>bvbh(T^&+#rWMwkl8B?w%m(1A<1AlS; zz4P;iUot;up6YcrQ@RHTI-Hryxt7fSbT6>dYA;!AFr=D7ruxsdRBx{_rC(ZVq&Bj% zLDBPk`s8DSFWqzV^o`RGYj(`lEEry&w?8nvexHE^Sf0~o_N7k~7$Mm}9uOl7kwm!w z--Tp>H{|mlL}aBtXr&SmVFv{cAVQ}F2NkfO6R6i^n1MaWqbiG(K3S%!<+9eM8%D)w zQE63vi6fXEL3yy^f%X~>x_kPvlg$V%%Ff+N=kh!O)Q)&6^=eL_Im11p>J%ICsr8j# zp#m9<$y&7@l!Xx}mKBOx4^C#is`qwL>D=VRh)5L7z>6h-YG5RRpRkO~E1`KjRo86z}&vYT_?m{C+_bmx=!EdOZQ}a;1^hAT12L$ z&^p_-z_b*Z?dhI>GkZ$)jWd(elLgPTP`I|(c0h!``U4qz$=fK8n=yk1_~Vw`g~D;M zcH3-pe&{~^^HH($PW_kz9#sjoRQ-nyY7a?6c(53?Cl(k|UX&Mz3xL##OX|~jt zY1-7FL=|lw&IISR+^m zAu|_JQu~o0j`M394b=fl4T%i>tkwMhX4?hhC`ugDYgFM(;aNrw!C%+*SY5>w(8gP_#F? 
zXYLsL3HFt-w<4eoW5L9z-$H!$JhFmEjVc+5m!3z%O)`WhqEX3AeDu3^@(`A!{0)-v z8pahQGhh#ru|-3%q2w?Ln(*Zho}WOHS1`GX$u&qM<9LJ%T#|p1@(+Xpz^fgA68>U@IDX9e$CwH-ogD)PM zCnVD~@|!>fTM_Roa*DJMj0Xot130|m!2SkwT7dt=4~evYx{d-5=F8ca^DRYtbNc8b zyK~i2$?nM=&z}D7rFSE@BeV4E-r|N`qJ3BT=&COOKe|2O>2`Q?d%k`lLzgV>ZyuRB zK7BlYYPRL}>0(WXXz9pWz)$OJ{22VI=%pPk92M*QMdx$ro{?E^iba z>%qb5sms?-Z!8=W-Or^@fV&vVrf**^Y|dZ3?V4>7*KHHs+h#*^y}xLgJANl7x(`7s z6Xk2npD7I7zA$@KT;KIU*Bm#uS8P5w9~7IvD0;v6;gY4#I zTLdNv3Rr%GGL#VX0$j8MfN#F5@DoJ;>L;QJeOcfgGjayb(n#*Zq#S}x^K3ZEoVE!z zfa})aw?OkILr@jGL&`Y^2N4D{SQ44U$OiFAY5J_d zMFbudwzISIBD;Ngw46k0*?x?VB;!el)B{?+6e3adWPB7zlIh@-lwpC?5u~C{POWF* zT7Z0~fJ~^|t^6?_U_nk#gwQ($kr>h(-oaI^p>kX0Ydye{42yOJX_t(Qh0xUl(@}3H z;Ed$v+%wI?p&)qLt9y_Uo^gGl76?Ii@JO);V9OkmUnm?UefhWGT*zwn70e+9;a|n1 z9}p|OW?AH}5Nc0vnXG7NH&D?I8f9zQ(4lm8ax{{)j6O!AoAh6G~Ed}Ii3UfUGML^vv0oZmwSJ{_aW1h?s;r?eC-N$&F?PM7hV2) zFT8*7&cV6M^KHd#M;2^7U}+h^{UjP1OBOHj^SGvLS=%iLds@k#{gJmmZ!LN^mE86C z=k=9*HUC*}H=3Sq0AE|xTGut?cjqPxhv!=6HZ8R75$pGW@um3OI*QJ=jQ(-OsKUJ% ztcLGpW9i34OI3kJMK&VG)rX_+0^jjTS=MM-mT+sDRTX220qI8Wb5+sSHkM4PJSIlL zZ*63OvE-;SmMrS(6l2LLRNtSh=9rE2mg{Hr+3q7018d zT}XmQskvkJve^8*=zV@}&wR}U{V-gAetk1`w@td>q_2jl5{G63g%RlIMg zLU-kx#R}fv23SzX=^H76$uT;sjrxs%`^&C^RdK&w!TlgNG;u%76(hjl6b9s9B?W&i zlKpjYUR4T$#4qE2%{Z?Wa6N`YT$!cQ63!%l!q)(LvR@Or@T2;AUVhT&K4a-(2{um;>Q;&cjv zw+oCWg147B$_NY~cuNo+2n-60GekbHK~_GGEC=_ExDX47o`=i1a{TxbqI|%^7?@Up z8Uo(2U{VgM9u0>hgAoM%N{BZy20==|!AWv$NfYcZJO68eTQTG~z(T0HU0IEvERR18r=VYk^#svZz`UVK)qDq9@!y36 z?!S_N#`mz|PcbQAQj5uZn7oOJ23qI8k1tRu0Er)Ap5XK{MCX5ib!A;A5kw7(S&q<& z=dBN0k{I!yK`w!xFz!)zsv}kV;?JWgJHUmJf-6Ye>GT0^Ofle8` zU}&{xj!&J+p1Sq&qJ4vC-%zx(MN3P%`(wtXAoG@7xNvcC%YJdo{twqLb`OZ%0}q)q zpAVbkRWCzp$x{CeHit;8br~kFwE-r_aQHGzo=fIqv#-v+wy@!OvHtnh5xHO2e+*(^ z-$dqP;;$ilEp5d6vYlK6?*?3&fQt^D>J<&N_8?k?sz&mVR`Pchn1Flq2^kF$Lry;u zNlI37VGiy;@Z>&$d`H1FKqQ2G$lk!ALztj{yJGkeV~ZRuvcvpwe7hTyosdYDL-K96 z!~D;n2s=+Dz64?y!ss;p#6;1K|DX>2E7cC#T4%Z0c%yMi*H66((&In|tPUec4o`c3a%xP3t>?B1(n z4!k}jPTzW-EHM1pG&8M~&+6Ay=rg<8OLjJLm 
zH$om}?b^6x#+-#>8lG4&*GHF^(n0sq@W#U26mAGhR#IftqKy>octN-$+p**zMW?dR zB^N2W(LH$~dtu2#ie9DonGcH!8!-DRM^gmfx(U4V@g z?dtLzr07%^;37qJJ)e27s4RfIIY(19bvSS93ovirx|u`MhnDn^FV7qDFmITn8FTWy PF~=ygeQJXeIeGsJ0D>V* literal 0 HcmV?d00001 diff --git a/attention/ops/__pycache__/flashmla.cpython-312.pyc b/attention/ops/__pycache__/flashmla.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff990d84d1fbf0854ccefcc1da13d04b91f28bcd GIT binary patch literal 8343 zcmcIpOKclSdamZHDY7Z*{h~CkvE&DBJ#3H1wzM8+{E}zZGfHOc>}Ivjwp;9`MX~u% zcayRNI?)CT#Ks1&=Vbe22M7>5zIcEf<`5vrC40z)94{cIcLs0}3yL0^9d z!9RJo99GE6WSxJvBn!%*9Dz?KeAv2jROyzxYBl7k<;>Qt4%eM>_pkhmL-y(p594xA z>p607oA$mo?d|r`4_ptlnKRg?eMlL0_B{;y9%(bmmQ3umy(qWQ*N*sMvK*DiK$atL ziW6-m_)eR3+wrptc1=NAZ^=>XRgzw&se=42@EKd@#)%S{AX)${3vA(MF}5(Q0DvWA#fB_o;BGB`v`6m%n( z5p#L{^1K<)6r-SJ6Ru+8H(D`x0J2RXyGsnn?WA+-oZ)GxdDgvg-$L9ozwtLkRn1I` zNmbI9GpZz>r74|ea|V@)Qc9JURArK?skEZ5(>2NNN#rv5l&Z*7E-2K<(NtD1B$KH` z3OJ_OVj@9{il(P>S(?+RMD;>GpVJH&FKLP88kJO*8p{gM)(x6VQbW?NDX=OGQFNs0 ztU_V?)5z%J4n~#hhLXiqehAAFT`HthPzg-~@{c{2K^A6zgA~NmKRNyJC#TPRa^Bok zZ*Ew#KHxFgM#+AEX2!_mXY#51l)aa!>xJX9C(_5288x*ulV3NMbJ-c(#EfJB>jtns z17|toiher3ZVreky}n%S2((UEU^jep)VxjdZJ1gfe^=A6GW45YyNmO zda)Yrf6e**0x19goL`Z1PfzQDMQrKEJz6(ZULfYcLpW1AgiCoz`#q;1Wo3Fsl@jSS zMOB;6MC$>98sdv4>5eu^S7$3Uh<5~re~4550J66g?^pXO{AtVM1rrInCSpbs61W*r zS5{RxN^>w<$cU?=kxN5O1Vs{;6-gG;W*9rLtT$Iy%}(Sve?m+b87b3eH_3{o0XQ8Qcnhqn)WlN?ghF#4%Jb+1h}XRFRh*X(ha) z0jf6DlDFh5`JXs@xkTce6oG}9ZJySy6*Azv^f^|YqD_LFqx+X6Be5*%sSPDQNo8X_ zj}ZcV`gAI5ym!Pl@G06x-ZoM6Aehdt79^F0f>(yG;?dCqjE;Rgt8`|jp_*=7$vMC| zdKuyk&{_gd0TCxnS;wm*SDX20p0(OI%Vv}UVem>2z?f{9hjLsRU0YVN)YfQPbNg)ojFB11aCd^LI7dHDvM|1o z%R&Tyg@d&enC6l~9Tp}T5AJqw73O1Yi@;SdR9{VHWpuAsVXK<7-ZUjI={iJsh}&0L zJQER1tfh=)*Fr=+GkfSNb=PIGklBps5Tj%VjhbQ^fG|WI_AD1vFvyHzNU~%|%}28z zHAByc;4ut+l3s>6m)t3z!i;k;gZ0A=A-b z?E|zUR+mf46`i&pM*JZTU4m?rygWvN1NSEGO@0&YyS;vEeJeU$iB4}tKd(eTuL{Fe zVZ15~Zu1B(h_55`U5E(Xw`Xt7ZiQ(jOt->wmGE3O*k29qss?-6|3Eb;Y)5z@U=iLQ 
zVA0IvR!!c~UNgL;<`QYqK&>}JXnmrb%7{gCu+3?kA$Gv*bj*mJOBxyJI%tIi^8*Q~ zpi8RRAuFswT&)+=ChzEK0;9K^u*_Vt1~>>@GQZhVA}elh!?%lVHo$S-;7h_UJ4!)~ z|1pt$F8cWkuGArO1yTxOo(9qC*za%X7dAfy>W+(UAYQ{>ascOPa|R+5mR1@Ah}#A{ z>`ZN)w-21Rr^J_nC4VVU>L`VtIBuTdyvU8tQl!+$Fy8CeL?F2%tweV;K$QiI<~745 zq1~EsUW&j5nYF(XNLHJy4?LE#Dnf7;S0@|XI#gV(alqmfb(9OHqpmOw16Pojt|@kf z1*1kH=@vv96mth3sBm?{3>*M!;2Bzkwbus27USK$h{ZYj5m*BSYXHB%Zca&Btoifb z=L+A#4K{F5_xbi!5}fhUI^F3tTqf#FjpXT@kZrQ*Ihs^+ZX82jhM|`k zH^%GFF8<;&ge-`45C`*lbsZckb>`b#;*@hrQYxqhMxw)XhU%-DvEP+*O=oP^F-Mnj zId!@oXmd7Q05#}Y8sc?E(rbvK=5Cr4C0>`NEd#a_Ks>+@?f`jBN(t!GFuZl0=}Bqo1EYeeToq)0rA3vJpp<0GfOQVx3mTh>QO~1oUR; zr!|tnWifr6Ey9`0fSed&qWW^K0M88oxO8~rfQ?Qa#>WvbDgdK8J@o;!8Ip>ymU+Fj zWBf=9WheufNdXv^mU0DyF6Y*e29he|>#*!K;%tqvfL43SqQRr#=;(r73lfVp8*(jI z)s-DYnr6B?s=G|rGMIiXy!c;8*X+>p{2w=tIkuO9w=NXd)g;^jl&dtAd=a8~U_y z%qAc+dy{K$Gqi7g4BLVAYJ0IKx}zr^)#5nq0A>d=0&$F*|}8 z19k5~3D9N1FvAU^)-ejBRx~6V?+vuZ-P#8*VFNxoyiWq^v#xR^*k9&Ht3$&N`1|}r z&;9TtUu9_OW~9n@midvFLDDx;4qht9E{e{95}VtKEmUF)RUuLp zdaJ?!Lzjar?_n?#TKeICdr%k+Rf0n|y*Gbu9V0^YPPiP7m3^`AVr2APo7|yF?9f*1 z!%FPK*OD87m72(nG?25AsV7}DgK&AU;iOjI{aN$wi0aMPz+PFFa<}ZN8YrLnk4Vy+?hU`#^ubxSr_8hIBAdF*2=MZCVvv$00 z=OjHcwFD7Srm(vyy^xtwuIJedc#%UVWcE1EV~+BO;B??>$Q;cn=-1F~X!ua3eF&M! 
zYl;qVNPp#eI8=8R%@nnBFz>hU(cgs(p4*O+XwO#XM5S}$;b)ILk1jm@=&QuD&NG`A zs^Ol^1!hZe@51~-JOCMfb1C>)k;F%t>omjc-QNCbLqmfyJ6VhS(}r$8(E6}ZL*udc z85G5=lu<;{4A^twN~gn5AhKk3vH16Ox8^E4F{z0|!rLa`;+t zO1lgl7*q7W2SHd~j^nmNgp2$K8UL0X{FaRWTSwoWKY7;i&Sv1bH&*ucZh2!BZ|t|; zF?iMY@BLkH*K;~i366nd9ot>@4YwF{=y$>R^8?eB;650%Z?fzgw*paa;GzGa{OIh{ ziN`-K?>$zb$F~VIS`!`6^T&^lKJ9&Kd^u7+_@m06liLIutswz=lt<^Dj(_F*@<4g= zWM%IM+XNb|y=2#THAbt0v6qqkT%g)Jbob;JCoM0OpN~UimV!{aV|+1U zbzm(-`iHGBmIM;+yyLrdz(N{UCkcdazJL4Vt&{iqAB^4~Er<71{Cl^f*q7wU_&!*r zx+jkRu=4o6m$9F6+-Q{-{<8GTlI6w9^GMI_E4Qv#eykt^gLg;27`1p-g~&C`xa?rn z5b5r-!mKEecg7!Fxqroquxcmi9=I!fAy`pX?Sgstj>7J`S+$2mx~*PT>?2+Mw@bH5 z_vA`+x7E+u2AbLiS=$iqc$gJONM|pyc<)>#LM^rfYqa&o$Jm5F^N_CIm*E)KTiYp= zp9dqig9NAN?)@BmUbu zA0Wcue-h5Ww@UXs1MTuc@qes7r}a?vz|1pHCulwNy^g--%fS3B`6nT8fhYew%U|$$ Sf8*sa=lKgg-rsa{kpCa;@2IK( literal 0 HcmV?d00001 diff --git a/attention/ops/__pycache__/merge_attn_states.cpython-312.pyc b/attention/ops/__pycache__/merge_attn_states.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d229add0d9d05c359a47943fa764f8538fd78da GIT binary patch literal 1952 zcmb7EO>7%Q6rNe{uGh}`FAfn2l(?jAa6)4NqTwg$EnEnZAcaFz$#OhvVt3cOnjNP} zaHJl3AVMNoLb4-8N{>W|gpiQn#2JZ8f+FGW350scZIR1`1LDnkow!LY;7R-Deeb>b z``+96DG~`I5MBJNxF#d?I|m(!`9|vpFzSdPf_1dWM?uGiuqa>@LZpubG*p7Q=Y@zp z{8A6U+`}L2;a7V2@p=#;5+b+|CE@$hqPT|B>IpyId>T9Pe1%f2%(9lAV+E5Mj%-$# zRbfuVqFSN2n)PAWhq|pf5xY|84*8IW!tL}dw`Hoa3N7cm*-&8L0DP^#K>>@H!Pu6P z(6P=&A~3Oo3L**I7dPbpIY)#!K?WSOrNmEgPWF9FM+H=st7sp$9rQWC!7iL65|MSu zkT(>JAf{AN6<397pek0SegB>d$jb`;TUI(a?_iVij?4QSdfV~~C*fffclX_V9dKawNjmEX|RaN%PB2u%Mec_S&nCxSZ1q zjY=vmk^>#6%%0{%0vPjFF;2Tu^XqQ`4hhM2`@DDEMB&> z=t%Z*&eAA7qG?gy zWA^8u?jd&oOYh*F+1=YbA09_hXy_Lub)=+@ zl@}kX&o@3gReWZejo)IarrQ6-dk^0FYUBITkK?U?5D&XZ2!;PtQ0N2P zJKiQb3g=2oZBC;eL7m6m_E@DOklxc#7s_dX{(~WDLL&S2c>cWL~U_0C8xOA(_7Q-$J1Pfve>HtK$9vLVkmUb8$BE;xQmAL?j|pB4uWR zVGwtfToW$1vn96do^Vs1yTp|}6CQ@Rh~%CMJBqgM_iR-5LQk&bEBhw`44Fw&`P7!2 
zgEizmAnz&h6(D`Zsx)Wg$wFmDmF83-ud1MB;7DG;+!Xu6(j3ZwwH_n&LH@ENo^S^C14+%J;R`(u zmIZ6G+l5BQjff&$6-v^KB7j9oMK+wNI4=lSXY(by#VZJ6r2uSBt;mHdhUcQCKOzYEnVE{3NB1ev&K76X zFUbq^!~dKdQ_Iz{YO#7i&C8c1^}yWh8*d(-enT276+auR&Z}1{Gh^3ErSe!K0%H)S zu`1e*QJER7&dWhiML`-6W8i~$o3=z7J|lKa#>Hfl1tIjX^CympsWv^o?Wogj^mMl2XCRhn z5k|O63X9!sT zJ$i9>8~z@!_z(MZ5G4LM@3!+~pE%Ss)1u(X;WnciX~WO8;p05^iF=#0$3Xi&$RaIQ zG-e2Kf9p08b$P@CO}gUq9l6(x@b*k^cw|YPm1nkPG~7yYt9pU^(F(Z?+tWWV7f8-E zp7TG=0`KW@1Ro&y3j|dJa|muA_$30uK(fU|YXO=&al}R^pUs8M_XZjk?s9Ip8e)68rHW@gI*-H{X{VC_t(*Wi&qeT_azG5n1O z8j(W2a77YCb-oIFnktVLobm^B6UgS47jWoM3Naed7|pc+G8&6;lBGU*iteID<~Vr- z@f-piq`VKoegFz;&f+CgC&bY<7?U*uZ%r zdjH`EZ`S)y)*|oLeD7Mm@Hrg|PR#+m2qYQHq<;p)GHD9JwDWQ{oCUCKk=4i)TDap9 z=?1mv)}R_hixxSJn`(BSEw8!JY2Yx>A-Xm8o-G6NP*2jH9?jFfr|7pukFo;p}V30fLQS)6uCXJ5g)F{hu7n| zI-gr&^knbKr^}zNov0^=mI9j{J*(%IzrW&bn9e74KCSbe8+^9TXV>{%I^U)93HbkQ%v=mFbmSqKY<0eLbUK-zOIEuX8I&}dB5gr zl}AL^J$r&I#A@+7Oj^+`(k3TV=<|5YGax&fD(~efT`ZZ1Dh998WiT! zU4+arGMqrQ6j%=y%rS}&sD##mn}X;^D;LvRTF$9KXFOB9gKgclxV=EUnhzI=7K5eW zhs6-kf*P-dv@m!odM{!1fG5N+G>vGR4_&9pHea9f8HprdYJqKgp_?j_Ebk;b2?$S3V`v^psW+&om<5)#Tk42O{upM(ff#|4%-) z`&&NF1;=Gv;PN2^XwJg6b+xs8jxG#lXCz5fvT7y!nUu|E%d;i5SS?A}%4D-xDm$x4 zVzxMwwYb)SdGsmzwn(z@7(ZZ;^xW3IMJp1+BTCnb1u4hN7ok<1j2-d)?q@SNTKK^$`j|CavVvgWqW( z{heI5{5}e9C02~Nd3*h8X4_g_ca#%Le2@l2eQHjey|+AEF%rcoiTx8l<@@)RJW=_%MH#Dwm6@ zLMtRE-?zA16<8QvdtA@UqWwny|uA-9|r1U=W8E*{AlcBljTBT(?dG;)e^Z{e4j~} z{r!5P_x6EW|DTvF6MN%@7bNuP$1iyjPjAEq>al^f!S&e4l1EQwHV3JNpRM=3{UBfOd*@%y)_X71d|gl=#(US?_1K>0#1#+d zeBatYo!|Y4Ke0AXhx5A>(L&H z1MJkJT@I&DkEWgf5j{GrM~C$2ftSM_VB$4~L{m$lm!}xgv3qTzo*29T`lH058h@yk zIP@gb_eJ-eZm49|{Iz{YAI#U1r}R|MN@=;Ir+Zfxmlt1p-Cg0O(=P+0vv=i3%Rh2N zYsq6z;4Mvgxx6LV8D4rH1iSie|BtL^4lbR3vds`{*f8=x%Ax~lu;?{vkhOV7=;jT( zBg{+Odu2@QMhGXL)dH`~jId&FqSuQvV&!_FIbC9s#by8E3V>HvP>pmK7Rh2nZ4}sc zS_{*SgH}+mi9}%gz>Nbokb8EDhm;n+K;-Yj!*VQ5A@aw7%edR+qShMUDx>J4-m&B- zQ5>z5FC(A}9mn3Z+HrBvtf1gicdob{%dIL)lf}8V%gt*3SJ0zijR{rOu5T=dRM%4Q z2^L*Yaana&oEBPk^|i@&>-Z0qSjMWppdJxi^^7YZv?On 
z3PfOTJ40azXaJW0fjS6k1zKaU8e?PbSCqxL*Snm36$0d(gE-J&pNL+e0R!Ho0ZL|e zobFU~^K}N8(=`>_Su)T@#jpLDA z1+5ERJ4)!LAv}Y2<$VD3K$IC=J9j6t6w*8QSjZD!@K)qzWHq$5cz^1#@5E*oq`Ny~ zOA*~c9Er%?{s()1<(zc=&Iv|=qm3Q1qM|B=Mj--^Yy|CZfWV-gB)41z*JV<0eFEpa zMVIEXg17AYYrh)~y})TDL&IasxEEco2onaH*$VVLJdJ_hqU(ASCRbocjgS>8)p;jm zoIDFn@-+mn0WboMlPcKS9E%-q7vwpR{zfR8RR12@6n-~TJ$J6}KNNb2a< zM*5h7HOx==qwW>{gv$+INZk2ZwF2)%Fc3|k-yw4eIf6JQ4H=Ib5I}tWUnooPK3_E4 zC5WSpwV6DOtt?RR)nFFtD|rmLSP{usWy)_Oz@wSL(V|9{kTHb-rzLgBj`vV9zDJ<| zYZ9dnYx;Adb1~D!aBr@OcMtx3@i_tX1xq|}%7on;FrH+Go&~+!_ZYn+ZL)}L0>U8z z66r3}OHm&IuAic?MfIAr3-k^|mXzDyU7EYu|2%}PwtgxYr;p;fAw8bPFApXQ_+~0& za&X7uA#HjQ^P!g6TiQ)+^}{bN-MLhY6(DK4FPlN6c#`QcLx_bT9k28(_m~l+pvbVO7*cT( z2tQ8%W{(1SdvO#!q%)(BOIhJr)*;P59V?L*bBsky)Q)OfMyU z&S3nM4A@_0%pfIs0xk+sG>qenP&A6Mh*307qDeDB(GFYF+QoV*XC^5x@f!@x{HkLGpTLn9*EJco|`?-2eHL=2o?Is zno)yI4gh@94iX1Am>*5jFnN*kp;3Os0#3k!NP*q-yOkM2D(v_uf>adBqAz;x^q4WE z;trKSs)KZPn@Pk{+q{uRRwq1cRlE~0Gf08)R(!jW>T#%Er1~5S`+>3zME*EKCteVj z2n-NvJCj9X(aK{-vkzVD#;P3GD#GuI{ zr=boBPzU|Jh|>PgT!_+|&06CLBiQ)fD=9MW;MR#MjGqlhov*%%?1mO2Mw#&LM1@be zvud%V$hd_Vp`rp`fyKN4Z)T?YJIjo~zCpmQKO>LM+1;LyQ5cvvDLF zh4)*dR<$G0Td72=G8xMr>r*AoQ1lggC-Sh&FnmX>@2bb-n?U006$LkTI6*MXZ+(Q> z^&7(fH`(_U8QEk9YHVVI9jLPdkJ;V-;U3nxo*I|j;Ckv@&ttCdEBENH+0a++_rQ6H k)Qt$-T*2SD!px9K0DvD|K@-W}?fQgejx)~*y-}zBFTIRmI{*Lx literal 0 HcmV?d00001 diff --git a/attention/ops/__pycache__/pallas_kv_cache_update.cpython-312.pyc b/attention/ops/__pycache__/pallas_kv_cache_update.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36e1005c76ab97910274f8ee32a00afe60d50f53 GIT binary patch literal 4764 zcmc&%O>7&-6`m!R3gPijreTln_a2BFR6o4aIgHE4JW3X^X0@i8Xg6F0~{# zyDQr=Rt8%HsDuQSg&t%92BZ`L;=m5-qk3xYz3f^}Nh}l~h0%jTOXyqWjy-+Vp~f+CO~@mGTgJ;j7o{PoWEzoD~?Si}+nny9tJ1Oc>7 zu;uL&B!PK$!I5`PxXeB&PVd7s zVco2U_1?3wG|0DDQoAkddkNqF5`N$%d?)8;U7UZCW`iF(AJi9^2+R_x&=#h~QiS0Y z89`uV=-Cx<1{yNM^4Bx2U2_n2s2#rTc@n#A3eC9`!Wz%jH)YYdWr+v%@i|I$QO1xeanH+EiCI*C`(JuJYtKgwK?C5G^SLqO4L@G 
z1}jxKUe!Bk!z+V@ct=p?=xR>dN6M`-5t9CZFb6!y#uGkuw|;Q<&lmdyctj*afJZGAD2u#c+rM zyn03;&rEY^MxHHX(wU+-YuFgYu(2}CR~Sh#oQx=P1=b*MFg#4k2pl7&f`&&H_zWkf zC2rF27iRM57ijKAdir`gdo6E{2ce%~GFfhiL!=FtoRJtMlQrxzmpAAg19(n0+}DI+ zW*Rne(}?cTEpk%Y95D#qpo)`|GN+_@gD%vTtJ^DZg()R#cy=_-8vY#vOc@!!Ny&ZX zUsw8-yx1@D;%l()DNcFq=FIT%W7ET2zrbJX7iX1hvCw~A5c2(uqHqO;2eK&2{k0^5 z_gmZJjL0$y*9V@A?`Ni`IjO)2QVQ0`Z?-%DhgU+?B#OpLmo|gpW#5wTF0&C#mM&IW zyH-!FoVp*v2&s6Bh;^@NYoe%DHtsYr9vPP^8 zZM5{$V&&k?wRqz$K5?uS9!+i}Pp!*Z@@<_y``@@CvCkFHuq(D9aFF2^nJF$U^S3yu z6KDW00f=pd&!i!E3rwDqx01l$!oVSAlX{^s1r2BIw!kxBUSNZ=0=IV6fT{&>{X%Cf z1cRR$8a@*ZjX}K(s3Hf9{pHs%Hs_pk0sgTy8n2-e)ivD>7r-`(en-4SuJ+ned*mqA ze#;36hTP>Tbu0r&!?JXbK?+FMN^S>1Ssa+r082e|Sb6-1(>!GZ^!J0=Jw?oUlpREv zvw;8Zwozk$1GfQId+4-K7w82{soKGMdKZs0%BiYDB~_P7O_C5H4Rl%>=X|Pf&j~d^ zeT{xJV154{?=|@8P-Ikiz=MQOZ3g;U5AC z$LUrH_wbE_drjxLd{LTB%OaEEfOfFTtjMJtM#M~w8Af2Fvlrs~0s}c^H{6(S%G9Mz#1Q2RB20zpxM2jIkvTaPlDya`H&llEZJeLS zVHpV@ui=0H%FiyFR*qvM4g1L0FC`p;MsOUfJclhFSEQMYA`M|S`SZ)?FB`sbF3*VB zqQt!qW4-ez}4%x`U$Rf zo;(R3z0ridl_}=0@db`G(Fw-@%AvB*q**>M9mA83fN8`yU;ApYKCZ$cvznm6$ra;tZdUZ6icQtlek zyN1eLC-klpPpFfdl&2xT>aYi$rFURrIJ(@n)b?<&+?XU-&r3+OWxr3XOua@=8REJJ=fDtigX$*|;2N$m{T-~HPHpB69 zxK9uFZ6*(EuMBP^hbm3&l^F12TDCa>RF`suS4M29u+uoig*>a|Iz?q?r+ht~<+dqN{mNIh^k z0BqfpGlh&&EDCZxf&n=9aW?^b@_9ggj3D8cyLJ!a`^Wlf(JSFgU%MJj7%}$ED7+x| zq9RXaft+Y}-~Pm$+ql=>PC{1YAh)_Hftn zI;KUBKX=$|wrU%SHCK|I-^Sl0PJTlN7B4PbtU90%kjob|DL1BQ6bLW!3w+gwsUT`i gtfp2{)exqlC>W_W;a&{*rlxH-=zc~4&)mR&0D{Z$mjD0& literal 0 HcmV?d00001 diff --git a/attention/ops/__pycache__/prefix_prefill.cpython-312.pyc b/attention/ops/__pycache__/prefix_prefill.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a2ece7e27f09b3f7ddfaf5e0a82f15b87c591188 GIT binary patch literal 25870 zcmeHvX>1#3mSFJ`?^B|v(>g5cvJOj@EuWHgTeoG&lH(-9rbJ4nZWn3E_7q1-PtS%- zddF0@2d0Anrn3PeCmFPZ3AWvhUbJUtegxZ#1 zMV6?N<#wlg=SPa*tFPYozWcrRef7Qf)i3pW4FT7@{9X5zNrL!a_`qM4iNLcaIYHba 
zoP<*{Ph7&gbY8L`y(E=jST-+PkYAE7C@v`^gp_beXLWoAsk>ADsf2qXrB217#F=qP z$NJBwU8sV*WXG2W)BTi4RT7of?;Xwv2yH-JRN9@VZ^_`d#pbUGTPcCKd!XG+U zmh(?NQoD`FEitY3eFTx)#0m<#~s9dWl@9k}^{7Jd@!i-QK0eug?M9*Xsz* zAOM9!e60ddCH-27VFdt2a*Z*2Ac50mpR-RnrY^helk<#a@#=#8%2gNXc6sc}F48`? z!svR3?B~vRb@Yz*4i7QP_JQG!e)}M!;{u(7!<}6NOs4S3ey**vv#XQQaLCc#OI>`r zA;vH|(A(KN)MKCM9qJsOU`$x#)kSh?ex9obSBV6MbU|%AUPe3G-+Rv9+0_B1G3pDW zUG~x5q5e@uiNWr3#~Afs+eQ0`eV}WI(PFS;>>~T1gh=~fFI1cjv|XHPhUW7ZDr_12 zpZy;Ts%39Nu@c0rlqX5uof2NT`I`5jlqM;U?LjF^QXZ#{QRO$FT;B;n8XUc}i_%fe zHvpGRd2$bWZ5sWlm!o$|x0Q05I!&Fr0X^nxp-xgQY^aqwL}pMePAN_rq~tn5RZ_=q z0E79?2+GJvYXCetCDg%B6MgJ!+mp5?N!z|BZFQ2igE~%CVr}+JQ^}{n7W+D>Q&cBT zRDUCN64TOLz#*P8V_q;y7QC~V4x~alj*80UV|-nLGEw-_EYPpjxm_H$YmP^3N1}y?i}XgQr1I zf+1O{WPqxik$BDgqcelAnV|k4iRss8%48X57iR?}h_yi>h~K3K)GYZx+9-|`Os)WAro!z_6PL-g};<+XG!`CawEjzt0frBfl>KW;tx$)B3C&*1? zK?suEN*$!?K2v|17>!OMD0}ngAV>XzlzAfc=ojb!A+6Y`W>C$Pk(VwCQlh4$Ogx6? zqdi7JWUTp=z5!|=jq3|Bcj^SSk1u77mG_iZp!94liuib{f30nM9;`GSya6&!6(^~7 z=7RAhe7tXvtBIhTQjCL|?B{8ybj?$ijA3gaNPB>f@eQ&41_MF0&UF%mM?qCl)pIZ( zl7|H&CD!}0wLw-ASiO+SdriE(t)kjtcroYYN=c5$@Y?Wf!A`!JE!v6WmtG{Md9i*J zksZFoJ?f|8>99w|N?Gd>t1q9*cWzO-HtahmC`DX45B>qtIU>ldcxS(?^{kzvMyQI< zG`w{BgegR3rWcJVH6o@1le&2XN>bi{{`HMewN4507%7|&L`tJt*V5(|md`}_3RfId zE%Mgdh*O&?RnP;aB0V6~tz{zdBTngOx=$0@j=%fP3vw;rnF?BZo}2lTD+H)ra->HE zsbOW5eo>>sGJRtLYO%7;Qs=Sl$=olj^-&knN(YYg^{t^^gu?z{`IfH@rQ|!cp5b{R zCGm;ep|^I-{^NVfccPSJJI2?>#pGoV`5qUOycf@Ae2Id77SnQSp6(UpdjjNp+&8f{ zDptd_39+{FCHjOMQ=HB(FruT>1dO^+j{DYn*Dhd9*T!&Ee7$SueP@HRAO1p4thH~S zg{Ul~gZaaXcXp)H=b&sp24){Nb!yBhTOxH1PqSn8$>n2Uj*(F8T;aP&Nhy}Ae6I=m zT|@`0o#ED3!Zoh%k|2%vs^b!MoO?rIYsaqhsT!(&+d4$@=D)nuIwbXevBx@O?IOGz z^s`dH_9@$!X|p1%0@C`Qr59f_?>!sTe4DcYwPQ9&>1;udi;Ys6o`$uANGs>w06yDm zmf+`xx<9N$&_*I9em5&{+QPT%tRU!R5v{lO8g*)o<7igSQyiU|2Y8N7>4X<>^aLfU z|END&|4}>iAI0iFDxv=<)PIs6{$J34pc~WXM_3d3`ke+WDsCvu4~JT%ma^#sd>l3GNbT7V_a2lGLjKS($7f78D)QB zdr~=`2&g>n#X0-3mt+*}_JO4-M&((sd!`)oE=D=W2UZdR{StHs8)5AJ zdbxD);JPyJS|s&YPucKQFJnk-&Q99rm!|BKY0n-;lZd}E>0#7-_;M=fOj67VDCSZW 
zb18}y0mVv+VkJefB%oMIQ7k!kZX$aaQ`)}rSF34r@-5+C8o-b{JfyAcb--+8wap&82iN zI$hV8e34j?E-o@N6^Xm^*V+68MNE0Gu_)sta0~DrTO$ZE>ZR#vkA2c(Ut}`aAb$rT zW1oTJ;cmaesGRNvJ1h5+DR9XGf zvZo#t+qC-{>bGh4{CwT=bw+ESUUk~%T;!r_o(w|8%i!;MswRka;yFyg=FD6A4SmQF zH5IS-J~f(e`U1YtNYq%ge)g%sbaNrF5NeAW_O179S@I(L4$+oF>-}HlC?fsdQ-HgBGMkV_wIkM8e?$cvwrmVI1tw_@|)12tXU_OOkv?uwRn zN90AZyd#lgV|3nFL~e`aR7Gl9>6})`Wh!7HJPsJ?W)JNX>RtBvaG*1Na#S;19WzwaZ<$GLZN zHybu*q9sRZ{n7RAcm|;<-^}>r%Kfs<)q6%--TdGLt?G(tEWy`k&AxAB5_Qp*#uUtY z?@G`Xs1Nnhngib`r0Vjxj8JHv>j;zO=C#1J52Sb0x7Fd2$bn&H3UhjCS(cP>JR0ew@ zg@+=U4L`U1BJZbpk2Rg^9nUj}tn6FIHjY8tBHDt`Y~=LdZ)MV~j5q#e$t zy0C4tI%+xMSK??*1SVoSbIf21)zOAZ=yiS7mN7e2O&iNL4Yc7Xbh*CynM`irtNO$9 zH!lV*zFUL!)W!{j-stzfH{xFn^hfdz(z=5&LvEy?hBnk-UbS0B>wAt+R&XY;7A~QU z`y&<2w4oU@Jq8S5&Iwh8J3lE2_t1GYw5jI4hBlsvv<%aR;ct{O{W%G6pD8<;ftfaz zM#_(3t<-XTGcbbM5YK`dymKX3wrdRxS;4XQ8-f#orAYB1+HeR9Z-By$IqyqCO(7+1 zD30uJq76-0?xq-;e!7M@(;j6dp zQERQgdrND+c`9(~iFO~Y-S=2q95dT)?GsUR`Dd0N+wR%!+oDy+Y4h%?-6Z(iT3?AuL(9ayD{e=F;%^CvBVF&7Qk<+T8TO_VD z(5CU2r5HNFT(V`d#7x=G)C!Bi-~QZ$;(H@-Beeem^3KZbmGI=|mB&R5QDdWD{#0kU z*&XN(HiqQykA&r+iQ9TQ^FUNr?w6o$TnVfM9U;eKV=<7LEVnW?GJ>Aa*v5^hsno9k zkqvgeUlJM%Rcs8=hEl%_c<3buGvxfREj5^*NH&}P{|shKr@z@5=nR@~**0wN4*Dgi zT&4rl!I6+87q z20~p5O7%T+uv0{-#&wV|Axq*3chSai4Ph__t%29pyJLC-7XWp`F-fQ%rdOSh6ukt4hNYw5leiGRE?2f2&rY-U0x6OPA7H6;Vqg=q+H7o+1(tTnmc$Z^H_gUTy$b84JQk{z)f>3y0NrbaliXP(|>_bVYg74rD&Ed|l?FAhSOrlGPsn9-yMIJGQi)8+Sje5?dL<+APe}GF#tzer5-^bG~XUe5SxNRmznQ|$q zr|J?eC5HsLi*8d;wYaIlZ&vt(W*aejbsBx+9{DHe(_BgpQ-`TW){&%%YM`3f&=INz zTuPd@xs=qinNKaujD3joN9zE=-u2z>I7~g$I`ar zQ*I(cS09m0*F9UDY{j?gCbE}95ttI#jkA+!?Mmwn7}Fy-rlRIxp%c_eYlDYuPKWG^dIe*4Wo7$W2VXv{c=@`9LV7zC$|R;Naw8CuuQ`C8Tne5b*U2pxZjiVdfI9OL;-x#=YuzcaWAn0o`$v-Mw?`?DD z7#E}z8>Me;wL=|ATkVWf^*fwIgzF#an|QcIZ8T+P%k$Jzi72AbYc@- z$$qe!*zZDMFY@fx^iKTUyT?fvIr7q>f z!p#kWmZzLpCcueh0(F${qCm>31x%I*QWv2X>Fa@X2~X?0&yudWtfPsJb2K4KAWIS# zWaNGXxZEOfg~Z6fy@W*f77~|R;OOCUxyVBhtjcEnL&yeuh+96&IL zUPE`lE-__qlD7Jvr_;ZNpiK;DGh8MDAc0Ng*YC3gUSUIN8eYgj*l$yf6e 
zc&;IG^Zy*40$bUIyq`b@+Ej#o0ceF}pLxFkv^|RZ1#GDg{A=w~c@ z>-yl+%$$%pQc%0O`Xsa2uY3wQaMu2W)eOc!%-WA;GTKxezC;`A@3+u~HZ+j8OTdJY znHTD&EnsqQrp?VUOXhQJ2DkyBYeJwj=?0)jHvq7g^A^Wfa{~|>>NcS!&5@G>v|#{E z@`Eqq1%MTJB`*MIefEcva8p=$yNEVcZuWt9LF72VJ}kRm!ukV%2@*OJjIl9YX3!NF z7MdButM{`X$baTQBM@D2CenFgI4_E%FaHoME++!eNZ=e?p?6#<`BwY51BwRu3syE-HbtfLY z`JWrUbbi@PpB|-kqcNQg7{#39SAe|`Y?`VKRZMRQIs%tMgP)Xaj%`-Z1&xs--E=`W ztpi)F`KitvEPr?KnNn`n_!VG-1GfTKKns>pl{I9eRmFZi& z-t@Q2sB6BBQQyNAp!>1mB)S4v(F4E=9sn~^BI{UuUiz=W0|0vuEZu1TKOH$ULf4GY zc_We0vB>yD)O?Zk0|1AlKSaqLSRaJxEGsu;4)uY(z3tvXy5Q&o8(nbbVa1coJ`tVw zik<>3k+$=6E&7k4zrfh5_zRRj)*naSPWTHXBr^EY{sQ00Vwx@H!wja6M@NUt{je9js@5 z>cwO#3!?c8ZKE>yKWo$x#M|1pi{382MNkB;`_nJ4l*fp@U0)@9S>EJM3@N9w;HyFs z_^J@$vpS`ee1#zYky7lDuZk}rb@#$sEq952x*!oZEnOiz(mvw*5-Q`*P=?$ogZ5Zo zu5P~U#ZpS{l(QefFbjGS-_Fu5pw$Tu(JCl8Z;gPKDQ0oB(k?1HxudLrw!fGn<5flg z>+LChm40PgY=X9DupzCWWl38dw2PP(a%7@?P8Gg@Qj!2g@T~bn9m?v2R#WImsT)qfF~6r<$@kH8+4aZ{QA#iBgM}t7Ype-WDCkO-bJ= zdZ&1o*0L14w3dxS3%M^pP~_C%i?kOHA+Q&fv|*&$M;YHzP)4T~Eo~r$TAz_J1f_4; zsLZ#NeS}m0X(BE3y@kaX1eh#R?Zwj#?Bz6Kxr>Sv##az13KR=^P{ei-&{73F-%iVX zh29cAKdKOWVg@wKhJ=#9UkZN({4?OMgue;?I{541uY$iC{s#DGi1xV&bAwogXskIA zD+3DU?0&uusXH7Khqr6IcBRk>D2P4t5U1EX52K(I$)2T%QwU2Drx2DR-Ln*N3SlYY z6v9$udzK|KgUFmmE~m<1FWd)8E(LReFA3SmvfDFikB z`wS|dvQh=074ra^y|~@VUQV6U@`3tqCAc?BY1Tn^EJ`SiPva(h`vkLHIZvOu^BROy zKpsYC<_A*WK7WzF*sqqjnidWxLPr)FZ)9Is z^juwDULw6NXEA(hub6!)IRPhruX>km3~YZ_IbDI+isGw+H1H)u;RZZr(LY|?fuANU zPMotV=8o5W{d>5`YN!d?JK)-CWe+p6Hl`Z~M&Velrx=d&6}vsfi%Z_(RJ~T+-pj@8 z*87e9SOA~iE4r>uC6*`H3PK5gT;4xJBi7%^ZV>@VKq@fMC11}VI&lSL6u3{~GAOki z8-*`D4zb5xVZ6Zc2aH`)PWMWcoRJT9jkPff?gXoxJ-w=jJ%J?o^(1|Zd6PyEqYzoy!PZVNHk{Doq{u_j(JZD1USOqKpj1}bp4lgB4iEA zKbrn{?(W?E+J{*W-Tzn=EgzwC&a-c?BsP}B*TE_ycP2MwtV$i$%|;?@9d{`3OK$!b(ysbGV0z@``Cr^@Xbj^aTSh=A0u%@88tF7%cP8! 
zJxLBM4UEia#vSulU0o!(L^ARfH&~Jsp34sS7$)15#zpLf<)u~lIOfu{y%A2y!qHw$ zdmDU*lKtEz`S(~CH0&@L+(BqYJ#=BvKGAmm9DJ`5&Y+V^Qw!_~(W(;8X7+tdf#xYj zvP6dQ4(G?&H+x2zc*Tc7!-L?Ac2an_7akI4i?uk6i3jp?%x?(6FlMGpnB6$zq`@=q zcDfg5?5l9JcxjcjX);>RoO{{sbWJTeU5p+(CV5;PT95L?7_M-gWye$Wha1_m)r^ii zHJj*0>A)36IsgZO2UZyM033~XIGrvhD>HCJ9={vP$Y_6VYwh~b zZ*;`IihC^^M<00C2Vh@Vr4Jft<-V9P8;|6EWc@h*ZvNxCmM3**>AJJgy1qzpKW*%f zsQM$u{+A-!(&mDxht?i?nt$*~{t-I=NHo7WFc8z{hi*L5S3S)yev)59=hp-VzOocV z3Xk0XtEium1A;){hFK+l1 zTPADJ`@#5~*KfbRS@=K^Eo_MxTfy3($_zd9Q@ z8ytDJKYjo)rGzl?B5OY+B zX-3R3A?7$@P6#n45pxPVW+*TeYKZCzXcSnh#XR&FdeKUQ3Yft>wMvx{Q3oM(2H z28{h`k0Aqwj2JQzYIEF- zAq&!FVkiqkRt#km#kG;5V}6-`;gKr;!BpIa*c?Kg70<;`9-+3x^D$HaX;3!>9kePZ zUWllDSeYUW6+>}1bpai)rZHZEm{K8TKVr&&`lc$N3Yy>5#t$H-T!^VaOr;Q0g_vp~ z<{)BfgqT{y)Cn>5h&d$0G$5u?h&hayCL!hsVvY(i&4@WB#2iP=2_fbrVoqVl7y^di zm8dE=ei~6N+fc2D0`?V>wMh^O>?=feut;EEA+n1_0{aS)JuDK~SBUIok-)w}WIu}p z_7xz3eX(?4->m#wr#DWA&2;8|IIJ79mc*>KSiymq4S2dJW-1JiMNCyOQzjl^&CQGD z7l89XEHewuuHJ}c|2K1NM)P1s$^P{M8wQg~!gNuUF*G#PJv3C6NB#rM*o*Fq>AOs_Yc_OO^K#2O z1CYb#i58&>3%G8Q!E%qu7=u;x63MPa?h)Klm-YTpodTk(^R5>1S5PD_`#n)usl;Ux ziR77qkm!F!9Q+>y&VbgFk*u;OStsbM6Av06XSD=IAGWV+V&b+JHxk!6Sr@cB(rjf7Q|Ymd7FBRiWjnvz>zt5x2O#& z-oeEKlYrNe?zS#I%puN6@Dw*JVd5hk0$iKGN$8Eh8}U&NF_xsbz#+yZL|(z2g4+f0 z2@Y|Q^?=fC=;E(&h)c;lUgr>RNC@~s!@$Ns{7nw=7ZSpjdux7UK5pj_-%D1^!687? 
zf|EhPMnQavLpUYqgg^zTxQjzfCrg>(5SNn_ZVmx`mB?d`LjY?h8n?h97J0<4xek0L zBc!^xl~z#~*h9 literal 0 HcmV?d00001 diff --git a/attention/ops/__pycache__/rocm_aiter_paged_attn.cpython-312.pyc b/attention/ops/__pycache__/rocm_aiter_paged_attn.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..87dc2b82bbc26531627ecc58ff3215b5efd891d1 GIT binary patch literal 4201 zcmcInO>7&-6`m!z{9TgEUy&3gQ~#o5lXl(IPNO<@W7&;UsBYrcs9<2R=B_MC%OCG9 zTO18UHaD^b;tG>*r;jC*<#_^a|SrdvyR?L?RNSkVS^pjztI54#in< zExH(Nb1H1bz38UcrFbgdMK41fM0T%4djt0C?JivOJIG}svG<7NmR;MOE#VRy@J$#X=T_XxXB@-IcDz&+P~%sjHwM|%IH zw8S0p<=6E(c*HoNBkW<>3%ll)c&H!fyb6zQbo& z8kbV{*!}qU*Zuj76}#~IwfQ$+OdZR)Dz97Yny55n%LiB}iKS)P3Q$W|;WVYD3l*_m zFIR6_q1CmHU6Azkx@>t?1+6410M@#|t;%{stuBk?_;WAjmZFsi3BQj(ia?@bfi0al|z5dm(?(X50gL8L{cfE#VCD7~9 zuMO{Z*NeL_*H`p#>cQTZE1@3h*ZUrWw}xva(r@XZ-mXZ_gR9BA7Cv%Xu5wjBYdM$d zXDu%XLH=gtv>JymujPFzM$d}hQOmk4=ry5^9MIh%-=(W(g^SmIa&_Kf^_p5*R%6)N zwN{qzSbnuust6){rdkuKtSyUmSrDt;Z-qN$eOai>s$N@_tHSMuSk*1h63_^+3(Sgf zRDCFL2UQH8%Avq8s2Bk?48n@cx7wenJ=3Te5!Q?Z%mx z&#s=4i%R)svA(V^*Q&)eMX40&?8t4!T3svl7tmcbG*=7tbu|gYZ%iMO*M-BvgnnvU zAl@N%l5m;k$XqKh_bHcYbHx@{H2IjxCrmzV@`EOyHThwaAAr9vBEDbn;&oS<2(GM$;as&d*B+e*xLi- zV9Oe0f}rVYS(2^f!3K&2t-K*yLkI0RH%wNRfE?gMxn4S$^q4R7*caeykNu_p ze(a^eG9GX89wRvo2BgbyfMg9PNVnkv=`mQ4Uc(L2XLvyR4KK)m;RDGTevm;U05W87 zAj3uwByWU3MvO4XsKLL-{DOIlNHN0=L~_YsR(cmaZbY}}K-Hac7^UoAX1Cx`(b1X}}2o$|0qU2*~uW z_pgPYF<(Vx`{PH!19%L5xk#BbnC?b&WE)z5rX$;3BEK8c6FvJ1p42aueX4XBF~e^J zjOaGf1r&xD(YHuGyP)DrNzI|?v(E^sqbSBuP#zjbG=T!&IDJ}Dr%;7J5H*it2E{B2 zWYs4~`%nH2MMa-CUN{K-eE-hIM6N2!l9tnJxtnrM%vBnSUal*0uC{crTQb+sWGPp! 
z<~r8S;VnGD1Mm^aa~%o~!Taovs7it)muix%VhmNh#4I0je&<$o;XzYVX!XmLy1p(b z-AW=0A^FfY4et*mB@Lt6 zqx*xkI*cBl0@l9!;L_IubIAnNHpQO8Ewv6HRX$!8=Q5B()MXgi_)TV~8= zp?>NnL({egk(Uh5*giylk{PuFh&XHwA_|dQ!44zhv0nsHlni9;7@|1Z5{Qx{owElJ zr4XeNWe^P_%95GyGWHO{VUj=3*vAm&$n-gT1kordexI?&0NNx9*9R;k^6o+_FxtH{ zKAAexp1RPQy0A0#+D_`?fy?8Q*;m@L^R3zWo!QGfneT1Bu^Sj|1`5s7S3dsXG5_t-%95{9NkHu*u1jKPktDA%+EB7^B?CwEVka`a={6I&c4D%Xn0$1*mB?@U zKOLEPwEEj`?~I&ldWS#vM%&(@mUqbHA|{tGxdD?)nOxlDMolh8|Dw-hexDyMOg_08 zRFA=4=2?2a`r^rXuu?T2u)Km$6)UnJSbjmM)TD-jm=o|VqI6q0O&9fYsUpJ@LFya} z_cgcI8+ErME<_(RWPfcBik9i?;IR~Up@l1iJ?d2?#ooSZgK6rPUFGi=iA zI^#rTHAK1N}VjEvhO6poVV zS(~oS9w+|rgY5n6vk7dW5qauB*x5DQt2}41^zBOduCw(+_(m-3zYz=YMl5um!}u`l zvMlY>`|q&UA=m^pHuSQh6)GaUdnxSD={GPt22jvEc`c1?npoq$LE;8)@Fpbj6eWtXWQ!E3!#ZqPvSn-SML;%4@dVj`C`)-*W2RndwC+V|yk`#|?c+bpSQ+_sxp{;OJAurP5hAG%5qDBFs%_BOVzTXVo|1 zXuZe;HRqpVlc$IwsOAv0V2I63FGOcpn6&EM7?uS7_CjxbVX1d%W~nZbFYkzpB0Ry)Ek1K7&|Z6&cA*xbmooF#aGXszI0JAy*hGoJaqB&m2tr| zADy0whbF^Ig8J+n{H;vZVh^QYf+r>PJ_8bZH!_O^PSoyFQ!Ul3+oh&Ps##C!h~p-# zRiZ)G2JXJf0L$(Cth5ahjl0%$Np+h@Bk|*{lL(L|QbS|SviADQo*0JcNbOC?mPXk^ zY7?!bmKbr9!H7-x#2Ta#VWMppd<$K6On`5v|~i6E;@?2%!!i?zy6;+j^)rQ~e26qO>M z8>D{rk}lFYH^wk&HJlwTu?mc+i@3#Ek1=W0JO2^pv5)MN^{u?(ZV@-7a<{C$(yD2R zfwW31(@ng%uP80LN!J|2S(S|XJ){R};CfzpQtq)mD@J&!K2@1;$E>4$83CmggkKWTgr`Fd1M~B5g@fj3o}p zS}E|zBuEb6^(0a#ad4OV0~Pg$$X;@guHs&@SJqZ(M-1WoWxIHg?g(krH{nD`>?L~u zAFTAQxXSH-K#4u%Kq8n{eei27181V0u4Wynl2)^xR7>$Dx|WAzJp@D=m!CvW4$-xQ z5#cT*&7YgZ{iE8wj~d#?@UDFf(>{W-dM)KKh%vH9Tpz40uC0o>0sn&L^Bf!_42L%; zhe!5*hCdqas|D>sI5sEfqVcN{PSDIOPTf}tIyQb|De|}4Zkpukpl;AvWJk zTcF&Apt}woN1$E!UD{3q5kV7qcS+F2;=pbR8ej{8hXvII!FVP#dTnY7#%X**ip3T} zvB~g!M6kUUSq{ac@P96{7z5TLbTP6#A6XO(Bk_3Xm1}Xq8v_Q2jf6OP#n6OcF6A#z z!~~NVzbe>E(TQtQp*bkB$W>`;8U}vnz*%t#oS~`fY-ldR zEk@=;FqP>@KQ{>lh&%S*fm2E{&ki#V*KOOHEfdaH2a=~BS2=DaRuh>E`KqSmsmD(D z?e4YitSawpNuJqs)@J5;=bq%5C$_3vZ>_$SUe4^x+d7jc9^0#L&8*I3RGCA0dt35k z!RAU&to~l|M8WRKs8{EcCm;K3KQ`PkWR1CDJ}{W~AK1`(3jSt*n^sT8%v)QsZ?2p8 
z_F>*S{NMy{9ZiluHkxl4Rt;%YdVI~4)#oOEWgJM3Jn_}szP@&Sqp5GbJMTNTp>-EL z?KunY*`GZB*yXu>c?Ud$WX9{cJuzn^>i!O&N}-Cpq4e|RCQ&0JoK=B&JT@PYTC z_KWaC!#{80J*Nxanr8;B!=4&}cCez}H7~4G+b60_=N)tQ!X0}~m3Qn(jTWr-bl2+n z)aYiFKhyt{M8VVesXIH78@{`kZ$7|#4y4sjpj-c%Khu#t!q@h$JOAqS2M2%oPTp}m zH45`}_?|b9nqrr{vjr#8`W+|Yd+7dR`Js;=KFxbxmd(V+)!;wrpbMnrf?DCzSTyx&FkGJki9p7}GO3(7v0FJ4B^-RH8^Q}Q^GQ(#1PADCS(t#*F+fdHKOI5Hs zemZ*l#M+6>fj>FNTbsYtX#nvVHDhe&P$HR>3psdzQ_*z;?qLvJAz2JDHLF?#MQOrJ z^a%@5DQXNf$j~ROME#sn8(|bnwNw~TF4IxGRS7$>lPY4F(j=;Pz(Q;8FgiVAp`x&Z zH6)zG!RqFQ;Hom0XUVBq9n8|0aK?dcD~4Gm=2%kXCS1GJw2ACa!W~Dph6YkyoGnoi zH3(H&5QsHII}5B?Wni^pdzkU>Qm&KQs}$Rtb}5$>%do0S{g>M-mzUdX6x)~Qr(9kh zzjC=G5KTFVgSd#Bn2CiLiHSH_Be4=2YyQacM=E46gO>A8j{$+7B4_#Ze?lm8n@|4^ zl21`erhMnq{{hLMMbnkoGi1Q1T1-0)oM&;~bBJH!^fb~x+5 z*Lb&aUClS|=Uw}gXE&=I$=HD9@cyepV%$-BCfXCGI2 zzN%{HtJ<>@`Kn;@)Dx{Y9p|<61*558tQP-!Xr!uO^n7Kk=Z*Ei85x_P3Le~P$NQFT z(dt`Ot5q3w-dG2%zC*m~mVMQpUd|h93P$DlR%?S!ztzTgTMC}$f}^HT?J0O$3(mUI zf0w&pvBCdm!%&YN>4JHga@2G|O(o6dtK*_-^VDPmEQ%N;ZRLybdVddXrCgtgLR&=u z?FcM7hD0OTMV|;Su1}N%=>gp&lOdIGm-U+gY)kbm8sbYdFRF9qqA$ zw3B)olj|atG?+TlfTNWt-9;J_LDEptqqbD&D9T#ZwhHYH&R(vmRO*WX-2qajwE}BL z1+Gd>&|cy$X~d?8YggKB*GPg6vqY2b?=Y)<6(hk}^~icuo>hdVOyk`T`Cd_j1sZSl zJ9l9o2Z>zcr~GbuMQY%xCx&FJsI)s!;~kWAZ9}9+(s&P%fjL;;Z5r=GSp`kCc>ajAO&pQl`NZzCe~-Smvav#UiKDXdDXol(9Nl^L@6oqc zhOu(?FOnlN{0j4bua~^2IRD=7GJmT5JwlF@Yyw9}tz;9ZjiZi|1}LxnEt`ORTuPep z%F|bI_3@s98J6F7iAfgU8CW5@E6X$@MXNy*DO(LHtR8e%z9gfY(z>Y0OXYQuVW)2k z>D>M%;5fJ0VUEeLD`@y-rGXL=We;+sY>^@RrM);JYt zAQ?&QPpd!p6zmNRC3{09?@+cr1Z36+r5!RRqhE(Y?|L#ucL?86K*3%h|?>eS&Iwtu|*rhP`LGh%cwOi#QQMO-jDCcvv2yj1h$ar%B1kC_I z%0o2+hue~CN6?A@*Nj8baX1Dph@ca}9sq)djmE))B+4~m7FwRUeF(Y{^dRU(uopoe zf_?zO6WjpC_ahiYfHGoPg5(Ziau~s31TP>sg5W3u^cdk@LU0Vh2!c@r)Ew}SF-GM? 
zg0Zq;Ac4u9tOML}C@mTQsPG9==LxJoj@h<1M$Vod3l&9|Q`{Kl6|DhRxszDy6oS(T zUPf>R!C3$$HT`l)d@foNmLh_&XhMJ(S{Fjg5YYhOUcrX1BDjFyH3SzCTte_Vf;SLc zM(`$rw-8)G@OuDCDtoH-N1Stj^KC4A2f^i+kkum6Ufw(6<6|Cp*m-@i@651MDtr~2;ta`o%i^UW{u&X{22!>{Rc+=SzG~0sul?e!`){o;Kj_SNkMdQc$x{WJCsPO7)TZ1y-apLS zhQS~JmVtM(dwAbI-qxKw@%L&-0Bvb4Z)?q7{rU2GTkbmFG4vqFcO2(!#~)5Ty7cA6 zOOG!9%P4Pq9qbj7Y2n10Iot7bM{X?VyW6`y_2AMM{SPkxMU-zo@d)eSv$`cHEj?oi12yX~XK7%*rR>oEx;&@2wwtF!n|BgOmLJ zF}`v9(Zy#Pb(J}#d1hq%wcp$9VYS~nwR$Q&oqp@niF>nmXMbhw|Euu_?Y}(BTaSSi zM4`BB8#$SIbHml~`NS_~@6Z0qJp2!A9!bYPys)ErbhYso#kr}J$Unr z@FVxbY5wSWzUGxeZ3EbAe9p9H3++5Vowz-O3CD3+PAEa9LQDOnpa%F^;3sAa*nRN?7_*}K59<|fs$(Hcc8 z zkhiM{Y}wkw%>FLrW^rYJB1-M;yOc{S3p?!j?Ul>R?KO(+%kxt%FOOfjT-r-ujRFfJ zu~CbqncAhmCdF0}A6TcXZ1qQW;{8-rvP(J6KMeq-xoH41$KdUFirn-Jd}+dP5sU>u zP=3k)C>yM}tJnf}$bF4qy&hhP#HgD~KS8IdLZNa*jskxrw~5A8UD1rb;u_j>fU) z8N}?>;FN=Aw@st%vC)3ZvT8}6`e-uSnLCniJd`&MgMDkuptb6gV_R0n3?IGo9s z|Hfbkr-iQI!eDV{vR`xHH(HaR~_2Qz(iIwmsR^@7Bg4D z)ljsUef<(P0C~vb{boc3@uEY7bTGntMEY9D-wj4Ydst<|cH(pCuz>BaRv#GVwshedE?S?4l^)|_{Nxb z3eeIgn=>Ab6#<+=q!tN3@q@3m4qDa7T7eF(GLRbL$G%CG*ju6&{B3K{-VjrLAlE=E ztqVguw6%}urFjCC!QO)V1wumpm4-3~ZzTq4mFs0=SJD>Nld_djZi&T{vLe9c)38lP z*$7S&jnvQCE$c<8Wqn1gasoErHc`0w$m}Df%Hh!T#v_K+wuPp7F@72*7CA zh(V&7y=bqbw4gf$dT~-eWe0ryP@*RIDa-e1Z3Rk50y9*4`w&wiAe)=g7}PL^vPINR zdk=^^fx3$r<=)81A{EtlRFQ}M7j<}cnGUZFb6XKmeO(vEsQPXX#_&ok>h-!Y){CGI zK|g?Cv!FW%9=NXvswj98$0F3TgFA-hBM3$TNDjbJnFDaN?4%3LLKCr~%FTg|(1b%3 zZB&I;RFhHt8C9K8MHf|hp|-1}I$J4uq~bSDlT;g5p(F$4Be>?qi{+@{5=_x~w&c)i z0)J?5hlQqs0Y|eEo~NH^?R4o%`8&C*4sd@4U3?8I8~Zx|;I0~|P+x6xSAC-3WLnlR zb>(fXMFkXfP2Hrfpk>$49lSz8sno#RI&%K?b9``ww~ahJ!P{O2JPy=)Irik0d z$9c#2BQ@`MIr;Jv@E^U9wf#HrJG4BmYxsER&d^5d;QB(oZfwKoFVqBcXZV_-lpVbk zZ?4_U_H67w&bJ)j7=I)0yPUFaT6~$Fyrt_gcv||_eBkeq(XZLFb-4uJe1xw)^6=os zg-f6oLLnXWNM8T#%WIc2uV=6D4ZZ7oAKLhVb9}?OyyrZ)sX`%mNM^k`Pu|;`I*H`b z-_d8TWv}0PFYoV7okovC@VKo{9Y_CMEL+JL_@>^xzpqr7;nrRRzrmuAe42N+=S;l2 zpE^ITj&HRy{@!O@jLz`mQ$IKb>_F-?b>-!ak~gmry?KqJ1*0VuS+#)n59C<5cFjzc 
zyQJ56{ZAHndn|k^rWL+g`VWqo2fE>`*Ro6g!lc9r2VM^~(5_%ggwc9r2|0Q|N$a z24P1FGd6ytBUVY!G09I4Bb;^u(m*ihqJlZUBhk{*ulp8Yv!Bb)uLM zi%!Min*sP#gS|ExS)qP!_n}cAa73|x4-pU)oQ%(}?9H}aT=*?MpR(&tck;|{apBYr zZo^Wa9D6MJtl23y4(@1<0`T{?Ra-iohvR0)xUvJA<(7HXobG?$w&iiaYwBfuZOBD~ ziWQrxfxG!e2;_1!2*)pBE6endnkyS zF&w#ktxGFi`4KWTwZQk)3*mY07trJiuJmz;z}-~O=)5wS$W<`9>2efZ2w`0u&pZ~L z{+XZA1qw!2`T}_2?}slUg6r>mt5N@7CMWk93>pZEaxEn&_c_FNAVa+(8A?Jj_ZQIc z#&$Y1{{7NHI_UombOi^!kr)(@YkJh9zA&bKnfW1n!>(AX`01&#mXaR$R4Zo%O^urB z9(}UyS=`3BL2)C)VP4wGrI!FVYFUR@+Vsz@9n_vLo_=^CP`yr2Q#|Kzdr{g3Z98st z?$5Cj?r`p(AyC-aR5=m@hJD-i4ixvdTWMjrUqa;!E-0R!aD>$})|$+juj;$``tDqq zukZWP(zl`Q+pzRKPv=nbzkOnC1v6K{>Zbo8QdO|}3sz6T>MK}lXePux-`3bnW?=73 z?ccdJJ$`~JYG#L>ObLj0X|O>sghFg|G86(nG&(&EKdOKp(UWswjBw<%3mVSTWT6|_cR^;#T%AtEz>S~zg+hzrg$VS9_Ydai zmcm>tB3MG9(r-cFNWe{e5l5n#orxDeDl|<-K}!crGe`Zui-#N4n9;YA-fVc*(UXR5 z1-dl%89MXg3L1|rMDaHvUgZ82)C4|=iJ{_mOQTY$zCNf?X}>Wus;XZz4Zmjg{s*RO z)7-LQuKmi~!kb(2=C-6^(^Q-CePycUO||gxmZ>eN-!d{deeBWp*tx@oe9sdnBXt7NN}Ci}!>KTQtc=QY+<>(+jn94sY&a{!a2@nQ5gf1r6m zrQag(<34o!Tk4Q{+c;^m0|gK3nsuv_CcDI82WfJT*!5nT+$YYWnld9K8h5>VnEoN`BD54Nr@6EJCf|gaocQ8wboIWVotYZE7DqZoMuUX(rxt)glf{UWa?)m zwb{lKpaKOtp#?0V2WdA0+rdM2=&^?l=yAX<<~(2m;X{W77;=;AY#4g#_bAF{Yq@Fn z0KWJAf8TrWxK5+O9g-{1+NTUq2MAnc&l|oCQ61_xI zNI+U#qjgl|hl+C=w1^%m z#)jDOszpzsOQY#4oECeBx#t^L;y0*_{D5G$G9fo)8!l&?1-nvqxk}Bh)@)a*n)*s% zUG-7XM-5ANC9Ad)Wceszi34$!z?`YuHM6|T9Y>AG4r%xf{{&M9VN?t-hoDAH09fZv z;f6eJ_!KVfae13~@kWsuLdPrS??J|pQ5S5uhrz%c4zGCDF~TgSa0t_w!8{f)tDzco zPZ-EiXai-KTV@E*2gJ?;0%aJNwN~?0-6Rg(cv)Y!)oSI}x~W!H)XQ$Vq?>njHE-Kx z)w1)pZmDa!S=Nn=XtJPTl5;XiFeb@UlQ=U;?z>{SR#NTCnqIcREWp34>E?#Is^>M! z709Amg-zX+t9eb+HILy@ETaC{(fI7OtX-;RtA*;UorhTL+4b6$??1P8Mb8?Ax3bj@ zd$m%|{>m^)S%^|E+XX-?RVzD44fVtZE>$;NdPpR?wo6%>*m|j zjT^n>xsK5`w!ho1G;a6gWGC8=c3*AB8;kp?sehzqoz(13VK4REE;Ig+L+n(yxRc$T z`hmmzu*b)Ge4@vn3Cv25PXpTh)-HekKVu@x9fD8f3fFVj=GE61mvXNB%C-6V+`M{B z5pTb7=ao0^s5f8FE&EqWln}WEFFF4Qa|hXeiXc=Z_xq)xQGpaXmK>360ZH)@HbX}? 
zBLT@#HjgEv0m}?Y$SRnI}7wHkY>{sZQ= zjwBTw?4NIx?t$#j^Dghze?VPA~WgpWDS#*wCjjxXb_a&Rmgr_umvIzwYP-9#B$yMJ+R^dG;l2v+EJf5 z37AO|zcQR=0I=eak`44Qjg+yD)RwyWJ^3-{0f0@nr?-_o-%z|6K}`2sXh6L#^U{GY-^#p(7Lgm-nrpCKIe?jeKg^Wzpxj5v60*VDrP^1&R_Vw zzLWc%@wchV@8?=G@Bic@wvp>iJu!lMKPGP#o5k*#ZFVm<(^%+5mDWP@Hi!>_`w?kt zt~uBG_Ip2S%4=!zAaT4G8O?aOdxA-RCn(VIb zOgiVEb(Cj6EI7*ZAE*A#FWv|7Ogq!PwmY5OOFYwfxhJ3P799B^oY7!&q!Ef^t%4&? zZ)Y6wsRNo4p8iBkwBB^Y^9O85nC^2Zk~|30LbA`JSn6Sflek9iVGK!ea&-T%fqoLj z$67yamf$dA2_F#St&7cUH@Cg;+uIOK+8ua+uW&Cq{rJ`Y|<$F}|qQ=g_N z>X1j2^l$Ve?Cj!G2U42i;BvQaHgEQ6fIk^acdoXt_E~`8NEPqN(hn1iN8=M7Pav2$ zxhYheG_jlW$M6~432*`0kqz3a6-etU^)r*$4~utG~(5<4Mk)ux32S~i}sv-Z9;Gmc|( zUACxFydaTS6|yU+x=2U_6^Z@~5)ws1q7r{RwWVD~kuU;&sIm=Z`&U)3#5Yam4Y7XCf3Kr2a=h@?%NU$Cvw;dw4A0lT#?rU1zdkT(2+7d}`%;}at9 zSgyL3m}K+;I(j+DXujzMNfPr)s+dhHGX*J^G-)O!tJxGOqZIO*b6#9Z$-oGr=DLy2 zE{MC_wQON}zR7XL^z3ezBfpSKT^G}WtU2(Jk~7Pq=4=c=?E2a(wjtoAIdmcuu zv>!2sp>1El?*#aLiYESVzs)qU(vfeci6gwQJ?41_cn;%Q#_o!-D&e-297u2j zNf0SS!lOuXBf-N*b0~#*F%SOV)L6qdBAb%Qim42A;AcWb^_gs1zCKc1(mJ2)n-`_Lm`xcf(zD%YBNUfhfh-c72}u`Tb3s`s?&J^ee=@7dq7TizE-7ixCzo$epd9oF*?xA*pU%HOFl z+wPvy*Y5iQ>y9KN`Qc^vUz;S6)}gu7Kfy ztu$pAu5sZi*RFEyRjxyH$+8#NH)m~SPZVWd4sIh@3 zRdrD19>PCJG&6*vW;LQvb6{8+8&2a|cAw^I-Fcd4{zfBUjLj|aU`FZ>nL<;!ZS< z=YK=CxMjH&StpdtfpC8lm>7sw*h6|vMz4dKo-)=++4@oT!w6VuB*fNvng@(vHmq@! z`2t!lnDb~`?jyA7Z|6BetNxZ=>ZOkPA=p^y9bNR z!ALu`{*(yp`{w3*qlKrBlS0aFnR8%C)0i^P!JFXT-`7@r2g|8%{xpQg*pc)1z<5l} z`x^B%kIWSlH6Nj2rdDP5T5-??0DUpm*73zx28%f9JoU9$4(xLh*p}V{3jUVf1CHjF z-pG@dXDm1BK0Eh(Rxpf>CU8o76VD}@Xzy8$G}mvpI19A=(|FA6Yt32Y)`;NuDFNHm zldOVkB`CO8Liw)<9zfoeP|F+jJUMc%|4A9_+kq#gP2cPLR`6k&DY;tS-cJL2@3k6P zwal1=z&q?pSV=am(J(HS(4Qefkdk*9Aw(IVVQK;zp=O}bvIS@xwE>M$JJ2}gfVNW? z&;)e@?Vui@osIB+N{XlzY0BA1_0qvu~cbOkE41q#w;Iib0*&kpvWt3FQ zJ%xOBiJvaOd7LjS$h>h8;?t7I%L~O~K~h90r_u{hN|~WzS{_))@5=ed6E8mmH?QO{ zKU+|EjTu>fQ@mNsW~MU=4+Ht>w5;%Ow(~~G#VZAV0ZOxRJ|pva7_evb+7e%{2>2&Y zedD-auC)4Kv{m?5zG(~Zj?@Py#mgdZ~=z7FIkrgc(&%ITu)o&j?>!@gE#D183F2y4dSjOjI! 
zG}jAgqs~m4q3*>}OTx^pS*C@|4Uk!KqLS8_yaYLk$T$gKDoMvtg`LQzbJv9Q+2!L$ z6l)W3%T8zIvm?zB@1rq{{ePE8jSW@VxXQ*io`rJfKI^&7mAN}F-cdF#s?nh>Hu;&= z9J5K!fwqK6M8fKh6f7aG`$zM!MMOkwqloj{_ApeRa!_rG&OtUq< z8aSatQsl*qa$UkT&>Zl`*26|>L*8;DBQ9!IBb+p=5yP6>2#4`hJ#I9v$wKUC-X>$j z)WWi6&SmoTmG|bwoA4>4uqaAU*r%S;>~JlS3e!2`esZ(80RPQ`kk-ObOs53#MrK-s zLZ_I%mdR$6CCyeB8P#txDvjZqoIr9C$tfiGS4!hZcqGQMVKBB_KMuH|Xk7g>Nc}rX z=%6jMcWaTR(B32v%`yugUZnE?KXTu^TNf-Q_czTUOgRq?C--qI9Sy^|1Mr5AT#M0_p&Zek<7)&(8Gw)Kg%iFeOd zy47G`=~6A2s0Mr0VDDD2zmhg;_gbL68t732JzIgk%ByN%uyhg1`Te3A7uiUD%Jptq zCO7*hw}5iJJ6vqz4Ukpdst&%O4!-blczf`?PE1x?&3XKz&duX5e9UaJUwvo>$tHIm z3SL*d%Jr#SU**a+ckGdwIAT?{M`e4q*}ffjv?eb>t>8;TCs-qU^u0{sb$iU{{m$xDlt6YtX{p`vI z-+J#`pK?PwFBbH3YW%4e$F zF_k-Z_uOx$el@kloxaDY+?ji?e#)J_-`2iyt`gg7JGMIch&8)xrI+x|(OWs8T82I+ z5v%PlzMyV~=idTR-3m{*@kDvuj)*0(KHY)HN#X-KhsZ_zZMqwg2ay+1fP^~rAfgb7 z^ypzk5kyf$Z6wU=F+_0^Oz7>15{Nnwb>i4AL_Ij77tsLr4k8*tltlC;(EI!tAS3rB z-O?)%`I-ms$|KD$opL=L`3+%X^DMyCTsFRb)Dwz%E~7|TvS|)D&}9Vca_p~U2qMnJ-ip5+ z*G*Fl!|4IMX|rpr9yHhx35M5))`s-3!A6>F)L`52UfQ)tgOzSXM7YRhy6Ke_GZLl5!Spw(r8my0m+jPGH1H==&{q6F%^`OCq8tkJm Ivc^jO4Se}9vrD3R3n@>8NjQKBe`l2%$QMSv8)sR8tPjRVg) z$K>5r1m0dEXuXNB=ZlxOtBQG7mpVD+N@SOl+mu}ujG$QXlv37JHs#zuXjida{*%1d zg8>H6P}JV7KgzjL>EZR8p4UC^b-(U;ueB>>x-!4XNO>=qx5%TB<;mV6kJhc3El={5 zFqcl5KBac)zo|}?Y*ps!HqBN`oRYsltxcvRY%0TLP%g`5+=6ZbdcE{~^4FEM1>L*_ zJ$nnfWed7>3wjRF%aiqxFqdr$dhQnVye;VYThQ%W&yqPtzygYx)=~}Ucqe}? 
zKcN10@k+0h$PHwEyJV#|YBU8h0;krsYX^e8p`3e%d1G~0UCo)1zqWVyMBg#zs9?F~ zaXYU%FV8Pc&N&yBT<%G%iFBy}p@RGX5Sab4zD$zg(~wM}nF9fGhY+JpB?B9YPrHrS z7^2183DN0@n$r;-QQ_0#y+xMje0riM?3@Ei&=9Rlh0pjh7c+=fRyO+`C>`#BFhR`f z5e_x_3j`cY)uEA!QdiF}O}H8ig6--xHwO}hCJMd8z35!_a)Lo@u+ACRO+mMKZNcer zU-fV}59jOv1ogao$}41YGt*bRf@)#LC8(}X%nMq;oPc;xkyL9Gh^jM9%`Cc*5pw0K zAV&rK_jmxTFfj|0Z(r$;nsQeLq6Tvy?_15$Y+Jy+MnWuau2@w)Gh6OVuT6*Z8#em* z{I-aB$Exbj8}|mPg7)>+P{-q(+DEEY?Q>&JVD_=GJZj4iTGzYQ`k&|8R|jHN#+V(* z`*w3ImoZra6>HrqCq#8;87c5tyb>cWfC&o?;p6=^8Dd_Re z*wDzaF~K}GJk&ola@aXOG}3=!T(F3*<(zX)PE20G7fPo~2k3>oG zktj(%5~YJHfkq+q#p8k&D?WI#ThIb<9v>QsBR%IPTOp%#=>egg@~B`wk#}W48^ZPm*DQ|4X_q&Vp2eXh?Mk%1 z1KX6g2lVpQm3DBO()JPaS6JF1S!qf=f+hQ@77nCpfgDWL*KL@ckVCK1TdiC{?5>LCVWUvP57ipYky*7#J(n#8n=}f^H^+1~_t#`3kR)W&IsmVV2Zikf0 zN!ELtmZMyY_-xH5ufrB^>C(ONJN3!lNWtYs8cE@F#)mN!qZZFfBYrj{1g4sf(TK2irO?mF+ve4gcfhm70;=VqFvE zj=;Mdr*Bb;?+~x~2{K4Vkm@@rE2CJdVe%L3_z7Z1+nV5bO!kD*Tb~s3izWGwCY~Gi zmPvg5$Na+*y`KrFFTqgm8zrOgBwFikTC-E+lz(UrO7@M(N>pkknf@O7hNnorGzv(4 zb(@}3F4ZXcD?RBuO-`q^kSbccHc}(Cl=P8-)F$p=U7)UM9IB=JzB3~CV36{m%#=z|uC??O)Gh<-xJ@Z4|Cywm zlXCS_IVa`*ik#1r^IuwKznq-U`!0Z-&nrpb^;TSx#5NKzO8I_CIbQ%dLmmC6Qd=fi zwKV&Uau#h4wHa^A>ZLqR*{8M}r}Uplo{J<)%Hz8zm0Z`A_)=k%vqwMcg5Y>bpcP!N><+6*pNflV$vuNFJ8udq*Z81*`8r z;=h#ICTLH4CzY0By`27w#CedOpTKElE40ou8ao;Nt4pA?_C*J1^JS9 zmGp#TK6%%rPUuU{r8!xqgh5MuH&wnUTd9}%CS-MxfA_{8Clm6T`!36JE0xtt-c8{? z>F+0#X{F-p~sL3C9C7VFsm@aNbg53NYbWy><_5F zQ?6$GP<~pi&|}$8nz%Nnn_DSSN*SiOWLRu(!V8h zY1;=j=`#i@EtgCSJCkKe7oJU?VPXva*0 zL-ZROmbs;AZejtvc+`o}y^3+kzB^YcRH6nGoG zf|mYt>=ZI@ySXI~`U@9l7OpK&XEvmuI7oqCQRD>-I(N~!2$~s>b85L;P`Q@8)awgg z$BCO9e)I?`4?Z(FKe4b3_4h7u?gqDDh&$lWo2eI_b%nVgt+3)(qn9JwAx z^iVO0Cb_Yz;WfA#1lX6jIt29yaCqU+N^wmHupe_R2(bTiZ3wU>bL|MQeQ_NKb|Kh} zpc6qCf^GzR5bQ;;55ax}`0*mTcG2s`9YBEYbnXy>w*Uw^k|TV2e(CbWJar2j=dL@^ zXG#;=xLa4SN;yS;GWw4bBygq2pDl(MjPvq@ck+sJX3;q@KMzh&bU%ZG*nLwdQn6+qp7~(_`~dRfgKi96O)qyl?&F;IJvawb>E~OcEKoXI!Qk;w{!!W5%{2? 
zWm7--GUuL}xrt3q(4$}4J2UV0fNvgtM1cu9in|wGg7)h2k_Vz(apcO3V4a%bJYJOU z(hZ0rLBkg;^Al+lc2+Q;?&4L~Q-1WtbC$2%^30&d#Ew8i0oSFBx;Y8c9D`CPTmpMc z(438}esB&Txa*k@)(R96%(Y1=G#*!8IY3spw8X65ip9aC6zUwcm* zF7Mc!=F1O63JyN1;|qp9J2Oeqp#>1ah@QueJmBV3m*>lJ4aQA86aXPHFKi3unyLfG7xOUHD z?VhO4{8?7^o%3tw16PA}k*vy}ZT+HOUCUhDcWR%3#z|5jxCUnl`4w-ADP> zqrBzl%Fy$?_RS$a@4(7PG_O2dwVTh|{ZJF`8RNUh_`I=|k>|Sn?_Ld72X64XvQR6p z+YvPsyl)Nm2kpF}A~eDqcD_)t`i|!YOCa~VR|EF7wilThdPhvfWLjUE7)#Ebn`<|N znouEM(74gFaW#_N{xo|RpS|lRr+zyAqwz@gft4fAY!h^i~nfi)7(eX{GJg$ zYb0v6zSOGqnw6fIiE&h*EDCo+Blx*7_nFCj=kVI$z<6NjgKMF)zwP@;?@y2Z=;#yE zp*8k1@g9V|0$=A%HBoEHvmAR+^Ii)u%QF16jxlDfReo>h-M#C3S1Tii(ip_K4Imk@ zk^y@_AME(S<&S1RoPDBeU+H;fFx{zNs}CFwm$Za!tv|W^)7c-*J~8yH^udc|TQPtZ zm=BHf`OOh?%gW)HhOs%q#cg~}+sd&Q8b(*}EWhCH^!jwvR2+o*v}_K4s(Pu)up3s5 zFAR*%_S|F-HhjM);vI*0(;?88zA$EH^tn+(QE-|!)Sy~xF&HWgPb$;p)zZ+DPGBAh4GP###^tbq_@>ohw5z9g|~Q zIU3C_SUD2a<^|cu+QR2KRiSA0~}ZiKAsUklgm;q&+Krae(pS*U_H zReh1EvKdyju^gtPexr^rZr>c?i+dl9^2LK;U17AScEipW?Fj1%KFhP;y|8{E#D=SO zJ=F76J(0ZLRRcB}5Pi6;ZF3J_w*L{!m-al`6R{m%)qz0Gw!q%t&8J!Qt86qcKUlba zDRh?4Yg#o#b4$Y&9ei%bs{XT*visxr#zCOrvUc9ozIpUfBfo2yHw}XYC^h`0!L(NS zol|$tt(}ADWG168iW!)!viGlss)ILp6LjmnylHRLTo^8H=FQC@)~uH2=G?&P_uB*G zYfIts9lUu5h&HPoL~C&bkMh~IAF~^~Hnczwcl7b*J`i_SKZ`1|LXb79Ea?6I=<9fY zcQmNplJ|!C;|XQnFD(ox!@h2hnhJwQcvE%MTKIlN@N}?_w^oMEZ+L$^x^aWA>Ef+j z(VRSJ-Z_R<4fGXL&Yk^h`=1(1d1L7l2;62{?%Y_rap&u6Uw@Ka7R`06v(NL2?w(&i z{{eUZ_PyJYyv7HoesBDT$|e08(>@3A(|Z6!h1_j{mc*lIV5HgElr^-({V@U9`gVd&EyzTr3@ zZ&6@`kE|UD9uA!k7q$I_`|0f;-F{-~U1gu^^s8sSck1rh^|QgA;MMmo@>vzUu40w_ zxiSBlr6f2JvPUcpt37B=fI-wGSy@v>X!yt0&Hhb0Uyc3ZA!w%m(ZrwTeR};*N}vU1 z8CF^7VkSqh=drQmnI-40Ze16(76)&AY~ARG&H;_q3XPVx9(>fwTTVtT)@PQyz|?zL zUue}<^J>ov&|q;C)}9o-KN?a8$9ZebhGTP%Z|LEzy*$*t2denoQ62gk@7TF|>^bzf z3cjf6anZ=8F??`@-!ZZ}EEye3d1zm}om6%4mab@CMX35?HdM#wHAeFbq0QLMtB0|J zm|>_G1}*E*sLig;2CX0T-XFX-7;4{We^T5MF}JR&#G6;vt^_Q)^R!(ctJ4a~aKUwcXXMYhb+W zSikXJ_G%{dx9pt2fY`*>M|gAPstRaOwM^fdo_bpXyQpK1XVpmj*(HvmETM;-Ns9QJi2FFU@pKHx9vV5=i 
z?&0;r!J$ZQEpM(B*Fw-2gO)N5mYkNXw4eTfBJ{`I~fb388 zI}z&tK|Z7#?<9Txl(#&NZxW>^hoqa>=b%O@s|K>=IYeCMnMdbF+Lh3`JdUqa+FsJ} z6_zHC<13Z6pJac9rM)H7xKf){kpolF_+~cbh>1aq< z#gHgp)`pZzh_70K666-@@1z8oF1O(C(!X(iUM;)GY2t~iKWW!h>RzM)i4>3N9*OQ7Ll&hc0IjPxKBdm23Q3-KhLyzSAJ*F(nDS-abl_*hT_bQa)_m zN~I{*T6zj?_gw;k0P|Ss1+z`Qua{tN==h){m21s=;km z!a%fsJhe^Go^~#kmNIId_n#%_MZL&$*(mBv4*l|ZShM_&i2Kh%%-bkwQu_5>ka1Oz zFS($kCmOB4;7YD2m2**wx1S{!L>n!wHcpAB%U7vjj4tgWZ_8_*vSLK3tUhufh5IG{ z0IaQ`BSfNfXzRWKh$LU4sHLK*WAPCJ%)kL*7=j2d8HhMi|K+kkgX1sU8Q! zXnp#ZloDT=H7idO3M3Q!vq>X;j{@3<;oV;O9&03td$h&1#M8h<_P)yBjEV2d)K*Gm zCSRSR9J7B1^MMB5;0q+(dZ3+nV3KSC%aDD)x{Rot_KWfoB|+cADU*WId$6!#2H&HP z93pR}*K3<+Gvd$oD?JPA5VBq6j0Foj?m!4(8E z2xbw?A(%(7fM5y1mf`=u-=P0TSeIo4R{>xM7hV|mYH>*OAi%!MT|;mk!3_kz1wb%~ zo5ZFUU>l=Hu+m61-9Lp)fX3a#thW$go8!KQfFST8!2O7vAHfQOZy@*$1pf@dHxc|M zf`5)+6~P*UZz1?Lg6|-B55e~k{7VG?3c+0jK?L^@ypQ0w5qyB)Un9T>HTQ22+(!^X z@F9ZlBlsNzKL8-)$eTnwU~Uv?bX9@pUC8x!7?h^0w~I`7xVRrfH73E#c-{d&9z>?! zd3|L18TinBYrY`(qsvx~{3DA~i@x|*?rQE>3R1&HBT z%@Eekwys_N9(VWl`t8-r5p(4WJp@i53IRbDcr0iS>VL3%BmZ{}{6WTlGW@>biKTCi zhPITW=&S5!Is?Y+zeG43qB8&3Bj3>S&9X9VNkMq4h`%_&ZuPgkZ@_zlj`iQP7g~k>XS6^ zJ(&v=F6q(%qJsZl^lQFUh&xyI!IdE~Oa972+Sf1 z4@BuR+h->hUx6%~U?&B{h#!>F=e@5mnk6>u5EJrC3C-Hlm_brjH@U{+850sMCb_e(e>(qF z-qO=VBL=AHCw*QV$FgBQFJ80C?~;%*YS&jH%dgZ5GDr#LBSl{PHm4UjzT#yh!&NzOq@OI0owg57aj=c|^Lr_h&4g-eTpd6$+{L-@@IEds49 zT)rA&z*j5F zUzt)EO~6K=TB@lUQv5(Kl}xwv7-X)rLD6kC2+xSFv|hxLzOx*h%<$D&L2#r-X?W8=DP z2TLtxN!z?qgfujV9Cbb&L@c@OBwO4i;9Z)7V~KG8L3|U3wCzN6OrigY`*#R_4}f5q znDjbRyit1UqoJD-sEg@Gm`(IVw&5;2jd)Isj=ny?+9_|rv_)X{+9vw7s@ZaoU?1gt1 z))zuOk^I_~6VG6&@+DYn+#boRSs4-iH9_abm516#dD6B5gYs%`Wv>Z*X%>_PZ-ndj zJRIWd2O{>tusRn%;q~kKU_*N^bJ6RESc&tPqPeEQfFF*M`Mtwox-LCLUaI6y*k zyuJOQUR>V#Bzqio2f$}x?RssnfNsKoO2Ufny7jtX+r}~8(e-fVQ_qu}v6W-doRS}8 ze5CnM6S}$C#Md2&lpo}C4z3(SS>vMHW0Ac2l@X=-ZZ$Oy7t{6E#eKY?AL}}R%Sw|o zV0?v-M>o|Ql7p}!wE^@QJj5Fs zaq`_HQzw26pokh!ojJ*g_=R8#PQ-1S-G8Ru|C*V02<$H$&ZqZikHSf8XqfQ&4VGkm z11T9YitQV<#5OBtNP#XDPJ{@fcu$S@aP*-iBu*xU5-cv}8tcm?IrKM=WNiJsFW(y< 
zv7ns9fip1weodLa0>UbC)L@N#g)mny#Dz9K#}-_~d_}|oqo)HbWQI$-n4A25m{l4n z9f{dfF)1bv%D-d_{-V@MFb^E?fpsc%>3n678(h!nm!t=mhAtV!y0DPD98RlKWAN3Z zhuK~sj;}5~rUNV&@ERp9cuzc}$#rmsaT$o}J=SF;3_ufM0cMd5fM#L?m`#iTEkpy* zN-O|#NDe?7(E-dQW`KDl8(==Megn#fMSxHe)W;6A0%8IP(go-s4uD0(sI;LOzPsXc zX2??vZ;er^v@0bK%26qgTE?f8s1)Xdf=k>hcjZuyjBp+ZUDFawV ziU3w`(@KhDHLfAK@I)=i16W5)E){AI@OomyegRkmNZ26uF>ED3X@r)oBnkOjHf@nd z{+7*K$7u=)sRXU=B~<|Tk!pay zx=s6swO_xC-6@Cn{&QG@$gN?@JjN_wM4nKrjkmx(+!74ZdaaIoWE5*IdRQ1F<+iI)VX}d zQ|`I8yu8GD-LBjDqviCx2v7Oqk~c}GXZX@XJ#JTy!3hrS1WgE< z5wsv^MbL&|2ZDA4I}vms*o9y>f=&co2)Y6M0w>I0>dXo_-1rDQBWPUi>ob$?bXNs+ zGW>gFiqq4EEbbU)R>1)a9F9{$_97n9ungxVaB#eVbEjZ(&%1H?ij#`3y}T`+^4HA=qG$bjlfk65V2Ype z07*!JkK!tvI1R@%q}-Sjvha)$nw6d>q7j>qV7vimWx#Qy zBl@`{dOd-fWJgfIi45^|PV@=700z!4!Dc2NgPoGsNDsz_V?N;QV!$4wx6MuuD0t?h4zw zBI>Ty&ZsdrY%C6|JHw@&5rD?x=UP)(TM*68z4NuTuLaxJz8+t>H53`Yy=H-`DbZrCml8*=HA33lB8o8@6oYDK8`vAK3F z}udEt}TIB|F!-hZ#_7V4}+nOgz@wVPF5Ii7(p!&=V=@eWt5= zZmEn~ies6oEb|Lhrrz*5vr}(?{lNuAu`I;QjJ+V1jhKZgDvntZgMIm>F&kpJOny-; z53zivv>|3ktblP;#R?H~Fr|&LBE*WB!tz)NVx^3&JXVHSIg?i!t3a$0u`0k~)l5@Y zxUn}d5o~^(U;C-*Z(=o=p_a)jj@2Pn&y+OB8W3w_%6G<^@S$d=yy+=yc`j?;tawtk zE7pRp;pbIb}CcUUV`Y2ZQ4% z(v6-cC9RLz|0dRj^lql0GPVb?y^Os+whyuWjJ-T|0I-(_nbPW~B^`W8$L7$Jl7q2B zc>67uDQ%7Qu!#4vOks7b5Aj$(tn{h~wZC5!8=&bymMJI+_4B~?FeMyeLEs_dM$d=l z$HlF&Axb<7+{*5syLT=$@xj}%W0W?WLOV`rBPp~Kly)+OHcDxyQfOn8b~=T2hSJWa z(8ek4Tng~b9M-kvm&TkFf0xoH5Rbu0EO`|hYCgC5^GQmW z2QfHSO`qi#-JM&XgWc-%JdU2Vqft{4?3I6J%6V$4YV?|zO zY4P+6INcz(@b13#eXl;8n%$_$HNfEz(AJBU%Sz84=dP~~*K|EBeni66r+M9(&zWYu z;m?cfVk-Fkd0|B?6MkQ4;1r9P7BL<6B|TyW>`O+(OxTyQ5Hn+6%0|qBeaVU#9JNsu zvmus?eJKyIeC$hh#0s!46(Z)q9#n)_F=8c%m7>hc5G%({Q-N3|cA6@{pr@sFni?e5 z%DYG%QtIWD24vCrYKLe-dNX#27Q|W^dtIyzG4O$x$JzmVxihUpbl~k>*dKNy)``8L z3$R$X)ED+(YHy+^?8D^#lzwmkNzl`hz2Fd1u$v`GJuK3oyUA#Mlm^{SMjN0s=z22R zVM>GEC!-BfT6*WhGSWLA&|X02gHCV)pLqeD4iqve#zHsYcup&7B*wOfD-`~WnNCw-Uw23yP zjQFQlTE#yM!{Jm@FjI>3hhaRN8i`g)l>R0QtZep(K{h&L)H$7t6ANyqQ_zTv;HWz| 
zTW?9w!)bL+dc2-sq^I6>LpTiRs@WN@xR^Z;nd{+PFb^CIw#^xi26y`XoI8@O{KoEX#h8%dlDhi>dk_Ow(U5 z^*`6=2F^wFr7IdZRc6(FXKrmS?AQgTK0S5p;vKspj?Rd=YsCn|e1`6~9N%!f$Y9yx zFH}s1DTWw?KqR`Ft%=hCgVTRRdMv}n=0YxrBwSp(2+apl1`J@$ug`yOgjD<)q&G{? zfK@N4zklN1iD+>h{=B%vPODi>tdxQCh%T;QjFr)3IUbf}x@U@2&}1bhb@z0!Dw+g+ z+wUG(KN72s*(fEOXtEh=cxC-ctc509aV4s0-4tu1$sHKl zB5NepPLn&OWCu;|ioacKH%)d*$u64g#)D{z*NbC&XmYQV+((o9B~}M$@*viEczrl_ zh$g}Q#H-aK$^(o~oX{r{!1BZi10n&;4iG@rFyK=jU~}R*hC~7woH*f_NC0aSCma_E zU~1w#PKX4sF>!T{i3Bh-uxK2PAOT}TO95*W&oM3%U>uJV&Wi-FL2<&{A^~(R5ZeuJ29l+sONI;n3lom9A( zPHJ3CCsi({lR6jENu`VFq}IiBQte_osdq7*RJ@p$YhI+tRWDNHx)&*O<%<-#_C<352KoC1{9!=&;NjpszNDZKnCLL10Dx%3^DOp03rBeGYqsdCCWmeH-b-dEC w8k($??$*&{y_9UA$wsLW(>llBkKQ@0Vsi4M*1V|6^4ws4*)*bJ#X;!*0Vddhr2qf` literal 0 HcmV?d00001 diff --git a/attention/ops/__pycache__/vit_attn_wrappers.cpython-312.pyc b/attention/ops/__pycache__/vit_attn_wrappers.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b0a7488726d64ad5fe808276b027d1b6430b5ca4 GIT binary patch literal 8158 zcmd5=U2GKB6~43oKjZb!u8r*&82%D(0&AQ=D1nfW5WrBJB4`>R&@kQ^+hgy}tnbVk z8_hN*KZ;Q!0c}+~>O*-1K@~s3Q{bsceX&JaZ>BU&RYhr}J{Xy(P1J{;b7yCFjctOI zA3EB-bI;v#&pG$pbI*6~{;{SeLg2~xexZJzAmkrNxKBf=GcWrInIJMbMr2O*WVmA- zdwY&~*xQ@+r9B+U`i})TW_f_sD~&aIjTgI=vDrT*Kxoj^-2v`#eZjBi7oAYxptZHI;BRgS7LGle3#p~ z200Ek*8ZQ{m;f8=*zPtewMyOhvb*1pcbmZS`tRMl%k6GNUIF?W9-;nb&>v?z*P=8i z@gH>OmiJ04Sd|#{Bp;LRa)d;V_N#`_t7a4-t!t*FYKEZaU___FYwA%!GEGG@Rb3NI zUFbokY)&%O9^gNvn*A_J)BPzN1rZU+5X^psDuM)GJ*QA$)j=W~({xiX^0}N&O+^-Z zMib>VbuptF=1CY6M&P%sLJc%W+b96z$&_j;+4dy+xi=#j{S{M_ zk;qRCRqI1r473C0X|iyNO1T^k532?)s3)VR2MuQ#psi{hCxx`62}Vu+)$-q*FL767(@7%n3Bh#p6b5w^xQy)~c2yiG`T2-7$ z+cpU8hN1{VrxfjJajQ6-VLn$zq-+kHn>VCB1r~Bthb~BPSF%wU)$>9&ZA1fnHS zorWC}hQU3!D!o_u$>>pBjpNKr1{Tr2>v`en)RxrdWaJBkhVCS1x5z3?HASlQftwX7 zra`El74@9W4cXkF%?;Z@LmA2_nqdd1V&Znq;|xuLR7V ztheCjh~*y$GyVmQeDqT zb4W4G}F#hndZFV!$O(lS*aVPLJ2F4EE1D?lb6YFOZ>o1;|-g4k};^1uJwVA|gw-c|A 
z9V$kfr+v-ypqwl?K(1>syJM1@?3|P?u6bWMU%0gUgMF9xUEBTP{_FccioP=Kdxfrs z=`0A~)CKjNs7YCcb$at!+76>G5mI$#>CHI?vnPlNt@8|&@`Ov{d#UXhz%y~*Q_4=> zE2ZrNcXQoIAKd_a&dK&+%sa;VmO5kx&cs9HxUmJ#!$Mb<4hDT17`MRJSO?7*fe_{E z&uu@u{oJmzyJn-?W}@4!ao0MpNjFy9j&@D^x|o~bl;_>^V@I;mh?}<}J(8L37Y+5a zV%MmKNcD79lmLUNT?+su=5vr<6e$Zkm#&U5Ohzom{|g<6@c<2$fdX!ATCEir~4Nkgx>QxU(MotL&5gR|8i{ z^9x}UEoaYCKxBpGpyj_Ba?j0DP~_?@M63vaMz{b&lB2s$cT|}V5#nHsn`~1%0STpy zu`5-ca1g{8cn1GT{xe3VW8F8qt_{66JX^bZrgrs3ZhGDJ+1lrBc6~LEq5Y)1XlMJo=ZuuCKvz5n#$Zc;pubSpN`VzUh! zrY-=cLV-!!V8p=CgCp)f&QWix7$h!oJ3RzbFijeWC1d1nf`l4B32pv+wDl8y`)B-* zKaO?ZIDCt{)p@h)cC7o%J|}~IT(b@^t@LSJU1XmL@q&1}@o^yBUPEKCHgPW=i3P^? z-fJpDT`?4!u4}(^;GP#i@bf_3nSr-5bAIA)fK1J|0OYZhvLGEiJmiH6;XhaW^Ii`hm?Iv4zzI>zJnnNXvVrl!@JZyN8?@gADBlB=XDLY4AHHRJf`D&3Zu#{@Ikt zJ8(g?ST<-W&?%6@l!g(zTX&3z0&T2a>%fHrUXh*`dv4b&~c?UOk`+EN)+;J-mXF9%)T zf%sx+Wt8PD9#A&a{RM8y4}zr)RhP9MxV0IA+)a0YcDfUqYWSotz`*GZtbP^^IVU{; zF3;oKwa_rQgdlB+eEdT9wKebUo#oqR__oQz)54Zne(Uu&zM?Mz8{G{}lB0X!bvje$7hJSRZDXQ{_9RvZa;~f%J+dGKOFf<{DT~r5b2=daU~PyR~(bIs%9Trsg~a_C~``}vu~ zhVgK*0epC2(|E8LtNkT+e*5IlrsM0U>(VP_NDmZ%QuePT6g^o$R{n4 zvo+)1@gwKgL8YB&oQ-$R#5*>5Y-2-sy|)O zYxms;8AdMl5bX!ntH_R34qzm|03$cfL^obKJjG3QP7U0S?w7IT>}6P=g{?Sy}0^BH9q#X11%I-+VLSVt*n0-^l7|vidW!b1qQBwG@N=yVeu5r66%PM!41@|5TAr7Gs;g z4Ei|FT+GWIs7PjnE140lWJb7>ktj<>VvZN8RtWu1ORM!m9}4tP4O@E(J4I~g{K8Yi>my1?A1&Oj&B&yUhvZ9xf z6~&BnG_$UwnspuBtXolzNLCUcN zdyXO4bL_xg#SEm@HdeVE`el{SuTqIrMI}-Z*OgJEHV3Oci9}ft5>=8ISrNs^iYz4I pGKfJY6!}%f*oKm40^CcLAVZ=oABigYjI79Kq$~ek6iIdx{|Ed_wLAa- literal 0 HcmV?d00001 diff --git a/attention/ops/chunked_prefill_paged_decode.py b/attention/ops/chunked_prefill_paged_decode.py new file mode 100644 index 0000000..aa791fe --- /dev/null +++ b/attention/ops/chunked_prefill_paged_decode.py @@ -0,0 +1,401 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Authors: +# - Burkhard Ringlein +# - Jan van Lunteren +# - Chih-Chieh Yang +# - Thomas Parnell + +import torch + +from vllm 
import _custom_ops as ops +from vllm.platforms import current_platform +from vllm.triton_utils import tl, triton + +from .prefix_prefill import context_attention_fwd + +float8_info = torch.finfo(current_platform.fp8_dtype()) + + +@triton.jit +def cdiv_fn(x, y): + return (x + y - 1) // y + + +@triton.jit +def kernel_paged_attention_2d( + output_ptr, # [num_tokens, num_query_heads, head_size] + query_ptr, # [num_tokens, num_query_heads, head_size] + key_cache_ptr, # [num_blks, num_kv_heads, head_size // x, blk_size, x] + value_cache_ptr, # [num_blks, num_kv_heads, head_size, blk_size] + sink_ptr, # [num_query_heads] + block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes_ptr, # [num_query_heads] + scale, # float32 + k_scale, # float32 + v_scale, # float32 + out_scale_inv, + num_query_heads: tl.constexpr, # int + num_queries_per_kv: tl.constexpr, # int + num_queries_per_kv_padded: tl.constexpr, # int + block_table_stride: tl.int64, # int + query_stride_0: tl.int64, # int + query_stride_1: tl.int64, # int, should be equal to head_size + output_stride_0: tl.int64, # int + output_stride_1: tl.int64, # int, should be equal to head_size + BLOCK_SIZE: tl.constexpr, # int + HEAD_SIZE: tl.constexpr, # int + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + USE_ALIBI_SLOPES: tl.constexpr, # bool + SLIDING_WINDOW: tl.constexpr, # int + x: tl.constexpr, # int + stride_k_cache_0: tl.int64, # int + stride_k_cache_1: tl.int64, # int + stride_k_cache_2: tl.int64, # int + stride_k_cache_3: tl.int64, # int + stride_k_cache_4: tl.int64, # int + stride_v_cache_0: tl.int64, # int + stride_v_cache_1: tl.int64, # int + stride_v_cache_2: tl.int64, # int + stride_v_cache_3: tl.int64, # int + filter_by_query_len: tl.constexpr, # bool + query_start_len_ptr, # [num_seqs+1] + USE_SINKS: tl.constexpr, # bool + USE_FP8: tl.constexpr, + FP8_MIN: tl.constexpr = float8_info.min, + FP8_MAX: tl.constexpr = float8_info.max, +): + seq_idx = 
tl.program_id(0) + kv_head_idx = tl.program_id(1) + + if filter_by_query_len: + cur_batch_in_all_start_index = tl.load(query_start_len_ptr + seq_idx) + cur_batch_in_all_stop_index = tl.load(query_start_len_ptr + seq_idx + 1) + cur_batch_query_len = cur_batch_in_all_stop_index - cur_batch_in_all_start_index + if cur_batch_query_len > 1: + return + else: + cur_batch_in_all_start_index = seq_idx + + query_head_idx = kv_head_idx * num_queries_per_kv + tl.arange( + 0, num_queries_per_kv_padded + ) + + query_offset = ( + cur_batch_in_all_start_index * query_stride_0 + + query_head_idx[:, None] * query_stride_1 + ) + + head_mask = query_head_idx < (kv_head_idx + 1) * num_queries_per_kv + head_mask = head_mask & (query_head_idx < num_query_heads) + + dim_mask = tl.where(tl.arange(0, HEAD_SIZE_PADDED) < HEAD_SIZE, 1, 0).to(tl.int1) + + # Q : (num_queries_per_kv, HEAD_SIZE,) + Q = tl.load( + query_ptr + query_offset + tl.arange(0, HEAD_SIZE_PADDED)[None, :], + mask=dim_mask[None, :] & head_mask[:, None], + other=0.0, + ) + + block_table_offset = seq_idx * block_table_stride + + if not USE_SINKS: + M = tl.full([num_queries_per_kv_padded], float("-inf"), dtype=tl.float32) + else: + M = tl.load( + sink_ptr + query_head_idx, + mask=head_mask, + other=float("-inf"), + ).to(dtype=tl.float32) + + L = tl.full([num_queries_per_kv_padded], 1.0, dtype=tl.float32) + acc = tl.zeros([num_queries_per_kv_padded, HEAD_SIZE_PADDED], dtype=tl.float32) + + # sequence len for this particular sequence + seq_len = tl.load(seq_lens_ptr + seq_idx) + + # alibi slope for this head + if USE_ALIBI_SLOPES: + alibi_slope = tl.load( + alibi_slopes_ptr + query_head_idx, mask=head_mask, other=0.0 + ) + + num_blocks = cdiv_fn(seq_len, BLOCK_SIZE) + + # iterate through tiles + for j in range(0, num_blocks): + physical_block_idx = tl.load(block_tables_ptr + block_table_offset + j) + + offs_n = tl.arange(0, BLOCK_SIZE) + offs_d = tl.arange(0, HEAD_SIZE_PADDED) + + v_offset = ( + physical_block_idx * 
stride_v_cache_0 + + kv_head_idx * stride_v_cache_1 + + offs_d[None, :] * stride_v_cache_2 + + offs_n[:, None] * stride_v_cache_3 + ) + + k_offset = ( + physical_block_idx * stride_k_cache_0 + + kv_head_idx * stride_k_cache_1 + + (offs_d[:, None] // x) * stride_k_cache_2 + + offs_n[None, :] * stride_k_cache_3 + + (offs_d[:, None] % x) * stride_k_cache_4 + ) + + # K : (HEAD_SIZE, BLOCK_SIZE) + K_load = tl.load(key_cache_ptr + k_offset, mask=dim_mask[:, None], other=0.0) + + if K_load.dtype.is_fp8(): + K = (K_load.to(tl.float32) * tl.load(k_scale)).to(Q.dtype) + else: + K = K_load + + # V : (BLOCK_SIZE, HEAD_SIZE) + V_load = tl.load(value_cache_ptr + v_offset, mask=dim_mask[None, :], other=0.0) + + if V_load.dtype.is_fp8(): + V = (V_load.to(tl.float32) * tl.load(v_scale)).to(Q.dtype) + else: + V = V_load + + seq_offset = j * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + boundary = tl.full([BLOCK_SIZE], seq_len, dtype=tl.int32) + seq_mask = seq_offset[None, :] < boundary + + # S : (num_queries_per_kv, BLOCK_SIZE,) + S = tl.where(head_mask[:, None] & seq_mask, 0.0, float("-inf")).to(tl.float32) + S += scale * tl.dot(Q, K) + + context_len = seq_len - 1 + + if SLIDING_WINDOW > 0: + S = tl.where((context_len - seq_offset) < SLIDING_WINDOW, S, -10000) + + if USE_ALIBI_SLOPES: + S += alibi_slope[:, None] * (seq_offset - context_len) + + # compute running maximum + # m_j : (num_queries_per_kv,) + m_j = tl.maximum(M, tl.max(S, axis=1)) + + # P : (num_queries_per_kv, BLOCK_SIZE,) + P = tl.exp(S - m_j[:, None]) + + # l_j : (num_queries_per_kv,) + l_j = tl.sum(P, axis=1) + + # alpha : (num_queries_per_kv, ) + alpha = tl.exp(M - m_j) + + # acc : (num_queries_per_kv, BLOCK_SIZE,) + acc = acc * alpha[:, None] + + # update constants + L = L * alpha + l_j + M = m_j + + # acc : (num_queries_per_kv, BLOCK_SIZE,) + acc += tl.dot(P.to(V.dtype), V) + + # epilogue + acc = acc / L[:, None] + if USE_FP8: + acc = acc * tl.load(out_scale_inv) + acc = tl.clamp(acc, FP8_MIN, FP8_MAX) + + output_offset 
= ( + cur_batch_in_all_start_index * output_stride_0 + + query_head_idx * output_stride_1 + ) + + tl.store( + output_ptr + output_offset[:, None] + tl.arange(0, HEAD_SIZE_PADDED)[None, :], + acc, + mask=dim_mask[None, :] & head_mask[:, None], + ) + + +def chunked_prefill_paged_decode( + query, + key, + value, + output, + kv_cache_dtype, + key_cache, + value_cache, + block_table, + query_start_loc, + seq_lens, + max_seq_len, + max_query_len, + k_scale, + v_scale, + alibi_slopes=None, + sliding_window=None, + sm_scale=None, + output_scale=None, + # Optional tensor for sinks + sinks=None, +): + if sm_scale is None: + sm_scale = 1.0 / (query.shape[1] ** 0.5) + + use_alibi_slopes = alibi_slopes is not None + + if sliding_window is None or sliding_window <= 0: + sliding_window = 0 + + if max_query_len > 1: + context_attention_fwd( + q=query, + k=key, + v=value, + o=output, + kv_cache_dtype=kv_cache_dtype, + k_cache=key_cache, + v_cache=value_cache, + b_loc=block_table, + b_start_loc=query_start_loc, + b_seq_len=seq_lens, + max_seq_len=max_seq_len, + max_input_len=max_query_len, + k_scale=k_scale, + v_scale=v_scale, + alibi_slopes=alibi_slopes, + sliding_window=sliding_window, + sm_scale=sm_scale, + skip_decode=True, + fp8_out_scale=output_scale, + sinks=sinks, + ) + + block_size = value_cache.shape[3] + num_seqs = len(seq_lens) + num_query_heads = query.shape[1] + num_kv_heads = key.shape[1] + num_queries_per_kv = query.shape[1] // key.shape[1] + head_size = query.shape[2] + + # Conversion of FP8 Tensor from uint8 storage to + # appropriate torch.dtype for interpretation by Triton + if "fp8" in kv_cache_dtype: + assert key_cache.dtype in [torch.uint8, current_platform.fp8_dtype()] + assert value_cache.dtype in [torch.uint8, current_platform.fp8_dtype()] + + if kv_cache_dtype in ("fp8", "fp8_e4m3"): + target_dtype = current_platform.fp8_dtype() + elif kv_cache_dtype == "fp8_e5m2": + target_dtype = torch.float8_e5m2 + else: + raise ValueError("Unsupported FP8 dtype:", 
kv_cache_dtype) + + key_cache = key_cache.view(target_dtype) + value_cache = value_cache.view(target_dtype) + + num_queries_per_kv_padded = max(triton.next_power_of_2(num_queries_per_kv), 16) + + from vllm.platforms.rocm import use_rocm_custom_paged_attention + + use_custom = use_rocm_custom_paged_attention( + query.dtype, + head_size, + block_size, + num_queries_per_kv, + max_seq_len, + sliding_window, + kv_cache_dtype, + alibi_slopes, + sinks, + ) + if use_custom: + _PARTITION_SIZE_ROCM = 256 + max_num_partitions = ( + max_seq_len + _PARTITION_SIZE_ROCM - 1 + ) // _PARTITION_SIZE_ROCM + assert _PARTITION_SIZE_ROCM % block_size == 0 + total_num_seq = block_table.shape[0] + tmp_output = torch.empty( + size=(total_num_seq, num_query_heads, max_num_partitions, head_size), + dtype=query.dtype, + device=output.device, + ) + exp_sums = torch.empty( + size=(total_num_seq, num_query_heads, max_num_partitions), + dtype=torch.float32, + device=output.device, + ) + max_logits = torch.empty_like(exp_sums) + + ops.paged_attention_rocm( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + num_kv_heads, + scale=sm_scale, + block_tables=block_table, + seq_lens=seq_lens, + query_start_loc=query_start_loc, + block_size=block_size, + max_seq_len=max_seq_len, + alibi_slopes=alibi_slopes, + kv_cache_dtype=kv_cache_dtype, + k_scale=k_scale, + v_scale=v_scale, + fp8_out_scale=output_scale, + ) + else: + kernel_paged_attention_2d[ + ( + num_seqs, + num_kv_heads, + ) + ]( + output_ptr=output, + query_ptr=query, + key_cache_ptr=key_cache, + value_cache_ptr=value_cache, + sink_ptr=sinks, + block_tables_ptr=block_table, + seq_lens_ptr=seq_lens, + alibi_slopes_ptr=alibi_slopes, + scale=sm_scale, + k_scale=k_scale, + v_scale=v_scale, + out_scale_inv=1.0 / output_scale if output_scale is not None else 1.0, + num_query_heads=num_query_heads, + num_queries_per_kv=num_queries_per_kv, + num_queries_per_kv_padded=num_queries_per_kv_padded, + 
block_table_stride=block_table.stride(0), + query_stride_0=query.stride(0), + query_stride_1=query.stride(1), + output_stride_0=output.stride(0), + output_stride_1=output.stride(1), + BLOCK_SIZE=block_size, + HEAD_SIZE=head_size, + HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + USE_ALIBI_SLOPES=use_alibi_slopes, + SLIDING_WINDOW=sliding_window, + x=key_cache.shape[4], + stride_k_cache_0=key_cache.stride(0), + stride_k_cache_1=key_cache.stride(1), + stride_k_cache_2=key_cache.stride(2), + stride_k_cache_3=key_cache.stride(3), + stride_k_cache_4=key_cache.stride(4), + stride_v_cache_0=value_cache.stride(0), + stride_v_cache_1=value_cache.stride(1), + stride_v_cache_2=value_cache.stride(2), + stride_v_cache_3=value_cache.stride(3), + filter_by_query_len=True, + query_start_len_ptr=query_start_loc, + USE_SINKS=sinks is not None, + USE_FP8=output_scale is not None, + ) diff --git a/attention/ops/common.py b/attention/ops/common.py new file mode 100644 index 0000000..2cbb5c9 --- /dev/null +++ b/attention/ops/common.py @@ -0,0 +1,414 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import torch + +from vllm.distributed.parallel_state import GroupCoordinator +from vllm.triton_utils import tl, triton + + +@triton.jit +def _correct_attn_cp_out_kernel( + outputs_ptr, + new_output_ptr, + lses_ptr, + vlse_ptr, + outputs_stride_B, + outputs_stride_H, + outputs_stride_D, + lses_stride_N, + lses_stride_B, + lses_stride_H, + lse_idx, + HEAD_DIM: tl.constexpr, + N_ROUNDED: tl.constexpr, +): + """ + Apply the all-gathered lses to correct each local rank's attention + output. we still need perform a cross-rank reduction to obtain the + final attention output. 
+ + Args: + outputs_ptr (triton.PointerType): + Pointer to input tensor of shape [ B, H, D ] + lses_ptr (triton.PointerType): + Pointer to input tensor of shape [ N, B, H ] + new_output_ptr (triton.PointerType): + Pointer to output tensor of shape [ B, H, D ] + vlse_ptr (triton.PointerType): + Pointer to output tensor of shape [ B, H ] + """ + batch_idx = tl.program_id(axis=0).to(tl.int64) + head_idx = tl.program_id(axis=1).to(tl.int64) + d_offsets = tl.arange(0, HEAD_DIM) + num_n_offsets = tl.arange(0, N_ROUNDED) + + # shape = [N] + lse_offsets = ( + num_n_offsets * lses_stride_N + + batch_idx * lses_stride_B + + head_idx * lses_stride_H + ) + + # calc final lse + lse = tl.load(lses_ptr + lse_offsets) + lse = tl.where((lse != lse) | (lse == float("inf")), -float("inf"), lse) + lse_max = tl.max(lse, axis=0) + lse_max = tl.where(lse_max == -float("inf"), 0, lse_max) + lse -= lse_max + lse_exp = tl.exp(lse) + lse_acc = tl.sum(lse_exp, axis=0) + lse = tl.log(lse_acc) + lse += lse_max + + lse_offsets = batch_idx * lses_stride_B + head_idx * lses_stride_H + tl.store(vlse_ptr + lse_offsets, lse) + + # shape = [D] + output_offsets = ( + batch_idx * outputs_stride_B + + head_idx * outputs_stride_H + + d_offsets * outputs_stride_D + ) + + # correct output + lse_offset = ( + lse_idx * lses_stride_N + batch_idx * lses_stride_B + head_idx * lses_stride_H + ) + lse_tmp = tl.load(lses_ptr + lse_offset) + lse_finally = lse_tmp - lse + lse_finally = tl.where( + (lse_finally != lse_finally) | (lse_finally == float("inf")), + -float("inf"), + lse_finally, + ) + factor = tl.exp(lse_finally) + output = tl.load(outputs_ptr + output_offsets) + output = output * factor + + tl.store(new_output_ptr + output_offsets, output) + + +class CPTritonContext: + """The CPTritonContext is used to avoid recompilation of the Triton JIT.""" + + def __init__(self): + self.inner_kernel = None + + def call_kernel(self, kernel, grid, *regular_args, **const_args): + if self.inner_kernel is None: + 
self.inner_kernel = kernel[grid](*regular_args, **const_args) + else: + self.inner_kernel[grid](*regular_args) + + +def correct_attn_out( + out: torch.Tensor, lses: torch.Tensor, cp_rank: int, ctx: CPTritonContext +) -> tuple[torch.Tensor, torch.Tensor]: + """Correct the attention output using the all-gathered lses. + + Args: + out: Tensor of shape [ B, H, D ] + lses: Tensor of shape [ N, B, H ] + cp_rank: Current rank in the context-parallel group + ctx: Triton context to avoid recompilation + + Returns: + Tuple of (out, lse) with corrected attention and final log-sum-exp. + """ + if ctx is None: + ctx = CPTritonContext() + + # --- Normalize to 3D views --- + if out.ndim == 4 and out.shape[1] == 1: + out = out.squeeze(1) + assert out.ndim == 3, f"expected out [B,H,D] or [B,1,H,D], got {tuple(out.shape)}" + + if lses.ndim == 4 and lses.shape[-1] == 1: + lses = lses.squeeze(-1) + if lses.ndim == 4 and lses.shape[1] == 1: + lses = lses.squeeze(1) + assert lses.ndim == 3, ( + f"expected lses [N,B,H] (optionally with a 1-sized extra dim), " + f"got {tuple(lses.shape)}" + ) + + B, H, D = out.shape + N = lses.shape[0] + + # Strides after we normalized shapes to 3-D views. The kernel computes + # offsets for `vlse_ptr` using lses_stride_B/H, so the output buffer must + # have the same B/H stride layout as a slice of `lses`. + o_sB, o_sH, o_sD = out.stride() + l_sN, l_sB, l_sH = lses.stride() + + # Allocate LSE with the same B/H strides as `lses` so writes land correctly + # even when `lses` is a non-contiguous view (e.g., 4-D to 3-D squeeze). 
+ lse = torch.empty_strided( + (B, H), (l_sB, l_sH), device=lses.device, dtype=lses.dtype + ) + + # Kernel launch config + grid = (B, H, 1) + + regular_args = ( + out, + out, + lses, + lse, + o_sB, + o_sH, + o_sD, + l_sN, + l_sB, + l_sH, + cp_rank, + ) + const_args = {"HEAD_DIM": D, "N_ROUNDED": N} + + ctx.call_kernel(_correct_attn_cp_out_kernel, grid, *regular_args, **const_args) + return out, lse + + +def cp_lse_ag_out_rs( + cp_attn_out: torch.Tensor, + cp_attn_lse: torch.Tensor, + cp_group: GroupCoordinator, + ctx: CPTritonContext = None, + return_lse=False, +): + """ + cp_attn_out: [ B, H, D ] + cp_attn_lse: [ B, H ] + """ + if cp_group.world_size == 1: + return cp_attn_out + + if ctx is None: + ctx = CPTritonContext() + + lses = torch.empty( + (cp_group.world_size,) + cp_attn_lse.shape, + dtype=cp_attn_lse.dtype, + device=cp_attn_lse.device, + ) + + cp_attn_lse = cp_attn_lse.contiguous() + lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses) + out, lse = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx) + out = cp_group.reduce_scatter(out, dim=1) + + if return_lse: + cp_num_heads = lse.shape[1] // cp_group.world_size + cp_rank = cp_group.rank_in_group + lse = lse[:, cp_num_heads * cp_rank : cp_num_heads * (cp_rank + 1)] + return out, lse + return out + + +@triton.jit +def _pack_seq_kernel( + x_ptr, # [N, D] + out_ptr, # [B, Lmax, D] + lengths_ptr, # *i32, [B] + N: tl.constexpr, + D: tl.constexpr, + Lmax: tl.constexpr, + PAD_VALUE: tl.constexpr, + BLOCK_T: tl.constexpr, # timesteps per program + BLOCK_D: tl.constexpr, # features per program +): + pid_b = tl.program_id(0) # batch id + pid_t = tl.program_id(1) # block over time dimension + pid_d = tl.program_id(2) # block over feature dimension + off_t = pid_t * BLOCK_T + tl.arange(0, BLOCK_T) # [BLOCK_T] + off_d = pid_d * BLOCK_D + tl.arange(0, BLOCK_D) # [BLOCK_D] + + # Compute start index and sequence length from cumulative lengths + in_start = 0 + for i in range(pid_b): + in_start += 
tl.load(lengths_ptr + i) + seq_len = tl.load(lengths_ptr + pid_b) + + # valid time positions for this block + t_mask = off_t < Lmax + + # compute input row indices for valid (b, t) + in_row = in_start + off_t + valid_row = (off_t < seq_len) & t_mask + + # Pointers + # x_ptr: row-major [N, D] + x_row_ptr = x_ptr + in_row[:, None] * D + off_d[None, :] + + # out_ptr: row-major [B, Lmax, D] + out_row_ptr = out_ptr + (pid_b * Lmax + off_t)[:, None] * D + off_d[None, :] + + # Initialize with PAD (cast will occur as needed based on out_ptr dtype) + d_mask = off_d[None, :] < D + pad_vals = tl.full([BLOCK_T, BLOCK_D], PAD_VALUE, tl.float32) + tl.store(out_row_ptr, pad_vals, mask=t_mask[:, None] & d_mask) + + # Load & write only where within seq_len + x_vals = tl.load(x_row_ptr, mask=valid_row[:, None] & d_mask) + tl.store(out_row_ptr, x_vals, mask=valid_row[:, None] & d_mask) + + +def pack_seq_triton( + x: torch.Tensor, + lengths: torch.Tensor, + pad_value: float = -float("inf"), + block_t: int = 64, + block_d: int = 64, +) -> torch.Tensor: + """ + Pack sequences of different lengths into a batched tensor. + + Args: + x: [N, ...] - input tensor where N is total number of tokens + lengths: [B] - sequence lengths for each batch + pad_value: value to use for padding + block_t: block size for time dimension + block_d: block size for feature dimension + + Returns: + packed: [B, Lmax, ...] 
- packed tensor + """ + + # Handle multi-dimensional input by reshaping to (N, -1) + original_shape = x.shape + if len(original_shape) > 2: + N = original_shape[0] + x_reshaped = x.reshape(N, -1) + D = x_reshaped.shape[1] + else: + N, D = x.shape + x_reshaped = x + + B = lengths.numel() + Lmax = int(lengths.max().item()) + + # Starts are computed inside the kernel from lengths + + out = torch.empty((B, Lmax, D), device=x.device, dtype=x.dtype) + + grid = (B, triton.cdiv(Lmax, block_t), triton.cdiv(D, block_d)) + _pack_seq_kernel[grid]( + x_reshaped, + out, + lengths.int(), + N, + D, + Lmax, + PAD_VALUE=float(pad_value), + BLOCK_T=block_t, + BLOCK_D=block_d, + num_warps=4, + num_stages=2, + ) + + # Reshape output back to original dimensions (except first dimension) + if len(original_shape) > 2: + output_shape = (B, Lmax) + original_shape[1:] + out = out.reshape(output_shape) + + return out + + +@triton.jit +def _unpack_seq_triton_kernel( + packed_ptr, # [B, Lmax, D] + out_ptr, # [N, D] + lengths_ptr, # *i32, [B] + B: tl.constexpr, + Lmax: tl.constexpr, + D: tl.constexpr, + BLOCK_T: tl.constexpr, # timesteps per program + BLOCK_D: tl.constexpr, # features per program +): + pid_b = tl.program_id(0) # batch id + pid_t = tl.program_id(1) # block over time dimension + pid_d = tl.program_id(2) # block over feature dimension + off_t = pid_t * BLOCK_T + tl.arange(0, BLOCK_T) # [BLOCK_T] + off_d = pid_d * BLOCK_D + tl.arange(0, BLOCK_D) # [BLOCK_D] + + # bounds: compute start from cumulative lengths + in_start = 0 + for i in range(pid_b): + in_start += tl.load(lengths_ptr + i) + seq_len = tl.load(lengths_ptr + pid_b) + + # valid time positions for this block + t_mask = off_t < Lmax + valid_row = (off_t < seq_len) & t_mask + + # compute output row indices for valid (b, t) + out_row = in_start + off_t + + # Pointers + # packed_ptr: row-major [B, Lmax, D] + packed_row_ptr = packed_ptr + (pid_b * Lmax + off_t)[:, None] * D + off_d[None, :] + + # out_ptr: row-major [N, D] + 
out_row_ptr = out_ptr + out_row[:, None] * D + off_d[None, :] + + # Load from packed tensor and store to output + d_mask = off_d[None, :] < D + packed_vals = tl.load(packed_row_ptr, mask=valid_row[:, None] & d_mask) + tl.store(out_row_ptr, packed_vals, mask=valid_row[:, None] & d_mask) + + +def unpack_seq_triton( + packed_tensor: torch.Tensor, + lengths: torch.Tensor, + block_t: int = 64, + block_d: int = 64, +) -> torch.Tensor: + """ + Unpack a packed decode query tensor back to the original format. + Efficient Triton implementation. + + Args: + packed_tensor: [B, Lmax, ...] - packed tensor from pack_seq_triton + lengths: [B] - sequence lengths for each batch + block_t: block size for time dimension + block_d: block size for feature dimension + + Returns: + unpacked_tensor: [N, ...] where N = sum(lengths) + """ + + # Handle multi-dimensional input by reshaping to (B, Lmax, -1) + original_shape = packed_tensor.shape + if len(original_shape) > 3: + B, Lmax = original_shape[:2] + packed_reshaped = packed_tensor.reshape(B, Lmax, -1) + D = packed_reshaped.shape[2] + else: + B, Lmax, D = packed_tensor.shape + packed_reshaped = packed_tensor + + # Calculate total number of elements + N = int(lengths.sum().item()) + + out = torch.empty((N, D), device=packed_tensor.device, dtype=packed_tensor.dtype) + + grid = (B, triton.cdiv(Lmax, block_t), triton.cdiv(D, block_d)) + _unpack_seq_triton_kernel[grid]( + packed_reshaped, + out, + lengths.int(), + B, + Lmax, + D, + BLOCK_T=block_t, + BLOCK_D=block_d, + num_warps=4, + num_stages=2, + ) + + # Reshape output back to original dimensions (except first dimension) + if len(original_shape) > 3: + output_shape = (N,) + original_shape[2:] + out = out.reshape(output_shape) + + return out diff --git a/attention/ops/flashmla.py b/attention/ops/flashmla.py new file mode 100644 index 0000000..9cd0916 --- /dev/null +++ b/attention/ops/flashmla.py @@ -0,0 +1,252 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
def _is_flashmla_available() -> tuple[bool, str | None]:
    """
    Report whether both compiled FlashMLA extension modules were importable.

    Return: is_available_flag, unavailable_reason (optional).
    """
    if _flashmla_C_AVAILABLE and _flashmla_extension_C_AVAILABLE:
        return True, None

    # The base extension is checked first, so its message takes precedence.
    if not _flashmla_C_AVAILABLE:
        reason = (
            "vllm._flashmla_C is not available, likely was not "
            "compiled due to insufficient nvcc version or a supported arch "
            "was not in the list of target arches to compile for."
        )
    else:
        reason = (
            "vllm._flashmla_extension_C is not available, likely "
            "was not compiled due to a build error."
        )
    return False, reason


def is_flashmla_dense_supported() -> tuple[bool, str | None]:
    """
    Check whether the dense FlashMLA kernels can be used on this device.

    Return: is_supported_flag, unsupported_reason (optional).
    """
    available, reason = _is_flashmla_available()
    if not available:
        return False, reason

    # The device capability is only queried once the extensions are present.
    major = current_platform.get_device_capability()[0]
    if major == 9:
        return True, None
    return False, "FlashMLA Dense is only supported on Hopper devices."


def is_flashmla_sparse_supported() -> tuple[bool, str | None]:
    """
    Check whether the sparse FlashMLA kernels can be used on this device.

    Return: is_supported_flag, unsupported_reason (optional).
    """
    available, reason = _is_flashmla_available()
    if not available:
        return False, reason

    major = current_platform.get_device_capability()[0]
    if major in (9, 10):
        return True, None
    return (
        False,
        "FlashMLA Sparse is only supported on Hopper and Blackwell devices.",
    )
+ """ + if is_fp8_kvcache and topk is None: + return torch.ops._flashmla_extension_C.get_mla_decoding_metadata_dense_fp8( + cache_seqlens, + num_q_tokens_per_head_k, + num_heads_k, + ) + return torch.ops._flashmla_C.get_mla_decoding_metadata( + cache_seqlens, + num_q_tokens_per_head_k, + num_heads_k, + num_heads_q, + is_fp8_kvcache, + topk, + ) + + +def flash_mla_with_kvcache( + q: torch.Tensor, + k_cache: torch.Tensor, + block_table: torch.Tensor, + cache_seqlens: torch.Tensor, + head_dim_v: int, + tile_scheduler_metadata: torch.Tensor, + num_splits: torch.Tensor, + softmax_scale: float | None = None, + causal: bool = False, + descale_q: torch.Tensor | None = None, + descale_k: torch.Tensor | None = None, + is_fp8_kvcache: bool = False, + indices: torch.Tensor | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Arguments: + - q: (batch_size, seq_len_q, num_heads_q, head_dim). + - k_cache: (num_blocks, page_block_size, num_heads_k, head_dim). + - block_table: (batch_size, max_num_blocks_per_seq), torch.int32. + - cache_seqlens: (batch_size), torch.int32. + - head_dim_v: Head dimension of v. + - tile_scheduler_metadata: + (num_sm_parts, TileSchedulerMetaDataSize), torch.int32, + returned by get_mla_metadata. + - num_splits: + (batch_size + 1), torch.int32, returned by get_mla_metadata. + - softmax_scale: float. + The scale of QK^T before applying softmax. + Default to 1 / sqrt(head_dim). + - causal: bool. Whether to apply causal attention mask. + - descale_q: (batch_size), + torch.float32. Descaling factors for Q, used for fp8 quantization. + - descale_k: (batch_size), + torch.float32. Descaling factors for K, used for fp8 quantization. + - is_fp8_kvcache: bool. + Whether the k_cache and v_cache are in fp8 format. + For the format of FP8 KV cache, please refer to README.md + - indices: (batch_size, seq_len_q, topk), torch.int32. + If not None, sparse attention will be enabled, + and only tokens in the `indices` array will be attended to. 
+ Invalid indices should be set to -1 or numbers >= total_seq_len_kv. + For details about how to set up `indices`, please refer to README.md. + + Returns: + - out: (batch_size, seq_len_q, num_heads_q, head_dim_v). + - softmax_lse: (batch_size, num_heads_q, seq_len_q), torch.float32. + """ + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + if indices is not None: + # NOTE (zyongye): sparse attention is also causal + # since it only attend to the tokens before + # but here `causal` should not be specified + assert not causal, "causal must be `false` if sparse attention is enabled." + assert (descale_q is None) == (descale_k is None), ( + "descale_q and descale_k should be both None or both not None" + ) + + if indices is None and q.element_size() == 1: + out, softmax_lse = torch.ops._flashmla_extension_C.fwd_kvcache_mla_fp8( + q, + k_cache, + head_dim_v, + cache_seqlens, + block_table, + softmax_scale, + causal, + tile_scheduler_metadata, + num_splits, + descale_q, + descale_k, + ) + else: + out, softmax_lse = torch.ops._flashmla_C.fwd_kvcache_mla( + q, + k_cache, + head_dim_v, + cache_seqlens, + block_table, + softmax_scale, + causal, + tile_scheduler_metadata, + num_splits, + is_fp8_kvcache, + indices, + ) + return out, softmax_lse + + +def flash_mla_sparse_prefill( + q: torch.Tensor, + kv: torch.Tensor, + indices: torch.Tensor, + sm_scale: float, + d_v: int = 512, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Sparse attention prefill kernel + + Args: + - q: [s_q, h_q, d_qk], bfloat16 + - kv: [s_kv, h_kv, d_qk], bfloat16 + - indices: [s_q, h_kv, topk], int32. + Invalid indices should be set to -1 or numbers >= s_kv + - sm_scale: float + - d_v: The dimension of value vectors. 
def merge_attn_states(
    output: torch.Tensor,
    prefix_output: torch.Tensor,
    prefix_lse: torch.Tensor,
    suffix_output: torch.Tensor,
    suffix_lse: torch.Tensor,
    output_lse: torch.Tensor | None = None,
) -> None:
    """
    Dispatch to the CUDA or the Triton `merge_attn_states` implementation.

    The custom CUDA kernel is chosen only when running on CUDA, `output` has a
    supported dtype, and its head size is a multiple of the kernel pack size;
    otherwise the Triton kernel is used. All arguments are forwarded unchanged.
    """
    use_cuda_kernel = False
    # NOTE(DefTruth): Currently, custom merge_attn_states CUDA kernel
    # is not support for FP8 dtype, fallback to use Triton kernel.
    if current_platform.is_cuda() and output.dtype in (
        torch.float32,
        torch.half,
        torch.bfloat16,
    ):
        # NOTE(DefTruth): Currently, custom merge_attn_states CUDA
        # kernel load/store 128b(16 bytes) per memory issue within
        # thread. Namely, the headsize(headdim) must be multiple of
        # pack_size (float32 -> 4, half/bfloat16 -> 8).
        pack_size = 4 if output.dtype == torch.float32 else 8
        headdim = output.shape[2]  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
        use_cuda_kernel = headdim % pack_size == 0

    # The implementations are imported lazily so that only the selected
    # backend has to be importable.
    if use_cuda_kernel:
        from vllm._custom_ops import merge_attn_states as impl
    else:
        from vllm.attention.ops.triton_merge_attn_states import (
            merge_attn_states as impl,
        )

    return impl(
        output, prefix_output, prefix_lse, suffix_output, suffix_lse, output_lse
    )
+ # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph + # captured. + block_tables: torch.Tensor | None + + +class PagedAttention: + @staticmethod + def get_supported_head_sizes() -> list[int]: + return [32, 64, 80, 96, 112, 120, 128, 192, 256] + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + cache_dtype_str: str = "auto", + ) -> tuple[int, ...]: + return (2, num_blocks, block_size * num_kv_heads * head_size) + + @staticmethod + def split_kv_cache( + kv_cache: torch.Tensor, + num_kv_heads: int, + head_size: int, + ) -> tuple[torch.Tensor, torch.Tensor]: + x = 16 // kv_cache.element_size() + num_blocks = kv_cache.shape[1] + + key_cache = kv_cache[0] + key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x, -1, x) + value_cache = kv_cache[1] + value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1) + return key_cache, value_cache + + @staticmethod + def write_to_paged_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, + kv_cache_dtype: str, + k_scale: torch.Tensor, + v_scale: torch.Tensor, + ) -> None: + ops.reshape_and_cache( + key, + value, + key_cache, + value_cache, + slot_mapping.flatten(), + kv_cache_dtype, + k_scale, + v_scale, + ) + + @staticmethod + def forward_decode( + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_tables: torch.Tensor, + seq_lens: torch.Tensor, + max_seq_len: int, + kv_cache_dtype: str, + num_kv_heads: int, + scale: float, + alibi_slopes: torch.Tensor | None, + k_scale: torch.Tensor, + v_scale: torch.Tensor, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, + ) -> torch.Tensor: + if blocksparse_vert_stride is not None and blocksparse_vert_stride > 1: + # use blocksparse paged 
attention + block_size = value_cache.size(-1) + assert ( + blocksparse_block_size > 0 and blocksparse_block_size % block_size == 0 + ), ( + f"{blocksparse_block_size=} needs to be a multiple of" + f"{block_size=} used in block_tables." + ) + + output = torch.empty_like(query) + block_size = value_cache.shape[3] + num_seqs, num_heads, head_size = query.shape + max_num_partitions = (max_seq_len + _PARTITION_SIZE - 1) // _PARTITION_SIZE + # NOTE(woosuk): We use a simple heuristic to decide whether to use + # PagedAttention V1 or V2. If the number of partitions is 1, we use + # V1 to avoid the overhead of reduction. Also, if the number of + # sequences or heads is large, we use V1 since there is enough work + # to parallelize. + # TODO(woosuk): Tune this heuristic. + # For context len > 8192, use V2 kernel to avoid shared memory shortage. + use_v1 = max_seq_len <= 8192 and ( + max_num_partitions == 1 or num_seqs * num_heads > 512 + ) + + if use_v1: + # Run PagedAttention V1. + ops.paged_attention_v1( + output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + seq_lens, + block_size, + max_seq_len, + alibi_slopes, + kv_cache_dtype, + k_scale, + v_scale, + tp_rank, + blocksparse_local_blocks, + blocksparse_vert_stride, + blocksparse_block_size, + blocksparse_head_sliding_step, + ) + else: + # Run PagedAttention V2. 
+ assert _PARTITION_SIZE % block_size == 0 + tmp_output = torch.empty( + size=(num_seqs, num_heads, max_num_partitions, head_size), + dtype=output.dtype, + device=output.device, + ) + exp_sums = torch.empty( + size=(num_seqs, num_heads, max_num_partitions), + dtype=torch.float32, + device=output.device, + ) + max_logits = torch.empty_like(exp_sums) + ops.paged_attention_v2( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + seq_lens, + block_size, + max_seq_len, + alibi_slopes, + kv_cache_dtype, + k_scale, + v_scale, + tp_rank, + blocksparse_local_blocks, + blocksparse_vert_stride, + blocksparse_block_size, + blocksparse_head_sliding_step, + ) + return output + + @staticmethod + def forward_prefix( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache_dtype: str, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_tables: torch.Tensor, + query_start_loc: torch.Tensor, + seq_lens_tensor: torch.Tensor, + max_query_len: int, + alibi_slopes: torch.Tensor | None, + sliding_window: int | None, + k_scale: torch.Tensor, + v_scale: torch.Tensor, + ) -> torch.Tensor: + output = torch.empty_like(query) + max_seq_len = None + context_attention_fwd( + query, + key, + value, + output, + kv_cache_dtype, + key_cache, + value_cache, + block_tables, + # query_start_loc is (batch_size + 1,) + query_start_loc, + seq_lens_tensor, + max_seq_len, + max_query_len, + k_scale, + v_scale, + alibi_slopes, + sliding_window, + ) + return output + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: torch.Tensor, + ) -> None: + src_key_cache = src_kv_cache[0] + dst_key_cache = dst_kv_cache[0] + ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) + + src_value_cache = src_kv_cache[1] + dst_value_cache = dst_kv_cache[1] + ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst) + + @staticmethod + def copy_blocks( + 
kv_caches: list[torch.Tensor], + src_to_dists: torch.Tensor, + ) -> None: + key_caches = [kv_cache[0] for kv_cache in kv_caches] + value_caches = [kv_cache[1] for kv_cache in kv_caches] + ops.copy_blocks(key_caches, value_caches, src_to_dists) diff --git a/attention/ops/pallas_kv_cache_update.py b/attention/ops/pallas_kv_cache_update.py new file mode 100644 index 0000000..51214b0 --- /dev/null +++ b/attention/ops/pallas_kv_cache_update.py @@ -0,0 +1,130 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import functools + +import jax +from jax.experimental import pallas as pl +from jax.experimental.pallas import tpu as pltpu + +from vllm.utils.math_utils import cdiv + + +def _kv_cache_update_kernel( + # Prefetch + slices_ref, # [3, padded_num_slices], list of (kv_cache_start, + # new_kv_start, slice_len) + num_slices_ref, # [1] + # Input + new_kv_hbm_ref, # [num_tokens, num_combined_kv_heads, head_dim] + kv_cache_hbm_ref, # [total_num_pages * page_size, num_combined_kv_heads, + # head_dim] + # Output + _, # [total_num_pages * page_size, num_combined_kv_heads, head_dim] + # Scratch + scratch, # [num_slices_per_block, page_size, num_combined_kv_heads, + # head_dim] + sem, +): + async_copies = [] + block_idx = pl.program_id(0) + num_slices_per_block = scratch.shape[0] + + # Copy from new_kv_hbm_ref to scratch + for i in range(num_slices_per_block): + offset_i = i + block_idx * num_slices_per_block + new_kv_start = jax.lax.select( + offset_i < num_slices_ref[0], slices_ref[1, offset_i], 0 + ) + length = jax.lax.select( + offset_i < num_slices_ref[0], slices_ref[2, offset_i], 0 + ) + async_copy = pltpu.make_async_copy( + new_kv_hbm_ref.at[pl.ds(new_kv_start, length), ...], + scratch.at[i, pl.ds(0, length), ...], + sem, + ) + async_copy.start() + async_copies.append(async_copy) + + for async_copy in async_copies: + async_copy.wait() + + # Copy from scratch to kv_cache_hbm_ref + async_copies.clear() + for i 
@functools.partial(
    jax.jit,
    static_argnames=["page_size", "num_slices_per_block"],
)
def kv_cache_update(
    # [total_num_token, num_combined_kv_heads, head_dim]
    new_kv: jax.Array,
    # [3, slices], list of (kv_cache_start, new_kv_start, slice_len)
    slices: jax.Array,
    # [total_num_pages * page_size, num_combined_kv_heads, head_dim]
    kv_cache: jax.Array,
    # [1]
    num_kv_update_slices: jax.Array,
    *,
    page_size: int = 32,
    num_slices_per_block: int = 8,
):
    """Copy slices of `new_kv` into `kv_cache` with a Pallas TPU kernel.

    `slices` and `num_kv_update_slices` are passed as scalar-prefetch
    operands; `kv_cache` is aliased to the kernel's single output, which is
    what this function returns. Each grid step handles
    `num_slices_per_block` slices through a VMEM scratch buffer of
    `page_size` rows per slice.
    """
    _, kv_heads, head_dim = new_kv.shape
    assert kv_cache.shape[1] == kv_heads
    assert kv_cache.shape[2] == head_dim
    assert head_dim % 128 == 0
    # TODO: Add dynamic check to make sure that the all the slice lengths are
    # smaller or equal to page_size

    prefetched = [slices, num_kv_update_slices]
    any_space = pltpu.TPUMemorySpace.ANY

    grid_spec = pltpu.PrefetchScalarGridSpec(
        num_scalar_prefetch=len(prefetched),
        # new_kv and kv_cache are left in place; the kernel copies explicitly.
        in_specs=[
            pl.BlockSpec(memory_space=any_space),
            pl.BlockSpec(memory_space=any_space),
        ],
        out_specs=[pl.BlockSpec(memory_space=any_space)],
        grid=(cdiv(num_kv_update_slices[0], num_slices_per_block),),
        scratch_shapes=[
            pltpu.VMEM(
                (num_slices_per_block, page_size, kv_heads, head_dim),
                new_kv.dtype,
            ),
            pltpu.SemaphoreType.DMA,
        ],
    )

    update = pl.pallas_call(
        _kv_cache_update_kernel,
        grid_spec=grid_spec,
        out_shape=[jax.ShapeDtypeStruct(kv_cache.shape, dtype=kv_cache.dtype)],
        # Operand order is (*prefetched, new_kv, kv_cache), so this index is
        # kv_cache; it is aliased to output 0.
        input_output_aliases={len(prefetched) + 1: 0},
    )

    return update(*prefetched, new_kv, kv_cache)[0]
# @triton.autotune(
#     configs=[
#         triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, \
#                        "num_unroll_cache": 4, \
#                        "num_unroll_request": 1 } | \
#                        ({"kpack": 2, "waves_per_eu": 2} \
#                         if current_platform.is_rocm() else {}), \
#                        num_warps=4, \
#                        num_stages=1)
#     ],
#     key=["BLOCK_SIZE", "MAX_Q_LEN", "MAX_CTX_LEN"]
# )
#
# Prefill attention kernel with a cached prefix.
#
# One program handles one (sequence, query head, BLOCK_M query rows) tile,
# selected by program ids 0, 1 and 2. It runs two online-softmax passes over
# the same accumulator:
#   1. queries against the context tokens read from the paged K_cache/V_cache
#      (located through B_Loc, no causal mask), then
#   2. queries against the new K/V tokens of the same sequence (causal mask).
# The normalized result is written to Out, optionally scaled and clamped when
# USE_FP8 is set.
@triton.jit
def _fwd_kernel(
    Q,
    K,
    V,
    K_cache,
    V_cache,
    sink_ptr,
    B_Loc,
    sm_scale,
    k_scale,
    v_scale,
    out_scale_inv,
    B_Start_Loc,
    B_Seqlen,
    x: tl.constexpr,
    Out,
    stride_b_loc_b,
    stride_b_loc_s,
    stride_qbs,
    stride_qh,
    stride_qd,
    stride_kbs,
    stride_kh,
    stride_kd,
    stride_vbs,
    stride_vh,
    stride_vd,
    stride_obs,
    stride_oh,
    stride_od,
    stride_k_cache_bs,
    stride_k_cache_h,
    stride_k_cache_d,
    stride_k_cache_bl: tl.constexpr,
    stride_k_cache_x,
    stride_v_cache_bs,
    stride_v_cache_h,
    stride_v_cache_d,
    stride_v_cache_bl,
    num_queries_per_kv: tl.constexpr,
    IN_PRECISION: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_DMODEL: tl.constexpr,
    BLOCK_DMODEL_PADDED: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,
    BLOCK_N: tl.constexpr,
    SLIDING_WINDOW: tl.constexpr,
    num_unroll_cache: tl.constexpr,
    num_unroll_request: tl.constexpr,
    SKIP_DECODE: tl.constexpr,
    USE_SINKS: tl.constexpr,
    USE_FP8: tl.constexpr,
    MAX_Q_LEN: tl.constexpr = 0,
    MAX_CTX_LEN: tl.constexpr = 0,
    FP8_MIN: tl.constexpr = float8_info.min,
    FP8_MAX: tl.constexpr = float8_info.max,
):
    # MAX_Q_LEN / MAX_CTX_LEN are not read in the body; they appear only in the
    # `key` of the commented-out autotune config above.
    cur_batch = tl.program_id(0)
    cur_head = tl.program_id(1)
    start_m = tl.program_id(2)

    # Several query heads share one KV head.
    cur_kv_head = cur_head // num_queries_per_kv

    # Per-sequence lengths: total length, query (new token) span, and the
    # already-cached context length as their difference.
    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
    cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
    cur_batch_in_all_stop_index = tl.load(B_Start_Loc + cur_batch + 1)
    cur_batch_query_len = cur_batch_in_all_stop_index - cur_batch_in_all_start_index
    cur_batch_ctx_len = cur_batch_seq_len - cur_batch_query_len

    # Sequences with a single query token are skipped when SKIP_DECODE is set.
    if SKIP_DECODE and cur_batch_query_len == 1:
        return

    # start position inside of the query
    # generally, N goes over kv, while M goes over query_len
    block_start_loc = BLOCK_M * start_m

    # initialize offsets
    # [BLOCK_SIZE]; starts at 0
    offs_bs_n = tl.arange(0, BLOCK_SIZE)
    # [N]; starts at 0
    offs_n = tl.arange(0, BLOCK_N)
    # [D]; starts at 0
    offs_d = tl.arange(0, BLOCK_DMODEL_PADDED)
    # [M]; starts at current position in query
    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
    # [M,D]
    off_q = (
        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs
        + cur_head * stride_qh
        + offs_d[None, :] * stride_qd
    )

    # True for real head-dim lanes, False for the padding up to
    # BLOCK_DMODEL_PADDED.
    dim_mask = tl.where(tl.arange(0, BLOCK_DMODEL_PADDED) < BLOCK_DMODEL, 1, 0).to(
        tl.int1
    )  # [D]

    q = tl.load(
        Q + off_q,
        mask=dim_mask[None, :] & (offs_m[:, None] < cur_batch_query_len),
        other=0.0,
    )  # [M,D]

    # initialize pointer to m and l
    # m_i is the running row maximum. With sinks it starts from the per-head
    # value loaded from sink_ptr instead of -inf.
    if not USE_SINKS:
        m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32)
    else:
        m_i = tl.load(
            sink_ptr + tl.full([BLOCK_M], cur_head, dtype=tl.int64),
            mask=(offs_m < cur_batch_query_len),
            other=float("-inf"),
        ).to(dtype=tl.float32)

    # l_i is the running softmax denominator. NOTE(review): it starts at 1.0;
    # when m_i starts at -inf the first rescale factor exp(m_i - m_ij) is 0 for
    # a finite m_ij, which removes this initial 1.0 - confirm for fully masked
    # rows.
    l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32)
    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL_PADDED], dtype=tl.float32)  # [M,D]

    # compute query against context (no causal mask here)
    for start_n in tl.range(
        0, cur_batch_ctx_len, BLOCK_SIZE, loop_unroll_factor=num_unroll_cache
    ):
        start_n = tl.multiple_of(start_n, BLOCK_SIZE)
        # -- compute qk ----
        # Physical cache block id for this logical block of the sequence.
        bn = tl.load(
            B_Loc
            + cur_batch * stride_b_loc_b
            + (start_n // BLOCK_SIZE) * stride_b_loc_s
        ).to(tl.int64)
        # [D,BLOCK_SIZE]
        # The key cache splits the head dim into (d // x, d % x) with separate
        # strides for each part.
        off_k = (
            bn[None, :] * stride_k_cache_bs
            + cur_kv_head * stride_k_cache_h
            + (offs_d[:, None] // x) * stride_k_cache_d
            + ((start_n + offs_bs_n[None, :]) % BLOCK_SIZE) * stride_k_cache_bl
            + (offs_d[:, None] % x) * stride_k_cache_x
        )

        # [BLOCK_SIZE,D]
        off_v = (
            bn[:, None] * stride_v_cache_bs
            + cur_kv_head * stride_v_cache_h
            + offs_d[None, :] * stride_v_cache_d
            + offs_bs_n[:, None] * stride_v_cache_bl
        )

        # Masked load only for the last (partial) context block or when the
        # head dim is padded; otherwise an unmasked load is used.
        if (
            start_n + BLOCK_SIZE > cur_batch_ctx_len
            or BLOCK_DMODEL != BLOCK_DMODEL_PADDED
        ):
            k_load = tl.load(
                K_cache + off_k,
                mask=dim_mask[:, None]
                & ((start_n + offs_bs_n[None, :]) < cur_batch_ctx_len),
                other=0.0,
            )  # [D,N]
        else:
            k_load = tl.load(K_cache + off_k)

        # FP8 cache entries are dequantized with k_scale into q's dtype.
        if k_load.dtype.is_fp8():
            k = (k_load.to(tl.float32) * tl.load(k_scale)).to(q.dtype)
        else:
            k = k_load

        qk = tl.zeros([BLOCK_M, BLOCK_SIZE], dtype=tl.float32)  # [M,N]
        qk = tl.dot(q, k, acc=qk, input_precision=IN_PRECISION)
        # Positions beyond the context length get -inf.
        qk = tl.where(
            (start_n + offs_bs_n[None, :]) < cur_batch_ctx_len, qk, float("-inf")
        )
        qk *= sm_scale
        if SLIDING_WINDOW > 0:
            # (cur_batch_ctx_len + offs_m[:, None]) are the positions of
            # Q entries in sequence
            # (start_n + offs_bs_n[None, :]) are the positions of
            # KV entries in sequence
            # So the condition makes sure each entry in Q only attends
            # to KV entries not more than SLIDING_WINDOW away.
            #
            # We can't use -inf here, because the
            # sliding window may lead to the entire row being masked.
            # This then makes m_ij contain -inf, which causes NaNs in
            # exp().
            qk = tl.where(
                (cur_batch_ctx_len + offs_m[:, None]) - (start_n + offs_bs_n[None, :])
                < SLIDING_WINDOW,
                qk,
                -10000,
            )

        # compute running maximum
        m_ij = tl.maximum(m_i, tl.max(qk, axis=1))
        p = tl.exp(qk - m_ij[:, None])
        l_ij = tl.sum(p, axis=1)
        # alpha rescales previously accumulated values to the new maximum.
        alpha = tl.exp(m_i - m_ij)
        acc = acc * alpha[:, None]

        # update acc
        if (
            start_n + BLOCK_SIZE > cur_batch_ctx_len
            or BLOCK_DMODEL != BLOCK_DMODEL_PADDED
        ):
            v_load = tl.load(
                V_cache + off_v,
                mask=dim_mask[None, :]
                & ((start_n + offs_bs_n[:, None]) < cur_batch_ctx_len),
                other=0.0,
            )  # [N,D]
        else:
            v_load = tl.load(V_cache + off_v)

        # FP8 cache entries are dequantized with v_scale into q's dtype.
        if v_load.dtype.is_fp8():
            v = (v_load.to(tl.float32) * tl.load(v_scale)).to(q.dtype)
        else:
            v = v_load
        p = p.to(v.dtype)

        acc = tl.dot(p, v, acc=acc, input_precision=IN_PRECISION)
        # # update m_i and l_i
        l_i = l_i * alpha + l_ij
        m_i = m_ij

    # Offsets into the new (non-cached) K and V tensors for this KV head.
    # K is addressed as [D,N], V as [N,D].
    off_k = (
        offs_n[None, :] * stride_kbs
        + cur_kv_head * stride_kh
        + offs_d[:, None] * stride_kd
    )
    off_v = (
        offs_n[:, None] * stride_vbs
        + cur_kv_head * stride_vh
        + offs_d[None, :] * stride_vd
    )
    k_ptrs = K + off_k
    v_ptrs = V + off_v

    # block_mask is 0 when we're already past the current query length
    block_mask = tl.where(block_start_loc < cur_batch_query_len, 1, 0)

    # compute query against itself (with causal mask)
    # The upper bound (start_m + 1) * BLOCK_M stops at the end of this query
    # tile; block_mask makes the loop empty for tiles past the query length.
    for start_n in tl.range(
        0,
        block_mask * (start_m + 1) * BLOCK_M,
        BLOCK_N,
        loop_unroll_factor=num_unroll_request,
    ):
        start_n = tl.multiple_of(start_n, BLOCK_N)
        # -- compute qk ----
        k = tl.load(
            k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs,
            mask=dim_mask[:, None]
            & ((start_n + offs_n[None, :]) < cur_batch_query_len),
            other=0.0,
        )

        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
        qk = tl.dot(q, k, acc=qk, input_precision=IN_PRECISION)
        qk *= sm_scale
        # apply causal mask
        qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, float("-inf"))
        if SLIDING_WINDOW > 0:
            # Same finite sentinel as in the context loop above.
            qk = tl.where(
                offs_m[:, None] - (start_n + offs_n[None, :]) < SLIDING_WINDOW,
                qk,
                -10000,
            )

        # compute running maximum
        m_ij = tl.maximum(m_i, tl.max(qk, axis=1))
        p = tl.exp(qk - m_ij[:, None])
        l_ij = tl.sum(p, axis=1)
        alpha = tl.exp(m_i - m_ij)
        acc = acc * alpha[:, None]

        # update acc
        v = tl.load(
            v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs,
            mask=dim_mask[None, :]
            & ((start_n + offs_n[:, None]) < cur_batch_query_len),
            other=0.0,
        )
        p = p.to(v.dtype)

        acc = tl.dot(p, v, acc=acc, input_precision=IN_PRECISION)
        # update m_i and l_i
        l_i = l_i * alpha + l_ij
        m_i = m_ij

    # Final softmax normalization.
    acc = acc / l_i[:, None]

    # initialize pointers to output
    off_o = (
        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs
        + cur_head * stride_oh
        + offs_d[None, :] * stride_od
    )
    out_ptrs = Out + off_o
    if USE_FP8:
        # Scale by the value at out_scale_inv and clamp to [FP8_MIN, FP8_MAX]
        # before storing.
        acc = acc * tl.load(out_scale_inv)
        acc = tl.clamp(acc, FP8_MIN, FP8_MAX)
    tl.store(
        out_ptrs, acc, mask=dim_mask[None, :] & (offs_m[:, None] < cur_batch_query_len)
    )
    return
cur_head // num_queries_per_kv + + # cur_batch_seq_len: the length of prompts + # cur_batch_ctx_len: the length of prefix + # cur_batch_in_all_start_index: the start id of the dim=0 + cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) + cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) + cur_batch_in_all_stop_index = tl.load(B_Start_Loc + cur_batch + 1) + cur_batch_query_len = cur_batch_in_all_stop_index - cur_batch_in_all_start_index + cur_batch_ctx_len = cur_batch_seq_len - cur_batch_query_len + + if SKIP_DECODE and cur_batch_query_len == 1: + return + + block_start_loc = BLOCK_M * start_m + + # initialize offsets + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_DMODEL_PADDED) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + off_q = ( + (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + + cur_head * stride_qh + + offs_d[None, :] * stride_qd + ) + + dim_mask = tl.where(tl.arange(0, BLOCK_DMODEL_PADDED) < BLOCK_DMODEL, 1, 0).to( + tl.int1 + ) + + q = tl.load( + Q + off_q, + mask=dim_mask[None, :] + & (offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len), + other=0.0, + ) + + # # initialize pointer to m and l + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL_PADDED], dtype=tl.float32) + + alibi_slope = tl.load(Alibi_slopes + cur_head) + alibi_start_q = tl.arange(0, BLOCK_M) + block_start_loc + cur_batch_ctx_len + alibi_start_k = 0 + for start_n in range(0, cur_batch_ctx_len, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + bn = tl.load( + B_Loc + + cur_batch * stride_b_loc_b + + ((start_n + offs_n) // block_size) * stride_b_loc_s, + mask=(start_n + offs_n) < cur_batch_ctx_len, + other=0, + ).to(tl.int64) + off_k = ( + bn[None, :] * stride_k_cache_bs + + cur_kv_head * stride_k_cache_h + + (offs_d[:, None] // x) * stride_k_cache_d + + ((start_n + offs_n[None, :]) % block_size) * 
stride_k_cache_bl + + (offs_d[:, None] % x) * stride_k_cache_x + ) + off_v = ( + bn[:, None] * stride_v_cache_bs + + cur_kv_head * stride_v_cache_h + + offs_d[None, :] * stride_v_cache_d + + (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl + ) + k_load = tl.load( + K_cache + off_k, + mask=dim_mask[:, None] & ((start_n + offs_n[None, :]) < cur_batch_ctx_len), + other=0.0, + ) # [D,N] + + if k_load.dtype.is_fp8(): + k = (k_load.to(tl.float32) * tl.load(k_scale)).to(q.dtype) + else: + k = k_load + + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk = tl.dot(q, k, acc=qk, input_precision=IN_PRECISION) + qk = tl.where( + (start_n + offs_n[None, :]) < cur_batch_ctx_len, qk, float("-inf") + ) + qk *= sm_scale + + # load alibi + alibi = ( + tl.arange(0, BLOCK_N)[None, :] + alibi_start_k - alibi_start_q[:, None] + ) * alibi_slope + alibi = tl.where( + (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len), + alibi, + float("-inf"), + ) + qk += alibi + alibi_start_k += BLOCK_N + + # -- compute m_ij, p, l_ij + m_ij = tl.max(qk, 1) + m_i_new = tl.maximum(m_i, m_ij) + p = tl.math.exp(qk - m_i_new[:, None]) + l_ij = tl.sum(p, 1) + # -- update m_i and l_i + + alpha = tl.math.exp(m_i - m_i_new) + l_i_new = alpha * l_i + l_ij + # -- update output accumulator -- + # scale p + # scale acc + acc_scale = alpha + # acc_scale = l_i / l_i_new * alpha + acc = acc * acc_scale[:, None] + # update acc + v_load = tl.load( + V_cache + off_v, + mask=dim_mask[None, :] & ((start_n + offs_n[:, None]) < cur_batch_ctx_len), + other=0.0, + ) + if v_load.dtype.is_fp8(): + v = (v_load.to(tl.float32) * tl.load(v_scale)).to(q.dtype) + else: + v = v_load + p = p.to(v.dtype) + + acc = tl.dot(p, v, acc=acc, input_precision="ieee") + # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + + off_k = ( + offs_n[None, :] * stride_kbs + + cur_kv_head * stride_kh + + offs_d[:, None] * stride_kd + ) + off_v = ( + offs_n[:, None] * stride_vbs + + cur_kv_head * stride_vh + + offs_d[None, :] * 
stride_vd + ) + k_ptrs = K + off_k + v_ptrs = V + off_v + + block_mask = tl.where(block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0) + + # init alibi + alibi_slope = tl.load(Alibi_slopes + cur_head) + alibi_start_q = tl.arange(0, BLOCK_M) + block_start_loc + cur_batch_ctx_len + alibi_start_k = cur_batch_ctx_len + # # init debugger + # offset_db_q = tl.arange(0, BLOCK_M) + block_start_loc + # offset_db_k = tl.arange(0, BLOCK_N) + # calc q[BLOCK_M, BLOCK_MODEL] mul k[prefix_len: , BLOCK_DMODEL] + for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + k = tl.load( + k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs, + mask=dim_mask[:, None] + & ((start_n + offs_n[None, :]) < cur_batch_seq_len - cur_batch_ctx_len), + other=0.0, + ) + + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk = tl.dot(q, k, acc=qk, input_precision="ieee") + qk *= sm_scale + qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, float("-inf")) + + # load alibi + alibi = ( + tl.arange(0, BLOCK_N)[None, :] + alibi_start_k - alibi_start_q[:, None] + ) * alibi_slope + alibi = tl.where( + (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len), + alibi, + float("-inf"), + ) + qk += alibi + alibi_start_k += BLOCK_N + + # -- compute m_ij, p, l_ij + m_ij = tl.max(qk, 1) + m_i_new = tl.maximum(m_i, m_ij) + p = tl.math.exp(qk - m_i_new[:, None]) + l_ij = tl.sum(p, 1) + # -- update m_i and l_i + + alpha = tl.math.exp(m_i - m_i_new) + l_i_new = alpha * l_i + l_ij + # -- update output accumulator -- + # scale p + # scale acc + acc_scale = alpha + # acc_scale = l_i / l_i_new * alpha + acc = acc * acc_scale[:, None] + # update acc + v = tl.load( + v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs, + mask=dim_mask[None, :] + & ((start_n + offs_n[:, None]) < cur_batch_seq_len - cur_batch_ctx_len), + other=0.0, + ) + p = p.to(v.dtype) + + acc = tl.dot(p, v, acc=acc, 
input_precision="ieee") + # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + + acc = acc / l_i[:, None] + + # initialize pointers to output + off_o = ( + (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + + cur_head * stride_oh + + offs_d[None, :] * stride_od + ) + out_ptrs = Out + off_o + tl.store( + out_ptrs, + acc, + mask=dim_mask[None, :] + & (offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len), + ) + return + + +@torch.inference_mode() +def context_attention_fwd( + q, + k, + v, + o, + kv_cache_dtype: str, + k_cache, + v_cache, + b_loc, + b_start_loc, + b_seq_len, + max_seq_len, + max_input_len, + k_scale: torch.Tensor, + v_scale: torch.Tensor, + alibi_slopes=None, + sliding_window=None, + sm_scale=None, + skip_decode=False, + fp8_out_scale=None, + sinks=None, +): + q_dtype_is_f32 = q.dtype is torch.float32 + + # Turing does have tensor core for float32 multiplication + # use ieee as fallback for triton kernels work. There is also + # warning on vllm/config.py to inform users this fallback + # implementation + IN_PRECISION = "ieee" if IS_TURING and q_dtype_is_f32 else None + + # Conversion of FP8 Tensor from uint8 storage to + # appropriate torch.dtype for interpretation by Triton + if "fp8" in kv_cache_dtype: + assert k_cache.dtype in [torch.uint8, current_platform.fp8_dtype()] + assert v_cache.dtype in [torch.uint8, current_platform.fp8_dtype()] + + if kv_cache_dtype in ("fp8", "fp8_e4m3"): + target_dtype = current_platform.fp8_dtype() + elif kv_cache_dtype == "fp8_e5m2": + target_dtype = torch.float8_e5m2 + else: + raise ValueError("Unsupported FP8 dtype:", kv_cache_dtype) + + k_cache = k_cache.view(target_dtype) + v_cache = v_cache.view(target_dtype) + + if ( + k_cache.dtype == torch.uint8 + or v_cache.dtype == torch.uint8 + and kv_cache_dtype == "auto" + ): + raise ValueError( + "kv_cache_dtype='auto' unsupported for\ + FP8 KV Cache prefill kernel" + ) + + # shape constraints + Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] + 
assert Lq == Lk and Lk == Lv + # round up Lk to a power of 2 - this is required for Triton block size + Lk_padded = triton.next_power_of_2(Lk) + + if sm_scale is None: + sm_scale = 1.0 / (Lq**0.5) + batch, head = b_seq_len.shape[0], q.shape[1] + num_queries_per_kv = q.shape[1] // k.shape[1] + + assert batch + 1 == len(b_start_loc) + + # 0 means "disable" + if sliding_window is None or sliding_window <= 0: + sliding_window = 0 + + if alibi_slopes is not None: + assert sinks is None, "Sinks arg is not supported with alibi" + assert fp8_out_scale is None, "FP8 output not supported with alibi" + # need to reduce num. blocks when using fp32 + # due to increased use of GPU shared memory + # if q.dtype is torch.float32: + BLOCK = BASE_BLOCK // 2 if q_dtype_is_f32 else BASE_BLOCK + # batch, head, + grid = (batch, head, triton.cdiv(max_input_len, BLOCK)) + _fwd_kernel_alibi[grid]( + q, + k, + v, + k_cache, + v_cache, + b_loc, + sm_scale, + k_scale, + v_scale, + b_start_loc, + b_seq_len, + alibi_slopes, + v_cache.shape[3], + k_cache.shape[4], + o, + b_loc.stride(0), + b_loc.stride(1), + q.stride(0), + q.stride(1), + q.stride(2), + k.stride(0), + k.stride(1), + k.stride(2), + v.stride(0), + v.stride(1), + v.stride(2), + o.stride(0), + o.stride(1), + o.stride(2), + k_cache.stride(0), + k_cache.stride(1), + k_cache.stride(2), + k_cache.stride(3), + k_cache.stride(4), # [num_blocks, num_kv_heads, head_size/x, block_size, x] + v_cache.stride(0), + v_cache.stride(1), + v_cache.stride(2), + v_cache.stride(3), # [num_blocks, num_kv_heads, head_size, block_size] + num_queries_per_kv=num_queries_per_kv, + IN_PRECISION=IN_PRECISION, + BLOCK_M=BLOCK, + BLOCK_DMODEL=Lk, + BLOCK_DMODEL_PADDED=Lk_padded, + BLOCK_N=BLOCK, + SKIP_DECODE=skip_decode, + num_warps=NUM_WARPS, + num_stages=1, + ) + return + + max_seq_len = 0 if max_seq_len is None else max_seq_len + extra_kargs = {} + if current_platform.is_rocm(): + extra_kargs = {"kpack": 1, "waves_per_eu": 2} + + grid = lambda META: (batch, 
head, triton.cdiv(max_input_len, META["BLOCK_M"])) + _fwd_kernel[grid]( + q, + k, + v, + k_cache, + v_cache, + sinks, + b_loc, + sm_scale, + k_scale, + v_scale, + 1.0 / fp8_out_scale if fp8_out_scale is not None else 1.0, + b_start_loc, + b_seq_len, + k_cache.shape[4], + o, + b_loc.stride(0), + b_loc.stride(1), + q.stride(0), + q.stride(1), + q.stride(2), + k.stride(0), + k.stride(1), + k.stride(2), + v.stride(0), + v.stride(1), + v.stride(2), + o.stride(0), + o.stride(1), + o.stride(2), + k_cache.stride(0), + k_cache.stride(1), + k_cache.stride(2), + k_cache.stride(3), + k_cache.stride(4), # [num_blocks, num_kv_heads, head_size/x, block_size, x] + v_cache.stride(0), + v_cache.stride(1), + v_cache.stride(2), + v_cache.stride(3), # [num_blocks, num_kv_heads, head_size, block_size] + BLOCK_SIZE=v_cache.shape[3], + num_queries_per_kv=num_queries_per_kv, + IN_PRECISION=IN_PRECISION, + BLOCK_DMODEL=Lk, + BLOCK_DMODEL_PADDED=Lk_padded, + SLIDING_WINDOW=sliding_window, + SKIP_DECODE=skip_decode, + USE_FP8=fp8_out_scale is not None, + BLOCK_M=128, + BLOCK_N=64, + num_unroll_cache=4, + num_unroll_request=1, + num_warps=4, + num_stages=1, + USE_SINKS=sinks is not None, + **extra_kargs, + ) + return diff --git a/attention/ops/rocm_aiter_paged_attn.py b/attention/ops/rocm_aiter_paged_attn.py new file mode 100644 index 0000000..bcd1e2c --- /dev/null +++ b/attention/ops/rocm_aiter_paged_attn.py @@ -0,0 +1,123 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import aiter as rocm_aiter +import torch + +from vllm.attention.ops.paged_attn import PagedAttention +from vllm.platforms import current_platform +from vllm.utils.math_utils import cdiv + +FP8_DTYPE = current_platform.fp8_dtype() + + +class AITERPagedAttention(PagedAttention): + @staticmethod + def write_to_paged_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, + 
kv_cache_dtype: str, + k_scale: torch.Tensor, + v_scale: torch.Tensor, + ) -> None: + if kv_cache_dtype not in ["int8", "fp8", "fp8_e4m3"]: + PagedAttention.write_to_paged_cache( + key, + value, + key_cache, + value_cache, + slot_mapping, + kv_cache_dtype, + k_scale, + v_scale, + ) + else: + kv_cache_torch_dtype = FP8_DTYPE if "fp8" in kv_cache_dtype else torch.int8 + key_cache = key_cache.view(kv_cache_torch_dtype) + value_cache = value_cache.view(kv_cache_torch_dtype) + + rocm_aiter.reshape_and_cache_with_pertoken_quant( + key, + value, + key_cache, + value_cache, + k_scale, + v_scale, + slot_mapping.flatten(), + True, + ) + + @staticmethod + def forward_decode( + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_tables: torch.Tensor, + seq_lens: torch.Tensor, + max_seq_len: int, + kv_cache_dtype: str, + num_kv_heads: int, + scale: float, + alibi_slopes: torch.Tensor | None, + k_scale: torch.Tensor, + v_scale: torch.Tensor, + tp_rank: int = 0, + blocksparse_local_blocks: int = 0, + blocksparse_vert_stride: int = 0, + blocksparse_block_size: int = 64, + blocksparse_head_sliding_step: int = 0, + ) -> torch.Tensor: + if kv_cache_dtype not in ["int8", "fp8", "fp8_e4m3"]: + return PagedAttention.forward_decode( + query=query, + key_cache=key_cache, + value_cache=value_cache, + block_tables=block_tables, + seq_lens=seq_lens, + max_seq_len=max_seq_len, + kv_cache_dtype=kv_cache_dtype, + num_kv_heads=num_kv_heads, + scale=scale, + alibi_slopes=alibi_slopes, + k_scale=k_scale, + v_scale=v_scale, + tp_rank=tp_rank, + blocksparse_local_blocks=blocksparse_local_blocks, + blocksparse_vert_stride=blocksparse_vert_stride, + blocksparse_block_size=blocksparse_block_size, + blocksparse_head_sliding_step=blocksparse_head_sliding_step, + ) + + if "fp8" in kv_cache_dtype: + key_cache = key_cache.view(current_platform.fp8_dtype()) + value_cache = value_cache.view(current_platform.fp8_dtype()) + + if blocksparse_vert_stride is not None and 
blocksparse_vert_stride > 1: + # use blocksparse paged attention + block_size = value_cache.size(-1) + assert ( + blocksparse_block_size > 0 and blocksparse_block_size % block_size == 0 + ), ( + f"{blocksparse_block_size=} needs to be a multiple of" + f"{block_size=} used in block_tables." + ) + + output = torch.empty_like(query) + block_size = value_cache.shape[3] + max_num_blocks_per_seq = cdiv(max_seq_len, block_size) + + rocm_aiter.pa_fwd_asm( + query, + key_cache, + value_cache, + block_tables, + seq_lens, + max_num_blocks_per_seq, + k_scale, + v_scale, + output, + ) + return output diff --git a/attention/ops/triton_decode_attention.py b/attention/ops/triton_decode_attention.py new file mode 100644 index 0000000..aebc2e6 --- /dev/null +++ b/attention/ops/triton_decode_attention.py @@ -0,0 +1,712 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Adapted from +# https://github.com/sgl-project/sglang/blob/9f635ea50de920aa507f486daafba26a5b837574/python/sglang/srt/layers/attention/triton_ops/decode_attention.py +# which was originally adapted from +# https://github.com/ModelTC/lightllm/blob/96353e868a840db4d103138caf15ed9dbea8c186/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage1.py +# https://github.com/ModelTC/lightllm/blob/96353e868a840db4d103138caf15ed9dbea8c186/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage2.py + +# Changes: +# - Add support for page size >= 1. + +# Copyright 2025 vLLM Team +# Copyright 2023-2024 SGLang Team +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +Memory-efficient attention for decoding. +It supports page size >= 1. +""" + +import logging + +from packaging import version + +from vllm.platforms import current_platform +from vllm.triton_utils import tl, triton + +is_hip_ = current_platform.is_rocm() + +logger = logging.getLogger(__name__) + +# Only print the following warnings when triton version < 3.2.0. +# The issue won't affect performance or accuracy. +if version.parse(triton.__version__) < version.parse("3.2.0"): + logger.warning( + "The following error message 'operation scheduled before its operands' " + "can be ignored." 
+ ) + + +@triton.jit +def tanh(x): + # Tanh is just a scaled sigmoid + return 2 * tl.sigmoid(2 * x) - 1 + + +@triton.jit +def _fwd_kernel_stage1( + Q, + K_Buffer, + V_Buffer, + sm_scale, + Req_to_tokens, + B_Seqlen, + Att_Out, + stride_req_to_tokens_b, + stride_qbs, + stride_qh, + stride_buf_kbs, + stride_buf_kh, + stride_buf_vbs, + stride_buf_vh, + stride_mid_ob, + stride_mid_oh, + stride_mid_os, + kv_group_num: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_DV: tl.constexpr, + BLOCK_N: tl.constexpr, + NUM_KV_SPLITS: tl.constexpr, + PAGE_SIZE: tl.constexpr, + logit_cap: tl.constexpr, + Lk: tl.constexpr, + Lv: tl.constexpr, +): + cur_batch = tl.program_id(0) + cur_head = tl.program_id(1) + split_kv_id = tl.program_id(2) + + cur_kv_head = cur_head // kv_group_num + + offs_d = tl.arange(0, BLOCK_DMODEL) + offs_dv = tl.arange(0, BLOCK_DV) + mask_d = offs_d < Lk + mask_dv = offs_dv < Lv + cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) + cur_batch_req_idx = cur_batch + + off_q = cur_batch * stride_qbs + cur_head * stride_qh + offs_d + q = tl.load(Q + off_q, mask=mask_d, other=0.0) + + kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS) + split_kv_start = kv_len_per_split * split_kv_id + split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, cur_batch_seq_len) + + e_max = -float("inf") + e_sum = 0.0 + acc = tl.zeros([BLOCK_DV], dtype=tl.float32) + + if split_kv_end > split_kv_start: + for start_n in range(split_kv_start, split_kv_end, BLOCK_N): + offs_n = start_n + tl.arange(0, BLOCK_N) + kv_page_number = tl.load( + Req_to_tokens + + stride_req_to_tokens_b * cur_batch_req_idx + + offs_n // PAGE_SIZE, + mask=offs_n < split_kv_end, + other=0, + ) + kv_loc = kv_page_number * PAGE_SIZE + offs_n % PAGE_SIZE + offs_buf_k = ( + kv_loc[:, None] * stride_buf_kbs + + cur_kv_head * stride_buf_kh + + offs_d[None, :] + ) + k = tl.load( + K_Buffer + offs_buf_k, + mask=(offs_n[:, None] < split_kv_end) & (mask_d[None, :]), + other=0.0, + ) + qk = tl.sum(q[None, :] 
* k, 1) + qk *= sm_scale + + if logit_cap > 0: + qk = logit_cap * tanh(qk / logit_cap) + + qk = tl.where(offs_n < split_kv_end, qk, float("-inf")) + + offs_buf_v = ( + kv_loc[:, None] * stride_buf_vbs + + cur_kv_head * stride_buf_vh + + offs_dv[None, :] + ) + v = tl.load( + V_Buffer + offs_buf_v, + mask=(offs_n[:, None] < split_kv_end) & (mask_dv[None, :]), + other=0.0, + ) + + n_e_max = tl.maximum(tl.max(qk, 0), e_max) + re_scale = tl.exp(e_max - n_e_max) + p = tl.exp(qk - n_e_max) + acc *= re_scale + acc += tl.sum(p[:, None] * v, 0) + + e_sum = e_sum * re_scale + tl.sum(p, 0) + e_max = n_e_max + + offs_mid_o = ( + cur_batch * stride_mid_ob + + cur_head * stride_mid_oh + + split_kv_id * stride_mid_os + + offs_dv + ) + + tl.store( + Att_Out + offs_mid_o, + acc / e_sum, + mask=(mask_dv), + ) + + offs_mid_o_1 = ( + cur_batch * stride_mid_ob + + cur_head * stride_mid_oh + + split_kv_id * stride_mid_os + + Lv + ) + + tl.store( + Att_Out + offs_mid_o_1, + e_max + tl.log(e_sum), + ) + + +def _decode_att_m_fwd( + q, + k_buffer, + v_buffer, + att_out, + Req_to_tokens, + B_Seqlen, + num_kv_splits, + sm_scale, + page_size, + logit_cap, +): + BLOCK = 64 if not is_hip_ else 8 + + NUM_KV_SPLITS = num_kv_splits + Lk = k_buffer.shape[-1] + Lv = v_buffer.shape[-1] + + batch, head_num = q.shape[0], q.shape[1] + + grid = (batch, head_num, NUM_KV_SPLITS) + kv_group_num = q.shape[1] // k_buffer.shape[-2] + + num_warps = 4 + if kv_group_num != 1: + num_warps = 1 if is_hip_ else 2 + + BLOCK_DMODEL = triton.next_power_of_2(Lk) + BLOCK_DV = triton.next_power_of_2(Lv) + + _fwd_kernel_stage1[grid]( + q, + k_buffer, + v_buffer, + sm_scale, + Req_to_tokens, + B_Seqlen, + att_out, + Req_to_tokens.stride(0), + q.stride(0), + q.stride(1), + k_buffer.stride(-3), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + k_buffer.stride(-2), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + v_buffer.stride(-3), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + v_buffer.stride(-2), # Assume (..., PAGE_SIZE, 
NUM_HEADS, HEAD_DIM) + att_out.stride(0), + att_out.stride(1), + att_out.stride(2), + kv_group_num=kv_group_num, + BLOCK_DMODEL=BLOCK_DMODEL, + BLOCK_DV=BLOCK_DV, + BLOCK_N=BLOCK, + NUM_KV_SPLITS=NUM_KV_SPLITS, + PAGE_SIZE=page_size, + logit_cap=logit_cap, + num_warps=num_warps, + num_stages=2, + Lk=Lk, + Lv=Lv, + ) + + +@triton.jit +def _fwd_grouped_kernel_stage1( + Q, + K_Buffer, + V_Buffer, + sm_scale, + Req_to_tokens, + B_Seqlen, + Att_Out, + stride_req_to_tokens_b, + stride_qbs, + stride_qh, + stride_buf_kbs, + stride_buf_kh, + stride_buf_vbs, + stride_buf_vh, + stride_mid_ob, + stride_mid_oh, + stride_mid_os, + kv_group_num: tl.constexpr, + q_head_num: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_DPE: tl.constexpr, + BLOCK_DV: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_H: tl.constexpr, + NUM_KV_SPLITS: tl.constexpr, + PAGE_SIZE: tl.constexpr, + logit_cap: tl.constexpr, + Lk: tl.constexpr, + Lv: tl.constexpr, +): + cur_batch = tl.program_id(0) + cur_head_id = tl.program_id(1) + cur_kv_head = cur_head_id // tl.cdiv(kv_group_num, BLOCK_H) + split_kv_id = tl.program_id(2) + + if kv_group_num > BLOCK_H: + VALID_BLOCK_H: tl.constexpr = BLOCK_H + else: + VALID_BLOCK_H: tl.constexpr = kv_group_num + cur_head = cur_head_id * VALID_BLOCK_H + tl.arange(0, BLOCK_H) + mask_h = cur_head < (cur_head_id + 1) * VALID_BLOCK_H + mask_h = mask_h & (cur_head < q_head_num) + + offs_d = tl.arange(0, BLOCK_DMODEL) + offs_dv = tl.arange(0, BLOCK_DV) + mask_d = offs_d < Lk + mask_dv = offs_dv < Lv + cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) + cur_batch_req_idx = cur_batch + + offs_q = cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_d[None, :] + q = tl.load(Q + offs_q, mask=(mask_h[:, None]) & (mask_d[None, :]), other=0.0) + + if BLOCK_DPE > 0: + offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE) + mask_dpe = offs_dpe < Lk + off_qpe = ( + cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_dpe[None, :] + ) + qpe = tl.load( + Q + off_qpe, 
mask=(mask_h[:, None]) & (mask_dpe[None, :]), other=0.0 + ) + + kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS) + split_kv_start = kv_len_per_split * split_kv_id + split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, cur_batch_seq_len) + + e_max = tl.zeros([BLOCK_H], dtype=tl.float32) - float("inf") + e_sum = tl.zeros([BLOCK_H], dtype=tl.float32) + acc = tl.zeros([BLOCK_H, BLOCK_DV], dtype=tl.float32) + + if split_kv_end > split_kv_start: + for start_n in range(split_kv_start, split_kv_end, BLOCK_N): + offs_n = start_n + tl.arange(0, BLOCK_N) + kv_page_number = tl.load( + Req_to_tokens + + stride_req_to_tokens_b * cur_batch_req_idx + + offs_n // PAGE_SIZE, + mask=offs_n < split_kv_end, + other=0, + ) + kv_loc = kv_page_number * PAGE_SIZE + offs_n % PAGE_SIZE + offs_buf_k = ( + kv_loc[None, :] * stride_buf_kbs + + cur_kv_head * stride_buf_kh + + offs_d[:, None] + ) + k = tl.load( + K_Buffer + offs_buf_k, + mask=(offs_n[None, :] < split_kv_end) & (mask_d[:, None]), + other=0.0, + ) + qk = tl.dot(q, k.to(q.dtype)) + if BLOCK_DPE > 0: + offs_buf_kpe = ( + kv_loc[None, :] * stride_buf_kbs + + cur_kv_head * stride_buf_kh + + offs_dpe[:, None] + ) + kpe = tl.load( + K_Buffer + offs_buf_kpe, + mask=(offs_n[None, :] < split_kv_end) & (mask_dpe[:, None]), + other=0.0, + ) + qk += tl.dot(qpe, kpe.to(qpe.dtype)) + qk *= sm_scale + + if logit_cap > 0: + qk = logit_cap * tanh(qk / logit_cap) + + qk = tl.where( + mask_h[:, None] & (offs_n[None, :] < split_kv_end), qk, float("-inf") + ) + + offs_buf_v = ( + kv_loc[:, None] * stride_buf_vbs + + cur_kv_head * stride_buf_vh + + offs_dv[None, :] + ) + v = tl.load( + V_Buffer + offs_buf_v, + mask=(offs_n[:, None] < split_kv_end) & (mask_dv[None, :]), + other=0.0, + ) + + n_e_max = tl.maximum(tl.max(qk, 1), e_max) + re_scale = tl.exp(e_max - n_e_max) + p = tl.exp(qk - n_e_max[:, None]) + acc *= re_scale[:, None] + acc += tl.dot(p.to(v.dtype), v) + + e_sum = e_sum * re_scale + tl.sum(p, 1) + e_max = n_e_max + + 
offs_mid_o = ( + cur_batch * stride_mid_ob + + cur_head[:, None] * stride_mid_oh + + split_kv_id * stride_mid_os + + offs_dv[None, :] + ) + + tl.store( + Att_Out + offs_mid_o, + acc / e_sum[:, None], + mask=(mask_h[:, None]) & (mask_dv[None, :]), + ) + + offs_mid_o_1 = ( + cur_batch * stride_mid_ob + + cur_head * stride_mid_oh + + split_kv_id * stride_mid_os + + Lv + ) + + tl.store( + Att_Out + offs_mid_o_1, + e_max + tl.log(e_sum), + mask=mask_h, + ) + + +def _decode_grouped_att_m_fwd( + q, + k_buffer, + v_buffer, + att_out, + Req_to_tokens, + B_Seqlen, + num_kv_splits, + sm_scale, + page_size, + logit_cap, +): + BLOCK = 32 + Lk = k_buffer.shape[-1] + Lv = v_buffer.shape[-1] + + # [TODO] work around shmem limit on MI3xx + if is_hip_ and Lk >= 576: + BLOCK = 16 + + if Lk == 576: + BLOCK_DMODEL = 512 + BLOCK_DPE = 64 + elif Lk == 288: + BLOCK_DMODEL = 256 + BLOCK_DPE = 32 + else: + BLOCK_DMODEL = triton.next_power_of_2(Lk) + BLOCK_DPE = 0 + BLOCK_DV = triton.next_power_of_2(Lv) + + batch, head_num = q.shape[0], q.shape[1] + kv_group_num = q.shape[1] // k_buffer.shape[-2] + + BLOCK_H = 16 + NUM_KV_SPLITS = num_kv_splits + grid = ( + batch, + triton.cdiv(head_num, min(BLOCK_H, kv_group_num)), + NUM_KV_SPLITS, + ) + + extra_kargs = {} + num_stages = 2 + if is_hip_: + # https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/inference-optimization/workload.html#mi300x-triton-kernel-performance-optimization + # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py + extra_kargs = {"waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2} + num_stages = 1 + + _fwd_grouped_kernel_stage1[grid]( + q, + k_buffer, + v_buffer, + sm_scale, + Req_to_tokens, + B_Seqlen, + att_out, + Req_to_tokens.stride(0), + q.stride(0), + q.stride(1), + k_buffer.stride(-3), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + k_buffer.stride(-2), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + v_buffer.stride(-3), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + 
v_buffer.stride(-2), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + att_out.stride(0), + att_out.stride(1), + att_out.stride(2), + kv_group_num=kv_group_num, + q_head_num=head_num, + BLOCK_DMODEL=BLOCK_DMODEL, + BLOCK_DPE=BLOCK_DPE, + BLOCK_DV=BLOCK_DV, + BLOCK_N=BLOCK, + BLOCK_H=BLOCK_H, + NUM_KV_SPLITS=NUM_KV_SPLITS, + PAGE_SIZE=page_size, + logit_cap=logit_cap, + num_warps=4, + num_stages=num_stages, + Lk=Lk, + Lv=Lv, + **extra_kargs, + ) + + +@triton.jit +def _fwd_kernel_stage2( + Mid_O, + o, + lse, + B_Seqlen, + stride_mid_ob, + stride_mid_oh, + stride_mid_os, + stride_obs, + stride_oh, + stride_lse_bs, + NUM_KV_SPLITS: tl.constexpr, + BLOCK_DV: tl.constexpr, + Lv: tl.constexpr, +): + cur_batch = tl.program_id(0) + cur_head = tl.program_id(1) + + cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) + + offs_d = tl.arange(0, BLOCK_DV) + mask_d = offs_d < Lv + + e_sum = 0.0 + e_max = -float("inf") + acc = tl.zeros([BLOCK_DV], dtype=tl.float32) + + offs_v = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + offs_d + offs_logic = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + Lv + + for split_kv_id in range(0, NUM_KV_SPLITS): + kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS) + split_kv_start = kv_len_per_split * split_kv_id + split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, cur_batch_seq_len) + + if split_kv_end > split_kv_start: + tv = tl.load( + Mid_O + offs_v + split_kv_id * stride_mid_os, mask=mask_d, other=0.0 + ) + tlogic = tl.load(Mid_O + offs_logic + split_kv_id * stride_mid_os) + n_e_max = tl.maximum(tlogic, e_max) + + old_scale = tl.exp(e_max - n_e_max) + acc *= old_scale + exp_logic = tl.exp(tlogic - n_e_max) + acc += exp_logic * tv + + e_sum = e_sum * old_scale + exp_logic + e_max = n_e_max + + tl.store( + o + cur_batch * stride_obs + cur_head * stride_oh + offs_d, + acc / e_sum, + mask=mask_d, + ) + lse_val = e_max + tl.log(e_sum) + tl.store( + lse + cur_batch * stride_lse_bs + cur_head, + lse_val, + ) + + +def 
_decode_softmax_reducev_fwd( + logits, + q, + o, + lse, + v_buffer, + b_seq_len, + num_kv_splits, +): + batch, head_num = q.shape[0], q.shape[1] + Lv = v_buffer.shape[-1] + BLOCK_DV = triton.next_power_of_2(Lv) + + NUM_KV_SPLITS = num_kv_splits + + extra_kargs = {} + if is_hip_: + # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html + # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py + extra_kargs = {"waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2} + + grid = (batch, head_num) + _fwd_kernel_stage2[grid]( + logits, + o, + lse, + b_seq_len, + logits.stride(0), + logits.stride(1), + logits.stride(2), + o.stride(0), + o.stride(1), + lse.stride(0), + NUM_KV_SPLITS=NUM_KV_SPLITS, + BLOCK_DV=BLOCK_DV, + Lv=Lv, + num_warps=4, + num_stages=2, + **extra_kargs, + ) + + +def decode_attention_fwd_normal( + q, + k_buffer, + v_buffer, + o, + lse, + req_to_token, + b_seq_len, + attn_logits, + num_kv_splits, + sm_scale, + page_size, + logit_cap=0.0, +): + _decode_att_m_fwd( + q, + k_buffer, + v_buffer, + attn_logits, + req_to_token, + b_seq_len, + num_kv_splits, + sm_scale, + page_size, + logit_cap, + ) + _decode_softmax_reducev_fwd( + attn_logits, q, o, lse, v_buffer, b_seq_len, num_kv_splits + ) + + +def decode_attention_fwd_grouped( + q, + k_buffer, + v_buffer, + o, + lse, + req_to_token, + b_seq_len, + attn_logits, + num_kv_splits, + sm_scale, + page_size, + logit_cap=0.0, +): + _decode_grouped_att_m_fwd( + q, + k_buffer, + v_buffer, + attn_logits, + req_to_token, + b_seq_len, + num_kv_splits, + sm_scale, + page_size, + logit_cap, + ) + _decode_softmax_reducev_fwd( + attn_logits, q, o, lse, v_buffer, b_seq_len, num_kv_splits + ) + + +def decode_attention_fwd( + q, + k_buffer, + v_buffer, + o, + lse, + req_to_token, + b_seq_len, + attn_logits, + num_kv_splits, + sm_scale, + page_size=1, + logit_cap=0.0, +): + assert num_kv_splits == attn_logits.shape[2] + kv_group_num = 
q.shape[1] // v_buffer.shape[-2] + + if kv_group_num == 1: + # MHA + decode_attention_fwd_normal( + q, + k_buffer, + v_buffer, + o, + lse, + req_to_token, + b_seq_len, + attn_logits, + num_kv_splits, + sm_scale, + page_size, + logit_cap, + ) + else: + # GQA/MQA/MLA + decode_attention_fwd_grouped( + q, + k_buffer, + v_buffer, + o, + lse, + req_to_token, + b_seq_len, + attn_logits, + num_kv_splits, + sm_scale, + page_size, + logit_cap, + ) diff --git a/attention/ops/triton_merge_attn_states.py b/attention/ops/triton_merge_attn_states.py new file mode 100644 index 0000000..3c87a24 --- /dev/null +++ b/attention/ops/triton_merge_attn_states.py @@ -0,0 +1,105 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +from vllm.triton_utils import tl, triton + + +# Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005 +# can be used to combine partial attention results (in the split-KV case) +def merge_attn_states( + output: torch.Tensor, + prefix_output: torch.Tensor, + prefix_lse: torch.Tensor, + suffix_output: torch.Tensor, + suffix_lse: torch.Tensor, + output_lse: torch.Tensor | None = None, +) -> None: + num_tokens = output.shape[0] + num_query_heads = output.shape[1] + head_size = output.shape[2] + padded_head_size = triton.next_power_of_2(head_size) + + # TODO(woosuk): Use CUDA kernel instead of Triton to minimize CPU overhead. 
+ merge_attn_states_kernel[(num_tokens, num_query_heads)]( + output, + output_lse, + prefix_output, + prefix_lse, + suffix_output, + suffix_lse, + head_size, + padded_head_size, + output_lse is not None, + ) + + +@triton.jit +def merge_attn_states_kernel( + output, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] + output_lse, # [NUM_HEADS, NUM_TOKENS] + prefix_output, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] + prefix_lse, # [NUM_HEADS, NUM_TOKENS] + suffix_output, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] + suffix_lse, # [NUM_HEADS, NUM_TOKENS] + HEAD_SIZE: tl.constexpr, + PADDED_HEAD_SIZE: tl.constexpr, + OUTPUT_LSE: tl.constexpr, +): + token_idx = tl.program_id(0) + num_tokens = tl.num_programs(0) + head_idx = tl.program_id(1) + num_heads = tl.num_programs(1) + + p_lse = tl.load(prefix_lse + head_idx * num_tokens + token_idx) + s_lse = tl.load(suffix_lse + head_idx * num_tokens + token_idx) + + # FA2 and FA3 have different behavior for when the sum-exp is 0, this namely + # arises with 0 len seqlens. FA3 returns -inf here while FA2 returns inf. + # If we see an inf assume FA2 and convert inf to -inf for consistency + # and correctness. Inf generally doesn't make sense in this context outside + # of undefined-behavior/FA2-case, so I think this a safe assumption. + p_lse = float("-inf") if p_lse == float("inf") else p_lse + s_lse = float("-inf") if s_lse == float("inf") else s_lse + + max_lse = tl.maximum(p_lse, s_lse) + p_lse = p_lse - max_lse + s_lse = s_lse - max_lse + # Will reuse precomputed Exp values for scale factor computation. 
+ p_se = tl.exp(p_lse) + s_se = tl.exp(s_lse) + out_se = p_se + s_se + + if OUTPUT_LSE: + out_lse = tl.log(out_se) + max_lse + tl.store(output_lse + head_idx * num_tokens + token_idx, out_lse) + + head_arange = tl.arange(0, PADDED_HEAD_SIZE) + head_mask = head_arange < HEAD_SIZE + p_out = tl.load( + prefix_output + + token_idx * num_heads * HEAD_SIZE + + head_idx * HEAD_SIZE + + head_arange, + mask=head_mask, + ) + s_out = tl.load( + suffix_output + + token_idx * num_heads * HEAD_SIZE + + head_idx * HEAD_SIZE + + head_arange, + mask=head_mask, + ) + + # NOTE(woosuk): Be careful with the numerical stability. + # We should compute the scale first, and then multiply it with the output. + # Do not multiply the output with tl.exp(p_lse) or tl.exp(s_lse) directly. + p_scale = p_se / out_se + s_scale = s_se / out_se + out = p_out * p_scale + s_out * s_scale + tl.store( + output + token_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE + head_arange, + out, + mask=head_mask, + ) diff --git a/attention/ops/triton_reshape_and_cache_flash.py b/attention/ops/triton_reshape_and_cache_flash.py new file mode 100644 index 0000000..5d2ba15 --- /dev/null +++ b/attention/ops/triton_reshape_and_cache_flash.py @@ -0,0 +1,184 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +from vllm.platforms import current_platform +from vllm.triton_utils import tl, triton + + +@triton.jit +def reshape_and_cache_kernel_flash( + key_ptr, # [num_tokens, num_heads, head_size] + value_ptr, # [num_tokens, num_heads, head_size] + key_cache_ptr, # [num_blocks, block_size, num_heads, head_size] + value_cache_ptr, # [num_blocks, block_size, num_heads, head_size] + slot_mapping_ptr, # [num_tokens] + k_scale, # float32 + v_scale, # float32 + # strides + key_stride: tl.int64, + value_stride: tl.int64, + block_stride: tl.int64, + page_stride: tl.int64, + num_heads: tl.constexpr, + head_size: tl.constexpr, + block_size: tl.constexpr, 
+ # FP8 flags + FP8_KV_CACHE: tl.constexpr, + # tune parameters + TILE_SIZE: tl.constexpr, +): + token_idx = tl.program_id(axis=0) + slot_idx = tl.load(slot_mapping_ptr + token_idx).to(tl.int64) + if slot_idx < 0: + # Padding token that should be ignored. + return + + tile_i = tl.program_id(axis=1) + tile_offs = tl.arange(0, TILE_SIZE) + tile_pos = tile_i * TILE_SIZE + tile_offs + + block_idx = slot_idx // block_size + block_offset = slot_idx % block_size + + src_key_idx = token_idx * key_stride + src_value_idx = token_idx * value_stride + + tgt_idx = block_idx * block_stride + block_offset * page_stride + + # [TILE_SIZE] + key_load = tl.load( + key_ptr + src_key_idx + tile_pos, mask=tile_pos < (num_heads * head_size) + ) + if FP8_KV_CACHE: + # tl.store will do the correct implicit cast to fp8, + # based on the key_cache_ptr.dtype.element_ty + key_tile = key_load if key_load.dtype.is_fp8() else key_load / tl.load(k_scale) + else: + key_tile = key_load + + # [TILE_SIZE] + value_load = tl.load( + value_ptr + src_value_idx + tile_pos, mask=tile_pos < (num_heads * head_size) + ) + if FP8_KV_CACHE: + if value_load.dtype.is_fp8(): + value_tile = value_load + else: + # tl.store will do the correct implicit cast to fp8, + # based on the value_cache_ptr.dtype.element_ty + value_tile = value_load / tl.load(v_scale) + else: + value_tile = value_load + + tl.store( + key_cache_ptr + tgt_idx + tile_pos, + key_tile, + mask=tile_pos < (num_heads * head_size), + ) + tl.store( + value_cache_ptr + tgt_idx + tile_pos, + value_tile, + mask=tile_pos < (num_heads * head_size), + ) + return + + +def triton_reshape_and_cache_flash( + key: torch.Tensor, # [num_tokens, num_heads, head_size] + value: torch.Tensor, # [num_tokens, num_heads, head_size] + # [num_blocks, block_size, num_heads, head_size] + key_cache: torch.Tensor, + # [num_blocks, block_size, num_heads, head_size] + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, # [num_tokens] + kv_cache_dtype: str, # "auto", "fp8" + 
+ k_scale: torch.Tensor, # float32 + v_scale: torch.Tensor, # float32 +): + num_heads = key.shape[1] + head_size = key.shape[2] + block_size = key_cache.shape[1] + n = num_heads * head_size + + key_stride = key.stride()[0] + value_stride = value.stride()[0] + block_stride = key_cache.stride()[0] + page_stride = key_cache.stride()[1] + + head_stride = key_cache.stride()[2] + assert head_stride == head_size, "only continous heads are supported" + + assert kv_cache_dtype == "auto" or kv_cache_dtype.startswith("fp8"), ( + f"unsupported kv_cache_dtype (str), got {kv_cache_dtype}." + ) + kv_cache_torch_dtype = ( + current_platform.fp8_dtype() + if kv_cache_dtype.startswith("fp8") + else key_cache.dtype + ) + + if key_cache.dtype != kv_cache_torch_dtype and kv_cache_dtype.startswith("fp8"): + # to avoid erroneous implicit cast in triton kernel (tl.store to uint8) + # (e.g. explicit cast to fp8e4m3fnuz is not supported in triton 3.4) + key_cache = key_cache.view(kv_cache_torch_dtype) + value_cache = value_cache.view(kv_cache_torch_dtype) + assert key_cache.dtype != torch.uint8, ( + "explicit fp8 cast and store to " + "uint8 is not supported by triton reshape_and_cache_flash" + ) + + FP8_KV_CACHE = kv_cache_dtype.startswith("fp8") + assert (not FP8_KV_CACHE) or kv_cache_torch_dtype in [ + torch.float8_e4m3fn, + torch.float8_e5m2, + torch.uint8, + torch.float8_e4m3fnuz, + ], ( + "unsupported dtype of KV cache tensor, got " + f"{kv_cache_torch_dtype}. Supported kv cache dtypes: fp8e4m3fn, " + "fp8e5m2, uint8, bfloat16, float16, float32, fp8e4m3fnuz." 
+ ) + + # heuristics instead of autotuning + TILE_SIZE = min(2048, triton.next_power_of_2(n)) + if current_platform.is_rocm() or current_platform.is_xpu(): + num_stages = 4 + num_warps = 8 + else: # cuda + num_stages = 10 + num_warps = 16 + if torch.cuda.get_device_capability(key.device)[0] < 9: + TILE_SIZE = min(512, TILE_SIZE) + + # TODO(ngl): maybe replace with static launch grid to avoid overhead if + # using cudagraphs + grid = lambda meta: ( + slot_mapping.shape[0], + triton.cdiv(n, meta["TILE_SIZE"]), + ) + + reshape_and_cache_kernel_flash[grid]( + key_ptr=key, + value_ptr=value, + key_cache_ptr=key_cache, + value_cache_ptr=value_cache, + slot_mapping_ptr=slot_mapping, + k_scale=k_scale, + v_scale=v_scale, + # strides + key_stride=key_stride, + value_stride=value_stride, + block_stride=block_stride, + page_stride=page_stride, + num_heads=num_heads, + head_size=head_size, + block_size=block_size, + # FP8 flags + FP8_KV_CACHE=FP8_KV_CACHE, + # autotune parameters + TILE_SIZE=TILE_SIZE, + num_warps=num_warps, + num_stages=num_stages, + ) diff --git a/attention/ops/triton_unified_attention.py b/attention/ops/triton_unified_attention.py new file mode 100644 index 0000000..565be1c --- /dev/null +++ b/attention/ops/triton_unified_attention.py @@ -0,0 +1,941 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Authors: +# - Burkhard Ringlein +# - Jan van Lunteren +# - Chih-Chieh Yang +# - Thomas Parnell + +import torch + +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.triton_utils import tl, triton + +logger = init_logger(__name__) +float8_info = torch.finfo(current_platform.fp8_dtype()) + + +@triton.jit +def cdiv_fn(x, y): + return (x + y - 1) // y + + +@triton.jit +def apply_softcap(S, x): + Sdiv = S / x + p1 = tl.exp(Sdiv) + p2 = tl.exp(-Sdiv) + return x * (p1 - p2) / (p1 + p2) + + +@triton.jit +def find_seq_idx( + query_start_len_ptr, + target_idx, + 
num_seqs, + BLOCK_Q: tl.constexpr, + use_q_block_mode: tl.constexpr, +): + left: tl.int32 = 0 + right = num_seqs + while left < right: + mid = (left + right) // 2 + val = tl.load(query_start_len_ptr + mid) + mid_val = val // BLOCK_Q + mid if use_q_block_mode else val + + if mid_val <= target_idx: + left = mid + 1 + else: + right = mid + + return left - 1 + + +@triton.jit +def kernel_unified_attention_2d( + output_ptr, # [num_tokens, num_query_heads, head_size] + query_ptr, # [num_tokens, num_query_heads, head_size] + key_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + value_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + sink_ptr, # [num_query_heads] + block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes_ptr, # [num_query_heads] + qq_bias_ptr, # [num_query_tokens, num_query_tokens] + scale, # float32 + k_scale, # float32 + v_scale, # float32 + out_scale, # float32 + softcap, # float32 + num_query_heads: tl.constexpr, # int + num_queries_per_kv: tl.constexpr, # int + block_table_stride: tl.int64, # int + query_stride_0: tl.int64, # int + query_stride_1: tl.int64, # int, should be equal to head_size + output_stride_0: tl.int64, # int + output_stride_1: tl.int64, # int, should be equal to head_size + qq_bias_stride_0: tl.int64, # int + BLOCK_SIZE: tl.constexpr, # int + TILE_SIZE: tl.constexpr, # int must be power of 2 + HEAD_SIZE: tl.constexpr, # int + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + USE_ALIBI_SLOPES: tl.constexpr, # bool + USE_QQ_BIAS: tl.constexpr, # bool + USE_SOFTCAP: tl.constexpr, # bool + USE_SINKS: tl.constexpr, # bool + SLIDING_WINDOW: tl.constexpr, # int + stride_k_cache_0: tl.int64, # int + stride_k_cache_1: tl.int64, # int + stride_k_cache_2: tl.int64, # int + stride_k_cache_3: tl.constexpr, # int + stride_v_cache_0: tl.int64, # int + stride_v_cache_1: tl.int64, # int + stride_v_cache_2: tl.int64, # int + stride_v_cache_3: tl.constexpr, # int + 
query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + num_seqs: tl.int32, + BLOCK_M: tl.constexpr, # int + USE_FP8: tl.constexpr, # bool + FP8_MIN: tl.constexpr = float8_info.min, + FP8_MAX: tl.constexpr = float8_info.max, +): + q_block_global_idx = tl.program_id(0) + kv_head_idx = tl.program_id(1) + + seq_idx = find_seq_idx( + query_start_len_ptr, q_block_global_idx, num_seqs, BLOCK_Q, True + ) + + q_block_start_idx = tl.load(query_start_len_ptr + seq_idx) // BLOCK_Q + seq_idx + + q_block_local_idx = q_block_global_idx - q_block_start_idx + + cur_batch_in_all_start_index = tl.load(query_start_len_ptr + seq_idx) + cur_batch_in_all_stop_index = tl.load(query_start_len_ptr + seq_idx + 1) + + cur_batch_query_len = cur_batch_in_all_stop_index - cur_batch_in_all_start_index + + if q_block_local_idx * BLOCK_Q >= cur_batch_query_len: + return + + offs_m = tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, HEAD_SIZE_PADDED) + offs_t = tl.arange(0, TILE_SIZE) + query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv + + query_offset_0 = cur_batch_in_all_start_index + query_pos + query_offset_1 = kv_head_idx * num_queries_per_kv + offs_m % num_queries_per_kv + query_offset = ( + query_offset_0[:, None] * query_stride_0 + + query_offset_1[:, None] * query_stride_1 + + offs_d[None, :] + ) + + dim_mask = tl.where(offs_d < HEAD_SIZE, 1, 0).to(tl.int1) + query_mask_0 = tl.where(query_pos < cur_batch_query_len, 1, 0).to(tl.int1) + query_mask_1 = tl.where(query_offset_1 < num_query_heads, 1, 0).to(tl.int1) + + # Q : (BLOCK_M, HEAD_SIZE_PADDED) + Q = tl.load( + query_ptr + query_offset, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + other=0.0, + ) + + block_table_offset = seq_idx * block_table_stride + + if not USE_SINKS: + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + else: + M = tl.load( + sink_ptr + query_offset_1, + mask=query_mask_1, + other=float("-inf"), + ).to(dtype=tl.float32) + + L = tl.full([BLOCK_M], 1.0, 
dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32) + + # sequence len for this particular sequence + seq_len = tl.load(seq_lens_ptr + seq_idx) + + # context length for this particular sequences + context_len = seq_len - cur_batch_query_len + + # alibi slope for this head + if USE_ALIBI_SLOPES: + alibi_slope = tl.load( + alibi_slopes_ptr + query_offset_1, mask=query_mask_1, other=0.0 + ) + + # query-query attention bias + if USE_QQ_BIAS: + qq_bias_row_ptrs = ( + qq_bias_ptr + query_pos[:, None] * qq_bias_stride_0 + ) # shape: [BLOCK_M] + + # compute the length of the longest sequence prefix spanned by any + # query token in the current q_block (q_block_local_idx) + max_seq_prefix_len = ( + context_len + + q_block_local_idx * BLOCK_Q + + (BLOCK_M - 1) // num_queries_per_kv + + 1 + ) + + # adjust for potential padding in the last q_block by considering the + # actual sequence length + max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len) + + # calculate the number of tiles that need to be processed to + # cover the longest sequence prefix (due to causal masking, tiles beyond + # this prefix can be skipped) + num_tiles = cdiv_fn(max_seq_prefix_len, TILE_SIZE) + + # ---- Sliding-window tile pruning -------------------- + # Default: keep previous global behavior + tile_start = 0 + tile_end = num_tiles + if SLIDING_WINDOW > 0: + # Query rows covered by this Q-block + qpos_lo = q_block_local_idx * BLOCK_Q + qpos_hi = tl.minimum( + qpos_lo + (BLOCK_M - 1) // num_queries_per_kv, + cur_batch_query_len - 1, + ) + # For sliding window, each query position q can only attend to + # keys in the range [q_abs - SLIDING_WINDOW + 1, q_abs] + # where q_abs = context_len + q + # The union of allowed key positions for this Q-block is: + # [context_len + qpos_lo - SLIDING_WINDOW + 1, context_len + qpos_hi] + first_allowed_key = context_len + qpos_lo - SLIDING_WINDOW + 1 + last_allowed_key = context_len + qpos_hi + # Convert to tile indices and clamp + 
tile_start = tl.maximum(0, first_allowed_key // TILE_SIZE) + tile_end = tl.minimum((last_allowed_key // TILE_SIZE) + 1, num_tiles) + + # iterate through tiles (now limited to the sliding window range) + for j in range(tile_start, tile_end): + seq_offset = j * TILE_SIZE + offs_t + tile_mask = seq_offset < max_seq_prefix_len + + physical_block_idx = tl.load( + block_tables_ptr + block_table_offset + seq_offset // BLOCK_SIZE + ).to(tl.int64) + + v_offset = ( + physical_block_idx[:, None] * stride_v_cache_0 + + kv_head_idx * stride_v_cache_2 + + offs_d[None, :] * stride_v_cache_3 + + (seq_offset % BLOCK_SIZE)[:, None] * stride_v_cache_1 + ) + + k_offset = ( + physical_block_idx[None, :] * stride_k_cache_0 + + kv_head_idx * stride_k_cache_2 + + offs_d[:, None] * stride_k_cache_3 + + (seq_offset % BLOCK_SIZE)[None, :] * stride_k_cache_1 + ) + + # K : (HEAD_SIZE, TILE_SIZE) + K_load = tl.load( + key_cache_ptr + k_offset, + mask=dim_mask[:, None] & tile_mask[None, :], + other=0.0, + ) + + if K_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + K = K_load + else: + K = (K_load.to(tl.float32) * tl.load(k_scale)).to(Q.dtype) + else: + K = K_load + + # V : (TILE_SIZE, HEAD_SIZE) + V_load = tl.load( + value_cache_ptr + v_offset, + mask=dim_mask[None, :] & tile_mask[:, None], + other=0.0, + ) + + if V_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + V = V_load + else: + V = (V_load.to(tl.float32) * tl.load(v_scale)).to(Q.dtype) + else: + V = V_load + + seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1 + + # S : (BLOCK_M, TILE_SIZE) + S = tl.zeros(shape=(BLOCK_M, TILE_SIZE), dtype=tl.float32) + + S += scale * tl.dot(Q, K) + + if USE_SOFTCAP: + S = apply_softcap(S, softcap) + + S = tl.where( + query_mask_1[:, None] & query_mask_0[:, None] & seq_mask, S, float("-inf") + ) + + if SLIDING_WINDOW > 0: + S = tl.where( + (context_len + query_pos[:, None] - seq_offset) < SLIDING_WINDOW, + S, + float("-inf"), + ) + + if USE_ALIBI_SLOPES: + S += alibi_slope[:, None] * 
(seq_offset - context_len) + + if USE_QQ_BIAS: + # compute key positions relative to query section + key_rel_pos = seq_offset - context_len # shape: [BLOCK_SIZE] + # load bias only for keys that correspond to queries + is_query_key = key_rel_pos >= 0 and key_rel_pos < qq_bias_stride_0 + qq_bias = tl.load( + qq_bias_row_ptrs + key_rel_pos[None, :], + mask=is_query_key[None, :], # avoid OOB for context keys + other=0.0, + ) + S += qq_bias + + # compute running maximum + # m_j : (BLOCK_M,) + m_j = tl.maximum(M, tl.max(S, axis=1)) + + # For sliding window there's a chance the max is -inf due to masking of + # the entire row. In this case we need to set m_j 0 to avoid NaN + m_j = tl.where(m_j > float("-inf"), m_j, 0.0) + + # P : (BLOCK_M, TILE_SIZE) + P = tl.exp(S - m_j[:, None]) + + # l_j : (BLOCK_M,) + l_j = tl.sum(P, axis=1) + + # alpha : (BLOCK_M, ) + alpha = tl.exp(M - m_j) + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc = acc * alpha[:, None] + + # update constants + L = L * alpha + l_j + M = m_j + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc += tl.dot(P.to(V.dtype), V) + + # epilogue + acc = acc / L[:, None] + if USE_FP8: + acc = acc * tl.load(out_scale) + acc = tl.clamp(acc, FP8_MIN, FP8_MAX) + + output_offset = ( + query_offset_0[:, None] * output_stride_0 + + query_offset_1[:, None] * output_stride_1 + + offs_d[None, :] + ) + + tl.store( + output_ptr + output_offset, + acc, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + ) + + +@triton.jit +def kernel_unified_attention_3d( + segm_output_ptr, + # [num_tokens, num_query_heads, num_segments, head_size] + segm_max_ptr, # [num_tokens, num_query_heads, num_segments] + segm_expsum_ptr, # [num_tokens, num_query_heads, num_segments] + query_ptr, # [num_tokens, num_query_heads, head_size] + key_cache_ptr, # [num_blks, num_kv_heads, head_size // x, blk_size, x] + value_cache_ptr, # [num_blks, num_kv_heads, head_size, blk_size] + sink_ptr, # [num_query_heads] + block_tables_ptr, # [num_seqs, 
max_num_blocks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes_ptr, # [num_query_heads] + qq_bias_ptr, # [num_query_tokens, num_query_tokens] + scale, # float32 + k_scale, # float32 + v_scale, # float32 + softcap, # float32 + num_query_heads: tl.constexpr, # int + num_queries_per_kv: tl.constexpr, # int + block_table_stride: tl.int64, # int + query_stride_0: tl.int64, # int + query_stride_1: tl.int64, # int, should be equal to head_size + qq_bias_stride_0: tl.int64, # int + BLOCK_SIZE: tl.constexpr, # int + TILE_SIZE: tl.constexpr, # int, must be power of 2 + HEAD_SIZE: tl.constexpr, # int + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + USE_ALIBI_SLOPES: tl.constexpr, # bool + USE_QQ_BIAS: tl.constexpr, # bool + USE_SOFTCAP: tl.constexpr, # bool + USE_SINKS: tl.constexpr, # bool + SLIDING_WINDOW: tl.constexpr, # int + stride_k_cache_0: tl.int64, # int + stride_k_cache_1: tl.int64, # int + stride_k_cache_2: tl.int64, # int + stride_k_cache_3: tl.constexpr, # int + stride_v_cache_0: tl.int64, # int + stride_v_cache_1: tl.int64, # int + stride_v_cache_2: tl.int64, # int + stride_v_cache_3: tl.constexpr, # int + query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + num_seqs: tl.int32, + BLOCK_M: tl.constexpr, # int + NUM_SEGMENTS_PER_SEQ: tl.constexpr, # int +): + q_block_global_idx = tl.program_id(0) + kv_head_idx = tl.program_id(1) + segm_idx = tl.program_id(2) + + seq_idx = find_seq_idx( + query_start_len_ptr, q_block_global_idx, num_seqs, BLOCK_Q, True + ) + + q_block_start_idx = tl.load(query_start_len_ptr + seq_idx) // BLOCK_Q + seq_idx + + q_block_local_idx = q_block_global_idx - q_block_start_idx + + cur_batch_in_all_start_index = tl.load(query_start_len_ptr + seq_idx) + cur_batch_in_all_stop_index = tl.load(query_start_len_ptr + seq_idx + 1) + + cur_batch_query_len = cur_batch_in_all_stop_index - cur_batch_in_all_start_index + + if q_block_local_idx * BLOCK_Q >= cur_batch_query_len: + return + + # sequence len for this 
particular sequence + seq_len = tl.load(seq_lens_ptr + seq_idx) + + # number of segments for this particular sequence + num_segments = NUM_SEGMENTS_PER_SEQ + tiles_per_segment = cdiv_fn(seq_len, num_segments * TILE_SIZE) + + if segm_idx * tiles_per_segment * TILE_SIZE >= seq_len: + return + + offs_m = tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, HEAD_SIZE_PADDED) + offs_t = tl.arange(0, TILE_SIZE) + query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv + + query_offset_0 = cur_batch_in_all_start_index + query_pos + query_offset_1 = kv_head_idx * num_queries_per_kv + offs_m % num_queries_per_kv + query_offset = ( + query_offset_0[:, None] * query_stride_0 + + query_offset_1[:, None] * query_stride_1 + + offs_d[None, :] + ) + + dim_mask = tl.where(offs_d < HEAD_SIZE, 1, 0).to(tl.int1) + query_mask_0 = tl.where(query_pos < cur_batch_query_len, 1, 0).to(tl.int1) + query_mask_1 = tl.where(query_offset_1 < num_query_heads, 1, 0).to(tl.int1) + + # Q : (BLOCK_M, HEAD_SIZE_PADDED) + Q = tl.load( + query_ptr + query_offset, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + other=0.0, + ) + + block_table_offset = seq_idx * block_table_stride + + if USE_SINKS: + if segm_idx == 0: + M = tl.load( + sink_ptr + query_offset_1, + mask=query_mask_1, + other=float("-inf"), + ).to(dtype=tl.float32) + else: + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + else: + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + + L = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32) + + # context length for this particular sequences + context_len = seq_len - cur_batch_query_len + + # alibi slope for this head + if USE_ALIBI_SLOPES: + alibi_slope = tl.load( + alibi_slopes_ptr + query_offset_1, mask=query_mask_1, other=0.0 + ) + + # query-query attention bias + if USE_QQ_BIAS: + qq_bias_row_ptrs = ( + qq_bias_ptr + query_pos[:, None] * qq_bias_stride_0 + ) # shape: [BLOCK_M] + + # 
compute the length of the longest sequence prefix spanned by any + # query token in the current q_block (q_block_local_idx) + max_seq_prefix_len = ( + context_len + + q_block_local_idx * BLOCK_Q + + (BLOCK_M - 1) // num_queries_per_kv + + 1 + ) + + # adjust for potential padding in the last q_block by considering the + # actual sequence length + max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len) + + # calculate the number of tiles that need to be processed to + # cover the longest sequence prefix (due to causal masking, tiles beyond + # this prefix can be skipped) + num_tiles = cdiv_fn(max_seq_prefix_len, TILE_SIZE) + + # iterate through tiles within current segment + for j in range( + segm_idx * tiles_per_segment, + min((segm_idx + 1) * tiles_per_segment, num_tiles), + ): + seq_offset = j * TILE_SIZE + offs_t + tile_mask = seq_offset < max_seq_prefix_len + + physical_block_idx = tl.load( + block_tables_ptr + block_table_offset + seq_offset // BLOCK_SIZE + ).to(tl.int64) + + v_offset = ( + physical_block_idx[:, None] * stride_v_cache_0 + + kv_head_idx * stride_v_cache_2 + + offs_d[None, :] * stride_v_cache_3 + + (seq_offset % BLOCK_SIZE)[:, None] * stride_v_cache_1 + ) + + k_offset = ( + physical_block_idx[None, :] * stride_k_cache_0 + + kv_head_idx * stride_k_cache_2 + + offs_d[:, None] * stride_k_cache_3 + + (seq_offset % BLOCK_SIZE)[None, :] * stride_k_cache_1 + ) + + # K : (HEAD_SIZE, TILE_SIZE) + K_load = tl.load( + key_cache_ptr + k_offset, + mask=dim_mask[:, None] & tile_mask[None, :], + other=0.0, + ) + + if K_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + K = K_load + else: + K = (K_load.to(tl.float32) * tl.load(k_scale)).to(Q.dtype) + else: + K = K_load + + # V : (TILE_SIZE, HEAD_SIZE) + V_load = tl.load( + value_cache_ptr + v_offset, + mask=dim_mask[None, :] & tile_mask[:, None], + other=0.0, + ) + + if V_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + V = V_load + else: + V = (V_load.to(tl.float32) * tl.load(v_scale)).to(Q.dtype) + else: + V = 
V_load + + seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1 + + # S : (BLOCK_M, TILE_SIZE) + S = tl.zeros(shape=(BLOCK_M, TILE_SIZE), dtype=tl.float32) + S += scale * tl.dot(Q, K) + + if USE_SOFTCAP: + S = apply_softcap(S, softcap) + + S = tl.where( + query_mask_1[:, None] & query_mask_0[:, None] & seq_mask, S, float("-inf") + ) + + if SLIDING_WINDOW > 0: + S = tl.where( + (context_len + query_pos[:, None] - seq_offset) < SLIDING_WINDOW, + S, + float("-inf"), + ) + + if USE_ALIBI_SLOPES: + S += alibi_slope[:, None] * (seq_offset - context_len) + + if USE_QQ_BIAS: + # compute key positions relative to query section + key_rel_pos = seq_offset - context_len # shape: [BLOCK_SIZE] + # load bias only for keys that correspond to queries + is_query_key = key_rel_pos >= 0 and key_rel_pos < qq_bias_stride_0 + qq_bias = tl.load( + qq_bias_row_ptrs + key_rel_pos[None, :], + mask=is_query_key[None, :], # avoid OOB for context keys + other=0.0, + ) + S += qq_bias + + # compute running maximum + # m_j : (BLOCK_M,) + m_j = tl.maximum(M, tl.max(S, axis=1)) + + # For sliding window there's a chance the max is -inf due to masking of + # the entire row. 
In this case we need to set m_j 0 to avoid NaN + m_j = tl.where(m_j > float("-inf"), m_j, 0.0) + + # P : (BLOCK_M, TILE_SIZE,) + P = tl.exp(S - m_j[:, None]) + + # l_j : (BLOCK_M,) + l_j = tl.sum(P, axis=1) + + # alpha : (BLOCK_M, ) + alpha = tl.exp(M - m_j) + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc = acc * alpha[:, None] + + # update constants + L = L * alpha + l_j + M = m_j + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc += tl.dot(P.to(V.dtype), V) + + segm_output_offset = ( + query_offset_0[:, None].to(tl.int64) + * (num_query_heads * NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + query_offset_1[:, None] * (NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + segm_idx * HEAD_SIZE_PADDED + + tl.arange(0, HEAD_SIZE_PADDED)[None, :] + ) + tl.store( + segm_output_ptr + segm_output_offset, + acc, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + ) + segm_offset = ( + query_offset_0.to(tl.int64) * (num_query_heads * NUM_SEGMENTS_PER_SEQ) + + query_offset_1 * NUM_SEGMENTS_PER_SEQ + + segm_idx + ) + tl.store(segm_max_ptr + segm_offset, M, mask=query_mask_0 & query_mask_1) + tl.store(segm_expsum_ptr + segm_offset, L, mask=query_mask_0 & query_mask_1) + + +@triton.jit +def reduce_segments( + output_ptr, # [num_tokens, num_query_heads, head_size] + segm_output_ptr, + # [num_tokens, num_query_heads, max_num_segments, head_size] + segm_max_ptr, # [num_tokens, num_query_heads, max_num_segments] + segm_expsum_ptr, # [num_tokens, num_query_heads, max_num_segments] + seq_lens_ptr, # [num_seqs] + num_seqs, # int + num_query_heads: tl.constexpr, # int + out_scale_inv, # float32 + output_stride_0: tl.int64, # int + output_stride_1: tl.int64, # int, should be equal to head_size + block_table_stride: tl.int64, # int + TILE_SIZE: tl.constexpr, # int + HEAD_SIZE: tl.constexpr, # int, must be power of 2 + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + NUM_SEGMENTS_PER_SEQ: tl.constexpr, # 
int + USE_FP8: tl.constexpr, # bool + FP8_MIN: tl.constexpr = float8_info.min, + FP8_MAX: tl.constexpr = float8_info.max, +): + query_token_idx = tl.program_id(0) + query_head_idx = tl.program_id(1) + + seq_idx = find_seq_idx( + query_start_len_ptr, query_token_idx, num_seqs, BLOCK_Q, False + ) + + # sequence len for this particular sequence + seq_len = tl.load(seq_lens_ptr + seq_idx) + + # number of segments for this particular sequence + num_segments = NUM_SEGMENTS_PER_SEQ + tiles_per_segment = cdiv_fn(seq_len, num_segments * TILE_SIZE) + + # create masks for subsequent loads + act_num_segments = cdiv_fn(seq_len, tiles_per_segment * TILE_SIZE) + segm_mask = tl.arange(0, NUM_SEGMENTS_PER_SEQ) < tl.full( + [NUM_SEGMENTS_PER_SEQ], act_num_segments, dtype=tl.int32 + ) + dim_mask = tl.where(tl.arange(0, HEAD_SIZE_PADDED) < HEAD_SIZE, 1, 0).to(tl.int1) + + # load segment maxima + segm_offset = ( + query_token_idx.to(tl.int64) * (num_query_heads * NUM_SEGMENTS_PER_SEQ) + + query_head_idx * NUM_SEGMENTS_PER_SEQ + + tl.arange(0, NUM_SEGMENTS_PER_SEQ) + ) + segm_max = tl.load(segm_max_ptr + segm_offset, mask=segm_mask, other=float("-inf")) + overall_max = tl.max(segm_max) + + # load and rescale segment exp sums + segm_expsum = tl.load(segm_expsum_ptr + segm_offset, mask=segm_mask, other=0.0) + segm_expsum = segm_expsum * tl.exp(segm_max - overall_max) + overall_expsum = tl.sum(segm_expsum) + + # load, rescale, and add segment attention outputs + segm_output_offset = ( + query_token_idx.to(tl.int64) + * (num_query_heads * NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + query_head_idx * (NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + tl.arange(0, NUM_SEGMENTS_PER_SEQ)[:, None] * HEAD_SIZE_PADDED + + tl.arange(0, HEAD_SIZE_PADDED)[None, :] + ) + segm_output = tl.load( + segm_output_ptr + segm_output_offset, + mask=segm_mask[:, None] & dim_mask[None, :], + other=0.0, + ) + segm_output *= tl.exp(segm_max - overall_max)[:, None] + acc_sum = tl.sum(segm_output, axis=0) + # safely 
divide by overall_expsum, returning 0.0 if overall_expsum is 0 + acc = tl.where(overall_expsum == 0.0, 0.0, acc_sum / overall_expsum) + + if USE_FP8: + acc = acc * tl.load(out_scale_inv) + acc = tl.clamp(acc, FP8_MIN, FP8_MAX) + + # write result + output_offset = ( + query_token_idx * output_stride_0 + + query_head_idx * output_stride_1 + + tl.arange(0, HEAD_SIZE_PADDED) + ) + tl.store(output_ptr + output_offset, acc, mask=dim_mask) + + +def unified_attention( + q, + k, + v, + out, + cu_seqlens_q, + max_seqlen_q, + seqused_k, + max_seqlen_k, + softmax_scale, + causal, + window_size, + block_table, + softcap, + q_descale, + k_descale, + v_descale, + alibi_slopes=None, + output_scale=None, + qq_bias=None, + # Optional tensor for sinks + sinks=None, +): + assert causal, "Only causal attention is supported" + assert q_descale is None, "Q scales not supported" + + if sinks is not None: + assert sinks.shape[0] == q.shape[1], "Sinks must be num_query_heads size" + + use_alibi_slopes = alibi_slopes is not None + use_qq_bias = qq_bias is not None + + block_size = v.shape[1] + num_seqs = len(seqused_k) + num_query_heads = q.shape[1] + num_kv_heads = k.shape[2] + num_queries_per_kv = num_query_heads // num_kv_heads + head_size = q.shape[2] + + BLOCK_M = ( + 16 if num_queries_per_kv <= 16 else triton.next_power_of_2(num_queries_per_kv) + ) + BLOCK_Q = BLOCK_M // num_queries_per_kv + + # Ideally we would launch with kernel with: + # \sum_i[ceil(query_len[i] / BLOCK_Q)] blocks. + # However, it is slow to realize the query_lens on cpu. + # Instead we use upper-bound: + # \sum_i[ceil(query_len[i] / BLOCK_Q)] + # <= \sum_i[floor(query_len[i] / BLOCK_Q) + 1] + # = \sum_i[floor(query_len[i] / BLOCK_Q)] + num_seqs + # <= floor(\sum_i(query_len[i]) / BLOCK_Q) + num_seqs + # = floor(q.shape[0] / BLOCK_Q) + num_seqs + total_num_q_blocks = q.shape[0] // BLOCK_Q + num_seqs + + # Assigning default tile sizes for prefill and decode. 
+ # Note: each tile size must be at least 32 for "fp8" (q.element_size() == 1) + # and at least 16 for all other data types. + TILE_SIZE_PREFILL = 32 + TILE_SIZE_DECODE = 16 if q.element_size() >= 2 else 32 + + # if batch contains a prefill + if max_seqlen_q > 1 or total_num_q_blocks * num_kv_heads > 128: + kernel_unified_attention_2d[ + ( + total_num_q_blocks, + num_kv_heads, + ) + ]( + output_ptr=out, + query_ptr=q, + key_cache_ptr=k, + value_cache_ptr=v, + sink_ptr=sinks, + block_tables_ptr=block_table, + seq_lens_ptr=seqused_k, + alibi_slopes_ptr=alibi_slopes, + qq_bias_ptr=qq_bias, + scale=softmax_scale, + k_scale=k_descale, + v_scale=v_descale, + out_scale=1 / output_scale if output_scale is not None else 1.0, + softcap=softcap, + num_query_heads=num_query_heads, + num_queries_per_kv=num_queries_per_kv, + block_table_stride=block_table.stride(0), + query_stride_0=q.stride(0), + query_stride_1=q.stride(1), + output_stride_0=out.stride(0), + output_stride_1=out.stride(1), + qq_bias_stride_0=qq_bias.stride(0) if use_qq_bias else 0, + BLOCK_SIZE=block_size, + TILE_SIZE=TILE_SIZE_PREFILL, + HEAD_SIZE=head_size, + HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + USE_ALIBI_SLOPES=use_alibi_slopes, + USE_QQ_BIAS=use_qq_bias, + USE_SOFTCAP=(softcap > 0), + USE_SINKS=(sinks is not None), + SLIDING_WINDOW=(1 + window_size[0]), + stride_k_cache_0=k.stride(0), + stride_k_cache_1=k.stride(1), + stride_k_cache_2=k.stride(2), + stride_k_cache_3=k.stride(3), + stride_v_cache_0=v.stride(0), + stride_v_cache_1=v.stride(1), + stride_v_cache_2=v.stride(2), + stride_v_cache_3=v.stride(3), + query_start_len_ptr=cu_seqlens_q, + BLOCK_Q=BLOCK_Q, + num_seqs=num_seqs, + BLOCK_M=BLOCK_M, + USE_FP8=output_scale is not None, + ) + else: + # for initial version, NUM_SEGMENTS = 16 is chosen as a default + # value that showed good performance in tests + NUM_SEGMENTS = 16 + + segm_output = torch.empty( + q.shape[0], + num_query_heads, + NUM_SEGMENTS, + triton.next_power_of_2(head_size), 
+ dtype=torch.float32, + device=q.device, + ) + segm_max = torch.empty( + q.shape[0], + num_query_heads, + NUM_SEGMENTS, + dtype=torch.float32, + device=q.device, + ) + segm_expsum = torch.empty( + q.shape[0], + num_query_heads, + NUM_SEGMENTS, + dtype=torch.float32, + device=q.device, + ) + + kernel_unified_attention_3d[(total_num_q_blocks, num_kv_heads, NUM_SEGMENTS)]( + segm_output_ptr=segm_output, + segm_max_ptr=segm_max, + segm_expsum_ptr=segm_expsum, + query_ptr=q, + key_cache_ptr=k, + value_cache_ptr=v, + sink_ptr=sinks, + block_tables_ptr=block_table, + seq_lens_ptr=seqused_k, + alibi_slopes_ptr=alibi_slopes, + qq_bias_ptr=qq_bias, + scale=softmax_scale, + k_scale=k_descale, + v_scale=v_descale, + softcap=softcap, + num_query_heads=num_query_heads, + num_queries_per_kv=num_queries_per_kv, + block_table_stride=block_table.stride(0), + query_stride_0=q.stride(0), + query_stride_1=q.stride(1), + qq_bias_stride_0=qq_bias.stride(0) if use_qq_bias else 0, + BLOCK_SIZE=block_size, + TILE_SIZE=TILE_SIZE_DECODE, + HEAD_SIZE=head_size, + HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + USE_ALIBI_SLOPES=use_alibi_slopes, + USE_QQ_BIAS=use_qq_bias, + USE_SOFTCAP=(softcap > 0), + USE_SINKS=(sinks is not None), + SLIDING_WINDOW=(1 + window_size[0]), + stride_k_cache_0=k.stride(0), + stride_k_cache_1=k.stride(1), + stride_k_cache_2=k.stride(2), + stride_k_cache_3=k.stride(3), + stride_v_cache_0=v.stride(0), + stride_v_cache_1=v.stride(1), + stride_v_cache_2=v.stride(2), + stride_v_cache_3=v.stride(3), + query_start_len_ptr=cu_seqlens_q, + BLOCK_Q=BLOCK_Q, + num_seqs=num_seqs, + BLOCK_M=BLOCK_M, + NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, + ) + reduce_segments[(q.shape[0], num_query_heads)]( + output_ptr=out, + segm_output_ptr=segm_output, + segm_max_ptr=segm_max, + segm_expsum_ptr=segm_expsum, + seq_lens_ptr=seqused_k, + num_seqs=num_seqs, + num_query_heads=num_query_heads, + out_scale_inv=1 / output_scale if output_scale is not None else 1.0, + 
output_stride_0=out.stride(0), + output_stride_1=out.stride(1), + block_table_stride=block_table.stride(0), + TILE_SIZE=TILE_SIZE_DECODE, + HEAD_SIZE=head_size, + HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + query_start_len_ptr=cu_seqlens_q, + BLOCK_Q=BLOCK_Q, + NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, + USE_FP8=output_scale is not None, + ) diff --git a/attention/ops/vit_attn_wrappers.py b/attention/ops/vit_attn_wrappers.py new file mode 100644 index 0000000..06a9f7c --- /dev/null +++ b/attention/ops/vit_attn_wrappers.py @@ -0,0 +1,178 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +This file contains ops for ViT attention to be compatible with torch.compile +as there are operations here not supported by torch.compile (for instance, +`to_list` in xformers attn, or `.item()` in flash attention) + +Using these ops and wrapping vision blocks with `torch.compile` can speed up +throughput in vision models by ~5% relative on H100, and improve token +latencies by ~7% (see qwen2_5_vl for example usage) + +To use these ops, you must have a recent version of PyTorch installed (>= 2.4.0) +""" + +import einops +import torch +import torch.nn.functional as F + +from vllm.utils.torch_utils import direct_register_custom_op + + +def xformers_attn_seqlens_wrapper( + q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, seqlens: torch.Tensor +) -> torch.Tensor: + from xformers import ops as xops + from xformers.ops.fmha.attn_bias import BlockDiagonalMask + + attn_bias = BlockDiagonalMask.from_seqlens( + q_seqlen=seqlens.tolist(), kv_seqlen=None, device=q.device + ) + context_layer = xops.memory_efficient_attention_forward( + q, k, v, attn_bias=attn_bias, p=0, scale=None + ) + context_layer = einops.rearrange(context_layer, "b s h d -> s b (h d)").contiguous() + return context_layer + + +def xformers_attn_seqlens_wrapper_fake( + q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, seqlens: torch.Tensor +) -> 
torch.Tensor: + b, s, h, d = q.shape + return torch.empty((s, b, h * d), dtype=q.dtype, device=q.device) + + +direct_register_custom_op( + op_name="xformers_attn_seqlens_wrapper", + op_func=xformers_attn_seqlens_wrapper, + fake_impl=xformers_attn_seqlens_wrapper_fake, +) + + +def vit_xformers_attn_wrapper( + q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, seqlens: torch.Tensor +) -> torch.Tensor: + return torch.ops.vllm.xformers_attn_seqlens_wrapper(q, k, v, seqlens) + + +def flash_attn_maxseqlen_wrapper( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + cu_seqlens: torch.Tensor, + max_seqlen: torch.Tensor, + batch_size: int, + is_rocm_aiter: bool, + use_upstream_fa: bool, +) -> torch.Tensor: + if is_rocm_aiter: + from aiter import flash_attn_varlen_func + else: + if use_upstream_fa: + from flash_attn import flash_attn_varlen_func + else: + from vllm.attention.utils.fa_utils import flash_attn_varlen_func + q, k, v = (einops.rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) + output = flash_attn_varlen_func( + q, + k, + v, + cu_seqlens_q=cu_seqlens, + cu_seqlens_k=cu_seqlens, + max_seqlen_q=max_seqlen.item(), + max_seqlen_k=max_seqlen.item(), + dropout_p=0.0, + causal=False, + ) + context_layer = einops.rearrange( + output, "(b s) h d -> s b (h d)", b=batch_size + ).contiguous() + return context_layer + + +def flash_attn_maxseqlen_wrapper_fake( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + cu_seqlens: torch.Tensor, + max_seqlen: torch.Tensor, + batch_size: int, + is_rocm_aiter: bool, + use_upstream_fa: bool, +) -> torch.Tensor: + b, s, h, d = q.shape + return torch.empty((s, b, h * d), dtype=q.dtype, device=q.device) + + +direct_register_custom_op( + op_name="flash_attn_maxseqlen_wrapper", + op_func=flash_attn_maxseqlen_wrapper, + fake_impl=flash_attn_maxseqlen_wrapper_fake, +) + + +def vit_flash_attn_wrapper( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + cu_seqlens: torch.Tensor, + max_seqlen: torch.Tensor, + batch_size: 
int, + is_rocm_aiter: bool, + use_upstream_fa: bool, +) -> torch.Tensor: + return torch.ops.vllm.flash_attn_maxseqlen_wrapper( + q, k, v, cu_seqlens, max_seqlen, batch_size, is_rocm_aiter, use_upstream_fa + ) + + +# TODO: Once we have a torch 2.10, we can use tensor slices +# so we won't need to wrap this in custom ops +def torch_sdpa_wrapper( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + cu_seqlens: torch.Tensor, +) -> torch.Tensor: + outputs = [] + for i in range(1, len(cu_seqlens)): + start_idx = cu_seqlens[i - 1] + end_idx = cu_seqlens[i] + q_i = q[:, start_idx:end_idx] + k_i = k[:, start_idx:end_idx] + v_i = v[:, start_idx:end_idx] + q_i, k_i, v_i = ( + einops.rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i] + ) + output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0) + output_i = einops.rearrange(output_i, "b h s d -> b s h d ") + outputs.append(output_i) + context_layer = torch.cat(outputs, dim=1) + context_layer = einops.rearrange(context_layer, "b s h d -> s b (h d)").contiguous() + return context_layer + + +def torch_sdpa_wrapper_fake( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + cu_seqlens: torch.Tensor, +) -> torch.Tensor: + b, s, h, d = q.shape + return torch.empty((s, b, h * d), dtype=q.dtype, device=q.device) + + +direct_register_custom_op( + op_name="torch_sdpa_wrapper", + op_func=torch_sdpa_wrapper, + fake_impl=torch_sdpa_wrapper_fake, +) + + +def vit_torch_sdpa_wrapper( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + cu_seqlens: torch.Tensor, +) -> torch.Tensor: + return torch.ops.vllm.torch_sdpa_wrapper(q, k, v, cu_seqlens) diff --git a/attention/selector.py b/attention/selector.py new file mode 100644 index 0000000..1a092db --- /dev/null +++ b/attention/selector.py @@ -0,0 +1,231 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import inspect +import os +from collections.abc import Generator +from contextlib import 
contextmanager +from functools import cache +from typing import cast, get_args + +import torch + +import vllm.envs as envs +from vllm.attention.backends.abstract import AttentionBackend +from vllm.attention.backends.registry import AttentionBackendEnum +from vllm.config.cache import CacheDType +from vllm.logger import init_logger +from vllm.utils import STR_BACKEND_ENV_VAR +from vllm.utils.import_utils import resolve_obj_by_qualname + +logger = init_logger(__name__) + + +def get_env_variable_attn_backend() -> AttentionBackendEnum | None: + """ + Get the backend override specified by the vLLM attention + backend environment variable, if one is specified. + + Returns: + + * AttentionBackendEnum value if an override is specified + * None otherwise + """ + backend_name = os.environ.get(STR_BACKEND_ENV_VAR) + return None if backend_name is None else AttentionBackendEnum[backend_name] + + +# Global state allows a particular choice of backend +# to be forced, overriding the logic which auto-selects +# a backend based on system & workload configuration +# (default behavior if this variable is None) +# +# THIS SELECTION TAKES PRECEDENCE OVER THE +# VLLM_ATTENTION_BACKEND ENVIRONMENT VARIABLE +forced_attn_backend: AttentionBackendEnum | None = None + + +def global_force_attn_backend(attn_backend: AttentionBackendEnum | None) -> None: + """ + Force all attention operations to use a specified backend. + + Passing `None` for the argument re-enables automatic + backend selection., + + Arguments: + + * attn_backend: backend selection (None to revert to auto) + """ + global forced_attn_backend + forced_attn_backend = attn_backend + + +def get_global_forced_attn_backend() -> AttentionBackendEnum | None: + """ + Get the currently-forced choice of attention backend, + or None if auto-selection is currently enabled. 
+ """ + return forced_attn_backend + + +def get_attn_backend( + head_size: int, + dtype: torch.dtype, + kv_cache_dtype: str | None, + block_size: int | None, + use_mla: bool = False, + has_sink: bool = False, + use_sparse: bool = False, + attn_type: str | None = None, +) -> type[AttentionBackend]: + """Selects which attention backend to use and lazily imports it.""" + + if kv_cache_dtype is not None: + valid_cache_dtypes = get_args(CacheDType) + assert kv_cache_dtype in valid_cache_dtypes, ( + f"Invalid kv_cache_dtype: {kv_cache_dtype}. " + f"Valid values are: {valid_cache_dtypes}" + ) + + return _cached_get_attn_backend( + head_size=head_size, + dtype=dtype, + kv_cache_dtype=cast(CacheDType | None, kv_cache_dtype), + block_size=block_size, + use_mla=use_mla, + has_sink=has_sink, + use_sparse=use_sparse, + attn_type=attn_type, + ) + + +@cache +def _cached_get_attn_backend( + head_size: int, + dtype: torch.dtype, + kv_cache_dtype: CacheDType | None, + block_size: int | None, + use_mla: bool = False, + has_sink: bool = False, + use_sparse: bool = False, + attn_type: str | None = None, +) -> type[AttentionBackend]: + # Check whether a particular choice of backend was + # previously forced. + # + # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND + # ENVIRONMENT VARIABLE. + selected_backend = None + backend_by_global_setting: AttentionBackendEnum | None = ( + get_global_forced_attn_backend() + ) + if backend_by_global_setting is not None: + selected_backend = backend_by_global_setting + else: + # Check the environment variable and override if specified + backend_by_env_var: str | None = envs.VLLM_ATTENTION_BACKEND + if backend_by_env_var is not None: + if backend_by_env_var.endswith("_VLLM_V1"): + logger.warning( + "The suffix '_VLLM_V1' in the environment variable " + "%s is no longer necessary as V0 backends have been " + "deprecated. 
Please remove this suffix from your " + "environment variable setting.", + STR_BACKEND_ENV_VAR, + ) + backend_by_env_var = backend_by_env_var.removesuffix("_VLLM_V1") + try: + selected_backend = AttentionBackendEnum[backend_by_env_var] + except KeyError as e: + raise ValueError( + f"Invalid attention backend: '{backend_by_env_var}'. Valid " + f"backends are: {list(AttentionBackendEnum.__members__.keys())}" + ) from e + + # get device-specific attn_backend + from vllm.platforms import current_platform + + sig = inspect.signature(current_platform.get_attn_backend_cls) + if "use_v1" in sig.parameters: + logger.warning_once( + "use_v1 parameter for get_attn_backend_cls is deprecated and will " + "be removed in v0.13.0 or v1.0.0, whichever is soonest. Please " + "remove it from your plugin code." + ) + attention_cls = current_platform.get_attn_backend_cls( + selected_backend, + head_size, + dtype, + kv_cache_dtype, + block_size, + True, # use_v1 + use_mla, + has_sink, + use_sparse, + attn_type, + ) + else: + attention_cls = current_platform.get_attn_backend_cls( + selected_backend, + head_size, + dtype, + kv_cache_dtype, + block_size, + use_mla, + has_sink, + use_sparse, + attn_type, + ) + if not attention_cls: + raise ValueError( + f"Invalid attention backend for {current_platform.device_name}" + ) + backend = resolve_obj_by_qualname(attention_cls) + + # Adjust kv cache layout if the selected backend requires a specific one + required_layout = backend.get_required_kv_cache_layout() + if required_layout is not None: + from vllm.v1.attention.backends.utils import set_kv_cache_layout + + set_kv_cache_layout(required_layout) + logger.info( + "Using %s KV cache layout for %s backend.", + required_layout, + backend.get_name(), + ) + + return backend + + +@contextmanager +def global_force_attn_backend_context_manager( + attn_backend: AttentionBackendEnum, +) -> Generator[None, None, None]: + """ + Globally force a vLLM attention backend override within a + context manager, 
reverting the global attention backend + override to its prior state upon exiting the context + manager. + + Arguments: + + * attn_backend: attention backend to force + + Returns: + + * Generator + """ + + # Save the current state of the global backend override (if any) + original_value = get_global_forced_attn_backend() + + # Globally force the new backend override + global_force_attn_backend(attn_backend) + + # Yield control back to the enclosed code block + try: + yield + finally: + # Revert the original global backend override, if any + global_force_attn_backend(original_value) + _cached_get_attn_backend.cache_clear() diff --git a/attention/utils/__init__.py b/attention/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/attention/utils/__pycache__/__init__.cpython-312.pyc b/attention/utils/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90b39e11d733f78a2e2d15511a080ec02402d04e GIT binary patch literal 165 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJVx#*YV7U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?WHa^HWN5QtgUZfrc^yaWRPTk(rT^v4|PS0s!J;D6;?n literal 0 HcmV?d00001 diff --git a/attention/utils/__pycache__/fa_utils.cpython-312.pyc b/attention/utils/__pycache__/fa_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d899ddf1aa9af4bb83f3bc79bbd11abc92fec97d GIT binary patch literal 3702 zcma(TOKcm*b%wjk->&!(X*04YYHdljH<3kHc4Es2YD==+*iclsc8r>bE>_%?z0`8a z&MsxikN~ytAwduA&25l_j!N7^kM1o&awyOX*AhTxA)^l39()cmC1U6-F=&_V@bjE`vnLAF$k&?BTG6&t9Ca~M1C>O>^3l|P|QZ?}$4>X)16uWX= z#YiqvjOL;knLqbri~H%@vfl$fzY#BXA0!*@JhER#t_$|lZ`hB({_!{LC)B7K(2{CU zJEVpl1Q42K)$l_ucNlh(vp$u7$mUWzxB&QDYS%-Q>)F|fz|N7ykQ94CMN8OaHFMsA znb%F-mW|TgyBhf%yRm$Q5Y4pZvZ2_sB~qj$a=v2OrJ`IaTN2|Qo;4Kfo~+omDbFj! 
z&`f!@V&>gc$NqiYz9-Mk=au|D&F$?t*kRC3`@F{m;)8lwTcEPt2+@91(TQfsilNWw zuAgXjg_zU%9dCP9fj>tKd|Pk(yqLidClE8s5HmisgbS=kwFk8Qg%_C+#2CZ_Q)HKY zMQ$0Q%i|LEFnEg=*D|}ruK2E_-{B>;5ZOjvGkHDS4^u?E|-X{sWQFR%%`n7Pc8RJs zt5ghIH}8sesgq%JqoO!c=Ayw2Zkt*5#BT=nrRDmH&in@_` z!zhllRI~`K(ykwDu~4qKl6Odfyqne|yQmDQt&v$JuMryeBu<$R-PLSa)#mlQCg+v1 zGNT*1z36g9rBEV-8q!VOo|^hlzB)C2^8B?P57w z*2|}Cg}@)2TBw{GJ3n_$%R+U^mKW`NB{NGck%d|aH4`q*R&3p{vLMexM#_usVfybK z_SuODG5{CREU->Pbz4FIVYANznQ9m`ncG$ArR-FzTDc)%0*J zJ>jG$*3*}(!Jg+)u^Jt&28RJQf+#Uq>z15uY3;_-?h)tA^}nCqIP>8rIJ$<#Fcb%ShKfd1ZBcZRxA9MI) zt20melWVyrr@xRlh3=Zr?+E=3AC8Y~GfY5!N$Rm1+l&tg z{7r)B@zn<#(NQ@6!_+5J_1Kkq?8vvVQ)`9w*w}V}?TWkM;Kif&entr zj&R|L;s_TT9K-nueLC#10<4=>U;%lA7O~`;zD0&$(FByCsZdWxl(rSJbaT!k5)gd^ zpGB+Z3aazb{TEUBp1J=52t4Lmjk#IA(LrVugeb-p$ZuKLF?mG)BB5i$z(7zEl+VF)# z@gIDUOE1GHesT@3$KHlWh+o0=hGTAR+$a zHizQr&wdRhf<{lgF2w45tib{L%A&>j&&S6x`U*$K6U(6ZmYAl|-T{GxKG0te%>mkp-}vm5m3c)slo>-KBfvNZpCtlV zL7RMUjURCMfpvc9DL=6xjn$+}j&x~Vny9i9Ui?b4B#!eT=#ceY>XM*?LbYYqDah(Bt6U+f>*U^#%Hl2KDkK(c*Geo9tybW7<)UUQs$wfHJ5wqdE(4vnS#)Rt@lw`{ zEzd5m8#MTSu|%c(jEn&>^_2BB04S~){}4p@@G~^<488MRDEY@V0CZ@0`_V*mgE literal 0 HcmV?d00001 diff --git a/attention/utils/__pycache__/kv_sharing_utils.cpython-312.pyc b/attention/utils/__pycache__/kv_sharing_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..17cf51f600c046a6fe437a53f4d80c29aa1fd921 GIT binary patch literal 1285 zcmZ`(-;2{o9G~gWw7YI^D@wPvG9bHb4oxbEUC%@G_ErRCTU=1jmyk`y-PklKlj&Yt zQV_W}{{RIMd_C?TxJTdbWwL#+1_eQ1e5>@$CnrfOoAYiSruqK*{FqMi&#_~9z?YeN zVl*-UdIz)k9@0U!EnmU#?a=q3&zsq1MyW5RY~n1Uby+?4yRXP6FSIG-72+_al% zmA^yI#*t&JLcHd3ohw|&!@$MA)-cPf;D4)vn{|a&owFBX3^`Ze&hsylai%^H7q$m$ zI~}YU4-BlMn|F}2uM)%BLsTVum?Bf{V}c&o1Pv<4aFAtFbfB7s{!_fehiI^_sa8j(e1-8dX9T#jteT8=Pr(9f~xY5AR}59D&6F z24`nd)JP1vMk$9kY-=XiutM6ltz%OfoDCt5%A)ZCC}BeC^!;*pFI0+^ zBh?6)6ghA3LOLJ$_6SS_uyCH8-(DdZfZFnn<(NdEM&G*?7Ll?d5#dpGu&v{U-vDz=A$|e(5-kg${e06l{8WT$6 zg$hHp!-d-0V&&OyOx*UW+oNKGEjB(ziMw9)?x=W=E#8}C!08L@!j@OP{<_-qs!f(E 
zj|;18ZNn>EdR^G`3Y(8^jI-63rNgYq<%}0ku=4d+y;l!e^ENB(c#AvJWTKFtfJ83O y~HB{LC*>*Th%I2_6RD8YlRcapw?DGpAub?EMN4LvZQ< literal 0 HcmV?d00001 diff --git a/attention/utils/__pycache__/kv_transfer_utils.cpython-312.pyc b/attention/utils/__pycache__/kv_transfer_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..18a07133e448664ba28ff2c466544d4f306fc552 GIT binary patch literal 2274 zcmZ`4O>Y}TboSG0CytXiiAw@xQPb3J5}QhuTH2~AZGg~<^iWiZP|0lUak9>O*P0zC zaby?sA=N1oDhJe72oWR>LA~%3C|54F5-IJfl`6r7n@dtr6(R9v{&bC@O< z7L$}{rk%kgIY+EQ-szv$>^;yC-A*nZ-h&`!3TDcY38n>Nrnsgj?2zFT;M*Pt>{WDC zx+;Tj9QcHC+%<_qRAOvwWf|`aYZcvYN-J~J?-5H}){Ucg-#xRz=kM%xyNPN^;r*z@ zXG8zxObZ^*HJL^{dRu=LRi&z&5g@XPC$%h=#EWP@qTL?<2qME}frh<%T3kihKJV5( zzkytR+tOGPK83uMg)71mw}h6356}{1ZsngzoU({UEuzwSjjHpSnKrPkp1-6TS`h=b znKd;}bYKFqs%0n;@G?#nsBW36l?kY@G*i_m#iomMACIegM#Uy8h;&@NK<6=8(rr8r zs<6t+BM)+PTh&z4ny~WmfGg*M32!JV9#@w%o!V-~wGFpT$`C8(ibz?eiBn*Ey!CKd zr^i)WTLc%FlC~SxUM~m_;euLqOSENt668|6Or3!1MbgxAc*XxVl%MO`%6>Ihuvw@U zF_?a^&2&)BYXq1fJkzOmA`oKrHw@o4G#L6?xPfS^e!V=C0A)4oE)QobJX%#sNJk~^ z2kuMmA}XQPz)QeW=7Hib@m@V$hPP#*ES9Bqt$R|+a!L54yCi2}8oanXo;@_l*;X&K zU$r%I=y}`9KG4w7=JcQvUy@3~?WS)WD)}x##HV92VXa7*O@xKV8DR6EmC+n`yjkGc>Dpwknm%)AtfImrvyN`~(HXm`*Ggj=phh z;V4cR`fMU!r1O@USTu}WqE%f9Xd1&#JUb9wOgvw7e9fj;A3iR6RPi89n!+oQ912y<5tG zx^iGGyQv%oVZ8&_X{~4U=3>2Pv?@0lhU?1kT3~(Zr`)D8`7qR94Q_*xDAwupv2es& z;fT92-i^0UH|;z$wBxt+Ia7l{j~&s_ZRo^SW7&?lfQz;xxe0Lsmo%e*-y*~!j{p5) zz6mNxw`-G0!hU%%pPN)i0I^PoE6OF?4u{DgFu3Z;K7edikRP%1o<2=jr#X@TNd<`k zwTnHu1>4gKdfbhq6FV~M=-mno)dNE}CpQAGRnFdr_}DkDe060ja;P3Tv=JHK=$)vX zZ^)9=b9ZQTYiPVaG`@a$W9WEI*|*b)E>KYoqVhI?rtjd?@fR)ZLJHf6w0y0ob0}%&R*>&zocD z&Y1s9Sh#a!@JyHROP2(2EI?iXvJ*;Kh5>P~0*%V>z>iF3oms*x#yHRvrb~BfT zd=P!MKw+joQ^al)Khx7JIzJ*Z1jlY1m6JIuT`=$|5(gZcfA$1y4T0mhZ5eT)2WaF0 d8v08VIsOrXt int | None: + # import here to avoid circular dependencies + from vllm.platforms import current_platform + + if current_platform.is_xpu(): + return 2 + try: + from 
vllm.vllm_flash_attn.flash_attn_interface import ( + fa_version_unsupported_reason, + is_fa_version_supported, + ) + + device_capability = current_platform.get_device_capability() + + assert device_capability is not None + + # 1. default version depending on platform + fa_version = ( + 3 if (device_capability.major == 9 and is_fa_version_supported(3)) else 2 + ) + + # 2. override if passed by environment + if envs.VLLM_FLASH_ATTN_VERSION is not None: + assert envs.VLLM_FLASH_ATTN_VERSION in [2, 3] + fa_version = envs.VLLM_FLASH_ATTN_VERSION + + # 3. fallback for unsupported combinations + if device_capability.major == 10 and fa_version == 3: + logger.warning_once( + "Cannot use FA version 3 on Blackwell platform " + "defaulting to FA version 2." + ) + fa_version = 2 + + if requires_alibi and fa_version == 3: + logger.warning_once( + "Cannot use FA version 3 with ALiBi, defaulting to FA version 2." + ) + fa_version = 2 + + if not is_fa_version_supported(fa_version): + logger.error( + "Cannot use FA version %d is not supported due to %s", + fa_version, + fa_version_unsupported_reason(fa_version), + ) + + assert is_fa_version_supported(fa_version) + return fa_version + except (ImportError, AssertionError): + return None + + +def flash_attn_supports_fp8() -> bool: + return ( + get_flash_attn_version() == 3 + and current_platform.get_device_capability().major == 9 + ) + + +def flash_attn_supports_sinks() -> bool: + return True + + +def flash_attn_supports_mla(): + from vllm.platforms import current_platform + + if current_platform.is_cuda(): + try: + from vllm.vllm_flash_attn.flash_attn_interface import ( + is_fa_version_supported, + ) + + return ( + is_fa_version_supported(3) + and current_platform.get_device_capability()[0] == 9 + ) + except (ImportError, AssertionError): + pass + return False + + +def is_flash_attn_varlen_func_available() -> bool: + return current_platform.is_cuda() or current_platform.is_xpu() diff --git a/attention/utils/kv_sharing_utils.py 
b/attention/utils/kv_sharing_utils.py new file mode 100644 index 0000000..93af5bf --- /dev/null +++ b/attention/utils/kv_sharing_utils.py @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +def validate_kv_sharing_target( + current_layer_name, target_layer_name, static_forward_context +): + error_msg = ( + f"Specified KV sharing target layer for {current_layer_name} " + f"is not valid: target layer {target_layer_name} " + ) + + if current_layer_name == target_layer_name: + raise ValueError(error_msg + "cannot be the same as the current layer.") + + if target_layer_name not in static_forward_context: + from vllm.model_executor.models.utils import extract_layer_index + + # If target layer name is not in the static fwd context, it means either + # a) the target layer does not come BEFORE the current layer, or + # b) the target layer is not an Attention layer that exists in the model + current_layer_idx = extract_layer_index(current_layer_name) + target_layer_idx = extract_layer_index(target_layer_name) + if current_layer_idx <= target_layer_idx: + raise ValueError(error_msg + "must come before the current layer.") + else: + raise ValueError(error_msg + "is not a valid Attention layer in the model.") + + # Currently KV sharing is only supported between layers of the same type + target_layer_attn_type = static_forward_context[target_layer_name].attn_type + expected = static_forward_context[current_layer_name].attn_type + if target_layer_attn_type != expected: + raise ValueError( + error_msg + f"must be the same type as the current layer ({expected})." 
+ ) diff --git a/attention/utils/kv_transfer_utils.py b/attention/utils/kv_transfer_utils.py new file mode 100644 index 0000000..210be55 --- /dev/null +++ b/attention/utils/kv_transfer_utils.py @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import inspect +from collections.abc import Callable +from functools import wraps + +from vllm.distributed.kv_transfer import ( + get_kv_transfer_group, + has_kv_transfer_group, + is_v1_kv_transfer_group, +) + + +def maybe_transfer_kv_layer(func: Callable) -> Callable: + """Decorator that handles KV layer transfer prior and after execution of + an attention layer, if enabled. Otherwise, the wrapper is a no-op. + + On entry: waits for the KV layer from the connector. + On exit: saves the KV layer to the connector. + """ + # Import at runtime to avoid circular dependency + from vllm.attention.layer import get_attention_context + + # Inspect the signature ONCE when the decorator is applied. + sig = inspect.signature(func) + param_names = list(sig.parameters.keys()) + + # Find the index of 'layer_name' parameter. 
+ try: + layer_name_index = param_names.index("layer_name") + except ValueError as e: + raise TypeError( + f"Function {func.__name__} must have a 'layer_name' parameter" + ) from e + + @wraps(func) + def wrapper(*args, **kwargs): + if not has_kv_transfer_group() or not is_v1_kv_transfer_group(): + return func(*args, **kwargs) + + layer_name: str = args[layer_name_index] + + # Extract attention context (layer-specific metadata, layer, and kv_cache) + attn_metadata, attn_layer, kv_cache = get_attention_context(layer_name) + connector = get_kv_transfer_group() + if attn_metadata is None or not connector.has_connector_metadata(): + return func(*args, **kwargs) + + # Wait for KV layer on entry + connector.wait_for_layer_load(layer_name) + + # Execute the function + result = func(*args, **kwargs) + + # Save KV cache layer on exit + connector.save_kv_layer(layer_name, kv_cache, attn_metadata) + + return result + + return wrapper diff --git a/beam_search.py b/beam_search.py new file mode 100644 index 0000000..fcd2d1f --- /dev/null +++ b/beam_search.py @@ -0,0 +1,88 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Optional + +from vllm.logprobs import Logprob +from vllm.lora.request import LoRARequest + +if TYPE_CHECKING: + from vllm.multimodal import MultiModalDataDict + + +@dataclass +class BeamSearchSequence: + """A sequence for beam search. + It keeps track of the tokens and the log probability of the sequence. + The text field is optional and will only be filled when the sequence is + about to be returned to the user. + """ + + # The tokens include the prompt. 
+ tokens: list[int] + logprobs: list[dict[int, Logprob]] + lora_request: LoRARequest | None = None + cum_logprob: float = 0.0 + text: str | None = None + finish_reason: str | None = None + stop_reason: int | str | None = None + multi_modal_data: Optional["MultiModalDataDict"] = None + mm_processor_kwargs: dict[str, Any] | None = None + + +@dataclass +class BeamSearchOutput: + """The output of beam search. + It contains the list of the best beam search sequences. + The length of the list is equal to the beam width. + """ + + sequences: list[BeamSearchSequence] + + +class BeamSearchInstance: + def __init__( + self, + prompt_tokens: list[int], + lora_request: LoRARequest | None = None, + logprobs: list[dict[int, Logprob]] | None = None, + **kwargs, + ): + self.beams: list[BeamSearchSequence] = [ + BeamSearchSequence( + tokens=prompt_tokens, + logprobs=[] if logprobs is None else list(logprobs), + lora_request=lora_request, + **kwargs, + ) + ] + self.completed: list[BeamSearchSequence] = [] + + +def get_beam_search_score( + tokens: list[int], + cumulative_logprob: float, + eos_token_id: int, + length_penalty: float = 1.0, +) -> float: + """Calculate the beam search score with length penalty. 
+ + Adapted from + + https://github.com/huggingface/transformers/blob/ccb92be23def445f2afdea94c31286f84b89eb5b/src/transformers/generation/beam_search.py#L938 + """ + seq_len = len(tokens) + if tokens[-1] == eos_token_id: + seq_len -= 1 + + return cumulative_logprob / (seq_len**length_penalty) + + +def create_sort_beams_key_function(eos_token_id: int, length_penalty: float): + def sort_beams_key(x: BeamSearchSequence) -> float: + return get_beam_search_score( + x.tokens, x.cum_logprob, eos_token_id, length_penalty + ) + + return sort_beams_key diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/__pycache__/__init__.cpython-312.pyc b/benchmarks/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9cd436f11175b344b6993b68d18b57534792db18 GIT binary patch literal 160 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJV+3T0&7U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?&M4u=4F<|$LkeT h-r}&y%}*)KNwq6t1)9eQ#Kj=SM`lJw#v*1Q3jpn!CYAsI literal 0 HcmV?d00001 diff --git a/benchmarks/__pycache__/datasets.cpython-312.pyc b/benchmarks/__pycache__/datasets.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..96dd6dbc12818a8ee57be68a8598596d3553c704 GIT binary patch literal 110697 zcmeFad3;>QeJA+(0{ZR-&^Q`r<08-?4uS_rf|o!7q(G72ElZGTsEK|60Rm|1b%P|} zZdp;h8PHZDFis+Ht*qd+lRzWQg#N^tK_&T*Tr9Q^nII!zCr~lv`EWUl+osIZ6~h%QZlCf`R}NRQxMQknx_Y>Jx@NeB-5aNBhwIR$b1E=hKU~k! zTvH9xjl+#B?w)FzZXRxCanID2>6YOZ7WYoIP6vmBEM7F#HobLtD~lITwNG~pcd&TL zROfWpa2JdFrnXIY4|f{`8@*xa)b{Be!#m_V<5bV^ZkBGG+B3X|#VV)vPWKM?8U&*t z7EInyK7sZt9(c#V?}iVGrAFbH5U&2Z5UzR0$Wz~uvko6(&ua0kPPD&clFK}0Lr>^m zcq$wa`@;43YrtP4{+jUDEFKPT5f6{K!!2L83?C88!>wXJQu@ct;o#TJYV89P!thb? 
znAj(FjT!x@Tzop*wjaOYtztFud&GXRnm<9w_OF|V2g4nCo^;~Lak)p+;V!W;yiI&Y z?4ugP-RRGD{Ov%Wc8YbAUj7dHaB@k1$Kzq+d?8A25l^BIa$Ryy+~Hk#t06WrJs6qY z;#2C#oMNeakh)hqrKX-{sl7%_ z#&93r{sMaXjChFN7%7L5@+_<82q3Bx5!FdLWxmbi7}QMMyjYw9cQUyNFCRuUS_G6kvgFpqY0KeiPS5)ypwar zMnT-!BuIcX_&VL;GUe+GZx^>uUU__8gv-?&JDwL7jjwsH3pY&93)c;DO-{L{3N_zu zmapy!`TT0WJuF{mPCh$>O;uBd>3sxiSf0H~o?s*dcqrJ~k4(O6&t4T#g?OpLcB+Q-Ei zQ3}N-A~T(VbF){kMx>Y+Rx6CnT}3x1W=5xG!(y*J5am3b^itORD6HzqWCy6s5`$zO4hl0ke zsqgTStUGim8k0h!v1u`OIT8+ufK=`~kjaLszb4MqpMAnI! zxvY6K6pc}_!RgRA6)Qh2iRYydzFqiy&N-D0Ufni%-r@#U@Th!l5bpRQ(g9)MoK$WagSE z#YQHm?Ga2IYQ@<%GpDwU`=ZlR7|Y9%sjw)a2FxwWG!+SjN8~wy!I0;QREYu37F@+A z3q`YqQxlh@NHm1}Wk;vP8xt7WK52ZG=Emue6cr^rac@o;)vq;q3luiKfz&qyQ2_EH z3>$&Y7=}%vS+s~|0>ZR;hy`IIZDwL2VkR|aW*eMk*ecpYJIiMg?Mj)jMJ`z&=W>Wn zHLXxibBSg(%}Qx=mY^-`I!DV$l!t%54^u4=I7^==ikG8R7sZ^xGMtzh6EXKjMP-$- zMX0VlWa?yPF=$wPS7TW_{~ehUXR;;YjjQ4)KKDpuHg7bZ4z2BQuzNBO_V+2$jM2L)pIrt8|&)O43R5YaAW4s~9glj^BP3I_2P{R)Er zN_b>8TCD3&nhlS=XJWJF)M# zrONiN#!_WR(&potmaVUxSlqg7P8Bz$&08}y^{*UXG%bd2+3%a{rR_?Co93SjsdR1p z-w3q%N3$<%1j1|*@jGS;V-Ojhi(q? za)qLth7HhxQqFKi*a^hK#SlM#*hRCNwcu9UDiqy34P`ti;}N|qcV*Zs7KMvYq8NEg zs5hv+n4v57u@6}>2t^Oq{mHPYnR?ARCpYk?%jW~G@?GmWc!Q8fMP|hIzy$4do3Du(tU&AqQ4|Tq0CBM6hX5G@*yMTF)Gkj? 
zj9-pn!w<|xsgL@4bRR3wiWQEYoWqL%3z(V}8LW~3)c`g|33>^KI_QHygzrYb}ElvM8+sB_g94V2B9XnV7_nluNMeg zdy;K^xA)zRtsQ$Vb?iAxuE^AN(rb?7ptLh?nxt3CxnkJg_@$MJQA_NKF*X22 zV@{qzWC^Wy`imQt+TW4eHV8AloBE;Fq~w%}(GQPOm-dUgv^pwcMaq+~DMniWyRifsSA&7cWk!{$U}}@6%#8I z7^R6%B||Z7W^p8#p>zSD!!+gDJZ70ujW{K*7{0lpnkY8eiLr@Msvh`Ilx=xx_j2pc znq%UPSkNdP#_Qn*=<9svk;p8lb8MIN2BSa`feK@j-rQ?o%#^z~kTs)M(%Z(YS&9HA z7Rs*`v}f(WR%fEI5D*5bgX%9pTjESuqR%01qoADvH+u%m6wk7@^Qd(|k|L6H1nEJu zbb#{EM3tVR;244^!9rRqe5o{{?9|+d*2@tZpAq20{9CXwnBVo-GWsILhzAn=M+mUQ zd_qm*%F$HGj%4x9g~E)ZH0h{+;PEfDeJ_09vo+%_f5Uyt{buZY{qOqT9sR+MmCJ9} zrkeJwwx#O^KJXrWP*k;a{9W_?q7FId()`hcpgzq329am}80~!9 zkS)p6_}d0HjidD4lRWLUsNDSSF)=xbJ z7+$}ONyqYE4H0Vq1S1F*NhHv9Z6XW|0oYMYu6SxrZJjqsv<)kK2}Ef)5Sa<6Dev1_3TMv`l@i*5J_FgIT$?C8aI z-XXjn&CSPg9Q7;fn$ClDbt-$X1TL&h_*!vPh1|kUTEU0yL=p$M-U}^9vPIO+@!Nvh& z4#H46M~^K)#~HGoEu>#O;zDcz{jQ|@-o8ZLWQN3(S2D&pPn&GzMsJf`pDFP7;OnlBJ64o!_KAGpFgYGhW49GR(DJZL0 zX2MC1Rhl=b>MaCya)wxBiL4{8T>^>AA zCIBtNO^l8}})WK5Fb6`vk7Kw384sN`MzG^8(V8qbTp zfj)Jlb33N=3g|2GW%1^l21mv{v_8)Ve z?ouw`HP4qnwsQG{Kc_PCmb;B@xhK=^p`>+)FF30_M$t{{N8M&2cc5+7ECphyhhLMl zvU0AlLEBWZ)yfE+vgI0tIb*)(apM)#Fa1Ew~nZ9 z5-*r6&uN6^PqKn5wJvl2MPF|{*%X=%+UomcA8R{m!JKF)#F!N2^)SrR70Yx((^p=_wJI zY(Sie&H@K!yfjw5HX@9TFf#!mS!d9bEx^V(E`m)C)_x>w0!vwqOR;gQIE> zg~Q-n5=$~`n8=#YO|a&}(I_o?W$+a$-#;CiyCmum7!}u(q` zTLh>7fwTDaXKp@|tT^)HbMIID=;D3nxs0>;Lx+3e#y1Zv?Z4+}k#j7W*D6|56|MK3 z!AxoW_l~Sszk4!O+P!czo0%v<&}yBQ_{Wt0eTz#=APe*zPV#5 z`t3d6O5AfaJt(eTHyVp-HcaLc_qt$qxz|mCv-#7Xn*`5}XaNS`N5PW*HuKwtp|_3F zUfe^LID~jsI2tUKT9F_HDQKf$AA+oF@Z88ra6bo6p6c&A!6!i%JuqL2M5d%el##?K zoV7z7AL#@IXDA@RFMXK;GX=zNTn=mjUv=E@Vo4ubjyE z{I8tEI5o9qs#-FE=0~lVbp0WI^d7g7w|liGE9zU~z4?hO~kdV~SP zpy5$rsio(kw`ARdUz8|qS@ETO9e2!k&VS#Y^7XEsPx<=RDHAHOyBESYD>rPo+vs#y z&KlNhh5F`9py^Rzou&F=N!gO`R`0q6an$51XLt03ik2?MQnsoM8{Hl@2n9YM@C{9w z+QvtYa?2^h!;-dj3q??}ymGyeMXf@0-MWoM?LtNMI?KskVPQIDtNyu*9&MPYG9$~? 
zt5n%)aX&0BTTG;io7XM4k!z(GuasiEPKwc}eC$Gu4?G+5MvQmXu1V>#h0_x=Oorzs z!k4o(*}aSI8QfyG{wUz#2xZm^gB2%^USSh#lxCfB0gd4V2f-U4Oh8HqwMi7jEl%Nv zL9zf%G016!cxt-R&$eHrGQ_~l?rrgc8-^(K3F3wLEx@l8zt}3{Hm3ldgCXQswn{*z z380xG$bl$UkjyjVd4YP- zM05n=8SK=`N0ELNZ5HCIMQPoGSPlpk4a>(;U%~bD7R_wZ0erEOf`{k+qrzxo{ zuzWC8*8K`Zr;W>#DgU-RCsP%DNpmSCyv={ldOlg*j_{6$qDRiJn~ctab-`#U;8UKM z2Vabz8dn7X=L9K=ViIA3K?7qS=vaHwFg>H7fnI}<=~sjg9Zk#6ryX5Ma~G#0e2m?o zl8qxw)go(Q>I<9tL#xJwzl|oW4?TZ3E@&bY1!a|*Nmh6buvJ4kC-v(N715>R*g?8T z81p;mD_zQOLdQu)O9ZvqHL1pw6qKRWj7z!^BiSdr;zC$Oq*0ejQsg686d~>|A%Wgc zsw9R|7YG69Yrbn%+A#>4z-F>o%caCX1e7Xq?N@0sZ-{AtSO$frR zVcXY?-xhw?IAI9e-!z`vJVwMjVV6=ys>9o;D8KR)+JO$G50Wu1%$UQ@xIncgOw>l) zq}Hwx11`!DcK@!*b$!*)D0B-^!*xKr=LG})y>^CcZ3R7{0!$b?AyB7f2@ZfBKnLk? zka`T@3cvsZcU*)W2+1-YF$|!lNTx_Df{-%_;DXgAD?D*JV=OW?6}e8(kk`g^pL%)5 zZir3Q!eH=294v*d%Z*dg_ZK*Vd}!vysF?6JdOAw!KT zbd5=;l@XKyFNQc2IDUHI7=h|(aXNyrqU-}Rqma(=y4YhT>0mvhO{ZsP0PGh6=9F1x zu&^O!6>0;|2ZfM*-XnbttjY^#Pskr<2Ncp+o!Tp%2XO()kI@SJ5i5@}Bv_w{jAD}- zg8~8KWY$cd0(x2nTn_pR z%WQ0{W6x05#s+s}R+`G15X_g;(Dt&#uF0$7_&)y4+94=Cf>kX2I)b;M!Xm%%yp}YoB8aGK)y-DPUhG) zabU9M2tJzRq(tJBWQ`LMP;!{K*#iCoS=*(aoxprBHM3UmsnNMvJPeDJ!Vvnboqf?L zEm>&_l|TUdYgCt=4-pMa*2&|%PBs`h0$CTZ0TLW@WjyKIs9-B?g~bRU$Z8d=#f!~v z`tBFjF7I9uQ;j%%vPU;FaXaK>5m`th5`7w1>DEP!U}eo$4vR<$)%wRPqATKl0?`=NAu zU%Kk>TGe2xYViG*bk*5~6Q5WFcO~$H(mQbJ%WZ>|9&yhza!BzaQq;hH3QcJR^0i>I)7-+S>>CUdzU~+qZ z$}^C(4t!|$W$d*}vrFUaf?-P!0$UG6yAI$VCQHF5_2|UHmw&-9g!i`f74#RFeq3Pb z_Y`I=B)f`iLIa3fZ^GXOv~fQl4cINC0Xwh_v+X^h<2U9*2sD1(q$6w|V8LVCAQA8E+O!R-2!Qew+hCzmtGo?b)ol=Mux*gnt|A0s8`}o$`fV`i z2pNEXFwphbcE`q1-k_MC6BA@%-En1sf7tDgH_le;|CjBKS3@gLhVYhG)R zURuD+l3qiQwXq>1crN`0?&r%GJZFhr(^q%qg7g1|3cgOkW~j{vo?|!jFYBQ;-@Hj; z!Z%S8yLU<0{y|@&)pmm49IMgqasjtAv6YvnDe@|G1xx_swa`My;7zSU#t^8N+qLth<(uX|V9S8J2? 
zPbd9@|6uS{*fM5Z7X;h5al>RJ2&+anyIE5?{4Oqvf<(z#Acffjj>*Cd?_E`OExIuYkQL8D{?U2~Y7p(zoK4 zU)dYCOp;(L=c2789~Jjxu@WdZ8Yx_pDt*ZBDOBjPgl*=iruH{UYGFCoB(V*1u79}R zfQlrpWzMAuZlHWv@`MrW#|T=8W{n#wiKZ=p+C5ORM8TrrH7~xiAznCUBGtiJSX=R} zYk>3YIW89N0AQHu;HX-dWX4dZmw8f9FNuMv#?g9NBEsgV{9N87n6`|nPwL?moCeXdoP6Dk0;uQ0#$nDk00=z1^sk ziCoxre@Tp87x5CH52NDfY;5A1TmvXTu-K?`dB;=XHK`*;r1DX~Z1DzDTWDuWmaxLv zeqPb8;OfsW^z7NQtv!IS`(mKuVBm$8A1{N5nu@lx2U;$}IHHA;S>Imh?%9Q`dw1+@ z5A592vne~}j$Wpa72&k0Oa;<-LGlBwI!L=E z;@R)vX9#*}W)c8O7jOrTAMG2^$7G~{G)^~GXv~NZ^hC4vYak3SjZpS%K`g>7D2y{# znPfH3|?-N#b(#}?h7G)OS?a@tXwY}kI!u^kByV8T(Gb_J4+JMOu5WIV+SU(3`r zycNG4&-h9+zM3^(Ys%M}@dw{Ma>tzpJ>owEVjs3W$#j3*R7kgSHflzrlyVmipcbw^|d@R-nBWMWWDmJ)Pn?0!vzpolX-JwOsN zKF5I?F}FxrmXk?coVPDSby*A&xC%W+S+2kZ9Wu1&fINiZuS`nJg%MzhLDB&D3$Q#k zF%C&|7_2eqVh37@kq0&LDWpJCn3aK69hi)=x?cwvmW=ZRC@-CT``D?$fSifLRlU!1984zj$(+wn`5w0sp$d~ znMu0=@)_9xW&~I#_GprpD{NF64`z_WKKJHVqVGstMYaPmlcv8`*2_nNSr9&Y1-D!n zr$sR6AbEKN6D~Z0!;Yt8Gge6<~#2Bm5-BR(@ #*4lbgZM|vVKFyfs8j2oe8!X=$bCe!n3`ts;jMgB#GFYjysSoa>GQpS( z4oo^i@M!xT)ejmmP}oPJ;n#m6;HmVEIhTS}zaxL>ghRtsp?s!({Dp4ID{&V|8WZj~%>J$bv`M9!gb1u32V@N@2b6g^+^&UW4s_u;+2W2M0LC(UOkT)o2ZDFPd009DX|!753ws=8L!gS@6D;-6eBGZ zo>TQJKy&9^+dDECQhEsq7?wudh8fk585PBd)M%ptI?)K)OkUc2JXiDcC^NO;`1jR^X6iMZ>nWia251Br! 
z;-MfA8UwR~vknM_M0Zuhj<72j`Cw+@zgi#+`(k3C0P}>Rx+sm2J=07G*5N80&t8*i zLtjS4FrIVHUe>Rh5;7@FG%lEsSrg_FUJ&eMydX_1_7jRRPH(n^Y)zDkVfzIO zs3@BmSr6V{nPqI!5c-rY;23Sz5|X6Q95c$4uFwOxHHpc(veE=4z}|)VIT3k(RKBTn z98bC89^*PUP-atfc3KiCRtQsO!n(62c(G?Z;KqY57I~D4@GUM+Tmoy0 z)Rbr8unA76qB1A{jxMd&#j29K@>gyl)!r&4Z)&8ga5kVg}C z#}){&WX5Nrv|$H6@8am|C1joVmTb{aGb=L*S=8)kGbQv3x#f=bdHnp!8 z78JpZ)k?~zRw$qPg!XM~TaTo+9?3W>mbNa>-Dz8$0H?m)p0c(s*cYE!E{8m}tZuC| zm?{k}*fZ75Yt_3_(Dy<|fO-nYmbZOgy{ei#gt%}7LG2STK)=N%~iiVlyYuQIs+M}kCb;Gcxy6M^)R`A(7JWynN`EvL+RFinYwL1 z*m38jKk7-o7022xYtR1{GTto_^@4TkeMH zaHgjFPT#$nUAKBxcQ5pl`t#7up>?AP94f0&-*S6-7glr6$_qEf8HVl z4lupqzYt2E-sNVCC%+}?E11OCV&Jon;c0uwM@GJFCLdLk-)tssgxti0YH< zU|r{gancMm&3D_CHFpeDKY?x;fdD^)-O5}QPE1o?hW$uyqXB3|{5gK4cPSXB;Ga^e zOo1}EaTf_P?a3{Er3Xlvuh_hJHMrZ2wHqBo7O*fY1aIY1-#1PMU{~0zE3MWUFW5we7HX=B`FIHNLefQejOeuLN;#GGv|qUt5;l>lHeD&ZmRpI% zncl-=ZP|6D^0mcF<+kO}#vyKp+95taT?xA`-Jwf&fV8!-1yiK^5(RN9k;icx)CYC; zLUDWC@mOAK+|KfXxYp;5se-n=;x!w!OZ6#aHOK;CG7s6DPvw%zhAl+G#0~GNZXgp- zcl^RNpyfO@Kppan)&O>bB`+DT?~yz1jJx7)CX{o`a|sSDc?C2mNES5ZR5SA@!-;=} z7nc4Ng85nA>u2;pVU0zu zD<~bkyJnn#bOpVm2h#sc&HOJE{56882LXLW-No^!3?9taKMsnOF_dxXK7z^r7;ull zu-R^Okh?@!ksP1hVtS}S$< zSF-y^DsUw2?_Y3#=&N|+_^snhFW)J;^TPM*QgwZ4-{A$@gHk_3cy!67hk^MZdHB}w zvVA3VXY1RMRN!E`^bnHED&Ls6HK9)~JG9_{RXJ>8DwofsOIs+DA6VB*>9XL0yDd_jPsd{vozx@WB2Dpfoc!vb0G|-yQD~4tc%ko-KfL(E`vc1h?yZt;2Bw-^?6xEQhZ}Od?a!1)S zhmN~yAOOrX9rA22z ziQgH#SD-wrZ01qYVhQgT+9c1>WS}Q0@i<*>Xj%f$s$x^MMHJCj<5W zfT#1#xj;QX8-oD*Zy`Sb`%a;>G2<#(6mNdz%}Z}h+@655joR%g*Y+Qbu6Cw-22;*K zAlaq%zqWgCZuw>_C?=@-76_GXD}yQjE=FxuHG>x2``wpQmAe)^A3C=ZJz3Y0so$1q zY=yryk{baOy4=tjw(VIcNP1gVYCkR(s%n$9yV8|CpshWni(gxDCp|q$YY(?eilKMG zid;j?tw_xlD{kgapB*?l_}s{efuU@Pl6JOl=-9x>*}n6Gr?N##;^5Hf3+Gurwd|=2 z=jAjmUn@e*{EN0z>;?sE6m(EfPr)S$7$d0=K2aG9iD;Uvg(<8^w5dw;5wb2>`Zfv^ zwUH6%p!`duG7G7v`2PFHl$C;ifz37gHiEzVy70*8fK$wBp|UDzF3VKbB+ccSifR~g z!6znC{?=@<>|ZbAhK`xa_Ds#TjK7T{{>n$C!v+h8e3(*F1fPdYDOr?TJ+de_dt_0k zP+GC>Vo|qHR<-V7Q7>#a*Na%Rm@<^GsE<-v10c*`2}%~28-DiahS6=YKdh*H%YNIw 
zZb2L^qg)}ynAPa$P5*|C?gCwwZLm>Yx8N6<&}1RStW>v+Vs4nW_6-l^xMbLB*^T;8qL5-%p{i~vdONsoqdQ(F z#rXSD%uOwNDCPw*zg|Q!(C{VeB^2`s;I7cil+s-pwI5wPvQ)B=NZD#WDMuooo%HQS z@uyx2L9p>_NP0{`#KROqOi&0hlY`O8Ji8cDRYF9q6;i-(!I%Y3KcPqhB@Q^gh3ip= ztVCji5(%V`qDyqso;&6ZyUBGZQ=;&K#`k9Zo1g)p{aGln&0GN(oS$XzlRsA#C}h4o z+dmZao@~{bhv?`iEIVh$6{U&8vKk(MGvhXL<%bSP8-Tw@v17qQGV>}m2BKj?;*UQ= z_D;+;>)Wd?{@(QH%d@ESLB_xHt-KkLTx7J zIiD-kevh8~kb)l}$U0!jDY4NNrBPWPh;%`4;m?7;jo$>TV3XWhmFWT_DwM2ooYp{+ zFW^QXzGn<^3-F`7nzHatW(Lu0%8~Ut62N^l3=S9*Q`dP1;Ds(on+v0U@STi|G=_Yc z$D+pL0;I}FXF8gYBg-+^A}2gE=vUlLJ>vaJc=GoNIGs7}d8|)f^oiupX5I&V2>Ekf zPF3764i-ziIPTWHi)-_{Fl$J{VsSC7uO#71ltSWJrZFQlUu=+ep27Q7EpAeiARoSRp4geD%GlNca<#kWW`K8E|R5ET7&lWW@e4)vyEtrE_NR<`Pxf z8<74iT(@A}(1o|ariiyFgVeShY$BvVW?PN9Qxx}cj9Swoh*vAUCl(K8lP^&dugcdC z&81DeWZV$1hJ?3Pmhj@8AQP_TV-K0G%}h>MYN_RTjb?)ywm`Cra=FYd%7=H*xxHci z#vE7sh8*t}U;oj2!M#2A%KO)$LzLr6`?f6De(I^r zl+-QP-z#YZKcep+yOPdb$&y_pn(9kAYZgv`i?LL6b7)}@a;TdpA67T7*izNqcTS|L zpISKiu(4&WabK!&9~WgM?u1tTXs-$F1s1G}^@}eMhNp_MaN{akbGD|OtxV9=n5n8J zSyN>r$(IhS6v0+8UEU4})4E4+1vf}uMwG(-1?OTU*|HB|_m`XsrPTph`+E-$}y z`%1d9jq2Tp_`!5}8>(HZza^q-Cb_ceC0E6dORkolT^>nR@A|;mvtCYusY;=;8hk3q z%wAe*U$On9XglO)KFAUwlY0I;HA!FB!oVZopLO7Z71uBCPZf1R$Tj;%z4yEa9+ozd z1fvposG^umg0bq>rGtMmn{FTYli5|%%DFp5?>xIYn`$3ORSjh7+TV?)Xwjy(Ka=iy zCRO)LrZp%F&Q=G%|8jcA(Nx`0o%GBj6z^CM2+jZ#T|G%yRoQu`=w9XaTLr6ys?@4B zX|4J6=Z!+gQFso)%>7rT&SPe;#OSV~d~_E_wg~f;{xb!CPeBm{T@)~Iz$hk0J9SVR z0~VJkCeWHUAkdRRCQ78>_h_nr6Zse!WVTrU)@iUb0#}Eln3}e=nuDpDgBkypHUFNJ ze-G%LOm*N<(Q^h%EhBj-f=Nas4~ue=heho~X*gIEWJUS9i$ysNz@o&QTW3`P_m_#A zPfF<#$N#8z1P;`zR*r4_2KxLMqR&neAp6iajj}#?0rbJ?UqRUVrV0AswqaX1$jAW) z$N@WawH-_s+%7sNE7Vrh2z0vL(2RDe`8>#P7u`6Kz>9Pbw5`3+q{ct4NsVVkIP$3& z&x)Z-?%w=VEaBe7sfPSdP}YaCK4@;|)(c&3u}nQwiZsp3!{z8rg;)~yqrM{af%z)` z%yxOLy4+9h2_tHtOOBQ*piS-wSL5%=@3grOcG1824dmKtP+MiV7Nk_w8bRto^+t{0BopZRiy+s=960}{&UtswZC7{S*Jg0C%G&Nr2ln~sQqR`+ZTkU%?aN}B$> z<}X~O|5}*|6g0reb3`=AwY#|2^62LMO6G^2!yhd5!=OO*WhATlbNw_nei*_RxQg4%5C3^<_sXkzHRb9E}HPbt;CNYWpiLBRS(PIeW|=odj)b4l67&VY1a 
z8(e~jSIH!e+Xn1IS~n?rMXpB!+f=6Rd9((tDW)%omm8avz#x?ZPzQ%*7IXMV3{$8q z5nC0yS{=->{X}dI&I+gDD3MpRJ;;xWqE7wnmD^C%rz2`87iojRY137xdTB$)9WY&* z9lZiYDsJ}xSI*o7Sb0y`axKt$S)?;8FhmpK*yVPn5XjE8!i4+Tul*(|h+PU@nxMuy z1789k9STs~3J+t?28-u@k=`D|!92Q9uk_C#CS*^UIj=f412Dgs07IR5WPvqUMBf;5 zW<^HK*Dy>}AB__Z0m2uS+(Kpc_svw{`An z547&yPIBGtWFyEEcjhGau&mt)xnCWD^Vi9-F`$k58m;=zcmS)HJ_X4}*r$}U$>nqM z_Q+;5rw*U%>OvG85|Y3KJ`w<6e$fWV#di}BHC=^;mF%aK1VdQr*fs%BW&${2FBA|j3*aj{^bpgB29QKl_i7Ny($awW~Bo>k3D)j&`4LbqO7y+P+ zVhbc+2|s(~O~CPvF<s?3Lf8g)atL#m@0gz{dyh}gf$z6vMj zu>%3ADcF{x`_Y()(=7lUunmx05Is}@i>$;&)Au7x2pGs_En#vZ5iHPYEifU6Wfr^N z9*L%K)JemJ*s{nbiw$QqO5c?_$w=L7g{nxaek|@4XMWogP?gRn zmDrE|Mh8$kgjQa`U-Opjwr#DVEmhG5x;NwWkPz#_QeVN{dCVLm0*hVw90$B>Q5S>72;bTH^t{3azX9|4a3MTUyOg@fwiH@L=pkKj z9!K7p^9T{ZUN+~`ZGv7CnGAvTJ7#>>q4vQZx92nbSD1TIhi3E>cD{odKo=Ou7BDEm zONW@O{@7eQts#aP7o6efgb)5EGXFdc$h;9^|ac4erA*E!(6$YzY zbIxagTe+{fCJdu2RiA!Nc+ACInQ}1bX4i9bM#c5q{JVLe6W_V?km)g~bcB!`+K}aK zPM?{>DPE*q7IiZ;5K!Rc>769Ghlac+5T^|jjy!NQ8+VdGOe%6l4P=SC&Qie$<97~8 zY%^=51>Hmo+6*5t*FGXbci!BI_|c#-TM)&WZ#YB2gENKY9xOzu;Ecc@}8)#5DKAjE#&YOJmHeeZBr9=(KqQXh28i773bvAm71fVJ$?~PL<H(%F^= zP6DzPw=R~f6}6;_T2{)EMcd#r!0lZyedwxNzP391fotG>f2N}TE%$Br!XUUV)pew4 zp7HvZx2%k=ig%w~9RVw>8gSQG?Or%R65haCNo%U4^?pg)gR+KX^rgSyI-_rr9Km=|AT_> z6nxjZ==ublH!4+Hy>u~Ivh|a#g8Qf;KAl1kc_58oVKs{h7UaQ7iJ2kVaBy>|C< zvgQ=@*OJcDd@FWsq8>MmJBilVxRV!by$CgIfZ*vf&3_*kO5uzFLciIi`E|qoW%w+$ zc?iXfubW0q6Y@z2IE!0Iz+oxQ!DHr@QNwFa?DnQ_@skh??3n?3rPh{CKUqV0KfCwJ zTmk0s#e3&~NL1+^dK5R$BL)3(L3n3Mm=bW?%8VLW{Q9PN&@w-!ZVqP|DI9o=OvYA1 zM6{ZZwi}f;W|NtJNqHX9WZq0A69C*q4$-&$4Efm=yiKnB>}{jGchQW)&A-D>)^JTf zvAGcK@RMe-+U1c=6EBZU?Iz9aax}mV@!Y#owL6!J7G7HH{`FZzKb0JqmTPbj*n3prEgbTNPB|6kIQo}0T#-)EuZ5{L=AH^u{)FL}@A@Uj?^kHBet^M()XFP3Jt_CDrRg;& zl+|v(U%NBy-gV!xizy4^q^ILa=kbSayVu$dq}mSLZ##6qVldTq=sQiz^~-1OcHe#J zN4u7amilh{l5K|;T&aq|pL(iEXV-g9Q`2Q8Ig2N`o0+t)E$!X9V95k~)`I&}!TtAx z2k(0hrGf{)Cqv0iut}v9eu1KlSjHgdUYgr782!**QLnT75c1Iv*hv^K zorr1LV-);Wfe7b5*Uehk%vx&J*;5F4eN*JLgcdEVMOu~Aviv@9_6#_}FSsSh-OCE+ 
zKTsh8M`m@5U&e(2atlbo@DvtRv$Jz3i4&*!Ozl$cSau@K>F{i|2zSD#JI8o9;0=5d$U4vGNtJ-mRc%8k}i7Z{@Ha0XN|8(#pt<}!)4VO4Q z+*pAzg=`oT>1hO@ng=OPE?#j0I)d*|te;}~P4`(O=wVaVt5Ui0+?sEF(n8D4`8%}2 zs6o)dWrEYY=Gc;QY*~g|rk>T7dyc*bB>~t>t$90A-j0=-)ibHiLw9SFr_ZGN&ZgPY z@u?c_?&399Ys%HS0vAquR*&6t9nlxg_+W39@-;8Nd?%D@*|XY-<2!nv!P+Y-#<3by z3y#$QVO>%~JNnDXqt7O5zLYM0F6n%ZZ|P;k^_KbC#O0Jd=wvIN#PQCS#tq>KIe?QpiORwP4?|4b_6v8S*byRdYR22!h&l z%>Z6xS1d2 zbeRwsBGbtWql%npr8Hgs-!u}LPKP?7-=8@`uUlR(kWnM_Lmewi+;$D}-q#$^AS(a@ z;z4bu1dXBzh!Iu+^dGqc_Z{h0fhLP9;F6N6A3-VyX9WW2rHmWV_@VrjDnk8 zrq!i4j#Ju&ff_+ac7^S_nlSTnFMv|2312C1O%7Ni*dH@fLn`KqlZ@3inW=sDw{}ro zil!KtC%7ZjZ_KRfI^pOn8SN`bSjd~j{4o{CeGxOS94bT4Qq=6dwmxZPtB8;$(weuO zRKy$e0piTF(`aY77;3X%q&F~MJ*0|XHj5>2z*Ka8&rpOoyKbHKDU zN-Pkxz8Tw_ue)#+hOS^!Oiy>PvK=H7!}bF+5UQF}UBEzv`L2^Fju%8$at8@PB04?J zSQ}(93*VB&a8(|n&n8Hr=DW`+a!C}G?J#o3aH3EhkI zWDEg(BTUS2O{_EY-~)Q_h=ThRkX>5P$w@x)WGf9*@B#&d4oc@JUkzE@qDN!dHq+MCLICGhzPIPjj(gr7hE^U=7eAA9K0`)nyEi3(LC~<~Yf1T9e&P$> z>|f|xc2n}f-+_B)2;&J(F^CsJi6)6Ss{i>cZU zw*fG0tF8lg{k0e=;U_D%-RVp2Ii1>hI#qcl?KulnfMo-d$61}M?Y!scB>cR%FH_P8 z>$K$~$<{r|mc3*xw{Ih#wH!3os_`EuyTVS+Ex2mfJzDgy+braV-eR+V`txH3q4s&8 z=tjJ2loaiMSXptj#QdW|=h2FSANec@C88%Ka?ZF3TcF*x5q}$jGI5VGE z?8>bw6-<&j|qEh!Kjfj381se@P#}(zIGgA*ob|x4Etgs?TY!(b7k+s z(TWaqfL-yzeEYqUQ|VjWas^;((IAnYH36`kYf{}mwFkPL)cS>XQX-hx1$KQN93;0- z+8H(wO61r6Ru6`R_IqKRIkqHiciMF6eegC&y#cd*O2Jn_tD~k-BkAYFZJbRoGmqYs zvTkh(l->ZRgRUmSQwkui$lnjuqZ{^UkZizXsvzBjr6jfVTD@6_5letwO#k^a-#-r< zsywEud*u^!0DDtKJ2(#2gkIJt?P|6WG^~>P6;{+al0H@-$IClqO44GRs+cf;Qbl89iOou^SWR}2`5J{%`#_Q>f6Xv%%=U2 zohL);GfeDC`x)CkWi$a81(CPFDAYTz;QzQv^xB6lAd%>aFJe;kVkL_r)0MTOhum%NChWUH%yKW z5@MM3<(t|JQv|*zcL+Q%tRS43TD)l`ocW`SxFCI4ZN|tW7yrE0p;HQm6^K6Do)iyPL8 zgQ?p``|s# zV8$C*u1I^^RzfLn7k)G4I0a}6UZ+18z` zXj|#NSJ466-J0EC*!ap*5W%(VU2AzN)$-K+7Lv%_FFEqSSNX=!t)WcCvG*&0e3q7x zP_6P|b8xM>FV)Ie*U$)5TV2@>n_rK{R#ym6ud~mT?*c#;D;2#jZIH3q6FIS;Cc=$;g9eiE;WmjhLmzZx?ztWMOx6s|H} 
z%S`$z(Pm?Ai$Fk&<0Wu1p}J;;`_pD6m!<<0+oGmnkK$LXO^Iz~TgwMI)vQ(ZWFwq0c4kmg5g-}V<$6eBacy*!yWoaA9x#HFFdNprDjJ8I0#T#JY zSFhfAH;+#k2h!>s(^e_%6NvrCJJbQ zFsH1VB_cCS3|4cG3SrD`?nQ&9e%1%KpsKsd5$?=G;;gbiq?)o6{0|gdp@7(B&~Owl z&5m==9SbY`5p;9HXX zO1^&_I1|`dus+LmzFAfQH%^-K&9#3Z=bI1f8rSOfr0VtnUu6GlGTugvQox-AkIjvv z;DMa}*7-LZ-;6G`-ug<)+XT+x)_q|16&3%qt#`={#I)2ed-(7Cz_&Nk){FDXAnmgh zRJs?AKZ1;W3vj}g4x9y3-@I0TJXL=@v8kPEfr61b8_>?EqTo!uhWvLQe4WM7(6+6e&poIk<$ZbM^2wTb@KH2Y_0O} zaNoIsk-_r=C(p6RX9vz)7&v!68&IAPT{uZac@DkDni?gC{B-2-g(J@loF6%IYUt?T zF|C1tp}xZ>21ZU^IB|aPWysXjFE_7{#ctz@E7j1mYDqR6 zfMM#oiJqdi1(96EER8OnOt~AD2a@h!%GS1Fp{MI@WtQg+4=ZYxMwfT2n3kqf747R5 zO2c}x7cG`7jxJWDY?T{@EN$DM0e*D?jq4VQAWLKGdLfHih1!N?cmZE-z5SJS8%rX$ zK1&0XxN>BRyx91t@ukbRYgc?L;+-R_CGZSzGSzwb?m)70 zFcmzW^c+vwp4o7*QqLOqS)MgKtgKnLPy~Ibs$DN+QR@>Mw6W)QeIFex34Oqkw@ap_ zbIXlOu2k`sl&xjM#U33tDlUy$)+&2bmA!c^ek+wnor9Sz{h7Ao=#Hx`(=za={)WL) z#jJoSf@HD+W>Ibh%%a@!5sPw1KrG4~AF(KRe8i&M@ezyq^zTv1lFEeoruA|bt)P$M zXVFRw2+qPep^%w56!L)mY#z4Q1npngi}RjfEr2tptZ*HKqnd%&+Hl8Xb~VON zqq4#R*#<{K_OW8-Ldd}^N}VvcDav}z!H7-7S#a`6O!L2HCrn_FG;5|IQstUAN+!jU z6z5&@W68;hFqdxPobYp}hWL5g#O_nnTDWQs$2`CQg?ol58MQ%kgv`E3oerGDC^J37 z$#BqV=j=gdRJ9rNCbGkUzYSpfU%;vg1d&XrAiH`ggg1EZva2DJ&ehNv+YNG53Chz9&Dl{2rL6k3LEbOYCg#h>lKHlN)|RK zXxY3*v7ANd#@cwfaE zm##C)$UtvIl?>r@+8I8E1IYF>t+#}w>3IB|0rHD@iP z@;kSg&ZLzv@6Vm`)XE6;`7WXO?;+1CLdM})b5y4s)k#MalTr_*i%%t;r#^HQEjFzc zH>QdkSH?eZ^{m!E;N}X;_1|_b^uIO;#eO$AajUF-%l}{dfv=a^f9h|5(SWCRxjE%& z!I6TNJCW43qv`fz?|0v8Kk;)=sKq$p@DXKQFu!I4TF)J`F7~hano_=|`_|?Mp0dSQ zvZnKn2v_d#W;o@xC$%O#L1ea9OI|i-XUBZmoiGE-W4`8 zKBNQOE+11Bq)(o6fpcwyu-vL}u2Ig8J>AZaF_LbwrOi~wnVbz;^lTQ>e1^?P0zaCy z2^3u$D4~&pCJKm&sWEThJW2yDn3<~YYC5s!SLxma5X1Fkze6P%SLHax>M3AkiG*0N zJeRBTnd^)Dkt*cq15T;NAwbe^;Kf*Pvvy4bnDsY{L+D5hOGmMmrHhf2cgyliDQ|G4CFR|oc`OC;ozl96qX3!J zfwijLsjA&@va~V>pjN#{w#Nd+U)i|allHf*`FE!LI~R`sRn3;=`EVirfYg`*)w%bbgt`4dAhDUS=7B)i9mY_@!6vNuRP5e zC{Xzir~HTS_PpPl_CLE=ka3pK*&E7fBEd{+Po};jqi_hKe*|X7J~%WgD*o7HgUfE5 
zMB-iSzWMYwj;ureSXQ@w+923mYt}6(>lW0t_~m5d&O6hbO#_{&@&z|XzGF1+3WkP; zjxwp#(YIZB832cnO`?4-*Xxl+17D)25N*!Y~k%E6l-~DyuW8|+1zK-lhi+{uF zwrp8%1(lEqG=aI2X=%^YH)d)ZAC>I2lrsVczcPJ8G4h(B@h%qF6B~mUXI`e4M&JDMySC1@&+gol`G)N5GjGNXj|))*Lo{ z-7svPGr|zXH}Wj$UqT6>pCdPY(s*?KDTFvO0NNiRsP6-vhcOV-(?mXnR=`0~IQg1L z0)`IJssy$#I{+Elq|ZegmkECx%d9gxT`u5<`E;D1FHHGy0q;|{J<{fa>6iQ@VcGP= z1dLXB0uPw7c2`Bb;8#lX8({&T#Yv5tVIRi?=iseiDC;C#h%h~RM~-w6dN%*NM?y2C z?n(!Lw@7&R$aIT39?bT)qeCv*L8H03_Z3N$>$a!G}arB0y2p+mbAb67?`iDWYu0iX9pvK#39`$^~gj2OKngysnFG2vNE3xI)wFfPD6;tqU5CZCO~kg-vPu`}Vps6cY4 z8Q1t38tDP#0;DR;;u!-;V}#BG_{C*ItuT`e0fm8qc(vZ-0sh2adoXaZA>=0X$#coZ zp|#neo7NyGj^(v8=jTG#n&->9rk%ISYS>-leA$E3PAnH#wA%t3{&UV`^Dc1xpX<5U zvyhipc#sGhzQCo>LV>BU{--7jb`O)GrPG*nH1LHGLw(SnA_kq%Lctme)>7~d`lNP> zt*4-ag5RQN6ga7|gatigc*}h7uGwvoHW`-U6{N_*@d(#qGx~23TZ(B7uRroKgt95{ zHf2s@)}zGS3yx*f^spfOdVvN6(9dC2?R7+KDytHfSna=Uj5uC*%0LLi^7QI-Mcfh3 z>)v~|tx!{wL9HoL^m?((I9#$0AL=MsK^>(?SGIz5 zp>IUiM>?{~)X%X&lVf9KQ&tZBFWaoNB0rI@+lnokFWVZ~mQ|*Hjt4Y3wnuiXAjeKk zyVpk^TtSXqnmY0#oh!((Ta%+JvS$T3_F}zuuVB6IQ~D74Sh{_Des?pf;Eu? 
zS?$sj*n;Mcx(kZj;Z|!>gXr3a_E2QSakjuyL3>I6C@)WWFlMjrV8rP7cBL2LqKe@Q=!T4-3&V z&>|+NHe)AIoPpyY#GZ^d_7UO&X%`YEm!NfOFMA7XS*O9jfR4VV(fJ$;$`lNq93RFv z3BQ9+i965}<3Tdi6&^(ekSvDVSz$67U$bYNDa8?0HLDWnq4DHIB8FrxeCg&1-$q%y zKuxa@@(_RYFwvROGHR&YHEp~<ESa$ zl2{3nAs#Y+7VqlEvj9^er`FL+$eQz)(&PcbhhaL>5V5iq^Pv#Y55)^HjfP_+VH%X^ z-|=00_yhD85yYu%jI=z;61|BR3wuZclo^I&6T^cLorgg;qz=VP`7BcMaJ(oMFAfIz zn;@q%k+W@Pf+@aRo&Nk4K7hGrCIteCHP%WvBXP#aOevHIvMUM2D`4O-Y6XU}9_>L|q#+s{>kyHlyVBnE`854+?@cs+#<>;mR6CKqSwSCWR}rESlFC*`=pTO*ThE6Yp`}ab z+Y4*@n>nRRzF6-){<)=>%j6tZz4Pf`oSMjCv5JO zoZZWhS(D@pa$%tWlELg~YoIrLZX4{cu8U_C8$PEN`f)U%F>@ z09&X0`8@3>qzSnu(A*BiCzc=9kR5q8k* z6(+felUfRL90?vUYuIRiSl)cOb`&$9gYVL;8mKc-R>`XJlZ&cI&4kPs`WlNvI zPZKbHX*GR<70ld}>VA68>t(*4d0G#W+8yKX#JiZwCa#A^%Uo)V-0fKDbf~c7h3x{E zXAxpTLmGqN9fD{=3~U>hfTgGqKMYI7_;N`j5ReEAs&WInq|JsJ$k`MqyprvfoGN68 zN!ki_O3hMLJ#v>`fLkOdPC0d$lwR6p;`PehhDJY$kpv<6IvQhY{n8m4e_=PCFQ@;f zIdIuZ;B^383DbNc{S_Q*NcD*qNo>3Dd&`H~}F~GiPAa5Ir zmsepZ-nf!v;c-BFM>OkIpoPZV>OPUPcA2FD%?B}~+PtNDCeN99MCLv+dV;_XR)=iz zd`)wEP-E{Lx6ItRuC;UaY1B{Z>baK@bFMGRR~a-xWgF%zwq4FAgtq^lzThPutr}QE;(=Y$A&n9KB6(_DML<I1V$l9amL7_crDvL}?AnTHS8DnH|*AlITb?r6QwZr73 zN|z@fOTs~09*_sqULk{rd9XWVk_RJUR=Znnwh&Zh$tGwZ*(En@BXQys;SD`ch_G6O zmeDn31_dQc!Yr$1Mfp#uIiPlhT8ddEi{w26Gs}y0@?#!VG=qGvy#TqiuW2ku zX+$)XrwOA*(G8AYNDf18f7YA)CEpZi-wc^Zj^f!rhERu5#{luKNQQ>3eL@3zDT|jx z{7qs;-Juc`%_C3{F~y}Gg;+Y80c%(hE$0LC+*hTUb>Xx6FCLuz)awnYs?ZxINfD`{$`~(}ZJ|nL0bnmD5U&gc_MQX2OZz!fat}1`Zw)-! 
z8aS%;JkNyZAilz^?XWLqvM4lhGq$}r0vJ0Js?@QGj1x{N0Mv{OM8U)sm^?j`rW;20 za*;bGR8JE4p!@s@o@Pvdnz;(K33)OmvgAVp#TcQS4{B37A>RG->CB%I6Y{T73#9Kh zZl3N+S5&^#bfxLHS}y0q24V+%VqG%bDuyunD;+nBn{N5*UTOJ(zx7s4U8*J|)`aF_ z^EDkaCdl7a5Qnt{E>&OId(*e(7MzfEy2LiYMjOT-mv(tAf9|z(`EBur-C4#q zM+w*9at1&x!T~Z{5Q?vRcuesTLsAEmQ3e@l4{<(Sg53IUsKv$=z}UdjAaW1D>ZG_% zM8$#5liCgZf1_s0(JVt?WhLf&o9TG`=$V zdjGc@z8#x;LTq~AyALOuc8m30cj(>SMmVv*6n@cn(^2zQF}#-j`u3!=_Lidr&x&t4 zs^2LoPnE0^OIFQ>=S!NWyJ=1gy%M=uycQOal6BqlrTdcJeU$T&`QpZ;v+~fAAyU3X~1Yn6okTWqrRdTBX zOKWmPN+uHSY2O6soJ2+BBHB5q2D5Zg49l!T7?7j>ya3?j|!S0?N4SE=1`J5P58hd{i%pMLaG*;pzPiis!m z7z7xA{hY+!O(b?%QfCC{HH;htVTkmRVe&T8$!@zajhbMUrJv0tlKa8D1G1u(+F=qx zd|i!hH_{^JOiWU ztOZ?!JSfmjD2^R~zXKX4^Ohub7*VXEv414I0ux9f_VMIIgdJB@XI9=Ffx{TRGtjG{ z&Xhg_6A45JS=l!hXlrBlaz#sYp=Lg5MvOoGNL>g&5KcN3!@`M4QY_MVl_b$A*d^4w z95?|A%Nc$=GsQv@q!p$p_!1?y5X&qke466<4wZcPUF&KGxI=t-NsDRZ@Gu1;I4*_RckT_2-Qui>BWHk8+-%GQZx>yqug^JRTW zvoCG-rOb7rxh`$3ld~M5EcHrnmBn~PHBTG3pawo0v}aacvQ2JRa2**Gkd2cKS)_M*4Cr5w^kDV7V> zO|XJb$K2BnfbN&E?v_SKo1lg_Y9Y_$-eZ*d&x$W8TGW4q3ucQeM3WB1IFY1cagwyV ze~B?D!^~PR8(rnDbm4sQS9Q1{=$*CJf;#Si=3HcmIO9z}0P(Qo50{MpIAw4?5CBOE)*fD2LcJwasx|pvQ>YYwjC5n)`M=8QDrNL}%qW(t`K3i*7 zrD`^dHJj7cwdns^Xo8cvucazwZWhhWX=^iIS~SgTC}RuH=+;lt%O;Rcnl-GLA}VIz zQfU@d^;2pi!-ukb1JD^xq_R8D!Xncb4PP{BeV1GWRZ}Lw#0nyL7Y$@Mjb9fHdUmB? 
z(WV&(+=xnELpq{USpH<=;(tW6}Mn0k@+)hC8iiYT30 z9LiV3>aw2dh`?PmMyyD$2-;<(*Xt{rY>i6lE90qvnn#YsiZ2>Z)ysG4Z>-=T1i58` zE_~hG&-nj5zSZ%OzEyu-IBDE2nT2OS!ZxFPojEblDEGms)pAVzg>m{SPOT>k>t0)ymFpmcm!OTmhvG5XY3(2+KXWQ~EH64XIgtnAY4QA6lcN&}q%*muSs~580Y~6x+_1_~O>Q_>*`YlV;zvV;Le~)|^4JE~*(J+qv=@PBk_90tyk9=s2l0v67XP0OV zabDPUpX=rx`Oq3Ag-&bMEzue>EVfK*gr*54MQ*7mahH@?^Mj|mHMdeU;&~BF!Lk3k zL@V~lIqtJh?vYP$PbevJTR--VCEC*cA=`3~d_u{Dk^*hv?i1C&lL=sm;nu(GB6aMuh)}c)0*)X3J;e}&*qyA&%%r#{Jp`jyv&Y-iSR>m z`H@2X9)jzUqJ$M;@z*rGWT?X&mUF7VLKDi+gQ$EsS^bd`{kM4M(6mFbJW70RaJHA& zbx;&PnPS_*!Cgy++SNWEg{ z4{3|Ed3iii{vurbykh){k;8=3hH2waUc~>5Rf<(8u}URYrNpY0Sd9{^ea0FIyjU=0 zvn>UU*D+|>j~3~TeEm~K_*YbCnWA3Cq!w(*g*lpgG4{9gkbjBfs0MTQEKI6Qd|b}a zXb@`PfJZIAQNKj3I&Q-LS+dz+f7*W5c-A1X&sXEKul?WyY6Fk=#EdE{O|G>e$h%xE(?iMRA;m_m}gzFTe0yAaYHLqlI z5ngS{c`sPhYJu?wj*{3n$?dg~a6hcX6Myq$_^D`)d^>Tvas+^n3Rlq_m`{rZD#PDxC3-VAOZ^S=_D=4*&A@ePCn9^8+BspMk2Cg zA!G32o@r!&>Ml2wcO~>h@`ShWHF2jjB;m;jq+1Jj?`J+|g3PEnm&W7*eA=&p z)Bt!jO`36SVp5#)6IN^!_E1xcHI`$%~7 zcqF_tUQGt>IhQP^5o?iN7h>3ARPjGuFuc+Nqk^gGkXRjJ)-T#ce>?0yqyihnzy^55 zX>h-9HF!!c4zXlpULFQL(v`KT%8g>>#&rGaRQ(pQeoMNhK2@_$tXT)zpr_=HA>ZR( zu-O~9;`uMll%dI?H)(#}Q|46@TVtC2+`_kVwyD#8t_~+8K7Iwuid(``4gfYUg#UyW znPP?EqcZGHrwl(vohAyK0W|y_ zVr9oYhQCf{_-UQ99pg_C`33J7KZoJR-XQH?Y;O?$8#)_0XzZM3gR7X`YO!WKyGip; zHC(vl7^}14Lk(vW4dJPvgt*1iQr)LIASvdIq+P6;Gd9hfxsH+q6;S&=-LiS~=M;Mq0fYmY4cwFTKyQ;M zl=!?7I~NB=#w$Hiv^6qvB6T0h6iJEj0~=FJw}n-$q-7c$15AU18uFp$XFJW$XOJ7` z&5NVUZ8x<`Za3{>WqZ1!CRNcUR={R6HX!x{R;H>t#462BL}}KE73J56SH ztxl zbYpX>v0H5HPPYV8Er-RH!)$476Kl5Vod15d*j%*O3hQ+intE(i>n@U!$hXS18lbGA0#M`GX>p{Tkxn=tURWfR@R(-6~-MD8NiH_qn%vbkDHEF_f%)5`Y_X7J)($)9buc<9&S%= z$yu1GS}aqh0ZVk=YMNBDIIp&mdS$E7%tzc+2WX-v(1kS7Te_E<=xcN*Izc7dp_9PC z6BZ$iX_r{NOMRkDeFb4IWYgV7)7`a(Pj}F9zC{b2$mYW96cGJU_&tjKJ_Ub30o}q% zDkC)Q*P|viGflo&3kl}li>xHkkYI@7;jzl&lE_lfDj~mKA?eJr6+1JnL_E2W^%d6 z_{M-tAkCCWmR_>yrkNs*c(Yt}S^vX_4<9;mbnl*lJ)K88`{D88aOcte{lZV^J4s+N zMzphxjbcnbJLBl>edJ*|zwf{psA-df6I~#e3i4BT?@>ktlJBe8F!hon!!C_gdHHp* 
z-a|(x`Pbl!F5~Zm=J8(89S;jIY`_d!%CGF5{YT`uXSZbeKz>l7NQq_ksz1Txd*6gp#KSHow0!du8wq;W-h~9{tq@|(0`gS zv_2F)zn9r`bO`?^O$$b#E{I5<>*f|)@`i2oay285_qpvCx6kaj;b=}*H@@`AE1y)UIm-F0U{y*4 zM4#rM4wP4DsA;`purc?MPZvln8ZI#Ss#C>lZWOPX8%~9KZ-jcGfaP+hy}b~ZbGehn zz379QRrggpe5NcSF=}f+F4x{$p!P>(-d2yQO3J2fx1HXUvs!dk->mCM)jcHQ&-qZ= zQ~X@wV&Z0Pd#d(;h(FJPcdA0uz3Hk3SbgtB*G%vEN}rPTmC97z9ua@eJyOmWKb@-G zE#l9!`<<#L%Gj)z@n+q|R9&x#KWDE}XGf~`ArXI`hu*1LC)N3tUbV8CRNX!if6je! zWlyJS_lo%Q?0u(d&Egz4>vpE<9ue{9d_<{bd#d)Rh(FKKcdE92NiF_n-G)?MkBC2K zkCJ$Os`j9WKhME;s@gP(SMuTLF-_}l)@@7G9TD;8Jfal8B~^P^#GmKzJ5?JOxBr!j zR9%;dKWCTx#qm^ar-(mK=Q~xal^kjK;BwW7u9{ccQuPN!{J9R?s#+_(f428FL6O_U zz&1v}rU-mT`P~-}KrWozchNL>r8-sDE#lAFEmt;`s@)^v&$H*9suroRFRMMak{p;X z(zN+<-#c}UP&o{sPj`v6yKdI*zH2eVyH)M(8B@A>ZK}EJW^>n!6OIB(>TWl8k-=;@ zpxXT&CAwqEuZO3s8hFYou1*)V&K*q!cZ$KCaKwt0w9B~m_SzlEH9HZ15F)$C(QQPQ zV)ud}AJ${N2B$yeSbf6*OVeI5Gn=e>Ky*Hk_EgQ9zu|n@DSBG(T2U|rOA8^uYNC85 zK<+3()1rff9^Px-sGh+&IMnD~Zrx8G#1 zbl)wt`w`d>_}pdgyETOs?z@f78u#4_SH1h)fxIV-mG)`R!qa&++acp^PjSi<5Iup} zhi-VmeL?YgVJRQqnReRpzQSF(D~e9_)FJCe@6|7XEONj_kn zXKw~7j+NTqD&Kyr%8;qDeZpkQtnwe*Y0B)hAfCk#k;V4zJdKlbk((Xwv|-2?;cj-k zi{0$l43Wb7ajnUWWS%T=R7jlqo{wBlp6uplH%TA_{=crK7q-5U@S5mez1OO z#p!_<-$(MUlfz?5@p7|%VSa#w&k=+~j1x)6m-GFY&$W1I7AUTG{`oSH8sQnrM}mff zh$t8Ib+~MJ6~_F2o%@_Hz#R_q@eAe~Ak%!o-N zMQ)5(U=Ur0-OFS^Ome*uMg9nK$Hr40`MLV5N;*9O$yLSBgs!Z}DaF=>njeOXi6W76 z$)(C)Y&1N(&TJTSXiPj5$nW4TsZ7qUM|VYU=+Ir|+%RZSE$4H{)dpjt#oqadMCj<|kXG~NQbsX)I3~^v?8RBKx%~hNcT_)PXJ4nB@ zGa`6N0Kf{DL4aHVv@(B1DaR_&v1<0@O-I`;f8|S6SE{Zyrq*|f>$~3EaMOQyn!M0? 
z%Vz*CwTt-kv@@mo-ucqLq_+=hb(O196+y8g_``}e$?k&J_q^@0E#(b}@aSIrjmnoR zzh3jL*+VvEyb(T-Y!7v1RDTk2fXkhyccn!^n}TA8Hs+1kvyEnJrSb;DAcAw zqUp&Ot!H8|-XYaOtw;&xs(3?%sbYzJ{pxim&k9P*ly{3e0^JvNkZ$zG8X1+jAMO*%R;aPHua zSYFIKVI)^!uUqAnJ>^zQ&+&t#BkCg8KIKU`^mZ`SG)=+{FXa{(P?iA`_;HeR$IOz@LZplu&VyZ$#W3sladFZ_=>>{81QB9S^L771O|2 z^aQio)(QzT$$lB^D+Iz%g@;GTQmSmJbsL;Lv&~1?gNlW{2#$7Rk18R3UCGHG#Lr|O z!&YXlkYwBbH>8BSvJ;eRJAzCJHAg95mY`uSoP{AuV2dIJsOb~YOfj?gnoU{~evNhx z+F){h3VB!JQz&~;aK}PkN!eZLC1IQzVkTx2|L6wX$B3Uzdy)3y`_gf|iTkqmC0^#Y zXj3ww#TN{Bs|>4K=6t{Jf7So>PZC|DKJ6%*X}{s9Vx&uw^_lJ=S7i@fe2DOejkmp3 zkPZU&;51a#B%!ri7MS&Y!~e4Xhh=Lo?wjtMo`CnDWO2j1r*Y1C!?WQnZwdSgk<;Jl zZg>Flt%j4dlz)@x2f?H9YN=SVB}v|XLvZ5*ecFo6aEn|~bSeL?(@;?juRc%BSBFy7 zTg2)u^Zu=H4CJgzIU7W0gT^t>`YfltHcMgAPfZ1AV6_2`PFpozc&cVblC3+gP07xZ zy@8~)mhV{;=&P0ja>FC};SpY>0W~6s;oq=}cPRnM3{w6q_2Ad=f_c)h!jq1VdD02M zlTME@%au+;v*b;uWW6Q8yy@T<+c$X&rI_kXhwfBwI&`Oc)1f==O^0G$_?BKMq*xL8 z)hVW!kNoOT9nhSHUmZVU{M)F}S%`-WA2DN1ix7`|hIp8mf6sg%Am(FmL=(*)%45$h zcwWGKbXu{R3xxlO-ihC~)9lN^85Bu48Ex1WI5{zKCbq4u?F7#2$>SkdylXod1zqyQ z>2qzAJjiIKtgtURRSWv0qsic;WSZlE;##9oK)hRQc|dH ze=fp((3f%YMe>GG21!&i7bKG4h_*Nxsw5{r5Kk2T9;I;-y(MEh18+R+!Ji-tcgFlN ziY*ZG zLFz2p1XaM4ZEFC)BSv{!>*qd=AtBGUsN*!6i+3k_9QUeS%mEg!75z(KDfXs!Jid!( zFFp+?th4zDlGY~85pL7x`DM|7>!UXuw4g-#bWojWh3XcS2 z>NWr$)svX+u1$68TqFUvWX%2VjoLO{M=N*F`QJl}_WBKHPJ5t2X=zKb(BSnz{ zoZZ%_IckAFZcqjy)}j1}9Rxt`d7B7=HV_2u5l_?+@#5DKbq;wWg&+*N*a`2(312MO z^DrVe*)iBZ33l(NvF&GFFCF_UT}dy#Jx$U zT$@DKCT>si_e(q7rtiBgon-|a-{AC;>(sJ8qkyRjWN1>Vq!KMiNV_iEd3R#mmKs&4 zZ`vjA=a6Kc!a{~)WyOuN9!Uj0&tZvm$^k_;WhO_AuvqI@Y_V1a%`{Gf(WWUlgq{L< z10)VJ5ih3Pm=)T#qv>=JafA5>lxE(PM=g=H)bkp?pDw+Y%I~JUa3AeuBp;7TssVW9 zg(9gf%Yv>#Vxb&yfP@KW9PL#~DpcJrt7eoFCcU%(k7(}Tfg3sxV9ygM{RZ z*r!U=Z%Zf+p+OKrDZ(|?a`34lUaB_KFF#S3D57t8-J!IkG~vQGJ5MR(8?_X;vMwbO zj{L2pq&QWEF)EYBh)7U+V^p3qMt=1>*ckcc5|6EYbj<`9fa8 z4`Pq=5;M`i84~!V)lJzmB5vlIdb!>o+hp&%7!7SwoAb zFKg4MEvf_mch)0NOc{fBkZsCXu2wQ&T$p9R_*EmLEn&0yTNIMX=YBG>_M3l?7xAX- 
z{e}ZO8oe#q9C4Qn|8R4}CA+@qnH~D-;LPghPk+-R*&;3}GsZU_9v(a$fJ39v@Ys3j z#Kj#fHiB0+rr1%0L3Z^>-e%WQ&xDKEb^$pd>m{nRfwB*_WMS zG{^HK`^Jz@>!zH-BL|FkDr@zAZJ5|O6Ze95l)qFsN{as4y%U5|=Ik1|oFvO~scN4mP< zszdZ`PZsZ(wx)}#QpM}U;&o}KZ*I@Ejj+JwYlqKopMBa2bf>K1O3Up2`HEnwVzXGW zdA@WD@;iJp4*15!8499eu`lhby|iOC4~+D}MpZxoXi8;Z)<5?Nv2M$?ePW<%zGBa` z=T>FSi<7gbu09~H-g&)ATy(?jCI;Nesi+qxI z^IPu1OHIEzGV}NicgwAcz)PVkAqhvC8A!H02q^&21uNat4*?bPH(r{WJ@So5Uw(A% z@oNu>!F}_q_9q(;BufrXLp-#8?kTZ&Q?iJxTh}(HYPXBE+hMV8Zo{=z`1ZgKe0u=W z998&uqG2YRHqTdYNmcI_t9Q@)yFkSB_-7uw;Q_~`?0NTP_bp%DTOAO?X-gh{B;629 zw{A)|Z1{HUTF1A}i470lwOL@w+5|`Ap!ya1E`92<^^)mslL0Q;+1JdwTNav`L-C)z z_o&fO*<)l_>vR6@3bVk-njT%8=mT0P|1SmqOo56b5eY~*M?otE_rRC{R2UeFP(hRI z3UUy}WCg|q;$=m3(p;J@uS}XtfSV!?D{>Y;`q%FMX#rw)+-BfYK0{?N?O&6wXeF7m zI_`OV3;cs~8Y#uB@G8DwqZrXg7aSBL+USCdVs1nI>IDzQxR-H?aWCT(<6g!Q<9Hf1 zco5B0E?FV$4jn6tFf6x548!t_0k~E^u&KOg0X#S`{EF}{hI@G;R^U_g)`ovm9Dp#e zExQzRAm#*~g`82RA2kiRBd#n=%PYK#B^v*sofdiyEmhVhRPYx)35>4(NA6qxB6fZZ zTRAz8lQwddYa;pNA%16onQ?~nYIKCPt{$q5!435cZ#y#{gEBnK1Hkz58R*nUWlOAN zF_|3tgjkke4q1tExg+;*Ia39{N?`UbAuU1RDgEF19Ry2(+86GLKw|VOWI+y} z6L1z!vxVE=qZqkth7);CH?*AIV=yRBHRu&6#QB`&ma(vLi(A;3#XU!|hlxAL)M8CU zMax(jBYiOT6g#h>2~S8S zvDvC*b>Do^p``QBJ1)<2pSbvm8DGjzQnd4~wbOanzAqh_DY*QJ*?nTkTCu2gy5M$^ zf2Ju}-yv3PxcVuvVvAU`6%T!t)8;SR_!9plD#|j3s1T&{vBEHfTAvilViyc%ZN@O6 zPjatQ3WlV{Bk5q_9+y(Zm*uw(9nRSx4=#wls4 z!4KX<^e9G};Upt82?}5wcm#&RVU&b{X@oy>;u#Z`_!v4<_&q$6rU%DiDN)wk$P`QL z)B&0fl5jS8QOHt}4(TNN3eB1^lt)z6vKKpMCSE#!<@^gfZ#Y`g#bwWLy}UJ5+#nV= z%#K|5r8++*c7BZbj>ShV?zydfoi3_TWuCz!gS2uZxQssc^d1}{Uf*+{zWC{sr%v?L z&3hV>)&@R_o}iCCLIIOvP|gGFNVG|3;o)r@S@!Tts2cs6!^C_E3Y=N|a z^b~xBuVHm-5buO7VP|Wf?ErE*NF*o|Pf1~-bV>>*+MsQv)EJ<$W{nJ$3n-t?w1IR1 z?RM-^5qpplJQ67ZNGD_o>T7A1^@;0USvQt@8k_%kKjYLR$gzvcg=}2$#{>AQ9Em z(*_C}DOgQG69SlI7{kODhUo!=gC3{Y?^5tQ1qUgpqkv$emZDq~L}$J*#DhcrZ2gQ= z?mwg8Jc5jMWE^+^1R+>08RMypjbSGP!>}Pv?TisgU@`2kf%!{wml8)K{|&94pCB{# zT%V!11Yj_cu9`qIzxsI6(=_kdJ#D^KSb8b(!@|btPI9oaXSVRlzS+(z2Vd|^AGif> 
z!qM5fE04~8?8>nhY5}&ERi?^1#Ig<;0+~AxNVj|gAl-8J^nUE=6%8qWQ1l0<4+84l zvg6ugH#hH_FW)b2-gmWbrfw#DW!3Dqd4DjudEfMbWcmJdNsZ+FV|cz~9e`?Q@pF9_ z`(&t1&{}L&+4>PEmYYpjaN+z z2K-zz-=Uv{fGi=vfL%|(QpN-W@GAkm6jKEQ=uQ<3pgRa()GoLv20P+a3seF?Ub}mG zRJ2vSUyNJ6i%|my(JbYn9gD6z(+ITC?!#c!95dw^qGtJ!IA#IwfblBtzak_QAyXeG`2Ltif)tH}+Z6nSf~{!cAk7T< zqK%Vn*Iz;;hn`;mHoTeG@DTuA!^T9u#EJ)dUL)=b7VUt#J(`>(42xX+N?6VrV-W8m zC}2R{qz!r-p4`bYGoMkd+ffc=DpVFe`zrdNjDx{a+-?*9x{py}E8l@wZoMwpiA2Kp z5X5M!5AaJXJzTk1ol60V|B@y(ZF&C$rbCg+bl5RFlCy9TWp9UUQ=?G>ga2I+qg`pz)y|z4PA?XgP@bP#2ym{f!LLg{RtaR-e*W=Uh05 zr4qEGq@noI955;D?^cwJBQN*T6DCGV8JW~1$B~Nxq)S`hdCPfAOMbkuhy1J4JmV=*0y#Z;JU7AQ%hpt*~dNxVgC8_cjvAkt&A_e+O+k?sSmih8s(|f*r0EdfrGuy#h zOx@VZ86Cr|OUF1(pG+G8-?i7#Ev-EvL;>5x)y>{O+kz2+R;Y`%9=88nr_Y;0wtxD( z;Vtu)AMfn^lPFBR&-Bk0U-|f)d9MFzGyGgNKOj~=FmLX{9$XnnuObzfcU>Nf|8}W= z!Gd4x;N_JI1r);$4zdNs>;{~skt_BE2i-XhMJ1OeE>|p2J`gQH4k<+J{d$|F)5te& zYE;(7?LoWfy3>c?c|+8245CHHuv;B7MeIU`shM!2NyxB%O6LB!065mxAb*syim z@e6VU2oO%v0Xc8R_UlJ=@n&hjInxvr${yx`JQCpORPD zlocR24yP$QL~9I_;AADN0IdHWAgfagHp}TXBDp_EI37dTfO>1Iea?wDsqj&e_V zrrbF~w>mq|lsAWirI)5q(;^i>F48N)*c}A%gUkNPfcklmP~9<@p7!7#C8K>mI{AoAM>h0&%(s zJj%HsHtds21h3Dgrx2kLDRw7{fQa}2-Ya~d9PqwiF?gSVa027(*q{9eCU8vnPR<2n zC=VQ?e0g`rIcx)HVO`AKA$g`djR>-jn~0H0l|LI@2;jfp@G2zjF4Ek!QE&v46U?1I zmA4RS>-;ackGFPd7K*%gE@J6OruaUUa>4LzI3c`ND{ek8-*NEG@Qsc`d+v%Pr^g1;_Do##Px*Nf_?(If}hNK@SCVkgK?hAmH}Zl3WBv zZ>bUtk#Wf@o9jaCLF+gt!yP#f6OzM&l5)4O7jH6V+A}zs^_$fA=P8(>pn(c=ldt0u zkR2hLO(qU^;UN?6kpPtJ9}TKgv2+*zU(}%=M-@PKN)7(%muy#Tsq$tKe=UdR%MVYx z(v{W3B5-@AO=*9_>_pPPcG~?DU)}Vc#S+WKWiuYJ7`nabIyfkNxooaXtZToz;hOQA zo3CvaJGw4`5%6Pw{p_ZB|C*G4gXrHdeekX7)wA*W>h-DW?PB%z={`cB7~*uRIxt%^ zU%hU+@2$04uhd+NPah<2ho@d|xLF*!y6w904wPy@_ouz( z>8-o2?|Gy4#p>CKxqUBB-SlsKGlF_Nt8lWAw&iZozx(>8H-T6^0)a1G(dboe9Epm3 zYl~R>z+FP3?lL5*$l&r`YQMPuV%Ne0hLU=CX|cId*43gFN~GC>($F+f1f^d!_elcM zuR3*)V|Egs#dr#|Rq%Qq%DK+8xpV3)m5< z8TvcoF?35VGD4`xGX`jm7)X1KIHp`Wm2pYE;F1;q4Y4vbq<|qNTpraf$)h@en2>V1 z^L#-|p-_mbtV5d|C|0B 
z?fq=(eT4Q3gd=_AW{yxZOIb6o8Lu_HVZYw~h6|0{LXF(Q8`+WUISnu2!57>IoW zwNr)mY;rw}b7NkN+CBTfFodpJEm4j^do&lJBb_r#pYV48^E3I}$QfS-gcE_(|If%P z?54%Rcr7RBwuo+jop29E6eckoDBM#+<7!_+rJE5b?y zIz?eBxtY4Ug@QE{tfhcW2{s4rA1k55hn+wX8iCKCILJ2TcrmD8>;Kig5&mVjMw1jKeI{EdAq{_>X$WF&)Ok z#$y(RKImqGN<; zeT$Qi4k%wTy-J8l#@fz@N20;8kuz#pU^{K;cZS#(spmIsBau8k z7CQkQNXKY)dUQgWyU#B1sCrU*FO02hS~Pn@TE^(8k{MdX`gBM>Onk|1cZI1MuEg>A(i|vP3$9x z>x0%ly%xMo)ony|7YwN5*vmW8cGtTmW25`F!~NWri(ACvU32YokG;A*wRVTNcE`=N z56&0wns@Aa&yv@;n^UzvJZ@@7gMaRjz+bEw68PpXJ0u{Vvamd%z%;+4Fn~!12KcM{ zgaOPP1`uAvbY%3<`#Aewraa4@{lc$dw%y~jF48U$eoX&6S|DyLDkgC8FMUjdUO4aH z2ukBG&k4N*$xuC^?~7zeM-n@iu28>!8ly^wvk8Z@ogdD(t!2>9tbm7UlbN5-zHG`N z4pb&~K}Rf?HKy1{IGCwPSqHNWvEMn0=II^GVB9H(GW0j062+t_7PIXLUsjK1uXHq< zG4D%JX}Ap2j-9Mo)@bkVn1;Kn@bEbpM93<`u2U@ZT1f#U!i8v|;~E=2F-FRmu-(E1 zh)1BG8Hom9s+QbpL51_c1o^s(4aX*8tU#D-Yj5Asy1rv8bYQswU2p?}Oq(Le?ZgJ) zmW#Lqu~sH%JP3tZ0h%pP1f|CTavwH4HaNkG)7BP<4MxYp!tl5xc}!ZK@lyS-(?5}N z%&~06;0f*VJPkZPW>^UDAI$m?41xKT#m(k@$R$zKn&< zo_LZMBSBwGRBw$fKq0+)rL#ma?^Da&$P-HaZbxL_vO_Jc>mK9fJuhy8v1B!G=yYKa?|tjuN>_zK4Vjb5@1#VZudsR!s!68&dD=mTj8mmo?bhx78ym=SRQ(6 zWMni5t(Q~L!HG7yX*)hLe!OjKWb2mj@r}{Jw&=Mqaj9c%Q1>G}L8$r-LAGUVFxnOk zqrtE*(9qMqA=uNth1DTc(m_+K^T{9al!^cMIAszmK4pgGQ-%{AmpxfbI9Zt1caGlh zztqWMl}`!H$@27lo-DRL4ZVF2b_L1M98;C$x~;RcbG(dT`5n3>+be*?qiVq68vF4* z_U3n1W@fGF@)NV|V#%ssGFxj}5dz`2sk8Lx+~F zm!-LZ1!jjy7ROML!~(N`1y+s9<5$=gBYSty--I=r+i)xzoE)AU#GV$86BN#O-HoTo zi?1M?n%Ne3DiVeTXdE{S0h-j!9@HhX-J-F}$YD4Qix z%2XOf@C9MP^?g!J^?*B=V-`RnBjab6z#jxUD;8@GoyQzvZk9DNvg%6E!wSBEa)dbw zUZtR(3TQwOTYFKiJPb{Q86 z?WNsD9BCCF6|uOjD5;oql$)s|9j90hhC)XHI}eV~Sp6DO5X~ze7)m(=0vU!LQ-T5C0$d8 zzjupwO6MegrE`*E>N!bwcEU~^6yy9TigA7v#k__x;3*Ut;A;wQkYH8Mq zDN_jUbfeYTq_C5qUVI%A?EJKhIG|2skqBY3DiJ?5nf&^YGhzo{%6{Gi3%(VoCVq$= zSrWplQ6Bc7e@=oGB^T1=D8-`Vku+qqj2GC&n=ivZ-YFz}I;o*n6n0ZxRMd z!@aOO6>JY})Op%9e)NM1o;Ex@Iv!Nr-Uo3XU3AE1||fuGcFBrd&Cc zAaid0boOaZS(kF^cr;Jl%2RK`21$0WB+0JOn;^-qP}pRV118CCr1MjfWVZ?438K5P z3umi0Vvbl|S0uh6&;Er(!97ko2lAmm6J`xHN)i5GXPut>xiw;C^5@oPzq912SAbK` 
zoUq6zCFoKGIQ4)%V2(nW8nY3mt4A}|7>wYP=<`1s@WsQ1DaMf_`n3R@jI%Siu^hWb zih+@q_S5?|3O+}{K?>?9Xr(|v0B~^!#h;+ylN7LnMV)i%NkVpLgwN52LShR`3Tq3W z$J-cfRT`=jk7{U7MH*ja(sA-fv_*OOaRMvFWGgyH_L{6%9{B13J^AapRkOn{*IsSB z8W-2^{_iPlo%UAD^j+KV=HyRJc};HYhlTzn1gRx$)a3-J+YKc(E2Tw2t>7tiYBzj) zHq$t@P_TvqCbqyN7SscR4KS0VUh-_9t&r`9uTjU-1dxOk-m_}iAKbnQ5uO#`~Zr~BK zppYpUmw?g%bcHYh|8xTUv}ReYpq-XZqB~ze0BlE+VT7y!&P#@Km=NQ8xtT=f`K2fO zfSt)C@2XRlgjwHqg|bKIV;{?pm|?%JAW?wx2TlP;>k2ud z=djw?yntP`K{81seYNl%WQq}if%=kN>_~&f68>ca+9rG#iQlKKHG&L4kyabrzDxT> zSM>#0FDSaS`Qo7qd(*CB>Cr7$dCFBQx@u?h=PdKC(1mVnuCO{*d8HB@lX?FG0M=ni zEZMLz*{~^DwHclr)1_-K9J*avb+uM>bSBN6Y=<30Wy&SfLYU$HY#3V70nLkAAm_!6 z0Ym8?vR!6Kb6LPThHCzhHB!JNx~wgBp%R}H?`A`D56D?}IZM1>4lu`yL1O~%HUc)A zFbs-0^_vC;QNWKu(#(}F1=@XwLA(0Ut?q8NEC+QeKame%TLrRY!nU-R=3LprwghWy z^eT0AzF$K-=>o+4S&3F;ow5Nw&R4m!Q?`3TpwLgH-CKn}8H6o8mFqY~SbWYUzgv_u z%lyl7IRRxWV!o=#GlP1~uStC**8-F91b?s#+*aw^IYt3U`?GLkV`Q*!nVd@fO%N;$ zb2D<85-coUGt-p<2^5A^LDIDF(#@8P2Z-4FM5_2&xh?xqaeQHcid zJwdk&HdaAkdoBp9Oh0Y}vQ*l(u%SxZ%K)80+(yed6?hy$3`^Z0?F#tOz`E>wqjO(9 z-LjdjjlfZ2?;$&O31Snb>b8mG9&@KyxAWS>_Y!X;=IeTr)|#}VF6n3?6IiuduQh(J z^^MkRHOXqp=zE$B7v6%3;f8N)eR=B-{lSX|VEO&2w@TK|btE1eV3dlNT z)7*=7pD>A!2X?+BU`uCPN-4$K zV9vGZLC$;ScA13Byh8T1tm1_7dDIMCcrZ@BS@_*DVS;$T9BS{MI1eVWdL;rUhl1Q= zxB5_tCH|$q7!JpSJqI4{m0FYt-~kYjk<&4yjk=LU;1Ik}jROo@bR~cwF{*WYs~lUe z#5(vFCvtNwdXSqyzF#JxQgKHxM#B*mg5gN`jN}_YuK1DhbBD*zOpXlm5)*;FlfcQM zK;{mIg^}Sfc@vN;)BY~|#nCay@qj@vJkbh#p*0W)g&=LlFS+8L;p4(E=1Fw$B)v!k zIwvE;alW+;4xzvh*$ik>5kmcz3B%2tCz&XNhTLYu5fsxk0Is8YWPXTqs>c*!@ z%c=kHz5`hmcY}sGHaH9($Y>OL_hTpYXPMH~i|R$=`&$FOt%07b3gt%26?gU?kRQ=+ zBG9{Bb!t*t$64R0eKmX%9~+B|s?XFIuXxcWv*>}cRpmsMNw~}_wsWz*-bwV^51iV` z@IfK4tFhyg!ikfEfQQ>)9pQt*)6ucm$#J-uIx&nTf|>Y15$Q?LBy4+X{2bgjVPSkw z%HP3tCDgl5VD?C9%dk$yVk1J^2u)@Fa3CBLm>$K^_}1+Sh6#=j!Kk#f?OTe?w&*EJOKNrjh77GM}mO+E}5r3F@B zR2wR~uASgK@pX$rWAUOT-kfIOR&ao3oqu`{V)>A&DPS~Z$Fqkay@~uodBaBVYVT!1 zr2vX?Ch%*`OIlE|E*7x|)t}d9fA))j+8sja1Zk+U>y)CdCqIFXjSa>hrSUR?_H?RR 
zeJ975BxW|nn=GzKE*>)$Hlc(cA$i8dC?y1O(|$?{G3PURn7=^`BR6eA8hr7ART0^@ zBgyv;h6N(kkdGGyx`|AZg<+yI4TB;V9XkPsAjhKw$edvEWJH3(k#d&kf{#LE>-aPaQNtfLw!I!_ICE}?%AucrNWR6`tXbs?&by}lW07s zCBmD?u0cW=>JddU;qNG5bofQ4A@%6+^_0AU0@a>M0WHWg(Fmu?|F2Zj-%#+sBgok0 zCJe;5(GBxUsMdEl1mfuedRM{mwTu8rhTy3d%16R?_8VMxNTa1u}Ob`n~S_^yn6g5 zId)|J+Z%7dX=KIKtgDp#VfLw{0leM7EzuO8o zTjb6X#wHi?@(R}w#lIX@%9umTl~_O#{s033@O(+!=AUUi&rkMNS1@| zEJ@9$8Ol(A12kMuqqxt5)9bZZ-)k)3$_$cBD*7K50P+nrj~SEZ3SzmNeU~Eh<~k7V z!TgoT{cRGtAHNd%Lot=yPj{T$PccsJrx+*qBgWApYE>3mL;}fl-RVJ~qD9;)S;Pbx zD4DD41Qm)KPbUMBvPCl46aWw<+Gl|PIT(e?4CZ+Nm4(+MUV+Oc`a6 zD3&sQL?$A;Io(p0`CLjZar6YdO0a*j))6Vr=^YvQGfuWNG&O8#(NI zg&nN{PF|JS_W*Jf20mgOJb{TifD=&Zh!g;bA=QpN=5|R-ZbifBg4-pCQM|7BcZAy| z6MthcFtBS`+c~e{%jlZ&=HzmrOa)-jGsdapEKC$Y0HBZw02FCRuHHnE3~+$F_^Yrs zKz*5WsX!R9|6$R_tWh&jRAiPCSxbP*!ay$Dvwy-!mkNPXt%g}0$OdZ2>VR8f1$8LzVR2l&#@Sb+ez~^R>wp`C);j>dKn8y#8dx znro9cy?ayML!$Q($fD#02EMA-?=rr;_WH4;vlc0Rt7eZS+xOfo-b)}!mK@pIbZtwz z`GG7$Ya~YouPp#8ysDwK14a@M=vi8%EK5$B0UFB?QFA~ERXTu8q2n}<+9_C10V%Ju z6O{#Dr!hK(M?^&RKzyvxU<;2x)|4y=nIx80q3VM=YLYx#LZljgEmh;D`u;7= zQD@Y}K#Zb@7a#`wy1@3d2P?Zs$fHF-${WI)6kJ0~xNH{gl#8}fwz+>BksRC6lhfNl&V$Onu#0%JRn;h*%} zRNk_>pL1MvT-uZJHHyB*o7PpgtoG+@7j4hEFS;*Hq{^DG;D2D19I7(u7!4EgI*Ju= zL_wIEg6w^8P-cxH5s2;5P!aI>MdKICo;O@H(rnMYzEnnn%5kKE!B`LwP6MbHiqw-0 zXgLfzz)hZoDOFnJ0s5>lVwu2z^DB}+p$PG*PuW~EPaCJphHxrB17SPB0T3I9hQ88P z2#ZNm3?zy(Xa$*;bwE@@diprE%mYDj7n~4+1Cf0YZlX0pTy`giq!Z55<@03q-&ZoC)rk%q0=U#fEI}zi>{Dk54Jo$_D zHi*}xg5bqanSU28WU4-5f{AizHa`wna4K)g1YhJ}-wu(;k0pL+Uvw-=bQ=zilkzu+ zD@^Z>WU&Yc4?-=D6u!{|5Z@V`7&&j(fDGVkXu2RI&}Rv+BIs^0^A-Nv^yKRl{96QY z(ZMz&wn05cPk)bsZ_%p);ca9W`YG^WzGgh($SKGjK&pcw3o+rFl=cYk3Y2hR>wttRC6rL7FJFBVLuwc_d|I8e(dsNsgl)V$?Dlp z&8@k%BU!R~zT`lv$hv8l$UArn>7lPzAC==`5qB8{6?`LM+I9z=V zvwg|p^-1S?cA&!bPCb1ofpFZJNs?kB1veo&!nMK^PAj9*5qr92lTy-tai+L<}ey+{%e8UV5_c}Ns)(-0CcApBY!wF5 zU~FqlH?EuAd9{$79&S*b9xm9OwvO)>FBtIi7rwt3{J#JHrYJb?a3F6@*TNKUmFl(S zt>8hJ{|!DOI@e71&ODfQ24)Y5&W@xr@J>lZvNHI?lF%jNTkU;mUk%{e^+lpLbjdaI 
z8KQcvn)8U>%}H-#y1wmdP^^FOk|*u2Px)I!f6H9kynoxJd@@{IJ73*?$qtzJdCz4J zTouyPWKIh27MM#)e`1<4U%BDRWcBFlAG2!h4Jk1pC+1uUu zaL>_!{)fA}4}5%}XK!D|CqL_ZxR;2#c)quv>+Tasqbqs9mPH-CJx+mvCcY6t4F3jc zZZK$(f}ar({W@O2=L(y>nN1G9qut& z%9%JWMI<1bMG4AYu(7C}q7D{y!l~whi$&e^)Wf3O`5}vP?}sc(?hmNOT}jQS zm8V=@DRzgp+*~>y8T2GWB@k&~l4VSrkt~RjnG#6GlZ2UML5#c$*d+(^j%-JV%#K(c zDvp1Noee$1W5F&+^};GxKhB43ijMYD^tzl3tPqCU;?z-dYm^{kCq@(n?|uk~c8}u} zY6GKfq^*73CZ7KqzBlg4Df%Orth|eva&Jx;k%Uw*6WD+POe!Oy*w+Af_l6{Gc5joF z{|=gd%JHt26@eQ;nYcp6DD}Xgi4-g zJRu8}Tx5u{1bsokP|igbMC7(jdEk%0J7!Ke;cmL>Yg?6I~AZC+_ zbTmFB0+9}9YzecJ);>-QBD5uLJ;VfWV%s=x3zJ@ByC3d07>{5#9&3m}zmDD$&f*#T zypZLa6&R^pLfa$9)kcoW2QyNcKZI2<@TrVTCK(S*kVoQO)KCUX)FYO0kB*WgtO5fQ z#X&hL<%s6Qgxxq=$}(2SL8SpYA^rF`9FV1p`AwWI^iH;*Th}GuBAfi!7AtFCYQEAu zyKPRGuUOAK@xIh^r3W0NkIh$YzGf7wwxz0eiB-Git9DQKf@V|E#0}>&Ls&R>$Q$qM z)O_{ER5khgo%ipWKKNr;(??3lnts|>H@gidWku1Yd_1mSec4S^rzV)!uUmV`olW<_ zSqQq%`t+)fEYgqasB9~V%&nB%!)#dU`?z7#rHG~r<15iHa4_y1#ab!IIl0>DnM(Mf zZC9Y+VH%7nyd=Vp##RBt(7$LlUwo&=#8UsL!EqhDMdBPgNHKLMc5hVQJN;A!b}xD5H0E?47nf z1FXD|%$yN~a!Oam((zZrqi4pU)xztgZC9Y3l=-L1IN;nEFZT+%~FB>YqOLkdhZ3usRheg#1$zmw@k2){*7 zen>%x+D}q3!hfV&GGEF31TsHU+}i~+@LotSm?gVXijkcOrr*mHxrA$U`&|l{_$zT= zISrS=plnfRFZ7Ic`{;ONawNJ_uwjy;YC~)jCOU8oqw&3byU~27++g(n$YB4G!Sy49 z?MDX3j|^@Wy8h14@OOsIqG9vfhT69c{Vx|~*Ok8kK%xMTTE>J#HnQy=2 zq1YxvUIBd18`sSO*TTDj*zr7=A05g52W7e6hY!yp`^9HyiS!>LnK_2|f zmdzE<4bGK|b?fiY{leqMdj1kWSub(FaKvaREl<}rrdO|d*R|pQFY0`LlL(?XK5usU zv0Z+oZ2bY0AF-eyv@w9CCTfLfL{^E~hJy!dge`wj&wHvC%vmxh$JO)%hFxd?&o#<$I{n(L#mg}B zP@Vc)hwtTse;r}Krc6KuhT$v<25i7BiV?}9Af-XFm0EYKsw|Kd&Y>X1H&7%iMB^vL zPq0B&bQJ|D-jnNOUATsV6!+m7cl=@o1p|JNrTLqrss;X_eqgFA;)V@pTAWrjrN2`S zWb+REa2YcfyRZsb8d|b|f)sb)IXAtyf`S3}WP|M`nr8-@=WT$?!&C_c1GZ$v@&6PI zztof0AiT6bCES??lfB3|m_MjZtmvtgA6fb1mt*U8`>}G7`<#{tKM9bY&No`S%DX>( zPRq#ggT+;_52={#c{)qaSnEL!zJ3>t)f;>b5ERYi>Is lPD1f|NjoV5mu0{0)dn8m()8QQ+8_#CCQN%x8>|AAe*o(4;LHF3 literal 0 HcmV?d00001 diff --git a/benchmarks/__pycache__/latency.cpython-312.pyc 
b/benchmarks/__pycache__/latency.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6b0f2003955f3927a73a14aab5efc8ac09f52f8 GIT binary patch literal 7622 zcma($TW}Lsmbayr)Oxfm$q!k6fC4eLFp@FLYZ3xs%*)^yFEEqDo0O(ex9v81<#r2W zt+eAwr3yRC4p1{wnA%L?Y^7>Fg+KV@XRG#O{$@X<7)GLFYNsmM4?eca#NN%$)PC$a zEwv>aljX|3&vVaxoO92s|G@JO0x8b^z5JgognW$+BU!7JH4{U~9U_uZA~K>W&Wtj8 zX&N=@rFqn>m+UC3mzGfrl;*fKVH>qEl{ws~9s1a~BjFfzRQsF>*QiUcTjG4eJ?c(0 zj5g?PYuuCYj(YXFE$&M+jyCFbF5Z-A9&LuYU1BEp7#ppXqC@gat&(lb6o5|ADVoNt zqU#fOG$8UYV}A|OsEwIKH;lKx3?qy;z<9^BDdfo>`>~XaO(X<*C8$nF!MLD8dpejJ z3#Mr*CMk-X91jXXEaFn|vY^H$aF9wLWh6yaBF}Ku;Si&lk0+<0VAYSrM98G=jHQxS zC8|c%R5U%Urf6&;df8YiI+mgdLDl#vDytHLOe(1)%yyhij>}2uI2~88H9}K~w0dbe zEkVokV@XI{lmx&uB2XcrXsiHmnvF_ICax-)l}c)cCSxY!BA`T8FvI`aBgpO$706ym zIUr&VNRO8)18Gl?o2HmaCNa~893gpA&Xi+v`slArHykYV=S@rOeZ&PRZ_b(N5p`pw zNw(5opHJHATa|i_Oak`J>3(%rq6K&Jk3_Wo)|6*ervaB^bLRWiwHPuDzZmc@?r4gH zY{QzZN;S=1smPK76mSYx$D_uGGcD1WlvL$7jF~B3i)KqmDwSi3=8&$bREWw+APX*& zK&S zEJa126x7FX2Z%0+=`Tn7)I_>3EvLgO5K~ga*D{BGbok04sV^>H?t@)Sq>_DC{4I2jP4x&Hg*@-hQ{#+*NGuS`bUS z4iP3IW>(6{;HT`tX< z7N`RB*kDH38Qz`Fs6n8>9RiF}KMI4GM5e}aqG`!&up{#_c(xAS@(S?QI(XYFz;ku* zcG0oqykC6?FWr$#QWJ!W-76m0eUbHoq;d8FgUv#-!xr+ddzrvZZkAhc~b>kNnGBs{%vTuMF* z0u(%XRs~V1*+gkf0Ew7&g~OHSsl&_w40h=(63BnBe8e|$fA;r^KZ#bM$r0``B(qToH(waj7 zxeu?CW>9`xg_tU*l35?Dmq~;{eo!SmCD24B{S4(%w!5}G4hjgZ1l3fKW|Bb=O~DFq z1dqOf;n_Bv+VKNZBUyVm4EsjOligdb=*q5oNLa>Dc6js`m0ie0yYZx5xD3B>p(q!# zdq+SR;s#L&y`Vm;Dk&%@gJ&#wo5h)TBD-d!r7SD)M4N||PpeiRMT&rG#CnQ11P$>aIKXeMNzgbG+ zy3sV1#^h_!m=K!)ogT7kEe{;XesP1A;4B!Ds$Ul`YOxtN2}vDi*?;c z+^r@4^&}0q$!>AeO1pC$!GmWEt3;WAkL`TjdGq+7QINi3j zeCnAcvz}S+Y~vU!IzQn)^@-a=mx?c1Pa?^JKo~Otgby4D{u4fLRr@za-3A9SMExtH zS3O|VVE+e=daX3(hz&~~;~Xc*hoAuSHlT`CBuicch9Nh2_GP>t`e9zrb9FoQV19kXE&hp_HiOM{}YpU)Xg|rgEVS6d(L6paNb#W8!dJ7-Zr3XWo;$6Gk^V^ zsRIU7&XIE(xPf}`nm+&&Ht)*wwRLhXnBSV?bu6s0dpK*WPSXN%& zkih;^gR+WNjsO_qvKT}c5ZhqxK>_YKT2E+Q(TMIof+1>#1vK|XApw43a=hZMXMa+~ 
zC?(M_0b@l_g9+hUG?5aecofW7Tz;HNV0Ka*t=Kb4CV^I&ZtLnmLQ)J?y-t`D8L!;a z>aj~;%P>jJsY(g3R0RkHNSZ}WrK4%hA&)1~5|zM%b>Nb^=>_$g#U*>X!GwnR1scYh zr()+;;G(jq#0+X+iDok@xwR$8e#5>@1uM;Srb$%Uo6;OJ2hpV^Q;qo31R&8V-jwE; znvw>l+ah!Ty<4bh(Oe5v)L07q2Fvg%piYBQB}^0?Q4FMTK*}fvZZ3t)ngdPv=#(s~ z6BI=qJp>utZ36=7V^#B>Gn@p?S=UX`fS?$b(i}B1h1g2WffqB0#55S8=td}-89aOX z7QiYjWHXR-LsD}=f7C#qZA%DpGIAuIiV1P$Xrz8DgLCnT{|{K24UMISonJQWEV~+R z4$TeCcRp+=TzGeO=&|d)KS9^|x$|XD=ZYs--oEqR)w@@*`1#$RLjg{{*Y{r*8}Fuj z0hi{iGIUgwU-Z&3I-*nl_fYwb3H@~1GskY_s~d_J4Wl_rtp(6q@I-BVny4F`Fj+Mv zV6ly4K<2@4smJ?@v-H4L7I+3jSQp9fnBfqs@d`v=RHO<%V=(jeXH#vl=v`ymNH$aGnmpNl8(!Q8x_HlbD@?Ou;i?tjD0O7u-1v`Ne<*;*#zXZhw)$ z-vjs<@mz!qsQkp)0`x7pI*YE(N3QKpT&*{T^^T6BtK+e2+c$yEQeby6uzNYMXZEZ< zyy9;!`Fo1~o@M{;*)!#qZ3}J1mfqRZ<;J$zVV&Yx{#|fHG1Qq!M^6EnVo0SL1m#2=K~1FpoB1=7mgSrk>%$ zLhoBnil793M<}YXP!9y-4ElnOP&EHbbb@9(fAJ)xsj5H7kHO20Za9Q2hUO^loubvJ zIYmJgVsSwM&lMbgMU-Qz=D-Il>49C%RT1`Zj4^p!<6jW}ntema(*Z=O3lU${VFtg` zaF{hyG7ZsQPz0$2gnyw#r`4ecHln)xwyLSP3{ET>2AU;JVVvScXcmAT2l)>1gQy;X zh*p9Mnth`O_MJ`xGiw%sQehe*m>XBEvEWrFHFH)@Q$M0(^#iggfY(@z2|!OKk=BEb zp{9z+b5@S7d(d@P)lx4BYl=p*=*OTVh@Oe+%%&3*gi>W6!nmYVsB7#DdZ+^M)u8Jb zJ%Npl7g!On=!^Xf-(g*P8&XB!viJh&F97DR;a~X|5On_w64mCsp8GiW*@;re;V(N5 zueh66O$>>&Z@Rbr?)DmIT#c<=qY@0U6w#m)!>5N=(%6TKZ>5dS&(d9u)S09^iUeQU(D&01jFRynfo3{&bK zF7^*U>K|D%o4Tz}`bQQm1$+DaPrl@K7Wzk?W9Q5}-*HZh`+1wg(*A|AcE~Q{yDKei zrIvlgmVL9%6^<`>4m`N>;KP6ID!9));s$ZfmB-w^GUtW~-o}~1D)xKsyY6LQcOi13 z#GjmH%AB{vwHDzou=|1fC~)L4_ZF^miuuOlyTje)<^#)~oil@)Z}-f(6<^bx@a^z| zeeum@-yvwT_Sttn+i|b^ZudfHxouA&5MF$*;5hh*J+!i;YvIa+H~+ZfSdrT{>$r6m zws@=UmaEXQ@4*`n_B@;@xKBUk&cIST-)7(nIakT%|I+6FroHptp}U6`eze>kE(dm$ z0-<6cv>fPNY2W^ZdEwlH_EP^qv43E>|KxJ;)N=dj=Qg&{F>70Q5V!Bv>m}cgqHo8- zz$3nA@%W0%b8~QR5JKu7EqWijdcT8Xg}&i>7Mm8Q9&?9Qy@YH1^S9#+kmftez*vFr z`HSKMHvWBw^V~7^51zmfYyLOZGQ`_LotiBg6;rWjRO29|RSCok086i7hL41zy{=gy zlnTO~qOR5~ARN-ubOJ|pF=9u#7eT>EKR|$h-mZP3h``s2NCv*PDEg-p{mY2fT%Ckp zxFWE4v;ts51WtWSmwyUHJ5OuUdp22Wx^bo1hlK>h2ZfkKui}t8Kl^f%(=_U~e(7ca1<}^#kT_80PTowNk@t#fI0`3AXfI{vVVo2rU2r 
literal 0 HcmV?d00001 diff --git a/benchmarks/__pycache__/serve.cpython-312.pyc b/benchmarks/__pycache__/serve.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a6b45a11fef3d2df3bbc9556fd8e273c8d7806e3 GIT binary patch literal 55361 zcmc${33OXmnjZK-00amEAV_duANLg?sf`k~*c3&r6eUZtUB$9ZLwq0wiklxmE$~B? zTvbU?E_a!3J1x3g6|*~)G<~e3RY}i8HPh#``%L=8otZfS1lokroQbOAobEbv&K%lu zr^=m7GT(pSN)ohK)!j2s;=_G+`S14Md;k02|Nrm*>T)?aJd>91htesI`;YXYK6@3% zqqi&^cZURXJSA!g407rmBalncFd0GsO?{Q?Tc%ovTbbK6**4Wa+&lj^?{S9Q1Q5(?fo$!;NQ&UC{hqLx`)FlS;? zjiLVvUSr(ho-FPJClr2{6N-X`@0;a#!~28z%)c1^CGTg+zV9nQh7TyAr3fwC651=2 z2TO#CGQ<|LqBguMg9jB?mFB93s|J63@Q_d&JT&GO>b`3kKAgkJ|AbRQeeg(-XHY0k z1O9dbw>IbwJ`qHUu`Ia+@-N^we%Ctuq|gMY%`BuD5L@tdh<()td&lfTE6UJk$a!Pz zwqUg(MSJjQuwTok7P%e(w1Wgq%d-RUj%jr|E%<`_1%I$#=)_+a{<`tE3x5PzukUWH zmDHN=2@VK*;kysNJ;m02(Se)w-J&jAfaZ!k}ZFZag5;8CSF4r#8PwCv&F!LdEU z5wzlQ;fe2Z!zY9%@jfZ^1x*vbt)PbCM>xvbr&#HSe$=v7$U++*Lz|oy29WA^4AB+m zSe9@c;e*>ktu;a&vj`^;`|OTmpG54Tn^~=A0*yfGqrvIXD^mgSDnBzl8JZ6A;h=ae zG(FBou81>p<5yv9Ph3$;p}PlyY=tYAP@- zgdLF?eljoz_$gsD5S%>1 z9|}&7Q=tOl_}o-*Iubsjm1T4?1P{F|20)joP71i^7<|gZ=eryjy&9Yr_(Q5=M&$85 zI}-w+$j#ZHpFb8H3(QSM_~scDG!SY&!W#ih1}+CDkz8vtF&L)U%wJDD7p7)VGT8`$ z)oWuNjTs1mNFW@H_@)C>LH>~PH9}uH^kxx_J2SPZXkzfCxnMZr69W;XRK7;&OHUb^ z9@|)&>A5N2tT;0@8wvA=@G+u%=t;V}y1I@yenFTy)S8uc_l0jxkDdrl2LX9TOxsUI z;JQ2+Y&E5GsJNq(fp8cfc6$^14E3EF=|6Y&eBY&0M+XN+j`sDR85lZtG3`-+ z9|xkIng+V3^9?@d<|6PyA@i>XLXnZN8F56#7}ThKbY}WmP>hU3W=3XjqCrNljA+O~ zk*5NYwClQvDpQn+@XRzoxv0buwWjcJkBPxxG&nLoI&vi#m?fVH;N-}anQ&xec1Dc+ z;;*nMrE}46XpNc3Wax6*HaVi(uRWn{UkZ9N2%gC)kOCccPuH z%uILEi0{+3A_Bsql*ooJ7xm14Dtaxu7{{5ln{=$Xr>P%dEL9%wC)dr2*2=W! 
zs5You~SZ z&&&vH3=jNy^@(WUqWWD0jtOV9Z?clk*Pl3bMp%+0}2BxB9Um zZ`HG8DsW>&Z7P~a!XwCKB%Eden3G25L=hDpQQ9{=)M^)b^rTozuR6TaW~_PXypfT> z^fdY`f;k|JVQn3ooWYc3nGA&^X=?;COi*m3G;G*4ldDyI9fc$OF<$?Gdze@7ouQO7 z?>lEw`GrYKVXB}gX(>vTRwXS}siN|vr94$qnY2`0IlZ?uX(>&4OavV&UZ~Q*AfQG?<9l^N6-3zs5{M^UH5I)7*5l=BA4VrY;9T zPb$=!G6=;sltOH$R|mcP^y;S95qdpIuP5l$hu2VR_9iS4k5SkFy^hmsQ&*oRKY9uH z3-7}#Ls!e4Z0uOMOXTh?t6s5G%S}mu(jE~j^~l`pNH{{Y$@opIqS-`u2C$L`Z_I+A z#6S&9rt|fP0W7fMY#7spk`^m4^Khy2M`&6I-blOUsX|5iv{MbHoYL81Ffu1jk7lD7 z>SKk2I`q*G@w&rFoOn_)y>eX2dgYj8j;O0CCt(SRI?xOzb)y%}<+xb3gw5U67POH| z%92F5E@i!gUj4E8eMF?^+a?39 za1kVx9~E1~iVfkG&EetP2w@_7Bu?%77#9KKV2f|I1O^c+iw$M5=w%oO-r>(4q4Kjw zhjom5{-+%G97-0;i8&&`LM4V-vPf1bN6MB=?<*rCmMi57R;{+#Qtsy3O124?3PJys za@CTXI4L)hr-leN)t`-n0?D2!HO7`Q-b~zU&q`U_rZ3tiy|3F^V=GHIFiq^|cZUCz)?GOLoD@*dC^=Y<)bdPJsxJLk(%5+4;e_0mFyG&! zTCEtfl0&k|Wg5%+=a%0w-7x{LhArI9+|eu(+9c+=&h>Dw75{VdbyKS=@FNUL2TgHo zgA3Nj^vqm%@+Plrk%D1fT~_!&gb(oH+2CksEEE(l0Y`!?<@HeH3V-fAZHf8@d3|o- zv0c~wH_z8#uCQ#y@o+D5(bvX_O7~|mc_Pk;>%wAb~GD% z5N#|limkKCM^b>Q`5?Uxo9cgV8r>2JTt$}z$Hs!ARLCIKL77FsRv|tV=0Vk>fyOqb zRk=IYXl0~W-5p95W$qQV*ZG-H82fY{E8-|thK&HufeR*H4~4=0B7T@!R?yPez6K`4 zybK3m#sgr2@dCO^3|*e1X4nX+j|aC5yfi~Q$fauO;tv^p5Aoft$cXq%VXgP5Awtvq z)C@X=LAa@wCCq9xiQeO{gvL=?BP^D~1|Thl{ErxNBIp4b{@iqkI2q`CF$4hAPin{O zRZ>Yal?>s(s_(aq0lq&l&0h}kbuuMb#{(Ve4Afwqo)Ojj)EGYmLa~FN3j+(V-w8(o z;NYpf!H~B_AEB4E#$niDz*W7}3XS=d#YQe7$c6EZ8KG@dB1|u(3Rt03zY;9sJRYb> zL`8H@A3r-24n;!Olx}Cl3$F|y;4|6`BO)ktpw}b(IO0w7X_G%%G&G}@I!u^-4IEs- z|Cj1i8m&KfIl>rl7&p^CEsqT?N?XP;CCT~|pfauDD*>>Ptl`lBwuw&z*w8QjJD%3O zv?Bx-DD}!{P$Y6kJb_o*JQaN;IvJtWV$dDL5ofBm0%>0HigoKGB*pHJK}N3K*DLyG;#SR!LuBZyCH35L1{Zf z7i5!msDR=2uyTV#B2O42;V`l42&p$eGO5kDn@S1A7^1$4zwlmQ!(VZ~y2Y&@HF3`3 zMboQC*10TecFJ+&e)Rt2hqZD0;3G3U=8t?~=j;Uw{-3n0*xLW_^!;!CZ;_Qfm)5Y~ zo=%qe=G~tbl`d8%i&}0CrE<#_`yS+0#Vz~dHT#ly=T@y}b44Y$&ZhG6Z=GDLYhNDv zxb9%WUOn$xxV~1iKjo|hX%D(SXfVGM|+ow-Z}KZ)r>^0A{NpguiKZX-1m{Q=V4(z1B+klTd5dAIECvUe)xPp4{{ms{=~PPFwUYWhAnov0a#qsjAXmTDj5 
z)vpy--I;o0D&DXsS-f}Evp4S9n{pQ|#Fm~zx>?B6&^F8OY-;q zO}GZ=^X-A^6J4f1<((&5Eq~h4*N%^WTVgul&;GZiC2*&0GHWReOvXt}I9(VHPO@2r zxJ)V|ZPW}%>O*uheLp$|#Kd8@DTYJs7H>FRrv_Z7My6fyR<*gJEQ zjcCaUZJ(rOgQ2}o%ANoa7s*%Su)P-TxERfjGO*u>R}G(_xnR>H zW#2G`L9y!lBZ2W445^AYCBf$oQe~j{w@W2uVJ~4<_7b&dKf7c%^n#t zG2_|^VvP|ueH(~r1)F>MOMsz#Y$7%dR31ZlIWC-xF{(z^PAl|vKGVpM6B~vKk8@Bt zqEfyKj1y&`_o-x9MCG~)poG*jS*6NyhKvSa>+Iiry>D_-U(jXlyB`$<34s5Tp{qd< z$pUCxWDPPd6b6wr78=(oBp>JM`Do{>{$CKG@t!FS9yU%(Cxh35lOl~qu#npDvr}GU zt6Q9+=;L^$ZQ;35qNT+U`=Sn=9=(}%$fu1XL>1Vt2gGTRLE*H8T#V}|PEd3UG1}8+ zu+-DpGMgOr1Y@*y?>S zqC}Gq{8T&(i@%Lj|2O`^5KrN-{FZax!dqujp8Q* z$1Wc0N^ZeI_wC-*+$t=tDQEucgSQ74OP2EAE?+HcPn5N$&EcwfO z5=EVBg~fM{zj1tV?+<&I`jUlB%Y}(T-&#r8ottmmTq^p#^5xoONyl;oOUloWT)@g* z<@n^th5PyQPra3O$MuG5G5EuYrI!%NLyY8wjTQEJ}j2SHtJlVYMR-FZth`Cw`H}a2BEP9By z5^mGiwk+^!wD=iTBq8grS)COp2oX0uYUuzsI}@)=@~APerqOySmpsE-#8xtDr+mpH zScs=5?$=7D@xAoywXh7FFvgoHkO~MAxuf8=0d8)_66PC9=zMGphn8=K6_C}zA-OU_ zQF|prc?~6Se`VSX?O;f2M{S8_urHWiE6Lm+`r2#)KE2ikTEQx=01LhND-^7f^%~|S z$(ykhE<;|08Dm%lD`93tTj#0e^;AOkSHgGdF{SO`XMZJp+gHMO?*Kld=e8@q&MTC> zKj4mW&qH)FR-|+O4B>@RkveWbkrcmPGHPc0O<*JAZ5PK%q++S$DmI0nGMu0?%(0SK zv0B1dsZ<&^z1A(|D1DmqS`YdPbBIFMk?a9`CaP7J{wvspoVyM+e^6w(u`-OTa><0U zR4CLDXuVj4RFR2xMa~|n0_F08`&x-uJ4`1U(^Vl|LHU`$nbzhOw#{ilkMe5#SnZBu z)0SVW95-uDs1r|K$SsrVb+9zsJ*FiZ45iA{5{_+KqS25t zQ%h{0vdNG#Q%h{0viUb?2@Tp~TB5~Js!T26-nJ!L4Jk9V#P%uM3@J0U#P%uMe}k6L zpgpD~I;1j;5ua54rs)T!m|yZs*<)D>4~kKZ*HwO0MFy8o^V`PN+8Ma_SPquPfXT?X z+H?xc_*Waxe04_D-zSqx+o;Q&E$x$-OE`hk3-N1n!&C-LhW)>sml8l{d^-DTJ1$xu z&jc2?uQiJG%BL>%_8Du)^C4F^$> zImH`tplJ&+m&9*ljuyW|ukX_9MZ8AQ_1Z%mwcYr8veMxInKe}%SvbYyVBW8uTus*FY~Z+4w1;hjal(Xv_WGjs_ZQP zp~)F|hL8BQSl>oT!$c0V^Q3s?`NZLK_x3G5vs&fsCng{o3HH$Aj;E7h%0t1G?(&(h~S0Ujw4c z=Ve@0h2s;d;r#JcXH~*kwUqt)?%#EPagdIF1Gm;7yw4;;6#0D`L?p*=_!Z+=bu}&EjX}2mO1A*({jYv92p0+}Q%nYJCxdtrc2kIJvBIZ z>f%8EIcQ*vTsk+RI^prmz=eyXRbY;WX47W0oOqicyvh5Kti&w~-;SDWO>EcTR7tW5HA3S$y@YKD$I!|- zbNI^4^|W$npYrOk9v>DPAknfVexeWdyT+4l=cTaagpVtFw{rT 
zI&-pwd)fs_jgX*@d(0h^Gb2p;1IN)gPF4carf}K>!P`)Hgp~#%hVFlt0atkUV{?;A zMIdc2A2{gW$QS~>>g<$ul6*ReKCm-;k9v+WA`hT&I$s`Cvd}skROxCAE%lgOIqp`3 z+U0O)7GNAQfZ?=LjmhBWkW@XZTN)Z?YOJ(N&TZuK9F)Ey=|TmnQcgKg*Giz$Dt_6@ zuQ@V1nKlDS#06^mH}DG6_){0H9sbJGa{iXNSWE&|70pinyN#VCv~i8iamd0Kc-0ey zqGBA&mzLi;#*P(IRW&$epaX`DrwpqbPZ^RpWhmq2^96{~ch8^xxwq;ICs$qjNnus0 zv}Vclwqv!l8-np41ivt66&3$wam^c{M>b1!!GdGm!MSQyos9`+6vzrbijx5*W z6k)ZZCxQRYp7}E=Z{hs0R35|yUj1gOv19qE-yL3U>`65C+;9G%_GfK>-1)Q4MB}Nr zr!G~}x;%Wp_PrMpC5IL)5YNBke8ahTX~k3b&{?$VtV}p7@#U;bIXw#}7W>{feP`&6 zp@g#z(2eIL@7BCsvqA#IUvYNw>Zym$YNg+2XR>P*%8iOfd-s+rHV@C9S?K! z<3%lr+!n|ytUCFGlV2)YaW zcB04S1UZe0>cv>H+&6!Et-R_F>+el`(6+Mc>`L`eV%ORC8kTF9pIUrrsqdX@@m*&h zR1YP~&&{7n=ar{C)l0PrPyHM2bsJaVM+a_yKD6%UYU<*aid1zC-k)W2T|LRNQ}110 zdggx9AN&7j{|}p%uLA?)Wv4!A-<2#n^LGbu`)9AGbGsT2>dY-}`adzT2nf`{u8q zD=O<&D3h|Qh9{EzlM623^rCA8a^x-hKlbiV z6*ewSB@6dJOug{HO5q+fZA}}-P($<5wbkbRiRS(Hk0hE;CF@Veo%~vDLn^;yHNP&A zU$^wk@4xW7FC_AJ-}`35Gl15pbH|<4)VF+`-xJ?=^26@LzQK>n&n{%AJpSctNzcCf z=)Hq0o_(L0xr!mvdOKI%^Ti&{nHSIFlex7^5Xos=YihrH^bd}|cRarPSh8av*>pVa ztY2$ty<7JOP46|uyU!)t&L^9mg6DtAt^IRnf_a&$?~qGr#t1TO%->cVy>buI14ZxBj2+r>Y-%$;iTsXN}4L(mm%tJo&MCB zA1@eOai0CuUHv)NkeB^QLB+Zmzn|t6EF4+Mt6R6?1DICCuiM~4OB9#Iap+mpv7Uo( z)m_Q*?t23d$`7pPQ5+9f+i`a^zUx@h zKak)DJ}i1rGqj#h(F?fZ(hWzPDTn)Wv#B_5-Alp65N6tR*!iH)w_ZYFrCe$GrZdk6 zCA-(l5cXLm_ryihPR~9+r?b!X3WCXRF3$3v2SuIhwG>vDxh(Y*){wa@jZ~JFU%xEP z1ha$kJMqSe#bZA_r5yB9-F+11=WqzT?~Q$n4L@vMs!c-YBJ#l7zTQb;T^!DWLvMtZ zEWektY)TflEnj+2+`ZmSW!b|Wxj^kw35xou`zGy7SomRRDX^TM^t3Mv3D2$v&fW3D zPoZ5Z9qYRZ=Kjp>a)82mGq=k@3Okg!T@F)Oo;3aX4f6ygJZ9o*_pc9_$bFo-Pmmi2 z*MktP^TrFC;<-(6OVgK+#?4&KIn$S)^IZ99)0gR@Q(u05Aq$=T<>yZ2U8%U4s?>+J zyyrTdf9A}8Zm;Ffsx9Q_Y7X=EDYH=L6* zVn9(4h7Kf%e|$oLSqO@tGUd<9Gs4Yu2^5NingmPuH@Tuxo*uLn=@> zT3Nb_B(wx8{;~y|ly!}}Ykyx!iSqyuWAc;fWL_#4EQT0PHLm{4&Ipy=|J_`br~m3P zgFcJ#MAjxS&5umyFp?UU;VerFLtAZrw6NmMAu!5ec@DezSOv}>t76qs^(zMqr>oTw ztv^)0a=3Lvc%=$a4HpX{BvHs7fzBAKx1K|WNl(eGo-ROFs#Z#A>9Pi3T~$ 
z_zsuqdR9U{;xrjUW%owyJ43H&=4IIV2w^LGbW<`ow{a?^{kMj6g_0BcQ$^4gQ?LT@ za>XoxPFY%2D9|TzsNt_3mkPw9NV^&~K@v@y9!V?Bv^PSV6820GXJFH#w)?9mAdA3* zkUBO$Fn@gh#F$km{;nOTvBWfBkEG!xPV~TXh}9X&ut$xh|3b>8R)wyX9jB{&rn57t zUX6iM&Q6>J>a`;-#imE%f3USr>uw)^#dG`k1Til*J!&a%LZ$ahiJqguw^a9)yyB4v zaY5Llw}XE6C25*#YAT%7)G22*C9H+|)bta?Hre#(`bjwTFJm%~+SwemoLu5rL&?h2 zJWz|xBg8C}M*(}gaT06D;%@gz4Y!Zotx!kimNC&NHD($UqES;%J7ousI)o}F6_F)W zPn(3ASd&!zN}Zv+yi^Rlt~KJdA+$~^L1?{Hig&}0t$N+2-G*3=QjT%FZRo!y$$_-Z zQWM@SMhJ%Vty00p^vcL8)JImP-qgohtoiR{to37@^}~!$8%9`5WI#oWi45l^D(=Qw zF+SVy*DkcY{Zh#*`=k=MGoD6E9g$NS+zh9LD%^>)YApR%4HG(kl%@AfxN5 zLZuGD$Hqga{Hx+T<}IvwvF>eJtXt~ZSmW19A+^zo7WQRae|?#xNDw#En(Ld$u-YQ- z1+`uD-!bcRIB$;aitUc=iS3Q;qg61rKh}f!C@=D~8c&})Vh1GrqZo#Hp-0*;?ZV2r zTiSz_bFaK|O8f5m-&bgoSg+KUX(l$zyqV|p7O5AjYxCVsm3C44Q~hN!Znk=Zo^t~y zdN6iTEBS$pi^(`H1@7Bf*SQoSp8*v&mPXo#STi%uXv>9$E*)K8jDgZlXW~Kagk@#I!$~{b?Lusek_W7L-oP9 zIVc^1PHvu*f455=IW$gQeKK}<+7o$Ri%IeF?sltf{OXfJR_w50X4^F}qQ=#Kh26p) zVej32YAU@)3@Ljurrdw~$%z+rNUBR6Ill&^1Auf{O|SoMg(MZcQt{Yw^@5T?`_QA8 zLCT2)Wj|EM?HlHk2cCsw2{&t>rMJ%G|0?pe@GKlIp)1AkhgM zdu~4&p+gGxsPV+c$L91Ta>{U2s6tTNXu}Gcxu$)9G^v(A|J@2n%HL3~%+QVk+O!Tu zb!~;F?;|p{z>J{JU`dZvbfkkRob&{r5M3bR3Yb z=@3*`21s9tNsv_E1Iert(jmw;oCM4lRY>|TLQWwh!x%=$X@q3JXoQ?WNCpf<$RI*8 zU=%{m0*9Ok&43}GDM{vP2%#Bz6(Q#klA%Qqa$Y(JI{b)2hr6+s9+8e@TB|{Ido0|v zRas|_kjzHx2p~Kq9me}YMmRV0mQr2Y!nv3c0+A-0o^2sq$_PQ)lpLSiLU=kOgbX{| zZ6Q2^md-F9&_d55Bm;iGX?peO^t5ycp)YPNDSR_v5PUP>2z)d2amE^D;DThXQO49e zZV%yWKeD}gRB9KkQv2F%-Y`6uU(&b!CV!t>vC#-7*_ z?NE(9^b^7huxa?Bx~7xSTKENo%fjd;KLI-1W1IZOg)2fx_cP3*6Z)Efo#rV+$W=YW zKv7zx3dt-?>if*QQz`|s?f!Bj1-6qN=Cq#o?PFg>tFis-2KXlK_R)(>9;S6>D4MSt zKvK+X_VP@0Cj{HV+@)VouN{xtuS6oV;oi>9R(sky8JY@3(r)&{PM*oSV%inK;jsYQ zXQMEl0hTM|nGB!#SD|E%gY;|Qpvr8^bP>#g5=V7(5^8C%V>LQC6DG@dP)ZynUwlDg zAUqj*@e)J?NLGZ@#bL@4=JKYep+*aimE<5k5{89?xoN-Ozd;U%h;CxxJfbRVayW&U6pcoSOk0f-DN;AZo271O?4~WG zf&~$rf6P8$zH(}-_)VoQy|z`Xh)m0dOOaX@EApQB>9>|6Z=GI@{N&PVVcSo?rAQT( zSDK=}DVVkh5g(b<>S05apF-=9IS8_!0s9_M_au`i-=L4@VG7O4c-`VBRJLrndBl(L 
zA&Z*)PvlBdnAZpxtq-lki1;5U_%G=7AMrv56+GCBhTe4`O4VQOY5{5&(o)yqdKwq&45f%v1<7q5n9 zXUQOh%Ke9I5xD0(N=*j2*Wv zd${jl@ij6`Y0|mP-Ojpgx^2E|Ql}G*g`{lwN5QnDe=1S(9ei(ua*<5xwBWAwGB zH^<`8FlI1lqZT$@hpNNZITQZfR5pc7$SrH=J+m&5F$Qz2iXllGtbBxFKn0RImtlmY z5tV_JQq8&>#T5S=JR$ypL82tel@B^bC`sC*e1wBjf!QlK&8GhV6#II z*5wpyG6Xy7*Lqo(pZkjV7YOx_0{*WU?*E69<_9IspH#J`n!E4!d{_awg6e`_;)uN9 zmu9ZCWt9d0YCQ`czk;=a(w1Mr3;z=(&Hwe$34r;ItcrG-YAcYv8@`X=UX82 z$6_>|4+dbx-XOM{FF%EWqQ zKSfZJ2_=Rz3Gl*B5;YoGKnSOEmFP@P^k?WJ@qZ*l$kUpT$=8O{*+DYt8P)|gTWxgL z1t*uOh+KY3%_M|zwFf7b;{}7Nv!HUXkTInZSapCc{dAFRD6>*G!`6Ca3EQ3Z@A&;%s_gPcus}-?gZu zhQ2E>q_k#A32kTamId37%{ssNb~Y?M!fvXNgVQLKg&yq}b*BfMF~x=bTxQ77LeBs> zcL$J(nqW_+lH9iGrK72CL%Fv=M{Lb+OKj)nvLLqRw@Yj3M$+1gu@1$FbVc=<{_n0fZs zF4KmkwO~ke5o(i_n6$}9=MS9d#Vcw(i;Wo{&E{LCFr#3zKz72@W|--Y+Rid7;XGLZ zVdlcq7P8cxcIx&-!Xs(Z`Lu~=GQVllNUK%+m*_JHi_6CRZLoYx!-WZ;lT{M3t&2Ny zV6vWGxFLs=lUikoocOjq=(4_F`4lb|4%?P*wxjj&;n3XWG7_=#_bdloM>Z) zm2_*gBN$`hcaiz8V3eegbJoUN`V+a3{wr9syXL1>-E|3f-O}^-F5mBuyX%tfCs*u! z2uZ0jvoN;3m$siTmMrZGX~p?2*;%a(`jA!pMk|%W)eL zmMyj~RxORiJw7>MPrUufcw65G-gxam(sO)6!hMHRPVd58ylPjxa`(O6@uD7xkL`2J zXD<}ROFH7kzU7PYyzYd3*B9B`{wFBz^?1#mc=g_U7vn_-5O+VtEs2-;J^Rnrm9A6In-kG;9{>Z!d)H}s*ed7-=-p~J6&&(fRBT2`G z)q?g!LHlyxUiR{hdoSIuxbOPlnRv(Ghrwh41eu>s9qFGxzVOt-wS_B-fjGYxhSctj z#=X4{oCiO%ab=fGvL$S$2w}N&>G&nAouE{Ray=e`En=Q6J+f54<AS$2KW>`$w$d&26d4l0pts@#V zGSVI+lYqDnA~8V3FX%C0*&L!j8bQ=hy{A6=T+2aA|q={*(%sYY_2!M|hIr6&UcXH*T*{@84DIxJ zoL(I)fL@UQ*`jiEl>GYX#YhXnVup6&Psr5{c&#VJ6>@(>FGgdpk}E|oMtl=DLZ+aJ zLu0Vp_~-PwMz8;aUNl3Bx9G(P^0&zKkX}Eh*I&@a?(Ep@wV0OJ_k)K-#6?yuyHR ztDh8=J(?Fce(vDwN##-WAGN}X&mo8ZM7ey7 zf%@O&9p?Iev-PBDt)y-FaH8Y@Q&Pw;UG+31JPj##)dMV&`Kgko2PN&x5RWfQ6_=;V zD^t~VxU2$JC>ML;uDUPsbMmuqoq{xZUh%5CKH;t>i(3W7Sp8N#jR{ZV@|hLS{x576 z5+cT8X?4UM`&KMH5ADSZFD2|{t9CwN=Tq8OL&DymfB6!2pZ?XBu(z$}SdW{s6PBa5 z&Med{b>4dd0`NJdaeK{LZqdTbPa-S19iKXj7ru4Zvf{)N>`f4_HR(OE;yJONYqR!1 z>BHXhD34rOw0yg{g1VHuIOVNMk@iSQ%3DYO^UG3|?c`ZpkM|cP&VuY)r`Ibvd(Be* 
z16$o%r4RN!q4k1=HdVR%iyUi}p?zpsD3gQKwA33`|wY%bO|BAiyp{-!ysc(M^1WDY(FI6nL zmY<8edsplSKdEkeL-<3>{evrAC+2gZM&POW!pyk}WqqB+nH8u1)BK9n{N_Y{b27gb zDhfGeP$_`MkMoxnOHKiD%qc*Q1tkkFf1F=G-?vr)6%u~}|2t17I|q{$XQ4LYEM0Zh zB%C!%&X1fv=!g_FfK2d~-|@fUU+O?nSDl^nriZ!3@my#smB@LoIBTf5?#2};s#tR3 zj3>mGiif*4Ro;v@I;hMs z@A|Bq<6+>fu*2-kd3UoqRcHJ9&&~hN*ZHPZ{@zx-- zXuWX1jj{8H{lT(S^v+<)Ja%BzmMYvMI0*3Bl@Dbrt@&CJeR zjyvvYT9THgMB`v)hAr_NmnDf)$0Ch zr^QOhQ_0IZK*Zb+-l{mX^nH+l;s9)?kV(wbsX54#ZmqQY9_(&XhXf9+uRv zRW-i*+}qE+^TKM?-bB^jWL3{+I#uiDx(;ICrR;?%dqK)xii9b9$rmnLSN1QtPOBr~ zZl52Im$YN_IVqugnne1f^M`M?J~@u!{!*5wyqaJ$_9*_@r@_|+HBLt(D)bIwGQ zth+Q0U9IPr&&A6QCEbTV*b}!C79r?w)~}c_cYL{i(!{yC{>$gZoC~T?4M5&6*IzWD zwZB~F(C}ZpXyW{P|KNr9Ubz2s+}FR_+#hf5zjflFEjMl}N!gvRJ8nB(bADP@wOZDe zC~HfWb;LXS6J`C-VS8xvymfN5usKoK{J_?{*3^@#Xr;;)G^C1aQnhL(hwhMfw-TXtFJ^DAW8Q|T1e$X8a$xtoZ`Dzqa8$=@4}VbbK{)QHPCCx4SO$UM z^PQ^>KH=cwwLSMsRu7y`95|gJZ1=rq|Md$$eIe;MwqhAb*^1+sQ?WIhS#>ug+zs)@ z{trfy?x$Dm&!lYLx8@d~ep7m2W5d(UuezHP?&f&Q$qze{?q^r*U!zgws9bf_Cmi+h zhQskEo=Q3{tXM8?D9pb5w$=S-68q029fK>Dvun2eh31ut_7z*lhJ25FU|W6iQsT)= za_#_;vX`&gYZLa`6h>~(YWqNamSmN)KztuO$wAZTl$g%g?U*1`|FqbFD+MIt+#P zEni>lJe%mGjFiG-{!ohbk*#ArtDwMv6)j%cwr+;=A#`0!WK3GO;+vV8UKqbK{l@gV zjlN-4x^}5~wQg^sZtr>y`9No`ZmDOrz9&)Nvz|*nSlw8hLy5XW>n`$fbFS({@7cxc zt2O%`)a;92e0DvL{5_nDUn*Ix?M>A7uIH0a0q1I58eeTXmS{S*?j@f>&Qd+M%8-H*>14Rc=k@pe9Q-`Q-Y4a`0%zYJalpK!y^0 zmS-=(ZvV4F4zqSs&wO?~uMQ)7-O^}1X8N=ST8QLe%7^PY%<1GxD}L!B=lYNdDzk>k z;uCxM;=ZMQU}JzuqJ3Oiu|G{8dzTu)8No97iKTeq+Tyi%Gc9X0L$rMN+|O-p42`(k zu$HrheDPa}Lcg|-W8z_Jw`c3x4Rh(+F5r&**L}Br-|t+tlqGO+T5kTWvtNek;O6_L zD=yP-9l*L17_AOGHAxp*Y&-@8p04`%=SJTtiq`kz2RGQaF%n>pELA$J#Ll-!+7?}H{p zN**bOd!^8Hsl>8Ubmj;?9(uhmHrpRI<&(R|8_2c)+~J1%=cV32hyCa6%zeOn*=7F= zC%ONk+ke_7lK_g`5}SL~L* z%C^D1K@@`c(cpt|kBH@PhYLXjVjOm}f6o-llDG__Yn(V{uw9lhhRW$=Moly>FTg9$ z`^qmkt4DOsDy+X|fe&7-^G z4RCA*ICfnt;@c)jvYT*ISk4fRpxDlQ5k+HcACfn~elR2`c5ryw-@%CX+%&FfLhCXK?lLNd?6nzlgrA}rh7$bF`7 z@YJ!9p}w;N7jXpxtX{C)tWB|h!Os2Jo>JZ|7RWk@N>(EbP7qthSAa+1yJ*2%+{5xp 
z+#r@NXoFP-TkiLyZ%b>{?aRmhVDP=cWc7hK3=Gunk!SIbYY)Mo%>2oP@l;;P;>(G= zwzXnDRos2AulE9Po*t;ReI4D{Thi!1f`mfwfLqvddt->qQlT1WT2=J#>8M9q@@fwT29D_2%WhX{mI zzxpB%7mR$)Wm~;#Zty!=67H7uEVHZWk=dF@1dzwOo@>qV{+D$#qWlfJ&E&hSUEIB# zzC!cQiu>}+f0Ac~Qw&h4an}mF$0(gc%waY(H=HOELzQst5gq4{oo_aK!^W?Wg<0I9 zpDRIJ!VDQ;NcM&x)BC&|*bFPnlIa@Fmv(YV%x3bq=}jgB47p$U8B%zZ6lToR4#cBw*n4#_N{cG@hK5em_HF_sbPGWp<2QRc?|KQm0qGU~!j&Fs2sKfBY~Di#vZ zENF>aX2H(J%u6Uwtmh&G;PzPtYnlMgXi)QH<#OFhk?AVRMYq;Md`=)0bzHa&JPDI> zHm^MLyv>ILB$K)Qt=po6Y<{1H0dXd8ai3?qh#&Vk&y@_AZk=1p-;;9Xf6~^S^c;P! zdFiQpPu*|+m(MNaFFd>W)EmPKgKyLd^W`Y^7A|{zaJ(BvH&Pd#~^R!^c`{hm|k~_j+&hi zcVw5IZLX33PpE|AKR`ahO%%%=?-9*8A?+wRNSq6Y#vpH6xDZY9@{>=P8t`uLtq=?;zd*#OvKbyn_r|5A6V34;wIv6OQbl?oVt;@#GFt^r6&8 zcTnp74UmuR0P=t##c|=p4pN*HP6?-lGj|6cJI2l$V4oI-^vjR ztaD?Abeu50gPg7a8#y6#cjB?wcy$BpNnvU{OPJn4nPxWh`|J+N^wNeD;to=T4JjhR z+zwJ)6Rt-}Rg9h}S6%wAz}>x}w+Y0=ZVp9lKHp^Ea&R&#Ub+I_208f_+>tvLy3x8t zEF4m`Y=h#bJ4WdWSxtRIzVKDv@8Jg2W}i<{JZgp-1|-tK7=i8(yK|Q|hAbaH)P5|? zl#@*ejs+n4m9|_7PR>SiwA8TSi`y>)q38iB9mS~yvbacCd<9=o2jvqE`d~FG+C!qp*q{Pt-}zI>0*Wl~=RZ8tlBhzD%m%{LCrnR9xgD9^WXw2IsJ|MU^aVs*iOS6Us*R8RJ4DDK zC(GJ`6(|gzLWI8-3ZNR-LbxP)3bG<#0*E2f36t3ZDmJJvp;B>^pFx+2&%ZvH$?4t<#Jd#kw)y=Y#Dn9}1)@8w4!aE~-M_3g-cM zqs7prnyISbPmIbJ>@pNZ+YlB)!3?rbbh#eFjP;S3AKLxFGCPnu5@Ly{E}#WzBREP3 z1HOo&67iT)JbMK@ImS`(`IJig*hRf+qfp@^;tcLkCiIqrGG0I}>&LN6ZX;AR4FGF- z1qzV(I*K-Zlc{6LhIpZk1+%)ibs1K`XK+<+aCD9;PuF>qzJStFBri&3V7FgG4xBX^ zxH@Qn%b{s@A)q`LC|1PP9%Ln+3fv^TldpCLjA?xWk+E=N;uTbepF%c?YewyBxI^3mb!4UCvfdvGR`2nHWKuy~TOh|+# z4#THhpDZ1SOKZjGQ8}52!GinCmE=r#kb+qWX(E*82Li23WwPKQ#qq22fr_kXGT-H3 zV9FN`2E@@TWVxJ-$U*Bd2!*?BG#uswnw(Tabpevx9g-PnD1qwjgWRJc$A~Hu22zJL zp;01pBJQV#S#IbTsbyB0icQ^f(;FvlAUJMkM+>*$GHtXul@5(grK3hy%;bR|K0Tx5 zt|;=cOabLkRJ0msH-q2578fW`FvO~e6nC=SY|4g>v%2uYsl&+ zib;{Dj9z{kEB7U=KUetkilQM~&oMj7<}x+?;mH8hH?OD!z%zq`XJKppLjTE;^B2w? 
zKQ%aTVdU7U3xvG#aHJv)p5^Z6J6 z0n0hQh3NFNSFo7Ck7jilt5`yI)_`vs{Xl}z$vK4#Q3fgvVWxV3DHA%!D%B9iVjKp= z0lH-dlQe9^!p+W!xS?K|PUQ9n4vNufwU_Y06zy+P4x9Rm)lBI$|5hSj?M^m38Vjb# z+5)E)*#0QUG9`EFayn;>{=QlSs*CFTo0meRCAB7cne8iLzL$wO^eZSbLZnCRrP;9W zWo8+ct^JVcMKo1HSct?Jr=FT~DgdG$6S2&F_rbJlbnYU^THVwuKQ?Y0OM7w2g=QvJrD4Kzv$QRcdsYCo5}pl0v3e>Lk`K?uL;h$X2DNsRFJT!J z&BB2~%q{&hQ&Rzu4_Mz(H-YK@)0&{#s#Q+oK&Q~W5<2+T3JW;Fw}MMbu`N87pEumLs!7mW!f()bP_8gOWZ*z zJ17~$YDKS)PS{zG$@>f$9*ZzLGc!qTFNd<`l1pW1wQxG?z=7x=ZP#8}qoDFM3^%tQ zno4WHU3-0zD;@maF7`ur<@At`-mpPTF(B>ux?Oua_I7pj>_xS=tgdwj4%GQ+hOW!h zxpizTuFV}(9oW~?C67qVSeHS0>(Q?S zW_*DVaZmi?pq}S0`$IFG6eZk=l|91tEZAZEL!t$~rx668DwO*ALIM_TVzflF+1%m# z1)b!HJmN-c)h?!)QjSHnQJX3xXnYL@nwT3h!Ql-0(mVABvB8^iKIttsG;l|f$pI|2#Q9aPG* z^z(NC5_A-dv!Is)*b)V!Z(Y(q{tm!r;DRZ$8jWL+b3qs;5pZ0~)`*snADd`HaSnqF z=xX&7p*hAn7Pr1*?wSrvMs7wg9ap_D!!WPEg91kjh)xV*)fx?IIbA%j`21aFgtZF` zPY^djf!xuudisLq_jj3@$FN~%+kJ>U%|&Lw`TH%53uv)|;Q-bV2BjF7gvuBw_?fBb zuP$L|X*+qgjNt0DHolJ1*2!#3n$H=nLhJ=SF1E91U;SI$z)E!ZkC5#msJuL>uQEU7VPX}uI zj2}S=$7Ux%=8gCmZ8k;C!PZh)q77Bo$WCx3GcDm%KSK(Y9AnicW(y`0a=)nXU|z&_ zSLX2fdD^;@`V^I54l)eUTYVtkFBAU|)7;?9g+79e5QW50Se00W7~LgrT~x}8RYvsl zv?3V!m~3y)heyTGYy<{^W{4`rHpB=WyL()0CCp3_Gl)&q`c#U+f*EwyH#h4e%!vfY zZ;FF7LzB(m=$}%1DdCzK7KOIDt_lOPMkFE(aL6PPHHSKf6GVCQD6{M)d9v1m$XLE$ zFM>W;lopjhgLvUh@OkwzfxF{_IvZP`bbpYfPnSp$H=mso_vIcI5-8S~&>U0j5jFQiByM?ZhNZwyvnD zupJ>`p@5I>i-_LtlQ)29Qn`ENDV^<1Su~!a5%W)3|0x@F?~H;PhlqdR2GIh+UcUL| zI+(w}1&UZACxMzHSHWc%57+g+Tqp1UN5D0kzKR^{ddI-!3C3d07&FAw)Q>w_-J%PF zBkjZ)1c{3>em;{!m9>SL?y?0!gJJO)jU-a5OBx_>$w?|6v|LjuHyjs#M_Me3Boq+`6isebIEX?y5^E{w=Li37&s!h2n z4WOVvsTHpP*Eyx!`H(+!A2%s+?8vUb5T`<|rOkatjpIa|N<$ph68S|bSM4?#6%i2E zQ>1u?R)W?G1Rd34#7o=n4YIQLZ7F-OrEd<|mQVn)* zQ-NYcOF^nKE8BfQiLx0IG&d_!uI`M*Bh@tur8}#(79J{XtbOuRuGVep?nQFK5xVe+ z08oD#JE4Sue_|W(WlHKk(>BQ|LdrF;P4YsuhqK(xO8IQa-(zS-w2z9$C)Ez#jK>-h znBCs(npr96sqG3{t>$BKt4)=Iq7)j6qD3*g_ivMp7xMx1 zqr!K~-!6Zra6E-RH;!+?j|LQ z(otHJ%Ew1RBg+|WgknUAa@JMFY0uvEEQ|Y`Nogj>rU8iZ$e5GmZdbaCifo)cN~yc# 
zTJ72<3X05{F28K?aNN^ML;0wh<7NcWax}RyATrOn8m@jtoO+fg=7jA_4iw9BpEf~r z*Ufp7mE>fxr8(rk&>w3+y zAq+G4x-c~WOM)W-90`rKnwaQs7^@Sf+)!+xQ1M;(6rdF0VuWD5$}M>P;O&ENMVA_& z7?I3viCbE>mYT!{zk|O=dmz6po76MPgsWwOttYA*dooFBLy%Dsb5&!}uOT&A)?iPj z)|kkk2lP0I2q)IZrstJEf(vK_?Ckk~yT#*$^L{lVa5Xg$Hk?q4X+TzizP z1@xB@qJ0BIl33sLKyZ%BbdLwqAJYVn^h7b3ZOkNw#9BUsc+xa1du4=gfq$hAS9L*W zft#pSzx3Z%S~B~;Lt@^%9#3`AT^!I}$O!8lE@qqFr{AuHSPg1A{TIGX`j_fbMT28@ zAxE-*MegQUwj2ZHutC!&2k_84n;-ET(AnE23_44vp|jx-9FpT2F0+;#6YZPRL&VV` z!88DABPbS7TwGYBhdcV&_0oqhusyU~oxbPv~E zmnwK1qj5>r9ydK3#-)1$u4afWUx$pXoe9_)$ZE;*5DDBZLew*jWC@ zD9y30so_hztW6JmPj2$UCwzfLn;-ZR+hfxM-=R%j_=NBIt-gZyN4ANJTm4lmgXU5G z>jk4`rmch-pJ~?R#|m&qido8+3ZQS~v~V$RjQqS}6>|CNel=~F*$Q=R5sD-)beV9w zo8%RW2ycawSmCqK}pKt~@QIts@+#^sf<>6f+W#e5b+3>ECvhc2!%y`#GPP}=^g?BCTd`^X+ z|01LgAsN<6gw!MC1wF3nLP!HbUR1yJUxYLw+z{(#h(bs=TyTuN79po{Mv=ZdfhFnO+Ytw^}cFB!*2g0vx z3dAQud5GTjZL;@SjR!rJr*d(^&&l}qKgJ#sIN0OX%xG$FM7D?|6OP!rjr6)wYA#howInsrLsWEisutye-b;4eZul+X;=`3m8727jdLQc`8WT!eNjp&76j zp}Qy)vtk)$#Zt_QwC3K}+Nrll}QzcIee8ez0^FZl)hi{ zLRX)gh-QERi>(vVR)5^;^ADh_-M%$MIHdNHaF{5aSS282Xt`JNgd>?lh&DvOJQ1ss zOjreA84mF>j21>4XJ}pAQAk!?pG2w*p$O?iNQQPt$Wer3Xmf=0OI0joC3~0SeeB2X zSoL%T%Zngpz^-l5IdIP{H}TD_ZO5?m|84F1g4#&Z`wU1(APN0J0!c_hfFzIs0p^d* z+FlzQY-0?&Ykb+*Ia~Gtw+3V540zYZ%N5?bIzw(#2H&PEs&Z$MO9Gd= zIS*GgBaw_&wzH(pRe2p7@42=r5BYsP8iY5_rBW4&>i)j&)=bZIPfz!Jzy5u1NN45e z-cY`On%17|Z~TXvl#g+ixxA)HBeq}mYO^f79Y4|1$A6?r`FMOGFKg19kNR;yJLYU) z`p4U4=}O>39V=R#W+`IW) zO&U3&^+;Upl9oDtPm}U7o{lQ*DtP8tBJWxPzx^|0pqPix8T#7-v@pGH*AgXCl)dU8<3&?XohPX#sStUZmvk;xr+lY2{$tBr_=6 zM8J-qnh)EUvGRtxX@LCMY~NN=Ej=c3-4o3y8g zP>wKLF@tg4HY|pDb0NefgE{OdMv(bB!jn^5626|G@RzC9#j4S@>B^PUaD@(L2y9uo z`T@w>VY0q>^>Wlej)6dvt45czdRWe&fwpuT%#pQkgC^jwh#@4hHVxdco#M!&5VW+J z-XSu1k?Abo2N~Xbk{~d7WFBoyVY`psh=38+s0}XA)xb;A_ouhvYm8Oq|B>i5%Qp&h zIL-f^>vhHwJ`=tY9%%|64-bTg;I;^My{sX(g9o-W+U=D!=eq*TaNy*UV&Gz*a5L z3}5J{;1mV4JXQ&FXhEw|<^GE5{yV5yy9$-aQNa>aLih<4TcHv%mB1-gtFS;H!LWx7 z+rdEe0Muo+NPQ}WODGJY6Bul-3FY+bzn}zj?s;NNAHv_!0ck3rHZ*v!GWJ*I2TCBXtHSF9 
z!&!%l!XrmuDr$6eY6rpaGx|jl1-&>C*es?&6$L}O@Htbp&XTndFgLDYyt5{zs+i!c zMTK8u#37t_)~d#g(1PwUxfY>}H~_^0?SaS|$*mN$Dd9s>b>ov`aCt-zC^iv}PE2I= z9|=SbonX1xgmR6MTtqRjB1K>gnxQ&>Nx?d$u_pXiQdv6|Q^UnsyBTs@--NG`n>Akk zP_?zIE&pBm#b*?-CfgwCCY_c4Fl*hmNVBnJD(lUm0e0NV?S`Nh68?#5vIr?_+lFOm zcg2*u3C#3el1D3N^}y>+jASjZ0h-~_>*5WfXx1JnFeU^4PBk+C<3M=psUS`i{9pc5 zww#|)y(lOrGsLekdu9Hbh(s~;!~Q3_k~{sk`xkZhiZFf&BoS5s)Sj+CtEX>Ru&)8imx9;4DOgOrFe&xIvK8UM-``+7Moc{IMe>@Aq zRhLJ0)l07Wxk1o!-kH5UD;k1Gynk-#+~PaQp4EYsf#gBa(Kc@YnezQBOIH@-3&v;8 znq>QG*Gkvg$gjqJF_w1joi7IYW}+{#FFCzByE2=c6dj%TiOZKbljs-A>gSEP+19eT zW5mIA>yuj$JV!mUhY)q}T|-%1XzwWceZ7LA$aJrDU`+>)CQ zNzI4S&4(8a31iZ}Y?9qk$sJ9*8&bB$XXPO7{DXA)`=afA*bQC5m!+INKs2x*Nc9F} zZ;Rw@0Us!FKEn=9?d_HfZd@h!O<^aufV-;@Oh_Dfabc$}wx}ewqQ4!k2{O2)Cl^j` z_&OzX=cYF>ulv*r3tf42(wV%OcD2q8KJhlLolbjq15mID!`;Nf#N07lso9;_BbjQF zm)7f~NEeag+GV3VWpqEYxMWLEvINP1UP|1~JE;rNXS@M2A!j`Pol#Y-3@YKOX0)Ol z-Y13kWkU6Gs7nfUWg@%e$N?#G02GRz(pNUl>RcH8@dt|C=qZ1xH&>QC*Fjp+E%J-4 zu$0>D^RtSJXm9>_p+_k|7D#Z4-Os_`Tk<@BFfsk$Sx+6JVm0d%g6?R~IsQjZ2p zC-rDg(khvnGfg0*O$~6(C zl96+Gm12@gNHURR=9~e=LQ*ML=2NUB**JGZu|vZ3I?C_+@A;K7(wtnmS9V2Gu886y zy_`e$J9zJ)QbC$KpX(vr%i+mzhEmRuQc1dx!-bjenH4{20WQ2p4($ht(8HmPP`6S= zi9xQlLvA@HwH%XM2BemON5dN}r_%l*spZsK>z|J%yMKB~Y&n(k4=L4Dyhb~JEwtw$ z&flVhDLul05ZB$9ayKeqnzFy!U-ezMlkSd%a|@ zU#m*nyU?265J=3yWnPh~ty|r{vLBjN(gidAY9eNaRxIhfnaUs}g9LWxKm<~Wq^r!- z#2}RtxB(}wZk4KAp<5(f01u1Nd}sEPSsDxmJ{$O-2ZOeK54%6#C%5%TZ9VC>ql>1* z@#K5U{j#q~@-?M>%_&<827`>VA3c`H_`qyT140cN8r{wxll-xNaIIhedtb_bU|xsE zJZ1EJ(E;9Oy^YljHK`7-T34*7nOV~L*~8W)g}dhZMO(F`tHA)7e_8B`$-53pyAIBq zL}x@Y)@2%+<%Vvlp_>xwBx8Lh+N2Knln|DT5gfm)bMcQvM~4_{M>yBNNpBo~%H-eD z8OlrMdoljlTnUq8t<5;Al6o9CV|9w|`joXk;|wLgBRLzC3Y0`=q;aewWvEEB%au(S zEHm!lgVN>F zDLy5hpBAS-QaJ7=f3g744Ss;4fr3Gb-YNJlMS}(BC^}bgo}zfc7)9p`#wi*vn54*a ztze;VlbzR|Y?o`>rP_9}rUNcyx=zCF%cJ)m%|4nG&*8*l;`woLd{T5@o9|vprFZ)m`V+bb)@AE@=wHlxp18da49kWG=4Es8^lE%1zCMcq;#&#yk@Q;{wZ z@ZYyASrQ`~w%R9nTl6jTEnZIU*)TV}vfy`k85Kv@oNKop_H7uC5(mAm{F|*&9fsR) 
zUSQ50IL%`)a2I{^>I9tzowl^(Kc@yT2l`~deZkHD#%qngUG$sik%LgAddCG_sq|KN zBb0Q6zfi18*Y&nS`O?#CK=g0HZo>t0;a7FFP&Uo_3ogT^oiYG4Cu7Or;9yjTKv=@8 zDf&*<5RZ>c4#(rd7)_<025S2o?vc{iE5L?xJa z6X7#D3Vi~DXN$>+31T0`Cdb&t?+pv|@kh{6mWh@W!|;d!sIx#w4Al*aZWPNpf%qn~ zMPSju+E<{@kpd{uIH7|orAL8lH`&q%D?;o@jWK}A*d%k~f+VcVr|9DX_LB%EI*|Lr ziHR$D=buxt+-hb@Alfj70~KfylZ}46AYHk(X2MjLedmcYKc$2ZRo6iHQp^eT`7>Mn zrk^voEc~^N4qwI|&oV0Kc~d#MeN&8nk5TRA4z}}R;;cw)dlAgmXW24#Fg5FgVYLif zIOQ%|&AB={4fj=-bJ?`|6IQ7zI?5&%$Ezc8`h0@5$)&O3OTyn$2|oo<3QkbaLjhZY zl4(#lM!|6kW+|W#gQ~Zk25SB^)U57%*!iX|;Mn?*C@r(agIG1OomFhvlP&Gep>N-1 zlz0LGU>T~2&FR_7SoOA4M~5`-SJC!w3%^4-{Bu(`(Vsv^&-2fVN_pK@kmGHSxzfj+ z{W0fy%$Xi@#g91)`-hBw>sMU+Q?B|c=YPt%zT&EWONAXwZTSywNaBWG73lebEe^rg zW{y9^f5rXbQ*Pf=F7}jb+YCoV4ilZ)2eIXt?AsYoMOI8 z;Sl6u=2k2um2yR^mLFwuzl$Ii+Wvs8HPsATmqG1aV9h#i&A>0InhG_AQrZwx*$)}5lWL(+F{Sy}1h zg^hd}2pvTSTtyq=lR}3zAj%^(l)n$t|t{WHro{g z6OEh=XjCSaU}BWGzTBXgm}UcFzL+Q{>cGex`uWWh=y37ch@O_P9g1LbUBqLH&Ylwu~9kZ59}nX?1l$HY>q zoRx_-%sJJ&Z)ds#KUnNp7+1=e>ZB%gF|nMQqk@TUuB<}wFwx67TuLPqeSi@vekKOE znn-eZ#iLX)HK^TzYNpm;UaQnHG4$Ha2(z?^){b>dt>?-r6KzXP$&loTC{dQS3-j1S zXlb9)z|=+@aB+Icq%<)#_O0I1%u-vl-qOm{-P&2&nA*+-s<+s^eR0EF##g;)wBR`i z2Kk0ei9;^&NhLm|khHpnwIl0gYsXi9pz4QvS-hz)-qIsqIm-9y`J#kd_OwWzmMxAn z<=qmVkIvt~%UrVJF@%yFS!-J_T-&>Hb&FKRI9kY8B=%xvC4rltBW*pjKK1a(`i;+H KFPYAUrvC!U@fa=u literal 0 HcmV?d00001 diff --git a/benchmarks/__pycache__/throughput.cpython-312.pyc b/benchmarks/__pycache__/throughput.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..930214f644e4f95d23c6f43ad450a09126c37ee5 GIT binary patch literal 30000 zcmdUY32+?Od1m+AN6&%5eGnKN1TZ8pBzOS4Ns!aXipc zwrd3{EE8O5O=vAgkmHKMl~R(El%u4wwRS48ojBQ@2B6Gn1h44|YZGtPR)HdISx#zq zzyEdj9N<8rU8$|wCh_|9JO1~-|6Tw8zyH1YvBP2G@PrJ%GxF6jj{67t(4I2YbAL?7 zakn^;8{|Y@)P?v#p1pO0I`-BN>e<^cXkc&Sppm^zgC_Pi51R4Thb&?1pp{o(*amGZ z&OT_z+Yl1M!l1z7j3GzZIp_?#23=wIpqqu6LSQR)(tvtHRZT z)!~}KnsDu4ZMbf*E?hrYA8r_IV0p};RpG|L#&FYM6Z^J=yy51-=5Whk3;VW)TElIF 
zZQ<2}tJ$|L)E@2_>|lO-s59&v^fA8>S`%J7xR&`Hp{{WEU^mZMIk8=I2K~X>S`bNe z1=or0V2fB5TrYa?S03yMZa}P$#p)7fjytC#|2aXd_=<6G<8P8bm$MQ%H(9Cdj)%%_ zW@T69%HMF#DOMx>=4B<-1bfBWpqpV_2dwK0a=66?qJr3l9J^O41M#a7zvqh1*O}

9ZC6K=&`JWLeE4m1|uVhpp>mXDFx3+0RR_sfqMh7pwFJQ z_a8oXVjvL5S3K+5J1)iI2Tz{S0?PMABbS0wED#@wMzp|+!;x5A8Xu1DL$Q(;y6V_? zC_VyA0-;iwRsn@`BhQ}-js@c*1npEHB1XenZpX6E(}D0>(V1%PNQYmt@sVWY}2HG41f{@pu1g29A5F z4AUG>MV) zuUL}Y0QZ&}rlpOUk8q+v3;hy*o|`b^kEMO~9gdr@ipHdMR8X@Oye@H4ZQP{=lujcW zY07!qq?Hz_jB8UmS31YrW-aZ64JkYt{QO&58?V&9N~efdY4LAi*`Oz;JHma5FP%DR z6D>QCCTSieWS2cjn}!e1B`u{=pmj`1UbNyLVO+Ni06vojfqUEjmPv`p!@K?+1D3Q2 zdr~*r@Nh1vmfGuvby#U^B$)6(!9;ybj^{5xq*T9S6r_O|0K^jvySvI5Pgd>6WTXpe}Q zEAmH4`-{{+omyVBVJVuF=$$G}%%B;@6E$q9mYHx9yl2Z*+LmfN_r*?*llbf9)PoG% zBjWz3R9#z0A;q6G$Gf%AB%gGHzOJ`ue!!#kXY|E?MLH*4NoN8~SkiG$cN%?-$KQnY za)HlRc3h&NbX%9zAH@phWGlc(9QQT;7SD6n_ZYY<+#l#K^SAlq zw~bj_C=?!2Chn{drUiNk>(RN9^I1nY@cfVx5(-ALHq7Q&O@_!{9(;Z*I2;d(Ls7P9 zP~dHSR_9-zHLEK^Ru8~2YAvt#>aY9P#lvIk#zw}v<5(Sn@$Tox`?hYm*cV(E8acZT ztKNlZWZfm?SeIM*V(SXnim@wM>yGom2+EaqCF-T|$PlIW?+8VQ1EJV1e=d>{d;rD% z>Yun@UFDWIgUfW!#MO8w4=x!wt8l}4&6&2e%9hr&rAM~(WQ6jx&?pOyGu7$l9Wwq4 zJAR?(t-F3!-So+QBJ!8}4gF^R*JCF`L2w`5U?es!5ziBddy6RK9f^6*OThpZM~aQW z33_AWVK7YIJng*TC%h%d=fy(r4TM5oZAJBG-D)njW)6*rvD>_~1qj{dvnC}f>kbn) zL#-4D4T)fn{t@wbmUf@HZIWn)%-Z9@@EEPvz&dM;N5_W7vbK@)k*E|L3P!;9d^$ zBQbP&L_|yO1ujxQoZFnQI%}ZTlW3-u+8L6B`*0`_iv?p@Qy?ae49By2kVD!|X==!+ zCWqE1pdT9nZx;aXnKiJaroh-(Fd|BRiUuFTxDCQyqOJfgbR-9DaX%$|KxKb1OyyVatQ&1tZ(m=7SD@H6L zCWxUi#!E>(l#ptbbrg~pDT%%XrDe^^;EH994Arb2HO$)bBTm`?C^2FTyvv_G5{(0} zXX-$azJO0!iedrGFc=pfJ2|88XEujyTmPy(?QW6XEi>Ea-Rtmub>E`hc|SL9?wc`! 
z9&iT1w8WW>rX?G1-NWCvla~d0xb+*Z9?W$4ub%j@ynfQWWa0qpsc(Pb%`eEVjSuvk zqjt9A9sg~AYU^Y2nxpRv$L^NZ&-8tJ*PFW*$~G*SIjd{Q#$goNTj%Vpi{-1P&!-!E z<;LDWH!PIzObI(vLmmQ{R-SZ%Nnp%JsdOrk+%N&jW|q;rxZ(YPBs@a}^CsYq)hAuO7>oJ!x~} zoVoFXo%`o&4*coU?{-XAr)#_CYrEfjdeV`qIdG?{{_2T4PS4eYAK2>_-4$tfv+Qo3 zcehR&Kd@J31Xo&EB@3$-grbW)F3c!XL+|eV(WWImSKf4=)0NvM&6t9ODweoG z78(|W#zm(m?QD{rP4mv?Yx+B`hLo!y=&a z)=ZjHo^5w3S4{_QK9jE8Bv)>lw0&6JGEgTwj2Xtvt!xjzNtNqxrdoT|LGecGI|H>C_spL}$f z=V}k~pM2E9m2Zy`AM%&B^25C02NmUqO@`8J{{O^k`I2J=HsAUiviXhzHsA4EvibWN z`(qw;0R3eT7)~&r%bz?83zoPmnf*W5R;LQAF)H!1}$= zQeZ`kbQfe%V0U>jO?$O@wbKVRNGu)ljJ;3&M7!q10f)tffs;2dm)ydq?}KY!xr9W$Tvt1IATs&L1|nUEuiY^ z>Cjr4Ra!cWWJ&zRcuZTggw}k@5_gCv{1~+XhY6+r)jD7r8G&rudrfmj2%v zjijBil+pmbTwSmjRAVtPeu~b=q)lv0=VOJ=$A&*UlZXEX3xXh;lfonDeM+Vg%}noO zc?*4po+7*ZQy}jFRfK4Tv*a-&`(+m zB2}-J=F8ANv3{jj`W4T`X?kK$Jg4tTT9dYHa$R4IQ)@7zgxBlqa1fDN(MK0>OO~7=pD*dbE}EtK39IvSI@? z3@Yvh1$PCLiOTDaK5mM?{z@NreFT4wUec7*0 zNmb!K?&P7(m4W56;~Jlw)Pi`{L)d1Jw^^=4;W`KK%dLl18Xoc zj;#uky9S zy4YA&7m~8pTwK-y4aN{YV_Cfvj43P!YhbEf5zCSPP?-;Baf3^7)94-j=bHXMBf0HtOA`UvO|Bh}#^Q>bi7whg{t;DJ;4?km}V=0ja*+ z2C3d{gH#WZ*v2~sNUDdI1y*pIdMC}9^&2K#nGKsJ9Zb*!#ESOW>USD%H_GLG5beuW z&7OZJayydRc|!J|eBb@}-O8q!%in(C%@-Cbdm-98%9jKVJa?&rC9Ga-?OGIEQ|9TW+3tB^%P;l1Hk&Lo zP4>&KrYY03Zh9{WQQPo_@R~65U&i*K#I`c&8^pigtoEGM5o}N6M5vmlxR!01s?jg>QCEs(kaL<%5Q{k2&62cC&q|dFpgVa4%it(O{n}4f33${Z~s)#Qkde z(9fV=AmmzPPs_ZgEiJUkLR+EU08+TiGxfx^FMb@O{2x=R=bZ=i`7g0abeb>MAFAMg zmp{;BdP5K2HwQWo?$-a6jX$u-^jG%&T73MV&2V6g;Rl`kfnLiGe0uVC87aJnM}{A4 z;>q93Q}~uD_b$jBeE2tP8J$?q^D zJZoe^RMx6(14>yZsUOb<;@I?zVG}r8hArDz97@M<6#JdSP)HA(Q3LJ40;6mZE?}v> z#l=;&3Y!`wC+B!>axSSaH@?kl%lU*E+g>HK=c82OvPaaht&lQoR}o9Q>`Cf|au@?< z*|KPSYRm0}IWI{UY~^vpPJg@PL8HYSsXi|7T?sidPIHb+g-NbJc)h^A6MVT*6z#_C(DMrF&=l&Y7f z06TlKOvA1qR7iTXP%2F{U93a-{PjNrTdl)}t68*w;YVMlXOxzoWe-?Gb4gaR{L^}t zXi4ft8~$060ybRL{s@NA6e~xsr8Q{TldJ$Hf`%*G;w`}_7SJ_E^?VzW=%n>Mu%2Lh z9R;KZxU6ia9rU^_dAIw&OTfkgHmIz=*V8wwg zg_yJ(z@|x zR_s{_dDv5A&F4bVK)k0nYf^XFY-~mxqUvU?*pE@*WUWA7^%{o9#ehT;8TQnIkvzwf 
zHA;cV`JkdJ$#Ij6LCWeetx7K<595Hcf-=KmGR1_NHI0v9za{9aQn;aUN_L4H8VAy4 za)|N@OGJ!N)-ZN9ASs-Vk%3akp=@%nEkQPq7wk<$2ms^c7-r8B6c_gI*p7>~kEjF` ze@Fkg%02LK_NKJWFWdYX zi{sTNZasVR*;Mbr2YQ{|bafvVaf|(>#EXfvxj{BJ%$VoQ9d{Z#(v3ZGW6z|ObR;&% zjm~SGzq97*zPonUtHaZsuY?!uZC3{t1$TjFB~x8{b$_O^>gsXkEIK@|Zk&4lm7Qrv z^CX`stDSDXS)VRzmCITuO&LvpzgDhXJ88>(D(d_@7RL?YnlSa`f~8@xwqg1UvxD=s zTPB@Fe!FvW{CBoZJLl}}vps*TfA`QIIljls>-*={_TOo2pLuS!=gslgqUpvha^sfC z6Pd!Gez~!K^2A+t`PA{56Z7s(i=N7<$V_zJ(|b?Pm-XE#tNh5UcUNIQrmPmSLuKQ1 z@ZEv$wJlT}OjkTES3Evx#n#HL(9KY$s_9nbW@NsqJyTVeX>Q5X)Pq$af4y(X#Z|Z7 z=PcE>N&7tyHiPa6l?q2Uv;BJmDWPRSIGnKvH|*E!ukM>RzH(&2(uB4sE1xuGYU?JQ za9(yU9pgD?%Y)-QXLmiUgi0Bzq+#kR24-u7J?9wqL1V*(JOHiRG_T&j1E-Kvz`PEd zNt*Z1^YAfS!1GW%y{;1tFf-8&>qa2NK8v|!LWen;PqGxR=&sugWDcNv+o;Xg6MBf$ zJL0)gONoM`IbHw-s!x(D#c62ssAwYGzRro}Z|Eirc|}&zkkser=PM8?RRaaxWzJ_g zo-NmQj)yRFVm8ABBZ8eGgR59U5Cpb20AHGcgFRXy7z&KVu#>NBkt*{cGiQrs-E6`g zqA+!K(@U=-Th<)Lyf!k74Py*wY!Sm0Fv{lTgu}0zu=$ytnn|K9jI0j(%!aW*{6f|e z!)9n4xw3Y3*i$;Gf{-yX;L5r?kA_<_UfFB`k| zwO6ZaMnb z3PpXF`V8y44JPWl=r0IZ{vb5h^YU>b|2q08dM#I~HzE`emp=$C!TBPT#?Z0{p(S`; zQTMtDqoE|8FM>BLwRa}!85iA-Sog9*@Cp9KKne%MdP>F`DK<_WhdmR9DP5#tg_2CH zq*Zyf=!5|&OYqU6nNm*i(#MR0vQimSd}L__xHblNJCs1+>Xtv?wD?krOlD|k*Eses z@$nq@$*}sA+A^F36{p`{0VfB;$rAT!_<%BeGmw&_X zu{xh(!XSdD6&CmQF&H?xF()U z!Kl7QOP_yhbwLTFuUqy!T))UqR=|anRqR<2mt2Z}HC%43fD0)aKMj{J#&ao3w`T;V z=0j{b^&7-)(Zcd?#8#-W+w!5B53!XL z8>xxY-XnX&s?zw90$y8jVN9tXC8mGbqm5B;1;!huVLjsu?5eWgIAKbfuJYHdNn_Fk z?!W=VQ~d-&bud%@w+69#1fzirkVp3{1B*`N1>V} z>L*MSMy&faBrOSKJq5#fm>6M+wL>a^$vQZIpy)F(#y)FhrbaBauu8K-ZFk(K-TvG}j8q}%PL4cRlsdtaqP@r3Mys*WTh5@P{OwOl{{XxTK;UcDx#_KhPqK76tkAI zbP@o#WOWzLWt|!wV&iA&oPd!DUs*>kn0SwDnOYYqXXBr>#m3KR)f0%_QIbFuB$6bm zo`_tGL@!4)guIbJIJnKLY-hJhFb)SNO5~Ql1F!Ul^lewbkjZ3B`XdTv)Xg;&fyjE) zreboOLTc%|l-5ZOt%_Om*}yP3I5F!WIzzlw$ds)VgXaQZdKr~-!4hekBr7?zj%2G( zKXLNpiBo6x4-5_TpXoon|IECv zh-4~-ScfPPTR_johw~OvxD@QXK$x{X83>IB_e)Y#${N8qhG8D4%I;NQ2w}*l*(b*6 zsKk&shSLTZ4b0ju6oNG?X%&4Ig|)mym2r?wNi;Ha@iG%0911w?)2?-v;ZRKaPn4d& 
z#KhznkwX(b74B1jC2OvD=|jXHAx_f_)(w`Y&=Idn*ZAZbU#7bL)|H!Az*|v|H6}=g~=PZQXMO24hv(KDUgC6C zTShnxzSwF@xekMsu5JUcq9QXJ`(QO!wQkV>XIg$x+sIXfGyL@mTbduX#yo@4X65#KbEyUb>h^~fuYlf z2lx97Sv_`0rQfGmb1-sg1V%N|MSS_pil*-GDaL+26g>-tBNhz^$(N%N_QBcK$+C%$ z;-v`X;H4NXA%s+iYE`@38DUEY}`^W(M;D6h{{pDmoq* zVKw}-ijHdokf5K-*oBm`#yp-zAaPH@ew7$rlZmH3AQH%Q;d4W_p2)U z{idQVdy@QFXZc;!WhI&T#m+dTVNWUfSJa96yg?DP+$E)+BsY|U5~szKJvcstv$UG- zCx-oj((>^TRe(Jgpzr@Zeu89iTF{q0N&XTq)#Q(X!OW;+x@DYdNsMFrkT+LuzMxni z!b?b$q6KC9^4fecXLJN3EF0=yQ0vjEJUq>J)n0(NT^mGnD5+#=0^4=jP(({do|ab~ zMKR8N%!@Uj(Vnj2_Kf_G6)~WJnJ!JGcuo^gN{%00rPVtB)+B?x>|QD-oi)zO;VY$p z6irfWi5?=dBUiEH5$dNUS{FLPGDtBhT8#Q7QspJ|QW(T@a($^-s==q=$7H@(*QelD zUWi}5&#YLViXy43bZ@e;-b6HO&$o!?d$Ia6dQ4?;j{%fY{1x8kORTXGw*ld81o9L_e284oN_4Y}10Z(6MvF|rSfb;J#F-00Z%22xDtdKz zwG`gVBRCOvHs}?DkRpcxN%W2=2kU4n5$6NgnLNK&J)1|S`FS|qBx3^@Z2SuXNeR*c zD2mno&%x`FJ)sRptcEX!f|r6JrZL%NK)Cdu;b1FuWJED&dk>$9$|rCDkR5XaGOBRC z&ASU+6eoS366}8JLNKo?pyKE#13MlUjCaw&1Xe3me)qu`h3Cctk@!f0UCfbaf4tx;SGVOg@+(;l z0o)CeXk~^VxgkJdtot83d{{%z&%~)jLw=z2Jd`A{C8vJO!D7dAF+*s*P*F<~Hif{m zWteD!QUn55D1f6$7lKmas{_z5dFebSX_#^q$v4vF5e#}QpI6JrPA8!sA&#!mz@;gX z=y(k6Lw9)4yr$i(k~`tl<;CqHbm*AIS1uHNM!Lj$hcb!z8nR|}>dac8)QTVu`>o_t zbdzT2sRHr1B#l$+kP=M#5jk_@`~XhYI3|(g$@G-O4oN?zP{YOGl^D|!Dhfe6#aPJs zAvxbAhcuqqGAIs4Y71;S%yt4by-POx7D=13{@sc{Y4}dVyUtYY?s?~)lw}X(W=GkLqc0!LumhKK z?q=9@Dh48{)!SrY+g*zz;4=Iv`z_TIbsnr%eIeyLUrhjtWopEy^bVp|j% z(?YK-^rnUFvao$#*tzIjn|AJ#o%>+=0o&iZ&gQgptL)r*&!}^3yQen@M5T(FTRU#< znAtv8?uYu%;mnj*r_0;q^0t&`b!zp|l=~R8fevR1(IyTjTA5OAQShdPEwZp>(OH{z z_Q=j2Dq_Puz20d9sZ^M~bLGAgFb}5Ohg6up2Nte;)r=wS?UB7bsiqC74X0A8PS1PJ zq=YjlFGs6GbLO_$inMQ!?Aw#_9W7|`o?MH2IB3_{;ipvdcG+w;wOmZERoxg}-UvMfd8n~m>o`jc(n**4dVK7+DO&pXegEN4&_PEO0_ z#+kA?a|;Z5(iWd=@uj-=%vtt)Zy-JJ#N5CW4+PFtyQI@OHs048Tx460!RO%i^H1_P zp=@bYtawbAN-?Juse2%JD(ar^f)iuv9^$Z;JtCji4W8#j-8W&z1N9E^Zg`k_r&KOu zQ7#j5DQ1|YV*0dvwtPOEaIwQ&^0P{E6yn*)<;O$hLo${Zr6nL z75h*}5w@L=fUU29B5TC8kC3A)o>S+D-J<^ya;z)DYW*YR=qbvv;Sq9dWVGD0jFvft zp)!WZ6*p^QAylCJXsMCdTR;;|>?W=m;#3!OtnXy9sBp(1QK^ 
zKcTE7A~GRG`g8IznBRvl(RQ3TTHH88+^xzKl7N?Oj&1oJs+9ovOW1{uMI*7`u4nP!m>sJy z)l&Kws%imoNq$O{7uHuJg1x(&iQ*&#CfW|uy#XYNV_)BkknI&GH;O!R|;}DYf5wg3iD}uvesJC1)-*N-Atie?s5g9Z@76-|_MASzI=R)>e<4 zL3GPmv3p?!sv_!7IFM;P3~`q5j*a{i*yO~je0DXQngL!qy6sPtGlc=TdURk2iQOb5 zCR&aKo*xP0PNM?Bk*Z%H0#aFO?Wr{r8#SVmsV1vR{$3|gj2Z4ylOH*K;y7z}+%G^a z!$g8^MI<2g4Bc0Rvq8ZFiLNI|+dvmcur!(g1yulqGn9aVYha0UAgH&tvv*&MCX8#0=g$$83Z{@EG|yV=ucGSq2=#SN<1?VWlEoX z5Uae_CB2;^{-7UMC$U6OxP-<<(sMCGTUB88l1(nrh6Yj?f*`JNBE^{xBGUiP^2QpjVyMLhp1>*C@u2l@A>q5kCAl}2&57p#RBp(>$|)?J{l!Ji)8iy1bN&~LblNP z!ql|t`9nJJStqhp+@^O8Sd%WL!! zHK&*1%Ze6#U3e z%DWb7Jk89^pMZW-bQ4c4y1RU5Rbq?Db$m*dkl75PToi(x}Q&~@8 z^w4?SI6Nj*QasCtv;WFrf1UI(1y-wc&(l`PQm93e^k7Mu(*WI4Y@METM;U`HZK#$F z)i_SQ=x}G<8!|W$W95uB4>%+4JQFyhT}x2I57pLEIp?ah!zVj@v+eVao)-@= z^-Yo5hV~^fj=1~(5el4@taPrD?F^}Nmp!mM$%VvKWeiz@YeE$h#&aTZE-7~vciNY> z*(qf;fwD`$$d~b1BHn1e;G7RIOYG#ot;<<%^M$q#Nj?{@%bx(p#niAEfB0hR2|X?{ zrt>Nlx#DsjQ876CL&R3)0&^a4LS*B{o>JBw@l{$ZjNiCY7b<`HCBONWLGCsUR{o99 z_WYOTLujYw&%bf1PHh#fB_@nG-_ccY<;_Ys(g9QUjJ9aL{JRLpN5{tWmfHVR6F)=b zAAxtN7GNiaq!40DSrb^u)NM zru>)YL)sEO6rm+(k~=SMj8MHw)ml0Ev_B3D0p(e-2mQZkotpSo4HFSpca_3 z*AbYxCSX<%v(bXHAEJQKZz*;9B@;t)bYF-7tO_wX{kq#Yrg!Ot{j8n}Q?3%5@OSP_7E0=2d8gdtEJ&)W)OE@jxH2K`$0w;HAId+Aa78ha5ACVj9 z)yu^zif_8&*_~0)R*+c~yFRIi|E6gwQ60t)ew4V+>MuZ|SOZe9yN*&dPyY4O-(z?4 z_QYgl?$uBR8z;RdQ^7PmvV1`7g;IeQz7?D8d1w3W?dgub z@7I084!V5F+{V9L*TxrBG1oJwpXRCvCSs+YQM#fuVX2jDT3$(7E!d@Kx7AK_7Xgo6 z!T(Ja^dgJWX&TXKe#B2d_eS!yQyy-Vr~61TYJJ! 
z^`AO^`1ry4XV!m2!=`==Yuus-|C11&yKrXF4pBul%Xinq$mZlAh)+j#jq}x zs8h5pP+^b?ij?t6@@-ykVO&u*#Nm#CH$h@{@=^Ld^5yPA$nuwePGcCxfTj`LI816* z#a4z{&5((S^q1uPF*(GmN-vQ^^Cvs|h5PCwR}{mV2MEJ0E0;pi=oqek8;`K-+Z1yc zrn7Lc>tQsdMSj-8MF*-v?F$5!c=~LGa-~i2iMDL{@o4;Te&S*$8q_fAR`s3j-c6EYJ$e)~JXHc@vk(hGH ztMaP@S(n=1ayPaRfwSex)e%bDsJBOGfl8xjfjS)RV=eIw%EzB$M}`VEkyMKx11%F$ zGltZ6X`sAIuzM;?f*2V5iyU>#=S zt?7y$xdJD#<}3O#o42Mn56YVdbI12^za7m{>{7e?dTWF8<)aU(xbDpvcMX(Bw@f!p zi|)F#yHj>|&Q{F3{h8K|bn7;`bsMg1e`CjMJ093g-A-BXO&)!19n?=Y=Z%JI4bv5K z)>Ro-dD_(}yEOLxlE&CdV5ykikIJnc`-d!9}SPcK%l$_N!?6m#U-k&J+Q z?cFyHTsy$*nCLRTVo$3qwBC1FYHZg8C@$2}>gTL&i=CTQFvz?}x8Tv8b+A~%>9(fU zbW^X~)Jvx!lQ)w<*=K_}%cO8K3eAG6HdWs>d*+?b-~RmU@l@^hdFPIlWyjCVRnUPn zc~iKIr?Cm{ho0uiz4RjiXTR0GP`;k%gNygO{OPXUb6vY%)lcnBS9Hh~9SiQx?>&yI zI$)b5(Csn~@3%I}?T>%Y^TWEosGDzpe7^N@S-}4$SfiA;fMgBpa5zq`AHdbeck3Hv zS~=z$pPa9MGG(b*Y;2ZkKAvwrGT(TF;o_&%jRWxi%v?F$_D0uhU9)TF>NYQ092sFZ zh<+n^Ejby1fzv8zS@e3xLH>c>Xtm)=BBL#1t9msyz2%jOnWS93@!fr?sx9w1GWPoE zkZfNwyD?>3ms+Yc_cQtEct^um5kn@r2Fn2jgi&Q{su9Y@uPx zMVwHD)fC6pk*t+W31VZ|@l!5d{R4`oL(c5jAhWs5I{Q_9{YhrQrD)V?l4fV)nR-w$ z*tF~bb21(Z?vgel7XP_eALj8TJynQ*BcdvS;|L_4`@Vyv#IJEu5&psw@WBeeTZKR3Cw@7?m*LvOyozV8^V_l*eusC^&LSA29-=j2TvwLH)Bwxvt@r*!;= k>6&!ydbxJ}eU84C;)W5P-!l1p8fQmc%?~*GVjb)M0?KI{tpET3 literal 0 HcmV?d00001 diff --git a/benchmarks/datasets.py b/benchmarks/datasets.py new file mode 100644 index 0000000..5411ecb --- /dev/null +++ b/benchmarks/datasets.py @@ -0,0 +1,3222 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +This module defines a framework for sampling benchmark requests from various +datasets. Each dataset subclass of BenchmarkDataset must implement sample +generation. 
Supported dataset types include: + - ShareGPT + - Random (synthetic) + - Sonnet + - BurstGPT + - HuggingFace + - VisionArena +""" + +import argparse +import ast +import base64 +import io +import json +import logging +import math +import random +from abc import ABC, abstractmethod +from collections.abc import Callable, Iterator, Mapping +from contextlib import suppress +from copy import deepcopy +from dataclasses import dataclass +from functools import cache +from io import BytesIO +from tempfile import NamedTemporaryFile +from typing import Any, cast + +import numpy as np +from PIL import Image +from transformers import PreTrainedTokenizerBase +from typing_extensions import deprecated + +from vllm.lora.request import LoRARequest +from vllm.lora.utils import get_adapter_absolute_path +from vllm.multimodal import MultiModalDataDict +from vllm.multimodal.image import convert_image_mode +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils.import_utils import PlaceholderModule + +try: + from datasets import load_dataset +except ImportError: + datasets = PlaceholderModule("datasets") + load_dataset = datasets.placeholder_attr("load_dataset") + +try: + import pandas as pd +except ImportError: + pd = PlaceholderModule("pandas") + +try: + import librosa +except ImportError: + librosa = PlaceholderModule("librosa") + +try: + from vllm.utils.argparse_utils import FlexibleArgumentParser +except ImportError: + from argparse import ArgumentParser as FlexibleArgumentParser + +logger = logging.getLogger(__name__) + +# ----------------------------------------------------------------------------- +# Data Classes +# ----------------------------------------------------------------------------- + + +@dataclass +class SampleRequest: + """ + Represents a single inference request for benchmarking. 
def apply_multimodal_chat_transformation(
    self,
    prompt: str,
    mm_content: MultiModalDataDict | dict | list[dict] | None = None,
) -> list[dict]:
    """
    Wrap a prompt and optional multimodal content in a chat message list.

    Args:
        prompt: Text placed first in the message content as a
            ``{"text": ..., "type": "text"}`` entry.
        mm_content: ``None`` for text-only, a single content dict, or a
            list of content dicts appended after the text entry.

    Returns:
        A one-element list holding a ``{"role": "user", "content": [...]}``
        message.

    Raises:
        TypeError: If ``mm_content`` is neither ``None``, a dict nor a list.
    """
    text_entry = {"text": prompt, "type": "text"}

    if mm_content is None:
        parts = [text_entry]
    elif isinstance(mm_content, list):
        # Lists are flattened into the content, not nested.
        parts = [text_entry, *cast(list[dict[str, Any]], mm_content)]
    elif isinstance(mm_content, dict):
        parts = [text_entry, mm_content]
    else:
        raise TypeError(
            "Could not process multimodal content of type: "
            + f"{type(mm_content)}"
        )

    return [{"role": "user", "content": parts}]
def maybe_oversample_requests(
    self,
    requests: list[SampleRequest],
    num_requests: int,
    request_id_prefix: str = "",
    no_oversample: bool = False,
) -> None:
    """
    Oversample the list of requests in place if it is shorter than desired.

    Missing entries are deep copies of randomly chosen existing requests;
    each copy gets a fresh ``request_id`` of ``request_id_prefix`` plus its
    position in the extended list.

    Args:
        requests (List[SampleRequest]): The current list of sampled
            requests. Extended in place.
        num_requests (int): The target number of requests.
        request_id_prefix (str): The prefix applied to generated request
            identifiers.
        no_oversample (bool): If True, only log the current size and return
            without oversampling or checking for duplicate ids.

    Raises:
        ValueError: If oversampling is needed but ``requests`` is empty, or
            if the resulting list contains duplicate ``request_id`` values.
    """
    if no_oversample:
        logger.info("Skipping oversampling. Total samples: %d.", len(requests))
        return

    existing = len(requests)
    if existing < num_requests:
        if not requests:
            # random.choice([]) would fail with an opaque IndexError.
            raise ValueError(
                "Cannot oversample to reach the requested number of "
                "samples because the list of sampled requests is empty."
            )
        # Reseeds the module-level RNG so the duplicates are reproducible.
        random.seed(self.random_seed)
        additional = []
        for i in range(num_requests - existing):
            req = deepcopy(random.choice(requests))
            req.request_id = request_id_prefix + str(existing + i)
            additional.append(req)
        requests.extend(additional)
        logger.info("Oversampled requests to reach %d total samples.", num_requests)

    ids = [req.request_id for req in requests]
    if len(ids) != len(set(ids)):
        raise ValueError(
            "Duplicate request_id found in the sampled "
            "requests. Please ensure that each request_id "
            "is unique."
        )
def process_image(image: Any) -> Mapping[str, Any]:
    """
    Convert one image input into an ``image_url`` content dictionary.

    Supported inputs:

    1. A dict with a 'bytes' key: the bytes are opened as a PIL image and
       then handled like case 2.
    2. A PIL.Image.Image: converted to RGB, saved as an in-memory JPEG and
       returned as a base64 ``data:image/jpeg`` URL.
    3. A str: used unchanged if it starts with "http://", "https://" or
       "file://"; otherwise "file://" is prepended.

    Raises:
        ValueError: If the input is not one of the supported types.
    """
    # Raw bytes are decoded first so they share the JPEG/base64 path below.
    if isinstance(image, dict) and "bytes" in image:
        image = Image.open(BytesIO(image["bytes"]))

    if isinstance(image, Image.Image):
        rgb_image = convert_image_mode(image, "RGB")
        with io.BytesIO() as buffer:
            rgb_image.save(buffer, format="JPEG")
            encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
        return {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
        }

    if isinstance(image, str):
        if image.startswith(("http://", "https://", "file://")):
            image_url = image
        else:
            image_url = f"file://{image}"
        return {"type": "image_url", "image_url": {"url": image_url}}

    raise ValueError(
        f"Invalid image input {image}. Must be a PIL.Image.Image"
        " or str or dictionary with raw image bytes."
    )
def gen_prompt_decode_to_target_len(
    tokenizer: PreTrainedTokenizerBase,
    token_sequence: list[int],
    target_token_len: int,
    max_retry: int = 10,
    add_special_tokens: bool = False,
    rng: np.random.Generator | None = None,
) -> tuple[str, list[int], int]:
    """
    Ensure decoded-then-encoded prompt length matches the target token length.

    This function decodes an initial token sequence to text and re-encodes
    it, iteratively adjusting the token sequence length to match a target.
    This is necessary because some tokenizers do not guarantee a 1:1 mapping
    between consecutive tokens and the decoded-then-encoded sequence length.
    For example, for GPT2Tokenizer:
    [6880, 6881] -> ['Ġcalls', 'here'] ->
    [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']

    Args:
        tokenizer: Object providing ``decode``, ``encode`` and ``vocab_size``.
        token_sequence: Initial token ids. The caller's list is not mutated.
        target_token_len: Desired length of the re-encoded sequence.
        max_retry: Number of pad/truncate adjustments allowed.
        add_special_tokens: Forwarded to ``tokenizer.encode``.
        rng: Optional generator used to draw padding tokens; when ``None``
            the global ``np.random`` state is used.

    Returns:
        A tuple ``(prompt, token_sequence, token_mismatch)``: the final
        prompt string, its re-encoded token ids, and the signed difference
        between the final length and ``target_token_len``. The difference
        is 0 unless the retries ran out first.
    """
    retries_left = max_retry
    token_mismatch = 0
    while True:
        prompt = tokenizer.decode(token_sequence)
        token_sequence = tokenizer.encode(prompt, add_special_tokens=add_special_tokens)
        current_len = len(token_sequence)

        if retries_left <= 0:
            # Out of retries: report the deviation instead of adjusting.
            token_mismatch = current_len - target_token_len
            break

        if current_len == target_token_len:
            break

        if current_len < target_token_len:
            # Pad with random ids; the next pass re-checks the length.
            draw = rng.integers if rng is not None else np.random.randint
            extra_tokens = draw(
                0,
                tokenizer.vocab_size,
                size=target_token_len - current_len,
            ).tolist()
            token_sequence.extend(extra_tokens)
        else:
            token_sequence = token_sequence[:target_token_len]

        retries_left -= 1

    return prompt, token_sequence, token_mismatch
+ DEFAULT_PREFIX_LEN = 0 + DEFAULT_RANGE_RATIO = 0.0 + DEFAULT_INPUT_LEN = 1024 + DEFAULT_OUTPUT_LEN = 128 + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + # Use numpy's default_rng for deterministic sampling + # Do not use random.seed() or np.random.seed() elsewhere in this class. + # This ensures that the RNG is isolated from global RNG state. + self._rng = np.random.default_rng(self.random_seed) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + request_id_prefix: str = "", + no_oversample: bool = False, + prefix_len: int = DEFAULT_PREFIX_LEN, + range_ratio: float = DEFAULT_RANGE_RATIO, + input_len: int = DEFAULT_INPUT_LEN, + output_len: int = DEFAULT_OUTPUT_LEN, + batchsize: int = 1, + **kwargs, + ) -> list[SampleRequest]: + # validate total input tokens (prefix + sampled) is at least 1. + num_special = int(tokenizer.num_special_tokens_to_add()) + real_input_len = max(0, int(input_len) - num_special) + min_sampled_input = math.floor(real_input_len * (1.0 - float(range_ratio))) + min_total_input = int(prefix_len) + min_sampled_input + if min_total_input < 1: + raise ValueError( + "--random-input-len is too small: with tokenizer special " + f"tokens {num_special} and --random-range-ratio {range_ratio}, " + "the minimum possible total input tokens (prefix + sampled) is " + f"{min_total_input}. Increase --random-input-len and/or " + "--random-prefix-len, or decrease --random-range-ratio so that " + "prefix_len + floor(max(0, random_input_len - num_special)) " + "* (1 - range_ratio) >= 1." 
def get_prefix(
    self,
    allowed_tokens: np.ndarray,
    prefix_len: int,
) -> list[int]:
    """
    Draw the random token prefix for the dataset.

    Args:
        allowed_tokens: Array of token ids that may appear in the prefix.
        prefix_len: Number of prefix tokens to draw.

    Returns:
        ``prefix_len`` ids drawn with replacement from ``allowed_tokens``
        using ``self._rng``, or an empty list when ``prefix_len`` is not
        positive (the generator is left untouched in that case).
    """
    if prefix_len <= 0:
        return []
    picks = self._rng.integers(0, len(allowed_tokens), size=prefix_len)
    return allowed_tokens[picks].tolist()
+ output_low = max(output_low, 1) + output_high = max(output_high, 1) + + if input_low > input_high: + raise ValueError( + f"Invalid input sampling interval: low={input_low} > high={input_high}" + ) + if output_low > output_high: + raise ValueError( + "Invalid output sampling interval: " + f"low={output_low} > high={output_high}" + ) + + logger.info( + "Sampling input_len from [%s, %s] and output_len from [%s, %s]", + input_low, + input_high, + output_low, + output_high, + ) + + input_lens = self._rng.integers(input_low, input_high + 1, size=num_requests) + output_lens = self._rng.integers(output_low, output_high + 1, size=num_requests) + offsets = self._rng.integers(0, tokenizer.vocab_size, size=num_requests) + return input_lens, output_lens, offsets + + def generate_token_sequence( + self, + *, + tokenizer: PreTrainedTokenizerBase, + prefix_token_ids: list[int], + prefix_len: int, + vocab_size: int, + input_len: int, + offset: int, + index: int, + allowed_tokens: np.ndarray, + ) -> tuple[str, int, int]: + """ + Returns (prompt, total_input_len). + + NOTE: After decoding the prompt we have to encode and decode it again. + This is done because in some cases N consecutive tokens + give a string tokenized into != N number of tokens. + For example for GPT2Tokenizer: + [6880, 6881] -> ['Ġcalls', 'here'] -> + [1650, 939, 486] -> ['Ġcall', 'sh', 'ere'] + To avoid uncontrolled change of the prompt length, + the encoded sequence is truncated before being decoded again. 
class RandomDatasetForReranking(RandomDataset):
    """
    Random dataset specialized for the needs of scoring:
    - Batches of inputs
    - Inputs composed of pairs

    Every request's prompt list starts with one shared synthetic query,
    followed by a batch of synthetic documents.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        request_id_prefix: str = "",
        range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO,
        input_len: int = RandomDataset.DEFAULT_INPUT_LEN,
        batchsize: int = 1,
        is_reranker: bool = True,
        **kwargs,
    ) -> list[SampleRequest]:
        """
        Generate batched (query, documents) requests.

        Args:
            tokenizer: Tokenizer used to build and re-encode the prompts.
            num_requests: Number of documents to generate. When
                ``is_reranker`` is False one slot is reserved for the
                query, so ``num_requests - 1`` documents are generated.
            request_id_prefix: Prefix of each generated ``request_id``.
            range_ratio: Relative spread of the sampled lengths.
            input_len: Token budget. For rerankers it is split between the
                query, the separator token and the document.
            batchsize: Number of prompts per request. When ``is_reranker``
                is False one slot is reserved for the query.
            is_reranker: Whether a query/document pair shares ``input_len``
                and is joined by a separator token.

        Returns:
            One ``SampleRequest`` per batch, with ``expected_output_len`` 0.

        Raises:
            ValueError: If ``is_reranker`` is False and ``num_requests`` or
                ``batchsize`` is not greater than 1.
        """
        n_sep_tokens = int(is_reranker)

        query_len_param = (input_len // 2) - n_sep_tokens if is_reranker else input_len

        query_lens, _, query_offsets = self.get_sampling_params(
            1, range_ratio, query_len_param, 0, tokenizer
        )

        query_len = int(query_lens[0])

        if not is_reranker:
            # Explicit check instead of `assert`, which `python -O` strips.
            if num_requests <= 1 or batchsize <= 1:
                raise ValueError(
                    "num_requests and batchsize must both be greater than 1 "
                    "when is_reranker is False."
                )
            num_requests -= 1
            batchsize -= 1
            doc_len_param = input_len
        else:
            doc_len_param = input_len - query_len - n_sep_tokens

        doc_lens, _, doc_offsets = self.get_sampling_params(
            num_requests, range_ratio, doc_len_param, 0, tokenizer
        )
        vocab_size = tokenizer.vocab_size

        # generate_token_sequence requires `allowed_tokens` (keyword-only);
        # built the same way RandomDataset.sample builds it.
        prohibited_tokens = tokenizer.all_special_ids
        all_tokens = np.arange(vocab_size)
        allowed_tokens = np.array(list(set(all_tokens) - set(prohibited_tokens)))

        query_prompt, query_input_len, token_mismatch_total = (
            self.generate_token_sequence(
                tokenizer=tokenizer,
                prefix_token_ids=[],
                prefix_len=0,
                vocab_size=vocab_size,
                input_len=query_len,
                offset=int(query_offsets[0]),
                index=0,
                allowed_tokens=allowed_tokens,
            )
        )

        requests = []
        for i in range(num_requests):
            prompt, total_input_len, token_mismatch = self.generate_token_sequence(
                tokenizer=tokenizer,
                prefix_token_ids=[],
                prefix_len=0,
                vocab_size=vocab_size,
                input_len=int(doc_lens[i]),
                offset=int(doc_offsets[i]),
                index=i + 1,
                allowed_tokens=allowed_tokens,
            )
            token_mismatch_total += token_mismatch
            requests.append((prompt, total_input_len))

        batch_requests = []
        # Create batched requests
        for i in range(0, num_requests, batchsize):
            batch = requests[i : i + batchsize]
            # Rerankers count query + separator once per document;
            # otherwise the query is counted once per batch.
            query_contrib = (
                (query_input_len + n_sep_tokens) * len(batch)
                if is_reranker
                else query_input_len
            )
            batch_requests.append(
                SampleRequest(
                    prompt=[query_prompt] + [req[0] for req in batch],
                    prompt_len=query_contrib + sum(req[1] for req in batch),
                    expected_output_len=0,
                    request_id=request_id_prefix + str(i // batchsize),
                )
            )

        if token_mismatch_total != 0:
            logger.warning(
                "Across all generated prompts, there were %d %s tokens "
                "than expected after decoding and re-encoding. This is "
                "expected due to the imperfect nature of the sampling "
                "procedure.",
                abs(token_mismatch_total),
                "more" if token_mismatch_total > 0 else "fewer",
            )

        return batch_requests
+ - Video: supported via synthetic RGB data. + - Audio: not yet supported. + + Sampling overview: + 1) Number of items per request is sampled uniformly from the integer range + [floor(n·(1−r)), ceil(n·(1+r))], where n is the base count and r is + `num_mm_items_range_ratio` in [0, 1]. r=0 keeps it fixed; r=1 allows 0. + The maximum is further clamped to the sum of per-modality limits. + 2) Each item’s modality and shape is sampled from `bucket_config`, a dict + mapping (height, width, num_frames) → probability. We treat + `num_frames`=1 as image and `num_frames` > 1 as video. + Entries with zero probability are removed and the rest are renormalized + to sum to 1. + 3) Per-modality hard caps are enforced via `limit_mm_per_prompt`. + When a modality reaches its cap, all of its buckets are excluded and the + remaining probabilities are renormalized. + + Example bucket configuration: + {(256, 256, 1): 0.5, (720, 1280, 1): 0.4, (720, 1280, 16): 0.1} + - Two image buckets (`num_frames`=1) and one video bucket + (`num_frames`=16). + OBS.: Only image sampling is supported for now. + """ + + IS_MULTIMODAL = True + DEFAULT_LIMIT_MM_PER_PROMPT = {"image": 255, "video": 1} + + DEFAULT_BASE_ITEMS_PER_REQUEST = 1 + DEFAULT_NUM_MM_ITEMS_RANGE_RATIO = 0.0 + DEFAULT_MM_ITEM_BUCKET_CONFIG = { + (256, 256, 1): 0.5, + (720, 1280, 1): 0.5, + (720, 1280, 16): 0.0, + } + DEFAULT_ENABLE_MULTIMODAL_CHAT = False + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + + def generate_synthetic_image(self, width: int, height: int) -> Image.Image: + """Generate synthetic PIL image with random RGB values. + + NOTE: iid pixel sampling results in worst-case compression + (good for stressing I/O), but very unlike real photos. + We could consider a “low-freq” mode (e.g., noise blur) + to emulate network realism instead of max stress. 
def normalize_bucket_config(
    self, bucket_config: dict[tuple[int, int, int], float]
) -> dict[tuple[int, int, int], float]:
    """
    Drop zero-probability buckets and rescale the rest to sum to 1.

    Args:
        bucket_config: Mapping from bucket key to a non-negative weight.

    Returns:
        A new dict with only the positive-weight buckets, each weight
        divided by their total. The input dict is not modified.

    Raises:
        ValueError: If any weight is negative, or if no bucket has a
            positive weight.
    """
    for weight in bucket_config.values():
        if weight < 0:
            raise ValueError("Bucket config values must be non-negative.")

    kept = {cfg: weight for cfg, weight in bucket_config.items() if weight > 0}
    if len(kept) == 0:
        raise ValueError(
            "Got invalid bucket config. Bucket config values must be non-zero."
        )

    norm = sum(kept.values())
    return {cfg: weight / norm for cfg, weight in kept.items()}
+ """ + # Enforce num_mm_items_range_ratio <= 1 + if not (0.0 <= num_mm_items_range_ratio <= 1.0): + raise ValueError("num_mm_items_range_ratio must be in [0, 1].") + + # Ensure modalities to sample are in limit_mm_per_prompt + for k, v in bucket_config.items(): + # get modality from bucket config + modality = self.map_config_to_modality(k) + if modality not in limit_mm_per_prompt: + raise ValueError( + f"Modality {modality} is not in " + f"limit_mm_per_prompt: " + f"{limit_mm_per_prompt.keys()}" + ) + + # Remove zero probability entries + # and normalize bucket config to sum to 1 + bucket_config = self.normalize_bucket_config(bucket_config) + logger.info( + "Normalized bucket config: %s", + bucket_config, + ) + # Only consider limit per prompt for modalities in bucket config + allowed_modalities = {self.map_config_to_modality(cfg) for cfg in bucket_config} + limit_mm_per_prompt = { + k: v for k, v in limit_mm_per_prompt.items() if k in allowed_modalities + } + if not limit_mm_per_prompt: + raise ValueError("No valid limits for modalities present in bucket_config.") + + logger.info( + "Updated mm-limit-per-prompt: %s", + limit_mm_per_prompt, + ) + + # Get max and min num mm items and ensure + # it is at most the sum of limit_mm_per_prompt for all modalities + max_num_mm_items = min( + sum(limit_mm_per_prompt.values()), + math.ceil(base_items_per_request * (1 + num_mm_items_range_ratio)), + ) + # Ensure min num mm items is at least 0 + min_num_mm_items = max( + 0, math.floor(base_items_per_request * (1 - num_mm_items_range_ratio)) + ) + # Raise error if min num mm items is greater than max num mm items + if min_num_mm_items > max_num_mm_items: + raise ValueError( + f"Min num mm items is greater than max mm items: " + f"{min_num_mm_items} > {max_num_mm_items}" + ) + + logger.info( + "Sampling number of multimodal items from [%s, %s]", + min_num_mm_items, + max_num_mm_items, + ) + + return ( + min_num_mm_items, + max_num_mm_items, + limit_mm_per_prompt, + 
bucket_config, + ) + + def get_mm_item_iterator( + self, + min_num_mm_items: int, + max_num_mm_items: int, + bucket_config: dict[tuple[int, int, int], float], + limit_mm_per_prompt: dict[str, int], + ) -> Iterator[tuple[int, int, int]]: + """ + Iterator over the multimodal items for each request + whose size is between min_num_mm_items and max_num_mm_items. + + Loop over the bucket config and sample a multimodal item. + Loop until the number of multimodal items sampled is equal to + request_num_mm_items or limit of multimodal items per prompt + for all modalities is reached. + + Note: + - This function operates on a per-request shallow copy of + `bucket_config` (tuple->float). The original dict passed to + `sample` is not mutated. If this ever changes, a test + is implemented and will fail. + """ + # Get the number of multimodal items to sample + request_num_mm_items = int( + self._rng.integers(min_num_mm_items, max_num_mm_items + 1) + ) + # If request_num_mm_items is 0, yield an empty iterator + if request_num_mm_items == 0: + return + # Initialize modality counters + modality_counter = {self.map_config_to_modality(k): 0 for k in bucket_config} + # Copy the bucket config to avoid modifying the original + bucket_config_copy = bucket_config.copy() + # Loop over the number of multimodal items to sample + while sum(modality_counter.values()) < request_num_mm_items: + # Sample a multimodal item config + mm_item_config = self._rng.choice( + list(bucket_config_copy.keys()), p=list(bucket_config_copy.values()) + ) + modality = self.map_config_to_modality(mm_item_config) + # Check that modality count is less than limit per prompt + if modality_counter[modality] < limit_mm_per_prompt[modality]: + modality_counter[modality] += 1 + yield (mm_item_config) + else: + # If the counter is greater than the limit per prompt + # set all multimodal items of this modality to 0 + for k, v in bucket_config_copy.items(): + if self.map_config_to_modality(k) == modality: + 
bucket_config_copy[k] = 0 + # If all configs are 0, break the loop + # This should not happen as request_num_mm_items is at most + # the sum of limit_mm_per_prompt for all modalities + if all(v == 0 for v in bucket_config_copy.values()): + logger.warning( + "Exhausted all multimodal items of modality %s", modality + ) + break + # Renormalize the bucket config + bucket_config_copy = self.normalize_bucket_config(bucket_config_copy) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + request_id_prefix: str = "", + no_oversample: bool = False, + prefix_len: int = RandomDataset.DEFAULT_PREFIX_LEN, + range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO, + input_len: int = RandomDataset.DEFAULT_INPUT_LEN, + output_len: int = RandomDataset.DEFAULT_OUTPUT_LEN, + limit_mm_per_prompt: dict[str, int] = DEFAULT_LIMIT_MM_PER_PROMPT, + base_items_per_request: int = DEFAULT_BASE_ITEMS_PER_REQUEST, + num_mm_items_range_ratio: float = DEFAULT_NUM_MM_ITEMS_RANGE_RATIO, + bucket_config: dict[ + tuple[int, int, int], float + ] = DEFAULT_MM_ITEM_BUCKET_CONFIG, + enable_multimodal_chat: bool = DEFAULT_ENABLE_MULTIMODAL_CHAT, + **kwargs, + ) -> list[SampleRequest]: + # Get the sampling parameters for the dataset + input_lens, output_lens, offsets = self.get_sampling_params( + num_requests, range_ratio, input_len, output_len, tokenizer + ) + + ( + min_num_mm_items, + max_num_mm_items, + limit_mm_per_prompt, + bucket_config, + ) = self.get_mm_item_sampling_params( + base_items_per_request, + num_mm_items_range_ratio, + limit_mm_per_prompt, + bucket_config, + ) + + vocab_size = tokenizer.vocab_size + # Can't use tokenizer.all_special_ids since + # it returns ONLY ids from special_tokens_map.json + # We want to exclude placeholder tokens and all + # tokens that indicate start/end of image as it + # may break prompt replacement logic. 
+ prohibited_tokens = list( + tok_id + for tok_id, token in tokenizer.added_tokens_decoder.items() + if token.special + ) + all_tokens = np.arange(vocab_size) + allowed_tokens = np.array(list(set(all_tokens) - set(prohibited_tokens))) + logger.debug( + "Sampling from %d out of %d (vocab size)", len(allowed_tokens), vocab_size + ) + # Generate prefix once + prefix_token_ids = self.get_prefix(allowed_tokens, prefix_len) + # Add synthetic multimodal items to each request + mm_requests = [] + token_mismatch_total = 0 + for i in range(num_requests): + prompt, total_input_len, token_mismatch = self.generate_token_sequence( # noqa: E501 + tokenizer=tokenizer, + prefix_token_ids=prefix_token_ids, + prefix_len=prefix_len, + vocab_size=vocab_size, + input_len=int(input_lens[i]), + offset=int(offsets[i]), + index=i, + allowed_tokens=allowed_tokens, + ) + token_mismatch_total += token_mismatch + # Get multimodal item iterator for a given request + mm_item_iterator = self.get_mm_item_iterator( + min_num_mm_items, + max_num_mm_items, + bucket_config, + limit_mm_per_prompt, + ) + + mm_content = cast( + list[dict[str, Any]], + [ + self.generate_mm_item(mm_item_config) + for mm_item_config in mm_item_iterator + ], + ) + + if enable_multimodal_chat: + # NOTE: For now this option is only provided for completeness + # given that the serve.py benchmark currently does not use it. 
+ mm_chat_prompt: Any = prompt + mm_chat_prompt = self.apply_multimodal_chat_transformation( + prompt, mm_content + ) + sample_request = SampleRequest( + prompt=mm_chat_prompt, + prompt_len=total_input_len, + expected_output_len=int(output_lens[i]), + multi_modal_data=None, + request_id=request_id_prefix + str(i), + ) + else: + sample_request = SampleRequest( + prompt=prompt, + prompt_len=total_input_len, + expected_output_len=int(output_lens[i]), + multi_modal_data=mm_content, + request_id=request_id_prefix + str(i), + ) + mm_requests.append(sample_request) + + if token_mismatch_total != 0: + sign = "more" if token_mismatch_total > 0 else "fewer" + logger.warning( + "Across all generated prompts, there were %d %s tokens " + "than expected after decoding and re-encoding. This is " + "expected due to the imperfect nature of the sampling " + "procedure.", + abs(token_mismatch_total), + sign, + ) + + return mm_requests + + +# ----------------------------------------------------------------------------- +# ShareGPT Dataset Implementation +# ----------------------------------------------------------------------------- + + +class ShareGPTDataset(BenchmarkDataset): + """ + Implements the ShareGPT dataset. Loads data from a JSON file and generates + sample requests based on conversation turns. + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + with open(self.dataset_path, encoding="utf-8") as f: + self.data = json.load(f) + # Filter entries with at least two conversation turns. 
class _ValidateDatasetArgs(argparse.Action):
    """Store the option value, then reject 'random' dataset + dataset path.

    The action is meant to be attached to both ``--dataset-name`` and
    ``--dataset-path``. After storing the parsed value it inspects the
    namespace and calls ``parser.error`` when the dataset name is ``"random"``
    while a dataset path is set.

    NOTE(review): the check runs at the moment each option is parsed, using
    whatever is in the namespace at that point. If ``--dataset-path`` appears
    on the command line before ``--dataset-name``, the name is still its
    default when the check runs, so the combination is rejected even though a
    non-random name follows. Confirm whether this ordering requirement is
    intended.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        # Same effect as the default "store" action.
        setattr(namespace, self.dest, values)

        # Snapshot of both options as parsed so far; the fallbacks cover a
        # namespace that does not define the attribute at all.
        name_so_far = getattr(namespace, "dataset_name", "random")
        path_so_far = getattr(namespace, "dataset_path", None)

        # Only the pair ("random", <some path>) is invalid.
        if path_so_far is None or name_so_far != "random":
            return

        parser.error(
            "Cannot use 'random' dataset with --dataset-path. "
            "Please specify the appropriate --dataset-name (e.g., "
            "'sharegpt', 'custom', 'sonnet') for your dataset file: "
            f"{path_so_far}"
        )
" + "Or the huggingface dataset ID if using HF dataset.", + ) + parser.add_argument( + "--no-oversample", + action="store_true", + help="Do not oversample if the dataset has fewer samples than num-prompts.", + ) + parser.add_argument( + "--skip-chat-template", + action="store_true", + help="Skip applying chat template to prompt for datasets that support it.", + ) + parser.add_argument( + "--disable-shuffle", + action="store_true", + help="Disable shuffling of dataset samples for deterministic ordering.", + ) + + # group for dataset specific arguments + custom_group = parser.add_argument_group("custom dataset options") + custom_group.add_argument( + "--custom-output-len", + type=int, + default=256, + help="Number of output tokens per request, used only for custom dataset.", + ) + + spec_bench_group = parser.add_argument_group("spec bench dataset options") + spec_bench_group.add_argument( + "--spec-bench-output-len", + type=int, + default=256, + help="Num of output tokens per request, used only for spec bench dataset.", + ) + spec_bench_group.add_argument( + "--spec-bench-category", + type=str, + default=None, + help="Category for spec bench dataset. If None, use all categories.", + ) + + sonnet_group = parser.add_argument_group("sonnet dataset options") + sonnet_group.add_argument( + "--sonnet-input-len", + type=int, + default=550, + help="Number of input tokens per request, used only for sonnet dataset.", + ) + sonnet_group.add_argument( + "--sonnet-output-len", + type=int, + default=150, + help="Number of output tokens per request, used only for sonnet dataset.", + ) + sonnet_group.add_argument( + "--sonnet-prefix-len", + type=int, + default=200, + help="Number of prefix tokens per request, used only for sonnet dataset.", + ) + + sharegpt_group = parser.add_argument_group("sharegpt dataset options") + sharegpt_group.add_argument( + "--sharegpt-output-len", + type=int, + default=None, + help="Output length for each request. 
Overrides the output length " + "from the ShareGPT dataset.", + ) + + blazedit_group = parser.add_argument_group("blazedit dataset options") + blazedit_group.add_argument( + "--blazedit-min-distance", + type=float, + default=0.0, + help="Minimum distance for blazedit dataset. Min: 0, Max: 1.0", + ) + blazedit_group.add_argument( + "--blazedit-max-distance", + type=float, + default=1.0, + help="Maximum distance for blazedit dataset. Min: 0, Max: 1.0", + ) + + random_group = parser.add_argument_group("random dataset options") + random_group.add_argument( + "--random-input-len", + type=int, + default=1024, + help="Number of input tokens per request, used only for random sampling.", + ) + random_group.add_argument( + "--random-output-len", + type=int, + default=128, + help="Number of output tokens per request, used only for random sampling.", + ) + random_group.add_argument( + "--random-range-ratio", + type=float, + default=0.0, + help="Range ratio for sampling input/output length, " + "used only for random sampling. Must be in the range [0, 1) to define " + "a symmetric sampling range" + "[length * (1 - range_ratio), length * (1 + range_ratio)].", + ) + random_group.add_argument( + "--random-prefix-len", + type=int, + default=0, + help=( + "Number of fixed prefix tokens before the random context " + "in a request. " + "The total input length is the sum of `random-prefix-len` and " + "a random " + "context length sampled from [input_len * (1 - range_ratio), " + "input_len * (1 + range_ratio)]." + ), + ) + random_group.add_argument( + "--random-batch-size", + type=int, + default=1, + help=("Batch size for random sampling. Only used for embeddings benchmark."), + ) + random_group.add_argument( + "--no-reranker", + action="store_true", + help=( + "Whether the model supports reranking natively." + " Only used for reranker benchmark." 
+ ), + ) + + # random multimodal dataset options + random_mm_group = parser.add_argument_group( + "random multimodal dataset options extended from random dataset" + ) + random_mm_group.add_argument( + "--random-mm-base-items-per-request", + type=int, + default=RandomMultiModalDataset.DEFAULT_BASE_ITEMS_PER_REQUEST, + help=( + "Base number of multimodal items per request for random-mm. " + "Actual per-request count is sampled around this base using " + "--random-mm-num-mm-items-range-ratio." + ), + ) + random_mm_group.add_argument( + "--random-mm-num-mm-items-range-ratio", + type=float, + default=RandomMultiModalDataset.DEFAULT_NUM_MM_ITEMS_RANGE_RATIO, + help=( + "Range ratio r in [0, 1] for sampling items per request. " + "We sample uniformly from the closed integer range " + "[floor(n*(1-r)), ceil(n*(1+r))] " + "where n is the base items per request. " + "r=0 keeps it fixed; r=1 allows 0 items. The maximum is clamped " + "to the sum of per-modality limits from " + "--random-mm-limit-mm-per-prompt. " + "An error is raised if the computed min exceeds the max." + ), + ) + random_mm_group.add_argument( + "--random-mm-limit-mm-per-prompt", + type=json.loads, + default=RandomMultiModalDataset.DEFAULT_LIMIT_MM_PER_PROMPT, + help=( + "Per-modality hard caps for items attached per request, e.g. " + '\'{"image": 3, "video": 0}\'. The sampled per-request item ' + "count is clamped to the sum of these limits. When a modality " + "reaches its cap, its buckets are excluded and probabilities are " + "renormalized." + "OBS.: Only image sampling is supported for now." 
+ ), + ) + + def _parse_mm_bucket_config(v: object) -> dict[tuple[int, int, int], float]: + # If already a dict (e.g., programmatic call), normalize keys + def normalize(d: dict) -> dict[tuple[int, int, int], float]: + out: dict[tuple[int, int, int], float] = {} + for k, val in d.items(): + key = k + if isinstance(key, str): + with suppress(Exception): + key = ast.literal_eval(key) + if not ( + isinstance(key, tuple) + and len(key) == 3 + and all(isinstance(x, int) for x in key) + ): + raise ValueError( + f"Invalid bucket key {k!r}. Expected tuple (H, W, T)." + ) + out[(int(key[0]), int(key[1]), int(key[2]))] = float(val) + return out + + if isinstance(v, dict): + return normalize(v) + if isinstance(v, str): + # Python literal (supports tuple keys) + parsed = ast.literal_eval(v) + if not isinstance(parsed, dict): + raise ValueError("Bucket config must parse to a dict.") + return normalize(parsed) + raise ValueError("Unsupported value for --random-mm-bucket-config.") + + random_mm_group.add_argument( + "--random-mm-bucket-config", + type=_parse_mm_bucket_config, + default=RandomMultiModalDataset.DEFAULT_MM_ITEM_BUCKET_CONFIG, + help=( + "The bucket config is a dictionary mapping a multimodal item" + "sampling configuration to a probability." + "Currently allows for 2 modalities: images and videos. " + "An bucket key is a tuple of (height, width, num_frames)" + "The value is the probability of sampling that specific item. " + "Example: " + "--random-mm-bucket-config " + "{(256, 256, 1): 0.5, (720, 1280, 1): 0.4, (720, 1280, 16): 0.10} " + "First item: images with resolution 256x256 w.p. 0.5" + "Second item: images with resolution 720x1280 w.p. 0.4 " + "Third item: videos with resolution 720x1280 and 16 frames w.p. 0.1" + "OBS.: If the probabilities do not sum to 1, they are normalized." + "OBS bis.: Only image sampling is supported for now." 
+ ), + ) + + hf_group = parser.add_argument_group("hf dataset options") + hf_group.add_argument( + "--hf-subset", type=str, default=None, help="Subset of the HF dataset." + ) + hf_group.add_argument( + "--hf-split", type=str, default=None, help="Split of the HF dataset." + ) + hf_group.add_argument( + "--hf-name", + type=str, + default=None, + help=( + "Name of the dataset on HuggingFace " + "(e.g., 'lmarena-ai/VisionArena-Chat'). " + "Specify this if your dataset-path is a local path." + ), + ) + hf_group.add_argument( + "--hf-output-len", + type=int, + default=None, + help="Output length for each request. Overrides the output lengths " + "from the sampled HF dataset.", + ) + + prefix_repetition_group = parser.add_argument_group( + "prefix repetition dataset options" + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-prefix-len", + type=int, + default=256, + help="Number of prefix tokens per request, used only for prefix " + "repetition dataset.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-suffix-len", + type=int, + default=256, + help="Number of suffix tokens per request, used only for prefix " + "repetition dataset. Total input length is prefix_len + suffix_len.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-num-prefixes", + type=int, + default=10, + help="Number of prefixes to generate, used only for prefix repetition " + "dataset. 
def get_samples(args, tokenizer) -> list[SampleRequest]:
    """
    Build the list of benchmark requests for the dataset selected by `args`.

    Dispatches on `args.dataset_name`:
      - "custom" and "sonnet" are handled explicitly,
      - "hf" picks a HuggingFaceDataset subclass from `args.dataset_path` /
        `args.hf_name`,
      - everything else goes through a name -> factory mapping.

    Side effects on `args`:
      - `args.request_id_prefix` is set to "" when the attribute is missing.
      - For "hf", `args.hf_split` (and for some datasets `args.hf_subset`)
        is overwritten with the value required by the matched dataset.

    NOTE(review): the "custom", "sonnet" and "spec_bench" datasets are built
    without `random_seed=args.seed`, unlike the other datasets here. Confirm
    whether `--seed` is meant to be ignored for them.

    Args:
        args: Parsed CLI namespace (see `add_dataset_parser`); `args.backend`
            is also read for the sonnet, hf and random-mm datasets.
        tokenizer: Tokenizer forwarded to each dataset's `sample`.

    Returns:
        The requests returned by the selected dataset's `sample` method.

    Raises:
        ValueError: Unsupported HF dataset path, multimodal dataset on an
            unsupported backend, or unknown dataset name.
    """
    if not hasattr(args, "request_id_prefix"):
        args.request_id_prefix = ""

    if args.dataset_name == "custom":
        dataset = CustomDataset(
            dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle
        )
        input_requests = dataset.sample(
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
            output_len=args.custom_output_len,
            skip_chat_template=args.skip_chat_template,
            request_id_prefix=args.request_id_prefix,
            no_oversample=args.no_oversample,
        )

    elif args.dataset_name == "sonnet":
        dataset = SonnetDataset(
            dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle
        )
        # For the "sonnet" dataset, formatting depends on the backend: the
        # chat backend gets the raw prompt, every other backend gets the
        # chat-template-formatted prompt (which requires a chat template).
        format_prompt = args.backend != "openai-chat"
        if format_prompt:
            assert tokenizer.chat_template or tokenizer.default_chat_template, (
                "Tokenizer/model must have chat template for sonnet dataset."
            )
        input_requests = dataset.sample(
            num_requests=args.num_prompts,
            input_len=args.sonnet_input_len,
            output_len=args.sonnet_output_len,
            prefix_len=args.sonnet_prefix_len,
            tokenizer=tokenizer,
            return_prompt_formatted=format_prompt,
            request_id_prefix=args.request_id_prefix,
            no_oversample=args.no_oversample,
        )

    elif args.dataset_name == "hf":
        # all following datasets are implemented from the
        # HuggingFaceDataset base class
        #
        # Ordered dispatch table; the first match wins. Columns:
        #   dataset class,
        #   split forced onto args.hf_split (None = leave untouched),
        #   whether args.hf_subset is reset to None,
        #   whether args.hf_name is also matched (otherwise only dataset_path).
        hf_dispatch = (
            (VisionArenaDataset, "train", True, True),
            (MMVUDataset, "validation", True, True),
            (InstructCoderDataset, "train", False, True),
            (MTBenchDataset, "train", False, True),
            (MultiModalConversationDataset, None, False, True),
            (ConversationDataset, None, False, True),
            (AIMODataset, "train", False, True),
            (NextEditPredictionDataset, "train", False, True),
            (ASRDataset, "train", False, True),
            (BlazeditDataset, "train", False, False),
            (MLPerfDataset, "train", False, True),
            (MMStarDataset, "val", True, True),
        )

        hf_kwargs = {}
        for candidate, forced_split, reset_subset, match_hf_name in hf_dispatch:
            supported = candidate.SUPPORTED_DATASET_PATHS
            if args.dataset_path in supported or (
                match_hf_name and args.hf_name in supported
            ):
                dataset_class = candidate
                if forced_split is not None:
                    args.hf_split = forced_split
                if reset_subset:
                    args.hf_subset = None
                break
        else:
            supported_datasets = {
                dataset_name
                for cls in HuggingFaceDataset.__subclasses__()
                for dataset_name in cls.SUPPORTED_DATASET_PATHS
            }
            raise ValueError(
                f"Unsupported dataset path: {args.dataset_path}. "
                "Huggingface dataset only supports dataset_path"
                f" from one of following: {supported_datasets}. "
                "Please consider contributing if you would "
                "like to add support for additional dataset formats."
            )

        # Blazedit is the only HF dataset taking extra sampling options.
        if dataset_class is BlazeditDataset:
            hf_kwargs = {
                "min_distance": args.blazedit_min_distance,
                "max_distance": args.blazedit_max_distance,
            }

        if dataset_class.IS_MULTIMODAL and not (
            args.backend in ("openai-chat", "openai-audio")
            or "embeddings-" in args.backend
        ):
            # multi-modal benchmark is only available on OpenAI Chat
            # endpoint-type.
            raise ValueError(
                "Multi-modal content is only supported on 'openai-chat' and "
                "'openai-audio' backends."
            )
        input_requests = dataset_class(
            dataset_path=args.dataset_path,
            dataset_subset=args.hf_subset,
            dataset_split=args.hf_split,
            random_seed=args.seed,
            no_stream=args.no_stream,
            hf_name=args.hf_name,
            disable_shuffle=args.disable_shuffle,
        ).sample(
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
            output_len=args.hf_output_len,
            request_id_prefix=args.request_id_prefix,
            no_oversample=args.no_oversample,
            skip_chat_template=args.skip_chat_template,
            **hf_kwargs,
        )

    else:
        # For datasets that follow a similar structure, use a mapping.
        # Factories are lambdas so that only the selected dataset is built
        # and only the options it needs are read from args.
        dataset_mapping = {
            "spec_bench": lambda: SpecBench(
                dataset_path=args.dataset_path,
                category=args.spec_bench_category,
                disable_shuffle=args.disable_shuffle,
            ).sample(
                num_requests=args.num_prompts,
                tokenizer=tokenizer,
                output_len=args.spec_bench_output_len,
                request_id_prefix=args.request_id_prefix,
                no_oversample=args.no_oversample,
            ),
            "sharegpt": lambda: ShareGPTDataset(
                random_seed=args.seed,
                dataset_path=args.dataset_path,
                disable_shuffle=args.disable_shuffle,
            ).sample(
                tokenizer=tokenizer,
                num_requests=args.num_prompts,
                output_len=args.sharegpt_output_len,
                request_id_prefix=args.request_id_prefix,
                no_oversample=args.no_oversample,
            ),
            "burstgpt": lambda: BurstGPTDataset(
                random_seed=args.seed,
                dataset_path=args.dataset_path,
                disable_shuffle=args.disable_shuffle,
            ).sample(
                tokenizer=tokenizer,
                num_requests=args.num_prompts,
                request_id_prefix=args.request_id_prefix,
                no_oversample=args.no_oversample,
            ),
            "random": lambda: RandomDataset(
                random_seed=args.seed,
                dataset_path=args.dataset_path,
                disable_shuffle=args.disable_shuffle,
            ).sample(
                tokenizer=tokenizer,
                num_requests=args.num_prompts,
                prefix_len=args.random_prefix_len,
                input_len=args.random_input_len,
                output_len=args.random_output_len,
                range_ratio=args.random_range_ratio,
                request_id_prefix=args.request_id_prefix,
                batchsize=args.random_batch_size,
                no_oversample=args.no_oversample,
            ),
            "random-mm": lambda: RandomMultiModalDataset(
                random_seed=args.seed,
                dataset_path=args.dataset_path,
                disable_shuffle=args.disable_shuffle,
            ).sample(
                tokenizer=tokenizer,
                num_requests=args.num_prompts,
                prefix_len=args.random_prefix_len,
                range_ratio=args.random_range_ratio,
                input_len=args.random_input_len,
                output_len=args.random_output_len,
                base_items_per_request=args.random_mm_base_items_per_request,
                limit_mm_per_prompt=args.random_mm_limit_mm_per_prompt,
                num_mm_items_range_ratio=args.random_mm_num_mm_items_range_ratio,
                bucket_config=args.random_mm_bucket_config,
                request_id_prefix=args.request_id_prefix,
                no_oversample=args.no_oversample,
            ),
            "random-rerank": lambda: RandomDatasetForReranking(
                random_seed=args.seed,
                dataset_path=args.dataset_path,
                disable_shuffle=args.disable_shuffle,
            ).sample(
                tokenizer=tokenizer,
                num_requests=args.num_prompts,
                input_len=args.random_input_len,
                range_ratio=args.random_range_ratio,
                request_id_prefix=args.request_id_prefix,
                batchsize=args.random_batch_size,
                is_reranker=not args.no_reranker,
            ),
            "prefix_repetition": lambda: PrefixRepetitionRandomDataset(
                random_seed=args.seed,
                dataset_path=args.dataset_path,
                disable_shuffle=args.disable_shuffle,
            ).sample(
                tokenizer=tokenizer,
                num_requests=args.num_prompts,
                prefix_len=args.prefix_repetition_prefix_len,
                suffix_len=args.prefix_repetition_suffix_len,
                num_prefixes=args.prefix_repetition_num_prefixes,
                output_len=args.prefix_repetition_output_len,
                request_id_prefix=args.request_id_prefix,
                no_oversample=args.no_oversample,
            ),
        }

        # Enforce endpoint compatibility for multimodal datasets.
        if args.dataset_name == "random-mm" and args.backend not in ["openai-chat"]:
            raise ValueError(
                "Multi-modal content (images) is only supported on "
                "'openai-chat' backend."
            )

        # Only the lookup is guarded: a KeyError raised while loading or
        # sampling a dataset must not be misreported as an unknown dataset.
        try:
            build_requests = dataset_mapping[args.dataset_name]
        except KeyError as err:
            raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
        input_requests = build_requests()

    return input_requests
class SpecBench(CustomDataset):
    """
    Implements the SpecBench dataset: https://github.com/hemingkx/Spec-Bench
    Download the dataset using:
    wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl

    The JSONL file must have a 'turns' column; the first element of each
    row's 'turns' becomes the prompt. When a `category` keyword argument is
    given, only rows whose 'category' equals it are kept.
    """  # noqa: E501

    def __init__(self, **kwargs) -> None:
        # `category` has to be set before the parent constructor runs:
        # CustomDataset.__init__ already calls self.load_data(), which
        # resolves to the override below and reads self.category. For the
        # same reason load_data() is not called again here, so the file is
        # parsed and shuffled only once.
        self.category = kwargs.pop("category", None)
        super().__init__(**kwargs)

    def load_data(self) -> None:
        """
        Load prompts from the JSONL file at `self.dataset_path` into
        `self.data` as a list of ``{"prompt": ...}`` dicts.

        Raises:
            ValueError: If `dataset_path` is None or the file has no 'turns'
                column.
        """
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        self.data = []

        # Load the JSONL file
        jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)

        # check if the JSONL file has a 'turns' column
        if "turns" not in jsonl_data.columns:
            raise ValueError("JSONL file must contain a 'turns' column.")

        # sample only from a specific category if specified; the 'category'
        # column is only touched when a filter was requested
        if self.category:
            jsonl_data = jsonl_data[jsonl_data["category"] == self.category]

        # Only the first turn of each conversation is used as the prompt.
        self.data = [{"prompt": turns[0]} for turns in jsonl_data["turns"]]

        random.seed(self.random_seed)
        if not getattr(self, "disable_shuffle", False):
            random.shuffle(self.data)

    def sample(self, **kwargs) -> list:
        """Sample requests using the CustomDataset implementation."""
        # leverage CustomDataset sample
        return super().sample(**kwargs)
+ """ + + DEFAULT_PREFIX_LEN = 200 + DEFAULT_INPUT_LEN = 550 + DEFAULT_OUTPUT_LEN = 150 + + def __init__( + self, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if not self.dataset_path: + raise ValueError("dataset_path must be provided.") + with open(self.dataset_path, encoding="utf-8") as f: + self.data = f.readlines() + + def sample( + self, + tokenizer, + num_requests: int, + prefix_len: int = DEFAULT_PREFIX_LEN, + input_len: int = DEFAULT_INPUT_LEN, + output_len: int = DEFAULT_OUTPUT_LEN, + return_prompt_formatted: bool = False, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list: + # Calculate average token length for a poem line. + tokenized_lines = [tokenizer(line).input_ids for line in self.data] + avg_len = sum(len(tokens) for tokens in tokenized_lines) / len(tokenized_lines) + + # Build the base prompt. + base_prompt = "Pick as many lines as you can from these poem lines:\n" + base_msg = [{"role": "user", "content": base_prompt}] + base_fmt = tokenizer.apply_chat_template( + base_msg, add_generation_prompt=True, tokenize=False + ) + base_offset = len(tokenizer(base_fmt).input_ids) + if input_len <= base_offset: + raise ValueError( + f"'input_len' must be higher than the base prompt length " + f"({base_offset})." + ) + + # Determine how many poem lines to use. 
+ num_input_lines = round((input_len - base_offset) / avg_len) + num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0) + prefix_lines = self.data[:num_prefix_lines] + + samples = [] + ind = 0 + while len(samples) < num_requests: + extra_lines = random.choices( + self.data, k=num_input_lines - num_prefix_lines + ) + prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}" + msg = [{"role": "user", "content": prompt}] + prompt_formatted = tokenizer.apply_chat_template( + msg, add_generation_prompt=True, tokenize=False + ) + prompt_len = len(tokenizer(prompt_formatted).input_ids) + if prompt_len <= input_len: + samples.append( + SampleRequest( + prompt=prompt_formatted if return_prompt_formatted else prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + request_id=request_id_prefix + str(ind), + ) + ) + ind += 1 + return samples + + +# ----------------------------------------------------------------------------- +# BurstGPT Dataset Implementation +# ----------------------------------------------------------------------------- + + +class BurstGPTDataset(BenchmarkDataset): + """ + Implements the BurstGPT dataset. Loads data from a CSV file and generates + sample requests based on synthetic prompt generation. Only rows with Model + "GPT-4" and positive response tokens are used. + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data( + self, + ): + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + df = pd.read_csv(self.dataset_path) + # Filter to keep only GPT-4 rows. + gpt4_df = df[df["Model"] == "GPT-4"] + # Remove failed requests (where Response tokens is 0 or less). + gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0] + # Sample the desired number of rows. 
+ self.data = gpt4_df + + def _sample_loaded_data(self, num_requests: int) -> list: + if num_requests <= len(self.data): + data = self.data.sample(n=num_requests, random_state=self.random_seed) + else: + data = self.data.sample( + n=num_requests, + random_state=self.random_seed, + replace=True, + ) + # Convert the dataframe to a list of lists. + return data.values.tolist() + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + max_loras: int | None = None, + lora_path: str | None = None, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list[SampleRequest]: + samples = [] + data = self._sample_loaded_data(num_requests=num_requests) + for i in range(num_requests): + input_len = int(data[i][2]) + output_len = int(data[i][3]) + lora_req = self.get_random_lora_request( + max_loras=max_loras, lora_path=lora_path + ) + vocab_size = tokenizer.vocab_size + # Generate a synthetic prompt: a list of token IDs computed as (i + + # j) modulo vocab_size. 
+ token_ids = [(i + j) % vocab_size for j in range(input_len)] + prompt = tokenizer.decode(token_ids) + samples.append( + SampleRequest( + prompt=prompt, + prompt_len=input_len, + expected_output_len=output_len, + lora_request=lora_req, + request_id=request_id_prefix + str(i), + ) + ) + return samples + + +# ----------------------------------------------------------------------------- +# HuggingFace Dataset Base Implementation +# ----------------------------------------------------------------------------- +class HuggingFaceDataset(BenchmarkDataset): + """Base class for datasets hosted on HuggingFace.""" + + SUPPORTED_DATASET_PATHS: set[str] | dict[str, Callable] = set() + + def __init__( + self, + dataset_path: str, + dataset_split: str, + no_stream: bool = False, + dataset_subset: str | None = None, + hf_name: str | None = None, + **kwargs, + ) -> None: + super().__init__(dataset_path=dataset_path, **kwargs) + + self.dataset_split = dataset_split + self.dataset_subset = dataset_subset + self.load_stream = not no_stream + self.hf_name = hf_name or dataset_path + self.load_data() + + def load_data(self) -> None: + """Load data from HuggingFace datasets.""" + self.data = load_dataset( + self.dataset_path, + name=self.dataset_subset, + split=self.dataset_split, + streaming=self.load_stream, + ) + if not getattr(self, "disable_shuffle", False): + self.data = self.data.shuffle(seed=self.random_seed) + + +# ----------------------------------------------------------------------------- +# Conversation Dataset Implementation +# ----------------------------------------------------------------------------- + + +class ConversationDataset(HuggingFaceDataset): + """Dataset for text-only conversation data.""" + + SUPPORTED_DATASET_PATHS = { + "Aeala/ShareGPT_Vicuna_unfiltered", + } + IS_MULTIMODAL = False + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: int | None = None, + enable_multimodal_chat: bool = False, + request_id_prefix: 
str = "", + no_oversample: bool = False, + **kwargs, + ) -> list: + # Filter examples with at least 2 conversations + filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2) + sampled_requests = [] + ind = 0 + dynamic_output = output_len is None + + for item in filtered_data: + if len(sampled_requests) >= num_requests: + break + conv = item["conversations"] + prompt, completion = conv[0]["value"], conv[1]["value"] + + prompt_ids = tokenizer(prompt).input_ids + completion_ids = tokenizer(completion).input_ids + prompt_len = len(prompt_ids) + completion_len = len(completion_ids) + output_len = completion_len if dynamic_output else output_len + assert isinstance(output_len, int) and output_len > 0 + if dynamic_output and not is_valid_sequence(prompt_len, completion_len): + continue + mm_content = process_image(item["image"]) if "image" in item else None + if enable_multimodal_chat: + # Note: when chat is enabled the request prompt_len is no longer + # accurate and we will be using request output to count the + # actual prompt len and output len + prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + request_id=request_id_prefix + str(ind), + ) + ) + ind += 1 + self.maybe_oversample_requests( + sampled_requests, num_requests, request_id_prefix, no_oversample + ) + return sampled_requests + + +class MultiModalConversationDataset(HuggingFaceDataset): + """Dataset for multimodal conversation data.""" + + SUPPORTED_DATASET_PATHS = { + "lmms-lab/LLaVA-OneVision-Data", + } + IS_MULTIMODAL = True + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: int | None = None, + enable_multimodal_chat: bool = False, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list: + # Filter examples with at least 2 conversations + 
filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2) + sampled_requests = [] + ind = 0 + dynamic_output = output_len is None + + for item in filtered_data: + if len(sampled_requests) >= num_requests: + break + conv = item["conversations"] + prompt, completion = conv[0]["value"], conv[1]["value"] + + prompt_ids = tokenizer(prompt).input_ids + completion_ids = tokenizer(completion).input_ids + prompt_len = len(prompt_ids) + completion_len = len(completion_ids) + output_len = completion_len if dynamic_output else output_len + assert isinstance(output_len, int) and output_len > 0 + if dynamic_output and not is_valid_sequence(prompt_len, completion_len): + continue + mm_content = process_image(item["image"]) if "image" in item else None + if enable_multimodal_chat: + # Note: when chat is enabled the request prompt_len is no longer + # accurate and we will be using request output to count the + # actual prompt len and output len + prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + request_id=request_id_prefix + str(ind), + ) + ) + ind += 1 + self.maybe_oversample_requests( + sampled_requests, num_requests, request_id_prefix, no_oversample + ) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# Vision Arena Dataset Implementation +# ----------------------------------------------------------------------------- + + +class VisionArenaDataset(HuggingFaceDataset): + """ + Vision Arena Dataset. 
+ """ + + DEFAULT_OUTPUT_LEN = 128 + SUPPORTED_DATASET_PATHS = { + "lmarena-ai/VisionArena-Chat": lambda x: x["conversation"][0][0]["content"], + "lmarena-ai/vision-arena-bench-v0.1": lambda x: x["turns"][0][0]["content"], + } + IS_MULTIMODAL = True + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: int | None = None, + enable_multimodal_chat: bool = False, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list: + output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN + sampled_requests = [] + for i, item in enumerate(self.data): + if len(sampled_requests) >= num_requests: + break + parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.hf_name) + if parser_fn is None: + raise ValueError(f"Unsupported dataset path: {self.hf_name}") + prompt = parser_fn(item) + mm_content = process_image(item["images"][0]) + prompt_len = len(tokenizer(prompt).input_ids) + if enable_multimodal_chat: + # Note: when chat is enabled the request prompt_len is no longer + # accurate and we will be using request output to count the + # actual prompt len + prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + request_id=request_id_prefix + str(i), + ) + ) + self.maybe_oversample_requests( + sampled_requests, num_requests, request_id_prefix, no_oversample + ) + return sampled_requests + + +class MMVUDataset(HuggingFaceDataset): + """ + MMVU Dataset. 
+ https://huggingface.co/datasets/yale-nlp/MMVU + """ + + DEFAULT_OUTPUT_LEN = 128 + SUPPORTED_DATASET_PATHS = { + "yale-nlp/MMVU": lambda x: x["question"] + + " " + + (" ".join(f"{k}.{v}" for k, v in x["choices"].items())), + } + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: int | None = None, + enable_multimodal_chat: bool = False, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list: + output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN + sampled_requests = [] + for i, item in enumerate(self.data): + if len(sampled_requests) >= num_requests: + break + parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.hf_name) + if parser_fn is None: + raise ValueError(f"Unsupported dataset path: {self.hf_name}") + prompt = parser_fn(item) + mm_content = process_video(item["video"]) + prompt_len = len(tokenizer(prompt).input_ids) + if enable_multimodal_chat: + # Note: when chat is enabled the request prompt_len is no longer + # accurate and we will be using request output to count the + # actual prompt len + prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + request_id=request_id_prefix + str(i), + ) + ) + self.maybe_oversample_requests( + sampled_requests, num_requests, request_id_prefix, no_oversample + ) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# Instruct Coder Dataset Implementation +# ----------------------------------------------------------------------------- + + +class InstructCoderDataset(HuggingFaceDataset): + """ + InstructCoder Dataset. + https://huggingface.co/datasets/likaixin/InstructCoder + + InstructCoder is the dataset designed for general code editing. 
It consists + of 114,239 instruction-input-output triplets, and covers multiple distinct + code editing scenario. + """ + + DEFAULT_OUTPUT_LEN = 200 # this is the average default output length + SUPPORTED_DATASET_PATHS = { + "likaixin/InstructCoder", + } + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: int | None = None, + enable_multimodal_chat: bool = False, + skip_chat_template: bool = False, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list: + output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN + sampled_requests = [] + for i, item in enumerate(self.data): + if len(sampled_requests) >= num_requests: + break + prompt = ( + f"{item['input']}\n\n{item['instruction']} Just output " + "the code, do not include any explanation." + ) + + # apply template + if not skip_chat_template: + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + request_id=request_id_prefix + str(i), + ) + ) + self.maybe_oversample_requests( + sampled_requests, num_requests, request_id_prefix, no_oversample + ) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# MT-Bench Dataset Implementation +# ----------------------------------------------------------------------------- + + +class MTBenchDataset(HuggingFaceDataset): + """ + MT-Bench Dataset. + https://huggingface.co/datasets/philschmid/mt-bench + + We create a single turn dataset for MT-Bench. 
+ This is similar to Spec decoding benchmark setup in vLLM + https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18 + """ # noqa: E501 + + DEFAULT_OUTPUT_LEN = 256 # avg len used in SD bench in vLLM + SUPPORTED_DATASET_PATHS = { + "philschmid/mt-bench", + } + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: int | None = None, + enable_multimodal_chat: bool = False, + skip_chat_template: bool = False, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list: + output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN + sampled_requests = [] + + for i, item in enumerate(self.data): + if len(sampled_requests) >= num_requests: + break + prompt = item["turns"][0] + + # apply template + if not skip_chat_template: + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + request_id=request_id_prefix + str(i), + ) + ) + self.maybe_oversample_requests( + sampled_requests, num_requests, request_id_prefix, no_oversample + ) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# Blazedit Dataset Implementation +# ----------------------------------------------------------------------------- + + +class BlazeditDataset(HuggingFaceDataset): + """ + Blazedit Dataset. 
+ https://github.com/ise-uiuc/blazedit + + 5k char version: vdaita/edit_5k_char + 10k char version: vdaita/edit_10k_char + """ # noqa: E501 + + # 5k char version will have output as ~5k chars + # 10k char version will have output as ~10k chars + # Assuming 3 char per token, 10k chars will be 3333 tokens + # We set default to 4000 to be safe + DEFAULT_OUTPUT_LEN = 4000 + SUPPORTED_DATASET_PATHS = { + "vdaita/edit_5k_char", + "vdaita/edit_10k_char", + } + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: int | None = None, + skip_chat_template: bool = False, + request_id_prefix: str = "", + no_oversample: bool = False, + min_distance: float = 0.0, + max_distance: float = 1.0, + **kwargs, + ) -> list: + output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN + sampled_requests = [] + + for i, item in enumerate(self.data): + if len(sampled_requests) >= num_requests: + break + code = item["code"] + change_request = item["change_request"] + norm_distance = item["norm_distance"] + + # compare the levenshtein distance normalized by code length + if norm_distance < min_distance or norm_distance > max_distance: + continue + + # template copied from + # https://github.com/ise-uiuc/blazedit/blob/7765137e656fd62de877422d2e4cf8de51228054/dataset/create_refined_dataset.py#L94-L105 # noqa: E501 + prompt = f"""Given a code file, please apply the change requests and generate the new file. 
+ +Original file: +```python +{code} +``` + +Change request: +{change_request} + +Please generate the new code file in the "New file" section below.""" # noqa: E501 + + # apply template + if not skip_chat_template: + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids) + + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + request_id=request_id_prefix + str(i), + ) + ) + self.maybe_oversample_requests( + sampled_requests, num_requests, request_id_prefix, no_oversample + ) + + return sampled_requests + + +# ----------------------------------------------------------------------------- +# AIMO Dataset Implementation +# ----------------------------------------------------------------------------- + + +class AIMODataset(HuggingFaceDataset): + """ + Dataset class for processing a AIMO dataset with reasoning questions. 
+ """ + + SUPPORTED_DATASET_PATHS = { + "AI-MO/aimo-validation-aime", + "AI-MO/NuminaMath-1.5", + "AI-MO/NuminaMath-CoT", + } + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: int | None = None, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list: + sampled_requests = [] + ind = 0 + dynamic_output = output_len is None + + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt, completion = item["problem"], item["solution"] + + prompt_ids = tokenizer(prompt).input_ids + completion_ids = tokenizer(completion).input_ids + prompt_len = len(prompt_ids) + completion_len = len(completion_ids) + output_len = completion_len if dynamic_output else output_len + assert isinstance(output_len, int) and output_len > 0 + if dynamic_output and not is_valid_sequence( + prompt_len, completion_len, max_prompt_len=2048, max_total_len=32000 + ): + continue + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=None, + request_id=request_id_prefix + str(ind), + ) + ) + ind += 1 + self.maybe_oversample_requests( + sampled_requests, num_requests, request_id_prefix, no_oversample + ) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# Next Edit Prediction Dataset Implementation +# ----------------------------------------------------------------------------- + + +zeta_prompt = """### Instruction: +You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location. 
+ +### User Edits: + +{} + +### User Excerpt: + +{} + +### Response: + +""" # noqa: E501 + + +def _format_zeta_prompt( + sample: dict, original_start_marker: str = "<|editable_region_start|>" +) -> dict: + """Format the zeta prompt for the Next Edit Prediction (NEP) dataset. + + This function formats examples from the NEP dataset + into prompts and expected outputs. It could be + further extended to support more NEP datasets. + + Args: + sample: The dataset sample containing events, + inputs, and outputs. + original_start_marker: The marker indicating the + start of the editable region. Defaults to + "<|editable_region_start|>". + + Returns: + A dictionary with the formatted prompts and expected outputs. + """ + events = sample["events"] + input = sample["input"] + output = sample["output"] + prompt = zeta_prompt.format(events, input) + + # following the original implementation, extract the focused region + # from the raw output + output_start_index = output.find(original_start_marker) + output_focused_region = output[output_start_index:] + expected_output = output_focused_region + + return {"prompt": prompt, "expected_output": expected_output} + + +class NextEditPredictionDataset(HuggingFaceDataset): + """ + Dataset class for processing a Next Edit Prediction dataset. 
+ """ + + SUPPORTED_DATASET_PATHS = { + "zed-industries/zeta", + } + MAPPING_PROMPT_FUNCS = { + "zed-industries/zeta": _format_zeta_prompt, + } + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ): + formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.hf_name) + if formatting_prompt_func is None: + raise ValueError(f"Unsupported dataset path: {self.hf_name}") + samples = [] + for i, sample in enumerate(self.data): + sample = formatting_prompt_func(sample) + samples.append( + SampleRequest( + prompt=sample["prompt"], + prompt_len=len(tokenizer(sample["prompt"]).input_ids), + expected_output_len=len( + tokenizer(sample["expected_output"]).input_ids + ), + request_id=request_id_prefix + str(i), + ) + ) + if len(samples) >= num_requests: + break + self.maybe_oversample_requests( + samples, num_requests, request_id_prefix, no_oversample + ) + return samples + + +# ----------------------------------------------------------------------------- +# ASR Dataset Implementation +# ----------------------------------------------------------------------------- + + +class ASRDataset(HuggingFaceDataset): + """ + Dataset class for processing a ASR dataset for transcription. + Tested on the following set: + + +----------------+----------------------------------------+--------------------------+-----------------------------+ + | Dataset | Domain | Speaking Style | hf-subset | + +----------------+----------------------------------------+--------------------------+-----------------------------+ + | TED-LIUM | TED talks | Oratory | release1, release2, release3| + | | | | release3-speaker-adaptation | + | VoxPopuli | European Parliament | Oratory | en, de, it, fr, ... 
| + | LibriSpeech | Audiobook | Narrated | "LIUM/tedlium" | + | GigaSpeech | Audiobook, podcast, YouTube | Narrated, spontaneous | xs, s, m, l, xl, dev, test | + | SPGISpeech | Financial meetings | Oratory, spontaneous | S, M, L, dev, test | + | AMI | Meetings | Spontaneous | ihm, sdm | + +----------------+----------------------------------------+--------------------------+-----------------------------+ + + """ # noqa: E501 + + SUPPORTED_DATASET_PATHS = { + "openslr/librispeech_asr", + "facebook/voxpopuli", + "LIUM/tedlium", + "edinburghcstr/ami", + "speechcolab/gigaspeech", + "kensho/spgispeech", + } + + DEFAULT_OUTPUT_LEN = 128 + IS_MULTIMODAL = True + + # TODO Whisper-specific. Abstract interface when more models are supported. + TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>" + skip_long_audios: bool = True + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: int | None = None, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list: + output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN + prompt = ASRDataset.TRANSCRIPTION_PREAMBLE + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests = [] + ind = 0 + skipped = 0 + for item in self.data: + if len(sampled_requests) >= num_requests: + break + audio = item["audio"] + y, sr = audio["array"], audio["sampling_rate"] + duration_s = librosa.get_duration(y=y, sr=sr) + # Whisper max supported duration + if self.skip_long_audios and duration_s > 30: + skipped += 1 + continue + + mm_content = {"audio": (y, sr)} + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + request_id=request_id_prefix + str(ind), + ) + ) + ind += 1 + if skipped: + logger.warning( + "%d samples discarded from dataset due to" + " their length being greater than" + " what Whisper supports.", + skipped, + ) 
+ self.maybe_oversample_requests( + sampled_requests, num_requests, request_id_prefix, no_oversample + ) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# MLPerf Dataset Implementation +# ----------------------------------------------------------------------------- + + +class MLPerfDataset(HuggingFaceDataset): + """ + MLPerf Inference Dataset. + + Dataset on HF: + https://huggingface.co/datasets/mgoin/mlperf-inference-llama2-data + https://huggingface.co/datasets/mgoin/mlperf-inference-llama3.1-data + + Each record contains: + - "system_prompt": system role instruction. + - "question": user question. + - "output": reference answer. + + We combine the system prompt and question into a chat-formatted prompt + (using the tokenizer's chat template) and set the expected output length to + the tokenized length of the provided reference answer. + """ + + SUPPORTED_DATASET_PATHS = { + "mgoin/mlperf-inference-llama2-data", + "mgoin/mlperf-inference-llama3.1-data", + } + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: int | None = None, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list[SampleRequest]: + # Force dynamic output length based on reference completion. + dynamic_output = output_len is None + sampled_requests: list[SampleRequest] = [] + ind = 0 + + for item in self.data: + if len(sampled_requests) >= num_requests: + break + + system_prompt = item["system_prompt"] + question = item["question"] + reference_answer = item["output"] + + # Build chat-style prompt using tokenizer template, if available. 
+ messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": question}, + ] + prompt_formatted = tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False + ) + prompt_len = len(tokenizer(prompt_formatted).input_ids) + + # Determine output length from reference answer tokens. + ref_out_len = len( + tokenizer(reference_answer, add_special_tokens=False).input_ids + ) + expected_output_len = ref_out_len if dynamic_output else output_len + + # Validate sequence lengths. + if not is_valid_sequence(prompt_len, expected_output_len): + continue + + sampled_requests.append( + SampleRequest( + prompt=prompt_formatted, + prompt_len=prompt_len, + expected_output_len=expected_output_len, + request_id=request_id_prefix + str(ind), + ) + ) + ind += 1 + + self.maybe_oversample_requests( + sampled_requests, num_requests, request_id_prefix, no_oversample + ) + return sampled_requests + + +# ----------------------------------------------------------------------------- +# Prefix Repetition Dataset Implementation +# ----------------------------------------------------------------------------- + + +class PrefixRepetitionRandomDataset(BenchmarkDataset): + # Default values copied from benchmark_serving.py for the repeated prefix + # dataset. 
+ DEFAULT_PREFIX_LEN = 256 + DEFAULT_SUFFIX_LEN = 256 + DEFAULT_NUM_PREFIXES = 10 + DEFAULT_OUTPUT_LEN = 128 + + def __init__( + self, + **kwargs, + ) -> None: + super().__init__(**kwargs) + random.seed(self.random_seed) + np.random.seed(self.random_seed) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + prefix_len: int = DEFAULT_PREFIX_LEN, + suffix_len: int = DEFAULT_SUFFIX_LEN, + num_prefixes: int = DEFAULT_NUM_PREFIXES, + output_len: int = DEFAULT_OUTPUT_LEN, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list[SampleRequest]: + vocab_size = tokenizer.vocab_size + prompts_per_prefix = num_requests // num_prefixes + if prompts_per_prefix == 0: + raise ValueError( + f"num_requests ({num_requests}) must be greater than or equal " + f"to num_prefixes ({num_prefixes})" + ) + + def _generate_exact_length_tokens(target_length: int) -> list[int]: + """Generate tokens that decode and re-encode to exactly + target_length.""" + # Generate random tokens + tokens = np.random.randint(0, vocab_size, size=target_length).tolist() + + _, adjusted_tokens, token_mismatch = gen_prompt_decode_to_target_len( # noqa: E501 + tokenizer=tokenizer, + token_sequence=tokens, + target_token_len=target_length, + add_special_tokens=False, + ) + return adjusted_tokens, token_mismatch + + requests = [] + token_mismatch_total = 0 + for _ in range(num_prefixes): + prefix_tokens, prefix_mismatch = _generate_exact_length_tokens(prefix_len) + token_mismatch_total += prefix_mismatch + + for _ in range(prompts_per_prefix): + suffix_tokens, suffix_mismatch = _generate_exact_length_tokens( + suffix_len + ) + token_mismatch_total += suffix_mismatch + combined_tokens = prefix_tokens + suffix_tokens + prompt = tokenizer.decode(combined_tokens) + prompt_len = len(combined_tokens) + requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + ) + ) + + if token_mismatch_total != 0: + sign = 
"more" if token_mismatch_total > 0 else "fewer" + logger.warning( + "Across all generated prompts, there were %d %s tokens " + "than expected after decoding and re-encoding. This is " + "expected due to the imperfect nature of the sampling " + "procedure.", + abs(token_mismatch_total), + sign, + ) + if not getattr(self, "disable_shuffle", False): + random.shuffle(requests) + return requests + + +# ----------------------------------------------------------------------------- +# MMStar Dataset Implementation +# ----------------------------------------------------------------------------- + + +class MMStarDataset(HuggingFaceDataset): + """ + Lin-Chen/MMStar: https://huggingface.co/datasets/Lin-Chen/MMStar + refer to: https://github.com/sgl-project/SpecForge/pull/106 + """ + + DEFAULT_OUTPUT_LEN = 128 + SUPPORTED_DATASET_PATHS = {"Lin-Chen/MMStar"} + IS_MULTIMODAL = True + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: int | None = None, + enable_multimodal_chat: bool = False, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list[SampleRequest]: + # If --hf-output-len is not set, use the default output length. + output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN + sampled_requests: list[SampleRequest] = [] + + for ind, item in enumerate(self.data): + if len(sampled_requests) >= num_requests: + break + # Split the question text from options + # (keep only the part before "Options:"). + full_q: str = item.get("question", "") + question_text = full_q.split("Options:", 1)[0].strip() + + # Multimodal image content. + mm_content = process_image(item["image"]) + + # Compute prompt token length (note: this is plain text length + # if enable_multimodal_chat is False). 
+ prompt_len = len(tokenizer(question_text).input_ids) + + if enable_multimodal_chat: + # If multimodal content should be embedded in the chat message, + # convert to [{"role":"user","content":[...]}] + prompt = self.apply_multimodal_chat_transformation( + question_text, mm_content + ) + mm_for_request = None # Already embedded in chat content. + else: + # Default: prompt is plain text, + # image is in mm_content for the bench to assemble. + prompt = question_text + mm_for_request = mm_content + + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_for_request, + request_id=request_id_prefix + str(ind), + ) + ) + + self.maybe_oversample_requests( + sampled_requests, num_requests, request_id_prefix, no_oversample + ) + return sampled_requests diff --git a/benchmarks/latency.py b/benchmarks/latency.py new file mode 100644 index 0000000..b4f1751 --- /dev/null +++ b/benchmarks/latency.py @@ -0,0 +1,172 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Benchmark the latency of processing a single batch of requests.""" + +import argparse +import dataclasses +import json +import os +import time +from typing import Any + +import numpy as np +from tqdm import tqdm + +import vllm.envs as envs +from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json +from vllm.engine.arg_utils import EngineArgs +from vllm.inputs import PromptType +from vllm.sampling_params import BeamSearchParams + + +def save_to_pytorch_benchmark_format( + args: argparse.Namespace, results: dict[str, Any] +) -> None: + pt_records = convert_to_pytorch_benchmark_format( + args=args, + metrics={"latency": results["latencies"]}, + extra_info={k: results[k] for k in ["avg_latency", "percentiles"]}, + ) + if pt_records: + pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" + write_to_json(pt_file, pt_records) + + 
+def add_cli_args(parser: argparse.ArgumentParser): + parser.add_argument("--input-len", type=int, default=32) + parser.add_argument("--output-len", type=int, default=128) + parser.add_argument("--batch-size", type=int, default=8) + parser.add_argument( + "--n", + type=int, + default=1, + help="Number of generated sequences per prompt.", + ) + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument( + "--num-iters-warmup", + type=int, + default=10, + help="Number of iterations to run for warmup.", + ) + parser.add_argument( + "--num-iters", type=int, default=30, help="Number of iterations to run." + ) + parser.add_argument( + "--profile", + action="store_true", + help="profile the generation process of a single batch", + ) + parser.add_argument( + "--output-json", + type=str, + default=None, + help="Path to save the latency results in JSON format.", + ) + parser.add_argument( + "--disable-detokenize", + action="store_true", + help=( + "Do not detokenize responses (i.e. do not include " + "detokenization time in the latency measurement)" + ), + ) + + parser = EngineArgs.add_cli_args(parser) + # V1 enables prefix caching by default which skews the latency + # numbers. We need to disable prefix caching by default. + parser.set_defaults(enable_prefix_caching=False) + + +def main(args: argparse.Namespace): + if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: + raise OSError( + "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " + "Please set it to a valid path to use torch profiler." + ) + engine_args = EngineArgs.from_cli_args(args) + + # Lazy import to avoid importing LLM when the bench command is not selected. + from vllm import LLM, SamplingParams + + # NOTE(woosuk): If the request cannot be processed in a single batch, + # the engine will automatically process the request in multiple batches. 
+ llm = LLM(**dataclasses.asdict(engine_args)) + assert llm.llm_engine.model_config.max_model_len >= ( + args.input_len + args.output_len + ), ( + "Please ensure that max_model_len is greater than" + " the sum of input_len and output_len." + ) + + sampling_params = SamplingParams( + n=args.n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=args.output_len, + detokenize=not args.disable_detokenize, + ) + dummy_prompt_token_ids = np.random.randint( + 10000, size=(args.batch_size, args.input_len) + ) + dummy_prompts: list[PromptType] = [ + {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist() + ] + + def llm_generate(): + if not args.use_beam_search: + llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) + else: + llm.beam_search( + dummy_prompts, + BeamSearchParams( + beam_width=args.n, + max_tokens=args.output_len, + ignore_eos=True, + ), + ) + + def run_to_completion(profile_dir: str | None = None): + if profile_dir: + llm.start_profile() + llm_generate() + llm.stop_profile() + else: + start_time = time.perf_counter() + llm_generate() + end_time = time.perf_counter() + latency = end_time - start_time + return latency + + print("Warming up...") + for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): + run_to_completion(profile_dir=None) + + if args.profile: + profile_dir = envs.VLLM_TORCH_PROFILER_DIR + print(f"Profiling (results will be saved to '{profile_dir}')...") + run_to_completion(profile_dir=profile_dir) + return + + # Benchmark. 
+ latencies = [] + for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): + latencies.append(run_to_completion(profile_dir=None)) + latencies = np.array(latencies) + percentages = [10, 25, 50, 75, 90, 99] + percentiles = np.percentile(latencies, percentages) + print(f"Avg latency: {np.mean(latencies)} seconds") + for percentage, percentile in zip(percentages, percentiles): + print(f"{percentage}% percentile latency: {percentile} seconds") + + # Output JSON results if specified + if args.output_json: + results = { + "avg_latency": np.mean(latencies), + "latencies": latencies.tolist(), + "percentiles": dict(zip(percentages, percentiles.tolist())), + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + save_to_pytorch_benchmark_format(args, results) diff --git a/benchmarks/lib/__init__.py b/benchmarks/lib/__init__.py new file mode 100644 index 0000000..005e87a --- /dev/null +++ b/benchmarks/lib/__init__.py @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Benchmark library utilities.""" diff --git a/benchmarks/lib/__pycache__/__init__.cpython-312.pyc b/benchmarks/lib/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd12a57fe8748463553940bae714052d4e6dfba2 GIT binary patch literal 209 zcmX@j%ge<81UW3-nSDU|F^B^Lj8MjB4j^MXLkdF_LnWgoQenx*EXl~v(=W@($<R~lJmE)QT?XZ@{w9L3}LO-mZFbo@r zTQzQ+Fb$g~%){mh%dmy`DaNhCR+8Q}oCiaWa*iKUlinES2O2tb(S^Y;iq*fWNct6Vdg5yUU!&yvYBGaiM>5m|q2^D77L$|~2(xmQSCz>eOIRD{ zWb@#c&pEkL&RGlrvl@zG?W|%{%N9J#441KmTsefAMim&!7I6*|s%9M!TEP}?5LyDE zE*-^gA}O5k&ArK@rajnF4&Efbkt>EbmOZN)u7XsXNvd+r%vO-M+PUiV_n_aXnsq_! 
z8XX1kw>;om(sL{x)w7k5L+wMCvI<2G@dIXBz-qG5Oo>8B;)XyMw~ww=&dxh-S9v~HgyrLnt@ zdX9RI_A1X(Ud0z-zWM@;_&I8h4$=6QQN;y{SA?jmiYpd|@+#-(Icm6bM&VV^N!LZn z-9B4CbdGcJ+~d<+Am|#Mp7aKN{z=!UpLg{fKj`8n*(twoG8kz6CBA&nO$)l7izB|^ z$eD4@tq^qVNN~hEJ`xBBs!<;|&bpO?vS)He&>ZmvIeuhZ&>rXgLBH2O4oR8d%h>^arkC2Uchx`#x>&g zPESmaj|4H;dv1F2e84q58T5_2y#9%)aW2T|U0^1-Kw#u77jTX6oNMI5h;JP0q1Bt5 zeo66h+Qo_TZXWpvmJU%K5`fkWbwe>I;ODE)OplIoJl0Bp8y^i|7Owlp+kz8QZBxFf zmf#3~mJ7CAn(o@Z?R*#4HtsvqHZ>DG=bvo5Fg`xfc7~huo|_oq&j(15)KH#eQ+lu| zTc>6Ot;ge=^aVYhS=Ri5^SOAsHv6n1z1qRe0p|XQ?@8 z`w*2YHuUe)xkJXJo|McNAFP5^O6waLDGW4>{GJdq2J2V0kw)4a6JmHog&^)l z>XPy?>LTsd%pU1sS=Wd-N4Wf>FnPeJa0UIYAk3rUn0CR;1Jer*6Ch_Yr;zk%r1idP zIyl<0ZC0Vz{}Q|6eKpRfJ6!kGpWkXrZ)|y){s9DbIXE`3LPEZA=#(hC}1bN?- zpdOi;g26B712AI*0~dY4bAplQCj1w;DV`hkT@skFfPYd@jr&K~fM7Xv^4Q=$l3PE| z`+2vTw?U?S0p_d|2RdXdXr-x6&?G6rk|aD{|74JZH9-w&@N--)9_vrgjj*hTWHDR0 z(Oe@(DT5S&9q`ffR6kFnn&qlxS+izsNa*BDZkekSR?297uKRL#%upFIR4zBH8JgndRWF@?;e4!oYovVZ z>ZNG;-dOoSqWaLdvtWy!o^xNeBKHb-2WqptQfYe&4?wa7egNZ2W> zW5IGKpUSIDDAcy%+jhsbLst(iJ+)?Ui`z@D9lLsLdEYCCuOE)pbVh1AqcvS?_HFU1 zn#Fz3A5Q2=;)P2KmXGi0C`W0ms3}s^^dY6N6|dVJ>z1MgOF{>Z`+YI4fdd0htvKftcD(sp!awt{Ot5N(ngQx@#&HAWQfM&Vf|i@hsS&bl{mjM z8Il~vY#&Lr{@f^pFZhlO!G4&{Y;zzN@S2qB&RWwaMQ4vvp|W= z$=KGkw_B2awlqI0`pszEHo|hhS_SnK4{S+L z(2Pv^Jk$KRpcg$n;vL(l$mf}Fj1KSUS1bPrMo(AtOAA+P`k!AcwC52HsfT1U`AhLnr|eh{f+ zXyI%5<7hpCR#t}|KtHrjz%TGPSlFTWte2F|4=(m$@3I%q?_U^-GR5%%2T-S?j3ZuL z0v)zJ%9O(kN&@gm^u;;|AV>g?MNc&}iM8_> zc)0c1{RGD~%+5alxnOWgF!hc5fMGf*7M@2C!DBH=^YX(;Wi>uPWQs@Y4EzGvjh@Aq zmQ~)-sZ>qtrOgQyJb@?5D~{RfBDT7O2Hjf9lD{}~`FsLnL419FIKL*MtG#Ccw^&}1 zb0=gdXmRmCRzR?Em!>!`4zU1$Qr{&mAXY7wQ8LKvk*d7{#5F3ILQaYxjyqx`e8nq6 zigWaw;v5xHd>f>te*`q(m>j4qh!sjqNFhx~(lcNS%6Kb8e*@8#0&L|sWpwm3Jwv&f zK|urS*{E-vn{~QQojBr}m~F)U~y zqR2L@g3|!?1^wxf@o7%vj+H!C75_!Fz64ePt3YOWZJrAw<33>7IZtvS=0_p?JVwOW z^VB^(rOgZHRlj_4P2-N2wT4UE!rJ0>M_0I{DeUN4c?_&DQ?PExiy52|gY(7ya8-M> zv}4WCdE4w*IolsK7t)xBS(?@2IwI^LW1C?V@ zNsE022!hJtnQeFrp!IA|aEE{o!^s~)<IL60|7vh!H%=uLYK5V!}VkW6@{e7eE#P#-*0hmc%qQw=^|z 
z*Or)TSH!hzL9=L#Xv^Z7;$(o<@|^jyIb7t96?H_4I^NTELUn3ulhv7IhH}>=4{!OD zn#9_I7};hPxRkk#YSNm!&ZPQJ*4V7^QcH3^YymDyLHH!X2LX-!?Nh2O`9)HdO+f5t z;1|H=m8-1&mZmc^YBKd=Uy}@u zTYSmgb1-ilrf94+Dp%diDT8vl5YeMZS_T?ZFj9u8CvVK{KWPAeoTai1_-eDfz{x6E zkpJXbrob1$N|s?&EHGL=n$^6honuncj1c4^&3YjJ(XzUrRZ5ymsxXE?JIjEsCOMma zgZiezN6)E(MN*WIDn0)ou9?KVntaZIyk!gpcUdN+_Q(<<)-b0DBHkuutX!f}PdPVj zNDC?DF9>PpbUN(0)v34=6;i^tt6noo?bSg&L$jtQpf$g)yr%p;m5Q3vhxB7LQvB4D zHKX)tk>-!EmN`SPUJ8**VT5!LBioEMW&2Q+lx7RnP9@*2O1|9`GL0en$TCujIS3EX zS4u<1F+?U=CS;P@WX>Ei@1wr>A&g;5$TEftb(UeRQe2+CQUGyq_0KY_O^P#T4O!W| z^Ib3k=_}1aTwjP8Lv}FBgyfv(Y$03DzLPKIdx@UQ<{*k zAyCPxpp_n||Ae%&dZ?#@PyzI!Qt4wR%R}lhM0Z(c48dKN3E4wB%tIbHN%=LOiFr^5YHaqfkl-^-%~l{CAb`T0(pM)SgT=<61kk82J5v7PJfI&SA8YfUTjqoTfW_=*V9ho6mSPDxP z;RKN!rPXh%`>Rb7PP`k)vfl2Hc5=Y?7Z4khBfF&4$ z+{6@O?`fVxI5x!veJHl^Kn0GB2WLR4G;+x!;%p+l5!4_y1SOnc7M~vf6cO_Xrk-ig zT=BkHBKEnj>E%Fy#=8XbvE%)NJqJBd|DMDBj}gRw12l*r(&DlE64qw$mp=k>K%jT< zxX|%VEStuA&hG;;fQb1S6b%R}@(F@1^|i^u&7L0QF2MVE&dd2Oa45egjC)dw56uAx z&x`6m&NVbNFw{D9&KH2#qJj`m#Q3=D4Cg{^8zv68MnUt(v6Vc|wC-jRBOvE53bN!H zv>U-ELK0YMx*0|Gzp65{YtKWG#{CoJedcIf5KjCjurR?rtuj08Op zLrlT@ASbtjaHT00QLHG!;h!gO5=AvUzFSbh2Nq1gXMzgT3uu{$VBmu%rUb3j6oP%! 
z$3qNJPa!p@pgAY%dWjZ;uY=t9ddyNq;(<63lq&ojdOwNYI;>@qvsfojVaOkX6>xxs z<35)WbsXa4B5PUwI23%yTBDK}=Mw%;AP{7~788bIT%nkGVZ!-mp zXTud;H_O(T{eV}D$<5TW~z;tYUALO*A&TXiWil{%PZrB z4Y9)ZNMU>2UKg{sML?I3SGaC;E)6A=3a9=~fyttt-+!-^QaP8p!Zlmhm@beI=p0L3 zFK>Be$Mqer?7qG`yrnl<-50Lf7cTCHR3w2qEx`w-@~Fu*KM;2`#vGjyN9X*&8sO7{ zqWQtwI#Wzn648}Jb*1yYaYqRWikFs$nc{?Ew`zcn=a*dDd3ERV{?+=|yB2rG@(-`& zAO6v$xTWa1r!GIWTphL4#_OBI)|%TTo1-P|i<))!=9qhX#JxS{?v1#6*WCL)qUvp@ z>7}N4RZFa@Jpy33s&n!1`_8(!!?j*f7pvG5so1pYj8^PgJn*yXrX>wjVQF>D*%EQK zta#U)TjHfP>#q8kt0Us-Sp8howR=gm?sP35h&r2ZgVG49VTFrUcP#A#@kmqKcN=ar ztZt7s?h04zTDk}pR9HjX%H!8(UwQKSldGfA`d!P+?dDCf=3SBIUD4(}%etS}cgMT; zglo3lZfc7)?Tj?-j5h6lJulMKx4bXDW$W_5cMe0DYMWj;bp4R1-dXj9dmoK#eRQqn zbL-oB!miG^t97;JdyO}%BCeg+&GE)oQjD%>V|Tc+d$}M|*&XlN`JZO~Y9`z>6zw{- ztbWhc@nIiL)ifuDY0A~`(o-)yb*rLdwQ5oQyfI!`2M_Z{u!F#UgoZl5^CV4ajYNfZ z*|6Ai#T+--=8q=&6d3-igdQvSS7G$Q!G1J4g87`nKZhQ3(f zeEtZ0hP!J~y=JYAS8lzT2gO=c|FJnSL0;lE*3+gN8 z+xdmq6TY^4K^reBzIO5I#TRE{726{f+iz|QSL}-x^)INuYD{P_$*%+7fg$sBU)hl@ z^cyb6;VSBlZS^484c;)aWz5ApBT{RCnojb3e8XQ>BEJ_pV&1> zJ2mKE)uTVsrhKASExl5UruOWq*VIM~FeNV$)C6*z35mbOD6|;~b>=;mllp^I; z4U#GQh60wx8CVu^{+`{NLpLSL{9utD0oKYJbiw{5RAhc09bY7@pEy;wc+Y`w{zb?J zSRLeYL=Dp!q=1o=Mn!G%)L4Z)j5aiB_E(7u7*CgeF_<`;G0q-V$uJ#-Vv zfz3qL6LA5td0^d#1q@cpWDNe#AjZGKFYpprL?TmB8*bejYuz7#zk&l{`@#96>v{J1 z!$cs{a!b<^FQ|_dbVmxh7xcFc1&d=Z*Q^=pfd4_9oCRrE(H`rk7S+%^?0 z>0jQrW^(5Y{vgj8x1PAGROFizl)_|&F|W;EJbO!17PpiveKBHb3~L+L4UVPmh@mdb z)DbCJhRj_9g?Lp$-uO$(U+Vk-di1OzZQF<} zC`sirmY<>}3bx%QJ3I<_SksFN5Xi`aAOLG=Z%B!LWYDCA5+i4lWkOnyEU045b2?T% zrw<}?Ow5=Jza^iPuw>2|Tn9CN2=iIUO~5s&z0NyLK!nIM5l_fejJym~#n& zLKdl2P$UB)o?n1q0mUpi#6ch~H*c8eS8HgAM#COn3~etCd-Uo_{TAN0g5J4*+^1~ zAV!!5>%JBC&5irKL?F?IJLLF_7Nr||fYueX5ZCfwMe8}V7SYO$ z^s&vvk0ljx7M0Uf^8OM2wpNZWrhQUpk!e%Jx|0?nveC2f^>fO>t*cZUBErr6d4e`(OY*ZZOa9 z`*ak)`N2@Ugx}*i@OzbQ_iqWlH#El@wnrMaM;p4A4rYS+^(2N)hw|~p&8w9+8do2U zH13Ku_Cy+cUUx?uk1T7}tLsR9D(?0yi`4tTx(do4FlL|l7v;B^4)S$KWT z)v?~OD^j}o!|e##L0Y~6Xn#xw?QBla?jzy9N~ob$5wj-_&?!t_i?}}nnxFgxGzWRX 
zzX_TnBzd~edBjY=QSLajn|fm_eMqBzqf3eOj?P1F<(qBHArtdvJAFv6eX~P}bXQLv z1iY>9bHL-rRR)Or;}-f*J@eyM2I(yYkl#BR8t8ZQG^BXPR01*IsV6>e^`UOfJ8kr# zEyi~?X^`&HV9ebB$_>hQwF*f5uAWBPpaeQY@vMdN;ktlItj^$PWnPnl~plRuqiudP%~3w z&602G88UeaB^PU-GqajGOR!Dy8QUaLsi#y{L@SUQtBRPAbXdy zl2Xh;I6-guV?9z>>KT&J4yv1>{ni1!Ay(*1#gQmh#yMmCl3(h{%6Mb0C{)B2Cvltm z0b9cnL~e$dTuS4R{O-BpP;pLGbcuu-0JoI@ZYyDFY+RNctIe6BR7wE{lPs(ziQ60@ zE5U7mdZB;FqXKZ7DHq3|i^~r=Lbfd2W)3k)%w#9H%?!9L_ji;@rGoEp0&a8WEPJ^W z3(DA#tT-yG(`KO*z86Z`Q&RUZouLt&Y`%%9xm_c1Q8SuiHB~d4AByK!%iCQxh?DEKLh^S6b zaFcG0h>Q@=@vosp&>PMef<38V&QM+pWl3KYtokOW{nG(atL;I(X+VT=GRn)4*Q+5@ z6f)dJSog?F;0@*tjb4w zPU8ttQK?Gm)6ED9pI^}Q;%T#k#{|_y9<;uAbd);}Ve|~BPkHfpm@WW{V-|PAO52;~ zMI59J`o%Mxv&}gX(FitG>&|gMD8mE0Tcy}bh>5nqCwx3l-L~alE4fn zQMaP?@Yuc-lOd<{8G3#KxDQ~wLufq|wlB}A#V1vG7BeID{e7U|(BOoJG?YM(Lc_s# z3(E7{jG%(u7Sr7SZRk&$D}Zv29MC`IwO8!@B^&p7{-=<~CG6(NCVfnF_IHhxs$kvh z%2L~J4tMN(eKOpBYK=L)?kEp8?1(k=MBvZT8#ecWC_w$`3VqPyr(yp%9OrY@#N91- z3Je6uJJNvsL#yuZsToya!bn-GVwTQ`rE~rOs)o+J%&Zw|SBh?GBksMo%|&6yv8efY zSbO|^oo#X3(*6iJAh2S)Z7GadDk7GOsHJj&LG3YYl3u*9^!UmX3+6R#cf6$XrIr_3 z{(T_D1vFx63=DJkw|&X*enlg05U6fUo*eyrw7O?W`?LDCrTy`mrdI~953Y7a zYr11K2O>2Ge&mSOoLK6QSJbR%qZOMnw*lO)Z+d0+`s`|Lw0_$%v);PpyXS74Tb+rv z_Jpf@mb<`;w|1_M+!(uA6lv{=wf09^`(O7(TaPdAU$1Xk34H5`c-yv{TW&nL!o;`k zSW*9l0oCno-__sHi&Bnwb7y$l(Ma>rdrF175dv%5SI>TL0(KhI^j#l}Ctr9m+PWuP zyJz_nSn=+?|7H7MZ;v;0gFbI(-SWU%%{EB4y_b%-IzMcu8a5?%gE-=qJ=gc#s@}GG z1{D5B;-cRFqsM5dTvE*+RZ#l6JH3irdkB0O>sJXQR_(9CEe8_4G}Y3*IIw1SlYIow z55{YEfS9Cq5AG>QoW$rKi@9Rs-9hVL?tx6gS`6C-u&v9g76z^i-YzP^CjZ)#3kTxP zvX{DE=z4j3ta?|Zde_ZM;cD2sad2Vat4CnF03^Bp(F_eG4C`ut9rzv$+-Leq2D|7t z%6c7xZ~%6zv z$4|Np5cgdLeUwqZt7MSYltH%d*3d}T(~#ob#uAA6QwDv0s#YI$=zeOXk6O$>HEEEx zYtY}>qaSQg{cWuR3iG%1G|~-9q?=TO9h$%0q=8JmDk!#OG(bE9zq>nO?h(Zmsk1dQ zpP@>rPIK*vm*c4cwyQjllV|`IQaSG|pi$fY8l3|cN_)38Ax%y-se*uwzj?6ogwzNo zF40gX;-b_wduOZ5IwuV=W|iHS|mxZ@^R1JR28iw9t_e(A^yM^=hf ztD>cySNqp1-LI5gFZ*^y%(-cCU%b@y(t#HaEI+z(CR)0A5kf_gctfMOYpurp%E9Xg 
zSNu00kJj`os^7C#u8W)H;uZB7d+0u@l=jXw=34gJ#GP|is&wmsAOsL*DfrmpEF)I{!2myA-@h_V?SLtP)gr$^tVypr|JHD_4gG@q*cAv{u1RI z1$2L*`i;UK2RPnrH9*Ll+v$D>^X3i)>D>j8_^o^z=(h@Ki1U_%&?V~rX3bkJy1&}^ zR;32%1`Wn((+w0W-`Z9*U{}6vQ9;_b?JDqjyI2WyhR9rAfd-))eCh_C|A%L_UR1)4 zHv?26FKbC(dvz&O+q40bI5qyMOtwU|{ar1_cIWNKQHz z24Td4cAE2KPDXzLaZ;LU{x89KLn$7bBy$YT5^h5ph)vSkzLbPIF9ciN?XBw%Ov&!@Ppdl28 zDA;wGQ`JONR`NKK-b7JEWtC%8K`t^^mx|4jds{XLQ?gVEg=Z;WWai+87FPQkD6MqT z7m;zRC*!tAdVkiqMFx$A|(uU|6j7df(>VE_!T;h_pF=V5` z%Kr=^e_})yXO2j`Dh}3rQt*F(_^mh=KLldKvDmiiSYvhoj@QB60dN?&Ig;P}8oQ=# zziqUHL9}3XN38Csv2o>*TgKLeN(o~`LF?K;?(jf^cV%l9$3SeAXcT|70&K97k5dZq?Mn zn7LaOk0Z+i`hge5jN{0xf$Ub5AyL+DRotsX3=A-`Qxq0P;{yyZ(;)vIq~rf9S}SPX zK`pgPO@r{S(W2!zWRuuBOlI1J}K7l4+BZX1pr$c3vOAA5=a z9}p{qo#Zhf;Ob53qKezrylc9vx+QnGwkuZK8>#J$TKg7M?;ETMh0;_2GP9-5m%3l* zUVi)=yQ4*oVJiv=jnv z8BK=8B~8{0#D-0##YRb8oQW4y@W>p%w-X%*E@W<+)B_wzf(j77-wKQ@+(_uhtD8LD zao-gGKOt}dzq=TSOscDc%1>i<*)X>cW2g4GX7Ti#yfK%3R4#$HPLCg%?#0i{C`Vx^0LEa-XCfsbI z8vG?D6*ZtGC-4Hs0Pq6jkndSIcsl9wO~4`B*4eHN!^A6Vf^f!o)IUD%zlf?Z(p5l9 zy2Aw^fz&Z}8NxAu2kCHzv<@W{z*bogd6UOKIX;s_5Sdrk*}M`AyV`HL4zE7(`Xhh!)Q>77U57z>|8s+DyCOA3!v_UstM3VSsncnUBfJ^>N0T!a@eF!P-yiqb&{@q z-ndp^mJ?4yDN;)y)DQLqQa}7Xpfc;E{d=w2_9c-V9Bab)6Yhb5 z`yr4LrQTBRaPtOE{8bR@3^?vrxXE}xA! 
zTUVwd&Q44i(K_PgwJ%M)FtO6NYKxY)ulB&TyAYy}Xq_K0C27alf2*KyOJ6trMhD%u zoq3~E1$2gF=eN2l@pqx2qy-5ZEysjg_nFNtGF1Nx$^UWVWoXN%FGKu4V@V7GgE9j_ zEh3s(gZL!JW@n8_hq$4~mUN{+&url*yu^gL{9)moE&gQuCHWB7SA|4n?r6?EjeRlJ_|KZBpB ztobrh$XAMcah?N8u)u#_Fb>xv;y$(3kuzR$b)#TFtr7X358x6KTqwvDpMo03E%0nS zE)lo|o#Juo7k2`wc0StdJZ1L|ERz- zbns~Zu~S3h0iSlvZ2+wyv`}dTw{yTvBBSUe6VMn^M5g-%r2Ygg6fBW*#{4Q$KSc{A zzC=LF{{vDVphXxa2xe*-V7PR9~JnIP<_B2()5RFif;V{ zrTZCW_!-)ILRkNfs{MPa;qNJTgmV9aGXIRq`#Y*^J+FSTK9*M>$*W&Ezm~US-tv*U zj&|Ikz)GBe|1QVyk-|b(B`C1&D#4i`&KeS6q)QVNSjhmO?v<MVc1Po<>AX1PqeNoeWRIDuBBrW@ z3Y`$!U|F&P?&?U$IVIK7en(Y6*OSEXT(>!vDk3&_0zJul7HSp(i+u~1 zF1JQh#dkI6y%pXWZ{Bi8_a*7INTj_MooTNnPA%L+DrQZR!POd23b63grw#?xyCi9k|lcp-W7MR1Ha2mrMuGv~=J~ONTBo9a5hY>l9v1>II%@ z39&$_o{p|mL0y!oC2nqw=WmPKTjRys;^n>gS9xIFxijt>SkG_1<21J^!W7($yHvgO zM7X3eQqXjlGSdnutgS|rXs=nD?x~n^#m}uxmzf26VRKxg4VzlRZ9`$>sfgy(hbl^? bj$4~P#7GgU^LmKs~Ki$?tmUx*WLWu1IK z%-)`5Nyoxyio!_^v@U`wKv6&seMn#z^<$rkq_)wAzL=E(u~!!e-~z4R=vsx-pbwoT zc_NdhX!me8v%531Gqd~6-tQGfM$pXY4~-*ngzmFW*pxQ&XdReMq@yCzIXz->MXpIB z#YmI#MII<`MlGQza7d5pf+<>3QLB3JNAH? 
zJDcZ%D1BYG9&pgh^FiWO{JMu-I_ufxAk)=3Wmh~J^mL7U&7*86Nm?4EC8i=%P%arw zVabJc495=A)UdGQ(UObHj;*^vieO6CK#XE?K{Epp`W~^%LMKEd0{;fQkA4$DXby3Z zSjwDiwiOurbi39b@OickRe|nV)4A)BYi-@CNTpCy=Y9TXunj-w!&wed*xXk0d0o)O z>(Vv8)jx)+QXKih|L4Uz9G7a$=lxidi_^rO-MBBUB*R{ZGC7d{uXEShF;``fNq6)@ z&X+q;fJ9F?-l1WJION08IzNOE=>f^Ujzq|HB<+GB5T<)_h2 z{s)49s-iD`)`zVa@D3 zM$tHOxzz}p;oxD`TC>08SGjybi@@f|DztBF25n+fvpcAyj=BiYu&}CK&1-o!jpU^42eL6)n+B>N=#Qgt%}tO)Ah<_ zC{IhC3HIHJW4pL*Kr^6jbrSn6&GhgLAr6^VPc5kwGP_qzbqBll)AT<#kL3#7Wy0;+ z*9>(Q%B|F1dHpGN6xhzPO1)>Ya}>M^i+DX22Dm=8p<}mPRsD@cjZ}Sg1DjgK#d=k3 zGta6U1Y4S6gVid8)n)_Eg~~2aheD~9LAL}eS{eQ$~&7+ z!xaJh5hN>^ES1WR$I3QgRWuMP1d2Fe6*AyqR|C!tM6aT26bBN)OOV>-Kq#AziwP@P zfutGEGNqLu`?6_Zo4)MWHZD_#G&2m-?9ducAh;&Rm3)%KK#Q=N+;oPpDoR-K3}QN5 zWhUHZfe2R%yNRL|h%JbB;+EQZ36z#gCCy^a2dSp2_3@hb&tRSFfO zaum>a4%4F-y{XT?czy~OOk=T7S)_%G~D4=l>E{>Zx44Z@#W%{vkzWu@OUZs-Eioa8UilGV`Ok zH|O3_?j{d0e*Vq*D`)HJ!FNu)_1$_VyD49l-%i#u{S7gu^#Ad|Gxev(e?7I!N7UZO zC^FRhW3eHk?9k@%pB%4e2RBb&J^l84efZ$x0(kcjF&B@Z?8t5k?LWA+|Hc#BxtA`3 z_1qTshxEaF1G!D>svJD>S$BiWFnf&PS7rQk6<(`9FfA0i0ljA<%PrNjf;eY!j<>ax* z%_MhnLcFOQWc-*6(>Gr{0<*V9xS5oAYgAx-o{is{;6UcqvA&sva61-(@!K-T_(YWP z8DVBfy4@cG$$Z544#4RF`}4>vfmAB#PPtSf2cS>-*(V2|Aj$r?#@a`{1)l!!c$B(f zNSGnQ{)Ad^Q(B-|X#ZF7414T2y@wvb7m>s831VNc9NjbVaq>KL*>6uX1&|?*dnh7q z{60$k2_@kBXO#OZTBxCgMhfjaP#+q8n3(2z>%F-K5A?%~+7N)kwx;@bl#!Y;(vVm` ch6bK!#2F`}{=o**fy9fS+f{%g&$wC3!Nr{C*0UJB^IyI=0QKPHNy|ZxP?hkV> zV9=tX%2Gj13RYX$R;YqoR1=X_Jfu~mR#n?4OMU2WO=NQ8O0N3UwJ=c#Az{(vo{ z&aTeP+&OdR%$#$+bN=S{dk{2T__21|jnKc@q|{u+!R~Kia0jVK#X1@)^mqu{b8d*U zdwz(Ap3{YdI3!}E@&t|fN`m%#RUl5{BJMbM3_hx&Jrj#F2Yh#_&RfEeheH>U>biwg zH^E~y`}<3y4ZbiM@~Ix;QoY2jI#e8Ys6N>3pTQCTe;7k7f-Sx~Ib#Vj9Wg9tf|yiO z49i1qnpBpwWISa#C^08#QYlKbQorP7?~EloVC%g;CQ?8=bAK zBfvR1+@host?3z)i7NNg+PK!*pPG)Esi>AT$q1EA5@n*J#Z#kjYP76rRFzG6ST=}# zfE6C-{$8Incujiy#^ANICyZ@J40mY7JXM7JD;OH&qTHkTDLNzzDBDJ@f3 z*NH9}+AN7UEiMJ#Pm)s_O(iXUgqW77nIvIY!jw!6%Is%3QL78B8#vvi8lJt!5(Kn%2@$6GD!d(VLU6ef#zC*GQ|b4YM~NO(k2W 
zbUo2pzKvloO`4i+w4`UO6G|#MMW`v6DXC~Hsmv=C{DRiOnM@Jmb0G65Ux_M$^B13Z z{Y#yTol9MdUD>+xpO7tY?{-DaLf~n0_QhU*P^#5zN-Sy6MT)bUv-D%f8T-t8CBn$X~Vm zM$UJ9)tB={*W)?g+3dM1Ip0+{#$CA-SPU#red=n=cc2sP`E$ruH6PoB>mh}aQe)1; z+U^-3KLT(fh(R&HFeZGQQ#k-YfsFxz`M9uv;#d`K`G&*=bQ2FbAP7avb150`ODf<) z)WRv{Af5$4s*QZVw~@~*q7FnkHp_e^RHLJSXon!*KB6sgOT5BsNa5bOQp7e6uykK4 zXR-jWn_0>?$Sfn9SV_{q@X<~TIGO}j4jiUS{&TxQ}!P3tCKEs!Ie#*TY z6Oh8s@pIe&I-nUgkCgaJr_kdW+>78w+z7i-6(BYL|K#81-mM1taO>x!m+x4XUSn35 zPCP8J4}er$)O@-yhEt{iT4ELtsV(BOJer{;4O31k#1d3ZF)8SbEZ#6F zV*wP1>nYi^gyB?5w*+%0O)R&f6PdEyXbHMzz#;(gDBuN%ibn)XFo+(fjZ6+6xA2s~ zJd(veWod%G1Y|a}?_e!uapz#g_!E$M^u$}eEU$#Nyp4GeYH0l6;=PM2m-gIZD6r!~ z4X1ARE_W|a?1`whd9`Ux`KaS#@niFkgMU8xhoS6?{aZ&b?eQoWTJHXt<7ssbglphl zV0GeAZDdE}f-ggCH=O!{$;uuY=BKMHJTxBN|O>VvWSW1GRX#CBEfc17ia>+9WWkoWZ$0TgOCm`kp8 z2YV_VfA!b))CiP$-ZMeOLV~&LZaXeG`v>>;4k|#-Tyt zT*E&BV@Gfdl7MJTLW%>!nuJ2`WFgF6JR;Ipp|j0m!?)!apSEirw0CwA>d6N!_gglDkD5EO6@8oPKPvh@iRUW% zvaUY*0iLtGyq>%0XjgpyDk>=UFtznM`@RjWMTxu4TYv`)CG0j6UKwihNKJ+rN@D zjAqaY)CM=5h8VtqF#KlFP5uU&#t|WA;c2Cek#G!?!XnoOeW=9o0h}tM;bC>@`%shr zf>mBT$UHw=7uJk$GG&G@4_u3d4MH_p*JkBmopePwdKylkEUkbUSbW=ND1^1;Ig$)I2!t>U$ls}sLE{kXY3*W8|M>d1Cn z%{9gf?!sBTqC-`plDG_&A48TD)E}0>!~$APCepU$pe}kHq)}!z3&#y+18`#7J1qY* z%Ftd|V=-s60|D1{`xi#<@>}luRs6)|TXHYDw|!OHKL3--y3azf|GGuD=Sv=W8Xmjr zbME?Y>{8~v<7fHbiCxY}z;ZzZK_vz6)D80J%ZyY4u^fPuT5`m8X3HG|pEuI7LTEGN zIKfDS5mu#GUiQ|-Y-?Z0*78GC@J;qm@VJFw>7tim$_#JJ1IhCk<6RfRm7gQ8{dfNp z)#M$gan-ix$YW1^&QqV~+3fTy>ph#H^-K4EP?-1jKdgGFJgogo%$`3L-Mb?2?G55e kd;u?q9tWFp!KOXLX8BW2>|x;p-TFzp+dQ+!X10d^1AG5Fy8r+H literal 0 HcmV?d00001 diff --git a/benchmarks/lib/endpoint_request_func.py b/benchmarks/lib/endpoint_request_func.py new file mode 100644 index 0000000..ed0fdec --- /dev/null +++ b/benchmarks/lib/endpoint_request_func.py @@ -0,0 +1,777 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""The request function for API endpoints.""" + +import io +import json +import os +import sys +import 
time +import traceback +from collections.abc import Awaitable +from dataclasses import dataclass, field +from typing import Any, Literal, Protocol + +import aiohttp +import regex as re +from tqdm.asyncio import tqdm + +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) + + +class StreamedResponseHandler: + """Handles streaming HTTP responses by accumulating chunks until complete + messages are available.""" + + def __init__(self): + self.buffer = "" + + def add_chunk(self, chunk_bytes: bytes) -> list[str]: + """Add a chunk of bytes to the buffer and return any complete + messages.""" + chunk_str = chunk_bytes.decode("utf-8") + self.buffer += chunk_str + + messages = [] + + # Split by double newlines (SSE message separator) + while "\n\n" in self.buffer: + message, self.buffer = self.buffer.split("\n\n", 1) + message = message.strip() + if message: + messages.append(message) + + # if self.buffer is not empty, check if it is a complete message + # by removing data: prefix and check if it is a valid JSON + if self.buffer.startswith("data: "): + message_content = self.buffer.removeprefix("data: ").strip() + if message_content == "[DONE]": + messages.append(self.buffer.strip()) + self.buffer = "" + elif message_content: + try: + json.loads(message_content) + messages.append(self.buffer.strip()) + self.buffer = "" + except json.JSONDecodeError: + # Incomplete JSON, wait for more chunks. 
+ pass + + return messages + + +@dataclass +class RequestFuncInput: + """The input for the request function.""" + + prompt: str | list[str] + api_url: str + prompt_len: int + output_len: int + model: str + model_name: str | None = None + logprobs: int | None = None + extra_headers: dict | None = None + extra_body: dict | None = None + multi_modal_content: dict | list[dict] | None = None + ignore_eos: bool = False + language: str | None = None + request_id: str | None = None + + +@dataclass +class RequestFuncOutput: + """The output of the request function including metrics.""" + + generated_text: str = "" + success: bool = False + latency: float = 0.0 + output_tokens: int = 0 + ttft: float = 0.0 # Time to first token + itl: list[float] = field(default_factory=list) # list of inter-token latencies + tpot: float = 0.0 # avg next-token latencies + prompt_len: int = 0 + error: str = "" + start_time: float = 0.0 + + +class RequestFunc(Protocol): + def __call__( + self, + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: tqdm | None = None, + ) -> Awaitable[RequestFuncOutput]: ... 
+ + +def _validate_api_url( + api_url: str, + api_name: str, + expected_suffixes: str | set[str], +) -> None: + if isinstance(expected_suffixes, str): + expected_suffixes = {expected_suffixes} + + expected_suffixes = {*expected_suffixes, "profile"} + + if not api_url.endswith(tuple(expected_suffixes)): + raise ValueError(f"{api_name} URL must end with one of: {expected_suffixes}.") + + +def _update_payload_common( + payload: dict[str, Any], + request_func_input: RequestFuncInput, +) -> None: + if request_func_input.ignore_eos: + payload["ignore_eos"] = request_func_input.ignore_eos + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) + + +def _update_headers_common( + headers: dict[str, Any], + request_func_input: RequestFuncInput, +) -> None: + if request_func_input.extra_headers: + headers |= request_func_input.extra_headers + if request_func_input.request_id: + headers["x-request-id"] = request_func_input.request_id + + +async def async_request_openai_completions( + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: tqdm | None = None, +) -> RequestFuncOutput: + """The async request function for the OpenAI Completions API. + + Args: + request_func_input: The input for the request function. + pbar: The progress bar to display the progress. + + Returns: + The output of the request function. 
+ """ + api_url = request_func_input.api_url + _validate_api_url(api_url, "OpenAI Completions API", "completions") + + payload = { + "model": request_func_input.model_name + if request_func_input.model_name + else request_func_input.model, + "prompt": request_func_input.prompt, + "temperature": 0.0, + "repetition_penalty": 1.0, + "max_tokens": request_func_input.output_len, + "logprobs": request_func_input.logprobs, + "stream": True, + "stream_options": { + "include_usage": True, + }, + } + _update_payload_common(payload, request_func_input) + + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + _update_headers_common(headers, request_func_input) + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + st = time.perf_counter() + output.start_time = st + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload, headers=headers) as response: + if response.status == 200: + first_chunk_received = False + handler = StreamedResponseHandler() + + async for chunk_bytes in response.content.iter_any(): + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + messages = handler.add_chunk(chunk_bytes) + for message in messages: + # NOTE: SSE comments (often used as pings) start with + # a colon. These are not JSON data payload and should + # be skipped. + if message.startswith(":"): + continue + + chunk = message.removeprefix("data: ") + + if chunk != "[DONE]": + data = json.loads(chunk) + + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated + if choices := data.get("choices"): + # Note that text could be empty here + # e.g. 
for special tokens + text = choices[0].get("text") + timestamp = time.perf_counter() + # First token + if not first_chunk_received: + first_chunk_received = True + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - most_recent_timestamp) + + most_recent_timestamp = timestamp + generated_text += text or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get("completion_tokens") + if first_chunk_received: + output.success = True + else: + output.success = False + output.error = ( + "Never received a valid chunk to calculate TTFT." + "This response will be marked as failed!" + ) + output.generated_text = generated_text + output.latency = most_recent_timestamp - st + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +def _get_chat_content( + request_func_input: RequestFuncInput, + mm_position: Literal["first", "last"] = "last", +) -> list[dict[str, Any]]: + text_contents = [{"type": "text", "text": request_func_input.prompt}] + + mm_contents = [] + if request_func_input.multi_modal_content: + mm_content = request_func_input.multi_modal_content + if isinstance(mm_content, list): + mm_contents.extend(request_func_input.multi_modal_content) + elif isinstance(mm_content, dict): + mm_contents.append(request_func_input.multi_modal_content) + else: + raise TypeError( + "multi_modal_content must be a dict or list[dict] for openai-chat" + ) + + if mm_position == "first": + return mm_contents + text_contents + + return text_contents + mm_contents + + +async def async_request_openai_chat_completions( + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: tqdm | None = None, + mm_position: Literal["first", "last"] = "last", +) -> RequestFuncOutput: + api_url = 
request_func_input.api_url + _validate_api_url(api_url, "OpenAI Chat Completions API", "chat/completions") + + content = _get_chat_content(request_func_input, mm_position=mm_position) + + payload = { + "model": request_func_input.model_name + if request_func_input.model_name + else request_func_input.model, + "messages": [ + {"role": "user", "content": content}, + ], + "temperature": 0.0, + "max_completion_tokens": request_func_input.output_len, + "stream": True, + "stream_options": { + "include_usage": True, + }, + } + _update_payload_common(payload, request_func_input) + + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + _update_headers_common(headers, request_func_input) + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + output.start_time = st + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload, headers=headers) as response: + if response.status == 200: + handler = StreamedResponseHandler() + async for chunk_bytes in response.content.iter_any(): + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + messages = handler.add_chunk(chunk_bytes) + for message in messages: + # NOTE: SSE comments (often used as pings) start with + # a colon. These are not JSON data payload and should + # be skipped. 
+ if message.startswith(":"): + continue + + chunk = message.removeprefix("data: ") + + if chunk != "[DONE]": + timestamp = time.perf_counter() + data = json.loads(chunk) + + if choices := data.get("choices"): + content = choices[0]["delta"].get("content") + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - most_recent_timestamp) + + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get("completion_tokens") + + most_recent_timestamp = timestamp + + output.generated_text = generated_text + output.success = True + output.latency = most_recent_timestamp - st + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_openai_audio( + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: tqdm | None = None, +) -> RequestFuncOutput: + # Lazy import without PlaceholderModule to avoid vllm dep. 
+ import soundfile + + api_url = request_func_input.api_url + _validate_api_url(api_url, "OpenAI Audio API", {"transcriptions", "translations"}) + + content = [{"type": "text", "text": request_func_input.prompt}] + payload = { + "model": request_func_input.model_name + if request_func_input.model_name + else request_func_input.model, + "temperature": 0.0, + "max_completion_tokens": request_func_input.output_len, + "stream": True, + "language": "en", + # Flattened due to multipart/form-data + "stream_include_usage": True, + "stream_continuous_usage_stats": True, + } + _update_payload_common(payload, request_func_input) + + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + _update_headers_common(headers, request_func_input) + + # Send audio file + def to_bytes(y, sr): + buffer = io.BytesIO() + soundfile.write(buffer, y, sr, format="WAV") + buffer.seek(0) + return buffer + + mm_audio = request_func_input.multi_modal_content + if not isinstance(mm_audio, dict) or "audio" not in mm_audio: + raise TypeError("multi_modal_content must be a dict containing 'audio'") + with to_bytes(*mm_audio["audio"]) as f: + form = aiohttp.FormData() + form.add_field("file", f, content_type="audio/wav") + for key, value in payload.items(): + form.add_field(key, str(value)) + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + output.start_time = st + most_recent_timestamp = st + try: + async with session.post( + url=api_url, data=form, headers=headers + ) as response: + if response.status == 200: + handler = StreamedResponseHandler() + + async for chunk_bytes in response.content.iter_any(): + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + messages = handler.add_chunk(chunk_bytes) + for message in messages: + chunk = message.decode("utf-8").removeprefix("data: ") + if chunk != "[DONE]": + timestamp = time.perf_counter() + data = 
json.loads(chunk) + + if choices := data.get("choices"): + content = choices[0]["delta"].get("content") + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append( + timestamp - most_recent_timestamp + ) + + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens" + ) + + most_recent_timestamp = timestamp + + output.generated_text = generated_text + output.success = True + output.latency = most_recent_timestamp - st + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def _run_pooling_request( + session: aiohttp.ClientSession, + api_url: str, + payload: dict[str, Any], + headers: dict[str, Any], + pbar: tqdm | None = None, +) -> RequestFuncOutput: + output = RequestFuncOutput() + st = time.perf_counter() + output.start_time = st + try: + async with session.post(url=api_url, headers=headers, json=payload) as response: + if response.status == 200: + output.ttft = output.latency = time.perf_counter() - st + + if payload.get("encoding_format", "float") == "bytes": + metadata = json.loads(response.headers["metadata"]) + usage = metadata.get("usage", {}) + else: + data = await response.json() + usage = data.get("usage", {}) + + output.success = True + output.generated_text = "" + output.prompt_len = usage.get("prompt_tokens", 0) + else: + output.success = False + output.error = response.reason or "" + except Exception as e: + output.success = False + output.error = str(e) + + if pbar: + pbar.update(1) + return output + + +async def async_request_openai_embeddings( + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: tqdm | None = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url 
+ _validate_api_url(api_url, "OpenAI Embeddings API", "embeddings") + + payload = { + "model": request_func_input.model_name + if request_func_input.model_name + else request_func_input.model, + "input": request_func_input.prompt, + # Many embedding models have short context length, + # this is to avoid dropping some of the requests. + "truncate_prompt_tokens": -1, + } + _update_payload_common(payload, request_func_input) + + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + _update_headers_common(headers, request_func_input) + + return await _run_pooling_request( + session, + api_url, + payload=payload, + headers=headers, + pbar=pbar, + ) + + +async def async_request_vllm_rerank( + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: tqdm | None = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + _validate_api_url(api_url, "vLLM score API", "rerank") + + assert ( + isinstance(request_func_input.prompt, list) + and len(request_func_input.prompt) > 1 + ) + + payload = { + "model": request_func_input.model_name + if request_func_input.model_name + else request_func_input.model, + "query": request_func_input.prompt[0], + "documents": request_func_input.prompt[1:], + # Many reranker models have short context length, + # this is to avoid dropping some of the requests. 
+ "truncate_prompt_tokens": -1, + } + + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + _update_headers_common(headers, request_func_input) + + return await _run_pooling_request( + session, + api_url, + payload=payload, + headers=headers, + pbar=pbar, + ) + + +async def async_request_openai_embeddings_chat( + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: tqdm | None = None, + mm_position: Literal["first", "last"] = "last", +) -> RequestFuncOutput: + api_url = request_func_input.api_url + _validate_api_url(api_url, "OpenAI Embeddings API", "embeddings") + + content = _get_chat_content(request_func_input, mm_position=mm_position) + + payload = { + "model": request_func_input.model_name + if request_func_input.model_name + else request_func_input.model, + "messages": [ + {"role": "user", "content": content}, + ], + # Many embedding models have short context length, + # this is to avoid dropping some of the requests. 
+ "truncate_prompt_tokens": -1, + } + _update_payload_common(payload, request_func_input) + + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + _update_headers_common(headers, request_func_input) + + return await _run_pooling_request( + session, + api_url, + payload=payload, + headers=headers, + pbar=pbar, + ) + + +def _try_extract_request_idx(request_func_input: RequestFuncInput): + if request_func_input.request_id: + match = re.search(r"(\d+)$", request_func_input.request_id) + if match: + try: + return int(match.group(1)) + except ValueError: + pass + + return None + + +def _preprocess_clip(request_func_input: RequestFuncInput): + if request_func_input.multi_modal_content: + # Image input + request_func_input.prompt = "" + + +def _preprocess_vlm2vec(request_func_input: RequestFuncInput): + if request_func_input.multi_modal_content: + request_idx = _try_extract_request_idx(request_func_input) + + # Adjust the ratio manually if needed. + use_image_only_prompt = request_idx is None or request_idx % 2 == 0 + + if use_image_only_prompt: + # Image input + request_func_input.prompt = "Represent the given image." 
+ else: + # Text+Image input + request_func_input.prompt = ( + f"Represent the given image with the following question: " + f"{request_func_input.prompt}" + ) + + +async def async_request_openai_embeddings_clip( + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: tqdm | None = None, +) -> RequestFuncOutput: + _preprocess_clip(request_func_input) + + return await async_request_openai_embeddings_chat( + request_func_input, + session, + pbar=pbar, + ) + + +async def async_request_openai_embeddings_vlm2vec( + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: tqdm | None = None, +) -> RequestFuncOutput: + _preprocess_vlm2vec(request_func_input) + + return await async_request_openai_embeddings_chat( + request_func_input, + session, + pbar=pbar, + mm_position="first", + ) + + +async def async_request_infinity_embeddings( + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: tqdm | None = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + _validate_api_url(api_url, "Infinity Embeddings API", "embeddings") + + payload = { + "model": request_func_input.model_name + if request_func_input.model_name + else request_func_input.model, + } + + if request_func_input.prompt: + payload["input"] = request_func_input.prompt + else: + mm_content = request_func_input.multi_modal_content + assert isinstance(mm_content, dict) + + mm_type = mm_content["type"] + payload["input"] = mm_content[mm_type]["url"] + payload["modality"] = mm_type.split("_", 1)[0] + + _update_payload_common(payload, request_func_input) + + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + _update_headers_common(headers, request_func_input) + + return await _run_pooling_request( + session, + api_url, + payload=payload, + headers=headers, + pbar=pbar, + ) + + +async def async_request_infinity_embeddings_clip( + request_func_input: 
RequestFuncInput, + session: aiohttp.ClientSession, + pbar: tqdm | None = None, +) -> RequestFuncOutput: + _preprocess_clip(request_func_input) + + return await async_request_infinity_embeddings( + request_func_input, + session, + pbar=pbar, + ) + + +# TODO: Add more request functions for different API protocols. +ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = { + "vllm": async_request_openai_completions, + "openai": async_request_openai_completions, + "openai-chat": async_request_openai_chat_completions, + "openai-audio": async_request_openai_audio, + "openai-embeddings": async_request_openai_embeddings, + "openai-embeddings-chat": async_request_openai_embeddings_chat, + "openai-embeddings-clip": async_request_openai_embeddings_clip, + "openai-embeddings-vlm2vec": async_request_openai_embeddings_vlm2vec, + # Infinity embedding server: https://github.com/michaelfeil/infinity + "infinity-embeddings": async_request_infinity_embeddings, + "infinity-embeddings-clip": async_request_infinity_embeddings_clip, + # (Infinity embedding server does not support vlm2vec) + "vllm-rerank": async_request_vllm_rerank, +} + +OPENAI_COMPATIBLE_BACKENDS = [ + k + for k, v in ASYNC_REQUEST_FUNCS.items() + if v in (async_request_openai_completions, async_request_openai_chat_completions) +] diff --git a/benchmarks/lib/ready_checker.py b/benchmarks/lib/ready_checker.py new file mode 100644 index 0000000..5649faf --- /dev/null +++ b/benchmarks/lib/ready_checker.py @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Utilities for checking endpoint readiness.""" + +import asyncio +import time + +import aiohttp +from tqdm.asyncio import tqdm + +from .endpoint_request_func import RequestFunc, RequestFuncInput, RequestFuncOutput + + +async def wait_for_endpoint( + request_func: RequestFunc, + test_input: RequestFuncInput, + session: aiohttp.ClientSession, + timeout_seconds: int = 600, + retry_interval: int = 5, +) -> 
RequestFuncOutput: + """ + Wait for an endpoint to become available before starting benchmarks. + + Args: + request_func: The async request function to call + test_input: The RequestFuncInput to test with + timeout_seconds: Maximum time to wait in seconds (default: 10 minutes) + retry_interval: Time between retries in seconds (default: 5 seconds) + + Returns: + RequestFuncOutput: The successful response + + Raises: + ValueError: If the endpoint doesn't become available within the timeout + """ + deadline = time.perf_counter() + timeout_seconds + output = RequestFuncOutput(success=False) + print(f"Waiting for endpoint to become up in {timeout_seconds} seconds") + + with tqdm( + total=timeout_seconds, + bar_format="{desc} |{bar}| {elapsed} elapsed, {remaining} remaining", + unit="s", + ) as pbar: + while True: + # update progress bar + remaining = deadline - time.perf_counter() + elapsed = timeout_seconds - remaining + update_amount = min(elapsed - pbar.n, timeout_seconds - pbar.n) + pbar.update(update_amount) + pbar.refresh() + if remaining <= 0: + pbar.close() + break + + # ping the endpoint using request_func + try: + output = await request_func( + request_func_input=test_input, session=session + ) + if output.success: + pbar.close() + return output + except aiohttp.ClientConnectorError: + pass + + # retry after a delay + sleep_duration = min(retry_interval, remaining) + if sleep_duration > 0: + await asyncio.sleep(sleep_duration) + + return output diff --git a/benchmarks/lib/utils.py b/benchmarks/lib/utils.py new file mode 100644 index 0000000..32e9db4 --- /dev/null +++ b/benchmarks/lib/utils.py @@ -0,0 +1,79 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import json +import math +import os +from typing import Any + + +def convert_to_pytorch_benchmark_format( + args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any] +) -> list: + """ + Save the benchmark 
results in the format used by PyTorch OSS benchmark with + on metric per record + https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database + """ + records = [] + if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False): + return records + + for name, benchmark_values in metrics.items(): + record = { + "benchmark": { + "name": "vLLM benchmark", + "extra_info": { + "args": vars(args), + }, + }, + "model": { + "name": args.model, + }, + "metric": { + "name": name, + "benchmark_values": benchmark_values, + "extra_info": extra_info, + }, + } + + tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size") + # Save tensor_parallel_size parameter if it's part of the metadata + if not tp and "tensor_parallel_size" in extra_info: + record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = ( + extra_info["tensor_parallel_size"] + ) + + records.append(record) + + return records + + +class InfEncoder(json.JSONEncoder): + def clear_inf(self, o: Any): + if isinstance(o, dict): + return { + str(k) + if not isinstance(k, (str, int, float, bool, type(None))) + else k: self.clear_inf(v) + for k, v in o.items() + } + elif isinstance(o, list): + return [self.clear_inf(v) for v in o] + elif isinstance(o, float) and math.isinf(o): + return "inf" + return o + + def iterencode(self, o: Any, *args, **kwargs) -> Any: + return super().iterencode(self.clear_inf(o), *args, **kwargs) + + +def write_to_json(filename: str, records: list) -> None: + with open(filename, "w") as f: + json.dump( + records, + f, + cls=InfEncoder, + default=lambda o: f"<{type(o).__name__} is not JSON serializable>", + ) diff --git a/benchmarks/serve.py b/benchmarks/serve.py new file mode 100644 index 0000000..dddb050 --- /dev/null +++ b/benchmarks/serve.py @@ -0,0 +1,1531 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +r"""Benchmark online serving throughput. 
+ +On the server side, run one of the following commands +to launch the vLLM OpenAI API server: + vllm serve + +On the client side, run: + vllm bench serve \ + --backend \ + --label \ + --model \ + --dataset-name \ + --request-rate \ + --num-prompts +""" + +import argparse +import asyncio +import contextlib +import importlib.util +import json +import os +import random +import shutil +import time +import uuid +import warnings +from collections.abc import AsyncGenerator, Iterable +from dataclasses import dataclass +from datetime import datetime +from enum import Enum +from typing import Any, Literal + +import aiohttp +import numpy as np +from tqdm.asyncio import tqdm +from transformers import PreTrainedTokenizerBase + +from vllm.benchmarks.datasets import SampleRequest, add_dataset_parser, get_samples +from vllm.benchmarks.lib.endpoint_request_func import ( + ASYNC_REQUEST_FUNCS, + OPENAI_COMPATIBLE_BACKENDS, + RequestFuncInput, + RequestFuncOutput, +) +from vllm.benchmarks.lib.ready_checker import wait_for_endpoint +from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json +from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.utils.gc_utils import freeze_gc_heap +from vllm.utils.network_utils import join_host_port + +MILLISECONDS_TO_SECONDS_CONVERSION = 1000 + +TERM_PLOTLIB_AVAILABLE = (importlib.util.find_spec("termplotlib") is not None) and ( + shutil.which("gnuplot") is not None +) + + +class TaskType(Enum): + GENERATION = "generation" + POOLING = "pooling" + + +@dataclass +class BenchmarkMetrics: + completed: int + failed: int + total_input: int + total_output: int + request_throughput: float + request_goodput: float + output_throughput: float + total_token_throughput: float + mean_ttft_ms: float + median_ttft_ms: float + std_ttft_ms: float + percentiles_ttft_ms: list[tuple[float, float]] + mean_tpot_ms: float + median_tpot_ms: float + std_tpot_ms: float + percentiles_tpot_ms: list[tuple[float, float]] + 
mean_itl_ms: float + median_itl_ms: float + std_itl_ms: float + percentiles_itl_ms: list[tuple[float, float]] + # E2EL stands for end-to-end latency per request. + # It is the time taken on the client side from sending + # a request to receiving a complete response. + mean_e2el_ms: float + median_e2el_ms: float + std_e2el_ms: float + percentiles_e2el_ms: list[tuple[float, float]] + # Max output tokens per second and concurrent requests at that peak + max_output_tokens_per_s: float + max_concurrent_requests: int + + +@dataclass +class EmbedBenchmarkMetrics: + completed: int + failed: int + total_input: int + request_throughput: float + total_token_throughput: float + mean_e2el_ms: float + std_e2el_ms: float + median_e2el_ms: float + percentiles_e2el_ms: float + + +def _get_current_request_rate( + ramp_up_strategy: Literal["linear", "exponential"] | None, + ramp_up_start_rps: int | None, + ramp_up_end_rps: int | None, + request_index: int, + total_requests: int, + request_rate: float, +) -> float: + if ( + ramp_up_strategy + and ramp_up_start_rps is not None + and ramp_up_end_rps is not None + ): + progress = request_index / max(total_requests - 1, 1) + if ramp_up_strategy == "linear": + increase = (ramp_up_end_rps - ramp_up_start_rps) * progress + return ramp_up_start_rps + increase + elif ramp_up_strategy == "exponential": + ratio = ramp_up_end_rps / ramp_up_start_rps + return ramp_up_start_rps * (ratio**progress) + else: + raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}") + return request_rate + + +async def get_request( + input_requests: list[SampleRequest], + request_rate: float, + burstiness: float = 1.0, + ramp_up_strategy: Literal["linear", "exponential"] | None = None, + ramp_up_start_rps: int | None = None, + ramp_up_end_rps: int | None = None, +) -> AsyncGenerator[tuple[SampleRequest, float], None]: + """ + Asynchronously generates requests at a specified rate + with OPTIONAL burstiness and OPTIONAL ramp-up strategy. 
+ + Args: + input_requests: + A list of input requests, each represented as a SampleRequest. + request_rate: + The rate at which requests are generated (requests/s). + burstiness (optional): + The burstiness factor of the request generation. + Only takes effect when request_rate is not inf. + Default value is 1, which follows a Poisson process. + Otherwise, the request intervals follow a gamma distribution. + A lower burstiness value (0 < burstiness < 1) results + in more bursty requests, while a higher burstiness value + (burstiness > 1) results in a more uniform arrival of requests. + ramp_up_strategy (optional): + The ramp-up strategy. Can be "linear" or "exponential". + If None, uses constant request rate (specified by request_rate). + ramp_up_start_rps (optional): + The starting request rate for ramp-up. + ramp_up_end_rps (optional): + The ending request rate for ramp-up. + """ + assert burstiness > 0, ( + f"A positive burstiness factor is expected, but given {burstiness}." + ) + # Convert to list to get length for ramp-up calculations + if isinstance(input_requests, Iterable) and not isinstance(input_requests, list): + input_requests = list(input_requests) + + total_requests = len(input_requests) + assert total_requests > 0, "No requests provided." + + # Precompute delays among requests to minimize request send laggings + request_rates = [] + delay_ts = [] + for request_index, request in enumerate(input_requests): + current_request_rate = _get_current_request_rate( + ramp_up_strategy, + ramp_up_start_rps, + ramp_up_end_rps, + request_index, + total_requests, + request_rate, + ) + assert current_request_rate > 0.0, ( + f"Obtained non-positive request rate {current_request_rate}." 
+ ) + request_rates.append(current_request_rate) + if current_request_rate == float("inf"): + delay_ts.append(0) + elif burstiness == float("inf"): + # when burstiness tends to infinity, the delay time becomes constant + # and tends to the inverse of the request rate + delay_ts.append(1.0 / current_request_rate) + else: + theta = 1.0 / (current_request_rate * burstiness) + + # Sample the request interval from the gamma distribution. + # If burstiness is 1, it follows exponential distribution. + delay_ts.append(np.random.gamma(shape=burstiness, scale=theta)) + + # Calculate the cumulative delay time from the first sent out requests. + for i in range(1, len(delay_ts)): + delay_ts[i] += delay_ts[i - 1] + if ramp_up_strategy is None and delay_ts[-1] != 0: + # When ramp_up_strategy is not set, we assume the request rate is fixed + # and all requests should be sent in target_total_delay_s, the following + # logic would re-scale delay time to ensure the final delay_ts + # align with target_total_delay_s. + # + # NOTE: If we simply accumulate the random delta values + # from the gamma distribution, their sum would have 1-2% gap + # from target_total_delay_s. The purpose of the following logic is to + # close the gap for stabilizing the throughput data + # from different random seeds. 
+ target_total_delay_s = total_requests / request_rate + normalize_factor = target_total_delay_s / delay_ts[-1] + delay_ts = [delay * normalize_factor for delay in delay_ts] + + start_ts = time.time() + for request_index, request in enumerate(input_requests): + if delay_ts[request_index] > 0: + current_ts = time.time() + sleep_interval_s = start_ts + delay_ts[request_index] - current_ts + if sleep_interval_s > 0: + await asyncio.sleep(sleep_interval_s) + yield request, request_rates[request_index] + + +def calculate_metrics_for_embeddings( + outputs: list[RequestFuncOutput], dur_s: float, selected_percentiles: list[float] +) -> EmbedBenchmarkMetrics: + """Calculate the metrics for the embedding requests. + + Args: + outputs: The outputs of the requests. + dur_s: The duration of the benchmark. + selected_percentiles: The percentiles to select. + + Returns: + The calculated benchmark metrics. + """ + total_input = 0 + completed = 0 + failed = 0 + e2els: list[float] = [] + for i in range(len(outputs)): + if outputs[i].success: + e2els.append(outputs[i].latency) + completed += 1 + total_input += outputs[i].prompt_len + else: + failed += 1 + + if completed == 0: + warnings.warn( + "All requests failed. 
This is likely due to a misconfiguration " + "on the benchmark arguments.", + stacklevel=2, + ) + metrics = EmbedBenchmarkMetrics( + completed=completed, + failed=failed, + total_input=total_input, + request_throughput=completed / dur_s, + total_token_throughput=total_input / dur_s, + mean_e2el_ms=np.mean(e2els or 0) * 1000, + std_e2el_ms=np.std(e2els or 0) * 1000, + median_e2el_ms=np.median(e2els or 0) * 1000, + percentiles_e2el_ms=[ + (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles + ], + ) + return metrics + + +def calculate_metrics( + input_requests: list[SampleRequest], + outputs: list[RequestFuncOutput], + dur_s: float, + tokenizer: PreTrainedTokenizerBase, + selected_percentiles: list[float], + goodput_config_dict: dict[str, float], +) -> tuple[BenchmarkMetrics, list[int]]: + """Calculate the metrics for the benchmark. + + Args: + input_requests: The input requests. + outputs: The outputs of the requests. + dur_s: The duration of the benchmark. + tokenizer: The tokenizer to use. + selected_percentiles: The percentiles to select. + goodput_config_dict: The goodput configuration. + + Returns: + A tuple of the benchmark metrics and the actual output lengths. 
+ """ + actual_output_lens: list[int] = [] + total_input = 0 + completed = 0 + good_completed = 0 + itls: list[float] = [] + tpots: list[float] = [] + all_tpots: list[float] = [] + ttfts: list[float] = [] + e2els: list[float] = [] + for i in range(len(outputs)): + if outputs[i].success: + output_len = outputs[i].output_tokens + + if not output_len: + # We use the tokenizer to count the number of output tokens + # for some serving backends instead of looking at + # len(outputs[i].itl) since multiple output tokens may be + # bundled together + # Note : this may inflate the output token count slightly + output_len = len( + tokenizer( + outputs[i].generated_text, add_special_tokens=False + ).input_ids + ) + actual_output_lens.append(output_len) + total_input += input_requests[i].prompt_len + tpot = 0 + if output_len > 1: + latency_minus_ttft = outputs[i].latency - outputs[i].ttft + tpot = latency_minus_ttft / (output_len - 1) + tpots.append(tpot) + # Note: if output_len <= 1, we regard tpot as 0 for goodput + all_tpots.append(tpot) + itls += outputs[i].itl + ttfts.append(outputs[i].ttft) + e2els.append(outputs[i].latency) + completed += 1 + else: + actual_output_lens.append(0) + + if goodput_config_dict: + valid_metrics = [] + slo_values = [] + + if "ttft" in goodput_config_dict: + valid_metrics.append(ttfts) + slo_values.append( + goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION + ) + if "tpot" in goodput_config_dict: + valid_metrics.append(all_tpots) + slo_values.append( + goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION + ) + if "e2el" in goodput_config_dict: + valid_metrics.append(e2els) + slo_values.append( + goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION + ) + + for req_metric in zip(*valid_metrics): + is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) + if is_good_req: + good_completed += 1 + + if completed == 0: + warnings.warn( + "All requests failed. 
This is likely due to a misconfiguration " + "on the benchmark arguments.", + stacklevel=2, + ) + + # Calculate max output tokens per second metric + max_output_tokens_per_s = 0.0 + max_concurrent_requests = 0 + + # Find the time range across all successful requests + successful_outputs = [output for output in outputs if output.success] + failed_outputs = [output for output in outputs if not output.success] + if successful_outputs: + min_start_time = min(output.start_time for output in successful_outputs) + max_end_time = max( + output.start_time + output.latency for output in successful_outputs + ) + + # Create second buckets (ceiling to ensure we capture all time) + duration_seconds = int(np.ceil(max_end_time - min_start_time)) + 1 + tokens_per_second = np.zeros(duration_seconds) + concurrent_requests_per_second = np.zeros(duration_seconds) + + for i, output in enumerate(successful_outputs): + # Calculate token generation timestamp using + # start_time, ttft, and itl + token_times = [output.start_time + output.ttft] + current_time = token_times[0] + for itl_value in output.itl: + current_time += itl_value + token_times.append(current_time) + + # Add tokens to second buckets + for token_time in token_times: + second_bucket = int(token_time - min_start_time) + if 0 <= second_bucket < duration_seconds: + tokens_per_second[second_bucket] += 1 + + # Track concurrent requests for each second this request was active + request_start_second = int(output.start_time - min_start_time) + request_end_second = int( + (output.start_time + output.latency) - min_start_time + ) + + for second in range(request_start_second, request_end_second + 1): + concurrent_requests_per_second[second] += 1 + + # Find the maximum tokens per second and corresponding + # concurrent requests + if len(tokens_per_second) > 0: + max_output_tokens_per_s = float(np.max(tokens_per_second)) + max_concurrent_requests = int(np.max(concurrent_requests_per_second)) + + if TERM_PLOTLIB_AVAILABLE: + import 
termplotlib as tpl + + fig = tpl.figure() + fig.plot( + np.arange(len(tokens_per_second)), + tokens_per_second, + title="Output tokens per second", + ) + fig.plot( + np.arange(len(concurrent_requests_per_second)), + concurrent_requests_per_second, + title="Concurrent requests per second", + ) + fig.show() + else: + print("tip: install termplotlib and gnuplot to plot the metrics") + + metrics = BenchmarkMetrics( + completed=completed, + failed=len(failed_outputs), + total_input=total_input, + total_output=sum(actual_output_lens), + request_throughput=completed / dur_s, + request_goodput=good_completed / dur_s, + output_throughput=sum(actual_output_lens) / dur_s, + total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, + mean_ttft_ms=np.mean(ttfts or 0) + * 1000, # ttfts is empty if streaming is not supported by the endpoint + std_ttft_ms=np.std(ttfts or 0) * 1000, + median_ttft_ms=np.median(ttfts or 0) * 1000, + percentiles_ttft_ms=[ + (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles + ], + mean_tpot_ms=np.mean(tpots or 0) * 1000, + std_tpot_ms=np.std(tpots or 0) * 1000, + median_tpot_ms=np.median(tpots or 0) * 1000, + percentiles_tpot_ms=[ + (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles + ], + mean_itl_ms=np.mean(itls or 0) * 1000, + std_itl_ms=np.std(itls or 0) * 1000, + median_itl_ms=np.median(itls or 0) * 1000, + percentiles_itl_ms=[ + (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles + ], + mean_e2el_ms=np.mean(e2els or 0) * 1000, + std_e2el_ms=np.std(e2els or 0) * 1000, + median_e2el_ms=np.median(e2els or 0) * 1000, + percentiles_e2el_ms=[ + (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles + ], + max_output_tokens_per_s=max_output_tokens_per_s, + max_concurrent_requests=max_concurrent_requests, + ) + + return metrics, actual_output_lens + + +async def benchmark( + task_type: TaskType, + endpoint_type: str, + api_url: str, + base_url: str, + model_id: str, 
+ model_name: str, + tokenizer: PreTrainedTokenizerBase, + input_requests: list[SampleRequest], + logprobs: int | None, + request_rate: float, + burstiness: float, + disable_tqdm: bool, + num_warmups: int, + profile: bool, + selected_percentile_metrics: list[str], + selected_percentiles: list[float], + ignore_eos: bool, + goodput_config_dict: dict[str, float], + max_concurrency: int | None, + lora_modules: Iterable[str] | None, + extra_headers: dict | None, + extra_body: dict | None, + ramp_up_strategy: Literal["linear", "exponential"] | None = None, + ramp_up_start_rps: int | None = None, + ramp_up_end_rps: int | None = None, + ready_check_timeout_sec: int = 600, +): + try: + request_func = ASYNC_REQUEST_FUNCS[endpoint_type] + except KeyError: + raise ValueError(f"Unknown backend: {endpoint_type}") from None + + # Reuses connections across requests to reduce TLS handshake overhead. + connector = aiohttp.TCPConnector( + limit=max_concurrency or 0, + limit_per_host=max_concurrency or 0, + ttl_dns_cache=300, + use_dns_cache=True, + keepalive_timeout=60, + enable_cleanup_closed=True, + force_close=False, + ssl=("https://" in api_url), + ) + + session = aiohttp.ClientSession( + connector=connector, + trust_env=True, + timeout=aiohttp.ClientTimeout(total=6 * 60 * 60), + ) + + print("Starting initial single prompt test run...") + test_prompt, test_prompt_len, test_output_len, test_mm_content = ( + input_requests[0].prompt, + input_requests[0].prompt_len, + input_requests[0].expected_output_len, + input_requests[0].multi_modal_data, + ) + + assert ( + test_mm_content is None + or isinstance(test_mm_content, dict) + or ( + isinstance(test_mm_content, list) + and all(isinstance(item, dict) for item in test_mm_content) + ) + ), "multi_modal_data must be a dict or list[dict]" + test_input = RequestFuncInput( + model=model_id, + model_name=model_name, + prompt=test_prompt, + api_url=api_url, + prompt_len=test_prompt_len, + output_len=test_output_len, + logprobs=logprobs, + 
multi_modal_content=test_mm_content, + ignore_eos=ignore_eos, + extra_headers=extra_headers, + extra_body=extra_body, + ) + + if ready_check_timeout_sec > 0: + test_output = await wait_for_endpoint( + request_func, + test_input, + session, + timeout_seconds=ready_check_timeout_sec, + ) + if not test_output.success: + raise ValueError( + "Initial test run failed - Please make sure benchmark " + "arguments are correctly specified. " + f"Error: {test_output.error}" + ) + else: + print("Initial test run completed.") + else: + print("Skipping endpoint ready check.") + + if num_warmups > 0: + print(f"Warming up with {num_warmups} requests...") + warmup_pbar = None if disable_tqdm else tqdm(total=num_warmups) + warmup_semaphore = ( + asyncio.Semaphore(max_concurrency) + if max_concurrency + else contextlib.nullcontext() + ) + warmup_tasks = [] + + async def warmup_limited_request_func(): + async with warmup_semaphore: + return await request_func( + request_func_input=test_input, session=session, pbar=warmup_pbar + ) + + for _ in range(num_warmups): + request_task = asyncio.create_task(warmup_limited_request_func()) + warmup_tasks.append(request_task) + _ = await asyncio.gather(*warmup_tasks) + + if warmup_pbar is not None: + warmup_pbar.close() + print("Warmup run completed.") + + print("Starting main benchmark run...") + + if lora_modules: + # For each input request, choose a LoRA module at random. 
+ lora_modules = iter( + [random.choice(lora_modules) for _ in range(len(input_requests))] + ) + + if profile: + print("Starting profiler...") + profile_input = RequestFuncInput( + model=model_id, + model_name=model_name, + prompt=test_prompt, + api_url=base_url + "/start_profile", + prompt_len=test_prompt_len, + output_len=test_output_len, + logprobs=logprobs, + multi_modal_content=test_mm_content, + ignore_eos=ignore_eos, + extra_headers=extra_headers, + extra_body=extra_body, + ) + profile_output = await request_func( + request_func_input=profile_input, session=session + ) + if profile_output.success: + print("Profiler started") + + distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution" + + if ramp_up_strategy is not None: + print(f"Traffic ramp-up strategy: {ramp_up_strategy}.") + print( + f"Will increase RPS from {ramp_up_start_rps} to " + f"{ramp_up_end_rps} RPS over the duration of the benchmark." + ) + else: + print(f"Traffic request rate: {request_rate}") + + print(f"Burstiness factor: {burstiness} ({distribution})") + print(f"Maximum request concurrency: {max_concurrency}") + + pbar = None if disable_tqdm else tqdm(total=len(input_requests)) + + semaphore = ( + asyncio.Semaphore(max_concurrency) + if max_concurrency + else contextlib.nullcontext() + ) + + async def limited_request_func(request_func_input, session, pbar): + async with semaphore: + return await request_func( + request_func_input=request_func_input, session=session, pbar=pbar + ) + + benchmark_start_time = time.perf_counter() + tasks: list[asyncio.Task] = [] + + rps_change_events = [] + last_int_rps = -1 + if ramp_up_strategy is not None and ramp_up_start_rps is not None: + last_int_rps = ramp_up_start_rps + rps_change_events.append( + { + "rps": last_int_rps, + "timestamp": datetime.now().isoformat(), + } + ) + + async for request, current_request_rate in get_request( + input_requests, + request_rate, + burstiness, + ramp_up_strategy, + ramp_up_start_rps, + 
ramp_up_end_rps, + ): + if ramp_up_strategy is not None: + current_int_rps = int(current_request_rate) + if current_int_rps > last_int_rps: + timestamp = datetime.now().isoformat() + for rps_val in range(last_int_rps + 1, current_int_rps + 1): + rps_change_events.append({"rps": rps_val, "timestamp": timestamp}) + last_int_rps = current_int_rps + prompt, prompt_len, output_len, mm_content, request_id = ( + request.prompt, + request.prompt_len, + request.expected_output_len, + request.multi_modal_data, + request.request_id, + ) + req_model_id, req_model_name = model_id, model_name + if lora_modules: + req_lora_module = next(lora_modules) + req_model_id, req_model_name = req_lora_module, req_lora_module + + request_func_input = RequestFuncInput( + model=req_model_id, + model_name=req_model_name, + prompt=prompt, + api_url=api_url, + prompt_len=prompt_len, + output_len=output_len, + logprobs=logprobs, + multi_modal_content=mm_content, + ignore_eos=ignore_eos, + extra_headers=extra_headers, + extra_body=extra_body, + request_id=request_id, + ) + tasks.append( + asyncio.create_task( + limited_request_func( + request_func_input=request_func_input, session=session, pbar=pbar + ) + ) + ) + outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) + + if pbar is not None: + pbar.close() + + benchmark_duration = time.perf_counter() - benchmark_start_time + + if task_type == TaskType.GENERATION: + metrics, actual_output_lens = calculate_metrics( + input_requests=input_requests, + outputs=outputs, + dur_s=benchmark_duration, + tokenizer=tokenizer, + selected_percentiles=selected_percentiles, + goodput_config_dict=goodput_config_dict, + ) + else: + metrics = calculate_metrics_for_embeddings( + outputs=outputs, + dur_s=benchmark_duration, + selected_percentiles=selected_percentiles, + ) + actual_output_lens = 0 + + print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) + print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + 
print("{:<40} {:<10}".format("Failed requests:", metrics.failed)) + if max_concurrency is not None: + print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency)) + if request_rate != float("inf"): + print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) + print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) + if isinstance(metrics, BenchmarkMetrics): + print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) + print( + "{:<40} {:<10.2f}".format( + "Request throughput (req/s):", metrics.request_throughput + ) + ) + if goodput_config_dict: + print( + "{:<40} {:<10.2f}".format( + "Request goodput (req/s):", metrics.request_goodput + ) + ) + if isinstance(metrics, BenchmarkMetrics): + print( + "{:<40} {:<10.2f}".format( + "Output token throughput (tok/s):", metrics.output_throughput + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Peak output token throughput (tok/s):", metrics.max_output_tokens_per_s + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Peak concurrent requests:", metrics.max_concurrent_requests + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Total Token throughput (tok/s):", metrics.total_token_throughput + ) + ) + + if isinstance(metrics, BenchmarkMetrics): + result = { + "duration": benchmark_duration, + "completed": metrics.completed, + "failed": metrics.failed, + "total_input_tokens": metrics.total_input, + "total_output_tokens": metrics.total_output, + "request_throughput": metrics.request_throughput, + "request_goodput": metrics.request_goodput if goodput_config_dict else None, + "output_throughput": metrics.output_throughput, + "total_token_throughput": metrics.total_token_throughput, + "input_lens": [output.prompt_len for output in outputs], + "output_lens": actual_output_lens, + "ttfts": [output.ttft for output in outputs], + "itls": [output.itl for output in outputs], + 
"generated_texts": [output.generated_text for output in outputs], + "errors": [output.error for output in outputs], + "max_output_tokens_per_s": metrics.max_output_tokens_per_s, + "max_concurrent_requests": metrics.max_concurrent_requests, + } + else: + result = { + "duration": benchmark_duration, + "completed": metrics.completed, + "total_input_tokens": metrics.total_input, + "request_throughput": metrics.request_throughput, + "total_token_throughput": metrics.total_token_throughput, + "input_lens": [output.prompt_len for output in outputs], + "errors": [output.error for output in outputs], + } + + if rps_change_events: + result["rps_change_events"] = rps_change_events + + def process_one_metric( + # E.g., "ttft" + metric_attribute_name: str, + # E.g., "TTFT" + metric_name: str, + # E.g., "Time to First Token" + metric_header: str, + ): + # This function prints and adds statistics of the specified + # metric. + if metric_attribute_name not in selected_percentile_metrics: + return + print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) + print( + "{:<40} {:<10.2f}".format( + f"Mean {metric_name} (ms):", + getattr(metrics, f"mean_{metric_attribute_name}_ms"), + ) + ) + print( + "{:<40} {:<10.2f}".format( + f"Median {metric_name} (ms):", + getattr(metrics, f"median_{metric_attribute_name}_ms"), + ) + ) + result[f"mean_{metric_attribute_name}_ms"] = getattr( + metrics, f"mean_{metric_attribute_name}_ms" + ) + result[f"median_{metric_attribute_name}_ms"] = getattr( + metrics, f"median_{metric_attribute_name}_ms" + ) + result[f"std_{metric_attribute_name}_ms"] = getattr( + metrics, f"std_{metric_attribute_name}_ms" + ) + for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"): + p_word = str(int(p)) if int(p) == p else str(p) + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value)) + result[f"p{p_word}_{metric_attribute_name}_ms"] = value + + if task_type == TaskType.GENERATION: + process_one_metric("ttft", "TTFT", "Time to 
First Token") + process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)") + process_one_metric("itl", "ITL", "Inter-token Latency") + process_one_metric("e2el", "E2EL", "End-to-end Latency") + + print("=" * 50) + + if profile: + print("Stopping profiler...") + profile_input = RequestFuncInput( + model=model_id, + prompt=test_prompt, + api_url=base_url + "/stop_profile", + prompt_len=test_prompt_len, + output_len=test_output_len, + logprobs=logprobs, + ) + profile_output = await request_func( + request_func_input=profile_input, session=session + ) + if profile_output.success: + print("Profiler stopped") + + await session.close() + return result + + +def check_goodput_args(args): + # Check and parse goodput arguments + goodput_config_dict = {} + VALID_NAMES = ["ttft", "tpot", "e2el"] + if args.goodput: + goodput_config_dict = parse_goodput(args.goodput) + for slo_name, slo_val in goodput_config_dict.items(): + if slo_name not in VALID_NAMES: + raise ValueError( + f"Invalid metric name found, {slo_name}: {slo_val}. " + "The service level objective name should be one of " + f"{str(VALID_NAMES)}. " + ) + if slo_val < 0: + raise ValueError( + f"Invalid value found, {slo_name}: {slo_val}. " + "The service level objective value should be " + "non-negative." + ) + return goodput_config_dict + + +def parse_goodput(slo_pairs): + goodput_config_dict = {} + try: + for slo_pair in slo_pairs: + slo_name, slo_val = slo_pair.split(":") + goodput_config_dict[slo_name] = float(slo_val) + except ValueError as err: + raise argparse.ArgumentTypeError( + "Invalid format found for service level objectives. " + 'Specify service level objectives for goodput as "KEY:VALUE" ' + "pairs, where the key is a metric name, and the value is a " + "number in milliseconds." 
+ ) from err + return goodput_config_dict + + +def save_to_pytorch_benchmark_format( + args: argparse.Namespace, results: dict[str, Any], file_name: str +) -> None: + metrics = [ + "median_ttft_ms", + "mean_ttft_ms", + "std_ttft_ms", + "p99_ttft_ms", + "mean_tpot_ms", + "median_tpot_ms", + "std_tpot_ms", + "p99_tpot_ms", + "median_itl_ms", + "mean_itl_ms", + "std_itl_ms", + "p99_itl_ms", + ] + # These raw data might be useful, but they are rather big. They can be added + # later if needed + ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"] + pt_records = convert_to_pytorch_benchmark_format( + args=args, + metrics={k: [results[k]] for k in metrics if k in results}, + extra_info={ + k: results[k] + for k in results + if k not in metrics and k not in ignored_metrics + }, + ) + if pt_records: + # Don't use json suffix here as we don't want CI to pick it up + pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json" + write_to_json(pt_file, pt_records) + + +def add_cli_args(parser: argparse.ArgumentParser): + add_dataset_parser(parser) + parser.add_argument( + "--label", + type=str, + default=None, + help="The label (prefix) of the benchmark results. 
If not specified, " + "the value of '--backend' will be used as the label.", + ) + parser.add_argument( + "--backend", + type=str, + default="openai", + choices=list(ASYNC_REQUEST_FUNCS.keys()), + help="The type of backend or endpoint to use for the benchmark.", + ) + parser.add_argument( + "--base-url", + type=str, + default=None, + help="Server or API base url if not using http host and port.", + ) + # Use 127.0.0.1 here instead of localhost to force the use of ipv4 + parser.add_argument("--host", type=str, default="127.0.0.1") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument( + "--endpoint", + type=str, + default="/v1/completions", + help="API endpoint.", + ) + parser.add_argument( + "--header", + metavar="KEY=VALUE", + nargs="*", + help="Key-value pairs (e.g, --header x-additional-info=0.3.3) " + "for headers to be passed with each request. These headers override " + "per backend constants and values set via environment variable, and " + "will be overriden by other arguments (such as request ids).", + ) + parser.add_argument( + "--max-concurrency", + type=int, + default=None, + help="Maximum number of concurrent requests. This can be used " + "to help simulate an environment where a higher level component " + "is enforcing a maximum number of concurrent requests. While the " + "--request-rate argument controls the rate at which requests are " + "initiated, this argument will control how many are actually allowed " + "to execute at a time. 
This means that when used in combination, the " + "actual request rate may be lower than specified with --request-rate, " + "if the server is not processing requests fast enough to keep up.", + ) + + parser.add_argument( + "--model", + type=str, + required=True, + help="Name of the model.", + ) + parser.add_argument( + "--tokenizer", + type=str, + help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + ) + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument( + "--logprobs", + type=int, + default=None, + help=( + "Number of logprobs-per-token to compute & return as part of " + "the request. If unspecified, then either (1) if beam search " + "is disabled, no logprobs are computed & a single dummy " + "logprob is returned for each token; or (2) if beam search " + "is enabled 1 logprob per token is computed" + ), + ) + parser.add_argument( + "--request-rate", + type=float, + default=float("inf"), + help="Number of requests per second. If this is inf, " + "then all the requests are sent at time 0. " + "Otherwise, we use Poisson process or gamma distribution " + "to synthesize the request arrival times.", + ) + parser.add_argument( + "--burstiness", + type=float, + default=1.0, + help="Burstiness factor of the request generation. " + "Only take effect when request_rate is not inf. " + "Default value is 1, which follows Poisson process. " + "Otherwise, the request intervals follow a gamma distribution. " + "A lower burstiness value (0 < burstiness < 1) results in more " + "bursty requests. 
A higher burstiness value (burstiness > 1) " + "results in a more uniform arrival of requests.", + ) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Trust remote code from huggingface", + ) + parser.add_argument( + "--disable-tqdm", + action="store_true", + help="Specify to disable tqdm progress bar.", + ) + parser.add_argument( + "--num-warmups", + type=int, + default=0, + help="Number of warmup requests.", + ) + parser.add_argument( + "--profile", + action="store_true", + help="Use Torch Profiler. The endpoint must be launched with " + "VLLM_TORCH_PROFILER_DIR to enable profiler.", + ) + parser.add_argument( + "--save-result", + action="store_true", + help="Specify to save benchmark results to a json file", + ) + parser.add_argument( + "--save-detailed", + action="store_true", + help="When saving the results, whether to include per request " + "information such as response, error, ttfs, tpots, etc.", + ) + parser.add_argument( + "--append-result", + action="store_true", + help="Append the benchmark result to the existing json file.", + ) + parser.add_argument( + "--metadata", + metavar="KEY=VALUE", + nargs="*", + help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) " + "for metadata of this run to be saved in the result JSON file " + "for record keeping purposes.", + ) + parser.add_argument( + "--result-dir", + type=str, + default=None, + help="Specify directory to save benchmark json results." + "If not specified, results are saved in the current directory.", + ) + parser.add_argument( + "--result-filename", + type=str, + default=None, + help="Specify the filename to save benchmark json results." + "If not specified, results will be saved in " + "{label}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" # noqa + " format.", + ) + parser.add_argument( + "--ignore-eos", + action="store_true", + help="Set ignore_eos flag when sending the benchmark request." 
+ "Warning: ignore_eos is not supported in deepspeed_mii and tgi.", + ) + parser.add_argument( + "--percentile-metrics", + type=str, + default=None, + help="Comma-separated list of selected metrics to report percentils. " + "This argument specifies the metrics to report percentiles. " + 'Allowed metric names are "ttft", "tpot", "itl", "e2el". ' + 'If not specified, defaults to "ttft,tpot,itl" for generative models ' + 'and "e2el" for pooling models.', + ) + parser.add_argument( + "--metric-percentiles", + type=str, + default="99", + help="Comma-separated list of percentiles for selected metrics. " + 'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". ' + 'Default value is "99".' + 'Use "--percentile-metrics" to select metrics.', + ) + parser.add_argument( + "--goodput", + nargs="+", + required=False, + help='Specify service level objectives for goodput as "KEY:VALUE" ' + "pairs, where the key is a metric name, and the value is in " + 'milliseconds. Multiple "KEY:VALUE" pairs can be provided, ' + "separated by spaces. Allowed request level metric names are " + '"ttft", "tpot", "e2el". For more context on the definition of ' + "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " + "and the blog: https://hao-ai-lab.github.io/blogs/distserve", + ) + parser.add_argument( + "--request-id-prefix", + type=str, + required=False, + default=f"bench-{uuid.uuid4().hex[:8]}-", + help="Specify the prefix of request id.", + ) + + sampling_group = parser.add_argument_group("sampling parameters") + sampling_group.add_argument( + "--top-p", + type=float, + default=None, + help="Top-p sampling parameter. Only has effect on openai-compatible backends.", + ) + sampling_group.add_argument( + "--top-k", + type=int, + default=None, + help="Top-k sampling parameter. Only has effect on openai-compatible backends.", + ) + sampling_group.add_argument( + "--min-p", + type=float, + default=None, + help="Min-p sampling parameter. 
Only has effect on openai-compatible backends.", + ) + sampling_group.add_argument( + "--temperature", + type=float, + default=None, + help="Temperature sampling parameter. Only has effect on " + "openai-compatible backends. If not specified, default to greedy " + "decoding (i.e. temperature==0.0).", + ) + sampling_group.add_argument( + "--frequency-penalty", + type=float, + default=None, + help="Frequency penalty sampling parameter. Only has effect on " + "openai-compatible backends.", + ) + sampling_group.add_argument( + "--presence-penalty", + type=float, + default=None, + help="Presence penalty sampling parameter. Only has effect on " + "openai-compatible backends.", + ) + sampling_group.add_argument( + "--repetition-penalty", + type=float, + default=None, + help="Repetition penalty sampling parameter. Only has effect on " + "openai-compatible backends.", + ) + + parser.add_argument( + "--tokenizer-mode", + type=str, + default="auto", + choices=["auto", "slow", "mistral", "custom"], + help='The tokenizer mode.\n\n* "auto" will use the ' + 'fast tokenizer if available.\n* "slow" will ' + "always use the slow tokenizer. \n* " + '"mistral" will always use the `mistral_common` tokenizer. \n*' + '"custom" will use --tokenizer to select the preregistered tokenizer.', + ) + + parser.add_argument( + "--served-model-name", + type=str, + default=None, + help="The model name used in the API. " + "If not specified, the model name will be the " + "same as the `--model` argument. ", + ) + + parser.add_argument( + "--lora-modules", + nargs="+", + default=None, + help="A subset of LoRA module names passed in when " + "launching the server. For each request, the " + "script chooses a LoRA module at random.", + ) + + parser.add_argument( + "--ramp-up-strategy", + type=str, + default=None, + choices=["linear", "exponential"], + help="The ramp-up strategy. 
This would be used to " + "ramp up the request rate from initial RPS to final " + "RPS rate (specified by --ramp-up-start-rps and " + "--ramp-up-end-rps.) over the duration of the benchmark.", + ) + parser.add_argument( + "--ramp-up-start-rps", + type=int, + default=None, + help="The starting request rate for ramp-up (RPS). " + "Needs to be specified when --ramp-up-strategy is used.", + ) + parser.add_argument( + "--ramp-up-end-rps", + type=int, + default=None, + help="The ending request rate for ramp-up (RPS). " + "Needs to be specified when --ramp-up-strategy is used.", + ) + parser.add_argument( + "--ready-check-timeout-sec", + type=int, + default=600, + help="Maximum time to wait for the endpoint to become ready " + "in seconds (default: 600 seconds / 10 minutes). If set to 0, " + "the ready check will be skipped.", + ) + + parser.add_argument( + "--extra-body", + help="A JSON string representing extra body parameters to include " + "in each request." + 'Example: \'{"chat_template_kwargs":{"enable_thinking":false}}\'', + type=json.loads, + default=None, + ) + + +def main(args: argparse.Namespace) -> dict[str, Any]: + return asyncio.run(main_async(args)) + + +async def main_async(args: argparse.Namespace) -> dict[str, Any]: + print(args) + random.seed(args.seed) + np.random.seed(args.seed) + + # Validate ramp-up arguments + if args.ramp_up_strategy is not None: + if args.request_rate != float("inf"): + raise ValueError( + "When using ramp-up, do not specify --request-rate. " + "The request rate will be controlled by ramp-up parameters. " + "Please remove the --request-rate argument." 
+ ) + if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None: + raise ValueError( + "When using --ramp-up-strategy, both --ramp-up-start-rps and " + "--ramp-up-end-rps must be specified" + ) + if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0: + raise ValueError("Ramp-up start and end RPS must be non-negative") + if args.ramp_up_start_rps > args.ramp_up_end_rps: + raise ValueError("Ramp-up start RPS must be less than end RPS") + if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0: + raise ValueError("For exponential ramp-up, the start RPS cannot be 0.") + + label = args.label + model_id = args.model + model_name = args.served_model_name + tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model + tokenizer_mode = args.tokenizer_mode + + if args.base_url is not None: + api_url = f"{args.base_url}{args.endpoint}" + base_url = f"{args.base_url}" + else: + host_port = join_host_port(args.host, args.port) + api_url = f"http://{host_port}{args.endpoint}" + base_url = f"http://{host_port}" + + # Headers + headers = None + if args.header: + headers = {} + for item in args.header: + if "=" in item: + kvstring = item.split("=", 1) + headers[kvstring[0].strip()] = kvstring[1].strip() + else: + raise ValueError("Invalid header format. Please use KEY=VALUE format.") + + tokenizer = get_tokenizer( + tokenizer_id, + tokenizer_mode=tokenizer_mode, + trust_remote_code=args.trust_remote_code, + ) + + if args.dataset_name is None: + raise ValueError( + "Please specify '--dataset-name' and the corresponding " + "'--dataset-path' if required." + ) + + # when using random datasets, default to ignoring EOS + # so generation runs to the requested length + if ( + args.dataset_name in ("random", "random-mm") + and args.backend in OPENAI_COMPATIBLE_BACKENDS + ): + args.ignore_eos = True + + # Load the dataset. 
+ input_requests = get_samples(args, tokenizer) + goodput_config_dict = check_goodput_args(args) + + backend = args.backend + task_type = ( + TaskType.POOLING + if "embeddings" in backend or "rerank" in backend + else TaskType.GENERATION + ) + + # Collect the sampling parameters. + if task_type == TaskType.GENERATION: + sampling_params = { + k: v + for k, v in { + "top_p": args.top_p, + "top_k": args.top_k, + "min_p": args.min_p, + "temperature": args.temperature, + "frequency_penalty": args.frequency_penalty, + "presence_penalty": args.presence_penalty, + "repetition_penalty": args.repetition_penalty, + }.items() + if v is not None + } + + # Sampling parameters are only supported by openai-compatible backend. + if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS: + raise ValueError( + "Sampling parameters are only supported by openai-compatible backends." + ) + + if "temperature" not in sampling_params: + sampling_params["temperature"] = 0.0 # Default to greedy decoding. + + default_percentile_metrics = "ttft,tpot,itl" + else: + sampling_params = {} + default_percentile_metrics = "e2el" + + extra_body = args.extra_body or {} + extra_body = {**sampling_params, **extra_body} + + percentile_metrics: str = args.percentile_metrics or default_percentile_metrics + + # Avoid GC processing "static" data - reduce pause times. 
+ freeze_gc_heap() + + benchmark_result = await benchmark( + task_type=task_type, + endpoint_type=backend, + api_url=api_url, + base_url=base_url, + model_id=model_id, + model_name=model_name, + tokenizer=tokenizer, + input_requests=input_requests, + logprobs=args.logprobs, + request_rate=args.request_rate, + burstiness=args.burstiness, + disable_tqdm=args.disable_tqdm, + num_warmups=args.num_warmups, + profile=args.profile, + selected_percentile_metrics=percentile_metrics.split(","), + selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")], + ignore_eos=args.ignore_eos, + goodput_config_dict=goodput_config_dict, + max_concurrency=args.max_concurrency, + lora_modules=args.lora_modules, + extra_headers=headers, + extra_body=extra_body, + ramp_up_strategy=args.ramp_up_strategy, + ramp_up_start_rps=args.ramp_up_start_rps, + ramp_up_end_rps=args.ramp_up_end_rps, + ready_check_timeout_sec=args.ready_check_timeout_sec, + ) + + # Save config and results to json + result_json: dict[str, Any] = {} + + # Setup + current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") + result_json["date"] = current_dt + result_json["endpoint_type"] = args.backend # for backward compatibility + result_json["backend"] = args.backend + result_json["label"] = label + result_json["model_id"] = model_id + result_json["tokenizer_id"] = tokenizer_id + result_json["num_prompts"] = args.num_prompts + + # Metadata + if args.metadata: + for item in args.metadata: + if "=" in item: + kvstring = item.split("=", 1) + result_json[kvstring[0].strip()] = kvstring[1].strip() + else: + raise ValueError( + "Invalid metadata format. Please use KEY=VALUE format." 
+ ) + + # Traffic + result_json["request_rate"] = ( + args.request_rate if args.request_rate < float("inf") else "inf" + ) + result_json["burstiness"] = args.burstiness + result_json["max_concurrency"] = args.max_concurrency + + if args.ramp_up_strategy is not None: + result_json["ramp_up_strategy"] = args.ramp_up_strategy + result_json["ramp_up_start_rps"] = args.ramp_up_start_rps + result_json["ramp_up_end_rps"] = args.ramp_up_end_rps + + # Merge with benchmark result + result_json = {**result_json, **benchmark_result} + + if not args.save_detailed: + # Remove fields with too many data points + for field in [ + "input_lens", + "output_lens", + "ttfts", + "itls", + "generated_texts", + "errors", + ]: + if field in result_json: + del result_json[field] + if field in benchmark_result: + del benchmark_result[field] + + # Save to file + if args.save_result or args.append_result: + base_model_id = model_id.split("/")[-1] + max_concurrency_str = ( + f"-concurrency{args.max_concurrency}" + if args.max_concurrency is not None + else "" + ) + label = label or args.backend + if args.ramp_up_strategy is not None: + file_name = f"{label}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa + else: + file_name = f"{label}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa + if args.result_filename: + file_name = args.result_filename + if args.result_dir: + os.makedirs(args.result_dir, exist_ok=True) + file_name = os.path.join(args.result_dir, file_name) + with open( + file_name, mode="a+" if args.append_result else "w", encoding="utf-8" + ) as outfile: + # Append a newline. 
+ if args.append_result and outfile.tell() != 0: + outfile.write("\n") + json.dump(result_json, outfile) + save_to_pytorch_benchmark_format(args, result_json, file_name) + + return result_json diff --git a/benchmarks/sweep/__init__.py b/benchmarks/sweep/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/sweep/__pycache__/__init__.cpython-312.pyc b/benchmarks/sweep/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..588992f20e7b6cbb83b5bd9348d96458922f8d5f GIT binary patch literal 166 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJVx$2kX7U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?lc@&rWWYO$7kkc nmc+;F6;$5hu*uC&Da}c>D`Ewj$_T{8AjU^#Mn=XWW*`dy3H&JU literal 0 HcmV?d00001 diff --git a/benchmarks/sweep/__pycache__/cli.cpython-312.pyc b/benchmarks/sweep/__pycache__/cli.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ebabb0f8928a350679637b86ac58574e0c5563d1 GIT binary patch literal 1681 zcmZ`(L2nyH6rSB3uh(lku1gy1CXh`~5-j1k2Ske$(VC{Bt&%uufPDZJkoW&mY0rq8O z56h{YYkrs;+Q1h(=<8T5$fSR-_c^gYoNW^F0WNhaHQ>JVDslMRLfK!UG( z71(>Ij!MJcev~fg;-PdP_RJl@?IVO1(akRr`V{OPaYw2PC0{>fNY%xXJY@DEu>9Zu z89M&|=x>qkdmtUk2fS7oJGhQZ!5$u#b?MN5&|iUoV6O16w}Npn|0CK)kr&1$F>Mxdh?~<%F<&)Z zhlO}f!8CGtXe7!Hfyu=Rwel{LiEWsbjf9W#U4Uhu{8oZ$m`chnt=Sc$?6NQrLv`_H z(fIiLQp&aLlx^5a7ydxpW z6RfoG*(N@_|MnBC(J3GkQKmp8UMsLMzRU9`XXeaTUBm33Ikw^Hq5RmqX<2~VIsOJQ zAG|DEg!;h;J-~5~5A?8O<|)7LY~+RQ`RS-frs+o2f}gH>FO+wVEU*Gi3Ef^GkC0cV zw75{HB&P&*9?F@=N>3Wb_NMxf)PSwc{!rmbA#dqbQ!wmJ6( z63@|4hqN$SE!Eay)=o7l(u~-&?AKbYKdAa+ttL(JOq=YMNlfdH^`B{|N~`_Y``+v8 z*kFqGV<*ac&&PYd-jC_(aQgbXpC3-E-`8b*7 z)Vv4?85c~R)o+5kmY+GxgDBz8oJ&c9jGMw=oj`ufn=^(<8|fhmJ;PD*9HH>{O7k@T z963Y1+XgC^>w5m}So%Ay~7-fT7{U%{h# zpnUuNZgn!&9Sg;_r~!6TR<}$~?cDXm#7?<896He*n^DK2k?zys@MQN1IT9S346q5M zTS38f$54)bz1|g@(d+D>#b$)Gy{>AFnZxvkS)dnB`zSt*PnG;&& zRTqJgQq`#i6`N{M8dVY4^deEUppXPL)$k_ybwPW6`Ow~5uC 
z#1$bW6j9VbBq)QKgZwo6agfph#-dCUP+(q;Rl|z9H7G^LWL3}`Z~2Y`;$@V*K)@!Q zP05W{w%>4e-mGiMan#jzvu$JA`p%wIcaEpEy(>KLa^whK?YLb5_L;$xXX^&D&cQ|7 z;0i1O=vJlK{Qd|4jNh+S`2CYnX)28MD!>2KR3MxmsSJh#iUKG!7L_zEs%W-BK%k?c z37OTxGOPuNCZL6AyrMGA5pZ>sJ$ zEtgxqBv4)9iKSNW%vI?%t2jhc+tWgNc%dWxe5P|(rfv6^1bcPX+ZjRu-V^r(lq!pe@5YU;yzwP;CXE7GO00LX2{AWRwC-SB===?GS<6a!wej$k(k< zuguqNP_M$eWS6T)1*rns4yjUh%KBRbz4;o9Rl!(|cCHH z7+#2>C6>pImtf@ofgP(338&x&;)f0df@7W#_?;&d@u&}hjsj%N+z%LDfi>fu+L5e7 zf-YwIh3<33e$fDf@kdJi0^PsZd2tox2_S}RbX?hR!@058H7xJ=QeEg8+wO6V`5hO2 zv}kMA4M-ECa4%LOXS5!KtCf8~UZ)o!wkUdHi3wV(czvErkXHftSLJ8&3=>uli+Fkc zzM7~^#bj_8zdy?SelM>HiX0wgj{%Ej_xp9f^ZOOt+~YZGI8hfg;!R~_VJy-C!#Z98 zS}i-9UQy@!FT~$!zqIjES9(phxjXCJx@g;K+|?e_U&RS|rsTc|;BD}~vJAukaET`9 zX~K3S=y);I!#Ea+N}Tydy#RWfp=oXEl^hG5W4yjtRA55J23&QaS z0}&}C1yotVpd=O!1ZA(tx?w8o!TFXLlSf0-n!sd0`x>KFGz*XK!D46-jWoWcoCY7b z?F-x`UmZR}!&SgjAlEa;GxKn&;IP38K&E~zg zD{J$9qhMI}a$jfPCR$=F7=-A?G7c5PSXP~5HQoesD5J#t9mI4LFb{B`Sq==3D>95q zGa^7Z;qz1Qto&*5zIkxy%n&h;(VH6&|a>%1^%Z9PsKuP$CTj=2@BH`<_5P(9=F$*Ymo;z)#OoeGoIwrGD;0L>D zgb&+?1eIHgy_l!PyUPe_ET9;8Q6_@KwRDl)&4-XfnU29!?R-yC_~kCB8W!;rM=hjE z8Xt(v=oZZSaT-2?FXheFUf3kQ?z@hF;i2LN0x!T#XI-{#C?zd*9LRJWxE{LEF$nFf zb7;{vWN3;9{gk@e*hbcm9*-tAjh(bGII=Y-oFx3^b(YF{X(7s+{f@Gumj?k zOEMFJY2&k^is_VI61-YSwM~3y>)>1%S>ICnB2!3TLD9hnfwL-y~48@SAsR=@w-sm#;Cj#pqkjQ#E@4_L1t(Q>yrSr~XGlJAhy0L_M7OZ*z~HSk!h8{f70(93Q~8|75{N zvzd2*Dp)5x<5#1IS#ft6A1G${9xShX0R+6jQ8(XvAvDL|s$YBY@N0+PIC{Q+u5a$t zvb$}`?aR1*?+$(H-aB8J<0@*`FRydIedyAm)I@gOw)q2}H@2o)uN=S8xM{iFyVUN> zwEI42eQ)TaHH$;XK5ojk4=4HLfnQa8?%kXoy4HY=RNtjTmmL4BZ_2rdXCJ-OOzIy= z2CmfKaCI!#H{$k5b;-RWD%sw@Mw{_K$sz{oTTc z-S+)YT0h)PfkILy$4Xqh@YoE%1jx=G5Y)jB2M=*`)8S2%)zgF$G{#s`N8_zfDPXZ9 zj}CN}OVBsrho%Gvzg+P7=&uHFnPS=xcu4mQ+z-sduvW>F&U9cd7g7N(t6+9zVEK7m z1Yy!ZGmXBq`uq6R_-_;0MtDhmi#DH*&t4kmHha7- zh9_knB&|puMbeH0-HmNTf>{zn=w_HB7_SJ10ft-t0BiX4>W<8w#o7x%wCa4K5tPGW zNJ1z>RVe`eWSe(j}O!Sq@KZ>8d%q|U!5-S#!G8&Qb2g5eXv+JI;l_~9iK zIcdlTFJk-}agPY~t70Ia3tzv!X7d@pi#)~rFo1GVo&*9uN9l@%(5laf{U4;^j-`eU 
m(fQ+x&0U!_-FFDK?sV*x}d5&#D?Q1^fWMgs?y zgl;Kv?y80^GgF$U0vr_RbSP2Rn>oYx$GRCknoM6$M$mE|Dq55sm*wnZwVZCg?o$> zIgt->ecGGv8AgnK#u1@U;3?b~GL4w~%*-!@EF;!FEAyK|wh?=uo%zin$B47f z$^4d(YsB5>W`1kPGg8)9#{9NW`G~jAJL2o}v2T0GKT^?G!TgR;8s&6n+8?wNL^n&OT~v8`c^TB8~Xy1)xaI*MDL57=#z}E8MT(ZmT6#LOD5ir_zJ0I zMSN=}z7p|OCF0vM@zsd0DG|Rq6JLw?x)SkgGV%3@ZzvJpF11RlrFN-JS|hIdo~f@x zS}U!S)=MolXu(+nxl*UtxC?LA&H&13lDf2by)}IsGPN`#w&gX0*7$56Y!q9ijYE9j zrcBuz3(9WGmc1Ie*Pv_`pDp`7v0ZYC9SA4L?$=;qB?#i$7lpoVab13lxL#U!RuDT8 zv$-gy3uUa!)R{sI9o0D0Ir?lKEO`! z@6VR-#gtzw9zglKG9!8rEkC5UJUgPh#lzC>vj*`9QtpAD8~%zm=J_t-k0O4r)FU1{ z-_CLJK?8@BFXpGw{L=2>-&z@Z?f7fF7TR|JDZa5Xg>*nq!2>&`z0!f85?v#K?6 zL6YSmkz$=^$3mh0b8=*C^vndl%O4IQ`(Pv_O7epdaV#YL6ZJL4D8cYhbSNhEpB)NG z;oyj*+DC%X(NH8B8aktzM<>XOcQ7ml71g3h!7~v#JdodOnMIIWK8DaM7<5|D`wWk9 zl2H;wo+do=8JN!`8YLr;MR-xrhpNvkSwxFy$_uehm^-Yh`(admXebnwG<+c#8WIDMQArL0+X83t zqd^T|#L=;VHJ?l?VKJn10{zFGkoI3dEN!6r`hK8c5Q7Xn}Fh)v5j!NQHlWH6YDKd4QYL1NdpO+?7BRM7jH6|(4 znE`q}>Ddq+8Qm~CG}@^RZ|C^f*6rKQZycXVE%+RTnL3mHk^^d1A`+$`Mk10 zc~+8;361F*oltF_+mugt`$`da? zG5`2?`!CyPjkAy4s%n`(l&D&l@>iw&HB0{03IFQ(3rT<1lD|9Q@4kLA>EFKOe<0z1 z;7w!Fe_+{UtoDA$8OyzC3s=>0r;PJ8q`ckNAGy9i&GFmXZ?$(_iznN6Ew0&}HuB{U zEE|nJcbYT0+{=K4D?nsII)=z@c=8b$!^+26dPXq>k`jmx4k`K&Mk9e>-~g78Lo#M? 
z7v{CEHxfNMG8&Rbq%a18jmlLl6$&jC2Jx&yQzsCRJMdCXL6kWW^A~j*3%w5s6k_eB zxI1Rf)4XVHlGow0_o^Uo#QPKHsMwXQS~bg3bW9G*_amyqrds>^X-(9nROd+cxJ5M^aAD3&(Gp%F+g>$vxNdA&1wpku#UGfOI`)_J3$Fn3^ff{{i{5 zQmMcJ{IU;?1wG4DLvDGKjG|HG^Oh`O!q6e84(h~%)Vx!Xf?*}_*|7dLE5^p{h} zl&Yw?@}V zNlp$|cGD+0+wfETa9-qYn_M3n%%-)32JR2Yr)8jIzXBRKsZUOT(4QI%h70h3mOO?B zkHT|>%Zx4j#Se{Wlv1(>l!hFdr}AMv2MhWYx{Prqc~Jdp z;_8?0x$jmL4Qv)ltrNdQy+P>pX`T28r6K1j@|@6fu%Hf8%&>cR%&-f;J-bzYm&)&v ziN==hUE%=>s4p)G5F-qE6xbv}xPT05o)EyfGD@ro4;lwrunZG1r6p$hVODOOS1bk) z%h&Kl&q+v_acHSt!~#8>6$7+5a>Xp$ejuAiC)g)JHG2p0GttUT9MLi%-f;*hSogW7 zfya}46d~GEQYOjD4!_G^;d$<2oxn|Se`I`?XX|{2LF0d@Z@}Fdx;Co3l)ucYwq56> zurxj@?}@e3m}dGT#}Iez3PlEjA!ScjHsvTvQ|jUTocm~sTl5}$!+CwvxArXg)+K!F z7J8O^U2ix)UZyrZ-`K>f{Q9fB+{hr1+wfxCem)S4eT%W_xT5;=$e>WJDegT_**m_< zZ*HDG@t((<@E*Lu|C!|t%bUjk>`ZzOCOroitp^#p6$~Rm!G@8@KlLi4J)b7+l^V8q zwm&ENixmO}_avuLOkQIx^Vl6#7>Y_GcjI^TmN0%tM&-)^I8)qD{ngV4Qsos_YA@H$ z?fd4cnSE&^S5cETgXfvH<;|C?mSVnzxF(rc6OG?3#lbvIvE&Tnr?kL%kxLPeE0_-P z#N^VO7I|>FC!&J2we1eEYvBVLeI|pFI%UO98?0rXjdm(4D>7;RQ0Yq!;IE}X43MVHuYfL!J zeoc7OdOtV0Nlm9b1Lq#4-G%*sQgMB3Y3tF%)}wE&PHuf9x#`hl)ybcfoz%oy`#p=b zD`zSVHyiTbfrt3sQu9+^ekV1Fr-bA}#J@Bz97F26!~hYP1XZFuLT*|GR*2S_0V)=) zjjAvZ8J*Dh>mg*m3%6I6n++Px{5sMy`4o(`%X8_8nJ2#S)U+X0RWsN6O8d+0m*e2C zC%-?LsOr8R{nN?U;W;pEnC`x`ZD!kyGvz5SSUfsx#ft}9H?|Wzd&yyI2_Z&FL1>U7 z$eF-Tp=7$y?DoCz5EGhhn$V0_W;ucHPGxpAq0B1ZCe!*ofM52ZA*N@UO3ke*ke2U> zUTYZqZ(Y3UZJ!lNpsKjo$!r_>TT>)>SMq_gy0B%KnQBa`#H_!ZL9eHmK1 zFGoxF<Hc=!~BNU&&)eXG{@>UezF<>A-pg&2Ly0hLs3hsCYzhjYM!&zEBHT)=x?L zDMkO8iKwK=&mqJibd4GZS;=--!Rh$k}TyHGaME z6UB;zzf-Zv__}F_MYYOf;eIk~z@#8SMLa<2-vTy(h%&W(`4pThFulgH7SjsPG+B6S z4>Va-lZIR`iA0RS`p3EN8gg3ixZ!#9&!piihG%(Qvjs*x=G)ISi_n#Z#efD#Am$1T zg#$W#yJ1AJ9+xJVmXr}&sJ-H-Tb_iY8l~}p0@PC-{V+iEYh}q_MVddvPazcu6kZ0S z=?OmNshYDVJS~gXmXB^Zd{Aj2@RqxJv94<&wh+G2`)2iVc2N%w<`Cm&z5KK{?p zb0NtmN*PLdUa)X4+140mz0JlSnAR9oJG13b{}mcNvU6-ba=nBu%qbP8xwAav7Z^KA zN+^ax355oZNhlaK{a^aa|&8B3< zeK*VRf1~eXh5F}(rdBATuvb8`$Tm6k-QXXLR89T&40$2!EXt_~Cps#ocn0 
zC8-)XNym{z;Rst=RLj|+bN%9wteP}f3s5_|4EzxPhz+iJ}^6H1B<*?e*5{P1hfp56+)l;1`+} zd_UUpMw^Bg&-*KR)r``{WJ%>aV?x(x_#8NKAv6Sw1mUY{md2w}SX9lREO^T_H?fjN zhQa`L8Xxiug_y`-s}umNx(Z;Gzm0VN5kG}iI$)Hu>|4H>qu)3_b?_aFGi7yMa?Uts zhZZY(5(IRzd2gbk=WXl0+nJarZ(3K~c6e`Ate#h19smCLb<+=j^G0Q2%>&7by$Q$O zMPaW-e^Ge$p(#zzGEM1xPOLAF7waQ)EX=Qji=x2xADCc_)qG~jn(^_XaMmhXpj)v) zw_;UY2gXK5CUgV#N+L+g54Nz=G$10F5wo>{=9x`Fp+uS4XY21LJC)quAFC=Tx+@#; z7f7uTj)M?+eA5nX4qCA`_eWt9?^SK&WoAXy#X@zH`2b;P{=*=nz$+k^O#Ij25;R-K zcMv0gmz?jxVQUBs9vMhs(UQi8q5!mV1)wbauPKiPXWGe?`5BxY_9A{=HI4*FWtxQY z%WzbI+Mv3~%6)KrKpG`8m`qZ#%=noi)`eIznVdhvPk9s$)BtwZ)c&-uhT6eyb{ zB*}OXBFYo}pu__vG=m=MyR3e|3*SqImy9GZvh~C{>BQ7|LUY^mjW=J>3kozGuWO*!2F@)2_5!J3!V z6c(g5$30jTE%7qA*55Pf=?l`>;^l?;;||%G2ZueEYl4e9wHl!t!rEK>43`&&KikrWP+GY&U zDs2ZNn?ntpJYv8n>c~S`ls2 z+c~^i_vPNyRod@B?1tE@yv# zWd04~WW{9VWYuIfVC~G+uKS{`x^K8k_vPMLtIhF>cxAjw3|wn^Ekm!c)x$r$DVIw3 z#Vf8g=O7JVi3^{GkF;+58slRN;p6A+H?akM*7{n8*SK4s=~RowlWbG;#Irj;#mzW3lamP+p2U|nWq= zmnO{x+Zv3GwldMR4k4oo>4&ZiEHyFLGOL_`hsdD~h$j5j|FY`er&>vut0_gWv(RUL zudZ$8_;kb6z|#SDjpQ;->S#C2Q|Hbr~7>h1IZoxWBaT;Nhda0Zq7nI%YXc z8;5rUV&=oxPmH%U0O*v!cpzp9faO|JWUEI3e-fB z-LZ;%D0KN`w|o(OB-36pXoQ5|_>iLV_L!-?Clu;XZB!r=b5t7@!@N*X7!i(rtdUTx z;yFq}`xYnd&ruTQjoEX-@MptSs>D3bBp&iXcL&+$pGf*V1w8{luJYIoJ2)oM4%&&3 zgx#4imZ}!o+bYSl6WkOXiXz;h!LPw1$Rr8Ne@V_?k@Mf+bTlx8l1artqUAVKI6yHW z4~3(uAPx;gRl}&L+HyNJooD69NWUx%L}U>gy8-psNLXRoPi(P{FeM0)6O!`613~No z4+2VTB4(NqWANNL`J1dZC9J5nL)c%Aoj^mPYFD5dWpFF15q{MLNtdDaSsH%T8X5{q zw5+K1peXi-BupU@;$ZLsHWQyyjiaHcY8nVd6iNPVYFUk@X!!yBY^h=sME-sH`~z}c zC1-)0tK^U-R;Kk^z6OUx(2VG+TFI-aqq768sMhcpdI9LI`OppUQA(!WCzV!4A$3mc z?Z2jw|4Pn(f};?R83>#{jmf(5VLDCKmYulrKO*(tQMRvPf`1HM@hYyaBOz2x9i1M3 zzq}!3E&Hi+(;bu1Yo0oG2fHMFvpc@=n^Q+_dA!S+8I6s`J~KYb%u`41*txQXxhLj_ zlAeuIN8T4~i}u=^LftLFHhuV82j*(N`Ou=Tb%5hi13UuAMtF|K$9!1!>XV zeN)&>!4J;0&($v4TW<<&?^)cptgh+sO)Efgzi)F~s-LM}vNb1c&GXxmw)Io{Zd;tQ z`>q_jd@NyE1#Nc9-88p(Zf%<5+gektt6x3#{bNh5n-i^@ldW49Ted-^?b?B@&K^4! 
zB%2+o2CM7yXJ58*ZvQl&a%4dGiDT6*Z+*(!2Ibyk0AXEws(tN`?N{wf?b{RW+mr1( z7uW2f-PU_@yRFkElmV*s#T8Up0wS2-=Ene0=RHDHnRat_-=)Tx#yM}&RzI~bW%E$i z&z?!zYNqym;O3m(J2npEVVkkNbKkb4bNlP77ao~@WY#zH#H>8mbb0Jb{Brz9`;%4e zNmu*z)d}bJRJj+3WV8Rm%sJdMEz{zqp_!q%;B?DPd*HUqpLTI3%hYpUedvSwf5F1< zctLqCxn^9mXXef?Ja^N&bE@Y(i|g|0l+{KW&&7()x2;|8*gTo<*)P9sZ7%w}ttFj( z^rp2M2udkUGp5-mrcFT8cZvhvx7eq*y)Ze)|HM+0YTPwnz1X;Gp&QP1KRMAvC^D8JKOo(s8+CzG-^krn~iac{Rqw=9u1{u-2r}Z}t+9H4sb= z29p^um1SVM=)NtOUOM{Kqtk;og{oVEh53V*250CS3&V8R+d|F9A09Vwt{oqL=w

*1y};_NZW0?GZ-dnR$bBn30qo3x33HGBxTS zL(%n;;nVG$5Gh=j+yh1?s>7}VH6G<>Pfig9g#}i|Nq5|R*?7?`a2G2DZqhU9LWoCk z=Tvyi_Lrwxdmc8b(gqOGcumlS`F!PC+{09dCUBN!Js!$GM;4~+p)b+G1UY+76`_j< zVOx|~3HIn=u;-RXv&y!3`7rfV(UVg&MJx3v>`%ezA|~2d;?}q=ZjU?S&TH1!h?m3@ zFVmsa`CoQ86^0c(oNT{LSTIhyCtc6p-7;+Jjd|9t-Jn2)ACxD$$m$#0Z~#YyaQ;HG zk_N*QcjMLqV__U33!DO9MCa6?wFkX~MqG*ZN6vpy4EWE{;FSz`C2Wa_B)>vc)0|-By~8X2 zjQrE&P&+<55X?Hy%#^AX6=ZmpEF;7ETRS<(W-r)}BGyCWI|S-3c^M#_Mh_|f8y<*i z=~k|yamnAB@V73uZMo^+ddDP`ncwqooj!1@x_!a__tjm~M?N%hw({H6T_BgX@|1ro z-X_-v91?!uA|l)IM=%1pL+17(6xe_v2zW9 zdHZ)CT6pFs^&2lXTx`6#XW`|~zZmq{)hUw03eQgc?84#xrNs=N}s6ON8*)pe2%=sgTP?2*qY zpM7HV7P0dTn^gxp%_lSAE4GX5K{c;OEMiYEoH&ii33O&Efnm*CKj@YXRSkwgjx02+*^Tm zJ6wx;OT?A3;^%gzPvY3`8l8k6hQ`jGn^C~%ylSBF~P0}AF zUkP!)!=jx)_=w^e8D@HdAJXTK$f2^CaXv#aHA?AXs*r)uP(LjMvATj)tE(XL4~dlG zmJx1>d%uFV?rwN_L$YG+v_0jiS@HxDp1@5{GsM!x#=URWzUf?ScrfYhU3B)|ZfJVt z>F+#!`M&GIpSoUmEp6PN*tq}A{mG3-rmd;E##eTJXXj<_!hs(jzj}OW{qDs2-8Tl3 z>-SBYKW`k4P^7AwQvOx*{#Wb2U!U-IrhGM-{|X#UuDr7L^4iSnR%O#t<+?=WI@I5L zwKrM0^OkSblCM4C13&QNx~p|b-)0CNS590$QTV!3=PrY2@3z?)>qWN&-~hI)Z3oF; zCP{K&%js{iAcyN7a=7zyfSTV!j;m&%VCCj@6+q3F33@H4>@Y{ za^Omvd&p5=kfY%qa;y>?nW2m@FoR842%CH5f5w+;$*7@Y;ZuWB0Gz%wfLjtKpy&=q zgG1q=@Hrf{mvm)y0OtXSuU;P*!u>MqS=8yyPEA_wMDEkLIpN$Gow}!_l(SgvtqkC{ zglHFbkj=CDMd@r1dT>=BUL;$JzCCUE<(J`?QDwIi@QdTh1Ve6W*9!72F5GZm>{bKY|eh2!!dVT^&Nmu9XuwGtm_|Mb}|y_>9YMC{nwl zB!H=j`!ZyhnGTKz=|+qlfzvRM^pjO%OojquU}D#%jT<+vr$90WoC!t;1|_jS8aXe8 zm0i2LH*Vi@8XL!hVMV()B@zxzXfqodL8D6G*+B_tUtH*Ef(Di>x~U@=4p0>VT%x0; zTUifF9*zXgX{V{ROjI?S9GEL1r!M9FP4mArt$c zQKd5J2Ae!24D4s)7{#ctp$QJq21+G>UA%ed!j`O{5?B+8oLj?2VvX{Q9L3c)CnbOx zQX)G7gVE@yvSY)B%q>T_REF$DT@m@*1~T-r&Y|+Qs1O8jOyFa-dKA z6414|(DmyFI(B1rVmBttc%Xt(e^efmVtq%?VwDHYW9CFcXsq&KCUDa})c~%9f-#sa z^cfm}fzqfXl7<7cVHVS52zU4(ohS#N%}~{mE~pXBxF3slyJDXMEbyw6tb#fXWyVIB z<=a3C<^Kmq(+Xzx;3@` z_aQzF`%3*Cj@0UI&gRk6RIxOblxE9al2qoB?7LerUsM_mh#G$~&rHF*P0q0%5xgJ6HDoOUs}qr+FUxta-`@^-?RS#ro?t%;B%II6-KT)rm% z9}2XS^9yozP>dCL$7C4Q)(cKa8HH6_{=XFC?#Xa<4{IV0E~8v=po}!EGC5ZW-4gKN 
zRArezDfm;a@)sUVIm=+LJDL;+3uOX zA|Hyq-Lz)v_-yrD%jJfouqM^?WbW%pc9OH&WS_5I!nG|;_kYOYV_7>M4qv*P9TX4p zrpo#DrRMF4=Iv>YzHSUI?RhM*=dlmj7t8R0fg&^&J8LM(*aiCjtj=r;nqQzZD|rP` z>7%*`)y+&OUw;rvdW50}p)`I)4?;`mwGdiDH-*p=`Xz*x*a?l$Qo16pl&%PAv)ZB@ zcnHOgz~ToRn{wJ2L$CY^5_Gtkrt??UzhEa84O9L}3w9`P1cD7+4#lm7Jgy8Q2_16s zqBL&OT#n6`U~oWuFG9|%1rix7gPmK zGgJj8Gee;Q#3AiF(+=!~Cm(Tgl3ppLDlY3l3+U0`WC7Fe#W1qgvB}8huS_bN{B96f z7nCD(7j;1eN{5iVYn4p!EIy*puTBB~5|@&FVNT+fjr( zj_fc7of(j+9qgtu)thB7qg`jma48xNR8mV(0C);^yBR_V*|A9l#~;9F;if!N50han zI*4oUH07CzcHJoKFbFp0OpJndvUN)>*FJC_1&5RrfTL&!N(lYcI=U;bYb-hxQo10U zgutpGML!E*wk)+OFU8QvC@$>F#L`4jO#yr04r<~(|axL$2_r1$C zy~{Pc%T>S2HNMN${ft}x4_x=V+~$AaTGN7sU-z!i@>8Mer$P;go569%P{}vXo?Pbe z!U}9~q$wbMAB(c`YiGNbIlMAa@TDtRl#$;&y9Yajc)DVXa={1kPX!wJEpxpTMfX5y zQCqT6n`gs!&-In!TxFc2GG%W}*_%_gij=MT1BZE&Vahoz&fY&K%>729di|nreZt+j z%$a$^9j}x3&bH2eCE;Cthl3yco(zs(82s>Ya5B))=SI$CovuwA4V}5t`V!Uci%>_p zJC->EO0#RF#S-4;Olbzbey%%>$a?HU^)C!u-?DH%(Xk`Zx|4F$r|km&1V7h0r_3Lk zdoEGCF3nNE4c{_*r`26mh7Cul;7?mEd=F1`(0jqO zAYI>o!*~72)yNHHv1?zVeg86x`Gw!f`#(Sv{pnQ?8TeWd{e4>u^>eQ3bOrNPa_(|sCel^xvs(YGVV|{} z+q+!HyrR&}+d&S}M!avMEhfD4;G(Y_SoCGrfppN9Q-{$-Um9HG^AdzU^7#qC3i4GF zd{yMj!9!mFM+3HwzUm3M2J(e*l`n7za8{-{ax+-O0z|>Xn`d_~b9kj&d4A*c_>!k7 z;c24b+z6Nj=XC5R#+sC?b^6IAS8Kx6dc9$3)3L;+V>eyLrw*lr@+DzaLRd9-0lVkk z6RgvfOSamCtrpv}-WD3bHw(2Nr5`EhtT@_jjPZfr~14@~WU x$5uOsZF8+t``&Rk&fAjiwJ#h_Tafw_X1D+Ia`%y)+*>=>A6;kso3%XL{||^*gy#SN literal 0 HcmV?d00001 diff --git a/benchmarks/sweep/__pycache__/serve.cpython-312.pyc b/benchmarks/sweep/__pycache__/serve.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de7d85aa8dbadddd1586d8d4aa0f04cd37ce280c GIT binary patch literal 13925 zcmc&bX>1$UnKR^&oZ)@xrlgT2*`h5`zH&IR6JPRe*@^A6QJT^;XJk?4m6?%kF;uF$ z=~iijt%L@(ilkfZ{#gg{?z+GN>)izwNrScv6a~mi2APQhD5w|P{9__7+|BL+`+aXX zBt^?@f-SJm;+yZidGDKdefRUP91bf5Db9Q^c0EK8fG>#bIX^1lk(};=2HpaL)VIDCPn2B2w>Q=Cg!N2~-k$88Dwh#g>y 
zz{VX3=ZG`m8gY>_7k4K-BOU@<GwxP#)rc=*9q`lxkmG&M<2(%qEU*h5KhzM!XFu<36{8lIDM=0p{^RAr>@Y!&?ks;I7q9U5K$u0ctn<=z(PS(VhIso=5$0E!@2|b^0|lvu;+9< z5*5c%aY2+$ri6*O_zBJ?Xi%M}BT^(GDx!4eiYTU4@7nTFMNB|{#~D$&EJ~-PR8%A% z^vIE9Oo?U0@MtV9CIN$LPDheLL{>Q&+lHeFL3Kq&0de?(n2e5*3Y$EZx)PQZAvK|> zF0Ek-5KA#ZlvRr$O@^h3WVF1)lo7^ghF>@QRv8%aaf$}|&Kk$ddja*8l%@1ln=&cb zlO%!GYiX*c){r&6N&VQc?mJ`FwDw(=k=z3HL;5(wT+VLD+Zc+9%f8auQzD>|2IZB1Lon3^M(CGhmQ{P zXNI1AZs^$;GhS_f_{v&7!Drm%x@anqfSrLF`wK(E2lYNdT5`jXgGSXJNvGqJVI@Vj z$ds01Nky{ZJB#FmhvdQx8%rL{klU(OvUw8eV%A_8SP#W(>GS=AN+La&j->~bh;&g@ z2Chy#w(IdrkBNiv*oDFLq%xLD4qlGO6NBV*BqGu!c~Hh9I7qOC(vzwc&O%tjEg{!y zhM#;5vfoj^pQcJ)%G*`&4CFlnH)4yP$ET0oWw@NRb%|+P2Eg7i|BYP7V9vUAiP=Uf z0`nvDCvw){60`YZi?itTe%HFpI%bkfYRA+-jDE3)4n~H_jXwTR#_ZA zQmN|FD4?=C+RTy@i3_3>WdM_&@W|kA^(mzEorwSQNtF@K~cuMshn`PGWl$w5A*7DGI)(3czX&gzjF6mN5I$DFbB1LElX62~s z0!6{D&jWZAt00+^G>)`hm$Izhc8XJc`pYbr1x}h}-YKtZ6m_gM9PSJp-RVHqEIXA3 zy_euvJF9DS&8VIFcR(`YxYyaSaXdbCNpK0So9-%S1#&=^QJ1ooDt2gr$GtA;EBU8h zv-)+ZuHiT8)^Lh@$ZmP`vDfVu(raCUePIRfO`k4<){#8bvsts^S?&x7+dA3vu_k*S zsK4-#`cPR{@egPG0|RMbabi*dEnxuE3@H|sGhRg*Rr-~5O6iX&@qTf;7|%Ec2IR
ltHJtvF;n$7*pTlyTBz+=s6C<7o_MsmP(TIwm`^cQhv||H|ow zS1u{9T$*R!_}*c4U8a9YlGy4@!2b{E8JBbx;eyQlJ&DZakt5@m|K=Ed~!Zc@&PYh*m zEqSQgwpoW#)3s2&58SDTmWRg+)jgRriyOPqolp+HQP_A{VHzQ}iV@CyW0&IZ zn(I*fhcd_i$zP*)=SiS$+JDy%@+Xdihzfh=1oY5VIpv%5W$(=FpY1O=cj^NsYGY$j z?`JXCvJl*+1h+kP+1%_8Eamo3wobgA-JA`7*!jKJ?yCR6w=xYCl+CwPNd@Yrelhur zoG`P$;NOusUi8;Yy)*evu79Sz;NQOB?^gWXa}5tK6#R#w(pxokee!y4+f0AK8(Q#o zE8gz8%DF2A@1e}=xJ|C)Zsy%v^2`>^G4$YxH{uGb+&)y%2xyo8lb9}0^YF1Y5$6Wo zK59e(p=u?#2Ei{HYvE>2H+Tg8JJQayD-m2x7qLVv<>-KeorE}9 znT0z;kP6P2>Clz?vkTreLx>Fi3?rFFw5&mNl{JWr0npPPfbivvqHQJ2S?)GFV#WJU z;*2=4za_mb0NqU&wZup}z)f`#_-zaKLvsQudjOD*!(Ad= z-;h6%XxeSZx_>x%k?@^%FH;_^YnPBDN+?QU3GTU~1AuZN=mnRZCd@@`49dDB>L!|X zrMtQm27_8J4b!a$iA_EzgTP~qio9hOBYO?umonF;<&O>h|6d^gHBi5FF%4y+UnT34 zvihD^r!nVKU&g*Q;Qu*Nj260~MU=1Y8l5#>EMq89+ploHZpy+X)moWF+Gii}3SM01 zs?e;lu}q#Q#+VAT6KnP>A=cuxsI69jp&qAIZG&(KBmx|?aP1}}YrqG=j6cWRX?ze6 zi1;|<)i7QROUqz9;R;q9#>W6Y5a6iQ+Ii7qw+5b5T@d(%i6$Xic<;hN;-7?YY-TcJ zv&QAKz9fPGl#k#ivS2^}?i^eTcOVK2rs~5t!7w>0vL_Tr(@e!<$L4Qaj=&9Nws&UR zZ1iCr*wkH$qDv|Ez;nq;)AaXtmS&dgC_4DhlxNAl zpKdK`9?g;!^IO`<`-zR6*fVu-@?fs|lg4|E02mZo+p-NPmTuEc^d_?n=3Cm9tngmb zW`-cjy!$1^wtLwI^-G88%QW3)HpP!84NXfn%)r7`R4qA3)(NiC5&%LdWhsBLt=?jXQQPV%{Sk3{@LlYQ;?F(+ljv%p{QSDMNb~Bur+C*sCTJ2b%~18aYK?$ z87>)8#!JQ&bBUp`&6qT$%$Llh%p@(BEYN03T2r=5wv_#nJ;h#PQ;tiHl=G4kV9dNL zZh?O*{M&ffv^8#ji@D^EvyZeppxskxM_6;*32olE3t-*w@8KEVRSye|dn+Xy@8gYX ztNuG}K$vNJ+y}k=q$ke4S8qRW;4O1KIuf>j$t(ewW&f{1-X6EbnQ2eF2IdLcC>s^$ z;uB9Nc)XqjyJ<74e^#4`cmT}6K=geYrid7or>mTvynuOPsbw(fsb8P+%tSW zGLy_qMUxTO!4<`p%*3Fah+or-v2W^YLON#LrZfuf8-?{PA7t-RqCupu znT{^(bpzH*cwN+za;vWxlIOnT^QGMO5RCvtfq zF*Tp(<3oHlIW;8ku3UKbT!q{IK_kiiLbe!;RpF$!c)c0Ax3)vWIf`m;Xd& zn^)^)wpU_$x1gO;>>4Uzh#e@xwc@{yq6(6(14G*rMSGw^#nGdmWl_&1}@ae2JOOFthoQnaxi$vrS){Xwqjc z*hTY6>K$9mFareiDnP*QY@+=t4g0o=w%dkVEJGDo+_T6|8!iA84gUq(0+4d_?*Mgn zxZ>OxpI7-ToO(`Yy4qR7hiINl^L!ld#}p6TLuIvbV8qyYRG7()kP_B_AVxUk!Bj<7Y7c0@nUKzQt$Vvu75Qn?@C*BPB2%3w zOt@-Sqg;MInX0srO6l0FhhaEx#!Zc#mgjonOg;g*8AtITa 
zWCM8lQGO}gRU*jfghKNQLm<67AOSqD1S(Ak9Z>~&4%wH!SNv63O&!WaP2lAs3yJ(J zhzAteLLa~%f&bjULw19D&_nqfHmwaMU;6`-(Qm#nQMOZD+l{lOmew0*Kl9YD^xgGz z-DH;NrGq8bv21+%=}-O5E3bX`_|nNw{oLx=J8g2u;g15vj%P}q(1xc&_H?XX{KV5+ zwo%^h2M)^H@V^_uMb$U0oGsRYfC~PqzH7^9Xfl6V-?bcA>yVC&t@n<7n^_5!jg+ar zOu?tJm2%We4TBOpC^3T%Kes^}!O<^TY+oB@ei3FMS6nJGBAWi91b5aWQ}ff)plnzH zgEOdt#H!gA3y33vunSuEVW}WZfTO1!A2?dUFZ$J8 z!7c)#U-VtY9oLJj=o9_8?prM&n(*vN5`A;1#_dWUYIS{Wu&aO3cI7Xyd6e-a!T6{? z89&rSHQ9Hd5X_?jq(TJsF{;`EdK+pgg&Hw1hdSM^B+@!mp;oLFYfz6Z)QNRa0?0zW z7=&^?P)x&KBWv)DVxw3~;G4vzD!9t3uD}X}fY>mHy4tP;Gc@fyA+Z54F$S|c^Qfwk zB$`F1e)tQCYz7Ji1EbI)w#;?v5Zk}{%?550*rB3U z_z?7m3j_3GtME%u6SZ>x6LCHFW4NBi5%m>oTyNwHDRKg-P&5+s2QCVUTJfP(fP;Z9NZ>%rvEweGn$6@CJ5epC_?bjnv63?C zscI`xKNYvYr=o~L=nu{cSQ~XLdI0OH=?|`@jm9>rvG;UfOL1is@Y9CmbfceIm2R_PIE7kUV>+f4S@o9bMYOGk_yEI;Mx2+DZySq!Cx{|wL zWkhy&t+lPY`|q$HjmrndetBs9z}f0=eRt;M-qE6awA42wdD>S`$ey0fuJGERH1guz zu5-61mrt#nDg_!=hHp=71UltF=j!y@h4-&~5;%0Ll(cW1a*e-sYI;lJ;oS_Zb9mO2+HEuVV8Qr?=)K)s68 zjX<9q=+p3A3N-)L!gzgP^|TD!Cw(Z3({*#~zKIGnm;0!?r^{cXAH_1;ypmgI+Yy&v z7U5n*c+LMG!Z#ZLKh?-&gDdWJw!7qOS+z@p&&k|#vTyX}iF;UIi^$wD+4s!SiO&G% zudL;Y!C^Qks~b?)cmR+Xu#?iV8Ib7oZF+(mo>tk@y5Z@TJwW+2%emWZ$^CjtPQc&&rzxv@)6K2c7Tw_`=3o7_OD9SW_dBO=p8n1_T+u+?11seT z>X)v>)~*=W*(O4=Dwa3fI#<)uk%_x)XKuTe%`0YYUUey}U2FdL`#$ju!X;@xOT(S0 zX??$GEmmwjB*SJ-mE6G<-?v4)iN3bgBeJg-DCWzIc)u1-?BVB8BXEj`pJ!)fmlaou%b1~8MOO~OHwp% z_ujV7@)g@mG{lXl{pfm={=4UD2qf6ld#)Ai70a|?dp5vI#h%;VwT@wM2R`-$z^!Nj z?a+$WS=E@@YC$t*HA@KWC9oVTcHta4!FMIGV#$JyQxARt2iUlba5-PG&2%h)b_}{Y zCeT)2(CiCm!BwTD+jR_8Y3Cqz3vST`qnlxL4_Ib;q>`GeQ^%{$=gXs1W$c0}b8X)c?L#xUj=%LsAcB`6cT?N+|ljdkfukzJsa$a@ALPh(` zL0Af$o@t2nMFmjJGcySYu~okAa6#w#$=ViXfst`n_(io#c6Kp`efIQd@5?VoxC^t< zEYJ0z-kQp28ey)Vc(?T-HP>b?yzTYfm&^X41MYMXj&P$~PK#xtxsuy!k>MUyQ;)ed{c$`m1a*6AFBvG|d9Pm^|fYXeozv=;!)+m4Ae;w;=d5# z<-y~h+CIT;VaWANWirX|2=~G?_l=9fJpYY;*anx*|&WzYCa9Lj0l8_ zmx0)Mju!yOd^$5fGs~s;R7O}NJH*ST+G$l0mTK9Klmwv_94ZQZG>MNv7!JX3biAxs zR<|?x8BH()I^9>Vbc7y 
zyuyG{EeN1QPy&SZAZE;`viZeGGI50$4r1#8%z$Z9ie(Dk6(j%}UAc-emYr9a*nB*y zu(?Grg@sHS)=W%1#hlMz)CYER0lQzsj2P8uPYW+&_D7H@uAR(431EHql~Ee7b?@s$ zg74%(06`eNpqlL`aE6mO107Mk)jP1dy}7TKm%>j%#Foj(G7f9SKG zmr9;+$y>q)uw#-}Ok&5t*7)iR^e?^ZBNN4@GxEs9otM7< z@MkXYtUTQk?D~lHl%$By*!8=~LZCE-yez$qz=H$|;JaT8@ zXU#?Kxt}(#jsCFaPP60y3-vjGcluOIbHmXInETk#z3Head0N5nDAhFoCoy)MRLdE< zTtnI1_d1^0=opndMvEOMB&M~rvkKOY$^NbJH^w)Z2AOI2m}&aVTeJN7U2oTtY10}k zIl91REjhw#Fw=nz$5GjFwCMQC9t{mEhh;}=3I77Wbz5zAFnE034?iEKLTBlRpS!5m zQJTcM{(h)w>;Ur*2W?}A&Fa$tV8}@7J05hPRRM|wwRIgh3ci4tgUaqibZ=vU5$T*| z9A2L3n$nwe9{ZA16>%_OOov1#%n;u&lh{L5^lPr#lF@q`QLm00_SXHO6)m1!=_i%Q zNx@pB>sNyIpxvGWjm-v{;vPEs_Jae^!tFQcTeYG+Zn|)laVE}u(*Thp$2Se?J2BC!wK+u>wvl+!dGJSD;k}6^ z`T-mO5iw_@S)!jI`&W%bT}JA9Sd{{|C%~6fuXYHwDpgD6RsG~KfJQr9Z`G56-Q2cH zBdHcJBOL4qO;~PP$fO=4)sPbsiKWu%2sgQ{p;oB}MDNhH_{akSoC-v{D#vlC{qUmfmsPuL*thu^IQC{`RC&r`8tS5k6f8}a;-2X^W#_!bXd zDLE0U>KnnW`i;DAs+M<9CL&LOYe$jnkKq>NT#$iO>!A9dr?=(Ixa8^il&zJRVX1z& z$P6zX0~YPCSxy#x?KdV$tzY@*>vHR9lq@a1C?{G#NHhURsO<;i?~RvieknL0SqDr0 zV9C|J;Tn)#10{FQhI>eM4?VD%LS_&Z;LZ%z6?W zoxVf==lt_K{?L~$FCoIVhdpK7}1b(#|JCD zzXyxWp%>&osNdh9q~L^nczmsHd3Lq!dub>>%;5pN>GslOO+p@ORea61&DS87HJ?(< zgcIfnlM}8X@J=2F?2Rb;guS7J6$+S%m?5_#tV{S)EWM7|pJ8?bvo|n%6SF_Z>@CRl z=Q3YM5X^4j_8K4qE@S;NTxQc3l!Fs@E^iz@EgwGp@!|1eaH8m&*q8l$*3|kXn2&1* z^I5gZ{xD%X%dfrt%*rckY%w&vKVRAZ%M)`PFLIo^Dvm=8jj-X=WVlJOL?ZD_ED}-N zu?!gM#9GY_MyFx|>VN_ggaCd_O0i8tcry>ydQJ__664TILaT#SkyR3M70_2qtU?l6 zC9zm|?3&J#hsxwhulnB3MBW9e5nS^{;x$q2MF#x>3WMe|S*Vpx5OsTd-uM%(g8XmTIhYf63!Vz2{0C>=KO;9DEIUb?i*h%X-K6ND%=RrWDW(~!E>!9ryzl%vP1{Q?9X}A? z6U#=d+|w#1tiU2*npal&50dXE<!$K(Se*l22@XG)I literal 0 HcmV?d00001 diff --git a/distributed/eplb/eplb_state.py b/distributed/eplb/eplb_state.py new file mode 100644 index 0000000..526d3ce --- /dev/null +++ b/distributed/eplb/eplb_state.py @@ -0,0 +1,837 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Expert parallelism load balancer (EPLB) metrics and states. 
+ +# Glossary + +- **Logical Expert**: An expert that is part of the model's logical structure. + It holds a set of weights and is replicated across multiple physical + experts. +- **Redundant Expert**: To achieve load balancing, for some popular logical + experts, we create additional copies of the expert weights. During inference, + each of these copies can be routed to by the same set of tokens. +- **Physical Expert**: An expert that is instantiated on a specific device. + It is a replica of a logical expert and can be rearranged across devices. + I.e., one logical expert may have multiple sets of weights initialized on + different devices, and each of these sets is a physical expert. +- **Local Physical Expert**: A physical expert that is instantiated on the + current device. + +For example: DeepSeek-R1 has 256 logical experts, so each MoE layer +has 256 sets of linear layer weights in the model parameters. If we add 32 +redundant experts, DeepSeek-R1 will have 256 + 32 = 288 physical experts in +total. And when deploying, we'll have 288 sets of linear layer weights for each +MoE layer. If we have 32 EP ranks, then each GPU will hold 288 / 32 = 9 local +physical experts. +""" + +import time +from collections.abc import Sequence +from dataclasses import dataclass + +import torch +from torch.distributed import ProcessGroup, all_reduce + +from vllm.config import ModelConfig, ParallelConfig +from vllm.distributed.parallel_state import ( + get_ep_group, + get_node_count, + in_the_same_node_as, +) +from vllm.distributed.utils import StatelessProcessGroup +from vllm.logger import init_logger +from vllm.model_executor.models.interfaces import MixtureOfExperts + +from .rebalance_algo import rebalance_experts +from .rebalance_execute import rearrange_expert_weights_inplace + +logger = init_logger(__name__) + + +@dataclass +class EplbModelState: + """EPLB metrics.""" + + physical_to_logical_map: torch.Tensor + """ + Mapping from physical experts to logical experts. 
+ + Shape: (num_moe_layers, num_physical_experts) + + # Example + + For a 2-layer MoE model with 6 physical experts and 4 logical experts on 3 + EP ranks, the mapping could look like this: + + ``` + [[0, 1, 2, 3, 0, 1], + [0, 2, 0, 1, 0, 3]] + ``` + """ + logical_to_physical_map: torch.Tensor + """ + Mapping from logical experts to physical experts. + + This is a sparse matrix, where -1 indicates no mapping. + + Shape: (num_moe_layers, num_logical_experts, num_redundant_experts + 1) + + # Example + + For a 2-layer MoE model with 6 physical experts and 4 logical experts on 3 + EP ranks, the mapping could look like this: + + ``` + [[[0, 4, -1], + [1, 5, -1], + [2, -1, -1], + [3, -1, -1]], + [[0, 2, 4], + [3, -1, -1], + [1, -1, -1], + [5, -1, -1]]] + ``` + """ + logical_replica_count: torch.Tensor + """ + Number of replicas for each logical expert. + This is exactly the non-`-1` count in the `logical_to_physical_map`. + + Shape: (num_moe_layers, num_logical_experts) + + # Example + For a 2-layer MoE model with 6 physical experts and 4 logical experts on 3 + EP ranks, the count could look like this: + + ``` + [[2, 2, 1, 1], + [3, 1, 1, 1]] + """ + + expert_load_pass: torch.Tensor + """ + Expert load during this forward pass. + We use the token count each expert processes as the load. + + Shape: (num_moe_layers, num_physical_experts) + """ + expert_load_window: torch.Tensor + """ + A sliding window of expert load. + + Shape: (window_size, num_moe_layers, num_physical_experts) + + NOTE: The expert_load_view now records load for all physical experts + rather than just local experts. This ensures consistent load statistics + across different dispatch methods (naive all-to-all, DeepEP, pplx-kernels). + The recorded load will be multiplied by dp_size when using naive all-to-all + due to each DP rank contributing the same token set to the calculation. 
+ See: + https://github.com/vllm-project/vllm/pull/22167#pullrequestreview-3086143856 + """ + model_name: str + model: MixtureOfExperts + + +class EplbState: + """ + EplbState of each expert parallel model. Key is the model config hash. + """ + + def __init__(self, parallel_config: ParallelConfig, device: torch.device): + self.parallel_config = parallel_config + self.device = device + self.model_states: dict[str, EplbModelState] = {} + """ + Current step in the sliding window. + + Different from `expert_rearrangement_step`, + each EP rank may have its own `expert_load_window_step`. + """ + self.expert_load_window_step: int = 0 + """ + Size of the expert load sliding window. + This is a constant and is taken from the config. + """ + self.expert_load_window_size: int = 0 + """ + Steps after last rearrangement. + Will trigger a rearrangement if it exceeds the threshold. + + NOTE: Keep in mind that all EP ranks need to have the same + `expert_rearrangement_step` value to ensure synchronization. + Otherwise, the rearrangement will hang at collective + communication calls. + """ + self.expert_rearrangement_step: int = 0 + """ + Interval for expert rearrangement steps. + This is a constant and is taken from the config. + """ + self.expert_rearrangement_step_interval: int = 0 + + @staticmethod + def build_initial_global_physical_to_logical_map( + num_routed_experts: int, + num_redundant_experts: int, + ) -> Sequence[int]: + """ + Build an initial expert arrangement using the following structure: + [original routed experts, redundant experts] + + Returns: + physical_to_logical_map (Sequence[int]): A list of integers, + where each integer is the index of the logical expert + that the corresponding physical expert maps to. 
+ """ + global_physical_to_logical_map = list(range(num_routed_experts)) + global_physical_to_logical_map += [ + i % num_routed_experts for i in range(num_redundant_experts) + ] + return global_physical_to_logical_map + + def validate_ep_configuration(self, new_model: MixtureOfExperts): + """ + Validate that the expert parallel configuration of + the new model is the same as the existing models. + """ + if len(self.model_states) > 0: + model = next(iter(self.model_states.values())).model + if ( + model.num_routed_experts != new_model.num_routed_experts + or model.num_redundant_experts != new_model.num_redundant_experts + or model.num_physical_experts != new_model.num_physical_experts + or model.num_logical_experts != new_model.num_logical_experts + or model.num_expert_groups != new_model.num_expert_groups + ): + raise RuntimeError( + "Model: {} " + "with config {} " + "{} {} {} {} " + "mismatch with new model {} " + "with config {} " + "{} {} {} {}".format( + type(model), + model.num_routed_experts, + model.num_redundant_experts, + model.num_physical_experts, + model.num_logical_experts, + model.num_expert_groups, + type(new_model), + new_model.num_routed_experts, + new_model.num_redundant_experts, + new_model.num_physical_experts, + new_model.num_logical_experts, + new_model.num_expert_groups, + ) + ) + + def add_model( + self, + model: MixtureOfExperts, + model_config: ModelConfig, + global_expert_load: torch.Tensor | None = None, + old_global_expert_indices: torch.Tensor | None = None, + rank_mapping: dict[int, int] | None = None, + ): + """ + Build the initial EPLB state. + """ + self.validate_ep_configuration(model) + physical_to_logical_map_list = ( + EplbState.build_initial_global_physical_to_logical_map( + model.num_routed_experts, + model.num_redundant_experts, + ) + ) + physical_to_logical_map = torch.tensor( + physical_to_logical_map_list, + device=self.device, + ) + # Assuming 8 GPUs per node, this supports up to + # (1023 + 1) / 8 = 128 nodes for now. 
+ # TODO(rui): make this configurable + MAX_EXPERT_REDUNDANCY = 1023 + assert model.num_redundant_experts <= MAX_EXPERT_REDUNDANCY, ( + f"num_redundant_experts {model.num_redundant_experts} " + f"must be less than or equal to {MAX_EXPERT_REDUNDANCY}" + ) + max_slots_per_logical_expert = MAX_EXPERT_REDUNDANCY + 1 + logical_to_physical_map = torch.full( + (model.num_logical_experts, max_slots_per_logical_expert), + -1, + device=self.device, + ) + logical_replica_count = torch.zeros( + (model.num_logical_experts,), + device=self.device, + dtype=torch.long, + ) + + for i in range(model.num_physical_experts): + logical_idx = physical_to_logical_map[i] + logical_to_physical_map[logical_idx, logical_replica_count[logical_idx]] = i + logical_replica_count[logical_idx] += 1 + + # Duplicate initial mapping for all layers + physical_to_logical_map = ( + physical_to_logical_map.unsqueeze(0) + .expand( + model.num_moe_layers, + -1, + ) + .contiguous() + ) + logical_to_physical_map = ( + logical_to_physical_map.unsqueeze(0) + .expand( + model.num_moe_layers, + -1, + -1, + ) + .contiguous() + ) + logical_replica_count = ( + logical_replica_count.unsqueeze(0) + .expand( + model.num_moe_layers, + -1, + ) + .contiguous() + ) + + expert_load_pass = torch.zeros( + (model.num_moe_layers, model.num_physical_experts), + dtype=torch.int32, + device=self.device, + ) + self.expert_load_window_size = self.parallel_config.eplb_config.window_size + expert_load_window = torch.zeros( + ( + self.expert_load_window_size, + model.num_moe_layers, + model.num_physical_experts, + ), + dtype=torch.int32, + device=self.device, + ) + + # Set the initial progress of rearrangement to 3/4 + eplb_step_interval = self.parallel_config.eplb_config.step_interval + self.expert_rearrangement_step = max( + 0, eplb_step_interval - eplb_step_interval // 4 + ) + self.expert_rearrangement_step_interval = eplb_step_interval + + if global_expert_load is not None: + ep_group = get_ep_group().device_group + assert 
global_expert_load.shape == ( + model.num_moe_layers, + model.num_logical_experts, + ) + assert global_expert_load.dtype == torch.int64 + + num_replicas = model.num_physical_experts + num_groups = model.num_expert_groups + num_nodes = get_node_count() + num_gpus = ep_group.size() + + if num_gpus % num_nodes != 0: + num_nodes = 1 + logger.warning_once( + f"num_gpus % num_nodes != 0, " + "not using hierarchical rearrangement algorithm.\n" + f"{num_gpus=}, {num_nodes=}" + ) + + # Get new expert mappings + ( + new_physical_to_logical_map, + new_logical_to_physical_map, + new_logical_replica_count, + ) = rebalance_experts( + global_expert_load, + num_replicas, + num_groups, + num_nodes, + num_gpus, + ) + + max_physical_slots = new_logical_to_physical_map.shape[-1] + assert max_physical_slots <= logical_to_physical_map.shape[-1] + new_logical_to_physical_map = torch.nn.functional.pad( + new_logical_to_physical_map, + (0, logical_to_physical_map.shape[-1] - max_physical_slots), + value=-1, + ) + physical_to_logical_map = new_physical_to_logical_map.to(self.device) + logical_to_physical_map.copy_(new_logical_to_physical_map) + logical_replica_count.copy_(new_logical_replica_count) + + model.set_eplb_state( + expert_load_pass, + logical_to_physical_map, + logical_replica_count, + ) + if global_expert_load is not None: + rearrange_expert_weights_inplace( + old_global_expert_indices, + new_physical_to_logical_map, + model.expert_weights, + ep_group, + False, + rank_mapping, + ) + self.expert_rearrangement_step = 0 + + self.model_states[model_config.compute_hash()] = EplbModelState( + physical_to_logical_map, + logical_to_physical_map, + logical_replica_count, + expert_load_pass, + expert_load_window, + model_config.model, + model, + ) + + def step( + self, + is_dummy: bool = False, + is_profile: bool = False, + log_stats: bool = False, + ) -> None: + """ + Step the EPLB state. 
+ + Args: + is_dummy (bool): If `True`, this is a dummy step and the load + metrics recorded in this forward pass will not count. + Defaults to `False`. + is_profile (bool): If `True`, perform a dummy rearrangement + with maximum communication cost. This is used in + `profile_run` to reserve enough memory + for the communication buffer. + log_stats (bool): If `True`, log the expert load metrics. + + # Stats + The metrics are all summed up across layers. + - `avg_tokens`: The average load across ranks. + - `max_tokens`: The maximum load across ranks. + - `balancedness`: The ratio of average load to maximum load. + """ + + if is_profile: + self.rearrange(is_profile=True) + return + + if is_dummy: + # Do not record load metrics for dummy steps + for eplb_model_state in self.model_states.values(): + eplb_model_state.expert_load_pass.zero_() + + if log_stats: + # Sync the expert load pass for each model (main and drafter). + # expert_load_pass: (num_moe_layers, num_physical_experts) + expert_load_pass_list = self._sync_load_pass() + ep_group = get_ep_group().device_group + for expert_load_pass, eplb_model_state in zip( + expert_load_pass_list, self.model_states.values() + ): + # num_tokens_per_rank: (num_moe_layers, num_ranks) + num_tokens_per_rank = ( + expert_load_pass.reshape( + expert_load_pass.shape[0], ep_group.size(), -1 + ) + .sum(dim=-1) + .float() + ) + + # Compute balancedness ratio: + # for each layer: + # (mean load across ranks) / (max load across ranks) + avg_tokens_tensor = num_tokens_per_rank.mean(dim=0).sum(dim=0) + max_tokens_tensor = num_tokens_per_rank.max(dim=0).values.sum(dim=0) + + # Just to make type checker happy + tokens_tensors: list[float] = torch.stack( + [avg_tokens_tensor, max_tokens_tensor] + ).tolist() + avg_tokens, max_tokens = tokens_tensors + balancedness = avg_tokens / max_tokens if max_tokens > 0 else 0.0 + + if ep_group.rank() == 0: + logger.info( + "EPLB step: %d for model %s: avg_tokens=%.2f, " + "max_tokens=%d, 
# NOTE(review): `rearrange`, `recv_state`, `get_eep_state`, `_allreduce_list`
# and `_sync_load_pass` are methods of the EPLB state class whose header lies
# above this chunk (presumably `EplbState`, the name `get_eep_state` calls
# `recv_state` through). They are written at column 0 only because the chunk
# arrived with its indentation collapsed; indent them one level when placing
# them back inside the class. `_node_count_with_rank_mapping` is module-level.


def rearrange(
    self,
    is_profile: bool = False,
    execute_shuffle: bool = True,
    global_expert_loads: list[torch.Tensor] | None = None,
    rank_mapping: dict[int, int] | None = None,
) -> list[torch.Tensor] | None:
    """
    Rearrange the experts according to the current load.

    Args:
        is_profile (bool): If `True`, perform a dummy rearrangement.
            This is used in `profile_run` to reserve enough memory,
            no memory movement will be performed. Default is False.
        execute_shuffle (bool): If `True`, execute the shuffle
            in elastic expert parallel (EEP). Default is True.
        global_expert_loads (list[torch.Tensor] | None): The global expert
            loads when scaling is done in EEP.
            List of expert loads for the main and drafter
            (when spec decode is used) models.
        rank_mapping (dict[int, int] | None): The rank mapping
            when scaling is done in EEP.

    Returns:
        When `execute_shuffle` is False, the all-reduced per-model logical
        expert loads (one tensor per entry of `self.model_states`); no
        weights are moved in that case. Otherwise `None`.
    """

    ep_group = get_ep_group().device_group
    ep_rank = ep_group.rank()

    time_start = None
    is_main_rank = ep_rank == 0
    if is_main_rank:
        torch.cuda.synchronize()
        time_start = time.perf_counter()
        logger.info("Rearranging experts %s...", "(profile)" if is_profile else "")

    if global_expert_loads is None:
        # Map the physical expert load to global logical experts
        global_expert_load_windows = []
        if not execute_shuffle:
            # Tell the receiving ranks (see `recv_state`) how many models
            # will follow.
            num_models = torch.tensor(
                [len(self.model_states)], dtype=torch.int32, device="cpu"
            )
            torch.distributed.broadcast(
                num_models, group=get_ep_group().cpu_group, group_src=0
            )

        for eplb_model_state in self.model_states.values():
            logical_expert_load_window = torch.zeros(
                self.expert_load_window_size,
                eplb_model_state.model.num_moe_layers,
                eplb_model_state.model.num_logical_experts,
                dtype=eplb_model_state.expert_load_window.dtype,
                device=eplb_model_state.expert_load_window.device,
            )
            # Sum the load of every physical replica into its logical expert.
            logical_expert_load_window.scatter_add_(
                dim=-1,
                index=eplb_model_state.physical_to_logical_map.unsqueeze(0)
                .expand_as(eplb_model_state.expert_load_window)
                .long(),
                src=eplb_model_state.expert_load_window,
            )

            if not execute_shuffle:
                # Shapes the receiving ranks need to allocate their buffers.
                metadata = torch.tensor(
                    [
                        eplb_model_state.model.num_moe_layers,
                        eplb_model_state.model.num_logical_experts,
                        eplb_model_state.physical_to_logical_map.shape[1],
                    ],
                    dtype=torch.int32,
                    device="cpu",
                )
                torch.distributed.broadcast(
                    metadata, group=get_ep_group().cpu_group, group_src=0
                )

            # Collapse the sliding window into one load value per expert.
            global_expert_load_window = logical_expert_load_window.sum(dim=0)
            global_expert_load_windows.append(global_expert_load_window)
        # Perform all-reduce to get the expert load across all ranks for each model
        global_expert_load_windows = self._allreduce_list(
            global_expert_load_windows
        )
        if not execute_shuffle:
            for eplb_model_state in self.model_states.values():
                # (num_moe_layers, old_num_physical_experts)
                old_global_expert_indices = eplb_model_state.physical_to_logical_map
                torch.distributed.broadcast(
                    old_global_expert_indices, group=ep_group, group_src=0
                )
            return global_expert_load_windows
    else:
        assert execute_shuffle
        global_expert_load_windows = global_expert_loads

    # TODO(bowen): Treat differently for prefill and decode nodes
    eplb_model_state = next(iter(self.model_states.values()))
    model = eplb_model_state.model
    num_replicas = model.num_physical_experts
    num_groups = model.num_expert_groups
    if rank_mapping is not None and len(rank_mapping) == ep_group.size():
        # NOTE(yongji): scale down, we need to rebalance the experts on
        # remaining GPUs, transfer the experts while we haven't shutdown
        # the GPUs to be released.
        cpu_group = get_ep_group().cpu_group
        num_nodes = _node_count_with_rank_mapping(cpu_group, rank_mapping)
        num_gpus = sum(new_rank != -1 for new_rank in rank_mapping.values())
        num_replicas = (
            num_replicas // ep_group.size() * num_gpus
        )  # handle num replicas change
    else:
        num_nodes = get_node_count()
        num_gpus = ep_group.size()

    if num_gpus % num_nodes != 0:
        # Log the detected topology first, then fall back to a single node.
        logger.warning_once(
            "num_gpus % num_nodes != 0, "
            "not using hierarchical rearrangement algorithm.\n"
            f"{num_gpus=}, {num_nodes=}"
        )
        # The fallback has to rebind the local that is handed to
        # `rebalance_experts` below. Storing it on `self` (as the code did
        # before) left the un-divisible node count in use.
        num_nodes = 1

    for eplb_model_state, global_expert_load_window in zip(
        self.model_states.values(), global_expert_load_windows
    ):
        # Get new expert mappings for the model
        (
            new_physical_to_logical_map,
            new_logical_to_physical_map,
            new_logical_replica_count,
        ) = rebalance_experts(
            global_expert_load_window,
            num_replicas,
            num_groups,
            num_nodes,
            num_gpus,
        )

        # Update expert weights
        rearrange_expert_weights_inplace(
            eplb_model_state.physical_to_logical_map,
            new_physical_to_logical_map,
            eplb_model_state.model.expert_weights,
            ep_group,
            is_profile,
            rank_mapping,
        )

        if not is_profile:
            if (
                eplb_model_state.physical_to_logical_map.shape[1]
                != new_physical_to_logical_map.shape[1]
            ):
                # The number of physical experts changed, so the old tensor
                # cannot be updated in place; replace it.
                eplb_model_state.physical_to_logical_map = (
                    new_physical_to_logical_map.to(
                        eplb_model_state.physical_to_logical_map.device
                    )
                )
            else:
                eplb_model_state.physical_to_logical_map.copy_(
                    new_physical_to_logical_map
                )
            max_physical_slots = new_logical_to_physical_map.shape[-1]
            assert (
                max_physical_slots
                <= eplb_model_state.logical_to_physical_map.shape[-1]
            )
            # Right-pad with -1 so the new map matches the stored map's
            # last dimension before the in-place copy.
            new_logical_to_physical_map = torch.nn.functional.pad(
                new_logical_to_physical_map,
                (
                    0,
                    eplb_model_state.logical_to_physical_map.shape[-1]
                    - max_physical_slots,
                ),
                value=-1,
            )
            eplb_model_state.logical_to_physical_map.copy_(
                new_logical_to_physical_map
            )
            eplb_model_state.logical_replica_count.copy_(new_logical_replica_count)

    if is_main_rank:
        assert time_start is not None
        torch.cuda.synchronize()
        time_end = time.perf_counter()
        logger.info(
            "Rearranged experts%sin %.2f seconds.",
            " (profile) " if is_profile else " ",
            time_end - time_start,
        )
    return None


@staticmethod
def recv_state() -> tuple[list[torch.Tensor], list[torch.Tensor]]:
    """
    Receive the expert load and old placement from the master rank.

    Mirrors the `execute_shuffle=False` path of `rearrange`: first the number
    of models, then for every model a 3-element metadata tensor
    (num_moe_layers, num_logical_experts, num_old_physical_experts), the
    all-reduced expert load, and the old physical-to-logical placement.

    Returns:
        A pair `(global_expert_loads, old_global_expert_indices_per_model)`,
        each holding one tensor per model.
    """
    # NOTE(review): with more than one model, `_allreduce_list` on the
    # sending side all-reduces a single concatenated tensor, while this side
    # issues one `all_reduce` per model -- confirm the collectives pair up
    # in multi-model (e.g. spec-decode) setups.
    ep_group = get_ep_group()
    num_models_tensor = torch.empty(1, dtype=torch.int32, device="cpu")
    torch.distributed.broadcast(
        num_models_tensor, group=ep_group.cpu_group, group_src=0
    )
    num_models = num_models_tensor.item()
    global_expert_loads = []
    old_global_expert_indices_per_model = []
    for _ in range(num_models):
        metadata = torch.empty(3, dtype=torch.int32, device="cpu")
        torch.distributed.broadcast(metadata, group=ep_group.cpu_group, group_src=0)
        num_moe_layers, num_logical_experts, num_old_physical_experts = (
            metadata.tolist()
        )
        # This rank contributes zeros to the all-reduce.
        global_expert_load = torch.zeros(
            (num_moe_layers, num_logical_experts),
            dtype=torch.int64,
            device=ep_group.device,
        )
        all_reduce(global_expert_load, group=ep_group.device_group)
        old_global_expert_indices = torch.empty(
            (num_moe_layers, num_old_physical_experts),
            dtype=torch.int64,
            device=ep_group.device,
        )
        torch.distributed.broadcast(
            old_global_expert_indices,
            group=ep_group.device_group,
            group_src=0,
        )
        global_expert_loads.append(global_expert_load)
        old_global_expert_indices_per_model.append(old_global_expert_indices)
    return global_expert_loads, old_global_expert_indices_per_model


@classmethod
def get_eep_state(
    cls, parallel_config: ParallelConfig
) -> tuple[
    list[torch.Tensor] | None,
    list[torch.Tensor] | None,
    dict[int, int] | None,
]:
    """
    Receive the state needed to join an existing expert-parallel group.

    Receives the number of local physical experts from rank 0, then the
    expert loads and old placements via `recv_state`, and derives the old
    EP size from them.

    Side effect: overwrites
    `parallel_config.eplb_config.num_redundant_experts` with the value
    implied by the new EP size.

    Returns:
        `(global_expert_loads, old_global_expert_indices_per_model,
        rank_mapping)` where `rank_mapping` maps every old EP rank to itself.
    """
    num_local_physical_experts = torch.empty(1, dtype=torch.int32, device="cpu")
    torch.distributed.broadcast(
        num_local_physical_experts,
        group=get_ep_group().cpu_group,
        group_src=0,
    )
    num_local_physical_experts = int(num_local_physical_experts.item())
    new_ep_size = get_ep_group().world_size
    global_expert_loads, old_global_expert_indices_per_model = (
        EplbState.recv_state()
    )

    # EP configuration for all models has to be the same so as eplb config
    num_logical_experts = global_expert_loads[0].shape[1]
    parallel_config.eplb_config.num_redundant_experts = (
        num_local_physical_experts * new_ep_size - num_logical_experts
    )
    assert (
        old_global_expert_indices_per_model[0].shape[1] % num_local_physical_experts
        == 0
    )
    old_ep_size = (
        old_global_expert_indices_per_model[0].shape[1]
        // num_local_physical_experts
    )
    rank_mapping = {old_ep_rank: old_ep_rank for old_ep_rank in range(old_ep_size)}
    return (
        global_expert_loads,
        old_global_expert_indices_per_model,
        rank_mapping,
    )


def _allreduce_list(self, tensor_list: list[torch.Tensor]) -> list[torch.Tensor]:
    """
    All-reduce a list of tensors.

    A single tensor is reduced in place and the input list is returned.
    Several tensors are concatenated along dim 0, reduced with one
    collective, and returned as slices of the concatenated result (the
    input tensors themselves are left untouched in that case).
    """
    if len(tensor_list) == 1:
        all_reduce(tensor_list[0], group=get_ep_group().device_group)
        return tensor_list
    assert all(t.dim() == 2 for t in tensor_list), "All tensors must be 2D."
    assert all(t.shape[1] == tensor_list[0].shape[1] for t in tensor_list), (
        "All tensors must have the same shape[1]."
    )
    # Concatenate, all_reduce, then unpack to original shapes.
    # We assume all tensors are 2D and shape[1] (the expert dimension)
    # is the same across all models.
    shapes = [t.shape for t in tensor_list]
    concat_tensor = torch.cat(tensor_list, dim=0)

    ep_group = get_ep_group().device_group
    all_reduce(concat_tensor, group=ep_group)

    all_reduce_list = []
    offset = 0
    for shape in shapes:
        all_reduce_list.append(concat_tensor[offset : offset + shape[0], :])
        offset += shape[0]
    return all_reduce_list


def _sync_load_pass(self) -> list[torch.Tensor]:
    """
    Sync the expert load pass across all ranks for log stats.
    Doesn't update the expert load pass in eplb_model_state.
    """
    # Clone so the all-reduce does not modify the live counters.
    load_pass_list = [
        eplb_model_state.expert_load_pass.clone()
        for eplb_model_state in self.model_states.values()
    ]
    return self._allreduce_list(load_pass_list)


def _node_count_with_rank_mapping(
    pg: ProcessGroup | StatelessProcessGroup,
    rank_mapping: dict[int, int],
) -> int:
    """
    Count the nodes that still host at least one surviving rank.

    Args:
        pg: The process group whose ranks are inspected.
        rank_mapping: Old rank -> new rank; `-1` marks a rank that is
            pending shutdown. Every rank of `pg` must be a key (asserted).

    Returns:
        The number of distinct nodes found among the ranks whose mapping
        is not `-1`. Always 1 for a single-rank group.
    """
    if isinstance(pg, ProcessGroup):
        world_size = torch.distributed.get_world_size(group=pg)
    else:
        world_size = pg.world_size

    if world_size == 1:
        return 1

    # Build node assignment map
    node_assignment = [0] * world_size  # rank -> node_id, 0 means unassigned
    next_node_id = 0

    for current_rank in range(world_size):
        if node_assignment[current_rank] != 0:
            continue  # Already assigned to a node

        assert current_rank in rank_mapping
        if rank_mapping[current_rank] == -1:
            continue  # Pending shutdown

        # Assign current rank to a new node
        next_node_id += 1
        node_assignment[current_rank] = next_node_id

        # Find all ranks on the same node as current_rank
        same_node_flags = in_the_same_node_as(pg, current_rank)
        for other_rank, is_same_node in enumerate(same_node_flags):
            if is_same_node and node_assignment[other_rank] == 0:
                node_assignment[other_rank] = next_node_id

    return next_node_id
+""" + +import numpy as np +import torch + + +def balanced_packing( + weight: torch.Tensor, num_packs: int +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Pack n weighted objects to m packs, such that each bin contains exactly + n/m objects and the weights of all packs are as balanced as possible. + + Parameters: + weight: [X, n], the weight of each item + num_packs: number of packs + + Returns: + pack_index: [X, n], the pack index of each item + rank_in_pack: [X, n], the rank of the item in the pack + """ + num_layers, num_groups = weight.shape + assert num_groups % num_packs == 0 + groups_per_pack = num_groups // num_packs + + device = weight.device + + if groups_per_pack == 1: + pack_index = torch.arange( + weight.size(-1), dtype=torch.int64, device=device + ).expand(weight.shape) + rank_in_pack = torch.zeros_like(weight, dtype=torch.int64, device=device) + return pack_index, rank_in_pack + + weight_np = weight.cpu().numpy() + + # Sort and get indices in decending order + indices_np = np.argsort(-weight_np, axis=-1) + + pack_index_np = np.full((num_layers, num_groups), -1, dtype=np.int64) + rank_in_pack_np = np.full((num_layers, num_groups), -1, dtype=np.int64) + + # Run the packing algorithm + for i in range(num_layers): + pack_weights = [0.0] * num_packs + pack_items = [0] * num_packs + + for group in indices_np[i]: + # Find a pack with capacity that has the lowest weight + pack = min( + (j for j in range(num_packs) if pack_items[j] < groups_per_pack), + key=pack_weights.__getitem__, + ) + + assert pack_items[pack] < groups_per_pack + pack_index_np[i, group] = pack + rank_in_pack_np[i, group] = pack_items[pack] + pack_weights[pack] += weight_np[i, group] + pack_items[pack] += 1 + + pack_index = torch.from_numpy(pack_index_np).to(device) + rank_in_pack = torch.from_numpy(rank_in_pack_np).to(device) + + return pack_index, rank_in_pack + + +def replicate_experts( + weight: torch.Tensor, num_phy: int +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + 
Replicate `num_log` experts to `num_phy` replicas, such that the maximum + load of all replicas is minimized. + + Parameters: + weight: [X, num_log] + num_phy: total number of experts after replication + + Returns: + phy2log: [X, num_phy], logical expert id of each physical expert + rank: [X, num_phy], the replica rank + logcnt: [X, num_log], number of replicas for each logical expert + """ + n, num_log = weight.shape + num_redundant = num_phy - num_log + assert num_redundant >= 0 + device = weight.device + phy2log = torch.arange(num_phy, dtype=torch.int64, device=device).repeat(n, 1) + rank = torch.zeros(n, num_phy, dtype=torch.int64, device=device) + logcnt = torch.ones(n, num_log, dtype=torch.int64, device=device) + arangen = torch.arange(n, dtype=torch.int64, device=device) + for i in range(num_log, num_phy): + redundant_indices = (weight / logcnt).max(dim=-1).indices + phy2log[:, i] = redundant_indices + rank[:, i] = logcnt[arangen, redundant_indices] + logcnt[arangen, redundant_indices] += 1 + return phy2log, rank, logcnt + + +def rebalance_experts_hierarchical( + weight: torch.Tensor, + num_physical_experts: int, + num_groups: int, + num_nodes: int, + num_gpus: int, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Parameters: + weight: [num_moe_layers, num_logical_experts] + num_physical_experts: number of physical experts after replication + num_groups: number of expert groups + num_nodes: number of server nodes, where the intra-node network + (e.g., NVLink) is faster + num_gpus: number of GPUs, must be a multiple of `num_nodes` + + Returns: + physical_to_logical_map (torch.Tensor): + [num_moe_layers, num_physical_experts] + logical_to_physical_map (torch.Tensor): + [num_moe_layers, num_logical_experts, X] + logical_count (torch.Tensor): + [num_moe_layers, num_logical_experts] + """ + num_layers, num_logical_experts = weight.shape + assert num_logical_experts % num_groups == 0 + group_size = num_logical_experts // num_groups + assert 
num_groups % num_nodes == 0 + groups_per_node = num_groups // num_nodes + assert num_gpus % num_nodes == 0 + assert num_physical_experts % num_gpus == 0 + phy_experts_per_gpu = num_physical_experts // num_gpus + + def inverse(perm: torch.Tensor) -> torch.Tensor: + inv = torch.empty_like(perm) + inv.scatter_( + 1, + perm, + torch.arange(perm.size(1), dtype=torch.int64, device=perm.device).expand( + perm.shape + ), + ) + return inv + + # Step 1: pack groups to nodes + tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1) + group_pack_index, group_rank_in_pack = balanced_packing(tokens_per_group, num_nodes) + log2mlog = ( + ( + (group_pack_index * groups_per_node + group_rank_in_pack) * group_size + ).unsqueeze(-1) + + torch.arange(group_size, dtype=torch.int64, device=group_pack_index.device) + ).flatten(-2) + mlog2log = inverse(log2mlog) + + # Step 2: construct redundant experts within nodes + # [num_layers * num_nodes, num_logical_experts // num_nodes] + tokens_per_mlog = weight.gather(-1, mlog2log).view( + -1, num_logical_experts // num_nodes + ) + phy2mlog, phyrank, mlogcnt = replicate_experts( + tokens_per_mlog, num_physical_experts // num_nodes + ) + + # Step 3: pack physical_experts to GPUs + # [num_layers * num_nodes, num_physical_experts // num_nodes] + tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog) + pack_index, rank_in_pack = balanced_packing(tokens_per_phy, num_gpus // num_nodes) + phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack + pphy2phy = inverse(phy2pphy) + + pphy2mlog = phy2mlog.gather( + -1, pphy2phy + ) # [num_layers * num_nodes, num_log_per_nodes] + pphy2mlog = ( + pphy2mlog.view(num_layers, num_nodes, -1) + + torch.arange( + 0, + num_logical_experts, + num_logical_experts // num_nodes, + device=group_pack_index.device, + ).view(1, -1, 1) + ).flatten(-2) + pphy2log = mlog2log.gather(-1, pphy2mlog) + pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1) + logcnt = mlogcnt.view(num_layers, 
-1).gather(-1, log2mlog) + return pphy2log, pphyrank, logcnt + + +def rebalance_experts( + weight: torch.Tensor, + num_replicas: int, + num_groups: int, + num_nodes: int, + num_gpus: int, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Entry point for expert-parallelism load balancer. + + Parameters: + weight: [layers, num_logical_experts], the load statistics for all + logical experts + num_replicas: number of physical experts, must be a multiple of + `num_gpus` + num_groups: number of expert groups + num_nodes: number of server nodes, where the intra-node network + (e.g, NVLink) is faster + num_gpus: number of GPUs, must be a multiple of `num_nodes` + + Returns: + physical_to_logical_map: + [layers, num_replicas], the expert index of each replica + logical_to_physical_map: + [layers, num_logical_experts, X], the replica indices for each + expert + expert_count: + [layers, num_logical_experts], number of physical + replicas for each logical expert + """ + num_layers, num_logical_experts = weight.shape + weight = weight.float() + if num_groups % num_nodes == 0: + # use hierarchical load-balance policy + phy2log, phyrank, logcnt = rebalance_experts_hierarchical( + weight, num_replicas, num_groups, num_nodes, num_gpus + ) + else: + # use global load-balance policy + phy2log, phyrank, logcnt = rebalance_experts_hierarchical( + weight, num_replicas, 1, 1, num_gpus + ) + num_redundant_experts = num_replicas - num_logical_experts + maxlogcnt = num_redundant_experts + 1 + log2phy: torch.Tensor = torch.full( + (num_layers, num_logical_experts, maxlogcnt), + -1, + dtype=torch.int64, + device=logcnt.device, + ) + log2phy.view(num_layers, -1).scatter_( + -1, + phy2log * maxlogcnt + phyrank, + torch.arange(num_replicas, dtype=torch.int64, device=log2phy.device).expand( + num_layers, -1 + ), + ) + return phy2log, log2phy, logcnt + + +__all__ = ["rebalance_experts"] diff --git a/distributed/eplb/rebalance_execute.py b/distributed/eplb/rebalance_execute.py new file 
mode 100644 index 0000000..5c1efba --- /dev/null +++ b/distributed/eplb/rebalance_execute.py @@ -0,0 +1,431 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +The actual execution of the rearrangement. + +This involves the exchange of expert weights between GPUs. +""" + +from collections.abc import Iterable, MutableSequence, Sequence +from functools import partial + +import torch +from torch.distributed import ( + P2POp, + ProcessGroup, + all_gather, + batch_isend_irecv, + get_global_rank, +) + + +def idx_local_to_global( + local_idx: int, + local_cnt: int, + ep_rank: int, +) -> int: + """ + Convert a local expert index to a global expert index. + """ + return ep_rank * local_cnt + local_idx + + +def idx_global_to_local( + global_idx: int, + local_cnt: int, + ep_rank: int, +) -> int: + """ + Convert a global expert index to a local expert index. + """ + return global_idx - ep_rank * local_cnt + + +def global_idx_to_rank( + global_idx: int, + local_cnt: int, +) -> int: + """ + Convert a global expert index to a rank index. + """ + return global_idx // local_cnt + + +def get_ep_ranks_with_expert( + idx: int, + num_local_experts: int, + old_indices: Sequence[int], + new_indices: Sequence[int], +) -> tuple[MutableSequence[int], MutableSequence[int]]: + """ + Get the ranks of the experts that need to be exchanged. + + Args: + idx: The index of the expert. + num_local_experts: The number of local experts. + old_indices: The old indices of the experts. + new_indices: The new indices of the experts. + + Returns: + A tuple of two lists: + - The ranks of the experts that need to be sent. + - The ranks of the experts that need to be received. 
+ """ + global2rank = partial( + global_idx_to_rank, + local_cnt=num_local_experts, + ) + + ranks_to_send: list[int] = [] + ranks_to_recv: list[int] = [] + + for i, e in enumerate(old_indices): + if e == idx: + rank = global2rank(i) + if not ranks_to_send or ranks_to_send[-1] != rank: + ranks_to_send.append(rank) + + for i, e in enumerate(new_indices): + if e == idx: + rank = global2rank(i) + if not ranks_to_recv or ranks_to_recv[-1] != rank: + ranks_to_recv.append(rank) + + # Remove those ranks that can get this expert locally. + ranks_to_send_set = set(ranks_to_send) + ranks_to_recv_actual = [ + rank for rank in ranks_to_recv if rank not in ranks_to_send_set + ] + + return ranks_to_send, ranks_to_recv_actual + + +def shuffle_layer( + num_local_experts: int, + ep_rank: int, + old_indices: Sequence[int], + new_indices: Sequence[int], + expert_weights: Iterable[torch.Tensor], + expert_weights_buffer: Sequence[torch.Tensor], + ep_group: ProcessGroup, +) -> None: + """ + Perform expert weights rearrangement of one layer. + """ + local2global = partial( + idx_local_to_global, + local_cnt=num_local_experts, + ep_rank=ep_rank, + ) + + # 0. Do nothing for experts that did not change. + is_unchanged = [ + old_indices[local2global(i)] == new_indices[local2global(i)] + for i in range(num_local_experts) + ] + + # 1. Perform weight copy inside the local rank. + is_received_locally = is_unchanged[:] + for src in range(num_local_experts): + src_global = local2global(src) + for dst in range(num_local_experts): + dst_global = local2global(dst) + if is_received_locally[dst]: + continue + if old_indices[src_global] == -1 or new_indices[dst_global] == -1: + continue + if old_indices[src_global] == new_indices[dst_global]: + is_received_locally[dst] = True + for weight, buffer in zip(expert_weights, expert_weights_buffer): + buffer[dst].copy_(weight[src]) + + p2p_ops: list[P2POp] = [] + + # 2. Initiate sending of weights. 
+ experts_send_loc: dict[int, int] = {} + for src in range(num_local_experts): + expert = old_indices[local2global(src)] + if expert == -1: + continue + if expert in experts_send_loc: + continue + experts_send_loc[expert] = src + + # We need to sort here to match send/recv + for expert, src in sorted(experts_send_loc.items()): + ranks_to_send, ranks_to_recv = get_ep_ranks_with_expert( + expert, + num_local_experts, + old_indices, + new_indices, + ) + + # Calculate the ranks to send by this rank + num_dst_per_sender = len(ranks_to_recv) // len(ranks_to_send) + sender_pos = ranks_to_send.index(ep_rank) + recv_begin = sender_pos * num_dst_per_sender + recv_end = recv_begin + num_dst_per_sender + recv_ranks = ranks_to_recv[recv_begin:recv_end] + + # Tackle remainders + remainder_start = len(ranks_to_send) * num_dst_per_sender + recver_pos = remainder_start + sender_pos + if recver_pos < len(ranks_to_recv): + recv_ranks.append(ranks_to_recv[recver_pos]) + + for dst in recv_ranks: + dst_global = get_global_rank(ep_group, dst) + p2p_ops += [ + P2POp( + torch.distributed.isend, + weight[src], + dst_global, + ) + for weight in expert_weights + ] + + # 3. Initiate receiving of weights. 
+ experts_recv_loc: dict[int, int] = {} + for dst in range(num_local_experts): + if is_received_locally[dst]: + continue + expert = new_indices[local2global(dst)] + if expert == -1: + continue + if expert in experts_recv_loc: + continue + experts_recv_loc[expert] = dst + + # We need to sort here to match send/recv + for expert, dst in sorted(experts_recv_loc.items()): + ranks_to_send, ranks_to_recv = get_ep_ranks_with_expert( + expert, + num_local_experts, + old_indices, + new_indices, + ) + + # Calculate the rank to recv by this rank + num_dst_per_sender = len(ranks_to_recv) // len(ranks_to_send) + recver_pos = ranks_to_recv.index(ep_rank) + remainder_start = len(ranks_to_send) * num_dst_per_sender + if recver_pos < remainder_start: + src = ranks_to_send[recver_pos // num_dst_per_sender] + else: + src = ranks_to_send[recver_pos - remainder_start] + + src_global = get_global_rank(ep_group, src) + p2p_ops += [ + P2POp( + torch.distributed.irecv, + weight[dst], + src_global, + ) + for weight in expert_weights_buffer + ] + + # 4. Execute the P2P operations. The real communication happens here. + if p2p_ops: + reqs = batch_isend_irecv(p2p_ops) + for req in reqs: + req.wait() + + # 5. Copy the weights from the buffer back to the original weights. 
+ for dst in range(num_local_experts): + if is_unchanged[dst]: + continue + if is_received_locally[dst]: + for weight, buffer in zip(expert_weights, expert_weights_buffer): + weight[dst].copy_(buffer[dst]) + else: + expert = new_indices[local2global(dst)] + if expert == -1: + continue + src = experts_recv_loc[expert] + for weight, buffer in zip(expert_weights, expert_weights_buffer): + weight[dst].copy_(buffer[src]) + + +def rearrange_expert_weights_inplace( + old_global_expert_indices: torch.Tensor, + new_global_expert_indices: torch.Tensor, + expert_weights: Sequence[Iterable[torch.Tensor]], + ep_group: ProcessGroup, + is_profile: bool = False, + rank_mapping: dict[int, int] | None = None, +) -> None: + """ + Rearranges the expert weights in place according to the new expert indices. + + The value of the indices arguments are logical indices of the experts, + while keys are physical. + + Args: + old_global_expert_indices: Shape (num_moe_layers, num_physical_experts). + new_global_expert_indices: Shape (num_moe_layers, num_physical_experts). + expert_weights: A sequence of shape (num_moe_layers)(weight_count) + of tensors of shape (num_local_physical_experts, hidden_size_i). + For example, a linear layer may have up and down projection, + so weight_count = 2. Each weight's hidden size can be different. + ep_group: The device process group for expert parallelism. + is_profile (bool): If `True`, do not perform any actual weight copy. + This is used during profile run, where we only perform dummy + communications to reserve enough memory for the buffers. + rank_mapping: A dictionary mapping old rank to new rank. 
+ """ + if rank_mapping is not None: + if len(rank_mapping) == ep_group.size(): + # scale down + new_global_expert_indices = _map_new_expert_indices_with_rank_mapping( + new_global_expert_indices, + rank_mapping, + ) + else: + # scale up + old_global_expert_indices = _map_old_expert_indices_with_rank_mapping( + old_global_expert_indices, + rank_mapping, + ep_group.size(), + ) + + assert old_global_expert_indices.shape[1] == new_global_expert_indices.shape[1] + + num_moe_layers, num_physical_experts = old_global_expert_indices.shape + assert len(expert_weights) == num_moe_layers + + num_local_physical_experts = next(iter(expert_weights[0])).shape[0] + assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts) + + ep_rank = ep_group.rank() + ep_size = ep_group.size() + assert num_physical_experts == ep_size * num_local_physical_experts + + # A buffer to hold the expert weights in one layer during the exchange. + # NOTE: Currently we assume the same weights across different layers + # have the same shape. + expert_weights_buffer = [torch.empty_like(w) for w in expert_weights[0]] + + if is_profile: + # Maximum send size is to send all local experts to all ranks, + # So we use a dummy `all_gather` to reserve enough communication buffer + for weight, buffer in zip(expert_weights[0], expert_weights_buffer): + # A `/dev/null`-like buffer to avoid real memory allocation + dummy_recv_buffer = [buffer for _ in range(ep_size)] + # NOTE(bowen): Needed this barrier to avoid OOM during actual + # execution. I'm not very sure why this is needed + torch.distributed.barrier() + all_gather( + dummy_recv_buffer, + weight, + group=ep_group, + ) + return + + old_global_expert_indices_cpu = old_global_expert_indices.cpu() + new_global_expert_indices_cpu = new_global_expert_indices.cpu() + + # NOTE(bowen): We need this synchronize to run, but I don't know why. + # If you figure out the reason, please let me know -- thank you! 
+ torch.cuda.synchronize() + + for layer in range(num_moe_layers): + shuffle_layer( + num_local_physical_experts, + ep_rank, + old_global_expert_indices_cpu[layer].tolist(), + new_global_expert_indices_cpu[layer].tolist(), + expert_weights[layer], + expert_weights_buffer, + ep_group, + ) + + +def _map_old_expert_indices_with_rank_mapping( + old_global_expert_indices: torch.Tensor, + rank_mapping: dict[int, int], + new_ep_size: int, +) -> torch.Tensor: + """ + Map the old global expert indices to the new global expert indices. + + Args: + old_global_expert_indices: + Shape (num_layers, old_ep_size * num_local_physical_experts). + rank_mapping: Mapping from old rank to new rank. + new_ep_size: New expert parallelism size. + + Returns: + Mapped expert indices with shape + (num_layers, new_ep_size * num_local_physical_experts). + """ + num_layers, old_num_physical_experts = old_global_expert_indices.shape + assert rank_mapping, "Rank mapping is required" + + # Get sizes from parameters and rank_mapping + old_ep_size = len(rank_mapping) + num_local_physical_experts = old_num_physical_experts // old_ep_size + new_num_physical_experts = new_ep_size * num_local_physical_experts + + # Create mapped tensor with new shape, initialized to -1 + mapped_expert_indices = torch.full( + (num_layers, new_num_physical_experts), + fill_value=-1, + dtype=old_global_expert_indices.dtype, + device=old_global_expert_indices.device, + ) + + # Handle rank mapping (scale up/down with rank changes) + for old_rank in range(old_ep_size): + new_rank = rank_mapping.get(old_rank) + if new_rank is not None and new_rank >= 0 and new_rank < new_ep_size: + # This old rank exists in the new configuration + old_start_idx = old_rank * num_local_physical_experts + old_end_idx = (old_rank + 1) * num_local_physical_experts + new_start_idx = new_rank * num_local_physical_experts + new_end_idx = (new_rank + 1) * num_local_physical_experts + + mapped_expert_indices[:, new_start_idx:new_end_idx] = ( + 
old_global_expert_indices[:, old_start_idx:old_end_idx] + ) + # If new_rank is None or >= new_ep_size, the experts remain -1 + # (scale down case) + + return mapped_expert_indices + + +def _map_new_expert_indices_with_rank_mapping( + new_global_expert_indices: torch.Tensor, + rank_mapping: dict[int, int], +) -> torch.Tensor: + num_layers, new_num_physical_experts = new_global_expert_indices.shape + assert rank_mapping, "Rank mapping is required" + + # Get sizes from parameters and rank_mapping + old_ep_size = len(rank_mapping) + new_ep_size = sum(new_rank != -1 for new_rank in rank_mapping.values()) + num_local_physical_experts = new_num_physical_experts // new_ep_size + old_num_physical_experts = old_ep_size * num_local_physical_experts + + mapped_expert_indices = torch.full( + (num_layers, old_num_physical_experts), + fill_value=-1, + dtype=new_global_expert_indices.dtype, + device=new_global_expert_indices.device, + ) + + for old_rank in range(old_ep_size): + new_rank = rank_mapping[old_rank] + if new_rank >= 0 and new_rank < new_ep_size: + old_start_idx = old_rank * num_local_physical_experts + old_end_idx = (old_rank + 1) * num_local_physical_experts + new_start_idx = new_rank * num_local_physical_experts + new_end_idx = (new_rank + 1) * num_local_physical_experts + + mapped_expert_indices[:, old_start_idx:old_end_idx] = ( + new_global_expert_indices[:, new_start_idx:new_end_idx] + ) + + return mapped_expert_indices + + +__all__ = ["rearrange_expert_weights_inplace"] diff --git a/distributed/kv_events.py b/distributed/kv_events.py new file mode 100644 index 0000000..7b5cb94 --- /dev/null +++ b/distributed/kv_events.py @@ -0,0 +1,371 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import queue +import threading +import time +from abc import ABC, abstractmethod +from collections import deque +from collections.abc import Callable +from dataclasses import asdict +from itertools import count +from 
queue import Queue +from typing import Any + +import msgspec +import zmq + +from vllm.config.kv_events import KVEventsConfig +from vllm.logger import init_logger +from vllm.v1.core.kv_cache_utils import ExternalBlockHash + +logger = init_logger(__name__) + + +class EventBatch( + msgspec.Struct, + array_like=True, # type: ignore[call-arg] + omit_defaults=True, # type: ignore[call-arg] + gc=False, # type: ignore[call-arg] +): + ts: float + events: list[Any] + data_parallel_rank: int | None = None + + +class KVCacheEvent( + msgspec.Struct, + array_like=True, # type: ignore[call-arg] + omit_defaults=True, # type: ignore[call-arg] + gc=False, # type: ignore[call-arg] + tag=True, +): + """Base class for all KV cache-related events""" + + +MEDIUM_GPU = "GPU" + + +class BlockStored(KVCacheEvent): + block_hashes: list[ExternalBlockHash] + parent_block_hash: ExternalBlockHash | None + token_ids: list[int] + block_size: int + lora_id: int | None + medium: str | None + + +class BlockRemoved(KVCacheEvent): + block_hashes: list[ExternalBlockHash] + medium: str | None + + +class AllBlocksCleared(KVCacheEvent): + pass + + +class KVEventBatch(EventBatch): + events: list[BlockStored | BlockRemoved | AllBlocksCleared] + + +class EventPublisher(ABC): + """Lightweight publisher for EventBatch batches with data parallelism + support. + + In data parallel setups, each DP rank runs its own EventPublisher instance + to avoid duplicate events and ensure proper event attribution: + + - Each DP rank creates a separate publisher + - Publishers automatically annotate events with their data_parallel_rank + - This allows consumers to distinguish events from different DP ranks + + The publisher is responsible for adding DP metadata since the scheduler + operates independently of DP topology and shouldn't need DP awareness. 
+ """ + + def __init__(self, data_parallel_rank: int = 0) -> None: + self._data_parallel_rank = data_parallel_rank + + @abstractmethod + def publish(self, events: EventBatch) -> None: + """Emit events in order. + + Implementations should guarantee at-least-once delivery and + monotonic ordering (e.g., via sequence numbers). + """ + + @abstractmethod + def shutdown(self) -> None: + """Shutdown the publisher.""" + + +class NullEventPublisher(EventPublisher): + """No-op implementation (default when disabled).""" + + def publish(self, events) -> None: + return + + def shutdown(self) -> None: + return + + +class ZmqEventPublisher(EventPublisher): + """Reliable PUB/ROUTER publisher with an in-memory replay buffer. + + Spawns a separate thread to handle publishing from a queue. + + Parameters + ---------- + endpoint: + PUB address. Use `tcp://*:5557` to bind or `tcp://host:5557` to + connect. + replay_endpoint: + Optional ROUTER address for replay requests. When given, subscribers can + request missed batches by sending the starting sequence number as an + 8-byte big-endian integer. + buffer_steps: + Number of past batches to keep for replay. + hwm: + ZeroMQ high-water-mark for PUB socket. + max_queue_size: + Maximum number of events to buffer in memory. + topic: + Topic to publish events to. 
+ """ + + SHUTDOWN_TIMEOUT: float = 1.0 + END_SEQ = (-1).to_bytes(8, "big", signed=True) + + def __init__( + self, + data_parallel_rank: int, + endpoint: str = "tcp://*:5557", + replay_endpoint: str | None = None, + buffer_steps: int = 10_000, + hwm: int = 100_000, + max_queue_size: int = 100_000, + topic: str = "", + ) -> None: + # Storage + super().__init__(data_parallel_rank) + self._event_queue = Queue[EventBatch | None](maxsize=max_queue_size) + self._buffer = deque[tuple[int, bytes]](maxlen=buffer_steps) + + # ZMQ sockets + self._ctx = zmq.Context.instance() + self._pub: zmq.Socket | None = None + self._replay: zmq.Socket | None = None + self._dp_rank = data_parallel_rank + + self._endpoint = self.offset_endpoint_port(endpoint, self._dp_rank) + self._replay_endpoint = self.offset_endpoint_port( + replay_endpoint, self._dp_rank + ) + self._hwm = hwm + self._socket_setup() + + # Payload + self._seq_gen = count() + self._topic_bytes = topic.encode("utf-8") + + # Thread + self._running = True + logger.info("Starting ZMQ publisher thread") + + self._thread = threading.Thread( + target=self._publisher_thread, daemon=True, name="zmq-publisher" + ) + self._thread.start() + + def publish(self, events: EventBatch) -> None: + if not self._running: + raise RuntimeError("Publisher is closed") + if events.data_parallel_rank is None: + events.data_parallel_rank = self._data_parallel_rank + self._event_queue.put(events) + + def shutdown(self) -> None: + """Stop the publisher thread and clean up resources.""" + self._running = False + self._event_queue.put_nowait(None) + + start = time.time() + pending_items = True + while pending_items and (time.time() - start < self.SHUTDOWN_TIMEOUT): + pending_items = not self._event_queue.empty() + if pending_items: + time.sleep(0.1) + + if pending_items: + logger.warning( + "Warning: Queue still has %s items after %s seconds timeout", + self._event_queue.qsize(), + self.SHUTDOWN_TIMEOUT, + ) + + if self._thread.is_alive(): + 
self._thread.join(timeout=self.SHUTDOWN_TIMEOUT) + + # Clean up ZMQ resources + try: + if self._pub is not None: + self._pub.close(linger=0) + if self._replay is not None: + self._replay.close(linger=0) + finally: + pass # Do not terminate context; other sockets may use it + + def _socket_setup(self) -> None: + """Initialize sockets + https://pyzmq.readthedocs.io/en/v19.0.0/morethanbindings.html#thread-safety + """ + if self._pub is None: + self._pub = self._ctx.socket(zmq.PUB) + self._pub.set_hwm(self._hwm) + # Heuristic: bind if wildcard / * present, else connect. + # bind stable, connect volatile convention + if self._endpoint is not None and ( + "*" in self._endpoint + or "::" in self._endpoint + or self._endpoint.startswith("ipc://") + or self._endpoint.startswith("inproc://") + ): + self._pub.bind(self._endpoint) + elif self._endpoint is not None: + self._pub.connect(self._endpoint) + + # Set up replay socket: use ROUTER + # 1) handles multiple REQ clients (identities) + # 2) lets us send back one request → many replies (streamed events) + # 3) works in our non‑blocking poll loop alongside PUB + if self._replay_endpoint is not None: + self._replay = self._ctx.socket(zmq.ROUTER) + self._replay.bind(self._replay_endpoint) + + def _publisher_thread(self) -> None: + """Background thread that processes the event queue.""" + self._pack = msgspec.msgpack.Encoder() + + assert self._pub is not None # narrows type for mypy + + while self._running or self._event_queue.qsize() > 0: + # --- replay (non-critical) --------------------------------- + if self._replay is not None and self._replay.poll(0): + try: + self._service_replay() + except Exception as e: + logger.exception("Error in replay: %s", e) + + # --- main queue (critical) --------------------------------- + try: + event = self._event_queue.get(timeout=0.1) + if event is None: + break # Sentinel received, exit thread + except queue.Empty: + continue + + try: + seq = next(self._seq_gen) + + payload = 
self._pack.encode(event) + seq_bytes = seq.to_bytes(8, "big") + self._pub.send_multipart((self._topic_bytes, seq_bytes, payload)) + + self._buffer.append((seq, payload)) + self._event_queue.task_done() + + except Exception as e: + # Publishing failed; back-off a bit to avoid a tight error loop + logger.exception("Error in publisher thread: %s", e) + time.sleep(0.1) + + def _service_replay(self) -> None: + """If a replay request is waiting, send buffered batches.""" + assert self._replay is not None # narrows type for mypy + + frame = self._replay.recv_multipart() + if len(frame) != 3: + logger.warning("Invalid replay request: %s", frame) + return + client_id, _, start_seq_bytes = frame + start_seq = int.from_bytes(start_seq_bytes, "big") + + for seq, buf in self._buffer: + if seq >= start_seq: + # [identity, empty_delim, seq_bytes, payload] + # (identity, empty_delim) are stripped off by the router + # receiving payload is (seq_bytes, payload) + self._replay.send_multipart( + (client_id, b"", seq.to_bytes(8, "big"), buf) + ) + # Send end of sequence marker + # receiving payload is (-1, b""") + self._replay.send_multipart((client_id, b"", self.END_SEQ, b"")) + + @staticmethod + def offset_endpoint_port( + endpoint: str | None, data_parallel_rank: int + ) -> str | None: + """Helper function to offset the port in an endpoint by + the data parallel rank. 
+ + Args: + endpoint: The endpoint string + (e.g., "tcp://*:5557" or "inproc://cache") + data_parallel_rank: The data parallel rank to offset by + + Returns: + The endpoint with the port offset by data_parallel_rank + or suffix appended + """ + # Do nothing if input is None or data_parallel_rank is 0 + if not endpoint or data_parallel_rank == 0: + return endpoint + + if "inproc" in endpoint: + return f"{endpoint}_dp{data_parallel_rank}" + if "tcp" in endpoint: + if endpoint and ":" in endpoint: + # Get everything after the last colon (the port) + last_colon_idx = endpoint.rfind(":") + base_addr = endpoint[:last_colon_idx] + base_port = int(endpoint[last_colon_idx + 1 :]) + new_port = base_port + data_parallel_rank + return f"{base_addr}:{new_port}" + return endpoint + raise ValueError("Invalid endpoint: must contain 'inproc' or 'tcp'") + + +class EventPublisherFactory: + _registry: dict[str, Callable[..., EventPublisher]] = { + "null": NullEventPublisher, + "zmq": ZmqEventPublisher, + } + + @classmethod + def register_publisher(cls, name: str, ctor: Callable[..., EventPublisher]) -> None: + if name in cls._registry: + raise KeyError(f"publisher '{name}' already registered") + cls._registry[name] = ctor + + @classmethod + def create( + cls, config: KVEventsConfig | None, data_parallel_rank: int = 0 + ) -> EventPublisher: + """Create publisher from a config mapping.""" + if ( + config is None + or not config.enable_kv_cache_events + or config.publisher == "null" + ): + return NullEventPublisher() + + config_dict = asdict(config) + + kind = config_dict.pop("publisher") + config_dict.pop("enable_kv_cache_events") + try: + constructor = cls._registry[kind] + except KeyError as exc: + raise ValueError(f"Unknown event publisher '{kind}'") from exc + return constructor(data_parallel_rank=data_parallel_rank, **config_dict) diff --git a/distributed/kv_transfer/README.md b/distributed/kv_transfer/README.md new file mode 100644 index 0000000..39377aa --- /dev/null +++ 
b/distributed/kv_transfer/README.md @@ -0,0 +1,29 @@ + +# Distributed KV cache transfer + +This folder implements distributed KV cache transfer across vLLM instances. +Currently the main use case is for disaggregated prefilling. + +## Abstractions + +The KV cache transfer contains three layers of abstraction: + +- KV pipe: a FIFO pipe for torch.tensor transmission. Key APIs: `send_tensor` and `recv_tensor`. +- KV lookup buffer: a lookup buffer for KV caches. Key: the tokens, value: the KV caches (and/or hidden states). Key APIs: `insert` and `drop_select` (similar to SQL semantics). +- KV connector: a connector that connects the KV pipe and KV lookup buffer to vLLM. Key APIs: `send_kv_caches_and_hidden_states` and `recv_kv_caches_and_hidden_states`. + +Why we need the KV lookup buffer: a FIFO pipe by itself is not enough, because the prefill vLLM worker may process requests in a different order than the decode vLLM worker. For example, when the QPS is very high, the prefill worker may handle requests in the order A -> B -> C, while the decode worker processes request C first. A FIFO pipe cannot handle this case naturally, so we provide the KV lookup buffer to turn a FIFO pipe into a lookup buffer. + +NOTE: The KV pipe layer is bypassable: you can skip this layer if your distributed +communication service already supports key-value-based lookup (like Redis or an +RDMA database). + +NOTE: If you want to not only transfer KV caches, but adjust the model execution flow of vLLM as well (for example, allow vLLM to receive KV caches on some tokens and do prefill on the remaining tokens), you can bypass both the KV pipe layer and the KV lookup buffer layer and implement directly on the KV connector layer. Bear in mind that as vLLM's model input is constantly changing, such an implementation is likely to break when vLLM is updated. + +## Disaggregated prefilling + +The example usage is in [this file](../../../examples/online_serving/disaggregated_prefill.sh).
+ +Here is the diagram of how we run disaggregated prefilling. + +![Disaggregated prefill workflow](./disagg_prefill_workflow.jpg) diff --git a/distributed/kv_transfer/__init__.py b/distributed/kv_transfer/__init__.py new file mode 100644 index 0000000..2bf4e1f --- /dev/null +++ b/distributed/kv_transfer/__init__.py @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm.distributed.kv_transfer.kv_transfer_state import ( + KVConnectorBaseType, + ensure_kv_transfer_initialized, + ensure_kv_transfer_shutdown, + get_kv_transfer_group, + has_kv_transfer_group, + is_v1_kv_transfer_group, +) + +__all__ = [ + "get_kv_transfer_group", + "has_kv_transfer_group", + "is_v1_kv_transfer_group", + "ensure_kv_transfer_initialized", + "ensure_kv_transfer_shutdown", + "KVConnectorBaseType", +] diff --git a/distributed/kv_transfer/__pycache__/__init__.cpython-312.pyc b/distributed/kv_transfer/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..588f6e60a312a4a76b37182f65e2f279371fd1be GIT binary patch literal 506 zcmZvYF-ycS7>1L!*B130l+!_Qw%|25oq{;%s5ps3mrzZ-3zxJZNl)wM><@5v^SAg9 zvO2kmHyk*b9tb_W5Aq~mUcQ$%dGB^tfGZw6vpo-hkEZ!6*UxLZ@3t2}0D&9`m_gxX zp71kY1X&soW&9JX29WlBU<&IBDLnEUkogk2LJ#7 literal 0 HcmV?d00001 diff --git a/distributed/kv_transfer/__pycache__/kv_transfer_state.cpython-312.pyc b/distributed/kv_transfer/__pycache__/kv_transfer_state.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f40e2e0f3306cc6754b7f76416865bd251ba08f0 GIT binary patch literal 2680 zcma)7L2nyH6rQoyyK6g6;y7+g1I|L*xNaM}X-gZbDkwy0Nh6$+xiWE=}NL)A|l_MuaRDM8z00$&4Hm#Isl}eQ;2W~E|REZOBc5TO40XnOlc{A_L zdvD%*^UW`jNC-jGxv#W8MTCB1lTH)bMC&IY?xR~sMJm=&8Rv1?m-k^d_v!vJm*-sC zuLsI}o_A?Z7s_H@EC=&Jj0mO!AREv_<#0ai@_Bur9LYz37L-B4slp~$BSWfq1#GE7 z!WDgL=pL6J2099K80ZmoR2}HZy6c=8xfjUCf(ST+>X152d}{PHSA!}Pd36N1qiS4@ 
z!R~lXNFDlv9Vz8=BDs%m%*%7{&(AF`WUo2mQpMIxL(x;%i7l?onTA0Mwn^VpERw5L z2rQ5EEMHX^vj%#4SDCQ9GUW{QNG_W?+Zld2T72 zouA7sEz7gl=Ce76r^K#Oqre>QG#?Ot_**BTyN~S7I@%JhLuwwh=U?D`&P_ONc5_zM zETvSUq@>tHl@?c|0&_sJsbW|~LZymA6D!k`M%Pv|m72X~8kr3^bOtQjR9mfr-OTL` zxnr@HmMt6HN>^%598}3YQnKp~9RZUu_$>*#EwmR-)WfHm$QR&u#l@B%Mwq|g(ZG0} zOVD8uIyh`hPjA>%RK+*ZHqO#P;Lst~vE4KZofBVEtOJ~h1MevORs=fNneS9EU*}G_ zd!1mRQ#}O?5}n2Y;pT1{i2LX++T!=K^ckDu8n&5)+wLNp?Qq)zmUn0@m&zxgbZ(6l zZcAEGvLS~onH`HPacpg4YJV*qlAtYk!jP{3i)*XU&_*YAdHdiv7L?!VaSDq_kGu$lIZW=)&y8_lhO5V6a$HC9gsbYnHq*bwG_U2ud*F3Fx*^^9T~7b>Ya~X!6V8Q(+j$ z-J#KE;rQ3u<2TPg3SX#m7rf#LvpXY0XaOl`5H>qQa2?#s+F50IU1(PXkO&3fcPs0A zAXsIsv(>)Fs_S6RdathbE}Gu2zOAI)c?i%tJQ4j3D%E{L*!ucFKNq39_($J^c3msJ z8u(GeDgWk0Hx&zAg!x}k^3sz{`5oSikHb+wS2_kJQ-U+n3ma_#mY?3`Zax{5Q64&H zxIhVj4-2jb`IH^e;XhhhUYuV}`6#>Ut__RDNor?4K@6)(NuTyETGC@6dmer(2OSi^ z2ohsE!ik1(qApDCii1ra1xI(piH10_BhLIP&g=np4J4l>&h8{;8;RN7SYj_K?L?D} zXma=U(=P;WJk$`!ngJ9!+4Q5}yZ?c5Ghnh8aptfB~U)>pUCmT~<&HJ~|34 zI6DiIBU)=!TQ%<({rEU`Kn%LFtnjbHZ;eB@g`NpV>%zpIIJP4mZ-~bqixazvx4Z+1 zNe^?|HdzH_Q->Vhu2nRnM47{sL7|gocO%{No}_zn%jj+n)2r|lQ+6+$saM}r^j>wZ zF3!CAzKtmtAH2eGFL2tcKkot`bv12FxgN4tSJn(bC$L5+yEtByhh$kX;LTMSuvxO~ zzIhI}YMMG_FAimp;+7RHg{^ka-)@(7==Sh3 zrO%Ij&qIidKH(;waMMrEU96*6J$~|s3ypZ@5xUsqIXqq;Ki5JqHU}|2{jJ?XFgzDW z@Z0s|<%hXOa<++J@^h}mhUOXc0sas__mASxZuE_&JD#0>INmtB;7*F604naBs!odHl%P*G7*Q~lOz z^3lQMdH^*W4g1;4nzS4y59ma_IOW2UbLqvjsyn&vjAF&F*gpwpVC3fE<>NnhUgE;V zOY#a=6_u1#w6E*v>ggNYFuiMLZee-P%E9sBBPVAUS8pF*KYw^YV8qkNsOV=gu_>u9 z($X_tzRJwYFDQIhR9y1@Lrra6eM4hYb60myZ(skX&tJyICnl$+XTG7(nB|q#??2Yo zH*kAD_YV$#;g61glZygC^;fa}F4^zoVk65%Nli^fP4}Bz6qNqtLd8Z+bM`VVyQT@< z11}CyxiET8t>oP5P6n|ncd%UcPevKJ#pTiGaKDN67s>wn1PlL{B>THy|Bwp>V5XuV z4;~d800JQVIy8pjdvhX?hv%IakF{9>D^Ppw-&9FPPD_g?F`0^_=iPRt*BI&(YHM#( zy5+k`a{{uQXAN}#=TTS3M*o$HFhdkLa;(uf3{qa&EmrenluI(%9tgcT?n%4HxEe+b3BuFNLJJh5|B+h^GMX$uZ|Y zwkSqNZGOCZj~w1R1uW+3{eA12kkEgf@b~`u-?o>yxCR$WJcLgH^(8C71MQChUrr0< z@2)vNiS57EO#WlzHl3c1clM)_QvhV(6c7?A{Etn5kI#Rd@YnA8Kc<(AyxvRZoiv4b 
z96MfI_^+Y6!4$cHa#MDCFxdH z*W%%Mt5d+b?*S*paS1^J`MHcJa0;ltqI(J$N;l|DUcyc@Z+LgP*A5%-PH%x>^A)jR1Y2<$MaV+5E?n!F<%2Wu;DPZPRK60H|VWVyp`oW)=Io#P!;E?w?IUDuSJe2W5 z-oIQfz}oldTFA|OR8x*zo&x>iIWkWq{}u^qKmHC=nI@?}I0bki(=@_2$kyx~-hVb{ zEnpRq>`XpI+>^kIyG9IeY(TXQajig1 zC2$?d9U=%0-yaOby3||kyeh3Ia&EZE+t1mUz-PFvdTV^KN?~8OkF>s-_-1nzA_zxf zklTi(aBZDwl5l%Bglb)7LS-S}`@L)ZFaL}-U9=#Z9)*V8_K$Pb@swBMuX^+(G0r3O zPg);hYmdL@zJos@ba)Mv-kZ5Ks<34U@BhMDz&Y@E06bVL7p`bp1+v?AoAvY$bjUQ< zHEt~o$!oigFF`vO;a_1}4n44gy}zQ=D9&ejmu^(g%o+Zev+6npNEg3(SwHopx9SNk z9TgX}&yHpk>EID_3Sd%z0@M%F&`3_07u>@dyOr@_&Pu=LfkTt#S^E^-?bhX8fF567 zr}vgA0TeX@OTt(?%V%A5yc^f`)vf4VtM70BeY?E0!*omfzrUA{gx{W{E#&(y&ar z9PbsDB74h_X1IFNP>>>Z+W;y_492a11Y7B_{$0YtZH^Hbzg!nz-*aUIMCGC5 zZ?s(!q)Kd5547mGg9yYMr+@pw<_4UHP%rJ2g;wW-3HgUGu}k6u+H~%eEG=5Z8oDi-*t_`~bTA z@8 zdw%)1*?R_rIi5VY}>KPO;JDQ_HLqyJ38-q z(-5XeLZZmPDxOe^dOiAK@5{AcTzeAR;AOmqFp&*!i-uCcGiB$t!8;le*K{oA9<#&nCD7WHjs zko4bdT$_UnyL+aqr9!37PJ}-j7Lks32PA%PC?kT4LI#m_JfS}G%mzhp$vzdqY>=UW zgny#s?43uUJBWlqKF7@zEmq?@9l@rx+Y*9-V`c6G&&%|RxqeDS`(`EagiJs<5MCFu zp+KVgsBXjqwmH!h-1IijEzZ~OX7pL1D>u695J%V(5?QqX~V)QMcYg?Bxw!o(V(f*!O0O11vBI!k@AKN;l)}rrX2n|93 z$)#QJu6DEd-pon5VFaM(XMs)wjYmt1Qom9^*Sc5xEB^Y+jZ{!2WTz%%W$rs!6-Y^^ zfWlFW(fU&WZkLCUe{^S!E)oW48?aGGe0rFBv5rNfNpJ+a4uq-F5Y^G7u^g$2E$>t~mOC(nvGcYC3>+L4+VZyOC{dI7GT(kwL|} zIV$yDS)23I0rs{Iyg+peU&oM&g>7y2HKlbs6N1Dk;Bys`{J+}Th-^4IZ}yX<;X0&q zA+v$NNcV#iZ@p0otWodaoQm{MEA_F$Ifu_qc3M|Y0a@-$6!A+FvnDy#ALN2pB@Z}r z`Klu&)Ri`&0^rd7#czs-#dwE&Jk75BDS#G!OYhlhdsA*hDsUy%rtQdbVV@YJ$#bMuJXjvZ&~i>~!E{&#qF^Sv%dL_Oex_-EX2 zI$Se<)Iptd^Ul`&PJNMM`!*1M8UqAwfP?Wyh^G zdv2d=8B#^pc?w{hQS5ZGr^b6!9$`r+^Ph;Y9BOyH2`i2Uqe=-l4aX zN83jEYiG2s#-{|bs#=V*U%1E18k!8{B8@NZJwURwwUK>@QA4QfAgS5-SJKatAm)dh zCi)L_IB!$@u2A1WR>a{-T>ok?*ZwJ@dFMmg2L16^u&xebkf6SR?8?P|#-*a`OFU__ zJY;d%&k5aCRj=)R`}{%DUGevqLuhG7EQM!r-u>)|%lHQv5AJ?@1)k9@@sHLP&wFY( z`DWYc+=D$``@11LkRD(Zi5+({z104AC&}r(!u!wP>2G4i3#snA&$g4F@+)CbF?f*i zN3tF52u)n%RB%vg$L&E~&9OtQB2`+WvzHuGxL)vV#>*{vrS1=JEuR9;Z(~bIz0zW* 
z0R7x|Be5EC_;JNZ4Gp3)-m_W7RvR+_BScRmeAcZa$OA7>sz--xza(_??b^MIYf1}r z#I!l!$pa`cOHe0U=BhSieNx_BUqen65xP?Oe(rAlAee#s!Tj)nZpC9ty!Y)`8!i^M8$t)Pw~|8FHrN?DVVjJ%QgI{4}i4 zreUL)prCGzkyxy2!RP10^={eU(ipfe@zaud`FoSxneO*t77w4ehr~2*KeB67DzyAY z+DJcU^p+unw^b%hwaMW1usPZ%AL>B72t9oIaa7a0{z_Yy5YUJ@$ysWj0DJQO!==72 zHw)4S1f5b7bw!-2h_v`z3`iJp2S1y)sd(AA2o8lt7>{8RY2tgCwC+X7@;%WXvR2q0 z5RhBBC^o@}pFMO)0Y)6RZ=0xrgG*6(CsXUE1fOm7uj@1~D!2+do$PtsJq?yiD$iq8 z5fFSOb_&8pygnWx0bAtob^g&3-4p1S_TE7F#m?ZWbWHq_mb`#hsAj+VMww(z0q6IG ziHB(~f@1NNJvrPVBKWLBBp!?oiksK&wv)r{RI~|A97x8QhaalKiJkIa5@|n+-@Rol z79f_bS4&hrc^)D}l)$;?V89HTrEAA87}uXTOjZc1>v@VPUifb7{J-ow-fG8B(9|Of6^`9CLotc9iU~Y-*0k^X?Qj)K!O5H4ODUG`w*% zXXvBmgQ+E*L1<2EFmL7vl7tsGc(TuqX@J(T8zfN9Z5A(2X~$TLy>rh68?}mgazz>* zqGx1+{h5u#ioV=nr%=MQiXDRSg&5;_B;{kG5vFBk_aLT@sgd7#N#DZpiQ6?UfRO4z zCHN%!6!376V;R(K%u2k8k0{$jE%t!fyE4D39JH0Iw>IC-0CM`}TDwx)=2NrHFjLq7 zJE5_eIh_#v`TBB90@tW-X|V1N>WfPT+~8r8C9&&rxS65sFdvn&AlVUm8)CgZUsr#; z9P1Y11+VN5VHiPBU2Rz!1Y=qd?lla>y_nJj;9YiCp7*o!=ggttfmE3=za_dKVj`qS zIP`9g4Yn_-#MWBj&HU;MMGk8N0Wlr1zOqkdO}ygq!tfH;4wq3K&ZytR8C)zm#CSI3 zG%d+jLEGsRAVv*6=xl!@tHOUF?>Q)T5o^_cq~gJZoOwlFzze4*O#c6g5^yRk9#PGo z0;aFrn`*~x0ts=%Qa#&Kz;pG*6K{n#~a{8NzO@Uqe5vP@07*Ni2%J84uwkuY{dA$u7GK_-f$@^fpfy55!|!P4Vs5 zU%DFvDz~rWmj$%WFutYBF7Q9rh?NoR((6!dkAZR_pcrHo+L7J+F9yVqodG&R9T(bL zZ_`UW*A#x5V^v{|(F=WmPeZ>Obyp~dX+^FYSpCp!B}jLoylX-jP(fiwqn;Y(CkZ5h zVGmplhNf%rjdf=mMf-%BGCJl(vY=gXM>ggRklEqrcGdEDxa0;jE)W@`kQ8Niew--m zmfG_xUC!Flz+Y99^N!ND?>uJqS81hWl`qlp4H{vm(Us9{Y?9Q(jfi%>L!>Im@P%AtA0v^T~1}m9JsB69k#^xVkLTofg@Iv z3T_4>mi&9#g4z#zo(O%V5L?woeY_K^)@L#zRWR-&Bsb*N=Jv7dcH4Mz)9s6$`=2U# zaK8Q9bPn(F;r%zs%DfB$LJdU5SKAvi2e)${L?&X`g&vwMD=9zDenZz{at$Q=jnuCg zXuP$I22!stoVPaShZ~#nlTs_nbjmSp>^0ka!x{!GwUu_nef9Qm-(eJ;$yDtCloRjW zfx4Q(`N>yEci6P4F&!S|U*4*E%|b%33BZhnS32dzJ;UVyiDJ_}i>UElB>N+nF<#FX z%slISxbY#?>qkL#3wxF)4`aZPH>wut0*tiE==A_t^`h6&wK%`%_g1h>q2BHnG45|) z$ljx05xaQbYs=FsVx!i?EBhWngDBpNe~A6O-+m@bwVi#F$KUGRM{j%UFWDOy-GHk# z0DX$^MB2-}0sdhj!bIQp;#Bq09#L1Yj;X9T 
z@m2=p#nO@|IFbK^;{6l)Y5jMvZG@)z{|)Ke+eJapZl{3Wq^R!$+UWvFPM5&Qch&jE zUS|`%+CoF6t!0h8-2B|l zAuVpe_T3AOeR8Zym0Wm#?CfDY{<8Nn6Nqsg!dfRSV2KmXE!VzUiG)iqd0kN{rJuOa zwVW6*e7>zE?@h164{!pMIph-2^CU~48&%g3S*8|(DlGqz>foz1-r6XWt|KsSxRMu- zn>Z|0F)t_F32ytMy2y%+maN?oDOf5`@SFa2F;UI3bf=DegKSy(5kVZZSTDeHr5$^yia&n{{+Wp zA88VM8IbnnH9HLghx1ZIQu}#)TI!WcG{#-ghP8t@)2Gz#$$tLT>i$^^R!7^n;04}JNMw*Nrwxz}pNsYHPY7}OqloYAv=2ez5>OvN~AU%;)pJS{3N zKw%CY=5hI5a+)c%8|pn!J9UUINQZ|H1NuNlr^=B_q=@BN>42?lTkl?#s2bPD-(5I0 zm1z`Lbp;aj$j<#&IL{#rcRr2)!snpv3Vn;-wZCV%*{#s}^>D+fF~EMVmqL(>ZdK{? zCe+YGrw37uV_M4S@rZ#|XGFJhc)we>@=x-CR(+T$Kis?*nTra&K|zr%+e97K0+AJ* z9#>(RDh$q>v4L@Wxg6I%7n?_ED!vgkW&8QQjdqnzNohUyqYf2Wt)Bl`n4isn)C}$` zEG{?&*zzpj8Tq}}^OxVyt07Z4H4SlXQm)6W?Mo~-`VFS@t3F~$MGE_o+(uVd>Fo=| z*m2eN`AKa7l?7`>Rl*K94)pcR=&kiKgfjN4A6C^_OmBS5{Bl>8!`+*&**PiJV4qu= z-jOCq6j(?!0_^k_A=OkfDnIKc$5)}iv@55Nj0?8ws@%L~Z zJy6aNPQ-O=WKrZp958af8c4B6ME(?*4k2?XrMMdx^LpgEjUaVTTXl;x$40}R>^(>1V7M6_Q+YJiI*{zRPfY0 z!FEiLGj@h9GO4~Y*CQ_Qem=t_?Dg{uEkIfNK%LXZDJ*2Tol?zsNyq`SI-KY$q-x_K z0M&+%uYVbpbZ>b0My=+{rIrs8Uk%KdJ#W#{g(85s*(KJLY9{ z8poh|!O8lLs3UJGCpw5CU}>526hN^7spT%g1Pg@FMQw&n>reZ$O#K)f6!v4ERzLH>+jMsS3xMtXr+V zC!5`(;bkcrmL+`DOU&leoyczsrvS}yVo)hjccPrAvkV3aHP59}q)ajS>gsRbpC6iF zJq29m6DKhK3cSz=0wb>Ay$RP4V%Wik?gaKgfPLlMeMCO z&0_7+!%#sK-jHx%e!cyJMkzL^Z`$vJaq(yPU_cRvXpfzVxLjxl;>(xvh9ch2m4$0X zp4C5xpDv#is(}mE+qkobJDM?8)OE%@^H7^ym=7a$3hL61OKb@m>S8bFl?v5*w;k6S zg+P4G2V}1>zXtYI416f!Fm8WlsS)F~}Q$Ki}sFU+lHx#>4Uk zO7S(lS2YC3#R$_@g>#Bt`s3Zvi*O%v;%0 znXWm96^~3BvR7q;_+6k6Ko+CWvkg7NsD|Dz*0-?WFa#BYvPs1z`{EqMj`Za7oF3EM zu~UF+OD~Hz?n+0!^vM8})!+gsPG8bYD}T*cPNO-m&EOR96nXge6p+`11`^L@k^6w` zQPGp*PnxFy9q%p>-jLk?!bJyxZz8fKrY0;tL$FDt3DCZU+FA0cHAvhB1PwYVhOAY< zwNC*Ik4^#06Tx;x_kYA3)N$<@6Ui`49o-4nFg+42wMY&CI*8OixpDq)F%DD&DfY)x2sYh=q^>u);l~Qku{ME@v+(wStV53Rw zgMSSv|3R4l>l>b2Ym7=^Ylg%wFu!SU0Y?dG60g_(ZG=8@?JAg>zV1vh_&>;))`rIbwV(fvr}P;mF!;?yEBDGlaW0UMx3* z;q>4zrGhMXB+TA%q0%zom&&bM=y$}a71%2RXU&lTiP 
z6v<2hLViYeD}R<7|4rl43@yg5bBDFA3&`Yu?=Pnm&nr6xH~^3E8T8vL;nrCEaFHLP`-)~S~yfQqWjPg0~Ustnnzs6?j(52#$MbJf6#1T z`J`I6wI(o)I(~&xJtX$Xevup@u1o9DB8Ngiz{lU+t(cI&K{<&kKVK>JPh4Q4D5MU0 z!Udu@1)#5x10h1i-;=z-Zmd5h?ph4AW1o>Gk^6TMbYxxo+D`OK!e2+awjW>8pdvr~ zkDw$~J^7&M`e^TsOOmULf2jMzwevI8yJxkV4!F4X$RNV+ZcJ7+5UP?Cn-qEqz`n{j z1tg9R3Zu7k=Z<4zkNp2wN!He+Z42@&q1S=L-y9(8pY!)5&%=?OU>zL$k@prYLQ)H1T!*}ZGr`nIONFj9F%hYoO-Ql zj)@6WhTH+auZ5u5PTrpa){5~!@>KK+iQg6+-4R9!4Hp+zW+4Ai3Hp8SCOZED{#Vcc zHZFhZdF1Gw-*^12&wsJ3E3?~Zi8i?T{$9zXNC>x-PotUxce+BB5Ul%ya`i;H|B*YT z%1+|-5PCJXaCPI+#hGqW$hnhC`;TJYRm4u1<^0FuiTtbHzRj!Sa{8O@g8q8!10 z_5Vpv{1d1O{hQrlcD9Zn@hsPN`}}=}^y3jRKXP#S3eNfuq$S87h)r*Slg7ViU)!1e zk;vx!2OII9=vx1+I?Aa3AT7BDLr9v3iwB>P&x`*Kz?!a~0``FQWS)~DF_O$>vivLD z+PD~(!-J5b#T8Evr>}x;3qJKgBq>OYnWF+(HF=*=`U#|%6e)X~~@g`25IsG*| zaK`P^yz>4d-eEH;7MtFjP~t;^BWx;L0pBS!6o$p{>GVgFcuSwxTv(Oy71balhl{SM zc*=p+VFChfGGCZfRpI5*BASA?wZhmHgt+c{(p6O)TRMRQAIX)tv7tjaPZAO5SZhGf zNFvY;2sh%vuPk%i-FMETjpEvFIWh{L2#pX^yIq(YOG26X)JVUZ+nHIxGjFeJuDw^H zTEAmr7w^DZs>55qfY#wOd-l&L^nYvnqFHSH-87B76f&v=jmi1Y-i?g4Ff3kNV_Vk{ zNof6~u+5N=K$P?Il7f>>v5%Q6Qmx*&%|p}s$I&Pab)szaD{&o5qQSfCS5m_wn{Pun zl9q%*V)Cj=bh#9DO5{C=LipvS$|O53<1=sx>u8G)L!U8#g2N9EFTclBjaKc8ttZ>A zB8+j?hfmrXa1ID#yn1o}QckSjvQMP0{D~Lc`Gvi+o%8VwbYg9uj0DiuFu~MiQCRvw zn#p3^5sU2kNXNFB&D9F|7}j{QMQ&nQ!B^(GOB8qp`gNwq`BxLSB=x;DyR7CxsfKuI&tMH2C&j6mNr@^^N*O zn3q3aifEtn$j)s|G3KDy?`pvWozF|q2w~RA{F)>id?aS@>en?AsE%4;b!9}X8YO4W zC1v0+!tA7QLwf5@oKw>%R0BVbDe3_}wd(I|B${1K3Vw%=!mbO8HGQb^;r{iZ*qQm} z!?ONm(7fRh0O`0U}ZMrwL zPd-wmW#{TVbg{6o@{fF6fzb1HVcF+4`CMWP-f^2Ui|2do4iTkg4p3u#*V>N;ZcCzv zWBGr@)izAlO7fAB*U$MbUgyi12Hsz_&>9*=pc&fp;e9N=z zSlb1=eok*)3r|NWU1Ft}yZ5!Sq43z@5jE{Xe5GOZol##&Bj~M{$wHIy1M;^tJ1Jfy zKZlpSvBvmE)JmThvM|9tihlUcNV1b_1r;yJnIgvJV?H1CP$Y`#x#rLXs&{#%)*F8Q znSs#R`Pt$7xT67I!S|!`@FDEdvb*w-mnA=4FPIiyX_;X#im=YzKotZv+`H$>W+*%R zE3w!9mg;gLz*oq*GD$-O-#5HWx9y@B?u0M*MFZ)a0~x=Rq8HEfKbl6Re&rbezYd6N zO?X|k*U>RxF%fbWmk}Kzw4R=XF=n$|8<9QJ>s4k~Agy|VZ}ro{{E)i$-DMc(GE9hu 
zOACC)#h)K^IAQhXMW*8OX$Z=|GivrTAU$=E&b@=;Tumqi*(SzY3iG%*+!)K~2{hdZ zEIW_s8U;JJ$mI8JU22ymdML&0yqE5$y5ajEjFp zsr&Spj9kp?8n#Bq3lhneNM4PLGaF>0QcjZZA{{>OOYp639T(Dg?#4x_^gAC`L!4(` zxC`)gE*+b$I+#2@43zv#4l&tc4~n|lqt4*-`&1rloSP&n7Y>bX1coz}mN?cw%eX(- zSkvMpQ6f2= zYq+aLCF>~Nx(Dx}>xtENv-jo-K3HEK;_6DR#Ujt>-F8iXg@bf9Pd@``jT7(XF2eNh zS>LLkc#wi9J{Q&TP5k}}>r3Tz-Q1FKlY6LN&-QbeT(*(ruJtPFvCmnia-TfE&alKX zbZuDd=X(+tE<0>FL6M-{-8>o95u%j_cY69vM7 zavr0GpwgCss4i{(_!>}m)ylr$boU}~KiKrJ6Cdq_%juu;w#G}|OiKzSiuts+=y&!X zCUgn&TXb9k0ssLW<+~`@oz%m(_@584T4OH2P_;`UQ>`B2Qd;w0^@D8Hs*V|WS!Z@= z)pzJc>%uRH#GDNmw_J`@U@U%n|58{7IrUUHZ80i;aQ9PRL3tiXhs)YDsCa+2zdGSj zo2AWc-wI6t<(tryYn_i-^($|8q#HiVPf=R9YassV`XfL+?JYV4V@ezE$!iT^1PDKm z&U&dK!l!_Ye=>K=H*Ko&e8ipl7dJYuU$T5)rnLm{dz~mt6KvsIyaugV>_XPURO)lR z^PI1ujpl!5Wi`=mi8_@woio!5p>g8#mfYuCFtOt)3K=^tuPDiFk0bGJhA?2Kn*7qO zLK=dDs2AH1&!#ZcM@!6}6p!PRH*Jjjz;Pi8WMa#PPXvjj%s*L7le_h zje%LRtxVEtCztvT-E+){x=cpO~p4q!wkbi#w8qfcz9GgDge=L*r9(L@znrS;bv+may z)THPswm1Ezv(0pUxbU3#A(PNN7rGu&n_m46-*9b==z$%u#L8q;&Xp%-mDxc%pCovm z0yZ$O#P}j#6W?A|OWsWWwurEa(8mwHLugFYHa%Xns7n87IC-;)%reC~HZaxNUu(%J zK^xx;z2!)U)bWI}kfdgbx29Wd-QKx;IG>Crkh3X!CsFquJ~o=Xy6#HH_%zfq+u%@X z*EO}RgVW+GIq7Y;$Cv{T?jrFvjO@QLajWp% zymE2xs4?ZaTyY(KHyd+MZuW?6cD=0QB}Uow`;G25?{(?KV}G=o60W&k#@F1#?uMyR zyhmJbA7wmWqF%Vg?N>i0JHK-D!3mKYa3NnU%X(qZSct$=oR{ck{^O&Kb~10%fMa&6F=rR+>vwh!jmYLT`I_C~&()1E#eXayo^2SdI2cgZtR2)kkSd&TbNiZpc&_oD zi~vNG{n~Ijd~qupZ4IF&QEa*gUg*L?%JJvW_JVna<)`u9FQr1IxSKTdxme1DvudvJTMc1%!XBb+sf7*9 z>jr%stE1ZM59T>n zRuSkP(<$P#ldevouw8ri%cUD%UB1mT5adEeP)^%I*j<7c%&hbZldArF;B^xo(8k~ubx}zi#PA}SevG6`j4L;pc9<=naDVmJ2DOw^N9hsA4R1f`eY-4IIZ*=?S z6$a0vb-m=NZze{Iy)QYANc6UI!6fnCBaJR7oxItP(_MVv?XKosVqHopW+`UE`Q+lK zQ-ED>0BCOz`$Q5&q9RI7kl2U{5uIcoCn;|GabdcuWLBKFVBDkri_`@z9+K&{b1W^o zv~uvAZw-fL&~IesrQO+y?>O;nG3#Or<1vTy*)J+?q1i5^;g%qeqwf;_1B>xz0GS5i zE@Jb$wFFzMFPJaKD1lkYmngpN3Ndy)smHRPzGyNRu+yuZfvxP$45N_|;!|ysu3$6^ zMIDPk|Mc_q47b9LM^y_^~_M3FlXPV7X--NO%9u;dyhonHr5D(Uy8RL(Fxn z`?BjWYB}u(j-X3>0#SP+M{Vn~t;iylIh&x18{2QEJZ)IuWF?{TBY+Gt-rtdNF 
zMY3PU0^;k@t05lV-OSZi7*?!rlxRQECW27K4dz6-AC%HRR{0A>l}X2Z@8Ii))DU1@qd%>wos1nUUHJa!uAxv5wIe%$==k9j((!$PJKwJ=MSU z3+o3BTVR{0z31yVmHE@Eb@J)@`{^51839AT5$iW~ zX|Df>k$e*}YgJus7IX!ia$QI0wbIEH!>3R7vX_d87d)N`E8+Z~DnMd1)W+&QUg>Hs z3+yFRJaSK-4SVxySbzC$Awr9qwMe8FKU7X_D^!;7eu^FLTBMl^S+u)ao6*HjonD2v zn>LN*8C$w2G;7QTi@fuaJ*DRr-B)t7UlJ{S=O>1)e=#mwVv@vWhq&83c2I;piGMP0 z$>eqkbq1glwV_G)e7Wg0*dbzo@fd1=SC+{jT!%+sye7x=eT`U!cwLZ>&gNZiR($n+ z_lN4`2ItMGCL|Yv0;iFH6S#}3>>B2^#%mF*&(@Vom{cid25;q)>?rB(qzZ3c$>eUGta`P8A_*tXFSZ=JYdp!T`;rbZuH~_^4BZv&CadElU}H!8QYl%E?8FrcaI44)=)3UoJ{mk_}4c9W0eQk4oqax(@F z0bcGYM$Utxw9c>N`}6_q1&)mExCrzqAd<*LkRWQ2!y&M4D@b(6EPCu0^xIW7tD~)I z;yYt$U5w&Yh9O0e)jqkYIn0mS@wCVmdKmfV;Uw? z$bJxIZyxs(#ki#_KQHOGUPH}|#(*Oy{Fpo=ojl&$=Au7yCRfA8z9eLXCu2eKBtC@E z$QP+*@vFHlT3}Ik@uT_W8(E#CL*?O0h5gz^hKH$#-X4*QsQ2eJ*vXPP5G=NSxbId6 zXASQ)_zVx$_}A|oe1f+eIyK_-I>FHk6i~K)H*loKK4EHtCHY|!LDuuG-glO)2H!ca z#Dg1Fn{%$@vfA!4%(RZIsoa{zh+)UC*I}nh0uQZjq7u^{Coc@8+^ptFdF<}r-9EIX zFLNuPQVCbo_2!00r1>!@4D#s2qAUC2*jGg>NQ}tDepmp@#N)E_DM;0?&tD_HNvNxM zK_3urU|ExJ)}1OpB7N7GQRe0DQ$H^)Klzb=aqEkx=3(Wl46=+kx`rNjy>U9bOd!-loW1o5~xwF-|n{@PIw87!r-Y=2t9b(6J^5=M|!Pj3N!% zUBUZfcG(sX()bv<$W~>HVPAXusIT-{k8~6opJHvk(855W%PV;QiB_1cM~87}28jB= zLug?R|8BSU2|oXJ^zEQ>EgHi}-#FMSK6-Ph-j{B78D(X?GQ}se->#G$s0Wgf-gx8e zy4@pvy{=sXM_tm10820odJeX@_dF?b@lXN=I=;$(Jk{(_y)lYEW1_>Q+xfd}nZNMH z{?&c=t{^`Rc#sX1ZQ`cY2Qp70Ien4zarS7O%8DFrp~E!%;gtY?d~Lur(EFU)!$>{3UUJ#MtOQI?}Y(#PeJe#>p;@A}?a^bJHx7JPj z_D%abM={o=g~wb~B5mq_f-IeXZE?kbYNVVXh5%2H6E)67-^NKQ^e497^OAVhs^cq5 zY0=WCdZ_rW_pJV!K#0mTnS~XEi;Xo9beo^MlwC(oHbLsOO@a&d9+$lhidkW_e(YTM za)~*cV#(|aJu|(S2v+AdNP~bJMa8Zbe$B=)`HlPy>2@2oG6K=po02WPYq=25HR*xl8wu8j z_ITq87!)<#4&1-!5qFe`88tcQ)>$Je!>#>X1FNvM>F_vRYtRkD9S*r*{Z!cUri1&! 
z%YNRn0D(Dw4d-bR>l{KKo6zhEtyL$LqUUeuJ4X9(8wz!Mm_MVGHJGni6|>D*wlw+zR$V{ErqAD;7Q(h*CU-lBXDqr@ zRwHW_6^E6_r3+fct!Mc>9Ah442yST((Y@t5u!u&0^5A;iRcrh*X|Tezs(d}6gfcCq zG;7sRlg-qlNmY+15?3wOgD7eON8S?VLw!C;ncA3|y%#4tqpNIhJTdX4Hm!yA$0umr z`E+?86Z`@U$)f<|#pnI(vGT#KM+L#9=GDdq)vi{{+-3P3J~TrYF&LP!?d2fRb7I&H zVNnAMg0NMOatL;_<^jT2-p*dvd*%I+R`Yq}GjMQ!e3)HEUdRyH`BqbXN6ulKflKJc zP7lQ{m8E0MT||}R(-GAu-D?tnm0t>zGk8VqafD$pJZjdkF4|^7OuyoQCO%+XGOm@4=MMCI^(xb0U99`r+|*Q zVbb~Xp!Z){wGTIw-AG*cTNv|i)WTQBR?~F})o*`(_!je{ciUbw z1x_~;miVF#h_^t40eHVM}YEv0cM`qvFCxB?qA@= z(QVCRRE;2J)ss62L)Y{`_40zxQW@U*>#I+%c~8XS_I~wxB5S9jF)aq|P9Fs{KQ`3k z0&VMFDE_*3Q4vzK#5-xO8t+PwWr@rKnjiH7( z-W?iel@sbqfU?;_jq`)EXO6I6XY-oOUp$Nu5mFM+(~KoB-NQ!3VT3I7CJvb&lS^#{{52!;a4!9zhhr>za)IDFC;aCK?Xj9QsB760AZq%!n)M7BG%(uhHux%p!RPmJQ2;hFaU&lSOT8axGVQlUC=GhY`~cmTe3O z%!}>uoHJ*f&Md^0QDDn<$QT&)N5yG8iVF+Zy)zd=jY( zwME#9Bww^2b_M0u+RS7X99H`)p4X^tFxzgNs0#1ZxZQ+qws_hKT&^U$;^^~nn0Y&C z4inNqP0Of97VEAPMFpmbq-Un1fqje?eJzxyPWt(1Kp_hD4#(+azHY1st@zcO4au^_1T8_NhW zAg?VMBP}y~Z(L|fx3W>4N^t^%@7 z1c9kJSvfb3&0D7pPclhdYt0FDCrP35Ox6P~+|G{@KU0a7e0tD-` VyoS(94Q7bHvy3QPzj(r^spo3ajpLn~R&DgWWN)+dLQ?s9eREqH zJW4W7V9nh5kttCso?dM8T4r0y>`tsZ1Q+ zzC?t7Ns`~$+}pgvF*czup<1vHqC=5+vybGFUxGY3+_zPB^g)pt?WPC$6Y_@<-^*PjUS-L z0Ny$C6>=Mhb)zA_{Q=%XQqb6U{{ebSy$mqfbu4~>;tPI&LLZSo!@ixH1{SDRF>=R< zum7X+KgZ+0c07jTba-2BS;B=aACI?HWTbS#^Z)qW6r+ggM<>V_Hmv*gIMph{F$pR-}vVsFq2G@3{i3(`TWqTEPNiuFiv#66R#d` zRCROd4)NO42luUwIr%**uHkZhPd-Xaj49AaQhck%Vkq{rUHs2_i2kNz?__*?Zs*Dr zz+7$w6w90d&9DDtLdLwOD=$36l_)G*i)A0#%Ir7;2Or&#&H~ub7O3hep;VfKnnrNv z5Fp$o+Ji7TvP6jCU1n=-3*ZsPBhr4@JWIf^dwo36aW{l(;xs?R+C(*crB*jJ(Z>7W zLW9T(fL~~a56BrC-LGYI@S?&uv9k@L@5T#JygcCK87Sc0WW@$tJ+5wHhO>8ZWrl9= ztxq*;+)C&h#=QIH*En8+;$-e)x&16^heE)IbrwE_vkwG9O=u0FHcW;nrg+t}yt#T_ zd?5Q z_GwN@uHFa%%9cURHI-5bU%()QsfB|WaimI+3mdvUvhj^rEm!4amg5s*#&?ZW8ip8q z7~L8z2klF*TZ+7_BoRRnXW#E5>XZ^;W@9Fu?q(f1mg2cH(VM}YChMa9o!SN&&cTgT zhOZ$vt>oIaC&6tHUfU6k7uE8QC8d*1x4#`tX}LTujeIKfzPtXZC@q{i(~IMZE{n(ttr`uHl5$Pw;rd)H 
zsx6)7_B#yUg{I-n$1VoMmbC1d&*JmW3tI0{XFMUI*cKaaT&h^>Azj2$=7LFZHlBk} zl6w3ifLS){;OM?q;M8}42l5`RJjWhA=7ja_sj>N`1o0xr>oxWHr7JVG0%VTinpyP0 z+pc;Ess1}J^dDYbaTF;_F0@#`ZkPA=(Z~H-y&S^TYTpJ!1M->iI8DBbKo!E>d1;hg zr~di_56Zxh-M*b??~kTx_GePpEcT;`h)il6v1oEJs5SnnpBJ>;a_e|;{c&df4r$iT z4pt9HeEf9|d_^D$H#=bTBpN>)lzELYpXmPiVuoK=ZPNXM&XdZPj-Q|28*rFa1Zc3R z5M{>9E^DEYg+WhYBh%^LlCD}?$1Guq4M3nx%>NR_u)ehg9x*Ef@CI`+leSg2cZrJlET2a02 z=Gn+FoPf>oywtW@#)3kTXf)t}eVPX!@-`{`00n76iGWw-0jD8{x`L7OgJ7~q?C0JH z9P;nqLlG@!fw=YURL2}qI_L)|;tkV3^dS@gruwUZgK{PylX&EMxQKaga;B8@0DpE> z?pBKo{)jmg*zOY>AsxL@(d46q3*YfN%1%7yg^9qglWOf^jsK8&b2LTAC5(@cQZ4P>H5) zo@qo-dw?u}(m_cN6~H|#%Ez2gf;9Kr1zrxG3n36`+)quBH`7K&7K!){Zo_8}K^Ty?*cq|}HV z2KTB)%|5H(EVhOa1QQlMT~|0}H>esZ$vH}9GZZU8vI@T9k~X;n?n0fMhQES)OR_y0 zbec+mRvI5(RsJe)dE)irX?3OIfL*ilA+)%-1lnb@%vG_!vP1g0xvRKO%NJb*lZ2R{ zREcNxY97|S#1t%q-HF}TIDElH$Ygl)>hh$SwT4&)uFOb_pOe+2%Ub%2A~@e4(|P?< zOt{@^|7^WFO~dC2eP`50=bZ;&&~ehugicuXQYfIb^~_P3tpRVIJ%@4;M> zd8y4MC+cYI9kH(~6&a`~=U?LJleBNVPpB|yh`n%i^P-{B2YwEo)LUB5Yt^2HwnCyO zeY#Pg74u?i2RC@g&U>p_2R1p1+2Lg>R)KO*to&XX%9>a`pGm*2zjD|U>1lPAN>~tS*+AS(H$F+y_uVa;*dZBi(@YDl9 z#6Zq>E^Ijjr-w3YdS?>I8jdKKP-QIceyTXw(EMy)M6?U)t(OL8;XV_tNr#`VMp z-s+Ou_l)AX>q}o$EUCzy{#XW5bf@-++Cx_7cQe{!X*#ZLDN1~ijPT;l6#P_n=7H(` z2-%D!Z`D(Fn)jU#UHcs~ydEAg^;8d7_Ou`S?smZ#o5V}j7|_T9hol1&hA|9KvH+O)tG{s35nqqIZVD#g(H+(@HRmM}>#9W_Q5hp3ji$WJ$8fGr5&AB$o zIeb!IWOOGo9*OOJlHGD8YD$Of%r{0)o!Gx}hhpWVzgadsL31Q4T+GWtbZk=D!tX`w zw|&9qX5u=I!dcIOoYr5uIk0R)aXWp-OJpPW!dYa`*W{y5q@u6?ZZT|Ncl^`8{Vzj1 ze~^ZJ*Xj?0ZU95FxtZ%bw~-F0)m7qM$(^6L0kyh&r53P2U&hQH?*52ErSF7`qmjpf z_^QHveb=QPiTN}+*BR#BMh>BfM_}-A;bO(+rYa<~$+G~)6(keF z9)~*XkT3@n8rru+XKqO37)SHT*y*39ql=RItZhJ)n_mhA#1t~*2pO45-%@KM7i*IB zViwdM>9s+UG7N%Gs?XMo_vb2v?Ldh_!7agZ8_8IC&wjxrCv;Uyp%9}$t@=~B@A-{3 zXc%^COuht)@?qONat5l<>J1g>;L$n#jCM{HQ*M3Jh@M!u7^Z=a=6t%pQ_&cJZ->y# zIhlXj4n-K^bVK*APgSL6ps-4hCdQ{5{gDL_^xVoj&`ka!rXOzBx-rZEzrf%i@d}E7Ip?DfBc3fLl3NTd=^eyG%836V6#ZoV? 
zvL<0!rqS+k@^H`BTK+um1pN^UNN{M)kh!+>y;NGp&E90yfYnLO z_Zv>lM290mqAT_w`W(SKdqJ0#2tdk=W2>8k52C6K3uFSN`kr#?mzTE+WxTc(Y&0c< zSK>(anF~)c4#>>3Rgb<2hJJ)q`lnxfdh_H|W5BZFR<~4_Xc%7UYE_}k$)7sH@?=v!oDCaPV zPHifKA0;6<(Jg{G2l<`ssj4gQR0ky|UeKS5J>BTz+8g6z%UhUg(iTmxLHtxox*h1Z zM4(3Jb>&X0?wqeVqkgy0XsS;3?t>)$$;hbh`TIUVHqhb-$Zw8MEJz*;vQ7l_1V9EC zj)Of<8Q!VZtY&1}(XKMROCR#^l~e-iRV-+^9BqV00trVfhU$?rbWr*z{g$>X*BX0j zQqI^}ent1`ybn}Aotu2aL`mJ_?gYd3xBGKrQSCDdt=Dgl#jn)JzBB6*pN<%M>E0RN zrqcfz89JC|gRsqAxwVF9Ej_?mnj`mJ&@juZ`b`!WB*yP;e>l*YkbQa=dA~!@@M+m%vkt3J#k7>UjR81g?xkzel#~b-J{c5oa<$& z_*Wnsp@FGObEle0YA z6VsHmyzftJ-EsOVZ&`9J$|d4@GFS(_KIGO7e^cVN_N=InxBOoEPR%`d$H(Oy zpHvi`#MO^TM{op}Mpyp?6x$zNbuF)IV4Gx=rMvhs&3wvkUPGM{6*rO`b8VreeN@^B z3H7#G`9XXnADnyW{0hwp3VhYuHL7tR+_1%maQ0txmT>+>)U1UoF)$5(fP#LHUc zJ-S~r$m-|D$M87D-KP-rwK8E$Ugj-60c%0JG_bWHKtM$x9y}H>QyyAKzquLT6->t_ zJ9)|yq`534R483V383XeM2Yl2KtF}H%@Kt|e}KZX$sFi;$zx^+4RSsULKbY!i-^k* z2hd$6cy@9)I0%E+2Sf?ZYLKL0t*}D@WhS74_!E?*+AD|QG2e+)0&%l+e^kO?SLje>&ZwZDa zpf{5Okg-$J?ld4VO|A-{IG9K&=HK7efa@#ujT7d7)XpX#|2!XcWpyFBsQ%sTgnmLNBK zk#=(-02(0_MV4`#+=_N7nKdH<%2ZREy$@qJKA17RpAsoCT`cFk26|1QN8%opXyGDB z9A{zihp*1zmoNqCBBQ(o;+TPf3J8ZD_fogi>%-Io$si=n6jnVS*Oy%w{S!RXI@z4= z!o;Gho?_*P&r+jriGFz1skUtbAFt|~xUDWBrc^fjVq+^PS|D0K%f_8Mcs=I$l5nHs zE6e!+5Q!GAOS&2meuztB)0XsFiK$QO{qBv+osSul7(Xv?41A1r;qLmCZmc zdHID1p_>#6O3Z6twFN2)c8{FP-2PKcC~3_koRe69~a?-r((p1{PCQjzC%Rp zn^|~AC*<8|9PW{wzxs|}q`%`?2`-0#zQ;nwc0EitsVM_FVnUkfJ1|r&fj92$$V) zs%O)@;nX4E_ICAI{-q3R`BlRYc{8CNy~;9hp^hIQurjblYsp*#&eHOWV>yGc4YqZ< z=Bl_d{lMq1ZE#9*a-|Z<*mX$cTums-u*+C#?a(K%s*&4Yu>Cum=Dm3Mpvf!2%ck4T zQXkr^TPfCIEXrr)I$hZi3Zu#T7H6Y z7cj(|2$OxOyg99lS<_RnDVpgl&3+TZfrfcu+@lURpQ(ZvN)7y-p!v+HngMBE!74m8yA`#=;z`{O#@w$uBY17^mj+Gq zh1BT+P}YW?&=x5R0Ps%}U8ZPQT7!~J_Ix)faE5WGd!;<A< ziF0cuMG~ix5Ok&(SqW}~o`fgwd*Sz%n%|DYll2HP*|4>SkhhNoG8VL6RbKK)`c6Ge zb>@I#K6Sn2B!Y`z(zdBA$!HBP>*(Y=RT=R}tZG%dHJL7^?WLCB#$`^sHlC2>sdNBZ zX%C1BF%2TY2qj{%TnLF(`kfYI^im4z%xL=3HG7MvX*Jq$uWjo2UstQ@>VT5>1xF>( 
z%FJu#Ix~CS#jXJC??|HpQx~f_WbxCyMt}J?TwIKZWLPf-*sUDrK=2PV33>3|QC&Jm zRGrN0{6}u0u`n>fZNtH(DV-LOy-X92k*dKvp@N8`6Z_i}gl%C;pnm`-R zTAkobu~wx=d`cMiDXcX11_2vTAOPNA9zbq90*3Zw^|#F)5<;6yAbSBo|FbP1V)nvv zTN;EGj_NMABnFNdp$y1Vs5_Plxi;rjvd8he3I3$Z z*=0)|!rkCcM0R3h_i6HgGB;s4ru4S!8z69OUbASwvn^!MT<-Il0n*HKvrj0qNn>mU z%}5eLf=N#Zp`8>3B_9^(=`En;-%T&SGE(21SmbRo@xSSNms z&w22w&eu;qXZfK*7ftx-GiDs+`LgQ^@xV}Ho zJ@y${8{9F6U>Rx>I-p0?T`vJc}lS4S&mN!j7~8`n@qc zEoT65jhvm=F=~PF;sOT0RL&XXFouu1DJ=R~Dp3h;+vPr2$-YV9L&(OC%*HLGAvZF^ zGtM;WaCaVB<{Il(b+d54yb+~#sqa$v)z~W_p*D+j`vFgVOQ9ZcR4TYtH^jF;8llGT z9|?p>-UFF{>O#+%8ys6(ofO23XM-GbQFA;^oOt?n$puL+r!%Wbc1?|Nl{q&biE9xY zJs-FHX>iYB5^E5^g?ogYeDFO3B1Hlbk&=gv#p_;;1l|pMOwygcC+Z4YG{m%A4LlM{ z-ALf?8X!3k`?(zXt~KdQ3Wd&z+hZe*zg`FjQ@dX_>p!=Q;a`dP=5Q~uKJT7L`kCEQ z^o9_AThG=52Y8DtB+cRJt)zo4uFcNeDa;n`!EL|O0r{oQ=#86raTZ}fY%o?yKl}&i zOY!uhvXMLVZNNfM0V1wlGkt+dkT~YJ)$7ikB$;Y+Cu{wR-iH>p)LF z?KneX0OpDlP`#h4`2pgBV#CPP$N`xFA;|Mpy{zq>VStQy~E-k_8NMd;rR8<08wyIXUb7S8^kKa4Wz{zo(j z);d9L2%R?e2tD60AS0S4fBG;w9;7?Wbx8d$UlU^472kvFX-#3<#lpHSHF;%q1kN%M zBO?RXpds8NlkeSLQ&#)M&xlTtPXMimc!|wrALi#yJ0v5)%0lLO>zTCc4`>BAzHa8U zn&IYm>aKBic~&oF^0b^-f$&tFvT3>EpnPFyZj(zcTr5C)=E^((RgdDWrxbn7BJ}r6|?3EJjbV*@IFnYZY3kIcxAtPBkumR`JC=mZR>B1BLG?w%T{oj}RZP!*dVN(fZRqIe)9Ot38GFprQa}MEV7MO~M-o zJ)m(S!8oVkVWQL1^D*OceJy}WBaB zu(W9xw$8T1Pc^yG6k0NT&4!njT3WIJM4o4woh+dwU<&HBIeL z%NHt)4O`nDpUVCC>|3hnrxqsQlf?8w_|D+6Zb%RL41fXQlM7Lu^%c^v@rB&$^3{>v zDtiXN%xwOw7{aAc{spyU{dzGm|A+t1B&K~NjsFejD!x>jP;~kM_s-aaA z8kfIg2j9%_X4%%LA6#3nGhq`!lz-iF(sA-w=OS+Jx8qy#@sd5}KAZgW>-3Sh{2sn^ zoIl$~Y_-+Za{}XSd=D6_mQgPmUW@yOTk9VEgm4;ErX||zw2W4PPdg>CA4EC{?X$fc zYd_g0wky2ys=rY28RJu-8zZ^|k2Y@%(|y9XS;3+Ec*}AP_lCt}x|$ZtsG=xKJjFw! 
z<+s;lcI-O=lcpOm{$<7WTqDr~pTyxs`__1dkzD^Ir(O5q>7Il+8l@a`EJ=e9q&g~b z|2Uj}rNlc?kFCS#I`hH3E1L$YEJ6-22+<$ zb|gj#vliQaUifmU*F3WdrC>ZLE7|GsRMpnE9_bOAUQSIrB@UjT+b6RU?^b{qGH+-G z7*Nb-sO$k/(Q$Jff*IbJuFlQEzVZ>WIJ)I7Ob73=o-)SB=lAiDH?c;OjygkDbJ zIc6Heh@0(6iXuB&E5e|x#FvF_R1ja;*iPF-P~2+9OVCRbZhyk{!9E;yA|5|J{h@G; z>!;LqD~Y*F7qmF(;?k$Wt*`X}aamtJb>bFanGAoCBcxn*om|#z!K-0H$k&LD)UIbq zO7#|LFT=2C&q5CIkw)L>5>tDHJGg0}WXVZj4UJNi%XOO6X`fm3&CM|xO{O2zv{PB> zSStbJtPRUnj5CD4?1{7M>B@E()Fky{rAFmXXM;aPKY`+Ppl7VtGT-H)-mV#NdVy)z zOlEmf#&?gxr)&;It_$z$TU(3B#wUAZr49A~s_|-1xL*Fc3;lmt3;i!_ZTycaKV^bR z&K0l;=>99zg7AqkEUGi*emQ~V*?r!tCi-EIrzq*-3q+QWre0;j-umt^X%v)=Etqtq z(=04M+r!WhGrGJOX-sRKoC;{$+S(ZMiDssvqD(i1`EwCIc-wBS<Hw$i~LD`&?W168}FGFY|+$oT(st%2B4^~=Bxh8zWCc`nv8%ivloV& zF5>074?m-PE`}7^=4r`P51Y<#N8JnY8;k0gca5?ZC*I5_-O!CZjLr{V8sr}p`{r`J zS7u07@xC*?%pS8@OFG;GV;Fctt!WGW+=5C#+I#0x^ffhM;AG~v!wrtDmI=i^|sqi zpl*xkWPlPvG8xC-cI?;geULdZHWn*tJ{_;36&IAZ!SFmTP=jB7Q!N+mN3k0k21qVu zoWN@n*CZ#obqR7&C8~4QaD7)7*sTWDqui>Hi+zQlL^Fz+9XvH0>fpi$NEpKaGK+os zV4$?_w&)Rx7MgB zA0+JHN)3rlK#RIuLISz<(1aXuYn+%kf9z^_oKHRj4;;E(S2y6Jg4tb^1=nXAk2s{m4G1&sJmD(rTub;Xz z(K9fh^)pNTzTQXM;e&pw*eO)AuD@A$}R?u9`%hNTneW$iG;zppi^?* z@!lYc)j;(@r!8@Ajd%Dhlx?m<&A`O!=;e#m)lD~PSv243)0Bh2?M+Vm`p2*PTcOmm z{bi1nozT-HjVE4`% z7(w*VHyX;S)8u>^&smG1q|Ed^Ol?Z*%%tBdwb)MrL{!c>-FW-xP-Zi@w<$+=sV_av zub7i# z^>x(l`)>A5a2U|e07bxc0h$A5idueHDD5ZSq_j6H#>u)2Lsd8O>gxP#I5J;}6gu1R zow|m029!ROeCKd;ApsYON*l)NwbxC;HY9SS{NMEpSlSJDW^+ zy;3uKu8D}mHlC_RfhC$9+|IW5bElLh=5eRW#F>CHK(-{KW_x zGqoprZ~dB=;K;)0!{^_YVxLCxzI#ho)cEXskUP`m%QWRgGR&`1|mK{|M^hr4w4$sc$sU5oiZoV-y9-|E+|3o=sV@Yll5Ct z1w+RD+hpntOy7K0F$O)$ju3Z(-KBSK2+~pzO zowG4}S5Nax0O3BRpH#m9#oUG3*6q`e%)y&9`73iwVoh33^Zp(>uhwL*kEE})**`rc z$rD?({B@UIL__|ZEpIdTf_*!TV+P*e4s;W5;zi~n+ud>wQQGH*lLW6Q^A03Yo0e7`tx9aA+N_5>6fQblMy9@_dKk-_ zDVwzdhjf5xW`XvQoP?tr+G*JZS~y0-jVl6M?=-}oSSa4$cz}B4$yZ&9bB#Wjy?z{{ z2z{FUnI^Kyxn`Cl+$izZY@bk^N9$4XPG3J2jqyVB&3zx@y;6jD1!35{&%AQoTBK>T zG~mOfC+Cu`vC7%0b4%5zykGtF#QMx}nxY1F4y9A5_0(~WQOqlHo7urzRQ_u2-mq{~ 
zDc?sbRlBL(47%6$g2|%qIC~LK2-ImBrS;l*&FJ&q zk;|w0cPk0;nz##j#2X8eV7Tl4GyHxs{&P>4aR$TK#ujeW^OHd@0YC-ck5XhmlQJCJ zUmZ&tKR)T()9o6pEHKjKfXxgGplB?wA%ujoB)Lzlr3IJe+VP+GqEK-0l)OzVpWCVx z_*peF+_$C%))l}5}p-(dko^YmbAiw4JgU6=o#g7LY-%iWa2LU+ms_Cor{=_ zuLL^N9C(hEcyE)lQRv6!Q>&KHX9(7N_;oClB;oJqH+KfU1Ete0oSw6sz4N8&;q>it z0q%$kMOo);8td-s3O4KF@&orEVRJN17T9Qk)`pD2`=k3yUwbv`-esmd6JSzewD^Kx zK;c7m@DCfprR`NB)iIhDA{`s;oT~BHA!$^o_R{X1I!|R0{8m07-o!_d(3v6DOjyb$ zKuvOsiDkcgpGYJ>%z;G)wti-`3)dm$6aJt~}%9}asTrv3?5ztISN6AQrDW4<;&zn|F9eZ>@9RjT+@20$s zS)(1gcS40}P)=%f+@F5iZk!=AD%r5($P_^xEEsn_Q3iQM#+{XL0>}Thr}-%|CV(HV zyCB!o&%K%nWSOyoE%^N42Po|!X>qo}z0I%n9h_VCP6+0|M7?%EBZMO?qW zVJ_41`Z4LETv+MW{nXJVQ_kD87u)(S+@dhh;QSuxeut*j3WD{@tH$yQbxOhWTQM9i zb`jdoi`Lji=J^Ef+L0lULbI>vy`%l~ zWJTMD*Rg5+H+SZNGNJq~lougS%qL7rf*%D)d?86~zX% z=b;meW8+-bX|CC;H8pHu1Cx`T<((xPix-}O#5xmN*R*67pd5hRUYf!shbjvbxpfEk z*3)z@STP@H3_p%8yYCT3Ge^+S-?WJO+u} zBbLwWC}WwZ-M$#LnKe`Oz0_B8{F%x9b2v(2l0-S%lt!lQxnN1UPR#cLll&|Zx5n&V z#01m#1S%DyiG{dzSTKeW0l3-SMxkm$OHC9FSBo;mq4T=cNG|&g3jESGHgh9wUO`ER z(<}PU#9rejNj`pc(yh+h2E`S(me;glJHc(Z_7}0h)W?Q^vN@_x@*wTIQExs#?}jA7 zw&iw;U;ZnpFP4Av?|;ij94$$~EZWrq@`T=G&(=|ZD^5WK*w-hE0loD+eC=^~66Vm# zUk&p6Ca`)mYx%zi)1r;gBF71s#nAvp0IZmjja+4rS^M)2*S}`&{*RYWar4jkL3g=+ z=@wN8C~|$LnL>_TB&z~#U&VmKhOjPSAQ+(BMn@h9IGNlLAQ-4QE62{|UHr(~YPFVs zOP6xbn1;ubKgc>!4TQLe1xNFV5WzTlXb6PwrG@3kS=~_bd6A7FuVILn;N#e7TTYrl z|0}uVIAf-w-kYKfAvEVu=34>hNY{ZV3r05!cLK=qdCghY4BGbaw3itl!AAEco>O_t z?64K$p*W$9%&WTi-lt?qgrs2|QNgrss>C6@pUsEr{$1X<5MPF~$!OIi!O7d*cMr2y z%td?19OMxUkb{TS#PJmhg*54NPC-LRTI}PJSmf4Aj}Yh<8|#Kx?wwQTj>fV%FNx;e zD+ea7-d~sy{{q|QS?+YDG6$rz_!_ZJQ{koJGETP)8ap#Q2&TeCrH*DLR@VWCz5TaD^Dl1j-#uf=LI@YU zTw5`90|6CYB7Nsd(vWt;g04zznN{V6=!eoKEdpl0!0R2+HESAZs>aZBmWb;d1t3>f z>QIWu5g#*?`LpZ!-p1nH(oKhEUS=(NLAjw0^tz!9)#yOd#XL-mvdCKY!xXF^W!+oo z&g9pg)1A#)_h~7I#1vo?sQtTjMB5S=;wNHgu~1Zt9?Ye@nfY z-iB?TDq^#Rg8Z?C_Bh@1DHi`VXDoGOA7V&gD-_BsSY!5auv5e-ZgS|aT31*bnGY6_ zV#2z90`j!WVxn@^z`|q&x?L2K)^AmMZ4(Kw!?n!MsnfRy(NR#Rbn4xCNH>a!{AE(D 
zDuL*}z=B_n*|agZHBkw!oSXNOk{uB$lVu8y_RqX4vkBHEC`bC|IF3G9SESDFUX{FF1UQSaJjc!sf~)O0w!r? zM$Fu&92FR>HuVCtlKYddBoE0HB<45b3m<-{t)on?`S2Z9J9h4v7mGgYz<1@#CJ;Ru zRF6uh<<2mEeVL{C+VArA=CDcaof1dU>?D0R>6NUA@z{SceZOSm{_L3&V7}uqTEx;^ z2k+}uwWoXX9lZ}r7@e`Y>GY3P+?h<2)Vx{rwLbz=Chxa7mA}3QogGU3zPuo`$yf}> zb)5xz)w8=mB1lgqx3;}Hzkfr6)!`2Luk6fkd>-$9$~SfUmRn#`NP-`W@?{~C^l}dv zOrjeEPd3Tt>ax}28Vf$#M$BH}qGyB7C04`@ODGWz+%I%TguOLVKT*(1IS9yfxgBSC zLMQrLOS4ypw2Pg5U*sKFj%~}=k4SPSC*2%>5L^6Ocuyc9uUyWe<=Y9^AIy{@om0(fv6O9H%(ON%B+* z+d-Q$1z}MNU-_t^YSKC0ZzILhe&>)54e3IoY8G>sH%5vpV@mB8L^)3%)7n9E{Y%xH zD_GeLo?buzXgT^I9P*f3Hw>shKg|a$QZE$y1OX_PsCa>J5Bcw(4lN5fTEb+)dl)I zy+pRp|D51aA6kHt_#A$KNbW6zS*ViXEuJ`K0K6L9M!=Ax^E;75MF2iS{0hO>Da!$R z(N#p$4-ic%@&||qT~(01E~5}Cpc{$&!{us_Bz|TTeREL;@PutknLvU~bW=q5@Y5aa zKkwj|?_fe%lH_IA39VFbfz_sZKPSfR5O;CGwJ)SUTz)V}=5xsS=>=brM6RF_iI#6xdWX!u)Y5 zWdCu2CGID(_&>~l@^85$$iIB?f3leW)F5b#I{GmEH@XF&Vf+~!`ZPlA`1OVt>a63~ z_1PjXe*?;=-cvbJJ$@ot3DYM*U>h38L&1MqnqN)(|It>?qrza-8l9M6WuEH?K=Rps z6R9ysg`Z3-^hy)rdr-zK(=M-AL7DHn@|O7gNE05{6b;!&hN{^^#OK1Av>#VU)|=j} z;G19_0qn3gW^r8b2dE+x*f<8qi`Oh%i3gF#ZKOi~`}uA=KM^OP6X1n|?ixy*T$jv1 zlboR9HVsKBy!1l4U(T3r3a78#Q<|C~m>x-rsEY*$DYMq%=K`zfq2Nb&&)TZn?9Mh# zG46x%TOYNQt49Y+Z{4}G`7*HvC@rz@Q{c3kh9e)MZ+-7tHOco?!?)O0?uzUVK&;$h({juNh4thd?tYbevcO0H7 zM~>~VF8&W)JIkLpvbzEFW%z4j;!I$tj`goSc84uq`o?b1^=G{R{y}BeKRUqx>%#1B zI7@S*UuzqlqYJ;Gk^hyP@uz{uoxa-{-(7Rbsd=#u4C z`0D>39x)LYAaI<$L82s#-68rf9CP714!xVXz)bKAGfLAnxodoO*=NW_HUVq3vm8Bo zB0Yb&b(8Z1Tb$&^0HL_G0rjHEgrM7Ma;UQq1X#a|Ss$o^UZx5$+X)!HzxV#|;WNdv zKm|0bB!D5HZ)ute>n-5B050`;UyV9-hoUE(UFZi@{djlT9!YQbZi}J;NW3i{? 
zr*7dqj&D*p5)b&W?;G8nMksCz;3tUhHfW(8tVJ;;X4*sxxLv$JG{f@-YzfF`k-2u~U60u$Pav@xK)lfdcx53?L?Dh)i)_s#>Q5pG zy`a5b(f2@V8?8(Ygc|qpLSz>2Op-h5zuOUe#eaaDpn-A*VoyjgyjZ+cdloU~W5PE+ zJD)>vR27)jBDXuTbak=`eDI*6A;u1>n)8L`69yo2f0w1jY}^|F0;3v{!=x#)(!vkW z6Qsy@Co&z7VH9kWJC3>zB`0;k45pAhO|UKSR~6;~wZG*iDeml&p<*RLib>C2=5 zGi1|TZ`uDAW%<9e+W+eN|8o_~)mRKT{=ua0Ka|h@t$`a&{MGWw)TQM9^;`%yg5Q1` z!Kd4m-7%8^YX5z6K~{3#^$bZZ1<|J@klDZJQ`<|r_4OwJ8rZ9y2b%#i2SaKhn82@$QPTHALcIj-y;tc%)JlTRm_j2 zUCDmF<*r*`lB-LXR4J8sr>FmPKu61okvt1Dk7Ix!F}dLJ>=>W$=)-h5pUzvxmz8S9 zTyw~Mm{Xnt^tSk^4n45HvK%yx()z8MSJ9K0_kAg^g-F7%5X`r8Z~(1xQV$8RTHAwv zMy{yRgkbPSGyq!AYKGOaxZLU=`Xq~cS)2EhO#IsbE`ZMP9E&Eg0)WF#U>VV|W6{T_ z_<*q-(8Q)%z%c-2D?3AqkeY?$nVcG=A*{*&kOyBM>NAa z;4_Ba>r|8v1gt1?f$tQ#?*;N=8NcaqNDHGQ`-yE`d_L*IVIXQ|7Vf>{M5}ig#EtqI zaZq&Wn%Y+22Jr|7*+?dM7~**V{xPbXew!5zx15@l3{~_BA8%qVDd2B5n#Ej+p?l_6 z?U&r)I@PF&GOxmic0(CvNj!w87i!@Bl0YlL?+p)Uzad`79awO@r+77%KPnW}1b&0~ z34i=KbiswM-*WIVyK_=)1?}mhsBiq045yz`*V_k2weT{ss@E!gz7!9SH%)&K-D>A0 z&j3CbG;O#(Un+s?`2i{~9hzA|7b4V&4BY;+;cn+Mjq#IbceR|CQ{GtTg3mEcSNfOE zL-`&-&m#Eq;F%p(6q2uPc+U0zr5~81Ub}o4Io4<&{JmPF<=qu}(`0 z-_e+Rx?!aNNJs-{uT9yc<(u8bMP>7Dh5gFuY`q%&16v`AFMhrfbNP-j^#DyaO${?B zV_~{o)wjYoxiqPtm?U@744?`1=E@kz+NoQX?DIka)xZ*QbV)u?YFhfa)MWE>sp(zZ z@aY@Htc0-*!-ZpAtVxWb{*gQ}V@n%N)X3sO2cfsP#h+%qA3yUT!<^>3^7CiyZ~Dip zu^+BAUM+r-6(OSj&uaGHyvJW0ntx9<`{!u=KkLEzYp%g7_tzYR&SOD`|Ha;$$3y+@ z{o^B*Eu@HSQ&B|7z8eyfkjhRfStn#43?o9;389FwW+(f;WKGs=Bl`?xXNHVnmcH+E z?)%*5oX_WT?%(;H`}=tOzTf+I|5129xaN9q*Y$e6p3m3wY1$3p-bO9uZ?$1;P9%)M z^e{oGD!PS0i{Q2I6Rv8OHy@9@F`Q>D32B|MXJilpc|b8c-+1ey_T6G2Yz}wP{8#Zw zSk3Dz-g#Lh6%o}JXHY~Aw+N){%{z1PKULX{9)A1s|9lqab^KSp0zS1~4f}y=%=g^( zO{A=~tNMC^%=%_wKiPvvv$LBULk;Wvwm-y8()TEdNJx(vm_*G;`@^v zr>nk7S6i+Ho&XFZ84CX=&-hJU?f>A7NW4ZqK4+nS?CBZ#}}apB45}FoU!PX&5fp@E=)H*;$yuPnaa{f~BR7GRIlHH`C-tMA@)b_(UZ`Gqp~C51ZZ;1nM_be) zeQKqge9oOQ^mLbe;V{D($3;IS`NUY~hDMJU*$%(g!B0CX!!vg;V9&i=U0h~PWT?Y3 zdgJAEN!N|wq-n0FwV<^G=s@f7)mkx<2|RF;ds1#3+cdV7(*By8Rk+DOs{pw4 zn3;C=a-wQ`V{LI1!VBAC9ZZwNCINq(ij;Z8RI8*+x9X 
zKzLwoUED(`%7=)~748p7p80*}vYdUkX-bC&^gvTR8Y1l;%DFK3H2We z^9;n(8lm@@W&)k}3|>bE*fCagr5tyO($aZc{WzF*#Izn@zgjMlz_E$2U!d;P`B26B zucq1wXw8}oHeubEil)PuZ+6!@Ro!2Vm`ajZC4usd<TX zO9^fEYCYLD;Vr=mNJg&nh|3n9-@cQEe9KjrK2t1zLjV)PkNZ|TBsFyppC;fOXjtl)9ks{&<*#_ef9E#ee>MOV{$tO#&_ANV z-Cq^Y1Gcx<9e=@$|95=-PjKgRcfUM{3AwHXS0!Bt$2BjU3*sL#7FW5tq`V)+LFKVb zCfyYKsk}@}8MARW?FOF{vs`r8n#h91Ma#7L8VpRWZDg+Qma(+OFsiK+BKYS+MY@hO})&z%{?JR)Dnl zh3J6Ehz4Y#VZ~jI{vw<{;KJ@F`a}w`(WHkC#Ouu9OZUk?+f^ixy)15YsjJ^aH=B{nzI{Elql|hR_8E;&v zcNtYMruF1KRneo{Yf<;_N+N&sI(tISHal>3qQa@d_&Y;vUV&2BeLXMn_zEqur6bvy zl_)g);tDa4#7~y1Y|>Auo1`;+UpH^-Enp?S^E}}KS@bjFnH&m`9eP1!TC2Q1u6AKV z((<971M|6ewAauc0grFI$XGU=A;{fzsx=`bR|uY)#FrRpadl)1cNFjigBW)!?lr#& z;A5Xm5dv=g%kjVAyC(fL2KqA-_m603|MIcFVY_;l{!O)Y?L{e&!qVa`sZ6FopO-$y zjdWHF@w4$L3bI&(*!7wR=n555XaFcHFUVw_F#IhZ`Ougogy<_F1B-2iv^D_pazQ5? z|K(X`M+qMbZvyo21KN&$I0r(g8ia-YKE&&SKL#$Wk#G^fvjlSD|XE(AI{4arQCP06X>RJ89aapFfn`3+~oNGX<7bF z_WXY~K>G@4(FgoeHOgbuu26@lQj~O+$AxfBBIl#jwy^#4%)_s!+^$mzZ6*z4Bpq?- zw@3WJUbI_`A~`~wURTN+_UJyfAv0KsD|y52m?na?_Q0EQ8NKCQw@pDh=(Q2lyXUhG z9|F+XrV#uLK^mviQ|bZcu zogSX0w15mj-#=z8+CBYrQmQBLeAF9oG~>P)c*(SLm`z)7v^fA1tIMkCR5GLY3zXSV z)~(QQ?_eJ{6b1Cmz1-^r|CDk96TzInKpZ+`p$wo)D=YEVzl-A+XYo{e=B_h>^Zr#3I~=Y|N?d`qj(^mjSb-hU){dEiHf^VhesdNv)Cp znUm{RtBIAaos(-7=f%m^nbSRr5^DL!XoxB0@_58nDGu`K*1|I)GG}2x!7}ii_P8}N zbC)(aDEFR6)Lh&EVR{fR6j_Ztx$5gb)KWOTdZeGdlR+RDu^Sq^G_#83!4jHdlb@9mX4x}6fs6`L zC04pcDdjaWl){xNC}|p1uFaOTq){b%gZgFe?-^Ivh~8rN-8_e}lg$Xy^I5%YvI=zP z7@XT-*V~47_U7Z3Ndc;Y%uiQi);X#7-U1aSBuxym6U+?f!>OfUBp96Bow)ha!70qm znhA)=xyGuBzH5GhXk93jIkrS@a-`SIjq#eh=9#dYcdv7(FM>{jIK=!C3q|5~mChLb zX}KPF{6Ac+-|@QsFX3vPghV#;Wx__g%!v`2W8NE*31a)ox3&dV%D*f|?dn4bJVZJy z!!&dujGsqJvhTWuAMI{TLAhskuo}P@itwinp?=;5l-$0g%uwVu1|B2_l3Yd_tAc(%@n`?l+ zTRu>{Z3S{AfO6j1f^aZAx~wEY*_ZzlfjnC{lQkXSd(5|0h&?TQ%*Yf4?dKx~ zxWuHy<9zSP(67Inpa~uM?0u@feJ8cEv6q@MXmfvr4zLdX?R5YME`NVPZEhN=gO!1J z_3tkOVAc6|7c`I!Aja(guA+Z;5s<&To=HCe#Bi_ecXtW$o9k)%72xUpGxM{C^bAk; zV#?cf1uF4Z8VJbK29U4K%DG*x`#d4q>SkvJ9VR|1ZalmkNYbQ^g3q-rqu`QXgSZ_N 
z0taoe%rkmE>wYY?uMx!xn=`#FR<>8+!uoj)YqhA4tebnb7ZN1GxEUUQ-aDg9w>+ff z?-m-w2v;NeUDCxMkHZie=_k(xLhgj0N9@#C_u^%D)5zSEOt@iWvp(sLvrpWd3tSRI z3Xm~;d}R|=c-IMnc^7oimy$=lvNHYSB1IHY{9sc@#bDtd5@~E`iJb&7p#PYUeT`p` zk{NVy!{0nraWHTBW8e7%IhNRv7UN)nS(6pAv{0ACks>0pjmQCP`e$_}ihNj|&Teyy zVs?o3Hm_`xC?C|WoZM=}UV>b^pF`9mX%X+7hf9)l4U5=Dhk?fK5Da)~fa@cfQzp}LM{a504={2Nj3ZgF0{?Ih>FZEH$=+*I9JEZ>sE ztq_8%lgK<+VF`&X;MLjCZ8$l9G!;jmK-U?=FC7zrR>=`2->M#)+6QX1xpx0Uto^$M z`~Q-~`nO>B-~RkB5Nj_gEaOC%z>(nU{8)se+u8M?V?q&(pzhy zFvBuO`}d4Ybks0^*4n#kZDVP6sb#RsWVK7!$x2udN2j`m=DcgsVV~-Vd3oxc;G?3G z((h5upyVj$Tb063AARxXP-I^R0qv`jLH$%VVx8Z~Cbk9~B5w0lq9HyaVqx=uFr?&a zCopJ3u&jMLstukqX z9ErSq>?n-jq3kM2)ui{1-C)+f z-G5N~jQ{4%F4I&83NrtdV$SdW_Wws^sA6ZOqz_12*)7VroXPXDHCJ&#ww~zq0Rzy{ z$ca?Zl{TKB1;jBGR@gIp{C;vxq3hJOW(d!^RK-vJTaOU5RxeR?%c)~ZFrCGSu5z|t zpkc)yV`n-z4+e&Kz&LJavCWr(4qEV#wwktH%ay^D%{~=({s<#{STifJ;6+#f^ijB`7@MKtvQa4k5?tkY(w%m;ihP zlaIi716YfI(PlomL@~v4BT94`L3C!F=A_1U8URG-bud*shsqsF0J6-S=VNWS#)w~_ z_yWRxstIp@!+tl%F68%v{mqOP{%u0z0at&4L|N=+kFZgP;NOo26vZ+rS%}qHnH{ot z7N8O3tor5`C@Df}JJ4MVj{JiG5uJB~M%sS5a{E#_fkw++S^h7GFP-mv)#|fbb$&k_ z1XBQUv^kPz2BIXU0>mLV(hl)(B*hShwE_Lt@>pB z3&i4c>6i!gx~>skfseId=uXrO&thG-LmZfetUa>Q1b;HLzHBj^NWOS^T93;>{|UbVAbExVQ6c=d$KVyeK*z^{#t8F|0G_)I{FJw6I=;W#`@6{VKQ;|O2lF1> z*(MlFOV-5_vq>h!r*5EcS5KSG+T4cMpB5%9C?C^#{-Nbbt0JYSETU_8x@obuQ-Cxl zeKUNim%HSeRO$Cb<_yGEW(t1Bm}^h>qP(4_4q&~YDDkO0j6g^&*(xA*nkUsD&p3P3 z9*Wm6wE8i)Vd)Lp172j5Kqm{bTZr9S=DYzY8nQFKHf0e^6t+(*+c*N0Ca$1t-BfnJ z^g6$A^q24v%A^%cJ%R#qXIpl-rKRu&wOCAxiQUAK=7q{}v1x|O8VB#BJ8kRf1V&q3 zhoTo+gA8FA# z>p76x-c;wWFnw(<{r36$?d2t|ANSl{4wD1#=a2z4&j!4wDG}=A_ndldw3IsG?K(MB zQ4;XvStR|mfe+lI&h&tH$YszkI+Vi4 zaNC$`h{b%H)0f_FCXjS?x6sWoVkI`TR9f~3>M>_l(E7R5d@_4g*Y32HrZ4@NP)r|F zgmnj3r?}2}x=_PQABuC=t#nT;%fdk^{MJNddK=h?_%0qFsJz~glfH~}Qc zukXLm(Q>K7kSJW&CF)R78{oKg9-Le!i{aX%n`u-=+eN6YgR4v4SZ5zY zq>nNWv_nt)x`Dc)T2gF!i@DB-Cm`=+O_G9xak(DOV)?6Q%(aiSlDVNX1+#P~?(axa z7W-6AEyi2^0-Xe;&ylonAIr~Vp`z-8NlkgBz*(^q1A4FJ*&lydeo_DQ>5?M0^$u}D 
zJ_<}AjBa_QNw8WupLR0(5axy*mL{npyt*nyq@Eb?3J*X0HiI-GTH+wsSk#uD9Uk2M zk*K3%@-Z2Gw*uF%667y)r=m&w`&-m?o#bI0>+Np_rj-_FZ5t*LE{0*;%n`4)%W7uA z09z-0r5~&nw=O66NP#K}X<_{L6Xs5yG!#&4!hp|Oz{WZnbzWUN^Yr6{*rk>_PKHWD z5v@~UDHS3by)AHM5+{)pF!0{Gi1##}sO}Hc-qUM(Yo7Lo1*j_VDxXkspgn&edyv=C z>{E-^I0a>w6uudIlHXn@>7 z-IEQ>dU!lVv`T3NQ1uu1XCdH!>i7QIT=)NpYV|)gM}YJAufy3QqG<)sw!YWbCtV1+ zEFoBV-B^oOFu@-re9U66Ed~%m=Is0h`bN^=u8Sk6HFL--(5+8?oYm+L<%LhbgqiIb(Taf=C(!$Z2a zj4cPw9hL4)uYIi8_-@hhKzR6(y6R+W+>oD2s_1AqKiiroo!k%<=syP*n(faPFq=+p zoruzkQ2sPKUY?ezf)mJwzOv*f(HxgQaZCTr2KNsbZhuC*PSKuXi(hrj4pt#yIX4e* zXl%8siuqzJk9krL^LTQHay)etzZH)-$??O&ce8WJFw|v*3`Fo{#3z$#rXAEPZd)!F z8-~VB53}jGzK|ETOuvqs+{Vu?*plqNO(@*uPfYNw4=tEj`~HVt%g|o()9?)$!*!|r5arO%D%^X0 zi+~*Zg$%(|2KLuV%raKpf3r_5?BmqYlX`Bb_A?+LxbWwvdl22W)Q#9ELMV-v%>Bg(_(R=UnS$&n z&41?kS2F`1IcESAo$?J3ZDYn`1e6j@~ZASFPZLC5wr>Q?P)p zy>I5dmAl4Yx*|aDpJps4KHLYW1p>K1>}o>;R5Yhvq5^TuOT>L@7wP(BYrn))&@|Vll}$C9cXjD$74JP`03(s#{GI7I zRs=BHs@2H=}lsE&aBKa);jzFosp zq;?di+C>|{BI=%V!n(X1)Um>H_L@<<@iD>Tu%Z1PZ3F3PWC)^?H6)0Rn7Y^H51)>! 
zD;wZe*r{#iWC-EAJPcCr0z z>Gxsm-LawK^t_}OhrtS;H$jT+r4r$7mGZ&VGvph!c;E1DO5R*+I^}(BxrmGbW0;f8 z_t{Z7DFgMsRXUC>FQY!5`Aqoyj%_2n)20&kGSXHt&2D%O_5ATyE`B;JY_EB6caRn^=3Qc&2HQkzo@3Zh0K4 z$|e-z$YTj6-db#^9p8l2On4n0_hrjclzbmj0vOIqv0jLI|CK02y1r_2yk?`tW2}q+ z5_&%%&E0X&8T~znjrMh`S-ZkkeXTOcuiQ2}<(gZ7Gp9-uMq(3>SqNsZ;ntBGZ$F+A0d z;F+;@J%0557Tu!g79(-sP8|eycR1qQTK_ar-Z<6J&hLpahz; z4*d6b#iu{7b^q$I|JI%B9HtK@uKiS%+A$gze^U9eI&!7HA-zL~-I+bl1hUq5jr1_Kw5T$FJt6j$=GvnM^?Pd7V)tIi<|0 z5o>T{pW{$?xp~k{+I&{9cXgkpXG8F=4#qh&saSdRN-G~ z2^f4#T=>Kd?m{xI1CCf%(pXBf*V(wr#2n6_Qaac2B&@1ooJhjMxaYx&mlP7fXl@aNEu zfuW|c!u3tkdCKd+u6nEEKcNI%eb ziWYKR2pwme&dm-MT{{XaOSE;fvvuJ}Pj7~@%i%8ET2jBx@!&a#FANS{vXsW{P#Ld@ zr^8zySfP*(!PxeIjjKU+{JEOn0ie&DkOVIWHp=6;Qc7KC8#DRA$d<*1H4$3eTly9Y zZcgoP^YuGTkbJ#B7%+MBy-)Vi)3OqO7mhRxifxU09k-oQwkChoFK^xC>)iCLwnwG* z`gy};Ehp1v4`6A$bF6viA+d=Pf$#5VV?k7^07ZZ~xfLVo7|CEk;0x=npC!xY=0?4( zDzpD(cqqvb)@FsX*|Ns<>~$(>Op>)cvmTl`5#Dw9jmf+!`vuy(;#*glI^^-)lv}l_ z8^RNg!rR^?24Ds`sb^w676Wrk9Jk4@ssjuUH$ z%@vOahL7#eHEokB<_ z@yTxz*GgBjk+d+ZI~t{#4+#A7*7^nk9liYMvSy6#Jw z3QCbUWe}4=(l9@y7*voYTj1)HR#JC?=m_Q zJ={hkhQP=X=kT6b!J^AyZs?AqX9Lo&T(qw+A{y<5>Tsb6k~SK~-E{>aQNqPQf%iLA z7=Cjs2ym>x(?##w8`+|~D%Wq4wtF8L&V_WQ%tzz1NpyL29C6MF~zl-OVc=vdb za1JM;9oQ)O{&o!^7T!lkt&D%^HTo}q{ckQ3iU)6TVq-N4rNlP*PhggL zT_@Z|ryB=4ZXarYVZJZ@>zez={hl>d$BOY*uo!52H15&l~sUPtIsL4L_qJ8eoXYT;ra(Li#?Et--K6Yoo zPtlb|HMp!qRl-TMCI0>QFF=NRXLR$YwH(@TYzx?(Ya$nRcqI9hL3d5LJlN$SO}SNI(S||%Wwx!oIIf;{9P14QK4|QS8utSE2BoA`J}{zZ z{sPW-YC%te5+P20LM@FDxww0IH#bAQ zd$u*>lvv{x`g z;nT@_@YJwjZJkYYoqJcYr{~h(1HCJXkE%Wzq0|}3gYFAVKNonM!PT6-Ow4pW{x%V^ zoh7~Q*OWel625Jh5ZGvYdc3-z;XLXJvkZ{y?1y=FKLoy65?+pwHE?NNYS(c4aZLqQ zPC+htlSsFG$?;9skoZbVcu120*X!pXAr)R2coSe-&}*SL$4YfdU_9n^Nk3I@Tl;oZ zI$|%6Z>4`Qx2Gyaw?(|fqSLt>xK#S z&8WPGDeq60w}5MNZ98K?vR8pw-$h$88G zF0}=ltx!e&{0mpTNWn*!nxR&dA{EIG#H#xv)#f(X?FucU9y(Z4fh3*$U>w2+WwAboI z)Kn9tviaii6m;!0U9_wr{nWsS6|Bi{l-otp+VYf%y2oebkG%f8F^7=O1~*wZK!PtU zMI*@-KybWkYZK0WhUdbIk156Mr5?0}D)qixq`W0d_zyd)fLS91Ec0HMCtoy^RB@G# 
zU4Tnvco_JVWX~mN%9CC2DIL_~M0*v4pZDd1@bhBd^-5)rf4KF-?t%C5807o2L+%;U z-Op2OYK2~Gd5ft~BD(YeJ>Bk)(LMTkD*lph697X~sfOdOw7in%LFl1G&4O+KKIjlK z^5fO@Pm3Oeh-EE_hXx4Kuq^5fX+MX4OQxyb}LU~>bj^h zm?ce7$o!&XIuj+Y)#^ILYxjhjlYA!fd6^#8w~NAx0f)_4qf@_^PQ$hy)mFY7boW$= zU|=_jGe@fU9}6(maVavNx3-$Gocp9YSjeOz`T0Cn@aaj=@>75GS$r`jE)j3rDaWmr zRn(VlGqcXN;KUu)RFFJRkPVEBnpI>RPEyr?SbUwIrjNiZ^t^5ME7Q-OBSN|}&P>>b zZi(`@?Ap>D|Ze&T!B&PM$HA1+nU=DZWji#;( zT@^TIRsZQw+uv3ZTLEXnD`QzhgT$OrOhFeBc2N20(wKurNTk<WjliO3@BOayT~{FDhh&>q<-tqJ&4yjC~QOR`(`wZCEhBXNt-?Bh(x1Fv3%LA!yU zw|vy1$JJC|YW)}{@}2M_YrPKWncEIkF*0wR&Z)3JsTUcLV&M9;(F2RMCG_JSA_MJr za$rSnr{=80N`l0qbr72=@voS^4;k*Tr+11#UM|Y|9dz%!tV03y4qHg%v@xNMr9~~P zK(1)daw_^(e12;RUMYj*4=EQpluKVGoFk3T#u1NT98{=XW7W3l8%yh)n zAeyWV#2K}#umFrsn2ino#8|JR^^*`LuRU?`JlmI1T)RvSn-CS#fi`Zcqy=0XAMr`~ ze6uXpjeh+#Z0B};+;CT;{UF3OUp1XC!rOY9mfZ?Pu=H9mgZ_ zDB-e@8$dr0i|7((hxOSdhfa!NN9sC9$eQLSUoOg4ddD6wK4YkKm`_{hhI7g$pf78U z=Oiq27)G)b;&nbA_vB!?WJ&<^zW=jDqug_(7LliC?qZhav-+g-e zRpuiQ9f)Si6cyPWXq^Y0olb{1!WRGwa}6SZ_^PVh2a1PV!+)Y-kyYsyk>sCvL+aT& z2w6Yy>I#rUsq#>dWc+p!!$7l3YpnY(kQ3xP8L`L{XftqFPUR29=}`S)`9Q!lgQ8wR z68%{}fe+WPD6)_%f|z=Ae>joqFHiaDxVH>c17ZOJBR8ZPwM&KIXxFI@g9zMDa0p~? 
ztQ$f!m<42c0qfY2AbJ4%H)ID$fyw~h+=U*^fTQ-lw(aETpoGXj-AJonDL|z(1^){a z3^v_U12|q0nPfI#^b7!7@90v(y^Tga$N;qL{6!J$gEiv(@eMhK4AX7m@!mo|@s| z>cYX-9Fp!YkOWX82dS0+fP+bj)G?b%#AY34h9cD7h=_I>%2LeOyW2A2YTs19ys4e| z=rFTvA>hX^o{W*;H_qwP004uV;Q97n9= zM%FHf56?n3C#G{a+n z+|cd)XjR5}Rk`7A2nvQRJ$do+Pgsp+p9UaPgSg4AcyOm7S2r$YDWNx) z*a(?hoxj>MsND5Bj^pbvgPLRmi&P*Hq90LG+9wd=zK+ax%#Qkb>PB-!x6P^j7E3|~ zP9n}_bas_nE!NN0Yf$|1nJ#QYfEEAtQk5iQk(Akc5Zyhe;F>*yeAk6q)2$OOi>mSm0Jx$Cp4TaW6&u~V zqx2y+;=0qS&Z&Y5@U7a6T!5Z*s_Vg;cS&F{E1Ny`rr9a^orZzd6!?Hjk&gT=-+0?kDvg% zB$#Xh6tyRQf%?W^mej6(8;W>qHj*f2Lr#|uYumh214QN<>%z4A_X!4nqL&cme}SIZ z11n$db#xIr>Gzta|E0NS0Tr6xNtuc;`%={p5c>n;M+kE0R@+9xFVIE9%*JHl?B+TG zirCn8td)j{?kV~rAv33@`Lr~&agBOQo7E=$P5dhJMU0iKA3MWBt3k?6lEIhnCQ(lklFq1P9nxOe4R zK+?6dsvV5`Q3t}E2@JdmzK;RL{pCMZ^LZyjaVzRLbc~7)HdbjcbCRqHY!6OW=a=2( z-!$2FoD+FVM^qp7UvtcU&ASj1#J37zS(;cHc{V-h?Q$c~WK(8W?}ydEOs;!N+=wnI z2jp|?BUJ#-fOG5Czdd#zEwh$lR{g5;WMTeXzk^qNVtU5YJGo_um&+coa`o(_q!K#t zBI+fT8z^I?QRuX*>*G4eP1ZcceP#A<8=qayi~ly<-@X#8(5;IB(6|h4TwABnqL|#L zMO@|d@Y8F9U2*ewx(A|vNOLZKCU1Wp=4vmijJg(#k#z$aRVG{6g*u~4C1*N}KJhjC zpd1t7&T)M7uWKn^+E06Ch)A4+pTgzGHVXh_v#gKITbZv)b*iDB*CmJzNZrgcZXWSH zTm9M-t%A-p0wmOqiEQ0fRVWw4(`^>xX1X%2Hlw#?pwR&{Yj@E2sdf2tlLtcvx~amm z0ttSKV0n0)C|dZ#XuyjB7#$`*yp?n9fS|@Tx-|N%yqy0C>i8}u2wGIkaqpH^NBNg} zT0YU?9U!^{&e9^e2%SLm(Qv!`sWI(#92Mhrm?xy1F$GEX6N;@@{e*4FhywKFc*|pe zl*OTU{se*!%KfAdQ)_mAQ*Kwav8e0I*P^#yl0Zc`)@Q~j9vGb_F>;YK%(Ov>^~<)- z){$)-z@K|yXU8)V&Lz^q>b&Zh|CCO8i8VaqJpSNiZ+NptZfB9>m#9KFIZytH>5@2> z?~jzO0~Rdbn`uZ2L{Us@8%teYXOY$V#~YKl+4a<#!G)yrJ;vvBM3lUH0yoB1^S%Jy zlvoI3E2r)UGK?6DKPGmbteU^U#Gmm||Dx2wAk)VL+=u6{W>0LZi@j(~C4ju3M!4M5i3Pq_I2k-Cm@b<)re_`!RgoCVEkVAC|CU(>7LqZ9h7**w;6A z-SnJYipHgn6w($#)!4?>xcYF^Ksj9%(M#9d99@6RFWwUD zOF_KVea@g<88C^8R^wlND2G!;zm$!ZYa9_TO&+o|PQBHCGZWM_5y}$!+S8hkeKUBO zY>O-J00Jy$n5E5V;@z(TW|ImJbxe`=a#FW)l|J^)10KFh)i&RBFlCYMd2?m4o&kkU zb%dj&+B`(Ttd&*nh}>!$Y{;M+vOB!Zdn)u{la}~{19hOX|8Lea0wSMF%8FJyPn7`b zI2wH2V(c`Sg^HvV@>QT{f4cS9&<|B+Gh%Uv@G|C^1Ki5;Lx9%RMTe|mPaAvFj<%Za 
z@dZZj@^BPyi4l4DIMjV+ood#XsM~yrDLC-CxjS_%bh`CNU47L0ydoyvK3t0>;Ub_> z^;%<_zMQ36@ygc;PA}cRN*evGtg?61><6dc^8esTq-Th}r<4@PW`GUl40QGAu#+(L z3zX1D5=TrC+KvR#U-uX7A-sVsvuXpvMHb6QmWYBr9;sW&qm`BKPv1C0uT}%x&oLaT z7tEtZPqghxP3d)eWx_fA{+>)N0lnChXZLhTK$YDT#prD0kyxp{n`qEyMY6j3Y|%cU znYK5B33m`C7R2T_FG+lXEiSz=(?y@^{WCE+(YR63o#X3WkNSs7G_@@}y*4}m32eb6 z(E!g@xi!87Vr8bsLL=`r;+J$&c& z?qZE&!Vfn=29D#8UYSUu6_4|KTBEk?@bz#eqH+oJTvfkIxs^I@O9n2b=NhC=9%;Ti0rF79N9+0$7h)|xMA1oy?& z^f>s2=r{6Xyk_S-?L@PD?mF6_vHVq8Yxf6p%busnexN>O9WB~~-e9Fv)wJ*Ttlfxo za;4HG+08&$3_J;RaB!MI4gO@7=z3%PrQutE7{vC`BB-lUgG=0W`Ee81=iX8aq9;DF z0|Fq!P15O1bhnp zsj7xr<4^WV%d(G)mfcc0O@ASTUL}M+U&J${IqAi1iJOC|heeP>-JW;9Ku@cFfjm6X zqcwXE5W9)7($qc&>hen204qKQN~S}5_mEA95P(M-JI_`Rr~!E?yZGKpLy14q?3%(X zxrgiUy}Kp*&B*4q-%95JioR7fsK1f}XFN=n0Ebr~ibnPWUYu;p@9phpC(bCn7`Z)AZw^3ODTP?Lf~*7*WR<5t2CNS^Cp`-SHVyh#Ci1D2t_=23 zSv@{0D7VYoRDF-#$KMCN;y=qZzw{{4cJB^ohT!gizfmySYJ8JY55R|9nnQV1}< zD(c+^$}Wux15#{w>Izw73CT<};7qZ^vn1tehhB3le-iQ2o}&}dSq`9}3s~Iz$4*iy zR+J1T$6B+7w=Jfls_MeZ&#W(ClETRAp6O4vSBy0G^&+hiw{5Jbs{s({ftt1%+jc2u z)*RwFW*cn&t4a>91$wiVk-kN@w=cL)?F&**#wrs6h=U|UgG9LHcyGsW#4}$+2P-D6 zP1>mSVV0W&sH1Szm`*@|KFFEG*%@?pgv|G$H-D8|4wY@DeLcC+yU4ODXG8>a0&1#) zKbw)pt^%oUIxgJam^t*fQ04*H!PI124@&N{!Ha%V^hOrHFZDzJ zoc5*Zb=zk}Na5qIQ}Ql!G4BZ+XMAdck2<$r2flMvAyS9hrTUv)F6`2)O6o!VPV6@h zyU6&St`il!bKYa`&AI#ONo|U{FjM=7(T{H>{v$ze}#0n;NWg<3RCmB z*wc?a+!Itc0v1buW^fS4%K8@EOM0<)zi(13MhD-tobrd_gEEJona-b5WXz3tb}1W%oL5O6XT z!uPl9U?aLzW#Q)7a&~L$&kn8euRew=zq=I21$w4a-DAcU!emQy&TofG=2Hj3{H3SH ztm$P9*>2c3zV5sB;<4TArl0)<4?_#~`Bc|)YHj9hIeWBM?>IbHvE8_FtMY=^qjhb~ ztDj|o<(2+{#HMjp*$5|PSLfnA0`q<}8X)1A+BQ4+FD)=Chm?H@bnV98Bv_FlQv z!nA}AP99!LBhK_tk2ew%ao@e(RE8+aa!Ocr&MKI6KS|zodVkZ+M!qYkIA*T7`w;;deMB>;zSB;TLVjtdr2E&XD&rcn2 zJ_EJb`VjN+aNa!T&Mm1Gx{J>hmJ~Ov$(}k=$#Giop0QsvICjxneT!fgXfY1`U|c2u z+Hzy5+dn8#tgA>*H8-y3mLbHuX<~RNTY4nP`Wc zm$G*p3Nn3i+Cd<8YM)M8f0i|OYhP5aIPyaH`uTk1iKRI8frho$tQ=2YXC^$SvpAF| zgb+_)SYOyMy21l+8+Z8G@rExaTr*p%l0vOBpYw~pUl}y%6H+yFw=r5h__{I8oV3*4 
z$TRaaDf^vLapq-n!-`KQ3w7rQw5FDIU#mPTi^#e9xm*G8gBN}Xzf9CDhgIo@Pg4&J zt`~Ja*NsVecRGC9v)u#Q2l(b@FD$Y~1YIXe+_Sfops_YAUR&+D7^vjsl*RO(g-gmI z>yf`#8Myl)eLXIMeD=VmWjsN$jF1evf9y36EBDW!dE5v6&xHai zNMiETmTW+P`~*xBsIMI80F@I-pqx@=Ec6i>)LIX<}$^u&Dv{*a#l(r!m)T5 zNd==(W@C$EvMpDBT*q0`v9&Z8Se!q(?RQmJwe_xsVmWjYvdk6;m~{vOL+iseNXoOl zxP@CpMLf5YK-E|8Xy^h1v`YDUoGj;=eyFs^b4~3p29hzSVn*MjCTz+C_;%S$-`)VW z{A%A&HEt&pAK%)eNaKm^%TEMroH*v$)mg6@GqJ_d+@EhDv~rF;X`Ue4;i7vj-s|({ zES2-mZHOeY&Rc%FTs`uwgQt-`*nR85L3yKasO!N=AgTHfBU;)-V%?%1A_~kzHpPgR zW{Wx8$rw;Rb?Ex}24unp%IeJlC9vTsyj||1Kl6$C3Zeq;qPooHbIVOY_YJGdyq(`v{c5g@{!g`RMgB^G z-b?;5F#2bSsq)LjcO*0NCE|@?&M0_5uK7a=@T0mnbJmIPPhWT?^Af3Evz!91Y3nwO zD*g^XN8~J-o4LN!^-wiQCYs~Rck9QqPm_Y4s&G7!ZeItf)s(9dQ6vqx98m(g0xT!f z@)41Sl49FsZ)R^d2uOA`#2?aa7p^@#&$?NM&&Ot$v~j8_+nAewh6G~TB78axYJ1P$ z_gWhCR#;iSC-vOl_AB)~wFg3{c6N?ui|3PxyhW8le2ulOH)Td{uBo{%`m~>_$Pf1oY z0`K38p{7U&ku-_E4v{bYhQ0<~-{Yu1FHGm2Ai6#+OpA4!)@jGX0i~18c+6*6Yg5a( z&K?Y;TDG{hvFZKWZ_&D<9$g904W<7q*UZ9#4>|V@kA9~70;$X5*~Tmh&8CtP*5@R|?CajxUp2~k&LWljh#KBt^K#*o z3^LS)^=CQVs>JgngVeH*JPovc;U2x$K*QId;pwIZX;E`EM`>SFM)flUNT%lL^464e zms7iQJP0214p(s=uY|3-*KW3RaZ-kb1woe`xr5q(hyg@2k!3R5?oJ5K;kvdZRtE_UV+!OcBDxiLHtcu2O=(v`|U9RhQk5@NlSJ_x^^lFKL#0wps-=kAd#~%cwmAWsX z7B$WK{el47;DIA@(zBztj3;;0-_f6LD|?eKGV+(#4U(28WeVt_L_H*?CLhzHC`n0W z2nhM@9^^UYTDCcUB?h5m|H&XwfqeEC(;bW`c*>A3C0DEi(-u=jB6!{pk7V%R+4T>RpaW+-v9(9IG_V&9^KUU~14 z99c!|(>^i3Noaj_HWcyc)w)eqGEjG<6We&@`6(X8fe#_xRZl555-GOwaC)RXTldoF zuxZ^HBi&le`YhPsAjKUkeHI?AoyAcdX)u90{hc#ljhp@vpEeiFeMRIcExhbB(!PaS zhuZZ@DX~F zAg9^L+qsd?`L6fm>l4&4g3$y^0h+Z>*zuE_qaVloX0^M4I#w0oT|{Z*PB!-bH7_}{ z{g^vak{lgzIoPCG`jcN;xap0F%`%r}9!E^oDfcj?=D(!UCnBF4a^JltKtgawFN>Q&ZP zE&oeFV!RHFJJZq{8VInmhqK^7QSXn;k!lA;I^Re!i10k&_Ct~-xBuI>B^{yoZ}PC->If2U3DQx*CIT2&Aw;whP-xu+3v4>;Tq>K#~jJ zHUooiMUAQV&ellevXXA!rm+9|3_!1Q-~2sOm-wzOfG;q{&-WOCiOV6=dVaq_Y?hNZ zJl%Pas=`e6=FJbK82Ifn_KvS8c&P_s3xLKU+Z^%40C?QD%`1jTuC8uV{HFZJw; zCSP)EUA|R+r<{gk&AYc7mPho35k%KQPO@VHHTgJ)Z(pbx+M~bpnE(Bb{?)uR5k$^I 
z_qLUc?Lwe+;HBx0&u(i0WNftBIY7{1x&3SzZ!k7fYYM@qSXnD9r!Uv6Xzj#UTo*~5 z9o=v3RcWaJwz1qXO>UC$VA-~m^u;=YZH0rvVqCm@PEz-ar;1T3B2Q&rtwIgWWELP% z$KVdDP+MkN26xPu0#CL6>$C@A+e=UCt#_z28Xx*t6tk>6BdRSZbu&v~&`>5%v@?bb zX-f+jw&*H$IT(>bc~i~STy6e^rKw=TSUeSC+ShmpsZ?E?uY^X2@vH}bo?dIvXlk@t z=QS6xnr)xX&A8U4N7cxGPHwpAM;szX3O*3!H=kUwX6cyLSO(``ReaSs5&ZuB`#zib zc7D$rJQtPUz7i_urlUDCLU76?BpVU@0loUOeW0*Cb$TMbQg33@Il}5X)mWG4hA<$E z35}SCQ_{)O@`-d^XXiz8^4)BCENuLau#kpVhXq%EN|jK7CElG4671LFFACpHYQ#HZ z=erqg)HFEh4WAGtjW+y+aiKP4`=YFSH)AVs!lo1w5eG0PeHdQOuc7&l+SM7S8rvT zSXT2|bg0{cxEMuo&2u%gu4kPc+Pb2ou&Pe4EFLh5JJo+Qff3=)9cn zOqo@NG|6%rvp^=httR3Q-7YVgmTv{ML_7(TH1I9#!1c9I_kHSfAxs0p8HLjuz6`M- zF1ZSu%s@XtRrdm)ywTz|UulDCPv)MBFC1G8@|1U;`ex8^_mg6pT!bubz%g#TX)aF1 z%vewlHxMBQo4ZIf^8IM!T;Qt7(7Zb}?tYR*uL6*SX*+OU>t>SwH9X_WTBc!7osbC@P{)?dAyrVPSA(TtkZ@U_pNW z#_{%?hpPjDd|-PG7-%t70+g%hv0h`iY!621dKo50VjhsEM6>mn6D!^n$+{p}exQULPpox4@ zQR^MIRRx}YwLlk>u9(fGE^+q9{2uwnRWfp=X27t;$oM+nJQLC}`xy`-au_{BGvuz; z-r9zKrJItPS%|O<(%0>IHP)DIxqoLFEk}@R7nG+VhI9psqN!MX9b=9M#!=unS-fmOEDeItsSR(<`y^R9zP3c!F$l`eO!CPru>GdVL@ec^zeDJA5eTd?>3n~xto=AnU8cC2*uA% zAb%}}T|;87&tt410KLw^vqY+AsOrD_UY$Rb;$mc-Ufns;uz7fi&g#h+ZrVD^pW2lz|Y;Nu0E$zufdqAd&>7;9apwK$P1fVE#2B1Eifnc5~%pa?O0UPi=C89CF?ptR4 zf6WI*^ykfJl_1*;A9LQ995{esSlXj_ij=x`Gpw z4+zw(tF|q8#se7Mb7LiM%?s`-jjFbJI1M> z0WXP{#J2=k+dM5vNwf*)37D{WhhHxkgvHUut2~n|{e7MCYoOXR6RWf7TcYb&JxJk#2XHRgHPoTe{s)%= zn8*_5BkyW%?}~)J(n0{eCx*aH;4c6IWkBN3{00H!#-V?+HWqT`ePo8lK;>l=fJ?wg zk;{=Q>lkSY6%-IuPC@?bg2r>hwtm5t$=7DxNPK$OPsr{*9!L{FK%0rU3I3NA!S({X z^=4)6d_ z4`6XQ)cpRjSq_M18V`T~G8^@*g?Rc;v(M$;);A?6t4+-1jOmhr`8^< zdZ#SA&C#_O7@Htt6KJm12?RULqIXqoY!gJ!Hj*&-VcAC*#BdRYA67hr9+NAu?rh*ka(x&mZ6vzzb0Dg z>;R;=rJ-$5&UMq=HYDD7`)s$*1p+Eu67_nI*&F8d$bRd#O96#O0-;uY`!0oj_7~tJ z`*H7|b~jMIasAVM<-(P>k{@2WvIr;YR~%4_HoFl+u-oxK-Ai7{RsmlSSCxGVerwZ5 z>*H#AhU)y+LJ{}!uj5w zdxG;peC-$jPZ{7?Z9awfQ}UT|shO};wH+u5@~r;!uAV%Vb)6yW1`d#xv_h*ADiL5p zZH&4@fn;(Ff8sEwk?P_Tta}EhF`Xo%XTZ`^K`gK}VkA17jz4~0#%@%d zwwhx)WqmPCob?Iz3m~l90>=VsZgmK@FGQ(4(!4T>Ao)Gt;Hx}MhHCY0R 
zA+74DE!cXzP1>ug(MxP!v)_=f|KQ`9w)Mq3L1P7hDDPQd7Z_iSR>MJ9-Po=k*Sw`o z;eRxM(2B6qXWzPts)mFlzm|Jp)Mu4NMNBOt#vjBIQx~QuaiNUYt45Y9Vjr3KyN)Hi zweJfZO4!Tq520H3GwW+%1*TVJ;`&?Su|w*{IYc?5A^(%vEB@LeT1%tbg5$d5vL?b> zI@hh!9+{_%aJ1l|2NC1`tl`Zn1aP8#)Cm+X*_{I`1m-#wYM~Vit$dzd%a3vV#g(w& z1N!vF{Fo)~h`lpRkvN}^poVZ6CHuM19N^c)Ix1YeHJYkR%0AG%O*%|5Y3uM`bXG?w#6*yLLwW+_%SVTy+{$nV z+;m7UgxcD?U`qsMWm9@>u zef1G)F}v?yez4EB=3S^pO-^9UVnto6jiR#HM#Y{`D|AOc+41r{jB`dc1q+;yLTr{I z+tdO5i+iwkNb>Urb*b;>)tUXCUscq#7aXSf14YVS+)5^ryrLN_4n zv)2pFxLaqg|B_e(47E!YC+x&XJ=g3GV6#)|l9VrRzqDK=a4cPLyeP6KZ~}gyLz0`{ zWX&@WUIkZGrKfBjU7SE%fkD*wT_!bjlsBRG0j{)!-UrH03OGxP-)Ho2$F!NJjLw zv&r5w*d|6D!1a4(efiP|IM^viZTE2pBrXH*zRlO`&!UCrT3vL$G%3J}fy3a)yjFX?QAPIyNf zvC8kPyb)S#Z1b4Mlu6@8KVqdAxN{Kaev`xGXhl`utr@U>UDi-I4(*`MXfr$>w`Pbc7)?+exN z?#tBc;u4E3Y6lbesdAWSs1q}6X^%xuis1CbY~)!(G{pl${S`(6j%H-@UZWCQe@ z$#v@lS<>B+s|;7wo9?K%gOgp)f(uH6St3vLNumUGw_|8$HlwWsKXXPEwMMAb&q2C; zV~YmoSieM9y^(4w2|No!b)BBd-jQCZ_`elT9hv<#vs)6`rr)yo6L^R!qJe(<@Xz8k zD^KEKJh_cjc*^phrrG|rS_`dzP)#I5TA@)e&1feDB>(!vl{`31uiK!1;~lN=NsR1< z5kE+`ALNUyD%iC0$DeeXBALT@R~c}S_>8I$Y`J8-fhapr^D`^c#IocB*J7!I2Evh! 
z5I+)CS7s&~8}#CAd11Q-79@j`Ha@SfM&nVp&C*V`XiT3JBc)Puo*;_0W; z-#Y{UJMTf@uIhMSOjD0Dsvl&TJUYJ(?Q%)m>5C^zugU{S<^gE zFo>96<@#Eko|mG0@}+yeeJzN2eY4AFEQdF4-3KSe{>xHg5pa`FUp|2r4?e{Gx(#}0V2f(j4q+I|W>r+t5Fz$UXJa7X+%sE?1>p5yRKBr$kz z2Dbis%&>8ph01$TEMtv|``-KC}+JC_MII1N$ zM)#(RmAd<4vP5TOuYJ?mN4*A)HGp6?5;ibL7Y`t@5f`xDWxtf}Ppxj|yPli9_4($H zf%7T7xgNU8D$3jSbYCtRw9KrkAR->XuU#&@#mATLNU~5IxypGSKHV2-K2Tyzk^vhux!Tb-{64(S5ZQN)N!?GFa>h^A9kHn%!z&Wg`k{hZNC z&^deZJworsjd|`Z*f5Ee(5OwUFValO2k3U#Ze0?*Ug&-3#; zbFA(?SgIV<`39y76EQluM$}%xpb`)=SpA|s*mPU28?R={r5V-gs+Z`B*D{ecH-F4a z_{d(N2oSljo-t6GFd5BiJA?)n5)=4kO)NNzmD0ZwwT9Dr@j~ZJy4* zMIfv*n|Y&T&mja0X0=*9{^(*x2tq!QaR_aUYP0#K3rPUgi-RZO|;cNYU%<= zCE1k)k5?#!>KFoyg#oTxi}l(stK~meebl*z45Z)k9sLG*n@K6j*bqp*?TGPW@{v1xqDQvI-1SU7<#>xOU>e~o^x6dfZ)PSdHF?9waKi8k-!Yh(SAx#7Vb7K5Jl zqw?{+X;!d!WZmjCb?qm$M`)x$C6VBa2|=<*y$e?OKLTMXjOCe zpE|!(k+^LxOE_I?^F@Nf(X?ak>LJ&zZ-nXP$zH(~Zox%UXA=8%+e8){`87#E^Aq2R z71RQK1ODThrG7oH9g@RQVG&uAke;P9sLq4;)Je@9*A#)(96#v?OR{7$GHfmNeEfVW z^twINV^!6rEINO3wdPuBb>h1#o-|hu9TLxr7aoA9Y)!?@UM4$y%otv3_)l@yKgHx_ zm+Eu>=KzF?B{vlMh(J7*9z@f`uLX-}$iIpo?gJM)6-cp4f=T z>vx{c0E&>1$byv2pcgJRzZQ24Egk|Wvfw3Pl!{_LK+TvTXDNWXr9n067u23*I8}tY zC+TSEwIA$ld4KJ9$R>&@J*%ooLCX*z*xJZ9HA^V(usm+ z2*IZ>aCHjuaOUk8B8MUt_5`!PlO2IsFo0*1L1{Dk;|yd9&4aJ(#1}5CW~44f%B&P* zK&MeT`TOy)m_2nf=+QS$ zG!P~uJfm`7o%1A*rG*6T12lZQ<-&*4jEGmb+6_dp-0J!;EYN7yyI_aJ2;s-ZzI>WT z>rKJwB3GdgfZtBj1&P1MI=Rwmz&k}U8N>xDGs1_h>~QKwF?Gj#s9S>o2^#WzPPBD6 zOqgh7{K)&uv}H}efV!P#2>4qy;ub!$l_K>U^kO?w9a4(?c}qVctbSnr7I~R8P#RRz zc)~;i5zcl^v_lxRb~%58Zj@UY8j-!Hdus214o*S}J%x8Amy_VTi9_Y~*@Z>0B?%N! 
z6x@2+S2PjRZ`Q%Zm-_5ju`!4*f?oJ^p2xGWPf_G>WW^=|#YhSywqSNs0kLGe;UOF7 z8f5HC_#jo*&z?IVNx1?P!eI1TblK)6(yuq(}y9 z>=TaxZm}o!wd@0ic}a5D!4HR^KzQ8IiXF;{A?7X^b%R-Tugl8d8~e}r$i-5FT4{NN z0=wnzyF%~}wkMA_KPEpUh!$bu->O=zt1>S>q4i?8!dFQ5>K!if-95UoQL|_w@?e2@ z8v9VYl30(?4SWe2RXAU%A=jH}4^9j=il$tKD|Nk51;_hIK4z~%Is_>PkMqz^NXOups#it7bASC2J2{qmhc?*j70Oe|2KWAae9!5Gm4_W&lE>3hB* z?aeEB-*~2MP4nvyns0Nfg(?YpM^LSq;OF9lfjK7A`rTQwb%My}np`>fnurA1#3E(p zP~|lxX|_dwK?_u|pJ%+SAb`*@`;*Mqhc>=N(R?IBux<2?L_lxFJpoug2k0uV# zWa-irB_Rl`&zi+8iR|qZP-b^V$<)0uqgO?3&R-xhFMfkch4{~^tz@N2=GO^MXu;>h zfu|sLA0qIRDAqm4Fct=eXB)@qRsGs_id!2d3e9}wC!EV~U))^T%DU02`Sqro6(G&w z0Gy0>>aNOq)h`a7GPaKHV0P#Z9*+)EN>Yb7VrC6I^ONSLy0|;`6K3BmMQMQ4 zojE3kXMkqVr+*9s_`65qZ|^=Zg2Y|VxM4W;*LuoN10IDIHMb){{k!<*KYSZ~*b^MM zmm9tTKNkQ={zpco=}WGVL^Xpgfa%)F(HK{S?dagwflA(;UK7C&zh7*8dg5*uTCY*O z`JUAh7!?&?ur}ql?q4uJ-Q(}B>i1IYUT??G2v(e}G%lHOiim3q29jy%*9uvvf*JF> z^aEe7K6yqlFmL&jWOG`>D-UkqshXTZHf85*L_=3Z6ws)%;E2!3?dXz+?w6!iw3PCb z^ZwE%gH`zdb~*rD_Y2v~B&89qgCKGdA{1ZLGl`bNb|blo7Dn_5D_k{Sf4Y0uUH(jjW|>OZ3EwbumwJU62VHB zIEkOXO^Aw{cK4^hrSD-S4ZDWbg_85^bQVJOCi$IVjSW}qXCJ=Ww1GaQ$PfdukZ@1+2f2D` zQ|7U0JHzmVK*cYejd@YGA^xpBg1GU5w;G@=alVkgkO8@kLpGzO3Z0#IisJ$gaXAMl z0bgJ00|s48xR1ZJLX?cg{jd6V~`fw5uE&x=rzaGle*&2&X-4Z4 z$cK?tZ_ygA?pVgo@%*M2SpB+|i8~Ad?fx3k55o2IG@c%)MywrA^?W-q=rpA~iMULf zU~C%D;#o8LfI8Je{^C~*VbRCFL7&D+44jTtuR~}tzlJ(zuZRnnPzuEK zlk0E?j9FY&3+p-zpf?w}O7wKYw&_`Lzibe$I>O?Ue;oJg#@>}P zpK0~P%0XtS#1Y&v+_|$MI(jdg8s6A-TpA}~*r1&mW9HS{Is8S~d*GW9)Uq}oac?M0 z8o^HBijgbjZxM>Hh2$`WXL%HR5MF<6fGQr+4OCj6RWJ{ ziqBuii}+Yur3vZt=-BD-?@nlVxh%)8acbDc?X>VCf(1DywyWHQTjwMRi3i!`a{^pO zf_E>ZLdj1`;#OeP&m;%vI5eLFcMV|2(`_|hbWzMmtN3J@TXh_Y7MB)&Y%p*VE=rbB zaUOOV-vX)t_kYf{Sqnqe9&y3w$Tb#a`J-0>)cHVPB~~8A>1ym2ER_o@b{&2x+)$b7 z7+db;_00>W-E2mFNfC0HXGA^=N&6AP=1Lu2lE=gblYIriQ6%v3meKn#vpEe za%q@Q7!=c2j`AS6+ z&!ge0X(Lrw+r_l+*9BwfzV}x#B?aZa_4KM(uQJO-2opDRN!FtZWN`txhxqLn`73M+ z%v@H_FV;U(RQU3=$*rinWWgJ%_kd${qg4uNFI1@M~#`Rr2&>1Y^djs=yWeT z%R`eywN?=?wdZ4(X9}5a^`PS1T(wlIMbXRTC@lnOU?5iSXX<7(JQ!u 
zHnP%z1s(=lIe12FbPo!YwOY00=Y4JYemvhxriS}snymT$*Ip3+J3Q;mVK(8&k+O>v zFD@b)X^mHxikXed7pu)a(34fn4pJB2nRlNbsKD=F#9a*MB7ABGv;r)r1t;NC7o)1KzxgBvI68LHJ>s=bBPb` z5uUf9tU2T9$(n2HgNjjK-;VGm_Ao;@& zsK@uM?vV-ZlEuvGa6zjDjgU@owP!aVYvl^&*JQ;HbcAYeagLOgbvC>{jug6C7JfaE z%WY9`{i&4*N)8K+4`U;y5`4RBkZkI%rEp}2tzZkKe^}mzVfzAm!{uEmr?g4EM|z+O z>UP<84l-MwHvmFXTb~ z3oSs(&Qmb5A2Pt@s>yuKniyg>o@OuEX~y=nyw`>!`|E4ASACCn!!y=3`6)lxiMeeV z;5dXaE}I*YxUXzKy3$-R?i3;6^rEEscEE4YT01DU}m1C2#%;I>(A zEd2D(hq;tql^)-LW7<<+YSW+9Bs9A8c2g}r#%pqnX+nss?E!*k;#)1bVmzzr&UFuT z`K&)uQ?W}B7pc5arE7kf=iWCf8hmi0d;e$Zw?s0*N`Ev=y$gTR6?S!$bYY=X*dYFX zgX)DktEM~4M)mCa^inCGU-o#?B(V2#IkcxU!l_2fo`h-e#&=E&SpG%xDBYg8aU^flf#qW((LkEZ+P`pb)R!&PKltx)f%0M zkfaD)?b1|mAb`A7Wtu*bC4wGZFn87qkI==?cNPip=X#Gms@n#nrIod@>SBbKiepbU zxiBlwxBU6G86R!vOTUNz@_O*OL~mJ$Nwr)>?A4V!eczp=y9Z)tv_q0(BFbOXn%7n0TAi!=D3-CZ#YdY?}0oEPc z6S#@nSS_65WNH)hJw6+W z*VwriUxe(wJ5#yL60LILq+~OE}*Uwb3d@v(WmdEx&cVSP& z^#1At#6@8IFv(%OI#j;NM?jDE}iy|$WvZJdSarAO)adMBNPSX22>`F;&O;dX?;_QqtoHOxy!GT({R zdY^}xpP-eyX6`$%+o#M|k2ED~3CWTEtkIRk^QMPqY|2TP+EBDwUSk!E1G&wJG}C77ORj*8Ng^NoV*JTdDWcIfG-g&z)yG*|_R z*7$rcP(0uZ2DqXBAdk!P(pu8CzKav2|%-z3iWbbRRy14S~Q;F{#jDXuFsZfNh|4xT;x54^EjxcT?bu-dl}k;azRYk(nXqYkq+0LyuDC*zR{qgLO$Net zn`bOVg1{Q7uHuesIYa!ia-#hkr2q0jO}~-xM<7}&{Yeat7`#%1yVTeeY|UbZfp$ut zheu+}y2z}p$K5&D%u*_AxS8pweTs*Zp3qzhyZF}2+jif0qQ3_$CgrqLHl=r<&biHj z)9(@v<2S=D{Cu8nI=J8Ogz@mmdq`f>gUMXTWzsuM2eM=9MY)god&V|IcaUw5&~g|u zUDuP7y?$OSdCPvM&APO6==0MJC7kD(J1aS;s|2M;L@1%JTadnPJWOzsr0L1JMp#R4 zAdFpTs7p1bKT#L7jIukam)=RtU1NMgcOeLGqyr3@%g0Iz?phLV2W(?ps==xg(E2F< zlr8J?791V)(Z<|IH$-p>2O?+V;SRC-vfVjUKISiIRm6%=r-+FdX@0ocK}0;(`58jC zS+1&iP&+>8lO!p?=lZIQ-wU}SraZ-KDyIz%=~2NIFM~=-7aN{hS)<*CTg&*h7f*fT z4&C5SIP9|K$Yd7oN{Es;m8#}xwJ zB1wX0ycCbA2A_T%`ra(B)~%(2v23^G01n>+Ry;tmWC18X*-=xDGfrxgdtIX*RO`){ zgqahp*1#}&#$l50!CJ`DUrfER4XQ8II)9&) z2NZ`ZWP$3aKC18o>zL7`8pXhQVV)eDp+tGnVOwl#@~^$$e@OrTVeAc)XM|lNOR;7T)0XO$+j_noo}n5gU}P<2aTbyjiF_*YV_3x z>?XA^t450`lcDHeH`-`?=xKb!F=7n97Zm&*28PCOn`xVAQ>DI+px2-mE-a8zV%qCR 
z^a0r$bOQOfC5978+blAt7lP5ac{A%YA-|10LpT`OvikV_EtTGy%+G{SIony)xIB*} zJuxna8zMA9VYlmZ}y^#)-9o7}TsLmmAk zt17jrs6;NeUTYIhdB^X}ZwCE9AIvPbYqSVo*kp^f5FvfA`YI z(DiZ?mtFl5vVu9WIr~J42(r5QBDRz_NRi1iCEw?JsGdSQWq?yPpa_7UlG7^GELXWS zFA->zYT1gM;%E~}5G?6Ue zboOV8V?NP7}Osq5MNO)wYA-^S>R;<0+M+rrN=}M3dpVw%NBf3ghSt?pJ8Ot`SEPy=uDUaLV$ ziHqhj;)6p<+%ez6e21hyAn!R%V(=uG{=nTMup|bOi=L$KwoFOpnZB?(`d*rr^5}(x z$AHR!!n@EV`wa(#_PzO<^!9`V+>Nmq5-ty5CuC2nEiA?;qNlV=8EZe&PgWc~g-!0W zQvCwZu2m?GS@T$bvJ^vUsT8T3R(Yv=e~M+~eQx2WwJfT^>TDjDW^weFUH@4@75j>Y zUG5-#@l@|Be*VZAQ#6XeF(5iZ3c_V4AkU&x{4LO!V5CNsV?JMN!-LXi55cLU+B+&aq z-sT>lEm4seZ{{IM;#b-QXhXT$$6uPe`r2qMLj~Xac-rU1 z@ChA)`&fD=YW1wXr=$P!wVm7-XK^mdDonv~WoF-A2yzhEZiUT5>30C7|M|Jx-=L3M zfEF@7*M!{AELe9>0DY;jSW!ht^E%{i_V@jN5~(G&cqssPwDHc4ah{N zvPZKt-k9H6XN@8-X5Gjo zuJj=J)M2dKXF zX1DpOld!rU(84(xB6eQ{X?;{$f57e-J+-AqGQu=@lW*u7P@6Y$)lT{6p_t@!PbIkD zS1jmlxrl*x%XCRu*e)P4@wDU8r%5h&&)Dj9wLS=l_|~5u&A>d`Hlm+}#pL~5yFTTZ zAXl`xOf~9#_}&MoLpxxy1>v@=EbErmfxH^%7k#JEcL$Fo*=S`Go^eQoCxG3S{FB)b zHdSLNyJ0jNrmO6q~?C9%?z|JY3eL&)3erobnk{T zU6x$NtzKF02aVgvF$8 z&v;di<7Wi_+61YUQB#9vzo9xK@w?8Klo8;@ufp5B$qNEwHPt7E2;Z?i?lF=AFf&9i z@C;?m{igE4594UM7rsdalGzze5?G~;^#yQT#xP9vAg;&4935F`KBEUyk8M4($-{@I zrQKNY9tIL=TB6tu05Uu6&e#!b(wvxQwbEBzbD!48^pW1xo`lMChII0;ZR5+q@1dvn zvU#jXY2%m085NrhEq;H)^behVPvrQPsqFH4D8Uq(=K>8jVb>P4E3%Z133n z19&|%^L9IW#5dc6;}nvx2^?e zg?~}=2||64r0@}aCVGOi6LLq=Dd!~beDynN{1jk+m9f_CyqJfk$);_n|A_$UBDy(f zzWq+xj{eHm0<9obi{7JN(X%IBEV>}7uQ>kuOQEM$ir^1A&k`%!j}_}Z`MNxdCU1rC z`#AWN#FSC>c=1-dR@{D}ozfl851588x873)X^1x$eYW@&>^`1bID5;%9>#9q?eB#V zj#J=qastNnty{)?BWx#C0peShF|;w3uexGruZ|asbrMz<;Lr(hcurj}WJYzuPm-EI zyJJg@fcjdr^CiH5aLe-m%H-m^<7WU?OVQoD+7jAGnyuyFH|xlzTZ7co3tbe!-!R_>smnkMbPtOpeyBl1|nMilNYNhXjG{lbc-c|2n zO8tMGf68jVqOG@*=H$5*cM^=A2CgVyZrI>&5Tieo&TZ%G;|VIKiqQ|?=JJMH4OeiQ zpXYEwDJdCsdhK^~4Q$N&KccySQtDW#H7p~!-80TuCl6tPpW1j0P80AL^%mH#Sh!{< z+v=wgE#ACuyYcu_S(0|i22e08{$~ZzKll;KeM%9V9Lo_ZR8IGz81cnBAyHI$> zKa6jVgYxvR-sLggC5iFlGB$sGOnrL!W1QW;$}DYN3DdYwsd%+^f=$1>R5e!Pj>2Vo-uS@Y%%$<~C&DT3}* 
z@qx@0F~1yNoCeGdS#MDdb`fgk4pH3WmmTiK^1E@ETyF&d`1Yg2gWKe+{p9DtwzW72 z*t)BTWJ*XV4l13weM`rg8{AM4laQFgqP&px7R0`%08e!b(X(EUTQdDCS?ZU|<3PMm zMFfK0)5Sns@d;l`j!kPhwh7e&<1Jf`R6QAyKi>kPy8MaJdvt3B8}EOEL@V*0l+J#> z-ymIklnF|l{HkSfJd7Do%f8J;t}-dCRsGfF{-0T=`s4V0p)pVPRnW`^5|j5Os*<06Tw1~| z+$m|9dpYWM2G4m_Jw95~2ZIsH!gJYBH5l|1MJ%l6(03kjC2AdO)_$|w0>K#Q-S}W= zbf>%9>1!$-^h&*kiU^>O1CEs`>NY6<)lZ(xZn-D-M^l@Iq+hocS|}?!rqLcn(F+R; z&o#OE=RO!+er;`}4P?C^Qs!X$bCcb3O-)y-{#P0yAiY)|1#HXt7B^3T;?>CrRQnmt zXAwHYWKC06MtI1%Z;3rh=a@?98F(an_ljP>Hd$m)s#vAS6KbC0w>tuxg$Pnb?zt`p zF1E1Zf~nk}fleB)A7{0rx!Ncog4yPD$&C`?m+q5xFzCs$MTz{CO6J43w}z&$rz%O) z$`@2?*VI^}AVA?Xruh>5b03y#lPwg6T1!K%xKL;+M1CF1tnqXsheZ#cgi|K^YS$OF z5;@pzeoi=T{FE*~$iIRmY$HHbn9=Rqu~Vk&$JOz1PR=0<^QrFgHqTQuNg~G2O^jPX zOm-%pFP91b&A0dO*=#UxxjS)}^RE3jDfEArc2h4w|GHQ7M+$dZ3V62cGAZ2xU?A~- zger1x!Ock@((Qa;0QUZWI1K&?os(*~OMzT%ne$vSyZqAvQX{IxT+gbT{EJPW&nJDQ%*BK2nCd+VkjWa za-06Y9Nqu21bjcM(*W!v`T>Qz5obgeFl_-cwz1m3S)%`CW!-B-us;jXL;d%kL}wiH zQzKKOERJAAe6*bi+4HvqeA(=#^M9<||8=3XqdY`0#D)W#jsG9)y?0nsYq|#-1jGVF z1f&xc6cG^V(h?gWBA|lOi3);r0jY+BBE3XVKv4+2NQp?Vp%($^AVnZSr9*-c0wnRS zJv004;@)#-&N{iyz2E!(O6xQ~YJL&8HK->GaX+!de?MRrdWY;t zu9j$p;UnvT=0g845QQnaW+>9QCM^mo*Ov8)oIe_GX#EVIe+Tp(ge&!P777|GnK*^T zsEO=KBlJjHp@j--?8wpa^0EwzwRPzNs=9;dO(##y@g~cM?;Cc5%>Jp}{&*nJ3iih3 zv6V+g5vXP!1n{+Hya5VZ#3Z1a1dzIa9Qx}Kj?ZJcfJF8T1?VSe-2zDRcm=1w(#C%r z+nH2l`Cs)tzgZkFO9;BRRh|5$KG~MH%iQ8>v8D`t@Q_6NC|QJ3>^-3%J^re7zzkDc zjf`c^#02HPM&7Ez1K%Bhcx?K($uxv;$faNjMX8zRp0$|{CCcm-qUMrCz_x7KgxP-Z zSV<5Gyz*d5emI+mZNRddBW{!Ipk`AOjKYyQ&!}EJS_fR8dZx`~&v-)%ch!-dZrvRVyAvRRZ0HSi{nE!>FhK5_$6nIX#4V6$N!Yq<$5p6BZi4jdb(g=HGgCs~U) zdiwfRUpNdt{|3aJZ+=VSYWCOIh%f-n7jV}^4%N}+0u&fls7t2rY`%kH6Q=2P6WfD~ zra-i0AyoW8S^1IHavt&q^UjN!)Z2*L<&<}!9Sf%Rd6{_$hx zKk{X{?toio0g9eQHaWm8_89`)UFM0~2mF|aih{IN zD>eJd*$KdTRz;^|a54l-F17gko)Z(?Q-$?N_lE+WK`=X*|FMv?OcSaPF;iW7!>?TB zvun&6!}iH@4cpLr_Gp~+d{?eqW(*VgtoazKc=D$!zq-&zUIiG!6w582PF zs9|#_N>B~A53|ha13}W33;8Z~|cSQKmkHX-^J&HhE*YxG}&DFp0-<{^e1a 
zRJdUEJR}^uZy4u<-D{uJk^-UFWjUaDsMl)!&`qBDQnh|1CpsCpQR94Y-wiMrXqPA> z!4vj}7LgJaBXrTl{upUBc)srA557+(d5CQi^Eg&-ol8W$XGh}kEBwtD2SvUB>RQ$n z;eJg=85MdZ%}@uV3@r4YIwAK^k+f7`ItmUYIG}>24!xqegsO?0tsJ(!G<>pdH*XRm zeRQtQ4+1E|`Vm={xAD`xhsFAgziI9Kq_^{T{(cqTN6LQH)ym#m23$#S ztIH2YTtFpZ__tk5ng4(1TsVFcj^{e%p`W zH}lwd7=Ct|4L5(A5)Mo>$vqGvX)R=WLjrO6C$h`eNN9tfkp?0Ukna$t0F(H|iWYsU z=<0*t@85*XxL7UmMW5om)$FpvU4I;f7jxJ8&F<5C-$HKn!vWE^-yG`i_J73Bf2XMa z4yASXvNv{yr8Pev73S*~OOYd-OPJ6i-Fl)2(W;eUJ_}#h>=cwS0fDU9TclvW*{w1C zEUkvl4k*ht^J7908Z-LBi#sQ?7|lM7S$JRkk`d4A22NYu_ZhH|iTdd(u+t)dg&l^= z64?vi2dLi*4Hhb31oADWD3P~@ZoJYb(gKE7&E&X69u0_1o_7ynsTA25sAr{scXg4y z({2dVJk`eX?G%rXt7EsU;*(+nH79AXwqQeFZHW`UkTcu zOKCsWIe$_5{b!#0C)|AhnUDSXwFsbK`KN^yYyxg3%fDi|`4polB~lXv-l0X5BUE>< z+oQN~4X^mb%Zqx_Qo3@|sqo9k%buQ5nfp&pN4Gx(sXctaxWNAxZc(Pq zzN{C)!kGmBBN34+K##Q-2tjLU*R>0bFMbiNtB7vDWoHPPJI_2vMVg>$r{5RdmRhaJ zE*#Df((F5(Ao_9$B(n)jPE8hRs6GchK)VfS-<_PLVicfQh2qAk9LfPB>3Ksc;+ar-j zm$Z6Ds3v*TEh9I<`v%JE1m339<-P6^o7Ec&cjHDLzh2M#2q+$f!b(`0Fi%K8$!gzd zfV_iFCZO;b+*(;xBR2XW(LHKs4@;SEFqe`2RTYt==@F-YDz5)W4eTHNUm}Jy*NA1H zUa#=3HJhRFv-4MN7PUp_id90C#_8Rb5){#1KO&fLG7Eq1`AZlc zOk%?3&btfOTvV23*MoYTsO!DDF&zXef8skDwL%x9ni9WJWht?zS~HboSq42~U%Ql4 z&bMG@#FS@6KZ7q7DG}P6AO{il#Mg9Tl4VjN+38!eEVX1@XWZC&F-Bc$Hr71;Y=1&= zf0AZ~83qux6h`*q1k?EL4vr4c*W<3ggE8fGEoN0d)kT8U9e9FZ%-j z{nFh3kD5mR=-8=0UT_EfQ6=AC{!CWHsr08!|1a6z-_2Qn|K2|%3KX{Voc?dv7dQ?z zlRLuphTf@?9QR?Q(7n*^diaoao9U`dC2Lgd>rEkvFW9j;ve6)tld4@!ksv9xV-Bea z720#w%>{2tvpJ-9X^y6~cZ=wsdR_%$>*&^=VZGzL)BgK~#I>FbyPY?7>r)>mEOj1q zDb(~HsJV2Mm;DJkzOG%K;oAwo6rsIY>1nAP)tFKxuN&PlB{4HINB4Pbh#{_GLZ>&* zQLh4lruu7rE>N<#6-wVU8~4y6}BJZGgPp1EF$0o4KkPF(o|m`84Rv)ahZ_sX3mbw)5f`;tFnzR;0F| zCX(ALRxl+xG_7X!deR-P2k1FTTLPl@DBHDCGff1COw7f`p(U@+95I-sMH-IE`HJjO z5qu9GQhtUfL)*)38ChU0N>|uW_>ZW0ouw1T3A>LQ!;&3|dU+)1H+L;xC zabUC zEJ)=8r(MQ(w(Dh({nPvD2abJ{mpDXMB1t-QcsosWm4z2mrNosDX!E^yEx%aC9KlT3 zD4gg6LeNIkPxl`?4*}Pj4|(#|+s153ARk|}bbDO#;>XXLNyAck4>+GDL@g$>MKwRNmF<%-LkeVh6HFQZ|{%N zBkUtg)~1GBF*vqzW4VzNBcV#?O{VhX)00I4ma 
zK5p-n_c?pU>$-{^7W$ca4jqzP=u+){f@{WGhqOen_8Z`_5ABUfSzF|GfBG59t2V{` zPVc?}v=4Z>liPU{Q)Le8@OS&(`wJ|L7?LkoP0wS)9O>u0RWXgao7 zGB)qa6I#adZv9T!6CPPLS5Ap4=0w+H7uqByQP$Jpqp-tN#)85RcA z2g#uI)Lopf=8KqW;4&~{=1`pS68p<09No}#fy(=)lLopu1ZYu}~x za=lyOEP||`XOqHv7M~8lu8?2b3i|fFH{xUR`dZIJ%Pw0J22V&gA%%%rVRQY`WlmC( z88$EJy*x?pGTvLUFvn_phziPzFH2mSz4f7LmkZfjvY^5f37C^nu&us>Tq9Tv=ftx; z+fe~8t7v`A9^F-90k>9^2Jag0-p{ztcl!JmSAdSS{}t}-C6dF;92#{W_|COj0P#EB zkxd~F+lt8B71dA6N`o8Eee6cGFfli?&c+4}T9L;oWd)NI+Zj}pnnLc>1I|Sv;Oop( zUU~mx_tPu66cJ?GJ>{91odI%!-n^vWm$JY)yY?b+iEm5f2qHKJW-Wj@NYxpvKj{F# z-(DH+9?2KZk#nlF61c*3%x145$A`1iUgBt%`bQQw)m^lThwcj8s|i#d^4)vA0a@mn zC})>?=H?qY>tIpv1#Nf|@;FJ1DhebUVlndAXviU8>J^A|AnUVr-usYnn-ng(o_SQ5 z$0echIa}+C1QvNcL^)FtGU4Vs*_rsG!Y{c=-$5(tBnLAJH7w!VP>2NP{3;j+VT(w9{smXujC0A>k6Iq2`Do+*1fklCk@e^;S(sQ;>SfBChK znunPfWw3_o`uGjmJS%@$zkC8Z622ZM4HV-xXJ8w*hTVYEGisYhcDcieY(AMbV{AL_hlxa_qJ#n@;QMyL`F2$pBmgOn=%%~4dQ$a52|M*p;sDV zVN`blCX62!D&J+c_RenHGG^s-LxTi0zm}ntd%L(Qyi@I7{`eQB{s zR!ndTGui-Hs%IFEj<%@y1viup3CMh0iJowT%gk(7szV8VfWx)1RNA8DIZ9|ILfwLE zu~+sx$YJsN!tHe4$ge(nT!N2;zgF9C8<2;I(&V}DdTuy`gA%FFgFNwWfZmm}D17bE z{p9+=!z-%JdzbgiKICHc-9YgwSUAjt`XeM}PzSs*Wu*Z}`2q%){UQ+>2gS0m{>!U7j+|V%o31cq& zm^3@{A)YjW?*W=vWjqc#wULE#22WxYG2XA6RX94vKXwQi#DQUp{`&F zB{1StQQ%J*4fr%I8_Kx|;iYr;W)gzOVh=(;zv(Ha4^c7HVKqJ}eUKv|XMJgpt$&XJN#J<&TXroGtr3`;f;U?;b z*Lv(;e1PHr`}5$VQ^ z7Cp9RNm7hYr>eliko$%<#T9fF9dvddxJX)>^WA-}8`xDT!zvfxulpF&T(jb0KpgV2 zJVvY)J~WgW+hzCAqTRjtLQ{fwTTcta)2vY3-NF8g=raw7oLy4{8|iBqNwpEhUC%#p zDUX6~CLOdZ-Oe(!6EWd?noZzBPhaD8W?_{XsFdbF$WXQc=~`G58sd#@OmdFSc~IT@ z8k;i)zw~v?G^<;-Q?llB*I}-wkyRlB2ZH>O3+Y*dy66|%?gXPo6dxssJV+EnOUlDb zz4fmJiq;H4NB3-B-Y=_G?s{yut6$%A73QGtFH7&k90XV+%V$Qq%m#}z({i+rjiXOz zhZ_Sk}Jn>%Th2YQ%f6a=%A9+zm|k#0uwQY@RBOoa-HJM$=EJzOCu z$wP%@Csfh!_`Um&#_g3*r5}*Ocft2lP%~cxvX}#Vb+sLx5X(>V_!}b37{&beAGQ7l zM;XN;5VRyU_1#NQ|8a8On}v2wAJL#)8G^+Z@*L_VP<(j!sNKNiJ*IjkUuY9>OnXUe z2k?$7+5~mRrYR_KBp$#Q9eA*n{dM<7NwxO9+NXPJ`jtwGPg@^23!3ucX!XltC99Zo 
zn9#CMCF~nf&=$SkcphX+Y=y^m$H@s7xLcacx{O8QQ7tZqniq~5^v>mEbPx4>u5Z@%HnzMz&mkKx(5l}&Zas$+ENHMB_eS|hvt+M zyN2)Dv%CX8tRI)W(fM)y!(zvKSa@LI`3~jAEi9Pc3ENjIQM9o0;n|FzBH#%*r}N6; zMUZpEW#!v~n@mPUgRXA3?-rA$qhMD09QM=idIPBthc_3bqp1>d_D8x0I2_kiOVm`u z<{3c+=lO1=usKkbhU(=PcgW!x-ib4|Pd4&T-M^6UV8rIkk;W7vbDC%Jvm1!Xnl+w9 zVbh7LE~&21d>c%Tp_q{lPmoiHVy)79DGv0*uhbQ}0tqAH{T62}1dhG*78h@1nu&SH zEY_y)4^<`VuP}I!tnd|pss?{^Uen!SHCP_+Q1P)#D%-(^LB(cASDCDs_;<(=>PcP) zQv`_8bk1HLZEV-61mlD%U_xfq&=f_MKgFihvjS35jhlsO?07)@bo|X!; zrkSM7&Fy|1C_2ysdP zX^CtSsxd6Nu_PTPE^qoI3Bkj-ye8^A``$#EAWx6{kq2l>nVVH^#s*U zX96}ir#0S|l@-_TUcE9Jl@_~o$~}rXDsZo|v#(Lr4gEy3$ur&+E7~)V!<4RODDSY^ ziMztZg%m4`@{x>z%$Ij}@3rZDW^VyKbnl!Kt`)?FTe1N+0yeZ-5ep8@B(g+P6^gX- z?T(lhel{rhDk-SinjG*!OC&nzXb2XW{wlf!(KtJgl%Nk-LQ08>ImA`{Q2rCcEz;N} zp7E@*k1}mt$3mvgb=pk6uSA{)MeWcOY#oH6GmrG|IUb@A&hUBVI*7)sqj5lS2nf6R ze`&p7lTj{aN6=)yS1skAvor8Q4!WZ&qL#TN^GwGEa*JCB>cGzFS_AUXdhc=$NvWr8 zdf7(?vv%K;URkgx(HNteu~a8=6Ky=;hy~yz1|EC#FstuEfb`_qFK7*EyyhC^Rz3Ul z77#BAwroc`u8m|)uge0ssL>jf3Q@25-wDC-Gm_)Kvc2tG2jub+z|3i|JrZxTG)*Gl zHiu!!3Y9OjevKCSPwwOI8teh7bYO~>tJrc5BH^8+l5Ww$ad9E9)36DV@hhtgty(z< zZd(2dwFkV{@_X9tg+2?;5!u@q* zAAppdea^5c#l2pKX@-oCk7(LbdS@bAMHp;}B_MUWk>D_vVgz6i&@^szD1cDX0aDub zT*ie(iLa0!pLkP<3j(-t2hE~aNC)v*lU7GX$GJNU=8+~0G@w@A)k~qi`wsHK#rsSL z0UBrrDb0kf2s$5kT|f0co>)?j$!q>1Fmd}}WWIZxXrv^A$Wz~l^_pZBa^>!=^0gz> z(^8cyZQ*_k(_5SCm=!;~e$38jD>j&XgA&xpAKKR8f}pY0a_3l>+21Ztg(&LeS;uFX zU3eCI{Ha^RL-RvmPl{Ge6RNCzg&QyaQN&`}j()NXy-S+BP_7BlywbR34&LaYh8O@%whDLy0WnhzAj?|V)QQ!<0EU9eQ#l-$?I zWPRHyryUNVUQ)>}B#t(Wdsg7pp*~WC`r$G;C&gS++&jOQ+`LYc?0g+;0ZXxJ^OBF( z^K3wSwC9;mAYylKBFu-7M*+lf$}uASX)|RKyxnQho9Lhlu<59;IfwX`7TLCifNY+##ehRh98LHLDdN4Zf z5l_!>u=UBia_?P5(siucET8evdW*HKkeoIFe0TxMIY^%_wD(NXP1r=Q@RUeu*TrzV zwdbibBff)nZ!9JxiM64}wr=1mb#w%D;Wtx!KA$t5H9lrzwG290j6Ccxf+`2Ms$v8` zP3BW18offMFB6xX_d2BeM=k}caRz#_e{7^59qh$QUoYMmNhIisnb!_UZqTldkk|A* zV%GPaU?(p(#2-HOMH>{GQkk=ua&%xr+40_xhkzNMOOBbtmDQ7m4&nNj&+D0>Q?bl3 z-3WPFS;~W1<44#q--Aa^y%xIIS442_dqEZ??zd5Uo%E{ppi&ELLZB(>y5u9=p8549 
zgCh1p5BF-`s#(ZUt4T!Z_6x{`eiO^n!_uYtY{1g~WoB2=;*pUxNq%eD0{urb)|v&= zr$kB5TD{?6^-zQk#r7n+!227X{4B>?>v~e)`Y-}0hPPR6r12>Ws- z<(v>|@#6%%AKLU|cu)tax-%=Bdc+~RSUpKH@sOF>cz1DC+_|mXE<~FCi5Hmo3u>cE zsR&QfNHi(3dFmh~bf!o3QaS0eyJD>a~zWNX~- zK5=Go|MGr?E(*ByJBYhX*ImbjgQ9eyocJ||?}eqEX|t|E#EY)fcoPtUSQV9G)@PId z(y~canw@&Nk9v#H%Smz9tMVyxVJqFAcey{_IO_chFRQm>*hJ2?+vRmwJ33FlOPQbP zsw$go+sTx#i;l|xPe{_0CNYP8qC&>tQR9r{mm>-enZ<*W)&pnGIE#MxIBV=2u7BzeJC2d@ou9Lg4XGdmiiqGP{##M=%wFeDpX;O3uy$i`li3S999UiNO z70gCnD!$95*x#|cr#%?N48YpiCvU;|=zWmGNI|oBbc}{BVju{s zJgO-P(>%@MePr5rs1{1j+10;3gzT{{?s5`rd4EwZr2CcO>DRMj+RA!? z!&Ey$;p0ZAEOF5r-Zn1jYTo1Gq+D5ERq-r1NvvU3?T`=6A%+N9IWg-JIcge=4Wxpt zBpy_^j3bM^-q^FT)w*VOzd3enf85Q;+9*)XbK4|ypoA%;8+SQGF;3G^gZbr)8IVex zLz~(KQcf%MC?%lKTMM5UQlNIh9lxGeusGiH&2-GHyVOds0q~%ELZxd|H~~7;r|2WqZ3hygK0!D>H1!mvlMh z@T=>BcRrW98zB3pj#iM2mdl9WjLQb^^sivcoc0o=pdT5yt4m3-{Qe*i`(Dt3$+vp5 z;4)|!NvXvN+{C{Xvf-phxPql#dN@+;E2{A%?ljNQ*e&FBeT6}zURbFXHDP*em)^Vu z+qts2q3|Gi0$@UM-{JmNv9gk@M3pO>3r%G%19RK&I0xAAZ^mgm-xxc;_JR;sAC65a zTT!o9K~CVnh!ZSpfKK#>Pqh{W)$2ItXP=A1QeF+wiQw~_!FFzg_cbgqcJtMazGu)@ zIW6wonH&>oWQ_x`_}j1Onr5h-Etfgd;a~MHYcCc2UuEjIFp7V%%2X?bWyUB7Aa_o* z(xv-1w!z_uTOQ)(Wj)+<4*D9JM>nu!HwSI#TB z=C&vme+T(0^RS5E)cM@Weu&{+BjgAE-e?7NF+yUq_|ZYA6valiobJ-aUSjvUF1r0% z(1#J&qm&F~f&RSZFUTiX)jC#lu@V%XpGr*vjxJJ1#!>8)E^Vk@1>WL{W9{Hq!Lz5& zPraO-~8y@l{M9TGH}xNB5NtcgrEXmMRF9)7mM z8#Bo^&sywcD`}=`u@}s{lU^&<0wAswx&TdK{-T;9@6*%mbdKtQyCs8O=HqJvlZS2f zPkNn4v@t~Hi7{w7?oVmHjAKMdWOUONM+Vz{;nB4g1fx&6i~JI8#b)W*9qHlRAK#>& z>pHCc6D-7|pDKl4+k=;2+=!s*-#{;*MJN`p1RE0$%$_!n=ZsSbY1!4(XRh8#Cs$6h9fwMu7FVQdxZVy(rSuo9$QtrkYS;@^V;Yl zTs?dGT%0xo$aRRQ3h~_$1J^ih8pg1ewxzVP28y;5N;^Q`?2YhNjsyR;SH}tAroJZ#(avH>sx6gD=4+~oCJ^I2%5p9 zXBP&$eVICig9u#*UE)rZ96K`gne{FrTZ3s+8Gc)AL83>v`Fd?hP4&Bqs_J((kL|L< zO=fI(?>#|3f6AhN)iW#GWJyuZ#QwVuUVw(b){2j_6O|c{ z4jaE~l0~#Bu`3L>`e51rLhCG(dq|8P27i|r~XP8iTK_W{k4r?lh%0KQx|O^>{~b9_l-a1C3q z<}B$nS6d?KR3sWE*JOM*;;?L$pzCs7QZ5)TN&|KSMkHhgFCyDE=&i4kA14?GH2%zJ11z9 
zD6!4l4UXiWej*QJSag}Kx{Gcaz2JytZn`qRbZnyZ3A6v^-L3PYPZ>nMequ2$rl^vF z@FM}lUV`M560toD%N9KPWNygd2ESjfrE}Wz!=y}aF}v+67x(UUSXxMnA@Chbl272{ z%5Og&a@%0&koWPowJ_pV6P>FzVNHX$nwVvw)7)!oE2yU_6Ww6T_2tN?2FkjApc24y z17M~;pPXN#6zkXOczwRbB<-60dQE>1A_ zMj|Dyjl1d#pvTn8*Ah4vs(cO3nHw=S1d&BQjRyqlB7{JP+E7q>8{k! zyAPgw?m=W7nTjR%D$eGCHsNJjO@3wsSi>0N#LSA~SkcJ(z!k}nD=X<&G7xW>!)^qO z*Ikunq}cbi$Srv$?R~j7UnD`Pf2^%UFwVV_b?#ZOrJY;NM2D9IaFbJ~IEg%=kIX_o zx#lbo&8}$ocE`R+aqT@`??Y4qVw*RF+(h~0LJ)H9z3g{>LEL_8eV}P8O~e@M~;?9a4j~Z8e#ChJr)WhSU8U}&2%M`T>bA_q3|rNH`u&AO$5P4Af}xa&&qo$GsiikBpm|S=W{YxKsC5G3L%ECgEZm zw3FD3O~&jM2Nf@|z$C88rX}XKDYyD=1yaRl{G=OToMpW%-svb_-8_??BUx|sa}PER zv^u`euVp>V6_DaX4 z%lWgaQ(+E~I;ay>@Zbbm(vR8Ci#;IQY#ukpVMi`rurvu>mN*=SD0yAfZN(yn_H}9x z*?3jQI$yGo@XT(n4EUoZ_t|A39vhi}1+9|_>jzU9b~WwGuTxEE7zOb&c(k6%a`%~+ z4Uu5`M)p;VjEHa*_eQYQG_VpKme&E&0_eF{?$+x2r-3n za&mI~-5!*WyCc+_T_z4>o&s8cKe}PsRFFrNOyrj8eF-Sd_CeU_QkMiMI&Ia^VtN02 z1FBg%v75mpF}kYdla8+8D?1TAl;qtbR1Qj`hr(PqUGPB#e^g6pBH2|yHPMQyFM5Kn zceiEq^#x#Z7^S#pC0C0@Aw|92!1#)R+FSBBuy2|5zZJDcu13{iUM_AJC8C;Z5i)?2 zsTpC5egs%wkhp#r;f9-A1dY|t<@Q?w-Ewvkv7N^45;;vV22%Ex8$XV0wgL?ARo`gt zFVn;|2rH`pHU(%I1V`bhfJVme2mTd4TGoXkh6{GW$C*GS5thcYH@l~aFWx@{bv-_MMOHT4r5-B`Tb1Y|Ilz*VGk}##C zqp_%lE;Nfr@0_0I!4ZoS0a=HPrO3CBl_od&CAx6!63Q*7w8#U|JYT=1h4+(W_m`O}-kdCP_wB9vB z5mDqbmB8}g!yrMcZTgUO3mP&(-SK-G+10-RbvrV`9@UJVI@inBhOeY)+1k3~j5j+t zFnNyjT;IhMCcyB{eTUCd+PUazy3^fEzD9^{3`1cN~CTnm_gSk2FH`4^%_O zO@lw6jE}}6JEzwjV0!^A=WG4AjS&7#(|?*k@N1$$S`k1U6kelOC1C)x?o+=Cnj8G{ z=|AF@>!(&H0Pno?k3!@Bq1XK@D?j~(A(Q_1=>u7Xs1ANW7NhR-cTk_M#T(O6Y5*1x z^O@48G4-+Zz=+pewq>S6a9fAJg9ej_{j}g`P>nLU&+SOyU6VTBL3&=dHiiBnyURc2 zOq~7Kcij1}`W^f%k^7hJFP(0?U$?*q&BFgzcq{yku+6OcA6;ehH@|3){ki1>cNapB zl8o;Ss%IXqg%f%-cfJpOS|V}#)Hn^qHDR1@m#5lCON}~!cIh|@pt#PDK#!tkE<^BI zR7DI>>)TO4he0SDEEL1X_NeaA_q!1D8*mYT0BT1Cf=Pxk1t@w196|*{e*gCJ9BPL$ z+-fOeTa_F_sUk@LGNI9rXW5W8B(DYw8{U;Ocgu>SUDPC$&!>2PzCL(Cr!AV|kOmZc1}T{DAkAVDBcOYC z6li&?m%+YX_)lCLiO(pC6D%B1rE`EBf)S5{i5YYOeFBQc-`)bI?hB%xA`g8By=0MC 
z_H)%hL<7={`6LA&moZxcdw=JIyrk>0OeYz8pN7FbSUW)PPk{_cKjPhFlQtZ1hPLO= ztMn9C;0A5mJazbb%>~bj3UV|u9f+Eh4xYU2;64mQ`r_aGYJSFS{>e(|8k;>pH-bb= z|6V%{NGpG@uLeTC`EeFo$NpXq`p<9mpO5A5Y`Oke=J~=OO?ToOJU3ha8?+OCBir)t zC)2?49zwi+%dCfUW-9sIH%Dot$b{{7?4l^ydA^96IyI3x8BnJ%dg{T<^+xHeaM?GP!0#X@1`=m8(IJJOqBGlLdnNg$ zRc(1$#i>?3Ue<0E{sH+f@{_k)s!px?iD>vZ=aXDo85@y;qw+2xDl40-Id4uxPocde z6-QMbiN1As3AJI?>~Dazs2Js?>~B|)jf|Pql{uCEoUMgnB=g=4^J#zN@dqeQc@v=2 zH(B;6N-6OOn~0{tR$qtJQ_$nRH|3_e2Mp|5$0Gdz*l^}qY%-m1ycwXcHeN!Wpd~2; ztxT~DtQH}P(JD9GMej}w2c9nS17Xu8=6V4P(fZRh;qRa^*dS`-o&5%Y2bx91_5xxe?3$Ge+^bM1@iSj6A-=5R0>Ei{tVSOsQtxH z^H-8~&wq0&zq%X@>xnA>j2*ad7;(#)^T3QMVAQwyt^E%MXKY}CddtvmATY1%rS*RY z6~J?`L4cRKbd;qO5wkptUy;+MXb=E@ol8=U7B`y~M^0Ivym;k?NXTrqE8JW2;iCSv z(Lj1Losq=StUuP?JOv>^6BW?~6Fm_qAkJSGMu-XC0S^17;nX7X;$nW0nl-p_EtcEaW(f z!wlwtcWmiFFvo&Q+#>)Gl6rgSMxETPin{u1T!Gq5NYe`C z?YFLN=L+pAvv{%xzJvV5EmwR6A?>+T1ENtA^k6><4<$XZ++REF3~#kmnINxRZ95rO zdX?{b=?%WOvO!;19?XJvc$(pi8gCG=Q34Uy)_cgi=F+g0#^9#s&{15?ppct+R)G-Ac7Rx*l*Ys>dvZamsk|0S-Jz>J7CHAH8mIn#6NG_?_Z%;5Ery&1L_w59eBa zV9RsFS(9?gr&*Nnr5Q7H@D%$E}A#JB4Rc~4qgGAaKodwOWgxt2M36k0V0G|3< z-;Pa0+3-?riI%%JDYk{ciHHrpg4frUlJ4W4kNcD*X~vp#I5k!XI(`23Eb3l;dO@-7 z`}{<$;z#uYz#MO$od$;@VBbN{L%p}?o%C0L=IJw|K96s6K#_U-iWeHSj9I|#m?-$3 z1OGavUHQM5#`R}E?0;zde{oE^?TIBwi8aro-`FSk+ABAeeQCPPyTjIEn?}u#7>3zF zo>YrQ3#XATHJ0DZm9)aZOSKwAAiiKzDKqjGEoI~KCwCIOd0Ck-Zd9pe zRchZ}=Qau_8t9L$I0K9~O52=yrHI6l#7v9s58h1yUjns7!G$0{T@6GDgvuq+IG!o1 z7(m=2goX>bU0j#~SKn#yP!oO)=Yxr zb3GtGIiK$!NCJFKX+1xFJ$`hBkF<7U=(q)pY%}E4RBq)2_(5I|5%6KLq4Jj~wP45F zjYEzM(%C5@m+l#%$T<;~Gcz&XF(=w&RZ~Lzp_%FR=26Ho{tT5;9LvV3vGc4>n*xc~ z=0rJ8F~z7v@n_@qqa2(}1>p!E!1j2zdY;Z1f!sg$vhg!RyU>Tn{Ggrsh;69Ln9giM zmD{@|T$ydSsp8Fd8W|MdUj2@95DCqP>?{^lOmj7f z3mmf$_pUqVI<|l_b1>csjVfh3@zSNNN3kQ1A-HwYbtc9C^Jq6v$7T9J(en8s+zcpnAZKVV1YBM~S|_aaY;ST`i9|axnXD5H0Nu+yeVWa+zi36yuPG z%eWlgGL*m5j|6FYD6H!vS3 zW^RY`pTIUj_6pTpt!s;9THPloG$0{Jhi*;)jn{otHaz+E0F@6a3A0HacCLa8kI}6;VzC~Ff$!SKOU84 z{Y;M--DlJ=&l;dgT8M>TYt1x1FC!FRYejqU9Cfw)4c8r3usvw|KH#11U>Jg029He; 
zK4iHZuLbm3e)Ws?BvUh4xaazwQK%oYj zEU~zFiV^UwtT88Pg+n3qUSD(+RgNd9R!6BNT6!f(QY8?cRb}$o@c1JJGdE@=OBc?r zlMkD}G>4w}Mpvh^pvJU;DSAT@o3_-@jXan9(GLJnbAL|0pmBy>E=K$(VcRboxNsz7 z1bWN?xxcK|mLweQ#P;b+{^bVIo580Vt0A@GnNv;GT0pk&E8)go4EQ_f%dsi-@1S?6 zY=Bs24LFM-wuwNA>88-^^sd;B3q3*CGKP^dEs}z3@G}4s?g3%%U!@%X5t;r+)qJd% zrlk6)XNt>93Kn**c5bn44qU2PsEq2{TgKgiYMLR6(b!WUK<1qZq27WJE7kUY2VKem zvTULq3xJ3{FXZ=c`7(!j=-EERO>2I|iX8hksJZbcEg5*xfa`gIpEJy#x7Ytj{Ed5W z&5wphIeiv7z`Op-o}s^~2K-YsGy{_ng~@JiKkbH8mK!EK$TNMVo1Z)i?MAvr+#^|A_Xx}yoKszTopLr_1-H7c0JmB3I5O*>Gii^ z#2e_)(Q8W?6jg#&nCEL-U1BChgi+KdSjC#%XO5I>MxH0J=e?z%$c~hnT#?vt>C&)5 z9sxJKHHG$UejTcHTs6 zThreIB=M6U3U{wl%9{5GJ*qoC(EX^PeM%&zc5^P`F|rBkCsLkI8YN6+ResJR?Kv+C z0?`*wECAG%#UrsIyWRPl=iYcFeSMX4@qW#al+!(4=C3brXzHy8_>$PPhGekIM>-e9;z0b-&O32}*lP(NP=%hp`VG^eDhDO(>KTZwk>X4W z^=Rklr*7;QZG+AW^W~!&^>Mc9O5KMjQ%eT14DUbAd#%0pWW9U9ecAPv8U29Hj6|zJ zDusm*8gbfh0O#5p*{|{i&I7F5_1rAm@0tt7w-Gp_)B#{u~u3S~avL7$-C2 z)2OfaqFrIq4pt0nu;31;plINEc5$T5EDhJeoSlYg8gg^x(RCJ%Gz2@v74JQwE`SHx z&!u6*LBYlSPN@t5qJpQ{-L$C=@5;+U-ZO_x@y-MzmtAvj3C{VSHvNlw{GMO*M{oVA zqW-6HoZ#oUAFEd>uph@5f2nB2c|iE6nj~Dy1Z=|n0??l4%UR@6Oc5S3(kpYaeIK*a zpN*y+zi6~?sg&&n%=?>V4@q@)HEl+ka*?WFg%2vCUMF1^7Znz$sM;go^d_62ZQIVF z>E%zEbFs%mPdFq9gK0JO!uGzOCVeemdEtdL-wSkQyyJOsEyloSMyPiEG1@er#MzPr zZ^lg=1e|;}Yd$_}03foZ<5ES{1Dv)lZr`*deI6bNI3BgooUZwXj?h>Aar0-?s1DvdCR%M_VBgfFcWDoMF1&Do2u+k;}aTgjJt zA?!E4E#q@z&xIzRS0;5AXX^})2`JJg+TnGa;;LIE(3g}8r1$y}NTE^ceMK~oaIX1F zC20qiglf9yxpdEMR>U3(yl(@{BW(9Qj59>{&FIO?fC7g!{&>^ zw~O6va`t{zbqe;Jl`bv6n%Aby*W5?dz#?pD@kmjGbWNbDbOe%bU{(I$r!x}=$lZ-6EpEO8hN)GrES!cH)>n<>lgrfC2)LCC*#P$d=%Anv*_qyNkC2D+!Q1Ve zM@~=qXdkl^c`(~%5`Mi#Pi2$vwh$1*(HHb~CwM)P2ZzB}QFxAiH4{}6yXq*F0xvAv z$iNaSLB_v@)wRv-!cjq27@wM02IZhOEgk{G?^a2ds>50p9~xU+#V0+`C>{90>R*Qk^659)(f#v~@9SdN!UpJnA{{MfnS$^^1{+~){wx~cP7cCowm!cGt z46X*XtE^^W7y137%Zpe?s_(~_H8%^MZPM#QUTiIX7CZQw23!!2`uKI1<%8crqU@+$ zzOr80IbBOnm7CELtrG_65qhS{fO7-70lOng2l{4>=5U~8_S?snJsJQxuoUx6t|s17 z&6dvFK4U~Xi~{o9nIdVR)AQTMSet?#gI3d%p%35Mn|{FL=yiUTH1cw7Y1I~a$abgXp6MR(|oXTrLZ 
z^K3j(CHYOuY?4bb!Ya*sR@YCfxwTKfhQi(Bt*h!dyd48M=1s^8dr$ zdq*|drt70YL<9t+cY=bV(nNX<7Mh5NNRbv5kR~9~YXAk577+!d1u243A~iG{b_V1j1_HXvuf3V0}#w70(-sidR>%Pi$saK0(cO864 z-5Mbg4OhxbGJ^5qs_&PRfObU87()95-0I(T?UBWrO}TNuA3DD==KfE@NB?wB{L{Zb z2-bXh=W>1zo1N!cs2dg9T{nEHspGm8+;lH))~^?~nuB_0E+ z%#K~EV)&^DE#5eSu?=bS00B@cUC5QVF6Ej5l1++tk3wgvkobf*wvGisKD)8HTq&(= zuUtDc@6hHisD!UMZ>TEIcH|8G@&~DTE7(k~{js6bn^khL2<}FGIet5p7C!b7%|{n| z=(HfEYA_EIzU8ZkF-Rq}UE^I?PFRCUQrjVej?iI#{*5FJpf)ecMcY8CEZ68i+*yKE zo;ql}uB{J5lsT|5Niw`LI?vw`2-qW+!`x-uAlq_J!jF9Ck7*tXHIX3C(Is=+ztMuV z{0`4pZvqnSFf;KrFq5K$uh-sPnEzt%F_C4a6X(`YvP2wOYEedpl9}6sIysL{4yT3 zid>}I%65++2CRJ}uo$CU-jPdj!>KA3XBnDb6MKc|*6fbNQ~0Sz3AE{o&=UloSBoRT znko`LnLo@OGI!Vd>uunVrw$Pg5*&;O`mu`N%Xq@O^2ggFY$RBVR8PN#pU9vw8xvT3 zO4H)Ys$(ukDSkS+KC|5uZnb|5{`Hmza04ABn9mlwOdTP=LcSFXO4a75AKwjdmd>kz zsPFqdSi@63>S3-NDllhh;ujr^39U!epfMlhjfF@7%fj zIc-b`AfZMe_!SZ$NQiBvuTz8_)m0U~1G>MEUH4fmTeVOhh@^KLL}mGqN{Hqi_y*c3 zL7p=#3hIQrNcp_^?|rL;3cq;kp4C9B49qjbW&G9!#ne+k)OoD zM(?M1*zaj5hNrib+@}QtpAugUX_^OL?0_v~yq(M3*>_0aPX}szbT`mOGha%sRrCLd1$JoWMMF(e+pw>$=z9Pz z9dT`2Dlolx8)4W7RfV-=ACy2AE6I|;*9fo@XMsC2T8m-@>vJZU&^W*S1htPK4iGk< zr^lOi<@f^GTf-scNxf}Ywk#x8k9O~J(YVBN=`ub2TXr!ApIS%u5L$iGM6LCgtQ%cp(o>p5)l}h<;ndpiDECbz) z-8wWTA^5ZiHBI3=`HS)l!XmP$UCeZ3sY*yMa6S1Q@V$x-sykQk#GzisPZ0_mzoEHOMj$$>3Fv8rA9EP^USjKl+L}^143lZtBHiiTbYM$g`2li`N$}ou^j0R(WDK`9LUDbvWPuDewDoE)g$bG-%?go^q^p zDR&x_SLrr;0z5veT4*Nb1I{Y+)vOA72d}9@LB(AR=wvTktz*Lryy(rA6Wyt#4j-s` zAa;F(Qmhz zuvXe3PgBDhUxc8xOsGL0%pDF`!$VrvE?WQ&Rf7^A2WE*P)m7G5emv{L%Od6c&!lRg zuXn8X3tQ4t3rhmMOwhx=7KHU;&5P^4h4o2uShUG!C-{8$1X6gU7c5XM<;6zBg6Vhc zZaRJ(@bt}sniw_2jPYRwc!pR+=}uf%n5KWvvt12Dj=go~`Ib+#v$?9GugE4M92_Ie zv0YMZ^Mcn7VqvR&*3B50be#BT`E8nDDz~Qy18DanJ#T)33WePE_0XOGgX0;jE05es zL;C`~+PK9aEFOI83XM0+44~|N<^nnxB#>OO&0+cBugiL?%0|?nOdF}P(}#sK*T7c& z1=C^2;N$v(g8N%vPXCAXzF(7#K?BHf?|wn~UC}I0t2GwR7d8nGe`B0|2xCfdXi;KVCGof9|cvInEpwELw_n^GlGxVqKZ17x~O?6T@aZDxu6zW{&P4D|gy6eP$xeXz>I;a|HwfR5U*$rle>KI%FpE7l~bpO8*jTi-dX@(2)d?D 
znqcBNhTwucuyYfDRnm@;xmq7(zV4}EOGuhTyFtvv`}I}qqG(K7557B+tmxvMnjI9(j-OaAmB zmCg>@Cf`=tB@w>PAjTQyv!Mev={k3*F5s4EiT5o}!0^W^?^92qvu9Dkn}RHEAVncF z`_)z$VB=kkvrWwvaWEyKp4L&YhZihn87Nix>TbIz0cEUE9iyrzR!3ZXvf%@6hXd=8}c6tKS?$-zU5Kkxl`;?lj+Rplk`w3aTwSM zBh-a6WW!|xZ(e?mSqzS|P0|Z)uXUIvtXL>wMjK`<;@V4S+=150Vo~1;#WOkCEf~LB zx)z3VkOT71$)rVUD!Cm${!Zrf#Bp|e{~L_YreV$3AkOXQ6rCoOQ}^yWl7A z*{gC55?O)*Zi|=XL+h&GO9XJ+2KTc#z*NgI&_Zv%PWba8Ti{Z5F1oP%`}Ug5v&w-r z5B8GdA42>^cY-6djp3vp#OviJ>=;ST^`cx0 z@2lSyTY^-S?!0|4Yi!d#d?Rwp;gpqBFhOlyYDN7F{k%Alh14`G0iYFRV*vNkdG`Fd z5|qPGBhOdhe3TY>Z9*{JLz2`rq&ei_gAtuM6UgPd?#=$@0wx~7iD1x8a!fqmmnJ|Gzaw@U3!m#)yXiQ*qy zfntVSwycAfc&;NZ_TrJ$G7wSvgB8HI3n*$*KHO{ z-ppCnp%#Cv7+CWi4b3oe@wmG(-^#DM$ary;ao*w|1Y7-A2Lil;%@q8xv8(y!2$j2m zW*p^3;rE3d$i$%wfCvY8AZ7ymCF0btTjhTelRxxp=)rq;=;jd&uw4g1wz(|bQmX+F z#vLHe?WO>jWGC8XxwP(6+!|x&y2$iK_GyRGUW2TrXZ$ z|EF{_|GMSXY1q-l0By+{q6Ahq0WDUIayL-YM_u;emNs-5zUIknDCwd)H(bqaR~lix z0JN5|;k^B!XM=F@a%`^t*}{#tUo05L^e9%paXNfeR80BfLxTrNKHq#5%^Gazj7<6z z4 zHm=X}j$ONIA|KRqT7H7V4M+b;eBA%?JPN~ zkVFN`fnlj{LT!RRK>_vV*rKzFa)XbKSibGNM<)obTu&oHT4}6x{n}@j)h!x|T}M1T z97>noChtfonQty+UpSv4{QOIufWQXSK1l!NcsO?I)bOdzZP-}v?5pwfr%yM0jQS>& zdF;(8p^NyUC2X!27K%Th!75=+Ylf;0S!V6LsrV=m+c2+Y(DRhALu10H z>p}L@h#_EX*JVryA=M8n3t_N~t?G0W4Qcv>ms}C@J;#=PdcxG%vd*%v#uF>eYm;{{ z*w%KRPH`#jeqSj&%>+MC-g6CKdbCgaJ~j?SrrbD)kf`sQAg7R`$ls8!VL|g?&XOA_ zE#}vm+nsvU2D^UPOgiGIEbs8j?FM4nGuzHIJIl7+D~7e}YVs9e(+>6hE$GdaKh8!4 z?O}UqOuZEKu?3n;^`|Kx1Zp{81Pb49_6*w}ffj~$LJn;g|jIi>@)X5elDqKvnmC5-QSHR$49_oKpaSsvT) zOJ#lkuOtos=LC%ZOg{R*Jr@ilb^ODbZt*0?^AVW_nRz|E>40H{tpj)b@B9A0FV6dufbTDQ z|GzTsdM!BkUz#-}_=kNm^fqMDs~|LkatH{Tq({&?JbVBZAeZHv*D!|QEJbbc#pN@( zXx|OZZc-fiTOpN|Y>5nv@h~!~8g?kJ4HNA(%hZ!A>p!`~EIVWSg7Hh^mSs2_)s2Aa zkYgLSUCb8m^UJy^r7AhNrdZsTIo5S@!Tm$|*)2x;fL)Vw^tCYCme+D>t%L<$H{ z0Qg{sgj6~}Q!jo9^dbeNQ}l=Ad^392615nH5swH))dkIVix-Dm6yTNzSp<@C#PNlvKhR2Q@6YL_jvuHQsM*I$x8`gYY6I|@xZ|LpE zbC7umm$Z<|@0qPAIT)>jri8h>DdN_|&uqT{AWVBbpOu99M8(w{LKyfque2{ zV5ff!n=$?*w)YLZThLS7N3_Z3Rw$6H&;pF6wyOq1Es;w!(TZehe#VVjtH+pKOUWPF 
zEoeJ3WEM`gnXbNt1qAkCf)s{{;MJ|~q^8W32w|c?{$1gwEGxu4Gv#xnN z>%|?8#T+j?N%m%MyRCVuaP;<)asal9_r_A633Io^a^+dJO@;Fl>TGO*ZDDpkk5Cm2 z;{++JtIrDb2zjLybG+)xu%<%k=nB?G{-R%DPo;jX-_2VD{f9w3U)^C<`oITfT~fBr zE+o>oruy?WW!OAZI#bg_&wWbU>-CXiZ0dLREz_yT2;jsjXi~Nc@qXcq1|*gyG_u`b zo?=`*e4_ET+-v}=s#S!!jLcHs6;D*vsPfxGX z;ea!zoQ|q2xJ8aVR7e>dtiOd1Qpv3uUbfWCdZAt0zhk83Wdg|1%5doN+KUn@%Kc2@ z$tEXW#U2KmTDULBjUI7@;4&2{l6U%C>ucUS9XTS;X@8a5K@-FVS_L8V+_UqwGAA?y zb+>fSBQr2FxzJ4)nmkY)W+?!7jq=cT;vY=*KIBoY_TQS|oQKrG)JUM%*xv=Ryt+n=b<${;wkD|M6L0{{p4G5bat34tdaq*jftGn1S+Y-rG6$-(Psts``I&H$#oG|asCV&R47MwL z!I;=a!VJcYv zfCH8YBkKYb3{gNZ+G_wz5afuz?a}~W7(awNut05QWeK z=gw3c5tBi_I&@EZ$_ygi(d>6!=p`E zYYVApjoQ77E4F-8mF#4ojfdi)G!En6+_V8OX{#Ty7*7S-vzD_M@{V=h(}bW~zNdOJ z0&1tp4ls^Z+*l}+D-qFV0b|aW0aL~M#%vDCWir&NZTeN7?sX{Rg*`fa7Qq0S;r=Aw ztV*+%1}-%taEh-_aY_1H@kK@~^Qk(9TQN=(HVs@a3?(H-=z;BRAs^PqM+QoTCuT6> zG6dJS4c|Wr1Oo&sOWkqGutkjAR zd}2c9CrCQNVP0bXQ7|!y{qNR(5Y`&B1m4!A0Zk~5dzu>jPO)6RIE7oU1lhmVXaAPCJ&Kn~Fa$H|viEsXE0 zmdVO$k4YTj3>}e6e4FvkdnA1mPWI9gr#;1VV5ld7xe*)~p8n0Mr0E|Yux*ISQX1Sr z03r4`vdD}DVNkFny+!$L0F3lT?09Cc!c|L``;>le$=29r^=mg7q(jGU=b1)4Quy3Fp_mu*;e&4N&S`V8*?#9q!N1Y!C098VmEN> zFY!G8TrK^-h8MrcLH`xW`u{q;BG0!_12P;p{RGW2qyYpjU_!R9jh*Is*}8fm>XPp@TVO7-v`QWF5p41O_%x2-B> zjP3)Y$q<^5Q5$006TuB5@b&^HH89kR=_g7a2o<)2sSE(FH5UhWH`z;@(!TAl{WceE z|M&k6!BsUH0xl3sME##2I$)H&ckYA_jr?;RWHv_#8}oJs*WYloGd zT447)N-s7Ls%TahA)ggwDzmLJUrL8K{Kn|}oA*b;D0kh13liI3ZLE$FFRb^>D#y78 zm6T*j=sM3L+6RGV=;u;as)~!vY&C!Q-TT5=scs2q`>mpW*$@AWKUV-G zkmS^9+Nbfo;p$&X$ahEn*Cp^b&f}kdN+v^SnwGTs!mZMCf3bLS_g1+2o3FyJ&-_1l zOg|6($M91qFc0{L(dao$F2Ce$7Y6-BAbNd^29d^0dCgn?u|2v&Fx5QBVIe^vd>(FY=GX0)Mu@FdeC$gtY9-(2yS{^&Q?> zHSMaXTv&Kunj`(ydlclpPz3WhIOnc_lxsD4$`#1;>^!+ z`L6b$=LPEN?LuF!Y5+VX^Quo*r>zHMlvq?QVC`B`T2`Z3#LqZsSRGOokD@D9dIOkl z+@SmOPmtv^^m#Wy_BEjyOe|m%TCs#(hX=hk4xVpZ%;u#t*U@+wGI#%z#RpUG>iuVm z8+U$p!Duf8@W9L3r{Z0P>B45hWY-rZ54ooPFla>;A2zq$XWl>g_w?^FDh5y?G6VbH z?-m&UD;L-Or-^od7HR$}-2A^-F8)i+^Z$bd#vEEERO|69rd5uMto~!Z1UJ0BSCaTe 
z7SG`)rmZnEgGWm($g$AAiz*+y_Vo~NfeD84c?10~4Z%6ujGt!v3CbVs;>3w7v>jPQ zn|AfUMn&A0pwG+#E2f+FtFjp%U#}zH>|jk$ue>nc;3@Q&XF(&tp=Q-NVWgL)4!Bi8 zW+ht>pv?|5r$0H(irnuv);2_ztx5EkevPmC;xuI>cFbT;;gvA+eBOqBs-f!ir}Qnw z_j&ab*n>it3qWxy(H3>J7bV(f;L7pt&48TMxGTxv!S)XP>kHrM%~Ypz$FnbVuh{$K zhuZhk1VU}eK2A83#9b(Nl^mB_>y@GHEazLO#@eXcX~%6_x&&W|-dACZjgYT($R~EL z_Rd0%Q6w(A`27f3!73wo9$G^nu zKnfqylsS58y}Iv%E7af_gSw*U#TQXMUBxXo1{d~XYtD&KLI>8KG6B?UJ+Na5=HcSO<^*6_o z?pXi}cqS_T;y`*^O{vy_lQC=P#Ivs<*D|gf-V2h<5w>7nL~9;ohO3h|+SFAZ%QeDg zWrP?4%vo*YHgvorb!%RQqp5uMjA5+rR0{Mgxz2o!rGGP3HZ zA5(7jjqj^3Bs1=wdxGAPN(Ir4q}y*OdU+aagi;Um`*zjXpF&w^QdAMLLl9Bu8o3oo zhPPuG@|Nq%UX>ip@7uVmE9cGq@Gz_3hmCUqTmZM6NvsrsWyC|`l)TuepkbPb+w`bH z;DBxQQ~iK9Vp5t$PuF=K+L>lNKiXwoN@VE-3!*Pjtf;34HKk_l?hRE<+Sm?y z0Zi$NoamS_EU%^Wb9G*Gp^J3zhYFQMhnet72M>zMVz|J-!Z0wAWgw@Rq}-@Cel~A& z>A2=9lQ1V!Q`f>LTaQ4Bjr~T=@aER&8~ffg2gc4}ZS8|hniy4qNKZ8xY4moQg&o(; zaksXwI*~M%?lR|ns>-zE@QKixmiebMjtL{iM4z^71#$-EvK19fKs;M-6e7WnlLy-UKT9 zcriG9I4=Ms8WI7~%TYjgfUknJc2H#iz%K>QPUBbu06+J|gS7+vUf(B7BTRh@QdK}L zG#MK4SzspzPDf)FUxeNTa6l~#c@0b;Vpw2(5@di&^CSdD#O^~vV1YDWFRu-74}9$` z>`a(lfc8Uxl8{KYp*@rA0FXg|NuLCu7VJU<)B&eS0)(s&ke5V|dAq1x_<|mZMx-v% z`qMUJ|Fz$6#EgI%$^f zB`Jf!%tHf3v?{vrxCNy439{w&#O#%}dfke%SZkZbmq4JSUkS?wmJ1h?+8>X38qQa9 zKCz$4FObir<54*5DrlQo>T}@%63u8`dplf~d`?+hOu4~hfJQ0vr;zp!va5cWr_Eww z=VUU&vhTn4UN*fBDFuZb@NAh+Q@I030aYFt!pGqSX>K7IQQDlmc8n@NxD_MfuD(lS zzD>9={Y^O?`P9!`_p>|Xm>a)C!kQ4yKD-tX{_H66Y(52;Oz^-%OUb?)^weuZZ(i0> zz|p5PEEBlu(jI4z(aeP*gd5;B1JiBz*cSf&+u9TaSx?*?%f@v0`XqTt2YlNUDuG-p zC$itrS9e}T5ta2^HEUzua$DT3qnByW&|~xYyt{D~Qw{B;bmjAcE9`5CVqm3(_y=pR zQx*jkva_C1&m00S@d2{XU_Y7Yl(iMq?I#c; zFZ_Vt^C5Spu(KpTGRJ{9y%rxPEagtX;A&o0+=;=AEWEx_+cUhmpZ!=W8Q})n?}*#G zu6IzaBo!p6s?Fjm+sO>~dEanazL(>?+QL?W*HPLSU(XK{QL4-U_Zemx-;BCl(MQ(_ z^5Y{^9qr;=v^9#XPcer|8-)>9TJ~W=StX%@70*W(Rj$^FQp0}3cYF`t%5KU*T*vwx zB`~9!C)}BY=MKwAIiPHb>7f18xIJ%(lU%%t;mKui)lX2O#57P5yj`lXn30u-*FFs1 zJbV8jPHCkZuDk+_<&p9%Rfh*lbLlgI@i4vCBK?v(VCD({M?gfVjWpl;L+Bf@QK{=T 
zpET`59mv(8bSF^c3dafmojp(|&Jq9_qWr1KgF!=$xoHgkzn?(X1vigi^W65^?4dp1UcVErS zgOJx+cf(S_=FN4cUSdl6qT`9@0dZ%u zDa1MTxvs-(sgXri=7-BpIa8IjpchSwmlN2L)ts>AtKt0JLWYHb&a=GLqS}Vino=6l zi3dY6s7zvy)birpq$#B&2`jvPTm%Th90m$401hNjP2(1%Qu{CNt+o9bjYNS zQSDLhmt3pHOW}?K^W^)`xJr$(O*zH6YC7-b1!Q?MvbZRR{S{CNy7eaP=K|ya49SXU zC3pJsH&UIG$F#tmIS>AgfvJ2z6-1<5vu(^aS?%Mg@N5Y~c323AK3R3>>_@-LjnF=O zP+Se;UYTb9je5KxxX3%jPt?S4`eJHaEdrv}&lqIu8dpT99MMS}NE}^*T3PmjQ47=% zD$oN}qyRwLq6hTyNaSm(x5h^bfXhgAsYk`x3Oftm1P}wB9snUbPSyGlxL`r>E6Yy_ zOuJeWw@zvjkBK=eUmnjW0~)*Ukzsd&W{LsapVt-ixy`fiJNLGRK_7i zQ{Au=3(>!lfBy9N7nJSSV2=N4yV-v*s70)!`q9P*k+8W)02p>q2I}2HrsA((AyJ>M zr-M1R{en!|*|9dw(=4Hv`C>iO%tsoHBu>sP(!c|A=-RlG4<(+9wcR02=!IAoTXtou zqKF%-3ltr4?`(L+%s3oOmTI>kJ-%Ugs@u)_ZqzCGTs#@CVRSqXBCp1eWi4NlmYApG>|itD}m~6x@*d8qP%x zB!>`vTJ}-#no>oV75!baEYwbHdalJ4pTGTb>$@_Wf$9TMHcg{upuNo?c3({Ctaol- zW5l&5l9qhIOvcX6zDVX5;vrp*Z8=|#%ogc=Zv}CfuiEC*z5rw(Ty(EvJ;DwEZEa~I zf;9yc)j+QHbhczCA3R3DO@2|Cmy3_WZ8_zmD_-~6;rJOV(8Uw6TJ8FzND6V@l-o3;4{f#h#I2GuYA>C>Q>xXu80I`lnfnrTy4r+2cQ$|{Zs zZO1MoHF^^|XT_@PiC#~fB1bGMo?aGx?|gnz_=kLpGP9!`So{3ZlNoO%GV1Fm{rIZ% z8%&`>p-e={P9)Z0<`ews8tvrL6k{r1ZIni_ohVCPwb~1d6Ynf_o}Wy7U(FNGAaO7& zRYHKQ*}b=+b?i>_gqjsz4ww}Y`Zln>CPSB`kug}y`&qT;`=+{QhfbF`J?PfQ&G-t} zhEbj?J6c<_k_lWm0N2CXBpZ3QqH{_WdEU`B9s*?%$>d*orOEX3U32~n04n(#=xTWoo+iGQAn&PsNpej@} zUiw~=Z(GBlJYCWwoykngJ`4yp7agh&EM9=gPf)+h()G-{EQJ9N6EX|jMdts2KZDG7 z|FJHc#7^j~gj%0Bv#$#lg{IGFT zZ}Z|($GOwwJ<8QdX{0MfX}l!|$4lwp{n8ov;*$6=zYXl~(tT*ktjZd+NIGU<1~7Zy z{wmAk6>2U~7@$dgo{*I3z4KwE6e!LT`Jt>-$$=x2UkB``dQ&?1SWqH+KYo7!Yq-fl zg`d#F7b}pXifQ-faRb#%w~vI}zsIJJ3QnM^9h4#S{WI{45!>xPC?|M~{f+ntm9xp{ zU9-o@2v5L;AYE$S35d76F==4jxu)dqm=l`Y+bkI3sZANAy|TkNPABp8G6Mah+Xh#a z_v2R;gq^H93HXMT&u5W~8w0?%km&6G!MaOFK-3r)pg38 zyU4`{%?qkG5CKW5s-!jZYB*VPCx5sMTiA(6On-n}I`Q*)qEFbB$Ge^E42y2L#G&MnZT=+@1W$oGNz%qZPmBiYWPUuw|F!|K(>W*8v zOlS0t-4(HBipf)C$4`(?h10@PoKD!;0K6C563w-Kq^kQaM_abvoO>lo%~MKn1#j8f zeO9vpE*f~BNj|G$g?HI|C29xn4BSsKnPU)=NodEl~I zu)FMIyC%*qXil%O-xnYR;o*a{)jWdPbrzD>6Z<1iK0KK~i&RbMiYRbM 
zmdbTez!8lKu5m_ut?XZPinMf-VtWZQa#75|rTG>sA7p$w_BuL5_xV zhguGbWVlJ&b0@3#jL3e0C_v35T*1t7nDqp+BKrBNgF3klP9Fz~cBp;5Jp5%%yq92V zCuqI)v%tn3}ljgT0!%esOg$B?(87AUVVHMxj zt>%QL1+1G6;WZJbJ_dX54+YUK(1!WCGr(gjt@Yu1+Gb&RRj17n)xD_PluqwG|KE$l z6h~s=T1IF^1~eANvRxv};Z+@y@wkEU$HBO){&iLD?F@*=q~Q&L@S{um_YJM8_0&{n z<`Kzjox|u#pfJ3KBCxNeg)r*}5sh#)ufJZ{99=ZK{w0j@HAs9S(ilDiUAqgztYNA; zlJ7{Qyeuc^=0|dgQ48WSlL94C_Uu#TqEFKLv9u=|7*Jt0y|vsdmsYe^Bi#2(twI(%Si` zT!@)2XZLO|Za2pWzO`_v!RyVv#+7V!8|m5XfHrHD!bVT#DH}Y-&^WZ=Oh3&ozc7tF zjZOs?0S46GcOYXM#+y`3wDqx$3kE9ZtJ)SIAtAy$8#ct8t;OOAc#8aqN1WU`Ra;n{Fwk%k~s#I_<&MRu05H;SS&Vz2pqU|%7 zXM!JA51Q%-qynkxB}0b=brVjo-O_&$U$vmdFYIH@(f0g3M{Z8f!#wW&KMxoG+h&+@ zf9Fs7A6b?Hxkw!0XQ&1P7!NzPfop`r`YdaQUy54h*|L+53- zMBPWj&xZF^P*vc~^%C008;R^~2=4J`GMeHJ0it)j`CUVoKDTtl2Jc1MXfV->s6wAk zi48QKrG$iPG!A>+?O#P|o+!HA>=z^!C;9qo;zy0+Dh@Rva8q+_iNHK#vNy#wkiAn| z3>_uMK#px0SgcW>=$9KUEwjFo#owLyLE=i2$LNWz=XFX;k2*rLH^wk_0lzz(Wj~%V z!SY!lW_e;GftNZ7WGzb$<)(&--_K2tPR%XfI;pRIZdUg&XTGm^woaKsvwCz{3dKLv z z$-fuFg-#|+et9dh{Pwy!YJEz5&+XaEtPvXppbvWjpq*>9vw_ZSqH^kHE}{(OsiuW1 z^@$z5YeIorKkSZvVV9MD9A%Y~6xFyv6Kv>6o8qV?N9I$724!jd<1M=|EK0ELhMO6- z%*a;%-i;SIXHOVKH9hV)T9U{JC9-$G#>{5k1%fT_@e?2L?#SBuTkA4YX?ZNj$Im zu)4}j8q2KH)in{EWRf1L zKE=KN?qvCr!d=jx-G&l^ZY z7>wJG8P#rob6&fZ=PE_oOY3s>?S0WGRJshV;7bgJ<+^J#5tZv^dO6&E0%HwELX`@u z_pjW)vgNU$tCslqgjjY%(vc(j<`ktsl{T~(xf%Zxlt4Sv=cx&dP?#=_FYW8a7In@~ zjgRT@T%M4lRyuDE)i{f1665xzIn{-4n!O4se24TmsoeU$H$b8%OqJqH#t`}gJD+5C z^)WUpoysWE#^JZu3K`u&58b;P>4B(T2nXaYLV&D7QO92+igaL#+|>QhZ@F|FUufVo z#TPnwjWYs7&+n~4J^M0u3U{9QTy=HB!XAbwIlduI$HQFMc>qsFt`f3r?+j2vcXGMB z0HVNm1zl0)m3Ia7y*0p@A_=DEhM{*X`w;XG))hCR&~(-HQAX}XiY!M9VjRy9Yf(Dp zFE@{h9~N9!5mn1C%F;cv*`d6A>)t>^z(1|by!D$?gBn$OKgGB{rnA)_rE;Ou%ZoMF zP@@0iE#0F`_e?-^J;wJ!r2)Guc%nAj<92)YK^u$jvZvFJda*V>FL`%XQ2CsM`Kf@j zt#pr}03W5<2-rwBF2iw%$XUYNR;bhn{xQY*My!RglHb|gp+?k;BbgrsYcxTbV-7pjshV(xySGNdoQ20KUiG$b{t$|1n>>9(L{7La(u1(#yfSCN<#$ z!=xl=X!0moaN>?5q0Ez!e+SyjO}C@Pvsq00f)U)f@@^M!MNrH~6hQSQ5IkH`rpD}%F=&r%iQDt}`LK{UDi?lO|n5@{04xjOKZlY!_B 
z_awG{o`_D7y29`JgT_A05BqZw{ybc6sfN2qnUNJhXqXt$yzXJ!RV=g2VDKGO)43W}5cbC-d0gBygiQ9LhJg?6s*K@=o) z+GX$roVEAMqsc(c5KFE1;r(DH-}`rqLT?b9p50(!K^(W%j|k>^qcOthM4=1swKxQk z!|6rLD?C;-Iqj3(@Ol9mJXwB_NMm#@FYyQLmm1H-6+Z0A5b`0%fsFM@U&-tRPKZl{D=pSwDlsh$ zT2m&iUQDOgF+_FTtF7As2`quQ|=x(76{!3$;ci5#xgLmG`gxXQ=ESt7Ka8RmTzetz#NbuTVB&heW zzK|dOT`=u;=VF;EXpI(L+mU?vU2!J6%ns96hmqGK0R!;Tb7Xzer$)}8>Eqf0K*1!- zwYtyqwN7)6+JB6rvouT}XeR1Zny0aGn^)wBl-8Q3Idj%zr3=$`1A>VVXBhV6!rg|0 zLfP5w_2;((Z?+$~5NOx@QGO-+)s7Z$?US9W8ALOS+Yn&--|_!Eu^RZ zAmI*iyi^`@&*b=&6tJBGNfY<+336jQuWFjKU_(W{o+19o_lr5~sTf?OVtm^(KY*OT zuVgAnkkxedPMmOxExd{ZQ@NnHE0Hk;AINwc%X|H&^Je4IZZoNMgY+H-?Z@rc06rDh z*~zB>o#Zl#U28gO-OteWy-Y~)WF0=i(SZFLdy~aC1eGh6rhCYWpw&WkrBc78rlE)dLVy&vCB$RrLi?|hpD>+&&OV~&!|Sc zi4J^EJ;itj+>)D$y`^>L#8n0EXwcjK_(b5tq2fX>A2gHzE`9ac<(LqAJJ)iJ&!Zhs zsV&5ArM(%j-KUWNpfiQI=#MvOlC8aRg0kbgmc`s?~@`)3u ztnk%mi2BxYo*CF*y*j{a`K$MqD;i#hc!+lPEB-xgOy$#hTEu-B0bT&@RAWlMrPjclu+gDjmU*}%&bFXgU9YcULJNvpX z+`lkB|Md93FMw9*kNO#O#UF|-70%O`chp@%_4@`*bwih?xGkU3Mll@K$QTSA)hr}u zslIA=U}?luwz%MGxroH;r#6N6njUUTF)L9iHeCeRDU$!{g$KvO=FW@2I0)bD1+1UhNRL zDVe=(Sd-~TlC@Ma*@$>qm;8;e zp-W7QopbKrF$f*JEss_{T--5=e^gO{@V4`Fl9{Ee_2-MAU`3vG~DaDy7T4(QC^yx#_- zL5$R@2J?$VM}(x^o&Vl zILCyW0-(el-dBL;B>6qdO1X2Y=3My+OUdUs^pQ(;ipyu(W#3PHG4vbML8U;>cfo{rYwJP4S1g*9+F?dOna2H}=h4opEispXagP_unan zJXKwv{EWXn{iyxs?Qc21o$nUkRkz;!sKzCYKb`_Nc7=Y?igQc*QG){@;05-^BCT@%}J> zsJr)0oXCZ>UnW=FUYF){Ugt@H`EH@J4GfYtVe6Y}*ca4Te)v-*-}dE1$yzl_xzMfc z*3Oo9j!(Wb=UULJrIPM@_iWplWtKTl{x{=BU=>gkQT%&_>EAeu>%UijyXj|dRB!dm zYQrDt2esS}-}RKt=X*IJ)2eV@^yK_Jj(8(?edStjp?ax1YhIlVnX_cqbkWlyO_;}T zhF>y8-W(2Wq5yY?&jqa@;Hu}gTX_Mv#gY>^o^X9n{V$HySKWbg;1l$K+X~}80#__4 ztq;FGF92y{45Kq}5y}MH`+ti~?SP6De&_#svj0Q~)-ra!W2p{MYa6W1LJeAU_WJtc zT|n1{{3~6Gxc1pMPc4%gR@JP=v!iYOwEeh_;|pQbg)O22(jthB+$-)WdDNoL;t`34EO&td@MoZmwr7f zdj4ya{i)2i>34ogm%p6|Jgri{rT#lmJ@9Dm#pnX-*JUhzwWWTF?V8{7i7;EFHmv_s z`_}ozPjc^XbN|b*PY~#g-@t4A^m@_w`*tqsUv=?cz_-osYI?zw84JV@*#ni<1J5JB zh%Rvbx=A3&)pFNr&rjT5cT@fg(+}VrdHwtU3_o_6qw#+|>#|$z{?qx^`JJE4-rrXK 
z*AUO~pW*#qkPpQ}@_>id^~G-y+qy8SChOWB^@Ts!J@>Z<%|E=4zprd^_$;-%LjF2y z%8p1dg;qUY=8*bZu4~1^m9C#PUa1H9Ms5^!TEURO#~`vLFjftDOCl?8`9Fv6^>1AA9Mv;9Ca$8>I zOkA^ZlT|x#i+17!$2S?wPv-y7n74b`kLllhV*Nd?=$6KsJw763?Od5`be5+f-i-)@((&z0a*G%Wsm~}?aqs?h|toxl4dCl5ip1-;M z+v-QN`$yONN6&Xp|M>d8*w^YprlXu^=eNH(uXO5aaby3Z`bU;ep3ZErn~2n$BK2M^ zCZy&sN$jDx=i20FKtwdtEB+~a|F-(ua29jBD!p@)P`lBKE;0j8RjScXQ-9`xPX`6(th9mxpGDIwe?oO1**uBi~k9pwfH zC;xu^E2%vB_v@e6_DC}QKio~;|NJKZ{UY$d9EJ;@C;xu^^CH-pYk&m?lAJw~E0{+e zhzQ%!5Tr#KyjHeX$#l}gDgQoJ|Goqqkwau7U_#}%Hy3zm@8Qq(j~#l6%mIzFAejW5 z4@!ZpMviA-GyeUuf9(E8c~<$K-~7K{0H>xHGQgSS1vn>Mt4EfSN6N5_MCN6-c5p84 z2ZvEQhDQ*NaF7~xAlA}wGz96F28HgPn4h!R`ucnG`mF8nTEOiKsD_vctRRppfzjG* zw7P}1N=DlQ1X@(rVtS;HOnV~We)IP7Yrs8T$nCV1YfGxVC2LM3a{M?}lp$CK9Q!pU rdkm7o6Gw>YK8*l1j6c=_hhTuiFq4-7cb|7o+y5)J7I>Zx`~RB&kn;59 literal 0 HcmV?d00001 diff --git a/distributed/kv_transfer/kv_connector/__init__.py b/distributed/kv_transfer/kv_connector/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/distributed/kv_transfer/kv_connector/__pycache__/__init__.cpython-312.pyc b/distributed/kv_transfer/kv_connector/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..067f78f9a30ba181c03db0e408243090741f6cf3 GIT binary patch literal 186 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJVMe3L27U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?WHa^HWN5QtgUZfi^J$aWRPTk(rT^v4|PS F0sur!F+%_V literal 0 HcmV?d00001 diff --git a/distributed/kv_transfer/kv_connector/__pycache__/base.cpython-312.pyc b/distributed/kv_transfer/kv_connector/__pycache__/base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72594a5c24ab9876c181a000ac4c52a096cd9929 GIT binary patch literal 433 zcmZ9Jze)o^5XNWkE{Q}#z<`y=fhi1_TY^dy#6s-?R+{3P<#M}8j_mzmZxcK#t$hHW z!Dq2aDOUx=!cIwN0T|;OZAI%TX}UY%WG@Ft(tt(DCQ+XXzUY`A5J(A9U7!k1ii6j)EP~P z6ke3}nb3Yob|x)hDzvb)6qM*G9_NI-!d$iRdxvG1;3PcDOum<84AYK7#zXW2v3n$6)jq@<3W6mzbmZGSYL%Ofz+e8Tc6ak%lLT9#Jf2_Xm 
V9894$+uWV*9Zk!}PbHx*^#^CrgQfrg literal 0 HcmV?d00001 diff --git a/distributed/kv_transfer/kv_connector/__pycache__/factory.cpython-312.pyc b/distributed/kv_transfer/kv_connector/__pycache__/factory.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..19fe0f95a4ac4fb43c54b9ebf579ba565d9cd2df GIT binary patch literal 7051 zcmbVRTW=f36`sA2;++&p(V|3JD@C$J*P>!OactR%Vk?R5IFc`N+=gw!tT-!)%JP!k zr5&+UYZwVCqYrT(3M)v0xu?q!t(5Jo>w3Rk(`qDGAT#}aA zIPO54ot-&*=625aof-Yn>vbVWGXK-;?tX;+N?xpFYZSKcG6>y57-2?6De`1xCdZ~& zhLky(%ke2bXG_^~_LQC0@p4zrk#gjmDJOlm$*!C`<<5Ch9{O&Vy*XdXN6THZKNm;^ z7$osZ5cE0Z?p!bxq-{<)lnbZAwCs|5a*QLDAIoVoro! z-PwFrOUs3Y1xX>V&E~hR!mI1DEa#x3`RoF`1O0ln^EJ|?_FWDsf0+PBUM-@U1Bo}5S8|L5#(%6RW z^A6ngE`JBwNJ_aSmlMHOJFpYGBoB5=UhE-yuuXD+toL18Q`QICd_)WM=%;=0*x#lr z0PO+X4ei~QgcNLyipRk=$xu_Wr76_b9$vO5dW>M(hP)(_eOfl0X+>Jdsu~peyqJ>= zcdmd-vXm~0+M?mg$fByIX@gynw33pa$?$ZhW=4>Kc+vJMWIthl!rej@rh=Bt)(7${ zNbze8kAvl4D$Iv~A0Km;|6XS>r!{qJcI#tv6IQuZcAmxjyPg%k!gubWLbqtymavy6 zFVa~_iXaLZ%PNJdS#1$2WU;(FCKp7U%`XV%bOkYw1v*)QOg8zo8?48)Wnpi*cdw9D z1yNQc5ibkof=P;mlS%|enXyJno@4?c_-(%g+0WTqNNd0oLQ4%$f`(?g!CKQ_L7D}# zsDf@V7KF{B-!L=af6i=Ts994}^zW16VI2V?J)o;iBtt!Jch&^TUP))ou-@hjk3Jv^X|3U2 zq}*J|@p;pHlEr1iPF7M<%1=LRT}f-HT3;rQ%Y}?6tIs9P&xXk$1<3w&4Q=)8-t5_5 z?b*Nn{f(ZZt1oSZ`qmELp4bQ_bx(2|WB`gRx{X5w4H+N-`7yxjy!>zhi*OlvuCSr zVJjG0`{DXi>wEMkPwVkB8^M=#&&ze5b&h|;f0VBFj{Orsx$Zy%Bdgr|-W%SV_)m_0 zWsr==4r>dLrg+Hov~3h@#kSxAs z15KL)O*c)qF;Imtui;2jDoLjaXb^fg_-uZ@U^ts&OE?sQ2+AIk?IqbL$@W2Jc$@G8 zLsEg8=wQ-ncENB%BjD|Ne0C#vN%vf;4e#9?K3yF?UGwbT=GYNe)iJp0y;-iiQ2*d&-+^l1 zfz7@P)xHaQzM#u}R2t%O)qgT-L5$6tgP0Q;P$H?7W~YN^PHlOA&9gP#!$1_4x^9L7iwSBT64! 
zlL=u4Rz@h8I$J7QVr|MU(x_;MzDo~`g-#>c>`>{2H7JK^OtE8(36|0u^+PCAS2zsP zD%mBWL0$cw+;eW3x>h;a!LYwU{t{6>0U6kM40-x)hkhOZWxVQ6nq2mrzHdUGn9<|s zHiGAM&-t2v|El`_%8iw^H>&>q8~&v3NK&(+9;y(me!!pzDU%3fyY@{4#a7t=MNpVs z;VLW`6AxN;BnT+s$k(~8LSuK!&}hiW&9k!*oOU9z6JaW) z;WT1FG%dkVeP_ZBpugD8T5!9DZU!_~=xRKHTJ$=MlwB)b8i_Gz0&0adz_2o~63u|1 z(siB1K=D87y2@Nd?{Kf7t4zWp4t-_GWRorkoS%CT4T4I-2v?rw(K zB$AsD%JwO8>=4R6s${-U0{Q|qlnXAEuebUFYAX>Wz*13>GQhxC06NHva8QxVX#XHU zz}uxP1nUCCy``L#*VHD-mWr6LHt=^gpOH&g5}LdPV}g)N5LJpRss!&X4atPx@DNdJ z;M8Hw$h5b;z7e9+!<0g{N+J8A#33_$Cm?30aW)NO6%t@6Pmqidm_lMBWgIf2>#C^a zf#8)SyjcWE$Vnm4pd2FELCB1r=6KrsH3B=U4MuvHB$e-git?cj@bRc{U>-hH2f)Dr zLma4Y|7P?^HF{*#S93>o_aLRoS2lvvx@Wr9)3@q^>b2Qly#B%KRrkJH?C9FmFQz}3 zUQbqIM>k@}R;TNuC=g#;(*2{lW3)bkMxNOmK35$+r+Wr#{$bs}e=8#BPfY&Sr%#^K z2hVRrF6e;^TLYt;1Ba^vhxKP(*%*MXVs+N$jcy5h_0c1rBsYXJH@!fj@xfn(KihM( zIymvk^hWIE)#a>^Sf+T7uQxnJSDGSJNn+0wpji zQx*uLKFEd-5@~{=n+<0vN-)%vr~xr)mEbI4L^E&-b)b&&o(?wBLGC@!qzaIMS-4Pa zV6%U++CQm3e{rLKw(97G@aXNDKc+ikCV}*UWTJ=KxZz4$2AnoL>GazrQ6{L8PMa>q z<7`GVI5?&%WR*;(Ayi}d(`hjezykqsA+JK5>83N!NpQTy$_NoAK(3HLMfa!9c;wKi5CEqcabEb z$?BeC`p~gz^tc{AUiCk_!JoWq3)ESsZRB>Qj^ObLM<0I((T9HF)%&D&n?v?)Dprq> zk_#Q4xaZt!d!D%)h}CVR0H;m=?s^w3f};)H9O)jAW=qrt57jw%-gWv`H9fqy>K|PPze`q~e(tdp%em%R-qc`NI;YRK27BWF(wQr^4b?l7eWz{u#~w>#9=>hU<(n?aTDCR3 zCF|zA!Gly|MJQK6Uj(4 zYPjil=8;+<7cWK+ttn|)SxwHD@|iR((x9RtooTyo+C`jTfymw6Vm&r zY+jQTxB`^ul@!Hn8mgE8Nn!RCvL=Hhqt|VY;N=h|OLv@mhOx%k0c&2panW&EswLeW~I@3<`gC(2+vZbf;#O*XQ?FoHLSSsdnzT*XN!y%oB%M>vq-)AW`j&({>6!A7 zwl(2R`lfuOZA&yH8>bqR{wY6A2`143H0_B%vT3S`rc9LJ6PjPQ>!721W4RzT!^)mDSR&W>qw>uX`pq80EOA35q5x?udN z`gvO07&>jjwWl3?;2Jx%mv0ifg*}hLWH~!3-@riSjXI1O@xGad# z#AKLOom?yf*hwxuF9D|GVj__oPp78im(`Zbf()agD5PZW3c$G-8Gu=h6Bhv~CB$Hz zr=wUeAgy>RE^~?W<;#Kyv>NJ=p3lhh8Ch+3F3k&xmohL_)Ic!JTZeZlDhV;|z_q%< z1Jw4npnsDRs3{t_)x^^RGi~Bca1Jcb2y zs|W@~$*ewO)}ps+zb*=Js~E5X5&qgZwu}x5W`R&>Wr2F1o`k1!X-}s;i0X|?94`p- zk|4}+$wX9Th13;E^<6xF{yFZs^C!8}=PsN)b^eU%%t!(vz-h>;cRmUnmViU5kO{~_ 
zN=l1d6)wdW1=T9gbJzfOVI?6kEqY-q5}rgT^xKQWa&mrnK0ZGrM@3-9p{toEjvkqN zLKsfOFAdKx$TR8GFf!&aAD3h?ekmgh{4iXHEJjn(v>;*|me=@nI3vdsQe=KXwQ(Hb zRBo~Rq4yZkaZu_c)Fw+*sjIKpbx7$tRO~vXbe&peKeTmhb@ddxMwPD7eETQ@f~A(u zV#|=yGE{6irL+LjcU;>rn^Jw+TEbN!mXV{E62cA^F#`%=k$)x{`LnY~&XlF$606K3 zSFbZ`nziA|J4==_VvR*7Md>;?$G}VjdCIW>H;}2E8EE=;u7W-B0Kgj!;9Wo`@M$f8#nth$Q9BTDed-O;~0_BY3r z;MqdpnPq#axx3gLQJN#m&eEQN;-2Hmp5u92d%2l%`HIf2O=s8o>0uM* z_q;)U8L<rujD(`?;DdGTV4WzKE@O0LcWvaZ(ekLq) z7I>OS8~BCDg`bTw&wNJQD*Z4!`HTOCBH|N?A*jNPSmS4_C`dEWc_C~e+^O17rK7te z9zgs-s8l;j4FRf7O?XI0LNi&Ro#wcna>nt-IJZptKEa9NJj88!oh+UY;o1kYNtwy?uFGU)fBx?p<$K za~6XKl;DAlQ+Molj~0Sw$kg?bwFAYrA*F3-vFj5Q+k#n|1AFusp%|>QK-)GkKznEjrTC&!x zt&K81B_ks|i#mE&`5d!5c1?xu?aE!y)B=??Yr+n{GzxOglXKUd6{_i71@6!2tlW=0 zE5>k+=aF;VSFPf*i-cJzrku+Ayf zhMqNPv81+2uV(~dQ4sgTJe7e6L=Awi8;iOdbV!zbRsIv0Lm2&CG4&F2Pz9HS5hrR+e)fA5nT{OaRhhbk`iff z0;2Qt@ziD2E`jSVbMt9QwF54uNKU|;hLLEoKbKsjTQQ#>A`*%7W_dccId;0(E#5*U7Jx?k< zPcEPN?(^55|IigG?diUiSW9e-{4l-ze5tkn&l~cCXEzeX$XO+F7Ml6+S*7)v=BegHG z5q;-Nclz&+{KbKlHfq2}%p|bw;<(o{(S$Z4qFNGBU#VuuFT`e4Q!1re zo+HTO5Dt!FH38N86roIbLc|D#ROs|e$Dz7Ll}(I!oL*hrrtr7yXfyY2WPWt@{i{Fv z+WTMIrl9-FBUbaia$}#_bFZ!IR{vW6t>Lxd2WIGNq>*Y2zSV-37Qfqn3Dr$XuEy!N zD=0a1-wfQe9G#`VenPG~YDN||TeE*A(6IGZzCt})3pOAy`Z_px7bv+J^Ut~~Xmvj| zn+n>;1lniu#JCn5X9S=oX3 zRY0DDuk;#JOVk4k@IZ= z7rQ5v?umbzD0IJ)w{?}A-FauEor#AyeX@l;@!MWunBhFj!93n<-9{+ zd8ZDUwm?$V4CyR81f-D8)3R|VFJBgg%TbWGMfqh&m&E0T5VKbBUge~Nks1(X4r7FG)l-K~^utjD0cuZCq$V7=f08yKO!DCmTN=D&XQkS(S0kDLYa5p%}J>=2D zCe7|ffo3H$rYSnqjf?G{zPf7a>T5UyttjE(eFGo#d?0P~zw^~X-(ifXySMtnyFsP? 
z6U*#RZGD6%ReM!DM9c@N4!Gt7Px274>M_DV^+Fzp&P_;nW--TE)rbJB6#W{m(vpd$ zvL@YozEC+t$Sk8^?!5ajqcX9CB+dX4F%Ff4Yga|tqlI(2DqGxZa3VWVzJ)X}VX;Jg z>}^~*TJ#Sq{^5duWYd4*&Q!tsRNnTKb^un5d!Hj=(k!R zI;+|#{Y*eRS>{f|^}cL8dfhO0J@*96I3`z9lCvm?y9#eLo@>00o+JU@G~DHNsusBd zmotEcnX+94Ry0a)vNh_Jxd!P@m}``*G}q=r>d_7B8pMG)OX^AEk*jphAr8SzChGvK zei<`Qq+~5}SGDcXkNy5Bs(d-1UPlk zBxp>~qvBbvVS<)G}vFn!DuB%=1xMs3~SRv&EwZ zini(_M2;oW5LimbBL-9d2zDR|Q2hpiF9+4${ez+Pmwqs`Vkvb$UhE!Iy2na^w$;D>;aHWl3#_ryx9z^V}ski8cA2t1MP{j$#ff)e#533l;~5S!GFlZ?E2m_WEzTCnn_N!YF7e7MJ16_GXdMZChu6Sf>qT>4 zt^sth<54{H2Y?~{7I+@n#6tO6R^Ht7_LTfBWt+=>c&jP+w*ST}>+B7#%s~H>?tSk` zAB?YGeJ4^jLm!;>_C0UM*80m<=t1Ju)A%bp!aJyruD7MNBR?F=+k#twqqj@_gOHVl zg#CNW+A}|VHg9X)>H)XCtK{F8_aDl;4nbWC9w-J+D45^>nZEE+&Oh@C#0+lFeGBCdJ#-W(`G)e|(R&Av z77w0R-~&PY{)5H+Q7s?FPbO@mCJ?pK-y~h;zxQ=bQI?FH8RPTZEMVe}CU2Rty z`j##CyiIxU-n?xu)LVUzmpX?wI}hIpRDZjUZFZghF3}y`hszr5^jWQO&)&BBMa91Nff-RX z9fjLa-I5rq=?vA)OLAS;fuogdo&$%pwr0Rsn6DQ!AmE8o+wVeEpPw)?PT;Sb@X@tm z^e*57TK7QeeMduDV46NNt=elK(%sh}Cxy$%&^DhuAVENU9T$rP8TcvP$6J3aQCCfkS*ulh&b%e+-m9g0J)>R1h+` zD1Y$g;Elo6+?}a^$o%7*fB)u(jTe?Cwi;Uupe1MXsx$BG$+JCL=s>O!)hFf^B<&h& zT{{nnj~Kqeg?~OUGlzJU;SbHk>a9LzRmcUdf(AtYL+s4bVtdvkwq_ZzrRJK~!l*h} z)mI=+GQB{{NQ;zO>Io85xXP}g2MM~jq5m2>^=3o5^#7TzMMjAw1>as~;Z?~8rrQ+; z|1uc!oHb|T8J=I>!e8M#wo*E*LRz*K>|61wI|T#sJrp$~w^( z%elbA(pqkC)*V?_){U=f^c~RDTdm-G(P7JCcsxD7fax+~=cUjkL4HFJQXxJ*4Szv_ z*Yu%F8Ei?B$uMigy_#~PRiFM}tNBD!hJSG-#UH>m7U}`L)&yvJHfnl`;Tmb=6@7k^~LtfDuEz%p&c{JH|Lo;bf4tvC#NOu*hnvktw zw<>Ak_ptkYtjH?X7(6^Tol+g-6#&WVszgUs#FpqNtu1+An(hN=znXXHAw)npw7rTO zfcGEpm3{)%5_P|ka<~1=+o657oSx-e!P)tqfBh?ky^n1=AG>GseA3YLJ%05I#r9#P zeYns*QfNL@Xn3sHFs3w&Ej_#S$k9)_j+B~v@Z;}<&jY(TWKr0TrHPgHUwJ6=!FyKU z%9%}TYpJoj*f^#%j$OCj)0@lIe`)Snc9fdB*TUG>JB>_h|JT!!vl=-q-zs97S@jZ1S4JWg=Z*M`#C)oe&%Ved z_#Yq18EGaP-B&F!kC|0sZpCk7s|PF8`otfGOn+t`UPf8INd7BExLfm)ktKFnZq;^7 zdvkDH{3{s6&`9cny3EitecxiD*))v0M^w+hQcr$Fy+9gIea!eiVkbYg z2UcDw*!L`1er{UQM>m&GGd8*x{LWS8#?dmi?^W7n zXoF|kK-{}kN%3`;tq202unuHLkb|-~mk)pU*!5$pO}9GMI`Ymw#k_CZiDS*Z^eLLq 
zUw!%3m)E|W4~7;0K$$@>S!xH^j9^$b5G4DtlCDkf+DR9@z?PM);_WRvN#8|T91q;s HBuw}}mpTaL literal 0 HcmV?d00001 diff --git a/distributed/kv_transfer/kv_connector/base.py b/distributed/kv_transfer/kv_connector/base.py new file mode 100644 index 0000000..011bbb6 --- /dev/null +++ b/distributed/kv_transfer/kv_connector/base.py @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Defines the base type for KV cache connectors.""" + +from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 + +KVConnectorBase = KVConnectorBase_V1 +KVConnectorBaseType = KVConnectorBase_V1 + +__all__ = ["KVConnectorBase", "KVConnectorBaseType"] diff --git a/distributed/kv_transfer/kv_connector/factory.py b/distributed/kv_transfer/kv_connector/factory.py new file mode 100644 index 0000000..df871dd --- /dev/null +++ b/distributed/kv_transfer/kv_connector/factory.py @@ -0,0 +1,192 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import importlib +from collections.abc import Callable +from typing import TYPE_CHECKING, Optional, cast + +from vllm.distributed.kv_transfer.kv_connector.base import ( + KVConnectorBase, + KVConnectorBaseType, +) +from vllm.distributed.kv_transfer.kv_connector.v1 import ( + KVConnectorRole, + supports_hma, +) +from vllm.logger import init_logger +from vllm.utils.func_utils import supports_kw + +if TYPE_CHECKING: + from vllm.config import VllmConfig + from vllm.config.kv_transfer import KVTransferConfig + from vllm.v1.kv_cache_interface import KVCacheConfig + +logger = init_logger(__name__) + + +class KVConnectorFactory: + _registry: dict[str, Callable[[], type[KVConnectorBase]]] = {} + + @classmethod + def register_connector(cls, name: str, module_path: str, class_name: str) -> None: + """Register a connector with a lazy-loading module and class name.""" + if name in cls._registry: + raise ValueError(f"Connector 
'{name}' is already registered.") + + def loader() -> type[KVConnectorBase]: + module = importlib.import_module(module_path) + return getattr(module, class_name) + + cls._registry[name] = loader + + @classmethod + def create_connector( + cls, + config: "VllmConfig", + role: KVConnectorRole, + kv_cache_config: Optional["KVCacheConfig"] = None, + ) -> KVConnectorBase: + kv_transfer_config = config.kv_transfer_config + if kv_transfer_config is None: + raise ValueError("kv_transfer_config must be set to create a connector") + connector_cls, compat_sig = cls._get_connector_class_with_compat( + kv_transfer_config + ) + + # check if the connector supports HMA + hma_enabled = not config.scheduler_config.disable_hybrid_kv_cache_manager + if hma_enabled and not supports_hma(connector_cls): + raise ValueError( + f"Connector {connector_cls.__name__} does not support HMA but " + f"HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`." + ) + + logger.info( + "Creating v1 connector with name: %s and engine_id: %s", + connector_cls.__name__, + kv_transfer_config.engine_id, + ) + # NOTE(Kuntai): v1 connector is explicitly separated into two roles. + # Scheduler connector: + # - Co-locate with scheduler process + # - Should only be used inside the Scheduler class + # Worker connector: + # - Co-locate with worker process + # - Should only be used inside the forward context & attention layer + # We build separately to enforce strict separation + if compat_sig: + # Old signature: __init__(self, vllm_config, role) + return connector_cls(config, role) + else: + # New signature: __init__(self, vllm_config, role, kv_cache_config) + return connector_cls(config, role, kv_cache_config) + + @classmethod + def get_connector_class_by_name( + cls, connector_name: str + ) -> type[KVConnectorBaseType]: + """Get a registered connector class by name. + + Raises ValueError if the connector is not registered. + + Args: + connector_name: Name of the registered connector. 
+ + Returns: + The connector class. + """ + if connector_name not in cls._registry: + raise ValueError(f"Connector '{connector_name}' is not registered.") + return cls._registry[connector_name]() + + @classmethod + def _get_connector_class_with_compat( + cls, kv_transfer_config: "KVTransferConfig" + ) -> tuple[type[KVConnectorBaseType], bool]: + connector_name = kv_transfer_config.kv_connector + if connector_name is None: + raise ValueError("Connector name is not set in KVTransferConfig") + compat_sig = False + if connector_name in cls._registry: + connector_cls = cls._registry[connector_name]() + else: + connector_module_path = kv_transfer_config.kv_connector_module_path + if connector_module_path is None: + raise ValueError(f"Unsupported connector type: {connector_name}") + connector_module = importlib.import_module(connector_module_path) + try: + connector_cls = getattr(connector_module, connector_name) + except AttributeError as e: + raise AttributeError( + f"Class {connector_name} not found in {connector_module_path}" + ) from e + connector_cls = cast(type[KVConnectorBaseType], connector_cls) + if not supports_kw(connector_cls, "kv_cache_config"): + compat_sig = True + logger.warning( + "Connector %s uses deprecated signature with 2 required arguments. " + "Please update to include kv_cache_config as the second argument.", + connector_cls.__name__, + ) + return connector_cls, compat_sig + + @classmethod + def get_connector_class( + cls, kv_transfer_config: "KVTransferConfig" + ) -> type[KVConnectorBaseType]: + """Get the connector class by name.""" + connector_cls, _ = cls._get_connector_class_with_compat(kv_transfer_config) + return connector_cls + + +# Register various connectors here. +# The registration should not be done in each individual file, as we want to +# only load the files corresponding to the current connector. 
+ +KVConnectorFactory.register_connector( + "SharedStorageConnector", + "vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector", + "SharedStorageConnector", +) + +KVConnectorFactory.register_connector( + "P2pNcclConnector", + "vllm.distributed.kv_transfer.kv_connector.v1.p2p.p2p_nccl_connector", + "P2pNcclConnector", +) + +KVConnectorFactory.register_connector( + "LMCacheConnectorV1", + "vllm.distributed.kv_transfer.kv_connector.v1.lmcache_connector", + "LMCacheConnectorV1", +) + +KVConnectorFactory.register_connector( + "LMCacheMPConnector", + "vllm.distributed.kv_transfer.kv_connector.v1.lmcache_mp_connector", + "LMCacheMPConnector", +) + +KVConnectorFactory.register_connector( + "NixlConnector", + "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector", + "NixlConnector", +) + +KVConnectorFactory.register_connector( + "MultiConnector", + "vllm.distributed.kv_transfer.kv_connector.v1.multi_connector", + "MultiConnector", +) + +KVConnectorFactory.register_connector( + "OffloadingConnector", + "vllm.distributed.kv_transfer.kv_connector.v1.offloading_connector", + "OffloadingConnector", +) + +KVConnectorFactory.register_connector( + "DecodeBenchConnector", + "vllm.distributed.kv_transfer.kv_connector.v1.decode_bench_connector", + "DecodeBenchConnector", +) diff --git a/distributed/kv_transfer/kv_connector/utils.py b/distributed/kv_transfer/kv_connector/utils.py new file mode 100644 index 0000000..b8eb5ea --- /dev/null +++ b/distributed/kv_transfer/kv_connector/utils.py @@ -0,0 +1,268 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +KV cache helper for store. 
+""" + +from typing import TYPE_CHECKING, Literal + +import torch + +import vllm.envs as envs +from vllm import _custom_ops as ops +from vllm.config import VllmConfig, get_current_vllm_config +from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory +from vllm.logger import init_logger +from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput + +if TYPE_CHECKING: + from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase + +logger = init_logger(__name__) + + +class model_aware_kv_ops_helper: + def __init__(self, config: VllmConfig): + self.is_deepseek_mla = config.model_config.is_deepseek_mla + self.use_mla_opt = not envs.VLLM_MLA_DISABLE + self.tp_size = config.parallel_config.tensor_parallel_size + + def get_model_args(self, model_executable: torch.nn.Module): + model_config = model_executable.model.config + self.model_executable = model_executable + num_heads = int(model_config.num_key_value_heads / self.tp_size) + hidden_size = model_config.hidden_size + num_attention_heads = model_config.num_attention_heads + + # Deepseek's MLA (Multi-head Latent Attention) uses two different + # kv_cache shapes based on whether VLLM_MLA_DISABLE is set to 0. + # When VLLM_MLA_DISABLE=0 (default), forward absorb is applied, + # resulting in a kv_cache shape of [num_blks, blk_size, 1, + # kv_lora_rank + qk_rope_head_dim]. + # When VLLM_MLA_DISABLE=1, standard FA is used instead, leading + # to a kv_cache shape of [2, num_blks, blk_size, + # num_key_value_heads / tp, qk_nope_head_dim + qk_rope_head_dim]. + # For more details, see vllm/v1/attention/backends/mla/common.py. 
+ if self.is_deepseek_mla and self.use_mla_opt: + head_size = model_config.kv_lora_rank + model_config.qk_rope_head_dim + num_heads = 1 + elif self.is_deepseek_mla and not self.use_mla_opt: + head_size = model_config.qk_nope_head_dim + model_config.qk_rope_head_dim + else: + head_size = getattr(model_config, "head_dim", None) + if head_size is None: + head_size = int(hidden_size // num_attention_heads) + + return num_heads, head_size + + def get_kv_from_cache(self, kv_cache, num_heads, head_size): + if self.is_deepseek_mla and self.use_mla_opt: + key_cache = kv_cache.reshape(-1, num_heads, head_size) + value_cache = kv_cache.reshape(-1, num_heads, head_size) + else: + key_cache = kv_cache[0].reshape(-1, num_heads, head_size) + value_cache = kv_cache[1].reshape(-1, num_heads, head_size) + return key_cache, value_cache + + def put_kv_to_cache( + self, + model_executable: torch.nn.Module, + keys, + values, + layer, + kv_cache, + slot_mapping, + start_pos, + end_pos, + ): + model_config = model_executable.model.config + + if self.is_deepseek_mla and self.use_mla_opt: + layer.self_attn.attn = layer.self_attn.mla_attn + k_c_normed_k_pe = keys.squeeze(1) + k_c_normed = k_c_normed_k_pe[:, : model_config.kv_lora_rank] + k_pe = k_c_normed_k_pe[:, model_config.kv_lora_rank :] + ops.concat_and_cache_mla( + k_c_normed.to(kv_cache.device), + k_pe.to(kv_cache.device), + kv_cache, + slot_mapping[start_pos:end_pos], + layer.self_attn.attn.kv_cache_dtype, + layer.self_attn.attn._k_scale, + ) + else: + key_cache, value_cache = kv_cache[0], kv_cache[1] + ops.reshape_and_cache_flash( + keys.to(key_cache.device), + values.to(value_cache.device), + key_cache, + value_cache, + slot_mapping[start_pos:end_pos], + layer.self_attn.attn.kv_cache_dtype, + layer.self_attn.attn._k_scale, + layer.self_attn.attn._v_scale, + ) + + +def get_kv_connector_cache_layout(): + # NOTE (NickLucche) When running disaggregated PD with NIXL, HND layout is + # used for faster transfer. 
+ vllm_config = get_current_vllm_config() + kv_config = vllm_config.kv_transfer_config + if kv_config is not None: + connector_cls = KVConnectorFactory.get_connector_class(kv_config) + required_kvcache_layout = connector_cls.get_required_kvcache_layout(vllm_config) + if required_kvcache_layout is not None: + return required_kvcache_layout + logger.info_once( + "Connectors do not specify a kv cache layout, defaulting to NHD." + ) + return "NHD" + + +class KVOutputAggregator: + """Utility class to aggregate the output of all workers into a single + output corresponding to Rank 0 for scheduler.""" + + def __init__(self, expected_finished_count: int): + # Complete transfer tracker. Used to track finished requests + # [req_id -> n_remaining_workers] + self._recv_remaining_count = dict[str, int]() + self._send_remaining_count = dict[str, int]() + self._expected_finished_count = expected_finished_count + + @classmethod + def from_connector(cls, connector: "KVConnectorBase", world_size: int): + return cls(connector.get_finished_count() or world_size) + + def aggregate( + self, outputs: list[ModelRunnerOutput | None], output_rank: int = 0 + ) -> ModelRunnerOutput | None: + if not outputs[output_rank]: + return None + + # Aggregate kv_connector_output from all workers + + def update_finished_set( + req_ids: set[str] | None, + remaining_count_dict: dict[str, int], + finished_set: set[str], + ) -> None: + for req_id in req_ids or (): + remaining_count = remaining_count_dict.get( + req_id, self._expected_finished_count + ) + remaining_count_dict[req_id] = remaining_count - 1 + if remaining_count_dict[req_id] == 0: + finished_set.add(req_id) + del remaining_count_dict[req_id] + + finished_sending = set[str]() + finished_recving = set[str]() + aggregated_kv_connector_stats = None + invalid_block_ids = set[int]() + for model_runner_output in outputs: + assert model_runner_output is not None + kv_output = model_runner_output.kv_connector_output + if not kv_output: + continue + # 
Allow the worker to dynamically update the expected number of + # finished sending/recving for new requests. + if ( + kv_output.expected_finished_count > 0 + and kv_output.expected_finished_count != self._expected_finished_count + ): + logger.debug( + "Expected finished requests updated from %d to %d", + self._expected_finished_count, + kv_output.expected_finished_count, + ) + self._expected_finished_count = kv_output.expected_finished_count + + update_finished_set( + kv_output.finished_sending, self._send_remaining_count, finished_sending + ) + update_finished_set( + kv_output.finished_recving, self._recv_remaining_count, finished_recving + ) + + # Aggregate kv_connector_stats from all workers. + if aggregated_kv_connector_stats is None: + # Use the first worker's kv_connector_stats as accumulator. + aggregated_kv_connector_stats = kv_output.kv_connector_stats + elif kv_connector_stats := kv_output.kv_connector_stats: + if aggregated_kv_connector_stats is None: + aggregated_kv_connector_stats = kv_connector_stats + else: + assert isinstance( + aggregated_kv_connector_stats, type(kv_connector_stats) + ) + aggregated_kv_connector_stats = ( + aggregated_kv_connector_stats.aggregate(kv_connector_stats) + ) + + invalid_block_ids |= kv_output.invalid_block_ids + + # select output of the worker specified by output_rank + output = outputs[output_rank] + + assert output is not None + output.kv_connector_output = KVConnectorOutput( + finished_sending=finished_sending or None, + finished_recving=finished_recving or None, + kv_connector_stats=aggregated_kv_connector_stats or None, + invalid_block_ids=invalid_block_ids, + expected_finished_count=self._expected_finished_count, + ) + + return output + + +def _make_src_and_dst_indices( + src_block_ids: list[int], + dst_block_ids: list[int], + src_device: torch.device | str, + dst_device: torch.device | str, +) -> tuple[torch.Tensor, torch.Tensor]: + src_indices = torch.tensor(src_block_ids, device=src_device, dtype=torch.int64) + 
dst_indices = torch.tensor(dst_block_ids, device=dst_device, dtype=torch.int64) + return src_indices, dst_indices + + +def copy_kv_blocks( + src_kv_caches: dict[str, torch.Tensor], + dst_kv_caches: dict[str, torch.Tensor], + src_block_ids: list[int], + dst_block_ids: list[int], + direction: Literal["h2d", "d2h"], +) -> None: + """Copy kv blocks between different buffers.""" + if ( + not src_kv_caches + or not dst_kv_caches + or not src_block_ids + or not dst_block_ids + or len(src_block_ids) != len(dst_block_ids) + ): + return + + src_device = next(iter(src_kv_caches.values())).device + dst_device = next(iter(dst_kv_caches.values())).device + + src_indices, dst_indices = _make_src_and_dst_indices( + src_block_ids=src_block_ids, + dst_block_ids=dst_block_ids, + src_device=src_device, + dst_device=dst_device, + ) + + from vllm.platforms import current_platform + + if direction == "h2d": + copy_fn = current_platform.insert_blocks_to_device + else: + copy_fn = current_platform.swap_out_blocks_to_host + for layer_name in src_kv_caches: + src_tensor = src_kv_caches[layer_name] + dst_tensor = dst_kv_caches[layer_name] + copy_fn(src_tensor, dst_tensor, src_indices, dst_indices) diff --git a/distributed/kv_transfer/kv_connector/v1/__init__.py b/distributed/kv_transfer/kv_connector/v1/__init__.py new file mode 100644 index 0000000..0e16bc5 --- /dev/null +++ b/distributed/kv_transfer/kv_connector/v1/__init__.py @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorBase_V1, + KVConnectorRole, + SupportsHMA, + supports_hma, +) +from vllm.distributed.kv_transfer.kv_connector.v1.decode_bench_connector import ( # noqa E:501 + DecodeBenchConnector, +) + +__all__ = [ + "KVConnectorRole", + "KVConnectorBase_V1", + "supports_hma", + "SupportsHMA", + "DecodeBenchConnector", +] diff --git 
a/distributed/kv_transfer/kv_connector/v1/__pycache__/__init__.cpython-312.pyc b/distributed/kv_transfer/kv_connector/v1/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ef6cfe145d8b9e32464800fce971c18d3daf065 GIT binary patch literal 528 zcmb7ByGjE=6rI^;Fo}ZL`2|~K$E~6uf{BG7S}3Gggkc|JvSem=nVkevTKfU^c7BV0 zFs+rH7(@^YXVxU9^%jS7?=a__%YAOQH-IW?-DlqrfEV2?%UF>4o+1w*1u3L}!4NVd zG?*Eh%nB{2=LWTz6FLwKL25q2(4ASH{i?rvmfSuY^ou+vi70q4st6tga%*jNSx_RI zSCg_VxTsFgk7cV`G@MD$hSE<&lm9(bp|4SyVbK+PB{WGXDNf+aYQjDL# zTnU4(Yb02S%B<{&h!2VA+)la&`{OP_G>cI=6Qd$W+8wPef!3DLe6<9c1_QL=U$Ry z&4a#oMiQrPqur>8oJFuQkitwFq%zWn41ZD34{gv7P4lG-v_MIC0}Io2S4ENn0rFui zDhLo1ZGXRW&OP_egQf!n{nPg zth9S!w|JH+?kVkE*emY)i{qtz3;V?VKyiQRz`}vj!G(jRLkovWhZhcu`oZF3C3C?P z_d~_UOGg%ti2HQ$XzAF(F>ya!e4=!G;W)`^EBSya_d+hb13-t9cK6tnl- zj4hlJt@fhTxD$ELmaW32e)Ad8avxgm5A}Ci)E+?X!BFk9qV^DK4~J@>TkFX_W_{9Y z7^&AT&s54~Ctt0&=Pb|3T|Q;b6fMs)mnyD#zTj2e!eXuJ*yd}O%^54d;+V(HR|~f7 zl+C%SRdrJNN~u&U7xGrMP$`>*vU&CO*WWO&o=TcU%T+5^oU)h6nv;BN;_;&6nw2FJjnE$->i5}4b;YWh&UN6-2ikQ?m1<}>^Nqst zO4VEhUaVpffR<{-YGJkLwDwad0ae7mWiA6dWz%)uuQ}+}D%+;FQmGX&MW6&onskx5=gTLosEoW}*qSxZ&N&0)MsUd%;41P=7Vo1&p98n!~>lvXRQ<*u2FHH-s* z*o%)|K@M+NWoy}S15B#1^FCGODz)lrt@?KzTE1msMO`QF01GzeSSgv|HPvHy!SD)I zxurt6;9(WTxC+~qa^3;80HKyfDF*PHPPEMZTA{iks0aiO_%Z&KtCDKWMIhmdTp^OC z8Zv-^#Bj-})?AP;hC>@ylU!*1xMsFlu3R&*)P*Gu)&hZ^o@fJ}tdTBU$6KwGy~1Ll zSg5WEGGRyLr}hpAoPyQz*2?)6w^FXuyyDuFdA8_P%*rZx7HhhOUM0A%U2XxP>RM%w z3%_c)Rw;z$s&&Q5IiSs|WDx&-vIe+IK9L-8&2d5RV0zH0W4Cm#_%xWPSKty6Zpe+Q z;F777c;%M^UJe9<1144!F(KI76a4K3*sfNlNj9Pjc01#frMU=cXPXdNtoDUTu!dj> zx0<8K$X&TAh)C3UVi+PDXu~j`nJHOoU`B!9te)L9t5D5>g>yw~4cy0mL?>!FYUOI7 zh&ke!SrA>4((Ovwk+UTQQBq-S*+UotJRkr-Gi8L|$qsO^w@wH+2|n>m3yah`K1HI> z8sbgGMS?L3kid|H{&SxPya}I@G$uF|)mkwM)c_>r3MDa_tLTus1OiRZDchLbG?<=~ zoJ##ARp4y4rx87SZl*D0EkYw%`6?D{rDCHxF@rf+i$$lAxClANyKJm6IRE{(F63rj zy)g6I#o1TzZh5WI_j-X1ijBmZite+KMnA*^mw6d74F9IJ3VKsWEmZ9a3Cf`IY_$s7 z2lu?8)DT*xE*Fa>5ZF>-8UGF7J?fYXWD6Lugu;^F+Y`ue< 
zj}6Cw?Po0X*pY>(6SI5ls2y|Su$=m6=h$&+<=DNbOE^g<<@BqXzB;T)dB4^#2$-V%!B{aV2VslgV2nIITH=(Sn_!BbKT z`fll3sy%iqJ8bkRN_NLF!$wjtbq+*6)##Oia5k^ZJb?ikg8YAL2*t-nRaCP0Q6w9& zezON=7`cQ$QZpcQu7k!Qmb+=J6)R9#(nKX)fdHO0u+3SAG}59D(1*Q}y?(wEEG zm3%Je#_&!fUabKyjaU&gX+#UjNY+k&+hn<2fOw+`7V{qW{!rrF2pgpo?$ zs@|SoKf3O%XFmGXJND-8lN;%io5?31CedIkik`k=PjXxi*+^={x{u-iyz)tLYZ#o7 z5KD`=fmqVdJ}|0~N%HrZx^suCala1F;X1ZHpKE=;%kxhovrvPn0&F~(Qn2$HvFh5Y z!y2!)D71h;Q#Rs0jlMmCV+!mw27Hm9TPa!Y3wTgr3!?Xi@$GyIpuRs4C>1viZc!@F)=#EK84 zfB3R&$_Cvb9mbPZNa4mu8wW#Bz6uY~TY+uvnMO9<+=5js)EydLQfke98f(LidlHbhW8)>( zvG~XM+ZsSYj);OIQbs%}alyxbCtQq%aN*CnF0KEw%5pUQdWj3nA&6kWU6RwLZ6Kzu z?Lo-i8LhgAH0h9ptJU<~?*i)f+0L+z#c%NDEDJ#xtZJDd5y9WqL^QnGNy9#;OtGjP zhgsH3V+>|i-_7^}j59moM3;K(WXS01vr|sOPS_CynEDZ5O4>8_0In&tO`z={?)z~+ zfcqib587#G2!X5N_l^h%S_);>_1ME`Ib!c}M)B4N?#Gb5zx|O>_=$pfRh%+X~H?E?;|uvTmMtD zh}n+;@yimUd`z{@+9&NNox@nQiO|z0@O08Sj(!qYsj1MD6ZW)|!H7>`#H<+a zlyg!-EJhs=>$OLY>BFbSe;V&RiB=Q#GioNMQ8VdG2`WlpP2_xH_Oqc8r}2iG3&wp8 zZ#?Bp3EF$Ub1by=@f5RPxEa?ga?1V|S|4_fD99uYok?!?)Q)}IFe|6A*&F=9>`%&bBC|ZHk< zX&BdxY;3lE?IPS{A?RsiccNLTg|ly6q?PMjUxiwr^=K7k%(Axl9;1{Lwooz4G+BlF zwN!&W%;1T6jXnz8bA(H2mLgbOST0Z1pAFfGYM~G`MQDiePXuMD$~wWR|Fp9XL#ipj z_@$3W4cJBlY8se=?W`gM0cdPQe_`^$A~jXubgfczLn;$ff18X@~ zQsDOX&u;c-H~OEwf8yycPF&bLapBhaKOH#s=OYI<4$Xf)^7h8S+xN|Bwmx*f zKmA3&x!G@S^k;rPF!Gbx4`-V_(Oua`(fHsHZ0$Y!Z^eEqC3SCyK}-fFqi*d-@iCbA zJzU#=bB22wb+A&ohxBPg;3j0O3Jlfvc0mGhNg3%;$KNm>q(^^J|FFK1*eAegjA%1V zuWDWk5xl{B0o@3003F2&?&J^hugCf}CNKx4Al2#4bZk?`&-qO6rGy;iRwqB$WKcmr z6{&@jQX9GyQsZ;eDVL+1=}jP7U&TzvU>F31lZi(9gjoZ6biCw7_lDC01i@Naw0QmrUVCm1=CuCt7277R96c9^~H1#gHA47~*( znwXh!rk1Bf7qb`Nc^!dv{aU9;)>AlRQWA5^5qbl`2f1~{KpXJyN3eBAv%-h=h4H(X zs23K-BR<^PYVqn%gg{gd078RudO@@QhOV|~FoULtVT4Fb59KSXYq<(6Fpu8(2X!crlRy;ZqE%wDCW}7h!_7^lagJQOk{B7qS(Fb~_5k!9O_;l)H{S-kcl_DDdjtMDuoy|AO5r zbfz>q(@^T%sUnMTtI8#zH+rw2Zn=||`|)rqpk?d7Ly!mf2g`anwBL+($y|u=q`c)sCT@G6P?7r$B>f*90?l zXiD8nCy%sY%gnsoy3r?ffIqqdFpyESd`_N}$US3PwhdQ@JNw$z2(lHp6%FcCWFGJ> zg8ok1HVIG?B)vFtKR`$W&C!C4emNHZ!;t!1pgyFx;u 
zg^M?2+;^~L7rx%b)0=8(@mK6Yk)mlN2PJ2x0swd7YHL1Bcw$f04EAzBLRbOQGBua0 z&3Bm<0M^BhfKPsUu>~1^E$vZAgNZGFPgw~@L0%sGW8Ib9Blks$pxc)D3+~+$AedouF-RUk`|ROfIfX$_ZgJ$0@>{;|9DQypw1Hyh`#U2~e(!y71ZY%%&=>?>%i0@o869Z}^WcA@UH<${m<;BG z{E@x~7GI(W(`=POVWFc|ThAUVNfyvTs=ElPK>+nDawb0<2cU_>C7RVpg3pC;{P-9qcLSwU;$sRlEl3#&@3t+2MVHaXf zT@x6$&ydm`_`@Sp0Kpg7s5I9_5~`oXCVLJjd7fIAnXF8yMY3%r?;zBexmvIU!CVlb znTqRWrACk`F|ajd+JGLI!V4)^^vsZF>u&_=gt5>%c6DdB&;8Of3g>fZ>sF6XR?Daj zewAKAS}BN3MwR&i)hk>)L67ap0GVty$cJHXViTW&D&wb{DRKg=OkfkKXVWy4h-6f% zL?XV+9}%m8Cru&k5P56N&3s?1q>4}%D`JgkFd(3S1z-}i$kMP{W=A`eEC(w<!MJ@4JJc`ZyLkLSraKw*YLG7wP{Exig- z_mIt6Z0jyO7mA$r?k5D~1Tc&TK^g3`tv73RmC;TyDXt!NW*GalxQ&EdQaTp`MeQh` z`x=Y~LGCcp1w_*MYJ)-U)RdJRL_6fK0mnAdSDD!y42>MBC@8H%5gjIm#d1@BK^;6sM=k~9>M2@BLiv5&h#_;HmYqthExt5GczbfQm*EjX#=2mJ|yK+r#$Zc<; zil_vUBh`$rPjok+Ee|~jy(GJ%cTD$1E|G7iNIOD1pd#~FcM?hm23d174UjM_5K3nD z94H^s6h16c+4Z~IX#WJEB0ya?ru>IotB*}v-(>a0$e}-ICgLPgU}=s{v>yrpY=T6>N3R{ zAB<4akZ+I+H(4q$zf~|jVeRRbAR8Uqg5fb)S(@A~CrJb0C+URqk^-lvs}&OBtd1Nh z=~e`(Pd;n`$Q0kR(7j4`bT%b%T2~<^gET1dj!Xk;-x3iksXNiG>9#A`+0??C_#wDpd@9yf8T+9~7 z@fQs^7mYt3bx^Q8<`vMh8g&P1JF84g9eoeZy!m0Y)=(SAgndwkL#hII0qzwDLeK4h zcXlIx+pfShs!LL9(RD1FncMPoQ!4^xCl{9l)g!`O#eNA+0?Ea170WUe5E=N8^VkXo z#yYTR3LWW|kZ3GqdN4P2)J~x}FbTBV(xoq@ws9g3y9kWO2|?qNgSBiSxFGC<&M8fr z@6t_pmweobA%$o^JSK^|3jkqv1)ESAUmR(mA@iM8I1T0-GE=2XA1B0i(+L7XbS@QK z>!c=zNQ2R2UI^kI)X>a+DjxMFiV*`QsFwyZ>UWe!jml!g=k#5 zc$#ZTC;2Lpp)pM@-Vmo0FpJejmw>;eJ95;N3EB~K!VD|8G3ByMkM|`8md&dOY7zV; zwv#5H8$0MnX}iKz9(H4|@fEUnQB4|7OH73QcKi)YlzEue|;EV6azE(Fw} zd?*!)776Y@gnVobMenCdrCjMeAzGgdX%~OnY5V5yQwuY{3foWTB09d%EByeMo>fE0 zZ=XVa`OW>2@UP9Jv+jo7uD8Vz&~U^&e9BX~VOS|1mtlHFAknaL+XGKh+Q#+9sswJW zVn?nHh)5qEQBDp?1bU--4clLeg?%I5=i`7xC2t7xQ@KjCx)tcuK0*Y9Ek$2q@^FM+ z|2D2wgh+~zV#jkpYHIV%g5wq1M3KV=J%xwI%RFBzh@E{->)kob+c#%J*!_FaZwe0? 
z&>i{foILv_^`eg>xwIktD9i?QPa#i=ZlkmAn$pK9)rXRh8X;0!Ahz3T4s?pxU~I*6 zDzi-R+)dE7`%h4KBy{#EfB_15 zgm*$jh@|5m<}DalQ@Q(;51-5zU= zP~GDChIf9(7Ky`!exi#+?bh(q(5mhaR#hnt9KV&Z%$-)X{wI1#x0Rd1$n6@u z^&_TVi_dnDOfs*(5-k`FjiI6PIZXEwjOjmW#q*Eb@w`_thVDV3rfV|LHf{eMuolLoI;>3+-_dBqxs zeD?qJd|}Val2Y9xBvINhfIv$6VWpxpp!K66Ch;w+_C|jak^#q!;RBL5d01+v&_?~H zM#DU8MIkD+?W=a9eBTzjT%A7-5#0BMsBU%TX5|+UL>{&O9fYo*H6Z5slQ9BuR;)cx z&>#BDLj7Aj+q3fDm#r`9OL1}Nlh;FE<=CZ}qeor)%uwr*xF2Pf#Idf`)|9)~wVc<&nG z!FF~K8R!Pslqr(dUY$KZK^J27)$^@v;752`^|@Vv$3c%m&DXSA(mycapqKhXmn*Y+&^P3jezX;7R`p+w4E~^N|Bc)ZaC^6^)J#HH|2;>A!ghvBhZe z=|^~BG)fdh%X1@)q{ErhrZcu|$~Hd-!Ee9>`}M)0X1SdyjBTGmyj2XMb|+r=gA*gzQJviVXRHWS|In)BG%YzvF(-oT;?v4Hc+_;S*Y zR?+_qp=ibzf5?VGf&gsGN>L>C)8GZ3Ri1%DI=!=NI7PFD%o2RjAsk}!^D(i^T0V#^ zHMujzl__AO6Z;V%vXz5M(=yo0mC*{u#0vg#b>y9RReUDMQfo=v;`|H~GNnDuiHkJU zfNQno^xMSGI*JzF8kdG=m&kj!EXnN}{~s)XAF*X3wOoe;t=(_57Wed8a1)dwielGa zly4e?*PV3AVIPtf!}L|`F1DN}RuNPkkq+D)@E6>TE?7NEto~j*Ru7=DOgq(m1%dEx z66KpIi4=}+)>t#u^xGVi;g(g?=|*d^C&03eSo|jRAUDQ zp(S@08s?ec6aERMNvBk0TK^Y~#u)@r;aOQ>T74_{xC=g#ArH2zJwv4dd=}8%0GP>O zE!hD-MN$*V0Q^Xd+~n0XURlJkYwlHJ zVK)maK@_)9<^6t?N9l2j{<$+>CCB6YAMAd-8OIf08Au;)_VF%Z>_2j6dUM~?TWrug zZS?XKUVQ)U%0mN}EqqBrRy2oF@#i+qywNmxy>xkt*RAp1c&0g=if1;m-(gLriJH@Q zV_OC;&EZ5mefRvm(R%=`yu}&4EQbv?wUu$U4MM=ZoX;Ya?gIqi)w+p=6=y)G(PxXY;QA;E9N>n z{_(^|6U{#UmoUbkY$kb^GDi0_`*}BDq=p_2;!gF$x|8VVtDc0e!(DT1IKKbBdGyZA zCy8bh_Yaaox4hfWe0Y8R*oVJ%C(8P}r$0$=q^CENPi@7~qIqm2zWZME-pn5+whY`p z%=E^`n!~a9*xkg&shQ2p`7HzgGzU`gv5n06Kdx?OULzXEnx>eCiFxo_Ah=>4haYeD z@lJgH!@E8IVSWAdM*6YMr1>z3dIddOoR5gtN8*F`_nOTpuP47HuW!vg1Gty2g!pECK0ZwtyZ5b+esr;!{!+2ACL_sVBMxz(H4Hl%! 
zfTk2!N!|%L`7d!NPrXmclfy0(L>l|(N=;ooHHFV=Ia4BRHwEVfpEeW{s*(1e%Sum3 ztA;6Rb;|A3A`$_a>IQh=Z((yt3SReza2VwMaE>q=QeiIW##A z5D!M?=8Nth;VG;~oT^0Gj7B1nuX-Pk#I|0J7?FWr7)O6$O#Q-m;dA4KKQjjY!Z`Y+ z@$8qz>;#B>c>FZ@pSxaY>eS9^@q?%VHt zo;-Y`_fhYm$dTJe9~rnbzZ;1~o?f468o1n{0E|5SXeb^@-?l$4d{lU3;P%mQZzOr= cnMVdLk48|e<6UuosW&pVKKj=NuY!60FWS8*kN^Mx literal 0 HcmV?d00001 diff --git a/distributed/kv_transfer/kv_connector/v1/__pycache__/decode_bench_connector.cpython-312.pyc b/distributed/kv_transfer/kv_connector/v1/__pycache__/decode_bench_connector.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9c7e76c10b6401e727c220f5dfd8a851814dbad0 GIT binary patch literal 16090 zcmb_jTW}OtdhVX-8TDMG8QstgiPiCWW+T(v$X1dRd*2s+p`onup_ORL})HdIBnO!G)S=}zwwK#Qsi$bI|{+YmW{&6Ldm6a1pCNX<5 zo5?5%J*$pM$ECAxNzS`8omC~7b(E4BO^;_1iZrjNs7|x8u4sBPGZXduFV7}5DPd0} zDd}7)j#eoy&8y0EGL;&#=8=N@LPWT+69STFK~AcG`kdErrjY%gBlxH9IKkS=GX zoTf;6`9~ABrU;?wxOxQ;4iZA9lQRHPO6s%(ecP3Vn6Ab%+O(p^%=R%qoAlJRczQl% ztw5_Rq*7_Xt1c+gku)|dbxax>y0S224<9mzOZ$I^ji(`$da#1C6Fp;6PqnS?8GLGt znoTLx9mA)x`fQaR&a5%z8i26C$Cz>e7!b6bF>Fx=%?G0gM_wpju-He_WwT2eKr7DM z%6=0)muD61Q7$7bVwZ||wvLFRB~@%0+vDMEPN%K5cG%o9T96@>?U04mp`od`rVQoO zlr*dB^JBxqBZpp$0^I1x*x`c*4+7r|@ecmK?SMt|4Gqb09TVt7nRr@}u&Z)5jasuX z#DEzB6w>pU>!4IQ*y?6N%leI&OUAF!)+TbQiqo_xjT|&FaMC(WaXp#MNZEOM;QTHm z<8-L1Z%(B6^+Sxr*q%uJ)2VD+kBmv@u|p8UYrAlc>U)md-@^e8t%$>5`&LdR98f0KwM~ztJ7CKW5fjGigLB z#4Y}fxLt%de6$mZR9w>{Zlm_{x4wBQcJkFzC(ph%amH{T&nz0^1%|J9Y9it?8qU52 zWSUIx1U4}C)`$@*se1#YfJqxms?IBVoYp{fAel+(u~c>jTm`Q}LQXE=S$|yD!78vR zC4K#Gfr!nGLzlYKS@kN=g9;ragqm6`L7eVHDx0{Xp(1n%Z*nfBs2A8NF6rY;Q}YHf zF-=ccQh_}jdH4!mZ*dAY$wMZ(-h?EabSr}5QM~x8QN%X1%dTk`Jr&V>T5mqt4Kd3> zN_dRj)!2spWlMik5uZG(w*mhwuPtGO@wVM0^T|Ub;G)Ow5AqvTxC(i~<4% zhfJzP{!b@Hk=aB~i2}BADZiEGbLxCnQ^2fg<){(F+*(Y}#%P*}h~E%nF{0F1%<#uz zplqNS#G6;n8#8%!ZIkYN$zObR!gt#WR_#&d#_N zi+#dv199Ac%?<16`QiEG{E$vkM<2SDdvWyel^2!aRB~!~eo>#zW`>Cm4MPy>YH}(E zNi$p`6R4c9&*t#L$gn9lW9&nVEz$W!wFwKd>?IVmgUEi!eI*DUajS`IY`Ll3K6PjB 
zny_~16W_h(*6Ip9FWzs=_Z%tgKALYiT4*@-Kscs$+f$SsC8l(tWqS;2$sd#)COS$! zPVST4vLJgDLG~&hxrPqXw0D)8=4J7{+DWhMqhqG{<_N%wWFI%#|DG@zkOOj1u2q6^ z9S)>d4xz4A4l5#i7vy@p*Rl78>a)p>a+5ROFE^uqNQTNY8J1h*R=G{7m+O=UXq+%C2ca?#6oy>(isOD)o$PXtqZ7bwP73 zb7p7WUW{p5<*xCQ?U%U7UZbfRWyGOQa_orypVcio-xVqsopFt67}d0={!BJAw4|un zh{aLCY8eQYruM&7)CQpx#ixk#mg!jpj-?BdDn@FKd2CfvJU>$dGpTjL2oPCVI)z$` zb7Kgkk{BVP2;z};TRqwm&|#hALCKB(?FYzy!r$U_r|E{U%>6Oim-%JavfEaep@MVE zT+C9H!4Bz-W?*UArHgiX&ZY&s0?#Y`pu2jVKxAo|ziX-QrE@NFW%Jb8y^Bw49{L=X z%I0yPS>zVEPxy(5&+uqDDBr}ym;j0yjSi!TX|~ib!o|v~S@nvds`N32?-JzU_?zcW zy$30YZo4fM;^a+epPrNN}DIo(6jS# zAoSyxHyQ^Ejf0;D1|RMp|M_pN@4vV@{zb5Fv$p=nv5n?|Li51qwF3|Pj@|#A^}e&z z)V<~A>INCxGko^B(h|-=!0Lo}c9`Bm;T8_Y9G;cwy6biB2YmImIcMvNql#P0V>{Id zS|ToW2oIwHwh%-CtN{m}Ph9FDG^x)c(+D<6GO_p#l$Z8b+n4Cmv&_(F998Z*w;5{O z2=x_0efiMAym%1#rWo81I}2jxhIp_b9=zA~h4_+*mJq}==_o{Pq@<{Hat-el)Xivy zS5@?!nkmB=kzxS<+kcMi*%;Fu+yHl_B#e=ibX$wzY@1bk7N&jXsM5Upyf%RrvF|al z?@wyBWotAR#VfRAQ{7|o)u@Fos_IOQz%E7H=Jp)My6Px0jlK$Joo$lc849+H&P!Fz zuc$ep)#nP$aSBfd=R>r7cb^R7fUv9pdb#cjc*K&7Y4__ z5Ko!g=ugELai&PADcdZpVjge8{w@13suJFCBH6Kf1?AFXXEa`oL*d1_h?$cT*h~FY zk71Rb^O$a%*uiz~u^2R=6LHqrTt;nN*E2vJ6DmR;$BfQTEZbN^dO)ZbiY|Pw(PPA^OD2H z1__{cD7!HNDx+MDrQCPuwe%;-`>n8rjX*qx$g z;wc-UghV^hiDC$S2Zd*IxnD;vl&U2aC~eKn!{St8;1$T@(Do}k zn6ius5gr*XFQf3A!==sTEAZGfl}o0WJTonR#%*k7_XZ#+Wp*lov=pg|w`%j;rtwi^ z|0B+h6hcSxN6)W^Ci3Ehc>;ax1jdqbIqf>=`OVU}cZ#F2aZABba{p>~EJ zQ%~wpcVV>JLRl+iWQj4QRwZqLsg`Oxl{%2YR?n)5S;Ko795Jh^q%x^QyXwo7ouKRt zWv@}zMcHm-hA@@QrWh|`l%tYr!Onu(NhSJVqt;Sy6AFBru?Vj|hgyx8{41t@)<<0_ zBg)jK(DJMIxUXDZnB=Y8H^#p5)p^eGkDI%j)fdrtHDp`%+I6+UY;pZSY#ZnKIlzspm*mH?ac z)ZUY6NbQy=dTJjxwQPCt#G;L@w+25P+^V5Bk!u~;@=?jpHSXF9P$|gy>$YoA`i~|W z9Tq+Q$Mx+r+KQSuvXkqgyrf#@5kqUcTLXTf5!= z;dk%Ih2{hK(1C();FmshFi#30J%N8G*%sXP6sn5uF!Er?@CU$77c4J^2VM;LF_ap| zimySd$XZ3&C;Jr-yaNH$`N&&=-hOxo>hK4`l=|#tm8q$(7{Ql(9 zryGQ#Gqftzl(9!r`5Aj8B?RmdhDHn}GcEIT)LxOQDbuCas7|U3OLwg&S_P}fCCUCJ zB4NokP`u2%GlHeYx8XNgmX=&n;2TqPBh1ud2z!h0HKQ&zuViHS2w00|bK1R_yo}Qh 
zpkirXHFP*gbc{oS+LHI19s9@tY~5_%YpTT??MDmkM^}aSLr=WiK=kfjW*_1hpXV>| ztHKwebw-%DB<+bQA?g@PWh#>+;aAZ@XrIHgGOf6m-H`89wqSu%`9&V19JCh4F1wce z9JeAAw?TF-;{Z~vtx&CamOZkIIR$7Qk1R~P;6(Do7r;mSB-kmzgA7mURmW{Qh!R<3 z%;^ICYYI1A*{8WL~pb&RgpA2Af_h-XlcNeDc2IpJnjwPaf0)oE)bvzGtaQh?ky`}v0)sOPRCQY!s{o1XK~Ay#ggPg z!=%MAIJ&gar&<1Q#OO~)1jBEOxTLJcQD^vzUud|~@oN!piGD^3FPY8XgP1p(sb}$v zEfg9J#R+V5Y9!m(XIuIlQ+Q?w7NOSbv6=_+p=jAz_^kJ6f8tqc?DGN7h^k(~g*<+8x3>IU7{^p8!mj1;d zkdv?qaCgEi1j#gNq&O~}nXy3k>{00wv?98&R?K$kwQ)_-vJxmN5zjai5dl!AbSZ0i ziX&Xg&Ch35U6bgVSfXn>cI5{{-n{+-QGtq5CD-^fYBo0?kJ(q7A%(o9d2oC-2;D0j9?)O3s z{L&lMmPXhHS%oa1VlVhSvudSyLd|NLgbIhZo5*F{(PClJUM&((6+mFr67yhV$sSPy zs8dPnVCu0tMJ2}U+;Lepyyoqv;gOZ8+>GJIC3i$DVS0{Hs?O5fL33VneWTG?1&V?i zW3_4){WMbAXw#7_jhb1p->CN0MIB)$gc~&F8Zts{Qwh_19O$~!{du5wGuQwvp|xwH zWuVY9u--DX8hj#hT|N1p7w+jl`~IijU++8vPKUZZebA)Y-oV=J^W$ZYKQDG{)p51b zW^m7z$GzXbDr|@y1+nAy>4Mk`mguXsJ*?dYvHQ-x2V&$)vF?N5jo_AxukCys?0Ogs z-3+b=I|0Yn0k7qjP{|^F+GM6f&$#9fP{0ib2_u%ODD*VW5O=Gh z%FH?1;?tvf+-pEBaV^a|bbNyBYsBN2E>ntnp(|e!&pTcET4xm3{ls#diOLlcoAHO zk}=0J;vZTD*?ZhS3dgsF0GX@pk|)BzG%<4+Pi76{`DC}?88bm4C=&R0lC9Tl6I-pYYJ;{_%R)VB<{T}<8O>Q1 zu1gk}HS2*T>yo|9lC8llyr|$Vp51f%lpt(gx}&dK3;?|&m}BXA^YZWiVc0rM>@frX zH(}UnSs{@JDV){Cek|J2wqYDD4aab~{nHi_YREMCS&X&)FHCBJp0n?&YI?i%@L zi9?%xCdti0^RZMC0zj%zMUq|BuK(>b0!HkP2usIJr5o|D3I*~Z8W!@7RB%TjgTf3S zC8A`Z7&FNuvE7r>@%ed(&nQ!Vm!7M(J!3G;&3j{L{-sKux$P5?8g?F-_tFrqD2tl< zLwtlvTd0y>NzGCABC;|jPI9jLwSjdYp1x=vp`$_G6kPDN^{~Bz3EA@(&#(FZy6%&@ z_5Q=_y`%X(W8{|EZK*(;!Q-U;`2J|(YmeK$kP&SaqHLRqAlWv{SZ60|Q6ra0RbA$K z&FctvvvnVKX-{mZBC%fXl3rDvKhM&O#<}nOHphJ%+SH1R>%W9ooj3sf)9^&Nosu_g5FAJsZb_;Iw zEL{=i(iF@kKH`nP2w?rT-gVlt3XAq+@dR6nT#4fLRHRXfFNuXLX-PXcd^Wy7N+M#! 
zv-2!oyu?-^Mn^4X^tdz+twkw@5R+xgg1}`nI=xCzdRgR=70V!z+md3r?N9q(o3e~9 z9Sc?}TCS|2)L@-h8nEeGlhQf(q~Lh7i3YSfrQjmnbY$VF_4Y{2x%0Z_;KGXbnH`WRz@*dcw|Mfk8a{#Iow?cci)Muee1sei||Q=4@J)Ka5L97esJ!_IT&*F-M<(<@nd(sY5y8u zXgcs9Jn$L68E$_VZba33xPLRe>+9N@+Pbd-+}PW^DarmAAHvYWa8>3UBmIFfyFWz^ zX6LdP8>%al6N6O6fO}S3HCedK&lLU8!m=>V-Q>SB4ps)AG+0#S(s*Cs)s|wkPF1bR z1?QCo8(2%E`aH{wO|5u!hYeP9sUeikal*~jJq^yXx0<^9mrlUnO3iW&`jGR#BFO@= z$t%RRMf73O(JQ_Z2b|All(JwtBj5kg4l{XVzMWg~<2&iXvLazL@Tq>Ys#%X@pzeFRvb9Ad&k#@k6*A@UPfy{Ubd}KORq-U;K9rj-b61VN*TnQ`( zAOd~lWWCA1e+J^u(YoilD}chQTDh# z$~Dq8iFrDrj<_#_iQIc*(mBK&(nS%gV(wsT0ERkhy+yz{(iA`3@W}e&ykdCi=Oqb+ zNikK%{!>6I{n&`Y^<~7SLT;)bpk5`@VCn7Z%RMDPv6zWL2vW9h0L64dLEvNdqkE&z zO%K9LBC}Ei*E-PLvuQ(wenEb4JyKsHP2Diqn8-A05Ijl#F!ddPV|Y!qA3rC^qKY6y zXzy$sIt)|Ef57^fKfy7rJNgnsn9ikAOmmL}O}SexMvMT{{8_}GroKW$D9Vmq-Wp+N z=VR95etcL`Mx>T@#R!>Yr;iaZiwr0uz+|wA2aS|aOOCULDa_wTZS}ZR+lpj!nCIZz zB+Gw_>^k=qWO-YK6puV5zlR~YVGXu)Z0tHv*mYoISG2Gzy0&<~XMNYv)c}Ms?9taZ zLv44Oe%keCU4_u_!|<+KbsyGkHl6)!@`>A3-+T*H^Xgz(frf)KWo4D>d!8GdI9~Vx_odqZo%3QtiAfz6Q4}nA1MqTEwmlQB3;q7 zr9#)(>V+>u4L4t25B1*p-sd66_*SgY(fe`qqv%@wS_2&St7oBue6Vt31okLDgWx8B~f-WvXHwceWeaH3+6&$FlDW&i<;k>^)my%}2% z?%jHm2Y#Ns!*jln?R)O~(los`gUuE!Yy9V}ksGhBEo}zFA57espvUG2$JhUnxXOe!LScan(>6tiTL2i!O|) ze-#lmuUk~ofsAH0Q%+B&v?u~CXJf3$E^G|;6UV4?6Nh;c zxDS`LXnYF4Hi{>7l?d3H6p-TAspd~Dn%Ja}NKc1;ce191k!S*T3;RO$V+{NP7{7;A z&FIOtvOX*TFj^A)XSCbxHb=rw7!b}l9S0L9n-wgG(27MY2Ql;Onum>fEZ8CHBWyCy zEITU_y>?U9gUk@=N3!@u*D>`|yy6UT8u8XGH_!85c{_RG>th_>^H1DxfgAp3F8E8q z&AYx9IsOg)5!d^O+xH81_>pkpk+AQP@Z2NerANZ(BjNBPVgE0DO*h|J_jO&T%W_xX zi7UXrobUa{Hizf78#P;0^JMpL@q97)TcJ7|vp5c%){QnV6wN0(Xp~#^a$+0ABWlOYV(UK`g;xd~#q|uzNA&2Ve z9#{8JB9}shM8HxolGq3qBCrUc1#)np92DfTK!8O~L4sU3kPRvgE0M9uA-8d8!HeXQ z_o}*kddT70I%FU;tE;PCRlWDU@4c$hzqGd}1^DTak4y0pLHISkc)zGy*!@Q+JPLPl)EafFuj_QeWGM_BB<=e`se2Ue@^!9Q` zzJrzHdb*s+XGDRBbOhu`dS|&S-^IQq^zL#`zK4~QdT+Te-^a>rdVhH!KfuZ~I#7At)CovxD;XtQ(aohLLg6)i1IE*;I-%F9cBN`VRp)uzCWc)y z4NneKc3d*)EtP68u1#)Zvv!b;lf9sug_~BO!kwn*#IKW&s>HGjTm}5eEDpuptDvu+ 
zf@O)27d0Ut(nKw!g-KY8XwhP9Qz(jB>|QD#(c+q(s~HU8H8q95-jMI+t%ZP(EL^mVXI<9 z>xJHFXx&aStR|%OK~1ODuMKE}B*WxnG%lgb?G@98piMVxb4VN3ULhG+#|YH*XopEJ z^yvf1GQ9V*Hm{NaE}cp1B7-DOqFff2?8|)(`V4Ifx5fOS8$x!}>DrGUr)?FGqZAl) zXbe{2L?|G^N#9&m3TQv?TN=I7f5o%U~GTa__L+=ZJEVll(0@~jDLnJ&3@!C6)jR-wyOJ19mHm}t9t3^ET1J8_V zF+Em}--*I(qAQt3E3b4Xwh>$D^4=PM@5GHm-!V?SF^=tH?M^%-Y{Z!^1Wywig5u9} zO~^`f*)Z*fp(ysEz3_07Rq$LxwM(ny()tf(!7WRwURp01OERyOU31D{JPU*h_&{;G01q}*4Xa3~3kj!p ziP%b!nq>uCo~mATjI)OmDH%o6nLvQ$w321h(qh#nS}q_7%l_LzB@J&icbli{Eq=Bacqn{o5 zhjcwGBnH0`LWyKOCS(rnq$g|X$?f#%ZRs>TpZ3Z-y(enDCw6*gYrV5iQv+K+*bRsJ zlAFn|V?wI$%T(8&U%7u}d*JQQQ}6s|>c`Lf$9MYA)cVhCUVh#+w9_?R>zckl^B-QT zna@-2e%Up#IrG)w*B-w0IPzF~a`>&y$QRP^E=&)sSLC4pSB&K8Ij9sd`rl-DX(6=`oU34uBb~y^QAza#cC0tpOfd8 zOO_1(3aYLXts%r+H98}%T3#ejLr}gi#cNUlpy5!&P#1L~tJa!P@TaI%DbBQPYDAaG zZBnSB_vVtml=Gm*GyWT{bC1I1u}jUt_zYX6H@Z8)T$70Lac-Y8vZ`soG^U~IG8n8{ zhEQ+e2uud*Q8s}Y7^O7kG-ukRc~0}7X`@tH7b7P)oD;7>9?2@d*?} zk7CUvJk~w7oAEdke5^^xS?qTh9(RTMh>$q88QGDBYtrzJG+vX&KYP0-UHoP7iFA>U zK^rI8n12(ZabOl4&p_e9L3qX%dwv_@BR5fW(L5hSGfSrADr8?evj^Z3O{2`9qFNyGt&*;T*=2}$#j4Jh#MUvnxHicb1O!ZU2t~$H00#R#p%NSO(c}gA^0G9EnOHFl zR2d0S<^Kl3#i`O#S3RawU9*D%n@KaskuHvWy#hwiV|`9-&6o!Q%$L(djvl~Va{PX@ z*Z_k(2H8h;FC++GUNaGT#*oKge^pq{%@e~iy}sOW{9wS51#7KR3N-0Y9T zSxjjjrpeOs*#0dQpKQYcJybM{`JS{5HU&5EXuBc8_knh70SwB4n2k2jPp1KE^mp*V z7!^Y`TO8-qlknP_Q-cEY(8@wq22Xf>AE!P|^=(NHC%+Cu{_^z1X-G{HN#+^_mkFVx z07-$%sHdRtGK~5Nd&AG=Q#lCvAfG}!XUJ_<7K`Q3M9GfS7qAN>*9WHdhI~MN5$9BT)BZ z*UeA(JY=>)e>M$V__T&v$%+gY7MjJ4slN-wmiBMz_z|?S`rrZf&jgqdBkC8@1n0pF zUmejDNzO4Bh0TuSZqWbSa^F!JKK@;?I%cM66owQ0?fVpIt18NSYgU1wT zW{4uopa@@4-jwAfVz8tLO=I2!&InlLI?H(X(Fw$X)q`D5AaM3XeT3Sk--JZ2FC)8#1_95B}j=w86U zFood}5eSvp29w}y=FtlnLSbCQi#3IUEF78XvRs87=OWQfrX|c;h%HyBxe98>Ohs0k z=7SkFIuN8HmJIy2Y-+&D_`<%hyr)7-@E6=OFc&X5OtI~$J0UNkzll%`M)cE&;KMB< zda(CTu-7mL@UW2S*-4Mq(xW@+$r5wC-DP;d0g2CNu)o1M z?D^plmv=yLN3^jw`!yg;@hCy`8105&`-dZqx@>s;SH2$K0}UhTpr*mHi+C6(&%Wzo zxu}+K4+AfH!0vBA>sWlmMiuen?$K78+=t+3kK%!o(+yiA*dLeSL>2gsGe!ZC;{75}V94P+;sf@>!5(f; 
zJZAEa#x$`Q0@4eqzY3BbLZY!VGsrEMu{gaIR0A_{v?Lgm5eMMMHx2A;3B|pkpAasR zH52x62(lK73Seq~w(hD#9=pyN*3HnoYIAV~PPrSfKzOtmLLP%Xj+fl|J5B&PKw{0G zxQ8uI>E5Do`@}fK9ksEiZAshiGTm)0H{AD99FBjW@@`>Z0bWyeN>ptPl4p*L#@a7= zq&9@Efm#5=DP`5h<9QcA-uCw$fC4=akLK;~4AiVo9)SGT68{o~9D@hM?IQu&J6B7e zdz$Lr?j3_f9^Zhcp9iZJm z2}eA zc!vBA%$L54kBjiIa2+Uw=f>~B+xoE<3O`8d-=T~{@Bnqg5ud+de4{44@px(b?dxAi zH{4vwLc8O&jAF8Tng5quX);I~7pf&Vn?d~hTRRNO>>|lM9|8nhHmL-Bo+YHks!>1% zd*?fI*XHqft5jTrbb;uQAVBCx!Fw0gfd6^jGACHr=gBXOZsJI<&m}(NrU}uw43H5Faz(7!WaTEfdJ& zykrxyWw4Z+f=zha7~B|fet;uJ%uWg13Va9gvOE@&2ehgeAOHo61Ob7`riRI;&39LB z(hPYp9x?Kvd|E&Ubx#@%6Z26%fxxMnD<5(XBxOI9#$eba1*zCu+*lw1#SyMm+(XnW z3-fO3{-H|MGTRcum?fj2SKYKXxF(J)t7#m628(NK{rrxj;T<0AGTy|!Ni{`;D~KG- z7+`jspzwfdeY6Ge_R9HZ?D`5kfS5)CpgUPhpM3oLzbrjTU*483Gt^~=@zMy--4w-1 zDhfC{&=izYitjJIk$7nvKJZqMqJKFN9d4kUh*m1t6xZ#vyEi@sqU(wR zTie&6mThg{;o89)K#C7sxb-K1_21kR>Y>5t+2_5l)}!zjE?d$g^*EL!VJQ0|fu*zQ z=p}J`@`rT+3;)#3|3Z?u%PQ)xwMBpUS)ne#pU2uR{;3c3Mw8Eb`s-2n3o^S09!z{X zQIF%BBn%v@C$N+hy8G&FSV{@Wj@@=B{dZ1^PS$%mqBG)CdGz7TFQj@HU!EsAHmy%i z+`qkbw{X;*H zT7a_z&C02Z;2SDH7te&99@fk~{%O8ud!=av@2Uk7MFZ}$C{M?Ryf*%4R1`ajr+CZH z?_c3IHs|sH+q2L)_(ahcnEL-(;HEf=9i+3|&Ck&yyn?3))&e~0VNn#njS1rGzZOz2 zLLo8qLK4K$Z$l9=^tAww7fC@p@=SQ+t8nI7G6Rg6@ycu^ucRXI{QRW(vIRXtM8(gKN1 zQ#B(sQ?(PSIo(vT*5!-JThH$ID|QM`a(&+k(7X-T`dT^$!0LMvfUz zTk&-3x~KiD~R9tl+lzr9@(?e`b1O@$Ir_c1q$1lofUO10XnoG#aQ7Jx)e(2TKqnBrt%XWIn>6y569&O2I zXVI6iRq*Uwa(0e-TkdGbN(`ZoYBD~ZJUz)?sr+a%DNoaTn*Bq`5K2;a2Cv{z%%@j7 zYI;GLnL<`&a$MCTj`Z`%cv97i?ZR_qS%(2AG_+=Fd zK^A*Hu3mIJH3rzD7oC$oHz%t}Jvc@MMkgh;TQ5}PfB#F^>Dym%FWp^8^0lNV=D;Ntw4GAW6duZZI`hvHOxIu3a4 z7GFrn084QeD?se(k|tGJ7+pMj*X1i+EPI!cUA%fxo)!TqN^)}gf_MdxuFyzf(G*Xr zlMIH&W!=xZtoxGlv$F0@$kTf0+#JAZN*+{{87vZ|8HH5pzxdY|dy-SLJ+qUuU8weg zob0+bw|D=(%X{UX#N=4d>^%BE-9s()pmTsQyuK{;T)r}zRN~X>gsf0}++1TlS9*IA zQ>@dYQ?qt@_w2l0p~w?6%2Yf#Y7Mfo9ewDb&b)@;hOk-`Nf*_uhRYX&4@;_UP5yG# zgOawaP~b0IE$;gb>D~SRY9`$`ln$I-^CB%>()KtY1WFe7f9VTLp7)AsAC}c+3p}M~ zJ!@WXxHv0#gT>imAyjqiQ#TK$eVf+MhM*U#vT{XMM(31-UXq&n9CealUoWAzq&uLF 
zaRWtC34zEb1Kr<2BFLhLre-8LVZ5{MXLovdHV#aiu+xhH%L!>zolMEP z53_z*5Ahl1W2G1K`K0G>dzCI!taMYrCYA+t}?EEwCjM=+*+=j@x!E(C)~*T?=eq zD=i8YW`!buq0)(Z|31&E(u)_wyq4ofd4__W6zrm4HwAkr*h|4a3icyVb0(+Cm>j}f zSRdxh97M(;%52X}%TIh}^bm)kiy$+ysx+d#9O^;Ki7mk$h&Y#^O*4|#k|VqhN?|O^ z^8z0luR`lb_YoS`{bTb<8PrYbLmJ0~96}oa)4&cUj^h0pP0oz60?JWjf0c?b z3Zcl?d|!L>TAVVoFa>ZD2-p7oR$x;q!-Q&9v>X;e;zCt2$MKB z3pNQ0mkN!>C$MBkfg@(d@$8WJ^3fB+$TxcY?77j?XO9lB3i1=FmRV(HTvk@21Fx{6L$RP97#*FCPsyXBdhzHe5EIxh#6zQ_pPP#(j3-5-qteVc zl6~^@+?4K%&n0KT;n+RbBl*41{j8a{1vXD?&Xs=53k${(QE45$A$(Zi^9QmaA-efi z>z&x`*w1gjacXswcq@5l{`UOOe?IN2Tn$w%4!m{Z=83PI`sqPl8&yPgSeMs`twHi} zO#O?3?3I0zhnAM?mkSACabLjiC9iSsqkBwmksSDr{~aRNxJH6+rHXO*|&U5Nz>+DGf{VM0_rh9G!?0pFgih8E67hB?qfP%TTWboLJOH z;`OO|IZGwB-6){yQA%PA!9_?H3^}QKH6@ztjU(O9mc_Wa&}l?z=IQ_1pCACc>$!yA z{A)oNwUUs-S>8xT4rAU0!NCT5+&BpnLQuwpJK+Kl;i!ceC9u7t7BkK6ERKZTS!{HY z*aLQ5sxV{rFKV%n*1Qla7~))LiSh#SD4(K$@FSZag~*2TA_Xr|@G=Gc6r7~s(+D63 z`cyeFp%fJVaP8Gs0wgg2TJH-M9FGsn-Nm`dmwoMc^drmj<~>s<7`8D4E@%``lt zH9WKEdl0C5xVe3KQzq7@#rjq@AEKn%)h%1;S6ppwq2I>UEv@{1TO0jud05qvsoJAe z?OF7FwV1E<7BrJXqNvYo#uY?Jz;T~(w^=4Z<(fxn@K7mhwor|6C9PR<;}$Q2s6&kL zq(=KM%Hx+wNJw6k6%m0rKBp)U7e!-W#c5fVRNR4)m#@g3kWUf`5_9HMSz@S{sDX{^ zFXZ6*n4ixcn@5^BK98pnd)uRo;KxZx%UD}L@w~0nwvA>lH{UN(HN>#o5V8TGv^*1P z(?V^_t(lHv8va7ZJ`y~o2Ua7McZzQpKWN;UY22^jFS0-D2fNRyueD@QjWsQ=;bx<0 zvAGDh;lJF@rU%+BOpmohP@Qp^)S6Ag<)=@=B+WovninAsnR5YZ*lA6Gb5_L3=?%$d z%*WObpN3QDjY9ZD`6&d>nbk^;1P!KoHky(R3j7jPO#HnuH*cN1dGbMB zPp0mOhQH7enwo8xnoMMi7TK~A*=o;;UPiccTE1%FTz6ziVgt?5HFtFBNtI&J71&5~ z-r(&#$}mdC3K$S7W7JKWcN2v)OimMT3-u2}^oHR*-t@gwZp$$njERSnp6#sEmejC5Fnt%LNHo zU^aXakT@|nJx-b?3l)r$>?d(mgTb$nvVm1Zx(mrfCTh1;6V!SP#hWuA=g&Z0?hH6d zZR(j%HvG;q-=+FF3l1a^EX@QO-wQO}g+RFDj{=Q+^h*iYxKsg#!F}*a>f8t?T(XEL zG8xx`=N$vLcm(J)3sSGY$oOJ5I;<@bZT+1Nd;ue{dLc9z^0Z`5z(sU8X9+|Mq;3sJ zs(qtj&(Ha5R96iGtokxG*jyTK(*kYFrSAv!tVg;egfd!KhT($20epuWHT*>wAd=QF zyVE{%z>(LRlh>k&Xv?@9d3`x~U34styneK7@iS&!f`Aqa@x(>Hf+t0IQfB8hV|Wt4 z6N`H?(^+l7Z 
z1C9?2_O&ln(aCg%qc-c>m#XR%*K@ResX!;`8Iy9K?uo^MoUsELG`KsGnq3(?#uen# zA8>GrJgba8k%Qwo{tYTtzlGq2@Tjc!UhA**e77gvdn_w>`aN|I>l&7%Oydr%amPws zZ@TswB<~a-id*guWLkG?t-DvmJ?W;sNZr@^u(fU3muWwuwI5k&J(}Lat7u+r-kNFN zt2OUUi~Epof75EymQ2%Lt!Zz%ktMXOwrtC^^l2@9>E=Voaky%=s`k#v?U7rj?|QP~ zqS7r{Z}5qHMhkk+aEg-)K^k2)r@(a*9%UM->j*aFTBuqvUQHVS3OHVt;SW(-B`^dH z3P<0%e)IZQUQZhca0s~IGgP~XAXcXgP<)Vr;|O#wD$v8Dqw(qK8B8=NfQo4%oT_^# zr;`ea3Cdv#7z0oA^~qek@>weD8=IL)a5f}PMI;I?P;i+7_PVnaJ3|2}<+u!fmSPml z;J>OP_!$AZ+w#gUoy`^smEv8m7HwVewXRk)E`46B*uLW1{$W7aw*CID2V0Lox4Tri zH2(G472grigXWgBuV!h>imw?pG&ZGuRkzd?UnA-fn<;B_#V4+|Y)Sj-mikwGTaezm zKeJ^(+cI!tFkK?9_y*Q|0sr1?U7^4Du5V4iPd0*Jxsu1LJ^o$y`_}~gWD9-%-uuxt z0YBMLk$>-_P4!Ftx98XVh-Yhx{l!aL)&%@yYdrp@rRZZ6YT~a=FFTI^)@9l$*0=y+ z>WOld1roD1LO}O25uU_;rY4Y!B)=Sx3gw_wg#Q642uoq{8$J`_pdzqy(9=Sx#PPIb zJ`^j}Yx2R(G(WNR4>8Sue)#OUK|OG8aQNJb!IuVMznfHGI05wN1v9gH5sU&%at|{+ zr9yHW9ltmSjd7UC(j0q{qFAA2vL3$82E_$&Xa;6Gi#Q-RA#u82`^Su^*p{OlRPRHM zyvs!OIr*MIK7hbo2*yG|3OuE0)4c9o@FJfJ*+?Pnx^Ka^;Gz82{R@7{a~W$xNxJ91 zUbs-W;74w(xD3U9{hUqpbGbV{q zByq|xyKclOi3S{X7hAO)RW7JpGs$Ze%{&LhVF)i!syLO8(mD-Fe)s+XT z2l9Ao{+PtDxT$~;$z&`tz_$9Qv|?MCE!zFmy^zz@64YvrE`GU2Jo22JWF~EMZKh%yp^5JNtp5=esy{#g zRe6gbZh06A-zv&PTD3^)N@%Nbw_S^DUkP=thN78JyB2C+KC6YET`Tg5#p%Fi!r;%` zdoHubfgEC!_8%jr-Afq#Z1jkt!C%F!TVd%HHQ{Y{IH^9wZ0`& z-=)=eL4Wix+PHlB-4hR@LywDm(Xz#UaKELMnNYVD>Rwjwz4-R$AB3J=jaDz7d=!py z9vBPWx-4|{8beoDK={qSO!TaJ=7~pkmN&9;G{Vut^#bH^K?n>Pc~U?EFnR;9)FPP8 zy=L9H=5aUFW-i|aFCi*c+9Dh06| z#N>A%Cf|bpJB1cvDzXq0kS7q1Cl*M3WG%%Ugk$QT{yyRyu6Iq15dyT80LJPR!R4Ep z02NLM2{<>9Kf@F><&%a>0}1tZK_{j&WCD)ly2aFMxoxnlA{RqMyY>sryi2Ob__PS+ z7R)>G1mWV`hSvA6TU4$Au$0#+xPc&6ti&m9>x^xLd6XV7t~3`-#)=#m@@FX1KcIji z*=CCUECQ7jcm|T=2rde&bN^>Ex?Ik7kn6k z#p!SiE)J3Ct-YCOn-*KH+5)XV)$X)|I*n<{_<==hre>Qyglp1@1xqySwDV((CZqqMHCAN#oIp$ zBDUrg3M*Ld?590u)TJP@Q$d#fL#mu}=+vKa{I@O>J+B=^($7N5GkNqyVZtMM-Uy6% zr2^RpLzfrEEx+V@Ll`NP{0NJrLWF@uXf{WJPy!U`bryj2o6a*P@a32lh~uUIYdaA9 
zf(N#5{POE2NXA#eHJXUt!_7TrF9lOO>IKc8n}w4Fkh`DGCMRC>w0NfU3L@n6La`KZU_$pr8kSTKn=_+tNRYXT2jh}QjeIL4S)o5xcmR0 zk27?^1^I+?DS_WAy$Mz1 zW-YKe-O%-Zpqn!V$YNdAY?9>b^kZ-*{w85IeHaQj+ek0yrZj-77nBAq8$%Z6jO95< z-6ka+W(?S8*PXuGeJkPSw4u4q0YIYQ3;+WtbO#`IeFJUYfbc5x4d78cZmLu001MoB z7ii21-2I*K#A`e6A&-l&WeyPX`rLE4aUjU;|7OrE>k;03)dyXlRCLJ_mGI8SWvhIN zeU}0hVgUz-QuXIJ^q9*wM|&6Q6fM!%(&cdi+n2&a79~q|pE89SbkFA2qEsQ*Pg-naSyxmzg{CkO{@m=%{RYGkDH%?onn@>?x4JG)hMZGJX!3 z0nnRj7elKxo4-~1jnc*ES3@zL@Dgs&yYtcX72^!)n9UOhb>>(39T0 z14^^r#)pl~ciVqu`?t5RGnahp6x9S+x}}AlLldJEKQFsSd4eLhdTL zhP`tEbH%TPlf5ufhjhQxV5B!9XT8)UiEyj0VfEKYyyneDZG}<`@-(nKTclQLD_r{v zrEN%Sgp+&|$^_w}-;R3`?(xkSj}F9JSpMzu7TyC^^JX@d5o2osYsG4#6J=VFYpc{{ z^sE~x+vGMjgF%cV&+n6Z916a6lrZ{>c6XpeOm1WI-n+gny!@2UB>)Bi@tTU@iNm2 zrGrT8$xG`)+79Uu!HwLDaXpOtXV_Ts`y;sD$r!F(z}Lt0iuGhM^~c2|36N+5w?Dp1 zfT^U6tXx*=GOVB?xyKqBsljVdG*Kb=3aetf!yVdirtOA3PKg^gF|N4kGA%9LQ?uO{ zXVfHDHBpR=ww%*};D@<;`ybPwg} zCoc!AWtH8qqVxcN4RCJMUxHYlXb0Gyv%e>l9&c|F^i^4 zTWmW8$P8U<-iWH^U6+Uin+KY2Qj8!Ucp=B<>RIFcZdKGpqj7~tcJL+6_x)=Wa zdJ$N8=HMSNcStC|OqJ09>A~~ZXf*KR>A`c%1zj&+=bCQIE{1Md)eB!fd+roU9Hbh` za{Z;T0V$rC<2x-J?$#vI!j3vZ4fjyMy6{h^f)Zmx0u7FJLj&V*rDwbW8HfzcMUIeG zzaFz$!WraVK`llA;p}lp2$nJNY^naPw0!nK;NYWhT_)V6g}c(>{prAdgsaufnd)6y z^{!0yNv-8#mO478ZY=>4$itMJ`*Jv_m-NU*Kb8YHoDjld@&)Ql&+4F>+7#P;ouLAVWle!1B z7NaRPX*0!Khgl`p2guOmu?UqH{V!wIjVp*44b|3j5F4Yo6-@dbyj&x z9c$ZS;K>^$%EalgC6nh5vA$`P7W?{)sSdYw;GXve%>>~%XzVHlc&vrnv~XKGyfYox zi4ewgBPGzr=0^{jv~vSbB8g3vt($Qdfo+i!DBp)7d6ye$v;|Q39Kn-6^@ogUAw+Nj zJ0B+ak#HUZk9MituAZHEH0m#10C>&Sj+0ME6Jp9whX&?LN%sp^c`>E=4 zj-Hv1)#f2=axCQgI(6UVG<+Aoi{YYeWc)owt@cc>r+n@C7)bpea@b$pc| zg2Zg#$%qgPzg2RxBwgA0Akf9U`|7noJ>PM(JpUlDe>G6X;UbJ~(FRI_MrWWV4`7Qca3jq52Tp1>2Cpu1b} z2v4cS;ylb&As2YQQ()FW#_>z#_A@hPa$ned?WKR_yOLV-x*z`w7rbuX=+HYoPiFIl zqDz%#UG}f!BmZc{^Ja9xe`%AQZpKiGeB1Iav#;Q93pXg`!jsvh-aP3O9K4yedqeWG zO{u$*jb?+Fn$4K~3ogxXzdcZe8#lJq&Tq!xMo{>=WwZyn@XmX;2))oMT*VguS1@zX zb^XdU?<>MpPpl~ZCjj|klc6|IR$%TA!PO zo4xJ%N#iKaKOWH`pwmfAQFv6T`HBOB{ZFP&{@N 
zo4y7Iup5iJ_#pDB7YEM|pBJ~Pv8eLvD5((WF%=WG8qX->7qJ(APKxV=&>+h3DZR|t zbwEC9bnj3ZP{pLJXVJkUPbW7M*`3(yGH3baYi(~O5IUSFxb7NE3iDQ~` zP1U^$h3N?8{w)gLMW6>vHb?hkkHa*0YEwZ9b*t{3nN@WkWl;mjZB7F>1(q`32(~>9 z)Wxykq~Byqx^r@ToruQ&K{a3_8imT5OteRf_N+vE7XuH&wM*>}S`R%4A5I4jr^APT zSX%Mc$jy;VX`@!!_>H-5rM{lZH0{%x_N_D>c)#@7hn2N=PToG5sch9MTko}|D_d78 z_rjqeRQ}fKo2TC#`q(d&9fWBMO4*XtvdE(MQRU`L<<7lS8rbhznl#URofsAgu;uTUn!|yioaLV z^e|NM)~TDPRwMoIe&(^a06N)1w*7*1zL^lY_pe6km(Jha`Xlx*D1^k1>(Qrl>9c>$ znA+cN>>b$c{oZc>z@Z|&*oM~}QxC?9bJ@iG++j}C@nI((I!4C`h_qcDhGWKzFzzER zs@S1FIWY+Yz_8S}>|**hZaGWUDHUb;f6A z=ZSG*Xe=8nf<--3^~ByV<#AJa-_zu(4fbS$$^s0 zWN-OIIBAeE+t%A-CJBCk#koM?PLVbQ1FN1K;$wn=L3}N1}yh&gXJ!ai%uNpBz4%r zVK8vc6~Y*3369YD$je&dag`Oz<5|HEPl6k@h#B&LIuJVK)Qsa*s}fGsoa@C&)JzV2 z0>nuL&E$K*j20cViZT39G!W0FiZeJx;i`gDn-HDgQUW1wE*+Rv8u*>Pq1BsV<##Z; zL(G?!Z!9j)V>^}aA%pU7DfnFq{v8G1r{LdHzzpacGMJ35Y{+9PnOm~WQx z?^Eyx6#NGY{t$sm$butR107i;k}#YrI?BoMp9mfO7gPfB8TmWr8&}u!jho&JxBspA z#??@SI|gJzeOjpR{^jrQOuz8r`=OU$(+7)Ko6jOPdB4}>A80SqBOIW>+k-zRSBM-& zk#I4?$E0N6NFD2@WwZ5Va78H~RcQQGpmYnoAcBF71XyyteHYVhT3iC#zXyOdL7f8VrD3YCuPa-&r$vu0IYkj#1pw5IUgX|u>riM;BZ??Ol^zxqx# zaD=Ly(;>ddQ+953UF$X<$O4_ki>k8p-dNkzqmD1XS$JG6WGBW}BxI|rThh^z4O z`oP8&%bR0Lta-Ra*+sD z8rxw51H(2rx_xDCF~r2?95ICCVMg{|$Xm5Iu+U@bgq1olsHiUgxwYRdj=Wi9l{A$Mc9r2IaT4PI6%Fk4&_g+ z@L=KKYY>_l9KXPhqBL|2H1;?kHA#9doLm@B*sspT=KL4bs|gTE zdu&$Cj_pz%`?mVcC*AH@duRj<^$xZl82<3qZeo5q5=|N%25Fw?L9c6GmNeapnl>_a zIXE@iOZf=_8d4_Jo_9GMg*|TWKJS2#(q%3oc9x4$((hAwr}A8FXoiv znTJfc!TY$F#>@qZ5)s}77ER` z!MOMry!P=%3$~d+cfoKsC2U;*qOIBf+m1KnsGnTe+OrDNI#B?AbC- zS9X}BevEOaYWTI`3|u9efES4;U>d>^Nama74j`}O#01*}-YH@u+Y~5(!H4G1E8|3~ zDP}^%3AY$A3k=N8h@u3#uVA+5=i?G?biOvsNv^$#@c52=SWMK4+M0SfA-UW2zGU_iwQdox z!{CG*TOhpAAs0@Jfwt5c1}zM!FfSME*nr@g2S3JJPF|UWNHVFaV9(jc#w*;xB}N?f z49V*pJ24j9Xe${<-8o%jma6zT6M<~#!1;N@T+da&evYTMke;6H-<(trh+CyjaR}$O zp$#rGVOX|Gv9R9A9HC&pF%E7uP!gTYOouQizY655*F#tt!@-^0e43bMDtwlb;MD$0*cTw6!n3spo%JsE4+ z;6_qi8!+hZ+4qBpsrak_SibBLj}YCIso1Sm?7nwlrQ*X 
zo$RX*#J>9%GKbD-ht92t=V3I(dD8(VrMGHjt(mfYTG_t)n=<=PY5PyDl%2*_&yuo5 zY#DEOR8n@U`OEuOqxEEDjojLmiN>^OEE7GXMGvh+kG#7p)Bm!Dzv#ti)uWoP=M;wZD3eG+ae+~O+mwX_c33OUij7+b%F`;xp_&MHGy67x$PfIS{~GHj22@8+ZPIz zh$EkiVPrb&cwXt~pb=g0jarQEJQ&PaL3|ltYGD-P=rO0a9t_hsXM-KjYiMtP4B>d4 z=NSX!xbey%Y`uX_4_MRIml@cG5Fikeb&Q;$8!^0ol2<3kL0%a90=&&+f0J{vsQ@QG z>~MPyR!mqox=supEAJ#ZCR0LZ^jiaxM;Hz6kdQ8?>?-_Ezyz{(;?UPPJa~Asc1@68 zt>#gf&O2uGgzD@u6{Acv1wG5N{QpE)NMjt@bHB#Gjj`q!@u_u;k1?_~X(0}X5MJKe zbZ17*F(nSoU_z_`F-!P9wk;f9CDONXi7R32Ikf-VFj z{2cqcwGRT@fsd=;ztf>rcC1u(E(TXiD&Fe5+4o>`Z)S6!wz==UlD-eI^}gY;S18|> ziR^hVvge0%2<;B7YDcyR=a78$41Pj~I5v)0yx%Cd8RVE!EbL3BEH_OsJ6FUhz9S!9 z3zp9&6PI1IOSBs(N2GSa%ei?Rg1S{Z4oDinELJCrdAs<6fr}VU`-B~BkzzK$gz#+m18uM4h0r6XJ`MheC2Z9KThrV2+((Y1KLo3G?JIpt*rc+VT;t$` zm#*D=zve;c*rV{KMZ@DMVvV5N!b2CG<1wN9EQ!6$-BewRcdBIbI? z0TxFaRAz)bf!5fvA!1)X_eLh~36sn^d;upE2qccd z&y|CT^?~UJuE1owQD;dM9-B;%(pBNZOsSvSq=L`}`^MIo#CHrmR2?+48$-ISrY*1KFRvn*w65C(0zZ}PjDLtj3 z$0l#bxVx@E_;vC!8+EK(WIL&z9~$T+1=7&-19ra%OzF5wQyph@9ey3aCA1KYYQ=bt zA2eRrt|fJfmYmr>3_yIZw=pici`o#+41bmQ7`hJ(31@q#ve;ojbb2iDV0v&EUlAK* z-wi9!z2ga$f7S`}Etn?+)<2=$RJRM@JR$#|5FoXpav=U1_@_UpXw^zuA6C>u?@>;N zT}8`q_7$AdEwj|q)>$r#Xu=QE!R5Ha$mS>Dpk^bjG)Z-ELYS6%;D;hl=D6iq%ZI!0 z01sMIW*ATn=Mg-iPb-k>4qhSVW<9J$tA(*!c z5Rq$!FK%^zB;*MPXhc^ zRg~tj@o&HjwJAtY~#yh2+(y?~~E>mo~rd~}dUrtitYmHrF0n@998kzts=Oj`^-Wo7M#$%7N+Z%~D+xx$jAftA?zdMr7+mH^}p zlmnFv2%E)BeXmyEo2fsp)gND}e?A?kc|_;h@6wufWtyJVnx6fmroKg-rP!DWx8kJ9 z2km{C_93l(=)Ld|{C*$S_GW4awc5eeNK?AG7Y8xal`WoxhAL8*4sS(;oQp2g`5AC?OuS3%{7WtR8H(BBp?y_j+$0(lpJZvntZ9u97bCZ8Kf znm_{22v^S#sQ6s~nH(LU{`b2nqMY6mPXT54QlPIc%71gm0Bu{o;dwKGJr~x&G4@Nt zU%~?MH^1NtxO#=D@{yPs41lL9e}yV&za_aohCWQ!ELP5}Y4X`{0gF-i?u z)Jq&%9i2!hKlGsf z(Bi4p#+FPY8L94QG{V!_xYj|mKXf;%y3GLTH z`|o!@2%Y$0Rad6!kXChw1NOaLnVrvRJD&r9SC=gwCjhV3!p$~-PrM)5{b7}``7jq> z2)CM-1=7nuVUUJh%K~M0P&7dR!2dNt_$+Anbw5B;aI;+5pqX=wywB}?p#?tyxmi1o zkuP%F_OlQ$b1-X}3n=q?t_M-x4TK%%EGEennO&DGM%**vs(hOMBqVi<_yiX55axDUGIBXtc!rZ*RoHGevX^KM526bC 
zuuJXIFh4XCXJO)F*umn}G7#hC&)(mh4jl(Fqhq32$y9nr!&(t#aMP{I%q zph0sR0@^l%!O0iQP~wUQ=bKqfe~$a+1`eDH1@sc`wJoqunV2Uk86QfZA!G+eZuJ>NsO5Fiaq8-mJ4rD^jTB!MMhZgE$#{{h#asFwa-O2zecAN(K9NmT~AfE)| zEM^D?kZF?h{5UXjD_~pQAR9wE*@@de%*Vb6gNgN^=v&!ej5te+y7s&Herh(!ZJUCZ4bt@fE8u{`9vQ&5B`cp5i(QS6fQ-WgSSyU0eCs9q|Y+ckJepI&Gv;7OlC{dAl=PM0Ww9 zW_vbBv0@=woh_kQNGL8_D@E+1jvD{Vp6sp)e{>a}uZ18V}JA9Vz9=y|2jU%4De_x5Y;1N1o} zq#X8lKPs=w`tgg7H8y99D2CI~!)RgnW@>3yI^3iM#kC-%A1m16FT*E}vtIh`*w24o zIKM{s*^{0sf6c1cll9{FQE>B`AHUh|fIpnxcVbPzZ?-b(uVd}u7wuKnEO~G5r-7*B zZ6PUw>wBYtNj89#*qsU*e^y$0PTEp%&e6GDBTOkzUocD;o z@?Ih<;OAZS`@_GT`iP=5rTkg8HR!Lr+rQlU?UQQ)qBx46u>8^Xo)5hk5uClhY4J#W6f93eqZr)!kso(PO>FpBogw5PFvBn20H1Q&bvJ~Mfh;HQozZ%Z zrR$Gr)&1EbdKwU_YL}w7Ph^91S1dGdy?cJSby&(g=h_@*~s0Dq3q^{DTYq@r>>|s zJW*w+RWCJ5@i8DyS!ZA4>$c8TK=;9BVEZD*o-5=PVwm+hw>RK+8D$e?*yWaC!$C51 zQA~x$Hm*2Sf0e8zSS#0`Y1UxA0DEm3d3KcSRD2pIKq=f2u--ine`VTHZ@dcz$EpCT zYJNrvJC4>KJ%iCwL=@w+^fxHR$lGCxG4i&ZVxObn9SROmKrDxHgo2|8bl)JDd;X2w zvy^y+0!FGB-C?#b(ueY`+<}AqWAlfUqaa536V%Tk%z8Z@&sw4234I{6{;AOZr^2E4 zg+qTPg#K74{y;eRpM{o>3w)k}4+R7t7n`vk6$_p{9|&C^2-|+(jeOuc_JMEP2fn=@ z7x?{!YXX8Fg#`bu4}8yl;M@IyZ{G*LwjTsHFMWC?xa~$^Hc;xRxx01Qw|xHX;Juy8 zWm?PL`>k3FzDkG(@9z4(_qX@2Q8X)tJcX-~Xx2-=EnDf>bsj%>oc%uXH?H~dEZg|h zjP*uFuV?#eMfELwX%F8!#LXjraB=HTe`d{(_~TH}Q?oSitrK5A@mN3b zo~EooA@E#5NtP!Zr})POuX;R{S%E_1Aw?e_@z!`6)6Kgc3;2D!XE@-&R~@(iNT3MA GPX8BQ#G@tv literal 0 HcmV?d00001 diff --git a/distributed/kv_transfer/kv_connector/v1/__pycache__/metrics.cpython-312.pyc b/distributed/kv_transfer/kv_connector/v1/__pycache__/metrics.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b65dc95a2ce6a7fb11c20c8ecadaa8426f6a737 GIT binary patch literal 8692 zcmbVRTW}LudOqE1-DOF>0lq=o7_bbMmxUQ-7#3_`2WAFbQ`}~w$*gFrkBy9Od%6W< zMbx^s*pw%k4NRqKVQMQ`Qne2zkSgA?$AIpk$Q4OqYiU`FFf&6}R1J1|&l6kc&hox|AVVDIsO$ zge>8`oN;B9gpze9+*wb;!~0wrZ`POaW&H_%HjseDFi*(@v!O&N+m-0z_wGzM8%acX z+mnfAyA$2K?alOLdlS79p?>Nsc?qEbtP_3IP5X7<5!jMUs7LqHzK4Mave;o_3-tBU 
ztq)+ISQ6W)B$HD_58fnts5R?>7$k`SKGp?e;nrAUkoQHPFY5FS@xE^8>v8&qmwRJ< z|A~}hvgOw`Q%hwu!?4^7X`0bvF3UBWTekccmJ4(?lhzE&i_J?KgCXyUd@*NIX1Pyk z#YJlQPp1ttzsR&K(2Bly=>pSo#sXz0^0|fdq7}H5$z(-4CRsi2U4nLwrp!EhTf=%8 zdi$?vM)K;lq-ifrF0y>F00Y5vE^Q_=`Nc(eUf_L}&z_|wOQ)>hdiw&*bI-$`^pwpS zesRQ}gWEfVl7ys_gbd=B5-wo0LfsE#ht(bsjFCWHPa|H11gl`nn6|0Yjq*d)b}o_ z&NP+J+ox)3DxWj8bdEAUr^OVtKpCvbYPl5Uw6dHPh6;4@<`v4Wr-61lTgWq0%bBW4 zGZd#Rs|&z46=on|!TLBSFq_G!Msa?M?}r*{j#6DU^XfcR4a(A5CSB6zGgQ?K)xdX9 zHHITVn^U}=XIEj5W}XuhLQ)GXpQ45#9srVw=yX9VX3XS*khZ08ZL?LJOA6aA_%*{& z-61CL#Oh~KOe&3@qXvbyf$j56>;v{u4IXQE(D;4H>YdA*?_>-3!l0V^4;jld=vEAx zSul{k`rjW;o7uv2AzhdMa{ICqFzs%S5z&>^+3JcxsA(eetrpTPDlmUvu*IOn9RUg z63fk(8MqG^8OAQZ)_Oo`u#;Nc85@JC272BV^4v!vWAJKh2i&Iq&*IgU2_Kn<<9j&Hs>U)o6W@zTIc+iMN& zhYd0`2CgJ?F@>Q5u|BNSRx8$R`I5<;mZiz06-Xwtz|{=4L&@Y%idx1V@g|dcK9x)| z1r~x+NvBM<1)DA~UWU%!iX@X-4opHQ5lkJyXYAyYKyx72W%8QIc43DypU-C)a`3*` z9cQ`N8t$(be#ZMy{hT~g+-~2qFc}_w)c0iQ^_7#WwKD9ePs z0WAo#gDsShhvT0T}#_Gc@rM2W`UsX)9WWrttwyo_V8U zalBo~C@{WN1%!A9#U^&Sppc?<69^DR2S+c8Hrb{ahxJN*)FX$ zH)LJfRMX{~q%1G_Tl3eit(7N9tJQL_x%9n7cFs z!cw|J!hij67;s#TH!y4&uZKL;)O z0Q{QYgX%qizUVVP0x$ljAdMRm*t_&Aba?Ww$lthrDczAI@@Z5d%hEr%u1j#%lzuSB za6Oz!<@on3>8d4N+o+WOBa1XgKQ6GNrPwC^4mK7)lF7s7jiYgAY8pryqfq?^`L7jH znYdV;x$sZQZ>8VvUz=S!u%JfSJU(x5b;oVC)!{xG^EJ zu#v`6KAp4NX_IE{u%ieRj;5Gg$XTI}G5p#LwxaEK;M^I1D43mXcP?19;oNaXa61nD zm*JrX9;a3a*g#izEwuNG(B69Awp!nTYTtoc-;rwHktcme!IFm6+R)+Z(Baz9Y;|b% z@z9CYGxZ%if4%(qa&_Y3??SbibJdx1Pi8JWp1AmU$Hhu$_^VJ)rFUN?e)LJ`7$<(~ z-A~^A#hFGg=|9$Rk*iPc`LP}>-ut_2*M!d4tBGy?WRtd2uvDd`+Q zWrROm$ef%2Wv^i$Ry0vzp6azGDQwh=_jD>Ex6OrYvSj1{S zRj#K!j8cN1EN0dDjX4D{$tfpPsJH;a1TEqr25lA&MT>aA<*1lb&~iF-rt(Y&D;0+j zD~Kmy8v|M)u`+2w4#=;fvz%CZ7?j_YipsDRaD<6lf) zwO+n=Lt;vxVuowsaI%OAFpeLQG@MXUA~nXl@FZr1lW>9**)i<2f`UzWh%v0K$<5gp zbBm~Z2`I1+u=){Je*%?(Z;RG1%%2-lX-AuHYf}&hnO?(%r=bEr5GLc(wVkuoowKXT z6Q5ci-2Izkb#Q7``7}}=o_M&mI=mkT2I|`lh3OEDgDX z_*r2$m|}5=;0azUz}y1l$$%PHen^v1Gp-`yD?q9Y@?^HjWs7ju2UwQkkJcS3MFSxq 
z(ptAEH3JH>o(cd2rop?iIj8`wq3IcLaK^k1o5wy^TL0Jzwi#bXfQV$ZtF%qgO1od7 zEgkbvcE(w#Kxv|6SFAQZQyrhFjlW$TfBVV!>E~`&H1N&%>1tHH?YjLTy8P(vk=tAD zjC?k7FIef`R}Jl}kDvY)r!|PnANXUFK$CCoZwHTf1f&euz60JN<=8kp6MX^KAUxA` zrUbP;FzoQ~9QRzDiAJ$d0FglsvlWUFC<8kYsQ@N<$rl=2ScXWaqHeU!$6Tk_-IP(F zsq{iR1-V&pmBy8P5tIeQDV(OrnQFGIoqh3)T&J4s5ft|bdD~BKX-PT`@53qRg&78ROTm!AqIloU?@_r$xJS?@aKIwA(65Q;kC3h{+F!(5 ztP1cmeh$@5@=SKQeT_iKee~=89St{J0fKBDx_j_mpgOd7?ZTtI)v2SEp`+C;#~NN7 z^^yL8yS<;i)9~Y6fQ;_CcYbYr&A1=C_v5O1uo=YeVZS?YZ>d4x@+i^7%d;(ztyzZ< z4NTtr&UlU&u|wQf3Vu#$1(hb+=0@y5l?3311bk&k_yI=*07nFA2m=-1hAtXz-Pvft z!!tP%{Q3Z+AsV&OffsOs0$&>3iEjR-p}#|LBG{Myo~yulun71Zz`rAixDR+8Tn@Uv z@4yHK*K|(70S2!!D$qCh>>wN+*waOW8hl~fw(z4zTpU5sySA|Hgq>(*JkB{GF4gYd zF-gQ7f%tg0G5xW5lIez0R#>jAY!)T}^*E4Vvn)Vad0@kk8*UQ=I4^KUa)3pvj}Q}m z6E851D&A2{%-HY(*{8$t z+VJt}@bSuti;ssdt)BgTwEwnw=ck|j^l5bK-M-quWOZQjX>{Q3!P-!~IuvhsmErK} z8MN+rO`WN#Gxg|DJvv;E4%VZ)n|}8wKk$z4*@g}Z$q|KDpLs;DQ%>RswCZJj&bxeK? zS0UjT9IuCZ>b_9TH&FEr)O@2=-)Lp*wMV5deaD3dXIEiKtY=gI{hc}^yn7Z_3qOv2 zo~=9cSNich&co_+_!$_5(2pzN$3vuN>&m9p;W5uHe{h*X_kS z-%GCB58co9x!fnDc1Ml(=&A84ZbYLGVDN0?#Po5PBVkDsFY7 zfeAIX;h?Z1&LWQR`AI2mp9}cm7JY@a9}iIRwoi%iwzCDc9rqizzl@4xb^L!@*TObD z(Mv_rTkHW&{P3BaMR4=zkJvO;7LQ56D#xzL0lFV+{gT(*?H;ynS&}IwTD|BT&6C XrQOolGXm9%bJDnULi(0q%?0{Dlh{0t literal 0 HcmV?d00001 diff --git a/distributed/kv_transfer/kv_connector/v1/__pycache__/multi_connector.cpython-312.pyc b/distributed/kv_transfer/kv_connector/v1/__pycache__/multi_connector.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6393dc3ab257ea0b25968b90339cfe68f2889d83 GIT binary patch literal 20049 zcmcJ1Yjhjeo!<;z1i&EylHmIVz6n0{phQ_OixMSSl115)tk_H(6ygkJN&rdEfO^nS zqBw3A>MCogsUtexZ0Ka0(6zTwc0WYhbJ|t_C(8K<>TeCit&nAU_8L`ywS>7)p%8`db~PTGhP#`9j|5i z#nHN0{dhe~3({FA(pO)_QZO}dwI^rN%d0Shgc5^i%`5p@!d5&toB8b03OJ za^!SWLfRun;*p7HL{UOEt>ok@FYFKRdv5=}BZtNgYW9)oOQ_;K6^+LB%}h^Doxz*` z$f=WZWLlY&WGk=Y$SI_!rHS~Ayf>mqCoj!X-{pDP2O?B>DP&)jOO?Z?Hfs4D9F^h` 
znv~|tYkqtNvvp`yd08*WGcnYarzRAdlr@19@km_Jyt!hnIF~+&X%$aRPsPL0nKNf3 z8Lz$*6R%0)TvU>e&BbTu;>aq?>n@i$5|2yM@u`_4~rsIWH+8ujUSirz0^b9M-(yaBPN_3TYu6etj+yHA zI2@Utp20R?t(0(BZlYX=I5iR1>`GkL>{HY654aWNa(|c@jK^jNXQyWS;}Q9c6z{({ zH~h@DbHmbLbn5ir?4|f?Gt-0T0f2+z6xvRmo{LN3;JNc*%##5jr6)|_8$7>pFh(Dm z%Nm%yBsX9r7Ih<`^dkD_+@sQpRG@1KQJ-4czwGGG3~o+2DzC>E_AOQ{Ja^+OOAfWV z@1A3`OpRFURU70$E`^oX5al*VPe3K68gMsyKGO`5?rAk&hxS>VNY-ghAP+U;!d-xggp*4 z(L>?335I4S$^Ouoy<$7ey~$ILgoI~3W*@m{!fv*)#y#cW65I*y1AYwH=wJlSITe>; zAXQV!)HIfKdP341aga{UV=Xsmoq!c(H=WZR$+AkyB;AtdA>MDxuB zVPwGSbqgvf3Zg4qrmg!&+uz@wZriN3ZT`4ziz+lHpI_+6`05vje)P=yn>Oh?ak z$MrAW^|$_}x@EEBqrj5*VknOiLpVBT{Iy%~jow^n88cRb?%AQ;Swa zSkcPbo3Gq>C0*H}R(5<`*_GU%5z5{^a`nhJkA7n3s=C)N00ALefNvo2|F!^c^KaWG z>{Hx??bRv+CHTv>1fQ@yZArdmyH?}iChR~T{sh|cM*704MUppST7<#k+=TRjU0igILz2P%U)1k%t%4_l0k)i{khsak zEm#xzw+OmuW7mYIEz&4wLXe5mC%BMpY}Jr7+-Cf*(DxtVc(#kL1`MFICXgG|?DE_E zO&)+*MS%HN_6xiYoDk8bl&}<=jbDOPW*gYB4k9(r&NI@qba7VRJ-@N=a+^I2?1bbI zi7LAXtR{yr5v2~%KXSjn!llYb)s6d>oY%M9+FTA02G!S@a(C)8CZaJxGc*IG*^G%UV8&12PYIrY#-K|w^e9f= zg-D^MCSK88L7zTvu2z*X>cEc(b8YQPI^OnN^<3|}>uzRK9Siy7W)#Xnidqn9Md7eG zGXYu6M_I%~WTJU;J4GE7kwPcaOf|>pnVG1(hu#lRL^~^YQPfQlV?(`^qG$mB$~i=D zai2KsPIuN@43OXWa>$plzSy8lNl6*;)h3+L2o6rOZ>ri>ZBqu<0h(=CJOa@o9 zs#!@6OVekjrll~;Hdvc53pp*UH&Z5P8_hsz2B6!Jc!vk6G)uUvt_1f@5K`hG5S|<# zI`kr*OE?o;*dn18JB2hdhY66fz|^a-Ruax1I8>zbjicyCCh(MxQS<^uFCx-B;V@G} z0qFFhfP922Iy9X`#(eu)49#1p0dv$7sHqS{gRVXHbDq+xMXI}Ip?)!X>u_q%seA4( zWEw+B$9;Dl16HlVR3pMnm<~_0F*vdPvV#QB zLX77;r?emD6886CA1PepX{_lz?z{F6F&7FXkQN#tVJFZ%OKZ9&Bpf0WU=ub|umvaj zB5|USeQf?^691vak#tWg1@r*u6(s#fp@@-Y0fhi5335Tc25@02{#uZ>Mnc^=C1?O% z5a2O8BgeM~b)xKGQic2y=A=2!NtYC@Xy&vc$>*hzQ-_ocj7+UnI^}S0cZ}qDEyHoYM)v9N+MO5nMDr*-iZXC*b=*`QubS$1&YF|=52rYhD z4Gv|CDO;c}Hm8DHQ}tWbs^QN{koifK%lQ(&G<3^;>%=Yh2fMQzz28xO$DTUZpotZx z*5AfDt)|wbVT*Rr0WI4px+Yx~&2>sHR^rw*ZOO~1wq2*clm`8UW^JMjXtY-nq!Ow0 zESV_Q1cRd6MAE(eq9B!tZn0$2!A2_ug)bMku(S{93egW8KOk-u%TT*gESI3YQ*E&V z?^WzQP|1SOJ#rev2C-3WlIqEtfLQOL(P-ron^C8U)d`9%Vyjd?=@Q$J*DSV6L9}VXH+0~= 
zRqT}7V5#Urx?Qi2Rc0fxo*UUpcIf@;qgZ2fqfIAjb%`NkW0-O7mv7s#>aD`J`ak;P52Idr8lEa zz4>N*iCY?&^om1y^BQE_djq)lHm#IVoqeYLDN+WGzzirO!3%O^795e8SmvPNp9#ju zG6gePzD-66M&i0n1tzv&d@3dlcv%N0L1`!EWEpnFAZ(RW6QIZ2SxJBJ`I(4_PILtrK88q3l4xmHy8WGP+a(qgWFvj`F6l4Mn9iKTTP3yKB)>$wJ88|#W7Yk2I7s9bf zoQ$Diy;gUqFL)XjS`nrqHpdxR#0ME{K$l8%Gf#i;1dN5uP_)V(q?gK!=`-=gA;XZX z`N(ERW>ZtNp?#PzL=1+8Zp2J9X~%*-O*Hw4H;vppcLv&BPBdX})CUV-j)m+r{D%e$ z2fM~BnJvk286wWv{Mi_NoGmsIc@G=B|8ca}tkHK9;13Flc;I?bz{ z>*?At2RB7uXR&eiMytdEShj5pHVwAWr;#NSEe&~Ol7|?lfCgcKhoV)7&&4Ogc~(?o zBeYghiE=e{mLe%I=!!-aMRO5cPm5Z4f%a8lSvO`YfT3JJT6uWAWjS9WwC1dpSkuJd zkbt3{%^E}NEUmC~SiKq%ErIZLEP(PIL?jRjra+jfXi=oC^?FsIGkJ7j2bd&r`36;JNDFPM(Dwe(TfMjA zcf!lEexu{;H~2HhB0-+*nQzx)`L#6es1^~>c0(~a`P6FM0K4^ z?#CQgDZKQZ3}L@9_*r22~)%*I>hNCWl^4giAOlZDtxW@z-IxwS(L{#x$=5or>nM)C8ve0^6q0 zhgO?^AYp*LPAfG#kV|t+%-~337FA{XeEBtsywtLy0Pcl~G&}HsVKxNiI1an)u%&Bu zB;^=2rJnLsCc`9i2Dyf1XrIZQ20kCTze%0_S46PRxjBD*+Sjl8`cuAVQtoFEXMA;f zPTF@+^&L$6j;OvP8KGu{40A~baKj%+`I=MiX2hAjqe=UJtEf)ydr;T(z0kX%#Usmg z8w&Nfh*DX;v*PS2Je82tuH zlSl3eB%hY}-#&cx@NWX0>A-e1upM}uuGy;AY{djBt59AU%zB(<<)7J$N=maOT%ab~ z!1*hZUR~Ssn0!ZBg=s|U8Kyml#FI=*z=aArXn;l4Vk)R^gd7e4UMh2DW}7UbhE{Is zNyjB1FatnCL9Ka4)#|ke*gUNLb?ns7!xJ6D(6gAF4_(Hr=?sx+CE69Zl z^^ilr93eoT1^6V03u#1af;mfUTvh%l(OFWd)`Z-74Ym&^@i@*C^l;SyLr4pKe{Mde z(ZefmLM3)I$j&(Bb9mKUbF;7_NSb|qYL@N3+(_Bgr>ArtsvnbEJHS}9sq>OkQ=#)kI8-t zopK98v?&48LxO*kzr?lWEnYk9ESK$ZOJFS|F2hD;J4>u;O(=wvCi+1PwqNF7q1oHN zV!!s1{=C3n;4a!<;V$qHT<5#?U!0I;$z32Q&dH=IS(GZ+fkWlj=HlYah3Nqp5!pVv z%^b~RwbWd?M+exl*%{kJX6X(&bYgoI6W8&Jm)HphS&?+}2pL4?*GiG*IDdyig#Z=1 z+1ZoyfVnuM*haWOA8_ zh+tsGVro7(50I}5;S)K#Xs&a3J)^J%UbDK_Fg@d3FvrkV^-4mA(t&6V!qg{6-adBq z*upVY*t&FnS=gG#&|v<)1fw_kpsZRIq6m!2{C{KU6&E7#A;JQ{)J{{Nr7=1B+ZbJ6 z1D$>t0Coe}^Q~*>M}cmZQBMgXTC)&f>Mg3!ve%<=wpJ*;U)L2epNVpXX~ zKZ)ZPg={!b#Nj2fTG9n`)Dhzv1iSpBag#$_w&jk?t-L3kU4Th{8=&GStn=jrCT3J| zUUns1iap^}_-hRyL!{lV2?^IlULn8GJW(s*H1japs~D{a9FWweHKCwUrN|W7F1r)% zE9Kx_1@y#(RSTqVQ^GCU&W&QVb-fzySm(L{rlo~PO%(0ou04-7_IYr{C)L@Bxn2u9 
zxN8gGy*A@}K#I-Oq>bs8`VlGt#&5AX94US{&(#^8K$@Ysv+`F_%w$bjMheICq6p=a zX%p{KMh*rP*lEB{^KxWq;yiM6CRk*agRin|^70_4(Moi44cU26TOpyH@RJ3dUw#)^ zhD%J2qUTTZA5cO0ABbRhadDlYAH0-u*F5OxX3snZnrRMx?1rhBD$8`X6;?);4)9VIsw-m)2HdrF`5oB zO|%uK$>)@aK$uSDbp9>L@{DY1pWqp1&g7~U;95UKkMsR&xYpDA<^aCRCbWar(c1BY zj$}#xZVR;wFQyxJsg1jq8?oKbUOmbL;YYrPl)FLKQvr?4Ayy_A6|I7d(-Si>xaL}x z=~c(F7E${zs17lihZsv3!~&3#P-;yu2?JL9aGurIS;%-rN8qL~%Sf3EM96gogz6GF zp~#L`but0c)}4T#CnF&IwE32UO^RrB-*()c7f#FgS1k zK;8u;@G)m#g(hs^zah_jgq-mUZ{<{fJDj*P!LRoe%%POy|{!svB6afJV1;+NI!WM-;eD7%_iczxLyTs*ln z@`IN%d-q*;r2W0Bzjx_ix^K6NKmW6t#+JqHsfK|Yo@^~wxglGJhM+<^n7J5wC~^%g z0)sLTJxQL9M2+*0RVNZ8q%e%e#y?S5D5QL0w}~3831D&tT!Mkx3b8|r{W{~8%>?iS zMzc2LLGwlfni-5H9E^Dd5=8|WA9L+6Pc6Xwnx9w?ZsvLQHj}&u7lGgYZv-7Aix`+O zb|n85rl)!6G>JK|$@CpMnAYf)5$0+kg(H(#>@Zx*W+qm_1WS>BOf?gT3P9&ik-bh^f719(1e^0{0oZhL4Z$?duIsm^>$j`*+iwr2cN|rB z9R0bxTz?#}lc{b@S9ht^UCY&>q)-rseg0IS{jRV5k-seM?@|3dOT)Jgq;?*=?|%W^ zmz7(Y&);de?;ri71fZ#dq+c=bB7|qyWrS5~jRk^>nChR9zm7%b;sly3B%#c~(a{Z1 zDFAlsof$u}EZtx_1<93MQ#e0igHU4xb{u8RAwd$2eYJ}uRo#sLl|6{w;Vmag;&$Y@ zYkdlejd0bPn&BqNG}C-H$-wEksVGxG^Q>yn!$!~pgAAD&VrXKP9cM3I=C`h)TZQ^5 z;5a!*fYOQ%e$PB7;fh*c`pdn?Z@+k*Py0Jne`l&|IPKr2?mezMP=39>iHD~L831A8 zYrsRM)(jnV362~OOvRDr%IfOI;D2Ti+JBF1!@VUZX%(7qFU^jWyWcaBE713#vJQT- z^`)7;2hw|w-`#s0^t2wHb+)R~$96}+n}j zCr9rIZ5dy6(q{lC^n7EP3jinWFQbpoA@LMts)9*31yXfy%D|IK;N~~FqiK? 
zVBi80%pGK}vzR`it8KJ`EoiV-(Mk(s=uCBPy)O*sw5`Ho2pw7LVhC(qBm$tnwE=?1 z1}uaVO16bHxjKbK4%rED!!{2c24{0cSNLnK3CGupbJrO}>(T*y!HOwy@ zE6%&~#QYQD7jeTatdFD%Mc!N$O~@}Uz<&NCeUg3+BZsqV=ed)hjTjXZiK=Uo#hJRM zq%U7u?&?l=J)?F#bGr+MjIP7vsdjil_@%El`bg}@P;p2P3lVxCsZ$#{flQa+xP#R|J%XjsD6UIcre|*RmGn$4F2RP zNxR!rcU#)s1w;N)QM!A(+P(eu=sowyBVXOZi&ww8=u7zqQ|>`V7GxZ*aI4HJLbEh~y>FH{e{tZ>Xb z>oa@EJ#&4bj_bSDm~ZlN>rgHZ2BC*s?VPX-xuPgS1lDURBL^kje~OT6ihNN)DA);W z5XMLFDdv9_r|WDMUD3UgsDlxht97R;v%5Tp>tJ`@#nTRAhv4Nm8aAqL*+}J3t;>p3W=SFk$qF%@7e%{n18`qk8P@I7~7rEsc-3B8; zWmcM)nuLRx{=vCk#&pWMZbf2K3Ul<@jPw0Q0QTZ3BJ>Rni z_n$JsF9OIEI`;u_u3GO2@t~pkz5U6eOizEh=LNOrh4o%ST|?>4V`}FyP->9luF{my zMv7~f>e~e6_Ga;o;#9-t`~IP<4}L+9%Q?Z9yzr+x7h3KX*FPw4$lCad+6T1_3txDz zVWAW*$X$U?Z6xF`H`*IkmQsQky%HzwBJqE0J6ltX^>+I^lpPz4tJFT_E9rSG>)f z9V8)?l6xWS&xL7{>my5Uy2xKHf)QsbQDn_>2DH$if1r}KTwUE}OY1rx+J6x!2 z!S;Wz>}B*X*&qKTh#fWI9e`UK}96PO|P+rh%C7_wTTfmO=mk_jEqlbK*mC?uFO zB;1*AKgK0kw9~4HDz1kw9cWLXM*fkyR<1)4VAKLHUGpAB%{8aM50N$BQGf=oAMN`D zD&wesg?qH)Sptka2d6+~8n^ZXeanFX+^+Qnl9vGx8DC4v7y3=d_T`S9*rm+y&J-?U zy#Bp6-hCt8yjgADd{+RiB{A5~`CC)&7F~QDqYv^^L?TG;R?>vEz{sZ4Dz=f|N8%eWvRNa6X?()QHwQJ`>BX>yF1X_M*;Wrp`(7ZzSkH*QuNH{Uh+ zDfYM*JQ==;8KEj8w33rzv5RgGyF9u#APnpMn8O&{8NP|9Tul+n(N9x0uDK&AAEF8+ z6w%cit(aX`qiaqxqWmglw^BqlE}0~3w)%`_vzt4b6Tg2l@tWp33IBU&rR3trelJ7* z8l~Q%=o=JWqv!@j?@)A;BBCGicPZ+m=!X;?rsyMzNQIP1Y}PGIWG|A*dqloN5s?F# z^ii20nCU?BFDOLD3HLf}MYz{l zUgzAxE)wC1t?@Nwizwyh>O!A+D7B4qm1S+!&WcQ3XV#AABTq?Ex;_dszZ&RYN~8ii z)Y6@*XV<4r6t39K>e)>;XW!E3t%_UHPoqDHrZ(N9$N=C?b0KlK(n8&-lg=Wy0-a}F#`fRZ~0Zi&=vAnSHJkMNHI)mbMcFxiTln|(L> zvPJCG&DHc~JuE43-qMv4N`4x!Ih(TXTIXIq6Kq@D_r5!8ruk<# zscnbogSXMVH?R@$#E4~;*H7R?q=??wsPXG37upt-h3*?)UKG{Jo|Lag_4KZIsKQp8 zvuoky#c8#13w73o7POBI&d{xBmc!$Y^7E5FoBx!O#-<^a-HBR{dIvwTcQ|)vDykRk z3nv!bH+C3^XjD|>M$M|ArYpQ$K9f#VD+rfCjsSS--<@7T6 z!V}-XE~*J?eg9JY)=_nEZ>oN;TD31*M5VclYV=m%q8epmZ-bwfAjQUhOx^1YfxtVo zNmlUCd=oR#C`3E@A)A57=?R8HS~1CW4ErTTc2GSRuH(|FI#FO9;eJio?E3Bi_@Z@v zx0*CaBWD0y&opTO@0+hTP${f?&UBtqiy`-8%^pz!lZkf|6jB_zfm8r 
zm_ARSh{&Ke^v92k9*m2iI*DPl$KtxiLg@~Cayj)~%lbyMi5XcMV8zN88cv_Iqd(vL<(@*TATjw`PlP1j@6>;aX}A14@{f9E=W%?ba&wLavA z9&%$3xfgzCbMUrLI7GkmI(gd)hv+kb<57FdL$3c52=eXDfbev++Zz=1m|7{1EWRR-C0Apc<70`R-Ih-wKDPQHIpx1AICE b5}#___L!r@IZE%OtBz|6!_=@s1|9; zmhE;-*`2hM*bzH+((H_rR)4Kb(}}Z|lT6Z1ylZDCGm8@023)I|uu+7STr-tri8v=+BQEB*M%wwvT3AgvU#L=GB^^P+%mF-#knIbldU7IlWikylkFqzlN}=+ zEIcpLIoUPRHQ7DVJ-KycD+}{PdM3AxY-4_JWc%cfksZvRAK5wCJJQSi1(98oyGM3U z_KozhXJ2H`KmSy(&5I<&C!VbjFF*K5puLTA`7 zbcIg}-Ca zH9U1Wio3Tz77I^N@PW~>OW`R2VgAgpgHyAUl=eb678;Gkrb6emw7JhjB9jBtQ|HGo z1Z}cwV0vcmKxBICQuO4ETyXpu#7u?9P`?AC(QxRQF1aEl?8xYp5WP5hDg0D8HcFM3 zeJQb8peH5p^mHT~w8}*(L8rv&Nu(9W$D*<~CHzcmG!~U}^=R3t`-cIdD*7CJd~{k2 zO->8pNN8qM9F0Wa5l5#k$@QDZzcejIgiv&RJ}kRfs+rIQae8(JK*~EbExt4=3i$F^ z_zJb4b9`z%7K%(?xDcjh@{Y}lB0wIRiHye1Pm7aiSob8L5SpKS5mks?ls(#G6rc=K zAwGJ|rOTnQ(Xor+P-Jv&dX|DcXYeS@M#AFB+1LyV%4P5$9z6$0ndN+~Ejc_9=mwam z9HpgF0@Q{y%Q>gRFV2Rev8W5Zl3qQDCJIePFCcO5)cBQ1X!HWUfnGpQ&z%p8!{JC6 zsl>VY{Gp@Iod}Fa1EZHm$0JmcpB3_;sp*&=%(E+ng~nJnny+PPS@{7ah{{UOm?c}s zW`$ANGBz_CG>dJhWzZ_N;wD=F2ibCEW|oDHAxOJaL}z*sL(nZ#*hG`>NY+_X*b=r1 z=01dmZSZ9=Uv@Z$&^TQMtLC$1q{(K_R8EfOvumjx>i4sN6Y!C z!!$HCIvJKdyePHz@buJ+hk|z59%AW2Avrfh;{lxtepe{;;_PUIO2CsP8WUwtC^R~S z<`^Zc5k;u&+*~Xim93HSXiT<@PsQFcJ;3Lhes-=SHaXKVGd|N68x_%u+OEv*=-qi~ zN4O(0ey(F?E_QKxs^c;cNQZ!|;`q7QSXk&lJ!2SvQS>SDqYEe~q~mf|2kUxTNc+s3 z*o7A{Ck#)N!2AW%x~JgtLrH(>s?s-jW_}0T)B{j;{fks zd`OO|BpkiXLIM+Kn!QxJcXqSq z`@zfX z=wk^4)Pu*ek&|CEIkA@c7Cg613)ThOJi?+`$4s9y$Fx>i$ikmRFjJSnoe}Q1ThwPZ zU>7VCR8qzzU|@10^-j*)YI|PIUa*Oth3o~>gw~pR#sxNyELcS^OOaY%HIb5H&NOEV z+K0qr2o;|q=Queh$Qgq3W2!&KA%RLfO3r2}G4s(_ku5;6;xMH=0|!}0$p3M2SRtPv z-wWhW|COx(^NUgXrhsxOoxzDYCNvd(iDKt#jQc}Ml3zmp=x5x_dFeO zdq>>Uaowy5|aRSWK=;*2{*1S;L2du1xIpG5TQJ z>F|q31s#*gR-&;LROX-r)^nh!MFHMS)OL!($oVvWQS!WO`Y_98>)9wV)iot6Yd>%o z+q%~Mr5iTfH?mD-RW$8v>dI6Lh{Ku$R#L8)ts^d$#x3Lt9^lSw!HY1rkRQ%N9?yBJP=IGIOXCaY zhaC*lQfba`LD(1etJs#6td;9WoHZ^=YA``R=|Eh9@P zLCP|rG+d5vDucg5C=XYLecJm~KMzy$S%nIutwP-^mAWE#B|@qZ5>UP$Ayo*e3Fk1V 
zRzLb3DQT@j;AI;}=p(fXl!Y4PsSDRrU2tz;ZNU(YPzx9-b*y7wuRd`Ygc~s*)u*&j zlh7$Npf=4yW7sDsFlxeckUckt4Phh%g)J#OElCZeyE z>$-Kp^vj4}FcS{u%WX5Bjd15RhPjR7CJZp?KQ%q(Uk%GIWW_jR&NRW8cf8KZ{ZccA zd|k}VL-7SBqdM^U0cET60|*bI5wz0hOlj{b;3evJqKksW8iDHuF^;^1kyD6Krp!o0M~RoniLjT4LX2w>3Pq_S(Oh(mO+~Ec^XV1R zNRc;?H2O_AKfPvp=rTF-uIEVhlBM!x;r7Wjd*8aJEa7RBJZ*7LZ`|GsH_1a1o*j~B zN8B?Iw-3Ng*0m?<`lY)5MBQ^x&827xhUte!c+@=om9WDD%2j6w~rd5TG(H!K|~E(BlltFzUZo zK^=+aIa5mMG#O@GLRL!YmY{WL{@B?mTG|I<(*YqI1CuA9GByIwoH+56x^iz190Es$ z*hNY@#@Py7x*TAnT|nVuw9g++>n#CLva#{03xVU$q@|-GjtmV3z%&V*Lz1u!t9m2LoWJ9t#p2Eb?cgs-7E~q*wnznA5(V8-LHA2+&stIaaY{GkFAMpvsrYFH7Ld9d#;Pgf1Y%vUW;?$Ui z5|NV7I_rwhz=N)6GVF>9nl_9xBBKfUb2NEIMNe6xNEIt1Fj3`*;+&*Ki&np&ML`$W zkWM5DGp#Memk~1G{3y(g3yV|!ZE!%Q0!aRZr%CcO#XVc&_N{P}9-k7D@Enjl2a+yd zys-I!CCi(;VFFwgkY+247O?@vC=dmMFD!zgszTIwQhW}{f02;Xr_l^pmdwt8X%r4c z!I}Z9=sfBc!sweG!vcXV1+Jkc;upwaR6Dgby0+4{#YH^LH>SgLvjUSy%NF=ggyw$9 zvmemB@l3q3GZUU(Z6XPQn~APzj1*}_^NWKYM`IbV^Pig?2TNC<&%nwY4Q45DyH1}= z$d|+F%kYCwnaVn7H!5JAGAiq34A{r@x<1L%7vJ;Ls%I#UHQ^xg$qt2|2Y%20&=CqQ zk+N(*4<11j!!vDMzCvG^iX+8eK}ZH4p*^#R|C@4kz(N1XeJosdN}io}i{A~ddQQac z3S&sNQ`@l34sAk2QB;Y40nEkwi`g`14rcvqDVIhJU1!3vKApJH2XhNHmn!` z5jL09OP>0;r!#KvgqtLIR7&>BguPR;cdnGaXWy;BBX0~FW}#_NFpQi2KNKh>)ZoNp zQ+m(2+4Izire~sWsc7)mGeNUNpvjmrHY+3xZShe6v|X}qU-`7wVZ0pc5ZOh%?=%om z7iMFgRy@C`IfAe4u|77>w;xuAIj={67_mg0Vx(gPP6^~!@msVVy@+)%wkyc9@UHb6 zneh8M!LKa6YY{!fGiDRaf=C29kqDIZPQ^nKo-WDLwaQ2t)~k%U*Z*Q5Y9SD@xhimU zFbaG%8Vj7q#$Q+vGjEc__sA%-6 z#K~_Fq*@<^)aFGvKO*b~B+tOA=V07^@G-C>w%n%O3u`B1GTDd});pK7N>}?!5jA`2 zW)~yk9)-g)!J+<9I6RuwOW#kN8XzR6C*cf8&OqGRob(lI_$lGpEqQjYdiKQadlYnJ z-F#NtSg1FZoZ=}I?tH&aQ!2LSwnA`#W#=_d%b?DcjgW~1_$v}S(;S5E0-t) z)uUgQiE%bd2hk=D0kB!U{BQXLJyZDk&*{|7T z3>okujLUs=vbQO?ICnfcK7}x}~!2wVtDEo?~(QvAE}0 z3enIa*;{T8CRzuj*1`8wL`8jb4n2}~=F^&rK+F0dwb?C3wa&Ms)h!@QhZTVe0r;6p zd1*Wv4q%%Nb6NWk+xy^i*6*W`v|;IcX%x~d01Jr`;Qa)!jBTQJgCv~<0U9cp%Iq#> ztl2TPSs}8Ui7Rl@SFA_6w6!#zmF}1FO--A86YUlT!dGS@<74Bo$Q=J3mG?;BBmN;d 
z{}VaHK+GIgZnodZ>-I>y(q<%&?4hqBjxFe5ugT$N-<3A~QiWM-{3AU7T?dZF62^|7a6Y&#}xJC@k? z%)M>TBwgNwt44CwBwSsRt81k`(S1znKK7pLI9SEopEbvwosV`4jd1-f)MT@6@!7IX znn1&_$gEy`%Eiu#;jZ*&%Z=-CGUi)SlVn`90vO$aqH}3wP8VMdq);&*bt7>%&Mrog+3H#~!_-d$l6Lv&q|B)FP{3zSPh~j-T6-2Z73UkBHm`aEpqOX^KZzZfh+zw9NDez z#*Kv|kx&T4`(=c(rPzLDQ*NA+lCw+^zk*@&)0c^hw|&$I@(cnzHjZvb?s!m5C2cxIM7&fQ{B{TW+#AuwkM5j$Q1&?#Nq= z-Wa|g{?c>4{YSwsLHbS-5y-=t%ScF`DP^EbwFwzcwj={hCK`M zD+~01basoa=k}Qu-&@aZnBcwJ@UG?Ef$!RXQ1^qezif{0KP~M#^ME2YaxJz)<`v7z zz&Gs_aL7y{d*jVJ@AgQ|`zUBH+8{fBy`|#=OAa9CD_*iJomsNq+`Bv|`IfXH z=FtrYJ>{Be8kW!8uD>09E4ciE6zJJ-Qm_kH^Fud$>Wj!%q|`ji=Dt0KjTM`FMIb=9 zH|%zsd);5WVZ$9S_m$jgz1g~vLr->7Ny~6*QwJTxPrEpY^8ng1m>{_9FGTer5vB+YLKwvg@rZ1}2C<18hzde< z5aNQ07)Op^N4x{^&aevtG!*C7ggEjhsEnh4tGPW#xCTg2z_`3kKl!fjYG&G zk(zLUD!h`-gl&8*)+hKOKtlpKBr@ZN$V?$p7ok=p7*ovBl!i-GfgCEQ0JXGfwX~C{ zk5(=T(3C<1vRr%-6`pTCIti3D39)3xhzv}h4`_S;f#|ppCR`6Jr*4gQBaiW}#33LA z%)bzpAR#m5!e%r=GcO|n5aiNQLSQ6CT;uV=8c3-vD^pVDWxkUHQ6>n5W?U15bTh7b zJfmFRaVRI@*o;e|DPn|U*)>tdefnK6GXW2~KwqTQX{yS&M8mGcN=VHe3-IzM>bOt8 z3l@FHl{Al#Q~N?Dq)PJ;yJA7^o7lpA`kj-N$gh1n%TE&vyV#TV_3Vj+pV$@K#(gQ} z5buFqhOsmQvkStJ)*&Xse`vnyC^RU>M+eDZN?xQzuUk z%N|WrM0uH9K#YV8{gL!SN#a_Zrs@!;AT1vz!_Q{vKcWyX7L_W`U5OK^1IkvCO_0k% zQ(zON?C6lJeRvA%@0c6_S85FUE5c^GKC&am^o0OC*>>K?RY2aJZvN3gUS)+C$D$LN#e)pPDLF+?a0`PSBTAYov2dNH%G{I&CGd zwX7!U^D!dqY;dr0C~qX$;*w-pgTgUMltRXk+5_~cF8R`=IPzbsNZLX0JOu}9v*(=bVYyr7Na&Cz`x3dLgi{xraxH=_Q z=bEcK+1^cU*#cQC*A9hPc$Mn)IXFXUoHW84wA6kugs&mN|EpOsJSvyrj7AKujElqN zss-5oF>o6+ed3cQ)5r0qtJbTwn1dIyV2f!pkj5}|>SBqy*w^RG5MrlQO2)Ndca&bMKXD&wK%)8e{8_&vv{8TJJ|g=B+nsFAn@9aM9+z?b zC$u?}mjS4#UNB&?CtCDy?$hJvEOQnJ=ifxk{1Ic+NBm1j)J560bDFXkV-#p>Uu^;2 zKHoDu6Ckz4K!dO=piT$|*-6`LSD+ypU^-hv!*cN^6AQE~vlXo^*-9L5Ia?ToR>qWU zC605jNX~&aU=+1vo*78^$vFqvJS57t@mP2=s_^)_@R@S%GqhK7koNZEY^8F_>}C_f z=VmXk6|ii9=9-v5a*?oum?Y;%lsj8dca=*vn*roK+y1(U3`+H znOe?KD&MsDH@gtnlD5443v*Ot~r z*ZqRhTOBt$5(Po2Ah=r4vY3->Y=3j=jj10tZkJrOi-&J~;=ZqPX>zS?@0xGl{h|P5 z0}Ho2uvm%h1Wu94jOXJ*K7|&|c`mYHXOF10<6+XDw2im4}G~Ec_ath 
zvQdQTG2v2>FtJG2TwAg0lyFr@u8Q|uRU4Hi*H#4)KcqJQDK%Ga3VmWF+9_>6fbb~8 z%wJ-d`SDo!sx_wJWx)(3Ax#V&cvf3AOlTMvWvdr3p^f8Qvnl4};V})rO=#3SkHeZl z%d^b{RiW&Od_9i)QcBS1avlfqaf|*|%o@bH$+&cELV9m_hlZu~ggnj10jaAwSM66F zF^#%5!nzYRdS32ZaBwWoW+ung+$TWHx{#YvcSlUa&zE+fX^T>FH}H4pcfp4Dp+)Ip zc(h6Jr@9MRm@k+Xa%meJz|@c3o$izlJX z2VG%Bx{PD~=zQCNB8mphT(um?M_+)>_3V@~rKDtSh_XR96!fzdlKAfc4zZdXT6l>6 z1&*9UI_YDvE3yS1Is5d-&K{43kV*V&D*4A$a<&krMT_h@d~i7Q)WPBY&{Jm)%Q>omwwy=H485G7 zQ`B&S_zzUb&!`Y9TdG-^(!7;wB-^o&gE@oPoNR`=GNvdwPnpF+Dx#0lDCWF)+!dzg z2MC<+*aV?(HtoElWAle4jN#lNc{&!Y_dOL$XO{#lg=wv`wZ6YjhH zUpaGYoyME1uJ&$4vS#wn;3yLk({D^C8vCTizPo2u8~Y`1-3`l)7w&tDZnWdkk%Qo$t(R2(B;SwbYQvv&@D}8h{{pCWeO`ck2f74Z3t97^+OOIcY=Y&|AifH!se<)<))~{BEtoyJ6G$$X zcfvzZXHXPg12bay{CFhN8km`lL;{fAMhJxVRq?85;5?qxv|9Gj=-kxUMR9s+dNvxF zLoCcn07a1Ol_ISIVv=lx041dC5-khnole8zCmSQhzC!fn*c?s@9j|S4U`H3 z&Cvf>vm*K&&<|LNOkF)Pr`4H%I@d@*Wl1AJ!Dh5SroL6nV7RHZg~`V1Z&3?j0%&8S zQ_vA%(r2{0MsS0$u2!B>0WQ9;E1H}O&rJh!zeF-@sK#i7k%8EaQ#2>iKOgTGXW6D0 z$pJrFm|CBg(4u->GOCc#dd+-qPZNy~fX_1%4eD>6f0me7DxOEzA;~dgorhgU4Ya2r z+6o06&BTJLc#U8=Um!Jerg^P)M3E-rVQk~k{ZKGlWGckk^dO!?u=@X_Fj2;$M1(0VqG02f@jTzP2{zvRd+!lGon)#>VyQ zLsf~*v`H;Aq%1CMl0svUh%e}jkHu`JNNpE#2DQaKhlIG5><2K!?!}iqhWMT`V~Vih zn{4cx49L>WziFx}r9T)Dt%4oBfkq}SvuP@a)u|a{fXU$rj_cWqsS$!rFn4JFY&zoD zOu0uv8yuafs!fkU8(tl4C^Q|z7GeQ)!{Z(-3=(N~WDsPg1L zM6F2i`Mi8HxZF>{t6OlGKXw>I_xKcKM8^e8FT<)qn^O!05<%at2!inGl}!?4;|LQq z_Blwcgy%UF%rnBf;(x>&WeZsZ`Z&G&kLb}&*ijRT7Vj%bA`wNhVs`4{C~5Gdr1;+{ zs~ynf%#v#qz;M|wk&pCy#3SS|kQ1q-U}{wwU=abE63&|e{O_stj>93)LI(FqnEHb} zleY+Hd)=A;>aOd%;ziAC&LHFLlD1^RRVTUXmJcWDw@dZg*IYXo|F`;{vwHc^o5$Wb z7Vqd^t3Londns^xt5njuTGD>|MX97~@#qJT3ocFe4ZQn&eD}#4)`YiS^42dun`qb} zHSDjC+}jZ^?pWW_v3Mx%YrN-b+SqN% zYf~nn>u8QN9yZ36LN|XJJsOJn0J?H<)C=`r(?6A$4PNzjl~--y9P0(Ek#eyrag}An zU0JX!*o<=tW+Cg2sR)u3N%hE7@z;?DcFnj`rcS zko#3NUxOJa>|)-pYy4MiXETtjuAz_$Xf-W570g8wP1=f|dJjB9)gv)*58cq4P+wBMut9&2{2t(1Mjnf&<1g8w7Vwg#AD3@h&@K;8I5qmclY5V z!Oz706ZtlMM-C@ESv?Gibnr0@2xi80B}r}rjABH#8bgS`9h^>UOv4zEFi~d{$hQnC 
z+z#>56WYLT{DxQ=KA&9^72K!a^(9Y?_>=6K2&kTv8^=GV57x9CsYhtO8;*Y6YUx}WHbk6@q zu0ogv0w5szaQFwMpLE23#X zHbb(fl~<4H!_7uFUxoy;4h^w2$04%dYnuPqlM4^fCL!_Jq0NGA(?Ap)X;`FXk19h> zS9XfgjW_dy<4h|h2J`x(}KXvNhpfclCQaQwCQG}bSb>XJ|0}xOO zoj&;V*@I_>LyR+jb{NvY<1hj0B1n>lyRVBHqU~1ykq3c7$;&Em>a+QUHuS#1u1lOEfK!Eq|H@{o3 za+S==1teEsxj0d?Rl>ikC#jOvlB+3+e5uqvo|DdkSNC4uyYz`fWxrI}f6v*EIgmCt z6%~bM$)yqUYpy2rWaOo1-CLaS)=J*m`^1Lt zk-R-%O}XRc#-y+aUnCzn&!KZfTj0EG`jNGJ!&+vrul4(6AG+h2u+7`AW4%Av0|{yU=_lDdLOHC&7PD}C2U3$A!4mdvdaspP5iAv| z!euDA8gb=9AY7r@7_31^C9MDDhN}>&h7<9mLLp12LkebvFbvCs6vOXY3fKi4sYPAY zc)W)!1J=P3AVIDIn+}d}m6BR7IVHVSAj^UEY8{*5Z%Cs6Wu?cNQDwxr~C@;3sS zu;C9|g(weS*#mzYzN1|!5h2?U(!sur*_%Yjc7${aJ5aMOVJF<~xxCSSD9T;`XXdXUdVy zazKTEwXJH|S+Jw7q(a~b+wm2-DM|otE!3G3n#bPhVr9XmXUaR>6o#<}wFHD(NuglG zgKPj)umnO(Zl1C|v$Fy(Hv+rtq!bx*PasFz{13q-ZR z8@0`H>?vdqiKEE-V_NRQ#u#{lCMI?-yJ$mF1D=Cm$n1*$m{@wsYQQe;#7L71h+&PC zis(L0gFxRPx)u`e5E(%P$Vuo$s>CGL1|m$_k$9$TXB&ZJ**^PJ|A`a*XCTy0TEsKh zHXp?i7;O7d&dVs9UYaI?P6}@`Wb_8stzx0je0$pFfk9V>Sb9qOcZfj!H0r}R*;TcP z%66%;T_F#XJCRpl)XfxVmWzX4N{b;~bG-sXGWf!iDfr1Ga0Pb8~b);b0f9mgd6S05u`<_cv- z{~YR();aO9O6R2I^=6$@vqd2iGb>XbTQ1$S9p-mE5muY#{ZoQl9)sJuuQ=iBmVDiD zS2z2Pv`;`fXVJ|*6z8RD>r+v(#Ekb|>-9tDFt z3X1~hsyO>V2wnaN-6SpjpVQ~I;LYeJcF^{>oHw2KyiYI164f12bqBmFJ&CPPOIx4b zuw?o3pd07TQ`&SFGRgTWs#11#Xc3mi~(ohzXI+ADht{#WFb?yNb z_G}oIK&MdBI3%1QP%=);hYax#1^??vzYVCQO`8hR>c+LE11IQU1wKnC%ng$G&;&+c z6k{?B9s~(R1hk?tC8ZU|&VGqfy{KsdpJbwDn6EyW{t*oE&6g|h%mfI_Ip4*PIuXV% z!)yuJS3O5EJ?#C4S!8oN4nP?~!*i1zI3g?EWQU#CC&R*<5pv^6|B5>7(@KZ+V|j6_ z?`Gd}yX5O!>A&aOhW$`?Ub3L=w(w5mt;p>cHcaNe?qv7=cvt_s`5P8<{vZ}ZNCNgY zy1SZtujK2ByA(s`|9|Vg^Jmilx{;<> z;Br`uj#66_M&NCIG_$R}I*Nc?JNjUT9F0+%XJ)xiZBOVH$SRLoZ{N**@rLb^Z~NVv zd%oRj93@)$f=FG_<)b!MN)gv@1!AvXH`Zy zH}**9bEpd|YLf|6al*8WxqyiWY*}Ps z6J3~u_f+votcKV!Bx5=m#TNq+us%*!N_d+xb}r2b>4dIVA;$D-e+1N!ndK+cYP6Su zR?CHrfJ9MHDhl3))3bOu=_^|j5>?wI{QI`u&6j+;;;vl^Nk`4ZFRgFyN62PP2C@x` zG9WuBb;hMHj`Wev$hWfu1=U(5_3U)ZRoa)cg#}x2%=bL89~AfIgBkQtdhY|639quHH1BvCwoUJN%&Lnj 
zg|AeNsa(zPcp~g+iT7wm3%W$K25>_s^g`R{*43L=mtR;d?pZuSc$FxXoszF5?rQlc zO^^wHJTI`Lf*AOXdY`uMsrNqicPh zI|(1ixc1whUiIybyY?~&4FyXw*!?2eYh%W2l;gh6qd?hykeyS9BXk(k{`&|MiDxX5 zX(!nZFAK6GW-WmuGYIslk0}YJ_a$Wjl7?zPMq+?QYKlx@~x~9LLOe?PQn!nEBmda20Uh zusa;BtP;4Z#gebc=GdneyN?Aw#M}0?xR(u=VgB`prwtK+_jdhXNjoPoj1 zSkMHf0cEXSbsH!#qHWlv`10k(Ve6 zb_q^ARcTLoc+%uvV*7dc!~yQp??|aZW|MXu)KhaGa@Fd7qjF=1c>HCvSfYdYQ(h5Ge7shL7+zFG?_Y1L~3gn<-(86GV__T)geC<}Bn~tJ|bZAFF{y@L$cD%HJf7Qp+r% zkd^ccgxEq3zJ^w5>|%C=g8YfWPa7|+&$jS5_SBm8)Ry+t&YxH*9cfRUX;0cHyFl9y zG*z*S~;%)nQOzgAVH}N^{)9GzcUM|ddk4N-#`*GHIje>Ecu9`VITd8l!@NRiD`u%2;@ zKVgg*lsNJ0JYK(}jN|+%_8RwXlK*QwUcc*5Fnr_J^_1L)vW;N$TRdLBBW0fc#C-@g z_>McJTvq<7eZfBQ22ZWu^_qr=smrd|?{HsA>=_=0H~lX6>vup*+u2W{&v*fPLX$wo zYkxZ>K=txIokrgo<_t*e>7@Fl2q(M&A!9^`rY`O5UOFrw6u5hC+$*v={MUL~?pv_msd(G2JmDK((85giJbjWs&4H>l z<9F5ak||g@B+@iAU;3z-xR*o-353P@a)T&#CLuJj<~y-R%ugIAL+`L27nq!lLhBkv z(ZNz7@zAuzxooEs?O>oFDqGGDJSUsGaE>%tmK#2J;vj4poPIuZaH#*liG#9}Ho_G< z5YZo7QAxHV0IRB_W^k&^Vi>nW^96i200%abt&_0OKA#QlOYCA`zJio>6}xbcregN- zlwUNgOCRvJowDffrGMFH6h3)9H79Rd3sM& z(F*hvP}rADVkc3YW2all-pmk@2EN!s&Ng~6>%4T-(KHSV%%z^pto27_e^eY(j14lo ziPXJtPHdr3XjCo~_=!R(iKB8J1yh-9p8$1->bY_z6Z8`WoU5S(v9oqGvrJ-`D%Vey zvvc)Gxd1tpWPA`uA=Cj$Vs>u2Zm!U)g@IE=hoK{C;>Pj#lQk=TiI|@+H6?1Jxw=7zV$HWBm0SDPD&MTrZ;P2GIV|xr!a=2 zOga{4RAaO75;ExV=ogp7IVz?|*;9FGdFJ#?+~uG#m1OIOiE%27r(F4<|JLCG16J5gXoyz=B~!Kt|G zRI;`$QM+HN-Ou#0)+?J6mAj?N-HFPlrOKz1uA-&Fr5qgZQiUVc*X(8Bns#)*lJ#ot z_1w5`3(gk7JDXb*O}+P;dRNPjNKL)VHFuwWx8aXRexpC$gyVBd;^jxM6I@)IDC)sZ zPon6URCFxqDqZSAp~bn2P8f-CRmPq5$->g>hakG-FOC;Ct@@g8cir=~;?SM^+S@&e zmOWC-p7*?a-!E)j*}YZ>Thb;sS+S{XzGJyF_|CDnj=gwekZ)iQkYE1vr$^dR4S0 z%6CZR$Uq=%=u9*WNDTvthGD5;n8GTVws5)REMJ4#owFD1L_0a%Qf+sTc1m?S?;gYv z#bi&7%#6}U?U%~;ua+M`>m;4!ac7-&tkdo6d!F`WT~P7{lF%Z^Z5N|m!sT}+Ly1eM>{&uo_ll2z3JJe^ltQK^7hX$dsjmKF-ANp53pP`j>t zc`cImY^Rv=WNSO!$qwsIrde2Zzqaj8_*=t&()`UA;_U-c?cn!@S8I<^qU!aot#mJY zzqRA-gXF73Be!)b{_6MPJF66yZ8~5+#41<+e*6CQlG0@L-iJBX4mJi_9_E??4X=Od 
zwNKsI_N~G{$yupFYf07ptJMRPyCQk)1l_COuWo+*Gp~JSwVLTaR;(MCovmm}RP;y{ zJ(RvQnf8Q%*1Ma+>zM64vYV7B?v#pA*ekC03u>9_XQkw-RIKBzy4q4gNU1&LF(|o6 z<((`;!QkGl6E{yl!^M}on1g{?S)ZtAl`2|SD;OY4?^k!PRHR z)e=g)uxCX`Y(Fe*KfGo?q6K!Z3?_PxNIgf^>_^wzyWg35YwE7>XVc%Aj<*f5!PT|x z+g0DJde{H`>hD&^JD;W^O91_zeSb3hy|KIV($-^twRLqX8?T-B%i34V=tDj0WH*q$ zo$3v}3AOszMtObDYkSlkyJU4;qPkP6?o3o4mZ}f0Rv!h(ZhhwFXQ0%QsA`w0+LNV$ zjeJu{@PWxuf;N1`1y$IJ!0Ual^(D%;O69Cqlq9#IOTB0B{)Z2*T1^#4A^Hcj{TJ1_ z&pWMuvBe5kB=hggco?HcU~<#s!!DSsqJxv^&A6`S=!Re?+)7xg3wgdAu(#FJSYDH&4!cs!y%z8= z{l1`?wu8Asd(6*cjJ6bv)WvmW%Z2Ro8q@cx1B?ibq0P$#yWY$FMkWZ;3?JGjh%U*v zbc2Kt4zN$uWklmqgMaQ5zNO*Oys<_ernQH2!D(df6=n{3i=`Zh_t^g6enR_M7u*Xr zG8>k{;&2%%j#`;oZ$(UJslt!~nBCb6hUpQL16fYBzF^0@Hes9?uv0E!DHF|sw5XlK zm9g_RrYwgSSavR$-PKKN=INU?kA;fo2~cq?coy90P!V$JF9k~33wh(_Gp01D*Gr!SEanE8&=ifAWG&DN6KXxq`jz~p$8orvBZILOQa zV+B5++ls}GVggb;juxG-oH`ZQ8<-k=ggFqH!|}j4q&lM>uon|hUc@Ug6f6~6QIyz5 z&gnCQ(IUn#c?rf1|POGlD%L6&CZ0B5PD~n5SY3$ z&~Azlp(n#o#o;a ziIUw?$?m&Dt0l)5-Gm*8jl$(1OG2LE7XM!p7vYH=kV+*4%Z=G0n(uvsBQ0 zr}eHo9weG}abVrR_@wvMqt}n#7<*-iv`C+S?fI{LY;ov*QRS`ao70J+HmRs>t$p8W z(SC?~6<6K*^vzE%PpuYjTRd_o|-r;?09zEFRFzcMsXk+=#E!!1)1CfZ*wBhEwO-=t_@pO;r0g>E7?1?!rn0i zQY#X^M#%@;neTMG)p5_)%cSL#-VQ8ANaVN|G$+XuZj%DrlHQV~w%b*RsmVjFKsuwE zjLulChQ2|CR?VKIHvmyqQoyx=K>x`eQ(ir^Gfnwewc!ZouDHGHCty}R68N(yO~v1B zKj=H`wtU}hJ6xJ0{u)BkEy)w}fF|Y#e~X6xVvb@<-a&RO@SJ&ZL<{>hzsIfYkZCj{ zH2S3+gcDye<6^`Iy*1-f#Q(+fAWc-xY-$1{>lmLs_0iKB?QMviF^#KkWC9LTz6t}J z9AE{HS0jH?OtIyhH{(zqyO8~;DYh@==oacH49u%t&w=I5oI7?-7?B0gK78~m?^wxl zG6X_H8lg0o1slS|yJHvZ1y_I^(`#=)&Ik@&qQGqW9F!`qW`8>SB{R1k9L$~nR4zvo z8V?$xp{hNw0O#yZ1qM!?#RM_QxXq(uIDU)H1jkv(TK1un!vSF&C>5t#1~g7F=?5?z z-yCkg(B2w|&CCY1EV5HUtZLN)3fk{DrPwruaI7Mbj@b(K5Uv2kX?i|1 zJhtNZpczhzVK{V$t1t_vh~%lJ;haXM;Y4TvfzvZC)3KsVkxUiCyQVO3DiEDv8ct>4 zD4aa9y7tKN=LV+XYB1cQMX01=8ZKUc>^mJRPs4id=lI(1_Z>_bbz2`D5UjVJhy)4FD^Ers)5Pcl?w|9Aohh zJD$akf@E{oJD#^ZKWy%W#kL!{OGlFBb+7MzZSM`c=2<6W2h6tG!F&ELW&@#FDrr_G z(|F0@jF3dhnR_K?-uIU&R_!;kvPy@{4=rSREzh)b&yB$~f6x8u`c-OgIIGpiZXCJa 
z()rHrw{|D}70X9f&SHjZEx6%Y8oB3f{-Dkj+=XNAYOx}J&tCP@hYp-E^C4vQ3N)_Z zj=MNCW_Qw+|KBjjISPIjCG7Qu0n3Lp_W>$077{~;)h()Nb3(eX#*W7`j}!Ivtw&CWW^IQm`f zHXzL&8L*-0nP?03nLQ{6T}C<9fDS)jK6sCe2EvFSPD9uEY_0;vIIFwosf*j|;-0!> zd61e+wdz_=W8^8ST^)RBR$WQFkxlBqix^Ac?XDGt1uaq&hU%QT+2H-d1 z6(FU-)D81bSUF~12qgwc#)VLAQq!ZrX#nqdOvh$@B_Gqc3i^sW zMi`P^y3on_tVI)YX1(kNd15unH))eI?X{siGfK=7EGQ91v2s z!4wqjG~-MdW#9LqXA`#kQnLlW_E}|&qm=K!a#X5CWy4E&iFC;!pk3`u^Yex~b;(3#EXKU!}? z|3TB~{mIA{+dFh%5JrG%6{=bf1solbBYGaSMW9rl#MK>d+WM4|Carx{VTcb z@CUMaoA5Tn7QyYp_q-iHz3;2|(3Ioe`o6a);SJC(lK)iv^mCwYu>8(u3+Qc9^|nNH zuTR&uR+RdP!1=IZ`rk)w|mi&bmlGY#_1o4f^Aa4wnRan zRM4lj74xumA)u-rcHW0K@Bn*4d5Z&}SbO`(x*2UE#p`<{*S7UNPkjJ0qQ}hND7s_4 zbLLxhx1W0}Ppa8Lwnd>o;x)O|W9*mP)?A%v=kA?%{VRDNm|TvnUw`Hs1FwDT@5#GS zVRGQSy-LZBL#f`gw=z|dG(_jbw=uHl1c;2y=u&YU6Q)K{B8Pe{nD1zYPk^>mf`sHj zjV4fzFjCTKxk+j|8YD9ZDY1_&1#EIlN?NU?ubCi2uGWt+zcu8_YYmbqYpQkuNYJ>R zq964DZ^MXiL+M$1=}(yGX-mn7bTcd&Y4r+fzPD8}lbu~+q4P_S&IoO5M4gruhU|rG za6}4uh)(1YImW*NYXxlr`3Px=f5tAC0cvFXlh;EmNBQV|nU~>KP73#F z!MR|)jJ4V;!+5_zD~leK+nQM_2uNl-Y$|V)SUa`NRTs)^e>Crci>(j)Qu>|2)(*H_ zo8;zPHKwU+SU#z$q7A7M?7}vYDO9XIStm%;EaO^m@{x^lb0@HZ&Ad3Q5!;&?r}xBp z;E${Bh3soZ3$#EmhF&gJ4RzAEdyh{yvFp+IFN%N6Q}iO?&A==nwNrCjM~JD4P*3jx)=sAdhb~!KGj`- zHrB*5sUeF0PQG5+6W(0Fzns1Ixkt+~?3H;bxris$#`B>*1|t}uALl9bJ3=>URm2$0 z#4qT1xlcSF8|D6#wl>5Oyw3cNeWgy|MGxLq#>>#}3m(D~f{z)C%1!IVhN-V)U13%| zkpufEW6(II{`pEbo!D^2JV6Vhj0-72Ta0Dty%lG2nnK#VgL;KCvfvS+2J;L5Cgi8h z19lLJg0u2n9}{+C-@0+;?E(dP#D=c*?ZLsYIc^# zZc_GT9g{;3ZYGKX}=y2GFj{B6eHsE?W0=Xw2+(qPOZbcm2MUb`5{|0O7`iwczobeSeyz!lDs`|HcC|`Q${e&M@ISsijA_J{Wzph zQ{Yi7%|#V0sEea?nj~^CTX?!vB1HoK$EfYk=UYE&cBVN1@uO_&roX(I}$bRb`utP_Cl8v_L*4X2ooz^wZ-3II@p{A_P$Ax*f7F z8iAY=flUuol#FQMr0fdYX#j*~Vj@!=lHIV{G&U=YhH&^8`oW(NKy1(%311Qa4b_LL zs%#Q3Q&cvo$U$1mhNcZia8P1cHEQ{=o%&LKFm4N?*p!T>=S zAUQXR5kse)L30PfT;*CR;N3zWgbihqs)F2v_*p9AbL7lX{T&zS)B|dKId>9=4o4IP zD3>CI0pr3jttb9RN)CY=$c-U+4E=@Y$(zEtnAq6V)SomxxX{?eS)3ZrI<*xe@S<#m za9%|G3YA3!jbh(cksjmHXyOvZvQEP`>A#C~Fu*f{8^c`UyNFkHcA_C_TJb+q4(cs( 
zzA`kG4#C9Qlr4G>!)&2xQQ4@6zLpX|he_jTBtnz2Lj$3lui%Fi4Ia6O%tGP2r=lo? z^*2Ro&ZZ125>aC?lHJPKQlTS0plao&0K%*ImCnpS<6r{|>Qz_d0`yI#!89O38Hw2M zA!Q)4U2s*P;E|gm$?;V{YQE8E4@v72Two#kNwB-3yR%dFg6Uv5CRg7oxms7|;;z;; zS3gd2yAewiw@JlqtG@Pc)vb*GLHj-5e$~`r^*1cbgNgcGQvI&g!0vc;ALP{bRN-Kw z<>1$*;+1V=^AOT!%;4pIFm)>eiSjO~yzBaj_40Zc>?HyCwtK~GitY!_eQ07QI`>IU zeeuRUlzX3=yX|4F$>Aa8AV|fzYe_!u#Pt)bx;2ukhI#1l2X?vzn6|~Gi_Z0`Kpgrp zeoa`gTPo>ZbSEniT>5@p(;dqjz5j}-Aq4KHJtZMsSn*l~Wa;ygEjtwb98wl=H7y=n z^8Bc^_qFPGJ@I{~7LUDOT6e>itf+Z?=(VBMiq;zr_DL=Gid)t?_a{1sq|TwW&Qr;T zEpLAOjgKp*=iM0m3alJf!~(6P~<(|4xCcW#W2jJC(Y|66|Z|<^CU}xI39xuWT5H@h`01fC4Dfj zTv)p1-wUR7(@wni9~<5KVO)!wJ1;8SsLr>E8m2sMBvvOa484yCgFOWKvosf9vhjlD~6h@Y^T9c~bHZ zD2eT4%WUCS7uL&L#%B z`mHl}{fXYErQWAkx1B~MAUui#ZGAU-622B#P+Rr1e#;W?IlS6=n1&nN|pGUeEz23g}jxcZU;uPD^`EuX@ic+BBV%*4vlX?7P^hVx^M3G+_@)_TcS? zyY-1(C!}2`;`ZR0{V6SV+nT+dr9~a867~kk-mn~Av$w7XAk#s*UeIbm^H6omC42eO zAa!Ucz(EJ>)xPU}31`_z4EkF zOvX@)cgMZE|(tZ^Bt2 zIV%#*M#$TN0JMQe`jfk(M?lN_R`8yBCk$udaK2?*A+9O`zkt&h)@nKow9M zPz9i{Z-pI&rLb}X34qwR6I?~flw}x1ffOhbpk9HbfPgL8QDRW0Tj23Y6YMAv)UAmi zOC3X ziq;KhwJlA4DfN{GuesdgtLy^J;sSqx3-BMw9xUeTmrpMbQ zFBDdPUh>Tn0Ej9{Wfvlvf9cyXUm|K82031HH>(Cy2Mc$1x2SoQeC1lCj27AP=y9&I z7Jf$etd$%tji{z|iZ0du=RZS)wV9%eBV=C#3>Sjas|qhLK3JC$H-)VPbNoG7jbUbn zy!57oZN{uA$~Yb*1l-1IATwhG8D=ElfegcFA_o$cTPHPTj8#{fI^2T-WhRxSvLkG# zoygg14pKgYs=`Vz>J20!IuB7$;$~zU0MN28Z&JFZO-qMzuyQ&# zDJO}vKfRnSX5q&7Aymjw%4uYW05xNwa$oc2LheP%RqYp7jhS+T7E^4pn3&S5<)IuI zDYBSYRuu&+DIunjMXgYY2n-Xf&o>|&5ZbUjW`6pwVIvm45Quj+tqtO;T@MV)$WsBGUG~NxxpPl zI8(>y*9Py^QR2imJf-Fdn`MP!jiLE42Kw?4kFXP-eN^$|538Ad6D<`|l ziZV`N^zcd)NiBi;`eE?!6pfG5#(HK3>8TX!-Ut?vassUm!nIjV-*@=&% zKpJW*P)88ONto{LV@SgR@VC6Rc*HxPHz>YGxB*7WOFlJ}2zVcg%!3|^xyVv^2+`?I z4)yFE7(D6oa5o;?C~=T5{~TouORR)AMg?eK`0B7YN{|Bl>!AkXkBFrp22&)r2=OPh zV@xe$j)?=*xgY&^O2-(@U(yldF4O4+tgSeY!kqYBKrgu@UnTAbwj_+TqWPPFkj5OW z2r_HM_?rKLcg38>g#*|NV^07IA~PxzXV1d)G-f7dG?sFOF{Pq_rey&wl(E5oImhz1 z$luuXS?K^WQ>6d!>6ns(R8JU+l#r`y_wDY*^HSk>wZ!9 
z^+U5gvr{rn+9tW%mWM;`wy=A5sQ;woK8cksEg!2k{K;7a*X0(eyfXw3_HvL1rNj!9 z-YyN*%QQey*+Sj@qDEL#m)3;pI;GN1EbF=6gt z>^cy#9$<@p%*C9wH2z&5G>C?Wu|Y6_g@7P18OcatZ?TCjwxMC4c}$(QVH@Dn4Eqex zQ%%sMs1Czy$&jaf^nG=kkqH7K3r--mEfxd2dQ&|B@}i3^BnJ0p%uGWVt&Y zn2)-rYgd#bY}dYq1t*L!D^dS z2?BgiuaZjpRoa|c)}WuTSnzg`g@Uom^!u98yAyUGsbqHmVk9+?%rIh;V5I?)ESQPs z>$JBhM>AH^Nx~Tz^CC1vW@wi+{j14{q|(aw9AYlkt-7|lqx*ub98EGJ#@zwZdIA^6ys`fpab3e^v#=T%2L2N)U8`%SaburXeNWnfh5(3nQ%Qf6GY)c}KTz)?|6`u$f` zFg7?K9@U-l5Yy~kJVL_N2|Z5&4TZu%s3AL`D^!|}F*t-b=N$&6KYk9*lWHaJ*;p-xp`aQ@A>@jg|igl3QAYH*5y@`JWJR5m?HMMb-AmV z{v!^W*5%Z)A%}J4#BEm2Rxm9I|k}lP!X6*IWY9p<)-Dj9BUDPLQ)w=Jd=@V}#VVAs}(Bw8X zd0TPNs(owddI@fjG4M1ADqR%%__4FzqmBG?V@i1*q60S1aW?+h)&>f^!@1PpQKr3T zv3DBQNfsEyUSRB^*5UCI#7t50JY?cvqK`Y8qiM z*(^B>X@t@m@WSL1oL1(3Z&&B~Dk!CDW|JqTc$%)1CTJ!Ha=sYQL`iK0KA2TmE2a8A z7JrimBjVY>rHeg7T51*nPr1NX3%}YVEfD`8ZGgo@VnQ2K9W|o)TzPR2Kq06#o7B`- z4quo^C>Olop`Z@J4Gl9qrJta7!Ae6gp%=X6O52uu`*mFxUj%jxmoCG;Kwd!9CeTgP zgB0jbjg&KdW&+Q!f)8K7Yo4FDfR7>~wi98(Qd{X4H%$CAnM2qENe*(s%uTMa-q9<= z7cN680^!gEeBZTBdVKWUu*hn!EClf1N2a_#LZYO`9s*{|h7eOI4<**G7p$b`3A%Rp z!qwp^3N7VrR$G!iAjbf@imK>d#1^ywb%jk@PwR5iPaS8FD>8}<^Bq*GSoB6f5u8KAD@yPNZ}5e&OZX{K}i?S4s*GIAkltbzJSxYY%P2!vbDn1t-g zh^JBVG|t*q9r^Pod9b`lal2I9zVw+pN#Wu?$ z91~_?3PC295a6gMUYFzb?vx}wg)h_g;|)?gU^dW|UAPV1Ch6Li zr@|%sNfVz*!a9aaw^u6M`=KpdcpM_TRsO~%si;YEG({VCMH=@?jeF^hrINFB!G@es z;-;>M|Crdti=PGpXK<&(fl>)SZO_B0lQu(g?MV(AexY)$o z@D5Nh3BzyBiDKrJF`3~~y_DOTC^&RQwjYzW9}86+zvn&?cL~K^Xga;<@F(6Xs>g1r zdiQ;A4-+3QPDK3sB>%n-GbH~(sp6m}NL+eBD(so-2Wrv6kw|5iRN1xsqEy*~(p2t| ziudTGi_!ejg`&lmBTf6HrhTEj{l6gf`XifA)@zD43dILak9Vmu#-}wIV|LV8K$1r2 zFS%?nld1nVl`Te_+m-Om$E0S!uoll>M%QV9Qc>3G?!Xr%jc(EWFpN@U(JhlwmPT3* zN-YP&?n5NEh5oo&a#kSa>Ob)lOuuDJW%X=oTX6+)8XKE`z7 zHj~VCg_;Q~SWe`VxZ!D)sn_o5o-))#o_AT3Dv@gy5F21yX2^rVVgQ6x>DC_#*658NUPbe^vVnM<&zx za^<@A8{i@GNS|yV-U?Fi2BU9jBy?1D!s1~~;HU*!Ku!k2K&2MSuWV3mHBLRloZx`2 zh(2#zAD)7>py@_6dWP*ixw2n0nT7FP#!um`=wnfjKY5%Q3Nu+Q%&kb*hNkn)+P71Vl(y5QGc2i+PRn`9=}Vg)vnvF^ 
zDVYS~^r)UXRLLNf@Z(BLnc`P-Ah&yQY#31b8jY=yoZLjYo-?gP(jykQY^M!`L=g0H z)KsuBXNk!;d6W^7(_2phQ>*q8QVa)&uC(~P{YM5)cnL>K!Kg}m8#ODTO9ta?)!o)b#-7=IvX-s)qS#Nsql>g-P*Y+g{zCaek zz8h6$Ds07Mk*rL?>iNA#MEMmgok%>-B;IirVPc{6l{Lw%Y+x%;BDGSW;-R0Mq&D6Z z_^b0IaWSx&Z)|+wN}~ct*2t}7{nIcIwCRjx3rbqc$+ z2}jL2MT+Lm@T>fUddY}pM*TB8Zf2Lu$o*rOBRRR>OGkf-5u-EMouShS+G&_u_vn-% zS~9Z!DSC`i`HZ4xn(pIbV41{BDnP%oHXi}OJDGy6tbgvi4 zQaniNIb|Hm(X7Q>OmoJOZ9k1$`5}?zr|>w)@^qo7K2q2&6}A)g3bVg!*RR3YRdV^4 z{C8cuL)P6P*X~tU30IvGcJzSMK5*AH5V8)0Tmxj!sZyvE6gO-CEWo)+HOe^ZECM}L z1X4N=q$`jb%U+v-<~3A&AZ$G_JHsK>?`BmjytLHv2iIT>X|qS2{dmM?54qYweG}C$ zTX7MMpi|oriV2kY9sFMyYB9&ry}BM zlRRxpm+yLZ0rldR<0hHkX4gGVP6p(KRcLOXGv9MJ{DiP7ld?}Cy15RHk{W4?RJdhn zSXRxGE#Ygs=G%Eix6{=f`&SAN{KUwvkb#sjgV1A=rzzCd2W!x1ZgnKLMapeiDwA^C z*8oAYC61CDF`<<_M@(y}>=*^y@{m~=rVAyFk>YJD#oNd**9*J{#KM)0RR_>qWUSf_ zId+z$h)HE36}RYe$x`yueqnS4{iaa_P7Ui)iwmmhdX@Y&uly;~ z6o3e)>zGv#q48q9g4%#d0LlAEfpNqN!j~%lU@RFxZqxgNQ{y8@tUZJ2S4Ne2wkr4> zEkE+&hWxmHQv+M`WCNeE0NkvATBf+mbGb4@?$x-sg)iLcNPKmypi?BiG`+cerwp>_6sKYNEV!aHmDe;@jCNVVz$`G`t|?Z$TDOAjezMC zq7_8(cVt@t#OK1OlD+v*9(993raF$)iz?c|OtKc+PdB16w3Nz`20iyAn-H-}mrJhl zto94&e1OOqS%_#xof=z(vKv^vC_3SdV{p?s$k;l>tCdSRGJd1+Rj{<`Ty^6w0hCJP zAAxxQ)2Ix5dE8!twz;Pjp#B;+te8>Z31B-{xDo+W1*Pd?!Nh67$K#WWpD1}3C0 zLpSbEqBh)JtqAYn5qmHq*Eu3#(qIEwWRnC<4WbvBiDk4ar`_MuHJS^=3Oc2^fX^cg z$5fsbUZ<<-)UbY55Wj_+#>m{P`h1?Cnr@|Wzk<}i$+#~<2#yx1s3lUgO)A=Uzi0=f zZhIh&YG{r$9F`gmGn;rq+QNF=m`-tAeB^48!zx4qN(R_BVV6ckoH$25Pr%(jZH7HffLm_)BS7J9oJQg0D4PXpJTW zW%5vlPk@68u>Cc&E=Ti|CXUzA0?F6Y0!d^Np;?R1rk%xd`l@v%c_Vx@!dSbIm~BQT zATXse8_8|nwvi?gt)O6_V3SNB1wUP$zP#coP?qvNah-3>`b;6 zOf$*I135B*$~;WErVc&tcVgnQI5H|@TzAyfc|ZTG zrbmSyn1}|_gl@u8etunMCx=CtBX61+=h(;{a$Ux%BKh-L(R##l!45TVY9{_LBdL14 zWYkQS80?~Ve?Lqt)45TM2Qpb_pGqxnqapkcyXKBu!qa*vct3aQ+lStlhq6e;pcMgDMZ)2uUEP&DhDJ9gdmz>^=QuZCUN(}}z?b8j`= zY?z-~DvGr9OD+8?nFF(_tL4>^@@-Q2wn%v=Z20e&!(+Duw*N2dJ$6509=jE_k%}!+ z1!NP;x#5Z)ivCu9I8@#$y8!G6mk&hB4@>2T;WrvjPO8Y9b3w+C0o+g0YAuv4JiR!9 zhFQ|m9p<=g*(G3bx0VX5{O2Et<*3l6}ATv55gwJpbbiu2DNYo+%fi9qdmax5t 
zPfJ@Mi6qScEhqYcSh|GmB~uW@oiS~JdpYc$0V{7rIXKBLPt}1Km&?eW*FFxBww>$O9jYb$_$=)?tdYT#H1=#Fk*~g5NkHl+vuno`zCE+z=Y}Q zH7eK30qPU(m(0G%WEq~9dYEw~1B75OMzVOjutc(jpb0H3He(k{MYs@( z0gZqWTj?SRsu3Ajp_78#fMyrvN0FUlR={?Uq&g^WLvnqr2t`CQB*)^F{h!WF%2p!N zhm7W9o*BbJ0fVMItF48~mBieFG;L3!U4ax%fCYP`<~?^_yd#F14@k~NpdHv3_D8*S zA%BdBe6F5rImoqQ%x{Uk-m%)gmCpE2Jv|03qeW8sXN zA7$k6D_<>!w{uz%*fUQ>$XU)=E!3aW8PjD)#u2~;VY_xw*e)%Q3*m1-#dBfGFV<7z z{uwnReGDwtOp8EEG>`Smv=W+^To2{lK+pVFI-)gOY@}0K>qS2u(dQFeumexdBxgJ5 zLO1Og3QZdwvCo;#$uFAQ8F)x4keAtkd2Dj@qR3Y9EtJ||VdlMpeQTO0h;&`Iak(51 zW6U5%K>Mkmak}m||K#Lh6`VlrUJ)K8W!owq`vs?Kb~56slw6fzN7Z7+T}K@Sd*JpO zcb%scYabjgnZM!uz!_;iB{iSIs+Vt{O#?5USM+J?cC@)K(!5`4-haROAh1K=6F)y5 zg?eHbj^rWdp^)!T82hZ6AGYaWfJy334MbSJ6OE=j zT$N{a{gcWH3vlHmqlpbq8LainQ=8HPSWgo-rF;Z z8myC;&JF$DbvdeU(_K^FqO5rwy4eU?S*bNS8DhQtJ{B8-j5e|?F<4uP5>ZxDq9tCj zAAn^7V6n}5h-&0ll-P2Z4ajNBz}^IlmnyAb-@LD;EHMDN{CjXhvoI?n(~UP!lZM1RK-;A-`v=pF_u-txAeYZKZA^s{ zJi*cel*JF2OBgYe37h78x32%G)-iG)%RuSKNjhQEETZrF#;pi#uUnMKuM|no+qt*!ludASdZ>VxQ){pODEYG9WlD2 zw8v)8qBnk9X&0Y$I%HQ?(7li@$pU!>GXNuRn1N7?1DJb25CU@#7^RcCiyZkcF8%NSi9cxiOxGF-!a+zy@q6Q9YkVU^ZIw_8gk`5InHdO&AKu3TkC)c$WXpAHl!|QR2yD}hfTW!oWL{%Ig zy*dP&rLnWYSk};`iAy{<1rvY)BO@rz5~&uV-3{86(~ePnObf!)9?W!{R#~x!cK;kn z5SwJ;5?8WR(wJffDNK37s6%G4PCb~Djbu{7VHWn$Q6D|AopufMwno}rr%cTdC;{I9 z^q$Bm$~Wkim3FjjiuH8#8l^;dpo?&k;L%PTjIxY5S=bG^OUh^R2Z?z!;u7*kMOsv; zU4?X5W*HKxd`+Rl>D1)$iL0?x{`rC;n0_Hhg;Pcgez{Hh8@M>_H6ktbJ@0KIC@Jn3 zrmKTru;%Z4EqbW z;4KeXiyky}e5*a&1iPDtk8;Xq_b?12pOoc`WHm}zjf-cO0^zKkv;8oe31xxrII)hHShqwdN@bEI}a!k>E}|y_Ijy+@6V) z9+XNCvIj}a1ai`gc&Y^ImYsj=&{q#dJ(UYDhCLMGU5U9D_Oz_J+e2HB)t-p^u;f0h z-h{z5v{zL^ZbiIKu%$(;6%s`p{Rk9sdsktd^7L!h zqVDW?f#5Eh|7^(R19(_G9~Fprc1fOH5zlVPv-?iYhZ$kd!Px=&JV22k>z6%nHh>B~ z`vKVvu@JHByKwCr>GLDbTFF_vn1jG6tcl7cclpBBkQ*FGp6pL>YQZ1aw911`FHZ2n=V%u?Zji z?lb8Wk~ZxjcEcaeK_v!It&u;87GG?tjQ(3KRfy z%1T(%FOOptnm9L(g@x>zUWB4!k}dEXmiFQGD?4*gp8_uM6!#&NNd}mZx;O~#Wm*au z7?HNhb9qb(-ANIdML}a7#lhy5G4PF);=e)KnQZDQ0@D}r(lr@2)#&7 
zjg(Up$=NF9Yz^mZkK}YqIo^0lwS%w(2Cl3cD=Q0DK%WNJyOvvRdnA;zH=;G zu@@o3=2GUZuwPiHjFh)a`17WQ7nkArZBE@!gjUa|1o;nF|om-dM?|HUA^a%xCSV-ogFb#s+8?u&i zMTbo#&^AbX+5G%Vw6Wp*oTLN$Gd*cmR_0TNp>j+XpF=(zFwxw}uGY&}Qy6?7tqlpA z(M%1t8()+*QWnT6mE)L&jGi}`%XQBadWO*~9WdJ%VRjCD z;`0J>g2d3zikHX7u@o?bH_iS2_$Ztr#uLmY-~~a367tLmwiV9L%nO{>8OB|bF`KrY z?TMMGlri(s<45{VP9J8FDN2dt=%^dmLyDcss)e-#Zu^@>l*DS0$ybXWG&E330)5I7QYCq+BAy+R2N5vud3vJRr3+i{BK&N5 z_1r zx{G+k$c3C0H<-2J`LlODRq-8ycU!yz90|OaIWr0DLtVDm18kkhsB|)nK_$7BQFv3d@XR5)xJ=%2DsVuAkY#%svSK&y`SQrSVa;Fy{0+raGkd>MbAey{YN zU@xoz40R5;YLu73LZ;;PyVNGePd%vwrW?Ir{VMgPk= zbLTj)4?$=#(IE-E$6>$479Wkgcp6MAEv^b$3yH49@t2svq(ZKcxOf!cRanDEFV*_A zho{eB;Swi}?BR)PWc;qoHFNkAw*C)Zz9>5;sxnRP217U2IrD*+lkdP2@dUmTrwM8> z_3CWk56}%J!PVrzOf1aSc7|s1ReX3cf?do5CK1N_?9IU;5~YIlVKJ=5|Bddm5H1j# zg6$Tcri@93%NfsSa2iVuj0H!4NW(eRr*KJTLz#DpP5?^<`cfL=mb#mwx>m<%&g z+Afv0Lt?3uy2@fbFG*YbBc2w?(-L;K%4ftjhTV0K(^4UU%@q8tgwsJzJPTWa(?Mt% zlaZPZ34gARRWF>v467T*E`eXPe>xsl9jDYH6~*&t;$tYrG5&MGPfON!U;lxlWH zYW7Gqd%`vQBqxH^Cp>a5(~q*}mPT?KR&p9<_b_oP7g+Mw-E$&#iGaW)R+g$|C8uRw zDj=o!7OPjBEvs4i3!X?>n^e|zH>)k=Yzt+z!NW4Q6{W9jj?_>nlSoakRMUIErXMk< zA#yzub!E?cBKdwPAMT=l0G`a5d5{WFO71$C7t{`zKG}g5kg|P$A0!6zJ2@5msw^K? zS@t#A)`40OVZca`_i+Gz8+*PPO<1&08yOPJn6zaL(-T>3C2XX2Gkh}1XsJh9gM}R! zr!t{pg4via8DZ#;>pkKzbqrq_9=kxk;|=g|W^ygBe&8w`2Ido3V0{*4jHLnbXh;QF z;?o1o3>co;w7>?XQ=b!eqhSe?xz&&66hqCa>PW_j`mQw^sru}c8RY^E*rRy<6+w=? 
z5q0iU`xdNj9;cdZc8p~b?Nii_Xl`+o9OIg#f+iM)Vuw_;gAwDcQgLhCCfEvoAy{m7 zMtf?t6US*Tqy`vKX*Zk@)x|o)BFeFuxt&qR(AZgsR-i{_ptA?i74*3ybJ`}=?qT@p zRWZ0`e|XI_>EO?TT2t@quBj_Ddv7{iJO!h>1p*W9D_QA&jT}yObxN5r!&U>{8we?G z7?(kzg07|N-m8|BiHW%2}Dh$_~-pTtC&N?Pq##;UN9%{z1hV2EoVe5?J1qg0rH(@%Xj*|i!#p)2q?2=JvT9MSF^X2NHdce!e0%jXf ze1=vh6!x%I%|Ja+Xl52~p-#Q59>{N^V73kFMfD)qg6mD{sd@lDL8Zo75Lcr#f)RCx z>CN(fiZTgabC3TqK%O)3KXX#GM=>soLR*3^y&%(oS=T%>o>$0tHG3xMTFyA&kY_w{ zYvYP~Ou;sdN|-#84Io__$Jkh~Poq@E^~gamQmTnO(O%0#NK+%57+qMbT*@`3WMa*$jAV8ZR;zeJvj?Z@^EU-nf2 zowMrvM+;+xpgFKt0%;FcOBlg?GvXy7=GDyvsF@MxSEl5k{VH!a37^4RuL_*QNO=1n zed^n-32*P!-k=;A&J3z54+7=_CSdMY(rdrx_iD$=(M*A%wGyag+BW545Ayr^z19LV z`ClMhCZj$7(oAy^tPXzSP76!4i{WT%;$7e0@IdWksUOSO%G!P;61? z@;HYUOB%lvOC1gli{kJUH}*M(l;Z!1))AkjBgWq`DY0E^8u35SEk^hKfR31GxM00R zmun427<0x<@p&}NGnDfe=%If~I|@??rjZK9xm6ZMg|v(u8plrG>L--Q0p%}pgXVT} zG?sC3_yt5G1?CUcg=D-jt0I!-c1)T*5o1%oOfO@$P5Y<-pQmhxXm^8d6l3Qr`*?aB<*HKn7GOU(8K@nmX@uNTX%unoulbb`HWh4!ECK1|`*V-a+o0ki#G6 zB5FVd56KejTsiHA8xWe4jy$ocV%fTK@HnSTPwVEFd6O~&`V2V(?Gg*fBc`A!kP8K$ z)z^vTlEW{rdo^i+J^JL7AxkZ1qI(Zr76a7Q6=>!lN&Gfhpnw zim(K8s31M!Wm>rK6seFMg$&q`g)HwDi;C7z>;8Y=N6HN##!X~*eWP$}oge>nP1&5>p#GxxjAqvxH4?;=+ z4kf=q%5PYF4sr074@&v_W)H-ZY-ycLsK8+!8zgtb;^{lvL+*wZU^8YSo*jx`>0RN6xrJglXfYct8h%r5vE zBegwJEmKou(j79SE;!JCG>aR|04RJ&`%jcUlwIU!%yJ$E;R zCJo0;kGBg|HSY|*H3%f7aAkkUS@r>>fG9 zW3l8cmVr92tT_G1j^sKOkcKS4%cv>hgyd-Ja>{qjk=@6n-N(YaPlUFeyytxCSC4y= z1yAuKAvLWN_zzq`LqJp8kwrEcM!;AOz)jh=TsK`2cZcNexQFLdwL!SW&CSSat#Q>|07e|*D{jw7v2A5osk(4(XUNq#JBc0Z+#C!GYBE2`ull5> zz7=;rIQrL*Tt5ZZK?-+*_8;(x*@mZsh7u-*>jg^HHF`50Z%Z zJNd;2i-hkt_V^DK3O^{?aVRhOKjxZfU)bArBqRAhxlP!w6ZjBE&UpM1%v#{_4Ruc7 z;K6l?Rhp@Vk?`i6Fp2q$Z$=s>2yEp0utSFch=qaC#cDL<|5aL+9QojYm;BFQ`gWF~ zRS>=?zrxRHp(c5Qk-?lon;CKTPcF4WuR{@U9ws+0yEdoy4PT%z91L(A23fF4f;n;m zWB>%|a-4I(laB&RfoYn@Cw19CFi!SdwU>03RY3&#K)4DJRIvSqaW@DTO=9l#4)EX> z9kL5Q)}GF1;iljEoZhLQqoQ}Vs;}BW7*SQ4S$$gA{@GUc)Ak(uX=nQ_>f`Jb4*z`B zqmY0z&o?)oX`Xq$h4$>Eg`Kn{oNUN~E!mWVoQ0olc#3jTPg>OzzgBrY5rBI0LSfJA 
z!B3xU;WglAS_3e87_RxhhwnJ(OO2VyH%_Du5VNp0rts~PTiX$;9pnv^AG_#Q%5ZRU z>e8qpqQ8ND#;I8w7q1hUkjevz1YgX*L~zNwfrl3(#^F6TxL=U|3Fza(z&;VaN~%`*N5Q_9K=HNVRDBw$ie z=?v~0=S)wDUCa&4;28!4Ygi&pH1Op5Am?PtMf;`6N15JGpLUeX;LV$q7LzNl*zpl;-dogL!49MtpFAZvDqq=@F#bs?0 z*ngHImxAhkiKY>l$6Ntx0RR~}6e(p#As?YKQ}Uxr=fqWK2-=!Ngc_8I!y3k5HHgl*$DrVOT z$f9$`Y@A_1fj4?^i!UL!O;iU4d}Sj|Ah{?HCXhT#AR$N5gZ#ob^Ka)bv`2s|S#WBgVZPpgm z&2}Lx_Z44fYN}3ZWT3m5wnL_blE~cq$<&ahnE)P#EPchGGME(DG_`T8sMCp&(J2O> zYViSF4Tti$@u0;dp(N1#LZcv;>r;p7J$0z|O^kzzAv*yEEXU~&He_K$&LQT!m!6*r zj!{rzChMS^+E`cO4xV(WHIP&A;{1&6gHz;oLgH!O6chb#n@=dw-;IP^ugZ=N^gVJ}u=w z9d)M7ir{zi?6Xc14-`aPe&7znoo2o=QqV6I^xwIF+iv?$?e6Q2j5Tas>-Uudm$j4&%uC{ z78dGJ2b)eQl(@I{AcONd#Z6YJ4>8vY>Vhy+QS;k_NB8uwq&$SQ+@XMq#r!nUtI?^^ z?a4!vIy8SH>|*JR#Q{?DVr8PRd0yq+1vO8`_Xyiu(zf1Pe@5GaN(nTu7aSy>QiTwtpiTf>U*)n!;M*#po)ow?Ph zGqV*6POBt#-WZ^Igu3${-kmeionPN|eOJWMa@Wza3J1ZwW+|_EDId-ZfOX2-J-Z(Z zfIA%4!W%<880&kMM;Y5!DbY4tf(rxGkMoV}Wg8vYEj-x3Q+tM@DSAfvc=JWCMZ~M_v;&mux%%+7UiYXQ%jpk-fetj=WkB zB!c#XJ0>gok74h6MFy1s5d*YQ%2Qgi!;27ZEwJ zl910cHjY{`vH8Lfi?LRbSXY1Fh~Uki$y9_u5o0z3SYo!%%k|u1c*+S zwSQbMN@`agBoBDlvNDZn6vcl=cQSxp3LYK?!y*tS2#Hoo89|Sapg@LTw+Wc#QFE!oevQzq6$<@9w{KmhObnz zmPV{qlC^5ZS`&4+B96+tj!Gu==SmOK5G=Jv+73x=hgO`2qqROlxUPy;*78Vf+&K4s z<=&9foA6kuZSOtjKD@1JpNUnVT(Xu&thJK0cEwu9j0Bsd;%3CRkczk6FWxbGI9gI4 zDQT5TTJM*%!+VFE^0{)3=_AhHbN8SEr3jfJlyt}yD3q*)5o?)bEnBfxa7UUIPaUul zNO8fd&;%9DlKGyKp3CL|_evydNyJJx4)?6pY+1l!NlPT-(0oaQgT6MAtZ77|EyPaR zHRDT&G_XbbCL#%&I7s=kbks|`v$XqnboVdm=&xw^T{?Bs(LC)wOQ(D3i0rz>e?doo zOGhnqL<5V(co8e9zmX*-Lp2wB=!gM&e2$VZe2?$aHCpJ!H)uyr!6G?ZiC?EvGW--t z-z(DE&qdCu;{k4wK%yOy*`pH$E#}Yz3$!EBK>S0x#+KebI_;<30PTnYWV?Tj6hRBt z%vXgUnwwTF>7OLMXtH$21=_DA3n{KRzi>ABa*}1+!hyIzzsqfF>^DAc*=1>6P0xta zFTSKJGoDIERw2JAZlfcZi+ke^I&up3^x2WQw%O;em#n4Hl^Ij1#lLiNxnlY9pU-@A zCe(3I+Hxo^;9}_5Q)~1a?={&hbxSjA0)FB-wU)+cPJTR@ejCvZ=(n?fjeg@NO&5|Z z_Qn3CtfkR!T=?KZsHInG=vx!$YW%a7Hp@ZNswX$j4p35i+FB}Ii(fJcrQT>s=|kJe zBum+9eqG!`2S|b{q_RU^8+K@?2OaG2f_ab0@)=XKv^k#4k9rZxg&oC@CS_UD7lUyD 
zKg%_1^b@bkwWNn?JJ$sK##_rQ-HYu@$?rl#iNp9FQ?q5SX}NOw@}0nk?V+P5zQ5z! z6QS;>q>fW-0^N!SO#LQH-Tdf6-(vDx`xZx*D&BoQRJm0u*|sLoo%o@oVvBbnYvIIO zd20d=<6X6uqG*2c{K!K4q6H6ZSv)V*?vN@vLM0v0pHuS{t=gQkC$8tMS#XRd+-B(v zRqtFA@EaFR4zy=mT)@xLxi$KU7Yrp?+Eyvg+xc+|ouHixOXq{PeeqOw3Xxx7+{O;= zLU!JogAO0{S~4u{alcSj74R_VY;q4IV^oSvW#s1=0wli!m|_IpXUOrVI9 zzK{hgdINeSUp2{zX!5Z+`M$ zH|Fe;pe`TJ>Tcupg_(yPL0D9y3A1k9r;XOjY!SIwtW11Y#- z4Wwdk3%CO|T(bwPqYm7+d!AdZa$*8%lVN5=vu ztaY(!fjq`<;ua*#K1!9aDT9f4S%sTT65m3qn8jFv(vPk+T~6fS!BZ$P7s{Le$y{-0=ufEY5R(AD7J=KQUYjBy)C1QKeC1Hw0x8u-9+&1DRuL}_ZVBgf{{p5r zz=sUsI^W>ck~HW`iDf9QG!(eZ5WvS{X{-$DX~vl;7$Iox<)eX^i>&d8*cHbvL7)Uf zHoB1DhaQaO>8|Q+8H3^>Kcn_2#gmXpqzL7e(PG#>RYXh6 zqLtOr+Pa5nsf~!r`xP6Xqt79?b(1koTC8!7qWw1$0PuL8M&B9ME0jaR7D$@L6sk>f z#(gJ{OnwuazJzU5=|P`v3vh|8TPPFEP_-PAdxUEq?_^2-{uS#036hXxu8Hw6KFR7^ ztl&z<4!Qc>zKQ7 z@Y{DxxOJmL(}pQ18M^P<;|yTJJFM6`uNNdvU^-+b#eVV6(0V`R&&In4-{sR5#}-T! z`;xiubp0lSli^t;4FCU~+PeUm>)IRqpf6%=maNT7S@*14HcnvwZ#PAgVN>YmB5$o` zlkM}1-$FIT?_f75ewPjto8c-oL(^~24ElFS22Ig`sNtN$UpjHmx`WSIUqo{mcMW0> z^$>6x#OT_2k_e|Z8Ze;82q>-HPYs4H`p`v|V9DPEvox;B|dnXMoUJdy?#+KsI- z!OJ5EAQL=$`GUGk1vh-UH>nQAhILT2A5{1(R(J-d*dVzYmQ0chHo-0I3&kuDc%sWWzsPvE&k(!Rr-~SaEiho4A9_(T%AjqM1^I5q1;Ri?;GrbTq zU5Pn{M@B9Kb#9pG18O2B5@WK3m~HqR@Mg|I6FFUC_&wD7AJYe+F-Rnk=k?#Y{yQtK z(r8}soA%rGNM5a!S9?FN?xuOpG}i+A^f$+Dk1d*RUzl?|$Sqr_e}8f%w><>(%G~y- zyHNI8Zn+05lWSGwawm2Vp(uJ@FaLMxUm^5FOlDNw-gW!}_WA8^*R8j7m@@ zbG?Lv#wZGuKlipJ>nN+RQ+N58q*jQuodC0BimK#k-S!qj#qU3~DuF zGrVekVmX~^P1*SHrO7k6bvkblpc>i?uBxwt)0sV2&T%kHat8C(UJUxCGmq|Q-f)f5 z5f#at#&qctu4PfW!@U$|Onu9A#>wZ!iOc7nCsXL!!@W&-M?R$k?{Ws|U}2$SbQq1R zk+x%k;O6wLcmq=(BYuV5ukU67KYq?Mvcu>kJ_nycneoqMWiE}ST!uYxTPyb&`wujA zCIgxiTq6Ns$IKT;hsR@floe2$lVcY~RcC}lz0+j1t4M=Rd`8PC4lf(&>9>QjK zUhlloz1q+it*+7jYHFiZ)zL=(!wj1zJq~?D`r{lSJNMRsn+IScICXm}QpQO27KYji5y!lPrZCj{(3xKu4Y1@{oq_pnYB*iJI{mmV> zcZ908N`>3LCI0!;H>XzI`)4f=oUS>`mk-REqAs_B5(48xtm+{bP%*M^ZN0hmtJ~p( zl#xH*bF*kRWwp5a&F5}E_b-P+)&ls6gsONnV6dIOx(DSoi?*+)zndP0$goYS?Yd(M 
zm-pi8`$=C`rF5w*O90sy*;wPq=*Ww@zOH5aOkba7EjLvg(DGmrdcajt4&f z;^nVTzdQZj%<{3YuN%$iwxbzkw1st)6wUKASFe#-sGerTEQ$`2xN(kGne91kMVuUH z!T*l5eA!PVgQ>SpHyW{a2%=JrSCWJ~@dyA87L9E)Wq9L3<>WZC&B zJ;}25ku}NEv1Uz%fZ1-cw9Z$q3HXVpq(H!2+7uV)x4nOje&g;ci+$cae`29x(ehS( zsJKze^REfG7>74&3Po(Uv@KRVLI!Q5W=VfY7kKfgWFz>mZpym3MmBO^!dKTsbE6Q= zjhaU-qbd2ghhMfPbHQ35dDJ?}FdZPWGmqMpGg<6rofQys)$*$Iv?E{(q#%WD)G0ri zI%g?L+Whi37GDk-j}dxdUnXf zPnGb{{PfDbhnM75@#wc0MHNPJRQ0`(Kb)aGb`X(mnXq^7?snh z%TpOH5ZJPia7ov!2nd%w=u12I>vtdCJ$&Y9_ql)K`L}z2JUyu#yxM*4_iwXJIe54G zTpQc|opR-y%E9-QTjB0=yzr~a!OyyxctO7G**#r&<-&!F9peh_83W-`*68V-`(U5| zaL(b2M2IM&TCXCTWjYWh-GSL(AMnoz8ja%m0uR;C_1k9grzioihbF*|fskO5pIGXd z%cL?1vJt8;f=W=RMc8Fr1dX&JtJlrS@N=4Tm}|q&={xG*v_YzzP+4|gtqxgp6WO-e zi(oSHGt4GGCo=ozCA6c}@8<_;7ogpT*p1E6RR2HdD17r+Q(N56v- z=4+pWRx%y!H@$mZOBm9JS{VtpJn@T6m#oh`L1Gr1_6@2;C%?COeUsUD75|3Tt#?qP z>2kxo^`k=M=}0vq(k{vGk$mm9986av>OVargij&u~O52>YoEYl$=} zHSvzx$Hf1J?-Apef7*i^I!3G{A?t(>6LZ4m6hdN@g-TYYoUo@TZDKpesy0iqt^t&g zw+8E(_@AlWAJXm-?Y==BYmJT`(~bbe#ZPE=1#P!6fbA=sz@UZ&`-65nDo$KH%r#Tc z0de(DCJ{`3f{cF!fZTM!mVP}|vgXYfE?ihT5ZZle#rpIkb9$D2HZ@)%z(l8Ft5mTy zn&FQYJRS8EBBEh0Fl=+n;D22S^}nSOPUl|e$H`B7msGne)cI7n_7quiRz=~6t~?E@ zqtLtRDWoSI?8y}VWV+FlKdEb4GA*?(z4(4pIJ4v3riEj3E%TZ4!?$z3+PT;?n;OdO zh}Jhn>bs=+F12X9Pz}(uTN+TPS`?~fOQgA9YKA3G0~`Wcpv76SHa<#mvD$eB1k?j} z;M*0=XrS^{s^zPLsYFU0GO312`x8TV} zJh`H!(qu`6woyDe_Bq@_P{V)si| zTCJUcANS2&j28(dElb5K#ocqZs5@`IKyp_`+zdM7Gxyxv$h>g3DPD{R9(x5pq6EQ$ zZ>!|q%CaSRgSaIbumx5jy)snQ9dhper6a|U%zd*5L-{_8*OC@sX{;3YQuc-O2LK5X zao0=k`o;bC+*>L8KGQ=N9$_pG>_bs$3Zl9bKRQ!AlIC&V?t64Z45mmXs!Tr3jrMx! zl;&CyO@r9u$Vj9H62Fffb8Xno=i{%@McQRh8ShCjQBk&cp2?}LCejHwEJD! 
zMUj?A|DxJZe}XT<2%#MXMVd+A)S+GR51$zkX~u|UB5yfdt^e$0Anya)oj45&%ZglR zk{W?eP!N|vEkW21YZ@5Jh%|-prnJ-1&*+{7>nxN=Ov3e-bd@Y=VrHUAShTTNIvx61 z++ix1NHamqyZ)kIaeDG|%UmGllT}eHHz64hSU(|(G;zcn>@MnZcI+H8)r_^UD}cd; zsmGbiFbDF(Ek&7v=@^M+SC}g;cqxW5v)ZKDLnMoJk(d{eXvtXnZ*cFhUhoBY{{;g92ndEOKerOFJg-`4Zh=G9YG~J4!cF=^C3< zi1h}V8OSmJ&@kc&*3#_@xE)L97s+M}10cu+PxmOfWB=Gpf*j8Zl${6&O+zxmzrDxW^V|4T)?U;RGy8OxIy2nyZFf%u1RmQ|p zT-wq{5A@S+0K1rV*F^+vx-i-$mSgCn1VXR{Q${@5WHPO#2&T-B1jk20I@{Ym5}Y3i zZg%YWSg80|sQi(z<71)hM?#Y%H2pOtLwcmK|BaCSu~7f9(DSj-{;^R0Nm7a_=@$Za zpQKw%Ng(BLE5aen;z^FjNj8%wln3!0ej#E@asVjG9K&{$Ni?uNv6(_r}{DMoK2mdT(b08O!kKY?LRqWe>vAw^H89DyuVpUaz;!A zlBpn`lx`}Y&x#B9S?FUwi~E*NeB<;7r=^D7kLWVJuaKpSm$O1XYcjQk3hUPdI*?04 zhmU8Hu9{5SLOBh;5a{4n@gwl1v}QdLl1;@?G}Yi2218kOE9UxWX3i@I0c3sGoE=Za z&3^zy6s1ct?ZDNLIa^J=?Y?<@jKg!_ofNo%}&|dD10xiU~gUW_k1SW PH|FiLCV!Y}!v6mOc^sM+ literal 0 HcmV?d00001 diff --git a/distributed/kv_transfer/kv_connector/v1/__pycache__/offloading_connector.cpython-312.pyc b/distributed/kv_transfer/kv_connector/v1/__pycache__/offloading_connector.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d7aeb2427adb1eb9836c4f0d155d078506a479c2 GIT binary patch literal 24140 zcmdUXX>c1^eqZBcg9HhX0PhRD3Gosok<>xWNED^fNzF)_W9ji?A$C)u1d!YYD2ZIq zJKi0yDO>hXj=kcPQ?*Ru#8l0=!mV=3s)|2oQ+8z?rz&Vs8erN|mM)Kvt;z?^Qkz-( zLn^=j>jTF$HS6`{LtX)|U%&hO->?6&xVVsmi`aj2>b)}@_c!#Sy8il>wsBk3K5l34wumF@9Cxy?JyHvp7Cd67`IGSlAWuMoY&_qh;e|QQx?a#T7>U(em-~XvKI1doPMqMytlFSlAt@ zj@FFVuyAptHd;4c$HFC%`e?&=0}Fd1jnTk(fQ7x0rfBncGYgkSTB5Dvtt?y?X^Xax zx3jP>(h=<(?_^ zz2kc=oVag%z{-hr;@}6EKXT(kqSMN~!U@&yazc$*^?^-||3H0ku(%KZ!{STT$SO2XTIngiJZYmR{U@zL%e(gR zBlFH+>yPoy1uaUEAWnwoB5`4AA|A9V{HeGog)c`$Efk-TkjW*4$zdk3%;uEx*y3_U+w>^lMdz{FPTLf<0 z`Z_1tuuB};zHkbbVWj89St)Mb7PKq1nI0L-q0h78mc*~eLh%_)YgkZ<^|u(U=vc5o z;X|S6a8wM1l)_LbIztnRusam``dm1orPyi9DGrR_lpxiiS;Z3yg{P-y;$hlKu~6tf zw~c7-pRV@BqqBXpQ?otsuyjR?_uQHrIP@D5p>KYpG75 z0;%OJwNj`Oszu+VQ>Z~)g-|P2B6k(~SBLj%pq0lp@+7wAuhjid_N_Q9QIQ2=r*t 
z6ZWjhb|6eAY5V`c`#sA&E^Z{=b|kppLVCiIuqJFew6<_@D#P40ryj~Y6BZpCEZPMw z!G(<0*qc!{VU4@>QrAlKQ095P6MM8YUaH3=?0UV~bIcHn6=)k53&b2Vqb^%vv;`XZq8M5yFlivR%%y;weyep;ZHoR zDbH@%vwOXE!*h7azIpQOlKn9sNROPLXMMWJ^SdvneBFQ)kBYjVb{u;+y3uiJY4nM^ zHC){8&M(Z(3YTB}KB`R1^T>2J`O=tuF`l4=3wmILx%ROwS zYb-j~d&hd3`X1UwBOVnUZQSBMn+18ly8re6$*ei7JwF&8qsK@W%Y{^;k1 zTFyQ<5}}EOfY~|8F8&r3>xO4``VIJ;^7PA|{^XvK4bQP8e@vZ{0?Y(4B2oj2E57MD zBDJDmpfE}Cnd{e3} z6Zs@I137?m9MKECJI$PVm2aa=Yyck4tdsNlQ=Sgl(~ zvr7F6720t_5TBw#@4d2T?}ld}$q%p*s~nSEB05td_FNi)Hx_J>TIi*fyf*SkG{l%- zse(epg-Y%46dMqz;(!os;;Q1j1f)45NyJV`M7~RB$U9FS5jDsxLVOmVn}x8Fwwhv_ znvN^>%QG_(iLe#}vJ$cPl9xP!AW}Vf0zAbYAz2wu@3bQ%+O|?Lc{E2#vBot|h>=Jr z6tqZ;RU=C5h{{nAXDpFN1Vijsk@B;5X`A#9eCqNzMlGA=jav>100nv)whCB~=W06E z{Bm{APg$O=W~*a(ZFJqYF8)d6`;lb-h}?Z_n?vL_Z*!cmY!zA@#cLB=93Jb!Ha)hS z1&-oPf8~|~PgL!zy4QWTd#ixnc&@5z%S9nKS6H%LjL=WYY>rn@j};wRySVPV{}ym8 z!VlUW+8!SJ0sq66A5Q#5XY$Ae`QSxr7~SQaj>2?hV9Q3&p@ZzX=_*-@EsrkE%dYBe z2NJft*|~NdP;)kk^rn(Ia?8FWMS1XF0ho&usi}tiMu-3ub8(XPF{jQn`Eesv$KGg}KnQbRQhkO5R2NBkl#(m6b0HP^ z#-auWC8m_=L9HV2c7TLP&L}%7 zz(d@MgYG5em67e1MCeWGATLTDO`%#{EOc3fsz@u58%OdBQMj*((r6Fn5PgIh7TQd( zm9dEpLFIR->?%AEF%8YBhJLxBpG1(g>DJCv>tVSS(#gksQ@W`w)pSs9I`}|Hz4V%l zU(;)p(2(BMo7#0y-gR(e*YJ}4j(f8Y#qX4C9>m*YzHxK^L3-8zNF6vtp_b=PuBG=w z@sBV5Y4!KNn(RF$H;?|=#f|1uEc?^u9@C^Jd?N!=V~P!vs<>2se@eKe6o}ITNU2*4 zXXXfAX!A1K;UNML!)K1u4hPOmq?fQR;7FK@x&I^ixt zIc{D$ljFX1?8h_!aGi7v6}P%cuR{AM#-y8+qVZ3^im>!8^1cl(Mr|1UlTsR@ot$0P zlt<0lCi(f_r%DO%FuAt3BJK91+;y_MF75WF+|9DPdG#x@yLa86>VHMX&;7~|n`HNy zk9Q9(jcVy{%I_qNMH{Hw!Nh%NHch{Ys9MNbBwog)x17I_d5Z)a$020sZ>F4{7> z7eGn_-eG*2%Rs1W2E^0TaL7ppWwmlXom`O#+#4Vm=9in zP8*s(XLv8BsE|$xxmUz^0P>m3qC{vPf}emYhzi87hU0;W@O0p^7*GK)Li01z0#Zby zxQPg>x6nvE3JLo`*X5%D#ytdXPQ|aX&Q8uif^!qdDj=#qR*ihQXhm>sG`q{RsRF(gLMs)9kg>5^{Mg> zxx8a-U#e?V#;<&Isqhm|+44}z*CG2lHawlAHv;=d648g(9vRA|%Gy+Ak6hWac4eb- zaLK*NyWex)hDfGH=4)1lAM>4?b%9jffLu55pir(GN%Ga{qPk?`9=T}G`lZysm^?7{ zsAz1{SG!r$L&)s3j9<;^B@d*q?R$ljS9`6Z@hl?=hu%uUNho&Fm9QmTJGYia 
z3KSrhSz7=Zi-kM(gi?#L+P7RYP+_LOMxQt{Qy)f@6kUh-+#pISGiCJ<_CjHz=tIK{ zWzmiP8qAuW3#3VHCai}NZoLPK#c>i1F$XflDR4EMkye5lf4-KB6D5;Go{$zO&%I!$ zan}NQnel3#pLW@`7L(k@NI)?fvl1a27gw z+%IDt7cJ-=;_{uq%~`%{xo5F(cl>tL`bV~#7F8+;d&ms64tqaq?{3ARLT*Amx_a0( zd_@F%IV-)iaPozKcPBMT?=YhkVlVX?HT^y6j|JfU0~xVE{MBCg@!*NI*O!i~1pn&j zV|S-Kc;X)!kMXUl8VmSQh6jo92>K*CbJ8+--+-qSXIKehk8QEJsKlVw9AzO~C;cXQ zX1LBhD~YpVNn}P05-p~bAn%*-6sI~UX%R0$PZq8-V4}`0U6E$yW@C~_r78*k=!8(n zkPO6_E>HR$%2GjDX2D<*8=jTpUm;PtOCI%A;bYQ-YRAM2F>v^>Xo?X%&ro8tW~wqM#|d+K0;~H{EyFT?p5cdOqpG#HH+%<|_@};>)l)-qfe~gED{c0mzM$ z>4T$Gs72;mR$ocA4$7^AkN6=KcatnIM}|n8CIOM{x53TaIQi4&AO3}q8=$a4uBfvkjGsx#0op-t1Ax2(6A-6W76nCV$#UmDYBS?``2_NVzok`yTO`zGf|$*?)kKNDUW=ZD=bfsI^z#}`O`a2M zATWycc7PCm!qAe*2(|U}Ttm{x#P(~&8E4GetCBsb)2(b zdfiNR@$=27#|)h>mR;c%eeo8(jm&J0a=#3v*4x;>i~hy(M0vt{!-5sWswVtw-&7bc!yD^X)b*PXb0=GSe5WyBi&w}194fIsG|00G4(Q7w^}=Q=QTSQ@6rn- ze0u6)b)q^^27(7xz}mSYWmuj|*o;%?U4U3vKZRNQr!TJvUIjN6JEVdkZjuN zv;b<{skej3CfSEdzDWNXB^H`9tm{D%bikw@JTK(c#Hy-92Uy*2Q`VtOR-L^i_TR{d zam@QMv& zL>3oZtWyT8qtb6vmI4?uPNQw5JVQH)OcJkVVuJQNl(UdUsL0zTqIq0mfkg^&+KL00 z0IrCNGdxQqfJ$_bhF5xzJR((sH7c<|@LOtzM+T;3*agEI*bAwRPYEHF>`+|jsu-Px zX_Hc%ovsoj5(#d>nWLb_fxF|l(x98|sPqO=9x4lNR1j0VMmVI(gB7|H-92L_jm)3Bz}k~X}3>xX6U5j;1m>QYsGa#i2DQ?43X zI)y76_j>R4u2#rpolD2k)xlKtOLFx~4{pfSCzej9Ya3Fvd*s?Z>*I3m%S&g|)opA3 zRB%MbuX+UOq(3tR_LE{~0 zO*Ib4jRR?~PObz7L9PT!lI}WCTy=r(4ZlB}svVGP2aTNJg~{_hmzhFS{ETTcvi+Q zaCW))X?4R&Ty5r~>cQosX@677-zob$*Q(bKKkS$Hj>^H~Py8o7wQ)5=TSZ*+;pZh> zRp(kU$#za|w)L#tO1AD_I=9j;yPKFyrbTwQte)6#cP0BqW%noopSa6tXld`Mw724^ z*O&AMH@v&j-p2p!auk<$$=5N$EQcl|>V>fX_SfL$sFTd8BJLUq zU-Dc=6Yyb%60tIUswT_;?j*Qu9fNSO38fiafz~HL;RJaP;pxVQ1bGxYnS1Bj>Vy%$ z(Eq|?W@i3%h7Ou)fC!nCw<}#;pQ`SXtGm)Q4XK)bxrUi<)*O58uz3q1L$f6tmoE2HH0v zHm?~nCiF;ixQ2sXkps$p2}sdmMrfM!&V(TQ7xs==#ypqbyLn5{y6|5Lbx8X=111WAn^&RA3F`{300^akZvEa3wGcF5VgULLjm!-| z)`U|eibB>wG1Y~Unu%$dpxgqLL*^h*TA@awed(@&(1-mU59I2b0%2$kRf^i^C>@U2 z)tR}75Fo-EN=DX!z#MK!N3wN-ATX&3^)3^JMih2VEktqYgX_%FS5XV+XX%n`I0C4l 
zUle{dEQO;n=%%R6sp*Nx9BfMv$Zk^u8@bvzqX(DgSzdAa%G9)&(T|M2@S`fX%5zM9M^*Xwi~g!bl6NPYlJ+10cj?hjvd;teW1Y%tql

gZ72#OR&LZd+E#@;6QQRBA^=Nswj;V;|56nqLX)5qeOZ zQBjzhi_(5&@1-Pbp3XLuQCy5E!_kW=1(6vkOkAEOnkQQ_>A$0%AkQ$DY6y#)E$pKO zQV~*A_KWgJL^w!E%B@;&DGqgn1b{VhzG`i3X0)F%%*i%8rfnTTaaB<7<|=CLy?yuX zRC$kF-t$p;-_r54$N%2qZ!A8oXkK@uc8|!rM}9c)KZgHe_;JNs8{W4*sepM`d#Y)# z+_X2{zAN1jNY@3@jo?oDsWuLmZirxRn?!46cT18LLrGbMb%!JG(6G_d9-htxtq$(Pt)rxdE?~y$RlC7YKi=iLSFL1 zgT!6ly1rj!>3eGHqNok?dxmiXXcu?tt!n8-W-;5yHL8eQe*eWiNG{wyb=0 zo1^DeH&;^&hVD~Wk)v?4*uUk#6HFEihzlshb7ker@w+u!lnxwTShgILVXs?6uQhud zt?9~!EgL-t_OoZMB`^}Uu39=B$1N-$3v>=_*;wH0`E3^1x@0+San!HByv@;bYs|_! z{LDHSPi7p9=a%2=IKWJU@!V=HckJ3Mt=e+nNxjwVg6ZvrX2F!9x71OlSp`4G9F(b3 zMzaM-lW{s=+5Rr#cD4$}7YdxP+;zaNwgBHHfF-b4XlApc+2pF>5~wL0tq*%6n`7Y9qEL+IQ>+vlx#)r;RhuLuMmXO{ zmCmnHq*|t`c>@N)p~`ybE+r@+cI8$Ty1j!kpEpu%HbLO(8W|R8bZAtg=G zLxb>6np|LxED*1FA?BK9(ndD*Q^LZo7feN7-anuzAxMfTDJ@TxHp`{WAYpfaz`{g|~hM<}gYs>}u*&P{_ET5|cinDBM-jz9XQ;>y+cmlv+R%&4~ywGwEMh zKrc?mjmX~YZw#2-AJq1o(TwrAg8MYp1OzeF!JhwEeZM*d^PS+!4_zMxPs5;R>6PU- zSFVuU?9A;mDJaCh^gmHbN|X07d2BzF=wIjw zF}~`)#z*`uWgxC!DK;{YI$nB)xO@iU1maA;0if*#)&7ly3b<1KN3;!ydBHYl>q)g8 zl-mxb+K$L=M?PvBfzWa3#PYdxd3CCM7cSduly^fE(A>G!w%WKf`rhf=rDy_fX2$Bl~*RCfB1+e4|PCXwouQhbIa(Q&2u?x^IQTA z-i#`wzb7xVbrifRwy+?u{UiMYrT!RR{-$^hai-U0b2f#lUb%kbKhds;qlmUE4#{nY zQf)`&wxb`l9ounNw5~Q}?+UV;CjdIc3WH!&f`Vk!c5Fc_)um^zh1pf7z;Nb%|xb$U{)2*qhsno|ws-aw@3=b$u5 zoGs-1Q-Z_i;o(w}pQ~&9ecY|CdRp7Sp6SMx?|tL_Z=@QBG>iN411awT*?ZtY`xEcc zbZL`Z+LKk=V{bR>bF1t}-Lk)Xs~FWntKkhKoBEUZQVZYDNY^o#a<=M^m)@Pw;--`XZq&ATxwlOmGYQobMnqCH$S| zK(yO5VQC{^qp5Q9H>?>G`rD@F^N2N}ua>O%dZhJT-n>kfu|3fgaK|~p_6>x;YrSXv zj>7@9>AY=-yW`LVxFZZqeT6K1vhDzN(ovrhy6HBRVOFA<#dHS@vzQ<&d?IIRsoA~w zZy$vBdlr3ab!%#L8}a$>2Dg(nwMP_TYL8-2q`!~SD$FWk5@(jCe+KOx6&U?K z@+>sv8ZMcbOn*lEmCWIO#ze76{x$jVtLuYH70aONdu3nmV^=R?REz0vSJmSChy_Yn z#^Q;l2OU#EsA{HMo=MH>onl*0}TxMAH8DcfF0PWige~1`~ zyuYW8(y>)Lic??Smhulh_75%b>5_`~W^T`{+)6d}%Z>f(1Id#88$6_#T={-T5q&MG zvi)+|{&&ti^^~P~Uy84lVGc}zCYf*gEB>_yzmz)s>Z8N2Zt$;d7udbd&p5lo`4jT~ zDMob7w~y{VddTvFy_TZ~>_6D&L^ya9--E;*GTd~dFSs&slj6yW=?!0=PzA=s-{RxL 
z`getK`IQcz;sUQI#qkM?n0i0f8K0j8K2+tw1dEgs`i^HWKGp`oJ{pl+tRfZ(l>$EQ z$v&^4)UcSW@4@y$6rg<+gz%IqzbB4X={C!%qiZUbQB~aNmx$t2zhK$_&nee?8J{v{ zl5fS&%H@5wk9{yi{YYuAdIL$kn~;_N0HGTrwa#2A?~Siw*q$0$f0V A=Kufz literal 0 HcmV?d00001 diff --git a/distributed/kv_transfer/kv_connector/v1/__pycache__/shared_storage_connector.cpython-312.pyc b/distributed/kv_transfer/kv_connector/v1/__pycache__/shared_storage_connector.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d61021fee5bacbee7d75c6b8d9f96bbabb9e54eb GIT binary patch literal 18463 zcmb_^dvF^^n%@lGPXc@bB={Ji99T>t4B+z)!GoU2W zpjX+mv*fMsBwa2g`CM0~udcG(*p;I5s8wB65@jp3*;M`rMvw#SoLtAfbyBte!ArUJ zHh(1fzV4X;2!Xb%k}0Xt)7{h0@A3P-?&g1WyB!QX3Db|`znEm0-{6b(Sk%bY36^2* zFg(Mv31*i5va>9_8xn@3an_hL&6-$DHzv$U%d917owX)yvo?~(ChW6z@~vam0l%h% zGwGUjkvwz4ovfR!OL}HKN$;$eq*)U6$%fem61OHAlTEWtByLMICtGG)SVpjjoluWG z;Y<2w{UpzkXic`wwvo6q5lFVrwv)Ik(UI(&?Idw`qASVGax7zDga)De6BrXdvps^* zz?^3Ix_24g^NE3`enNkk*puiNUjCS8A;44*jll?nKzMlJI@MM!GUSKS&vx+ z2X7QCya`fBdEUx1F&A%p*Q67KbI~4jkU`))DE3VHO(7MI^OE9{66s7h8JVAtr!FhD zxD=LPc?89tOop#Sq$`4?*e~I7hs3sZg)m=6C0B;Y zs3gcRSw@J!a+#w=UDr97*(v24v+Ypn&(;w&!PD(WF3 zrJ_-Y@@z#P?OFSrw&E4Vcth3@GidEWYcI+nBlgoS7JDm zoFAT#&+pAd#LGfv@AZYzvBPhU3d4!`rQ!L-%$0O%_$tVsVLmQp#Q3F!jKB}Sc{QAY zaY->j#CTMbN5fY~hGCeZz!Un9To%fCq4`B%hsc{kI0?+NR1cy6xhSM31)*!w&mp+M zJoYl}!=FkUW1qZpZz&&mHfQrc3UD7C`{3BRJs;SY+drK%^?_^crFBQ%zkkKO>1r*wx@A}Q+Q{16+Hl^ruUOYvsM{si?OJzi zOsxGNUpIPhw_G>YDLgu}@+idBfa!r`|TA_IO_$Q?gknm->?=lC0T4Bq8HGu_lI9g-Je3jmLJdD<#1Q<5 zLl_`!k)R7A(lP|^F(nJ*ZN8aZ&E`#k%?96{q1!`wQ|D$~!ynBQJ&m`fRu}Uo|E9UV zWN0xvijBUz#@k~hBg8k=xEbP5NwdFX#h8t0^p)%wb1+SQQrd}cF2>RLJ2%9(jEtq> zE3A$73_G<=@Vg)ZpRG#}R0@93(ni7bsV;tI&4Nj=5Fu{miFgN5ZBoUwNspNY8*dTp zycJTYc(?I(5aABc!YI0(O6Pe(oacd6fi`tDigbu76SW^N#3BocOgI)n1q+H;q$*3= zkyFKX=>AN!QsCMlg$i&ovMe(Pztzv2F3ih08C4uH%$&8Hs>QM(NKtHM&7Y`(UK5xs zv&j6EodL1+G!zD@Ge0k+cuEII9YKp?l7vKT$3aNgeJU3S2`SD9&xh5qFYW&3c?(sP zJq`6pNQ5_-l9#c&Z@O3A1zWppYk%ZwEVw#kSI6D>dh5os_ieun=3N)QHX7`dK!PrD 
z7`_vq!e9ggoDA_G1~@7@n*A6%jKKj64q-5f!59W*7C4Hp7)-%`=?w_pV}9M)yD|N+ zbL_@5EBvj|)nwi@Rx&v%C>-KI;VLy-@y&+xZpWAd>D`GjmsY}!Z*`2L?y(1Av~g^5 z8V~*ZA%c9CN6zsvh3U*X!Rq_y#EN;v3&(X5D-f-^{lNb-YvXK*%lu zwF%ys0eK8s@@@+yLqy-?Yfo~TY;hQFCy#wBxNf{@6Y2rKKy@l@*d_o=8 z!gCO#Knn9Md^hC#Va&Vuo*jBy`GpO>=zZ5b+e*gw-M+B#eb7eR3Ui&E4S+%)P+Dr! z?Tq3?m4>rl(r#%N)^WE@alSY>oXd~=$>)Eb1}TDS*^O2dRugN~syI2iKG zp)9U?YS&*DFyH(&4`ZM6R#geFT0c-g1? zt{#U_8pJh+SzKJ=`XvzMW@%ww5K+Mrn+BBXwsk`=r8p}=TB9b-q;N_AfRac@c%?C1 z0cF%RSDFC_g=1nm83wyXjA+Sj9n)Y(6bm(@6jMAEO9##31P-ZcaQ1MR*m~ixgzJd% z^#;|@XW@0Jt2TLtv@y*>A?ZB`{sAD7i?P?OT4jJZ{@!AJbFqC-$;#MUA2TL93a9pg zLSVlf*k1@7mjlOFOb?;p=otQT#g6Vm$3eN{V4>rz+;O%D;OEem(a`C@+_uLS#_qZ4 zTy@@>E;J3vO+ycDdmq{AFeTSCywOw`nUY7QKDSL1`KEYc)a=7qLAy`HMOQ3uf+=`e zBC0ToC|Y}v_z-h$4FNO!$aM!tSO_(aGjhb!?Vl}*{tbCOGanPnyhFoGti=Wr?IcPzO^3P zwt7spSZ$hl{qMmTGH1 zcILUBszi8&)V?QE^%fnOU0Uv(PK8jYuCZ~Y`+-A=@ zcbHdQQM~_`Cg%MwP0XS>=!`@`!<+}REW@F!0>=WExgv09U*U+P<4{786y5ydE-l2s zB;zt^?&@>TJwG5hD5B#)bY2#C{FIgmIrN+fVA~`58H;-o9-g#^!C61H^>r%(cutlVLxftNC<2dDxeRe+Tx zM)fr62B0usT`IEJDAOC^|5l%h1%N0?#`a*@SoYhjTFMY#V%xbue26=f0@4BB4fp+hLr8RCWFBdV@K6|ic0=6t>(^iB;xP^k z^_n#;NXV_=T^-`S{|z;yff&VWajAkBX-_z=lKg6pN=h$?(Xuosce`eXw4qQ>(zXF` z)t#*@Q?s&f>UM=2w05_l0H^v%zpc+WbF?$Bm$?-ui1_J5R1LrVKk$Tpi!vLH-(L7t z9C4U^aFV2=f{0#3u^)n0oi-7=#d#5+Qxx(gKi)&-DZ)p8e zMU}^%!7Bf1WG%5VmFpdqy@#tRB@lAyIB=M*vqIZuVh9uz6%2X-4&8+~z_CcWd!e}Vy z9o1<=d>xCRBS1Wd<*a~>Qy>ti7lU|s&toQs0inhER2u3@&d0$$Na#jVtWYEVH3&TR{ay%s{erPp?DpLr@pgeIR^1`Ks%hb1n!d!7mk(fYz#IW%|s0_<0rsx76 z3A(AR@-jC63I><(8=E%I=&L5XG>U`tg{V`-OyY`Ln~pH;oc>C)ZCaPaC{~OUCLs?} zPNCrm>R@tjTll5E@4_fatZx+*8M1<(X&o&156b?7EAEo1+kTRL#P$Eg^|7mD#FsDI zx;`5EV5nrq7cg?be~hnIdt)AHbSe(=hwy( zx@*AIo)P1R{>j@Hm!Y@dH=^?Jz4;0-+{$GPpn5NauvA_ zRa>Rr!*|LD#FC^urYb+mTRK^*icwc}-;rIoLD;ZTQFxhY)K<4cI*o*Dg2pQAmGYjz zMHfU7>`A?snv$V|il>MXyuF$2s9Ku2P=pe;j97jp%@gfaCE3?dLzd6kuB;;@bqZw7H- zlPOu6+6h|$eWJZWCr&p)YF6?kHuob40D3)cV7zU*C|cQzJ&_V#CQ z-yiwxox5XTi1^y?EZ<&U``$*Y+<)ZWWZpM^-}TFA?!-&7?_AD%uGl;HlcOIWU4QH2 
z;~NKmcJ#BO_ul&K_^rj2GsQ;#ou_Xk-#q$i5IEj)m1IJ9Am&M$~4ZxuW8ECJSV zHe&({q7oOcVSqDE0rL{Xlm=ZKmXUu7i%1w?&lIbQ!L?%-Z>SrP{<7m#- z21qy8HuNy`+(XavIotC&&+|oJu;APG(6>)Rt9d}H-;N`{0vtKA;lDRpIQoKo^o4xj z#ap&wpnvW9Pk!+65Ap+t3j@dGfn)iBS3K*8HBd%FwXJ+gPt`ss~}pL>rZ zn%qwo72TfO9ux^%D02WvGU7NyD9U2V0!dh0#aC_}GvPAsIHTK2{crf3@X-yXXse@S zMV1jgdrBknHkxL5?nU)RM6pBKgYuem@aGVL?G0wGy8Dox)sHer>|*6cM-6Di>!+Z| zz9E9Qs^>Z@8Cd2r1Aaw)lOlrvk33o3)5jZ^05(C7YQQ5+e=M88zh}U5DB(fMnj`-n z2B6Em(?X`g@tP8@>9#V0-)JO-2wR2fnjqFc20ijb^r$hh6|(zD z6s{Vj+~7HCovOG(OHN-xpJG|ZBB#}8f*S+_L4zc^Lz%c=5{r7rRW=1L1zmV;(A04a z3P5eO?6N*bPVrTF($!(AUTe+C@Es-&+uw2Wu3-hxhe$#cBK)G>$@192{y?ka)KcR~ zZ3U)qQ=5Axp6Mojvr|DUI1*y%g%l5`7BkT+#3)fmMZ6Xwx+oOyJZv(|XDH$0*+mgg z6j9H`^AOYaJqg475oV!;rCdYD3!jFeX*dvyv`!|LO20OAp5O`rQPDo=47N0~gTmVR z6BdY8ia-k>7WXmUrh;ck_6)7R`p|Q**ckXy+s_O>wC%<-9qn|h^FhD`F`>(KoXdEfMXzHsUl`P3`<&R27`z^11s*MC^{9KP3|^NiY8;+R$}h3+Kbm7p`E{a}3@W4U2}Lap~XbwEFTq zIL8p#Pid#{V8a|!A~E1Tj`R~~1xXw?Vk&s-OpNAA+%-Xj^0W*=H9Y#nxEed=LwdW? zLi7qXfv-^spc)uxf>Q>E8PPVAz^ss@GJz8EgyYcS42(!|sB{PP1qf&Yl0-sJVBIm| zA44GFTq5aE%JDCdA;We&I>E>=!=&>|!#mK;P9N+-su_UFi9PM=TC0EH>?t0dT$wJo z`eaw%+N*{BQMrHgf$K1w1a$S%wf9Ou(By=4I8Ba7Zv*fsh{;;ppx1Cb1e&0aX5}p# zcoz1bYpyWZwD-ZL1)*FEWb1+-Hf3rZ9+_rtTB3%_;EH*@j_ki$Y8~3WZFt{n0=pUZ zXHB*2>e>q9DkJXC8fx0p*qa(hKj@Yg)Kp@3Qgze+*yL z*qsL3+o~xo*vtO&|Brovnk)PA{?AQ#N}2{3{%D-2cWoOnY)xDz)=4{;!dKv=Gk&!;t@3=PgJ5qz^ z|COGjhGk3TDppX`FJL{58Dw+v)E>3Aa&HcTNTB1XxhA#F0%)8KbPNzG3g8TTbFCr< z{+K}^jmd>L>_{Ul&}|3Z!=UbpgJPF5!hD68CT`8xLMn>lTiey1dEo*F=ZWL7MKD1Z zIdYXmm%P>TsVsOW5>k2yU|Ry*p6C&R*7TZ*zZ8zC1LDITQYE6A%X)V!`t>A4t8SMY ztiDS^-79)Ij>=E28oW{X0^ow!Yv>WHIvr0y%!Qd?GudWQ8nhbWygZ-@&?l1kPv9%X z3HL8zLIiGDzzqyXoJX$?%$11OF5UutLlh7_KT29bOauEm)xe0NAf>t73^^-64#>g= zivTc&?8uO<6LQ7F0}VnVDM4@(f+M8~@lLrCijWjH_L-j5$GaqMSe**M#o>qMXhMjH z!3L@w{}hu1qGH_NrQE{Tps!jj}Vr-0DTu_{q{bl3gKgAk+S|5q4U|tjifO25n8z}gWJoFs_gVo!Khvm-U z4OZ^lSM2I7bPda0!^Mss$SgUT`tHY!rM>|U|JHMBBeJ*ewg-*{``Zh?{jzWW#+iGI zpZm@foBV~QKDnu{*t4hD6D;)XmwWbaOvpWlioFAc-u-g#{$h7OWI~0_{a-Vd=9W@F 
z<3CzDz`(Ftj>#>@R-BJod)AH?dJoCHhyHu(=<1o3iIuCxT|*nr!c%AEr_Sbfoz1y= zf7Q5qJ$7%paO{E%|3VjFhiS8CZ=na;nz-LB_e|$pUB!;Qg^o$NWAgqXx#QWDnIdfc zv>%k)4{k){_QNaB!x=(fpwJSMTSDts<(5M*qk+&yryLkxd5+BOUfI8QJ-MFE`H!uf z0TA8PoogJ*xrQJt`uo=+vVU;p%%;m*aJ9>>_F`Kv{zK=%9yze5*xFefIb0Z-elRkf zZ#yZEOs`Me9r@_khsSaw({kHMm;!eP%qi@C)iLh&yKl)ZZuN!I7}IvRbd2$K{dMme zU)XhA-gP|RGyZ@8ae(7Ro2y{sWE)qo?UHS~9@+-sD#MEN55EJaK}0@~5AcA#1zcDCXvj>{B2cP^;STuBP=U|j z@XXHm3^a&-u}jw}hHFfaUD`c`_7t8!A)16(b|A>F#0lIci7KKFfe8I`=qo)$j}lS5 zjlrKm07r2rFixQM8pI@=1z=#pwc>m5vLl{;LO`nR{SX=TEPM%g+Ci+aJ+f!dBbWD< zuTUS9>w^#Khu6>LT~8HzcLPm7=slEojRIxgjjX(M^R*wpmIFF>jc%EZc#pdBh4UenuYhyqXHwaO?M{Oc2P4=B|U%7iDRwG`Sx zkUv2qREfTiB$9-RL6$q1u6>VOo}8D+10<6DGZFt1Ea7(-Q@u58#d5e+6v7F z#kY^}XcxcsQr3RoGA z@73CJDLeg$6;yHhxF)HYc;lxgja+M8mDKxX$>_vZ%SNl&- zzzgfpAO=9CGOktFy-VhWOl{yAX*NRS;q;q$cL1Ob+(xu! z(xiUHG?$L26pN6Grg^xki5-C3kBTwLA5`pDgzJ194gh9EoK*2s3|@i&4$Q!1Mm{b| zK^v8%1gsLq`6=c*2|xcJXDNY7HL8!J=G|B zk~q0-M0o)mTsk={xk43(vcS)UOI$%x{}x^~2~4(}%G5+?WV`oNj6?CA)$O~N;4-;fh z1>37{?KyE^h^oZ17$6Q3@w^DVt9KP+-5Bh{;06Xi#NaRlaEW?h9{%#c0ZdrI;7>3> z5hjA#X2gGv0YVdsXb83;6N&SoxYg@6Q2{KdaF7)*Kq}cEIzicL8o$JVWc&x1j(>QU zDK#;Nj(laWH=krTo4ZS9jDX(pbe60nYGc}Z)`sQQeP5F@rCk?U^UG{*@6#oQ#D3YR zMshE`yhW2r8H3&IS&x<&_}SpM@Tb(W+dRJ6dvf>+DFy2$LV+?K? zwv?P0gL|}fTW*N`?jp;==aydc33ju-tz^asRM6R7vXZEcad=jwTNhTB?jFc_x@0@I zWhWWWvJUeUTl8@yBZ=&W3%w-r;`uESDfype%}twa!IBvxP^qJ1%Sxhe7;JDc5}Six z%&6P4Vyww+-o4q@c~|-XK7u%8Y4;`Z4f;~IWyjbq#@GIpy~*6Rscul3Azo1$-yDp? 
zz2(H%kPD|DX9pu#td;C)TFEOI8}x>;oxI{_zjtDkZr9<8D&*$7YLlX`Z5zl(;X#BB zZUv*zQE?!<1UXq2Pdk8uGA!+#!0Y{l<3UGMZQRh)q12&{=fFe63!$C(`j1dE@lsTt z_@dAZ_e}01?;%g$U^ZPXck5SY?k>sB;QF(&^H9!ohzuk+DE;Y>zSaC}>& zn>1}&89iG+Dk!4+Lh)z?RAWZO4PYwV&S4Cl()3g>5hEzC=<<$XEQG-j1d8n#z36{j z{BQ6I%g#u+pd};AvRf90_53Yk|2t;z@0ekk8U78^@VAWXe=t42H<(z%R}2JSI~ca_ z3ufvI=HRc5-Y-m(UzqlPVLJMS>F^h(fnVF3Ze7gVJ8xKkM-0y28#oqBB?#cQy}?-` zuczSe+&CW_+^hq3uC^HXRWl*>n2VcPIQiW%{FouJ(nY6%9bKC$G4ONG_!xi4VE!)% CzAVT9 literal 0 HcmV?d00001 diff --git a/distributed/kv_transfer/kv_connector/v1/base.py b/distributed/kv_transfer/kv_connector/v1/base.py new file mode 100644 index 0000000..f85eb41 --- /dev/null +++ b/distributed/kv_transfer/kv_connector/v1/base.py @@ -0,0 +1,546 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +KVConnectorBase_V1 Class for Distributed KV Cache & Hidden State +communication in vLLM v1 + +The class provides the following primitives: + Scheduler-side: runs in the scheduler, binds metadata, which + is used by the worker-side to load/save KV cache. + get_num_new_matched_tokens() - get number of new tokens + that exist in the remote KV cache. Might be called multiple + times for a given request and should be side-effect free. + update_state_after_alloc() - update KVConnector state after + temporary buffer alloc by the CacheManager. + update_connector_output() - update KVConnector state after + output is received from worker-side connectors. + request_finished() - called once when a request is finished, + with the computed kv cache blocks for the request. + Returns whether KV cache should be freed now or if the + connector now assumes responsibility for freeing the + the blocks asynchronously. Also optionally returns KV + transfer params. + take_events() - returns new KV events that were collected + by the connector since the last call. 
+ + Worker-side: runs in each worker, loads/saves KV cache to/from + the Connector based on the metadata. + start_load_kv() - starts loading all KVs (maybe async) + wait_for_layer_load() - blocks until layer i load is done + + save_kv_layer() - starts saving KV for layer i (maybe async) + wait_for_save() - blocks until all saves are done + + get_finished() - called with ids of finished requests, returns + ids of requests that have completed async sending/recving. +""" + +import enum +from abc import ABC, abstractmethod +from collections.abc import Callable, Iterable +from typing import TYPE_CHECKING, Any, Literal, Optional + +import torch + +from vllm.logger import init_logger +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.outputs import KVConnectorOutput + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionMetadata + from vllm.config import VllmConfig + from vllm.distributed.kv_events import KVCacheEvent + from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorPromMetrics, + KVConnectorStats, + PromMetric, + PromMetricT, + ) + from vllm.forward_context import ForwardContext + from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.kv_cache_interface import KVCacheConfig + from vllm.v1.request import Request + +# s_tensor_list, d_tensor_list, s_indices, d_indices, direction +CopyBlocksOp = Callable[ + [ + dict[str, torch.Tensor], + dict[str, torch.Tensor], + list[int], + list[int], + Literal["h2d", "d2h"], + ], + None, +] + +logger = init_logger(__name__) + + +class SupportsHMA(ABC): + """ + The class that indicates the corresponding connector supports hybrid memory + allocator (HMA). + This is required to use the connector together with hybrid memory allocator. 
+ """ + + @abstractmethod + def request_finished_all_groups( + self, + request: "Request", + block_ids: tuple[list[int], ...], + ) -> tuple[bool, dict[str, Any] | None]: + """ + Called exactly once when a request has finished for all kv cache groups, + before its blocks are freed for each group. + + NOTE(Kuntai): This function is only supported by connectors that support HMA. + + The connector may assumes responsibility for freeing the blocks + asynchronously by returning True. + + Returns: + True if the request is being saved/sent asynchronously and blocks + should not be freed until the request_id is returned from + get_finished(). + Optional KVTransferParams to be included in the request outputs + returned by the engine. + """ + raise NotImplementedError + + +def supports_hma(connector: Any) -> bool: + if isinstance(connector, type): + return issubclass(connector, SupportsHMA) + else: + return isinstance(connector, SupportsHMA) + + +class KVConnectorRole(enum.Enum): + # Connector running in the scheduler process + SCHEDULER = 0 + + # Connector running in the worker process + WORKER = 1 + + +class KVConnectorHandshakeMetadata(ABC): # noqa: B024 + """ + Metadata used for out of band connector handshake between + P/D workers. This needs to serializeable. + """ + + pass + + +class KVConnectorMetadata(ABC): # noqa: B024 + """ + Abstract Metadata used to communicate between the + Scheduler KVConnector and Worker KVConnector. + """ + + pass + + +class KVConnectorBase_V1(ABC): + def __init__( + self, + vllm_config: "VllmConfig", + role: KVConnectorRole, + kv_cache_config: Optional["KVCacheConfig"] = None, + ): + logger.warning( + "Initializing KVConnectorBase_V1. This API is experimental and " + "subject to change in the future as we iterate the design." 
+ ) + self._connector_metadata: KVConnectorMetadata | None = None + self._vllm_config = vllm_config + if vllm_config.kv_transfer_config is not None: + self._kv_transfer_config = vllm_config.kv_transfer_config + else: + raise ValueError("kv_transfer_config must be set for KVConnectorBase_V1") + self._kv_cache_config = kv_cache_config + if self._kv_cache_config is None: + logger.warning( + "KVConnectorBase_V1 initialized without kv_cache_config. " + "This is deprecated - please update your connector to accept " + "kv_cache_config as the third constructor argument and pass it " + "to super().__init__()." + ) + self._role = role + + @property + def role(self) -> KVConnectorRole: + return self._role + + # ============================== + # Worker-side methods + # ============================== + + def bind_connector_metadata(self, connector_metadata: KVConnectorMetadata) -> None: + """Set the connector metadata from the scheduler. + + This function should be called by the model runner every time + before the model execution. The metadata will be used for runtime + KV cache loading and saving. + + Args: + connector_metadata (dict): the connector metadata. + """ + self._connector_metadata = connector_metadata + + def clear_connector_metadata(self) -> None: + """Clear the connector metadata. + + This function should be called by the model runner every time + after the model execution. + """ + self._connector_metadata = None + + def _get_connector_metadata(self) -> KVConnectorMetadata: + """Get the connector metadata. + + This function should only be called inside the connector. + + Returns: + ConnectorMetadata: the connector metadata. + """ + # Should only be called while set to valid metadata. + assert self._connector_metadata is not None + return self._connector_metadata + + def has_connector_metadata(self) -> bool: + """Check whether the connector metadata is currently set. + + Returns: + bool: True if connector metadata exists, False otherwise. 
+ """ + return self._connector_metadata is not None + + def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): + """ + Initialize with the KV caches. Useful for pre-registering the + KV Caches in the KVConnector (e.g. for NIXL). + + Args: + kv_caches: dictionary of layer names, kv cache + """ + return + + def set_host_xfer_buffer_ops(self, copy_operation: CopyBlocksOp): + """ + Set the xPU-specific ops for copying KV between host and device. + Needed when host buffer is used for kv transfer (e.g., in NixlConnector) + """ + return + + @abstractmethod + def start_load_kv(self, forward_context: "ForwardContext", **kwargs: Any) -> None: + """ + Start loading the KV cache from the connector to vLLM's paged + KV buffer. This is called from the forward context before the + forward pass to enable async loading during model execution. + + Args: + forward_context (ForwardContext): the forward context. + **kwargs: additional arguments for the load operation + + Note: + The number of elements in kv_caches and layer_names should be + the same. + + """ + pass + + @abstractmethod + def wait_for_layer_load(self, layer_name: str) -> None: + """ + Block until the KV for a specific layer is loaded into vLLM's + paged buffer. This is called from within attention layer to ensure + async copying from start_load_kv is complete. + + This interface will be useful for layer-by-layer pipelining. + + Args: + layer_name: the name of that layer + """ + pass + + @abstractmethod + def save_kv_layer( + self, + layer_name: str, + kv_layer: torch.Tensor, + attn_metadata: "AttentionMetadata", + **kwargs: Any, + ) -> None: + """ + Start saving a layer of KV cache from vLLM's paged buffer + to the connector. This is called from within attention layer to + enable async copying during execution. + + Args: + layer_name (str): the name of the layer. + kv_layer (torch.Tensor): the paged KV buffer of the current + layer in vLLM. + attn_metadata (AttentionMetadata): the attention metadata. 
+ **kwargs: additional arguments for the save operation. + """ + pass + + @abstractmethod + def wait_for_save(self): + """ + Block until all the save operations is done. This is called + as the forward context exits to ensure that the async saving + from save_kv_layer is complete before finishing the forward. + + This prevents overwrites of paged KV buffer before saving done. + """ + pass + + def get_finished( + self, finished_req_ids: set[str] + ) -> tuple[set[str] | None, set[str] | None]: + """ + Notifies worker-side connector ids of requests that have + finished generating tokens on the worker. + The scheduler process (via the Executors) will use this output + to track which workers are done. + + Returns: + ids of requests that have finished asynchronous transfer + (requests that previously returned True from request_finished()), + tuple of (sending/saving ids, recving/loading ids). + The finished saves/sends req ids must belong to a set provided in a + call to this method (this call or a prior one). + """ + return None, None + + def get_block_ids_with_load_errors(self) -> set[int]: + """ + Get the set of block IDs that failed to load. + + Returns: + Set of block IDs that encountered load errors. + Empty set if no load errors occurred. + + Notes: + - Applies to both sync- and async-loading requests. + - Async loading: failed blocks may be reported in any forward pass + up to and including the pass where the request ID is returned by + `get_finished()`. Even if failures occur, the request must still + be reported via `get_finished()`, and the failed block IDs must + appear here no later than that same pass. + - Sync loading: failed blocks should be reported in the forward + pass in which they are detected. + """ + return set() + + def shutdown(self): + """ + Shutdown the connector. This is called when the worker process + is shutting down to ensure that all the async operations are + completed and the connector is cleaned up properly. 
+ """ + return None + + def get_kv_connector_stats(self) -> Optional["KVConnectorStats"]: + """ + Get the KV connector stats collected during the last interval. + """ + return None + + def get_handshake_metadata(self) -> KVConnectorHandshakeMetadata | None: + """ + Get the KVConnector handshake metadata for this connector. + This metadata is used for out-of-band connector handshake + between P/D workers. + + Returns: + KVConnectorHandshakeMetadata: the handshake metadata. + None if no handshake metadata is available. + """ + return None + + # ============================== + # Scheduler-side methods + # ============================== + + @abstractmethod + def get_num_new_matched_tokens( + self, + request: "Request", + num_computed_tokens: int, + ) -> tuple[int | None, bool]: + """ + Get number of new tokens that can be loaded from the + external KV cache beyond the num_computed_tokens. + + Args: + request (Request): the request object. + num_computed_tokens (int): the number of locally + computed tokens for this request + + Returns: + A tuple with the following elements: + - An optional number of tokens that can be loaded from the + external KV cache beyond what is already computed. + If None, it means that the connector needs more time to + determine the number of matched tokens, and the scheduler + should query for this request again later. + - `True` if external KV cache tokens will be loaded + asynchronously (between scheduler steps). Must be + 'False' if the first element is 0. + + Notes: + The connector should only consider the largest prefix of prompt- + tokens for which KV cache is actually available at the time of the + call. If the cache cannot be loaded for some tokens (e.g., due to + connectivity issues or eviction), those tokens must not be taken + into account. 
+ """ + pass + + @abstractmethod + def update_state_after_alloc( + self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int + ): + """ + Update KVConnector state after block allocation. + + If get_num_new_matched_tokens previously returned True for a + request, this function may be called twice for that same request - + first when blocks are allocated for the connector tokens to be + asynchronously loaded into, and second when any additional blocks + are allocated, after the load/transfer is complete. + + Args: + request (Request): the request object. + blocks (KVCacheBlocks): the blocks allocated for the request. + num_external_tokens (int): the number of tokens that will be + loaded from the external KV cache. + """ + pass + + @abstractmethod + def build_connector_meta( + self, scheduler_output: SchedulerOutput + ) -> KVConnectorMetadata: + """ + Build the connector metadata for this step. + + This function should NOT modify fields in the scheduler_output. + Also, calling this function will reset the state of the connector. + + Args: + scheduler_output (SchedulerOutput): the scheduler output object. + """ + pass + + def update_connector_output(self, connector_output: KVConnectorOutput): + """ + Update KVConnector state from worker-side connectors output. + + Args: + connector_output (KVConnectorOutput): the worker-side + connectors output. + """ + return + + def request_finished( + self, + request: "Request", + block_ids: list[int], + ) -> tuple[bool, dict[str, Any] | None]: + """ + Called exactly once when a request has finished, before its blocks are + freed. + + The connector may assumes responsibility for freeing the blocks + asynchronously by returning True. + + Returns: + True if the request is being saved/sent asynchronously and blocks + should not be freed until the request_id is returned from + get_finished(). + Optional KVTransferParams to be included in the request outputs + returned by the engine. 
+ """ + return False, None + + def take_events(self) -> Iterable["KVCacheEvent"]: + """ + Take the KV cache events from the connector. + + Yields: + New KV cache events since the last call. + """ + return () + + @classmethod + def get_required_kvcache_layout(cls, vllm_config: "VllmConfig") -> str | None: + """ + Get the required KV cache layout for this connector. + Args: + vllm_config (VllmConfig): the vllm config. + + Returns: + str: the required KV cache layout. e.g. HND, or NHD. + None if the connector does not require a specific layout. + """ + + if cls is KVConnectorBase_V1: + raise TypeError( + "get_required_kvcache_layout should not be called " + "on the abstract base class" + ) + return None + + def get_finished_count(self) -> int | None: + """ + Get the count of requests expected to complete send/receive operations + via this connector. This method is used to initialize the + KVOutputAggregator, overwriting the default world_size. + + Returns: + int: expected sending or receiving completion count. + """ + + return None + + @classmethod + def build_kv_connector_stats( + cls, data: dict[str, Any] | None = None + ) -> Optional["KVConnectorStats"]: + """ + KVConnectorStats resolution method. This method allows dynamically + registered connectors to return their own KVConnectorStats object, + which can implement custom aggregation logic on the data dict. + """ + return None + + def set_xfer_handshake_metadata( + self, metadata: dict[int, KVConnectorHandshakeMetadata] + ) -> None: + """ + Set the KV connector handshake metadata for this connector. + + Args: + metadata (KVConnectorHandshakeMetadata): the handshake metadata to set. 
+ """ + return None + + @classmethod + def build_prom_metrics( + cls, + vllm_config: "VllmConfig", + metric_types: dict[type["PromMetric"], type["PromMetricT"]], + labelnames: list[str], + per_engine_labelvalues: dict[int, list[str]], + ) -> Optional["KVConnectorPromMetrics"]: + """ + Create a KVConnectorPromMetrics subclass which should register + per-connector Prometheus metrics and implement observe() to + expose connector transfer stats via Prometheus. + """ + return None diff --git a/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py b/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py new file mode 100644 index 0000000..9cd7d93 --- /dev/null +++ b/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py @@ -0,0 +1,419 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +DecodeBenchConnector: A KV Connector for decode instance performance testing. + +This connector emulates a prefill-decode disaggregated setting by filling +the KV cache with dummy values, allowing measurement of decoder performance +under larger input sequence lengths (ISL) in resource-limited environments. 
+ +Usage: + To use this connector for benchmarking, configure it in the kv_transfer_config: + + Example: + vllm serve --kv-transfer-config '{ + "kv_connector": "DecodeBenchConnector", + "kv_role": "kv_both", + "kv_connector_extra_config": { + "fill_mean": 0.015, + "fill_std": 0.0 + } + }' + + Then run your benchmark with desired input/output lengths: + vllm bench serve --base-url http://127.0.0.1:8000 --model \\ + --dataset-name random --random-input-len 40000 \\ + --random-output-len 100 --max-concurrency 10 + + Configuration options (via kv_connector_extra_config): + - fill_mean (float): Mean value for random normal fill (default: 0.015) + - fill_std (float): Standard deviation for random fill (default: 0.0) + Set to 0 for constant values, >0 for random sampling +""" + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Optional + +import torch + +from vllm.distributed.kv_transfer.kv_connector.v1 import ( + KVConnectorBase_V1, + KVConnectorRole, +) +from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata +from vllm.logger import init_logger +from vllm.utils.math_utils import cdiv + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionMetadata + from vllm.config import VllmConfig + from vllm.forward_context import ForwardContext + from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.core.sched.output import SchedulerOutput + from vllm.v1.kv_cache_interface import KVCacheConfig + from vllm.v1.request import Request + +logger = init_logger(__name__) + + +@dataclass +class DecodeBenchConnectorMetadata(KVConnectorMetadata): + """Metadata for DecodeBenchConnector. + + Contains information about which requests need their KV cache filled + with dummy values for benchmarking purposes. 
+ """ + + # request_id -> (block_ids_per_group, num_tokens_to_fill) + # block_ids_per_group is a tuple of lists, one per KV cache group + # For standard attention: single group, e.g., ([1, 2, 3],) + # For MLA: multiple groups, e.g., ([1, 2], [1, 2]) + reqs_to_fill: dict[str, tuple[tuple[list[int], ...], int]] + + +class DecodeBenchConnector(KVConnectorBase_V1): + """ + A KV Connector for decode instance performance testing. + + This connector fills the KV cache with dummy (non-zero) values to + emulate a prefill-decode disaggregated setting, enabling performance + testing of the decoder with larger input sequence lengths. + """ + + def __init__( + self, + vllm_config: "VllmConfig", + role: KVConnectorRole, + kv_cache_config: Optional["KVCacheConfig"] = None, + ): + super().__init__(vllm_config, role, kv_cache_config) + + self.connector_scheduler: DecodeBenchConnectorScheduler | None = None + self.connector_worker: DecodeBenchConnectorWorker | None = None + + if role == KVConnectorRole.SCHEDULER: + self.connector_scheduler = DecodeBenchConnectorScheduler(vllm_config) + elif role == KVConnectorRole.WORKER: + self.connector_worker = DecodeBenchConnectorWorker(vllm_config) + + # ============================== + # Worker-side methods + # ============================== + + def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): + assert self.connector_worker is not None + self.connector_worker.register_kv_caches(kv_caches) + + def start_load_kv(self, forward_context: "ForwardContext", **kwargs: Any) -> None: + assert self.connector_worker is not None + assert isinstance(self._connector_metadata, DecodeBenchConnectorMetadata) + self.connector_worker.start_fill_kv(self._connector_metadata) + + def wait_for_layer_load(self, layer_name: str) -> None: + # All operations are synchronous, so nothing to wait for + pass + + def save_kv_layer( + self, + layer_name: str, + kv_layer: torch.Tensor, + attn_metadata: "AttentionMetadata", + **kwargs: Any, + ) -> None: + # This 
connector doesn't save KV cache (benchmarking only) + pass + + def wait_for_save(self): + # This connector doesn't save KV cache (benchmarking only) + pass + + # ============================== + # Scheduler-side methods + # ============================== + + def get_num_new_matched_tokens( + self, + request: "Request", + num_computed_tokens: int, + ) -> tuple[int | None, bool]: + assert self.connector_scheduler is not None + return self.connector_scheduler.get_num_new_matched_tokens( + request, num_computed_tokens + ) + + def update_state_after_alloc( + self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int + ): + assert self.connector_scheduler is not None + return self.connector_scheduler.update_state_after_alloc( + request, blocks, num_external_tokens + ) + + def build_connector_meta( + self, scheduler_output: "SchedulerOutput" + ) -> KVConnectorMetadata: + assert self.connector_scheduler is not None + return self.connector_scheduler.build_connector_meta(scheduler_output) + + def request_finished( + self, + request: "Request", + block_ids: list[int], + ) -> tuple[bool, dict[str, Any] | None]: + assert self.connector_scheduler is not None + self.connector_scheduler.request_finished(request) + return False, None + + +class DecodeBenchConnectorScheduler: + """Scheduler-side implementation for DecodeBenchConnector.""" + + def __init__(self, vllm_config: "VllmConfig"): + self.vllm_config = vllm_config + self.block_size = vllm_config.cache_config.block_size + + # Track which requests have already been filled + self._filled_requests: set[str] = set() + + # Track pending fills for the current scheduler step + # request_id -> (block_ids_per_group, num_tokens_to_fill) + # Note: _pending_fills doesn't need explicit cleanup - it's cleared + # after build_connector_meta() is called in the same scheduler step + self._pending_fills: dict[str, tuple[tuple[list[int], ...], int]] = {} + + def get_num_new_matched_tokens( + self, + request: "Request", + 
num_computed_tokens: int, + ) -> tuple[int, bool]: + """ + For new requests, return the number of tokens that should be filled + with dummy KV cache values. + + Returns: + (num_tokens_to_fill, is_async) + - num_tokens_to_fill: number of uncomputed tokens minus 1 + (we fill everything except the last token for decode) + - is_async: False (synchronous filling) + """ + req_id = request.request_id + + # Only fill once per request on first scheduling + if req_id in self._filled_requests: + return 0, False + + # Calculate how many tokens we need to fill + # Fill all uncomputed tokens except the last one (which will be decoded) + # This simulates having processed a long prefill + num_uncomputed_tokens = request.num_tokens - num_computed_tokens + num_tokens_to_fill = max(0, num_uncomputed_tokens - 1) + + if num_tokens_to_fill == 0: + return 0, False + + # Return False for synchronous operation - the fill is fast enough + # that async overhead isn't worth it + return num_tokens_to_fill, False + + def update_state_after_alloc( + self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int + ): + """ + Called after blocks are allocated. Store the block IDs so we can + fill them with dummy values. + + Supports both standard attention (single KV cache group) and MLA + (multiple KV cache groups). 
+ """ + req_id = request.request_id + + if num_external_tokens == 0: + return + + # Get the block IDs that were allocated + # block_groups is a tuple of lists, one per KV cache group + # For standard attention: 1 group + # For MLA: multiple groups (one per attention type) + block_groups = blocks.get_block_ids() + + # Calculate how many blocks we need to fill + # num_external_tokens are the tokens we said we'd provide + num_blocks_to_fill = cdiv(num_external_tokens, self.block_size) + + # Extract the first num_blocks_to_fill blocks from each group + # All groups should have the same block IDs for the same request + block_ids_per_group = tuple( + group_blocks[:num_blocks_to_fill] for group_blocks in block_groups + ) + + # Store the blocks to fill for all group. _pending_fills doesn't need cleanup + # as it's cleared after build_connector_meta + self._pending_fills[req_id] = ( + block_ids_per_group, + num_external_tokens, + ) + self._filled_requests.add(req_id) + + logger.debug( + "DecodeBenchConnector: Allocated %d blocks across %d KV cache groups " + "for request %s", + num_blocks_to_fill, + len(block_groups), + req_id, + ) + + def build_connector_meta( + self, scheduler_output: "SchedulerOutput" + ) -> KVConnectorMetadata: + """ + Build metadata containing information about which blocks to fill + with dummy KV values. + """ + meta = DecodeBenchConnectorMetadata(reqs_to_fill=self._pending_fills.copy()) + + # Clear pending fills after building metadata + self._pending_fills.clear() + + return meta + + def request_finished(self, request: "Request"): + """ + Called when a request has finished. Clean up any state. 
+ """ + self._filled_requests.discard(request.request_id) + + +class DecodeBenchConnectorWorker: + """Worker-side implementation for DecodeBenchConnector.""" + + def __init__(self, vllm_config: "VllmConfig"): + self.vllm_config = vllm_config + self.block_size = vllm_config.cache_config.block_size + + # Get fill parameters from extra config + kv_transfer_config = vllm_config.kv_transfer_config + assert kv_transfer_config is not None + self.fill_mean = kv_transfer_config.get_from_extra_config("fill_mean", 0.015) + self.fill_std = kv_transfer_config.get_from_extra_config("fill_std", 0.0) + + # Will be populated via register_kv_caches + self.kv_caches: dict[str, torch.Tensor] | None = None + + # Mapping from KV cache group index to list of layer names in that group + self.group_to_layers: dict[int, list[str]] | None = None + + def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): + """Store references to the KV cache tensors and build group mapping.""" + self.kv_caches = kv_caches + + # For simplicity, assume all layers belong to group 0 (standard attention) + # For MLA models with multiple groups, the metadata will handle the mapping + # We just need to fill the blocks specified in the metadata + self.group_to_layers = {0: list(kv_caches.keys())} + + logger.debug( + "DecodeBenchConnector: Registered %d KV cache layers", + len(kv_caches), + ) + + def start_fill_kv(self, metadata: DecodeBenchConnectorMetadata): + """ + Fill the allocated KV cache blocks with dummy (non-zero) values. + + This simulates having a populated KV cache from a prefill phase, + allowing decode performance testing with larger context sizes. + + Supports both standard attention (single group) and MLA (multiple groups). 
+ """ + if not metadata.reqs_to_fill: + return + + assert self.kv_caches is not None, "KV caches must be registered before filling" + assert self.group_to_layers is not None, "Group mapping must be initialized" + + for req_id, (block_ids_per_group, num_tokens) in metadata.reqs_to_fill.items(): + # Fill blocks for each KV cache group + for group_idx, block_ids in enumerate(block_ids_per_group): + self._fill_blocks(group_idx, block_ids, num_tokens) + + logger.debug( + "DecodeBenchConnector: Filled %d blocks (%d tokens) across %d groups " + "for request %s", + len(block_ids_per_group[0]) if block_ids_per_group else 0, + num_tokens, + len(block_ids_per_group), + req_id, + ) + + def _fill_blocks(self, group_idx: int, block_ids: list[int], num_tokens: int): + """ + Fill specified blocks with dummy non-zero values for a specific KV cache group. + + Args: + group_idx: The KV cache group index to fill + block_ids: List of block IDs to fill in this group + num_tokens: Total number of tokens to fill across these blocks + """ + if not block_ids: + return + + assert self.kv_caches is not None + assert self.group_to_layers is not None + + # Get the layers that belong to this group + layer_names = self.group_to_layers.get(group_idx, []) + + # Fill only the layers in this group + for layer_name in layer_names: + if layer_name not in self.kv_caches: + logger.warning( + "DecodeBenchConnector: Layer %s not found in KV caches", layer_name + ) + continue + + kv_cache = self.kv_caches[layer_name] + + # Convert block_ids to tensor on device + block_ids_tensor = torch.tensor( + block_ids, dtype=torch.long, device=kv_cache.device + ) + + # Filter invalid block IDs + valid_mask = block_ids_tensor < kv_cache.shape[0] + valid_block_ids = block_ids_tensor[valid_mask] + + if len(valid_block_ids) == 0: + continue + + # Create fill values - either constant or random + block_shape = kv_cache.shape[1:] + if self.fill_std > 0: + # Random normal sampling + fill_values = torch.normal( + 
mean=self.fill_mean, + std=self.fill_std, + size=(len(valid_block_ids),) + block_shape, + dtype=kv_cache.dtype, + device=kv_cache.device, + ) + else: + # Constant fill value + fill_values = torch.full( + (len(valid_block_ids),) + block_shape, + self.fill_mean, + dtype=kv_cache.dtype, + device=kv_cache.device, + ) + + # Batch fill operation + kv_cache[valid_block_ids] = fill_values + + logger.debug( + "DecodeBenchConnector: Filled %d blocks in group %d with %s values " + "(mean=%.3f, std=%.3f)", + len(block_ids), + group_idx, + "random" if self.fill_std > 0 else "constant", + self.fill_mean, + self.fill_std, + ) diff --git a/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py new file mode 100644 index 0000000..0c24a53 --- /dev/null +++ b/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py @@ -0,0 +1,216 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import TYPE_CHECKING, Any + +import torch +from lmcache.integration.vllm.vllm_v1_adapter import ( + LMCacheConnectorV1Impl as LMCacheConnectorLatestImpl, +) + +from vllm.config import VllmConfig +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorBase_V1, + KVConnectorMetadata, + KVConnectorRole, +) +from vllm.logger import init_logger +from vllm.v1.core.sched.output import SchedulerOutput + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionMetadata + from vllm.forward_context import ForwardContext + from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.kv_cache_interface import KVCacheConfig + from vllm.v1.request import Request + +logger = init_logger(__name__) + + +class LMCacheConnectorV1(KVConnectorBase_V1): + def __init__( + self, + vllm_config: "VllmConfig", + role: KVConnectorRole, + kv_cache_config: "KVCacheConfig", + ): + super().__init__( + vllm_config=vllm_config, role=role, 
kv_cache_config=kv_cache_config + ) + assert vllm_config.kv_transfer_config is not None + use_native = vllm_config.kv_transfer_config.get_from_extra_config( + "use_native", False + ) + if use_native: + logger.info("Initializing native LMCache connector") + # lazy import + from vllm.distributed.kv_transfer.kv_connector.v1 import lmcache_integration + + _adapter = lmcache_integration.vllm_v1_adapter + + cls = _adapter.LMCacheConnectorV1Impl + else: + logger.info("Initializing latest dev LMCache connector") + cls = LMCacheConnectorLatestImpl + + self._lmcache_engine = cls(vllm_config, role, self) + + # ============================== + # Worker-side methods + # ============================== + def start_load_kv(self, forward_context: "ForwardContext", **kwargs: Any) -> None: + """ + Start loading the KV cache from the connector to vLLM's paged + KV buffer. This is called from the forward context before the + forward pass to enable async loading during model execution. + + Args: + forward_context (ForwardContext): the forward context. + **kwargs: additional arguments for the load operation + + Note: + The number of elements in kv_caches and layer_names should be + the same. + + """ + self._lmcache_engine.start_load_kv(forward_context, **kwargs) + + def wait_for_layer_load(self, layer_name: str) -> None: + """ + Block until the KV for a specific layer is loaded into vLLM's + paged buffer. This is called from within attention layer to ensure + async copying from start_load_kv is complete. + + This interface will be useful for layer-by-layer pipelining. + + Args: + layer_name: the name of that layer + """ + self._lmcache_engine.wait_for_layer_load(layer_name) + + def save_kv_layer( + self, + layer_name: str, + kv_layer: torch.Tensor, + attn_metadata: "AttentionMetadata", + **kwargs: Any, + ) -> None: + """ + Start saving the a layer of KV cache from vLLM's paged buffer + to the connector. 
This is called from within attention layer to + enable async copying during execution. + + Args: + layer_name (str): the name of the layer. + kv_layer (torch.Tensor): the paged KV buffer of the current + layer in vLLM. + attn_metadata (AttentionMetadata): the attention metadata. + **kwargs: additional arguments for the save operation. + """ + self._lmcache_engine.save_kv_layer( + layer_name, kv_layer, attn_metadata, **kwargs + ) + + def wait_for_save(self): + """ + Block until all the save operations is done. This is called + as the forward context exits to ensure that the async saving + from save_kv_layer is complete before finishing the forward. + + This prevents overwrites of paged KV buffer before saving done. + """ + self._lmcache_engine.wait_for_save() + + def get_finished( + self, finished_req_ids: set[str] + ) -> tuple[set[str] | None, set[str] | None]: + """ + Notifies worker-side connector ids of requests that have + finished generating tokens. + + Returns: + ids of requests that have finished asynchronous transfer + (requests that previously returned True from request_finished()), + tuple of (sending/saving ids, recving/loading ids). + The finished saves/sends req ids must belong to a set provided in a + call to this method (this call or a prior one). + """ + return self._lmcache_engine.get_finished(finished_req_ids) + + def get_block_ids_with_load_errors(self) -> set[int]: + """ + Get the set of block IDs that failed to load. + + Returns: + Set of block IDs that encountered load errors. + Empty set if no load errors occurred. 
+ """ + method = getattr(self._lmcache_engine, "get_block_ids_with_load_errors", None) + if callable(method): + return method() + + # Fallback for older versions that don't support this method + return set() + + # ============================== + # Scheduler-side methods + # ============================== + def get_num_new_matched_tokens( + self, + request: "Request", + num_computed_tokens: int, + ) -> tuple[int | None, bool]: + """ + Get number of new tokens that can be loaded from the + external KV cache beyond the num_computed_tokens. + + Args: + request (Request): the request object. + num_computed_tokens (int): the number of locally + computed tokens for this request + + Returns: + the number of tokens that can be loaded from the + external KV cache beyond what is already computed. + """ + return self._lmcache_engine.get_num_new_matched_tokens( + request, num_computed_tokens + ), False + + def update_state_after_alloc( + self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int + ): + """ + Update KVConnector state after block allocation. + """ + self._lmcache_engine.update_state_after_alloc(request, num_external_tokens) + + def build_connector_meta( + self, scheduler_output: SchedulerOutput + ) -> KVConnectorMetadata: + """ + Build the connector metadata for this step. + + This function should NOT modify fields in the scheduler_output. + Also, calling this function will reset the state of the connector. + + Args: + scheduler_output (SchedulerOutput): the scheduler output object. + """ + return self._lmcache_engine.build_connector_meta(scheduler_output) + + def request_finished( + self, + request: "Request", + block_ids: list[int], + ) -> tuple[bool, dict[str, Any] | None]: + """ + Called when a request has finished, before its blocks are freed. + + Returns: + True if the request is being saved/sent asynchronously and blocks + should not be freed until the request_id is returned from + get_finished(). 
+ Optional KVTransferParams to be included in the request outputs + returned by the engine. + """ + return self._lmcache_engine.request_finished(request, block_ids) diff --git a/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py b/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py new file mode 100644 index 0000000..07e05cc --- /dev/null +++ b/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +from . import multi_process_adapter, vllm_v1_adapter +from .multi_process_adapter import ( + LMCacheMPSchedulerAdapter, + LMCacheMPWorkerAdapter, + LoadStoreOp, +) + +__all__ = [ + "vllm_v1_adapter", + "multi_process_adapter", + "LMCacheMPSchedulerAdapter", + "LMCacheMPWorkerAdapter", + "LoadStoreOp", +] diff --git a/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__pycache__/__init__.cpython-312.pyc b/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc78414153f58bcfaa8e348a2ed3412fb6d89ee9 GIT binary patch literal 448 zcmY+Au};G<5Qc3hX;4eW1QOy2G8EGR6(od&SX)5qfRx2@W7Ar-6U%WUIx_PB>}>EV zyg^nbHlRZ#CT!G#xZ%_N_n*#perdOxz=aI&#WMup)r)nYRkAp6$pc`3K?xL0p+YHw zULy%rkOnGDLlvbF1Y^L0N0>I|VG_N$c_2acwwg&Rh&DyW%aTw=wdJPXs-#q;>aGh( zQ1A8!=QNw}!PU@_%_KKxtL~j&>$))0zj~`*P&TxM;g>pzOz568fk))g@Q58QlDJ+O zKP~^=IwpilNyrmexMuLBu~iytp*xnEF}I!DS?~C0+T&P?5!Q1%DRS)h$4r#gh|$b) zhNl&=hUVoBH$KmbJm;BP4Od+(Rpu`vBDZ{Os1P3mNDalR{IT#4Lq(I?8?Shnv zg0-AF4cV$Asm_e(u`{OI&XkHhV|petQ9e3#|J?S6D@p;oQ5vT4OeZt#6cOF=Bwsz} z?gI~jvXV4^xkKK)uXA?qz2}^J&bbHw>UKLRNHOLwqyN)GQNO^7ksNwq{Sr-4w<&=V z=ol5mrzJ+mEkO&7WoyhDXMzkVGcjA-9<-CPEmjd{gDff2F-Oor;GIDyJnb=8oC|Vs zchF7xD`K9wH|UM~g1&fVu#&W~v8s4=usU87tRZzrtTtX3tc%wN>uJhDiEgpsL-;OC 
z!A8L;`o$)(a@5iYb-@KCM&RCJg3W?ktQI_Co!}K)L{{&u(MqGN;Da%(WQYhQ7@rW1)^wLd{#0Q2U{Uyq@|{dr-kGVta5az_uf7aGP1$PGIT) zrrv-FK113Xpsmqt3ry1i|1S~SaDZ0X6Dd&&UyO-Rb_n5AI1&rX@*P^WN99;Fg3Z>x z#B{)-)`ep6NH}sy3?-&guZO~kL^2gliK;7_h^9iZGa?oUGa<5ug$gec#BkRX}cq|Mee0 zcAH9(M$E>swIm62%4RfWsBwfXNyc6nv|+3*O~H7Bq6=V*@dMNiI${~4B9_D$q4LsC@h30fb6;^rd7{Xpp%ffdr}j8RXzO6gOFXL z9x~4F481wD?A!iP&0_T9#GS-S*TI#*q5I6?)!u#C-ae(b?|yIpifcgW?O&L>K0N>2 z18(zjZ@=OiSY`%(ea|%@+X2%*QB{4ljOvJA0fHe^#z`7Ug;GdU>_m7vmJADWq>vy? zvN;GSZsX<_5*4&z%Fr20%7n^q|qMs(qBuz05#`ha~y2wMG^bg2{shhAq&A;PNw&U88bN%2S_pZ&ln|SJ|4Im+tdB zmkzA(hu5tZkNw)woSovTuAN?ou~ZR*Ai?AJu3vx*7{r`shDzxI2$%&ws3eV1<~->P z{Um0Yv1BY~DTxOJFlGF1$qSk4IW~AM)PMZ^@KEUNiQpjeuME@@uxiV)8OJyymnIXJ zLvl1N%J`Y)qST=*FbFO`ND9E9?eH&S;TrXYo$_p62z>O7Z09kh^VnxqEACU*j;>bL z-)_0p@^;&`ldG=UWu{hYhtVo`F_w&64qXb%mqa;Y&K|mulE^CIxZ+jH&;p$XHWMsB z0s1?O-j57B9kW*>T)6t9L2WR!|{oj z*uh6fHAHeKnh23Wd{pLppX)$e3I3|cUya6MWswtP_yYWz-se)6I`~~X6G8{S`voj_ zb#>{ewb%X<31L`5@&NhqEId>%nM3~7x**Yrv=&7K&NfuIh*B2-cEi7nlnmS912VkX zvEW_Wr0~0z&fPt<;u>0JhRFIyV2GJ;wK1(9hU_*aQbAgvf|m1?XcZYe07aWXAHb(o zwCi;X)&;1KS_B4;(+a`%78PWt?STr_bqbjLEC?#`%!D=_hxVaSL6)iju81q}?PR|@ zpisQuJwZ_B*c?}$>j4Z}RWn534p({cm=njR_)de5br8gdeadI_e zAOpQsH+N^N+m-6}74Oz%cB{5bCX2!#Q&4>hm8bvELN+hQ+{T11gK&{|odn&i2;vD# z@GdYy(O8`&;PcF6V1JkT9%G!4-=e31X@=9o=YTvDlktn9#3yxXH`qqLl$?wS{6&!m z2_B9i8zC{5j)ooOk4&ccv1E!*+xT!o;6bMBg4dHBk$SPT8?wSZafd<^NjU|41yVYc zZY2)grDBc5PkB0*el?{aw;kXzIsm@R+o)jiy zSmr_@oV(T|)xmeDwu{p#QC7VnLp6hQSB9ZhIESU@V1)DnWOpcnnQ*HF$~Z-`eNd25 zUVfWetE#&>d27cC)3WNVzS%!Fy}~r)7}~aNzIvU4$GV-dIW-&zUc{{&G+b%m6aN%z#g=0wdZ58~odWXE~t)c~)cv7J3|_Lv$ivgKF%2%OsPaOYe69 z6qg3_hkkcKKL_eL^1lagxj`*=Lfr#nP!;!1bAd{=@ziOegr0usEGVbAtWuw*mZ(ly zl%_-}G%3Z@3Q%oQ;_E5Z32QIL$X=+QvY)~^5))Xj~`Y7s2G74bK z?*qNYV?gDf8G9PeJAo$5UjfW-$usGO5b(u-nKiIXTLVlu29qMk4xVFu(wMx@tYJ%j zvG?)F^;(@jKXQsU&MAGX__$^)@L&r}ibY8DUD7ZFPruCT4bGxb`7P82k~)t}#81ki zK;CD(3-#wB;QXFzz@*OOPCo^(P9z%Sx zzu9L@MAHw*@Vw*<^F$*DZ34wBg6paYjsvkHAsG<`O!dT)$;*=yAzdkz2CxUUbJYP$ 
za2fa{DiD23LKRf%ge*`Y?ZGk%NvR(*6G%FW)nky!NPkEGkTm51M4c{oH*d(Ca6W2d z&LE0u$UvRgM)^AyPu{IqX*@j3T=#&Mdx9RKXPNtK)0)2>n!Ky}qmjn}pbj?mF4g?7 z^^?{WKZ1Bc*4=nx?u~^u#oM#gch9>gM_C-sRqi+_&JO1?)a?}C{{9>9ys^}+wDrw0 z57=fR1XZVDN5CqeepF;Ym90XhM+)`0&Gb} zwAP(%{Jj}QWe)2xDK96(G1`>8W$hR(lakke`Gz%^EB zS#{2{Y+%KDb2_iRjNPXLqfMKh3Hqt;xt^EKJV4h8;P z9cx_G=iH`+jwL4BGobVgtZ;*~^uHZCyyETq(Um3tO(yH(6(7Igf8gu5du5hc_Vzuj z+`7v7ZVb;2KjiAOT&u#huDUC4?w&ohUcpp3XYF9n?dY9lvTT#WHqD>B$8P`iBRAzc zLdz)fzu(ZepCNkdaFL;cv_p=Y&!F%mx|3a~)n#bZ*S=>h67T?7nldyV#c0Tt^P3pZ zGSD)NFps1>7GuPWHI+YntOkVw4I>qshesYR4Y<&l8cw&KBEpE*caTWzJOjR%u16)2 z2hS9!hhPb5+YSzkfQ`_dYJ;OgnwGW$Zq+JJ#)-^=P9m-deIXfX{Z7y*HS0}{F5?f zYbKoF!D*`b`goDJP0`~AUXua9NzPtr2ug-q1mW&5k_$VFN8GV!l#TN&w9_R#J=0q^ zUdsZZd>e+YBg0&yKKJsg^-Z_mxb=qdsPo@W+)Aw0Z~DSv^Eq>r&7s)_0iU!NKqNFQ zr2UWqx1}W2nvP#lD`4KJQcLJnlTKlVJWs3`32k@jEM{NB47Wj>7=E#YR%{dEa>s^ZED0qBU^O^Y1O^>j+ZZ-OH&7|bAbu7Xtcq~Qk4tz5H2#fkC zDCPV%Tl+#}oq|WsS7CE5h&c)#%RPNx;8U9wPACHZW-^ZA^#)Mtem{!}>?yFQM2B{L z0k)9?Y$2!Uf@=*cSVx?|iSBVcdrQ(d`Xo!zsJ+PA^8LtXd9 z1h|@ki=`R^4tAvx+Wi7tG(l@KSgcJ@ZxNbdHJZq3)Ig~jOVHN>eXY~AfmXH7WaU?h zPOh7>Pu9T8Tc`tYCKO4&4J_3BB}qAHXY4_trAlmCQ!OAhQ%;hasT8E}rh=pbJf_-O zw%1f-iLLB1;EAnla1ljpWi)L_N}Hy^a5Dp*--)Pzw`X!HoQQ~MMUnd%w~X#d!`xDRaGfPZ#i@LYRTklr zY!u8Oa8QF60*A|L^@Jo&MU#_qY&vB0m7E5emDY)$!#;(9MsyL>~vds zf-}vtzX34$Bgnu!t133n!1`)#eShyed*9uUCRziC=v{k=-NsRFZ_Y+J-1k{OD86Q; zxNu_TRAqq3NE2qQMUUql`fVU`4PLS`h)o z8(7@@ard3>74N=Tri9p96t0Ds@!9GvO7)h7k-xw2!G)zg*gUj4eLU>=Y zl&f*BYsKl$IVgvh*!oMG7n{-E_baS_e&0QI8zIO)65ww|fYR$o66h)_m09e_;JJ|# zgRc}&oRr3Ke}Y*8vkYV;XrPdY>dl*b{WLb{C~uG!XW=Z`(Zl$Cd7 zLuoSha`v%tBpOc$o3WL}X)mi;>e7~`uAgA2-_x0i4kvUElqAp-txuAM1u_#4#*$=6 z))~|rAud3m{F>y%d%BVY{Un_10JKk2cdWxw2b7577~oVpn8^_JQ9#ZpB1>pmn#MMA z^5!X-NJ?GHdlUP=g;`!w5wLG=uVmJkEE6^{?NKZMvdjs+LM5zD9SA8Ec zkfR>TS2fG5*3`{9eo|FCdj#E}Lvus(OtxXC(y()B=MN8ka%iQY|2{Xc%GGAMW`%2> zA9?@6I~N{s&w`RzT?cw()wV+KX{p%#`jRc%b6Dv)ywY=Ind{#`T=51DDd7qt-~Svs z$`CwXLb|5irHe~B{ruC>{9}N4`SS-P`_d!g>BxTP3wN77efiUoyKT!oC(Kjg^5Uj< 
z68G3n?WFJ-h9m~8fq|-p$P`~SpteP{;b%ZcF7z81q%czu2x7LueReth_(_)raQ+As zbpf>M8HVIGIE!rMF1?Jm$n z=~W@o*UzL(^Q@e<4VNs8aeW6T*C+$tf^iHuDoJ0l0)O@gSOM!-u7LHcRv=}bf|&|q zUF7<&SP=_YF%^dG0p@Xj{ZN-LNrJU#{k{$BhZ+g^Cdt)j8Cz={on$2`!w6KtF7SW_ zE>cY_<@V$0uu$XZkS(Pd){8AWD{8#0Wb#loW|ez&53Fa3xaB&oPg^uk@!=>_sI{btWNB zPC{8O>o=S*%J3iBbrJ7`k(%Lyxn@Cgh3`&p+29x~98`@*-U?*k{$r`8)_1qgdLOp% z@19>~{i{8@vpq+Yo+H_w<4Vu*ANQP8xQ^MAN<+td-GYC$y7BgbTL->-aMjnaR@-#@ zwOg;f9hyD5$~9%VHic_jaNg&3klJR2+q|&vKG*dh)s3I~>TYwlIEb{XeQvF$^?xgv z%Box~)jLGzc&c|#wr5c38O-(!DLq3!?m7Kb?`zF+<{MTysBKlatqbG#x!%vK>uyec zcYnb<2jIe~b+yJ1w_uf3Uw|948g<#_hQUwM54e%sCaQOsUf)JFJv)066jtq?-$1{Z zdcf_@S)ub6epm+5ze(XX882LFf51JD{hi=@aJaKH zb^fGo8?2I-#3OVnjh%|ObE)p`?(F{43jFn+UIn*>?-|ASOxD+-_&OF}dEnc<=JVh7 z-15v%J@9P<4@CWT(8?XvS+-GO8|T}z{udShi!1&^_u0d6MZMhAx$Ny+)^&s5enQ|AqF9B^yuOX^-$cnxO-LOO>`FALQMgqB($^|VHIcQA> zF33>xX2O7`sITzJQRUpWW3+L?jhlA^d5ngHBtIM}^Tu+TL>@+S-qxQ_7|zjR_pM`N z&u#Vj;SGGZL|oY-u_T1ym28sA%9kcnLh@>2L+`&a;8kS*<1kOSGd7&Eo%h+UhitQM zu)Q~wZQidm@4qW#4-P8_hnHVGL&6;X)9IVrcKw+(mb>AZ^Jv8WfNgzzmvM*5F6&PD zvVm3Qb{zJW$?t$sGR~-(4MMca>6uT)ME11Io62G;xF*QJ4}Su5I;}+qjEN+;8GlP7 zB_P-tE{h<_mk(b|LTIxg?V%O#fSiWh##m<%?eHronLuxs!=qx!X!A@1wSGI;K9S2N zI0PDy(}W{$mEngy5K63_NR>i#6n^s~;)RuVHo?yV@DNhH_>O`DswmvxuVF+%v{kXE z?8%2*7)j|Iz=O|#qng^X{w~GemG$pe{QK_?WDlHE4xC%@pP%)vvYa6syA-x-F_rB; zrgR^>&mJc|yu$KXcBjJbTmpgCzvgXS*2zl3=!l{VAJe9}06m2^4eWRst*bm8Kx-L} zE{+0`@E%DDV|Ed;qBX;Shjcy=vSiKjL344Ggb_s1ju|>V%TETKKSh&4QZF+dLqWzk zFNiX<7DO4M*=7n!K&PjS8N;PT8N#pE;3rS|uh*;)`A4TajbN92s0)xW1iO@rbdllW zs0(mwfkC(sK$1k&!P!YKV}_e1BYPpyYvJtZ^i6kcpq-Sxk(ff_+6!PrO>u92+SonC z+qca2{raA_k9@+r4bL%wwvvb!NdUZ(h}kG+X0(gS2+>GN!s{Ce4<{}9r3Xr?4fr^6 zNoDZPL$wX!)suvghSD|6(0wR<2eUcMP~QZINeJSVFi1xWmU$0L$ZuLG4XXVl+DG%2 z9KtH06SOR(OGfqq6y!&+8W=#c&GsURpMhsV_zc#x=ou_&!82IWVrQ_Vh0b8Zy;j>ezvb4~b2hBNd^}Y*cPQ@GTm`Aabx!S#^jtdY zh0x!&M-FV%(UAUj^36Wm$wE9QOYlJbaXk224ih-1;n`>FTIIJa^nbwStWaKacxL6B zyXIb>Z->}pE4DB0{=mEJ?N%H;k8ChPM**eW>vTSX=#i(w)|RVr+S=9}-0a!8hIJd1 
za}7mtMe7(>l&i8q6x3=nzfk+mP|k{#HHUlI)22B1FKk%V$N?!K@T@cr57iq<#$uY( zm%G9jBNCpos>}R!F#Mnt8BcY9nIol=$(Ssmq=R3UPEUXeB2_>|`a8^!`c=37`@F8n zR1}8d#ioQl8MWQ$nCk7)!qK(ghjv9K;XsR%UuJ5*UDZOskYiN;MnLf^T8IU@5mbJo zQD9o8k}yAVgH3*asj+Phe%0S%M%b2IVH1Aaf^CF%16V>oy(XIo!!j3P`l{8DilPpP_JP~1-_=g+7YzqBy4dH)%8>gSG{n_pXTv|h94>=t^Eo);E& zEmkZXP+Bpx6dIOZT_?|+lclRxy_<4Yc&=5or?41-` za(^yWJu|yYigFMjBXOp?`c+-kUG-J%KQ=W*2t2CqBc)j+A4}^)l zTkiZ&nCO90x4eV;8LiPXmmu^a>^8tql%`O8C9Q}q717Sb(yNQ z*2EjCl%>!311yzuw92VS>HiOe*%0u3Kmf3h%3E|ZjJ;iZvjn!O<^ zInkV^;*>H=HPJAcqD_f>DN!**;8lSN)lAXU#onBx8nic^@>_mhw1np@kEUCKW_k5o z4))_$^*NLVi@}lxvLXIDKWOF)g9W89U`lL?ngchBFTQm2#*1`NRjv;f=FMqc8^rAo z$_g@Yy=YQ-@WyP$WRhm&D8n*{L!((!XM?jtgK9pD`T*gWbc#tPsA;fhDyoq#%v&MF z$Z+o1c36HV@{xqs5_uF&d~)p8v5!uayN;Km$IHIskAY6I_;Me(2(~B>f>3W0)1{N8 zUzsRo{ya~F#+C=B-)%8*Jbz|FZmIs2J+k*pXBPK5vM$p#7-XiY-q$V3KDQ^V+3Zpm zi54&u^_uKA18!}m)h*S(vjRKzzpN8zSm(Pr3f6nVQQKBu#Vxr@6YzNBpiK|-a1OoHpqm*~|E9;~4veIL|LQZ>wSBo^j>}{p8G^Gs z=Ml(N(0rrrFO!?zt7J|{g{32~{Hql130x+}UWoP`F4`wl%rQOBo2fRdFkQ>zwVIWf zB3)N$M1*G&2A%@@lNot^{Os7PnX_XTMkmwan0-aJ8itot(@oK!WW>bSC(l`Dyrs2#?lc!=n9M$Ya^j-PBSXW1BR5NqNK^9k}s$fnKncNtb|HU zSfL=9CS_VbpMq*eksA;c!p68uEijsua7x@5 zm(q0#g39Z&R5Wy5+atcAQptb{O#94bVLjw9%{|0s#VusV^=F*~MWqhUX_#Rid4;)9 z+#8t%g?!hcG13d~T-A$WRsz%QoV|@=&*4g-NWC~U%}>BqhiJIA!-jLWu=@h&3ps=B zqqTK76Zw<}j8rp=!p*W`uBTG<>|}J7$%2e))36lwgg@iw@LD2Iwv$jtIoSQLO`Q+o-Q{@igSMT2boxhQpN*A!UirFb zwQKBt+iMS!J8qAEI$rg7JNG^D`nPR=5+jF(D#_k&4jwKidmpv+{gvmhBYzR8v<=s~ z3Z1VBPrP2}8zS3w+QWs;b2hCSB7Mi!dPgd~BdfimmE`_!_U`|agP$KP51g*-9a%Y# zvz-6*d|7qGaGq z`*z-s?u7TRjx~^eZJ=NNy^(!qeBQtF`OmZlEy37;C4A7%v4jW(aDa|cC17^zrc(AS z03A?-ci||%#GuJp%1lubr|C^`S~8|xC;`V>5{F(GxUQHYch{8J_x@XSU&?QOLkPeI zFEjyYI0z&gG}zi}f!Q)RTgKGU*$=(I($M7sc0=`o%0eU-Uw(gO??=OT0+s0QvTrv- z4`PK){RY)CifmZne4bB24J!u4f^H~i91VspI*XfiA3RHZ^M4P8X8|b?gCmqhuUQ*i z5=tJ|lbj*fULoW?IERbAMSsaZgF@Z(0JY|^>rIQn#Xu=g^4@rl5GH)Q-weBLKn*TH zrBp)@iy;#)Ab(0Vlu#njGqjU@eA)*p<3La0Dta9hG{+PZ%z-NzP{=?5QL32JnV1L1 
zpyYK~QbkU*p{pSl(UkHjXfCH@74%w$s=z48FqA3o#td7h46j})VEGJXfm+@LrC~lGA$%cju|izk00XUP0~eMba4|F43)sa=HQ7MH zpv&ZsbJwLpL7mUkw2g|5(NSm`+U#rd`Vla>CH%d^Y=^=M+{)b9?j$C3}4TUHL->G?E$ zch73`{$=lj?&t1xtacAChk@*^gu1FxviD>)Mq=B_zNCF7V<26sIyQ_URNVUNY0&bl zt9MgieSkpf=Z*DjE32`?zw`p}QLJMvCRSo%HR$sPxVW;7oT9q<&HN9zd$$p}c~p`Y7BPGl zAZ&(kdeIMZsofVB<-vU-9~^*v*Dx*2KGKUJ^d=l{_DK zO5UZY+2Xd5%uRKwegQ5dxq z_6^l6Qmho1*|BL-s4h0a$X!p(4+h59yK|}OR-Xvd1SvP$bnK*sks!-`4}dQV zH6=g>k`0t*FI>2|5kYMUZs|=3Ewg}QvwA)cAU5w#!r=h|XFx4V766lEigS`7%Eba$ zJOqrK5h=@b!>}zTU}6r5zy$KPQ*6o@<|Q;u7}h!$UhI1tDmLax4U12+D(cr~0OunA zrUbbKfL`2S*x1^(bkBMhBqpgURr@)h<f<1}#z7-w{(lvK?PCRNn*+-?UIV~V9SC(XW5QD; zkTJbEUsUp!6+*0?UUIOto3f4j@n4Vq>G&xE&kQb_U00n3vgEXyhQB`x`61N%2;! zad|89HfG318Pfqg*e!hDmY;vPTe6x_I$ANGQKfk}%~m8kUDR%5IABS*Z6LjAh{X)6 z2`N~#JNWsLw<2>oQyr2Ia#S}`tyZ+*gm7*dqEvPfU${rU7b{@z8X&2S1aebWtiCCz zg`FKiF$7%LjC0Q$t})Fn!F*N}>kgwQB%7U61jnr zJBVdr5F!y{H)wH*{AWM$KmT*z%ilDmR+|njoqZH+EeCh6C%e~@{gq_@a(KOc?^^re zO8epESdDrs{3zO9j&|RDxx8=qE46&`(u3Bnsz+$KB0Tnb+nTGy+tdsJ(6&7|z6VE& z%i&)HNTTnH@LJ!AO5cgE=KuBwfAfRY`0GpOo&-pI$DI`5t)*A10n)N#Ezwtjo?WYn z7na7>TiVxJ_EuWvE1_;)=&Xb~*F(+gp#=XQe;f}b0!!zr zNfJx0yj|uomJRQM%uek>P>NuKAr`R$plKyHpT$eyKD998BnLAaV=US8B?29I*x@?N zlf5541H7H7Wp;nx4mms!mekUa5R~8@gr|Y#b!q5eH>~b!Sbe-Cup(_@;;-sVOG=v*D$SD*VH73PA+l51a zFuo$aiLoZQ@yRrpR?Sp$3T2b%p&jQliKE?kbTWmu*J(MNa+6muMg;~M*TWDAS3?8b zNOxy4CLJs?W&->8a{(mt=5);FgJqwG?A-!nzW9cRc~Yn$^uxKuI=4oRTksqT1Aoe3 zqXiuR0_>}{U@b@KNCo{aVi-(LZq|kkXCNKmvNqZG?zMjL-D@eA?Trx1owCo-8(^m^ zkc~3*B7+#z7PuXLqG_|Qo7l(W6jxui%>&>om$;MO&q1DKD+v>%thlVWtVBryu{GI~)k!7BU~G5~8S6r>0Z_I^R~#%2tL zt7j3vfq)^EmoJ(CHEo{oR6>Fhej_PDdL@?wfXGjjBNY&t%NKq*Z#)Hx*6+23jacy>4PiG6tN?aecDB1aZW?~B+IYBl z7yy}I$n6?%+w9R+wBdq|Z-}?i%zh949;2r|PXwOvxz>fhN0fvaplyDD+Gxg|=A7pb zh1-HaKHlvk^W@LH@N0m8za~cNw}><0ZVZ|{>E|pVw;@p(K0ZY?Fc|FQLXQ)kU?#&g zE$QQsUWL^5$#kt>1=cr2c>O#1uO(7$8ml~i?w+vx{wJkdr8{r_=-Ox3?l&I-+J74u zNdIu}eu3v&Qb8*Ss0PD(hVeO@f|un(4e@r%hdSpOnUs&o*n{G$mZ#y<;W@GKL)#YI 
zviD)cU&5d9V|YQ9?kEYxm*?&@-8JtA`ycE*@RO#od%v~z{JDR6{@iNQSk>#<7Xg0@ zxs&7I<6E|^T>XRiA1Cjst9!mzjvpz9jywn_R@(0zg5lx*pF~?8CEC{#dn$=NYl)sp zqGvVHw~mzkmBjwFM5>ZVttJjt{obU5FCt`H*IM#GC3)cP+*|%Q@(L{Qo`9(ieab>DMbDQ2wXQS0Qecc437*$aaw+kQ`u`sK!48mAT{oy zTg-an0L<}OEiQF-$c4Rok@<{kV&8`116aj0tQEKdKlp${(2;`qH4d>ka0L0F4d4giPYIW=eOyAv z!9o{RdXoJ%3_xsR48gnV6$Am2xkBV$NaEi}@6Sl=AvyYx^gbjfe(Fs;^z}aUz4*{~ z6be5LCs*EC4ewqGtT*pkiUJT!A1;%`O3$66mDYWCL8{ z6srb2!iklUDuK_PGmr68jiiLGySXoJR(f813_0ko>N|o@=)TidCGff1_833a=7g|q zJ)XonRrNyoQM_$s6-DH>DuvH_U=y9P1ldtPlw0D(+%Sd%*~T%oNgL# zVs>w$dAeo1h1q?H*6FtKHfAqPv`^+n?G(9{% z%csKs6XPdMT-+`+qYY~kr>0MjpJslw zi8Iq@$Imi*UE=vUH2%UnCb_T2 zUyNJL+-Z(){07H2y~uB*5?%89{Bb?=lgknPrT-a zncs_azQFJM1~>jia^d&$eaI(tqnPi9PgF@`0BH=y_q>BKPCu=#Ut;ZW0KNy+9L8gJ zO_8BrP_VIxDepKPpNR{x)T|IO=N)`36`M%JlF7XF#$-IfM~d?9tDn7mGJ5>X$>ZnG zj-AdIjn3T7+b_+fCTC}22_#^hPfaG0u#{c6csw?7D}Hk3=HyKLVmuY2)ZpQqoS95T z6SFsO#s$235$;MVmP%fng*);!<;$aq=?RJ(ow<{GEgGAdnMHs&O61;_Xg;{@_1OGm zf@f)#!u?nxK7%*a(VP!#3w(Tb=EmgBd|mWTEHR0a#gPpkO~zBv3FbrY^Kn$T{@R5L z7mv-~xDgjF#O}t0S0|J4)0baD{F(R!%A2pE0GDGoH{%*%-qLX|wABew}5>N^9 zzY&k6=7o4NU!(T&O|<%~5S179losJMUutbJ3f^?ED<6>A4b{o=Z&5+`Non(=;G06a3^IYC;Nm`%Y9IVQANh zTTwoBcPtvqfzuo65~4>#cSzk3Z!X7Gi^__6O5=k3wxOl&$Hjpj?D)bwZ_*j>@+EAz30;!`v^ zlgfLe>ZgjyKy52i`d)2Zd@Ays3S;&60W@qc4X7*WF+w|yhLcWHZS zan|;Ciz1f1{YvakoV9o$d3$m$nuy1Ej6=1_S=%NNWK0;K91oMh@}}|$qpEQhmZTR3 zD{WP6)>2bYP6}pR&dGoer? 
zh{tgj{HKrnsRgcYg6>>2rMW5c+41}8EBr1#cQ|3fIL?$heH5g{n^fv((VRBlGu`D< zR@FOgPN`Hfr&0+?^EvKornGs=p@!0*Ds^#!`$CxGK93$-ELyari>91<7}ds`)qInt z*DKPdd*(^1t0iqpv*Gx9qlHVGu5gi}vAnKOPltPWN#GhjTw7S&J|w0BsGtS z_;ZbH)^)k5hT-6w!Y-DbmBs?f6{hs!)ab~!XXp>pIIBsP$tvXfc?`*>0t>ZQ}F%+|D!iH(W z!P|)5#nfR}+)9v25v8K}uxW(1jXUGTe91N+rxM2%cWa?t%*{RS;oaljxDP3N$OZJh zpZCTCypJ#C{c$TFz<*EN#|L@thKn!zhD9Bf>T^8Em#a16L-8`rULjNNN{&h$sTN4f zcB7cD1YKQz*A}VDdz7{Ls{CmSr%;f5$(%4dJqLO=K9hG85Ik&F13AM-uVZ?O&PdWS?a)Xlp_?*r;eBp4IZ4Y~QW%HRsl0`9~;Bg!UFTr8~u&3D3O7!%r;j~5sIvzf?)@AysbCbwu!X){&Y zG)43AjvsHLUQE#(#(vtIQEeQ43T?adr}omKb*9!hmYmDS$4%MQbdBa7q)WBi@xzRy z)M>F8A=EFVAe2y^Dpg;NL#T7wqHV^^n-^`VfF7PNde@@nz~)B#q64V})e!pg9-CuR zA=SmWyeVpI_A^dJEsL=)kJ-M8`?yw2Thm49;

Oz^rV0*ZK~_QaBECa&Zj&BX^up z>EYC^cvu-`VP&lg-@vC3j)gG-Rj;0s5;l{<7{xq2q`i^r_K_QIH{j1ALo6f3Cmd(I zWLO^RJ=+p181~`z9b~nfNI|i}#ER~SPb9|9k0jg<+EOV4zM|`KAv}8{EC)luVyW<} zAX2uKgDQ4C9&S@UGrp}WJUKIwm?sjD)#Xd{IjX)&mR}0D-vJZZmV7QL%pI{rm{I59 zTa&3s7+flA;GKn_Op^-p!f&sd*3QIysI$%|S1l&QpDT{cCZI-PrAxNwq)kiVb|U@J zaV+5?*U6xT zP2kChlW~KLn`F=w8F35O@W?x~$v1CFfUN)%I183wjQon`X6NLTi6RhaH5VGl!>3H? z+H5Y+DxWXH6pympqHvRB5P>F7qm*ynKRl&9XJ!ypFPny>oFNya0 zCCBpcrngG+c8cE4jCU|&AB4H#t<44Mq(F-pXxRu=tsIi72gT~aZ0JC)yg@48CzkKa z>_407JDVv#CzW3k%P(ciU&w{`SiJDkOLGLFZTrgAMFBIw3>F1p zerATIKy}TS)aN>F+6u>R#wKU9nP7}f1q<~r%_QgNh(C<8sZS@U!i3s{e?hH&6<=ly z&wMecbpU^YfERfVu%_nc0O-t+Z^<=6^-c-`a61ddW^)N+JNftoV5R6B;46atEXjD{ zhA>a5+<}p#A%ibg{#=XaS$!AEc6_9=^Z!BNzYPPQrk``T-|*b`NcLLMUb`~AZtvQ3 zR%D!EAP}|n_b+a^eap33cm3+#HFtBarjA@fEAH>iue-Z*HIau6VojIomRfgrIc}slA%@zm##kl(V}edswuGC3~A_Z+qxow+}qCb1v`FV%Ax^ zTDsizs;k4^8b>Fxj4@SAE81bKA@0wnzq>fQdjC zXqe_*V7yY36Vq4&W_f|AC?Q400vTT+V+Tu46D^xpK%BKkWXUKRa&vz7x0+1yTC{>q65pUYa>@E)qUN7>?m z1PC;;R_CmxTjm4Sl3YdSRuP_?Wi4A)JP|D1v}Gfkooo)Wm2km|EhpJrTuIeOZrGj{ zam9WzYPKxokuud-+c!hiE5{$)-Lk@tgeq!RuC6w{_1T9-D_;~tT@O=YX#XSrz5RbY zDTa=0*(i{mEAc#Yz$Swb)Or*6ul-cI_33|t(=MQpHC{}!4>&6s8nPM981QBMId2(v z#7pp|0!7Zai!WAqJ{c}59(Tt*nw!iA78CELxyc-0@i_5tYTOdW)dd`!{EzqZ&Ulb_ zfk$-Xzg0))ES+wz{ zMSI$YI2P4cf3gXi7!o`D3v*P0EILx!L~a~{FQP9MOM|iirOgW{<6;RyYm1Yf-)p9% z*~Cq9Q`!=x`|-f3)l?Rph;P7X@z#0nHIr}%%d6qCh4isWxl2ilZ=c{OwiO|4>5|(h zF~VbW72*AQIaC`$VuLqrbx|mkI_+F^rCmZ%x(HyHHEkEnX^TKDYhFM~a=m9vY1^VZ z?Mm9xZY+XYf20UgWM`DTYCw#_6bAg=h)iRD+s^{#0{V^cIXmZmLlaq1v<( zmllN7>X9x@n`jJCZwUjbW;NDSn`+aayj@KTB+18zbnKSbI~F-Tq@Xd*hAnMamsEvX znkf);JAdz%Xd~KRkb1|?aD=F*O`n*SGM*L#>A+OCK^jGA(+!QZOZ$z&q)Yjtf-tP5 z=?h*Arh`)i2JQH%y~Qb8b|OG#lN*+XBT3ORe`8m1+Rcy_`V zwlnO{FJ*|VH*ci~xp`HX#VZ<2Vd2OVf*q3+4GTSE^V14aHWY5*yC}!7oRa*m z4^4Xi9wbNDCj3H&x&_agjt!j_vJ!sH#*g|Lv?L>Toxl^mj`oo^{Juf9Z<4{2L6lsi zPDYLxSNJ7zogjl29p*2*L^c+ju`rC886~$kjJ%ae3FM1q`2!hi3X?~1YL;LnnTsrz zWm@vqxXfQB&^Y;Gh#z44KW~G?!>!mHEH@G{5+M-&J4&9WKGZyZeg38lB{7QiYjBN} 
z3BOJueuIo9GFX-FlkGQQfN7lpkY9j$Y;youCAY>g3hQAXNA&va{0tux?&e)eBSvW) zF|IA|M+YfzBpz&sLSQY)*lM2!t|&qn`4S2*cc?H;dAQk2boRy#NYNx&f3V&5!m+N| zWj;E{P%X7A$`D?tc~Pd`Wz1`bFQ_&y`4OgGFxNd?b)!_(DOPnZc{i$RS7x(SJxkur zs`@2w&RMl`Q>xn|*6mqy?#U5AwdLXJTcx@Iv2Gw^uYTh4Er+Dih*%nVI4YKQXL?7o zu49>#FN?01!5uq7VAd+bQbn&=(YsdB|7f2S8WKZ8nL}fl%P(a^*OpwH!D=bkCI;Ic z_GN;-A92>QlAJ3bxoSmM?MGaZyC&xgNxpW`*Zy#?=<8bZz3{01@rbzhg^!Bhxk;XF zqOWbuH=5Zqn&}$-$O>oheZCs(;yL^udN^0vN@b?91ABuX2DxC9*gA}od1^D^eWH6` zrvLPs`}AgMc+18)su9{(nQJ?^bb8&@yb-L))b4pS`KUV^Ji6pSnq|$I=D|$MU?zBg zy`!AYnl)$5Mt$?Q({H68p3T+|WL!15K$R3|69a9T!2T`H?Dph*hn^PUiFe?F?nurT zUTu>a_lu4DAH6K~pB4MhJ|ll*Q5lgND_gO5>FkrTij|O5*(X-^Wy|`PN}f19OLu>3 zWaZq$DzRq&nsfh#%m2oO`xmy%rqZLPXJ`-)kP3Hz+;YC78@|Bu+22L(Rrq`6u)^2F zl~uiY;lYK~kkl|NHVpsy$$x+D$LH3AmzEqG6}8`Xz2#bc;bF-STYl8>gO2rzlS}SR zSEb}?5M2$cdsnAcd$X>6IZsIP)Qg__)vAXtta`JaJ&%S&&*2Y?Ozsit@-lGh70ptp zQw()xL*1F4k?$8hI{QC)GNB_&uAH}8@-~RxhVQ)e-SM}_*Sx(MdvC_ui<-O2fak1! zR%#hsYZ+83mh%nge3g=~SM>E_9K7jy;Q0re1r6olDr?a`-g30{n%g58X%20Axz*w~ENJ84PV%$wK+z8(~&E7pz&~9~{|ou-6i&< zqH}!2#I_WYlz;*+mJ{I+y*W1@CDA4U(<~D|6#fW7egol>gzG@yz*{SM+eB|$#@Uv! 
zw8`83m>IR76*l53o^B!%#7ayM_Tawyr=V`NAiCwN|0boyouZgKe%LxRjHmKIjMLr` zhhB}42fm156O{${gUN{EW~hP67h&u2$}Z@L5#pfoHo`f_Zm|q7h(#a(Qz~p$P#e_U z07X#w(?#Oatja89eK+nHrcQ<$Pnec4atL;67r5)H;^zTCN8STM|cn=O7)d%nD=U8q)GtC$=4?0Or9CCe9nTlge1ncgeVcT zo|2I82*pcUU=WQXlQ(6MwoI_*j~9mg69oD#`i%b(7JSBK<@d&NHh;!ew|Yow8Wo#H zGffva?4CEA_nkS{o=2V1zRTji%UEa~1S`8jk~1PYBa(BE=-l&YY|VLWqqaHctC4)I zqOUd6e(dom7VSFE(%4ob=c>wBDz~aRPo?DU6y2SYyI*woue%4g%uaVxu4RwZa$IaV zF11_}Tdrk%jk$*XQo}K^;aDzMF9mzWU@y{Y^e=m`{+EW{bUtuqYDOM=)_p`zvBZ}~ zki&m|=H`MRd#vuJ4PQA0!_r>Xj+MRCk1{tlJ@Dk5l~fcrg8WmGBqV-3GHN^S;Qq`J zKHgpQXI&|a)M)D{Hk|;E%;`bURJ^*_3*_n){ zBYgCxDTTF{W5_0?*bJv7$i!$6XPQfC(}+41CJB@?sS^-DIc-8oTaBg>OgY*_py#iq z2Ix1Wx~7Y?DaV$!39XoxG>A$K1MtLrdl1bSGWPwdGer%|e!P|*_6m!pYE^grnYL&- z+6%x%gx1REAZd{Wv9r8ZzwCx?6 z9DK1fMHC+Ur8%wl{b+w0Q6#Fo<0ll3D2EyYBMQC5HVgv0#M=Y+qdOAAtKd%^S6$Nq z^t>NEUYzy;a7A0wZ!tL2oFY<{{dNuGPn|Xj1Gy%rR@-3etQtleZDj>B%sJIpf9f+# znggI6OqWfOeDRJST*r1e;Z>_adAb}hqATXYM0XY^U$ARLFde~3JH(x?-({{hv4+n8 zq`}z-MFNTm1F7p*39UPdWK{U-E#NsfaOQ)g2APl&;ITL@vPeEYb7vC3%`}cNgzv{Po2GR@=LTXGZcoR%IVL`cvTn@+VP(7Sx7{YC{#hR zEHyRhFp7c$JYk>$T?qMH!DM-827M`5+Yr?WaG7!lNX~Q!mC~DLTR~);Y=}t%?2M$e zrF;tMI3tQ!>6APQ)^|Ar!brDQvoQ5-!LMltX=Ew4A-N`UIwa{XcPYQ4KjrKND(#T! 
zB^QtREOe{ge`oLZf>1;1Qd4s{mKEoF^46Ic2F60`#S5e1*_p)M@T;@J?IijBgWYfWI|Zaf7#%0@&P7@B;E3`7wk%>b}`F(1D(If3Iu!Vgg>;eREA zA!Gj*HaQ;37-<(s)w6voA1TZQ~%A|JA(K|6UmWlshz%IVTNMEKYA zEY2FSrV+kOY5g%7M4GWO5xqJzl3DmS6rcJDvc@2RmG%J%t~L#eD}$YO3W0uy3_g=&WM{QlE9C7eSk5{= z@4Yq2^EfTWPeMAHZL_m)jhJ9D$ByqJ)G0=JM0uxc&E0*uQ zcZmtH$*xbFWy>?`tq0egLjcUI#X0*R=w)kh#yJE)3(%iSYCkHrAAQ^<9lI(XyPEaB zbnna)PvuHi*3)+H^k3UNIeWQeZxZcItFNxnnSmyGOqnkJQbPB(X8iK#(oSa zMrp0&>lA&R4=-hX!}rc^lyy8j^qwnQb`so!#l2i7TI%ne1^ywGc8H}N_s-?~b&|hV z^!GmUX8lL+oo5_Ehv@6b`nnL%SNXQ>t-TLlcvSM<`JYr~yjSj>*|3!S#`&+FM;&r@ zZ_e&R(Ez?#i=X*8U)dWA_ZK8@tLSabdfV@v$(4qs(iX9_<)K|H-E;5UhQ;w4=e~MQ zvQ&ze%9W0FOUs7a|HkL{0=;X2-VJJ<(^=1%jQtE+C)Bul zRxIyE=h(_5Thp4Y3H$~ar8Q^dNyE{cuO1Z!xR$Gi1Yun+RQt?rYxF=A@|mA&JZ}0} z|GEP?S9Q)+mJ3$rLJc{8RW48o`!lb-t^{KQDL6yF)pPIohPy#>_loY`4SN7h$Etx! zI^S^LcjF=1sfthCHc#=rQxFVu`R`qnF&m6X?WZCyPZMw|6xqR{W}Mo3R};t^$3YE6 zbjK!(?mQR5jt~we++$NtdzJ8r!&~`c97(q6ha|?G5Sy`UhfA?c%G3Ep{nW0PkKc&R zCsNTH@>$fpT><8k`A~u#RaH+NB=Olrld1R|3lgQzoC&>*ZDZ+(oumIxiB8=OFEbQW z)LOvALHN+TkQn9pa7+^{1i3<#iaNU#Y~MXEfsQJ$@`Y)`l`6Yqr&Uixon*!$X5r^3 zG4^%l<}fAW_1Ew!{4I2b~H_Rm|5F z#ID2X&W3#X&cS+Z?^ztB)}a~DH8=pCF9wwk7GeUBjP|4cWnZ-b$Od1vX-f0bE%HGZ zlD&vx6v#gu0OX?d{ji^LJR|6)P!7u*B^5ldktAv7|KIR|PzF z!aKW^)&SC3w5F}XRe(T5yX^R>x#Jk+sB(<5c+r+Fp3>x&^awkSQ5N$>JO^nVyAJH& z@D2bIRW1gHEjv@1^w?Cro|0994o8JP;zRz9Pgi)viAgqCtHF_+b!2IHCg^ zQ>WCh`ZIM#wIlwpYS*8}u2Tju23kt_c2TNUH3j4N%HcGp(>P}wq*LZPl8{9Ww2_Y4iIvZ$)*4c=P-KKLOl0VepNq<`1CW-fxg7!UiVz1?SEQ0s?fMfT4ykXyLK*G6 zLoFjx(8fPfSWr{bpHqvvU9};kUA5~^wRe!F_U-cyyd71;q`t1%zMP_9Q+SIQTAog3T8)Ao5yZ%hm4zf#sS8Z_B`cZWQ6>R2RGxf%PQ~$jl zPPI*aOSS3Gg8A)|>WUQqg=*KIQ*Y~b)uz{M#>2iat##O&mf|O@tZuApf2gK8^&Qow zKh=Cvf26vlzNgxZ>(8S+>TLLq!ZCsy;v(L$g@eBee8>dq802|pZ;wE8@-OJ)Sm=@u zbnyxR3H(nMXJtX<9bBL?IB!nSAdYGVVx*4HAa%K5XGqgO_+os=QU|BBzjgj$zEG?ipXi2$WSF_cqL&x7}_ zFo8zlLI(x?kws1wtV?4Ksq_L#tVCYgu5LoET;4|KIY8q=+YLjbRA*pbR00`)49Zf0 zOhpCLN5tSO7s0U5hQg3QfhZ>(n#@-s0i-1}*-_S>^p5rvX#EjBO9s(ZI?9+YRx%GN 
zh@jfyOole^-u@cY5Jd2xP^J^7CK2e{%R5iUQK8gCofIBZB$B;^0w`Iqk##FzOy!tRGVfDE<5&^X=#(#2Ieu0B3p6Yd zYRtqO3%gl47=$L9Pbi0l@*z?kV{nK_###d!j#+@9|0b2s&tx@$FthWEtoXv;Q|TE* zPO!EttEmV)xC!*dn3h3irX445o=xU$%Ki<**z;a|Wx%mDF=OPSb>W;aIV()2?&d4# z82$EfkN$?t!%iQEqXyV(#=^YM!q6NS=Y#NI zx9IK8cn@am2Vnw%T(Z3G%GIK8-(Qs}T4I;?m}HG^W!pj2}~thvB~ zZIpIrOM91!p(JVKWlIw)%Izf_+jUPfv zu_aeGDAgSn>kj|4?#P3Z<)Y=6pyG&R5LY|i4|L}mnpa;~eQ`Ckd~PGu|LDehQ$J~W zKXiGcqVC(yx170}bGe#UC_sV~V`UWtRO^qLwt^h1g>$pENvb^{)*hfjR^_gJhMo-@ zb=^|kL9y;&w(bx)ROg)D-#a1&y2L=&nzQRkL!Z=eR%`$gUA=ORgiJh3C)RN`y_h?E zglsLj!GrXSr`Dn-7PuY~n~m*KL%-P2&m!lVB2v>qv57?t zL;O(kHm!M^HtP3C^&?{a$VRX}7p#!Qb~TF>>=%RmxnR{(hc)b9w)~!J%ZDSIQlM1~ zv}OYPGOm3vH}vubC0~>1Yg+xBUTT2-P5W~`#Rp~F@bzSUd!E`XYDpoW8-xVunzvz< zms$^qtp_r#CpJP2tIbklpV-*9J}|Z(x|H!<%7iXy2}!8Oz_`279FKzDY z!}#!QR<}yk`^D=0QuT;fJwh#9fwtST51!u5LDuAS5EhLIuQ~|f=;Pw3=mI3({;&lV zaC=BtxeFnK5dNiIjf}UGB#?1@xEVlXbBEN_CpIC^b$d96)@~dU8;9tveiKRr8jp&N zN2SJVV&gT^chrCsn@X53azqiU_CXVYR5dDAjpiV^#P&82rW#>h_af7}dJ0_iq`v7p z$G&^+?Q`E9dwVRi|H4n=+2*U+`j^PFhSeDPp|`0Hv^^rZs1A=u*IXyDQzLmhL~n=W zJuZ5WKYnG+dj{Q6zvq!9TZdus`XwBie&f>pOR_ldx~u6)(_XYN2`S;h+PXpXHEae# zYM1wXSj3eEHp?odvUahoT`JovmhF|w4vA%lfZsQ>u@}Z5ahFQ2u;>a)u3pjA`@U;0 zX+oe3>6rDpE361{K@`e8YT9)9He856^5I!m{f4Vz!xdOQp~#R|J-V884gG`LrpWoq z=R_5r5fc%|(UQopwJl67yCLR_CS8V5S507qoM}`iO>^w0D>z6wbvGl5o$y+f{?uhS zKQUI39!uqdJ~0lCC964s%Q4^{rZoIcEwhd;nYm9)A2>6RU20eSkOQ^yj-OiAPm!7p z81;X<)Es=tf5X(Ayh~3_?S0<;E&H}M($GA;5x{L|eXvV!8^!YKvD6m)#GHJ39KQ5h zMZEu8R#H$0L7Bk9AAoShStDqIj6=g4*>I+c9h`z))v!|_zQLqpggAs>A&LqKR>-6# z3wDDH4|9?1>rnUvOvugPpbYUph{!Zo7-n6f5(vUypuxsunFjZC>@`K|l#byL3ARmE zoSHU(|4%_4;7h*C3B(n$-5H#NGDQnz!R^C|*z|QiHnMPN7m9*nMkPoN0fSD?yT}pz3lnV-)+q^g(Co*s<;#iNnudgoa5qZooF^C9XBH{a z!w7vy8tP?Tj4xJ+Z?7tAp*YAiH{(MdBVp)hc}h7oA}%A{g$A;*O-iEF8OM@m650io z_}Ap~7zXm!2>HBc`;br4432M6mi+KP|4l4JKK1AGXVl7bC>h&`4OL6!ZDM)bk`qKl zZ3HK#r20K#{hn<7zK!a-l`m(jJD*gBSF5FlgV3b(-r(aC*~+sUl{MdbdG&Kr%VDwQ 
z@O!WR#FDMOn5`V!XlQ3*)uhz%;p}@Uq96Srg6ccub7@lDNwMx^w(j(n*tPp#&9oIWzQ`@i#QLAcX@QHOBt+rqU|xCZ}sYz5(t{ z8n%UNpm25mZ@sLAdpP@pG=kOnpK|89;D-%?TFv@KSkall^=t#XcWZCF+4AY7YC?kbqGcvYfL2{Ublu4*rh6>bzC1n%{L`z{> zyDpfYCh|Yg7)l`1d)#JKM5;QpUUg{cT&}WRsvHt4hu(v6YU$iYplW4kbzvPWEKX*) z9=I~K=YKN3?)yy6SF!1>Tn&D=?(I69XUugEO5I1r?xVS2lN8)12KRkfWbOe!Oota& z7S?!osI(z(VmlM17!-;ix|#w# zb>4)XGcy=AMk+s5Di<0d+-hp+B7*}Pvg%Fx)YuPYpuXJkD=6s+G`t+AR1IVtXNpLJ z9Y3{HYI^LLNpMFXyppuQz4DZ{KW-=qpdH`<{!jZWgrn{FDe@MEe#S>Wpt6)^Ui?f@ z0gK+$c&t=|?2aBOnrc+-`csXENeMM#3QtLj!Dh5ed(ZTGAYHs@f4$toefo5l&0oO2 z*(atPM4qjBxl|jJuiA`sxlDDcA@rx(Pm3i5V^}-hP~cXg)|&0cv2lT(=+aYCZAJ%7 zjN8UY{z1(JJ8k>56#G=0{wye&UfO-CzmZlI>`p7o4s8fZE#)A3vGaysUIL6a}3G)i6nP-P(=uj(3p=!ErXa|V7fQC?xO2~)0>39v1um!q* z=jbjOiYkXFAY`AHVBJAZgxgQa^u=-ICK6fou@Z`*86aQ^QXIo6i0xXLiN*qRsOUP5 zC*pKYC<)41Be9YUSS9aME#)h+^3P(#(PKP`qokAH^ML-c{s+3gJ3LoJWn}fvCYJ-PKEz|Csys7*%iZMgs zirF?64y$Zyz5;5uh>~=W@&8bGM;v`k`iJ7O{O~s^r5G7sC*$9d@n2x%>t)@hAWIPx znaCUBP)V)s>C=*=9ECwgCE3QJ;HQjei4y*jjK3lS8v1bjh3(+xJ!%8V6m;G~r_hDJ zLogB^S2nK4DHin`uJ@Rq#u0E_>LC0M-Xj4S5E!Kti*JLjzA10#%bk9XoQo#;*GTye zyX_%wXP1kR6EydbMj=+sdbSUjbFR}YguRAYQvhd2|FT^t6y~WKbl>4ox;w?o5K%=rNE#V7|aF^F4<{=v3DP8Umw$j116}d{@Yb=Rek69 z!{R?Y|NZ$tS@`3H^~z&Q=a|kPJv8+C$Qy3ZKvUYa=`WZ39neJkq19Xpt+Je(_Pcuf zmd<5D9hr)bjH`3Axl?NH7n}Q~=JR6n`HZU}M?0Lfr-;h0+#ws{>NqTR9F{sRh#eO`WeU04hAk)8dC~NgEAA}M1lw?Ws0?b1y4Hi;8$q1e zAq(4k?G}UGQm{`9_GS9d{4{tL+FzuGK9TlW4~q?lu_2qO-kYh}n+XCzQcE8k$kn#w z8lau$M5gydrr~6c8s(7Kd?;7l`0bImMz-8s^dxUT^8{dyo@$4ZAP;N+cO=9*8_()f=uHOcJC@Ll8&OX zav*?Z)zqfBiu1XOE@=P#N1LUr{KMmBj5@{`3&$`^Y&+;;#YBbAk$Z}Z5DCLERH6a} z`3eCJg((zc$B$TQ!%aLT)^5sIz=mB1Ib_ziZFD`~(0X8)CR`0r9NtWX6=QH9Ob#`v zGMiKMajI87ZfkDQQcz=!iPozPfpkrf8^{w6GOH+JUHIE$*xI;FMHyHEub zM6MB&ksDhGyr`^^(9A*sXhnm#3$5%0OFknia4jZ|jFRyp8CS?4;u{BV8K_R`X=z$Uo>-cK941m{^P00+TN;MnhK7*9-Uxo7<(P;q|8B47SQ^8#0ak+1i1nF}84!8m`r=-;KT<#q9+x2eRG+xll_c zv^V413p3Z&Z}`b8dP!(P*?;~R=giaK6+u0Tk<+_Fs zbhi&#!&#Sd|`m>4-9SRZ^{7g0y3n1rbZv3uG)jn$r*p@ 
zANzq?3uNW32B%oL@ajC_CwESrRf+i@#sg|!u6OxBObu9i(-_;m$iqU(AbQ_XrwR2!6g1D}fhA&O^?92u2u8cpW#r0ltsV!Kb zwc4q=vFTl&*j-b54N_H{ppM0O92Zjx0?zR*x@QK_HX7=rrv)s>y<6Ub>Ggk0y0wi? z_9(cL0p7(_jJQ+XY92-imr;%dEogva0rOGXaM1%5DIP4r?rN^!o6}A&sQF@)JB;#4 zyJ_h(_Q5DIkbPHiq1~4CDea7co;Q@OK;L-UMOaCJq7>3RZiE(%>M>PYAzGBS0XZ?f z{!QM#Xv6cqg$NgebUDDyKm{-%Ch}TdqOws~F9}D`c;-*JJcrw{52&02rrS;BMOxW+ z(lzo@F4)?USLb%7AGSx=-2?1rHDs9hZ8!I1fAARluTA89i zFd#Lg??pvT^kl~JPTGgU4RXZoFN|ylVCWJAwJtgUqcxnCo{GXS80RWW5?3Yh^AiyE zXJE`fUA}^rpkC9v2_#GlAD)FGV)PztGL?c>e*SLM!@xjxs`$0$EyR%PFiyT$)-Wp& zKhEf&NF(X;S0ILG7)ml^5-@^G9>6JPuVGF!H6W zcNo{7Ly9gxXEwR>%S!9-;r^Q42Ff^)^g=WQd-euRKv;Al!p?- zJ_I7xBEAB`>^enbWXV<7@_wyvbRdXuuDl)hr{0(pFoIvA7}v;P<%>~nUsuaQH)5h1 zGFst#u5u&;r;bbYsUi}EV`XM#RG(S8mb&afbjpTwgo2157_9I!}$h%k9L;EtmeVNd{ zO=u&o-!InhUv`mK*Y~Y|;`(FPdgyS*cQ_L|Oc!JgXG87p9)IL`XKW3shkZkt&=4r& zaF-^5vg}f@o7KZ>fjzm7PN*Z^?AR}LoEAGygZJJHb-X+ADDuwCT4*>|L6;6ycIB$; zWx-*<4wZeIRduqAE?|es{a}Hiw9m!WHGO;ht?_q@9-e*2^TYU$rhYKBUVC=AB)9*- za!IDTH&fG_@$Gq{EBYIG5)7?4es?U>azG3o*pOdx!8WMLcysK*7?f>54PZH==?BE7 z0jcSj*aQuOwYkbBnVrv7_v9)Yh$KG_;K0`G&-h5dwzN!EYADM=LkecycjP|=s&n}z*jZ{&nL@9)BHJ%+8J!k~HNFB97aHV|VKYO#d-_3)}qS;EZ)1d4Xe1UPRj;djji8VC(F4yt8?V%HNI znXi%Y$hShxO@A6~J*zFMrpGV}Bj!gYQccDr3ATt2gD3%lFcsGGQ*E@Z4#M4#1xV7R zDMoE;VHy=pulFF$W;G@K33kL{#0Y@t&_{+Xr7;9X=j)V!OVbBIUx){S2Y>6FqZRO zd@$a@9v0IUKU7l4?L;;RUMOyKU6sK%ZMc1o_)rvL+ihH-UZ)D0NwH$vSrH+^hOSX7 zTcf(GVau>><7UIShT)@{H~_YUkFqV{l7+*@SS8!NBvP7g0%1y$x3MYeuCKsjzL!-x z3vpOw3R}1V>(xlHaFd+$z0F&A&AV}f!b}nZ^71`qv~Lz^X2kGLN|)&6yh}O3IX{Ct z(S*N2Xn|4Be@V96ls*Fuji}!UB@HCZ&gmUFd??%}QvlN6SqTI#}3H*F1bZg$6(Khwqlq7jDm9}M1g}~Kt zOX{6n3EXX={*ee=3k(prcCN1Rlj+fd&8~LQ-kx*SNv;9WHK3_@_7#%Bpo&3{t~qOS zz(kve)|){JhBth{Oj*nOzE;pVEr&oX??U75wnIl$_jWoQx4wqTyD>}+A{&V@vx>{s_qi2yRy|iOBXg9JEg`W zV&f42z}V(u_uenAdAo92Sc+T`BUd0~+y)x5 z2_52V`p(3|$REzEdHYaiWK-JwNjq<0kigm=p5qV4lxrm_PeQs<-jVCfRCH!sT~Bb0 z#rgZ^u|-rm^hCb2B7-d~U!Qh^MU!&-14v+SEPi$cNxz75X_x-mTgUF(Xk81m=7vU= 
zPs=C4-VY8wsi|Le{QlUoV>4L08k7ag#o)pBPDsO7Mf?S?qSH&+m6xp|1m3}tE$fmW~Y%yf-px<@kgN5CfN`<5*tj#fe$UKy^_ZOGL|a-oh~U5iw= zSFGEc3%5z(K`}hI<>lP1&p117-YDm~_Ye!Rbe7Iy(w=MMy0>xT@X^PW|H~JbZSq;j zpZbPAykzDo_RCnhuka{%bdpbQ+Y-a4CllNjSaALYTHK&lZETF4mrymTE#O1mrcupB z5)f*-3r)N1I}`@`4AM{1O29r$pG^A8Ofk_i@{atDtGFkK6HtE?m)@T~gSi`LpwQi+ z06t3myGSRy=(IR%KjY66?O;4BlFw1E1x2qREK zHLQclMqGh$$|@xao#Z;C4B9ts)rF#4Kn-9`!DZa$Gd21qLaIrvoe@W%)>kVvQL(LR4E<@O zw>({N@gZW-SF+=0B(4fN2`axpr79zsHZJaU6@c8mI}*VwT})^+w%}?&?W-wcL&*ZV zjO_@6@^l!7X51)rwez+Gj0XtO>KhhDC9(f1B|)AgI!#=_f0#j+I{<$xZGhG@*f228 ziG5Ec;9x<^3G{`Q;E;(kMl?r>N+}#?O0;QP7II3sQ#zK=kxs}$eJsSR$RIIYlk@Rp zO2LU$XIVe54%t$cBuBJkek#1558jHH9y{-sxT}>@Ew36`MO~@nOa-@+tE{-1LSrX+ zUYIes+fL6!L2j9{?+FOTX}i&~ZWvIp+wym8HT)Y;#ZB!1o@N;{%BL4B=}B20+GqV(V$~@yG5#Hn)(Ql zrI4lvM60UL5Gv`lMCl1ElENz#tAPxLAHPAiFd55aTq5HOWc)oD|3t<`70$-%e^5}uRnU823~;b+(FgPV=5vU&-r@sQYfh_o2vlChdj$jLPgKjkb< z&|q8-oguEKjI)D8F+k1ce5aXQTce2UVCb4UvOHtAm$9qNaaY!UNNkDKP`ym_w5w_L zrmWFo%?VY>HLE7vx`u{%qF!P1BD)ulx}v}W8N`}r&SFOi2s^fA&r)Va>`bYtC>k5Zv_EqtG&;$qu(_qfs8)+ekQ)+MvbY>7T;f zjf67-cdC=f&wuR5vCugV}yeN35RP-%pf+3zIuU&d506uq>K0>5`-X_ zlDCkp;B-7LSSJ@Xw*?AUc6e53Z%tB zmJ9&FfrY7nMmq&+=*lfE4D;p3z* z?4qmzOeU_>1rQ<$B&{f4TRRWs6Y63pyolBtlP_F61FJx1cLauior4WBdCY#ATy=l) zVzkexg}RHANw#*>9+?aYOeEqlNb)Hf`_OFj zN#Sdl-2_IePyrcrA)F-J7#Sw=C6a+rHnLFsuhT1$JP}&~8S$IsN=f9sAa6*oS?uOb z8I)vo;^D;@bEQFK%Ai6yV!-+^%KA{IB^G*jp$m44@I%bp$^VK%fGQ}LcK~tUfMo9x z?L7})m3l|T-qCgYF^Kc}Ld&ma!hP%Ben2YjpyX;@bG2@kma$7!Y1OIAI3o|IGe=$! 
zdtS)Z?Riuy)*ZppT@`uQD^?Bt@Ct4YKzz~H55#H2^C4%h_AH%8!nkK3)Vlhb7}^6J zU%9&OhqGecNUmugWDmurGr8vdnZC1P^Es@56aibE1EVHrzgl@!s^2T(57(6*faWI2 z-XPi==+0Hr5hdFD8A-u5rdPMxWde>c8lj#mN zcj*ov$#qgB^+`ZZ4 z!~6C=1&mT|rv@$#F%+;G+rTJ|K=?cs^xX&q8RLOcTnv9qU)cTF`Gbr#c0S~vI@#?i zVOusRgrW#e7{^ZT`rq@iw0RqV|RF-QLi}u!m zqcp4OOtq;t{b`&U2u0dM(=ZcdXV0a_Q*FlWN(5rT?r5hPT7R;gLJi?5aOaG6n<1#k z86iw?-KWP@Z8(lzEZecO=(JgCNkI4Zs_ z`KRtl3#$cp;JH;Y9N!^0afYLM7Kw~8YCrF8On*~tYWcu^ zw5x80SRn1&e(K)9zN}$y083QnG}i8>*YACDtH1s`X!D{gTw(i7VchQg6}0L9t+gph zUl_HikG^ZPpYco}`;y`aqgjp!jvc?WPt6@SgZhPb{RTkv5K&n>ero8&K-zD3_&IVL#xjJ`l#XKhV@P;8sMpQO~A5td3>7j1k;1`2TI+qcuPIOEkUy zvrjQ1K9om9#2%w9ZHTXeU}luFCdH^rm?}cM3MdvphY?9x@b-fr^7 z-S8|Zasd|;#uGd{vrGi~&gEZAEY5EFWkvatEFP<9WL5=eza*!m6YislRrsmIt7k~mY zi3T2b^fB>)3L^fQJT1c_nOp&F<_R7$=BO~lNkB2UQaPSMf#`B*sHPx{_aP)Q&koY+ z!4&9{fvW_bG9A8oj|!+p6O+@EDfzbg6lL=@GFHiOQfMZ4P)fD{8UGO;`7%9qxfaCm zC1si58H#zHf_{Z;^JLtiNOYW-HZ|qz2nBMFk-Hn?LOBOazF&r0q+Di;-k@ZtWgyNB zRfNzfRG=%HuwMqhvW}Ue^r((`AM&Bz0sbv(To!JUGx1c+8R8xj&@s*~pHgl*WV*ML zdDCRR2%%V0<~=HODz{(Wqt}#@$@|G))e5b&u~u1jBb7R?wm)UAEqHx?5;quXBb-Sj zFLdt0u^9#aFCd1KzYiK1+*%D+UH|Q_x4Is7i&cYbRhQoDlMY=H4_#XFKJm6?y^$r$ zRx#J!CAFOp+fLBQ;O1O=2R&Oh>wBg8qwDoYmq_2M>82f%G;=>5>p(&^2Lns0mG@=6t*Vnye})BkGhdt>jN z$X1M!Gz8F4x`Qw0Z1PKDw*#Q8Q#5 zKH^NJ$4t3k^J?n5Uw-?`QroDAzu+j!?k>k+=XxkwCcVy@Qrl1tdC_0xQ)h9#XNm6W z9yn=QIw3d9>fm=r-W~yq`GjOC_KP8CF2?_Y&#Ae z%u?e)vGE{8bSwHGplNR{{0PTGZ8wpZYqsAediqXbaNnp*E>}SS%lg>bw%p zRla=tJALcU_Km8h)$>2C+Vh}z*|fZ$C44|^Jn&xNM^!(l5*tq}dr=x$>~}ROwH*-K z4!qYS4V@Q<&ae9}JRw1oL9u!eTB_J7;$|_}%q#@l=js}BIJMk)TI@W%<>YD(;1F_+ zf7y=u2W!Ltm|7@*rWmWAc|X{*RmoNF-Kv7F=cV&cDjVtcb&^j0Y31PkvrD5(caWM? 
z*C*EXJ-Q~=4P#u@c1yL##M)zz2gKTQOJg}YuskF-3`q?mV#COLcd`wqGTu{5zXoHY zrTt;O6gehFj%8bp)5YrH9w|I5hKF;()=cEkdoyC>!WL=Zd*B9KfO`eY>sIcp+yX1y zHVC8az|*3lkbjGlFHo;-mumXOn*K*IvF5cG|ki0m&p87D1FWBS>KuH(y<(RG^E3omAW4ih&cz>Xg8HiBA z#Y=&Y8zS#A&OK>b1k##bYwaD{arr~}%x$`)zHan|R`9j++lb$+Xs{v<0RVP1>oS^#; zpU}rZP+q72sH|ObV@h%PGdPjZi=o(fXcH>aUzTby3I}ikw)P;#QLSg`B0~AfmZvk` zri{Ib^%xfWQCfM36@V@j%`RMq!&sz#=WT+-5(_nCv{JN}$##%zw9zOWBAc6ROfgB6 zUSnji?aF^i9tXtznaNaVPS-lS@wm3W< z@gLDc2?N_!4-0YKWDRcyLt9p|pg@5Nl9=1Fu~)1zP_53aF0RD$k(;ba63q1sW%Hpq zYO-E1Dela^bxh8`$Xb;PH?1Cj+rCu<`=-OQlw96_|FxCo`(IftBKL=VZ+kP|UeU4V zBP&8EiNFTgGpfPMRKC)xEh|}2zPg4j8?)NE68BQ-e$__~dR5{vAACg=bHqCQ$g#!Y zk-2>3BYG-hnY25gSl|;RuNe?9W@_W$)bcK>(+I-A^~eND+Qh?M6hC|=)u7) z8@-_%-lf#?)s?1|LGueEo`B^&`c) zN%l420#*5n;vQVW!)P=0PG%34h2Xt>MPU$GUZRJo&|x>F;DA~|3Uc{MDXxEK+->)$ z$d^e6DIiI6~`5KSHLMa;w^5rn{y)k_S$!+uQQ9PpRr8Vr< zASEho%H^92qHe#lrDtLK74`ZS;+XRd1wrL&L3(!F8p7@t$va6?vD$fbG$p^6xnOew zdWNnu0TeKSst9#YAwx(Vb3KN>WK$!XwrHlzpZb+jP-{(1dlM>`a8$EL9tHJCOqTR$ zAX_6DO_V|Xa~a@5B5)b;1iQE=&aSnQF98p-pwPE92W=ELl{HIfrBuq87t5*#yGFN| zQs<;V=wZIRzz>^X6RKxBc~&w{8kG>o?qRG_kGy=c30&6x+@mUc?m!f#GB?kzVd~Ki zq@wM>r_Av5Iij&~fiTmqke7=LnWRFEsBEoLiZzW3hbO_=#}Yl0GhlR+LN^twc3ZKi zqi72b!X)e#9-)WI(MtinYCvTL7WRToDdFSl#G*PjFG|CJbAgJ*DsI}CF5&k z{1F*{LdFMVFgBNFpdG7K-ad?CgK-}b{sMCn{6cW{Q)=dGwy|7nk^>tha8N5DB(W|va^M3?+s_@r<W5$4;_!HEd7S#O^C?*&yF>Zj zHJRGCAP;P6SJGh)uTd#}X@1FUD%s-5R6Lo(oCTpXozVD$=STKRQ`OdiS4}2Awy(B| y$g){lyJaN{>Q)?hYGYQ&>YAO;%uZ8Vrhea34o@Xb*nU}a#cgW-h$E9NG5;?}avr<@ literal 0 HcmV?d00001 diff --git a/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py b/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py new file mode 100644 index 0000000..ab2eeed --- /dev/null +++ b/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py @@ -0,0 +1,379 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os +from 
collections.abc import Iterable +from dataclasses import dataclass +from itertools import islice +from typing import Any + +import torch +import zmq +from lmcache.utils import _lmcache_nvtx_annotate, init_logger +from lmcache.v1.multiprocess.custom_types import ( + CudaIPCWrapper, + IPCCacheEngineKey, + KVCache, +) +from lmcache.v1.multiprocess.mq import MessageQueueClient, MessagingFuture +from lmcache.v1.multiprocess.protocol import RequestType, get_response_class + +logger = init_logger(__name__) + + +def wrap_kv_caches(kv_caches: dict[str, KVCache]) -> KVCache: + logger.info("KV caches keys are %s", list(kv_caches.keys())) + return [CudaIPCWrapper(tensor) for tensor in kv_caches.values()] + + +def send_lmcache_request( + mq_client: MessageQueueClient, + request_type: RequestType, + payloads: list[Any], +) -> MessagingFuture[Any]: + future = mq_client.submit_request( + request_type, payloads, get_response_class(request_type) + ) + return future + + +def get_lmcache_chunk_size( + mq_client: MessageQueueClient, +) -> int: + future = send_lmcache_request(mq_client, RequestType.GET_CHUNK_SIZE, []) + chunk_size = future.result() + return chunk_size + + +def striding_block_hashes( + block_hashes: list[bytes], + blocks_in_chunk, +) -> Iterable[bytes]: + """Striding the block hashes to get the block hashes for each chunk. + For example, if blocks_in_chunk is 16, then we will get the block hashes + for the 16th, 32nd, 48th, ... blocks. 
+ """ + return islice(block_hashes, blocks_in_chunk - 1, None, blocks_in_chunk) + + +@dataclass +class LoadStoreOp: + block_hashes: list[bytes] + block_ids: list[int] + + def __len__(self) -> int: + return len(self.block_hashes) + + def __post_init__(self): + assert len(self.block_hashes) == len(self.block_ids), ( + "The number of block hashes should be equal to the number of block ids " + f"But got {len(self.block_hashes)} and {len(self.block_ids)}" + ) + + +StoreResult = bool +RetrieveResult = list[bool] +LookupResult = list[bool] + + +class LMCacheMPSchedulerAdapter: + def __init__( + self, + server_url: str, + context: zmq.Context, + model_name: str, + world_size: int, + kv_rank: int, + vllm_block_size: int, + ): + """ + Args: + server_url: The server URL for the LMCache message queue + context: The ZMQ context + + model_name: The model name used for LMCache keys + world_size: The world size used for LMCache keys + kv_rank: The kv rank used for LMCache keys + vllm_block_size: The block size used in vLLM + """ + self.mq_client = MessageQueueClient(server_url, context) + + # Request futures + self.lookup_futures: dict[str, MessagingFuture[LookupResult]] = {} + + self.model_name = model_name + self.world_size = world_size + self.worker_id = kv_rank + + # Read chunk size from lmcache + self.chunk_size = get_lmcache_chunk_size(self.mq_client) + assert self.chunk_size % vllm_block_size == 0, ( + "LMCache chunk size should be a multiple of vLLM block size" + ) + self.blocks_in_chunk = self.chunk_size // vllm_block_size + + @_lmcache_nvtx_annotate + def maybe_submit_lookup_request(self, request_id: str, block_hashes: list[bytes]): + if request_id in self.lookup_futures: + # Skip if there is already a lookup request + return + + s = striding_block_hashes(block_hashes, self.blocks_in_chunk) + keys = [self._create_key(block_hash) for block_hash in s] + future = send_lmcache_request( + self.mq_client, + RequestType.LOOKUP, + [keys, True], + ) + 
self.lookup_futures[request_id] = future + + @_lmcache_nvtx_annotate + def check_lookup_result(self, request_id: str) -> int | None: + assert request_id in self.lookup_futures, ( + f"Lookup request for request_id={request_id} has not been submitted" + ) + + future = self.lookup_futures[request_id] + if not future.query(): + return None + + result = future.result() + num_chunks = sum(result) + return num_chunks * self.chunk_size + + def num_blocks_per_chunk(self) -> int: + """ + Returns: + The number of vllm blocks in a LMCache data chunk + """ + return self.blocks_in_chunk + + # Helper functions + def _create_key(self, block_hash: bytes) -> IPCCacheEngineKey: + """Convert a block hash to an IPC cache engine key""" + return IPCCacheEngineKey( + model_name=self.model_name, + world_size=self.world_size, + worker_id=self.worker_id, + chunk_hash=block_hash, + ) + + +class LMCacheMPWorkerAdapter: + def __init__( + self, + server_url: str, + context: zmq.Context, + model_name: str, + world_size: int, + kv_rank: int, + vllm_block_size: int, + ): + self.mq_client = MessageQueueClient(server_url, context) + + # Instance id for GPU worker + self.instance_id = os.getpid() + + # Registered kv caches from vLLM + self.kv_caches: dict[str, torch.Tensor] = {} + + # Request futures + # request_id -> (future, other merged requests) + self.store_futures: dict[ + str, tuple[MessagingFuture[StoreResult], list[str]] + ] = {} + self.retrieve_futures: dict[ + str, tuple[MessagingFuture[RetrieveResult], list[str]] + ] = {} + + self.finished_stores: set[str] = set() + self.previously_finished: set[str] = set() + + self.model_name = model_name + self.world_size = world_size + self.worker_id = kv_rank + + # Read chunk size from lmcache + chunk_size = get_lmcache_chunk_size(self.mq_client) + assert chunk_size % vllm_block_size == 0, ( + "LMCache chunk size should be a multiple of vLLM block size" + ) + self.blocks_in_chunk = chunk_size // vllm_block_size + + def register_kv_caches(self, 
kv_caches: dict[str, KVCache]): + # Register kv cache and send the request + self.kv_caches = kv_caches + logger.info("Registering kv caches") + future = send_lmcache_request( + self.mq_client, + RequestType.REGISTER_KV_CACHE, + [self.instance_id, wrap_kv_caches(kv_caches)], + ) + future.result() + + @_lmcache_nvtx_annotate + def submit_store_request( + self, request_id: str, op: LoadStoreOp, event: torch.cuda.Event + ): + keys = self._block_hashes_to_keys(op.block_hashes) + future = send_lmcache_request( + self.mq_client, + RequestType.STORE, + [keys, self.instance_id, op.block_ids, event.ipc_handle()], + ).to_cuda_future() + self.store_futures[request_id] = (future, []) + + @_lmcache_nvtx_annotate + def submit_retrieve_request( + self, request_id: str, op: LoadStoreOp, event: torch.cuda.Event + ): + keys = self._block_hashes_to_keys(op.block_hashes) + future = send_lmcache_request( + self.mq_client, + RequestType.RETRIEVE, + [keys, self.instance_id, op.block_ids, event.ipc_handle()], + ).to_cuda_future() + self.retrieve_futures[request_id] = (future, []) + + @_lmcache_nvtx_annotate + def batched_submit_store_requests( + self, + request_ids: list[str], + ops: list[LoadStoreOp], + event: torch.cuda.Event, + ): + keys = [] + block_ids = [] + for op in ops: + keys.extend(self._block_hashes_to_keys(op.block_hashes)) + block_ids.extend(op.block_ids) + future = send_lmcache_request( + self.mq_client, + RequestType.STORE, + [keys, self.instance_id, block_ids, event.ipc_handle()], + ).to_cuda_future() + self.store_futures[request_ids[0]] = (future, request_ids[1:]) + + @_lmcache_nvtx_annotate + def batched_submit_retrieve_requests( + self, + request_ids: list[str], + ops: list[LoadStoreOp], + event: torch.cuda.Event, + ): + keys = [] + block_ids = [] + for op in ops: + keys.extend(self._block_hashes_to_keys(op.block_hashes)) + block_ids.extend(op.block_ids) + future = send_lmcache_request( + self.mq_client, + RequestType.RETRIEVE, + [keys, self.instance_id, block_ids, 
event.ipc_handle()], + ).to_cuda_future() + self.retrieve_futures[request_ids[0]] = (future, request_ids[1:]) + + @_lmcache_nvtx_annotate + def get_finished( + self, finished_req_ids: set[str] + ) -> tuple[set[str] | None, set[str] | None]: + finished_stores = set() + finished_retrieves = set() + for request_id, (future, other_reqs) in self.store_futures.items(): + if not future.query(): + continue + + result = future.result() + finished_stores.add(request_id) + finished_stores.update(other_reqs) + + if not result: + # TODO: add error handling here + logger.error( + "Something went wrong when processing the " + "store request for request_id=%s", + request_id, + ) + + for request_id, (future, other_reqs) in self.retrieve_futures.items(): + if not future.query(): + continue + + result = future.result() + finished_retrieves.add(request_id) + finished_retrieves.update(other_reqs) + + if not all(result): + # TODO: add error handing here + logger.error( + "Something went wrong when processing the " + "retrieve request for request_id=%s, result=%s", + request_id, + result, + ) + logger.info("Retrieve request for request_id=%s finished", request_id) + + # Remove the finished requests from the tracking dicts + for request_id in finished_stores: + self.store_futures.pop(request_id, None) + for request_id in finished_retrieves: + self.retrieve_futures.pop(request_id, None) + + # Update the internal states + self.finished_stores.update(finished_stores) + + ret_stores = set() + for req_id in finished_req_ids: + if req_id in self.finished_stores or req_id in self.store_futures: + self.previously_finished.add(req_id) + else: + ret_stores.add(req_id) + + # Calculate the final finished stores + ret_stores.update(self._update_and_get_finished_store()) + + return ret_stores, finished_retrieves + + def num_blocks_per_chunk(self) -> int: + """ + Returns: + The number of vllm blocks in a LMCache data chunk + """ + return self.blocks_in_chunk + + def shutdown(self): + # Unregister kv 
cache + logger.info("Unregistering kv caches") + send_lmcache_request( + self.mq_client, RequestType.UNREGISTER_KV_CACHE, [self.instance_id] + ).result() + + self.mq_client.close() + + # Helper functions + def _update_and_get_finished_store( + self, + ) -> set[str]: + """Converge the internal states about finished stores + and returns the 'safe finished store request ids' back + """ + safe_finished_s = self.finished_stores.intersection(self.previously_finished) + self.finished_stores.difference_update(self.previously_finished) + self.previously_finished.difference_update(safe_finished_s) + + return safe_finished_s + + def _create_key(self, block_hash: bytes) -> IPCCacheEngineKey: + """Convert a block hash to an IPC cache engine key""" + return IPCCacheEngineKey( + model_name=self.model_name, + world_size=self.world_size, + worker_id=self.worker_id, + chunk_hash=block_hash, + ) + + def _block_hashes_to_keys( + self, block_hashes: list[bytes] + ) -> list[IPCCacheEngineKey]: + """Convert block hashes to IPC cache engine keys""" + s = striding_block_hashes(block_hashes, self.blocks_in_chunk) + return [self._create_key(block_hash) for block_hash in s] diff --git a/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py b/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py new file mode 100644 index 0000000..0e87dea --- /dev/null +++ b/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py @@ -0,0 +1,221 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Standard +import os +import threading +from typing import TYPE_CHECKING, Union + +import torch +from lmcache.config import LMCacheEngineConfig as Config +from lmcache.logging import init_logger +from lmcache.v1.config import LMCacheEngineConfig as V1Config + +if TYPE_CHECKING: + from vllm.config import ModelConfig + from vllm.multimodal.inputs import PlaceholderRange + from vllm.v1.core.sched.output import 
NewRequestData + from vllm.v1.request import Request + +logger = init_logger(__name__) +ENGINE_NAME = "vllm-instance" + +# Thread-safe singleton storage +_config_instance: Config | V1Config | None = None +_config_lock = threading.Lock() + + +def is_false(value: str) -> bool: + """Check if the given string value is equivalent to 'false'.""" + return value.lower() in ("false", "0", "no", "n", "off") + + +def lmcache_get_or_create_config() -> Config | V1Config: + """Get the LMCache configuration from the environment variable + `LMCACHE_CONFIG_FILE`. If the environment variable is not set, this + function will return the default configuration. + + This function is thread-safe and implements singleton pattern, + ensuring the configuration is loaded only once. + """ + global _config_instance + + # Double-checked locking for thread-safe singleton + if _config_instance is None: + with _config_lock: + if _config_instance is None: # Check again within lock + if is_false(os.getenv("LMCACHE_USE_EXPERIMENTAL", "True")): + logger.warning( + "Detected LMCACHE_USE_EXPERIMENTAL is set to False. " + "Using legacy configuration is deprecated and will " + "be remove soon! Please set LMCACHE_USE_EXPERIMENTAL " + "to True." + ) + LMCacheEngineConfig = Config # type: ignore[assignment] + else: + LMCacheEngineConfig = V1Config # type: ignore[assignment] + + if "LMCACHE_CONFIG_FILE" not in os.environ: + logger.warning( + "No LMCache configuration file is set. Trying to read" + " configurations from the environment variables." 
+ ) + logger.warning( + "You can set the configuration file through " + "the environment variable: LMCACHE_CONFIG_FILE" + ) + _config_instance = LMCacheEngineConfig.from_env() + else: + config_file = os.environ["LMCACHE_CONFIG_FILE"] + logger.info("Loading LMCache config file %s", config_file) + _config_instance = LMCacheEngineConfig.from_file(config_file) + # Update config from environment variables + _config_instance.update_config_from_env() + return _config_instance + + +def hex_hash_to_int16(s: str) -> int: + """ + Convert a hex hash string to a 16-bit integer. + """ + return int(s, 16) & 0xFFFF + + +def apply_mm_hashes_to_token_ids( + token_ids: torch.Tensor, + mm_hashes: list[str], + mm_positions: list["PlaceholderRange"], +) -> torch.Tensor: + """ + Overwrite token_ids in-place for multimodal placeholders using + efficient slice assignments. + """ + n = token_ids.size(0) + for hash_str, placeholder in zip(mm_hashes, mm_positions): + start, length = placeholder.offset, placeholder.length + if start >= n: + continue + end = min(start + length, n) + token_ids[start:end] = hex_hash_to_int16(hash_str) + return token_ids + + +def mla_enabled(model_config: "ModelConfig") -> bool: + return ( + hasattr(model_config, "use_mla") + and isinstance(model_config.use_mla, bool) + and model_config.use_mla + ) + + +def create_lmcache_metadata( + vllm_config=None, model_config=None, parallel_config=None, cache_config=None +): + """ + Create LMCacheEngineMetadata from vLLM configuration. + + This function extracts common metadata creation logic that was duplicated + across multiple files. 
+ + Args: + vllm_config (VllmConfig): vLLM configuration object containing model, + parallel, and cache configs (alternative to + individual config parameters) + model_config (ModelConfig): Model configuration (alternative to + vllm_config) + parallel_config (ParallelConfig): Parallel configuration (alternative + to vllm_config) + cache_config (CacheConfig): Cache configuration (alternative to + vllm_config) + """ + # Third Party + # First Party + from lmcache.config import LMCacheEngineMetadata + + from vllm.utils.torch_utils import get_kv_cache_torch_dtype + + config = lmcache_get_or_create_config() + # Support both vllm_config object and individual config parameters + if vllm_config is not None: + model_cfg = vllm_config.model_config + parallel_cfg = vllm_config.parallel_config + cache_cfg = vllm_config.cache_config + else: + if model_config is None or parallel_config is None or cache_config is None: + raise ValueError( + "Either vllm_config must be provided, or all of " + "model_config, parallel_config, and cache_config must be provided." 
+ ) + model_cfg = model_config + parallel_cfg = parallel_config + cache_cfg = cache_config + + # Get KV cache dtype + kv_dtype = get_kv_cache_torch_dtype(cache_cfg.cache_dtype, model_cfg.dtype) + + # Check if MLA is enabled + use_mla = mla_enabled(model_cfg) + + # Construct KV shape (for memory pool) + num_layer = model_cfg.get_num_layers(parallel_cfg) + chunk_size = config.chunk_size + num_kv_head = model_cfg.get_num_kv_heads(parallel_cfg) + head_size = model_cfg.get_head_size() + kv_shape = (num_layer, 1 if use_mla else 2, chunk_size, num_kv_head, head_size) + + # Create metadata + metadata = LMCacheEngineMetadata( + model_cfg.model, + parallel_cfg.world_size, + parallel_cfg.rank, + "vllm", + kv_dtype, + kv_shape, + use_mla, + ) + + return metadata, config + + +def extract_mm_features( + request: Union["Request", "NewRequestData"], modify: bool = False +) -> tuple[list[str], list["PlaceholderRange"]]: + """ + Normalize multimodal information from a Request into parallel lists. + + This helper reads either: + 1) `request.mm_features` (objects each exposing `.identifier` and + `.mm_position`), or + 2) legacy fields `request.mm_hashes` and `request.mm_positions`. + + It returns two equally sized lists: the multimodal hash identifiers and + their corresponding positions. If the request contains no multimodal info, + it returns `([], [])`. + + Args: + request (Request): The source object. + modify (bool): + Controls copy semantics for the legacy-path return values. + - If True and legacy fields are used, shallow-copies are returned so + the caller can mutate the lists without affecting `request`. + - If False, the original legacy sequences are returned as-is + (zero-copy); treat them as read-only. + + Returns: + tuple[list[str], list[PlaceholderRange]]: (`mm_hashes`, `mm_positions`). + May be `([], [])` when no multimodal data is present. 
+ """ + if getattr(request, "mm_features", None): + mm_hashes, mm_positions = zip( + *((f.identifier, f.mm_position) for f in request.mm_features) + ) + return (list(mm_hashes), list(mm_positions)) + elif getattr(request, "mm_hashes", None): + if modify: + return ( + request.mm_hashes.copy(), # type: ignore + request.mm_positions.copy(), # type: ignore + ) + else: + return (request.mm_hashes, request.mm_positions) # type: ignore + else: + return ([], []) diff --git a/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py b/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py new file mode 100644 index 0000000..94572b0 --- /dev/null +++ b/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py @@ -0,0 +1,1411 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Standard +import os +import uuid +from collections.abc import Generator +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any, Optional + +import torch +from lmcache import utils +from lmcache.config import LMCacheEngineMetadata +from lmcache.logging import init_logger +from lmcache.observability import LMCStatsMonitor +from lmcache.utils import _lmcache_nvtx_annotate +from lmcache.v1.cache_engine import LMCacheEngine, LMCacheEngineBuilder +from lmcache.v1.compute.blend import LMCBlenderBuilder +from lmcache.v1.config import LMCacheEngineConfig, _validate_and_set_config_value +from lmcache.v1.gpu_connector import ( + VLLMBufferLayerwiseGPUConnector, + VLLMPagedMemGPUConnectorV2, + VLLMPagedMemLayerwiseGPUConnector, +) +from lmcache.v1.internal_api_server.api_server import InternalAPIServer +from lmcache.v1.lookup_client import LookupClientFactory +from lmcache.v1.lookup_client.lmcache_async_lookup_client import ( + LMCacheAsyncLookupServer, +) +from lmcache.v1.offload_server.zmq_server import ZMQOffloadServer +from lmcache.v1.plugin.plugin_launcher 
def extract_request_configs(sampling_params: SamplingParams) -> dict | None:
    """Pick the per-request LMCache settings out of the sampling params.

    Reads `sampling_params.extra_args["kv_transfer_params"]` and keeps every
    entry whose key starts with "lmcache." (the prefix is kept in the key).

    Args:
        sampling_params (SamplingParams): the request's sampling params.

    Returns:
        A dict of the matching entries, or None when `extra_args` is None,
        has no "kv_transfer_params" key, that value is None, or no key
        carries the "lmcache." prefix.
    """
    extra_args = sampling_params.extra_args
    if extra_args is None or "kv_transfer_params" not in extra_args:
        return None

    kv_transfer_params = extra_args.get("kv_transfer_params")
    if kv_transfer_params is None:
        return None
    assert isinstance(kv_transfer_params, dict)

    lmcache_entries = {
        name: setting
        for name, setting in kv_transfer_params.items()
        if name.startswith("lmcache.")
    }
    # "Nothing matched" is reported as None rather than as an empty dict.
    return lmcache_entries if lmcache_entries else None
+ skip_save (bool): whether the request cache should be saved + """ + # vLLM 0.9.0 update: request.block_ids changed from list[int] to + # list[list[int]] + # Need to check the type of request.block_ids + + unfolded_block_ids = [] + + if not isinstance(new_request.block_ids[0], list): + unfolded_block_ids = new_request.block_ids.copy() + else: + # According to the vLLM code + # (https://github.com/vllm-project/vllm/blob/main/vllm/v1/core/ + # sched/scheduler.py#L943), + # only one KVCacheGroup is supported in connector for now. + unfolded_block_ids = new_request.block_ids[0].copy() + + # NOTE: Initialized in `update_state_after_alloc` + disagg_spec = tmp_disagg_tracker.pop(new_request.req_id, None) + + if new_request.sampling_params: + request_configs = extract_request_configs(new_request.sampling_params) + else: + request_configs = None + + mm_hashes, mm_positions = extract_mm_features(new_request, modify=True) + + assert new_request.prompt_token_ids is not None + return RequestTracker( + req_id=new_request.req_id, + prompt_len=len(new_request.prompt_token_ids), + token_ids=new_request.prompt_token_ids[:num_tokens_to_compute].copy(), + allocated_block_ids=unfolded_block_ids, + num_saved_tokens=lmcache_cached_tokens, + disagg_spec=disagg_spec, + mm_hashes=mm_hashes, + mm_positions=mm_positions, + skip_save=skip_save, + request_configs=request_configs, + ) + + def update( + self, + new_token_ids: list[int], + new_block_ids: tuple[list[int], ...] 
| None | list[int], + ) -> None: + """Update the request tracker when a running request is + scheduled again + """ + + self.token_ids.extend(new_token_ids) + + if new_block_ids is None: + # https://github.com/vllm-project/vllm/commit/ + # b029de9902aa3ac58806c8c17776c7074175b6db + new_block_ids = [] + elif len(new_block_ids) == 0: + new_block_ids = [] + elif isinstance(new_block_ids, tuple): + new_block_ids = new_block_ids[0] + elif isinstance(new_block_ids, list): + pass + else: + raise ValueError(f"Unsupported new_block_ids type {type(new_block_ids)}") + self.allocated_block_ids.extend(new_block_ids) + + # When a request is scheduled again, and the number of new tokens + # is 1 (excluding chunked prefill), the request is in decode phase. + if len(new_token_ids) == 1: + self.is_decode_phase = True + + +@dataclass +class ReqMeta: + # Request id + req_id: str + # Request tokens + token_ids: list[int] # torch.Tensor + # Slot mapping + slot_mapping: torch.Tensor + + # Whether is last prefill or not + is_last_prefill: bool = False + + # Skip save or not + save_spec: SaveSpec | None = None + # load_spec + load_spec: LoadSpec | None = None + # disagg spec + disagg_spec: DisaggSpec | None = None + # the configs of the request + request_configs: dict | None = None + + @staticmethod + def from_request_tracker( + tracker: RequestTracker, + block_size: int, + lmcache_chunk_size: int = 256, + load_spec: LoadSpec | None = None, + discard_partial_chunks: bool = True, + save_decode_cache: bool = False, + ) -> Optional["ReqMeta"]: + """Create the request metadata from a request tracker. + + Args: + tracker (RequestTracker): the request tracker. + block_size (int): the block size in vLLM. + lmcache_chunk_size (int): the chunk size for LMCache. + load_spec (Optional[LoadSpec]): the load spec for KV cache loading. + discard_partial_chunks (bool): whether to discard partial chunks. + save_decode_cache (bool): whether to save the cache in decode phase. 
+ + Returns: + the request metadata if we need to perform load/save + operations, None otherwise. + """ + input_token_ids = tracker.token_ids + input_token_len = len(input_token_ids) + + is_last_prefill = False + if input_token_len == tracker.prompt_len: + is_last_prefill = True + + # For save operation: do not save if the following condition is met + # 1. has already been saved before (num_saved_tokens > 0) + # 2. number of unsaved tokens is not reached the chunk boundary + # 3. if save_decode_cache is False and it is in decode phase + + skip_leading_tokens = tracker.num_saved_tokens + chunk_boundary = ( + cdiv(tracker.num_saved_tokens + 1, lmcache_chunk_size) * lmcache_chunk_size + ) + + # NOTE(vladnosiv): for disagg, you cannot skip saving, as saving is a + # trqansfer. Check if request_configs has lmcache.skip_save set to True + request_skip = (tracker.request_configs or {}).get("lmcache.skip_save", False) + + skip_save = tracker.disagg_spec is None and ( + tracker.skip_save + or (tracker.num_saved_tokens > 0 and input_token_len < chunk_boundary) + or (tracker.is_decode_phase and not save_decode_cache) + or request_skip + ) + + if skip_save and load_spec is None: + return None + + # Calculate number of tokens to save based on discard_partial_chunks + # setting + + # NOTE(vladnosiv): for the input_token_len chunk prefill, + # we are required to discard partial chunks, + # as new tokens will be added in the next iteration. 
+ num_tokens_to_save = ( + (input_token_len // lmcache_chunk_size * lmcache_chunk_size) + if not is_last_prefill or discard_partial_chunks + else input_token_len + ) + + # If we need to save, update the number of saved tokens + if not skip_save: + tracker.num_saved_tokens = num_tokens_to_save + save_spec = SaveSpec(skip_leading_tokens, not skip_save) + + # Calculate the token ids and slot mappings for load and save + token_ids = input_token_ids[:num_tokens_to_save] + + # If the request has multimodal hashes, apply them to the token ids + if tracker.mm_hashes: + token_ids_tensor = torch.tensor(token_ids) + assert tracker.mm_positions is not None, ( + "tracker got mm_hashes but no mm_positions" + ) + apply_mm_hashes_to_token_ids( + token_ids_tensor, tracker.mm_hashes, tracker.mm_positions + ) + token_ids = token_ids_tensor.tolist() + + num_blocks = len(tracker.allocated_block_ids) + + if len(token_ids) > num_blocks * block_size: + logger.error( + "The number of tokens is more than the number of blocks." + "Something might be wrong in scheduling logic!" 
def _calculate_mtp_layers(vllm_config, model_config):
    """Count the extra speculative-decoding layers added on top of the model.

    Args:
        vllm_config: the vLLM config, or None. Only `speculative_config` and
            `parallel_config` are read from it.
        model_config: the model config. Only `hf_config` is read, and only
            for the "deepseek_mtp" method.

    Returns:
        int: 0 when `vllm_config` or its `speculative_config` is None, or
        when the method is neither "deepseek_mtp" nor EAGLE. For
        "deepseek_mtp", `hf_config.num_nextn_predict_layers` (0 when the
        attribute is missing). For EAGLE, the draft model's layer count, or
        1 when that count cannot be obtained.
    """
    if vllm_config is None or vllm_config.speculative_config is None:
        return 0

    speculative_config = vllm_config.speculative_config
    logger.info("vllm_config.speculative_config: %s", speculative_config)

    # TODO(baoloongmao): Support other MTP methods
    if speculative_config.method == "deepseek_mtp":
        return getattr(model_config.hf_config, "num_nextn_predict_layers", 0)

    if not speculative_config.use_eagle():
        return 0

    try:
        num_mtp_layers = speculative_config.draft_model_config.get_num_layers(
            vllm_config.parallel_config
        )
    except Exception:
        # Deliberate best effort: any failure to inspect the draft model
        # falls back to a single extra layer. The failure is logged with its
        # traceback so the fallback is visible. The two literals are now
        # joined by ", " (they used to run together as "layersfalling").
        logger.warning(
            "EAGLE detected, but failed to get the number of extra layers, "
            "falling back to 1",
            exc_info=True,
        )
        return 1

    logger.info("EAGLE detected %d extra layer(s)", num_mtp_layers)
    return num_mtp_layers
+ num_gpus = torch.cuda.device_count() + local_rank = parallel_config.rank % num_gpus + torch.cuda.set_device(local_rank) + device = torch.device(f"cuda:{local_rank}") + metadata = LMCacheEngineMetadata( + model_config.model, + parallel_config.world_size, + parallel_config.rank, + "vllm", + kv_dtype, + kv_shape, + use_mla, + ) + + use_gpu = need_gpu_interm_buffer(lmcache_config) + vllm_gpu_connector: ( + VLLMBufferLayerwiseGPUConnector + | VLLMPagedMemGPUConnectorV2 + | VLLMPagedMemLayerwiseGPUConnector + ) + + if use_mla and lmcache_config.use_layerwise: + raise ValueError("layerwise MLA connector is not supported yet") + + # When use_mla is True, num_kv_head is 1 + hidden_dim_size = num_kv_head * head_size + if lmcache_config.use_layerwise: + if lmcache_config.enable_blending: + # Use layerwise connector for blending + vllm_gpu_connector = VLLMBufferLayerwiseGPUConnector( + hidden_dim_size, + num_layer, + use_gpu=use_gpu, + chunk_size=chunk_size, + dtype=kv_dtype, + device=device, + ) + else: + vllm_gpu_connector = VLLMPagedMemLayerwiseGPUConnector( + hidden_dim_size, + num_layer, + use_gpu=use_gpu, + chunk_size=chunk_size, + dtype=kv_dtype, + device=device, + ) + else: + vllm_gpu_connector = VLLMPagedMemGPUConnectorV2( + hidden_dim_size, + num_layer, + use_gpu=use_gpu, + chunk_size=chunk_size, + dtype=kv_dtype, + device=device, + use_mla=use_mla, + ) + tpg = get_tp_group() + engine = LMCacheEngineBuilder.get_or_create( + ENGINE_NAME, + lmcache_config, + metadata, + vllm_gpu_connector, + tpg.broadcast, + tpg.broadcast_object, + ) + + return engine + + +@dataclass +class LMCacheConnectorMetadata(KVConnectorMetadata): + requests: list[ReqMeta] = field(default_factory=list) + lookup_requests_in_step: list[str] = field(default_factory=list) + + @_lmcache_nvtx_annotate + def add_request(self, req_meta: ReqMeta) -> None: + """Add a request to the metadata. + + Args: + req_meta (ReqMeta): the request metadata. 
def __init__(
    self,
    vllm_config: "VllmConfig",
    role: KVConnectorRole,
    parent: KVConnectorBase_V1,
):
    """Set up the LMCache connector for one vLLM process.

    Args:
        vllm_config: the vLLM config; `kv_transfer_config` must not be None.
        role: `KVConnectorRole.SCHEDULER` creates a lookup client and no
            engine; any other role initializes the LMCache engine, the
            lookup server and the offload server.
        parent: the owning connector, used later to fetch the per-step
            connector metadata.

    Environment:
        LMCACHE_FORCE_SKIP_SAVE: sets `self.force_skip_save`. Unset, empty,
        "0", "false", "no" and "off" (case-insensitive) mean False; any
        other value means True.
    """
    assert vllm_config.kv_transfer_config is not None
    self._parent = parent
    self._vllm_config = vllm_config
    self.kv_role = vllm_config.kv_transfer_config.kv_role
    self.worker_count = vllm_config.parallel_config.tensor_parallel_size
    config = lmcache_get_or_create_config()
    assert isinstance(config, LMCacheEngineConfig), (
        "LMCache v1 configuration is should be passed for vLLM v1."
    )
    # Copy every "lmcache."-prefixed entry of vLLM's extra config into the
    # LMCache config (without the prefix).
    kv_connector_extra_config = (
        vllm_config.kv_transfer_config.kv_connector_extra_config
    )
    if kv_connector_extra_config:
        for key, value in kv_connector_extra_config.items():
            if not key.startswith("lmcache."):
                continue
            config_key = key.removeprefix("lmcache.")
            if _validate_and_set_config_value(config, config_key, value):
                logger.info(
                    "Updated config %s from vLLM extra config: %s",
                    config_key,
                    value,
                )

    self.config = config

    self.async_loading = config.enable_async_loading
    self.layerwise_retrievers: list[Generator[torch.Tensor | None, None, None]] = []
    self._stats_monitor = LMCStatsMonitor.GetOrCreate()
    if role == KVConnectorRole.SCHEDULER:
        # Create lookup client using factory
        self.lookup_client = LookupClientFactory.create_lookup_client(
            vllm_config, config
        )
        self._unfinished_requests: dict[str, Request] = {}
        self._lookup_requests_in_step: list[str] = []
        self.lmcache_engine = None
    else:
        self.lmcache_engine = _init_lmcache_engine(
            config,
            vllm_config,
        )
        # Checked before the engine is first dereferenced below.
        assert self.lmcache_engine is not None

        self.use_layerwise = config.use_layerwise
        self.enable_blending = config.enable_blending

        if self.enable_blending:
            self.blender = LMCBlenderBuilder.get_or_create(
                ENGINE_NAME,
                self.lmcache_engine,
                self.lmcache_engine.gpu_connector,
                config,
            )

        # Create lookup server using factory
        self.lookup_server = LookupClientFactory.create_lookup_server(
            self.lmcache_engine, vllm_config
        )

        self.offload_server = ZMQOffloadServer(
            self.lmcache_engine,
            vllm_config,
            get_tensor_model_parallel_rank(),
        )

        # In case of MLA, the lookup server is only created on worker 0
        if self.async_loading and self.lookup_server is not None:
            assert isinstance(self.lookup_server, LMCacheAsyncLookupServer)
            self.lmcache_engine.post_init(async_lookup_server=self.lookup_server)

    self.kv_caches: dict[str, torch.Tensor] = {}

    self._block_size = vllm_config.cache_config.block_size

    # request_id -> (vllm cached tokens, lmcache cached tokens)
    self.load_specs: dict[str, LoadSpec] = {}

    self.kv_cache_manager: KVCacheManager | None = None

    # request_id -> full_token_ids
    self._request_trackers: dict[str, RequestTracker] = {}

    # Whether to discard partial chunks
    self._discard_partial_chunks = (
        vllm_config.kv_transfer_config.get_from_extra_config(
            "discard_partial_chunks", False
        )
        or not config.save_unfull_chunk
    )

    self._lmcache_chunk_size = config.chunk_size
    self._save_decode_cache = config.save_decode_cache

    self.skip_last_n_tokens = vllm_config.kv_transfer_config.get_from_extra_config(
        "skip_last_n_tokens", 0
    )

    self.num_layers = vllm_config.model_config.get_num_layers(
        vllm_config.parallel_config
    )
    self.current_layer = 0

    # Environment values are strings, so bool() of "0" or "false" would be
    # True. Only explicit "off" spellings disable the flag; every other
    # non-empty value still enables it, as before.
    force_skip_save_env = os.environ.get("LMCACHE_FORCE_SKIP_SAVE", "")
    self.force_skip_save = force_skip_save_env.strip().lower() not in {
        "",
        "0",
        "false",
        "no",
        "off",
    }

    self._requests_priority: dict[str, int] = {}

    # TODO(baoloongmao): Internal api server & plugin framework support
    # dp > 1
    if (
        vllm_config.parallel_config.data_parallel_size_local == 1
        or vllm_config.parallel_config.data_parallel_rank_local == 0
    ):
        # Start internal API server if enabled
        # The enabled check is in the InternalAPIServer constructor
        self.api_server = InternalAPIServer(self)
        self.api_server.start()
        # Launch plugins
        self.plugin_launcher = PluginLauncher(
            self.config,
            role,
            self.worker_count,
            -1
            if self.lmcache_engine is None  # scheduler side
            else self.lmcache_engine.metadata.worker_id,
        )
        self.plugin_launcher.launch_plugins()
    else:
        self.api_server = None  # type: ignore[assignment]
        self.plugin_launcher = None  # type: ignore[assignment]
    logger.info(
        "LMCache initialized for role %s with version %s, "
        "vllm version %s, lmcache cache_engine metadata: %s",
        role,
        utils.get_version(),
        VLLM_VERSION,
        getattr(self.lmcache_engine, "metadata", None),
    )
getattr( + vllm_config.cache_config, "gpu_memory_utilization", None + ), + }, + } + + return inference_info + + def get_inference_version(self) -> str: + """Get vLLM version information. + + Returns: + str: vLLM version string + """ + return VLLM_VERSION + + @_lmcache_nvtx_annotate + def _init_kv_caches_from_forward_context(self, forward_context: "ForwardContext"): + for layer_name in forward_context.no_compile_layers: + attn_layer = forward_context.no_compile_layers[layer_name] + if not hasattr(attn_layer, "kv_cache"): + logger.debug("The layer %s does not have kv_cache, skip it", layer_name) + continue + + if layer_name not in self.kv_caches: + self.kv_caches[layer_name] = attn_layer.kv_cache[ + forward_context.virtual_engine + ] + + #################### + # Worker side APIs + #################### + + @_lmcache_nvtx_annotate + def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None: + """Start loading the KV cache from the connector buffer to vLLM's + paged KV buffer. + + Args: + forward_context (ForwardContext): the forward context. + + Note: + The number of elements in kv_caches and layer_names should be + the same. 
+ """ + self.current_layer = 0 + + if len(self.kv_caches) == 0: + self._init_kv_caches_from_forward_context(forward_context) + + metadata = self._parent._get_connector_metadata() + assert isinstance(metadata, LMCacheConnectorMetadata) + + assert len(self.kv_caches) > 0 + kvcaches = list(self.kv_caches.values()) + + attn_metadata = forward_context.attn_metadata + if attn_metadata is None: + logger.debug("In connector.start_load_kv, but the attn_metadata is None") + return + + assert self.lmcache_engine is not None + + self.lmcache_engine.post_init(kvcaches=kvcaches) + + self.layerwise_retrievers = [] + + for idx, request in enumerate(metadata.requests): + if request.load_spec is None: + continue + last_idx = idx + + for idx, request in enumerate(metadata.requests): + if request.load_spec is None: + continue + + tokens = request.token_ids + # TODO: have a pre-allocated buffer to hold the slot_mappings + slot_mapping = request.slot_mapping.cuda() + assert len(tokens) == len(slot_mapping) + + self._stats_monitor.update_interval_vllm_hit_tokens( + request.load_spec.vllm_cached_tokens + ) + token_mask = torch.ones(len(tokens), dtype=torch.bool) + masked_token_count = ( + request.load_spec.vllm_cached_tokens + // self._lmcache_chunk_size + * self._lmcache_chunk_size + ) + token_mask[:masked_token_count] = False + + lmcache_cached_tokens = request.load_spec.lmcache_cached_tokens + if self.use_layerwise: + sync = idx == last_idx + # NOTE(Jiayi): Perform blending before layerwise prefix caching + if self.enable_blending: + # TODO(Jiayi): Need to make prefix caching and blending + # compatible + self.blender.blend( + tokens[:lmcache_cached_tokens], + token_mask[:lmcache_cached_tokens], + kvcaches=kvcaches, + slot_mapping=slot_mapping[:lmcache_cached_tokens], + ) + else: + layerwise_retriever = self.lmcache_engine.retrieve_layer( + tokens[:lmcache_cached_tokens], + token_mask[:lmcache_cached_tokens], + kvcaches=kvcaches, + slot_mapping=slot_mapping[:lmcache_cached_tokens], + 
@_lmcache_nvtx_annotate
def wait_for_layer_load(self, layer_name: str) -> None:
    """Advance every pending layerwise retriever by one step.

    Intended for layer-by-layer pipelining: each call pulls the next item
    from every generator in `self.layerwise_retrievers`.

    Args:
        layer_name: the name of the layer (not read by this implementation;
            the log line uses `self.current_layer` instead).
    """
    retrievers = self.layerwise_retrievers
    if retrievers:
        logger.debug("Waiting for layer %s to be loaded", self.current_layer)

    for retriever in retrievers:
        ret_token_mask = next(retriever)
        if self.current_layer != self.num_layers - 1:
            continue
        # On the last layer the retriever must yield the final token mask.
        assert ret_token_mask is not None
        retrieved_count = ret_token_mask.sum().item()
        logger.info("Retrieved %s tokens", retrieved_count)
+ kv_layer (torch.Tensor): the paged KV buffer of the current + layer in vLLM. + attn_metadata (AttentionMetadata): the attention metadata. + """ + assert self.lmcache_engine is not None + + if not self.use_layerwise: + return + + if self.kv_role == "kv_consumer": + # Don't do save if the role is kv_consumer + return + if self._parent._connector_metadata is None: + logger.warning( + "In connector.save_kv_layer, but the connector metadata is None" + ) + return + connector_metadata = self._parent._get_connector_metadata() + assert isinstance(connector_metadata, LMCacheConnectorMetadata) + + assert len(self.kv_caches) > 0 + + kvcaches = list(self.kv_caches.values()) + if self.current_layer == 0: + self.layerwise_storers = [] + + is_first = True + + for idx, request in enumerate(connector_metadata.requests): + save_spec = request.save_spec + if save_spec is None or not save_spec.can_save: + continue + + token_ids = request.token_ids + assert isinstance(token_ids, list) + + slot_mapping = request.slot_mapping + assert isinstance(slot_mapping, torch.Tensor) + assert len(slot_mapping) == len(token_ids) + + # TODO: have a pre-allocated buffer to hold the slot_mappings + slot_mapping = slot_mapping.cuda() + + if self.kv_role == "kv_producer": + skip_leading_tokens = 0 + else: + skip_leading_tokens = save_spec.skip_leading_tokens + + if skip_leading_tokens == len(token_ids): + continue # skip this request + # Align to lmcache chunk size + skip_leading_tokens = ( + skip_leading_tokens + // self._lmcache_chunk_size + * self._lmcache_chunk_size + ) + + store_mask = torch.ones(len(token_ids), dtype=torch.bool) + store_mask[:skip_leading_tokens] = False + + logger.info( + "Storing KV cache for %d out of %d tokens " + "(skip_leading_tokens=%d) for request %s", + len(token_ids) - skip_leading_tokens, + len(token_ids), + skip_leading_tokens, + request.req_id, + ) + + # TODO (Jiayi): need to make layerwise storing + # compatible with disagg spec + layerwise_storer = 
@_lmcache_nvtx_annotate
def wait_for_save(self):
    """Block until this step's KV cache has been handed to LMCache.

    Steps, in order:
      1. Unpin the lookups recorded in the connector metadata.
      2. Return immediately for the "kv_consumer" role (it never saves).
      3. In layerwise mode, advance every layerwise storer once and return.
      4. Otherwise store each request's KV cache through the engine, then
         record how many tokens were saved / transferred.

    Raises:
        AssertionError: if the connector metadata has the wrong type, the
            engine is missing, no KV caches are registered, or a request
            that must be saved lacks a save spec / disagg spec.
    """
    connector_metadata = self._parent._get_connector_metadata()
    assert isinstance(connector_metadata, LMCacheConnectorMetadata)

    # Check the engine before its first use instead of after it.
    assert self.lmcache_engine is not None
    self.lmcache_engine.lookup_unpin(connector_metadata.lookup_requests_in_step)

    if self.kv_role == "kv_consumer":
        # Don't do save if the role is kv_consumer
        return

    if self.use_layerwise:
        # `layerwise_storers` is only created by `save_kv_layer`; treat a
        # missing attribute as "nothing to flush" rather than failing.
        for layerwise_storer in getattr(self, "layerwise_storers", ()):
            next(layerwise_storer)
        return

    assert len(self.kv_caches) > 0
    kvcaches = list(self.kv_caches.values())
    chunk_size = self._lmcache_chunk_size

    for request in connector_metadata.requests:
        save_spec = request.save_spec
        if (
            save_spec is None or not save_spec.can_save
        ) and self.kv_role != "kv_producer":
            continue

        token_ids = request.token_ids

        slot_mapping = request.slot_mapping
        assert isinstance(slot_mapping, torch.Tensor)
        assert len(slot_mapping) == len(token_ids)
        assert save_spec is not None

        # TODO: have a pre-allocated buffer to hold the slot_mappings
        slot_mapping = slot_mapping.cuda()

        skip_leading_tokens = save_spec.skip_leading_tokens
        if self.kv_role == "kv_producer":
            assert request.disagg_spec is not None
            skip_leading_tokens = min(
                skip_leading_tokens, request.disagg_spec.num_transferred_tokens
            )

        if skip_leading_tokens == len(token_ids):
            continue  # skip this request
        # Align to lmcache chunk size
        skip_leading_tokens = skip_leading_tokens // chunk_size * chunk_size

        store_mask = torch.ones(len(token_ids), dtype=torch.bool)
        store_mask[:skip_leading_tokens] = False

        logger.info(
            "Storing KV cache for %d out of %d tokens "
            "(skip_leading_tokens=%d) for request %s",
            len(token_ids) - skip_leading_tokens,
            len(token_ids),
            skip_leading_tokens,
            request.req_id,
        )

        if request.is_last_prefill:
            if request.disagg_spec:
                request.disagg_spec.is_last_prefill = True
        else:
            # Not the last prefill: keep only whole chunks.
            aligned_token_len = len(token_ids) // chunk_size * chunk_size
            token_ids = token_ids[:aligned_token_len]
            store_mask = store_mask[:aligned_token_len]
            slot_mapping = slot_mapping[:aligned_token_len]

        self.lmcache_engine.store(
            token_ids,
            mask=store_mask,
            kvcaches=kvcaches,
            slot_mapping=slot_mapping,
            offset=skip_leading_tokens,
            transfer_spec=request.disagg_spec,
            request_configs=request.request_configs,
        )

        # NOTE(Jiayi): We assume all tokens are saved
        save_spec.skip_leading_tokens = len(token_ids)
        if request.disagg_spec:
            request.disagg_spec.num_transferred_tokens = len(token_ids)
+ """ + if self.kv_role == "kv_producer" and not hasattr( + self.lookup_client, "supports_producer_reuse" + ): + return 0 + + self._requests_priority[request.request_id] = request.priority + + token_ids = request.prompt_token_ids + + # If the request has multimodal hashes, apply them to the token ids + mm_hashes, mm_positions = extract_mm_features(request) + if mm_hashes and mm_positions: + # TODO(Jiayi): Optimize this + token_ids_tensor = torch.tensor(request.prompt_token_ids) + apply_mm_hashes_to_token_ids(token_ids_tensor, mm_hashes, mm_positions) + token_ids = token_ids_tensor.tolist() + + if request.sampling_params: + request_configs = extract_request_configs(request.sampling_params) + else: + request_configs = None + + if self.skip_last_n_tokens > 0: + assert token_ids is not None + token_ids = token_ids[: -self.skip_last_n_tokens] + lookup_id = request.request_id if self.async_loading else str(uuid.uuid4()) + + self._lookup_requests_in_step.append(lookup_id) + + num_external_hit_tokens = self.lookup_client.lookup( + token_ids, + lookup_id=lookup_id, + request_configs=request_configs, + ) + + if num_external_hit_tokens is None: + logger.info( + "Reqid: %s, Total tokens %d, LMCache hit tokens: None.", + request.request_id, + request.num_tokens, + ) + return None + + # When prompt length is divisible by the block size and all + # blocks are cached, we need to recompute the last token. + # This will be removed in the future if vLLM's scheduler provides + # a better support for this case. 
+ need_to_allocate = num_external_hit_tokens - num_computed_tokens + + # In, full-prompt-hit case, we need to recompute the last token + if num_external_hit_tokens == request.num_tokens: + need_to_allocate -= 1 + + logger.info( + "Reqid: %s, Total tokens %d, LMCache hit tokens: %d, need to load: %d", + request.request_id, + request.num_tokens, + num_external_hit_tokens, + need_to_allocate, + ) + + self.load_specs[request.request_id] = LoadSpec( + vllm_cached_tokens=num_computed_tokens, + lmcache_cached_tokens=num_external_hit_tokens, + can_load=False, + ) + + if need_to_allocate <= 0: + return 0 + + return need_to_allocate + + @_lmcache_nvtx_annotate + def update_state_after_alloc(self, request: "Request", num_external_tokens: int): + """ + Update KVConnector state after temporary buffer alloc. + + For SharedStorageConnector, update _request_needs_load + if the CacheManager this allocated blocks for us. + """ + + # Clear local status in lookup client when a new request is + # successfully scheduled. 
@_lmcache_nvtx_annotate
def build_connector_meta(
    self, scheduler_output: SchedulerOutput
) -> KVConnectorMetadata:
    """Attach the connector metadata to the request object.

    This function should NOT modify other fields in the scheduler_output
    except the `kv_connector_metadata` field.
    Also, calling this function will reset the state of the connector.

    Concretely, this method:
      * hands the lookup ids collected during this step over to the
        metadata and starts a fresh list;
      * drops all per-request state of finished requests;
      * creates a ``RequestTracker`` for every newly scheduled request and
        updates the tracker of every cached (already running) request;
      * adds a ``ReqMeta`` to the metadata for each tracker that yields one.

    Args:
        scheduler_output (SchedulerOutput): the scheduler output object.

    Returns:
        The ``LMCacheConnectorMetadata`` built for this step.

    Raises:
        ValueError: if a cached request is scheduled but is not present in
            ``_unfinished_requests``.
    """

    force_skip_save = self.kv_role == "kv_consumer" or self.force_skip_save

    meta = LMCacheConnectorMetadata()

    # set and update lookup requests for unpin
    meta.lookup_requests_in_step = self._lookup_requests_in_step
    self._lookup_requests_in_step = []

    for finished_req_id in scheduler_output.finished_req_ids:
        self._request_trackers.pop(finished_req_id, None)
        self._unfinished_requests.pop(finished_req_id, None)
        # A request can finish (e.g. be aborted) after its lookup but
        # before it is ever scheduled; without these pops its priority and
        # load spec would stay in the dicts forever.
        self._requests_priority.pop(finished_req_id, None)
        self.load_specs.pop(finished_req_id, None)

    for request in scheduler_output.scheduled_new_reqs:
        # Right now, we only load KV for new requests
        load_spec = self.load_specs.pop(request.req_id, None)
        num_tokens_to_compute = (
            request.num_computed_tokens
            + scheduler_output.num_scheduled_tokens[request.req_id]
        )
        lmcache_cached_tokens = (
            load_spec.lmcache_cached_tokens if load_spec is not None else 0
        )
        request_priority = self._requests_priority.pop(request.req_id, 0)

        # Saving is skipped globally (consumer role / forced) or when the
        # request priority exceeds the configured limit.
        skip_save = force_skip_save or (
            self.config.priority_limit is not None
            and request_priority > self.config.priority_limit
        )

        request_tracker = RequestTracker.from_new_request(
            self.config,
            request,
            num_tokens_to_compute,
            lmcache_cached_tokens,
            skip_save,
        )
        self._request_trackers[request.req_id] = request_tracker

        req_meta = ReqMeta.from_request_tracker(
            request_tracker,
            self._block_size,
            self._lmcache_chunk_size,
            load_spec=load_spec,
            discard_partial_chunks=self._discard_partial_chunks,
            save_decode_cache=self._save_decode_cache,
        )
        if req_meta is not None:
            meta.add_request(req_meta)

    cached_reqs = scheduler_output.scheduled_cached_reqs

    # NOTE: For backward compatibility with vllm version < 0.9.2,
    # In the latest vllm version, the type of scheduled_cached_reqs has
    # changed from list to object `CachedRequestData`
    if isinstance(cached_reqs, list):
        for req in cached_reqs:
            request_tracker = self._request_trackers[req.req_id]
            request_tracker.update(req.new_token_ids, req.new_block_ids)

            req_meta = ReqMeta.from_request_tracker(
                request_tracker,
                self._block_size,
                self._lmcache_chunk_size,
                load_spec=None,
                discard_partial_chunks=self._discard_partial_chunks,
                # Keep the legacy path consistent with the other call
                # sites, which all honour the configured setting.
                save_decode_cache=self._save_decode_cache,
            )
            if req_meta is not None:
                meta.add_request(req_meta)
        return meta

    for i, req_id in enumerate(cached_reqs.req_ids):
        request_tracker = self._request_trackers[req_id]
        num_new_tokens = scheduler_output.num_scheduled_tokens[req_id]

        cached_request = self._unfinished_requests.get(req_id)
        if not cached_request:
            raise ValueError(
                f"Request {req_id} is not in _unfinished_requests, "
                f"but it is scheduled to be cached"
            )

        # The tokens scheduled in this step are the ones right after what
        # the tracker has already seen.
        num_current_tokens = len(request_tracker.token_ids)
        new_token_ids = cached_request.all_token_ids[
            num_current_tokens : num_current_tokens + num_new_tokens
        ]
        new_block_ids = cached_reqs.new_block_ids[i]

        request_tracker.update(new_token_ids, new_block_ids)

        req_meta = ReqMeta.from_request_tracker(
            request_tracker,
            self._block_size,
            self._lmcache_chunk_size,
            load_spec=None,
            discard_partial_chunks=self._discard_partial_chunks,
            save_decode_cache=self._save_decode_cache,
        )
        if req_meta is not None:
            meta.add_request(req_meta)

    return meta
index 0000000..55831dc --- /dev/null +++ b/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py @@ -0,0 +1,867 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import enum +from collections.abc import Iterable +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any, Literal, Optional, cast + +import torch +import zmq +from lmcache.utils import init_logger as lmcache_init_logger + +from vllm.config import VllmConfig +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorBase_V1, + KVConnectorMetadata, + KVConnectorRole, +) +from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_integration import ( + LMCacheMPSchedulerAdapter, + LMCacheMPWorkerAdapter, + LoadStoreOp, +) +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.outputs import KVConnectorOutput +from vllm.v1.utils import ConstantList + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionMetadata + from vllm.config import VllmConfig + from vllm.distributed.kv_events import KVCacheEvent + from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorPromMetrics, + KVConnectorStats, + PromMetric, + PromMetricT, + ) + from vllm.forward_context import ForwardContext + from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.core.kv_cache_utils import BlockHash + from vllm.v1.kv_cache_interface import KVCacheConfig + from vllm.v1.request import Request + +logger = lmcache_init_logger(__name__) + + +# Helper functions +def reformat_block_ids(block_ids: tuple[list[int], ...] | None) -> list[int]: + if block_ids is None: + return [] + assert isinstance(block_ids, tuple), ( + f"Expected block_ids to be a tuple of lists, but got {type(block_ids)}" + ) + + if len(block_ids) > 1: + raise RuntimeError( + "LMCacheMPConnector only works without hybrid kv cache manager. 
" + "Please pass --disable-hybrid-kv-cache-manager when starting vllm" + ) + + return block_ids[0] + + +def create_scheduler_adapter( + server_url: str, zmq_context: zmq.Context, vllm_config: VllmConfig +) -> LMCacheMPSchedulerAdapter: + # TODO: have a helper function to calculate the correct rank and + # world size for the MLA and other models + return LMCacheMPSchedulerAdapter( + server_url, + zmq_context, + vllm_config.model_config.model, + vllm_config.parallel_config.world_size, + vllm_config.parallel_config.rank, + vllm_config.cache_config.block_size, + ) + + +def create_worker_adapter( + server_url: str, zmq_context: zmq.Context, vllm_config: VllmConfig +) -> LMCacheMPWorkerAdapter: + # TODO: have a helper function to calculate the correct rank and + # world size for the MLA and other models + return LMCacheMPWorkerAdapter( + server_url, + zmq_context, + vllm_config.model_config.model, + vllm_config.parallel_config.world_size, + vllm_config.parallel_config.rank, + vllm_config.cache_config.block_size, + ) + + +def convert_block_hashes_to_bytes( + block_hashes: list["BlockHash"], +) -> list[bytes]: + return cast(list[bytes], block_hashes) + + +class LMCacheMPRequestState(enum.Enum): + """ + State machine: + PREFETCHING -- update_state_after_alloc --> WAITING_FOR_LOAD + WAITING_FOR_LOAD -- process_loading_requests --> READY + """ + + PREFETCHING = enum.auto() + WAITING_FOR_LOAD = enum.auto() + READY = enum.auto() + + +@dataclass +class LMCacheMPRequestTracker: + # NOTE: this class used vLLM data structures, should be part of + # vLLM integration code + + request_id: str + + # Read-only lists to track the token ids and block hashes + all_token_ids: ConstantList[int] + block_hashes: ConstantList["BlockHash"] + + # Block ids and hashes will be updated at update_states_after_alloc and + # during the generation + allocated_block_ids: list[int] = field(default_factory=list) + + # Number of scheduled tokens in this request. 
We keep tracking this to + # avoid saving half-full blocks. + num_scheduled_tokens: int = 0 + + # Number of blocks stored will be initialized when lookup the external + # hit tokens and will be updated when processing new requests and cached + # requests. + num_stored_blocks: int = 0 + + # Staging load operation -- save vllm and lmcache hit tokens during lookup + num_vllm_hit_blocks: int = 0 + num_lmcache_hit_blocks: int = 0 + + # Main state + state: LMCacheMPRequestState = LMCacheMPRequestState.PREFETCHING + + def __init__(self, request: "Request"): + self.request_id = request.request_id + self.all_token_ids = request.all_token_ids + self.block_hashes = ConstantList(request.block_hashes) + self.allocated_block_ids = [] + self.num_stored_blocks = 0 + self.num_vllm_hit_blocks = 0 + self.num_lmcache_hit_blocks = 0 + self.state = LMCacheMPRequestState.PREFETCHING + + #### + # Check the state of the request + #### + def needs_retrieve(self) -> bool: + """Check whether the current request needs retrieve, will be used + update_stage_after_alloc""" + return ( + self.num_lmcache_hit_blocks > self.num_vllm_hit_blocks + and self.state != LMCacheMPRequestState.READY + ) + + def is_ready_for_retrieving(self) -> bool: + """Check whether the current request is ready for retrieving, + will be used in process_loading_requests""" + return ( + self.state == LMCacheMPRequestState.WAITING_FOR_LOAD + and self.needs_retrieve() + ) + + #### + # Update internal states + #### + def increase_num_scheduled_tokens(self, num_new_tokens: int): + self.num_scheduled_tokens += num_new_tokens + + def increase_num_stored_blocks(self, num_new_blocks: int): + """Increase the number of stored blocks for the current request + This function will be called when processing the cached requests. 
@dataclass
class LMCacheMPRequestMetadata:
    """One store or retrieve operation for a single request.

    ``op`` pairs the block hashes with the vLLM block ids they refer to.
    """

    request_id: str
    direction: Literal["STORE", "RETRIEVE"]
    op: LoadStoreOp

    @staticmethod
    def GetStoreMetadata(
        tracker: LMCacheMPRequestTracker,
        blocks_in_chunk: int,
        vllm_block_size: int,
    ) -> "LMCacheMPRequestMetadata | None":
        """
        Build the STORE metadata for the blocks that became storable.

        On success the tracker's stored-block counter is advanced by the
        number of blocks covered by the returned metadata.

        Args:
            tracker: The request tracker to generate the metadata from.
            blocks_in_chunk: the number of blocks in a LMCache data chunk
            vllm_block_size: the number of tokens in a vLLM block, used to
                convert the scheduled token count into full blocks

        Returns:
            The store metadata, or None when less than one full chunk of
            new blocks is available.
        """
        # Only blocks that have a hash, have been allocated and are fully
        # covered by scheduled tokens can be stored.
        # NOTE: the invariant here is that `num_stored_blocks` should
        # always be a multiple of `blocks_in_chunk`
        # TODO: This should be checked everytime we update the num_stored_blocks
        storable_blocks = min(
            len(tracker.block_hashes),
            len(tracker.allocated_block_ids),
            tracker.num_scheduled_tokens // vllm_block_size,
        )
        already_stored = tracker.num_stored_blocks
        full_chunks = (storable_blocks - already_stored) // blocks_in_chunk
        if full_chunks < 1:
            return None

        first = already_stored
        last = first + full_chunks * blocks_in_chunk
        hashes = convert_block_hashes_to_bytes(tracker.block_hashes[first:last])
        ids = tracker.allocated_block_ids[first:last]

        store_meta = LMCacheMPRequestMetadata(
            request_id=tracker.request_id,
            direction="STORE",
            op=LoadStoreOp(block_hashes=hashes, block_ids=ids),
        )

        # Update the request tracker
        tracker.increase_num_stored_blocks(last - first)
        return store_meta

    @staticmethod
    def GetRetrieveMetadata(
        tracker: LMCacheMPRequestTracker,
        blocks_in_chunk: int,
    ) -> "LMCacheMPRequestMetadata | None":
        """
        Build the RETRIEVE metadata for the blocks LMCache has but vLLM
        does not.

        Args:
            tracker: The request tracker to generate the metadata from.
            blocks_in_chunk: the number of blocks in a LMCache data chunk

        Returns:
            The retrieve metadata, or None when the tracker is not ready
            for retrieving or there is nothing to retrieve.
        """
        if not tracker.is_ready_for_retrieving():
            return None

        # |---------------------|-----------------|----------------|
        # | num_vllm_hit_blocks |
        # | lmcache chunk 1     | lmcache chunk 2 |
        # |     need to retrieve                  |
        #
        # The start is rounded down to a chunk boundary, so a chunk that is
        # only partially covered by the vLLM hit is retrieved in full.
        first = (tracker.num_vllm_hit_blocks // blocks_in_chunk) * blocks_in_chunk
        last = tracker.num_lmcache_hit_blocks
        assert last % blocks_in_chunk == 0, (
            "The number of LMCache hit blocks should be a multiple of the "
            "number of blocks in a lmcache chunk. "
        )
        assert len(tracker.block_hashes) >= last, (
            "The number of block hashes should be greater than or equal to the "
            "number of LMCache hit blocks. "
        )
        if last <= first:
            return None

        hashes = convert_block_hashes_to_bytes(tracker.block_hashes[first:last])
        ids = tracker.allocated_block_ids[first:last]
        return LMCacheMPRequestMetadata(
            request_id=tracker.request_id,
            direction="RETRIEVE",
            op=LoadStoreOp(block_hashes=hashes, block_ids=ids),
        )
def __init__(
    self,
    vllm_config: "VllmConfig",
    role: KVConnectorRole,
    kv_cache_config: Optional["KVCacheConfig"] = None,
):
    """Create the scheduler-side or worker-side adapter for this role.

    The LMCache server address is read from the extra config keys
    ``lmcache.mp.host`` (default ``tcp://localhost``) and
    ``lmcache.mp.port`` (default ``5555``).

    Args:
        vllm_config: the vLLM config; its ``kv_transfer_config`` must be
            set.
        role: whether this instance lives in the scheduler or a worker.
        kv_cache_config: forwarded unchanged to the base class.

    Raises:
        ValueError: if ``role`` is neither SCHEDULER nor WORKER.
    """
    super().__init__(vllm_config, role, kv_cache_config)

    transfer_config = vllm_config.kv_transfer_config
    assert transfer_config is not None

    host = transfer_config.get_from_extra_config("lmcache.mp.host", "tcp://localhost")
    port = transfer_config.get_from_extra_config("lmcache.mp.port", 5555)
    server_url = f"{host}:{port}"

    # Use the process-wide ZMQ context instead of creating a new one.
    zmq_context = zmq.Context.instance()

    if self.role == KVConnectorRole.SCHEDULER:
        self.scheduler_adapter = create_scheduler_adapter(
            server_url, zmq_context, vllm_config
        )
        # Per-request bookkeeping only exists on the scheduler side.
        self.request_trackers: dict[str, LMCacheMPRequestTracker] = {}
    elif self.role == KVConnectorRole.WORKER:
        self.worker_adapter = create_worker_adapter(
            server_url, zmq_context, vllm_config
        )
    else:
        raise ValueError(f"Unknown KVConnectorRole: {self.role}")

    self.vllm_block_size = vllm_config.cache_config.block_size
def start_load_kv(self, forward_context: "ForwardContext", **kwargs: Any) -> None:
    """
    Start loading the KV cache from the connector to vLLM's paged
    KV buffer. This is called from the forward context before the
    forward pass to enable async loading during model execution.

    All RETRIEVE operations in the current connector metadata are
    submitted to the worker adapter in one batch, together with a CUDA
    event recorded on the current stream. When there is nothing to
    retrieve, the method returns without touching CUDA.

    Args:
        forward_context (ForwardContext): the forward context.
        **kwargs: additional arguments for the load operation

    Note:
        The number of elements in kv_caches and layer_names should be
        the same.

    """
    metadata = self._get_connector_metadata()
    assert isinstance(metadata, LMCacheMPConnectorMetadata)

    retrieves = [meta for meta in metadata.requests if meta.direction == "RETRIEVE"]
    if not retrieves:
        # Nothing to load this step: skip creating an interprocess event.
        return

    request_ids = [meta.request_id for meta in retrieves]
    ops = [meta.op for meta in retrieves]

    # Record an interprocess event on the current stream and hand it to the
    # adapter. NOTE(review): presumably the receiving side waits on it
    # before writing into the KV buffers; confirm against the adapter.
    with torch.cuda.stream(torch.cuda.current_stream()):
        event = torch.cuda.Event(interprocess=True)
        event.record()

    logger.debug("Submitting batched retrieve requests: %s", request_ids)
    self.worker_adapter.batched_submit_retrieve_requests(request_ids, ops, event)
def wait_for_save(self):
    """
    Submit this step's store operations to the worker adapter.

    This is called as the forward context exits. All STORE operations in
    the current connector metadata are submitted in one batch, together
    with a CUDA event recorded on the current stream. When there is
    nothing to store, the method returns without touching CUDA.

    NOTE(review): despite its name, nothing in this method blocks; whether
    the adapter call waits for completion is not visible here.
    """
    metadata = self._get_connector_metadata()
    assert isinstance(metadata, LMCacheMPConnectorMetadata)

    stores = [meta for meta in metadata.requests if meta.direction == "STORE"]
    if not stores:
        # Nothing to save this step: skip creating an interprocess event.
        return

    request_ids = [meta.request_id for meta in stores]
    ops = [meta.op for meta in stores]

    # Record an interprocess event on the current stream and hand it to the
    # adapter. NOTE(review): presumably the receiving side waits on it so
    # the forward pass has finished writing the KV blocks before they are
    # read; confirm against the adapter.
    with torch.cuda.stream(torch.cuda.current_stream()):
        event = torch.cuda.Event(interprocess=True)
        event.record()

    self.worker_adapter.batched_submit_store_requests(request_ids, ops, event)
def get_num_new_matched_tokens(
    self,
    request: "Request",
    num_computed_tokens: int,
) -> tuple[int | None, bool]:
    """
    Get number of new tokens that can be loaded from the
    external KV cache beyond the num_computed_tokens.

    A request tracker is created on first use, a lookup is submitted to the
    scheduler adapter if needed, and the lookup result is checked. On a hit
    the tracker records the vLLM and LMCache hit sizes (in blocks).

    Args:
        request (Request): the request object.
        num_computed_tokens (int): the number of locally
            computed tokens for this request

    Returns:
        A tuple with the following elements:
            - An optional number of tokens that can be loaded from the
              external KV cache beyond what is already computed.
              If None, it means that the connector needs more time to
              determine the number of matched tokens, and the scheduler
              should query for this request again later.
            - `True` if external KV cache tokens will be loaded
              asynchronously (between scheduler steps). Must be
              'False' if the first element is 0.

    Notes:
        The connector should only consider the largest prefix of prompt-
        tokens for which KV cache is actually available at the time of the
        call. If the cache cannot be loaded for some tokens (e.g., due to
        connectivity issues or eviction), those tokens must not be taken
        into account.
    """
    tracker = self._get_or_create_request_tracker(request)

    self.scheduler_adapter.maybe_submit_lookup_request(
        request.request_id, convert_block_hashes_to_bytes(request.block_hashes)
    )

    ret = self.scheduler_adapter.check_lookup_result(request.request_id)
    if ret is None:
        # Lookup result is not available yet.
        return None, True

    if ret == 0:
        return 0, False

    # A hit must cover whole LMCache chunks.
    assert (
        ret % (self.scheduler_adapter.num_blocks_per_chunk() * self.vllm_block_size)
        == 0
    )

    num_vllm_blocks = num_computed_tokens // self.vllm_block_size
    num_lmcache_blocks = ret // self.vllm_block_size

    # The hit is an absolute prefix length, not a delta. This method can be
    # called again for the same request, so the counter is raised to the
    # hit size rather than incremented by it; incrementing would
    # double-count and suppress later stores.
    tracker.num_stored_blocks = max(tracker.num_stored_blocks, num_lmcache_blocks)

    # Save the vllm and lmcache hit tokens
    tracker.num_vllm_hit_blocks = num_vllm_blocks
    tracker.num_lmcache_hit_blocks = num_lmcache_blocks

    need_to_load = max(0, ret - num_computed_tokens)
    logger.debug(
        "vLLM hit is: %d, Need to load is %d", num_computed_tokens, need_to_load
    )
    return need_to_load, need_to_load > 0
+ num_external_tokens (int): the number of tokens that will be + loaded from the external KV cache. + """ + # NOTE: the `blocks` are NEW BLOCKS allocated for this request. + tracker = self._get_request_tracker(request.request_id) + block_ids = reformat_block_ids(blocks.get_block_ids()) + + # No matter we need to retrieve or not, we need to update + # the block ids into the tracker + tracker.update_block_ids(block_ids) + + # Update the state of the tracker + condition = tracker.needs_retrieve() + if tracker.state == LMCacheMPRequestState.PREFETCHING: + # If need to retrieve, change to WAITING_FOR_LOAD + # Otherwise, change to READY + tracker.state = ( + LMCacheMPRequestState.WAITING_FOR_LOAD + if condition + else LMCacheMPRequestState.READY + ) + + def build_connector_meta( + self, scheduler_output: SchedulerOutput + ) -> KVConnectorMetadata: + """ + Build the connector metadata for this step. + + This function should NOT modify fields in the scheduler_output. + Also, calling this function will reset the state of the connector. + + Args: + scheduler_output (SchedulerOutput): the scheduler output object. + """ + metadata = LMCacheMPConnectorMetadata() + + self._process_retrieve_requests(metadata) + self._process_new_requests(scheduler_output, metadata) + self._process_cached_requests(scheduler_output, metadata) + + if len(metadata) > 0: + logger.debug("Final connector metadata: %s", metadata) + + return metadata + + def update_connector_output(self, connector_output: KVConnectorOutput): + """ + Update KVConnector state from worker-side connectors output. + + Args: + connector_output (KVConnectorOutput): the worker-side + connectors output. + """ + return + + def request_finished( + self, + request: "Request", + block_ids: list[int], + ) -> tuple[bool, dict[str, Any] | None]: + """ + Called exactly once when a request has finished, before its blocks are + freed. + + The connector may assumes responsibility for freeing the blocks + asynchronously by returning True. 
+ + Returns: + True if the request is being saved/sent asynchronously and blocks + should not be freed until the request_id is returned from + get_finished(). + Optional KVTransferParams to be included in the request outputs + returned by the engine. + """ + return True, None + + def take_events(self) -> Iterable["KVCacheEvent"]: + """ + Take the KV cache events from the connector. + + Yields: + New KV cache events since the last call. + """ + return () + + @classmethod + def get_required_kvcache_layout(cls, vllm_config: "VllmConfig") -> str | None: + """ + Get the required KV cache layout for this connector. + Args: + vllm_config (VllmConfig): the vllm config. + + Returns: + str: the required KV cache layout. e.g. HND, or NHD. + None if the connector does not require a specific layout. + """ + + if cls is KVConnectorBase_V1: + raise TypeError( + "get_required_kvcache_layout should not be called " + "on the abstract base class" + ) + return None + + def get_finished_count(self) -> int | None: + """ + Get the count of requests expected to complete send/receive operations + via this connector. This method is used to initialize the + KVOutputAggregator, overwriting the default world_size. + + Returns: + int: expected sending or receiving completion count. + """ + return None + + @classmethod + def build_kv_connector_stats( + cls, data: dict[str, Any] | None = None + ) -> Optional["KVConnectorStats"]: + """ + KVConnectorStats resolution method. This method allows dynamically + registered connectors to return their own KVConnectorStats object, + which can implement custom aggregation logic on the data dict. 
def _process_cached_requests(
    self,
    scheduler_output: SchedulerOutput,
    metadata: LMCacheMPConnectorMetadata,
) -> None:
    """
    Add store metadata for every cached request scheduled in this step.

    For each entry of ``scheduler_output.scheduled_cached_reqs`` the
    request's tracker receives the newly allocated block ids and an
    increased scheduled-token count; the resulting store metadata, when
    there is any, is appended to ``metadata``.

    Args:
        scheduler_output (SchedulerOutput): the scheduler output object.
        metadata (LMCacheMPConnectorMetadata): the metadata being built
            for this step; it is extended in place.
    """
    chunk_blocks = self.scheduler_adapter.num_blocks_per_chunk()
    running = scheduler_output.scheduled_cached_reqs

    for pos, req_id in enumerate(running.req_ids):
        tracker = self._get_request_tracker(req_id)

        # Record the blocks newly allocated to this request.
        tracker.update_block_ids(reformat_block_ids(running.new_block_ids[pos]))

        # NOTE(review): the count added here is `num_computed_tokens[pos]`,
        # while `_process_new_requests` adds
        # `scheduler_output.num_scheduled_tokens[req_id]`. Confirm against
        # the tracker that the computed-token count is really the intended
        # increment for cached requests.
        tracker.increase_num_scheduled_tokens(running.num_computed_tokens[pos])

        store_meta = LMCacheMPRequestMetadata.GetStoreMetadata(
            tracker, chunk_blocks, self.vllm_block_size
        )
        if store_meta is None:
            continue
        metadata.add_request_metadata(store_meta)
class KVConnectorLogging:
    """
    Accumulate KV transfer stats between log calls and log a summary.

    The connector class is resolved once from the transfer config. Its
    `build_kv_connector_stats` classmethod converts each raw stats dict
    into a `KVConnectorStats` object; these are aggregated until `log()`
    reduces them to a single line and resets the accumulator.
    """

    def __init__(self, kv_tranfer_config: KVTransferConfig):
        """
        Args:
            kv_tranfer_config (KVTransferConfig): the KV transfer config.
                When it is falsy or names no `kv_connector`, no connector
                class is resolved and `connector_cls` stays None.
        """
        # This should be called on frontend process.
        assert not has_kv_transfer_group()
        # Always define the attribute. Previously it was only assigned when
        # a connector was configured, so observe() failed with an
        # AttributeError instead of reaching its own
        # `connector_cls is not None` assertion.
        self.connector_cls: type | None = None
        # Instantiate the connector's stats class.
        if kv_tranfer_config and kv_tranfer_config.kv_connector:
            self.connector_cls = KVConnectorFactory.get_connector_class(
                kv_tranfer_config
            )
        self.reset()

    def reset(self):
        """Clear the accumulator so the next interval starts empty."""
        self.transfer_stats_accumulator: KVConnectorStats | None = None

    def observe(self, transfer_stats_data: dict[str, Any]):
        """
        Fold one batch of transfer stats into the accumulator.

        Args:
            transfer_stats_data (dict[str, Any]): raw stats data, passed to
                the connector class's `build_kv_connector_stats`.
        """
        # Should not be called when a KVConnector is not configured.
        assert self.connector_cls is not None
        # Called periodically when connector syncs with the scheduler.
        # Note that this is not the same as the logging interval.
        # We expect transfer_stats_data to be aggregated across all workers and
        # consist of observations from a single connector or a MultiConnector.
        transfer_stats = self.connector_cls.build_kv_connector_stats(
            transfer_stats_data
        )
        if transfer_stats is None:
            logger.warning_once(
                "The connector %s is collecting stats but "
                "does not implement the "
                "`build_kv_connector_stats` method. "
                "Stats will not be logged.",
                self.connector_cls,
            )
            return

        if self.transfer_stats_accumulator is None:
            self.transfer_stats_accumulator = transfer_stats
        else:
            # Accumulate last interval stats.
            self.transfer_stats_accumulator = self.transfer_stats_accumulator.aggregate(
                transfer_stats
            )

    def log(self, log_fn=logger.info):
        """
        Log transfer metrics periodically, similar to throughput logging.

        Nothing is logged when no stats were accumulated or they are empty;
        the accumulator is reset in every case.

        Args:
            log_fn: callable taking a format string and its argument;
                defaults to `logger.info`.
        """
        if (
            self.transfer_stats_accumulator
            and not self.transfer_stats_accumulator.is_empty()
        ):
            # Produce a single cumulative stats object for the last time
            # interval from the recorded observations.
            xfer_metrics = self.transfer_stats_accumulator.reduce()
            xfer_metrics_str = ", ".join(f"{k}={v}" for k, v in xfer_metrics.items())
            log_fn("KV Transfer metrics: %s", xfer_metrics_str)

        # Reset metrics for next interval
        self.reset()
def make_per_engine(self, metric: PromMetric) -> dict[int, PromMetric]:
    """
    Create the per-engine children of a prometheus_client metric.

    For every engine index in ``self._per_engine_labelvalues`` the parent
    metric's ``labels(...)`` is called with that engine's label values.
    The values are passed positionally, so the parent metric must have
    been created with a matching list of label names.

    Args:
        metric (PromMetric): the parent (labelled) metric.

    Returns:
        dict[int, PromMetric]: engine index -> child metric. The previous
        annotation advertised a single ``PromMetric`` although a dict has
        always been returned.
    """
    return {
        idx: metric.labels(*labelvalues)
        for idx, labelvalues in self._per_engine_labelvalues.items()
    }
def observe(self, transfer_stats_data: dict[str, Any], engine_idx: int = 0):
    """
    Forward transfer statistics to the connector's Prometheus metrics.

    Does nothing when ``self.prom_metrics`` is None.

    Args:
        transfer_stats_data (dict[str, Any]): the stats to record.
        engine_idx (int): index of the engine the stats belong to.
    """
    sink = self.prom_metrics
    if sink is not None:
        sink.observe(transfer_stats_data, engine_idx)
@dataclass
class MultiKVConnectorStats(KVConnectorStats):
    """
    Container holding one KVConnectorStats object per connector.

    ``data`` maps a connector id to that connector's stats, so each
    connector's stats are aggregated, reset and reduced independently.
    """

    def aggregate(self, other: KVConnectorStats) -> KVConnectorStats:
        """
        Merge the per-connector stats of ``other`` into this object.

        Ids already present are aggregated entry by entry (the incoming
        stats must be an instance of the stored stats' type); new ids are
        adopted as-is. Returns ``self``.
        """
        for name, incoming in other.data.items():
            if name in self.data:
                assert isinstance(incoming, type(self.data[name]))
                self[name] = self[name].aggregate(incoming)
            else:
                self[name] = incoming
        return self

    def reset(self):
        """Reset every contained stats object."""
        for entry in self.data.values():
            entry.reset()

    def reduce(self) -> dict[str, Any]:
        """Return a dict of connector id -> that connector's reduced stats."""
        # TODO (NickLucche) Adjust for logging on separate lines
        reduced = {}
        for name, entry in self.data.items():
            reduced[name] = entry.reduce()
        return reduced

    def is_empty(self) -> bool:
        """Return True when no contained stats object reports data."""
        return not any(not entry.is_empty() for entry in self.data.values())

    def __getitem__(self, connector_id: str) -> KVConnectorStats:
        """Return the stats stored for ``connector_id``."""
        return self.data[connector_id]

    def __setitem__(self, connector_id: str, stats: KVConnectorStats):
        """Store ``stats`` under ``connector_id``."""
        self.data[connector_id] = stats
def observe(self, transfer_stats_data: dict[str, Any], engine_idx: int = 0):
    """
    Dispatch per-connector stats to the matching Prometheus metrics object.

    Args:
        transfer_stats_data (dict[str, Any]): maps a connector id to a
            mapping whose ``"data"`` entry is forwarded to that
            connector's ``observe``.
        engine_idx (int): index of the engine the stats belong to.

    Raises:
        AssertionError: if a connector id has no registered metrics object.
    """
    registered = self._prom_metrics
    for name, payload in transfer_stats_data.items():
        assert name in registered, (
            f"{name} is not contained in the list of registered connectors "
            f"with Prometheus metrics support: {registered.keys()}"
        )
        registered[name].observe(payload["data"], engine_idx)
@classmethod
def _get_connector_classes_and_configs(
    cls, vllm_config: "VllmConfig"
) -> list[tuple[type[KVConnectorBaseType], "VllmConfig"]]:
    """
    Resolve the wrapped connectors declared in the extra config.

    Each entry of ``kv_connector_extra_config["connectors"]`` is turned
    into a ``KVTransferConfig`` and paired with a shallow copy of
    ``vllm_config`` whose ``kv_transfer_config`` is that new config.

    Args:
        vllm_config (VllmConfig): the config of the MultiConnector itself;
            it must carry a ``kv_transfer_config`` with a ``"connectors"``
            entry in its extra config.

    Returns:
        One ``(connector class, per-connector config)`` tuple per declared
        connector, in declaration order.
    """
    assert vllm_config.kv_transfer_config is not None
    ktcs = vllm_config.kv_transfer_config.kv_connector_extra_config.get(
        "connectors"
    )
    assert ktcs is not None
    default_engine_id = vllm_config.kv_transfer_config.engine_id
    ret: list[tuple[type[KVConnectorBaseType], VllmConfig]] = []
    for ktc in ktcs:
        # Shallow copy: only kv_transfer_config is replaced below.
        temp_config = copy.copy(vllm_config)
        # Merge engine_id INTO the kwargs. Passing `**ktc, engine_id=...`
        # raised "got multiple values for keyword argument 'engine_id'"
        # whenever the sub-connector config already set its own engine_id.
        kwargs = {**ktc, "engine_id": ktc.get("engine_id", default_engine_id)}
        temp_config.kv_transfer_config = KVTransferConfig(**kwargs)
        ret.append(
            (
                KVConnectorFactory.get_connector_class(
                    temp_config.kv_transfer_config
                ),
                temp_config,
            )
        )
    return ret
def shutdown(self):
    """
    Shut down every wrapped connector, even if some of them fail.

    Each failure is logged and the loop continues with the remaining
    connectors. Once all have been attempted, the most recently caught
    exception (if any) is re-raised.
    """
    last_error: Exception | None = None
    for connector in self._connectors:
        try:
            connector.shutdown()
        except Exception as err:
            logger.exception(
                "Exception during connector %s shutdown.",
                connector.__class__.__name__,
            )
            last_error = err

    if not last_error:
        return
    raise last_error
+ finished_recving.update(recving or ()) + # Aggregate finished sending request ids - only include + # once we've drained the "extra" count (for cases where + # more than one connector is async-saving the same request). + for req_id in sending or (): + extra_pending = self._extra_async_saves.get(req_id) + if extra_pending is None: + finished_sending.add(req_id) + continue + assert extra_pending > 0 + if extra_pending == 1: + del self._extra_async_saves[req_id] + else: + self._extra_async_saves[req_id] = extra_pending - 1 + + return finished_sending or None, finished_recving or None + + def get_block_ids_with_load_errors(self) -> set[int]: + agg_block_ids: set[int] = set() + for c in self._connectors: + agg_block_ids |= c.get_block_ids_with_load_errors() + return agg_block_ids + + # ============================== + # Scheduler-side methods + # ============================== + def get_num_new_matched_tokens( + self, + request: "Request", + num_computed_tokens: int, + ) -> tuple[int | None, bool]: + to_return = (0, False) + for i, c in enumerate(self._connectors): + toks, load_async = c.get_num_new_matched_tokens( + request, num_computed_tokens + ) + # If there is a connector still looking up the matches, + # we return None to indicate that we are not done yet. + if toks is None: + return (None, False) + # The first connector that has new matched tokens will be assigned + # to this request. + if to_return[0] == 0 and toks > 0: + self._requests_to_connector[request.request_id] = i + to_return = (toks, load_async) + return to_return + + def update_state_after_alloc( + self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int + ): + chosen_connector = self._requests_to_connector.get(request.request_id, -1) + empty_blocks = blocks.new_empty() + for i, c in enumerate(self._connectors): + if i == chosen_connector: + # Forward call to the chosen connector (if any). 
+ c.update_state_after_alloc(request, blocks, num_external_tokens) + else: + # Call with empty blocks for other connectors. + c.update_state_after_alloc(request, empty_blocks, 0) + + def build_connector_meta( + self, scheduler_output: SchedulerOutput + ) -> MultiKVConnectorMetadata: + metadata = MultiKVConnectorMetadata( + metadata=tuple( + c.build_connector_meta(scheduler_output) for c in self._connectors + ) + ) + if self._extra_async_saves: + metadata.extra_async_saves = self._extra_async_saves + self._extra_async_saves = {} + return metadata + + def update_connector_output(self, connector_output: KVConnectorOutput): + for c in self._connectors: + c.update_connector_output(connector_output) + + def request_finished( + self, + request: "Request", + blocks: list[int], + ) -> tuple[bool, dict[str, Any] | None]: + async_saves = 0 + kv_txfer_params = None + for c in self._connectors: + async_save, txfer_params = c.request_finished(request, blocks) + if async_save: + async_saves += 1 + if txfer_params is not None: + if kv_txfer_params is not None: + # TODO we can probably change this to merge the dicts here, + # checking for key clashes. + raise RuntimeError( + "Only one connector can produce KV transfer params" + ) + kv_txfer_params = txfer_params + if async_saves > 1: + self._extra_async_saves[request.request_id] = async_saves - 1 + + # Clean up other state for this request. + self._requests_to_connector.pop(request.request_id, None) + + return async_saves > 0, kv_txfer_params + + def take_events(self) -> Iterable["KVCacheEvent"]: + for c in self._connectors: + yield from c.take_events() + + @classmethod + def get_required_kvcache_layout(cls, vllm_config: "VllmConfig") -> str | None: + """ + Get the required KV cache layout for this connector. + Args: + vllm_config (VllmConfig): the vllm config. + + Returns: + str: the required KV cache layout. e.g. HND, or NHD. + None if the connector does not require a specific layout. 
+ """ + assert vllm_config.kv_transfer_config is not None + layouts: set[str] = set() + for connector_cls, temp_config in cls._get_connector_classes_and_configs( + vllm_config + ): + required_kvcache_layout = connector_cls.get_required_kvcache_layout( + temp_config + ) + if required_kvcache_layout is not None: + layouts.add(required_kvcache_layout) + + if len(layouts) > 1: + raise ValueError( + f"KV cache layout mismatch: " + f"found {len(layouts)} different layouts " + f"({', '.join(layouts)})." + f"All connectors must use the same layout." + ) + return next(iter(layouts), None) + + @classmethod + def build_kv_connector_stats( + cls, data: dict[str, Any] | None = None + ) -> KVConnectorStats | None: + if data is None: + return MultiKVConnectorStats() + + # data is a dict mapping connector name to their stats data. + # The stats data can be either: + # 1. Already-instantiated KVConnectorStats objects (same process) + # 2. Serialized dicts (cross-process after serialization) + # We need to reconstruct proper KVConnectorStats objects from dicts + reconstructed_data = {} + for connector_name, stats_value in data.items(): + # If already a KVConnectorStats object, use it directly + if isinstance(stats_value, KVConnectorStats): + reconstructed_data[connector_name] = stats_value + continue + + # Otherwise, reconstruct from serialized dict + # Get the connector class to reconstruct its stats + connector_cls = KVConnectorFactory.get_connector_class_by_name( + connector_name + ) + + # stats_value is the serialized dataclass which contains {'data': {...}} + # We need to extract the inner 'data' field to avoid double-nesting + assert isinstance(stats_value, dict) and "data" in stats_value, ( + f"Expected a dict with a 'data' field, got {stats_value}" + ) + inner_data = stats_value["data"] + + # Use the connector's build_kv_connector_stats to reconstruct + if reconstructed_stats := connector_cls.build_kv_connector_stats( + data=inner_data + ): + 
def get_kv_connector_stats(self) -> MultiKVConnectorStats | None:
    """
    Collect the stats of every wrapped connector that reports any.

    Stats are stored under the connector's class name, so two wrapped
    connectors of the same class share one key (the later one wins).

    Returns:
        MultiKVConnectorStats: the collected stats, or None when no
        wrapped connector returned stats.
    """
    grouped: MultiKVConnectorStats | None = None
    for connector in self._connectors:
        connector_stats = connector.get_kv_connector_stats()
        if connector_stats is not None:
            # Create the container only once there is something to store,
            # so None can still be returned when nothing is reported.
            if grouped is None:
                grouped = MultiKVConnectorStats()
            grouped[connector.__class__.__name__] = connector_stats
    return grouped
dataclass +from typing import TYPE_CHECKING, Any, Optional + +import msgspec +import numpy as np +import torch +import zmq + +from vllm import envs +from vllm.attention import AttentionBackend +from vllm.attention.backends.registry import AttentionBackendEnum +from vllm.attention.selector import get_attn_backend +from vllm.config import VllmConfig +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + CopyBlocksOp, + KVConnectorBase_V1, + KVConnectorHandshakeMetadata, + KVConnectorMetadata, + KVConnectorRole, +) +from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( + KVConnectorPromMetrics, + KVConnectorStats, + PromMetric, + PromMetricT, +) +from vllm.distributed.parallel_state import ( + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + get_tp_group, +) +from vllm.forward_context import ForwardContext +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils.network_utils import make_zmq_path, make_zmq_socket +from vllm.v1.attention.backends.utils import get_kv_cache_layout +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.worker.block_table import BlockTable + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionMetadata + from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.kv_cache_interface import KVCacheConfig + from vllm.v1.request import Request + +Transfer = tuple[int, float] # (xfer_handle, start_time) +EngineId = str +ReqId = str + +GET_META_MSG = b"get_meta_msg" + +logger = init_logger(__name__) + +# Lazy import nixl_wrapper to avoid loading nixl_bindings if nixl is not used +try: + from nixl._api import nixl_agent as NixlWrapper + from nixl._bindings import nixlXferTelemetry + + logger.info("NIXL is available") +except ImportError: + logger.warning("NIXL is not available") + NixlWrapper = None + nixlXferTelemetry = None + + +try: + from nixl._api import nixl_agent_config +except ImportError: + 
class NixlConnectorMetadata(KVConnectorMetadata):
    """
    Per-step metadata container for the NIXL connector.

    Holds the requests to receive and to save (both keyed by request id
    with a ``ReqMeta`` value), a request-id -> float map of requests to
    send, and two request-id sets (``reqs_in_batch`` and
    ``reqs_not_processed``).
    """

    def __init__(self):
        # Requests whose KV is to be received / saved, keyed by request id.
        self.reqs_to_recv: dict[ReqId, ReqMeta] = {}
        self.reqs_to_save: dict[ReqId, ReqMeta] = {}
        # Requests to send, keyed by request id with a float value.
        self.reqs_to_send: dict[ReqId, float] = {}
        self.reqs_in_batch: set[ReqId] = set()
        self.reqs_not_processed: set[ReqId] = set()

    def add_new_req(
        self,
        request_id: ReqId,
        local_block_ids: list[int],
        kv_transfer_params: dict[str, Any],
        load_remote_cache: bool = True,
        save_to_host: bool = False,
    ):
        """
        Register a request either for loading or for saving to host.

        Args:
            request_id (ReqId): id of the request.
            local_block_ids (list[int]): local block ids; also used as
                the initial physical block ids.
            kv_transfer_params (dict[str, Any]): must contain
                ``remote_block_ids``, ``remote_engine_id``,
                ``remote_host`` and ``remote_port``; ``tp_size`` is
                optional and defaults to 1.
            load_remote_cache (bool): file the request under
                ``reqs_to_recv``.
            save_to_host (bool): file the request under ``reqs_to_save``.

        Raises:
            AssertionError: unless exactly one of the two flags is set.
        """
        # save and load are mutually exclusive
        assert load_remote_cache ^ save_to_host
        params = kv_transfer_params
        req_meta = ReqMeta(
            local_block_ids=local_block_ids,
            local_physical_block_ids=local_block_ids,
            remote_block_ids=params["remote_block_ids"],
            remote_engine_id=params["remote_engine_id"],
            remote_host=params["remote_host"],
            remote_port=params["remote_port"],
            # P workers don't need to receive tp_size from proxy here.
            tp_size=params.get("tp_size", 1),
        )
        if save_to_host:
            self.reqs_to_save[request_id] = req_meta
        if load_remote_cache:
            self.reqs_to_recv[request_id] = req_meta
+ ) + return "HND" + + ############################################################ + # Scheduler Side Methods + ############################################################ + + def get_num_new_matched_tokens( + self, request: "Request", num_computed_tokens: int + ) -> tuple[int | None, bool]: + assert self.connector_scheduler is not None + return self.connector_scheduler.get_num_new_matched_tokens( + request, num_computed_tokens + ) + + def update_state_after_alloc( + self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int + ): + assert self.connector_scheduler is not None + return self.connector_scheduler.update_state_after_alloc( + request, blocks, num_external_tokens + ) + + def build_connector_meta( + self, + scheduler_output: SchedulerOutput, + ) -> KVConnectorMetadata: + assert self.connector_scheduler is not None + return self.connector_scheduler.build_connector_meta(scheduler_output) + + def request_finished( + self, + request: "Request", + block_ids: list[int], + ) -> tuple[bool, dict[str, Any] | None]: + assert self.connector_scheduler is not None + return self.connector_scheduler.request_finished(request, block_ids) + + def set_xfer_handshake_metadata( + self, metadata: dict[int, KVConnectorHandshakeMetadata] + ) -> None: + """ + Set the KV connector handshake metadata for this connector. + + Args: + metadata (dict): the handshake metadata to set. 
+ """ + assert self.connector_scheduler is not None + self.connector_scheduler.set_xfer_handshake_metadata(metadata) + + ############################################################ + # Worker Side Methods + ############################################################ + def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): + assert self.connector_worker is not None + self.connector_worker.register_kv_caches(kv_caches) + + def set_host_xfer_buffer_ops(self, copy_operation: CopyBlocksOp): + assert self.connector_worker is not None + self.connector_worker.set_host_xfer_buffer_ops(copy_operation) + + def get_finished(self, finished_req_ids: set[str]) -> tuple[set[str], set[str]]: + """Get the finished recving and sending requests.""" + assert self.connector_worker is not None + return self.connector_worker.get_finished() + + def get_block_ids_with_load_errors(self) -> set[int]: + """Get block IDs that failed to load via NIXL.""" + assert self.connector_worker is not None + return self.connector_worker.get_block_ids_with_load_errors() + + def get_kv_connector_stats(self) -> KVConnectorStats | None: + if self.connector_worker is None: + return None + return self.connector_worker.get_kv_connector_stats() + + @classmethod + def build_kv_connector_stats( + cls, data: dict[str, Any] | None = None + ) -> KVConnectorStats | None: + return ( + NixlKVConnectorStats(data=data) + if data is not None + else NixlKVConnectorStats() + ) + + @classmethod + def build_prom_metrics( + cls, + vllm_config: VllmConfig, + metric_types: dict[type[PromMetric], type[PromMetricT]], + labelnames: list[str], + per_engine_labelvalues: dict[int, list[str]], + ) -> KVConnectorPromMetrics: + return NixlPromMetrics( + vllm_config, metric_types, labelnames, per_engine_labelvalues + ) + + def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None: + assert self.connector_worker is not None + assert isinstance(self._connector_metadata, NixlConnectorMetadata) + 
self.connector_worker.start_load_kv(self._connector_metadata) + + def wait_for_layer_load(self, layer_name: str) -> None: + """NixlConnector does not do layerwise saving.""" + pass + + def save_kv_layer( + self, + layer_name: str, + kv_layer: torch.Tensor, + attn_metadata: "AttentionMetadata", + **kwargs, + ) -> None: + """NixlConnector does not save explicitly.""" + pass + + def wait_for_save(self): + assert self.connector_worker is not None + assert isinstance(self._connector_metadata, NixlConnectorMetadata) + if self.connector_worker.use_host_buffer and self.connector_worker.copy_blocks: + self.connector_worker.save_kv_to_host(self._connector_metadata) + + def shutdown(self): + if self.connector_worker is not None: + self.connector_worker.shutdown() + if self.connector_scheduler is not None: + self.connector_scheduler.shutdown() + + def get_handshake_metadata(self) -> KVConnectorHandshakeMetadata | None: + """ + Get the KVConnector handshake metadata for this connector. + This metadata is used for out-of-band connector handshake + between P/D workers. + + Returns: + KVConnectorHandshakeMetadata: the handshake metadata. + None if no handshake metadata is available. 
+ """ + assert self.connector_worker is not None + return self.connector_worker.xfer_handshake_metadata + + +class NixlConnectorScheduler: + """Implementation of Scheduler side methods""" + + def __init__(self, vllm_config: VllmConfig, engine_id: str): + self.vllm_config = vllm_config + self.block_size = vllm_config.cache_config.block_size + self.engine_id: EngineId = engine_id + self.side_channel_host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST + self.side_channel_port = ( + envs.VLLM_NIXL_SIDE_CHANNEL_PORT + + vllm_config.parallel_config.data_parallel_rank + ) + assert vllm_config.kv_transfer_config is not None + if current_platform.device_type == "cpu": + self.use_host_buffer = False + else: + self.use_host_buffer = ( + vllm_config.kv_transfer_config.kv_buffer_device == "cpu" + ) + + logger.info("Initializing NIXL Scheduler %s", engine_id) + + # Background thread for handling new handshake requests. + self._nixl_handshake_listener_t: threading.Thread | None = None + self._encoded_xfer_handshake_metadata: dict[int, Any] = {} + self._stop_event = threading.Event() + + # Requests that need to start recv/send. + # New requests are added by update_state_after_alloc in + # the scheduler. Used to make metadata passed to Worker. + self._reqs_need_recv: dict[ReqId, tuple[Request, list[int]]] = {} + self._reqs_need_save: dict[ReqId, tuple[Request, list[int]]] = {} + # Reqs to send and their expiration time + self._reqs_need_send: dict[ReqId, float] = {} + self._reqs_in_batch: set[ReqId] = set() + # Reqs to remove from processed set because they're not to send after + # remote prefill or aborted. 
class NixlConnectorScheduler:
    """Implementation of Scheduler side methods"""

    def __init__(self, vllm_config: VllmConfig, engine_id: str):
        """Set up scheduler-side bookkeeping; no listener is started yet.

        Args:
            vllm_config: engine configuration; ``kv_transfer_config`` must
                be set.
            engine_id: id of this engine, advertised to remote peers.
        """
        self.vllm_config = vllm_config
        self.block_size = vllm_config.cache_config.block_size
        self.engine_id: EngineId = engine_id
        self.side_channel_host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST
        # The side-channel port is offset by the data-parallel rank.
        self.side_channel_port = (
            envs.VLLM_NIXL_SIDE_CHANNEL_PORT
            + vllm_config.parallel_config.data_parallel_rank
        )
        assert vllm_config.kv_transfer_config is not None
        if current_platform.device_type == "cpu":
            self.use_host_buffer = False
        else:
            self.use_host_buffer = (
                vllm_config.kv_transfer_config.kv_buffer_device == "cpu"
            )

        logger.info("Initializing NIXL Scheduler %s", engine_id)

        # Background thread for handling new handshake requests.
        self._nixl_handshake_listener_t: threading.Thread | None = None
        self._encoded_xfer_handshake_metadata: dict[int, Any] = {}
        self._stop_event = threading.Event()

        # Requests that need to start recv/send.
        # New requests are added by update_state_after_alloc in
        # the scheduler. Used to make metadata passed to Worker.
        self._reqs_need_recv: dict[ReqId, tuple[Request, list[int]]] = {}
        self._reqs_need_save: dict[ReqId, tuple[Request, list[int]]] = {}
        # Reqs to send and their expiration time
        self._reqs_need_send: dict[ReqId, float] = {}
        self._reqs_in_batch: set[ReqId] = set()
        # Reqs to remove from processed set because they're not to send after
        # remote prefill or aborted.
        self._reqs_not_processed: set[ReqId] = set()

    def shutdown(self):
        """Signal the handshake listener to stop and wait for it to exit."""
        self._stop_event.set()
        if self._nixl_handshake_listener_t is not None:
            self._nixl_handshake_listener_t.join()
            self._nixl_handshake_listener_t = None

    def set_xfer_handshake_metadata(
        self, metadata: dict[int, KVConnectorHandshakeMetadata]
    ) -> None:
        """
        Set the KV connector handshake metadata for this connector.

        The metadata of every TP rank is msgpack-encoded once, and the
        listener thread is started on the first call.

        Args:
            metadata (dict): the handshake metadata to set.

        Raises:
            ValueError: if any value is not a ``NixlAgentMetadata``.
        """
        encoded_data: dict[int, bytes] = {}
        encoder = msgspec.msgpack.Encoder()
        for tp_rank, rank_metadata in metadata.items():
            if not isinstance(rank_metadata, NixlAgentMetadata):
                raise ValueError(
                    "NixlConnectorScheduler expects NixlAgentMetadata for "
                    "handshake metadata."
                )
            encoded_data[tp_rank] = encoder.encode(rank_metadata)
            logger.debug(
                "Tp rank %d: encoded NixlAgentMetadata size: %s bytes",
                tp_rank,
                len(encoded_data[tp_rank]),
            )
        # NOTE: a listener that is already running keeps serving the dict it
        # was started with; rebinding this attribute does not change that.
        self._encoded_xfer_handshake_metadata = encoded_data

        # Only start the listener when we have metadata to serve.
        if self._nixl_handshake_listener_t is None:
            ready_event = threading.Event()
            self._nixl_handshake_listener_t = threading.Thread(
                target=self._nixl_handshake_listener,
                args=(
                    encoded_data,
                    ready_event,
                    self._stop_event,
                    self.side_channel_port,
                ),
                daemon=True,
                name="nixl_handshake_listener",
            )
            self._nixl_handshake_listener_t.start()
            ready_event.wait()  # Wait for listener ZMQ socket to be ready.

    @staticmethod
    def _nixl_handshake_listener(
        encoded_data: dict[int, Any],
        ready_event: threading.Event,
        stop_event: threading.Event,
        port: int,
    ):
        """Background thread for getting new NIXL handshakes.

        Serves the pre-encoded metadata of the requested TP rank on a ROUTER
        socket until ``stop_event`` is set.

        Args:
            encoded_data: TP rank -> msgpack-encoded agent metadata.
            ready_event: set once the socket is ready to receive.
            stop_event: set by ``shutdown`` to end the loop.
            port: side-channel port to listen on.
        """
        # NOTE(rob): this is a simple implementation. We will move
        # to a better approach via HTTP endpoint soon.

        # Listen for new requests for metadata.
        host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST
        path = make_zmq_path("tcp", host, port)
        logger.debug("Starting listening on path: %s", path)
        with zmq_ctx(zmq.ROUTER, path) as sock:
            # Wake up at least once per second to re-check the stop flag.
            sock.setsockopt(zmq.RCVTIMEO, 1000)
            ready_event.set()
            # Checking the flag on every iteration (not only after a receive
            # timeout) lets shutdown() finish even under steady traffic.
            while not stop_event.is_set():
                try:
                    frames = sock.recv_multipart()
                except zmq.Again:
                    continue
                # A bad request must not kill this thread: every later
                # handshake would then time out with no visible cause.
                try:
                    identity, _, payload = frames
                    # Decode the message which contains (GET_META_MSG, rank)
                    msg, target_tp_rank = msgspec.msgpack.decode(payload)
                    rank_metadata = encoded_data.get(target_tp_rank)
                except (ValueError, TypeError, msgspec.DecodeError):
                    logger.warning(
                        "Connection listener ignoring malformed handshake request"
                    )
                    continue
                logger.debug(
                    "Received message for tp rank %s",
                    target_tp_rank,
                )
                if msg != GET_META_MSG:
                    logger.warning("Connection listener got unexpected message %s", msg)
                if rank_metadata is None:
                    # No reply is sent; the requester's receive timeout
                    # turns this into a failed handshake on its side.
                    logger.warning(
                        "Connection listener has no metadata for tp rank %s",
                        target_tp_rank,
                    )
                    continue
                sock.send_multipart((identity, b"", rank_metadata))

    def get_num_new_matched_tokens(
        self, request: "Request", num_computed_tokens: int
    ) -> tuple[int, bool]:
        """
        For remote prefill, pull all prompt blocks from remote
        asynchronously relative to engine execution.

        Args:
            request (Request): the request object.
            num_computed_tokens (int): the number of locally
                computed tokens for this request
        Returns:
            * the number of tokens that can be loaded from the
              external KV cache beyond what is already computed.
            * true if the external KV cache tokens will be loaded
              asynchronously (between scheduler steps).
        """

        params = request.kv_transfer_params
        logger.debug(
            "NIXLConnector get_num_new_matched_tokens: "
            "num_computed_tokens=%s, kv_transfer_params=%s",
            num_computed_tokens,
            params,
        )

        if params is not None and params.get("do_remote_prefill"):
            # Remote prefill: get all prompt blocks from remote.
            token_ids = request.prompt_token_ids or []
            count = len(token_ids) - num_computed_tokens
            if count > 0:
                return count, True

        # No remote prefill for this request.
        return 0, False

    def update_state_after_alloc(
        self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
    ):
        """Record what the worker must save or receive for ``request``.

        Requests without ``kv_transfer_params`` are ignored. For a remote
        prefill request the ``do_remote_prefill`` flag is cleared afterwards
        so that only one transfer is triggered.
        """
        params = request.kv_transfer_params
        logger.debug(
            "NIXLConnector update_state_after_alloc: "
            "num_external_tokens=%s, kv_transfer_params=%s",
            num_external_tokens,
            params,
        )

        if not params:
            return

        if params.get("do_remote_decode"):
            self._reqs_in_batch.add(request.request_id)
        if self.use_host_buffer and params.get("do_remote_decode"):
            # NOTE: when accelerator is not directly supported by Nixl,
            # prefilled blocks need to be saved to host memory before transfer.

            # save all blocks
            block_ids = blocks.get_block_ids()[0]
            # TODO: skip the blocks that are already in the host xfer buffer.
            # Currently, the host xfer buffer block is 1-to-1 mapped to device
            # kv blocks, so host blocks won't be flushed as long as its device
            # block is not overwritten; and it will be safe to skip saving them
            # to host xfer buffer.
            if block_ids:
                self._reqs_need_save[request.request_id] = (request, block_ids)
        elif params.get("do_remote_prefill"):
            if params.get("remote_block_ids"):
                if all(
                    p in params
                    for p in ("remote_engine_id", "remote_host", "remote_port")
                ):
                    # If remote_blocks and num_external_tokens = 0, we have
                    # a full prefix cache hit on the D worker. We need to call
                    # send_notif in _read_blocks to free the memory on the P.
                    local_block_ids = (
                        blocks.get_unhashed_block_ids()
                        if num_external_tokens > 0
                        else []
                    )
                    # Get unhashed blocks to pull from remote.
                    self._reqs_need_recv[request.request_id] = (
                        request,
                        local_block_ids,
                    )

                else:
                    logger.warning(
                        "Got invalid KVTransferParams: %s. This "
                        "request will not utilize KVTransfer",
                        params,
                    )
            else:
                assert num_external_tokens == 0
            # Only trigger 1 KV transfer per request.
            params["do_remote_prefill"] = False

    def build_connector_meta(
        self,
        scheduler_output: SchedulerOutput,
    ) -> KVConnectorMetadata:
        """Package the pending recv/save/send state and reset it.

        ``scheduler_output`` is not read here.

        Returns:
            A ``NixlConnectorMetadata`` holding everything accumulated since
            the previous call.
        """
        meta = NixlConnectorMetadata()

        # Loop through scheduled reqs and convert to ReqMeta.
        for req_id, (req, block_ids) in self._reqs_need_recv.items():
            assert req.kv_transfer_params is not None
            meta.add_new_req(
                request_id=req_id,
                local_block_ids=block_ids,
                kv_transfer_params=req.kv_transfer_params,
                load_remote_cache=True,
                save_to_host=False,
            )

        for req_id, (req, block_ids) in self._reqs_need_save.items():
            assert req.kv_transfer_params is not None
            meta.add_new_req(
                request_id=req_id,
                local_block_ids=block_ids,
                kv_transfer_params=req.kv_transfer_params,
                load_remote_cache=False,
                save_to_host=True,
            )

        meta.reqs_to_send = self._reqs_need_send
        meta.reqs_in_batch = self._reqs_in_batch
        meta.reqs_not_processed = self._reqs_not_processed

        # Clear the list once workers start the transfers
        self._reqs_need_recv.clear()
        self._reqs_need_save.clear()
        # These three were handed to `meta` by reference above, so they are
        # rebound to fresh containers rather than cleared in place.
        self._reqs_in_batch = set()
        self._reqs_not_processed = set()
        self._reqs_need_send = {}

        return meta

    def request_finished(
        self,
        request: "Request",
        block_ids: list[int],
    ) -> tuple[bool, dict[str, Any] | None]:
        """
        Once a request is finished, determine whether request blocks
        should be freed now or will be sent asynchronously and freed later.

        Returns:
            ``(delay_free_blocks, kv_transfer_params)``. The params are
            ``None`` unless this was a remote-decode request that finished
            with ``FINISHED_LENGTH_CAPPED``.
        """
        from vllm.v1.request import RequestStatus

        params = request.kv_transfer_params
        logger.debug(
            "NIXLConnector request_finished(%s), request_status=%s, "
            "kv_transfer_params=%s",
            request.request_id,
            request.status,
            params,
        )
        if not params:
            return False, None

        if params.get("do_remote_prefill"):
            # If do_remote_prefill is still True when the request is finished,
            # update_state_after_alloc must not have been called (the request
            # must have been aborted before it was scheduled).
            # To avoid stranding the prefill blocks in the prefill instance,
            # we must add empty block_ids to _reqs_need_recv so that our
            # worker side will notify and free blocks in the prefill instance.
            self._reqs_need_recv[request.request_id] = (request, [])
            params["do_remote_prefill"] = False
            return False, None

        if not params.get("do_remote_decode"):
            return False, None
        if request.status != RequestStatus.FINISHED_LENGTH_CAPPED:
            # Also include the case of a P/D Prefill request with immediate
            # block free (eg abort). Stop tracking this request.
            self._reqs_not_processed.add(request.request_id)
            return False, None

        # TODO: check whether block_ids actually ever be 0. If not we could
        # remove the conditional below
        delay_free_blocks = len(block_ids) > 0

        if delay_free_blocks:
            # Prefill request on remote. It will be read from D upon completion
            logger.debug(
                "NIXLConnector request_finished(%s) waiting for %d seconds "
                "for remote decode to fetch blocks",
                request.request_id,
                envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT,
            )
            self._reqs_need_send[request.request_id] = (
                time.perf_counter() + envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT
            )

        return delay_free_blocks, dict(
            do_remote_prefill=True,
            do_remote_decode=False,
            remote_block_ids=block_ids,
            remote_engine_id=self.engine_id,
            remote_host=self.side_channel_host,
            remote_port=self.side_channel_port,
            tp_size=self.vllm_config.parallel_config.tensor_parallel_size,
        )
This is used to register the memory regions correctly. + kv_cache_shape = self.attn_backend.get_kv_cache_shape( + num_blocks=1, block_size=16, num_kv_heads=1, head_size=1 + ) + # Non-MLA backends caches have 5 dims [2, num_blocks, H,N,D], + # we just mock num_blocks to 1 for the dimension check below. + self._is_kv_layout_blocks_first = ( + len(kv_cache_shape) == 5 and kv_cache_shape[0] == 1 + ) + + attn_backend = AttentionBackendEnum[self.attn_backend.get_name()] + self._use_pallas = attn_backend == AttentionBackendEnum.PALLAS + + @property + def is_kv_layout_blocks_first(self) -> bool: + return self._is_kv_layout_blocks_first + + @property + def split_k_and_v(self) -> bool: + # Whether to register regions for K and V separately (when present). + return not ( + self.is_mla or self._use_pallas or self.is_kv_layout_blocks_first + ) + + block_size: int + remote_block_size: dict[EngineId, int] + + def tp_ratio( + self, + remote_tp_size: int, + ) -> int: + """ + Calculate the tensor parallel ratio between local and remote TP. + We can think of it as the number of local TP workers-per-remote TP + workers. Local workers will read from the same remote TP worker in + groups of size `tp_ratio`. + """ + assert self.tp_size % remote_tp_size == 0, ( + f"Local tensor parallel size {self.tp_size} is not divisible " + f"by remote tensor parallel size {remote_tp_size}." + ) + return self.tp_size // remote_tp_size + + def block_size_ratio( + self, + remote_block_size: int, + ) -> float: + """ + Calculate the block size ratio between local and remote TP. + """ + assert self.block_size % remote_block_size == 0, ( + f"Local block size {self.block_size} is not divisible " + f"by remote block size {remote_block_size} or vice versa." 
+ ) + return self.block_size // remote_block_size + + def tp_ratio_from_engine_id( + self, + remote_engine_id: EngineId, + ) -> int: + remote_tp_size = self.remote_tp_size[remote_engine_id] + return self.tp_ratio(remote_tp_size) + + def block_size_ratio_from_engine_id( + self, + remote_engine_id: EngineId, + ) -> float: + remote_block_size = self.remote_block_size[remote_engine_id] + return self.block_size_ratio(remote_block_size) + + def is_kv_replicated(self, engine_id: EngineId) -> bool: + """ + Whether the KV cache is replicated across TP workers due to the + number of TP workers being greater than the number of KV heads. + """ + tp_size = self.remote_tp_size[engine_id] + return tp_size // self.total_num_kv_heads >= 1 + + def replicates_kv_cache(self, remote_engine_id: EngineId) -> bool: + # MLA is always replicated as the hidden dim can't be split. + return self.is_mla or self.is_kv_replicated(remote_engine_id) + + def get_target_remote_rank( + self, + remote_tp_size: int, + ) -> int: + """ + Get the remote TP rank (on P) that the current local TP rank + (on D) will read from. + """ + tp_ratio = self.tp_ratio(remote_tp_size) + return self.tp_rank // tp_ratio + + def get_target_remote_rank_from_engine_id( + self, + remote_engine_id: EngineId, + ) -> int: + remote_tp_size = self.remote_tp_size[remote_engine_id] + return self.get_target_remote_rank(remote_tp_size) + + def __init__(self, vllm_config: VllmConfig, engine_id: str): + if NixlWrapper is None: + logger.error("NIXL is not available") + raise RuntimeError("NIXL is not available") + logger.info("Initializing NIXL wrapper") + logger.info("Initializing NIXL worker %s", engine_id) + + # Config. 
+ self.vllm_config = vllm_config + self.block_size = vllm_config.cache_config.block_size + + if vllm_config.kv_transfer_config is None: + raise ValueError("kv_transfer_config must be set for NixlConnector") + self.kv_transfer_config = vllm_config.kv_transfer_config + + self.nixl_backends = vllm_config.kv_transfer_config.get_from_extra_config( + "backends", ["UCX"] + ) + # TODO temporary, once nixl allows for telemetry flag in config + # (next release), we can remove this env var. + os.environ["NIXL_TELEMETRY_ENABLE"] = "1" + + # Agent. + non_ucx_backends = [b for b in self.nixl_backends if b != "UCX"] + # Configure NIXL num_threads to avoid UAR exhaustion on Mellanox NICs. + # Each UCX thread allocates UARs (doorbell pages) via DevX, and + # excessive NIXL UAR usage can exhaust NIC UAR space. This can cause + # components like NVSHMEM (used by DeepEP kernels) to fail during RDMA + # initialization with "mlx5dv_devx_alloc_uar" errors. + # Ref: https://network.nvidia.com/files/doc-2020/ethernet-adapters-programming-manual.pdf#page=63 + num_threads = vllm_config.kv_transfer_config.get_from_extra_config( + "num_threads", 4 + ) + if nixl_agent_config is None: + config = None + else: + config = ( + nixl_agent_config(backends=self.nixl_backends) + if len(non_ucx_backends) > 0 + else nixl_agent_config(num_threads=num_threads) + ) + + self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), config) + # Map of engine_id -> {rank0: agent_name0, rank1: agent_name1..}. + self._remote_agents: dict[EngineId, dict[int, str]] = defaultdict(dict) + + # Metadata. + self.engine_id: EngineId = engine_id + self.tp_rank = get_tensor_model_parallel_rank() + self.world_size = get_tensor_model_parallel_world_size() + self.tp_group = get_tp_group() + self.num_blocks = 0 + self.enable_permute_local_kv = False + + # KV Caches and nixl tracking data. 
+ self.device_type = current_platform.device_type + self.kv_buffer_device: str = vllm_config.kv_transfer_config.kv_buffer_device + if self.device_type not in _NIXL_SUPPORTED_DEVICE: + raise RuntimeError(f"{self.device_type} is not supported.") + elif self.kv_buffer_device not in _NIXL_SUPPORTED_DEVICE[self.device_type]: + raise RuntimeError( + f"{self.device_type} with {self.kv_buffer_device} kv_buffer " + "is not supported." + ) + self.device_kv_caches: dict[str, torch.Tensor] = {} + + # cpu kv buffer for xfer + # used when device memory can not be registered under nixl + self.host_xfer_buffers: dict[str, torch.Tensor] = {} + if self.device_type == "cpu": + self.use_host_buffer = False + else: + self.use_host_buffer = self.kv_buffer_device == "cpu" + + # support for oot platform which can't register nixl memory + # type based on kv_buffer_device + nixl_memory_type = current_platform.get_nixl_memory_type() + if nixl_memory_type is None: + if self.kv_buffer_device == "cuda": + nixl_memory_type = "VRAM" + elif self.kv_buffer_device == "cpu": + nixl_memory_type = "DRAM" + if nixl_memory_type is None: + raise RuntimeError( + f"{self.device_type} with {self.kv_buffer_device} kv_buffer " + "is not supported." + ) + self.nixl_memory_type = nixl_memory_type + + # Note: host xfer buffer ops when use_host_buffer is True + self.copy_blocks: CopyBlocksOp | None = None + + # Map of engine_id -> kv_caches_base_addr. For TP case, each local + # rank will still only pull from a single remote TP worker. + self.kv_caches_base_addr: dict[EngineId, list[int]] = {} + self.device_id: int = 0 + + # Number of NIXL regions. Currently one region per cache + # (so 1 per layer for MLA, otherwise 2 per layer) + self.num_regions = 0 + self.num_layers = 0 + + # nixl_prepped_dlist_handle. + self.src_xfer_side_handle: int = 0 + self.src_xfer_side_handles: dict[int, int] = {} + # Map of engine_id -> nixl_prepped_dlist_handle (int)]. 
+ self.dst_xfer_side_handles: dict[EngineId, int] = {} + + # Map of engine_id -> num_blocks. All ranks in the same deployment will + # have the same number of blocks. + self.dst_num_blocks: dict[EngineId, int] = {} + self._registered_descs: list[Any] = [] + + # In progress transfers. + # [req_id -> list[handle]] + self._recving_metadata: dict[ReqId, ReqMeta] = {} + self._recving_transfers = defaultdict[ReqId, list[Transfer]](list) + # Track the expiration time of requests that are waiting to be sent. + self._reqs_to_send: dict[ReqId, float] = {} + # Set of requests that have been part of a batch, regardless of status. + self._reqs_to_process: set[ReqId] = set() + + # invalid blocks from failed NIXL operations + self._invalid_block_ids: set[int] = set() + # requests that skipped transfer (handshake or transfer failures) + self._failed_recv_reqs: set[ReqId] = set() + + # Handshake metadata of this worker for NIXL transfers. + self.xfer_handshake_metadata: NixlAgentMetadata | None = None + # Background thread for initializing new NIXL handshakes. + self._handshake_initiation_executor = ThreadPoolExecutor( + # NIXL is not guaranteed to be thread-safe, limit 1 worker. + max_workers=1, + thread_name_prefix="vllm-nixl-handshake-initiator", + ) + self._ready_requests = queue.Queue[tuple[ReqId, ReqMeta]]() + self._handshake_futures: dict[EngineId, Future[dict[int, str]]] = {} + # Protects _handshake_futures and _remote_agents. 
+ self._handshake_lock = threading.RLock() + + self.block_size = vllm_config.cache_config.block_size + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + + # TODO(mgoin): remove this once we have hybrid memory allocator + # Optimization for models with local attention (Llama 4) + # List of block window sizes for each layer for local attention + self.block_window_per_layer: list[int | None] = [] + self.use_mla = self.model_config.use_mla + + backend = get_attn_backend( + self.model_config.get_head_size(), + self.model_config.dtype, + self.cache_config.cache_dtype, + self.block_size, + use_mla=self.use_mla, + ) + self.backend_name = backend.get_name() + self.kv_cache_layout = get_kv_cache_layout() + self.host_buffer_kv_cache_layout = self.kv_cache_layout + logger.debug("Detected attention backend %s", self.backend_name) + logger.debug("Detected kv cache layout %s", self.kv_cache_layout) + + self._tp_size: dict[EngineId, int] = {self.engine_id: self.world_size} + self._block_size: dict[EngineId, int] = {self.engine_id: self.block_size} + # With heterogeneous TP, P must wait for all assigned D TP workers to + # finish reading before safely freeing the blocks. 
+ self.consumer_notification_counts_by_req = defaultdict[ReqId, int](int) + self.xfer_stats = NixlKVConnectorStats() + + self.kv_topo = self.TpKVTopology( + tp_size=self.world_size, + tp_rank=self.tp_rank, + remote_tp_size=self._tp_size, # shared state + is_mla=self.use_mla, + total_num_kv_heads=self.model_config.get_total_num_kv_heads(), + block_size=self.block_size, + remote_block_size=self._block_size, + attn_backend=backend, + ) + self._use_pallas = self.kv_topo._use_pallas + self._physical_blocks_per_logical_kv_block = 1 + + def _nixl_handshake( + self, + host: str, + port: int, + remote_tp_size: int, + expected_engine_id: str, + ) -> dict[int, str]: + """Do a NIXL handshake with a remote instance.""" + + start_time = time.perf_counter() + + # NOTE(rob): we need each rank to have a unique port. This is + # a hack to keep us moving. We will switch when moving to etcd + # or where we have a single ZMQ socket in the scheduler. + + # Handshake only with the remote TP rank that current local rank will + # pull from. With homogeneous TP it happens to be the same rank_i. + p_remote_rank = self.kv_topo.get_target_remote_rank(remote_tp_size) + path = make_zmq_path("tcp", host, port) + logger.debug( + "Querying metadata on path: %s at remote tp rank %s", path, p_remote_rank + ) + + # Send query for the request. + with zmq_ctx(zmq.REQ, path) as sock: + msg = msgspec.msgpack.encode((GET_META_MSG, p_remote_rank)) + # Set receive timeout to 5 seconds to avoid hanging on dead server + sock.setsockopt(zmq.RCVTIMEO, 5000) # milliseconds + sock.send(msg) + metadata_bytes = sock.recv() + decoder = msgspec.msgpack.Decoder(NixlAgentMetadata) + metadata = decoder.decode(metadata_bytes) + got_metadata_time = time.perf_counter() + logger.debug( + "NIXL handshake: get metadata took: %s", got_metadata_time - start_time + ) + + # Ensure engine id matches. + if metadata.engine_id != expected_engine_id: + raise RuntimeError( + f"Remote NIXL agent engine ID mismatch. 
" + f"Expected {expected_engine_id}," + f"received {metadata.engine_id}." + ) + + # Register Remote agent. + assert metadata.block_size <= self.block_size, ( + "nP > nD is not supported yet." + ) + remote_agent_name = self.add_remote_agent( + metadata, p_remote_rank, remote_tp_size + ) + + setup_agent_time = time.perf_counter() + logger.debug( + "NIXL handshake: add agent took: %s", + setup_agent_time - got_metadata_time, + ) + + # Remote rank -> agent name. + return {p_remote_rank: remote_agent_name} + + def initialize_host_xfer_buffer(self, kv_caches: dict[str, torch.Tensor]) -> None: + """ + Initialize transfer buffer in CPU mem for accelerators + NOT directly supported by NIXL (e.g., tpu) + """ + xfer_buffers: dict[str, torch.Tensor] = {} + try: + for layer_name, kv_cache in kv_caches.items(): + kv_shape = kv_cache.shape + kv_dtype = kv_cache.dtype + if ( + self.kv_cache_layout == "NHD" + and self.vllm_config.kv_transfer_config is not None + and self.vllm_config.kv_transfer_config.enable_permute_local_kv + ): + logger.info_once( + "'enable_permute_local_kv' flag is enabled while " + "device KV Layout is NHD. Init host buffer with" + " HND to better support Decode/Prefill TP_ratio > 1." + ) + # Since NHD will not support Decode/Prefill TP_ratio > 1, + # we can leverage host_buffer for permute + self.host_buffer_kv_cache_layout = "HND" + kv_shape = tuple(kv_shape[i] for i in [0, 1, 3, 2, 4]) + xfer_buffers[layer_name] = torch.empty( + kv_shape, dtype=kv_dtype, device="cpu" + ) + except MemoryError as e: + logger.error("NIXLConnectorWorker gets %s.", e) + raise + + self.host_xfer_buffers = xfer_buffers + + def set_host_xfer_buffer_ops(self, copy_operation: CopyBlocksOp): + """Assign copy (d2h, h2d) operations when host buffer is used.""" + # Set a no-op if the host buffer is not cpu. + if self.kv_buffer_device != "cpu": + return + # Set a no-op if self.device_type is 'cpu'. 
+ if self.device_type == "cpu": + return + assert self.use_host_buffer + self.copy_blocks = copy_operation + + def _background_nixl_handshake( + self, req_id: str, remote_engine_id: EngineId, meta: ReqMeta + ): + # Do NIXL handshake in background and add to _ready_requests when done. + fut = self._handshake_futures.get(remote_engine_id) + if fut is None: + fut = self._handshake_initiation_executor.submit( + self._nixl_handshake, + meta.remote_host, + meta.remote_port, + meta.tp_size, + remote_engine_id, + ) + self._handshake_futures[remote_engine_id] = fut + + def done_callback(f: Future[dict[int, str]], eid=remote_engine_id): + with self._handshake_lock: + del self._handshake_futures[eid] + try: + self._remote_agents[eid] = f.result() + except Exception: + logger.exception("Handshake with %s failed", eid) + + fut.add_done_callback(done_callback) + + # check handshake success before proceeding with request + def request_ready(f: Future[Any], entry=(req_id, meta)): + try: + # check if handshake succeeded + f.result() + self._ready_requests.put(entry) + except Exception: + # handshake failed - mark blocks as invalid + logger.exception( + "Handshake failed for request %s, marking blocks as invalid", req_id + ) + if req_meta := self._recving_metadata.get(req_id): + self._invalid_block_ids.update(req_meta.local_block_ids) + self._failed_recv_reqs.add(req_id) + + fut.add_done_callback(request_ready) + + def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): + """Register the KV Cache data in nixl.""" + + if self.use_host_buffer: + self.initialize_host_xfer_buffer(kv_caches=kv_caches) + assert len(self.host_xfer_buffers) == len(kv_caches), ( + f"host_buffer: {len(self.host_xfer_buffers)}, " + f"kv_caches: {len(kv_caches)}" + ) + xfer_buffers = self.host_xfer_buffers + else: + xfer_buffers = kv_caches + assert not self.host_xfer_buffers, ( + "host_xfer_buffer should not be initialized when " + f"kv_buffer_device is {self.kv_buffer_device}" + ) + + logger.info( + 
"Registering KV_Caches. use_mla: %s, kv_buffer_device: %s, " + "use_host_buffer: %s", + self.use_mla, + self.kv_buffer_device, + self.use_host_buffer, + ) + + caches_data = [] + # With hybrid allocator, layers can share a kv cache tensor + seen_base_addresses = [] + + # Note(tms): I modified this from the original region setup code. + # K and V are now in different regions. Advantage is that we can + # elegantly support MLA and any cases where the K and V tensors + # are non-contiguous (it's not locally guaranteed that they will be) + # Disadvantage is that the encoded NixlAgentMetadata is now larger + # (roughly 8KB vs 5KB). + # Conversely for FlashInfer, K and V are registered in the same region + # to better exploit the memory layout (ie num_blocks is the first dim). + split_k_and_v = self.kv_topo.split_k_and_v + tensor_size_bytes = None + # Enable different block lengths for different layers when MLA is used. + self.block_len_per_layer = list[int]() + self.slot_size_per_layer = list[int]() # HD bytes in kv terms + self.device_id = self.tp_rank + for layer_name, cache_or_caches in xfer_buffers.items(): + cache_list = cache_or_caches if split_k_and_v else [cache_or_caches] + + for cache in cache_list: + base_addr = cache.data_ptr() + if not self.use_host_buffer and current_platform.is_cuda_alike(): + self.device_id = cache.device.index + if base_addr in seen_base_addresses: + continue + + # TODO (NickLucche): Get kernel_block_size in a cleaner way + # NHD default "view" for non-MLA cache + kernel_block_size = cache.shape[-2] if self.use_mla else cache.shape[-3] + + if self.block_size != kernel_block_size: + logger.info_once( + "User-specified logical block size (%s) does not match" + " physical kernel block size (%s). Using the latter. 
", + self.block_size, + kernel_block_size, + ) + self._physical_blocks_per_logical_kv_block = ( + self.block_size // kernel_block_size + ) + self.block_size = kernel_block_size + + seen_base_addresses.append(base_addr) + curr_tensor_size_bytes = cache.numel() * cache.element_size() + + if tensor_size_bytes is None: + tensor_size_bytes = curr_tensor_size_bytes + self.num_blocks = cache.shape[0] + + assert cache.shape[0] == self.num_blocks, ( + "All kv cache tensors must have the same number of blocks" + ) + + self.block_len_per_layer.append( + curr_tensor_size_bytes // self.num_blocks + ) + self.slot_size_per_layer.append( + self.block_len_per_layer[-1] // self.block_size + ) + + if not self.use_mla: + # Different kv cache shape is not supported by HeteroTP + assert tensor_size_bytes == curr_tensor_size_bytes, ( + "All kv cache tensors must have the same size" + ) + # Need to make sure the device ID is non-negative for NIXL, + # Torch uses -1 to indicate CPU tensors while NIXL uses explicit + # memory type. 
+ self.device_id = max(cache.get_device(), 0) + caches_data.append( + (base_addr, curr_tensor_size_bytes, self.device_id, "") + ) + + logger.debug( + "Different block lengths collected: %s", set(self.block_len_per_layer) + ) + assert len(self.block_len_per_layer) == len(seen_base_addresses) + assert self.num_blocks != 0 + + self.kv_caches_base_addr[self.engine_id] = seen_base_addresses + self.num_regions = len(caches_data) + self.num_layers = len(xfer_buffers.keys()) + + descs = self.nixl_wrapper.get_reg_descs(caches_data, self.nixl_memory_type) + logger.debug("Registering descs: %s", caches_data) + self.nixl_wrapper.register_memory(descs, backends=self.nixl_backends) + logger.debug("Done registering descs") + self._registered_descs.append(descs) + + self.device_kv_caches = kv_caches + self.dst_num_blocks[self.engine_id] = self.num_blocks + if self.kv_topo.is_kv_layout_blocks_first: + for i in range(len(self.slot_size_per_layer)): + assert self.slot_size_per_layer[i] % 2 == 0 + self.slot_size_per_layer[i] //= 2 + + # NOTE (NickLucche) When FlashInfer is used, memory is registered + # with joint KV for each block. This minimizes the overhead in + # registerMem allowing faster descs queries. In order to be able to + # split on kv_heads dim as required by heterogeneous TP, one must + # be able to index K/V separately. Hence we double the number + # of 'virtual' regions here and halve `block_len` below. + self.num_regions *= 2 + + # Register local/src descr for NIXL xfer. + self.seen_base_addresses = seen_base_addresses + self.src_xfer_side_handle = self.register_local_xfer_handler(self.block_size) + + self.src_xfer_side_handles[self.block_size] = self.src_xfer_side_handle + + # TODO(mgoin): Hybrid memory allocator is currently disabled for + # models with local attention (Llama 4). Can remove this once enabled. 
def register_local_xfer_handler(
    self,
    block_size: int,
) -> int:
    """
    Prepare the local (source side) NIXL descriptor list for ``block_size``.

    ``block_size`` is either the local block size or a remote block size:

    - When both sides use the same block size, the handler is registered
      once with the local block size during init.
    - When the remote block size is smaller than the local one, another
      handler is registered using the remote block length so that the
      descriptors on both sides have the same length and data is copied
      correctly.

    Args:
        block_size: Block size the descriptors are built for.

    Returns:
        The handle returned by ``nixl_wrapper.prep_xfer_dlist``.
    """
    ratio = self.block_size // block_size
    # Each local block is described as `ratio` blocks of the smaller size.
    total_blocks = self.num_blocks * ratio
    descriptors = []
    for region_idx, region_base in enumerate(self.seen_base_addresses):
        # Length of one descriptor, expressed with the requested block size.
        desc_len = self.get_backend_aware_kv_block_len(layer_idx=region_idx) // ratio
        # Distance in bytes between two consecutive blocks of this region.
        stride = self.block_len_per_layer[region_idx] // ratio

        # Entries are (addr, len, device id).
        descriptors.extend(
            (region_base + blk * stride, desc_len, self.device_id)
            for blk in range(total_blocks)
        )

        if self.kv_topo.is_kv_layout_blocks_first:
            # K and V live in the same block: add the V halves after all the
            # K halves of this region so the descs ordering stays the same.
            # This is needed for selecting contiguous heads when split
            # across TP ranks.
            descriptors.extend(
                (region_base + blk * stride + desc_len, desc_len, self.device_id)
                for blk in range(total_blocks)
            )
    logger.debug(
        "Created %s blocks for src engine %s and rank %s on device id %s",
        len(descriptors),
        self.engine_id,
        self.tp_rank,
        self.device_id,
    )

    xfer_descs = self.nixl_wrapper.get_xfer_descs(descriptors, self.nixl_memory_type)
    # NIXL_INIT_AGENT to be used for preparations of local descs.
    return self.nixl_wrapper.prep_xfer_dlist("NIXL_INIT_AGENT", xfer_descs)
+ + Here's an example (non-MLA case): + + rank_offset p_remote_tp_rank + (kv split no) + -------------------------------- + 0 0 Worker0 ---- 1st half of KV ----> Worker0 [ KV Cache ] + / + 1 0 Worker1 ---- 2nd half of KV -----/ + + 0 1 Worker2 ---- 1st half of KV ----> Worker1 [ KV Cache ] + / + 1 1 Worker3 ---- 2nd half of KV -----/ + + + Decoder TP workers Prefix TP workers + (world_size=4) (world_size=2) + tp_ratio = 4 // 2 = 2 + + Considering the KV Caches, if P-Worker_i has cache size [2, num_blocksP, kv_heads, block_size, head_dim] + then D-Worker_j has [2, num_blocksD, kv_heads//tp_ratio, block_size, head_dim]. Mind the "HND" layout format. + Assuming num_blocksD >= num_blocksP, D-Worker0 reads from P-Worker0 by preparing the kv_heads//tp_ratio + first heads from all the slots of all the blocks. D-Worker1 will do the same, but reading the second split + along the kv_heads dimension, and so forth until "tp_ratio" D TP workers have pulled from P-Worker0. + + Note that the above will also hold true for the homogeneous TP case, where tp_ratio evaluates to 1. + + Regarding MLA case, the cache is replicated across TP workers so the rank_offset will just always be 0 + so that the whole cache is shared by "tp_ratio" D TP workers. + """ # noqa: E501 + engine_id = nixl_agent_meta.engine_id + # TODO re-evaluate refreshing for scaling/recovery + if remote_tp_rank in self._remote_agents.get(engine_id, {}): + logger.debug( + "Remote agent with engine_id %s and rank" + "%s already exchanged metadata, skip handshake.", + engine_id, + remote_tp_rank, + ) + return self._remote_agents[engine_id][remote_tp_rank] + + ### Register remote agent metadata + if engine_id not in self._tp_size: + self._tp_size[engine_id] = remote_tp_size + if engine_id not in self._block_size: + self._block_size[engine_id] = nixl_agent_meta.block_size + + remote_agent_name = self.nixl_wrapper.add_remote_agent( + nixl_agent_meta.agent_metadata + ) + + # Handle tp_size>num_kv_heads: replicate KV cache. 
+ replicates_kv_cache = self.kv_topo.replicates_kv_cache(engine_id) + + # Create dst descs and xfer side handles. TP workers have same #blocks + # so we only register once per engine_id. + # Example: + # block_size_ratio > 1: + # remote: | 0| 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12| + # local origin:| 0| 1| 8| 12| + # local mapped:| 0| 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15| + block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(engine_id) + + if engine_id not in self.dst_num_blocks: + self.dst_num_blocks[engine_id] = nixl_agent_meta.num_blocks + + # Keep track of remote agent kv caches base addresses. + self.kv_caches_base_addr[engine_id] = nixl_agent_meta.kv_caches_base_addr + + self._validate_remote_agent_handshake(nixl_agent_meta, remote_tp_size) + + # Number of D TP workers reading from a single P TP worker. This is + # 1 when P and D `--tensor-parallel-size` match. + tp_ratio = self.kv_topo.tp_ratio_from_engine_id(engine_id) + + ### Register remote agent memory regions + blocks_data = [] + # With homogeneous TP, D pulls the whole kv cache from corresponding + # rank. With heterogeneous TP, prepare the descriptors by splitting the + # P KV cache along kv_head dim, of D worker's kv_head size (D>P). + # Eg. PTP1 DTP2 => P0 KV:[block0-KV_0 | block0-KV_1..]. + + # Register all remote blocks, but only the corresponding kv heads. 
+ for i, base_addr in enumerate(nixl_agent_meta.kv_caches_base_addr): + kv_block_len = self.get_backend_aware_kv_block_len(layer_idx=i) + remote_kv_block_len = kv_block_len // block_size_ratio + if block_size_ratio > 1: + # using remote kv_block_len as transfer unit + kv_block_len = remote_kv_block_len + rank_offset = ( + self.tp_rank % tp_ratio * remote_kv_block_len + if not replicates_kv_cache + else 0 + ) + for block_id in range(nixl_agent_meta.num_blocks): + block_offset = block_id * nixl_agent_meta.block_lens[i] + # For each block, grab the heads chunk belonging to rank_i + # of size remote_nheads // tp_ratio, which correspond to + # self.block_len == remote_block_len//tp_ratio bytes. + addr = base_addr + block_offset + rank_offset + # (addr, len, device id) + blocks_data.append((addr, kv_block_len, nixl_agent_meta.device_id)) + + if self.kv_topo.is_kv_layout_blocks_first: + # With FlashInfer index V separately to allow head splitting. + for block_id in range(nixl_agent_meta.num_blocks): + block_offset = block_id * nixl_agent_meta.block_lens[i] + addr = base_addr + block_offset + rank_offset + v_addr = addr + nixl_agent_meta.block_lens[i] // 2 + blocks_data.append( + (v_addr, kv_block_len, nixl_agent_meta.device_id) + ) + + logger.debug( + "Created %s blocks for dst engine %s with remote rank %s and local rank %s", + len(blocks_data), + engine_id, + remote_tp_rank, + self.tp_rank, + ) + + # Register with NIXL. 
def _validate_remote_agent_handshake(
    self, nixl_agent_meta: NixlAgentMetadata, remote_tp_size: int
) -> None:
    """
    Validate the remote agent handshake metadata ensuring the
    invariants hold true.

    The checks run in order and stop at the first violation. As a side
    effect, ``self.enable_permute_local_kv`` is set to True when the remote
    layout is "HND", it differs from the local one, and the permute feature
    is enabled in the kv transfer config.

    Args:
        nixl_agent_meta: Metadata received from the remote agent.
        remote_tp_size: Tensor-parallel size reported for the remote engine.

    Raises:
        AssertionError: If one of the invariants below does not hold.
        RuntimeError: If the KV cache layouts differ and the mismatch cannot
            be handled by permuting the local KV.
    """
    remote_engine_id = nixl_agent_meta.engine_id

    # The TP size recorded for this engine must match the one passed in.
    assert self._tp_size[remote_engine_id] == remote_tp_size
    # TODO We may eventually want to skip enforcing the same attn backend.
    assert nixl_agent_meta.attn_backend_name == self.backend_name

    tp_ratio = self.kv_topo.tp_ratio_from_engine_id(remote_engine_id)
    block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(
        remote_engine_id
    )
    assert tp_ratio > 0, "Decode TP cannot be smaller than prefill TP"
    assert not self._use_pallas or tp_ratio == 1, (
        "TPU (pallas_v1) DOES NOT support heterogeneous TP yet."
    )
    # The layout to compare against is the host buffer's when one is in use.
    kv_cache_layout = (
        self.kv_cache_layout
        if not self.use_host_buffer
        else self.host_buffer_kv_cache_layout
    )
    # Layout mismatches are only checked for non-MLA caches.
    if not self.use_mla and nixl_agent_meta.kv_cache_layout != kv_cache_layout:
        if (
            self.kv_transfer_config.enable_permute_local_kv
            and nixl_agent_meta.kv_cache_layout == "HND"
        ):
            logger.info(
                "Remote is HND and local is NHD, enabled additional permute "
                "on local device KV."
            )
            self.enable_permute_local_kv = True
        else:
            raise RuntimeError(
                "Heterogeneous TP expects same kv_cache_layout. "
                "Or enable experimental feature to use HND to NHD support by "
                "setting 'enable_permute_local_kv'=True in --kv-transfer-config."
            )

    # Block len can only vary across layers when using MLA.
    remote_block_len = nixl_agent_meta.block_lens[0]
    if self.use_mla or self.kv_topo.is_kv_replicated(remote_engine_id):
        # With replicated KV cache, only the number of blocks can differ.
        # Each local block length, scaled down by the block size ratio, must
        # equal the remote block length of the same layer.
        for i in range(len(self.block_len_per_layer)):
            assert (
                self.block_len_per_layer[i] // block_size_ratio
                == nixl_agent_meta.block_lens[i]
            ), "KV cache sizes must match between P and D when replicated"
    else:
        # When MLA is not used, this is a list of the same block length
        for block_len in nixl_agent_meta.block_lens:
            assert block_len == remote_block_len, (
                "All remote layers must have the same block size"
            )

        # A remote block must be tp_ratio times the local block length,
        # after accounting for the block size ratio.
        assert (
            remote_block_len
            == (self.block_len_per_layer[0] * tp_ratio) // block_size_ratio
        ), (
            "Remote P worker KV layer cache must be of shape [2, N, "
            "local_kv_heads*tp_ratio, block_size, head_dim] and same dtype."
        )

    # TP workers have same #blocks.
    assert self.dst_num_blocks[remote_engine_id] == nixl_agent_meta.num_blocks

    # One remote base address is expected per local block length entry.
    assert len(nixl_agent_meta.kv_caches_base_addr) == len(self.block_len_per_layer)
", + req_id, + ",".join(map(str, local_block_ids)), + ) + + def save_kv_to_host(self, metadata: NixlConnectorMetadata): + """copy kv from device to host buffer.""" + assert self.use_host_buffer + assert self.copy_blocks is not None + + for req_id, meta in metadata.reqs_to_save.items(): + meta.local_physical_block_ids = self._logical_to_kernel_block_ids( + meta.local_block_ids + ) + if logger.isEnabledFor(logging.DEBUG): + logger.debug( + "save_load_kv for request[%s] to host xfer buffer." + "local_block_ids: %s. ", + req_id, + ",".join(map(str, meta.local_physical_block_ids)), + ) + # blocking + self.copy_blocks( + self.device_kv_caches, + self.host_xfer_buffers, + meta.local_physical_block_ids, + meta.local_physical_block_ids, + "d2h", + ) + + def permute_device_kv(self, block_ids: list[int]): + """Transforms the layout of received KV cache blocks to the local format. + + This method corrects layout mismatches from direct memory copies by + permuting the tensor dimensions. + + - **Source Layout:** `[num_blocks, n_kv_head, block_size, head_dim]` + - **Target Layout:** `[num_blocks, block_size, n_kv_head, head_dim]` + + Args: + block_ids: A list of block IDs to update and permute. 
def blocksize_post_process(self, block_ids_per_ratio: dict[float, list[list[int]]]):
    """Re-order received KV blocks when the remote block size is smaller.

    For every block size ratio, the block ids of all requests are flattened
    and the matching blocks of every local KV cache tensor are rewritten in
    place (``index_select`` -> permute -> ``index_copy_``).

    Args:
        block_ids_per_ratio: Maps a block size ratio (> 1) to the per-request
            lists of local block ids that were received with that ratio.
            Ratios whose lists contain no block ids are skipped.

    Raises:
        AssertionError: If a ratio is not greater than 1.
    """

    def _process_local_gt_remote(blocks_to_update, block_size_ratio):
        n_kv_heads, block_size, head_size = blocks_to_update.shape[1:]
        remote_block_size = block_size // block_size_ratio
        n_blocks = block_size_ratio
        # actual permute is to convert
        # for local blocksize > remote blocksize
        # ex: local blocksize = 16 tokens, remote blocksize = 4 tokens
        # local block[0] = remote block[0, 1, 2, 3]
        # remote is |h0-b0|h1-b0|h2-b0|h3-b0|h0-b1|h1-b1|h2-b1|h3-b1|...
        # local is  |h0-b0..................|h1-b0..................|...
        # permute is to:
        # 1. view => view remote as n_blocks * remote_shape(H,remoteN,D)
        # 2. permute => (H, nblocks, remoteN, D)
        # 3. flatten => (H, localN, D)
        return (
            blocks_to_update.reshape(
                -1, n_blocks, n_kv_heads, remote_block_size, head_size
            )
            .permute(0, 2, 1, 3, 4)
            .flatten(2, 3)
        )

    if not self.device_kv_caches:
        return
    split_k_and_v = not (
        self.use_mla or self._use_pallas or self.kv_topo.is_kv_layout_blocks_first
    )
    # Only used to place the index tensor on the device of the caches.
    sample_cache = next(iter(self.device_kv_caches.values()))[0]
    for block_size_ratio, block_ids_list in block_ids_per_ratio.items():
        assert block_size_ratio > 1, "Only nP < nD supported currently."
        block_ids = [block_id for ids in block_ids_list for block_id in ids]
        if not block_ids:
            # Nothing to re-order. An empty list would also produce a float
            # index tensor, which index_select rejects.
            continue
        indices = torch.tensor(
            block_ids, dtype=torch.long, device=sample_cache.device
        )

        for cache_or_caches in self.device_kv_caches.values():
            cache_list = cache_or_caches if split_k_and_v else [cache_or_caches]
            for cache in cache_list:
                blocks_to_update = cache.index_select(0, indices)
                # because kv_cache is always using original layout NHD as
                # virtual shape while stride can be either HND / NHD at
                # initialization.
                # we need to firstly get physical view of the tensor
                permuted_blocks = _process_local_gt_remote(
                    blocks_to_update.permute(0, 2, 1, 3), block_size_ratio
                ).permute(0, 2, 1, 3)
                cache.index_copy_(0, indices, permuted_blocks)
+ """ + done_sending = self._get_new_notifs() + done_recving = self._pop_done_transfers(self._recving_transfers) + + # add requests that skipped transfer to done_recving + done_recving.update(self._failed_recv_reqs) + self._failed_recv_reqs.clear() + + if len(done_sending) > 0 or len(done_recving) > 0: + logger.debug( + "Rank %s, get_finished: %s requests done sending " + "and %s requests done recving", + self.tp_rank, + len(done_sending), + len(done_recving), + ) + + block_ids_to_permute = [] + block_ids_for_blocksize_post_process = defaultdict(list) + for req_id in done_recving: + # clean up metadata for completed requests + meta = self._recving_metadata.pop(req_id, None) + assert meta is not None, f"{req_id} not found in recving_metadata list" + if self.use_host_buffer: + self.sync_recved_kv_to_device(req_id, meta) + if self.enable_permute_local_kv: + block_ids_to_permute += meta.local_physical_block_ids + + # post processing for heteroblocksize + block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id( + meta.remote_engine_id + ) + if ( + not self.use_mla + and block_size_ratio > 1 + and self.kv_cache_layout == "HND" + ): + block_ids_for_blocksize_post_process[block_size_ratio].append( + meta.local_block_ids + ) + self.blocksize_post_process(block_ids_for_blocksize_post_process) + if len(block_ids_to_permute) > 0: + self.permute_device_kv(block_ids_to_permute) + + # Handle timeout to avoid stranding blocks on remote. + now = time.perf_counter() + while self._reqs_to_send: + req_id, expires = next(iter(self._reqs_to_send.items())) + # Sorted dict, oldest requests are put first so we can exit early. 
+ if now < expires: + break + count = self.consumer_notification_counts_by_req.pop(req_id, 0) + logger.warning( + "Releasing expired KV blocks for request %s which were " + "retrieved by %d decode worker(s) within %d seconds.", + req_id, + count, + envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT, + ) + self._reqs_to_process.remove(req_id) + del self._reqs_to_send[req_id] + done_sending.add(req_id) + + return done_sending, done_recving + + def _get_new_notifs(self) -> set[str]: + """ + Get req_ids which got a remote xfer message. When multiple consumers + are reading from the same producer (heterogeneous TP scenario), wait + for all consumers to be done pulling. + """ + notified_req_ids: set[str] = set() + for notifs in self.nixl_wrapper.get_new_notifs().values(): + for notif in notifs: + req_id, tp_ratio = notif.decode("utf-8").rsplit(":", 1) + if ( + req_id not in self._reqs_to_send + and req_id not in self._reqs_to_process + ): + logger.error( + "Potentially invalid KV blocks for " + "unrecognized request %s were retrieved by " + "a decode worker. They may have expired.", + req_id, + ) + continue + + self.consumer_notification_counts_by_req[req_id] += 1 + # Wait all consumers (D) to be done reading before freeing. + if self.consumer_notification_counts_by_req[req_id] == int(tp_ratio): + notified_req_ids.add(req_id) + del self.consumer_notification_counts_by_req[req_id] + self._reqs_to_process.remove(req_id) + self._reqs_to_send.pop(req_id, None) + return notified_req_ids + + def _pop_done_transfers( + self, transfers: dict[str, list[tuple[int, float]]] + ) -> set[str]: + """ + Pop completed xfers by checking for DONE state. 
+ Args: + transfers: dict of req_id -> list[running_xfer] + Returns: + set of req_ids that have all done xfers + """ + done_req_ids: set[str] = set() + for req_id, handles in list(transfers.items()): + in_progress = False + for handle, _xfer_stime in handles: + xfer_state = self.nixl_wrapper.check_xfer_state(handle) + if xfer_state == "DONE": + # Get telemetry from NIXL + res = self.nixl_wrapper.get_xfer_telemetry(handle) + self.xfer_stats.record_transfer(res) + self.nixl_wrapper.release_xfer_handle(handle) + elif xfer_state == "PROC": + in_progress = True + continue + else: + # transfer failed - mark blocks as invalid + logger.error( + "NIXL transfer failed for request %s with state %s. " + "Marking blocks as invalid.", + req_id, + xfer_state, + ) + # mark all (logical)blocks for this request as invalid + if meta := self._recving_metadata.pop(req_id, None): + self._invalid_block_ids.update(meta.local_block_ids) + self._recving_metadata.pop(req_id, None) + self.nixl_wrapper.release_xfer_handle(handle) + self.xfer_stats.record_failed_transfer() + if not in_progress: + done_req_ids.add(req_id) + del transfers[req_id] + return done_req_ids + + def start_load_kv(self, metadata: NixlConnectorMetadata): + """ + Start loading by triggering non-blocking nixl_xfer. + We check for these trnxs to complete in each step(). + """ + for req_id, meta in metadata.reqs_to_recv.items(): + meta.local_physical_block_ids = self._logical_to_kernel_block_ids( + meta.local_block_ids + ) + meta.remote_block_ids = self._logical_to_kernel_block_ids( + meta.remote_block_ids + ) + remote_engine_id = meta.remote_engine_id + logger.debug( + "start_load_kv for request %s from remote engine %s. " + "Num local_block_ids: %s. Num remote_block_ids: %s. 
", + req_id, + remote_engine_id, + len(meta.local_physical_block_ids), + len(meta.remote_block_ids), + ) + # always store metadata for failure recovery + self._recving_metadata[req_id] = meta + if remote_engine_id not in self._remote_agents: + # Initiate handshake with remote engine to exchange metadata. + with self._handshake_lock: + if remote_engine_id not in self._remote_agents: + self._background_nixl_handshake(req_id, remote_engine_id, meta) + continue + + # Handshake already completed, start async read xfer. + self._read_blocks_for_req(req_id, meta) + + # Start transfers for requests whose handshakes have now finished. + while not self._ready_requests.empty(): + self._read_blocks_for_req(*self._ready_requests.get_nowait()) + + # Keep around the requests that have been part of a batch. This is + # needed because async scheduling pushes the misalignment between the + # moment in which requests expiration is set (P side) and the moment in + # which blocks are read from D. As P can now more easily lag behind D + # while processing the next batch, we make sure to only set an + # expiration for requests that have not been read from D yet. + for req_id in metadata.reqs_in_batch: + self._reqs_to_process.add(req_id) + + # Remove all requests that are not to be processed (eg aborted). + for req_id in metadata.reqs_not_processed: + self._reqs_to_process.discard(req_id) + # We should never get an abort after setting an expiry timer + assert req_id not in self._reqs_to_send + + # Add to requests that are waiting to be read and track expiration. 
+ for req_id, expiration_time in metadata.reqs_to_send.items(): + if req_id in self._reqs_to_process: + self._reqs_to_send[req_id] = expiration_time + + def _read_blocks_for_req(self, req_id: str, meta: ReqMeta): + logger.debug( + "Remote agent %s available, calling _read_blocks for req %s", + meta.remote_engine_id, + req_id, + ) + self._read_blocks( + request_id=req_id, + dst_engine_id=meta.remote_engine_id, + local_block_ids=meta.local_physical_block_ids, + remote_block_ids=meta.remote_block_ids, + ) + + def _read_blocks( + self, + local_block_ids: list[int], + remote_block_ids: list[int], + dst_engine_id: str, + request_id: str, + ): + block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(dst_engine_id) + if block_size_ratio > 1: + local_block_ids = self.get_mapped_blocks( + np.asarray(local_block_ids), block_size_ratio + ) + if len(local_block_ids) > len(remote_block_ids): + # NOTE: + # get_mapped_blocks will always expand block_ids for n times. + # ex: + # prefill block_ids with block_size as 4: + # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + # Local decode block_ids with block_size as 16: [1, 2, 3] + # expland ecode block_ids with get_mapped_blocks from [1, 2, 3] to + # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] + # Then we clip local to align with prefill + # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] to + # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + local_block_ids = local_block_ids[: len(remote_block_ids)] + # NOTE(rob): having the staging blocks be on the READER side is + # not going to work well (since we will have to call rearrange tensors). + # after we detect the txn is complete (which means we cannot make the + # read trxn async easily). If we want to make "READ" happen cleanly, + # then we will need to have the staging blocks on the remote side. + + # NOTE(rob): according to nvidia the staging blocks are used to + # saturate IB with heterogeneous TP sizes. We should remove the staging + # blocks until we are ready. 
+ + # Number of D TP workers that will read from dst P. Propagate tp_ratio + # on notification so that dst worker can wait before freeing blocks. + tp_ratio = self.kv_topo.tp_ratio_from_engine_id(dst_engine_id) + notif_id = f"{request_id}:{tp_ratio}".encode() + + # Full prefix cache hit: do not need to read remote blocks, + # just notify P worker that we have the blocks we need. + num_local_blocks = len(local_block_ids) + if num_local_blocks == 0: + remote_rank = self.kv_topo.get_target_remote_rank_from_engine_id( + dst_engine_id + ) + agent_name = self._remote_agents[dst_engine_id][remote_rank] + try: + self.nixl_wrapper.send_notif(agent_name, notif_msg=notif_id) + except Exception: + logger.exception( + "NIXL send_notif failed for request %s: " + "P worker blocks will be freed after timeout. " + "This may indicate network issues.", + request_id, + ) + self.xfer_stats.record_failed_notification() + return + + # Partial prefix cache hit: just read uncomputed blocks. + num_remote_blocks = len(remote_block_ids) + assert num_local_blocks <= num_remote_blocks + if num_local_blocks < num_remote_blocks: + remote_block_ids = remote_block_ids[-num_local_blocks:] + + # Get side handles. + remote_block_size = self.kv_topo.remote_block_size[dst_engine_id] + local_xfer_side_handle = self.src_xfer_side_handles.get( + remote_block_size, self.src_xfer_side_handle + ) + remote_xfer_side_handle = self.dst_xfer_side_handles[dst_engine_id] + + # NOTE (nicolo) With homogeneous TP, each TP worker loads KV from + # corresponding rank. With heterogeneous TP, fixing D>P, the D tp + # workers will issue xfers to parts of the P worker remote kv caches. + + # Get descs ids. 
+ local_block_descs_ids: np.ndarray + remote_block_descs_ids: np.ndarray + + if not self.block_window_per_layer: + # Default case: assume global attention + remote_block_descs_ids = self._get_block_descs_ids( + dst_engine_id, + remote_block_ids, + ) + local_block_descs_ids = self._get_block_descs_ids( + self.engine_id, + local_block_ids, + block_size_ratio=block_size_ratio, + ) + else: + # TODO(mgoin): remove this once we have hybrid memory allocator + # Optimization for models with local attention (Llama 4) + local_descs_list = [] + remote_descs_list = [] + for layer_idx, block_window in enumerate(self.block_window_per_layer): + # For each layer: + if block_window is None: + # If not chunked, we just use the + # full block lists (global attention) + layer_local_block_ids = local_block_ids + layer_remote_block_ids = remote_block_ids + else: + # If chunked, get the last block_window blocks + layer_local_block_ids = local_block_ids[-block_window:] + layer_remote_block_ids = remote_block_ids[-block_window:] + + # Get descs ids for the layer. + layer_local_desc_ids = self._get_block_descs_ids( + dst_engine_id, + layer_local_block_ids, + layer_idx, + ) + layer_remote_desc_ids = self._get_block_descs_ids( + self.engine_id, + layer_remote_block_ids, + layer_idx, + block_size_ratio=block_size_ratio, + ) + + local_descs_list.append(layer_local_desc_ids) + remote_descs_list.append(layer_remote_desc_ids) + + local_block_descs_ids = np.concatenate(local_descs_list) + remote_block_descs_ids = np.concatenate(remote_descs_list) + + assert len(local_block_descs_ids) == len(remote_block_descs_ids) + + # Prepare transfer with Nixl. + handle = None + try: + handle = self.nixl_wrapper.make_prepped_xfer( + "READ", + local_xfer_side_handle, + local_block_descs_ids, + remote_xfer_side_handle, + remote_block_descs_ids, + notif_msg=notif_id, + ) + + # Begin async xfer. + self.nixl_wrapper.transfer(handle) + + # Use handle to check completion in future step(). 
+ self._recving_transfers[request_id].append((handle, time.perf_counter())) + except Exception: + logger.exception( + "NIXL transfer setup/initiation failed for request %s. " + "Marking blocks as invalid.", + request_id, + ) + # mark all (logical) blocks for this request as invalid + if meta := self._recving_metadata.get(request_id): + self._invalid_block_ids.update(meta.local_block_ids) + self.xfer_stats.record_failed_transfer() + if handle is not None: + self.nixl_wrapper.release_xfer_handle(handle) + self._failed_recv_reqs.add(request_id) + + def get_mapped_blocks(self, block_ids, block_size_ratio): + """ + Calculates the new set of block IDs by mapping every element + in the (potentially sparse) input array. + Example: block_ids=[0, 2], block_size_ratio=2 + get_mapped_blocks 0 1 [2 3] 4 5 + # remote is |h0-b0|h1-b0||h0-b1|h1-b1||h0-b1|h1-b1|| + # local is |h0-b0......||h1-b0......||h2-b0........ + local_block_ids 0 [1] 2 + """ + if block_ids.size == 0: + return np.array([], dtype=np.int64) + + start_ids = block_ids * block_size_ratio + offsets = np.arange(block_size_ratio) + mapped_2d = start_ids[:, None] + offsets[None, :] + + return mapped_2d.flatten().astype(np.int64) + + def _get_block_descs_ids( + self, + engine_id: str, + block_ids: list[int], + layer_idx: int | None = None, + block_size_ratio: float | None = None, + ) -> np.ndarray: + """ + Get the descs ids for a set of block ids. + If layer_idx is provided, we use the region_ids for the given layer. + Otherwise, we use all regions. + """ + if layer_idx is None: + region_ids = np.arange(self.num_regions) + else: + assert layer_idx < self.num_layers + if self.num_layers < self.num_regions: + # If we have more regions than layers, we assume that + # the regions are organized as [K0, V0, K1, V1, ...] 
+ # and we select K_i and V_i + assert 2 * self.num_layers == self.num_regions + region_ids = np.arange(2 * layer_idx, 2 * layer_idx + 2) + else: + # Otherwise, we assume we have MLA and select i-th layer + assert self.num_layers == self.num_regions + region_ids = np.arange(layer_idx, layer_idx + 1) + + num_blocks = self.dst_num_blocks[engine_id] + if block_size_ratio is not None: + num_blocks = int(num_blocks * block_size_ratio) + + # Compute the desc ids for each block. + region_ids = region_ids[:, None] + block_ids = np.array(block_ids)[None, :] + descs_ids = region_ids * num_blocks + block_ids + return descs_ids.flatten() + + def _logical_to_kernel_block_ids(self, block_ids: list[int]) -> list[int]: + """ + Convert logical block ids to kernel physical block ids. + This is required when the logical block size (the one set by the user) + does not match the one required by the attn backend. + """ + if self._physical_blocks_per_logical_kv_block == 1: + # Noop when physical and logical block sizes are the same + return block_ids + block_ids_np = np.array(block_ids) + block_arange = np.arange(0, self._physical_blocks_per_logical_kv_block).reshape( + 1, -1 + ) + return BlockTable.map_to_kernel_blocks( + block_ids_np, self._physical_blocks_per_logical_kv_block, block_arange + ).tolist() + + def get_backend_aware_kv_block_len(self, layer_idx: int): + """ + Get the block length for one K/V element (K and V have the same size). + + For FA and other backends, this is equal to the length of the whole + block, as K and V are in separate regions. + For FlashInfer, this is half the length of the whole block, as K and V + share the same region. + """ + if self.kv_topo.is_kv_layout_blocks_first: + # For indexing only half (either just the K or V part). 
def shutdown(self):
    """Shutdown the connector worker and release every NIXL resource it holds."""
    # Do not block on handshakes that are still queued or running
    self._handshake_initiation_executor.shutdown(wait=False)

    nixl = self.nixl_wrapper

    # 1) transfer handles of receives that are still tracked
    for pending in self._recving_transfers.values():
        for xfer_handle, _ in pending:
            nixl.release_xfer_handle(xfer_handle)
    self._recving_transfers.clear()

    # 2) descriptor-list handles: the local side first, then every remote side
    if self.src_xfer_side_handle:
        nixl.release_dlist_handle(self.src_xfer_side_handle)
        # Falsy marker so a second shutdown() skips the release
        self.src_xfer_side_handle = 0
    for remote_side_handle in self.dst_xfer_side_handles.values():
        nixl.release_dlist_handle(remote_side_handle)
    self.dst_xfer_side_handles.clear()

    # 3) remote agents registered per engine
    for agents_of_engine in self._remote_agents.values():
        for agent_name in agents_of_engine.values():
            nixl.remove_remote_agent(agent_name)
    self._remote_agents.clear()

    # 4) memory registrations
    for registered in self._registered_descs:
        nixl.deregister_memory(registered)
    self._registered_descs.clear()
@dataclass
class NixlKVConnectorStats(KVConnectorStats):
    """Container for NIXL transfer performance metrics.

    ``self.data`` maps a metric name to the list of samples recorded since
    the last reset. The four per-transfer series (``transfer_duration``,
    ``post_duration``, ``bytes_transferred``, ``num_descriptors``) grow
    together in ``record_transfer``; the two failure series get one ``1.0``
    entry per failure.
    """

    def __post_init__(self):
        if not self.data:
            # Empty container init, no data is passed in.
            self.reset()

    def reset(self):
        """Start a fresh, empty set of sample lists."""
        # Must be serializable
        self.data: dict[str, list[float]] = {
            "transfer_duration": [],
            "post_duration": [],
            "bytes_transferred": [],
            "num_descriptors": [],
            "num_failed_transfers": [],
            "num_failed_notifications": [],
        }

    def record_transfer(self, res: nixlXferTelemetry):
        """Record the telemetry of one successful transfer."""
        # Keep metrics units consistent with rest of the code: time us->s
        self.data["transfer_duration"].append(res.xferDuration / 1e6)
        self.data["post_duration"].append(res.postDuration / 1e6)
        self.data["bytes_transferred"].append(res.totalBytes)
        self.data["num_descriptors"].append(res.descCount)

    def record_failed_transfer(self):
        """Record a failed NIXL transfer operation."""
        self.data["num_failed_transfers"].append(1.0)

    def record_failed_notification(self):
        """Record a failed NIXL notification (send_notif)."""
        self.data["num_failed_notifications"].append(1.0)

    def clone_and_reset(self) -> "NixlKVConnectorStats":
        """Return a snapshot of the current samples and start a new window."""
        # A shallow copy is enough: reset() rebinds self.data to a new dict,
        # so the snapshot keeps sole ownership of the old lists.
        old = copy.copy(self)
        self.reset()
        return old

    def is_empty(self) -> bool:
        """Return True only when nothing at all was recorded.

        Failure samples count as data. Otherwise an update that consists
        solely of failures would be treated as empty and silently dropped
        by ``aggregate`` and by callers that skip empty stats.
        """
        return (
            self.num_successful_transfers == 0
            and not self.data["num_failed_transfers"]
            and not self.data["num_failed_notifications"]
        )

    def aggregate(self, other: KVConnectorStats) -> KVConnectorStats:
        """Append all samples of ``other`` to this container and return self."""
        if not other.is_empty():
            for k, v in other.data.items():
                accumulator = self.data[k]
                assert isinstance(accumulator, list)
                accumulator.extend(v)
        return self

    def reduce(self) -> dict[str, int | float]:
        """Compute compact representative stats suitable for CLI logging.

        Only successful transfers are summarised. When there are none
        (including the failure-only case) every value is reported as 0.
        """
        # Guard on successful transfers rather than is_empty(): the
        # statistics below divide by / average over the successful samples.
        if self.num_successful_transfers == 0:
            return {
                "Num successful transfers": 0,
                "Avg xfer time (ms)": 0,
                "P90 xfer time (ms)": 0,
                "Avg post time (ms)": 0,
                "P90 post time (ms)": 0,
                "Avg MB per transfer": 0,
                "Throughput (MB/s)": 0,
                "Avg number of descriptors": 0,
            }

        xfer_time = np.asarray(self.data["transfer_duration"])
        post_time = np.asarray(self.data["post_duration"])
        # Convert to MB for CLI logging.
        mb = np.asarray(self.data["bytes_transferred"]) / 2**20
        descs = np.asarray(self.data["num_descriptors"], dtype=np.uint32)
        n = len(descs)
        assert n == self.num_successful_transfers

        total_mb = mb.sum()
        avg_mb = total_mb / n

        total_time_seconds = xfer_time.sum()
        throughput_mb_s = total_mb / total_time_seconds

        return {
            "Num successful transfers": n,
            "Avg xfer time (ms)": round(xfer_time.mean() * 1e3, 3),
            "P90 xfer time (ms)": round(np.percentile(xfer_time, 90) * 1e3, 3),
            "Avg post time (ms)": round(post_time.mean() * 1e3, 3),
            "P90 post time (ms)": round(np.percentile(post_time, 90) * 1e3, 3),
            "Avg MB per transfer": round(avg_mb, 3),
            "Throughput (MB/s)": round(throughput_mb_s, 3),
            "Avg number of descriptors": round(descs.mean(), 1),
        }

    @property
    def num_successful_transfers(self) -> int:
        """Number of transfers recorded through ``record_transfer``."""
        return len(self.data["transfer_duration"])
def observe(self, transfer_stats_data: dict[str, Any], engine_idx: int = 0):
    """Feed one batch of transfer stats into the Prometheus objects.

    Every sample listed under a histogram key is observed on the matching
    histogram of engine ``engine_idx``; every entry listed under a failure
    key increments the matching counter by that entry's value.
    """
    histograms_by_key = {
        "transfer_duration": self.nixl_histogram_xfer_time,
        "post_duration": self.nixl_histogram_post_time,
        "bytes_transferred": self.nixl_histogram_bytes_transferred,
        "num_descriptors": self.nixl_histogram_num_descriptors,
    }
    for stats_key, per_engine_histogram in histograms_by_key.items():
        for sample in transfer_stats_data[stats_key]:
            per_engine_histogram[engine_idx].observe(sample)

    counters_by_key = {
        "num_failed_transfers": self.counter_nixl_num_failed_transfers,
        "num_failed_notifications": self.counter_nixl_num_failed_notifications,
    }
    for stats_key, per_engine_counter in counters_by_key.items():
        for amount in transfer_stats_data[stats_key]:
            per_engine_counter[engine_idx].inc(amount)
class OffloadingConnector(KVConnectorBase_V1):
    """KV connector that forwards every call to a role-specific helper.

    Exactly one of ``connector_scheduler`` / ``connector_worker`` is created,
    depending on ``role``; each method asserts that the helper it needs
    exists and then delegates to it.
    """

    def __init__(
        self,
        vllm_config: VllmConfig,
        role: KVConnectorRole,
        kv_cache_config: KVCacheConfig | None = None,
    ):
        super().__init__(vllm_config, role, kv_cache_config)

        spec = OffloadingSpecFactory.create_spec(vllm_config)

        # Start with neither side instantiated, then build the one that
        # matches the role (if any).
        self.connector_scheduler: OffloadingConnectorScheduler | None = None
        self.connector_worker: OffloadingConnectorWorker | None = None
        if role == KVConnectorRole.SCHEDULER:
            self.connector_scheduler = OffloadingConnectorScheduler(spec)
            return
        if role == KVConnectorRole.WORKER:
            self.connector_worker = OffloadingConnectorWorker(spec)

    # ------------------------------------------------------------------
    # Worker-side methods
    # ------------------------------------------------------------------

    def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
        worker = self.connector_worker
        assert worker is not None
        worker.register_kv_caches(kv_caches)

    def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None:
        worker = self.connector_worker
        assert worker is not None
        metadata = self._connector_metadata
        assert isinstance(metadata, OffloadingConnectorMetadata)
        worker.start_load_kv(metadata)

    def wait_for_layer_load(self, layer_name: str) -> None:
        # Nothing to do per layer
        pass

    def save_kv_layer(
        self,
        layer_name: str,
        kv_layer: torch.Tensor,
        attn_metadata: "AttentionMetadata",
        **kwargs,
    ) -> None:
        # Nothing to do per layer; stores are started in wait_for_save()
        pass

    def wait_for_save(self):
        worker = self.connector_worker
        assert worker is not None
        metadata = self._connector_metadata
        assert isinstance(metadata, OffloadingConnectorMetadata)
        worker.start_store_kv(metadata)

    def get_finished(self, finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
        worker = self.connector_worker
        assert worker is not None
        return worker.get_finished(finished_req_ids)

    # ------------------------------------------------------------------
    # Scheduler-side methods
    # ------------------------------------------------------------------

    def get_num_new_matched_tokens(
        self, request: "Request", num_computed_tokens: int
    ) -> tuple[int, bool]:
        scheduler = self.connector_scheduler
        assert scheduler is not None
        return scheduler.get_num_new_matched_tokens(request, num_computed_tokens)

    def update_state_after_alloc(
        self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
    ):
        scheduler = self.connector_scheduler
        assert scheduler is not None
        return scheduler.update_state_after_alloc(
            request, blocks, num_external_tokens
        )

    def build_connector_meta(
        self, scheduler_output: SchedulerOutput
    ) -> KVConnectorMetadata:
        scheduler = self.connector_scheduler
        assert scheduler is not None
        return scheduler.build_connector_meta(scheduler_output)

    def update_connector_output(self, connector_output: KVConnectorOutput):
        scheduler = self.connector_scheduler
        assert scheduler is not None
        scheduler.update_connector_output(connector_output)

    def request_finished(
        self,
        request: "Request",
        block_ids: list[int],
    ) -> tuple[bool, dict[str, Any] | None]:
        scheduler = self.connector_scheduler
        assert scheduler is not None
        return scheduler.request_finished(request, block_ids)

    def take_events(self) -> Iterable[KVCacheEvent]:
        scheduler = self.connector_scheduler
        assert scheduler is not None
        return scheduler.take_events()
request ID -> set(block hashes being stored/load) + self._reqs_being_stored = defaultdict[ReqId, set[BlockHash]](set) + self._reqs_being_loaded = defaultdict[ReqId, set[BlockHash]](set) + + def _get_block_hashes( + self, + req: Request, + start_idx: int = 0, + end_idx: int | None = None, + ) -> Iterable[BlockHash]: + return islice( + req.block_hashes, + self.block_size_factor * start_idx + self.block_size_factor - 1, + self.block_size_factor * end_idx if end_idx else None, + self.block_size_factor, + ) + + def get_num_new_matched_tokens( + self, request: Request, num_computed_tokens: int + ) -> tuple[int, bool]: + """ + Get number of new tokens that can be loaded beyond the + num_computed_tokens. + + Args: + request (Request): the request object. + num_computed_tokens (int): the number of locally + computed tokens for this request + + Returns: + A tuple with the following elements: + - The number of tokens that can be loaded beyond what is + already computed. + - `True` if tokens will be loaded asynchronously + (between scheduler steps). 
def update_state_after_alloc(
    self, request: Request, blocks: KVCacheBlocks, num_external_tokens: int
):
    """
    Start tracking ``request`` and, if it has external tokens, queue a load.

    With ``num_external_tokens == 0`` only the bookkeeping entries are
    created. Otherwise a (source, destination) transfer spec is stored in
    ``self._reqs_to_load`` and the hashes being loaded are remembered in
    ``self._reqs_being_loaded``.
    """
    req_id = request.request_id
    self._requests[req_id] = request
    # the block ids are updated in _get_reqs_to_store
    self._request_block_ids[req_id] = []

    if num_external_tokens == 0:
        return

    # Only the first block group is used
    gpu_block_ids = blocks.get_block_ids()[0]

    # Blocks that already carry a hash are counted as computed
    num_computed_gpu_blocks = sum(
        1 for blk in blocks.blocks[0] if blk.block_hash is not None
    )
    num_computed_tokens = num_computed_gpu_blocks * self.gpu_block_size

    # Computed + external tokens must end on an offloaded-block boundary
    num_blocks, leftover_tokens = divmod(
        num_computed_tokens + num_external_tokens, self.offloaded_block_size
    )
    assert leftover_tokens == 0

    # The external tokens must exactly fill the not-yet-computed GPU blocks
    num_pending_gpu_blocks = len(gpu_block_ids) - num_computed_gpu_blocks
    assert num_external_tokens == num_pending_gpu_blocks * self.gpu_block_size

    start_block_idx = num_computed_tokens // self.offloaded_block_size

    assert len(request.block_hashes) // self.block_size_factor >= num_blocks
    hash_range = {"start_idx": start_block_idx, "end_idx": num_blocks}

    src_spec = self.manager.prepare_load(
        self._get_block_hashes(request, **hash_range)
    )
    dst_spec = GPULoadStoreSpec(gpu_block_ids[num_computed_gpu_blocks:])

    # Requested a second time instead of reusing the iterable handed to
    # prepare_load (presumably because that one may already be consumed).
    hashes_being_loaded = self._get_block_hashes(request, **hash_range)

    self._reqs_to_load[req_id] = (src_spec, dst_spec)
    self._reqs_being_loaded[req_id].update(hashes_being_loaded)
    self._next_stored_block_idx[req_id] = num_blocks
def update_connector_output(self, connector_output: KVConnectorOutput):
    """
    Update KVConnector state from worker-side connectors output.

    For each request reported as finished sending (resp. receiving), its
    tracked block hashes are removed from the in-flight map and, when there
    are any, reported to the manager as a completed store (resp. load).

    Args:
        connector_output (KVConnectorOutput): the worker-side
            connectors output.
    """
    # Stores that completed on the worker side
    for req_id in connector_output.finished_sending or ():
        if stored_hashes := self._reqs_being_stored.pop(req_id, None):
            self.manager.complete_store(stored_hashes)

    # Loads that completed on the worker side
    for req_id in connector_output.finished_recving or ():
        if loaded_hashes := self._reqs_being_loaded.pop(req_id, None):
            self.manager.complete_load(loaded_hashes)
class OffloadingConnectorWorker:
    """Implementation of Worker side methods.

    Submits load/store transfers to an ``OffloadingWorker`` as numbered jobs
    and translates finished jobs back into finished request ids.
    """

    def __init__(self, spec: "OffloadingSpec"):
        self.spec = spec
        self.worker = OffloadingWorker()

        # Next job id to hand out
        self._job_counter = 0

        # job_id -> (req_id, is_store)
        self._jobs: dict[int, tuple[ReqId, bool]] = {}
        # req_id -> active load job ID (at most one per request)
        self._load_job: dict[ReqId, int] = {}
        # req_id -> set(active store job IDs)
        self._store_jobs = defaultdict[ReqId, set[int]](set)

        # Requests that finished generating while stores were still running
        self._finished_reqs_waiting_for_store: set[ReqId] = set()

    def _generate_job_id(self) -> int:
        """Return the next sequential job id."""
        job_id = self._job_counter
        self._job_counter = job_id + 1
        return job_id

    def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
        """Register every transfer handler the spec provides for ``kv_caches``."""
        for src_cls, dst_cls, handler in self.spec.get_handlers(kv_caches):
            self.worker.register_handler(src_cls, dst_cls, handler)

    def start_load_kv(self, metadata: "OffloadingConnectorMetadata"):
        """Submit one load job per entry of ``metadata.reqs_to_load``."""
        for req_id, transfer_spec in metadata.reqs_to_load.items():
            job_id = self._generate_job_id()
            self._jobs[job_id] = (req_id, False)
            assert req_id not in self._load_job
            self._load_job[req_id] = job_id
            # Submit outside the assert: assert statements are removed
            # under ``python -O``, which would silently skip the transfer.
            success = self.worker.transfer_async(job_id, transfer_spec)
            assert success

    def start_store_kv(self, metadata: "OffloadingConnectorMetadata"):
        """Submit one store job per entry of ``metadata.reqs_to_store``."""
        for req_id, transfer_spec in metadata.reqs_to_store.items():
            job_id = self._generate_job_id()
            self._jobs[job_id] = (req_id, True)
            self._store_jobs[req_id].add(job_id)
            # See start_load_kv: keep the side effect out of the assert
            success = self.worker.transfer_async(job_id, transfer_spec)
            assert success

    def get_finished(self, finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
        """
        Notifies worker-side connector ids of requests that have
        finished generating tokens.

        A request is reported as finished sending only once it is both in
        ``finished_req_ids`` (now or in an earlier call) and has no store
        job left; a request is reported as finished receiving as soon as
        its load job completes.

        Returns:
            ids of requests that have finished asynchronous transfer,
            as a tuple of (sending/saving ids, recving/loading ids).
        """
        finished_sending = set()
        finished_recving = set()
        for job_id, success in self.worker.get_finished():
            # we currently do not support job failures
            assert success
            req_id, store = self._jobs.pop(job_id)
            if store:
                req_jobs = self._store_jobs[req_id]
                req_jobs.remove(job_id)
                if req_jobs:
                    # Other stores of this request are still running
                    continue

                if req_id in self._finished_reqs_waiting_for_store:
                    self._finished_reqs_waiting_for_store.remove(req_id)
                    finished_sending.add(req_id)
                    del self._store_jobs[req_id]
            else:
                req_job = self._load_job[req_id]
                assert job_id == req_job
                del self._load_job[req_id]
                finished_recving.add(req_id)

        for req_id in finished_req_ids:
            pending_req_jobs = self._store_jobs.get(req_id)
            if pending_req_jobs:
                # Stores still in flight: report once the last one completes
                self._finished_reqs_waiting_for_store.add(req_id)
            elif pending_req_jobs is not None:
                # Entry exists but is empty: all stores already completed
                finished_sending.add(req_id)
                del self._store_jobs[req_id]

        return finished_sending, finished_recving
100644 index 0000000000000000000000000000000000000000..4fe9ae38b6c0d67d4c9b683539dd600be7987911 GIT binary patch literal 193 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJVCFqyr7U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?@$s2?nI-Y@dIgoYIBatBQ%ZAE?TT1|7BT{HF^KVz MnURsPh#ANN0KMKbVE_OC literal 0 HcmV?d00001 diff --git a/distributed/kv_transfer/kv_connector/v1/p2p/__pycache__/p2p_nccl_connector.cpython-312.pyc b/distributed/kv_transfer/kv_connector/v1/p2p/__pycache__/p2p_nccl_connector.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af15664848237285506347eda197d8ea42b6772a GIT binary patch literal 20406 zcmd^nYj7Obm0r&i^8#jo0Wf%@@gRvI@gVpn2vK}LMer%fLTrr&=mrUJFavcDNCFsKv?5)IqQX_%#YV#}K zx&4?KU_fbi^CMMh5_fL*ecyY|x#xWMbpLgExrf6Qv;Wb!be`jWOE<>l)FW$qEFAZU z6F9*V<0dV`77N9#G3%sl*fwb&wof{S9qgGc<{U0Vnl0uUcH!3^b5DAPJuJ-;^G^DP zeUs(G<&zb|73`Ta=AR4<2Uxr;RykQUT*cz9Sa7m>xSGY?v6{)+;aV2=#6pvG!*v!; zv`H`_7EbfoBO|}oWTR1BxR*M}U zU|e(!cZwD(cbXFd_c)>Q1FQD*1MP?Y-D$8M;T7K<~OD7}8&zwAd{_N0c)pj&Kqq;6mCC3x-Xlw|%J+H-L zlgAVBvGME3UUmL8#N*;4E>cn2g*`NHHODT}n((;gNSdKAw!k64$Sb5^gImymIt-VsbJOH!7xLS4VG% z!gNfOE>0(>rjwNKXfi3rsnZ4dDo!P&+fhkChm+zR%2tj}MN#-zEHQdhE_k4i&!U!H z5#OE`<>aV7-|WY$->gZ9JmN%d*dlPl)>pCeqFt~I;;HDE@aeeUOOioMjQF}rE$`B_4S{p|2h)e1u$Z-=o*%jpFQdWaR z=j=&`k&@z4_9WHAu9U^d^K*+qd{?=!btr6;LdYT2Q9uI^+f@f>X!M5a#LCGDNvg$D z)fN-uVTYEt!7R{7Nda@X5v_zwo#5&E-xGbw$*I1n@u{9Yj|=6eE*ZjoC_23oT2p`Dh2aB=;b=%RP5-O>OTFzBl;Tz0$ZV?W+6OUz-iJ zWox&79BNoR_=9(zm)QfJ9A|fX)=+>O3VT&oBodEKijj!wi9{w7MC6G3B9XVJqcJ_j zh5@MlNF*AMCz4U3c{vi1T2O=PyvC+dwT;JaP0CfqZK=&RO*-mDHN07K^xx$$UIqdj-LI?jlkV9U6}b)_dHr zQ?P+g?BF~X;~$3^bBZpZOmKmJ-1qFmZt#sqt={E; zvA$gAQd|Tv?V90!W`RVM`f>X+i)xp}*qBVEc{Z1{cuaalX4^Jvw%5pV2*qSKg1cO{ zu3_H(y>e}K7?TtezSM+lL!&0g;m(CW*3B_Q9-)&H_+5WZP>yw1(&YXLlV|f;QkIl8 zWlPyV&~q8QY~f}&X*cqP?QHO>GdeXT#s!UfIw-**?Zrzwz$pIA z%jAhWps#lexWdf|TLh~>JBUqiU{`Sx7mPXQxiJg&jq+iK;1b+|M|28a!7}C)eBgyL zp&YzH_jaLzc!S;hMb88=|TaS#KN4G>DDDR-v87j@&NPt@X(+blh{8J8Tnjb>8F5`{sG> 
zj%Bz7+txNUu%14LR4;8Kv?D+vl8A{^y9E74_0tYQ%7u{@L|V~qODZhmT06wdk1W`4 z&8fd{PjTM^hie>|-#aaMGGX$pnVp+3)ZzRi5>;l-ak?`l2INBt#n#(u%aci{eN)XwCSz8^tuDao|AN5cm3Rypcp0XBDyYFofi|R*uv#%L)8i=#n#iaqK|7c8H~!HZ)A5@qKP8D{(?`?xznCUvP`DW7f{K3oV((Fnc}L;)^%hnYqCu{a%G&m{y8aO5YJ89 zGL5^H#@(65!%E}fdHXYD+`X56`D|mWk#&MoS@!olFXze|Sfe|X#vR#EV?@b?J25%OtXGyEy?_mDrvTzH(?Q7mzhp6N~C67w0N{0Yb_8_?{e>! zKjQ9}R4XLQj0Gu$W%WAlTF-Icwv-lF{|tNfU6d58NmF!9n4-q$1neQ(m!#XjB;AoT z_D&Aglx55_Uyi-eioMY{XG6cs^KBWi5A+Dk8o%)jMvW;#3>#e5#%HH-r0f$WV;f~S z?gaOZ|HW~0j+A4<6qfnaIVajD1#jJzatbb^jjv-~=gQ_>sWQPmi+o916LzIsC{5xng>kuBWod^ao>?N1(rb)aus%OQ)9|ggJ59_rj=kG{QPA21n}L5=xMYDA^#Le{u9ThC?}Ek$WJR!#S!u(Yap?Dsl)V9vrS>#0(n^Uh z!CAWgl+Cc?V81yw87muyppA86ld-b-n0wJ!O)>Gpv3hB2fg$<`MMp*c_IUCJe=8aTu&9ZKw+wQAtGlRMH^3+{ zk|x49Negf!KN1}Us>j$TF`Zl|4|u+Z2SUnsU6_s~$9rA@tQwVg6AKOV;O0{RdT*SK zqq+eauMVAo<%^k*hBF=)#5)j1(O5#4Xk+7&oJ4QH(=vvb=S<2@t+U$A2G_SxqDEPS zSQ3gxO*IuXQIvLejESSXfPpT0Yg|AlnL^8-Jt6Z#0`m$ijGv4qM{lglfBv)r>QWw==DPqjVst=92}|d(UclzZ#xViYGM0`u8JarG=(@SP!Z7L$RL=85 ziMZIyrpshEz@t>>Mt&qdCEua8` z*4gOUU4E1UTjX$=>KT{E<3L*Du#d=~g(Y?)Iwh)gKmq{H>+BX0?~IC5$q2|?;whI? 
zJK|8wbjk{sNw49#^g0C$B&R)2b(@W=<&a2m20N3mVsM*G@B6g%zhf=DjOGHdba)0v%uhoj=%E$ zrT3$we_4^fa&`I0)zzlX_lMpadfcxx^(?rvf$B`4M+x*i7N30W>D$i&M?VhMF3kMk z@X{O0!QO25o`sW(;iapKz01|%)dNS^-MOVV(hYr2g6W1mO7-54s~a8-JsiqeE%p0; zSJ$|B_9qqDy0&ax`{!O~O)XhGp+}7m8y7D>Y+34GuI$YIecJEazi{%$ZHsR_wxt_( zELV509y+c)Sekw8PB-s*ay;FzU#ULuC7tkdRa=<3bxO4isFsdtTDl_If-E80?dCx( z)pV-KNtY>zQb3Bdw4DM%A=OZX4#Z5IF~8XVgt8Dw$lzCovZ{fpsD$&kJUvPmsgBBc z0yRudjpO8it-h?5p;i!&q*mM-my$S0h-j9!>XpP%9D!)ZFK<%y1a>p>3s-1>?p1nl zje-#bs*8?27=B)+H>`EWIzFY^Nqnjmk~j$+F#_#U5@noBu}(0(uYkg7^fIx_Ta0}5 z;E0A0mlsjzY|D$OH$heXOVH=v0UmDRe1VL&aoO9L^;NCaj@>u;KRY34e{UAHa?0yjO84NKX-C<&5u$KQ@Jv_ zb8+=eIX7a!3M#7p;H6l~!i9S>T_=>T6DwV(=3O8A>leG8bq+rBzm#^pl=i>0>J6q@ ze}E0w@tIK9vryOk>1;ipso$a0@0dTwjQegS)ctt)Y0XOL<@;ymk1g1<&84N&C`QN9 z-pAXY2rG?;7F^k2V>;N8t!`SaU)~2cj7s1#~s6}UnbjM3iZ)aYpg|SFgn)=8PJ8TGB zHBO^xk!hRad5sN%x1%@;$A~pPX3~1&fk|(PT>pcrqP+neQN_9a8X!bA6zhN{dkEgx zU_LPf=nF8@ySB}am1IIcUV#4q;W3S>QZO|To)@RvU!0D!4jaxo9KdtPs4-0|3`f6A zJjYppo;Z^x1a2CD8lK}U1%v}FxlKq+s%(NLYTX5VhMOQ$bKRA42~MMShUB<`x44Be zLN$^#M&^_UrAwT=P_7R!_jFNw&I`oCfg`!HQn-)N4$zU3t1&^q@45^9b1{U)!a`rl zn>SJ)%rI=j9eso@XxN4vy+s^I>Yu4n>gu4z6Ab6j_)W(#G(Iy)9GWMUHn+3_b?-&2 zV1P!(jMklD7@MHuzQ*yvY!13V=SBj{yTB%RwarJQtF>FWsafEjR%H=}qU19ReFnQ5DO zjov%5%xGG5%j~6%x-D3rGHDZST4p7S!|s8GE!pe)_s2$$#e9w|xrXx21a16wbrR++m--VlmW7P5?`Nebc= zkO9E#W|_`*uvl;5PrD3zqHs=2*%O87pP>@@a|D>w+*TaQW&Ewn{??NAM9$vqt}|^1 z#D5oRV%CBKao7$O!S8?l{?|V*!(Hw$*V3M8KBzPwTxmX(b~V6GNH=sp>pk_%e>&|t zo%Wy3hQgW9u4kcLn_36eE1|<*ZXNsvmc-Yd+|KMjqwGJk5;{A7CJU>`$16TQAJV;nGToek530K{5RkP_cd) zsvCJ689qn0Jl(0~rnU=8N=kcBtk46iAQ?+qVx{xNk-nGMhpK=Nme;SFjlobRxJ?Oe zO9ywSeY+7B%w4bI>;08)XU@U-^h4Ib803!(Z&b3w2{9@%Y-mSj8ugzbvf0$VaEhpg zbyq2!dX5|s*5#wyQra;=d9%)_JQE+iAtmC8X|%1oZWcr6 z$!Xg0p`~__tbzL4r)#mOzMCh&gPs|uc6b;rmh{)4#hLAq9wzruMKX%M&4xcY4McN9 z)F5&?FMtCEZ=%vr9;4=5kPw2i1FTn4gJGsZDTFU3R8_iRDa@XolE(21l&GLD@Teo4 z2)>^M{zfDd3EE+Ya)y&eE#MOPd6K0_-pQ#dTdD1Y+MOXJ}9;Ln1xF}A4YqV)>##Q8{ zfa9z}{a7!5$0A!T+;wiwR^Y^J`vBvjOR&zue;~!eIeQ9T+c!~%1dls*hXXnAj3fFV 
z3wh==2q1qz5f^Xk1g~+&Xf#g1f{suaDwRni*w{r0uJ(BgYebyE2?-)J8P~|w=!Z_OriYS_VJ|(#f z!y0dvHC`ZIeWvc2?%e;hZMpN%Y9N#i)Mf&mN}zKkuyyHeB@oUzY=af^Co;ZH#n(vy z?0#j-{$<|*c!q5mO#54zcvS?TrMXn3>_|0zxbhV;L=2u)Nz;Q1`N;fQcNy~CkmyW) zgDOkmcp3$ku7}ZO2_G-yF5$OEbot<2(7TE9<-1k7MCn$I-XpaH?x^>-BzpLB>j(M@ zz*p%%ZPI2*53*a(p&$Is|0{h5C6Nui?kb69mRt#A-7TQa2bb*RK;5*iGThENldZQ{{RagMz~Xk`&~En-=Z;xWF}^ zV@9AG#wXcamY_nKOHh>?EtE(t-GVWCMn4sEew+u`*^CBK6{Y+T;Zz=I;N~h*8`rC} zbaACBzsjtj-y*o7bkD=u92ACz?8hqzn$FvWz3JHWIQ;fVXEVVW6Y17{2cdPzNpY$` zPi04KrrU<@`!{s)8qYj9X5eBo1CvXP!N`L7M3t}Kt@8Wz(U_d*W~P&NGJ)32ns37i z9cCKYW2_RU0O;oxW_JtvwNZaw-!9*^{#8>A<~Q^4+HT6P*Q}XZ0j+^SRSlR{&8LHo zWQLf=8LnaH`D&F>mB5rk=*+UFid2&dm`~IcV|s~`UkAD*|bhPGuw1HTFl0H-6nmcfHv2Qpn3l&%XnM<=h9Pi@w&s&t?eXZ|g_ zo}A9?xv1cO^Th?vYOwiV+m=peItP``!R5|lA6i%V1e~!P>1b$=5Wml9LS&Brn^#_5F9=dnennCDXD` zY1y~jGWfKe-SsIgeNQY(%dTu|N2aw;Y3<9oxyrWZoTIV|uc1}KG_U>sI{<>KGi?Wz zwu8(5gUog3A6IKlCeWn>x-x-oCD8r&^*?*_XK((Oz+mnWS9b_NDCVzrP^lf9_kLX8 zzH}(lu}|sPm#N=>|LpwH`CHlco=p2urTyrKZA$wI*eO}OmeI6VY1#{8rs=@^h3wYu zC*I7?^UBWi>8-*llx?qk^qR8m8m2$n+@IcaT4_EFM~P4~jd8Dn z|DnCdo?<8Get#?|Jf?!Vf;Ztne>?7*_c6 z3um((+cF*dm5%*Sol3{cA6k@-V`yXU3!%NNtT zFPSS?t5okKe~kRvZOOLuW?QyxvO<_kQuwtQeO&vb`dMIKHcid=Y{uO_Jwz)0SyjN-7t3l0Ax2`lFedt*zTdd5s?s{@rY2CNr%Qkgnn&73?|D@;1 zV7lqZf*ZtCx242P6!&r4hCcp*QPJxbuHisVu;A^JjEh%Xe8#m^aczC(>H>Srdo_TE z_RAs|$BqZg=(vQ)X69rm^AQRWm0fq?RRwb+C*v@B53K7kku1)0#sK+e(cwX}evq53JdP5zV1i^DWEo*h69K zDLO7=+tVLo)ijIRrtKNbq9)qVkIQt#Bw&ovSe}iqRQImW^4~~Scc3buBRr!;dPP{H z$fZ!GZMV|4`^mLuu7Op5gGQ6II~N(kbRxlM@-GkpODbAok zn8P-y4<=ZN6Ambx=qFux-G~ftc5RZ^Zsaw7q2}9D4gw+bKK}yTV19y^o&MCU`P-Io z+0bl#BoYzeZ7pcdc#*E&?css0uM0cEv&~4--`Zo2dIf1n$scykdasx+y!>oO-cc5t zX%a`Tyzx3BBV{@`4Lek;gpW+>9kit$z1mrwx^g}J1 zqElumt~wY(mOtQ=2?ipnHDKy7+?Sw-_5P55AbJ=@-n-nVPR>`Gt=)D1^lI(C`O`R= zs|-GJKXfl{TN+pi^v*ke?G3C})@tsgKWLdhM!u@kkG}cvn@jyacxNTJb^gR^ux25- zaA5x8#}##phn8lSV(DG4rt4o@sQ|wHI-{o|@kV}0jFhs7yo;Anpn2;Hj|b*uXxShc z%L~)(Nve%qIKR4_O?8>J#U4#l;xf)Pi%S8VK4jZV$-Pe1ZVWgcp`iA 
zepG__5s+_NW^m{|RCHEIUEGbowIc|AWH}F`iU$4-fFjKNv9QoegS~HO9ko30(>Dj` zU6?zWAT)6qvL2dw$(@>b=Ir%Jg71i2!Bcj6eMCEcJ49J2I7bP8 z1s?z69tTh3eFLAmTkwgAnzoz+zc3X1Ex9s^xwwY*r9P#8*XNWWcgAwd;;2gZyp-c8 z@=;LxO<#U>jXlUs+1w8QfwU5IWLRS=PTCADf-zdtIg4$^He%svt;>>rQ<(!aLiV7k1yjv)Fn6 zTT3>|@p#Wq{qV+C+&kAClsDJg=-9qm*^qPKmn^2XTp7h&oX3x{C+24qcm0~19u0UL z71d79VCSoDmo%F)PLz%LSvZ*NUO$pZtCbjq62YntNxUxJ zk^T)mB^uSfJkg7-*ZlHCKb;Nt7JSmE7oJ1Lxj4m5qqp}~fA0vs*Cmmp(rA`gLp}2M zs$pu^R4@Ltk0NP`p7s)d>xOEu8x4*O(YwC@>qfpa(8K%*zDWr@MRi34ns3XglBafMEI+063s^4#9(DhS^PPH|$^d`l~@mG77gYkPW z-F8yYjzDz{YF~XiEd6KP(l##>Jd(3nESAq*oTdH0bA1Ze_gk*&zjD66;oAS!YPVQF z(gF$RS~Lf!xu&fxJ=MfNj)1U?0sN$RBkKIEbG< z;2bR&C?IA-pm4Nkpoo}_0oQ2pK(T^SQa*)$k5Dn`>nnW+>V|5-O`8VFe0tjKD<4+U zS+A+z;XSB<3ZGF)9ir&$*C^WZj*@p4eJhEd75s7pzk#Yrg(vqD4A|#Ua2ne8(u5BT z*+XN&u84QMoeN6XQaCqdLN5h%= zy1S1Z8#&7iGLxL??8Gp5dc%|BKF&NeK@Xk|Grqx5Z(W20!cqPYjs?}u=` z!HH2!yWbZKjWNf4qhrkE$+58jxM|P%!rqZ_Fl7%-FpMu4_J&A$pSYm{oEc(zhh7{E z27Q6hP_nhKYLpKCD&X(lHfWQhlmf=c2sN!Z2jk>-71PRyvWNO=B}GlE!Wfz;f3o!i zJAVDbI7@MisDyrrC?YECxU1D_Dy$WP_zi+d`qIjXLg)bnWuPL|qw}Rnx{nQ|#@uPm zW75)+P}FZL=M@Ty)O;V-97|sS@xSg6Opi_{S=Z7>hw_`GS_`HK%s;92SD#Q*lh8T; zZZ0md>Se_rs$N!jbbTHzmy6@sd%Wu_-oEZ9yZZWikDV6W4)>+F<&bdQXO8zC z_xAR69XQs@Wf7+XXAT}beR!akGZO2`Qz!aQa0X)TI(F#9lqUp(dc1Yqi<7{coOku^ zF=pi4NN_OVy)YP<@P#_uRUsx1nvk)9#wl7DJA9#h%CQiq^93)AFk?YZ1rvnR2FA|8 zoaNLb!QnAZH9j`(Q8PIh%aV~Or-z7;0Tbd>sH-868`S6Ln!=;wP2(fu4dFos(rmam zv2*vX^E-V_fswOK1bsu{F{bH4 zOA}NO|M*J!f>1SIN($*Qvf@)Ro+Ejt>eJ!Oi{o2v5AB|c&Z+q9&$2ZkRO>LrX z=T+r3;}v5xuZGoc%e2-u2)@;t^^ zyVPL$z}Thfv8aAzQGsO$!iY+hG~Igw4D*za!tHKAd4}?-d}>;Olbx89#H8_?li^c7 z5RNP`lWJJMvrhYh^ivbQ3E$zcZ*b7I@8}pk5kRxq>wRfrFp%_7!9d|GUN0=qW8uN@$XGB0VKgv4hF;{f{Az!T zx`*XraII#l1+5zlAqQx`LTy-bzuA|tIbv!@!klyM*p*{zYA4ewgiLFhEO142?;C_D z5L(6n3$B5pOK7D}MPnPOeHvN=W-WXbQ?OOgI$AMoruDCB2DEf3Z4lgHvo(U74q}(m zCU7e4ovy!Q~uW?V_2~F5Nb)#8AU(+5z$MX(y!S zfZ9E*W(7VKge`>9IUfVP{9 zPKJB!Kqt<_eiEU)VC7Xr4K|S?fE|wViu-#QsRN??dch=pX$b4bg)u|DMSWW-?);Qj 
z3=tN0yoe^nUo2@v6*7h`LSh1K29yV`&_qFgf-&VQH_*iN)l(31S{u>Q%JbV`>X^c~ zI}k;S^vO41hlhGg<`MKD4AF*RY50SzNt>S7_h^rbbl zR@ncibzdT$E+w8m0yeB`NE6ZXCdNWzjl{MkHmi_ta13IQc8ch+wNgWVi8AOV+GW}h zF4YZy%jwoo;+sbJpZ4TqwMU)@6u?!V43VjAGoAqtgbZSp!M@*@s3i|8eePT$q zzgWL;jgXZeS|w8n;|yvojtoe>HZvU1xD8v~!nhO@?iUe0kb%$V&IO!$P5 z9<;PzO!@ld)(C6(q!2`CYhj#Co5QCCzm!$eOy^7ie9Vw-55!@m4S-~uQ{XWSon1pgFfJ275h%=Vh#JNXo+l4s9CeAiu+bQ?~uS6_8I;>?HA;Fgl19^JY$ zwywBhMAXTh`vjy_l93QtVTXejWgb5!UX_WPJ|?6*DSu>Z=tq}2d|8O$|E6G)zH}j! zUZkrecW@Qyqcm5yWD`sfR%SPlZxwKrnTy~mff7Qp(AxeBqD3^klSF3}fI)bQ`ld99 zQFSm_hbikxpg-M1gkN!=^|_llMF*!JXT~m00tm%XoRK#pnaUYn42Q?P#Ar=Yxd4gr z{O`yUrNF8yIddvg%oU{CynwYagUB)i4Lnue(10oJ-PPRO>~3f{f1yF(s~h+bduupF z%~Z!jQ)m=R;~ojQ836T07$4o}?!7oTIv((KxIfp1hsHaanmFsIZxo6>7Vw5frhMLW zXFq|eKY<9_=-@@~Ss>8Q6CchH0=|$3@(d@>^n1HbKik*MsSfq__e0gRJWjxA=|Nx; zgIo>)Gg9$5PO)p^2Bf`t{q^_K*Z}uPa3oysMkb)6imrD{JOQzinj~4yj;c_-dusG0 zfdwUYz7tYIh$SWdNeGfuFZ?2Kt}l)S=#Hw!*5P{hmQq3t&jALPGER*pVsMRPn$WJL zeHTWCd<^mkoEl>?*=SUa1jC#Tx`K4R@o9u>dKqSn;S64pm(lRPrj){Sc0@CWnX%Ed z&gZhyD#V#n8-P>$L(sY!dSY}OFdImX)4?!9q?Xgc&>#kF2s#aMpi?K#^!J|PvY0V| zXBcnNS$%dSNORhgV}SsGMA~uwt7m-Y0Lnp^lP8WHJKP7HKQz`LJrX>}86XdO1X*@2 zi!^QsxOSg+=t9y8nDGQ}Wqd;y0FzDns&O`PS|0P|Ov$JS0CT2fNMg|p2gU}&oU^C* zVAq*r{a(b-PM$b%%!|00_s{_bxeQK?!<0eNg)>T}V36laI*@VzmOu)gbQ=s!28RHO z;0FcoqY1%q1qfOyEmus;I8%Hfz?fmAk9kGtBg7Sx=nwVQVB3Z`%@E+(ArbY)A;)FI z$VeH2oIS5|v!()s$nOBIjdU;ob(fQ(=z2nd;f=!JU25Be;Hx5r0 zrw;i7!~Dp_k<8#;%-|NxAoYvw$b=c(yqNu{J%NrUAHOtshlbAqp2%Amr20bG_=pOM zioi37`QXg;dZ+T!=5wRqeiS3`frbAvwP~ds#WU)wri85&K88eLC44OF`9<@Axxmty z<@WdX-`XFo?Tl9JiRJIbkn6V6C3DQyfWiNZ#j)-ve&gcIlk1k;+4lL)xz2cAHJeu* zv(zltv6k&K>V(NM+w$wX*K=Lb;-*+`bJX0NC@6!-*$I0Ae2kk~sM5I(Pq&@TYhM|-lN-zHnbAY!`RC@Ii`#dw_8lvcJEvmy zCufWaQ{F7|>-!c4mRnbg8xmy|Gd*kOl7zc_rsuA?c%#g-+bk|eSr%pv}rvcf_L1fcWwy9Lu-3fQi!sMn(VXwWXQaN%rDOFA`RHJg+&3!la zMax@P%2p1qbVl9#u?)o@I|}DtnS14pX=n>`qPXn(^NY{Ni<{Zv=0DB*Ug38OW5s*0 zAqo?Eb6j7@>I>ugQdVEORK0AAdv>#)-O+6wQFrH>e$U7G1@rzn{~G~F$ppc#nXZ`P 
zdKasAE%e8Wo7mzeK1B>#T+QbZD{dvJuRqq|<0ALV36Q9WB!A1i7ky|XUo2q6<@ z>$Rg-j>gRmthpg!EnlnLb5Etr%D$&kyNpr2^S+U?SFIP7L`$1iYS_}M)t4l?zmiqO|hInN=TiKo{D!smcasNs?Bwv*~t6sB~!FbED$E{_o zwJdI}VXZZ5Ru8z<;efUz4)kEfYQ=9=4LBPU{AZ)c#aFwk7A8jLtYn^|{rBF_=et7G%(R(haRWyHT2 zN?^!}>x)@^aa>=)>MPdtRU0lh#w|#g@)ychO+`!UxVx2gx2~LBb?;htd-&vTtAAkl zzTu@d95+|6=883Q)yMj591ELDWmX&Hla+Jr@Rh^ZMJKMD5JMA# zi#5B5p@fBL10njZ^bvbCYpy1Sa@Jh_ONZHD9KWl7Zw7^->sL-0m}!z>wg174J{=?BvN{h0pQd^ z8p0PK91*IJgU7l^s0wrSjh1TG9t$((FZw(0tVS`=4+@8qV@u4oVKU8>lo1AIdFb~^x@bz z5m!KhT7S5wx3B+j|Fg_jFeH-h$SN|IFtSL>F{jYef@L}63!fpyJxp_&vy)+8h|>e# z;bY*080kHr=yju^bK`?U=b5i!BFM5ZLDZf?trE39(0Eyk)T()O%&(y@uCb656qLC} zj41HIAPUE+Ndw}Vo1_7G3NU*JsCPIJOqPDil}-ny`~C!Chu%~mHJGpzM=hSH-UE8V zRIp^bS$Lyx)l>&N*8HBiJv?dmozTsx8&k2o)|no0NBB{x1ymbF9+TGz~N2~WdJ&$T00j?BJ_yNRWkPoK5aty}ZwO>;nl9!S{A6F~1B znLBb{rv|dgNI8LexjwKsu$;{n?~FQjMy)%MY%?qx;?5S<*%EWM0m)Y6es^N!+4%ND z?Dj*krYG6F!=!Y$(Y<+YespehsVA22Su@vuoRdFaI9C|YDQ9!am)dW3-srrW)0A*H zXVvR=&vIqF_6fH3iM#f$L`~hz$s3dLnl84c>rUC7XQLM@HdmgB$`~!-O#@3ad z<>zPBRq~bF} zPwFkZtjcZnQf|;%vgv)<_JCAmwgbL5{Njm_RAr%UVT+X^5d>ev+ov@V4Wur!*`M!$ zp|UG1qIt6DtOmCg`S{RpvRLRlTE3hpIhDEjO~MJF-!0WcFhw*7(8}jS%6V5J8z2-; zhYTf<3$zZyhX}bTol1pqN)Ywv*g`6BRx6u}EU}>#W9cHgNA{NPvAv~_P!|=lEu^1< zo*q(qtwK#wdQK!{1V2Ee_0T_j-?=4M!4!dfWC=LjL_m>o?Iy-F?Xmr&xO$hYA2K~8 ztRD}lpJobxj4x3?L$ZEKInqtOcHk;kr-!-9%$X?@w5*YwVn8F~GRP zAb`nZTyFFRdlGYXlaQhFBjau)Slr<;Af`m%_aLFdCccJQzV#-_@^nh5M`xdcsgX0TaWf(ZQ~iN;(;Csn`~WG2w04k9G19svrGfx`z1C40|KoZ~d44juzf@`;;ylK6w* zfXt?}8fQ&!Nst1|N^{|I)2!sEAY_0HGr&R~`{h%fDoCdj3AD<4{SSy4x&pKTK!#?@ zTo^T#B>+XqsfguNBpTX)46@8wmJC1!C9Jl%wVbt<$E_aL>RGeaAu&@4xP#64vBj0h zFI~yLa{+)4D*zt=ivgi1U^+-wiJEIRl5o#+tEO#IW>y8A7hvx$&&hSXODG&>b&qU<(^!g-t8f->d&_{hu_lg-^^JO;`#MAKMA!<*H>T zQgI+Oh(O*E#fGgIa8W|eHL|wGByoCp?(lt!9*EOi%31uNfXa49vr0D$DS&ly%h}xW zrRUh(=9Tt%+i|w-_-gL)bw}YsJEBlAN8R$^s-r1tZHhXY)+=h>RWJ9y=e^~PZQC2G z*cY_|bXa-Qa>EjLx3TWFXnRl0-8-ALVJnN<7IHXv)n21<=S3x|G70`TT)RGD3GILwp`;g&}#Z 
z_DGBBho&7z_NqR}?{XhCsD8A+yBR(oHFxcT&re!3M>=#r*{eCK)&10m~PKI5`33l>_r>D4zDaz!<0|V1|10xMaxTOC2g%k-oby%dO1%QKTj;No9 zHkj5-Yl-xj3{)XYQ)?pH2tpe&bg%$x0HmZw`W`3<5h9aXD58E8bp^hNXJja$m2gK- z(pN?y2_h~6pBm)%7oN_ZU8<9`a%y4)H~pV3VID|s=Tb2 z*G&5mEB-p>!M(D+LKTSt>U|e4mGPe%d%~qJamXwE*auBL+Wi zNf}C_t>r+sh*_YHg)fxJAScBp^u=^mN{WpcnSnKuC8Pu;GSfQjR$6}^PSF_GV|u_q zdLS7`7*!}+5N;Oo@#BJ$q5PsGjpP9>hlh6=N+=o56ar|*A5HIG*QLH4%0ZHJ8<-n7%zA)U;pqY0u0S%2eo^C0b zKgTo?f%^oIFUnKGA787(Bcr|)>QMdi;0PQ?kAy~oa55bn0%|Wj0dg7)%CT@JI2?!b zXd;Ktqj{IXnfZOx{s1*x3z(az5mJy~k-vjh0w*q@X%V%rf(Bv@&`z8N(+o){A*bap zQZX-K7>#dqJUq$iKtN$I9A-S(DdZyZ32?NI2k2f>4di?|HOgP`wIvOp7h0To3%r?c zqecq%Hip)X0+b5(%0jqyi)B9UxCGl7o8Qs~s9O1baA=i^fzEZ;8Gcyb-)@ zidA>77ZhJFSS)y}Xy%AS-Q|}qg>PQEaf!{}LCCIZ)>a+2)v>m^HCqFaI}SJRznZ^e z1LBu1mM#~t1#Pq12Rft@jFdG$YAIXKD_$7>j&CimKH+JC6lQxCs+R0{xccnkv+?3q zwzzer{kHv%=1%y-SKfamR(yK)AYgO0!kOcncFJA==-sNVa#$?|szty>1$QHHD>XYr7WBtX0Mr>HsDZGzzg9l9S*$*<_4CsS$oqJAfIRG z4To!aH-F9!2fl_YhBs^G8|E68)Un*EHGTC)ZvMPy&htjyjBeeMe{K58^ujYSOBH`A zues7mjunxF!*f3C{zQ_1v(28FGXZRwl?}u`knLHy*9NW(#Is7+tdfOrytIieZMvJ) zylyYVGk75DW}lhM`pz@&o>}R+?fCBTSk-}#+W)rmZ#sd>&(EGY2#9HR3-pa43rjd# zxu9KAFTHd_k7#QLTidaw@7%E1XR;ogS3oDSITfEjcphlMmS|S<=ONrH-=}sS(W>6x zWdZ%8%AH5@R6o&bj#za+$C!F~iX>e>Y|QgO;e~O269o$&jL1y&D8Now)TbXp zNJSK|6)Uc`LCP|u)T4+qvc)Rla)C)Gj~qBDU13B5h$d%vNpZw~ggn0>ffddZ@_;nR zRYIX&!WT}&M6ycmvVj}{1^LUsBtnX~4btjqZCE^4gzzq*9MU&6{6lJlTylm2!UTv$ zaTC;KZi1I|5iL1sb7w>ZD*&f5gp)vm?12#BN+FgU(Sw&4;TeSJy7a(dM)u&hwGT(Fr+?6Bgbe9ZpMMs?^ zkK|fT>^&f1f#A$k!uSE--I;zs&Ly1)EN_I`cWez4F(!d<9D*5&0O7+BZ(oK7Sp8jF z-695|r>9L36O`$ITo`}PR@Y(Gv>|2uAKDs%&Vq6vt&yP~GyZ}6lvtdgov?E9t&l~) zU?iQ0g4~V>X6Z}VWn^NGNp6BEqQ@o#Kk<5l-~+CDp=AhzB$KiqiNXve^xsyyxR2jemV#GRHpaB&kC{S`&_8I z|2ZL?^rds@JYh!DwzpNV<6!&Blu>M7BwI67YWoJ^&LgcJQEUYhAl)xK|5J9z7cw#f zt8^fjwn{+GIP2*ga`eYQer}n#kK73|R*gZK_^@jDhh*FzxihA$GxXND(k>BnJZ2u| z152wG)>s8KtG;?+3)`l;`m6E>5Lcb@?!f5%t>7cA__8#?lUqw)NinC%y!yhuOJ}F{ z8%;fS^}$PN|BBT|YS(8esAHY*oIqxaUHy(w`*g{dS;f`ocps0Wk7r1K6Y5U{GML+F 
zng#8PN~z2s>nlpB@O(4!w}eOKnYHhM^&XP0%@eqIr!&+2gypPvpO^@{PYkWgrjryBPgEY7?j-Y0MxJ!s!^_cwi9li98Y&iYXql zK{Q;bnFS1|gq|A*gU8Gx2d|7eug9?IM2Ya4xLdGfuro)b(?d+#*6!U*aAfETVjaTntt9Qk#53$vUV%2aQQ8ClEaA-ZRaDMOH-X%vY&jUg~mb}^CE3Yj0 zRxK3?;LQ&&9*(;@SXakiZogx@bDr%w73(}5bM?;}kb4DYI&QCK?QoKG^OYN~+_kqQ zid?fj>je$VbiA>LZS1*Q(3_}hdT;lw-Mp-C!d1L5xm*c?z4e6)YT#@O4=UgkYtIvk zJGR^Pvpp!fwD4-Yw4E(&kCpCVZ9CU(h4V+}jwYP&-xYUmW1ZU)&g!_cfps>-oK4^e zoVGpxfrZL%XYFlpkd&K0>zgZD(!_GApbASxtgB(xu&FH1$zLxhxn8(f7%!-03qS-p zUf0Rib>1!5vtCw>lE)a@v3PvF-1BbLif!fGcZ*}?`x6y4DCGq5RON*WS}gHi;OY_N z;K7yctZnD|CFvL3YQ%HQS*?`L=lZcD8(btbFI~>L1j9zy2>8+43i6k0u;NaYqg7!0P~x z?YGtOU8mSxr&b-OHbCfqa&a{ncHmEfG|7Di0pb$>v3W}kLSDo9IkFw64 zcb(e+*2yo7+iPO>nncS^=${jFCzcMA!#ohuYG&a`tzga8nF2O7u(pOZTN40E`P*1~ z4d52C{Q+1^GFgC?yo+95o_?j^c2BIhYxZEmQ5pN_UZvu6FuMxJ9qGuOFPwwhP* z#X9xo;TwmS>G#gxIv=a-xZQK7<@-Oj;eGlGE?z$HgGc`hU|k@8_9$|TbxlMa8X zMFrX7J!+6h7!QnunQ4f{sqm!_6;KklR1)Hh*R+INSs|=8>wBO0i!(DladRVUZd{&X%^l=I zOf9RgUG8A@JK}oKJ8tj3tM4T@6aIhd7v?`f8^KdiKZ6hOGN7%a_H)$!Gid3JzXmQ- zRq4C%L+FXWgAQOeFqu}+(r{Fy;K@)v31JXNg<%IynJUT2$_%(FB%1e0J| zW#>v~D{!l->03=Hi?4L70=J}^-fG%#6wP0nyA*dcvW~`>qj^>hyZ8L1*Ds~S5q3uq zXRd@j3+@uZ(ZNFdn%TW+fvw>0b1I^M--mJ```>Ci;8MM>?4scFLzm`2RXS2Hjxby# zVX>6$6JGd&S@>ed@ubIru*_7+QHnWcCnt-U8p35nwn;DLE0`W0gTV6YX$@X&^NWLDh==gM z+_9MSN*DNBrO<*&sF`0p_!4|j7Kc`gFsJ5iIHH0JZFb3DFohk0N$!{nj&oqlp%T8Z zb*UbCi4G*hlW&Eb54-rdu;s`dZt0-i-9iZIE8mKE5GLPBj{)SB{wgVL!Gx0&6)rNg z9%Y_Et{nk@3?+nnC)wLTJ>n6mOgqB+Zc0C*2N?=N*Je0@RcOtKhBkbm!x-7?d$f_j zzBn6Nw&o!}(uAC$CMJ)$?;sD#{S>0}D3@B2LP>$T{s#Of5Klo!?KDb73((w2pN|pn zIUeL`N`vX#cv+2LUm^|&(l7#M$3sR0zT+jihYuaV3v{7zc$^U>otZ;eG6Fo}V%`ro z-3W?9@SNX4KZ4jGzQs$C6W|G^hdvDLE zrOD7@N0PmagWkct3|?&h3$hm|!2~1$93-h|1=6Dt{xwAYUJ}?d)dje0di5bM6(vcy z%qU=&-SJ>Y5mtF}=qt0o$;MMj@dz`LT;Ec&eSADxiT{NBK^_p+QC?lrZk3ybI=>mw z63JQJw0;`z=g1IuvXZkpY1O*==W6Qee}MO`#XV10qv1$ptD7-mxTpxh#a`HXje=(f#pa(r0EZMeul?oFY=3s?w99+%o7v^1eKUeeLim=?bu2x2imi0TV9 zVM63lCsM@GiBK4bz}|!i zg2%+K+5a8P@G#UE+!r1lAAuKw1A~)3CdB+bx*Lgwyduo+BxGax5qc@SDQiQr6;oW2 
zZ5YDQFAN*_lGUCwlQ))<3(Hht`m&$u=K@kUgiyz2>Rt&|R5x?vXC+Ou#)PYSIS-Iv zfP!4RXAOxw*OD5qPORoN!I~Y1Q-p##ggr}dl*Y^2*|PSPXYZ(EWxWepf+}{f&W^Zq zAM4zAd+MY1wItTK#T7u{!g_eR!&$ym{chD=M>C+gi^c`@!r)R4q=U;Vg40XkyUzMg z^Qh8h5K%0uyI)9Edw%Z>zH_1#OdFQcRZGoMd(^Xg)zfvmKWgcU>bo8kL;BHzy3fgp z${%&vyUTPRlxezaO=(~Z()47F{oni=E6XbTF+_$6s|5!+3jZBQ4HdA>3OR;1CAJ)G zE_?qqWz`c`F}byiyxxn)NEu3s4`mY9Eh!MJSCrfxfQ%HTgY9qB7c=nJh6`$L7`@2?b0(ih&=P4*K!z|aA2 z-%l0d!%}V{>6`iL8hL(!Q=H@!JsuUOMqy{>IY^7pa-ZM`@#HaILx(}sUO;UQwbxO5 z12sbRAsNWv4#K>N+5l=hK;yDULSB4Qje)ns;mtKZ@eD!|d4GV913urlz~mER@NIBU zWAYhf+NW~UCqGthKTdi)0ta#rBw!;~B5_MOYbl3!20CtZ#0b%b?;(}2`jWRp*CUIO z<^FiXUbbQHnttCpJa=QhVvgEd@Df%x+t$6N@4@$wz;X7%0v#`DW=oo*#Vt{L>zcl8 z1HiA&#m;!q_SK^8bKSS`Kq>BOfI~NUJSbjN#}?JW84V%tjDT?A%LZFs6<7yHeAwWn ztH(Dr5Qrz7RCFjjgmYI=xDt|itc3ajt)%n|J}F-qb162#h#XC*>10OC`mqc%pw3*BrAqQinaql^)De`8a zOv0##TFI;fLZ}om#{8{mL0sonS21O{C!Hdos(zG!s&uzK6XJvWMdFN?@)F92=PFxh zs=_1;tm>O;AX_}XBm&ga=qLy>3=P5=J-!afKhr_3<06p-`@Vas7;m-$N(aQpNN^lY zL=*&rgSW@Zvt|}3M~dXb9kiYTjniXBr|}{yb||MuhSNJ9W_Shy(!1=n9a8lHyuHiY8&zR%Hg2ec3leX{ZM3?8l$nUfunmL#*w0_}k7QSDFGdFI=$F0__MGFV7pIAH*b2Y@Q zAYedPtv9mQt?szBjYvF1B6}6bP`F@rTE-Zprdkq{p z8!Y_8Y=6^#RY4S4v2d#gO8qQjV0>Xh&fVOp(s?qBX1j?MHhyrEKQl#X+ z2D6j_hLK`aprw(g;KOfg2{1hDBu9=sl@>|Z)I?Gh5=pA$=?_8za+{J2!JC$%4DA6+^;Pw^=Ea}#LF*PwXt;Op2(kjK9!!Or{&XdVSw3`5XBRFcop3_#tZFL=DkDW(~EAsQm+K)u{a*YRG5u%;#rl`c>3$BjC|S1CFN_w6>yF zfm*wu9RWjV212}sm&gi=Kr{$R%nbHHpiw04S4&#Y*R;!dMYn}(?EKxO`>IzGp$7PhVA)Y7Q!vPp`D!8n{QHdGm;Zs;>Ria7dxCZ9rlg zG+-9|b$T7K>Ztvzay7o33h&ZDDkig3pTj9)31YD#|nm z6bX05Quhu0rV1?^hU}TpY}=KK3zb(cEve9Bx&4L(ek6%CG~d%;kk1Y(%$mYaT{)WK zCHqa+4c8_G#+B}S`1!P2*7<(5MpMOi684X$nt2{RoDP4{3g~qPc_(HBHA4KGhVb#? 
zA@W;7$#;MO8^TAB@z-K-rgJ8wJ;=}2`9cg*3h-3*BzZ9xr#cTylacoV`Z@47tr{T> zc!j$W_zwZoAp@3cC%!4ac-6>135|e;NK%F1XRVO@@xIL9V;&57FPdLP$zo~3YzYa} zy(PJn95Hth=W4vr(m38a-pEs7UU;$_eiM(p+RY#Vz&S}u6XB6Ss4?gZBN^uCKnO07cqOA57Y zC1_eoQU7zQWy4T7GSc0F}D6v?waMi^4Mc5E}^)2Q3ioL$N`z1r{h!Xha2MXWc@(o1zH%!i`dF zk-qetJ2NCjkL2cpymM#n|Gnpa=R0TSpKEI!6oi=N&muKl6!km2(F3C;)^9^%p5iEu zj!~oPl^&(Zy=l}0S5wS9X&JT9Y92Gnz%z5qI%ylV(UgheZM^+sT5WB#hPRlgS18VM zhvJxzP3qH+)eAN1sDbj3YlU2!o;&KCq67A?vCMFQR%*DA6bi>eq6jz6NIW71V~O!` zUI=Tgkc(NnS>FkXd5WiS#G|Gw6mRA&9DNL)l9UNkQ|5q0abDmj6T;M~SR#B~v51j0 zuUJAHCkzJ|#U2dCLz8?ks5pYb$pn{*VcHc8zLN^Yv>bCJE-9X1Fcgm`q>vOz#KmCn zF}02Z`7YWgO(y%2k>oxpB#iUYzFVooM~_@T%=g72*ZPuE(nKQOcOw>??BgP$Bt)*I zB%bTLej_Lep}0843z!Zk;&DDKC4{~k{e8*($v%mXiwPk(Nd_KFCK9pU$H06FV+*s1b*arCH}YvoL^X0UE!7T7o3Uk2{2 zKF*>Lv9N2f zqalzEvDe{XAq)!R;s8nE-DK!i@EQtTkQCny1p@=@h0v|YWNK1vA~B}-Kng;i38)qi zjq~MJBJt{0de}q~#V{1Z_TZpw+Fq#@98an3UiJ(>7D~k=(%+%}{o3#^g(4zfF@dY0 zSc)GKgoH4_o*QGy5XrbjHYD(DBz^-{ienQ(^;9Dw8{F|r~)ap6qs}7ko=HrpB3gvuh zo1PV|(^KUu)b^F{^prjynwp{lmSF)Wm2M-@sO{`zN|e}ZJczsqN8tva-VYL6vG=TA zU|irs5--3pQ8%`tOu(r)%EF^G1cgK@&IP3eK-o=R2qwmY(#?cW51k7*NX0QG@O&@^ z2q`MwPz(+YG|j1UFG77(2I{d4R+MZE>P3Ue2p}k@0j6Sw+P5N-Vi)*vXptusjg|Wraj)QX2nedE4~XIkrkktrp;ym31qH>wfao?=a5gWzzT#|O?0qu0 zvh&c>y0+!*FXifvExV3k*6@mR4|(=_uI|#Z>(W~L&V2h(x&7!;Z?64Z*0O4EEj;^Z z?d20$%Uw_5$N-kuzGipj?VYl{bJgBmz;u^v?^?CL@Ri+t&okp$usm?xcYSH^R`K74 zv#<)hloI0MlE}hCoF@L)dm#d?4@ZKd)1V!qr2_*QTF`|o4K+rnfNA(^ECg!VaSgRH zA0L+{9-D*?cqm|xBKD-Mq0k8F^sZ-@&rk?Pur?fFTuhI5_VU8u+#7jcK=uW4zP++% z@3MU_nOL}Tl%=`cfr|5ZP~ch}rt#2I(aJT@id=sShTV;aF(U6dOYKV~Xfz%f zJVk<36+W@(+*8|EJoHt;xIN?^8QKVFk!MOJsEwlZR|SqS9*uEN`dSf)whn)dB6>Ms zQW#E}O7e;WpiaX%IDvTh1YE}S!RiCo!(I!Cd=Qx~=;DEHuBS>(!=r#pVSsjqClrRC zOiEKkdKKGPED@3p>{pzm1&I(P#g6nmn3M#?RoSr0N{OuFI25bO*QpLcc0u8zfi+12&Q(5Dw3 zU-*Li)@*8XAbvI#tT`6zO@p?Al%$!hhQ=H4gQp z;S}|NQcF(SzmSur)0Pcnq(mQptQZmusZ1QOMCl~xolul6+q#mUHf2mu(%_U`KLcV@ zNtG(8kZIbSF>g{2&`Wq1@^q`B+&Pso;~5ctyjAX2gy$_-v1!XRo|V$sE;pY6il@&H 
zs1cSjI=jqfCp2T2GXkULENP%m&zUoQ_uVo+f_P2>d%K9w-S%=$1yjfqqok4)|n))d1(mfaL?F*C~){fndE$MEO_nczg)P z#{fI1VgU{c$D&u|HVnC&L%OR*&r$%w>Hg6%IRV?c69s{#>eJU{_!ne8aGxQh`12q(^8afMTY z0JBx^=%B)o$|4?7T(c6n)EC@A`lgqA?YfWwW zrUP=*fxkb!WLe_!$6uF^zn(jIDc3Xt$mMKWIGAtkm0NpPoV}#ltGT*!%dT^4ja^w+ z!QHaJ<=gw^_Wl)jKM;z0@6WuykXiBc6k0nToVb7D(J{I8NWS%?+A4%xe7vHQ{XRqx@#!K1U5ytiBSb}#ZDMIT0GZ-1e&Z9YAhUiJe0Xj&KqM9luE zSW9_Y_3m=+{%^gMr+K-#H|OqKw)d%I^<|i+FaXg8BM%iGLCeb;S{kuCFy|1%?t@N- z;?+UrC4do?D6ADC>@TxbJ-`CmkQtiO(8jbyLaj>z;H2e~5-tIDLCv6^siKqt`l?6? zh=en(3dkmm$EMA=-8lETGp6sRk*N~4Y91yhMd*Rf0*p!`kiifpQC8@>%$K$a=zogi zrc^oC(#S7B;PH92zzavA&GhzXReQw=%R&+HFChYk>#n~Sorx~IyyD*d=*p*~k4KmK z9|ym%E{!KX9`P#TmDXS%){mcV6N56>ZG#nQ=9o0(PMhO&+zR-w?kRm2c6F1v8R`}?H$z>98UZ{EU& zNofKsJA&_tu`!XCgh6Z%Ple(n2aO}fWa$REkBD2WpIz05CJe!&bjLI2Y~x{l2!+J& zAOhTPY0G*F-fdZC&0Rm6SaElQ$>wjFKR1bQ|UQfYO0~M~XKH!Q*7ow=hsshwys~*8^ zX+(ggQ3QjkP$m#yHJeAMPD%oqQbQNGCjt4s&wel&8?1Us5;C?AOq_Mv_MYt~eUrLn z{t0!H4%kACK-@>buY@#&J$>cOAgc;Fn2D_Fsb?n=U=XWs*R~`iTE*SOOVz7_3cU5I zY!nEBh@Dk%lIL2$Hz#;_gM~)0J~B~lgsW)f`Q~rAGO`)4V9B`Pg_vf}p;0>qc5nGr zSi0k4JT_H1evu6WSA#b$WD;E3uNlPPA}Cw7Y`O#KlQ;+f82BUCk;!5O#*wpC%m z@UBHlx1WV>p$S-=FnLji4ii?l&lO-dTiKiy{m~b3g zR9P2MD`&9-YZ(2IB3y&lrX;X_$ArYB?q*aK(LO8(?{pHA*T5CP3+l*hAP*BEaA7jq zC7ei31p{_fFTI9ckzPvtO|h!g6-PLk3aV?Zn4vili|ATMeR1O=tEUw*usJHTKZn|4 zFUTmdvAqrV-uv)9Li3L2>P{@XPON$Ss;fQc=~}$;QRZRhZ{L$Whrsa;CiC>n^upyF zal`9uaFLU}dqI0Sz4r!Y1{N$U&NlD}-a9vQZkD@y5n3+PY4*orrby8TcNBJXFItOc zdfV=Gv$@_~q|CMMqQh1*Oh0XCo1dJU)GX_U7xVt(vj6x}>KEysr*r=E*|SfZ+w;u< zxj9hqHY|)ih~1ApdExV(&wBo@7d)OV?*DZAXU1p8=3{fQoVyd=0n|1Y-Be9|-rgkJ zn-&^Y?d+PTKI{CSZ)+)U%Qut>`rJ6y<&Kw^!b?|{x1Y|{o%vPgvg^YC78xw|orsY4 zkN&|EXLeD)+*LE=H~;cQcF1e~CocnOvd03tXL}VZ$KQyA`7mH>DPWrDG@qu^0C!56 z<;3?jnldzw3h=npmmxvn;7A1 za^Y==0t<7f zBkTCqPJ!e2`rLJJEvJ^Qt~Q*>I@j!=tD0qd^THnZ-eJwtvTScrT`GVLrBHIIdY3r_(DDojFDF?z)o403!d7&MrH zmgB`4IH4N&3Sb8C^7YT%WK7Uz1imY!0*wMHb-{rV79@=(!oi?`9{~s~M)>MSKwC{f z^`=r=G_;AAT0m+`JQoBi6sx+LLI@w@p@;Y62;=IpkKj`>nFOX#@ssh8Gc3T{M2bX* 
zAR+z+qC3>@O%BHX`z8exrL_NI^N2Iq?ESK&5GV-A>W&UsJWe zrkwvq?f4z#`VXq}H#NT5H*z)Yx2-Tylk*QI6Af$|qIEOe7Rjxnjkf&Z_*-VW@f!+b HGME1Y1>WX* literal 0 HcmV?d00001 diff --git a/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py new file mode 100644 index 0000000..a124a0d --- /dev/null +++ b/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -0,0 +1,531 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Optional + +import regex as re +import torch + +from vllm.config import VllmConfig +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorBase_V1, + KVConnectorMetadata, + KVConnectorRole, +) +from vllm.distributed.kv_transfer.kv_connector.v1.p2p.p2p_nccl_engine import ( + P2pNcclEngine, +) +from vllm.distributed.parallel_state import get_world_group +from vllm.logger import init_logger +from vllm.v1.attention.backends.mla.common import MLACommonMetadata +from vllm.v1.core.sched.output import SchedulerOutput + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionMetadata + from vllm.forward_context import ForwardContext + from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.kv_cache_interface import KVCacheConfig + from vllm.v1.request import Request + +logger = init_logger(__name__) + + +@dataclass +class ReqMeta: + # Request Id + request_id: str + # Request block ids + block_ids: torch.Tensor + # Request num tokens + num_tokens: int + + @staticmethod + def make_meta( + request_id: str, token_ids: list[int], block_ids: list[int], block_size: int + ) -> "ReqMeta": + block_ids_tensor = torch.tensor(block_ids) + return ReqMeta( + request_id=request_id, + block_ids=block_ids_tensor, + num_tokens=len(token_ids), + ) + + +@dataclass +class P2pNcclConnectorMetadata(KVConnectorMetadata): + 
requests: list[ReqMeta] + + def __init__(self): + self.requests = [] + + def add_request( + self, + request_id: str, + token_ids: list[int], + block_ids: list[int], + block_size: int, + ) -> None: + self.requests.append( + ReqMeta.make_meta(request_id, token_ids, block_ids, block_size) + ) + + +class P2pNcclConnector(KVConnectorBase_V1): + def __init__( + self, + vllm_config: "VllmConfig", + role: KVConnectorRole, + kv_cache_config: Optional["KVCacheConfig"] = None, + ): + super().__init__( + vllm_config=vllm_config, + role=role, + kv_cache_config=kv_cache_config, + ) + self._block_size = vllm_config.cache_config.block_size + self._requests_need_load: dict[str, Any] = {} + self.is_producer = self._kv_transfer_config.is_kv_producer + self.chunked_prefill: dict[str, tuple[list[int], list[int] | None]] = {} + + self._rank = get_world_group().rank if role == KVConnectorRole.WORKER else 0 + self._local_rank = ( + get_world_group().local_rank if role == KVConnectorRole.WORKER else 0 + ) + + self.p2p_nccl_engine = ( + P2pNcclEngine( + local_rank=self._local_rank, + config=self._kv_transfer_config, + hostname="", + port_offset=self._rank, + ) + if role == KVConnectorRole.WORKER + else None + ) + + # ============================== + # Worker-side methods + # ============================== + + def start_load_kv(self, forward_context: "ForwardContext", **kwargs: Any) -> None: + """Start loading the KV cache from the connector buffer to vLLM's + paged KV buffer. + + Args: + forward_context (ForwardContext): the forward context. + **kwargs: additional arguments for the load operation + + Note: + The number of elements in kv_caches and layer_names should be + the same. 
+ """ + + # Only consumer/decode loads KV Cache + if self.is_producer: + return + + assert self.p2p_nccl_engine is not None + + attn_metadata = forward_context.attn_metadata + if attn_metadata is None: + return + + def inject_kv_into_layer( + layer: torch.Tensor, + kv_cache: torch.Tensor, + block_ids: torch.Tensor, + request_id: str, + ) -> None: + """ + Inject KV cache data into a given attention layer tensor. + + This function updates `layer` in-place with values from `kv_cache`, + handling different backend layouts: + - MLA (Multi-Linear Attention) or FlashInfer: KV tensors are + indexed along the first dimension. + - FlashAttention: KV tensors are indexed along the second + dimension. + + If the number of provided block IDs does not match the number of KV + blocks, only the overlapping portion is updated, and a warning is + logged. + + Args: + layer (torch.Tensor): The attention layer KV tensor to update. + kv_cache (torch.Tensor): The KV cache tensor to inject. + block_ids (torch.Tensor): Indices of the blocks to update. + request_id (str): Request identifier used for logging. + + Returns: + None. The function modifies `layer` in-place. + """ + if ( + isinstance(attn_metadata, MLACommonMetadata) or layer.shape[1] == 2 + ): # MLA or FlashInfer + num_block = kv_cache.shape[0] + self.check_tensors_except_dim(layer, kv_cache, 0) + if len(block_ids) == num_block: + layer[block_ids, ...] = kv_cache + else: + layer[block_ids[:num_block], ...] = kv_cache + logger.warning( + "🚧kv_cache does not match, block_ids:%d, " + "num_block:%d, request_id:%s", + len(block_ids), + num_block, + request_id, + ) + + elif layer.shape[0] == 2: # FlashAttention + num_block = kv_cache.shape[1] + self.check_tensors_except_dim(layer, kv_cache, 1) + if len(block_ids) == num_block: + layer[:, block_ids, ...] = kv_cache + else: + layer[:, block_ids[:num_block], ...] 
= kv_cache + logger.warning( + "🚧kv_cache does not match, block_ids:%d, " + "num_block:%d, request_id:%s", + len(block_ids), + num_block, + request_id, + ) + + # Get the metadata + metadata: KVConnectorMetadata = self._get_connector_metadata() + assert isinstance(metadata, P2pNcclConnectorMetadata) + + if metadata is None: + return + + # Load the KV for each request each layer + for request in metadata.requests: + request_id = request.request_id + ip, port = self.parse_request_id(request_id, False) + remote_address = ip + ":" + str(port + self._rank) + for layer_name in forward_context.no_compile_layers: + layer = forward_context.no_compile_layers[layer_name] + + # Only process layers that have kv_cache + # attribute (attention layers) Skip non-attention + # layers like FusedMoE + kv_cache = getattr(layer, "kv_cache", None) + if kv_cache is None: + continue + + layer = kv_cache[forward_context.virtual_engine] + + kv_cache = self.p2p_nccl_engine.recv_tensor( + request.request_id + "#" + layer_name, remote_address + ) + + if kv_cache is None: + logger.warning("🚧kv_cache is None, %s", request.request_id) + continue + + inject_kv_into_layer( + layer, kv_cache, request.block_ids, request.request_id + ) + + def wait_for_layer_load(self, layer_name: str) -> None: + """Blocking until the KV for a specific layer is loaded into vLLM's + paged buffer. + + This interface will be useful for layer-by-layer pipelining. + + Args: + layer_name: the name of that layer + """ + return + + def save_kv_layer( + self, + layer_name: str, + kv_layer: torch.Tensor, + attn_metadata: "AttentionMetadata", + **kwargs: Any, + ) -> None: + """Start saving the KV cache of the layer from vLLM's paged buffer + to the connector. + + Args: + layer_name (str): the name of the layer. + kv_layer (torch.Tensor): the paged KV buffer of the current + layer in vLLM. + attn_metadata (AttentionMetadata): the attention metadata. + **kwargs: additional arguments for the save operation. 
+ """ + + # Only producer/prefill saves KV Cache + if not self.is_producer: + return + + assert self.p2p_nccl_engine is not None + + def extract_kv_from_layer( + layer: torch.Tensor, + block_ids: torch.Tensor, + ) -> torch.Tensor: + """ + Extract KV cache slices from a given attention layer tensor. + + This function handles multiple backend layouts: + - MLA (Multi-Linear Attention) or FlashInfer: KV tensors are + indexed along the first dimension. + - FlashAttention: KV tensors are indexed along the second + dimension. + + Args: + layer (torch.Tensor): The KV cache from the attention layer. + block_ids (torch.Tensor): Indices of blocks to extract. + + Returns: + torch.Tensor: A tensor containing the extracted KV slices. + Returns None if the layout is unsupported. + """ + if ( + isinstance(attn_metadata, MLACommonMetadata) or layer.shape[1] == 2 + ): # MLA or FlashInfer + return layer[block_ids, ...] + + if layer.shape[0] == 2: # FlashAttention + return layer[:, block_ids, ...] + + return None + + connector_metadata = self._get_connector_metadata() + assert isinstance(connector_metadata, P2pNcclConnectorMetadata) + for request in connector_metadata.requests: + request_id = request.request_id + ip, port = self.parse_request_id(request_id, True) + remote_address = ip + ":" + str(port + self._rank) + + kv_cache = extract_kv_from_layer(kv_layer, request.block_ids) + self.p2p_nccl_engine.send_tensor( + request_id + "#" + layer_name, kv_cache, remote_address + ) + + def wait_for_save(self): + if self.is_producer: + assert self.p2p_nccl_engine is not None + self.p2p_nccl_engine.wait_for_sent() + + def get_finished( + self, finished_req_ids: set[str], **kwargs: Any + ) -> tuple[set[str] | None, set[str] | None]: + """ + Notifies worker-side connector ids of requests that have + finished generating tokens. + + Returns: + ids of requests that have finished asynchronous transfer, + tuple of (sending/saving ids, recving/loading ids). 
+ The finished saves/sends req ids must belong to a set provided in a + call to this method (this call or a prior one). + """ + + assert self.p2p_nccl_engine is not None + + no_compile_layers = self._vllm_config.compilation_config.static_forward_context + return self.p2p_nccl_engine.get_finished(finished_req_ids, no_compile_layers) + + # ============================== + # Scheduler-side methods + # ============================== + + def get_num_new_matched_tokens( + self, + request: "Request", + num_computed_tokens: int, + ) -> tuple[int, bool]: + """ + Get number of new tokens that can be loaded from the + external KV cache beyond the num_computed_tokens. + + Args: + request (Request): the request object. + num_computed_tokens (int): the number of locally + computed tokens for this request + + Returns: + the number of tokens that can be loaded from the + external KV cache beyond what is already computed. + """ + if self.is_producer: + return 0, False + + prompt_token_ids = request.prompt_token_ids or [] + num_external_tokens = len(prompt_token_ids) - 1 - num_computed_tokens + + if num_external_tokens < 0: + num_external_tokens = 0 + + return num_external_tokens, False + + def update_state_after_alloc( + self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int + ): + """ + Update KVConnector state after block allocation. + """ + if not self.is_producer and num_external_tokens > 0: + self._requests_need_load[request.request_id] = ( + request, + blocks.get_block_ids()[0], + ) + + def build_connector_meta( + self, + scheduler_output: SchedulerOutput, + ) -> KVConnectorMetadata: + """Build the connector metadata for this step. + + This function should NOT modify any fields in the scheduler_output. + Also, calling this function will reset the state of the connector. + + Args: + scheduler_output (SchedulerOutput): the scheduler output object. 
+ """ + + meta = P2pNcclConnectorMetadata() + + for new_req in scheduler_output.scheduled_new_reqs: + if self.is_producer: + num_scheduled_tokens = (scheduler_output.num_scheduled_tokens)[ + new_req.req_id + ] + num_tokens = num_scheduled_tokens + new_req.num_computed_tokens + # the request's prompt is chunked prefill + if num_tokens < len(new_req.prompt_token_ids or []): + # 'CachedRequestData' has no attribute 'prompt_token_ids' + self.chunked_prefill[new_req.req_id] = ( + new_req.block_ids[0], + new_req.prompt_token_ids, + ) + continue + # the request's prompt is not chunked prefill + meta.add_request( + request_id=new_req.req_id, + token_ids=new_req.prompt_token_ids or [], + block_ids=new_req.block_ids[0], + block_size=self._block_size, + ) + continue + if new_req.req_id in self._requests_need_load: + meta.add_request( + request_id=new_req.req_id, + token_ids=new_req.prompt_token_ids or [], + block_ids=new_req.block_ids[0], + block_size=self._block_size, + ) + self._requests_need_load.pop(new_req.req_id) + + cached_reqs = scheduler_output.scheduled_cached_reqs + for i, req_id in enumerate(cached_reqs.req_ids): + num_computed_tokens = cached_reqs.num_computed_tokens[i] + new_block_ids = cached_reqs.new_block_ids[i] + resumed_from_preemption = req_id in cached_reqs.resumed_req_ids + + if self.is_producer: + num_scheduled_tokens = scheduler_output.num_scheduled_tokens[req_id] + num_tokens = num_scheduled_tokens + num_computed_tokens + assert req_id in self.chunked_prefill + assert new_block_ids is not None + block_ids = new_block_ids[0] + if not resumed_from_preemption: + block_ids = self.chunked_prefill[req_id][0] + block_ids + prompt_token_ids = self.chunked_prefill[req_id][1] + assert prompt_token_ids is not None + # the request's prompt is chunked prefill again + if num_tokens < len(prompt_token_ids): + self.chunked_prefill[req_id] = (block_ids, prompt_token_ids) + continue + # the request's prompt is all prefilled finally + meta.add_request( + 
request_id=req_id, + token_ids=prompt_token_ids, + block_ids=block_ids, + block_size=self._block_size, + ) + self.chunked_prefill.pop(req_id, None) + continue + + # NOTE(rob): here we rely on the resumed requests being + # the first N requests in the list scheduled_cache_reqs. + if not resumed_from_preemption: + break + if req_id in self._requests_need_load: + request, _ = self._requests_need_load.pop(req_id) + total_tokens = num_computed_tokens + 1 + token_ids = request.all_token_ids[:total_tokens] + + # NOTE(rob): For resumed req, new_block_ids is all + # of the block_ids for the request. + assert new_block_ids is not None + block_ids = new_block_ids[0] + + meta.add_request( + request_id=req_id, + token_ids=token_ids, + block_ids=block_ids, + block_size=self._block_size, + ) + + self._requests_need_load.clear() + return meta + + def request_finished( + self, + request: "Request", + block_ids: list[int], + ) -> tuple[bool, dict[str, Any] | None]: + """ + Called when a request has finished, before its blocks are freed. + + Returns: + True if the request is being saved/sent asynchronously and blocks + should not be freed until the request_id is returned from + get_finished(). + Optional KVTransferParams to be included in the request outputs + returned by the engine. 
+ """ + + self.chunked_prefill.pop(request.request_id, None) + + return False, None + + # ============================== + # Static methods + # ============================== + + @staticmethod + def parse_request_id(request_id: str, is_prefill=True) -> tuple[str, int]: + # Regular expression to match the string hostname and integer port + if is_prefill: + pattern = r"___decode_addr_(.*):(\d+)" + else: + pattern = r"___prefill_addr_(.*):(\d+)___" + + # Use re.search to find the pattern in the request_id + match = re.search(pattern, request_id) + if match: + # Extract the ranks + ip = match.group(1) + port = int(match.group(2)) + + return ip, port + raise ValueError(f"Request id {request_id} does not contain hostname and port") + + @staticmethod + def check_tensors_except_dim(tensor1, tensor2, dim): + shape1 = tensor1.size() + shape2 = tensor2.size() + + if len(shape1) != len(shape2) or not all( + s1 == s2 for i, (s1, s2) in enumerate(zip(shape1, shape2)) if i != dim + ): + raise NotImplementedError( + "Currently, only symmetric TP is supported. Asymmetric TP, PP," + "and others will be supported in future PRs." 
+ ) diff --git a/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py new file mode 100644 index 0000000..0e748db --- /dev/null +++ b/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -0,0 +1,632 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json +import logging +import os +import threading +import time +from collections import deque +from contextlib import contextmanager +from dataclasses import dataclass +from typing import Any + +import msgpack +import torch +import zmq + +from vllm.config.kv_transfer import KVTransferConfig +from vllm.distributed.device_communicators.pynccl_wrapper import ( + NCCLLibrary, + buffer_type, + cudaStream_t, + ncclComm_t, + ncclDataTypeEnum, +) +from vllm.distributed.kv_transfer.kv_connector.v1.p2p.tensor_memory_pool import ( # noqa: E501 + TensorMemoryPool, +) +from vllm.utils.network_utils import get_ip +from vllm.utils.torch_utils import current_stream + +logger = logging.getLogger(__name__) + +DEFAULT_MEM_POOL_SIZE_GB = 32 + + +@contextmanager +def set_p2p_nccl_context(num_channels: str): + original_values: dict[str, Any] = {} + env_vars = [ + "NCCL_MAX_NCHANNELS", + "NCCL_MIN_NCHANNELS", + "NCCL_CUMEM_ENABLE", + "NCCL_BUFFSIZE", + "NCCL_PROTO", # LL,LL128,SIMPLE + "NCCL_ALGO", # RING,TREE + ] + + for var in env_vars: + original_values[var] = os.environ.get(var) + + logger.info("set_p2p_nccl_context, original_values: %s", original_values) + + try: + os.environ["NCCL_MAX_NCHANNELS"] = num_channels + os.environ["NCCL_MIN_NCHANNELS"] = num_channels + os.environ["NCCL_CUMEM_ENABLE"] = "1" + yield + finally: + for var in env_vars: + if original_values[var] is not None: + os.environ[var] = original_values[var] + else: + os.environ.pop(var, None) + + +@dataclass +class SendQueueItem: + tensor_id: str + remote_address: str + tensor: torch.Tensor + + +class P2pNcclEngine: + 
def __init__( + self, + local_rank: int, + config: KVTransferConfig, + hostname: str = "", + port_offset: int = 0, + library_path: str | None = None, + ) -> None: + self.config = config + self.rank = port_offset + self.local_rank = local_rank + self.device = torch.device(f"cuda:{self.local_rank}") + self.nccl = NCCLLibrary(library_path) + + if not hostname: + hostname = get_ip() + port = int(self.config.kv_port) + port_offset + if port == 0: + raise ValueError("Port cannot be 0") + self._hostname = hostname + self._port = port + + # Each card corresponds to a ZMQ address. + self.zmq_address = f"{self._hostname}:{self._port}" + + # If `proxy_ip` or `proxy_port` is `""`, + # then the ping thread will not be enabled. + proxy_ip = self.config.get_from_extra_config("proxy_ip", "") + proxy_port = self.config.get_from_extra_config("proxy_port", "") + if proxy_ip == "" or proxy_port == "": + self.proxy_address = "" + self.http_address = "" + else: + self.proxy_address = proxy_ip + ":" + proxy_port + # the `http_port` must be consistent with the port of OpenAI. + http_port = self.config.get_from_extra_config("http_port", None) + if http_port is None: + example_cfg = { + "kv_connector": "P2pNcclConnector", + "kv_connector_extra_config": {"http_port": 8000}, + } + example = ( + f"--port=8000 --kv-transfer-config='{json.dumps(example_cfg)}'" + ) + raise ValueError( + "kv_connector_extra_config.http_port is required. 
" + f"Example: {example}" + ) + self.http_address = f"{self._hostname}:{http_port}" + + self.context = zmq.Context() + self.router_socket = self.context.socket(zmq.ROUTER) + self.router_socket.bind(f"tcp://{self.zmq_address}") + + self.poller = zmq.Poller() + self.poller.register(self.router_socket, zmq.POLLIN) + + self.send_store_cv = threading.Condition() + self.send_queue_cv = threading.Condition() + self.recv_store_cv = threading.Condition() + + self.send_stream = torch.cuda.Stream() + self.recv_stream = torch.cuda.Stream() + + mem_pool_size_gb = float( + self.config.get_from_extra_config( + "mem_pool_size_gb", DEFAULT_MEM_POOL_SIZE_GB + ) + ) + self.pool = TensorMemoryPool( + max_block_size=int(mem_pool_size_gb * 1024**3) + ) # GB + + # The sending type includes tree mutually exclusive options: + # PUT, GET, PUT_ASYNC. + self.send_type = self.config.get_from_extra_config("send_type", "PUT_ASYNC") + if self.send_type == "GET": + # tensor_id: torch.Tensor + self.send_store: dict[str, torch.Tensor] = {} + else: + # PUT or PUT_ASYNC + # tensor_id: torch.Tensor + self.send_queue: deque[SendQueueItem] = deque() + if self.send_type == "PUT_ASYNC": + self._send_thread = threading.Thread( + target=self.send_async, daemon=True + ) + self._send_thread.start() + + # tensor_id: torch.Tensor/(addr, dtype, shape) + self.recv_store: dict[str, Any] = {} + self.recv_request_id_to_tensor_ids: dict[str, set[str]] = {} + self.send_request_id_to_tensor_ids: dict[str, set[str]] = {} + self.socks: dict[str, Any] = {} # remote_address: client socket + self.comms: dict[str, Any] = {} # remote_address: (ncclComm_t, rank) + + self.buffer_size = 0 + self.buffer_size_threshold = float(self.config.kv_buffer_size) + + self.nccl_num_channels = self.config.get_from_extra_config( + "nccl_num_channels", "8" + ) + + self._listener_thread = threading.Thread( + target=self.listen_for_requests, daemon=True + ) + self._listener_thread.start() + + self._ping_thread = None + if port_offset == 0 and 
self.proxy_address != "": + self._ping_thread = threading.Thread(target=self.ping, daemon=True) + self._ping_thread.start() + + logger.info( + "💯P2pNcclEngine init, rank:%d, local_rank:%d, http_address:%s, " + "zmq_address:%s, proxy_address:%s, send_type:%s, buffer_size_" + "threshold:%.2f, nccl_num_channels:%s", + self.rank, + self.local_rank, + self.http_address, + self.zmq_address, + self.proxy_address, + self.send_type, + self.buffer_size_threshold, + self.nccl_num_channels, + ) + + def create_connect(self, remote_address: str | None = None): + assert remote_address is not None + if remote_address not in self.socks: + sock = self.context.socket(zmq.DEALER) + sock.setsockopt_string(zmq.IDENTITY, self.zmq_address) + sock.connect(f"tcp://{remote_address}") + self.socks[remote_address] = sock + if remote_address in self.comms: + logger.info( + "👋comm exists, remote_address:%s, comms:%s", + remote_address, + self.comms, + ) + return sock, self.comms[remote_address] + + unique_id = self.nccl.ncclGetUniqueId() + data = {"cmd": "NEW", "unique_id": bytes(unique_id.internal)} + sock.send(msgpack.dumps(data)) + + with torch.cuda.device(self.device): + rank = 0 + with set_p2p_nccl_context(self.nccl_num_channels): + comm: ncclComm_t = self.nccl.ncclCommInitRank(2, unique_id, rank) + self.comms[remote_address] = (comm, rank) + logger.info( + "🤝ncclCommInitRank Success, %s👉%s, MyRank:%s", + self.zmq_address, + remote_address, + rank, + ) + + return self.socks[remote_address], self.comms[remote_address] + + def send_tensor( + self, + tensor_id: str, + tensor: torch.Tensor, + remote_address: str | None = None, + ) -> bool: + if remote_address is None: + with self.recv_store_cv: + self.recv_store[tensor_id] = tensor + self.recv_store_cv.notify() + return True + + item = SendQueueItem( + tensor_id=tensor_id, remote_address=remote_address, tensor=tensor + ) + + if self.send_type == "PUT": + return self.send_sync(item) + + if self.send_type == "PUT_ASYNC": + with 
self.send_queue_cv: + self.send_queue.append(item) + self.send_queue_cv.notify() + return True + + # GET + with self.send_store_cv: + tensor_size = tensor.element_size() * tensor.numel() + if tensor_size > self.buffer_size_threshold: + logger.warning( + "❗[GET]tensor_id:%s, tensor_size:%d, is greater than" + "buffer size threshold :%d, skip send to %s, rank:%d", + tensor_id, + tensor_size, + self.buffer_size_threshold, + remote_address, + self.rank, + ) + return False + while self.buffer_size + tensor_size > self.buffer_size_threshold: + assert len(self.send_store) > 0 + oldest_tensor_id = next(iter(self.send_store)) + oldest_tensor = self.send_store.pop(oldest_tensor_id) + oldest_tensor_size = ( + oldest_tensor.element_size() * oldest_tensor.numel() + ) + self.buffer_size -= oldest_tensor_size + logger.debug( + "⛔[GET]Send to %s, tensor_id:%s, tensor_size:%d," + " buffer_size:%d, oldest_tensor_size:%d, rank:%d", + remote_address, + tensor_id, + tensor_size, + self.buffer_size, + oldest_tensor_size, + self.rank, + ) + + self.send_store[tensor_id] = tensor + self.buffer_size += tensor_size + logger.debug( + "🔵[GET]Send to %s, tensor_id:%s, tensor_size:%d, " + "shape:%s, rank:%d, buffer_size:%d(%.2f%%)", + remote_address, + tensor_id, + tensor_size, + tensor.shape, + self.rank, + self.buffer_size, + self.buffer_size / self.buffer_size_threshold * 100, + ) + return True + + def recv_tensor( + self, + tensor_id: str, + remote_address: str | None = None, + ) -> torch.Tensor: + if self.send_type == "PUT" or self.send_type == "PUT_ASYNC": + start_time = time.time() + with self.recv_store_cv: + while tensor_id not in self.recv_store: + self.recv_store_cv.wait() + tensor = self.recv_store[tensor_id] + + if tensor is not None: + if isinstance(tensor, tuple): + addr, dtype, shape = tensor + tensor = self.pool.load_tensor(addr, dtype, shape, self.device) + else: + self.buffer_size -= tensor.element_size() * tensor.numel() + else: + duration = time.time() - start_time + 
logger.warning( + "🔴[PUT]Recv From %s, tensor_id:%s, duration:%.3fms, rank:%d", + remote_address, + tensor_id, + duration * 1000, + self.rank, + ) + return tensor + + # GET + if remote_address is None: + return None + + if remote_address not in self.socks: + self.create_connect(remote_address) + + sock = self.socks[remote_address] + comm, rank = self.comms[remote_address] + + data = {"cmd": "GET", "tensor_id": tensor_id} + sock.send(msgpack.dumps(data)) + + message = sock.recv() + data = msgpack.loads(message) + if data["ret"] != 0: + logger.warning( + "🔴[GET]Recv From %s, tensor_id: %s, ret: %d", + remote_address, + tensor_id, + data["ret"], + ) + return None + + with torch.cuda.stream(self.recv_stream): + tensor = torch.empty( + data["shape"], dtype=getattr(torch, data["dtype"]), device=self.device + ) + + self.recv(comm, tensor, rank ^ 1, self.recv_stream) + + return tensor + + def listen_for_requests(self): + while True: + socks = dict(self.poller.poll()) + if self.router_socket not in socks: + continue + + remote_address, message = self.router_socket.recv_multipart() + data = msgpack.loads(message) + if data["cmd"] == "NEW": + unique_id = self.nccl.unique_id_from_bytes(bytes(data["unique_id"])) + with torch.cuda.device(self.device): + rank = 1 + with set_p2p_nccl_context(self.nccl_num_channels): + comm: ncclComm_t = self.nccl.ncclCommInitRank( + 2, unique_id, rank + ) + self.comms[remote_address.decode()] = (comm, rank) + logger.info( + "🤝ncclCommInitRank Success, %s👈%s, MyRank:%s", + self.zmq_address, + remote_address.decode(), + rank, + ) + elif data["cmd"] == "PUT": + tensor_id = data["tensor_id"] + try: + with torch.cuda.stream(self.recv_stream): + tensor = torch.empty( + data["shape"], + dtype=getattr(torch, data["dtype"]), + device=self.device, + ) + self.router_socket.send_multipart([remote_address, b"0"]) + comm, rank = self.comms[remote_address.decode()] + self.recv(comm, tensor, rank ^ 1, self.recv_stream) + tensor_size = tensor.element_size() * 
def have_sent_tensor_id(self, tensor_id: str):
    """Record ``tensor_id`` under the request it belongs to.

    The request id is the part of ``tensor_id`` before the first ``"#"``
    (the whole string when there is no ``"#"``). The id is added to that
    request's set in ``send_request_id_to_tensor_ids``, creating the set on
    first use.
    """
    owner, _, _ = tensor_id.partition("#")
    self.send_request_id_to_tensor_ids.setdefault(owner, set()).add(tensor_id)
self.recv_request_id_to_tensor_ids[request_id].add(tensor_id) + + def send_async(self): + while True: + with self.send_queue_cv: + while not self.send_queue: + self.send_queue_cv.wait() + item = self.send_queue.popleft() + if not self.send_queue: + self.send_queue_cv.notify() + self.send_sync(item) + + def wait_for_sent(self): + if self.send_type == "PUT_ASYNC": + start_time = time.time() + with self.send_queue_cv: + while self.send_queue: + self.send_queue_cv.wait() + duration = time.time() - start_time + logger.debug( + "🚧[PUT_ASYNC]It took %.3fms to wait for the send_queue" + " to be empty, rank:%d", + duration * 1000, + self.rank, + ) + + def send_sync(self, item: SendQueueItem) -> bool: + if item.remote_address is None: + return False + if item.remote_address not in self.socks: + self.create_connect(item.remote_address) + + tensor = item.tensor + + sock = self.socks[item.remote_address] + comm, rank = self.comms[item.remote_address] + data = { + "cmd": "PUT", + "tensor_id": item.tensor_id, + "shape": tensor.shape, + "dtype": str(tensor.dtype).replace("torch.", ""), + } + sock.send(msgpack.dumps(data)) + + response = sock.recv() + if response != b"0": + logger.error( + "🔴Send Tensor, Peer Out Of Memory/Threshold, %s 👉 %s, " + "MyRank:%s, data:%s, tensor:%s, size:%fGB, response:%s", + self.zmq_address, + item.remote_address, + rank, + data, + tensor.shape, + tensor.element_size() * tensor.numel() / 1024**3, + response.decode(), + ) + return False + + self.send(comm, tensor.to(self.device), rank ^ 1, self.send_stream) + + if self.send_type == "PUT_ASYNC": + self.have_sent_tensor_id(item.tensor_id) + + return True + + def get_finished( + self, finished_req_ids: set[str], no_compile_layers + ) -> tuple[set[str] | None, set[str] | None]: + """ + Notifies worker-side connector ids of requests that have + finished generating tokens. + + Returns: + ids of requests that have finished asynchronous transfer, + tuple of (sending/saving ids, recving/loading ids). 
+ The finished saves/sends req ids must belong to a set provided in a + call to this method (this call or a prior one). + """ + + # Clear the buffer upon request completion. + for request_id in finished_req_ids: + for layer_name in no_compile_layers: + tensor_id = request_id + "#" + layer_name + if tensor_id in self.recv_store: + with self.recv_store_cv: + tensor = self.recv_store.pop(tensor_id, None) + self.send_request_id_to_tensor_ids.pop(request_id, None) + self.recv_request_id_to_tensor_ids.pop(request_id, None) + if isinstance(tensor, tuple): + addr, _, _ = tensor + self.pool.free(addr) + + # TODO:Retrieve requests that have already sent the KV cache. + finished_sending: set[str] = set() + + # TODO:Retrieve requests that have already received the KV cache. + finished_recving: set[str] = set() + + return finished_sending or None, finished_recving or None + + def ping(self): + sock = self.context.socket(zmq.DEALER) + sock.setsockopt_string(zmq.IDENTITY, self.zmq_address) + logger.debug("ping start, zmq_address:%s", self.zmq_address) + sock.connect(f"tcp://{self.proxy_address}") + data = { + "type": "P" if self.config.is_kv_producer else "D", + "http_address": self.http_address, + "zmq_address": self.zmq_address, + } + while True: + sock.send(msgpack.dumps(data)) + time.sleep(3) + + def send(self, comm, tensor: torch.Tensor, dst: int, stream=None): + assert tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {tensor.device}" + ) + if stream is None: + stream = current_stream() + + with torch.cuda.stream(stream): + self.nccl.ncclSend( + buffer_type(tensor.data_ptr()), + tensor.numel(), + ncclDataTypeEnum.from_torch(tensor.dtype), + dst, + comm, + cudaStream_t(stream.cuda_stream), + ) + stream.synchronize() + + def recv(self, comm, tensor: torch.Tensor, src: int, stream=None): + assert tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + 
f"but the input tensor is on {tensor.device}" + ) + if stream is None: + stream = current_stream() + + with torch.cuda.stream(stream): + self.nccl.ncclRecv( + buffer_type(tensor.data_ptr()), + tensor.numel(), + ncclDataTypeEnum.from_torch(tensor.dtype), + src, + comm, + cudaStream_t(stream.cuda_stream), + ) + stream.synchronize() + + def close(self) -> None: + self._listener_thread.join() + if self.send_type == "PUT_ASYNC": + self._send_thread.join() + if self._ping_thread is not None: + self._ping_thread.join() diff --git a/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py b/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py new file mode 100644 index 0000000..899f1ea --- /dev/null +++ b/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py @@ -0,0 +1,273 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import atexit +import ctypes +import math +from dataclasses import dataclass + +import torch + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +@dataclass +class MemoryBlock: + size: int + addr: int + + +"""A memory pool for managing pinned host memory allocations for tensors. + +This class implements a buddy allocation system to efficiently manage pinned +host memory for tensor storage. It supports allocation, deallocation, and +tensor storage/retrieval operations. 
class TensorMemoryPool:
    """Buddy allocator over one pinned host-memory region.

    The pool reserves a single pinned tensor of ``max_block_size`` bytes and
    hands out power-of-two sized blocks from it. Blocks are split on
    allocation and merged with their free buddy on release. CUDA tensors can
    be copied into a block (``store_tensor``) and copied back to a device
    later (``load_tensor``).

    Attributes:
        max_block_size (int): Pool size, rounded up to a power of two.
        min_block_size (int): Smallest block size, rounded up to a power of
            two.
        free_lists (dict): ``size -> {addr -> MemoryBlock}`` of free blocks.
        allocated_blocks (dict): ``addr -> MemoryBlock`` of blocks in use.
        base_tensor (torch.Tensor): The pinned backing tensor.
        base_address (int): ``data_ptr()`` of ``base_tensor``.

    Example:
        >>> pool = TensorMemoryPool(max_block_size=1024*1024)
        >>> tensor = torch.randn(100, device='cuda')
        >>> addr = pool.store_tensor(tensor)
        >>> loaded_tensor = pool.load_tensor(addr, tensor.dtype,
        ...                                  tensor.shape, 'cuda')
        >>> pool.free(addr)
    """

    def __init__(self, max_block_size: int, min_block_size: int = 512):
        """Initializes the memory pool with given size constraints.

        Args:
            max_block_size (int): Maximum size of memory blocks to manage
            min_block_size (int, optional): Minimum size of memory blocks
                to manage. Defaults to 512.

        Raises:
            ValueError: If block sizes are invalid or max_block_size is less
                than min_block_size
        """
        if max_block_size <= 0 or min_block_size <= 0:
            raise ValueError("Block sizes must be positive")
        if max_block_size < min_block_size:
            raise ValueError(
                "Max block size must be greater than or equal to min block size"
            )

        self.max_block_size = self._round_to_power_of_two(max_block_size)
        self.min_block_size = self._round_to_power_of_two(min_block_size)

        self.free_lists: dict[int, dict[int, MemoryBlock]] = {}
        self.allocated_blocks: dict[int, MemoryBlock] = {}

        self._initialize_free_lists()
        self._allocate_pinned_memory()

        # NOTE: atexit keeps a reference to the bound method, and therefore
        # to this pool, until interpreter shutdown.
        atexit.register(self.cleanup)

    def _round_to_power_of_two(self, size: int) -> int:
        """Return the smallest power of two that is >= ``size``."""
        return 1 << (size - 1).bit_length()

    def _initialize_free_lists(self):
        """Create an empty free list for every block size from max to min."""
        size = self.max_block_size
        while size >= self.min_block_size:
            self.free_lists[size] = {}
            size //= 2

    def _allocate_pinned_memory(self):
        """Reserve the pinned region and register it as one free block."""
        # float32 elements are 4 bytes, hence max_block_size // 4 elements.
        self.base_tensor = torch.empty(
            self.max_block_size // 4, dtype=torch.float32, pin_memory=True
        )
        self.base_address = self.base_tensor.data_ptr()
        initial_block = MemoryBlock(size=self.max_block_size, addr=self.base_address)
        self.free_lists[self.max_block_size][initial_block.addr] = initial_block

        logger.debug(
            "TensorMemoryPool, base_address:%d, max_block_size:%d",
            self.base_address,
            self.max_block_size,
        )

    def allocate(self, size: int) -> int:
        """Allocates a memory block of at least the requested size.

        Args:
            size (int): Minimum size of memory to allocate

        Returns:
            int: Address of the allocated memory block

        Raises:
            ValueError: If size is invalid or insufficient memory is available
        """
        if size <= 0:
            raise ValueError("Allocation size must be positive")

        required_size = self._round_to_power_of_two(max(size, self.min_block_size))
        if required_size > self.max_block_size:
            raise ValueError("Requested size exceeds maximum block size")

        # Take the smallest free block that fits and split off the excess.
        current_size = required_size
        while current_size <= self.max_block_size:
            if self.free_lists[current_size]:
                _, block = self.free_lists[current_size].popitem()
                self._split_block(block, required_size)
                self.allocated_blocks[block.addr] = block
                return block.addr
            current_size *= 2

        raise ValueError("Insufficient memory")

    def _split_block(self, block: MemoryBlock, required_size: int):
        """Halve ``block`` in place until it is ``required_size`` bytes.

        The upper half produced by each split is put on the free list of its
        size; ``block`` keeps its address and shrinks.
        """
        while block.size > required_size and block.size // 2 >= self.min_block_size:
            buddy_size = block.size // 2
            buddy_addr = block.addr + buddy_size

            buddy = MemoryBlock(size=buddy_size, addr=buddy_addr)
            block.size = buddy_size

            self.free_lists[buddy_size][buddy.addr] = buddy

    def free(self, addr: int):
        """Frees an allocated memory block.

        Args:
            addr (int): Address of the block to free

        Raises:
            ValueError: If address is invalid or not allocated
        """
        if addr not in self.allocated_blocks:
            raise ValueError("Invalid address to free")

        block = self.allocated_blocks.pop(addr)
        self._merge_buddies(block)

    def _merge_buddies(self, block: MemoryBlock):
        """Merge ``block`` with its free buddy repeatedly, then free it.

        Merging stops when the buddy of the current block is not free or the
        block already spans the whole pool.
        """
        while block.size < self.max_block_size:
            # A block whose offset is a multiple of twice its size is the
            # lower half of its pair; its buddy sits right above it.
            # Otherwise the buddy sits right below.
            is_lower_half = (block.addr - self.base_address) % (2 * block.size) == 0
            buddy_addr = block.addr + (block.size if is_lower_half else -block.size)
            buddy = self.free_lists[block.size].get(buddy_addr)
            if buddy is None:
                break
            del self.free_lists[buddy.size][buddy.addr]
            block = MemoryBlock(
                size=block.size * 2, addr=min(block.addr, buddy.addr)
            )
        self.free_lists[block.size][block.addr] = block

    def store_tensor(self, tensor: torch.Tensor) -> int:
        """Stores a CUDA tensor in pinned host memory.

        Args:
            tensor (torch.Tensor): CUDA tensor to store

        Returns:
            int: Address where the tensor is stored

        Raises:
            ValueError: If tensor is not on CUDA or allocation fails
        """
        if not tensor.is_cuda:
            raise ValueError("Only CUDA tensors can be stored")

        size = tensor.element_size() * tensor.numel()
        addr = self.allocate(size)
        block = self.allocated_blocks[addr]

        if block.size < size:
            self.free(addr)
            raise ValueError(
                f"Allocated block size {block.size} is smaller than "
                f"required size {size}"
            )

        try:
            # View the block's raw bytes as a tensor of the source dtype.
            buffer = (ctypes.c_byte * block.size).from_address(block.addr)
            cpu_tensor = torch.frombuffer(
                buffer, dtype=tensor.dtype, count=tensor.numel()
            ).reshape(tensor.shape)
        except ValueError as err:
            self.free(addr)
            raise ValueError(f"Failed to create tensor view: {err}") from err

        cpu_tensor.copy_(tensor)

        return addr

    def load_tensor(
        self,
        addr: int,
        dtype: torch.dtype,
        shape: tuple[int, ...],
        device: torch.device,
    ) -> torch.Tensor:
        """Loads a tensor from pinned host memory to the specified device.

        The block stays allocated; release it with ``free`` when done.

        Args:
            addr (int): Address where tensor is stored
            dtype (torch.dtype): Data type of the tensor
            shape (tuple[int, ...]): Shape of the tensor
            device: Target device for the loaded tensor

        Returns:
            torch.Tensor: The loaded tensor on the specified device

        Raises:
            ValueError: If address is invalid or sizes don't match
        """
        if addr not in self.allocated_blocks:
            raise ValueError("Invalid address to load")

        block = self.allocated_blocks[addr]
        num_elements = math.prod(shape)
        dtype_size = torch.tensor([], dtype=dtype).element_size()
        required_size = num_elements * dtype_size

        if required_size > block.size:
            raise ValueError("Requested tensor size exceeds block size")

        buffer = (ctypes.c_byte * block.size).from_address(block.addr)
        cpu_tensor = torch.frombuffer(buffer, dtype=dtype, count=num_elements).reshape(
            shape
        )

        cuda_tensor = torch.empty(shape, dtype=dtype, device=device)

        cuda_tensor.copy_(cpu_tensor)

        return cuda_tensor

    def cleanup(self):
        """Cleans up all memory resources and resets the pool state.

        Safe to call more than once and on an instance whose ``__init__``
        raised before its attributes were created (``__del__`` still runs
        for such instances).
        """
        for table_name in ("free_lists", "allocated_blocks"):
            table = getattr(self, table_name, None)
            if table is not None:
                table.clear()
        if hasattr(self, "base_tensor"):
            del self.base_tensor

    def __del__(self):
        self.cleanup()
@dataclass
class ReqMeta:
    """Tokens and KV-cache slots of one request, plus the transfer direction."""

    # Request tokens
    token_ids: torch.Tensor
    # Slot mappings, should have the same length as token_ids
    slot_mapping: torch.Tensor
    # Is store or load
    is_store: bool
    mm_hashes: list[str]

    @staticmethod
    def make_meta(
        token_ids: list[int],
        block_ids: list[int],
        block_size: int,
        is_store: bool,
        mm_hashes: list[str],
    ) -> "ReqMeta":
        """Build the metadata for one request.

        Both the token ids and the slot mapping are cut to the token count
        returned by ``align_to_block_size(len(token_ids), block_size)``.

        Args:
            token_ids: Token ids of the request.
            block_ids: Ids of the KV-cache blocks assigned to the request.
            block_size: Number of slots per block.
            is_store: ``True`` to store the KV cache, ``False`` to load it.
            mm_hashes: Passed through unchanged.
        """
        usable_tokens = align_to_block_size(len(token_ids), block_size)
        tokens = torch.tensor(token_ids)[:usable_tokens]

        blocks = torch.tensor(block_ids)
        # Row i lists the slots of block i:
        # block_ids[i] * block_size + [0, block_size).
        first_slot_per_block = blocks.reshape((blocks.shape[0], 1)) * block_size
        offsets_in_block = torch.arange(0, block_size).reshape((1, block_size))
        all_slots = (offsets_in_block + first_slot_per_block).flatten()

        return ReqMeta(
            token_ids=tokens,
            slot_mapping=all_slots[:usable_tokens],
            is_store=is_store,
            mm_hashes=mm_hashes,
        )
def start_load_kv(self, forward_context: "ForwardContext", **kwargs: Any) -> None:
    """Start loading the KV cache from the connector buffer to vLLM's
    paged KV buffer.

    For every request in the connector metadata that is marked as a load
    (``is_store`` is false), the saved ``"kv_cache"`` tensor of each layer
    that has a ``kv_cache`` attribute is read from its safetensors file and
    written into that layer's paged cache at the request's slot mapping.

    Args:
        forward_context (ForwardContext): the forward context.
        **kwargs: additional arguments for the load operation

    Note:
        The number of elements in kv_caches and layer_names should be
        the same.
    """
    attn_metadata = forward_context.attn_metadata

    def inject_kv_into_layer(
        dst_kv_cache_layer: torch.Tensor,
        src_kv_cache: torch.Tensor,
        slot_mapping: torch.Tensor,
    ) -> None:
        """Inject the KV cache into the layer.

        Args:
            dst_kv_cache_layer (torch.Tensor): the destination KV cache
                layer. In shape [2, num_pages, page_size, xxx] if not
                using MLA, [num_pages, page_size, xxx] otherwise.
            src_kv_cache (torch.Tensor): the source KV cache. In shape
                [2, num_tokens, xxx] if not using MLA, [num_tokens, xxx]
                otherwise.
            slot_mapping (torch.Tensor): the slot mapping. In shape
                [num_tokens].
        """
        # NOTE(review): the write goes through the tensor returned by
        # reshape(); it only reaches the paged cache when reshape() returns
        # a view of it. Confirm the cache layout always allows that.
        dst_shape = dst_kv_cache_layer.shape
        if isinstance(attn_metadata, MLACommonMetadata):
            num_pages, page_size = dst_shape[0], dst_shape[1]
            flat_dst = dst_kv_cache_layer.reshape(num_pages * page_size, -1)
            flat_dst[slot_mapping, ...] = src_kv_cache
        else:
            num_pages, page_size = dst_shape[1], dst_shape[2]
            flat_dst = dst_kv_cache_layer.reshape(2, num_pages * page_size, -1)
            flat_dst[:, slot_mapping, ...] = src_kv_cache

    # Get the metadata
    metadata: KVConnectorMetadata = self._get_connector_metadata()

    # Check for None before the type assertion so this path is reachable.
    if metadata is None:
        logger.warning(
            "In connector.start_load_kv, but the connector metadata is None"
        )
        return
    assert isinstance(metadata, SharedStorageConnectorMetadata)

    if attn_metadata is None:
        logger.warning("In connector.start_load_kv, but the attn_metadata is None")
        return

    # Load the KV for each request each layer
    for request in metadata.requests:
        if request.is_store:
            continue
        logger.info(
            "Inject KV cache of %d tokens to the paged memory",
            len(request.slot_mapping),
        )
        for layer_name, layer in forward_context.no_compile_layers.items():
            # Only process layers that have kv_cache
            # attribute (attention layers) Skip non-attention
            # layers like FusedMoE/MLP etc.
            kv_cache_attr = getattr(layer, "kv_cache", None)
            if kv_cache_attr is None:
                continue

            kv_cache_layer = kv_cache_attr[forward_context.virtual_engine]

            filename = self._generate_filename_debug(
                layer_name, request.token_ids, request.mm_hashes
            )
            kv_cache = safetensors.torch.load_file(filename)["kv_cache"].cuda()
            inject_kv_into_layer(kv_cache_layer, kv_cache, request.slot_mapping)
+ + connector_metadata = self._get_connector_metadata() + assert isinstance(connector_metadata, SharedStorageConnectorMetadata) + for request in connector_metadata.requests: + if request.is_store: + filename = self._generate_filename_debug( + layer_name, request.token_ids, request.mm_hashes + ) + kv_cache = extract_kv_from_layer(kv_layer, request.slot_mapping) + tensors = {"kv_cache": kv_cache.detach().cpu()} + safetensors.torch.save_file(tensors, filename) + + def wait_for_save(self): + return + + def get_num_new_matched_tokens( + self, + request: "Request", + num_computed_tokens: int, + ) -> tuple[int | None, bool]: + """ + Get number of new tokens that can be loaded from the + external KV cache beyond the num_computed_tokens. + + Args: + request (Request): the request object. + num_computed_tokens (int): the number of locally + computed tokens for this request + + Returns: + the number of tokens that can be loaded from the + external KV cache beyond what is already computed. + """ + # NOTE: in this debug implementation, we assume that the prompt is + # cached_prompt + newly_generated_single_token + # Therefore, we use prompt_token_ids[:-1] to determine the folder name + + # NOTE: in current v1 scheduler, the num_computed_tokens is aligned + # with the block granularity. And it expects the returned blocks and + # num_computed_tokens to also be aligned with the block granularity. 
+ if not self._found_match_for_request(request): + return 0, False + + logger.info("External Cache Hit!") + + # Now, first num_tokens_to_check tokens are hit, we need to prepare + # the metadata for the worker connector to correctly load the KV + token_ids = request.prompt_token_ids or [] + num_tokens_to_check = align_to_block_size(len(token_ids) - 1, self._block_size) + + return num_tokens_to_check - num_computed_tokens, False + + def update_state_after_alloc( + self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int + ): + """ + Update KVConnector state after block allocation. + + If blocks were allocated, add to _requests_need_load, + such that we load the KVs in the next forward pass. + """ + if num_external_tokens > 0: + self._requests_need_load[request.request_id] = request + + def build_connector_meta( + self, + scheduler_output: SchedulerOutput, + ) -> KVConnectorMetadata: + """Build the connector metadata for this step. + + This function should NOT modify any fields in the scheduler_output. + Also, calling this function will reset the state of the connector. + + Args: + scheduler_output (SchedulerOutput): the scheduler output object. + """ + meta = SharedStorageConnectorMetadata() + + total_need_load = 0 + for new_req in scheduler_output.scheduled_new_reqs: + token_ids = new_req.prompt_token_ids or [] + mm_hashes = [f.identifier for f in new_req.mm_features] + if new_req.req_id in self._requests_need_load: + meta.add_request( + token_ids=token_ids, + block_ids=new_req.block_ids[0], + block_size=self._block_size, + is_store=False, + mm_hashes=mm_hashes, + ) + total_need_load += 1 + else: + # NOTE: here, we set the store and load being exclusive, + # but a single request can have both store and load. + # NOTE(rob): for this debug implementation, we only cache + # the original prompt tokens. 
+ if not self._found_match_for_prompt(token_ids, mm_hashes): + meta.add_request( + token_ids=token_ids, + block_ids=new_req.block_ids[0], + block_size=self._block_size, + is_store=True, + mm_hashes=mm_hashes, + ) + + cached_reqs = scheduler_output.scheduled_cached_reqs + for i, req_id in enumerate(cached_reqs.req_ids): + resumed_from_preemption = req_id in cached_reqs.resumed_req_ids + if not resumed_from_preemption or req_id not in self._requests_need_load: + continue + + num_computed_tokens = cached_reqs.num_computed_tokens[i] + num_new_tokens = scheduler_output.num_scheduled_tokens[req_id] + new_block_ids = cached_reqs.new_block_ids[i] + + # NOTE(rob): cached_req_data does not have the full + # list of token ids (only new tokens). So we look it + # up in the actual request object. + request = self._requests_need_load[req_id] + total_tokens = num_computed_tokens + num_new_tokens + token_ids = request.all_token_ids[:total_tokens] + + # NOTE(rob): For resumed req, new_block_ids is all + # of the block_ids for the request. 
+ assert new_block_ids is not None + block_ids = new_block_ids[0] + + meta.add_request( + token_ids=token_ids, + block_ids=block_ids, + block_size=self._block_size, + is_store=False, + mm_hashes=[f.identifier for f in request.mm_features], + ) + total_need_load += 1 + + assert total_need_load == len(self._requests_need_load) + self._requests_need_load.clear() + return meta + + # ============================== + # Helper functions + # ============================== + + def _found_match_for_request( + self, + request: "Request", + ) -> bool: + """Check if the cache is hit for the request.""" + return self._found_match_for_prompt( + list(request.prompt_token_ids or []), + [f.identifier for f in request.mm_features], + ) + + def _found_match_for_prompt( + self, + prompt_token_ids: list[int], + mm_hashes: list[str], + ) -> bool: + num_tokens_to_check = align_to_block_size( + len(prompt_token_ids) - 1, self._block_size + ) + foldername = self._generate_foldername_debug( + torch.tensor(prompt_token_ids)[:num_tokens_to_check], + mm_hashes, + create_folder=False, + ) + return os.path.exists(foldername) + + def _generate_foldername_debug( + self, + token_ids: torch.Tensor, + mm_hashes: list[str], + create_folder=False, + ) -> str: + """Generate a folder name based on the hash of the bytes of the input + ids. + """ + token_bytes = token_ids.numpy().tobytes() + # Add mm_hashes to the bytes being hashed to avoid path traversal and + # to create a canonical key. 
+ if mm_hashes: + mm_str = "-".join(mm_hashes) + token_bytes += mm_str.encode("utf-8") + input_ids_hash = hashlib.md5(token_bytes, usedforsecurity=False).hexdigest() + + foldername = os.path.join(self._storage_path, input_ids_hash) + if create_folder: + os.makedirs(foldername, exist_ok=True) + return foldername + + def _generate_filename_debug( + self, + layer_name: str, + token_ids: torch.Tensor, + mm_hashes: list[str], + ) -> str: + """Generate a file name based on the layer name and the hash + of the bytes of the input ids. + """ + foldername = self._generate_foldername_debug( + token_ids, mm_hashes=mm_hashes, create_folder=True + ) + return os.path.join(foldername, f"{layer_name}.safetensors") + + +def align_to_block_size(num_tokens: int, block_size) -> int: + """Align the number of tokens to the block size.""" + return (num_tokens - 1) // block_size * block_size diff --git a/distributed/kv_transfer/kv_lookup_buffer/__init__.py b/distributed/kv_transfer/kv_lookup_buffer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/distributed/kv_transfer/kv_lookup_buffer/__pycache__/__init__.cpython-312.pyc b/distributed/kv_transfer/kv_lookup_buffer/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..575718de21a2443250e3c9edd3847562beb348f1 GIT binary patch literal 190 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJV#p;*j7U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?Er873ThGwfgf6%^EwM!dcbtsc6WSc@@;Or z9v40f+;Y&eoGs=y*|l5D(2W*n)T9TNOLBb%@mvI!vni)YI&miu7WdhFU3I8e%9azj zn20l5PR|;0ex^Gn`!M99Lo)bX%a_7pK_tNxu2=K9&5ZzJn!;@li>a1V`_^sWJ#zCm z16S~8@4bx0wsl9}~$BQPN`c~V6mbZ9kaa*@T&U}cb0a2$ND`>HX?psFL z3xlOi9;9?97tDPe5}gV(r8nGd&UAslh93ys2)M}_9O^fFCnpz?T70$qXEK&bwwqhI za;rpS+wT%o+GHYfway)P5&D=-%&!YYtg-*6o=$?#r z-J5Xt>t%*#C0ZfvF4_TWx&k`s*HcyYUNc~p$wTk7l!Iv$549j^_(`Uc(NCfIs_jB5 zSxT&Js6gQSX)?imHxvflnm=%y%$A#rOn_R%sxyjAc0^Zdv9N}^zgch~t2CHNrZMXtA-x9(V%Y7?t 
zhkk$qX3~mOFL)F4!$xEhN_!;h(@gjD2zixEHxIvP`lM6#j~^}tZEwl5yv0Blu(rj! z;p^Z0#@6e6$+j9xUI!-OEN$C%d&z{C39AueVM|-vHCUVD0}JR~IcH62eu>Vv>UFvW zgU)34#UX>J%7*ip-zOmav3B@^HZ%9Qbm4%`6?Rz^JP3ueRw;H%wVI>1d9Bth*J^Fo z3~hR!tkpgYbvtSqtJO@`sMSOs!^I?uPc<<`?-NuM-I3#I*%R(#=U zcDyimki**(dQ;QkmZFBXxf3t%pFmmKcc>HM1Apy`Z2TAoHWN% zJ~#%iJZd$i-7!r&a!smrjF*m>OSYfG4x2@@oSJdGQ>sjKPYnkMy@=zG*8ObH??d;< zN2G(Q@Z$-0+cF97oCN3)OCtE_Vd?y}Ta_r_jFu>YmjcsJNKt;c2xkK9)&h5nJN`2K z9KUS!O+OwZIKqw}$Zah$;abbUuZHF5q66gj5%e4k@~C@SQtXwB>_#X&LMn8Sqi(}2V zFhXc|5Dyi8L6!V&z=fmRiM%*~GKA-C%ixs-S|sy5Zdgr<%rs5145b~a1P`E=cz7ZY zb0SW;h9yChLR(J}YwxVTwJgOOSPHh=3<;$ znh+6Tsl$at9@q!?AlVnWUu5y@#4oii+0LTR?oktl@&2`Vv_LbHe~gNstUr z9XRzg&sq_i10*M{FE$VQYqYBC(iJKx#Ep@Hs0@=hG1ztG2;imFC@fU?m}rIskzNJ` zk!eu%9*VP6yhsI-WlazV5t2L-P^isIqSq~|v~l;`Fd3<;rte~ePoxl> z^GZgMPRaNTAW8BZ4?1J$dQ&Er()KS=rgi%;;@{_YmBNBKs zut(ui)pNc1w695QswCqS$3{xi4{k~OYLnH|!|QUpbzpa$;G@3gI=sF>c}^Rt60lV2 zgv?HNKw>1vN`%G1jdFhp7ix`Il4SY|Qnui2N@pQ4QC{3F^m#Mv5fzzXxy`2S>N3p7 z;t9x;_c*hT$V>|2{s%BA(agJ20Rs?eBiFk9j;CVP?KDK*xwFhx7`00kOW5?Q?5g5W zbS9~NJ|8#D4Y;XKz62{*PC1dfiP?Q+glDGO>CP)o2`{I-vr-l_Xc8x=I7!7RDk$CS z7NmB%MHNBBEY;;nKjIbYbDD}?E=Vb$pyI3e^M8wia=|mm1*^ya_p_`p_xahEAKD+= z`vtu37q#ir58l83{{9$!mEx~)`YLND&OEqx|K9!teNAd7&pw>|cyfP=zNWSE^wSx< z$;?v9-G4J9r#M%b`_t@&Uta&k^UxQCxnX)4mc;s->t1|Bmnlf#0UM|xLG zrFSKjP~wj9m~qX#IhGnT-pN-=bP+UgZKnSP$SMv0*%@K%yb1uAycR=@jxWtPuGs?1 z0MQ^g9hoXV^QtYr_fo>uof}CTzIUTMit-QcP&x)WY~#+660`W?C%z0}o46GT!6@aj z=qgpXcI1%H!h4;iZdizq#CwH-t4vPtI7_`oX~YT(9Si9wSgMTn$dI8E$O$KI{eB2$h-$ca3Q)Q&8HNF9(Ffoz+bav&*Y5%*ILX9y}_leo&b9|uEoqR7GQD`cCaL|u5oCW2OOe*Y^pq+*-AdA>D9H6V$uOuBr<;dz zKr7!3C}eeBvh?G8sy+yyJv=wp^ECC1CkD+`@oAS@Het4tU8{W zJw`L)sWe*{j#AeEm!OmthnUjK|GD9eBp9PjXqic$LqZqbOf=O$SCDK<&qw~p#SX5m zQ`&nzR+Uv8gRIIYdx+UZ4p-Kr>V%M^M5?hwN>z~#{+6l!OS)@n~ zduJ$H1eJPS7m(2uvDydOKnrW1Hh~xDwpgJ1xa&=U0DYmBUYMQAfQ#Ke>`TE|UL;K( z+H>!4hLkF;b0yt5bMLul?$0^rJLl@(d_E6?lIH$8`QPmb{hGX3g|87?KLKJ5$tZ?o zMs}o`7(<_qm;;_{n$2)AjxqcA7!Pe++L>|1nrIsjv^(aZv@`9^2r+@uuCy=XkNGLx 
zly1%hVgaDtu@>1AYjq$cptL<=%o)Uj3g{L?iE%Nj+z=D#xTc98oLALsBEFzp(p9W{AnNmR zU5uyG>J3ell|?13<`k@ny6VyAli3APofFSp6%+Bqydvs2p4H|QEE1l$o>Yl!P0ZzW zJ(;~uKVtJLSsf=8&6CXPDog+?8C6$66#eY*Y+O@hF|Dc#`J6bLpCdx!Sy{ZUn4*-M zNsY?150#7@ffwObQyk$!t=W((3i=3QpH{H^8DKi2tL6x!GoI>&q>8UqcE4JwY+Y z6{N5VCo^wDE2SKi;-QVTDbDWYZOUm=E=oD&CWV#VGMeLL&mGP_f0yEveV`llmZl{k z>^IuB9j&=Y!=F|Y@w7CrYI-)FQHPL!Pph-Qp()ohaA=a2EGR~c zIm$d4v#BYERb5S}X~PS8l1wNPmDiq$Yr2AK(xkX7V@1;pH_<>T7uV+vCsy=4&L(Qg zP4~_VHQCy&0<^9n-R>(Q%Lv{;%Zy&zeuPpS9k;D2G8XSLF--WeOoc(Z%VPZtN6Kx{ z^{4Ex#uj()XJ66J>C}-BiPl&5ki_eP_1#x+I%U@@IGSYlD>yvXynp0C$_#38UNUoW5`d`o^wIPELG}c7<8oN@dI?vxXBqA1p;9+9QhM z*NLc}$whO?+^`udf>%H(t(mK1@ z-o18h_1b;y@BVCM@=F%^`+n2hU+F#YaP+}wY4FW*?^~5n|HHEn&X$J0Qw|-kh=V`$ z{J>KQ3_KYc+hQFD{8cB~KkzX1AXNz*d@^tn+V+#Sj?h|cHMZ{gWTM>udZlaML(c=x z`a3`FFL%9B>3{8qhktOmG(2AJpLp3R1UywFxIJI;@bMMtYmFTHM;&7$?BfxBY}B>Q z_mQn7e_J`I8VxvALn~;u3m9h2z?V@G-SX;|=R|cgyv^uNYrt*>TXzhXP?#;8q-txP zv!${W^t7CkNX{hzU6NU<)=G9UiPbFeZQ^1aC*!keC1M26U$`)H=JdG<>CAEKYTR3qU>|nP6;?QpVLn5G2R@FsK(Ia8raHyK$1pNXD05tJ*UYWpH#ZZ*8qOWuQWG;M3-#8_h>c!|#-vPnMcaZh-{AP6uv-C!KEC zZ8S;L!%LFkk)(_&=fR;%h9F6Qo{y)A?}s-Q%*AMyq<9v#!HftCRoP2Vd zI$Nr@^*Qv^p+LAAvj84fcFE2;fNmJ$rrfUI z#|_Z#22RiKm1|0b5FqCLRuUmJhzU#1uweMMX%Oy%87CSIlprHD<9~+MHRLvfT9jdy z0V~|dfr<_*>?k6MaMGeTSyYNRi5;b3gtf0oC!j$~iU7rqB6F65V8B77>``^L6w%Bb z1vqVFt#_q@b=so#;Owya>(4znyXqVkwFhU9)n9+^!AW#}N7;ju*vgKw2dB7$4PLD% zfI&+T2k3SjbqFAPnQd^z^@xYP?U2zj`yqRSxq-|`0Y>&)vSDNhM%GVs!0M?#WxmK- zkqU!uLo#K`(}@qDg-YRGA;nqbc1Ynq`v=;FMt=d{si^m;)q^_L06T@j{ zN(PtA&Z)oy!G7*rM1lAakuD4LN$>nZ;CY-7Ug6`IT?;a_4A6ps!=;F@>U`!sNPBXHP$IkFKr zV!uQ;0#W;Acq1@uzZ}{K9I{`AHUdNT%g9Dx)W9o2`9Shoj}q zEtUhsZbHHKs+;q*e^X_l^P87PQK07qa=2T`WT)>=m%2~<^t(@lQx&1*o6Y8~?Ohj3 z-@EcexJvq}JdF8TI{}+=+j(XL-90lj?nl2EagTf1UyKUlPWCe=5A|beO>;fo+Uz_(WROl*0BDH9{Nb z2`foe;GCSu?V}KFunWqP!7q{^YKDd-2^)QYm3)!nRs@JOK&kJyfk(8nNNB8;A_E9k zLpS;UEna>65Lxz$xJT&DNCn|&p-zrhssJE%%F?~zCz1(9fPI}VI+VG(-x4> 
zQu2l>pfWReXYLTV)Y87@U-jSD%7KF`e5EXcMtRzU{$zdj}he5c9bk!umP5ke{bjc%emCq6HwN*?EKK#fM-pGzKeMsuF1}r7jCMJ7$S9U2A9ao@!$S06i{Ja)|YZh z*kL#!<Bn9-0$O~IWI!+-O(K_W_)X}=?-CX_kx513 zw@BkCRB&ZLd_~>(@I}JvtF5HAFfX0IG`plL8lVBj)L{@XfMVtnlQ68%X)UA%1+a8# zEgwRsb`L6QtVjOfACG=|^y%Rf8;4K)bn%ymC+)f{4BFEbh+GdWyM={aPBFFSUiA4&Fa0s5{`aVbHY=M z?Vu+=XP*euo2|X2(A(wKcYb!PBur9&JjHyik;wArgP}1NJ!aiw$Job0|5${59N~f9 z<~MAkyhDwWDl?4TSyGLLTY~>yILU; z-e`r?&Zv3NpchZWE*NYgmk)bQBO_;q|Cm&^d2P2x5(DC4GK@>=N(Nshyx%474qXkq z>y~J6syfRtYKm5tU z#{SWg5Zdel13UUOG_nyIDTj`gyZ&(HJqRd*ee2BMo_xuomX0-HRk(k3{o0d2^f&E2 zYwxeVzdlfIf3505zBYiyV9(m|)#LZCmfH@MghS7p+rD~v00p{9NcC;4nIOUK-mzx% zxY<2+ihbPGKQ_uf9_4`!`!GR#946IaQW2fRr${wMsw1Q#Ct^4+Q~ybwCPj5joEs*a zNaHJt*rwMTA#J1@hd=ETsP3Su;{f0KJkYbn!}G-?!}7kW$Hfml_XVmvJYiD;?H?EK z7Vpa&&HYsuX=*|q|CXCjtxmq3a>5fhJ>i$o6@ZO)4-TQ7iVLKh(FCB-NQ&j{NTyHr5 WkbrtQ!+gLn!BTMGcZd*llm8C~4~&uk literal 0 HcmV?d00001 diff --git a/distributed/kv_transfer/kv_lookup_buffer/__pycache__/simple_buffer.cpython-312.pyc b/distributed/kv_transfer/kv_lookup_buffer/__pycache__/simple_buffer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5cfccb6c280107f5e9d59bb0cb6bfbb0855d5396 GIT binary patch literal 10955 zcmb_ieM}o?nxC;f#vg+*V8F%%hCoOhf(c20d^Jf!KH7$owE5^3rE!dB025==nSm76 z^wMs1XKr%}s5(LJdP*v-RH9TWRV%gk&#Ak6?XB+q@dUDV=TeEg?P;(7oZOOC@4A27 zJ1)R=w8G-@8|P=J@4=Nc>OQM#Wn&$#Psv<;Rb^E9cHvZWfLn=NX!x}(Mzx- zYlx7&Bz%n#!?3Z}NUCL~UK5m=BGjcItwU2?axg93%3x%rF-k3`WDku-|4o#f>v3xS%-7bHaXG3v(=w%Bc(a zJ%_`B5ROHeQ6U^1V9t#9ghCN!Bs{_ie(2W|i?;MdVxcQoc{v;vI9|NGgUJ=Kd~75j za1kyfUd9gR1&--DbDj~x1JR&Lh!JBf9cbxS`_;1#@t+Zdh^YxYg&8>9b)30+j`2yLzJncr|4!8x(_T zvP``>0lYZ@aS*P@V91omPnbrGS^0N}W(hHiLIiO)+bB*1^fH!&9B{1~;zr12@rfWr zt5!awpr@8ozj@TmT1_0O0S~GnZh)Lc%dwc#DT)}n z2!&3u))p|73ics}GTD%~Yx^01Ilv~HUfFZ{+;J>zVY#s|Kn2f*#+a*!9tyxQK}g}2 z23Y}MBMUFF5FpSFW;m!|B0!E#P6t*1EOs;ma5fqZ_C)}qV@w197;z}ZM7gUl5a5g; z0yc%Q8WDUrEQ$b*j}Y@S-J<}%k#Pl9l^MpOkW*$TD02#p@;pe3>{J;c#$4q9&vO$) zKvsILHxZB++J@p63k&U5|QGiCX+R*3VkzZu9%9OG_ 
zk2&RvyiAr;V-Z%isyhs8m#tlzWRR`d#-TADxr9fSldH1(763WNMxcj6CdkT^7~?~O zvKh8mi1Bi9G{z4HBXB2B2+1}s%4X9(lWYQM)t@8E+kvuxw*+_KhrU*Ec%&6BS&JCt z2RO0iwb9Ofd#`kIt&wnF>&Un`7>l-!MIytkd4jq1%2+@X!aW>~4 z8J8`AfFe5r6BYSa+^-dTaYmORx<)*#A$E0sRFyDYw=a8Z5~d}KJH4e7zndR;Hm5v0 zB+rgTk00tCX>XHS@7b(mj~v5nkodroSp#fiVcV zRE%ALeGBlhFi7$!%kisf^+XceA*ZS7y@6G%T1b5;cvD7}AjkCX)?uBTK}K zs)@Ks{c8PqL%irLF;4nSJqjm(gJYco_Am~Iqp~r=MR`Ob*#a04io`(vTli`y;A=6e z!>Z!!zGe4J%c}b&+w!(kNo(amKLF$h#6bbk5V;NmkEaf-F&q>@s}t6;Mz**e8VP?1 z(KRCDBs|;a>m~P&L~+{f(^3^pw_lYidPsc z{zAHZQLy@LAwxi~gdPPLMASu6A+dI48?4Orr#PtjIE{`jmsDv2@; znl@Sniny1l=;$yP0ku^eAK_GfHTA_}P|_cX1w|D!?MN-4BBRfsGBKW(fQO{t1&8%+NySEkz_wa*NAjUMY5zmX?ZT$@Z2KACG}|wowB&^S=@6c zQZ=noP3vM!+ro)Yxp@XlVGx%l>_ z#0lts!!hMZSvE_S&9^)5Ted4)C!283R&kpSP4B~0^3|O!e3 zxkh<>r>iZG)*|1X1s)g3aZ*Idq6F3;>im=AfWIaF&Y*F_o21XE0?|a-`RJ8s>}r(3 zwJ~_IjIT(xg~2{kEgJQp&F2(Z$G5-;ydNUj@w9;E8Q4|Fc^;&o3Hxd%z6V;YI`Pr~ zNVh;fJ5AKCKIygfy-4C&h^`S2>=o1OJI8Nz-|Swp`_koAsq#jtym7I-Y2Nig^_}Xw zHBxzdl5YR9q#O{(Rq?f>CRx|^apQvc(e{O!AdIVf7H{aCBr;45(--8v!?E zBUrjJrI@<1c`(efTr`w_ck#|DnDSo`KhY`Uo!;P-C&`>hiW?r2bDVL*hBOU%G>wl< z)3_l`V;)V@W79NkNYj)@lX`5L)P^)EFa``D`b}|afSNStjET7U7;&R0WEcRfex)3g zh$28!{pP?qXG6S5yDKw>>lG$wJ*k&WTA+_^5@q9*#fZ2%UxJJcQW1CTQ66#H%5%!)PF73rH#03BV?hllpCddAZk2 z0J1}q5Gn_2LT)uIxYt-hLtd1KAp@zL>{X(HXd?vjD&${=Q^dO0 zj4u;cNtBse!P3(7ZOwB8-kko?NW`ytl)yId`~AKWz5@nV0F&>8luUsx7v*H5Fgncd z$0`>_$OJq}YuT*&bI=piH##7jt_H&*U!v3mM@B%9%I0WH4EK+NKvvYj1K9N-Mo(dM z2&2Og>84LulJ&cDjHP1eHD|sG6T65M)72$Fg?9gm*Os}zx?sBRY5%NQa`!B{PA9C(#ichc{rpn8q&!tpCzaIQ_DLnp z^K8ohjO2gjUdc0{kCs-=(lc~&(}^YL$#i9Psswbo~?C3o)B_*ca|d)kg@I(VYNW1`&5l4=2%y%c z)~)4*Rh54Zqy@ZrirzKnO1z%5Y*GYkC~H1JU7pFd2PX6vFc~ZSO%a9k?v1qWNfP=L z(7SV54PPbdl6t6srj(DI(`1wedbm4l!iIA&JV75YI~BXbprH@g9pH*ZtVgpoZX7U7 znpl&lW0n5zQH*cp3pgyb1 zHxf^(0L>ws#+sn5?~&2S>wAF;a_^e35k^m%Q9i438G57LyC&r1GD)CZUT3EDYq~); zH`BFpV4A$%V0n3}@ z%OH;?Ew5XyA6CCt$*Y>&vGq*2UgSbtcnn@*;Bsc4IO-=l&qO$QAkd#F@D&1-SoPrn z%_R^<48D`JsP812_wV=WG?ou5?gYLWva7BvUNcaLh?Rea= qH8}jmxwp^n 
zy6^U9tVC^l#zxfCE3iD#T3CAly#k)evWxIDFM;NvYB3WYaEnIafeoMO6kim}^o=tk z9LG<1yTCW4@mKa8PFtq0T=8pThB%8v~{+Q|_FrjxkrzP<_v5}$~ z0X!1?af}otsAwbxFB1JCzZ;8ALL}2Kjz-1sFsIs_MxdjqM$coLoPLDZ!Cz3isLm`l zpy)=3d@`x(M3+^KNb^dR7dX5L=EshA3J40ra0B23GlpiXJ6%?p_H0Ude3HkP^0Z5y z_C-(Uv{^A*o22rl+ha@R{s*pVRCSU}$C9h_OP6Q4uHn|}H(yWH?UU;EE!G{Frk5+L zXQMOGROL>oa_3^@u4!Yss(Sj*GK8VDEbV%J#Ry->nu)6FxvrToRBvfp*qv(FJpv)uPZ!%px~RSA%OO!w=aEJ+m_tbwN!iTfxGq=eUnaZ zxv=DZDGe;;xaml_L4j&sa<_a-5p}zu-Tm5Qz*ddh-aGK_fmB0>)X=fmuy?xqUpM>b z%+M$0?vUIa3uE`)hd?oEXkFH{C7L~@xX<|OgHO}&`P^cIvd?|g@m)oq`-?E$Zaq(`5<`5AaJtpX_ghCEK&D>DjdIb~|O)P;60B4~DZ~;6X z&025)Y(TS+VXKb^FJgs^*de@mYXY<^V7O{%uNY&x`I_rmkkV>H_HldZj;nRYDquA0 zD@BKCp^-M~V~88xP~H+0l@0gmPo)5_nSXL0(-ZVr<>~-!-`F#8J`czkVK6ot0SJd5 zI>6%sC=&qYs59u#hxq#kaP0Do`s{{v+21aVb$36n|A4{ApMkUW*?HWyHHIz^;-28! zFhX|ZcVUE9y#o0BOOU!_-q?Z_P>omdSQWPRyhTj7*F)jIhhBmg&={ly2DEIU`1>|U~LU9R%ZzCQDMs;W(@YFn(@ouJc>vK!H;exg$ZPODkq;XKGWW%~EOeVrfglyzH!* z3*Fv5PtA`dOFEL44%Ix<%_P_{)XEgh`vMXQ=D`lG4LKLuUBpNT%I!ldb4|E*t{dDD!ou-SJC-a*LAKG( zlzppY-YJF%l>~JKGEz#SuE`n_my)8xiCE9H|(5>R8j zMtn_|PajKG?_Q$!qyY$=J0$0hq;u~woCmG$a*IT_+^1VXi&3025?!O#fO1`Tr|v$z zPuY5389tdugTq`PAlm|gVfb}L1k-dN@akwVqSo+uCsfF*!PLtb;Tx{v`BQ8PRc+mm zWePVc$}ln@--povMprQE#z=YBM6V~0cSVWNdKQLYAKn7Z(oNN+y<0Ly{O&raeqVfP z1@oCbHB{-cx9&Fe9{ny2-na|}1;Ai6j9#=-ghdA_s(FF>g#M8JRFrmhWeCXs#7oiE>CIOw&86!!O-cG-!@pTw(?40X)?PC| TH2=g%?oXC)`2&FoWn=yiw5b5b literal 0 HcmV?d00001 diff --git a/distributed/kv_transfer/kv_lookup_buffer/base.py b/distributed/kv_transfer/kv_lookup_buffer/base.py new file mode 100644 index 0000000..f48d03d --- /dev/null +++ b/distributed/kv_transfer/kv_lookup_buffer/base.py @@ -0,0 +1,179 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +This file contains a new class `KVLookupBufferBase` that allows developers to +think of KV cache operations as inserting new KV cache entries (`insert`) +into the lookup buffer and querying existing KV caches (`drop_select`) +from the 
lookup buffer. + +This file also contains a new class `KVStoreBufferBase` that allows developers +to manage the KVCache buffer as a simple key-value storage buffer with basic +put/get operations. + +These classes above are abstracted behind class `KVCacheBufferBase`. +""" + +from abc import ABC, abstractmethod + +import torch + + +class KVCacheBufferBase(ABC): + """ + Abstract base class for a KVCache buffer. + """ + + @abstractmethod + def close(self) -> None: + """Close the buffer and release resources. + + This method is responsible for cleaning up resources related to the + KVCache buffer when it is no longer needed. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError + + +class KVLookupBufferBase(KVCacheBufferBase): + """ + Abstract base class for a KVCache lookup buffer. + + This class provides an abstraction for a key-value (KV) cache lookup buffer. + + The key of the lookup buffer: + - input_tokens: token IDs of the request + - roi: a binary mask on top of input_tokens. + - Purpose of roi: Since KV cache may only be available for a subset of + tokens in the input (for example, when vLLM is connected to an external + KV cache service), roi specifies the subset of tokens that the KV cache + is associated with. + - NOTE: roi can be further extended to describe which part of KV the + current process is holding (each process may only hold a part of KV + due to TP and PP). This is not implemented for now. + + The value of the lookup buffer: + - key: the key tensor in the KV cache + - value: the value tensor in the KV cache + - hidden: the final hidden state generated by model forwarding. This allows + vLLM to bypass further model forwarding by transmitting the hidden state. + """ + + @abstractmethod + def insert( + self, + input_tokens: torch.Tensor, + roi: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + hidden: torch.Tensor, + ) -> None: + """Insert into the lookup buffer. 
+ + The functionality is similar to the following python statement + ``` + buffer[input_tokens, roi] = [key, value, hidden] + ``` + + FIXME: in the future, we should only have two arguments, key and value, + where key is a tensor dict and value is a tensor dict. + + FIXME: we should transmit both sampler outputs and the hidden states. + + Args: + input_tokens (torch.Tensor): token IDs. + roi (torch.Tensor): A binary mask on top of the input tokens + key (torch.Tensor): The key tensor in the KV cache. + value (torch.Tensor): The value tensor in the KV cache. + hidden (torch.Tensor): The final hidden state tensor generated + during model forwarding to bypass model + forwarding. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError + + @abstractmethod + def drop_select( + self, input_tokens: torch.Tensor | None, roi: torch.Tensor | None + ) -> list[torch.Tensor | None]: + """Select and *drop* KV cache entries from the lookup buffer. + + The functionality is similar to the following python statements + ``` + ret = buffer.pop(input_tokens, roi) + return ret + ``` + + If `input_tokens` and `roi` is `None`, it means selecting any of the + KV caches in the buffer, return, and remove it from the buffer, useful + when offloading KV cache to KV cache storage service. + + Args: + input_tokens (torch.Tensor): token IDs. + roi (torch.Tensor): A binary mask on top of the input tokens + + Returns: + list[Optional[torch.Tensor]]: A list of tensors. Can be None. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError + + +class KVStoreBufferBase(KVCacheBufferBase): + """ + Abstract base class for a KVCache storage buffer with key-value semantics. + This class provides a simple key-value storage buffer abstract with basic + put/get operations, which enables flexible KVCache transfer granular + control. 
+ + The functionality is similar to a distributed key-value store, where: + - Key: A unique string identifier for the cached entry + - Value: + - Tensor to be stored and retrieved + - None (indicating deletion or empty value) + """ + + @abstractmethod + def put( + self, + key: str, + value: torch.Tensor | None, + ) -> None: + """Store a key-value pair in the buffer. + + Args: + key (str): Unique identifier for a tensor, this tensor could be the + key cache tensor, value cache tensor, or hidden state tensor + generated during model forwarding. + + value (Optional[torch.Tensor]): Tensor to be stored. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError + + @abstractmethod + def get( + self, + key: str, + ) -> torch.Tensor | None: + """Retrieve a value from the buffer by key. + + Args: + key (str): Unique identifier for a tensor, this tensor could be the + key cache tensor, value cache tensor, or hidden state tensor + generated during model forwarding. + + Returns: + Optional[torch.Tensor]: Stored tensor if exists, None otherwise. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError diff --git a/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py b/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py new file mode 100644 index 0000000..7861bea --- /dev/null +++ b/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py @@ -0,0 +1,164 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +This file contains a new class `MooncakeStore` that allows developers to +think of KV cache transfer operations as putting new KV cache entries +into a remote KVStore-based lookup buffer and getting existing KV caches +from this remote lookup buffer. 
+""" + +import json +import os +from dataclasses import dataclass + +import torch +from safetensors.torch import load as safetensors_load +from safetensors.torch import save as safetensors_save + +from vllm.config import VllmConfig +from vllm.distributed.kv_transfer.kv_lookup_buffer.base import KVStoreBufferBase +from vllm.logger import init_logger + +DEFAULT_GLOBAL_SEGMENT_SIZE = 3355443200 # 3.125 GiB +DEFAULT_LOCAL_BUFFER_SIZE = 1073741824 # 1.0 GiB + +logger = init_logger(__name__) + + +@dataclass +class MooncakeStoreConfig: + local_hostname: str + metadata_server: str + global_segment_size: int + local_buffer_size: int + protocol: str + device_name: str + master_server_address: str + + @staticmethod + def from_file(file_path: str) -> "MooncakeStoreConfig": + """Load the config from a JSON file.""" + with open(file_path) as fin: + config = json.load(fin) + return MooncakeStoreConfig( + local_hostname=config.get("local_hostname"), + metadata_server=config.get("metadata_server"), + global_segment_size=config.get( + "global_segment_size", DEFAULT_GLOBAL_SEGMENT_SIZE + ), + local_buffer_size=config.get( + "local_buffer_size", DEFAULT_LOCAL_BUFFER_SIZE + ), + protocol=config.get("protocol", "tcp"), + device_name=config.get("device_name", ""), + master_server_address=config.get("master_server_address"), + ) + + @staticmethod + def load_from_env() -> "MooncakeStoreConfig": + """Load config from a file specified in the environment variable.""" + config_file_path = os.getenv("MOONCAKE_CONFIG_PATH") + if config_file_path is None: + raise ValueError( + "The environment variable 'MOONCAKE_CONFIG_PATH' is not set." 
+ ) + return MooncakeStoreConfig.from_file(config_file_path) + + +class MooncakeStore(KVStoreBufferBase): + def __init__( + self, + config: VllmConfig, + ): + try: + from mooncake.store import MooncakeDistributedStore + except ImportError as e: + raise ImportError( + "Please install mooncake by following the instructions at " + "https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md " # noqa: E501 + "to run vLLM with MooncakeConnector." + ) from e + + try: + self.store = MooncakeDistributedStore() + self.config = MooncakeStoreConfig.load_from_env() + logger.info("Mooncake Configuration loaded successfully.") + + self.store.setup( + self.config.local_hostname, + self.config.metadata_server, + self.config.global_segment_size, + self.config.local_buffer_size, + self.config.protocol, + self.config.device_name, + self.config.master_server_address, + ) + + except ValueError as e: + logger.error("Configuration loading failed: %s", e) + raise + except Exception as exc: + logger.error("An error occurred while loading the configuration: %s", exc) + raise + + def close(self): + # MooncakeDistributedStore will automatically call the destructor, so + # it is unnecessary to close it manually. + pass + + def put( + self, + key: str, + value: torch.Tensor | None, + ) -> None: + # A message queue needs to be introduced before making it asynchronous. + if value is not None: + self._put_impl(key, value) + + def get( + self, + key: str, + ) -> torch.Tensor | None: + # A message queue needs to be introduced before making it asynchronous. 
+ value = self._get_impl(key) + return value + + def _put_impl( + self, + key: str, + value: torch.Tensor, + ) -> None: + """Put KVCache to Mooncake Store""" + device_id = value.device.index if value.device.type == "cuda" else -1 + device_tensor = torch.tensor(device_id, dtype=torch.int32) + value_bytes = safetensors_save({"tensor": value, "device_id": device_tensor}) + try: + self.store.put(key, value_bytes) + except TypeError as err: + logger.error("Failed to put value into Mooncake Store: %s", err) + raise TypeError("Mooncake Store Put Type Error.") from err + + def _get_impl( + self, + key: str, + ) -> torch.Tensor | None: + """Get KVCache from Mooncake Store""" + try: + data = self.store.get(key) + except TypeError as err: + logger.error("Failed to get value from Mooncake Store: %s", err) + raise TypeError("Mooncake Store Get Type Error.") from err + + if data: + loaded_tensors = safetensors_load(data) + tensor = loaded_tensors["tensor"] + device_id_tensor = loaded_tensors["device_id"] + device_id = int(device_id_tensor.item()) + device = ( + torch.device("cuda", device_id) + if device_id >= 0 + else torch.device("cpu") + ) + return tensor.to(device) + + return None diff --git a/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py new file mode 100644 index 0000000..f046a34 --- /dev/null +++ b/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py @@ -0,0 +1,242 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Implements a distributed key-value (KV) cache transfer mechanism. + +Key Features: +- Distributed KV cache transmission using PyNccl pipes. +- Non-blocking `insert`, blocking `drop_select`. +- Use CPU signal pipe to avoid racing condition +- Handles buffer size constraints and provide backpressure mechanism to + stop the prefill instance when the decode instance is slow. 
+""" + +import threading +from collections import deque + +import torch + +from vllm.distributed.kv_transfer.kv_lookup_buffer.base import KVLookupBufferBase +from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class SimpleBuffer(KVLookupBufferBase): + def __init__( + self, signal_pipe: KVPipeBase, data_pipe: KVPipeBase, buffer_size_thresh: float + ): + """ + signal_pipe: on CPU + + NOTE: on-device recv will block all threads in the process, making the + KV cache producer unable to listen to new request while transmitting + KV cache. Luckily CPU recv only blocks the current thread so we use + CPU recv to listen to new request. + + data_pipe: on device (e.g. GPU) + """ + + self.buffer: deque[list[torch.Tensor]] = deque() + + self.buffer_size = 0 + self.buffer_size_threshold = buffer_size_thresh + self.buffer_cv = threading.Condition() + self.signal_pipe = signal_pipe + self.data_pipe = data_pipe + self.request_handling_thread: threading.Thread | None = None + + self.normal_signal = torch.tensor([0], device="cpu") + self.end_signal = None + + def _matches( + self, + tokens_roi_sender: list[torch.Tensor], + tokens_roi_recver: list[torch.Tensor], + ): + # tokens_roi_sender: tokens and roi of the producer (in the buffer) + # tokens_roi_recver: tokens and roi of the consumer (query) + + tokens_sender = tokens_roi_sender[0] + tokens_recver = tokens_roi_recver[0] + roi_sender = tokens_roi_sender[1] + roi_recver = tokens_roi_recver[1] + + if tokens_recver is None: + # consumer sends an empty request + # semantics: DROP SELECT * LIMIT 1 + # so any of the data in the buffer can be drop-selected + return True + + # Assuming that roi is a binary mask on tokens + tokens_sender = tokens_sender[roi_sender] + tokens_recver = tokens_recver[roi_recver] + + # simple common prefix matching + min_length = min(len(tokens_sender), len(tokens_recver)) + if torch.allclose(tokens_sender[:min_length], 
tokens_recver[:min_length]): + return min_length + + return 0 + + def _send_tensor_and_dec_size(self, tensor: torch.Tensor | None) -> None: + assert tensor is not None, "Use self.data_pipe.send(None) instead" + self.buffer_size -= tensor.element_size() * tensor.numel() + if tensor.dtype == torch.bool: + tensor = tensor.float() + self.data_pipe.send_tensor(tensor) + + def _get_element_size(self, data: list | torch.Tensor | None): + if isinstance(data, torch.Tensor): + return data.element_size() * data.numel() + if not data: + # cannot perform `not data` on a tensor + # so this check needs to go after the check above + return 0 + + raise AssertionError(f"Unknown data type {type(data)}") + + def _add_to_buffer( + self, + input_tokens: torch.Tensor, + roi: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + hidden: torch.Tensor, + ): + if isinstance(input_tokens, torch.Tensor): + input_tokens = input_tokens.clone() + if isinstance(roi, torch.Tensor): + roi = roi.clone() + if isinstance(key, torch.Tensor): + key = key.clone() + if isinstance(value, torch.Tensor): + value = value.clone() + if isinstance(hidden, torch.Tensor): + hidden = hidden.clone() + + buffer_item = [input_tokens, roi, key, value, hidden] + data_size = sum([self._get_element_size(data) for data in buffer_item]) + + with self.buffer_cv: + if self.buffer_size + data_size > self.buffer_size_threshold: + # log outside the while loop to avoid this message being logged + # repeatedly. + logger.debug("KV transfer buffer is full. 
Handling...") + while self.buffer_size + data_size > self.buffer_size_threshold: + self.buffer_cv.wait() + + self.buffer_size += data_size + self.buffer.append(buffer_item) + self.buffer_cv.notify() + + def _is_end_signal(self, signal): + return signal is None + + def drop_select_handler(self): + try: + while True: + signal = self.signal_pipe.recv_tensor() + if self._is_end_signal(signal): + logger.info("Received end signal!") + break + + input_tokens = self.data_pipe.recv_tensor() + + roi = self.data_pipe.recv_tensor() + assert roi is not None, ( + "Please provide the roi when sending drop-select request" + ) + roi = roi > 0.5 + tokens_roi_recver = [input_tokens, roi] + + def is_buffer_available( + tokens_roi_recver: list[torch.Tensor], + ) -> bool: + # perform input tokens and roi matching + # FIXME: this matching is O(n), ideally it should be O(1) + # but this buffer size won't (and shouldn't) be too large so + # the fix is not urgent. + for _ in range(len(self.buffer)): + if self._matches(self.buffer[0], tokens_roi_recver) > 0: + return True + # rotate the element we just accessed to the end + self.buffer.rotate(-1) + return False + + with self.buffer_cv: + while not is_buffer_available(tokens_roi_recver): + logger.debug("KV transfer buffer is not available. Waiting...") + self.buffer_cv.wait() + # need to clone the tensor + # in case the tensor is freed before sending finishes + matched_item = self.buffer.popleft() + for tensor in matched_item: + self._send_tensor_and_dec_size(tensor) + self.buffer_cv.notify() + + except RuntimeError as e: + if "Connection closed by peer" not in str(e): + raise e + + logger.debug("Closing drop_select_handler") + + def drop_select( + self, input_tokens: torch.Tensor | None, roi: torch.Tensor | None + ) -> list[torch.Tensor | None]: + assert self.request_handling_thread is None, ( + "drop_select should be called by the KV cache consumer " + "(e.g. 
the decode vLLM instance)" + ) + + if isinstance(input_tokens, torch.Tensor): + input_tokens = input_tokens.clone() + if isinstance(roi, torch.Tensor): + roi = roi.clone().float() + + self.signal_pipe.send_tensor(self.normal_signal) + self.data_pipe.send_tensor(input_tokens) + self.data_pipe.send_tensor(roi) + + input_tokens = self.data_pipe.recv_tensor() + roi = self.data_pipe.recv_tensor() + if roi is not None: + # convert from float tensor to bool tensor + # as PyNccl does not support sending bool tensor + roi = roi > 0.5 + key = self.data_pipe.recv_tensor() + value = self.data_pipe.recv_tensor() + hidden = self.data_pipe.recv_tensor() + + return [input_tokens, roi, key, value, hidden] + + def insert( + self, + input_tokens: torch.Tensor, + roi: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + hidden: torch.Tensor, + ) -> None: + self._add_to_buffer(input_tokens, roi, key, value, hidden) + + # when calling the insert, the current process is a sender + # need to launch the request handler and start listening to request. 
+ if self.request_handling_thread is None: + self.request_handling_thread = threading.Thread( + target=self.drop_select_handler + ) + self.request_handling_thread.start() + + def close(self): + if ( + hasattr(self, "request_handling_thread") + and self.request_handling_thread is not None + ): + self.request_handling_thread.join() + + else: + # TODO: have a explicit close signal and have a explicit way to + # check if it's requester + self.signal_pipe.send_tensor(self.end_signal) diff --git a/distributed/kv_transfer/kv_pipe/__init__.py b/distributed/kv_transfer/kv_pipe/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/distributed/kv_transfer/kv_pipe/__pycache__/__init__.cpython-312.pyc b/distributed/kv_transfer/kv_pipe/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6da9bc52f20eccae45ac298f198a33c7d23fbfb4 GIT binary patch literal 181 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJVh3J>$7U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?%`knAV~iX&(}w=?IPcz-c7 z>u?qYLKY1TU7!Lgh|*B83^``97GztNYP_GBa|7d@E>O<6)^Vub3kP|(e{-@7I=agk}U>6<Qo3yFI4irk5uAo; z9OsD)8G6zyzP}blG>n+m5Jx83u*67)mOYim)L_n&9GManO!FAto0N88MXy}FvOzP* z9GqS4(LtIkdTd0RMa)3{F=dewj1Q>Jvn*9c(`_+W-eFNLXq2YgdA8((D;h$Lx`PaY z{w){yaxUT9O$1&HQ^uyexxEuu4?#EY^@Q47@qa}$T5jR3UA$DB8r3ourl0cl&_ZR$ zG2WlS`VJAK;{fI^cRHS^aaYuD%~Z|poO`=d1hD5`ug2?pwR=O8&J>)fJ?Tu(<&BDc z6YQIOsK6P3S`vqRV1DplISSN=$3gC(ni7*sC>%y{Z$dEDRF<-j8 zOl6G30lt=k5J~~*`=EyilmJ|`PM}g7mp4`^;~07qNVZ9t+5!f{Ih{gW5ZY}~@+{Ya zg2=c;ZLnzUVZuz#4C{hGK?G&S$MZZEA><-;3?k{vgpZva@u6#KORtU%FmGET`qD;b zQ+eN{D(tV^DCPY@YZc&!RLj*+htLnq3cZAC5vBAi6}ZkMsQYk)_0(LAvq)ea1b-8Y zhh9PLRuwePwLxD{<&hOM0+4HW*ext4Fx$IMF?WPN&|6K5sHZ?1L?cxw{o`8O#915K z#WE;%OPJ-4^7F4<*gh}Xk?gj!0VpzQ??h4DuGF>-k_KQB4Qj?YcAGZhu#ydm6W9s^ zTa4X>iBMK5`_Ezq*yKMUk13g5{JHtU0jx-fq#Zp^CU6U0UoRUiHoXx*WqYy(Z;d*p zB{HxOJ0jKxpX;Sy{sO|$@E$_f_V!6Tt~f`u$Np`UqsRZB5ejR4*HPFjHP)4?lQ0%1 z;Bx@VlDLPks_egvXNFiUSgh16?EMR@79ixMC`CCB+X|vFyelJt0#-u9MHLFX0!LM= 
z(jSBXCK=q3vI{phcGD0%mX|E}JamngjSC};J=t3sBiTE8Fo}ovXByfo_O>*ADorJY z|5~agFzRV6aRrt-ewx^Qsv@3p=RyUF`3e**SHb^DWWD7V%^*lvEP|l$gCI_M9^rf@ z2tLeNG<-7^1UwCcK-J(-Q7?O0(YR5vK;dnnp21-rhP$N1x@3!L;4Thx@TYIVuty%c zUcI^Rlg9kMJ6m7;=Ed)pzFqoZ{{G^t`vlfMo_&Nr2j0kFf3ckT^~EpW{rcutH^0Ap z|I90o2&{)TFxx*3Hjmut;od`>*)%YCtD#QAPvNm{sPKX-KSNuPkH`NW=it3 literal 0 HcmV?d00001 diff --git a/distributed/kv_transfer/kv_pipe/__pycache__/mooncake_pipe.cpython-312.pyc b/distributed/kv_transfer/kv_pipe/__pycache__/mooncake_pipe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..985e5456441b41582c07b6263d1f7f7e2b9c13d9 GIT binary patch literal 17391 zcmch9X>c1?dSExs#zBDKEuJm$gbq-YsKb&hS)wS}I!H;Bl$hfLfzVA6Bo4Z}A(=2Z zQ9MpUd7QG8Nv)_Pv!ZvCHMRDx>Ds9>XY*s6qbghDKWKw0U_11#QgN!{f0&_C-ksUn z+V6XP0N{XtQR10kK!nf zj#A_5lOCr@-ZXB4rzvWVS;j3i)|sQ$m~Grf(sa~5ZYS-SaR#22s3Ybacab_Pq}}6H zByEd&V%~8tN!z25#sPEhHmiitUi17vJi@k6TC2# zh)+bWLz(mJ)eDgs{zOQ`7OqG(_a~Z&o~0iAX#+nGmJmOhS;9s#s{652j+T z24_Ojq~aB4qR`x^kG#A@e-+kVj696RN<;P@#Z$OI$}XF%W9wt(+T>T`aNBn*CKu>-j_?9)=gtaWEXe z9*OfRCKT684>$T*8=MGLE z;QOPIYyC5`(qtmueYzX5+c`<63_Kd-v~;d`OT@L zgs{qq3Ng)!GkiEQ5#c#D5+{Vm$8SW0L_EgFCH6*0h=i^I)hTu7M@L77j-MS44vmhS zdg*lV!tu*5rVd{QO0Cp^?XA?lmyL*QJRz|nF9m=EO$kx40Sy5J#d$RpP4dHnkPrwZ z3M?Q38NWJbLKcJ{iHI6j=zv(MLMag>I+n-7`WoGkZ@97M4dL1B8ev{Au z4>D~(qcB0jLW4oY84Sh}ToM>*P;m!?uO>rLWUi1i!`4u|!C)v3=m<%XL|nu+BBsN@ zq9+p^L4eQ;fFDvqAHv61R&j${wF!P=5Tf6u9{cLDmb!d(?e|9W4ORcErQ1Hlf5B(gU>t@PULn;c*kovjFZhes^g+C)nb!;55 zjG*06fMNZALsS%x96fI4Odtk9NP=*)ylEM?lDw6-aW*Ko!;^uhgL6(0Q3v{r8_KHS z=YgLWe%0`EqBzxB`2Y&T)nG3IwHERYt`72gnT@N5GAAi(C@E*SMksfY@+RJGv}d?2 zP+nC6!#!c+nxV|Ywdnm}hBmGh%Df!*#Ti?;Hpo}QJlqo|t{u|8S*O24X{bzjsgo~; z;#`y$$B%-(?$f+tF zaPe6J!4S<7N+A-aPylTRoJHqVAkJI#$}}~Fw6q~iwQeYYtls+-;E;2+Ir}qVv_As| zGv`R#(#$k$Cc&AuryV3^O*=SS3c5nA9cIV;Dil)dOgo-Z>yl6wkpN>jb+Hc^Hr}?R z1(SA_v_OHb_N1md^>p!>2E~ZX)D7cTz*F1>nslS7vNx^pM&wN^U;wa;444E7IXekcyY*D@Swh8I^ir7V9z9)r 
zPVFnE^^_FQQ&ao(RPi}=u$b0UIt{}N+w`*kXXBwMu_2U^EsxADOd%z?5LX*h?^@m-IHpXPSncYELt1N09<&K{=NUp4bH{_MD|e`zjh0mN&pm zpR;_|a+AJEsp>oQMCF?V=-C22i~O-suPHv$CH(OXF!WVu63TTF07mPa_31D;Yuc)F z5}G1Vtlza3=j3;c2&iC!BIPN!9aBHN5aq#g2DJ~gj%d^nL+rI#b|MjtCT^l|LQ@1= z2+1(fQrM8hPD;{@c&NYsdPJH`UJHa1G0^G5q3|TXD-`M1C1L-yXyRIbEEI|NbBS<2 zAMd}Gj6}IW4D6c(DlxeMazzuY{c4AoB|(9NW{EX z%;3#X?0_DLe@#;C@O~s=6BA(TONs*uU}@u$s8}vs8GBKw62Sw&3qh?O3eTNWyaFHQ zBj5?pi=A34a2P0VqY`ogY7UC$T5YT1h@g<>6*K=@SaIu43v|IKzH2~=LA{7LY``5s92um(n$&A0q}u3T))+G);uTC) zBdE3lsO7{%6ijvKq*cBn0(yhzp}+VY*fzJQg30cv|J=R*XVp|=cebHt-tx99-`oWc zZ^6PiYSxA%V&$}}9-31%vZdy3}nX4zW<4ER( z3!m+{kniYU?vy+BW!$Zg8~c_9KRovEnA~`1-kopgS*rW6<6(!~aB$wW*5Y51^`}obF@B{f_sxH_!Gi4g4GT{K=fV^)q+ta@WWGANA+m)i~>% zyH|Gi8W~o0vqq*pS(ErE%D^l**sC;Y`#{+Ar zspa15cVGX(9Aw@3wyh6dx&KP8ZMWRE`$x4Ow|&%>8yJ!YhO%uZakkC*j?M?^`{`WA z9=T)BkM@52{728{_72N?hqE20ut96Sx&6V;`#W>ZyX59wAA~=QJ&ff7FUWxxvdzbF zzKUwf{%vkE=ea%I!WO?>>=j9YO$GzG&aQa&fi&7-m}Xj6274$xK(? zU3=%1Oy}V8bY^JmpUozR^M5=6S66k*&nc^;eyyghU^4mkubVAk1u|4!eZgU=s{dud z43)op(o9u1|C}-b9l*MuyM1oK_x5Ps?fd1Lx2d!^v*kkO(v?;BRjeymq5Ut!LqKG2 z*v@oNe;PP*g!*ZRb~RGerEXm_hp5A_B#(*s}P*7*`~Msz6wH8kM> zL?eDim29Y8Dwb=JI42-M3TH8o0vWX#;T#s7hX|Tqz%&tw8xSc~GsLzJ zCMG68sw!1nX;#E(=t50(28}{VDzKS2t1Ot4&_}!iS`SD!C$*(x(Fb19%kKr>4KBYb zcOR6S4$fDBtFvOUtL9GVts@`@a_)B7-JWx!SopK^%iSvtpR{EzUXqVr%65&dx-XL^ zC_r*<6c%gMZ43P3)urE9xhU^Cnyo(e#0(BKkaShPcSdiIJ_Amh>~71sx5@5pfA;)x z{fh0As?3Fp@(UNUotGZD#~$NszSdA*Ytl~{Y}Gx6B5?BwKXP9t-ZI6b`e{&gL5E)@ zO|FbL8~&TmU~zyqTW@v14uJH=Fohg^LjXI*ls1Dkq1i1oH4EM!&V1c82fVuy&(*Eq zIj|nmwi~qIE#sNCKcP754-H%A4SJKgRPs7&a{;SF<9aw7zb&=(B}1CzBBoC0RJ}p}Ct7rX{VhY^)K0d3ixnpm$tcHOD3TC?g=(^e5SW}?Pz8cu) zu+fNy*CV@oi~%w){yG^$C0bh)3>s-NTq+!p+=$ZfdyoLa{02m+i|#p78cz47n*p6{ z(X^g2oQMEe3&q0?a2OiIRV)QUG?BxMJ;%X34TmJ24Y9FMJairGJ=HA1Wk-{>c&eek zRDf{qRHN1rtS|NwIn`u|EvJ!*(g=ny0UXr8234>Engqfzj9!2U4#)@}k3m{7!||wo zI;PS=0*W`~%A1Vv=g>zy4ACv>v8U#p>D$ve&w%V1SnkPso|`x4U7kBz-#VzNnVBtn zR^5BoYFaX_`?59rGw%I)Z_T_@TSj%tW&UC03}hJ(LE@V&W7?$q7XbBiYR4&oN3?^f 
zb|aEViqTnCe-oC4+sH0 zdVoG}?K`jEem&<6$lk!xjpZ9z@6il%R7Fnh3=g)eX2OOzP5>W57_p-V)R+mT79sJC zFty1MH}YnkGtXJlaO^w{8w|FBp_^Aay(Ty1Rrdl}NYpLJY6>%u z5>W&Y4yjt;Rb>!UU7K*qvQ{4<98{-D_Qg)wvr`kq3s*DU2UmLFkk0FzH>2t7z2kf9 zfU!?@uet}wKG~bC8O*o`OZJH)2{#~uh^I0Fz&R5FK+^q21VBWeZn%)m()DXX1enyd zSqV>o)Ws<77L^K(T~dD&8wW|_pp?*v)Nd(4?<`D7-7HA5h*_24ybih4&P~u+5$s<; z53vQJlKrw<_U>M8SlRLkoNb=XFlWh*`TxX5x-S7aX^0_L9FR^oMNI<|pN1K7*sF16 z3Eu)2%fQ4OWKz2}!9-;!KS3ZHSjT|u1P$A`|UcS{iQ zx|c5i$G#K=RjJ-nA#k6no?6vE$X4tz0+JKW8^r=JZj`VFWZ*jOeizb&Y!x%O5ydjv zRopnGAXs2Qs7rNjvUH`5{u;oEc-Eq_gv9Mc&a+MSY|DDOR6*JE*7F7fII-#;0@qiL zVP%G0Y+UMj#0;o}U8?37`u8EQi%M^2lwMH+q$7GZ1VB0T!7yir7RWv}1klZKVYI%) zlVR}wV$(_sTIBqdtYMA@$)df*Gg}&jaow0NnMozP9j_H_2=Hd86>kXeW+m@6Yl3?w zTgSkTF6Vfv#~L(-1bXZPzc6jRIO2CIX3)$8RFwo|U&2ceDIT=KgRx{(ip+%I(w@{UG?(p|s`B^59JDg%A1qm_;%RpUS}whp0KL%A+Ym}2!!EIc zFt?_9E@_NV73YT3+CbjI?*cH@PlPTa;TA-Gr|J>%Ad@VTw=fTNiY*x@)J?X+YuE*a z81Wb32@1HB2rsn}JwUOb!a~Sg<*0TJFStsAs&L_|f-HofYFuzy1Ot)@Z%X^cCXb-E z_#Q+=Dad&{Wp8KByGQo!$$AIpEo(JR_jcXgwP;!7mImIvo`wDJs$4TLZ(s8@-fO$t zwjeEDUfTNZuVsC^mO`>`_a++$x5>hR#nVeySDC@QrzYp|%O3wy@ACev=Md51aARzd znJqb{U1r)BV1xCL4W_g3Rc7F^Vej~B1V7?yJ4QPoQtZJX95~`7t4LfDUdHmB7@@O_ zI096zgIE<1--?wWNM%C=_sCm>?_>0b81-Y+hY_Bs3j=zD#*{b-yYWrxzgk8Lmi<=e zqPal9W4U%6AB7pZ&U%H;H#Qf{B+=EYB?bny#L?qg;_Qf;c zJ_^+)>#?OyxX@-jwk(pzinxxs0>fBq7l#TIJXW9&JpRjG^2m%`UB_&p&SX7DFF_N0 zt_-VBY=n8z3KIp!WIaJIjbZT#dWDR2g8rl%M>zqtHtQZ71|G|`B@V|tO&98_thEcnPbhe(i-7nK!_Nrl7saq%LpY;;QyZ_M!A+=Yyo$zI;AS;m zK{MgCGPu@dd(#RR&#=VKGnHf;T=in$dY2QfdNEVR?Gkk$jG-T1aD&_1<8U=>vo@Rw z+Q5BoxN3&uRDc85$lOrx*%StuF$bPJMJ;upmYOF%vQSLrR#7o zrlG;b8#aMz$gk4Yk|Cig&lO6Iw<%6am$U%Mm_Nv2DyY*mB^mN*%D7!p>@OJfmNRS-hT2E+E1Q0vHET@U(F(C}iNumf(6s0KVZ67GMB> z6(I?gIQBq+;(^-$Q8@Dmszjnx!_7s_+oG0`euRZ8Uc=B=FH$JgqyW9FL5X}IqFCW7 zNO)4Q!3|+C0pBW&jE)QkUmCfrxZ>dB4XVYu7HE(t8KYbaBw`>{uM`R0$p$+UVZy~KSu z{ct)LI3ownWSh^l1vnqM8eM47H z9%tb^Zs@`le;`$-wj)Pz7OR16>QUo|5 ziLVyvRm2^rJ5Bx=(y5kmiYO!A|A{?@AOcf>@x0@@?aI_`UplqQ?1fdobM-S<%UW&I 
zy(4#zEOswhKJ-5Hf-gS9K9{XMlyM*WLaVZ7YkM;89(C~^0I(YvGBy6&vEC&Q>OT=uT0n-Up;%kfQIB}KwQ@{JhZ zM(9`oR|i_2``~VtDzgY4)QK9$x^N#W64owUi=hep<^VW~7!dH~N5G-~r|={?pHf{? zAH>hkge9meLy!~L92lx7B~y)M_$gLc5j|U>Cuq~?fM}JO)*RCd7l2m#4zBclaw_xO z*=*mrN6dLb9A%$ZA)W}X-h{+A!Ie%S@adHy$BI4-jz%kO6G>TNgX z4{&=7_vceF3>%an81?CL1mg`o#RPnw2o#MX(VMmb;lxa-gm9z6fe6PtW@Y9Jo&sdO zyTN?%b}U-5-tNUo+1sz0HOqq^AO7fY)_W|&98(veR76_<6j_A-28qoU;g+xXjT>kB zpj0)0UTR3wzhg?%RapD|H;5=*YNzwRrl6>%hpWG zuDcD(gSov!^4_6&+nTrL&VJ;|SueXd@R_$O!*pf5UBrNFlbJU4)|YnSVd4?<{8M({ z|AN8)uNJ=YA{jdqz-s{(3HjrOpf7wSFG_Kz1JP7icD-H`@lDXm&`^uR=&dBf4@eXO zdKNe>?0QS8>yqa6&?tkrJrdwF7Id=UK_4s<23&C|(gY!aWf)C?Z%9S(>M7QEGR8;Y zTCBv!ghi;SKpbR4S0s*-ng%3}5CC`@aro+Uz8=}vlk@G8!70Ign0T1T`i|y&!?JHU z>pKk|3YYJlBe#!y|LDTS&s^<```xOH(v)}O+k-DsD}r;-YQ46S&#w5* zq^1va4dD%Cxa_40rkj&|oDD}4;-7>R#mDxjBnyOXol$o75W{=G5L7!T=wvl6Y$;_@*roW^d zozyv|UuOEjmsV1a7h+Wx_A0YuO;>u)JkkyW{q%_Tt>ehAxZQ*wz+{C#!U)Bg>T)B_ zx1v6dDz{?0OjIx7A+{p&0Xc;5pDrQor2<-=DVzJgw-UUZ1{E_T1S^WB~3 zu%R*c=vhfBwT$)r&jZmIy?BuRGR=_%HJ;FS**~B(L6*-4cr|hA2P(H zz(f*0zvo5u`n`)BSmS?^03{Y|PtZGroqQ#VYk*kJ>WAI`5u2fS5pXjpjrg4*e6@jJ zRRl^t%qHum_(^Tax7(#nFirLix{3tkD2j`;S3h?U(A%#xk&>hoiHdRctOUG8;Nm?P?^v`2-%fp)MvC@}>Q+;pPpb%vtde@8j~ zj;i{c8u*;r`VSOjou5cjwo$j`mx?`|EEpKtU$g uSEhFQMY=#?3`d|QSAi6Kb%LHW(X~ILFn)3soR{=D`p%($rbvP;-~Rz!y0D!9 literal 0 HcmV?d00001 diff --git a/distributed/kv_transfer/kv_pipe/__pycache__/pynccl_pipe.cpython-312.pyc b/distributed/kv_transfer/kv_pipe/__pycache__/pynccl_pipe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d484dcac4ae1f68e29a74948e96e53b81f8ad83b GIT binary patch literal 13145 zcmb_jYit`=cAgN64 zV(xh9NGVCPv9fsiNI6M6VxD-#NCiz;NTBdy)gx>P7NTGvlm{AHmppbO{lXjJ0jNj?=5xM+ML zCd7q=EO8<3?9^a59OEXU69N}Wiku`Q_-Nu17fSG)D1?RRW%%VhGa*NliBRl)IVpxm z1H(c>N{SzF7X^7z5E2|8m1Hq`F(nJoD3lnNxXGwI%7ysLp+s2Vo#A9Wo=QZ+A*8}Z zgpiyP1u5Wk4hU1+iTuwV=RR&IH8GJCVPK}i;AGNoo_bu6LwraML9;htv@t>AE+(PZ zAdyV$yBJG`$8q9XADj&HkRpP}#f9)_C=r$7 
z(CTz35xRu^q#{@(MbiT5X+lh1j`CO#3Xe~~)FhZ6befRGWUP2v#R~1P~eItqH-{n zymUzr!$k}T9IS}wx)T`_L2KBTRLB&u(daRpETZTe8hHaKr!WKa6$>- zlweSG27{WAL)sk-em510X(gmbwKPA)un>y{gMM19z+O-V9Th7f7F~M8K|;c1`Rz68 zHx>)CCr3dfdiC;6i~xHiTjNsty zf|IwRPUA}iHp24kHD<)cI|MiH6iS6Mpn`n1B?#;#3vbo5;O|pMs~NHJu4}dt5B3?c z@NP&~faWyqAwMRkPzkNdpr)5E7b;;)rnRAt2kQ8+4p6RubQRRabS0##pVbS@#m0M~ zX3b7LXyXBTKB!gu40=_twy>%ZJ6{d8>-d^$)JXj^+F*SfUkmjco;ij(sMW~V1J$M} ztG`ioYKjJh{2LE!)i-4j4XRF1uAx{Ej|SDQ={VJ{sbt{0VGH`;X z5B?s+5wrnFeL))#-GGcsg1IIbv+3o9FVHab2(&V(F>85~`W|Q=Gu<*~FHq7`KoQji z3Defuz0gvZ2R$3UJJ2KOq^KGsb^Qf&jouQ~J5aTn)!v~m)Hz!M;+%W=^_XE~)CMGs zp)%MKcA)CpQVo9YP_x>krY)`2H`xcEV$8qLa+>+E^FYdWpSGWe6@$Kw^@KF?ItkF)WG=iuPm>v?ArB}RYri7};1wv_ znS!xrJkmo=O#;urFJqVFy?Xvwm!2wo^|gT(HF_@esV9rRW{4D8As8hA&R+YVS}Fl1 z5rT3wE+kWOv9-a6k~i4@s!vC?Y}|E7LhUi^&;+(*a(qY z0;egd@4;gg&`~TV#7HZ(EC>g=5SD{^t%5Z)l$OsC@$gWq;1CZ+#AG}uT!Aw>sOch+ z#WH&s(3A`)0h&f9NM~e-3;}3r$;FT;Mgcz52a$-@5A>SggMgEv2_Q<GPl8|0J~ z7J@XXEUtmgrBQ#0i2O^|SX*seD{&V_xErbs=O&>(=Cq)J1p;Ec1&wcDmjKk1F2f$q zQn@m!rU}4{D_h?NpYm)?!{XH&S2H!8N=+w%nrylE`t#CSLlC5vf)bCU3_bv0Ej^7>61m08vZ?4u4%=P_nVBIK-tk%CY*Z0vt z&Q8@eed4<1%GAE1)V}hd`{ysN)}EOweO%R8euk0_HT{XoSILYPyt~u5A6DCoX%n69(+ec_>Heb?3Y@!`3To{TkZ{Q^=GDl=8r` z{Av1bdbPY~&X#@oRfRo3#(z^@+jU^Q-GAr&?eq70mG+k&_%g2yD)6^$5G%A~pHc=h zIee7X)}f&hzqOA!|DCeiWvkrtT6Sy4oxQjBW?By`t*~H^*j*Y*>&Bh4ivpNhiA^{g z^%I_0#P0n*;%svXH_l0@n-4ra{v0qxk8GUN^LSF4=%gPP&K1CC)0Uv26{js@CZ`_c z8oFHXjiP33GuCPAv@K#8qUPx_1QT1rjCGC%W5afhonfY#8Jk|)&kU-zaEcEB$VF~G zv1ti#`lui`kW_xMT%7rp%GBm{7Lhz=4I<@UDpD=u=_9V zWS{-4rrB)r6xcSn6Pm7M!6HFBkw+6J0Wr}hEYVjuY`Z4&giCB_9*b{a+mjgeLnI+7 z?gGlYChhy-ce?%en#bGNJ_ZpC8x(-sbN$u%S2L~##no{0@|Ujmuf6R{!u`I_&wO@f z&D%HEmu+kTsXd(qnXS83x72s%fu z>s&su#=Z!Oc;W3YT@8=DwTp*u9A0W)wtZTDw|r&Kp%w0xRd3IVyXWhC5wq&;SaEld zJyGqNOJ&pE;11xwjbjj@sXxmA7==~;8xZwr+6;0)u9-?AE&SHd1PHJb*%iZaa+GuO zr~v*O%}vCG(A~vH!^D#%PJypR0(TYoqp(&qF_DtQ4&WQjxdsp#gdcA730Fx zmbN&seOxP#I-pe>8+yh#{LLjJWQKXaR(_a)O_R#EmqtTIdy;-u+rhgQl_9-?Zv`8R 
zL$IAX=luz!i#ZQKT0Ar^1ofG2X2MNHlh||*M4&)wDR(6RfTot^KE=C##l3&MY4<1R zZ=Jt?;`39Vol?BLEAC#6(O8}GCRW3P&d#G(QqUsF5}HV9IL z0In@tC}hHVA>BzRC-Z((p?DiMF`v#N2Co+J%61OaCuzng$Qn8y5YJ?*wx?a&wi~>a zVQ|?Jk0)@K#vOYkuE00w0Vb`BMq|pk8luGE=sc!zoHQYXqmd{G8R0{N-uA%&kOv0= zdXGUIFQ7{jB7^98{* z3uc3$0d{TJ>0QN|(4@i~lIetKmU>c4d8)2ooxhrO`xf3&+)YdGE?-jG4?d88di5t) zpI9v(C%DNR&ZitjIM^K#SDvS!iV@JyVP;Vlj;blzZ%Kb_?C%g9_M7!yk;(&JL{*|0 zAjDH&w7361oL1N{aJ>>pB)$o=5f5NA0@0=|L9-1ED4nG@YlKZjH?b+6Wdu^_2)sdI z8*aX_#AiCbqjY@d5qnfydPBsaJ}xQ||5K3ogO=WW;q$aX@agnJW7*##S0$Q=>$53V z<7jg}W$jJnN~iOO2)Z#KROuK@<-D*-`X1piEWQNMU7ElK;uO}9AE<2tcJ<+G-lR=M zw}F6!b|Y+(2e8}q^n5x~-l~+hE*;41KCJ9MyjuR+3j5l20Wp;H187Zlsdx_|4?_Wu z&ww+~48=_ws0%UNhXJ4M2bp3d`Vu zFI>23_g`=zBWrI~y+m6_;pR(UW=<1UgWm5ev^xS6pm$Ngp5-nCD;E&0T-XX%jP5k< z;h+hQ6I&ISW)Q3g#Q5R|7`*_IY60{~yap)ez*HVAK>#2UHH-oX6g8{JsR=zg71CA3 z+Ls7CW}_cULRATTR7v@2Gu{rx+p%&v#dMAYJ$JUwgV_#aW!S#o{YOiakngW z-0|P`FZX>qaCcy}_0WU1OwW+gGxW$koU>By=I!di2E7pV@DU_-Vm#e1HUmb2h7V@x zshwERFd7Mynz0|YG_jcr%rp%k*l$h$yRjQjWhM|o?V6SY#7AyOw=)y!AA{P?XMm|5 z($9Asgw_`x>7=8laFS8Az&GUB9`xJ93$WJW`xw255$YYCXOLY~yVhmim^Rpk=$h?p zV`TpX+DZcu5%r_9&8qh~U>cWq;dRB;xZ-NddTYSHld0%bDms@BtyUbEvlEtTQrM=O z5&bM#V~?(vSIw0Yj!|9ObcwGqjO?NZboCSVL0Gv67h15jPt{z+?QCMlyhLHi7252u zpSGlHC@O3zTwM@bsmNZPqIXmK;U;g=LR0SnCAj>guUOxsCLv!oSgnl``5CKh*sEhM zeHw)?Z;=g0jG4n_tfEj?PwChkGaD@*q1UNyzc?Me$a8hal z;I5z}b@Y!Gm~}Ay;```qZs$Mf;fxz=ZgLdfJILY`(cVd-$iY1C;^NXJVyENQ`rYKB zTnw0-5()j^IAVcmlM=NO7>u2WA z+&r=7-o1`u)3I97k@fg89_~vIw=|%5x*vJC-&m>2Jy51pbo}n?^4ech4oB-_cjfg{ z^QW?&{>PrGjHgBMv}8P;il=kc)AiWnT{!Wv3qQRd+n+d?(y|RVRob-fuFAM~DehgG zHve7D3xvLv@QC?dQ?HZyr22^UA6ZEJ6VqE#^3ZC9t6ONR}nKZbsOf-S=-QG|mU()ca}k9O4bnl9BItAIZuhA$aJCfO{ttl|d2$fxxA|x`pBH zcMu#P&x`0U&2u|1Tui|g1>OUJqaY>-6Cz&a7K6-E@&G3ojRcW$y0(Z*Hf`f>`l3~ zx+I*!6O|6T0U2AvY=w+8t56JH++=1!Y2tZrGX=Q={bj1dxCO}j#O+(8WOTxn2 z2$C?k%IJX7YZ6W}fv3iyVyN^&TR2qUeFq#W#__QAP%)qR7+(8q2~ZOO-UIJ$c@q}Q8elvPPoug72ObwPh-`Y$H5r!0gebGNK(Sh}9muG&`XyYF{=-t}46KLnKeqw{dj)-X4)j_5mo 
z@wHxBktuCmEyY73<8J)Y-MHjZ+#+W;gE`h+vPpBJ{1ACu_+#bKRFci)6ZqMX3g|r_ zbQzza0p`(WuRL^l1bGDn575hyNq}AvfL?|inn@9FTm--w^7NX_w6$O@Jfp7NTsjNw zd3eVfa}W?#rO&nS)#n5{#`?YO@QY2~H^lBO%?a`}@iqJ)QZglm!JYuw4RGWLLqr9y zgknX)YHE)B(PqdX5mWIB%rU*MKM_u*67a|pyEot=suMW-G;4=IJ#wU`E$vde^z>Bm z#S8&_u7Y@r(?C~@K!mriQYr>7V=)JDpNL0^W+x+F6O$0-(VS|Jr0`X&W;9P=sTrhm z5o1SFQ=5dnDF1(AiunsC@E4_2C|1MV!WWJTq#s1Ld>8++7Ry&R})`z=#TaHuKf2J7tu}KS{<|JAm z28m)XK3>i0yevKkyl#TvK5Q`0Hgg?k*)X|U>l4X%(sH`-alKJFhVjCV}!q`>XQ>4#HPAw=w z2@Pg&N3)dMyKrEoZtt3_>%K3u|Aeyt#LE7+3};Bj-Kn@cAGy20_wm9Iz2fR3Xa}ya zwL_^{^Z_9egE3sP!~KIpXWk0-ojeVRlV=8p z@-BBUM-CoP%ZxUC!|$E#SFMmOqRtlM7$I8F-q$d43f`TIxQVKLn79wb{aEV99QJhn zo=)>D5$6DLFI~VYVT@3w$bqOGmNWSCLyZ0mqPrBCn|4=?Y$alZq@_QHCA&stt7~V^ zWE-1TY<1bH`o;Heygxgbb5qT&*@nhnxjLBF)_pZO2EK4?RkY+vFvU_8z6JhEQem6(K=Rx~VyMEHOfyo?K!t~NhS2ig8<|_4Uj*F_jGV=*Yx}b z1;005V3>W{Hj&L~P)mYAJ{b-M#Y4DbGDK>57%n>DB^Ufp0Vxo=7}gl35?*TM&6|Lx zC?x`>G*+y^H`oE~F;@T{Jn7a7;X}2Hlonap`O77E{iXl6grq3Jvuu2#O`cGw{%uW) zsA3N&{sFwi6A{3x)!XXBbGMkJF}{lwYM1pQqIga0$bmqP1{adB;XREeGaq4{A7Vrp z`7bbqt`*IyXAf&nypM=@Cjz`pNuv0V8}$@j@e9iJ71j0?wf~ot`@g6I zUr_^JQAfX`4t+)K{!fQ*;oVh7^Q`@e-AeDd>3>4O2V}wRS+FjgTReZ`{8FD%_1uc* XImP9lWuMq(E4^=_{(mU^l9Bu$TS%}F literal 0 HcmV?d00001 diff --git a/distributed/kv_transfer/kv_pipe/base.py b/distributed/kv_transfer/kv_pipe/base.py new file mode 100644 index 0000000..1fe7a90 --- /dev/null +++ b/distributed/kv_transfer/kv_pipe/base.py @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +This file defines an interface `KVPipeBase` +that provides an abstraction for sending and receiving tensors, or None, via +distributed communications. + +All classes instantiated from this interface are assumed to be a FIFO pipe. + +If your distributed communication platform already supports key-value lookup, +you can bypass this interface and directly start from `kv_lookup_buffer`. 
+""" + +from abc import ABC, abstractmethod + +import torch + + +class KVPipeBase(ABC): + """ + This class provides an interface for sending and receiving tensors, or + None, by distributed communications. + """ + + @abstractmethod + def send_tensor(self, tensor: torch.Tensor | None) -> None: + """Send a tensor, or None, via the pipe. + + Need to support sending None -- important for error handling. + + TODO: add a `key` argument so that we can use traditional + key-value database as the distributed communication mechanism behind + the pipe. + + Args: + tensor (Optional[torch.Tensor]): The tensor to be sent. Can be None. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError + + @abstractmethod + def recv_tensor(self) -> torch.Tensor | None: + """Receive a tensor (can be None) from the pipeline. + + Returns: + Optional[torch.Tensor]: The tensor received from the pipeline. Can + be None. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError + + @abstractmethod + def close(self) -> None: + """Close the pipeline and release resources. + + This method is responsible for closing the communication pipeline + and releasing any resources associated with it. + + Raises: + NotImplementedError: This method must be implemented in subclasses. 
+ """ + raise NotImplementedError diff --git a/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/distributed/kv_transfer/kv_pipe/mooncake_pipe.py new file mode 100644 index 0000000..542dde0 --- /dev/null +++ b/distributed/kv_transfer/kv_pipe/mooncake_pipe.py @@ -0,0 +1,295 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json +import os +import struct +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass + +import torch +import zmq +from safetensors.torch import load as safetensors_load +from safetensors.torch import save as safetensors_save + +from vllm.config.kv_transfer import KVTransferConfig +from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase +from vllm.logger import init_logger +from vllm.utils.network_utils import join_host_port, make_zmq_path, split_host_port + +logger = init_logger(__name__) +NONE_INT = -150886311 + + +@dataclass +class MooncakeTransferEngineConfig: + prefill_url: str + decode_url: str + metadata_backend: str | None + metadata_server: str + protocol: str + device_name: str + + @staticmethod + def from_file(file_path: str) -> "MooncakeTransferEngineConfig": + """Load the config from a JSON file.""" + with open(file_path) as fin: + config = json.load(fin) + return MooncakeTransferEngineConfig( + prefill_url=config.get("prefill_url"), + decode_url=config.get("decode_url"), + metadata_backend=config.get("metadata_backend", None), + metadata_server=config.get("metadata_server"), + protocol=config.get("protocol", "tcp"), + device_name=config.get("device_name", ""), + ) + + @staticmethod + def load_from_env() -> "MooncakeTransferEngineConfig": + """Load config from a file specified in the environment variable.""" + config_file_path = os.getenv("MOONCAKE_CONFIG_PATH") + if config_file_path is None: + raise ValueError( + "The environment variable 'MOONCAKE_CONFIG_PATH' is not set." 
+ ) + return MooncakeTransferEngineConfig.from_file(config_file_path) + + +class MooncakeTransferEngine: + """Handles the transfer of data using mooncake_vllm_adaptor and ZeroMQ.""" + + def __init__(self, kv_rank: int, local_rank: int): + try: + from mooncake.engine import TransferEngine + except ImportError as e: + raise ImportError( + "Please install mooncake by following the instructions at " + "https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md " # noqa: E501 + "to run vLLM with MooncakeConnector." + ) from e + + self.engine = TransferEngine() + self.local_rank = local_rank + + try: + self.config = MooncakeTransferEngineConfig.load_from_env() + logger.info("Mooncake Configuration loaded successfully.") + except ValueError as e: + logger.error(e) + raise + except Exception as exc: + logger.error("An error occurred while loading the configuration: %s", exc) + raise + prefill_host, base_prefill_port = split_host_port(self.config.prefill_url) + decode_host, base_decode_port = split_host_port(self.config.decode_url) + + # Avoid ports conflict when running prefill and decode on the same node + if prefill_host == decode_host and base_prefill_port == base_decode_port: + base_decode_port = base_decode_port + 100 + + prefill_port = base_prefill_port + self.local_rank + decode_port = base_decode_port + self.local_rank + self.prefill_url = join_host_port(prefill_host, prefill_port) + self.decode_url = join_host_port(decode_host, decode_port) + + self.initialize( + self.prefill_url if kv_rank == 0 else self.decode_url, + self.config.metadata_server, + self.config.protocol, + self.config.device_name, + self.config.metadata_backend, + ) + + self.remote_url = self.decode_url if kv_rank == 0 else self.prefill_url + + # Initialize ZeroMQ context and sockets + self.context = zmq.Context() # type: ignore[attr-defined] + self.sender_socket = self.context.socket(zmq.constants.PUSH) + self.receiver_socket = self.context.socket(zmq.constants.PULL) + self.sender_ack = 
self.context.socket(zmq.constants.PULL) + self.receiver_ack = self.context.socket(zmq.constants.PUSH) + + self.buffer_cleaner = ThreadPoolExecutor(max_workers=1) + self._setup_metadata_sockets( + kv_rank, prefill_host, base_prefill_port, decode_host, base_decode_port + ) + + def _setup_metadata_sockets( + self, kv_rank: int, p_host: str, p_port: int, d_host: str, d_port: int + ) -> None: + """Set up ZeroMQ sockets for sending and receiving data.""" + # Offsets < 8 are left for initialization in case tp and pp are enabled + p_rank_offset = p_port + 8 + self.local_rank * 2 + d_rank_offset = d_port + 8 + self.local_rank * 2 + if kv_rank == 0: + self.sender_socket.bind(make_zmq_path("tcp", p_host, p_rank_offset + 1)) + self.receiver_socket.connect( + make_zmq_path("tcp", d_host, d_rank_offset + 1) + ) + self.sender_ack.connect(make_zmq_path("tcp", d_host, d_rank_offset + 2)) + self.receiver_ack.bind(make_zmq_path("tcp", p_host, p_rank_offset + 2)) + else: + self.receiver_socket.connect( + make_zmq_path("tcp", p_host, p_rank_offset + 1) + ) + self.sender_socket.bind(make_zmq_path("tcp", d_host, d_rank_offset + 1)) + self.receiver_ack.bind(make_zmq_path("tcp", d_host, d_rank_offset + 2)) + self.sender_ack.connect(make_zmq_path("tcp", p_host, p_rank_offset + 2)) + + def initialize( + self, + local_hostname: str, + metadata_server: str, + protocol: str, + device_name: str, + metadata_backend: str | None, + ) -> None: + """Initialize the mooncake instance.""" + if metadata_backend is None: + self.engine.initialize( + local_hostname, metadata_server, protocol, device_name + ) + else: + supported_backend = ["etcd", "redis"] + metadata_backend = metadata_backend.lower() + if metadata_backend not in supported_backend: + raise ValueError( + "Mooncake Configuration error. `metadata_backend`" + f" should be one of {supported_backend}." 
+ ) + + self.engine.initialize_ext( + local_hostname, metadata_server, protocol, device_name, metadata_backend + ) + + def allocate_managed_buffer(self, length: int) -> int: + """Allocate a managed buffer of the specified length.""" + ret = self.engine.allocate_managed_buffer(length) + if ret <= 0: + logger.error("Allocation Return Error") + raise Exception("Allocation Return Error") + return ret + + def free_managed_buffer(self, buffer: int, length: int) -> int: + """Free a previously allocated managed buffer.""" + return self.engine.free_managed_buffer(buffer, length) + + def transfer_sync(self, buffer: int, peer_buffer_address: int, length: int) -> int: + """Synchronously transfer data to the specified address.""" + ret = self.engine.transfer_sync_read( + self.remote_url, buffer, peer_buffer_address, length + ) + if ret < 0: + logger.error("Transfer Return Error") + raise Exception("Transfer Return Error") + return ret + + def write_bytes_to_buffer(self, buffer: int, user_data: bytes, length: int) -> int: + """Write bytes to the allocated buffer.""" + return self.engine.write_bytes_to_buffer(buffer, user_data, length) + + def read_bytes_from_buffer(self, buffer: int, length: int) -> bytes: + """Read bytes from the allocated buffer.""" + return self.engine.read_bytes_from_buffer(buffer, length) + + def wait_for_ack(self, src_ptr: int, length: int) -> None: + """Asynchronously wait for ACK from the receiver.""" + ack = self.sender_ack.recv() + if ack != b"ACK": + logger.error("Failed to receive ACK from the receiver") + + self.free_managed_buffer(src_ptr, length) + + def send_bytes(self, user_data: bytes) -> None: + """Send bytes to the remote process.""" + length = len(user_data) + src_ptr = self.allocate_managed_buffer(length) + self.write_bytes_to_buffer(src_ptr, user_data, length) + self.sender_socket.send_multipart( + [struct.pack("!Q", src_ptr), struct.pack("!Q", length)] + ) + self.buffer_cleaner.submit(self.wait_for_ack, src_ptr, length) + + def 
class MooncakePipe(KVPipeBase):
    """MooncakeTransferEngine based Pipe implementation.

    Tensors are serialized with safetensors and moved as raw bytes through a
    `MooncakeTransferEngine`. `None` is represented on the wire by a
    one-element tensor holding `NONE_INT`.
    """

    def __init__(
        self, local_rank: int, config: KVTransferConfig, device: str | None = None
    ):
        """Initialize the mooncake pipe and set related parameters.

        Args:
            local_rank: Local rank, used to pick the CUDA device index.
            config: KV transfer configuration; `kv_rank` must be set.
            device: Device name override. When `None`,
                `config.kv_buffer_device` is used.
        """
        self.config = config
        self.local_rank = local_rank
        self.kv_rank = self.config.kv_rank
        assert self.kv_rank is not None
        if device is None:
            self.device = self._select_device(self.config.kv_buffer_device)
        else:
            self.device = self._select_device(device)

        self.transfer_engine = MooncakeTransferEngine(self.kv_rank, self.local_rank)
        # Created lazily on the first send/recv.
        self.transport_thread: ThreadPoolExecutor | None = None
        self.none_tensor = torch.tensor([NONE_INT], device=self.device)

    def _select_device(self, device: str) -> torch.device:
        """Select available device (CUDA or CPU).

        Any value other than "cuda" maps to the CPU device.
        """
        logger.info("Selecting device: %s", device)
        if device == "cuda":
            return torch.device(f"cuda:{self.local_rank}")
        else:
            return torch.device("cpu")

    def tensor_hash(self, tensor: torch.Tensor) -> int:
        """Calculate the hash value of the tensor.

        The hash is derived from the tensor's data pointer, not its contents.
        """
        return hash(tensor.data_ptr())

    def _send_impl(self, tensor: torch.Tensor) -> None:
        """Implement the tensor sending logic using safetensors."""
        self.transfer_engine.send_bytes(safetensors_save({"tensor": tensor}))

    def _recv_impl(self) -> torch.Tensor:
        """Implement the tensor receiving logic using safetensors."""
        data = self.transfer_engine.recv_bytes()
        return safetensors_load(data)["tensor"].to(self.device)

    def send_tensor(self, tensor: torch.Tensor | None) -> None:
        """Send tensor to the target process.

        The send is queued on the single-worker transport executor and this
        method returns without waiting for it. `None` is replaced by the
        `NONE_INT` sentinel tensor.
        """
        if self.transport_thread is None:
            self.transport_thread = ThreadPoolExecutor(max_workers=1)
        tensor = tensor if tensor is not None else self.none_tensor
        assert len(tensor.shape) > 0
        self.transport_thread.submit(self._send_impl, tensor)

    def recv_tensor(self) -> torch.Tensor | None:
        """Receive tensor from other processes.

        Blocks until the receive completes. A one-element tensor equal to
        `NONE_INT` is decoded as `None`.
        """
        if self.transport_thread is None:
            self.transport_thread = ThreadPoolExecutor(max_workers=1)
        tensor = self.transport_thread.submit(self._recv_impl).result()
        if tensor.numel() == 1 and tensor.item() == NONE_INT:
            return None
        else:
            return tensor

    def close(self) -> None:
        """Cleanup logic when closing the pipe.

        The transport executor is shut down first so that queued sends finish
        before the sockets they use are closed, and so the worker thread is
        not leaked.
        """
        if self.transport_thread is not None:
            self.transport_thread.shutdown()
        self.transfer_engine.sender_socket.close()
        self.transfer_engine.receiver_socket.close()
        self.transfer_engine.sender_ack.close()
        self.transfer_engine.receiver_ack.close()
        self.transfer_engine.context.term()  # Terminate the ZMQ context
        logger.info("Closed the transfer engine and cleaned up resources.")
class PyNcclPipe(KVPipeBase):
    """Pipe that moves `torch.Tensor | None` values between neighbouring KV ranks.

    Metadata (dtype and shape) travels through the `StatelessProcessGroup`
    object channel; tensor payloads travel through PyNCCL when the pipe
    device is CUDA, or through the same object channel otherwise. Sends are
    queued on a single worker thread, and `buffer_size` tracks the bytes
    queued but not yet sent so that `send_tensor` can apply backpressure.
    """

    METADATA_LENGTH = 16
    MAX_TENSOR_DIMENSIONS = 14
    METADATA_DTYPE = torch.int64

    def __init__(
        self,
        local_rank: int,
        config: KVTransferConfig,
        device: str | None = None,
        port_offset: int = 0,
    ):
        """Connect to the peer ranks and set up the send/recv functions.

        Args:
            local_rank: Local rank, used to pick the CUDA device index.
            config: KV transfer configuration; `kv_rank` must be set.
            device: Device name override. When `None`,
                `config.kv_buffer_device` is used.
            port_offset: Added to `config.kv_port` for the rendezvous port.
        """
        self.config = config
        self.local_rank = local_rank
        self.kv_rank = self.config.kv_rank
        assert self.kv_rank is not None
        self.kv_parallel_size = self.config.kv_parallel_size
        if device is None:
            self.device = self._select_device(self.config.kv_buffer_device)
        else:
            self.device = self._select_device(device)

        # build distributed connection and send/recv implementation
        store_timeout = self.config.get_from_extra_config("store_timeout", 300)
        self.group = StatelessProcessGroup.create(
            host=self.config.kv_ip,
            port=self.config.kv_port + port_offset,
            rank=self.kv_rank,
            world_size=self.kv_parallel_size,
            store_timeout=store_timeout,
        )
        # add a barrier to make sure the connection is initiated properly
        self.group.barrier()
        impl = self._get_device_send_recv_impl(self.group)
        self.device_send_func, self.device_recv_func = impl
        # set target rank: send to the next rank, receive from the previous
        # one (both wrap around kv_parallel_size)
        self.target_rank_for_send = (self.kv_rank + 1) % self.kv_parallel_size
        self.target_rank_for_recv = (self.kv_rank - 1) % self.kv_parallel_size

        # transportation-related variables
        self.transport_thread: ThreadPoolExecutor | None = None
        self.buffer_size = 0
        self.buffer_size_lock = threading.Lock()
        self.buffer_size_thresh = self.config.kv_buffer_size

    def _get_device_send_recv_impl(
        self, group: StatelessProcessGroup
    ) -> tuple[
        Callable[[torch.Tensor, int], None], Callable[[torch.Tensor, int], None]
    ]:
        """Return the `(send, recv)` callables matching `self.device`."""
        send: Callable[[torch.Tensor, int], None]
        recv: Callable[[torch.Tensor, int], None]
        if self.device.type == "cuda":
            # use PyNCCL for send / recv
            comm = PyNcclCommunicator(group, device=self.local_rank)
            comm.disabled = False
            send, recv = comm.send, comm.recv  # type: ignore
        else:
            # This send / recv implementation here is NOT intended to transfer
            # KV caches (and should NOT be repurposed to transfer KV caches).
            # Currently it is only used to transmit control-plane messages
            # for PyNcclBuffer.
            send = group.send_obj

            def my_recv(x, src):
                # Copy the received object into the caller's buffer in place.
                x[...] = group.recv_obj(src)

            recv = my_recv

        return send, recv

    def _select_device(self, device: str):
        """Map "cuda" to this rank's CUDA device; anything else to CPU."""
        logger.info("Selecting device: %s", device)
        if device == "cuda":
            return torch.device(f"cuda:{self.local_rank}")
        else:
            return torch.device("cpu")

    def _make_metadata(self, tensor: torch.Tensor | None) -> Metadata:
        """
        Create the metadata as a dictionary based on the input tensor.

        Args:
            tensor: The input tensor or None if no tensor is provided.

        Returns:
            metadata: A dictionary with the following keys:
                - "dtype": The data type of the tensor or None.
                - "shape": The shape of the tensor or None.
        """
        if tensor is None:
            return {"dtype": None, "shape": None}
        else:
            return {"dtype": tensor.dtype, "shape": tensor.shape}

    def _prepare_recv_buffer(self, metadata: Metadata) -> torch.Tensor:
        """
        Create a buffer to receive the tensor based on the provided metadata.

        Args:
            metadata: A dictionary with keys "dtype" and "shape",
                describing the tensor's data type and shape.

        Returns:
            buffer: A tensor of the specified type and shape,
                allocated on `self.device`.
        """
        return torch.empty(
            metadata["shape"], dtype=metadata["dtype"], device=self.device
        )

    def _send_metadata(self, metadata: Metadata):
        """
        Send the metadata dictionary to the target rank.

        Args:
            metadata: A dictionary with keys "dtype" and "shape".
        """
        self.group.send_obj(metadata, self.target_rank_for_send)

    def _recv_metadata(self) -> Metadata:
        """
        Receive the metadata dictionary from the target rank.

        Returns:
            metadata: A dictionary with keys "dtype" and "shape"
                describing the tensor.
        """
        return self.group.recv_obj(self.target_rank_for_recv)

    def _send_impl(self, tensor: torch.Tensor | None) -> None:
        """
        The actual implementation of sending the tensor and its metadata to the
        target rank.

        Args:
            tensor: The input tensor to be sent, or `None` if no tensor is
                being sent.
        """
        metadata = self._make_metadata(tensor)
        self._send_metadata(metadata)
        if tensor is not None:
            self.device_send_func(tensor.to(self.device), self.target_rank_for_send)

    def _recv_impl(self) -> torch.Tensor | None:
        """
        The actual implementation of receiving a tensor and its metadata from
        the target rank.

        Returns:
            buffer: The received tensor, or `None` if no tensor is received.
        """
        metadata = self._recv_metadata()
        if metadata["dtype"] is None:
            return None
        buffer = self._prepare_recv_buffer(metadata)
        self.device_recv_func(buffer, self.target_rank_for_recv)

        return buffer

    def send_tensor_wrapper(
        self, tensor: torch.Tensor | None, tensor_size: int
    ) -> None:
        """
        Wrapper for _send_impl to handle exceptions and update buffer size.

        Exceptions from the send are logged and swallowed. The bytes reserved
        by `send_tensor` are released whether or not the send succeeded.

        Args:
            tensor: The tensor to send, or `None`.
            tensor_size: Number of bytes `send_tensor` added to `buffer_size`
                for this tensor.
        """
        try:
            self._send_impl(tensor)
        except Exception as e:
            logger.error(
                "[rank%d]: Exception when trying to send %s, msg: %s",
                torch.distributed.get_rank(),
                str(tensor),
                str(e),
            )
            import traceback

            traceback.print_exc()
        finally:
            # Release the reservation even when the send failed; otherwise a
            # single failure leaves buffer_size inflated forever and
            # block_if_full() can stall every later send.
            with self.buffer_size_lock:
                self.buffer_size -= tensor_size

    def block_if_full(self):
        """
        Block the current thread if the buffer size is larger than the
        threshold.
        """
        while self.buffer_size > self.buffer_size_thresh:
            logger.debug("KV cache transfer pipe is full. Waiting...")
            time.sleep(0.05)

    def send_tensor(self, tensor: torch.Tensor | None) -> None:
        """
        Sends a tensor and its metadata to the destination rank in a
        non-blocking way.

        Args:
            tensor: The tensor to send, or `None` if no tensor is being sent.
        """
        if self.transport_thread is None:
            self.transport_thread = ThreadPoolExecutor(max_workers=1)

        if tensor is not None:
            tensor_size = tensor.element_size() * tensor.numel()
        else:
            tensor_size = 0

        self.block_if_full()

        with self.buffer_size_lock:
            self.buffer_size += tensor_size

        self.transport_thread.submit(self.send_tensor_wrapper, tensor, tensor_size)

    def recv_tensor(self) -> torch.Tensor | None:
        """
        Receives a tensor and its metadata from the source rank. Blocking call.

        Returns:
            The received tensor, or `None` if no tensor is received.

        Raises:
            Any exception raised by the receive, after it has been logged.
        """
        if self.transport_thread is None:
            self.transport_thread = ThreadPoolExecutor(max_workers=1)

        future = self.transport_thread.submit(self._recv_impl)

        try:
            tensor = future.result()
        except Exception as e:
            logger.error("Encountering exception in KV receiving thread")
            logger.error("%s", e)
            logger.error("My device: %s", self.device)
            import traceback

            traceback.print_exc()
            raise

        return tensor

    def close(self):
        """
        Close the pipe and release associated resources.
        """
        if hasattr(self, "transport_thread") and self.transport_thread is not None:
            self.transport_thread.shutdown()
def ensure_kv_transfer_initialized(
    vllm_config: "VllmConfig", kv_cache_config: Optional["KVCacheConfig"] = None
) -> None:
    """
    Create the process-wide KV connector if it is needed and not yet created.

    Nothing happens when the config has no KV transfer section, when this
    instance is not a KV transfer instance, or when a connector already
    exists. Otherwise a WORKER-role connector is built through
    `KVConnectorFactory` and stored in the module-level agent.

    Args:
        vllm_config: Configuration whose `kv_transfer_config` is inspected.
        kv_cache_config: Forwarded unchanged to the connector factory.
    """
    global _KV_CONNECTOR_AGENT

    transfer_config = vllm_config.kv_transfer_config
    if transfer_config is None:
        return
    if not transfer_config.is_kv_transfer_instance:
        return
    if _KV_CONNECTOR_AGENT is not None:
        return

    _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
        config=vllm_config,
        role=KVConnectorRole.WORKER,
        kv_cache_config=kv_cache_config,
    )
+- call `destroy_distributed_environment` to destroy the distributed environment. + +If you only need to use the distributed environment without model/pipeline + parallelism, you can skip the model parallel initialization and destruction + steps. +""" + +import contextlib +import gc +import os +import pickle +import weakref +from collections import namedtuple +from collections.abc import Callable +from contextlib import contextmanager, nullcontext +from dataclasses import dataclass +from datetime import timedelta +from multiprocessing import shared_memory +from typing import Any, Optional +from unittest.mock import patch + +import torch +import torch.distributed +import torch.distributed._functional_collectives as funcol +# import torch.distributed._symmetric_memory +from torch.distributed import Backend, ProcessGroup +from typing_extensions import deprecated + +import vllm.envs as envs +from vllm.distributed.device_communicators.base_device_communicator import ( + DeviceCommunicatorBase, +) +from vllm.distributed.utils import StatelessProcessGroup +from vllm.logger import init_logger +from vllm.utils.import_utils import resolve_obj_by_qualname +from vllm.utils.network_utils import get_distributed_init_method +from vllm.utils.torch_utils import ( + direct_register_custom_op, + supports_custom_op, +) +import ixformer.distributed as ixfd +import vllm._custom_ops as ops + +@dataclass +class GraphCaptureContext: + stream: torch.cuda.Stream + + +TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"]) + + +def _split_tensor_dict( + tensor_dict: dict[str, torch.Tensor | Any], +) -> tuple[list[tuple[str, Any]], list[torch.Tensor]]: + """Split the tensor dictionary into two parts: + 1. A list of (key, value) pairs. If the value is a tensor, it is replaced + by its metadata. + 2. A list of tensors. 
+ """ + metadata_list: list[tuple[str, Any]] = [] + tensor_list: list[torch.Tensor] = [] + for key, value in tensor_dict.items(): + if isinstance(value, torch.Tensor): + # Note: we cannot use `value.device` here, + # because it contains not only the device type but also the device + # index (e.g. "cuda:0"). We only need the device type. + # receiving side will set the device index. + device = value.device.type + metadata_list.append( + (key, TensorMetadata(device, value.dtype, value.size())) + ) + tensor_list.append(value) + else: + metadata_list.append((key, value)) + return metadata_list, tensor_list + + +_group_name_counter: dict[str, int] = {} + + +def _get_unique_name(name: str) -> str: + """Get a unique name for the group. + Example: + _get_unique_name("tp") -> "tp:0" + _get_unique_name("tp") -> "tp:1" + """ + if name not in _group_name_counter: + _group_name_counter[name] = 0 + newname = f"{name}:{_group_name_counter[name]}" + _group_name_counter[name] += 1 + return newname + + +_groups: dict[str, Callable[[], Optional["GroupCoordinator"]]] = {} + + +def _register_group(group: "GroupCoordinator") -> None: + _groups[group.unique_name] = weakref.ref(group) + + +def all_reduce(tensor: torch.Tensor, group_name: str) -> torch.Tensor: + assert group_name in _groups, f"Group {group_name} is not found." + group = _groups[group_name]() + if group is None: + raise ValueError(f"Group {group_name} is destroyed.") + return group._all_reduce_out_place(tensor) + + +def all_reduce_fake(tensor: torch.Tensor, group_name: str) -> torch.Tensor: + return torch.empty_like(tensor) + + +def reduce_scatter( + tensor: torch.Tensor, dim: int, world_size: int, group_name: str +) -> torch.Tensor: + assert group_name in _groups, f"Group {group_name} is not found." 
def all_gather_fake(
    tensor: torch.Tensor, dim: int, world_size: int, group_name: str
) -> torch.Tensor:
    """Shape-only stand-in for `all_gather`.

    Returns an uninitialized tensor with the input's dtype and device whose
    extent along `dim` is the input's extent multiplied by `world_size`.
    `group_name` is accepted for signature parity and not used.
    """
    gathered_shape = [*tensor.shape]
    gathered_shape[dim] = gathered_shape[dim] * world_size
    return torch.empty(gathered_shape, dtype=tensor.dtype, device=tensor.device)
def patched_fused_scaled_matmul_reduce_scatter(
    A: torch.Tensor,
    B: torch.Tensor,
    A_scale: torch.Tensor,
    B_scale: torch.Tensor,
    reduce_op: str,
    orig_scatter_dim: int,
    scatter_dim_after_maybe_reshape: int,
    group_name: str,
    output_shape: list[int],
    bias: torch.Tensor | None = None,
    result_scale: torch.Tensor | None = None,
    out_dtype: torch.dtype | None = None,
    use_fast_accum: bool = False,
) -> torch.Tensor:
    """Forward every argument, in order, to the symm_mem fused op.

    This function adds no logic of its own; it only calls
    `torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter` positionally and
    returns its result.
    """
    fused_op = torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter
    forwarded = (
        A,
        B,
        A_scale,
        B_scale,
        reduce_op,
        orig_scatter_dim,
        scatter_dim_after_maybe_reshape,
        group_name,
        output_shape,
        bias,
        result_scale,
        out_dtype,
        use_fast_accum,
    )
    return fused_op(*forwarded)
def __init__(
    self,
    group_ranks: list[list[int]],
    local_rank: int,
    torch_distributed_backend: str | Backend,
    use_device_communicator: bool,  # whether to use device communicator
    use_message_queue_broadcaster: bool = False,
    group_name: str | None = None,
):
    """Create the process groups for this coordinator and wire up helpers.

    Args:
        group_ranks: Lists of global ranks, one list per group. A device
            group and a gloo CPU group are created for every list; the
            list containing this process's rank becomes `self.ranks`.
        local_rank: Local rank used to build the device string.
        torch_distributed_backend: Backend for the device process group.
        use_device_communicator: Whether to construct a device
            communicator (only done when the group has more than one rank).
        use_message_queue_broadcaster: Whether to create a `MessageQueue`
            broadcaster (only done when the group has more than one rank).
        group_name: Base name for the group; defaults to "anonymous".
    """
    group_name = group_name or "anonymous"
    self.unique_name = _get_unique_name(group_name)
    _register_group(self)

    self.rank = torch.distributed.get_rank()
    self.local_rank = local_rank

    # True unless VLLM_FORCE_NCCL_COMM is set to "1", "Y" or "y".
    use_vllm_comm = os.environ.get("VLLM_FORCE_NCCL_COMM", None) not in {"1", "Y", "y"}

    self_device_group = None
    self_cpu_group = None

    # new_group is called for every rank list, not only the one that
    # contains this process.
    for ranks in group_ranks:
        device_group = torch.distributed.new_group(
            ranks, backend=torch_distributed_backend
        )
        # a group with `gloo` backend, to allow direct coordination between
        # processes through the CPU.
        cpu_group = torch.distributed.new_group(ranks, backend="gloo")
        if self.rank in ranks:
            # ixfd communicator is only created when use_vllm_comm is set.
            self.ixfd_group = ixfd.init_comm_with_store(device_group) if use_vllm_comm else None
            self.ranks = ranks
            self.world_size = len(ranks)
            self.rank_in_group = ranks.index(self.rank)
            self_device_group = device_group
            self_cpu_group = cpu_group

    # Fails when this process's rank appears in none of the rank lists.
    assert self_cpu_group is not None
    assert self_device_group is not None

    self.cpu_group = self_cpu_group
    self.device_group = self_device_group

    from vllm.platforms import current_platform

    if current_platform.is_cuda_alike():
        self.device = torch.device(f"cuda:{local_rank}")
    elif current_platform.is_xpu():
        self.device = torch.device(f"xpu:{local_rank}")
    elif current_platform.is_out_of_tree():
        self.device = torch.device(f"{current_platform.device_name}:{local_rank}")
    else:
        self.device = torch.device("cpu")

    self.use_device_communicator = use_device_communicator
    # Stays None for single-rank groups or when not requested.
    self.device_communicator = None
    if use_device_communicator and self.world_size > 1:
        device_comm_cls = resolve_obj_by_qualname(
            current_platform.get_device_communicator_cls()
        )
        self.device_communicator = device_comm_cls(
            cpu_group=self.cpu_group,
            device=self.device,
            device_group=self.ixfd_group if use_vllm_comm else self.device_group,
            unique_name=self.unique_name,
        )

    from vllm.distributed.device_communicators.shm_broadcast import MessageQueue

    self.mq_broadcaster: MessageQueue | None = None
    if use_message_queue_broadcaster and self.world_size > 1:
        self.mq_broadcaster = MessageQueue.create_from_process_group(
            self.cpu_group, 1 << 22, 6
        )

    # NOTE(review): current_platform was already imported above in this
    # method; this second import looks redundant.
    from vllm.platforms import current_platform

    # Collectives call the communicator directly rather than going through
    # the torch.ops.vllm custom ops.
    self.use_custom_op_call = False

    self.use_cpu_custom_send_recv = current_platform.is_cpu() and hasattr(
        torch.ops._C, "init_shm_manager"
    )
@property
def next_rank(self):
    """Global rank of the group member after the caller, wrapping around.

    The last member's successor is the first member of the group.
    """
    successor_index = (self.rank_in_group + 1) % self.world_size
    return self.ranks[successor_index]
def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
    """
    User-facing entry point that dispatches the all-reduce.

    Dynamo cannot pass an arbitrary object such as `self` into a custom op,
    so the custom-op path passes only the group's unique name and the op
    looks the coordinator up again by that name. PyTorch custom ops also
    cannot both mutate their input and return a new tensor, so the
    reduction is always performed out-of-place.

    Returns:
        `input_` itself when the group has a single rank; otherwise the
        result of the custom op (when `use_custom_op_call` is set) or of
        `_all_reduce_out_place`.
    """
    # Nothing to reduce with a single participant.
    if self.world_size == 1:
        return input_

    if not self.use_custom_op_call:
        return self._all_reduce_out_place(input_)
    return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name)
+ if world_size == 1: + return input_ + assert -input_.dim() <= dim < input_.dim(), ( + f"Invalid dim ({dim}) for input tensor with shape {input_.size()}" + ) + + if self.use_custom_op_call: + return torch.ops.vllm.all_gather( + input_, dim, world_size, group_name=self.unique_name + ) + else: + return self._all_gather_out_place(input_, dim) + + def _all_gather_out_place(self, input_: torch.Tensor, dim: int) -> torch.Tensor: + if self.device_communicator is None: + raise ValueError("No device communicator found") + return self.device_communicator.all_gather(input_, dim) + + def all_gatherv( + self, + input_: torch.Tensor | list[torch.Tensor], + dim: int = 0, + sizes: list[int] | None = None, + ): + if self.device_communicator is None: + raise ValueError("No device communicator found") + return self.device_communicator.all_gatherv(input_, dim, sizes) + + def reduce_scatter(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: + world_size = self.world_size + # Bypass the function if we are using only 1 GPU. 
def broadcast(self, input_: torch.Tensor, src: int = 0):
    """Broadcast the input tensor.
    NOTE: `src` is the local rank of the source rank.

    Args:
        input_: Tensor to broadcast; it is passed to the underlying
            broadcast call and returned.
        src: Rank *within this group* of the sender; it is translated to a
            global rank through `self.ranks`.

    Returns:
        `input_`.
    """
    assert src < self.world_size, f"Invalid src rank ({src})"

    # Bypass the function if we are using only 1 GPU.
    if self.world_size == 1:
        return input_

    # Groups created with use_device_communicator=False have
    # device_communicator set to None; reading `.use_vllm_comm` on it would
    # raise AttributeError, so fall back to torch.distributed in that case.
    use_vllm_comm = getattr(self.device_communicator, "use_vllm_comm", False)

    # Broadcast.
    if use_vllm_comm:
        ops.broadcast(input_, src=self.ranks[src], group=self.device_group)
    else:
        torch.distributed.broadcast(
            input_, src=self.ranks[src], group=self.device_group
        )
    return input_
def recv_object(self, src: int) -> Any:
    """Receive one pickled Python object from a peer rank over the CPU group.

    Two messages are read from the same peer: a one-element int64 tensor
    holding the payload length, then a uint8 tensor of that length holding
    the pickled bytes.

    NOTE: `src` is the local rank of the source rank.
    """
    assert src < self.world_size, f"Invalid src rank ({src})"
    assert src != self.rank_in_group, (
        "Invalid source rank. Source rank is the same as the current rank."
    )

    peer = self.ranks[src]
    cpu_group = self.cpu_group

    # First message: number of bytes in the serialized object.
    length_buf = torch.empty(1, dtype=torch.long, device="cpu")
    length_sender = torch.distributed.recv(length_buf, src=peer, group=cpu_group)

    # Second message: the serialized object itself.
    payload_buf = torch.empty(  # type: ignore[call-overload]
        length_buf.item(),  # type: ignore[arg-type]
        dtype=torch.uint8,
        device="cpu",
    )
    payload_sender = torch.distributed.recv(payload_buf, src=peer, group=cpu_group)

    # Both recv calls report the sender rank; they have to agree.
    assert payload_sender == length_sender, (
        "Received object sender rank does not match the size sender rank."
    )

    return pickle.loads(payload_buf.numpy().tobytes())
+ if not torch.distributed.is_initialized() or self.world_size == 1: + return tensor_dict + + group = self.device_group + metadata_group = self.cpu_group + assert src < self.world_size, f"Invalid src rank ({src})" + + rank_in_group = self.rank_in_group + if rank_in_group == src: + metadata_list: list[tuple[Any, Any]] = [] + assert isinstance(tensor_dict, dict), ( + f"Expecting a dictionary, got {type(tensor_dict)}" + ) + metadata_list, tensor_list = _split_tensor_dict(tensor_dict) + # `metadata_list` lives in CPU memory. + # `broadcast_object_list` has serialization & deserialization, + # all happening on CPU. Therefore, we can use the CPU group. + self.broadcast_object(metadata_list, src=src) + async_handles = [] + for tensor in tensor_list: + if tensor.numel() == 0: + # Skip broadcasting empty tensors. + continue + if tensor.is_cpu: + # use metadata_group for CPU tensors + handle = torch.distributed.broadcast( + tensor, src=self.ranks[src], group=metadata_group, async_op=True + ) + else: + # use group for GPU tensors + if self.device_communicator.use_vllm_comm: + handle = ops.broadcast(tensor, + src=self.ranks[src], + group=group, + async_op=True) + else: + handle = torch.distributed.broadcast( + tensor, src=self.ranks[src], group=group, async_op=True + ) + async_handles.append(handle) + for async_handle in async_handles: + async_handle.wait() + + else: + metadata_list = self.broadcast_object(None, src=src) + tensor_dict = {} + async_handles = [] + for key, value in metadata_list: + if isinstance(value, TensorMetadata): + tensor = torch.empty( + value.size, dtype=value.dtype, device=value.device + ) + if tensor.numel() == 0: + # Skip broadcasting empty tensors. 
+ tensor_dict[key] = tensor + continue + if tensor.is_cpu: + # use metadata_group for CPU tensors + handle = torch.distributed.broadcast( + tensor, + src=self.ranks[src], + group=metadata_group, + async_op=True, + ) + else: + # use group for GPU tensors + if self.device_communicator.use_vllm_comm: + handle = ops.broadcast(tensor, + src=self.ranks[src], + group=group, + async_op=True) + else: + handle = torch.distributed.broadcast( + tensor, src=self.ranks[src], group=group, async_op=True + ) + async_handles.append(handle) + tensor_dict[key] = tensor + else: + tensor_dict[key] = value + for async_handle in async_handles: + async_handle.wait() + return tensor_dict + + def send_tensor_dict( + self, + tensor_dict: dict[str, torch.Tensor | Any], + dst: int | None = None, + all_gather_group: Optional["GroupCoordinator"] = None, + all_gather_tensors: dict[str, bool] | None = None, + ) -> dict[str, torch.Tensor | Any] | None: + """Send the input tensor dictionary. + NOTE: `dst` is the local rank of the source rank. + + all_gather_group: The group for the all-gather operation. If provided, + an optimization is enabled where each rank in the group sends a + slice of a tensor and the receiver reconstructs it using an + all-gather, which can improve performance. This is typically the + tensor-parallel group. + all_gather_tensors: A dictionary to specify which tensors should use + the all-gather optimization, which is only effective when + `all_gather_group` is provided. By default, this optimization is + on for any tensor whose size is divisible by the + `all_gather_group`'s world size. However, it should be disabled + for tensors that are not fully replicated across the group (e.g., + the residual tensor when sequence parallelism is enabled). This + dictionary allows overriding the default behavior on a per-tensor + basis. + """ + # Bypass the function if we are using only 1 GPU. 
+ if not torch.distributed.is_initialized() or self.world_size == 1: + return tensor_dict + all_gather_size = 1 if all_gather_group is None else all_gather_group.world_size + all_gather_rank = ( + 0 if all_gather_group is None else all_gather_group.rank_in_group + ) + + group = self.device_group + metadata_group = self.cpu_group + + if dst is None: + dst = (self.rank_in_group + 1) % self.world_size + assert dst < self.world_size, f"Invalid dst rank ({dst})" + + if self.use_cpu_custom_send_recv: + if self.device_communicator is None: + raise ValueError("No device communicator found") + self.device_communicator.send_tensor_dict( # type: ignore + tensor_dict, dst + ) + return None + + metadata_list: list[tuple[Any, Any]] = [] + assert isinstance(tensor_dict, dict), ( + f"Expecting a dictionary, got {type(tensor_dict)}" + ) + metadata_list, tensor_list = _split_tensor_dict(tensor_dict) + # `metadata_list` lives in CPU memory. + # `send_object_list` has serialization & deserialization, + # all happening on CPU. Therefore, we can use the CPU group. + self.send_object(metadata_list, dst=dst) + + tensor_keys = [k for k, v in tensor_dict.items() if isinstance(v, torch.Tensor)] + assert len(tensor_keys) == len(tensor_list) + + for key, tensor in zip(tensor_keys, tensor_list): + if tensor.numel() == 0: + # Skip sending empty tensors. + continue + + # send-allgather: send only a slice, then do allgather. 
+ use_all_gather = ( + all_gather_group is not None and tensor.numel() % all_gather_size == 0 + ) + use_all_gather = ( + all_gather_tensors.get(key, use_all_gather) + if all_gather_tensors + else use_all_gather + ) + if use_all_gather: + tensor = tensor.reshape(all_gather_size, -1)[all_gather_rank] + + if tensor.is_cpu: + # use metadata_group for CPU tensors + torch.distributed.send( + tensor, dst=self.ranks[dst], group=metadata_group + ) + else: + # use group for GPU tensors + if self.device_communicator.use_vllm_comm: + ixfd.send(tensor, + dst=self.ranks[dst], + group=group) + else: + # use group for GPU tensors + torch.distributed.send(tensor, dst=self.ranks[dst], group=group) + return None + + def recv_tensor_dict( + self, + src: int | None = None, + all_gather_group: Optional["GroupCoordinator"] = None, + all_gather_tensors: dict[str, bool] | None = None, + ) -> dict[str, torch.Tensor | Any] | None: + """Recv the input tensor dictionary. + NOTE: `src` is the local rank of the source rank. + + all_gather_group: The group for the all-gather operation. If provided, + an optimization is enabled where each rank in the group sends a + slice of a tensor and the receiver reconstructs it using an + all-gather, which can improve performance. This is typically the + tensor-parallel group. + all_gather_tensors: A dictionary to specify which tensors should use + the all-gather optimization, which is only effective when + `all_gather_group` is provided. By default, this optimization is + on for any tensor whose size is divisible by the + `all_gather_group`'s world size. However, it should be disabled + for tensors that are not fully replicated across the group (e.g., + the residual tensor when sequence parallelism is enabled). This + dictionary allows overriding the default behavior on a per-tensor + basis. + """ + # Bypass the function if we are using only 1 GPU. 
+ if not torch.distributed.is_initialized() or self.world_size == 1: + return None + all_gather_size = 1 if all_gather_group is None else all_gather_group.world_size + all_gather_rank = ( + 0 if all_gather_group is None else all_gather_group.rank_in_group + ) + + group = self.device_group + metadata_group = self.cpu_group + + if src is None: + src = (self.rank_in_group - 1) % self.world_size + assert src < self.world_size, f"Invalid src rank ({src})" + + if self.use_cpu_custom_send_recv: + if self.device_communicator is None: + raise ValueError("No device communicator found") + return self.device_communicator.recv_tensor_dict( # type: ignore + src + ) + + recv_metadata_list = self.recv_object(src=src) + tensor_dict: dict[str, Any] = {} + for key, value in recv_metadata_list: + if isinstance(value, TensorMetadata): + tensor = torch.empty(value.size, dtype=value.dtype, device=value.device) + if tensor.numel() == 0: + # Skip broadcasting empty tensors. + tensor_dict[key] = tensor + continue + + # send-allgather: send only a slice, then do allgather. 
+ use_all_gather = ( + all_gather_group is not None + and tensor.numel() % all_gather_size == 0 + ) + use_all_gather = ( + all_gather_tensors.get(key, use_all_gather) + if all_gather_tensors + else use_all_gather + ) + + if use_all_gather: + orig_shape = tensor.shape + tensor = tensor.reshape(all_gather_size, -1)[all_gather_rank] + + if tensor.is_cpu: + # use metadata_group for CPU tensors + torch.distributed.recv( + tensor, src=self.ranks[src], group=metadata_group + ) + else: + # use group for GPU tensors + if self.device_communicator.use_vllm_comm: + ixfd.recv(tensor, + src=self.ranks[src], + group=group) + else: + # use group for GPU tensors + torch.distributed.recv(tensor, src=self.ranks[src], group=group) + if use_all_gather: + # do the allgather + tensor = all_gather_group.all_gather( # type: ignore + tensor, dim=0 + ) + tensor = tensor.reshape(orig_shape) + + tensor_dict[key] = tensor + else: + tensor_dict[key] = value + return tensor_dict + + def barrier(self): + """Barrier synchronization among the group. + NOTE: don't use `device_group` here! `barrier` in NCCL is + terrible because it is internally a broadcast operation with + secretly created GPU tensors. It is easy to mess up the current + device. Use the CPU group instead. 
def destroy(self):
    """Tear down the process groups and communicator held by this coordinator.

    The device group is destroyed through `ixfd` when the device
    communicator reports `use_vllm_comm`, otherwise through
    `torch.distributed`. The CPU group is destroyed through
    `torch.distributed` and its attribute is deleted. A coordinator whose
    `cpu_group` attribute was already deleted by an earlier call is
    tolerated.
    """
    if self.device_group is not None:
        if self.device_communicator and self.device_communicator.use_vllm_comm:
            ixfd.destroy_process_group(self.device_group)
        else:
            torch.distributed.destroy_process_group(self.device_group)
        self.device_group = None
    # `cpu_group` is removed with `del` below, so a plain attribute read
    # would raise AttributeError on a repeated call; probe with getattr.
    if getattr(self, "cpu_group", None) is not None:
        torch.distributed.destroy_process_group(self.cpu_group)
        del self.cpu_group
    if self.device_communicator is not None:
        self.device_communicator.destroy()
    if self.mq_broadcaster is not None:
        self.mq_broadcaster = None
def init_model_parallel_group(
    group_ranks: list[list[int]],
    local_rank: int,
    backend: str,
    use_message_queue_broadcaster: bool = False,
    group_name: str | None = None,
    use_device_communicator: bool = True,
) -> GroupCoordinator:
    """Build a `GroupCoordinator` from the given rank layout.

    Every argument is forwarded by keyword to the `GroupCoordinator`
    constructor; `backend` is passed as `torch_distributed_backend`.
    """
    coordinator_options = {
        "group_ranks": group_ranks,
        "local_rank": local_rank,
        "torch_distributed_backend": backend,
        "use_device_communicator": use_device_communicator,
        "use_message_queue_broadcaster": use_message_queue_broadcaster,
        "group_name": group_name,
    }
    return GroupCoordinator(**coordinator_options)
@contextmanager
def graph_capture(device: torch.device):
    """Context manager to wrap around code that captures a CUDA graph.

    A new `torch.cuda.Stream` is created on `device` and wrapped in a
    `GraphCaptureContext`. The tensor-parallel group's `graph_capture`
    context is entered first, then the pipeline-parallel group's, both
    receiving that same context object, which is then yielded to the
    caller.
    """
    capture_stream = torch.cuda.Stream(device=device)
    context = GraphCaptureContext(capture_stream)
    # Nested (rather than hoisted) so the PP group is only looked up once
    # the TP group's context has been entered.
    with get_tp_group().graph_capture(context):
        with get_pp_group().graph_capture(context):
            yield context
torch.distributed.is_initialized(): + logger.info( + "world_size=%d rank=%d local_rank=%d distributed_init_method=%s backend=%s", + world_size, + rank, + local_rank, + distributed_init_method, + backend, + ) + assert distributed_init_method is not None, ( + "distributed_init_method must be provided when initializing " + "distributed environment" + ) + if not torch.distributed.is_backend_available(backend): + logger.warning( + "Distributed backend %s is not available; falling back to gloo.", + backend, + ) + assert torch.distributed.is_gloo_available(), ( + "Fallback Gloo backend is not available." + ) + backend = "gloo" + # this backend is used for WORLD + torch.distributed.init_process_group( + backend=backend, + init_method=distributed_init_method, + world_size=world_size, + rank=rank, + timeout=timeout, + ) + # set the local rank + # local_rank is not available in torch ProcessGroup, + # see https://github.com/pytorch/pytorch/issues/122816 + if local_rank == -1: + # local rank not set, this usually happens in single-node + # setting, where we can use rank as local rank + local_rank = envs.LOCAL_RANK if distributed_init_method == "env://" else rank + global _WORLD, _NODE_COUNT, _INNER_DP_WORLD + if _WORLD is None: + ranks = list(range(torch.distributed.get_world_size())) + _WORLD = init_world_group(ranks, local_rank, backend) + if config.parallel_config.nnodes > 1: + _NODE_COUNT = config.parallel_config.nnodes + else: + _NODE_COUNT = _node_count(_WORLD.cpu_group) + logger.debug("Detected %d nodes in the distributed environment", _NODE_COUNT) + else: + assert _WORLD.world_size == torch.distributed.get_world_size(), ( + "world group already initialized with a different world size" + ) + if config.parallel_config.nnodes_within_dp > 1: + if parallel_config.data_parallel_size > 1: + world_size_inner_dp = parallel_config.world_size + group_ranks = [ + [dp_rank * world_size_inner_dp + i for i in range(world_size_inner_dp)] + for dp_rank in 
range(parallel_config.data_parallel_size) + ] + _INNER_DP_WORLD = init_model_parallel_group( + group_ranks, + get_world_group().local_rank, + backend, + use_message_queue_broadcaster=True, + group_name="inner_dp_world", + use_device_communicator=False, + ) + else: + _INNER_DP_WORLD = _WORLD + + +def initialize_model_parallel( + tensor_model_parallel_size: int = 1, + pipeline_model_parallel_size: int = 1, + decode_context_model_parallel_size: int | None = 1, + backend: str | None = None, +) -> None: + """ + Initialize model parallel groups. + + Arguments: + tensor_model_parallel_size: number of GPUs used for tensor model + parallelism. + pipeline_model_parallel_size: number of GPUs used for pipeline model + parallelism. + backend: name of torch distributed communication backend. + + Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we + use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize + the model pipeline. The present function will + create 4 tensor model-parallel groups and 2 pipeline model-parallel groups: + 4 tensor model-parallel groups: + [g0, g1], [g2, g3], [g4, g5], [g6, g7] + 2 pipeline model-parallel groups: + [g0, g2, g4, g6], [g1, g3, g5, g7] + Note that for efficiency, the caller should make sure adjacent ranks + are on the same DGX box. For example if we are using 2 DGX-1 boxes + with a total of 16 GPUs, rank 0 to 7 belong to the first box and + ranks 8 to 15 belong to the second box. + """ + # Get world size and rank. Ensure some consistencies. 
+ assert torch.distributed.is_initialized() + world_size: int = torch.distributed.get_world_size() + rank = torch.distributed.get_rank() + backend = backend or torch.distributed.get_backend(get_world_group().device_group) + + data_parallel_size = 1 + from vllm.config import get_current_vllm_config + + config = get_current_vllm_config() + if config is not None: + data_parallel_size = config.parallel_config.data_parallel_size + + # the layout order is: ExternalDP x DP x PP x TP + # ExternalDP is the data parallel group that is not part of the model, + # every dp rank can generate independently (in verl integration). + # DP is the data parallel group that is part of the model, + # all the ranks in the same DP group should generate simultaneously, + # i.e. the `generate` call in the same DP group should be called together, + # otherwise it will cause deadlock. + # to get group_ranks for each dimension, transpose that dimension to the + # last dimension, then reshape to 2D, then unbind the last dimension + all_ranks = torch.arange(world_size).reshape( + -1, data_parallel_size, pipeline_model_parallel_size, tensor_model_parallel_size + ) # noqa + + # Build the tensor model-parallel groups. + global _TP + assert _TP is None, "tensor model parallel group is already initialized" + group_ranks = all_ranks.view(-1, tensor_model_parallel_size).unbind(0) + group_ranks = [x.tolist() for x in group_ranks] + + # message queue broadcaster is only used in tensor model parallel group + _TP = init_model_parallel_group( + group_ranks, + get_world_group().local_rank, + backend, + use_message_queue_broadcaster=True, + group_name="tp", + ) + + # Build the DCP model-parallel groups. 
def ensure_model_parallel_initialized(
    tensor_model_parallel_size: int,
    pipeline_model_parallel_size: int,
    decode_context_model_parallel_size: int | None = 1,
    backend: str | None = None,
) -> None:
    """Initialize the model parallel groups, or validate existing ones.

    When the groups are not yet initialized they are created with the
    requested sizes. When they already exist, the tensor-parallel and
    pipeline-parallel world sizes are asserted to equal the requested
    values; the decode-context size is not checked in that case.
    """
    if not backend:
        # Default to the backend of the world group's device group.
        backend = torch.distributed.get_backend(get_world_group().device_group)

    if model_parallel_is_initialized():
        assert get_tensor_model_parallel_world_size() == tensor_model_parallel_size, (
            "tensor parallel group already initialized, but of unexpected size. "
            f"got: {get_tensor_model_parallel_world_size()=} vs. "
            f"wanted: {tensor_model_parallel_size=}"
        )
        pp_world_size = get_pp_group().world_size
        assert pp_world_size == pipeline_model_parallel_size, (
            "pipeline parallel group already initialized, but of unexpected size. "
            f"got: {pp_world_size=} vs. "
            f"wanted: {pipeline_model_parallel_size=}"
        )
        return

    initialize_model_parallel(
        tensor_model_parallel_size=tensor_model_parallel_size,
        pipeline_model_parallel_size=pipeline_model_parallel_size,
        decode_context_model_parallel_size=decode_context_model_parallel_size,
        backend=backend,
    )
@contextmanager
def patch_tensor_parallel_group(tp_group: GroupCoordinator):
    """Patch the tp group temporarily until this function ends.

    This method is for draft workers of speculative decoding to run draft model
    with different tp degree from that of target model workers.

    The previous group and the "patched" flag are restored when the
    context exits, including when the body raises.

    Args:
        tp_group (GroupCoordinator): the tp group coordinator
    """
    global _TP, _TP_STATE_PATCHED
    assert not _TP_STATE_PATCHED, "Should not call when it's already patched"

    # Fetch the current group BEFORE flipping the flag: get_tp_group()
    # asserts when no TP group is set, and raising after the flag had been
    # set would leave it stuck at True, making every later call fail.
    old_tp_group = get_tp_group()
    _TP_STATE_PATCHED = True
    _TP = tp_group
    try:
        yield
    finally:
        # restore the original state
        _TP_STATE_PATCHED = False
        _TP = old_tp_group
def destroy_distributed_environment():
    """Destroy the world group and reset the module-level distributed state.

    Calls `destroy()` on the world coordinator when one is set, clears
    `_WORLD`, `_INNER_DP_WORLD` and `_NODE_COUNT`, and finally destroys the
    default torch.distributed process group if it is still initialized.
    """
    global _WORLD, _NODE_COUNT, _INNER_DP_WORLD
    if _WORLD:
        _WORLD.destroy()
    _WORLD = None
    # `init_distributed_environment` also assigns `_INNER_DP_WORLD`; clear
    # the reference so `get_inner_dp_world_group()` asserts after teardown
    # instead of returning a coordinator from the destroyed environment.
    _INNER_DP_WORLD = None
    _NODE_COUNT = None
    if torch.distributed.is_initialized():
        torch.distributed.destroy_process_group()
+ ) + # local rank inside the group + rank = torch.distributed.get_rank(group=pg) + world_size = torch.distributed.get_world_size(group=pg) + + # global ranks of the processes in the group + ranks = torch.distributed.get_process_group_ranks(pg) + else: + rank = pg.rank + world_size = pg.world_size + ranks = list(range(world_size)) + + # local tensor in each process to store the result + is_in_the_same_node = torch.tensor( + [0] * world_size, dtype=torch.int32, device="cpu" + ) + + magic_message = b"magic_message" + shm = None + + try: + with contextlib.suppress(OSError): + if rank == source_rank: + # create a shared memory segment + shm = shared_memory.SharedMemory(create=True, size=128) + shm.buf[: len(magic_message)] = magic_message + if isinstance(pg, ProcessGroup): + torch.distributed.broadcast_object_list( + [shm.name], src=ranks[source_rank], group=pg + ) + else: + pg.broadcast_obj(shm.name, src=source_rank) + is_in_the_same_node[rank] = 1 + else: + # try to open the shared memory segment + if isinstance(pg, ProcessGroup): + recv = [None] + torch.distributed.broadcast_object_list( + recv, src=ranks[source_rank], group=pg + ) + name = recv[0] + else: + name = pg.broadcast_obj(None, src=source_rank) + # fix to https://stackoverflow.com/q/62748654/9191338 + # Python incorrectly tracks shared memory even if it is not + # created by the process. The following patch is a workaround. 
+ with patch( + "multiprocessing.resource_tracker.register", + lambda *args, **kwargs: None, + ): + shm = shared_memory.SharedMemory(name=name) + if shm.buf[: len(magic_message)] == magic_message: + is_in_the_same_node[rank] = 1 + except Exception as e: + logger.error("Error ignored in is_in_the_same_node: %s", e) + finally: + if shm: + shm.close() + + if isinstance(pg, ProcessGroup): + torch.distributed.barrier(group=pg) + else: + pg.barrier() + + # clean up the shared memory segment + with contextlib.suppress(OSError): + if rank == source_rank and shm: + shm.unlink() + + if isinstance(pg, ProcessGroup): + torch.distributed.all_reduce(is_in_the_same_node, group=pg) + aggregated_data = is_in_the_same_node + else: + aggregated_data = torch.zeros_like(is_in_the_same_node) + for i in range(world_size): + rank_data = pg.broadcast_obj(is_in_the_same_node, src=i) + aggregated_data += rank_data + + return [x == 1 for x in aggregated_data.tolist()] + + +def is_global_first_rank() -> bool: + """ + Check if the current process is the first rank globally across all + parallelism strategies (PP, TP, DP, EP, etc.). + + Unlike group-specific checks like `get_tensor_model_parallel_rank() == 0` + or `get_pp_group().is_first_rank`, this function checks the global rank + across all parallelism dimensions. + + Returns: + bool: True if this is the global first rank (rank 0), False otherwise. + Returns True if distributed is not initialized (single process). 
def is_local_first_rank() -> bool:
    """
    Tell whether this process is local rank 0 on its node.

    Resolution order: the initialized world group, then "not distributed"
    (treated as first rank), then `envs.LOCAL_RANK`, then torch's global
    rank. Any unexpected failure is treated as "first rank".
    """
    try:
        world = _WORLD
        if world is None:
            if torch.distributed.is_initialized():
                # envs.LOCAL_RANK is set when using env:// launchers
                # (e.g., torchrun); fall back to the global rank otherwise.
                try:
                    local_rank = int(envs.LOCAL_RANK)  # type: ignore[arg-type]
                except Exception:
                    return torch.distributed.get_rank() == 0
                return local_rank == 0
            # No process group at all: single process.
            return True
        # Preferred source of truth once the world group exists.
        return world.local_rank == 0
    except Exception:
        return True
class XlaQKVParallelLinear(nn.Module):
    """Replacement for `QKVParallelLinear` that keeps q, k and v separate.

    The fused qkv weight (and bias, if any) of the wrapped layer is split
    into three parameters so that each can be sharded on its own when an
    SPMD mesh is given. `forward` concatenates the three projections again
    so the output matches what callers of the fused layer expect.
    """

    def __init__(self, qkv_linear: nn.Module, mesh: Optional["xs.Mesh"] = None):
        """Split `qkv_linear` into q/k/v parameters and optionally shard them.

        Args:
            qkv_linear: The `QKVParallelLinear` layer to take weights from.
            mesh: SPMD mesh; when given, parameters are moved to the XLA
                device and marked for sharding.
        """
        super().__init__()
        assert isinstance(qkv_linear, QKVParallelLinear)
        self.skip_bias_add = qkv_linear.skip_bias_add
        self.return_bias = qkv_linear.return_bias
        assert qkv_linear.tp_size == 1, "TP > 1 is only supported under SPMD."

        self.q_weight: Parameter
        self.k_weight: Parameter
        self.v_weight: Parameter
        self.q_bias: Parameter | None
        self.k_bias: Parameter | None
        self.v_bias: Parameter | None
        self._load_weights_from_qkv_linear(qkv_linear)
        if mesh is not None:
            self._shard_weight(mesh)

    def _shard_weight(self, mesh: "xs.Mesh"):
        """Move q/k/v parameters to XLA and shard their first axis on "x"."""
        self.q_weight = Parameter(self.q_weight.to("xla"), requires_grad=False)
        self.k_weight = Parameter(self.k_weight.to("xla"), requires_grad=False)
        self.v_weight = Parameter(self.v_weight.to("xla"), requires_grad=False)
        xs.mark_sharding(self.q_weight, mesh, ("x", None))
        xs.mark_sharding(self.k_weight, mesh, ("x", None))
        xs.mark_sharding(self.v_weight, mesh, ("x", None))
        if self.q_bias is not None:
            assert self.k_bias is not None and self.v_bias is not None, (
                "QKVParallelLinear should have q, k, and v biases together."
            )
            self.q_bias = Parameter(self.q_bias.to("xla"), requires_grad=False)
            xs.mark_sharding(self.q_bias, mesh, ("x",))
            self.k_bias = Parameter(self.k_bias.to("xla"), requires_grad=False)
            xs.mark_sharding(self.k_bias, mesh, ("x",))
            self.v_bias = Parameter(self.v_bias.to("xla"), requires_grad=False)
            xs.mark_sharding(self.v_bias, mesh, ("x",))

    def _load_weights_from_qkv_linear(self, qkv_linear: nn.Module):
        """Slice the fused weight/bias into q, k, v and register them."""
        q_proj_size, k_proj_size, _ = qkv_linear.output_sizes
        # The weight of qkv linear is a concatenation of q, k, and v weights
        # along the output dimension.
        qkv_weight = qkv_linear.weight.data.cpu()
        q_weight = Parameter(qkv_weight[:q_proj_size], requires_grad=False)
        k_weight = Parameter(
            qkv_weight[q_proj_size : q_proj_size + k_proj_size], requires_grad=False
        )
        v_weight = Parameter(
            qkv_weight[q_proj_size + k_proj_size :], requires_grad=False
        )
        self.register_parameter("q_weight", q_weight)
        self.register_parameter("k_weight", k_weight)
        self.register_parameter("v_weight", v_weight)

        if qkv_linear.bias is not None:
            # NOTE(review): unlike the weight, the bias is not moved to CPU
            # here; presumably the source layer already lives on CPU - confirm.
            q_bias = Parameter(qkv_linear.bias[:q_proj_size], requires_grad=False)
            k_bias = Parameter(
                qkv_linear.bias[q_proj_size : q_proj_size + k_proj_size],
                requires_grad=False,
            )
            v_bias = Parameter(
                qkv_linear.bias[q_proj_size + k_proj_size :], requires_grad=False
            )
            self.register_parameter("q_bias", q_bias)
            self.register_parameter("k_bias", k_bias)
            self.register_parameter("v_bias", v_bias)
        else:
            self.register_parameter("q_bias", None)
            self.register_parameter("k_bias", None)
            self.register_parameter("v_bias", None)

    def forward(self, input):
        """Project `input` with q, k and v and return the concatenation.

        Returns:
            `qkv_proj` when `return_bias` is false, otherwise the tuple
            `(qkv_proj, output_bias)`. `output_bias` is the concatenated
            q/k/v bias when `skip_bias_add` is set and the layer has biases,
            and `None` in every other case.
        """
        # Same forward functionality as QKVParallelLinear, but doing qkv proj
        # separately.
        apply_bias = not self.skip_bias_add
        q_proj = F.linear(input, self.q_weight, self.q_bias if apply_bias else None)
        k_proj = F.linear(input, self.k_weight, self.k_bias if apply_bias else None)
        v_proj = F.linear(input, self.v_weight, self.v_bias if apply_bias else None)
        # The q/k/v projections will be split outside of the QKVParallelLinear.
        # Because we are replacing XlaQKVParallelLinear with the
        # QKVParallelLinear, we need to concatenate q, k, and v projections to
        # match the output shape of the QKVParallelLinear implementation even if
        # it seems to be redundant.
        # The concat and the following split will be noop, and should be
        # optimized away by the compiler.
        qkv_proj = torch.cat([q_proj, k_proj, v_proj], dim=-1)
        if not self.return_bias:
            return qkv_proj

        # When the bias add is skipped the caller applies the bias itself, so
        # hand back the *stored* biases. (Concatenating the locals used for
        # F.linear would concatenate three Nones and raise.)
        output_bias = None
        stored_biases = (self.q_bias, self.k_bias, self.v_bias)
        if self.skip_bias_add and all(b is not None for b in stored_biases):
            output_bias = torch.cat(stored_biases, dim=-1)
        return qkv_proj, output_bias
+ + Args: + model: torch.nn.Module to process + mesh: An XLA SPMD mesh object used for sharding + """ + + def _process_module(module, name=None, parent=None): + for module_type, wrapping_func in MODULE_TYPE_TO_WRAPPING_FUNC.items(): + if get_fqn(module) == module_type: + wrapped_module = wrapping_func(module, mesh) + + assert parent is not None and name is not None, ( + "Top Level module is not expected to be wrapped." + ) + if wrapped_module is not module: + # Wrapped module and module are different py object. + # The original module should be replaced by the + # wrapped_module. + logger.debug("replace %s with %s", module, wrapped_module) + setattr(parent, name, wrapped_module) + + module = wrapped_module + break + + for child_name, child_module in list(module.named_children()): + _process_module(child_module, child_name, module) + + _process_module(model) diff --git a/distributed/utils.py b/distributed/utils.py new file mode 100644 index 0000000..debf69c --- /dev/null +++ b/distributed/utils.py @@ -0,0 +1,543 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2023 The vLLM team. +# Adapted from +# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
def ensure_divisibility(numerator, denominator):
    """Assert that `numerator` is an exact multiple of `denominator`."""
    remainder = numerator % denominator
    assert remainder == 0, "{} is not divisible by {}".format(
        numerator, denominator
    )


def divide(numerator, denominator):
    """Return `numerator // denominator` after checking the division is
    exact."""
    ensure_divisibility(numerator, denominator)
    quotient = numerator // denominator
    return quotient


def split_tensor_along_last_dim(
    tensor: torch.Tensor,
    num_partitions: int,
    contiguous_split_chunks: bool = False,
) -> Sequence[torch.Tensor]:
    """Cut a tensor into equally sized pieces along its last dimension.

    Arguments:
        tensor: input tensor.
        num_partitions: how many pieces to produce; must divide the size of
                        the last dimension exactly.
        contiguous_split_chunks: If True, each piece is made contiguous
                                 in memory.

    Returns:
        The pieces, in order, as returned by `torch.split` (or a tuple of
        their contiguous versions).
    """
    last_axis = tensor.dim() - 1
    piece_width = divide(tensor.size()[last_axis], num_partitions)
    pieces = torch.split(tensor, piece_width, dim=last_axis)
    if not contiguous_split_chunks:
        # NOTE: torch.split does not create contiguous tensors by default.
        return pieces
    return tuple(piece.contiguous() for piece in pieces)
+ """ + partition_list_str = envs.VLLM_PP_LAYER_PARTITION + if partition_list_str is not None: + try: + partitions = [int(layer) for layer in partition_list_str.split(",")] + except ValueError as err: + raise ValueError( + "Invalid partition string: {}".format(partition_list_str) + ) from err + if len(partitions) != pp_size: + raise ValueError(f"{len(partitions)=} does not match {pp_size=}.") + if sum(partitions) != num_hidden_layers: + raise ValueError(f"{sum(partitions)=} does not match {num_hidden_layers=}.") + else: + layers_per_partition = num_hidden_layers // pp_size + partitions = [layers_per_partition for _ in range(pp_size)] + + if remaining_layers := num_hidden_layers % pp_size: + for i in range(2, remaining_layers + 2): + partitions[-i] += 1 + logger.info( + "Hidden layers were unevenly partitioned: [%s]. " + "This can be manually overridden using the " + "VLLM_PP_LAYER_PARTITION environment variable", + ",".join(str(p) for p in partitions), + ) + + start_layer = sum(partitions[:pp_rank]) + end_layer = start_layer + partitions[pp_rank] + + return (start_layer, end_layer) + + +@dataclasses.dataclass +class StatelessProcessGroup: + """A dataclass to hold a metadata store, and the rank, world_size of the + group. Only use it to communicate metadata between processes. + For data-plane communication, create NCCL-related objects. + """ + + rank: int + world_size: int + store: torch._C._distributed_c10d.Store + + # stores a reference to the socket so that the file descriptor stays alive + socket: socket.socket | None + + data_expiration_seconds: int = 3600 # 1 hour + + # dst rank -> counter + send_dst_counter: dict[int, int] = dataclasses.field(default_factory=dict) + # src rank -> counter + recv_src_counter: dict[int, int] = dataclasses.field(default_factory=dict) + broadcast_send_counter: int = 0 + broadcast_recv_src_counter: dict[int, int] = dataclasses.field(default_factory=dict) + + # A deque to store the data entries, with key and timestamp. 
+ entries: deque[tuple[str, float]] = dataclasses.field(default_factory=deque) + + def __post_init__(self): + assert self.rank < self.world_size + self.send_dst_counter = {i: 0 for i in range(self.world_size)} + self.recv_src_counter = {i: 0 for i in range(self.world_size)} + self.broadcast_recv_src_counter = {i: 0 for i in range(self.world_size)} + + def send_obj(self, obj: Any, dst: int): + """Send an object to a destination rank.""" + self.expire_data() + key = f"send_to/{dst}/{self.send_dst_counter[dst]}" + self.store.set(key, pickle.dumps(obj)) + self.send_dst_counter[dst] += 1 + self.entries.append((key, time.time())) + + def expire_data(self): + """Expire data that is older than `data_expiration_seconds` seconds.""" + while self.entries: + # check the oldest entry + key, timestamp = self.entries[0] + if time.time() - timestamp > self.data_expiration_seconds: + self.store.delete_key(key) + self.entries.popleft() + else: + break + + def recv_obj(self, src: int) -> Any: + """Receive an object from a source rank.""" + obj = pickle.loads( + self.store.get(f"send_to/{self.rank}/{self.recv_src_counter[src]}") + ) + self.recv_src_counter[src] += 1 + return obj + + def broadcast_obj(self, obj: Any | None, src: int) -> Any: + """Broadcast an object from a source rank to all other ranks. + It does not clean up after all ranks have received the object. + Use it for limited times, e.g., for initialization. 
+ """ + if self.rank == src: + self.expire_data() + key = f"broadcast_from/{src}/{self.broadcast_send_counter}" + self.store.set(key, pickle.dumps(obj)) + self.broadcast_send_counter += 1 + self.entries.append((key, time.time())) + return obj + else: + key = f"broadcast_from/{src}/{self.broadcast_recv_src_counter[src]}" + recv_obj = pickle.loads(self.store.get(key)) + self.broadcast_recv_src_counter[src] += 1 + return recv_obj + + def all_gather_obj(self, obj: Any) -> list[Any]: + """All gather an object from all ranks.""" + gathered_objs = [] + for i in range(self.world_size): + if i == self.rank: + gathered_objs.append(obj) + self.broadcast_obj(obj, src=self.rank) + else: + recv_obj = self.broadcast_obj(None, src=i) + gathered_objs.append(recv_obj) + return gathered_objs + + def barrier(self, timeout: float = 30.0): + """A robust barrier to synchronize all ranks. + + + Uses a multi-phase approach to ensure all processes reach the barrier + before proceeding: + + 1. Each process signals it has reached the barrier + + 2. Each process signals that it has confirmed the arrival of all other + ranks. + + 3. Rank 0 waits for all other ranks to signal their departure to ensure + that all ranks have departed the barrier first. + + Args: + timeout: Maximum time in seconds to wait for each phase (in seconds) + + + Raises: + RuntimeError: If coordination fails or times out + """ + # Generate a barrier ID that is globally unique + try: + if self.rank == 0: + barrier_id = f"barrier_{uuid.uuid4()}" + self.broadcast_obj(barrier_id, src=0) + else: + barrier_id = self.broadcast_obj(None, src=0) + except Exception as e: + raise RuntimeError("Failed to broadcast barrier_id") from e + + # Phase 1: Signal arrival at barrier + # Wait for all processes to arrive + # We need all ranks to confirm the arrival of all other ranks. + # This is the key synchronization point. 
+ arrival_key = f"arrival_{barrier_id}_{self.rank}" + try: + self.store.set(arrival_key, b"1") + except Exception as e: + raise RuntimeError("Failed to signal barrier arrival") from e + + start_time = time.time() + processes_arrived: set[int] = set() + + while len(processes_arrived) < self.world_size: + # Check for timeout + cur_time = time.time() + if cur_time - start_time > timeout: + raise RuntimeError(f"Barrier timed out after {timeout:.2f} seconds") + + # Check for each process + for i in range(self.world_size): + if i in processes_arrived: + continue + + key = f"arrival_{barrier_id}_{i}" + try: + # Try to get the key - if it exists, we'll get a value + # If it doesn't exist, it will throw an exception + self.store.get(key) + processes_arrived.add(i) + except KeyError: + # Key doesn't exist yet + pass + except Exception as check_e: + logger.debug("Error checking key existence: %s", check_e) + sched_yield() + + # Short sleep to avoid tight polling + if len(processes_arrived) < self.world_size: + sched_yield() + + # Phase 2: Signal departure from barrier + # We only care to block at this stage in rank 0, which runs the + # server side of the TCPStore. We want to make sure that all + # clients have departed the barrier before rank 0 in case the + # next thing after the barrier is a shutdown, including tearing + # down the TCPStore. Other ranks can exit the barrier immediately + # after signaling their departure. 
+ departure_key = f"departure_{barrier_id}_{self.rank}" + try: + self.store.set(departure_key, b"1") + except Exception as e: + raise RuntimeError("Failed to signal barrier departure") from e + + if self.rank != 0: + return + + # Make rank 0 wait for all processes to signal departure + start_time = time.time() + processes_departed: set[int] = set() + + while len(processes_departed) < self.world_size: + # Check for timeout + if time.time() - start_time > timeout: + raise RuntimeError( + f"Barrier departure timed out after {timeout:.2f} seconds" + ) + + # Check for each process + for i in range(self.world_size): + if i in processes_departed: + continue + + key = f"departure_{barrier_id}_{i}" + try: + # Try to get the key - if it exists, we'll get a value + # If it doesn't exist, it will throw an exception + self.store.get(key) + processes_departed.add(i) + except KeyError: + # Key doesn't exist yet + pass + except Exception as check_e: + logger.debug("Error checking key existence: %s", check_e) + sched_yield() + + # Short sleep to avoid tight polling + if len(processes_departed) < self.world_size: + sched_yield() + + # Clean up keys to avoid leaking memory in the store + for i in range(self.world_size): + try: + self.store.delete_key(f"arrival_{barrier_id}_{i}") + except Exception: + logger.debug("Error deleting key: %s", f"arrival_{barrier_id}_{i}") + + try: + self.store.delete_key(f"departure_{barrier_id}_{i}") + except Exception: + logger.debug("Error deleting key: %s", f"departure_{barrier_id}_{i}") + + @staticmethod + def create( + host: str, + port: int, + rank: int, + world_size: int, + data_expiration_seconds: int = 3600, + store_timeout: int = 300, + ) -> "StatelessProcessGroup": + """A replacement for `torch.distributed.init_process_group` that does not + pollute the global state. 
+ + If we have process A and process B called `torch.distributed.init_process_group` + to form a group, and then we want to form another group with process A, B, C, + D, it is not possible in PyTorch, because process A and process B have already + formed a group, and process C and process D cannot join that group. This + function is a workaround for this issue. + + `torch.distributed.init_process_group` is a global call, while this function + is a stateless call. It will return a `StatelessProcessGroup` object that can be + used for exchanging metadata. With this function, process A and process B + can call `StatelessProcessGroup.create` to form a group, and then process A, B, + C, and D can call `StatelessProcessGroup.create` to form another group. + """ # noqa + launch_server = rank == 0 + if launch_server: + # listen on the specified interface (instead of 0.0.0.0) + listen_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + listen_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + listen_socket.bind((host, port)) + listen_socket.listen() + listen_fd = listen_socket.fileno() + else: + listen_socket = None + listen_fd = None + + store = TCPStore( + host_name=host, + port=port, + world_size=world_size, + is_master=launch_server, + timeout=timedelta(seconds=store_timeout), + use_libuv=False, # for now: github.com/pytorch/pytorch/pull/150215 + master_listen_fd=listen_fd, + ) + + return StatelessProcessGroup( + rank=rank, + world_size=world_size, + store=store, + socket=listen_socket, + data_expiration_seconds=data_expiration_seconds, + ) + + +def init_gloo_process_group( + prefix_store: PrefixStore, + group_rank: int, + group_size: int, + timeout: timedelta, +) -> ProcessGroup: + """ + Stateless init ProcessGroup with gloo backend compatible with + different torch versions. 
def stateless_init_torch_distributed_process_group(
    host: str, port: int, rank: int, world_size: int, backend: str
) -> ProcessGroup:
    """
    Build a ProcessGroup without touching torch.distributed's global state.

    This stands in for `torch.distributed.init_process_group`. The returned
    ProcessGroup can be used for operations such as `allreduce`, which do not
    depend on the global rank; operations such as `broadcast` cannot be used
    because they depend on the global rank.

    # TODO: ask for help from PyTorch team if we need the `broadcast` operation.

    Motivation: sometimes the total set of processes is not known up front.
    Say processes 1, 2, ..., 8 already communicate, and processes 9 and 10
    also need a channel - but 9 may or may not be the same process as 1, and
    10 may or may not be the same process as 5. Working that out beforehand
    and adjusting ranks and world_size accordingly is not always easy and
    would interfere with the main channel.

    Instead, the main channel is always formed among processes 1, 2, ..., 8,
    and this function forms a second, independent channel for processes 9 and
    10, regardless of whether they coincide with processes of the main
    channel.

    The platform-specific initializer is tried first; if the platform raises
    NotImplementedError, a gloo process group is created instead.
    """
    init_method = get_tcp_uri(host, port)
    backend = Backend(backend)  # it is basically string
    timeout = _get_default_timeout(backend)

    rendezvous_iter = rendezvous(init_method, rank, world_size, timeout=timeout)
    store, rank, world_size = next(rendezvous_iter)
    store.set_timeout(timeout)

    # Use a PrefixStore to avoid accidental overrides of keys used by
    # different systems (e.g. RPC) in case the store is multi-tenant.
    pg_kwargs = dict(
        prefix_store=PrefixStore(init_method, store),
        group_rank=rank,
        group_size=world_size,
        timeout=timeout,
    )
    try:
        from vllm.platforms import current_platform

        return current_platform.stateless_init_device_torch_dist_pg(
            backend=backend, **pg_kwargs
        )
    except NotImplementedError:
        # If platform doesn't implement stateless_init_device_torch_dist_pg, it
        # will raise a NotImplementedError. In this case, we fall back to gloo.
        return init_gloo_process_group(**pg_kwargs)
+ """ + if is_torch_equal_or_newer("2.7"): + pg.shutdown() + else: + # Lazy import for non-CUDA backends. + from torch.distributed.distributed_c10d import _shutdown_backend + + _shutdown_backend(pg) + + _unregister_process_group(pg.group_name) diff --git a/engine/__init__.py b/engine/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/engine/__pycache__/__init__.cpython-312.pyc b/engine/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f9207e9c97c8f8baf6c6959aa79fb4642f491aa9 GIT binary patch literal 156 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJVS?ibN7U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?c_`t=4F<|$LkeT-r}&y d%}*)KNwq6t1)9YO#Kj=SM`lJw#v*1Q3ji}jBD&Bb|HC`F+pt?7y@A&i(V8oGcrD0msJ%JdfFJ|3DV;%hX=p`Q0>|?W*kon`D#h z0h?ES+P!wRr+8ENnd(jDr^D;uXPP$+&y+xVFvFW+CqJoyl%Uh=3}$*WgIV6JV751# z-5mi}Fvpw2?`eVDV4gRR-_ry6!2)kVu+Uo=EbVKG@)G2sU~f zH6BgDW^XgUX9Ze{lx&k|b zJH0!DyS%%C9o~*$r?)fMk9dzTCNFR_c)$1l;4$wpw&w@BgU7wc`Mn_UK=6e3MDRiHgKRGhJQRG``!K&3 z1-!vWypQmEap2M5V_wt*PTwdZTt*$muWgM)7|Z z|L5^Pnqu=m<{x{-uGVDl2VgIiYIXY){)_&i3K0EIj%L_w-lzN}{trrZTgm03{|OW2 zX{OW@MQZS8_&?Nd_dn5>LI|lb(cb5fnx1xeKRiE9vQucPZ=joj{)p7#{}4ER$Ulz% z52-XlX;_EU3e+WOx&JAt&Hprz4gRMo^uBb-1IkCGcK^rxPjP%y`ejg7fb#JuDR z9`;Y59A1z%JZ@9tF+O? 
zrx1MR!)+2c?*!tQ3T~?{xNW|AsN_Tra?L<$w>|9;De-=00ql0<%xC?X zDYiW}>At6J(vDYB)Ub69FKX1CpziX&_)2OL{c}p z;1ka~(GwUz+z@`3EDy;Po81)aeWAV~fYS-Q4=g1AuDwiL&+J9>Ig`#Xk0eFOdRf)2=}fdKQ?oU+s>@r=7V{bvVy1vtN}yB%==w^+>|!!J%V2wfF2pLy|uryd4?|LCBQdvhHf8Vcy~%IiiFr4hrSpnKTgJ3{F^D^gR|9g#J7bs`W5YQf}i`9vbJPa5hCx1_~0cLj!e&vXxr`r}!gk)51TMF&nC z>FWz1eFLF>Ex+SghtPTSsDP5AvcGTOJb3|PZ%_F2&OrYViYZ;q;+Bkf4#GU^mm^1p zsnFutd;5;SDi270)XsxL{loImsW4>~7*mF+j9te@LLtAbdCEH86F#H8=NunGu_LB3 z!KqRmI>b@t80Q^2)D`LlK|X$z>8LTC{z%V2z%QZPdi`)j28Q9UvU4CDkq1tVpkj5M zM?OY|WRb?QuJdRpk)r|l2%_|nsi=Sb7mzZe5RMK7271XUld_W=94y%bp@E1mFx21g zmnn7~!y`N8-qTb~*`xMuTZ`jx zJoRi(Kz;u%EM^vzfO0fR6axqvh`29+K4V zQ+{`^+SvS(J2K>s+uKn?vKYo9HtvK_9;Ske7l?AC3{~ZCs{9yY6u0|jFP>qFtJ{6? z7h6|Eg2O9@2ZlWn`1VIU=SMbd+IVJze??&6)QaH?D8|r=vk)#T{2|m`{|cadBawkX zxPACSJR8*#;c?lF))?>YTOo0J0^#lLCP0|NJqY9H7j3gCX_+NIELu7do+_$~x$1s? zH?QJtTWV&>-NMT0!ZxL_ZKkk%qG%#5wq)I{Gq)alZCNb8^PSX`qTDx&if7YOa!USV zHWlz+g!94hgBjVjk7ce;ePPL})K}Bir@ovLPetpO9|Wx>g%d3MDRim|C*87yM?A}j zu6!B}RMg?6FgSYYX(cZf&D!v|Uhp$|%;wu%WidzTJ3u5>7RAJC zGETdldt_aCU?W znSW*(fSkwBU;RZ$X7F#2r|QC{J}pZnG2fVA5Vk0 z>^&{_!#5{B?idWCe@&zA1X7g?FHEJbRp!DG8tM6@fnE&FLL}N{c?9H-;6J<)#zouR z?3`yeUfLKdZn{x)C);y3ukgy6kDVXSo5`-fyzRx}FI0ZIaw@z2TK80TOU%*2Ss2fx zI^aX^ABj8Ywbz_cBtls3q8=hUqqeAh1Y`R!RtSs=X=E2sA#?4ctsT@Oxd-~(k<)%R zv2{}j?t!qo$4#BrK;J+wySF2y)BsRT&f+1TgTd9rttC4HeHk&3FM#rKqE^GWXnQle zc)}6OuAa=^c0KFHrYZNj+r=^WwwPm^nyxH8UEzcz={1DA60|rYA>(#*9rmZ4QQO=} z50aHKy3#sXVUpSyqJ-uoE}?Ex{aS%m+hPr=oeDjSDNySiV4zQZQ_|*X*_a%Sl&J=;-==J(YkfS z3-B1GIor>FMC~q?+xgAP8~ZMwyK>=L{d8W_H!I(Jhw|)$r4@E`=BYiQ3oob0qi`{m z)k!tj94#NT5_=k;=kXsdh5^=ZW#>*2cNkOIoiRrzmy~tsw8P6s7Aze}&nc8o>S*g8 ze}rpXLKSoO$wR@!iWqLUE}5+e)l@R5nDdoP*nT-bYS+rAM3j#_1_m;Tv6fVsBxXD% za$3zNbNR>@K^m>1p%&Fri$eJ{0O2YaTA}1`|McKQ%FR8ucHMUTpl#a=g`XaLFHGX- zg9X+0Pi9oxX-?@tkw@eYl2>ZPZmm>!GRIBm5?rqmxfBK$NZu52hVhzXD!VJ@=u(SB zuZ+NDwkH+x}G96s5emF327UtfZ&8w0K(y+ylE zq7OLP=04|lkA(g1lR?ar&i2SB-I#zx{5_I;2);$w?aWW6ieX*%e|4$EwI1~^YamP^ 
z_k?==qWY(+oqRl7eR6dd15=h6YE!7k6q=_`ks;ria~OYyE!xC&L_I-Jb$#=}TkA3K zXuP(3I;aM;z59Ch)@*-hL9H70vn`JAgn5ew|;8ecn;=a4NrB^OY z#TI%ZStIbHU5Qd9G?FyhM0de>>IY`Ql3^)+`YHeA|w`{9dS zGp@ob2dT%kq^ZI2?(BBWf4T45ER= z3;&FUHI^DpRmH?d9fm*|cWB}v)j~XE@5ihwiY6{OK9?#M>d?sb@!KW6=^G=th(3}# zopRPD*G60-E=H^je`jrB3MK85V@@c#zmGaDIihw$oQ&AR&InCm_!mt@C=MxI%6Ji? zBSJ$T&0UBTmD1uoJh!bK($$Rxhmr`R>(De+!WdFM%Nj6uVv;xwh<<{afXeD|e1R zer4a~54<#@^Jq|gkd7!zp`!tqL z;1Mf!!vVFuhuhqk8tKfMAUT`d%>(WJc6U<%>qw8#@HFa%^J7gds&I(gmt$1QZ9OeS zGN~m?6$C5;oFc75SBj+M40+Y zlMpDj%47K@fWAWLKZF4}a5ukvqVn4Dsr*(*0ax*rho@XMXrcKfS9+dvUflO?y3Lh0 zm5XeLaVLA%tSu!ieI{$yYwKQXdc7d#Jb+ntT6!#d7vzbv@Jh>6M)mcQo0&29Y9(X! zTX}^qoSwKaRobHDwXiH8@!^^AtiN&K*5f26HbYEgX2tSaAS6o4$1`WsZ3UGx<<-}_ zl=7CF58W<>$ZWLE~E_c#A2^53W54qHN*%+ z*8*YaLe$`Xd8tDU!9J*uk&O5m=~Ux^@>}19fhtx-xrlgb7*@wd`i_NrNOtp%gbC8l z60s!{D?k2-+V;39(x@<=J~E8G2*0|XN+qujq>R>y4qKAY7^&2`MaI+BeT}$-8aK!= zMRRnkgOisDOU9q$KYSSmW-;!%FQr}Fe>bOWB6BLI_Tuil*+p0OO=T~=*!gBg&I=Vv zM&*pNaN1d+I4fRkzkzP{<+7>DRd<}LX?>|Q<|vuXvt{MK(DmY`SW$zL+3*iVW#b() zuF`4O62-OT#n6rJsimua;95PKYAY_M^$zxvX3H#kHuqBQl?!*A%ihc`0Uxe^ao6Lf zMcIU*o!+Xbk5iYH&T{*q$wKNtsP|>e)#a~J(okK+IzuU1!B8lGg+QN~v-~wc{)VtL z4x%-nvWX3`?An;4c5XYMHc*HEcj{s26J5+##&A+0YCkJFr7UzxcAD6>r2Hb?<2mgQ z3?G(Xg&zoBnm)-a^(Y%*{D(_msI%6LwwE?ebc{Q%bX?ARY2$lg2PjYF z=3_xSc!u^!Mhn&F3P=FV)O|jzZT^^+0+AIN!JhMB+B!+?4TKYNkY-rio>$@x?HN|} zCubYO1EYS2>;p(4{w>>cr|nAWV?RxvqYP3QNk7rDF!QiKZHqdL!OmEkskP%&hdzi( zkERalt=nK{wDS?W++~d!%P`GsG9uKZ_$PSF2cj8+G=)$4U39_SBFs=8+^J-ee!2|I z2-PAc790~|n-9!Axqx1p6a=hePDn7CUMBsbPGhlPkY?6NKRtGt(ab>_y(ImlRLLQ7 zWGwTEjA*8m_F_sjCF5k3Yzf@VA;`rdxjqDQ!55B*gFcUqw2pU%Dhp4@4e%Ri)a zj$hF}>^6k+*@SS`3V**sICK0agwvl8&iemGICK0agtI0g92)8KPft%rM0XQ8sMe#? 
zHYxW@_Ayt~8FigS&d9lu&BAvgQn#6EoPiyr>XGzIu8pBL6Joht!1Y+>M07VjG_LKT zKT7bqV~)=i33RWXlFt!?w!W`dDE!-(?eBrxG-`^#8WDW(p0rD>V#QjY)N8Td_-V` zZ!4)OWr=i(XGE717?Iw)4I2_dmvfvG#ZTH7j&fUV1*fcp1#6doHTovX;A~~@H|N39iZev(rB%( z7l&wYIAZ=4wYR|}TnN;q;Oe33k4ksQn5+7~r z$uyFYd&oFNRELZ+QCN6+j9hM$(F;Rc8C54A@jP|+T0i|Y+B|QGCLs&Zg2E)nXcs=U zpv!)@igsqJmQPo0QmQuHuA8db9xLAd^Sfn@Z`|<;E;SlJ#i8*B@~X<_)gJos9(a$5>5vtj>7eZvl@K$ zhLbm6!_gMJ3KE(Ow#{fdFaI74^Zp{wB^J|G)=-|(cvEdFg5VW_G`7%MGdcJ zse5Ekm6X}7oBV$x=JMj@pvjx+4!+NZA5-yjd4BIEO9yaXfepoKp9*Xb3r-$=ah z^hmG=C$z9tLMJhCFe?(z@X?VvpD&(9$L1j1Xun;4ig=8Wahi-U84)s0l0kw=-NH}r z86L)w#+Dpz({U%}lYBxip04g{fGZu4RF@xc>Wt2e(b2FRy^-VGR#?rlza*~IwBk;Z zAU>>R%6~}KqPfuy^Q4^C5%r(CsgRc>GUUGlhacfT{O4$-7i~XFw>7T3JT}pNCvVxU zV;A?%{^C&982rs^31I_0x1)$u9d#ty9H2$20GiS5G{A z!!cFv8PCB2psV27ZI`xP&rq_PW=gBB2A>O#?|Cb)VxsB#raO5a0Lp8o%i5K)_E_0^ ztZS51Up?~tkvA*c(-kX~ij`9ptACndE1<=W+@k56S|z9UW|NY$3aq9}mn)^qr%FBJ zd%z{XD|tA5;u&|mnODSwj%&p@jWX_lpZevu zI+e0blji)n z>#CS&z21Ar)r!+83Q5ta!Yz05w!Brh&Kh6mxhehT;LC$kb(;j>9%XSyghhF9GyGFc z++o3cP1d`%l$BXn8?SE?j1aT83aeu^D{ocaa@{`s`r6kUV@L0gEjcz-*d23qV}09I zdZlmT_;Z2j(nh7U@!Hv`()OwB6<^<=WN%K&;Z?WR->!dk>uc%Cnmw`Ad#Bv{?zr|7 zhx{v!k8Pg7!JE39hio)w@Pn2Uah{}r>yFls@p9hL@CQFxguSPPUqDq zc{SHE-^gnf`Tnqy-!z@yuH?7hIQvHa`q}rV*b0}-MpJCA!i$IIuEEh#CUwN`Y=>c9 zi?%unWwrh+hB_F@+I!xQZsZuB(xeW7#~W@{rg0}hd#n98#2j+t+$TO+DLNx#hM#MQx%@6?Dm+WU9C9)=sz(Rd*>l^)>mzQ8|jyM zQ=oNb_on(CR)^obY5sJ7hQupXK+u+0($DJzBGaEvLlM@dNhK4095f$;9~w0${k&QJ zbV=aSG#m{el78N76E0oDQJRx}UY9=yA!cZZ>LfC(-dukk5WJ>`IJqql`ThbuPMI2~ zdJ9~kzX+UoMh;HR7Kma-WNSS1d9{J`miSA-!=)kUOjOd(TgE(iMGv9qD`f`ETh1ku ztMQLSKn8klrOgEY)g3p%y<9 zR!Ki^4Rb2e5Id5{uzKD8Wyq&uM$``Owm{TFODjVv(RduRKs5TB^z@f%xT6-hW)rTA zaV>+#EpW?`t4&nfb*|<2K)238R}dPSPi_7dsZy+dc|E4kE16&W;QK5>Ux7SZ$$7iV zzj{uatI~Y-S@>MzZ$eI0YX}tlo9ZW2uIqb%UgzcReF&G(;$g z46AnoBitHd*aESU5z91$Y=PLsh*}LnBle`9cQYeYJ;V0Fvq@xFy;~Te>Kmqdx0>?5 zf%rMdZu{{ZUeTFv)_h3|vxd!2^(j0NJ5|1fy0*ASn#Kpf#XZqN|d zED%Q-v5^r62ES;5xStW5G{kiaL^mTgYls^bhzA(4MMK=QKs?B)-O7me!CMxHhfH;B 
z8{=vRzh;4Z*zeV4)piZ{4GY{O&>$7P@_icawgv7{X#8eKI~d^@{FVjceH{0l8sgg) z2p=PMX`#GkfjH^!K`0#>;=2}zQ;g`;5Z|*v^fIDLL%eQ*kW6{Fn{a6Nev{iC_S-l3 zeG9++T!wo!M9czlni2an#H0mcfD!w(kOxg6A7EdO!8a^?onc=GHN=bsf=*dwNQanj zt2e~CeOR*%vF(WF`%Meq!zQV5RKxwf1@3VZ?tTsTmIY2mi0KKV!()VI8Q_q*bs2#3 zABc6g#_1;(oJLF`9@lXHVu3sBKd0B_2Q=KQ1r9pw2=N3X+6UjZKwMzNgM{Gtj!F;d z@qNEP3K`v`nubQX`UbuaGvCp{Us&)RW4>Mu@qa83A7I2IrjQ@iLw`(Kf=g?#1M{0$3KIO*ybPnkj96rcx%BN$T7MQ1@X)h$nNge63 zKz^8Oa*vM5wZMD?n)4abDIHU2fw|=WXi^R9B|KX5$4oUw(s`6y@VLyW_v@HS3(Utk z^?l5{)%y(l=w~0#BFEhpKCW;IPwSXk3(Rwj8PG9S^N#1a_Y)ky zGdiZ(0`r1N<^*)4)vV={sBNDb+%cC&Ldy>7Jlf|Xy;nJL$5vWkCOFnZjHv}? zwFTzWrdSU%vJc3$7Rb+Vh>sJ7+i#^L>#gXs$kPl-HqpX5?M0;4G_M6NOgPl2&zbtk zh#oe*B>lXf=ah`-7^_*(7Z`JvFdV;Y(m6d2FZsWSaL=*-FCu-rlLCO%`z6FPLpo0w z#O=DNwqMZPzHD+E)!cr~l!o_fZoh7Fi)wB+Om1W3=64_-$1LLUvMH1gXuf~LQrbMh zd|m;chb;KqH1WBpBfS>Lub7Ze>c~eekY7bf{$|3Q?J34nojGZNf7SmrOrHhj zw~+6v{9m6l2YZ@*Q7&1{pnlu`4ZZDtNJoY&`2P;_?i&d$>%)ZC>c~g*I&$0Jg1CN! zc~Og!EqHyC>&Uo{v6?acE@Lhc2KDIo{NK{!`BBa7_x<12-9Dzd{ek~A-R&~-r<6Zs z5r^+E|BvgKr!6pl$Zg>n!f-kr(zALx|HxdIkm6@`PU9AwzRRIq(J@wYqd#WMbBrN5 zV>K`O9>?f;9rKJuSbySw9r5{uj(Oe!^QVk?fiRr1TIrK|%KpqG%RZ?iu1TNL5r3YP z_D^wGm5B3%MOc5qalWc!tY%7o$!VV;4Ep`=vyV@+4=O*anbTj9G(?S$i5j2Ko<2i< zGi`oqQD3x(lVXx1pXG1}dEEjz$@2O|#!#vJngwQx-+VgF`%ls>=KCtJ{}b_n4b&eY4dS*Ohx9d5Jbnz!Z=u!^ zCMtOPo3wH392aeb>5#r|;_(xX4TmJ9A|Ahuu>bi&YRm7$v%2`cft@&aJz5EqSMqTX z9Qy-j4<0<^R<9#yho&@Lk8wC&K=-ZSh=6)q2JaYW@^JyWOeCIrx(}P3wE3W3q>(Q3 z4J6RzQm<5idNVE|i5G>>3=FHM49FYb#t|>}hj<4VcbNo-BR+rdlplBRU|5r8p&1kb!Ce$4e6Z zPzrH5O1w1DSsw27QF?dB%MuBvFPy@~IljQDcts)sY4pn>+{ge@QW$z}YjqTA-MHCC zPhUI_*|r-0da-0_K5M=QoM?2*muK~F}MkWj@_7ubtrjm z5~1GS0M$F6H0%ps2;s(pAza?Wdu8!#9ML$XH9A<-Hbidm3K2_j0gq9!3w=Gq0|=L| zm4W9GoVqATaN{y4p+iBq1ZgBB;m#RUK1y{sp3gy^!wvrfRC{dPSoKhJ82Rxgq%s_S)iEr|1qPm1&7C5Rg4+9_sx_@p8^C z-R?V$3zK|3xG@PeK+CT%j=V8fj$*u6xZ_@}-X7Fw>|KW9S>ba%$j)J$Z;9vj4v+YT z)LWW-{iouk{jh6ir05nFEwkl6M!S~3M~%B&oS8yW=q4dih~anzQ369q)ErW^>O4W%oSMHP>JXGajTfzix6 
z;M#%=%?}I&af1`;3)M2n^LUZ!5)7Ih^R*&|t0wPDOp*ky>XF58JWrM2141=9UPO$< zwMg3O%J>oyAtvHgfxf ziWdUlS8vMlMTXU)A#TEr>X|MQiV%4tM(Q0^=uW98R3iWxK*d#vbC`aD;;s=?BI=6t zb{)V?Qt?c+X2`fXj?edzqbf7=#gQ&BB-e1fsBZ)(I+2PV8T;^>Bm1HnuVS*8f!ckQ z@H<^$(BnIcyM<1vO|g{j%+lJZ4|?vtP_jNa2Dp)-iNCL_@;Xl+~& zkko+?P7cIt1Vt0@df$XY3~F^nWzueS^Qngra6^}R*Iho&pW%ELHslcnJyI+ItXDa9h)>P2uG0yUV9jE+#%MSn||xy5rdCloSH(#3Oz`Tji-fl^UP z0H=HTgnJ>K0MYIn6mD5+ontn|+BP@^!$Xim&M(k0Mgse&!3&jhP>lAwZ9vXTe0iw9$d1z4n9%_GDmtRH+ zgs7pq(1-D--fF3owCJ*90BZmq#d?nfHJkJ0UXssmlx$WE%>YQB2PxgTXH(|(&ws)lkISt+V<7hSa1 zuH6zT(>iTD@AL={mT(ptO3Nds;+be!bd3tSt=z;|$v+})`^Z=Ymhp5NXQ9V%4A7OU zI3pJJ2l`a5P7&WkrGgwEL?2oQUvlXiEjOm%?Fp0}sMg^xBOn|b<71H0bg4%v_1uiy z$zFT6af8x$>~{V1=KB@=H69!9yyL37n~jr;>Osky4!VAgwooqvLz;Ry5sk{ASk6S+_cW=g_N% zrYd%gcV6Cq*H!rJflCKw@{i5rm(S!EU(I?h>t`8`oZQ(An`^DA37xL?pL1{7qXTVx z6!I;1V)Hd!U!*lU+@oO|q~@OV!-bQ2Lmx~}BEou8zlmxrMM{n6jn3+@oaB(wDs8w2 zq(n+z95P&jW|?B8Y%9G1VFHy?YFpH-xdB@vry0qLrwbrw7@4^Uo2DaBzX;^tGO}P1$p68}!iABD>6?t@tIqG8 zQ^5WmV_EI&-dO4H8CkM$45WWxWa+|4g#Q*}%NE8;|H#PlMIir)krj(T{!d0$E&}-@ zMpi8X`C~>_FN{R$f5O-$i^BdhW0x)p`!9^GSrqnvG1k2(?7uR0*}_=qe>1YyKti_U z%#~?UouQB~)l0ZfNNONIQX`qVjSo5AY<@+PiuaH}vm9=V5$?QF?Wc@vT^NaSdYiG! 
zRqSG95d6Hue%cnr|6Rs<7KQy8W7`*n{Wr$0SQsn)J0n*vj6}+Q&e&Cp!oJ5?J|22+ z`2xPbVC))$uQ50^28lvt09V&o_o8bJ>;k>019ih;+2j2OVv;JYGh#9?^8YY${lZA3 z?>`y4VNuxsV(i9+vC{u#Xz4yf(qNNoN2FLtBiME(*mjw1eYhIs zv86WKqZpfwW#H3>C=JMk!B zo?6c(m&>`^XXNez^*)dB{foh)r1KeldXeY?Mh`5E2IoS?^1TsypE*xDfi7b7nT0uX zo)j}Wut;jW^H(t3SQh%$}#$e9E{QYk^+Vh1_#l^_ocG zQp7o1`k)c#`LLYA8bO$&p?#VslUQ3Pyi&k?56- z{?sDTs~CNCk?7Too-oizqt#gK)5%imp6Oe|Tt8!Qy(fAtqd&Vy^g2erxJdMRMt^RR z=nahi{36jC8U2MtqBk-6nt?Xe*V;MtHE*8!`cg#4E>K_dqxqI4`Nai#<}cm6ZuCVc zw|uKQ4+*vz>B*D6Y^3KN*e#6xwS}?VmbNnb*B6Q2#^@UhqY>xrjD2}wEXU(MM*qgb zXmH-a*jEg!sdVe-ly1R1rF+x3d~$)(Ekte?T9pjPd?#X4AbrJ%%{|e(82wcP4Zpuh zHt80bV@1-d($@x?L@JDDv`CCgumbQ~pGzGpjuz{FzrGbf-F8b{!5L{0{v*rvw>I%6 z_({Kg^BZD=5+9-{(LO|h3xUQ;=821bRUAG_f$xUkqZB3PYd-Ip109?LoMv?-V=6se ziAKb2taP5z`<;l6U7+;J5Sy}jORsZ)R;)^;+m!oX##lj08?b-b*#DW|XP4&lo4>Tr z-5lcYl0SVcH-9*L82x*TMDJzvw-$-s$LQZ*Bzixizr9HG0WE+2fbhRm&K~4&URz{1 zhZy~xMWPQg`VSY0KEmifS|s`?qrYpQO*LCr{nBf8`8+lIkBzHh7pU14sM!_s*6jNc zn{w%UMr@=%5q7DGbTIZqD!X+*~^kONhb zb%JYTy?7hkDBg^x2s2tK{Tanh`g1J@s^-nV<4B8!)?4ZP=m!}67e>mFyOwD5Undy* zmkVRLe|eD6-(Mv9Ax8hzBGC^sI<`o(m(hxWM)_HdUnensoidMX=E2{|BH-&C4&|XkIO%yS6pvyliw!eJu?1-jDIHM9Wve}gAT69tI1eL z#&^l>t7PD#(l*&cMmrhxWHgZRzllmp+47Ic_&JQxEL@9rEX3#MAgRjX{*x5*FV|$M*@zcpn)-GKR>&N6~FEzJ_j-A14FfHMhx~WYByUTZ=!$zF^+S6e&M~`)N>^$Dp`IvjOTD?9C7q^mv z3-oO0F0QjH+gGh_Uy09<8drUCdCN4;lK(FmHcBfFp4eo32+Jm?lHnjDjf`|MGRSbk z7%ldA=(L{)M|E&C)gSQCVgFHwM;Z=|c6Wk;TVYj-n<(@NtH^1b#S$NuYIi?)8n=w$ z0)RH?2a$$C*e#SN+;r&Bt)2Ln`FvKp_5~O9j?&8Hz;Vh-ju=spM^|=}3kr!ZFiVOH z&YvS-R40ip@M2At_R+aKp)KNJbs$bwV07K#5$F@iloU$3i78H&4L~n|4wK=;H9k%MfLzUJqOw9ImB%;A(NWyH>AV zd{rnlU^$m6Odc5pWZ*mTxMq`#Vlqm|C?$iwv!b4^tH2xO84iDYbhQzWrZzIVBcZO( zgSuA(7%lX8NU6l5s-Af080qMiB(BMTdon~zU_5EA;A=JNG1k#-RxA@B_hgC;V5T@1 z2r*-*zRbx0OVqtsLjwE}l?)dTNqpH&d4GhA4P=}qV*tiz3nIZy)5D)&;AI3>bv{Vj&dDE)Ayo@+Dlkvx7ZNytVTUDLl^U3m2qSca7M@ADFG8toJ z(5-kfjiTfcGCn}Y6J#_Hw~h3+nT)MuY$M}77^5|4E9yCaPcR61X{d5|IBazO^j9U_sgODBQPUt>IzM*k$7qg~)B}s#`tI*Y~#0L#e58$LXlmduDcnBZv 
z3P;-AM{(CLK66N)I#k0_gVMAaMUd*J58LJcq9oFUU8eKYYVS2jZ)eB|kimy^&(hn2 zFh+}vf~V$#z9AAGt?E?k0{Rv{q>GEpahyFIf#w3seqHkk6`Vev4av{A(Srwv`rSGe zof)oolp)`7AtaP1G_;8u5{-Us1Yb~0>U##lL3|{4G{fWJ6TNtKdOWIbE8a7a6{>zK zEck>D>A2!88*il12PuWp9nCjcnGM{KTr|5dx(Y)Q-Fm>4Bdok7i@Wy0LgE5f@IVqZ*~V1 z>PKoec<9S&DWka_kNG1-qbZ)pRi2u*n1{8+-~=G~Md@G!bPC@Bqkx4?WC?YXTGQ9m z_%v}3c&S0tRQWulVg*VGvI&}0ra05#YSoJ-2N5Rp&^)T187RnmRhL;y4Cyk%ne&44 zSXK0e6XK1W7YZI)yjA5DtvmoiyafWiIw`j3v|pf@kWA?4(9jQRHBdFCGJ>3;xFaHl zZi((`wA#ebiVkmD%2-`RhwK~}GDkKrf`$bG zpmCS933DgKl*UU(D0YYX~Fp)tN9rXKe*uTrmqscJ-)A03a^qs-<0NDDoi1!~_Q#=79x-hN^sxHbN~25`xBt>fkrU zBf|-aQkAjigb^7AnWD*B*_30Xm7N5`0I-+hLb%>x&zZAof2odbAiK}{Sw@4?oJoL| z6p4aCR?#5TV6N)ac>kvzKgH0VoIx~3hdHva@b0|1&hAuVdExELdQlC-G7$rvfnuuOBH z!KNy4;eZs2R0xe~bWrM50U#sGNxJD(gDNJ?G&QwJAdnjvY6}Oh(yAV%%3Q=#)uyG) zL4B1*@F8E$7nP3(N^h{EnrL>)c2 zBfNu;xu%0v0$9m3a84-#jM4~!I zfc7%1d4`pxB%BZyjpenDgO!v;fuQa)NtCqao2nDJ6GPM_R*B|113g}yfP&Gqrz!pd zjw~a6S&ynOi$X%Wdc!P5G<{j{L}MhKSq$Y!XBK&<>dbbDl~0UpNW&M-hGf?iu?256s=6#w)wd~2SxXyK6pF4NtjR3ovhbjFG`g(O zl(*pz1;iSt!nZ0*&~j8QRDJpmC}q32X$5tiZ(89xLRkeq$&)0wqNAY^lJ!DLh!3zXDk*B#f&w%5OL8yz%qPm-*lWJ3gE-=~zqy#INWU+<)hpQ2u`yM~_ zqc&V)dIJMgv%;fgCKXnyN`i0#%N^lIaG)O=tXa15+Uv`w%G$?WGo{tjr7M)u6*Gl3 zcgyRh%a<$V%V*PUWh>vYrIuxlXX6tw1?4ZU`NGCeZ@jki+OCPM`0U1wz28dx1LrrL zuNAyjc-y74bJ|Fid#mmCIc4>pN!Q*Pr+Ykm+UZv4Z}*Mp?Y+v1u1V+a z1oyr8*vP}m$~}|LM11#+M{hSOo}G%b`*mr`*`46&AOd;6viiWJ^I(GeiCfWF=W%7j z1C!1ZCc=zs*-ukbGP7nHR>f9#DGj^dCL6walex@@VCRj%?F~wM$E35<@<8#BBaW$xrmUXf9hm@AXb0Laz|E=oVgUYIXlg|AK!5+UA zcu%sHB=xK(!4gd?ts#K&C z!R?!4TOL%}AF|5e1-U5B!?(7+=2F%insgpERgV8xakQ!g5NKt~u2)W#wvA`clvGWZ zv@0d;Glfg~w<_hW5NM?|j`V)s=2o|tplOkK-#-BzV;>rCPDyX7mU%Qqino1S1Db~DG$=&sK z8f^F&SZ>SBx|pkdCTG>x*T!5MXL2@tt2pM`K9jTkyN;Nv>z&NB%B)NK#yhSQ1Dw+^ z?OLU{R?XxW&6HHkls8=Kz4kz?b$6_E&rDU#Y^E(|_0Mc+xV|6~YCr8L!#=T1B86ldLxt4)jSlGxI% zO71o_vWbw!8CSvET5HN%8e6(v$=#qQTxW&;sy3y)!kq$=x`&jL}q+aGR9e%~rTZHTPOK#Wo*P zTD$e!GX+_rLh8`{b}DsUb0O%jZwoo?XL&@!&3z@&_T32;skHM02Mnp`n5e-~c-MFn}p5k&d;PCu_S_T}QAJA~OsA{@s 
zl~S~7rXVRG)J;k;}wX(K*ZiS7pDakGC>r0CX8D9azTjK>O1 ziq*v2#sDTMUqo@B{vK2s4_Wd-SS*9VCqWvT<1=4+%otI}&_ees4eC%Xxty%x^GgqN zi_}5xjH}8dL${ImaldQRCK39$71PXHRkt4z6Ha~Ti8L+$OzC(h>7B7nKLy>0YJDsDvi3=CLw9Wa@6c)4FY{W_!$8J+ria zdg%sb>4urYrn}|Kr_0wU)Sz8dQGU1JIqhs#oEX=LI#L^}>r`^P^q874 zLcXJ}bt}!4cztN9YCQzOOr?9ea*a~C zW~Q+2Zh6ym`3j|c1+u1UEwZL6Ydnu~reQp9+F7qS>oIkIJC%lkj?}zd48db{Ta?1B zv#GGXnO`x1St;J$PSXa8CU4aV-mYy|3Rf?}myrL+;RlsAV*+YoEc)lAv6_Ujk_jRP z0)T8&a<}VcX@ZE_h30-pY0{QoFr_v@L@Ql)>$K8vpDqwgh+i>l7Lmv$9Nf&ZRkv0s z%XV3XE$S=MxI@X^X_ZDH#?d_YDD}pAkSPU1iepmQrPS@_fFNa-Hs5L9G__=N%vm{8 zT{~U9PN`luQ`m5~yk)w4l~TS6)n&vq|{ zTmL7m-)Ws%duTj6Rt#A{t2oY82~&O{i?MXLTWQ>*7l$s2jNW`+$USEE3$w2mRfc<~ zHcLq>sw;*Zok~NORV2}LXjGE8bb(5bwqB(;tFT5&L9As4#H)kHiJsD!{r)X zo&4pOj_17hcDI9sQW&4GwY|HdeCJM2mV6WMy>!M-{9JfR#s53shhe@o)auUAsO=T( zM9>e4WGR{=SuPaGQgAt!L#3EasE_44Y-6coDNvWdmwBOJkgEC|Kr9}^N;+=br0DT4 zYLDs{6o&1W^P?$|Oe@+aQZ6rLrL+`M+!%929lBaU$~o}0lfoL_T2cYyCNWI69@T=_lM*nLzm@vM5aZI#}&bw>7VsZrY?YZMt1|r)m3C{(VZ*_KC=q z;;R*(sE9RfA5V+r-}iT=73fmRvVOR3^HlD!S0A`>{9FC6Nw0VQVEg^!kry7EIDT#I zR7tav*L>>%C71dj-W$Lt=&K+k-g`USRIw!VSEcYXa0e%ryou`iYt#=SueTbR(>e!8`HcAN=aqObR#T?~+KCuvw~@ zqzLYIe7^_$^g0qcXu6!MQ1CFGlH<$<@ZYom|FnB^5$ci)-7Ra58B(fW2xY`au3jJ} z>86xsfM2%oYurRj>cNuJx)eTPm+0~*{elqT+hzkL3l!R|O8S99S6?hma)uQY7btE^ z3aS0qT2gXBskfw%>NOqwP5L36d{CN`oM8op6na}NDTScW25Hg{eu_Y8PjZG8lwwd; zT2e|t(G}4Q1pJUv>RQ7^yn#{%imr$z+&LUl_uFW=h&TAD0A;hV8&6P3LC!*If>H$v zX|N^zK&b|0dy+G(pezAphb3hxD7!2vHK24_Qrw{IwxlcrWv?Zr7L@&#lsZrjT2ksk zIc!O30OhD9r4f{4mXsz?j$2ZiK{;VbX#wRSOG+y!=-rZk$m`{xc$3|^6xu*})RN)> z1v|aTKX7XY<$cNS%xwiICoL%}K{;hfSp|w@Nm&g_pCx4tD5otcYe5;bq^tuaU`bgI zO30G30hD1&%0^IROUfotB36`V?A^AOEr;Wd{{E4^(Xo@pAsL*e!6`T6EHR%;IQh#Q z^*0~n2V-U^P9}@vWbTzQ}QU99}5l7s#qi?J)X2c`zps-XG%zs99$p1oyp@;bs zvKktghRUU(i@6G8OjY@E4ZX1{}RR+Lx?R z!z!4sf^*9jnWUYX3ty$Te;u+_9Wcr#0YxBe%{?J)`{A!+hBVQnItj1GM zT>gK_)y3+yE+Rt>)g#tA`qi@WWxG6%WRrT7&8?2Ii+XW79bfNJ4?d}Zl!9IMcT?%=c9*Sn`FQ6KT#a{g3Z`>fl$@3u_fO`un~)uLjOGhUrt{mB{I+*9 
zv0;iyY-{^;OPA8pMS(Oj{|%Gb8%(|o{!Jdfn*Oc4%C~K6^0RP|;cDyit+T0kf3vD) zx~hG$3Y)92VOgoT{OXqHx6Edc&1oyCxO(vUgR_}r%d){=#q$-j*<{0%yXtD_`Os_* z*>Y{wZjDbK+4F5Rb<<1NPA*+LTR`?gTVv~V!>-AOU9&}GFSgY+P1kOntlc_WLiSQy zQ5nanjBMp1qzbZC+Da?0KKA@$vsGlnCUMPlW!q$B+w2muFBQ}pvbk-QOQ$PZCo5WK zmyx~J=B}TvSwC5`ezuP6^@i^TvNsy|CbBmh^cJ$W8uaC4Z!_o~vbWpno2ToxPu6Xp zT|xGhw%W$&Wt%6LZJu34_SLp!4b$$8lkSbPYskLVpsyqQdLy4UkbR?p-$eG!wxzYx zOIA-VSv|Xj>|2d|+eY^7wk6A^t5;4|ubjP)>}VQ##_zO~3!1f_VIAxOA)-gBi(MdF z^y1jVE>O2?Xqm3xF+tJwr>;m;WJ#!DS3sm!TUq{#lx`}$| z_p=L>#q})fW*6wz>6POFc7b}Go{9(A1)Ai|ZPQJ=C!2Q9KFls&6Xp?idDM_%u-Lxli6qRaurJt z_;`pdXs3^dU^qe4q#y3+#WaByGNN{?TPe;$7G0(Zh=t!Mf3Dn`CZ*DS3eglA&0k*a zpig=@)K7Xq%Q62re^!wc;5xJ~?5N5ur^OLp&fnuu<(^QyX;-au*3&Py8@}Yf zq~3yb{^J?Gz5#zg3PUTFWGPePsb~Bb62G=lj*rm|pwotis~#Km)MA}UxXuT97x@vU z>A*$X-#CxHSyYBy;X6gE#ye*63S*^>)1~c7Y5Qbe`sH9f^LhMXd$hj33gP!{9Tc7*}B+1}elc$ZSVYpS@D+vT17 zc8oMS>|OTp&S_Vx;%bd8->$gsdsF?Q4KNr9ncQPLjw-JE;f0)A6j#d~*YdaWaGtDs zy1ZE_Z=R`Gf=*wlXq|z;Z|qbWJMUJnzO{B5Cvw(w-l^_-Zr$r^bXkF- zI+v_gmaLwEz{l3Zvd!-nWTMBkWoq(1UW9oD)@>68gz-|%Q9r1z=47wFB8lvxX3qlU zDZI=lP%hdp&}VjlNd66)d1*viyOvU3G_6}2!bPI1z2(OmIy$-o8nq_dI+I5l+J2`#omd)+TS%B*7! 
zznmifHMPFGWbh0vRaLpaf%21EVTevrhb4BB6B9y{@_lEJ4rv0hqO6mmC^*x4(qg-!y8cx_N0%y8ivoQ$*tBHzHS@O?BrKXqkGf~@;NU-)f z@J+IfrrRM`&Bb8crll3zM-JpQ4Vn12fWNdPfBN_XrMz13FrF7v_S>Gq7c9ooBG`0N ze<*K@Z2-YHoDcdIfi<2YSI6iZC^WQ7`oYy=(SH;pEGdJ=vfK2iOU)(23TSONA zLCN}Dnh5(Tdt|2wRvN*U6KxXh5-!FQ3)Zwr)M>bgH&AGkXt%H%Pf%!+Xs_WS-aw&E zA`*>BKTs&QXdx}>2MTQx9ZqtF6_mng8cf;{T5CzC4WXl!6xt9vW=Ww9q2rbm+7Pnd zR-g?b>um+v5VGDppbeo%EyJM=A?qyy+7LQv8P6-kzPR+B`x<#8B%&+HNHjT z9PAfV<0)`fTLNPl(F{31nlX6Bz>7EGFNBT$u}o}8WumQ?;ma{gr~zW*KdTqt;K9Z} zWb9a0G)r2Ft^5q$%FiMcRo(?!XiYTJkZ4FdHo8XLJNsUi;QOpj)L*1pZp~ER&g+i)i&V!{ibuV{ zNy-?b_GpbqE4^Zu*ML5%v(aDR8r3cO%P2|*6bng0lrd`I7QXJ`BR5KZEaEqs0)Bb( zQVr==jubaUb0Dv1>(7EyvLplQ1A2n=7i+&~5DS){w0UUZ3%qIb@K+Btk8?DaQ%faf zQ4%Owl$3lQs8)4nO%(ooJ48YjXd|?tc~M}ECtNL3XWUO7as%;{ffsKViz6Qr@6o&{ zr$ed}tq}3jdkyQ@pSOo`Q=WC4r1TN{7;abR8hTNLVmvQWU2iq&x^)btc$GV@82Z2U9pGKt^Ggw4Qux?>SaDG{2Na>Gnx|4kg%ca)42m_B@OCuYd5!Ad8w(? zE*ZSUo767G2pF??#_|*69;KLA)Ei#M%1FXm#u=rMf>KDRN8o8utF^i~Y>Kbd7>l;1 zFN)Y3Pq^N@mURMQl~YuHD~<94%1S=2;uM!gJ}N>Sye!^~r?i?;sZ<9-^RzfF+h+_v z;_YJio*~yoo)btB*RjF{a%xSqP_!15K;c+X=+Hc2sM5a(IYHa*7U}Ox5utMb^Sp?Q z@kBZ2W32yza2foRcr%`e0p*HC>c^<}nlO;!?McF0xQFsgAz9FIi@j~|GlGlpl$$ZK zd(p6qw{SIDmBV*f*cZszbw-;(IG-23jOUXyZnAKP>oo%<-h4;I8|YsY@5Yn4S-8)` z?U?Wh`mc$1i~bcZ{g%=!V6Z z02Dg9VIiA9p`#lY1%KnI$|5NPh|SS_m|LO+Ft-A2QKLcG2Fh0r|Kbgl?Vwo5Fi@y3 zwU9)h?1&b@+!-x~xeF+Zb_PlZC>C-G6p~=KL>!DKC|#g@&2SNKpzH?a>%wk4LD>V! 
zHw+i?2FhMgZVS8d1Z5v6ziYUNH&FJ2@-1OEo}e56<(lCl-at7B%D08xc!EN5>NUef zyn%8Uls^=9;|a2GsWi&3J;+4a)0=i+BU&I4FN6?8Xz6 z2SC|wxQI7UPJr?k!frf4c@UKE8!qAvl!rix3A^$91RXY}IpUPzCf=fsXvxiog_P&7 zTHFyd<2hCqErld49kirHrUjB)q8Ac!Mz|PH#P1QrA#S*cH&7l0<*u+BPf#8Mx4$)9 z#2YB@iYCJ(X3Cf#>i+BU22R#2?*o`MBr$Bkja1n2y^n%+z3A^zGMFQnV zhKqOu#Sh9)gxz?8(g(`F7%t)slzveDRoIOuD5pW8J(Hv#C>jxJqY4|HZxeFNC}Cl%o;4xR!t5_g~`Oct$b58#Ju?n|0`n zPSHX73%fgXtNz07>AF>aVRxr))n8;BLPdNmC|IYW8BWf8bnF?X-weE>9rPyKSVgqr z$qi^R>!RgW^&PGjbv;^&@U!#?=`T_pbx|}EqxoJ!DFjWW;DB$J?n!@f z{lkKQ8Nf%3cla$3HsdKqY-3p60iw{Lh&L_QEGX=wSm2DOPoF_BqST;>H;tzSWnP{i z6MpAIjA1l|6qE~`@q7{|BdjUWD&J*+V@##68Bfi(1qHsJ5jgg}RM?EC=G%e--&X{V zeb)+`@zi`LcS)GTbqligblyLe;wHeoZKH;2UBd#oD#R|pF7aiw@S zp3G&nVHa?}Qo=@Iq8b#~%#-K~Xf(K?NOAz{cgC^ddG)t|=1)ii9@D^c4 zm-s#_{4&OhXX~OulaoTT;4~++jUsI0`J@_}6~&yka8hU%c!c(%;EQt8SMxCjNRO68 z%cH5$ifCz6ZS59qH(DK4%WA9OVmvvow*P;%eF<<|>3!b=0gwOz5+DKI1PGEKK>#EO zp5i6mq(q5_BvM-KWtbu;iMtY|eIT_fSlVTsG~vdXF%>s8WXD}@n~u2?Ps?pPQ%$DL ztdln0NqXQt$axSfWx12hI8NLII=1ULy?+1i;UWdC?Y7}+|M>3v`+wj2{#V+cbg}T^ zq%Zx;!bj8or0Y9tzhjWyH?23UDejJ{?O;)GJVve;N9jSHM*PMd&cbvA6<-=S1n{`7699RlZZf-vcEEQdM-a%hp#I zckd&3V`WUK+S9bs;Ds?+E21r-s+8(ny|QKttY0NX!5|w$ORa6>)C{AOpO|}Y;#9M3 zzN6QBmIa)enjdR9V;ehhI?xZIaRAI8jIt~nk2uo?viTC!j{7~6!Sg40(!FXUmTs2j z@24V};VYGlvoLue8wA*&p7jNT4f8h(!NC5w zrrIib8^)t%?`x9qWeAw33Y+HoF@c`y*=^``$A~B~Q@%i;i+|e1B;fHK&Gi zd5vl%!w&n?rn=;bDX~#vDrkgW4R**o=i(LY@~IQp9K^Ij%B^LV9V)_>+$gZA;peeb zf*t!yPOL>*0(%|oyMRYG1Vdj!~B305NDXfXQF8e)+F zXotrRAl6ZjY!XRLTSY@aOMqFk(UD6*058TIuOy%Xh?VVcpRCN?w{n#)pd@&9OxVOJ zB3Cg169F@U5(1?J_M5|zEwnJ>EM%r7Wp31s5~o`DY1q%v_?F{fex$vWkE{+m=f zc#xkmM*^Pc43i_lW8=%`7a@j2Lk<4Jwd+hdF1F(mv!c=zaN^kHmvO))_@HQt7O{(0 zvBxd}(NZ=xPnVo|vSAhQd?zfa94FU)J1fCCoZSI%RKXg_DzPdtybXoDJ}52rCA25| z|u$+j$i%P%~1)gkZv?TyuY>xjefZ!QY0*uL6yn2ygN33n&=u8c& z;6{QG4o^y1urHxwIGG$hzr4P-4%M>Pf!xRz6joO@xq5nAibbxrr!d zIOc#cnQJtM?_&cv|0$2JaqMW^uzdXT1sIC~!#SqWw|*6ye3>VHBc48H`3_14gL}je zEAOQl3T79&pO1Q?t8XOoNgs13g#jsr-n3jp7kv~F>R}qV5tcnfEI<5_?Yrccn 
z5Tw-v+GhBdA!~-uIf@{(En{{<>Mu)Hmnp|ZBEC4Z`eEnpbowrg*0T~DLCGCgt zH56uqT@_%awD~XjwD~#CjtZ*DMj&-;96S{AC^mf_d_>-p1f0~~ACrq-pJmHZ>_lMJ zqy8I3{51g*CiyuKy=>a*i&WGR%27|QDRMD%odr|JN}qvniM}DTp~omcL~3M{Ps~Ci z4|y7ne-R*NOlX4lCCxALkgVZirO9AeT=b=ObPsOtYgE_#a0uvwJ_xj1WNQIkE&^D%+P1pbD=pA(1^_)7vm1i->rq^nAB z+0RzL;#K~yD7b1RaZUE^zj4sjGM1mdm&eR$H$Hk`gmKm9hZ1mhTlJ;m()Svu{J@or z?ksF^vxWSSn!^lmu_f||yiW*xN^Q{wt|V{=|KBN)h9j0MW0L?eUD`hXkM|=@%$(pK zUY50p$k~H6PqPm1OVEzz%E0#uiEH?YnbSg>k5|xm?`qmVDYt)T{NI$`?VXZ(PYb=L zBh6>-mOr;y0Kr3}ZCmy2{CD&38r+{4TcefMcU-q!QES~htG8FdQ~%D(w_j$=_5(uW zK(yH_HIE6+V^4IsMa2)n*gvY+T=?}DLgwGH?wId7ham|FF8;Dsp{zAh)&|AoCACtC zLnv`-t~?m!YLz804JOIp5)7`e0s6MO0rwi5n}zocme3{1+ACOl?;83b zQE7EamI1*s5U~ty&O&a|XqF75U}dLMa!(8RXCOsjqTytmgVwodA(N=^3T57KX!D`HEzi|AX*3F1>`SGOZXY!e`w^YUGXA{DNdJb zBELB;eOE&M(zJ+Z8Trf8a&{HuuS}O*MgHov44{Sl)^zzbDKG~74@#W*d+TLn-DZggTD z$_pr5e&j>TUHx+yQ@P&CySnNh=u0u;HFfV?ynRu!jtbV%h;#Sccr zu89MsO(3bGDGZg#I>T4{-|i0;eTAfMB%=fR{UH}D6}Je*Ef1B7BF`TA+(NPYp*p9i znPhUxuo_6H&Mg?-JATP?M8H4e5sIosxi;6%!oPg+y%&X=p0K_;YJ%kRw{`F81e1H` zHPPfnOA4S%4m#6AFNK|xAFO@vOW*y{2baV13t{7F^rWCLj6_TTdGOZY_V~Atzk6IX zwI;GvhZf%X!tF1FtDD2k+4HtyT#HJ+UU;i8T-zoXp*FSwEq6DD^^U~&Gzx~sog&fT z+F6rYXT;W-sB2JaA_b4R0%^LRwILQff)mpQQ1b5IhqfE`&WNF+$q9hZ;`b0AUQt z?BM#nfUxEgIVI)AA%a&fP}Q*n`++RNaKi zaI~Di@{@Pr`mVm6iRak_oh__uj=EbV_psm| zPRBQ~cv&LI6NMmm%ec@oo{o31czOKVqTWu)dra^iOUJi-QrYx~vsdLuQ-_HXzC9*6 zN7C+U_|VFfKzJVcHL_nI0sQ_uwLgh#5&AHna||ouI&$i{#=&?2B`svxhQ4?ixym_vU%Y}+RHm_i75S^vr;COB*0dCQ4f$

1f@(rdH4-AoiSdmUXqM(q`jF2%FGOkF^=L8FZqJWfO{t*g+-cYCkARPt$f~7xd zYmU~pLrtW006WGCHIbE=Sjy2YSaB<=is~$3oike3Db*bn>W+r>HJ?7q{>dQvNNuQA1kW1Gp?f;eR`k@$X}Sggc!(g z~S&$X}VBs4DVT zr$^O7ersB3LJj#dFD>M+OE0^6_(^kf6KAZJ46UM}b$49qoDw^y?iW|nG2y@=U@DZ) zggS;JYeYlMlQJC4xP41juVD4kZQSz>50hx^7HYer(6wM65bXo(Mm{Vy4#V!kJ44?X zVsi8i5mUpCU2+YGu7NwRiLPNFDw-{C#OmEEsrb~Cj{}O7am1@&4h4-ZC=cuFqsF?h z(GxY=B;z5$cxaCtbQ(lW4U%a>zDh*ZZtEj8)bF$i^hJ!>ZQQciN+lkb%(>cDHc5{8b=e+9iIXVcChat z(w!m88UtgpnIZiePx4eYOnmjHh1f^$y!?%q*_kW4Mz_h@Fl`RtO=DjHj99BS_Bdod2h)%c>Lf7R}&r6pSFh?dt! zYuX4h!ENBfXl)mTU==d0EwYR@)Ssk7XsMH8nI|>I;{45A;AUAD>CJrH!2`rf^q+?E zU++_~H-UYcNb#*J3#pz5gU++0veytN|GkoQWc)JSkNOPBYXt;4@x4xeOay`IKk%0p zXw9aufJzL2T%V1lC?jY7fkNSZsx8U`ZOOuB^bcO%r?MeSinQL!B(otg`eb8Sdd2i~ zoBV2jN+9%R&T8p1a%N##AjK&DT*d^YTPsVJ?9WS%E@NQJ!WiV&Ft)E_YbhaF)LEU( zpZ@i0{rM?&808IRfb`pCNw=0_^T`(L*ZKAS0*Htzc!R(2rYgM>Zm2Rx|GG;4&uIv@ zp4!K|K*Um#AWytmCn?vdr)v-YEvzS~ZqX-SwaF}YuyX9m>XQ6IYdaxZpR_U7f=icX zW4W-3dGU%LkLdj3`GmTH;BStTM<(&uGUzWvnz=%Dai6N%c6~W*U>*-hGPy@K#t!l~ zNIzMNkt+u|b{%w65Fw#-Z1Hll4dzrKIo@o8?Qvq_Ae!RZRS%;8$8?GA27FhqPO^tO zMpBOzrKzloP$GaT!9cNc6*OE>g3)beDiEgNZS)iei;K2XpjP7P5t9Aq)`@K+sl0|& zM|7@)iazRYW?Zr!+oeQ3P9qO}L3lL8iUD1L@tgS8S5wpCV~i)TDUCGjiPuFg_NGVN zXMB{ve<5H20*RSsudGeM{P*%@JgG1HCx~lBricC><@?{1kY2mNxUxwB?ER8*@-+nh zGolXY0OtP|>4KRGbDl-~;PQ}}8oI%;gRx1_HGSup)I29N&xtxbdfCJ6v)|Xx;cQ6i z)A){0YB>$FR-*ni`*>;*3@!H!R>{yK7#{~3ngzQpfn@|aZ zPSAR2)erkMdbg|*D24bj z_^G2J2jhsNA}a?hGy(y6^Cmx(K_f+GeiT7}dgvs8z9;4RoOvokC5L80S;9MeT(;wa z5}PCR~EGISNU^N=uv$`jUK7fZFj0? zV|iDv!1V0ug@u?3YOdE}TB4^dVv7?2nFt;%L}pgtNhe=SpUA<8dNK8-E1*d8wEkmS z(9Gl)YGV8O@e@X_~xnV z5~{kSs!^e8baQ&oSiL=t$B#!`PSFtsOcP4xM#0<|F*of@LJbzQz9aY(b5&?ss%{sc z5a;*@=7@O`4-W+{Y%#~Uf-dt7qo{q)(3q;5Z3fkZH{*`{^8YmX>ALF-v3A?b@uX>A) z@+)pCp9f#_ub+RNdjqc56;7kfz{xLQzJ4?b0NVgje53JB1`Pm)Vze9!nX7=uz8t!HXIqyFxXK0?_00ouO7eG1J<-SKL*)_T9XS*jiv zs>dVMlbbX5D_v4$pHSHssT`0hM}^AKNagtEal9|-D?+2cU2%IB_m-V&u&dO16z>B? 
z7RlHs7#nxgQ9SO*uh}ZREPD#WdKXTJRJ>|`yuDD++wh3>toWz$E7SRvDBq6qZFqjk zAdE75VLd{5Po%tWb862}9-0shwR?1WxCFCnXPg}*OpoF?N@y>}%R5f1;;PWHSX7@i z-Pq@)8%vibr^DOP{ubdJON@?cCnz-arn{ip4GP|oY~!Tdc|~aINvBlqz9M*slRd_W zlG?|yumLY9V?TNFx`H!S;F#1}_o^*lnTf06`e}fek2bp}OSwNR)lD`i|In@g+^5wl zbqoA4HXnO1z@j+#2a_U+(3h~pVTuI$fU?~MH|oH*&(Rly{I`ZO<-b?3-VXlxachL8 zLnY*tQXkX`ZnYg)DGP4*s~J8eCHQE=!9Qq7Q3gsf>dSJz`ZLT#pp+EGN|%N9qi4R< z14l-A+1j6XL-l{v{wy`4{i>hYeob-~*fUJ(evocIdX?o~KxYBdiIn9r<3>tb6PsCR zxyAg2N}#l>NzNT&8Q*oXV#E< z=twssqlVNPxuMzk4b+llT?HD}YLj`fF6)x+^l$e5XVjAF$67-&SN0xdFQ1*{S?45c z9iPABo-tcD6uchaOpVF*rGK$E70GWVtUB!m$)09eAM!We&+K8=yczvS&B)K#dHK&6 z!Jo4q*P)iusSDcY!H+(kU@{)zFRarC7IZt}9R~s~@@J7}HfRCkdE#p=B}{V;Z!&Vm zm}X*C7LJK&$CGk0yd7&P_-jm6Bx8iO`L94nBuQCZIS&SAsHjW}Q5-DXgQ^qaHekjI z*e)-w%3Qve@e)Lzqstlk8Ht~MiC+v{q~{>K1o_vONHAmxZ!uu|wBZ2>svrZNU^O}+ ze*vcCbkcnYzWsBybbkqdFAY7Mm=Jo-%6>zlE+>RK&?$EF1LHK~1N8(uvSgLKaZ3J&*d{XbBYPC1C$pG!d}4Cs z^z?#nX5_4I_VkQzbYuahE5>~bbH^uU=VRuB@$(ZW=Xnhk_a7*~^76`RObtEJ>&wpk z3^qXac;=>L%>>v@bA()@0I{ko{K^Gi_R)$J?Te(b2oOq2-XLwVWdQ!C=v=Tm<1OGR z_Tq2Qo$(Cn!xw)cdsiC}^aFQZis+9$%p*oFd}`w>Uw%5Hm#r|A=(-re%kc@g%BJ6* zjv8COaE{{$Ub$di;93e&{Qn%Y8RJXuZAbXBXb^fU2nigE>w&lLDMlucx#=*yGrMr-E z+R?KKI3q~ma9C%L+B%^{U$EiECnxR{jGaha%Mx2CG01ry#$DqPO`cCYZS2}f3>=J; zqg61sB7G}k)p0*Da&8bXF`l)8v6eV5h;0M)%VP}orgTH_VhDCFHa&Bx=zEUNtpdqd zFBp9{bf)HK zBGs6EC4>72CF6y%fj!E=+}xz|$1#nMed_pp{BP5)(EEI^0R__64}$&#LA;&7Ul6!J zDGWX^-UGcdS)Bk9{fMF>1pbo1UlI5*0TM=)O{sK}>#qrr0xFpcTOe1Qz(WFk1QAmI)O6;7z$EQq4a8-7Aum`QZ9w%5zr9O5@4L_I&!6%br0~n7g6jp zMU5CwI`QbrLPv~oe2Bbh)^NtVo=@nAc-MIyxrk?-CuQ@DQ(a~?uca^#<*g$Z698x@ zubsRN1R4oQ6y_jT6M_FoUWRQMS%MKL7;VB$v3Miq_!a^l0wmhZx01_Cpp8H~fer$l z1iC0m7rFigxlR%2CNE=!r&}7|OI}9s=p$D@fdK+^cb9G643c+a{D7;E?5%<|y<80mfl{id^#q zn6Soaaxnp_W~%Qw^18{zc;KHWZ;Jh$F}pLSc9|f=NIZ1;JIKF&n8KL27GvcfCa)hw z#d2lc+3XFRZ=&GyR0FYf^A`!M5O{?^Eycb{E*rTnk&CY7{AF^j61YO(DuLGs@B{(` z+9=%`xz-6V$s2s=a{LV%@Yjfu^Uh^j8??U%eWf_EmY zD%lFaRAv3`=Wo5WV;0I7b@B}X0|Y7?clVK%AG>F 
z6GPY+Q8`(&dxY8^v=W2S6D_Ng${a$O1D&nkQoW6u9IUUsf~}Xz2$^poUvJdeCplrb z{Zv?G-tvb!Zmn$Fh0?}|>eL=t1a5;VU69+Tl_ZNH7VXo=cUR8?q!ifMGyG-KJgqqVJ4 z?SxP}!E#1a6HyF9{ZXMF>!>m`PV1=7E7c7Pb;BETTl1mLTQ5XZ!)z!>h+yN`mWmd2 zW2e+OCp2QMVFGB0Il3grNx^Y4tg6|@;GNvlluDXDLDNU`*tBC#PG3#aPT`$`h;QDx@49Z}&v-N}8Yq#rnRPVZy z4Zwb@?-lBMF$CJsxL~xgb=)aAj|t9W@}9Hb?vJRB?P*MsrdiMs6Pel)wsvHe0DIXO zdN$G%Pj7nKULK*`gF4iNCI|<03)S7&UK6xwY0pmz_DL!M0h3Xf=xv%3ny`yg>tO1s zb|T0#Qsy8#<+iS)pMzvGr%8O^2kW1(>E!tkVTBOQlgoJ=^qNq1;PhH2HNc zQeD4L*N+W7KuE$`pFm2W2D+#ge?;YCc(GNeYTY=xxwJL@He0`h_70)~Dy>_n>qa{N z*8JPAggS-78g%xa&AH>=eKTU4M7=;`lhFpR)G#JAV1Hq=$U9^#+S(_z9uZoPY)prX z8+KGeaWie@$_Ck5J(`Gl?O^M5R%m2h#x|I}*FPTBl!q&ufeZ$KMu3GwOWPeg=AEV8 zj-89|S%vyTVJjF7dXkf8^+=XM!7|7&79r%i_5>5EympS>YwKf0x>00%E;i3Xs6Mo| zJ+@=oK6?Aja3wH>Cs|ngs6uKxEw-Ib)f~Eb+qz>HDm)R@>8K04z802~vE%DIr1~kb zo=tEU(Da_B95Pm-yg#Z#FYa^-mR7k~7*q%D>=89RwD9cP^U2!W{gP`&bm5#U4H z?Jpy2Y10I~WI90NFoFpB_C?cNh56MfWhaiE?ZC>#yE=l|I9;Cz}XYF+3pBQ7Yqs zJ%-`L3=HS$N)AX>Xnwo%Hc;G5w8kUVjEXg*tm6^Y=)K0)-Td&tbCJfg3FLY9p2NFa z89qE4am>lckxtoLKH$GSmxt|Y6n308E^VnKQcEb!1m=+UvNYoHBKPG2^jWlFB z`or2Xm&%+ztVBx&&a-raWF%IPClJ_plxj&e*~2PLd!wB_VU>N`zq9b}t8^Ck zVGu!&+3kD}O&W}j0PnQzcHXhS4;Ntc zCR`Rq8K!lK8ki#F)K#JJ?E#^>b=MArDUWV0*V4WFN_g~4r1?2?EL`r4sGf_K+NIJ~ zv9y&|4{b4aY_qj2AKQEF-7GP#4MKSf z(9v6O()G*YkSsl-1uF@=lU5OJ$RSoM8yylS>q0->I}14fXknNkRViv3XbK=}h_dY4 zZwYmWgz7^=SDGLv`(*9cR%vq*Pe(p@mymsN;K8E~%mZ6;bu=QNNCe zc3jNZ_2tN(qg`@Lhz=}ZU>mG3JuP55Wu2nNNflRyt311A@U|qYs<0-|(Ib^vT?qM! 
z+~Ro74UM`qY=OWKbM(wUVvfhkRAl@{5tmyO&&g9)ZI?^69b#?ABMu%)A_vEd=N0Pw z?H3<&^!w-)BrT2?mm}!dV-7#@&I%+Y1Jay?_S^c(OD09WtT$)Mxh$?`u4I@G~w}+ZVHx`bDyL1?tA@V16U5Q#^Px zu92dpRogYMr>))N_3X1BovjEYhEjw@zlVPjV@+=qsXuaqi;H_)4e%^Rcs)0d*8(Jf zrQGM~^Zk$LYn-ymkK*9PU=$rv-IOsN_5!e5CVeUt{x-lMQeml4hF2N-lel3l$(~Q{&j* zDtdo=s zs~{Us0{;{1H1I8yNUwrzd$xwew?@OBy5OGL0A_Nn6U^kfx$%63x_R6Gh{Mlg9Rle1 zgS~Bdad_rdN@|;+wmp@lWyelgT2fgc%?mGODc&2Nc&IE>mqv|^arWDLnEmc)YaZbo zFQpoaALhU%qYBpOYBYKu{-sUF&_fd(gnwZH2AWO*bH1gzsh~$)`R_)`w%wN%PUURS z^_0wp$Mzf@573PT`n;bAx$;?I3l__|{;pqH=ZOML>aw6Mvkq;+GNdMfo*KBYZyDAi zlqcY&)kZJmBtlTO>4D;jisERobqg;#c-d2E*>{2FiH6fz@pzeE7gp7=4)5!x76q_w z{subqEcU4+M!dT)BOm`#!x8SMP(8%pFigQ+W# zA8gR664Ivg@|ozL75`9A#tJvwaH7^&@9f2|BAZ!*8pPL^Xt%ty9h;-vD^#%mG!FunefSIdtanv ziv-RQApTo0(b9zoB1FhsoETnNp&%RHSdrYW%%EJTfjYO$1G}=e##A&L{1Qrdnn{-b zKGj5^hju=P{fdiEF;~FU-F#(cW8%KPkci;nibHowBKo0?iF58UHwh#+l8CCTe(qPQ5Xt-bPk9MZ=Z*N!{+#XQ|Uii51ski zO}rwNxg)0Lu)aB3U<#Kt2?b42lYRRP-Vip&;hx!OnPux@=-N9sZr}Lkn_;VG zX9b*~CB3rX5!B#R#I>rTioc61QNm9HbjW|Vyanz_WO6i6;!l6ZYrnzQ^YhAY28} z861Fec1!@rZxbMF63c~lNnCJvVoTw51Q?YSL^0^XBtZCrCsZM8(<>$yU9I_Fq9&-J zdvEy-w(9s(SVA~}ulpCF4sMC3i!@&X01cRlFpq4{oiC-(BDrQC$;v?%5PjgwC<_ft zOw0Jop*}MpD|xdqe@bVtC!tz2;E{Fz@z)utH>P9BI+fh$&&QK5&#hG5H{QZSAKFS*(Xv{TyR z%(Myn+#M9LuMozv#&=S#E&`0Eeu!K>1Q@#nu|~v7GmA?r-IR&p^mQDo$V_qN9BheS zTRgWy+B|upE+?4y6Nh(Fvm-x9)ew_GtRypk0DbeX!wNY+OmX|#Oj`Z|{0N21GeFm7 zq#UJ?LRJbsN!P&KcR9e1A&?&@@MWwZcFT>KSQ=P6f&b%m)D@K?bSr?6O_A#<-XSD39v)SgF%6*chXz81ZBBk5l} z!miT4LgW7&rGK5kJY_mdF1kwd&y(u{fr|vL5Qq|3Cvcs>w+XPz_b-#{Z36$8z`F$g zIf3^G{3`-qCa{evV`_}`H81Jv_LDl~i~JuWDyF@}uQSQJW&UGI`sW0GNPyjP+2!&3 zRUqiD|J{R~^AVORt6$;!S6~-TMx*u@GKj36X{sYeb5jPy+ zh97Ws54ft&a&(HE&$uLbT*)cQSaSVGT-yV#;Umub=Un*%ZsfC^5=!_qc+A3@a}_zC zas&!FMdN2V8cGcCkicifoMKqZq zbOR4K&jZf=fb0KQX?&nM{y;VGK-KoAD(9Y|c;jfa(t>Z2XmQEL@u;bEWBOyQdFyOM zYuz9MxLUtC_DjbWA*^ka&5nqB8Wxj%i1|^V*I^z&#zEt=zr% zf%4A!2a6xtzkBh{TSE8TV+xHID;2KL7*s?kT%d5-cLKX(J8!(_5$yEhYsZT(BI5cS 
z#o*Q((9fh8+`i6!cF)|I61pbwN~;)*=jSLU6repc-M&nb6N+7EPom!sh+2$aN-?Pb z73%D53&l(-aN@GxJ5!Wm5-AE5z1#KM7vF7(b8zmC-!a`;x>NrC>|^rA>q-^5cxNwv z_vJVT=ba|@^TFDOW8sqvA0GYgn~y0t-kziIgf3Fio}ErA*%Qw zV^b~VHHvwS+iyhjyb!HaH}B=?fA!5TzZoudMe^Jbs#NH}_?c6(Q6VYJqQV?Hbyop( z9Oo2@;!tIrqu(9X&it-@H*jZc_l@^G;g(^cY2*=$!DnMmt(-W1Oqs7}-8vTM@Uz{y zqug28t=|poHofP ZpF|{Falb;15dcXWKaY(~3i&(r{{UiPp2PqE literal 0 HcmV?d00001 diff --git a/engine/__pycache__/async_llm_engine.cpython-312.pyc b/engine/__pycache__/async_llm_engine.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c64a3899ed09cc45267a4ed2ec78de4bab2af064 GIT binary patch literal 251 zcmX@j%ge<81UW3-nMZ;2V-N=hn4pZ$B0$D;h7^Vr#vF!R#wbQchE&EVrb;GF=9eHP znvAzN9E&USl6`!9{WO_xNtET}ZRtTXXd5qC4%JQfnr6>K-IVSU}{~#(kmH0 zgADoQtY4B_pkI(#pj(nyl%86mTTyCmX_0N7s-KgYq+d{3l98XM4>DOFVyr&g*m$Tt u1(mlrY;yBcN^?@}ia3F0G6HciKalvq%*e?2m4ks%?}4CXBVQ3aPz(U15<~|8 literal 0 HcmV?d00001 diff --git a/engine/__pycache__/llm_engine.cpython-312.pyc b/engine/__pycache__/llm_engine.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2538094384736b684f181ba4dea079c02b881d3b GIT binary patch literal 244 zcmX@j%ge<81UW3-nOA`HV-N=hn4pZ$B0$D;h7^Vr#vF!R#waF6hE%2~#!4nl=9eHP znvAzNeSCaf^U^c(QvEcUZ%LNrX+me=oe%b=$0fFrKgtYR+O4sT4bB2>gQx8=@(R%WaQ`RgUr>3n52&|RIi}&7Kcr4 oeoARhs$CH$&@@IMF6IXkAD9^#8NYHcFxo%h5pU!yVh4%=0BKP`(EtDd literal 0 HcmV?d00001 diff --git a/engine/__pycache__/protocol.cpython-312.pyc b/engine/__pycache__/protocol.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0a26002ea025ded024c9943f5793105ffea10ede GIT binary patch literal 7586 zcmb^$OKcn0@sV79MN$+=Nt9$sD@w9WOQLMapV+aBIgus9>0rh4|xdrj3`9$XrzE|ujbLc1+VTa z_&nJ5X@c%A`1L>`pa%;K zo1QErIUd$hdb*I-+Y9Zy9nm`U&O#^0TeOVcRp{b)RO{Az3OyW;X$SP)LNCYTTA$uu z=;wH=Ch7x)0gflMtUg#6^bjwh2|Dy8_=Znm*h?-ErEQrgNlM{4E-Smlak$C{e>)#X>9(;_uZ13QViU(Lu& z)=eP9ugiKF`>9i6*DK|+!7QrWkj(`UiGtlXj0IX!m$)Hq7fW+$iB1`e*39wP;kuS$ z%P1OJzP!jX&;tiufS3mWER&5$ 
z>Vu(=b06kD8hLN3q4BdRHc%698ZGPupY0h~lRKmJD(ujLj|$YUc+Ucx1_1MMEC`su zv5?|l3Ifls1b914BTA6A%zBm3vQQh>+Pe^?JN)ny_^}3lFBEKc#>tPx38`@sdO*< z>w5cP48Th&(lq+QJr9Dcp!DpFSYd$AQx?YYKm&EKK6||l)FGgrQ2H7u!<^FJKsm@M zVguz6rwlYu4s%Mjfs*5t!3N3^pqx~Oz>?x;}F<3Eqdd$^0PA8P3w8Lr5 zhLmFI}(o|ZqmWCi?@%<|nHQ6-9S%ZmFoMnoZ zAv&^l3+{NS=!EMi#1*R5T44wPPGp_7qpBg*n|6RvtHMe}HwnS)I}fIsZ2`b%WRcwU zOfwPiyB=FGsWxk(w8*jm?TBhhtWqjLU|#BMv^u}TI0%BQI)_O@6u5JR0UY)t~+UG({9DD-!Q` zms2~8i3WWmd9IIP(#i$w7MM@S#*#(vSawu1m@F|T{n{zZ$%nF~8YOArHslM_j$ute zrFl?FnQ4d0Of{HlE!s&%wq&UcDOb~A^)R`#U5)+1CjcyZvYa9MN}L=Ma1k z!M>_ZEH1_R<1E?GoKybLrS_(Yt&y=u6QVyX*J>XLNBLOW@v>9K)ZFi|VA&UCi{ftL zqmLtP7Q^si7{ht{R-*_`(>`%E z3E>*U0>p8NmheQNEcG`TvR8nDV!p`uFl`*2{ElpiDh8e=B-4^DbE&JzibmFav;HGv zq)WRurq)r3_y3+oSpp=Ql5K4pfY#d%s>z=#P#yDVyCEr3PrWdzSODK$g7uR|ek zi4aI;RXpPGRYQ60{uF~^FgDD>&;wu4T}!5kSAIT|x~@Z#{a-AKV)CsU)I4d8tjhD* z({qiQ?sn2Enx&3GV3aknD8ukSWsKqsKaq4Oico5)r)eSWVm_Z$9XnQ)8J$({>}ycc z(dkIKMm6uTy#haNGR*dC^cK~gIVdNtYm}DvET^0s65oK%n?s_B^1DJJr!6WdRxTH6H!*=GNd+9EkS*c z`Opalyz-x|K*e6qkY=qmaHkZDfu<=9$%+Eel*zlQof`Sq@WD>VvOB;Asksa^;X1{R zDNI&NlBMbtinOAy<`%r`o^?ROsS2z*`+aBdJxht}MOmW_&cJCRHEmdz;Z}$0aLl2~ zMXa}TJ||rlp$nz}Fi{s|d?#rqOzcLYa1gT8q6CYzw`nqVvsbW(H~^lb9*0P4_Fibf z&E~wX6@chwBX{+~#b`0bmQ73aa12_S*<DtJRJ?f16qg#oA)tA>3`Klip)d1-}Tn!=?BJG)K z7_lhn=vq1cX`vd!R-ANZS1+u09Im!vGeP>YtAnfN>fz7dT2t10C#r2oPLfRjin%(q za%a8s@Y?b9&hcsr>1mScsJ0{4L3+j2q0e8gq7l^$iDXt{tJb~D$a;A6Q5Vp-2URa} zMHn0~WAvzN`q(jW<6$a_)^}I^@CL)X4pxJRIff$^shiq@&FEg6Vn~mpO|6I}$iUF5 z@;xA@XZITCID7o5thE=NkM~GK$a&8N&qJT!&u$#d zRr&kOIsV=X$F}`IhtV7u-ss3|^kpBmg#6j9M5^kCH>`|g=Suq11Jxk5AhV^bVZi$^LRka+Y8Sc~{$5szoi&12rVd353^53vcnkj$$k__@c-V@bi|qATtETEsBa z+ihGBYm~fu?sr0Fl+)ciAtx?kthBp1y{uK>*Bvw8c%NxH!8_#$G+o&6;A*EiXV+Dh z6RIPe<((pprvN+COw2<%ac*Fp z9D}*Z4mg8g3m4(X9_NP{9&klq?gU{wbk_Ni=zHvAU|_~GF`8F>9*<``L_E>2$>7)I z+&Ve8Njm;c#y82KO)|Jia$l3Pn`B~>jBO%uXp@}TB%_-o|Cez3lh+@F``!yY4tPBs dAKdu(jSt^|b>ZoF9J%84WIvhwC&4$j;lK0!Tv-4B literal 0 HcmV?d00001 diff --git a/engine/arg_utils.py b/engine/arg_utils.py new 
file mode 100644 index 0000000..ab6e5e5 --- /dev/null +++ b/engine/arg_utils.py @@ -0,0 +1,2144 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import copy +import dataclasses +import functools +import json +import sys +from collections.abc import Callable +from dataclasses import MISSING, dataclass, fields, is_dataclass +from itertools import permutations +from types import UnionType +from typing import ( + TYPE_CHECKING, + Annotated, + Any, + Literal, + TypeAlias, + TypeVar, + Union, + cast, + get_args, + get_origin, +) + +import huggingface_hub +import regex as re +import torch +from pydantic import TypeAdapter, ValidationError +from pydantic.fields import FieldInfo +from typing_extensions import TypeIs, deprecated + +import vllm.envs as envs +from vllm.attention.backends.registry import AttentionBackendEnum +from vllm.config import ( + CacheConfig, + CompilationConfig, + ConfigType, + DeviceConfig, + ECTransferConfig, + EPLBConfig, + KVEventsConfig, + KVTransferConfig, + LoadConfig, + LoRAConfig, + ModelConfig, + MultiModalConfig, + ObservabilityConfig, + ParallelConfig, + PoolerConfig, + SchedulerConfig, + SpeculativeConfig, + StructuredOutputsConfig, + VllmConfig, + get_attr_docs, +) +from vllm.config.cache import ( + BlockSize, + CacheDType, + KVOffloadingBackend, + MambaDType, + PrefixCachingHashAlgo, +) +from vllm.config.device import Device +from vllm.config.model import ( + ConvertOption, + HfOverrides, + LogprobsMode, + ModelDType, + RunnerOption, + TaskOption, + TokenizerMode, +) +from vllm.config.multimodal import MMCacheType, MMEncoderTPMode +from vllm.config.observability import DetailedTraceModules +from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy +from vllm.config.scheduler import SchedulerPolicy +from vllm.config.utils import get_field +from vllm.logger import init_logger +from vllm.platforms import CpuArchEnum, current_platform 
+from vllm.plugins import load_general_plugins +from vllm.ray.lazy_utils import is_in_ray_actor, is_ray_initialized +from vllm.transformers_utils.config import ( + get_model_path, + is_interleaved, + maybe_override_with_speculators, +) +from vllm.transformers_utils.utils import check_gguf_file, is_cloud_storage +from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.mem_constants import GiB_bytes +from vllm.utils.network_utils import get_ip +from vllm.v1.sample.logits_processor import LogitsProcessor + +if TYPE_CHECKING: + from vllm.model_executor.layers.quantization import QuantizationMethods + from vllm.model_executor.model_loader import LoadFormats + from vllm.usage.usage_lib import UsageContext + from vllm.v1.executor import Executor +else: + Executor = Any + QuantizationMethods = Any + LoadFormats = Any + UsageContext = Any + +logger = init_logger(__name__) + +# object is used to allow for special typing forms +T = TypeVar("T") +TypeHint: TypeAlias = type[Any] | object +TypeHintT: TypeAlias = type[T] | object + + +def parse_type(return_type: Callable[[str], T]) -> Callable[[str], T]: + def _parse_type(val: str) -> T: + try: + return return_type(val) + except ValueError as e: + raise argparse.ArgumentTypeError( + f"Value {val} cannot be converted to {return_type}." 
+ ) from e + + return _parse_type + + +def optional_type(return_type: Callable[[str], T]) -> Callable[[str], T | None]: + def _optional_type(val: str) -> T | None: + if val == "" or val == "None": + return None + return parse_type(return_type)(val) + + return _optional_type + + +def union_dict_and_str(val: str) -> str | dict[str, str] | None: + if not re.match(r"(?s)^\s*{.*}\s*$", val): + return str(val) + return optional_type(json.loads)(val) + + +def is_type(type_hint: TypeHint, type: TypeHintT) -> TypeIs[TypeHintT]: + """Check if the type hint is a specific type.""" + return type_hint is type or get_origin(type_hint) is type + + +def contains_type(type_hints: set[TypeHint], type: TypeHintT) -> bool: + """Check if the type hints contain a specific type.""" + return any(is_type(type_hint, type) for type_hint in type_hints) + + +def get_type(type_hints: set[TypeHint], type: TypeHintT) -> TypeHintT: + """Get the specific type from the type hints.""" + return next((th for th in type_hints if is_type(th, type)), None) + + +def literal_to_kwargs(type_hints: set[TypeHint]) -> dict[str, Any]: + """Get the `type` and `choices` from a `Literal` type hint in `type_hints`. + + If `type_hints` also contains `str`, we use `metavar` instead of `choices`. + """ + type_hint = get_type(type_hints, Literal) + options = get_args(type_hint) + option_type = type(options[0]) + if not all(isinstance(option, option_type) for option in options): + raise ValueError( + "All options must be of the same type. 
" + f"Got {options} with types {[type(c) for c in options]}" + ) + kwarg = "metavar" if contains_type(type_hints, str) else "choices" + return {"type": option_type, kwarg: sorted(options)} + + +def collection_to_kwargs(type_hints: set[TypeHint], type: TypeHint) -> dict[str, Any]: + type_hint = get_type(type_hints, type) + types = get_args(type_hint) + elem_type = types[0] + + # Handle Ellipsis + assert all(t is elem_type for t in types if t is not Ellipsis), ( + f"All non-Ellipsis elements must be of the same type. Got {types}." + ) + + # Handle Union types + if get_origin(elem_type) in {Union, UnionType}: + # Union for Union[X, Y] and UnionType for X | Y + assert str in get_args(elem_type), ( + "If element can have multiple types, one must be 'str' " + f"(i.e. 'list[int | str]'). Got {elem_type}." + ) + elem_type = str + + return { + "type": elem_type, + "nargs": "+" if type is not tuple or Ellipsis in types else len(types), + } + + +def is_not_builtin(type_hint: TypeHint) -> bool: + """Check if the class is not a built-in type.""" + return type_hint.__module__ != "builtins" + + +def get_type_hints(type_hint: TypeHint) -> set[TypeHint]: + """Extract type hints from Annotated or Union type hints.""" + type_hints: set[TypeHint] = set() + origin = get_origin(type_hint) + args = get_args(type_hint) + + if origin is Annotated: + type_hints.update(get_type_hints(args[0])) + elif origin in {Union, UnionType}: + # Union for Union[X, Y] and UnionType for X | Y + for arg in args: + type_hints.update(get_type_hints(arg)) + else: + type_hints.add(type_hint) + + return type_hints + + +def is_online_quantization(quantization: Any) -> bool: + return quantization in ["inc"] + + +NEEDS_HELP = ( + any("--help" in arg for arg in sys.argv) # vllm SUBCOMMAND --help + or (argv0 := sys.argv[0]).endswith("mkdocs") # mkdocs SUBCOMMAND + or argv0.endswith("mkdocs/__main__.py") # python -m mkdocs SUBCOMMAND +) + + +@functools.lru_cache(maxsize=30) +def _compute_kwargs(cls: ConfigType) -> 
dict[str, dict[str, Any]]: + # Save time only getting attr docs if we're generating help text + cls_docs = get_attr_docs(cls) if NEEDS_HELP else {} + kwargs = {} + for field in fields(cls): + # Get the set of possible types for the field + type_hints: set[TypeHint] = get_type_hints(field.type) + + # If the field is a dataclass, we can use the model_validate_json + generator = (th for th in type_hints if is_dataclass(th)) + dataclass_cls = next(generator, None) + + # Get the default value of the field + if field.default is not MISSING: + default = field.default + # Handle pydantic.Field defaults + if isinstance(default, FieldInfo): + default = ( + default.default + if default.default_factory is None + else default.default_factory() + ) + elif field.default_factory is not MISSING: + default = field.default_factory() + + # Get the help text for the field + name = field.name + help = cls_docs.get(name, "").strip() + # Escape % for argparse + help = help.replace("%", "%%") + + # Initialise the kwargs dictionary for the field + kwargs[name] = {"default": default, "help": help} + + # Set other kwargs based on the type hints + json_tip = ( + "Should either be a valid JSON string or JSON keys passed individually." 
+ ) + if dataclass_cls is not None: + + def parse_dataclass(val: str, cls=dataclass_cls) -> Any: + try: + return TypeAdapter(cls).validate_json(val) + except ValidationError as e: + raise argparse.ArgumentTypeError(repr(e)) from e + + kwargs[name]["type"] = parse_dataclass + kwargs[name]["help"] += f"\n\n{json_tip}" + elif contains_type(type_hints, bool): + # Creates --no- and -- flags + kwargs[name]["action"] = argparse.BooleanOptionalAction + elif contains_type(type_hints, Literal): + kwargs[name].update(literal_to_kwargs(type_hints)) + elif contains_type(type_hints, tuple): + kwargs[name].update(collection_to_kwargs(type_hints, tuple)) + elif contains_type(type_hints, list): + kwargs[name].update(collection_to_kwargs(type_hints, list)) + elif contains_type(type_hints, set): + kwargs[name].update(collection_to_kwargs(type_hints, set)) + elif contains_type(type_hints, int): + kwargs[name]["type"] = int + # Special case for large integers + human_readable_ints = { + "max_model_len", + "max_num_batched_tokens", + "kv_cache_memory_bytes", + } + if name in human_readable_ints: + kwargs[name]["type"] = human_readable_int + kwargs[name]["help"] += f"\n\n{human_readable_int.__doc__}" + elif contains_type(type_hints, float): + kwargs[name]["type"] = float + elif contains_type(type_hints, dict) and ( + contains_type(type_hints, str) + or any(is_not_builtin(th) for th in type_hints) + ): + kwargs[name]["type"] = union_dict_and_str + elif contains_type(type_hints, dict): + kwargs[name]["type"] = parse_type(json.loads) + kwargs[name]["help"] += f"\n\n{json_tip}" + elif contains_type(type_hints, str) or any( + is_not_builtin(th) for th in type_hints + ): + kwargs[name]["type"] = str + else: + raise ValueError(f"Unsupported type {type_hints} for argument {name}.") + + # If the type hint was a sequence of literals, use the helper function + # to update the type and choices + if get_origin(kwargs[name].get("type")) is Literal: + 
kwargs[name].update(literal_to_kwargs({kwargs[name]["type"]})) + + # If None is in type_hints, make the argument optional. + # But not if it's a bool, argparse will handle this better. + if type(None) in type_hints and not contains_type(type_hints, bool): + kwargs[name]["type"] = optional_type(kwargs[name]["type"]) + if kwargs[name].get("choices"): + kwargs[name]["choices"].append("None") + return kwargs + + +def get_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]: + """Return argparse kwargs for the given Config dataclass. + + If `--help` or `mkdocs` are not present in the command line command, the + attribute documentation will not be included in the help output. + + The heavy computation is cached via functools.lru_cache, and a deep copy + is returned so callers can mutate the dictionary without affecting the + cached version. + """ + return copy.deepcopy(_compute_kwargs(cls)) + + +@dataclass +class EngineArgs: + """Arguments for vLLM engine.""" + + model: str = ModelConfig.model + served_model_name: str | list[str] | None = ModelConfig.served_model_name + tokenizer: str | None = ModelConfig.tokenizer + hf_config_path: str | None = ModelConfig.hf_config_path + runner: RunnerOption = ModelConfig.runner + convert: ConvertOption = ModelConfig.convert + task: TaskOption | None = ModelConfig.task + skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init + enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds + tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode + trust_remote_code: bool = ModelConfig.trust_remote_code + allowed_local_media_path: str = ModelConfig.allowed_local_media_path + allowed_media_domains: list[str] | None = ModelConfig.allowed_media_domains + download_dir: str | None = LoadConfig.download_dir + safetensors_load_strategy: str = LoadConfig.safetensors_load_strategy + load_format: str | LoadFormats = LoadConfig.load_format + config_format: str = ModelConfig.config_format + dtype: ModelDType = ModelConfig.dtype + 
kv_cache_dtype: CacheDType = CacheConfig.cache_dtype + seed: int | None = ModelConfig.seed + max_model_len: int | None = ModelConfig.max_model_len + cuda_graph_sizes: list[int] | None = CompilationConfig.cudagraph_capture_sizes + cudagraph_capture_sizes: list[int] | None = ( + CompilationConfig.cudagraph_capture_sizes + ) + max_cudagraph_capture_size: int | None = get_field( + CompilationConfig, "max_cudagraph_capture_size" + ) + # Note: Specifying a custom executor backend by passing a class + # is intended for expert use only. The API may change without + # notice. + distributed_executor_backend: ( + str | DistributedExecutorBackend | type[Executor] | None + ) = ParallelConfig.distributed_executor_backend + # number of P/D disaggregation (or other disaggregation) workers + pipeline_parallel_size: int = ParallelConfig.pipeline_parallel_size + master_addr: str = ParallelConfig.master_addr + master_port: int = ParallelConfig.master_port + nnodes: int = ParallelConfig.nnodes + node_rank: int = ParallelConfig.node_rank + tensor_parallel_size: int = ParallelConfig.tensor_parallel_size + decode_context_parallel_size: int = ParallelConfig.decode_context_parallel_size + dcp_kv_cache_interleave_size: int = ParallelConfig.dcp_kv_cache_interleave_size + data_parallel_size: int = ParallelConfig.data_parallel_size + data_parallel_rank: int | None = None + data_parallel_start_rank: int | None = None + data_parallel_size_local: int | None = None + data_parallel_address: str | None = None + data_parallel_rpc_port: int | None = None + data_parallel_hybrid_lb: bool = False + data_parallel_external_lb: bool = False + data_parallel_backend: str = ParallelConfig.data_parallel_backend + enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel + all2all_backend: str | None = ParallelConfig.all2all_backend + enable_dbo: bool = ParallelConfig.enable_dbo + dbo_decode_token_threshold: int = ParallelConfig.dbo_decode_token_threshold + dbo_prefill_token_threshold: int = 
ParallelConfig.dbo_prefill_token_threshold + disable_nccl_for_dp_synchronization: bool = ( + ParallelConfig.disable_nccl_for_dp_synchronization + ) + eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config") + enable_eplb: bool = ParallelConfig.enable_eplb + expert_placement_strategy: ExpertPlacementStrategy = ( + ParallelConfig.expert_placement_strategy + ) + _api_process_count: int = ParallelConfig._api_process_count + _api_process_rank: int = ParallelConfig._api_process_rank + num_redundant_experts: int = EPLBConfig.num_redundant_experts + eplb_window_size: int = EPLBConfig.window_size + eplb_step_interval: int = EPLBConfig.step_interval + eplb_log_balancedness: bool = EPLBConfig.log_balancedness + max_parallel_loading_workers: int | None = ( + ParallelConfig.max_parallel_loading_workers + ) + block_size: BlockSize | None = CacheConfig.block_size + enable_prefix_caching: bool | None = CacheConfig.enable_prefix_caching + prefix_caching_hash_algo: PrefixCachingHashAlgo = ( + CacheConfig.prefix_caching_hash_algo + ) + disable_sliding_window: bool = ModelConfig.disable_sliding_window + disable_cascade_attn: bool = ModelConfig.disable_cascade_attn + swap_space: float = CacheConfig.swap_space + cpu_offload_gb: float = CacheConfig.cpu_offload_gb + gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization + kv_cache_memory_bytes: int | None = CacheConfig.kv_cache_memory_bytes + max_num_batched_tokens: int | None = None + max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills + max_long_partial_prefills: int = SchedulerConfig.max_long_partial_prefills + long_prefill_token_threshold: int = SchedulerConfig.long_prefill_token_threshold + max_num_seqs: int | None = None + max_logprobs: int = ModelConfig.max_logprobs + logprobs_mode: LogprobsMode = ModelConfig.logprobs_mode + disable_log_stats: bool = False + aggregate_engine_logging: bool = False + revision: str | None = ModelConfig.revision + code_revision: str | None = 
ModelConfig.code_revision + hf_token: bool | str | None = ModelConfig.hf_token + hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides") + tokenizer_revision: str | None = ModelConfig.tokenizer_revision + quantization: QuantizationMethods | None = ModelConfig.quantization + enforce_eager: bool = ModelConfig.enforce_eager + disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce + limit_mm_per_prompt: dict[str, int | dict[str, int]] = get_field( + MultiModalConfig, "limit_per_prompt" + ) + enable_mm_embeds: bool = MultiModalConfig.enable_mm_embeds + interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings + media_io_kwargs: dict[str, dict[str, Any]] = get_field( + MultiModalConfig, "media_io_kwargs" + ) + mm_processor_kwargs: dict[str, Any] | None = MultiModalConfig.mm_processor_kwargs + disable_mm_preprocessor_cache: bool = False # DEPRECATED + mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb + mm_processor_cache_type: MMCacheType | None = ( + MultiModalConfig.mm_processor_cache_type + ) + mm_shm_cache_max_object_size_mb: int = ( + MultiModalConfig.mm_shm_cache_max_object_size_mb + ) + mm_encoder_tp_mode: MMEncoderTPMode = MultiModalConfig.mm_encoder_tp_mode + mm_encoder_attn_backend: AttentionBackendEnum | str | None = ( + MultiModalConfig.mm_encoder_attn_backend + ) + io_processor_plugin: str | None = None + skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling + video_pruning_rate: float = MultiModalConfig.video_pruning_rate + # LoRA fields + enable_lora: bool = False + max_loras: int = LoRAConfig.max_loras + max_lora_rank: int = LoRAConfig.max_lora_rank + default_mm_loras: dict[str, str] | None = LoRAConfig.default_mm_loras + fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras + max_cpu_loras: int | None = LoRAConfig.max_cpu_loras + lora_dtype: str | torch.dtype | None = LoRAConfig.lora_dtype + lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size + + ray_workers_use_nsight: 
bool = ParallelConfig.ray_workers_use_nsight + num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override + num_lookahead_slots: int = SchedulerConfig.num_lookahead_slots + model_loader_extra_config: dict = get_field(LoadConfig, "model_loader_extra_config") + ignore_patterns: str | list[str] = get_field(LoadConfig, "ignore_patterns") + + enable_chunked_prefill: bool | None = None + disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input + + disable_hybrid_kv_cache_manager: bool = ( + SchedulerConfig.disable_hybrid_kv_cache_manager + ) + + structured_outputs_config: StructuredOutputsConfig = get_field( + VllmConfig, "structured_outputs_config" + ) + reasoning_parser: str = StructuredOutputsConfig.reasoning_parser + reasoning_parser_plugin: str | None = None + # Deprecated guided decoding fields + guided_decoding_backend: str | None = None + guided_decoding_disable_fallback: bool | None = None + guided_decoding_disable_any_whitespace: bool | None = None + guided_decoding_disable_additional_properties: bool | None = None + + logits_processor_pattern: str | None = ModelConfig.logits_processor_pattern + + speculative_config: dict[str, Any] | None = None + + show_hidden_metrics_for_version: str | None = ( + ObservabilityConfig.show_hidden_metrics_for_version + ) + otlp_traces_endpoint: str | None = ObservabilityConfig.otlp_traces_endpoint + collect_detailed_traces: list[DetailedTraceModules] | None = ( + ObservabilityConfig.collect_detailed_traces + ) + scheduling_policy: SchedulerPolicy = SchedulerConfig.policy + scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls + + pooler_config: PoolerConfig | None = ModelConfig.pooler_config + override_pooler_config: dict | PoolerConfig | None = ( + ModelConfig.override_pooler_config + ) + compilation_config: CompilationConfig = get_field(VllmConfig, "compilation_config") + worker_cls: str = ParallelConfig.worker_cls + worker_extension_cls: str = 
ParallelConfig.worker_extension_cls + + kv_transfer_config: KVTransferConfig | None = None + kv_events_config: KVEventsConfig | None = None + + ec_transfer_config: ECTransferConfig | None = None + + generation_config: str = ModelConfig.generation_config + enable_sleep_mode: bool = ModelConfig.enable_sleep_mode + override_generation_config: dict[str, Any] = get_field( + ModelConfig, "override_generation_config" + ) + model_impl: str = ModelConfig.model_impl + override_attention_dtype: str = ModelConfig.override_attention_dtype + + calculate_kv_scales: bool = CacheConfig.calculate_kv_scales + mamba_cache_dtype: MambaDType = CacheConfig.mamba_cache_dtype + mamba_ssm_cache_dtype: MambaDType = CacheConfig.mamba_ssm_cache_dtype + mamba_block_size: int | None = get_field(CacheConfig, "mamba_block_size") + + additional_config: dict[str, Any] = get_field(VllmConfig, "additional_config") + + use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load + pt_load_map_location: str = LoadConfig.pt_load_map_location + + # DEPRECATED + enable_multimodal_encoder_data_parallel: bool = False + + logits_processors: list[str | type[LogitsProcessor]] | None = ( + ModelConfig.logits_processors + ) + """Custom logitproc types""" + + async_scheduling: bool | None = SchedulerConfig.async_scheduling + + stream_interval: int = SchedulerConfig.stream_interval + + kv_sharing_fast_prefill: bool = CacheConfig.kv_sharing_fast_prefill + + kv_offloading_size: float | None = CacheConfig.kv_offloading_size + kv_offloading_backend: KVOffloadingBackend | None = ( + CacheConfig.kv_offloading_backend + ) + tokens_only: bool = False + + def __post_init__(self): + # support `EngineArgs(compilation_config={...})` + # without having to manually construct a + # CompilationConfig object + if isinstance(self.compilation_config, dict): + self.compilation_config = CompilationConfig(**self.compilation_config) + if isinstance(self.eplb_config, dict): + self.eplb_config = EPLBConfig(**self.eplb_config) + # Setup plugins + 
from vllm.plugins import load_general_plugins + + load_general_plugins() + # when use hf offline,replace model id to local model path + if huggingface_hub.constants.HF_HUB_OFFLINE: + model_id = self.model + self.model = get_model_path(self.model, self.revision) + logger.info( + "HF_HUB_OFFLINE is True, replace model_id [%s] to model_path [%s]", + model_id, + self.model, + ) + + @staticmethod + def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: + """Shared CLI arguments for vLLM engine.""" + + # Model arguments + model_kwargs = get_kwargs(ModelConfig) + model_group = parser.add_argument_group( + title="ModelConfig", + description=ModelConfig.__doc__, + ) + if not ("serve" in sys.argv[1:] and "--help" in sys.argv[1:]): + model_group.add_argument("--model", **model_kwargs["model"]) + model_group.add_argument("--runner", **model_kwargs["runner"]) + model_group.add_argument("--convert", **model_kwargs["convert"]) + model_group.add_argument("--task", **model_kwargs["task"], deprecated=True) + model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"]) + model_group.add_argument("--tokenizer-mode", **model_kwargs["tokenizer_mode"]) + model_group.add_argument( + "--trust-remote-code", **model_kwargs["trust_remote_code"] + ) + model_group.add_argument("--dtype", **model_kwargs["dtype"]) + model_group.add_argument("--seed", **model_kwargs["seed"]) + model_group.add_argument("--hf-config-path", **model_kwargs["hf_config_path"]) + model_group.add_argument( + "--allowed-local-media-path", **model_kwargs["allowed_local_media_path"] + ) + model_group.add_argument( + "--allowed-media-domains", **model_kwargs["allowed_media_domains"] + ) + model_group.add_argument("--revision", **model_kwargs["revision"]) + model_group.add_argument("--code-revision", **model_kwargs["code_revision"]) + model_group.add_argument( + "--tokenizer-revision", **model_kwargs["tokenizer_revision"] + ) + model_group.add_argument("--max-model-len", 
**model_kwargs["max_model_len"]) + model_group.add_argument("--quantization", "-q", **model_kwargs["quantization"]) + model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"]) + model_group.add_argument("--max-logprobs", **model_kwargs["max_logprobs"]) + model_group.add_argument("--logprobs-mode", **model_kwargs["logprobs_mode"]) + model_group.add_argument( + "--disable-sliding-window", **model_kwargs["disable_sliding_window"] + ) + model_group.add_argument( + "--disable-cascade-attn", **model_kwargs["disable_cascade_attn"] + ) + model_group.add_argument( + "--skip-tokenizer-init", **model_kwargs["skip_tokenizer_init"] + ) + model_group.add_argument( + "--enable-prompt-embeds", **model_kwargs["enable_prompt_embeds"] + ) + model_group.add_argument( + "--served-model-name", **model_kwargs["served_model_name"] + ) + model_group.add_argument("--config-format", **model_kwargs["config_format"]) + # This one is a special case because it can bool + # or str. TODO: Handle this in get_kwargs + model_group.add_argument( + "--hf-token", + type=str, + nargs="?", + const=True, + default=model_kwargs["hf_token"]["default"], + help=model_kwargs["hf_token"]["help"], + ) + model_group.add_argument("--hf-overrides", **model_kwargs["hf_overrides"]) + model_group.add_argument("--pooler-config", **model_kwargs["pooler_config"]) + model_group.add_argument( + "--override-pooler-config", + **model_kwargs["override_pooler_config"], + deprecated=True, + ) + model_group.add_argument( + "--logits-processor-pattern", **model_kwargs["logits_processor_pattern"] + ) + model_group.add_argument( + "--generation-config", **model_kwargs["generation_config"] + ) + model_group.add_argument( + "--override-generation-config", **model_kwargs["override_generation_config"] + ) + model_group.add_argument( + "--enable-sleep-mode", **model_kwargs["enable_sleep_mode"] + ) + model_group.add_argument("--model-impl", **model_kwargs["model_impl"]) + model_group.add_argument( + 
"--override-attention-dtype", **model_kwargs["override_attention_dtype"] + ) + model_group.add_argument( + "--logits-processors", **model_kwargs["logits_processors"] + ) + model_group.add_argument( + "--io-processor-plugin", **model_kwargs["io_processor_plugin"] + ) + + # Model loading arguments + load_kwargs = get_kwargs(LoadConfig) + load_group = parser.add_argument_group( + title="LoadConfig", + description=LoadConfig.__doc__, + ) + load_group.add_argument("--load-format", **load_kwargs["load_format"]) + load_group.add_argument("--download-dir", **load_kwargs["download_dir"]) + load_group.add_argument( + "--safetensors-load-strategy", **load_kwargs["safetensors_load_strategy"] + ) + load_group.add_argument( + "--model-loader-extra-config", **load_kwargs["model_loader_extra_config"] + ) + load_group.add_argument("--ignore-patterns", **load_kwargs["ignore_patterns"]) + load_group.add_argument("--use-tqdm-on-load", **load_kwargs["use_tqdm_on_load"]) + load_group.add_argument( + "--pt-load-map-location", **load_kwargs["pt_load_map_location"] + ) + + # Structured outputs arguments + structured_outputs_kwargs = get_kwargs(StructuredOutputsConfig) + structured_outputs_group = parser.add_argument_group( + title="StructuredOutputsConfig", + description=StructuredOutputsConfig.__doc__, + ) + structured_outputs_group.add_argument( + "--reasoning-parser", + # Choices need to be validated after parsing to include plugins + **structured_outputs_kwargs["reasoning_parser"], + ) + structured_outputs_group.add_argument( + "--reasoning-parser-plugin", + **structured_outputs_kwargs["reasoning_parser_plugin"], + ) + # Deprecated guided decoding arguments + for arg, type in [ + ("--guided-decoding-backend", str), + ("--guided-decoding-disable-fallback", bool), + ("--guided-decoding-disable-any-whitespace", bool), + ("--guided-decoding-disable-additional-properties", bool), + ]: + structured_outputs_group.add_argument( + arg, + type=type, + help=(f"[DEPRECATED] {arg} will be removed 
in v0.12.0."), + deprecated=True, + ) + + # Parallel arguments + parallel_kwargs = get_kwargs(ParallelConfig) + parallel_group = parser.add_argument_group( + title="ParallelConfig", + description=ParallelConfig.__doc__, + ) + parallel_group.add_argument( + "--distributed-executor-backend", + **parallel_kwargs["distributed_executor_backend"], + ) + parallel_group.add_argument( + "--pipeline-parallel-size", + "-pp", + **parallel_kwargs["pipeline_parallel_size"], + ) + parallel_group.add_argument("--master-addr", **parallel_kwargs["master_addr"]) + parallel_group.add_argument("--master-port", **parallel_kwargs["master_port"]) + parallel_group.add_argument("--nnodes", "-n", **parallel_kwargs["nnodes"]) + parallel_group.add_argument("--node-rank", "-r", **parallel_kwargs["node_rank"]) + parallel_group.add_argument( + "--tensor-parallel-size", "-tp", **parallel_kwargs["tensor_parallel_size"] + ) + parallel_group.add_argument( + "--decode-context-parallel-size", + "-dcp", + **parallel_kwargs["decode_context_parallel_size"], + ) + parallel_group.add_argument( + "--dcp-kv-cache-interleave-size", + **parallel_kwargs["dcp_kv_cache_interleave_size"], + ) + parallel_group.add_argument( + "--data-parallel-size", "-dp", **parallel_kwargs["data_parallel_size"] + ) + parallel_group.add_argument( + "--data-parallel-rank", + "-dpn", + type=int, + help="Data parallel rank of this instance. 
" + "When set, enables external load balancer mode.", + ) + parallel_group.add_argument( + "--data-parallel-start-rank", + "-dpr", + type=int, + help="Starting data parallel rank for secondary nodes.", + ) + parallel_group.add_argument( + "--data-parallel-size-local", + "-dpl", + type=int, + help="Number of data parallel replicas to run on this node.", + ) + parallel_group.add_argument( + "--data-parallel-address", + "-dpa", + type=str, + help="Address of data parallel cluster head-node.", + ) + parallel_group.add_argument( + "--data-parallel-rpc-port", + "-dpp", + type=int, + help="Port for data parallel RPC communication.", + ) + parallel_group.add_argument( + "--data-parallel-backend", + "-dpb", + type=str, + default="mp", + help='Backend for data parallel, either "mp" or "ray".', + ) + parallel_group.add_argument( + "--data-parallel-hybrid-lb", + "-dph", + **parallel_kwargs["data_parallel_hybrid_lb"], + ) + parallel_group.add_argument( + "--data-parallel-external-lb", + "-dpe", + **parallel_kwargs["data_parallel_external_lb"], + ) + parallel_group.add_argument( + "--enable-expert-parallel", **parallel_kwargs["enable_expert_parallel"] + ) + parallel_group.add_argument( + "--all2all-backend", **parallel_kwargs["all2all_backend"] + ) + parallel_group.add_argument("--enable-dbo", **parallel_kwargs["enable_dbo"]) + parallel_group.add_argument( + "--dbo-decode-token-threshold", + **parallel_kwargs["dbo_decode_token_threshold"], + ) + parallel_group.add_argument( + "--dbo-prefill-token-threshold", + **parallel_kwargs["dbo_prefill_token_threshold"], + ) + parallel_group.add_argument( + "--disable-nccl-for-dp-synchronization", + **parallel_kwargs["disable_nccl_for_dp_synchronization"], + ) + parallel_group.add_argument("--enable-eplb", **parallel_kwargs["enable_eplb"]) + parallel_group.add_argument("--eplb-config", **parallel_kwargs["eplb_config"]) + parallel_group.add_argument( + "--expert-placement-strategy", + **parallel_kwargs["expert_placement_strategy"], + ) + 
parallel_group.add_argument( + "--num-redundant-experts", + type=int, + help="[DEPRECATED] --num-redundant-experts will be removed in v0.12.0.", + deprecated=True, + ) + parallel_group.add_argument( + "--eplb-window-size", + type=int, + help="[DEPRECATED] --eplb-window-size will be removed in v0.12.0.", + deprecated=True, + ) + parallel_group.add_argument( + "--eplb-step-interval", + type=int, + help="[DEPRECATED] --eplb-step-interval will be removed in v0.12.0.", + deprecated=True, + ) + parallel_group.add_argument( + "--eplb-log-balancedness", + action=argparse.BooleanOptionalAction, + help="[DEPRECATED] --eplb-log-balancedness will be removed in v0.12.0.", + deprecated=True, + ) + + parallel_group.add_argument( + "--max-parallel-loading-workers", + **parallel_kwargs["max_parallel_loading_workers"], + ) + parallel_group.add_argument( + "--ray-workers-use-nsight", **parallel_kwargs["ray_workers_use_nsight"] + ) + parallel_group.add_argument( + "--disable-custom-all-reduce", + **parallel_kwargs["disable_custom_all_reduce"], + ) + parallel_group.add_argument("--worker-cls", **parallel_kwargs["worker_cls"]) + parallel_group.add_argument( + "--worker-extension-cls", **parallel_kwargs["worker_extension_cls"] + ) + parallel_group.add_argument( + "--enable-multimodal-encoder-data-parallel", + action="store_true", + deprecated=True, + ) + + # KV cache arguments + cache_kwargs = get_kwargs(CacheConfig) + cache_group = parser.add_argument_group( + title="CacheConfig", + description=CacheConfig.__doc__, + ) + cache_group.add_argument("--block-size", **cache_kwargs["block_size"]) + cache_group.add_argument( + "--gpu-memory-utilization", **cache_kwargs["gpu_memory_utilization"] + ) + cache_group.add_argument( + "--kv-cache-memory-bytes", **cache_kwargs["kv_cache_memory_bytes"] + ) + cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"]) + cache_group.add_argument("--kv-cache-dtype", **cache_kwargs["cache_dtype"]) + cache_group.add_argument( + 
"--num-gpu-blocks-override", **cache_kwargs["num_gpu_blocks_override"] + ) + cache_group.add_argument( + "--enable-prefix-caching", **cache_kwargs["enable_prefix_caching"] + ) + cache_group.add_argument( + "--prefix-caching-hash-algo", **cache_kwargs["prefix_caching_hash_algo"] + ) + cache_group.add_argument("--cpu-offload-gb", **cache_kwargs["cpu_offload_gb"]) + cache_group.add_argument( + "--calculate-kv-scales", **cache_kwargs["calculate_kv_scales"] + ) + cache_group.add_argument( + "--kv-sharing-fast-prefill", **cache_kwargs["kv_sharing_fast_prefill"] + ) + cache_group.add_argument( + "--mamba-cache-dtype", **cache_kwargs["mamba_cache_dtype"] + ) + cache_group.add_argument( + "--mamba-ssm-cache-dtype", **cache_kwargs["mamba_ssm_cache_dtype"] + ) + cache_group.add_argument( + "--mamba-block-size", **cache_kwargs["mamba_block_size"] + ) + cache_group.add_argument( + "--kv-offloading-size", **cache_kwargs["kv_offloading_size"] + ) + cache_group.add_argument( + "--kv-offloading-backend", **cache_kwargs["kv_offloading_backend"] + ) + + # Multimodal related configs + multimodal_kwargs = get_kwargs(MultiModalConfig) + multimodal_group = parser.add_argument_group( + title="MultiModalConfig", + description=MultiModalConfig.__doc__, + ) + multimodal_group.add_argument( + "--limit-mm-per-prompt", **multimodal_kwargs["limit_per_prompt"] + ) + multimodal_group.add_argument( + "--enable-mm-embeds", **multimodal_kwargs["enable_mm_embeds"] + ) + multimodal_group.add_argument( + "--media-io-kwargs", **multimodal_kwargs["media_io_kwargs"] + ) + multimodal_group.add_argument( + "--mm-processor-kwargs", **multimodal_kwargs["mm_processor_kwargs"] + ) + multimodal_group.add_argument( + "--mm-processor-cache-gb", **multimodal_kwargs["mm_processor_cache_gb"] + ) + multimodal_group.add_argument( + "--disable-mm-preprocessor-cache", action="store_true", deprecated=True + ) + multimodal_group.add_argument( + "--mm-processor-cache-type", **multimodal_kwargs["mm_processor_cache_type"] + ) 
+ multimodal_group.add_argument( + "--mm-shm-cache-max-object-size-mb", + **multimodal_kwargs["mm_shm_cache_max_object_size_mb"], + ) + multimodal_group.add_argument( + "--mm-encoder-tp-mode", **multimodal_kwargs["mm_encoder_tp_mode"] + ) + multimodal_group.add_argument( + "--mm-encoder-attn-backend", + **multimodal_kwargs["mm_encoder_attn_backend"], + ) + multimodal_group.add_argument( + "--interleave-mm-strings", **multimodal_kwargs["interleave_mm_strings"] + ) + multimodal_group.add_argument( + "--skip-mm-profiling", **multimodal_kwargs["skip_mm_profiling"] + ) + + multimodal_group.add_argument( + "--video-pruning-rate", **multimodal_kwargs["video_pruning_rate"] + ) + + # LoRA related configs + lora_kwargs = get_kwargs(LoRAConfig) + lora_group = parser.add_argument_group( + title="LoRAConfig", + description=LoRAConfig.__doc__, + ) + lora_group.add_argument( + "--enable-lora", + action=argparse.BooleanOptionalAction, + help="If True, enable handling of LoRA adapters.", + ) + lora_group.add_argument("--max-loras", **lora_kwargs["max_loras"]) + lora_group.add_argument("--max-lora-rank", **lora_kwargs["max_lora_rank"]) + lora_group.add_argument( + "--lora-extra-vocab-size", **lora_kwargs["lora_extra_vocab_size"] + ) + lora_group.add_argument( + "--lora-dtype", + **lora_kwargs["lora_dtype"], + ) + lora_group.add_argument("--max-cpu-loras", **lora_kwargs["max_cpu_loras"]) + lora_group.add_argument( + "--fully-sharded-loras", **lora_kwargs["fully_sharded_loras"] + ) + lora_group.add_argument("--default-mm-loras", **lora_kwargs["default_mm_loras"]) + + # Observability arguments + observability_kwargs = get_kwargs(ObservabilityConfig) + observability_group = parser.add_argument_group( + title="ObservabilityConfig", + description=ObservabilityConfig.__doc__, + ) + observability_group.add_argument( + "--show-hidden-metrics-for-version", + **observability_kwargs["show_hidden_metrics_for_version"], + ) + observability_group.add_argument( + "--otlp-traces-endpoint", 
**observability_kwargs["otlp_traces_endpoint"] + ) + # TODO: generalise this special case + choices = observability_kwargs["collect_detailed_traces"]["choices"] + metavar = f"{{{','.join(choices)}}}" + observability_kwargs["collect_detailed_traces"]["metavar"] = metavar + observability_kwargs["collect_detailed_traces"]["choices"] += [ + ",".join(p) for p in permutations(get_args(DetailedTraceModules), r=2) + ] + observability_group.add_argument( + "--collect-detailed-traces", + **observability_kwargs["collect_detailed_traces"], + ) + + # Scheduler arguments + scheduler_kwargs = get_kwargs(SchedulerConfig) + scheduler_group = parser.add_argument_group( + title="SchedulerConfig", + description=SchedulerConfig.__doc__, + ) + scheduler_group.add_argument( + "--max-num-batched-tokens", + **{ + **scheduler_kwargs["max_num_batched_tokens"], + "default": None, + }, + ) + scheduler_group.add_argument( + "--max-num-seqs", + **{ + **scheduler_kwargs["max_num_seqs"], + "default": None, + }, + ) + scheduler_group.add_argument( + "--max-num-partial-prefills", **scheduler_kwargs["max_num_partial_prefills"] + ) + scheduler_group.add_argument( + "--max-long-partial-prefills", + **scheduler_kwargs["max_long_partial_prefills"], + ) + scheduler_group.add_argument( + "--long-prefill-token-threshold", + **scheduler_kwargs["long_prefill_token_threshold"], + ) + scheduler_group.add_argument( + "--num-lookahead-slots", **scheduler_kwargs["num_lookahead_slots"] + ) + # multi-step scheduling has been removed; corresponding arguments + # are no longer supported. 
+ scheduler_group.add_argument( + "--scheduling-policy", **scheduler_kwargs["policy"] + ) + scheduler_group.add_argument( + "--enable-chunked-prefill", + **{ + **scheduler_kwargs["enable_chunked_prefill"], + "default": None, + }, + ) + scheduler_group.add_argument( + "--disable-chunked-mm-input", **scheduler_kwargs["disable_chunked_mm_input"] + ) + scheduler_group.add_argument( + "--scheduler-cls", **scheduler_kwargs["scheduler_cls"] + ) + scheduler_group.add_argument( + "--disable-hybrid-kv-cache-manager", + **scheduler_kwargs["disable_hybrid_kv_cache_manager"], + ) + scheduler_group.add_argument( + "--async-scheduling", **scheduler_kwargs["async_scheduling"] + ) + scheduler_group.add_argument( + "--stream-interval", **scheduler_kwargs["stream_interval"] + ) + + # Compilation arguments + compilation_kwargs = get_kwargs(CompilationConfig) + compilation_group = parser.add_argument_group( + title="CompilationConfig", + description=CompilationConfig.__doc__, + ) + compilation_group.add_argument( + "--cudagraph-capture-sizes", **compilation_kwargs["cudagraph_capture_sizes"] + ) + compilation_kwargs["cudagraph_capture_sizes"]["help"] = ( + "--cuda-graph-sizes is deprecated and will be removed in v0.13.0 or v1.0.0," + " whichever is soonest. Please use --cudagraph-capture-sizes instead." + ) + compilation_group.add_argument( + "--cuda-graph-sizes", + **compilation_kwargs["cudagraph_capture_sizes"], + deprecated=True, + ) + compilation_group.add_argument( + "--max-cudagraph-capture-size", + **compilation_kwargs["max_cudagraph_capture_size"], + ) + + # vLLM arguments + vllm_kwargs = get_kwargs(VllmConfig) + vllm_group = parser.add_argument_group( + title="VllmConfig", + description=VllmConfig.__doc__, + ) + # We construct SpeculativeConfig using fields from other configs in + # create_engine_config. So we set the type to a JSON string here to + # delay the Pydantic validation that comes with SpeculativeConfig. 
+ vllm_kwargs["speculative_config"]["type"] = optional_type(json.loads) + vllm_group.add_argument( + "--speculative-config", **vllm_kwargs["speculative_config"] + ) + vllm_group.add_argument( + "--kv-transfer-config", **vllm_kwargs["kv_transfer_config"] + ) + vllm_group.add_argument("--kv-events-config", **vllm_kwargs["kv_events_config"]) + vllm_group.add_argument( + "--ec-transfer-config", **vllm_kwargs["ec_transfer_config"] + ) + vllm_group.add_argument( + "--compilation-config", "-O", **vllm_kwargs["compilation_config"] + ) + vllm_group.add_argument( + "--additional-config", **vllm_kwargs["additional_config"] + ) + vllm_group.add_argument( + "--structured-outputs-config", **vllm_kwargs["structured_outputs_config"] + ) + + # Other arguments + parser.add_argument( + "--disable-log-stats", + action="store_true", + help="Disable logging statistics.", + ) + + parser.add_argument( + "--aggregate-engine-logging", + action="store_true", + help="Log aggregate rather than per-engine statistics " + "when using data parallelism.", + ) + return parser + + @classmethod + def from_cli_args(cls, args: argparse.Namespace): + # Get the list of attributes of this dataclass. + attrs = [attr.name for attr in dataclasses.fields(cls)] + # Set the attributes from the parsed arguments. + engine_args = cls( + **{attr: getattr(args, attr) for attr in attrs if hasattr(args, attr)} + ) + return engine_args + + def create_model_config(self) -> ModelConfig: + # gguf file needs a specific model loader and doesn't use hf_repo + if check_gguf_file(self.model): + self.quantization = self.load_format = "gguf" + + if self.disable_mm_preprocessor_cache: + logger.warning( + "`--disable-mm-preprocessor-cache` is deprecated " + "and will be removed in v0.13. " + "Please use `--mm-processor-cache-gb 0` instead.", + ) + + self.mm_processor_cache_gb = 0 + elif envs.VLLM_MM_INPUT_CACHE_GIB != 4: + logger.warning( + "VLLM_MM_INPUT_CACHE_GIB` is deprecated " + "and will be removed in v0.13. 
" + "Please use `--mm-processor-cache-gb %d` instead.", + envs.VLLM_MM_INPUT_CACHE_GIB, + ) + + self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB + + if self.enable_multimodal_encoder_data_parallel: + logger.warning( + "--enable-multimodal-encoder-data-parallel` is deprecated " + "and will be removed in v0.13. " + "Please use `--mm-encoder-tp-mode data` instead." + ) + + self.mm_encoder_tp_mode = "data" + + return ModelConfig( + model=self.model, + hf_config_path=self.hf_config_path, + runner=self.runner, + convert=self.convert, + task=self.task, + tokenizer=self.tokenizer, + tokenizer_mode=self.tokenizer_mode, + trust_remote_code=self.trust_remote_code, + allowed_local_media_path=self.allowed_local_media_path, + allowed_media_domains=self.allowed_media_domains, + dtype=self.dtype, + seed=self.seed, + revision=self.revision, + code_revision=self.code_revision, + hf_token=self.hf_token, + hf_overrides=self.hf_overrides, + tokenizer_revision=self.tokenizer_revision, + max_model_len=self.max_model_len, + quantization=self.quantization, + enforce_eager=self.enforce_eager, + max_logprobs=self.max_logprobs, + logprobs_mode=self.logprobs_mode, + disable_sliding_window=self.disable_sliding_window, + disable_cascade_attn=self.disable_cascade_attn, + skip_tokenizer_init=self.skip_tokenizer_init, + enable_prompt_embeds=self.enable_prompt_embeds, + served_model_name=self.served_model_name, + limit_mm_per_prompt=self.limit_mm_per_prompt, + enable_mm_embeds=self.enable_mm_embeds, + interleave_mm_strings=self.interleave_mm_strings, + media_io_kwargs=self.media_io_kwargs, + skip_mm_profiling=self.skip_mm_profiling, + config_format=self.config_format, + mm_processor_kwargs=self.mm_processor_kwargs, + mm_processor_cache_gb=self.mm_processor_cache_gb, + mm_processor_cache_type=self.mm_processor_cache_type, + mm_shm_cache_max_object_size_mb=self.mm_shm_cache_max_object_size_mb, + mm_encoder_tp_mode=self.mm_encoder_tp_mode, + 
mm_encoder_attn_backend=self.mm_encoder_attn_backend, + pooler_config=self.pooler_config, + override_pooler_config=self.override_pooler_config, + logits_processor_pattern=self.logits_processor_pattern, + generation_config=self.generation_config, + override_generation_config=self.override_generation_config, + enable_sleep_mode=self.enable_sleep_mode, + model_impl=self.model_impl, + override_attention_dtype=self.override_attention_dtype, + logits_processors=self.logits_processors, + video_pruning_rate=self.video_pruning_rate, + io_processor_plugin=self.io_processor_plugin, + ) + + def validate_tensorizer_args(self): + from vllm.model_executor.model_loader.tensorizer import TensorizerConfig + + for key in self.model_loader_extra_config: + if key in TensorizerConfig._fields: + self.model_loader_extra_config["tensorizer_config"][key] = ( + self.model_loader_extra_config[key] + ) + + def create_load_config(self) -> LoadConfig: + if self.quantization == "bitsandbytes": + self.load_format = "bitsandbytes" + + if self.load_format == "tensorizer": + if hasattr(self.model_loader_extra_config, "to_serializable"): + self.model_loader_extra_config = ( + self.model_loader_extra_config.to_serializable() + ) + self.model_loader_extra_config["tensorizer_config"] = {} + self.model_loader_extra_config["tensorizer_config"]["tensorizer_dir"] = ( + self.model + ) + self.validate_tensorizer_args() + + return LoadConfig( + load_format=self.load_format, + download_dir=self.download_dir, + safetensors_load_strategy=self.safetensors_load_strategy, + device="cpu" if is_online_quantization(self.quantization) else None, + model_loader_extra_config=self.model_loader_extra_config, + ignore_patterns=self.ignore_patterns, + use_tqdm_on_load=self.use_tqdm_on_load, + pt_load_map_location=self.pt_load_map_location, + ) + + def create_speculative_config( + self, + target_model_config: ModelConfig, + target_parallel_config: ParallelConfig, + ) -> SpeculativeConfig | None: + """Initializes and returns a 
SpeculativeConfig object based on + `speculative_config`. + + This function utilizes `speculative_config` to create a + SpeculativeConfig object. The `speculative_config` can either be + provided as a JSON string input via CLI arguments or directly as a + dictionary from the engine. + """ + if self.speculative_config is None: + return None + + # Note(Shangming): These parameters are not obtained from the cli arg + # '--speculative-config' and must be passed in when creating the engine + # config. + self.speculative_config.update( + { + "target_model_config": target_model_config, + "target_parallel_config": target_parallel_config, + } + ) + return SpeculativeConfig(**self.speculative_config) + + def create_engine_config( + self, + usage_context: UsageContext | None = None, + headless: bool = False, + ) -> VllmConfig: + """ + Create the VllmConfig. + + NOTE: If VllmConfig is incompatible, we raise an error. + """ + current_platform.pre_register_and_update() + + device_config = DeviceConfig(device=cast(Device, current_platform.device_type)) + + # Check if the model is a speculator and override model/tokenizer/config + # BEFORE creating ModelConfig, so the config is created with the target model + # Skip speculator detection for cloud storage models (eg: S3, GCS) since + # HuggingFace cannot load configs directly from S3 URLs. S3 models can still + # use speculators with explicit --speculative-config. + if not is_cloud_storage(self.model): + (self.model, self.tokenizer, self.speculative_config) = ( + maybe_override_with_speculators( + model=self.model, + tokenizer=self.tokenizer, + revision=self.revision, + trust_remote_code=self.trust_remote_code, + vllm_speculative_config=self.speculative_config, + ) + ) + + model_config = self.create_model_config() + self.model = model_config.model + self.tokenizer = model_config.tokenizer + + self._check_feature_supported(model_config) + + # Set default arguments for V1 Engine. 
+ self._set_default_args(usage_context, model_config) + # Disable chunked prefill and prefix caching for: + # POWER (ppc64le)/ARM/s390x/RISCV CPUs in V1 + if current_platform.is_cpu() and current_platform.get_cpu_architecture() in ( + CpuArchEnum.POWERPC, + CpuArchEnum.S390X, + CpuArchEnum.ARM, + CpuArchEnum.RISCV, + ): + logger.info( + "Chunked prefill is not supported for ARM and POWER, " + "S390X and RISC-V CPUs; " + "disabling it for V1 backend." + ) + self.enable_chunked_prefill = False + logger.info( + "Prefix caching is not supported for ARM and POWER, " + "S390X and RISC-V CPUs; " + "disabling it for V1 backend." + ) + self.enable_prefix_caching = False + + assert self.enable_chunked_prefill is not None + + sliding_window: int | None = None + if not is_interleaved(model_config.hf_text_config): + # Only set CacheConfig.sliding_window if the model is all sliding + # window. Otherwise CacheConfig.sliding_window will override the + # global layers in interleaved sliding window models. + sliding_window = model_config.get_sliding_window() + + # Note(hc): In the current implementation of decode context + # parallel(DCP), tp_size needs to be divisible by dcp_size, + # because the world size does not change by dcp, it simply + # reuses the GPUs of TP group, and split one TP group into + # tp_size//dcp_size DCP groups. + assert self.tensor_parallel_size % self.decode_context_parallel_size == 0, ( + f"tp_size={self.tensor_parallel_size} must be divisible by " + f"dcp_size={self.decode_context_parallel_size}." 
+ ) + + cache_config = CacheConfig( + block_size=self.block_size, + gpu_memory_utilization=self.gpu_memory_utilization, + kv_cache_memory_bytes=self.kv_cache_memory_bytes, + swap_space=self.swap_space, + cache_dtype=self.kv_cache_dtype, + is_attention_free=model_config.is_attention_free, + num_gpu_blocks_override=self.num_gpu_blocks_override, + sliding_window=sliding_window, + enable_prefix_caching=self.enable_prefix_caching, + prefix_caching_hash_algo=self.prefix_caching_hash_algo, + cpu_offload_gb=self.cpu_offload_gb, + calculate_kv_scales=self.calculate_kv_scales, + kv_sharing_fast_prefill=self.kv_sharing_fast_prefill, + mamba_cache_dtype=self.mamba_cache_dtype, + mamba_ssm_cache_dtype=self.mamba_ssm_cache_dtype, + mamba_block_size=self.mamba_block_size, + kv_offloading_size=self.kv_offloading_size, + kv_offloading_backend=self.kv_offloading_backend, + ) + + ray_runtime_env = None + if is_ray_initialized(): + # Ray Serve LLM calls `create_engine_config` in the context + # of a Ray task, therefore we check is_ray_initialized() + # as opposed to is_in_ray_actor(). + import ray + + ray_runtime_env = ray.get_runtime_context().runtime_env + # Avoid logging sensitive environment variables + sanitized_env = ray_runtime_env.to_dict() if ray_runtime_env else {} + if "env_vars" in sanitized_env: + sanitized_env["env_vars"] = { + k: "***" for k in sanitized_env["env_vars"] + } + logger.info("Using ray runtime env (env vars redacted): %s", sanitized_env) + + # Get the current placement group if Ray is initialized and + # we are in a Ray actor. If so, then the placement group will be + # passed to spawned processes. + placement_group = None + if is_in_ray_actor(): + import ray + + # This call initializes Ray automatically if it is not initialized, + # but we should not do this here. 
+ placement_group = ray.util.get_current_placement_group() + + assert not headless or not self.data_parallel_hybrid_lb, ( + "data_parallel_hybrid_lb is not applicable in headless mode" + ) + assert not (self.data_parallel_hybrid_lb and self.data_parallel_external_lb), ( + "data_parallel_hybrid_lb and data_parallel_external_lb cannot both be True." + ) + assert self.data_parallel_backend == "mp" or self.nnodes == 1, ( + "nnodes > 1 is only supported with data_parallel_backend=mp" + ) + inferred_data_parallel_rank = 0 + if self.nnodes > 1: + world_size = ( + self.data_parallel_size + * self.pipeline_parallel_size + * self.tensor_parallel_size + ) + world_size_within_dp = ( + self.pipeline_parallel_size * self.tensor_parallel_size + ) + local_world_size = world_size // self.nnodes + assert world_size % self.nnodes == 0, ( + f"world_size={world_size} must be divisible by nnodes={self.nnodes}." + ) + assert self.node_rank < self.nnodes, ( + f"node_rank={self.node_rank} must be less than nnodes={self.nnodes}." + ) + inferred_data_parallel_rank = ( + self.node_rank * local_world_size + ) // world_size_within_dp + if self.data_parallel_size > 1 and self.data_parallel_external_lb: + self.data_parallel_rank = inferred_data_parallel_rank + logger.info( + "Inferred data_parallel_rank %d from node_rank %d for external lb", + self.data_parallel_rank, + self.node_rank, + ) + elif self.data_parallel_size_local is None: + # Infer data parallel size local for internal dplb: + self.data_parallel_size_local = max( + local_world_size // world_size_within_dp, 1 + ) + data_parallel_external_lb = ( + self.data_parallel_external_lb or self.data_parallel_rank is not None + ) + # Local DP rank = 1, use pure-external LB. + if data_parallel_external_lb: + assert self.data_parallel_rank is not None, ( + "data_parallel_rank or node_rank must be specified if " + "data_parallel_external_lb is enabled." 
+ ) + assert self.data_parallel_size_local in (1, None), ( + "data_parallel_size_local must be 1 or None when data_parallel_rank " + "is set" + ) + data_parallel_size_local = 1 + # Use full external lb if we have local_size of 1. + self.data_parallel_hybrid_lb = False + elif self.data_parallel_size_local is not None: + data_parallel_size_local = self.data_parallel_size_local + + if self.data_parallel_start_rank and not headless: + # Infer hybrid LB mode. + self.data_parallel_hybrid_lb = True + + if self.data_parallel_hybrid_lb and data_parallel_size_local == 1: + # Use full external lb if we have local_size of 1. + logger.warning( + "data_parallel_hybrid_lb is not eligible when " + "data_parallel_size_local = 1, autoswitch to " + "data_parallel_external_lb." + ) + data_parallel_external_lb = True + self.data_parallel_hybrid_lb = False + + if data_parallel_size_local == self.data_parallel_size: + # Disable hybrid LB mode if set for a single node + self.data_parallel_hybrid_lb = False + + self.data_parallel_rank = ( + self.data_parallel_start_rank or inferred_data_parallel_rank + ) + if self.nnodes > 1: + logger.info( + "Inferred data_parallel_rank %d from node_rank %d", + self.data_parallel_rank, + self.node_rank, + ) + else: + assert not self.data_parallel_hybrid_lb, ( + "data_parallel_size_local must be set to use data_parallel_hybrid_lb." + ) + + if self.data_parallel_backend == "ray" and ( + envs.VLLM_RAY_DP_PACK_STRATEGY == "span" + ): + # Data parallel size defaults to 1 if DP ranks are spanning + # multiple nodes + data_parallel_size_local = 1 + else: + # Otherwise local DP size defaults to global DP size if not set + data_parallel_size_local = self.data_parallel_size + + # DP address, used in multi-node case for torch distributed group + # and ZMQ sockets. 
+ if self.data_parallel_address is None: + if self.data_parallel_backend == "ray": + host_ip = get_ip() + logger.info( + "Using host IP %s as ray-based data parallel address", host_ip + ) + data_parallel_address = host_ip + else: + assert self.data_parallel_backend == "mp", ( + "data_parallel_backend can only be ray or mp, got " + f"{self.data_parallel_backend}" + ) + data_parallel_address = ( + self.master_addr or ParallelConfig.data_parallel_master_ip + ) + else: + data_parallel_address = self.data_parallel_address + + # This port is only used when there are remote data parallel engines, + # otherwise the local IPC transport is used. + data_parallel_rpc_port = ( + self.data_parallel_rpc_port + if (self.data_parallel_rpc_port is not None) + else ParallelConfig.data_parallel_rpc_port + ) + + if self.tokens_only and not model_config.skip_tokenizer_init: + model_config.skip_tokenizer_init = True + logger.info("Skipping tokenizer initialization for tokens-only mode.") + + # Forward the deprecated CLI args to the EPLB config. 
+ if self.num_redundant_experts is not None: + self.eplb_config.num_redundant_experts = self.num_redundant_experts + if self.eplb_window_size is not None: + self.eplb_config.window_size = self.eplb_window_size + if self.eplb_step_interval is not None: + self.eplb_config.step_interval = self.eplb_step_interval + if self.eplb_log_balancedness is not None: + self.eplb_config.log_balancedness = self.eplb_log_balancedness + + parallel_config = ParallelConfig( + pipeline_parallel_size=self.pipeline_parallel_size, + tensor_parallel_size=self.tensor_parallel_size, + data_parallel_size=self.data_parallel_size, + data_parallel_rank=self.data_parallel_rank or 0, + data_parallel_external_lb=data_parallel_external_lb, + data_parallel_size_local=data_parallel_size_local, + master_addr=self.master_addr, + master_port=self.master_port, + nnodes=self.nnodes, + node_rank=self.node_rank, + data_parallel_master_ip=data_parallel_address, + data_parallel_rpc_port=data_parallel_rpc_port, + data_parallel_backend=self.data_parallel_backend, + data_parallel_hybrid_lb=self.data_parallel_hybrid_lb, + enable_expert_parallel=self.enable_expert_parallel, + all2all_backend=self.all2all_backend, + enable_dbo=self.enable_dbo, + dbo_decode_token_threshold=self.dbo_decode_token_threshold, + dbo_prefill_token_threshold=self.dbo_prefill_token_threshold, + disable_nccl_for_dp_synchronization=self.disable_nccl_for_dp_synchronization, + enable_eplb=self.enable_eplb, + eplb_config=self.eplb_config, + expert_placement_strategy=self.expert_placement_strategy, + max_parallel_loading_workers=self.max_parallel_loading_workers, + disable_custom_all_reduce=self.disable_custom_all_reduce, + ray_workers_use_nsight=self.ray_workers_use_nsight, + ray_runtime_env=ray_runtime_env, + placement_group=placement_group, + distributed_executor_backend=self.distributed_executor_backend, + worker_cls=self.worker_cls, + worker_extension_cls=self.worker_extension_cls, + 
decode_context_parallel_size=self.decode_context_parallel_size, + dcp_kv_cache_interleave_size=self.dcp_kv_cache_interleave_size, + _api_process_count=self._api_process_count, + _api_process_rank=self._api_process_rank, + ) + + speculative_config = self.create_speculative_config( + target_model_config=model_config, + target_parallel_config=parallel_config, + ) + + # make sure num_lookahead_slots is set appropriately depending on + # whether speculative decoding is enabled + num_lookahead_slots = self.num_lookahead_slots + if speculative_config is not None: + num_lookahead_slots = speculative_config.num_lookahead_slots + + scheduler_config = SchedulerConfig( + runner_type=model_config.runner_type, + max_num_batched_tokens=self.max_num_batched_tokens, + max_num_seqs=self.max_num_seqs, + max_model_len=model_config.max_model_len, + num_lookahead_slots=num_lookahead_slots, + enable_chunked_prefill=self.enable_chunked_prefill, + disable_chunked_mm_input=self.disable_chunked_mm_input, + is_multimodal_model=model_config.is_multimodal_model, + is_encoder_decoder=model_config.is_encoder_decoder, + policy=self.scheduling_policy, + scheduler_cls=self.scheduler_cls, + max_num_partial_prefills=self.max_num_partial_prefills, + max_long_partial_prefills=self.max_long_partial_prefills, + long_prefill_token_threshold=self.long_prefill_token_threshold, + disable_hybrid_kv_cache_manager=self.disable_hybrid_kv_cache_manager, + async_scheduling=self.async_scheduling, + stream_interval=self.stream_interval, + ) + + if not model_config.is_multimodal_model and self.default_mm_loras: + raise ValueError( + "Default modality-specific LoRA(s) were provided for a " + "non multimodal model" + ) + + lora_config = ( + LoRAConfig( + max_lora_rank=self.max_lora_rank, + max_loras=self.max_loras, + default_mm_loras=self.default_mm_loras, + fully_sharded_loras=self.fully_sharded_loras, + lora_extra_vocab_size=self.lora_extra_vocab_size, + lora_dtype=self.lora_dtype, + max_cpu_loras=self.max_cpu_loras + 
if self.max_cpu_loras and self.max_cpu_loras > 0 + else None, + ) + if self.enable_lora + else None + ) + + if ( + lora_config is not None + and speculative_config is not None + and scheduler_config.max_num_batched_tokens + < ( + scheduler_config.max_num_seqs + * (speculative_config.num_speculative_tokens + 1) + ) + ): + raise ValueError( + "Consider increasing max_num_batched_tokens or " + "decreasing num_speculative_tokens" + ) + + # bitsandbytes pre-quantized model need a specific model loader + if model_config.quantization == "bitsandbytes": + self.quantization = self.load_format = "bitsandbytes" + + load_config = self.create_load_config() + + # Pass reasoning_parser into StructuredOutputsConfig + if self.reasoning_parser: + self.structured_outputs_config.reasoning_parser = self.reasoning_parser + + if self.reasoning_parser_plugin: + self.structured_outputs_config.reasoning_parser_plugin = ( + self.reasoning_parser_plugin + ) + + # Forward the deprecated CLI args to the StructuredOutputsConfig + so_config = self.structured_outputs_config + if self.guided_decoding_backend is not None: + so_config.guided_decoding_backend = self.guided_decoding_backend + if self.guided_decoding_disable_fallback is not None: + so_config.disable_fallback = self.guided_decoding_disable_fallback + if self.guided_decoding_disable_any_whitespace is not None: + so_config.disable_any_whitespace = ( + self.guided_decoding_disable_any_whitespace + ) + if self.guided_decoding_disable_additional_properties is not None: + so_config.disable_additional_properties = ( + self.guided_decoding_disable_additional_properties + ) + + observability_config = ObservabilityConfig( + show_hidden_metrics_for_version=self.show_hidden_metrics_for_version, + otlp_traces_endpoint=self.otlp_traces_endpoint, + collect_detailed_traces=self.collect_detailed_traces, + ) + + # Compilation config overrides + compilation_config = copy.deepcopy(self.compilation_config) + if self.cuda_graph_sizes is not None: + 
logger.warning( + "--cuda-graph-sizes is deprecated and will be removed in v0.13.0 or " + "v1.0.0, whichever is soonest. Please use --cudagraph-capture-sizes " + "instead." + ) + if compilation_config.cudagraph_capture_sizes is not None: + raise ValueError( + "cuda_graph_sizes and compilation_config." + "cudagraph_capture_sizes are mutually exclusive" + ) + compilation_config.cudagraph_capture_sizes = self.cuda_graph_sizes + if self.cudagraph_capture_sizes is not None: + if compilation_config.cudagraph_capture_sizes is not None: + raise ValueError( + "cudagraph_capture_sizes and compilation_config." + "cudagraph_capture_sizes are mutually exclusive" + ) + compilation_config.cudagraph_capture_sizes = self.cudagraph_capture_sizes + if self.max_cudagraph_capture_size is not None: + if compilation_config.max_cudagraph_capture_size is not None: + raise ValueError( + "max_cudagraph_capture_size and compilation_config." + "max_cudagraph_capture_size are mutually exclusive" + ) + compilation_config.max_cudagraph_capture_size = ( + self.max_cudagraph_capture_size + ) + + config = VllmConfig( + model_config=model_config, + cache_config=cache_config, + parallel_config=parallel_config, + scheduler_config=scheduler_config, + device_config=device_config, + lora_config=lora_config, + speculative_config=speculative_config, + load_config=load_config, + structured_outputs_config=self.structured_outputs_config, + observability_config=observability_config, + compilation_config=compilation_config, + kv_transfer_config=self.kv_transfer_config, + kv_events_config=self.kv_events_config, + ec_transfer_config=self.ec_transfer_config, + additional_config=self.additional_config, + ) + + return config + + def _check_feature_supported(self, model_config: ModelConfig): + """Raise an error if the feature is not supported.""" + if self.logits_processor_pattern != EngineArgs.logits_processor_pattern: + _raise_unsupported_error(feature_name="--logits-processor-pattern") + + # No Concurrent Partial 
Prefills so far. + if ( + self.max_num_partial_prefills != SchedulerConfig.max_num_partial_prefills + or self.max_long_partial_prefills + != SchedulerConfig.max_long_partial_prefills + ): + _raise_unsupported_error(feature_name="Concurrent Partial Prefill") + + # N-gram, Medusa, and Eagle are supported for speculative decoding. + if self.speculative_config is not None: + # speculative_config could still be a dict at this point + if isinstance(self.speculative_config, dict): + method = self.speculative_config.get("method", None) + else: + method = self.speculative_config.method + + if method == "draft_model": + raise NotImplementedError( + "Draft model speculative decoding is not supported yet. " + "Please consider using other speculative decoding methods " + "such as ngram, medusa, eagle, or mtp." + ) + + if self.pipeline_parallel_size > 1: + supports_pp = getattr( + self.distributed_executor_backend, "supports_pp", False + ) + if not supports_pp and self.distributed_executor_backend not in ( + ParallelConfig.distributed_executor_backend, + "ray", + "mp", + "external_launcher", + ): + name = ( + "Pipeline Parallelism without Ray distributed " + "executor or multiprocessing executor or external " + "launcher" + ) + _raise_unsupported_error(feature_name=name) + + @classmethod + def get_chunked_prefill_prefix_caching_defaults( + cls, + model_config: ModelConfig, + ) -> tuple[bool, bool]: + if model_config.runner_type != "pooling": + default_chunked_prefill = True + + # Disable prefix caching default for hybrid models + # since the feature is still experimental. 
+ default_prefix_caching = not model_config.is_hybrid + else: + assert model_config.pooler_config is not None + + pooling_type = model_config.pooler_config.pooling_type + incremental_prefill_supported = ( + pooling_type is not None + and pooling_type.lower() == "last" + and getattr(model_config.hf_config, "is_causal", True) + ) + + default_chunked_prefill = incremental_prefill_supported + default_prefix_caching = incremental_prefill_supported + + return default_chunked_prefill, default_prefix_caching + + @classmethod + def get_batch_defaults( + cls, + world_size: int, + ) -> tuple[dict[UsageContext | None, int], dict[UsageContext | None, int]]: + from vllm.usage.usage_lib import UsageContext + + default_max_num_batched_tokens: dict[UsageContext | None, int] + default_max_num_seqs: dict[UsageContext | None, int] + + # When no user override, set the default values based on the usage + # context. + # Use different default values for different hardware. + + # Try to query the device name on the current platform. If it fails, + # it may be because the platform that imports vLLM is not the same + # as the platform that vLLM is running on (e.g. the case of scaling + # vLLM with Ray) and has no GPUs. In this case we use the default + # values for non-H100/H200 GPUs. + try: + device_memory = current_platform.get_device_total_memory() + device_name = current_platform.get_device_name().lower() + except Exception: + # This is only used to set default_max_num_batched_tokens + device_memory = 0 + + # NOTE(Kuntai): Setting large `max_num_batched_tokens` for A100 reduces + # throughput, see PR #17885 for more details. + # So here we do an extra device name check to prevent such regression. + if device_memory >= 70 * GiB_bytes and "a100" not in device_name: + # For GPUs like H100 and MI300x, use larger default values. 
+ default_max_num_batched_tokens = { + UsageContext.LLM_CLASS: 16384, + UsageContext.OPENAI_API_SERVER: 8192, + } + default_max_num_seqs = { + UsageContext.LLM_CLASS: 1024, + UsageContext.OPENAI_API_SERVER: 1024, + } + else: + # TODO(woosuk): Tune the default values for other hardware. + default_max_num_batched_tokens = { + UsageContext.LLM_CLASS: 8192, + UsageContext.OPENAI_API_SERVER: 2048, + } + default_max_num_seqs = { + UsageContext.LLM_CLASS: 256, + UsageContext.OPENAI_API_SERVER: 256, + } + + # tpu specific default values. + if current_platform.is_tpu(): + chip_name = current_platform.get_device_name() + + if chip_name == "V6E": + default_max_num_batched_tokens = { + UsageContext.LLM_CLASS: 2048, + UsageContext.OPENAI_API_SERVER: 1024, + } + elif chip_name == "V5E": + default_max_num_batched_tokens = { + UsageContext.LLM_CLASS: 1024, + UsageContext.OPENAI_API_SERVER: 512, + } + elif chip_name == "V5P": + default_max_num_batched_tokens = { + UsageContext.LLM_CLASS: 512, + UsageContext.OPENAI_API_SERVER: 256, + } + + # cpu specific default values. 
+ if current_platform.is_cpu(): + default_max_num_batched_tokens = { + UsageContext.LLM_CLASS: 4096 * world_size, + UsageContext.OPENAI_API_SERVER: 2048 * world_size, + } + default_max_num_seqs = { + UsageContext.LLM_CLASS: 256 * world_size, + UsageContext.OPENAI_API_SERVER: 128 * world_size, + } + + return default_max_num_batched_tokens, default_max_num_seqs + + def _set_default_args( + self, usage_context: UsageContext, model_config: ModelConfig + ) -> None: + """Set Default Arguments for V1 Engine.""" + ( + default_chunked_prefill, + default_prefix_caching, + ) = self.get_chunked_prefill_prefix_caching_defaults(model_config) + + if self.enable_chunked_prefill is None: + self.enable_chunked_prefill = default_chunked_prefill + + logger.debug( + "%s chunked prefill by default", + "Enabling" if default_chunked_prefill else "Disabling", + ) + elif ( + model_config.runner_type == "pooling" + and self.enable_chunked_prefill + and not default_chunked_prefill + ): + logger.warning( + "This model does not officially support chunked prefill. " + "Enabling this manually may cause the engine to crash " + "or produce incorrect outputs.", + ) + + if self.enable_prefix_caching is None: + self.enable_prefix_caching = default_prefix_caching + + logger.debug( + "%s prefix caching by default", + "Enabling" if default_prefix_caching else "Disabling", + ) + elif ( + model_config.runner_type == "pooling" + and self.enable_prefix_caching + and not default_prefix_caching + ): + logger.warning( + "This model does not officially support prefix caching. 
" + "Enabling this manually may cause the engine to crash " + "or produce incorrect outputs.", + ) + + world_size = self.pipeline_parallel_size * self.tensor_parallel_size + ( + default_max_num_batched_tokens, + default_max_num_seqs, + ) = self.get_batch_defaults(world_size) + + orig_max_num_batched_tokens = self.max_num_batched_tokens + orig_max_num_seqs = self.max_num_seqs + + if self.max_num_batched_tokens is None: + self.max_num_batched_tokens = default_max_num_batched_tokens.get( + usage_context, + SchedulerConfig.DEFAULT_MAX_NUM_BATCHED_TOKENS, + ) + + if self.max_num_seqs is None: + self.max_num_seqs = default_max_num_seqs.get( + usage_context, + SchedulerConfig.DEFAULT_MAX_NUM_SEQS, + ) + + if orig_max_num_batched_tokens is None: + if not self.enable_chunked_prefill: + # If max_model_len is too short, use the default for higher throughput. + self.max_num_batched_tokens = max( + model_config.max_model_len, + self.max_num_batched_tokens, + ) + + # When using default settings, + # Ensure max_num_batched_tokens does not exceed model limit. + # Some models (e.g., Whisper) have embeddings tied to max length. 
+ self.max_num_batched_tokens = min( + self.max_num_seqs * model_config.max_model_len, + self.max_num_batched_tokens, + ) + + logger.debug( + "Defaulting max_num_batched_tokens to %d for %s usage context.", + self.max_num_batched_tokens, + usage_context.value if usage_context else None, + ) + + if orig_max_num_seqs is None: + assert self.max_num_batched_tokens is not None # For type checking + self.max_num_seqs = min(self.max_num_seqs, self.max_num_batched_tokens) + + logger.debug( + "Defaulting max_num_seqs to %d for %s usage context.", + self.max_num_seqs, + usage_context.value if usage_context else None, + ) + + +@dataclass +class AsyncEngineArgs(EngineArgs): + """Arguments for asynchronous vLLM engine.""" + + enable_log_requests: bool = False + + @property + @deprecated( + "`disable_log_requests` is deprecated and has been replaced with " + "`enable_log_requests`. This will be removed in v0.12.0. Please use " + "`enable_log_requests` instead." + ) + def disable_log_requests(self) -> bool: + return not self.enable_log_requests + + @disable_log_requests.setter + @deprecated( + "`disable_log_requests` is deprecated and has been replaced with " + "`enable_log_requests`. This will be removed in v0.12.0. Please use " + "`enable_log_requests` instead." + ) + def disable_log_requests(self, value: bool): + self.enable_log_requests = not value + + @staticmethod + def add_cli_args( + parser: FlexibleArgumentParser, async_args_only: bool = False + ) -> FlexibleArgumentParser: + # Initialize plugin to update the parser, for example, The plugin may + # add a new kind of quantization method to --quantization argument or + # a new device to --device argument. 
+ load_general_plugins() + if not async_args_only: + parser = EngineArgs.add_cli_args(parser) + parser.add_argument( + "--enable-log-requests", + action=argparse.BooleanOptionalAction, + default=AsyncEngineArgs.enable_log_requests, + help="Enable logging requests.", + ) + parser.add_argument( + "--disable-log-requests", + action=argparse.BooleanOptionalAction, + default=not AsyncEngineArgs.enable_log_requests, + help="[DEPRECATED] Disable logging requests.", + deprecated=True, + ) + current_platform.pre_register_and_update(parser) + return parser + + +def _raise_unsupported_error(feature_name: str): + msg = ( + f"{feature_name} is not supported. We recommend to " + f"remove {feature_name} from your config." + ) + raise NotImplementedError(msg) + + +def human_readable_int(value): + """Parse human-readable integers like '1k', '2M', etc. + Including decimal values with decimal multipliers. + + Examples: + - '1k' -> 1,000 + - '1K' -> 1,024 + - '25.6k' -> 25,600 + """ + value = value.strip() + match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", value) + if match: + decimal_multiplier = { + "k": 10**3, + "m": 10**6, + "g": 10**9, + } + binary_multiplier = { + "K": 2**10, + "M": 2**20, + "G": 2**30, + } + + number, suffix = match.groups() + if suffix in decimal_multiplier: + mult = decimal_multiplier[suffix] + return int(float(number) * mult) + elif suffix in binary_multiplier: + mult = binary_multiplier[suffix] + # Do not allow decimals with binary multipliers + try: + return int(number) * mult + except ValueError as e: + raise argparse.ArgumentTypeError( + "Decimals are not allowed " + f"with binary suffixes like {suffix}. Did you mean to use " + f"{number}{suffix.lower()} instead?" + ) from e + + # Regular plain number. 
+ return int(value) diff --git a/engine/async_llm_engine.py b/engine/async_llm_engine.py new file mode 100644 index 0000000..ede0277 --- /dev/null +++ b/engine/async_llm_engine.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm.v1.engine.async_llm import AsyncLLM + +AsyncLLMEngine = AsyncLLM # type: ignore diff --git a/engine/llm_engine.py b/engine/llm_engine.py new file mode 100644 index 0000000..a0fe38e --- /dev/null +++ b/engine/llm_engine.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine + +LLMEngine = V1LLMEngine # type: ignore diff --git a/engine/protocol.py b/engine/protocol.py new file mode 100644 index 0000000..462d2c4 --- /dev/null +++ b/engine/protocol.py @@ -0,0 +1,170 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import enum +from abc import ABC, abstractmethod +from collections.abc import AsyncGenerator, Iterable, Mapping +from typing import Any + +from vllm.config import ModelConfig, VllmConfig +from vllm.inputs.data import PromptType +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.outputs import PoolingRequestOutput, RequestOutput +from vllm.plugins.io_processors import IOProcessor +from vllm.pooling_params import PoolingParams +from vllm.sampling_params import SamplingParams +from vllm.tasks import SupportedTask +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine.processor import Processor + +logger = init_logger(__name__) + + +class Device(enum.Enum): + GPU = enum.auto() + CPU = enum.auto() + + +class EngineClient(ABC): + """Protocol class for Clients to Engine""" + + vllm_config: VllmConfig + model_config: ModelConfig + processor: 
Processor + io_processor: IOProcessor | None + + @property + @abstractmethod + def is_running(self) -> bool: ... + + @property + @abstractmethod + def is_stopped(self) -> bool: ... + + @property + @abstractmethod + def errored(self) -> bool: ... + + @property + @abstractmethod + def dead_error(self) -> BaseException: ... + + @abstractmethod + def generate( + self, + prompt: EngineCoreRequest | PromptType, + sampling_params: SamplingParams, + request_id: str, + *, + prompt_text: str | None = None, + lora_request: LoRARequest | None = None, + tokenization_kwargs: dict[str, Any] | None = None, + trace_headers: Mapping[str, str] | None = None, + priority: int = 0, + data_parallel_rank: int | None = None, + ) -> AsyncGenerator[RequestOutput, None]: + """Generate outputs for a request.""" + ... + + @abstractmethod + def encode( + self, + prompt: PromptType, + pooling_params: PoolingParams, + request_id: str, + lora_request: LoRARequest | None = None, + trace_headers: Mapping[str, str] | None = None, + priority: int = 0, + truncate_prompt_tokens: int | None = None, + tokenization_kwargs: dict[str, Any] | None = None, + ) -> AsyncGenerator[PoolingRequestOutput, None]: + """Generate outputs for a request from a pooling model.""" + ... + + @abstractmethod + async def abort(self, request_id: str | Iterable[str]) -> None: + """Abort a request. + + Args: + request_id: The unique id of the request, + or an iterable of such ids. + """ + ... + + @abstractmethod + async def get_tokenizer(self) -> AnyTokenizer: + """Get the tokenizer""" + ... + + @abstractmethod + async def is_tracing_enabled(self) -> bool: ... + + @abstractmethod + async def do_log_stats(self) -> None: ... + + @abstractmethod + async def check_health(self) -> None: + """Raise if unhealthy""" + ... + + @abstractmethod + async def start_profile(self) -> None: + """Start profiling the engine""" + ... + + @abstractmethod + async def stop_profile(self) -> None: + """Stop profiling the engine""" + ... 
+ + @abstractmethod + async def reset_mm_cache(self) -> None: + """Reset the multi-modal cache""" + ... + + @abstractmethod + async def reset_prefix_cache(self) -> None: + """Reset the prefix cache""" + ... + + @abstractmethod + async def sleep(self, level: int = 1) -> None: + """Sleep the engine""" + ... + + @abstractmethod + async def wake_up(self, tags: list[str] | None = None) -> None: + """Wake up the engine""" + ... + + @abstractmethod + async def is_sleeping(self) -> bool: + """Check whether the engine is sleeping""" + ... + + @abstractmethod + async def add_lora(self, lora_request: LoRARequest) -> bool: + """Load a new LoRA adapter into the engine for future requests.""" + ... + + async def scale_elastic_ep( + self, new_data_parallel_size: int, drain_timeout: int = 300 + ) -> None: + """Scale the engine""" + raise NotImplementedError + + async def collective_rpc( + self, + method: str, + timeout: float | None = None, + args: tuple = (), + kwargs: dict | None = None, + ): + """Perform a collective RPC call to the given path.""" + raise NotImplementedError + + async def get_supported_tasks(self) -> tuple[SupportedTask, ...]: + """Get supported tasks""" + raise NotImplementedError diff --git a/entrypoints/__init__.py b/entrypoints/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/entrypoints/__pycache__/__init__.cpython-312.pyc b/entrypoints/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..96c90f0ca1c5a307eeb7ceb0b06d1d3b6f89c57d GIT binary patch literal 161 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJVIp~+<7U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?zn4{1AOZ>&089rJ~K zv6^rV!O?PUtS($f;3~O3<`4S`?35c~jp0TDyX2-=ARHjDTW*fEgj)#gky~SJ;Wh$$ zPXy9it>cgJ>zcN4fy?uqq=gA}5X)F#${ zL>Zh2_d(nJk;xX}1(6o^Xww$E7GATUaxMC;v0{Vdo2A9Zo2)fH9I6=0m&aI;*i;%D zkUV5OD>he*-z&9BJZA@A(LH9)CZ$>wYmL4v2g2@B53C z`=z76ucH#PaNU@*vMkB?#UHWeAlco4A3=E#D2FQf2RraDELW~UTIDxyUbalC5CHU12454R|BiWEu2 
zVlWJ-@GcT5g*%^=;v;7`=t#oWW?*gO1R>WIi4zp~$!dbEIG4aE)hItz-&~O_Hb|jL6uyQc_9vmeEpg%#|EUhrYt3422CAEz>-3AKuAV;bFJjk z>u?}dlLRs3`V*GJWRTLGlR`{VlR`v-Mm6EoX(=uN3lj=pn34EWkk%dJf~o=Mb;m{N zrzuI*f{gBYdFuS+MHmY#m-MO?obSsh?EWnM3|5qTn4~6cp5EKQUD%*mVdYb?D$(? z*L(CT^K+XGtxy_f9{JK#8Ud?lqqUuK3Jsxmo8f0SSSWPEGNuko#O7p)SS>bUZ5~@F zQQ?rw#ld%}jN#PzL`oLH8E^uJwVO-KEJzWJ!XyaWWWwFGCN%uT8(SyMUKGk79vOoC9F zYZE%vGypD%EkM#hX}R7h30%VWP2+XbL25!&R)fZT*#5yj7I~8+VCd|EnuzO8-jZ&R zR(1m0b&pBHi_wUtVo7kEvIF;2S)@ugG(+%H2cRI@Q>;TR9Urv4-}amK_4=O7#71q) z+OY>$9@ZXTcOTEP$BPW|_dYGSkAJ5IV9cBTLFzqfFS`o$N7R1yzHN~DM{1B#x}a0( zfFfe*0DuVD;74?<1+Z^<{y!}fVLmbjtXdYBZlkL8G+Y+JkknDTuLG98UDMXFqxoU}8sujn(9p!1fQgO_%zy zh_`*G(65{kdJg3JH?;_@IDtRTg_dnZ(8Qs~9s1pR7R7B5ngaS2=OS8htvJQ1v@1%j zxHYVd4ZfWF@G> zLgh#kUZK*Ic~rDesitN>9F?v(Dh*DJUNFPN%IV5SU13(>iIisGc&r8ixA84hSx~u7 zv(}LzGzav8yWD3%px&kle9XQ_tx^5eo0y6vRoxB&6uzA7dbQ;$3|eEL-Dv0@JTGs~qBC%1 z>a~hQT%F}}5{in20j`W&#ez%{&M(ozI8e`eEGrj+W6h+ ze|YV8ujSkJW`{4|n#i_J1O3{uY+diI(T)11)gyO~tVMo3eD_kWKDb`DFITtkK~KK! z$V1POq8nEF?i)L5yFxv7pzed;s(WE4H<*#*)X&+Gqx3JRkwN=A3_!o!3Am3NM-JLQ zWA*{~*&qej&ki~Oj&6ax6zm5KZnpvy^yy48kyNnpD%fr@-*H?^oUek_Nw>``X_Bg7 zzoI(?9QsBRdiAIgM@izCq9ha&K@iC(d+=05oLI4f-mbn*C)bRsyDT9` zMP(3EcxIzQKVU0kSGx4>;naA4Zb~5WB`7e>pJlCk-Ki)-H~&5e1j-kaKqPhcl4pJ z_sgnYAh89%gE>5bth+PIc0x(E&*i)voTOrpg`v^)p|f8Ooz2%zT?@?vI%|e zU{5yCpS!7(_4dy_42FFy#@ACAZlEySNa&k(o;=Kaevrmv4^bFC+zs#-?Tw=c*e?c~ z0M=OuE7k7>ci6iz@Y^~B#p)0EL#^Zk%h`!CZVIB>Nr*R`N+O|wQbGL82uTS0mE$m{ zianJXUpQw{QCZ}nQ#lD(0eAbM$ROZtL-Xq7oynV{H_vUjt1~YvZP1~+AX?`Qv-n=T zfUV;O{I>oDitWL88d>gfg)ZOz&vVcfluVj>0~~q0>o2%xrhLmUlx}(0g!$d4tjI%f zXw?=Lu7}bcZQ&5&OHoCdHt#nY$8lcr+Jy&#@$hjGW-0g{(7m$|o$?Vm%410?cmsM3 z!5K_4&~X0(>)?&YPmNui9=oU<#0VRHcu2a}q=ZS}85CrRl$24RP)0DByJ9BtNWkW8 z#)ss^hQ&7ll=JXYu?&sZMHO=UZ=ZT+AT#=;3f1pguN%nK4Lo4;bq6wMvFS8s-5rk^ zZC&^7$$9q_JWU^_*LzOndQLoH=xSHdfo$$4)o5pTX7sHyZ=89^wid7|d`m8o zH%B@~I^mg^x)zNjlsJENktlOXHDM!|-|xV0>q}5NQ!Aipd<$63w|yd|m9A||ZV)S# zsGd6pqRtB1(q(SNa4^0sSClU41H5&NIL>+%RcMkKZ(@&1zBB`|A~wd723b82ccSO% 
z5@4Sb`vDf)*S1e9MRyC&iQ`~Q%eDc~1S~elZ69FqJ9FCySZ*5%nj-UCSDMDr_cNSe z)9iK}`4zav4)k%z_3uC-GS*^WnS_05S#HGwt%+H$_8Z{5|0Z?*Eh{ivm@P~|Q% z-d6k9-r7Rz&h^&)xz_y!Z|$#Mefw3bs%u;g-3e{r-ebAeV+DU;-QSh-cNH3%*Bf@{ z8g`d~#+LQQ-dtnv<0hE07(njk$ISr!8F?G*nRCT<57@@Fm92` zE#rhMjwquFBwKq`NMER$2>B}xSb_7ODOu`KgUjyzejG3K<9K0t*vv_Cr4Yi{C|oso zx*2Ic3Q1T25+jhjnSo@f2$U(38HU&3^(8zTSX{qh$~IA&NzEz4u=C)E?%;Vb5#f0S zM>xhihk>o=*jqpZ3>iawG3dSs~KgHQjqVK6yiwq#%KDC`SC@fyGf_vniQOX(ES9n?L|PIBxV;0MM6eE z0zy=b(1MIICMGWzV^;)$&qUa29t@`+^``QGkh%nUYe?dfV`E>adq}V8$Q7KZQ1Cx! z%B#4*&dZ2j7|4i{v?g}m@I>r}9-YNWEaT0ALrTN1Vh@ImH4a%co`;M95wq?QLCM&Q;9d8)BUJE;?Er;7-xdsW}y z3X?VwJor=^X4ysUxZ71kP(8Rps!s#o;1(t{x{EJUEakg>rih@r+p~o$$P@~zw&nGz1{HeVqJ!xU#q&SJ~)+Sn;x=d@@LoFO`tlK<_>E6JwT<61GqkSX1wv$_~8(8BSZ^3GmPkN3gk#l?zqv1z&5S zrlH_HRdf;p8xVLZ2$~Dtny+f=Z;!5?yK^pIv#a1e{{xgTNll@~U+_*omtf6%Z}gKU zvg?t8H+YL*_Xcy`;Jw{>?>_PhU3a(T+--Sxd*;;FtS@sK-_dtmYlHb3?%&vs;xJ-c Hh_e14^%~;K literal 0 HcmV?d00001 diff --git a/entrypoints/__pycache__/chat_utils.cpython-312.pyc b/entrypoints/__pycache__/chat_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c44691afaea6ddfa1a9d4764146f28dbc3779508 GIT binary patch literal 63079 zcmeFa33y!Bbtd=f6yf^iUsJz{z327k;Wsa0j#~OHQER`|MDgZ`Eo$$#M;-l+sI%Y6 z!YmP2)ZOondip)l{Qi6vW{ng?z5U*(uiqCf>@Q?twn$O5xWAbB?Gb;pq`xFu+Fu$i z>o1Fz_m@X2`YWQ9{go`<6shX3V)dw2eruH9b;@t8@*7Zo>!S4oc_uE}(BH^Pa75Ne zH}r3aHuX0}oBNxiE&VOgjr|*=oBB6JTl-sC8Yj}U^|v#>3;xahTbSP+*&5x}zm54l zk?qlr{to8Pk31IL(Z7@V3nIIsyZd)Dzc;ccy0?FCbYK6z=>GowEX)@<5Ixv`F#34^ zc?}~Q!ceD4R$l>Ub{v*s^961_2)_;uo{gIyN@&4n?UlKVHJ=uSf z`AZ|GqP_jS(bN5>qi6chu&}boXQF5O&qmMnpJVUkk-q2?{ZBA|MdZooXZt_P{FRZ< zMW5<_D%#)Q&)%ydPe-5We}?(1BcG3cq5lgeE{_wA3&FQcGKBO$%h&Mh_*y>jnz=ue z#~tPPx|cYpR$ms!kC#O%t7 zi3sh&z^I+$gl6UrGPi}f7nvJoF3;Rc%)QLqD03s^_O}V;!VtfEADYF!voAsq( zVT&-#^6X@JHnTkYv^@Kf=Rj7TD?%B6umYv_KL_9A%qQ}lLe-_em3z?WUi$aOH`&i0 z5~6&UFfwT4yI->OU;V^+4=ec{{1N28=H&R_{#0d{sLiY<4*uv%*8c1Ku}{{DK?~o5 z8r=9K`3Cd&*1Q9(T 
zVk|!MlI1O>i2g6KIJtk$3MEJp<>j(${5h2MC6skjctMC_6!QoIw0QQWmGdITrcXF1 zJkEOh3E@~qNl&7rFB{^ib@DeqE4+mAqe6*XI(=-=!Yp5vbuovZsI z3?rWfc7+6sP%3>n!#>Vh4n;3oo-cl{)=0i{lX|AkHnvW~BHPmSPYoE(jKl zs=dNOFCsK7yrP9pvCvBhy=(}*g*F!pjFu>)p2tVfo^S9CeDpO_b}T<6jAV@K@JqJ- zZxXZ#-&A3(|67lk_X_gp?=S4t6l-fvN!T7{bmtR{mO!ag~8BBB+iEi;*ymYo*NMm?d-f33dcj| zBZB1U3PmEauLl{(hXme%(7;834_*<6uLxrNhUAQhBS97@*{_7ecsLYkF-ev)q4>oX z8)`9hL$V(gh6FJ@AlXlbsicVH?7MMA=!}FzG09HeCqtrS83@JVEq1A)>tZP0H5|PX z5#r(Dq0U$=9E*pB;-`dIEObFQ6B0ww>_lC|Xwy&}aq*t0mZ&+iBx9nXDH^<#+XHEkxR~vqJ33@K8)(p?VJ?%C2OD*UBOY2o8l}!l_|ih)6Dcc`$sTn}Ed4 z28|D0!5AV{@flI*6GP!4f$tl>EDVK50aax=0*_!^rGhi!@Bl_|SUeTJ9v%XKI?e*- z!lJ;-(7@9-Pz#GNbTx)wH>-gBd3p^Gh2z1<@P!NbxJfEHH*)35uo#b>0tknZ6i|mM zoqFPAU(c!2-JK_cXAd9kIoEggDXB>(Ouza6<1f-zk7Ma6${YgT{)XA$~X-92p7oD99aC8XLTPZPdN5p`qtg=h4Fr z4F}ju=M&vMr`4Ayd%6$P3p&FZCQOj5Ap&#BdNs@o!vjVlt8i!B>NJt>(-#7OTQo8$-vheA;y7?fPWV04%tiICqD3_dp! zipViyGqOu&G*I#egQ20J;dqFUIu;Dx;Z~_KKRvWL9=)>pO881!JS3v4+OCi6+`a4a zPGNH-e17wl8=zZ5o3BP9(ak_`@y3e)Dnj`2@YaA7cmQ6d7 z-nx{fZrN2dwR3uJvbZVfYED_2)0QSVGv&)@w<5pbRzHWB>~^EAW}W|1I0Y-uTI}Qm zn_%bJ+DYjNGxY?*&00ze>l;&xRcgtkR|bm4$j;&Dd4Z2H;?^s980ZEC7AE$hP|3n@ z!I9ll-7Jm==@4005xnK>ug_2fIrJB!UPgZvl}uZv&&@PUf9}l*PZ2XI6oVO3dg|GXm&&M8tZMFcXL+ z9z>pA@o~7>qx~d>krN_M4>=4JC@A-*e<4uJYAR1*PDW1%irM(A1;ni4YXUKWoJh_^ z@|-4zHH(6BH;Zt@2&BBJAH?dC1HoC3pnxgZMH_E^+oCay_8V3(CXO>bCmG<4Hn;+K zIs*u3XCbm17(Raq#J-)yN=_LfuY@CgARlER@q8FWTf7k@42}gs`G>Cwd~ZuOfJ*jY zkRKih2E{tmPlnM3_#_VqrVt4Zu<12>xSyu9|Idirff<1_p#H@mL^49Ufra+U{bV zeE52ZO4!3*o@XwNOug3i^?L$rfWQ(8@#ZG*p&~}$d}J7GMYO4<9itOuiCNxp=Jtxu zp!MQ8IQP?8T8XfJmIsE2N%l}Mgd^uV{$llTXeewL3LobypKhG|Qp&O}Ct09OKf`!f za*W5O2aLx#*8Qx1 zOiybd6u~4ta^Yei9H(@cV1kge3YyV|&MzJu|SeWh2VSPuBEF10#|%i9c;v?HJY~&R0CuH#wTJROB31 z%9MLp2{;~xW?-c((!+;QXBS3qf9he>_>6}PftGHqAz%sGqgS?P41stG?G*)bvO4zy zg^}|c_=^?8G2jgA+yzU8OtdIf?%sSPqV?&JM4!g~9(q@sC)es-mH@rmk=eUXp=TIq z=j`Gz#gg-F{Auv3kfjI0FJ;Qz<1~u*!>?dv%pIl&g}JlSe5&{@g2TW7+IV^Dp!u~< zCRLyvLznf%yw3H7qNB;q0E-Uj@$j;Re1MTKdDhS;44FE`vD6M=eG~`<&ST*$goYjh 
z-*Ku#c<2HRKu>^8Msh$Pbbfdw&St65Q0$r@*0<`j&H$zrfk&P`aYQhL(}YhYX{8#- zlH^cBC7YaIVUCA*UK0gk(_z_#kRsBgmQpd`8jXdRBSSuvQ(ul zwaXsgmrtfGEi(7NCt189>1s-WZM9U(=*9}FM*iFo@FHR!0RpUoU2qUoFa|=01}h+g zK>(HjR^F<_+7wuE3NFf{h1->IJ*9&}z@0gzK(2|-DB>91a(IX&QnA1QV^IPkAOW+~ z5F1TmSd37GS*l&KP!h=wj*CP^l7&{NSZc)4>5yMVz&h}fOM8n+PEiQOhKB$+E$&Ae zz6dSksdGirIuI+#$)b!`D2$x{3xBa^;e3Vrz_N$NYx^XY9%X6ErsaI!R7awGN3wWl zGJj{vvNLVjueZ)0SIUgH)$2JC}XsiHf~R-@f^H($|%;bfqoFmc2#l z1ZLSSk30J=@sSzhZbbk+tG@@wIPTd30F0+fX|d$P=8r{Q6lyS_Fp-9IRt*x=oET?i z#2tc@L4wL~st}^5RAEC8kzpebuu%*eK|CfCBq%T;K8F_ciXvPwPEL%R5pu4=kqWi3 zB#xWLzr~e3rkU6%UZc!?TFD5J7 zllko_OMBX~ewo&v+DK|k%d}F;Dp>Ru{IEH9(91b8KOApC@H6#)WNH$T3kBCFiQqUg@yk`8Zz6|F!7dNROJq32AHf{s z3dh1jBpV(;D=vkHE`_#9R%%>Kvh-5Z%meZ33>47*FQQ2COK@V;fIvVzh99XQ7>)(e z6G56~Vli<7fi-AtYzI6yxg~qyRO4&5*FCR#7Ry_bLjcIT3E7vElPw#-F<*s+n z4FB!$8{xT~$qjoFjy=n69~%wHd|9|LKwYRlT9>|w;(u;k8j3sB53JR7YWJgqdrTQa zc?p_N+0R3USJj#_WHXds!3Ho#sh50)42pLwkB0qRMvWfw^`iSrQNJ!WeWIR_LCt@E zd1|e&my*|e1eE{FNF%;P&R5{jUPc_77*Pf(45%bC(x+X1jvO{p*WHb?6f(QA&&e14~;O2E7D>luiZ0Zy*-xTU_j$@MHmDYqGp`!QS?vr*gXE_1&-T zp1F|nw13RyQP8eecfI-Kx1V|AnZ@-xlk0b;e7in2BXZg2zwNl?xLt6oV5TAI zTfbtrI9>NVoTEsGZ>8uZZ_PCSdid4wY+1^?^{)5G9REk*cf#*(PwqI9a2!z~JOj9C zCSmlV6v*{+1Ma3Z=mj@o0x*{WnBDj;dvd27GmYh`n~0wX&~kvV254o!u&+J8CdqhIrqbeJF(^cY zD3F2?Wy#VfU^@Fc$`xrolC0+<;Q>J;G?CHp7JL~vCH@-4kpuNbF7~hBxyjvgWq|vp zWcj8Adn=Li8Jh5a8T>(ePLl*ifg%Lv}`9y&>FnMBxh$W0k-mDPG6=LNFX)D(pxfqko;>B2u z>V;{H9(gLO4bldGknvzhOI2drm5R~g?;-~@eYx28;Q9C5k8g6zu7WB1MB80U88D%| zX8OYPQ>n71gvI}Xt1?}=aj|k+vT|FhvSY%#T)FP`wpZKUwtdHQ$FtbHH`%=RFDv&= z98VWjFBUZ4}q4k+0*ZP_brvx%{07u_1j}_j4d|pN;d6UDBGRz z?M{^KPL~E|8ou59MsuQc<6Ymz6&vR-yS?kyuIVqOirdrv#zp_8q<_N(~+*F|s zcqk}%n1k=7*7%NH!b98!QEFpuYro31e|0wX6m0M(>D5(7s^Fetu8j?UJZj|_uZBj6Q(i(+!1 zj)F3hDJWx2p<1MlHjc>f@D+w5PUfTXBscTQczHj@1ae@ERxo2Uzk~WfDV>Y`9y|d)OeKGivO?Q8Z#{kau0XD$7RLpXLYODb-kx^?Q!4c~5k zqjj-G;X28z25jjCg<- zeTR0NzTr8v-S%6ZZsh<_$$q zL--LD{{Y#BvykA>yB!Pl$AALmK!IaA3LGO8C;fkjmv%D@StY&PNB~iIG1tXGsdY{GHx0(<&?<~ 
zmSsJ9u5&=ZxI>L@P)T{l0)U84o_Xv1Mq92NuD zE(${d?A~EV7E-9dagxmh)Yd}BFaP48tUSe3c%(8&oj?|l{~A}zf`edvEo3o;=ufPs^Zi{LE2lh=xs@QTNb^Wlitm9`6=&#gyX=E zm)w3V8<5^nu;?gHIv|fdu;4iOfwy?Yir}BdXnuTg^EU1e)^!$}zh7j6`*#6^L65*5 z4PB2xZ%(o+%9t7;PzMvkZ2$xv4D`Vx_TXmTnCTpWi9%dBE>1bwg9%v~D)raI-7sOm zi7jCd2HebsF)srMaQ^HeKw#EO95W5%Jq1aQ1_+j3CTV0c$I)g3NYDrpKy;q#BmIDE zxL^YT`6P`EVZ4-oE_KL(bihoSbpTKD`v#}Q6gY1CG(mSO`j`L0*VM zk3*IF1_y`5XeciJIWkSskcZ&G+R(!l*3HzUe4B6fr0u?hy*%wLohVqbIIQJM4$ne< z>+Dmh{GAJqT`L9H_y6&|e9lwxF=w)tf8Z@sK%mS30$mG^Zj|t~0s?}vpT>xFx#i!$ z{hkxU|4yw236kydHHeR5+Q1_n8+4LvtUeBh)^Ilqf!WGHB1n65Onkwu4?nLB`irI; z=%rq0B4#Km%AXv40vZ$s{l%sL5H~Xj`i4}wroaGG-3y!-0>co!i(#GzexoM?QCWF0 z5LY!o+e!5pYJ8IGNs@%Iu~(2%{1>Qr1}4^sLTos4RR}7@WGYmOv~2$^8lx@n04*Jy zqaOId~>v(H+bq%+s0LtDNZ6-^Q3u9K%VC^yV&I$2lF_;u^y}XwLoh zw>3s$+-zXK&3aCi4HwrN*bFlOLurdxF=oD$gXz9*8n?XQ9YmUB4wnjLzB zrVfoM2PIh#s-Z2Lb1i{+5+r1IG2VI0+g6o1B5L&VPK}n>NSQrf)1M6vL}F55T*R(l zP!yuW(7PRg#Gg@u(XB^9m_&F$V}SMu0-e|j(e#c31nt@+#GrS5tnZFVa$xodhKB}+ zTk<6jt$%}9u96O~_}S*`H>4PR`Fj`Qi4dy$+U!n*e;NqS}R$;7?E+q;yS?Q zE#Q`}O`L@=X=L~=Aq;sK!W3047B(ddn`WyQTaG4MjwXD^7=c^xwTjNyI|Q zX`Dn4v>6p;DGv!dfqx1fG3Lpl#wi#;UuDlFZEr>m0i@@lT#dalgp-P*@;$;n+ zl0_jDeSU>h1VS-VE)WIixD7RtHlq*#+eYUf;13)-5;!(;zJ0W%Pe}tkxCa*q!5-nM zR7fbDF%=sao%fMKyi3k@O6w%XS=Jf-D+(k!0$n%|ui!9kgBB6p%PdTCF`~obRmwv= zGV5BI23d&OWs{q7b#hjDb(Mb?{|33!L}q>r56Fy9o|n_!>V%^jVl+qo&GDu3UGo+1 zo=la0W+FfBE>E}{74)~+t**4Ue9^lx>D@TH|6OasyD{aZoh6x-eq=gu&&PQ-rHgA9 zi(8V#EsMn+$>NSQh*a%k$=b*6S7q^CTbnv7%-=6J z!Ocy4h+)mx6U)>0YV;{}P@kNAah+`dxd6#AgXAb=X52CalN93?0|B$>Y=nxOV&0f# z$Yb28;`83tkQkXF8`30~%+-_1m*kZhd5EcH7XE%I!g_E7 zGC!GtP1&sfe?$%JAcQgOKmdI270|v;%fh;?3)_z;YECTJPcBt$#|)INY)TiEr7PCo zb6TriSlNN~uMQ*}<@W#rrHRrFZ<`m~tsj{=XT^v4wKMtijSKk)Fyc+LXX@!qPh$?`BIXVzi*( zok=tmsSVLgEVGCc!=j;SeXOZH@YL`KEhGX^-VQw5)<*f-RGQMJ9;)w77w3A~2K0G)G3O*DM42r{~l};p_ zmWL=vsj*zwp25KIh!`MU8Z;w3pd=%`b`%pH8W4;TVS)AblH} zN0-wyqa2*|>%bXpukz)m! 
zng}^g!Nb?i}xwAuj94*%5EyY`xy*164h>pO{cE0`>Ridz0* z6PU$((9NnJdL6{HaSew|340BL@BxL2VRO>Jp7r7d3$Q$B0uhK}X_dKKPZV&>@^+p^ z1FiaM7xXc2%nI730qU3)%mYLa1TdgV#ARazs&xtcL;*xYB!M*|fYLCiWu2TDc3Eirm>!YJGd>$0} z%z|~$GQka+c*{$kemcWtC51lTW;i(w6+MKy1UIp7h_f5wJjmgAiF-@=WPd*1ord=U zyt@qVUXVK0#?*x^;}R49Z;QJhj8xGadH-*wSYvbR_uN=e$l9fl%6Lueb2 z@@Oc)2yj58y zQ4j@&qa)D(PE`kh`q3-O(IW=GjL*xM3bdsSVfn9`jFZ767TLcHg02n_5&tnRs7wV| z%!8*0D%b*Hp|(iLC4z`+3l5&6fs(<2T!XNIa2XFu)#S(zDOCkh2o)ZoRIP?LIC)3h zC0TJ81$%2ag38VX2c>*EfvO#-!iiLk*B7a)WCr9K`BvdbMQmFk7Ce80@`b?2+&~ze zG0X_VVGT=6hz#n3AUT5&L11e;7>p4u13=wmd*324!=v?ULSMUDRhV|-V$UG)kD=P- z$e*->vsiFcF4we7Sn!*!Y?!dT?ESF5?d^f@cKm7HeBbw+?;K3DA4=AD{aJUazK2rP zE-BIGdH#E@lN4#Sm}eD;bhS8Yvq2)k-*PiM-A!y=)SH zf>d`*;(x_YjIzo^kafoBhP7L29El-nEdddB`n|=A-a2SaCLDG0WS{^JVmXdFlU40m zcy7aT4y>+mQPa4I>~RqQ=i1s(Wzg7$)>vkp8P)-s09NOQKqt(-BH;sM@cFd*QfeHD zL8~-ChmG6i5aGHo02V-0UTJX!@hA3|W#NtJM;T1TUu>h&%&e+w=t14JBo8D!hIC({M%JP@1)^$` z-r7a)mZW#foPTlaq2$&>DQ{Q8(Z$+M3L@$>Kw2PCcJfMNR;l3@{IMSCM$LgsYi!8q zGIlJ{DLs2~_sSTvBB0VR-aLx5SekP=m3XvCP$?2Wq^4$A;pL=|@;5HI^<1h?-y2*@ zYx@Ctjk|wbPN}4(sHc7p^)>1BB^G^DVw?}A&_E|o=ee?ZDOv@xCFTfLTUfNod=WeR zB_CQ~5;?69oDUliE)GX%9VVZDpaMo>LTeyAh+H?6gJSH1IH1fnk-`4aw9l4hSgMAw zwZ$t7#+V>p1>Xj_Unnp#6b4Hez#4BTP76=0^w{}k?c)sh1|f+c_1v}Lh;bFzH%oFiGjJLTUq|Cyx!z=U(z;gRKMi;mW$qjmNR^G_`} zjV@W!ehuu9dH z)QBG5a=9DBWGus~#zE;+$qPvhv$~>u*+I)<#=ofHHufzFw<+Q502QR_R6`24E8*-I zl_E=*7mu`L{AmX>!E+%XhOWtHy%|O^aAjg}20`q|H!WF|DN-y(D}K7|G@aom(YK=G z!|>)}2(paln@Gssv?tKSj%+L7-!!f-SRs98P~V42$iV#<%toFM2oJ^=?QPm!*qKCLBxEff@f| z-L7Qau2l8z1jNDaqC{y^(%m%Mn>cVf*?M}Zs5((|E>+Z*@boRaD-t#BNq2j?Zrj{} zWZj{JXWf#wC{esV<=rq_c-OmesWgzNdm>f(WWx7kIx`y&pda2HE_T&0oAIs~JP2TiiPg^I4E+$tU;O-u-=(alSfdlQNDt+O#(L z$Vm9l(8;R`&rzVQ0N8TwP99sRNL86db!j%e@ST=Qv6Eos5IJ4su=D+!$oCjIwEB`9 zm>`%{GI_m5lb+6O^2bmrsb4j#0oYElmC(u0-P0Y3z_w)7_K!KdPy#s^qBvW@N65QZ z%e0p=v9MiQ*e)%sK?`eOg;U;clTsjgnV9<{XNmQwX}PFo#Y!He^cGHaOomtN?A5{H zkZJeir4^P80zX&&M0~Px)y-nti>>8}bz9~-lIwP^a(G$U$l3fKQv>Nc{(IzOAj*1a 
z06nEW1f%qZ=pgtlzWF-_=Y7^exdM)n^(O{_5;@R%T!X@Z^M#&pBZ*0O2B z*{e?zVl_cM0k23WBH4P`K+7!#T7DlHcY+4Xvxrp4glids7GDM(k3eOMaErJmhCY&8 zq0ZomLZfv#N4DLNXP&$3 zZO)+IbzA4^l64&uJyREwo^?9?2JzpStlRr(!@cF$W4fP!Ymi%xre^FXY8s&G@(LD; z^j^!LquJz~5wy_;J+%RQ+A|1vf_j;D3L&+kGfydRYr@egBXJYp<9-V)n~_-DPM||n zQ#?YiN69%x4(!d#&L303#IU>s#mizVX3))ZjmerFL^vw$Z8TZigBdayB>zxghlXGJWGeit$%I2G4BQc;h_U5>dWPJTT+KY1fm5e+pu_lY2Qrhp<{e6ak{#+LTe- zk%9RyVsK%4kv0_AB2Kc02;Rs7V=N*FS0vj7=vqLCsMAb`ypG#K2(gtMwxZhupJc^8 z<8Vw~-LaF5#6c0Eq({ahLa@;dYXh~BQoc{9hRYh{VO!+%v?Gi3pDoG5Nc`zvmrymfNM|Gt02vh07e?AtYO)XYAY zs@wX$f7^#ckd7M+NT`K+$8%k;RQi5}umri+4RuD_$;e5R8wf_)+-je{g!d zQpasNYWgX&X7u7AN7ruC4_v0MowgsiJKOM*o5y$*N?n~bbdNx;o3pa4NwZgtG<(x* zH_me>99wmoE$$$2%OcFXD2$v1BVn!vVQzSA-jZzCx60w=-aeCcn_Lp51@+P&0#bTb zzlfNOEyxnK?P$ifqnU>l5p3upGme@(wjE z@6>!iZwP5@ z7)mH$b@ggB%&sf8G_|^Vm9G`Tgmvz^6&X?&vXYC}EZJ{JTf{a*iy>qy6Psx~Z=xk@ zFv7G0-ImM^MEyj8uGyzjXl(lt`^Fx1gU%}c2=8oBCU3vd&YR>98YFZOVGWBo zk%#SgCdOiGD5A3&`)Q32SQBbHYCk4GU5BihJ8Do6+tF$IGk@x_BeW+b?>RBzD!C06 z$=FuKj3zG+J_nCc3NOn-aS%FBti}9J70r6a*esTFp4Jrdary%Pun)6R zo6!R?+ROr*SGj%eSh8~eg!cn?DUQT1de$dB>t}|Oo*j#x{YlUM`OEKnjzho0Q^p8E zZW`E%vNE~VT%D%^lgh#7$)pjJeTiclwRz#vIbWP_F2VwB9f z8xp!vgt&o#gK^R%&dY$`PgF6=&Dxe(u_D9w6e-?l`-5QU{`6kzSK=I*1`+KsDcJYS z&s1T`ouk5DV+C-GY;@D3))tA&P}NnXs*q~vRXz1HRh4q*sH!sP%%b9>3_43#lQHOI z3Y{eUEb7w_ejfqRIn*+9@C%i`)ITxW_6WVHC$FQ*enx}Pr+%hhq}(|wsq`YVocJic z$PzYX^dhF6n<%_jrp5kj=^}{}#f{X8Zt~$mGQ}7eQz1D{uNI2Hjbsopl2b>oXUKPs z9GZ^AC&~9YayF5J>((;FgluLe=SMVw(P=b@34fU9Bzaq@8S=r|Dx7D+@4MUI^M8NO zD)Xc%y%XWQcRI`M!5XnfJXuO{xgP z)reymRvb`g{m5>$)-RVjeaF22ojuII?9BhzibP6Y_#ij@ z6!l*2JS=~#YW=JwS=B0ksAK1R<2#s*Q0&k4A%k#|#%2HmsXZhPpl7uMF&Uh#iNuq< z*`QcwAx=%OL0oAysH)ka`euXGOfyq^gB?Z1M|o{ftF!hynR=X4a6tXep>o13b?!Xi ze*4`lk6Z9uq9sSxv&Of;_x9&AMLe|**|h;*+h4#+_6k_c=d3ZQ==o4eq1KY@QozIZ z7czdd2>hsDd<URQ7*k6&W16MemU9a{=XScfMltji zl!$1@fISc(;4%bNrxe?H`D#qA&F;8Qi!D-p`Y&`>O}T7WAhc9ZrTTO&S1{HdHS`8| z2j@7wvVuu?SUH#p4&(FaBz;rkPQK1~Lpe_EWYVgT3Zta)$kZH^BP5a?+sf>i+EyeG 
zUnFNECG|gf^5m&t)@>O2SD;-tpk9S>$11b${Zx-<$ze5O+&qb_!0u!)^Z{M+mdWI( z>Tzut``BgV%w*=5%WI()m!GZ_S*$;ptUn0F+fA+s%P&7@x9n4D zPGL_gNS#4Ff1k{i0!JIy>IjUGrpLG!ov}(F#gtfH8J`@N&!8t9Ee~Q(spA<6&A>yt zDq_{6%OuCNl1d5<;n=ap`ba|>AZyMOa4@f=YpnpV^W;qj><-fk zLUIoZ@qvpnUo3Yb+lZqn7)jV#k;g|FTOiBiXjznXdWlWUM!9w-uo)P}gVe=`P~c7O zgFRhSmfPN2-bLT0q;Jz~&w_8qd;SUQa%tu5=&k4?_I*p6X6=A<3T6B8_NjE!y=7YNqwmxZ@Mm2Qp<1vh8E0n>C*#H&5E91Os@+>GUHB zyt5wO{3W~EB9ap*W{j-kI0>y;IC9A5f)b|mdf$BDlZzw`3gzyVzd$AB16@pMgM`vDmX;W>R8CB)BL;l! zrko6EoifsDt4*lJM%5X}bXhW8m2Aq_n0&*jDWb$khlVB)_Q>LH;^MkXjQkv*hE|N5kV{i1(w(!V$5-wzW=bQZDYuD1n>9BJPP*!wFkfW5!`0(Q!4 zD)MIcg1ZH5a81i%b$hb9eW7{_4i}ad{B`w~Tivj_^c(~rg+)^xw-4SrxLCX^S-fk$ zFjWkb%zM+-TUM-Ge(^^fl7n@wswMMfiRl|~pDf=$A4!#;nCMB%M$fcdJ@3X-l_w`oEW67R?%M3w?spqgl}8av>Qn~G z^pn*p)PX60eZOQwcZrSNijlE+)aH!y==4uQr9kx!=fu`B6-aY~5xT`N#svHengxKTG3JGAN9{jVr6lAmkPE-Ko;fwFvor0syUx zX5=pVAAVY;jj60@l}Qb@3|Ya-SlP*0nuh4h zS&z|r9jukwU}u{255yvTPqN8}14rEyw1+it^QdF*c^v)Q(a{QK?_Z)cw(!R`IAc*X z@d`{Q(gkcm$|-qdzg!x-KqFfzlgSK5TOI=4y0|5RrmQ}VuixZ8vOyA?u4sI{_SM?i z(z&8kMF)%{R&99w;Hw8`cR^#bYS+Zc54;uWbuF{4v+L%rzw4SGOH`k{n-+YJC(eELld_!OqV^PL%dkjcLf(RDA>Sq>lGh{MAsHZxxzu9oQl!DOH8FV+|=4VhmC$dLt>-s*`-XxK7hy(?@2MlPQ{_097;v zKO{!0R{VbCf%D+%6mW-3M^I#eb!%D^K+%gTfP$YDhu@C^4=Ch1uBSb+3_fd z$u2V3A#E<5LG=IV5*wL!SHv}i#)V*rbe*zh(ffopFdafS(_#q-Q<1qQQ<|Davz1&j z;mokm`&IPsY8Sc1n11(y96hftteGegA-<&9B0g=AJta59`O{|mveZupMQrxwdJc)x7o5r7OC;5)z%<-*u>*CLM)`F~yvz zAnqq5$HUIU%YNuOI@PurWIMKT>orb{;iBR1#ya z#5g%4sr`f**MbVB(j}52Sqt)Kp|Jzd@4$&9t9J26OO;>3{@PemV5x!_){DP+;lf)Fv+untc`0)KC<|nH) znkc4aLwEgyL@^5(0i*6rPR$tw+8}r_iA4|;(^Qi6=qO_1h6;11$pU}ch0A)@5awqu z#oSrh@CxBi#!^h4Kx`s`xA+PZA}WehE%) zu}}M>fUqw6`B&5>FKS)#Kf8cNd!*c>HKPfQH+C?Lhk z{VO5@XBZKH?95jINq&CeE2k$!5yE_WsZ8~nQW?Y;q%!m- zOJ&G+K#|J0vZXQ*Hew0|aHHU5T)*e3azG~AA;vU_6HWX`*jR3ewEGL% z_c4Pxu>>i+iub+ap4~dtIbE~h-|&w6C#x2G;>!hvrqPDJwcD*& zw`d-1Y6+-4Mt#v1315b_i}SeEIHOnUM(fFLAv?|=V0F#vNIJUT;*^&dw8xJvS?Wz3 z!^aJ60?vgW86FwpFese&^J#~C@n_628Cy0m)2{h_ZoMcoQ#!MAV8Up`%v@;5E|+mrtG1^<>0vCmud 
z*j#ij2;IoTrd@Cgah}>t+rH*}80_atw$h&N{(D87xBRt^nfz2$d!ll4V)JJbd*`FRVYB-);wPC;N6M)}g1`NcdVJ>hL{ zYf2Gx$pk4RebG$D=;`+wZg^7e9&z-J?#sYEf_EUNY}->dLkeRwtYM_lH^#P6Hr=}a?YbZ|9tX8C_uOno1159XT8)(i!B}zaAj8vj%@15so z>!!9&=Pwkmzw`W0#2G5A`T-Q8dGtUFjNlJnjs=J&1VVv9Aaa0+giIg8uz{*s%Ri5c z9bmXr;M)Ud*{#=f?Ky$BktT?5;-iv<65=v#^rd7!eNIOF-$9H_4T^I#P_?p2s3MsL zC6gfjE)`{t#x5{M_*nxB_cj$p&VMCZ@O@+fEr99~~ zbv*JAnxslxa6WJmmXq%XK80K}GG@G+LpIjDjBl(bSR^STmddzTW5Z1JUDty9M8a}{ zQ9-FV$5|X0^5 z{B_B~?j@y`5X7=Ihg2OTWvVc46&GO(_$qpt+D-<`V8c`V6Xb299z2drOxRFbJ3aE3 zCF>`h6XuCNT!4@$-f-8wAzjf5t^0~?ixoSP6+7mhOI7TiI6T$xwd0tNN;d$maV6>Q z$=y>=q};G0T_tyYk^WV(2n7YKSZ=v>8nwuTm0Vq^P8yI4n_sagR}s3wCSfQrG-w6g z8*t0~vJIDsoSD&$SMbVl_d8MoowyM4@xFWtbG z9FRgguIF8s&vDywO>$$JTz3@G<^l5n8-B`1z{2NU#*`)Q)hD%FT90%JU9Bag(%J-g zNn>wOl=48S#AZmVN!Foe;>c!^$LyDi)qMKznBxL$pvmr5>T8v#@pjcLuBO;-p3nSQn39wD7 zcJZGPh|}ag`i(RCsXAD+9)ZO*Vj2aUSch5iy1-#(30F6S+jKwpH3bGEp$n9a564J7 zkq!J?s|5t#9~0UO$(0kWCRwuRW6V#O`iKI-csd56*($~@{} ztSD-an;4o#JDV~xkP%;51yZpU2dmpK>pKv~myEqb+BmY=-}V-lNH8neSp8xQW5szw zd7QpSAINsLtvIUyo8RvvrT7;tIgp$#Qg!MyLi|(e%0DKD*cvV`+~)ki772+iOKnYhwbHx1>9e~z#2CSv1*2gIhFtE@|xZCotdnk?J8 zP_})+aVS}~eIouRee_r1DM!=n-o;J(lAHF;Kb_jtlXUbTXxUve zeL3mgIQyBq?)D{5(JLn>Pp;&d3-^JJ6*k;++n`-p0L@4WPvG)}rx!fCN&T{D-SqR* zQ847KTZk*)V&KYMoU>-p(U5dB%sf4FD$#M|-RF}X=aP-*-goq^tmn!%uCyv7GEvep z`}FLoWXbN7dr!i$=O?R2O`NmK#AHRU`noo_Bu`M^O^j=^D@IkHU3cLnlVYmamy2?; z!y*cU4COIgIBKDBxsxZkgMILvVaBqUmKf}8;kpemP7tE(C`*ogkx=wJA37lZ8yYwo zK~i$Q$-P!J)0e8)xLDDitZ1LTIu~E4*vCl!&%e^dE=2HA+?nDD_FW7QT{cu&{MV?L zp~kG9%z433*=J@OQ#G3xYqlqAw$B}Z*RoJ^_!m@J@jcW>{0nl{tl5i(nmrFyw%RR= zwH?XYj=3knpVc1v1=WnYk(ax%GI&;qUBSfwLW~iy9@_POq)z=ZRh>3PZ*mKPJ@Xx@ zz~hU7?qs0*-KP_0pIiuh_5qdu9I|~9zzu6_!<>MP^`koF$vhJ^PCO{L&=Y+0Hob&4 ztv0eYy%xc6tlP4<4)(!1=AM4HKDF-f;<}#Xx}NveCqDDU!n!9P(7vAqF==pUb008X zBv<_~Ou6&1M;er`paE=6hC!*@KG&V9+r3z~KUuebe&{`Zp-z?w{9^F(8M)5h;F+G2 zkKW^7r3x!_WO$ZOt=qo1ZYM0V&AZ?2TUdAe0e$?7h9R~`q`^=p>Bm0O5d0EaB>p8~ zJ?#PgV*0ud#REPHgA8XMy>1f*@PGlw##CVEVqkAFuy_8{dmRgbGrzbhlVRDxkyt2# 
zY8Qa!F#jX<=cEBbdV!PGTNbN3lGPn^Pl0$>KZtn$q9KI#vUDo*Bm~8e1|eU|fDqhi z{4WVwXkLTX5!*SVd09O|&lpZduH_>#gl9}ETO=qDm&u`;NcpEu1hs-}R)@QRPPSoVx;K_Fhc~>2&TiMs4>zT}B5LAd)?@M=e7@t-p1~0DKRXYG_@Y3BKYAAYCl+` zn+I{b3$EG2!fVR(!Uz^sSmleybZMY|nilm^m?~O$=33G77DIno4Siv~zu%!9k<*MC z?=Y5u`oLxoB#cnyLA^E@>Q$t+=@IJXG?ZyWSvUudGF|tT>Hh>}x(sFFij^@tD-+U@ ztp3vu$QkPC7<1(8KewSwr=kCxIs4C$2?T8c$HcU zzCdlSaYru~reYsZ&*jx~0Uv!-W=mn{yC6F*5% z0*$1Km36r}%|iQy_EvlpB)kc~jC==2#7I+%Zg{;-EqssMP1rJ2xj`;xQm@LB+V#bI0zLAUrLnan;!sxa%z8F1dz5z;s7#6+qzyB@+(pkh(~DGj`lx?MY^UeGz9bX;Yft;RSRDjf)s>r zXEUXIXpY!h!xTod*J$%8Sqzmx;!*|ZAc$5wVzuG z44R7UTvRvJ>2GN={bTa|Cvu4Hi38+>$oUL87sv^dLlPF8Ri$|rx8kaBBln<)ILRZA zJ)%M|v`C$*H(xzvr9=|53De=a+6-0E13Rc{h($LzSQFmK%5YNna8z}9ve0faBUh#J{wn9a@%ps zFRz*f3^2^DpYx}xcT73c#p`CAiQ<-or$sI8!>ZaP->&&aNR_G! zCQc$x#rmugGjdf-_a)tREZ6S&4n0?O^J3MeWEGv2?M_v7qGrW)Gy4+7n-iYR>bHJg zHJ1nARLWhSfW)e9{hR#T+rMj`-Sx&0`k-zv?6}kQrS%<&;*Nx;C6hNmNJt(KisH;1(uDFa4 zM?^nDS|%aJ;L(tYM97%*9m|w!A1@Nr*SdVU(1S*yEI+WWY>( z-Qtly^PebSg&eI0?9=QRKKqQQ6Iu{CD3Y@sFOe1Z&yg`kW@t2$2#-t^>mF9gnD`^) z{*OcxzlRb)6MY=6t=g9?2&4+?ZysItRWACPlfLGZZ{y9Lzwwn!SeE=9urx@Qua?!N zYq!Ee3*9yTF>X&SNqdTks!i{H-?M(DkSi?x$d8L#aZUNGf9~3X?_k3Fc*5~GEWUip z|E+;r71Kvkp2nMp(={6wYxXB=_NQtN;{MeS>@`cJJJO!|nZt=q`$22UT$66lL@4&@ zV(t$;wKIKldlx(h6Yj?omdDxBUHl>1qHSmrBx;HtUj?KX@n|fU8W^lX18QlZ$R4n> zD32ay^_kh%&{7_dXZ87%U2e1L5Y z5!Um^Y`pa~u$FWQFcy2ccD!JaRmD2&&DmdY>#Ah>X^wGNVWzVeyqzr^w~tw0s5I0- zH;$zF;z`F>Sffy&Fb(d?a~$udcmN&#xgzV zsq-i^|GqLweSkfBT|oy;xlXTu9%h%h3uPAESLRy%U#EUc|IMnW7xgTNH>mNvkB;vf z6$k;b7QS%I5eDt21OD+wHLbjxip4eE8DA6{16rLSZjIJQw7JH#Ua%O#eN7?aQLhSO z3>%umL+!-Z>vJSA-F@PS(8`tr*iIk8t|7QO&A%*nScqZS(Jo*G^L#xmgX;GL>Zz3aR_(XFy?va@WQFLd*ca-@ zTX8QbU3^Ttv5_0?fzw!_iPxZ1X=LQt&Ze?JV8vhql&5{z^EU!AtBUJXT3N;ACa8-E z`9>{TVZ@Z%v#@4l*Ef?cgmOzMTLBuE4f>98OkV8tj-EJoIeg^`tsspF!Je>tpaad+ zcH9z#JE5`uqZOq9MTr}M&4H02nvQ7KSY1oBLj{8|Iv-+GWqBOQ@>m$ARMIz;!yhII^EBku{Ya&eyi%ap2{9OCVzT1Wu< zf_+b&IeaeIcRG0PM9-OE@9Eyd!LHN2eTRGdvR5~3?NdkD>3mA|f%|h5mSrWfk;0C^ z0Y^YFN>6=iR2;RQUCVv> 
zuCwluJ`=s`Y*;F*dhPhEJyo`4!ox~ryW1z{8W$WpKUs+%;zp&$o%7DQ=M(kHLIPX) zo>Hi&xqMT&PNZ@LCJ<~ms4C*0)lDy1~P^hyJYM7dO%)z^e2H}E_E`uyoYk{t8 zgp}-ZjfbYPP(c5B=eQH{Q>vu{HS$m`QND+ALVhW0++8#DDMBgD{XL7t z+}APkEEXl?QwcAB3rjaUCgOr#!y@M@Vnotll8H4@8F8FmSK*VKxJ?}~UMjvI%p#M%j zrmCYa!>ERm4bv+WYb2Na8-#kQ+9?Y`xnK9TY@-}RlGJ^Gy!cTOaBok+Ew zgmlPWaPv#)BLAd)35xJXAr}ITDSm8z$9wjL;xmb&&rH~t^4lS4N;|x1M^V~eKXVb9 z?&+#FDB2X*ZXQ|nab-0VE=ZZC_TGiC2lqENr7E^f9EVn|uX=hk>1~<4G+&);Kayxa zyR>QZoNInxa?3Gj^Gvy>_hEUqj2nr(jY}Kb-;T}oq&6NvFzlN(Ep6LD_dix(6}7!+ z%9W^Uzw0AY#05oDPk!wSOBJAwsfsq3AoAACR3^Qf(rr8E3zBU|k^Ml@w{h8BIQ7}P z?&>!yXOGSuxpVqY&%IyQnXYS|Eq$YQ;`r3nyO?oTupKDR~k0Z45!~owxrR zF+_=gsTYB%IN2f7B&Y#mHw&=pfvTU0>5~>3Wc|2?hC#Q+x~g;oIo6{L7K<_g+_=~$ zkGD|jT;dR3(_z1E8aIp8I@P(P?Y|+b^60mHsb%t3hCh#mVJcAmiV|`F%9gPHc!O)P zDL7?9Wo1-x42}#j_7X@WUVsMcP&|ecbPQFPNP#d1%B->BA<0V0zcDaW)Q2)UcnK4T zpUApHpd@%@I2MNn9hlu1u6vHfAS{oG1^DhQ0_PWza z)4u8%`(1#BuX1|l%)VsRmhU#c@7oFEtuvm5^0q0<^1Aw&J;`GW7@A|hcckD2}e&psV+CR$2r{x`Wja;Hm z)X`@At&%~x+Z?6`i*cxRO;4JJFsKj>kJ}8Sjw~NQ*OfOB$Xqv(p?$1~j9) z3Gtlgks4l12b%tzI+P0fBt0R~=sOIAm}NqB!wi zlS9;1@)*}yGS&SK3y+PQ*Wig=L9NnLnVJ?eYMUWvkBKGt;_njJy@v8ZMg837tv63D z6>gXrn|mr%c<|=&zrmH4=B2W2X^%Gzp}VJY*|%;6f~Acgb9se5rnEn>=--y~Z<}vS z_z!$+rm$r-Y|(!#=|6VQ>nwLocvea|XUU?YD(R@2KKZ_*Wu=1im4h9&R!+ArSfJNf zHeI<`u{l|>IaRTBZqMTO^Q%{eR}o?b;7FG9!~6}&LWM{!R@8aQbf0;G zc@#QYnhgc*tUCXJu<9}r;L4!FlBq#5v6?DuTrAv_EZj6htP$X+z-@9$##a=cae39*g(yumWW5lIZDnkaw;f- zoPS46*@B>#xU{Ey;=s-B6^qr{Hhn}kI=>|elfE1R=AQhcXWn^c@v#%h$4=a%0J$(K zB7-~C5a4f7CJ+Fw{Nw`voLnWiSbbZ-ke^tctS1*0xR3|7gIFRw#1rRY91JVNJi`}q z(TPI2B%`Z?JRK7P*qeL%aG=VK<4Dn<3`|l!?syf$SJYP*gr34s45T9lKqsJ;>@i3w z&ktXhYI>s4kvPc!&&eSIe|YF>SR5WAcxQtonUP=e;Bey=5>bf5kr*3Q$r6QHg;X&Z z9^!*gz7Pf?A;@ooS3_c$3MAh559woOVd$zCeR@^2=X%<=PrDueX@PgwIk`;k@lCTODmWBjc*Qq`_dbi z7B^r^e}AI*z>1x7?}YgRx9w&ZmSUw9x1YN8)bx0&r0r(!lE3oyiCZTY{kxO?-Sh1! 
zf6vWRX?Ho!UdT*_aRl&Tnd>F~$Vg>#0hzvx&&8vv)N7p61tYxaCE6X*Xr}(@T)?DG z`u86I2597EUO{I;QQPsi`dv8xe`(hm+s2ih?~t6~o5PnVi4ra9WlIVr*^+EWmZO-m zto2)V;?1K$S5YK6RxC@|q4SVnZ8LOOHqojdn2bI-Z=o^$TGrG5FQ3O#kb zoD#OKYT0Ut*s4L_@Wd8Wbk_8;pm;zjTQs=fC*ZLSegZH&TF#@(Wvr9aTsohzgXw_p zV?$uZ`9%D&kzy_GB8dd*wE z;%(Qw?eD#^(s@AZJh0L^taT2n@~Ny@aGe&g>Co=DD*cJH>TCa!4+gKtL-8u1)vENN zuV?HDm>!LCuVl?ADieUZV`_F|6*G41u$}{*Pe7u=t4nVrq5Kn?i!so~8-;wzwv}Y+ zzp=&2A(U^THV&d-u2M#eA0z9y|W9^Lk} z=R`#3m`FxBq9_@x`(xe=gp;R{UCjsFl*&vDZ^?0|4u=!FNGo7Qu7i#(Pl*pU^K!h}&Z2CcoYk5yB z@^}RYC%k2PW19uW7+BhtCPHP~EN$z{%3|H1cC}uz(fK}?>>OI=(EFhdG<`jZUzM&p z3+@A9j=>e#(q$)9dyc}sUFnh&ZUJS!Rn&PIYG!_@f$6gJN^uJ#VzGr0k`2z8Z5LA8^$5@D=(%pGp`U6FCLO(1$|D;Cb&_V z(u>!p#HkNA6*onHEo-OlrtG6G>oV52R8NI^i!wyH$m=!a14kxU7BM+psDnaI_A=sG z3M+IBNXAnnYlDz+=K0vz#ETbs9(?!U1B@16s8uIOV*zXe+o4FazeXN&SK|q8SF4OpCrG&it{i>2r=zDQ79F2B zI|>o_CSrqT9QLhKGtwah~{yb04GN;-jXqdPO%FZ7}2#$BsMP5zWhnpGZDdH`|g7Y4AzH` z*jj!JKdfIRF51cg0t;g(q%!%hOd(eys}k8roNT1;Nw1Xo)8!;8t#3=&%0g*f{Ev40 zpUWz_v9TIw&0`!3!Dt&DdkS0oFU(9IdX+{8te7}8z`lVC@u@e?4Qx#mFixk(vA&__ z|2Jge$W(89sAHm7YC?H}=;RZWXAGVmhyo@Qs*vFThSL&}%qB9zgppGaukJbpapK`w za$3Q}gB1k91+AJvDW&R^JJnL2Mped%T>b`+&9g5&OpFV88xtFF(vB#BZ0Hn7Y0crK z<0d8DQL%{(EFgo52*@BS0)@s(uE<&mBU1V;$j@D))^@4RZaOuy1=O~w$D^#RNaWNv zkw_Hv{Xwve{k3yIH2=JUT){Y8KtBw%4KVXZOa#6`d_eDo*nRGG0HPJz5(d4+UQEtN z8O42LjGK!JuB|7aWs)x_z%H3z3ziKbC5l^DNEK;;?D6I>7+n_LfKP!-WL+g1z0=U_ zmb6e{-=tDbg%Vq^>}`e*Qk z-=HrH1r5G%+Tol(ct@^TFsGfKg_p1NB_B#To72^`?^Ir^T&hh~cR@D1twBbqd0r35wJ2+0Z+dS?Z{Q3W$w--Xfh6YFniWO7^?#X~=!_tSw)@P4eG z#nxNanzXMbc_i7lG`ehFdP$YrRa-kcw?Fz*=U=#!f27Kts;$#l?YvEe3nZtJFm{g5 zTsV2M^&bp(ytKYkTvRLb=yPKVy%B!YEhLRIQ&VJo2C)tIm|>jkPJJ{Zm)+~2$7p(^ z(CR~>Wwg;iYIi~*-l&k+RiTMa#!4QkrY4_LNZgc>@KrEqE2O|uNGPq)Bv0`Z2oMMo z_+x;K3lqp+Q`fOGkes}PvRIf{vlyAZ-O8w@MsDFln+rtRf(hmMynPEI8{ zHVn=qJT4;|m%tYHl%G>|eB>5ubh3lGm63%~L3JgAQKgaq?Mzdu34{rd?WTei9u^|Q zcrTGJ+?~GD?VuKE;$9WIsA`aP{{XqCc0J96NY}4C(9i8NGRmT$Fh6 zT`TZgR60wVB2W<7q-8=IvcUw$@MECV(M^wWbL8Bpf}v$riBe?HsM=v8u283p89UaM 
zdb)3WadsRg9+P8A1A;e=HyI;lYUX*&S(O+i5A!s}(F_x>PK><-*3-d*38kG9$VP&aZxw~e zOX(y)GT}_cru#;JQFc*+E=sVSCn#@^(S0^{sf5x^iR&MfxS)53x0#-=O;?&!E4!%% z#I`aHo+7K)o`owqAR&|dJtgwO4nfng&-C_Pa zEO3Vf@34w{tY2mQzcJYa6TI&VrYwOk{3Za2113Br+-3XtA5!nt*xq~W+Zy}!ZzLvk z{hC$WWsUp~Wj(2}C-1SNciCfi*}y$^{4Q&_%i2G)2N$18*|*Kp?mJIS+8xFp=6n&8 z%Bu4oi(p%rT@t@wKq%C*+fPFwwVK5?%xJ8!tlc8Uu!OUAW}PDLx|p>qiv?L1){54hS=lP?gb)aEJ{#ivgaz!t zg3Rj3RyZK92{CI+w$drKz*tnk7Q-AGYRX3|g?&r2H_!ke`W_}Wr={&wksWJNm?~03 z$uZr4{sPIpL7a(|Y+XhZ3DbA7#x}KWP-{4(vO2u-ZOv*kq&yF+EUZfw!IuoZQ+us; zrMgY4Zo{lw@LhkLUvAKuUxH8fIFV-iJF#mq@9sC8T;=dVLwW$KO4^ z5{qlG_{~8rb~s0|SwAy57rSmF2a`{zN(W$|iz{`>7OGbjRV#WQ)vCe?X{Jh^N^fxp zUL;Kh-`uC0fvLCJ0YL|D> zviNwiaq+1)J({y_342Mm>sGo4x0u^>Ctw$OMu3tp=&4B{X2zn2zirlw02nU69 zpfcI`=DsD17Jz7%V$eErY0pvgIHMEUDVT*`W7`gO$G%jgk7(7!a}kB(n{uI;LM|L# z&xHrQR-ras9Z8NY^&}^+)#+vm@7jHR>;w4N2filSaw4vJ)lQ*rLqdeLPHI zQ615(gi#SWJFqPLD$Ak8aTVR#^C(_>MQ!qo+IT!waYAMO)nH9>P;EGx3LaCLZ`D_o z461d5Dc_-a&#J#V*`!7fr~F6eldnCfm{zNEVQGz-X8 z5c{)cv)7cvSYpzf&?mam4)AV5)-MUCY`j&0fBY--nb2vHbqPOqnD7w5ubwo6ATTbQ?M}?z;An~#ZS;OXV zkq~?+Ja6ZygPwPC6ol(v9Mr_BoSVW~tb6gLder*CQ~-m7chyrtpLAc!(>L$brI_GP zyWwIN*EMge4u0cD9VR4!e?dhDGLZ<{tkeAP>U5B*xAsWTytX~v%CKE9G5sZ^@ F{|CtbCv^Y- literal 0 HcmV?d00001 diff --git a/entrypoints/__pycache__/constants.cpython-312.pyc b/entrypoints/__pycache__/constants.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bdeb03a53cf9934414472444167f1f213615557e GIT binary patch literal 320 zcmX@j%ge<81UW3-nf{Cn439w^7y$CY;Ijabn$D2I5XF$f7{!>v6vb4@tjSWP!4;g5 zSd^Ngker`aT#}eqQml}cU!+jxV9zMmco$bU$55Y;TQV?Z9jZxtqE97R5TacS!L?Pe~~;I+_|2QfJ3wAztwfN>jT~!plkF+0@W@G9@Ha$Y~Rj7o?C?@tzS<5g|D`9*ZZ>DDCuy)*x5(R6>YM zL`6vuBd5p3NK}&IQYxBEDOIA7jA^l><6{#EA*Hs)c5-|?fwD{Jc|s5`2qNA}j|(54 z6r|MA$<)LoRb4uih@%lF&{pJADlQ7AB9aglN6$tkMBzeQxTut&Jwhao#)^o7uee9i z^k_VZBCU$^lsJA74O1KwQ|Ne7ti^yT?i8wxB%@=3;y|WJF*&L&78-BPfger#$`yE5 zxoIwir|K^7;+7bHPFoauEIo5+&e`+h)$p5Il)flu_>{gOvC*cvFJpU6^Izt^YSU9Z 
z&*w^<=4ff1;6k?Hw52zg4(y$bCsKX!WRN-%q>co;)9br~$?;S$CY(m+V?lf!i%v{n zvckc^3(Xk-g(*oXc_Nya z6!weaxEQjC4QPbokLqPbsB%f7Z4(TBu)9AsHqk#3pXf_PMJ$fKXD7Eky#4$(p+6Bn z)vu0z|AjIG7FgFSKk} zw(_N$R;*Tk$uehkm)vu4-nO}r?CD*w^{${?Zj?OTY2bic=|#*{PT&Yg0zmR3RyU{q zi#cMJOZG}WQoY!xM>^SjFmgZUFV0d+Z1m+iOq(gM&l%t3_g zy$-E=?Q3+;08L30aFt6Lotfs&8C}6A%^}vNhjUNFQaMEqv96*hyuPXB2k_z3>xKaD zQ$jF`F`1YOj-L*u&c?B@u|~V~_02|03Ww|>?N_q}A#TEl{GqZ@Y3Z{3k)oJKDA{Is z&v8HU*Jr(D8E=#9ZDN6D+1tzl9kREB1=?kA`-Z_Qe^A#KuP>5pe_cgW4fJ1n)Y>%4Ta>?IQJaBYtdB zFL8fvdvw{wRo7(kzp*LX(756$u|M>Ft=-u($YeN=7Fzj_2;{(xV2$yD zEoO;XL9y5_*>prQV#gPM%$^tHK#WCj3NA*6SYnR6Z*KOjxL65jACFKL^9tocnNV@sLOEEgN@AsXxhj#%AM@qK1g5N^GNlsqgaOnL zCZMl4R2U*QqO9T3;^j+wafMdjRc?wCLkNfi@UC0fRueyhu!eh7xKava2XZ5#7MgA~ zsWDuTgy^^x0#4Y7F0TqoK4c@r-Nm(6XhrA2#&p+xnvj?736v!cl4lCdK7%hSUU-jk zJR6HYv|i+MFE^vGD~U6)9ANvwAXC zRbHJGfG$oEc@j^m_?zkj=}uE|1XtJuFPCxd(qD=P@Ln*zw9i;`g9xZ=EQeNNe2yNab$8t|oUS@? zc6>4sW5nhukSmy9Ar`*QE0x3JsY68NkAcJ!VvKBDwL<)(2_c%CoB$!udXsK9o1>yc z6KI-L0gv_MN50mDwucve0}HMJZOwXpXqpO0LRI2cq!hQ2x1Bs%w_(6+Q)~(F zC?c`T;(qpi3dF2ee`UeGKrGjHB^Buf+77W2T z31mKFpM$5t3}7ps$>t*pydHNJG{PIdj&%}Nc@ z%m>oU57Mkm@C)U|q}fP?N|23IGD58MM4TWrU@pye0U{vUhql?uldQop2tIO*<%egT zSV{u9%$=c9j7>f=(HYk=MgxH_i9l9HTzD}aVpM&)x?nU69`p*DE)kakNkc;mv1GN`);4-J7eDWI zbW(il-5|c-b~UMxuMv3*DOv;C1;MgGrc}U}=0P_;WelATnMpl<-X?cp;`Jl~Y}95(x+)4q-`(DSbvmq#FuaSIAd<2}Ma%D6O7W zwtDXP>#lXZ-nZo6aHpbiq3O_K#o-0-;XADzb5CSCcgmeRZ*}f{$FkTuq^+^;OzXpP z>%+HNcfEZ~jdFQ2uI5{==GWWjPQQ_SEqUXZ+_f#!H7Iut-s*bfo#Tsb2W8j6@41>) zK$OqN(4J=dnq;5(GJ|~LPCSl};tBFr^>_%e>7Z$k^Vpbwj8rSMsGj5QSYISubQ{5fPbi>@Qyjg9MC)awnHWPwRaHb zzb=ixLNxxp3WJoVc8%g^ID@)*oZ+*abexHltb&bED5}o7U2sBT?t~xYtf_IOaOYkC>2k)g2NkPyBTFILyWbz}f3~>-ejg-@OgMNZ23KS5< z=t3?(Vk%>0LIA43DvesI5ZrmW985J>{eUz9lv5L{5^7^kCL*gA>Yy;RK}}dM*n|eP z9^9w zrlOFvU{ss|eVKZ|zzuPt&5V+A2>ng1vvJB8EUQbL@?{dFB{_!H9;Fp@8BB ziO6)J(0)L37MIRyXiwHmB8qv2zU;1IG>EYNcl?RQsi`81Yb?xlz)&$YgHHC}Jpnu<{BcJNC9X&)`x0<7Mw)Q`U){?6B9G=C_OfwSWFK}Y@G!eH%h7`G&MI%a$ 
zy)p$MG|19j6rs4H$!KB5@HN3+wjDo_$O~(s|3+Xu~PiGFM6mbg#lso@941s zOLB3j&4k%GSCdPjhxAtJj2UZ(Sza34BB$qNc?@Y1-za&5>GL)b05y5}8Bu72pRGOi}s)s%5{$gYmLlbOz4a_6p0=aAevwAi_S z$#sBbXq8>9YPKa86UFc8yG`w{KJ&^m^Mi6zIMcLKZrVvuxE?uQ z^j-F;6Ung^>vcBC5Tjstlo$Him;kWK^T( zE6Aa>NwX?3>fVGL(ZuLv0@M;&n!xH!1<+bWEZf*5w9b7*5*OG|Nk%D~!$EaGB&=wgt^rO}1#P1Xgwu_=6d`kXD7Y1J+qm-r%fmQWW|YJu#|^O1S~hsYA8v z8rdK03o?l~qxXxx7;Vd?$VXneF0%0!1Y@OZ6V)IEdtg}? zCq`yU9Z+PMK%WFDlsD;v;+ed5)8wi34zT4=Acv(vC&Nq9R^`tr76$Cw#@KQ)`k`t8 z(pw5kP?0EFsud+c$*h&74iaJM@PVKFAGowMESy217c^~G;d;mo{6MfX7VObSoGsFi zbjEsV(O-yehkKbqVe*)~8Kf^1gn7hBHJMVWyk3E|U8&Jd#_Lt(k}>3_&aYBN z97KU)gy23+moA?BD&EuUiqND^TWJ758)F71{s+=a!{BMa1GaJAij1dC_OvZ{!go4B zfn-8^MK zw#sE&vlT6wie9;*_dP3L-glSY`s9kf_w1JPFj$(>HrP0nb+#oos9nKzUA#+ zSieX1?z!#l&hB`8c3;NZDSJDYyxUP^V_T+Sv)r&5MZMe!HE3k*uiL(*2;VYVT{pnW|&`ud|;Y0 z5FDNyJ0*xjrxdh{5CxlkZ4k9VF$&c5&MHGnVmg&8WimO&Jd*qbCZjn9v0mj%b(=yB zJhG`&%5%#>GdYxqtfvtc+o^EUhowVn0NmBgY(Ms4P`bRkvAP#3=hj?_`eQ-rE3cyQfYa{tHK`@X0@quA3&^( zck?ap=7nv?Wbd)t-pyHmXu%ayL6HU$P%+0*n1Lb^BCI7%JE0n5!sPh{q$-waF}{11 zU{RNX9zqSwmJwnqf8~pxy!^=pO=WJb8vDk5+M&vxW_SQaK$T(#Ad0h0ll-&1xDB!r zi`k)!n89~KCEc-RDuW+Gs`52c8L|%mI{UErbu{P$mv;IjEYg5}0AZnaG9?V2mI)}% zB3>y0nF*~n?Y8Th)b?q_Gs~GmUiToP)15_Y%3L9Ogw|6O>A~+-)?EF_%O9DmkSjZ8 zUCe@|OAd6+y0iY;IopE2<9*InR+9BrWxPSz8=R|O^!C2bSxPtId&b````hOZF8cf4 zw<7AU5%t)jf8(;#rrBB-%pYxp&)~Om3yn~ftw0}$D0Y(N434#!e|zx!etP~v^D2U` zGUKt$ky*@2JT9bv+Isp7PQ85s`{7HLtJdGM+o`4I>&TdUR-dQ1mnhSeQ;jht*{@Nb zT}R}sK7;Se)w@Q0jtBKGWk^F~PQ8xj`EA^lPvLh6ze+L!4Y}0yEq)U5 z6kDmXe*v#>hwMj8y0&PCu>4fi6SOJ*Cz3D)eHI{S=7pJTW!3DVwfBf8u;2;icEaH$ ze?RmRnWjy0(>hs8fEjP{<<%15zU$;=#x9AHmxWdadXvZ&>U55XG zv=v={ez0Yzo4?u054GBEwrxZB+jaagcAorB@(cDi;RuE& z2xI13lwjSNz+!zRX_y1DJyO%D%V85j`4%7aCEZapf2L*)=V&e|IqXgR5P81w^lMKq z`azCwW{6GkGX6-@+lf_FBSF;y)hSWF3{W+rJNE-rW@5{n`3%qHlpWJ~mZLf#5$k}Y z%oG}iZI&shF`=x@6dE}+rUEcn#TtS>Gf|!xc5^91fnh4#vYe~N$P_shT$zcm^a-A^ zC42v_O2!Bp>fc<$aKg>7;VmA+8OOBaSL?@&Geuy9xoIb?d9lRE&A3wSY6#~%JM_@G 
zE`^e?$5%A-A2R5eoqbwJ%YV^1}y`G`lGw(G4LwLAq>H)H>#{eFG>u}HCKBBv$0LIuJ$dxV;K@Tnk>mSM9)AjlQeoez*c0PXSnZvmlB1N_ zPg(Xc#SpWWX7<0#pufl#StRO1HPkszAtJP|m#C!vzo7_5aIpifHdstcs!=iy2UDZz z&<8!~ymOv)GEFp*`>Iok{%;{J-BJX;it*eSL9T=F)N>gQrrD{JU3Hp@CBEe9x#OvN zDe?NgB~K_@*O{r?AlGfU5m>C-nyK8n;N5D_aG5}_9O#`tu?Q^E3Y0MGzT@#{JX>yg zwk%t$r3d&sQ1PT+PS5pZ+BV5;n{I5B+aAia4ajW+i#0o5w!s8CPM6 zYxdtR+yB4lEbjsS1L&|Sc7Gr}j$wa(kULPy-?Z@i8*MkOn<%`scYh1_?Y2P+Uf%8C z4|r_vb`EaG%XeBT54fz~+2lE3wSMO#JcW1J5&vDQ9pUe~YzUhpitl0C{(d0}*tVDg z9Tgdz^I~Xmd7SSsH->i`j38qh(70V6b+Q4A`o z04hM@G9b{2egOy!g$M|Mh`njuLg8&7Uu`$H4~FsbZZ{8KJdBk0Gl8YiWn|$ljF-0C zdGU*&ZtB;4`Jp>xyqTFOHi>HoW}H~- z?&6w>S2eSOK})RDPW>hkeb=qp&#q~gd5fp*J5d#CH0LZ0;xgABy$;yjeAWirk0}r( zS2oeEVh2^RwA+xF@ptSh2gP|-h+>mTN+q;ZxFIA;_n0si4|+DmgERK}`9Q@#F$t4A zH31#OWXJAs;)rb)lbKYnsZg)@V0T9>P}Q`n*kMFS)aojdS}l9oyuNZMQ17-3jh^`_N+Wz)OejZXbN-DS7+wH8-xA zsO`>hKZ-e7zXfwj*xNYzMdnU-TO5Rd}UKBaAb(yo8(AX4{wHq^bUX1OwzcWTj z-ODE0e0`t%+KkeNDEUJ%N(JM*CZn`wZAFZdAuIZg86}s2?f-TdC3hjV2cy*c8#784 zjZtDx6Go{dIuGKZLDe761d_g2*3$NP&AR#?U0Ga`03N}(Kz%^g>?omvP0X^RFDpf z=2TIF-V(BNNF>HDl4?oS*XQgN`j}pc1S;&Zp@E?AtT1Y{jC$Hf2ThBVH5C7ef>abk zu5ENI);91M%us$CJk`^9*ZKt|I~6^d654vrb6y-5&&T3o+ran)U62z^=q0Gu(&(mm zp1OcE7~yeoYOR8BtyNwjeXnykFMP;;RbT0AWa2|R8xwq_G1aZ^EFcC~7XT&vz0V{nF$ZQswr}Dl33UTQ+GrreYg-mSwO~Rv3!cz&*u42~r z{E}aE|kc@9)rzw75ITxLi7cdP6auBY4u(qH1AUc_(Z z0=&<1pS7wKG97Gw!J0e7Zzc(zwa#)lrenF}x@>wg(LU}?lu7OjAi!hRd};tT#a2+2 zcE+pO(mg{sP7k<=RZg~5fOaD39MpI0m`EhbuI-2vw4DULWSFypld&3QpKCELdLyon zIUNs4Pd~s85ocME#;Kyj9gr zv0D-$ywksv1vaC?eeJlMNa8P|!%v}PidO}DY#edWwq*ziff;R0PN+&79cGa&L8qkC z+d2OB4YWnEkThO0goHs6c-)Vh1?8>Y4<-nDllNpSYhyAj2Nfx zszf-Bx=<5vCrG}?KE1OToM?%(=7o1A#vxggc7@Gy|7R30okJB2(A=qPx|Vor-%{m9 zvJ3LO?8ycmx-p&E_PD(5@q6H1E5LMC2QyVYa#hb;b+@ZFXY0E%^&8~+4GSChzO#3+ z{*g@Jk%hn`Keuw#TNbJ|FT1(GrsWc@>LC9MJ?^ii4XEzBTwPY_Q9Rbo7EDUZ@^6tq;308D2H)Tn&HB8&oNl8xOb9$#OaJ&Grq)!@T%ghzPY59~pfTvFh1RS{Ce}L*5-UghRz4^1e#m2zg{MqPSqh zg8L3rQ>uFGDf&RGL6ssAP!OHukzif?7I`H2$=y^V{tms807Jc_h7hyqMKzzL59Bc& 
z>ZdU*m$+q1tvz(NqI%hmC)m#V=4+#`)U7!2_Wl;Def@Gtg?&BUVuQzx*a|(Cdo1?# z^MUy@*PEd7w6A}A%L+Y~UAyc@_#1u896jIJx5A#wdr-l;8%T-AJDXSNvAl(I_~<5^ z=4-G+z+(mX;;0eJ4qTT;r(N+_uBfp0>!)22T<+oRTXLseVg6EQFUdBx&h_8eve33o zZrr|XrI>w>vgciQ>Fk-yO)GZ9EpM$}m6T#ADV|1BgqJr|*!T0fL$vgUaiN=XSWvwx zhXqMot>2Zi;$cxep;h%z4Aq0DQ4hlRp5V*$o8f3qM)5P8Ca>gHW)byS=v(!}w!o!4 zu&!$a$8B*%h#i;}$8B{CI^u}gaNyPwv*YM2T`U4if5EODxK(5I%SAlMNk?+MD22jw z5Vw>{6np?Zz}T#&l1ttZ|11m!N6KL1;!$c&=%#3eTW%FdBSG`Z3-G?k=PQdZyaE+ldPF$`KSlW$(egSQ^tMsx{cGX_%fRWZ**DY5_zl0?3gk^2Vmwh;DS(>eF zWRLt4lI%2J!7QNk4{g{b{xK4pn{}<(4c)%$uThcs4a!f#Sa!;4jcZW;29vy&g_&3m znhLV0Vpni3&-C0TCfUdYIf$_sj7XT*(uy{t*dWJk0-mK4Hw2xFoSDoi|J>4*7Ixl{ zR!xz6aI3{5sI#mn8(#mOoH8V>Xpz%59M64i=77Oa;s(bK~?pzB_6`XFwrR$v3f~^iyRn>~W|MX)?PJFxL$WKhTm0r+F@rY27B{zh~ROZht%Wr{}+QKC|n%yzBVlz=>O)6V$-r%flHjRhumf5Z{m~ z>yXPjUOzD(_@n0Q&5NL9w`6u4lkvav*h1%tg|d@BgOwzRM$UUuJzrgL=N;iL8ugU{ zj9xx2%mIlQf4&F9>pTp0Pv2(G!^l-tvyxe>8Dni`?VmEhpBcELTEt3}X0ipIbK+2- zCI6728S84MK+b_gsF1rEsv`$^Z zk0Y=a1~*%xd2$qEVWJ4l(Q?%718wRdaXUMGClbG)#vhdZ!Pg(nbnKMz-@kLgwR81Q&`ze` z$}qgOh5~D!hOEe$nWO8w{em$NR*^w;wh&vT)Hn%9=$iJzKeG_QUxc8;h^vOB7F{*g zyq1ccFiUS(YdNl!V?7N~GdiIT(ahzUIolhhua#zkN9Ewr#o)08|FH$vvDIT_LmoCp ze}e#{|BaEj42AjUv03zf&w!bkfk8MKr%bck<1luVzsQ~6hRxZF!@CA`pG%*F$y8b;VXxKyZza@XMGuA9h%V+It;)oDr^7Q->v>EZei26!0g3OU99iXv1?ST@Uq z)fE>GB|?oKL7>YFnU+I?Xw=LE`JnO$(UP)$PHM1pIU(>bIvab^uvFUo# zE#KCxw_3fZ{o45aX}S5KC77$hXe4_1(b)sbcFy0NmweH;6_U3=KXNfRvn6{RQdeMCyr2>jv_chBO4kOY_Ct|`5!wfjg|f(Hpp@L1mB zvA62N5(G(Dg2!?rZ~w>~WF&adtzO6Z`3`zH4q1cG-l9r5e$FBI9(in&vF3#O#lNJc zeV@GVz*C%bBo#mqCYYy2McR1k?Z!{S*UD=p=s1yRu%(h9f(ckt^O*)$?7>4dl|1%$9Eq7!xg}-*e zSZkHjX2V~#Dmkm3BwR|P={MMh(8b(L>ZN#D5*2q!Y~qy?T+9urFx^qh48IjOEKna~ zhjrD(^9D7Wz?mwc>!1)Z2<+x&@!wL2t)(#vjgv>)fQ@<4h0ls>r~3B;c8O(J9JFX8 zT8pqx4{SYm-Sx9I8F#(xuD`Z@(cST!1J^H?b!2_PYK$g56ks9 zH;cEscK2-er=MQ2BYdwn#P3)Lbl<~YeAsdCN&Zi8MuYT4QupWOYvD zbYA4WF-8mNf+*-=G3@nOEuu%os2&qzUZ2zAdXw0sH;c`Bi`e3g@mfM}6mKHY?lRvy+F(1cUiGxg33njk%_@9y?AY+h!6C)AOv6@OA<$rk*G*Z`Y`1M((o|V;l)|WO8U^gTGH(23j7=PP 
zxb4SHNj`244GhSV1|s?^`^F7_{W*Ahg3 zYO!PKM8$@`Qh^NnswFui08-p%Lct!9EM>AS zKfW_hfCk*aUaFG->|(OVI>jie3Q*7Ya-&$f6p*R75;Yd~{vrHozk_%MQLFGRQvn{A z=`uYUBw%n{Xv-+Dcn6a3O|bZgzRtYIap*GRG*B-y4j(uLsN%d64#vvNXf)85+W=9R zZD77UJGut8(^L?B7&(c+R;J6$G_yR@jjo>hg1JQ9WC!8GLq*eY6vJ_&wj()XcA*G) zaqoL@&eHf9XmEz;VN5vcCZ4!+H=^3A0lr32!Gx!n9Mz3@`CO34bj#%wlDaNSB4a}O zn=Ec4mhHqcG$I+Y2B{}tzcAPYluq~_9nL$tmA6!@%aL$Nak}0Z>+bEj*sbI>^?cr% za7IicKdxzd9!`KKEK@ZcJ1>qT@|V0%}Qr^3v_}T=Tve zVP7?lQu`m(NIJdUIq)L&DK)U0{cIosDkf-7rb#jsED1}xo%1>G<;r9S$%iqo^sUN> z|G8f0I!?p=1tdl!iE$WQz{ zm?q(8e+Oa;Epq93ZtpC&_ikp(LT2-Qj_HUjF&xiVV~CHvf9|bwGl}irgys&NuJpV* z(|u;@T!lMZr3J2UE_<+wp#8D*fHXA*@vl*}IB{)){!$9@nR%{fmg~72>zK*xt}-+U z&BA=d54|94`m$rLcd)Yaw6ze-)S|?{1-jKV zinUIisKF^vHtp`HSIH5$Sa^{@=`|clHCBR>YXIej8!Fxo?r!LU1lUKm{9KMo_CsrS~9qJD+B4Vbd{-rdH1wj|l{0 zck9y}NN94z`ZEv5iE-33u0Omm`8#ukf>+K81()}S3Gg=&?t-Sy6JbAu7}Jz~+zVsm zDP|u6QDrEKddMT{74VFXb}sGrd0pTntZXDi_iQvldj Ns`W-@2|?qn@E2_>)L#Gq literal 0 HcmV?d00001 diff --git a/entrypoints/__pycache__/harmony_utils.cpython-312.pyc b/entrypoints/__pycache__/harmony_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..123f2a7cbe0f1ef5fcbe23a7d541d643a2103e4d GIT binary patch literal 19789 zcmdUXYj9iFnb^H}zh4AM00c?!B@%q=LA|UOB}x<}inK+^t{8g}iWj0l10ZuRC|NQX z)1I^ot-T|%aU*K&jG1m`Le<)a>14W0r|l}~w#hc#4lY3j$TfGWOxQ# zrWlH$W7L@VOOMg;u8OJR>M?a(Gp3=jTpiQKbz?e`*TnR3!p%kkRIC?>5c670YKyc<2VApj8iUQT#-)BzSpuy+n;9F0^)+7PCsXKX{q)I%G`m52rmly*bunUdNnrUyy}pd<)=?}q$YW=}*NA#IaV4YLl%A7k zPx(D~O)d4i51`NAqC*FM3FD!LLn^^An2fOFS7Q;uFdF&FY$P!e3DJUnD2m0gkXG=|nU!i7kXQf~k-6b0(T#1e=&2nTaHx$A&E8`)FjGowzRe(i`YJ#~FNk%+_q;#zc4u)yz)2+NJ*@B~MHU<7|97aqDzqVwwRwhsWa)K`)QZuHb+A+O=smDQGTE!*n`ElWb%>j#K;mILk$# z*U?)Xj8JaXjI$uW*b&422sgn-X9yOa*z`CPzAknghGii=hiJAE77l*AiO`=%Y#Fg28Y9!Mn;B41yd4698TVviEx4rYR>)F@2P?I}|bo>SK&6R&c?=(S+-dL%d*y%9+SvD+Z&HaN~kf)le` zaym{*IZ~%J+7;;vjwf!3kke3;h3eypAdDe+bv7D)WN=}eBZlaXZSYqJLq9enjbq&pV9+C6g%He{mvMl2TZhP}$( znwf^V;Khu=w`mS!Gy&3zV2i_EjD@2N%w#kPa|Q=cq%7fxcr;BM 
zEJ8BOj3*;P#Z`9tCY;MKQ*!KF;(}}`^C-K0OR*)I;F9btp;esVyctbi-?Bgx20&bX zfE|gu`7dF|!lWyEc}m&AGAKoTC}rmiDe6sU$}puZev@-4Wyb@Q?#+WqWtpZ*`<1}i zR%nHYQ*wRrcg~nHvig+qJz&u(-FwtG)d~2WGwsl)NnUv9lYY|+P}Z&O;ak!sm!)Xj znk98q^PNeD{B=sn$t4suB@1@NB4swFa0hLB7?skx3b3i#_8DGNMF>MmO=c63+V-T> zN;`&>W=AMZ36yq6C~XOpp1TNm`L}fkuXAQbJ!eVc(IC%Mt(+_Vrp!~p;+vf7r|w!O zRFl+%>J?bgIcv(Avb=f#>CHRFlyy<{jzt6C=Bx@9Ho%}!u3h|1*%SyvvRN*l+9l_T zzsXiPKZS&C+mkYUDDB}nd&)ivebc1>n^Z_MEYPOghUP1s&GF7=MlelG$K!A~$Jtw< zN)~5DP$z&rRFK27_4KkhAwuhmLqiv2Y7v&H$LN*OOQ$b{2Zu*5UA%ni(%{JOC<`+{ z2_DkWXO6w&;_1H8k>SDNf$-@wXGShw5)4QJl2PE}>csR-!2l=u8hH(f=eR^52$#?d z6ZuN$uV9-Hg@`ba2aqCI7a{;8=PR>OHWHRTa)Rx~cr3~k-^HXAfa=7?bBPK~bGOipeoZ1Uq+&3&*)hxgFrtvn&e@ z7U`n}=3aSYksS#3XkrV#0zDy^5KJ7TpqEUIFMA z;g5R*2;V&Q%uQ7_pzc; zRAuyyr0L_l|M;dQkg4h9EnQF5s%lf(4iyZpjNbRLrzYpw#d~(;JZ-$EE$ivXd3t$I z@9I?6)0Z~oE1PqbC;7^gj}B)mhY?|aM!$=+x{vqo%k-Yj`UkNyUq;_tQaq6LpT*)p zM&DXee2$>@b8{f?Yt8u%@V*0STi(AbZQt}Xwq%vDYR z%}YjzWC%jawP+%WY5+S9aBm?j2gy9Osdp|kZFqY&^t(6hwW6??cY5yCEY{?ljl8ol z?+tuly>HEVck|xe`Ks!BbBlAis&2lj`s}$xpO7& z_U7D8yt^r1QFZV5;&Fhr=m1u$6Xh~3A(#`=b^%N%3Z)gY!G5%DgMF?%qyl-U{_{ms z&Y4n88A%^ok`j4I1+q(OlA2Se=zi*zbHGD2DNVWU$j~<^mVU>z!&lX-z!zCpN?lG0 zN-44eO}(=>X_t{<)a6?P2~6`%MGAR~)=()m8%n8x+iN6l53<&rCh3%0D)s<8->#wN zv@i};ISDFNlt@%bMT(k2rnBuyqFPCwaxx3ZQqmgE=~6m{(`$g!YeDp9QA}W4FhLxtbse8uza>3a*{Z`MZ(sg(kAG} zXTTN!qQr~ivDwIJmYrtFE|+8o`N&8*5)P4l9upjxVC+Br;_0E0=TBb@=|xs~6^rdm zt(VtZFB8u39PfW_ZAfA>mWRf zRiLyrwofRn!&Y!nPR|oK*uF*I($u=MHSeg+IhuJ#^D^_;(f)B|aQW~G^WEr!=*pMY zY?=CjY~|UE<7}ah^7uaqQWpO&4h;Z@X|w(O(7>vD!?9~Q`Pk91Rtr4V+ESq412CSt zu3T*&U)z^=v;l0LEj{u~N7*YC?lb&gcx@zW8&bHB9qMs-@PFiVoeF3Ky(Hmm;XE4n zP6z{>%PpLz9AWr;QUFI@+ODG%y9b(sExK((XfBV!UZB|-_*zO&9k^<6KuOK6oVk@Zw?1@#*Z07;I*>IV$!Ly<^v=RcCNtbrf(?xzVY!IgLI#m9lF^=k zaeon?X0Z3{kxpkwpDIyb}xH91u`OgChXqr((2(fN2@s zoQl!UX%Z^N00agIviPLbMWs@nN!FiKh-*2LnNp=RDfMk=K&~;TP4p5nrjy7R9QQe0 zQc)ZMUdIr-DAszYsF*<1DuJl!0k+s<8J-J0>Wh8BsLW(k&XjgaA(hgA;D@m&N{=?B z9R;(JG*ebm>mR^7u~^R;7;4U#RMgQFE!P8<0n}~IR5DMdB$6=l$e4gK=1!T)k=T?0 
zdxGyxDbt*hrV>c%jq|o7E(>`Wd&&T+xT3heMVHKgUxq6Fe!dc!49bvG4RWsd%Q#ZT zsixw)oFn~9rw8W5iO9)go7*~bE;4g{#hsNz7C@e&TFq3DQLD~TTSikx-YUTt`N_5? zrDoiSi@0NUz{o=|x>_=8fBSyGd$E1YW%pB+yf1K8rrNimi6#Mg&N9%x z()K`N#fs!%{Ll_L)urvcMN8_o4w!KDFnbMP1-(SH1Pv;agt!ShQ6m$K3NN~C8!Rzk z9X1IRC<0)Ten7J+Y{fXvaZ!{^lY$lm%JG=sRyg1U*#E=R#Ox1b9ABc$ZU69e5c5H) z`{1?d*#rYA8~m+FuF7=@I;o;j(bIp4G>66QCRk;_Fg689u#>ZBGT_)ggh5GI&`g0< z@3#AL;#JVy6Tv7Nwuy9cI2fv8!vuthP9z1BXi{Ki<1>O9RQwY1UyV?eWi@V2u!u$g zQNI@qK)*p)k1#AUFZNALu4DBoQhl8G6t<}BNO(kvhKL;+w0MiCt7DObVC)+m9UQ&X zH+)I3Nd}9F=u8wwDVq^R)t^|$LuNLK&CO!+8YalB1T!ub+Sji|Ujs>tn`BYH77P+k zPcJjdh5c=yuN<<<<(t$^P#mX>&<8K_UffGyuC54$-8QoZt<>A zTAw%D7Y?qQs|yCoT?ZnJzx~enC;mYC{HEEnF!?^WVQ$=X`Essq-qpRTS$*}<0Kfly z)-`nJ)Gy9o%vO#5*?~vn@0(YuSHH4W`Mq0w=)}*weLp#nKAWi;CG~oESI??@^|h?) zMEVqV_S`z?D=tsY)xf(NmJeiI?dem6gOtTSf39$VGF2{IwHNuG z{!I7jj1K??GKPWQe0r4fUZRP3_`}`4v(=g{QXxtJL{4h^zmbz%<>e%XeyH%&IdmwY zu)ghK6cua^WT~CRH79 zm2&MEi!iUz9GF}|cGG_W$5ZSWQ|08Sq7-ESgK;_8Ns-OK{BJ0yXfkHvL@3TO!YK-8 zajHwCFfhuOHQO6xJ_zdb15}1VR-+kFO8e#xE38nnXJfd8UF=rsOACT%~gInmog@Y-W_P{Nfdx zq*2~2yPHa81+Xt_0LG*u8_NB#wi5Zy_|PGbdXkQd4;K76a|yJ$Y~Rk} ziWFM|=|2e4zX|FW=dzqUNT*CPX1FVHjFLoT28qZudC=FNPUwAO&29WBFK>e zfi02B&bx1g0147VbQQux$Me0ntmQ={0(6PZ}m}qDw>^>#}V=rN5K9bmVSWAR*jzh z`&jn-n0y_RcQKiSM9{=Qm<7=h{OjypEG-IY3;6mrCV0x(cOVH_K$L=z3e@05sq33i zDrn%96Es9L)u5y*ZW*#$zJYadd$9K~L2-rLa!`%F&qB>Wxi?7(V+MlV|m903GHasy`+&yyV$dWE=uAA=%M(gt53oHhfVp(VV z{JFf|oPK>>@4tULUmHwoK62NBs0BvagNp~3&aJ4j7460MGs`El6)o@%Hekbj!*cb? 
z)vR~-y7$29#q^*ei+x}7$F}d;9vQQ)b0AkK^3;*6t4Dfo;$2P4w^y51D>Kapv#vu@ zaSQKiSP~72QL?w7 z*yz94vDmS6Yo#IU>Rfm2U3I1PqM(@EFgHQHD&M`y#mP^k{q|G~`|c^Fsj{U9o>_=d zs+`c3T)CEM01wOlH9c6c?56)>s=U*&kjk2xU{e|`(5%y)o=sy05Q^Q2Egf3Enze=I z2hfn-^8Vfpech(JZuzARcW0)e>yF_Q9d(i(q_Y)+Kdf5od-weE3*Wi?t;;L>SNCL_ z_y5d&;89iDkf|8lG&ysoR^HUQ;$C?zYdSa&Gh$Eo&JX?OQ!V8_CrUv-b~T=LfRPLx ziV&Rh8JbgKJqN)9g%#K_+aESlz8r*<9|DC^6$3s}s-iJXrYh_R*ppG`+4f8swiUoD z7?jjWMa?efG9d%oA_#xUWo@9d z;=GE4O=KD&&^yIw7==mfG??#0)p}$TKZXqO2p!eYHGej*x8?M^*1@S-Rhz5YzfrY6 zJ@B#Jx74!HcwobRFkiRpgRkEIYOZb%U$Mt6`i-XET+BzeK2q>W&EeCTg zC;66>>s2S8xC42I_nv*xzN~-jX#Kdc?@>#(@m$6c{HSs7!ufn-SFW*_Z|u!A9_AYl zFAPC@zUGbAL%G%yeCvsI?}>bM%SQW=T>BZm{mgpxnJ3<0-W|9%vN*DQ{;|8~6A#t2 zuTV#M4;1!M7We!JvHP;CFrY2x6qFTFZJCEmsd=}YF}uV=MHd~FA3{xh%85=XI;ot* za>iKs9D|~wl6RSoMVls=SuoZ>*#P^otO23}7%+LmQYz7qVX)pnfgD#z+$e;{f;@r4 z6)dIDf-yP$Y9s;1AdW-&1Up3Xf=EqoYwe!_%qiRoy^sM$S5cR%=;SLpSFUF(4$Kd2 zIs#v}yuS75Mu%jR|Xc5KVBtu5z6S4mq3p)F!JW}AyP?vzHd0Plp~ z_BK5aE9D3Pz|U4Pzf~#KR$FROGh@8TdU?$95ky$;7JtDf z0s9&snvi%{zb5g$G-U#0rY@IXU0`GVxVl9Rim54&)g;B#CPQ+|Qs&*Jh1C9fByEUbHliT4`G|Dr=UImDD8QbD7(1A6U|v46Lg1BAZ0bAA!mgBw?An%;k!cn44@1gKn0J^7vy2%q?&H!-sNPKP zFw!;|nrqkrqFsT&aD{R-q(g>a|3BItC1QFLo`^wMLRyx&K5@b82_#TU>HBqb&EQ0#-GlgxTaE$*khWFKTnk&?T`W^L6 z>NWKh3XBK6w+Bop+&zz;fZ%vg5;+yW4n(L=Loq7U!Rc#3(VY{Ny$nHOQRpHq5ZlV| zZw*qYbr<`00m<8)gJO_jFfkjy3eok@hU~TgXHHN8mb)WYH2`FINz8E5Zo&lB*PGbAa%ARg{4k7*YWcPL3yMInn87 z7zY>KE%4Y0YL*MLUqb|Jiia&sJbNn9AcP}(pk@CJg6O%~cpTz!MSBr3U;G5o?ZX6Z zUjp6r%l{3JphXPA?BByO^|gs`NGn>&-b6@jQnZDTR&`<+y`Yzd$nJvgw=3jaJtShO5&e&5xS}%+1B5s>d5E#;$r*O~Dp+8^4ihG7i!=_=@Iu^U1bxKk zwo~p8EID}H2X=>#{l|zYu>perQFgnD_X0R{#<7bCKqlfk=F*S|TD*x ziC;8I*Mea0S>%l~o))!1xqiwUg z?t{bk4=+!CC-$vawt8=Rs1QKa!rQuTZr$|M=e>2ymRzuh5B989tuBQDGL^?4 zU4I54xVAfIKohZ5Ep@G10lG0?S+f+&)pqfqb<#@4fNv8|i_~nmu`s z_xpZ5emcK76Knm%~>{>z!36EFx*WkId+xjAY`O z-WOQ9vUEOE)%wh=t9LC}!Taj0$vGN%N8|E^$BsRP3M$ZEU}$OVFVL`D8EZq{;>lTp zyd}8o-LSOf?cSWd5nR;EgB$j);@kNRd(RUEm-_wC+xuU7@0F#IOjSodP@DHPeeB(r 
zuW$WM;9G&rzJ8b=pC9I@rV{2S5QO<@a4pzieiV#`A3OHK`~XI0B#fT5lcn9WdT6yh zQ*$iqI-ao{&l??NeFoNzjrj^s+E831e@EUExcBnn%hR!>|i=Y}_xB+qPrnrSFCxgfksSGrnV4+wqLy_}@Icpa#pWICY;=W}^+pU2%A= zRqyG19 z8d5<=?BnbfjUFi>kO%7NKLtv$lSa=Gy&WVdNk0X6D=mzbu|2da z*nl|}u|a!E$T7_Rb7+qJGfe(1CMcz_KgHzVVe;pgpg_PPOJRS8$& zVpWHiYYcUHqdjM=fvJV40tWUAjuIkrV`H<7UgG-MYE%5LF|H@&`fSFePONt0kFTqHAG7)y639i zK9(~-$D5zanvZ5QM?X6(JVazzXCYHMtfK!#9>nL9DY(ockv2F^aKVHok}P<8(NIT* zU9`|{A6=t3I#3D7rF76>@K*ZPnPLA1+W8HR%>x-6TlM=x?+t(7{l~T6tKBf4$Y@T8 z^YsJxyoEk1Lp%pxN{1*93|&-Uxq&&uxS4Y6xW&2SDRqsg2wtGfvC|Xa_W+d&`zL2n zFeoNxoMAYD;`#H7=T8=wL1UFgV2=Ht0QkRfkj;>RL0WsV);;rQK62Lpqczy>+V0pE zUVCf^0`v0JJ~2DzFNjMDRg?#9He^ZvA(WPmx73Ue<1E98AljM+dMa7pO0TF&Ji-q3 zbV^T3W=;b!$=bw1*_vu%M+t$&RE2w?91oMen3M*U3|N56_oz)=ffj)u`5|T-a$@Yb zp3#=Q@}h&-4}%o@0k&TfB~*r&?u-XSebkW4tBU0jFw_4n$D)OZa&1a|4I+l5NDu== zg6M|N5JLj7K(HAYI(q^VxCer}f%q~+&F{bCD^dSanEN&Y8B6w?tg4bo085l3H%*Az zCFjO{R{+zpwceep&@doZ#H;tC` zm$F9R(hKXx+9%+*rWc1rfR)|6yEEtB!@Kt&EU=p{-rIG@`o!C~d}{e3C_`y$9xA97 zN8r2Ldv9=Y@CWg1?a&Y6tK1Uz!Rz;5U)fE(bJ@B*YjM7IDC@p}E!F3|A>JEWspq{N z;-xZZ$n0K-FTb2I@6Kp;qsP1Mg6Q%7kv@3JVh9@8nG zsTx*-l8XizQO`#aizNmC)cA$2FKVc`8;JUhnlk$YYgX@26R*y(4ur6Y5VC(&j9)BKOtVY4q~sVi4=rV_g%eK>nQyA~Fe@vx5U?1>J?y{ezb;2 zl<_v^>|uzTR;)bXu%Nk$Gwf%A$fqN70?XR@vYKB`3ea+^=dW`nkWL_WdWy z^J|rcRzZvk6q+cy5riXaKvY8Wwwj!+iMKW7&3^G}B`8X6RfckaR_F0$DCegZO&z>D z7xpf2iw7aLg;o^+gZ;}D%cI}&!qqQ&{|bZX?EloEp=%(%w_#WQM1Q`fKEJCi-yF)f zcI4|CL3{qh8KNul?y9Ax#d8HU(dI% z{ToWC+*82JS6Hh64f!O9?Q~vO=Pqg)bn9D=zt0DgcqP8^|xP9vA=FL5;`*`z# z0>S`x(3&1|XJLVkE=?3D_*tHOia!OzUfP~_g$io;ed6|JeC?U8Q@rmK@9r;Xp-ifd zxftC;J0ZIKrSDmvQjjfNqw8pQ-da;o!|xNVF=MLbwRKOm@K)%jwX|osf2HbM7jU{g ztIXQo??rLSJq5=cO?#K&Y8?KqFqyrl`K~ih$y?!sriHd;8uk<@_+3qAjtuh$MxNqJ z;S!*KY$dsR>cK6<{aC?3(|vTN^C-Uc(FKEscCECo?tRb+k#w{xb7Tk?-c_(w(0d?8 pYDs-(uAqke6Rjz|cfoz<@B+7VY4P^*zKpqr*S0>>!Z#wi{2$FjqOSk| literal 0 HcmV?d00001 diff --git a/entrypoints/__pycache__/launcher.cpython-312.pyc b/entrypoints/__pycache__/launcher.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..d4fac713aa87544f9526b41e451eb51718984b39 GIT binary patch literal 7957 zcmai3Z*UvOb>G9^!yf`52$JF-ktdOi1d9YjIhL(Iw#5-aiIPaUBrPZC#Mcw+xLEN`7cdPegvQ5`homjJ3_xD1^e?=iS5e_LN}3! zL`FerD`(OSEj!W<$Q_C^>q@&A(&kj$Sx?$SX-4s;y-?>;*sL$@%lgy)Y#<$=ZEhu) zZAv#$+M|TB;dGeNUL}%kPB&+x=_oC;N=r7Dj#1jDv}W7VZIt#a?b$u)J(Lb89of!w zXSOTd#UKZg+NAEgAT#;Wdwoc37lV>dY?2%cE~~aN6`5r(LSjhj5yO&amK7t{-RWMj zS@JHNYUHzMF)D>;U1H01S9;$gcw)e_pU(VsW3`IRtVe7EzP=?UzURNl6q9kr^q-xX zd2ZSejJysd=ZLx#cbeWYK{rO88#ldE(x2uf-7wiHNzZAjE&-EEQm+!`U`m~nRq3=M zOR5ofnB8ZGhxv(-Z}DeSBcrJ){`AG?CujK4)Y!=L7iP=>yLEi>^u>wiE~I8seCkpP zIHt$bDVq(3jZ9BpI4xmgN}9!zJ}+UY3&^T$@QOA!M~m&VvMTa9tYsu!=kvO(&ha@7 z6CuKuP@|F{rZCp9*00KB8I@GwvLf-iuJG8J(hTLV${7u- z{FOxk&*>Rw!{HDm{sH{94L3rsMqX`xHF_Q~ptMGfX`zw7S}QWPnlPwH}GeE}8x z_G)2-Ub}vQv#EM+&$JRK1Qz=0^)_V;*i^v}dhiXqRL>U<)@hrvXSb#MesuTI3;u;e zcAt7)V6d~m%rPn>I^V6Wur^kt;Lt(O)T73b-3N9tVbk>-Y7f^-HU+guY`UI9?KkTs zn|jxOw`vx@-z+w-Mb?_vqO(pha6PyZ6r081b=OK0tn5jfzn&XU+4RD3o2uu9puIzI zyB^j5WWAqFHTL(R^~fvWT>cF1-A`OMoqz6j!~J%kLX&;A>(Pa0?Vk1AZe`FkdJ?_X z)rX88yLC77-jDE0AgxnuA_}__GEUpvg^+zVE8#-;cJ_mln}A=eN-!)AvYToAYm- zi%dMa{11K(KJZ%D-~_JftT?dg9Q6g<^OpG@ ztUO0Y({hFk4lV)XTKqaUZx}gl9z2PJIYrLOhR$7;1#TYPTvoWo%W6i;<`l`0cu^}h8d%q$r z@nlldmjTynk=~3ziA?Qyfgnb|1%BHKBtLfEMC)jwihrP@_SuTSpmoP1+8Nxu&g`ac z^)Z&v68af43GUsK5!8&NfbTZ693Yi|F5#;Z<_$pyS5B8iY7 z!<|N}tE*q0N*dW*GAHMTK#ArgW9VA`$zxAlc~VL$^5tZ12@XO{UR9KA5)KwG)eIn#L~h9p>GGT^DEvGexFX@@{Zta3=uG0IqGbd{e8-*z({q~MbG(N%5o>F@6ipCK&otE@vyB%-v9>$*k?N*b(3jMVteFv*hsSwU zB}h}zw452pU-gp$lAiQ;2ELjkhsw$?>X%yfdLS~n0b-mI)hX3*B% zhLXUO%cz@CD%}osGd-4S!4k&Lj8D#(UdU%sQxkX}89#KIUK2&KKrkji%k%=6B?~aU znXIS)ayMC9A>u8V;4e&|7be#m$4C!1-DccIDj;M-4IAjOOee@;Z6RzZ?EL}4bciH_ zkeJ@8I^aH1)lU)ve{?bfvEYs0sZ*ycUG>uu=|#d*z}Q84WWerxNj3378*&F;J^%9g zjmUv}u7l;~*v-zr>RfYgMLTW{-0rwDx)~i_^KOOPewe>Cdt2NLA6j$W4@B3Vy(QjK z%E8!YZ0JT{J+KiQyd%Ch`t$Mk$3K|c{N~sv?3qd!MY}68H1K$-|7fxQXqi25C${Z$ z40twN`zjvP{aC4MsMs~Mkr@4@xY>1n!?g!$*?|r3;MT#34R)aH?b`76ZS|hmV7otS 
zirsi|{l#0Si%myvKLg3?*!@FiSI3CR`;U^mx9swhbzGm=s`8F)HubHJZMiycJ$~DB z&voFlK(rL-D+c;D0|Tq0_hWlXvHoJL|LyO*cl3kD|LNE-j+IVK6;DiU4oq*xW>zQ4 z(Y8`_e=)lM_R)7vzIF1S`#*^N!t>GS?US2*f3z8$T0Q?ciz53!cOqZYXH9#{dwSmV zzu_;pc9mKW6k89J+qz3_@nT!N9P2E_4isYt9{AiLf5nU3?Z0nxyT7p=V%$xzV7IU2 z>MFXremeH%#2XWLp1J2bUUspyy2&>t?@Zovov64X?%r~!qr`S>upPe+@7)adR=Sa| z%`$bLusxMt!`E^=Qnu)2ZAIgE`ZuG=HE%f(-3at-1*116 z*C)%7bLB{^9BD5{T5hthv$xvc?0utmv*WSN$RQB8DfA$Sd|~T^pSs5$D~DTeytMw( zW|%9o+Mz*Gl?sSX>alAzT3AZTIgp!l`N0FpLS_~cyv zM+!swxEHFfkt5!dGv)<@0~-hST!n(fT^2G|=D-4|;vffJFLMQKN3|z`QW6QTIcn&Oz^5~r#3!0P^4;Hs{)>YQm;0(JD`$>bb>?EK{fcz6Iy8BLUuRpw;F zz{%m^!;jN$<3zjQq%(ppaf=eYe3LYAQoTqCGE583BL>SA|?$7Qj3= zD`QA4pg=S(qiDcF^cOsoDrE?&r@e{J(uuM4re!J9%w*hcdaJr(`eogA^O%J#HG8ST zoKK~m<3}!xUrO;8C;8MQcwF*gH8OJ+ocO$!S43XACL23cEmZaX)uDGP*(DeolVlZs zIth1IQ6+-fx^KSt`isA6efrK=Y4GXd;L{ripI)8Z3WQ66P6Ins@LBE@hSrfwkt|EYKnEYYEiz1ucknc;H#;j%_lMj*+=@# zI@Xw3Ms!>c+Q|Ole8sWkjJt$5Oh=E5Y(95sm>VH5jL;ckvA|H&EC+;6s#raJo%YdG zWr;Hu!SpQUB>gy-q9AO?^B?VF)z{t$6MouzH*U~XEP#EY0GB9M4h&jViEP^>9gNfd zRtT5iW{8Gc77`*q1-e44XXzcM=cJ50D`&WaM0OYuq7@BGv_F{ zTS<9{btFcUnq4r_vYN{qHsA(>8Q-yl+QLY;o#NN+ItM5k2C)I}*R23Xm{YXNfNYHKS|*dnR1o4*tndN0NTn{vy3@p_*~V!WUb2OpoG@fTozoPJnkgf1Xjm4MB()^4 zcSCen^(5T^Vkok3Ae;!ng{_(*XHY_V$45d`kY;ltDw=ivck3gUPP z!{diUIK;ZrWZ|`3n1d$@*j<=Ng~%TqdtQYl5F|cL$Nzstp!PE>3E(IwTp-Mrxuzp5 z%8Eio7GQ(3f+0f`47ag%Pzi2a1-%deP=Ij6l#>DM2%Qempqvzyki($hgFfF>jj4q? 
zEU8Uz^M z5cQ>|CPyyt)2XRT@aLA))YQc(a6vq2b?`hLy6SdTtxaX@jw2)?9bwHhc%?8; zgbHiXe^Lc>#|R%u{(>|S7zpnfl_SXGE4e$1?#`c{+;kr-yV+{#*-iJMTIpU_V$+?h zI6Uq#=I`4dAd+u8k>@Z;21yeC+(Rm3ZlPn=bPDWPy+vArls!NI*z!XcRj;Jfayy_fDT|iuHYszqYGPiIp1}lylRK;bDZu?gKj_{jS*3G7GsfKZRuIYGYyWxA Q`_NA4UyeSYIn|i|19Tt8=>Px# literal 0 HcmV?d00001 diff --git a/entrypoints/__pycache__/llm.cpython-312.pyc b/entrypoints/__pycache__/llm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de54cb05206ab7f0fb5d3aaf0198db0554a0fb10 GIT binary patch literal 67802 zcmeFa3v?XUc_vu>0MO`epd0TuP`wdffW&>%aH@@B8N^C2k2v)cLi@@3-2dzoHv)IQ5H1 zSL~AXO-YdyTT~jQUwhORvya+ij!}n=?j2EQtYEYtRybN1D;h0g&umfGsEd759CMGl z*)wO<6Z4LG*>yp*Bvv|F%B~BezLs=aMXYkPGFCNO6{{YtX3t&G znpo{=HUA8QzGV9$%Ajj^WDCU)(PHpf~&fb?scf@v%?u_jk-No*G(cQ5-`b_NH=sEVRHhMmGVe|sKu8Te! zyEu9=_NmcN#XddyY4)r>`kC11=qS5xh+c|4H~L)c`O)XuePi^qv9ZxHcHI;mkA+4< z?7BI6IW{pm!LD1PN-R7Ywn^avwGe$RM<-)bqf<7?E`^^6UwO;MF=#Y`d%w~eF1*wB z7I2A<(P_JMLQ>klA}Jllv#U33fzF@M>ydz6a~%mqqoK>uu;w}+eqk;=GZ79rw2}*- zIeTpE$jM_zo;o#hLURny+|Zm8p+quZ(|i|0(TEaCM&dKaR5h;Rku&*%5(BWJ3pZxN z7elJ%R>HGtcp{VxD*>lgemES8oezi9i7TgO63Gzq(ER4JGjqw=xui9fD2diQp@sou zED=|eW0wIXF?Kb4V{CG6W`gnu?6M>A*lZ-qigP4BGZ~rEyl3O_Xjs+nYR)IsxryYQ z8di8l3H?P$Brz75i6qA+BjKn*6|;bPIs~@|@F$PZw6j~kqJr)hmB($0dv;zu~7)w%{ zjU~~VV-Y2x`KQ83J^8GPG)bUWoe2niIs^1V^C74CuvQt3PmRssP6t zF#JK*Q(+ngp40Jj!{=xev4K+Y%v@+D8JTCrd^()G5?8254M(D(L?SX7;pNxU>w}>B zSnP6GQBb(t=aq!h8=jV|OGI6=J`u|0pO{ls6nTuWiux7LN_2p%+Y%V^mI2A1X=ORk zpNh;VTIu-^P!n0`pP|k=oC~4ERY37#rZBKy^FPhe(R@-#O*s*oxDr-)kJBhYalFsc z7+?thB6W392!k^j#!P}y8U>WG_#~1Q9f{9jOwbo)d`ukyLh0qi!{XDYpXSeT<2xOn ziX^EUCxBpVqAEl;sF4ZHK6Z|v>Burclc6{c=@*Zlz@q(4DJ+fJ6lv71*hU?S9slU; z#J>Vu6|#E=t{h5{;!=v)8-8y+yWeP3JYlEe4HqaS;X`|)2Uj0dp@olYfRfkj@UVXSkX~172{+a-@8GkKdSJ=sN z<7wn@sh+wu>aeB@ms?QsI9ja)!Znk2Wy@EbqqSJn>XbgQ$f4zGSW2hT{}tz3dRL9s zE7z1DQa9ksMtnJd>!xB!xvp$weKYumL)jK?4!4AxMefS>a3k&;jeFiUJJ43LvNPPu z+k};W7fRkn>B4Q6`s~J2x1|mN+_%&Hq*vMVm7>v(4LRn8GwD?JqO_gLP`Fvy6>ihp 
zejjT~r?qD$3zhvy*;OpjKjlT`0NRtv+HEQ8LG(MX$)RwM(qrur{W+Dho0Ze8Jn@x+ z(SR}>mczZ%7xl)p{7l&dY{mS_sT@Wr`cT$BF$3EO?x5b<$5D%I9HKZY^U4XN*iI?%t6$L_>%o)8$T=0>p*#u5 zPvP%0(pkm>rPkZ*Y3ph-Z`s`w3%At=chjR9fg7sJ)-pyM0neZNru02MJbDTV9^xwxHXXnFv^%lR7rx&Qj zhgggIm1i;P_F=s9F@Nz5=lZfKADw^xGUvy0%6C7@*QgjBgS-y?1J;qZtydXEzE21&R#I#pzJxY#^06N7H%9z(d7nPE<@G+^ytRk*UVa{U zcz9h&K1+Ghd;A`uUxtHYd7u9LN-~a;9L?#A&xVfy>yL>Ngpcdx$oKT;RgO|6guXd} z74R}<)|2?1!0##iD)@a;Sx~}ZH`c^Sl=dmCh*P3fc#GrB6}&mE0Hc)%Yh$}IjW3^O z7=85(`}$F&kMN_p>t}cm)ir0#=fnG5Nbxb&QqTfkC5n2DfcN$5xG^KHux7?l!ZXS& z=J6M>ZcwpI&1Ki60lW#{|sS~@_DxMZVtgY=~fbGp>t@DJwnct+-YMnVqFC~`ulw&hu34#OaP6G69I#&O|FBq2`_BOym-CZcm-Nac{scqpQ% z`*0Ht%}mXKsFq{Ir^-FEac~`%qc>!Qi1NtgIWXr8R%k-SD>+829C%Xs#Mx&P0YE?j za4E;avAB9ePRxcT!ZL&famLoOdPIm8kA;;8m{kS}%zY9i=?Ps8$xmIB8KEBt$|qRO zLh|KMa^i{{pL|>;`xvmKSx3FW0NT=C$t!x@`s6wAoKq+<8Y3BwMk7d{*q7AYtOS1{X9U-F+XG2L=m5_XL zZVKG})NyK`3uBw8 zF<|{V0@j>iCLcvPpx79rCSc+Rv&1MF*h>{y4nfldZ%le5^G7m0l6%6zsbC);r z$k(rgXL7Pp;@4-Q@er}_tik!{qrqg>#Q1=Q({-@IG^nCuY$S~lVhzc|^oC#zpFM?= zFqs0dKaT4VM1YiDj>Leo2?H4QArSK#q*HNyN>L(^!X+UTz{p9U@gkFvup(c+!Ah10 zs}N?%lM#&h#Ek?bKxU&5EH=krnhD8?FeC^N<;ZFzaWyEPzY?E|Dsp@VlgVW;4>N=@ z3fh%=8E|sDFg$ZDqQ+-vY+?2yz;uY3^ExZ>SST`+V3?8!C;K>T_D+8G+-X!C!Y?}aU034HwSiV zB>@`0%wFL>NaJyw<_5gNx2T-aEd-JZGR}%QE{Br(Qo)8PUTmD;y0H*3!D~=H31kLJ z<(3Z9iWMSZ3E=QgWT2O&$S904S@FjxE;Y?6Md@!BS5_q1UWzyzd52$Mt6fLR`DG&CtqKCD3}BPym3 z$hao1@~T@4OafL&sjW@Rn8h|f`FFPT%5tM2u<EsY#yUGm1{ephh&A22} zNVfS11<{FxNT(^pS2Bs>S-Fx|=0KQ2vL8q026lGm&e_Q;YJ6_$3T9<<<{%=3(zCp1 zc1XtK@)eY&Pe%90LN|asgt_s#WPg0JKc{>;Dc5u6%+moGTADaf%hauOT)oD{&Z2!U z-vFUHB%h}eW1>eDj*wJZ#sFtjFUBNG5|j^vV&+PPTpVTODN2a-iaMT=9yGA7MWZo> z;xxlTaRWJj6bof!W)h&OZdb42HS#jbM&)BocNr~8>#Cd|5iHGwHXs5M?QtDC2IQXu z4Akr#%ao5rzTAs!1*%Yv!Tjx#1B|WIM9iwdBTNweQ!3VKn!uB4JW7ps1*BkX4g*>5 z0m%3np{Q)6j!1nHf+~)$lbmedtg<5@$k1S+WKMDNcBI@xbeN1;uAllLAV;QVP~Kd8 z%b$+{JaIiVJI1&ieMA##^qhzsW@GG#$q5M*L8lWchJXV2lru;s!cnI(3Tiz~LYlU? 
z3e^~pB0;p7PAKZU6OqF}5<&7A#UpsEl6V_Yv7AH61f|HhF^yQ5L>WyC{!fKkiW+c6 z9%Rc0dNK!`$b%2*D(gq zJer5~Z%~1p)}dZlM?0|o1T~xFqf1PJM4mFXSquP%MLEDkQxd`;>nm~4&l@tp984`t z#1ZHRAOeuCpIC5Oj&s04#|t(Sy-VdMlIsRtT@R@WupnWX_QErGJ`o-ZgHuq4w!RUpV2sS<_~j(p6Ln^C zi1A?%2WLekg7ge^f^wRr88?xS%60)$f+3laja~y;G8+cLLBzE&R|yv3a+oG$piM5Q z<1}BqNMc2dJF4^9E0O-~{TFrmef)E6AXUd85S&f44aqweEG3y`Ry8{N0*nz-9Eil? zGyOV=BvUHNnfJbEAMrS+$=T!@WNj>ulNo`q*@38%0b$l42ijuc&&kfP=&`fsjvX1kaO~*w^0SzF$8!>xtR-8ia|$QWo)$6Fn2AkxY_wt| z(Ad7l$Jw>5!NG08txWtf7~C4%noBBPzY+nx2c#iri8v?})$A4`iF5-dqj8Txia_%T z7iiKlj;^u}P&TZHAtEV3PQgh6iVwsMeKmJT!*9LayI z69Y0OipEM=PR>EVXa&kUe~z&%76k0=-8fhY2QQ7A_EF>dq-;KUUYE!cIcQ2`IcdcO z;3NZw$VK53idFtYOFQwjjQA$owxCQS0Fo;_HR?$`5^$Ep$b9X(G@Q6GGhx`0=tWxh z&=9{{q!XHxi2jk|n)DMg%bWV2{(_GG_0Uhq%u^++yg;K?NRv5CTZ+(3z=@<*Y;dYt zsd35Vx>`jZd8$^nL1d}bVm{3)Nz|&un;c1-R+&%yr4=x-kLI-sEHszkueEZsbjC9$ z@xWSje&Skln^d({noFQ*B{{^2R=4RwuDQe#ta+`=mR6BxCD2N8#<*6Ue{#_3pajwF z0>{8jl59{Pep*#t9z!d)ptwj{Mr$s0;p^(^)HGjuvfr}K2WK&NNvMAjDS zr|9%)I(>#tqjb7Nr|0POJe@vEr!hK>(BXN1e~;8cW`JxyN$tnIgY<(7 zElJ(vRFYCHNdZd|2BvClvH;20F%6jOk5tQB`u7Bp<=|vF^jhFHD@>o8bcwGG=ryps zDvZJmdO@~HfWXo%v=+l0HYF=tBT}mMP1`>~EfgogPF25Tk4OvEsm3o}x4>^qR&Rh@ zouulqBiSIXQjMatciiG?p?&634y=u(JZQpVWon#m5)jQ_VZU@#dbQ)#&Iw0Eny^1# zD@o6xO&2;AI+HCTZ4yuUkunTuk!m-e~&_VS^*-BE4IJ?afh zQ}thb#!~Y7Jnefh*(&m4C9nU_EV(S8J+|asD0%%t7fNoFz$kfF5*k!~q?)JO#kKjX zcoeU|5u=~FH%VJ!r07X?E8eR|F;IF^Jxb}-9!W~HCp!h)=`L|){)#dse90>DRPm*{ zza=UDZ#x9!FWHg-<1MgKDNEVDZd1xrcAP6x4xB4fPMoV!1vpoy3URJU72#Z)a^YN; zD#p1!<;J-o<-xfz<;A%vRf2PKsubszln-Y)<;S@-)qrzbsto7$R5{KasS2DsQq;6Sgv(&djyfJ^3qEu_D?Jd2ZfCVP@ zyHai6lD=&>`l#(ocBOcs?elHdZP%q29iNe|+a~NbeU`e8SECRF;Uy$Q! 
zP%9Ip09MyAl8YrSXzuHA^=eohn}{Z~axBn71fs9RoVbFm^HUeOib^mO^Ouf8r_D)2 zo#2wYH-6vEh+N`n*8>3OHVq|eP(BLwcu4Mq9*+4v(B*vhsTr=0&I=M$0cm5QA{c?W zv6Bb`t(-W^T&<%zf>zcVQE6e9F~%h+P)}YCO+f#xXl1Yy39B>E5=TQYeFt|iUxRk* z*$C~x{BJ=1nIK`PPz$2=L}!zcL^Ry5(<-2L$vT8Y)+A?xm=MhR*(j6h3qlwQuP6*9 zAbBU_5aUbAv|lJr1wTo`t8SgQObqChHw*wErXtFFAg83k*&E$KHHv!wJP9R6_Iz?` z$sm+qc8cQh^Pp-~nqgHeYm$159{ZW7!BR=}33^P1 z^J7O&51&7;xgir9;|Vmco{(7@sYfWKPnQ~_RhTKhRwG<4gg%QQcPyb*kgdNV`yFFV zra6HSam}~>v6m6ef~qby$Z5?RiJQcAV2cKJBRctWN1RWNGr1JD8bi~lFVKm)5sehO zu2*J^^|~#rHEd)bFgkRsv0c?{*R+N_Jc*ISHKss60r`j41elvo73zyrjGH(mX!Rxm z(#JghcHclUHajpIne9(P{t-_0zc{yh@1Co>!voRC<$>87kZ#Tl5dIE8AgJD$jRPAK z10+M8y`j0rn6=N?*t{3W6cl%VNCjzkO43Wx8XW4pEw?5!p1y2vaPfG$qATO=e&DHD z+LiINX6ssRc{6qWi%)(~DEaG`df!x5d|U2!Z(Hr&lj+`buYS4v=*_~Lsf@qlfv4pb zJ_+2J&U(w!72O$c&z+7B9QFbC&kCiIVcUb|j&$eVO!LsA=3_y8Q&W zJmcSjVwE?h8-qV^u9WRWg7>Auf%3INsdM*g$No&m{*{h{9~3xis{gX%;Oj@1hL>Kr z-_WxX7`ivS(r|FCK&q=>8hqWE?Kt>TO7y-19|P!`Um6&;;o@QMzUAHnX=;y-v}Y?T z%l02!Sn&B z&CP4yoPTXT-L~tVZMkM>xnkdFv)f_nb?+Iv;o|Z@%!=)7i?w^v;v- zR-U|H*|A!=B~!U&rLzBFd1JObaJ%Pr{hH+1F>J%RY}oe5QBYC3CKZ&Feo!jawEk3T zsd4|fyk^aT-@mM=TPwit`-M_f<6042xuhC-t(dOdQe*F$hpxO*>yEV&x+;}gwyyc; z$}e>tTq~oia;bgKS_NHIN^QH=s_3d(s;FJ7p{rV{zH6VCquwuLVH2y{PP z1u44$y4p&G9;B;n^vQO*+CiD@q^n(0_hH-GZo1qf$@|v!($x@Zmo2S){{U_lJs-eP zwYX{N(uym9-f8Gqt>2NU-;u3t|H1y%?WZ!^PXQYn+>6dve0bf|daLp`TE5$|+&H-C zTyfPsZ0Y*e4Z3Q~9z0CH9S`c8zccXqz-s-0O#Ol7`a_Gp`>xVgJuiFGm4VxbR$TkC zEkX9dgGTu~XI?+E+PEjvxQ7AD)-`>n<@J`;x^0=dZOE`Ro15f4SKZ%yFy@dd+kYw* zV~pMRSAWy{ns?1^t2$zPP}!cXtooq1psf4@KM)|@{^UD@@3f~|N0zJ4q@TT*_Fw!z z*DC0<|M;jE!{Nh6jW{QWJpH3)&r`dkKR(oSy1(>?jZLSwm;U*#;?up3KR?uUy3_F& zodvjtXRLXU1cIp2N4s$PrUZJ}d}mxu>vXH}02hMNKQDdX247h9t9YiB(5hv~xxQ~x ze;rQ~l%K4A8)x;qIL(*Q6ql0{4!Kf-NaB~ItgqtLFTDJPRbOAm*SG8ord`2Dcp=U2 z8;1_n1kN<1A5XZvJ!lAR2ohC>Rr$Zg1*&YzTV+EuL!aOyK;uAtgQHI%RzM_^$%?5v`EfSSWBREgl0Q4^8FR7_A#*0E8CSk%S3TP@ zo^5ycta$ckdj=PezIx{6Gw*pjANK6O_nGCM6UGDed-y&-$`J+ocR@KYjMB8xrU0TJ 
z(=ei6e+V?SNjD_*7TyM&>aXDk0##7!>KKHe%$|hfZzC2_7uFj0BbH&w7_r8~XtFEq$tLr^i&&Kv3tYY~1$GF(6Jzi=- zovHKI9rec@npGpvXv2%U z2sAm|uoIx&b*}`S&R2g6B@3$mJJl30ry&Xal$+MeDAjoQkEc}>F0fZOapx06%sP`@Y7*sqRGPlamZ*-dkaE{@};w^9uzx3*9Pk5*d?gYSAG3iG2000sG?L<* zW<>gG3TtHom|;=di*|t-6U>u=^2{zu_Y)as5lUdikO-np$=jueK=q+Au(A^d41ruL z0fNY@fHJXmpZp)StHeI4UV(=uw z2tY56^SN%^EQ*N(noBg*{1-SG!(1D~Jxm78T!ai+Uk*aS*p$iqAWpvk^F_*)I%2%P zpPNSr%nOeTbfluNPpEZ5;yqjo`O_e%wI5&|)e2dQB^X^|k|p5eh)@Z*IW_0uxiAwW z^~-pn-lcElF(TBE3oMP2H((CTe5NMgpv8%zgf*HCT6-*{D1ul@Xr3`Z)n%vZ*9e}U zX$N57CTtAUom7Zo&dyMA)!(5L70oOwE28@Qc%~KUwh0NMVF-cP!8y_2#Un0k6MgUn zq7;7;cWBq3~Fi0L4VV3 zQ5^I4C40&auE8JyQ#MnLCJ4BMBW0ViE!Zrn?0Hi8QuZlZ(jW>=Zh9WX<_w6>1!s~( z`|Pk1tS}fTc(riCF+~Og;MthLKmo{TY1&ZAiSJ+nQIIOkUl*awd(Hkz2QwllTqrm% z)N{DLuJfB7&RTj%^ebfa9yTN}1;NOZ52*@h9GISn4IgH|6xIBKrvaBrUnJDep=2tt zzM2C|yXKgW%xcXTLp&H2*HMwkAIt`EvGh@($TD6Sqd|P8Rw~#n4jBWVeWiKv+#IpQ z9OMj}{{au?J7}_c++hC`8cu(KWy<{#tysdN5NxxLSQ6Q+;631DQfeZQ9$C z1+rIkWGXsVEB3rwvFFGBhFhLhd4ERUpO%k4sAxfwhrYU{?l9Cq8gV zmCZ}hJ118vcCI<3()z`bH_qRx`POsqdIJy2n%0V>;+hX?r9j{96L&wm(sT5Kk|IBe zMFYnm#O=6Qxa7rfD=Ay`4rIIoS$}ob-2r|l+GXtAoOcWmR__a zp!AJMFWMHI2|j`hrY+^n$Dj!Y?k$voj=T(-?Fn^E5|ofq<*nnfXVpIX?5T zX)uB+5ThF`AZCD6!L%_pc?sz(N{zATNEI@3sUlNRo-)isrfbY^#8s-$EE7o|z((5D zu9SV+F#ZuKF{WLNEv|Y@sSTEtm)T<0RG4z5ip2OQhI2&ov%$+3zcJF*GQITt!>2`H z0I@6zSKgwwNHq4W5q=QpQfs!QpTUj2WyT;h?L7 zhp+NJS=K|AfjESBfR&736g^D}%vDP#F!vEjkbnykC~UmyNd&CrVf4QVoz%A6S&z@fZBB;NNljeY|PjiA)Oh8i#Ej7aYF&lpM zPbtY4DM=9w*VXhoQb z8A|*sx+@Wwz$CX?xpB)XMh@&iCP727XK>KZL%{q?0Gn@PGx0~x|NoBWVMH{=JPc(} zS?m3Zrlm=!2P?K@TROip|N4BocmHzBfp=Suyj`BIJaY2{PWq%she0dC0vVE$3F~A` z^F1S_4&q`n8C*X7h>(B1r7sScCTyWU11N)UIL7?m$%sTgclHQlgMq$G0L0C8;EH4< z-YCN1WruYPi>bj`M*f{H0Aj)fo(EzAkO{oOX>V{0q%|JgA&j|Ma0xI)G++}nD#6c^ zOiC}31*oNTI*GtyRS=`WJ<0*l%#%qvIOhb{*@^Hh$*yE@0ElNnQeUH{0@gSgD}b5P zT~qMAnOw{T;w2nYzyhJvNHb1-IKBZK)n}ZW^1_M#0&!-!85pw~u}XSF>efp+z;s+0 zlAjK}7>UirWO_guWA=#8AeIIi?0N{Y2$BF8u?>Rqxe$Ez&>}g&sIr&FFL3l@Rw}UJ 
zf1c1l&n763ljvz2Z3?e74wz;iIFrJBCAvk-!3}2i#!NJl^sa+}bDxIK>2M2+je& zcEfXl%jJM_6!QZGWy>BbEoDj0wjy|IuHcB{c~YL6WuAil2=fWy7lEFe(W8tKI0lI( zag%!&O3%`XIQ+558$3Lw&Q5@nX6m*dW5Q51I{j-p#c)c{%)*+Q^Pv9*w_Kf+3!)K< zoR$5gi_1RDgs1r|<|KJpehudy!1!d7)>}Ho$fm0= zQ!I3q;)sBhV`)(+mWVKioS%76 z3*7J`B|}%KJV3ck=D&;QhM#UVN-7Y|qlYPmC1g}6b^mAduvMw6KEQ){ACU!?)cXmc zh@b{SRT~J^oALGD-m&c4ns#m7C`2$;z8~ScZ%U70V%kg{lgWx(o*8;4T_a^MFeaM= zjg;++{+xUam84)FR38Zk^1&sxAEE_A6Q$TK0t7?VZnd=ik}cqv-*AXebFh6vfNNJIl$>8-B4#Y7b`9>K$iI+(eIA1S}UvubL z7SIM9yTRvkTJ+240!CqK#iT`H3x%4(!+?WNHjLOLiCDsc6A+S3Fc1xpM2=#Ak0n?L zaf7p`Fv|&@eEi5XrwDZ-0Q!>j0YqHYH=}Q!T=De(p#5%idE1d6Cf?coC#iS0orgdy z>pu-Dk;Gwk(0D|o!9wiKq1(|vcxt8nWVTYidGO66w~M}aI#aPV?cMsYv?g7@;|CY+ z^)2r_k=}7KU3+S|^vSgANrsB*J!GksVeaP4%)|{uGDZ}W2|`P85JyV&eY6dTf_cuK zPsa^z(wNCQoArCfK41qZy~o01m%ro&CjW7hnr^!!y_##Xja4rihY z#hogLOOz{BuCNfmDLi40OdjL8ho&k#_ps+)_PpA3Y%xoiDmPlC9PqqMW>7g(i8gE( zwJ?8?qJ&bsrLS=dl_~#pr}WC~-%sF4e(sx--?H zUv;IrrirXucPKteH$AlOF>YV7z0&McL2R1?!vlhnof1{7>o`rzvGMDO{ zCT@A%k?NVUQP1OTo>Hc|jki12eaOFgBG1#n^x=(9^*oMle1n{iZ+v>E_APx)S?Epm z605_?`?T=0K8&2t!|A;x|DFHv#o+?r5?$nDp zWMNBci(*R^VtllXb%|6rZLbVCF}`2`N|%^`jP2+6R;qGM$K{8>v`DOIim z7&ENkxL}k>P^9GBMsa-zQF3a0Tys*#s58LJ90OPoP!g#>9^JwT+g6d;Fs{}g5hy|N z1{k{j_@XcW^OQ}oGr#S;=CSN6oqVuD#8=Fd+%)D?oG|;eaT8Cb+~8Pj+E}Qh*W=Ff zG~Hq=p+bhH3Cjy27Qm(cTl{Dh?5$8tasiW-Yc=*N;!s3FW)@0qIKP`1TDsEtnBaaE z#Wis-sBdz#$%ns!)=NBxBEb4{@W|U2{^g}Vx|H7f3@IH+o!*i0b)5g3)|LzC$mWh{r+p`CTR}Va!Iq)odyEo$>x?j1$IL+G)77i4>jABIuyW`BYMK(;0jHgred3uU`QETTxKvrdTvA;Cu&*8<@~R3`!CkHmN}6Gz*C& z9+9aZi4+r8z~QQp>L_#IU=dv`8K0dc%m&(XNi9rHAb(Thm>g-0sNqSHeL?=oow53( zP)J3MQUr4lpX#(7e~)O1e@e13+0^skc7W)4t?>V;?VC2t+~rPKZ~u3W>o)ZkK7==D z0iiamyr1kkUYu1AaxOP7@)&9Uknn`m|9>t0%}Y|c_T=yRZl7GL z{Z8|@n}5&uVS+}@m#S-RIklmOtTxASVblg>7;G2xl^@iOE#CkZtCC*7hTQtuIn)0$ zfK~rJPS$#Ex@63mfSj}2m zZ9nE_fl zz5G);k-4x+^9?g6SN{cFk$_1n#41jH!-bHAK~k)G>0to`gzyk8rfVmyx=;kp#k6AD zG6@m&;3DmYf?^H|U6>y65&63W@t@O)wZt#c6>FIPL_mr*w1-x#TfT)A^<7F#eTmY* 
z4H@ppg$hL1D3y~OW7QJ83Jh3mG)VHnv;8f)dPpbmHj>$Js)KHwIB6x;bv?m+Jhal> z9pBXd1rW4y%K#8grp3m{p&5&-X4y=n6+>mI$2}>rPX5r8>bK0@MDrOC;A{=!XwA)L z7QX$IZA>eM>WJ?>M;g?GRxQ$#XF9f}!`uUtFNLoXdQJ%)RH-PEeqYF~xwHbFwS&I< z8&)*~=uG&!o|`SAd37~7dqUw`NY%hPP;=>}R7i)*X6iAhdNh~#Uh{HIF9*hYUi0gZ zjGiT=Mi@yp6(xw77WBx+{N*A#kELI@+}85oGc<^AV~j#>3y-Vk`<4`9evyDw9_@80Qmx|eIuEtWi}tX-|_%2akOSN1Fx z->=)9g=>$uE#qy=dg~t*IO^StrSK*6R;FtM>Drz5g%a0R*873O!DM^2O^YWVcpB5N zgp=EEo%o(N?QL3f78UPa89YMfGQ0o63wxR3-7vvv?#otHFV(;H*}JxTpMwKlvm5$d z*u@PDE*3w;$4}qg{Udo_#@o1f^5*1&+WN&)Szqg|iy7aR?ABfB-KR2JpS)QN*V2r? zKkMtbU6}C&v)lKkv0iUKOD_g8{;d!6FYebhy%D{Ae7SD>&BCnQmG0h~k%wL@c_0T? z<*gZc>)q;muFT-@vV8bv$pfl%Bl5lPZ_8G;t~uat_ncb=u%m)~^5`LL`;sH=A~ZS@Cksan2iyT5Dbeq~LD zmY?O?lkc=GS3V7q>hAKJ#biiWcXQ&6qxUPDRx5ill|Aq4Q#uj^svk-As%jV`HkH3G zIcv&qIskCkcFYE7OOB-@OXrqcx7u%oZe70By&KOKDB9o-X!ARRz=LXd^uJFj&}@Tn z#qGaYvh?}2atR)jO|LhhIkO$RvQ2H-)`7KRslNM>R8U`jv+#pbX<%EnviJ6x|G?aW zD|=r%ee>C+Be!}`z23fbW!HVQ@6OxpKWYjt*|ObR@0NWxwdBpVcHg#rw{EHMK|||z z4!(Zyc59}gZ?$1(hW-yj(ccsNjn9Ag^LIn=bYATq&U6pI-J0n>x^yzTcle(-|6cQt z_MXbfJ8!veZ@ItaKz7US)h!1zTMoY6^LBk^%Tw9@y{r9)GyR9(zVh~HrvFTK@WATe zlbOLMvs?GCZataVdNSKJxZ1Th)3x_~hi!1^!Or2;o#!$;&t>-wH8eXAfRJ9e#h9LRJWxOe5fjuQ`Z?%r>ghMu&o zRY}bQAMBAjdb72?cT3-K0#Q1uZ=Oc0bZxu4s|8(T1Cv5Ph{^=1LkY%kw zYC3LPo3u+6Rf|Rcy0&%kWY$}`>TS+=n^(P^8E@yUE59-G-I*2d&igH$S%2-Szb)f$ zyIuM=L6=L@-bR40@4da{_E5TR*W#0ZRoiycnQiP`ZS2c5_T9elo0qAlaqoB=&Q^Lgmr%HB zCAr&eHs;)tO(d2@Y$Zy5FiFH*gH^C@lMBlLofpumDOqG@E{rWlDUyc{oTy@2vMY08 z^0jM%)_8G{=$?eJkIrFIeXKsK#ndsGCVQ5vLIIY7m z`6y#NreerrPP^c-=(y};sG4%8Ja=qw5r>Im!KzlUlP^}P@Q&ROR+gkn@|o+UiiE+T zAX9B5QYuXqLx{5os6LBftq;O}cghPNFQO!IJcl-3@Gn5vZ!z6-=ZP5Rvj_~`lQzYf z@wyHAKpVI~VSl@*!0^-OB4_J*TP!EHEcug;=Q+>hhA%e+7cf zEf#-6n8fKNrs#$>3~j{$OeP|kZA`&UQ4qaU#D}qL0HS|M5hKCEnVWOK=#BeKkc9~` zcX6Clwm}kCT3Zc=&V?z|F<7~A%6nXtgF9xBr$b^wjWFG(@H;W=JxocpY}xrN<6VQG2aj#=soM7v(flF)s(q zM~tGxzP2i3=Se=Im04{<=jLF>XAzl{sfUq9Ev6C>uc9{4>AQ4#jZQM1$W~D+GyaIP2amEr;33V@J@E(eXbs#pzYfnw1xeW$FWxPfLx*17~ 
zn%rMbA^XtB_=Z9jvs~vZNEo#4p&!#u;@6NT)YRKZO?~A}=ZdHEVN3g)NAFaw1W#rH zr!pN+zH@T9!M@Vt=s;eX8qJS#zb{@KS zZpHP)gRY*%qQ9!_SUd{iYv8ag>#tD;;E z+jYPFV79X9eTTE7{6SOeS^<8seAo27UxcfSU;e-=HSa*okmhpGquJJe{0AA^%*fb2 zkg-H}!tS!8f3c+Siut-$4Oi z0vUhr17A(r*PadTq6fW*xbnc)mIfid^$0y6+FU0OmUg}8ZCg8Uld3w`F4`Drz)eet z^%e&$Wo#+_EH0RR5;|`E0Ru|%y(q$DT&yo^w1neW@M5{7@7EnEFLyaMmpDwzj!+s@ zfH4ag(=s>fAd5VjMf1(frX|t^%&{A0Zjuu7X#uiinySb(mpFP7e!;{;*Bw|VEH+1A z0z`h0xsf#{cTT>flRLsamhjtnB&;a`UpzBv1K^Ork$nm(Vo^;ntMi<#n}p7R?AW-S zBz?>5Q?NA|vs&U-=iI;#22N%s++y$yBq+nD$h^*y(MBDJAo>tg-0&f^&YsNLDkPj> zt#2$~#Wc{*)h5=!hwh_CLlU1uDT!1$K zVs+v!Kzs`rW^zkUH*qfhXh_tizi=)Vz4VXIxj2U*;y7v$R+vi&)oA7uQPW7sPg4SY zwsH6XNlDs|nT>g7aJ+z*LO6;|0R*VIbs&Bcdf5@uD$1?YP<%u@dP{t04EVTcSJl^)HE11l=G3TMS@eL!b zUv7(&oI3KDyu`F*gn);8POjZ3N3R4Fk}yuC`s%YR`EJK7w#Eh;T1t88b8TcUh^+?N zXxX7%*Vr%abIQN){J6Q45um&bHgn;pEj2)AE3_k|SZ%}=Ip0_bra4GDs08}Gr5sku(j=_F1@ zSRxWEkx&galO7#~$cM}q;A>e3+ftK`BVtU}1VzDP(rHc?4(H(kh14B`a;7)cGuT zA->bg$GDMvUyuc2W5QK6i79pEq zVh=H0AGoAai1%s)1oJn4j?*N=Tz8~OMIFU2#9cv%yGj=9GeeeMu$vaSM(J1$K0#ha z=>Z#j+)Hj#$SCX$5sr$y^c&X|6SvrOq0WLYBu>qf3PI-GIYH*D1f-OaJK7F{6|`OH zv|*JhQbEXDm?{=Egk7xzww1LIx|WU^7Td4LXdfxB^|1}yDTTxGyrqju0YS=V$(C~7 z@ro8-AXA?%vjpO5Iv}ph-;^)eEAB)o7a%xQp9YLgu`K_)X_6ggB z6bT85l+sk0X&(&hX`kX37K~F81hi#BJUxX7aw$J}^zzV$m{^9vrXZFHpQ>o_B--st zL{T96#PUviqi=A>pd?Jt=N3~jXTGg0+b_a-{i6@(mAg1`9zh_ACBzRAKl)>~9g_&! 
zWgv$@lJ$J{Mp(J@7}B!T1_NMoP%10P z18wY+HpVD&7%W0>WKy^g!^{XjgO|M+AKU;4v-)WxyiHm!u)qe;_|68XAS0i zHadq*Lzx%?vV4fS#1E!)NRnLW9E^A~Oi>7n{U+1&$f-=p387I(8r5r3x40p*jVOc zX|jpx@8MghHTCTzxV4@7G#+YYY-3A!FhCQ?`ZvMslfittb(sN$Kw$=wXTcUR9jNBw z2E}0|P{)JBF}>#R1D+)i%hw5f3D$6|usSHs1L?kye{d}+IXtPubRrF>)?fyPQ-!Gr zzMxot2^JlT?ckslLdy;$Rv9n*2lVy-NT;vS=@02dX2`@GL;tBuRA?VJ7<9rTHk3@N zfpUvk<|(Qe1I5fTnWeVDAZx{JyAfD5s$6|b)zm7iJ*ji&W>h;xJgF7c!4rL@;e?ZTaR=`+ja-L$2GsRUgY2ruxir(F+(*P0UuucagHYPj#Ix;gbma>dhe zyZY|#OkgP6)+OxjDU4m(*K;3cOm;`rVcPMarW`^InBDm$xns3uYo=xEYRkb)%fWls zms?J7V{LOEc$UoUsYSQt(dyif>Q$ ziNk4s&;6#3Tc7<=(~g_RvXy_3~{_e}bkbLpn@%jFl+-U|<_ z8kau%qpH5eV_9z*#oJkGyp>w^JUQu@?b_DT$K-Ic&Zz5FpOe9&15X(@U686LUxf%H;{{iB++LeSU%&{PzSIHDW6}iZ#eVfk?<% z0Si^tH$AU;NRtBlEGSpR6lIQ>rJaHSKrfs7E`xNy;eMJfumaNWhl&wt@!%a*>d zQHV|~TZNWotMD<)mJuw|Shfm{Wvh@aTc63hY%!h`uEfO4GRe;`)~gF7UEnc&gv^>y z^&k5N3crwe!nf=1?kAti$Nj?!tRJ^r9ihFW3>nVu5FmE)yr6Ma>8ckPV_1*S1S1eCuo}2S1Z0)g=w*3dpL@`U!nEt1*F4zwI7zr zOBYs~1~d5Y8Pu1n6Pqnp=33Q`Xw|e<6|%L;FP*ZDV67VYvx2vsEB-C3{+${B&b!Kc z{(bKhWPBrO*9g|C5w=$SpmDgu$#^Q#kE1LmPqh^{A7OoqsLjw<+OgEw1&74!X}^GV z&7h48jz*U>e#EM<(NnNucQ8zbXz~B2+oR+N!~VZDiJs+U`{c7H ztqgx>rI0`jtu$B44ho!2Q6wc%O+h7zCm|Cik{q7=zfLcIK&Lz;ICI?N;JGam{G~~C zz-5N~`uAxjgJlWLWJ451@^deLj;LzBfd!~)kk@qqP}Tkh7A&8|&1aGxga?6z;@F`6 zyKTa-+5t2}#)C~aHlL5MnHYr{VNPs_LT8d8?!hvoC7&TI_GomOL>xp zRfb`&;Z1o#%$Hb*`4ZE~FW<&!DZ?hiAm&R9V!i~#eBp>1A`GxHa3-QK5jHIidh8PJ z@KnTDEp(WEu?;L7z_U#to|Wj0IMsdT|>Wn z>9;OnXXKj~Up}Mri(nRsWh8!a7k1~Bigv+49?s5tj@#Z2{YCpblYhP?z55~}lV^7y z`{BepU4L@*?O1yE1w44@DNXxYmOb*V;decq*_MGHw5N9+U*2{i!(`eI%i6QmE$P7C z^pW#$$}M*<7D9+zR<-JH&G=ha{o6ABZFhIA>^rmUKf74)kg$*G3+3*TS$ev3++Zeg6j^WMJlZ>vSN-~s8WhjRdB|wH0|18uSuhR-I1yk z-`J#?vQ5$$!Uov8dGbOjH=(@p%glbRb6rNbg`;sXDAFQWQ0kb-?B}}kq%1&eQhhdG zsD^!8&&D*lPjI$d=$ z{j6nSsk1CBW<~QXEZljuB9{qA{9rUBSu=AtReN$!vnRJ{9%erX?Quj+XtozLo1!)% zkye7O6C*Ris9L3cWO-0xx7mJN<^4-kN5&4)(9K(jqN?reQBcO`oW{c2>Tb#TSAjg*T`XO_#)roCse8)aj{9ds^=|_1-O7Y1*?m^1xNS>T1fknwF-ZZFKFp-`w`*flTw@a`Sem 
zZyy8^7QEr1t2OJbUG;Wlyj|&@gXt$|oho+2S+t_&*XOZ6*>ZWuV$s9k;N7l!T`R%i zmFmNp;P7{k-#UETzGPoI{A(|!gTsrSO!eWvs%&}r=;H9=3-?K)4nGcKaW$ku6wnvKU3GgTsLsD@P5;dyPsWd zI(pNc?LPeWlj*KAHw#z%?OA{Osvi!jy|;J0=im0gxI4Jw9}+IA2j24r*4m{id95o~ znaWJ_>pG!m{U~KBJa%&y=-n&(&o29)SuFU=$1Tv04oNjfY#;vgpyWSkOOQwG9|ud1 zZ!7#46^+ODIGK?>mSc&gH(EmJ`q6*F`Byx@;{BD9&6eq}Vn_OXLB>AD4#0Ugb^EH< zT%s4fh|J*lic4{t;eo~Umb0&Lfq42Q8%kr4wWxu9sGTMmixfn(tvj%I8Jj98e2XUQ z&c()*&Xl@k>psSIyI62kefztV4a<&O zUv|)*3vT2ERztKi78fr_2pb3*gAjn5z~f+J(>$2b+dmd;dk$=ySz4p4)iCxZSBr9l zxoZ`5F@F`qfndQlQ(-Bmr40;NV%tA!ZH)FYQc;d}mZSYKIofk_w6u%?%RyTRF4&Xp zqQqntAl}-fP4Kt|1O+W(>yDR8Orrv9Mh3j00|xHWG_8>9j?Gd}AkZ5^PrD5w0apQE z=+^+wkhE`$w9JLbejv&_2=S0mgma)h3aSX|~#apx{dgy;AgR>Nc$H_rJJ z_}b0T#@E7M6wck)tQO|}*so{A^u470aP(blA}aUSHi2x@fxzrv)YB-h-YVqI38eDB+9lU)7{2}{)Wgc05xE+h{f?`86?9Ge- zGCXtviv>zs*6G0m7%ucgVxcLrH4P^xf&p1KpG8yBUT+38c$uqlSTs`L7Au0WW`iN- zak5slfaZk<&>6OGinZN&>lv=efM?lwoxy|GF=BhdloX}l=!3pPDhtAH6gP8x8zR3@ zIEBg~5rlwe2ftS%j)2~Mx#Nao);J^ApQegDO}n(6C96Y>f$LWy*u{oo^`NV1iz7LV z)*&lU490wbtZhLh3vzdSUI^y8|55vS8_h289eL>^3WI@gz#Iycgw=f;m!Yyw1y<1H zV2&hshQcNQkEtaYj`D_0#`EUIl$?4?>Q&6?US6g3+rMuBb&0k`4RTxvo@FY43t&{9 zXLFj>xHaGY?;D`ysG~*8(C;tQng7MAGif}6kNV`2c-?CSGYDg)Ke_3R`V_db`8gU1 z48lL8O**lg#sLwRI_y~f1Aa7Hl3DTS z8wkC`)tMjS&3vO#Lo@8Q5jpt=v8O~h{|yhFTzVoic>Ff_yd~cXB4`gD=h2hDTv}#> z7j_86R+)}8!RFEYVvqk!TP5b6VHz+AMeKM`bCVrAv}xfeH(+O?l=)yzQP{?D2<~v1 z)HFm9nBT|;4#52f;A2GLv zoF*p~3v2Sf{&-FPwBF=aj?&=U;8yNn9ny|Y&?$F-H4c*>HU-NmL(r-?2>svE2jqe@ zQ-fHB)V&p!?}*uFhvhpbeYao{TjQM$CF$3bOLGYSP0rvc@Dhl{&th)R53nBBX*8W$ z0)>}r@kgCb-Gs@J2c;w55NuJKiv_BmdYvlGL&%W?lx;1BsMzp;u(PIT(!?CBaC5Sa z93JMI^2`tB>u(V*d=XHA3zofSYXY|~W@@(IZ|=!9Qjpu0dc-KNqrkxReYdY=>UL&Z zdskZrGp&R78|60}Z(mz(WYK^dpCohp`qv97WgFrI*KN6vP;S$AyOvw`-s{P<9$6~9 zU(>L3{OhSV6PcRcw7(Y-%d$01H%mAweGT;uRC4hqd1-LT=1p8_4lq%gA3L#IPdxDipRlhXEjN~ABXpP=zZ&YjzlHShIh$mP4?nIiM?(v@ zu3YqCXWwNuyW_m#Il+uq>}3Zmo*$)BN!_Y9fGybX`)iiGtF^&QZSb!5UTE3>#M{H~ z`j0&%LFLoSMU`nAJC4z1Mm6<)||y9)%UAA@0ZuVaVS&11z~q<`W}{d 
zyHe7mFW;}0N*b4ruXx*k>_t!GcgpS^i_Eg~)aC4Q0jCe!KYY#Z1pW z2w1;4`r7EN;^nH|myh2ssaYy~)3H+0k*#jL`RT>uB%=L9>0NENcHQ2!;@Sr1kyTI6 zyPh8A*V3P<=+FA=*f#apslE%*e6#*$gbjS;bCF|(3udmB5~6xn^kgb}9+UFJpB|E` zcYgR&FV0K``6mZ`Cw4l2*ylSj&+v>!g2cBgRKG#UTgGM+79Czsw@OG%0zGWv;qt!fd)c?@>dd%0 z-*a`dA^rDUXfJJoLB3jf%lu6$v!9pQOs3^2G6jNs&(-x0QRH$8k4H{z*eCeKu?ceA z+tAprQ^BjL;2RVFn>@wu&A58sbM>*}YXxkZ55rXleZw$?Vas#4Vc0@FyY4VC19CCK zuXV>XGG2eAY=~&A!vaRx5z?_66qj4@cV{6yBJP*#YlwqQM8n}(9$8m^sBg$cUne#qiz_hh`ZrT^;P63hkCgA}M4uvBiab!G-G5eY|8;Q)21Vbl$EfSxDe3Im=8;WJL43g6Doe75( z^0dY#M0^+Mgr#Q*Sc1ffrtk~)Zxok%1Z*zNczhiyV3ozH#BAc|ea;>`Z{UGWMA1z0 zfM&x^OiQ4lxW=$0<7!!fYE-O7q(!}T3^9M-bM^8G5?cAGME?142#`L>YHHOt-2f9E zT0!xQcKQ_UJe}Js5(=M7j~LcBrr_L1`v)f2H^ zXDk~i>r!=kL&HT4V?K?OMBj=!d?*@Oi;>2d76^99K3yNKvi>>3i$IKgF%-q##Sk?j z;wyx>dE$JG_q6FqLm?5bUgPUuTVUK069ym!bYQ$$!gY} zqCG%yiZ>Gw6FeGxL+$iogDjaUf=^~fsu4jZ73p~j1pW4U1pN*nsE2w`PxUUhr+>TZH5@%bBm{ z(5%nkZxXuu=V5dP_i;0QQPPkBiIz&(Ua3qPmJoTj{Vp1#K4ix$y)ccVV2cj5$rAR@ zFrrjpemGNL+BGZ`Gm8ON&Yl=91Pyc-OI9O8(Vi&t8@d^y2RnzDCHI;bB1*1q1^sSl z>O~1*r030NZm1YR4>x&l*mel`%(O3G06uHE^7TLgg6y&F!)Kt&x6~mYR*`#)NQ=OH zD*PzLz2Y{*^ua{aJ>6>7QCyjZ*9eRFUz1?HT}rlzxBpLJR~p>bb)DY>0b(JDjo=Cp z3jyvVD1w_-QW7a@hZBiOoHPkbiIOForj-ZEwpcn=8h3`CdM0Gj)?n&5P~(Z=iDtsK zX+yV}Mn9Uw)9DX9iwU1G=9roIM}9sE&bVoQwBI=o3m3%UNEdf|_uYHnx#ym9?m6d% zA^L$EL5t+UOoj8g#otm|0Y&W&jNb;SBveGdYI{z6PI$iT^TKmtyEQi7Hvw#?-IWJ4 zbe%X4IGfiB<*MPfS!V{Q*eX= zBBL^v@w2fL7bYKzUHE*aLi2+%3EpBlW1>x(aI<#LpZOZ3_;mH=wTx#n?`~xhZg%-x z6g)}ph6)ndWvr9r6gbyk&Et&u)bm%b@hvGiLB`Y-oSf(DwA*(WSFBOz7#;r;#b`ho zE1e9yVqwI}|#9ypebZ*u9mToSd{;;+o zG4!p4MJxGkx&GRX^@`T}qpRgVIKOuE-7XUNr5i6jbX`tcFH?n?H_ohAbpI%{I`c3( z`e5PR=w;MU9OogQCEIHqgi_^niCl&MjnS0pdly%~_-^BW=e*`?{V*F#CEefJRHIOJMudYDTlJ#uy^5;$!(FMz*TaJ=rk{*0o_){Rkcg(k1MAMg#gkjD z?P*iJs&|>)?>Dw2rPsf-Y+e$V26xD(>5ja9+1;b__ykt-|Om6Jf58T_MfcAifk{E`ehz2ThNDe(w51xL*uU)&e4YREkymik zKN8B!?)U5g=pMdmU$$>Jx^d-3Ix@2EIIg&=7Ek{{HKM}*@H4x?)Ef{lF1Z4c0I(^Vs*`Rks6g?qc!%y|MQZO~p6)`I3$NT-(4$_wRF9DV;-7jRY?^-lt|h-r 
z!L!^1qRB~+;B}m2PkZ%0h)s__brTnrP6ygc)J5f|20#1gnF`#;!>443}r5}t} zKl7QhV+FJesb7)mjLlc?JSSqc58|)pgY+(HJ#YoGGj0GGIpG*P_=maJUtz8^JZvWX z^Ckt1eF+)otz-Y2DiIRLEkC3UgMC0dMr7>umnhh8BAY_)yni>Y+J}iBBV=VmP0+nA z!HSzELomwO@=350Pv8R~{3?lsfuJaEuS@d(1;Q5-~u zd^m_~^Z^G2!@wJ>H{|fh!vLLQ1WqZT{?*6j(D2$<1OVm~Yve~D%Hge*b@`cKD3@CCQi;#V<}ojtwb?h3 zm$BuVn$()gyCNw}q2>gyuvO09w_)DOz_gNn6= zZ=Q0I(~>|RtSzx;Gz?+SpnF=OFAt|<{%G1BO`D<&9#xVi*@Vf&cOI6|6E9`v#g{+8Al>MKB79j%PD^+zdV^VG9$aeNiL#8SIOqv)9aVv4Q^A#3ymnTTkF*}qVXKrv3E$wNsXcck?e3bf;)owV4u2{8T| z)j^8cgvKZiuf2z4Q~hGa(x(&R`_*-?xf9bHzNqYr;x1bpTP|9*BGHWqj4MtuJh%Wg z1P=g)3ZZ8dMv3J;7TMIfXiL|10w(ejppa|=-UI+Ol-_C-C>@eqHXGs*2%S_LHDs!OFjYf(89#VRq3%aBmMGgeVwE zkX0haQKHUtMSj%y8u}*-Y zlVey>4)l>WuswgpH_Wx?i{P=)`jP-Y{JFzZpg&t|-uWqQGXm+Tp}hFSybJt%Bmk$# z$mG56aDX}O`Lam#`n(gn>{Ia`D0ORG^Wng}F_+b@r%QF`re>H=&s0v|nVow2=G954 z!Q-M`IOC1a;Zzxx;WH5RUYjmp3+f`#6Mguf9Y;{mLj=gYuv2s)e~+A|qGy+; zA|#wYLyKUU$=^5e$H-x|MQADA+I4vId(_i()FyE@1v@*k;wx&Z6%-eLNjR9`m-QCG zuSR|E4PRJ|zfqPPee>eIiyMLH+q1uy{(u^+Cb(;ygb}OoKih?1*S1=c&dQJNazP#9 zC2E$45Lj@E&+kxz$e51KOg(*NdTMw+$TV`vLFB6Zmc~GK(RYRWmsg!X_Wq-Hy?dB3 zIm9gghl2KMZFBRGOqrw*w`cQic?;By^nLV0latimn2MLGlF`TKaOoa-XkdYe#uyCH zAn3bjc96eCEK>mv1B&>Qbmx_1q5A(p!BGSmaW-RwN&=Xd8*C05zb3s)eazj+!$On$ zRQrciTYu}yl$7&*%kT(NiwsudWYG%nB)_*>8X6kcZspzJN=?+le^CqNcp!djI%A}o zi2RXzd)O_oCVAfhGC!udeH+c(705-ZTk-h6JF+se;Te}b;}1P&7BB1(FjRS=Q>9e5 zDZVDf)18`HElYha-GA!A)H`JlKKIV)bo~VG_jKD9$FM(-oDdg{N?rY8rBdIpXj5#i zZ#?tzGaI%6**5S?+fd+X(Bt^a*b zbMpHAr`J74KMaJD<8Q|9#U2Lw5+{^U`$p)n96Fp1jixPKMb%tw97@Yqjh|7{kMov05U zV0(PXZS*B#;GMj z&q2pCc$^#!HwIt308a;v!S9}5IiC{06Wj6zwCHOw8vEeNVDgI?w}R{oD;HjmsojF7 zRW%6qs^UET$~Ls!PanqW_{O=H&%JU1R;dn1wZNCH+egx-5p5mlq*=gNOw&UGsW?w5 z(2V1;c4JJsfYiizEEa<)_C#!FT2E=Lq8LG_E~YuWo{-)|GGmz}W9Qp5*Cr<+Eu8$y z?Wq~yNJ<$eC*kjVa+2vxi8fn_Zkds2*Uadz!1;@ItE@|x}- zfKQN|6u2lLbx_vIl*TFcIt9llxJLnYV?lb0g8LM(cyyIw9Tey(AcVzBTslB8Ub#P} z*uPWo3kuF40Or4a3qsRPitA=)Gvya~oYEH6=%#=$p7a|E4pBhZN9v&^=b@OFf>BDI zqJZ`-=^zD1Dd?x5oC;`@(6qs$6eE>z=>`RZlq6M2DM&Haob03+ZP5~`H?o?TM0!`+ 
zi9VgyOLQVC-J^g+p%Td$q%n#yeNR-l=KPz!zDDVOhhpdR!Iu$UX2EvWx=*pc`zwGZ(YdNn@ z@HeUT9Blx-uX;G@6>2+DV{**_wUN_4p{gdKU+z%-oD2w!fn<2)oEqe0lVEo(jlKM& z8scPF2)3xr9BmP*8`M^gwh8qr0 znp9cpa>{gXSg{15IT~)-ve=+G8n&y4xXQ2qBssA%ua0o?xX5En##5uo=kG;i?}2pR znCu;UFeiJ@t0(wVsV7CubFwTsof?r_j>v%{vghd9oa`A@wQs48iI_O`lvaCO6fCyv zX)STdAR2lS3t0g_+vP&JBkj2O&=6B~M#C8~xd0bQ_+8UK;@@nUP!31zT$Xj|4DG7P zUpLCI?VE1vR$t4l3iVDh4zqDfd2FmI9gA2$~6OPdb#GP zTB#M0bMdA4a(iMLw*RX3Vr*v?+v3x**_(B0wa)0843<6pK@QfzzJ{|#DmWwd;Q@f9 z0l!o&2MtC_ngowO5l@aLo?nRoq#Js&X3D-L;tFr2rcP<=+OF^!E{a=@s-+vKcrx)E z`Yt=LUr;u!K3Yj$)yPScV0Wu#j*@HGrOS!X(&Njvq+510tK1L-Fc{F4m+P{XTf2%#^S-q0-%Yj3xfZVm|2hq20WGSjP*bL{zt(u0tQ)i%j#sNz+$uGz? zJ+kwFYUE54z38dLm~0Ek=3v&$nQg*xaZFU|eKfMl+d4z-7P?t{)IeEi&lyyW95wBq zR@KZU7JM$Xf}>W!>rc8@fKNDS!<*AKpKSJL?VRzfL1=DQ&cq0JwRUNU3m9y>dQwCO zTs7(1UfC5@(BHYVJ0jbAWXl1puc}_vKsE4{>O6^w<$2Z2NsG`rE~*tAx00~6lA|_! zGu6&f2(H{qV~MK7)kNKLOty!xup$3W>2=3>zylui^Pkf($hzXINVlZV4h1NVD1wKMl~=nETM4ft8&M%TtEE4E!Uq? z&0IpWyLSa~0TxgWfGas|%g>FSlMbF6C&yhhH&q;U)7(^Zw5Igj)N&zE`JTC{=lq6! zbHhVay)-wxJR7;*tWG!w_mQe`LT3m9k3^z0O+Y2QWj(5*>|Kj@^$W6dL^W{ra_mDX z(`uvKHX_%JJP6BmEi9Xx zvrf*y`57>@)}zWU!()4UL|N$3J`gf6GzVY#|FWtFS@;n0*2HcH4N+TIf2Z?VK*zWiHv-XU;gc zr_&?(PE#b;>1A_bEFBz_y+g8NNHuba*6H0PW-cjqnv_F;&FPYz=8~PA=8Tvw z_uYA>c}3f{9R6e*=$ED=vg1g4_>%0n#7mkE>79XnMT8t|0EQfB(9*R52KGTKtz0^U z0C4aG)5;~^c>7_3)-IW;6o7)(t}Q#G7Y%LMa?#LALHBmK+t8Wl%L@3}?y?$M5@X4# zm9uEv+m#iNR=pNOcVdDl!>oX)+JvT2BZ#o}5TmMu-#w49+W zOD%{(XB|o^s+Nw)pcRc}&L-7JwH^D1>7;y&%AfcPxu!Gqlw5OY4fN`9wTcSdc%VoX zQAc<~%GOfst761Zv(?a;)UR}?2E_4BZ7a?iDfQ=Ky`gW*U8@@Ki?TZOf?|Lmf3guy z{!}w1ckC&Y#HXlH?Gyvzv8U~z)AKZJ+I~Pb_hy|`I4pqesH@+u@E97HzQQlhhSsu? zk{};)yOo)ez=}00)h9BjGFwTpqr#zMkIH%t$3-wZfR(;ged>_hG$4BhRL_$l*yO2rZfjx#G}7SWc~_Nu2P&2$5bT z(HRCa$+&!p+znj0Hudym{2H0RN>(akCYBWOBoZxWD!3NRAFkhgLSwy)oH$ASGPOG@ zM93W&^gwA;61f-3ICqwk2KJ0517Pl+d484)ut=bq>*j=%Q_>TYx94ucIJtq^@#M

Co-zuG}QRKE8QCA{QIT9(%NdW{I@4iRb95DB6km@l%9imv60+Mh`EeJ9- z`<~VML=tMLjcRD~qaCPfEItbwm`!K#00|0m)1cLgKs620K01NRJ&j8Qo`7v-Pq8K` zqQMr4ZEILck!j_g@C@Q}Ub{-)I5IPJ=b99cK;{A%nAfJAWhhpzt%k-+~shX{P}Vo z`n8C5=|ld28yYAih|0kVTFEIjUdm&M%``!|KqrqI4AoZg+^wJCIeAWX=@#HP^n zu}&}Qwgm*gvj`$Oc4Sj%*%VNxXHz)x>oUitzWdi^_tN7J&Az+kzcyGGClY<@2LD~% R$K~f5#N*;4fqxh?{y&ji2W|iW literal 0 HcmV?d00001 diff --git a/entrypoints/__pycache__/logger.cpython-312.pyc b/entrypoints/__pycache__/logger.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f7f5b460fee6c1392d3f4799d584b45aaca1baa0 GIT binary patch literal 2680 zcmaJ@O>7%Q6rNr0uGil6Ptw%+qm3JrwuUB#R-*hAL{QZt5hB#~fXrdF@lH~Q{V}tf zseEa*J5R5^UlKbwNN~w8gpzbIMLRQeEOc$(9|Z zu5ewp(~eqKxvtn5CtJ@tT3r)}NT@{hO(D)+&xzzLvC`LxrEZFeFTXBUGryuy6+-nq z{a}r{4Y~-1)^wYJVS6hpl%cucotr<0aX$bv_m1b;O?TxTlbMcR6~p2i)O5~MlQk~J zzA%5@bXvRr(}*j^vXB_tUxRXsP*N8x0-F?Q$`adY0-LqOWyzAR!-g%H%9aAZIFhCc zO^w64#8zo~wV34FGbPas&Co1nq#Dl~=g3lhPo2D_L7iNN;GZ?j>*-K0NApiq%I`cw zfflSH{O~FB*3fmiUIZZbg}DTw1rA%NIp$@A(6Fhy7%GegYs_sV{UA7FA5!q!9tLrX zbc7BuK#RvZGzD70G%x{T$*1-*H+)=HF8}pfEpS@3RN1>hj7JE2|WO1hK2mhXtyP3|b0$sra9v@NyRUQ7@z&-i7F24>UtUoOXZ zyj-4P#}a)PE%5p~-Waj6f6;7F49vBUpQ8=hyhN>X0vvcDAxAV`x))WxrYdu`gi74; zP`2pO+Da(%tfAC&m%XaW@~|U@^ke%_jG)+$qJ&};1rpF#L6noh@BoKdq;}J7tp$F2 zEZOb?Xa3x71oQbJ^^tS5Dknxbt4GGT$qfe^&D!>mTZoDCUkt3dxQ3_2It$RKN0EuX1YR?eF!eZ}q9W zPd_TCne3wxk|}(weW>;I*ZX>j|L54_jGW0bBy*Vd)&hLsuox@5_=mY=I1K(pAg1Yi zZm2hX!wnEW6FIh+bMb>G8zLs!B?Z`~&C``0O zAYNpe^ckWh_Dy_pc26H#6aNqe|_bY~;LMc7BRy??K-0q57I0j;(d z&Qg~$GvGAyTb}Dv;2AP4h6{KF)cq%PG5?-`4_#F$=AUFYp*~*O>?sh{%pT4iMDG-e zGKxbercog0LM?u7Alb6#X9$!P}{!kIJGA-r&1)1ihDW(~vftTh{T-7r2_Gwmdjf=im= z3oT5+?%51#a^S&>HB>H8*Y_Ah<}qwDLoP79sn`J&NUBg;f_F1J0_qopSLawUyvDh} z-2QJMt`m4_`AFCQ{K9ALh=6*>+QunT%5wI>$Y>P`w$>h{+|I=#M z+M0MK%AhMt#tz)--0XmB^qELQmrjPJx(Bw3bG_&1w~F(3Pj409jB*&slhNt!>8;Yy zsDNgXWb@lYs4WU)rrJL+-7ii4UO1T4hZ*V*?~l@`LG4QS#HX{LKMm+d8FWGI$=erh zwj&KK9WIzChgzN#MWDmZ_9IN;{6oU 
zT0IqW<28oN#qR{a1|mucg78=+!puXW|3WJFN%cON{gKT7lo`44?!C;xtID6sX+g;T JK~VC9e*+G}rk?-+ literal 0 HcmV?d00001 diff --git a/entrypoints/__pycache__/renderer.cpython-312.pyc b/entrypoints/__pycache__/renderer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5bcc90371a19b419f21beaab2ff9b6a911cba943 GIT binary patch literal 15816 zcmd6OS#TRyx?VTX*Z~40!F_9Lp$LkER%@jfYO^KVl&&SuLN6eQZc30rfYS|1A_mHp zJf4`!of}KJH6tl^QlrV8s>mg!;)hg<^N_@vmzh*v08=-_bmY36+)7pMLsHOC6MOuW z??0y-04Zq3Q^`YG;=x(}bNZZr|4;wtK)@^DNlAZ^{N({b_ zri2OpD^7?g+fud}`-FW)nvg`Qx2GI4&Iu>WODWfkdjdn*IM0NK{dy<7_;sXwGyVxb zYjdUoGYu0BEbmGMXF?N=Ebm6XX`-3sJ;;YAT3FtjYMp7DXq#!DXlG?#s$-^eqLbzQ zsjivsiEdG_393^G+;6ytmD6JrUKW(#2Z9p1XX7>Z_>VBr!`d3r)}-3+*?ILuFRN`v zZCLHC)b`Dbk(Pg;F5?lKVIMnm*l37P=~*qF$j+$Qa~UNf8XhH{jVDrZU8llWI-SYJ zvnsVXjwRI;Wqhw^6g71?lb%kVMbUqBW=d7`Q(9(bHjA?NjCv_sDf!Q2-c!?7tIcRR zt7c=ftZyuV5t+1_&gw?vY+TdTm=?ciHq!L|vGhC}p3JEl%I#zNd^+)ZGNEOr;@QNx zO4}JUI;XWvPE9Awonb!}9QYB6Hw9IobDyxi30#=4E4B$qbttx+OQ8L>kD`wUSTW(M zsg=xH$Anw;RQftAeZ8u$QtL8n-4p(rz84b@hY@4xx0-@gOkCEII^1^vmy*9aQ%iFGq^J3nn*xnly1WlYr z4f24b`SarWT8QRQj1kUDYvEAovORCV+LpKF?e87I=%y^KfIW&mZ~JSTBIWH!9sj~% zk%#o=O78RXLtk|V9oJ}yxwiT}pU-WS z6LF#!c}kVfYAWW?^SPm@d@7~Jbye2Yl$yxOaaqR_Qq-598=bn2B}F5W z7RH1|fX4Gfz!7WBROiKO%&A8;Eu%#wLjtU6tsM<)1nZ*HzR7sLdTCpBW_H_ba&{ye z*Fb(Fm*)2F-}l~Lbz3SqwQY7Dl%L*qA(fih22#}KXERAquW^bhtO!J_nkgyf2_(!Y7Rf3(;?T5$K4eEkJqq}1HD;_JWjXP@~7O3h)) zj(z6aT-v;2dGm|K%`X=G8&+C6ZpLoJ7Q7{2O95jBMi+CR4IF>e)Lm*GDl|t5OWFG*EDFUd0N8NYHS{VriUiEM|CPw8e8N z%KKxn@65$hW{WEpQ!4ZPenX zqz}o*f+kbGmy#wcp+5R$B;OZ)BaN=~^j|q~-F{uU;a!sY%Qn%mzw8u-q6Mk**5Nyw zZl73^M#?suV`tedY}s0nT5n0WvbTLp(ojiyb?xAiv}a{-q#(85+H@y;dn*Q(q*uyP zvtwWp`-GogZ7iHR^Ph&EP@b=h7PW?l`cXm8>M2O04{Nw6@lf!~CdIBwN`;e7OAErZ zs5m|dOgL4S;#9#mRd)vfFh1*1y)}8WWYs7xa8v5LuD>rTbWS}()!rEZt!MjW!{j7q(bvk;{$4=>V>3kQX1|DEjBZ4BM4j~ zC?VBt=IEV*ChW!!923n8!XYx$?Q2gn^`b(Q_k`}d_PCzvDw+w z{Mwyl3pclumW7FQ&Q{FHU!jdu<(k)GIDkD#&cx5EI`)4~NoIy+HJgY=eAg 
z$wYiArJ6uqkEhYs)E;UD4GBT6H^>*bV8oV~gHCCg3rU5rP=yfFN!j2!$&{MCsA3vA zqEvhyZ%be*TdVO|$C*GfJSG~cTEY(6upY|O-|7oUekk4)vQ~?5&7K$j5~`a;f})TY z^FpkmA3*L4u?nnt;bUmk!I z46>|o(t~0&OTNv>Uh8!X@gI=v> z5O%J11m!UW8wHj`qs)n*pO3JEfikCnemI+W_UHATbHp+&0Y>E}v#{8Eq~?^~V3xtUgJLgrlyTiPhkrokl_rL= zeKJqA7B%09Cv}SvTf1IWeGkaTr%ido1ro3lhEX!jcp0Iux!hE8CJCOcUP{2ifyGpL z-H%11c|4On4zZ@rK>bw}u2b7N%h(Bu)OI1!iQixWj7=B;E^$rL&ULO~7;D#@@lm_VjJII z(C;>E$`o{qJ-bIBv|!nRanqa%u!C3!)C@zrMoFai^t+I*8tWbF;kT+2S6I%pBCNzyel**G;?3$Fp7BxaZAWUWG*WyLw0(pvi zLSsm#6R9~xt>T~gT3ADjzculkHK#K*p9QlWBwQvo3;w97ndu18cs34EN3T&?9D$=J zo6!>IM)ddv>_iss7mUX2uuJFgGNcQd=U6@ZwMznUoZ$br{N&9vw3N4=<|@M`Q#o4W zk$UHyr$xdV*0UC+Yfk!W@E(SC7TQ*2{UpOItKLNCrw*w;x+*GBrDmN_)uq5N7&wZv zOin)qY%QZB&`&cv?KyA~OB-UWjra`bZ#Y+1&Et^n+pG}Tb);gq zdft~T2lr($ODM-49G;aG&W9@5O2(Q`I|$Ds`8?J=HKPez*F80@q_eMkYFfz#S@%ra zF9^ErgaD7Ec)n|0BZ`*QbD7%7ZdvzO?TUl>J_pc?tdez);^EJY31~*;pzA=QQ~DKM)6CF4c(06duM0B8`TFNiRc6fjkUbGGn+7kqkV3(s0kt zbKS-8OzqjtIi=xL)CZO(Zq#Bttg2&v=^~v!^uUe5#R)7s)20zjJN5;e;|dg zaI4a2$2-Rr#vZNx;p{XwwuYOsUYvGKb{id~-Ad^%M4ttM&_VxSNMM#eYU#W=abx09 zsOhHnhWE$&mb*rZT_bnZMde}F;ib@#QcwS*o((^F`}W&+H!t;Uf8vr_8y1eim<{*d zn*N#kuTnovJ#5~!==_iVpZgz$T5bkz1n8&XMuS-@yQJe{!(&0(4|l2rcd8(T$*0=d zO87VIl^^!=pwcJEzrsblE6v((meGj}8RSNS{N`K)cQ z5E?AB4VHqN3&9=eSLokcknw+Cp=1Ab=cDfE;x`_4A6a<46znJjH}G^7|`7xj_hpmxFRKC@%+}`z-hzd8|W!wraPLW7>wD zbca3t_m7qZ{5;(A%KxU{C&Plj_4>@u&OG!D{d#PCsrlrCo<&D7Jbd@fzYXvDw5Qm7 zvfw}Yo6z9ho~6*|*cPD%&@zajmFWx=#$;OYrUF_Pk7+zF9-SzY2&-;p9 zuasTX&n-0els%O53f_i7gIx6V|JH}{s=WehlyYw2wD?W2w0qxgJ$+vb{;zfyyH1u} z)Y&bx_uR_fIePoM#kQflXNqk*$_$8?CihVfKcBp?@wsBdb4Aad-v&@?E`i!#5rt#o z39;0_`LSoSWB*E9cLiSAfeO5D|IpoyAI~j{_rF^hI#84kE-FR&P}xO|ZlMJp{~IUD zG`8kNzwn{VD2Hu$0vld=0%mU23ASA8=4bBjZpEhBnWJAMH#xccRY`T4?rv5$?W{Qd z{~x&hCjyFye}B5OpRaGdv!B;K)7d`}QUY+%v(SXnpn4VFpWODW4xxx3!nKWX={Ko< zwgUvKwx@sEtu&%fGXn8$rMUv5+9#}rO}H)SVa?#5_A9Lz*McBJPt7XoL8y*vtH5%r z1fd;4g;s=4a3V?vptaGd0b*}mh*db9YgXb_x&Wu01t&TXoai%}4#QWUQb}BM(_YoM 
zTUR5;su7M;K%U2vDMlZgMeu8lCQs~yY4V4VS?A`JxSJQw-HQ4LS1+8uRadWqU~;Xi zmoYO~Fxvo)^DBkc5nQ(p2x`)tsvcjBx^dQH%4WINY0 z_g43v`G@WurA_Bxfgx9)CET+FoEMKeC->!tnp9{yGrk$gzDjl?~^*KFsB zDXx32)m67GMeLtQ*Ca(`!F1Ror-cREg0LV><7UGL?y5>&wLp$kl}mdyWPY9`z`7@I z|EE}{V*f7A?nkzp_J8bf0RCR#2NLr}OSxZGJZ3y302irgoN&vL+!Ex@rC1NgA2W7i zg>-7{>^I@zgHHmOWVsv3`~@aiK$0_nO|A?$_?e#%Dqd7Rt#W&bN6EaBpKshG7G&dxEj5W{X+jy@w&h4 z69SFP{=TBWuN3Nr(pK8?eAy)g#xu3#j;U{XM;^EKkalvZ)V{PLV@^!@?Vj??l(U(h{woG>@I@+zgdNCh`%(S0hn7?xM%gmx*u1=RnAmzm%(@A5k4Fm4E--U{kQoKD{p0yS>=Ez0|v{)Z9^a2z@(N1$&?O z3&GyuT?jxW^)!AIycWC_c<3Hj3AQZ+%K@}-8hd|i=b;Ai!EW)8UwZIdJMw1KIF?j6 zlUNfvCK<|fiap4{GibYF%fq*pw^dz6Rhjo5%ovTgL(hO?L+peki`a4-c`&}_(iPlf zJWNsv)e)<?+)$n4dq&?NxQ4zP zP+=^(1^)(i9jd$F@5a@rpPjzj@i*^&=G#?jrka&td!ciCp#%SS6oNYo?wwricn_o3 zC|?zpw7tTT>TS{6c$v%6Wd{-Q)q@~^dbxE^9!#j-O+2V;wt7B4)Yefe?wKqM>_ez! zA2>XT&0LT1NPl4FI-bG$>v{IfS+afE^S%i7^fLbZ#G<&W>DFipqi3zWj^VE93|D&@ ziZ-JUr+>Q21%J;P>2CH0uV7ciG(ua8N6?v7RRhDxIiC?SH37~eKm~>vdzwWgF*w&> z569NbkK4t9{%_EiJLCPrmeIx6mIjU&-5VF2g^+xwf5q2T@NI!n+1b0?aj@8NuoRL@ zq1NS4e=*cw3JuicQMnafm{p&%v*8OE%5Yye!4yJ*zp3EsDRuNLckC&4?7@hdK^#D& z7>ZOX*8zYGW|7hIW`O*%+=TQirggtR=04OaKJbc%nxqH5QRIK&5)Xx?U$}jg4~s|* ziNf>jejeZ9R=jM26glgm{P*y;`mG8j0MC(M|J*YxKnr!nbUhlly65ZnttYN=sm*Q) zRxJd7ZpB|-U0r2e5W`oR5mAV z&ex?ba01bS#RU3ZP|PMK)Lpra%soW+Y1xGd9(7>a;5jXm&dlkldE8bGC>gjjFT!3- zAl7*d?q9k(c?MjTehqJxC$lq+xQuX&u8ps=^0@P_frqgHi^VJrJ9-)p2Ht3~2$9i| zZVFq&!`4NV8*y_{{pXmM;}7!&(lR(bPt$^2W~5vvmSzfRQ%ZjqHFc%5HM2>`oM8Y< z_Y*PxTa4mTx=HBjTkhOd?A*1~`5c6zo2lMCMSst-f4Jx$F8KE`iA;pJ9NJzCZC?)U zE{1l068ltr7B5`Hw|P87;u!QT(5 z!9>mK7>1g}+W4hn8m$q2)r10%6_U>Kk-M?_M)HSr+wWHXzLof6Qp`lg5LAd1rZaIY3=f7fc;`a}Zb%_679BY&QLnm@;EOI({`dh^(Gtd6K z0O7K$T+ge^xY?%!ru70?mFLPcqQyXuS^;AAzv8b3GvT!|1;}MFy|;ecHcdSv^Rr@{?OHrKriWx7CWO0!IH1DK%WO}psxYK?Kj77j4yafzTihMTzdgu1R$th zHXK;ozqH}#^QgUW)hFv#;__8k4;bs)Rcj00+ zD!&3@j*H8~@_c4avu+1y|~bTsv6 zzGzcz_&(ty%?%3;>TWBr?PrO*k-tfQf=^J!D4^H0e{pEB8y}z?6GtA6fNS1+w=g`4 z^7DtpG7RWL_!32;fFI0gxS=|>E*^i_erzGYKV~Tn?Ou5O7KHwmlCP0;>49OC2aEoV 
zrC@i#@-5Sa!iab*Vjo9DuXBAwk1$XpgT|1QWFo6E!env1L27xMlFgKmYm04P-54H8 zBqLZ^Wd}-FA)<&b%DIL9O?Q-^ zddnUv;l;XFeUy8l&*5EZXe>MMi|%GH5TQwL%@8Vig`WO9yKlcrK}j#xYW74h(hJ=X z;DVGJ7lo0Xj{_|L+T3=_eq(>xfqV^MDpl7)C9eg_M0P#-TQF!bNJ9;WpAzw2Um8#)QDq*j~^sQ8xM$vwEe-S{AvuMJ+h^&!L zqrHN}u*ato8u48g=r(+nuU%DLBUX;P9rLsBF7hz;3F>%^s&Vr~iKnyp^wdQ~AG_Jk zEe;kzH$v<0x@%;`@whnMdeTTHXk=$GvNIZM`y)(&&veM#(wGCipEcsc+_X;d4tx?4 zo69CsdXzCF64so^TUkHe#Em1C=VZ@Fd@I3ZL>Qr*ObLB_X1GWB=WZ`({~SeVc)tZMiQgopV(vup~ zEH|@}sX;qs+vb>NnsuJ78+nk}sv|47;rA=0A{qfW^&xf2cFCbW&Q}IB8DTtWY2L|- zZr?`DtyX%+<( zo2^*1#2q(0<7ZfMMMGyD;EXHtg)&8MW}DPiI3!msLfJU16Um*vjE7WhFNGikY;Le| z3n9O1=vj$%p-iY~mpE8-qfKo{Rb0hzj1tywkuuB`!(cKQw>}h1_~nLmi)F?VNW;*& z(k$f+lbsb9=FY*;rvE;g=TWAJfm=i|_>=keXuw#!OsgY%4hL*R}kjaIV>bJ~tt+*XLKc zK(3v5qugt}Y>}Il$)W&#Azs|4q+uPoz?oL$PMXirG=?aRrSGI?m~DBuS!14-Jc^U^ zBwWcVma{=ua-jCSc8EHZ%~PhD+F5Eccvy(L{iE^m3H9j7Q|i?8v^sre`qYsL_0-hV zxH>U*QXN0^-jVTXSJHHBkL@T>?=jCdO98^c#DIS+jnocWce7q|b*{I8up54k1{)Mz zm~0vN@|`Om{G90eOs=-Yua90GT@E~K>xbG(bUuxzAH>t&9DX`D{$OzYVSK!r=v#{2 zj4ku013MoK?0k~g^(@+bee&w$^4{B{w?>~tchr(3*(le&U>mhcEbJWrfcyvs{r3`*j=0G~*50=bu zI)jr6ak*R$uR3*#%qZhy)PgPJV@Ek<4j?Z9>*dRyTIgu|{|q#ZZIJXPU{>@}>%n(% zx_CM9rb|5|!!Q}}yXqNvH9EMh7Mm}#qt4jO(m2aObjIN4V7*)+wFF6ShW>pKkfOmF zkpe+4yp!1;H))q?Ni%X@j&nG4TIL~6I(PIx93ESX+2)vPPAJt26T^07TTw~(1|?1R zy#efQemyaV>KHzuu8hO8Ug>EBb1ic;*N`qVo7+W@z{i_mk9%{{l}-4!!iG?d1OUT2 zc4gft*2Ba`kkhNSG*ZF`EnBac^j)?eu9yL57Pgus2*S&-EQl`%Y=0>tY(XL%{FMy< zPP$%-k|5Tiy+Wd@Y^zCdJ_`)21>mghj|tt&`klSE&eaHj-%a1!bocZcg0+ky1n==0 Nf#XMc9S_fue*peXj5z=R literal 0 HcmV?d00001 diff --git a/entrypoints/__pycache__/score_utils.cpython-312.pyc b/entrypoints/__pycache__/score_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8dbced7985487b6b01e1aaf1ff615932ba93a496 GIT binary patch literal 8667 zcmbtZTWlNGnVunscT%FL8+G-_k}T5~%94}#5+_+(vK^(do5XU`Wz%ID&d8$8i+pC7 zSXy$S1X)0Cf~`;#h?=HYJroOdod++_-TE<)1@>haW-DgkU{$R9&^HEh5MZD7`_J$u 
zS6M8u1M>@mAYZML+Oal{-Pwx^vLSIm`h$K1Rwr9ByM%*$a%+L!Ui{Fy*3kZFsxW!huy znP4oK>4yVBj6o>&iu-Ra&;I2PuxC*7CnkM(odn;ysv#s)J( zv7yXxY*-Z3HZ`!~5`@?R)uzgd?;JK^6D|vi|9wFTsBWe0)AoCkwX@iW5>$uOVZv8B z)Prh&A3Bs!1L~|p3vQ+BeJM7obfaaAu<$-WZ$7qXzt#mw>BX3Fj2Tl8)bQ$1SoLD= zL8VhYgx)bVtn?Aaf}LnCc$I#PoKOZDBL`JKa64*PPGt!FhZUkStd1!M@b6T|uu2*2 z5#Bzcb}5}o--1&)h_)lVjbq;Z*A6^x-5(ZzcPmL$CPpG@i-sTsUFAP z4xu%=+!vYnGgd4_?56#EcG>jIFE6R*)2W1JO38$#M?^C?lhaaJ^;#;EN+(!KUyj&J z_iO6ye2S@xH9(n7B?;AcC8wzAnOt@ubtB?3`z|Cj^-4air_i2A&*^Gro+XmERAxqI z78CkRF0+(Y^;9m4A-bB?Ur8{1E|a*SUd+6uDjKSZj5+$hqgqG@zY}R*y+d?CsC)mV z_~R|Jn{s?9!8A3V#QNeHRnxGVc+y%Pb#VbQN8p=A5$$JyAQSu2*uobSis;PEpnL@8|WUypAjI;|e(K)C$h1^A)Dzym0nP zv$1!d#!I*;v-=v&-Le(F=_Fll{jt}8-NOJr5>z23DniT##URG)s-!wpCou^9RhMET z(Ob4hB(sZ?ZtjFrPTUg(!7JlAFKbI`GL=YU)ryQWN@P@BWwH)dk{5DJrc=RgUQ^{C&*U{dm)U#0SFC&C>-e#(fDw~= zdig9THWiima%DZC-O_5bvMN@lW10z;ORE#|LP||5@?lKCCKH;L()2`DpVXE$TwMf+ z34KDt0xTYvQ(2kV0gJBltH!6Aj>>b{WIC_V<_P0BKZu1?(wYFf$>lK*HoHK3q3JX^ z@m4Oc(_&IgUdpqj9C|QA?T|R(lTmqoF{R1)CFBgQ6~~ywXa$Hk*@Z+><%^Va+4Qok z-XR)vbbQ&gS75drJf~^8YG-~m;x^s!cow%5kDK0j95<3r6YP)2Z|4)~T8}FpS8~aC zoI$S%rj*7Zu`q${@pvMe&FKk}TrD2|jj)Sm;lF1l^~}=bQfleAo?tgr{rH{ysWYc< zol+;$skbJVmi5J4cJl3XIx`7TXUj`DY(|@;eXH@jo=R)crDfI+O1u~Wpbeq;d*N%F z%i*qugtne_#TYnI>X|A9rpwZFMH*qFb^J?^HiwQ?rA?1c^2ue$rIIFzPnk7f+H^8i z&$Dc@E({nikPW5!?iP6ZBSA-LJ&Jg*1{du&grcN3Mis<@?H=qZJ+s1({#6j(M6Kv3 zI&DJ1UU1y3*(eLcDsMVlYZhE^{$DZcvlU%fiCg#7X%%3>72#fOWp~7)=Uscjb3-h; zu?p@T)G=!_x>o>T-hKM0r|2z6Hv@aeP)%>Iw}Ec34maPw6cjtC?%VS#d%;!kVqB<+ zUxzSepH^ClB7N$(=dkeV_aF)%{!S7K(lsICyvpbTO#f0siSsLurxa6+&qo}lmxLFp zm0DboH|ST*3@&kv-N4n%j%+>?PbQL!;1NziGu;cRY)V5z#LMWuO#4b|iA4#L6sU01 z1=ddJI%AX6=1eRtsab_}p~8++aRNoe#|{xZK?P}B)5-aq)!v%v0AqqJ`D@zrE+taT zvL2=j+aZm@j`DHVvNRtmq-<*s8>7w;EEC!0xYjVv>^XGM%Cv11tHNWi&^El0C_rS)*f4uUGE8Bz7(qQzy<9C6732dLZP&#p; z+A-3fKAxgHLVZq9A}-2=TmIBWQ4*IqV!uU2|T*RyMvcRG5uJ0?mU6I&>z?qA$~ z>ZKBX9WSjpcie414crYp^vjju(e2@AX*g;OO>W)3-@QF`zBF~-IB~%kn5l#hRz@Z& z2Pd`J={arjxD%XK1~r 
zlQn6?wk>Jq68$kMtrA@j>YP-xHRZg}BIgC0BHo(Cy#))lg8kMhK)$BjE4KRh_n-k3 z`z_q=o_?vN-@Z@3quIYqJp5-;Fe}IAvS3j;t!8iNiyADYrps@svYLXOVDi-2sbeUv z$}px^&suE1GI5^pVftYOU=HBs$ZA^6K({uA-$=c!W>=2=1tJ_77@vKy;oRA=3e)>~ zBAr(+GL~Zzi9L@svKLU84s;UV(ra2HB}W`-v15PyZ8!nA6)+^%>M7b(9H<(*1k5Y= zYcUjHOScdRt~)>Y-s;7N{&ox4arxJ7zqh=8eM9}(-#ly!8-16{Z7&+`7b`CRy7=A- zZ@N%!n=#xotxYwos$Fo0EY9%5q^W6g+u(>^P0}js55b%<{tdt$#=vDaRBWrFhq7t3 zJDQ9D1JG=DDx%()MX`bko8D0OaJTL6OGOt2g!DlXS1so3Rv^iyeCFhkiAU(zC=15&t&4X5*UkyjIhguLi*qI zqFb+pJf0c_NKXTtW+W4)yJncC8ql;x)|o`hdMp78K6wD_IQb@7vzx1K5uPF4k~{B~xSzrXei!Tya%xqWQ) z#V_0YYS50abNvS!E87R2E**HfeBhMfJ7q|xEZ^%SCT6do*rR{lXy9VDg1g)!W-WYH z=Q0*(av^cqrwz}lC=|q-4Gsj&+EW(?==8iv3fmo*yXM3w8#vzJ{KgWI&&I z8dM~`;qO4GYZ9e6NcU^eW?$0-(mU#;Ag&~FPds(Xby`opE!waf-SvS=Lk+wq{{QDu zTolAXRk*L35}5|5b&29J_6FK7q2_&uC6?j};&#mp@u&i}z!o`J43%iOcFLVIf zZ&7f&;|{HlePaK@HTbZ7>i*lG&3@57w`^Y@_R+(^WlD?oFF0Ty8uF(|1!2FK;+Mj&T70I6!*XAKBbos? z(iD7>rA})rqnA~9FvzIDn2`4Xm&Qt?$O~}W5e5TsMZOMmz>_Tqae0%RV97^IKJR7vh8T@{?2C1lG^+*M5XzRrtm)I9~Xpzi;DM**~#*@u6#Y zBmaeKywWqU-4iYKM7Qod2$y>X_Rbs9yyXx%t<~*O(Zo2MdB`JcVICW29$O3Z*jt#Vas4b*uOd^|F$|Gw=IR$vMJX^#4N7;&iqE@cxYEDQoJ^SCHJFx8@DY-{B5B-NbLSE`2>ry)T#!@SMvGH5) zAZel{8}1lL5-;%^y7QzFwHF|42wG@Sc-{mSr5=hdt^$zWFCu*p72s~UMNiX}D|qU6 z#Cn?o42q6|vz|yOdNE!sco)PJ%;&Gge-Mkl?6doeIFkui@z8k1{)w%Q598f1tkz1Y zEcg_O%u{_0MQ^s9MpDRc&8m15N5PvyfMw0wG>G-FpEk^8(O>Z0Y?x!BZ#{~uP9G_e zz`Kq@px{?rl+)B31{*u}*PRd%zB#a5z;Y87(iQ#%8%(IzQdpdYA~tS7kzsEg7i?Gk}C2rl=(* z5ap^#s6a+rHSJJaoc$tgO>dKBiB!`KZD=dgRr69U&L?(eDLzHb@8nz1%mAWK$|hK? 
zi~&U>Png)g`P5v|rsAxj)AdnMrGfPz4UgM|(UTQ-8~=5|mhO5Sqt23aVD-hd#SPbY zsJw?gN4CyA2!D3E-1G99@5}JW=Fw94qA@j#@@}Bw>E8B?e(o9lV|b)0*!sMcj=s$w zd~GLaYL>s(0#&EbD{uE4D)k)NEZ+ZNY2rLo=UpE)AJ`0+dLo=|Dg4sr-x}eUjA!Nm zt_2=-$@h~1u*4$a7D-eZpOgG1z1tuKi zqWIxq-h;X(ei+?43UgCt;cn(#D_Dqq&mU9bSUoWH>(h*aXrl4dy0)J#P z>YbE8=7UoUGP1`=Q0KFXrK4Bo=P6@reKCNvz(yeVaR*-&bjsW3C>4^BHA?9rXe5I8 zK9bWwBAw2?1C+OxpXi6rs#%(!9NCE7axgtA@_l3-)Vg0uH)DGEy5kswG%;iGjCR8m zQ>LpfynFphvzNXD!0D_RjE0P}cR_3if9*C3NPt)9=vu#84vwzQRVC4L61v~vN9Lnf z-fa6r&*WY2nz(ifLF#(qC+9Ziw#1LGz_KGi4N-tv33P3UKY4Q_wKei_rV{SEJBOC_ z+aLU}1*aMidQVh?g1_64x;U#vCQT=nP3c4897c@B<7?9mFDSVf&1S8vW`~6phaN-h zOh%XI`56UnrjL72R{DgIgJ%x#mX^GIl)iUZX?sSYD)Uj{r-D>IkD#qsnfv_~MN7|E zDs22ay)-S__{yPKURsZZ+xUD01WM>KQskzech+*eJP&J*@FoP_S&du{e08aPV5nJK zTrve-tcQwTD(J%o8=<0~3YiMZq*~$3B|!MNkjK|bOJDQQg0wVO!bl7wj$)($O*_`k z8G-9}OZ>;E!+j3AAKzJ?XFo-igi`B*uvYD&D1Pe{#KWHp6MquM|5KRylQ8;ft0Wle<3ZJ24=ht#|`QaJ=Llr+u8= z2}WoKXF4-f17GI`Bq!|L{g~^W?_jFR%w{p@e2_uO6yhHtRs7}^a`+q@VOySHZVcm8IsDgb;i`j}pvYyJm7PNDMv literal 0 HcmV?d00001 diff --git a/entrypoints/__pycache__/ssl.cpython-312.pyc b/entrypoints/__pycache__/ssl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e45a1d4ed88efc79d0d8dd81f04e7daa7fca913 GIT binary patch literal 4121 zcmb^!TWlN0aqr2;ogS7cKVj+gvO_td?D!?AbR0jDIJV1=*nu6Q7JI%sN|cX}?wu@? 
zr3xc8S}PH1D?lT+Kb4ENj^P${fdVy(e%MBV2I!BJ$Pjx~iWF(l_K%KIxJiFHyT_Mp z21$!9$=TW2-I?8)*_q+*8yfrwn#sSBxC+>ZWKeE=30ctrnMFF%852cG&zek6gknV|O{ zWDp55j`yHyn(7(T0Nj84__2fOlx>{1gRCPS992^zh9jyIs;!N}yp%{K?3kGz88NWt za&w_n8myRr%pwCt868Dgor!WjL_WH#a|W;ThM==K4??<7!NdyY(b>3P_g)mDqV6+1 zhSv~ASfH`%fvZktdEI}JkNWfmU4q`9WrK}QGdZ+@?U#)>Hmp$t=T7cdG*h)K#U52{ zC7Fgpr?I8LIz@wpiFiU&Z9|DCOv6&tl==|uQ(@LTy4*yr1zs5~=;Vq}K4 zJPV_eZpv>mr-BB2%8tmURn0o+Vg&aTMBSXk>0YR z1CUj@arsnm%iaIuZR+qEVe(>~b5~A~MOhRSBaY}gkK-erM!fJx0HoIMbiW+N0@I|2fl4 z_8Tc1XEW(U%C`DJNGOwaTE{awM1mN7)#M3NX}N*NBT6V1qmeHb3Y%$7HLYhub$52a zp{#uX{&fK@%Pp6WP92?JH@GMd6;`jE-TUU=xf9oie>glZ@0ee`qaZh5K0I}JN$y^d zyUE10;aewub@~^l@5={QfW@P3Las~QYVC6dPQQ1(5opn5_l}%}hv?FQQAl^o)a?D*zlaRBjW}`|M1U+~i@WmvkJT7np zp|t_9Ad{l6lI#Wfi>k<5_SD?^&(sc4zx5#iP(!g11==t1)84{51-gd9`cCK?3v1h; zlZztfJH-5Sa}kmLH^<%n&WV50anX;upSr{^dX<9Ibb0UVd#|4O!SMHom$!tLwhS$7 z87edfmYO>jnmY=u>t|2ToSr*$J#sBFAJ{R!>&Obnu4-A~_@^2#9a@yTi;bLb@JCe6 z;4i6|!P`{IAPDK)OoX%%A>r4<%RNso^$aZZ3{=?9?Ytho7M=^;>U*#GIbggQc;1&g ziyV^IQM)-@d_1MqMpL48D^4uYG8a(bR5bY=VN#S$&b9ugAyoCf=IyWip@@$^3rc4G+Z|BkUzCCDq zs0&pKf^LMBNwG#=o#?Lf)fC2{SK<7V;y1);!|f-S33Q%&6-_X^QI`D~H^BsjT<|2R z0+r;rX(uwKTRthI{DB&Zc=#wzr_$q=nGNN-9{0U!5k~!eNa?X6xlJ!XS}YY~#c#EG|7YN1&I%o5@m-o4T! 
zbmr*1vhU8C-)+3R@twqC;M5-iYp;rPzWagR2LkHdUKC*GCm;J!VADs4YiL<+UNbAt z$c4Zw^CwR|B#-O?0`Tu52a}&ZkWkyxpIC?CG_SQEXlAbQ2O8NQGy4_sCmdim7@*zU zxxbUW+fLBC3Io);oxTH-aChsP1EO$`5rBSA6o7J1BDhw{0KgJ~hnDKwUjqWwEgB=# z#~SXaPp?<*+$`;d|2Ry(woZbs7!o~dTl5Y z*P zDIy@gW3Q0z;|(66*F6?FQLq7bK<6}RY14#oLh6W+dPZ}vLy8T0PsdLUOO5JYl7OEiE1@WP{kLmpg0ieA81K;X^umAu6 literal 0 HcmV?d00001 diff --git a/entrypoints/__pycache__/tool.cpython-312.pyc b/entrypoints/__pycache__/tool.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76904f565115dfa5b0231e03da351e6baaee1ed0 GIT binary patch literal 6851 zcmd^DU2I&(b)NgXcmGK(MRF;vbSdg@X>w^%w&g?=l_f>VvSL~hSvT5;E>^sk)KdGG zxtEr>B@IIs3N~?3s}Gd`BY+ANX$`5U(WgGNfGEEN>C0|Jf$IwzQP2iy`b3K?B0CT5 zIdkvbT~dUR=Cy;}Gjr#E&dfRAH}h}7ppQVvi9gDA?IPsg@xu+0li2tdNZcVB(YPE* zVa(^aJfGt8LQ3H9TgZudDJA9Ql+3=xoF}iOl)N|P&HGZmyg%h<`BE;B52k_~(IqNF zJ2@B1hf`sW@I(*lE%!Nl#8j*1N$t{^Y;a(yp84hAvd76)pK{V z+#uwJnsVFcxp??D*l;|~TS9VV)Cy&;7$(h3nR(s3TGSvrIC=TvSbFr0vC#|X$In5n z!hBq` zM#jt*3#`@+vn;)wcsX%m%AUS0JrLn~HvS6)XqN-6l03Ia^86wP+TuW~LS9_t^U@+{ z*$X;a;55#A^Qx}Sl+6C3VW?U76$~?z%jue$DQIfeG*r|k=n@Mvrkc|;hN&`rCRj)3 zr`1By>{DqbYv|BOJ)bWXsX0bzk=ivVXn?>jw_VIkU4wlbFPdkIa|Ml6u{=)yP$6`_ z@NOoT)!wm3v^@0$OfU@-EcZBFs7IWfBRDM>zcKH`c=@o?Pkrie+_Hrkb-G9!4Zp3H z{Z1$9@F}&t3wxsa59YE|*ZR~ebEaAjZP7>~u2{RWY<0Qpl?0sLjFvGo6t|p)L0GLN zwn7*y;jD_bU|}m3`kNGEMcz6;o<4VRGX3_%MEaevOB3hc9*=t(T{X1}m#)pEo$k`G z63gF|VR5>FM;W&i!q-CsX1+8~%9i?JVl%qge`D^%w_d+?LLY!B50vJ?pb7);=5qM~ zupT;JDrO6&F<=&pxkPE+>SXHAn0ngLOxigPbPtTt27luxAl@gx^${t_{aXCix+qJR zxX-(~@5LW?^(}=zZ;#zAK5kDe`PTzPiQN9gF6ROyPld1ExQK{|45$KsXp-p8SJwo%@6v{{+oKH1ymw-3O%x)>T#71(D^0 z-Ahy5n9G^vFpiaIWb|WYFNpsn&)+9cmB3eq2wyjOksF7pDd}_}lh@N}%a=~)i`rZc z(}8sQ2XmR6T|%)wl~Ei-aR9^y{0SSJ#xRAV5B^3Eh+Cv4Mb>#v3RaGug8h?%4|{ME zf*S&n+rGpCdem*TMZGu@hK<)jG}NncfG+|nSQD9wrCVZ3(q;C=YjTsqJ@Xzky*Dy6 zUo6az(BgGNr>K%C2eDZHxQzmia}{#Yqc4y&D?njl2@A9CeB2C(wYtw-cxS8H?h3;=B_# z;Yo0O((;deKbcNmJfFTWcKK|%uj$M6&V~*MaRoOH{V+IMEo)@10Ls|dz*(&m;8=5d 
z)7}D>`^Ii$M!=)$1+6^f`s(feUMqb?MB9-(3srS`F;42KHCf@s+^awZN`*(d&(@1rDwR z4u26i`$gc~mn~$^p}!yd=aVa4=KwbL_J3Mj*?V?LTv4KHN@!K-tSX(A=BEzgzc1E!*?|Yx@XC9&z50W5Od}Y@|3&$s7FUg);RE<@{0xNcs_VQEEZ1~_0+`?e?^=h!ze#R_*O;x_CNy?}CZPTo zg&DA(1v0C+`HgtCZrhE#1;MrBL2?`{?oYvY{k3pM_%lf&i_(HPTj!EZZR*^RBOmTL zK|XAMjVy>0HqeOUu9=0L?RALpZ~g~D+yl4|j3Cj(2of%=C1D85mj$y0S_tRcQt~z# zH|V#q$OG0l1v!?SDV4xZKe3Oat!36&$pK+Z=ZzVw1&?mJIA@mTScYK;52HAOVi*OUZMHF%V!K;&o{h+OgN^%0S}5v?i9tvh zuYtHlp7Fb-SJxAYq%!6J@xz`C0?~RKkwZu+`tMF4mFPz*5ke}_Z+D5kO~Jn&&_FS4 z;G|xp57VQd)ODRd721?GOxtFR;#0}i;NXd!r|PqadkfNPi(NQ zd;)MDtkMomI1g+l2lsNkK?06*ZV6yGl5`drGS@u;LuOtB@6g0md~V&HEM1ctzQ;jk z{ym-{GsI2EERRpRX76h(@of3b?~9~UMNJ1FG<8}6UaQk)+=g?4aA3oodl_@!&c0=& zwIU!lP1q5GEs6(_!5PJ)K~I2y+YD1bs}L91xwh+}>oicwE=;kbSnhgZ*XjV3!WMr7 zq7G17VCT&*cagn^KOR~VpD3NrH(MF1$UC9RJGGY9O-TS$Q5dQOq1M;Ta{j#KF!_l$ z*)IHJEZHh7w@Q%Qa%ym&KY;&64UE&qkeyB5;22x1pP~l3G_kz;Ie{LX@PKpCp}{mY z{y`lxX5f_q`Q`)9eFv`=IlvztW`Q@5ZDAJgt=j^eTXZZCn(#+-@JCqSXYo3_Ei^Ez z34eG^!A+cnm#yu=DXBBmM!Dv_2KNXXRSU;~#kRX;4vY0|v6W7@bQZ6u+k)o*{aCEo z=mjr5+rm~}jaRpLa2%+kNZ~oE++KpW1@q{UH}qVt*r#4E(wx?PB#z~HF|?FSA(NXo zvH+3xNw-8athfY_>AY_-uq~KynY@}Q6m+0>RG-S0vH-|dz$r|d^T2WuCd#2LL;22@ z-x}R+sp$08=}Z>iam->9n{`O?H<@<1d*|+&DRJ1gemtHOPdvqok>yQJOq`#XOoHP< z7e?PfaTdin6n}!^G6>5%c4JB};R{{E5zv!R5|@|*usjaSuta1C^a8eI{s50O_=7re zbssEtatm(REVMD!KrpsY3rGHP;fD*zN%;tH()QR|N9=E3`Mcr28D8laT#Fv8^p31V zM=PCYYMlpaT?cBho=v~p5n4L89wKe+OJ{1a?$ua-HP*ixd$k&S_0yS^*f6k&VE0E; zwU)L!{@eb$qfc5?eAnoHUWpAqjda}QZlAbw`u6F2$EuM-tC8Mnq_>hd{qxA_O5k)g za{BpNxbqqDyy%Gl99w&zf-mxj;ER0B7lonT*PAWG8+J+Z&=cjUp~YohvenM0;b~;%pc+~J$wMtk5mroKkD|5whE7WW1|7#Gmiv0p9Lhy z`K&bx>Hmj(nSKXU36F835YZ%xGbqMTY&+6#K;F&-cJGTY)!?rP5Pv9tjaQ@b)#$Nm z^w_6ISE47sk-wgF_$&P`47dfR(7_}0Z2aM@l&Jz~Cw&{l3tZG?@D1w0=t0_Muz!I~ znyBdhYIy%@_@!$2rHb;BLq)wiQqgG~vCc>7Mf_y5IDjdD4(AE$5~fg~$1%{kAQcTs zN1v%3((wA=H}KIun~(O@`KV2fS#R+U`U>=A37IQX6g9|R(kvO=GrR#%^iOPdRtR1t z3Yl#Bs{LwZyGiCS(N{7DY5Cc`%YH|ocoVeG9OAUS_SZ1gjsnrmewIU4%%-x9d|S$p 
z{nj^3^H2qM*ub@eBjUIXnQ*>eke*+VL%$}c9+OkQB0awpT7D%C{nFd^(f1#F_q{Ja uZI4w*O>l>NVOkTzQY literal 0 HcmV?d00001 diff --git a/entrypoints/__pycache__/tool_server.cpython-312.pyc b/entrypoints/__pycache__/tool_server.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..375f93dcd08f3dc49af2763c25404d861f119df8 GIT binary patch literal 10188 zcma)CYj7Lab-s&z0$2bf0g~byBq2%=MN%>qk)ri5A9^~X6-!AhPBJirT}q$J0QowTW_b|y2K=>VcKfVc8Qo_Z$jpAIC-wXEhx z&$){Q2?%y_$-R5;x#vE1@44rE=PZ8hayba382486ttLYL6DwBY3x$nGP`E)9qA)QM zXC|1qZNkQ2-4MC)A49(PPQ;?4;tt@ANC?wWAL-4pJ( zXTn2iLaZuYJyFdN8&Rv(ns*s<)(NjFY68q5#(eSGiCW;5)VfKVV!y^s)GH2ko8nZr zO>&A1^#(gJ|CCzA1@%T7IZhP!HKKS_=3TZhViToS0kv9fDpH#%wFao(E!38IU$E}K zv18C?u)`yxhC6&gPio;vGOi{sO)Efd8ZJ- z7lW)(b0Vz8rxWud+Vq^RYRF_%7ieeZVOa@E;xG>~hBKOoCPT65i?AT5dcGRflh~xc ztm?C|q~U-uE~{E-R*OXnubsXs3Gz4K+V~rwzDO9{dtM(iaul9|A!o?|xqb<*Z^3m8 zu03$I!sY2CdkGzpgQun|9Uwf>nK^hr=i%)!B*DJUo+oqgW}aCE_MvlNABN16lF{h; z%14_g=4d}*=a?WruHi%7Vhq=4EUG4wXH{JX5dwlk*VRyj5@GjQ*yW&Lu<^)@=7k=H z8%n)qb1!gewMcZJ$V9`j=z?lEVjwS}BnnJW)T*GFb~G79I1Fkyt{b9mPH1>HyM{Cn z-06g_8s5z@ibP$6q5NigI@*_v&-BeiXL^!h?V_6Od3|=^z~E~GYF{jRp>Ku?b>HP! 
zEZzstrp?bxM-xfC4;K|OrMGux-l(Os6r?$vP|QVZewe2QF8v+I{);@hLO$Z!KjMN9 z1!DIuFJz^T-$)&+7eAD0m*=ulTSjVI9e5Dv&Ia~p0{hnkgW1->wA7S)mg4D12L{rf zfzLqe_@QT7YWs{O_D&e_uGH})C$&9!&^q`e2UlJMmM3{^eezqW&79_`#SI~HWRU%PW<-?UWuf#_-$*ro+s2I8rQ2*hmK&K7r*ykX z#BzrWEVp}^5tnqkkHd1m3}f6r!i;!?H6lQHO=N(+W@mtZ%}H|?rFa-@trkW)?Q8YS zNTYjgn~3E$5&65E*t(Y)*~hItN5>fyMjhhXVP=$(*N%u#z9WdhV+eZWk{S;if~F>C zwZt<@1vT*ooC>N?Qsgqx*c7OfLtsg0O@f-O0hq2J-3D~H1J`z_u}B{yZ!;0wML4eC z_7Jkjq!@)my28IU0OHA8_3zuRS94_11{^T5!W{%290Nua=!pGL6R_vhR>cQ(OqM!< z+1R(a2ph%ustNfPoU}!DkxQ}3f=z)@i>mJjw(DMwM4){US1*`zo`t@sUKW`-vqanH zNKhO%_+&I0Qw>p3C&Q?Z`EX+XrAdQJ&d;a@mza&k49AQ%J)>&LsH%gV;G;IUhZq}ZSY*R^hv?HI~^mc5m0Q$E?lo_;A;;zYVOmuYPT5`1*HO_P+He8M!6B>)0b~gJDt`sW~qY zx#r60&mYyp_@8f7L5q&|;T_-be(rXC-SG25Wg)x^OgBIzrr=(l#>lsuQMk5jGBX8w zx;$AnFU5okc7=dN;J$FwqUZw9g3(b6(v)JR(BhUQw73>X(QGLOM7jmU3q_iQio7YX zOPc}~WXFx_Xd)7uRn$;45dz_f&m_+Ud5{TI<8=EG7R<~h&zeV71f3gBCN;wuP9&zG z2(|-7t-4fkqj++34183-pGuR_Ix`Ix0%nPT4KS5%NZEuj+92@arO!Zih2$kt-I1-@ zaj$B}5_i9;Yc-kOF_76Yu--Jd#QjFv23oRuCkR(s}hW@2pwg zm-TjLyq)RogC7L$IgdO7xpL)v-4Br|&DGxHn({IM3E`SQf8+uh75#to4+ppi%w9|= z9+)<+LUx0wWCG0cgiU1?M&%Tn$^)ikuW^doP@Sl4CM89>#uvwy=eeN6aGZm^IZL4!9AmU>2|#{Y2k=LVf&~>! zRD+@{-XQZ#(6(^GVcJo@|7EIj{pYk<)gPVoCoidf9Lpco{rc?8%(RwNmEHbh;h3)a zr=eAwi|T4`vHSQfa?56Lx*D6*o`>=Et@|up3Y*)zK!K^b)?SPFBC?}2!wx%OQ#PPT z#{b~b=ZkX}mR?-)77_AKPHXg8E=Dh_3G<2Y?QAamWa+bl3~zm2(-8T`rxRPcjBAHr zh1y}rN*>b(XMsxbRv}(!+uWYsN(?XKFonmL;qkR$ARxA;I4l%Z4Dpg0R#Z)olq-?) 
z-P}`rC8GeYlBz%KFAP(B54fkB4{tunElxXv4}|-sjbU~aG970_=`hc#Goj7_WTJ~& zsN3>PTj&$mt?(#rVwdTo++xOqE<*}YsSJe-M<^7ZR%T-=l;u$9JG0>!?no%4;e-YY zz|i0>z%sQLpst~Rsf}U_ht^3$gQTIp(@tPUcLLv-X`hC$gxM*$^dCTWjpTT5-p28+ z)lM)Yyz6G*5#BafBKRmBjy;xmXo}^tG$lT;2UgE!?7>Go^q>Phv_Xl!7P`s^EU=iU zCk?fYOOS197KOP+CRoq`pc7EDOz;ZT52!^ZghE+>vZzX+Nt{RlUzbztC7MK}x<^nP zTl9xxUJS~H>-6ZE&HXH~ceo=|$j0fCV0ch=F|a;Suu_{=3%!nHQPl*9=3>aE^#eOp z07(kDYMUdb_W`zsdL9cWz>h#JgC{SV(39a6yP%ZR{&>!%r3H9a0F|ORP>t*-YW&J+#;VW`r0Yp;svT@5&_Ik$;E{g3sW#> zc>+#!2+LB6pF)4DECGxb3ZNapz@m^uPm`vpI;*sKPYD(06Q;Ip@>nI{ZM4dpcZf!y z%tHdqG(<6z@E&i`7R8hZxUVSfCMV`8vC_;G##d~CQSf|~rOS-gm;wi&Qrj2LaQ%Oh zfH}=Hh1k3-f!k7f4XpKC3`hq$JzK_UFB=4^ikR%QsFmNs)OM?V^KJnaOJ=M%3dSo2 zaNg$%7*v=eG%6vD8_ezn@lrB5Gt}3&!1Y0Z0peF28VbX?#lC}s6V`y31oH^z{}q^1 zJCE6OkS!cQwdUWh`@=D?hswNPeH}s)dawWF#l*B0O(;2x2J!ZyJ7#neWfScxI5Fh`)*awCslPf+~0TSd@tpE?O9(} z#@Che)x%uXHTfRmYk9n%*gbDM-*o2W{rBanWnZ?sFH_x@t=^ZZ-nU-8e_bBBzhI8`3jZ|BLvhV^=;&Ve+L4hxQ2DpLNTSHC#C)K7 zLM66=+YgQ^IDM5|)yv@iX(xdt$WZZp2GdduR&EC^;EwQ0_d<4sp#MIWZ63-r52Zar zX=#XJoRTHsa_EvkiQ|$gE@%ol%2KLm;jG2l%(KPuy zuHKvV^kh6eSO%M1qL3%fR@U=%0L($ zg4rqYJ<(FXTLS7+f4r{Xig!>_TgQ4GPzBGJZ zA+QaCV0{8G&BXOH5G)^qs~4`Ok}!A0p!xM~6KDc=M+x^I4vI@AG}RD{Xg8Mol(H{7}AUC zJcX+o%45(`$sOB_ozn>nkEVWSR)z3M!8z0*Dnv8{0XjMomS!)wZ`4^@Xed1uTRgWP zVZVQcjOy}FC1UUTNa}guu6aB9X7ug&oAH&C>+Y^4HrEuqIh1bPzvRg|YtqivoZOU^ zJ2G;|>cBm@`$JF7{hG#=Z>*kNeLmg$Lb_>qy=EjWk9;iGt@PjYWaYu%$b%mo|4?pN z8OX|=8M$+H?m51M?x*Z&S4A(#p~2g!D=-Of@ojG z?CY4F#_S?yFJOi`O%pJC39}QJm3h8jLmFlZT>2%*u8~JJiQk>~b?}b&*$o1>{0WBP zpI`0VAaKhI41efm1j~oOI}sYFZ(GebQr{Nu=-cAG%(s1nM095x9@?N}y0bWS1g>I2 zDa5of1KG3gEI73gC{xjy70i&>gjlFcickoYiQv}SRR?uzkynDtt~ym7!fa-M%;YR} zYvp+X+*<#1nplP)`ZhO4Gb)5^pgSn1N-);E0ZHl^F`5JG4+= zI()C?9R3WM5zK*E>Dsx({e{Cch1lOp7$SKCu8onR?MN2eEq%H*fLx^6Qno4H8fNou zwHI}8@v|d1jVL!|N%K%dP{ArG40Tv+U(gG_hE%d;iVb@utt}^&td+uU+Na8+e3^Bz z?3iV{3N}Hb?gBS%h!@On+J<0$^IHg(?$4JYc)R9=)obXw8q#I>cuAGAs3yV}AQWN3 z^H-7M0%j=W8Xhw6H&8MRxiB>iF&IvBelwcQLOW<;6JA4P9X_Yy=S6Fd1%K%xd!OO> 
z2N*z4K}PlR{rcc~efN@lzi!8RU2w^nb2nt&9T|7Wx;v1S0%<9bbJk~_ZFik*xxk*K zvA0jXd1`q+Be&JEwkhD(&e?OI;>nD_IZ+e7Fr8tB`#O5@s}J9&ob4ytWe-taD6@TT4H*x5VnRd zb?DTZ0ns9p;rK>4HmjQG;V`ss4P{ZgvCvs2WR<}1ZS09E9A2r@w=#yWG~?{bdF!!i zzw2zj@9oNZdo$kNb?=_EyeH?XTiJQn)j`QU8E?1!tG-@)6p{Jl=1-U9a1`rnUZ0?ByUv2^4+P31`%KcIdLMp(a4C2sv zGrRzy#~4txIAj3W=xar9Ryz(^Y24ccPK{-+a0_z!HFiOlhMv&8MC97y>#GQcasaBP z-ke%#$hbT2Nu8h5`L6EmFW>=JMLd8izyt*ti8Vn1YTG{s8ED9d*-zop@w_22VEEu8 z!2yt98^iCw&^X*~1`rJF$O{rbgt2hAS+Vd9tmJ(bO5phbMG1IElz{hVPy!GOC;>{Q z9N;p>gPgV>?i#u)8fp+)3bq<-_(DX(!)ksCa1?(21i$1k!~S@#3;{k*Lo7r?BT7F6 z7;gB)od`!mg-|aIrI>-2TKwfl@0MSF^kNKTQ{8LmRT)moWyV@G3|5+3@G7?1Aym>! zKZu*lrNGROJQRs_V+nVkhVv=@qOZCPof*^6M>h!E@{M&&%?g(%a9j0l z;4S~0hiS>x@65Asf56vm@Nj<|sAuG5?_&bD$NP>mOylyY#{}<>&)GVe!1BbW1a5S1 F{vQ=Q^9ld} literal 0 HcmV?d00001 diff --git a/entrypoints/__pycache__/utils.cpython-312.pyc b/entrypoints/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e16bfef622952897f7dc0fa8e7b135529269951d GIT binary patch literal 12679 zcmb_?dvF{1ecvvyc!LB;0t89%u_PsuIFTSFk$RnU67@Ri^t7Zq+h{yE2)iUf;X&^% zMUe&@-D6^^QG2>3v2=+OE3Id$n{&o$nrSX+{*dc`uG5(gAkqfB)=kvuWhVX)I;u_V z=}i0i{uT>>67PDM>0+?^dwhTI@B91vuMG`u4o^z>-Q?eoa@_x*jQOw{WY)(m9CwG4 zIEhbjQRB-;d6u_CE$nNJTJg1{gtRSc;|a!^vZozU2TKbnXWA8Yv9vAaPJ5yrmbRz7 z>4s=Sx-r_w@{W`*?T`A?P0=Qnccz-tEzuU1cBKO8U^K|m?o?|!6b-SoC)Jj2kG7{f zq8;fi(JidZo9aw=MY~wKA=RDkiT0$$sF?1J_Oi0ZRA0J3+RxIy)YkO2=r)%2rv}o4 z(Lt7ON)4sM(J)Ilr(Q~Lk8bBV3ny=rhd<(tEsBmvEzyV+kUdgR;->_u^<5!4Duv$V zqGMD#Ws%xYvO{V|$xf*Q?_DlVibz}Jaj6r3e#yacQkOg-@0MLtmUeVoDGQplQ%i^kx}+&`Je|x;n?(R=JQ7b_omR5*8R>Fdy{h{*=2dj)Ih&Q_)X{8aDmjg;`*>zL znUN=zX{z)ZxudD1oYBHoy*ZVQOR>aET#IRPdM*{$WW7z1)okjT9GjVXuBOYZNheiJ ziKjMJqkCUEepQaemFd`ATv25OBLt74^P^exEo;earm|YS>BXW3W5MU=2#zM(cqV&s z604a{$(23xBr{1ZmdZ{~qiySNpod?KreQ}d~mm_ob#V%>8sl}d}>QRlBD($caxGBP8l<_^YFDK|BV zi>jQUiT=Nv)SU7@<Ze9N~>jEEkE^WWa_EHs-Q(|0{l4>HG$pB|WumUwcEsIH2 zROEyV43N(0g0?s(FZt+vA~jfji=2rAZ)2dPYqA2&2epFo5?8?%PUtQ5#c5enk_q)! 
zbS_~#ohj>EcTCG#T+Gk=Fa;ur2t?n80|>JzfjfBgtv z-v#1pRivM-AQCF7&rQyhjVs&~FY)g>O@bf-xP*OLR^s};;9TyF)m@U z=zS_{%vPI`#W^65tYpL~C7WigfLaQ;ye3Cp5MAbYYBD{a(&8C8JFlh|#rQ%zsi|T% zBWep-QG-NNN05_Qch~`{#+YEY8Iee%Zrvss4Wo`GCyQCU*uF{V3VF69w#5rZhYRtH zh81V9sISj}a}v;0Qc*QFGLo63IwPyYb-iK9)g!3v@}!94lf}5I&Zn8E60=ii0Z35b zOjaVgW}{N~SF+NgQC~d@oG?y11=%hwvUOgOF&D~4M0SE`59t!0+Asz-gJdh)jHx(w zdM;_AN@cj2_%&mhmA-L45W&XQY&H!LYZe2|ElSvyWI|*Th4nuVgd!v}@TrQenAd!U z%?k|)x8#&6FJRAQoB`MzkSGZ`3O0wG!Q|PoDA@%%Wn-m4bupPqWJ#$p&yR45D517L zz|0i5qb8=Z*{eo%J^y4=t8l8yk(@Hlk_0R88WY00-h2TE-!wQb2{?nEXwV9|(U~Vz z*7bq{J_$r(i&3@g9McS-OJL44)f(yHoUlen@(O*>5)9={%wCxVt2g?iO(D|2f*3>G zq$6X$6ID)v1T}EpN z!rgS{xFnb3f5`tU{-X+STeg0AD~)Exo}9&OV}6%~y0%UeuJ+AYbHcPmtMbmx1nZBi zrjUjj!PTn~zlCsg&N}NeA!=U} z_7bLp56niUp z$-2{KXj2&Nb zd=~01hF&U!Ui$FB$ASFzlMh=9+fS~BPOW$!clH-MM+=>!Yh%YBPOpxgTkSlbcea1l zz3m674^pLu_WYLNym#bp_7q>fSa|s&WO|SLA2>_9`)0$6urj@F=YpLp$4XxRd#7)n z{+DOVR^-ce&f9i(yx{36we%EQ1_~_$t1Ux$?@-CtQf@-q->5(i?w!d4M<)269O94c z68pym!~(ggC{vzdAUJ#-UI z8D^)UhgOMd5sHR^z+tP>kB*hC^kSMG&(KRHQm?EtD9ONFGK%VzOywvbXkuy?UN^Xs z;Q8+9w@w#@U_l5Lh43RGeE(|&A@Xw}tV{x8^A#RJ&HoRz>D&g+NTvbfjQ6k|CTGm? 
zn;9e$U*ku09Me+en9L~2z?)<8oMlVSa>Julm3ql)^0v46dd@ey33uC!o7EdFsvjH| zeA?u=&-1rSoL1##Im=fbp=#8qTO`LgXb($&2qzWPAtTAu3)QY{W z8*#J#v~CmH>_O?OlMzp%(uA(?z;7x+yZYn0?sMhF&IqV7(rq_vW5snA|2#LRT}m8x%1A>Dz=0EJv1x-%}p z??5VsKmt@5r|Jp3!uDF4ty6cJIkta zMDpGUlpKF(Bka*N-?7r(eJiKmJAdMjmuv34d(u#${-S5CX|m*REBgBj z{{EtWxZodN8#(gu^qT)dx!Zv6s)z4hYdVY-wGY9qa@$t&ci*<%X?(x26zD7l1`C0~ zVqmxs7%p3drsgNXu3~Vg5F9E7M+(7_&uu7!o)zjTwuTF>;bQApp>^zY2bH?HrdH}^ zt!Mnf)iwW-(!h3O%5c+Q%4HkZ&{o^4qIcqvm)tMj{XfID`9gr0kFq|4AQzvPt+!|Kt!dKlOS}hPa=G_>%$Q zr)>hIJCE%~#U}wCnNLFFr##mG;<2JcA^izDT?@ub-<;ujXq>78$!IdeAl_45}`AHNto`}r!HJPe>8UX z_~n!5k1dT}g7b!~(=US}$J8KV84H;~I1}j`7IaHi)g5x?T2cXbu^aw*-6qLb=BGd8 z*p3)Vbpu>jF~C99)aK`uZ(wSFg1@>0uNz!hsN1pH$29C7csKOS~Ud>iVU$ixTWS0)}T0ms<*Ij za>+2E3{xvXr(!;|&+L@ukbqjwzcmayn!gI^tXOi)g`MM=-q>8T!X-B8{ zV_QvE%vs)YOSYUvveW+Oc%&Rg>JntmTQnu<+<%}3zl#?R z*UdS*^Nw)I(|hmYBhO%I+s?eV?+J2eA9;2V({3Ki?|3yocCyUzr}&+vorfOuAUVk& zEL$y2udG`IV(3CcW7*4jd-9%_N@L@BZ_hn%!8=kKJ7FfNwl6RAmK`S4gjt2bP+_2r zH<_86M<|IcRu4(?R2;^5OpPN@1rnh6RXnXGs&t4okW6G4@*k0?_XQK4z=EJUJlA>E z3%!EhjJB7B98P};GIiNjvoSUccGP?;%l2BRL$bWf)CSO-D7~RCmG}q9f*xbdjpmrfUH_JRAIh4zWO(E7M-sMz*O zq3xBAbE|D<@%w*h1w+hoW7 z+3W@jCarD5Ig?h#F!~lw{0{8WWxmGSa$L<%SG7zMHLR^!!ylm=dQ3cn-?0{)vuuW^ z(^Xir&6oM{;OzqyRH}6E1{fO~0N&=x%?EH=%%CE@i0u;PF)mc&L;)yd zhzM>RO?exDpW&}|z#9k;U#LAV1Yj;Wn{RLZzH80V_qa)1>)o^3yKl8=|BC%_r+BZm z*t@gPyYs=$)!yB!oqLL%2Me7Cf8KfM=9!hFw?idg>-PumZM|<@?HFG54L=?lg>{x# z9onDwi}zX{`L{i_ab1Ub7((vA?PK46Y0b6maj5sc@MHIf?zPY^IE#Z~F*sNV4&E1j z9voRY{Ga`-y{F>%IT7?%4-SUiF^K%^bfbUs<;1EII3x1$rg)kYy|NVC3Nj zK7kuJ4I8GX3s>O&)>{##gRVc1IIJ=482ka68WxmyFlFUkdi@SwdYjq$`36k%)*+cX zlbugV8}~x_7wCcmeyOBC-r$}By;@6c1I4!SLfiPGwu||_7grie{bR5Jx4Qr0a>?6K z^zJBlci=>JyYo(wj!Lc%%?k0(W;Ou1BdUWjr* zLR9eJeG5gQ5Vs=P#oZ8cHaa1)0&+I;n;N+UPCo&NW$OaD?_r`YU{1P#Ao~n5WaKF% zXrSAWKCRrMTJMInI5-AR>Y59jKB$^p3C$a%Z+o~@-VV=HISt+y>NgmAiu2X)f%ube4) zLMZz6dI-=A)9!{2+4W|2!W*xvH9&>O1-F=SJ3{vwwji(-_un>Hc%gOsVPL^aod9K zhT#6gR5Fo_<7z-Xyg0Z`9jsh=pqRQM&p}8p!k%3eVZOo(niUa;2Zv3Q@D?R`T8Tr} 
z!gh^7hEg0tZy+7JHen{lDO8})E-0iXDopS?;A5l!K9krWq2H&PKcE+_OK+{*Ykck| zr1Ak(`k%iB`2z6-P2$l3Eobw_3l{n?EGxoky7_Kf}#VBu$9XLhGqM#hZFafI{Hi9eWlip zQWvPrCIX-nROJ`*{revI_C5A9A6?mE37+9WV8QLBfLII+6aoXFR(SD1fXqrB<$`-F zH=%kb5Apw&pFAl1aIynQWeU>{A66pxDD(6p_nUGFFZhF%X+v2d)It~bB^NQB#A==YIY5^rjQc~+d!pUc*u(BC8icJ7$>>2=V$gpC zQ;M;>X9|&t9;mOw0cqJpHl&0`)!ddV(#%| zggjZqjhM#x#bDKu$%;@~=oO%T2VR&%rC*8zkOP9TF_rQB%E}G5nnFf~-pHDp*Ire_ z%I#x?2p(iqq8eF&XlIx4suZFsL>+oN12pQ)ldS9=Q+=JW80oLEm_a@5%pZ8^^&>~m z9*bR=ym$%D!{ZlDojLz1yXc@CMxREo;5?O;KCR-WhAogDA`CD;5* z&i_l!^(oi=DcANX*ZOzdmcQi={w;U-@3{U?x#3T_k$G}S`v&Z6t8EWN6W>wxd->-3p)!ZhgUjpm`MmojzNH++Bu4M; zS?BQi+&RLxeEufi47b%7w{!PXM;qT)YH2N7@qNTz6mDMJYEL zuX`vp+|KX6yLf+pp=V;9L%O`z&F?A&dhZ=81cu61%A9zOeV^FeE918VEBgz!)^!^y z${l{btrXgF_go>gy=s{DsSfLzh|U6Ps%#Qn0nJ+W=A?=)pSM mI`666hi?u s$H!;pWtPOp>lIYq;;_lhPbtkwwJTx;8qEmA#URE=?y*%D=2if!3WY{#kMTOHd<8aqu&wAs+yu}zz|%&r`h zZ~-B90VzdMx-Vhc6mZa@jp3L4fc}62eIY?Vu!{m!k%zW#Hl)6^FFj{wxx9#pLtxL& zoVo1GnRC8#&hRgRfLDTFmi#>Pr?4b_jg{63EopHORD@pQk~TG#3ANNdN|hwTz3c8%efxldONs2 z&h-J;-@)~BZUDH!4sL*RL%zI;YuoNwzw2UEu?|%fVqv3 zqNMWqf}X-{YKrn$+Qd!#_l*TTS6nD&iU;%*yFv8>cUO+R^78FtbRnC$wookTHw*cN zJK1b*f#!8qDi$(%U0VQQ3)wETfX@?nQr<< zo{|wU3`g`A#xk_@3ZtJ^sU+#7L?vYx2uN0C)HL9^I(SY~`&=E`_YUe(dB@elZJLj- zU?rZ(gWcq~h8X@#zPO?*IyP%Yps=F1tINh%I+eajl{BNgx6;*lhiHU0wX31OSPchg zl#u8+teovc(j!(pdxiubZ2<_1b#fy**dT+=*ieHEHRli2$gW4RNBS2a;zP~3`5GCk zUU{_mS@J2F7qZ6DU9z^&wh1lUWv$wA1jv*t9hb9fo1ER$1M29#&ih)eT;edoN82M@ z=E(rETK(K6FVJ@ukJ$>SOGBMKa9w!b);wObXP8DTo)39oAbw-eGLX}Sybgx^dbW_h zT^=@T@oQLx+(uNY3Pu)@;R zuxC_*AOOKoR3O8Z7GE^n8BO6fmWcMXuC`&}gbW;`#P;CSM9rjihG5GOXxR*sStJP{ z2FYds-{yGn8Z5I{cjDL+$!>VGDiGA2-5cIWEqb!gIL7Ocl+>VcUrr|W_F2079kkFS1zO|4vAzx|ZNg_7b@ zyOb1nj=#;9fxJjbDL8)WZm9>TF5yZbgU<+CN^n^?l5)f>w1E-4AAw_Lh9{+IAXmV7 zgRnxDa#=4YJTEvpero$+om}F*M}daRkQ{_Z!}GwS7~@B=r|#X`AbY`0R2FN)2kNnT z5RqlMIHfqSuFtL#f0|Z2hW<5keMatLW$rPnk047ygXp38<$rCOS0cu>*8QVk$8mrC2sdGUoxkLSB7 zz+N{OAW8RZy4CNAS<-xZ3$P%>Xf5_*2oLYs?${W>)IBKUdEc{LkrHINJSlKld^nz4 
z(e(H=8c)U1qA=QFC2qrpvvBPeP3sU9c{5rjkLRAIY#+>Icz>A6u2AksAKMK_I;$DB zI~uMi9%tzDj6pLxq9)jjua&2|+2Vqq41iUTS~&;R~fkqatD0`X6Ks_-~n(c8NPHDG&@Hj ztWnu1?9k(C7LkVptxY=%1a)*o#4r=PS1$rC=nuig>~2PfDq8h)NWQ&c+i z6V8~VR`i15NJ0Ed5Y9~;M_ldiqC0{f4!&H(J#t%u(G1B2c(h$WP;PN<+Yt850)lp6 z=2J2zB!VOQOXNMUh3yh)88+fvr*o$67S^e9(nGyq;y%+Q%ck)Qmndz*f^LN&cS zVYS3tO+15QqTP^rj%B!W!i#8zHDuxrHr7a7mA98qX12`a}Xow5EJI$ zUD9-#gII#*ijaL_)?)@1X|B75jB2UjtEk)-uVNQ&B4*;V83*zy;2mHi=!^JuVk1XF z`!cvX%;(}6EsopBFseSbvE$I6odD8>XF&_9_(sb!mONVGFeTN-G~V$8oM;%xs`MPd zEW%OjrAG8%&3~|sTK%^nGDv`L+igf1Sm$P}L;u*uH{)1lh`!~qw$v?TFBUX#LU$;; z3O%;(EY{wcbz#QBq*d11!Cb1o57shhgs^ zc^63!Xq`hI5*)An4+w(R?1nFR|BYraT8mB9gVUI>g=6>MZ0?=o!P?=fqw?~4zN*)Q zQw=iLoY}X!Tsc#XJ-qOU)CZvZzUJsI9ziayUu=+F&0{B^cDgcM+p)hMJy7=__=?p1 ziw$z38J`A!GFTmWIF28gKeT$W7MXse*CYEdg_+&YgUUeNH}Pno?%UHK`3G_hkHsqb$135Z8(8MzW)#qNxIav8|L*Biu!_;%^AOIPq z(I1dMhL&DHHj{G)2R1tgTBis(p#=g_wZ#)zsU4uX%@crZPkI5mIhb3$MOI{o?5W+G z12Ww9csp{~Gc>EG$~*as-eWMl{41m}&0SNgmnu_L z@(_}msb^lv6L=;|a=7UZ)xu}%?sI=0sJkz3I_z=)OnGw8>gCE|(8S|U$+XY}{Azzq z_%*cjY69Fg^vjlI)C@S^Rz3{bvX!5<@?kIaS-e(0?56>X*Uo}N7Vq2h-2n^Ju5g!g z_73E{<&dbLc2Hy_+(y83VhV)3OmA0(s`%Tjg`yENGg*aa%jSq4rWhCP?Fpkv5x&cI zfHXw7WsF#BY`Ybqvdjm)3|Y(-=$Z?&YL(vQ^B6L}wRvGWjk7UK99R|!*Qgwp;z;sH zI0Eyv@k=e#HGbXFtBt4*{0{V|P|$QBW^f#NW_Nia&mA70b5ruj&SwMC(c_T&zE>MM zSPvX(kXJ;C+)j-T9bLU#i_LzySdT3<$k85Pe-4M zc}nO~Ib?OjjBqusb}1`dw|GtE<%-I$-h_I3__YpiRXt{ODPlhlM3=k1+LNBfs0;~>V^$-%g0%7NYEVK8q*ai|j#I9fi zRlotyCLTG&OcJKfy+19X+H~y2f9-tp7G|nFzb9rLU>oLcMV-HqervxcDOTT-RFV>o zWrO6j8%pM1Fn7WsE^<)oAOp$dC|)~gG<)5cQY=aIT7{x;OSr3d1#2N5dMOb2=#YM9 z^m5`GCz$W)`UpO+VRW%7HAAsqe)P#lpoE`a`Tf8jN4^;O-Po6Uee!T)=VERANNwoo z+L>RRf98GJ+Gx*DseL2TeLlj-?!X#)Ee+Lz>&CBem1Q7kX^9QD@BaDb zu33JYoTz&tZkTOO?&sBgb?*d*2m!d*f=eczabX|>=>8p{AA^t%Q-d3q6F|PV2OupOIX6VLts-H zOML&`zR+-=gcF2+I!>_?bg{vsp(lFgu-R-7aoY}lE&2W~CF)Y*pVGpXlh`7aA8kqS z+6ofeXqA^ow}MXF0`lRt74_RDs~5H;cx_D(;Fqxf!d5T{V}TE^t=+^nUQHl>d@DF^ oi&UWxytYpHY literal 0 HcmV?d00001 diff --git 
a/entrypoints/anthropic/__pycache__/serving_messages.cpython-312.pyc b/entrypoints/anthropic/__pycache__/serving_messages.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4830054cd01a9f571658e7e75f56b63804a7a2e GIT binary patch literal 15848 zcmcJ0Yit`ydSLVY){CMfQWRgJL{jgkCE1c~S(4wDEk9;%xjRd6)NPv@J~Yji9}2ZQ zyF0mPa@t)vxr=3F7S=8n7#mhF+DU+MkO1pEfHQD+IJT%s_jI&(I%ERe{yEUG=5~|Z z0ryoko1{3@%+6&}kgBVzzWVB`ufD4K9?kz`GU*9OQRTmj{ND=%@ptG&N~&DpWsQs= z?h_;tBuE)4kII5F{FDdf_^Ajg;HiizW2&G^hG3LYbxae~;Ib;Jjp>3qTvkW*F+!J3#kXvQ$wXl=|AwBWKXS{JJi*5k51+7Pn_ ztub5B7HbSP#_U0RtSQ(Oa|9hRXV8gZ3{h9i9dzTeG3tpm2b*PtoS@`$r=a>d zm?KKTR?BM|w)^&kO zCc`(Vq-*@r8CQ~`Z$;uaTsOmUGD^|EM4hGsGNEETc{e_Lii%Tom`Tu3qZp6hg@@)c z_2vSVWT0rAir@ZcsGL!-M)vB%lw4K&HbnioBXgL?>=0BD2(`FddEwmi;gi#dW3C zIdbj$>qVL`Q@PfL{jF%Mq&SQTvnUT}5<7~dCxQ0ri}O@`{LB?`6%iV!H1DgtkRYjO zQc#j%;Ju1Me43SuvVn87caW^Stc48rjw~hnLxPm22*?#bQe7be%IQT7?swR=sL>-l zP>}2OkBNYswn8u323Zmzb-DiOayJv3@1Bp$cQRr62E}yVSr{4_eq)I0jz(VZp1;f7 zOvJlyMWeB9poG3VpNPbnWOrDSk?x#ChGID}?V7(U)ZV7U^C2{H=yrs;8N$@i4j8Z< zja(1eTLf>aUA?w?YHfIZa{bVw+O(l7t?YUUO~fq9q=X1Y>E-`|>~~+ca`xeeq?IxQjw--m101#RUep6xirxUG5nwFK#2s0%4py?MXtU*HM%r_W?We^f zz;!)AyYjnh|7k&sdp~(nfI~tp*l3DbpyMGX@dg!7hQdj)6pD~Zp$c|oG7*QfB{UzV zVNr!@Djt44N`=A;OacHw#svdkCBj)6cdgBs4dqD)Xh?%NPITBKvJOv zw9N$*g%6_8EIx?@bxag_6P?>&AvzJIgo+&Xp(x(yf@v1EONhyx4T62&YiL$H!a^_} z92RslEC=mHSqsWqQRYLLAF^2`WGLz2zXJa8e!2{W{~)_hFz^)9JEj!zeb{d)SxTN# zq?9RDN}bZAv?*OmpE9J38?q-x5fbhKB#BCpK}CuP<@f7dBA}iQ80i3j(QPP08-VUW zStrW6Q1-dR3R)^ohS0hR`dIi*j?aP?FU3NV3Md3+l1a=9>L{?4xhd#z%WOCrO+uS0 zxiAllsSUvzM3_)Ws6@aaakWFS@O+>G`cu*C{~{W7Ay~|CG8qab5kFnU=`FemO>zjb zMR#6MOT@WLqOT9Yl9wR+^;^WAi_lfPqhYlTyv@g(ttf8{pgb_e`^NdEPTtYYH#Cc{ zSu3Hl?hz)PKI~IV!-vH@P%2v$Kj%HYrpv8s6yj5 zEdI~1iW>OWmWKq&WUsydMckv?%ku};R8w|M(OQ(@HK{4))yh$TDz{=SGuqw9AMpv#2mx(%&$pT&4bmUO{^SiwkFH^DSY zP;*H2C`nIp?~;uFA@L(QSqZr9C4le>xaELb!JyrY)0`_;D0)C&Ptj|xDE5sI(l}(* zyK=JnUB!|vr7gF1DV+o<1KC)qI)E27cvr~*!d+gXNi*iw$pdyy?SWjxGlkYNa!p$p 
z$%rD#LrAp8!GIzy?<%BsJeOPL`+Pa9OZo%kPf6CL^mrBbNjY%n12{jlvQfrhio}St;T)e}v@u_+M77c$j{`mVvNEMj%2(wp!{xc2 zQcEKNKjbUtlmvs+1DxRGf?ZddKn0M^wUyQ)<=}u?bjg<_fJGO6NrKv*k}7zGT67nd zq>aWWxpQL401yKu&G1Thmak74LixSNpvxGixnZePc!HAK6p=~%fqv-z#tAT^MJq{f zX-Nk_A@v@o_z%j%&6PXhoOqqJq;nsJ*q6ZI%3*Nt!*KpfV5r3y`k7Y{DP=5oQZZ8!j`EbJ zOy$l}s68q5D?Fk0bm2`Zk=}O|#WRwmLq@DzfIV#}D;JvmtC}(bAKVf~G}l1OVlNo^ zwEn<+ z1}Nq(sKDc>Q5A_H14M@(p#?fBC?LZ|G1$fvs1sr(jdq@(LOOIv&|Y7N&tl&Lpo)WY zLC}GrybuF(7%ABD2q|_%T+3NVV8|>)89@o^ISs>u(G*Z%k_2O3IR~o?6S!!)3hgej z%V8Hi2xxwZTt>7UWr(JLR#0IZNzh7GHr)&Lf(01_vw6=)&NiU?QJVo-O2`|uKvRMW z>HPEJ;|;;I&m6ltEhv$VCn&=4yMh*}#Sxk!=`(2L98=KTU<#;8FpyMomX2U|jGzZ& z5m>@dbTZIFA4To@NHP*nGU50vB`A^%EvUou;PuFH2`vMC8D;q;f(1=1fR!jH!2Ljj zm=l5moD9IYNQ@H4RL+5IDyYbX*nAQeF+RV*2nw)w=xT&%24$Eb71+2*BF!F#PtsRW z%Qcj}0vQa1oli7US_JK_a5Mrw3a~WA>QDmDNiuV1vHOTdm#!ye$ZwPt`7Rn~c zq6H2!Nh}tEE-yVo6)uWW14i*3;0LT69Y^C%L-tua%!TbR!HY~~wBterQwR+F+{ihj zXmRXdL+v-{j+mXC1jFYoBC91Do0coHYQosW8Jf2Y%{!IWw5^@9^|7|TKbhFB9OkQ> zTvaPu)yg-ur28hgzOyX+)t>uQA+N51{_f^g6W`RuH4Sbz4X#e{_4fOV_ZHV*$<()R zsy@{Jv7T>ed1QIt_SnWY^gl3VRfOXZj92Xvv2JRgy~g^kY*$^~HG4U8=SSww(#%K+;_YIE?tNOK(OjQeD8X5rt`Uh9_yG^c# zU;ok9i)h#qY4`xqz%_KR4IP`;pHcsD?l0!p&Y6!IF7sB$L)!z}`t2=i*XH1dqaTds z0eAiVd{_VGwGY4Xvu|*NQ|#c>v+K`a&kW9_J1@UbDViLgDpXcm)=1c#7{%h2wHJtK z_P^)>0)^n7v_Hfm` zY;`Z+5a1dH*@nRaywq{8T34xGR%Gk)a?e}aIBPF!?OmQ&IlJTT;@m^5d+4e2xi>w0 zG2_0pJh5$b>>BKx!Ot4}X+sb1@GMV!{~~X4aHcla)W(^*SyT7sp)J!0@9>su8r(7s z^A3NxCe&-kah!8pWE~fG?XA4qmku1}0w>wPNiJ}X4V+86&I8}8YhEf;&I+LMVzz>) zHZQAqM3RR!c+!S8-dwkQmalE$YTLJK+m|PKWAzWuzjJ=4%Fk5|u~kF6uC`sT|GhIm zKEt@NLO=Gn@Clz=4D8q3^-}OD$ZS=Li}2Z3?cu zxnf8FLaE?Im{W^2#kr*6(B2{};0-EG*lS}bttE&jxhl!FmJump4!kI(DV5i>mlQ=K z{p-;SuFSdLl-|lxYQZj1q~y|^z6X~0Zvmx$AiuBpw<;BQo)~l?xv zE5tq+bcN!Shp7fWp$I`qN-3ok0Ueo`Lxx{TDux&$R!;ws;J~Yig^<=o-vu@cb+h0xW~dN$x)(G~`ZZ{zkynXEi!Ibk69`>j0NpvKDu5E)kZTC# zRIPkVKqcx*iwMVWAskvf2FyUv-H_s$Nwm4~GKxw}WsV$axJ7?{4ND!%C6NCe5GDT_ zvj2%a%y2x{a0Ul!aJ+l_;nIVpZA0gdu?C%KY3tC_dT!(rJ8~&Aa(UZ$g*Vr8=26x> 
z`t;R|d1_g^W2ip3X9G7f!;Z{kMy_lduktnrXB%W~gPS*>88Wu>T2QpP1^`blc4%+or>Kq;A&Sy{XHX52uZXcPq_YWd~c?@#t%r%AvG&C|gZf zF3A2VX@-e>_r&0&T=thkvPnYum&4;ac%gp>>i0Rc(ZPj|l$R$Uhl3Fr7&w8siR_G$ zbOV{8dB=63k;%KQpo>##&O*b6QCfu?G8||)m}}**=ABFh+Q5174dhranzIfZkY(f!YRfoN=JKwk0#3#) zmFGNCsqj1ikGB9Jm7e5w!iFg?aVogtR5xU~kjFQ{y#ZHVC@-%mIXEsVropk0Gy24E zs$fbiFkqMFjPQUGTe8^1#|EmP!og3`sz(+&-Gj0Llp)&{TX@*kaL1Y&H#}R5<#elt}dwFK97N_%^&R_8fSeODBHG1@n*4EBPj@B}5IiL`^$i+3>^EJ1M@lj<0FF z-+ixpr^dxMwLT0z2ysn^*`~w1!}l=pAi+6~v5sSVDuuaXPp7TXFRQaPgweto?5x4Q zHp3d6oS~C7bZ*vd8HRXw`|`<;j1JyZ$C(_g$+7k(YjWq*^5HGh5#HT-Fu>@R=@`ZU zYW|*@FnM@O zvSk_BwbbwFR7iW)^yrs0K(Vcvv$nC;wnt!JbaK`~);h>pk8fL#FQ3`*4Dk>RHQVox z-W%mD4$k6ZEk4fD!CE?Y>TLJt?#-=RId?zn?&sWNta~g|cNDNUSEP+jFg{JS>4x@n zeFtmoc;{5MlCX`-(0J};^-9HVrDfTK)$@IK4U$U*Z0?t>5cm=markmVwj?JZgh zun%rjL;1OFR7$5|$;|RMLViB~jmfL%FnXE$NWw(QF`qw9spzi4tWrv{YNI+&l@Z+9 zWx0JIMO9K?i6Srw%G^A2`OrmyFDXSXh*%@hxMV6&Mw$zu#gu8IR)Q(qH!$BaHx3OU zuX6QsW$qlfW##3e55&@1pkFE>EE^Vy4;yvTYC~Y?%dW3k!Xj}KLUsSO{JN6(OQ&)H zvhOs0*&a&@2)!vjw^x~f+gzEz+IRjcrAm$#RuSZ@p6Ql2MK+Mu4VyGa2%+Wn#s3GV z>Ln}^r{G*ueHAkxt$3fHkmbt0dDsi`NH%>HBNcJN@h{+ny_6F``7gidRLDtBxm88l zP)U;Z9-JYQaJ|b{6)Uzc&4Ds_MJkpmQWfRS)9?HQ&Z@zEYw#7HRrvl>l1{8)D))_4 zEVZuk^g%5OPn9G%Im*_T3lYP~K}3iRx1{lY^*bW|jeWB!mSInMIuE4(P{~B$716)Z zEXm)-S1ZNJmGcGKNiQ=ZjZe0at%aL(!zYcuRGF&$lDC&v>YG1B@|wb`a%E+RfPDE7 zahGT&n8VWeUV^^9RK?^2@N@ZawFCj5N%*BbBO?x)1v4(er>Z3Ae}#D|@){5>eGPQI zrK)dL-OW93%We~Q6kj85%dnd+FntwUMkg0l_=DwNA_Y#6=@->spS(Cd^_pNTIFJ(a z0U0)lAOdWn?&Q8Kg|EyR!6?1P6LD&>Lj3kw#Guy;rUDL#sX;vR91J4bC&+@5f}Q^! 
zm1Xq1h=Lk$RH0AfLpVwY-YXbfjv|mGbw@PIaIiIx8NS_S!qEBm5$4}Th!hwMdDoyt zw9m|?NP(IXqYt{RBNP*m6+(exsIa6vMhstlLJTk@4qw!l{t3cz31#TepnrhyRf)k$ zd%ws8|BNdaeg#7$J7D8t*J@Tv$#pHfwFv@7cGvO+-qQ>gCGQQS zjqd;9@coLoCO37ilHlvSf9&3|^`Vcy8i1?Cw2<+;3YmLI}~-!8wnxP84T)dNr#cT%#`uxy!bu1RLAgzd7*K zk#U^PG@RxgJ=+aEpZF#?->aDdKpcCG4#`-eJCFV8=hs`}*Je_K&~KcXaV> zhq$&;wrzAzqYQZVRGJpY+7!e~+q&1z^38s(d6;b;e#WGmhcnIR*Cuy7K2=S zv~N7)ySA>}X+8XOAU%F9)A~xr^$L!m9c9~&a&0HrwiCRobIUcf32`)eyJPJfjMOrM zsBay3*0|kz0iL_Qe$F?t4781Q5p2yftX=lQYSyW8mF8oSxX?oAon*vB;BF;I=I!8I@B)xOOerOHZTn6y!{|H|4K-syjsdXP#xhY(+Sr}(aMzH=Pt>1$m-3zE}uh-;r>+o$+L zhhe2VPRc;`J6hK#Vf^-EyuTyebpi=VD^$0%0_Vq0$kN^s-aGWPm-UXOy+fbude_J9vs4oY@+jfl|>E<4C6Q!s_IXt?Lo9`D(^CmZ=@%Z5s>D{qzHUwFW=^(1${Bc`m?jdP0IR6nb+Z z9L=?8LLmrmhC(#*#?r{0L!;}AM)6-7`Tqsw>)^4)KjNVEs1jWtI5bEjO^rqtwdjz> z!N6toiXwtEa_7?MLJ>^4--ygo5E%@Cj)XMZBy#s17q1;R0!0}z+>^(k<=1Z$A1epp z8;Xuz-sRyvKHk&HclPn^J^#?;QR#Q*=5)F-69V!_KYwIH=e2|kh zq1Os%Si5>}FDhwEzqiOSW~QBUDL5TPrBtOt2_FviPcTN)S=b^zp8H63?1PK9revGHK=q|7Llu3 zcFpef=5=O0z?wUDEl_bW+4+QXA{vVg+FeD(Dv)I zco_vXd@l)cVOohmG}ptRpUp?`G7Ginhk#w;&$qfD%D^N50~Q*gIoFi`jbKr%JP(PL zE3AnjZAz3Rth-@N4{2vY<7e>Tc<=%PzbX?&vHmjCp@=qf9tY#;KS#qN9a7wB zSi!|R00qFrl|#-;K|3n$*`xG-fmhfRMDlx(Wfd}+?1hn#b^aY;_*sh@!Qle;?Gay_TBu{zt@a7O2SOPy8A fWZi4YECG*4{V&mDPy3o$){=Jid`h4KCi4FQ&fpLn literal 0 HcmV?d00001 diff --git a/entrypoints/anthropic/protocol.py b/entrypoints/anthropic/protocol.py new file mode 100644 index 0000000..626ca74 --- /dev/null +++ b/entrypoints/anthropic/protocol.py @@ -0,0 +1,162 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Pydantic models for Anthropic API protocol""" + +import time +from typing import Any, Literal, Optional + +from pydantic import BaseModel, field_validator + + +class AnthropicError(BaseModel): + """Error structure for Anthropic API""" + + type: str + message: str + + +class AnthropicErrorResponse(BaseModel): + """Error response structure for Anthropic API""" + + type: 
Literal["error"] = "error" + error: AnthropicError + + +class AnthropicUsage(BaseModel): + """Token usage information""" + + input_tokens: int + output_tokens: int + cache_creation_input_tokens: int | None = None + cache_read_input_tokens: int | None = None + + +class AnthropicContentBlock(BaseModel): + """Content block in message""" + + type: Literal["text", "image", "tool_use", "tool_result"] + text: str | None = None + # For image content + source: dict[str, Any] | None = None + # For tool use/result + id: str | None = None + name: str | None = None + input: dict[str, Any] | None = None + content: str | list[dict[str, Any]] | None = None + is_error: bool | None = None + + +class AnthropicMessage(BaseModel): + """Message structure""" + + role: Literal["user", "assistant"] + content: str | list[AnthropicContentBlock] + + +class AnthropicTool(BaseModel): + """Tool definition""" + + name: str + description: str | None = None + input_schema: dict[str, Any] + + @field_validator("input_schema") + @classmethod + def validate_input_schema(cls, v): + if not isinstance(v, dict): + raise ValueError("input_schema must be a dictionary") + if "type" not in v: + v["type"] = "object" # Default to object type + return v + + +class AnthropicToolChoice(BaseModel): + """Tool Choice definition""" + + type: Literal["auto", "any", "tool"] + name: str | None = None + + +class AnthropicMessagesRequest(BaseModel): + """Anthropic Messages API request""" + + model: str + messages: list[AnthropicMessage] + max_tokens: int + metadata: dict[str, Any] | None = None + stop_sequences: list[str] | None = None + stream: bool | None = False + system: str | list[AnthropicContentBlock] | None = None + temperature: float | None = None + tool_choice: AnthropicToolChoice | None = None + tools: list[AnthropicTool] | None = None + top_k: int | None = None + top_p: float | None = None + + @field_validator("model") + @classmethod + def validate_model(cls, v): + if not v: + raise ValueError("Model is 
required") + return v + + @field_validator("max_tokens") + @classmethod + def validate_max_tokens(cls, v): + if v <= 0: + raise ValueError("max_tokens must be positive") + return v + + +class AnthropicDelta(BaseModel): + """Delta for streaming responses""" + + type: Literal["text_delta", "input_json_delta"] | None = None + text: str | None = None + partial_json: str | None = None + + # Message delta + stop_reason: ( + Literal["end_turn", "max_tokens", "stop_sequence", "tool_use"] | None + ) = None + stop_sequence: str | None = None + + +class AnthropicStreamEvent(BaseModel): + """Streaming event""" + + type: Literal[ + "message_start", + "message_delta", + "message_stop", + "content_block_start", + "content_block_delta", + "content_block_stop", + "ping", + "error", + ] + message: Optional["AnthropicMessagesResponse"] = None + delta: AnthropicDelta | None = None + content_block: AnthropicContentBlock | None = None + index: int | None = None + error: AnthropicError | None = None + usage: AnthropicUsage | None = None + + +class AnthropicMessagesResponse(BaseModel): + """Anthropic Messages API response""" + + id: str + type: Literal["message"] = "message" + role: Literal["assistant"] = "assistant" + content: list[AnthropicContentBlock] + model: str + stop_reason: ( + Literal["end_turn", "max_tokens", "stop_sequence", "tool_use"] | None + ) = None + stop_sequence: str | None = None + usage: AnthropicUsage | None = None + + def model_post_init(self, __context): + if not self.id: + self.id = f"msg_{int(time.time() * 1000)}" diff --git a/entrypoints/anthropic/serving_messages.py b/entrypoints/anthropic/serving_messages.py new file mode 100644 index 0000000..340dabf --- /dev/null +++ b/entrypoints/anthropic/serving_messages.py @@ -0,0 +1,460 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Adapted from +# https://github.com/vllm/vllm/entrypoints/openai/serving_chat.py + +"""Anthropic Messages API serving 
handler""" + +import json +import logging +import time +from collections.abc import AsyncGenerator +from typing import Any + +from fastapi import Request + +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.anthropic.protocol import ( + AnthropicContentBlock, + AnthropicDelta, + AnthropicError, + AnthropicMessagesRequest, + AnthropicMessagesResponse, + AnthropicStreamEvent, + AnthropicUsage, +) +from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.protocol import ( + ChatCompletionNamedToolChoiceParam, + ChatCompletionRequest, + ChatCompletionResponse, + ChatCompletionStreamResponse, + ChatCompletionToolsParam, + ErrorResponse, + StreamOptions, +) +from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +from vllm.entrypoints.openai.serving_models import OpenAIServingModels + +logger = logging.getLogger(__name__) + + +def wrap_data_with_event(data: str, event: str): + return f"event: {event}\ndata: {data}\n\n" + + +class AnthropicServingMessages(OpenAIServingChat): + """Handler for Anthropic Messages API requests""" + + def __init__( + self, + engine_client: EngineClient, + models: OpenAIServingModels, + response_role: str, + *, + request_logger: RequestLogger | None, + chat_template: str | None, + chat_template_content_format: ChatTemplateContentFormatOption, + return_tokens_as_token_ids: bool = False, + reasoning_parser: str = "", + enable_auto_tools: bool = False, + tool_parser: str | None = None, + enable_prompt_tokens_details: bool = False, + enable_force_include_usage: bool = False, + ): + super().__init__( + engine_client=engine_client, + models=models, + response_role=response_role, + request_logger=request_logger, + chat_template=chat_template, + chat_template_content_format=chat_template_content_format, + return_tokens_as_token_ids=return_tokens_as_token_ids, + reasoning_parser=reasoning_parser, + 
def _convert_anthropic_to_openai_request(
    self, anthropic_request: AnthropicMessagesRequest
) -> ChatCompletionRequest:
    """Convert an Anthropic Messages request into an OpenAI chat request.

    Conversion rules:
      * ``system`` (string or list of text blocks) becomes one leading
        ``system`` message.
      * ``text`` / ``image`` blocks become OpenAI content parts; a message
        holding a single text part is flattened to a plain string.
      * ``tool_use`` blocks become ``tool_calls`` on the message.
      * ``tool_result`` blocks in a ``user`` message become separate
        ``tool`` messages; in any other role they are rendered as text.
      * A message that ends up with neither content nor tool calls is
        dropped.
      * ``tool_choice`` ``auto`` / ``any`` / ``tool`` map to ``"auto"`` /
        ``"required"`` / a named function choice. When tools are supplied
        without a choice, ``"auto"`` is used.

    Args:
        anthropic_request: The parsed Anthropic request.

    Returns:
        The equivalent ``ChatCompletionRequest``. Usage reporting is turned
        on whenever streaming is requested.
    """

    def image_source_to_url(source: dict[str, Any]) -> str:
        """Turn an Anthropic image ``source`` dict into an image_url value."""
        url = source.get("url")
        if source.get("type") == "url" and url:
            return url
        data = source.get("data") or ""
        # Empty payloads and values that already are URLs are passed
        # through untouched so existing callers keep working.
        if (
            not isinstance(data, str)
            or not data
            or data.startswith(("data:", "http://", "https://"))
        ):
            return data
        # Anthropic sends bare base64 plus a media type, whereas an OpenAI
        # image_url must be a real URL, so build a data URL.
        # NOTE(review): "image/jpeg" is only a fallback for a missing
        # media_type -- confirm this default is acceptable.
        media_type = source.get("media_type") or "image/jpeg"
        return f"data:{media_type};base64,{data}"

    openai_messages: list[dict[str, Any]] = []

    # Add system message if provided
    system = anthropic_request.system
    if system:
        if isinstance(system, str):
            system_prompt = system
        else:
            system_prompt = "".join(
                block.text
                for block in system
                if block.type == "text" and block.text
            )
        openai_messages.append({"role": "system", "content": system_prompt})

    for msg in anthropic_request.messages:
        openai_msg: dict[str, Any] = {"role": msg.role}  # type: ignore
        if isinstance(msg.content, str):
            openai_msg["content"] = msg.content
        else:
            # Handle complex content blocks
            content_parts: list[dict[str, Any]] = []
            tool_calls: list[dict[str, Any]] = []

            for block in msg.content:
                if block.type == "text" and block.text:
                    content_parts.append({"type": "text", "text": block.text})
                elif block.type == "image" and block.source:
                    content_parts.append(
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": image_source_to_url(block.source)
                            },
                        }
                    )
                elif block.type == "tool_use":
                    # Convert tool use to function call format
                    tool_call = {
                        "id": block.id or f"call_{int(time.time())}",
                        "type": "function",
                        "function": {
                            "name": block.name or "",
                            "arguments": json.dumps(block.input or {}),
                        },
                    }
                    tool_calls.append(tool_call)
                elif block.type == "tool_result":
                    if msg.role == "user":
                        # Emitted immediately, i.e. ahead of the rest of
                        # this user message.
                        openai_messages.append(
                            {
                                "role": "tool",
                                "tool_call_id": block.id or "",
                                "content": str(block.content)
                                if block.content
                                else "",
                            }
                        )
                    else:
                        # Assistant tool result becomes regular text
                        tool_result_text = (
                            str(block.content) if block.content else ""
                        )
                        content_parts.append(
                            {
                                "type": "text",
                                "text": f"Tool result: {tool_result_text}",
                            }
                        )

            # Add tool calls to the message if any
            if tool_calls:
                openai_msg["tool_calls"] = tool_calls  # type: ignore

            # Add content parts if any
            if content_parts:
                if len(content_parts) == 1 and content_parts[0]["type"] == "text":
                    openai_msg["content"] = content_parts[0]["text"]
                else:
                    openai_msg["content"] = content_parts  # type: ignore
            elif not tool_calls:
                # Nothing left to send for this message (e.g. it only
                # carried tool results that were emitted above).
                continue

        openai_messages.append(openai_msg)

    req = ChatCompletionRequest(
        model=anthropic_request.model,
        messages=openai_messages,
        max_tokens=anthropic_request.max_tokens,
        max_completion_tokens=anthropic_request.max_tokens,
        stop=anthropic_request.stop_sequences,
        temperature=anthropic_request.temperature,
        top_p=anthropic_request.top_p,
        top_k=anthropic_request.top_k,
    )

    if anthropic_request.stream:
        req.stream = anthropic_request.stream
        # The stream converter reads token usage from the final chunk.
        req.stream_options = StreamOptions.validate({"include_usage": True})

    if anthropic_request.tool_choice is None:
        req.tool_choice = None
    elif anthropic_request.tool_choice.type == "auto":
        req.tool_choice = "auto"
    elif anthropic_request.tool_choice.type == "any":
        req.tool_choice = "required"
    elif anthropic_request.tool_choice.type == "tool":
        req.tool_choice = ChatCompletionNamedToolChoiceParam.model_validate(
            {
                "type": "function",
                "function": {"name": anthropic_request.tool_choice.name},
            }
        )

    if anthropic_request.tools is None:
        return req

    tools = [
        ChatCompletionToolsParam.model_validate(
            {
                "type": "function",
                "function": {
                    "name": tool.name,
                    "description": tool.description,
                    "parameters": tool.input_schema,
                },
            }
        )
        for tool in anthropic_request.tools
    ]
    if req.tool_choice is None:
        req.tool_choice = "auto"
    req.tools = tools
    return req
input=json.loads(tool_call.function.arguments), + ) + content += [anthropic_tool_call] + + result.content = content + + return result + + async def message_stream_converter( + self, + generator: AsyncGenerator[str, None], + ) -> AsyncGenerator[str, None]: + try: + first_item = True + finish_reason = None + content_block_index = 0 + content_block_started = False + + async for item in generator: + if item.startswith("data:"): + data_str = item[5:].strip().rstrip("\n") + if data_str == "[DONE]": + stop_message = AnthropicStreamEvent( + type="message_stop", + ) + data = stop_message.model_dump_json( + exclude_unset=True, exclude_none=True + ) + yield wrap_data_with_event(data, "message_stop") + yield "data: [DONE]\n\n" + else: + origin_chunk = ChatCompletionStreamResponse.model_validate_json( + data_str + ) + + if first_item: + chunk = AnthropicStreamEvent( + type="message_start", + message=AnthropicMessagesResponse( + id=origin_chunk.id, + content=[], + model=origin_chunk.model, + ), + ) + first_item = False + data = chunk.model_dump_json(exclude_unset=True) + yield wrap_data_with_event(data, "message_start") + continue + + # last chunk including usage info + if len(origin_chunk.choices) == 0: + if content_block_started: + stop_chunk = AnthropicStreamEvent( + index=content_block_index, + type="content_block_stop", + ) + data = stop_chunk.model_dump_json(exclude_unset=True) + yield wrap_data_with_event(data, "content_block_stop") + stop_reason = self.stop_reason_map.get( + finish_reason or "stop" + ) + chunk = AnthropicStreamEvent( + type="message_delta", + delta=AnthropicDelta(stop_reason=stop_reason), + usage=AnthropicUsage( + input_tokens=origin_chunk.usage.prompt_tokens + if origin_chunk.usage + else 0, + output_tokens=origin_chunk.usage.completion_tokens + if origin_chunk.usage + else 0, + ), + ) + data = chunk.model_dump_json(exclude_unset=True) + yield wrap_data_with_event(data, "message_delta") + continue + + if origin_chunk.choices[0].finish_reason is not 
None: + finish_reason = origin_chunk.choices[0].finish_reason + continue + + # content + if origin_chunk.choices[0].delta.content is not None: + if not content_block_started: + chunk = AnthropicStreamEvent( + index=content_block_index, + type="content_block_start", + content_block=AnthropicContentBlock( + type="text", text="" + ), + ) + data = chunk.model_dump_json(exclude_unset=True) + yield wrap_data_with_event(data, "content_block_start") + content_block_started = True + + if origin_chunk.choices[0].delta.content == "": + continue + chunk = AnthropicStreamEvent( + index=content_block_index, + type="content_block_delta", + delta=AnthropicDelta( + type="text_delta", + text=origin_chunk.choices[0].delta.content, + ), + ) + data = chunk.model_dump_json(exclude_unset=True) + yield wrap_data_with_event(data, "content_block_delta") + continue + + # tool calls + elif len(origin_chunk.choices[0].delta.tool_calls) > 0: + tool_call = origin_chunk.choices[0].delta.tool_calls[0] + if tool_call.id is not None: + if content_block_started: + stop_chunk = AnthropicStreamEvent( + index=content_block_index, + type="content_block_stop", + ) + data = stop_chunk.model_dump_json( + exclude_unset=True + ) + yield wrap_data_with_event( + data, "content_block_stop" + ) + content_block_started = False + content_block_index += 1 + + chunk = AnthropicStreamEvent( + index=content_block_index, + type="content_block_start", + content_block=AnthropicContentBlock( + type="tool_use", + id=tool_call.id, + name=tool_call.function.name + if tool_call.function + else None, + input={}, + ), + ) + data = chunk.model_dump_json(exclude_unset=True) + yield wrap_data_with_event(data, "content_block_start") + content_block_started = True + + else: + chunk = AnthropicStreamEvent( + index=content_block_index, + type="content_block_delta", + delta=AnthropicDelta( + type="input_json_delta", + partial_json=tool_call.function.arguments + if tool_call.function + else None, + ), + ) + data = 
chunk.model_dump_json(exclude_unset=True) + yield wrap_data_with_event(data, "content_block_delta") + continue + else: + error_response = AnthropicStreamEvent( + type="error", + error=AnthropicError( + type="internal_error", + message="Invalid data format received", + ), + ) + data = error_response.model_dump_json(exclude_unset=True) + yield wrap_data_with_event(data, "error") + yield "data: [DONE]\n\n" + + except Exception as e: + logger.exception("Error in message stream converter.") + error_response = AnthropicStreamEvent( + type="error", + error=AnthropicError(type="internal_error", message=str(e)), + ) + data = error_response.model_dump_json(exclude_unset=True) + yield wrap_data_with_event(data, "error") + yield "data: [DONE]\n\n" diff --git a/entrypoints/api_server.py b/entrypoints/api_server.py new file mode 100644 index 0000000..154cdeb --- /dev/null +++ b/entrypoints/api_server.py @@ -0,0 +1,184 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +NOTE: This API server is used only for demonstrating usage of AsyncEngine +and simple performance benchmarks. It is not intended for production use. +For production use, we recommend using our OpenAI compatible server. +We are also not going to accept PRs modifying this file, please +change `vllm/entrypoints/openai/api_server.py` instead. 
+""" + +import asyncio +import json +import ssl +from argparse import Namespace +from collections.abc import AsyncGenerator +from typing import Any + +from fastapi import FastAPI, Request +from fastapi.responses import JSONResponse, Response, StreamingResponse + +import vllm.envs as envs +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.entrypoints.launcher import serve_http +from vllm.entrypoints.utils import with_cancellation +from vllm.logger import init_logger +from vllm.sampling_params import SamplingParams +from vllm.usage.usage_lib import UsageContext +from vllm.utils import random_uuid +from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.system_utils import set_ulimit +from vllm.version import __version__ as VLLM_VERSION + +logger = init_logger("vllm.entrypoints.api_server") + +app = FastAPI() +engine = None + + +@app.get("/health") +async def health() -> Response: + """Health check.""" + return Response(status_code=200) + + +@app.post("/generate") +async def generate(request: Request) -> Response: + """Generate completion for the request. + + The request should be a JSON object with the following fields: + - prompt: the prompt to use for the generation. + - stream: whether to stream the results or not. + - other fields: the sampling parameters (See `SamplingParams` for details). 
@with_cancellation
async def _generate(request_dict: dict, raw_request: Request) -> Response:
    """Run one generation request and return it streamed or as a single JSON.

    ``request_dict`` is consumed destructively: ``prompt`` (required) and
    ``stream`` (optional, default ``False``) are popped, and every remaining
    key is passed to ``SamplingParams`` as a keyword argument.

    ``raw_request`` is not read in this body.
    NOTE(review): presumably it is consumed by ``with_cancellation`` --
    verify against that decorator.

    Returns:
        * streaming: a ``StreamingResponse`` of newline-terminated UTF-8 JSON
          objects ``{"text": [...]}``, one per engine output;
        * non-streaming: a ``JSONResponse`` with the same shape built from the
          last engine output, or an empty 499 response if the task is
          cancelled while iterating.
    """
    prompt = request_dict.pop("prompt")
    stream = request_dict.pop("stream", False)
    # Whatever is left in the dict is treated as sampling parameters.
    sampling_params = SamplingParams(**request_dict)
    request_id = random_uuid()

    assert engine is not None
    results_generator = engine.generate(prompt, sampling_params, request_id)

    # Streaming case
    async def stream_results() -> AsyncGenerator[bytes, None]:
        async for request_output in results_generator:
            # Local to this generator; it does not rebind the outer `prompt`.
            prompt = request_output.prompt
            assert prompt is not None
            # Each returned text is the prompt followed by the completion.
            text_outputs = [prompt + output.text for output in request_output.outputs]
            ret = {"text": text_outputs}
            yield (json.dumps(ret) + "\n").encode("utf-8")

    if stream:
        return StreamingResponse(stream_results())

    # Non-streaming case
    final_output = None
    try:
        # Drain the generator; only the last output is used.
        async for request_output in results_generator:
            final_output = request_output
    except asyncio.CancelledError:
        return Response(status_code=499)

    assert final_output is not None
    prompt = final_output.prompt
    assert prompt is not None
    text_outputs = [prompt + output.text for output in final_output.outputs]
    ret = {"text": text_outputs}
    return JSONResponse(ret)
VLLM_VERSION) + logger.info("args: %s", args) + + set_ulimit() + + app = await init_app(args, llm_engine) + assert engine is not None + + shutdown_task = await serve_http( + app, + sock=None, + enable_ssl_refresh=args.enable_ssl_refresh, + host=args.host, + port=args.port, + log_level=args.log_level, + timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE, + ssl_keyfile=args.ssl_keyfile, + ssl_certfile=args.ssl_certfile, + ssl_ca_certs=args.ssl_ca_certs, + ssl_cert_reqs=args.ssl_cert_reqs, + **uvicorn_kwargs, + ) + + await shutdown_task + + +if __name__ == "__main__": + parser = FlexibleArgumentParser() + parser.add_argument("--host", type=str, default=None) + parser.add_argument("--port", type=parser.check_port, default=8000) + parser.add_argument("--ssl-keyfile", type=str, default=None) + parser.add_argument("--ssl-certfile", type=str, default=None) + parser.add_argument( + "--ssl-ca-certs", type=str, default=None, help="The CA certificates file" + ) + parser.add_argument( + "--enable-ssl-refresh", + action="store_true", + default=False, + help="Refresh SSL Context when SSL certificate files change", + ) + parser.add_argument( + "--ssl-cert-reqs", + type=int, + default=int(ssl.CERT_NONE), + help="Whether client certificate is required (see stdlib ssl module's)", + ) + parser.add_argument( + "--root-path", + type=str, + default=None, + help="FastAPI root_path when app is behind a path based routing proxy", + ) + parser.add_argument("--log-level", type=str, default="debug") + parser = AsyncEngineArgs.add_cli_args(parser) + args = parser.parse_args() + + asyncio.run(run_server(args)) diff --git a/entrypoints/chat_utils.py b/entrypoints/chat_utils.py new file mode 100644 index 0000000..3b722c2 --- /dev/null +++ b/entrypoints/chat_utils.py @@ -0,0 +1,1690 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import asyncio +import inspect +import json +from abc import ABC, abstractmethod +from collections 
import Counter, defaultdict, deque +from collections.abc import Awaitable, Callable, Iterable +from functools import cached_property, lru_cache, partial +from pathlib import Path +from typing import Any, Generic, Literal, TypeAlias, TypeVar, cast + +import jinja2 +import jinja2.ext +import jinja2.meta +import jinja2.nodes +import jinja2.parser +import jinja2.sandbox +import transformers.utils.chat_template_utils as hf_chat_utils +from openai.types.chat import ( + ChatCompletionAssistantMessageParam, + ChatCompletionContentPartImageParam, + ChatCompletionContentPartInputAudioParam, + ChatCompletionContentPartRefusalParam, + ChatCompletionContentPartTextParam, + ChatCompletionMessageToolCallParam, + ChatCompletionToolMessageParam, +) +from openai.types.chat import ( + ChatCompletionContentPartParam as OpenAIChatCompletionContentPartParam, +) +from openai.types.chat import ( + ChatCompletionMessageParam as OpenAIChatCompletionMessageParam, +) +from openai.types.chat.chat_completion_content_part_input_audio_param import InputAudio +from openai.types.responses import ResponseInputImageParam +from openai_harmony import Message as OpenAIHarmonyMessage +from PIL import Image +from pydantic import BaseModel, ConfigDict, TypeAdapter +from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, ProcessorMixin + +# pydantic needs the TypedDict from typing_extensions +from typing_extensions import Required, TypedDict + +from vllm import envs +from vllm.config import ModelConfig +from vllm.logger import init_logger +from vllm.model_executor.models import SupportsMultiModal +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict +from vllm.multimodal.utils import MEDIA_CONNECTOR_REGISTRY, MediaConnector +from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path +from vllm.transformers_utils.processor import cached_get_processor +from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from 
vllm.utils import random_uuid +from vllm.utils.func_utils import supports_kw + +logger = init_logger(__name__) + +MODALITY_PLACEHOLDERS_MAP = { + "image": "<##IMAGE##>", + "audio": "<##AUDIO##>", + "video": "<##VIDEO##>", +} + + +class AudioURL(TypedDict, total=False): + url: Required[str] + """ + Either a URL of the audio or a data URL with base64 encoded audio data. + """ + + +class ChatCompletionContentPartAudioParam(TypedDict, total=False): + audio_url: Required[AudioURL] + + type: Required[Literal["audio_url"]] + """The type of the content part.""" + + +class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False): + image_embeds: str | dict[str, str] | None + """ + The image embeddings. It can be either: + - A single base64 string. + - A dictionary where each value is a base64 string. + """ + type: Required[Literal["image_embeds"]] + """The type of the content part.""" + uuid: str | None + """ + User-provided UUID of a media. User must guarantee that it is properly + generated and unique for different medias. + """ + + +class VideoURL(TypedDict, total=False): + url: Required[str] + """ + Either a URL of the video or a data URL with base64 encoded video data. + """ + + +class ChatCompletionContentPartVideoParam(TypedDict, total=False): + video_url: Required[VideoURL] + + type: Required[Literal["video_url"]] + """The type of the content part.""" + + +class PILImage(BaseModel): + """ + A PIL.Image.Image object. + """ + + image_pil: Image.Image + model_config = ConfigDict(arbitrary_types_allowed=True) + + +class CustomChatCompletionContentPILImageParam(TypedDict, total=False): + """A simpler version of the param that only accepts a PIL image. + + Example: + { + "image_pil": ImageAsset('cherry_blossom').pil_image + } + """ + + image_pil: PILImage | None + uuid: str | None + """ + User-provided UUID of a media. User must guarantee that it is properly + generated and unique for different medias. 
class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
    """A simpler version of the param that only accepts a plain video_url.

    Example:
    {
        "video_url": "https://example.com/video.mp4"
    }
    """

    video_url: str | None
    uuid: str | None
    """
    User-provided UUID of a media. User must guarantee that it is properly
    generated and unique for different medias.
    """
+ + Example: + { + "thinking": "I am thinking about the answer", + "closed": True, + "type": "thinking" + } + """ + + thinking: Required[str] + """The thinking content.""" + + closed: bool + """Whether the thinking is closed.""" + + type: Required[Literal["thinking"]] + """The thinking type.""" + + +ChatCompletionContentPartParam: TypeAlias = ( + OpenAIChatCompletionContentPartParam + | ChatCompletionContentPartAudioParam + | ChatCompletionContentPartInputAudioParam + | ChatCompletionContentPartVideoParam + | ChatCompletionContentPartRefusalParam + | CustomChatCompletionContentPILImageParam + | CustomChatCompletionContentSimpleImageParam + | ChatCompletionContentPartImageEmbedsParam + | CustomChatCompletionContentSimpleAudioParam + | CustomChatCompletionContentSimpleVideoParam + | str + | CustomThinkCompletionContentParam +) + + +class CustomChatCompletionMessageParam(TypedDict, total=False): + """Enables custom roles in the Chat Completion API.""" + + role: Required[str] + """The role of the message's author.""" + + content: str | list[ChatCompletionContentPartParam] + """The contents of the message.""" + + name: str + """An optional name for the participant. + + Provides the model information to differentiate between participants of the + same role. 
def _is_attr_access(node: jinja2.nodes.Node, varname: str, key: str) -> bool:
    """Tell whether ``node`` reads ``key`` from the variable ``varname``.

    Matches a ``Getattr`` node whose ``attr`` equals ``key`` and a
    ``Getitem`` node whose subscript is a constant equal to ``key``. In both
    cases the object being accessed must be a plain load of ``varname``.
    Every other node yields ``False``.
    """
    if isinstance(node, jinja2.nodes.Getattr):
        if not _is_var_access(node.node, varname):
            return False
        return node.attr == key

    if isinstance(node, jinja2.nodes.Getitem):
        if not _is_var_access(node.node, varname):
            return False
        subscript = node.arg
        # Only constant subscripts can be compared against `key`.
        if not isinstance(subscript, jinja2.nodes.Const):
            return False
        return subscript.value == key

    return False
def _iter_nodes_assign_var_or_elems(root: jinja2.nodes.Node, varname: str):
    """Yield ``(node, name)`` for ``varname`` and every alias assigned from it.

    The first pair is ``(root, varname)``. After that, each ``Assign`` node
    whose right-hand side is (a filter, test or slice of) an already known
    name is yielded together with the name it assigns to, and that name is
    followed in turn.

    Each name is expanded at most once, so templates with alias cycles (for
    example ``set a = messages`` together with ``set messages = a``)
    terminate instead of producing an endless stream.

    Args:
        root: AST node to search.
        varname: Name of the variable to start from.

    Yields:
        ``(node, name)`` tuples as described above.

    Raises:
        AssertionError: If a matching assignment targets something other
            than a plain name.
    """
    # Global variable that is implicitly defined at the root
    yield root, varname

    # Iterative BFS. `seen_varnames` holds every name that has been queued,
    # which covers both self-assignment and longer alias cycles.
    seen_varnames = {varname}
    related_varnames = deque([varname])
    while related_varnames:
        related_varname = related_varnames.popleft()

        for assign_ast in root.find_all(jinja2.nodes.Assign):
            lhs = assign_ast.target
            rhs = assign_ast.node

            if _is_var_or_elems_access(rhs, related_varname):
                assert isinstance(lhs, jinja2.nodes.Name)
                yield assign_ast, lhs.name

                # Follow each alias only once to avoid infinite looping
                if lhs.name not in seen_varnames:
                    seen_varnames.add(lhs.name)
                    related_varnames.append(lhs.name)
+ ] + + # Search for {%- for content in message['content'] -%} loops + for loop_ast in root.find_all(jinja2.nodes.For): + loop_iter = loop_ast.iter + loop_target = loop_ast.target + + for varname in message_varnames: + if _is_var_or_elems_access(loop_iter, varname, "content"): + assert isinstance(loop_target, jinja2.nodes.Name) + yield loop_ast, loop_target.name + break + + +def _try_extract_ast(chat_template: str) -> jinja2.nodes.Template | None: + try: + jinja_compiled = hf_chat_utils._compile_jinja_template(chat_template) + return jinja_compiled.environment.parse(chat_template) + except Exception: + logger.exception("Error when compiling Jinja template") + return None + + +@lru_cache(maxsize=32) +def _detect_content_format( + chat_template: str, + *, + default: _ChatTemplateContentFormat, +) -> _ChatTemplateContentFormat: + jinja_ast = _try_extract_ast(chat_template) + if jinja_ast is None: + return default + + try: + next(_iter_nodes_assign_content_item(jinja_ast)) + except StopIteration: + return "string" + except Exception: + logger.exception("Error when parsing AST of Jinja template") + return default + else: + return "openai" + + +def resolve_mistral_chat_template( + chat_template: str | None, + **kwargs: Any, +) -> str | None: + if chat_template is not None or kwargs.get("chat_template_kwargs") is not None: + raise ValueError( + "'chat_template' or 'chat_template_kwargs' cannot be overridden " + "for mistral tokenizer." + ) + + return None + + +_PROCESSOR_CHAT_TEMPLATES = dict[tuple[str, bool], str | None]() +""" +Used in `_try_get_processor_chat_template` to avoid calling +`cached_get_processor` again if the processor fails to be loaded. + +This is needed because `lru_cache` does not cache when an exception happens. 
def _try_get_processor_chat_template(
    tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
    model_config: ModelConfig,
) -> str | None:
    """Return the processor's chat template for ``tokenizer``, if it has one.

    Results, including failures, are memoised in
    ``_PROCESSOR_CHAT_TEMPLATES`` under
    ``(tokenizer.name_or_path, model_config.trust_remote_code)``, so the
    processor is loaded at most once per key.

    Returns:
        The template when the loaded processor is a ``ProcessorMixin`` with a
        non-``None`` ``chat_template``; otherwise ``None``. A failure while
        loading is logged at debug level and also gives ``None``.
    """
    name_or_path = tokenizer.name_or_path
    cache_key = (name_or_path, model_config.trust_remote_code)

    try:
        return _PROCESSOR_CHAT_TEMPLATES[cache_key]
    except KeyError:
        pass  # first lookup for this key

    template = None
    try:
        processor = cached_get_processor(
            name_or_path,
            processor_cls=(
                PreTrainedTokenizer,
                PreTrainedTokenizerFast,
                ProcessorMixin,
            ),
            trust_remote_code=model_config.trust_remote_code,
        )
        if isinstance(processor, ProcessorMixin):
            template = getattr(processor, "chat_template", None)
    except Exception:
        logger.debug(
            "Failed to load AutoProcessor chat template for %s",
            name_or_path,
            exc_info=True,
        )

    # Remember misses too, so a failing load is not retried.
    _PROCESSOR_CHAT_TEMPLATES[cache_key] = template
    return template
def _resolve_chat_template_content_format(
    chat_template: str | None,
    tools: list[dict[str, Any]] | None,
    tokenizer: AnyTokenizer,
    *,
    model_config: ModelConfig,
) -> _ChatTemplateContentFormat:
    """Detect the content format expected by the effective chat template.

    For Hugging Face tokenizers the template is resolved through
    ``resolve_hf_chat_template``. If that does not produce a string (or the
    tokenizer is of another kind), ``chat_template`` is loaded as a literal
    instead. With no template text at all the format is ``"string"``.
    """
    resolved_template = None
    if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
        resolved_template = resolve_hf_chat_template(
            tokenizer,
            chat_template=chat_template,
            tools=tools,
            model_config=model_config,
        )

    if isinstance(resolved_template, str):
        jinja_text = resolved_template
    else:
        jinja_text = load_chat_template(chat_template, is_literal=True)

    if jinja_text is None:
        return "string"
    return _detect_content_format(jinja_text, default="string")
def resolve_chat_template_content_format(
    chat_template: str | None,
    tools: list[dict[str, Any]] | None,
    given_format: ChatTemplateContentFormatOption,
    tokenizer: AnyTokenizer,
    *,
    model_config: ModelConfig,
) -> _ChatTemplateContentFormat:
    """Return the chat template content format to use.

    An explicit ``given_format`` (anything but ``"auto"``) is returned as-is
    without inspecting the template. For ``"auto"`` the format is detected
    from the template and the outcome is logged.
    """
    if given_format == "auto":
        auto_detected = _resolve_chat_template_content_format(
            chat_template,
            tools,
            tokenizer,
            model_config=model_config,
        )
        _log_chat_template_content_format(
            chat_template,
            given_format=given_format,
            detected_format=auto_detected,
        )
        return auto_detected

    return given_format
+ """ + + def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer): + super().__init__() + + self._model_config = model_config + self._tokenizer = tokenizer + + self._items_by_modality = defaultdict[str, list[_T | None]](list) + self._uuids_by_modality = defaultdict[str, list[str | None]](list) + + @property + def model_config(self) -> ModelConfig: + return self._model_config + + @cached_property + def model_cls(self) -> type[SupportsMultiModal]: + from vllm.model_executor.model_loader import get_model_cls + + model_cls = get_model_cls(self.model_config) + return cast(type[SupportsMultiModal], model_cls) + + @property + def allowed_local_media_path(self): + return self._model_config.allowed_local_media_path + + @property + def allowed_media_domains(self): + return self._model_config.allowed_media_domains + + @property + def mm_registry(self): + return MULTIMODAL_REGISTRY + + @cached_property + def mm_processor(self): + return self.mm_registry.create_processor(self.model_config) + + def add( + self, + modality: ModalityStr, + item: _T | None, + uuid: str | None = None, + ) -> str | None: + """ + Add a multi-modal item to the current prompt and returns the + placeholder string to use, if any. + + An optional uuid can be added which serves as a unique identifier of the + media. 
+ """ + input_modality = modality.replace("_embeds", "") + num_items = len(self._items_by_modality[modality]) + 1 + + self.mm_processor.validate_num_items(input_modality, num_items) + + self._items_by_modality[modality].append(item) + self._uuids_by_modality[modality].append(uuid) + + return self.model_cls.get_placeholder_str(modality, num_items) + + def all_mm_uuids(self) -> MultiModalUUIDDict | None: + if not self._items_by_modality: + return None + mm_uuids = {} + uuids_by_modality = dict(self._uuids_by_modality) + if "image" in uuids_by_modality and "image_embeds" in uuids_by_modality: + raise ValueError("Mixing raw image and embedding inputs is not allowed") + + if "image_embeds" in uuids_by_modality: + image_embeds_uuids = uuids_by_modality["image_embeds"] + if len(image_embeds_uuids) > 1: + raise ValueError("Only one message can have {'type': 'image_embeds'}") + mm_uuids["image"] = uuids_by_modality["image_embeds"] + if "image" in uuids_by_modality: + mm_uuids["image"] = uuids_by_modality["image"] # UUIDs of images + if "audio" in uuids_by_modality: + mm_uuids["audio"] = uuids_by_modality["audio"] # UUIDs of audios + if "video" in uuids_by_modality: + mm_uuids["video"] = uuids_by_modality["video"] # UUIDs of videos + return mm_uuids + + @abstractmethod + def create_parser(self) -> "BaseMultiModalContentParser": + raise NotImplementedError + + +class MultiModalItemTracker(BaseMultiModalItemTracker[object]): + def all_mm_data(self) -> MultiModalDataDict | None: + if not self._items_by_modality: + return None + mm_inputs = {} + items_by_modality = dict(self._items_by_modality) + if "image" in items_by_modality and "image_embeds" in items_by_modality: + raise ValueError("Mixing raw image and embedding inputs is not allowed") + + if "image_embeds" in items_by_modality: + image_embeds_lst = items_by_modality["image_embeds"] + if len(image_embeds_lst) > 1: + raise ValueError("Only one message can have {'type': 'image_embeds'}") + mm_inputs["image"] = 
image_embeds_lst[0] + if "image" in items_by_modality: + mm_inputs["image"] = items_by_modality["image"] # A list of images + if "audio" in items_by_modality: + mm_inputs["audio"] = items_by_modality["audio"] # A list of audios + if "video" in items_by_modality: + mm_inputs["video"] = items_by_modality["video"] # A list of videos + return mm_inputs + + def create_parser(self) -> "BaseMultiModalContentParser": + return MultiModalContentParser(self) + + +class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]): + async def all_mm_data(self) -> MultiModalDataDict | None: + if not self._items_by_modality: + return None + mm_inputs = {} + items_by_modality = {} + for modality, items in self._items_by_modality.items(): + coros = [] + for item in items: + if item is not None: + coros.append(item) + else: + coros.append(asyncio.sleep(0)) + items_by_modality[modality] = await asyncio.gather(*coros) + + if "image" in items_by_modality and "image_embeds" in items_by_modality: + raise ValueError("Mixing raw image and embedding inputs is not allowed") + + if "image_embeds" in items_by_modality: + image_embeds_lst = items_by_modality["image_embeds"] + if len(image_embeds_lst) > 1: + raise ValueError("Only one message can have {'type': 'image_embeds'}") + mm_inputs["image"] = image_embeds_lst[0] + if "image" in items_by_modality: + mm_inputs["image"] = items_by_modality["image"] # A list of images + if "audio" in items_by_modality: + mm_inputs["audio"] = items_by_modality["audio"] # A list of audios + if "video" in items_by_modality: + mm_inputs["video"] = items_by_modality["video"] # A list of videos + return mm_inputs + + def create_parser(self) -> "BaseMultiModalContentParser": + return AsyncMultiModalContentParser(self) + + +class BaseMultiModalContentParser(ABC): + def __init__(self) -> None: + super().__init__() + + # stores model placeholders list with corresponding + # general MM placeholder: + # { + # "<##IMAGE##>": ["", "", ""], + # "<##AUDIO##>": 
["

ixj4!6NhPHBd$Crw38D5+H3xZz4(Tij6NG0l>#q-kraJT~J=#Z}9#S7LVjmbTe& zz(^-#GR%@wnn^m`_Xw&7X4#5qLvzgCXC%FGdaTd`^Q0OI;O1->v(*>FU?XR_oQY-WSPhGqZ0Zo`#;3u zegKP7nG9I*4%YD%h=M->zmR=*MBxV!*>|)tP=Q#5C{*r7<$jcD#-9T?Az!f{$gVAS zW!Ws3<+DOIkVTJ$N2U3HN`(W%VaDZEdj^hQ9Z0{%$Kf?T30GYfPFYwmv9JjaPMO#w zGl*YUwOIqNaKPp5q#2uVc`cE$XIwsR*k?|=d`2^qmMhqr0gm8uW67j$V!{pwO%V^q zrV)*M<6BsrRDS+7615YlXeyrS1;@Cd*}c>0_kZ%^srR+09v_RQX6%Wi5xu4Bi70lW zXfkC*Z8L5s4H*}SV|gSmmpQ&KHREy#WlGB&*y`hbxUmhiTRC{$rZy0Dq+{{(h0hCO z)7PC3dhYkEjBl{akx&jbL0{z5lfj3BTl*r=cVeTCYL*sz7J8QAEA96ae>nTc!QT(Q zytUT!*c&-cZa@ie9(NqdF3;}ZG*ARFjneZ{9@sYo&6H@w}9T3 z_QmrH=a(Co1HVl_%RJ4z>|AU9D2ERHqrSD!cB;@jRA}ibNM{P|XA7;p8?}Lbp&}Is zhSp(DRLu8%s9-N9yzo~j%u`uv629A?$&#;EwyX4`5)O?=Rbh#$qnItDs(9pEjH==z zRY$R#x@M>_H~jkF@R;P3hk0_WU4A^S>+<%~O}m_>>Ejljpky|~+ZAL}vot%?xb3)j z&DTK0`UGCLsR9!7sKr4od34l4N0&~op)S%UIY`Q*E(dij-CRRq($?;v_B=Z7pySK* z8tNu(tqy9NZ;eb~ska(sf(Wne$6?2Jkhk}PA=Gi;Wg#4ft+QT^SWWdWlaQ?XgLiJJ5oY7+qL;2!&# z+7_!ZLv4GAv{26LuERRaatdgjxyX)k4)!DF3AylzWkPnn2AJHnz20-h3JEcLZ zu=g76vJS#ln*jY_RrJ7rg&x?7UU4=K?bMPDsLW$jUgcQ?wlwXOQiEzRi}tn#N5oy` zi?&(>nRzJlO{mQMACX*KD<$chIl32NENMJxO7?^%#nPs!88)u}TrpKC1?hQT$cMRP z-mnJz8V28)%1I|kNb4IR!7Dr&&AV3~ol0%PP7~Vlr@69KtVlT$hg}Q!_HOKsHK(xGSWM_|3Go8O&WFmQGmd1PSU8 zp4%}Z85}6(oNlNhx1?nc2a#wBY3(LwZVlERUDkVdC`~TyQ|8Dv-rUwd1lp|(P|$6v z*gyqqzpS}i^Xpm&lm`yYhwp`REqyCTotFO9hO={lLPz%;mq(Z=R!pRVDhNVL2XMlOHDZx$)nQ_I$aQtERPgfDtKmtVuGQK5Eboz zGWl>4+LvsIz|oL{dgka{O-~UKgkA^rE>A6Ia%dorE;{I9L5w`_#|W3`GDNuViTF?? 
z5YPJoBn%*(b12u)Suz6$gj{R%tLeqrg;}t$Et*3o^XQy|&aF(pRCDOu8v4`^6S?*a zxsG%BjzOnm@cE_Hjte>TK^|Rk(3OJNwya{^SAE@(Dr6V3xyT1EK3;3P2I+wxS(e(D zlw9|PeD`Ii`|^uRtKCc2By6QMdG6%oq+rv7;!kA7}HEV?+T8yu*p?8 z*q<^cNBm$;-nM4G%msULI1W2DgOh@GS$Iif@g z?T*-9^iyi$HsN{~$+Dp&G+A~-vYbe&X&skES-zQ8bg#!FY{3gFZoMokhLN<%|4bH^ zg%DuJV_@_|QgvBaadJ~~S<5!f7MOv@`e&d7w=aAw%yUqFND&hV?~=O~dEwiZ^>=vv zl6rNhbx~XpSGi+_uAT?_eSMWXU1)5YzkcufD%V~(d}Og@p=FipEHt$(MiwHgTvzcR z^`n+~{T@gjeZ}uDGM#*=aNuB(#orS>-uLXlv+GZ$P?NI~&M|2qB@Rk|6_np%r2d|w6Y3aIu=@vit5pJELkJottU zgoF(q#(L8yw@ls*$g-M@!D`m{|I%rei8Elb`rZOv#8sSEd%+5w_x9_8c^TSp9&Gi& ptH{zc{YGGD?t4VhN57>+AakbXpUepTcG{mme&mlEXMtir+Maud zL((#3WBc0yHFM6n=bn4ceVlvF(Z9G{4g#K}=}Z1_A3^*FLi8t8177@{ks!7RmY644 zk~JjBc@lpO^9J}Dlg5;3-bA7{Q_`HW%v(~{c`L@vNn6T3Z%;Yq9T>MHsg!fx39yy5 zC0!}^yqhFA$4a+8WaG=)Ir|FTz;%dq>_eyap*#1XyY`{G_n~|Cp?h1?7rm_SXQoFr zwdZ{Xf-|i2>sB-2XS;fm-!?m-)C6Wt0rXQKKb%s-5SUIY%dE>h> z+JJRvfX;Gvp$+j2(9;4hE~gWzoFFZ)#psDRJv4fB z^msJxxSUJ#3yE?1VlFB1N3L<{L|UR}GFK<)IZn96rip@cgTv_D5-kc3?nWJe0r5x<%Besj2cuAv&2bEG9x6I1e28n zHBiShS!N-VUgVcV;4?@`)glSJlu3i611v9a3lbx6OS~v?0<(}4rA&&+WWOU}FyL^C zxokEgNFtMEldAR1%%!u_%talT)q!c%GWjQyv&^(=ox40UbqxULg{zl7y3EW?%}+uH z``pab*-2*hql>ENqq#|D?%LIrHOM3k;d{6T>ygk`vcgq9vkW9#=Y=GTpmJiD#G9^1@m+Y23vRC%We%T=h`_Jpcf)dZ!ZDrT#-bVwHGFUTsLt;N8<1s<9(78+Po1Lo5-6Az!HHgI*#tLw zdC<#?UOx2l!%MXRc-i# z)`wc;1rb&Ov8qmuEb@0bz|0yEf6}T=Ou_=7F8UWNu-x$fRNz0k^$gN~DHG+m8$z;V5?Z$z7MZ`4y`pRVE*)=s#Q?vO> z6DC2k0Q`yn2#&Zw{O}`^@CCN4o7N&(94mYK3RB;B{V(C)LVMR{*Y?D7-$3ERiYrtc zR9wA<^SkbF@q*&+D_p3A`nGu`G*U7JE1vdJ$FSlVE__h&cNDKH{%B$Pn?U=$we9zm zQ2c9WIq-gArfMNv(7815j?($g)0h%DuQ)FhCM&LBDFng?AIvDu6NSldJ-&_ndVah2 zxu?JI!8b2q-@3gUOY2Mbmbb%7d$jBx1~shidk4SB@7mno`h#18n}el}BVXH;juU17 zyM^i9K)5)zdAcxDu?Ft_g@h%?EYA(Z|v!i z(s!mDIa@M?cWpzZ!>1M7>3@hRw%MwYFdz8AMtJ=jYwK%8U-3fO)4#o@c#eXa4(G<; z`e4aF@F1-CkCh#xg|pwn#MoRFo4aE3R&1V%&8GwQiVc2%QoFWL#TMGNwO8Bw6_S^wo(r) zpiAwkXBZ^9@I=Fh#1|xRBEP!l(K+E#G|0qCix5JFd=KRvt=gKTw``Y?&&SV7iwmi# zwb{a2k43{RU32mr@hP0|yhC=h(xOSu|7jF~{HvpAhD-#^K>bRrsR19B)PsonT(sgc 
z>0B?(3i8l<9(~Sh>ta}WGf!*1jQWUqN;bCCyv`w0OO_Hpy=akK_Z*P)h zc|E86hLKnjg*Hv{3@hkH-1F!&PZARH$oPSPhgb>fSmUb)a@XNZOFeqm6ID0`7b6fr zN8}DR@N?r|lMp}~qQS)9!&Y;k7-m9P62~!wFzv3Phop?KupEb!8?Khb&@>sozKRP!h7?PntV^3O#YUUc0jDJzu)HUKvbl-8k1%Kr3@`Cb2PHz2+z_2mrOt~uaOgAEQ<8w^aOn?k<4*)W|2N}3}ivz4{RcI zO-L_^!-ldMPD~Fo5I+as#Ztyxl!J738mUa{+H=Jv=nq>eT|jJ_yjQgwOQ7_ejl z?1qr>4vhn6Fy+RYHvLoQ#CV;%MSk+ifGv1F;XkE$WHv`b>kUq*3q-@i>|m?3d7%>m zhm60$(U5#WqJ&CA4Q23@L+mbs%(@i3nY5XyXGpK;H9;@rQaM;B{uW1zxfG0B7l71g zp?6#xZI9$nj$&&P$=iuFkv>X`XfNzp&{i6O#xXjP%%qp-+q|^gYQScrkWP@rr6qom z$9sl@ZMMW`&_Zexrl%M!Qpr~}!dnchrA;0!E@yH{R-fd?8tXi-Odys3ozd#lmZs5b zZ(#UJ?+N&&^DHcemKT~J{a;lbXvg4O5|Z#6kd#Bxssm{qScUvNo8%>CRkb7j+X*2n z*1AfGB~DarhzyydD2yRZP-J*sXoGjuD0~bb;S=dA>*tttZh0!l!pB`C3oG_;{} z`npFrhXgpAGj8DykUob$@kfyA6o~2vB;oAbp?Vdnw@meI-+V?5l}tk=YN%oj?N~b% zYv*%oH)N%|lyBqY`pM#{2fa!pwi6juBBRf!(UNJjM2%Lo#-C4qb@Bei$8SG5_3%_F zGOkeLCDV9`8vnJQ{!2^6)3zb6%YSSBrRx{2$GImzefZN-mgQzhG}lIIj;uYq1j z!JYn%_txLLck|0r#m^p0C|&WLuH#DA@vqO6yT+bTVUmD}mQ1}R zs<%Rgcc=k{8hEbj53=l@4QidbH}R#rFj1kpcBq32b#VJwnL6~q_lz1YnTAW$@XHOQ z=+sVhR*BC3^UZSf$}{Rp$#kVeU8zukjrZ5zFZ#+@j6EgGJ>$=GBP&$T z4%M$v{o7Z{)Zl{?Pu_p{{!?H1@Ow`a%Hh+`sM96W=@NB%*W}wVg%nfhx#<9;_4aVl z{lN0X_0Uzez4JTB;HSs}b1$>~=TALoGCGQ|Ft9vW@zL$$C2H{VbA^e*&2OMY@LAc{ z548_(c+0ivDkjQa8dnP1hEN{wk5t^AO0ccsa#sRfKe()ImcqrVm+*8Jrxf=96eiq} z*FoN{;;P~uE=*N?-9=gP#b6}pJFj?#po$WpcLM!Npnv)d`-TOBKTZcCf7Zcl|vOlWWqL0r%nSMC^lzci?j-6F(XDc>; z)kfI;RhQLiDV+PkPng3+lVa+^)d?s;wBd?`R+j+tA{ASwUaW9$xYk|w5@l4EXogYY z0{_}}Y@Ld&vv{R!>j6ox!S)n1pgiJ-nnv~AUbCe-P6RqDz5_6qFHR%;JV}@-;XItA zmue4yYyZ+fPHqboeaYE}MAY%eJ1!9={EgGCon>G1S!Na!;AwsWY<4Jch{$Kcfe!0> z)e4HWo5ciW;Vc9Hui`MUm`g9H&Qwl9Q6U4BJyEqIaSWfzCZnCIm0{S-0>h|=j41pW zNLCM_B3~T&&p1@};*BC-yyot55}y=tVOs6pmsev#lcRZiALgtH(;874hc5uN9XDcX z9N#pI#(^#>)vUp2mQ>>kkBX$3H2h!dPFBz^4L6@&geyD^1=mytYSI=hEr3Eg1pKNQ z-&EKg;we;(V3le@BMLH->7I~4ts6Osn-qkMAfV(KueY`mwv!+({GMj1KGQVT{If^aw`dgv+xrvp!QD!RS%F^%zD+i9p*{Y%^9pj?uSJ9T5C&^=*uf z6Yb%x>zmiVe-EKAjD*2iQ(+$9_CnhZ*`|VwJ>b&2n 
z1P7{SL;>FvJg{|g^JLYEQ5$if`zz;tXVs2T2Qe7?W%3uvDuq!e(b0ptS6vu&6A}8W z*!@`5gHiBF!Hwj4vg*UAp9pnsB{!4R07ip^J5X)IU^`)Pd>=v}(m$*zTAejHo74sP zWo?1bq%6W3x+1KhD#99?BCMe(!Ww!atf3~tG%XQ?lzb6FP`88Z`8RN3UifuZ+7Ff5 z1}gr(-`maLJ)A@!ycOMyZpAm_JN}sBk9|JLBjig0a}l3umAu6 literal 0 HcmV?d00001 diff --git a/lora/ops/triton_ops/__pycache__/utils.cpython-312.pyc b/lora/ops/triton_ops/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6fe6ee2bc06427e8c790815675629685ca1ef068 GIT binary patch literal 13075 zcmd^Fdu$s=df(-9$z8rgJt<0}UbZRgVcBuwIF4hTW&0f4@hrGLV zY~2b`?t(^kgC29u)$-Zb=+HkZ!znI@9#9`}Kpk9wbI}4vnrg}JDTRu2f%=c4Ol065 z=pX&QT`on-^b@@UijKsY*_m&?`F3{Zo8Nr%{e#VBB_Ko$e-eJaiy(f54_c@xWL7`a z5yU0pB*79a86k$%lpH4ESr;Lqx?vrOW%`IdY8W;sc|*h)H4U4TJQ*<$o1xwqp`w;y zi&AEaSflhXt>n!SCTbhDk%W%mXwH6}RQnxvgo$A%=i(|jH%pD`>Yy0%=7~+3fNQB% zU!baDExVu++E+hgdn>kQX|85e$1?94hHF_HSI62x5(oU8@S}lq9i-&6nIP1D3wbN+ zf)*aOf~yBhkCjy~hXqxwLLt4pK z)=Eq~Cnoq1J{fxzk$#311}49Lb%oafH% zAM)#D^B5-vMUj_{Vw?|+Un5qL^mqGu#OPGdRCuaO4DvwIbuRJ3i#sM?;CdqAk)EmZ z;&?pPb2bu*_C(@*uqQqx^oV>|jK>0)O^D%$&^>itb_K#hK5%^HdILTiwv}{j#pD|xtB%#jNS2zRB!gs<%o2t2NEX%v61lHh z28r{8zj8n}ap$IjG4`$#-uW($#$6vqdoZFfn#Blb$2GHT9*M+5lYyv~iD{Wh*@A=m zR*;_(WD2t(F(?#9S`3bHP#O)M3&ePCOg0JQd^k24CZIdNp7#Q89!8ho8z6#$Mo8}B zEaW@X0xZ9eq8`Qm(po?m(`#?!v*G zts*&c@#|{^D}}1LgDW+Sxf)NdrY=|40<<|#eXiE~g@bZh^8{tF<||8?_FsHCw`psd zX`efg{M!6juIsrp(*gO{e_&m4)Xg8i_;#-I`LxrYG|$&8Y|QoUNISbBU-yGoS2`iD zm2c`yJKK|%`JK6@jz#u%)7Hz)H^`(}5e1}C{tMEy<-83jklTs^SwWy$f^n*A)~<|4 zFz^-^9vc@0*~D^ZVU320`Ur-_1V3eoeh6`?)?&$bVTttzKZ>KQ8LdPsl9`}o)%jG=i+G#1YH$FxC;#ifmH$6prv(`>aN#gx~H4q}MTO~+@Rs&z7A6KfVVMb!2R-;-= zL2;5IE_fwO;QCw_KBlxi5)`)))Vv9?(u`%TPZ9N(5~SkSy!HbMUnm}la3N9fiQ}A)=guU;JjeRNF~l-KF&vNi*aYkiW4_ZhK ztuQ*ljUuD?PUUG_6sLrpJw0P#aXc~79g0T_XrwC=jExoYJ+SeO^zdAS3kqD%w(j2U za~hBl699g~!HCeaWk;8?dvI*m7#EEKhEX~Q6`>W3Br30M@LR=UJw%WS&Uh2{mKjna zCrT-y9)q?Z|D3!;!ep*Cz`p(0`nSkyiON%Ey@;DP`1#tdlzi#Aph>}BF!miDd1fH58^koM9OjSC*6xLCF!iQ 
zE$wW}Iyd~J`?ssGc@y9Cc*(VW8068j;pkm_N4~g!U(@kDP#_yw1Z2~j`yvtF;rNL@ zUqMP=G$Dw-@!(lbnG`}W%J~GqgDltWcLK6ZOhq^z^{T9gty4C{pbOcc06DV&pPBTon7oWpcCY712k8m*_^`LALxtI{=Wm6&?6Q6%h)(gPbdMX%6 zaQk>Z&ckNk4Ros5`bqA*Y&wgTg6vR6O=Tzq*e&Y|de;|hm1(sz{6>OoyKjpt5JOEs zB`S*;9P2(8KLoNJg`egA(U}-q|6<& zXCcZts^)9}6lRYCZqTBN%GpD|tlKze%h9!2x*<(BEVypdEjhO@>uyiG+ZTcv_ol_Z z+wQIu)0J{}<-Go^w>RzW{Un(3_GY|0E*x3WXct|#-5n{WBjtv^yje#_+R?G(=uAx9@%pbpPZd~>>ebjlmbKx5|p8v{@$b`!}u~(^FrTks(qQZzb_iZBB3Pt zCSd6qMq(a`g_x(nLd;{a5Cd4Ke1D}_h?#gi7AlV>m?y+S<&hLK@uXOYDvM>xw=aI1 z>xFFru$1K~+LysX55Zz|Ne?uP^CMC0G>kFK1nWa2d1TwQev;(-?T^G|M;?RAOp@-2 zahdHgyo^u5%O108cxlG|Hd8=*wz7!qfiIw*0Qfh`Qd8RKXla3ZUs>uRl3Pbz9!Xj? zDlIMWu93`zH*iQ)A?HLRfr4ROlC_YlU@bH5*dIM+9LwVM}EIs31L%~vUG-fI#yJV9bl2dX?6_Q)39HrJYrlfxY>|0!=fPI;u z9hRyUv9AVx*Q{{=|G=*@5WgSv%Ws8kkVhcRZ-+=Ifx0|euS-Gh2x6}?Aon6RECIO( ze$shA$nE?s|04EKG&J!$@ewD*8ocFShR306q=FXr`GZgy59lkI?~)5Sj8{#pobLuRUF^PA?3XfDc`DPz;|m<2aD9 z9+=&`{BTeV*0PkLGKW~W1R5(K*dH}- z^R{elN4mCSu{TrOm8QDp7%0qA^=Ybp{z%r_o%VJwp2>K7|C!pFuO~bk?lmI7-jy-e z&iCFnd+(G1*mI8MhL(@sxco+r@h+VD@%F#n^`l+)^twhX;Nv_+Gw2z#3%PKsYv) z5QY0}pKxcVqW`ln#R2`pqZ%eq`>hKzI!Tu#->DYyn689!XuTZL(+mzXdP!dvm*cHa zp~r{P=u2r>u&07u%%E6FdnNr9xPz65M`3lO5K$v4K_WHO@|ZTp@0(@}vFa!6!T5c+ zxDkssP)P^9nk2&&;`_xt9jarMnaufsYy9ev?FRIVEZ*oM&MVpF_U1tDQSTq zYh8dV8fd{<)?^?F5jP+ul#Iy(w!!UStzNSJ9pPjFS5)ww z%$T+ky@Wu%1vlicgMo@9-iESs`qzoKNWXPJCVes)kjX9{rJpu-PYCgtO!iE-48&Et z&LqdjxCq+i;8sKMjm8r(Rx$GVrrS>nXxtMMF(^=sKXoX@z>QT5L<+=XKyE=pq?$Fv!Ga_kl$SF^;z8C=sW(7l$i|Zp zI3Btj4a18R?r#F4@dyi@QDajHRSZTQg21RE6kT|lf{R|69F?_awBQMNve$Mxb%%3f zr*QLo-xbk2K=upx3A+Gi&JuT;+q2EP)6Kh6hQ^$^Ic@G*cGq6o`N7VtyDRPP%D8*x zsGPe#>uycETNm~&x-;&sawRJ@^>fS$U6CBWO*gK%>ysVxee+`ZF=!WRjU4# zxx;s=J*mbWsfHJCxNhva(VeP4n5lksj?OteSx0l)(Y)knU1ln>OjDX^O11Rg{QAu! 
zDc^}DW-#ZjLDP&QOVy^S+Eo3XoAozsw~nXkjxJHhmTj&NTa%NSik2l?>%y63Pvb}a z%l?JFE1h$Peyu0$ZFwuf)T9j6c_RTYMz7G-@bGK8q~>=51EctvwYtAs|5>-Oe}`GN z;PoU9w;&t%U~E#ZipDum!(elca%?y#0z!ey;}Cu+Eg;HM>@l|>evf>Qevf(2_MTn6 zIzC&t=?UY#5$H2T`_;30)&N(u#!1kia7Al^D_VWYvl*{!Sqp20n1&U~T(!VGP#v_l zoh5i0u9eNe!NS_1-pV?znxWPTWk%MZw8Az{EM2?L*K`%{7Ex9uNQ4A8U6S!**Y(2w z02jxWcXSf;Ch0EPit-wO$}p;nK`iMPjWI){R5acj9fMl9NKE);dY*6tiBG(%f!i7vE0%6cX{{=L4w*c5?E)|TUjfd;_C+dCf*MMC_9{7Hwwz04WKqq z&9uGS0;&T;*(%kYLatrkYOB_$OpV$`?bC}L0> zoe$cv^#+VOF+yJi#goBr;Qtu&?_o5D(FKg~JnJ{FbC6K|B9x{Ilg&ukjFg6Qo)FW` z>dp>ZVlWXA%kAy=83Is$pv`m@&+*?SOKsVuT1xxQ7szSQwc67lp&b9=%1a*{5oA3m zX>dj&&+*>`W~wP*R^3FYU6EjPgbnVVZbyu8|86Rj*aQk>{V>`B%JrLB;-`b3xo;l- zS@kV{x^HmlYuBHB{>_3E*v1gU?^55Ve z1)f9hJYlXeeLkTA@se;4}?WN*U+*s_`R=x zz%K@Kbslt*0-q$+Ps(j|TQ8V_v(=R+j8(wdXf@^SM6Kdcg-%sqw5Uh=np$wGQi!XS zTGi!B;ikDvI(~x!TrTP4hjAvCcYY0~ar*E=9NHs$0Rc zi?Yr3t+eZO5q`e*xQ&j2EJZ&(;xoX%A2oIUg}boDZ2 zS1mOu?<-5xo}8n4&JNhuQu%Gi1;o=$CH)ra* zGTVlhYEGud7?qW?ne9 zRDV3}I03?1D%C>*-I6l2q~O9^IU_u%eh@L^Z}l~ZN_VMpHWv+PpCkDyIJLtw&F4FwFu}ap{0(U*^XoBj$^6ggPD$@l%cSx>IMds z9o^q5TSh@PF&>Wy{CQx)Y=Mh5`%mC<2tQrhx%yz4%d)|zV z?MjK}FfFL>Nd=T6dH~#ukPBW~$_6$Z5>@m4pJL%xvCt6+1Y@x{xH*G!k^uC0=93L0 zAfc*AQC}+BR5nDwv>=;?6z3AvRFAG2ifLL^wWz|BK~j85mD#4@lxUnyM7Z7jCqRPZ zE3|@^%j-#!yhjnF>z4%mInnevQTsVj{VRg`ccSWZqV;oP^S@atldokgjkBhe{{1P! 
znQTiY<_8yA=U@Ba?M34v_ru7w$c??J?pM;A_M~d|q$~O|#Qxlqljcn%xpn@zRRWSc zwUu<{Y|gwM(iPgCH$n=UqM|Bq#vDbsD)Sc1SqYPMu60gG4qcd@>q#3u^FwK4^D2#X z6J!_JzRj)j4B`ecpWnSkcYZhQnfaH_58= num_tokens_post_padded: + return + # get the expert_id to process curr shard + ind = lora_id * stride_el + pid_m + expert_id = tl.load(expert_ids_ptr + ind, ind < max_loras * stride_el, -1) + if expert_id == -1: + return + # get a_ptr,b_ptr,c_ptr + cur_a_ptr = a_ptr + (slice_id % num_slice_a) * slice_a_size + cur_b_ptr = tl.load(b_ptr + slice_id).to(tl.pointer_type(c_ptr.dtype.element_ty)) + cur_c_ptr = c_ptr + (slice_id % num_slice_c) * slice_c_size + + offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N + offs_k = pid_sk * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K) + + offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64) + token_ind = stride_tl * lora_id + offs_token_id + offs_token = tl.load( + sorted_token_ids_ptr + token_ind, token_ind < max_loras * stride_tl, 0 + ) + token_mask = offs_token < num_valid_tokens + + # get a_ptrs,b_ptrs + a_ptrs = cur_a_ptr + ( + offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak + ) + + b_ptrs = ( + cur_b_ptr + + lora_id * stride_bl + + expert_id * stride_be + + offs_k[:, None] * stride_bk + + offs_bn[None, :] * stride_bn + ) + + # accumulator + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, grid_k): + k_remaining = K - k * (BLOCK_SIZE_K * SPLIT_K) + # pre-fetch lora weight + b = tl.load(b_ptrs, mask=offs_k[:, None] < k_remaining, other=0.0) + # GDC wait waits for ALL programs in the prior kernel to complete + # before continuing. + # if USE_GDC and not IS_PRIMARY: + # tl.extra.cuda.gdc_wait() + a = tl.load( + a_ptrs, + mask=token_mask[:, None] & (offs_k[None, :] < k_remaining), + other=0.0, + ) + accumulator += tl.dot(a, b) + # Advance the ptrs to the next K block. 
+ a_ptrs += BLOCK_SIZE_K * SPLIT_K * stride_ak + b_ptrs += BLOCK_SIZE_K * SPLIT_K * stride_bk + + if MUL_ROUTED_WEIGHT: + moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0) + accumulator = accumulator * moe_weight[:, None] + # if USE_GDC and IS_PRIMARY: + # # GDC launch dependents hints the runtime system to launch dependent kernels. + # tl.extra.cuda.gdc_launch_dependents() + accumulator = accumulator.to(c_ptr.dtype.element_ty) + # Write back the block of the output + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = cur_c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :] + c_mask = token_mask[:, None] & (offs_cn[None, :] < N) + + if SPLIT_K == 1: + tl.store(c_ptrs, accumulator, mask=c_mask) + else: + tl.atomic_add(c_ptrs, accumulator, mask=c_mask, sem="relaxed") + + +@torch.inference_mode() +def _fused_moe_lora_shrink( + a_intermediate_cache1: torch.Tensor, + # (num_slices, num_tokens, top_k_num, max_lora_rank) + qcurr_hidden_states: torch.Tensor, # (num_tokens, K,) + lora_a_stacked: list[ + torch.Tensor + ], # [(max_loras, num_experts, max_lora_rank, K,),...] 
+ topk_weights: torch.Tensor, # (num_tokens, top_k_num) + sorted_token_ids: torch.Tensor, # (max_loras, _) + expert_ids: torch.Tensor, # (max_loras, _ ,) + num_tokens_post_padded: torch.Tensor, # (max_loras, ) + top_k_num: int, + lora_ids: torch.Tensor, + adapter_enabled: torch.Tensor, + ## adding for kernel + device: torch.device, + N: int, + M: int, + EM: int, + K: int, + num_tokens: int, + num_experts: int, + num_slices: int, + block_size_m: int, + block_size_n: int, + block_size_k: int, + group_size_m: int, + num_warps: int, + num_stages: int, + split_k: int, + mul_routed_weight: bool = False, +) -> None: + w1_lora_a_stacked = lora_a_stacked[0] + # use_gdc = supports_pdl(qcurr_hidden_states.device) + shrink_config = { + "BLOCK_SIZE_M": block_size_m, + "BLOCK_SIZE_N": block_size_n, + "BLOCK_SIZE_K": block_size_k, + "GROUP_SIZE_M": group_size_m, + "num_warps": num_warps, + "num_stages": num_stages, + "SPLIT_K": split_k, + # "USE_GDC": use_gdc, + # "launch_pdl": use_gdc, # triton kernel metadata + } + + b_ptr = _get_ptr(lora_a_stacked, device) + + grid = lambda META: ( + split_k + * triton.cdiv(EM, META["BLOCK_SIZE_M"]) + * triton.cdiv(N, META["BLOCK_SIZE_N"]), + len(lora_a_stacked), + lora_a_stacked[0].shape[0], + ) + _fused_moe_lora_kernel[grid]( + qcurr_hidden_states, + b_ptr, + a_intermediate_cache1, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + N, + K, + EM, + num_tokens, + num_experts, + lora_ids, + adapter_enabled, + qcurr_hidden_states.stride(0), + qcurr_hidden_states.stride(1), + w1_lora_a_stacked.stride(0), + w1_lora_a_stacked.stride(1), + w1_lora_a_stacked.stride(3), + w1_lora_a_stacked.stride(2), + a_intermediate_cache1.stride(2), + a_intermediate_cache1.stride(3), + sorted_token_ids.stride(0), + expert_ids.stride(0), + slice_a_size=qcurr_hidden_states.numel(), + slice_c_size=a_intermediate_cache1.numel() // num_slices, + num_slice_a=1, + num_slice_c=num_slices, + top_k=1 if mul_routed_weight else top_k_num, + 
MUL_ROUTED_WEIGHT=False, + # IS_PRIMARY=True, + **shrink_config, + ) + + +@torch.inference_mode() +def _fused_moe_lora_expand( + output: torch.Tensor, # (num_tokens, top_k_num, N*len(lora_a_stacked),) + a_intermediate_cache1: torch.Tensor, # (num_slices, M, top_k_num, max_lora_rank) + lora_b_stacked: list[ + torch.Tensor + ], # [(max_loras, num_experts, max_lora_rank, K,),...] + topk_weights: torch.Tensor, # (num_tokens, top_k_num) + sorted_token_ids: torch.Tensor, # (max_loras, _) + expert_ids: torch.Tensor, # (max_loras, _ ,) + num_tokens_post_padded: torch.Tensor, # (max_loras, ) + top_k_num: int, + lora_ids: torch.Tensor, + adapter_enabled: torch.Tensor, + ## adding for kernel + device: torch.device, + N: int, + M: int, + EM: int, + K: int, + num_tokens: int, + num_experts: int, + num_slices: int, + max_lora_rank: int, + w1_output_dim_size: int, + block_size_m: int, + block_size_n: int, + block_size_k: int, + group_size_m: int, + num_warps: int, + num_stages: int, + split_k: int, + mul_routed_weight: bool = False, +) -> None: + b_ptr = _get_ptr(lora_b_stacked, device) + K = max_lora_rank + N = w1_output_dim_size + + w1_lora_b_stacked = lora_b_stacked[0] + + a_intermediate_cache1 = a_intermediate_cache1.view( + -1, a_intermediate_cache1.shape[3] + ) + + b_intermediate_cache1 = torch.zeros( + (num_slices, M, top_k_num, w1_output_dim_size), + dtype=output.dtype, + device=device, + ) + # use_gdc = supports_pdl(a_intermediate_cache1.device) + expand_config = { + "BLOCK_SIZE_M": block_size_m, + "BLOCK_SIZE_N": block_size_n, + "BLOCK_SIZE_K": block_size_k, + "GROUP_SIZE_M": group_size_m, + "num_warps": num_warps, + "num_stages": num_stages, + "SPLIT_K": split_k, # Set split_k = 1 for expand calls + # "USE_GDC": use_gdc, + # "launch_pdl": use_gdc, # triton kernel metadata + } + + grid = lambda META: ( + triton.cdiv(EM, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]), + len(lora_b_stacked), + lora_b_stacked[0].shape[0], + ) + _fused_moe_lora_kernel[grid]( + 
a_intermediate_cache1, + b_ptr, + b_intermediate_cache1, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + N, + K, + EM, + num_tokens, + num_experts, + lora_ids, + adapter_enabled, + a_intermediate_cache1.stride(0), + a_intermediate_cache1.stride(1), + w1_lora_b_stacked.stride(0), + w1_lora_b_stacked.stride(1), + w1_lora_b_stacked.stride(3), + w1_lora_b_stacked.stride(2), + b_intermediate_cache1.stride(2), + b_intermediate_cache1.stride(3), + sorted_token_ids.stride(0), + expert_ids.stride(0), + slice_a_size=a_intermediate_cache1.numel() // num_slices, + slice_c_size=b_intermediate_cache1.numel() // num_slices, + num_slice_a=num_slices, + num_slice_c=num_slices, + top_k=1, + MUL_ROUTED_WEIGHT=mul_routed_weight, + # IS_PRIMARY=False, + **expand_config, + ) + for i in range(num_slices): + output[:, :, i * N : (i + 1) * N] += b_intermediate_cache1[i] + + +@torch.inference_mode() +def _fused_moe_lora( + output: torch.Tensor, # (num_tokens, top_k_num, N*len(lora_a_stacked),) + qcurr_hidden_states: torch.Tensor, # (num_tokens, K,) + lora_a_stacked: list[ + torch.Tensor + ], # [(max_loras, num_experts, max_lora_rank, K,),...] + lora_b_stacked: list[ + torch.Tensor + ], # [(max_loras, num_experts, N, max_lora_rank,),...] 
+ topk_weights: torch.Tensor, # (num_tokens, top_k_num) + sorted_token_ids: torch.Tensor, # (max_loras, _) + expert_ids: torch.Tensor, # (max_loras, _ ,) + num_tokens_post_padded: torch.Tensor, # (max_loras, ) + max_lora_rank: int, + top_k_num: int, + lora_ids: torch.Tensor, + adapter_enabled: torch.Tensor, + shrink_block_size_m: int, + shrink_block_size_n: int, + shrink_block_size_k: int, + shrink_group_size_m: int, + shrink_num_warps: int, + shrink_num_stages: int, + shrink_split_k: int, + expand_block_size_m: int, + expand_block_size_n: int, + expand_block_size_k: int, + expand_group_size_m: int, + expand_num_warps: int, + expand_num_stages: int, + expand_split_k: int, + mul_routed_weight: bool = False, +) -> None: + assert len(lora_a_stacked) == len(lora_b_stacked) > 0 + assert ( + sorted_token_ids.dim() + == expert_ids.dim() + == topk_weights.dim() + == qcurr_hidden_states.dim() + == 2 + ) + assert ( + sorted_token_ids.shape[0] + == expert_ids.shape[0] + == num_tokens_post_padded.shape[0] + ) + assert len(lora_b_stacked) * lora_b_stacked[0].shape[-2] == output.shape[-1] + assert output.shape[0] == topk_weights.shape[0] + assert top_k_num == topk_weights.shape[1] + device = qcurr_hidden_states.device + num_slices = len(lora_a_stacked) + w1_lora_b_stacked = lora_b_stacked[0] + num_experts = lora_a_stacked[0].shape[1] + N = max_lora_rank + M = topk_weights.shape[0] + EM = sorted_token_ids.shape[1] + K = qcurr_hidden_states.shape[1] + num_tokens = M * top_k_num + w1_output_dim_size = w1_lora_b_stacked.shape[2] + + a_intermediate_cache1 = torch.zeros( + (num_slices, M, top_k_num, max_lora_rank), + dtype=output.dtype, + device=device, + ) + + _fused_moe_lora_shrink( + a_intermediate_cache1, + qcurr_hidden_states, + lora_a_stacked, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + top_k_num, + lora_ids, + adapter_enabled, + ## adding for kernel + device, + N, + M, + EM, + K, + num_tokens, + num_experts, + num_slices, + shrink_block_size_m, 
+ shrink_block_size_n, + shrink_block_size_k, + shrink_group_size_m, + shrink_num_warps, + shrink_num_stages, + shrink_split_k, + mul_routed_weight, + ) + + _fused_moe_lora_expand( + output, + a_intermediate_cache1, + lora_b_stacked, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + top_k_num, + lora_ids, + adapter_enabled, + ## adding for kernel + device, + N, + M, + EM, + K, + num_tokens, + num_experts, + num_slices, + max_lora_rank, + w1_output_dim_size, + expand_block_size_m, + expand_block_size_n, + expand_block_size_k, + expand_group_size_m, + expand_num_warps, + expand_num_stages, + expand_split_k, + mul_routed_weight, + ) + + +def _fused_moe_lora_fake( + output: torch.Tensor, + qcurr_hidden_states: torch.Tensor, + lora_a_stacked: list[torch.Tensor], + lora_b_stacked: list[torch.Tensor], + topk_weights: torch.Tensor, + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_padded: torch.Tensor, + max_lora_rank: int, + top_k_num: int, + lora_ids: torch.Tensor, + adapter_enabled: torch.Tensor, + shrink_block_size_m: int, + shrink_block_size_n: int, + shrink_block_size_k: int, + shrink_group_size_m: int, + shrink_num_warps: int, + shrink_num_stages: int, + shrink_split_k: int, + expand_block_size_m: int, + expand_block_size_n: int, + expand_block_size_k: int, + expand_group_size_m: int, + expand_num_warps: int, + expand_num_stages: int, + expand_split_k: int, + mul_routed_weight: bool = False, +) -> None: + return + + +def _fused_moe_lora_shrink_fake( + a_intermediate_cache1: torch.Tensor, + qcurr_hidden_states: torch.Tensor, + lora_a_stacked: list[torch.Tensor], + topk_weights: torch.Tensor, + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_padded: torch.Tensor, + top_k_num: int, + lora_ids: torch.Tensor, + adapter_enabled: torch.Tensor, + device: torch.device, + N: int, + M: int, + EM: int, + K: int, + num_tokens: int, + num_experts: int, + num_slices: int, + block_size_m: int, + 
block_size_n: int, + block_size_k: int, + group_size_m: int, + num_warps: int, + num_stages: int, + split_k: int, + mul_routed_weight: bool = False, +) -> None: + return + + +def _fused_moe_lora_expand_fake( + output: torch.Tensor, + a_intermediate_cache1: torch.Tensor, + lora_b_stacked: list[torch.Tensor], + topk_weights: torch.Tensor, + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_padded: torch.Tensor, + top_k_num: int, + lora_ids: torch.Tensor, + adapter_enabled: torch.Tensor, + device: torch.device, + N: int, + M: int, + EM: int, + K: int, + num_tokens: int, + num_experts: int, + num_slices: int, + max_lora_rank: int, + w1_output_dim_size: int, + block_size_m: int, + block_size_n: int, + block_size_k: int, + group_size_m: int, + num_warps: int, + num_stages: int, + split_k: int, + mul_routed_weight: bool = False, +) -> None: + return + + +try: + direct_register_custom_op( + op_name="fused_moe_lora", + op_func=_fused_moe_lora, + mutates_args=["output"], + fake_impl=_fused_moe_lora_fake, + ) + + direct_register_custom_op( + op_name="fused_moe_lora_shrink", + op_func=_fused_moe_lora_shrink, + mutates_args=["a_intermediate_cache1"], + fake_impl=_fused_moe_lora_shrink_fake, + ) + + direct_register_custom_op( + op_name="fused_moe_lora_expand", + op_func=_fused_moe_lora_expand, + mutates_args=["output"], + fake_impl=_fused_moe_lora_expand_fake, + ) + + fused_moe_lora = torch.ops.vllm.fused_moe_lora + fused_moe_lora_shrink = torch.ops.vllm.fused_moe_lora_shrink + fused_moe_lora_expand = torch.ops.vllm.fused_moe_lora_expand + +except AttributeError: + fused_moe_lora = _fused_moe_lora + fused_moe_lora_shrink = _fused_moe_lora_shrink + fused_moe_lora_expand = _fused_moe_lora_expand diff --git a/lora/ops/triton_ops/kernel_utils.py b/lora/ops/triton_ops/kernel_utils.py new file mode 100644 index 0000000..ef68119 --- /dev/null +++ b/lora/ops/triton_ops/kernel_utils.py @@ -0,0 +1,364 @@ +# SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Utilities for Punica kernel construction. +""" + +from vllm.triton_utils import tl, triton + + +@triton.jit +def mm_k( + a_ptr, + b_ptr, + ak_stride, + bn_stride, + bk_stride, + offset_k, + K: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + EVEN_K: tl.constexpr, + SPLIT_K: tl.constexpr, + CAST_TYPE: tl.constexpr, + b_dtype: tl.constexpr, + # USE_GDC: tl.constexpr, + base_k, + USE_STRIDE_LOAD: tl.constexpr, +): + """ + Given a_ptr and b_ptr, that identify the rows of A (m x k) and columns of + B (k x n), iterate, through the K dimension to compute the partial/complete + matrix block product. + If SPLIT_K == 1, the output m x n product is complete. + If SPLIT_K > 1, the thread block computes partial outputs. The partial + outputs are then atomically summed in the caller code. + Args: + a_ptr: Array of pointers, identifying rows of A + b_ptr: Array of pointers, identifying columns of B + ak_stride: K dimension stride of the A matrix + bn_stride: N dimension stride of the B matrix + bk_stride: K dimension stride of the B matrix + K: Length of the K dimension + BLOCK_M: M dimension of the output block m x n + BLOCK_N: N dimension of the output block m x n + BLOCK_K: K dimension atom + EVEN_K: True if the blocks of A and B can be loaded without any + masking. + SPLIT_K: Parameter signifying parallelism in the K dimension. + CAST_TYPE: if True, cast the values from the A matrix to the B + matrix dtype. + b_dtype: datatype of the B matrix + USE_GDC: Whether to use PDL. True indicates use. + USE_STRIDE_LOAD: Whether to use stride load for the B matrix. 
+ base_k: Base offset along K dimension for current SPLIT_K group + """ + accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + + # Step size along K for each iteration + STEP_K = BLOCK_K * SPLIT_K + + # Total number of iterations (compile-time constant) + num_iters = tl.cdiv(K, STEP_K) + + for k in range(num_iters): + # Current iteration's global K offset + iter_k = k * STEP_K + base_k + + # Check if this iteration is completely valid (no masking needed) + block_end = iter_k + BLOCK_K + + if EVEN_K: + # K is divisible by BLOCK_K, no masking ever needed + # pre-fetch lora weight + # tiled_b = tl.load(b_ptr) + # if USE_GDC: + # tl.extra.cuda.gdc_wait() + if USE_STRIDE_LOAD: + tiled_b = tl.load(b_ptr, stride=bn_stride) + else: + tiled_b = tl.load(b_ptr) + tiled_a = tl.load(a_ptr) + if CAST_TYPE: + tiled_a = tiled_a.to(b_dtype) + accumulator += tl.dot(tiled_a, tiled_b) + else: + # Check if we need element-wise masking + if iter_k >= K: + # Entire block out of range, skip + pass + elif block_end <= K: + # Entire block in range, no masking needed (fast path) + # tiled_b = tl.load(b_ptr) + # if USE_GDC: + # tl.extra.cuda.gdc_wait() + if USE_STRIDE_LOAD: + tiled_b = tl.load(b_ptr, stride=bn_stride) + else: + tiled_b = tl.load(b_ptr) + tiled_a = tl.load(a_ptr) + if CAST_TYPE: + tiled_a = tiled_a.to(b_dtype) + accumulator += tl.dot(tiled_a, tiled_b) + else: + # Partial block, need masking (only last iteration) + k_offsets = tl.arange(0, BLOCK_K) + mask = iter_k + k_offsets < K + # tiled_b = tl.load(b_ptr, mask=mask[:, None], other=0.0) + # if USE_GDC: + # tl.extra.cuda.gdc_wait() + if USE_STRIDE_LOAD: + tiled_b = tl.load( + b_ptr, stride=bn_stride, mask=mask[:, None], other=0.0 + ) + else: + tiled_b = tl.load(b_ptr, mask=mask[:, None], other=0.0) + tiled_a = tl.load(a_ptr, mask=mask[None, :], other=0.0) + if CAST_TYPE: + tiled_a = tiled_a.to(b_dtype) + accumulator += tl.dot(tiled_a, tiled_b) + + a_ptr += STEP_K * ak_stride + b_ptr += STEP_K * bk_stride + + return 
accumulator + + +@triton.jit +def do_expand_kernel( + pid_n, + lora_index, + slice_id, + input_ptr, + lora_ptr, + out_ptr, + N, + K, + M_LEN, + ram, # array identifying the rows of Input ptr to operate on + slice_start_loc, + # input ptr strides + input_d0_stride, + input_d1_stride, + input_d2_stride, + # lora ptr strides + ls_d0_ptr, + ls_d1_ptr, + ls_d2_ptr, + # out ptr strides + output_d0_stride, + output_d1_stride, + # constants + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + SAME_STRIDE: tl.constexpr, + SLICE_NUM: tl.constexpr, + EVEN_K: tl.constexpr, + CAST_TYPE: tl.constexpr, + ADD_INPUTS: tl.constexpr, + # USE_GDC: tl.constexpr, + USE_STRIDE_LOAD: tl.constexpr, +): + """ + Given an array of integers that identifies the rows of A, ram, + a lora index that identifies which LoRA to use from lora_ptr, lora_index, + a slice_id that identifies the input/output slice, + compute the matrix product and store in the appropriate output location. + Given that this is an expand kernel, we don't perform any split-K reduction + as the K dimension is assumed to be small. + """ + + # ls_d*_ptr can be either an integer or a pointer + if SAME_STRIDE: + # integer + cur_lora_d0_stride = ls_d0_ptr + cur_lora_d1_stride = ls_d1_ptr + cur_lora_d2_stride = ls_d2_ptr + else: + # pointer + cur_lora_d0_stride = tl.load(ls_d0_ptr + slice_id) + cur_lora_d1_stride = tl.load(ls_d1_ptr + slice_id) + cur_lora_d2_stride = tl.load(ls_d2_ptr + slice_id) + + # Identify the input_ptr and lora_ptr from slice_id. + if SLICE_NUM == 1: + cur_input_ptr = input_ptr + cur_lora_ptr = lora_ptr + else: + cur_input_ptr = input_ptr + slice_id * input_d0_stride + cur_lora_ptr = tl.load(lora_ptr + slice_id).to( + tl.pointer_type(out_ptr.dtype.element_ty) + ) + + # Identify the column indices of B to process. 
+ offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) + + # Identify A and B block pointers + offset_k = tl.arange(0, BLOCK_K) + a_ptr = ( + cur_input_ptr + + ram[:, None] * input_d1_stride + + offset_k[None, :] * input_d2_stride + ) + b_ptr = ( + cur_lora_ptr + + cur_lora_d0_stride * lora_index + + offset_k[:, None] * cur_lora_d2_stride + + rbn[None, :] * cur_lora_d1_stride + ) + + # Compute the block matrix product. + SPLIT_K = 1 + + accumulator = mm_k( + a_ptr, + b_ptr, + input_d2_stride, + cur_lora_d1_stride, + cur_lora_d2_stride, + offset_k, + K, + BLOCK_M, + BLOCK_N, + BLOCK_K, + EVEN_K, + SPLIT_K, + CAST_TYPE, + cur_lora_ptr.dtype.element_ty, + # USE_GDC, + base_k=0, + USE_STRIDE_LOAD = USE_STRIDE_LOAD + ) + + tiled_c = accumulator.to(cur_lora_ptr.dtype.element_ty) + if SLICE_NUM == 1: + cur_slice_start = slice_start_loc + else: + cur_slice_start = tl.load(slice_start_loc + slice_id) + + # Identify the C output pointers to store the results of the accumulator. 
+ offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + cur_slice_start + offset_cm = tl.arange(0, BLOCK_M) + c_ptr = ( + out_ptr + + ram[:, None] * output_d0_stride + + offset_cn[None, :] * output_d1_stride + ) + c_mask = (offset_cm[:, None] < M_LEN) & (offset_cn[None, :] < (cur_slice_start + N)) + + if ADD_INPUTS: + tiled_out = tl.load(c_ptr, mask=c_mask) + tiled_c += tiled_out + tl.store(c_ptr, tiled_c, mask=c_mask) + + +@triton.jit +def do_shrink_kernel( + pid_n, + pid_sk, + slice_id, + lora_index, + input_ptr, + lora_ptr, + out_ptr, + N, + K, + M_LEN, + ram, + # input strides + input_d0_stride, + input_d1_stride, + # lora strides + lora_d0_stride, + lora_d1_stride, + lora_d2_stride, + # output strides + output_d0_stride, + output_d1_stride, + output_d2_stride, + scaling, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + EVEN_K: tl.constexpr, + SPLIT_K: tl.constexpr, + SLICE_NUM: tl.constexpr, + # USE_GDC: tl.constexpr, + USE_STRIDE_LOAD: tl.constexpr, +): + """ + Given an array of integers that identifies the rows of A, ram, + a lora index that identifies which LoRA to use from lora_ptr, lora_index, + a slice_id that identifies the input/output slice, compute the + matrix product and store in the appropriate output location. + """ + + # Identify the lora_ptr from slice_id. + if SLICE_NUM == 1: + # current lora ptr + cur_lora_ptr = lora_ptr + else: + # current lora ptr + cur_lora_ptr = tl.load(lora_ptr + slice_id).to( + tl.pointer_type(input_ptr.dtype.element_ty) + ) + + # Identify the column indices of B to process. 
+ offset_n = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + rbn = tl.max_contiguous(tl.multiple_of(offset_n % N, BLOCK_N), BLOCK_N) + + # Identify A and B block pointers + offset_k = pid_sk * BLOCK_K + tl.arange(0, BLOCK_K) + a_ptr = ( + input_ptr + ram[:, None] * input_d0_stride + offset_k[None, :] * input_d1_stride + ) + b_ptr = ( + cur_lora_ptr + + lora_d0_stride * lora_index + + rbn[None, :] * lora_d1_stride + + offset_k[:, None] * lora_d2_stride + ) + + # Compute partial/complete block matrix product. + accumulator = mm_k( + a_ptr, + b_ptr, + input_d1_stride, + lora_d1_stride, + lora_d2_stride, + offset_k, + K, + BLOCK_M, + BLOCK_N, + BLOCK_K, + EVEN_K, + SPLIT_K, + False, + cur_lora_ptr.dtype.element_ty, + # False, # USE_GDC is always False in shrink kernel + base_k=pid_sk * BLOCK_K, + USE_STRIDE_LOAD = USE_STRIDE_LOAD + ) + # GDC launch dependents hints the runtime system to launch dependent kernels. + # if USE_GDC: + # tl.extra.cuda.gdc_launch_dependents() + # Identify the C output pointers to store the results of the accumulator. 
+ offset_cn = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N + offset_cm = tl.arange(0, BLOCK_M) + cur_out_ptr = out_ptr if SLICE_NUM == 1 else out_ptr + slice_id * output_d0_stride + c_ptr = ( + cur_out_ptr + + ram[:, None] * output_d1_stride + + offset_cn[None, :] * output_d2_stride + ) + c_mask = (offset_cm[:, None] < M_LEN) & (offset_cn[None, :] < N) + accumulator *= scaling + + # handles write-back with reduction-splitting + if SPLIT_K == 1: + tl.store(c_ptr, accumulator, mask=c_mask) + else: + tl.atomic_add(c_ptr, accumulator, mask=c_mask, sem="relaxed") diff --git a/lora/ops/triton_ops/lora_expand_op.py b/lora/ops/triton_ops/lora_expand_op.py new file mode 100644 index 0000000..3b6bfdd --- /dev/null +++ b/lora/ops/triton_ops/lora_expand_op.py @@ -0,0 +1,336 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Based on: +Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). +Punica: Multi-Tenant LoRA Serving. 
+https://arxiv.org/abs/2310.18547 +""" + +import os +from contextlib import contextmanager +import torch + +from vllm.lora.ops.triton_ops.kernel_utils import do_expand_kernel +from vllm.lora.ops.triton_ops.utils import _get_lora_b_ptr, get_lora_op_configs +from vllm.triton_utils import tl, triton +from vllm.utils.torch_utils import direct_register_custom_op + + +@contextmanager +def _temporary_env(var_name: str, value: str | None): + prev_value = os.environ.get(var_name) + if value is None: + os.environ.pop(var_name, None) + else: + os.environ[var_name] = value + try: + yield + finally: + if prev_value is None: + os.environ.pop(var_name, None) + else: + os.environ[var_name] = prev_value + + +@triton.jit +def _lora_expand_kernel( + input_ptr, + lora_ptr, + out_ptr, + M, + N, + K, + token_indices_sorted_by_lora_ids, + num_tokens_per_lora, + lora_token_start_loc, + lora_ids, + slice_start_loc, + input_d0_stride, + input_d1_stride, + input_d2_stride, # 1 + ls_d0_ptr, + ls_d1_ptr, + ls_d2_ptr, # 1 + output_d0_stride, + output_d1_stride, # 1 + output_hs_ptr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + EVEN_K: tl.constexpr, + ADD_INPUTS: tl.constexpr, + CAST_TYPE: tl.constexpr, + SLICE_NUM: tl.constexpr, + SAME_STRIDE: tl.constexpr, + # USE_GDC: tl.constexpr, + # launch_pdl: tl.constexpr, + USE_STRIDE_LOAD: tl.constexpr, +): + cta_n_num = tl.cdiv(N, BLOCK_N) + cta_m_num = tl.cdiv(M, BLOCK_M) + + pid_mn = tl.program_id(axis=0) + pid_m = pid_mn % cta_m_num + pid_n = (pid_mn // cta_m_num) % cta_n_num + + slice_id = tl.program_id(axis=1) + lora_idx = tl.program_id(axis=2) + + lora_id = tl.load(lora_ids + lora_idx) + if lora_id == -1: + # Early exit for the no-lora case. + return + + lora_m_size = tl.load(num_tokens_per_lora + lora_idx) + + cta_m_offset = pid_m * BLOCK_M + if cta_m_offset >= lora_m_size: + # Early exit CTA. 
+ return + + # When the output dimensions of each slice are the same,cur_n=N, otherwise + # cur_n=tl.load(output_hs_ptr + slice_id), this situation exists in GQA's + # qkv linear. + curr_N = N if SAME_STRIDE else tl.load(output_hs_ptr + slice_id) + if pid_n * BLOCK_N >= curr_N: + # Early exit CTA. + return + + # num rows this CTA should process. + cta_m_len = min(BLOCK_M, lora_m_size - cta_m_offset) + + # Identify all rows that this CTA should process. + lora_m_indices_start = tl.load(lora_token_start_loc + lora_idx) + cta_lora_seq_indices = ( + token_indices_sorted_by_lora_ids + lora_m_indices_start + cta_m_offset + ) + + # Load all relevant row indices. + offset_m = tl.arange(0, BLOCK_M) % cta_m_len + ram = tl.load(cta_lora_seq_indices + offset_m) + + do_expand_kernel( + pid_n, + lora_id, + slice_id, + input_ptr, + lora_ptr, + out_ptr, + curr_N, + K, + cta_m_len, + ram, # array identifying the rows of Input ptr to operate on + slice_start_loc, + # input ptr strides + input_d0_stride, + input_d1_stride, + input_d2_stride, + # lora ptr strides + ls_d0_ptr, + ls_d1_ptr, + ls_d2_ptr, + # out ptr strides + output_d0_stride, + output_d1_stride, + # constants + BLOCK_M, + BLOCK_N, + BLOCK_K, + SAME_STRIDE, + SLICE_NUM, + EVEN_K, + CAST_TYPE, + ADD_INPUTS, + # USE_GDC, + USE_STRIDE_LOAD, + ) + + +@torch.inference_mode() +def _lora_expand( + inputs: torch.Tensor, # shape [num_slices, num_tokens, lora_rank] + lora_b_weights: list[torch.Tensor], # shape [num_lora, hidden_size, lora_rank] + output_tensor: torch.Tensor, # shape [num_tokens, hidden_size * num_slices] + token_lora_mapping: torch.Tensor, # shape [num_tokens] + token_indices_sorted_by_lora_ids: torch.Tensor, # shape [num_tokens] + num_tokens_per_lora: torch.Tensor, # shape [max-loras + 1] + lora_token_start_loc: torch.Tensor, # shape [max-loras + 2] + lora_ids: torch.Tensor, # shape [max-loras + 1] + no_lora_flag_cpu: torch.Tensor, # shape [1] + offset_start: int = 0, + add_inputs: bool = False, +) -> None: + """ 
+ Args: + inputs (torch.Tensor): input tensor + lora_b_weights (list[torch.Tensor]): lora'b weight + output_tensor (torch.Tensor): output tensor + token_lora_mapping (torch.Tensor): A tensor mapping each input token + to the lora-id related to that token. A value of -1 indicates that + LoRA doesn't apply to that token. + token_indices_sorted_by_lora_ids (torch.Tensor): Row/Token indices from + the A matrix grouped by LoRA IDs. + num_tokens_per_lora (torch.Tensor): num_tokens_per_lora[i] is the number + of tokens that are to be processed by LoRA ID lora_ids[i] + lora_token_start_loc (torch.Tensor): A cumulative sum of + num_tokens_per_lora. lora_token_start_loc[0] is always 0 so that + lora_token_start_loc[i], along with num_tokens_per_lora[i] + identifies the region in token_indices_sorted_by_lora_ids that + LoRA lora_ids[i] should process. + lora_ids (torch.Tensor): LoRA ids to process. + no_lora_flag_cpu (torch.Tensor): A CPU tensor of size 1, that indicates + if there are any requests that require LoRA. + offset_start (int, optional): Offset start for output_tensor. + Defaults to 0. + add_inputs (bool, optional): Whether to add the input tensor to the + output tensor. Defaults to False. + """ + + assert no_lora_flag_cpu.numel() == 1 + if no_lora_flag_cpu.item(): + # None of the inputs require LoRA. + return + + assert inputs.dtype in [torch.float16, torch.bfloat16, torch.float32] + for weight in lora_b_weights: + assert weight.dtype in [torch.float16, torch.bfloat16] + + assert inputs.size(0) == len(lora_b_weights) + assert output_tensor.is_contiguous() + + # metadata sanity check. 
+ M = inputs.size(1) + assert token_lora_mapping.size(0) == M + assert token_lora_mapping.size(0) == token_indices_sorted_by_lora_ids.size(0) + assert lora_ids.size(0) == num_tokens_per_lora.size(0) + assert lora_token_start_loc.size(0) == lora_ids.size(0) + 1 + + ( + slice_start_tensor, + lora_ptr_tensor, + lora_strides_d0_tensor, + lora_strides_d1_tensor, + lora_strides_d2_tensor, + hidden_sizes_tensor, + same_stride, + MAX_N, + ) = _get_lora_b_ptr(lora_b_weights, offset_start, inputs.device) + + K = lora_b_weights[0].shape[-1] # K= rank + ADD_INPUTS = add_inputs + MAX_LORAS = lora_ids.size(0) + CAST_TYPE = False + NUM_SLICES = len(lora_b_weights) + + # Triton kernel configs. + kernel_config = get_lora_op_configs( + op_type="expand", + max_loras=MAX_LORAS, + batch=M, + hidden_size=MAX_N, + rank=K, + num_slices=NUM_SLICES, + add_inputs=add_inputs, + ) + BLOCK_M = kernel_config["block_m"] + BLOCK_N = kernel_config["block_n"] + BLOCK_K = kernel_config["block_k"] + NUM_WARPS = kernel_config["num_warps"] + NUM_CTAS = kernel_config["num_ctas"] + NUM_STAGES = kernel_config["num_stages"] + + EVEN_K = K % BLOCK_K == 0 # type: ignore + if same_stride: + use_stride_load = False + else: + elem_size = lora_b_weights[0].element_size() + use_stride_load = all( + (weight.stride(1) * elem_size) % 64 == 0 for weight in lora_b_weights + ) + + if inputs.dtype == torch.float32 and lora_b_weights[0].dtype in [ + torch.float16, + torch.bfloat16, + ]: + CAST_TYPE = True + + # TODO (varun): This grid formulation maximizes parallelization at the + # cost of wasteful thread block launch when only a few input tokens require + # LoRA. This might not be the best in all cases. + grid = ( + triton.cdiv(M, BLOCK_M) * triton.cdiv(MAX_N, BLOCK_N), + NUM_SLICES, + # Each LoRA receives its own set of thread blocks for output + # computation. If some LoRA doesn't have any tokens to process, its + # thread blocks simply exit. 
+ MAX_LORAS, + ) + disable_store_stp = os.getenv("VLLM_LORA_DISABLE_STORE_STP", "1") == "1" + with _temporary_env("TRITON_DISABLE_STORE_STP", + "1" if disable_store_stp else None): + _lora_expand_kernel[grid]( + inputs, + lora_ptr_tensor, + output_tensor, + M, + MAX_N, + K, + token_indices_sorted_by_lora_ids, + num_tokens_per_lora, + lora_token_start_loc, + lora_ids, + slice_start_tensor, + inputs.stride(0), + inputs.stride(1), + inputs.stride(2), + lora_strides_d0_tensor, + lora_strides_d1_tensor, + lora_strides_d2_tensor, + output_tensor.stride(0), + output_tensor.stride(1), + hidden_sizes_tensor, + BLOCK_M, + BLOCK_N, + BLOCK_K, + EVEN_K, + ADD_INPUTS, + CAST_TYPE, + NUM_SLICES, + same_stride, + use_stride_load, + num_warps=NUM_WARPS, + num_ctas=NUM_CTAS, + num_stages=NUM_STAGES, + ) + + return + + +def _lora_expand_fake( + inputs: torch.Tensor, + lora_b_weights: list[torch.Tensor], + output_tensor: torch.Tensor, + token_lora_mapping: torch.Tensor, + token_indices_sorted_by_lora_ids: torch.Tensor, + num_tokens_per_lora: torch.Tensor, + lora_token_start_loc: torch.Tensor, + lora_ids: torch.Tensor, + no_lora_flag_cpu: torch.Tensor, + offset_start: int = 0, + add_inputs: bool = False, +) -> None: + return + + +try: + direct_register_custom_op( + op_name="lora_expand", + op_func=_lora_expand, + mutates_args=["output_tensor"], + fake_impl=_lora_expand_fake, + ) + lora_expand = torch.ops.vllm.lora_expand + +except AttributeError: + lora_expand = _lora_expand diff --git a/lora/ops/triton_ops/lora_kernel_metadata.py b/lora/ops/triton_ops/lora_kernel_metadata.py new file mode 100644 index 0000000..c3bef76 --- /dev/null +++ b/lora/ops/triton_ops/lora_kernel_metadata.py @@ -0,0 +1,154 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +LoRA kernels metadata preparation utilities. 
+""" + +from dataclasses import dataclass + +import torch + + +@dataclass +class LoRAKernelMeta: + token_lora_mapping: torch.Tensor + token_indices_sorted_by_lora_ids: torch.Tensor + active_lora_ids: torch.Tensor + num_tokens_per_lora: torch.Tensor + lora_token_start_loc: torch.Tensor + + # The V1 architecture uses the traced torch.compile graphs to execute + # a forward pass. Things to note about this process, + # 1. The tracing infers all python scalar datatype objects into a constant + # value. + # 2. The tracing cannot handle dynamic control flow. (dynamic control flow + # is an experimental feature in pytorch) + # 3. The internals of torch.ops functions are not traced. + # We disguise the "no_lora" flag as a cpu tensor and leverage point number 3 + # to early exit from inside the lora_expand / lora_shrink torch operation. + no_lora_flag_cpu: torch.Tensor + + @staticmethod + def make( + max_loras: int, max_num_tokens: int, device: torch.device | str + ) -> "LoRAKernelMeta": + token_lora_mapping = torch.empty( + max_num_tokens, dtype=torch.int32, device=device + ) + + token_indices_sorted_by_lora_ids = torch.empty( + max_num_tokens, dtype=torch.int32, device=device + ) + + # +1 because "no-lora" is also a possibility + # example: let max_loras be 3, active_lora_ids of [-1, 0, 2, 1] + # is a possibility. + active_lora_ids = torch.empty(max_loras + 1, dtype=torch.int32, device=device) + + # using running example, [3, 10, 5, 2] is a possibility. + num_tokens_per_lora = torch.zeros( + max_loras + 1, dtype=torch.int32, device=device + ) + + # +2 for this because, the first index is always 0. + # using running example, lora_token_start_loc + # is [0, 3, 13, 18, 20]. 
+ lora_token_start_loc = torch.zeros( + max_loras + 2, dtype=torch.int32, device=device + ) + + no_lora_flag_cpu = torch.tensor([False], dtype=torch.bool, device="cpu") + + return LoRAKernelMeta( + token_lora_mapping=token_lora_mapping, + token_indices_sorted_by_lora_ids=token_indices_sorted_by_lora_ids, + active_lora_ids=active_lora_ids, + num_tokens_per_lora=num_tokens_per_lora, + lora_token_start_loc=lora_token_start_loc, + no_lora_flag_cpu=no_lora_flag_cpu, + ) + + def _reset(self): + self.active_lora_ids.fill_(-1) + self.num_tokens_per_lora.fill_(0) + self.lora_token_start_loc.fill_(0) + self.no_lora_flag_cpu.fill_(False) + + def prepare_tensors(self, token_lora_mapping: torch.Tensor) -> None: + """ + Prepare kernel metadata tensors for the current forward pass. + + Args: + token_lora_mapping (torch.Tensor): Tensor containing lora indices + for each input token. + """ + + self._reset() + + # Check and record no-lora case. + no_lora = torch.all(token_lora_mapping == -1) + self.no_lora_flag_cpu[0] = no_lora + + if no_lora: + # Early exit. LoRA kernels will not be run. 
+ return + + num_tokens = token_lora_mapping.size(0) + + # copy token lora mapping + self.token_lora_mapping[:num_tokens].copy_( + token_lora_mapping, non_blocking=True + ) + + # token_indices_sorted_by_lora_ids + _, token_indices_sorted_by_lora_ids = torch.sort( + token_lora_mapping, stable=True + ) + # start gpu transfer + self.token_indices_sorted_by_lora_ids[:num_tokens].copy_( + token_indices_sorted_by_lora_ids, non_blocking=True + ) + + # active_lora_ids, num_tokens_per_lora + lora_ids, num_tokens_per_lora = torch.unique( + token_lora_mapping, sorted=True, return_counts=True + ) + self.active_lora_ids[: lora_ids.size(0)].copy_(lora_ids, non_blocking=True) + self.num_tokens_per_lora[: num_tokens_per_lora.size(0)].copy_( + num_tokens_per_lora, non_blocking=True + ) + + # lora_token_start_loc + lora_token_start_loc = torch.cumsum(num_tokens_per_lora, dim=0) + self.lora_token_start_loc[1 : 1 + lora_token_start_loc.size(0)].copy_( + lora_token_start_loc, non_blocking=True + ) + + def meta_args( + self, token_nums: int + ) -> tuple[ + torch.Tensor, + torch.Tensor, + torch.Tensor, + torch.Tensor, + torch.Tensor, + torch.Tensor, + ]: + """ + This function returns the kernel metadata required for the current + forward pass execution of the kernel. The function returns all the + metadata required by the kernel, in order, as a tuple, so it can be + unpacked directly during the lora_shrink/lora_expand function call. + + Args: + token_nums (int): Number of input tokens in the current forward + pass of the kernel. 
+ """ + return ( + self.token_lora_mapping[:token_nums], + self.token_indices_sorted_by_lora_ids[:token_nums], + self.num_tokens_per_lora, + self.lora_token_start_loc, + self.active_lora_ids, + self.no_lora_flag_cpu, + ) diff --git a/lora/ops/triton_ops/lora_shrink_op.py b/lora/ops/triton_ops/lora_shrink_op.py new file mode 100644 index 0000000..71a4e2e --- /dev/null +++ b/lora/ops/triton_ops/lora_shrink_op.py @@ -0,0 +1,290 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Based on: +Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). +Punica: Multi-Tenant LoRA Serving. +https://arxiv.org/abs/2310.18547 +""" + +import os +import torch + +from vllm.lora.ops.triton_ops.kernel_utils import do_shrink_kernel +from vllm.lora.ops.triton_ops.utils import _get_lora_a_ptr, get_lora_op_configs +from vllm.triton_utils import tl, triton +from vllm.utils.torch_utils import direct_register_custom_op +from .utils import supports_pdl + +@triton.jit +def _lora_shrink_kernel( + input_ptr, + lora_ptr, + out_ptr, + M, + N, + K, + token_indices_sorted_by_lora_ids, + num_tokens_per_lora, + lora_token_start_loc, + lora_ids, + scaling, + input_d0_stride, + input_d1_stride, + lora_d0_stride, + lora_d1_stride, + lora_d2_stride, + output_d0_stride, + output_d1_stride, + output_d2_stride, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + EVEN_K: tl.constexpr, + SPLIT_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + SLICE_NUM: tl.constexpr, + # USE_GDC: tl.constexpr, + # launch_pdl: tl.constexpr, + USE_STRIDE_LOAD: tl.constexpr, +): + cta_n_num = tl.cdiv(N, BLOCK_N) + cta_m_num = tl.cdiv(M, BLOCK_M) + + pid_sk_m_n = tl.program_id(axis=0) + pid_sk = pid_sk_m_n % SPLIT_K + + pid_m_n = pid_sk_m_n // SPLIT_K + num_pid_in_group = GROUP_SIZE_M * cta_n_num + group_id = pid_m_n // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(cta_m_num - first_pid_m, 
GROUP_SIZE_M) + + # Column-major ordering within groups for better cache reuse + pid_m = first_pid_m + ((pid_m_n % num_pid_in_group) % group_size_m) + pid_n = (pid_m_n % num_pid_in_group) // group_size_m + + slice_id = tl.program_id(axis=1) + lora_idx = tl.program_id(axis=2) + + lora_id = tl.load(lora_ids + lora_idx) + if lora_id == -1: + # Early exit for the no-lora case. + return + + lora_m_size = tl.load(num_tokens_per_lora + lora_idx) + + cta_m_offset = pid_m * BLOCK_M + if cta_m_offset >= lora_m_size: + # Early exit CTA. + return + + # num rows this CTA should process. + cta_m_len = min(BLOCK_M, lora_m_size - cta_m_offset) + + # Identify all rows that this CTA should process. + lora_m_indices_start = tl.load(lora_token_start_loc + lora_idx) + cta_lora_seq_indices = ( + token_indices_sorted_by_lora_ids + lora_m_indices_start + cta_m_offset + ) + # Load all relevant row indices. + offset_m = tl.arange(0, BLOCK_M) % cta_m_len + ram = tl.load(cta_lora_seq_indices + offset_m) + + do_shrink_kernel( + pid_n, + pid_sk, + slice_id, + lora_id, + input_ptr, + lora_ptr, + out_ptr, + N, + K, + cta_m_len, + ram, # array identifying the rows of Input ptr to operate on + # input strides + input_d0_stride, + input_d1_stride, + # lora strides + lora_d0_stride, + lora_d1_stride, + lora_d2_stride, + # output strides + output_d0_stride, + output_d1_stride, + output_d2_stride, + scaling, + BLOCK_M, + BLOCK_N, + BLOCK_K, + EVEN_K, + SPLIT_K, + SLICE_NUM, + # USE_GDC, + USE_STRIDE_LOAD, + ) + + +@torch.inference_mode() +def _lora_shrink( + inputs: torch.Tensor, # shape [num_tokens, hidden_size] + lora_a_weights: list[torch.Tensor], # shape [num_loras, lora_rank, hidden_size] + output_tensor: torch.Tensor, # shape [num_slices, num_tokens, lora_rank] + token_lora_mapping: torch.Tensor, # shape [num_tokens] + token_indices_sorted_by_lora_ids: torch.Tensor, # shape [num_tokens] + num_tokens_per_lora: torch.Tensor, # shape [max-loras + 1] + lora_token_start_loc: torch.Tensor, # shape 
[max-loras + 2] + lora_ids: torch.Tensor, # shape [max-loras + 1] + no_lora_flag_cpu: torch.Tensor, # shape [1] + scaling: float, +) -> None: + """ + Args: + inputs (torch.Tensor): Input tensor + lora_a_weights (list[torch.Tensor]): LoRA weights + output_tensor (torch.Tensor): output tensor + token_lora_mapping (torch.Tensor): A tensor mapping each input token + to the lora-id related to that token. A value of -1 indicates that + LoRA doesn't apply to that token. + token_indices_sorted_by_lora_ids (torch.Tensor): Row/Token indices from + the A matrix grouped by LoRA IDs. + num_tokens_per_lora (torch.Tensor): num_tokens_per_lora[i] is the number + of tokens that are to be processed by LoRA ID lora_ids[i] + lora_token_start_loc (torch.Tensor): A cumulative sum of + num_tokens_per_lora. lora_token_start_loc[0] is always 0 so that + lora_token_start_loc[i], along with num_tokens_per_lora[i] + identifies the region in token_indices_sorted_by_lora_ids that + LoRA lora_ids[i] should process. + lora_ids (torch.Tensor): LoRA ids to process. + no_lora_flag_cpu (torch.Tensor): A CPU tensor of size 1, that indicates + if there are any requests that require LoRA. + scaling (float): Scaling factor. + """ + + assert no_lora_flag_cpu.numel() == 1 + if no_lora_flag_cpu.item(): + # None of the inputs require LoRA. 
+ return + + assert inputs.dtype == lora_a_weights[0].dtype + assert inputs.dtype in [torch.float16, torch.bfloat16] + for weight in lora_a_weights: + assert weight.dtype in [torch.float16, torch.bfloat16] + + assert inputs.size(1) == lora_a_weights[0].size(-1) + assert inputs.is_contiguous() + assert output_tensor.is_contiguous() + + # metadata sanity check + M = inputs.size(0) + assert token_lora_mapping.size(0) == M + assert token_lora_mapping.size(0) == token_indices_sorted_by_lora_ids.size(0) + assert lora_ids.size(0) == num_tokens_per_lora.size(0) + assert lora_token_start_loc.size(0) == lora_ids.size(0) + 1 + + output_tensor.zero_() + + (lora_ptr_tensor, lora_strides_d0, lora_strides_d1, lora_strides_d2) = ( + _get_lora_a_ptr(lora_a_weights, inputs.device) + ) + N, K = lora_a_weights[0].shape[-2:] # K=hidden_size,N=rank + NUM_SLICES = len(lora_a_weights) + MAX_LORAS = lora_ids.size(0) + + # Triton kernel configs + kernel_config = get_lora_op_configs( + "shrink", + max_loras=MAX_LORAS, + batch=M, + hidden_size=K, + rank=N, + num_slices=NUM_SLICES, + ) + BLOCK_M = kernel_config["block_m"] + BLOCK_N = kernel_config["block_n"] + BLOCK_K = kernel_config["block_k"] + SPLIT_K = kernel_config["split_k"] + NUM_WARPS = kernel_config["num_warps"] + NUM_STAGES = kernel_config["num_stages"] + NUM_CTAS = kernel_config["num_ctas"] + GROUP_SIZE_M = kernel_config.get("group_size_m", 8) + EVEN_K = K % (BLOCK_K * SPLIT_K) == 0 # type: ignore + use_stride_load = False + + # TODO (varun): This grid formulation maximizes parallelization at the + # cost of wasteful thread block launch when only few of the input tokens + # require LoRA. This might not be the best in all cases. + grid = ( + SPLIT_K * triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N), + NUM_SLICES, + # Each LoRA receives its own set of thread blocks for output + # computation. If some LoRA doesn't have any tokens to process, its + # thread blocks exit early. 
+ MAX_LORAS, + ) + # use_gdc = supports_pdl(inputs.device) + _lora_shrink_kernel[grid]( + inputs, + lora_ptr_tensor, + output_tensor, + M, + N, + K, + token_indices_sorted_by_lora_ids, + num_tokens_per_lora, + lora_token_start_loc, + lora_ids, + scaling, + inputs.stride(0), + inputs.stride(1), + lora_strides_d0, + lora_strides_d1, + lora_strides_d2, + output_tensor.stride(0), + output_tensor.stride(1), + output_tensor.stride(2), + BLOCK_M, + BLOCK_N, + BLOCK_K, + EVEN_K, + SPLIT_K, + GROUP_SIZE_M, + NUM_SLICES, + #use_gdc, + use_stride_load, + num_warps=NUM_WARPS, + num_ctas=NUM_CTAS, + num_stages=NUM_STAGES, + # launch_pdl=use_gdc, + ) + + return + + +def _lora_shrink_fake( + inputs: torch.Tensor, + lora_a_weights: list[torch.Tensor], + output_tensor: torch.Tensor, + token_lora_mapping: torch.Tensor, + token_indices_sorted_by_lora_ids: torch.Tensor, + num_tokens_per_lora: torch.Tensor, + lora_token_start_loc: torch.Tensor, + lora_ids: torch.Tensor, + no_lora_flag_cpu: torch.Tensor, + scaling: float, +) -> None: + return + + +try: + direct_register_custom_op( + op_name="lora_shrink", + op_func=_lora_shrink, + mutates_args=["output_tensor"], + fake_impl=_lora_shrink_fake, + ) + lora_shrink = torch.ops.vllm.lora_shrink + +except AttributeError: + lora_shrink = _lora_shrink diff --git a/lora/ops/triton_ops/utils.py b/lora/ops/triton_ops/utils.py new file mode 100644 index 0000000..dcbb8aa --- /dev/null +++ b/lora/ops/triton_ops/utils.py @@ -0,0 +1,362 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import functools +import json +from functools import lru_cache +from pathlib import Path +from typing import Any + +import torch + +from vllm import envs +from vllm.logger import init_logger +from vllm.platforms import current_platform + +logger = init_logger(__name__) + +_LORA_A_PTR_DICT: dict[tuple[int, ...], tuple[Any, ...]] = {} +_LORA_B_PTR_DICT: dict[tuple[int, ...], tuple[Any, ...]] 
= {} + + +def _is_corex_backend() -> bool: + return getattr(torch, "corex", False) is True + + +def _get_corex_lora_op_config( + op_type: str, + batch: int, + hidden_size: int, + num_slices: int, +) -> dict[str, int | None] | None: + if op_type == "expand": + if batch <= 16: + block_m = 16 + num_warps = 4 + elif batch <= 32: + block_m = 32 + num_warps = 4 + else: + block_m = 64 + num_warps = 8 + return { + "block_m": block_m, + "block_n": 128, + "block_k": 16, + "num_warps": num_warps, + "num_ctas": 1, + "num_stages": 1, + "max_nreg": None, + } + + if op_type == "shrink": + if batch <= 1024: + block_m, block_n, block_k = 32, 16, 64 + split_k, num_warps, num_stages = 8, 2, 2 + else: + if num_slices == 1: + block_m, block_n, block_k = 64, 16, 64 + split_k, num_warps, num_stages = 4, 4, 1 + else: + block_m, block_n, block_k = 128, 16, 128 + num_warps, num_stages = 8, 1 + split_k = 4 if hidden_size >= 5120 else 1 + return { + "block_m": block_m, + "block_n": block_n, + "block_k": block_k, + "split_k": split_k, + "num_warps": num_warps, + "num_ctas": 1, + "num_stages": num_stages, + "group_size_m": 8, + "max_nreg": None, + } + + return None + + +def _get_lora_a_ptr(lora_a_weights: list[torch.Tensor], device: torch.device): + """ + `_LORA_A_PTR_DICT` collects the required information during `profile_run`, + After this, it remains constant and subsequent usage is through LUT. 
+ Refer to: + https://github.com/triton-lang/triton/blob/release/3.1.x/python/tutorials/08-grouped-gemm.py + """ + key = tuple(lora_weight.data_ptr() for lora_weight in lora_a_weights) + + if values := _LORA_A_PTR_DICT.get(key): + return values + + lora_strides_d0 = [] + lora_strides_d1 = [] + lora_strides_d2 = [] + tensor_ptrs = [] + for lora_a_weight in lora_a_weights: + if lora_a_weight.ndim == 4: # shape:(lora_num,1,size,rank) + assert lora_a_weight.size(1) == 1 + lora_a_weight = lora_a_weight.squeeze(dim=1) + else: + assert lora_a_weight.ndim == 3 # shape:(lora_num,size,rank) + assert lora_a_weight.is_contiguous() + tensor_ptrs.append(lora_a_weight.data_ptr()) + lora_strides_d0.append(lora_a_weight.stride(0)) + lora_strides_d1.append(lora_a_weight.stride(1)) + lora_strides_d2.append(lora_a_weight.stride(2)) + if len(lora_a_weights) > 1: + lora_ptr_tensor = torch.tensor(tensor_ptrs, device=device, dtype=torch.uint64) + else: + lora_ptr_tensor = lora_a_weights[0] + + if ( + len(set(lora_strides_d0)) > 1 + or len(set(lora_strides_d1)) > 1 + or len(set(lora_strides_d2)) > 1 + ): + raise ValueError("All LoRA weights must have the same stride.") + + _LORA_A_PTR_DICT[key] = ( + lora_ptr_tensor, + lora_strides_d0[0], + lora_strides_d1[0], + lora_strides_d2[0], + ) + return _LORA_A_PTR_DICT.get(key) + + +def _get_lora_b_ptr( + lora_weights: list[torch.Tensor], offset_start: int, device: torch.device +): + """ + `_LORA_B_PTR_DICT` collects the required information during `profile_run`, + After this, it remains constant and subsequent usage is through LUT. 
+ Refer to: + https://github.com/triton-lang/triton/blob/release/3.1.x/python/tutorials/08-grouped-gemm.py + + """ + + key = tuple(lora_weight.data_ptr() for lora_weight in lora_weights) + if values := _LORA_B_PTR_DICT.get(key): + return values + slice_offset_lst = [] + tensor_ptrs = [] + lora_strides_d0 = [] + lora_strides_d1 = [] + lora_strides_d2 = [] + hidden_sizes = [] + slice_offset = offset_start + for lora_b_weight in lora_weights: + if lora_b_weight.ndim == 4: # shape:(lora_num,1,size,rank) + assert lora_b_weight.size(1) == 1 + lora_b_weight = lora_b_weight.squeeze(dim=1) + else: + assert lora_b_weight.ndim == 3 # shape:(lora_num,size,rank) + assert lora_b_weight.is_contiguous() + tensor_ptrs.append(lora_b_weight.data_ptr()) + lora_strides_d0.append(lora_b_weight.stride(0)) + lora_strides_d1.append(lora_b_weight.stride(1)) + lora_strides_d2.append(lora_b_weight.stride(2)) + slice_offset_lst.append(slice_offset) + slice_offset += lora_b_weight.size(1) + hidden_sizes.append(lora_b_weight.size(1)) + + if len(lora_weights) > 1: + # note these are device tensors + lora_ptr_tensor = torch.tensor(tensor_ptrs, device=device, dtype=torch.uint64) + slice_start_tensor = torch.tensor( + slice_offset_lst, device=device, dtype=torch.uint64 + ) + else: + slice_start_tensor = slice_offset_lst[0] + lora_ptr_tensor = lora_b_weight[0] + + # If each lora has the same stride, there's no need to use a + # tensor for storage. 
+ if ( + len(set(lora_strides_d0)) == 1 + and len(set(lora_strides_d1)) == 1 + and len(set(lora_strides_d2)) == 1 + ) and len(set(hidden_sizes)) == 1: + lora_strides_d0_tensor = lora_strides_d0[0] + lora_strides_d1_tensor = lora_strides_d1[0] + lora_strides_d2_tensor = lora_strides_d2[0] + hidden_sizes_tensor = hidden_sizes[0] + same_stride = True + + else: + lora_strides_d0_tensor = torch.tensor(lora_strides_d0, device=device) + lora_strides_d1_tensor = torch.tensor(lora_strides_d1, device=device) + lora_strides_d2_tensor = torch.tensor(lora_strides_d2, device=device) + hidden_sizes_tensor = torch.tensor(hidden_sizes, device=device) + same_stride = False + # MAX_N is the maximum hidden size among all the lora_b weights + MAX_N = max(hidden_sizes) + _LORA_B_PTR_DICT[key] = ( + slice_start_tensor, + lora_ptr_tensor, + lora_strides_d0_tensor, + lora_strides_d1_tensor, + lora_strides_d2_tensor, + hidden_sizes_tensor, + same_stride, + MAX_N, + ) + return _LORA_B_PTR_DICT.get(key) + + +@functools.lru_cache +def load_lora_op_config(op_type: str, add_inputs: bool | None) -> dict | None: + user_defined_config_folder = envs.VLLM_TUNED_CONFIG_FOLDER + if user_defined_config_folder is not None: + gpu_name = torch.cuda.get_device_name() + gpu_name = gpu_name.replace(" ", "_") + gpu_name = gpu_name.replace("-", "_") + + config_fname = None + # only expand op needs to consider add_inputs + if op_type == "expand": + config_fname = ( + f"{gpu_name}_{op_type.upper()}_{str(add_inputs).upper()}.json" + ) + else: + config_fname = f"{gpu_name}_{op_type.upper()}.json" + + config_path = Path(f"{user_defined_config_folder}/{config_fname}") + if not config_path.exists(): + logger.warning_once(f"No LoRA kernel configs founded in {config_path}") + return None + + # Load json + logger.info_once(f"Using tuned LoRA kernel configs from {config_path}.") + with open(str(config_path)) as f: + config_data = json.load(f) + else: + config_data = None + + return config_data + + +@functools.lru_cache 
+def get_lora_op_configs( + op_type: str, + max_loras: int, + batch: int, + hidden_size: int, + rank: int, + num_slices: int, + add_inputs: bool | None = None, + moe_intermediate_size: int | None = None, +) -> dict[str, int | None]: + # Add support for fused_moe_lora ops + assert op_type in [ + "shrink", + "expand", + "fused_moe_lora_w13_shrink", + "fused_moe_lora_w13_expand", + "fused_moe_lora_w2_shrink", + "fused_moe_lora_w2_expand", + ] + + # default config + default = {} + if op_type == "shrink": + default = { + "block_m": 32, + "block_n": 16, + "block_k": 256 if batch < 128 else 32, + "split_k": 64 if batch < 128 else 8, + "num_warps": 4, + "num_ctas": 1, + "group_size_m": 8, + "num_stages": 2, + "max_nreg": None, + } + # The default config for fused_moe_lora ops + elif op_type in [ + "fused_moe_lora_w13_shrink", + "fused_moe_lora_w13_expand", + "fused_moe_lora_w2_shrink", + "fused_moe_lora_w2_expand", + ]: + default = { + "block_m": 64, + "block_n": 64, + "block_k": 32, + "num_warps": 4, + "num_stages": 3, + "group_size_m": 8, + "split_k": 1, + } + else: + default = { + "block_m": 64, + "block_n": 128, + "block_k": 16, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2, + "max_nreg": None, + } + + if _is_corex_backend(): + corex_default = _get_corex_lora_op_config( + op_type=op_type, + batch=batch, + hidden_size=hidden_size, + num_slices=num_slices, + ) + if corex_default is not None: + default = corex_default + m = batch + + k, n = (hidden_size, rank) if op_type == "shrink" else (rank, hidden_size) + + config_data: Any + config_data = load_lora_op_config(op_type, add_inputs) + if not config_data: + logger.warning_once("Using default LoRA kernel configs") + return default + + # config is structured as config_data[max_loras][num_slices][m][k][n] = {} + # slice by max_loras + config_data = ( + config_data.get(str(max_loras)) + or config_data[min(config_data.keys(), key=lambda x: abs(int(x) - max_loras))] + ) + # slice by num_slices + config_data = 
config_data[str(num_slices)] + # slice by m + config_data = ( + config_data.get(str(m)) + or config_data[min(config_data.keys(), key=lambda x: abs(int(x) - m))] + ) + # slice by k + config_data = ( + config_data.get(str(k)) + or config_data[min(config_data.keys(), key=lambda x: abs(int(x) - k))] + ) + # slice by n + config_data = ( + config_data.get(str(n)) + or config_data[min(config_data.keys(), key=lambda x: abs(int(x) - n))] + ) + + # slice by moe-intermediate-size if applicable + if moe_intermediate_size is not None: + i = moe_intermediate_size + config_data = ( + config_data.get(str(i)) + or config_data[min(config_data.keys(), key=lambda x: abs(int(x) - i))] + ) + + assert config_data is not None + return config_data + + +@lru_cache +def supports_pdl(device: torch.device | None = None) -> bool: + """ + Refer to: https://github.com/triton-lang/triton/blob/v3.5.0/python/tutorials/11-programmatic-dependent-launch.py + """ + # PDL requires compute capability SM90 or above + return current_platform.is_cuda() and current_platform.has_device_capability(90) diff --git a/lora/ops/xla_ops/__init__.py b/lora/ops/xla_ops/__init__.py new file mode 100644 index 0000000..b5570ce --- /dev/null +++ b/lora/ops/xla_ops/__init__.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm.lora.ops.xla_ops.lora_ops import bgmv_expand, bgmv_expand_slice, bgmv_shrink + +__all__ = ["bgmv_expand", "bgmv_expand_slice", "bgmv_shrink"] diff --git a/lora/ops/xla_ops/__pycache__/__init__.cpython-312.pyc b/lora/ops/xla_ops/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d7de05adae7d603ee97dbbe6a132b96b631cf43 GIT binary patch literal 313 zcmYjNu};H44E5zwwY2Jh*b!@n=Ei~qY^;nRWwBBcrKkFm(MWRM90BNAbPBrr0O0t)tC zL=tvEB)|N6B>UN_Fgwg!o0eIB{7+y_`XL|Y_1fuja~qBN(;bnCsd8yt**b11Mc=*Q 
ziIW`bLJuQ`evFtBA-=+aCH&l|##m*wT{NjH@@CO)mlx-oWv)oClR{5mgxTlxONxFT?o8{0!qr2h`wLkipJ$NDYsL)yPTa5=lKkI-q(F_NgLOnYxbG=~`_i6Y6iK7uXEAXFHw!>qnzbZ)J!_+u!PGa!?BTb zKeWQB_;d`AIyGbC**KLFWy4{LuY(@PTWip*5VLG(grvYFb~lO35z|${rrZ^1`-RMayJO!{XCA zjT>-v(zIdVZHS`y&lh@4HPf3>GRMp~9hc2x)7gO&gOdYtZ(6z5o4IaI=vwboI<5BN z>Rvr#^iHSaG5jI|fu78D%X4j9or=LmXag_~z~49x-5j~^3of2qI9c>G7d^qd9^ayS z!Mzw<2rikcm)?2h?N`<#Z@+r`**^^ZerP_p;d#F332fEzes_uRyn72ckeNYbeRNdT z44sPhQM$$Qke#SH!h=P{W2eirmJ3%}PB0OQF_1!FBs2ou4%J zQW1u&I6n*>C{XauKeKph;narEyxRAP&{hmLtpt_>Ykfg zH1OTsuE0giI%i!bQnCgrsDL%!%JqK}MeWV$3P_rD&GI0#s>qb5MpPNhIdgo{0lVHK z47~5EQ6PVw9)vX%BLl^|2H$o|@|2R0qi&``+J;kyptE>Wr-=!Rm(`4Uoj#7#?p^k5 zaibO&v$zohjTzh;nypqp^Gi+FVyN0E%pwEhGIVp~o-eXEv@mqz%-rzYYj^*Xo{PbT z#mR-q8)uPvzT>_xj0#`nR{NHw){^h3@2a2r`tz=STjN0%9j?V*mzD7e2tENlYeE!? znaq!^1~#o|5**p6AiZ%(irF5XsP!qlbWj1=DuTs>hXQD~z5FwUMzpJ?s%qPwV18AN zzq5SKv6o6E4s|r^&hh)tN)8qY8Xv_0Xa+!`4#@$$_LgMliM{2pxyduYVZiiN&ahWT z$?=fA{!9KdV73w`amlKw!mYUIW02W+$(eKi$^rc&c>l_gWc!?f4PGSQYz;=WFM7bU zy=c3`blf<}ARO5iAs%IZ&|^pb=t=P$W;M~aCs9v|Ea!=@XjwI8>XTq*-Qt8INwO9* zlo|PIkGINjr)fUKctrpCuvoU+D&y*6+z}H}&OMA7jcb$pirZoN4@t9Ucc0^7aeJIe zd{V^I8W2!re zn&mjmv0e65*Ka$tc=VdCr$KGMoRG~4nToJN(?Nf_2t*>FY&};l0`Q2lZ>P!+FW=U< zErS(CER3Fy3iJsuJj)3ciRHv-*91d5A96pooSC=;B&-g25+}Zo9X=4~G3cOnV(|&9 zBt=zB10x1XgJwrkAB&{&bSADz^jVn2WyZgtn4?Z!h@U*SLcJdvh-oh2WdBH)fYE6v`=e?a*dNf1#uQ_Ju;nz@5HN zI);kj$UJ{fXxtQ93qtE^UqLvuCcGDTH?STrh{rz>j^C|sSm|8uTou+j)~;@L4i-8G z??einr|yh@q<*OWb+~Z)e4+Dv{%05R7f16Qmp1BuF)w@}M2exd)z{ai*7>!meDB%Y zhc^4p6#CA5e4)^H7RLFmvxW8-3!xYDo)_bD1JAQV~4c= z2RAP3S3H_b>v6MN)H9}{YjJevi=cX_8P#msRG_j0$Bt+43oc#j!M3|bUCc-6aWDzm zhaLJTV7J|pc_gGtVxTeMP1L%dF+mk=lAeI6i%8Q2-5e>2#NY7NrInYLUtYPoe08&_ ztI*W7?!Fz&KYwwfX|&)Q&AUczQ}wf88Pk(VLpCk1J;MBus4Wo?W(CKe0r-{*4&l)c zfSizXL9(qr17IWNW|t%|%VVNtNgK+Fnxrk{+!V^0oKQ~00(gj%_5!h8aNrPh*Ggh^VJVG28L-B+j404nrXAht9xi7?p+1R#>DaUfL;&{s4%QNUcH#QSO}5;2X1D6jX{HUbCmk#P z;Ykcxn58d^XU@!g$#N>1$<%X`u&c6F=owOVDVvs0)1QJod~F)HSmhSBS;s{fxBKMh><;@IB6* 
zDEcFdBMT!XC&2d_TUJJvM@l@va1Iy^Ef19h9C=7Nv$WJ1T Pr2s-f*xT30k!kbapZDMk literal 0 HcmV?d00001 diff --git a/lora/ops/xla_ops/lora_ops.py b/lora/ops/xla_ops/lora_ops.py new file mode 100644 index 0000000..4924890 --- /dev/null +++ b/lora/ops/xla_ops/lora_ops.py @@ -0,0 +1,141 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import jax +import jax.numpy as jnp +import torch +import torch.nn.functional as F +import torch_xla.core.xla_builder as xb +from torch.library import impl +from torch_xla.experimental.custom_kernel import XLA_LIB, jax_import_guard + + +@jax.jit +def bgmv_jax(inputs, loras, idxs): + return jnp.einsum( + "td,tX,Xld->tl", + inputs, + jax.nn.one_hot(idxs, loras.shape[0], dtype=inputs.dtype), + loras, + ) + + +XLA_LIB.define("bgmv(Tensor inputs, Tensor loras, Tensor idxs) -> Tensor") + + +@impl(XLA_LIB, "bgmv", "XLA") +def bgmv_xla(inputs: torch.Tensor, loras: torch.Tensor, idxs: torch.IntTensor): + if len(loras.shape) == 4: + loras = loras.squeeze(axis=1) + + jax_import_guard() + return xb.call_jax(bgmv_jax, (inputs, loras, idxs)) + + +@impl(XLA_LIB, "bgmv", "CompositeExplicitAutograd") +def bgmv_non_xla(inputs: torch.Tensor, loras: torch.Tensor, idxs: torch.IntTensor): + T, _ = inputs.shape + if len(loras.shape) == 4: + loras = loras.squeeze(axis=1) + _, L, _ = loras.shape + + return torch.empty((T, L), device=inputs.device) + + +def bgmv_expand( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + add_inputs: bool = True, +): + """ + Args: + inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size]. + + lora_b_weights (torch.Tensor): LoRA weights of shape + [num_loras, lora_rank, hidden_size]. + + output_tensor (torch.Tensor): output tensor of shape + [num_tokens, hidden_size * num_slices]. 
+ + lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] + indicating which LoRA matrix to use for each token. + add_inputs (bool): Whether or not to add the input tensor to the output + tensor. + """ + + outputs = torch.ops.xla.bgmv(inputs, lora_b_weights, lora_indices_tensor) + + limit = output_tensor.shape[0] + if outputs.shape[0] == 1 and output_tensor.shape[0] != 1: + limit = 1 + + if output_tensor.shape[1] > outputs.shape[1]: + outputs = F.pad(outputs, (0, output_tensor.shape[1] - outputs.shape[1], 0, 0)) + + if add_inputs: + return output_tensor + outputs[:limit, : output_tensor.shape[1]] + else: + return outputs[:limit, : output_tensor.shape[1]] + + +def bgmv_shrink( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + lora_indices_tensor: torch.Tensor, + scaling: float = 1.0, +): + """ + Args: + inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size]. + lora_b_weights (torch.Tensor): LoRA weights of shape + [num_loras, lora_rank, hidden_size]. + lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] + indicating which LoRA matrix to use for each token. + scaling (float, optional): Scalar multiplier applied to the output. + """ + + return scaling * torch.ops.xla.bgmv(inputs, lora_b_weights, lora_indices_tensor) + + +def bgmv_expand_slice( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + slice_offset: int, + slice_size: int, + add_inputs: bool = True, +): + """ + Args: + inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size]. + + lora_b_weights (torch.Tensor): LoRA weights of shape + [num_loras, lora_rank, hidden_size]. + + output_tensor (torch.Tensor): output tensor of shape + [num_tokens, hidden_size * num_slices]. + + lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] + indicating which LoRA matrix to use for each token. + add_inputs (bool): Whether or not to add the input tensor to the output + tensor. 
+ """ + outputs = torch.ops.xla.bgmv(inputs, lora_b_weights, lora_indices_tensor) + + outputs = F.pad( + outputs, + ( + slice_offset, + output_tensor.shape[1] - (slice_offset + slice_size), + 0, + 0, + ), + ) + + if add_inputs: + return output_tensor + outputs + else: + return outputs diff --git a/lora/peft_helper.py b/lora/peft_helper.py new file mode 100644 index 0000000..975c3d8 --- /dev/null +++ b/lora/peft_helper.py @@ -0,0 +1,128 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Adapted from: https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/config.py + +import json +import math +import os +from dataclasses import MISSING, dataclass, field, fields +from typing import Literal + +from vllm.config.lora import LoRAConfig +from vllm.logger import init_logger +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig + +logger = init_logger(__name__) + + +@dataclass +class PEFTHelper: + """ + A helper class for PEFT configurations, specifically designed for LoRA. + This class handles configuration validation, compatibility checks for + various LoRA implementations. + """ + + # Required fields + r: int + lora_alpha: int + target_modules: list[str] | str + + bias: Literal["none"] = field(default="none") + modules_to_save: list[str] | None = field(default=None) + # True to use Rank-Stabilized LoRA (rsLoRA, see: https://arxiv.org/abs/2312.03732) + use_rslora: bool = field(default=False) + # True to use Weight-Decomposed Low-Rank Adaptation (DoRA, see: https://arxiv.org/abs/2402.09353) + use_dora: bool = field(default=False) + # Extra vllm field, start with 'vllm_' to avoid conflict + vllm_lora_scaling_factor: float = field(default=1.0) + vllm_max_position_embeddings: int | None = field(default=False) + + def _validate_features(self) -> list[str]: + """ + Check if there are any unsupported LoRA features. 
+ """ + error_msg = [] + if self.modules_to_save: + error_msg.append("vLLM only supports modules_to_save being None.") + if self.use_dora: + error_msg.append("vLLM does not yet support DoRA.") + return error_msg + + def __post_init__(self): + if self.use_rslora: + logger.info_once("Loading LoRA weights trained with rsLoRA.") + self.vllm_lora_scaling_factor = self.lora_alpha / math.sqrt(self.r) + else: + self.vllm_lora_scaling_factor = self.lora_alpha / self.r + + @classmethod + def from_dict(cls, config_dict: dict) -> "PEFTHelper": + # Get all field information from the class + class_fields = {f.name: f for f in fields(cls)} + # Check for required fields + required_fields = { + name + for name, f in class_fields.items() + if f.default is MISSING and f.default_factory is MISSING + } + + # Identify any missing required fields + missing_fields = required_fields - set(config_dict.keys()) + if missing_fields: + raise ValueError(f"Missing required configuration fields: {missing_fields}") + + # Filter out fields that aren't defined in the class + filtered_dict = {k: v for k, v in config_dict.items() if k in class_fields} + return cls(**filtered_dict) + + @classmethod + def from_local_dir( + cls, + lora_path: str, + max_position_embeddings: int | None, + tensorizer_config_dict: dict | None = None, + ) -> "PEFTHelper": + lora_config_path = os.path.join(lora_path, "adapter_config.json") + + if tensorizer_config_dict: + tensorizer_config = TensorizerConfig(**tensorizer_config_dict) + tensorizer_args = tensorizer_config._construct_tensorizer_args() + from tensorizer.stream_io import open_stream + + lora_config_path = os.path.join( + tensorizer_config.tensorizer_dir, "adapter_config.json" + ) + with open_stream( + lora_config_path, mode="rb", **tensorizer_args.stream_kwargs + ) as f: + config = json.load(f) + + logger.info( + "Successfully deserialized LoRA config from %s", + tensorizer_config.tensorizer_dir, + ) + + else: + with open(lora_config_path) as f: + config = 
json.load(f) + + config["vllm_max_position_embeddings"] = max_position_embeddings + return cls.from_dict(config) + + def validate_legal(self, lora_config: LoRAConfig) -> None: + """ + Validates the LoRA configuration settings against application + constraints and requirements. + """ + error_msg = self._validate_features() + if self.r > lora_config.max_lora_rank: + error_msg.append( + f"LoRA rank {self.r} is greater than max_lora_rank" + f" {lora_config.max_lora_rank}." + ) + if self.bias != "none": + error_msg.append("Adapter bias is not supported.") + if error_msg: + raise ValueError(f"{' '.join(error_msg)}") diff --git a/lora/punica_wrapper/__init__.py b/lora/punica_wrapper/__init__.py new file mode 100644 index 0000000..e664ffa --- /dev/null +++ b/lora/punica_wrapper/__init__.py @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase +from vllm.lora.punica_wrapper.punica_selector import get_punica_wrapper + +__all__ = [ + "PunicaWrapperBase", + "get_punica_wrapper", +] diff --git a/lora/punica_wrapper/__pycache__/__init__.cpython-312.pyc b/lora/punica_wrapper/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..92037bedf7e9a2756e9bb5c467127b2703903707 GIT binary patch literal 379 zcmX@j%ge<81UW3-nU;(U439w^7+``jKC1v3(-~42QW$d>av7r-89{8O9Hw06C}tp= zIfW&iDT<|%Rg>){NQ)-pEx~}&yv*dp@S?2KczG$)vkyeXfQAc8H#0r#0O?ZM#irk42;fc0;#%x^Jad|o~BbRHr%8IhS{ zM3?AJyXM?e?zy_DIv1wv(w;f*l$XRkY2TDj^oqW8{Tw^R&iSYOE=KaoK7gxF2j+rP zK^Nm@US>r0DkJ(O_eXWKjVX@g1|T;mab|94!4u;aZwC$wiX`&c%tYYGtdtq$PsT_2 zx1>>i3Vz=#KpcLjW((O-{wTzbNQ+R0FkAQ&a!Q%a2y+EFKf5r>PsaJ-9ou&7jKu?I z3z<|>nBY$p()rZbIVmG#^8Cr{Hz)bmCHZnHb0Hp>&F6E<#Q3-%&!;ZOv+{*;VOkmA zv2**j`1ZZKckKy$iM@}xwYtf}N3;eg%*#SDKPTm9vtrDph0eWo_E_S`E60wUc=hzl zr@wSjXfBzZ%N6oc0ssXqoXln}OL9IjC**R_EYu5PH%`%Hk`jo9&3fYL-vZrkG7>Z8 
zf?08kuBkf7Bf7;p(Ia^!n1RK5hUoxc(Fdt|_+#PEFM4MuxMk&b4n9%=<8R`Orm zq-R-vL~h27IGjtd1u^7YCN`{xQw&Q%n1APXN$+O@V#Lx@PV)E}KO=TQf1PWE)@#$b z9vMgDdUeGnfbba<5u1^k&iE+pfzF$*dQCcqu=iqgjlMgNsj)^IEmFfe?FYnGOZ#D% zXRp}yQ`d%ju>MT0Pdl7zJN??%r8nV$b;cMGJFfan8b`ju7&k`!_8PPDJbmWGPD`H~ z7re1WBV78(KE@zX%bV5ljN?Jsk2lvR4 zaR=!F_U?R66!H?EpOtuQYD|{WkQR9qYEUAWfDc?+^XY8<)wx_+nv*gBb4-@A@`HNK zqe$r)nS&PO5C+(y9L695f#$pPo*-XP6p%Cwwg}&Uofyy0<;HWV+*lsUr2N=?Vb8w3 zm-a~G>D2UiZUH23X8dwGJvWY>8qX0SpLmaw)<{mnb`sAmXyF1G1ChhnV8) zUC@=MxMP06JnLk_{YAF#DWn+9wGh`_^I8L$3?ZT9h2$kk)VxYkNJ~j$Cq>Y;>3h&5 z+gVAT$;xvI|MIa@r+6OX{2XpoKt*0jr;?JwXJ-iM69Q!nB#?|-!~>t)3O{lbgKi9Z zFzCgg4FUz{8=(Qd9im!55XFQtE2lD-7Q1Y0hcqY`jv$6k43H4|Xo0)*(T1{xJZx+U z+C7cS&dewhBw?!ZNt{LFTbl+i)u6#zB$%!w0b5Ce0cczv#9%W7bjYa;bXTD?VIh~0 z#7-4yJ||?v8dRZ$`=F+xGewj#1)+~T3g_#*img7i4 zgL9B;*5;gD=FkZRfl9Dg@>xEe%1DB2iYRQna;W}Fej!VSjF+q%!a3L-I&u`f2Dj*o zC~OX%Hs>HsDj>cENDRj5w=j+=kJlgZtM(Ds=9!b(3#q(vR?dRpP_nYYER|%aS~eAA zC{v(%y;ul98e5Y91mco~06i80JaEfyC_4ARbuQMVv55p|@KPe71rmw5tXN26oJ%C$ zEeL77q&|@lv&lq4_CP(&lTImltuB?x%c#L>-h2VXCj4aO zgd8K)rnA|!Jc{Y<80>=J0hM(`PRh71k^sf4@*mL0tIT7!+sl?%#gc&8-AUh0FCQuPjH~V2 zjAkP(wAltG5dJWFJ$lQd`lC<64y`sY-94+#ZI8nt7*sGqkb5Bx1LGQN^`#yGSf;vl z%nvZ_-M3!9J88XbI+6Jhy^>uE!Z*TG%m~!RF5}^`5Xh(4q>dpI`S~)G%<~>Kap;x zZ=~-YDYg!&k>TZCYGk~GT`0BMh|vaFIx(`Jpa>-_s|cP}t2;(GBW2 z8l_;6UPKMA;*;vp@R|Wr^{Quz6~Raa6V(s!e$#XfKq?4-VETbE7y`He7?$3Pxb`YO zF)aFLJZLaZaZ(73%NlUbwQJ;hB=~Q}hXQ2AEjC(=(J*Plu_pl8WPyyV3E6CcY+Ms^ zlLfMAO~@7tWb<>-wH0W#Noo<>YSBz=*WCb)<0a;EhcgAm&h>DLT~aH|mLq1$2{7|f z%ZS>f7UBkY0s8A)t7E-3o$HZtbgx%e>;VXmK@qVR;%oPw6M2gn-j2N&_pH%(=P`9V z_gUWs7*cR9;`o;>(dVB+D^a0b?Hr{!8&7%h?^}Yb=OxI zqx0BnQGc+;tUOPjIdO}n&r!nDd&Qv|7>9w+V(>Qte<&5V)_@zWkxzRxM!4l`kc=~R zL?$+g@vGjcZs6#Be*i!Eo%@QI#ctRM9Or_Rjyso$+iK)*uaUo_M*dEq@UGfao@o_# z!?=4E{ILnG(PrGBn|kq|h(=zO&x08zN=oWN27Dx7lFSJi;ey1YiI$VIms6r-dQOOq zhvr$1crOUPOOl+C(sBMAWX}ny3_nxIB++C;yaXL6yO@vlz~BbYOe%x^17g=9MuLVq zyNQ)ZnueN2>sS$RfY(7tN)=t8R)W@wex*!dE-@|Sle3Zt{-#TyN^3#Pr+JFz6Q#@G 
zok*f0ZhYu(AU>sEgUSp(CA4eEcM;Y6if_qP_Bma&V$tE2>OOWECj?7OqRy!PwPH2f z{aUe_exS9nitgv4f!;@;f$6^!nG0|tvk2`#u3k7PN8u1WGOFtPg6aQNdD^9$fW zc^NXlM0WlX=Y`m>U;Yn#$77B1OOT^^rMX;w0c_K3=7QEt*A8`-fb$mpd0Hf)2oc&r1FWxVrP}Gj2mTFp^n0WxB!1^12Y)AzyIF;Rj&Wz&7X`s82Rjt zXLatuz%|c9mVf4B`nKE;-V54G`$%awln#&GfB)Y5_R?WeItZl(=1?)b`Q!7SOg)(T z!dFHP_S^W*EfWhU8BJ$}YprTduF1eQ6uq-@F!0=9D z_#<7YyaV%g#doFt3VX$WB>=Fx*g@pRCo)UEi>44Va$zTT;f~$OTr`(=xopYzk-iPM zKv-QgmueXTu-0!FYrPGtxyWr;*$rc5ZCFk5V8iOKjr9j^7>jc`Ib!{Ro7z~8bn;u; zM*5oq$lAKdbTJ^1>P{2EBja4e{4!2HU4dsH2B#6TLJr(u$!u;RA)m#~Ifw}- zzz2X+si!g{%7mh;T=OaiVaqDlN+OXDPh3CoG*ll7JP!H%fmN=3mFoh@u*&tUa&6#S z@rPDeUS;`W-`=|iRt9#f1G_&J)PcQ4cHatnP-PE(!98S8t+Lz-+pV(Q#h$%)!z=u5 zl}9k%0v3MAo+V&|Dmz%*e7rdH;>yqwb?C?!lj_j%BKz_RdrAdNTt7$ye*gr5(ex@8 zD>m>e4O`TPEqC8l8)8Ms*!oIw?BL4SF?HQx+|V5 zUMdxwr3di4+fh;&5&_$)_?Fy8D?}ogibfd>u-0uDYn=_Nvz+m47^}yE6;%dP`YoZ# zV2*C79#ZB=mRLxcvJlp*o37VbK*k#KISl>;g9HZWF?a)mw=j4U13WX3--ZB|H#y{! zAg(pRA)L9})9KY3%iDcI&uS*qZ)gcR8#v%ru#e`tr2BQdu`2aFzkF&sQhQ-m!u}fY zD!+sP_VXqt)KTmh`3G+6<3<*jA*{acRj!vTQ&_3+f3vQG54% z=2CknitPRsc2Z?0i-*rWWY3dz)}gW;#m*Os(Xo~2PBpso(@8b@LXq9G!oH}oFBV@q zV_%R*ibE4CLxB7_BH5*u^_uP(@l8L`qS29?!`%c6N51^3^D;( zK&ueBiQ5|HR(zt%dC7Dlywm2o7QD%JCD&@vXtbyzAhzSn_ig|`4r zz~GN_c)|!)B&d-21;dtJYCWn%OlmWYzoJI4wtdc_GxdO!w zib7YB9qa35_NI5n={dZyuuNsziUOA&7}!LwA0@`O0LD9B+kg(5L@(pWqLpfzIQt4_=r7o1Ram3kv+gF)y!+dJ4FjpoeErcPE`6R8|k-QLxIR^&;F&!uplIFpb zn3iOvM%B|X9~Fc|FqLJXDgI(EBKZ_tdHUIFq*Bl<_E!s~N|=9v`pPr}A221J2{o>8 zyvo6*zvJ$%`v>nGEOI+nxyTCFt#Yu%kKgs&58n$HxosqKSmlO`v5DnfpB#JukYJQV zK0I^%jQyt+s*eOpOuauq_F$W+Krul?1=@zSMFr~5S850VbQ5$kupsLe|H@>b8mke# zJ5~;ag(Sl*fLssR5x`z+0_$v1ps4%8MT+ru7f=abGZLj+(wf^X5twy@K$v z0KkiV)s(KN$H3}SMhr6d4y^RV)t>nByK2w&BHKmiJgA02>D^b{vacA1-Fo7WP3hN6 z`q(M0dpKcB(zhRQdJ-=RuaC(P%VqH;zhT*M%YK??-dirZnF&6yb|pG`(XDY+<9Xv7h4`&8JSc^CLeN#S9|)e z9sTi{M?C}hL&!|7IL0gPI6nWCJK-15Gn2>9Zs^9PS?zTR&T>GZQYOqSEtK5eiiu{k=*R=?8w`(z4EsgOa9=-1xN8St&-h8&ras^EufV}d3QS%Za+q=MaHp~dP@b<*0-u49T%T`jCufz@^l3k+E}){ 
zEnx3}WjDG2tUPUJM#8D)FxF@4X_KjKRI`!PzyhbNbVUM}AA_eq3zQIm0Sz{Leaj7M zei$l36*R_(8H=mKuw-FcczE9=N;3jn5+_H(+iaMSLzgF|I}C!<^G%lUXYqAP!8T^9Y(Lltr)+{3wTPMGmQy zx}(S+4#payCmk^LtrX0!@t$JT$|qxhVuQE8g~Om5+L>kT3*aVfjcKhVOl$7qNEUM^ zH?d=`@u8567cvTXqu`O^@V1yERJ95~-9}J(l-}isXFl_zJV@h-YAKCvRmrXq-{pH=jq&SoqWr|WtfuIuVk0CF$hN#i${Hhuf> zJ(T}w@_?WJxXZGz&9i9$SVxa?S@o!OWPeSEh{>Hb*0@eIN6ovec3YXB2z$L+9smq3)ly%xOHMtq5Q#|2IgF3~$KLCgo0297oq4c@m1IdGyU;H@}3 zEE2T-{G;-wcWALXKT!p|vGH4xhS&8oMV7Fb!B+67=+*)mb)dc>fWwLnd zWN~D&bi+}L=%l9pUowUIWMI6tJe>*9kn_epgb+E@Kua79^Gli^P$n+n@kG+R;{~Hb z(Nuf7E}B#}Z94&OW1=e;6b5*)3X@Ylq%vm+6m+xVsq*w3qrAn_=^^j*%2J((;#ioT z^Hx0O9KgBwa}2Iwa2q2-avqhOGy< zRwpe|!Yc0|P7i>~kNe1I;U$)|rjlnu{UZ=mejmGquSVtF7`y-hoPHmVxsJzr28U;fx&e0obb?|R1$nRj@ehi3@>LA9@N2RgG`J?6i z*wYZk4!fGYy`OFV`RM1PB?h9u+VvEFN+XP~q2zAy1|Invt`$C9y1sPl{GE4hzq7pi zlLHS9eAfI~@{66t@Nw1u@)IvW;S(PUUy75M&%5vLvHK_QoqWPTR4;}Yycj}2pLhS= zmiy6rQ7qnX7Vp=Kz1~UJviB420r!N#xDJ3(xI;$~9Cei&IBG!nyhmN7NRxMvPy~J} zDS}y)A{e6-!5F0o#)ipLxi<17s@C%~FBAi2zHqUDu+4rsT{^Cp2|U%-XAPHk-hhYjR3UB2wRxw=2($QlfFv%u(DTZ(5`YqTYZo!Tzr-5O<(_1;rr*GvjW`8S( zIW^zPVPVBvImptxi-q8Zfd10Qa$T1GLfq0{OqHpDsq$kvOjSIV!>o$Oa*(CBiiNMa zn62Zhy@QX*d%~u+o1-^IOJ01t=fh{8^%!F@=Eqn7rG)-o5T1wX?*%)1??i7$OXRI! 
zglXx#dGf}|ySvm)L#0Nn)Wmdj)3<`ylj?d!M!->|qzv#Xs(CwTvI)mhoc> z#@|gPh1xz+@?#bd!IBlkEMUSYzF91VFx3Ef?~L9aErl^<^s>~5sV2OR%iZHj&6wK6 zw0D(SFxHAsVB0VjW#|~6bwi2@SCp%<1|t5oI$=5~<1S2J^!nhAHTb(^JoeD9Q-dCp zR_MJcR1LKV{<9BpR9WL>A63Y|k3rw;5{Ye>;u?E^z9oK1z6I5B(^qiMDb=}LuE##F z%k%pN#x?lQ4F6k(`yI3GU+W_O?iqas|CEud|0l{bI8AZ&KRaEQ^}70t?Zf}cV1x|g Fe*w_~wtD~o literal 0 HcmV?d00001 diff --git a/lora/punica_wrapper/__pycache__/punica_cpu.cpython-312.pyc b/lora/punica_wrapper/__pycache__/punica_cpu.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b34ad7d6b9d1911d7d4b68ac1c905f88f89186d3 GIT binary patch literal 13293 zcmd@)Yiv|kdgspjX^(BNF$QyE0(gS436N%?KyY{jNMb`EBx{0(>$!LAfjcwKxz`xG z<6X2xTCyvI2<65RM(#+dmkg@*I2NU+XR*e0Ju(MA`2=> z2?K(FutRkWIN;Z*I#aFzSIRx$PI(4ADer(cb9hr|eK%vJ?JYiu*#Sco2sb?}V3-ghzH?bq)Ar5A;-hFUhk_>?^gl|pBw<3 z!H0}92slFz4QEIR!#cx&v+g0|tOK0&4-IEMoL~h0alIkA0dPmYCESgGyTMQ7jo%Xf zrmOCO#)pQR^Vup`a}(fPRf8K>CmgN7)YV&&|7-Gh^GY~nzJeB5!D>GNx;=WIy?y=l ztyys$t&@)0^BCA172AV^ltuI=eedjy^seP?tt75xLf3P@`j%X_t|xI&9O3N zSKc$wUM}6YxFVg_vGYdSMC3=VdIr{f?e$mRS1qcEHKm7UI8B#yMNFmvPfB8)0AgsKA650_ zw&yZ0^mehSVWc%X4g?$uO-OO2u!M0vL*v8dQpC)N*3A$Z%_-?}EIpcv4N7`^Sdn9T z=Ax3;j3CzYI?eFN$|W$XaaWO{@&Fu!|K(kfT_-vkPL?h>EU56CC0QZs$T}BH3<4P^ z?+SfU2SpTyFBVItlX@%`bsH{CQ4_Qds%brD5$xc(I402{O+&8(PeuIi<2`z6q-P{K zvQ38;Mc;OL^vT`dx%i~gqb3J?M#lBwOuFZis-}9>43&EL5yZy0J-3SSkAon0aK>WF{sqOT+8>zLbhGy2rWZJ#9W_+DIwMq&u#hH%;Nj>YsC zxGIKQi%Y5!w;Uzxf-pKa%P&B7on*-c_$^OV7$>^v0uh2#%L+qI=vz#EbI?zPtZPPi z-}5>_ic)kWqE5q`&ctxn)DLANGD4L^QdMIXU%u<6tDu3d!3_6B*J9QNS%JW`8Hz@I zjJriK6URh_Jam`!*o;_qLw1GaBP0}gWABglE`=I@%q zT(~3e$8vy#8}9`HqAf7Gglu3hINAOPfMQnT!~xP&V>#D?N%{h592B|MXkNK4KrtEPLW$L55&U5gv{RZYv$k9N{LS6+4ySOlwa@?f9p5wj{C(rGOd_Evx)B=ZK}bwX zPAG9a6D!%gzyqv@r7N=T86oSb&>5U3d@woZ32#*u0YM<4TQOtYfy-s&5=Bn1)fT~6 z6lU?j#O6x5G3i#)?-f9&Jr5bEj#b5~qqRtN_{vlVmO*u_&Ib_+k#K81j8Gj3x7@1- zh^v!g%A!_eT6hlt53WwK?yP6Q#8a$N@Sr#kZ39TFAj+yzqL{q2X$6|_ZWJ*-0vTwO zMld}Li0+I+($Hb4BTY+400$dPqT$m?wUK~HWr_18{Jr=OFnETId-FNwbX*|!{ 
zSmK!!s7dDibctibN$_F7Q_@D0I^G3X{*B%JnZwUNFJ=-Vn!hcB+{eMi^nqCnBsGgA zX*aBxZig)DdjL*1A|?C^KAVQrMZ}mm^M_+_SD=oYr#;bLdBDZ(--q!Ux=z6CBH7yW zw>^K;Gj(V&wE5G}<{SFrQ!nS9dO7bT{?@x5$YvX6duPYyeIImv9Ju3qw$eo6A+W># z7XaX}g-Lf-xM2HaP-!Mvfch%tQWnq?STI{Ah3dyokzLFKw=m)_V;3YzS(@|!yLc>i zDR_Ud26X(&Qr5j-u6R;_s~@M7fTDf0W&BjkE#f!f5~;u?FAXbc@qB?zfYr{6IvrKI z#H0>hHMk1kk%OZ?EI|?eG%-F(sgl;!apZG*t){z#9{dmt#^?b3^7EEMVYZ(yxc}O4 zW>l5MK}7_wQB~x^qWWPZ9ZPEFB5-yELJ;&w%JEz1%~mV6u$NVPk}v)T65g!2E+v&Q z@Lh)`xcAxt2Si28HbQ22u`MQP(dyO#9Z16svQ3AKfaw^as8nb~OHz2B+LFTC{~XY2 zXCVVgX(kPgQ*O|LZydRHWO~Q+tJlY7#%AfvpWF&<#~ zKbPD69EwpBic!<_83>iPv2>Azx8{FoLfi8qz7aUZB|(Vsq&!AGAEcr zu7ZTU*6t!qhW;Kz+GYS#hBT4sNv1th7)p+SDrL4;6O-6wWl77sAPRNydZiHeVjEK&^EGxS2J zTZTgHd@&8T1ZVY8II6QOmhV0dae9V!iQV1Z=epqX0OEVI?p$<_*gvX+;S_b&Nwaa| zf-Z}l#l4~PF^ombz9n5tcLw8X%nvwt*DabjrX+`k_0lQX4#m!dnvrzqdkU0XGCd^1 z9R;k!vf^+pp-^V(?KMyg`TC-6`Xu~P46qD0qr2e*!qdG_QK^mgVPDinbTtm}b1yR~ zU#?U|ED|&IP+Nt?#xRXH{xu-7eA8VdTvt{9eBRpr6K!t)tqnW=t##+rv87OCG1Q(5 zwa>mf*ZO95{^SRBH^y_@kA3pQozTgzSuAc}4KL2Z7F7RTidI`RTf|uZox}?*!H5|%nEd>YOq0MGh#2f{smZ;ljRW9l{ zh;f^^Pb@h_5OjMo?1TwjVoF7*oDbhYWo{L16B5g~N^|Q%bq;silQ$9?>8}1Qr zxDGD&!fR&t&9~;-cIHAmuRWV@0)+-~JLtwTzpBKILV;(hY7l^j7M0crGcY2R&J_m{ zy%3@$!GlMJ1W_GP;>iR!^(y%9DC}o4acS^`M9~FT52pqdSq3xpQc@qryY`h8#yHBM zZRTx;f0#m9Cc(urS72S_p$eHRau6(YU63>LJ;36=Dw1Lf3cga5$_d2QJW|*KM@plR zRhncb47Lau0W+Si8Xh#_z$PH6lzjOjrJy2+;K1H=OD^K0+;xEY2| z3_OFQiG)IFAK=MDa@2NEUI8UWn|Ed_QkO8rkr)YQ4 z2J9Vtb#Fj?2_B8ug~TuhIx>Vj+eTD)R4i39e~@6Jbjk|Nwn9)+&~l}C?i-YpcKsTj zM=1iXTEx!&5q!v%)S8hT#%{$vNna*iMDad~H+0N1iJKaR$swB6o$$Xr2iadZo`vgJ zG_L2s^_Ub2x}e(K#5Lyy`!)gwn@J`MCjM)nTXd_s>g^w3sG>RE6RZ!pek@!U1oGN5 zcq8$?b4&nxfxCFuVYp>|9AZ9vTTP}H(BMuwyXzyh)1;d0oCu}jp2|$B2Cl#Z`F7F zEI!|TYxC~g_5J^F>f`md_Vs^S-+$BBpJVyW`u@*?4O9EkNqjXId1NuNITzVHM{h+Q zo9eyiA~kaRpA-gj%my7$r^|PKiVQ&8iPvm?D zZyfp5cW}wq!W;0ZHvLtiTqvJ;wEb)6p zu9t{hp^V1|oS3eacd{IFfgOk05|rC|IofyXwzn zhLXB=f@a|L99k1ATKrIdr-}O2uN2B&}+PI zEQQuChBo9v8|EDBwPOi2FjKbI&(55kd*q$XZ*QJ&_^asr$+`BAMt?r};p8VLfBEt+ 
zUcSBh%+!(3eNFH+%J=3j&FhOj2Xj3KKYsDH?^*aNXZpx&-+ZrCd-KF;_?)TA+elP6 z4IgWv)jl0vi{ERdrIZqj8G%?Vm61nPghR2|tD}-?ws>PPITMF!CtT3*!@?snoSIJA z7gvUdUtk+!bn!+ zLmBuLr(E4?PkFR>%A=)ILa}&CC;J-Id8^M2OAPnv8$ZAdJ#SYcbV|{?ESwDMfZsY9BGvCh#v4#@?p+*v@TW&(=6#+>bY;`|j z&kAtS7O;d{aXxONaXW-J1eaowLOM`KW z*EdY>87=r7YByTeZv60;1vxQ*C^JmxSTV!JKgq*LihsBn9AV!L!&kVP!oHK`sw{~e z@Eb{PihO&y0^UzkP=}tdwiU9xQxJqNeMH#wYZCgc%PTm3OCbBQkqBGw1&+Ce^*38O Kza$7SWd930G~5FK literal 0 HcmV?d00001 diff --git a/lora/punica_wrapper/__pycache__/punica_gpu.cpython-312.pyc b/lora/punica_wrapper/__pycache__/punica_gpu.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e3f4d7eeab94a369b6d8d631b0614f7ff7ae1d70 GIT binary patch literal 14975 zcmc&bTWlQHbu;^%-JRtwNs%H&QXG+##g(WfC0mkB$(CtFlJ%lwk&b6wu!)R8Sy4 z?KyXLW*;1ivJ<3N;+=c%oO|wL&imf|cdyq?L5kVF6Mgm>iuw&!^yDZO))t^}lM<*A zN}vTxjE-AIEHsv_G3$sGp0=1RZXdCeHhatwcaAvY%m_p3j+iU%9&yu@=#*H1b;dk# zc7%<4N4)Wxk(#(~#7E$mm_J@SQU_(1;EvVDw~TCoJ{Ajgk`g>`Qvxe)S+;63S}sm( zq+!My@Xp?Hzr@R;z$Fv??qlO(qK7*h?BQM$d$+c z-2l&EdoutDE~sIPN3=!U0t;{+!3)@$)?*WDHfZ;W8};xGh#$rbeuXj3_n}FwCAi%}GvKZh7-8GH^q0oI z`L1ogX4qGua@{Q(d?WJ3$*lcC>)Vcz`ma1|^F68GVBF2;!wPLMpDlzgdWG#9;Aw~N z$-&;Zdf-#j zohEOeGJhxR+MvCAgZ97%?Ym)z_iVh&kw&2h*4;S61oo-^3h{OF!ll{&c5{%bo4KgW z38EYwOMqffl3bim@M9teLXt~K$*HIya&Zu+0;cX;T;~w~1 z`vS6?l!A(YqzQ{&Dm^n)hR#?r*5x7wn#$M&8p$9-g$#HF1iF?c7`;?_W~hns@ry42 zuk}NVuEGJEUGMQH!KNE4)C?7{4yrEP6;zFKfR^fEq<~YZfrrY3LeWH235BEvs7g3} z)fVR`MYWD>Xeh2J=Ma*IS`VKqo>D3r19sIWi?N7QiwNp4!vR#sa&22>I&w_M;xpO6(NA0MMm}dQ-EVGR^yHd* z=561t`J$t5wc|jp<3OJEzcYAkFx$HSlY@_}mQMG)?Q^F6k)P_?zuI{)*LkoCpbG$g z#c*pd5T!bcTc!Gm4H8WV;xtfYC@Kt+%~jn~$uK_}lB2Vt>KCUKFjPt{VdIMdE~KY2 z8qLrXI6_qdnlV~{#)`(E(F6sJb~Cn&eYr?oK)b9QTHY7G*o?$g=%(L~`hMkKucXnrA7zAo_ zxeIQ}dx)0<;@@7PxRPUiYfe$Kp1x^X#%TV?ukOy8X?tNV`U_8tEfb3$sv zDbh14J+0P|E#N~SgTs@e5LE4Qn2(8JL%0AaepEtS{|JSfRK_x4h(Ks4CIQ5rnLyNp z-*Qp(%~=!!Gm6O!J!S>6DJSJ3z6qC(x%7k)a2{1ju;_%LKYvJPY&YnCaD1PlDe!P? 
z{B4-9yZC{)3!pPV)5zS(jIySDz6?n)3xb8?S$twUm1k?ns8V6b$a2a|!3BD6i^+w+Z{|iY%G8qgw zRNGWkyb829&I2HRbB&K($WI~E5}-mrDh3=HcLoq*H)eY<>%ojLCK48IgwB_4UJyc> ze>dB*P8pP5UVwh`2xP#GJ(RcRofFqiT;G5F%FU}ct}aM7-dbTh=jnS)eV+B^*=>*) zoDN_|hN`Js^>*dFU8~+;&Kq31@{_3#rta{|Gb`Tb3N8e3Q{K8q9w=#CsPP}jc#Z#5 zZ<2h?kVc3qn~X$cQNh=eqBf=1PqF7Y+`)~ zmdS3|63c1o+XzuU1DF6N!Ma==RWVpMW7FF%dzf?aUDYxzD6+J~M%gQF^C7#)6&o^b{kem{_)n@|j7m7)UKeIrRR`glo!2se@z z#>cOg7TAf<0rxytK^7n@j42`zrkU6Kdq8d>we)L`J)|#@H`6G0iW0{d&J2sz6Gas1 z{6`3)V8sj{FNi872oW#FEH{@#qlP4^7_wr=bS&U87-H=Blqk(;>jVraPAdANFo>eG zl)3KFWHJWA>}z6#2Tegdm|-a`mewc*fvz5)MMXrIt?6sr=TKH7{1`P%NLicIvvg!nR(ci8v)YVQ#zp9jv0^J4VgL&?xL8Fo zma#_YqJ3f_5Y0p&;50-Z3|CA7tz*=mD4M`tf@q*7nr3VPJO3sOYY^Z-Diw=@A%eUB zLLeoEqY;Rv#$q!>j9f;)-~ul}gBUv=9~A`wlqV5DESgcZSdZh_Z&Ot(E~81m<&iT56ng~-p|#{dCS&mv||BS zcrt9po2k(QLee$@iD8ebq&we&1&x*uoy$cd3w#($#`!JhX-Z(0i+#WdD~6RenYB+r zf0d*O41rXovK1L$Zi78Fq*%rmu_{}1L}q6XfB~K6s`Q~t4_?hVVGpPY^pvYo#z{;* zd&Cx{GR%)@G~5U*`rlOv_R9&O3P%LaLM`&tRxR znC{`g`s-hB72;Uy?Y=LIL3c~_nU_BU!J_CaU%;1kTZ7!I5X0d{MJ_y!zO(>d_1;tr zLc5jn{_%EX#Rd`qYkVHv0DKaMwsJ^K0qG9f=6FixXeFMwctU%_;(M4(A}uKRon z!|8mi!jDH6zTA=>OeV^9S#!aOJxi?EF^%gAM;n%(mx2p za998l;yBpTToO#|agi5#NSwboD|tMbjtS&jOCn*x(HSmiC{B$%y8}Q0vI2cE7&9tj z3?aqKcuXBl1V6B-c0rke*Z@Y3iPbk8@Jr7_MRj37I4CBjAVxs)Etk)pJr}w(bUbwS z{KbLLiAzJL;Iz0kIt@G>7o{K^JcyE*08-NrDP+}UG^#G6RkPHa$?C_%_!JxqE75o; zoCJQteO7%{i%{)^+*G>=G4L7HNrECz@7E^%G|qegGMG3@OQ|Y|K`adt$smS4YHiKN zx5BDaD;!CyF2l@M>9|Tys&qo7B^k9)Ig4VM*!|KlyfFzyR!#e#07{+)jSt4iOEl%L zzy89Cuj9wzrQpg_&wcJY|Ib68wyivW{xjeCEOS0b@~rRtZ<+SIr)JgD`I)El*X+S8 zd+>fs>+OTL4leHa!E=lJ`;qq|OBa7MasA5tseJv8#T~i&uGRWLu0F8Tu~NUc7%3TD zo}IrkKXA`iN5)*_KlAO%_MBQew%T(l*K_J_AbaU@_Uo@@Uw`BCo;UOU&g<4ye`n6$ zxj3+RWhwYcFyGVnN#|YV_g1?05`Ev_c;DAB|Ca^I3jTtH_CCL6g%UI{l(%JJUyf~C zWjk_g$Kw7K_NjUL5ku8&z1ec3<(}7ny=}$YzF7Ny<9m%u^pBdC2C{7MXX3~4kK%U+ ze);k*UWV{%oqOH^m|RWsws(Bje0jDGEHn}gs(n}t_4pcohoPPVoG2Jj00MvzGiugN zwWijB2kknR@--0OhFEMyd>>6OBLw?xVxs*?qCAF42l$KWNrSi<+k_Fu)%!vO$S@J+ 
zTp0&3Sga+3Ws_4Xiz!CK*vz;h7R3k?Pnet|fYD(RBl^&87#$A9F!0dG7^5=|vxtyj z`_M3jNO0Q3RGTo2s?zFVe6u|dzf@C*&9{9@s}_-p447I^SR1gz`)m_bQ*Fk z@^uDF$1p=Lc{t!L#k+|9lrG`B%b2~28ByTqeM-kML((JBFD)2`r()~FUwaU!q#q5{ zC>_NY-+;_C>Q$zm2|+$q6A}n3@T`0YvQ6V(8^y&y(yy}ZIktV#Le31#6%rgdYno?) zza6<1S$yRO(WTmjZ~W}Q$IpNC{N37LHvOXM^V*B^j$*V7!vU?JMpoI@9NW5B3pGGq zegv^I=iRyL?ag_6@6aDRK62c-@>3Ut(Mm@UYp9ptoEk$7Rd`6W0nk^RX_O&!eC+wi z^U1(ZYgW8Re^*p=O|813TOX8D86}yT1TRIFUW8Ahx<_Nl@T7LC?Li9$VvVAt$Y?(C zDY(%Qg1ds?s|l)GdxN(J)S46zCpSq6TsA}%E)TTf#RLrIt-V~-j%Mc{x+Vgb5w2X$ zf(Drdc|8l=KwB{qs~F+Hj0^5>047Xu8evFvv!F%Bo^c_b2(1`F!U@#YRSBf3iOwqB z=a_RVXkC*uf$OPC6?5`zIw#McWKQg6@ z*{uMZXB+bDjyx(4D8v8uJlg^QkD9&Uf3#9;W7e}H-`u%)I@jDgfBL?!d7(eMt3T`O z&ocemfi+8ndGX|5^%p<9cJ&KIJRTXTjV50i>BOtwDht7liAmK3xI&Y7DKBi8=FlCl z*${D_(!@D60S(p3?DS>>6Wnl1bQ7G25KVyi=1_znSCOR0U_o3sKqxREG~5|K#BV*O zCp#86^e!rj4Ito*XA=mB^=<+IruiliaN~ay2#>&S0^t>E31r}5HgJ!vbZ#b*;UM@Ayd8Dr5X%|MWSP>7nB>^6)skZ z7Ex>A?nDTFnSmCZCO;$z`^%6$M#8J5Gd1I<8;?6vGl0@4>0i(!eK)l;u-bJv*L8Tc z>v*o~_^158MSdQ+`^vvWSGq3bm>u)A^8@o&uJ5@1^@S_<7!ZsQrO{$J5U6R=1!e~X zZHNd3i3(z~d6jL07}7#wX>^6%_hst*3fotLpIq9v!tO2AV@m@oY_L=ZUtopZQ>y>D z#%ufip;rC)&_cYs;Q&1xune9E&?f?0@hUL<@<$AXRCfq|P$Q&cSY|_^D``GfeB%s- zgk%_+tzfBWh6`GEs#R7bawk_JBx{RBp`jb!IJApx5Ie;iXz(ly9FptOqsc(x>=ZAmiJDDrnfr@T!)wAt)Tfo-t6?{l36J3I>x zs%??Q33A>}rG2JpOl%Tsu;*Lm9BJwWQ|68bdX2Ul4D{ZpD_4Ww-a{ zn)(VZYYwYV4+&Lhwb~yJ^_@= z7K=4lY@3E4*R;Ff!X{X8S(68wfKpm-#%{z4UaZvs-rGI5dI~9Z`aXcFdc5diO0YnQ8-BbqFHLHc`W1N}qE=OHUt zX_|iMu+z5R*HCoxZz%S+RQIo`zW=0l|CiPKKNh>)u|`3LZ@Yd=?S_wKaXzw~aM14T S*6uY53u{(rdq~>Ifd2M&hqGItq$G{; zW@5PU4|vdU@lW93!Nkipn1CBN7~_E(33&13%q|P!B>Ud`y*K-t_ujnu+S%Cw=)%HV z;}#FVPX^*eakvg=s9Fa)&>;pUEJMg_4s#}7=0lsuf+?0oY74rEB{NY@K)?Y6k-PyT zl5$d)P-2$T6K{laM?=mh_gNqh1I4fmPsMhvh6sfhs&zt;<*5$Vyjh!=)YngtYvVpWXpV!(J&z73;t| zP7Q#0F52*~+&BiSoF~TqCKs2p%5$K}H-Q0~@I8D7SB0j~>U>CB)Z|v^c0H6a$7|?G zF~@u|wobLqH@SJ`6vq~8=f-hvx|pW!7(vh!o`SqIy)xjqpFgs`duDY};+X;`ziYFky-S=Yga zmlp#;BQ-aW7M4QYfA;^AB!Zs*0#)n=U1xnRJ1ooApC?7nbc&AQjChn5c_Yj9(ebf` 
zQB=f6rRX%gIom2OVr&+%O|;@!o*23aqpD|K+r?%1$e4+1q@3jvpzjy3YU%Pj#HNB&xeP!?faH6y$U-k2)wp`l0)0Srr zdFrB}BMFGgwvcKIskN7WZhSjeYUfIu7dN%7+|-xucJ9$P;W1+o(ueeHFr8Ndh2>v} zT1(h73%Wv2Jo0XIEB&Ff?ituE7^Y(rPYpqIHOg`66*}&OjdmcbDti^GO3qOj*8CBW zUS_Al_=IWeb&T$k0SejlyF*kQ@DRe^GJse1!0;Zp{R<5JNT$|aY$Y=-=|Ga1!;exKnvJS38mby$~2hb>W-rTAJOahxc&|t)Dl`i$|Arjzygus0CCYT4!S@Tbb&RxUo=a%B&J?8#b&z*&<{+MerXG| z|NonV*T~L^K1grgeZSxTcz^z`qQXsqi`jl2ef1fN`XfekXV(H7Z$V&*;wX-eQ6u=Z z#OMjjh=o?utRq&EXPdA`AZ@}i;?#1O5e9OsG1r89#695|@l1F}yd=*StC*-9siZ01 zA~>LoJ?5M6kN6?a!8v1riK>w*nzB$QDUP{KaW1~d*qNc9XIbbp4C!62;RXkV5CT@iI>B(@MQvwB%kQj?bcr9PNDn#SgpuR7Xh~MA^DKrtD zoQ%fDAjxwkak2lb8vTrhejqJyekvY~gkKXNnHOFRi+n^Kj_L=r$VL*@Y>DEj5t^e$ zEF29hW8rO_75;6!ecW4ENe%PPStmt}Iyn1n8?2TCN>!DY;+&Aar96Gq2J2%R@$g&7u-P5X$R6MbyyP#y{repDJGDa`&gLzcIc>G-DCS^98-)3$4_`oQ``Jv6K z@2t%L>_!#*L))9KS|wKveO5!CTQ-@=mQl-ASXJ{}+|y0hujXlOaq&NY(>3&S+ip8X>c8=vzTWD#Y_jT4 z9;26IVT@b3R<4a}=kY|4@_0|U4v1}s{~f$-)XjCm>22h?HtEI5)dI2wp-dCkU3AKv zg)u0z6UsDmeO%9NdZcBhHu#LSn=}CbLnI<#;hV7UwxuO0W~*czBFw zfo!moLgGf0g&1sdz++CgRvG zX)YRI3NO5{d*HBaM9V9NC<#d(apM*}l>a8FB%QP*t@pGRXevpK7q+Gq9R~m`brPeej1#6qkE!4H>3dqw zNh)OY46U0G&2QN{&X_o8y;2Y7zu&TTW99|(_2*mHLuNoZUZdyczvk=9S?+0TL{r~c zKPD^vz4T+U=HE*{wfR-lL*Gk3^?K|1?TQ~>hwWRo zX||UIk?TSt0W1~U^&JI^Z z@ZsDX*KUS|F;PT83P}+AUoZDb6O(8S&U4qiLJ_r;=D`X*ouoG? 
zOP3oJ>TQl{^u+;{TfmPgFsW2T}!_qg15e2sg=oHorp zE%%*%IQ)pYXqx-vmW{ffLce50>cXfi@!-NE=6P)OE9N{ew?DrY_$TJr2CO%w(9?=T z;H4=c9x-Sbq$A}+0d|7}@98wwjG}ThM<;1B+Qb=@Q>V&9RM0X=?pn|$AWcym@vtBy zZVJ^Hv5C=HK4=kGhzK3vi8xl4724pV)P>++n%u?NuJW^HUh*I|xgESWsmH!8carnT z<>Q&AeR9*j`{4(H2mK!nd^oVS`|nSseFN*vfY1&lBL>$25nL&y; zb`kcinAJtrj2x^Z5GcHcgsG**F|WiWjmwp1dWT?N2jTu8A1MUEgImU881ItMhaTwC zDA5KUo(A<^IaH<7Y+db*(?$Iq=Y)_6@ExgX%o# z8O04`4$jczlq5!UWgXPdqs>u>6ipL2cbKaAL@fsYZM&znHbe<~pb3~NPDP!nU7hMa zOvul7KSc9%PMff*g)^D3nsE{9AmqS=9r}c?uXbd*2juR7wd?8bQx6ZPeXp)Fuc}jK zWP3Mo0CmceL ztOVA_od>6`g;$-jI_lZxvLjHK{TfsjcY_D(QbqY&v)+oVx0?9=jJHencC7}|-kn+R zmaMlv>#g1JF@ATBVq9*rOub4a;a}U(d)-E@As*BTQZynSHjYYc!S!OHg(Q5FHqG7BnI-5<@3~fCo3er<_^7 ze!t@pa{vI~&cOV@!k&feOE(v9E(?oqq`mDa`qQl~?}#f$)7$p^^VYqoQ`wsKl~Zy} zZ)zZ0Rg3KSa(Jb7wI*HFmpYbh?MU^fqQ40IqCb`Ry|*Rjq#E|+7|I_=F)GBW&~ZgS zXi7zboCg)RM8&J(FJ=o;8SWBT`45-|@N6_(HF{jTw2)*t!o?!LS>iYvcgcVOnzPKg zB!j9+2F9C0K{o(^8vwurb_i*AW7_}~>erRDX(7k^)_a;ZXpT-A>9YVqbFQ~~5E905 zRwXXU+|y=80Am=YV;sG&4j5OS3>bKhHUkY~gGxaIE0OcbKF1bq9YBULL@b1E=F;IF zfG?z|ht52hJMXOe{q=c_)$q9EH)l_+yc!NfSJb9nS+^{IS-Y^MesmS*+&JcQufV|D_dKa z@|40CNH+df8Mq#Q0R9$&i{7liWqC&S2ebahtiO3VD*L;#joWh-RG@8xat10>{W(8X zQNR4$>Q=dNuk77Be>zu&C5xf zN-fOwIYjjW@lj@s|C+SYQMy=2?Q&>_Bn=|-8W3^Gt zIW2g{Ox_uQNk`e#K5C0nN#!^(mg8<-48< znwBc(_*EqfRPJG8;TcqabJIO65b(q0gdKY-C*5DS1zo@>9K9Y6_7+UQn1QJTQUWc- z6rX)98jG>iQ>g1C}3=rPOb~bW6*82 zs6q6LdoIJgr${q-{3~wfiPB7Q(Qm+L6gQw^6x(%f65pG{u}wul6#5ETkikT}VBysV z8{|Vk354P`dD$R5#Bh*N*$)6HO4TykXL%tZ7L*&ntaDhqj4L5h6~S`Um8tv*BC5)o z(1Jn_i>7>t&%kc=O3#)(?e~_H_#HGWJ_HZMtK&4~uUh$ zn$ypn`@QenI&)4Y{<`nnC!Rp+DDocHfg8#AcgX%7D?-}eo$3d^WXZAUSh)Vzu2lcy znueuIi2^ZMBuD0eEn!YF*f}v~6)4 zh-87)!3QVgz?rpI!Am)^zLu19=hpl!;FeN17Jj-i_qo;LcW3)gJI}GA|O+^X$;mu;ECn^!%EKepYKmtsN2WKG~BAi8u%ab&n z1q0F0&=L{C;Y!+#JV$}v$N_``1Ut_9zM<3vL60`o6cq|}_Apo$Muz{;3{9qZd z7FAYrH>>+Kz_c6-3;F^4E8sy((9aOShK18`C_y)rm%nNtLFh!U4n2Z`cqLU``56cW z-KD&13FsMkC!B@iim&9=Lqu(9e)m-|tN{MVv3nx?krMv{t%z@d_jG}5GZjnHj{{B17gj3o`_^jGfs-l6V{hHE1LE}k 
z=O0Rsyu&Fv>+_e%Vv18UyA==*6hGN6AX$`H|xafrV861BqSgE)O#~VUB7@If~kATe!yx@l5nS5f>Fnm`ki{pY5 zDk~MFB-}-!D<{?is`3d>hRO;O@FTCiO<2RDy#|LspRO}~m$2}U(7TM@kI@T(r+BY` zCB6O%F1(KEz3BCz7X{Cp1FQ*6O@dt}4@NmqUANMl-BG4tH!ar+cUKI7N1oSQRd*`q zD<3iSz?`-1$h01kTMs?x%p5*1A3mRMy^wM~Vp_fg^->dgf&;$>Ufi-?2n04NoE7fR zD;zF2d53GvdfT$zmaMlfS49kK_RH=2A!oTG)38Tw*pq2EC^sB@a2?EJI(`7wF&*wd zyS?*mso^`9<}dxcFK4G*6<~9db5O0@GwdFj-IMh;E)Qp#_sGqAGR+6&X0>wbUZ~t^ zTDci2clBgC56PW}U^FYoGaWC;9WP`$2IP)`hZoQ==PI{k9UdNtUS3RU=o869keKJVh>AbiGb#Jx-Co$!Xh@SKik&pCumaO zM?q4}AGYA8hpn{eSv+retly-v4UH+=TfXe!(^+qGio27TPXKfOESO1Um=2lgSl|AG z73sZO@7|J`17!JxF{oFxU`Qy*g8d@|3W;0ZP+<;!crs^|48p`cg&i+o)=k**q@hpJ ziN`FIf>#)A`R8oFVq0bb!sqOqb&O8hX9=m33?MCF)$_&_j#(JL-XgxD41R{43la2) zyy+y;rD6pZK(T?@^i{1$?o3072o|J3$rhWU?nSH7b8?TM-aMa{P zpuh$?Y%50-ZQ_|E2vp?Viyqg@7JSt_vTwHhSuuO?hihm_+Te#og)9g5TH*@qkPU*K zFOuKuuRJb0lq9c$z=Y)+GD2z|nStXg%)kM&0R7^3H48Yg1Wr^lJFsxQ5sppqpss!a z2B^>jiWN?YK4ys}mg^1!+O+Yo$dIhotd;(S&JY8U`C#s=HA|{GuC-W+l zoNX~bnL4@t37*M!pdbK`$G5QiFArsF+g5fjCRVKGhU_YxJY)7cD8JKn(j8`C=5T?{!!nDeH$bUl^vBii`~9+<-~hu z-aYdL1;Gugo|vm7A)CE#)&7C^uJ;QHf?76&a$bl19Q}#Mf9J^jk(?bP&>)!gHZ3;g zoFod?0yP^h67*Z`uhCf;Am&<0q^BlHjM2MuVa!cc*XBGJ@=|pTxe5$bQZ-w1J`DLOx9?8VeA7k%V>_Dd0cd!k2FJ53 zt_Rv=Px}i0UhLhN?AepELv}%57-OidEzA5bV{gY+kFB@#$@S0VT$m2Fg*7=3hAOol zmTj`DX~Ty}FVj?WOSZc1b6=O8H6{XK()eb*JF$RSf0*v3>YJB`R@#?;xOgk)!4!42 z7^>3eSdFnC(^M1txu@3toY6Cc3#NrJ?B~GIuL+~1`9hMr5y-au8 zyFUrE fI{a@||DS9I4L-cxG*MCwZL|XqpcKe{? 
literal 0 HcmV?d00001 diff --git a/lora/punica_wrapper/__pycache__/punica_xpu.cpython-312.pyc b/lora/punica_wrapper/__pycache__/punica_xpu.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25cf8490913dc2955ccbfcd77b25d5b1271ca713 GIT binary patch literal 11244 zcmc&aTWl0pmbbdQx~m^{w{843c7X?QgJ}bCCV>!$191it3@tQiX`%>CNuG=)$3mWQRZS~L`uYTUDmEEP$zy%OsM2@nm4gRD2VZ}+|25Ay zNNAAJ*k~*%#D@U#B8D?UIt>VdcFj9FmY57nQ)wY7YIP-*uo90&B;&g>F2|BrzF~A^ zbvk7q$$&+Yap@6*AM{mcRM!6sxHV5CGQz-4*hB_6Z<8FN9sV5>JK-xGq{EVH+C@mz zDYDlcBb?}z+@eeJKz|O~TcH!|fujGh&L?86?iZy_(Wg|AR7TVi3wnrVJAI8;+ zflbD3hOcgu{`HbS%83ntGw^4_xeeCX2>(q|UDPKw1Md2l#@!+{Y(jBs)JD0fQ{D#H zx4DU%h`U~Tj`l6*=zty$(VTHRz_lU=YwEntjx@rYyT8Mn)%P2CMR9w?F2LO+aw7jZ z^U~P2=GxY42EK~r>)x@+PSBH0sSb!cuRBMYr51W_tDNVkP3*qzuyjxBcV1t0?zL}1 z;ntM<#P0#+ZCNh3M+=lGhl}TaIQ{P)9-hi(KBn-Zq{POOpg7eOpAeG5n8bq`;?r_! zGA2rVLQ(}$Pz64kl6iGp;?qWY+9G%vR6ZHnd;rAg4L5l4wqLZjt{|Mdf)UgwF5N|rtoHWMnob0o*) zY&rW~0|P@;3)h5c{)*knguA3-fH^WjO=$TSk-2Nk_Z@LoQRkxVPsCYUP9xSz>CBR# zeMm;KWe$oUC*yK8E*y>}V`@08v1&?=jBBp2m<1D?(tHU)zM@xR5v@Vrqi{SW3ot^2 zol&4sN{mXP2q{dF}A)S6-9);<3@bbXFZtCHp4h@kAd|)2E;G@HO4U zn$@XvCX~*~0l+%lS}9N=bJZR2Ss8=kQ}V2qxZ75_oq2BOV*hgR`2F?=(TCg*A9FP~ zYUXOp53Aj8mhkSw{w3B<~MMeXr?!)zfeVw9>miN#iyiDPuh2`w)N)QdS@M<`PVp~ zzENwwhULaxi`lRCE*~5Equu8A$mrZ?%#`MoB{d@_Bc&PxB?9@eE<&-jUYW4;2y{0d zaFiL~J1(!{0Vy=}l&Og>kf2SM#h^n*7f^F11zApAleb~Tfjb_w>D=4}Eeg_V@~b%v zB2h&er+YS@^6EXmk0=rEr{uAJ+l|~@ZsFu=M}NMf|105M-MxX|4*q8F{=vUHwc;OK z<^~PMWi{8euo@O%`&lI-#HEOFL;w#V76adcN+|3@U5J37b^S4PbIk#7ehCha9FbGvu>Q2g%xkcHs*^Mm-$l z>N)U+z=u-!ci%a4hNn(Z0;5wfjXXU(3ZIJdIvYYBYuE(|ygM}(QI1$m=+ec&zv5*f zIVSbQrDRVz_kzJcG%H?HlwD>I(fM+G^zt6d*pm zq^9=9;N0NNLpQI^U%Pc}LB93zim!W?dEDChxw3d{rR&hYw!SiZYRwl|^>yZboeNhN zTW{x<&VN;VFPlGb>cOjzeCMCqNn5|Js&w%J-$55IttLfxPuE(C=F{~UewNh@I*W?l zX*kR1Nz8Cy)eIp4TCIm#yHXo!e5nvW6jFyZ$O1&$l^$5V?KJr->|qI})q#kXh^m%A zf|)QVDUMc=TKbgqv&NeJLA*g>&bUFm(etc$sNdtJ0&dUgZx%GK4mrPc+%zzqS&g$y~4Cu_2jW2o&e9veq6$kZo 
zQi=*-8)!5>EN7(R8s*pFEPaE10gc`6Fw~q6fze0bMLq(B=74Z4AqTO36pJ^ocoo06 zP|1Y_CFrU25sqV@x3Cz%0(}Hsv$5dh$Yb)G(4csrpl5&x4H-oo)Yk|&%Y(5ZN3v{dnwlBo;fu6PYU4puZ60Po%oP;I4fG_9V;LAD7oLnYo6E1@? zvrfhUxUDfsp;iy1Gc848QAlaW<5?=-A7UtYPLQENioat<-5+Oh>3@yqoIGcWKSs{yt(vXDyv+0Aa14cgN!cVMw?D$z&iPIKb}~$ zMD(Mh{I8HOx~`D~qr!V^>Uv6%A#?HbqUX>@WH1yo(s&p=OjtBg_zL zid~E$5=XI~#Sxz%>7i`5Ran2JQd`UQW3CU`1z)G5cqkpiB5Z zOjRzlAei9N!{GxDR1)O^rPP~nFjTStYXvVI zsf75CAZ5jmN_=D-!zU2}sr~6V!et}-j1J8mo5qT27NRp0 zKk9u9gG)ob7wX5*q!>Fb$tgv1(ID@YLwet1IN%3RZ1ibpj7F6@CM9(52KA`!*f4Tc zf5va=cU^p|q#rAK{4JnVzJ>ylMkg5(Xt;T7#lQ1sk)_be?!yoLXaD}f{p~Anp8eW? zc9}byr{%K$>=SR@>@oCBuI2+fRs*~9f!&MpN?`Bo05~;G?emvzU3y%%?dIi`y6(mE zKmYJ2A1)32^haMY%Yj3W;cMbnVy(Gle(KiLGnb>Oc6PAfA?}u0#|{6Sf6Z49uGN~a zZOvEr)a`V8;G4hkTQ5S8)3$m+7w8DRTA#XIv{y?pT?H?}2yagSpQ(f?3V!)0U(RkHhT}ToAz~t62o_o=V|`2;3Y{M_0cV;Eme?5KDTg zP&{sM11~qhRPe$Cmx5#&d8(%BR}9m8HW=pW9sdLIH9`T#*+T-`R{i_({(W}_z{Lbd zQ|E#a^AVgDO%XM|{|U7}Q&e2aWk8S5i>9=4jtMKNrH|A{-!*VToa_8_pqTEOsBTKD z;V3o6Hr3=o)mtzy##IdMmv+k-J4-02W@gDmph&$5i8j#&n4e3VnR=V?s14>gJvEo} zl-^yaxW)9zf%gq&-6J}_EVtETLV`8v}y;+s13#;{65{Qu|Gms-UtUcIoHSHHm&RvBD0e-$CYOW3~`HDBn? 
zm0K?wi0k*Kn6n#}F%S&;ieb1OfTQ7(&tie;J>6vW0Hg&fuSNCi4Ls-aAQVAwWr$v3 z&MZqNzl&qeBT|2bdQ(tIm7EgehOKD8==prEiK9Kf1vJXJ;QYT_{HKc#8$X;KeB9RcMept2 zMRj%0(fppH_k{a(_XmDA_z!~*4*uh*mA0W-|6@*%pC*?+xz9f22C+W4FtjwV^vQ$% zN8CBgA!O&W3ymxOu4S%EkFIQV`-X$eaIhUOzTui%f{SlYIGjj{nK;6}aQJFQh#OyA z;joyBK(igvJM^MXzp}O~s!VT|We4tpBOZf>?;+^a)8(4`R8rMJ8i2xY$}mw)bIY0o zlRkQWhNbCm+45=R(||<&05Nf<>6C}D?IIL+2wex=R2g@g7I} z%oA_m#?iT>1r{qXgRhQOU9`%P`sVqLTO9>At%5CXT=!D29?)AuajN1-tQ7}h%|jZR zOwyX$3Y4^3(%8E0$FChN><2|Muu_7IRyoqzxgdQJza3vXvE130Z#htK(+@g96nu*z z+dJ19nxFdjv8@(m0GA-cS~XnQ?jbGP7cMMzFI>F!Nx_RPK7zC&R8Ks$>kSBr4C&yX zdK=l(MT)UfG6}8XEMNF;`wDJag}rGl&{5#>0WLpn1(Nsd`?leY5>5hKf(mQZ=*4!= zMxJ=F-3NPAs6nWCxjnx zVYi)rw6(!=8o#B=9qj1+R$B7FwV))J=m_r&)H((`X*C{Kp|K=uL; aY_4avw;c?>+}yKH5CFc}Tu*5mMgPAPkrx#J literal 0 HcmV?d00001 diff --git a/lora/punica_wrapper/__pycache__/utils.cpython-312.pyc b/lora/punica_wrapper/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..270e56598510ddae1b6f70a333c25ba0bea904ae GIT binary patch literal 5789 zcmb^#TWk~Ab;h0wEfLA5*LgKE_e{@g-8+Mo8^JD!R2 z0CuI`#54Dvd(Y#ZbMCq4%%6fm9|bMV{8au~Gev!c2X5$i3nh}(PS8$mq?|M63vRvC7l2<7c(iPn2ao(_0VEA9y5gZ{n01^s zgCG3x`ywazK3cQPS(N%}TI7^+5?vmUO7#pYPu3ODH zi=2+`x?hu^$mop3N?eSZ^*Yl|M|Vq{D;gNW`hpM>DY5c7H~rj z*y-5wuf*kadN%B!I33AtOI8HuD)PEVV&{%fl;Y88$wPWBi3T{MyL4Lj=&a7^;Fg!E zxL3p~@(IvJktS#}NRYErIVB@%axN<<=xRY$kR$+zOsdnFd0`GIS(NS*ejsZz1S(7+ z5sZ!0K|<+xpKxwUNaeCBN)}uE%1p{xvN{7aw45LcDmY%+A)OH9OkT;&15l_F67L2nD4RlrSRN^4b1?2B zfPp??*xL3anwGO!jGmhkkO<=C6*PsJtC~1X)M61zz;04vI#oy$9jSt-Sj4jkNua7p zQH3Wc&r)W-pe2OKf(DTEB0Ox)AK+7EosncJSIBCrf?XY;Zb{BO!;X3J*?*zE7x$X1 zmQzwQW~7jnuNDxjcT{U~sX|6AWK7=@QgUiCEzcp7P37{7Nzatf*Tp$)6(YBG(o)7p#;@&WxMn6{AC%*bRCF93xonllefYAB}9 z3Ikb?$n+5#CoxOYlg*LXpyI=YeT}sLjPz@ne1Bff_X5Fbr1dToPQCrs+$q$bmM8o3 zi`q;s+drR9XZo?M{dw4ZN=#l;Am|{aA3Q6q_T?AN0C)#z86`6a=yU)JeE!r5JeH{! 
z`>DPY<=zpaccj9%t~b4KyZShTIa*@&m0FL!;Hb#HyKQ&cN*(Uq1g*@S$=zZiM6OotxnVENh4^k5+k#kKDfW;~$j{^lfxKI&iWQX}Rus%Ev(5 z3V)!&N9_MlHN*$FD#f$hbJ!bYdNNk{Rd6IU8*PK4g`}2C%980NrUBJ~HRf|E5v+%; z(TEl_h|_ftPD1nG2C4cOiTfm0$AJ1>YAcfJ)Lzm0el3#DV&~e??ATW1bncb&p4ZIt zz`I_?8lN-NEpN&-O{H8P$KaOm>RySSI|K1A4DoWWc+ z>fQVnL*aO>d#2sRpyu2ivyHZ-TW>w6HQ1xsh~3+I>nz-k*s9Qq4X>f)`v$Z^b+qh6 z%wj{HdV@q#3!Pa4|8Pcb7{fef=Bjdis4NQCr8$%XGUxa+wkNZ#~DXGe`dnZx!d zhV@V}qDN-&U`KPjG5Q)l(H(rE4xf5V@VK2#qU(GoYFk}rU#-svp8NP5L(`*s;bSdf zZ55+>c(#o!-1d-Mc(btQm(N9CG1vI=xoM`@q=)s09@QK5rYZI^oHQDKMej8#&Wj5m z*y&t$9w~6a*JXtQT+?vBpasE73DrKI4+s8XWxAFq;887x4Q|7%0R<&@jpPg>+^RTZ z0E(D_AWJ|DW8=0n?g1%T)oj8J!WOc1=WP*i!sMWEL0phCg^U1K*d$VLDuDUbG24oo zusA27v_K$`jV#qt3M|sj2sR8Sm=Wgzoos2GWB-!6Qu8$?e(e--nlB|@A+BwUxFt8K z85EqeT#UO+R?-$BHGTtz&u}rJ;BOmo7Xc_exF0f^Gz3R8NCI=R7H3Tc7Yt?t6i}JG z=0I7p$Yd2UJB>^?s6hz;5tFm_4y1BW`?F9Eq~=Wa+6>f3P;0hWWh(Kbd)wN6O;a`4O;`f*N4ukJlx%!yz z+;AEEKxt^a>ZaIem4c;!+t{>x_DL}E^An|giOp#5UxIztX{b>e-&nh3gnHIHABB#= z`e1uG&}9U=O5JaL9yYq)c^nve8je;yls8=UQHf#tVc0l!`l0yf*y!rQmzPR>>q@7= zw^d?Y*8{gN{UlTkQw{yq7RuXE;hUh+sd|``-169q02OMwmA;W)8G2Co&C+K}4-Y?X zIJZ3dq^n?Z}|nfH@iB!(E@c`I|r(O3NCi=-htHvPXp20!#B>CpmqzyE7oA; zXsPFv5j$lB-m0>0ukX2=jR8^%u(6OTd~9XX;DxpLy7*~t>G)|-7+C>|XgaWR{LbL= zSf%CI^7kt3iE{gZ(LS)*4jPX=Y45uC(dtJPzGJP+;18|I8=aqKN^iao=pDcy22yo& z-@CGUrNVn}o+yRh*z_M>pZ~*$f1LRJ1W4Y&J@p>^Rd?y|&}P@WhWFinu&$2p09rLb zd7I08%mA?lz)BT}$KL-_fgX=s#tB*WbJ~bYp0x`Ci*<+gjn* z`ltHF<8oZNq zc`Dpg^^&fS+83+(NjE?RBe%wGj8%hV*g$dqJ$QfkyDPuF^6=!B?l0ah`Ns_Q!ix}I HPPY3$9*?)M literal 0 HcmV?d00001 diff --git a/lora/punica_wrapper/punica_base.py b/lora/punica_wrapper/punica_base.py new file mode 100644 index 0000000..b6186e8 --- /dev/null +++ b/lora/punica_wrapper/punica_base.py @@ -0,0 +1,492 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Based on: +Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). +Punica: Multi-Tenant LoRA Serving. 
+https://arxiv.org/abs/2310.18547 +""" + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +import torch + +from .utils import compute_meta, convert_mapping + +if TYPE_CHECKING: + # avoid circuit import + from vllm.lora.layers import LoRAMapping + + +class PunicaWrapperABC(ABC): + """ + PunicaWrapper ABC. + """ + + @abstractmethod + def update_metadata( + self, + mapping: "LoRAMapping", + lora_index_to_id: list[int | None], + max_loras: int, + vocab_size: int, + extra_vocab_size: int, + **kwargs, + ) -> None: + """ + Update the lora-related metadata + """ + raise NotImplementedError + + @abstractmethod + def add_shrink( + self, + y: tuple[torch.Tensor, ...] | torch.Tensor, + x: torch.Tensor, + lora_a_stacked: tuple[torch.Tensor, ...], + scale: float, + **kwargs, + ) -> torch.Tensor | None: + """ + Performs GEMM for multiple slices of lora_a. + """ + + raise NotImplementedError + + @abstractmethod + def add_expand( + self, + y: torch.Tensor, + x: tuple[torch.Tensor, ...] | torch.Tensor, + lora_b_stacked: tuple[torch.Tensor, ...], + output_slices: tuple[int, ...], + offset_start: int = 0, + add_inputs=True, + **kwargs, + ) -> torch.Tensor | None: + """ + Performs GEMM for multiple slices of lora_b. + """ + raise NotImplementedError + + @abstractmethod + def add_lora_embedding( + self, + y: torch.Tensor, + x: torch.Tensor, + lora_b_stacked: torch.Tensor, + add_inputs: bool = True, + **kwargs, + ) -> torch.Tensor | None: + """ + Applies lora specifically for VocabParallelEmbeddingWithLoRA, + and this layer only requires the expand operation. + """ + raise NotImplementedError + + @abstractmethod + def add_lora_linear( + self, + y: torch.Tensor, + x: torch.Tensor, + lora_a_stacked: tuple[torch.Tensor, ...], + lora_b_stacked: tuple[torch.Tensor, ...], + scale: float, + output_slices: tuple[int, ...], + *, + buffer: tuple[torch.Tensor, ...] | None = None, + **kwargs, + ) -> torch.Tensor | None: + """ + Applicable to linear-related lora. 
+ """ + + raise NotImplementedError + + @abstractmethod + def add_lora_logits( + self, + y: torch.Tensor, + x: torch.Tensor, + lora_a_stacked: torch.Tensor, + lora_b_stacked: torch.Tensor, + scale, + *, + buffer: torch.Tensor | None = None, + **kwargs, + ) -> torch.Tensor | None: + """ + Applies lora specifically for LogitsProcessorWithLoRA. + """ + raise NotImplementedError + + +class PunicaWrapperBase(PunicaWrapperABC): + """ + PunicaWrapperBase is designed to manage and provide metadata for the punica + kernel. The main function is to maintain the state information for + Multi-LoRA, and to provide the interface for the punica. + """ + + def __init__( + self, + max_num_batched_tokens: int, + max_batches: int, + device: torch.device | str, + **kwargs, + ): + self._token_lora_indices = torch.empty( + max_num_batched_tokens, dtype=torch.long, device=device + ) + self._sampler_indices = torch.empty( + max_num_batched_tokens, dtype=torch.long, device=device + ) + self._sampler_indices_padded = torch.empty( + max_num_batched_tokens, dtype=torch.long, device=device + ) + self._embeddings_indices = torch.empty( + 2, max_num_batched_tokens, dtype=torch.long, device=device + ) + + # 4 is the number of indices tensors. 
+ # base_indices, sampler_indices, sampler_indices_padded, + # embeddings_indices + self.indices_len: list[int | None] = [None] * 4 + # these attributes are the information required for sgmv kernel + self._seq_start_locs = torch.empty(max_batches, dtype=torch.long, device=device) + self._seq_lengths = torch.empty(max_batches, dtype=torch.long, device=device) + self._lora_indices_per_batch = torch.empty( + max_batches, dtype=torch.long, device=device + ) + self.device: torch.device = device + self.max_length: int = 0 + self.token_nums: int = 0 + self.batch_size: int = -1 + self.is_prefill = False + self.no_lora = False + + def _update_base_metadata( + self, + mapping: "LoRAMapping", + lora_index_to_id: list[int | None], + max_loras: int, + vocab_size: int, + extra_vocab_size: int, + ): + ( + base_indices, + sampler_indices, + sampler_indices_padded, + embeddings_indices, + indices_len, + ) = convert_mapping( + mapping, + lora_index_to_id, + max_loras, + vocab_size, + extra_vocab_size, + self.device, + ) + self._token_lora_indices[: base_indices.shape[0]].copy_(base_indices) + self._sampler_indices[: sampler_indices.shape[0]].copy_(sampler_indices) + self._sampler_indices_padded[: sampler_indices_padded.shape[0]].copy_( + sampler_indices_padded + ) + self._embeddings_indices[ + : embeddings_indices.shape[0], : embeddings_indices.shape[1] + ].copy_(embeddings_indices) + + self.indices_len[:] = indices_len + + def _update_prefill_metadata(self, token_lora_tensor: torch.Tensor) -> None: + ( + b_seq_start_tensor, + seq_length_tensor, + lora_indices_tensor, + batch_size, + max_length, + token_nums, + no_lora, + ) = compute_meta(token_lora_tensor) + + self._seq_start_locs[: b_seq_start_tensor.shape[0]].copy_(b_seq_start_tensor) + self._seq_lengths[: seq_length_tensor.shape[0]].copy_(seq_length_tensor) + self._lora_indices_per_batch[: lora_indices_tensor.shape[0]].copy_( + lora_indices_tensor + ) + self.batch_size = batch_size + self.max_length = max_length + 
self.token_nums = token_nums + self.no_lora = no_lora + + @property + def prefill_metadata( + self, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int]: + """ + This property provides a convenient way to access the necessary + metadata for prefill-related kernel computations. + 1. seq_start_locs: Tensor of sequence start positions. + 2. seq_lengths: Tensor of sequence lengths. + 3. lora_indices_per_batch: Tensor of lora indices, and an index of + -1 means no lora should be applied. + 4. batch_size: Batch size after clustering identical lora indices. + 5. max_length: The maximum sequence length in the batch. + 6. token_nums: The token numbers in the batch. + """ + return ( + self._seq_start_locs[: self.batch_size], + self._seq_lengths[: self.batch_size], + self._lora_indices_per_batch[: self.batch_size], + self.batch_size, + self.max_length, + self.token_nums, + ) + + @property + def token_lora_indices(self) -> torch.Tensor: + """ + This property provides the lora indices corresponding to each token + in the batch. An index of -1 means no lora should be applied. + """ + token_lora_len = self.indices_len[0] + return self._token_lora_indices[:token_lora_len] + + @property + def sampler_indices(self) -> torch.Tensor: + """ + This property is used to access the lora indices specifically for + LogitsProcessorWithLoRA. + """ + sampler_indices_len = self.indices_len[1] + return self._sampler_indices[:sampler_indices_len] + + @property + def sampler_indices_padded(self) -> torch.Tensor: + """ + This property provides access to padded sampler indices. + """ + indices_padded_len = self.indices_len[2] + return self._sampler_indices_padded[:indices_padded_len] + + @property + def embeddings_indices(self) -> torch.Tensor: + """ + This property provides access to the indices used for lora embeddings, + specifically for VocabParallelEmbeddingWithLoRA. 
+ """ + embeddings_indices_len = self.indices_len[3] + return self._embeddings_indices[:, :embeddings_indices_len] + + def update_metadata( + self, + mapping: "LoRAMapping", + lora_index_to_id: list[int | None], + max_loras: int, + vocab_size: int, + extra_vocab_size: int, + **kwargs, + ): + self._update_base_metadata( + mapping, lora_index_to_id, max_loras, vocab_size, extra_vocab_size + ) + + if mapping.is_prefill: + # Update metadata required for prefill-related operators. + self._update_prefill_metadata(self.token_lora_indices) + self.is_prefill = True + else: + self.is_prefill = False + + @abstractmethod + def add_shrink( + self, + y: tuple[torch.Tensor, ...] | torch.Tensor, + x: torch.Tensor, + lora_a_stacked: tuple[torch.Tensor, ...], + scale: float, + **kwargs, + ) -> torch.Tensor | None: + """ + Performs GEMM for multiple slices of lora_a. + + Semantics: + for i in range(len(lora_a_stacked)): + y[i] += (x @ lora_a_stacked[i]) * scale + + Args: + y (Union[tuple[torch.Tensor, ...], torch.Tensor]): Output tensors + x (torch.Tensor): Input tensor + lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weights + scale (float): Scaling factor for the operation + + """ + # TODO: implement it based on torch ops + raise NotImplementedError + + @abstractmethod + def add_expand( + self, + y: torch.Tensor, + x: tuple[torch.Tensor, ...] | torch.Tensor, + lora_b_stacked: tuple[torch.Tensor, ...], + output_slices: tuple[int, ...], + offset_start: int = 0, + add_inputs=True, + **kwargs, + ) -> torch.Tensor | None: + """ + Performs GEMM for multiple slices of lora_b. + + Semantics: + offset = offset_start + for i in range(len(lora_b_stacked)): + slice = output_slices[i] + y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + offset += slice + + Args: + y (torch.Tensor): Output tensor. 
+ x (Union[tuple[torch.Tensor, ...], torch.Tensor]): Input tensors + lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight + output_slices (tuple[int, ...]): Every slice's size + offset_start (int): The starting position of y, defaults to 0 + add_inputs (bool): Defaults to True. + + """ + # TODO: implement it based on torch ops + raise NotImplementedError + + @abstractmethod + def add_lora_embedding( + self, + y: torch.Tensor, + x: torch.Tensor, + lora_b_stacked: torch.Tensor, + add_inputs: bool = True, + **kwargs, + ) -> torch.Tensor | None: + """ + Applies lora specifically for VocabParallelEmbeddingWithLoRA. + and this layer only requires the expand operation. + Semantics: + y += x @ lora_b_stacked + + Args: + y (torch.Tensor): Output tensor. + x (torch.Tensor): Input tensor. + lora_b_stacked (torch.Tensor): lora_b's weights. + add_inputs (bool): Default to True. + """ + # TODO: implement it based on torch ops + raise NotImplementedError + + @abstractmethod + def add_lora_linear( + self, + y: torch.Tensor, + x: torch.Tensor, + lora_a_stacked: tuple[torch.Tensor, ...], + lora_b_stacked: tuple[torch.Tensor, ...], + scale: float, + output_slices: tuple[int, ...], + *, + buffer: tuple[torch.Tensor, ...] | None = None, + **kwargs, + ) -> torch.Tensor | None: + """ + Applicable to linear-related lora. + + Semantics: + for i in range(len(lora_a_stacked)): + y[i] += ( + x[i].unsqueeze(0) + @ lora_a_stacked[indices[i], layer_idx, :, :] + @ lora_b_stacked[indices[i], layer_idx, :, :] + * scale + ).squeeze(0) + + Args: + y (torch.Tensor): Output tensor. Will be changed in-place. + x (torch.Tensor): Input tensor + lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight. + lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight. + scale (float): Scaling factor. + output_slices (tuple[int, ...]): Every slice's size. + buffer (Optional[tuple[torch.Tensor, ...]]): Defaults to None. 
+ """ + # TODO: implement it based on torch ops + raise NotImplementedError + + @abstractmethod + def add_lora_logits( + self, + y: torch.Tensor, + x: torch.Tensor, + lora_a_stacked: torch.Tensor, + lora_b_stacked: torch.Tensor, + scale, + *, + buffer: torch.Tensor | None = None, + **kwargs, + ) -> torch.Tensor | None: + """ + Applies lora specifically for LogitsProcessorWithLoRA. + + Semantics: + buffer = (x @ lora_a_stacked) * scale + y += buffer @ lora_b_stacked + + Args: + y (torch.Tensor): Output tensor. + x (torch.Tensor): Input tensor. + lora_a_stacked (torch.Tensor): lora_a's weights. + lora_b_stacked (torch.Tensor):lora_b's weights. + scale (float): Scaling factor. + buffer (Optional[torch.Tensor]):Default to None. + """ + # TODO: implement it based on torch ops + raise NotImplementedError + + def moe_lora_align_block_size( + self, + topk_ids: torch.Tensor, + num_tokens: int, + block_size: int, + num_experts: int, + max_loras: int, + adapter_enabled: torch.Tensor, + expert_map: torch.Tensor | None = None, + pad_sorted_ids: bool = False, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Aligns tokens and experts into block-sized chunks for LoRA-based + mixture-of-experts (MoE) execution. + """ + # TODO: implement it based on torch ops + raise NotImplementedError + + def add_lora_fused_moe( + self, + y: torch.Tensor, + x: torch.Tensor, + lora_a_stacked: list[torch.Tensor], + lora_b_stacked: list[torch.Tensor], + topk_weights: torch.Tensor, + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_padded: torch.Tensor, + max_lora_rank: int, + top_k_num: int, + shrink_config, + expand_config, + adapter_enabled: torch.Tensor, + mul_routed_weight=False, + ): + """ + Performs a fused forward computation for LoRA of + Mixture-of-Experts (MoE) layer. 
+ """ + # TODO: implement it based on torch ops + raise NotImplementedError diff --git a/lora/punica_wrapper/punica_cpu.py b/lora/punica_wrapper/punica_cpu.py new file mode 100644 index 0000000..1a700d9 --- /dev/null +++ b/lora/punica_wrapper/punica_cpu.py @@ -0,0 +1,351 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable + +import torch + +from vllm.lora.ops.torch_ops import ( + bgmv_expand, + bgmv_expand_slice, + bgmv_shrink, + sgmv_expand, + sgmv_expand_slice, + sgmv_shrink, +) + +from .punica_base import PunicaWrapperBase + + +# The platforms that are compatible with the PyTorch-native implementation can +# inherit this class +class PunicaWrapperCPU(PunicaWrapperBase): + """ + PunicaWrapperCPU is designed to manage and provide metadata for the punica + kernel. The main function is to maintain the state information for + Multi-LoRA, and to provide the interface for the pytorch punica ops. 
+ """ + + def __init__( + self, + max_num_batched_tokens: int, + max_batches: int, + device: torch.device | str, + **kwargs, + ): + PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches, device) + + def _shrink_prefill( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + scale: float, + ): + # No LoRA request, so return directly + if self.no_lora: + return + sgmv_shrink( + x, + w_t_all, + y, + *self.prefill_metadata, + scale, + ) + + def _shrink_decode( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + scale: float, + ): + bgmv_shrink(x, w_t_all, y, self.token_lora_indices, scale) + + def _expand_prefill( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + add_inputs: bool, + ): + # No LoRA request, so return directly + if self.no_lora: + return + sgmv_expand( + x, + w_t_all, + y, + *self.prefill_metadata, + add_inputs, + ) + + def _expand_decode( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + add_inputs: bool, + ): + bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_inputs) + + def _expand_slice_prefill( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + y_offset: int, + y_slice_size: int, + add_inputs: bool, + ): + # No LoRA request, so return directly + if self.no_lora: + return + sgmv_expand_slice( + x, + w_t_all, + y, + *self.prefill_metadata, + y_offset, + y_slice_size, + add_inputs, + ) + + def _expand_slice_decode( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + y_offset: int, + y_slice_size: int, + add_inputs: bool, + ): + bgmv_expand_slice( + x, w_t_all, y, self.token_lora_indices, y_offset, y_slice_size, add_inputs + ) + + def _apply_expand( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + y_offset: int, + y_slice_size: int, + add_inputs: bool = True, + ): + """ + Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all` + computation, which is suitable for the + GEMM of 
lora'b. + """ + + expand_slice_fun: Callable = ( + self._expand_slice_prefill if self.is_prefill else self._expand_slice_decode + ) + expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_inputs) + + def _apply_shrink( + self, y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, scale: float + ): + """ + Perform the ` y+=x@w_t_all` computation, which is suitable for the + GEMM of lora'a. + When `is_prefill is` true, it indicates that it is currently the + prefill stage, and the `_shrink_prefill` function should be called. + Otherwise, it is the decode stage, and the _shrink_decode function + should be called. + """ + y_org = y + y = y.view(-1, y.shape[-1]) + shrink_fun: Callable = ( + self._shrink_prefill if self.is_prefill else self._shrink_decode + ) + shrink_fun(y, x, w_t_all, scale) + y = y.view_as(y_org) + + def add_shrink( + self, + y: tuple[torch.Tensor, ...] | torch.Tensor, + x: torch.Tensor, + lora_a_stacked: tuple[torch.Tensor, ...], + scale: float, + **kwargs, + ): + """ + Performs GEMM for multiple slices of lora_a. + When `is_prefill is` true, it indicates that it is currently the + prefill stage, and the `_shrink_prefill` function should be called. + Otherwise, it is the decode stage, and the _shrink_decode function + should be called. + + Semantics: + for i in range(len(lora_a_stacked)): + y[i] += (x @ lora_a_stacked[i]) * scale + + Args: + y (Union[tuple[torch.Tensor, ...], torch.Tensor]): Output tensors + x (torch.Tensor): Input tensor + lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weights + scale (float): Scaling factor for the operation + """ + + x = x.view(-1, x.shape[-1]) + # TODO fuse these kernels + for slice_idx in range(len(lora_a_stacked)): + self._apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx], scale) + + def add_expand( + self, + y: torch.Tensor, + x: tuple[torch.Tensor, ...] 
| torch.Tensor, + lora_b_stacked: tuple[torch.Tensor, ...], + output_slices: tuple[int, ...], + offset_start: int = 0, + add_inputs=True, + **kwargs, + ) -> None: + """ + Performs GEMM for multiple slices of lora_b. + + Semantics: + for i in range(len(lora_b_stacked)): + slice = output_slices[i] + y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + offset += slice + + Args: + y (torch.Tensor): Output tensor. + x (Union[tuple[torch.Tensor, ...], torch.Tensor]): Input tensors + lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight + output_slices (tuple[int, ...]): Every slice's size + add_inputs (bool): Defaults to True. + """ + y_org = y + y = y.view(-1, y.shape[-1]) + offset_left = offset_start + for slice_idx in range(len(lora_b_stacked)): + self._apply_expand( + y, + x[slice_idx], + lora_b_stacked[slice_idx], + offset_left, + output_slices[slice_idx], + add_inputs=add_inputs, + ) + offset_left += output_slices[slice_idx] + y = y.view_as(y_org) + + def add_lora_embedding( + self, + y: torch.Tensor, + x: torch.Tensor, + lora_b_stacked: torch.Tensor, + add_inputs: bool = True, + **kwargs, + ) -> None: + """ + Applies lora specifically for VocabParallelEmbeddingWithLoRA. + + Semantics: + y += x @ lora_b_stacked + + Args: + y (torch.Tensor): Output tensor. + x (torch.Tensor): Input tensor. + lora_b_stacked (torch.Tensor): lora_b's weights. + add_inputs (bool): Default to True. + """ + + # Embedding layer only need expand op + expand_fun: Callable = ( + self._expand_prefill if self.is_prefill else self._expand_decode + ) + expand_fun(y, x, lora_b_stacked, add_inputs) + + def add_lora_linear( + self, + y: torch.Tensor, + x: torch.Tensor, + lora_a_stacked: tuple[torch.Tensor, ...], + lora_b_stacked: tuple[torch.Tensor, ...], + scale: float, + output_slices: tuple[int, ...], + *, + buffer: tuple[torch.Tensor, ...] | None = None, + **kwargs, + ) -> None: + """ + Applicable to linear-related lora. 
def add_lora_logits(
    self,
    y: torch.Tensor,
    x: torch.Tensor,
    lora_a_stacked: torch.Tensor,
    lora_b_stacked: torch.Tensor,
    scale,
    *,
    buffer: torch.Tensor | None = None,
    **kwargs,
) -> None:
    """
    Applies lora specifically for LogitsProcessorWithLoRA.

    Semantics:
        buffer = (x @ lora_a_stacked) * scale
        y += buffer @ lora_b_stacked

    Args:
        y (torch.Tensor): Output tensor, updated in place through a 2-D view.
        x (torch.Tensor): Input tensor.
        lora_a_stacked (torch.Tensor): lora_a's weights.
        lora_b_stacked (torch.Tensor): lora_b's weights.
        scale (float): Scaling factor.
        buffer (Optional[torch.Tensor]): Intermediate shrink output. When
            None, a zeroed float32 buffer of shape (num_rows, rank) is
            allocated here.
    """
    # Both views share storage with the caller's tensors, so the kernels'
    # writes to `y_2d` are visible through `y` without reshaping back.
    y_2d = y.view(-1, y.shape[-1])
    x_2d = x.view(-1, x.shape[-1])
    if buffer is None:
        rank = lora_b_stacked.size(-1)
        # We set the buffer to be float32 by default, consistent with the
        # triton op
        buffer = torch.zeros(
            (x_2d.size(0), rank), dtype=torch.float32, device=x.device
        )
    # LogitsProcessorWithLoRA always using bgmv.
    bgmv_shrink(x_2d, lora_a_stacked, buffer, self.sampler_indices, scale)
    bgmv_expand(buffer, lora_b_stacked, y_2d, self.sampler_indices, add_inputs=True)
def __init__(
    self,
    max_num_batched_tokens: int,
    max_batches: int,
    device: torch.device | str,
    **kwargs,
):
    """
    Initialise the base wrapper state and the two kernel-metadata holders.

    Args:
        max_num_batched_tokens (int): Forwarded to the base class and used
            to size both LoRAKernelMeta objects.
        max_batches (int): Forwarded to the base class.
        device (torch.device | str): Device for the metadata.
        **kwargs: Must contain "max_loras".
    """
    PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches, device)

    max_loras = kwargs["max_loras"]
    self.max_loras = max_loras

    def _make_meta():
        return LoRAKernelMeta.make(max_loras, max_num_batched_tokens, device=device)

    # Per-token mapping, used by the linear/embedding paths.
    self.token_mapping_meta = _make_meta()

    # When speculative decoding is enabled, max_num_samples is
    # max_batches * (num_speculative_decoding_tokens + 1).
    # This line can be optimized by replacing max_num_batched_tokens
    # to max_batches * (num_speculative_decoding_tokens + 1).
    self.prompt_mapping_meta = _make_meta()
def add_expand(
    self,
    y: torch.Tensor,
    x: torch.Tensor,
    lora_b_stacked: tuple[torch.Tensor, ...],
    output_slices: tuple[int, ...],
    offset_start: int = 0,
    add_inputs=True,
    **kwargs,
) -> None:
    """
    Performs GEMM for multiple slices of lora_b.

    Semantics:
        for i in range(len(lora_b_stacked)):
            slice = output_slices[i]
            y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i]
            offset += slice

    Args:
        y (torch.Tensor): Output tensor, updated in place through a 2-D view.
        x (torch.Tensor): Input tensors; must be 3-D with one leading entry
            per output slice.
        lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight
        output_slices (tuple[int, ...]): Every slice's size
        offset_start (int): Column offset forwarded to the kernel.
        add_inputs (bool): Forwarded to the kernel. Defaults to True.
    """
    # The view shares storage with `y`, so no reshape back is required.
    y_2d = y.view(-1, y.shape[-1])

    assert x.ndim == 3
    assert x.size(0) == len(output_slices)
    num_tokens = x.size(1)  # first dimension is the num slices

    lora_expand(
        x,
        lora_b_stacked,
        y_2d,
        *self.token_mapping_meta.meta_args(num_tokens),
        offset_start=offset_start,
        # Honour the caller's flag (previously hard-coded to True), matching
        # the other backends' add_expand implementations.
        add_inputs=add_inputs,
    )
def add_lora_linear(
    self,
    y: torch.Tensor,
    x: torch.Tensor,
    lora_a_stacked: tuple[torch.Tensor, ...],
    lora_b_stacked: tuple[torch.Tensor, ...],
    scale: float,
    output_slices: tuple[int, ...],
    *,
    buffer: torch.Tensor | None = None,
    **kwargs,
) -> None:
    """
    Applicable to linear-related lora.

    Semantics:
        for i in range(len(lora_a_stacked)):
            y[i] += (
                x[i].unsqueeze(0)
                @ lora_a_stacked[indices[i], layer_idx, :, :]
                @ lora_b_stacked[indices[i], layer_idx, :, :]
                * scale
                ).squeeze(0)

    When `VLLM_USE_LORA_FUSION` is set, there is a single slice, and
    `lora_gemv_optim_condition` accepts the problem sizes, the work is
    handed to the fused `ixformer` op and this method returns early.
    Otherwise it runs `add_shrink` into a fresh float32 buffer followed by
    `add_expand`.

    Args:
        y (torch.Tensor): Output tensor. Will be changed in-place.
        x (torch.Tensor): Input tensor
        lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight.
        lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight.
        scale (float): Scaling factor.
        output_slices (tuple[int, ...]): Every slice's size.
        buffer (Optional[torch.Tensor]): Must be None on the non-fused
            path; the buffer is always allocated here.
    """

    assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices)

    import vllm.envs as env
    if env.VLLM_USE_LORA_FUSION:
        # Function-local alias: inside this method `ops` refers to the
        # ixformer module, not to any module-level name called `ops`.
        import ixformer.inference.functions as ops

        num_token, m = x.size(0), x.size(-1)
        k, n = lora_b_stacked[0].size(-1), y.size(-1)
        if len(lora_a_stacked) == 1 and ops.lora_gemv_optim_condition(num_token, m, k, n):
            # NOTE(review): the fused call passes scale=1.0 and
            # output_slices=(1,) instead of this method's `scale` and
            # `output_slices` arguments; confirm against the ixformer API
            # that this is intended.
            ops.add_lora_linear(y, x, lora_a_stacked, lora_b_stacked,
                lora_bias_stacked = None, scale = 1.0, output_slices = (1,))
            return

    assert buffer is None, (
        "To minimize overhead, the buffer should be created by "
        ".add_lora_linear() instead of being passed in."
    )
    r = lora_b_stacked[0].size(-1)
    # We set the buffer to be float32 by default, refer to:
    # https://github.com/triton-lang/triton/issues/1387
    # Note: buffer is zeroed inside the shrink op
    buffer = torch.empty(
        (len(output_slices), x.size(0), r), dtype=torch.float32, device=x.device
    )

    self.add_shrink(
        buffer,  # type: ignore
        x,
        lora_a_stacked,
        scale,
        **kwargs,
    )
    self.add_expand(
        y,
        buffer,  # type: ignore
        lora_b_stacked,
        output_slices,
        add_inputs=True,
        **kwargs,
    )
def moe_lora_align_block_size(
    self,
    topk_ids: torch.Tensor,
    num_tokens: int,
    block_size: int,
    num_experts: int,
    max_loras: int,
    adapter_enabled: torch.Tensor,
    expert_map: torch.Tensor | None = None,
    pad_sorted_ids: bool = False,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Aligns tokens and experts into block-sized chunks for LoRA-based
    mixture-of-experts (MoE) execution.

    Allocates the three int32 output buffers, lets
    `ops.moe_lora_align_block_size` fill them, optionally remaps expert ids
    through `expert_map`, and returns
    `(sorted_ids, expert_ids, num_tokens_post_pad)`.
    """
    device = topk_ids.device

    def _int32_buffer(length):
        # Uninitialised storage; contents are written by the custom op.
        return torch.empty((length,), dtype=torch.int32, device=device)

    padded_tokens = topk_ids.numel() + num_experts * (block_size - 1)
    if pad_sorted_ids:
        padded_tokens = round_up(padded_tokens, block_size)
    num_m_blocks = triton.cdiv(padded_tokens, block_size)

    sorted_ids = _int32_buffer(max_loras * padded_tokens)
    # Expert ids must be set default to -1 to prevent a blank block
    # NOTE(review): the buffer is created uninitialised here; presumably the
    # custom op writes the -1 defaults -- confirm.
    expert_ids = _int32_buffer(max_loras * num_m_blocks)
    num_tokens_post_pad = _int32_buffer(max_loras)

    (token_lora_mapping, _, _, _, lora_ids, _) = self.token_mapping_meta.meta_args(
        num_tokens
    )

    ops.moe_lora_align_block_size(
        topk_ids,
        token_lora_mapping,
        num_experts,
        block_size,
        max_loras,
        padded_tokens,
        num_m_blocks,
        sorted_ids,
        expert_ids,
        num_tokens_post_pad,
        adapter_enabled,
        lora_ids,
    )

    if expert_map is None:
        return sorted_ids, expert_ids, num_tokens_post_pad
    return sorted_ids, expert_map[expert_ids], num_tokens_post_pad
def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase:
    """
    Build the punica wrapper for the current platform.

    The platform reports a qualified class name, the class is resolved from
    it, and it is instantiated with the given arguments. The short class
    name is logged once.
    """
    qualname = current_platform.get_punica_wrapper()
    wrapper = resolve_obj_by_qualname(qualname)(*args, **kwargs)
    assert wrapper is not None, (
        "the punica_wrapper_qualname(" + qualname + ") is wrong."
    )
    short_name = qualname.rsplit(".", 1)[1]
    logger.info_once("Using %s.", short_name)
    return wrapper
+ self._token_lora_indices = self._token_lora_indices.to(dtype=torch.int32) + self._sampler_indices = self._sampler_indices.to(dtype=torch.int32) + self._sampler_indices_padded = self._sampler_indices_padded.to( + dtype=torch.int32 + ) + + torch.ops.xla.dynamo_set_buffer_donor_(self._token_lora_indices, True) + torch.ops.xla.dynamo_set_buffer_donor_(self._sampler_indices, True) + torch.ops.xla.dynamo_set_buffer_donor_(self._sampler_indices_padded, True) + torch.ops.xla.dynamo_set_buffer_donor_(self._embeddings_indices, True) + torch.ops.xla.dynamo_set_buffer_donor_(self._lora_indices_per_batch, True) + + torch._dynamo.mark_dynamic(self._token_lora_indices, 0) + torch._dynamo.mark_dynamic(self._embeddings_indices, 1) + torch._dynamo.mark_dynamic(self._sampler_indices_padded, 0) + + def _get_token_lora_indices(self, x: torch.Tensor) -> torch.IntTensor: + return torch.narrow(self._token_lora_indices, 0, 0, x.size(0)) + + @property + def embeddings_indices(self) -> torch.Tensor: + """ + This property provides access to the indices used for lora embeddings, + specifically for VocabParallelEmbeddingWithLoRA. + """ + return self._embeddings_indices[:] + + @property + def sampler_indices_padded(self) -> torch.Tensor: + """ + This property provides access to padded sampler indices. 
+ """ + return self._sampler_indices_padded[:] + + def shrink( + self, + x: torch.Tensor, + w_t_all: torch.Tensor, + scale: float, + ): + return bgmv_shrink(x, w_t_all, self._get_token_lora_indices(x), scale) + + def expand( + self, y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, add_inputs: bool + ): + return bgmv_expand(x, w_t_all, y, self._get_token_lora_indices(x), add_inputs) + + def expand_slice( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + y_offset: int, + y_slice_size: int, + add_inputs: bool, + ) -> torch.Tensor: + return bgmv_expand_slice( + x, + w_t_all, + y, + self._get_token_lora_indices(x), + y_offset, + y_slice_size, + add_inputs, + ) + + def add_shrink( + self, + y: tuple[torch.Tensor, ...] | torch.Tensor, + x: torch.Tensor, + lora_a_stacked: tuple[torch.Tensor, ...], + scale: float, + **kwargs, + ) -> torch.Tensor | None: + """ + Performs GEMM for multiple slices of lora_a. + + Semantics: + for i in range(len(lora_a_stacked)): + y[i] += (x @ lora_a_stacked[i]) * scale + + Args: + y (Union[tuple[torch.Tensor, ...], torch.Tensor]): Output tensors + x (torch.Tensor): Input tensor + lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weights + scale (float): Scaling factor for the operation + """ + + torch.ops.xla.dynamo_set_buffer_donor_(y, True) + x = x.view(-1, x.shape[-1]) + + for slice_idx in range(len(lora_a_stacked)): + lora_s = lora_a_stacked[slice_idx] + y_s = self.shrink(x, lora_s, scale) + y[slice_idx, :, :] = y_s # type: ignore[index] + return y + + def add_expand( + self, + y: torch.Tensor, + x: tuple[torch.Tensor, ...] | torch.Tensor, + lora_b_stacked: tuple[torch.Tensor, ...], + output_slices: tuple[int, ...], + offset_start: int = 0, + add_inputs=True, + **kwargs, + ) -> torch.Tensor: + """ + Performs GEMM for multiple slices of lora_b. 
+ + Semantics: + for i in range(len(lora_b_stacked)): + slice = output_slices[i] + y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + offset += slice + + Args: + y (torch.Tensor): Output tensor. + x (Union[tuple[torch.Tensor, ...], torch.Tensor]): Input tensors + lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight + output_slices (tuple[int, ...]): Every slice's size + add_inputs (bool): Defaults to True. + """ + y_org = y + y = y.view(-1, y.shape[-1]) + offset_left = 0 + + for slice_idx in range(len(lora_b_stacked)): + y = self.expand_slice( + y, + x[slice_idx], + lora_b_stacked[slice_idx], + offset_left, + output_slices[slice_idx], + add_inputs=add_inputs, + ) + offset_left += output_slices[slice_idx] + return y.view_as(y_org) + + def add_lora_embedding( + self, + y: torch.Tensor, + x: torch.Tensor, + lora_b_stacked: torch.Tensor, + add_inputs: bool = True, + **kwargs, + ) -> torch.Tensor: + """ + Applies lora specifically for VocabParallelEmbeddingWithLoRA. + + Semantics: + y += x @ lora_b_stacked + + Args: + y (torch.Tensor): Output tensor. + x (torch.Tensor): Input tensor. + lora_b_stacked (torch.Tensor): lora_b's weights. + add_inputs (bool): Default to True. + """ + + # Embedding layer only needs the expand op + return self.expand(y, x, lora_b_stacked, add_inputs) + + def add_lora_linear( + self, + y: torch.Tensor, + x: torch.Tensor, + lora_a_stacked: tuple[torch.Tensor, ...], + lora_b_stacked: tuple[torch.Tensor, ...], + scale: float, + output_slices: tuple[int, ...], + *, + buffer: tuple[torch.Tensor, ...] | None = None, + **kwargs, + ) -> torch.Tensor: + """ + Applicable to linear-related lora. + + Semantics: + for i in range(len(lora_a_stacked)): + y[i] += ( + x[i].unsqueeze(0) + @ lora_a_stacked[indices[i], layer_idx, :, :] + @ lora_b_stacked[indices[i], layer_idx, :, :] + * scale + ).squeeze(0) + + Args: + y (torch.Tensor): Output tensor. Will not be changed in-place. 
+ x (torch.Tensor): Input tensor (T, E) + lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight. + lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight. + scale (float): Scaling factor. + output_slices (tuple[int, ...]): Every slice's size. + buffer (Optional[tuple[torch.Tensor, ...]]): Defaults to None. + """ + + assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) + + if buffer is None: + r = lora_b_stacked[0].size(-1) + T = x.size(0) + buffer = torch.zeros( + (len(output_slices), T, r), + dtype=x.dtype, + device=x.device, + ) + buffer = self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs) + return self.add_expand( + y, buffer, lora_b_stacked, output_slices, add_inputs=True, **kwargs + ) + + def add_lora_logits( + self, + y: torch.Tensor, + x: torch.Tensor, + lora_a_stacked: torch.Tensor, + lora_b_stacked: torch.Tensor, + scale, + *, + buffer: torch.Tensor | None = None, + **kwargs, + ) -> torch.Tensor: + """ + Applies lora specifically for LogitsProcessorWithLoRA. + + Semantics: + buffer = (x @ lora_a_stacked) * scale + y += buffer @ lora_b_stacked + + Args: + y (torch.Tensor): Output tensor. + x (torch.Tensor): Input tensor. + lora_a_stacked (torch.Tensor): lora_a's weights. + lora_b_stacked (torch.Tensor):lora_b's weights. + scale (float): Scaling factor. + buffer (Optional[torch.Tensor]):Default to None. 
+ """ + y_org = y + y = y.view(-1, y.shape[-1]) + x = x.view(-1, x.shape[-1]) + + sampler_indices = torch.narrow(self._sampler_indices, 0, 0, x.size(0)) + buffer = bgmv_shrink(x, lora_a_stacked, sampler_indices, scale) + y = bgmv_expand(buffer, lora_b_stacked, y, sampler_indices, add_inputs=True) + return y.view_as(y_org) + + # This performs the same tensor ops as the base method, except it does them + # on the CPU then transfers the results to the TPU + def _update_base_metadata( + self, + mapping: "LoRAMapping", + lora_index_to_id: list[int | None], + max_loras: int, + vocab_size: int, + extra_vocab_size: int, + ): + # Make sure we don't accidentally collect outside operations + torch_xla.sync() + + # Pad the prompt mapping to avoid running into recompiles on the TPU + # TODO: Should this happen inside mapping internally? If so how can we + # avoid having backend specific LoRAMapping classes? + mapping.prompt_mapping = self._pad_prompt_mapping(mapping.prompt_mapping) + + ( + base_indices, + sampler_indices, + sampler_indices_padded, + embeddings_indices, + indices_len, + ) = convert_mapping( + mapping, + lora_index_to_id, + max_loras, + vocab_size, + extra_vocab_size, + "cpu", + ) + self._token_lora_indices = self._pad_to_shape( + base_indices, self._token_lora_indices.shape, dims=1 + ).to(self.device) + self._sampler_indices = self._pad_to_shape( + sampler_indices, self._sampler_indices.shape, dims=1 + ).to(self.device) + self._sampler_indices_padded = self._pad_to_shape( + sampler_indices_padded, self._sampler_indices_padded.shape, dims=1 + ).to(self.device) + self._embeddings_indices = self._pad_to_shape( + embeddings_indices, self._embeddings_indices.shape, dims=2 + ).to(self.device) + self.indices_len[:] = indices_len + + def _update_prefill_metadata(self, token_lora_tensor: torch.Tensor) -> None: + self.batch_size = 1 + self._lora_indices_per_batch[: self.batch_size] = token_lora_tensor[ + : self.batch_size + ] + + def _pad_prompt_mapping(self, 
prompt_mapping: tuple[int, ...]) -> tuple[int, ...]: + num_reqs = len(prompt_mapping) + + # From vllm/v1/worker/tpu_model_runner:51, but need to avoid a circular + # import + MIN_NUM_SEQS = 8 + + padded_num_reqs = max(2 ** math.ceil(math.log2(num_reqs)), MIN_NUM_SEQS) + pad_len = padded_num_reqs - num_reqs + + padding = [-1] * pad_len + return tuple(list(prompt_mapping) + padding) + + def _pad_to_shape(self, src, target_shape, dims=1): + if dims == 1: + pad_len = target_shape[0] - src.shape[0] + return F.pad(src, (0, pad_len), value=0).to(torch.int32) + else: + pad_rows = target_shape[0] - src.shape[0] + pad_cols = target_shape[1] - src.shape[1] + return F.pad(src, (0, pad_cols, 0, pad_rows), value=0).to(torch.int32) diff --git a/lora/punica_wrapper/punica_xpu.py b/lora/punica_wrapper/punica_xpu.py new file mode 100644 index 0000000..b95087d --- /dev/null +++ b/lora/punica_wrapper/punica_xpu.py @@ -0,0 +1,279 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Based on: +Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). +Punica: Multi-Tenant LoRA Serving. +https://arxiv.org/abs/2310.18547 +""" + +from typing import final + +import torch + +from vllm.lora.layers import LoRAMapping +from vllm.lora.ops.ipex_ops import bgmv_expand, bgmv_expand_slice, bgmv_shrink + +from .punica_base import PunicaWrapperBase + + +@final +class PunicaWrapperXPU(PunicaWrapperBase): + """ + PunicaWrapperXPU is designed to manage and provide metadata for the punica + kernel. The main function is to maintain the state information for + Multi-LoRA, and to provide the interface for the punica ipex kernel. 
def _apply_expand(
    self,
    y: torch.Tensor,
    x: torch.Tensor,
    w_t_all: torch.Tensor,
    y_offset: int,
    y_slice_size: int,
    add_inputs: bool,
):
    """
    Run the expand-slice kernel for one lora_b slice.

    The per-token LoRA indices are narrowed to `x.size(0)` rows and passed
    to `bgmv_expand_slice` together with the column offset and slice size.
    """
    bgmv_expand_slice(
        x,
        w_t_all,
        y,
        self._get_token_lora_indices(x),
        y_offset,
        y_slice_size,
        add_inputs,
    )
def add_expand(
    self,
    y: torch.Tensor,
    x: torch.Tensor,
    lora_b_stacked: tuple[torch.Tensor, ...],
    output_slices: tuple[int, ...],
    offset_start: int = 0,
    add_inputs=True,
    **kwargs,
) -> None:
    """
    Performs GEMM for multiple slices of lora_b.

    Semantics:
        for i in range(len(lora_b_stacked)):
            slice = output_slices[i]
            y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i]
            offset += slice

    Args:
        y (torch.Tensor): Output tensor, updated in place through a 2-D view.
        x (torch.Tensor): Input tensors; must be 3-D with one leading entry
            per output slice.
        lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight
        output_slices (tuple[int, ...]): Every slice's size
        offset_start (int): Column offset of the first slice. Defaults to 0.
        add_inputs (bool): Defaults to True.
    """
    # The view shares storage with `y`; the original's trailing
    # `y.view_as(y_org)` discarded its result and has been dropped.
    y_2d = y.view(-1, y.shape[-1])

    assert x.ndim == 3
    assert x.size(0) == len(output_slices)

    # Track the running offset in a local instead of mutating the parameter.
    offset = offset_start
    # TODO fuse these kernels
    for slice_idx, lora_b in enumerate(lora_b_stacked):
        slice_size = output_slices[slice_idx]
        self._apply_expand(
            y_2d,
            x[slice_idx],
            lora_b,
            offset,
            slice_size,
            add_inputs=add_inputs,
        )
        offset += slice_size
+ """ + token_lora_indices = self._get_token_lora_indices(x) + bgmv_expand(x, lora_b_stacked, y, token_lora_indices, add_inputs) + + def add_lora_linear( + self, + y: torch.Tensor, + x: torch.Tensor, + lora_a_stacked: tuple[torch.Tensor, ...], + lora_b_stacked: tuple[torch.Tensor, ...], + scale: float, + output_slices: tuple[int, ...], + *, + buffer: torch.Tensor | None = None, + **kwargs, + ) -> None: + """ + Applicable to linear-related lora. + + Semantics: + for i in range(len(lora_a_stacked)): + y[i] += ( + x[i].unsqueeze(0) + @ lora_a_stacked[indices[i], layer_idx, :, :] + @ lora_b_stacked[indices[i], layer_idx, :, :] + * scale + ).squeeze(0) + + Args: + y (torch.Tensor): Output tensor. Will be changed in-place. + x (torch.Tensor): Input tensor + lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight. + lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight. + scale (float): Scaling factor. + output_slices (tuple[int, ...]): Every slice's size. + buffer (Optional[torch.Tensor]): Defaults to None. + """ + + assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) + + if buffer is None: + r = lora_b_stacked[0].size(-1) + # We set the buffer to be float32 by default, refer to: + # https://github.com/triton-lang/triton/issues/1387 + buffer = torch.zeros( # type: ignore + (len(output_slices), x.size(0), r), + dtype=torch.float32, + device=x.device, + ) + self.add_shrink( + buffer, # type: ignore + x, + lora_a_stacked, + scale, + **kwargs, + ) + self.add_expand( + y, + buffer, # type: ignore + lora_b_stacked, + output_slices, + add_inputs=True, + **kwargs, + ) + + @property + def sampler_indices_padded(self) -> torch.Tensor: + """ + This property provides access to padded sampler indices. 
def compute_meta(
    token_lora_tensor: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int, bool]:
    """
    Get the information required for the sgmv kernel. With the features:
    1. If consecutive requests in the batch use the same LoRA, this function
    will combine them into a single request, improving sgmv kernel inference
    performance.
    2. At the beginning of each prefill stage inference, recalculations are
    needed based on the input, but only once.

    Args:
        token_lora_tensor (torch.Tensor): LoRA index per token; runs of
            equal consecutive values are merged.

    Returns:
        A tuple `(b_seq_start_tensor, seq_length_tensor,
        lora_indices_tensor, batch_size, max_length, token_nums, no_lora)`:
        start offset and length of each run, the LoRA index of each run, the
        number of runs, the longest run, the total token count, and whether
        LoRA can be skipped. `no_lora` is True when the whole input is one
        run of -1, and also when the input is empty.
    """
    token_nums = token_lora_tensor.numel()
    if token_nums == 0:
        # Nothing to schedule: return zero-length metadata instead of
        # failing on max() of an empty tensor.
        empty_counts = token_lora_tensor.new_zeros(0, dtype=torch.long)
        return (
            empty_counts,
            empty_counts.clone(),
            token_lora_tensor.new_empty(0),
            0,
            0,
            0,
            True,
        )

    lora_indices_tensor, seq_length_tensor = torch.unique_consecutive(
        token_lora_tensor, return_counts=True
    )
    # Exclusive prefix sum of the run lengths gives each run's start offset.
    cum_result = torch.cumsum(seq_length_tensor, dim=0)
    b_seq_start_tensor = torch.zeros_like(seq_length_tensor)
    b_seq_start_tensor[1:].copy_(cum_result[:-1])
    max_length = seq_length_tensor.max().item()
    batch_size = lora_indices_tensor.size(0)
    # -1 means no lora should be applied. Use `no_lora` to determine whether
    # the current step requires LoRA. If LoRA is not needed, the prefill stage
    # does not need to launch the triton kernel, which can improve performance
    no_lora = batch_size == 1 and lora_indices_tensor.item() == -1
    return (
        b_seq_start_tensor,
        seq_length_tensor,
        lora_indices_tensor,
        batch_size,
        max_length,
        # The run lengths always sum to the element count, so numel() stands
        # in for a .sum().item() device round-trip.
        token_nums,
        no_lora,
    )
For generation, this will be the + same as base_indices. For prefill, this will map requests + to LoRA indices. + sampler_indices_padded: Tensor of shape [batch_size] mapping + requests to LoRA indices for sampler with padding. + Same as sampler_indices, but -1 is replaced with + max_loras. + embeddings_indices: Tensor of shape [2, batch_size] mapping + requests to embedding indices. First row is for embeddings + added by the LoRAs, second row is for the LoRA.lora_a + embeddings. + indices_len: List of lengths of the above tensors. It contains + (base_indices, sampler_indices, sampler_indices_padded, + embeddings_indices). + """ + index_mapping_indices: list[int] = list(mapping.index_mapping).copy() + embedding_indices = index_mapping_indices.copy() + lora_indices = index_mapping_indices.copy() + + prompt_mapping: list[int] = [ + lora_index_to_id.index(x) if x > 0 else -1 for x in mapping.prompt_mapping + ] + lora_idx = None + for i in range(len(index_mapping_indices)): + # TODO index can be slow. 
optimize + lora_idx = ( + lora_index_to_id.index(index_mapping_indices[i]) + if index_mapping_indices[i] > 0 + else -1 + ) + embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0 + lora_indices[i] = lora_idx + + indices_list: list[list[int] | torch.Tensor] = [ + index_mapping_indices, + lora_indices, + embedding_indices, + ] + + indices = torch.tensor(indices_list, dtype=torch.long, device=device) + prompt_mapping_tensor = torch.tensor( + prompt_mapping, dtype=torch.long, device=device + ) + embeddings_indices = torch.stack( + [ + indices[2] * extra_vocab_size, + indices[2] * (vocab_size + extra_vocab_size), + ] + ) + embeddings_indices = torch.where( + embeddings_indices == -1, max_loras - 1, embeddings_indices + ) + base_indices = indices[1] + sampler_indices = prompt_mapping_tensor + sampler_indices_padded = sampler_indices.clone() + sampler_indices_padded = torch.where( + sampler_indices_padded == -1, max_loras - 1, sampler_indices_padded + ) + sampler_indices_padded = torch.arange( + 0, len(sampler_indices_padded), device=device, dtype=torch.long + ) + (sampler_indices_padded * len(sampler_indices_padded)) + + # Contain length of indices tensors. Used to index into each tensor. + indices_len = [ + base_indices.shape[-1], + sampler_indices.shape[-1], + sampler_indices_padded.shape[-1], + embeddings_indices.shape[-1], + ] + + return ( + base_indices, + sampler_indices, + sampler_indices_padded, + embeddings_indices, + indices_len, + ) diff --git a/lora/request.py b/lora/request.py new file mode 100644 index 0000000..c97e435 --- /dev/null +++ b/lora/request.py @@ -0,0 +1,100 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import warnings + +import msgspec + + +class LoRARequest( + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + array_like=True, +): # type: ignore[call-arg] + """ + Request for a LoRA adapter. + + Note that this class should be used internally. 
For online + serving, it is recommended to not allow users to use this class but + instead provide another layer of abstraction to prevent users from + accessing unauthorized LoRA adapters. + + lora_int_id must be globally unique for a given adapter. + This is currently not enforced in vLLM. + """ + + lora_name: str + lora_int_id: int + lora_path: str = "" + lora_local_path: str | None = msgspec.field(default=None) + long_lora_max_len: int | None = None + base_model_name: str | None = msgspec.field(default=None) + tensorizer_config_dict: dict | None = None + + def __post_init__(self): + if self.lora_int_id < 1: + raise ValueError(f"id must be > 0, got {self.lora_int_id}") + if self.lora_local_path: + warnings.warn( + "The 'lora_local_path' attribute is deprecated " + "and will be removed in a future version. " + "Please use 'lora_path' instead.", + DeprecationWarning, + stacklevel=2, + ) + if not self.lora_path: + self.lora_path = self.lora_local_path or "" + + # Ensure lora_path is not empty + assert self.lora_path, "lora_path cannot be empty" + + @property + def adapter_id(self): + return self.lora_int_id + + @property + def name(self): + return self.lora_name + + @property + def path(self): + return self.lora_path + + @property + def local_path(self): + warnings.warn( + "The 'local_path' attribute is deprecated " + "and will be removed in a future version. " + "Please use 'path' instead.", + DeprecationWarning, + stacklevel=2, + ) + return self.lora_path + + @local_path.setter + def local_path(self, value): + warnings.warn( + "The 'local_path' attribute is deprecated " + "and will be removed in a future version. " + "Please use 'path' instead.", + DeprecationWarning, + stacklevel=2, + ) + self.lora_path = value + + def __eq__(self, value: object) -> bool: + """ + Overrides the equality method to compare LoRARequest + instances based on lora_name. This allows for identification + and comparison lora adapter across engines. 
+ """ + return isinstance(value, self.__class__) and self.lora_name == value.lora_name + + def __hash__(self) -> int: + """ + Overrides the hash method to hash LoRARequest instances + based on lora_name. This ensures that LoRARequest instances + can be used in hash-based collections such as sets and dictionaries, + identified by their names across engines. + """ + return hash(self.lora_name) diff --git a/lora/resolver.py b/lora/resolver.py new file mode 100644 index 0000000..bcfe264 --- /dev/null +++ b/lora/resolver.py @@ -0,0 +1,88 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from abc import ABC, abstractmethod +from collections.abc import Set +from dataclasses import dataclass, field + +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest + +logger = init_logger(__name__) + + +class LoRAResolver(ABC): + """Base class for LoRA adapter resolvers. + + This class defines the interface for resolving and fetching LoRA adapters. + Implementations of this class should handle the logic for locating and + downloading LoRA adapters from various sources (e.g. S3, cloud storage, + etc.). + """ + + @abstractmethod + async def resolve_lora( + self, base_model_name: str, lora_name: str + ) -> LoRARequest | None: + """Abstract method to resolve and fetch a LoRA model adapter. + + Implements logic to locate and download LoRA adapter based on the name. + Implementations might fetch from a blob storage or other sources. + + Args: + base_model_name: The name/identifier of the base model to resolve. + lora_name: The name/identifier of the LoRA model to resolve. + + Returns: + Optional[LoRARequest]: The resolved LoRA model information, or None + if the LoRA model cannot be found. 
+ """ + pass + + +@dataclass +class _LoRAResolverRegistry: + resolvers: dict[str, LoRAResolver] = field(default_factory=dict) + + def get_supported_resolvers(self) -> Set[str]: + """Get all registered resolver names.""" + return self.resolvers.keys() + + def register_resolver( + self, + resolver_name: str, + resolver: LoRAResolver, + ) -> None: + """Register a LoRA resolver. + Args: + resolver_name: Name to register the resolver under. + resolver: The LoRA resolver instance to register. + """ + if resolver_name in self.resolvers: + logger.warning( + "LoRA resolver %s is already registered, and will be " + "overwritten by the new resolver instance %s.", + resolver_name, + resolver, + ) + + self.resolvers[resolver_name] = resolver + + def get_resolver(self, resolver_name: str) -> LoRAResolver: + """Get a registered resolver instance by name. + Args: + resolver_name: Name of the resolver to get. + Returns: + The resolver instance. + Raises: + KeyError: If the resolver is not found in the registry. + """ + if resolver_name not in self.resolvers: + raise KeyError( + f"LoRA resolver '{resolver_name}' not found. 
" + f"Available resolvers: {list(self.resolvers.keys())}" + ) + return self.resolvers[resolver_name] + + +LoRAResolverRegistry = _LoRAResolverRegistry() diff --git a/lora/utils.py b/lora/utils.py new file mode 100644 index 0000000..0f43ff0 --- /dev/null +++ b/lora/utils.py @@ -0,0 +1,293 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os +from typing import TYPE_CHECKING, Optional + +import huggingface_hub +import regex as re +from huggingface_hub.utils import ( + EntryNotFoundError, + HfHubHTTPError, + HFValidationError, + RepositoryNotFoundError, +) +from torch import nn +from transformers import PretrainedConfig + +from vllm.config.lora import LoRAConfig +from vllm.logger import init_logger + +# being imported for _all_lora_classes below +from vllm.lora.layers import ( + BaseLayerWithLoRA, + ColumnParallelLinearWithLoRA, + ColumnParallelLinearWithShardedLoRA, + FusedMoEWithLoRA, + LogitsProcessorWithLoRA, + MergedColumnParallelLinearWithLoRA, + MergedColumnParallelLinearWithShardedLoRA, + MergedQKVParallelLinearWithLoRA, + MergedQKVParallelLinearWithShardedLoRA, + QKVParallelLinearWithLoRA, + QKVParallelLinearWithShardedLoRA, + ReplicatedLinearWithLoRA, + RowParallelLinearWithLoRA, + RowParallelLinearWithShardedLoRA, + VocabParallelEmbeddingWithLoRA, +) +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.linear import LinearBase +from vllm.model_executor.utils import get_moe_expert_mapping, get_packed_modules_mapping + +if TYPE_CHECKING: + from vllm.model_executor.layers.logits_processor import LogitsProcessor + from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead + from vllm.model_executor.models.utils import WeightsMapper + +logger = init_logger(__name__) + +_all_lora_classes: set[type[BaseLayerWithLoRA]] = { + VocabParallelEmbeddingWithLoRA, + ColumnParallelLinearWithLoRA, + MergedColumnParallelLinearWithLoRA, + 
QKVParallelLinearWithLoRA, + MergedQKVParallelLinearWithLoRA, + RowParallelLinearWithLoRA, + ReplicatedLinearWithLoRA, + LogitsProcessorWithLoRA, + ColumnParallelLinearWithShardedLoRA, + QKVParallelLinearWithShardedLoRA, + MergedColumnParallelLinearWithShardedLoRA, + MergedQKVParallelLinearWithShardedLoRA, + RowParallelLinearWithShardedLoRA, + FusedMoEWithLoRA, +} + + +def is_moe_model(model: nn.Module) -> bool: + """Checks if the model contains FusedMoE layers and warns the user.""" + if any(isinstance(module, FusedMoE) for module in model.modules()): + logger.info_once("MoE model detected. Using fused MoE LoRA implementation.") + return True + return False + + +def from_layer( + layer: nn.Module, + max_loras: int, + lora_config: LoRAConfig, + packed_modules_list: list, + model_config: PretrainedConfig | None = None, +) -> nn.Module: + for lora_cls in _all_lora_classes: + # specifying kwargs so they can be easily accessed in decorator + if lora_cls.can_replace_layer( + source_layer=layer, + lora_config=lora_config, + packed_modules_list=packed_modules_list, + model_config=model_config, + ): + instance_layer = lora_cls(layer) + instance_layer.create_lora_weights(max_loras, lora_config, model_config) + return instance_layer + return layer + + +def from_layer_logits_processor( + layer: "LogitsProcessor", + lm_head: "ParallelLMHead", + max_loras: int, + lora_config: LoRAConfig, + model_config: PretrainedConfig | None = None, +) -> LogitsProcessorWithLoRA: + ret = LogitsProcessorWithLoRA( + layer, + lm_head.embedding_dim, + lm_head.weight.dtype, + lm_head.weight.device, + lm_head.get_sharded_to_full_mapping(), + ) + ret.create_lora_weights(max_loras, lora_config, model_config) + return ret + + +def replace_submodule( + model: nn.Module, module_name: str, new_module: nn.Module +) -> nn.Module: + """Replace a submodule in a model with a new module.""" + parent = model.get_submodule(".".join(module_name.split(".")[:-1])) + target_name = module_name.split(".")[-1] + 
setattr(parent, target_name, new_module) + return new_module + + +def parse_fine_tuned_lora_name( + name: str, weights_mapper: Optional["WeightsMapper"] = None +) -> tuple[str, bool]: + """Parse the name of lora weights. + + args: + name: the name of the fine-tuned LoRA, e.g. + base_model.model.dense1.weight + weights_mapper: maps the name of weight, e.g. + `model.` -> `language_model.model.`, + return: + tuple(module_name, is_lora_a): + module_name: the name of the module, e.g. model.dense1, + is_lora_a whether the tensor is lora_a or lora_b. + """ + + # LoRA weight qualified name usually starts with `base_model.model.`, + # so we remove the prefix `base_model.model.` to make the following + # mapping correctly. + if name.startswith("base_model.model."): + name = name.replace("base_model.model.", "") + name = weights_mapper._map_name(name) if weights_mapper else name + # recover the prefix `base_model.model.` + name = "base_model.model." + name + else: + name = weights_mapper._map_name(name) if weights_mapper else name + + # In some situations, we may not start with `base_model.model.`. + # If we don't (e.g., ibm-granite/granite-speech-3.3-8b), + # we should keep the prefix intact. + start_index = 2 if name.startswith("base_model.model.") else 0 + + parts = name.split(".") + if parts[-1] == "weight" and (parts[-2] == "lora_A" or parts[-2] == "lora_B"): + new_name = ".".join(parts[start_index:-2]) + return new_name, parts[-2] == "lora_A" + + if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B": + new_name = ".".join(parts[start_index:-1]) + return new_name, parts[-1] == "lora_embedding_A" + + raise ValueError(f"{name} is unsupported LoRA weight") + + +def is_regex_target_modules( + load_modules: str | list[str], expected_lora_modules: list[str] +) -> bool: + """ + PEFT supports passing `target_modules` in the form of regular expressions, + such as `model.*(q_proj|k_proj|v_proj)$`. 
This function is mainly used to + determine whether the suffix in the regular expression is present in the + `expected_lora_modules`. + """ + + def is_valid_regex(pattern): + try: + re.compile(pattern) + return True + except re.error: + return False + + def is_subset(sub_list, full_list): + return set(sub_list).issubset(set(full_list)) + + # Similar to PEFT's processing logic, regex-related operations are only + # executed when the load_modules is a `str`. + if not isinstance(load_modules, str): + return False + + if is_valid_regex(load_modules): + match = re.search(r"\((.*?)\)\$?$", load_modules) + if match: + suffix = match.group(1).split("|") + return is_subset(suffix, expected_lora_modules) + return False + + +def get_supported_lora_modules(model: nn.Module) -> list[str]: + """ + In vLLM, all linear layers support LoRA. + """ + + supported_lora_modules: set[str] = set() + for name, module in model.named_modules(): + # get the embedding modules if the module's embedding_modules + # is not empty. + embedding_modules = getattr(module, "embedding_modules", None) + if embedding_modules is not None: + for name in embedding_modules: + supported_lora_modules.add(name) + + # get all the linear subfixes. + if isinstance(module, (LinearBase,)): + supported_lora_modules.add(name.split(".")[-1]) + + if isinstance(module, (FusedMoE,)): + supported_lora_modules.add(name.split(".")[-1]) + + return list(supported_lora_modules) + + +def get_adapter_absolute_path(lora_path: str) -> str: + """ + Resolves the given lora_path to an absolute local path. + + If the lora_path is identified as a Hugging Face model identifier, + it will download the model and return the local snapshot path. + Otherwise, it treats the lora_path as a local file path and + converts it to an absolute path. + + Parameters: + lora_path (str): The path to the lora model, which can be an absolute path, + a relative path, or a Hugging Face model identifier. 
+ + Returns: + str: The resolved absolute local path to the lora model. + """ + + # Check if the path is an absolute path. Return it no matter exists or not. + if os.path.isabs(lora_path): + return lora_path + + # If the path starts with ~, expand the user home directory. + if lora_path.startswith("~"): + return os.path.expanduser(lora_path) + + # Check if the expanded relative path exists locally. + if os.path.exists(lora_path): + return os.path.abspath(lora_path) + + # If the path does not exist locally, assume it's a Hugging Face repo. + try: + local_snapshot_path = huggingface_hub.snapshot_download(repo_id=lora_path) + except ( + HfHubHTTPError, + RepositoryNotFoundError, + EntryNotFoundError, + HFValidationError, + ): + # Handle errors that may occur during the download + # Return original path instead of throwing error here + logger.exception("Error downloading the HuggingFace model") + return lora_path + + return local_snapshot_path + + +def process_packed_modules_mapping(model: nn.Module) -> dict[str, list[str]]: + if is_moe_model(model): + if moe_packed_mapping := get_moe_expert_mapping(model): + # This method generates and returns a dictionary mapping packed module + # names to lists of their corresponding submodule names. It includes + # both static mappings and dynamic mappings for expert layers, where + # the expert indices are expanded based on the configured number + # of routed experts. 
+ packed_modules_mapping = get_packed_modules_mapping(model) + + packed_modules_mapping["experts"] = [ + weight_name.rstrip(".") for _, weight_name, _, _ in moe_packed_mapping + ] + + return packed_modules_mapping + else: + raise AttributeError( + "To support LoRA for MoE model, " + "'get_expert_mapping' must be implemented" + ) + else: + return get_packed_modules_mapping(model) diff --git a/lora/worker_manager.py b/lora/worker_manager.py new file mode 100644 index 0000000..b85151f --- /dev/null +++ b/lora/worker_manager.py @@ -0,0 +1,279 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from contextlib import contextmanager +from typing import Any, Literal + +import torch + +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.lora.models import ( + LoRAModel, + LoRAModelManager, + LRUCacheLoRAModelManager, + create_lora_manager, +) +from vllm.lora.peft_helper import PEFTHelper +from vllm.lora.request import LoRARequest +from vllm.lora.utils import get_adapter_absolute_path + +logger = init_logger(__name__) + + +class WorkerLoRAManager: + """WorkerLoRAManager that manages LoRA models on the worker side. 
+ + Every request, the requested LoRAs will be loaded (unless they are already + loaded), and every other LoRA will be unloaded.""" + + _manager_cls: type[LoRAModelManager] = LoRAModelManager + + def __init__( + self, + vllm_config: VllmConfig, + device: torch.device, + embedding_modules: dict[str, str], + embedding_padding_modules: list[str], + lora_model_cls: type[LoRAModel] = LoRAModel, + ): + self._lora_model_cls = lora_model_cls + self.embedding_modules = embedding_modules + self.embedding_padding_modules = embedding_padding_modules + self._cached_dummy_lora: None | Literal[False] | LoRAModel = False + self.max_num_seqs = vllm_config.scheduler_config.max_num_seqs + self.max_num_batched_tokens = ( + vllm_config.scheduler_config.max_num_batched_tokens + ) + self.vocab_size = vllm_config.model_config.get_vocab_size() + self.lora_config = vllm_config.lora_config + + # Use get_text_config() in case of multimodal models + text_config = vllm_config.model_config.hf_config.get_text_config() + + self.max_position_embeddings = text_config.max_position_embeddings + self.device = device + # Lazily initialized by create_lora_manager. 
+ self._adapter_manager: LoRAModelManager + + @contextmanager + def dummy_lora_cache(self): + """Use this context manager to reuse the dummy lora model + to avoid creating it repeatedly.""" + self._cached_dummy_lora = None + yield + self._cached_dummy_lora = False + + @property + def is_enabled(self) -> bool: + return True + + def create_lora_manager( + self, + model: torch.nn.Module, + ) -> Any: + lora_manager = create_lora_manager( + model, + max_num_seqs=self.max_num_seqs, + max_num_batched_tokens=self.max_num_batched_tokens, + vocab_size=self.vocab_size, + lora_config=self.lora_config, + device=self.device, + lora_manager_cls=self._manager_cls, + ) + self._adapter_manager = lora_manager + return lora_manager.model + + def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel: + try: + supported_lora_modules = self._adapter_manager.supported_lora_modules + packed_modules_mapping = self._adapter_manager.packed_modules_mapping + expected_lora_modules: list[str] = [] + for module in supported_lora_modules: + if module in packed_modules_mapping: + expected_lora_modules.extend(packed_modules_mapping[module]) + else: + expected_lora_modules.append(module) + if module == "experts": + expected_lora_modules.append(module) + expected_lora_modules = list(set(expected_lora_modules)) + lora_path = get_adapter_absolute_path(lora_request.lora_path) + + peft_helper = PEFTHelper.from_local_dir( + lora_path, + self.max_position_embeddings, + lora_request.tensorizer_config_dict, + ) + + # Validates the LoRA configuration against requirements before + # loading weights, throwing an exception if validation fails. + peft_helper.validate_legal(self.lora_config) + + # For some models like Qwen2VL, we need to use hf_to_vllm_mapper + # to ensure correct loading of lora weights. 
+ model = self._adapter_manager.model + hf_to_vllm_mapper = getattr(model, "hf_to_vllm_mapper", None) + + lora = self._lora_model_cls.from_local_checkpoint( + lora_path, + expected_lora_modules, + peft_helper=peft_helper, + lora_model_id=lora_request.lora_int_id, + device="cpu", + dtype=self.lora_config.lora_dtype, + target_embedding_padding=self.vocab_size + + self.lora_config.lora_extra_vocab_size, + embedding_modules=self.embedding_modules, + embedding_padding_modules=self.embedding_padding_modules, + tensorizer_config_dict=lora_request.tensorizer_config_dict, + weights_mapper=hf_to_vllm_mapper, + ) + + except FileNotFoundError as e: + # FileNotFoundError should be raised if both + # - No adapter found to download from huggingface (or in + # offline mode) + # - No local adapter files found at `lora_request.lora_path` + # For NotFoundError + raise ValueError( + f"Loading lora {lora_request.lora_name} failed: No adapter " + f"found for {lora_request.lora_path}" + ) from e + except Exception as e: + # For BadRequestError + raise e + + if lora.extra_vocab_size > self.lora_config.lora_extra_vocab_size: + raise ValueError( + f"LoRA added vocab size {lora.extra_vocab_size} " + f"is greater than lora_extra_vocab_size " + f"{self.lora_config.lora_extra_vocab_size}." 
+ ) + return lora + + def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool: + if lora_request.lora_int_id in self.list_adapters(): + return False + if isinstance(self._cached_dummy_lora, LoRAModel): + dummy_lora = self._cached_dummy_lora.clone(lora_request.lora_int_id) + else: + dummy_lora = self._adapter_manager.create_dummy_lora( + lora_request.lora_int_id, rank, self.embedding_modules + ) + if self._cached_dummy_lora is None: + self._cached_dummy_lora = dummy_lora + return self._adapter_manager.add_adapter(dummy_lora) + + def pin_adapter(self, adapter_id: int) -> bool: + return self._adapter_manager.pin_adapter(adapter_id) + + def set_active_adapters(self, requests: set[Any], mapping: Any | None) -> None: + self._apply_adapters(requests) + if mapping is not None: + self._adapter_manager.set_adapter_mapping(mapping) + + def _apply_adapters(self, adapter_requests: set[Any]) -> None: + existing_adapters = self.list_adapters() + models_map = { + adapter_request.adapter_id: adapter_request + for adapter_request in adapter_requests + if adapter_request + } + if len(models_map) > self._adapter_manager.adapter_slots: + raise RuntimeError( + f"Number of requested models ({len(models_map)}) is greater " + "than the number of GPU model slots " + f"({self._adapter_manager.adapter_slots})." 
+ ) + requested_ids = set(models_map) + for adapter_id in existing_adapters - requested_ids: + self.remove_adapter(adapter_id) + for adapter_id in requested_ids - existing_adapters: + self.add_adapter(models_map[adapter_id]) + + def add_adapter(self, adapter_request: Any) -> bool: + if adapter_request.adapter_id in self.list_adapters(): + return False + loaded_adapter = self._load_adapter(adapter_request) + loaded = self._adapter_manager.add_adapter(loaded_adapter) + self._adapter_manager.activate_adapter(loaded_adapter.id) + return loaded + + def remove_adapter(self, adapter_id: int) -> bool: + return self._adapter_manager.remove_adapter(adapter_id) + + def remove_all_adapters(self): + self._adapter_manager.remove_all_adapters() + + def list_adapters(self) -> set[int]: + return set(self._adapter_manager.list_adapters()) + + +class LRUCacheWorkerLoRAManager(WorkerLoRAManager): + """WorkerLoRAManager that manages LoRA models on the worker side. + + Uses an LRU Cache. Every request, the requested LoRAs will be loaded + (unless they are already loaded) and least recently used LoRAs will + be unloaded if the cache is above capacity.""" + + _manager_cls: type[LRUCacheLoRAModelManager] = LRUCacheLoRAModelManager + + def create_lora_manager( + self, + model: torch.nn.Module, + ) -> Any: + lora_manager = create_lora_manager( + model, + lora_manager_cls=self._manager_cls, + max_num_seqs=self.max_num_seqs, + vocab_size=self.vocab_size, + lora_config=self.lora_config, + device=self.device, + max_num_batched_tokens=self.max_num_batched_tokens, + ) + self._adapter_manager = lora_manager + return lora_manager.model + + def _apply_adapters(self, lora_requests: set[LoRARequest]) -> None: + loras_map = { + lora_request.lora_int_id: lora_request + for lora_request in lora_requests + if lora_request + } + if len(loras_map) > self._adapter_manager.lora_slots: + raise RuntimeError( + f"Number of requested LoRAs ({len(loras_map)}) is greater " + "than the number of GPU LoRA slots " + 
f"({self._adapter_manager.lora_slots})." + ) + for lora in loras_map.values(): + self.add_adapter(lora) + + def add_adapter(self, lora_request: LoRARequest) -> bool: + # Note that this method is not thread-safe. It may be invoked multiple + # times for the same adapter when using multiple API servers. + # This is ok because it's currently only called from + # the single-threaded core engine loop. + + if lora_request.lora_int_id not in self.list_adapters(): + # Load the new adapter first to ensure it is actually valid, before + # evicting any existing adapters. + # This may cause the # of loaded lora adapters to very temporarily + # exceed `--max-cpu-loras`. + lora = self._load_adapter(lora_request) + + # Loading succeeded, now check if we will exceed cache capacity and + # evict if the oldest adapter if so + if len(self._adapter_manager) + 1 > self._adapter_manager.capacity: + assert isinstance(self._adapter_manager, LRUCacheLoRAModelManager) + self._adapter_manager.remove_oldest_adapter() + # Then add the new adapter to the cache + loaded = self._adapter_manager.add_adapter(lora) + else: + # If the lora is already loaded, just touch it to + # update its position in the caches + loaded = ( + self._adapter_manager.get_adapter(lora_request.lora_int_id) is not None + ) + self._adapter_manager.activate_adapter(lora_request.lora_int_id) + return loaded diff --git a/model_executor/__init__.py b/model_executor/__init__.py new file mode 100644 index 0000000..b50f0cb --- /dev/null +++ b/model_executor/__init__.py @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm.model_executor.parameter import BasevLLMParameter, PackedvLLMParameter +from vllm.model_executor.utils import set_random_seed + +__all__ = [ + "set_random_seed", + "BasevLLMParameter", + "PackedvLLMParameter", +] diff --git a/model_executor/__pycache__/__init__.cpython-312.pyc 
b/model_executor/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d67a13e55857544cbf3585079f360583b92d63d2 GIT binary patch literal 378 zcmZvYze)o^5XNWjHYTV+L@X@qy&~C4q97L9CKB6gmNh%XBfGg{Z!aXJwGUu#=d<_* z3szQkVhX{^JuyhYDL&?#Kfi%_Znrlfgl6~I00CZWvx?|9SssPt0Td|IU@%1!r4iOX z(y?JFLl}dKA2Dq#Skic_*OJKZUdWPW{r+Gmtu*Aw`n{nXO-QX36ZAVJIbr2gykTD@FFWZPTDcK&b;cKo=kegb#}vxg}cqCyk_$C<6MYrnmHl5#lpAFLbK9zZnpz7 cd=kq2An!3EezpLw@db`Q7~6{OnznxVH}`F6c>n+a literal 0 HcmV?d00001 diff --git a/model_executor/__pycache__/custom_op.cpython-312.pyc b/model_executor/__pycache__/custom_op.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a8a0b850ba38798de2ed1939713321fd39e0f4ff GIT binary patch literal 7872 zcmcgxU2GfImA*6l(U2O6lt{_)kF2ptTc#~jmMs6oF}#WG#<6S5LG4W)A&W7^8A+5! z9QMx8wg?(gvtC5nZZ|TD!YYEI3Q%Beq*$a+b)K5OrPzm^iCj<}xL6n4eaOQCitPZ| zKJ7VoIHV{uNVhHaLb`K*&b@Qy-gCb5!+&dQ3lk`s|9iQPUPAs8J6;J?8&C7lxJwkG za2lE6#yAe!yvEP?#(b>p)BH1mu>foPwct$4SPMsZq6Spq1J2!LEX0#DLJsd zBEZ)>@Kq`q;}kN+|1Esom~Wm;a0-8081u81 z4_X1mf18X2k@K+@B?z^kv_KtF1gOJG2`a@Jjn_{|8kL z#yZr_>S$!$Xe>v@;%b-Lt!`4tW}uzmu~vJO=y7<$?B=Gk9XJc`=~TPnJ+XB=Zh_Hm z99QC{7Tizix*e!-v2~u@?OPXQP=yf$9aC%b2P(p2=QQ^NPDVTa^W1 zn-JiOMBx8v7)F1{-6bJXCet{xq1^J7eJ~T;Fbv)69EF)%{`_F``LYks@s{5dytUJ< zUaS68V1KjC>zDnW#Cr0`5wqQ!T?mxP_jeFdr@BNE#N@sCB%xb@-jgzrspjAG9_Y`H zecqcC@-}~i-1NOcZg468XeoXwlh5m>l*=1tCU3HX>@R(<|Cp3B4XHn$nNjVjO@y?)Bdu@ z@HZ&csS~<9TTtC*L@&rL3v^c2b<+vD+Ux`rb-Xy~_zaUe0i#$@snY@*syAdg6?A-A z&2an~I%zn;Yaoa4Y=NEdeNGoNU|KeHdEF^GAsI$NxMkTuc|wVo=Kgqk(3~j@7IKCC zChVe``)?MHym0i|5p__@jSm)PLAdjS*EMZsa7I^DO;&HJ*`le_L03Iwy^t=<(hk_L z6!i`*UF8#j&iDybU(S&=z9rPN+P(R%c1N2(vlb;{Y$;LMvfqZk*8Qu=ep}q}IN4uG z96QhM@b81Xd+5TU7T4t8qpHuB ziCo2bLT1T*ZZzemQ6NJ}VL&cBQKgD9aur{kh}V)`q)G9PALWp4f|UkppEO5%;6b-S zRq9xG3GSXLr0atUZsC#8w$Ne=Jr9K~t9^+|--r#T*LQf{{~KZ3Qy?J?pN^lxU}`rs z?t)QkuFz#JN6P$9xqs#^5xNCPQyc|*O78pUc6iVjR!kP~>p|)myG~2>3)jg_b!0z~ 
z8Mx6LS&by-%}OL;M-t1It;oo|*X_uNC5%w~pay0UUm3FqSnF@;Z7#)?{`p2#1_$V@fwhV;0eMi&R|GDCNpQC%4GuG-c zHkB)YWvaRh>2;$o!osJns6LKJD5{bjk(QHIWa!>*J2GSmL(iYnU%)78Vv`fBF?zF5 zY{Dpv{tYs^4Jv@*Ac-=q+G$62E}ym{2k*UNM-E!T!M`4>bnn+Nnr+G`jGo4Y16Vz$ z;(zYB(e&3aV>V?5Mqh=6M)lN$|A$liI!3`kXmY(UI*MP2O8We%p}yn)(*hU^07e8# zRy?zH3mrA2*idToe3_?1Wp3KPfdIPG=*n&=uv^XPtWQm|V5{E8)+Fb-fgH&nfH#PZ z^Vs|Ay`jt-BJhUxVMDn@O!Qz_neOyj^{00d6xTNi-x}6;*VlQihV^KxH^sh*vo`>+>;9>513_Bx=x zbj0fb0@6Y3v;&!5G-Z84HmR!8!!XF;KjnHWPUrK1Hf6>KKLKp3>ucd9ls4Ba!bY#{ z5=5tNt zV;AgW7pxaXeF(M*TD zR{g2)Ae2o&$6F7vY7K6t(R18TJfXitAWA0F=%{Zf?~xw^G+rmSe2V{TDEws~?JoOi z{2b(gzM9s1=iA($#@kKK;34Yp-GpoaJpCf5aw<4V(Jd+cqbr`^u3}*7Mao{hi|*=; zsT_EsGioMpuvl5o_nQ)=Fw_i$!g=Nr!(uNy1$P(X>71sae_PeyK~F_lpsYx`yJ}L? z$1@s6<>;vzX@}eAaQjPJs=>XxSjyxTsk&ZDgZ4KE(`$BSM&)huvg7 zpH(SdI7Q>*gcWrnQ`A7E^X$7*0g4b#PhxcjD^y=c@HjOPJV22>RmRzMF-Y~pW?D`CWXsUMz-c9tbStJYES9Kql)9|OR;%lq;GFRqHz=`6!22QuZpCfN`Wlr&5 zJ57iPxj6$esb${ngDeM1|51x`PO4Lx>p9HK>3Ngt8dwr(d{(-8s-77=#S)|Gx(r=C zCS8PhY*wlYiIkhD?bn~tw0=nYOiVNqK$c0*tNr!x@7q&qz80IE%`3$$L~arUf*9lt zRLQZAc|q?**E3pC9jLubhdFv92gxM%+WO{T>)>sU519E3&z9n$f-%B7W8&)svU)LZ z!VpTd6J|Y`Eeg49SlSTP(Q`mpiq)-CO+mg36UI1HAl1>jUGk*5(sk7CI{Kh{rR$~n z@M?T_B|c=whbr+ScKpbLz)JkYd}uYYdFiYj*>$hu)5u;+*lR`hK92P+b9YWyksY5# zhL_*?u;cx|vxH&S611*MZ&8F>rlQ&Pf`e@)py)WgL*9WD%RHIqAaTpx7H$E+f*A0l z4YmZyF{|)Yf$E3+U9!M0aMRGOH#ZDIsS}u#3T>wRNP>x9{o}#IDamVKb-OQVdKZwEZwamE*8YfguLh7`XtX+D4XU z=U@4!Z>}cy&Yybs;=;wH(~wVFjcuO4!0zEbN)$sy=-*@>1U%qvLZzdc7VIUaIpwAPV zGS?_J0A56{$u5XZ)753p6C3(EFajsfj8#qgA?x>T7Sb5vgk>4B-bD>__@XR-rI8GT5_D|%LzS>~H% zUg=I?z>Q8=HfbMrG2~zo35!JBU}GDMIj!uh+>EZ{AJ=uw^-@GQGc%WUCm4=1fN}gv zE^AWsQmKsVBeKkXCt>MJNa?%SI)?KZoNucp`?IR1!4C$Un=MAId&v!8&~;+|6Nbaa zXHb3l59D*+7Z9F>V}ZlZ_VR%vPXsP-=t(db*!d*NK^5c!!%qZXVB|@dwfxW$S+^&! zYi)!CJJ$G)z$tFY|AgT4sjoJ)wwnYy*LX2-5GUb*bM6349T=Yb=3;y?yEL*KwzuxH zy7$=~11tWa)q{uTE-qePN-UYn=j^^gYs;YBbzsGR2%Ic@SxaE^(y1o|9@SStYb^;= zpYvUTv)lvU1LdRe6M}6<@Qe+!Yi8e`5*@)!!shc15C5(BO9y0R@#j_7A9Y9E-()C? 
zrqj-7ykA%iiqPb0ICE2j7nsDb{plMxhj}bFu=b#HUgYpi}W>R=?$D-hxB^6&S<-lQv*nC zC~3QiQyY=mR8qE?Q=5_6A_t5XS~#^8scmK7YvuHIq<56G+$OD+*GNIRU0R3#o${I? zQR;f#KG-33zb*`}mDVE+N*fTa8?%Qt{+v1$vKGXLBk_3TR9r5Ik3~{vC>F95#7C4! zG#yJO5U+b)j+}cg5+9Wh#W*dZj13^&{d7c$49jU*K{Fkv<#aeLCsIizJe-u|cz6W4 z@i-z%Byq0Ly>kAAq!O3HsaS?S?2RR2>2N%G`ZSsq3$Xq7C3Hpwd6&+HOQ}7CBGK=wX5W>LH-)4j+PRQ zF2<@JBUg{qfZQN8@K^_4cN=|eoD?ov2Ahrxp+?1vamoA_PY@4bWN<`B{oqhi3DWEb z~(HpqH zxEhQnBa*E22cM6{q?4Rwv>m)~Moy?nO7eV6lJ)Xho55Hr zh(!{YrG9RqV3#84$UxLuGWQO|sn3f&2>zkvnvga!FWbh2?;?L(7`I54A6j*CbzHb^ z8+asSRq9c+P^HMPj>Z%@6+W#*r0dp#BQ-iAD+O0LoRBYs!y$XY7LBLW!p?ISBFgDh z!5t1qg%Uj~HQ(H#H(r1#1 zE$8F$;VmlJkuS>8Q4IJNV?6pt#*`K`muX%`O!_rwx{#eB-2dez;f}xl@`cN!K*!2w+ljJsNIOr- z_Y^rjsw7f-gOQb%4|PBxq{X&T6CG6)Igt*J#3N}e$>D*JyHHJN(XXeZlunc_G>oR? zaB3_O4VOVwXdaH7Q!`|#BCERmU7^t|xY8rMvng){j*m|dN=Un6cllC`!cN!mHebLc8MwN6JI*TwAs?wwQ};W2LW&I+9Es6kIS!oWncj&) z1xGZQkhZBm+bf7hZ8(++Zbls|XaK`?=`?~vm|8@Ud&z_sday<`DI`TE`^EI8hn$GD+u4dz{nnz}i&i5V2 z^&Ob&JNUt|T;C%o;;;YCYg2)&*rDQ8>68$DGmif;B41<@&4-@`KUIfGHQ7VQE&3jr zvK|puqB5qtt5MS;j0ueEmzG!Y|X;p^IBq$w(0jD?yom=#$`^PCV` zXF5%0q@v6qwQ>jnCd6O&=Gf)2H($H_+Vtl6&Yii=owxiuZ`RN6c`CQ(sl2aw$!6W_ z0fBK-P~!oWaaX)weCWS4iMiJ?-G7nb#) zcZD99l#No;k5E8cY8mAsgeSIa@e7VRgL~apBmqhrT1ioJ#N9Br(9Ez8%iw;EdC-P` z(+a?w6&^yH|1Y3djx2-N&k(vsffn23 zWj9`bP9F%_3YJ$3mU9Kmc|<+2R4kE7M-oxFV1q)tGcKRX(>h zVZEauC1Cv0rWB;24D~+s7y<|@-oTYJ@7ivAyYqFelTR&(LQCs(?M!X1ZquZA$KO2F z^X|j9{XJQ+C+qLY`x~cr=loq+u`BEEqBk+;UzZivsrXz7I4W zrf)ts=RcGc4{^^!*6YGRs9EVpoq{J!`m5167|DV!9Da2)5?9$u5?!Ta6v+zJe%knUkY-f^^LPJof(q!bQ9FH@vOUy5GXAh|ZMP!Ke6--H*u||IRy6}nB?hqGz z7RMpW)Xul}y|r(qJJ;I(nLr5(&SuAhQzzbj>8+O*1VnH4EYi=yF2Py7V0AmXrY?Re z;AhdMCoD7w&boO=bI#GcV6`~9r+0sfmb$4x3nyTsq5>^BN6RNDz*So*r)$A!bF7^i zeSiG=_@aR5g2(RII3vFwzaC!{5G5El-wga&+fUjS(dOm_CxTh|$MGK_-*_MW@%WEW zUw@xD@%~HKU((*)j?S67nbh^>1pwVSn_8qFg1C0cYI7X65R_V!XP?I^ia|3TnoD{X zrx2K_>@>xk4T?F3?1X}@s33Dm4p}@)?0v-p1sN&nJgelpknde3A8NfpA1kG*K$6^r z`mm~sEK@#4zJnQO;rnSNc0K|rictt6)DYxK=$h26gC%cz)t4o-0O>KV*R$e&)5o>K 
zMXDX*3Zw!vv@ImDQM=R#G*7IaDpQTA3Imm&;ObPB6veUBaDVXGGtf@O64CgmB&UMM zo_H<@em8jZ$)`2N8kAeH=$T+dk-2UnnGQzciX4%~(32ECn1r+#OP%WvKBFn1&_FOb znjRTVBV!mkVraFwF7#%nWOZDi3JX5WDlvu*>Nrxim5h|E1d~HtOOD|4$ssPFc7pp( z9ndv+nhKDM7d+~>!}xaDUPjcGIR=umn8KC`gGHg6^=%^bm6n(=ZhhDCyB6Zvu3HC` zS5On1k2hTW1k+rmL%Li+n?(s(rERZBREc0^+Rf0&P(xfskra_@P}6+tdQvkq-4UxU zyK-VvPVAi8HK)nDWx^%x%(RRaN%6Ll5TNfA7A1*N%fw8AJhRpmxMGLDOCPC509x;_ zoA z$}_QozQhzVQ=yfmrqyI3LcNN|L@2;cO)`;kfC8G2f-@0;%H@Kxky0K)pmRAi)&-Bs zsgNeDjL%VMtIKRPolB-R>Po6)&aa|Y>MR1FUZDBfi&tN~7QPyue)hfaJK1I>XUM8ErOK8SFh(#l8_}#Q z4X9D2w^aNk7VU>Nz1^37#&lDemf@g%SkVMwig%ONMXGc?Mou5xgFhr!n*cyS$PsI*Kw=s*{VT62Ga>LT)Jfe!fECr)el2q~Gu=Pe zxNXvx542tzygE4Tn+t57^el-&RpXWP?T-6y?|dxV{`hU*5mFxcullEVym#Q81Gj3n z-`%)re&dt5jZfy=L$hZWY(mdb%c5ZIsm}SjCyz||Ac{1!(2||>+zqUq4{XQ1@2j^(CF|4s=eu_0x^~_2@5+k1vi@Cpam_q!LQPZcABh{8Eq4}(G|zI>ug;`6}+O3+1^)5JzQAR_g>RsLMdEkG~sh8h3g(Dg=?Tl6cm=}-B93$ z&w%$#DI{|$#}I^k%0-HolM5yh${1y2C?L9}QURt1MN%NHNDBCq$^=#XEd(hVN-%Q# zsDdBKkD2b3=|Pcgf1etF_s!=}fqY%_wY^vOPMw{r3r&jo`j%@)t{$2Ct-1P5lkTsU zLgc$P&7^*C;L46``+jfVP5Ynue&YMko!fm3U%gk_@xqKJ*S>xBh`#HKLd_ST8=-2J z{*^FBMmb4AgaQ`CiAh!@3YeEZMX@La#MG3C?q}$g^jM|R_&G|X;Omq@GCfPKxR9X+b#`wIb4mF1;R z5pc$%vl@%^?XOyxWnt6{VbrCN$ea0VmU*5woFhy{Pqy=|CVHBvUPJ6>v4W7vA6-e% zL}Nt(txVWey&p=?7SB5mm~IhV?Y{DFeo$Tr_){_g}iC4qq>6(Y2M+Siv0aj&7w z>Ox%eGZjuqh5(mC-4uyoB*3-68u*6)E0^ECLGV?Y2?^*-%QCo(Zhl075i`Z28yP`# zdy=wDa5MCg-{-BosL8#7`t(2y6U@XT)wZYjum#xFlBHpy);wKeC&h2sNWf!O**0MZ z18r*R&Hy&`E2*X<@Z1`=!T4=AFN++-75$N>j36r+(FFogcEQh zr}8l$gK_7$!(h-YX+ttDg2NCJn6|{u!)O4Xmms-;jj;b?PsSQr$2NxkkcZt&6khoU0HgxhPTP=( zXDHa>a-!gZgB4pVnW17h~%g5`{7sI(=)!c4&pBL(c0Fhmr5sy|gY6^}(_6L#V; z)XRjHEngPbmF0%Cze4fUUgjz8tl2uZX8WY;?uOp?67M8#T5fIFGdTh8h+F>7n-lW~ zp2^|g@#nKAp3feCK6l`S%l1jj!VmJK#{`^}o62@}4&j zTs|;0Jm0Y`*Rk!EcN=LL9>4rJYZ{Jz-X?U%&qK4{C31`PvmbS=XmFP3@i9 zG@Z(}^xmr3^r8JfyMOA=K64`b{Hg2vq#IBmJlfHQ7-a5)Kydwn!Tf*+py^36J1#s|5+# zFVsCEE}6f-V)rx^GWlzuN19HY`mJW9Sl!IEPm|X%^w%&rCDO4>gq*L<2O?-b2+HpO 
z6xIos^qIg@DEJjc-k@Nup7e$KNn1eXx4(jZMpe)2%@<;Dqb=|3VR($fX9{k7#DAFz zGU|Mrc9kGhD1hfpX;&$!tgEEZqi2!W`AUwMy@0K+i-UHK*${J(fd?Mg4umdFbs|-S z|FGLk*px)sEBn}g+m2%h@QG$Fxf)!uN1?qp^0xqR#gWud1wOor>XZo~^AwvOf?8HA z#g)mtQ5-F@R}@Rae?A733l2x%N(bsr4#ui^L|t#(fNG`)_Q?IG`l6)YN|;E6*%Xe2h!4{m7;YJR?y9wlW5sQ^VbhbaE>nH$Ep5s zWP~j%+*JkM)5%~Y$ojdUG^#*o;JPJ-86@?qUZVrav@lgq1fqQYDA94hG;nBV||)=h-<;li=j>mUR{;cRIwt#+xuep=sR z$YIp5>gb;-3Um-s?gt>v=A|TdK9bn|RCi8N(;#x$$60g%GQhcQEjj!4ib9UG8n%T4xT9_={UQBT6+o%Os?;68S z0VnVIx}IEJ&rH)?-S$ZlI!>Q|nWRt`oUhxEtJ^UBYOXFcbL>`Kf7aKZt?P#fF|#h$ z)(7Dsuy$I^1wxQT{LS}ny9oJBxmZ>|GJ{X_MD-G)b)Xd4|Dw|njfA-%K%57&A71MF zgLq>>!0Ql~THu$nO!Jc zqxf$;9xb*=OSqD`atm%js*-H7U3TyhEW2!nzl&Y6@|#nS4T;QVxn8dtbjmIqXDj;G zhfX@#Vxd!m&x~mBi z>Byn(Tf%k8Ed@uKMv_We7gA_tXpSr>8XxNq9+%~ydT6lBM~53wKM6idlGBk`JVmY3 z8q&U{d6C8BlqTd9s??nD5Kd;|(8Cyppina$QNYrL(=j-#k=eaadn%Sbg+o-uq^LoS zP=n%3<6c5!x!Vb!Gy;OC4=JU>P`GJtpnPG2@)cEAmz`h)pnzcR#Zkt+Tz3|#+5QR7 zRTXS_t7zSTnzc|HUIzb~@UjjR^`X~YDoyxDfJil`yiKub3W~gi3$IKaboPQdqYiZ< z9xgw0y{M zrr--g^+#q<(Y$9im_t>GQ!iHlwm#=ypB2}C>EZfgGr*<^u#M(`EyL_c1!Nm?{ta2( ztKV2^XpvBwvFQ~=$~~WH$(m`tkxI-{#e-B6hUABq8ybbq8w;ReL9|HDj4w zw-?rhKiAcvqK+oFqyY=JS zefjov^X=PV70h>Z&Ufs{b?jL3I&q3ZaJoG_`RMAv^;Il?G$t&0DH6qKIV$B{Q_Z=i znzvdiO~FHeG4*{!3_sj@p{Fn3)Axzj=jgiI*fRCP)vqr&5QjoA(6r#9m?*S%Oxxe; zU2xNzM`&9+UH{hJio0+jtOVhn{+0|A{+h8d5Hjra38yYtnX&e8i)8$adgTajg zQseo}-_A*o7y0KH;uUvvLnpDqcCPNPhc8BkN8->RJo@yr$AU+BTQ$@fqz-}Pw<8EP z0l&NZ^j(^*jtK7CuD4P=^=)?Z04X$Z=9^lh&7NUaiD*zIqO5??TgMiu`m6S_7i@Aw z)ohR+LZxwBpZ5MB?ftC8Q8R+)WCiydK$Uus-ICOE=&GX=7u6N2YLe>v#Pv(nM7qzd z${CewUHH>5T|}bMrbb;(ys!)s^K0Z+lpg?_l=l#% z%$bNbl}wd!a|R2M0TwDV)WVnAh?PH~Cbkd|Xiz`G%DjNa!<2ts#UCb@Y@K30LFEsp>of^LSW2(c`*K|85q+9#(Fa_ zmhJH0BRKw<0mrXtPgnj0LGWV=euBVc|J`BMo6p;Qr>JxPQtzDduPFAfDTvWWkSSi-((Vr#l4WcVA_6D_dm3a1B%O))@%(I2dEyhvKr! 
z{6I`Ii~zH%e^P0QkI@Czr`p*S}YT;$KZG9Vh)%-##k)} z^@VDSNny)Yk}3SC8V}Wa@(7tgR2B$E4!S@lGBOe$qpduR3!GBe+lSd4Ln?~S(>Pbu z$$Na~=tw6veI?CS7br2*Q!*!y@^RIo&`NrdC;5h)lPZg2v}!O}>=^<71i&kkDkZJ5 zM>^Etn34*vU16J0JPt!J@f(_bX;>akDr!dwooGxm@wN5e1INNV9Z8)IAUKvfO1Uon#p()W( z&_-aGKgc^P({4J+U@rK+oDVS9sWz1RWyuXSIsfjN`mBHVYy)Chaksi~U!yO@5tJ#n zNfzWzkaC+9KQ_~23Whfejwu+bpUZUC6AEMa!`N}-kCtu`(mG8r6mn1z8B!0Z7=<;{JU_f4(4O~SQ$1GRkTm0ZZ+qw4_5X=BO+3gXbguk*}j81FoEMnGLjo%58^BL_*STrtDCrONVu53z3LAg zm@Ib6WC%J*-dJT&iHm-DTg?z!dLbhl{#%EA71Xvryb z?#iuu>gJB@x~D$a@fQ#Ln+NbVY3JKV967NAS}J$dyx5WxTjs?MBuu4061&x1h*ari ze$uop(5|;gf@GNr(^P$VLql!Nk+WJ_LCtAg)?nI-q*_U>#Zz{sx540|hvKG&`(W+| zh&Vv|E&QmnNNR2N_t#DS)V+e+n4NNo>M$}h+sx;Sof*uH)c=EK?`x<_Tl^Y0Lms$# zV9rktivFINW3$~q-twa@xy=W&;=!!{V7|WT+EZ7bnm(AT-!SR^SZtdYcjv_2H=92= z`jPky&%ys^!~Yn~!90TU!KUO)|0;uR;;g#(B}aN^PTV;g`vAwzo?1TAv;#AzwGUsp zr(6;n&C4&dTZfAOrr|HpAW1qp7XeCrUeCP{CKr^ll zN5ZgIPq@ALfT)qcAsOl=ckrH{`K0!50{i3N-!DlR9xI-&>D$U<1S}8=NF)M{YWu~a5 z%$8Mf&CH+YSA=ZeM$O`x%ig)h%{X(}*gD_Xmuu{sd1xxjt1X}Sm_7kC)1292%P#;=Y~pUpLHo%At1 z*p(A^<$ZxG6SMn1JOa+jeLtLiHPc5KZJ>@a#u5X;jxYGFd~mRl$alq4Jou~=Y(7-1 z&0=ZcaXwAK?lUG9sNXDXm249t`U{Dt@{x)MIo->yjCWv1A zSc7s`ru8^+~$Yp8V`M?Se~zIs63a* zh~JkJaU(zg7pRdr5Jtg3d%j`qLY3g&{F&g=S3oI_lRUx938{=%a{W3ay!ndf!7!Ch)Z3-Z4#o1lx|);?x(b*oA5~U(<8eFaq|$&~xJxm1oI<={O(YZ-B1{w^ z;u0bhY+!QBZgy*5)q^#2pZS!pS_v1K3#mu&H6Q@4%slYD1CtNm>FRlJ_dC1iyLRTf zcG5i#bce%n_(#nL`g4K)n=(jK_b!|r@2;La#AhoWzU2#kViW4S)XlYYqG*zMqM8sX zJUvb%t7&%nWoV>2_rGRvD<6=w6t|aj_T9 z`1X+`5V0%NfN4%!m^PjSZP3SMV$}zW~@{YNU7d*4U1%^p^sMK?F(79(iKLJ(ac;C(+AQ#svZu62g*kjko3Z?+a+h4|0|_Q6cD!Ym09XlWi&kI4u^k1ITX<3 zrhbLUFTaKJHcp55NxiW7Aq#F3&D!g;4WT*vK{z%vWp6gHH&^q(oc)0%yT!5R6K9R1db(>tz|U;zGx}NRZg8xhzHdRm&n)gQ!_UH2 zyW_Y;KR`|K1)tsVv_-#6jN%JEufwCCsYZ06rPkrmZxciG(+zgC?PH$_rvo?2Ahv)m zcou3r4$obGaKV9J0Nd0~urIjiO%&F3Pe`yT1rAu%eMi#K|v^?=ZIcrcwB zq+B@0>d@10tE&A{E!v}0XGy9A#xzcWyZom)UX$&{T0e!1dC-Paolzu zOjhycb&R>F=PLp9#YANXfyO3@B%;xOV`Pm#Fz`(apFp6+UGbO+S>vwXC3U(n>t(Qb 
z3|Mf321*e_&rew?Z%HYT=&-@wYW-^{EA2B%d-Q&w4l#)QjN@9&C~OH)>xA=y5UPrJ z&=C*uj_|`CrrWz&`KT%xH{$GD1iK%f5n@H6dZ-3QDLxLax{Fj?(w6Gu=CJJ!jmBA- z2ZdggEmG_zt0^$i0-q8(mq=b z5$!b^8&SuKSPES8qmEIy=VG5aEY!&UvQO+U`@sIK>IHlN7mC}719%?w?qaSpMAgs(VvdOCS*Z~fB$jOk)EhIN2Et@c zns$}_q;+FTD!Ba+ZTBk9kpDGCO@YwO-c40;I{#)NLJSM?A!e6-+z z;qqdaD-~;{VlCSz{otkQo2LSA zx4hNzcE?*CbIqaKb-lRi#pnF1y57miOyp+uhyLvSFWl;X;X9UWT`xomhZ}ch`<&RE z9qw;G@r@_0xThYxZST$7MSk;4J$2jOch^;S<$TW7{;{v_%HgSfw|zbNn!tQbFjo_t zJ~Y!X>&u1q=ei%nP3a2`oBL7Rl`8liwJf?gjt(EdZoj7cO5j?{)t33%u3T-`^t0L8 z-b;sHKXS)aJvn+Me%sZRZ|$CR(6x!2GJM;H zOGol`Yp3_;>b72bGVf~4y1MT8w`dO0?VFcec5-}%U;$R!)0%Z}D7s5`Y$?gbAHHa~ zGM@EzW$mi;9qI(JNy2@DFK7S<2^COl|MT=#?vM8>xs5ZX;0~OC7 A?EnA( literal 0 HcmV?d00001 diff --git a/model_executor/__pycache__/utils.cpython-312.pyc b/model_executor/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f2641bc321372b6997ed458760139c9090c86117 GIT binary patch literal 3798 zcmb7HZ%iA>6`%30|6l_)#+VSoWkY-=P8yRYgb*&uzav$WONe^>X@q*WTD$`o)?R0J z4VapWUQa6YB8BKK#nR~{qE@;)MU_tK7w#LUerTkAu>%Rd6(>bXE4ANz;woMHsc+U^ z^GAtJ9mzAZGxPq;d++z=`ENBfZUjx?KZrf3LFfhhuv%4lvek+aT0$}!LNb%A{4fs?8AK(AU$4lx!BM)&B&j`=B znH+h!26*}9TH-0os;@}#!cHJ<3;Sl82K1Ps2_q^M;;Kv(flQNVN>^#*Urg8DFgCbL ziS*~#XpCthtVhR0awjDzqDsXCnIhESH9};=L5ZHCi6|Jp(m0s0-_}X!z#YsLD2`__ zMj1SAubit)GU!p69-}l0bG_6Cdm!8wO;Jh`x|mcX9h8lSIiq$`QQ{G6L8B}z3@=kg zB$I@u^|4rDG;E^|ST{E{#!L|Yd$C=QC)<;;VgXKF-2oKfpCv$ zI>gBiR%Nl5=&06hk})Mpigg1^vR6sA&Du_HF3iP8pOX!n*A%+0uj?bOHk)G?9V1W6nANl8hO62KUwO0&vqNk zU)gkd7adtHJMh$X_}fFM@fh_o-Rvf|ccm+4MiYFWL3I9>TS9sTb{UqukQK>h<`{jz`m?E zlRRtFE5M%TMj!>h@5zyoVRKaTX?f+V;&Ll|0}PvuD`RYXp;0bOJ&aT$)mdCdQ>?5`eSIxy-Sr;J0-V zx(E0zgjs$??=+NiCkE4k*hw^vsS5+^3}(-0#x`*Vq2J>r3?dsLd!avZQ#g!!KSyES zutiQ7c*4MwhC3;l?-PjDSTq0%=1L%_IyXuZWI9RD&m4!xMtr%hk#mX~l@#rKq;mW+ zu+fe~_x(T7_j4#$JDBeo__Al^(#oAhyg0Pzed6m}zW&*rhd00YSw7Uc=Iez0|CrK$ z)BGBqsd-aMwCO@BDNB&(X5MJigqO<}G--q=X)cvm9XXQ57CRmrO{qG>eYK6dCSsa8e>o1E90v9%O0Z3Z*KX*V9R2xI{4_#fc~y 
zj*?Dcp0dKou$$yYl{qly(Ga_O*^C!UX59a3+fi0toMt3XpaWBF*6qtS@HXXKO+^RtH!i()|Fvgzhvw~@f#&R$ z+~GG?`qu)N=c@`Q&gM>B`KV{<=KY(?;q}0eAD^7B`n#ugt#)wr+Fx&db!)w60OsqS z!A<{xg8$$j`Q?E>cs7KNywH(5c)EyiXUAqo*Gj{m-`eQ7n(w%pJ9X`GU6I3eH}Dq6 z)z=mgS5sSbqngHz>Xv+UOE&hj`be<>`CE%YR2|Im4Q9gH70i_~;VxYG)p=?fN-gTN zK^b8o+RQ)Y=NL#U&Dw|j1NAH9jP1;4+m(N~3^#}G)&oz@qKVot>O(vlg+6TSHT=v| z5b%WR780QfYXe#ID0F47n?9x|*y>N9v%D>oyd7jb;wst0hcYkUV8?GmH;2A+pkPaO zAQxh;Zn%(hUYMJK?rU$;2br9!85|pIUV8WbyU!Y4hlgO_!DV}{jX8GT<@sv`@4mb@ z^mU`~;V+-~T8mB;{87<`eE#{W(gWeOO3q=HToUpw2QU+wPelYKERV?8td6|fhLbC4 znDVUBis3#KVuo4DtijW2|uaDP;%MPD@-$4--+2Y0|Jl#64uR8%_=}l2+*EP{A3@9)02rt%onJHm`?o7JP@6Pv^Yh zoGbj?TUWkqxWRCWqO3+mkwTRRr7A!cQYdAYH0);GZv@SPyHo_1{t1|K%3ghgizRfk z3^pA7X7y(9tN=1N4N4F8>>IX3!f@O)Q?$V=P!rOzirD7i^U!sz`Da@uPt}KN5y9t5XhmD? zd0hY1wUuAxyL-3TS~23m{({GMuWO;}-i3t=MGn57+pFeZU*s11vWK%;&exiEAI{rD hTXxtg9^kS6i&I4e!%D|0x7xp2_2q@{*r$2P{{ipJtQ-IU literal 0 HcmV?d00001 diff --git a/model_executor/custom_op.py b/model_executor/custom_op.py new file mode 100644 index 0000000..9ef696d --- /dev/null +++ b/model_executor/custom_op.py @@ -0,0 +1,194 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import torch.nn as nn + +from vllm.config import get_cached_compilation_config +from vllm.logger import init_logger +from vllm.platforms import current_platform + +logger = init_logger(__name__) + + +class CustomOp(nn.Module): + """ + Base class for custom ops. + Dispatches the forward method to the appropriate backend. + """ + + def __new__(cls, *args, **kwargs): + try: + op_name = cls.__name__ + except AttributeError: + raise TypeError( + f"Cannot instantiate '{cls.__name__}': its 'name' attribute " + f"was not set, possibly because it was not decorated with " + f"@CustomOp.register, or it's the CustomOp base class itself." 
+ ) from None + + if op_name not in cls.op_registry_oot: + op_cls_to_instantiate = cls + else: + op_cls_to_instantiate = cls.op_registry_oot[op_name] + logger.debug( + "Instantiating custom op: %s using %s", + op_name, + str(op_cls_to_instantiate), + ) + return super().__new__(op_cls_to_instantiate) + + def __init__(self): + super().__init__() + self._forward_method = self.dispatch_forward() + + def forward(self, *args, **kwargs): + return self._forward_method(*args, **kwargs) + + def forward_native(self, *args, **kwargs): + """PyTorch-native implementation of the forward method. + This method is optional. If implemented, it can be used with compilers + such as torch.compile or PyTorch XLA. Also, it can be used for testing + purposes. + """ + raise NotImplementedError + + def forward_cuda(self, *args, **kwargs): + raise NotImplementedError + + def forward_hip(self, *args, **kwargs): + # By default, we assume that HIP ops are compatible with CUDA ops. + return self.forward_cuda(*args, **kwargs) + + def forward_xpu(self, *args, **kwargs): + # By default, we assume that XPU ops are compatible with the + # PyTorch-native implementation. + return self.forward_native(*args, **kwargs) + + def forward_cpu(self, *args, **kwargs): + # By default, we assume that CPU ops are compatible with CUDA ops. + return self.forward_cuda(*args, **kwargs) + + def forward_tpu(self, *args, **kwargs): + # By default, we assume that TPU ops are compatible with the + # PyTorch-native implementation. + # NOTE(woosuk): This is a placeholder for future extensions. + return self.forward_native(*args, **kwargs) + + def forward_oot(self, *args, **kwargs): + # By default, we assume that OOT ops are compatible with the + # PyTorch-native implementation. + return self.forward_native(*args, **kwargs) + + def dispatch_forward(self): + # NOTE(woosuk): Here we assume that vLLM was built for only one + # specific backend. Currently, we do not support dynamic dispatching. 
+ compilation_config = get_cached_compilation_config() + enabled = self.enabled() + if enabled: + compilation_config.enabled_custom_ops.update([self.__class__.name]) + else: + compilation_config.disabled_custom_ops.update([self.__class__.name]) + + if not enabled: + return self.forward_native + + if current_platform.is_rocm(): + return self.forward_hip + elif current_platform.is_cpu(): + return self.forward_cpu + elif current_platform.is_tpu(): + return self.forward_tpu + elif current_platform.is_xpu(): + return self.forward_xpu + elif current_platform.is_out_of_tree(): + return self.forward_oot + else: + return self.forward_cuda + + @classmethod + def enabled(cls) -> bool: + # if no name, then it was not registered + compilation_config = get_cached_compilation_config() + custom_ops = compilation_config.custom_ops + if not hasattr(cls, "name"): + logger.warning_once( + "Custom op %s was not registered, which means it won't appear " + "in the op registry. It will be enabled/disabled based on the " + "global settings.", + cls.__name__, + ) + return CustomOp.default_on() + + enabled = f"+{cls.name}" in custom_ops + disabled = f"-{cls.name}" in custom_ops + assert not (enabled and disabled), f"Cannot enable and disable {cls.name}" + + return (CustomOp.default_on() or enabled) and not disabled + + @staticmethod + def default_on() -> bool: + """ + Behavior controlled by `CompilationConfig.custom_ops`: On by default if + 'all', off by default if 'none'. + When PyTorch Inductor is used, 'none' is the default value, + otherwise 'all'. + """ + compilation_config = get_cached_compilation_config() + count_none = compilation_config.custom_ops.count("none") + count_all = compilation_config.custom_ops.count("all") + assert count_none + count_all == 1 + + return not count_none > 0 or count_all > 0 + + # Dictionary of all custom ops (classes, indexed by registered name). + # To check if an op with a name is enabled, call .enabled() on the class. 
+ # Examples: + # - MyOp.enabled() + # - op_registry["my_op"].enabled() + op_registry: dict[str, type["CustomOp"]] = {} + op_registry_oot: dict[str, type["CustomOp"]] = {} + + # Decorator to register custom ops. + @classmethod + def register(cls, name: str): + def decorator(op_cls): + assert name not in cls.op_registry, f"Duplicate op name: {name}" + op_cls.name = name + cls.op_registry[name] = op_cls + return op_cls + + return decorator + + # Decorator to register out-of-tree(oot) custom ops. + # For OOT custom ops: + # if in-tree layer class is registered with an oot_custom_op layer, + # the oot_custom_op layer will be used instead. + # Example: + # - @UnquantizedFusedMoEMethod.register_oot + # class HPUUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod) + # or + # - @CustomOP.register_oot(name="UnquantizedFusedMoEMethod") + @classmethod + def register_oot(cls, _decorated_op_cls=None, name: str | None = None): + def decorator(op_cls): + reg_name = name if name is not None else cls.__name__ + assert reg_name not in cls.op_registry_oot, f"Duplicate op name: {reg_name}" + op_cls.name = reg_name + cls.op_registry_oot[reg_name] = op_cls + return op_cls + + if _decorated_op_cls is None: + # Called with parentheses: @CustomOP.register_oot() + # or @CustomOP.register_oot(name="...") + # So, _decorated_op_cls is None. + # We return the actual decorator function. + return decorator + elif isinstance(_decorated_op_cls, type): # Check if it's a class + # Called without parentheses: @CustomOP.register_oot + # The first argument is the class itself. + # We call the 'decorator' function immediately with the class. 
+ return decorator(_decorated_op_cls) + else: + # Handle other unexpected cases if necessary + raise TypeError("Decorator can only be applied to classes.") diff --git a/model_executor/layers/__init__.py b/model_executor/layers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/model_executor/layers/__pycache__/__init__.cpython-312.pyc b/model_executor/layers/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb42630e27cc79303aed3f5d10b3d08b8922debe GIT binary patch literal 171 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJVdFhwr7U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?E|R?rWO_J r$H!;pWtPOp>lIYq;;;d#Da}c>D`Ev2%?QNBAjU^#Mn=XWW*`dy4^1l} literal 0 HcmV?d00001 diff --git a/model_executor/layers/__pycache__/activation.cpython-312.pyc b/model_executor/layers/__pycache__/activation.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f344773d2aa8b65f39689efe01e94e482ba965f GIT binary patch literal 33991 zcmeHw3v?UTdFJ2^0w6$wZ;GNoQj`c$B1OHUH}#@vOR}to?a)qOh!|3!0FoYnB858WR%D|bQ9V8txz4VYcDqq~dK!Drp2qIBUBIB1^iFY|ob8+XsX70WJ|K0z;_n-evde%++|@ghLAN}&{Yt&b=ku9 zE<1}eh8*F-u0j?zg`8nmmy3nXp`vhcS1}7)Lhf)$R|yMSL#5%et}-2`=lDXt{AHcg zuC5CFcJ*zRo13`?3gk3;u7eBMzRU&eyz6Cyl;Y)d!mg^k`5jsL3z6Tck-s``epgoh zBIGaD$iFIYes@;>667z{$X}B;e_2-ka^$bj$iF&o{>rTURmfkhk-s)?{#9A|Ymk4n zM*cN<^VeqOUxWM}jr^Xx`RlUs*CT&}M*h0I`PXLUZ$$oe8u{z<=3k$czX|!h8u=U8 z=-hz2&AhSivCR0DL-L62YD7(CfsIJH34bm4YsKGY{I%h43;wp^ZyWx$<8KH4cHXed zZRlDT*u}38d`7#D9%EqlmyKObtcE?vxfg%?@V6g-2W}XZQoIi+`K0Vw?uR$~if|r5;UaalqqrNEL9~lsQ;eh}j z@(ub0e<*~2;P1aEuFZdcct8jRe34*`_v%D@us;~}g$8zDIQX`%mM7St|Du(%B z?}ez(AB_qTit8Q{1inA&8w~lQJp)3R@*MZaE*}YYM@3hUKPvE{A)miL;0q6hy5%mR zE2trJOTUTBzXoOGIZ#B7s9~2OU=8SbV~>7}>(K=aUoPx21&qA8Po|~mtaJ&OwByku zX1q)5BjmFnU%?nRqU*Aq;k*T6;W6nk+WP}1hC=@Bpq%afTK(d|8}J-!KYN-#ey+u4 z^T3_Gz$@M6=^h9V4n_HhXT-B;gXD(4a^8HY86%~!xxNA+6c2&F0|T;^2ex-;d;A*EdRDA!ji 
z$7Zf5%P#-A%w6WZhLZv{c@Fc-*UhH2Z(uOub%>_O&>$~}R-cc}JD)%!TquAi76v0o z8}R%5q2NVcG+qjY!a_0Krm{u-z@TWRB!feu6XR~!F9dx3pc9vP(dCmP!hH0?K)`Dh zjS)W7!%7zos9G%W`MN{?NW|xh&_u+bxcc$&)@XRJbuc)%De4zMFE)(~?flHHi#z$& zQ1E=~;AIrm-%8DBl_(59!gmiv2ZYv;|1vK`S~HZYW$?04f?{LkSyQJ)e#~{a7dZv* zCs(<9*1~a1!df|RtzN9EA2WW}5fR7r$uvz><|sz6R(!xH z^CgsN7&XSwI#3(a$@l*b?tL0AEkBma0iicBIw0zfu}<`wglgOr*1!{W0Z~8FhQBQl zdPMZ4$0Mw!+m-NQHQ9Y24-ILov?4|147{t{qNDIy$9TuYq46iC+7gbMG2LRJb1X7( zXyU1fLu13^U1NGFqJ1*`^IoCz+H>R2O}VGe-u(RZ=V#9(3)_&$Rg5x*$DBef%GK_H zAL7o#_JC$D0Vh|ZUwU*glyt@Lz&_CRnUOHZVFW01K6C5a2Bl|==sRU5XHtwBi3;^O zGBI*U&Iu}GJj2ZyPYU(8B{abE>P1r^dU=os&*m=$yLqogXhghd!W8PhAe#8_VDz%E zkube!h2Xm^<7Sm7x%Y9X{WSL;g_u zj>YE-40I#f)Ds#2115SRw7?V1XBp06qg&WYF@9s6lLmRyYTxiBO23 zKpsuv$m8&S@@4J=V}Z%~LAk}WbEfe_4wsbMWU81k-FDn^q&Niclzq46J2gubOgT-a zk}3XXXgWl|n*s)HJd+Q{ z0Oy##Y?L_?@XjaCPdKUI5%LjEvN4=w2j#MgwljEo2_(h-ZUuyNW;odKFaU{OrY1V_ zXd@#AGX8;Jo$mDVnsvGYaE>ANK>#Nz4;4;}WXT;JDOIJ@AYyVl74*>&czOuvlc|3t z=tGdo5P~@3p&?bK+?Co(kc!w=jsv8k>B}5JDk81KBZVS(z*W}ZAV0!Dl~bT*OXPxp zlt2JYs35P5JO-imQiupF!%!oHp$OQt_~c@QLzIFVA`r=#GfMn+6-BUHjKUDV2eZ8@ z5gDRTv69RrP~c7{ZbV*$mkoE+IA^}~z(VV>MC-Bn*2iSbvo2A-E*JCQ-n-?s3+2s; z^5)!o8hA*eO@tBz54@~VNeq#}ZUF%>V@{LNs^O6C0c@57SORO5q0!6fIWUTNz$T*{ zFF~C-)5kkbT|P^^Z z{wc%M=_x<{ER%gnSKUneO!sWt+?JW%TaPDPEpcnh3b5x-QT#t{*z+>IQ8)?U2+au2 znWZiu-YgKF%*8MyTg<^HjBUp%G&+MY$#clC^fbIr8Iyd?l>sB#xb&Z72MNciQ#5)( zw98Wf6fjn<(Hn?APnl1UM{wTj5VlcxJ9&g1g`MOP=>z(qiQ6r$IT=(FK0`Md6(yV| zJW1Ye^7g<}1GN1VNgn-0!tfYCvzV;+-8LYzO3v&~=?$jVsm7Zdr#GfJ1ZTIuvUhIp z-%@bNAm2~fw1666xDpA&oxAz`^z#yi<6jBQg;W@h>J}XkR55`S5tDAjD2}K-xusu) zmmk9cNtv|P20(C{@e+dTvOGdQCgEZTtN;kiBHAFI0sy0q{8SJa6HP%lM6Uu!4B{!T zB%k_6i@<6>H3XJMR>rXDiiE5jY;a_GKrM=Z3PPm}0wUCs4xrD4pXprMj7A-QFBV;8UM9}NfL{S+4^%N3GE7ZdjIzk@9MGRRnw55P2;VAAZ zVkR|K$_!TpD-BUv9$tQ|^!JF5(%cQza~$=Y8XrBZxX73h7d-$kNuneL7s+@@!bd^} z>K6z)0^=yc9tqY^vw%BQLY^wstCWelglm2gX(L40u5#}#m+-iXCQ2sGPrQh~it*Tu z_M6A1kIl5roSChgtDkx1*6w6Y%Rf(4lb30xp^L`*Cg}37@S;`R(w3K7YVt=-)_8`B 
zB#-cUgr-{#YXtw3g%$|PLJI`7d7+BoQK@1{uS!MZi$HG_;k$7|aWdIj`mcz}<$D1e zX=;df^%$8P!w7AS8Cn@jp9YZ&n6=}<1DV$45%O6{3h@Z}3Xspvc&7ur(=HZu;5A6n zi!uKB-&uE&nAa5*HK+mk+En&(QljNbsR+JnUo3+$dsftiTg@2_vKK)9Iz;5 z8b^(N1Ql}Js6l={ybZg26Y4SR$vM4f>>ucdnk(w>zwmMOx&Dj&7)&13+xL*tX}5=o zkYigk`ic~yn_D)wcus|Qe}pIHs3$W=TObwQh&q|n=za7VEoCy2qURAI+Mf1@hWMj` zFd(EQ&5}l4v@j9$)-7UTAQ*`X!Sh2*4KJGeq4M^JMDt1MY7BMz~^WD|))g4LqV{yl0ka;-P%rv}dZ={>6pGvw< z#~r8d)vdc-bE_ua(*CDq|7GKKtgPc_n=5%Si-6OF`crF&^P9 zUCKvD!X!?W($x>@&3$E^)X=cX~kJ*Vqh&v?(onVG=Cx*dsiJLa7` z(U&eal*N&$_US_t!;@VJ*P6I>&GIgzm(O_TS_GKnS()>QNV`#;+yPLe>P~`u(}HqN zpIHlKNq4v6W^DDc;VC~ZVe>yik_hoGh!tHW3$FTvtA3^}>1vEy8(AY!hYz>Cg_@{+ z7jDt~5;CetNxsFph#JS12HNtiF1k(LB>o~ajTwlyK>=CmFm>o=kES@fzH0c8T~jqy z(`FgfBPgSK1XHCZlSjUVpnMC#lv8I~Cu=DYl(m!yN)(t{AUbFut-L4ezagDA(*Jqx z$;zh#U11NHygI;MvOq54w`E34?EkI^^dCxH*>Uvvxlcu1xxyYSwnuCE)-4;u2=J7x zS(;^}_h_l}r?+M?vXUhdH(~^^IXbf(-(p+DN}IBX=Ptvyx@>&AGJch1R?u>0aIS`Y zO-?85A@a)ba%BJt@sK5KO-s)R;^jD?PlcynCbDnV%&9GRQ{{w2FVi}j|2a-+T`SsT z2^EH&R}7?sO}ErNYUq1#>kNGl(GDPJ;q$0n@RLV0ETbG1UZ9`t@Unz@1Vn`ZdGrjL zO447V8&tbk%qRvybD3-ihw0`K@(#h1aRn=Nl^iQ=!yx8K<1Ob^ix5M~ z2Cs_`0bcWIQWss-Q$26G*1l7;b+LN&LiL73^@a)KuUi(2Yi7)E7O#`zTNBl-6rVD2 z?)CT0oU>NhWJU_tbSaC~^$XQ<(s${{wnX(dmahQ+_z)|D96R7!1JX}WLm^JABxxO#bGRDd^-WRl)f0O;X zz}pn3a6AIjIH;q`tSm9RG}fXrJ|)hE)t>%j%%PH3?8EvEYpy|oO(#IhY81;$8-H5`IWzQeD-KOm`hd<IF^!r?xWps}EhpLLVxb$iNnjKXLIxD} zk9bX@2^;*wgRDO#36Uid>si&sRGYykp9CIwXR>ji-NbT{}lpbH2NI|{* z4W|J#)BJxz6H#TnX^H%`#Qc3O^gxbF?_Jl>&0#bWDadiM*i?m7yfz6ilGrhK!xwQY zQSUn}T%(&A`mq^5bA*eMBDE&pKq7EsqhSt-Ue z?X%HWVso*(+rGE^_jk`*kE!;Dsi}xUi)fMc$)G_%C7{L|CK8(C_vEUqdd{`}{h7s=$OmEId}7`z|w30_TCbh!=?nN>to1;>1SRbXT&n zDPG~7t^D_E??k^F`%Wym$hvV0(*XXpck3pY7Q>n0=5hB z`5^`aAo(E%3B&~QeuBS97X`32-)u0gn&N4j*Qz^5Kcq{_W;NBw7z05WVjxHovE`fd z6&P_;C*g>rItfTnWump0gkb&rFi}p$$=$Q`$B0tn2d3KAVT(Rt2&2{ngbG|U9;$64 zc)$4(@?m2?u*N5^FLzs*&N=)e%k!7&5AAEnEZqiA~_D zG&Z@8%oQmV)ijk!s3!{#rGG2OgFluEdG|m%kUH?=k93cPfB4>S%aqG{&}$Zr zP~>00M)4PgsHh(dO1p4gKwi->I4}%4Ta9q~SY5Y<_@i9z(rmm_M;%LRsjJ+3j-qRw 
z2?yZrl?UoU?BaPq*Z z#I{q3&8HI0PbEF46V;~^t}}7#8Ah1l|_i7@!_2}692ZTBN z!jUAlR(gSMUxk-Nj7S(4t59BBHe=aCFOO&ogzC90WOg*syytG@)o0_Uo_g*1_}O!b zqvsL_&Lx_kPI{h6R6mn&JsY<^`}1NU0`12Teutix`XHM}lr4K)ViE`Nu)^$dyUAY#4hh_l6?@?h_;bpyXM*d5eH39Vh2({opP$p`DK-#EfGSNSvNb*{#Z&U=_ zn<~|tKC2LL3d@TW3a6}(PQa-;(T4MI-4}gC#=G%basMp z-)Cdz6H#)v(g6x7e}YE*Cnn|nll}QfxopRR$1L*mG1 z5g?>(wwg8&q24IVk`T<0>5VH=X(Hfhm`w;g4YLuH*ag)knrNTp7HtB36$x?-U43%t z&d(%Gx@^WV!7?&Xbq2OJ1U}HNm=}vZYXg(^v=7p*b;nnr@T z2&WM7w9zxnlUaxXA?lG@a`@blc2?Gt*gr1}2O~U7!$VQ-@#CO! ze=l~|!%zg3;qmte*f9_(>nUczjAicq&1rI2pV5-7_v?T2dov&X@tr?9COVkv#W%?8 zvB|&srC-MLeAF)*_`wJ}Ng@^x2cs8!m-ucNaDvH1e<0FrQ4JOnZ!(78145T0V7KS) zpj?@}srDIa2u)RYHTFoV?rIEqY!NVhTaf_1jOj%V=Fd=%{Hw26idFjoyho?45fVB^2~{iap8P^M~^W_mzr#yq8&@%ELz zkCGCWIn+`khcnkc()s5&S%uSAmvaL@m3vCLnKa71hD@9slJBWTh@xQ5YU4UNN9Ou- zY@*UxM_ot2e8tM@v4pC_tF9D~ zvF6bN^dhZ#a-8a|5<_Ng)EcnHz$~v=qxkL+7ppx-+`+?-;22hrSs4nccuET_ z*E+$A28KcGeZ7Kz@Iq{z=V-tGJY!E}rl`dixD4ZT%oLG2XRO(Ccwi_L@bnKvJv_?+ zgGJgInqnKgo=sqPIII}W*OZhd|~K(3#`|+YOv@Ry!EOrWKpzOEI0+*!!QHGr;kw@ zU{4r)b7WB9+3lfzU~!(0@uDToZskC~=o0u|oU!Bu-}#}Q9+dDFvIsQ(1?oyNo*4DR zT%v&tpBjTvJ`B5o#F%}NZhy!xu)cwfWCzv4tf2A{ z;m@gtIMq;?)px=I#o1Lw;82{ZyqD9{cF?3y5n zfsi2YP4bkHDV1P_nEC^_kv z@4LAIr)*vIUPaaQ(aF(!j$+bJ{u+NhI2nXC62=Z^pMSM%-qVpRf9&q7y9pkDV2C=I^VaB}=-Tl#MGWT(H(8tTj_lzjW^Q^S7SAv-U1O-*{x+dK5+^ z)3?sOboBP|TgUH|+&wzK_Rzfb@M0Yf5zbp{->a#cS$o@i%ezp&B~ia6S-)+eepjM? 
zSF(QhLjA!+{lR4Yp@sU6M14oHzB5_#IAyB3x5_h9db{#grR;)!BuH zy@`gs$%g%7N9G-#MfaAm!zsP3aOXQkyMC~4+nuuu+m9x;A5E@1HesAEYFI2PiB}v* z6djl_+;?(?B@2!<2^_dJyyaN?llz5S>FU4bOocmqiBqLgxpl!THma0qH)D9KwM5e*M0!B+T}#0#Y*ZGJ}-g5^5tT%v4!m-x}{L(;nGwEsk31CBuqdyFjl~7=kBS zBLh9r!O&1dxJPk}a=k+#!at&knJk9^+mXR=oA4VHKMxOr7e?{ei5QlTvdH^nq#XmW z+$zyh$+~o`Bu&fIk-tj>?OVu-tpo1T>zgJwEx4N!?xxw&S8C>JlI|U2mPL2@b?>D2 z#`&A!>F`4BwnQ!F-*=z?&hyFILkahxG0Qy&X}&Q21(-OV(2XBmbd*f^Vdc1P{K=`n z&ERx!wr)C{a5Ud5bdx>f8)Y}ErmJ8DCvf}1tqaMj)@0%4J8g-=ZFemR$AMSZe&73= z_jUhkn-Y#ELD7nADb83RZDr1q_bA%OgCNFCWrb(RbyqEqLoX>~L9+XxHFPoHbz0Lz z&KJ;ML>)pSQmI#<>B=%|wJTIUiQRHsPC2@-QAsPO!;ly1kxE(pWQwE{c(`NF#K<8S zy+&t}lLs7HM9+rS=emnKQA1r+S=DH5W(}D0<(A85E3Mq3@mp(6%O;lAgv(>(UHfF7 zw)Iji%Udtgwo*7U0Iu~OM-_V@5Ybp-`;P?$ei$e(V^an&518tbWFeCPCw*an%nS4g z_;N&q=LrzD5J+nB#_HMJk`N$J5X2UqFl^B?)Ow!xoP_y}_lamrxVfG}-SA?*XC!IL z?lf);_QUkFN3x8;&~$e+;@Jud2Efz=G$S6E$MEz5r8Y79)Vn=^s4bq;u>9{Iqz0kY z>N{Si@E%pxNZtT>e*q5$?}Pj>ne>eD!e3GBUsJ3lEr*o|5u-srq?WIz( zA=n%2_lMG!vSAgT;C40*68;CW#fq~i4~b6ykm$~R+)Im28xpRDI2j>07jHY4gzsuy ztg60QF6;z`EmZMy;Hhhz6jWK-9BlbYX63FtQ}_QVL3E?@2eNzwL7kDAK(7f zofA*}@*YZPo-|JgQ*BfJn?2J#GpE1Vmn`y*wJ+MMrk=X_%=9yF+SjUhx~}+;?%kqV z%4eCh+$W}!VB4Jq z>JyIorQ!m&ZOoi1<%-IQLQd^Zx;DhE8<;#+G_k(UqG~ik=x>QAdOoW5i>I{bv~VP9 zL^kHM&MRuo0+}{NiowJ*Xi7%(e};PiwNNYyFsh&-)l!#5=ec%gEd|KHj5NV8iZ?>A zNP{_hkwtbrMi?CT7O+t*TGF3!kXWT9y@V7U6wM|p3K8DN8_5iEfd<`N+R(xOK{#vk z0#i+vN~1MfDX~ZcmegF>yk>VT*w-ZNYvT6iJErdz{!Zb&l1frjE|hFalx&(WXoNu zeGy@?e*FiottRKY<*QOAT=7aPYEl*oSvgl}s(?Z^&RLSOQ^-LUQ!oirh4jnG)p$On z+^IgD#blcrx;Z*Mnu48r8+z379mf&{Q|@Y0v#gUuP|`^*aR{b%Sxw#iOOKW5T`bVM*r2DeYB5*nZZzYecd;@3i5>bA!kjrS;K-Y= zFiQ{O5{u42ijbuheKC(^2onB#1_vV1O%clCq3`Xl8>cBGBSNaG(gNXW1$W9|GhAD|xJ7IvFKeqJz?a zShH-5_#j85(Gd-jTuPE5QJEU@qpI8?uZWWV8kI!8hFm{^D9Nc3B`ub$hB(M7i(}rl z*W4@Loh;uw)^V?8TYUSm*DcAGQ&>>F+j1c8XkN5e#O-S!epy(xC9!Hta@Dr60AQ|qn;RlB3`-k?&!SZIK)L2Bxtfja8Xb-ReE#P^s1R(xNBV4^>|{}$3{K@NVZmh^*xAt2ZnLbbztza4|dSlL@A-ZDupN7 z*qilHRyI}GhwyAzh2TqBjE)vS`z(Nt21X*ZY);P&=_muuy;$wC&dr+iChB4VIRUuJ 
zs;_rVc8%E>@~V5&UU#p2Te5t|SjS>j!$OreQRSVjOH^$fdwj8S&CG^G<;Jnj#TxHI z&E`bS<~zlAI`76vPI+R{Rkh$+1H)M}X({8I?36#w`Y&4D3)V_}C~Io?3BnCT9Ae~G`yFChov9@t~I zrrcb^`o-qf5A3Bl>;GWdi~aIw2!Z3HjWMo)KixNc&EOKSKdhC94qcw8AKR}9{8d*KeE1{7_vk8?wd!y@Ts_3Pa zDk@b~)N!M8c}4VQ+1n-2pZbZkNNGU#ZDl$`h5^?30#*J6@;rEld7rn8nn7~ctK3Vc zZ$ESEnY-n0H6C2v8s+^yLSKWpvsq%KT~v8nQjRKw25KwYla3}SPO1FCo%;qd<*fo= z;N>F3oUd~8RW0*Xn^&mnXL-&u!4;mf@>6-v*H(H?d?{EN#K9XE|DK-mEUZ-JebxBb z@B@4EXL!YyGieJ8*_0zQkMXf$E|Ml1Tga-xjlC~nHVINLymUj5o%g*|`S zL*KM(i4{xYr2hUE>6>?v7Evb|d&T#w0%RFBdimqh%#PfQp!Z`ptt3hFj}L~@Za6;B zf{nZ;wr4=JF&q=26)C9 z1}eA}uUT}VL(|_GW}T`Q%+L`6eDjy}xIl+=@Xco~GK@C_(Wra%g1aH%Zb-TtuRigv z#l2uzov^Hq*S5`Dwk%dRE>t%ss+(sIC#yH#al>5Nm}}8qJ=OQ7z3E;_+4Vh>d%m$R z4((}q?Y%J{0*JNDUMi0?lYuX-x!JRP^5e$T#U#-6Zmj2kyT`ka9s zrmvvgc?rZ;JlRs3(b{e}9=#%PbXtVut!Nu65@<>O=v4^qls(_<-%fp$ev;}NHOs4^ z3FX4yQYAb!Cm-d4m=-VTenlrQgLKS50~xf!hhZb4`69MA^hVS}BtznMP`$Lwzsh~v zcQ+8yF9nYD2IyUzr^kiv$s9T#YX!wb%Sqxck5BcOz2Dd0*K^AD`C01#rfsyr)Bne4> zolpMActrX_@+y{zrGnxMVleh`bf1)sh_oaW{wu1$DsL-dAL^8*<{3)TMIIvwlN5T1 zJO;&IqY%SwjLEJ>C$JTeu#ZAtqPy(8;uG|fx+rHRV!0qu`TfGvxQqW>_s8xKP4|6h)-ax@8GHQbJtP2@y)+Au|c%B~3}4_r89UZr#Ml5{C<=+>oM()C=mA z>AJV*c2eEAq>DrdlI^S_*Clo9@9V2{M|BeymN>dn+NKnXO6^jmo!Fh!)!x^a>#E0) z5*JFiCPfjcE$Vc8lDakb^<_whR^meGJSmDuZC0n-o78#k>nrix)+G)XN>`Vnh}2$n zx}8bgs{4AoZrv2#I4+b9`-2cl)htgJ(&;vgAt7DUDJgjW^M>`hrU~Kt<;lzUIRrmA ztK%v?KPakuyKMd2rHyZ|KKM4m#hb}5_mW?-{_SGV2Ne#JdCB0c)W^BfRDGdt)70S< zhs(^N+nu*Mmniswhx0W3plI#e_+Q-kcG-y!3QUyA;z1^u$}#I;wlBruGJEKi&biL} J6lApF{{dGs(Gmav literal 0 HcmV?d00001 diff --git a/model_executor/layers/__pycache__/attention_layer_base.cpython-312.pyc b/model_executor/layers/__pycache__/attention_layer_base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd41e35f88b601ff5469b821f61fd0e03720ef0d GIT binary patch literal 1564 zcma)6O>f*p7#@GFcasg9rB#KLtg3+0E=W|7@MYw+$5|`>Vm!O* z?x}~|`UA?5BdC9XpMV4hdbN}TLI`o&bm78@cgAZst%SrJX5N{3{eC>pGybXH?;#k8 
z{xsgw5&BJRnqjPs)fkLtC`29#l~7HTRQ1$U^E5^5wM0)1&ycpBm}$rBNZUxPwCi;h zL=9$wpPBU14R1q1Dmp}A=M04wRiA5h4sUbbwR?*X4*ZhhAo0r*M>)ej=QQJSp6w;^ zDaDCDr>u1T6yMltrR^UKt1W+Aa^?p-rF@cywo>&Uef7nCGW_)Z@aVJAAsqCMlO!GH zSrnhZZsX{9=m!(}WkCbj4)&WI2Yzr$voNSDs%LY(Sv>&b8KTHj00uQwJdNt1N{vu^ z2W}!Y)X((iYiG}dJA-!M-U`h{2kv!f_|<)PZrWCLwaxrM^d(wf{M^GZ{tpWHX8ZXr ze(0xTe-~38IJS(yCqBpJv?y}MF`rO8yM<|X5@*!uNry+1xWom^XK_eN?BgI$(>%j* z#wm;ZfJ*pJDCco@@`?sLt}!ZQjELna4&x}I3|Ok!!F*oO(vkO-$ta()j9CCopsdkp z7Qy@qBnPn06S{aTKWH?9zA>lS1MsB+-C#bc_R` zUR;8FB}tbcX}ur%b3CSalxGyjkraSz35Pkbo#h;7l!mwoR?{WYj94Gau~oJO3%d$a zDQd)+y#-5kMKpOjBeDb{N?7dt2NbV)y(a>#vDgGG*cCXidzD2<=BJd9sz(TrGEIcN zNyyhzKUv>&2nq9m5GJ%z=|W#@045V}lZl{gTTG&G8CRCd#BWVbqVV!_n9k6JY8uvZ zfXwZssv7-ocYgWs&Wr1JUn1D7wB~qeNy{+$-+4;}%MV9C?!34+dMRw_cAe3U`g-@3b+1PA6ZSqN7tf=-3DZ(j6y?H1%I+WN#yP^jqwDABqhGbbxxW9GxvliS LdHZh!i|p-RO01}= literal 0 HcmV?d00001 diff --git a/model_executor/layers/__pycache__/batch_invariant.cpython-312.pyc b/model_executor/layers/__pycache__/batch_invariant.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4fd9bcf0f6f64fe5c8b6f624601f803f1acb869e GIT binary patch literal 31725 zcmd75dsJIjx+i#!-ftoCep)tQ5C&}U0|z@!Fvd2v!Lh-W#FdmU9tn_3}@J+fydir*`??2<}uI@^&?lmJtR9SU~n)ckAS>5?#)*`Wc zy6^4B%=3(l2lF49GiXi^E>{-xDw;_!@Weh~Pp0LT3m1>~nN<9Yl< z!*O>wALrvoxr_8xjq+oviz=S{>QVKW=AwrAHKW=w-9;Vq^P~EUdRE3TX1r)*aoSPS znE9fa`E{d~G3!Mu^Xo@#WA=;ov4V>QENmEcj5#klc}~Uo&Hlm@j-*$cDZ`Uclz1_&6SDt_7_jmet_!>}$8Zp@Af@Et!Z}kGxzlKBLBLm33|u;VnH}C>JND){DNn4!siY9rwf~18{?iA zReRboNjwu_K{OqI>7eKAg~69D41Pho*cU1|m+?o>i=uXTbbKQ8g#iIkdub}{54lxB z5rr)NQU4gtqEKMk?=}c#L<$yitdy+Aq!9Trkq?UeRnf+ZvU*Xm5W#b0p1(P_J3KbA zdm=E=62^S;hg;s5?0D|r)eise(ZHqM6I0uh)0=A7R<7l;+bh8D}?$Ce7;>y2)oy*AsGF0Q=u-G%SY_r;6i-uS?hX6f8g z_fmMdYkBXoH{m=u+a1;2Je}5;#|D%7?T_rmx8Iz5GddRA7w2Q4cwPKjd}yg?$-6YL z#_?^gw5u*r-;s13yzfkGJAA)uP0d?hSW|OGOZ51hdCkBX?DKUu{~%pn9lN|({jhv* zvV3oOJEIO9GQCqrk*OK?1etG*n^RoAz9d)Rmx5XXFf*lEaZNgBO 
z)YPX7x5v*dIo`XlbZ*&k@4|bpFQ2>b_~b&ezB5^P2nF;v&!o+^*;9|v72FV}X7&Lk zB`w_7;JL#^xClRuDKUcu7*W%-oY93d)DA6WOkXVZi26gCHh51ItLI&BFm91iXgw$Sp!XJgno?mWprZNq z4f9j996fWk`?TCdWvG8$+G(K$5QJ7ZqQ1*>rtAEP9#MZ(_H<`FhLc?bo};~814AXq zDL+Pd7;=6E>+lYj8_$u?c+aE$@Jw+i6Sq}lnoD1m^C{SO1H?Xjo7kz%eKg? zi`cVy%Ojz@ z)??hf`t^ByuO>|Bjy;i@NHuCy5OI8%Sx2wYJl{6sifoHmSv|K$ToD`d?TBnwd^3Md zvpvYxPirp_YJ!`{*T4eh==UZ2gZw|9A zH>-2L8D%|{D=|}}h&5sxRw+iOY(rHM{79Q!0rg>8!R(2Y%cFLTd;LFR-tCMyLn@44 zXJlu@PIFGt2l?k)c4dA0@n-X`Az1azzbTc(9#IQUoclwSkCQlO>=vkdq=8~7J`-!8 zSSrKHHAZ+ThoYR3m8;uO&a|$aThS*KnT}MvB8`!%4dr!m%Lv=hEcEF<(r0IG(-!dD z5FeDiEiq(v&LoK0J_GY@8vF~Ce)q(0KdWa(uZOr=ysJdV{aCvfkG9;S8 z5`+U2AT`H_MeVd-7!OgM#=YUa?V<*6pQ!eYhlMVDTQtcGPIyYxhQM$5-Fro?mvK$n zOY{;-DWBm|zzdcOLE7mA2RXEFh{g~YYM$VI9psRp zD@XDP&rz_GoI~Ugn@AwM0waEfW)_wS=b)&+>S0WvL`-MA8Is#GB-+P^heMvL9>@Z~ zH-)YWy9j6#Iql@^C2)EP{Fv+kR8|bFlRTG#qS-q%G>Ja;qPIlV)qtpzAY-_wgCHtk z{T32=ugG7@ltYsx07F4h$Cy&&8)6j^jdI_HL=AaGekeq>$tFDbhdSO~=V_~BH^p4qeM(%N_+S-Lx+ zDf+yy_|CzFgR$QD)N;dveGdyyCJIkR^{b_|X-i?ExE|5X5A=`Ksxs4@C8~{1f^#h_ zi!~>mJEQuvvoz}~fA{F!-o@Uycd05>bs*_HK*@O`e`9v0ORDdTER4MG{GjZ;vQ$Y^ z^hDmsRcFzi#)U>OHs=-(E$NpXDd%3)Y_r&h&f282_Wh$D^uO1?+;RU*s;>9b!zt%0 ztjOkx)HhcauEZ@%qls;Ye$oER&YyR#l=MfLfoN77ou;o;;<hijLGs7^SX3#MY^E$$9!yG zOrI=p#h*(SG%sCW_WecI^5uJf@Sr@o`&6>v)R$^ii7k2@-CRWh zJ6W)6>9UkN*?I)oTNT-#7--gDo}fs~vo!b*+Lm5UR`0*h-#_}GDtWjsd9W{8-IuuV z$`iH9VT*P>(JAx)*|}xSmia5I@9tdO8F$3H;%}xZ_RgJNtLF;0uQhT;=lsh_Lsits zCQUIo&!zI^GY=Z_PMVLLAD7=NUv66-O}V?1PDljgN%V9|-dXYV>bnL%xd~o8^W1pQL`1C~b*kJPDVA3%d)nQ8hczbLx){u1Wh@ZnGUg}uh_ZN=k zj^wUG_r1woFC-l=Ms>e27pBWx@s4C!Q?z##E2LwgBlaA!p1XJG{?JO{(T9aS$-9n6&qq>)=c31m0pFLjY>S?T?{@!Oe{BEVLyLzVf+4SLOIg~Z{Hnop+c0OCFNrzg zB@YdaUykuyQ7_LJav_NIFN~8j31^)^lh8WjnIFJ?+wivWZ4*gHMi9O6$%r}g3N$6r z=ha5w8T~2frasIRev1bz+AcRd5Nzg&&6dyVL{MIJvjY2jh6h!Gk-y>Nv*omM+*^D& zOZkoD;ekiYN=TC5r<19J5t8?9c)oAS-z&rwf8PKxaNd4lJhS@y#{9i7qX|PtBR%;z z*Ez!+WJ)NTkJr>FD0OYr5H&_k!w?_eGC-1*YkM9}&u7Fxlh5q4;LrN5GIt(s5_xT# zd|BDdCh5F9R_OOSj^A(vhFw$RlP+O0=mKpz?h0QCfS4JY^m(Cn 
z(fRz>14DjBa1Ld~s;C(;_#~%(JPYT)RV#&bbLwzLWC9&UMlgj;*=G)@L_$Hbz+j8~ zSU3_|lJgS-B>RVXbeFn6M~b@Df*aowsD{ExIBt!o>FXKnQjotSmJCtVT(h&YjPPRW zf$41->MKWfwH_WFAM%cdjm>gX#nuuHmjeFT2lE~y6UICg94x{pg3&lp|PFHbmEfc)vqr%;G@CsMFXdvSu z)R;}8mZ{Q&DF6|Rq2KUGgYE%SgM8rtK3L2c6N(w~he9C3hsGy^VNvTJn+Q({#7u(C zV2n!Jfegpec0eG}G^1NZoy2O0S~S8x>NaoK$rwMvY9w5zGKADcJs6|mz~#`!aic_; zs($gs7bUvc@zlr(w~_CU@fZ3_bkW~Hmtx>7*4rI(9rJB-oiWGV(#6uaW3l2hb8SL% zBvF4P1=n1gHd|O8@14sFmp?OCCp7yK+xDg4nyb@hd-QN@-`#_Y2Y=GJ3Yg`;GykI(sl*}w*>g+vDvQKYwH83)`__j zvpt_1thdc`=J|6me*Sp8?Sq5w9bDeOVt8)0>;KZ}wWcRUoY8jMHfKv$ZDapy(pA;z zipq3(#g|rnx#>v(rCaB$dDF|bOkb-lx9et4JUPH=tD!H_>_{7o@_&o$p%_M>IDTOM zmfp3>BCT$%igT2u3mim@{?HifNfvB>X2?+-8%#L1e_2mucXDRueEnSOR}BrspY!NSQU`+5H*U_JcJ9x$wnD}XZy?e&V&Zq^cmYT z+CTeWtNlLBCT-0_SW#<8?3!(fJz`U|KM!%GNX;WIgTT-3)5tw7LuFz!+dFS*lmhb# z<>*O%2=y~_cc$POJ^A{7t)5h<1*~#UMGBZw%%SLqd}3rnW;Qw^7C^~tZ#?&o2jV-O zTR%N*qHUmCQi{4@KH>?D4~HRAb9qO{LG?gT=Nk2f!a0)3H9qX}y4sJqAW^|C*2)0% zz!=4K3ETWt*A;Kj2m1uqc+l?>KpP=D0SN@5c3oG75oV$~5F8e|(eJ`ha*o64ciWjr zP&h#`r^z`3huJB3!@?mV_T^Cm!6cxf2FVaoqSml`m=Fzcs03u0Y(O&6IthG;kb%5+ z=ibs5UPu-|@(_9?0hn+P1^!q3h5i|4HKbluoZWGIY;G*pm9o`5v~5q?w!gpQgU0t7 zQ?^~RCm;2h^ zRDyE59-sboNqf|kc2>PB+?`sST5>F%N^R>%Iy)eCu$RP2lJ;#;4V84>zA<-WekOi{ zbn1h@sQ*;^Pn#c9uh`GBWJmyZF6?~QdADM*0#+Igsq%KnqUQ8#4z6_XS_x-&{?4C# zP`G06VIM0hxpQFQK&&oay4ad3bVm(ogZ)SC6!-3qyN!#D@qJ5W%X?E59jU^DNy94Ft|XHam2dG9f5eW$gdOzvd zyAg*;9K)RXQ14`j7|uavsppawajS{t?t(2|C-c$DXXD@L>Q$iD zY#Nx7PS-J+&&^=-8&?9r(_m1Co<{iDk_@Lbjt9Z~wzRp%g)IJeg(fE^#)Yuo=VqCY z9B~bVy`d|9pKAzMA50aCKF=%#71bPM8-?peAbe&04l^+^8bIOPP7|8&4+VxPF3X`a zkYD`Yco;W7eS8qSwJR_N1rJ%KfuSD{Vy{7EGqnq?|HdY}*cTjT7TbZKh9a!)Hp0A) zI62dIyrYx;9zhrvgj+}ey7x0~j}W-5WfJ~~QcR3>^Na>CMucRAcNiey&&W^9QnZZ= zfy;p)%qt}j25GZt_lek1uzQ4rpHki>a;O%JYYWk$V52~egxIB>K-!vxpCP>h6Dahr z-~m%s#u?1fOK(lh55#)zo?1M$!WStPQgyH+PoIp2Evo7=3Q`*s7*9rb%k5(}sey!3OuXea;@^Q-%rziKm+P zrVM42R9OA*ta@-APR`=IePr%PtTknJCp2z`*M^8Wl^Hzxo>bO-oc1`2>Cn7OJ3^vm#aCEEU!}mn{fAr2Rx9Im66Pps5kJ;6t1I? 
zK|61JBH1Gg8M=2ECIq&KVH*8AQ-oSc7Od8sb?Qv9q9%E`BJ3ZIV1NvK5OOv4!Sc1)b=vJ> zN?un#N#2Nz%;E*tB|m`|{JzN{zfS_h)^oo^ND@iu)+^}9^<<-P2k8=JSRyQv|Hp7d ztF%5PLU7OeVB7j&I~KP@MasF?huah~7b4jt+EER=Ls3gg?l(m3RrXRtp(^TwB$pDx z5KCof#E?WAW~f%872l^C8mL1hdNT8QB!!4U%3_Rlo(%e6r84y3FOm5;p^r6qKo3F^ zV7=Wk*Mg;Ls+jGj^;`h?&PfmfpKZ~Dio9s z5I@tuE^Gp!S5*1#rMrQ}z!JaInW{OEEIJT1rtM|1wz(OQI0nw@obO9ow#R#yt;xFQ zla}WnXg{?kU%ZgCTp$}+=M$4mA5KAMSK7S1?|x{--W}DX4d!=*+t=r=&-;E5Ng68W z)N7?&(Jn|p3JT}X&D{X%f@-QJ>1bH0Uv5n{y$I#f?5S0a?(LH|Pew1snv&MK_?2I2 zb~6-n9NKcfe*%+9lFDk(5%NPObqwR&LhNShA2X<`A=MkIs}&endw!~lR)44>$nzYO zKzxLURXfp3@^=((q^N(YP}hKBW3mQ7-h^VKRXD>()U%fXQAzt{poX(88p(sL6sdXe zjUqJ+$7zY zEpOQZz9rw<_UTCBC4dUZTRQu7oGyW!l13cTrwj!IUcwA+!Q>@I&O~TE!xL;Ie<5O_ zlRT|#vt@Ml#+_E$D2Z79cAUh(hMjvehJtFqYD%@u;Mh*r%UAql@kN{m#j%|{vvcQm z^lUL^HP-1m%XU{63$?m>hh6;;DK)z$Ll7>pZCXmo7NalXX$1L^G52Dm2o!RDoQaZ( zgxnGz^-Lax^&;dillYLtn{39HOgnpVB4sq7;8$GJ4)$3|Iwmna)8Q5AGQ;m0}?(aKTG-}_(TobaFCh3K{G_wZ+qLBL`+f+on~1^VdBL26(%1O z)uaBP@Ci#98lRXF<_IF;D2SP^P5S+?iz6~X;#llCjz##CV%eu}k*|P?@BxLzTZV<% z>k}<9{lj!&#KVZjOjNs%aWahN2@%iDh9>7oREb2(&@TKdWO)OVJk)|E500i0l)~*( zbEl$*Vzn#grnvS4(|aa(BxRAQDh5s`vAunH|Ia!<>0EBUe>Gt^B`c7?=71eK_tuTH z%`x9|=hVWfq^%~ZW+cq)49;iVJ~Vge-Tikv7dzjd{NT;^-u#C>KRf-&>0cE8vhwGZ ziHg3Yxo@?o{7&CO-(1flcup^zj!Fd3+i%`{bKd`%9>zk&cMJ=LSn&_-Q7srFt7E?S zPUS*n%2Lf}C8KG!YZbK4-aoz=<)Pt;UI0Dy%A9qM2bE(hLsg8n=+GQ;Ju(&e;U z0M>D2UU$c~V2fRg<8V=`py6S`?qtF4Wqzfg{lS4>9{Kr^#QA~5h3}-gU!LtnGpx3G zey%5aaQ0Z*Q9OIdukv@!oqs63T8Ohc{-cklQp%KYo{z=7& zg$T8fWEK=6STft~92@g&IC>_ek^ULYRhrkc+^V@Mh5<>LcO*1BSZ!eWs>V9V%H)uj zBPxtl%7*G~Mpa72J3@PY^O;O)ZB9a`Y!GNC)y=~E#z8p>2|EgoWkqn>5c@l-wDIw& z$b5(-h0jQ}e zf3JY6Gvx0LoDt4arEl?ML{F$n_ycl&L(WI!Py@a|zj4!7Glm%(Fp(>B#v8KF^#V{R z=iY#ZuGzJhJbPsi#jqrbftJp-kCOKxaCtoJVgkyL>uBhg)$!>xiy;Bb~a1RH&vHKs$o(AN2{iMlHRpU)q} zF#->SqedwQc}-r@i2;}&a+525uP+4S8~-(r7pDsXV|ei%$#+SLzeFHFoM^-tiiqr?W|+t+VipLe|V=KQrg*B7qG9e?uXs$)mI zE$&S@8e#Qnwnbln+~jueT<`qwilsVTSQ0&zwiidOX`|_O$y~{N+xIJBI#aS(5^wu) 
z<>z)*c4=V6-j*=5<;{bgzh>Lrmlt9BTLHW9(ERbVz2NrL+*HgN`wk?vQO#;mecZRy zl`3k5yv0%R(6J+lI_+PoT@q4`_HSM*e6ggIM8$E(55M=YxG7oOwAA)-$Gwi_wtJn+ z!u|GC@sX%$6?N}iy0$z74OYU?nZNEjZfC;>FTaPn?;xA@d4KFey1GVg4Igcb`sXiz zlqhhp78S>f7ps@FOOq-4eu#ZC?OY!V0;aeT=d^vPqPqw3$t}QIbmS7llw3w zw%EyMHo>OS4iUVJ8xs`!ReqfryGP+eq9O1=icm|q7CNgudbga#3HD`X4hh`P-ZD0n z;lKi*9##-p+?rp5=A)fm#-lJ&W%Nb9eJQIe%g`4vzf|By4e%r3u?-LKBjgkLQ`X@} zUB84HsU{%UNU<`#HxDjy0a5se!XZL?68jZehuVbSqNI%2PQfqfcXZx~2|th(p*l&g zgOfnsp{v6GMiC5!={BM;(g^{Dg@h#K`AawJ|BL0Iyi`($6L^5Cy5&w-C{)_(3N zA+96#^3wJdNAuP*p0`)P)a|Jmj~fawbx z@m|O(R5?O^Lxh0`# zkqCZe{b+8n`nAG?6{!-)?4JJ+A$nl`o>@JTxiyZu(@es{tX`+>LF|56iHhGRJV6p# z@iTg}oIle;BvXan$poySw0fdG+weqk)>oQb zkL8megZ^6o4U&~=Nw36ubnu>vEm9^+aBHMBrSZZJM)*^Flc`={p!;X!r^U>~4YGL- zS@USg0Ok!Gc9Se$Ja24VTw-a)O6e))OpT!?8A9kZ07JKdvo|GoObe#i{`k=HzLkQ5 zv%P7HW4<>@ocgN8!M16C%2I_8n0n)aF;*WRSgux?2w6R1Jra+Om2n(L((X3c%BVS#^uU`6FE6{hLhWs z+Y=P@F*kzwugB z_?BiQEAf}hVU=g4swh)Ta{B>%lc(=$Z^%B~4OofyPZxq}mCYI?fnukNk>2bqtKh$m z(}RBDzW}(~#Ab3%=qCKnh?UIIZovlwZDt%H5zZvh5)xF*pqS~OQfwDFe@PDQ9wLuB zO{9_@P_&_S_vT1=vWRK#HsTcAX2*5VR8IlQEQiS&eLmLn{)G=-fA96rs@fBpbBP1J zpVog`n%H+Pg@C#J5rk^5%)Rm>|DCagv6QuL_W0*|Q*`g&d1Joqt(nz|ZSSk&Js+HY z?{vzwXQiSoWou7Xw9TLU{6d=qV(fNk{s~{wU1AtgGQVUa!IobUCg)a2 z@%LhkOt;+NQmU*mG88fhv&*1JusWkODOT1LY4rG1mwCm4&a(g&#Fja?z@R#WT0mxV zu}t2V1DP1vDD4dMl_=|s>owL z9C5m?;|BfS8{5y`iv}87xGd%kmCeBp6AH_ReQ?BPBH`(n9T@%8Mgqj=Q_+Go)waew72)SKNt{eWq z5O$X{3mDNPRh(UO@MFsS6LQ`qhpg0KTRkCJKnwpJ#nC}7Ax;T8BGdr$ z?^4WvPtGxN-lsGJ88Ul@arXigPm)!$o^5BLYq&pd3}(-`)XhU4o%H!^HK>eis0}|e zk_(ZjlLj^KBkh(21kFPuEgL|W0^yydo${Z+1ESFNOath2QT3MQ&$tKx8|nPD`5~OF zuXwLw#o9c3{2N(9e-0aLr5$v}DEU!gDV>0Rx1H2esluHwNyb%VuOyxIu&y!KezYTF z9bL9q7O%rW{X*Owhr@A61MRZ1iFEP&^|^7FNt-OU8|E738)DrHEh&?W?!a@*ch3cI zKIxm;EU)IT>?+Q|moAjXD&m)uxA?T>QQ@v0f3$qN4rl>A+hI^y+bFpmwEEHiZ1>D@b{s=d1LX0dpfbj+| z$d4e$#z&^R6^EtQ)8Z(*@saMa1iXA{Ovb{f`$karTF}4wa;a(ZLh7M=78IoG*VzK) zg+*LlK7{I{_)Whp69kYSY4XXB@*TWz!*5%J&s!nGHQnj_1h7XGF5HZ`jco 
zpqbIas>7JubR+EMPU114!O`T?gMa|YjA}%`MLJH8MGPMqKgBcWiE>EpHUr%?4%!xf3v#rX=f!aR?olp#fy(Uct|V?jf9#MP_7?P@!b$EveM8 z=SW5m%6*5$TlZ78BaIvXz z55$IaXt!m}@(oSP(^lA$*CeeqDXVMt_&P)L&#lgNX6DSYAZ>9>N} znO5coGZ~sYjIb3b%4=v&8-@Ik_=Rm8PYF3C756$Jr}w8af;dW(q9uDTt?PzB(qvd5 z!3ycAS(`DvVka1eac3<%yMXiH*Kz6$rbaHGKkOH9*#Jy=0z=I%#_%g(XJwR(dSx>l z3;3z8uy4akO0qp7%(Fpq;G!#bzpfdWVx|B84@W554#ROt=+8O^@>`+}@S;T_i*d7S zZN2QaOkP1xi4&_(_!C-DBxM5Sh!s^d-xE6!53QJYB{aL9A*?-#5=yxQ31**l-E?+y zCPNO~!;3=$fPjr2#v0641x6}gAx7lGkg5I|d=d#XNxmgGAq^Zt3=&!>84U#eUg4=G zZ9|j(88xX9*&$uG*2a654t#v@-oXz$Q`Q4mjCylIUz&E5#_ATTNpdH)uhb)}h!V!anJpg| zXk$4&F%w&MRSN zQ*&=`)$a**XAY+9^D6FH&aHG@xL*X5=;9d|s{MYx?WvZ(_&S|R@m(j{i+cH@;go)p21@S#?l+vNYcdL#cKQ0=ECNF{R z9u3p|p_ib0hd$oz5{ox(UY@?Q$9g1vq8`5+Ab5o-QGX`mDS9gRnckxUQOpAIvSJZ_ z@FH_7&yaT_!+gND8b?W)O{YM5XST%(v`306x;WGyydDt7gQ6OD`iM37k&Mi7PN~A4 z8{@*&&;;&*@(c}M7PoIq9}O#V&?WUu5iF3BQXusWvkwr}=m)VdSC-lGdnnS7>5Gtb zv5-!(QWAH9t-=)qmxkL8crs@6xK;>9XX#2SDTYW(aYqhjE-$6za?7xWC%!(cT;2_^ zN>GVmKYn+_=b_=m!B{p#7}VmYt4VS}n(P^JrP3dyUp9Ga42f11UO>iQ6V0N5Gt2!u z{E7ljnc5yzxe}W4)k@dh=NF%USh*)zxd*4@OH!3DN|zK^YU77j%zIYNl@HB(lIA_j zg)8R$5Peyzs$0>VU6r#Y zYFbzJocOe4Mf1}7IBK7!Pg(ft%B~|1Q1}Eies|rAICqO1xN2|p-0DRx>2jophDw~V zduZ4JqpgRA-lU-yN;+)=8vE>Y6Ia>3+`Llp!kp~^9q7|mz>v&X^8`NA+x<8DA8JaI zn$m}wnxv*?Mbq=lM5^+-Laj+n>+*C$)A}pTQHk!x?8!bF_!$2dfd{`6#?L_GHlr~8 z$S1JErD$1-b4F$4a>t@_$Mta(uLE>A4)6_dwOklVZ0V`v1p19JeB)J6_5<7v?hW;; z+zsBX?H7x_bLLE+=V;ep_eoE0|9861_jdIU9vAsNOtTE~eGewIr~^&`exWN+!jhel znjsvBt*3C-G~^kF@)F`Mj4`jPSY`bGQEq5 zd^X^LR+noaYuF;w5uGlPZxu@zdO`(z`UiXQ#iL!_r+fO3O<#KzmspZ8S`u!PKxzov zuR;U`vDO$#gxI$rkd}UqX~^H|VnVGO0mysFoDO1EIgtY48-9P#MbrUhAk%XRa;tEZ z5MiZMpEGC9_Z+(bTJ*v|k7wZ3zCKT1Pv5k@bKjolKza)GlvT@q9!M-@m!g z4-890p;*Xby3d|D)88}b>FMt}dZs6nt^Ye`1~M^10|6;wd0%RO$wPUM^$xJiP8QqO z+wTF;uKxa>GXr8KuIm*=oNj;#;zW}K<^0*fvmkg!MOIrr)`=SI4 zM^D&yLVny(m##aIsO?z3Mh4cMpbd-#IP7n^|j(*=!M3S+pUuCl}}2>OSZ^n_)f!aZh$D-V$OStCd{oDMFp9{^KysCfbEfHb(MGYLgC% zn?f6egYe-%2xET2WFDZ!6}!RUJ&uuslGwAD{SM-KT!;xwYT$mFz&!wji)ZyUD|*+e 
zp>D;nbJbA0VyMRuv>kXyOIL#`soxbtOS);d9AAaI*MDVbShI45?b0_%_yZyHNAyx1 zhw<~}|K|Vv^Z392=gUXjhb1b{j;r2AVU`ZIE7a<}G$ha{ibf{?AE!Izh;qg6GEPlk zQ42I&5`$|b`mU9&e-E|W%)$~95QqvDZ7fFqfya=<N`&m? z#1fXsVp}CHQ_4XDE*d-@`h7uPX$RTb~F_UQMg#bAu70l?&4uh|p9Q7X&rjdaE zTHiN=a1hUE%kwsb1xIH?BItt#i!_&1=;xSR)6w zv7k#1ZfC(Aao{1fyf*jRT0INyl!FZ{*eD0xEZ8Imcd=kIS<8&hjegZaL8|Be zOpS2AQa68s|5AO5=dEeIk4DMA@g6n};1@T(B6x-X3{^SqN3poixUuc;~U$Y@#9k_TA8=3LV= zFRnUv&{6SNQPNz!#tPwRxz7CdYd2qeY^E$1GE+u8JlbMU6G z8hO6;D}#z}cx=-1)n67B^G+PKKJdN|#h5xWaV3JHhS9xZCR!Tur*JF~M#BN_U9w=GP(^2>q8T a17A*+*RPT9tCC%O%j4I01DIc0fBz3&pUh4G literal 0 HcmV?d00001 diff --git a/model_executor/layers/__pycache__/conv.cpython-312.pyc b/model_executor/layers/__pycache__/conv.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95803985cdba2ea91969082d5e056d2dd1eb1ca0 GIT binary patch literal 10853 zcmeHNTWlLwdOkB8-iIQo`-Qegv1OaKOj+_Kv6DC(Nsi-OD(B*?vs$MF%^6vgDN>mk zS~i#3dbci?>TYW4p%9JjB61NRGMr+SzQjcznihEo&=)asOLpL(YLgc58y#tpU|-t* zKf@s@dYo+1ycF%&I_E$4Gygf~Ki~fi|Iy=d5qRRNW!9AgeX;>W>2K1!(wcD z>;|LpGZ8s5t%ODO(u}0JV~KEdGLlG0aYf^kDYcezpO<9F4lA*=6w+*pD#t`gv(H3C zF_t)|ImB2zqQ;U5&2~;sre+k4I~$8YCzT4}>7*#hPIxoTCQE8cPDJU}hYZ|aph<#T zabpF-KW7$+3JLw0cW20(Rzi%e#*mEba;pkC`ZHx56e9y-qwbtb#UbgFv8cv~(swpznY{<13 zG;Xrj&+jF$m*qVSQFj=1p=P@w*J8tX@L1mqJA#;+E(Uvq+4zi>8Ajb{$WyxvsTM;& zCbKn0!yN&=wHWo)pdn4|Go)Hvw?q4DxrPLqXEOFQ?89?x#`2fUU)e-9co)~Z7ULw;jcV9RatZZ%s8-*QL{6-pAO>m$tz&ob5NlRy zMS_fsOD6jv2zF~$B{d_-nj;*JC1PqgtXZj%*8JxrH7rUKkyIRFYJOZXC6-XsNFpj} zmRLfSaihw(2W8xPGVVATH=2z5OZH-b?Tk+X3r9FHZWh^%qb5p`1I zX5^%(xnhYKuxMz&G&YgYoG+s3lT?_TRhphrFKM<}DRypBr8C!>W$9c@fzmKFky!5r zuW^bLpZI;Igl|vhr5inGI83c?IIQ4X2({<(?~V_s(=!7zv6%yEL#k zN(1rO*@2l$>SQu8a3LO_9zY`+4@(!NXbO}#KuxbZ1 z-hoy6@-o>B5NFf8z2NA)?dV>2cdWX1-02aDJwt__p|zgxWnb8I5MOtG|J%xK@BXrd zG<9Xi@~7T9bM4H>d~n?#DBFp%b<;&chi}@Rd1RFz z$TB5|Gdq_*RooUTYzwUh2A1Vj&rrcJq?cs!$BKdeLZE-O=fLvItDb`e$3eYhHm5HB zc;Uyn>7}7n&+dX_x1N1b-_N<9EJ5wP1;<`JJC!@N_``)CE{Usc`&L``uX_3mj(#Ze 
z`mzprKgF6%oQF4&>uKdGGElbg*&THQ?Q?Z#}LG`Hna`B#>9=3iU*qk?~T*@M|lq@_J?%}19G?bWz=WVXII}TzQqT)ScX5Yn%taZYTD0B-qH&@|bRXm{k_ERfy}4bQ+hbGA8qhRfIx?E#La zsBg|X$3dS}BwlL*IM9}{87{PG4GSP{yHNv;_*50O*Y(4REcmZhAPHU^ zceAoi56~Lwj5X?@cIicQ7aPe(A-IbhsAfd(q9=Oyzu>vy(JbIr1LB>G0OUoTYPM8j zA{iIKQI_QClnNfREa}C)Sj-&PY&ukx`!LO2h)J_L*rk_}gI-GVleF%L#*Avr35_|e zvCj{}-;jd)*?^Mzh2;%TB<0zNEQY63aX`Q6p1KngYAF0eXs&z#0bE>O-__K7>KC*5 z-apN(I(o91Qd|4t>4nqtt}K%s%tmtllE1C!?=ARy=WSUwI|40oBlEL)Cjas^JJeuG zyyxnP`4hR9@@#%&jo(q^cNh5GOK&Vcw0w4ruT*$;{@Gk4@6QkC-&pEhdTEW{Q{)E< z{J;vka_s$+@10!Zk7b!VdQZ?{iT5GmD)NB>A26=a74CiSd)_tvDJ+E!H|(4TW(l_3 z({%Ot{PEl%9RjY^1BJkWH9l12A1UyUth{ln_tr}vFl+os{k6EoK;S{524WqAZ(<-+ zY(*b4Gk|Wl8GeFX6TAeuCin<)O^zw#R-J8Q2I!`m9o2W)n-zmo+18dZ`6+00gERB0 zf=>BqD3qVW;4utvi&E#vtojWdK+(Or1wwhMn=T6GI^AWuAF*f}y5IN#1ZKPOzAaH% z`%+8C;^PaCms&d)Us!knz`)|`3$Ir)H@yzJFPe$ZU%^0I78{q0E_N1#Z?f2+-uhbw zTH_=8As?+l!;`7{^snH;nArUK8bLPwaG^_%l2MQ1p6 z!z~|#a(NJgAq);-KuHNiwVLgelu(j#oqETy!Wjr|5FN~+6Coq9OoP`zP%c1lh5Q@W zyWZrz{7jkiSp7?F=mwVwh%0^X_rKS_fns^P8=zqdo#!$EamD&R{~o`AV!7L94K7*V z;ji;$0^&;B`+@fY8z`22thFyMy%oO}FB1@#Q@_dF$ZVphPYGueg}>@!T(29qAgi7n zM`T$e#KmR=6hMb_KwH;Q;-Ul6K!b}wdjtD#6?w^qw6>WA5?M-TH;HV8&h`SG?F2g8 zZbk?KF>aba!Bwbme%wvzZyv7b7R^5zNhy(d-3Ctoy9;l}VIfLOgbR|a07oq(Cxk>I zq*oMXCnZ@Dq)2p9h$o}elY!GEsFRW~bWDINRE}K~W|OJ7NIRR1T#$rBQiw#OlA;J| zvf4@!pzD~b2r{xrLQ)pQOt2@vwUZoGl@u)Xhb@7?oyJq2jy3 z)M79zWfBJ$&Puc^MN+ZNSTdof$7H-g9_-xKShB5@ln#!Kdj?@J-Y(P}{saUh;WO5s zy45N`7OQ3q;4pX`X=n>1Wwks5fU}ZUHIAYMXzMCLPpN#E1>A-}dQyO911RgrIDv}f zZdT?}p%l|chbs!Js-vn10hIunMMrdJE)QRS4CS6ub4%9#nZrYQx)R?};yY>BT;f|w zd~1mZ!H1b$edRXd?D{QnJ6-w#N3^)byrRXT3ws3w`gNtcXlC~q1uGm>ZFs;eSC_I@ z1pWmx!2z_WEIRlLRY)@r-xuBlbY3&vHQ<5O;j5hm)&Pi!=O87Y!$8D9!T?u6vsY|( z&{BiKI$eNoBnX%F#h8QyK#Thpqtg1Xp;$ri4r>AICcwm}=bz5?=64ml`wPI+h{e!w zAvFBK;0KY_t`oQUXAy(?=bxEBnIB%`x7WkPptXkm=_WfR4?xF3kLhI*fFk)33?9Xx zfebSZU?v7X$AW)=;L9uI9&5EafI#T%D#8C6|Iur8Rrn8(DF1;u zI{$$Z@*ia-6$c;QTCvO#75bRlFntIq;nZ4>0Aa%on#A_fmu42fzoltX7#VT 
zeOI5Df1+#!8HRUxuLkA=U)Uja53Z*Q$AK@Ua~y~MpW^^s02kB1aUA+8j^mIRl82!g z1>5(t9B4@Cc|Qw>>>J@)bCv@zt-*4@7esTO1IC-P&pGCtVAd(a0p7QRG9I>!6JuaO zzKZjxV>HYVGLYGhjKlEyO>1y)jZ}Zo@IY1se0%(=FWSIv*+s|AimftbM&L`PnafpY z>HhaJBe;(3uumGEn-%}XlpC>9y@CFWS`};V8k0w$AIgsC`|>GF!lLUu^Ig0)?&2*Z zk795Pf}pF;p8o;nB8#pur{t5EcoBn_FnAfW+2;=_xT_J(>5p#q(Xr7Pj>>syWH|mF zDk$BIxlaiO^ey`}`@F|sIC8__Ko;Bj3T=HrdjQ9gW8ogo$qRe(gG=nSN0)||o3B5U zjr<;LN0C2V;1A#O-x~RVUE@b}cBFy)SZ0<7*Z2cP{^0`u@XFa+yZ&zAZwA)*=e{cf zWH_%DcN{G2I0)=Zkv~%4kLUylUUO^wiTYb@3tsp?M}pAZ`zmaeEu2RTGQOJgxX+!Y zE~!p&+_xc-;#kElU&8=D$ZSzM8f7rK51qC_q)N5bphnH{g#Lx}NjVKw;D#Z} zFCi#f7>2oLBTVO~#QiDZKOt@ZN_MQ09sfawJ|Tmfw&M)r&z;yL7?)pW3EOhHv&b|T znC9H!H72mh?q~XP2Q~;q8y3QDD`Upy8D^02=iQqGqVgpCh_kWPLA@13wADQpB&y;6 E23I47`~Uy| literal 0 HcmV?d00001 diff --git a/model_executor/layers/__pycache__/kda.cpython-312.pyc b/model_executor/layers/__pycache__/kda.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..10233ae144913582a30defd7cb028cc6f9fb0ce4 GIT binary patch literal 16543 zcmd5jTWlNGl{0(}-w#oINR&*yMcbk+zwOvc96Nr*k=$5Go8>xSY0XF`#fLPc6kDZI zx9e^xq=lW`E-a&4Sozu5wSg!fi=@~Eng9jjML)`kydW`aBVIK7kw0bSAjN)k_nbR3 ze3^>vCdKxOy7%03?>+Z@@44rk`K`rbq`)OKZ^RM@DC+kZQJh@{)_$p?s4Empv2=n8 zW2j2dNmW=yBdktPNp)CFU`;}k)P}VL)+TgGeOOOmU4ls(!iJ?N={;Y+rJTL^4P zv?l#wKfqSjmS{`1huf1I!W)u-a3I+c?jU%3qBGeQ?n-utyOTZP9)fcudXs(OJ_0)v z{mEcBnA{lNNa9V2P07vS&B=lA0ExR2q2!kE7MfB~oQvCfla@v)Jjgb4RGd`?uM)EE zb?}~b@ZNRszIE^|>)>0t?W~`r$L(y}MNN3eUvT@{p?y1HJX_ce+zz&7g64MK3f$C4 zV~8VtlwvzBQfw!ujwdQ3t3X_zk~h5jJRR)%3=>9zv}oYDD9=Yz6C6ZTsTBD3gV`*X z%Er>EXQN}2T#5ypql_EovQajgjRw`C=}>g+EO#iK8jnqgrr|WpCFF=_g3Ct6W_TXx zL}n9-WMquwf94jA*jAo(G;q3V-?oc!_Hj{{E z(|mQ(=-Fs>5|p!#MQ1Y61PJ{Y=ePwGrRZhncU57NyN@< znL3|6n@(-P-MB^CaNIdqX)yU)q&97VVcE$ z+qcsD)P<8JQ%ljbSukzB9u`b{E*vgtm>&=SV7TP&l7A*k$>IQb)k0~VUr>6jZVgCK zHPeO&GHrO2UBO|cM{ZC-{bLMD2B%-{Jii5C(OJ{-$ar*;<99#;ww&a?K&?6{t%)B5 z1nC&;QGI$!T2%;)&!omgb8?0()d-AYMl>LqNGv&(7=uz}=a8YbPFPbvhNg2=m}aT4 
zic_;Rrx{nVs*4(B?S-{~)3It?X^^6U!(0y-hBL%nWr}6s>tn|?a!p#O$wX>mI5Vpo zS0NtieI#6_pa%*T1Eu`Ib!mXoM))^zR+gEANnp*KO@=I-U52cjgSBxQmS&l8d!8Dn zS^Gsx*vUGecep|79P3;M-?R=M=`ySfTIYgmyD?4oI(W}Ic<(xR-#Yk~b@13WE87Zf zYtB>W=&*Z~3bu*P6R~9MFqg@5v!EtRC_ zX&O#N=#zSc$1HH}H&x$)3frI(yYRFL%?^gj$4Tgn#8YHK6Ksl{?))=ILPh#Q;0)3X z$UU;-bxtk;x!_z^gMQU#dIczCOyFIh?lC9s+d2zOC+TBN_wdLe-TMsySKz3RLs;*A zUdI+zgE$Xrl5 z+u^E)yEhO`v4Jt<(qslKkObBxaE1#2n-kQ-Nk=#@DYIy<&M6wd9Zk$|&+&Yk=i8tf z(T1lhxx^zZt}sz61rBLdNrefDhj&uVy0qa0RTZ7%}I1LOU zY%B?^C(=OzUo}C5(jz8_*(46un^%D=VdJ$q2mN3*ll?H6s^=*KHM@&D?cll5x{`i`pgTc-34 z(RbF-?^0U$4bg9^qu;I2hxx)Iyw1g+uR}mI{z3!X$%k>QOPSxtryc24=s=kx3S0@l zvGzPB?V9!+tF%LbzOnW^ChbalzRp>|x>y&CZ$RlqC1hP+y$4|T%V&3GZX<9nNv>x1 ze@nqu!kXRRtWc=g{a^3&b=U2Fe7g}>vwOb!^!3!y*jT24oGx)Nc8zH0Snn<0O_@J| zQ*pDBKl|4Tl*6;P!3|kHzgp^Q$LAw)*@yxu+)n(v3Z@dyn{xV`DSoDcQJ`6hw^ngE zt$3RKLZ!^X4}nu@TrQ`_V;qD&{*k%1>bFSwodS|W6c@8R&Nz<;0eNzY?#$UVPng6S z&iKfjJ|u}7L2#-$6N_f}m$3|vE+k2!M*t+D9;1M2I2|t1!@|fh^fKqq8d*jjxld6i4;!P*l*lNfE;7oXi#ln7MGfjR zM8ixfGd;s`Fp=7Ws3{N`WhM}k+k~u~s7sS~2;VA4P`u6KgF$pkVqP*a@>s!A1!-bz zOcD4ZnkXeKlE{M$AQ^YEQlA4d(hPhXAsks4G0<>;DAI^8a43#absJU+S_zP|OHaDj zF{dFC8B0VnnMfpqPXk#xN`ebSM8vazo@;IJ{8rXR5;A0t!`S};-gR!uU|rA)jQ0-H za@X_JiswLH^Aq!3&(0OkZiGs$y?M4}=4LwWj+o`uw%j|oJcM~fc4q@f~`z`I)2Cfdkqce0PB(&_w+kkqZd%xhr zR$D)HZzQ4tzhL&4%&jGdyX5OBx&4=4S^Q3+spo-7=P_c%53H1D(>I;dPWggWH0_s$ z7lx}_@K=SVK4^gtTVQzrxd$LO$yD3Y+I&{Z+j1rMX6~-pap}auiGS2Bet%i>p7~w# ziub^8UjFFS4`01w9tDwy7s;9*FqFZ!=(^^=>c4Kgvv=h7^vd2B3w@{VFr&3&W4|=A zFjC#R+VLT+^R!-h{ms`)-ag28*Vlfn?P{Aemh~-!o_zCpSKU-k@a^HZhKt>Mgzi1X z?gK*ifnxVjq5J4z8)t067o!u|oesN{@$wJS!?l3QY>g|90 z^d8rR(ncB*~nO53ht+i{`oc!6mtnH+`YzGCx` z06)_ZZt)E}m&cc1DYOl(IG-+htL!Us&t#z(GU%V{Jsbnm(T6j{|IVBC$l2qJQmk3e7mIWl0)cRT# z22Ad|p~9l&3_t;yjyea}M$Bi!XYnnSBB%tR6%)tHu+CM@50f)IA_ukz<;-;suw{W% zCFv18HD=3Cb9j2I8~5s#Mq#JtP1!DmcD!4GDj};yp81zDf7uT`eY;261a}EYlm;*DXq;f$&D8o^veHNO)$Iy$T$Ac$o z36scw7ZC~cCeibtcOE^$$4(=34!!T8H-{dcC434zQ46Hg8GZs0*cSc`#H!Q4g=BI0OeAmWund$VjHm{ 
zc!-meRbzVDCSb0m0XHY7{SoM9ehx<`C{fImyQ|pTCp7n6r&pRc0v`r918@zCmKAgF z(zIX>=IO^n-3Qj}dkf!NJg{O5Uhe?jtZ3UN*tV_Mb`)*<1>61=+fZHuy8lbBF1)(9 z#fd#)Paek0=mx4o1uM;t}uP0D~z<-k^>8koVkUX z2b&%l0^1%L0vjJ00$U#$(&j9ufi0Q0&fDhg^NyS~j)dwQ>cy3@o44j{kIZAs*&mt5 z4!y&dah;3%>ktrabU{8r>GD-sq0Fk4Bm8U9DNEtYTFP{9dw_#jvDQc1`HO+KdrzFnLs|u+|u8a+4V=(*IZSfDu?siXvH+H&-N$O z=dP+xopY_To*LI@{}bx-q<)Yy!8zCPbcHk0Tcxd}VFaIS#JwzY%b>8C^UYPXR5{xd zj%F<_cNHyl&i;fWRZGj0MZ-bjh>e=@B9m**xpSUzwX%Ls!(BVyJnx?OeC1Af3f7mw zMv;5Q2Tu-1&WEyX2u+BKH(=AtkGz(OzD24Xtxa<+EW5^ip*($# z2y-RVHA&JcGW!!?jmJiZ<^~!t=!!Tvw4X@KGeZDK*%zUi8Ep6;E>J&el!YhKw@ZUi)|M%$q3_OvZAkPTVg5v$k5Ibds#CR-K9(E7doJXc7BZ+Oq zlqQyv4U0tM*i0nDO()=S@=yi*KVemP4e)QHcLTj&f+spMnWSvlB~MAvFBv^sh}E7l ziJ&OS#8NQ3sWA=&?y<3~^kl+*^1nbz-V{o*r?|05{Yq#nr;wGS%)R{e09aSUs|xfX zGtfu*am7r6kRO+qXq}DmB!x>&fDxr=$bz0D1F|QM&tTQFShZ$0#=XY>1mjusW)Nvc zvc$+Jk;w2UgCIH`K7fhpWONEf3?%uniJA0FhDWIhkG2$|3CpqE7+BV2f^CwFA1{t| znR|W!Q*WV%1Hr$8-ZFacf+yOb792NFc43b)V5V2u2GcdM$(q=#XsdimKt@iqO8OA_ z1BZ-gm9!$2xTG6V;`Pg{N$M7=AZC?W5E*t)-}kC8Y(*LK-VA>m~SM+;0(O&(bXVJr3@5#ZNyG;h1_?J4x`DfI0r zc=y69@T+1LTfyF0aC8w{pxi2TZZEYzRSN7W`FbBT86DOy zU6jqc5JSQL8|l?%%3=kbnWN+Xi@@jDf>kfo^$cCI4_+Q9ItK;k;Hp~HWi6ODfcnAF zLz?Dq)V;1?snpe1>fc;y-A*R!bG_PPE!lfZ_U_ddYUmW5KUwha{n)$@Bt4F9r6|yl zN)Jdy3p+-alf_*pg*gL?RLayNJI5EcxEx8R=Z!hgs1Z#e{!> zL--FDsgl;Rs-m@>i--Q3f?!Qe>8u2?s#9ybmbwdldxg$@YZOFQje4!|zQLNGA@aoM zuAR7g;(GHttv6bix%VdDoh;Z63WjIbw2*7nsg<){9$oCZ{K~@jm$XY`*S9a7eap7o zyqtOe(DJ!=H+;}@TYY=UG(UT~nL=7Zrq%2Of!Cp0ysF||4E+rNcs}>Rkov~EbMxu78=t~EJ zk{l9eGq^_*ee1Yb3gp}prwf-W1?&`l&{6)HgGU})4AdekpLj&izybx%j7JfpXd>Bw zcL4K+ss)J8anVlFSopk>kDZx;ZyJd;2{BF~wko2FWT^WJjz>|b#BEB%rSCg=RG5ha zq=aNtq9TNU6dRx)Mhm)uqCdUxj z=KOX<<5O1p-iX;Rol!&l?*RwfpURAbzpAEbdX1rI%RS0}k8=GNwdo!;^a*ubppJj3 z($K2UDe%5DQuK55J*w*-b@Cpy>mIfL9(DLjjh0reQQ&=PRx*A88GApWMn9oW+@tpV zr@?jkl@&w#1>LI7N$W~YUb2VO0NA!Mg?#x(~*8x6FnToJ(KiU6*v2xFCsFjlDuW0i_9CRK!xR1rX*dnvkw n=;eJSX9w_sC2Qad0}RFkds0gq7oFF9SA9#zzo0Ni=HdSU6NU}U literal 0 HcmV?d00001 diff --git 
a/model_executor/layers/__pycache__/layernorm.cpython-312.pyc b/model_executor/layers/__pycache__/layernorm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba3eaaad36cb7e36a0509f8a58981ac237c64dd1 GIT binary patch literal 24933 zcmeHvd2n3EnctiH%{{;j4&opl2oMBF93m)*BuI$j^1h&hl&yti&kivU!~lbXZw4d? z4Q6BAeQc2cZn@VN~Iv7v56>RJ+ms6Fhp+T)3 z{jtgK>vzn72T-!KRr(`MqNlrGcfUTq_ubup<90hZgsA!Jk-s|1alfRD8oWwk^*s~E z&2eF_mkS%h#;Bp!zZ^0VVSb%*yI2x-=s)Y{!_GI&y`^?8+_G-JF8#I} zzR%6x>DOOgKH76D zs;0tzvAo|J4q#lB;j(q!Q@&1l#X9Ac>y&T$1S?mC*=#a$qGRxBb?mv{aP_+eIcMaK zap9Ubxp1xMcoz^$;a&NM>#bh5{pN!9b!ac>w69sWeSJauEok4M)4q1y_KgMYx1xQM zPW#PlcH8i@Su_nkr%WsFN1bu6aK+fNW3uMZcq~3TER2jw!=Y$oG8B)DjtJ4v6;X<{ z|AyN2v=}lTX|!)R7>dM2DL6V7LkTapdtt1_n5mS8V?pX5cs>;G8wf^5E`_8>Xe6Gg zxD<^J2P3iK2k5}pH!exyNIWIAfT|STBo_3j^_tPwt>k42om1NOW{0W0l1C zxHQtI>=&Dm6*1*kzl8{Rhj0!zY`6+s6SuG$h}4~x+;N*)mf#YGgi&3Ivs~P+J{e?K zPR+@`2~J)4tLB7h&|UbHatX82qgj4tK*?F|j;Tj7q5zY12K(4Dv&FMz%(N_$2hSv$ zgG7VPSmL8n-$2G84v)pJ1f!85F~h41+hUP;sz~FMXowjTwgJ(2JVuQL3Ll>8h!2l- zj77%U;vwmR7;l>x-@SLw&~C9K8adxFb|pSAI?_R}?id~oi_xGsA@+^qtsU}q%RjV_ z?PFJ@QhF6d^cOpg=o zmmIa%4yDbuub#esI?X%Nygki((tPD(yTxM7x;U5j+R0V)$64Rj$ORUT{X}XY9|p0U`vSZ74Kqc|9DIc2Sed-P}@AKym=K=UE4valCpk8F@gcM zgMw5>g;i?QKu`M+Vd*{jOP_YRXZY#U^G(SuovS9J&#~fkO-)Si&svPW$}DHJS7xo8 zzkIp0IaS)sHozzEL7FdLbz1$_>!+?Ayf%^ba4z@Mz|_8^xm+s8dot$pkx;Bp0b>;8 zt$sf;d8=>EU;ROvLQ%;3Y%B@0?n+zKwN_UeAC5(DeSPR#DnXR7kBvsJFl5uBEGt{l zYN|jV23t`nfYh30t)<5lbsZNng;i=qI4Hx|^t43_)Coazn+FAlZvQZuF@wNSL3q8${qQbedL!-E3JE2XA}Vg!0JI#c#lBBqi> zUqB-E0wM*WJ@f{ay>%&X-GXh&+kEYKah|Bi3)f#r^S+tp`C|*8UmUo{zrXFV$%rq} z%DDnFuO^+fNpmgRHTH%+bgl&zj90-GabLx5h#W)`uGbhg^qRo-`oc!h95&(4ELy^r zu(h9`;`$9?+nerQD;Rv+AT3K#086c3?pUYXxlXx7T&E}f`ig+y2)n@MIzF?bb7OSQ z&+OvM z;&@0B1$vrelEJhJFO7@Rm5x*56(KH;#74n0J6O|0qr>IqJdUJvcYRGV`MJd9=2tM_d#Y}%;zZ*zZPepiuL zzXEAx!*OVIKR41fzgt#JZE7QzFvt7DzSM}F0wus)$D{v-C}NZ1o5rB6q~%u*=RTW$UQ zSI3ob=(f>?Ewv3XGcAsu-~3;M$>!HIH1P%D$i$e~7Z<~V!pcFX5sM4q$gt2YeCTKu 
z&X3213&8Iiwi?QBHTg#y_8%Hi-!&u*gf59fEHo?%OeDy??L)J0DJJ}eVB*{zW5$H> zWNbsCIEF-v8`yIo1oTNbGvoMNC^{}4k)%;c#-GGAXYA)7Vz9!D5u$q@FTSfYV;UR1 zoG}lBVr486Q5YHLIwcZz^fAC}gn zOS|qpy}a*CYTub;S5KBRykMwKS8uu1eY1P9G*#W2tZK`e3?&__CX?R-?%wUm^8Ylw z9lLer=9&A>mWQR)S);+Pj$*!JF|c^y`@?sJ7lSmC=kiA4;5=nFdZ&B8W4YzN>0WX- zO&QbfKvBtxuXN5jYn{3HmVM4M>sjbrjL&$Md{0kV((67!o3~$?du{f$g|fxKa?5im z{I7p*sq8??cYw9q;OUC5Y@Sc~1gfW|R=j~56Vns(Ha1Rg)1u}3{2l&o#gA%#P`lK8 zAn84jEI^mG z9%(;HCwJwevqP^OJ}5x;9_ZQ-b4uMGqh%rXJd*#qHP|^Hd&WoCN1zg^v>}!r!jH^E z9;OsyFqu~22xXbTDf64_;y#VhRFn@c6>s&8heMO)Yw;PCzk}Xm|A^>G*zCyr;pL;R zrjEXvJo3lDX1$*boBiRveSB>8nCE=eX_ue<52RiA?^|}&r(E^W9Qqtt&T4n8MG>Sb zWrD@mkSJ7pfOn#&MFHRn5kik6RaLcDiR;p9X=FgI!}Dq_!E4EijyxSa9~l!Tm;&CM z@sJk0J*bvQpTpo}EfoVAeqErE)r+LTkX}TzR@aens$euRHa=Ol7K%{(AE~9L1+nj1 z@%iUmvo3YP)z!9{OB_>{tb;2FZ1C8{`O0R*sYH^mmUl&w8SsugzqL){-*TS?mOo;0=!Ev7s%kQV-q zfh~NDrl5S44sjwbg@TeeCQSx%tC=hP@2DFiq;QQ}@iZ;Om$&UrZQGk{>cUsIuQpv< zzg*jqs_jVDJcWXt{?;kGw?myjquEj?( z>gIETpqNZhdUY3G8sS!oP_=~-WC%=D_dt|m+ z_-r{>UiHZCur#hzRAnvrg=Va>I%}g8&w2b=dIl(Rcs_7asIj%3y;ZMVw^Y_y(n|FE z;A~(~bxXZ&tkXSnegKkboO!zFNS@UT6s`UA&Qiaq(X$Hj;J7?<}mwNm5+;`>aE zs0<@2!P=KXqgxJ*uAgai10YP*8Jj{dCT}rSE(yj^yfRfT8J09vZjsVfm|z-M=wByt z`qzn^{#CdXe?T29xSuuK?Y0a$-E#onN5XVDO5PKyjC4iZU_ z)T%7xwbHSzfGM=TXHZpUXh+sG7WAbWsWGnNW_@)SH%Q35D5%P#{Ql#l(Pd$f43$Me z#aq(8I2n*x7$i$(QBdn$wZ-ezM_{eMjJ}uWfe<81HVb*R%xh|CBfr93AvOza0ZBMV zr6$DZTKp0;&790^L@7laRHj_>oMCo>(MzHvMZ%)Yj+DSDBp4R^L*r5WlCfGEp+@G> z5i!PG6Een;k&OMtkOamD&OQ?D8;qc_AQ@z33vVgq6w9nQ5;`9h!^}9VI0?~*1)KOt zU#5Z?akac=4sNl_>@^!Sfx1LY4I2rYV8*4`T4i&3ne28{{7S|l5qKzSDIwop;8Q+) z&06rkP%-v5h{#&t%eNL(H<7iVGEG*32Yl^Hpkd*S`+?5s!}q*d6K8in;2Vgcs=Z&a z_j{IQVRuT{y(H|Na;2*pr_A5*J}j@E8=M_nsJ?spLHR+HIMYt=jRRjh@UW`pR_o2y zh4?#f+TB*uk|PHKgkr7WXF)y$ZW$E6RWFYR-B&SCh=Kkmf<&ddH{~xu9$- zEsVq*PA($Vr+t3NP?||<4}m10%;5&{TO8tw)|%7h{nZcXD%>@$m&4F(A^kFPx_!a< zMtK9Y@VXLTnE5^?j7&MQIp>W93yG2BjE!0qsbZ|xAXQ$qjWj|TVjaRPq>{Y;ED0;? 
z54{#xSIBqF@TV{f%W=Z`r^bF0VFYOHZG#%3%dJw_s?JGIXB{?%ZSV0a8oLVA8>ba# zK~Fi8;AN*IwNIu)fAVU=%j`P2(8xM>?7}rjapb{~urR&2{ljNpymF4b%GzWttq>W8 z7%~j6JK4=oyf_}Y6pA8Glu8j?n_Joy1yQx8rO6{hRP($jOuA*IcK_)3NVrRwY}}x3 zC{832+IQ&ymf)Q9JYuMDj^c;mCFvNY4pGFEIman=f})d%fG#u`ml)1S(4#^K&sZjc zzQqP=c3~Le?oO|5oVFOOiFa(1zmjU8%s+Q%6=H z!|k2jJO9#K`D_Zlf8&zBYl>fSx!<zPGSYDTl70K-u}pQAiQU4o3FAORuixpC@oDI@sc$*!&UQL0`u;3SRGm++Dv z!0dJWGHfeDBGWh$36+VbIUIuLC1G4RIx(9IgIvPH1&&Tk5wI5gL$I?RMJKpblrw-pZHGOK?Rde4}bN8I8xZ9R;ZCmy7 zq`2c1H3xG&EN~0ZXu^khFWWNw7KiKbF&oU^Xr%%vo>g2&K4xKV+yGBmv(%Qef>Rx| z*4ae8E!5vY&=Ex&2(Pq!wf86NSJfKA@o)>K1P?dbL0MTtyr7I~{u!l+#bSCwcz+h| zFI&@$i}`&juB=50W~TVjbXXeBA+7QW+VU&t8(R+lSl_aw5?d34DS_T$*vub8%qeZ@Wu?o$B?Q^F*5 zVr%tOx}O4gysO|1B7k~CnPHqy1b`a>3hiDopiH2w#as+1{~V<;8Y?lbvN2WD84pCp zCjBb!sw$FE*h>IH5C{Nqd>jB_0MwE4cdQF0IUoT^3sf%q6>w|({*F64mbZ7Mws+k- zyuAOV)c%*2{4Xb6FK+-UHJ=z%m=8bB3y}|h!g&2z_}ddXDGCZUp|7zh=plK4gPuW5 z0T<@xWF&qG@6rUbT(=BtKG~ZToC*B$3i<^e~A-GA+HUIRjeK+?lSMNww?^v#WDpmc|-IAs1UCFAa zldfmLeew3GOEcl+@*S!29m%rRq`NJ}x1rqYo8sA5h(Uc3o%NjSX)*Qm^vLAP*z;?Y zA!HA$+eNx*d%C(V-Prud zz1iYfDXYv{@C%t6rVAUTc&@5;K7O+*Yo{C--c4C2rO5JiV_xl^;J zvUbYB?p98AxA}O=*N}BmkxQ|JECf=eTeEH|@^JNy3*p<2EcKuD!8UTEX1ZqHGB4hW z-i$6D`TprUr<0raq$>BO?0bJ(f=a6<8kv!*|A~QXX?tYz!epY1%z`vVB->x)*U)1O zZNrXN?Nq55y+M!h2$|!CydIB}g{P=Xl=0lA%~@6#bKSUmC-unse`NDm&gMF&OmXL| z?1=+rwVK%c%k$=sba8)F^u+F{(o-f&30ZUS!#)oNbQ#YDad;ROMCFVc0lyYhy0Gd3 zRIWAbgguIbxFu{3Tl%fcOk*YIaM1>{4zH(Ngeuws7jgY(j&;f{%vfaBe=4riO22lV z$;CmZTi;bC+v|kd-uoFDnH5g#mR{FqYD8{~$jyug9#8^b#&wL~MztXP8zhPjNrFEU zSRKWDaPIOb{0sY`_ZLU{#F)@8;anxz2SVqeh82{aavtG3+hJeWE;R3i1Nh|@_$kKX zVkj((_Os&4vK@h##Ljl1nQC5cX^)RK=d^rP94d(XVH@AkgM zHg7m@lX@_3nXl9oxgz>L7dX{xTf=^*Y_pN8*5*gVWu8Ekknoj z)UO@9>;_MX%Nj9b%}tb|^J7<`Y2By=)zRa`!`iOuYDtY@(|cxBH$Xnvd3sP+(a@1i zz(bryq(TVOgW}Eb9OpI~9ko&G4r>0MMn|gFrp3wNK8|C4bRscMkHoxvbq2iRe4Wl zzy6#6^53HtICmdcUNQIi+0Rco6y>9p3vB*g**mqjYw!EpVWuc4owLo_-m=dhztwZI zXQ`(3zOQwqtbD$5_Vug{Dohag%1zMP$@-$Np8yl-56nF?`^Zp~^5&lMFOYT}<-37k4fWEVd!B 
zfy79R<+>_DT?L`8dQg|QCn+(f6lgK9eFn=+OP^!rWu4BODAQ5V>2Hv%r~6m5K|ACP z0xPQM=@y;?z!b>PNtqY`LlyRn^^L`n&e+yyAm2+_y@KUgyrP$QDq(V zi>DKydfy0J1#mYM@=v-a*m-$Xr4ic^$8_YNi@iBL$y)uY@-aC{S~*PqQN{$R3tm;Q zD)!;fR0QV4qv!D9F)Yt3MN-D4J&%lBU{XlN25V+09+xsUwHDjidCWpuyPqVpOm5Qs z;7A4c6WYx|)M4n~RdVCG>E~oM+yiH`jvDS)-tw9F{1+Erc>p#2mTm8}-EL!|aRY<} zs8kgD9X^@*tz^loO?&;aB6nfu!oZTZBguEjJSrWDlm$2M#=@Fx>AgYK89rK9Fw+&n zzD4aQnxcvBN5q6M3xu!$Igr_Ga3CYu3%?jO$zGHK-|hLpPN`BRwNzO4z?+=JqO1vN zCPd*EEwE%HTG=S4x?{-#jGc;}AYuI0iAsuOMAi2&!N6+yC#WRKR8cQK+2x06cZ@KO`u2 zP{k}-Y!8hOz*LC?jANrxTsgYgCY+7KHytNkh=DJZ035`ew+Y9j(ebf$pT1ByB6*l+ zBd1>xLVY+n!wj*kqn;d}o^f2DKH1UEjE{LxoEO0!h(Us}j5RD?iu8$$H_R2Km#7wckyoZRdmUG-78Pke_N&*es6;q2j^fsn)0*6l{*+9UBZceKvW!nJR-IV;ZYfs2&`o=5kq|g0;^xpgZyGS#z`~4 zxm=0Ciy+Dtew;+kua}?Bji8&}OkYN!CqQ0>yhT~Jzvf80*iI`YiQ8hr=t%U6uu~pU zOW`1jA(<_}IcXdsG7is8(m=+f(M#}Gf=dAop9$UUgKM5>f%J=efX16AbH}9fXOv50 zMURG}u_yMJ>#(p#`P1qW4zo{;oE`ck5g#4S#wb2fG{b_WU~Woi0o$)YK2;0~a@DoE z&=r3dkSmK6bYV$Piz`pSWdeG!l*=vyV$5o$R1z;lpdyfhTHn$F^>-Ul#)k`Z;M}D> zimmms(kkbMk@55c3yn%wfycP;aEDZXyOx(H9KHpRgzEj+~> zv(ogqHpSQeoUfDFXuMYoCZy>tFD$@?KyY_0+q}kkQ^{ z%yjG>Cnw=h7nFsJ*HD~yqCxsg(tFtKj)ie9)z?F8N(zy>aVZZIR|t|3Ww{BV8&6qIK(P(1r4no3%cT!W=>&3HhFGn ziW{{Z#`<}9X=r#mSN^e71UM3(Gwth{?bx9q;(v?68npo&5HL<1{)o-NGBQE;eMB9@ z&a1eH&`39<;NB1YnZcDySs&&O_Q~gH`?TiB#%&I3hs-yc@%j)TRpMCr$AJI~-O|+J z2z?oFs-d?Yurg3{uESb^l&;~QWFl97xCa3gb(&tIkV1DW{ak7e$5!^^8{Y!ega(s?ZlSI6kbPZ94 zpJ3N^!CN>$WmGd`n+P&qW9>OQ*+r_l08Oe2GMS10KZGRz5xs!yG;+Adta~A_T>tF- z`e$Lbe*56u(b=Q(JKs7n>z+DD;;HY(f$0ND=avy6Kj!!K@P?&|DS@`{_ex>V6M zbsTuBGaIsfvAgR$3+d*x)|3=|i{&&)xEO<#h{&WW(Nj z=YAUeaqvO;nWf6afGYI}aE{$Q%&;MDP7lvcwNuWA2MY4`i)zjFRNXR7qn)FB!+rZs^p{A)|=M(i{G}-c1#^w@s>_0M=`ap7wbXziSny6i1LnN!leTE0vHxpLWctC<38rL zMLSs(P}@;dge*#RqyqT9E6QkQVplIkbf_-f{UgxIF+gEF&;mMQ5ZGeNbh3k*h1V6f zE4NejM?>v^IXTN%%L_Q#5te>P?P(Dtb}L*_pZV?-?449ywG+txII^eKCSBMgT~PzR zvDouy#_o@^zijwB@1~sF1HAKYoR~h5_ISy0f}iEn71eWJp8YaAirAT|?R;G5jn`ti z@Mz$dQ$PIuub%5Usv7ZqPtXPZ8BLFUh&h&W50w#jA=~V8mihrIiy+Fk%HN_wivBZw 
zM|6s^;6!1%ervkEiEQ&Q%)_ECD290fKq(vM1!zjjD(4$#Pm*C?fQF~Kjtuhx%;fo& za~S3Yn91{O^<<$Je*3(^U}-@gd%GW*nl1jcaw3tXp-^kR-Fi2W+VV`63RdhMGOMpz zkj{S2z}mPi4y_HP3)-MSY4bY^(lW0^v+cv57D)L@VYLTQv3@3OQ;omuJbNNNg-c+|HyA9notzL3TnvYBfr_H zc7h`_#vW-5*#ap`TWP6K)RJPTDLbMw)bTcD34Z`JF;CB2!!DGn2qRxSD%JK|la2Ne zrszd|$VO}Bp*3Pr*X^#wzMqu+wD!lf4;qdwl^-X$=wYCI?)2E7X=9{=%i z==^<-Upq>sx6RaleLK#T+sovgA^m=f^9ksjp>BrGnO6VbqK&>U)jfJb(JmTfI}(Jo zjYbRqh_}E;XZ}zYhnbHX7Q_$OZ`Nb7l+Bmjs<~N1elg0Jr~}x;n2FPHw548l3#0dE zcxVIq2HHnP*!Pj~tJ!us(k1IwGmczZ-V(2Toq~2hw0uykR*!aT_aIBef@HSq)nOYP zYcIM?n%$Dj?n_p#Z%{9ej>;{fz7!0^h)X?xt&h@uk z+5h0`l3d;2bKC!x>-?S3WH5ffbB5Ne)no9cZLY5-t|zi4O4n|oQ}lAW`;XY~ipBBO z*RQ|6YQghtD_1K#as~_)S}UYiJpLQe>1fu1G$^vW^v3Dw(^(tkc&?%*Yo`=k<*KsO z5)|3$oO*@~G%_qCom*3urd2mRYj@`jh|=nSD82L9AJzPzCTpVfOP^!E+Tc)@-NGF- z95p;L`3!+{pfbyT>zjznmD78>SE(%90%!DJt6VmerVOPpV;kxp8zqCmF~hHN^b5n1aZ8pJWe1SX)$Lj` zY<_G!Z#0xl4XkqXOLgnAtSEcZl2&}%9MP3+T=Bq%^2tGv-S@-BpG=9KYE}assI20 literal 0 HcmV?d00001 diff --git a/model_executor/layers/__pycache__/lightning_attn.cpython-312.pyc b/model_executor/layers/__pycache__/lightning_attn.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1dd8dec6854f7f93efc6a0a13f9a684a42796dd9 GIT binary patch literal 23160 zcmdUXdu$s=nrAnk&G(zMWIagPmPA>SBPWh+C9y3(WZ80@k)1e+k{m&)DeL`cQnICK z&N12ShMX8HYG!bvXJ;==Fu>4bW24<$V4c0(#WT3WdhQMwb?6xp4KHBe0_^2KCHCy$ z?w`A_s=L`ursYR23(Pg7VpVnZS6_Wy-QVx~zN-GOHk+A%V^a72*k68+ApSclNQY0% z-2Yrp5UT`D(BvdBh<-JbV6jCw$ z|1FgfJMC1A#{5S%nyv2Tlio8Y?NW@=q9INawEJCx_QZ(sZPIfl|Blznc`0{YE93_4 zG5dIfR4e}uI!%O$pm)$-pxGqXOF7!NMZJHE`oI?T+hT6IW{Y#=l93?j+87#_3J>bF zBbpju5@y2oclCqbEoMNS`qMzytJja{=mv$veIS{g-KR7NrXAb@G^{98I#;>!M6RcM^Z9p$TD@3e+Eq0;QyY zXb2S0waGe`RPyv>+{m=PEjRLbFgNwAhV`Hv8+fddZ6zBl@5iEGdWdWqSS{v#}*z$Jq49HIhLyIP7511H+qVnVE}B zbSg4N^M)vf3cPM|CQ1WbnBk2hkWD}S& zdf`oHUwJWt!3bkPW$nz!2ycKSGBnDY@bBcz@C0v0+2J9a*P}d~otub2E*?wp)IvEo zBjv2Pu~c$E!$H7>Onl)19-6yfZR#iH^`?(TkCZ7&9H4 zWcqsRV? 
zb46Dub^4ywogK)WPMz2^5UxPZwPMZ~$|d3L7G zl>ANk0nXo?da2|K<$E~auGCpM2Q7V#c|YfENxfL|@5o0ve+#rM=Qb(AWG<0D5NF=~l}Tr6e_)oI z%)P%SxAP-1*TR{bzOv{|J2th1-nr>k>Ei+Pp`@Hkd^Dh-44t*n#B0j{fu zYwEcL$%)(N?@$99)XN<8G9qivnD1*fCPS)6xKpi})@*OCC;Qybhkh~s;rJbMSLzr@ z+V07AW#?A*Ex(y2?|OWz2UiZ}2J(iY2X1v?S2=c$7Cmve;x7wqC(U28p@YhTIjdtlUg%>NTj!%Wka zbIXI7!S_2i!Sr{+Xl#z<{Z0=RC~!*lRNoQPd<@9$y^s@ZTpyfO+Pr?a&VaCp*fLNq3+<6Di*4SD zt7^c8#kQGX87a6_$H{gH5mn+~fhNM4N`ShOm*k!d}F^;%)w zDR4cEB)J{z#Tsn=VEg{tj_f<)rVearw}>_$N6r zRQj%aY!A06KZ!?d6Bz`Xh(WW3K|^Cugo;7Vgs>eOJq!wfG02)^YEgom$-Hiu9$VnG zQ)AQ2c2tQNV;WHsL_O*Ea9*&2n1hH? zH>$MuR~Q=;0@D$&OqgTnyap@^V|;dUED@RJ&6BZ_L?k{sHj?1Y)ALh8cX8$@YAKu? znwg)bnUjcqAF2gY1B)S!2Kl%q26Gx+Di{R7^ekAQlapW&3_exlnC@I0c|x@egp`=Tr>wia^;Wb1DrXGz<(RU_umN$N2K%KkLD^# zV;F9!HcymXSFlv*;mjS#Zs}B-4(g;`V3l}kb0_jAuD-~*!#9q8eCpb%wdhTMvHi#` z@2}@>1-SN8ocmO&{|kp}c`>t?bLD!Ajvc8!=-zBy-j>;xJ&=2C-Q9d+=;QHg<9AF4 zQ%B*Zbnd?yz7_qX1g&~Zf@m;{0Zpbb9?yqE2)#n@(Zj4a)Eqv(G@QAa<08= zuX3*EQ>X8`ygxa&`s&K7xmWWeMPEl@sp#rXoi6Dd+0*%2&bzy?zj zV=UGr)dCv&&mcjc0x9XiM=2wda@;EESMaEYKwRIRHw%i!ffXcil?*8QQj>>6YWeN^ zOm0*3p@xzitXGXX>ljzTJL4+;Yl%ncDImdW1B(q=d)QtAyU+nhT6h`&(Ig#L{-FX_ zsd43Dy)r%U5^DrR6t-5QAB=G@vOy3?z&+raMwFvP*==5<7D5D_!P5eSJhFQ9@8P{8OI@*^)wURAH$nZhS(KyU$^&4(~Q z0vd{sEyZ|?kQX2sYBfC1#L@V+#3TX;v16|!v}a=U{BVq!giF4L|2PLI6Cg-aB?Ku+ zF2{RYK!vi>{Q`8I;Qv!mwcu{+tk%Y|76(O)0fGyhX?91}mw6qqkLc(!yzfZ$fV!+{ z%)73d^8;7y>pCC!y-Mco$eCC79dujs0J?Q9znXb<-4`y}+BR$*oUKF9)bdEeF7uhtjzMPDZD*~*wDFN-?>)H z)jwDCKcBV$3DCGmq1P2%%^R*T=L+9w`nc^{ThX;IZTJF4OJ%5RU2frn#jA^jx{dI2 zT==<-a4#3`EjFIKL-m0S-1WHy&b6yx;9Q+)Ly7W&&#&jI9_dHxW({F)c+eo~$8}pM z|LpH{ZGtXTu3anZLO;AsoQS&)zrFTWr zQ}!Qu!E+$2WAK6h$bzj@KkH?&mn9hh&J>zVs+WB_$!+Bp_-$TpA-vmR@(U{xW1}jH^ z#$ADxl%6Z)I}h^!TFVRTVUWCa*tcX<_?EszFKOAg#2~~Mwmbr2VPGYDm1K0)$}}NzyHL`M z5`?oJ2U}rBQk4%00W2Y4K0OSp03b3xE&Pb37*9=0=!zOH=VFr62z#EWIjQ&*Vg=*1 z11R?#N(NE#JW4QZGY&`4Lc>J{e4GSx7$rv_K{0Cqh}j@u9hsh)j^Vg3PoW};05d(% z>|3~ud>E~j2}*(F?*p}`0k6e1g5Rih3()oj1ioZ<62fxhoHdv_fpUYKwJ~+#uGPN$ 
zV&=tMsA#QAow(=lX6G|W6h!uB7ldfCH+va^cu?ipo*U&n&8agL9;N)YLLV1;<|fGn z4|0x!74he)b2s}q$8l)z4J283Wo_Wv>o;B3-r}6yH(%qNCqy??x-Vn>3A4JmvY2<} zdy9eHzw9X_3MaY1Gi!5P-~dM*_;l}Q2R}J@Yv4Bd?+@R8=JSDbxA$|;o#&|YLO>aU z*5%-=j1*M@)BsT19}Q5Qn|27PreDw4f9C$6=4ws8DcI>AvV{?pJujQ?u9NDV-+*Fb>Z3i)QDtMm%}v%Md5U+pgXo7cfjvumySTIbFE zzwWw4-vYVzb8Y>c?@ZDDBJRf*SnXZu&2{C5e*7W?D!sc?F9I>Ad-+o4Quh0Wz~8vm zj;{r{@WGosT=<1s=kGX9Z8*+wjx+!5+;0beGgx%ImO`l3;>o@R%K#v_Y>*?P+4HLx zRxafFH|jdLx{i&y16<9{J8pkN*#dStk`Qrft|Wau;e&A zan*-JD@n4Nho0B4+J~Okvbu+!*CnJe5#uU&U>xE4s-w)Cfi=)1vJGhs()o-vQP#$^ zmexr+@f{Mz2r-!I<4x!8hP-#Ky5%r`cf#ih`wS6?Kk)>D@YZ=i&KWh4! z99PkOvTjORX=BnRuRK)Kb3orFjVEcRsiXr|W;?JA0u~75;10toMsVK8ji(9PB+2L< zQo*~52Lo0<+kk5|{>+l@q%ByNyA+g=*P-fQb%1DWtZN*_U#bq;LYG&DqUc%$x^W}< zp&Bt>=p4}`J&AgW1nWWf0Bix6X~=IWO54m@pf@|~xM95xo)vV!z9$QKT+HBsf!UDs z;hZcN&1nK=f<-**$GNJrwC!8TRSg-UIRRD|%nH3?hO6x3*h2bc=D;uQgG2yYwu(rG=*vx!9^3X2DYZZC5V zB{-Oio(QiAzz2#-rf1!_fq9nxBr( z&BtO(vG6viNn93iEIEOLoC4T219laClrXUJE!z^U^W=;|PDt=L-plI+$Q(sX=`J#3 zv=9)M?*RhSarCC#%5^DtE{!l2G>j#;tgt)9y7yqR#QmUsQUo9hn|=9A=FIzNA?Rhb zJv-yYMch}uea(0)To%i)xkA0KS+U%QF z!XH5Kw+uwO(p?a|bD?fvfm!xwrXPY|z<_(XKhqC>!k*k*K9sW-dOkjR?c~j#&-y;; zyJJ0(CV@$pH?6Ccn-eMN6LZMx_3fM3s7n;Th)03QMf&$~W%G_6y8>pT03 zkUE;61DAi*x?%-4q`naSc=X!n9q0a=AlW*>6uu}5q7EWeR?BNu}4nhITo=z7glpXxue zePX*E{q5*)M(@~OO>0U{7aCA@aorL6=2$!>tG#+q!a#iWCHM=3U__AqA>@KVwTaTn zF2Mj0F@H7z>-#VPAme`sr9afzfIv_pu$b@W0v)1w>vJ?m?ND62HA@0B^$wu56HJ&5 zsvexYxhPmcJplHb?ob^iZ%tZz#dOc%{K+nS!_VbwH$wZk(7v^3F?3+P?%>V2JMNx! 
zN6$Ldg8}l{u%4$Pk?H7EED{0K8-az)^OGoVjYQ_=qm$wpYdjH6j15o4V97JhT!rWI z8i5Jm$B;=v1y249J#*->eU~WdU0-Q_KkiDhlR}=n`eXrv{wi zdKKr@4U+_?lltif04UNnerXW!E6G!BS(S7ysaMF_A}raHMh~H0aPc5nvLu1dS+(qpT zv^#hn0;OoAxGo+<5EN%S;;;fM4oFh6`{jX_Ay0SkrCH<>h)xD4Vv8LMcnBgK3XU){ zQ$YyJE{p-v6YmDRGiRpbg6z-X7n(x+GxG^RzTLqdsmGvDqv*Ih_{uB|Bc#9CA8y5@ zU>;$B`z6l(DJsI;fz4aT0YXm4q5B{mY+b4im!UGO^&5+i4FOUfS`1DFTX{0fybYIh zhHcChbp4+}!s`~sVwZ4O9QH=d&c>!`UJG-8Nuw6EC_ytF18iPDJPF=S<~?+sw>(#6 zKo$wRF_e^F_RI(9;_};GUMt9xQ$ko<3jh(XBLrtImXm-#HFI6CTCa=wH|Tnd5z*@c zP3Fo&K!+X`W|+T%3v3{x_zLJMfQC=OE)cvSLX)widP310Na+t`E(3YIhgS?Og5d>;Ry~x%c<2*X~XoTQ|3tjOO&- zw=ZQw@33e?0PMrac3rXEr7Y=V?^Do|&7SH7#74Q-jWu_DwX1Keya9CrgPFmc>x016 z00hTtVe1FEZf*P9U4Pf{*B!UX-%`J!9%wawpa*OE{J{<1F4*%^^tEjG_Hw?xMc*@N zGeAjcKgXZLevXpWktH*|Vg})<%xNgt?CCgce~4y}r7vgR0IXGAao4#sf3Dyz46JF^ zP8XerGkQSAdrPoKDYR0XF>Kmk-vhW#!Rx#?4dU|sudh1@OGg~M3dFAtSWmR-c=9-J z#%5mR{Hehc!)O_+bRe~ip}NilvVGwf5y~ABY&~s|cb+C6 z5fcXg3yyXRs{yL6RqlpY>sQ0#-TUI+m4=FbLlEbzlxV0@mPnFGSm$5Ws-dbC%$4M$ z;<7%Aywc1E{;@hFNpVeC&dNbta9HW0m&f8lFxRXI=9ZVr z@%pjp#Iw)H9w%Y@fHpoqg;&glTX_8(TF}PpCxjnZrv@b}IF0a_aLF8>oSBV95OCuu z$R}X`^<~)5jI}&!!F&wt@m3Tj9g4)^t1Wa)D2+-byFjgcRxFp#JB0J}Q_=-eMVDB? 
ztf4OSvtx8*Ncb5QerVVSAh5(!7zCru;YI`}Q3m#a1E539L})QW9Oo?%%`F$Cj-(1^ z0g3uGl;G_NR)I5dpSBSQ;qB^%1^ioulJFL1kV}k#9R@-R9|;>r@>2M8Ws%v{Wl z=6$fYxM*+LfR)|$wi|mtK5*?o(f&;8B!VuEY!jd=$_@c$2msqXIDRPX%7jJhKF-@* zw6>&906DK;+)+yY_;BjXU05`SyjeZDrJ}nH;>3eo$6?NWIMsiT^8RQbr^`ZAwLb6Q zs6A`DZ_>Z+xp|S>+k3l_+xr?vy@sN#-c2*%+?H$S9Lxb^fM>LgzH=yd(al|7yfp#9xB`lFuQ#jB@bFi4EPA4PNC z>_uQC?=R?n87&w;Y~<`uNy6>T^kz?FpMC!<1pHw`=^4(}S~$$vx?yo8tj^!RvOo81 z!MGN`<9QA?hl4zJIEr0TNy7s#|1KQ=99?Z3Jwx;|Wuz_{0`R2m!b-maxgm>@YCnU~% zf}WYbhh(_y9)W@j{xR;e!?fJW1ym86B-PwNC{6=N$2@}3R9uK|sXC0*S}GzaYC2HX zo)#PrGQ&zl;C4g{&J|Vk2d*Sh%fQ#8)LY2km$HLXN+}T#2s#ur99y)IX*joNp}@3T zw9u6~b<6Q$){1OFSotW#8FheoE#N%C5k^?!=>lYCfW3brey?yoS>Q`CARBZfEv#kS zCghc=yyJLWX#t2$B-ai^p^VYL12JNC`@I+gnsEvoCtK-SK})`jKM5x4*Z31UNshJ$ z``UwD?ZM-+r(r9nh+5mNogiwhB^?^aZWqz_fS(o7{ecIL-uSY9RdJFiTVc%$9uZJ@ zLHN7|#Gyog2=e~K;^n5Y4W{ZofzPPGl?Zh_p`$~|xByJhRA3M~`4TeT`YIjsk8tT#7=FAR<}mh3xba@Ew@;^fQgi80dH|dY%lk9?vvWA+L1rgE zw=}Kmn$|5%Tc5$E{D!R_J|F@c25mv?GXO(#u*NQ$3QdRahZupx)$LHsY{xJG9NM#xN*Ei-8oNg!hRir;x30CS; zPSE_g#tUoKOOzk~1xi$)8yFQM22*@gASeMQ@|@c9#qYxz*-5G+cw)NNQ= zI7>^>5?1ZNb%Qz{DIc7YED(LB{VKRt|GdpPC+!A#_(WeSLp z;brVg<+T8nA>2C;5ozpk5~d@%%gnz;$r+TKMpsbJ3o(~N%>RHQQUviEFl{%rBuU<< z2-5bCMBP6Tt#F$()O)pW*KTT(Bx%^R5|sHp_HB04hKqKo=)gs{RP^AYS1S5&(NDk% zzvna0Zw7F&7GXOGj()uz6(L%qv5K7>(Um(LCdnQWaFf`pMJUT$2xX^q-X)Y}K7_JY z>e(lhw;`xFnVHr+)>Sm)*hM*cg|8)qLMc##TXJ85U)S!TE>iwqZ~E;nm%QMz}V+i_8n*Vpx^C~}0d4`|I14SAHz`c@lO8a7<5 zoU8Q#fvYzAbi}r%jX(<*Xjvz`F#JGmsk!Yd+kVntGFtH@(?b59nAbxd;ddK8V?-rb zQfXhd!iEzQDw!dUy=g%?s}z(@aLyf@h#-jCU`g*ovoGiSpzdnjx@8xqZ@%w9=fK#O z?$}w9JV`3vK*?gj zSWetERO4JzX*;4?cbU$lV|v-MLbsV4I_s`hzOy=UrhgPiqyreEv`Xtcx0%kYOIb?O z^sbrS-?#U90X&o?$GJB%w@1{*KKtyw&p!M4y}v(qx$FY25zE(y{_yVv;cw_hx!4oj z-2K;PLAWM_gpg@O=;yzteiOSl_nYx+9?2TD^jl2yENjF%n%$qx;--RK5`fEmg{l3xK{@T&H{<_in z{`%2|{sxxEG154?p??F5J4c#EoBNwt+%@7K-Ppg;B$$P8QMl!86Yq}x)PhKkeDZK0Bv zE&baD`i6Ec(%hV92PLQ8N4jFR2ehwaS;Va&;(IAKs;udt1)hzh~X@4Qb^YQGSC#`Ms?6COmBp?^SB= 
z-^Wt@NZqJQePrEtsiVNkwW3^`!Ml1cnf&cPrIz>kO{u)#bCIYR9EgsFqvys$NXhCM zyW}^ga!&@uAQHo3s`$*<(AZFPC^#}S6%MKC{_IqCXz0RFD4ePu3`Yae@K|JA42+J4 z!XtqRlpYyDL=29dPc^Ln{NlJc5(-36bE+aTF*2mq6C4>I8w`vDBhf%;Xf)+Bt{yK5 z3i9OCbc93L7B2SwgHsSWEqK8-mLp{nJ41^b7h`;YgWN-JD)_~hxwjUE*B zJoWgApg1x#mX_a*2zbfx7kIDPWW(XnWH6&}i-maXW} z{=>Z|PGr8Tgbnax6Vc2E1xHWzKArgqg_O9aYWI=HB()f6&Mo1*d#E(pUGIjim`CMY}VaE>ln zj)t;AmUpag<0Pglluakgv^i#unWh|wWyP|FZR}B6fc6=r^3tKKvu3S*y0$6jzOJU2 z5X++XC@q|}=<+*t8Rclq5;Hl3m`U03r!6tduzPL3sQf+X5zpGE%G>a5_b;2je%Z6; zQ-VLI@2B*lL6+@C;E|uwpq<9nm_-|K&iQb70uhn&`0e6${G_bWadF^WDhsDbDr+?O zLdqIB7n}&Etm63OSSV#37zvI}q|DLrlw}BuD3x<|WIPz{+~T*UtV3fHlhKsrc`BBJ z3Ie=5?U1QlbzhCpT=LQNk4H8|M<+H-3{A8}gCb6pwihP1@7!^IdwA2x&~uw6F5v(h z+jL=MWONfhgTgO_2PV+>Ql%orl`N@UQvZwR%(WImB)!kQizh-^ie%-$8sGmRc=F@LH{Z9Bhqi>Ba zH|~x*cE>Hd@1oJdfVyUIqprT)xVa`o)wLG`Y#I{O*@o5dl09a^oW+o0jj&&u2}m=u z8DZ{Ad7m4KK8I5{5FHu`vvc|_Qz{3&PX`D#5$goO2`M|qU6jvpCy$SxJ&W^3d=!m{ z&_bp$6b~bkDh!5(aqtF|s#$*VaXdOjnY$5O7H*Z+%veI;l8~Cq$bxHNyWPHPprH18_ zeQ`@+GN&Y-Qj1l*$&v(MfSEt&3<)vkg0j2sUg}$V6M`Gf1D#!nTA?QSYtO+1N1gH1is! 
zS~9K>D>t1Jv&M3!s5{vto3_QW)9!O)xiK5-;;gBIJ#2RP0Mz;9ZGgdKK0cu$zTjBM zM*w5Q2cW_Sm?6?L7TSL)iUjtNLt}&Ov@=D6BYd+mr!1i$_6%AN*tyt9ok3iDl7cTF zNSOjD$GA8&7~n5S<+2+Fmb%rwD8TAWd9(yMcb=BQ8b)UWO?Q6rNnfo^`Ut+@Icjg9 z3rm2Hj2J|j0sN1A3BhGy)h>9d=AM$=jV~R!+&j~etlGG+PpaA_S;{XzI$L(TxZ>LI z)!{_(2B~<%{PR+=|MFwWlDheTRMNF%l}h%|+$pM_uaSy2FZN1BJ1_U$c9g{{TH=nD zxTR(7BB80uX25?!1ZXUdVmeG;%bL#8Z6IdtC{hktF*}yUXUDX8tU4_()yXa;ZzM0O z9@-&m%>4VpADETn>#L?{u981y8a6mym7D+=aQKL2#<}>+4e1VTWj0Tna6q&97&EcC zmL06Z-gpR;=!c(2;jM}_Ez6>Kv6=-$0|@u28Kec-SkRYLhLD9K;kwGjJOrSTUja7^C{E$l*N{%mCP%(EzDrmf7yX7jGvwgdM#kJv9UTM6db>ZoSWAU=> z%XvHEwjK9ug3U3rC*i1;9M#J?HS--eaR|996Ye_6T{qu>^||1^>E0N(ZH&7&CQBRV zgHmbpjO(_;Gh2FP-`t6~!ymW!*?LQPo*R2E4gX35nzPYfwgNv!gn(u~lQC=AS}Z3( z))caY%;mH{TWCL?!YY~0!b*}s-H?@1hfDxG)k8m|14036%IP5sl~vZ|LE%;FfO$w5 zFh84z8m%!af3DOUWA*;F`4x{vn9iQIMhV1V7mi2EJ2^@{(>b~)S+Q*06I(2Y@9+a= z6W%jzy(su?!5^YIY@VG8(}IZdS(>(`ceP=Uvyhf-6O99(X7ZWR)Xf)Q^KWc!^>wxScC`9B zwms`_O)tWB`2c2jG>igY-rVhLZ*NCV`F9KZo#ut6d0S}>pPd{VAfkfr;?T&5kDtB| z+oT6;i~4T*N*po|{oNf~9_&TRN2k3IuwrJv_OV@q?;ivmq}-H6*N=X)NQ+gBAn<1a zZwQWoG;^M^T0%nuQGc#DLC@*9PTBZq1xD1Em~!bV!voV$dWx8`jD*idQ87)92%Rw0 zH|*l)aveR#Y#2g_nPzx4Povn^@jtSJ0DHf^oO+@)3$Z zJ_nf8H@|1mE7f;O?oOl@6$508`@5ymT}wSu>E3wZzRM@>WefQwv(LX)KR3Eu-n#I- zRK9t|+i};LRh0b`Z^ul}YL<{|pXs?`jk{~&K5R5w77=WfY+IAwj#aB*&0iIe7aNq* zGvB;)V#RR`C<@{`mY!X4K1wKvd$wq1SKLzki%(q0$uOz!7xc7Q2+U9~mmIt6#9F*2 zgax8mgI;aw&kAF+30sE^FxO#h6PZ^iD{Ko{!nt7^Z9*X{?y~Ve2Q?IC<^??M4B1|` zye&WIcZ4mupd3Hc`|PN>bZyNc$2#e`Qvg0fPUEL%Z6VjoR<%8@OSyh`%A+T;5Mghs zn?fJZ+6ad}LqliJ;+P$a`kF_Bm!4B0)^N!0mIX2ssCE0vHOA&?LZzK?bU1cD?T`sMO=)rL<=Gw4O=t%WmH( zU48Bt2ygqP@W9rsUNI-ru<&*ZNA?2 zL7W)I?_tmvKu`YVYbql+5d##70t_6B1_5TVe!4gmik_pkJ@tI6@2T_ne__4P(>g1? 
zdvyK6KT&RXa6lZ#JO|;A^;X2NvuGlwSC`&F1YbT-efqiF?K=pBjb^pd>+C79BXq3* zC8D5jJbIKMfl;i%aEO7Jep|{C36Go=FJsD!FH!JS3cg0c%M|=3f>eI^1(3GW;IRmi z0hPhy$3xx#TPJe1$W%?{ylhX;u!tHWguGl9?rj#_b#YrQ1OEC_rWaC{=Z1n2@hLnV zupm$_TC5E6Gg_=u7kefqMlSKOW*|-kv!1UYAOS3$4{_w3VwK3zCy8YAjkm7twCzxRPmawVd?*Y z3dlW@uj`SwOnv^mlr0b#1D`1nNZA9#I|2fRxHCXN!ifANClCmY4+H`tb&|LfLCQQf zmdXav8`;GnO0dw%NLkrVDAHM-%H})@aTYgk37kN37;*7MCHOapMBYR2vhb1R;B9Nc zs=35kciUa~>I+w1c=g3AFRof~znU%7Z&=Ntm`$i{oImwO?rJXG*@dFgRR_hKLUGxu zi(+{~N#(WkSI>Xq#@*d4A-j;(xw?g7cA?RKC)bwo3f$urPA`%AR&(giCZGu;S4QSe zOL?`cQ~*Wo?pG_XRDR;X9jo__88QA<^g#pouU^WDeRuPkGaV&Q+JqIUx4o-B-umLTZpXmMehk8S9$bQxma)68L!1?cfsIpG9!|42Xt&?tN=YOv8 zQ=I>KFI)Rvm+XE|s^pBu*watysd|1VpXmVTL35srrje=I@4xM&oziq-vs3ziMMO`j z)$tAV8-+Z;8WAOy^)_uDxPXXdzoT*zLgsg}RHV$NBdh>I3t2`viasq7q+d~w%oFn&K1XNJ3q8_@s&(RL*Ia& zj!MJN&W1ij^i#x$+WI!~&?S0_em7zs&6)XW7HAiqZ@&QnfxU>UWCCzY-idn}~QM#N+9d*dzeBm=O~~R%~pu`4#j?t0y;^> z->2Xx1?({W1By}bBm9rtKtP9S+p49=x;5=2#l3uzQcOEZ>5iYI6yql;#gvouuAA=K zE!JMs;@0o&cz4J5_Px7rRiOJD(Yx$-wU8aG+18_`v{zDs?r3GV8D}d?Qx4aA4!Y$Z zN1v2-rYa}iU1E{^iwsOTSK$9b9wN|%tgo4IQd;Pw4B5igP;NNe7-ME9k{wub7K=F` z83p__?;#8d ziIZYj_XaV1cIbs}-%0k%a51$~9|58|#DmlD*mvo^iKp_ zC@U?vJCvs}`*5cc*M37PM?q;vEcb#S0&URu0eZ1o zE2G53?F5RBl#fB4he|G0YL=?v=stsX*_X8_bN3EKcd96Uu?iO zMZ^JZs8ybfZH!qjd!oI{z0$+Tqt-rF8nX`5v6gXxZ$|CTbfKz3&m^hmS+eFHvV?;2bDe5pjvyE^on{ZY)+%QqF-RJ$-%Xzigh9 zL0l;hjRoJY<;=8CrwZirdV-&?jBd;}Z;`gfR8Dk)N#%;PHL^Q8*!_7NHvbwDIgyNw z`ESy#I=0$5kjkea3coNgG8qa(N|ywzQg-%>*@HulCLa47@lOzo5M@gNC)n?04nBS# z=~D$7IMc37J{J~$MGslya&Gf8H{VlFUQAe`0VZjnFZT&v@UNO;YKmmCnQQ(!<1$alPUK+orlMQMFU5+8M_?d!I`5J}dP; zyV4tY?Np+wO{!{(w?DE{)e{d7;`uE|>l8P?)41Z;e5<4+ls-YGyN{%7oR5fc~E$ED|pR1anG13 zs=@%uUK=sJQi#)o5U`9ZX41hAh60?(q1N}xb_+0Gj(%BFb``RVuGvjsC9dPL5FA#CqCqIn>uDeh*dnU52RK5FUZT zJU%|cu{wsq5f~z#qkvYLNJv`BCGWulVF=F$K)qor6b%#};fwM+l$C;i!v6>j=0ChF z+%Y?>w$(bJrXks~DY<1^vbc2By~+BNX*FM{s!dk-J}IKR+l3{oR+gMiM9~};wF#w_ ztGO&{mkFfv2c_cXRR>FR3MKx9E~$9as*By{3Eo1r?&9**0+w1R6qWsqz2onnGg+){ 
z98aTWro5smvmD@P&;XLvt5A;J&BI5ohq{6MSi)qU)*^4#kbNBl&LX32Agi&M69^oT zw@@AsxO|4dxfufI2|Gew)L{Tig8Z2e@}~=GF^12fDr1-)&RZwl!P~-BWT;hmLlNHK zMoqbA&7oq%fm{Na^a7bI6$!!%zKChA34Ukr0qwM7|9_855}v3-k8rv%TuZ~E($E=Q z&R<_-lh~?+TdMdsV}^pgLpUuHGy$qN5N!o&OeZa2%fyYt_62rVgpCV3O*4Y#FxZSF z&K4d~sBoLxNtz0zvSGhkOot)rdh2sSV^0Uq`3rJg&beTOq$I>2Q{q=Cc#i^hB>y?Z zzD|J#db8QiY9Nq`ALrku7+Vn3>Egde;CE!g?4Q11_+jrrX>HHY!{TMyDZqgm6(WBU ziz$+ifFSZp0h$h7bJZ)3=H$lq#Kv9H#$5zwHr;O7w$Mh9Wz)wt7lYZ8&f=uABvq^tnDLE_IgDMFiZFzN~yj3c1U3Ck&{(FKg*M6rEQc|}{YZWbv zgNe#ksj_wPV4|y6>grwb9J-rjF3X!a_{vcL_^zs?r|??t)!f^4^~pwmqH(*_xc!bb z%bWi*Yql$I)dS!=;jEXO_49`o@X;L{6d`a?96^vKT2I>s+mybG>7i%ffF%QxjY6tO z6SD@hM)w3nF0m+P9;R9{E|oxF(zY%v(d{}8p((0tk?JHDn~ZsWc!$C{*1ZR*>O$0# zwFL`u8a6|I8P}jJ>5!E)sYEYw6(}XN-%z%aj|5$Zi?k<7O#NE({hl&yE&nkA9L)RX z`+Xr~yZ?J2o~}x$eHDj+5wTFr(>B4FD$LFlCm`gC`VB~h!&S-`X}?-oc6b<&CB^Qv zZO~vNQ~F2k$Lv^kD3`ViC}89~XiHwSD&=W21e?&_m&ya5Z*Xw(tjwA_lqyJ12rxpB zx(PCJ)Kv9paR{;{kT)R~o+>^DRs#_nd{ZTO64WU71Ou@Ls(KcarT2Tq6ZirskVV6z zDTlo6viA|CN2K+YazMHsf*?#onF*J3{?&PUmIHP)Q%bW??cxn8bcBL_dTJdQpSTnd zCn@eBP%Ol;J0@!6w`qtVTiN1YQ^CKXpqENG7$OM)JOF7a2Wu}czE+Zmod{FaYqr<4 zbL4-aJ$4%cY_WEsymr1Q-n?U}ak=b~nY>%ojq~An+x{E9%hku?wu+A&)k$Al!nZ~8 zZCUnp&GZ7BzXL&%^4EqJidHrsTB(LQ8ulsL66Zz})g4lG$5K^d=W%K0@wl^U#d+dm zN8Zddmt-gHvgdDNGq3a~Dm$dgj>X#L%C4Cc#7~>LGWGS?+;pO*L#pXmuIaq#?u6)R zu05I8y6|kGZJ*S(Z@H}}-nu`ThZep${KoLV8kzs%YL>~h^==mO#gX5J{g)^ix9T@7 zx|ZwrN>1O*iMg#{h`(yTVo%sgLEM?ETCp`jYL(?Hubzu0YPzJFuH~9-%jMf=9!-`v z&W}muTM=`XB%HM6C!BSXvu=LthfY87aLf2U%y_uONl#;>(@Dzq;g1nv#0mUnUJ)l& zIrizy3uIanzc&QCRG`iPBs0=c1yBdTQQP-XYv${4FW#>uAg&u}tVW>22kpJAn1$3^ zWMT)0pBZ5A=Tz@31@9v8TlwT>-9^t+*&OJBCKQ)LC4@`UzhtxlXmD}{)>P@5nJhnf zhvx7&>H}|@*u0pbH+@@XqOV^1txHRm8B5Y#ka5=E_7)|)b&|I(;cb?@%?pJK(5qbb zZlB3c);A^Ux8AJZIvct+^v$8At{KN|DqeHbT?0U32GZ9>vtN#rm>rjeN~1^7fUqH- zK!kLfjJuk&khqowo*N84J)d$Bt%{^5kjG%4!CNxv^8vHA^!=lAXe@9RVzyv3M)v$; zzAE^srRB6T9Ci;t%&(!5obpP80XULqP$^R+jRIxoLW{tp#HorkBcMI}J`KWgQ~-KY zUin-(kLfP&RrirI-e<)9b{QA~P|6Q^911Th4eHsz*U50R$i>Oq+*qO@WnOwn6b 
zCrp*rjq^KK1^g_X`O|0Kd*&`hS9d$D_0R}n5`I?1(F`umH)ov-zdmw(WZ~$NMQYt0 zuih<{?vZl$+|8j}l@_aa?!eC>Scv6y|L(mLhn*2UoHs#Go2)v5|$T+*a9eF9#F!H870gMN|+;*A9gZImjPG= zI#nTPQ+deazE2*aWqDvECff+02@{pgpl-aCwOB_LGc&_7`KZYbs#qy%Enrl!Lag0# zaWlF)_}5tJ91D`=ZAst{!UJTOL(i9&#Vv@M!^w|wcJSrFDuYZ#hy>-o;M?PSy2sbz zd#>HrL&EL^JTT5a%&LH{1A8ZSLdu|o0@-?qD1=DJT$Zvx`5u)2jh;Y9D_1+}2~0fu zsd64?heQaBYt;b8;);mA9*u6M!*#pYzS*$Ir_$oMQI-I&T1Qn#t3pmJ%>-8iC|MDL5qt+5r~z9FNDQQl*}b1n4B|p zCx69ywOu?n1XWE8$QJ@bY>Wa!YSs*z;%Sn#xJX(LAt+T7Y#Sbl z>qqL;O)gMxnf?N1lsx7Hk<&ULdpDqQqC1&BD+51Tkr}fAz3PKf#Cji0gi==N#qr4z z5@hE%M5i{pxd;naHu;~F)~bqCabzN*tnrKEV@*-kC#0fF^Yt9q--G9T(Fvx}!8>Uk z!fVQvwuUq5u#A?KP8J(b$eIL1QJeu^e}s^PgJKQ{K+u8CE(KQwWKCpNHJ#-m@E|i6 zpl=<+kIJZEm?kVlVns&GLWR7CQYsl2Ii1RTv|mQ_mC2ogp0WX(ZAvX-C(0U-PoYL@ z9xxA*8Z8dsY4d}(>#F&cato=&(pPG4SBbG#fDL@3jY!Tfnq`7>Zp9)slBgD z&3;D}sd`(U3E9l%tp7A0w6qLE9Nu$fhT4BhRjk47gQR zB2jx}lt!Fohg2s0{R4|hQMgWdK9Nx&#hX>+oyDp9sk zD%%JQtFSat&>$5wBnsN4g7!qgcBx?d%C0j~!5K);fa2h3yy;s)KC+%njl>)X~R05OE76hJHbWf}?%L}FmG(=)ny~PORvg&eW zr`4RTdjg{JLop_@#feX0OngayWEyR!$`*=aE$gLLa9y*?&_USP3=W|wbZ#hzu>&~Y z_5Qihzf|9Ds?Rj+#ZShCr)HyL{~ev86iTXxwpJWczF9AI#w?KzNc$Qc66`A=acjt8 zh}p?<(zF9_r;p9JK<;zI?CNNnqZLX9Qieh=dli2nzr z{wE3sDWC<$HCkw3`N{KtQtUbf{S?Fzq_W3gK4$zPGd&^xEhYbF1b$~G1K?Zq2unr~ zvncp$y8RmjDF^U;h@f&-0~at0FsoCK(bHTAbBIx9&4$KAzwvXKE24u<)fw*lX{S^J zzB+ONuZBKTQRUp3`Qsp!7Hyxge_UBV-y3h)d&9b1c_?lxyX9+Iu*NqX`oOyE>xm}MzTWHEtRw{7Jjet-AY)Zy5YT9 zvOn(GA1~RT+_-gdSlZZg1OO=}!%a^|J zfql91)XWJSPEPly^Et{uRhHZ}^LrBY+obw!H{ILfwrz3uwxlgzVNZN$hfb&FU1X5q_o9nTnbE27kX|Mw8fom@q)HBTsWC2wuX6@u=ypMf5{cM`B!X*9+-F5DLFfDG{&8sA3Bfd z7-#GA%!noVYsND(nvoTlhYAe>ovvI^=sAeNbVk*guIWurTgJdO!_0hmE*uFR8Gs$& z!dbwDgJ76!Gm=eRu#y>O@C`JI<1MokV<>|}FVy!$53DvYnTw4yn9jv0*9uhGn_D zcg77F^@OA9rlV@U;my`JS{J=H>$ZGcv~_WOx#-Z9L$`|>W)2y!@fa8IA=!9KX!8HN zY&=q9GNLExS$$0S%wRU^^MIw0-6!5f`^9?{{0zZA+FL<~W7-*eraZ7@#>|+@6!Sgc z&NA%szf*QLY1k}b^X2c6rsoPWuGv(L;gD-A!BHCZDHOdgOYjk??vZ$HPu#U%vhBY= zOVE}(Gd0)ndh7Mpx$3x!q_kw-V$HmTizUQsG~S}dK+HLP2pmM2g;?}(EW|nLj&l?+ 
zDgYgwTsN8ECoaZM+q3v*bW1DYJ{-l(^q7M8Q6kccfN>N{z)|d$If^P%(ceb$?nQ@8 z)(wzWp#F3<2&DoqIi6739r0yHQfAvxXm+pGdpZDWDu93F4%KSrH7(z?< z51D&#A9IXkNOjWooDb;&w-NI2;5cTVEo9tA4{;m8-7JONy%QWoL-8kY83B+m8ToAR z9@CdAV*$K4z{FMvV@|x67 zQ7w7td9@n9=vT?wr0(NA%RZY!m1&{M`lR7}P#))cl`+;QEVS}hYM=PeOigY?MyG+V zS9^bOV*#fzp8vD+6loIjonu|TA}xJvCrtg*a4sq)1Eb8C8k2?y`sy_Kr8*zq7G+8| zt0SE)$>`_|4l<&$XYe73$v_t%9)ZX$3_rh#$nToyNQ#-S2m##jD`p2@l5((1!C8qf ziCk0zyI3xnFQoBIkc;ZFD7iFDn;k+~FN0)wQ01_^jU$mR?&~5-Eo0Xe#}(llAi_Zb z(yOFtm^#VxX-XzBJ9vIFj>D7{fFBe2izy$ZNx#u2b>jcs{Rn>7^eEsWLgAjp2~Co0 zSIomm&%7ATR3IwezcgJlnS@tLp*LyzH(3`=P>Q6^fYTsGM1!$bbNgm4uSUEO*_03G zVx)3+4~ECWFHDGgr?#zs;Hjzw?YrS2CpZ$>+pd=T7G4(dq2hn|TjF!X^H04$viRlM zUGs%+R=!aw6|^mSKP=et{>U$w@tm(!)|lS1>$pGDS}y;_&Pm} zKg0ut9QZZL^e{7$vwUd4zohN|g}BlL<6APR|J2BmyAh*C7yGA>T*HzJp5iv`3g zLo%7NtWh(yGvne^|4V|le1Z0<%Tvp&EMPGDvtP{_$YK_=v7Q()EgM4i+4p}tENEjr z4P;H{z=wiPI~kfu7I7hs&td`NR^F7r>o6O+c7cI5c|$>KJz6W37`gQT=fO}^A(qQ{ z3U>7N2x>1<%4omfcGzKEL}>%5WZ+9PFJq2`(k`fmXxs>sk>-|?6SY@o){PBO;l@<1 zeP;M(mApw?qwy@7cC6peX-}*=6&zH$uCXP>=+pztSJ?%Yo=6Ygo!~V>UxniEP zdQvd%nJl#H6`w_C1`Rwz5;ag}i#O(me*b?%{YaV4pDw^W^u}!TtztWTeVB$Dw0==y z@_a40&(nfvlad=k^trJD<&Eq+a;FP)_HPWI$MVEiqAf}tpJELsn}mUk_5q@zC^J|2 zoz~ebGA^a$WV=Vvn%$#F6EA@k9%W@0#+VbUSkYNC-#w;_VS&F0Z!i!+3BfU9tT>cQ zH1X(G5gd zN?Xojui{`)%r+OF9#^qJM@p|QQ(^gsf-2Szg$!YCN z+Xd2|z#OYGD$}L0QkmbMhj^Y?YrG@Hm;raJSi}8_KZPG35G$q&u+tfz9r`&#f>!y; zcxu1u8B`j1bltg^b_R`Y$L{M*-+e*Kmvj8(Z2g+h)k)mj~(uUzZN=*9= zIh8pHSmIZ{r*!R}+m(P79+_Cq!l zf@e7xKA@6}S?H3xiyM>|Pt$|PDInr9?9}3|QCLxdJ)_{jz~ty89Akz#&G}`@6{LWX zn+YXIWpi($sT{eVp*ukM5HlX5LCTt-fOv_X{ssj!y4+=04t){40Sjbv4py-gr<}ZN zxZ4XZaBzmouy&B|BGwDc;TZYT3x!`0sjHy9uDP;`qzX8FPU&G*Cl_I;Mir5Al+cip z%$QC(q4_>IN}7zqB5md+*dM_4^gMu^fxDV~=fEYN{i=O#pX6ytcse9c$70)x=aJ;D zBZ*xnq+KVLcRe<1O?a9hF0kO1JUgJa<0`mk6?X17-DpVcKPBxywX);%J;~x~UFO~S0OIoDDmiUg-pHPNb>x!odTrO9^PaI{p%6#)h3(m#Ra@j7( z6O?U8l(k4@Eepq%%eF6d{ps%ac1vZ4GX)K9^EQ+IWtZgZTI|13_QA1b-&3nDkcQ2e+@pLCW z#j4mthKNUoNXKnQIjq?dx2_xtT^|)R-`adUSz0%LVzC2)Y8}N=VcYDnc^6cjO6w9O 
zO;SnI!k$FSUa4j8&62%w&)#^+Uii~Xczlw_H(#~l*@Rao>l@-u->rt0h2D7Qu@8Ee z8=iJQBx|9I2>WNFph)WU9j-DZ4UE56PN@s8F*Di^14{=JyH7~?+je5Pwr{!evDsdA>~Fb3Jp>ds#wRG^nT&+PH}5(%%~bb^{2d*{?*>D$lhn~VZYU5O3d(uVFNzTLSnjHgZc zaSthmkz^v9A6w%a_bk2eq36iz9%1`I)9PMKS4_Ka7tEhdG;Wg`w=LIhzvAwNMv57T-#DKXEs}x?7@Zvs49%9;s?K zHK3J?94&TBHG6N=NHr&CowrFSh{Qe@x))iUVR?pE_b zEP}Zc3;WPgKU&&2bL_ec+Sk>M^Jn6%yO+YtRR?GKJ}Q9mh+8U1$TB-`v0=!GX>&R21#So{of~c6ChS%Yq;dNPr*TpQT<;ybmmNRXGT~U?v2_mIR zL5Zru7H3t1^5Q1oZw9mzq#6iIFpRJm&s4%EzFfY@CnRj@5S)Wc#_)(1QovL6p>ge? zVQ3T#lCCs@RZXCC}W@M^$fXukN56x^80c$c( zkbwxI&i#(s2+p9Re&`uf+^&(Sh9LiT&YMI{)FDiPjlWa#_j4{9orrXA+H`Jma1bp6 z)D?ye%}t|2u=c29jM9Vw#X+le6OZXaJ@@2hLB7L)&d? z&wU`OPcOb3yF?^~cY_>UF8FHWw#sDJwi(y0>ZS!tqIsv(yz@uZ-B1^E(Nr z4^|iEYvYYOmv+T#53e|nBys-r!7kIg<&*dP@)ELaoGJEZ0vHw$;%ZrPk@d0c9FJXzykazePi^$F8mK|6?X z+|_SR)NPaMwlR2zfg7!hcI>zHd2weA0b-E( zFk;Vf@tX7@s3q#Y#we9pX`++{Gzq0W8l5EVt5xh;*)7H2!kY~w(Ye}}X2A<~(NwPx zGX2HS;x&nG$|_?P?w1mxoXjgE^9QL>c?~Hs8Ppo<6gnehB@dJ_D|0?=Xw{3+17Tm; zh_)lsZyu&5+gMK;HOiouGuruCTeW^UGB{*7Yj7Tu4LL)#IVh0HWf;RRNA}ub=GZ0M zknEVQDX)x#JqKpmd2aQ08#)@2WQ<{xv2zh74VihibCw{UDx)(X0JIz8DT>f6uQ6p| z=5|Vvi;Gy1TOlre%eG)TkHtTy;5G$hb3ptNrgC_D$2d-WlG16b zH}cp*+w@e+eKyTBG!*i;U!pDY4V1zb=@guBL4%qx z2>`c4Dr<{xdgO*z+H_#{@tMOPdkbezCyF*mMH}Kx+ZWvNq8-cLolA!$?|~aZ$$R*N z!;+suU6iHgPQ7@~xX7Bvuev7|OxSbFXF)#IxUA-DNwLbl9a z^k*~CKxXBZo3ANYUp$bB0w!p>5Uuz@%#6@3DYCgs6Uo@jNFhC;-~tgbY7t?!Zw-Y4 zArV+mZMDMy2p0$}!wr)Mq{M7ld~Y>}?zluC#kfEq#kf2m#WKVJDM=>{NVoYU_$R*% za(^Y1R!U-j^v=~ICZVb2PVO@&<T5M5KR%XjR2o2!LbM)!n6kx zWtlqTqO#0edI`n+?DLZ_pV8_&A4UL##|LxzlU$_l0&KXBi_C*bS_62?xUX^Fy@hBA8&_My#A>EKfM1oK2F|DBPTa3!X z;_#PuV=?^hiA8Dpo zh#JRm=5GR~VWwEfda5JVRuT#(?TcymP{VV@`@8kJ{gAl>+0h=NZ?#ehIZk zgSY-_N)6I{QRjl@?dpnKHd*6}<;7r6K(Uf8o2`*4#|)!YL|C`t>ko-#n2Hdz+>Re{ z2L(GR=%#>PBhtvEa@FH0<)o;xG*X4SyMTQDMmVMLJu3Zu1VDSQP~7Mh-=GKgU0NKg z{Tk(C6v011EJDNt{V9dGUc9p$P})h^+M6untT?(M%5ww&H%Ym!RtuF)B-L4(EUSa< zj=W?|JKa_$9aV8hW3m<)V|B8wi_&U-^2o83{B0}wJK`rFpUJsZ#&e`9dDY%ePf 
zYaXFQahp`!cGJ<8^wlztFSnZ87cMSY5<4D~c09J+e3Hy|H-7BfFdt1c?UkDL-pE<@ z9pS!Vn*H?K2#+O+qDHBxamIez<$d*wSH3tGB=eD-68@??C0FO-MajAMM&S>t->;UO zkCCgKLf{C<+jpgJ?)2+ly#7VW*|IPwIk&1-y5}u#=Dh)SOZ#Gp+K_p% zO~a@zWz$%023c$uxM{PhbEV6AMT5$029m0_5#9{*E^mLD|WdZU~Wt4BK#b8COm-#j=k@I-xW#P6rpNJXN zSF53I+WtnnQ>BV^_!KpI_)owCtAf z*R)$I-hIRJgS_|iq@un%xmKSYWOHKVH6+|kk{hxG%kCX<+YU~7*6+(SBXZ}D+d+DQ!w~J`5M;W(ScO#fyU!2SAV@Z_O}9Pc1~W8-%i)by#li$R(7TYENZ zFl#z1mZf9;TBgB5*0V*7*&zzJ^*OSBvtw4+VUw4iETn9GDAq6BvM5Yl2)^~->vUz+tB!dhVbU)}N4j(v+*zzi&8=M#es86%^J z@)uK}vVg@+bdE8RL8AJuHR*qbvS%9; zbV%BvI8)up^Ul^1)!r`^`4E(OC22H}FRJo~ zHT>+X8C>kc>$M-fWq?8IoxUJPTD$p;SY_P$7wKU8lp2|TCSpZqT2$`Fb&GS znEj&4-{ze5koAJ1X&@Ntyr|kP?ATB_+ghV^#$#-0^F!qysPp4ES_NsW%uD9N*DC4Y z!mCC?RW5v;@>KhUOn$D+g$K8x3=d+tp&a_1cEEaE4lKnX9?D^yb-T41>y}Vb%cUR3jU4)cFHhzxts0zHBsp>>PN*DI^ve$`I_;+F z)*e=wJn|}Ru8|Gc(KwHsyJ>0aTs*Pdv>yVzv#z`hZ(sCEoTYHk7dy<>GzUO?`2_C%*JbJ5ZFGff9v12TfWdou4Gb{d% z5DPez9m)Y3i=+3Yeb}PpV-|B;_y_! zuxV25HD4}OReP({e5Bf|Vuy-y8C1dy4s?v7LFz-Q_SzZt?^E8d{o+tI00xLb0Ez)A z*F993b&mM^y@S{+#_upVzGbS!IxT+qw}UE?uJY{Cjb2{bxaR3rmOJJ;D9oCkZI;E@ zJ=0!*Pwq4&Kgd+N$2@Ds!)&NtV1r7!rXE0w!fnbd(0);4 zfmWjlGwPKVuG$>RQ4tB-tofztSXRi+)EO#6mT50)Yu9Q~Vn|(QET?nFVyMBvY9Rj- z4_BJrw^@0M_Iuy=9mM;b?0x&zeV;*{_g~-a%MD81uO;_c!|Hl{k&Fhcw<2?2{H4}w z+WXhvT|Bydr!ZEis!|uK>;0j|466DMIm*cYP(6E}HyG4&|8f1i-p$s#(c5vF-M>j-9ykI++QZJkt=r!F3Ucop_F2(TtRc1~4)f`lN6&#kMu zUTLJcF4j=%eo9{FIe&tD{&w1*#HyCiS|3o-$4JAeHz_3K(gEc&{SS z2*f4|ni2R*?yq5<${7m1z;xcjR1NE9Mm9*-GUu92T3@R~kd%k1j}vKwGj}5*(Zjh_ z0Xt_BLg4hg$kfLP3m~F{!KKq3uURt;?P+v~FA@==AB3a_R2i?Vn(tZYk&4@A4ky#p z$G1bIy#k8IT?tRO5GFf@IREcIOr0!&1o(Geii!SpsVB;%?9TR6J>2sO<1&iFYnzv&?MrX zeGr9*n176YKFu9+|7y5+LO zLG7V<)#2NfzCbgW#hg8); zj_4ni@K^O{vbyf|vFl^d!>DRwRizgM1H2rpAGKhQBFQ6lJtkE>mR`CFeOp(4r^M4- z94~CR*C1@}qpykAY+5XZ_MNIj4_TcDR!IL56w)6vF^zOcEisjJ9kC44!C4GVbS%pG z`kU^?WQ}junXIZM5g>WQt9DOj_+2_tt$zdhHLdzka*Cf(t;Y zUmq{qa~n=8@~-DC6n>}rt!k-sJM&-Umr508U}-ldNO>_PNO|?^)z_;R8WJ0K!TAN( z+5gIWUy({-TL3HU9VCmz@(lVNU#dnJG8ent4cduG#Jw^r; 
z;uS3m0Za?kf&!^|G^|yuSB4g&`porbuG(+&$inGF+aU>mRfnLa4>J_~Qt76dqdzg$ zyoaaqy!v%??``+hz2Eq-y7!-)vI;F{Ox)K}bLmt2EHK8dIx*&YMuqF^871ft-ATc9 z3fvT&qu~1#XvBS{UG#gDMr>k{$nPS6TKOLX~g6;rg*v2i-Zz-?58g zd1Qy&O))4L>HHmgDG64`Wq-#7bjuBnQ>+vw(O1hT#?6gWY}k}z-MLh7SHRC|eztY{ zV&txXpHT=!)F57bMcKHlAc&cC0M64ZHqIY-IX5Gg2ebLHzR zC^=v}JllwxDmzDZL2W4SK6%JFvYV+KDPN|3svv|t#&w4syj^RYBa`wHsY#J@WKBJ( zTqNu#_%lpyy^ayL1b_}FjTX|jfm_y5S)+w1_rPKStdy%dKA+Jua;8UP(o=exYZtBO zIa#wsAOopw_-)J3GKu-%TG}7iNx$*5>X`XyIA;FTYN#x| zaw4sAR-@KBm_>&jtg&cB`vUG1ghcAPeVJVWC7@rab7x=Qc$9RQg77Izev5%Mjg3dO z;f5P)P1%Mw6n?#w*`RX*eGw8;iuEv5MyMB8Y(kBhqEkvI>wcM)3ty1zM{vB2{6I5> zHB=lLXN_WcGDbmHh0hlp3=Tn6PD%S^Rezx1^?AE_R@1 z_fh0sv*a~Jid0_ubP~Ty#r`b??^5tb2nH1SDk2t9oXLKXm6OlgeY9R`2gE#(9WpOP zf(N=S1jDh5=}D=1$+9l(#rp;JTO0Jtj?jFcj6N|Em=bkzsgQZ9+=_%`Iq#m94$cavP&oOWHz!~^-hk` zM(py2z6c=RL0*=v58!3l;H1)dSvERNGrcTp%)0ybQi$8U_ky`rqkO6M`?+{ouGQ*O zVwxO0)33GniErSMNW7jyoSpX9bpJOLyiNf@T-;5i_u;%K{}~lzbg_PlX)G|t`1)H) z`_Bm03b?8b%4ko@Df5NEXPO9eo5(8jY8^>e+WDty)^53J9Hy6O-+cx>0LRIN6?47w zyB00WgtbvhL7Z~Oe?6Gd*rgEqc^6O zE1&ph_sIWayGIstF{AaakVOX)G94k8GFkOBS}0!|I>~ky%B}GwOMul>-KVT9*jf8F zEiT$ger0}+H%PV(i_gYw8&+%w9@x+EX34pEX%be}K6Li3<|x9;;!|j7nohWS#HaAR z>na+-SN61dtV^eEMD9T>s4U}p$D|xpq)&-FD$=NBEJJiyHJhNNzb!|ZH*dR^uEbO{ z(4bCu3qOc)AZbklnPJf!5*P|)fc`Q6#|CdRl%s)*B~5D*3R8y89YpfQY|uBMEZG-x zk@7_`6*-&EhA(C^T5Q~Y`YwaE!|Vu2XA4H|iQtPl8@`ycV>!$hvjyJEQh8({0t7GF zF{SS8bBYWBpnlLKI#PD>C@?fQIX)TT2njy-z>;)Nj36zjZcrZRndrOICH+AXx0 zPkXMnRYuDd`YSnveu*a8%M`HL$Apg9>?2baYaK#YuAMNNH|U!*XJTy5B=cZ@GEuQZ zs@M@P-x+sy%Wj-&$;QVPscuWWR&(PFr_UE>qt`B7y|iG7m$b%<+v2Wv$=1%wtm6;b zM`99PQZ0<LyK{>%BfpO7>l`rt_C9( zwiS!ZlXV+pMr=(z6lIt%=mTlKpy`g;eq_nq7c`4<%a1H-7b<Hx0u@q*iJrI5&JTQs(w+~E4qOit2J|WVXlIk?d!!`Qb83SQxiW^sa4(8g) z$;{cIK`sm6L2;3eZPB~#%e3#zr;LR3iiH%=DaaMpNsf+b4yMYCnlc$38i_C}Rw|D@ zpMYB$GWi%0%TY2_GcZ0n0rFi0hYO5DMqsQlt=%FawVY(e1PT9%ZvT{mBNRMN0h2$R zq}VwM{)~bl3W5|c*~Mv!y-2|yQ1BfDDVrD`#At;@v4he!Q&32=p_5{4Y5hLMj#6+K zLCUt9Ysv2wN01MHLgX!kt63(K2{y=0-oFy^ek@e~SSbD*!TDFhBR{v~n9M&H5d6#| 
zn96=E)c;u6@^f<*?veYj>E~t}Jx6edf}iCJrtLo#T7E1%@{`=c*{7Ftt1oANVkxRN z$A$7w+O4JdE&L?UX4>;fq1DvziN|U>W?J<)O|8JARMA!+vjD?K7RTSu;Cc3f#4##3$8-rj4sxYD^_dg-ZiBI+xD9SAAE&?Y)Dh z4W_zyiT|E}-+NEs4vyHX+v)bG3HJv~rncGAdjkEg9ybf-g3F}|Q@&)%j~8rMHZ|Qd zzhE+XXQ%GMJgCWgH%l-#ud<}o3yAA7w=A1l@0p(m0 literal 0 HcmV?d00001 diff --git a/model_executor/layers/__pycache__/logits_processor.cpython-312.pyc b/model_executor/layers/__pycache__/logits_processor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a424a394fc04e7fcbd720bc1f9c9e711fa30349 GIT binary patch literal 4444 zcmbtXO>7&-6`ox#sTF@jQL-pelIgWA*6QNo?K(;x57{g|L7{Lr4km z^cZ5!lk()Glq9fu$?)cUDIcf3Mo&&o$(;5Xy*Yo%FCY;S-(s|-eCvsO9tiG8MCTEf zZz9}Fd=EVC+=HfID!44F{$k>cV(80+D)xeID`_)VSh9&?nDbfNQZm%cDGOPQNnW#T z-L?|{Vyov=(NS!Yw@j+#OiT=|pi|v2fPh|`*X;#Do$>AxE>{&C`RtNqo4FqpfR4Ul zru8eYH?_Q)yFxI|=I4RapI)Mr09Jgs_)*v5)w04?Nu&8H<@<8D#|Adoz^k=|y-D z`-zMLq<3-8_6=Pc2X~ooYWrW@jEqA!y{Q21+eJ$5KA(|r_@o5TWW5+!3AhWQ={zsyCRPblX(%v!oRR5=*81D03`F8)G7 zxs=w8Hn6i9B~J(@*kvD2C}(y#=n)Xr^UJC$IOq7^Wk}VmY>_y9CY{%|jlCAz!wEM5 zs#{t%58>pGmAo)&?`}hea@%5mi`sg&c7+vkYj#;s z#o{GyQ|75Nbl!3u!bYrd6TGaP;RN$-Mo-%&q!+V^1t6F|T^qSpW$WhDUdL-K6(DiQ z8rUssYZ_(2OarXzV_lGS?8p>i7CC)O7TK0?)E-B&h>>v@a9O^dFJ-HoUQJ6Ix@Bpa z#T2k@73Idif0VRyg=8UHn6`B~PweSyOQ&9X@#-m(G_qHcg=Kre%qQP9j9k)9cH|mK zFTpz{dE&H^o%twHSf+!Zzc|tnA&J(DjRMzt4Z3e{pl2hfx9?U@RUUaFkJg7q%hCt_ z`sf5a!W;V%W$B4Lz7aq6IR5fOKO2l~j3~eheHBmC; zLJS62xcWVC&K}Uv2HciRNdzc*TGtThdevrL;BB!Du+|LIRuUHZWx6%TsNMIU7vXfw z@f{6j83;PsXX4gbO49pp!>PLj!TZNqSastR0hrgNZSOG9u>lNA(u3wzd|ldA57-L0 zsOz=jxq6CWp0Dk4?G#~8rWq2lG)I|Z;A-|^@@_Uw951$)3*?{fsT!}S0UkvVWea+N zH~|fyuDRHCya4C)g5xE*g1zkc-Xq!h1)J`Lr8Leu9zK$-=NEWPs$M$8rkT?z12Y|i z&hfe0EarS3TXdLB_rSINo=$ObxHBG_j2A%4`U`Y7P$P(f{k6cp$ANwIp#!(hHGHUl zZ!LVF8a}Z0d^J3IUw9l=>#@DHm|Bgg_op8X|2eT9`{5nmSF!lLvDL9P{r>RpTFe=eWX{_)0N3HpHJ5=yi>jKP9@{?Hcwyz$DR04bJ;lU5D-@0B2AF0Ro ztxbF~{qb})mb~rzJ187o9bG%O9-9K$9lQ-#Q@mANx%4=!)cd0=6Zej;9Fu%7PV{rY*)`5W_Etg{+US8;xPRFyh7s!%-b7FJ29z@}YtTb^xJ 
zYl@zxre!JaJ=rSSlzD0{6%wk4GK5jKJ!Sbp6DQh7 z3v7K7x*KRC5UvH{j|1^%K@^>Nm{}j1EuU-1s6Sc`J@cUG3l9&i54}`Aw`Ioend;Y| z4J;^j%T$YP`+-oixw+>vVOzv^&rv(#_5HyT6o{B*#P*fh(umN{#Ei%Spc#LjIjB}8 zmRu96yvEPgm*I)aDp2S#!1zYW}u-8VtN z`U!LpJbh>&R*OtlBa^krR5dbn|NO(ZKaW=;Q|pm8%RY#ufoLr9 zECzk7_z?C4x_(iy-xeSyxiwt)i?IVO)cI@H-SS__yLk9HD6$4w<*laO!OxanW%@*Zj zZhx|>}Vx8RTk=UZzZg(?WyfQQQd!{qMWRRW~%ZGXOyq4Nca4!{@bO>!IKYhZRSFC z=0fGg0dT&ckFVt27vSJP4raqdF4hF9XeVc9<3{r_5G7iLs9Q(VPns5 z!^;Q&YixW4ulgH3dMnU1o-n*qcC8jV6IS2B4a5cDQS4XCp;%LK<{NWGBZ;h`=)!y+<*dW??F6fP} z{rHo&K7Ol#fP6^)X#Bz0WMpGJ47!KMxkfM0EV_;~dKlHAn@x4-W>W_wZ)D}{HwYf? zg~2SPhUf`qFmC*QR&*wC7L1*}0-xo0D>plPId5k{nn?36YRxQAc2hXVyUDmkYr+kb z))k<&%R}yar;o2}eR5lFy)9ER4`(5iyYo}zo|<3r@^@UMeV3krjhXkY??cz{2!im` zhlG<~qTn}@M-ac3k#PJA^u`xx3JfL=-WaI~165(5GB~*|9C{|cB?yt*3!8{NH$6x^ K{FG1et^NaleGMi6 literal 0 HcmV?d00001 diff --git a/model_executor/layers/__pycache__/mla.cpython-312.pyc b/model_executor/layers/__pycache__/mla.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a96b047d2e9e3bfc7749e5eefa73849788eb68bb GIT binary patch literal 8685 zcmd5?Z)_V!cAw=xE&qwMC`+UyN*hVGMcJZk*^PXzY$vtixUMZFXWhk7l3T2~D~U46 zrFK`2rBJrh`cUe6S4AHR$yXHChoX?-_L6=&e<}hLC<^oo7c{WMq5)i7`yt<$Du)F9 z(!RILT}mp`#72sCWzM`e^XARWo7wk!GwXi}27LscjO$O*%TYppgN*ffYQ)Z80kJ`N z!c!TNrKTwgX-CE}?SR~wab{i9E~Ctqac4c#9)osgnzHmXZP1>KH|v}BQAD5|Qu?goU+2ln#F2~E6X4D;t{;e75EuIFyy ze!dpGXSvCkP16guqhGL{_XXSeUa+121=|ITb~LPau%@BqzwZXyip?mqR3^nPamHoF2s5x-t;1>GCX?jhfOqUbrntx;HJea-Nf9L60b6 z{w50*>69R|Hx_4R1S#RuX_n1#S%GDBAIoNqNdO&S*@Z$rWSzb&*v0D+P)ST|f z<#dlZPd&u4TrLMwz;{pv8P^R_%;=seAt#H{BeH`Q{Eyd$m27@EpUw}#TxSJk==S31 z4^H146^1kE8^ie}Wlqcu-^ygN!!QIP!@}TGi_qDyF-v(ko8glACCLlAj0E3O`R^h5 zDft&y_p{d6#-;U38iP#Z=7F0 zzcr;s5@lDSQNz0COym% zzvUasLygO4JbdK7YdXlc-Y3(|&#QID#UF&$p(43WO^2_MM7!Q|6%BU@>~_U4n++gu zNnAcJNZhXh0pLw!jQzw&!faYr1PR_yPG&4bVHA_kthe>Di86gX+V{UpzoMDtXfe{x$CBE;qUIx;+RA!M+=Vlp6%nQt6#(cdKNkiF$ zs0h%GQ$VdGfws(&(KtN|E<%8QL8Xx6Ze)aEJ}qO;WQEimmrKi8?9v*xq3ni`;=toz 
zBS~P=G9%_dotOhUH>5}ce5hGTNV z`^+1e#o1XL%N6ux^#LV)#xSN(+<%%n3JSQA(qn54w;LJB-at)Q;S90M=b z>^LLhy247FF~@l#X_)#ANdV6Z{!lhsjwG2k1PSbtWsEm*2;W4;eg$zW>>B~h%9uE8 z4ztC=q985pZL(*Gj4XgZhWTff%sx*hnI8(uqLech9QF`cq-FwBU_$p>c1te{x;M9& zH9Ul@yX6#@5%loFO*RJ`%=8<4I%|~RmaCWeZq;f30-F&fj+MCFO+5hkQs?~OXlp1S z>wy$TUM$!iLN7f4fB^oe#(4}o+}1skG4@o@^hZV-h7?1uoqvGz&!`Qe*p;in0{MlT z5GxR*NP#L4wiaFzQg8u>ZH=ek2F_)3u7Z0R7$^e`=2P$hX9d5c&;%T~wOSnw9Bp$B z;1EBIr{JA$wdi^d5_U)k5^ePwivkJ1O?w-;LXfaSNYK$?N!N3bu&o6WUG*A^0*Pju z_Re=(bUg=&ki~%x#-i&vNZ>jc4@mS{bUg=&0~QAo{T5x%LBbAzK;m${#-czXV$qZ=)s=3RR%lH3FY=~+f`2H?R~bV)ZPhuO6|4XQ);i)u2NsYUkLm;9N_)}c5mZV zZfx^L_5lqIcEMjeAJ@hMmWWw>7XrZ9dv}Q>T;PR_0Mhil%kCt2#ayCUIs#23Tz4sf z$si`UoD!~vG=#}8CdV;B`$~RDbRU3KEi#pG2T5qEL?^hkMSyFJO4GTt!m<)>y1HuJ z55CR_kHF3C@5dMlPDa9aLqdZ}xWgq}N(tX=qi>@t31308XWX88(;O$mj3nt4mLJ3f z9gj|WoQAm~tz9ptf zS~fz{%D;p_@aucz%WmRrUTsq8=u^6*(s{V(dP+wt%yG=)cHViklsu<)o-=DZ`irig zhiu`l0W6MHV!cq@T#5E+(PL`#7%;&~?~vB}irV`MO7v9X16urLHU2Vk9eV}aUeekI z)HYD{bAP2fp>>~DyHBHBXC>CJ#gb|)X=v{_q;(uuJ79b;f<2Aed$smKwS5q30(;s# z^M}^PR>w;IzJHE%Y@T{J_F!x~{xmXL3XGN_qm`b%hlK|Pt!Gs287+=K4RlqGjA%zL zsz)v&*Z;54{z_ZNMs7Vj|M~D&P`WFE?+)=PsFVa&I+4g%L)puB>6}mwGudUR# zSOsii14G>=jl!7uo_f7S6)4NT5c-jM>l|xULk>Zor_f}tlNY4$fV}T1*!PU)&bUS% zxhEvtcAw+Ehs!}HAjIeY2~XmHiA6Rt0LbJsX~hI51F&qU&|SCE!uuwuod!K7Y8d!& z23XyNrzkySotz*DhG1SWz$D?5eivJu#pE0cI~UlL-c*aPb!tI(;av`-Y+lfPwe$T= zplJjc7D9g+-zXal7y}ISW~(>zoVJ|&HeA>%p8=Kf0vP2UsrpDLQgl{Y+KNr!MKyn? 
z>hCQ1k5s~~8^QJ9X0{wYw*96W9xlFW_`Tz5?D!|~a_s!4lp6azEp|zbUHZpT#8*P$39rSTpa(XAJl2#UNzjib)pO>ED4-Zk$~| zyLDh|e7nCK87|V5P-N|c)ekmLmqYzqZ>pgqCHjcr^iwrA469(ec64()7}abL2fhMW zzV9heX;N_fCH1EiTy9WsDK1TdM8auoFx>?=XtM6P31|0NSw`OvJI3590c+_~i#)e{ z?7MChl>RB`k@3a8M=GJ-qN0U*)ll!&cqw#p`=S~;S)xyx^VtnH7z-o6^D9UFuU9kFp0AV<)Ms9r{Xs&qSJk|*6o?~4b1RJnc-)guNqjm zF#*7|;nENN;X!QQQ}FNIh1ZDm3Wy{Q?t;c~D5zti9x<B)$XngOz#MLhO_-V)8mB7z-u>`{6PG*HY%4H(;W483q3UlPi$u6f0dq<|;Bl zcK5doX#&b*JU`Zv({hM@%OQ$+W%=lL^@ku_M4f+Qq9#oO;W}1!_B=fM;Out1+&TJ$ zQajHSuT*-DYCR{^o)e#((oSAbPhR+RN_*|c>T5qPU4K`*o>i}BOQNia1ywA3MM$1{ z)A5CwRS7xyLkFbS9dA2IJtyE>2{Y|b16{>ei^@k+n0%yUd2nSr z{>1s@?Mh)u zVt!3R7bM}KM|a9_{2xUYVNk;0)xcfp8d7gV0vpdfx~GIphGhYl@e`no2>*(Ca6g2i zn4lk#zlP-N`{WB(vzxBIOpYC|jJ#ZFi+<60#vOllFj{p(4nan1d$kEEnuHHly-4{; zXLr?)Q~;?UQq2Tfgpdl8w%F#W_37#XT>}D`{h@2a#$c?Eu|y{I0em8zt=! z{-|xk)floJ-ib$ZPFBzjH~fW?KD--V>=q$ujQ9zym0h6MhN41GrWJ|iQak&DkL-&c-i yYN$#e*>MtwziP0(#1Xj{)u6&aTq5EMLbHBz{L-sY|&&Q zyGlYOPDM3sMK$S|o~8{oaVKorPMu7rb((3%({?(5pcFKYd@84!(@C35VUe!m&S^W} zf8Pr5QL@wIoSrj}#C`YO?cL?S|91a--=F2>xj9@B+ZTo{r#bGg=|R2hif8$k1&+JQ zg}E>v;YRp=e#Fvm;py2Dv5wgKZ6iXzFkn*JK*_eN?*g8f0}&x_QJ)c4mje}1H4q_Mw|`F)Y5 zk#+s+nBN~+KhoUaJkrwN!gCf*tP(eTotOKef1_v#;cBGp z&{77e?+{vp(Cu1iy%JiB(9UpB3=CPqb@1;{{PpnfWIfV=RyK+SXC$RF3^$Sgx=nuU z-_6oCA?-SGx7NpdSm=6$HXB0svd|WUZZL%IW1$-ny2%i_A3dKRUZcJD>QSu@2UxkS zD7Wptt>|KDw;=6Su}dqVn}xO`beq_%g&t&~9SGfS2tC9?I}y6W5Zc2+cOrC`A@nc{ z-Hp&ahR`D{bT2~p-M79Eu(bP;_JH_+R)?DSZ;#hP{=8i+&&PDaGEwEG$Ok$F%p+v9X9KDNm)R#8_x>LXyPMSm<0NG7=h;bMa}< z@aS+X6d5~pN|aD=A-n_OP&5`Am&VSBgRwCR&7;y#+eZhaff1^qM+rDF5Iu{>Qb~-C zMb3$#v6E*)C(nnTnHY$S4vdJ1QY!MuL~MK_hMrN2Iy50g$E37RDd?CO8(>{vnAV(v z>bd1cgkI%DuAdKc{gyD_Zw*`eZDDJ_5VrN(!$Q9!Z0~ns0GxWw&{>n)s24?PAT~TU z8XOvvg5lwzA(46^7(;wCI5rd9|`}hffLT_){dZ_n!+IIZJ;YZW9V~4x@u5-&M zk^9F7+hQZ*ZR5k^n_~kK25Iwi6FYY8Ji9||iwvJ^8$S=Y8EvBhYs1ULNJxB69Gt+I zv_%Hai&C_WDlbZ{->nzoNKyd_aj_YG=r zfG6@ZC|bvAFvXcc(G(v-tbCp4uJe7@d1)sefz51DF)}3W!b6mbQXxP9a*v#!EH|lP 
zD~mcnIs9-ga7&HNsm48t#yvCk?>XyOPqjF62bErzww=Ummpb5;3dz||PR`)%pfGZF z;}>m!gTbq9jN1Zpwxq3b$@Wmj#tVDrj%7I97Q!oZlZS_LrSnb?hr{T$C_qPyrv2*V z1kf=2%=$2^3kg%QF06nD6OlLT!sDE@2hl`1Kvzb_L2<;hO4>Y-jvcvzi&10K}LbXYGq9)=91BB>Yo80BCW z(yO4U2H~#q=RMqI{<3AzdWsvge7ua~p2o;e^V1gdw@iU#O<6u;k!64@mX|7Q+@N*Z zGR+@HeqQlEOXy~D-=h(6ASwob1We*mvp3O!rJQW93}sDQ8Q<*|Jc# zaAIK)zx7FH`%+-vT}qhYtb+ahWe(mbq4!s-yYuW>oRb()_-->UGIuR^el z0|0UaZBj3sERZWScBz6djv>(#l#s9|nqJbBD(Og+bX@3L@|4GI<*NZtqe{Pk99OxR z29#48_)zr#LY6@b10c*~fR5#i5i21tUC;5;))y>ORsc=0jhnVjTVhl`yQX**91USM zHS8>~d6dz2Z|Tk6&~U&IRsP$@+Nn&Sc513w`f6xtdMOf-Pu&yL#iyv8Cd=RPnk*@w&Ko z-Aqs1v+b_Hx$>5(H)aG*aA!EATXAz<-^{0yo|>;V%!|La;RZm>258%K+vC5``~F=Y z(y*$`mUi20X}iqHW`T(|3ZF-HvPU7zhb^jjKqC>hQeR1Kee$aa;W$uP*rvp4{hk#o zD6uN1#Obm6sOVuW_P(?iGZg&)h|KM9X}2Vv0GICfrrK17nm@X+~mnH+zN3hSl!s4TSvp3U6~MMn&ES-UZJ6hQ9O)|N}oXPQ}{(6g>!*hv2(?hbDi`17u%CX zofmqS97VHJHyjPOeAP)`ZQNOlfOu)g4M#_I$hI4fZMS^uli+}yEw?=7Nl#_mR=Ikn z+3*&I;yWM34j(UWjjvxztqpd{^Uv4cuz%nLO2}-={1pHaO13d}_Fw z8~iPxu_OvG0oyNzAWV)%At6Cpil+07<2#yejGh!Fzfjgm*{Wdx#}Flqfqi zlXu%u9Crkld{rsm`h*Y2b~Rkz$Xh)2x1!cHjGppZ3>rLxUqNR(A~inx2(CSN5-_XcOV+0St~WrqoX?#VNZ?h2`Uu!M50>Sc}yG_rN?wp z-&pLSk#W#e!e!yZl0+&iHZ)Q6Kc@)g1x(@|S9oU9HB8o0exN6LFlCY4nBa@Z9i^Fl zfxGP~{mPzH<;Fzi#v7iEw~8viv^(ytUZT*K4_rHN!?WR5QN@>b#l2N)hONqdC%-P< zxbwBD*Ye}_4c@k7yQx` zoNTh-0jYd;KnjPT&(*|y6^`_%SCDa;rXTnrFXyY*y0aT$S(UOOfORJc^=qlKrmbKv zC{@li#ZPIhi7NS6SgoZ`BWa7x&=z{L+#q_T&docN7EP99gNPD*9f_kfNui{2a}^s> z-JoDvQFHvQv9uhG| zOi3l3BOg;^JP%)5U@Sq}eggV}F-an5OWPvDU>c}tQYkqvlCz#1@bVl}!bxYzN6t(5 zMN@Dta!W!%#^M$p;uky_j_$Ad?y~zbR8@s?RHlrx7lWz__4CIUDiclH??OXTPYLAE zjGY&nC_irToky4H4*eI!3PSzu(yFUdSEe!o{0J>ymvN8}O0oPI@p9x+B;l%@6JL&8 zgJ;8C7vhwXDOJ8*IG_~(KMJVaKx(xEYZR~oJ=+1f&D7FC{IpBuG?yn4r0do|%b{6Q zD7FQ3jdZs5GbV^WYy$=7MVEOjqsylG6uJ!kZ@-)LzS0;fAghgN=Bby4KWKVfuUq=|ldzQ*#F3L?I4wj6$0RZBR0NRoX^*U#kYn@Z2Q^ZLfeB?SCaBC+A$@`m5(Yk*YqFz`w62?vxi7`mTF&P(bc1zlhSM&yYjYTKX(GY*y#YWd=dy zd=oz=UtZ*Biax?CbChP`BTz%Tgu}cxgXlq-gUo9_30GVcDUUe4;@{{b8lk>f#c685 
zm_3Q4OO%>*qUmI)Qu!>i+3;P;dzl=DOuNWO1kijI1QCN-KyVFT?j^&us)`Fh3GpF- zvRJF70V@9luZWhwVNfZ<-N8il!G!Nn+p*h;h)N(G#Tqaq2t7Is z(SwG^;av-cD4pD2^ooC@Ff@v)-4}*PL3Y{#L~QMozDOBUeEi+_xp}+;T6#A}Pm8*_PmZSoEqZ@t|taRCXvxxd5b8e$I79lb`em6@jZN z6!i`c%yNb%Th0ZAu1!(sg78k;rmRzp$U=4Ng1+A}t*c>evabJwAbPAEYlz-1UBNpU zjs#I0j||7sWzgV9G^(rV-Rejv9vHe8Wx{}ewTy;#u>AzoG!>( z|WzErw3)4Gw)i{oFZlDr5F4go#x}H+I^?F z%5UA5RH$;c+5G-Kz3G0Wg61Kc(OqDaKJ1J(XO~RpHg7h!!A;4XS?gHH>fki+zWj}>m&uID+ zJzvMI;*&6?Kq(hj(;rG;}gDHh32L4rbv(MoQt~k+cvdU5G@A zIp|^t$E1CRl~;yNEU~e)JuIFB8z~XbhD8^w9Zrm-9Ye4difwC`egUD_`N5Fq{-jc!LWuEK8Q|T;?pL?imlH^up4sRadI!MivWx zzv&M*e{1v2f(K{px69V0%JwJ9_Rr)k1sYRQh?DUa6`!_*PGsROl~gXTL#!?`#U7^%l%3@&HHo) zF}ZzehJK(yhdiz_9{qM$QdvdknL=dGxkxfM3&$uKt*+e2??GCRqJ#I6eVinsu(VxX zqRG-)Yk2w%q%bP{Yw&r=cV;OU9cvy(NeT9Fk`i$l`0^nRK)St?=T%-G~6m0K1B8I13=QLN1} zg8=Pg7Bng~yums5H6Ln_VM))dQ zGx8$BCzbVb9l@!Xri@1Q5RyOvjo~}{)sCB<<~MisB#RDzt8%e;wnw&en2&w0c++c@ ziK4@C@8R2|EUioU>gG2leXVh4>-#W&C_ch7zWbNTx_Mh)OI|L<`Z>Hq`Zm?kevBEM zN(5}O*7PFzK10sy zCEuvJUPbE8`q!HNaMQOotdR-Pqv>|}5)`62%GVH6!K^^N5n;AjsJE|z8Szkrm zS;5-bcYPI&$r+U#8bb(&)w`C;tHzT|z`W#c#PCj&YisEpCFThC%^MIOV_vEhpr*O89>8FSx*k^5fEP;E{Ys(ymeM z<>n()Esu~^$Lt|$qAr!SL$`eX%gQXiko8Yv8_e=P#X~t4#Wu;!c&}HS2ZOHgd4L%h>Dd^j=@5_xd`y z*C~BguM<4dwLAppb6J=rZ21Z-yuimY+u<2*h==`fK6VZ`M8QnJ1Gi)v4xHm8yW<(y zfT* z!d4ZWc_?wA;`p#BfZW*8_sl(_<5naPXuU+#lS+%zUzK%-DX1=%(Nh zR#?e|=;XlQS+Wp{$_!d-P_dPT`U?@mL&4Z-F(}&tPAD5bU|od8BMF;WTCsIQz$%I; z(GCu_<0T{!!zlH`NqgDL)ZGTI(?g0^`X0sF<;^i^2NrxsQ5AJLm4-R(G3=&EyF#*| zTqqQ^Arb=Y1$i-&mLDfe<(YH{T5vMOYgH@KM!-+7`(w-Q=?p zWW~0bqql0Bmi(0|e{;g$obtCP{Oyaquh&7gt<9h5dFc_zfBxdjpT6|z`MMjvhFku? 
z)%+{@8H=T)hkw^u=2DbF~dKJzso8o<)RFvY0ojFheFJ$$(s%noKkMHpJ&% zqxn#{YV5ln8|F#XLH4bHD#S}AI!v3XYji6&2oCDJ=FH%n=!&k|k*Ug`;$4iifI>qU*8v<0lebPhbdKMUc+DfjR#3`)?H$U)^e)EA=rYvg+xp|>5}1(I6j84TB*Vv zc=|CIz>b}FIiJw5a3Vvu*G{}kw@kfLXjSnqyfXfMkAqi+94add{}SZTwLA+a8~@T; zvl&LhMp@Hv*+_oK!Z5Yvq8EF+)+hm(+PnlN0~;{4UHSu5OF9iF3*HIV8N6?SCkGqD zUNjdQ1G{PPeO|Mp{!1#FjR=*mf)n}WA)pszBY|3Tky1kjy);0?!DJ(Y+l9q5wl6yL zxw~5W(;C!>+-w4)sNBmq!IWh=)-`Jo|Hd_Hn>7g@qbkd!AJ!n3pS|?#?DI)qL)_WG zs=FGFX<{2dnDyW56!tbb--ly_T{e8=9HsZ{hX=5mu~>zYxt;S*eC_F1o}TYpJp6mT zzttOW>`GL2zl&W2B^fs_G^$VxuL{NR%6&)$4&tX>>LSoGaezkYgmxl977?JIF5Vng>(Zzi185W9;i<(VuzgsH{{DeyF`#n@vxbVIuxeO(F z#k{av%olxxVNo~OC~D>T!(P#c*aGk^d1SkYwE1F@SbS!K)?Up+`ORZkD|})AC6%Hb ze#0BeAbl03?J`B1JaPiGcNo@p($K)52s2%_11rp^I>AAGWWI%XW7` znZ{Gdx!&Gmtx_ARqOnX=K@FzEcrbFfYExlYZED&b<{1;?)rrUy|Egw>2;!o%QCZfNbyg|ZtDn`Ku=4NzwjRb*y*{+7juUq5%#*Y_d1JPT^I-u()S zG_v_-wtHkN3&>P@8Jid=H@nO>3xLgXs`2_=?Rkg?9yL^(Ay!>mI)%+dxlGgNkpae; z1Zu2buUpa%WqvZ~#VXDwirYrcS#l(DHp595naql`_xLbb>nx$rV6(v3b!t{rP})v?S09@M`j$dndpq|j<0h5fyKs~z8!Jr4n_&inuFxeol{_oX-o|B z@E_@sO3X0;NPB5CvR|0U<2 z$+3|`e1MtVvAj%hkzzkXPANIWmz%B|{tZQt^91T0?Sw;zFFbmi?2YY0&4u@5?JXn&MM&MWl3BgiT*g)|sUwHh*ZL8!i6x&}a11B_*Ws%8MS0-O43nS4;e zSztsHp04ExoLof(Hn>@bY)tWBgW>^d1RQQ+YT(X5$uve403micAa=Q!y4Ve%zyh=k zMld27vFSQIorDcHG2nd<*l?5FnN;K;O4BY4WWxx`0$G89Di2V_ zSx9tNikU8-fvRZ?167>X15mXARGkb|T>w?P^kcj#i|wLHFo^tpc+6F0x}2@@H;^LH z7`k|aw5`U{hG-~*Riz7-JIvICY6+ushK6Bz!92B@Jsxrqf?^jIB`kfC54D0BjIOzR z1CjDt0O_zKvO7+@U;!r|ewB8U8X&}sfQI^Qqb$iP{}Ey*uAHK^o+|m+}c>)}<=i z6BX^LihYTSefOJ*BrdYS+YQ&qh#F5sQpL{?uAip?G&7urI)>;*&ZQ0{Q`dNNBJub_ z0rC;4%ei!QB?M!i8mSwEKtW+Om|lYggz2@-jQ+!P#E&yHglU^`o9abwO6bv6-PBr2 z&&3Gou}iOkAxuzHk{U!lXG4#T5vwTY)zYcy6~#%d!6?eaG$!hs!mccuyD%GtQ7VAn zt{xzRGYMK^6vZwwAXUc>{ouH4SB|BD)fhXL#(@&-7c5hbQRfspnM*H8IaP$8rZm)x zPQ`$odeJ%MG~27K^rCCpY1&?w+D`SZw6|=^CAUYSajsTtuiy1vCb)tE$=E_yRtw!q z3oEo1x~JT0mg`V!GRjXmQ2Q#t0JYdnZ(+ypz%>6^*=L=$PdgO9W!kBQx+%0p&=+&5 z-ZZ7Q;m3}qI|>{}MDF95Y}9}-H^Kihf0gICm)tfij{c_gS$VfE;|5{rU}@cw=EFwW 
zEbZERN*ooR8<+M^wwmD(g)?j23nSivNOXUzmi|q`Dg?0qoA63Jzw?!``GT2e=4>xb zzcThdTZw!z-_J7>UuOE-+TxSGh4ebn!{R7TA3Q5vr_dslCl!)Ia(&tjD`D)t#1S9S zv~^(gyhI2rZN(WyY3tRLTvM9DaudV&WVFj>CY&E zFccX$6;1O)Y*R02y?PmB#L9d0%O6uG{4N}bt~fv@pcrJ`s;GT=X>YQ8 z-%S2HcJ5*RDLz^8)PJveEpKjLs&ZSha@!j3Vg0#plNDJ!|HSJeCki;9;&p*HT$jycxoTpS<_;y3jHOHK7ZU7{qx@PX7 zD?JMxi`K=*7YFcjT;G={-nrD!1C>}o)jJ&0zvJcBZ%_KSzcM;sFk5i7@=KNTqccL> zza770S*TotRYQ!zs=+6- zYK%)lcI_gWi>~Egfs@6mvEwSS@MFb75gDS`6+;wS-n1DG&|oqHj>B*wln&0o(H8w4 zEWVf?n}MYt({RF^jJfk*3CJe;A<}wLP61=^3c=vzN$(@c0L|F$AUj49DyM-M(=|nd z(Q$E*4jsax3^2|JJL4xvM<)d3Eki*vcM+qlZZ}QP6Q^NgA(s&z!*QCVyBZpcM8=-Q zO37}PWNT~i@N?`GR72rxV=7MHl!61NaQx9JDl$kGm@xC8H6onWsi>xugCbyX#T3PDSvCY7MzK1epRlBlJUki|#Ok%&$ z*G4Y0#HeG}*vo$=kgh^%1A7TWlG~*q%pp4c09P*lLR)bm9Hra`7LtPjtBNNM_%Tt%8Qogo?uPy1@8h37$CtfRx&r-E& zgR>JcAEy2~*OAi#(vz;CJzd=d1yEHTs)L$PiUM+)uxLxN6FucdbkQ5S;biG=G(oEm zF>6cB;D2e%aIpnC4Y=emaIpa<*!)WYozPXAPIafVioB zwAjF;?9$4skt>n;eew2#$-to*&r+a*tcmA6@y+{^f&B>3EoE1o2qD!`70#050qDwt zc#b}42gS1SxJW*xi)FGpb+JJq`!+p(ACA#wUfBMPz1R09*6n0U zPQ)-;k#UfZl$`Im;FHIlYBY$ScBv9DcCfFe8k(^{j^Tk~jsU@3or3+2EOP`HA;84Z zCAt|U5Xc1`a2m`9Jj~$0C3-<$^1>`dZjP`AvA9ZW!UnVYqy`Zw@ zy=Bzw$TN1v8Qvn(E`{D8TrMq9-n=E}W z?tgHppmc7>je>d>(UmCedc7)H+86it-3bJt)Ll5Tuq)ot6R$g*3>=C3k1XXC&3pnI zwBj9yZx$Vidym{Hs+>D=lWpgU7d2otdq$oWhLw7fVg|J##e)V+w3#S}trT)?5Lc^M zq3eS2h|XLxR#PhxBcZkTW53kN5M2ck^1 z0S))Q>|8AEEec~wT@vI7?Pqb^<*UF=HQ&b9Nx0}V@__5c_F-4PYQTLbu$d`ymlXb+ zR!v^=fkR3KS`vX4wn4Zt;ca})_WF|$(>NENttCNkZ8f?O^AEmadOOzFJ55#lMVO!1O5_?Br zwv11lpjH4<3|a0)#xE)k#jbq?EQzhvF`^m|EU9Vu&1|D^GMe%RQWl#|3Da#5hFqN4 z$^uhaYq(QyLDu^?vx)M85A3;V%p{mvw3mo1y9|nMsI{S@6$~m`VM;KYJc^3eJ}u~q zXoEUcJ5NAupF*FirtI>ew^9-6R;kuNzZ=@+m~wzhS*O`OR;{K%tL*BikUIIYD@Rr9 zoOVsQn5x#Se5YC?sA`>xs@5@0dGIzxRV!#pS*W0)l!dh>^uU~apn*?D1dQ}B5#b_5 zl@m(P^_qWAVcRH-DL3II?=ro=QuDUQclp4j198tf zQfanbYg^cutmtG4P$)Z<7;ouZ2OVCS@NSqndZ&D&s_J}mk%JzcS=lp`6r~prf>Yz9GK8d!ahj(w%7OhBsc{od_J1mut#Xp86YL<#SORTVs%=H&`v_ ztQA~DO=R^E)<(^(_!xAMGe%Wk4-c6m6$zC~mBE+yUfY|ol7Hu6cE9b)zg%^xYDGYp 
z48@c)3yM{~WYxwf=w94_Ssbj*v?17FBFY*V`Pmlcl{EqZdY9}q0{Z-o3z{1DnE+mw zb7AekOKS&aahM0Ro{-0ME%}h_Jknic?=zSIubK{;d_pjiBcIqlfP7-jmQM_aE13bR z)l5EsWMWxEGBHhW#&F$NGBGddgGeSvq;_=8YLSGo84T+aS;-MeWd5UwGAbvtvE(^o z2z5TO4o1Cy{hFI3v_*i=5$v8rUJ^ĉz;B97da<&ax}b*aGiL}0rvhX}6^ksQ+b z)0IQeOc|$Qah%B^r?K=-mN-9;=X(eso7Zlui6k;d%_gX#qXvUkNG;@S)h+2QcvwjJ=hy@KKOPW~* z;umG4yaz9axkC;yV7#43?P9Foy}3M{Uc5~?z7NN^$Us<>q0-M%LnS#AInezVuteHb&;O!ktrIkTjYp=?uF2!@75YJm|TyE%~oVe%q%9$Rd%_2)BUKL(Z zp|-70!?M4Fv0f&i0+u!G*}qd-^QH42Y_T<4D9H(+Zv@mHi=m-SOU=5lN zX0Rr}7>zU4b_=?80os9b7;;%*&Ii*qMLR&|ThI^4`$PnJgh|GiWqOk28Tef{o=Qla zWDw`cD2d7RQK~QzUt9*sy+9uY1-bnfn)4cR%Uun zZan*G2l-F(2JAPd+7;kU>?<`4;}!wqdgUFn46hMmpZ(oSLIeyM{ERTJ)J>3g5RM5x zEz-A4YY6qv2!$P@O!q)xgm_r3A4S5S4hoB@G}82x5gM_t<3|@1s+!bdP|Vs53u{1b zKvGmgTs_>!9|uyy6YNcy>eEb(3`nbPcHVWN7%1}kWpCC4C;F!LU6(!=C@ zmYj>^e1;r`PYIQ#JyC3X$46yO;RCDm%~kq7uX+YGeRGvzUFP~oDW_o6u@o7~kx|&lJ+Y)qk&r)4$s;)ax*B$rP+=f^J ztIvjb=Yz@ihu-ME(SB^HwDMQFp-GRCB9BumYW|YGL*FGb%ziy1qLSY&6FgMm?Xko4&w zvQs98#sh}N)E@c1cFJzpDO;JHGBGd(@E778WV2kvn3-Z=jl6Vz&v0~nAU1efwojVe zqdvo+i!4ak#`pnjh(EcRkrqj}Bqpma1%=4lQ*G34urs303)zVk!~!|`QQu=vy(kkHsc&0khQ@!B1W zjfm(zT^R{qghJ}@d-^Fk0)@|9ay&vw$0eSr)+f2 zqNoH13>G%QW29}6RDmvXFO^YKb!`@TL78<7wTj^#MXV%6?PPsUKbVl;eCiUi3!vjQ z3uil0Me7r6A4%NVEOX3jQ{?@~#+YT?bFV3OK%bd$x4tf=w}Um|d6cn+akN=Ol$oaz z?*eKFDNdMqbjsI|@HNc$Bz!F?-@b%z-)muf{6qbakIpZeD>8-vYHKyjXf~o7jhLxh zvv_;t;dyVjeTR7n8cFgT#yD(N4oGWZ)dbtH@3rswttz!gyzgn z*Hn=KCbXtb!3XL!!w`c>jK;~UZ<=;lILH84H<^No$dqdmZ?E7~r3>T@o2f+LR`cWR zQObqZObajJkPP~QIGx!`kUm*x{Dvo;IkG!AS!jYTJPaq~iVuWT*|);eR=~K#a0kr* zMOXlnPQrq+jjW3zy2&B1OFLwf*=X86HZ+9IXEKfvPDx`EtbWFW=Fhu$LtX}gq{LWeJ|is_k0`h z(xy~%SE9Kq*?cg*{t)4%!v?&B)tmCFuk_59eEDdyta;|4H(T296(y{0%m)_ME$&Ke z*cb{OBKxO5~_%~f^rnIPvcoylJkGLTC`Qu!EP7sh zGQQ{h4KoVFXsl%^)0`0_ zS(@pz2nBP8-oudt1u{9H%C$U>4VxziB^qIN+D+Owh|O6z50^G`wo(+4Wx30Utw?)l z57roF^r%D=86TNCKMt`>qUptS1SGJ_pYN8>t1nU? 
zTh)(j4@A;#-7lX%?KOUST=G)M>`>iS`7mFJ(5ig6k57-2kK~p76w*LP@X|-e#Yjke z4&MsLYvo6@F+4C%$0kX%w?AFPQmPb*9jX+S3Mt0TV%R}Z40zI|EMOuw9ErB#JTrW> zIK&?0PmB9l?76M23Wi}DQPL%9D7~Q--#NwG@ez~&ULcbpj2vQQ<0ljJpZ1Z;~W2qYfo_PZ*g^Rab<6D)o*bXf6aN{;!6ICtBrHDZ*eVu!#$kf z9)62!c#Avu7Pt8=uKq1<&s$vcUvXV;yNYL@O1f$;*l&A_mjw6co_^tJ>^d!Zx$0UK zcFZP(bql_@ZRe8B^|{^`dS^T4gqOY7yosWwgsllR5Z1qAwdPqaxG-68hz`p(m%ku< z!+YHu$4A(G+b?)m><&I}cKr&6+dIXa#WUM@lP`VG;^9y58ID{i-7RGq7Vu7$oF46yYmgJMsGF#J)$dJkM8T+-|;L-Us}` z7cBPR0}~yGm&u>0aPw91ZHMTgDpOH{hxUWZ9PTSy8u$aV2d_SK<)OLg3WtZ4UOtaM z%+HLhaCBb@x?rL=GrhvmeWlD#Pl%)Y%DQsCiK1|OXD4URn>jJNoi(y}Ui@0*l}Ms) zTTUpjpkBZ~2<*71s&}?@B+9&{?|@*$#hTg~M&-4ByTd%s%-p9%Rh>{{V;^ BA>aT2 literal 0 HcmV?d00001 diff --git a/model_executor/layers/__pycache__/resampler.cpython-312.pyc b/model_executor/layers/__pycache__/resampler.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..61647e04149053217e5a48bf4d91d640f7140112 GIT binary patch literal 11454 zcmd5iTWlLwb~7Z0&*4K7Wl^#ukNgyEnTl*@W6O>=a%?AZWF?NBWTSYO9dkw!X+ETx zk!2}VqYb=WvQ8?hy9?9U6rr&JEW-w({V3|6qUleOwm>VH0>qsxZLz<@JEZUD=Nax;r?)%PpU;d-hX{R7XO_#&J*iKQu!%8_hN`(hL14Uh-1Zsp5 zXu%MrM`*~6QA5l)Vx$pgikf1~2t&$-sCmRf`dLS;B)7%vAp=dt93w30!9=Yw`-q*C z%~5B}HR2*=OVl0njCg3uK#7cKk8Cb)q>>BPTeh398%!gOf?c#mVAr)taMZ_(-dPJp z32gn=QM2H@WEyFXH&JE)|VvFb%y`zR!Xnf*qR-xe%Gvd={Yy4l% z)&#S)+9;T>o$L;FP`wXwvSzXEJMPXdY=HST8mN~jq2&@K_~5?xKYrGmMh!R8L3*}A z&o;3`$9B#b{2jBuvY#I3B~jocQRZV4QBmS1L@6YOFF+|SD$@z+JU1mnhj5&WO+}S( zEFthw4xkv7IX*7fB{9k?&|zGR!cdtTO-P&^j)xL5H<6ITN;naR0b=Z&CjU% zk6k2dm1gPZPECt(t{dU*H;4Q^+{th}{NigTcfZN)2|Q!}3ePd@r&a5Vd^F0Ri;7UT zOz@Ht=A%$F#Nz-xuZt7Wa0s_D6po9$1h~diQ+!+r&+@p57ZdT(@YvXg|M8c(um5iO zFf<#*30bx9=j0%4RJAH&N>C2ZimF)>l_@D6DiJHWMk_$P&AQS~Y3TsFYcnO45qnT4pH8l%$jNs9}yy8j`@`NZ#kW zDN3@FIw5ybhxt!ainBaH#8oa;a@avbrB*K8EEOV6WlSX(46uqpFy1WPK%}|SRxY8a z5ex!T?o-|+xH=878EA)yYT0WAR`pA;p2DkIaYZ%7cx4<0OiGH>2=hoAa58%sm_k^c}HbNqX}M7Ex?51 z{DcU^ElRN|Xt0bzyCTL_gOc!@Bpc3#LSMDYQiu>MnOw$u=OF#pE4@l=qIV)Zv0LG# 
zF;Urlaq77P`_Dfo_C~|!dM9R-@kG4$LNprdB^(?SFN&clB_Z`j`58D%uPz1x6Ems< zCJ2_;L)r>U?1g_h4`7~pXr(+&Ki_xdz~X`ITOXgjdUmPnm%)6~?z}rN|8jwCS~!_L zIe+Y~r6JS((Ti&quHbFYj;}QBNLdROcV>TfVDaD`OJ{EX(#6~Kjd$;C=`Xl9q+DNG zZ41t{vo!wltLyBRRd!2mAa`-ucY9BsJ-p5yUuBPfHu-ti=ciU)JC$c&FEqAg`xbX) zFD}tn-(6|gRcL4{v~7B5V{G=XS<2p!Zpk!c1~QY`uDq@Nw>FPfzufU#n4;Gf8X%|Iqss3fS-Su>~;19F&b zI6=KnSL8#aT8iKrkjMzYSNOP|?6_qt)A5`!X@oJRS&#vIpffR! z^hLC$$cM&tok>Oj3HEb57Y)k_ml)OCWKKzN=R{5fEh~sZKj_<8k?TI$P0!=4=!b;D5A55Lu{CDtSCJ{%Jy z)FB}IO#H>LJZl@DM)df!;J1Q8o>0xa#K*^o1Ow5wcaPKw-Bq&)GH?ob3uK{c1QS7Z zgn+p~2FBsvZ3_CJ__Ic-068Q7v$NAoAi`0mm8vBO@u`g{Y(I=E_ThZ^h7Y|Az;_N=m<^T+O4T$$~6 zEWU!PVc~rGe0KBYM9O&Adl%lw*af67>7Db_uI;-QUlH;|bexzkJA@~&t8fSJy%xela^1&3??gtQGW$%OY% zU#b;!p6EZ}~LuwLpZ)AV8+_mP~2gC&HATb${m@18%*Ids&LLWAy56v95g`#p$ zNt_31V}iYw6P}r9!4aYbtG+s|=sK?~!6gRm(>Z3&6t~w}eu!02r;-%Gd23SAr06Tx zll@hi()^T}nD0E3rD~>nn$MCnmrplmt?FA(_klBg%T(1{*JMd+)qd*e!txlYF=>sU z#I8vKlQbonQ6qAaW_cuyf|>Bu85q%2(;cc{IT&!vSVePN72RG>Qb}u-_O~SsQlRQ= zXvx;3s#dT}Y7?wQjTvs1&YQFWPgdWAEolQze)8(j>bK1PHBnePLH2_$O4Ok;E^>$k z*E2jeuAn!{O@{?#TvKqwiR&k2T?h1Y@96XF)Jr`T)%-&)A(0^(J8STrzFm6XCyeTz z?uC)ubZ>&w*`=gtkr{}#B#KrOHAMNMvMiV*#EcF6VtugQiaAB1J$02lH zwZsKplK2^@qn|)@s)T}sNGg??6{Upiw`&6H`-p!50g+k`a)NPE#e^Ty6K=d<(2@7BXPoC^oj2QwG47xIqn z^GEMOeSdm?=4f_u&C!MR=8W*sk&jPYJ+bE4v2+xb_U*pg=RZHV(*M?-p0`#^Z>>1q z(zUd^X(5?TW>4kl+`(nXO8cR_`}vgVuJ_sH@Mqr7rt{ucQ?}nYymx(VAG@x)ZW%tc z-morrEx*0oneTq#Gylqtp_S8T)_iZI28r}uF?m6(Hh-^bbiv^TEYY|jxD-8=`SJk@ z|DS@k6)6bJQ6s<|BSyg>ngoD?X_OHd5iEW9x4^#@{tYBpWD@~5MnhPtj!Y;8x9p+~ zf@${B2v&3mPQftRn4(5$!F9fZG~W=Mi2OUoVG)X4eTf z3~+mnRw6^YbvIFu5lx4caV`-Txe$;b1cuOS#i$FXIhNeH8J(-TE4;LGkDrS|tR{|8 z9d2?;l)@s3Dsg*+svr{DZvis?Rbn^G8doDx4sfuLjGTuXK#4$D1Fhvp)~RA~jV*`I zsFdo6PsM_zWeL})HpGmJydbOQ^B0ICXpbkPSdg4UZ4?q{P%6O*DG?DvctVw(#3&Pl z12Dd%nkOW2G<;D4w~Pv*!(L8ks1kj-9R$KJXow+|Td1@>Nqq&QXdcB~UZftSK%_^Xr`(TNX{D8KmgP1T)K8K${kRkKhvEEl zAX!?gWH(xLbkdTv{t`Ulzc7GKF6ngGEnWsmajRgu1paRYQ?>s(`U6g`d%-YDk6Rh2 
zBm>rDWq-Qve5^&kfutE`(D?<<&XqP*l}X1Q8?VR0;Cu{v+)uevh=i53_AK1MoUQ7F zb-qy2cFSCK4h$HPG(*jR?i)TnYYhM$BDrA{>RgzYr8l61^d^8uWD4H)Cis7V^7{$| z;wu(ey%=CkLVQ#{3;}r_N?TT~zBOyxy^Y&brZuat(7HKg`oLAnzHDx}a(40T zC)=*?y0&XcSPvXt4IKW=mfw6L-~0;nv)^+xfXAELzBG7e^Rst1?kadT6g;hk=C(qI zztF~IeL3Gs%g%=mbGs{b^uwY1jYcedFa$xs=9YD@f7R<>@-M%g_kJH{v;D@~nVrdd z{iNKyDQC+!@5EIKoA=@7HWeK1b;ssAj?KT>xGfjXZ+r%)DtJ4x%Hpe^OkIEX+PljK zZin+bUS4f_Idv4ZFME|;+PdcJg~bPp7K+`X`LOhGeFye9+zdFdbfhF6psJ@z=sltf zRJ{=bc__rnt}HNvVVGEMzyP+XxD0e7YVQLRBCmj}7=@vdfFKS)#u?!wY9^8;D2do@ zwz=jbYUqE$0UQ9}0k?PIz4Uw8$-KL3-QB(F?p|tG8d!pA--@-5oYn6}Vvhqk2ZO3T z7=-BUR20i>5cC%xEwxyJK_L;sE=P|YA9!QvOmJveGc9pzCAJvCQ13A4G!Q3=L{zds z6GUVbiO`%RVbxip&X5?527@>@ACzXJcpquA@~Rz5PWlqx^G|@Ytct_ZZ7OQ zT-dUsu;+V)4ZizL-OMxhnteqEa&|>$i?t1uwiKx{2xM0b1Xn+I6#U5B~$W z61}<`q;1Wkpca87PlNd4$8#5WFy53Iw7kgM0*7xRat|@v#K2gfhdlG`z7T~NB|h+l zCSn)xeJ{YB0jn*>!vmub9vu}Wh%<;>7-F1&=Hbndn2|6m)X8Lzn_wI$r5nW1t%c)vl&HBt5+mWB|ZK_Kx6&H+&EBoqv$KD=_dByn;o z44)4|!@LlbUV#pNRuk<{`aTkw`O$5tPTge=f|ssIOf>j3wN7BQZg0}LYmaU!snA`P z0enMT=aOPCkl^$XWaO{0z-}mP>MOKvD>Sth8z@`rL&|2e7dKF?U7%;3cYVFfw%Z3j zA4S*H`DN?IkK3=d-(?%t+4fboJ^RBvyK9~8UuFAmKbvOwATFJ*nAKp)*E6gQ0y-gO z$~8U3Y}dnK^@3(#OL*bTH3_hF0%BExJ|=!p(Vw7Umu)o@K(*p5{9mpMYjxCFOTXo_ z!0U7L9B_SP)3^1DbUogR5A1?jzgGzBMmE>$g$JLrzzW+QZv`w$y_!_!`#CGL`)eAZ z3fNtBu-2rd>~KMkI(HnwV17-i*nzXq0`4~Roq8E>S#FxNsRV|c5JH&W>6WNSpk@nD zl{naO-{bhC{-xjt9S{nL?0LM;EbIeYL33L*D^+{S_%U<^d(|w67s4Tt{60W?=0n%l zPfI_=tWC2D(c_lz7}^gND)N%(Bovj>`v{N>q%;B~c=B5SA}1vgt6>C4N769_;|PdR zJccE-C#4AlL|#U)bRGamYGOYB1QCCN0Cy%`LVz+@rGvz(R_RIUS*X{ILt4Q00GCvl zZLE@}B^)|P^7}Me*!YCjyy9rT>)ZIz(T@kO4(9mlqt`~4PW>vJZ+|B5>q`xSSF-T@ zpFV%LeJ^;QS6qv(oNf7g4~>QnSL!6ZwzPJx`?^F3xEk@55F3WlL)nJxK%VW)d9Qa~>s;?Tu-bLtc2~aZ&?tS#z*$NHsP<6)7?B9>V8g!yc+5j-?W{N{&Fzf+ zUeksm19{0w!II{rV5#DyV6DEBf-Nqpsio+~k_X+D1}rt|?n)C@z38ssf}pd(U1`D6 z5t`Z=DEN2Zw{2k>?m0XQhth|N43u#*tyc~$9?ET9ZQ5S6APRO>g~AcAJ-d6MJ>CAb z18SPTgNFx(NAt*UIIiWJvUHj*|Tr@9=7Eg7v^m?1=2ww|;&Ss}wg0;PGvjLCi=_TJ^Uv(xvk$PTxQ`m3 
zU!w0DeYCsaZY+``w=aI1q@>>>&GbK<(iXdM~{va|9*8Lkfxy Sqx2w6H)J|~Phn23{C@yAlD&`s literal 0 HcmV?d00001 diff --git a/model_executor/layers/__pycache__/utils.cpython-312.pyc b/model_executor/layers/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b8bf9d500a09201eeeb69fcea36d3a0e4a7a3e3 GIT binary patch literal 12499 zcmd5?du$uWncwAOx#UumCF^C0wy1|?OO~a|PGnhCY^$~;KUKfm=#pb8?n)xXmv@(X znM&n0g~hZuN!J9X+8iVo#l?nQ#72vY3mnkkaChzDZ~$rfAaze^T{Jzw{}ha!qDAkI zyYHLjk`zNF?IFG1036PIGxN>k`{tY9e6#;rP+*}T_zYk7{983e{Sq@;u%!~uDvT6$ zgW{ZbTp>*KK+0m{tZ5Z-#EY!ni0~b0TW3xKGts@Fq5>&XYq3b97(f2 ztG{5Nz;7F{k-XVg=-)E1g`_RMB7gBfF-hxvTm2<&=*wR^P)1WaO5lX@JG45kfeM}z z^xl2ia5MkBH9vjGz!$t~7^viJ`OEo2p^D!EKdWHRUn4(tNVg3-A7c3;;J%$N&R>h) zx(U5x6ME?;^s-Ip<(tqqTMZju0ps0arue=5HfVb&vw~X6dqhiR{`tyH`l;H4zR6nh z`Y97xwe7DO2Wt2ouTlfGWGtU5|4ns)<*T9Jy8qK29nfR_pLe}CT@MRi1Ebpsd#sqR z73zfgyLET;s?=dG*2AvY3BLxA@#%~%Xer>j3-o^%=+o{$N|$yEyLcA zT$6$*wLHOfYNQqRsLSVb4f=$B$Z$?~SdxQ&XD}o|)*uAN@ZTl|-F~ObBMV}xg!2SE zveOqF9u~w#ox&Xrg}X)fNKYW_SGKsrq9_DpXUON0ps z9ZyAcu%~qt^);QkO4E9zYg9f%H^?c?qA0InBUj8t^by05j;HS$?f^Fo)5eI=3)Uhx zL}=|am}Vl(1**~5|Kz_Rv^>Fvw`oPsd;EnZC86iW}7n)yt(T zMZ0EBu5qPtL+LZbGjg%;OR*A7znya7m`GT z(sV}SMT5rifN#ErI!tG2ub$VB9s}(&s2U0SW$1r`6NGJODM1WMioq8Q42wIFVwK!38MmH`=bef<7!V|<&oe4C8bv@|l<2^y25VXdJppy+ z0t3HG8kKM}5=|5BSYbQvZXz9LrmE8w;GI}oY=%0G@RR%y%}^iOsN(V)RSQ*X=A!w| zWqRRIoUOgP^X;bFO%E>r=;a^2{Gnb~%*>ojGL*4!)lii%RK>buW8V$G_2rvizBjhA z>(C>^;kBZwZ*|A6E?Z((WBrMuJw#Pvbo9{7vBzBLyd3AM;)W`52M&(`JLQojr6eQ< z{h`!IC8rF^6cm+`U|0@?bE(B4*oi{GEjU9$z~z%Yf~0I65``;aAWmixOGF_g$R61f z3}mS8Y(2p=(Z>IT#AcT3B@os$Gi}OPp$Hu@MHo#VNI5%g&MMb^q}(#i5j3px>!+=V zO(WI_6){J+i1AL!rh#5j5kp=r)q`zI?UaaR3Pe0r4r9Xl`L#N`K9^k|ZJz62U_utu zk?v5)2dn2uucN~gaKO>v794WWA&&?SRXh$=9;8i*(&@Me#hw6&XJaKtaL7UIq~k9% z1JLZqn1z=KlYG+dbabcsaeyI}U2u#AhRGn*+At)vF)gHHl_U{nI1`sBI6|;n1YR4t z-!CY`&Bk=$9NWN$nlj;z-6^zObP7 zPLhrdl|2%$68JPHJk)VO*m6*CyP(kFksW*xmOBuXVTy1NOCdNhaKNs>B=YxQ1}5n^ zaV*U!HCi&Tu+Jq*xh0gL|{Pi*d}IA*@A~`O9tLZ;&V`@ 
zn8N`HOo=chG!}?^v33hatr&dsQu4{zT!l-Kh~34W zqymUgdDB5P*@fy%(wkEJEY$1728$5E!QMpKYnQqfyApTfKRvFjIx9kH?HtM^XDyDvR5Tu!o-k&D}E zR&9F{wmtXgxNXmhZD0KP-h{0;Vd$OdT{BqXHpi;1DPe26btP_VTCug=JC(2IQELSa-`q8J7}F3Fv;aWDajO5??MAp=&I&W9Ielq(7O&im2bu>5SSAWtsR?6Wc_)$tV}8i)?63SYu5Y%C8JzO@aDq!h+Xk@3X0hY=msu-yok6};h=CR)ZrWo zgr~AKt*u>Q2Z_$LwZqkg4k)x=p+|{E<|tGt973t9Vpcced0-!apL8Fh8R~s&<*KzU zVQqV4-TyvUwaV>EaJwFHjceSNH%phcFK)kUeA{;07Oyyv;0`=4ECDRYnrEN;$^_gX z^ncyAR#@`bY?&vGJ)Mh>wA|HG2G^ zwKi51;}^F*H3AVB7T7kDO-N>uOjy(c3u$q8l4FdFs<^1ATEMj=iu-Gb!4Zj0NXk!7 z>p@VIH?5vpb&ArI5=!%28hQPl6!e+`(E+%py=leG@JL;4jhmDj$iI#wc$LTrp#FzRJH0kTpx_qB+pAor zHTPC%smy;?X=R40rC>YaW@CXj!!uJL0iMP!FwI6--V~<6RA;#3y(A}i@iLGTym-OM z3Ayu!nbhS}fh-S>gpZeG+EHMR7#H17cr;?P&^ItFq4_|MRYSgSz~U>9)p_3G4rgZU4PmU|7rB0(bBJ8 zt!c%CR&;3MpjZf#BM1(b?B?<_xjt;u);jkI_+#;8df;9*l77hzLa#J#?b5<6{@#-X@BoF0G3C-TtZtMHTl4(#L%^&9XFG(Tp|bEc^2>*o3H8@&s?v3=h>nP7K-1B$iH zS)(;NIc&djK93<%KdegX6uvh@@h*xgD5-)F$Tic%+p;SFl=x7i0dopPT z-$l|!vBj~HglYRHK&8jvLwhm$;=CAZTWViyU#|Uj=So2XKwc9&+xC_4e9`>5*DGT+ zZ&a>tt^VerS$5r4H0O)@V(rWAw>kiC**a(SYlW4w(%jYP)p%iNtb3_%v2Xe4^7+_l z$lhvryZLtWy(_m{9};A?21n<)Xz6^-Yn3rNcJ5oua@#G#cMjewxo1bto~S3rEHk(0 z6$Vz9-zPWxS{=H4$(o7Eb)w_4nfGK12J7- zE^`BT$LN&l%2!I>ztD7P2P+^YeZRUB) z!Fqstz`z=JV>cAA`-Iz%cd^a5vYhNe1j9WWDes5}ACo_5;moJSRd(;W&cWllimF8_oR z{PJL2!=X@6lmWG)C07idfZWljSB-Ic$u??liF-tFk0LLpH0*PN&ISaZI0nuC9)8jn zAOcj*7R<3xcCH`_2CzB7Hm|TPXag(O3W{G3-*{!=6|j0!H>cud`{D)r&?Z(OxYk8` z=UvfLXay@&E104r&BhdVRtiMH9~=`>&j{}99h=tnMu^?$7iTBgCSRtIK@H-MawHPSqwI^CBk051)%nPyG1W_ah) zy~;)?XL@E4T0EG==MMzZSw4>1oIyc0bLHU;ca0iYhF;TECn-5S0xw#DoFKlL1vO!N zy4WcAZLi_)U;7XY)68(MN4J=;SI1YlhaaK&pe~e>osL?k#jU?1lV(l~|Kd0rrutK?kXCutFslU-_`a@GM!fVwu z^9u8gR`q|J9;YVsFH++)0%mkQZ7|}v^KeHz3qbUS?$a{k5h`kk=&qTf@C}f5ExG_B z%g#0G`Y&c5e4+?LP)bx+qB8{Vu;2qz9ckdp-RM38KL~glokQSzD@fH;+i7oNGczh& zV7q)S{~+%=G}X#`q!7LX5bK)@Z(F*2@SN$B4naa~kbuz;Ao05y>IcuibMXGbce?I( zt?s{=*nbh($D2B1Bk%I?Q1+Z88}uhR-=bhxv%LZ;e9(ymIpFId9|$=8@UWe`V5Sjx2=u`x2L|W|(3HA9 
z;xy#`=%x7?G?H-8xHQr`)Ay&}{lZUN@pG3R9k@(pKh@Lgf)kRwceq8tB?}H1+OhtA zhbqrz#}It(FbEU5>?q7I>uNTPv&NH8 z!=nGY#ltT?BnLz@)ZOsg6SpT;TR)d*{oHEniA3v(HE^%B&p!tm^%>RM)o53J#skoR zTtU_Q9``VZw%Bd4SwXGBrJQc&Uvq%hr zqJvKkJ_j8r76c}$h{(lMF`spb01e@NS}_ucu58g@kK|OfO?6q8kS|N1wF_hkU#Pg@ z%Mn3D$flUie*OzR=g%I6FH(pqq}qX&DzvOv&%>uE9>35ditq}IZ^ZB|i((yjiTEZ6 zb=^XvMfISM;~JDBOZt?09WMxqo?&p;~s&z-gx?`qi-Ndb$ zs=*)ssJ`Pt(Sz;>;Yam7>lNFV%!}qF>!NkFqBBv^`QZ5!T~xhR?TERPdb+6Yw|aen?E?y; z|4M?B*!Ft~q0>LU(EWUG7xm9w=DuC}9~&$BYV|*^H9~sBqeU99BIela+M55mMT^&= zO`gk?Qi3}m_izwzK7o~S?#)$nnTEYJow8wt<`^*)tWO&X(aQxn`@h^K25N)-ljFkdn~ z^3ZgfQV%aXmcqwjsRQ#P>+vR({}z4{0=XG#-Bb>`p8(xA)vOoR$JzRI^R`v9BVl$t zGS{vb*2USn_X~H$*`Yd?wP>P=>)OG6BrWU6FaxiqP_h|iaAY$Yjo;GP!fBwA?qafY^PJ4$1HP^eECAY zG{Vyr&uc*5BsM7bSruV42pHm(0 zQTyJbx_(YI#i^!Wm`morxMJQh!z7tvx@=y!;al(}DM&7d(XH|flgZsS+McqJNj;<= zZ`qnO!arP13yYH`OtBQh%|4H|HCDQ0U$n=;S!HZ^X2!DG2D&)i9MX>~tCyM=o0CRJ z0}HT$m@{FHrMB%r6Pq++&O$*=V3y=CXVq#JV9o{|UVrJ@OV0{1H$YQ0J3lb9^e%Ys zLYLXx;ppL{9`cV(*6WdLk)#pQc$F?jOP@4hj-{Y(bF>*8ik9uqu<_HfVwy=dQ^)Dk z^apweePAusnxwbaF1@t)Qj(hXa$3_bojS(6pkJBj*dir3R1Ew_& znBKqv2x$(GPL`1AFmyM)VLnJmPY5$vGa}gyQ%af%&7!RohuIH`DY|maTDN9xShMZ^ zjmbdkk~W5BVn>q{{4J}0_l)mw_c=m&*YFGrlEn=)vtclhf4sFEr3lJHF_vc_K}M4V vL+{;C8S+^J-@9SldvhfLm6OE`y?eRi*3o;VxX|(D7ZP=cK7@hnCi?$BvK)ST literal 0 HcmV?d00001 diff --git a/model_executor/layers/__pycache__/vocab_parallel_embedding.cpython-312.pyc b/model_executor/layers/__pycache__/vocab_parallel_embedding.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..64d6f6ca3b9f792076373ecb123c39772c5ba4f0 GIT binary patch literal 24473 zcmd6PX>c6ZonQA{n4{;wN!)1M7!m*p5CBP#lt>W}4}lOZQlb|tc|624zyargdj>pU za7B6LHMsUhbd@)v;tTsBS*qm9(y3HTr;=KJ+^rltP0%~DT~sYMcBRVslA%GxTltdw z{;!WYIH0Jicq?5quV26SfA@R;_rKozPhPK^!xOW8J6iJ=$NiE%jHg=nY{?dm`;d!p z5kAJn`8l2^za?ggTj#9IZ;jdJYzVW)>~Y7uh3Dc`b50gxi@Dr0U)(?Ej|b)gEZh;Rj@Qi9Fn?97HeNSZ7q6eIk2lOUurOzAN4#;ak@;P*rg-yQ zGxNJ+E%Da5R_6D_+T!hV?eGh89Tx657xBK&MSN2ICswT`bE4F-z(@R2XC#1swbZ%b 
zh}67qpX+jQkzJ8mgw^3+&*FHgvkA!-oRJ1VbTfz@Iz;0$Q0_<*pn4=LE4LZtwtRwK zqvsRt#m)6vILWp&qkk5?BCS&U(ri(H?)^!dkw>H*DbG9$5$QlpMEpDP@4~+u|7s~% z8M<=lf`zry7wLK5X4Z8l>RPje8BzX3dOzW{FxJvOw6tHUF?!fcKe%oBfs*t=q~9qG zK9hdvnl(81Ybq$ntIo61*H)y&yac~15>AKbW8qY4HfT{@XTowgE~O<|ZFn;gO+?et za4fniMa=M^O?5<~m!lC$ZC{kqp|q4pCFM{&8IfY4Wu%V9;E}_LOKQ*d@mG>^ED}l~ zXH~461M&&UQe=fy(s^tpl}^TAUk+N;nlG({6Y1z`I2}zUjwKTd(M2;%IxVFyCL^ze zQ^*(1fIoCGoC-zb%P}b~CDKwPB*iaCkw`SLh{iWXqN(L@dj4W)CGj;aTR<#I@i-!@ zDVxhPGp7q}=hZ+8ZN4H!7cZtm;dEL~0WdJXBFo4;v>Xek7m{-P*GJIL^VHeKBj}#3 z?_sb%e9&H_wv4r^6~@N3SXYqk5BEQQmOsI8||X^)DmJBL5!RXc0{ zt5|7X8Hg(?OR3PJ9FF{&T1|T?N0>|N#7H!*y7fLJzeCG=Ht3M;095Viq&$C7wM+5k z^flF*T8YcFEvq%Mv=~hR?=c+4FksN7+EP+%L2jU8c927pTc)KaHN3D^rrsZ%6 zPLR71k)pm6jpu*Q?M}y+cP~emhtgpgh;iuZ%Dw~pFYS|d$D$W@FJA+$O6T;;wN)`>ZxIc(u($^7G3p z&mekzTE&x>CB6TGHkpvA>WHR7NTG@}D#N9tGOUft@Jc!wOUb(d9<<5>cxjV`0#ej6 z48Ob+Z?$b%PR>iIlrcu3@B&S?STamo(dz!^jM#Hi9-xv$I5)UQo|deqOYw9)^z?ix zy!8)4@J{c&mA@6H^HrSCD-TjN_^YbzLNuJ3FGday#IRvF1kZ=)(gHd$I(?0&Zmu*e zKqPuzXSt7g&=9g8xv&+0jx2P^7=@0*Nf9_L3$0`9;pOGnwN>#s9l#)7rlcKkZg6>l z3+#I6AK3Koyfc2c^X{8Jd;6zv-yhrbpU5~*Xzg@p*kGQTWjs_aYvwEPeBNwh&D9%b zWI%6YmYwC~efR`D!oRAz%+;v1yLc+F0V4dG&*LZ@!zi<2&!*y8H4~z0Rs1bF+%?Uf&2{pjkC z-~H&_JcrMF_P-Q$3)&EWFU$;J`P^}xN2UNF)p4qPLSPn-*z}!QKC_#6G)Yt^1 z9{!M9!v?wvUxZ%;`3FDZ;bNOQiw$kH?@Tx%rY}k&%UYzY#dMN|EJiO&3Gs3`wjvG7 zBC?QQg@dI>6GlD(Cq-o$OUy?VDpXKDg}B!Us1?o)?y+~r?LNgTZj312o;wYSu>0gl#yy_T^Bb3ywxNt?C}SJa zI;DCsmb?&-6=kZr3rtN3dZlWzzLEKVfJI5bpr;O zXgJn_pk020{H%}8l8-uy^;>!0bUo8I&(vQg=Jd<>r~2T49&m7CU#4&0z2?o1sf?}l zk!LK2{k`YT<)>DQ;K*|pyMy(`JiyI|8cNR=_GUEJV07NZET6MVHpw2b5DsBJtLd|u zKD+62n7%60=QMpT=5s{cl5N34?255d%~eT8s8AH@G(){bp)NDjR}|_tL;Xde9y1gy zgIStjhE^AadLaejz*f`->(s8dh;kzyk~m|N96Nh4EJsczB2iEUtE0MyQNUp_MR8(s zK@97QpZEYuUEnu{)mE(P2)NaxylBiqP?qTsTPAArGqF-4qQ|xCaawvEF%+dWaxRW9 zq^_@AU~M6|0K20&i&Rk&^~Dh;yoNGOUIz2TM43cB5XxM_yZk{Y!IeFPRE=HQB7X(3 z@;h*_j)*8pQ7Jm&2qaeGtQ#dwNRV3H`}t!wT%IUNxe{*xjOwq=`ui1s|DB%f&M5^y 
z|5U~~rLC6VN%Ovna^(;?7vK~(u7>KV^|sl(@fU1fX|fdMT-3mz;t$@L$_}1T@GH<> z^S_e@5|1MPUZn=^C?5zgNwhFlCtt9YG;xit(bHZjYwED#AHG|c9eG2+&;Lfo`Nr>X zEZ?S9{(hyVHa$C*Mq>}XU}H;%R8Rf&vgYnl{Cn<>W%s_N;OBoU<9zFP*xW^GDeWQ8 zOx=28WH0NyFB99X?tj6i77!9m2yj{NmNj#i;@@>Ak{v##;O9SCAuHffCo^c-kf7?_PSHQJwL|J>wkx?y;XxUm@ zop{0VEKB#TvL+g%Iin2EDE^s@bLKfs6tQ7^vF+qiGq9iK6!#5~vXw+@2qiwc>?uj* zEQ`b^Qem4ae@aqS5CbEXuRIOMm87a%*0YI9Q&ld@_H43}vMRIyHLS2Dm!zuDf*0t+ zz_WEhstUsIlCtV5rXsms*;7b$O=s{0ZP#J@4@1ffJrgCN(G+ac{VIJj9LDB8TjmuP zfHxFcPNs-Ri>5=N)jcmT^-EHG8wI5t*kN@dgJJ5a?ATdl?Cb;o*^KjS#(y?P^$*+Q5&E+*b1Kw`sKqk z056gAGC4=cIR;1dheF{*BAI5|q7(!oPKeXM(5A0xl##u^xx)2Mm!-r~W z{X|`K4l?1Cp6P`2XX1&KckO@d?s#Ii zxWv4bvsaUpZs(m@rEW6ktjp}^SDgJwTIaYigDF=dX1Yd|>d_mga*!rXY|JPvdlc`; zjaPG?>RYF79n2tKTRWQ_YNJ8r=(=K4+(`)3d`OB+&(>2o-zpL%XgX8U3Pvmu>w+En zP&+iC&`n8>O50+@!nBN4Bzj;zCwwl+seid8*Me`IgBgS8eeaxG@MI!lCa>5)RwaO5}=4D(sF|+q7M2*KTIA1kp|SKi<(gSK)bpFFg1}zhzM$7=D^L> zA#WGbHY2=#ox94SkucQ*wIq9Gh0HN6;36l8`gv9Nqigcb|}T<9N% z-ENvoMGcxC<0qT(s(hPhF%Vt7YQM?40TB)%PbzgB#EgN zSrVgZF`7bQm!qlZg_tB>xW+01g71PPlPrTNP|V7T$IrZ((ra`wAtvPrpdj8%tt>Am zTL&jJg*y7Zr z%~zs&JHR`MtR5F6l(m09nkdbOSI{fyqlKuPO0#)F)j&=xn~^=EV}m04KpdIC8|_Ci z32izhStdcP&E*wg$_A2y#W8BJWbzW`KopaVYPpb%#gbQ0n@Lv2xice!B8i~IJ>ozD zbBDxchz)AdMcA{%yLLz%(w|q3o}D)Q;!6}kA>x^%Q&T5rj}HxLVFWSrI_4RBv@rTE zfnznAPr?u+wVZ@l4_%f{V$TajBa`Bqh$@mhLhkVJFx-(ndk#?0P*Irv!8)=4H#%M! 
zjulzbH2PrjzvmWNp8Z}{o6#}6jH>OWs*UU!+fx*_f1kLfp^$TDMvabt;jY#?PjidA zy*T_u`x`^>1-e@tEp+eJoD)F4=<|D>8bkd_6KZ^%Gpp{iwW{D)cuKU>L?hsYwmh7WPFAr_ls4ccKtB;7JXk zV)MBe1%^(DkcFqE0x<0zhMmu(NY*~IQ4z{xF=UU2&C-~6L}QlMqjX~^R7e~kEDnPh zwqxlymon1#17?CWX?$tbB*P{un{5Rwt3uh#N-i|d%I1q&UE}@)p`Yj>JV!9vV}!v-3hYB^*culyWhSKh z44J9c?D#U-oHg53J_SV?gE4cVuS$e0V`E%t!-|+>N0ntL6{I+Z*#b(Z_OhoWrwS4j zFhhVtDbLT(l>Stf!yvSR^rawYK|EPFls%PGZCip_he9@ht<*bCgcpy#p5POGOwd1O zYr#ZvA+M2Rgjht!d$nthr8Xn1@CLJ31%<6H^CD*QhCAJ1#G7-)o$fTk*0@i=!qSsA znS>SYx+mSMg(AEF>o#DG_jO^-BloNcsjhUt0UxncO0hIx#24P*hh_N>O#?tkG{O3Y z$c}+MYt|ce=^^85Y1r@;-e!BC%&#QBtlamE8RtTcMm}jWFJTYhE6kB?MoRBv)EE+3 zGrgIs5e5TD(zlmA$aAbL2%neGN2YAIQcnAyO@o{&K~6pkskfVqKG?HON|IicJ=;JY zDFef&UPfWYa7#O%g<6_^HViYDTQYmH5RcFbJ+(-5v2J+;AduzxUPbaC~#7>CifXqs-bEvet!1p0NBa8OK1((|mxU@^h$G`+gjHgS}a z9PCl;;5A{d3%e;z8Y>fCkO@7iRq186eaP=o(EAirB-tSbm>JQi#kz0=>gqI=DXz?# zri@?#mgP)8Pdb0q%h(lC$af$JDW_OESR)U z%QE;55`n7vcBd!qp`dEYGa@a7S7LZmPHG)(t&Be{Si`57U5VOSy6cvj&}c%@uTVWm z0IrGzOj~eqTZ>Je<*yPf3Ha2mN@bR1zl)&P;RJn}R>KQhK@*k3qMnnE4IW%S1 zP!SI(|8olZ9tG9FaMP?&h|w8pXK}9<&I1$ys;*FoO}$VkMZIiLTkL&9ll}fN!dE*g zOJ#C~BRgZQY+}v?^-S!^Y`%KtI{~sCREp2xNs0Md;Y5%XW?`ougY; zOP`y-yO4k17>n;yguYDhup%73m(IR?MtS*60SeN(KwZ{v$l#AFL&qO5jbO%q{Bc`n zwryBx8#Z$(^-#HH1#&_y{2$ERoZ0eLwX-TT=L1~(E~-rP;~g#QLe8@z%7TRJlyJ2kV{)@ zeRX6(dgO7R2LK|W?M#;meI|YvGtk9JpfHyis>}P(IRwdSA;U367%KwaulV*8@BtloH-KHRwQf?JOrW(>`UTvKnZxjoxFs5B4e{hTZK8RvAlHKA-ZBla26)6tduCz!{YMd5eI z_boYDa#?Wk^cu^&2iuAJ#V-`6Y?)(5TtKHk-&Z3=h9}oQb=YWP5 zqnYd9pP77J>3con?EaN7oY_5ffAO!bYzpU}VDTK~GtQn&-;t+Sk-Kw&U3bpkUA&)w zd7{@x(bI_T%M46C6}ZlxOwW<~!<*ulGS0R~?VXvfL-!J!?WZ%&mPcK^nZBv}OPgJY z>v$w~6XecK5pivg{3AI5E37Xk>|~zK93jB&oY1*d?FzW_oJ(IoC00)+OZg+r-R;Z_ zBK#%HO_{%DS>&$s_|w~3G)pVmx)}_>6nBe%2X;HKqve)N_7a_@KM2)X8Y6_>2-W9w zS!iVj%4CHK(9o?_u9sfgbuhRU87~{N|Aoe`a%{x9ib5Dn6%HDhOm3m%qMmU8Q`jph z{V5M)#Tz{Z_S*Wn`E8pUE2#o$yG==!7bt0)+@4nwT3f+bVVgR&@?7`_XAP{;m*D9=mB4*DC z%x(*lktrefuo%Zl6DGJP<=1>TAwu5)EeCWLNl})VV4ZOxi(#O098NXSZ!EbOO>5`Z 
zpww9gN?+7G;gjJ4dm2~|{I8}LvG4nHbnql!o@``j>lA)}jlSxJG)YMQob6egG z3#5rMv(lTQsniG*z4=UKlL=`kogBhT96%px@uWD^G<9ZXIOYc>7eScqgo3f4UP&Q5 zsP&Vv3u_0=Fj!3U7)_aZY0MCHZ}JL;ld2Y>a}?opL9F>}A`@oWSqktTMCGV|!g>G49%JfMYkQC+u*6DT2)xxzYtP3qOrZMNR$ zq&;9uAkKWGi*tvc@ z*D7XPhm_W#9C(euP9?Cj)YGj5x><wbMRaE4I< zM26RWFnetwS*@lBk!$HvG^@kqQ*M!nA0V|36eOq=rWm1Jq0wu*hFA-{3FIE6JtTY^^TwmI9 zRZ*@k~$H&-ll4{+e~2C$KZ+gX(1x zlZbwDvE-N!5gtUIZw-KGFFD%6wA7| zHJ$8LhB448ERFM8ZMbKQ3DeFTv$C{Hu&|VyZXTGXL`xh^(C`ZduuuuoX zhof^S!CYN*0D7Al8ezss%@wV2KW~JI#HuvwwLBx-Ho5LHYFT(k%(>0vy^dL#21|7> zlT~1Z6r^e`WOj>-D+@Z72{DN$DO$8s+zo0K_9?n>L$%R`C8~WsxqL08+M;PGu6lIQ zE)#CLVStptmWk_4WFr6MDJsj3Z4ok5wrP7{S|Z8X#U{8AUn@RK|A;ng2WB4*3)i>aesg2yZqH`@zIAu$7ledD z?XB@l^X|KK4?Pn(oQ^$p^Hf%7RfN{t?>rQC<;0<^II4)Fo8s8|^h2R7C)7NGQJ?)` z$M{3<#G|^#+ilsVL8WPMvu(dCNyL`ZMoXU4=2AnnXTcZmlNn($Q#)Cq zWr$JgC@8J!zTcTYX&dkL+s zYric$?45X6OLDjBhHRkwL7@9_eRHzRn^|)3n%7)CaPbw29AJm@A2q!bO zCn-^1wsBBt9K17kuj&4&&BoW)y|_am*%L8|oQNsagh4`q@3k2O1jrFP zakd&i^XvbEN5i5TXTwZ|uVpcWq^d&;ND-FS#Dy9g;{*441*Uws&mfJH z?yJ4qpubYE|3dihuaVUaF6Z%^tje$YkK_UkY$bOofv#*|zY^GA^f{pfCW=1GVkfcU z%VNQ!4EPkGXMN_Qo%tHBwF_zllB(}gns;TJ$Cc*syODb{*+Zw5L#HevJ@Bw4 ztkb>tG@muzmG%BM<3%&pxyBumTkYUehq zqX6>XX|zZjhtcx2-Hv42Usl>*&a@rL2uIiXoYR%@cWl_SO+VAIH{%^soMU=~NNUD0 zrE?5t2{PUZ#W|rz^wTk&SCoNQGX2Lg-YLa7#Uj>MZ}Q|X;OeqZmc^3=1uBkQeA)mn2)aSkA^O4U_z5wSQ{7p4{bl^zK zEq{uC*PVNef);fSct4 zbV&m{Z{f!xuU36$^m`U&PM<)n!=GWqjUaZqV&=3+&CzAMue0mi3>mIQj6>=P%^eIn zG{Lcc9OEpr=a)5sr4C41M7oN}OT)e2A=Aa`26j%_n(#-h(-_rHfS@{Mgr)K{14f2hXL$p>UBb0ek_!$rR z*&Z!m_LSL^x-@%I;;bRtAcZ$wmZ!_BAiJu9Rhzl1>4fXMS9y<+*KloNu93e zXV3&F!IY1cSb$a_li5yOL8cY47GDF%BtaEqTevXLRL1FcUUGq>w`hzp!^_mUL63YB zFNWAP^B#u08Sd|;2)0GCVITyeA@qJq=+?-Z_bBx9$4W!;M7C-qC){G=o*l+|*HD0I z)I&<56jhdtdKwI)o}AE}6Pj{DV-B_#kVdm_bI*mo{(7jGrL2R zD1)OWvGm5Pccy+JXqQi^0;4P->&HoxR0+37!_KDRAHcI6wW3NNFCz7`+CS$NFo}p~#z&^M~9Y@CA8x#H7Q&!Q%p_ z)!o{Z>PO?*{jn(=SW2$MunXu~Mt!qjdO}Ga#(kH<8!Nc#HZExve*d++=~X&SzS>aM zY&{&826HJ|Eg<|L#PKy}T4pwVFl;!j(PM#1^q9Pf1dQ;)Re{N5tc?7UA5t`J-I_t- 
zWAgoyoWCU}3#XWbQYny}{|RiE`crrq3FQR|)yFlo8N{|s3R&Y>LyYI_MCDGQxIBvDA(w&s^?2_bA za#{9JPJ1iCFl)%vFtdi#?@%6Qx=HO2hfKSO3?@snG?homM_Q1|B^DaMWTj1&O4P2% zq*I z!S$z4F`Y(8`L_2o|Ae#flebfl8uF8Ojyd#In&fZ FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + if current_platform.is_cuda_alike(): + from vllm import _custom_ops as ops + self.op = ops.fatrelu_and_mul + elif current_platform.is_cpu(): + self._forward_method = self.forward_native + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + x1 = x[..., :d] + x2 = x[..., d:] + x1 = F.threshold(x1, self.threshold, 0.0) + return x1 * x2 + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + self.op(out, x, self.threshold) + return out + + +@CustomOp.register("silu_and_mul") +class SiluAndMul(CustomOp): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + def __init__(self): + super().__init__() + if current_platform.is_cuda_alike() or current_platform.is_cpu(): + from vllm import _custom_ops as ops + self.op = ops.silu_and_mul + elif current_platform.is_xpu(): + from vllm._ipex_ops import ipex_ops + + self.op = ipex_ops.silu_and_mul + elif current_platform.is_cpu(): + self._forward_method = self.forward_native + + @staticmethod + def forward_native(x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + d = x.shape[-1] // 2 + return F.silu(x[..., :d]) * x[..., d:] + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + self.op(out, x) + return out + + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + self.op(out, x) + return out + + +@CustomOp.register("mul_and_silu") +class MulAndSilu(CustomOp): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + def __init__(self): + super().__init__() + if current_platform.is_cuda_alike(): + from vllm import _custom_ops as ops + self.op = ops.mul_and_silu + elif current_platform.is_xpu(): + from vllm._ipex_ops import ipex_ops + + self.op = ipex_ops.silu_and_mul + elif current_platform.is_cpu(): + self._forward_method = self.forward_native + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + d = x.shape[-1] // 2 + return x[..., :d] * F.silu(x[..., d:]) + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + self.op(out, x) + return out + + # TODO implement forward_xpu for MulAndSilu + # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + + +@CustomOp.register("gelu_and_mul_sparse") +class GeluAndMulSparse(CustomOp): + """An activation function for GeluAndMulSparse. + This activation function is used in Gemma3n. It computes: + up_proj = self.up_proj(x) + gate_proj = self.gate_proj(x) + gate_proj = self._gaussian_topk(gate_proj) # sparsity + activations = self.act_fn(gate_proj) # gelu + down_proj = self.down_proj(activations * up_proj) + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + def __init__(self, activation_sparsity: float, approximate: str = "none"): + super().__init__() + # Gelu. + self.approximate = approximate + if approximate not in ("none", "tanh"): + raise ValueError(f"Unknown approximate mode: {approximate}") + + # Sparsity. + if activation_sparsity == 0.0: + raise ValueError("activation_sparsity is 0.0. 
Please use GeluAndMul.") + target_sparsity_tensor = torch.tensor(activation_sparsity, dtype=torch.float32) + normal_dist = torch.distributions.normal.Normal(0, 1) + self.std_multiplier = normal_dist.icdf(target_sparsity_tensor) + + def _gaussian_topk(self, x: torch.Tensor) -> torch.Tensor: + """Get % sparse percentile of the Gaussian distribution.""" + # NOTE(rob): for TP>1, we could all-gather to get the means/std. + # But we do not do this because in expectation they are the same + # and in practice the eval scores are good without gathering. + mean = torch.mean(x, dim=-1, keepdim=True) + std = torch.std(x, dim=-1, keepdim=True, unbiased=False) + cutoff_x = mean + std * self.std_multiplier + return nn.functional.relu(x - cutoff_x) + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + d = x.shape[-1] // 2 + out = self._gaussian_topk(x[..., :d]) + out = F.gelu(out, approximate=self.approximate) + return out * x[..., d:] + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + return self.forward_native(x) + + +@CustomOp.register("gelu_and_mul") +class GeluAndMul(CustomOp): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + def __init__(self, approximate: str = "none"): + super().__init__() + self.approximate = approximate + if approximate not in ("none", "tanh"): + raise ValueError(f"Unknown approximate mode: {approximate}") + if current_platform.is_cuda_alike() or current_platform.is_cpu(): + if approximate == "none": + from vllm import _custom_ops as ops + self.op = ops.gelu_and_mul + elif approximate == "tanh": + from vllm import _custom_ops as ops + self.op = ops.gelu_tanh_and_mul + elif current_platform.is_xpu(): + from vllm._ipex_ops import ipex_ops + + if approximate == "none": + self.op = ipex_ops.gelu_and_mul + else: + self.op = ipex_ops.gelu_tanh_and_mul + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + d = x.shape[-1] // 2 + return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:] + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + self.op(out, x) + return out + + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + self.op(out, x) + return out + + def extra_repr(self) -> str: + return f"approximate={repr(self.approximate)}" + + +@CustomOp.register("swigluoai_and_mul") +class SwigluOAIAndMul(CustomOp): + # https://github.com/huggingface/transformers/blob/v4.55.0/src/transformers/models/gpt_oss/modeling_gpt_oss.py#L106-L110 + def __init__(self, alpha: float = 1.702, limit: float = 7.0): + super().__init__() + self.alpha = alpha + self.limit = limit + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + + gate, up = 
x[..., ::2], x[..., 1::2] + gate = gate.clamp(min=None, max=self.limit) + up = up.clamp(min=-self.limit, max=self.limit) + glu = gate * torch.sigmoid(gate * self.alpha) + gated_output = (up + 1) * glu + return gated_output + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + torch.ops._C.swigluoai_and_mul(out, x, self.alpha, self.limit) + return out + + def extra_repr(self) -> str: + return f"alpha={repr(self.alpha)}, limit={repr(self.limit)}" + + +@CustomOp.register("gelu_new") +class NewGELU(CustomOp): + def __init__(self): + super().__init__() + if current_platform.is_cuda_alike() or current_platform.is_cpu(): + from vllm import _custom_ops as ops + self.op = ops.gelu_new + elif current_platform.is_xpu(): + from vllm._ipex_ops import ipex_ops + + self.op = ipex_ops.gelu_new + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + c = math.sqrt(2.0 / math.pi) + return 0.5 * x * (1.0 + torch.tanh(c * (x + 0.044715 * torch.pow(x, 3.0)))) + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + self.op(out, x) + return out + + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + return self.op(x) + + +@CustomOp.register("gelu_fast") +class FastGELU(CustomOp): + def __init__(self): + super().__init__() + if current_platform.is_cuda_alike() or current_platform.is_cpu(): + from vllm import _custom_ops as ops + self.op = ops.gelu_fast + elif current_platform.is_xpu(): + from vllm._ipex_ops import ipex_ops + + self.op = ipex_ops.gelu_fast + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + out = 
torch.empty_like(x) + self.op(out, x) + return out + + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + return self.op(x) + + +@CustomOp.register("quick_gelu") +class QuickGELU(CustomOp): + # https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py#L90 + def __init__(self): + super().__init__() + if current_platform.is_cuda_alike() or current_platform.is_cpu(): + from vllm import _custom_ops as ops + self.op = ops.gelu_quick + elif current_platform.is_xpu(): + from vllm._ipex_ops import ipex_ops + + self.op = ipex_ops.gelu_quick + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + return x * torch.sigmoid(1.702 * x) + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + self.op(out, x) + return out + + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + self.op(out, x) + return out + + # TODO implement forward_xpu for QuickGELU + # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + + +@CustomOp.register("relu2") +class ReLUSquaredActivation(CustomOp): + """ + Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2 + """ + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + return torch.square(F.relu(x)) + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + # TODO : implement cuda kernels + return self.forward_native(x) + + +@CustomOp.register("xielu") +class XIELU(CustomOp): + """ + Applies the xIELU activation function introduced in https://arxiv.org/abs/2411.13010 + If the user has installed the nickjbrowning/XIELU, we import xIELU CUDA + Otherwise, we emit a single warning and use xIELU Python + """ + + def __init__( + self, + alpha_p_init: float = 0.8, + alpha_n_init: float = 0.8, + beta: float = 0.5, + eps: float = -1e-6, + dtype: torch.dtype = torch.bfloat16, + 
with_vector_loads: bool = False, + ): + super().__init__() + self.alpha_p = nn.Parameter( + torch.log(torch.exp(torch.tensor(alpha_p_init, dtype=dtype)) - 1).unsqueeze( + 0 + ) + ) + self.alpha_n = nn.Parameter( + torch.log( + torch.exp(torch.tensor(alpha_n_init - beta, dtype=dtype)) - 1 + ).unsqueeze(0) + ) + self.register_buffer("beta", torch.tensor(beta, dtype=dtype)) + self.register_buffer("eps", torch.tensor(eps, dtype=dtype)) + self.with_vector_loads = with_vector_loads + # Temporary until xIELU CUDA fully implemented + self._beta_scalar = float(self.beta.detach().cpu().float().item()) + self._eps_scalar = float(self.eps.detach().cpu().float().item()) + + self._xielu_cuda_obj = None + try: + import xielu.ops # noqa: F401 + + self._xielu_cuda_obj = torch.classes.xielu.XIELU() + msg = "Using experimental xIELU CUDA." + try: + from torch._dynamo import allow_in_graph + + self._xielu_cuda_fn = allow_in_graph(self._xielu_cuda) + msg += " Enabled torch._dynamo for xIELU CUDA." + except Exception as err: + msg += ( + f" Could not enable torch._dynamo for xIELU ({err}) - " + "this may result in slower performance." 
class ScaledActivation(nn.Module):
    """An activation whose output is divided by a 1-D ``scales`` parameter.

    This is used for some quantization methods like AWQ.
    """

    def __init__(
        self,
        act_module: nn.Module,
        intermediate_size: int,
        input_is_parallel: bool = True,
        params_dtype: torch.dtype | None = None,
    ):
        super().__init__()
        self.act = act_module
        self.input_is_parallel = input_is_parallel
        # With a parallel input, each rank holds only its share of the scales.
        partition_size = (
            divide(intermediate_size, get_tensor_model_parallel_world_size())
            if input_is_parallel
            else intermediate_size
        )
        scales_dtype = (
            torch.get_default_dtype() if params_dtype is None else params_dtype
        )
        self.scales = nn.Parameter(torch.empty(partition_size, dtype=scales_dtype))
        set_weight_attrs(self.scales, {"weight_loader": self.weight_loader})

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the wrapped activation, then divide by ``scales``."""
        activated = self.act(x)
        return activated / self.scales

    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
        """Copy ``loaded_weight`` into ``param``.

        When the input is parallel, only this rank's contiguous slice along
        dim 0 of the checkpoint tensor is copied.
        """
        target = param.data
        if self.input_is_parallel:
            shard_len = target.shape[0]
            offset = get_tensor_model_parallel_rank() * shard_len
            loaded_weight = loaded_weight.narrow(0, offset, shard_len)
        assert target.shape == loaded_weight.shape
        target.copy_(loaded_weight)
def get_act_and_mul_fn(act_fn_name: str) -> nn.Module:
    """Look up an activation-and-mul module (e.g. SiluAndMul) by name.

    The name is matched case-insensitively against
    ``_ACTIVATION_AND_MUL_REGISTRY``.

    Raises:
        ValueError: If the lower-cased name is not in the registry.
    """
    registry_key = act_fn_name.lower()
    if registry_key in _ACTIVATION_AND_MUL_REGISTRY:
        return _ACTIVATION_AND_MUL_REGISTRY[registry_key]
    raise ValueError(f"Activation function {registry_key!r} is not supported.")
def _matmul_launch_metadata(
    grid: Callable[..., Any], kernel: Any, args: dict[str, Any]
) -> dict[str, Any]:
    """Build the launch-metadata dict for a matmul kernel.

    Args:
        grid: Launch grid; not used by this function.
        kernel: Object exposing a ``name`` attribute.
        args: Kernel arguments. ``M``, ``N`` and ``K`` are required.
            ``tiles_per_update`` is optional and is appended to the name.
            The element size comes from ``c_ptr.element_size()`` when
            ``c_ptr`` is present, otherwise from ``FP8_OUTPUT`` (1 byte if
            truthy, else 2).

    Returns:
        A dict with the keys ``name``, ``flops<bits>`` and ``bytes``.
    """
    m, n, k = (args[dim] for dim in ("M", "N", "K"))

    label = f"{kernel.name} [M={m}, N={n}, K={k}"
    if "tiles_per_update" in args:
        label += f", tiles_per_update={args['tiles_per_update']:02}"
    label += "]"

    if "c_ptr" in args:
        elem_bytes = args["c_ptr"].element_size()
    elif args["FP8_OUTPUT"]:
        elem_bytes = 1
    else:
        elem_bytes = 2

    return {
        "name": label,
        f"flops{elem_bytes * 8}": 2.0 * m * n * k,
        "bytes": elem_bytes * (m * k + n * k + m * n),
    }
BLOCK_SIZE_K: tl.constexpr, # + GROUP_SIZE_M: tl.constexpr, # + NUM_SMS: tl.constexpr, # + A_LARGE: tl.constexpr, + B_LARGE: tl.constexpr, + C_LARGE: tl.constexpr, + HAS_BIAS: tl.constexpr, +): + start_pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + k_tiles = tl.cdiv(K, BLOCK_SIZE_K) + num_tiles = num_pid_m * num_pid_n + + tile_id_c = start_pid - NUM_SMS + + offs_k_for_mask = tl.arange(0, BLOCK_SIZE_K) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + + for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=True): + pid_m, pid_n = _compute_pid( + tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS + ) + start_m = pid_m * BLOCK_SIZE_M + start_n = pid_n * BLOCK_SIZE_N + offs_am = start_m + tl.arange(0, BLOCK_SIZE_M) + offs_bn = start_n + tl.arange(0, BLOCK_SIZE_N) + if A_LARGE: + offs_am = offs_am.to(tl.int64) + if B_LARGE: + offs_bn = offs_bn.to(tl.int64) + offs_am = tl.where(offs_am < M, offs_am, 0) + offs_bn = tl.where(offs_bn < N, offs_bn, 0) + offs_am = tl.max_contiguous(tl.multiple_of(offs_am, BLOCK_SIZE_M), BLOCK_SIZE_M) + offs_bn = tl.max_contiguous(tl.multiple_of(offs_bn, BLOCK_SIZE_N), BLOCK_SIZE_N) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for ki in range(k_tiles): + if A_LARGE or B_LARGE: + offs_k = ki * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K).to(tl.int64) + else: + offs_k = ki * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a_ptr + ( + offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak + ) + b_ptrs = b_ptr + ( + offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn + ) + + a = tl.load( + a_ptrs, mask=offs_k_for_mask[None, :] < K - ki * BLOCK_SIZE_K, other=0.0 + ) + b = tl.load( + b_ptrs, mask=offs_k_for_mask[:, None] < K - ki * BLOCK_SIZE_K, other=0.0 + ) + accumulator = tl.dot(a, b, accumulator) + + tile_id_c += NUM_SMS + pid_m, pid_n = _compute_pid( + tile_id_c, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS + ) + 
offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + if C_LARGE: + offs_cm = offs_cm.to(tl.int64) + offs_cn = offs_cn.to(tl.int64) + c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) + if HAS_BIAS: + bias_ptrs = bias_ptr + offs_cn + bias = tl.load(bias_ptrs, mask=offs_cn < N, other=0.0).to(tl.float32) + accumulator += bias + c = accumulator.to(c_ptr.dtype.element_ty) + tl.store(c_ptrs, c, mask=c_mask) + + +def matmul_persistent( + a: torch.Tensor, b: torch.Tensor, bias: torch.Tensor | None = None +): + # Check constraints. + assert a.shape[1] == b.shape[0], "Incompatible dimensions" + assert a.dtype == b.dtype, "Incompatible dtypes" + assert bias is None or bias.dim() == 1, ( + "Currently assuming bias is 1D, let Horace know if you run into this" + ) + NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count + M, K = a.shape + K, N = b.shape + dtype = a.dtype + # Allocates output. + c = torch.empty((M, N), device=a.device, dtype=dtype) + + # 1D launch kernel where each block gets its own program. 
+ def grid(META): + return ( + min( + NUM_SMS, + triton.cdiv(M, META["BLOCK_SIZE_M"]) + * triton.cdiv(N, META["BLOCK_SIZE_N"]), + ), + ) + + configs = { + torch.bfloat16: { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_stages": 3, + "num_warps": 8, + }, + torch.float16: { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_stages": 3, + "num_warps": 8, + }, + torch.float32: { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + "num_stages": 3, + "num_warps": 8, + }, + } + # print(a.device, b.device, c.device) + matmul_kernel_persistent[grid]( + a, + b, + c, # + bias, + M, + N, + K, # + a.stride(0), + a.stride(1), # + b.stride(0), + b.stride(1), # + c.stride(0), + c.stride(1), # + NUM_SMS=NUM_SMS, # + A_LARGE=a.numel() > 2**31, + B_LARGE=b.numel() > 2**31, + C_LARGE=c.numel() > 2**31, + HAS_BIAS=bias is not None, + **configs[dtype], + ) + return c + + +@triton.jit +def _log_softmax_kernel( + input_ptr, + output_ptr, + input_row_stride, + output_row_stride, + n_cols, + BLOCK_SIZE: tl.constexpr, +): + """ + Compute log_softmax along the last dimension of a 2D tensor. + Each block handles one row of the input tensor. 
def log_softmax(input: torch.Tensor, dim: int = -1) -> torch.Tensor:
    """
    Compute log_softmax using Triton kernel.

    Args:
        input: Input tensor
        dim: Dimension along which to compute log_softmax
             (only -1 or last dim supported)

    Returns:
        Tensor with log_softmax applied along the specified dimension

    Raises:
        ValueError: If ``dim`` is not the last dimension.
    """
    if dim != -1 and dim != input.ndim - 1:
        raise ValueError(
            "This implementation only supports log_softmax along the last dimension"
        )

    # Nothing to compute for a zero-element tensor. Returning here also avoids
    # the ambiguous reshape(-1, 0) that fails when the last dimension is empty.
    if input.numel() == 0:
        return torch.empty_like(input)

    # Flatten all dimensions except the last one
    original_shape = input.shape
    input_2d = input.reshape(-1, input.shape[-1]).contiguous()

    n_rows, n_cols = input_2d.shape

    # Allocate output tensor
    output = torch.empty_like(input_2d)

    # Columns are processed in fixed-size chunks of this many elements
    BLOCK_SIZE = 1024

    # Launch kernel with one block per row
    grid = (n_rows,)
    _log_softmax_kernel[grid](
        input_2d,
        output,
        input_2d.stride(0),
        output.stride(0),
        n_cols,
        BLOCK_SIZE=BLOCK_SIZE,
    )
    # Reshape output back to original shape
    return output.reshape(original_shape)
+ """ + # Program ID gives us which output element we're computing + pid = tl.program_id(0) + + # Compute output indices + m_idx = pid // K + k_idx = pid % K + + # Bounds check + if m_idx >= M or k_idx >= K: + return + + # Accumulate sum across reduction dimension + acc = 0.0 + for n_start in range(0, N, BLOCK_SIZE): + n_offsets = n_start + tl.arange(0, BLOCK_SIZE) + mask = n_offsets < N + + # Calculate input indices + input_idx = ( + m_idx * input_stride0 + n_offsets * input_stride1 + k_idx * input_stride2 + ) + + # Load and accumulate + vals = tl.load(input_ptr + input_idx, mask=mask, other=0.0) + acc += tl.sum(vals) + + # Compute mean and store + mean_val = acc / N + output_idx = m_idx * output_stride0 + k_idx * output_stride1 + tl.store(output_ptr + output_idx, mean_val) + + +def mean_dim( + input: torch.Tensor, + dim: int, + keepdim: bool = False, + dtype: torch.dtype | None = None, +) -> torch.Tensor: + """ + Triton implementation of torch.mean with single dimension reduction. + + Args: + input: Input tensor + dim: Single dimension along which to compute mean + keepdim: Whether to keep the reduced dimension + dtype: Output dtype. 
If None, uses input dtype + (or float32 for integer inputs) + + Returns: + Tensor with mean values along specified dimension + """ + # Validate inputs + assert -input.ndim <= dim < input.ndim, ( + f"Invalid dimension {dim} for tensor with {input.ndim} dimensions" + ) + + # Handle negative dim + if dim < 0: + dim = dim + input.ndim + + # Handle dtype + if dtype is None: + if input.dtype in [torch.int8, torch.int16, torch.int32, torch.int64]: + dtype = torch.float32 + else: + dtype = input.dtype + + # Convert input to appropriate dtype if needed + if input.dtype != dtype: + input = input.to(dtype) + + # Get input shape and strides + shape = list(input.shape) + + # Calculate dimensions for kernel + M = 1 + for i in range(dim): + M *= shape[i] + + N = shape[dim] + + K = 1 + for i in range(dim + 1, len(shape)): + K *= shape[i] + + # Reshape input to 3D view (M, N, K) + input_3d = input.reshape(M, N, K) + + # Create output shape + if keepdim: + output_shape = shape.copy() + output_shape[dim] = 1 + else: + output_shape = shape[:dim] + shape[dim + 1 :] + + # Create output tensor + output = torch.empty(output_shape, dtype=dtype, device=input.device) + + # Reshape output for kernel + output_2d = output.reshape(M, 1, K).squeeze(1) if keepdim else output.reshape(M, K) + + # Launch kernel + grid = (M * K,) + BLOCK_SIZE = 1024 + + mean_kernel[grid]( + input_3d, + output_2d, + input_3d.stride(0), + input_3d.stride(1), + input_3d.stride(2), + output_2d.stride(0), + output_2d.stride(1) if output_2d.ndim > 1 else 0, + M, + N, + K, + BLOCK_SIZE, + ) + + return output + + +def mm_batch_invariant(a, b): + return matmul_persistent(a, b) + + +def matmul_batch_invariant(a, b, *, out=None): + # torch.matmul can handle various dimensions + # For 2D x 2D, it's the same as mm + if a.ndim == 2 and b.ndim == 2: + result = matmul_persistent(a, b) + if out is not None: + out.copy_(result) + return out + return result + elif a.ndim == 3 and b.ndim == 3: + # Handle batched case like bmm + return 
bmm_batch_invariant(a, b, out=out) + elif a.ndim == 3 and b.ndim == 2: + # Handle 3D x 2D: common for linear layers + # (batch, seq, hidden) @ (hidden, out) -> (batch, seq, out) + # Reshape to 2D, do mm, reshape back + batch, seq, hidden = a.shape + a_2d = a.reshape(-1, hidden) + result_2d = matmul_persistent(a_2d, b) + result = result_2d.reshape(batch, seq, -1) + if out is not None: + out.copy_(result) + return out + return result + elif a.ndim == 2 and b.ndim == 3: + # Handle 2D x 3D: (M, K) @ (B, K, N) -> (B, M, N) + # By broadcasting `a` to 3D, we can reuse the batched matrix + # multiplication logic. + a_expanded = a.unsqueeze(0).expand(b.shape[0], -1, -1) + return bmm_batch_invariant(a_expanded, b, out=out) + elif a.ndim == 4 and b.ndim == 4: + # Handle 4D attention tensors: [batch, heads, seq, dim] + # Reshape to 3D, process, reshape back + batch, heads, seq_a, dim_a = a.shape + _, _, dim_b, seq_b = b.shape + + # Reshape to [batch*heads, seq_a, dim_a] + a_3d = a.reshape(batch * heads, seq_a, dim_a) + b_3d = b.reshape(batch * heads, dim_b, seq_b) + + # Do batched matmul + result_3d = bmm_batch_invariant(a_3d, b_3d) + + # Reshape back to [batch, heads, seq_a, seq_b] + result = result_3d.reshape(batch, heads, seq_a, seq_b) + + if out is not None: + out.copy_(result) + return out + return result + else: + raise ValueError( + f"matmul_batch_invariant currently only supports 2D x 2D, 3D x 3D, " + f"3D x 2D, 2D x 3D, and 4D x 4D, " + f"got shapes {a.shape} and {b.shape}" + ) + + +def bmm_batch_invariant(a, b, *, out=None): + # Batched matrix multiply: (B, M, K) x (B, K, N) -> (B, M, N) + # Process each batch separately with our persistent kernel + if a.ndim == 3 and b.ndim == 3: + results = [] + for i in range(a.shape[0]): + results.append(matmul_persistent(a[i], b[i])) + result = torch.stack(results, dim=0) + + if out is not None: + out.copy_(result) + return out + return result + else: + raise ValueError( + f"bmm_batch_invariant expects 3D tensors, " + f"got 
def softmax_batch_invariant(input, dim, dtype=None):
    """Softmax along ``dim`` built from elementwise ops and plain reductions.

    Args:
        input: Input tensor.
        dim: Dimension to normalise over.
        dtype: Either a ``torch.dtype`` to cast ``input`` to before the
            computation (the ``aten::softmax`` convention), or a bool that,
            when true, promotes ``input`` to float32 (``aten::_softmax``
            passes its ``half_to_float`` flag in this position). ``None`` and
            ``False`` leave the dtype unchanged.

    Returns:
        Tensor of the same shape as ``input`` in the resulting dtype.
    """
    # Honour the requested output precision instead of silently ignoring it.
    if isinstance(dtype, torch.dtype):
        input = input.to(dtype)
    elif dtype:
        input = input.float()

    # torch.amax rejects a zero-length reduction dim; an empty input simply
    # yields an empty result of the same shape.
    if input.numel() == 0:
        return torch.empty_like(input)

    # Compute softmax in a deterministic way
    # First subtract max for numerical stability (standard practice)
    input_max = torch.amax(input, dim=dim, keepdim=True)
    exp_x = torch.exp(input - input_max)
    sum_exp_x = torch.sum(exp_x, dim=dim, keepdim=True)
    return exp_x / sum_exp_x
def rms_norm(
    input: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6
) -> torch.Tensor:
    """
    Apply RMS normalization over the last dimension with the Triton kernel.

    The result is input / sqrt(mean(input^2) + eps) * weight.

    Args:
        input: Input tensor of shape (..., hidden_size)
        weight: Weight tensor of shape (hidden_size,)
        eps: Small constant for numerical stability

    Returns:
        Tensor with the same shape as ``input``
    """
    assert weight.dim() == 1, "Weight must be 1-dimensional"
    hidden_size = input.shape[-1]
    assert hidden_size == weight.shape[0], (
        f"Input last dimension ({hidden_size}) must match "
        f"weight dimension ({weight.shape[0]})"
    )

    # Collapse every leading dimension so each row is one vector to normalize.
    flat = input.reshape(-1, hidden_size).contiguous()
    scale = weight.contiguous()
    normalized = torch.empty_like(flat)

    num_rows, num_cols = flat.shape
    # One program per row.
    _rms_norm_kernel[(num_rows,)](
        flat,
        scale,
        normalized,
        flat.stride(0),
        normalized.stride(0),
        num_cols,
        eps,
        BLOCK_SIZE=1024,
    )
    return normalized.reshape(input.shape)
def linear_batch_invariant(input, weight, bias=None):
    """Compute ``input @ weight.T`` via ``matmul_batch_invariant``, plus ``bias`` if given."""
    projected = matmul_batch_invariant(input, weight.t())
    return projected if bias is None else projected + bias
@cache
def vllm_is_batch_invariant():
    """Return True when ``VLLM_BATCH_INVARIANT`` parses as a non-zero integer.

    An unset variable counts as ``"0"`` and a value that is not an integer
    yields False. The result is memoised by ``functools.cache``, so changes
    to the environment after the first call are not observed.
    """
    raw_value = os.getenv("VLLM_BATCH_INVARIANT", "0")
    try:
        return int(raw_value) != 0
    except ValueError:
        return False
def init_batch_invariance():
    """Turn on batch-invariant execution when ``vllm_is_batch_invariant()`` is true.

    When enabled, this applies the environment overrides, installs the
    batch-invariant op implementations and disables TF32 for CUDA matmul and
    cuDNN. It does nothing otherwise.
    """
    if not vllm_is_batch_invariant():
        return

    # this will hit all the csrc overrides as well
    override_envs_for_invariance()
    enable_batch_invariant_mode()

    # Disable TF32 for batch invariance - it causes non-deterministic rounding
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False
tuple[int, ...], + stride: int | tuple[int, ...] = 1, + padding: int | tuple[int, ...] = 0, + dilation: int | tuple[int, ...] = 1, + groups: int = 1, + bias: bool = True, + padding_mode: str = "zeros", + *, + params_dtype: torch.dtype | None = None, + ) -> None: + super().__init__() + + if params_dtype is None: + params_dtype = torch.get_default_dtype() + + kernel_size = ( + (kernel_size,) * self.num_dim + if isinstance(kernel_size, int) + else kernel_size + ) + stride = (stride,) * self.num_dim if isinstance(stride, int) else stride + padding = (padding,) * self.num_dim if isinstance(padding, int) else padding + dilation = (dilation,) * self.num_dim if isinstance(dilation, int) else dilation + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + self.padding_mode = padding_mode + + self.enable_linear = ( + (self.kernel_size == self.stride) + and not any(self.padding) + and self.groups == 1 + ) + self.input_size = in_channels * math.prod(self.kernel_size) + + self.weight = nn.Parameter( + torch.empty( + out_channels, + in_channels // groups, + *kernel_size, + dtype=params_dtype, + ), + ) + + if bias: + self.bias = nn.Parameter(torch.empty(self.out_channels, dtype=params_dtype)) + else: + self.register_parameter("bias", None) + + def extra_repr(self) -> str: + s = f"in_channels={self.in_channels}, " + s += f"out_channels={self.out_channels}, " + s += f"kernel_size={self.kernel_size}, " + s += f"stride={self.stride}, " + s += f"padding={self.padding}, " + s += f"bias={self.bias is not None}" + return s + + +@CustomOp.register("conv2d") +class Conv2dLayer(ConvLayerBase): + """Conv layer with Conv2d.""" + + num_dim = 2 + + def _forward_mulmat(self, x: torch.Tensor) -> torch.Tensor: + assert x.dim() == 4 + B, C, H, W = x.shape + K1, K2 = self.kernel_size + H, W = H // K1, W // K2 + x = x.unfold(2, K1, K1).unfold(3, K2, K2) + 
x = x.permute(0, 2, 3, 1, 4, 5).reshape(-1, self.input_size) + x = F.linear( + x, + self.weight.view(self.out_channels, self.input_size), + self.bias, + ) + x = x.view(B, H, W, self.out_channels).permute(0, 3, 1, 2) + return x + + def _forward_conv(self, x: torch.Tensor) -> torch.Tensor: + assert x.dim() == 4 + x = F.conv2d( + x, + self.weight, + self.bias, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + ) + return x + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """Expected input shape: (batch_size, in_channels, height, width)""" + assert x.dim() == 4 + if self.enable_linear: + return self._forward_mulmat(x) + else: + return self._forward_conv(x) + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + # By default, we use CUDNN's convolution ops with optimization. + return self._forward_conv(x) + + +class CausalConv2dLayer(Conv2dLayer): + """ + A causal version of nn.Conv2d where each location in the 2D matrix would + have no access to locations on its right or down + All arguments are the same as nn.Conv2d except padding which should be + set as None + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int, + padding: int = 0, + dilation: int = 1, + groups: int = 1, + bias: bool = True, + padding_mode: str = "zeros", + *, + params_dtype: torch.dtype | None = None, + ) -> None: + if padding is not None: + raise ValueError( + "Argument padding should be set to None for CausalConv2dLayer." 
+ ) + self._left_padding: int = kernel_size - 1 + self._right_padding: int = stride - 1 + padding = 0 + + super().__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + padding_mode, + params_dtype=params_dtype, + ) + + def forward( + self, + x: torch.Tensor, + ) -> torch.Tensor: + x = F.pad(x, pad=(self._left_padding, self._right_padding, 0, 0)) + x = super().forward(x) + return x + + +@CustomOp.register("conv3d") +class Conv3dLayer(ConvLayerBase): + """Conv layer with Conv3d.""" + + num_dim = 3 + + def _forward_mulmat(self, x: torch.Tensor) -> torch.Tensor: + assert x.dim() == 5 + B, C, T, H, W = x.shape + K1, K2, K3 = self.kernel_size + T, H, W = T // K1, H // K2, W // K3 + x = x.unfold(2, K1, K1).unfold(3, K2, K2).unfold(4, K3, K3) + x = x.permute(0, 2, 3, 4, 1, 5, 6, 7).reshape(-1, self.input_size) + x = F.linear( + x, + self.weight.view(self.out_channels, self.input_size), + self.bias, + ) + x = x.view(B, T, H, W, self.out_channels).permute(0, 4, 1, 2, 3) + return x + + def _forward_conv(self, x: torch.Tensor) -> torch.Tensor: + assert x.dim() == 5 + x = F.conv3d( + x, + self.weight, + self.bias, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + ) + return x + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """Expected input shape: (batch_size, in_channels, time, height, width)""" + if self.enable_linear: + return self._forward_mulmat(x) + else: + return self._forward_conv(x) + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + # PyTorch2.9.0 disabled CUDNN's Conv3D, which caused a + # significant performance regression. + # See: https://github.com/vllm-project/vllm/issues/27406 + # and https://github.com/pytorch/pytorch/issues/166122 + # By default, we use CUDNN's convolution ops with optimization. 
+ if self.enable_linear and is_torch_equal("2.9.0"): + return self._forward_mulmat(x) + return self._forward_conv(x) diff --git a/model_executor/layers/fla/__init__.py b/model_executor/layers/fla/__init__.py new file mode 100644 index 0000000..0e89cf9 --- /dev/null +++ b/model_executor/layers/fla/__init__.py @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang diff --git a/model_executor/layers/fla/__pycache__/__init__.cpython-312.pyc b/model_executor/layers/fla/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8930b3ec915442f7db2b937c8711ac038c71aefc GIT binary patch literal 175 zcmX@j%ge<81UW3-neB`W439w^bnux4C^DU)l0lQvZzV$!6Oi{Aq~@2Oeo1bDenDn| zZb@QMdTNPoMX9-^MYegWeokhRenDkPMt+`tSx!!_er|qBYEFD=MQU%fF@$XUvg;F-r;O&D#xa~g|yyFXjh`4UoTRRW39Rsz;qQRi@8S5!kas}WUh z|I_SFuWu#=nyYR+@-ZYR`kf2NO`={KI=-v5!#_kPYKK}j3Ui`BT1!kEr0oGntpQ%x zhDhvtDy%6*nUy0eF;#Z7s>f%i%drwVyA@?)7ey}av^K&Nv<9dZz1tQj9K8~AEk#jQ x0wBvX3vgUE?e;loY1vC0&=uW>R0rffe9rmT9^*%AcClvBXAtzg`yTeG_X7#~axnk^ literal 0 HcmV?d00001 diff --git a/model_executor/layers/fla/ops/__pycache__/chunk.cpython-312.pyc b/model_executor/layers/fla/ops/__pycache__/chunk.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eabb7751f55f1b34c3fb77b465dbb44bd819bdf1 GIT binary patch literal 9841 zcmdTqTWlLwb~B_%4qxI!Z`-o3Ew4q|6lFPf{FD=IVtM_55Nh zvQ!GNg8-2?1#F`Qti>Xz@)a-akNr%)Ht8<9S@ffnH-L$OjRx55hk<_3Yxk!=?KyWi z98#9lI4DqbEZ@10Gxt31x#ygFF8?_g^fBu#CLdPlvq?vZ_2|EQnVIXRFG zjs|JnD>rAGN1JKgCx^1((Qr008li2!9L=_jwy=zs5w;2ek)4WJ1ln&QxIusO2K}K8 z`okOaN5nQEy1`hB*e<(M|_*ni1MRVT2BVK48Jv@2P`! 
z-KyPpnzhlb@XZmc*M3hOv+GvP3$n8Q3ETdE%1r14sdkC&6Yav5Ph9t_tc`A;XXD*Z zu{9EB4X-Nls>qSVyIKeja3Um(fu&^eH+8 zw#YqjSSq#X)T_$HFh#c5RCE?Oy~%=}a$7a~{lIzOWwsW*dJSjMTjYxD6g4C^EY==0 z)o^+Bpfv`#9DvP>_#U|KSMhuVMrP5s=+{GaaK54+pxi|dK)Ub4R>J2apn5T|7+h@D zBW4fKj~oPx0f2S^tjB^~4Czq|M&PCahl+SZfV~+E82p>?Tw)qvKB|W)`e4o=!pt*q z@5s|PVKP{0UsZ3U1yOCsE|-dAtB6ZQqG+!Xs4knJ&*#Os(_n{azq$oKahsonYu{~! zn!#nNx@OMnp+E*ZXRrlx;#P!j!-69AsLfcX*l{X@oiyQEERd(T$Z?;_VF!xPaOsMg zo-{lYvcl`H9Z+%Q3>PI*#Z^*qkyTt5)el8HyatDffbvSQG?JtuA8I!i+p*XIh2fe_ z<-|F|U$4Ndq9Mg(&GVn{^y}Gte_qP>>Aac|^}cI`!$*%y9~S#%X}mu_uTLtu{#jYh z_GcB?;wkZ(m@WWI{W3o4N_n8kOtJi+0Al zb$Q@k^ef+Y+*{X&{>+iG&wqRO^3Hp{-?ja|`|8Htu#x?co60rw3Uy*~3I%yb{ELYQ#vJ>{)^2o&PF-K$Mq z099C;ei3E7trlFUT4`=E;VAMb!?iE74?SJwQ26%F+ZUGoE0^yb{B>(7bg0A~dhG33 z9(d^8W)EJt$KE?s3hgg(`zuby*ZmxrWztw!PtDoyp;kj=(7}&7M5ov!x~2janEjrr z?Nhr?V8Oan?F~90Atd0}xXNoD!L>oZdxL(@2L0Ry{a(>4_%;~xi#~9Qt|{C=YaYRg z4jaM=-$bj>Y&#yy0a@?Sz`QdaHnzTn?&dUlo43)~Tm)Y;T*#$$Ny!;a>4Lz6-S-RP ztds^X0Y+ftm9~=6QP9NH%yg>ibfjErT2ynQoW^}=Jy^#*{`_kw?lMK6+Fob$S!+QS z*}C4^GA%k9oPYz2V-uKAYI2L-2kiZ-ng0mHve>lfHx2J%XfX^hObQID`GBEhQb+}T zivFpZd$9V@$*kujEc(FAIxc~69f^BiNl0Kw5f|ObDz!%#h#3yB2t81VM+|3LzxGN~ zsyKtX2McP(a@NwyP?p~%CE6pgs8pH3x-uu=fLML^lYLY$tut}oU8!Fb>_SR*7 zC9-m=($2JPS#ti3x4dU?$@v}E1GeCnepA0aboca~)1NIaPnSaR5*JtFfEi>y91j{? 
zDwX52Vk%|$QYkR47&oPiKq@s;;N|K_0OJuUofW|d3o4ogbte>tlh5YCD;6|ep|O$S zz6^1Rq8cvhWFWXv6xr~dS8^G1K*jj?R}95ZrKaw~$*?#J|Fz#j@d;CQMar&l+2#B6 zgBu?_a&@h`x>hDiuC712`hdg!*UGWJXC2#Jd%yRGZ=JY#qT+&j#mxjFwB^ATXSKZ8 z@>wlEwgOD3^_F~7t^~2w%!J!2A*_X&&hER#JH<)_TT!NaXQc&etsvA#t-DuScdzKH zt@|o%*xAnP+VdAA;<=M?C$WfN5q;){K9fJ>%J5B!yZ{lLs_9g}8ul^gJO;Z?Lta+?`El}#{*G~KEqE`*-e(MvVNpL}cSET`H^lI+|o$Jgh zc+gvKFy_KB8sr_Z!^m~r54XV#boO5|f_H;)iUG!bpS$WQW|<;)V}65g{79AHK^dys zbhJ?P5dxdR1CHS5PD_R$5%-NVMfbWAq4U?R24{UGcouvh55&8#4sY|uuz)qR85S51 zf9|s82dZ4xqo9@Rty6?@9bJI2MVF2tB7LShteX9%!F!|U@!bc?@u9~8@H(I`0Uf8L zPth;5SlY7aheJYu`i@rqA7LZ;&@2zB8SNOYVQYrOXv~{5jZfSjp`!&+l|+pw6GWTj z^CB7h@TUoKIYEAwAm^@*5ov;q*_(H8n3pwijEGR0kN^h20GX^IoZb}Sv^f92ifwjN zY_q&v*nqn$KbSj!w7JV{iZ2G94Z`*~fq+}eWr$8a0f|XDA}bl9<@vO@Gro?re;_gS zl;D|pQ(_QcJz4(!rJR0sjV4{fL;{)M)1Ykx#6a{(kz5q@5m6_+ z4uZf>iI!GWQM1K~I(TM+alb2 zu_Srh963hDMwFabw>%BGgF1rix4>G;&zSiJLh}vvYcpbQg9JfcL6*iwU`JqLgFyD7 zAm-9*Ss4Z(7~$nU$W3JQ$+gf4It7K$sDQ?KJv~V@I82kVpVsk#a@ml{mop-DoMN=G zmc#caA!rd{Nfcs!OL5Z!>E>M5OWNG~_R`ro7)uN75NC~Uyj92+WY`h2Voh+w6i|bH zj_ex%K||axP-)Nds>F}Wq9sdRO&^VDkk%}(C&=8S1Zyd2RGYOzKCh^7I9it#b3U7> z#^tsoP5Fc^R8Vtu23@hly{l$i?a>plPi^b99^@Mh0F_fK|mK`Cu;|(RBqF;(y?G*dTNF9TZyKV+_?CS|gU zB*Yp?%uq>y&$fnb8Nb^6jnhu4ffCvKJ+Pn02oYnJ8nF`3Vf3p}}Y48E$~k;JYgNbfk&NivxveJ24)3|et) z_Qn$=wsxlYT9$ZXdTP3T57xYx1`fTRQaXGHZJ?_6i@---(cm%zTom;k7OM?h*xv*F zGfBu5r_8W1_Tm28?%#hDcJAu~2f+*+88~|2YNNKukXx`RM$srK@Sdn1`2N3Bkw!hT zIUTgCHCdPpES%jWaYO_KqInD!tL~Ll^Y}^ft}OC!D1qRCSf(zi5;!p}oTnBd3;7vT zui8P!I_F$PpJWFG<`E}BAdo`{Jl|X;3Pw+Jk|rh>ayBUHr6A_vY73K~RBdV43RqMz ztIR?l1PeTwD1ZkS$t)!3K@%ECwI?(2((RK|=HNpJX?P*|6!QXj_XaAJo|eT~QC2ZY zvGC>@8i$&iQa!tAg-k}&bPco);~}_1tHvfID`^m-q$iJ|=B6O@BC&-~oSY!JLNH5yd)eO|?Q+i-$aH8@D? 
zVoNicux9wK;Mw|=swyf?T^cSvpO@!VOyvQJ#;dvi;o9sZ(qku{G3Tt|Z5%qT+G8e| z+t+t?>UjWH@VkG9OPK3SWz@k0g0~LeJbatKo4J$u&L3NeeA)3u$3uUt)D&B#Wyv2a z`vVB{cjDdbo$Nz@Pla)~+-2_ab0@q@{vNpN3WS#QTMIWAK6_)Cf9T&)g}73M*zt^G zxai|%QgN`sw6Yv0`1E?|GxJZy(>F^v3F(W%egP+?#+C$@Nmn( zV=jCvcr#dT9{G0WU*`TWS9<@i{;=?^TpE>1Q`3*8W>%+WN~*RxrI(~ag>lZZV=TOa zP;yLX*MIokULPcFxoFALU1j0+z|CXJk)OZuIJ&JI9r}9b?|Z-L{dVU!d%xaRVb~G& z9DL74&a=;*&X#7Jsu|P6T_ykSd)<%XCsyMpzPhj)f8%$reZBD8H%jkaS$*ru!}t}L z4ESp4dfc^bdGwb%?hUMVB}$!r_pW|5_~`h^>T#fc_4tL-v5TdPmnu#+@?PWoT=4p0 zIUKp`x#L-O-0?5#tKps{*W)18ukYQrEyA0|%B^V0vkK>13&?%S_|5BXRf0$nePUaD0dQGtdJwH}5v% zd#aR1&{Q4srxBC%rq}S8eG0wuHCoLU&34p&6JcH)yUm|L(5qy_ZC*;5moJ#DGh78- zk~Ml~Os@tE&)j@!f`@!Jy;VMn6yeWR)Zv{ixS6Qab@T-T9hI{zSo;FSLY1O8mHcbX-bqxoV7TVj&MA|BC YwA#*W-}M(t7uNl`ge zwCjuk#f^YE9fDKO2=aJFV62Yena-5%WE!@=G}Ax4fl@*jW@;yy$><+!shu`I+P=L5 zh$9CiMN!ElN8;Vvw|j5j+uiqe_ubyVX|*a0t_jHxy>HcH*soC{dmYLkPk$oDutki( z2z&y&hVG&Xd{T5xG%3C&#xX)nNG2qc(reO5*)0Z zALG~5BFrNmE6!GuyAz7{FhU7G75voj)4)#)Kix;NPjS8+_iSnInJh*WbYXX}JNO;Z zkYofK61`oGVF8TLcVTZ=0EQC=z@W~62apr<@_^3NED9X4~nvq6U83Um{pk1<3!00^wJ&Z zhjyTs?Lar~KtH?#y?h6H#SV1q4s_cN^veA7;bOx6p5)U^18u)wGbl0PAJIrud9bmn zJnea82YU4m^rJh_@tblCBWiZdb?lm3>oF0>JZ9oJ-0PrZZ60a)TIxN8`8nmM4vR{F z)o>xv@SgNq(L6pR!Z*j~9Oh_xf-35Bh*%Ld!OAGoOHKKnAPWMLWt#L%yGf60XmrLm z?(+Hw?~up8ojf%>?DtT92hNH;H>TmHoAJXVGlbhULb|6%9g-&^;P*s=fC2$MfT4_) z0*YZMA6e2YfFOM=KC+B02>#4T!R!_r(cd%i4o*JF+>?WZ`&H5m6(Q&2*8*6;qD0KOIDkuZqnP!Vo@pJbApv%!0(M2^ha67JbjVogn*-ghbF7TJeVa9)`}w}}m(RCfbPZf?zue6l1}?we z-|gx--?vH94YXhBcJ*~%bzNz{_+~e&JU;+f|HW=FP7+PLq#A%j%;8UvweSSd6D#0P z&H`XX9aq3?mA;ut*PNT2_OmL4`YHE_$M2A^crS~8o5im{xyueQD<7Kj4SPrYta#ir zPbv`u{4&_DftYgnrYM(x+B4*JPk3*7$U2UD?+6L=K1Mbm(1?Hyfl37I2p}Iz9zg)+ zRBT8%aU=GeiT^7Mh((OrH2*PV{M#-9&rXGVloMsk1xfvMEFKUx8h8)>5p--}RHPRl*E|6kRiQ(yT#3fK6 zcuKQXN0cJOv|&Uf=q+TU+pY(a&6cpzIzl|&1;h2|OXX_=zS7?V-%8*s`#tcr(=xgO zYzOtzA=E##j>8KO+`}Hwl(y~4+QUtlU;;84r^~kRei>ZR^1F=I@)FY;z+iN9Kaho& z{(y30v}l{$T$>z_tB5WnkI~|7oL@$Zh@llYK7lI01pOB{LTmYSpgNlqI7*vIH7y5E 
zDJaQVrku7wjv`Nvf;NY7@|M6-jN3XMwY{9y(FMaIx_}T3iwK-9rVXf<00-LyDE|~Z zGr9sbv{k?>pC_sz6c3bHYu%&e^H}o^!W}2gDrUA>+Bg-Oi!1aW9%S_WG;FQlxHwisN zpjl8F?Jc)mbn_1O)|^kR}q%x{bP(~`Hl7t>Pq2hlRWZSF%$!Dz`hPI0h9wBoB0+F@?SruEy_eJuih z`K+-;XpOX!+uu52-8>|CE}tCY1IUqY25O<5dA(XoK0AN@d6j~f4E#kO=VJl9=c3hF z9|2Ufr$0d($^XduHG%Hd?DM%htg|=w_uSe-Jr4V!z)8VV`C3|%-G^|`oZxq99M0_m zv%V|PD&Ui^<+t+TcaZ8~oYufEK@mZBQ&9fMN zk~*@9x6-Fj3ufhqXi{cXLqJIR2O)h^* z?=gNS$KkK=kk+A5K&T&Rvib+U1$(YsJAEsw@3wZ?s`mxhxp5o=r(cS6r#b0P=aKFV z?5A?lo!LdYTn}6zq#5a6eJRqt%1QTX9_e0#y;4rP*LIOEHx@1svW#?RUy5{RIqA;k zk**!~K{@H#cahG%gI~k;>({_`a#7rTegXCnv>H}oBDxx2&KICqfF3A{O>fX4;FNFN zcL?W%KSZVeA31PgViIi%TvW6$E^< zJUSyVI5p!V+~ho*E$|GDJ8&*{1PXE&yv3AEy8Yv z!Ad5k+ytCjm|~?~ALVRjB}0UFmX*TW$|*l9ABJp1MVTjCSly(1+~XRYm>L>)O;aQ* zCEdOe4=eS~mr-hzl?=K@n>Yw91GtleEv&*1XFLcGFv()0o7ng!Hpj}Q zU89Yzrp&!Lb9ZL$EvR0Kl|Uh=1Rzub5Gnx(nbWR05Q7J^C<)a~xcwA#+zn_~cvLay z8gU^K${j(4M_9>}YrKh70XoiQ0c)T%3n)+xP@Dn^R0)(80fno69B~}y#G(A%(5%*% zad8JW*m}N8N8FT$aPbF6$e9U`Yj}=ujeAHR95xx{hZ^CXJlR4nK=sJ|`=jteYyo@r zJsdNb7PS#=)Sc8@KdHQ5cdssiCo7wS=7r8wfhl}FG#{;s8J22-riISOa{X^Kx;y7W z=fW3aCdOQq)ErsY)G(Tw2gc8emy45{=8Y}+vFnVvCaH0(YmPIT;}7~jdu#cvq~>H= zis=kNaaw_C^n_?Zj?vD!jy;r^&G z-1|w#{R{UlJW@9-bfx5qVDr1P8wG|tH$pd}#%O<3@j&#MYFYKDz!{XFZ0EakFPNQ{ zVNzMp88(Dk!vmo+(RN04IEFJS+q&`yqdfA-;Qi5iqe*20BWYM@Pf4WPimoedjM4^u zQgg2+sXWF=j%7<3L6f1VJJ!53!W33B>gu?SQ5{iZEJSfUb{6>Z;N;V9pbwed%sEjqo@pw~gHi0La z;m-8AUhroLUnkZ=c+-6s=pT8B29aZ-l{xhkK%zVtBMC)*m;;$>ri$U!s4-nCM*@ zekv9f>ViGcXIjILI~LDH&P96?Cm)$#SvQ|y%x8W&__MJu#**gqjOKi>3vu_~Zrzdl zK-``XE!V_ePYkY9Cf-_^{UhQI#-O)^`|r$yy%ZSM3#?3mHKoyp#o^J2j?t9I46*i@ zEN+Om$7KmaqI0G4^WKDZRkYf%a^qnkD56&dCBQ_fzM}{!!salwI2W0VQg?4I-CVC| zVJcdZMJHB=f3EmWwhv5}byF2%s!Hh%;m&AHq>s^8#xBM2SWo;?0+07V7noMaFNza= ztNjm+tGy3VXP9-tE~o|kLT{)yd^y^3w{@vC)^h)qd#|k9+ZcOW^3bV=)j!u<%J#*c zwG6D-S4CgcBwk+~d|0{q)rN@)Hx}Z4QBS!PO z(asp{s7Knv*TW~Hy)k$6Ox&3;F1IE6HCQ_|GTXqur!#zuCpYargR(y=O9X0L!bIbTO5IvHJSQ2dp;WFyx=mM9)=3eUzAaa%MH z_b2+7Z}66}R^K5ojZBYZOk*HwzgxXj9W&i8xmU7oJHgmaBuh`O&i-7}_Zyfb#ZmW? 
zJk}if%h>g#$q{d7Om#uUW3@g7dB#ZFM}v30aF7&_Hz&*M8B=}2#u%N!&W)lH&Z@7+ zoN>dw)}`;q-&( zgM=!LwtW}~_NFxYa4(}N14j%KQuFQEl-?XwFnViPf(XvrbGs0Jr8aE9U_KQ&1yg}2 zX|e~qHcAfNEm|r=$S6Ju@yH*5PG9{tt|!I(QzrQn5^YEq?#p^|cASOt|BEFX!Z7yho`CwROiE@z4x z8GU0;U1*uhk(+(amSJ=$k-*dS_JuDb{B0W)OOq?`VF`BB}3SlSq&Wv$9 zkN4Y&{RgdEOptfvyz~cRf4eK)DZ%t5-}Fe(;w-yJh#6zs{Jnu_FkGDV4mK}Jm1RfkAV5VYc8dgn8uVo1TD^;PAB_dn?}9?fW>D7 z+j_0n=cV8%G&)L6d06Ar3^hGNxrV*IO`3||6#C(`G^_KwXFcfTx@*=wG2`I_0W#7< z&5*t!;bB*F)R;RIO+jqr{lfQ)-Y;H+uZC!B3^KM~ALFzrAD#bJB7*O4c^3|Wj=_$4`$em0eW;MuhKfJUNFFfKGO{Xv=}-}0mfAc(IF{!~>jPPq z7uPZ-T0vSU6!zh+v3$YR&7S!at0$K`CPn!T5XeB@+Z2)MZ)d0<*LVy-p0}$E)U{Rp<( z6}}0u`e&x6r%1{=>TwfR2yei*65c7Fbpm~YLt3d(x6j&q;$%J93JtU6qZQN@=yR8+ zX94`Jc&{%2LgMU4K-s|+5`}R7Bq{ev5LpX+AgGAm7LgqYmq5tS1K$Fn=o*XnabGF; zQMPqqVS$g_NJb#`b+dMt_|0hjX^0WJC;YG0qp}^)7=9Fx`|ky8LtnV4hSgSkl>0a* zsjrQfPfVMi+Auc=7N@uh@1&-W`o8m{P02lrIYlJlb@T2y~ zADcUS_AK9RWxI74xd;qOP(LLB{B8kDmtlp+!tTY9$jF+dHhP`gD3MFwXDqev^ewb6 zTn|CM#PK#G0ZkhzFZP0km<}hgCm(J?dsO6&sp@ur{eF-_Y1% zwei8ErV02c52K>N04y!h=Kprvn&irw^g+8dYCVoa+4T7gwHf6%?IDTC#C^wJ|uMS@!B{x1nG zb%IP%XQG9Xw0(marLxB=by^Jfui!3)``0o|tx3xfroal!X(hr`h^9uE1|oQAEy8q& zR)8=)rYT7q5N5OIJhd1U=zDUsGGxBv#I{9P+-tz8>yH!{t6lG5I&p>8t^L-dE3mv*?Mwl48YUUW!!?P5Jy3eC0@Tro_rFga4Z%A+$aJ>lU7D15hd3`8c6BY<3&AI-asjg9;WSDc2A6d{KEDtRBsFpe=l`YtLh#&P^< zA%<&zi`D!Vd+nE)>6e)8SDKOqCB)1`riH?F+{oa@uMEz|#uJYZpGE(rXCGV6eyx>D zWlzOQktl7(M23aJ_bcA1fbK4}{-;!`m3>uG5j_8kQhV^iFN~$Z4m30i?r1~WFcs|& zkHycg)&{i@OoB>Ghfro8{$;5hBFKQGN-CZr7T;*VN>V1 z(xP!O-jAnCF@@@V`W-rbh)Y1z z?#}7Z>GV-PU4x=m7eg1*4nAFr6_+m35juU0Pan^w>-cm%W+_^{7`d2k;M0wm-gt)& z(cd(2>8D~$bmXZ8ROCRqmM=xfvGU3k{5$GW6;-Li;?$vvRBip&me+7wO0MA)76ble z2A2X3jj7ObIdYUzWKv;*k(Z{Gh@`@lI@C5{icuU&s}V_qnM)Q=M^48emRFtDB9aa( zIh-y)m>x5j(*}eYF-u9>gfKH^C|rl_mfjj`Vf07SNTPHhCQ}Dbpk|6%?v^c;t*MVN b(&}eLkoP~&uFIh}{b*YG7t678I^F*R-BC>I literal 0 HcmV?d00001 diff --git a/model_executor/layers/fla/ops/__pycache__/chunk_o.cpython-312.pyc b/model_executor/layers/fla/ops/__pycache__/chunk_o.cpython-312.pyc new file 
mode 100644 index 0000000000000000000000000000000000000000..2275a630c0438d022d76ffe19715434483313e3f GIT binary patch literal 8069 zcmb_BZEPFIm9x8Ca!D>f^ebKD($=1gHuWXzpKu`{M-1{sSX=saUV94NwT(LLB0V z5E{l`GDL*QVKPh&Qv~8D&JZ$$>0w%}(;;KnG;C7q#t;)W51Yf5VN2LLY$Xthc+0(Y zv4Vj!aZG@WP@I`F1;}BRw*t+=SurhuIQBzg*akhQ@!k56o_{&p`-ro{&+!3ykI?8U zZ~b!~y~eX|pgA;0%#m}HpAH~DdASpz1j6*oZBQe48`K)1MsiN5RYQ#ey?b;C&%Y9q z2+s8ZDUlL&-+d2O2A6sIW~~9;Y=WN$elHo2pH5H-GC?E|%%to?;(qx(QsY%e*`(Er zAy*L~?pNw_;db%lcuR4+R_Cg=p;vE1ui1uPyA6H!HuSn}==Iys8%onh-CX1Q2Hgj? zeuvAb!!=MD2=#TkryBhx`H z=<|+6V=-QMdK^SQO=8f90rthG1^^yPu){}iU;+cczrLBEieSGYgk5~i3E&ozAOyhP zfDkZ6oI@URKp_Hm(Op6@_=G?_j(ax#_h6S8j&;R?u>&GddGWy2sUxqvJaL5Y3I#{H zVl(1cG}1L43WdAEQH~FJ`K!EtN{k9!A>Rxy#JfgAzOHC2-la`G+8LWsn4=+Ic!cvE z6Lx`$TKL6n0Fo%n<+>Mwi=tfLy#|CA9K6Lp<&E=`AwCipAowAn9Dp_pp%UX*n+k9M zisN-kqFhT(lAC&#-;kS*3spb@8`AHbJK;U0n9rQ^Ug#YfGV!_q8=nH^-4GM8uA>LAm3&3|C_)Pw2WHcCvE7Sx(qY&r816*9x z8;OeEc#QW4eWBniFW^w3kf#GeGggK}hR})u4unDj26({)JOw|VrC!WXQEiLh2)KpB z4ZDwHt8XbBp$DcN_%?#^4t$g@FHw;wPn1a&l4Tb3s$V07-(87H$t`>-l_#o-Bw>%; zQY4d9&QY^8LbK4KOC5(TytRlMgmLWXTZ-QEMYM6`*P|0wpn35zd~GwU zP1H!WQq3&*Si)SSTTMGQn?N#4cKwh{?2hbv5pJnc+KqicD#xpor~=%q;`-OD((!1J zR3ll{wq241YT$M15Bf>0>PHY;5^&6aB|ZWYfg12#ta}&MA=x%$kP^LZ8!b^%RdK%^ zJeJ+ORv6PgRvUGZUaDy2aM0RPB)jAckVU%z2lp-IzG1@>bvsJ4zksxQaU%v5$t~p# ztmM$gpwARivp<1t8j5=2kiDgVh0PK>N=jC3WjGS*<&s^sQG?{bU!d3}$0*6s_aNHf zl4#sPDj~6w8G3BU1sUtobm>5GNOs^cZb;p;gH#Hn+Mu6J9i=X>OmeP|(jhrFjk2jo zUa4r<$QkdO^mR)#@4&rh6L-rF+)bOfTcu*pz~|+bA~j2``bcK+o~o1Ti!_w((Xy#W z{mnlckmM4sNQTWjNtI_Pl2*zO3}7!pvcPZM`cSiv5*)K(oxMd8O0DKzjwxErHkdI^ zf7Bwi>0SvJBJ^F71E0c;*ly}fi#RuPNNpI$eb#$`_v^CM-u3!;i$+(ouD0mI!E82G z3+Rb&Ln!Lw;I0!@=wL+bKBO4@TyR=N4)-WDq_NSsVj2Z5+(-+)NPt)DVc!Jr z9SKF@auX8;h2o;3fD=2#=+ijpD`ULC3;1@X&~dm#@QQ^G@nJq9dc_$}r$S6B#Dqdj zE5w-K##}^Hz*|YNZMs`QXTLAR3;4RJCe%U)t~F!Of0!o`B%=QxKvrPf7MZznM}`QTXL?2c1-x( z8$Too8P!(8%seQO_*M=1`@+nJbgaTk>q{3fnHT?f2b3uDV;5XAkKT zwf5YkT;85%JuB>fncaVX=*O4tUdpqF*QFI0WOHVk(`Qn|BM^HreK9kbCBOIWH2I_$ 
zO_4C4%uxDjwmJRxDZ^@6W!5!+B-@|cvv4ZsTX-vVE=3iLh&HC0Gi16wb1plWYg)J{ zSG3BO)KHt&`VyY+r0$t@Wd)|c*F0bcX&4~A}E zTDX)q?^Ts94$9_E+0dEnePUtf+JUhzOJ-jF=+MWn-g)(*r8C+8XdC(yNBIwiZe5(e z2ooW{|LtEnT2j;_yK}|fAln;OSzCt49867T$?V{QF-K(H%bi<1bm!7yd}-+J)dwvv zFCSVKmfJr&xkix=3)swv9ImZ>iR_^qv2gfr?0YaXouhIicbLWICGu|jgNE*#eP zOUu)`P6l+&1=4|xe{t_a`@t0T#P0m7I2h2(-W2Ah)6*G$wma`=keP-Q^_a14C8nu@ z2RU26*pC?7Dq~Ka%Nzv9iKj2iObs}azD_wJ*PH8E+_PA|L@kYc!YsbGJhXh`v!1;D zw7Qz>qiK0)?@CdtcGnO3v()!b|8Ov8$X=AoTV;Fe2Kk4*SuuNRfz8Didw$|tJaR{p ztGe^$J+i$AMrg5NuLFa9YyMl=$y^icSMWR7n9u-T0~M?N#$=`?btQc?+np6Z9L!!> zI4WD)7W;ofEcGmz?w(!lmiHf%E#G+V65UxUJD9gLB>Ta6RxGu$r54Y%H#M0$oaxOH z^Cz+gGp}xM1gkOG4~Dne{y%-^+F&m<=SCk|4kY_wvs&#LI=$}$^0slon2q0_UYO4L zKMvjr<{MsGYR*?3UapXe6@8V2L_)7MUeZJKZGo!^@+yIs9dy;6NZu0D`=cP>x= z%5YMB$Pr%0LH-y9-@xEF2Kdkr`Y}+C4T}~o0Pn8;%5%FefZG+}m)giRTTch|gxV z2qZ^E+C_Yrt3@)28xJC!LBvl#Y8i*rb8C?pJxS0L6#bj9O2%ufh@alnA{iyC-WDM^ zlwc)eva(2esmmu*)W;&?_0nXqy4D*miB7sDV*7RB=vi3v1pWLfIbJG3+<1yg*iw#^ zGv!LTM-80ueI{WS@ynj3*N#s*{Hl*%^x>LegITd~ac9y_NH8sFw`7GY0Dc~YLcsMN z9zL0Yr~eD(>6ZWmE6|_fD|7&JAoTbbo1Rdy4e*tBtq;II8g_bUOB?h@R()mDRGLVW z>UEG(A6?Z)J;=mx`YId(;Gq>mm=}Et(WfOZZ{rr6x5Tmb#v~N&v^NBVAU9k6{LGJb z9){}w1dOx8bI1WEx4r`B} z#=o93d00rT#(eMsjduy8FG0wL!oI8O;1x%NzbblT(JQ>*jgEQ`3HUCqPD!EQRaT)P zqw|kJN*9iaGu}`TG8CMfc*?%QRu#U3WvK~W2d?2+X=xlbgwTsY5Q8xcnlO02Y}jC1 zlBwfRv_-TQEuSz6oc{p7_?H0wCy5H>sB&M%cPlU-cu>)ioxFWz;YvOhp#*POK2X9u+)uwB4!t&5u`)Pdn_56`D z$-Y$+n@p@aUAIj0rYs4j6lG^~ihjb_R+uW8smi+YOl`9FPX>ZM0`ZyNl@TGyfT8`x zfA=a=u?i=g#ftx1pI83a;G~U(CdAs3r!|K#3=9mYd%{yKL}5)77k=yS)c(TZof3ng zxOzR%JOFM39Dh}{yFx}|T6RxsT-aj-9I6Byr4**`tqb0>XU?5h%meSd?Y-DL^v!dM zWsIK^f^jkEk1LE1{)`|_MIbWws}JW2H69cdGrXe5Mfl5!pgNCYJP+^vQ9+^A+lFE} z8;u0kNfg{oREv5HMah069>N{~alGSRf};5Wa}@r7F%{yE3DIQWaC->A<W|m!G}K{17@OK+ z!Wg4tW{g?Zu?#1hY6@0NViD`g^uT~@7_%c+<%+XWb~fgEWM^B!fr(D!DlcH21vfHU eufKck-DhQh{rQi7N5Lw`aj5&3*6x!H&i?~h4VA6{ literal 0 HcmV?d00001 diff --git a/model_executor/layers/fla/ops/__pycache__/chunk_scaled_dot_kkt.cpython-312.pyc 
b/model_executor/layers/fla/ops/__pycache__/chunk_scaled_dot_kkt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66e099caccf8036fe0bcbac14a546674a1dad389 GIT binary patch literal 6716 zcmb_geQX=Ym7m?^=W>^ylB_RVs}I{!tPjh|C643dBFP_e-58Lpvr}Ijn&Pe~N)$=$ zE^W)V1ZW^d$U6g~t_4IF1xgnMLe&DI0x7Bk?hp5$gMt3zh181O<^UH>@8JKK%KhW+ zpSw59Pb+kNJ!B-#zIikA-kUdXW_~ljVp$qN3K{+>IN(O;54ccHt`f2LuOvcih(|mT zLf7z@3=xauHFA-OXoXlI{e}SYlh=v z$PM-(Rjgk%+-o(Ud!6v>H6XtsPR5Bi3LxJ8V`9Vc5XOp=Zc$T_uFZL8fY@-A?dIJA zIS=~xC*JcCe#1-njW6LdFX1=6gnytuf7Z@7e{6VIGIRg;8p{*O)2UoMQY^JC*R&obdBiOS60 zLVaD&u#iiAq(;gi%_NIKITC(`W3P|U8l$2R^@)PlKNkxxc!Oa+=ociYpoH5|ek&2l>ATb$v2?}~QNhnIlFLY6=oK09(hn3L5*BYVI=f89uE zf6OZhOCcdFiFk()ZIBg*Bx3oxHa){@M8digDBF`&)UMaJ&Zu4Qi4I_a!;OD;`K6qqwTKs&Mvh$sQPyy5(R<4$ zMkS5Lq$K+SFi{4Ln9zue&}drp;O?Ym_D90A!GNSu3&OHSOv7Zl_=qSaDu}?piUz{eMQ0VgUr1xx88Wqz5L`}S@rYG}aFI<8H&;#*RiW9|!dn=|{LNUXy zq#G!Dmf#H!A^LENAE=R8FZK@bh7Duco_KRj>)N$b6owR9p#x-16xIO-8>Wi178$QZ zJ=MrOc|YV7-;DwATY9W#80DY^{2+OeyW0QsG=kQwNz;g#HW8%tV(!JWEH13n2m#Y#M&cOUP?Q7F?2ljmY zi+9%8TL%y+ok|Ow041myubshq9;3=0E3HX=7bvbySdBQc^i^9McRZo+8TDB&iLh{x2oa0)W4n*g8F1vr)<7SW8susm`? 
zGx>OMn!xX(%2UuqSy^Q9Yu(T zF;#^TZMay$5*iis%98$V^c$)~uOkieGBRJBqt3 zXjBwuGmse|136J|0G9)r1u`!moSnrYpmo$cDq-umS0^3LX&v?OrFZrg?_ChYun-a_ zK^h*BbOq)pfu6rdAeU#2U1c-AZ727Wu1^OZ3~UnHT_=(aiSfL{lU_FY-Ee=_yxzRcoG42y z)@PU0hTd(aZ)i*Qa+0Ogtv%dQm zlb4fJo@Uczs`vX-Yu{P@PG%|FwcU7Fr4Pe!p@K*qO^koZI(FGZDtjnzVUkjMGIc9u z&v;Zzdu9pX_+<7>Ze-KFelmA#i`qj~tRBs|!Nu+Us;hsKP@TgnGn{AbX(D+p&pOhbsVnKpd!5MPS_z~AX@Bm>j(r$h;Bcp}{9rUAXU?yBRZwrpTWt5lm7A%X zX?`uR8p!y66uckYZtC6W`mA@ocXQ<7z_#P0YB`yr3I~w$@K>$K!d6Vq62)M6V9WVc zZg_Ka^YyLHt%gU`qnTe?w*KMq)Z?>HE??cYf1nSuWJ%ACeF!Y^H2w2ZrYC)C^>ns7 z%l~vV8+g#KdIpo0yxsNv@r>nr@2mFCB=yv0|Hq|tPx4mkbf!DQ|9CVLxZkhZj^svu zy0qDoyS09Lt6M$xj!M4^?k%}}BtvB;x9JX@vrD(BbX(qHOzNsq;ZzHk z9nQ{VkLJcUUF+vIM{{p)dA7uVX?`@k#Xcq&Ky4Y=b`Ngd{Egw9eq|Ez6;3>fGraYPZ{bYeeCW~=g{zCkL}GF@ zCVTb$R5OW!923L-5^{qc2;oZ_-c9%3fKvmx$@MN4(s6>;D&jCHn*IHBJ0nVzRwJC(yMA$`?pkPFdksPHkx;(&lifzO2u%w$n ziXCqBM&G}|;Y8y-bUd;M{uVgAGjp$T7xC|-=|Nhr9EI!tDV@Si7r0(HQ2cX)({Oo? zh<()>Tsu7{aBwN*WWAD$%yQD4FDh_1{(h{Vo9^c(`nikOZwywX=?Y&ZP;c~7iM8Sa zuQbjdTa1N#a`2|WNwGz2pbRDld@%UGq=7pl%=%&>`4o5KQY0+YjG+3GP}cqJNI39P z=hW&KmSRHKFL3Y-5s>GkvN6Fh%tgtXs;97bD@I@&Dn6_(9Ih8oM4cJYTW}IAt--2{ zH(sv_uU&8DVHCm4*QZzKmu)R}EE`)Dexn#8CAe~<7)Vq4I9YcC z!f`9&RW4%SE}no)qhP84Wa0SNjKX46UKX$7N(>+R9Q(E51u9PA>@s8;F;-MqMN=4I ziyY3dIU0FsTErGg_;Cr(cTFndnC8-4zwl0mag6v8bP;ghpF@^Fg?EvsVeRPZ(YqHD zqlu+tSJDTE_6nV%;Y_qAM)OurV(jVg+fRni?^q6I=5t*;mVw991qvD61ry?4d(ts{ zuW6+%)s~*jkUQ44&!w$*w%@#v82i$~uGmtx^k{l1)0vsd5uaJsEnCcvmySpD_6GO!_(G1&Sbuy(UDk|A~74h<~Q#@^6XrM8SzH^e4()rQp(wZe+EsFe#?s(Tfe`+D5(DRIWXs7n{q)7QNVtobDAR zr4-uqVmkuJdpdRcPX}>vk3!_(Jr9_&$KnxldYcIn9qP(+J^9xL^G(fp?(nDV1NJNS zZwNMTW^}VCpr01$G_RUl3r1kTm~GCL{#1XaL8T8C%$Q|CuBKflr#iXp>#Fl;!HStQ xay1lem|~EpdF|Nhu>uxeup^W0lMnBH_}l@d|9XCv1ZPQjz5kLpcJ_e%e*xs)r_=xd literal 0 HcmV?d00001 diff --git a/model_executor/layers/fla/ops/__pycache__/cumsum.cpython-312.pyc b/model_executor/layers/fla/ops/__pycache__/cumsum.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..6b96776084a9d8f9e135c22846d4999abe7d3e35 GIT binary patch literal 11751 zcmds7Yit|WmA=E_b2y~<(A#l5wropuWXY1gCY zmZc0)InBak8muT_7p8YnlwK5w3=4>}2oMD*&;(c%F80TYp<1!C2I`>MU3h;fk+az5 z*Pc7Wp=gG(oK`4`9Y|;H^PGFn+sf~Jspz#Os+SO}y;ocUIs9j|AN ztZ7&mCRsCU9M%m`P^%xU*2J#8Sqn?OjRM3iLM>C;)vncRDc0JBW{_Vu4F4CZ5Q<@b zTeRA>cK}Ifp1!41OJ;TN*65)}3=JdJ_D{sTU8_;#Jh9eKOR*Kh#Jocr4eR8{(I>UK zwKwZpr`)|xdF49gmUYUj%9jt(Z1vlEP0OY40mmb>T=O_u<|xoI0eU7CJ;Tgn1Y6sM zF4U~Aakfq(0>ndFC*Q5pqjwq5JQ)KHp!@p2fbO@H*BQIsH>q2v{bqeD2m8b$zLk^R z$`b5Wju_p!wyw0)ke+=KYIbY+wKuzst7I9jirv0WJuziMh}{7#8lXk}T`cfD;GsQIfM-EJ6NEob?w9be_!g=A|W zJRTLj!%-j4-e;iu{ll0zfXJ4fe($SC`c8HC`t!yWN)gy;gh=>b#f`3kxnzktNUU>3|bue6Gx@z(Lc_+Al8IdSjhXvee9jCF9W z!N6ea_@p=z3AbJf21Bi(2+IY%+-1%m6(f9W&^O8PLhDe_*BTiYTK&wdMnufH3#XYxnFLxEvICdar*nK%PIoovJ#j)-1i zobv~K!N3&9W1qrf$G~sJ1RH;uI?;Wk%X_S+um6nS3I*DOl zZ#|w>47Z~A*NVob;JAlBk%S}%tdRVdmh37g++WfU5v*RaN+w)8L`Wvc8*afG!cWY# z0bk-zs$ZSTY?i{xO_ zR`N)dEHQ7=rhMWF@y$Rc=<2 zdlXyZlBXPY*Jh~b3EM$<2sDX2=3X8fK*e1103E?dD`|M)a_}kk!ALaB`uIukap2Ox z6zhGL1A>RabeLbPI_u&LXe3FUx6jr9`dpLb|@?70u*P$ zH{c#*z$?o7fOl{NCkHW6Qel}KjDU0DFd6qMBm!kPHRx4RQvy~7Q~u%cSxjcs@xbJK zL9aR=uK;Nu?;Yd#Fc;)Ip&53QLNDlY8a>#JobGG3SzF4Nbud5Q@!sAWdv6oj9qsYT z>7%&{ck*IlGPN`9yxtUdPaplA$?*kkpE;g5p8S6L=-l=j$1{~rW$ET+x-~<$eyID% z{DC=3KYRCEZRUKL%GNBsZ<*eoq4$5-`O%3FPGsp9Rt(7QjFS&2WUElPo=m$ll{>Ta zu4TF@LpRO4-mkq`o2A<|j%t1SVy3bwOM8~-<_z6D-}nBxo9D9h(|K+yW@K|D8xlPU z;-0%Y<(%zE9Zhe)emw2F{!+X@PJV83-nBcH?b|Z;Z8_SOB$K{bQ{tsmLz=kWn4!0) zkE9RGRo*6V4}M^tdvl?0q5G2qtE8@if`J-Qh4WA7nIIpS3q8|>s=6E2Ie&&}yM5&L zfraf0m7kEG41Q`}c=Jx*o$kf{vswGOI9V9ycj)s4Mb9)Q8j}rac8PjwnQF;UE%W^Q zQ#Yrw)N|8E??SaF;Yl7%`IaaKe^NV0nHfq9CHqqsm+af)WUj&$XCYTP`&!2SBzzJ3 zP8f>bc=g1U6Y>6J!?oSByHoC;)m^V!uG^cb+naT@EQr6=cky^#Jw90Eu{Gndf#RKb zK`IbYS;h*MQ^-rW7S8KP;q4(6bHL*RK99{60JVOPac#%RM8Ar=ut8P87vBvjzYU1? 
z6j0Zb;LEO|A}fcDOUd)yQP&3iNX_cSDikbMk;iBFjqw@w+SbRwXZF7+J}Z*8X(-12 z5QZq{L3NS~dm>F^<-E+)?gBn@G0-I^SDIKkJZB|LJt@&?`vb#;if;6f4j$~;8VT{H}YVz2a~6O$VMOU3l9V0oEYJFj&H^ldvOKHMnrxe z&bDFF4#cyC-;dJ=Fu^#K{~jhK=o5QB)f;x<6b?l^j6((fX)vKxr?q!LD8~gZ2TGwxjCW zYZqoOq}#GpyME5U7rPO=eLCCN@e9{4w|>0!lZ)ABd*Unxr2jt=Du!?vqa8`|S*pH( z(MA#@N#P&bQf#VYR!R$V2maYP*KtG2)a<|AztE7iA4-s40F>LX*|#Au!2H>o)P8{Z z9U142v@hcTAl#U<*^@;4SmEVJwkOUdg;d|{ zfhWLbS2>=uThrS!j)n}~P#Bv%=}i0}c_e-^2iX5l80MeH(uJ(HJm?-;qV_IR`!dwN z4_zPCe^8&L4yvGgcVai_#GmGtsAed6G@QQlTm3Nw@*b0kVF;-zkz4@+jIxR6E3MyqN;vJqDIfFr9>pK&|dA=o_<0 z9)C3pEVy{IP%Nm zA0Jn19Mf5vgZG+ww!xw&QbGq zmDONesYSGH`h)rIVfVtCCA3wp>^<0vE)}Qawzz$WWDRedW7eX*YgP6xrR-gcWK7Tr8|+@QvU`oU@}tl1 zUQ^NT@`f_Y>D^_m?%aGLqt!-21ZFSMQg$oRP4np-oT_pE!dzH z+oi*Y53g^SRORp-^!XM13ts~HMzc&+)z{9>o=s5LGR1wf&U+6RB%L{nd%6=W3thcv z-Tr_?mM6jJ>{&8@@1D)MV!~F3dZg9)4#v%l9L`6XsQ=Luh8G{u$gt}X#tE!c!9LZ{ zV4v#Ou}|RB5#AN6zX1D$Et7Dzyk$~viPx!97W)_+o%m?l~Oa{LN7hXvGA4&i|==${sXk#7+mTkqO^%hcf>{JQ`~ zggg8Q&nR$$5u(GxoFEDSNuXSW`)+0^!iRh!6AB3M@{c^nz`Z|498A+xg@-xF;EQz( zPz5OXm8XR{GZGM(kZ+Pvev{x6IYu1eAOo@l25=6?jQa$EW1)z_{q`!rRul8X&SvIB zGjpbyX=!P>z`!|KZKcBB7moRY0>@1F1V*`z9|~}63-d~l^Fb*f=D^S-GZ~5UsyJ_bbLI*V@AqI*M1ejz|SujxS_ffU^(_aUhhgkd#d&a;k7=BQu0HX>TLlY0nB|Q z5)HB#88#J$)#YIVL(Cg%2LV+RZ!qxXa4S`1&?*AT<|hII=P@a!S2n|s9>Rez{G>sT zQ>yLt;qhQICsVKbf>ExU=OaAycqAH>jbp$8e&&HOdfuk0w$$})N5hAp(bw=VkYF^x zhE{@IOl|*JhQECsNMOEcK6Q3->;CIs0yPR_OpR*w+Na25?kg0sQn9N_Cp` zd;oN!xINj4e*@E^{;bCv6$3#*4IE*Ui0?63?y}-IWrJFY)y`vEESoy}y{CHm&&Y5x zjPe0N4EP1v?1LZCh|w@78;>eKVv)(wfGAtw7cGJa>z!9_1$pdZWrK1ZDjUxzdmpYR z!*E?W7>NW`H(XdA%2x#RQIAD;0TFQMYu`TgvzrgX4|t+M?hyYbl)`65LIPI$3P}*e zY9%5Ze?atqp{D;pO}|I`KCj-rRNWMBkMoJ9IsSJ0+{m;EoTbh^UA;`WGK4GnQfB+9 zMfdl!L~o98Ojlpsc4gb|x=&~6zE96DSmLkFTu59x=%Lmdte}S zI#>oe8amWpsZR+hZi(2rqH89)i4_`|sjJczX~m{wVU4KS3MK167V8Y1pjVtq)`cLf zaz1f>#jRv3k;643C8U)qC0mVeQ0ZCvs~VhLC6R8|ss&`@)XN}AY4ke$!u65!`O=8o z^$!esoAGm3UA*((>vqI@;6l?-hF;dkErq@pRSv+I-bK9I({+JJq~PQ-E8|S8DkxHD9f1q(;rxYAUHy^Yxle 
zwkUa3DNYz-UMZDov)6T~sy?@SPp;;PTx0WlZ{B$GOWO{@mNU`Hw;>^Y&*5G%z#A5X z-L+!EDKm0bV}++g2m-BGa1n)^RV!AUqLI7y+EcSnrS%!t&K1lJ41m#k_579ds}(r+ z3h`~3%|$XHt;mFTUM5bJ$izjeOq?o_iHp``;vD|^m`9n(Qbm)c@{^?slch=p?;xnc zWT{fYxYCiI>{m|6VQt&K)IsWxU&V~b)=&I#b(DZLBj8Zbr>^>wd#HagHk@pw{}20j BQE~tP literal 0 HcmV?d00001 diff --git a/model_executor/layers/fla/ops/__pycache__/fused_recurrent.cpython-312.pyc b/model_executor/layers/fla/ops/__pycache__/fused_recurrent.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58228ef8fc45641b42924fd11c04fdf230465825 GIT binary patch literal 16544 zcmeHuTWlLwmS7cMq9{^)iF!XuFH4jt%9P}{tymAsmhD#jYNzAGLsP1fM2RA$A|+eu z%eB&z#gN@SW2!TNsF_)q?hO#7<3)RbS&RcTkj8v0f?Z&tYGxq1;130x#UcwVz_bG_ zu-L`!xtA)wLQ3f*yI)(jPMve^``mNyJ?GTrf6(bP6g(cqFWl;WiuzBOkRIDU^6Do_ zirS=Dilsf&9sHJg=mpsw*@FCzoTgYgtMDimly{UgC8Id`oK~tJ^JbOrQmhL8O5jfo zf2Dt?_=rxWlQQL*yb{K0*s>YfQ>~Pb)o`*ogOr+ivt>p2I^df!`6P@jFT&Rs;TwwZ zjYasTB7AcZenk*Ge^x;N;8~!vn@sVtws24MfmMS_(wP`+fgK~lhd(QNUP8EN5WWo zt%Rc3t|BE@%wR6=6}4| z8(3KEU34$@1c1v0dLAsFICXOV1lQ|vPxdaZ24;QU-W88$p?AT@avmr5fSXzl`1oFr zYn9{uz0)37uWu3F%YKe^@{q{$oHt-!Tou)4J+6gG)-}lMps5P@^J8S4+HTtpeAxAT zROmYY3K-O%HVVzuveVBkc{s11FNM-PKBZEl;qmc8S|h-M6Y%M=PQ}_{NN5>=He1f~ znDwW%Y4rOSoc@3-z&YJs);+}?)->L4{CFNG)0O?V%f2FHDG)fR4OAacg|y$FKkRrgM1c*2tytiHo8E_&2Xz7;%nX7%wjpZAwLl* z8IoANpD^wAx;!~4rgRY^dOZ7LJ`d{P@qCNs@ri3!hej?q$HphpgTp-^zHn>EdFjH? 
z=o{lVFNjrG0@trNul7y6e&a_V3+I&!HzqDztt;}n2qaRYwl#Q5bOl>Y?Mj5#cUyhkee?b=~wKTK=E&R_@91+AzP zb*#Zg5Y%~1s151Xz${I_b+|2(vpkgo`E@Kk-wBgUuTrQPnQ{KCo)g12cYrL7%pdK~ zFR8B!wV;Y`RjwkZTsNXn2h0xc;SpRDGGd%yU8H6kq7TW@gJh7yK8DD` zoIGS)1?E(e$QoE1!Q%m=w2ol~>!jHpr6TgX>sa!K^u`97_=EH&c$iN0{9rAeK4eUG zSWQ37n9iVjeiCJmtbkLU)f;RC<~`^XI)(L5nl)CHt<`H~Fh{3Q_K5!U7EYtnhqQ1S zdQpb+KwFG|D67W>R{z7JQeVHjZU)c7I>1&2dC*y@#x)pS@VNKlAw6(N`M*aG&gJyr zT<$8C>QD6toyqD?p*cK`PUn`#vINUVIxk_Q!833o9k%v+e=yA)^!8=-xxg%qXdI)lC;19T>D|HG$zYEQ~ojD@9n>L6(bU&|U9Hd6B*)Q4UxG$zzHSDTj= z$U-3qNit~QX^FHC(-f-ZhfzsZ+l8e1f9Hg%#QKEyh2VLxSW4i8B4;1U;y*n9z0aoe zSz{|C*YoIjLAjp)ljV9WN3J%c-?ss=cGyr)4&4LuY1lt+ae5!J=g@Z8u~ZL_f;EU! zk9yS)9gv=~$uP%(dV#LvWys{E!QPOxus0(A<)164Rm!TK;PFD`A%*%9V-O=yvVqk_ z4k8bV1($zbRQdw596$4y|Lxz#{`HP{9?&8WC-;4iOLjP;a}ndqTIi{z=QWfiAq027dYNd z>g7D#0z?y>fmQ2}NH2->yhyKz^o*#O7c7 z0O7IL^ElGRH(=0+K@$cH2H3OXTQF$Fpbdj|431!ceM-Iq1Mn6o-ikr4i^z8YPn5fz z^BAm%3b)fci%~B|XIC%^^dyX3&SBv4ctp7mXZJyN`J!_P10WZHR4qEm(13tZ0w8>! zsKV0#nNn}aM`&CxuH5Ym;LNi}F%8vC;_4=G?UT3~08nyrzvfA&k7N$`hoI-o_kBpv zMeNQC06AwMSi(9XWE5~Y`DG91oW9RG=Q-ZXd3X-m#Z%?4hACgCzWxbKRaS4_-MAZb z#4YXb^B)AC1fN}xcbxd0>63@WsgGm|3W3i)-6=wVi^t^Qq~#)gM;JwS6yh(ql_PMQ2=V z-PPIzt?lWJ-~QyopTxDtlS-=G7?LN|l->}MC$#!VYj`|N@0lxcQ_+!F^VY?fYwJqr zCZv|@Bh6thB;PAhhlYN(vZpTJ)950yaOYp&*nD&2&FE6B1x6)k+Hr4-BV|E(YElzShG;xDrj0`Q-a2dg;A9-TB7DCzf~O@ z-D=t%5~_OyW6v|WVC)a8_8?6jo!nAyG{pu4V@F8!d6^LprWGXI8TLePK`+AcJ-s0^ z5^dSI6b)>=4&pL*3i{6NenD>!DWKO8d1Pv%G&-`OkM#-q_HBirw>^XYI|Pm68T9|e z`pBNn5OIXxSRYGd5c98wB7w-ours<6V}COgo7s9O=Vij;ixQf^8LOK zPCq&QnZ}OCQvIT=`GvmeB`C_w#>`G5uD$U@yz=C3URg}5ws}eZ?xxxs5(ixKms-iL_s+eO-3o@wg1Tln06MA!mjb7iFjTtvQg1#ko z^Ebxr(e0`yZ$BFn+K&nPW6$N!NB^n$`Nfa5f_^llNT|zx$v?Uuz8?w1jy$73)A#M_ zakhV${KvV!pNs2<1ocoz{)N6Gay?2v`e{i1vfS`!COi|FimA3;pOqioE$2RI-+ZzfXB^K)1?FVD`jlWg6&m?= znaYik3*q^Y{Byl!U-Hq`_iUTJ8@=zhe9-lzD{kt6swueUch!|JGE0L+_$gkN}b+uKK;)1O&R?^@0YmUF+G{ABLqxwvIK zJobf@;XUWBr9-fEJeB=c^Pwhg=?{&;M1@wuI#|d87`jBj(7|=JdvN^ycF?x8B@s 
za0m^KcxB)7%70d$|4Kz^o4%Wtc4IB=###y>t)=Fuf2(Ha$e>U?D3}MogNjO>u886oBC_H)%&Gc;wWBe81!92^t;AO^2ta2^AE zZ1JN2L>-PIL%=x|E*8yh@1n;w1!qkrhMaF0RKbS|=hD2>)93Z^V6k}j4<%8>bAe^v zJCz+t!ckBX>HV(&H>nv4(R0Ace|}9PSs{Mv1Iz~fZ&YuCZ57l8b;0tWK4>VCn|y)` z8j*TkhO}N?0JlsYq?RmlR!-tYrU16p$TNqnro0E~rThW>g@inZlrTso*i1yMB52Mk z1#znU2kg{zO!SbP29%D11}lRWWMSnv{^~EeOtDJo!|l&#Ru!yb<-uwhe7-7lSpZpP zipcucAmzWJf;Fgwmm^iM*2_Q*VE^VnNQaEjZv#@z6^I>yUA`~bU@hcG$Br_!u<5x7 z)}h)%a@3>JbroCsHW<~xhG3)jI4Ui8?SROFuYlA8TLj581{=KQ!ucdAO06Laszj!g zConA!HU+RPOCHojJ`8rbDOQ81_h_~Z(Eznb22h9O0LzgApdQr#1X%$zqFR6^qy`8* z09b)E04+!f5LyCQg>(R`krrSLDhF7Li~#G99$-B(0Bk^JfQ_gEU=ykX$e?TnU2nM-64B6tozXt=-+4ozB8-u--F|ibXQ@rou7yY6J zqwpbaX8)t6RSVA3tZR`IfN+!|+P4&KlCY24Qt4Dv)Vi+RL_BhDQ0<1#KB z@#u)Fmv-hv*#vxVoVd;Va7ix)SlD7&_R-La*|`p%sF;DzlYRzL4s(oBj!)aSaeVk7 z)c0HX^Z!Rsj&&+oPg%Mm*Rg@JQ(=v|9?xve#3r}qg$nCCKUyDJUkWvcu1DxZspe5t zxGG`{*RBsIbX7aLrZD~f&F$KM?R>T*w4YobdZ{z->RJU|Yiua4JMvWibZpl)EZByB z?}*zjfZxemuTwRZE^@rj%UsB zrhy$r{a#7+P6?B6z|n8tQP&Ff_C#szZfTQH+7xsAp|maG7~XY^3y$$mo8ykFpSpJH z?Si^?PvXWMS5gImzo#va$Rb1FzKrdZ)vb@dRP;Vm2#RBWq!^{@g{CYihxh03R>B)h zuQF}21Y>H9m10aom79`f7}H{&4rAq%wkD~^n1Rw(C5;#}Q5B6zGsY^YI$P3$u}Z4p zXtD}p)s(&>S%a}!TuU9s>M_=Uu||wFVT{38GsapdZF#a4V{Mq%j%dqiPPJmJ z3uE0Fvr)|_lRX%-Q!S^Hy%;-6)!UN}jP+rxAF$*ARonQuW~(OV`e6FW^dD+$$zwRf zaa{Za#!ljHpTgK_%3w*J!Pr^KT%SCLu|XKn|ZZv>yrynWG_==&i)x=SEXqLJou30x&q6ue59x_DJqq+}{pdtj=LE19ll%im4gJ9kRql8-^G9K)AqZb)}$ z#+JQP0k_XP@qISSsch2+8#?%SdiC!CWM5bSa~AXS9?6h^L!_MsVz_615!(raw{DoM`ODNZXmOYjm}maCwLhok=!Q24kc<^e=YQ4P+UA zsJt~XF)?PssKPPEq4@!qqPv!E=gQr*mo9<=Ics;_Ex^sRa%$iLs&*; zqUsi0PxA4ilEir-_UQ9@M9o#7cP5p<|2d2%b#V);W$Mb-LyTeYKjF{61K?dMp)@8d zsCH|@-Up{cqW?sqv*)Y2MrGq(nc>md@Y$piaERAvjf5(}lv<)nF{P2HGE8YM&J~H#+E)$ zP-jn8VOBNO*!uYGt+%&F1=vYzFsqhoIr2x`)mL&#W%^16SjwBlL(Fk7(<}1`*(#Cx z9Xq%!kSqV$zso0^KLL`e&$x!H{Gj}QORgtYgLHSuWU<+cxHQc2{t}FWw%)4u@-T z(c=bJ!#&O1yL21ekV{!zo;k2NNM*Sx*Q(!IVDLHZ;ZF|-R)sSZ8h%bK2KG7zJ})shxWEJ#U|fL!eDi^#jDHF~1^F{GgT*mE zO?)r-;1Rdk&_T!QYiSuUPNdnbB0rY3AO9&&y?r 
zEhl=E)!}R{%J2DjD&oYDtBcw4oB&ON`MW}`Ntq+ppdAx}9* zmKT;iaI!!XlHiz>fO1rCqT6xz9)nFMBJC9y?{-akI7yZP-UajPow21CHkJ~c^nSM= zD;Q`ENF3|kn8)Rx9ZGB6(6#Z*n7(oG_gkw8xvPq0wcuJL&E%;9PQtD4D`9VzsI%0h zz}??(*@KDYg-MuB*zTme!_XqWJ4~d-xv(Pbt0luO^B^f$LQ_5#5W9xR4AvtAM6)(n zpEPv^H`v#g1EA|UQ?$Rq=5K3RKKOw(j^QxwAuDaauNNM;78X66|8z#F2L}fk_X3X4 zz`(&YN$Wv=g5B%2PwyMu&>n1XV>x-JdEWxVxxH}DU8<1hTs--H{mqPUBi;@Rx^AuWS+9z2xvDJE?{%sFPj#`GN>IBCNOz+;X+0&>bNLB7;Z z2P0vv7~( zxKg+uSZNOI$PDR-$2a45&n)=dY*tUCv3#R~+?L64mo0>%i1v#^Ao^Zf!ZI*}2)Hnbb_DTMKTL zWM}f*$aa)_#9+$;V36;Sd$!tKS5iVdzt+l4mm%=gMPYsj!pL$Yi53bVZg8% zcBXz7!aO()rS_Lm=K~wlG06lNyWP(8;23KsP*zNKSZz#aeofZ=A-R}T$H}HXklxu~ zP&qv%$;a@P!UuXe7{{W|4_}brP+(kenIe!AF#YqHbD5E)2b}6< z=04C`?=3i^vXr8bLh!DXtkvLSKzLaJr#LZ@r#gA)ZE5a z_645z@q8FFO2PQ^J3Vf=go6DrQHf3JRcrYV|HJ(}B`3D34D=66n z{-6I-Kp;2_aSpXp=+LZkBA6Ks2;08W0SvVdWlNELi&a>W;c^r?xMy z?hCCZO3K#JOYn;yh9AN`mCCKvxZWC4ytH)hS{#DKkx-Xr{nsYcx`f)2P}e2Yri8kN zAWg4obd{=iCLn74QwVEob=hG-{#_Um+%KD$NW~Q_shKe1AC%csKWB3;2izV%xkr}z z)*$mOrf!01W;rlR{19P)OH*_106zegiKWz*(@&(H7l(kKtffL7gZ$q@JuoGdKLR1Jq@1SdSCt^=|3Y>C zXUYsw2bp=ja+fv3`WDgCP2FSdmNw}iWS>O# z6LNs6t|c*(V}v|T)z&|*->OfZAmmA^s%F!(;Ypq%3YVtKgo~Nb4Lo{JP+*zl8bTxeSk0XQuAr)EID1 zF6&n{mub2-QKBVclw)FVpHN}~W>2jnX(fcj%Y~4ogw#+FU$TX5$udG}A&8XJ5wx7D zsM$QbaW<(Zqya+fNh3i`l(BNxzz7Bgu2LB~l4ioK*pDp5SfQ#bX(7Bys3NA`P zEWV2HAcEAAB%KD`P?d#_;eb}ug0tTeqYkRq4Sr$tvYg(n2Ld1OlaXOQDq#f#vQ_oZYM+nO(O@ zE0 z{U6`V{v3$}5s)c-raux8`i(Ij`I?*h9&A>ThBR!V3AS=3&TtbPMjEdPCZ7=|1dNEF zi3Az0KhH&D)ud;iPYIguQ(>d2JR$vy2S4#W*NdbDI=uM*x0hgtm%u!Zhjv+HSK`=_ zY^vM2>@Ef#*sx8qR*ojslsZkEpjt>;WY&Z;N20_o(5%W7J%X|#9!5O|(<&-L88vi* ztx(7l=q?oOI*RjGC{x`13+Oxe2$g`dn4kVX6N;V5D|tOXXs2kJ*n@M0^CPd$oF|H@ z-%|4P_H-_*+&0aOlF4brOp-aG7VI2V%+x%gmNI3glw96Y^sGkahVt`HI8RAFMM;tc zO~de!DEr!Rm=@9Fki3#wxpnVS>D@JJohv2Ep|jh<**XXjBs*Qm&Ll1UE>Ssl(j-5u zZjv8C$g9Y16~O4!YAV~JXJ)W0yT}fB2Sv2e^i#weO+QA+leSk6L1So{Q@J!!xet#a zw7_*l!E)@PBIkL(&wYMe0IP-8$pYVDd!Wk@*dlMRigl5OJ&(rWE)lS8sm8IU7x^@& zajm@Me_FKjzYfu1H^MtER9G=wyu?F}2%T1^9bQe@l(oPSR87C_b`^k|Gj+>$q!dkM 
z)5H-iJEhJz9GP__wUDt28HclTalu6!Wr;dC>-YdDx=JjIA+jHcvZ6b(hnfZkno9Hq zkR63#t-!R1_WkJKk!5{J-QjHCiV*~5g zDlujAR5|v_=FRP6uW!v(LT^+b7w8kG&B1u^7~T>>YjC|7^IfgZRJT zT0}+*u%O-~4X0b}w!Nnk8SEl-k==r9yP<7Br(2*iB3*{S0zjHqpz%hyjb|4m?xQ`h zPzFOMm?d|TjYr9a06oM+4D%E&0*Z6({#wk#b2^%vnzD#((LPW)4a0g9Cby%|gps9@ zoxtI0psyV0Tf-%*5;*RXWtC#gXOf?>3Tt9+v7pd`Q2RrU&A{+a8tD;;Mtk?|Ac(F literal 0 HcmV?d00001 diff --git a/model_executor/layers/fla/ops/__pycache__/kda.cpython-312.pyc b/model_executor/layers/fla/ops/__pycache__/kda.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..00aa3f47a65f36e6f7d02788abf6bb32f9be70c8 GIT binary patch literal 53731 zcmdtL33yvqb|(0+?>o3};3kP$D3MwyZ?dR;GhL$OrQ`)B36P?=Ne`g5da$XAs}0yr z1a?(MP?a-=-2TRN>`c)$l_`3X`AX?;raw>5*J;9#i!h>^s58Gu`u_ z`|yBA>OqujRXXV>@#5b5?z`)`_nvd^Ip_YLHk+BlGpc=U`0{Ua+@DcEdg}6?n;X0w zcZmyd0o5pXO#D@isgJ2unWwd;({8%~j zTSqI#DvwpFxFIfJdrNh!Dp<*D;kUn~8rR_0fw*cDC;bbY2{?n?i7is&ioXF@iFkL3 zcu$FVZ;AM_U~Qnh#Jh@MU7!;09xm2L@q4ICG!D)FWGw)&25Ymj?wut7PEa_peG z3KbZv4%EG@JLZ#mPy7v>{YTq})Y9eqv2-tgwO*?!da2weRH{B>f(1E!lJJ=&kkm+kn50uhnz^RC>r6=v3&T74jJvI2YKcJZl4` z)}&1((r+#iza_Xb(1rDCYl-&{ApRRgxRiW>2TR016s#Nc1|G&NX)CrQaQ+{p9i`eJ zuAGlx#n^^Fv=-3$C~CM`&F$p^+W~FeLG_6zrM4A+-_wZxV{1#o?T*0Cm$_k;G|O4K zW5paKUtrfR?jm%?=?SH<7JUIC>D zDnb4JKB-pI+SjYK-0O|_Tcza&v=Mbgg*^8Usop>Eo?85t)kkUFIfzt;q*@0aQ=Y|v zo)Yn;R;T?X((f%1|9FY`9m?^8_P{r8pEHHkW&IsS>F`~R62{}AYCz3KsEo!%c{GMO zLG^yG)W_56*DKKf=>5RnKSb%-$Q^olLqL`PcNA+te-TC{-=O}Uv4$r~ ztUlt*Ern-^ZwcEHeRswOo(y&do+`0weKl;r9e5h6$i|XT_f4!U$ME+I{+@k9Bh9vw zYs7N|eOMuY&0>WBHlAM>%1f^c1c{5*1kLM}SPvQij~ZGJfEPdi)~|m(@GH;Pl2}Zs z_27jP`Sg{D?=KNw0ps(ErsS z+!+`Igl{eh;X49DceSpHyPt+Zw~j$~J7Ag%9LKJ=5YK|0P!G6O6`OKCujo0 zZ>X2Rz3%lIzsDcOSmnza1*ax^GfFo70qm0`j!{-bTlvJ^E| z$&+Z!1Euz_U={W+?M2;1y$r|I2CMTpt}bvYkK^iqW4 z&?3jPO3>C`<<&mGmx#Y#w0J3+*cu4kK39)DnBQGqDoN9onss4dG?OJ}n1ngiqP4Z) z2`O^IuzH-vh(2@)ZwyDnv0qkj0XVu}+ ztUk;ShbP7n(hdZMPiGzD!Lwoi$%!*T-aj$u@BEzDB5kT{?ZENL@e%(}UpN@>2ZE#F 
zK0iM>8uSmI3HXoWy<2=gIx)~U>K~XK3r&t8cFo{q2fk}TTcxp5_9?H5xp)oP7 zKi`zBXMA$Z-#0K2JW2f+o*1EY73d`YsS$s^k;CJDu}89cJ{X?l#|Nyamh^!2$Pu}F z^Eli~V6qCT6Vf}`?^{4OmEr>`Vk1tgeoz0b4x6_%VvE=#j)=2FYQ}VoxCG;rTCk4W z!g)fQ;{>CGCI>XET9=##c?6S0WP4Eh|6!Q0{TcrKohA9TcrF>$onrJDHK=%kt)H!YXp6y zdfX>)CmhmO#b3cDxKJmjpg%!iv*dZ-@Ls;^!iTX(0xJNmi)N4{{LYYu5xCPgFv}jNhX$aG- zV2{v1a1&TP%mvH>_gz)MBBcL2sLnhLM_}zp$=}7P!G3R z&?@y4;98;g2yXCAi(rAElm^vS#vPIUNdy-sRe?SZR;6nY}B4;Z(4^RT_5JoKIgV(pr^Y@Rf509N(e{%Tbx^N#q6b!FBJK6QX){(B@ z`qAP3^(W7Tk57!RKRr4+wtj2^o3uZ87CSHcZT)E9xgZ}}KRDXAe&S?k{Yap%{p7i9 z#nSB=@wCVCJ*W(IO6VvjNZ`*~(rn;w2 zMH{1sp)74x=+nCk5#Gu8jXKYJ?Gx^=4ix~(>GIBjcq+w-=6 z_HeqjYtEH!d?>1(+Hu|HiZ{l4Uwh__mc*%K(+}2AWX58PyI$>@*LKW%JJQ;YXjjJN zo;r|mx~BGjX0$CBD^teGgez^Vnd<)3P@O4nj#+M$S6mvI9!WMPduLoT-LuavY~GjJ zye~eIF5e%u+^Ao(P~VxV@0{&O*KeNJ)?^x1FEn(d8aieh(+!*EwY4`4mGcH)rekx; z*gkKpPSv(&Ow|jfhLoux+3~TdCDXBEp<{olWB*5u>5fAm_089|r;OD%q}X)FLGeSx z->}-_>UeiTJ-uh%QZu#dx_14nHl^MCCFj%WKQ&u&8vOnOzdHPWq30~toPm5s@|nnI z=IrjAg?v_uvysovS*vmm@;NzcMb1S&H|MR-dC2GGYS!k;$XCwQZOB!Suaa|kb5-Q4 zrd(>sS4+M+^3{{CfqXvlHIlE1v)Xgbde z@;%5oy}5_T_b}(F%{@ZCZD{BhkHVMRrQ)o%=ty+iyte8lK$ROP?Dq)R@kimCy}(~D zaf0bWJN9-&EPX_LD4-s}q~vRh_)S1_r}RI-0s8M07=_;kF*M~?#=`m=>R$m>pT)pvZWx#L^y9p8ee6AXe$Fb9C)0kg?tP7FT)t|(Xq8`}?! 
zJ#0f`+u8Z`M~?@6{e9tqRINiq&_w|nk*I_>x3Lp0Yk`K8M2K$s~^G6Hxy!f1Ah?FA%fM#3qSuDqOu-^)tP?+Z${~>pTJ=& zv(Y_os7p4!v+~NynZ_%tXH{>lzizM0*dLke`{~e!Ls$DgJTdodj#C{_?N#B&woi3a zqj8vXoW^9%89AeEDuM;|l406_h02l$r<~1E-DhQMGiJ-hwXwCYwcp8Ob6xbu{ApW# z#_qxLo2gx&@ziCi*JNsa8DH0}DqFe!LJxL`Z($25OjrJK)O~;mOX(rxh|r~*gkNL2 zLZXFD8*Wl{gnL)rleO}}&~RW9WY)xFnBR_6z^t^?ks->*C-#`};nBXa{y^V0{uxC6 zHvVo>=T31mn`Z5)HIH5mr`8<&3Z)yR(pky;H<72TSXLNw{<~s{$)hu?Q>~llj;30> zze0(sGs_fpiHfqZ#w9n+Xs$dkyDQba^{bTBzf4I9SxMgHz?G`mmQ>>wR#KZb>ppVy z@!sA3-rYy`?|K5L-s4Xk&06<$A1TCS%@lfS_x`>6j%JPI-?6{@NLIBA=;~4Y?aG?F zcOKpUWcShikN0FPz!7^NJK`sNk{?FNJT2P%2ss4y19D`Z2zdrIAH#Qv3l|{j1i{mi zCr~8#yt0(U5m@pBRD!KQXoAcXEF;J-QZD<7P}3z3_}yl~GN=|T0X1B&U?n0dpn{(* zngPvw`L7}svYe<5OCF&jpnYE_)!+neV@sZJA*@8&O5}BT}iu z1e16`J~~2wAS+o8yJ8Yfs0^sz*Gpq9Sk7a#BQ>&bl-g|&%!8_cL4vVCm0(5KSb(ou z+4oAVtrcn%TU&e2tu+Ep3$2xvN*uK%k5C>UsYLEOm>smX3zb5R7!Gh^46%^{CMmy2 zoh(14o~sk;*bKY9|LVsdl6`wub-6#~2Gp{0iSAtT2=#*LJmMs1VWZt3`$lQBjmwO- zPxigiX#0c)#nJYCz0nrje3z_brS(7~lu>Ji^7qXWge%qLE?2Wg!Lw|M+WT5!6XtKZ zQ2CMqoC*y~U>9rsxCJn+5FcqqUs;9bml_w(0D|6emCzh%LCdT{%S%m5-lOj#EwVmV zf)3SPuYs+yoRmhZ_5Q5hkrjeHU}20|9eh^u@wGyWu!7nvs{?VpmOR3W!puP0NXvK~ zae_ml0X-=2?e? 
z*}zXfjQ9c?Z110#91ryI=WuKi92jAU@!4S1hXk9GULp_(cl=J`Gn79H8>8+>Ol1IK@@n35A!dfl(tn_)mg^) z%&N}u1JtA;a{S~x4JWG|3-*m?wR|WXz^Tv4kjRT0qxa+FOptSuoKxf|AdFefus@vD zO!$$(7&#~XXS14qIJ%QKmkMP~?1#vTtiIpR%G00B{~}p`AD`9r`$34(YgQ4w7Aw^6 zKSNpdXAMZ`Ki-FO`u%9^N&gV};GFcI3jzL@kavrTJ$XnZV`><;&d*M~!~-`vWfRYY z__vYWckvhcYYfyBcWWDGv0YpjTbEF!&DB#oKQmh|w#C}w-Ep3UGiF;{6{8l#zIU!EvuysItm zJbdNhh2{rS%@3xl9{On4C&t5+cja_tqB#+Gt2;4tdF_m5X7ERc=d`m=y*rUw^~hW( z?b?Z)obF5dX?^^3Qa}CD%;9U!^|R>1jWOM43f~_|yE;mK64hmF_V2kbRZLeT%95VT zHECCC%GNry=QEpQ!B(HL)hEK}wWAR;J+VPEvg z_>-5uIsMJV={JST!a~EARKu2ZS=UF}PqdHy!r;1YcU~Nd4aNHs&C^54o!9KE7VK+M z_BHSC_~C(f52Wo~G+d6mNCn=evnHqE%(m$1f8Lky#)o1rrA+n76~F76*)>yfW&Ov- zjvF>-e9QM66Gsy(ryot(nsNqCX9WYab$xVYd}l)Y{l0`@`q0N_-|Sw3KS##yDn_U5 zzNiLZV6ojl7|$47@uxmE)F#a{gCF}gvGQWWZ#2ER`ts`Jksqv2+1p~8jL8!}_p!-` zx8K`w>A>`XMCiMZr|ivNp*Hy;ITCNav|@Ti!b6a@P_-^qwJz;$pF90C?Vfw2?)j%^ zP25gSNsh%|(c5Rqxi_8+|2%!Oot%<+|24gRft-8HyPv+dxS~Q{&+J3{Nx@u4sLJ<+=7`fAV%GCf_hPyq~zr9Rf2;zLc&HK z2W$#)`bBYi8E&G?$*@S6cf=!j1oH@SGUWUZCESP;JPdF5f;QDi7|Dsk@jTj}e`J_D zXt$-0gv;Tm-oZa4l*#IJ=skNZpZJ`lajg>IJ zvVf|{r@`nF?JnmMDV1gr+#*dRjf+Z-6FZmoA*d%W9m+G>SS{6V5eG@BW)(sOqm6H` zU*-5FS*uEv7q~Aa1?12vpGEmbf?B$r8m^MnPO1Nl`Tnaa_8&M!a!OPQwlorKVT1{F z#`87c83GE+mZ-h2CDs5MjY8Q=3=V`UMmd7k6@3v1N$@(M_N8hTuM%n_b*Q0EsC%h~ zU~Lg6MX7gGQeE%Ue6JU*0Yjbwv<8fM3b0P7r~1jpSBa}rFRdDs7J@5v$jg+c0GHzk zE%{bfk`g4T^R2F1Jnunwne%iPbslIf0Y6#Y1f5^;--4e%Lk^wK@ey)fB!|&67Dhbr zQxtK5oZlelWpchl&Oak3N=}TNSIJ=?Ht?^JpUHr{PCf-EPc-HvfV0`!;4E2$u&J6D}F8F6xP%id9@VG}T=?0Lmj{bOQ5hoZg!<)!-7XLinpt~nlBaBNFCw*4Xh)AJvmPdg5zj0Z$wB~KdECN(KnbMnYc=o90{lC(xX zF{(RVop|z^v86~hgpyBQ7G~~<^2@1&P;zIQkuS;wMM>&F!u*fMcgBt-%CBjC_Y=|o zbluiyDE?$jNSyxI-a0pMb z!*NgSsrW!r^D}J=L+;qL%j(Yrhlh@bA?!Ka2a%+{f$;F@zVPtGxH!@L0NShx3<5us zH%8L6t0>2I(|+kbIP z{03Fw0*Lfz#*vj5m_ksnO`6`~nY5>fFgAu9YI;FB)QiqP{IJBZd)az^L8 zp(az+_-6g(`lvl)v0m(obzRs+5|0^!gWfqRF1{Fh5yFOtljR>fR?Zt&&O26S47Pb- zC62O-LQF`i65)>>&GW|Qc}H``V3{{mWE`H0=VRx=**yBOqjBEYIPYk@VJn}v)k9oz 
z_d?62RLiE>!*j#eTJ`}I5(_ex&l{VPFV4AsTJd4U$1AsgR#AKDThren&giV_C&mwq zSMAp-9*x_2yBEVJD|+4sS|PmGV;u(^o8X1ZnO ztged>#U4ti;^DV9zw^+Qhh|UxRL^yjHTwJ; z{+Yw+y7g((1_)>xjM2_-pT2Idp07Eaw)f5(d%@;4nxZ?ueKuorMAg@g&2JwCU$ysN zJd^B688?2(xlQIz?T(xVzrVn*4!@|b!cu`!RTO zW9NXJ*#fqLh(>q;mn0mew53K&h$m!Y$s=tVCukZjc_8Sbn3pJgB`x?7(lELAoE5cF zmgEe%$qU;Aol<)QM)ZY5q(=7@%2&*FN%=aIZ$%Fjr@#0U9xhsMf(rw?f*&+ zE#z7ASwG9mk26G88m-+~m7g8zm6Uh1OGvb+CC2|teDpf4#|Ai9kIOl?hmJd@4%{%B zE?Q&O*X#?%`joLg!6!T3>AKQ2(|P5gv~fMw%;nxMTD+@2cC48a?*Ag6!U!Xve&E4>px&7g`@kwLUPnI^DW^zHtvCMV2%=rtsv^9PZ}dBkC2b4?*r2 z5Y=O9nKG+0pblyRnxHnI4eA2Apgy1v8kCNw^9C9Wc&os$%fB^3sLdYbAaZ1S*>=pF zSBrE7%=k({*d|~p5pOLKFRDb^P+L3xjLNA7tpNvABCS#Gtm>Ez=$t$2+C$2QkRI&8 zn!1-16(LV}d^ivYj{6Bs$-47M3(mcR{=PxzWdudUgP|aV-iW~80^b1fcch2>v;y8d z21xvE71m(LNe5JKs3V3V3p@sN_&2f2N-Hi9Ppq$gtl8oNRF(4=)={hyoF8jK{vA+$ ze_@5js;ofv1k|Thd}VR9VG`;J>3+cdj#^qth#=(Ghb0){l*YVBJSb^O393OAj8_=m z1|lxyin`LC#GY{+ZJa0&3Q{ZtxwHXJ+Ck70%4u}+G+?~xd-xwCi#E*?#@hMHC&r7q zRYm&zK_6dOjoNIGKA6P%l{NZB?I%C~zaumLUy(!0l89;c!k1N#k7rHafLy~^Fbu99 zTkZa5Ni<8{;zu00zr*V1=33XxK9Q<=C~CiMw8g;ds=Nlh zlbRJ#?KNX%rfC!XHe@{IQSIx-Omz!>95*bEi;u=0op0KfvTVEVuAOh_n$!N&@u6eB zYu|j`{k9v2SWa=o(wPw+gF>8?K(xMDpDtH zdj8)aN7+WWxSb(tr#QeOH`D+f1&I0w&R5+&!uePKh6eaydGN+i!{fyMnwS{nY14oS#fnZ^}g*xWkv4U6;Q!?A}! zb?D5rk-J#PKp`XNsl2pydTr91a<_s6L6n(D3k&(ITulQ=5cuqz*>SNpR{OUOgha@v z24+nVTz|5=_t5U1ds1xRefXH)PR=*sC~6h_J9M6;#}Jr(IMu%WBW2 zH9(pjO3<-NQL!D1DT`QHh}}eK)uM*tqwr_dJ9a{=-T>{E!Qr7$Rx=Vj$LQ`hRn`@l z0ACZDq)_L9<~tGl8z`er_Fd1;0T(>dLz@ws-G2$6e0D3%>!Liq?3_kn{^KkX?Wbp% zbOm*+h$rHP$~sQFiBraYF*hS+5iicp9W0K1DXtR}~D(*IEG(l!Q3vgDQgOsvR!MLF0C&h0jt&x`}wy^Xz zlt4&bsdu!8m3pTJQMmXW(+^=%3fP@WzSGHiQvun-j+f7s#x7r56Ep(AB~@xKVr@X& zwFO-PxpagQRMg0&f+KRx0&4--3m))cZ9{6=EU*_|6+A_4hehYoUq83{B1c@-X9{Et zh-$lpno4tW7Z+DQhlLR)4~nxyslTC0%CvlNIzceObbXZIszI>RduacFH$A8h7z#b^ zTc!ln*Dj%PnG)1ryM(4?O2CRGK^LLR;l4j<~LfD+=-xTG`B9 z0a>D=?u%UJTq);zJ)rHmMJ2=bBYX_~S^?XrX)(CR>4<+rIHerp_Cv07nr&sDl-=g}g71qk? 
zvuxi8de%ehgnC)4m1?zaNv-PR>esupoKPp!Qtubrd>(7nqBT)z{lV!Lw7qCviTZ`4 zYpAsTOg~S?Bu`A{J^s(LQH5Vt7F<1;xXC+EXotLQQRwV+BV2^(iy9k1}v-U z&T6_(jfmz>d=z6_WL`@=Vk=WQs^1qMH;cP!mc7=cNA>IV1cP%2(U(9{~vp)d{?sp!Z{N z5IZ!^q>v~BF$~QWO3PF%PEm?eOwj_lpQPMF)Dnr&a<{)C#lBJO7czpv41U4T8AzXz zkubAzChA{+wG`&@ezKo@;2jZjnkcl;|9}M(bo|3ydF7?&rk_i8q{~|0=HH22iNMmy z%C0|j|FrJIx~r$s5A2TyqMoUpI0cNKik(ZeBwd$Ri4pF2KO&lwYBAzdgYz@1{o>x( z-uPq5olx%IoARznTh}gF*Qc!Of2^iszqIv%>$jzu=}URnr>z?ntQ%9-jX&N&r+#Vc zBRL&scR}jL%-I|)*8@pU%G;8*wk}v#rL3#ocmJ^R-O9AJQ_4|%4@C*@s9loAJ~t4~klb&M?T@K$c*+y5>8`}iWaH(%$-c`6qer5e8wPW<_rlqD zGpq}&N*QpX__3iOW3*=wKk)7I@ZEsqS2HQ8KqP5hq9bLhO=)XI8L+P!-I+4grL`cs_@T82jiBpLH&T@C5PA`J#-Uu<8z zKhYF_`t9C#j$Ju+&AMS~_jQN&d%a{(CviBb{_gV&j+UqfniLE6hLpVlOrW?b-Vr^W zP$jxA>k_Bp=P~|GSN6?>W_#Z~n`-Hr^UU!dR?au?xZ3-X`|7@r22nAGnMqVRT>n@y zfrgO7mvZ>z&FM&-y1cp2oPpUL?;aQ1l4|c>+#IJlszpD4{Y(I%4tIP`zyt{PsHJf0(sAg-bV(Z+-RK>Qd zN9MnA_?n|Ps{Jgl9y2hr;+nmK4IfjN5yL;;Ij5NmT;24qU01t497}CInBH_KWj|Cb zDm=62nqy;Bdp%#Y6u#l~elL{R^p+~QDQUR;$jqix&H8l3hLm#yg!-6h;_2Aw_&{QF z+R>OYHX_3%^~gBf-#e1cUe?=~j)UXG-y)~3tWr5x*Kbt%W@s5WD?L2-yQp|pu$ z+SQUWw!Es#?No8j*4!Qyr!z)3zIrO&5!Hrp>dVx!w=X&NtF@;jgaxXiss%51C8SZt;!rqlkiL{5q<1LizO{^)nH8d{2xEl7;v$hKV1SChA|s5kJoAh&SCIrS zGQwoFRzTDj8Ap+`6qVB!NN2F$s`6xWH7sc|;su0DfdwEkA5jZHC9GtEEi0yB#PRLv zL?T+uC2wkFY071pRmAcW^YOB8izNB&EFOg#Gt(|2bEn8olariEEG^Zqp-edes#K=> z*HEUMO{N6fbpf4>(I;nlDW$Golv>WdQcB&hD0LH9pmzl=Y>e4RN1BD@%4mjh5*6&9 z<;mA2ggUWC8o>xIUG7pQk5^$-B!vP+3H-X6dY&>Yee>x ztStGL)nc9HE{-Q^8Fa2-?rNc?$c|jZyixiV%4f1~*DU@Pra$t`a>kTv0sB-QZjGy6 z-=*ag*(gQ63FD>wE-K7ZK}-!RZ>oZyqKMxh=VfyKJvkT1`8GL!LC#KczC#Z2PGlNb z{|v#lwMuLg{&PzCm*muu<0I!kki#UBiOC?EgC%KWzQ`^qa7y?bz4$9~{+gV>A?HhS zZjy5g4lz20`TviC|Ct5Hz#~CiH~%Y&`+IV(kfQ;Af#=A1i-Ny|L#&dKF#rFOS9OB_ zPZY1B6fg=u_I%#$KA25AoX@kA5U58!K|^g%&1c4HZYIy&J*XSD`VS4iWRStp_FAcSU=|V zvrQ@UCXp0NnIFB{{E_;PR;A1bmQZp>;vLY23dNpF^rfsM&jb6tAn)|_b2wHOS4GVk z2z>LgN~qg4P45F^Ceb%<^kqzzXm8XA&dp7m!9v^+XTk6SBrl`?H1d_1H(6@#eQT&bFDuDM$OPE9K}+SvqHr%yoS5RLb)3 
z90qj9)Gp{9-0LTw+1&At?+qlH5`7|pet9%cqStRumv33HbwyPLz5}bLWBmdOcSx#` zj%_l=M5zQ2NQxi*enxv^7lGBOu-^5_@W7I(4EmV7z3+u*cC7Us zUF+Mo)_3r^7upNCu>zONkR>XaqddEz$sm4;L2<-zppoDU26O!3<=}8nR_mF?0DkBQ=9*94lnT z;-$KZA4-)iR`3N8g7U`MUJ${omt83%gS5rKs0d3tjz0GSne2v=;|n{EzCZ@PX=r3E zV*T(rp{PCLO*%iujDmt9(J1027li}KuYg(q+5OCH6J=%feU=#z961Xkms7N|c1I6Rk zBH3{WaiKCe1a!AOBA%YVl}Y);lCMAwKET=&dKw;xkZ;Q?&|Ekm-PyoUi&wW!Js#Z( zOIA}mZge~}-?8_au`Y39rs|^N-JEnGg z2K=x$?tZNX7dV(~(f)YTYqjLHzV2uDJS5#|Qwt)DHl~Q!O@`tUy)&vG8s9a}S+5y) z<4Op~)2@lFfsrv+qC3IQH2!eayQ}8bUNghYSB71n&#GzsvRqL{6vbB)tkTKF{XG>i zA@7hK*-mlEmZXqs+BCN-)wEqaBhh4yqV4d*dnMn&y-dfbo%@f#y!b$2H%BVMlTbfkZ(y3<{{xvw zWuyWYj%@6hg?{Ong``l+mOW;%zykO`oV@X(((dD7#B!3QW`S8JdJn@y&oWF!Io>UP zP>>FjvBbc5So#3ZW98hpmes-D5@ffSyfLms!?A{V3?cnmi9>~}yX3O6V7@MWfQ8n| zjz6mDE3)HF^0_$1VK^;(&PCF?a#p`mTk97|><*N;NP_H4wzoS(#!zP%cTXvKS`g zny(Q%^Z>>Ui88&SM5|J*d;x<@TCuT6XUIm5R*5AKaPxxI2?4VuEesD;Z(qZ;mOX>v=)O8wcqs6R;_Rl(}C(tHwH zl_)o&{XjmyfK#}G-P-=@LXO(7%e#y8pneIFkL8_Z%y9r_zLD8!G^?3wEu0>EJ zm!?%J4VK#$!R*&7Z8>S&)r+8L4UP;HpeV9dHb<4}Y`U*?UMDL}3G&x1s&hLec8R=a zkI*iSE|a5OD|q-nmgS~YckaH{T`qsBRGRv}mbO9GHl?0gzo@4=gbu}?0_;XQWjQI$ z;tfKl;w;|q^=7f)<^N1()hEBdV8_8|7Wr1#wb7T6jgVDmmr}~=N;KD!2f_)3xlZ{+ zHVK;q`v^#&$Y$A>O8rG}ypfg7L=y75MsWsiV(&HwOt{M+zY;~Z2y0}$x3h5Bopo2%&8pH(ol&oXu=jrxN1Qmi%-dVvg+npdm9EP@Ge=Uj8)iFZ zTjpGI+PTox-Ve{tjeNA@qs{ZZPo~XJ$&c0wg41Fii;oWbQ&wNf=!1Q${^Uwr3}o9oV^8C#s(1SAywwMzR**~Gm$ABW_Q?dUot#|Y`o3i;lO%?i zveOeE$lJyrO*xwu94k_e70J*$=dPSfJ34SwS2$&BnYCTBZ^LO?{(KA{UK*SpTqFit z4l&pTWS(6hDW9*3I=q{B>un@NgG;@UV%{GKMh>$|3uu_~(l)Z)4B_Aa=a zQ|@Nms)%FH4v6@Npbi3+jFl$nla(~|PEWSnlC9e3k42F>&s^fJm;rJSqoF-xyKx(iu)$}X)$ zcQnroUUP4b?u6lxH#~`>Kd731G)jWlOlje0V$+*lm%EaiNS$G!c`I%hOxHZ{(TY#3 zhpC|z)ncV_74Rc7jp>TDGpAA&8>0JexXVb{0Q%Z)SOV_Y1r6xrOH=h=_ubY+np-goN{iS)1;gaL2}{9EsdIL3>LJvf@+xDb@>?* z-}kgdcYWq5yYKx1)fm#(xI@!P-d;S?Zh3R%<(0|iciOJBrK{Jb?d$Gp-hJ`78WWE( zLD@FF$kY(_5|JzU*VtYzaYJP31W`+$w^Yif&7y&`Y1QxPpC#Ph5Rp!vHA=aNsUuc8 zX~v(OEY!+QqxD5RoS8avz^<$OG}+GMlC* 
zW}_5#P!Vro_Dyki55HvPlrF1xu#4`U%yy}Z**A4F`=%ad8`aA!q2fjYq%03e7tFf^ zH?G?CFpH#Bq8T?PhHw{X9^VWQ`KpK>VRq2t3kHMiC_*v?x8y)<7rKbq0A<)tn^$&U zJ;OHHX%#!;E25g~>6>Of?FSnJuV>rEnOjF*$Leu&1y&ZxrxrrR7U5joe&-j{>_Sef6OjRjURig2lsdjc3 z&cw8}iGJ94>$}pLcqXN7`;sfmUkZ+2b}2Z1VHwj3%b4Ud(pB3g@|o!(ZVUOWbUnC@ ze0GX+kPot0)|`ucZt{7^=cRAU$X8Amg;$WTlB;ORRgq7;6r6nGrQqZfF9j!`cqusf z#7n`+2iunRTr2rjkZ&dVR&f<|U$nuOTT5qSeR1q+>~ij3!?{u#gDumxQHwLi1Gs@* zn@t2wbPTk}HcjEQ=PZ2p|FlO&rGneA^nnx&Ad{Xu*rsvv|5!67Rm*N1o)3evtuv2DvM^XQ>CBSCf}PK}JzOGcs3E*g3H4YEq5Y42-ka-}4t z(sWBoE2qI9f?4qD`jqmRTwaCXI-;9 zU>9ai`@i(f8B!~^r_9@#g3d)nGa~<#dD%t2uNUmUO0Zz-!Gd+t4ZKdWJ0s)6GVc3w zEO#$h?zb_1pNM^H!Cs%T*C#@6p1XXGsgx|WjB&&1fl^HKcaKlo7aUC~M^kd+I}coW zAnjOpyEPB+0j!RA%6)%_ZXSZ zxYeWjs%lUhVNGM}ys`E^YF6EQmiK<*Vx$|~7M-{(x;0un>mg{~2-T@0X>;R(xg}+8 zNgjFUn^(S>Hm}FAnAY%m=fy3tE%D}fC^>n}vM!}vhwtw4E?mBgRy|tq7G0P@^aSlX zH*djt#r}%p73V9iuggn))gCdu>WG+M#m!dG8x3u1AL54t zFJOCN_YD5s6n7n3Wrtyz=ICW}+S#1!opH?^ez$Div3}mTUX&cRFWIzfARp~{ZSJK-Fnpi@ZcAnT8D`K)UyqZ+PzGwbVh@p!#Ny3(J05W(b}!zD zsi*jzsL(;`pm*`%e_|J5+#P=t0{yUkWG$b!G}2{2^$_Y`ea*NzfBBCa_N|qr`u|p| z(rqA1IyH$+DXp(4PG69#7sc!$Z(oRE!uG$cbAbQn1OIPo9~-KyJFnD&yB7LJ{h=`Q ze-=$u>U%a-+tJ8N0K$ljZ2~|yvCH1_C}czqH-GqSgjBdqzlTh zKtMT6st4K=al#Id`$g-Ewig{T>kVYDK?B7!C(1#~1QSSUlh~k$=S9nlt{0t9VS{ig zCZ(~5@jb!vWcnsPBm*12sByzs@$7E)xW?9{7|9J2We~@BnA6e$`A#XX6^lb*u1o*wk;xx_7 z&+sXmm~QXXf$PTVM8{0^Rr59D!KrS{Bm3(7Jh3ok6{eqji{Ndz-bKO7P`yrHYaL?FvoFa=;OjL3~2A;ZGI2P7lK-N>IqK(mBSi3iwuh(wP zKov$zi&Rj8saZ#99Y*Q0naSt~E|Gw4cSp1zRb*)$%(dme6ZL2Ck;iP(Pj%D9G}b7@ zAfP1_9g*?!&1kLk&;+Kxn_JNrbTP`5`%Wj5;Iv{)BLHo_Q$CrME;)KHUxu_tMmX6Q zs=N9^CDW0m1w(zOuW%nG?E*_4sTQz*dZ)Lz5ay2Waq5VLC_k0iLWIkpNCQ=VffIii zg-a8OO}XVk9ajIl@fA<(U3QZbgUUABUC)#6_^C_6wD8>*)7BLrt(Js8 zD{IO_q1keAZEWpp?Fi;QHrGXe%%8T^lZID$^Fn!hs=Pg2z9A3d9&@?=LJy|Hw*av? 
zdXj-E)<$9-@hjnEP2uBo$JNAWU_j0bC+MXO=H~wcSB@3kpIZyCEm*M)urp`e9*Q0l zA4MDd(Ed1&*Am1labdzz28uVGNMAzVyI7KZzUIG%26+Cxg^M_7dm>yff{UVX5?}mM z-cQn(9JVx;nre!z%?Mr_lXMsCjK2oWYVg-Ii}bzkoFmE6-&>9zSsej?OCOw($;?lE zjb4P=CO#-DE01N%%flIpS;FljWw?{wI0D}HvgdfXi&WNTkuP5kneBph#n-5H1XIP6)i)gYgd4TTJDD?%_^717~5j;Uy!jea-3xf%o>3A47Qo84z&@#q@ zY|}#owJ#K8D{9t&OK^q*!BEyPF*q0shC^9PKZuqQ@oqcZ2+s##|Ci>YC=N_*WfH%C z0S}(sqgaKhJbsFt-yr8@a=t?jV@Wa&<51p6a@LG%*ZiZnOs$X4szWFFB+3dAX0~YaSP~HT z5)WZgWE4QgcI5qNQyQ~6G!GB6BLtDVNGt-4Vn@Tbad?6at@tV~t4&FoB7t)Csf z8oK5>xZvtZxq3*F??CJTWMtjus2)5m@GSShxU4JDHT`(X-W=6{3WRj&(b)N^eYm19 zu|MT%1qy4jT&#rMpVf&0vg>l?z>k|}2mWx!PxpMd=jztU`&w6}fk^w07a7-r4KgM9)2h#yxb^N^e77F`lX zvw)_}%_QEuVt&O^>dw;uOr%tE5j{lY^3xh$6&_qZp(r{RP!ncVSh7n9*ab9{>Q%ae zypUtj6VTGz2n59Lg!H{?j+q6j3R{`vrs)`bZOyAB|#HVO_H5MLW1z9aBa0}UER#yhh(`#w5T!kpzoRX_I8*k zS-tBy_Ms1-;FdjKH{R@f&WA6b=~(N-e;6_8kdly*iU)n6iNWy6(aDf#K&5awUp|`x zdM8U*h!YE;+y3&Yv9C|^<33-ZK5KpX3gxBnu#0ozg<2T8#ftGn2ryRK3V427Ek_z- zfVw5DG!;Yf)ji_E+zp@x1-=76qJUgG@(2<9f@{WBMUiJ#J4m-#`lSVOF~4jxY7nAf zzMWro7#C3E8fuYKrWS9+rhTBzDDG<7)5F0tS>5>LSa4J%ym#ZRxF^xd%S>OuJ_fVb z!FHV&D3)evam&e?C-`A(0CZt&R=sZ*PdgK(RmAeDOu)DV3{ab- z|8@v?XN)c>=)P{LPCS^ltjM@365)AQ^Dir`;Ca<@CP!3t5%rumu8y1GB)Z8t|FLm3 zt3{q6hI{&YdV2P>8Q>m;7q<#VsOvt4!(Qmb+xzzH8T!q?`u*hJ{;Sz-xW^Ca06xSw zY*Y2L>5#!5y55gJjJ_K}ES^o<+a6}anKg=5U&hDze~tL8>-6a8SUWJ8@Wi-(GCVvQ z5^)l2uw1r`y=1ZNV}0S{`2+;t3=X8LA_BqDa32mGi64{I!+Z{G`Y`@Y)*%ijgfb_2 z+-4o-iP)gKECRu^jI)~6k9N`}*Ni~TswYm0m=8mBh})CZiIpPcGwa-O)PHFIk)!_Z zCyqXT^ogF`S$EG9kNKbK?)}D*LLC1olCpaW`33sOT(tF*)rBWdjt2Q2O7R4Hc|17D z<1)|T0i3U%7!GGm0~6!8+w<&6o`0KCy-!YtoHla)fSi8`C#!#q-Id7hFU%VICc_iq z$?;%Tzmr{)$L@V#)?mb`^E3Lcf+ooZav1lDILVBA#Yl2SEi>eX!9191o9LTu$b_gvS{xfMU>_Whh&^>eQNPr1sUb4@?zs{WnFH_tggU9tAN&;6)VQgBoiSKn5iSTh4FzuG13C%-g)rLKI|BT7ZokC=E7fSE&Q^^QmEKg3Kg45A+e?8D>jruVmry_riyyV zhl}bLHMoo-%hBG2ij}E~l`}1=iuN4Ut7MDl9gZ8-o~iN$l{=+!|DyijrXYxT=~rt(UF2W-;7CY9+WtGuJAHS4iKl6vM0NrEHc)Ga1>UVSf(+OATyqT}*ys$juNsR>mqSS{6{h6QV-2Gp@&y;T1O 
z7W7HgZ)CwH>Dy)&Y>_J8NY7mLFFehFk3u~uP^cyaWVK|$VkKEntRwSU z^VPz2toFv7jm6nXPTj%0POhRR=VD$r8grACd+U&Doywe0ziGN`y2Zhp1I|$d?)bq} z@;HNVv9Z2DigtD`&qq^dQ#|4*u$qX%#YyUB6U7PG~} ziQf2$nf+H+M{S=NA43ajTIgku>PA&7F0X+n?NL3LtMT@>KBZEX%bLo9#g?*Qv7szjY$ppAo5_Mcsa1^>6dPqi+4)9USt1xoYG|fq z8A~*lUXx=L08A+EEFPBj_@Xt-+x02_mu>&T zprutw1C>GCJA5xiXy=FC-=`WVBLMAx|tw!k1A8u^d%+#(#)B< zz;QrsSksrn=OH!0XRWjl)`{VIDcm52eNwnl3^z&e*DQuxq;RVkULl27ieYK-&b5i* z)%o=>w?+&@la#@4Zk-rzhuR|T_+PGPVR7hbvA^k{Uh#>kj%%?mX7GP?`xlP2vb`Ao z!dSzPq0X<11_~ATVhWk_SV%m88>Vs=in1<(3>!sZd>4&66^HIr99kE>d;YFgq5r7U zORJE(hL)nj$eQvao9IpQP;i_b{^$U$MNgsjSc~xXXxLFR^4#d(4ap}%hD~% z6BTC5lFNkd^4>n|!)#A4?tfyqI9bIo4rRnWzeWrfr>ht)&Q~#9T$aReaam$vaTv=` zrTh%86q6m-aBJ3M=6>N>rLq-fEc}H*gdeSyuty3N2a!V3Am%I-vPvm!6tWW{=^&qT z$soEY5-6niCKa4Ju0l1w@NAU<68wcaz%Q!q8;2KIALbheBd~$R{G1 zuw>ZGP+~w2D)RelYn*`>glPXmyh0Xw-BH{nsGN3 zwokH#XfAA^d8G3p!W_^9n@AF{pjBEWY%J)Opwq*GUTMuKV?mrii}+{-3sy=PSQP~c z{;X_+%z4@CU23j*C3K846*XUA!WW?s{z8w#kCtGe$1*|?C?a1Fbg?$%OcX_3PD==Z zptCGT8KJxNmRFv=@a!!t8Ma^Dny3QO@T)JM*KoEYs$bm<(+d5oUw%W6Is2NYpmDTZ9Hyre%7$&xXb$g19-#zlmGw# literal 0 HcmV?d00001 diff --git a/model_executor/layers/fla/ops/__pycache__/l2norm.cpython-312.pyc b/model_executor/layers/fla/ops/__pycache__/l2norm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0772990aa4967c3d1f1f12b9231e374a51d77687 GIT binary patch literal 6366 zcmbtYX>1$E6`ox#m*jGn5=9-B%PUWKou#}cgWKH(x8@%7%^E0CrTLZwjq+1lcsIRW%b>R#fPYhz z+P(vWcBwr$V4r1qc}mvOJCcOQv_DP)tW;h?0*=SX(fOFwD@wQf7|FG{`aqctNjs!U znHp|*^em(**(p^+-~JMP)U*X5A1l?oVIFWzQC^xd&VB-NC>c_mKzzVGo%1PEhvQs3BL8e!}eD#djbEdcJ<%|7d@5L^$@5L*Zbn1|seL(c~ z0a4-y3}o0$GL~e(2y#=pEi@JsCpb$pnQjrjAr0HI-{M>?RE7Tgq^4Ol zE7_q$EinadZIZcYRtc2RY#Kwj%u|`R!_GKZiz91A?M1W6(Qiv2$Lt>Mnq`U2?~y`N z$ZhGvA5{@V>fwoW>2b&{P zdPTM<`X(fCM8+XG(2Ac0vSk0t2av_k!=1=3%r?cFl2nGPi=AF&%rmZ;(Rj`Ei?OaP zmHRNRGv1lFoTM}M`dH7Zy)3aeek|6r#__WU;|CLa6Vlw%smjHxce&Pk+>tbQx?9AmvW&G8IlB)cKY1{z5k=tyXB?bdSm}uIjwSyt|>vUf?7`#&o 
zi=nV8DiPW1_XPaYvX|gWK9RCv;X6QizJ!Gr+eQfs23uq_vyaxy5)HW(wAQc`D6^Uc z?l04=oCQa1MH*kqs%;XpRJfP_4{PR%=#;W%E^C2+9c;mx?OO4yQr2t-lmijwtr`3d z^c`A4KDriK4cq36=$4Wc^*>8WC|ao$HV7JDN~@V@no0 z=&afiuyNA{=HxI%cMOKdLXrnhiQa3ncLalbf!uZjV=$42?g)BDWN|PM_Kt`V72|qn z#5+m0i)7%KAiRfU4Up+(Nu7$wIwuF@pd3;~b;|9)dr9|GB-=-_W-^fK&mvG0X(2F# z_mgH*M4ZHpq`GCo*5vpiwb+^(Up6fd-nQP^yBNKLKW@8o?N@Audl5`IhZ6fzzPnu0J?>DNJM=-t zhjq8>GTe#B-~%VTL(g00EJ<_9v|!Ja?|<*|B9nS0UEY-Dnv5mC3cR5afi|Agtq&k;#beL8!o`{IkIFL;Y>k~k3tTAcb)XrY;nyA5vg zA`LEK3vlljEPb0L0pX}cvuGy66LGHa1fL>olzXU#(RC3bE26V^N) zl(Me>(U&O3k^h1|;3Y9bbe`T8Z=eMF6iKrb9#}Oy_+&XdmI|wGm29*apGw3z8ow3c zqQxjw0z<$2g);u=`s+h7KOBOwfU;@{HIez zzed*0n7AYcWVjtV^{P(2W?+L(UBaDYrRO2jEt6rNPmxvKg2NNw=zv2qX}Cu$_J<^S zQl}>somEE1Jg_8}{fgl5;FB=>v54`gchQE=!Gu0C=^bl*`v8;l2&T^YyjRCU^M zF!sW#qv~DyrftEN+MD`XX2)|sy1dLRzLMT?H0?MFy}T>oiC>7FSrf`;UyHw%7)|bd zOHB5^*P49Aa1PdN_Su?vO~RS*B}Y>o3*SyxKa+MoyZCa(cKDvHJ#B0M0Dm}rdpcw5 zj&+ko#JHHel6q?4mDKp+Xlf)Qw8eVYoUZxmxoY6Ux?oLJEwam0W=Gp{SH{`?lhHf- zmnYKBug1<3m*s5yY+@j_FI{yY!#4s~1%IS5*_xbKIG(O(ObZ7WD;DMV>y}^sEpyy} zkStIWjMn+^m>L;VMFY;gIeml(FBPmQ%4kd}T-c0;B-6;#pbhkS^!nrY zWeC6jZ7*K>O?%WVL0mNq@xu0WE7vd$QjD@HaR`kBcnj|)*)&`VA5}P0QI;9qDUU^osM6D{hpIr$#-B(FUVTDR3Ad=&6Y_hI5MfeS}&VhGnH|F{t|$`E=Tg(iiZRzoc78ZkrI zFHP^*bdZ`)fEVWpC?}gqPn3)&a{osRt?k%-)4pI&b*0qBo*zsDs;tx;St&naz=E4G zTsDKiX4vR9WxcsV9O*8D5c8S`ygz2#FHh)ZzLo{+H^_K9see`PGB`h)TrjKdg{z~6vsOt9MU^NhIA&2J(X zJ`Y_wHz^vdh!7(`MaB{?b;qTySH)g8j>f-pNAa=E9 zY#p)gRf~0|?e+0BN8L*O#f;<9iuKZ;?0Z(7t{L{eweH>Cn->-?fH&Np+1a*IeuV7n-w=aN?Niher%@^<XlVA;v(p5zZ_D+ zST`H*D_PVassI*AP}{Ih!v+r>%Ib{u1U!0-1>}?X8uSxHP<{ycx{0EwjY>pue?EA)zl?BlWjLU7dlrC9KUllQ{R=T>0UvVaKN-PR()g7^*x`R>fbQY0{gpD z{mY#*?5q$M5`(F8Gs17J%`2z+KPP=KoD*iu4{c_eUgr^Qy-|0)Zk={f2iL6#{186Q ze_?~#h6&O9hCoI+Nau3}Dxb5c?wN;XMqodyb zX+Y^*yVQYox}17}0-mhg4eh$NE*Mouj;ZxB zqgtNl(MhU=$;J&QS+2t9ZY*rbMx9usf6TwnuOlel8T@(RXMrzB(b(8J4W;!ubdh?A z`rOn(?OwI;22rD2w`bk>wjcj+-e9JZwfG_x4A1XDY3~ KsM>d?mirHAZ1u4K literal 0 HcmV?d00001 diff --git 
a/model_executor/layers/fla/ops/__pycache__/layernorm_guard.cpython-312.pyc b/model_executor/layers/fla/ops/__pycache__/layernorm_guard.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6e324cc067d39a1d058a45f82163acb48803546a GIT binary patch literal 15289 zcmdUWX>c3YnP4~202((yg7*cUA|#QttXsBZQ>HFULL!NYZN?cCgl!9UJE@&NfFLh1U78xFqOFvFY!z)On^gA4 z?)SY$10W6BPR2WxeG*>3<9pZlzV}`4@t>?#69pl}To2xShN6Cj88uWF5>Gm`6tzTg z6i0`sFg-%knAU_eBO3D6hM5tjP^Jqr0S!%s^&HMeP{ z93Xp35Ne9E@aFODYOeIm|Y0kxK z#`{Y%S5sUKUq8xlwWU@g4O|^x|B3K=uKpS`(pZM+om>O-zfbM8^yM12C~w-LyqRz2 zTDGWb-J-k==IiELxOTo}w2JGvrW>-0m$v20@*XG@eF)`op@;37!sy@ z0e|2U@5P+J`vt)t8RI<~SrZG%`j`-mMI(@70$lKlY>)6WG2cXVniqW0QQxy4AR$9T zb1*V774wZv`2{Y(mQBzAXP6 z2+dLR8p$AZ!5E*^7bR(-FYPD6L}*DnuLZb9NiWt*R7|JNYFw|TN?%E<9_KlLo2MjQ z{EHkNGpZ2dgl(xgtazM za0xP#LQK|+lW=%=nF;g$h`_-3Z~&toi%ieR`e{Bmb}1$^7lVFLrsrfWKOxGdF(EoN z;S+;%yzGcXg|P1;KN=Ny-~7%fjzp z8;FG`1}1_NeKEf<#>e_*ruH8^aCtvJ5DH!#n3#=Sibe*mghJtgaFpXiK7NJ|Ou-Tc zLjGA^5C=v>{({%7&y`KGo^(~iWRg+s|hZ#{q46(9O%gEeolBH;B?2|C}Qj^7F(%Jqg)?Jl$lD!n?_D~ycql!2V`h2g`*Ulp1KHMr zyGQRcnbv_^>#<*Gv#qCd)n~HyGYKZY^O?-f=M$5uJ;|ALPuA{U8G2yry}#>z?B@r5 zF`9j8IH`SH)s!C0wC%mu_^@gyVJP)KedBA(U(2-(thMgVw(h;xai7by?#;EH{zV|$ zdM;OeK5IXpU>@1((uS3;Wm~5ANY3`c{hqAtWI~%a+cV~-d{s@FS=^p@Nm){ZC(oMS znq0V&ypr-?pUtxM2xwc_m)w`?zJ55v)IR}aQh|a_foTfa1Yx_l{m6DXx&cxgHAMq~ zYrHaj^cHoCmKoS-QN)&Rp$lS}eLmz5U*!Bpgq@Im0e&JfsyMacS_#~2xZ9O&-zT7U z^9K$$B6y&YfLcx2aAXh?y@(acNNH|m_i}JGmTlclMuHQ_o*ca3J9=vHf~>>Th)jEB z!_hNecEGQT>h7R#}wQ0|hKPIE`6(gN%+Y6b0*OEO_EQVrSL4yi^`cB-?iH{=qVf~1o=2!7h7 z&XRUOm!Axn*XAI#!0OsJ(%>BEKb*FxUv!m?TWJ+tQWws*!dki``^wsGG5Y5-5~rPS zFB{9Xd7XgIZpni9QFm&-XA`Ek6*1jg)>9=+>!oc3(>+peIi{sr=&^K4J&;$<()P0c zDxDyW?iVpYxbIf!^yX(~{J)iN1hZ=KCTRfUZ~rv#*u$n>i+{hCa_BB19)KqrIDoCTd( zX3nFMYXO}%$nm}zpAelc=4LnK=17h{E9}B9P!$!P#b`H1MfE}0gW0_hfn6M(_KBbt z$7CJRod5Q{tm-WoB|OZr=A|7oYhO#MQhrh zbGTQIWF5QWryf~s3ulvO)9pD+bNtx4p0d=uEiBD0&aOCDyt%sFS@Z7r&?B>T;b8J$ 
z>Rj58GrQwM09IZ1?a6dkYI?D6WoNdkKYlu2Rhx=^?;I)VUTweW$yV)%p9V;qJ=K)- zrQgU}w}UZZwWVC&_NQH`vBj2^=B%?XYwf#h&RU;~9|u^6>)V5=Sn5P_K0OJ#e*9$K zUY#2HUVC|oxjJ3 z4VG+8civf*8vb5;dN{pp@vAGZXKQw4oxARy&N^R6=yCmExulM-xK=MbwCq~5?8#d8 z{KNSlkNj{XXL%8f7S@ia@H;J8YipKmP0$|#rs^Kqor*D#_CEy6WLMVS zm0;elYfeuB{`O?+wk0fiSABZd;>m>Z5o=mtlWfYBwq%`ME8N|N_t@tiLkBZp*f44T zO4ko2SG!iGZ}y=vRGqW$Mcc%&Fq#}q4X1||zp^s<(B@ek%-Z_En;kvR8cl=b|>P!tLUyUEfRpAcMsZ)#AmE(6iAF}(_*n?U2;7>b# zw*4pDbL=tP1-pvo!L)Jl?5h55;GyNvn&oiTa`>kge>VP;@tg%LNrKZ2o0WBTtZ;w7 zYn5Bwe>0qQ>`v<5cQ}C{oav!--QwA-qdlSf<&ND69ZaC>dvt2sqAANZrq88gKRB13 zULJYCZo5}=-+6z}e{CU4OIBcpzNKr&l{v7RWX(MpriY+YI1SK&a$~S)GUB-9CZkj|59c*8 z#4-sbHx*jr;Kl;1s>*s%iPXlhR}v&Gr%~(C8438*dOyGke84~isKSK^=xi0Rk*o~| zBT$O-R8YaHOO|aYt$ct$-^EZga9P9-kk2c8ygJ|y1yCfGfd~SC+<~8nLNQJ~vNxx{ zoU?buPd=*YdS`gGef8Yk`dhE(x)0@Qo{M`Q)zqh3SDm?<9dU2o+>l`!6r`bC6B;pU z!ib1kBHlVM+lo;eMBqObeX5TrP;T)dIS@kq8q9es|MBa9m3brh1od%^WSZB5mry?k zB+1d2K}irbFH>KqNg2X{kF=8S7m!`B`8jP_E9R<4V*U{LO8#xhD49kzWxdiBp-iLY z^R4I+-(r@UtG$)I<6FUhhVgY*ny7l zI|d*T3{t^!JW9uqzDcO$AAtR@U~_|Sy>L{?O@ig$XRLWEc;Bf?{{GvA70e;c+RFUn zwph_`HIAoaQ;_s39T%C0UD8YT%R_M1%rR6eN#Kfbuw%vBfEXTF5+plkz_G!eh8P3K zs;PNL439GjK-(4ZS%h`Q@YJiYPPJSzk80+tVtD$Mx+>zYg4+}DK54*XW09;|oYW$F z-GY8f^~_c07N@re1G`Pb8uXG@vP(|MAhD9;j`4QEyFlZ7U?!u;eoX}n2&spdK_GL` zdLrC5NKrI3w`cfLP;?W$*p2>fHy7o_$hMeUoSK-33Lx}&KL}&q4L1qyBZuAl_MAT2 z{{gh5K0u20=*kR{J=nEkenh6izG=TOA$lBO7G3hgWrAL@a};@L;=>cMSzjo484M+o z7El9M#DIkl@nJp!5)f_*geaDyxF(WAwwwo191Qcv1R)9rMc6+>I6sjnM3eVa6XWDF zFmXY|UWxcYyGk}hl#2#p$|z=v?d;&IzT;oIaBRqT;nc`6WwF!PY!X{AV_>?8NZu%% zHU&ji**4l7!q)-#zrasC15^rk3AGel5$sRyzd95jj87I^|24G2z{By({fc_moP_&V}cb&!-2|dmfnEGE7^>+?I#3!^y+xs+_rbg??ae&oJ#7b9>%k z&KT;z%2;?K`9}KL>u;{KKd^UZ*v^c-6HDikbI|qh1ABXhZO_=--#3`zZ-Pp_Fq@oB z58gPje4+^c*jBw}Yt7nP(~~({yK*xCS0<^8*UQ%mIJ?C_zmoFa z>3+BGR^NU4AK9O?@3E)ew>sAil+O6nL|L2*&n2IO8;bU2bLwk%zVz-_Z+-Rtu75o6 z^8*jfXW~Qe*EFX6HzLcCTx)-}rXS?L&6(K(^z+y~bR}P|kJ?Fl%t8PCPI) zJ+d`sn$G2H=QHg2$Byc^UxJ=@FWXk>oNN2)V9wDWKlQk#ch$K%`0mMDCv!D> 
zN6MM<<}95nuVl=9U=1=2>8>o(29^=rhvZpXp0$uP#F!vvj`KRym={b*Q)<_BYrd{N zHJ_lBnzziW?F7CGn>*_Ab=`TmT5+}ItD1h(U9Hov_fi&nib-*a7cxvE(O&{US;e3V zl_%&cSfY4p1g@M%G`yB)%5LyMYr96P%v=4A=w@jTBb(3QO_>+?!tscAgPLQ4E{U3R z9>^HfoQetpT=7;`3Qz+tPP3Fy?tD~+3zz70B~89`nZao;gV#oI09;WEE%m0Z1^QvG z4g*^)RTWc`u}sOStTsXGN-a0dl>w*dPS7vqE6xN) zpnOF&2S%fOMa39|6y-yjAwe=#QlliJstADSVt)UnQ={%VHwdRlH0Jh3Bm7SHG>D-n z5mUW09`|-PUP|}QdECKKQdoYSywlBxM7{u90`&+bS;#1UtD;U%!~u8!XJC%9LEwql z5@s=H3PeHZj7>$SM8d#jr|7@J`@kRs2Uy@TNIFh7&G<+QA9xOl*|4c&tE>&gW;V+v zVp~)I7ybi`D_q0~H6S95$aUieA)|;G1Iz=bFjrl6T>A@!{~KD1Z6L2f*0fT!bxW@- zzLGG#4?^cu@>J?#y8Xts-3Y~|we<(y+XfNv5+c~{NSnZ+~T@uuj!vu@4V0X~bI zvn!!5Tp?H2ExoY#Law?iG4!au2kyl4ma4qPk+;}E?B(m)^KemK-}%&OY%nFQ2|BSS z<$`;7qh*b4$+9ir7ph%#t;X&S-`n+&J)CdoOlaS-!M!wm3~H`zAfbK8Hs>3=q128O zMCsHBMq-BTBJA^)&im_c{?8w+{Pw@zJ>nfc?$OJv&lmBBd7n=<`F!wHAxLa+j9}@e0QaV-t7?HA=#U|( z%-{lCt$S~2$P{Ire_|pu>tR1Dsfu{!RkBnl(n6FT@_u48A}m4ucA!8J5d!`T7q|r1 z%*|O)S-@$yFSRG_Ouw|!nCUxw?~Nx~jYYpsX>@u;o)x(ZH2pU$75H$0GXZbWc!!1W zLVq5^XIP?$GbpT1S)2*%esyWfPOM|AxpxSp&5>WJwGGSdNs7)M* zZVrSuN6*2eIEL48I$l3s;ZCvup@-TEw(wyO%b;Jk{L8%cFXND)A^oTh@ z&CnwjK$Jzc6*0>tn$ToFpysvpx= zq;7&rSfpnauJm#M-c%F|E4tq+jZ!?X#isbSR^@x8!lIj=re-POZKwb??ShYR6C#;G zGJ?z72oL^O^wY|E^btS?9Xz*afjG($L5iWTpK9SQL?)!X5f{f$yQ{6aE6K!Rb!)PIAVG@`+w8kDf^C&1bM!e@jb{ ztH?}jFX9T}_$UVlRi3%pO^7Zip|2x;Q9+4QkL^uFxpOR>PM!vx2h~ub;WaJ2x%ehr zuPw2QY`&#^t>xKl%d=1Iti!ZUu|^ZYu}Amo4=?>H@uS~82KCJACFF6-Dl9xo5Jo$T3S=jdx6r(|0 z#brqR?i#huQniiw_HO*&vpwI{`=Q;XbK^B`^|}F5ECn|VU5hWT8!>01n%dUQn6gmL zT2!gwKGD&+Zp9)S<)|TkmEo+t{WsXpM>{pT-ruq&ow95)dH@D42KC(hsp$(Ac&mtl z>5tMD{ueCoPf=TdbA=?eDd78d)l~ty%fChHYvDltN9bPoD~ynlf#?)&U}gm)as+Q; z>IWDRZukbK2m`DV__3e}aU@rhh> zOA+%P+x!2kARz==)Yw3fpj}iZNH*3B?;`p>E<9Wt5Lgf$Uq`&DqNAar=qSjHIz@tP zDLRC^Fs%1@L2L*=#3}v=X96Qo1z!JSEW{{?!+wm{_o%$S8rZhN>oHa0^_Ww5J*GDC zdMw(+>!IkQ?X=GGTYZt&!p<4MRd0dYj(kl6IIW&ITj+-M7imgU9j{%ZtFm;}FJ1fo*}3~eomQuR zqSa|MKmZ!{8;w^R*EN4e)1AOZS<^S9tI~P{$u?8)ZVCQ+WW9xCTdA5lv$)PLl1S>Kf3%Snnp;9;(W<)V$dI(KgJ2(@0}^V#OKQNcB-D 
zFt2O0^tN>Ojh)Lo*C|M@?*0fr2&~ktTNt`Ay?dR4&#LZS%Pk8@-m89sB{2KCrhI*K z9`v4;_I!QIht)$g-IO<2$da^}c^|G}&Lq#Q>o5Zc#OzquncTTip7ci?2PUAo=u*bSD%%yL52z;BP2M I5(fN#0R-GMMgRZ+ literal 0 HcmV?d00001 diff --git a/model_executor/layers/fla/ops/__pycache__/op.cpython-312.pyc b/model_executor/layers/fla/ops/__pycache__/op.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5be779b4ac9f9982a08422d83a0cb2599d123f3 GIT binary patch literal 1656 zcmah}O>Y}T7@qZh*jYP~gs*@Uh|qN2*@9@gumzP$@8{AXK3qkkaPHsqgqBv4{nu-FM!3p4n&SeP(xm zo}4rhG@^YI{AeKbryLBMHb_njDTE#)ANdMFw(L_x2~&0|RBbiXY)wJF>T5&~4ckx< zMvbYV#ChMCKF^1lulvSR(3Z_lH{U^szKzvJp^uD4+XQ@e6zS)74)7~u@Tq7Kry9jE zj0yM~WAGf{b7Sx+Y_KBM8VeUWKj&w@R-O(BJ0D36>*72+pF(&VG!`#%{>0D$dU^be z#?r;7>|$H_4^8ubHNoqDH>o*+##cIlpw{e2tuVJV`9ex;lvI)8L}WPe@pj;0VQ?Do zIC`M~TT;YKz?_=P>zF#Ml_W9c*cbWrySJVDn;$#tw>R%OpMAb5ly`-4-5b7h!_BFX zoJ#O9f)!I={bX8$_DN6DiW5v-9>kHw>n^t*#O|rxTi~vkv4i@=!vpHb3m4K zp>4Sg3!Q;4Sfm-)1>z#RMPje%^iZ0InM~?Q8!Re+81&LGyeG-Q-t(|`IwP6O5K5Os zDrRC@TC&_f2B*aX!l>K}mA+D`D)c@oMMN4$+X0Oup@PF=qUtg(ZKw(r;$TeT8jK+d zv&TcWQWcu)gxUyr&r^}EyUgXBiVSh1TFb3rDt|luoy+3;JC)Ul%!hDfErM6+RT#?e zgZ&NRL{${!w1^b*FSOD{E6>r=v8wOi+`GBIy0>~*Sp44mRlWLy_ltV{H?weL&UMYX z-L=s)J9Cs>=w=u8?i`!B&eFk>q=!j%vQs&z?B024pv;@cdUoI5vtOkQy?Cl3BmYun zCwVlL?>snou)F?;YCgKrxpi>saPFgL>e})ATi-d~I!E(gbm7vj{I2I7ec5pioWq41 k&-BU}z1E#y1NubM^vp>P89BKMB!$^m+4R{`koD|%4Y!ng+5i9m literal 0 HcmV?d00001 diff --git a/model_executor/layers/fla/ops/__pycache__/solve_tril.cpython-312.pyc b/model_executor/layers/fla/ops/__pycache__/solve_tril.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8130193a9d059a30dc2459e8771efa84ae881a4c GIT binary patch literal 27843 zcmeG_ZA@EPcJJB7U)ToQV1r45F$uv5AF&}MLr6A|4EfG<2s<-nlJOYc15Aufo()Op zbC;~PT6;PYrdkQJyH%Z0r6HY2L!^<0kyhHMYSoeY$F8#r`%=iYPgyZ4@R&prNUtyYb}7<4-*_JmVTo5VD(#bw%KBtDMo0;nw`f$}CujXdUd5=gPs#e_Ue&0&Pd!@P zS3Iie)8H71jaYaz+4soqte^K03PSmbBpvE2A&V&)p(Ip85}$MkBh(+^eWhdxSvr9Q 
zwC6FRm@NGXMrg>ApJ0D2`-B7WeO!W(vJo2(G5aR8gzg8J2mb_5$FceLX482eQF0oa z#-{NBz&Rkf2DJ=fMCoblnhAV3z~0a2%)UJsjnVjh{U_3N+>G?Y3K{lcwG11OhHxlv z7{c!x`IfPm>$9bKA5jLm%K7>cCQ>?bfKSW53Bx+^=5^w=>%>=x#}BHB${%Ft$YC-hjdtq?*7B(72l-7`Z3la*q;BDZGQDZdYJ@Y|KvuNrEwSU+i@C z_ME%u`jej5Po2AR?v2Zg%tMkSBMT0;IPFqO1682V8UmGqm|HEAwzXxfqvL^~wV$;5 zeefH!PO2NM{o_IFkUwZ;ROha^-sHnhhcexhFe=~psOzSi8VfLL3)h#doi~?&`hN$!Jf%(VOkC0e0;NMCeh$kEQA^7`wAr zFhlVXZuMnzaSoALGxOw=@M~A7IHabF>7q%1#=hmSc4|W!T1ow<00yE+&O=LSH6fh@ zf=ogR4{ro&nfnAY5hIAGIS*fQE7%B0VQ_}jAzdE45?WUnT${sFu$GmEO6XF$WD>?w zNaoYh8d`?>D)W}1R)rv!pkBgp$Mq6{6Ec1`6w=dr*j-|OAtA6q3Hr9vVHA@NUq1~n z4S93Hjw)n?eOL*MJ5(nvstYaW$5IG(Xsgo*w(e|=aTs;!mdGd3SS_O!fp-OMK+{{! z1DIsAd{9CcrEw5Y+YFR4SVxKo`6mDiJ)v>|EU{5vPAKkYcD{K=JDO{99)7fkOg?u| z&nKp9I3%dykc5yCvLVSQTpdDYL2jI8)99i!B?!ry+!eHW1Wi>rPqx(nnVU(r(*^;p z3^}MENE>Jq%t+i$yBG_Zg5`WJLK;L9_T)rBkWk%M^Ausf5U$s8>^AnjiV?JS<~)3F5jc!{aqJah*oByQ zj0lUp_%emMgk8re)W~OOBXUnd9YnFn0YCaE*lgg>HaO-Y{5O3A?m*B*p*$YnkiC>b&12MKlz)hFkGebrBlG&*1nl+vOp(VIY;iJj zH|6#Xk&KSr()D}&1J_++L5fifLQcp)p_P&;x;aczBvX{mD;@Rt7-?X9lqm|p<_6-4 z;O#MzQIlSBl=KB%!Q1v-4Bp8|Iz1E`XN-2$4wBl3nztW0HsrvN!>AJE*ui0>9#;@{ zgRcHzjRwqe%P*%$nm>8jC9o1iJWtc+&?%NAcKr_48`<=BSYI#Mvh#UpTQw> zzst?YAl(x{gT=bRLjh-wG3M+h&E;sj2WL~7<10m& zP9K)OP-EH>7V7!9FY2W>@(X-lKEk#iCJiP03(&m4)Jir37YiM!`6gs+69kJXxKd&C~?j8ZAJ z1zM@re&CLlg@+;yF=MRvBS-A*xm`*1zSSwUG2)}n*w9=WNYnz7lmr-i`($ z*JAxijWwyXPIWG;io=2En~|F_V+bM^JhAV{V#UL@=Tb?D8Yra1O7zo1k)i0o zeBC3RV@cPV)U|%r^LgKczLf4{MEbO3U)()kGv70BUT`c>584(T3kM%c9(FC>N|l@q z%O30Z&r9atT#(G4S#U4f9tF!DMvg{LK|Xh?*qTsVr%pZ98lsLL4a96Qcf599a<_iY zn=&2zrM4lAKZQ!)jNFXgfF7{TosHj^yExyOtZH14B&(cBvvc7}(tLEWEotsdm2@R^ zU1906PXD8>+3uO{Sj&%Z#P`N-&UMV!LW%hssWL}W=YTrsN~ec`n9|sdIps$+@!t6U zr1`-7sRd(7cQ_(lQDM5q6*X3*47c1l4?`BnU7}wqu_jBb%Nku2dLpWb)x`Se>f$xA zs(4_&=kBcqeSCD`oy7^DcBwindm+bE#nZ}&a#>TltSOo9j&w&`XWM7mmyGpEWBuYp zxI3lkUaqS7Z-q>yUeRExl4(su6CH?k%?!u!=(YH%kGt=7&$oTraj)ac!FYFS?{|Np 
ze2YTcxy`e46xzB_M&y*C50+)(*sq)Av{+{drJ@Cug$8h2qD|z8?!XES9z#@?#Z(+^`d;1bD?NTF6I<@rqSB99}N3h`wzh3^5w$X^RN22ppP#R~(aJ0?McaytEqe z(i>6+hEZWC1AHe?O+cSr^e(}85Tnno9Qv4Zcxgl*A&hnC1DK)LMX0okHfN}0{%~Ry zM{VYKYctHP?+NO;)wwnO{p8NDy9E%%#@g-z&4+t@ZNN{9%67Q9gsWdczVAJPRw0%; z=h--wHq(1()%{|=M?ktE2*LMMO)H{!;E(`{SYNQSYp(Q;Yin*VShoR7ykBd!4Ka7+ zvq(1iWW3J%E5O43J`aMn5C33&WI$^r3|6D8_d0UFhcA`T-59mrBg zkaH9{uOX)$IcPjm$B@&3oa4wj0S+T2{6XqFD0mV%o#5DOsV?L{g&b59>NIjT6pB%= zqb%LXIfI}78kQ9xD#ClLt+T#q=0-y~fYkX2#!d5#~@61@^R9_vb~_atR|rn-Iu5+A*BNpDN)ZR-oz@c6Zbvx|cu@iC~wa*+5K z%JLO4+#Ox$N$fkiSpTp)Th3TMyMJbXtZu>d$Z&MYa4cy!_Oq^^pa0@~%5WyEcx)(J zKRKeKnzhf^V^`wNxp(IAxvTSz#IDAap=rtBNE#fU**-t;;6Tc7Jgit=l~Xcz zZN4k9t2t#jv}9;W8jwi#`~#@+cftywMX6!>R^%3l*xru2Qzi8YUH#Ma2Oj95XG*AL)-Q>D!a~ z_MbU_-u^{_lYZu~I!P zkH|p`Enm<+Ql3~=mMmB7d!dk(%2zO1iG1qJ*BVS?1PU%0YLkZA&B#%WYIYx486q5?0x7rO)3=p9PXOB@C9FhnILD?U_GH z9UCNN$s>IRzC0k{7RC7TZ;t$VtF=?G9G@abqHWqndzCCiWe!B4bW}_xR3_AA5aWLZ z<8UL1Z-t=dVtC36geRW{j;TO+^2u$@1;SGm#eH&t96!I*f0rP4eyRU%de?@Mx{Tz3 zpHa3m2J+xa%N~$M+4l9mD)ZptzORDdvgEw|B zRes_3Bc9)~D!=eM5YJz=Du0#WOkjlK?tvZp0`&}^6Yw4|D@7R*ax zrvcSDrvVirrvXA3Vl>dxdqmCws_6<|>Kv;0aP?_PvEZ~sv8mINf>377X^E8rqLu&Y4mTZOKr{l0zk}fJ(YoNF{dG6Hv)U3r`iqsAS8bQk4jmgfP~ll5!m? z+32bam8#g2w+imOt%9!Pe3i6?^I1USG_C1c1!q8#yk|fg?KQ>EZE6K`&&%x(bI+Fc zZX06mNzlG+KuoWDxpl1DHpJXI?%y`V+N^AP=5>1;TSM z51zUL;c3W&XMch4H0HrmUm!e9u**OFy^VL`nUgtwPTj^%+@5F0n%*53M%U-Srf?Nz z*MBe%E^bCD$WFXrQ@9G#uEtH_QWT_JO@c9x1d=(=#;LH5+ChV&gkNXr3SkKLO3fRW z-#8W4=7U0bfK7l4pbPahwqExtx=9ERu&svf$XB~O=?7L*BTw#wbh8jY%g?N?k24Ph zA+IPx4Z3kP{qjI-q7UV%PfH#M`RdcKCHge3rB5FGO|&yl3tC~PCk^~T07qw+%6OA9`#X5XJY( z9ovSOTgT(uhM2ooJ|Tc8rYm+BbbZA%whoKwEQ#uhm4Yo4bP*07cE1HTuh4@FZ<>o? zZUbE>G0bf{j`;{}5ygCD$1xv;`;cO7KDy(WU!$8vF~7FsnA>TmDCYJZ$9#-FB#Qai zj$`hiTSYN<>^SD*bc-nF<2#P|1nm&Td}8~UK?8I5CN(g{XS3a6bsz*Y7e9l62Ij_? 
zxK$tYBE@H~!eQce1As|&!-l!<6BX*O`3^jR|SWiqAHM z!<6BV*O^@kjS00$d^RaGCg6?NnLW0E3G|E&`Sgt6g`LT>L*QgJj;THrhi@%Lt zO^fU4+HdAwh?tK4X3DL-PqMkW@m1p!tLOT65a7B7SQT3Lio*Y8@&{YE9DhoUM%_fU$^J(L%(9ss()wyIu55~vs4 z*Ql&}Q7lI_le0HuN_X^z%-WkSo4Z-_s>{9XO`ex6_pY3S%3h&Mi@1>LCZq9>2a&<>v;j#uMFz(y z-vD>I2N^J|7=9@H`~$cVfb4Z6p-t|?+3=RYKRHOwW8cSV$&?0iawmqu+&)MPNyMRC z1wFrzG$acZh2%7TM-$B0F9A!xa$b7x!wG3f0qMoT%qbhK;5D**|(Q)XE_VW7_EXfB0Y1kJ{&~$FWD!U(a{BK!d$!H^g?w7v#Z=4 zKwdQxf}Qe@LLZXW;4o?BZto*&>?k#YjpyN~3yL5}(}y+ZD!_xtb8e7B-@x`22O$mhrpN0=v_YL1 zaF3E!cYy0(=;}0Pu0wj*Z&uJJ^=xn3-)nSOZ+e2mSrCC?_ZSN(Jrth(4?JLS@gNK} z3dzhV3Y^P!JZ&mtb+lczX~DQ1pxCuwn--x|g$M@k$~D+=8bwPk_p)|97-ibF#h@>M z$|GuXPg?w|&Kj4cehL}>0Fztb*Wj7LRvIvkcKQg|WN=1~#Tt^ubqQHrqPT8(&w<1q zXVfu!e!{JuBR&9lBzPPs*KgdJ07VHO?53RRNvPlk(sB?hI_yi=$kX;;Xh5v%**QHLU4y3DfAN&ZUiP?q($Z05yvUrbqvlfIf5h-wD@v(BloHZLYBW1U8BZ z@)pJRp2R=K*&6|~BxWcH1 z$#KdP2zmwrjM5GN&k!8pWDURdMO&G5JEvES|to}c+`hUaf zevKVXVn?5pn`irG`exsoc`vc|=we-}yfc9re$~_(Ym2ufEeHOky*GiGmlc(X-K|MQ z>k1~pCCl>qg#BcqzVo3aaq+#x1y@4$KHTQVB?-u&#w5lm(-Lk-;)dvjWX;8d@lp!E zyo{GlnSNM(r}`^7W~qBoB-6@&Q@JmE?q4l+;qzbw7%tLlBid+x{M_P!u=W?q*PmGG zP;@uGT)FRSlpP?ebzt>FtB^=mN->EB;92fiVu2X>2;aLRIf1{8gIQOl`iJx#dPU0y zb;w!(Sb1G3VS}X@*rB}|xw@ifg9gOd(h+*a$Og->GSh6wOvl&dDEM58N%lS012W`$ z179g)Gj&OJs(oLBX@i<|TQ`Q|$Y?F!IyRVX9UIKHjtyp8#|E>lL&3GJV>1mxQ}gy- zE!TUt&Gnv*?Y)g#cTTpdb!;%(IyRVX9UIKHjtyp8hk|Qc$7Xs*iAl6DIqg5VY~A;9 z@!jID6b@XstkAF{MhgBXrRB4l8BHvhEUjHBf(XcMte8DGb8rP^1;M4f_=i{TT>V-F zA%sQyl>~ghfBt7W7@@y^{+5J&O-kVvKs%$qm)g#^YyP3db^+7;(_Y&Jx#oWX&7=Mp literal 0 HcmV?d00001 diff --git a/model_executor/layers/fla/ops/__pycache__/utils.cpython-312.pyc b/model_executor/layers/fla/ops/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5f82c2628d57d703f332c46c328f5d6e81c162b GIT binary patch literal 9355 zcmb_hdu$s=dY|3po0KR%^p-5@;Rh{Qviyi&iep+2%NI*>B-QDthBXeZ3TD{-Z6odHmCU zGs`6@#r7dUhv3ZY>^C#teDi(Z@0;0wE-Er1NWA7Rf}d3&^hf-lCPTKe`cE2!E+ZDP 
z6py@0NqH&qOnYfkGF}Esn%9K1UM+=b7+x3Bd-bHQ;SC|9*BCN+P2`#4&0aImXn9Mh z$Xi6(bbN8h>a|jcMjSonQklq?ob4v1JbCRv!;Edpzf)_hUZr6T<698&I-g5vTt`{@ zTuKvbzDX-HWJfIn+VbbpTk`bk9Nr4Hh|{vgoOx`A%4@zHrYqrrwf-rJVGUbyPUEfm ze;UgMV^wq1*xo!RY7Ka)9j00PInB*%lXt^&DNU>c^AGgS0{S|TPtn*Rb;NwBjwsM< zX;=qT8`KtWJv=pVX0D#4ZZb*_ZzHtT_(`SliDVzRgl_dZqpPc)Pg=$ zf~1e<=TvKBLuxHwvPj*fN#xziZH5tEff1@;gj0Dcwf3h*n4&}%g`U&*v^q+@R9D6# zq65|AZ-CZrBV5$MILsJ4uTNvX^(~-1HF>SeYUy^6TsvnSW1m4?t7^1&Cz-4M`|zBX zSRvOSw=KkapEu_Qbxx3xpF+LyJ>U;vGwN=+ra5_n(@+eG;$N$$knn< zTsdg*o?#uk@tjtbvUl$^<=*@y7D&IFk`0}Hp7)>OIjC#8!_knNmKjfYTGk&6N}S;5 zp}iy!6$CCU`6hV3G#n8^(591wpcDzqww_}iU*|yI@uSDO2Yr15UEMO(E}KZ}QD1le zDY+Qek976>dXB#CJA6W>w#zm6aj+YAJ9(npH+1s&@xkt)A>Xm?Q{Bg8dN|xF>jX}U z3gJN3-;w>(Lym1?bqccc=sbO%xr|28`!q|v&qz7Sv8<;eeX`knPQzJR0$;8qmMN%f z^7Uce96iU(p<#yAp0lWuo26pZ5OVAMCt(7U3zEmha)F58mm-2o8ud%AfIl$GiBRWU zp@=BC1TFw_gF=k*lIV&IyZo-(fN6HQTU|dK<-%l+T+iVs&$lT2@hk^Rhjk0nutwnh3KfbgvB#g{nVkTre~kE|O%i%X#dngUsUB?YFbga7K^g2sQy zoY!AQdEK8k+|1fR7kcvpgx&yEouy})81n{L&aCEr{j64knv!BUt0-vGs3TxJ5vdDJ z1@9>JUA}ET7jIJitx6pcW;&hkIj0|{So)kHMvWQO z{`pev`^-^d^9B~h=qfaCh|w%F1Kn6ug>VTRP%-T=GeY5~_YE;^fTqw8(m=#DZ;GKH zyuU&HB{i!nSYJ+}^G1wJf)>VyNUsX|&l|?fd9KtNnMF6us4JsTOb32L)4#@<;Z4{H z|JALKy^1IZFJtIU7>;ZiHA@4lbQJv+b(x~jg(3}_M!#UrQXf;40>$B^ZBx-b`}U6S;oA7%nYM{(X*3dUo89T#p05lvoYff1$7AC{cbW+5F1&T}$mt{zYo>wW~UK{6@rn_&uA0S|btrJb2n?tzaWcbmVVMDNmYJF0gscU>90IR^y-b*O zn*|5-5I`ufsGzJR&O|nng%S6mtWg0T45k=>gnCR~L^jcfdjvUAHYtzT%?nm|FJg@b zf#7r~9#`Zd>$6)uQ~N?qz#Aui1=%-o^i+o`8>p&?m*=;u3^rCX=(H@}i9 zKlGXN5Hzl0(WZ|M{AuvtnY!M+bR<#M z@|dAYw>_$;SsY#(Pn7RSl??0%*AuFDRb>o)je}_y2ia+zub_h**4dgHk7UyYLbSUMRuuYxhGM(^S)tM+FCkS zvH#piwR`W6OL+KLQap)LEVm_bn72$16Ef&alzs|?arlGmAC z$l{3i3+*t}8k55w1pikLLv|hOa=p->x^Fncm-sAMhp`GzH9Bbo#>is0$ln`dJ-? 
z!juMZC%zNbiYV_^)RoN|W5(S8s|49;Kz1!g_5clcF7Q+L2ctiZ0RkZVS>`Ni zMC}msoQ0Ot(3b+xonlU-vy_`&i{99rv9nv1Aj@GC81^icH=YVEO+QoTR`Em zZ$U{YP!eh^7b+FdWUJXYV8w3h(6PA{;=fYSWw*0H?CeCHSTnicz&qc?5w1GH&wx`Rk`zN!{>%K2kOIc0N!l!acP zd=360RvwV-QxnozbNIJPS7~3)LLJmHdkdfr=yM5MA%VKThRS7RMA&|n^em`*meoGX z{l3V(kCE$Tbrz)@0!Jg|oMaOc80>JSsk3=^19~_Sb?XIkB-9I0z{U^M!0OB}P<|`oU48GsLG+JCkBPZ!q#y^6r3+66*`Oa{+YpOQTjF?G6N-xd zH7K6jC>;0{MkG1^4!F+3`Ht0GWpLavhPQQHf|FRz$-)Ut+b;|Pg)oG(6Og$X;bkaf zW)vc9W(wp@69LE!$Oh_I2C_Z`F*h913;m{S^}$6?U_5stCj1CU--f@q4YD};qNqMq z)EMu1R8o<)I@8v&$9j!b7eA7*fMQ-eFn?fSa&ah^V>?1EdiLC^7CSvTT`xJi(k;D|TD z37|G%$PIt*Beq+IAUhV^`1dY0vk<*X)8{LCN%yPb6@f8D{76XPZNLYD@3nN@=T zsNoPmva21zG9i>==@AGVP6JiWKAC>GIOL!5iK7s?vc3=(`tpAvxlOWG@P|h@xkRxi zAAw{ahAd0AB@h8yCT1b=S8y{YQ`_e&3~8nq>+#|Bi^ zu|Pf5qaw$}{`vkDOMTK(zhc>vv}{?i00OnHSh|vyt~;-#EJxBME`YP5;*1VDJ;vQL zNNY5G^8`Wl>!+aMmDmwD(3uVH2F`EAMquN;I?r}=-J|z3Ywm8Lp?>gO1QH?&%}_z`LpzEwT|3pfCtq^;SxX)uxFGFuX5Aff&hlB@9OUHoIG{{ zU*HFLQQ+bA8lPpunvz%8h%h}Xj*Y> zO**zFEL*>RXf21$ChspFxO8B}(FCvWS(;$ijT^HNP*aDl)p;o9E}|gscbjE{&lmQG zIG;~8`FwC473Fc=;`9A^)X!(%FrF@ttn>68?;h-yb-e?}p(0G+HJbx)&mo35xY1%| zMwA4Z0T>Ykc()GZgmK)4vtPhp#A+L8>d(QaHC${;9~ zho0a92Nbr)xLtA5WHc;S*_3spL3S!oxNkYV!Hll;lsBOTCpdmWypQz-U(3GOf95?F zzIr6Ol=GO+j2e32jsaV~vT!UB*_ev};2maOh~eh4tcS@f1b8V?l5K1K2xy6VMV56q zB?|K8`w8#Dc)y0f*a;cd7cImL^?OqF`x1r?>6#t4_9tr&CJdEntLw&*m8O@IO)uYO zQ%#*oYiGjH34w!iUEpAWao*eHMjsn@Gk;I*W(y`5n?>n@ALD)aV{H`z;u2Il%B)=tSVe&KhViK=z`-b z1C&aLWrUg%c?tOZCR?$|XS-pGlkF-gOaogS0%cI5qe46i_$T~lf_zY#mP>L2`bHzz zA<24Je^7u(^E95II4W`q+aquTHY)2!{UYXAHt>LuQ9ppB1BOt76`u$`Eh50TCo--S z^5a!|mahw)Kd25kX1L{Y0wMA<(_C7+|>&yn%>sOk5p z^xsk4ACcuhQ8PR~GFHu%uNbS6#;V1=DPvPy_o$?Dfxc|KWK2|TOO>?VYD|{wju)ki z%H{?yo}NFw5K9#`Ei=iY=C~Bc z>2#v*<=bMav@>PviUVr4QjUe!7XwS=q@38$mMq<#LC|>X_1l3vyMNblwBJn-|Ly^`2zq;U@?lvP4kJ)F9`MgrjlA(Vld) z-*@bsFH7$0OYIwo>mE3&7I!agT5d`?+7hVbfxSLuZ;V?X87mgMlg9eEE^V$_97&ot z#tmsp_2N|0vN>){m)71ZJ(Xy9>GsCoxPRqd**}!rKa@CeGPNHXPbH8&?WlQz7$=?9 z6fbO44xlYbd&}}`DSKN&(}vx{j*Re@`%PlIYZ-=8XWrO3c 
zz51SQU{Sa>b7kh*+gILBv>v|GnX2pkq%&DJ5HEgER=YTSEpjEYJe4Zjn?R1Vv-X~| z55&6l`qq_(!^wujciK}8-FN)ShTcy+$%a0{(gTfouIJ*w{6M0%Go|Umtn7cFFP@vc z_|E)03D@3~ejl0tiN#1cpH`TujdSc%1O*6D*@(xq4mM(E20?KP@CAxH;PId!5+|<- GzWg7Yp@NG5 literal 0 HcmV?d00001 diff --git a/model_executor/layers/fla/ops/__pycache__/wy_fast.cpython-312.pyc b/model_executor/layers/fla/ops/__pycache__/wy_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a816d2bb9e17c5d0e742e54e8075e99f4a06646a GIT binary patch literal 6858 zcmd5=Yit`?6}~g$$9O#Bci!(;8~3r7v51%Y&%5a#_yL?dwo@lKN>^GW zuH-Z4o^$Ux=brn`+~fOOLEsRyh~fKTj}xIklTN+4Ys}^^X@pjgh(s!au97Dmp%&Zi2ob9>3&WYv|D2H-XfC-@h z{Z=fC5a4N}&)ExgS#xtoLCR zr0Kq0Rlbpme297=)FTva5)K_`YzKh(ki`==3+_j6`y^Gwli`>{;9(0H;ZyWB{ z7O?Yq)hf11Xs*4<`r3Apx9=kF*hSvCi@XcuolX2wx6BBM-IA^Oyyns%-2tY8HnHbz z!&Und6`-lwt}h~Q?^ELDyqDH!C8C)WIjqE@PYD9Ol-3fLrMO>~e1U7p=)5l+6~h53 zp;^ve^j)61@W#pWff`ha4ZsXS1YVmCAS(bkks^*ol%y^qFMVF)0!d#&x*n0DiF?Gn z!7qmr31XuEpEG01LVPS9jvrP)Dk+C=CXasMwfUpcSR_0<7GF}X#iC<3B9Vo$g_tNs ze9}!RkW^ywSj4|1$%(OG#6K2`C&m_+d_jLgc`3f6vBx6*g;~-6Wmy0hRGo|5378VF%^fovx{S&mGWbC1?6(d2?}(J#jn8b{cK;txp) zuUVtcXw*54x~$R1FTw8|{4UF|Pq1dq9Ee4O;ZQHY z!nY(DURg!pOu!B2Rr4zHP|4z+R-qJT_SVKh|n$2)a}iYRP1QVR)3D9 zdq=C!{4Cx?`t$Lba-~}L@N34cpO3E>^Ygi_tUc9++i}}1ILuO(sDM4#LiTFwX(6jh z!Kj9~^n-8v_Fx*f0Xk~Cj9bVoYzb1BgP#lAhz2-YkP;2|0Xw9nI`*(>s@tsEdm)>h zd)PE?w^@y>Y8j%VF;|s{H1M^!MGANu(Tc{c*t;71G(&xdXn0_%ADpS~Jw|tl=7!Nd z4WpZs1P$6|o;w>o*Wa2{uR^|EdTW?ZzG6EX7OS0P& z(d@g12j?2U)U|xqbDJ7!@Y|$p!aaCMWSA^~m(~ z)=EPP5bzFTKrf6U9^0Fu2oBrSNW%)6p$NggT1hr>3%rk3a^~niSSjWk5XA$ctX0W2 zM2%qk7NYFi5Vda)&&&=)?exZdCU5&2yfs7AzHNva#iPv-g?nERWuwjZx4xGtZ$m82 zQRc1hSL#5+*k%aZj}J6O*#2e_d?tiB4QL5@x$`n9T4X=~(n~Hcr3<`tRYa6fGudWi z$*Aa;m!J}r0`s8Vnso})oIetYE&7z;~c`j+B6y;y&Fm6ZeHQAXA!YwkDZ{4l_%{Vc!j+(@B!ZNVvWm zU~$$5{jhJIsCBYPWb;Hlud5dm2O-e@7jH5-yR>#$3d9!TNk#H4`jWojqUf8K)EDHsH+0p-#!Ji(!dw7*94^Ct}>FEu-JAXa9R2VKg?~G>L z>FGb19e?3%xzpLx`L7nIO9OXLtDb{p{?Ix<{5_syKeRMqFeP0m-&%(epKa0AGqFczt>*oC+Zff 
z`r>ufGg{`o>--^=KlI?j`)}TRv&_F*vsPh|;K&bU&t|EQ-K_=Z^3lR{ap2DBqW{jh z%*71zM6j<5y{gc=!Sb0zekQw^wH4ed+g-R`5Q{U#6Q#*j*S*(D2iNG;*m7Q>#;@FRr=Q5Sh_PTu-Qebne+xk>n-v%$_VUJ+`H1h|5bSwZ=FA4~+7!%o4tCIi z?9Tsz$y_KK$_GmO9^1w<%!b3A5B_MTXeeA+j;W53b-P!!drL#_d+&M6_9GbsEGrkx z2J;sS!=+dBDNk4~$7WerV_+q?94uUX?;Cf%vEDPT_KcTXCq8<0oqfIHMUIhYhmfOX zuQbWa?SQ1Ik_%wpeMog4TBTISxXO=jCCr{5$X?FRWX}D6CalE#Y^tP`PT%8I$B}jW zYpVUVHTQ2iKI|ylr?+LVZ0iG%?47$$NbY3vS6+UZka0pLfM{&U7f3E7k_+-HM5cd> zO|qmUcNh8sUJ`jf`4+(APx6Tt>>Tq8`{%<5%LuXeFs?NNg_K!hz^!l z2}hD9A0^~P&umiEdM!{RT=T~znQV(paw3xhRAb<(OEW_C5V!{A!$MqH@`lvmGe z=YO5qvJb{Dl8I4xJJP7)K&=P!{*};j=uyjo!u9tS?<^MOJKs@T4%|MUo=RWO3}*a! zC!94obJm=vv)1&~hK0+uW!v(uY*+et#e^n~JsLm#nC&Udl?ETPhaX<5Fv#cvMgM_E zedF)6=DM<7`I!R!*wXbuV(m-iqi55{H!Ri~dBO_oY`e;~7o26b3#@}HYu1{d%3m)G z6lTh7UwZ0^p}R0!94PwljugJB8oZw(m(lbG&RSvM`6qZ9;rR(1Xs(Kxa4fR&6${}w z!dVH&6HXwU4O!h4JK-G2(_3*8&V|~BD{jJh$ha25wGyt4aP5en2+>+NuSfdYF>wY71FOc$#v&T)f-t~QVB;AdTFcU zi&V0rbW;bM#$A(=ayX%c0||}w!>y>2j7pkmTEBYKn7Oc`SpuWdwsY2U<*2jo*4~-=}#SlnX`ZF z9Ll^7C6vXU6S6{nws>~!Kt}i-d-CJXAtF9SZFCHNN``;RAav4TF!N_sqUwyrz+RY!mEi0as15s1)+OxDaRxntR5 ig_ifa?{q)1j;O|c&+MT4$L2`}GM6C7#@gUXH~$}oxqa9G literal 0 HcmV?d00001 diff --git a/model_executor/layers/fla/ops/chunk.py b/model_executor/layers/fla/ops/chunk.py new file mode 100644 index 0000000..4c8bf9f --- /dev/null +++ b/model_executor/layers/fla/ops/chunk.py @@ -0,0 +1,240 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. 
+# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang +# ruff: noqa: E501 +import warnings + +import torch +from einops import rearrange + +from .chunk_delta_h import chunk_gated_delta_rule_fwd_h +from .chunk_o import chunk_fwd_o +from .chunk_scaled_dot_kkt import chunk_scaled_dot_kkt_fwd +from .cumsum import chunk_local_cumsum +from .l2norm import l2norm_fwd +from .solve_tril import solve_tril +from .utils import SUPPRESS_LEVEL, input_guard +from .wy_fast import recompute_w_u_fwd + + +def chunk_gated_delta_rule_fwd( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + scale: float, + initial_state: torch.Tensor, + output_final_state: bool, + cu_seqlens: torch.LongTensor | None = None, +): + g = chunk_local_cumsum(g, chunk_size=64, cu_seqlens=cu_seqlens) + # obtain WY representation. u is actually the new v. + A = chunk_scaled_dot_kkt_fwd( + k=k, beta=beta, g=g, cu_seqlens=cu_seqlens, output_dtype=torch.float32 + ) + A = solve_tril(A=A, cu_seqlens=cu_seqlens, output_dtype=k.dtype) + w, u = recompute_w_u_fwd( + k=k, + v=v, + beta=beta, + A=A, + g_cumsum=g, + cu_seqlens=cu_seqlens, + ) + h, v_new, final_state = chunk_gated_delta_rule_fwd_h( + k=k, + w=w, + u=u, + g=g, + initial_state=initial_state, + output_final_state=output_final_state, + cu_seqlens=cu_seqlens, + ) + o = chunk_fwd_o( + q=q, + k=k, + v=v_new, + h=h, + g=g, + scale=scale, + cu_seqlens=cu_seqlens, + ) + if SUPPRESS_LEVEL < 3: + return g, o, A, final_state, None, None, None + elif SUPPRESS_LEVEL >= 3: + return g, o, A, final_state, w, h, v_new + + +class ChunkGatedDeltaRuleFunction(torch.autograd.Function): + @staticmethod + @input_guard + @torch.amp.custom_fwd(device_type="cuda") + def forward( + ctx, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + scale: float, + initial_state: torch.Tensor, + 
output_final_state: bool, + cu_seqlens: torch.LongTensor | None = None, + use_qk_l2norm_in_kernel: bool = False, + ): + if use_qk_l2norm_in_kernel: + q = l2norm_fwd(q) + k = l2norm_fwd(k) + + g, o, A, final_state, w, h, v_new = chunk_gated_delta_rule_fwd( + q=q, + k=k, + v=v, + g=g, + beta=beta, + scale=scale, + initial_state=initial_state, + output_final_state=output_final_state, + cu_seqlens=cu_seqlens, + ) + ctx.scale = scale + ctx.use_qk_l2norm_in_kernel = use_qk_l2norm_in_kernel + return o.to(q.dtype), final_state + + +@torch.compiler.disable +def chunk_gated_delta_rule( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + scale: float = None, + initial_state: torch.Tensor = None, + output_final_state: bool = False, + cu_seqlens: torch.LongTensor | None = None, + head_first: bool = False, + use_qk_l2norm_in_kernel: bool = False, +): + r""" + Args: + q (torch.Tensor): + queries of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`. + k (torch.Tensor): + keys of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`. + v (torch.Tensor): + values of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`. + g (torch.Tensor): + (forget) gating tensor (in log space!) of shape `[B, T, H]` if `head_first=False` else `[B, H, T]`. + beta (torch.Tensor): + betas of shape `[B, T, H]` if `head_first=False` else `[B, H, T]`. + scale (Optional[int]): + Scale factor for the RetNet attention scores. + If not provided, it will default to `1 / sqrt(K)`. Default: `None`. + initial_state (Optional[torch.Tensor]): + Initial state of shape `[N, H, K, V]` for `N` input sequences. + For equal-length input sequences, `N` equals the batch size `B`. + Default: `None`. + output_final_state (Optional[bool]): + Whether to output the final state of shape `[N, H, K, V]`. Default: `False`. 
+ cu_seqlens (torch.LongTensor): + Cumulative sequence lengths of shape `[N+1]` used for variable-length training, + consistent with the FlashAttention API. + head_first (Optional[bool]): + Whether the inputs are in the head-first format, which is not supported for variable-length inputs. + Default: `False`. + + Returns: + o (torch.Tensor): + Outputs of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`. + final_state (torch.Tensor): + Final state of shape `[N, H, K, V]` if `output_final_state=True` else `None`. + + Examples:: + >>> import torch + >>> import torch.nn.functional as F + >>> from einops import rearrange + >>> from fla.ops.gated_delta_rule import chunk_gated_delta_rule + # inputs with equal lengths + >>> B, T, H, K, V = 4, 2048, 4, 512, 512 + >>> q = torch.randn(B, T, H, K, dtype=torch.bfloat16, device='cuda') + >>> k = F.normalize(torch.randn(B, T, H, K, dtype=torch.bfloat16, device='cuda'), p=2, dim=-1) + >>> v = torch.randn(B, T, H, V, dtype=torch.bfloat16, device='cuda') + >>> beta = torch.rand(B, T, H, dtype=torch.bfloat16, device='cuda').sigmoid() + >>> g = F.logsigmoid(torch.rand(B, T, H, dtype=torch.bfloat16, device='cuda')) + >>> h0 = torch.randn(B, H, K, V, dtype=torch.bfloat16, device='cuda') + >>> o, ht = chunk_gated_delta_rule( + q, k, v, g, beta, + initial_state=h0, + output_final_state=True + ) + # for variable-length inputs, the batch size `B` is expected to be 1 and `cu_seqlens` is required + >>> q, k, v, beta, g = map(lambda x: rearrange(x, 'b t ... 
-> 1 (b t) ...'), (q, k, v, beta, g)) + # for a batch with 4 sequences, `cu_seqlens` with 5 start/end positions are expected + >>> cu_seqlens = q.new_tensor([0, 2048, 4096, 6144, 8192], dtype=torch.long) + >>> o_var, ht_var = chunk_gated_delta_rule( + q, k, v, g, beta, + initial_state=h0, + output_final_state=True, + cu_seqlens=cu_seqlens + ) + """ + assert q.dtype == k.dtype == v.dtype + assert q.dtype != torch.float32, ( + "ChunkGatedDeltaRuleFunction does not support float32. Please use bfloat16." + ) + assert len(beta.shape) == 3, ( + "beta must be of shape [B, T, H] if head_first=False, or [B, H, T] otherwise." + ) + + if head_first: + raise DeprecationWarning( + "head_first is deprecated and will be removed in a future version. " + "Please use head_first=False for now instead.", + stacklevel=2, + ) + q, k, v, beta, g = map( + lambda x: rearrange(x, "b h t ... -> b t h ..."), (q, k, v, beta, g) + ) + if not head_first and q.shape[1] < q.shape[2]: + warnings.warn( + f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). " + "This may indicate the inputs were passed in head-first format [B, H, T, ...] " + "when head_first=False was specified. " + "Please verify your input tensor format matches the expected shape [B, T, H, ...].", + stacklevel=2, + ) + if cu_seqlens is not None: + if q.shape[0] != 1: + raise ValueError( + f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`." + f"Please flatten variable-length inputs before processing." + ) + if initial_state is not None and initial_state.shape[0] != len(cu_seqlens) - 1: + raise ValueError( + f"The number of initial states is expected to be equal to the number of input sequences, " + f"i.e., {len(cu_seqlens) - 1} rather than {initial_state.shape[0]}." 
+ ) + if scale is None: + scale = k.shape[-1] ** -0.5 + o, final_state = ChunkGatedDeltaRuleFunction.apply( + q, + k, + v, + g, + beta, + scale, + initial_state, + output_final_state, + cu_seqlens, + use_qk_l2norm_in_kernel, + ) + if head_first: + o = rearrange(o, "b t h ... -> b h t ...") + return o, final_state diff --git a/model_executor/layers/fla/ops/chunk_delta_h.py b/model_executor/layers/fla/ops/chunk_delta_h.py new file mode 100644 index 0000000..f0b78b6 --- /dev/null +++ b/model_executor/layers/fla/ops/chunk_delta_h.py @@ -0,0 +1,344 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang +# ruff: noqa: E501 + +import torch + +from vllm.triton_utils import tl, triton + +from .index import prepare_chunk_indices, prepare_chunk_offsets +from .op import exp +from .utils import use_cuda_graph + +NUM_WARPS = [2, 4, 8, 16] + + +@triton.heuristics( + { + "USE_G": lambda args: args["g"] is not None, + "USE_GK": lambda args: args["gk"] is not None, + "USE_INITIAL_STATE": lambda args: args["h0"] is not None, + "STORE_FINAL_STATE": lambda args: args["ht"] is not None, + "SAVE_NEW_VALUE": lambda args: args["v_new"] is not None, + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[ + triton.Config({"BV": BV}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [2, 4] + for num_stages in [2, 3, 4] + for BV in [32, 64] + ], + key=["H", "K", "V", "BT"], + use_cuda_graph=use_cuda_graph, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_gated_delta_rule_fwd_kernel_h_blockdim64( + k, + v, + w, + v_new, + g, + gk, + h, + h0, + ht, + cu_seqlens, + chunk_offsets, + T, + H: 
tl.constexpr, + Hg: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BV: tl.constexpr, + USE_G: tl.constexpr, + USE_GK: tl.constexpr, + USE_INITIAL_STATE: tl.constexpr, + STORE_FINAL_STATE: tl.constexpr, + SAVE_NEW_VALUE: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_v, i_nh = tl.program_id(0), tl.program_id(1) + i_n, i_h = i_nh // H, i_nh % H + if IS_VARLEN: + bos, eos = ( + tl.load(cu_seqlens + i_n).to(tl.int32), + tl.load(cu_seqlens + i_n + 1).to(tl.int32), + ) + T = eos - bos + NT = tl.cdiv(T, BT) + boh = tl.load(chunk_offsets + i_n).to(tl.int32) + else: + bos, eos = i_n * T, i_n * T + T + NT = tl.cdiv(T, BT) + boh = i_n * NT + + # [BK, BV] + b_h1 = tl.zeros([64, BV], dtype=tl.float32) + if K > 64: + b_h2 = tl.zeros([64, BV], dtype=tl.float32) + if K > 128: + b_h3 = tl.zeros([64, BV], dtype=tl.float32) + if K > 192: + b_h4 = tl.zeros([64, BV], dtype=tl.float32) + + # calculate offset + h += ((boh * H + i_h) * K * V).to(tl.int64) + v += ((bos * H + i_h) * V).to(tl.int64) + k += ((bos * Hg + i_h // (H // Hg)) * K).to(tl.int64) + w += ((bos * H + i_h) * K).to(tl.int64) + if SAVE_NEW_VALUE: + v_new += ((bos * H + i_h) * V).to(tl.int64) + stride_v = H * V + stride_h = H * K * V + stride_k = Hg * K + stride_w = H * K + if USE_INITIAL_STATE: + h0 = h0 + i_nh * K * V + if STORE_FINAL_STATE: + ht = ht + i_nh * K * V + + # load initial state + if USE_INITIAL_STATE: + p_h0_1 = tl.make_block_ptr(h0, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0)) + b_h1 += tl.load(p_h0_1, boundary_check=(0, 1)).to(tl.float32) + if K > 64: + p_h0_2 = tl.make_block_ptr( + h0, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0) + ) + b_h2 += tl.load(p_h0_2, boundary_check=(0, 1)).to(tl.float32) + if K > 128: + p_h0_3 = tl.make_block_ptr( + h0, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0) + ) + b_h3 += tl.load(p_h0_3, boundary_check=(0, 1)).to(tl.float32) + if K > 192: + p_h0_4 = tl.make_block_ptr( + h0, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0) + ) + 
b_h4 += tl.load(p_h0_4, boundary_check=(0, 1)).to(tl.float32) + + # main recurrence + for i_t in range(NT): + p_h1 = tl.make_block_ptr( + h + i_t * stride_h, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0) + ) + tl.store(p_h1, b_h1.to(p_h1.dtype.element_ty), boundary_check=(0, 1)) + if K > 64: + p_h2 = tl.make_block_ptr( + h + i_t * stride_h, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0) + ) + tl.store(p_h2, b_h2.to(p_h2.dtype.element_ty), boundary_check=(0, 1)) + if K > 128: + p_h3 = tl.make_block_ptr( + h + i_t * stride_h, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0) + ) + tl.store(p_h3, b_h3.to(p_h3.dtype.element_ty), boundary_check=(0, 1)) + if K > 192: + p_h4 = tl.make_block_ptr( + h + i_t * stride_h, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0) + ) + tl.store(p_h4, b_h4.to(p_h4.dtype.element_ty), boundary_check=(0, 1)) + + p_w = tl.make_block_ptr( + w, (T, K), (stride_w, 1), (i_t * BT, 0), (BT, 64), (1, 0) + ) + b_w = tl.load(p_w, boundary_check=(0, 1)) + b_v = tl.dot(b_w, b_h1.to(b_w.dtype)) + if K > 64: + p_w = tl.make_block_ptr( + w, (T, K), (stride_w, 1), (i_t * BT, 64), (BT, 64), (1, 0) + ) + b_w = tl.load(p_w, boundary_check=(0, 1)) + b_v += tl.dot(b_w, b_h2.to(b_w.dtype)) + if K > 128: + p_w = tl.make_block_ptr( + w, (T, K), (stride_w, 1), (i_t * BT, 128), (BT, 64), (1, 0) + ) + b_w = tl.load(p_w, boundary_check=(0, 1)) + b_v += tl.dot(b_w, b_h3.to(b_w.dtype)) + if K > 192: + p_w = tl.make_block_ptr( + w, (T, K), (stride_w, 1), (i_t * BT, 192), (BT, 64), (1, 0) + ) + b_w = tl.load(p_w, boundary_check=(0, 1)) + b_v += tl.dot(b_w, b_h4.to(b_w.dtype)) + p_v = tl.make_block_ptr( + v, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0) + ) + b_v = tl.load(p_v, boundary_check=(0, 1)) - b_v + + if SAVE_NEW_VALUE: + p_v = tl.make_block_ptr( + v_new, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0) + ) + tl.store(p_v, b_v.to(p_v.dtype.element_ty), boundary_check=(0, 1)) + + last_idx = min((i_t + 1) * BT, T) - 1 + if USE_G: + 
m_t = (i_t * BT + tl.arange(0, BT)) < T + b_g_last = tl.load(g + bos * H + last_idx * H + i_h) + p_g = tl.make_block_ptr( + g + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,) + ) + b_g = tl.load(p_g, boundary_check=(0,)) + b_v = b_v * tl.where(m_t, exp(b_g_last - b_g), 0)[:, None] + b_g_last = exp(b_g_last) + b_h1 *= b_g_last + if K > 64: + b_h2 *= b_g_last + if K > 128: + b_h3 *= b_g_last + if K > 192: + b_h4 *= b_g_last + + if USE_GK: + o_k1 = tl.arange(0, 64) + b_gk_last1 = tl.load( + gk + (bos + last_idx) * H * K + i_h * K + o_k1, + mask=(o_k1 < K), + other=0.0, + ) + b_h1 *= exp(b_gk_last1)[:, None] + if K > 64: + o_k2 = 64 + o_k1 + b_gk_last2 = tl.load( + gk + (bos + last_idx) * H * K + i_h * K + o_k2, + mask=(o_k2 < K), + other=0.0, + ) + b_h2 *= exp(b_gk_last2)[:, None] + if K > 128: + o_k3 = 128 + o_k1 + b_gk_last3 = tl.load( + gk + (bos + last_idx) * H * K + i_h * K + o_k3, + mask=(o_k3 < K), + other=0.0, + ) + b_h3 *= exp(b_gk_last3)[:, None] + if K > 192: + o_k4 = 192 + o_k1 + b_gk_last4 = tl.load( + gk + (bos + last_idx) * H * K + i_h * K + o_k4, + mask=(o_k4 < K), + other=0.0, + ) + b_h4 *= exp(b_gk_last4)[:, None] + b_v = b_v.to(k.dtype.element_ty) + + p_k = tl.make_block_ptr( + k, (K, T), (1, stride_k), (0, i_t * BT), (64, BT), (0, 1) + ) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_h1 += tl.dot(b_k, b_v) + if K > 64: + p_k = tl.make_block_ptr( + k, (K, T), (1, stride_k), (64, i_t * BT), (64, BT), (0, 1) + ) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_h2 += tl.dot(b_k, b_v) + if K > 128: + p_k = tl.make_block_ptr( + k, (K, T), (1, stride_k), (128, i_t * BT), (64, BT), (0, 1) + ) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_h3 += tl.dot(b_k, b_v) + if K > 192: + p_k = tl.make_block_ptr( + k, (K, T), (1, stride_k), (192, i_t * BT), (64, BT), (0, 1) + ) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_h4 += tl.dot(b_k, b_v) + # epilogue + if STORE_FINAL_STATE: + p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0)) 
+ tl.store(p_ht, b_h1.to(p_ht.dtype.element_ty), boundary_check=(0, 1)) + if K > 64: + p_ht = tl.make_block_ptr( + ht, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0) + ) + tl.store(p_ht, b_h2.to(p_ht.dtype.element_ty), boundary_check=(0, 1)) + if K > 128: + p_ht = tl.make_block_ptr( + ht, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0) + ) + tl.store(p_ht, b_h3.to(p_ht.dtype.element_ty), boundary_check=(0, 1)) + if K > 192: + p_ht = tl.make_block_ptr( + ht, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0) + ) + tl.store(p_ht, b_h4.to(p_ht.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_gated_delta_rule_fwd_h( + k: torch.Tensor, + w: torch.Tensor, + u: torch.Tensor, + g: torch.Tensor | None = None, + gk: torch.Tensor | None = None, + initial_state: torch.Tensor | None = None, + output_final_state: bool = False, + chunk_size: int = 64, # SY: remove this argument and force chunk size 64? + save_new_value: bool = True, + cu_seqlens: torch.LongTensor | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + # This kernel is slightly different from fla to support Q/K with different head numbers. + # In fla, Q/K always have the same head number, so Hg is always equal to H. + B, T, Hg, K, V = *k.shape, u.shape[-1] + H = u.shape[-2] + BT = chunk_size + + chunk_indices = ( + prepare_chunk_indices(cu_seqlens, chunk_size) + if cu_seqlens is not None + else None + ) + # N: the actual number of sequences in the batch with either equal or variable lengths + if cu_seqlens is None: + N, NT, chunk_offsets = B, triton.cdiv(T, BT), None + else: + N, NT, chunk_offsets = ( + len(cu_seqlens) - 1, + len(chunk_indices), + prepare_chunk_offsets(cu_seqlens, BT), + ) + assert K <= 256, "current kernel does not support head dimension larger than 256." 
+ + h = k.new_empty(B, NT, H, K, V) + final_state = ( + k.new_empty(N, H, K, V, dtype=torch.float32) if output_final_state else None + ) + + v_new = torch.empty_like(u) if save_new_value else None + + def grid(meta): + return (triton.cdiv(V, meta["BV"]), N * H) + + chunk_gated_delta_rule_fwd_kernel_h_blockdim64[grid]( + k=k, + v=u, + w=w, + v_new=v_new, + g=g, + gk=gk, + h=h, + h0=initial_state, + ht=final_state, + cu_seqlens=cu_seqlens, + chunk_offsets=chunk_offsets, + T=T, + H=H, + Hg=Hg, + K=K, + V=V, + BT=BT, + ) + return h, v_new, final_state diff --git a/model_executor/layers/fla/ops/chunk_o.py b/model_executor/layers/fla/ops/chunk_o.py new file mode 100644 index 0000000..4e8e04c --- /dev/null +++ b/model_executor/layers/fla/ops/chunk_o.py @@ -0,0 +1,183 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. 
+# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +# ruff: noqa: E501 + + +import torch + +from vllm.triton_utils import tl, triton + +from .index import prepare_chunk_indices +from .op import exp +from .utils import FLA_GDN_FIX_BT, check_shared_mem, is_nvidia_hopper + +BKV_LIST = [64, 128] if check_shared_mem() else [32, 64] +NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8] + + +@triton.heuristics( + { + "USE_G": lambda args: args["g"] is not None, + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[ + triton.Config({"BK": BK, "BV": BV}, num_warps=num_warps, num_stages=num_stages) + for BK in BKV_LIST + for BV in BKV_LIST + for num_warps in NUM_WARPS + for num_stages in [2, 3, 4] + ], + key=["H", "K", "V", "BT"], +) +@triton.jit(do_not_specialize=["T"]) +def chunk_fwd_kernel_o( + q, + k, + v, + h, + g, + o, + cu_seqlens, + chunk_indices, + scale, + T, + H: tl.constexpr, + Hg: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_G: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + + if IS_VARLEN: + i_tg = i_t + i_n, i_t = ( + tl.load(chunk_indices + i_t * 2).to(tl.int32), + tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32), + ) + bos, eos = ( + tl.load(cu_seqlens + i_n).to(tl.int32), + tl.load(cu_seqlens + i_n + 1).to(tl.int32), + ) + T = eos - bos + NT = tl.cdiv(T, BT) + else: + NT = tl.cdiv(T, BT) + i_tg = i_b * NT + i_t + bos, eos = i_b * T, i_b * T + T + + # offset calculation + q += (bos * Hg + i_h // (H // Hg)) * K + k += (bos * Hg + i_h // (H // Hg)) * K + v += (bos * H + i_h) * V + o += (bos * H + i_h) * V + h += (i_tg * H + i_h).to(tl.int64) * K * V + + b_o = tl.zeros([BT, BV], dtype=tl.float32) + b_A = tl.zeros([BT, BT], 
dtype=tl.float32) + + for i_k in range(tl.cdiv(K, BK)): + p_q = tl.make_block_ptr( + q, (T, K), (Hg * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0) + ) + p_k = tl.make_block_ptr( + k, (K, T), (1, Hg * K), (i_k * BK, i_t * BT), (BK, BT), (0, 1) + ) + p_h = tl.make_block_ptr( + h, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0) + ) + # [BT, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + # [BK, BT] + b_k = tl.load(p_k, boundary_check=(0, 1)) + # [BK, BV] + b_h = tl.load(p_h, boundary_check=(0, 1)) + + # [BT, BK] @ [BK, BV] -> [BT, BV] + b_o += tl.dot(b_q, b_h) + # [BT, BK] @ [BK, BT] -> [BT, BT] + b_A += tl.dot(b_q, b_k) + + if USE_G: + g += bos * H + i_h + p_g = tl.make_block_ptr(g, (T,), (H,), (i_t * BT,), (BT,), (0,)) + b_g = tl.load(p_g, boundary_check=(0,)) + b_o = b_o * exp(b_g)[:, None] + b_A = b_A * exp(b_g[:, None] - b_g[None, :]) + + o_t = i_t * BT + tl.arange(0, BT) + m_t = o_t < T + m_A = (o_t[:, None] >= o_t[None, :]) & (m_t[:, None] & m_t) + b_A = tl.where(m_A, b_A, 0) + + p_v = tl.make_block_ptr( + v, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0) + ) + p_o = tl.make_block_ptr( + o, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0) + ) + b_v = tl.load(p_v, boundary_check=(0, 1)) + + # to fix mma -> mma layout conversion + # already solved by triton v3.2 or higher + b_o = b_o * scale + tl.dot(b_A.to(b_v.dtype), b_v) * scale + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_fwd_o( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + h: torch.Tensor, + g: torch.Tensor | None = None, # cumsum of log decay + scale: float | None = None, + cu_seqlens: torch.LongTensor | None = None, + chunk_size: int = 64, +) -> torch.Tensor: + B, T, Hg, K, V = *q.shape, v.shape[-1] + H = v.shape[-2] + BT = 64 if FLA_GDN_FIX_BT else min(chunk_size, max(16, triton.next_power_of_2(T))) + chunk_indices = ( + prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None + ) + NT = triton.cdiv(T, BT) 
if cu_seqlens is None else len(chunk_indices) + if scale is None: + scale = k.shape[-1] ** -0.5 + + o = torch.empty_like(v) + + def grid(meta): + return (triton.cdiv(V, meta["BV"]), NT, B * H) + + chunk_fwd_kernel_o[grid]( + q, + k, + v, + h, + g, + o, + cu_seqlens, + chunk_indices, + scale, + T=T, + H=H, + Hg=Hg, + K=K, + V=V, + BT=BT, + ) + return o diff --git a/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py b/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py new file mode 100644 index 0000000..7724fa5 --- /dev/null +++ b/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py @@ -0,0 +1,154 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang +# ruff: noqa: E501 + +import torch + +from vllm.triton_utils import tl, triton + +from .index import prepare_chunk_indices +from .op import exp + + +@triton.heuristics( + { + "USE_G": lambda args: args["g"] is not None, + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[ + triton.Config({"BK": BK}, num_warps=num_warps, num_stages=num_stages) + for BK in [32, 64, 128] + for num_warps in [2, 4, 8] + for num_stages in [2, 3, 4] + ], + key=["H", "K", "BT", "IS_VARLEN"], +) +@triton.jit(do_not_specialize=["T"]) +def chunk_scaled_dot_kkt_fwd_kernel( + k, + beta, + g, + A, + cu_seqlens, + chunk_indices, + T, + H: tl.constexpr, + Hg: tl.constexpr, + K: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + IS_VARLEN: tl.constexpr, + USE_G: tl.constexpr, +): + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = ( + tl.load(chunk_indices + i_t * 2).to(tl.int32), + 
tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32), + ) + bos, eos = ( + tl.load(cu_seqlens + i_n).to(tl.int32), + tl.load(cu_seqlens + i_n + 1).to(tl.int32), + ) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + o_t = i_t * BT + tl.arange(0, BT) + m_t = o_t < T + + p_beta = tl.make_block_ptr( + beta + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,) + ) + b_beta = tl.load(p_beta, boundary_check=(0,)) + + b_A = tl.zeros([BT, BT], dtype=tl.float32) + for i_k in range(tl.cdiv(K, BK)): + p_k = tl.make_block_ptr( + k + (bos * Hg + i_h // (H // Hg)) * K, + (T, K), + (Hg * K, 1), + (i_t * BT, i_k * BK), + (BT, BK), + (1, 0), + ) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_kb = b_k * b_beta[:, None] + b_A += tl.dot(b_kb.to(b_k.dtype), tl.trans(b_k)) + + if USE_G: + p_g = tl.make_block_ptr(g + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)) + b_g = tl.load(p_g, boundary_check=(0,)) + b_g_diff = b_g[:, None] - b_g[None, :] + b_A = b_A * exp(b_g_diff) + + m_A = (o_t[:, None] > o_t[None, :]) & (m_t[:, None] & m_t) + b_A = tl.where(m_A, b_A, 0) + p_A = tl.make_block_ptr( + A + (bos * H + i_h) * BT, (T, BT), (BT * H, 1), (i_t * BT, 0), (BT, BT), (1, 0) + ) + tl.store(p_A, b_A.to(p_A.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_scaled_dot_kkt_fwd( + k: torch.Tensor, + g: torch.Tensor | None = None, + beta: torch.Tensor | None = None, + cu_seqlens: torch.LongTensor | None = None, + chunk_size: int = 64, + output_dtype: torch.dtype = torch.float32, +) -> torch.Tensor: + r""" + Compute beta * K * K^T. + + Args: + k (torch.Tensor): + The key tensor of shape `[B, T, H, K]`. + beta (torch.Tensor): + The beta tensor of shape `[B, T, H]`. + g (torch.Tensor): + The cumulative sum of the gate tensor of shape `[B, T, H]`. Default: `None`. + cu_seqlens (torch.LongTensor): + The cumulative sequence lengths of the input tensor. + Default: None + chunk_size (int): + The chunk size. Default: 64. + output_dtype (torch.dtype): + The dtype of the output tensor. 
Default: `torch.float32` + + Returns: + beta * K * K^T of shape `[B, T, H, BT]` where `BT` is the chunk size. + """ + # This kernel is slightly different from fla to support Q/K with different head numbers. + # In fla, Q/K always have the same head number, so Hg is always equal to H. + B, T, Hg, K = k.shape + H = beta.shape[-1] + BT = chunk_size + chunk_indices = ( + prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None + ) + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + + A = torch.empty(B, T, H, BT, device=k.device, dtype=output_dtype) + chunk_scaled_dot_kkt_fwd_kernel[(NT, B * H)]( + k=k, + g=g, + beta=beta, + A=A, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + T=T, + H=H, + Hg=Hg, + K=K, + BT=BT, + ) + return A diff --git a/model_executor/layers/fla/ops/cumsum.py b/model_executor/layers/fla/ops/cumsum.py new file mode 100644 index 0000000..99b4179 --- /dev/null +++ b/model_executor/layers/fla/ops/cumsum.py @@ -0,0 +1,280 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. 
# IS_VARLEN is derived from whether `cu_seqlens` was passed.
@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
@triton.autotune(
    configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8]],
    key=["B", "H", "BT", "IS_VARLEN", "REVERSE"],
)
@triton.jit(do_not_specialize=["T"])
def chunk_local_cumsum_scalar_kernel(
    s,
    o,
    cu_seqlens,
    chunk_indices,
    T,
    B: tl.constexpr,
    H: tl.constexpr,
    BT: tl.constexpr,
    REVERSE: tl.constexpr,
    IS_VARLEN: tl.constexpr,
    HEAD_FIRST: tl.constexpr,
):
    # Chunk-local cumulative sum of one scalar per (token, head): each program
    # handles one chunk of BT time steps for one (batch, head) pair, reading
    # from `s` and writing to `o`.
    i_t, i_bh = tl.program_id(0), tl.program_id(1)
    i_b, i_h = i_bh // H, i_bh % H
    if IS_VARLEN:
        # chunk_indices stores (sequence index, chunk index within sequence)
        # pairs; program_id(0) selects the pair.
        i_n, i_t = (
            tl.load(chunk_indices + i_t * 2).to(tl.int32),
            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
        )
        bos, eos = (
            tl.load(cu_seqlens + i_n).to(tl.int32),
            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
        )
        # From here on T is the length of this sequence only.
        T = eos - bos
    else:
        bos, eos = i_b * T, i_b * T + T

    if HEAD_FIRST:
        # Time is the unit-stride dimension.
        p_s = tl.make_block_ptr(
            s + bos * H + i_h * T, (T,), (1,), (i_t * BT,), (BT,), (0,)
        )
        p_o = tl.make_block_ptr(
            o + bos * H + i_h * T, (T,), (1,), (i_t * BT,), (BT,), (0,)
        )
    else:
        # Heads are interleaved: consecutive time steps are H elements apart.
        p_s = tl.make_block_ptr(s + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,))
        p_o = tl.make_block_ptr(o + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,))
    # [BT]
    b_s = tl.load(p_s, boundary_check=(0,)).to(tl.float32)
    b_o = tl.cumsum(b_s, axis=0)
    if REVERSE:
        # Inclusive suffix sum: total - inclusive prefix + the element itself.
        b_z = tl.sum(b_s, axis=0)
        b_o = -b_o + b_z[None] + b_s
    tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0,))
+@triton.autotune( + configs=[ + triton.Config({"BS": BS}, num_warps=num_warps) + for BS in BS_LIST + for num_warps in [2, 4, 8] + ], + key=["B", "H", "S", "BT", "IS_VARLEN", "REVERSE"], +) +@triton.jit(do_not_specialize=["T"]) +def chunk_local_cumsum_vector_kernel( + s, + o, + cu_seqlens, + chunk_indices, + T, + B: tl.constexpr, + H: tl.constexpr, + S: tl.constexpr, + BT: tl.constexpr, + BS: tl.constexpr, + REVERSE: tl.constexpr, + IS_VARLEN: tl.constexpr, + HEAD_FIRST: tl.constexpr, +): + i_s, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = ( + tl.load(chunk_indices + i_t * 2).to(tl.int32), + tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32), + ) + bos, eos = ( + tl.load(cu_seqlens + i_n).to(tl.int32), + tl.load(cu_seqlens + i_n + 1).to(tl.int32), + ) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + o_i = tl.arange(0, BT) + if REVERSE: + m_s = tl.where(o_i[:, None] <= o_i[None, :], 1.0, 0.0) + else: + m_s = tl.where(o_i[:, None] >= o_i[None, :], 1.0, 0.0) + + if HEAD_FIRST: + p_s = tl.make_block_ptr( + s + (bos * H + i_h * T) * S, + (T, S), + (S, 1), + (i_t * BT, i_s * BS), + (BT, BS), + (1, 0), + ) + p_o = tl.make_block_ptr( + o + (bos * H + i_h * T) * S, + (T, S), + (S, 1), + (i_t * BT, i_s * BS), + (BT, BS), + (1, 0), + ) + else: + p_s = tl.make_block_ptr( + s + (bos * H + i_h) * S, + (T, S), + (H * S, 1), + (i_t * BT, i_s * BS), + (BT, BS), + (1, 0), + ) + p_o = tl.make_block_ptr( + o + (bos * H + i_h) * S, + (T, S), + (H * S, 1), + (i_t * BT, i_s * BS), + (BT, BS), + (1, 0), + ) + # [BT, BS] + b_s = tl.load(p_s, boundary_check=(0, 1)).to(tl.float32) + b_o = tl.dot(m_s, b_s, allow_tf32=False) + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_local_cumsum_scalar( + g: torch.Tensor, + chunk_size: int, + reverse: bool = False, + cu_seqlens: torch.Tensor | None = None, + head_first: bool = False, + output_dtype: torch.dtype | None = 
@input_guard
def chunk_local_cumsum(
    g: torch.Tensor,
    chunk_size: int,
    reverse: bool = False,
    cu_seqlens: torch.Tensor | None = None,
    head_first: bool = False,
    output_dtype: torch.dtype | None = torch.float,
    **kwargs,
) -> torch.Tensor:
    """Dispatch a chunk-local cumulative sum to the scalar or vector kernel.

    Args:
        g: 3-D input (handled by ``chunk_local_cumsum_scalar``) or 4-D input
            (handled by ``chunk_local_cumsum_vector``).
        chunk_size: Chunk length along time; the callees assert that it is a
            power of two.
        reverse: Forwarded to the callee to select the reversed cumsum.
        cu_seqlens: Optional cumulative sequence lengths; requires a batch
            size of 1.
        head_first: Whether ``g`` is laid out ``[B, H, T, ...]`` instead of
            ``[B, T, H, ...]``.
        output_dtype: Dtype of the result; ``None`` keeps ``g``'s dtype.
        **kwargs: Accepted and ignored.

    Returns:
        A new tensor with the same shape as ``g``.

    Raises:
        ValueError: If ``g`` is neither 3-D nor 4-D.
    """
    # Check the rank first: the layout heuristic below indexes g.shape[2],
    # which would raise IndexError for 1-D/2-D inputs before the intended
    # ValueError could be reached.
    if g.ndim not in (3, 4):
        raise ValueError(
            f"Unsupported input shape {g.shape}. "
            f"which should be (B, T, H, D) if `head_first=False` "
            f"or (B, H, T, D) otherwise"
        )
    if not head_first and g.shape[1] < g.shape[2]:
        # Heuristic only: fewer time steps than heads usually means the caller
        # passed a head-first tensor by mistake.
        warnings.warn(
            f"Input tensor shape suggests potential format mismatch: seq_len ({g.shape[1]}) < num_heads ({g.shape[2]}). "
            "This may indicate the inputs were passed in head-first format [B, H, T, ...] "
            "when head_first=False was specified. "
            "Please verify your input tensor format matches the expected shape [B, T, H, ...].",
            stacklevel=2,
        )
    if cu_seqlens is not None:
        assert g.shape[0] == 1, (
            "Only batch size 1 is supported when cu_seqlens are provided"
        )
    if g.ndim == 3:
        return chunk_local_cumsum_scalar(
            g, chunk_size, reverse, cu_seqlens, head_first, output_dtype
        )
    return chunk_local_cumsum_vector(
        g, chunk_size, reverse, cu_seqlens, head_first, output_dtype
    )
# The four flags below are derived from which optional pointers are passed.
@triton.heuristics(
    {
        "USE_INITIAL_STATE": lambda args: args["h0"] is not None,
        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
        "IS_CONTINUOUS_BATCHING": lambda args: args["ssm_state_indices"] is not None,
        "IS_SPEC_DECODING": lambda args: args["num_accepted_tokens"] is not None,
    }
)
@triton.jit(do_not_specialize=["N", "T"])
def fused_recurrent_gated_delta_rule_fwd_kernel(
    q,
    k,
    v,
    g,
    beta,
    o,
    h0,
    ht,
    cu_seqlens,
    ssm_state_indices,
    num_accepted_tokens,
    scale,
    N: tl.int64,  # num of sequences
    T: tl.int64,  # num of tokens
    B: tl.constexpr,
    H: tl.constexpr,
    HV: tl.constexpr,
    K: tl.constexpr,
    V: tl.constexpr,
    BK: tl.constexpr,
    BV: tl.constexpr,
    stride_init_state_token: tl.constexpr,
    stride_final_state_token: tl.constexpr,
    stride_indices_seq: tl.constexpr,
    stride_indices_tok: tl.constexpr,
    USE_INITIAL_STATE: tl.constexpr,  # whether to use initial state
    INPLACE_FINAL_STATE: tl.constexpr,  # whether to store final state inplace
    IS_BETA_HEADWISE: tl.constexpr,  # whether beta is headwise vector or scalar,
    USE_QK_L2NORM_IN_KERNEL: tl.constexpr,
    IS_VARLEN: tl.constexpr,
    IS_CONTINUOUS_BATCHING: tl.constexpr,
    IS_SPEC_DECODING: tl.constexpr,
    IS_KDA: tl.constexpr,
):
    # Token-by-token recurrent forward pass. Each program owns one
    # [BK, BV] tile of the state for one (sequence, value head) pair and
    # walks that sequence's tokens in order.
    # NOTE(review): `N`, `eos` and `stride_indices_tok` are not referenced in
    # this body. The token offset `i_t` is added to the state-index pointer
    # without a stride, which presumably assumes unit stride along the token
    # dimension of `ssm_state_indices` - confirm against callers.
    i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
    i_n, i_hv = i_nh // HV, i_nh % HV
    # Map the value head to its q/k head (HV // H value heads per q/k head).
    i_h = i_hv // (HV // H)
    if IS_VARLEN:
        bos, eos = (
            tl.load(cu_seqlens + i_n).to(tl.int64),
            tl.load(cu_seqlens + i_n + 1).to(tl.int64),
        )
        # `all` (shadows the builtin) keeps the launch-time token count before
        # T is narrowed to this sequence's length.
        all = T
        T = eos - bos
    else:
        bos, eos = i_n * T, i_n * T + T
        all = B * T

    if T == 0:
        # no tokens to process for this sequence
        return

    o_k = i_k * BK + tl.arange(0, BK)
    o_v = i_v * BV + tl.arange(0, BV)

    # Pointers to the first token of this sequence for the selected heads.
    p_q = q + (bos * H + i_h) * K + o_k
    p_k = k + (bos * H + i_h) * K + o_k
    p_v = v + (bos * HV + i_hv) * V + o_v
    if IS_BETA_HEADWISE:
        # One beta per value channel.
        p_beta = beta + (bos * HV + i_hv) * V + o_v
    else:
        # One beta per (token, value head).
        p_beta = beta + bos * HV + i_hv

    if not IS_KDA:
        # One gate per (token, value head).
        p_g = g + bos * HV + i_hv
    else:
        # One gate per key channel.
        p_gk = g + (bos * HV + i_hv) * K + o_k

    # `o` carries a leading K-block dimension, selected by i_k.
    p_o = o + ((i_k * all + bos) * HV + i_hv) * V + o_v

    mask_k = o_k < K
    mask_v = o_v < V
    mask_h = mask_k[:, None] & mask_v[None, :]

    b_h = tl.zeros([BK, BV], dtype=tl.float32)
    if USE_INITIAL_STATE:
        if IS_CONTINUOUS_BATCHING:
            # The state slot is looked up through ssm_state_indices. With
            # speculative decoding the column of the last accepted token is
            # used, otherwise column 0.
            if IS_SPEC_DECODING:
                i_t = tl.load(num_accepted_tokens + i_n).to(tl.int64) - 1
            else:
                i_t = 0
            p_h0 = (
                h0
                + tl.load(ssm_state_indices + i_n * stride_indices_seq + i_t).to(
                    tl.int64
                )
                * stride_init_state_token
            )
        else:
            p_h0 = h0 + bos * HV * K * V
        p_h0 = p_h0 + i_hv * K * V + o_k[:, None] * V + o_v[None, :]
        b_h += tl.load(p_h0, mask=mask_h, other=0).to(tl.float32)

    for i_t in range(0, T):
        b_q = tl.load(p_q, mask=mask_k, other=0).to(tl.float32)
        b_k = tl.load(p_k, mask=mask_k, other=0).to(tl.float32)
        b_v = tl.load(p_v, mask=mask_v, other=0).to(tl.float32)

        if USE_QK_L2NORM_IN_KERNEL:
            b_q = b_q / tl.sqrt(tl.sum(b_q * b_q) + 1e-6)
            b_k = b_k / tl.sqrt(tl.sum(b_k * b_k) + 1e-6)
        b_q = b_q * scale
        # [BK, BV]
        # Decay the state by exp(gate) before applying this token's update.
        if not IS_KDA:
            b_g = tl.load(p_g).to(tl.float32)
            b_h *= exp(b_g)
        else:
            b_gk = tl.load(p_gk).to(tl.float32)
            b_h *= exp(b_gk[:, None])
        # [BV]
        # Delta rule: subtract what the decayed state already predicts for k.
        b_v -= tl.sum(b_h * b_k[:, None], 0)
        if IS_BETA_HEADWISE:
            b_beta = tl.load(p_beta, mask=mask_v, other=0).to(tl.float32)
        else:
            b_beta = tl.load(p_beta).to(tl.float32)
        b_v *= b_beta
        # [BK, BV]
        # Rank-1 state update with the beta-scaled residual.
        b_h += b_k[:, None] * b_v[None, :]
        # [BV]
        b_o = tl.sum(b_h * b_q[:, None], 0)
        tl.store(p_o, b_o.to(p_o.dtype.element_ty), mask=mask_v)

        # keep the states for multi-query tokens
        # The state is written after every token, not only after the last one.
        if INPLACE_FINAL_STATE:
            p_ht = (
                ht
                + tl.load(ssm_state_indices + i_n * stride_indices_seq + i_t).to(
                    tl.int64
                )
                * stride_final_state_token
            )
        else:
            p_ht = ht + (bos + i_t) * stride_final_state_token
        p_ht = p_ht + i_hv * K * V + o_k[:, None] * V + o_v[None, :]
        tl.store(p_ht, b_h.to(p_ht.dtype.element_ty), mask=mask_h)

        # Advance every pointer to the next token.
        p_q += H * K
        p_k += H * K
        p_o += HV * V
        p_v += HV * V
        if not IS_KDA:
            p_g += HV
        else:
            p_gk += HV * K
        p_beta += HV * (V if IS_BETA_HEADWISE else 1)
+ N=N, + T=T, + B=B, + H=H, + HV=HV, + K=K, + V=V, + BK=BK, + BV=BV, + stride_init_state_token=stride_init_state_token, + stride_final_state_token=stride_final_state_token, + stride_indices_seq=stride_indices_seq, + stride_indices_tok=stride_indices_tok, + IS_BETA_HEADWISE=beta.ndim == v.ndim, + USE_QK_L2NORM_IN_KERNEL=use_qk_l2norm_in_kernel, + INPLACE_FINAL_STATE=inplace_final_state, + IS_KDA=False, + num_warps=num_warps, + num_stages=num_stages, + ) + o = o.squeeze(0) + return o, final_state + + +class FusedRecurrentFunction(torch.autograd.Function): + @staticmethod + def forward( + ctx, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + scale: float, + initial_state: torch.Tensor, + inplace_final_state: bool = True, + cu_seqlens: torch.LongTensor | None = None, + ssm_state_indices: torch.Tensor | None = None, + num_accepted_tokens: torch.Tensor | None = None, + use_qk_l2norm_in_kernel: bool = False, + ): + o, final_state = fused_recurrent_gated_delta_rule_fwd( + q=q.contiguous(), + k=k.contiguous(), + v=v.contiguous(), + g=g.contiguous(), + beta=beta.contiguous(), + scale=scale, + initial_state=initial_state, + inplace_final_state=inplace_final_state, + cu_seqlens=cu_seqlens, + ssm_state_indices=ssm_state_indices, + num_accepted_tokens=num_accepted_tokens, + use_qk_l2norm_in_kernel=use_qk_l2norm_in_kernel, + ) + + return o, final_state + + +def fused_recurrent_gated_delta_rule( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor = None, + scale: float = None, + initial_state: torch.Tensor = None, + inplace_final_state: bool = True, + cu_seqlens: torch.LongTensor | None = None, + ssm_state_indices: torch.Tensor | None = None, + num_accepted_tokens: torch.Tensor | None = None, + use_qk_l2norm_in_kernel: bool = False, +) -> tuple[torch.Tensor, torch.Tensor]: + r""" + Args: + q (torch.Tensor): + queries of shape `[B, T, H, K]`. 
+ k (torch.Tensor): + keys of shape `[B, T, H, K]`. + v (torch.Tensor): + values of shape `[B, T, HV, V]`. + GVA is applied if `HV > H`. + g (torch.Tensor): + g (decays) of shape `[B, T, HV]`. + beta (torch.Tensor): + betas of shape `[B, T, HV]`. + scale (Optional[int]): + Scale factor for the RetNet attention scores. + If not provided, it will default to `1 / sqrt(K)`. Default: `None`. + initial_state (Optional[torch.Tensor]): + Initial state of shape `[N, HV, K, V]` for `N` input sequences. + For equal-length input sequences, `N` equals the batch size `B`. + Default: `None`. + inplace_final_state: bool: + Whether to store the final state in-place to save memory. + Default: `True`. + cu_seqlens (torch.LongTensor): + Cumulative sequence lengths of shape `[N+1]` used for variable-length training, + consistent with the FlashAttention API. + ssm_state_indices (Optional[torch.Tensor]): + Indices to map the input sequences to the initial/final states. + num_accepted_tokens (Optional[torch.Tensor]): + Number of accepted tokens for each sequence during decoding. + + Returns: + o (torch.Tensor): + Outputs of shape `[B, T, HV, V]`. + final_state (torch.Tensor): + Final state of shape `[N, HV, K, V]`. 
+ + Examples:: + >>> import torch + >>> import torch.nn.functional as F + >>> from einops import rearrange + >>> from fla.ops.gated_delta_rule import fused_recurrent_gated_delta_rule + # inputs with equal lengths + >>> B, T, H, HV, K, V = 4, 2048, 4, 8, 512, 512 + >>> q = torch.randn(B, T, H, K, device='cuda') + >>> k = F.normalize(torch.randn(B, T, H, K, device='cuda'), p=2, dim=-1) + >>> v = torch.randn(B, T, HV, V, device='cuda') + >>> g = F.logsigmoid(torch.rand(B, T, HV, device='cuda')) + >>> beta = torch.rand(B, T, HV, device='cuda').sigmoid() + >>> h0 = torch.randn(B, HV, K, V, device='cuda') + >>> o, ht = fused_gated_recurrent_delta_rule( + q, k, v, g, beta, + initial_state=h0, + ) + # for variable-length inputs, the batch size `B` is expected to be 1 and `cu_seqlens` is required + >>> q, k, v, g, beta = map(lambda x: rearrange(x, 'b t ... -> 1 (b t) ...'), (q, k, v, g, beta)) + # for a batch with 4 sequences, `cu_seqlens` with 5 start/end positions are expected + >>> cu_seqlens = q.new_tensor([0, 2048, 4096, 6144, 8192], dtype=torch.long) + >>> o_var, ht_var = fused_gated_recurrent_delta_rule( + q, k, v, g, beta, + initial_state=h0, + cu_seqlens=cu_seqlens + ) + """ + if cu_seqlens is not None and q.shape[0] != 1: + raise ValueError( + f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`." + f"Please flatten variable-length inputs before processing." 
+ ) + if scale is None: + scale = k.shape[-1] ** -0.5 + else: + assert scale > 0, "scale must be positive" + if beta is None: + beta = torch.ones_like(q[..., 0]) + o, final_state = FusedRecurrentFunction.apply( + q, + k, + v, + g, + beta, + scale, + initial_state, + inplace_final_state, + cu_seqlens, + ssm_state_indices, + num_accepted_tokens, + use_qk_l2norm_in_kernel, + ) + return o, final_state diff --git a/model_executor/layers/fla/ops/index.py b/model_executor/layers/fla/ops/index.py new file mode 100644 index 0000000..f023e13 --- /dev/null +++ b/model_executor/layers/fla/ops/index.py @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang +# ruff: noqa: E501 +import torch + +from vllm.triton_utils import triton + +from .utils import tensor_cache + + +@tensor_cache +def prepare_lens(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + return cu_seqlens[1:] - cu_seqlens[:-1] + + +@tensor_cache +def prepare_chunk_indices( + cu_seqlens: torch.LongTensor, chunk_size: int +) -> torch.LongTensor: + indices = torch.cat( + [ + torch.arange(n) + for n in triton.cdiv(prepare_lens(cu_seqlens), chunk_size).tolist() + ] + ) + return torch.stack([indices.eq(0).cumsum(0) - 1, indices], 1).to(cu_seqlens) + + +@tensor_cache +def prepare_chunk_offsets( + cu_seqlens: torch.LongTensor, chunk_size: int +) -> torch.LongTensor: + return torch.cat( + [cu_seqlens.new_tensor([0]), triton.cdiv(prepare_lens(cu_seqlens), chunk_size)] + ).cumsum(-1) diff --git a/model_executor/layers/fla/ops/kda.py b/model_executor/layers/fla/ops/kda.py new file mode 100644 index 0000000..700f287 --- /dev/null +++ b/model_executor/layers/fla/ops/kda.py 
def fused_recurrent_kda_fwd(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    g: torch.Tensor,
    beta: torch.Tensor,
    scale: float,
    initial_state: torch.Tensor,
    inplace_final_state: bool = True,
    cu_seqlens: torch.LongTensor | None = None,
    ssm_state_indices: torch.Tensor | None = None,
    num_accepted_tokens: torch.Tensor | None = None,
    use_qk_l2norm_in_kernel: bool = False,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Launch the fused recurrent kernel in KDA mode (``IS_KDA=True``).

    Args:
        q: Queries; same layout as ``k``.
        k: Keys, unpacked as ``[B, T, H, K]``.
        v: Values; ``v.shape[2]`` is ``HV`` and ``v.shape[-1]`` is ``V``.
        g: Gate tensor. In KDA mode the kernel reads one gate per key channel
            for every (token, value head).
        beta: Betas; treated as per-channel when ``beta.ndim == v.ndim``,
            otherwise as one value per (token, value head).
        scale: Multiplier applied to the queries inside the kernel.
        initial_state: Initial state tensor. Must not be ``None``: its
            ``stride(0)`` is always read.
        inplace_final_state: When true, states are written back into
            ``initial_state``; otherwise a new ``[T, HV, K, V]`` buffer is
            allocated.
        cu_seqlens: Optional cumulative sequence lengths.
        ssm_state_indices: Optional mapping from sequences to state slots.
        num_accepted_tokens: Optional per-sequence accepted-token counts.
        use_qk_l2norm_in_kernel: Whether the kernel L2-normalizes q and k.

    Returns:
        ``(o, final_state)`` where ``o`` has the shape of ``v``.
    """
    B, T, H, K, V = *k.shape, v.shape[-1]
    HV = v.shape[2]
    N = B if cu_seqlens is None else len(cu_seqlens) - 1
    BK, BV = next_power_of_2(K), min(next_power_of_2(V), 8)
    NK, NV = cdiv(K, BK), cdiv(V, BV)
    assert NK == 1, "NK > 1 is not supported yet"
    num_stages = 3
    num_warps = 1

    # The kernel writes one V-wide row per (token, value head), so the output
    # must have v's shape. Allocating it like k is only correct when H == HV
    # and K == V, and otherwise yields a mis-shaped or undersized buffer.
    # k's dtype and device are kept.
    o = k.new_empty(v.shape)
    if inplace_final_state:
        final_state = initial_state
    else:
        final_state = q.new_empty(T, HV, K, V, dtype=initial_state.dtype)

    stride_init_state_token = initial_state.stride(0)
    stride_final_state_token = final_state.stride(0)

    if ssm_state_indices is None:
        stride_indices_seq, stride_indices_tok = 1, 1
    elif ssm_state_indices.ndim == 1:
        stride_indices_seq, stride_indices_tok = ssm_state_indices.stride(0), 1
    else:
        stride_indices_seq, stride_indices_tok = ssm_state_indices.stride()

    # One program per (K block, V block, sequence * value head).
    grid = (NK, NV, N * HV)
    fused_recurrent_gated_delta_rule_fwd_kernel[grid](
        q=q,
        k=k,
        v=v,
        g=g,
        beta=beta,
        o=o,
        h0=initial_state,
        ht=final_state,
        cu_seqlens=cu_seqlens,
        ssm_state_indices=ssm_state_indices,
        num_accepted_tokens=num_accepted_tokens,
        scale=scale,
        N=N,
        T=T,
        B=B,
        H=H,
        HV=HV,
        K=K,
        V=V,
        BK=BK,
        BV=BV,
        stride_init_state_token=stride_init_state_token,
        stride_final_state_token=stride_final_state_token,
        stride_indices_seq=stride_indices_seq,
        stride_indices_tok=stride_indices_tok,
        IS_BETA_HEADWISE=beta.ndim == v.ndim,
        USE_QK_L2NORM_IN_KERNEL=use_qk_l2norm_in_kernel,
        INPLACE_FINAL_STATE=inplace_final_state,
        IS_KDA=True,
        num_warps=num_warps,
        num_stages=num_stages,
    )

    return o, final_state
# The four flags below are derived from which optional pointers are passed.
@triton.heuristics(
    {
        "STORE_RESIDUAL_OUT": lambda args: args["residual_out"] is not None,
        "HAS_RESIDUAL": lambda args: args["residual"] is not None,
        "HAS_WEIGHT": lambda args: args["w"] is not None,
        "HAS_BIAS": lambda args: args["b"] is not None,
    }
)
@triton.jit
def layer_norm_gated_fwd_kernel(
    x,  # pointer to the input
    g,  # pointer to the gate
    y,  # pointer to the output
    w,  # pointer to the weights
    b,  # pointer to the biases
    residual,  # pointer to the residual
    residual_out,  # pointer to where x + residual is stored
    mean,  # pointer to the mean
    rstd,  # pointer to the 1/std
    eps,  # epsilon to avoid division by zero
    T,  # number of rows in x
    D: tl.constexpr,  # number of columns in x
    BT: tl.constexpr,
    BD: tl.constexpr,
    ACTIVATION: tl.constexpr,
    IS_RMS_NORM: tl.constexpr,
    STORE_RESIDUAL_OUT: tl.constexpr,
    HAS_RESIDUAL: tl.constexpr,
    HAS_WEIGHT: tl.constexpr,
    HAS_BIAS: tl.constexpr,
):
    # Gated layer/RMS norm over BT rows per program; each row is processed as
    # a single BD-wide tile of columns.
    i_t = tl.program_id(0)

    o_d = tl.arange(0, BD)
    m_d = o_d < D

    p_x = tl.make_block_ptr(x, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0))
    b_x = tl.load(p_x, boundary_check=(0, 1)).to(tl.float32)
    if HAS_RESIDUAL:
        p_res = tl.make_block_ptr(
            residual, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0)
        )
        b_x += tl.load(p_res, boundary_check=(0, 1)).to(tl.float32)
    if STORE_RESIDUAL_OUT:
        # Persist the pre-normalization sum.
        p_res_out = tl.make_block_ptr(
            residual_out, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0)
        )
        tl.store(p_res_out, b_x.to(p_res_out.dtype.element_ty), boundary_check=(0, 1))
    if not IS_RMS_NORM:
        # NOTE(review): the mean sums all BD columns without the m_d mask,
        # presumably relying on out-of-range columns loading as zero - confirm.
        b_mean = tl.sum(b_x, axis=1) / D
        p_mean = tl.make_block_ptr(mean, (T,), (1,), (i_t * BT,), (BT,), (0,))
        tl.store(p_mean, b_mean.to(p_mean.dtype.element_ty), boundary_check=(0,))
        b_xbar = tl.where(m_d[None, :], b_x - b_mean[:, None], 0.0)
        b_var = tl.sum(b_xbar * b_xbar, axis=1) / D
    else:
        # RMS norm: no centering, variance is the mean of squares.
        b_xbar = tl.where(m_d[None, :], b_x, 0.0)
        b_var = tl.sum(b_xbar * b_xbar, axis=1) / D
    b_rstd = 1 / tl.sqrt(b_var + eps)

    p_rstd = tl.make_block_ptr(rstd, (T,), (1,), (i_t * BT,), (BT,), (0,))
    tl.store(p_rstd, b_rstd.to(p_rstd.dtype.element_ty), boundary_check=(0,))

    # b_w / b_b exist only under their constexpr flags; every use below is
    # guarded by the same flag.
    if HAS_WEIGHT:
        b_w = tl.load(w + o_d, mask=m_d).to(tl.float32)
    if HAS_BIAS:
        b_b = tl.load(b + o_d, mask=m_d).to(tl.float32)
    b_x_hat = (
        (b_x - b_mean[:, None]) * b_rstd[:, None]
        if not IS_RMS_NORM
        else b_x * b_rstd[:, None]
    )
    b_y = b_x_hat * b_w[None, :] if HAS_WEIGHT else b_x_hat
    if HAS_BIAS:
        b_y = b_y + b_b[None, :]

    # swish/sigmoid output gate
    # Any other ACTIVATION value leaves b_y ungated.
    p_g = tl.make_block_ptr(g, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0))
    b_g = tl.load(p_g, boundary_check=(0, 1)).to(tl.float32)
    if ACTIVATION == "swish" or ACTIVATION == "silu":
        b_y = b_y * b_g * tl.sigmoid(b_g)
    elif ACTIVATION == "sigmoid":
        b_y = b_y * tl.sigmoid(b_g)

    # Write output
    p_y = tl.make_block_ptr(y, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0))
    tl.store(p_y, b_y.to(p_y.dtype.element_ty), boundary_check=(0, 1))
def layer_norm_gated_fwd(
    x: torch.Tensor,
    g: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    activation: str = "swish",
    eps: float = 1e-5,
    residual: torch.Tensor = None,
    out_dtype: torch.dtype = None,
    residual_dtype: torch.dtype = None,
    is_rms_norm: bool = False,
):
    """Allocate buffers and launch the gated layer/RMS-norm forward kernel.

    Args:
        x: 2-D input of shape ``(T, D)``. When ``out_dtype`` is ``None`` the
            output is written into ``x`` itself.
        g: Gate tensor passed to the kernel.
        weight: Optional scale of shape ``(D,)``.
        bias: Optional shift of shape ``(D,)``.
        activation: Gate activation name forwarded to the kernel.
        eps: Value added to the variance before the reciprocal square root.
        residual: Optional ``(T, D)`` tensor added to ``x`` before normalizing.
        out_dtype: Dtype of a freshly allocated output; ``None`` reuses ``x``.
        residual_dtype: Dtype of the stored residual sum; overridden by
            ``residual.dtype`` when ``residual`` is given.
        is_rms_norm: Selects RMS norm (no mean is computed or returned).

    Returns:
        ``(y, mean, rstd, residual_out)``; ``mean`` is ``None`` for RMS norm
        and ``residual_out`` falls back to ``x`` when no residual buffer was
        allocated.

    Raises:
        RuntimeError: If a row does not fit in the 64KB fused block.
    """
    n_rows, n_cols = x.shape
    if residual is not None:
        residual_dtype = residual.dtype
        assert residual.shape == (n_rows, n_cols)
    if weight is not None:
        assert weight.shape == (n_cols,)
    if bias is not None:
        assert bias.shape == (n_cols,)

    # Output buffer: alias the input unless a different dtype is requested.
    if out_dtype is None:
        y = x
    else:
        y = torch.empty_like(x, dtype=out_dtype)

    # A residual buffer is needed when a residual is added, or when the sum
    # must be kept in a dtype other than the input's.
    wants_residual_out = residual is not None or (
        residual_dtype is not None and residual_dtype != x.dtype
    )
    residual_out = None
    if wants_residual_out:
        residual_out = torch.empty(
            n_rows, n_cols, device=x.device, dtype=residual_dtype
        )

    mean = None
    if not is_rms_norm:
        mean = torch.empty((n_rows,), dtype=torch.float, device=x.device)
    rstd = torch.empty((n_rows,), dtype=torch.float, device=x.device)

    # Less than 64KB per feature: enqueue fused kernel
    max_fused_cols = 65536 // x.element_size()
    block_cols = min(max_fused_cols, next_power_of_2(n_cols))
    if n_cols > block_cols:
        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")

    # Arguments common to both kernel variants.
    shared_args = dict(
        x=x,
        g=g,
        y=y,
        w=weight,
        b=bias,
        residual=residual,
        residual_out=residual_out,
        mean=mean,
        rstd=rstd,
        eps=eps,
        D=n_cols,
        BD=block_cols,
        ACTIVATION=activation,
        IS_RMS_NORM=is_rms_norm,
        num_warps=4,
    )
    if n_cols <= 512:
        # Narrow rows: batch 32 rows per program.
        rows_per_program = 32
        layer_norm_gated_fwd_kernel[(cdiv(n_rows, rows_per_program),)](
            T=n_rows, BT=rows_per_program, **shared_args
        )
    else:
        # Wide rows: one program per row.
        layer_norm_gated_fwd_kernel1[(n_rows,)](**shared_args)

    # residual_out is None if residual is None and residual_dtype == input_dtype
    if residual_out is None:
        return y, mean, rstd, x
    return y, mean, rstd, residual_out
x.contiguous().reshape(-1, x.shape[-1]) + g = g.contiguous().reshape(-1, g.shape[-1]) + if residual is not None: + assert residual.shape == x_shape_og + residual = residual.contiguous().reshape(-1, residual.shape[-1]) + residual_dtype = ( + residual.dtype + if residual is not None + else (torch.float if residual_in_fp32 else None) + ) + y, _, _, residual_out = layer_norm_gated_fwd( + x=x, + g=g, + weight=weight, + bias=bias, + activation=activation, + eps=eps, + residual=residual, + residual_dtype=residual_dtype, + is_rms_norm=True, + ) + y = y.reshape(x_shape_og) + return y if not prenorm else (y, residual_out.reshape(x_shape_og)) + + +class FusedRMSNormGated(nn.Module): + def __init__( + self, + hidden_size: int, + elementwise_affine: bool = True, + eps: float = 1e-5, + activation: str = "swish", + device: torch.device | None = None, + dtype: torch.dtype | None = None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + + self.hidden_size = hidden_size + self.elementwise_affine = elementwise_affine + self.eps = eps + self.activation = activation + + if self.activation not in ["swish", "silu", "sigmoid"]: + raise ValueError(f"Unsupported activation: {self.activation}") + + if elementwise_affine: + self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + else: + self.register_parameter("weight", None) + self.register_parameter("bias", None) + + def forward( + self, + x: torch.Tensor, + g: torch.Tensor, + residual: torch.Tensor | None = None, + prenorm: bool = False, + residual_in_fp32: bool = False, + ) -> torch.Tensor: + return rms_norm_gated( + x, + g, + self.weight, + self.bias, + self.activation, + residual=residual, + eps=self.eps, + prenorm=prenorm, + residual_in_fp32=residual_in_fp32, + ) + + +@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None}) +@triton.autotune( + configs=[ + triton.Config({"BK": BK}, num_warps=num_warps, num_stages=num_stages) + for BK in [32, 64] + for 
num_warps in [1, 2, 4, 8] + for num_stages in [2, 3, 4] + ], + key=["BC"], +) +@triton.jit(do_not_specialize=["T"]) +def chunk_kda_scaled_dot_kkt_fwd_kernel_intra_sub_inter( + q, + k, + g, + beta, + A, + Aqk, + scale, + cu_seqlens, + chunk_indices, + T, + H: tl.constexpr, + K: tl.constexpr, + BT: tl.constexpr, + BC: tl.constexpr, + BK: tl.constexpr, + NC: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_t, i_c, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + i_i, i_j = i_c // NC, i_c % NC + if IS_VARLEN: + i_n, i_t = ( + tl.load(chunk_indices + i_t * 2).to(tl.int32), + tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32), + ) + bos, eos = ( + tl.load(cu_seqlens + i_n).to(tl.int32), + tl.load(cu_seqlens + i_n + 1).to(tl.int32), + ) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + if i_t * BT + i_i * BC >= T: + return + if i_i <= i_j: + return + + q += (bos * H + i_h) * K + k += (bos * H + i_h) * K + g += (bos * H + i_h) * K + A += (bos * H + i_h) * BT + Aqk += (bos * H + i_h) * BT + + p_b = tl.make_block_ptr( + beta + bos * H + i_h, (T,), (H,), (i_t * BT + i_i * BC,), (BC,), (0,) + ) + b_b = tl.load(p_b, boundary_check=(0,)) + + b_A = tl.zeros([BC, BC], dtype=tl.float32) + b_Aqk = tl.zeros([BC, BC], dtype=tl.float32) + for i_k in range(tl.cdiv(K, BK)): + p_q = tl.make_block_ptr( + q, (T, K), (H * K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0) + ) + p_k = tl.make_block_ptr( + k, (T, K), (H * K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0) + ) + p_g = tl.make_block_ptr( + g, (T, K), (H * K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0) + ) + b_kt = tl.make_block_ptr( + k, (K, T), (1, H * K), (i_k * BK, i_t * BT + i_j * BC), (BK, BC), (0, 1) + ) + p_gk = tl.make_block_ptr( + g, (K, T), (1, H * K), (i_k * BK, i_t * BT + i_j * BC), (BK, BC), (0, 1) + ) + + o_k = i_k * BK + tl.arange(0, BK) + m_k = o_k < K + # [BK,] + b_gn = tl.load(g + (i_t * BT + i_i * BC) * H * K + o_k, mask=m_k, 
other=0) + # [BC, BK] + b_g = tl.load(p_g, boundary_check=(0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) * exp(b_g - b_gn[None, :]) + # [BK, BC] + b_gk = tl.load(p_gk, boundary_check=(0, 1)) + b_kt = tl.load(b_kt, boundary_check=(0, 1)) + # [BC, BC] + b_ktg = b_kt * exp(b_gn[:, None] - b_gk) + b_A += tl.dot(b_k, b_ktg) + + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_qg = b_q * exp(b_g - b_gn[None, :]) * scale + b_Aqk += tl.dot(b_qg, b_ktg) + + b_A *= b_b[:, None] + + p_A = tl.make_block_ptr( + A, (T, BT), (H * BT, 1), (i_t * BT + i_i * BC, i_j * BC), (BC, BC), (1, 0) + ) + tl.store(p_A, b_A.to(A.dtype.element_ty), boundary_check=(0, 1)) + p_Aqk = tl.make_block_ptr( + Aqk, (T, BT), (H * BT, 1), (i_t * BT + i_i * BC, i_j * BC), (BC, BC), (1, 0) + ) + tl.store(p_Aqk, b_Aqk.to(Aqk.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None}) +@triton.autotune( + configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8]], + key=["BK", "BT"], +) +@triton.jit(do_not_specialize=["T"]) +def chunk_kda_scaled_dot_kkt_fwd_kernel_intra_sub_intra( + q, + k, + g, + beta, + A, + Aqk, + scale, + cu_seqlens, + chunk_indices, + T, + H: tl.constexpr, + K: tl.constexpr, + BT: tl.constexpr, + BC: tl.constexpr, + BK: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_t, i_i, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = ( + tl.load(chunk_indices + i_t * 2).to(tl.int32), + tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32), + ) + bos, eos = ( + tl.load(cu_seqlens + i_n).to(tl.int32), + tl.load(cu_seqlens + i_n + 1).to(tl.int32), + ) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + if i_t * BT + i_i * BC >= T: + return + + o_i = tl.arange(0, BC) + o_k = tl.arange(0, BK) + m_k = o_k < K + m_A = (i_t * BT + i_i * BC + o_i) < T + o_A = (bos + i_t * BT + i_i * BC + o_i) * H * BT + i_h * BT + i_i * BC + + p_q = 
tl.make_block_ptr( + q + (bos * H + i_h) * K, + (T, K), + (H * K, 1), + (i_t * BT + i_i * BC, 0), + (BC, BK), + (1, 0), + ) + p_k = tl.make_block_ptr( + k + (bos * H + i_h) * K, + (T, K), + (H * K, 1), + (i_t * BT + i_i * BC, 0), + (BC, BK), + (1, 0), + ) + p_g = tl.make_block_ptr( + g + (bos * H + i_h) * K, + (T, K), + (H * K, 1), + (i_t * BT + i_i * BC, 0), + (BC, BK), + (1, 0), + ) + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_g = tl.load(p_g, boundary_check=(0, 1)) + + p_b = beta + (bos + i_t * BT + i_i * BC + o_i) * H + i_h + b_k = b_k * tl.load(p_b, mask=m_A, other=0)[:, None] + + p_kt = k + (bos + i_t * BT + i_i * BC) * H * K + i_h * K + o_k + p_gk = g + (bos + i_t * BT + i_i * BC) * H * K + i_h * K + o_k + + for j in range(0, min(BC, T - i_t * BT - i_i * BC)): + b_kt = tl.load(p_kt, mask=m_k, other=0).to(tl.float32) + b_gk = tl.load(p_gk, mask=m_k, other=0).to(tl.float32) + b_ktg = b_kt[None, :] * exp(b_g - b_gk[None, :]) + b_A = tl.sum(b_k * b_ktg, 1) + b_A = tl.where(o_i > j, b_A, 0.0) + b_Aqk = tl.sum(b_q * b_ktg, 1) + b_Aqk = tl.where(o_i >= j, b_Aqk * scale, 0.0) + tl.store(A + o_A + j, b_A, mask=m_A) + tl.store(Aqk + o_A + j, b_Aqk, mask=m_A) + p_kt += H * K + p_gk += H * K + + +def chunk_kda_scaled_dot_kkt_fwd( + q: torch.Tensor, + k: torch.Tensor, + gk: torch.Tensor | None = None, + beta: torch.Tensor | None = None, + scale: float | None = None, + cu_seqlens: torch.LongTensor | None = None, + chunk_size: int = 64, + output_dtype: torch.dtype = torch.float32, +) -> tuple[torch.Tensor, torch.Tensor]: + r""" + Compute beta * K * K^T. + + Args: + k (torch.Tensor): + The key tensor of shape `[B, T, H, K]`. + beta (torch.Tensor): + The beta tensor of shape `[B, T, H]`. + gk (torch.Tensor): + The cumulative sum of the gate tensor of shape `[B, T, H, K]` applied to the key tensor. Default: `None`. + cu_seqlens (torch.LongTensor): + The cumulative sequence lengths of the input tensor. 
+ Default: None + chunk_size (int): + The chunk size. Default: 64. + output_dtype (torch.dtype): + The dtype of the output tensor. Default: `torch.float32` + + Returns: + beta * K * K^T of shape `[B, T, H, BT]` where `BT` is the chunk size. + """ + B, T, H, K = k.shape + assert K <= 256 + BT = chunk_size + chunk_indices = ( + prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None + ) + NT = cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + + BC = min(16, BT) + NC = cdiv(BT, BC) + BK = max(next_power_of_2(K), 16) + A = torch.zeros(B, T, H, BT, device=k.device, dtype=output_dtype) + Aqk = torch.zeros(B, T, H, BT, device=k.device, dtype=output_dtype) + grid = (NT, NC * NC, B * H) + chunk_kda_scaled_dot_kkt_fwd_kernel_intra_sub_inter[grid]( + q=q, + k=k, + g=gk, + beta=beta, + A=A, + Aqk=Aqk, + scale=scale, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + T=T, + H=H, + K=K, + BT=BT, + BC=BC, + NC=NC, + ) + + grid = (NT, NC, B * H) + chunk_kda_scaled_dot_kkt_fwd_kernel_intra_sub_intra[grid]( + q=q, + k=k, + g=gk, + beta=beta, + A=A, + Aqk=Aqk, + scale=scale, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + T=T, + H=H, + K=K, + BT=BT, + BC=BC, + BK=BK, + ) + return A, Aqk + + +@triton.heuristics( + { + "STORE_QG": lambda args: args["qg"] is not None, + "STORE_KG": lambda args: args["kg"] is not None, + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [2, 4, 8] + for num_stages in [2, 3, 4] + ], + key=["H", "K", "V", "BT", "BK", "BV", "IS_VARLEN"], +) +@triton.jit(do_not_specialize=["T"]) +def recompute_w_u_fwd_kernel( + q, + k, + qg, + kg, + v, + beta, + w, + u, + A, + gk, + cu_seqlens, + chunk_indices, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + STORE_QG: tl.constexpr, + STORE_KG: tl.constexpr, + IS_VARLEN: 
tl.constexpr, + DOT_PRECISION: tl.constexpr, +): + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = ( + tl.load(chunk_indices + i_t * 2).to(tl.int32), + tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32), + ) + bos, eos = ( + tl.load(cu_seqlens + i_n).to(tl.int32), + tl.load(cu_seqlens + i_n + 1).to(tl.int32), + ) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + p_b = tl.make_block_ptr(beta + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)) + b_b = tl.load(p_b, boundary_check=(0,)) + + p_A = tl.make_block_ptr( + A + (bos * H + i_h) * BT, (T, BT), (H * BT, 1), (i_t * BT, 0), (BT, BT), (1, 0) + ) + b_A = tl.load(p_A, boundary_check=(0, 1)) + + for i_v in range(tl.cdiv(V, BV)): + p_v = tl.make_block_ptr( + v + (bos * H + i_h) * V, + (T, V), + (H * V, 1), + (i_t * BT, i_v * BV), + (BT, BV), + (1, 0), + ) + p_u = tl.make_block_ptr( + u + (bos * H + i_h) * V, + (T, V), + (H * V, 1), + (i_t * BT, i_v * BV), + (BT, BV), + (1, 0), + ) + b_v = tl.load(p_v, boundary_check=(0, 1)) + b_vb = (b_v * b_b[:, None]).to(b_v.dtype) + b_u = tl.dot(b_A, b_vb, input_precision=DOT_PRECISION) + tl.store(p_u, b_u.to(p_u.dtype.element_ty), boundary_check=(0, 1)) + + for i_k in range(tl.cdiv(K, BK)): + p_w = tl.make_block_ptr( + w + (bos * H + i_h) * K, + (T, K), + (H * K, 1), + (i_t * BT, i_k * BK), + (BT, BK), + (1, 0), + ) + p_k = tl.make_block_ptr( + k + (bos * H + i_h) * K, + (T, K), + (H * K, 1), + (i_t * BT, i_k * BK), + (BT, BK), + (1, 0), + ) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_kb = b_k * b_b[:, None] + + p_gk = tl.make_block_ptr( + gk + (bos * H + i_h) * K, + (T, K), + (H * K, 1), + (i_t * BT, i_k * BK), + (BT, BK), + (1, 0), + ) + b_gk = tl.load(p_gk, boundary_check=(0, 1)) + b_kb *= exp(b_gk) + if STORE_QG: + p_q = tl.make_block_ptr( + q + (bos * H + i_h) * K, + (T, K), + (H * K, 1), + (i_t * BT, i_k * BK), + (BT, BK), + (1, 0), + ) + p_qg = tl.make_block_ptr( + qg + (bos * H + i_h) * K, + (T, 
K), + (H * K, 1), + (i_t * BT, i_k * BK), + (BT, BK), + (1, 0), + ) + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_qg = b_q * exp(b_gk) + tl.store(p_qg, b_qg.to(p_qg.dtype.element_ty), boundary_check=(0, 1)) + if STORE_KG: + last_idx = min(i_t * BT + BT, T) - 1 + + o_k = i_k * BK + tl.arange(0, BK) + m_k = o_k < K + b_gn = tl.load( + gk + ((bos + last_idx) * H + i_h) * K + o_k, mask=m_k, other=0.0 + ) + b_kg = b_k * exp(b_gn - b_gk) + + p_kg = tl.make_block_ptr( + kg + (bos * H + i_h) * K, + (T, K), + (H * K, 1), + (i_t * BT, i_k * BK), + (BT, BK), + (1, 0), + ) + tl.store(p_kg, b_kg.to(p_kg.dtype.element_ty), boundary_check=(0, 1)) + + b_w = tl.dot(b_A, b_kb.to(b_k.dtype)) + tl.store(p_w, b_w.to(p_w.dtype.element_ty), boundary_check=(0, 1)) + + +def recompute_w_u_fwd( + k: torch.Tensor, + v: torch.Tensor, + beta: torch.Tensor, + A: torch.Tensor, + q: torch.Tensor | None = None, + gk: torch.Tensor | None = None, + cu_seqlens: torch.LongTensor | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + B, T, H, K, V = *k.shape, v.shape[-1] + BT = A.shape[-1] + BK = 64 + BV = 64 + + chunk_indices = ( + prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None + ) + NT = cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + + w = torch.empty_like(k) + u = torch.empty_like(v) + kg = torch.empty_like(k) if gk is not None else None + recompute_w_u_fwd_kernel[(NT, B * H)]( + q=q, + k=k, + qg=None, + kg=kg, + v=v, + beta=beta, + w=w, + u=u, + A=A, + gk=gk, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + T=T, + H=H, + K=K, + V=V, + BT=BT, + BK=BK, + BV=BV, + DOT_PRECISION="ieee", + ) + return w, u, None, kg + + +@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None}) +@triton.autotune( + configs=[ + triton.Config({"BK": BK, "BV": BV}, num_warps=num_warps, num_stages=num_stages) + for BK in [32, 64] + for BV in [64, 128] + for num_warps in [2, 4, 8] + for num_stages in [2, 3, 4] + ], + key=["BT"], +) 
+@triton.jit(do_not_specialize=["T"]) +def chunk_gla_fwd_kernel_o( + q, + v, + g, + h, + o, + A, + cu_seqlens, + chunk_indices, + scale, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_tg = i_t + i_n, i_t = ( + tl.load(chunk_indices + i_t * 2).to(tl.int32), + tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32), + ) + bos, eos = ( + tl.load(cu_seqlens + i_n).to(tl.int32), + tl.load(cu_seqlens + i_n + 1).to(tl.int32), + ) + T = eos - bos + NT = tl.cdiv(T, BT) + else: + NT = tl.cdiv(T, BT) + i_tg = i_b * NT + i_t + bos, eos = i_b * T, i_b * T + T + + m_s = tl.arange(0, BT)[:, None] >= tl.arange(0, BT)[None, :] + + b_o = tl.zeros([BT, BV], dtype=tl.float32) + for i_k in range(tl.cdiv(K, BK)): + p_q = tl.make_block_ptr( + q + (bos * H + i_h) * K, + (T, K), + (H * K, 1), + (i_t * BT, i_k * BK), + (BT, BK), + (1, 0), + ) + p_g = tl.make_block_ptr( + g + (bos * H + i_h) * K, + (T, K), + (H * K, 1), + (i_t * BT, i_k * BK), + (BT, BK), + (1, 0), + ) + p_h = tl.make_block_ptr( + h + (i_tg * H + i_h) * K * V, + (K, V), + (V, 1), + (i_k * BK, i_v * BV), + (BK, BV), + (1, 0), + ) + + # [BT, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_q = (b_q * scale).to(b_q.dtype) + # [BT, BK] + b_g = tl.load(p_g, boundary_check=(0, 1)) + # [BT, BK] + b_qg = (b_q * exp(b_g)).to(b_q.dtype) + # [BK, BV] + b_h = tl.load(p_h, boundary_check=(0, 1)) + # works but dkw, owing to divine benevolence + # [BT, BV] + if i_k >= 0: + b_o += tl.dot(b_qg, b_h.to(b_qg.dtype)) + p_v = tl.make_block_ptr( + v + (bos * H + i_h) * V, + (T, V), + (H * V, 1), + (i_t * BT, i_v * BV), + (BT, BV), + (1, 0), + ) + p_o = tl.make_block_ptr( + o + (bos * H + i_h) * V, + (T, V), + (H * V, 1), + (i_t * BT, i_v * BV), + (BT, BV), + (1, 0), + ) + p_A = tl.make_block_ptr( + A + (bos * H + i_h) * 
BT, (T, BT), (H * BT, 1), (i_t * BT, 0), (BT, BT), (1, 0) + ) + # [BT, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + # [BT, BT] + b_A = tl.load(p_A, boundary_check=(0, 1)) + b_A = tl.where(m_s, b_A, 0.0).to(b_v.dtype) + b_o += tl.dot(b_A, b_v, allow_tf32=False) + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_gla_fwd_o_gk( + q: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + A: torch.Tensor, + h: torch.Tensor, + o: torch.Tensor, + scale: float, + cu_seqlens: torch.LongTensor | None = None, + chunk_size: int = 64, +): + B, T, H, K, V = *q.shape, v.shape[-1] + BT = chunk_size + + chunk_indices = ( + prepare_chunk_indices(cu_seqlens, chunk_size) + if cu_seqlens is not None + else None + ) + NT = cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + + def grid(meta): + return (cdiv(V, meta["BV"]), NT, B * H) + + chunk_gla_fwd_kernel_o[grid]( + q=q, + v=v, + g=g, + h=h, + o=o, + A=A, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + scale=scale, + T=T, + H=H, + K=K, + V=V, + BT=BT, + ) + return o + + +def chunk_kda_fwd( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + scale: float, + initial_state: torch.Tensor, + output_final_state: bool, + cu_seqlens: torch.LongTensor | None = None, +): + chunk_size = 64 + g = chunk_local_cumsum(g, chunk_size=chunk_size, cu_seqlens=cu_seqlens) + # the intra Aqk is kept in fp32 + # the computation has very marginal effect on the entire throughput + A, Aqk = chunk_kda_scaled_dot_kkt_fwd( + q=q, + k=k, + gk=g, + beta=beta, + scale=scale, + cu_seqlens=cu_seqlens, + output_dtype=torch.float32, + ) + A = solve_tril(A=A, cu_seqlens=cu_seqlens, output_dtype=k.dtype) + w, u, _, kg = recompute_w_u_fwd( + k=k, + v=v, + beta=beta, + A=A, + gk=g, + cu_seqlens=cu_seqlens, + ) + del A + h, v_new, final_state = chunk_gated_delta_rule_fwd_h( + k=kg, + w=w, + u=u, + gk=g, + initial_state=initial_state, + output_final_state=output_final_state, + 
cu_seqlens=cu_seqlens, + ) + del w, u, kg + o = chunk_gla_fwd_o_gk( + q=q, + v=v_new, + g=g, + A=Aqk, + h=h, + o=v, + scale=scale, + cu_seqlens=cu_seqlens, + chunk_size=chunk_size, + ) + del Aqk, v_new, h + return o, final_state + + +def chunk_kda( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + scale: float = None, + initial_state: torch.Tensor = None, + output_final_state: bool = False, + use_qk_l2norm_in_kernel: bool = False, + cu_seqlens: torch.LongTensor | None = None, + **kwargs, +): + if scale is None: + scale = k.shape[-1] ** -0.5 + + if use_qk_l2norm_in_kernel: + q = l2norm_fwd(q.contiguous()) + k = l2norm_fwd(k.contiguous()) + + o, final_state = chunk_kda_fwd( + q=q, + k=k, + v=v.contiguous(), + g=g.contiguous(), + beta=beta.contiguous(), + scale=scale, + initial_state=initial_state.contiguous(), + output_final_state=output_final_state, + cu_seqlens=cu_seqlens, + ) + return o, final_state + + +@triton.autotune( + configs=[ + triton.Config({"BT": bt}, num_warps=nw, num_stages=ns) + for bt in BT_LIST_AUTOTUNE + for nw in NUM_WARPS_AUTOTUNE + for ns in [2, 3] + ], + key=["H", "D"], +) +@triton.jit +def kda_gate_fwd_kernel( + g, + A, + y, + g_bias, + beta: tl.constexpr, + threshold: tl.constexpr, + T, + H, + D: tl.constexpr, + BT: tl.constexpr, + BD: tl.constexpr, + HAS_BIAS: tl.constexpr, +): + i_t, i_h = tl.program_id(0), tl.program_id(1) + n_t = i_t * BT + + b_a = tl.load(A + i_h).to(tl.float32) + b_a = -tl.exp(b_a) + + stride_row = H * D + stride_col = 1 + + g_ptr = tl.make_block_ptr( + base=g + i_h * D, + shape=(T, D), + strides=(stride_row, stride_col), + offsets=(n_t, 0), + block_shape=(BT, BD), + order=(1, 0), + ) + + y_ptr = tl.make_block_ptr( + base=y + i_h * D, + shape=(T, D), + strides=(stride_row, stride_col), + offsets=(n_t, 0), + block_shape=(BT, BD), + order=(1, 0), + ) + + b_g = tl.load(g_ptr, boundary_check=(0, 1)).to(tl.float32) + + if HAS_BIAS: + n_d = tl.arange(0, BD) + bias_mask = n_d < 
D + b_bias = tl.load(g_bias + i_h * D + n_d, mask=bias_mask, other=0.0).to( + tl.float32 + ) + b_g = b_g + b_bias[None, :] + + # softplus(x, beta) = (1/beta) * log(1 + exp(beta * x)) + # When beta * x > threshold, use linear approximation x + # Use threshold to switch to linear when beta*x > threshold + g_scaled = b_g * beta + use_linear = g_scaled > threshold + sp = tl.where(use_linear, b_g, (1.0 / beta) * log(1.0 + tl.exp(g_scaled))) + b_y = b_a * sp + + tl.store(y_ptr, b_y.to(y.dtype.element_ty), boundary_check=(0, 1)) + + +def fused_kda_gate( + g: torch.Tensor, + A: torch.Tensor, + head_k_dim: int, + g_bias: torch.Tensor | None = None, + beta: float = 1.0, + threshold: float = 20.0, +) -> torch.Tensor: + """ + Forward pass for KDA gate: + input g: [..., H*D] + param A: [H] or [1, 1, H, 1] + beta: softplus beta parameter + threshold: softplus threshold parameter + return : [..., H, D] + """ + orig_shape = g.shape[:-1] + + g = g.view(-1, g.shape[-1]) + T = g.shape[0] + HD = g.shape[1] + H = A.numel() + assert H * head_k_dim == HD + + y = torch.empty_like(g, dtype=torch.float32) + + def grid(meta): + return (cdiv(T, meta["BT"]), H) + + kda_gate_fwd_kernel[grid]( + g, + A, + y, + g_bias, + beta, + threshold, + T, + H, + head_k_dim, + BD=next_power_of_2(head_k_dim), + HAS_BIAS=g_bias is not None, + ) + + y = y.view(*orig_shape, H, head_k_dim) + return y diff --git a/model_executor/layers/fla/ops/l2norm.py b/model_executor/layers/fla/ops/l2norm.py new file mode 100644 index 0000000..4d7dbb5 --- /dev/null +++ b/model_executor/layers/fla/ops/l2norm.py @@ -0,0 +1,146 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. 
+# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +import os + +import torch + +from vllm.triton_utils import tl, triton + +BT_LIST = [8, 16, 32, 64, 128] + +USE_DEFAULT_FLA_NORM = int(os.getenv("USE_DEFAULT_FLA_NORM", "0")) + + +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8, 16, 32] + ], + key=["D"], +) +@triton.jit +def l2norm_fwd_kernel1( + x, + y, + D, + BD: tl.constexpr, + eps, +): + i_t = tl.program_id(0) + x += i_t * D + y += i_t * D + # Compute mean and variance + cols = tl.arange(0, BD) + mask = cols < D + b_x = tl.load(x + cols, mask=mask, other=0.0).to(tl.float32) + b_var = tl.sum(b_x * b_x, axis=0) + b_rstd = 1 / tl.sqrt(b_var + eps) + # tl.store(Rstd + i_t, rstd) + # Normalize and apply linear transformation + b_y = b_x * b_rstd + tl.store(y + cols, b_y, mask=mask) + + +@triton.autotune( + configs=[ + triton.Config({"BT": BT}, num_warps=num_warps) + for num_warps in [1, 2, 4, 8, 16] + for BT in BT_LIST + ], + key=["D"], +) +@triton.jit(do_not_specialize=["NB"]) +def l2norm_fwd_kernel( + x, + y, + eps, + NB, + T, + D: tl.constexpr, + BT: tl.constexpr, + BD: tl.constexpr, +): + i_t = tl.program_id(0) + p_x = tl.make_block_ptr(x, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0)) + b_x = tl.load(p_x, boundary_check=(0, 1)).to(tl.float32) + b_var = tl.sum(b_x * b_x, axis=1) + b_y = b_x / tl.sqrt(b_var + eps)[:, None] + p_y = tl.make_block_ptr(y, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0)) + tl.store(p_y, b_y.to(p_y.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.jit +def l2norm_fwd_kernel2(X, Y, eps, M, N: tl.constexpr, MBLOCK: tl.constexpr): + xoffset = tl.program_id(0) * MBLOCK + row_idx = xoffset + tl.arange(0, MBLOCK)[:, None] + xmask = row_idx < M + rindex = tl.arange(0, N)[None, :] + xs = tl.load(X + (rindex + N * row_idx), xmask).to(tl.float32) + square = tl.broadcast_to(xs 
* xs, [MBLOCK, N]) + square_sum = tl.sum(tl.where(xmask, square, 0), 1)[:, None] + rsqrt = tl.rsqrt(square_sum + eps) + tl.store(Y + (rindex + N * row_idx), xs * rsqrt, xmask) + + +def l2norm_fwd( + x: torch.Tensor, eps: float = 1e-6, output_dtype: torch.dtype | None = None +): + x_shape_og = x.shape + x = x.view(-1, x.shape[-1]) + # allocate output + if output_dtype is None: + y = torch.empty_like(x) + else: + y = torch.empty_like(x, dtype=output_dtype) + assert y.stride(-1) == 1 + T, D = x.shape[0], x.shape[-1] + # rstd = torch.empty((T,), dtype=torch.float32, device=x.device) + # Less than 64KB per feature: enqueue fused kernel + MAX_FUSED_SIZE = 65536 // x.element_size() + BD = min(MAX_FUSED_SIZE, triton.next_power_of_2(D)) + if D > BD: + raise RuntimeError("This layer doesn't support feature dim >= 64KB.") + + if not USE_DEFAULT_FLA_NORM: + MBLOCK = 32 + # M, N = x.shape + l2norm_fwd_kernel2[(triton.cdiv(T, MBLOCK),)]( + x, + y, + eps, + T, + D, + MBLOCK, + ) + else: + if D <= 512: + NB = triton.cdiv(T, 2048) + + def grid(meta): + return (triton.cdiv(T, meta["BT"]),) + + l2norm_fwd_kernel[grid]( + x, + y, + eps, + NB=NB, + T=T, + D=D, + BD=BD, + ) + else: + l2norm_fwd_kernel1[(T,)]( + x, + y, + eps=eps, + D=D, + BD=BD, + ) + + return y.view(x_shape_og) diff --git a/model_executor/layers/fla/ops/layernorm_guard.py b/model_executor/layers/fla/ops/layernorm_guard.py new file mode 100644 index 0000000..89352d1 --- /dev/null +++ b/model_executor/layers/fla/ops/layernorm_guard.py @@ -0,0 +1,396 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Tri Dao +# +# This file contains code copied from the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2024, Tri Dao. 
+ +# ruff: noqa: E501 +# Based on the Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html +# For the backward pass, we keep weight_grad and bias_grad in registers and accumulate. +# This backward pass is faster for dimensions up to 8k, but after that it's much slower due to register spilling. +# The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine. + +from functools import lru_cache + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + +from vllm.triton_utils import tl, triton +from vllm.utils.math_utils import cdiv, next_power_of_2 + +from .utils import input_guard + + +def rms_norm_ref( + x, + weight, + bias, + z=None, + eps=1e-6, + group_size=None, + norm_before_gate=True, + upcast=True, +): + dtype = x.dtype + weight = weight.float() + bias = bias.float() if bias is not None else None + if upcast: + x = x.float() + z = z.float() if z is not None else z + if z is not None and not norm_before_gate: + x = x * F.silu(z) + if group_size is None: + rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps) + out = (x * rstd * weight) + bias if bias is not None else (x * rstd * weight) + else: + x_group = rearrange(x, "... (g d) -> ... g d", d=group_size) + rstd = 1 / torch.sqrt((x_group.square()).mean(dim=-1, keepdim=True) + eps) + out = rearrange(x_group * rstd, "... g d -> ... 
(g d)") * weight + if bias is not None: + out = out + bias + if z is not None and norm_before_gate: + out *= F.silu(z) + return out.to(dtype) + + +@triton.heuristics( + { + "HAS_BIAS": lambda args: args["B"] is not None, + "HAS_Z": lambda args: args["Z"] is not None, + } +) +@triton.jit +def layer_norm_fwd_kernel( + X, # pointer to the input + Y, # pointer to the output + W, # pointer to the weights + B, # pointer to the biases + Z, # pointer to the other branch + Mean, # pointer to the mean + Rstd, # pointer to the 1/std + stride_x_row, # how much to increase the pointer when moving by 1 row + stride_y_row, + stride_z_row, + M, # number of rows in X + N: tl.constexpr, # number of columns in X + eps, # epsilon to avoid division by zero + BLOCK_N: tl.constexpr, + ROWS_PER_BLOCK: tl.constexpr, + HAS_BIAS: tl.constexpr, + HAS_Z: tl.constexpr, + NORM_BEFORE_GATE: tl.constexpr, + IS_RMS_NORM: tl.constexpr, +): + # Map the program id to the starting row of X and Y it should compute. + row_start = tl.program_id(0) * ROWS_PER_BLOCK + group = tl.program_id(1) + + # Create 2D tile: [ROWS_PER_BLOCK, BLOCK_N] + rows = row_start + tl.arange(0, ROWS_PER_BLOCK) + cols = tl.arange(0, BLOCK_N) + + # Compute offsets for 2D tile + row_offsets = rows[:, None] * stride_x_row + col_offsets = cols[None, :] + group * N + + # Base pointers + X_base = X + row_offsets + col_offsets + Y_base = Y + rows[:, None] * stride_y_row + col_offsets + + # Create mask for valid rows and columns + row_mask = rows[:, None] < M + col_mask = cols[None, :] < N + mask = row_mask & col_mask + + # Load input data with 2D tile + x = tl.load(X_base, mask=mask, other=0.0).to(tl.float32) + + if HAS_Z and not NORM_BEFORE_GATE: + Z_base = Z + rows[:, None] * stride_z_row + col_offsets + z = tl.load(Z_base, mask=mask, other=0.0).to(tl.float32) + x *= z * tl.sigmoid(z) + + # Compute mean and variance per row (reduce along axis 1) + if not IS_RMS_NORM: + mean = tl.sum(x, axis=1) / N # Shape: [ROWS_PER_BLOCK] + # Store 
mean for each row + mean_offsets = group * M + rows + mean_mask = rows < M + tl.store(Mean + mean_offsets, mean, mask=mean_mask) + # Broadcast mean back to 2D for subtraction + xbar = tl.where(mask, x - mean[:, None], 0.0) + var = tl.sum(xbar * xbar, axis=1) / N # Shape: [ROWS_PER_BLOCK] + else: + xbar = tl.where(mask, x, 0.0) + var = tl.sum(xbar * xbar, axis=1) / N # Shape: [ROWS_PER_BLOCK] + mean = 0.0 # Placeholder for RMS norm + + rstd = tl.rsqrt(var + eps) # Shape: [ROWS_PER_BLOCK] + + # Store rstd for each row + rstd_offsets = group * M + rows + rstd_mask = rows < M + tl.store(Rstd + rstd_offsets, rstd, mask=rstd_mask) + + # Load weights and biases (broadcast across rows) + w_offsets = cols + group * N + w_mask = cols < N + w = tl.load(W + w_offsets, mask=w_mask, other=0.0).to(tl.float32) + + if HAS_BIAS: + b = tl.load(B + w_offsets, mask=w_mask, other=0.0).to(tl.float32) + + # Normalize and apply linear transformation + if not IS_RMS_NORM: + x_hat = (x - mean[:, None]) * rstd[:, None] + else: + x_hat = x * rstd[:, None] + + y = x_hat * w[None, :] + b[None, :] if HAS_BIAS else x_hat * w[None, :] + + if HAS_Z and NORM_BEFORE_GATE: + Z_base = Z + rows[:, None] * stride_z_row + col_offsets + z = tl.load(Z_base, mask=mask, other=0.0).to(tl.float32) + y *= z * tl.sigmoid(z) + + # Write output + tl.store(Y_base, y, mask=mask) + + +@lru_cache +def _get_sm_count(device: torch.device) -> int: + """Get and cache the SM count for a given device.""" + props = torch.cuda.get_device_properties(device) + return props.multi_processor_count + + +def calc_rows_per_block(M: int, device: torch.device) -> int: + sm_count = _get_sm_count(device) + rows_per_block = next_power_of_2(cdiv(M, 2 * sm_count)) + rows_per_block = min(rows_per_block, 4) + return rows_per_block + + +def layer_norm_fwd( + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + eps: float, + z: torch.Tensor = None, + out: torch.Tensor = None, + group_size: int = None, + norm_before_gate: bool = True, 
+ is_rms_norm: bool = False, +): + M, N = x.shape + if group_size is None: + group_size = N + assert N % group_size == 0 + ngroups = N // group_size + assert x.stride(-1) == 1 + if z is not None: + assert z.stride(-1) == 1 + assert z.shape == (M, N) + assert weight.shape == (N,) + assert weight.stride(-1) == 1 + if bias is not None: + assert bias.stride(-1) == 1 + assert bias.shape == (N,) + # allocate output + if out is not None: + assert out.shape == x.shape + else: + out = torch.empty_like(x) + assert out.stride(-1) == 1 + mean = ( + torch.empty((ngroups * M,), dtype=torch.float32, device=x.device) + if not is_rms_norm + else None + ) + rstd = torch.empty((ngroups * M,), dtype=torch.float32, device=x.device) + # Less than 64KB per feature: enqueue fused kernel + MAX_FUSED_SIZE = 65536 // x.element_size() + BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size)) + if group_size > BLOCK_N: + raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.") + # heuristics for number of warps + num_warps = min(max(BLOCK_N // 256, 1), 8) + # Calculate rows per block based on SM count + rows_per_block = calc_rows_per_block(M, x.device) + # Update grid to use rows_per_block + grid = (cdiv(M, rows_per_block), ngroups) + layer_norm_fwd_kernel[grid]( + x, + out, + weight, + bias, + z, + mean, + rstd, + x.stride(0), + out.stride(0), + z.stride(0) if z is not None else 0, + M, + group_size, + eps, + BLOCK_N=BLOCK_N, + ROWS_PER_BLOCK=rows_per_block, + NORM_BEFORE_GATE=norm_before_gate, + IS_RMS_NORM=is_rms_norm, + num_warps=num_warps, + ) + return out, mean, rstd + + +class LayerNormFn(torch.autograd.Function): + @input_guard + @staticmethod + def forward( + ctx, + x, + weight, + bias, + z=None, + eps=1e-6, + group_size=None, + norm_before_gate=True, + is_rms_norm=False, + ): + """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))""" + + x_shape_og = x.shape + # reshape input data into 2D tensor + x = x.reshape(-1, 
x.shape[-1]) + if x.stride(-1) != 1: + x = x.contiguous() + if z is not None: + assert z.shape == x_shape_og + z = z.reshape(-1, z.shape[-1]) + if z.stride(-1) != 1: + z = z.contiguous() + weight = weight.contiguous() + if bias is not None: + bias = bias.contiguous() + y, mean, rstd = layer_norm_fwd( + x, + weight, + bias, + eps, + z=z, + group_size=group_size, + norm_before_gate=norm_before_gate, + is_rms_norm=is_rms_norm, + ) + ctx.save_for_backward(x, weight, bias, mean, rstd, z) + ctx.x_shape_og = x_shape_og + ctx.eps = eps + ctx.group_size = group_size + ctx.norm_before_gate = norm_before_gate + ctx.is_rms_norm = is_rms_norm + return y.reshape(x_shape_og) + + +def layernorm_fn( + x, + weight, + bias, + z=None, + eps=1e-6, + group_size=None, + norm_before_gate=True, + is_rms_norm=False, +): + return LayerNormFn.apply( + x, weight, bias, z, eps, group_size, norm_before_gate, is_rms_norm + ) + + +def rmsnorm_fn( + x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True +): + return LayerNormFn.apply( + x, weight, bias, z, eps, group_size, norm_before_gate, True + ) + + +class LayerNormGated(nn.Module): + def __init__( + self, + hidden_size, + eps: float = 1e-5, + group_size: int | None = None, + norm_before_gate: bool = True, + device: torch.device | None = None, + dtype: torch.dtype | None = None, + ): + """If group_size is not None, we do GroupNorm with each group having group_size elements. + group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group). 
+ """ + + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + self.bias = nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + self.group_size = group_size + self.norm_before_gate = norm_before_gate + self.reset_parameters() + + def reset_parameters(self): + torch.nn.init.ones_(self.weight) + torch.nn.init.zeros_(self.bias) + + def forward(self, x, z=None): + """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))""" + return layernorm_fn( + x, + self.weight, + self.bias, + z=z, + group_size=self.group_size, + eps=self.eps, + norm_before_gate=self.norm_before_gate, + ) + + +class RMSNormGated(nn.Module): + def __init__( + self, + hidden_size, + eps: float = 1e-5, + group_size: int | None = None, + norm_before_gate: bool = False, + device: torch.device | None = None, + dtype: torch.dtype | None = None, + ): + """If group_size is not None, we do GroupNorm with each group having group_size elements. + group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group). 
+ """ + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + self.register_parameter("bias", None) + self.group_size = group_size + self.norm_before_gate = norm_before_gate + self.reset_parameters() + + def reset_parameters(self): + torch.nn.init.ones_(self.weight) + + def forward(self, x, z=None): + """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))""" + return rmsnorm_fn( + x, + self.weight, + self.bias, + z=z, + eps=self.eps, + group_size=self.group_size, + norm_before_gate=self.norm_before_gate, + ) diff --git a/model_executor/layers/fla/ops/op.py b/model_executor/layers/fla/ops/op.py new file mode 100644 index 0000000..a91975c --- /dev/null +++ b/model_executor/layers/fla/ops/op.py @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +import os + +from vllm.triton_utils import tl, tldevice, triton + +from .utils import is_gather_supported + +if os.environ.get("FLA_USE_FAST_OPS", "0") == "1": + exp = tldevice.fast_expf + log = tldevice.fast_logf + log2 = tldevice.fast_log2f +else: + exp = tl.exp + log = tl.log + log2 = tl.log2 + + +if not is_gather_supported: + + @triton.jit + def gather(src, index, axis, _builder=None): + """ + Gather operation that works when tl.gather is not supported. + This is a fallback implementation that returns None. + Just to make triton compiler happy. 
+ """ + return None +else: + gather = tl.gather + +if hasattr(triton.language, "_experimental_make_tensor_descriptor"): + # For Triton 3.3.x + make_tensor_descriptor = triton.language._experimental_make_tensor_descriptor +elif hasattr(triton.language, "make_tensor_descriptor"): + # For Triton 3.4.x and later + make_tensor_descriptor = triton.language.make_tensor_descriptor +else: + """ + Fallback implementation when TMA is not supported. + Returns None to indicate TMA descriptors are unavailable. + Just make triton compiler happy. + """ + + @triton.jit + def make_tensor_descriptor( + base, + shape, + strides, + block_shape, + _builder=None, + ): + return None diff --git a/model_executor/layers/fla/ops/solve_tril.py b/model_executor/layers/fla/ops/solve_tril.py new file mode 100644 index 0000000..da85aab --- /dev/null +++ b/model_executor/layers/fla/ops/solve_tril.py @@ -0,0 +1,556 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. 
+# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang +# ruff: noqa: E501 + +import os + +import torch + +from vllm.triton_utils import tl, triton + +from .index import prepare_chunk_indices +from .op import make_tensor_descriptor +from .utils import input_guard, is_amd, is_tma_supported + +FLA_TRIL_PRECISION = os.environ.get("FLA_TRIL_PRECISION", "ieee") +ALLOWED_TRIL_PRECISIONS = ["ieee", "tf32"] if is_amd else ["ieee", "tf32", "tf32x3"] +assert FLA_TRIL_PRECISION in ALLOWED_TRIL_PRECISIONS, ( + f"FLA_TRIL_PRECISION must be one of {ALLOWED_TRIL_PRECISIONS}, but got {FLA_TRIL_PRECISION}" +) + + +@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [1, 2, 4, 8] + for num_stages in [2, 3, 4, 5] + ], + key=["BT"], +) +@triton.jit(do_not_specialize=["T"]) +def solve_tril_16x16_kernel( + A, + Ai, + cu_seqlens, + chunk_indices, + T, + H: tl.constexpr, + BT: tl.constexpr, + USE_TMA: tl.constexpr, + IS_VARLEN: tl.constexpr, + DOT_PRECISION: tl.constexpr, +): + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = ( + tl.load(chunk_indices + i_t * 2).to(tl.int32), + tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32), + ) + bos, eos = ( + tl.load(cu_seqlens + i_n).to(tl.int32), + tl.load(cu_seqlens + i_n + 1).to(tl.int32), + ) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + o_i = tl.arange(0, 16) + m_A = o_i[:, None] > o_i[None, :] + m_I = o_i[:, None] == o_i[None, :] + + A = A + (bos * H + i_h) * BT + Ai = Ai + (bos * H + i_h) * 16 + + offset = (i_t * 16) % BT + if not USE_TMA: + p_A = tl.make_block_ptr( + A, (T, BT), (H * BT, 1), (i_t * 16, offset), (16, 16), (1, 0) + ) + # [16, 16] + b_A = tl.load(p_A, boundary_check=(0, 1)).to(tl.float32) + else: + desc = 
make_tensor_descriptor(A, [T, BT], [H * BT, 1], [16, 16]) + desc_o = make_tensor_descriptor(Ai, [T, 16], [H * 16, 1], [16, 16]) + b_A = desc.load([i_t * 16, offset]).to(tl.float32) + b_A = -tl.where(m_A, b_A, 0) + + for i in range(2, min(16, T - i_t * 16)): + # [16] + b_a = -tl.load(A + (i_t * 16 + i) * H * BT + o_i + offset) + b_a = b_a + tl.sum(b_a[:, None] * b_A, 0) + b_A = tl.where((o_i == i)[:, None], b_a, b_A) + b_A += m_I + if not USE_TMA: + p_Ai = tl.make_block_ptr( + Ai, (T, 16), (H * 16, 1), (i_t * 16, 0), (16, 16), (1, 0) + ) + tl.store( + p_Ai, + b_A.to(p_Ai.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1), + ) + else: + desc_o.store([i_t * 16, 0], b_A.to(desc_o.dtype, fp_downcast_rounding="rtne")) + + +@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [1, 2, 4, 8] + for num_stages in [2, 3, 4, 5] + ], + key=["H", "BT", "IS_VARLEN"], +) +@triton.jit(do_not_specialize=["T"]) +def merge_16x16_to_32x32_inverse_kernel( + A, + Ai, + cu_seqlens, + chunk_indices, + T, + H: tl.constexpr, + BT: tl.constexpr, + USE_TMA: tl.constexpr, + IS_VARLEN: tl.constexpr, + DOT_PRECISION: tl.constexpr, +): + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = ( + tl.load(chunk_indices + i_t * 2).to(tl.int32), + tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32), + ) + bos, eos = ( + tl.load(cu_seqlens + i_n).to(tl.int32), + tl.load(cu_seqlens + i_n + 1).to(tl.int32), + ) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + o_i = tl.arange(0, 16) + m_A = o_i[:, None] > o_i[None, :] + m_I = o_i[:, None] == o_i[None, :] + A += (bos * H + i_h) * BT + Ai += (bos * H + i_h) * BT + + if not USE_TMA: + p_A_11 = tl.make_block_ptr( + A, (T, BT), (H * BT, 1), (i_t * BT, 0), (16, 16), (1, 0) + ) + p_A_22 = tl.make_block_ptr( + A, (T, BT), (H * BT, 1), (i_t * 
BT + 16, 16), (16, 16), (1, 0) + ) + b_Ai_11 = tl.load(p_A_11, boundary_check=(0, 1)).to(tl.float32) + b_Ai_22 = tl.load(p_A_22, boundary_check=(0, 1)).to(tl.float32) + else: + desc = make_tensor_descriptor(A, [T, BT], [H * BT, 1], [16, 16]) + desc_o = make_tensor_descriptor(Ai, [T, BT], [H * BT, 1], [16, 16]) + b_Ai_11 = desc.load([i_t * BT + 0, 0]).to(tl.float32) + b_Ai_22 = desc.load([i_t * BT + 16, 16]).to(tl.float32) + + # [16, 16] + b_Ai_11 = -tl.where(m_A, b_Ai_11, 0) + b_Ai_22 = -tl.where(m_A, b_Ai_22, 0) + + for i in range(2, min(16, T - i_t * BT)): + b_a_11 = -tl.load(A + (i_t * BT + i) * H * BT + o_i) + b_a_11 += tl.sum(b_a_11[:, None] * b_Ai_11, 0) + b_Ai_11 = tl.where((o_i == i)[:, None], b_a_11, b_Ai_11) + for i in range(16 + 2, min(32, T - i_t * BT)): + b_a_22 = -tl.load(A + (i_t * BT + i) * H * BT + o_i + 16) + b_a_22 += tl.sum(b_a_22[:, None] * b_Ai_22, 0) + b_Ai_22 = tl.where((o_i == i - 16)[:, None], b_a_22, b_Ai_22) + + b_Ai_11 += m_I + b_Ai_22 += m_I + + if not USE_TMA: + p_A_21 = tl.make_block_ptr( + A, (T, BT), (H * BT, 1), (i_t * BT + 16, 0), (16, 16), (1, 0) + ) + b_A_21 = tl.load(p_A_21, boundary_check=(0, 1)).to(tl.float32) + else: + b_A_21 = desc.load([i_t * BT + 16, 0]).to(tl.float32) + + b_Ai_21 = -tl.dot( + tl.dot(b_Ai_22, b_A_21, input_precision=DOT_PRECISION), + b_Ai_11, + input_precision=DOT_PRECISION, + ) + + if not USE_TMA: + p_Ai_11 = tl.make_block_ptr( + Ai, (T, BT), (H * BT, 1), (i_t * BT, 0), (16, 16), (1, 0) + ) + p_Ai_21 = tl.make_block_ptr( + Ai, (T, BT), (H * BT, 1), (i_t * BT + 16, 0), (16, 16), (1, 0) + ) + p_Ai_22 = tl.make_block_ptr( + Ai, (T, BT), (H * BT, 1), (i_t * BT + 16, 16), (16, 16), (1, 0) + ) + tl.store( + p_Ai_11, + b_Ai_11.to(p_Ai_11.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1), + ) + tl.store( + p_Ai_22, + b_Ai_22.to(p_Ai_22.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1), + ) + tl.store( + p_Ai_21, + b_Ai_21.to(p_Ai_21.dtype.element_ty, 
fp_downcast_rounding="rtne"), + boundary_check=(0, 1), + ) + else: + desc_o.store( + [i_t * BT + 0, 0], b_Ai_11.to(desc_o.dtype, fp_downcast_rounding="rtne") + ) + desc_o.store( + [i_t * BT + 16, 0], b_Ai_21.to(desc_o.dtype, fp_downcast_rounding="rtne") + ) + desc_o.store( + [i_t * BT + 16, 16], b_Ai_22.to(desc_o.dtype, fp_downcast_rounding="rtne") + ) + + +@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [2, 4, 8] + for num_stages in [2, 3, 4, 5] + ], + key=["H", "BT", "IS_VARLEN"], +) +@triton.jit(do_not_specialize=["T"]) +def merge_16x16_to_64x64_inverse_kernel( + A, + Ai, + cu_seqlens, + chunk_indices, + T, + H: tl.constexpr, + BT: tl.constexpr, + USE_TMA: tl.constexpr, + IS_VARLEN: tl.constexpr, + DOT_PRECISION: tl.constexpr, +): + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = ( + tl.load(chunk_indices + i_t * 2).to(tl.int32), + tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32), + ) + bos, eos = ( + tl.load(cu_seqlens + i_n).to(tl.int32), + tl.load(cu_seqlens + i_n + 1).to(tl.int32), + ) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + o_i = tl.arange(0, 16) + m_A = o_i[:, None] > o_i[None, :] + m_I = o_i[:, None] == o_i[None, :] + A += (bos * H + i_h) * BT + Ai += (bos * H + i_h) * BT + + if not USE_TMA: + p_A_11 = tl.make_block_ptr( + A, (T, BT), (H * BT, 1), (i_t * BT, 0), (16, 16), (1, 0) + ) + p_A_22 = tl.make_block_ptr( + A, (T, BT), (H * BT, 1), (i_t * BT + 16, 16), (16, 16), (1, 0) + ) + p_A_33 = tl.make_block_ptr( + A, (T, BT), (H * BT, 1), (i_t * BT + 32, 32), (16, 16), (1, 0) + ) + p_A_44 = tl.make_block_ptr( + A, (T, BT), (H * BT, 1), (i_t * BT + 48, 48), (16, 16), (1, 0) + ) + b_Ai_11 = tl.load(p_A_11, boundary_check=(0, 1)).to(tl.float32) + b_Ai_22 = tl.load(p_A_22, boundary_check=(0, 1)).to(tl.float32) + b_Ai_33 = tl.load(p_A_33, 
boundary_check=(0, 1)).to(tl.float32) + b_Ai_44 = tl.load(p_A_44, boundary_check=(0, 1)).to(tl.float32) + else: + desc = make_tensor_descriptor(A, [T, BT], [H * BT, 1], [16, 16]) + desc_o = make_tensor_descriptor(Ai, [T, BT], [H * BT, 1], [16, 16]) + b_Ai_11 = desc.load([i_t * BT + 0, 0]).to(tl.float32) + b_Ai_22 = desc.load([i_t * BT + 16, 16]).to(tl.float32) + b_Ai_33 = desc.load([i_t * BT + 32, 32]).to(tl.float32) + b_Ai_44 = desc.load([i_t * BT + 48, 48]).to(tl.float32) + + # [16, 16] + b_Ai_11 = -tl.where(m_A, b_Ai_11, 0) + b_Ai_22 = -tl.where(m_A, b_Ai_22, 0) + b_Ai_33 = -tl.where(m_A, b_Ai_33, 0) + b_Ai_44 = -tl.where(m_A, b_Ai_44, 0) + + for i in range(2, min(16, T - i_t * BT)): + b_a_11 = -tl.load(A + (i_t * BT + i) * H * BT + o_i) + b_a_11 += tl.sum(b_a_11[:, None] * b_Ai_11, 0) + b_Ai_11 = tl.where((o_i == i)[:, None], b_a_11, b_Ai_11) + for i in range(16 + 2, min(32, T - i_t * BT)): + b_a_22 = -tl.load(A + (i_t * BT + i) * H * BT + o_i + 16) + b_a_22 += tl.sum(b_a_22[:, None] * b_Ai_22, 0) + b_Ai_22 = tl.where((o_i == i - 16)[:, None], b_a_22, b_Ai_22) + for i in range(32 + 2, min(48, T - i_t * BT)): + b_a_33 = -tl.load(A + (i_t * BT + i) * H * BT + o_i + 32) + b_a_33 += tl.sum(b_a_33[:, None] * b_Ai_33, 0) + b_Ai_33 = tl.where((o_i == i - 32)[:, None], b_a_33, b_Ai_33) + for i in range(48 + 2, min(64, T - i_t * BT)): + b_a_44 = -tl.load(A + (i_t * BT + i) * H * BT + o_i + 48) + b_a_44 += tl.sum(b_a_44[:, None] * b_Ai_44, 0) + b_Ai_44 = tl.where((o_i == i - 48)[:, None], b_a_44, b_Ai_44) + b_Ai_11 += m_I + b_Ai_22 += m_I + b_Ai_33 += m_I + b_Ai_44 += m_I + + if not USE_TMA: + p_A_21 = tl.make_block_ptr( + A, (T, BT), (H * BT, 1), (i_t * BT + 16, 0), (16, 16), (1, 0) + ) + p_A_31 = tl.make_block_ptr( + A, (T, BT), (H * BT, 1), (i_t * BT + 32, 0), (16, 16), (1, 0) + ) + p_A_32 = tl.make_block_ptr( + A, (T, BT), (H * BT, 1), (i_t * BT + 32, 16), (16, 16), (1, 0) + ) + p_A_41 = tl.make_block_ptr( + A, (T, BT), (H * BT, 1), (i_t * BT + 48, 0), (16, 16), (1, 
0) + ) + p_A_42 = tl.make_block_ptr( + A, (T, BT), (H * BT, 1), (i_t * BT + 48, 16), (16, 16), (1, 0) + ) + p_A_43 = tl.make_block_ptr( + A, (T, BT), (H * BT, 1), (i_t * BT + 48, 32), (16, 16), (1, 0) + ) + b_A_21 = tl.load(p_A_21, boundary_check=(0, 1)).to(tl.float32) + b_A_31 = tl.load(p_A_31, boundary_check=(0, 1)).to(tl.float32) + b_A_32 = tl.load(p_A_32, boundary_check=(0, 1)).to(tl.float32) + b_A_41 = tl.load(p_A_41, boundary_check=(0, 1)).to(tl.float32) + b_A_42 = tl.load(p_A_42, boundary_check=(0, 1)).to(tl.float32) + b_A_43 = tl.load(p_A_43, boundary_check=(0, 1)).to(tl.float32) + else: + b_A_21 = desc.load([i_t * BT + 16, 0]).to(tl.float32) + b_A_31 = desc.load([i_t * BT + 32, 0]).to(tl.float32) + b_A_32 = desc.load([i_t * BT + 32, 16]).to(tl.float32) + b_A_41 = desc.load([i_t * BT + 48, 0]).to(tl.float32) + b_A_42 = desc.load([i_t * BT + 48, 16]).to(tl.float32) + b_A_43 = desc.load([i_t * BT + 48, 32]).to(tl.float32) + + b_Ai_21 = -tl.dot( + tl.dot(b_Ai_22, b_A_21, input_precision=DOT_PRECISION), + b_Ai_11, + input_precision=DOT_PRECISION, + ) + b_Ai_32 = -tl.dot( + tl.dot(b_Ai_33, b_A_32, input_precision=DOT_PRECISION), + b_Ai_22, + input_precision=DOT_PRECISION, + ) + b_Ai_43 = -tl.dot( + tl.dot(b_Ai_44, b_A_43, input_precision=DOT_PRECISION), + b_Ai_33, + input_precision=DOT_PRECISION, + ) + + b_Ai_31 = -tl.dot( + b_Ai_33, + tl.dot(b_A_31, b_Ai_11, input_precision=DOT_PRECISION) + + tl.dot(b_A_32, b_Ai_21, input_precision=DOT_PRECISION), + input_precision=DOT_PRECISION, + ) + b_Ai_42 = -tl.dot( + b_Ai_44, + tl.dot(b_A_42, b_Ai_22, input_precision=DOT_PRECISION) + + tl.dot(b_A_43, b_Ai_32, input_precision=DOT_PRECISION), + input_precision=DOT_PRECISION, + ) + b_Ai_41 = -tl.dot( + b_Ai_44, + tl.dot(b_A_41, b_Ai_11, input_precision=DOT_PRECISION) + + tl.dot(b_A_42, b_Ai_21, input_precision=DOT_PRECISION) + + tl.dot(b_A_43, b_Ai_31, input_precision=DOT_PRECISION), + input_precision=DOT_PRECISION, + ) + + if not USE_TMA: + p_Ai_11 = tl.make_block_ptr( + 
Ai, (T, BT), (H * BT, 1), (i_t * BT, 0), (16, 16), (1, 0) + ) + p_Ai_22 = tl.make_block_ptr( + Ai, (T, BT), (H * BT, 1), (i_t * BT + 16, 16), (16, 16), (1, 0) + ) + p_Ai_33 = tl.make_block_ptr( + Ai, (T, BT), (H * BT, 1), (i_t * BT + 32, 32), (16, 16), (1, 0) + ) + p_Ai_44 = tl.make_block_ptr( + Ai, (T, BT), (H * BT, 1), (i_t * BT + 48, 48), (16, 16), (1, 0) + ) + p_Ai_21 = tl.make_block_ptr( + Ai, (T, BT), (H * BT, 1), (i_t * BT + 16, 0), (16, 16), (1, 0) + ) + p_Ai_31 = tl.make_block_ptr( + Ai, (T, BT), (H * BT, 1), (i_t * BT + 32, 0), (16, 16), (1, 0) + ) + p_Ai_32 = tl.make_block_ptr( + Ai, (T, BT), (H * BT, 1), (i_t * BT + 32, 16), (16, 16), (1, 0) + ) + p_Ai_41 = tl.make_block_ptr( + Ai, (T, BT), (H * BT, 1), (i_t * BT + 48, 0), (16, 16), (1, 0) + ) + p_Ai_42 = tl.make_block_ptr( + Ai, (T, BT), (H * BT, 1), (i_t * BT + 48, 16), (16, 16), (1, 0) + ) + p_Ai_43 = tl.make_block_ptr( + Ai, (T, BT), (H * BT, 1), (i_t * BT + 48, 32), (16, 16), (1, 0) + ) + tl.store( + p_Ai_11, + b_Ai_11.to(p_Ai_11.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1), + ) + tl.store( + p_Ai_22, + b_Ai_22.to(p_Ai_22.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1), + ) + tl.store( + p_Ai_33, + b_Ai_33.to(p_Ai_33.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1), + ) + tl.store( + p_Ai_44, + b_Ai_44.to(p_Ai_44.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1), + ) + tl.store( + p_Ai_21, + b_Ai_21.to(p_Ai_21.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1), + ) + tl.store( + p_Ai_31, + b_Ai_31.to(p_Ai_31.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1), + ) + tl.store( + p_Ai_32, + b_Ai_32.to(p_Ai_32.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1), + ) + tl.store( + p_Ai_41, + b_Ai_41.to(p_Ai_41.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1), + ) + tl.store( + p_Ai_42, + b_Ai_42.to(p_Ai_42.dtype.element_ty, 
fp_downcast_rounding="rtne"), + boundary_check=(0, 1), + ) + tl.store( + p_Ai_43, + b_Ai_43.to(p_Ai_43.dtype.element_ty, fp_downcast_rounding="rtne"), + boundary_check=(0, 1), + ) + else: + desc_o.store( + [i_t * BT + 0, 0], b_Ai_11.to(desc_o.dtype, fp_downcast_rounding="rtne") + ) + desc_o.store( + [i_t * BT + 16, 16], b_Ai_22.to(desc_o.dtype, fp_downcast_rounding="rtne") + ) + desc_o.store( + [i_t * BT + 32, 32], b_Ai_33.to(desc_o.dtype, fp_downcast_rounding="rtne") + ) + desc_o.store( + [i_t * BT + 48, 48], b_Ai_44.to(desc_o.dtype, fp_downcast_rounding="rtne") + ) + desc_o.store( + [i_t * BT + 16, 0], b_Ai_21.to(desc_o.dtype, fp_downcast_rounding="rtne") + ) + desc_o.store( + [i_t * BT + 32, 0], b_Ai_31.to(desc_o.dtype, fp_downcast_rounding="rtne") + ) + desc_o.store( + [i_t * BT + 32, 16], b_Ai_32.to(desc_o.dtype, fp_downcast_rounding="rtne") + ) + desc_o.store( + [i_t * BT + 48, 0], b_Ai_41.to(desc_o.dtype, fp_downcast_rounding="rtne") + ) + desc_o.store( + [i_t * BT + 48, 16], b_Ai_42.to(desc_o.dtype, fp_downcast_rounding="rtne") + ) + desc_o.store( + [i_t * BT + 48, 32], b_Ai_43.to(desc_o.dtype, fp_downcast_rounding="rtne") + ) + + +@input_guard +def solve_tril( + A: torch.Tensor, + cu_seqlens: torch.Tensor | None = None, + output_dtype: torch.dtype = torch.float, +) -> torch.Tensor: + """ + Compute the inverse of the matrix I + A + A should be strictly lower triangular, i.e., A.triu() == 0. + + Args: + A (torch.Tensor): + [B, T, H, BT], where BT should only be 16, 32, or 64. + cu_seqlens (torch.Tensor): + The cumulative sequence lengths of the input tensor. Default: `None`. + output_dtype (torch.dtype): + The dtype of the output tensor. Default: `torch.float`. + If `None`, the output dtype will be the same as the input dtype. 
+ + Returns: + (I + A)^-1 with the same shape as A + """ + assert A.shape[-1] in [16, 32, 64] + output_dtype = A.dtype if output_dtype is None else output_dtype + + B, T, H, BT = A.shape + chunk_indices = ( + prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None + ) + NT = len(chunk_indices) if cu_seqlens is not None else triton.cdiv(T, BT) + + Ai = torch.zeros_like(A, dtype=output_dtype) + if BT == 16: + merge_fn = solve_tril_16x16_kernel + elif BT == 32: + merge_fn = merge_16x16_to_32x32_inverse_kernel + elif BT == 64: + merge_fn = merge_16x16_to_64x64_inverse_kernel + + merge_fn[NT, B * H]( + A=A, + Ai=Ai, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + T=T, + H=H, + BT=BT, + USE_TMA=is_tma_supported, + DOT_PRECISION=FLA_TRIL_PRECISION, + ) + return Ai diff --git a/model_executor/layers/fla/ops/utils.py b/model_executor/layers/fla/ops/utils.py new file mode 100644 index 0000000..5a48e56 --- /dev/null +++ b/model_executor/layers/fla/ops/utils.py @@ -0,0 +1,194 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. 
+# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang +# ruff: noqa: E501 +import contextlib +import functools +import logging +import os +from collections.abc import Callable +from enum import Enum +from typing import Any, Literal + +import torch + +from vllm.platforms import current_platform +from vllm.triton_utils import triton + +logger = logging.getLogger(__name__) + +COMPILER_MODE = os.getenv("FLA_COMPILER_MODE") == "1" +FLA_CI_ENV = os.getenv("FLA_CI_ENV") == "1" +FLA_GDN_FIX_BT = os.getenv("FLA_GDN_FIX_BT", "0") == "1" + +SUPPRESS_LEVEL = int(os.getenv("GDN_RECOMPUTE_SUPPRESS_LEVEL", "0")) + + +def tensor_cache(fn: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor]: + """ + A decorator that caches the most recent results of a function with tensor inputs. + + This decorator will store the output of the decorated function for the most recent set of input tensors. + The cache is limited to a fixed size (default is 4). When the cache is full, the oldest entry will be removed. + + Args: + fn (Callable[..., torch.Tensor]): + The function to be decorated. It should take tensor inputs and return tensor outputs. + + Returns: + Callable[..., torch.Tensor]: + A wrapped version of the input function with single-entry caching. 
+ """ + + cache_entries: tuple[tuple | None, dict | None, Any] = [] + cache_size = 8 + + @functools.wraps(fn) + def wrapper(*args: Any, **kwargs: Any) -> Any: + nonlocal cache_entries, cache_size + for i, entry in enumerate(cache_entries): + last_args, last_kwargs, last_result = entry + if ( + len(args) == len(last_args) + and len(kwargs) == len(last_kwargs) + and all(a is b for a, b in zip(args, last_args)) + and all( + k in last_kwargs and v is last_kwargs[k] for k, v in kwargs.items() + ) + ): + cache_entries = ( + cache_entries[:i] + + cache_entries[i + 1 :] + + [(args, kwargs, last_result)] + ) + return last_result + + result = fn(*args, **kwargs) + + if len(cache_entries) >= cache_size: + cache_entries = cache_entries[1:] + cache_entries.append((args, kwargs, result)) + return result + + return wrapper + + +def input_guard(fn: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor]: + """ + A decorator to make sure all input tensors are contiguous and set the device based on input tensors. 
+ """ + + @functools.wraps(fn) + def wrapper(*args, **kwargs): + contiguous_args = ( + i if not isinstance(i, torch.Tensor) else i.contiguous() for i in args + ) + contiguous_kwargs = { + k: (v if not isinstance(v, torch.Tensor) else v.contiguous()) + for k, v in kwargs.items() + } + + tensor = None + for arg in args: + if isinstance(arg, torch.Tensor): + tensor = arg + break + if tensor is None: + for value in kwargs.values(): + if isinstance(value, torch.Tensor): + tensor = value + break + + if tensor is not None: + ctx = torch.cuda.device(tensor.device.index) + else: + ctx = contextlib.nullcontext() + + with ctx: + return fn(*contiguous_args, **contiguous_kwargs) + + return wrapper + + +@functools.cache +def get_available_device() -> str: + try: + return triton.runtime.driver.active.get_current_target().backend + except BaseException: + return "cpu" + + +@functools.cache +def _check_platform() -> Literal["nvidia", "amd", "intel", "musa"]: + device = get_available_device() + mapping = { + "cuda": "nvidia", + "hip": "amd", + "xpu": "intel", + } + # return the mapped value, or the original if not found + return mapping.get(device, device) + + +# For AMD GPUs, the triton backend is 'hip', while for Nvidia GPUs, the triton backend is 'cuda'. +# However, the torch backend is 'cuda' for both Nvidia and AMD GPUs. +# Therefore, we need to check the triton backend to determine the actual GPU vendor. 
+device = "cuda" if current_platform.is_cuda_alike() else get_available_device() +device_torch_lib = getattr(torch, device, None) +device_platform = _check_platform() + +is_amd = device_platform == "amd" +is_intel = device_platform == "intel" +is_nvidia = device_platform == "nvidia" +is_intel_alchemist = is_intel and "Intel(R) Arc(TM) A" in torch.xpu.get_device_name(0) +is_nvidia_hopper = is_nvidia and ( + "NVIDIA H" in torch.cuda.get_device_name(0) + or torch.cuda.get_device_capability()[0] >= 9 +) +use_cuda_graph = is_nvidia and os.environ.get("FLA_USE_CUDA_GRAPH", "0") == "1" +is_gather_supported = hasattr(triton.language, "gather") +is_tma_supported = (is_nvidia and torch.cuda.get_device_capability(0)[0] >= 9) and ( + hasattr(triton.language, "_experimental_make_tensor_descriptor") + or hasattr(triton.language, "make_tensor_descriptor") +) + + +def get_all_max_shared_mem(): + try: + return [ + triton.runtime.driver.active.utils.get_device_properties(i)[ + "max_shared_mem" + ] + for i in range(device_torch_lib.device_count()) + ] + except BaseException: + return [-1] + + +class Backend(Enum): + ADA = 101376 # RTX 4090 + AMPERE = 166912 # A100 + HOPPER = 232448 # H100 + DEFAULT = 102400 # Default + + @classmethod + def get_shared_memory(cls, arch: str) -> int: + try: + return cls[arch.upper()].value + except KeyError: + return cls.DEFAULT.value + + +@functools.cache +def check_shared_mem(arch: str = "none", tensor_idx: int = 0) -> bool: + try: + device_shared_mem_list = get_all_max_shared_mem() + max_shared_memory = device_shared_mem_list[tensor_idx] + return max_shared_memory >= Backend.get_shared_memory(arch) + except Exception: + return False diff --git a/model_executor/layers/fla/ops/wy_fast.py b/model_executor/layers/fla/ops/wy_fast.py new file mode 100644 index 0000000..a66ec1d --- /dev/null +++ b/model_executor/layers/fla/ops/wy_fast.py @@ -0,0 +1,158 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project +# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang +# +# This file contains code copied from the flash-linear-attention project. +# The original source code was licensed under the MIT license and included +# the following copyright notice: +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +# ruff: noqa: E501 + +import torch + +from vllm.triton_utils import tl, triton + +from .index import prepare_chunk_indices + + +@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [2, 4, 8] + for num_stages in [2, 3, 4] + ], + key=["H", "K", "V", "BT", "BK", "BV", "IS_VARLEN"], +) +@triton.jit(do_not_specialize=["T"]) +def recompute_w_u_fwd_kernel( + k, + v, + beta, + w, + u, + A, + g, + cu_seqlens, + chunk_indices, + T, + H: tl.constexpr, + Hg: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = ( + tl.load(chunk_indices + i_t * 2).to(tl.int32), + tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32), + ) + bos, eos = ( + tl.load(cu_seqlens + i_n).to(tl.int32), + tl.load(cu_seqlens + i_n + 1).to(tl.int32), + ) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + p_beta = tl.make_block_ptr( + beta + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,) + ) + p_g = tl.make_block_ptr(g + (bos * H + i_h), (T,), (H,), (i_t * BT,), (BT,), (0,)) + p_A = tl.make_block_ptr( + A + (bos * H + i_h) * BT, (T, BT), (H * BT, 1), (i_t * BT, 0), (BT, BT), (1, 0) + ) + b_beta = tl.load(p_beta, boundary_check=(0,)) + b_A = tl.load(p_A, boundary_check=(0, 1)) + b_g = tl.exp(tl.load(p_g, boundary_check=(0,))) + + for i_v in range(tl.cdiv(V, BV)): + p_v = tl.make_block_ptr( + v + (bos * H + i_h) * V, + (T, V), + (H * V, 1), + (i_t * BT, i_v * BV), + 
(BT, BV), + (1, 0), + ) + p_u = tl.make_block_ptr( + u + (bos * H + i_h) * V, + (T, V), + (H * V, 1), + (i_t * BT, i_v * BV), + (BT, BV), + (1, 0), + ) + b_v = tl.load(p_v, boundary_check=(0, 1)) + b_vb = (b_v * b_beta[:, None]).to(b_v.dtype) + b_u = tl.dot(b_A, b_vb, allow_tf32=False) + tl.store(p_u, b_u.to(p_u.dtype.element_ty), boundary_check=(0, 1)) + + for i_k in range(tl.cdiv(K, BK)): + p_k = tl.make_block_ptr( + k + (bos * Hg + i_h // (H // Hg)) * K, + (T, K), + (Hg * K, 1), + (i_t * BT, i_k * BK), + (BT, BK), + (1, 0), + ) + p_w = tl.make_block_ptr( + w + (bos * H + i_h) * K, + (T, K), + (H * K, 1), + (i_t * BT, i_k * BK), + (BT, BK), + (1, 0), + ) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_kb = (b_k * b_beta[:, None] * b_g[:, None]).to(b_k.dtype) + b_w = tl.dot(b_A, b_kb) + tl.store(p_w, b_w.to(p_w.dtype.element_ty), boundary_check=(0, 1)) + + +def recompute_w_u_fwd( + k: torch.Tensor, + v: torch.Tensor, + beta: torch.Tensor, + g_cumsum: torch.Tensor, + A: torch.Tensor, + cu_seqlens: torch.LongTensor | None, +) -> tuple[torch.Tensor, torch.Tensor]: + B, T, Hg, K, V = *k.shape, v.shape[-1] + H = v.shape[-2] + BT = A.shape[-1] + + chunk_indices = ( + prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None + ) + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + BK = 64 + BV = 64 + u = torch.empty_like(v) + w = k.new_empty(B, T, H, K) + recompute_w_u_fwd_kernel[(NT, B * H)]( + k=k, + v=v, + beta=beta, + w=w, + u=u, + A=A, + g=g_cumsum, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + T=T, + H=H, + Hg=Hg, + K=K, + V=V, + BT=BT, + BK=BK, + BV=BV, + ) + return w, u diff --git a/model_executor/layers/fused_moe/__init__.py b/model_executor/layers/fused_moe/__init__.py new file mode 100644 index 0000000..658a07d --- /dev/null +++ b/model_executor/layers/fused_moe/__init__.py @@ -0,0 +1,106 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from 
contextlib import contextmanager +from typing import Any + +from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig +from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( + FusedMoEMethodBase, +) +from vllm.model_executor.layers.fused_moe.layer import ( + FusedMoE, + FusedMoeWeightScaleSupported, +) +from vllm.model_executor.layers.fused_moe.modular_kernel import ( + FusedMoEActivationFormat, + FusedMoEPermuteExpertsUnpermute, + FusedMoEPrepareAndFinalize, +) +from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe.utils import activation_without_mul +from vllm.triton_utils import HAS_TRITON + +_config: dict[str, Any] | None = None + + +@contextmanager +def override_config(config): + global _config + old_config = _config + _config = config + yield + _config = old_config + + +def get_config() -> dict[str, Any] | None: + return _config + + +__all__ = [ + "FusedMoE", + "FusedMoEConfig", + "FusedMoEMethodBase", + "FusedMoeWeightScaleSupported", + "FusedMoEPermuteExpertsUnpermute", + "FusedMoEActivationFormat", + "FusedMoEPrepareAndFinalize", + "SharedFusedMoE", + "activation_without_mul", + "override_config", + "get_config", +] + +if HAS_TRITON: + # import to register the custom ops + from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( + BatchedDeepGemmExperts, + ) + from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501 + BatchedTritonOrDeepGemmExperts, + ) + from vllm.model_executor.layers.fused_moe.cutlass_moe import ( + CutlassBatchedExpertsFp8, + CutlassExpertsFp8, + cutlass_moe_fp4, + cutlass_moe_fp8, + ) + from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts + from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( + BatchedTritonExperts, + ) + from vllm.model_executor.layers.fused_moe.fused_moe import ( + TritonExperts, + fused_experts, + fused_topk, + 
get_config_file_name, + grouped_topk, + ) + from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( + TritonOrDeepGemmExperts, + ) + + __all__ += [ + "fused_topk", + "fused_experts", + "get_config_file_name", + "grouped_topk", + "cutlass_moe_fp8", + "cutlass_moe_fp4", + "CutlassExpertsFp8", + "CutlassBatchedExpertsFp8", + "TritonExperts", + "BatchedTritonExperts", + "DeepGemmExperts", + "BatchedDeepGemmExperts", + "TritonOrDeepGemmExperts", + "BatchedTritonOrDeepGemmExperts", + ] +else: + # Some model classes directly use the custom ops. Add placeholders + # to avoid import errors. + def _raise_exception(method: str): + raise NotImplementedError(f"{method} is not implemented as lack of triton.") + + fused_topk = lambda *args, **kwargs: _raise_exception("fused_topk") + fused_experts = lambda *args, **kwargs: _raise_exception("fused_experts") \ No newline at end of file diff --git a/model_executor/layers/fused_moe/__pycache__/__init__.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f9ddfb54cfe18fa96e8d7f57f5dafa258c7ba16 GIT binary patch literal 3100 zcmbtW&2Jn<7O(1=X^+SDj6Wuh9XrlAaZDz$X9C%UWEG(eBz6L_`LF|0Us|2%vfa3< zyH(wj#8FsHP!2&{-~y3WD@EKQw0HgoEq$3-ZB7aKh+HNvaVaN)T9l$}a6_c5l$KLG zEizU{%V}X8!x_CkKTm4$j@&Peu4QjcY z4~e2x1UcvIW5aAeORX5A5R{v{(M1RwaeL0fh*7uqHugXi`kT^%GUxvX4hlk_|7Vmph=zcH*jLE9x-#;5B9Al?7M_M`9*cRW9Ie(?f(-OQm0ytFZ{Hp%>1^ zj$CE&?DhI{XHKs_$Ew`BQVlmCRKI$ab5Rw6%ec+1vsyh4WR*J`Oh(leycAmmtZLhy z@5Q!V2{%+BxXPsTT-Nj^hr!zzywL%GZ^%y#^1Vq0M;kAFnJ+fx|C8?iGCy)_{^tlA z@SXlas!Yk^y7X%>Za;V^DR{T@0J{%iiO2y^$w7eoWE+skeE=#48MQ({_fzmj&`6{~ z{@XLySd_+H80IDDo>X_(UBwE9JeZh%qYs@{*SUAswXch|0 zOf+Yfr56YOEAq(|X*Kd3B_q}S$NV1 zRmD9~w4%$Xz9g)sLiHur$dWQwxnvjz;kqh5K0o!WXSSSiK&rYexl&|(l@dc5d@ zlmQTtJPvLmG+9cjtKXK67cuCG8Viv<sImOyy*KV%d@%F+gTqT-htY8=>c~O1w&S0(qvgmuD&o&s5qxQ%Sf?cdPnaM?2bk?WBn8EAXYzovw~| zWI)G|kOWeLI@QrGNTkjkX|FTsGoDb1CKcop_1BJUMEXNww=>W*)M3>1*yE9=EH$8= 
zB?WFLdgM_3eNbuFtHml6#Znb)+wq}A^tUjw?Z;dPpC;Sk+_pPB^D7B2+Aa{Iy$Z~T z#HE{Bo$n~~mVsu_fqcT0W`&a-nc7923Ds!QDd7Pi5vgZ7va&Z5EKwm3vrFMD2If(hijNC9l7 zD5XDS37vjKhQB6rJBC4xAI%g%HcgE!0Y~=b(i0tA|*|Y=lp3~dD{)$&M2p3WJCc^#T{5#v z$5Nyu2y$@H0%_4hV)T$66Uc|03gpmV&YG@$*kfl`B(z` zWxo@#uo#njmA+D+k}YKckE5KJKul%Y zTg8RMBnpaMD7P9BQy&IO17aEl#BLN4*{v|cz(~Y_RzqSBj1O+IZ-J6XI*?*z=A9zxkb;o^w;ZxsyjpwH|Ua=F{%j| zT~o!^Bvp{5Ed-U>QW^wla>#(j2tUC?+DfeNT*M+j1@$q&ATcrh@VT?XX1 zEIUt__;pDXk;;pb!fRzgM*PaAjx?gi0tj4OzVYVqsUWm}VR3PJahZSpm)GvR$zNZ( z`|2(J-i>R^i|)LorQ`EJQD-#`Y_X$4S?5K4vxe-9Qbj!2O1;CE9k+!r((70)``sWx zAAmoCKmR>6pD^3L3iL-moYmWNDrClyZ=2a>cYO|VhsX#WVE`2O=~0*3S;j7LFMJ?@ zxyOvN!dF0uaq#^@x{p*&7B&&)9D_MxFej)Y2fKoE?5krzm#V5(8zI@!OJy z9KP|O-3OpH)Iu!>|2cR`HpVMW+nK|Iy1UgK$O-J?S{^TElo+@73<$>V<9 z4IS`n;Fe}6rtE48VbRGZKR3`u0-LH*+8OB)x_lq{`G)i(9@sycOFmX#@` zDk7Oj4^X+TSMd~GUz=Jbd4`8DO|@+2>$)UsleJBpBa4yd6j|^FH19LbQ6`c%!_!Z~ z)8AyL&Fr#~UA7XrPY0g+eYx1fcr(&*+#5~g8iDL{kj2=3Sk+FvMG{x*yp~%hiVQPx z_@n3auR(fFnNH`la&QKn2CA}~j1F_0&rR%|PH4rwm|SM)cYNEv_kA~+4_J}e@ozH~ zdaWH`01$_7!WHI9{(uFp#9-)<{k{RaHE6W9srCtR)<>Q`_%ydHQt4|5^Nh9fTQ^*}xCVJ-yJQ5W|5 zj+~u66xa?v_V2dx>`v(aU1f{-BMef;JZ)S^8vyz_7xVf`(Su!RJ0u2P1_WkIxiqie z4(+;`KcdgL^zFzCeSg8_@%ruXLaJCJa#Zb#ymN$59U8te)sNJOq4|uh%I6RsiCop;dGj+}GZUHS&T8 zBvI08f?kH&Km*qEz>3ch{vM)JiMp2u>}a`G zcb-&uoG``6D{CDJJsDwgVN+jWa+x|2N`m9PEknyTOfFR%A+w{@XGA?rZgWfvvlz}3 zn7lsmON1+3bDmuMYU{y;&k}Nv8$|DcL>&krA};5lc+;Hlgjx!c%%_p!1IoPChX;tL zyi-Uj!UjBejxI*d?yF?xSCC_%UW+h^lo=l};v;5!?n!*kN@bp9NB*Rm*-J+Dl9|0> zWN*BG*NP7|;-ihgD71T5UTFk+n;|BhF;g>7QZtRfy~galr_jb{_QEkUoPQe5Td8~_ zb+!>X3+=P1n-;8+C>V(X03YA^=nmjoY{TG=ssiRM$O!e zk(=4;J!SS5jozZ!d(r52yDf8TQ z;qrssVn9}?e%Or*xfqe58%fymqbmU2N#7gyi{LV-3 zd|EXN^G0F*Nou|knQx@#pA8oNJo+$2tvj&HH~E*yGDGLhp*dq{&LXDH7`-#r@T56B zYYfkRSvKcx8Sopv)n50iyYB3PWxWu6Ggz?Bo-@y0HqKrKw|=HRQmuh8b70mOn6-vS zK6~%cdxTtlRQ+<;oVjJduW)NW9-@xyVtP&#GLSEIev{Hvl1k57jFm@N6>Hh0Pb8^_b`qQkZIAK-2WIEc-kL0sTJUV|_n_0<7;l2Ac0zH2)(<{^H1-|51T!D*{4;WsqRqq!HfMGBHvQ1=rFzMMXa)g>L^C+u3==5A 
ugrd)*1iH%pmSu;mXs+oez+TUp&jUtJu^A*7%m{IRIc)?-_rnCIru+|chWY6L literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/batched_deep_gemm_moe.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/batched_deep_gemm_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7b49480142fa3e402010c246fa09f9769492e429 GIT binary patch literal 17338 zcmdseX>c27mRRE??u!Hn-c3?Gz*D4T9h7XDx~9 zaAvkdWt@o0I%|4ovO`6#44tHEw3TeFHap2W*;EB>dY9NXe$tnwTeEJ5fvGrdVCvJK>G{CVV7qj@8Hg6aIL^L_@rBqA}hy(M0ks zvF3QoL`%GNqE$m_Db~uiJqRV#`(6FSHr6+NqFkoN7!T{6KBcD0Z>DyO{JJgjy<6n_w#cvFBHzD7ena*A zh>K}_SN}jke4>YGVwze1^x5)A)L2Bvw6N6lc{N>rGp$?Xw^hu1)t1%U$?F62PQPoI z=+jciD5m{gis@jD)329%Rbvlya%^H?Mic0KiitA;tyDY7rh}0rcQeE>!EiE>W^bjX z<|w~8FBp0&6pe+h#n^yGvPKipbTF2joMbsj)*j99EOR<}A%dyr%K=qwuyhr&~s;RF7c@6l0H`z9=CLZFi zvy9Y@BaB}UhGNmlL>%D*Y%@O=4X0+(Q^~~eTd`PtIG$wKSdhKNhBJW2Fzg|g?p_`O3g?OWVi4^f<#cpA;_d>QY^O(5NU=V|N9WkQAH!=a4u*TMslyr9Vs|n3nTNH zh0vlq7oR&;sPR2EwXB)CL{k@uY!^-2iw0ej@mG}2WGtE}mv>?E{mcj1d)d5mAa5F2 zw>Ey(x?`<%uh_cxQ|sQmesA8ow_tZHXx`hos97BO{#f4DO8QaL^WR*&vDBV-Y=iEs zu7#fWU-)3(y?uFWcV6H93^1f5&2h`{XKAQsQ|dX?8rk-WQQOLEsK#K_je_?htttZ#Qwy3m-v;bE zD%w?B9ZG1$brCv*8nrK>nbCcuS0R1H99Cg=>KbP|zq0SPFYi0MP3U9{jG+RKOC53f zEp%E-`wHRBTHy!ll59(RRO_MZmGs9N|BRpdMZjQDOsr{bgAQ2--1ise}V^b z$#x61+`bAZ{!JY~vZA!{OG}Y^ZhC}nwdc+6YWB+E~#_@MHGYLsM1rl(j=$uAxVXt?dzs+NQKp(IO+ zOHOj3creO<#+D@Fch^oy50aUYj3F+Rm}Dgbd2u+=ASrI8q?&l>Ry3Z8OFD>1y3jRV z(nJ0z=vyE#u=ONJ^>8c{Pe}$MAtei_k3@w_&qxLy1TP!tmCVsZDg#<$nv-f~f;ZF% zPa>8uXl-c{=enV5u7@D3$lYj=l`N%rx*|R$S(W^m8&Ihtk*-WsRO2hF@xh8}d~i~7 zD$o>kSPnESiVI-She1CE0~icqfZB?rOTm09M_}MNM}`(#-^U3~um zN4+2Ru3G!&4i~J>g|7MibBEXc4T~r4IhNha6Zcym_KAVRqW^GS?=95#F7I6F6Z`jz z_51Vsx`MY`t|NMPKiu}HQ5-uddQZ-sDR_O0O?QGzHQ$TzgM*NE{`nF-hWB7 z?R}*GBbMts||h2hgKciS9Xhzv7GL&t=`9;y1PAhdfxB&pyyr> z%xBMsJuSbLN`$C;e&iOF)?G;dm{Uwlb)2bRZItoL6P-TNM1c$EH2)2YW>mHw4!{8#OiV~p#9 zP4*OJ|$q5qn>?|7w30Zp3yg{_m|EiY{aMPuta)i;4 
zLsQKOu<<-lvhny-O<#Qjpk)nNW41=9VRW;geKPt|YePkAEJZELB$xyp@^i*G3tAn- zG1n}Zn3_$kO%<(mjJX80GG9{_e(f=YLM*@4y_0ndZpME74cHGO6)VzFuBpbRx2du6dm2UTriU0k ztb!A?KZUY1lmO|77K{;^X_SMT9u*gP=Vd*xO5}VH(PV33&cCDuS--lhH|rCuUs(4e+taINcG$1Kh)H%yG3XRh!q&Gd;CdeP!lnKdfGvk0$kaHT9rA<%8ypC=vf@Fzq zo$5Tiqe;J@))Q(8HIf+wT@iSLP$zf=ADnpMv?|yYYPB;)QqRs13$5UR^H(#Rrd)!K zaWTe^+z;wyh~>2icrxz*i4yx7ut6H-ckj%Vp`oDxdhGa{0lJS?Qpd;Mq|-_I=-EB= z&1iaxCcKjdJ~~JkD=nKjET}5X&{H9vCI%0>_s9T!aeyA}J02LKFHW&^B+Bt=s1S?L z$p{TvCLIew1``E6kB=r3bd)DR5R<`BI30ZploMuv2Bjh$O~s-tLtmRA?RXaKmka?l zfEfvzC>$^hh6Z~O!F7nhx`dr%65%O21pVKF@<}iS$^CO|h~aT$p>Q}E2Qv?JM<06a z@Ho^7rKb2H`V^a{ubv&fN0Y(Z`+*pi_MQ1~4L^%R@COI1Hj5r~Dlux7CQMN~d zp|YeD5&Y=nTP%n85_nTlh5^h=!ymGcY|^JJrW8eur(dM6Y=4vPr>~&mE=#kaa3;=Y z;=O@4uhO9eLz-SqC)1&rQUMxmgmPtYaPb;9YZ8|MP%jN?KxO82-q zql2)s5fW@5Tq0$+Zomo7K4GvTvhp^)dL;@pf=z$i>oEH?@Ez?Cm{H57TAKTL4mLM=>l<c+{ zjPbZDV9MffU}DvI#(@HHazF-gHlBnzmo4IC|L!CR^D>Qx=+QxPLZlVAFd?wVFGNqh zM!!BvI)r%!ilyI*@)MA|3^6rIQW#=t2T5W0cWtNi`(DND z%5(OwD{?C1|9_M&%A1u4HYEEvFe>25kq8iEBFxg)*z`>n*!1fogrHs@g&d?YtUDP? 
zUJJztxslnE5B{YlNX`5JX{JI!;WtR~hv@TgJ;o7MhOpowEDbDG1|d`QO%OOd4_8?g z{D??T1b`xpC2W(VkT|d6(qFBheuOKFN`hXgbCh{kGbyF2!UWHg`z+wmy5kmHlwz(zrTJOCM?I1UAG*#LJ{fdF-)41>Q)Gi-`Y0EYtk zDH{q(*J|>>X?k#T0sZ2Ov~n|~)&YJ2DDecP9iw!0oao_Und=hnj2OySL3HoMzGDOQ z2&gClg4-5-;{DSXJA4K2oW53%*#FMITem^=ChWr5@$(msC|LgA9W^Q&B|!okm+Z8X zbXNod5JU*n!^C6^%PVAbN$)7jClFa9qkFEZpzwgLDtn5GChJW^63WWRQ#c}1v)&od z^FSK}94ZvYr+7dmX3JUFR2HDw!N=z)$h~IqN@O0tWZFQ9cTn*{CV( z;lfZ1rcNWnz7-9#+{+|MUK}2k!8mG`i@?D7^ep5|{+mBu`|F%pM zlI0W0G|gvHsU(-C{~qkkA!5tSDjM<-{c0ixY7Smz4k>q;YIT+w;?Uq7(3MStk{Q)K z!Yu+#k{;BaIEP0$ZU+X4F4uwq4S{4Q0QUsho$(!!gk)gjsq_rFXO!$Du3SJe}VB<9`S{9SoyW8p>wB zy=#8g!nyD4%j<{p{^3>qaBd%53Zo(LTgT_d=WgUW3Re4qMy)iK_m8dW$DoqQB3n!g z#JJkJ*e;rzmMo&F4{WAZ8<bfj(>do zr$_$$Nft; zX!};=?@xiT)Zl>YbCY}F;5%Q^UAVH;|HSRP+jOUC@f)k|o}97Z@!##c)0e9$N4$+W zUBT(OJu^S^{*ez(-#h)OGXMhwWV9OlDa|t-MDxzTa}#CmUSdR3KVV(3Imv$uiTet+ z(tmxyCja{uZp@#Cf(f#hyBpo&eF(mvb?7(3gk6_ChP{$H3|$CdLQ(wM;!*H;qhW)5G|; z$Zyypzj2HFrY-WDx5#f{eFWFmEz0FNX#3Jq`b$gOw69h4n&R8O)a#PrWg4!DC1qM%W(*_;?>qI`-}=Xl8`E2Uty0PnU9{#~f4 z;P2TVK=6+=cd4>t$-G{meisa$g6@IhzyhW=YFaT3SFUUtHjwuMWQG^CkSak3=RfW2$0fCK_gbu5Jzn z;f|T-L6bw%G@>^r%i`}ra<;Adh8a?^_+LgYo)y|WjI#1?JC-cseB%c}mNv*rV8Qp$$4=LL+`Qu*Lj-u_`!2Q#@MVVWKUR z-MMD#5p6wdwq2ra*JGQbG6^!@Wg#L_mQ0m1h+`+@id1-)n$g_Xo;iA-;{GutE0J|8 zIy)~L^5CR1+q(s>U#N+mp3iVM%~1ts-Pgc@y9brIe}chgJZekO6>DWYl$!1PDoo0C z!3Uh;-bct?5Ny~?CC+siN?Nw=|p=TurbZd8%!wvhy7D)q&UZrq{nX`z2DwV@$Q#zd&*S5XXtk`wJs#gA)|7um*1u z7waILyTPLxovee=cq%r#Yl{JWMZ0Y{9xEBo6KCHl@f{_&OPbS?=8U8{De2S6)OE=t z8@h&aOQtsQIN1=2c_Z{2&({V=_D;JkR?e9lp*?^rss5)`)`{%QD=_D|0J zSv2oE3)Ow?OE0awDs~=v6#8W3Pa}EnnVhBIX<7W{ic@Sm@W`EaAA&=Tt7&m!*_U^2 z&zrW(yUUBJh_XpQ%q#n6gQ8X1D5n77Y6Qd8m06X_!m6>X1e`>`{a!!UQsF9Zh6&j8 zfZwS0_rRaypTcPcd_Iw2;Q{y!6tHRZTi^7QFhLb%)1#Jw^FO%%6K_nTpkWNx zcf*npKZ~-TLK>af$piihWRpICdBBG;s<>rq(rEl35BQc;=A$Yh=WB(Ua($r&*mljC z*^$F%PXlejb!UjqBuZa<3;`RDgz&Caxm^px?Q1ZO#$rh`BpC?7gMrMzhm){uI7Qqw z2m&@vk8vby$;zY%;UjR#D1SF1*}#T{7gNC`Skxsy5`wb)p)i*OeGD3fQyEDI7n+hj 
znr7o19t|ZO7@dgWh?~WJOfta{;`*1^Bhs2wqx2}b`i$dwiLuw>b{6%_U{2K(O+BxpE(Dg>nJeYN&d&h(_#w{UC8vs$|?XL?-g%hwMqPp#JO z&Y23%`ka$cEw_S`ZI>^t(RmamPSV~Sf#0ZPLouO9Ctfm>~q9Jmrug&ru9#Q!mj3W55dyd(m}1=|q> ziepapSaMVqEG|J?O%qo9xs;D!s&NqosusLubko@9rbnf1;*a9V)@JK~XH=;N{$dIp zgHL_0O7|ens&FJY%;~^~-uM8C3!XPXq@kw^LeF~ARyh|AoHpVQQXA2gD7_vaR^d+I zJ>nCLRq-kvTFd_KS%133#lSDx5B`wxP=Jzp@HFw4{e8;(Yr5nQrQ!?UJ}6P} z#wl|ugwXJjLG{{t1J*O!m~E=Sp;2gpp5T~0UDbCAykj=t03OfAGsKTsz8Lu5@PH8C z$UVTj8C9}?U2-eiv&=rY1e9>U3jv%a;T}VB!FN4D@Bx6!1oZ5fWjS0n?%!d6qFOTD z98vZ%&c0;DlXNi2g~BnG7~i~o#y{{;eYtsc1{X@V8{2uc!J-6(4qBQi0{d`DovHKA{=BIl!gWUzm_ub>)9(l7YjT?0NG`N4`^YZb`0niDx$nH4H?`*8UhfL5 zb&ZK#V+C8o;^-o`wC9;l+YOfR4ruez;>coX$qhQZuXAbNihJeEqw)OqqpRLyIm^1Y zVa+=zdIt+_BZZFb1$V>UmOCwr+^V|+7_o2Y8Kw7Ga+c>d%G_13yB2^SHy7;Ag*vR3 z2L_JTRweiKA^KrvIr9yN6^BR4lW!{m~<-YMH&^$7< z&k-8o!+{1gizMKl70e=*Abf$wWZ;v;pkxc8*L6%T;rog3n z`}l+jVt8V#0pvm?i?Ir~Is-}1q=fn(ypvApw`Xg>b8 z1}J@|Q|lF78iRLf=Q9dkMI**9DsjExl4fNatTMcdHmzZB#j`eYOdL7(jDkebRBISn zw>pXjcmpanXR!ujCXAUeW}%#(q7`E{s)a7vG3KC}+lo$%xu}-5wdNgS^NtlpY#uAR zG1Eg?oVm=xg~iT=%kyt98J5D!JC>$Cbgp<;_=ks9ZhhRAcfKr|$Dh?=?Y;(sztGWD z)ZzQ3arynqDe_)7+nyO9zqsoqmTkI#+!2a3m@-jz&+Sw5r;27w zfq_L%Sutg!>YMMz?!=0AOgX4p|J{K*14SpMT$I;ebYskegR8|@9p!Djd+N@qrJZ73 zPtl86KB}(a?*2Rbm$r$u-Nky$@+)wb>O@ahu>rFhDPPmwH}1SqY{FEt0;br4sn$*N z)P|Wf)zkm$c8JOP2yT=#4Um;$KSu+rzZXTZq}M3mbJ}Bh!AFeZIu~ z29l_n@qY**sNNdQGYh4$|B`z7uc()QPM!Z5)%i2(E`NZ#uPOYVh*+J*3f zfBwMY5xi?$>b>{s@@qf7a{o%+x<@qZeP)36z~Ho|Z#RCcv8X+&(HztOS(+^07QQ7E zopRbmxoZiryXEwD%3{0SJl|X#k<+92MaUcTZxnaP>78o&1v$M-ZNFPik5M+q?fvun zi~HpCex>|xUnFT@q*|Mb%VjOq&{Xg>6b28j4W1AOPyEVyNUNzU)Yu4yI!vqsPeJZu zzy#is))mbpYN34nMN%IakI|ZY0efBae$afcIdAO}4c*V|Snv5M{Wr84%i{U9mIGqT MfzK&S5jOCD0Bfa)7ytkO literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/batched_triton_or_deep_gemm_moe.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/batched_triton_or_deep_gemm_moe.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..3f9d430de011040ac62bbda07ded720558471fea GIT binary patch literal 6732 zcmdTITWk~A^^V_of$7A z8dP?R)Sx~VDpjFcwMwOafT&dZQ_-*PPrv%(0krmxv=UV-RsEY1P*?rxId^R5fgPg$ zb!4A8_ndpry>sq4=bm%_;`4b3JQ>@M(tl_p zlR4#dJ~xw|1#-AL;Ifof^11cy{j;LNXBYTnCOw+Sc*M4ImS>5I)FW65u7(|52}Nvy&%O7G}lerG+V{pfcpS<&v(8m8~S>H9}v7E zH^1jqK8$NfpRoxnj0*}r_#Qv}1MuhI-va+2{6jEbe*<~pB`1Mxg>Q91uU7Lbn&C56 z`i-0rfw{s$8?0rZfz0+M^o}ONzWJ@)+j3*ua(kL^_cWpR zZmab^u#MfqUeIORGP%Y~w_hUhefsWA_W!=L#c_0hHhGQD6|=mOUl4P$9za}3%LVFZ zlB}~Iqd%c%fJ5nlWG0ip$_wCr{4DrisunD8;}A!;U7KbI`8l&f6r|GUmR%)30lsQ| z06cscqSVK#jA~KIyu;*dgui8Kfj}j^t;SJp*BEJtA&tk()6iDyYrdg>8|}|-rejM9 z)MzzNTi6dPCA_Ofhk3XfzW$cll~Ap4(hAQ@s9ws{BUj9*$tyGp@}e0vbrwP< zKT{avuZ|_hbnlf+KDEHhb4iecdm*8+B-~NGqb8xg^EDxHhi;R_%*>Vr(XqOl=Tn)a zEc3jK0%Q2x_|L+Sk}V7s(uIRcQi3>q@LF;7&9Q}1aVV3%GE`Vp=JL6rPcoV8P&O}! z8JI~-6_va+1Tk5Z+1u4qOd68<}U5q-U|{no#8Iae+KaF z4f4WI-2Pim&DDOl^Qo)<@BQPIXuQ&SpwiJ*?ikQI23|OA`#j6Gr>y)mA(C%x4+~ap>~31Q{s+RLT%;H zuofCFht6oBGqjn|LWy!{LJLhiw=&^VjA2Lfuvl6Eb&+s;IkeuR#7;b`i717Rm)PR~ zw+Jn?E(?7Pq;;Nv-B1SdWAe>fP8AMm|IBaX75|+HtDu2!mOxUuPCC75ex#4xo2I_ zSUU8YJ$ps!X&5C>0k}b^&mGhP2bXOXf3WP2YySAM75x6L?KbGn z2nn30!Uk?1S$^*^J977MIeJKo9(ocTF{~E1=l@*yp}LmL^MubP3liqYn-z|E=hDbd z6}~R&+>TW~jw&D1Vq;HY6yKb-QY#Z-;7^!m2|Gs#euTL!|E<)MKjk3#hkL+ZW`Q_442|pnhsm ztq>G#)c|ON6wrEM>D;NjG8ao{3mFkB9U9f-m^_y+W`tOBRuaWnJ{S8?OwZ0K6FK1n zQ7EQF+Fwq@Ci6-RU5US%l*VIAfx%cZC&a+5Ou{B_?4C*IprBd;9u>wzS6-sxc$_}& zq`oP2gQ}$%0Nqs+tvjnCb#Gec>r@Xm%(-MJEzpheFdFJcJFG zyxlNl`Tb{sNGY(V_<(w^$WNslvs!nf~AbeI-l0s5RUPz=i=E0co(y#mqkYE`)nGxKhmhj8n9r+K+dkG;6d+}4Az zCMkhn2*Du)!w5zY90ox5h`LPOYu*6z($J2#+nja|Wql3+QkGpbW$Av5wpz_m8YLK~ zw9u&v*HZh1E8Lz6*Hhv8*8;Av-ykmVh;Th=DTT&M>^Q&;iO$jw|G+3?4_3|1=Y zuH;A+^VWA__^|G&H%9c}tJbS`CJTkkA}_(ZM5z`GA(c$$V410nwPXwElv?%A0BLH~ zY~ySz=LNbA25xz-b1GN0Np8MsyBoV~wm;RSx>b+rRaw=i`qjXLYCX(At@(;YnrWAv zeqX%BG{d7HTN*_$hTu&E;|ShDa0~&p%HxQgK=3w#lL#gdoI-FKfbOh@2^sZLv)lUH z4Etw+wbb_-1Evf7G4zzP0KjVZ*9$GIjj)7z%nm$jX)U)5YAu6Uk8u!!xwiGV>k54C 
zyXC8}!3x_^Vg2;iUSY!(Hd0}CQ`kzIZ8ggNm)%@7t&217#5=GU&n2@W&+8r@3Ra<* zL7e4zxKznhIo5Pek#G|sWzFX^v|!a;1u2g=n~S<#DZ;s}Te1s!d+myIqF$W5Lu(US zh)9^(N|((jbz;XX$OgWqSW}z8ZTaj1s!zGF%De zCEY43l7wp4ebs_KC1x@_4<%76P-#hY(%c-3*b)LXBAJ6T=i8eED%hU>e|h)VJ>c-7e;2(UH3*=O zTK^RNV{{Es!?DNif9Cb$D*doZHh`=2W1SYjRr+C-fuI4a#JapIX7~{Elg@4XcV)V;&2bH5HeY<(y{I{on4@4HKZ zlbU;CErh+_W%h%jxYiZ+4r|x}hh?;Pt*CcY!-*{y@dZDhx;16EvE`|?yx3w%TjxsV zPR8(I%TGe>D}#3ijR3Yd(%NpcAQr@hhY$;s)?F)8ccxa4XyJWEE0Q84+_7@(&au@# z4YHCpB(+!NtcJDVUZVp^yGSIu^3k1-j3~A`t742UZ0#mp-Q~{1TIb=1g4Q{1bR#iF z_8s_l4`4K8Qxk#saS^OUKa6)hiOqLC3G=QeQNQa!ele4j_ytkQi5a-K+;lgjJIn#P zyFP+;*)T-9fRl~yXtFES3$snvN#<49fgK4j?@LX;KDwh3=DKSW!|fHlz|+HYa8XHT zKyMZbk{L64xNya(8SbHMqh_y zlwsCfgz^1@ocxNMd}*;TmX{8~Oflb(55FOkFC3zU@vMGO-aD!7oqS2KMaBLXJzXIQ literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/config.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2e47128d731049b794378e2f6ac96d3989e39cc8 GIT binary patch literal 37749 zcmd^o3ve7qdfw~{*cY%1EZz@d0D>SF#DfIEhbT(o0|cLfD3OvW=%uZmFOM7Y#BN0pQu&vtCtaq4t&9J_o@d6!^}y^U0dt8<)ODwQk+ z*_T{Wm3)8q%$HcT`QH(CU%Ap7N}cPt$C;bw&E-%%T)l)4D+vgn41mf;qQ z;8OodPf0$x^!$L<@cggjmp4ik_^ZTUKyH`pGo^wcRcXFauT=epV|bHPV_sfvXZd-l zTGUa8zk0bt-b~2VYuh4MbHDaGYNhm1x76^4eK=?pjtElY8-mm%+upHp=!QFG3yW_? 
z{Dya|JXUMz@K!ao1*zhS)Gn>%`CsX2sTFM)=Jgm|scn_`jjP1-*0!&ix*9Gvty!vL z&D70nrfyj?wMy<9t&@VFWe<4dna>>;^nS0@iTC#sO{D#Rpkx(@w{q?PT^^RBYn2)} zg}T>FC5YTo4+jNWqd`9mw;D3xntwMBSYxM};9?@bGrY%dGbmf*mTDSOb8bxGl5 zcqAH5BoHV)7E2zEO-%%?S?6<;$w)jFj%FP%#^4!3a`~yS5{^dY=%ILQG%|)<-Z42D zk|slAN_=WEyMaPUIhKekp^3O8M?;fpfshi8T|f<*vV!7e&mB4ydiHE+ z;LOXX4xiS8r$@%+2^rblN2uP@GmrFPjzq{v15`c5hGNF;&g@D|LR84O3FvLq(sVq{`6t|Y}XtgNV( z?GaB!<#0k4C&CwGF)^jc;^p|1BJwwg7iEPi6C(+6Bt9`2PDai}<({lVN?x9nvtGaz z8lBt|n%)!MlP#xEB$i~s?Ys1Fe`vZtjBr_eWHK~yDU@K1&z2~1a!QGfI1FT^i3m+m zmT5M1TbQ*dmdln|D_rZW4X$n0{<`I|eO5^7h0d#!7d--=3tI#MxgD47!IGJO;TG|{ zz+NVX#ROUw8xskmgp`qZ3^7w9NijJti=)6*EIc725KdBBG8~B|D2Y{)oQhFd5GO8< z#ud~C@+TuBQ_-*@esL-sOGajZBH)X+u}2h7%W6^Oa)_8G4yKz3M~Rpr)DI>@b)rt- zp2^ES>{VH(j#swt9ev<4M>4LAjAtDzCg@NqfdwT%P8B)T);!j`bO-@YqPDUoXlVJs*bzhp={luONyXD?!Yug)_XA;o#^G9E?<(t zk8!0p8oms!=+$BF7?WeN5*ZQmVk*J;b4a0n z_hUj3*w3cKr?a+^cr<8J$`F-xpE~?P=*;1v)6cz-bus^QCk_t!8pY)m;frhFSeE)fFF^Nn|&|N4fzue@LJ?O(WGIdHA?gUY5Aaj3%^xOpVy zX-wH0a{(bxd#yCnuqoBheXIR<_Ac%?n%;5rTImlx8$PHJ7i&7xHJ#Uv{Ls^A2rYRk zwem_U;2Oc7{%CBq3^@hXlZ9c6Bn*QkhQVgTcF8vEknF=Hk^`&Yl3}M*I_$da47#!v zFMzcnfu5DoIi)iY!9j2=>cG!t_VZ&N`Y|zf(DGyA82k{ckKqqb-TtAVJL?RENX&&o zS$BvAvZ*Nfy&*z#R81)zkVnH)(PY;1f*gx05L%HLIqM#fqW#uDgsk^KiKMDZh+ zum!f2T)9-T(S!vAaLO)8i?ge7ZrOu6>WXVXeurcoWikOZl&HDOR>`Z?TB_zNe@H&3 zn$P!;d@jm&*$T>54ARJke*HWRM+0(VWaf*k%O5E0=n_(t$Ry-B8_Ae3XW}3}ndlME zjMIQma}RMO920rwb9^L|BNz@b_KhSX7ujH!5K+mr92px=>cWA>pR9ukU8REiN5WA# z>kCIG$HO5^pT-g_!a6gVwV#WG6G6X1ql2=GoZVEUG!&BJBcTu<%$h06hS4PJ3x&e5 zSUkyJj`V|6gv5?=h#V3g*^)DCmZl6)%wci{$$5qx>W`ce50H^6rp z=NC3_y?XfC&Kpl&e{wF6_B72U)1HkB_Rd_TAhsiA@0=a63-*l|vz~m0vfiw|-E^xd z)w1nwd#Ykr+OvDX{xlN1wq1Sh=8Lz#aO(^61L^853-(?xc+1AC2d_D9c&~eJCezNw z1$!GJTQ^-DxVGs=*Y&Qssu4h%M7&J%-qFJ=SZ8 zg~kl;r5ef%v3@x6ZrmhO|z-#3n=RRz0@E5ZeXKjcJd^JGI*F z(iW`qyQQEUkUH^KD)-0%IiTtedG9Hawjxh2bP9pue8|a$L={%2)Qz0mI- zLwDI@4xLdRNxiaH+J?XFa)Dfs`s6;8Hsg{y$*ID$V;y<=SE+@yT-v#2>aI0Ycdwbc zXU)_n)=b^IX6ln`rtVua^{F*epGN9K#t#X*Iiu~;GjBL_39th*+5_1@!E^=75-f-F|LQYb7eNx1L 
zI7}I}nhXF;OeK<}oQXmu6ibR>EV`0;m5;je;cIGE588&=f1&astXItDeH@VtJWL10 zX-wHvl?Kh}dfcoY4HQ;_a0=Wf#4uKU@mTaS*N{*H+4=wq4pn2+rD`kWOF&#Q!K&Ia zs>CM>v>3rIQPW?^0^+DGRBIuuv6>1rTTDv=5T?V51nNA; zCZ4orBKn$QE5r;ZRkx6ZY885U&z>YZ0)I*pVpOZzj0Lhqi?_~|>#w;52Bq%YJ=BP|e zO)}M6A~F$KsVYi>{p7KToM%^&GnLNBU83AG)+3&agN!OqiHvK;!xvf0#RMs>@{4o^ ziO5>Z3K=KstRQCQ9B0KP3C&<^`O=8Y)T6+VT3M3T1o#g`YorBhfVsva)B&Q=_;f-9 zONb-mn4Zhg!u1A_rcp=~v5K9X!eT)L@{{sjq@dmc920by;-lK>@MWY$#^O+vCO{7; z4@WM@64yd>Qw8*H&U*~&qf;@JP#W4bDWT&Ny_X?$MpokDWE|>_M3>siv1t5UI12q8 zRx8NHmMiEK=+m@F0gW`spb8}ADC!1}aQfz<8l0kNraVDqdAj6_Gkl?dC>}5_q^RSm zwW$CRMwy1Iiy@aQpJ~N)qMK>9@-O{4z68@DROByCMHH^)WQ-CvKtcj>gz?U0Xq~}D z;iw{qrOWDD&&hI3BOtwjMf3g#vd9UI*NfE6ynX7Te}sD1kyB^H=T4tiwayyV4j?0> z6Y-2VHAx+DIzAPZ#7PC%KL;FeZ8Vl}05J}Xoa2kpg3e8HSOpj8C`dBMKZAx=%VLnd zTFo+-V}y&9rJ*G7Ka=1zG+z$|h)vbydg&Y8L}r?*+Er zNiOz2lkR;c)q5l-SY7Unw{6*m-<0nNqEdmad?j0pQbPrJAV3%%NQ<``9?WoX8jTr3 z+nz-^t|52BcW!|HYKEnE_k?s!p;E z$i5n=k}zdCEhsM`%i9*N?JR;T>y0G%pbmYSG-JlNJ;oe=fvT1R8ib3!mb9;BKAG}u zO*uJM&4E0%29Rd>OxjU^J?Am~Dt>53MQfIw*7;`0v}9a_(G%G>@&-$`?!;$vW1|X0vt`G@X-sk!tt_a$4bh z&QuH9mFMWyYg3IW<;@924Ki-5{D;5e*06q`Z%rqGQ_O>9DtGZg(_FL_WcdMVNtklq zu5|0Jv~O3+xoZvTorE-7O)!}Cxz~TyT<)9YeMtX=##9;mM75dd#V=58``2IFd1eL( z+!>xnDD5>AQ@vx@DZ9+YXt1kMy)wE|WGGfifw)KWh} z+2tqT6pUFZ-==pP?hgFk(QhC9y`gUp{hPyoa`L+;?+2exIiFAYp3gjFvhgYAMQrk+ z&5Q=j+T{Y|eE(&pd6kCoDvd2$^leW2Hm97M|K2mp5Mhp(MWLjWJbc55U9f;Jac%A`ihkOOjyucYWi`<&ee3@v> zQ9;<__EpA(Y|$X}-2LF0l=Do=cjj{#fEYiR%cQaGA+B08AAt1a8tPg9U&tg{xvp=x znO<|a(NY@-bB|3XF=$tjNqwu!Buc{389|w%*0#ba=+Q{|q{<*|(!&5-MF#b)C4+e4 zMgk5;nM{EDe^}fU@zyHhrthKR#*kw(A?h_#M159NTt#y9tt~mU zZQw{!F7})!$`?HbeHct0+mOm8h}kk`fV(Fo_fPB?jTOsuW|3KNR{C}7Z;{#LSTyfp z#@91#W^*1RYPRp%L%1Q0M!^blz6xu|d%nRt?cdt+_Lgt;yxsGMhre_D_m8JL2UE_$ zHSk3$jZc6t9EMy!z9b9B3HBYb#3B7}+YGvRHuM)`qZ(kXi4E1VeIH?C^D5Xln8(I@ zgXzwLDd)kJm>`U__O{}NnN+5 zy;IqX(k9w6(|7JJc;`Axt$b%9Ac3wSUAF=VOf+t%g``hw;d#I2%m1#w)$4DehLtOp zSi{a>$3{}c$%EZo35Wk3g=Zr;|3%&v{_7gjQ3+1i~f90#WnM^fz-}m(xREa5n 
z=kM%&Px{XI?~i{c_WQ9vAGyEdg@+_Bb#&ge-}i5Xp}N0n(btytwWXYG9|wfio<;HL zwD@!;uale~_~y~CAN}Ug*N3nr*IIeA3`VI{t*Jm)rhWTj`|fo6?wijpb?jjEVdGWD6=8!?e97%T`N;wa$ zfeR5pttM=ZLedetNNqT4)&_QTiltK<`d1f{`8+>IEhyd=+@1DyrktG&TDB=VVw7(r z!I)H37|2GcXeI~Ik4?r-#Ojg+)u<0*by=sy;na-fUCTSF9{4nNsAv9I-LLLGMQRh={9GdtD5rOgtfkadal@T zpejGqT4(bItsEE#*bypoW>aun3P&+$oH zWNI51Yd5EBH>YZMr2ISJW|~_Uo4eD^-M8E~EjM?6(6C`{=FaYgh8;IcGj(nAM^d$2 zDSua{qI$8S<6cDv#T~f$rIf#8skT|$#L9YbiZ-NL?-iV@)7j<>Kw6b;nm|M&vmq}% zvpDCe&kF3bc9mT$W0qO_Ke7C?5)jt{IdWB}&cjH1E}VqL7%Vbr8)t12t43j4&@eLU zBjd&{vD1L)pn(a{n;Ff&0ie<88IgYz{ye1>N=lIQBd$@6Z2~Ghe8WH`u&DuoNyaY` zp=o^wLRaMpP2IdR-Ozbw@7L-AL$sTVSw_R=Jn_E&ib!fXN4<%qh=TKg4oHgpJs_ zsYq0U9MtS3ndCFIBhIsZH@7U~WZo-kC+G|8%|tpH-owml#mJ~Z%rfIZ9Sl*Nis@U` zWY3rnOeztCVfTkV8-?ArZYSEtrax_6VjG*vwP{A1)W2BNu8_QAqEE|!P~XIkHf9h> zrhLq%4^#RG!~s7A&gS=dQ7EeWMSr-T6?%@x!&wVQ!Aum7yGm6bOhF4=m7#x7E@T!mjIot@~5h`01GIjsYP?c&EZ!ZAEp^;wVX${l)37dhQ5Ll=mNI88Le+-33-9i@ALzaNWKM7#vbY}zj()6J94>PFZY+0@CHn56`+;qD zPa^6t%d*puC08zZ>r$SUOk>BjlYiB?E90$A)pw$ zz~4H*_0IHr&i8{0{*x<08GqB<*!-ot_3s^ie>VzOl%rHdn@Mz;tImDRiwT*(Y0=l4 z_VuQA3_@uO4Q(im6YeK7RQFU7vFm2 z%~$U1NH_P~tx7lV{769LvQ0~dogmNHH78M0m!8z6C+(VVr=(qa(k?^Nnbnh4Ewy_t zNu|Jt>dSYl6*?Vtb7$r^zWMSUfZF|$fUueeKCq+;>imE?s8%$0(_39{cAE;x?@k*cTt{ZJr7+eL+v7%$5Wg4q&GbAxEua61TdJ0^X*DrvN(D;oAv0p>e2Ce z#4>LuJRY&=iAOAY>LE4tP(JmLmg;wGoJ-7~d2{9?0baE7@cLR5m-)>kYZ?37zQMXi8n{(h7&{fyvO3CLG z+`emxn}@Dly52I^pLVq@yC}NLhup~Hz^|H{d`>FoBA;6=Hy5~eCGBdSx2Ik0%N|N; z6G|$Vw)JP)gPGQj%$Baqfy4A49{dk>=l^fsn!*1~LHfUa=Yx$WERLp3b;IpfZoM+U zBVFBro>$$Svr+U?b8F5)9>CJrk}G9ir%=-{xAE4?^T@X;=VEbgAy7Bxc+2~yH`THG zy{c5*Q|Zd5a~_uFHBBpHY2`?}-F2%AbyaT4`BQ66HK$_G~R^5KDv3}zaRyk0}s zb&Pm0dT38MJWPM_Dm>6%Xz`}%tF?{vp=EGEC*g)xqIA{<%>k~2nk|8f+{-IvDsugT zCRJk@=F?L3d@h@@;%2bIWi?JGQk;I=n5JVvkHVUpACTDyY&Mt|%^~f47-i#YGays0 z!_lfM(hn6DH7JR6WF`oihWR0%8rEyKo3SIl#yO?(yU70o%nB2O@L*~v3f`7Q&qmz7 z@`1B#(bLRg)(nRPAGGfFcv7dXpS~h8C7XJ=2KIZ z>6vEj67%owT=)5n}`PA#)Xs z7%n1)OC<)g=FDpn3lzw@#_$qeXVe%biDK+|NBvzZhNuB;q7|-H)4=8PvsC0js!dA; 
z?NnVMquuilr=3xSRZK#rqAKi@Yymi=ybI{xw&oXcpjn9k=>qy!;|2!yAAM)dYQD{1r{WbYsv9fk)qs^c`88(Z*p zFluPOk47i9LLbpp*{EA57M((+9hm%}rK{Ef2Y=4f+O6opKm{UTm|_FeY$-a6W!4Gx zwK_18Qd#*|C^U#+p$)%WOOR(`Hp;{(A&;^mIb~C*vb?Ygx_C|bO72j)SxuXeke~)L z)aTJ(tD0TVg%brs%6E}>FJb-!JlH&G61=s)(Vg~ePT4o7Jexo8Y~TY7MvhE%YhHpu zl=*Ave^VIDIPmF}~~nRrJ^8su}t{gno}Uk}xgA>!_$;fY%A} zHKd>q*k5-Vwv+6?D#XCkR~&g!W!N}!fYUWmruU9n(^wKyv}J`MM$MV8how=^v1w_A zf^25y;l$m(tNR>oM_^W^E9QI{sf%dLuhTeSxKV;`C%`F!ZdEg_-A^!t!|4Zs8RZ5! zG$5%rcjiZ>w@VC+q9d6}zN}}At}W)(s2I7eeX8I@7(D8g}Zw&cI{m79wTXK3>S}FI??bBoe};V zfyc)PYnzX*&Lj^lSI9txGnCJOfr?fQ>t`V4kAY9lISzgez)6ZYKu$9`UnS??!ZBqK z^^)k74Dx3v@fBi_VR#;aL4HGi>-?MN?`*w$>HVtrrxv`=f=O1y6*CInqd!jgnCPY< z$9#(Oj+!|yHCD#V3Ruu-(u3+_53__)WAod*=!V>+|iG?U%y#5Yp-W*ANf)2fM) zg->Lx`4Kolt<|O~wX7JsRJ?Gkq;B#i;h)s~kj+r37tbW;ci;DJrM^kOMJUZS4Ze8# z@IdI<=MINn9D4r614Czy4I_N$xuL;hM@CSm{&-a1@_&WLc#EZMld+Z}LkH%w>OOv# zC8hGaEXl|2vSj(#<*Tv-aqN0keBZ(@u?tjj3!du@K_eBecP+)emHDs>VUHo~M%Zfz zdvMR3MaIp%_;cW|OfJLwZ0c85oRaab%5voOA&(z_6|zq%!@Xxt$#`p||8gK$m95b> zMNSo5IG-&|LaNd|yQR2uo_l1?gCx3So+WV)l9;E2Od#b+x?MHehriIdFz!f-Nd<>% zi297b37xCi<-lNZKp@+BFS7+V_79Yz2Rv)lO`y_aLP#71V-1GIqob|-lPo$i^d{wV@D&!RK`JQz&5smY$);OcR zrhRfUdI`(4@1s&>k(}Z-`U!5zrCr3IzY3&~tgL~?jDJY2qZ9w?2@l!~y5~E7XJD~w zf4Xb``-ADO<0MibCgQP2R zzfaBH%$f}yXIaIfw>|BpU5~N@78pD)`rFg~_SBaBY5)G5!=~CQJ~|Vxm%^1QfVhP! 
z6vZ8Ev60IUZ##OI#?I?#dp`G{P}^yfs(|EYHIkw<2RfZDb5haeVSV)sITUX&0v&`QkgguG3V-nyL z>RRSoe{J?w>;2%7h2XJkrKBT=sbace>os4dp_TdWjHMfPBIw&N?@M7HRuvIhrwVl_ z>r|y`C|gBeHpqXygYRb-U|d%bwnM}K<0?K2B&78`;46Lz_rN-aFp{N8)ymm|nO!y3 zBUY?O^u^?PUF`X@HXi+{ucW_QbaGM}qhDew1 z(U(xZYV>+FZ{YQ+M!8p+ew3)U8P^RKR@#M|%C&dW z6^%M5QTd`AMSk4LsTGMNCdkALQjI>qK`$SLUt(7pvM*eym)=oJ4k%;ljdv8HM}tt@ zg2hg7uMYC7sGshFmNUq<0KJOY0GYbfHY_)H4E1fv=N>;33etW?8e2FxA46sC(!i?H`| z*9^m)FSm}d;?tS^|MqDJqGYuy8)sSxflsYi{8obbLJG<*w-Zv(C>Bz>|yc_hNB6& ziviB1mfDr54@r?xd^ZRemllvxk}pO^hz13*-RLY7FK!d}itJ^t;ceTl@E5)Ygu>6> zsQP<*3(8Z2aN8iJnJE7n;ThMvVko#$kV}8dcH>*Nulkmv!iVA$>{47n$nNu3FPNk& zT#0~eU=jP&mS`8-5@ht`(TK({H1>TOG*XqH2$dYI22>U`Ql*1AhEN#TTD`D}As|S< zytYr2V`C(DwJVPJo)WP-zX`1Xiv_??8x2?;VT|)o?tJWM)flwjz@Y2Js-djTcIzzL zXDHjJvFyVLqEd5*q3n+JmVIo*?AJKggqB^r_oH3KqUMY;^=Oi2JzYe@3l}i#l?B3v z(WZbY^D<-|Ww#g0A0tk+a~;eiJTV!CYW19)oTkfesdua;NK`DDNRrPDOGd7JR4gIC zE=HKY2*XA!VOS=$@fu=8Jsy)sV?k}i1?xp?A@3Ok$=?Z9nS3c|%i6qWqzKBc;5}6q z=yJr!^G1rGY+o_o7R!+}%I+vG`#7m-5G6!tdiYT|TA~iZR+%jzY1wF4Mc$*}zbA)j zi%6YvCRmcKF`q`#C!>XI8O`ah9FrGsweKa>B*^__BK6|ZXsj>lI3F~{}y#v3s^=nvURYU=7x*@1s;b%l#!Qv=KY)$PZHrC&(@pzPp3Z-|RGrFDu%g(Y__qj$$H=)$ z&QWsc&^p(DeMr6~a(+b4?~wD`HJM!2axE~6Pa?-CN1;rWWNRObKk$;@ibl9y7e z>lW;FOOCP!u*lt}oBnM>lS|4e?nK^J@C3INd|_8l?FJzT~@9?}#`&S5CEk>+KgGJG`3C;OrF&MVL>5cV0ul?eL{ z;Q+!FhHw?am4z^=h$ntmL^++Prc$hy z#9-D*A8MstmNVIMT)GNl6nwS}8!l7PtPkO6TnUG`_$$=Y+W{Bx#_}6**4ESGs2K@e z_L$jmkO7q0X?^XuB~ydz8YitL(A-9vE3Oy9&GL4xC!E>*B7H?g*JyA&?%iA1@TWFO zT%l-Z;y+u$zlEE1OY*s?u~3|DT>TiYD%Pg6BMkX_R#gF9aPj;rKuh8&l*PsK9^Z|9 z*Z0k}-osZHy_JjJ=Crpt3MU(oYV3Y_pb?Lv5&Cjxql{PD0~zkZE7XN~Jn& zAux>hz_S|0%?`P1H_KX;kElg#$aw+21l7nlLRjRCzTKlXOI06{zHzwC8O3{zUor)FPujO9<=pf50A2!YEBYx# z=D!+X0C>d?;FkgQN{RPW+V@n-`PAcs`bTC^Q{-Au=lAY@8Bi+@h~JeE()f#1VOfGx zVDP7p%t%llY)s%kM zOa>h&uFynN%~6zk@W3H_<9VQ%JF**yb+Io{{8f8x8@1b{L$03vtH5@m4?w^+#mn}P zyV72q@v>qM`2+v91%Ka182b-c{)+aKx263!D@xnRrFMPk)3o~1hpktdQ8~NJN1^MJ zvo#DG)ck={p+iSs96Av?4U-l=iTNAK%N9(3OTM3wbB3I6k;5hgY_z>cK{lQ+0nQ{Q 
z6H!b6EKw@4Bbi;SIMn#xDS@0L)bzc`fkjkB)q=e$Q_BuB`U78nZi##Ad-?ZeeN{Iu zT)(hjuV13uL3Uza^y<*fLksquOVtfmk6x2-26Cyf1#trl_LillR(zuN%!0l3BYTNs zGroPxS6OB&ug@c{S6}hbr*b(gPv~(^=y6Z$aZj%i=Tz5)53ugzw;wTN2JM45WgB!2 zX@_ltmO)sy3PnWSr_fb^-!1MoaABQ5vE#DXw97Uvls~{ZTfFRLc{ORtq34Qbeliz~?xFX<~TIT22dt7(3u@hRMe?-oolJjTekYOBGa{N2;F;&T5kdO7xXUNw|4%v8Qo%{G#)t^?H(O>Z& zU(CWnD`&GV6=!{YdcsN1^URq2mX_u@8lfKe5^@*1vHImgg-W3I{(F zb|K}5uBw}77hD^zmi*9Gecx7_vz9pAA3C3cKB55!-9`E@wm~iu<*m6=igFf2 zxhTplG;GLu$mgYomXWW#prJmB@)w*;te~jMf~WvR;aUL0nZ|00f&$MFMF8jOgp&H@ zdh(&^tsM_M0YfX`*INO9aVsdQs1+2&TR}e1w|=>de0>72wQ=(US2?Tx=Hc5XZ=FoF z?MPMaNSF8LY>392ilZHf#!G?7FI@k^+(5ca%#~7s)&FTg<-CKxq^J#@JV-KB!(Ro-asSZ zB`qO-%>YC+1F#rd=xPLJpk~f~tCPALL1<(<^f`ji7=?~U5E?P;qbPqtR0Ty<7BC_K z2^K>;s>!#n1~8*ySWD(?@Gq%;2mAn*vt6!~LQst`Cd|1gq(PYTP)GwVS4N?7s@_Mw zeW2+BLa^S!;0JD4XLArDO9XEP)62A{JsWbR6y+?4a#57KpbaX;@DfI-zR%+&TfAsQ zS!JG~ZHQPhMmP|G<^ZZv@`=LEqn5?~lj;7G3;jbWp((Xt`+_ip93;YCvRa^ACs!3z zECPptDeibyr*_#=J1ubkFZcSL1ONa4 literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/cpu_fused_moe.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/cpu_fused_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76e063697b7953c85d81565188834d0f42ebb8b4 GIT binary patch literal 15307 zcmeHOeQaCTb$^fFACKaPB$ARSiPV=R%aSbFu@l>IVn=oyf5>+Hk;HMs(t4Clnj+;# zDz>%9MJdviT(e37ikU_?SRRU6o;_4WyVb=CECmYG!v>7cbjO!B6;aU)L;f0FyDiZD zv7K`tA4xHE;Xmulx!?EQd*3p2mAhZ@}<0&K1x}YlDpmbV(vijQj z6B^d=j^>ovNU^@hwlxMB)&#%D^jk+4rR9ZWIG=_V`uHYSQpIF4Kv1B z>r~up9&tKDCdN{kMOkgX*gVXNjn}1OioE!*uC*{l0F_f zI~fbHeR^^vxO%9eW;IQm1LtTt7h>69)E`g4yTm06yizE7)*qWrOid>wO(HgRUNUpRXpEZ-L_#w` z$rhcSgcCj$bUdO7j z9q0bE1ITJB4y-ZS1ub=?r5(noe4FCfr7?`rx&$6b5?DR2`v&#bD&-)<42_R}qoj#9 zygkMnlV;w?8^RW)m-3UPm7efJNE)FxJg-%OH}M))bE~)#VLX*pfi+aVpUjcN_zhGA zQme>wlT38riNI9ON{ekK z1WBGz)+<@Z*M${rtD+)n<}IvQSu2`?TD8`_#N@)Q6^)A|uJ>d1RfDqxW(M*`86%9G zqAKtv*e{ex5Rz8t%O}&7F>eibR;FcIId@j7EJ@kYTbKH@!O0q`SgTrVY~xim*Vz7+ 
zIo$p6kzwWRrT!gJ&u8uZG`w%pk+kz_&dED?NA2^+T5i`Vqs+i}m2~iW)_Q&`e8XC3 zr6-~RTJ!esx{uE>Q9Laa6^5_AIw)sjLIt{mZH&VWm}wfDo;(xeys-&yB6dC)je93U z@yS4f2#wf80%Q!4IyXVyu%Sr~&(uuYiD-O!YAVJ7PxKZUrB_j-wt0z!+Iv$i(dW4h zP+!u;NmUMy?2Kk06rDJJQzaMa&j*83(D8pk5ro2XeT_u{HW7&h5?nK6a_cd|ktNf~ zKx8_&hvR_z8mFW23)8_M@N&(Yq2NWX18b<`)00FTO1d|(Zd@|W1i4t;9|@fgN(SKI ziA0d|OFCfvpsq-IE-0%_=E*?(Jn;C5P$VL0&ctF7Ne4pYT#%DgiI}7fMH8Dw`sz6^ zP6iJw8IQw5%5`CchlcCH2)BmA9e_7S`4E?kC~U|AOIDC8XG5TE%CaLaF+}<(DWjYU zv6m#1OyayGlPqXt8nig2x`D{75DVQ9>S8c-_FN(!NA6aX;gt_rp35V`yJ8dWAXN3h zFFpz<0z|o!Vw&>iHo@GMxgeN*i(klhKP_}Wea9|zZ@*{Wp4GgXefHHH#OCd*4ZeKC zI-y}*uAy)Ckf^VpzkE;cU9mff%uX(M+_yiK)``Z(Ow&DMhiGrh+j|6iPtM+()_qv# z&bB;rXTzPgY}3A6-Ah@+ORF~L6=s3S99^)^z9j0~vikL6vo~cG?XGO|aK3rF(7gT5 zu3Ym@!M-!4TV)(`2hs;J#%$-N<(G5J(|P7ufqC{$;Juf#%(FRW-(9=F9LO_=1?F(} z_{l7DILEy5KusAN3nt1^U)^?x!0dR>k!5z|n9)3Q0GbxC?js!quVY#;Wp*rWyEF1$ z@V@OJ=(7*hD!Xp>B`9^c@^+tK_bqP7*@tHLi<-K;#wBQ689MXKV&J}}ch%s?8$5!+ zlX*F3@P1&lrk(S~Y)k*r$#0*&dHPO=Fz{@4*HK~MXwLZZ2S!`kHQ$ukoinx<^pw^L zQ+2rW_D;dxxp?Y^|GGcdy(QbV<%ilk6aQ$*+4s)wUnT3bQt!U9ZDCs`yfnIe=Do)I z^)KF~h5G#|gXrqXy9NZ;z*1n@opWtVGaveTZZuzSUfO@B>%HA~r|&>7gu2YF^5qFSwSWf>dO$gq53<46gyjs;G7V%dMr^n_2a3ox;$Q zmS}%PeFMBtRe(#Ln$_PnD8duOxS6k;0kuC|xf;b*w~EiCg}2la;|b*OB!ue}s{B;M z0ndC&+Zw`CX^Wyn1uLqo_-&)IMj&ELV|~WY#4Tk$4=+pY3%E*ZRBY;9+AK5; z<=vYF_vV~?OX?-j>Yn4%{9o<4cIfJ%oOS)uzHcA6dEkfIe=`5ToZB#(wT@;Dqd&Ly zR8IVnQHwLx8cl>~abCByhV5EAesj6LTqJwQf`0}#4&nYo49oh5hUIcRv9o_?=3Zgq4F6`KNiDf_v zU^&JVB`!AR7Adr5i#zNRCf zu$f;37=b->@CdGu>EiGvzpPnF#)~6Gl0$|sHJm(n@R0w7BPaY&(|_dUJx6xEa6mGL zrh=CcA%jr>nbW`rKf#^G`Wh^Wqtqy|1B2fmiiQ$?|4d`$#tbUie+HG~&qFjz{fA*> z)#JUk@9Mt1XM^C`koP<-c%Bv+d!F$Kj0cFh!|B5-ZcpAlEVze3g=l@|A*E?GrL;da z$ViEd01axw89zdx z8NdI+bRZ&^5HT;QqEShAh*aQ0(2t}$4j(h%ZwNI@YWTuAJWp^BOmN&L9B2xnn-q~U zWZk&eF&CpR!!PcEqw)?_Q0cX%Oz_&dtLLu8uEq)!B$vA$;-k<(b@Yhq`o)eeas8mU zX{+ez7T2v8w{8~)M&N$(ptDnJTD3OJT}oe?dn^4`K?`YECioV2q<0kbm@-f;?FA!J zCdyV{FeAlK?$#^(0$;FTs*Z9t7pzFxs780DYhi!EjwuJ#(zd9%y1r14sRpXmTWCbe 
zNi{ZKIk<3eaWh=M3NFlYQyuGr4ccB%jHmYOY4bOGYvPST2 zF0^Boms;0f@FCR!;}-B;3SE@RI_F7y9(6-XUN~H?arl)2atsWAl+hsMz&!ScmeJ;8 zFCJ7xw9HgKE}(J!3G>tdafP}}-K593KLHv{KP6^VH2%m(AHSHHhKjCBLgpW!B#sgv z%{7d?=_tNs&G>PW;z`-q8lG>1ZppNN3B;4~{EF+H3V1kj@-iQ{@O8J&-u?wYsr4nP&P@P`&{`A>7Js5%1PirAY@mNZ-p z+TAo)+(gPk8NV(WQ0hXehKM-67mPW0agf=f8tj#l%Kw67m(6mru#Wq|P{)m9Efyj` zuS&<_4=AI!1318bh&~Z^B8<5D@X3}WH2QQ8KlZW22*7rgDF|aAXOz~Fsm$NFeM8aq+}t3RF#myELq4PB?}p( z%0dQG^1|VA;bAQW{0uPsQOJBwLI&Vkky|I^9f7#kE$|Y}HPBRD0{@~5w5?Ne5|!rEqr`P7hGHSK(6-Y}tJweOUyuxoa<)+~%qI12`5QIv_5OILFi z3~z#2=qhLNuJ9S&0`7XyUSHY%GmH-4F|;;Rww|ZowQE3h0i;NDOq1_)Z|?dMcP$9F6z0)W3HA+rU2$i0-yU zdT}#Aq6cUv5lDaF_u}4#(YUWcv}Wqu0^~gDticmfne~sye=_-_$?~jU{QYN53@0Do z4GTUGr#+L)Zu&JF`1=)yz3|XezfoI(d?0<*<;7=kDmB_*S^+U za9@Y9xW9zxcb~Hz=Zx$g=Ir}P?~i)#4&M!AkG_0=_c1c%y?{7cxZs895SV(2$^V6En)I`q%-Pfvjj8@E6* zv$2a&C86a4(X&A=h405=l!i#MjdMY4RQiM@Iw9!*q62`y01si0Nd>ch9roy@u4G_pP24UsvAul;C?RyJc^F%RXVtzTB4m zcX#J}hv(}=K;Ud$f~_lW>lbYOV)v#ygTJIyjm}4EjotCcM74M1+eU=8k>%&|n?{9A zqq$9c@2YZb2jV)+=^zTuzdvzrv<)U9daXbnV@^Ja(_)1TaM?MN@d* zR?*r!wEXP7hGV}br2~|s>%j;(53ICzUAuJkQoemyXdnJw_xJn1(|;%M-NEdU6WR9R zT>Ht?p8Jev1(pE7wtU-0ScC61e!t~AEq9K7x9#q9wryjs?L=zNUmPxIp)>hr@Hrt7 zc#crEKK!CaK_cFy;-82;m*p8C15boQfQMT>`1_P(IE+^ekPG8h2Uhl?s-GzN(Ip%B zhFZ217JPzgVHSy!^sc4rDS9reRX9E15)HLH>%i!(Q)(w0VXTt2T%I$lS8CMqXi_xL zq!U&P{GMvQuNL)b^cf?1wTo{I*YK%RdJ4PBV=I(WmTWvrCEcth*_6PpNZ=cl`HfOv zh!)YI=41=s#J3QyLJ!}zRG+ug14%B>{}k8+(SF>uBA)z?aCOif1YI1yHgq}(XI5yyOmPT zRlXU8dzA9>lZO>6J1InMyer&SE>$SFLje24kn~pchfsc1;Ju}qKHkgwc;^nN0dMOI zBl1@j;67IKRQwIi>KEw0(|wH!4^%aQB>2A#emqwhH$43DGH8L;fT{q_7NrwjYHh8m z1W8s~u2*wDzE7G@vV-pc+|M|+6IFU*@{o)rpK5{wa&Kb^DaMNxaea{8y}t{|zGedT zfg4%yqMx|45hpor8&-WDBRr|ZM{Uby4!i{^ZVP7U!Ob6i?<6(gfRYyEH@NB>gYXP; zpdE16uqIwfh{ry+h}1VBlGKr46dDB*@{mL|01F^$2(lxcCUF`5tAUm!4-NSI`?;r} zVP9uiefV4uM#&5_QyeEG&3F`Ci|n%Jq90tjizQ%XWiNpfxa@Z*JKn*aMRo=UZ#-}k z2d{6a!y)UCOsLq%ok)66kg!B@7JEe-apD}PYcixN>5Ix($sh{7N*aj3#|gIw96=_C zdmmxJ69iQHC9*N>JzNK#iwuVyuNeUCv(tu%L&XTsw 
z2ddn@mWS^fo)Trd^pZWZ?;D4+h9=SKnoFjW;Phto=B>SgwHG`?QkqqFQ>JUN_r~D$ z!DYM9u}Ns&1a2fLi|851dxiziaL%(S?|DY>Jd^Wmhi+CK4Jp0Y(3moQ*xHj__w>KF zK9fF_+Lf9So$ZV31?Ps;i{L7gIr`^si8i;`J+l1FomaBmd$LT&s&imIn0G>*^-IHo zb09l7nse@vtG^B&F)PjeOE2b{H!tWvZ0=dSm}?%I*NaU9nZ|t6dZ7v1Gz~1B%{6UF zfxk@s+*o=nv+X|9Bf8tuFBMv-wr!8vsDX{k`r8NJou0d#zMOF_27YSoy)&8Lc|zEE zB75?c+|E~1niW*myn@ZU*pRdJq_nHf=9D(Yrp>F2Gqq=Ke|mqG>CYIh)m^OvVo7z6 z>HolDPamJ(kau(mj;@@gyOiz~9KA~~XC1vc$Htsx_ydbAeQf?{>eyFL$yK_q^e^=1 zENuldWpn`AYir6H+_DCB0S;G9pc=gkIE?-qe(@1FX=n}Y)}Ev*k!nmMRE^KV8}z4v z)dy&8(;BoIGb-R}l9cgkLdt+OA z6H-R530VbQlRO!_CZr5r6S9n6V^UsXT*N-rS;KA5DR%q+Jcp(h2%hKrv@&BS+#Lm` zZ2JZ~&>CBKpG&fi$0Fz%23`{J!N8euV#=0zJNTKw|Ku5rMmba&IdUb};Q!U)5TGU3 zOyZEjUBVcQ;`xj5#IDP%}LnPsL zv3=(QwSjIH^~_iJxA_90JwEw4vXwknwWhC}e*5%8Ez~P`sn#~p3HBYI*x3W7_XqV` z=@Aip(snHDD5!y6Ez(+`;e46v3VNgrl*3stB4t7|xEU$9+cXp`NYzpG&Vm&w8wI1= zk#bOQAvt~Z^wOx%y0L)6fE`3+%0H(9yVgx^D3&XL+o594s9w= z&b5Iyl_m#tX>vg0;kOp_NEs+sOTmZ~IQKdWW~6X(^X!7DV8K)!Ogb~V5H46TWuw}9 zmNZxW1v{o3IQe>{8gTNBNIBO`9;X3znwpbug0kPdK@ZTT2NXoV`69iQwm+aC`b~n4 N({$4>DU8X1{y*aWf(-xw literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/cutlass_moe.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/cutlass_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..763c2393926ef3791c21792439a81e8f2a3d739f GIT binary patch literal 41691 zcmd7533Oc7nI8CRUr~EuUkV$6!YY8^B9fxSRia2jqD6^nN_M%3Dv$uN@f9GFaKQ#` zbsMm{Lol`v!M1Ym+Q~~T%bsZ(4C!ERgOwLTHNw+(m zb7toI?|rZ8q3{YIZKab(;?{rn{oeon_y7O<7bPWD0pE!JgP~J%g76>7B7Z#+@c8Ry zLAWkNgwsMq6VZ-nMzyE48iI8ry3;!Ls~H+3B*;^3&zaZXKx@tvp>h zT6MaL*=-}$qcx{%G=f%$I-<3=HN4!X>mv53Zulj+I{Dv7NfCEP5qD=1cUKX2cT^kk zMC&8oBL2E*lORNV8~XY;^ex@cw`@b-@*=(!McgZkxK|Z%uP)+VQ^dWth&ykkI`|&U z_nizy0@21uJ^mW-*SJv%gU>~p-q)XQ(h4sNk>>Y>$d;&O_*FTb{O@fYhfi-|AzKi# zHM%9|+QM9eaBWk#wps-KFYc^?^q>z8CW|0~b`;kWZ{ zGIaVmix7FG=(

2e+!%DbiQOeOnRt?M2*o6mj2K#Qpgq?z@V(KkJBk))~AT{k{i( zFW_%4{$4~m&d6<^|BcWX>51qD+afPvEcCJ&wGT6D|0XkXE6uUUk)kuKSU(hNzh`X^ zngNA%+<0wgHC6A8!OF-1)V?qBth>!Yv>wevN_*qnd!mhlj>sXbdfSS&*|X+eY_}@5 zPd#fJ7h5-6k;ACR_RG59tJ8r4Z=O81?~OMCXTq^)Byf0&%;V#S0_UURSac-T^>Z3* zK~2VdAUrY>J~I;Shs72en2Jq~kA}u4Vq_f}8=4G_jGsLl6=C(VRFKGtsqon3f$_1y zp|e44#x^=04NXMF(W%L3ri3h0W0Dn-DiHp4F**?zqx;4phlj?(BSX{CemEj()yeUR zqo<-nXU|Q-^9#|))IjuLbR>E?#C@I=OZu_r{?MFI6nkR{x6 zL3Rwwj<)a_WGxOwqOqQg!qS_u4&*)ZaBs$Taa=qfn+Oj?d$wk*IZJQGE#*&TnjAkL z9gAfwsAMQY70#5PkReosdu6O-CEtN_84t!m=uCJL%@hd@j*Fw=$&8)V|03&!*g$!H z5fU7LX{&$i0CZi5Yi7#h=2`8l!q+u%i}b4t!had%jf=VmuYyZxp+H z7|3Da7tzSuT3oB@ua*5-IZUq#5uLn&qF(B8^m3fIZsRl~+G&Fz$QZ^4;3d<@8 zvVTO+;_Qi=RnyPqFNJGGICdOan_@&?0f)`%kSm>0&Yk-6sY+*g%F=4zNL zok>wT^Z4|Tnk9-fZRj1_Q}vGRDSKy=a8K1c4n+yQidD|Ob{>*V z;Y;gi!8fRVPl(al<5Z+*x}uMJuJDyneQnFN*-Qn>tG4>(=Fv+i^T!NKLM??&_5ei+7OYDuDMU0yR=nK0lq*9y_I33X)OYB0 zDGWC%YtLbMou0AATZWt0`zv~T%X;728tyjJGWPvV`r7z8kDX^tFBmt?YU9Q+SKOq! z^5%Mdrfo7P*EEhckL%-Y7*VFLZMoyUE$vP3VwIgKpRvxg%(Tx`#akjEhF1zREsF8k zCa01Ajn}d9X=G#5$lWnEeK9{PTdC-dHw<^?!^==4S2m5H$e~O2&;Q07P+~30Y-V%8 z9=FFGabMhjTiVHqL>@Q98{-Y~s9KZabNQ5H2w`nVq3MqAwzTTMi#ajVG1D10U2(^o zf5Y-N$2;P>cxT*#@&@B2_~%?(UhHwrabb9?T%PZD*Z0=XfD_X>Vr!-&F%fjR~ zIeuJ-H^tl^$Zp@BmJ~XQE;*zv;>&V>srkMx3tYREAi9 zpPbjlMPYfw4MIp|DjK5x$k-&vAkoMW=nJaF_?7R-*n1#=L;{>75nu=QTbhi416q*= zR>l~KzBe=w{W;-Jpk|ax)RahQ4FLmmka9ZC)A61W0YXEAp~&U2@aWKh=tR_$NZ39G zTE>}?@qzP0V`qc@@HxbZ4E(NQ25e$>#BIpGrgCpbN$(^BS z-{{uCv5cNNIHQXUjb;q2%QE_N;gP|N`3wu!vt8_^R1Fu;Ma8ICLa_e5q3FeoAv!uS zc{yVk8k^kOi$YAqGSz3JAW2M3j)Y^e(8R>Zr4W&hMn^-D@MJh+0!4?$1ZX4UAo84@ z8lM8iW^$Y*vAs{E=fif;O>OcXJsYn^a z!1%=FkVyMru#2^xNVu`+CWEk7(MyJp3_>KuQZfkn6A70P=?o_l)*x1qQB4MI!6NOx zVjUR)GH4=-4P?+3BQ}vin+fj@9q2|G&GC$RA`D7ZBy>EZ31u`#Gn#(hunu%2NKrY0 zm7buCiMt-km_|o?h|-fW5%q@v3!{;7x6#pF2AGLBMMEm%kjO~mgM+as$Xbpyjn`9P z2c0TU1Iw7@k(1GddoN@xFgV34MyqY$nZ940>Yf~(=$;sw=$s6T=+w?jQ#+pDd45N< zdt~TL_r&GNbK_&(?~RO%c8`unq9dTgMbV?{#mg-mVo}b3*@8qUEJkn6_q$yJLRGLeB@g 
z7Ckr1Zj>##zgM{wzEQhkYf0$$CxZJ|VcJ?Anq61z^Y#xMi}fq!s>Mq;XKu_apIEKy zzT2}>*LTnSQ^&`S`xlZsk9_ile{Wbi!(PtZy%t@1aEVp;i4MgU!G!*JqW${_p#5R^F)m!J9w)(uZG4l=mfV zeF^=s#P(yWFl~KllRIImNjn1xXKTXT3Nu?RSnOBL^X3Kj2e#RLX{YDvrTI&Xnh(A* z`%2p3x_WN@+?AJS_sw3It527>uAZJhJ$qQ$LGEw8n9x_wy_mMuEO}UCHL_;%C2WC& zJ^=F}&(qxZX+`bLsvA{ld+EZZq&={-KWX2xY`wcP*}CW6na4V9rFGtsH4BvubB>3$ zij=K3X{%kluxhKHJxC=vJAZZ|{K4?-%Re(!eCnxNvMryzXHIs#bie+7_)nS>ov*BV zj?9_J$GUv#?xkesi}&4$jsvTngK+iLr|o4adv(%Y{ex|(#_h?*?FoDJs(r`3miw_IvJoCqFK~ z-*c}7@qN}gGmmdu4&QCMb?%<#-oB3wiO#*No)_iPow)0}_4eHh%g0weyXH)fikGea zUgYD}`}^+I6qIaj0{^S2g7s)e!rYN?c0A(Eyma=*(LXx>qw}9^c~Eg;&LK7a(%Iz; zOYi=~|I@0Et3G+_LB+|9d>)x?DRXPm+?uvm%KvN9c5mA5OWRBE|B2gTv1UsJUvw*`d(i39htLWt*LBHRkS55+U{)oX!ot% zs}(!u9a7EHC3Q>gguRjb+y<%}VQQMc?`6}|ZxUQDWmn?)In9CpO zO(}h8QeT=jJJV)c+H6gm-D$HcZH5c}d)a?a+U!r8Jx?l1493rD1w;9wX{qdiKKRIA zp7IBi{vcv6l;1u6Nf7y!8n5)v9+;~@MrIE!0cPiKmYumWZFbOVzi{e$=vrv$)G~(p zs(;rfzB&6}n){!iLZxXJnyD>a(VVVn{u_(IVttJLP#7RX{$#SjV+WA#Z{qqVDxB7U zvq>A#;I>vG7|4O?7o;@orx6`|RmhGJeGzv<5qDz|cT?0jXrB`XH4*ds_S2?_C2Ce9 zFAi!WR@{bKBeo*x5XX~RIf>&*&7E$6)!d2WNzI*jP}JPrQ5(2F?Cf4z&7Zd--60qE zrJLm9zQpmQ=1aUzYVPG|K^20R)Snem2RKqH(U#6R;gaUG>kT1TmGK|oNEJVd#STyG z+>eW9hS~P3p}-kC$3%h6D(iIJJ`rmb= ztG3MPv54hlK@JH)IoISxh_oR{A7~Y8-h3j zub@Vx;dM(RUcgU`f(2N0GIfK}y?=;bGl#-s5!~6+b$@7RbYf(Bn|f6@iDp9cfEYMujF78`;!^-xR)Sx-QHMYGeBf-2`dm=KQ(_p{UUdJ{C%w4;2w+ zTye)0P_EQ&*=JmHCM=WMi0*y!j2kKDMteS4eqTJ9Tg=s#hhdewExP}i@!;M?xSD>IT^*w&_CS_UXkgEz)&nOHV#77 z)WpQNI0?p_p|Kpl4%lT7toJE`4Jd~QEo?-gc-#i#yIKr1-EeN6<_kBEB{8m%iGKNc zdl5lE9ny?vG(#d;_!OqaGZ2{Ni&DnCXCyp&CKBE|-KjoZ*~IAD!^jb_y#RRdR+^^N z?^vjo`#yT{){FOc-M{cj{e$*ne6~_Wf*z2KsCDo|wP%!)j3+-+C3Y21fkw|~%4Jrh zeL0Su!@)nXtTvS<0V=e(M~u$jcr*O7dc|GgMWsz90h%> zbv9mPcq`^fMJlU{c!?VNG8y$`FyaysgNV4od3j>w_GP{|NI%@d*Hr^+@#(4!y2DA0 z5AJf|zhIYW5?sEk)AQ2_XMMu_YNGMgRhZ8Dw9~8N`mj2{4)Qg0ZVvRO0tb_Ug9&@h zquOH$a~1IX5_ihpn6x)8?Za{LPD85gP_pe%s_kSF|LrHUI>Ax}q}ErT@@`3bx6GM8 
zHRFWoN!u&Zb_e^9vr1jsUW=n!+U~k~eExXuha>bZek_D*4xlq0go~R$M&Z<-86O`J=O}22gbEBqL97g6{}kcM_Rp6%IxqygNXCC5k7MY~zkOfGYMqAW zhu-q*yRYqD@+G~^SpyuxeYa^Rowv~|0A(XZK z!bwnr*3fy^lj?an+4J&a0hX-!d4u(#%YWT|&7L&?MkeJo*W=gXSrggJLW$?$SYQiwemXTArln(0cY0g%VQ>9$fDza6}b*mv;tx!{!s@|Hc-g-BZtlpKa zBgcTy(*DbOKztFR<{iLa?xVR&-{XId_Wtkp02DRj2xN&`)pkmC*Fh~UM*M&zV-u(C zNTmdBHX!;A3APi`57?@<>mh6j{N0Q>H-|y67GnnDsU8yt^${oHc_OZ;H{!;h2Y+7t z`S9n*Un%kfeo^>I#}TTl1)<7#EsDW|I#x$lq#Swq*}^)+V| zswczfh{#Q6mC0N&*?9!pa1Q4&;AA4{dAqHPSCIA(~{VUfMGl%!n^p7L;Zh(05NU_s<-A{Ny0UzC`n#ZHVNv^q*p*r>7=YIvs_cA2mD)5aLo%IJoU zEuG>wkafoV#w5-HVKE|3!Kyejh*w~rZd%_A>oI~aQG;|KINAe?k27U&N!nXd_UDrJ z=RUPNl-8_4=dxx6ozyUE+5`#kz9p>Zx^P)@OWS|)mQYx*j9bFnVn7I{#^_#cx^aCm z*9AI@Qa&a$ZC1d3%hsHYvlo9KfyM8UQP4ISZ!UG|d;&(~>E;azkq<^#q4*CdQ9AFf zDFLBjY;bRK&cg;}Qz;eO_NIH*mtvDs+$eP)B}k{ZEvxJV%L<1}C7@#LW^@g%i}1Xh z+W~-FpnjyXtGmB}2`UXl)T03@+|qJ%?hgRRs0{+rsnwa|xaNhJu3ulbf*=>Dlj#_O zPOfkMVeN;tiTd4hlUKhw|5YGd_bXuV-R1!~jRAja&tp4hnZCv!S>$=CUV+8MK2h=^tywOtd=f^a&7C$?C-OSf#iv5J*nTt-+{Vgij0H0O5gC7395O;5pKSf2~H*cz-4 z4hpP~Yaj+l(Q}AY+K%+>UcTX@p8iXa?G2rBW%VAiZL9MvrQKdQP_l|}HQntSY;gMuH` zWQ`?;=N{S|Sp$Ai5xX;MA_!D?XVyZHRd9N;HiGPgCzcT85UT64PJ&zo$eo)UJ%ZIa zH?{D_V&lTw^IuysEDbDgT{`!n^REAH?B0R9mwr^2aK4nZ?E9q`q4|`gmL9-g?xXog z-{XITrY}SQBig7Av~Il`$t0p@lyC!lRfs1MV-a^#5qEPDcT3bX=wg&`D=6XSs0Flc zYt*Jj98q`8A%Z~kZu7bdr%RBELqhJH66^pH<%&2_4yzPQ@q;j`?m+Zh3_*0Gr9Ak9 zBo;~X0?E?>$dKJja%Md*u5WUJwxwL|;r3Zje zohijSCX4bg?ipHrK-zWEq8k~4zzxaAh_THOLyMy9ImGa7I2Is=77(flF(g75kV3FJ zoAZcaTmuH^gP7l(otg;PvgzR(>^N zJi}}GA(Q$uw;Xdi4<|m$d)>O`%i*EFO{)@0&hXIE;&|v2!b34jU0aqlE6!lT985Ta z=`DMR7TuVx>zdR5rMa3X7f+kmFqiWV`4N6M(|KST+0SgMcp%9;*AGpt3D#e38;YFH}U_oh2G z&pVHtE+cpGV@i;aQ}GXA{8z#J#2--d{}6@};bS}VbZgN9=aCeKlhU=5_(MvF6?Sus zXe|y$m;}S&?gJ=>#R7Aa1@2U@1c0xZ8RkbAA^X-~e!3Z~J!%IuwRr3BK^^eGVzG6D zx)~ech;bWXiQ@l=%!4|%4vB05qWvUje`;jW$kyHo@qa=zF+c{x7HRv;El&VAIgC)I z>EKiJ6Gs$j`;HOn#SleLY0BN4bT4F-d|rCP~mbOp+X*9+O09z7$YfXEfoA9v2!HVD^SC z@GY0!tX!tg6|pgQyy%kl&*XiHL(^X3#)hx?keGjiPgB^Q`8VhiU`hm 
z)tI!*6nqK^V$JY|{;b_z(ls0a;qnFWIne;H@gcOC7t{kO%U@N37 zY;pS?!)*z92C8lh{}>i4E%PVO;c9enY7Ccp<70t=@lo8UM`M9-AiVS9&Q5k62exh~ zG8no9E`?}dd@#W8R$_rxT$~02j31aKH5CJ4E--mtPZ1PhcX5?PB&$RJ4`x-FNZJrr<7? zC1WfIOtSl@z~p#f62(Amd1bp;RBF68h+lqJ;GN?gfumoP5@Z!rOLK5YjNt}4I!P@9 z9v4O9a&#;pHPZU@6{IQT)Dh_I|00TwGD$I5TTFnfAGMtn!(*|DaezU&L}+Q%TE?Q_ z4T^jYMe3*eKSPNaH@#eTwYHM|&{9z-m_Qdm<3i_NshvQ2oxoV(?bHcRu1rF;j;b+Z za#VB{mxfU*#NfnMStbTUAQF{FGaJJjcG4FE<6_{QHNA!|%Xi1GlP5ayS@X<|Nrf5= z)kb2&y_+|BKvI|yfUXKQ?)kXolZXaAee%FpR8g*8KCj+q&r2GSPn}s_i-&4nU?E`Q zNn8cUuUn&BCh8R`lF74HWep!hp!Ky0T9Csd@6es~7qKjzIv2%^=Uu|b4;F4j>SQAv z11`5J@h&Q6vM6j}lP5v`&>Uz*=SxeV!Zo)&U>blyFBdO2d2)qSE~QjxTE54nnqbXS z&ZpdL3a=a1RK2js=Fsunu#$ATC|c&}Gs`tiE>l`O!6=ZI9fR-__Lsn!=;Xy{bc_vr z8imqIp&0XOh07nOHz?i_p6URYHuJk^09tyW_9i;gkt<{Ly{LFO@P+W$dE#0^xhBTP zVndLyV&gXq9;P$WCc?9gbx9I-q#K8#gmXzr3uRzX93KsEr8-jBP6))IH`&cc-l>a5 z*_a}^cZPWTQ~i;0XHQ39XfiMw#TJ7m3Qxj-vs_l;i_#6UKs_^WQi?&`5QJ@v0MjF_Y^qX`do9`b z+G^hkNOmRr-db^OVEtubLY_g-}^(+4Zy6&Zf&7>3>;0{jX?d^yS9} ztJOHW|Fd$4ilxn#wAs%7`$#s!TLX?~GMo23@mlT1tY9@5*-0HY%srvAYy-;}fqo56 z^kqu$%LHb*vjx`h$i%rYRBS<&S1xd`6u5V7;9fbfW;CN2%V^GLG*KqIIpD)EkUu&W zvcvu#V7%h6Z&f?CL;Z~$Uh&Hgeg*8pQ1qfg44Bl*(bSIgxpWi^#Sr-a_0v%*(Nbmn z;cAj_5I3nEl_lTWxPAj)Clpbr9RDQ+Ow=k$g$C;S5=RH8_EK+%>zULLU! 
z%~p!-idVAORC=ZIr?NO$uX5qc@hWbMJ9D9Jid3PzAm^@7%@lz$4N60ihf>LsW|RK5 zDw;+D6{AY!ZZoIWNU12vs!Bw0-o|oBU);f3!iVv$Mru7@+(@z!z#4KhLeidK_*_ZiCF_;-{~Q`{?d#A_pb#?zej%B98J)~G3C!s<@Ehy`D~ zMjoN^FBFUrPQpG_p#>Y%qSQliNn^YT;uKD(u?pOlX6Tx@Dnbn)z%}NGavDWG0VzYp z40yHT%?*F}60f29x)@L!p@s1d%}{M9XrQAC_s1>Z1XgazMl zu)qOUD5PnTTX0Ql2UNov(N~2n*f!I?Nv(oVpks=+Gw!Q~4Qkc)Rb@(Q{T+-q$)%kM z7P-Z%VLJvVB70+RBUHOeU4m+gYtU!4ahJTVVT5<8#6*}#PG#1{bAU$DaQFH&BJMk- z^8ARD-6@yr1cqAQuV3D0%_&$B{=a~(Sqa%E!fTGu%ZR`1Bz*Ywz5r7{6XUeq#=y%s zum^frobWF0C(&?0qaE!CoCl4Pl&5sdx>GoXG>4Gx3hc|twn)lkKrNBZbx0!m#Xz8y zsa@IH%cEZGX(h$EfvJgLpp}ll;W5x+&*CsOhO_Z_VC})A4+qD0aDwd(v<1efIIO4~ z30~_H=T=YT!XJ8^U_P>Sh46f{lmv1_N0*Vy z7v=H5!?XIbkkF)DJQNFt6llcl3cNWc)4*_J!W0m5O;E91VKB;A6bvRD!Aadyp0=?j z5DSG%2aI-^8bkz?JpuY7YDyr6*%}+X9O%SY>@MgQHmwGr7aD_1qO$$U%P266g&?nE zs#Gdw4{m^PfpP`|n~cIdu8=j(nKb8)QmJV-NG7EZZoU|??DeXmJQ4{cBdkaU!A3Gb}-HuPS^T`y#56l z|B{TqAmd+=@vq7FU WaOz{Inib@K^{pmR>)W-qn8XK%7NEA6wc_*!_Y@VP?T;Q zyVOg+7TU5x=LsZ_-i#h_ujD)ikh8KkP#R_;no)mv3p9A1TK>rUI;q&evO+ivc@v3td~ zB>|mJtz7fdmW})#LT!g_-nMXI1v2_}&()*zM;Cim?A4H%OxbIb_S&WT6?@}a8)+Nb z4eFbDH)$K%P1=S?eUi(4_2T@+E3ZJYh&Nr^baUj!NK#)ldvI>cLNHz3aP!!WV@Z7l z>`e=nhc@Rz-?#QG9$eb@`$r%fV=$(nGQe8BcyjUZQp3{8rNhe&%O{o(B+So&DY4b~ zn1Vow6=SPe2rh=Nbwb6{(!OQiA0WDxqHhJckiv8l6OL!$#G?D!o8NkI@$z!djhVZe zmAW3%$+VU0Wa>%V>?@odS~jR zuiW~|{p}C@ug>;w;A3~Q&M&`Nd!rUgtbW}5N9{jqzd!iEeth=OryF_x%+&bMS+>~o ztr^tOVolqRppF*n+zhH{v0gRK8|UIzzc&B1#mlSC-~)3Tl))IxDScg1U#HMw14T9^ z>_6zJ{D0*Wzs+ELQVESzDRVjLm2fW}S~WM!?)$U>8khDa8}_oG(1nA@P(b5)Y%?|* zk?ib|gtP96L$H=Dd}ZmxvhTyU!6nyoYbw#SZPl`UcK@fA(uHXZ9{q=}EQgm~She(U zw_M<*>YjaqHs#gt-p1UV{<`RG(|RrnJAm~r_sNIL$A5+uP$>j~XJ(p76%Yg=RN4O_ z9ASjgAJd?ji6|J}FJU4I#$QuJ!L&w1!H9IVPyj@-svNceb^^8y2XbZ40SHUVfZWAI z6_`kd3b7L+3?u{Vp=Z!Y&J`jMI>@c+Ah#-uG}zP#2oWC>0HL~)*n(dn!k_}@L>Nku zmk(kQ`r)=*(Q`ngyoh^65%?o~zHtBbhTL`zwHYm4~vy4G#z8`#jdena1e4Sl`n zaTOAd)B}weC$$lJ)U1q&hLnTfY|DBX;6Bc?OjrwN)NBScAW^?q58(LEZ*2Mj6v%f# zc@UTHxGhn%B}GJ-F#KC9#{rs^LM@Rj7;GpM>0pJSa^PC^0)Etr$YnT;@MXA-QhsBd 
z0C8VVa3h0)#fZd30>lnkaD!_KlEpG4jX@24B%M*m*;*OTs#Hb9RwxGgKVT`w!Y=mZ zbvCI<602QKJ$nq2F5V=TnU#5s@$513Qp^UfxAp5tkh5pMPtYP6Q!q9cW$sfE5c%J! zYX~U@nhCCDSIX9~VrzKh@}yktNmqNywJYh`_32tGkTRbGcw33%iCu7e<&b?zSDzvz zS@R)ThyG@Q9qYu6`?_+{BzYE;U!X#HiH=Qj{C#RXRZ$X>qg*d#t;kW9U-?_g?>7@~ z7C)q76!sy>53Cn%?(5n_xcUE~4z(fftnld?;pYD~!Y#5?*CEawa(JSzt3ayd|D^_e z7O9s0B4RC%P-#4cs+lq4&w@V^CV^YwC+;pnp4Q~AbP9x8%*A5Fwa^lQ3P3oexZn$k zu2{$FaLvJmrgO`DL>p(XH?0wg&WTRPwGfO}5s(h*`uUip6%`^dv}ET+V8lDHstP@e zOp0ZloU^h+81*Hl!QK zh*Ws^*x*BLTkyfh9yh=&i5p=$;&zzMcnM5b+zrzm_rUbTy)eCT7fc_NwyE3!!^aL2ISI-VCi`Be=5>=$QIjg~OqO>pkKX&jeb4>3SIhh7OXf6lJ#*m&_X5=1Iap*63UneDi=y? zvlRqY3KdP+DuSwo@`h{;LA8R@m8~NvKu|qF4Fok3)I?A-pzIc=DXeA1+mdY|pH`vm zu;#CW02#7!OLI6_v55>FEgM`wZk(Wx$hb?!*U9(>8CS@-Nd}=5g<2UF$c@%zuI%CO z5k!5cuAkv=$cK!-!b%gXhCy<5cA+Wwnayh0#)Rweiyq-3by*YH%#e!9S_rZVHTA!s zkWgnux)oZrB8FW{HRV9>ol5SN%E zh)c{7#3kkk;u3QNt&^A|$ETH;dkopY=dxXG3f*FWQ3_eN7}@f=#mJV|Ek?GyZZWdu zb&HXW>lP!3>lP!ZK(`n<73dZtrvlw#g6<%pn0V(sv8>QuhdReaHZK$@WEAO(}JrC?E|jLa!M(| zRb`8UtE(U@SmCO&)xcHB3Moaqqm@}%W(;|WGc|d$JPXBbDQ^Cy9|$C?>hz2oB05N? zNr-~XL@;BAbQ;tI$N9X5AMM2Oq&80FG{USR5|kY0^JJEbh?ci zm=Z-?w1bm-t!_`c;sTFybOdq<{qF(emKm2@C=P%AguhJD*M)u1#_or>!Xd^E&A6X4 zHtD98WV}QHLq)_+E$xi|3hj{BZh07N*8hZD8pt4wo1vuicFV{VbVN``so`jJl`=7c zw`NtDv|UW|4er6h(3J7pl*I3lVW2{6SRB6g{uu?fkiizs=Kuvu_#GCzzGC-P`OTS? 
zIzhn;E~rY$jcv*Q98iq-XxHq#sw~)jBI&<8MBD9uKm=^J)q=e=VXMgsdQgb9S_|$g z0Kw%6)`OOmw>{}?Py5Q#)vc-OJ;~}l;4)5CY)@8fPx}HXUq{l{!3eXhNl)uW09c~k zTaxZAX&C?&-DjLV20FSGRN=L}&5cbmdo#4M*EKz1TG6|I#%w*9%<2|*R>-dNrE;+l29RbG@kzQ(}vG0C4Sl%Jckcx z(WpV;Nok*JVe9-wVBhf-XUCFe>A;7k<(?1ivd;18V-lXfpE!!%U}b)p3;i-yFenxx z;hxB@z!bfKes&BD3(|Rqu^)8>`gaAI!TEBOehyE+^R&s~Atv^un8caFbd$8kf?eSA z=wcS0?8V-^ZTiU5CN5F(WWg}n=+-cj+D6~X|Bj>H6>LK z`BO5q69LzD`5&4r-@J6?(s#CTjm|5kEz1obwccv|aWu94NOJp;m9`^m5(qxzr&j6k z%C5)Or!K<8$$q&qnPof%Ec>b=PZ6de8Nx;?tu2ud8KpRvqVipN^%U7#*F{of zez~-iLO>g(r7HAQ(2GhDp^ef~Czh@)mtE0n!X`bL!`|{kRWV*v&XqaAA*FbdYA&F` zn^Xmb@-LqgA(g0q1z#i|!d4^=jf@A)*wI&NY2-tI=(M*so^em+RxGtMbKD9<3keNs6m3iUO2HR*D_*YlY?>y$T%DiWo2BRd zoYV7dlAcjbkU$=p&rojv-%&oLd3uGaP>&Jsgm~#CUx*P}@hXWuFT8?LL3bntpQ4U| z0xwwc^0Ku5#Qj9mR!aftO5;x%KOb|Z45CFW_DuOEtz?Tkm=F+j5Yg|T7Nb_g8b$*#B(;DH@Q-;%e6|;x@y8gP3ucC z+_K&+UKOv9PsAt5WA4fVbPAs71U)+`y_n@jg$i;g7pxz zm5gpOfRYK~zl9Mj%~*0;kP0}^MLHxDa-P%X$QiK@5Ok1?KPBT385uJ6laYdvX&jI! zi0q_%aR?8VNi65`X?b*1{Pz@`C|{g`{UM-WEhoO^k0*4!#-B2#lRa0F@Kb{BlR-03 z{AXk^s@w`eNit5szze`oaQ8oxwU-Ry_Re#5i~lEC|1%7F)S0-tFJwv>AazMh{J$wg zmO|LU#hp`wk|Cb%oDEsfjEoUOV*Y%j2Jth~7Uk2wYX}MawP4=QF|iU|`cE1 z*CONbUDn{aOuQ^CF(z-97?a!RM#gn@*Zi*U`%@Ji$%+oVkekrIn&^6U6{f8N*Z&*( z=Ckq9+p|miZqzV-^nHoeeXII?2^;v)SqckbVz<8e>Wx>IF5Ec2yf0Y|p6$(RL?FUx%2iWVn`NUqi*?*&dxkpau^+fmUtNPaq651e8QBLny6Mek) zy$^G2+Md$u!E3?Aec$ewJqkW##^-$D0}CF_HQTRR<}C~LAJ_{lcXLN z@y^`2RY!2SIq7&V<=B>VY`b^hr$;_Ml5lKWbsV03Iqj=neDm6Cv&SD<99OI6s}{nf zLLbzweIFVU{w>Qx_l*xMFB6OQ8t;WKE(h-&eqcE;yPuV-J*jVJjM`wAh9(a1N=r|q zN?h7_pcU-oK3UhOX6fLEU`$?|xqAd$?%w0G$JX;}k1X$hpznTE*O4~+79)3??+<*k z_5NveQ~<2n)wBHzuSl%gxxB%ojl7YuO5Vg5ZkyXSZ@KE2cPw03Y+BNQoqg%uRP**^ z^Y*(J?ls-ltTyjYH6KbgAG$xZ+WdOb{Q7?|TCNPtg=Yu8F`UKI+HOWwz;z3e!1@nM zif{lkoyfr*8B-_}86Uu-QqF;KTpKab$5>bR%z%jXhSP~MTj^QzuC>packyS#1Te?hdTw(B~MM8p9nHK_a1PBGK?O+ny>F2!etViAOo5=Oo#cQfBv4 z#bpQwr;#e2pJXpqvgb@PZBLcV)a1}eOk|rh-9BYpEEa9L=pH8TM)#wfQNclD;djY; zl?+B4yhhL80ZixsOaeXTlM 
z#jMq`wT4-1r4nT8m^C0<>zTDd4&2DBO|ow@vu=?%A(kRZ@Qk!Dw^l0ESLVNx4Kiz+ z;49CzGq6K&`?H-4>;gMcwwr;^DPKx^+(T|p-ZQ?Y(O4Jvr>gfRtM@(;$o3?n-L82q zQQrAP!0!`L^E_;IeNP1ZX8W~5SxtINTRPa4t_^%vb55f%rcL&5#;?S)IpI6YwCMpMj-< z+m|h4V7U~CcX|bLs}vervQ-SMrs2V=0FDi#?VImjdG~Q0^La}rKpmE+J?UvrdAgIH z?p06EtTXMYS!llAeyx2mnyT$d*7mG=dbt$NtP`58+qyps4r(>F+(0JFL#c5LWI=oI zk(7nd3aojS7G|~5E=Z%^#;inWr(u`HYitPYkgZN;b;*zGxtY}?$MrI+PYw(z6T~f* z8y=6P!72~lYz4FC$1Sr~%S~IuthIvKmaSu8fWUeNHW1jzz$OBl8Ms9*R134VQd_e& z$+ihb>*IEEeR9$;M1xM7D&LhX-}OWwTXsx`NBA(RKC9TR7*T*9O3Q))FxpRgU(iIh z{BR;$el(HIP6MKZAO{uCNsx;mH$fhPyaf3O@)Lw{qgWxz$f=ylPXz+sk2F52$QIY^ z&;XSvVQZ2O@IwhU07fAlo~(%=GYv%xK~_r1Mi4mj71@-KlY`pENzi+mFKIOH+=L=a tE@`qLZFwbRR||D#RHY|wvN8??Xoj#ue~p3b}_i?i_=45 zO_q{O8swBXj7d|y8HdUnb}>N zwIO%?z~6k&-}nB#_n&PxGX){0dq4X90gCz$%&4JNNj%!9rKlSeM{#tF3gA~0qvLdd zrsXnCTpQ3SX?@%f*3eYkNMeS>rhu8$YGanTHDD!aUCb7@2kdc2z(Ml*m^1DQxZ>`B zJMIa1NSPt#jr#&Vk~YTb;`M=gl4fEJ@y0+SNtQ+>~ zoM(-4?;7PYOdr7X^X!-IUAG3*`ZdZM_^ybVYrL$xrA#@nMME8?xTeb#*UX#8Uazb~ zP2Gai#SrK*QPuyru0PuzeA!jBtg*V*HOkxgOCJ{^~$n0&SAu)(c zBuypyghUTZ^pIo__;f}{gq2H-`wwrNqA>qx7@`|gT6ydgr9KLdZiJdCZC0D5#+xYW zA}zRSDobazlLojX^msQZW1vh^U3QA{YlZ~`@23R^esqEdK7tn`A4HO#oRg@7Gz;Os z-rAOqr?#b{sor!*fW7Ixm>K-a?(so>TP%8RTk2AJG@00TAr^~oizhig7UVDT;Y>Oy zY=eE`1#w#hFAN^^w({aJ*O$5^)derWU|eNag5qch&IT4lZH7N_2Sn4l4hbG{sva;GXw3&KK~ZBq}D!*b!prz62|mus`BM%6%o_aB~6wZgD1K=a9Xtu zoR_9n3-wgb!d#|wQ~D`G+N45J>o_erabwz|*1*{tvU(NyiOo-$sZMU6HD-;s;ErG* zb2e~#yg@?aQ~Dc}@$h1(ed#f)oU4x0b~QKVR8y7j_#m8XE9}Gnsi)BMlvx_HHRn`& zJazAsv%`n5DyX|JumE?xdMAvqPQ7^GDf4FF?yS>){n6JS`&QM7T{8L2lfO3O!#*&| zhsW7ygiVj~>`3$ipI}o$@?4CMvxqF&sK|<$R4OT?d5%4Ii8Ol|(AQzW+bltwJVyW_ z=3~csA;HJ`SayFb#wH`|i5_;ihaF-8fs>wuC`2bi>1Z;+o{J^Jf-{2Yq=jfYndplmWthx8Icp$M z>@d6c1$I~ez#5Yz1IGDKLS(ClSUCcVfYLZMF~x_Yk!To}1}K@7M=}E36#=VYUoxq_ z$wN=fWDgrn5Tp)+AT4IO^raNvE&BV|iT*y4k+*JQppu(>zXlw+KkOH5OE?|9K(3Cw zUln59$F4x~eNnNBQg{26q59V-AE;G++HV%zfCeOk%=AcRjz58jC0itx45fDm`JM3{k%VMhF&jT4v;bHEg%Y6+BYf`?9VBE|Ccq6)A|g@3 
z2Z}*Jq%S#F(@6oyJJo}Y-vHKNntJS{tc~CL>M~i`R^)PU5`Cxlett!IekTwITya#P}MSmNQ8e3OYpGD|nwJ%v)3f7iIYg=`?YN{58(2RS} zO!NoEa45z@YCugTGI8J^Q{&-kmH=znGjPyG2^H1> zko&sz$jphxV<5GvFa3Q165;PnFUG-_?7^RCP0K@(@+QKkYHr!c@*mbRYx)jK#<8q4m z+)_xz_=`X;kWm0yB`*R9sensD$V4zeLK=z2;<;U#tqNX$8s>VHna-4{@# zk_?#?2STmiA%M!05_VvO0<%OXBzj!ZhWgJ-W{BkLD@s~aSm;vcLV{$LX&2}|M9GYE zeChZr2Zl~bdNNE}*R#RurnI9V{hqoHg&vyNM-%t15w;#)CA38imcT2wO$fv$fjh~*+cb908d8~X-N+e z+!z<=$oNn?2svJe@qi>H!)Zb~{I)9g(~m=K!DuTEWxYtTE1V15hHe@{~-mIBsqaAt6}|NT94?vLs} zsGoOz*mTd*nb)1kcb!>;*wXpXzH`z3+@13U`&Xt97v0_)o3C%4>-f%=>7k;{aqX>{ zx26w02ClVjnXxP}T?MAA$T*9Pr^q;p&FzmZ2Ag5}Ah4zG`nk;oSI6|R<@%NzQ`e_} z!<`+PpPY~1J@l)tf7$xWt#BI}40)Xw?vT5F$<duf)ruWS4_odr|B)-{qt55scH0Yw4~kIn-2 zy8_xMo(j+$70_@r4|p|4MQBcQ*&5LCdQQt5cw;TLsdi~ZgJMKP!|AHj>L!3uqqxZCpE?n=c1i;y@i}fLY10igHQTYNh=Ob_aZrr33|8riN@$kt|fUh{!aWW7Ick=uv}}NtaoXT8hFIy=zDrf(p5Y0#dHK zMBSu^{U%8dIxb$2m>^(FuyX_@OOOym-Www66ND=hXb&0@fM-<<38aza#6lpp z!SJYTQ~(2rPUK?|;S6k>WC{kuv5+VRgCg!J%gQ8FcndO<4XX~NPo2{ps1*@c0D)X) zY%|6J(=gX^pV?G&G%qJR1zrAZ|>%PL)efJ#u^31-x zV_(tJzU0|)&$A=X?8rNIknHw*p6z*Nd)~1<@7Yf9l28YOg%xHJl2rzYG00TkfiZQ$YU1NkgKBx&U)Ba4N(%8%T4GM6fdGaC?j|jK4LXQ8 zb47LHuIP@?jswlb$qlPF`pGswK`^Lb0o{t3vN>dJ!;*D-!Mc4Jl?SMNkh3Md)@d$L zT9aAE>DAi?0OZGmZ{h}ZiN2{BK7ErCILKDd(xqrxL3=ftNsNQsG}*Cwl20`G1ps-3 z0ywZu4$m_TE{s59VHBg)6Y!LwD}+BDN%c5liOBr&{I166O1C_Q1FNxn-(42d8S*w?I_Jv8NhU? 
z*^%5MvE;c>ELhDIxd`W4T_p5E-{HP+H3mb*jja3cAVI`K_^uAqGKQjX>sANFxW`n) zEzmRQz}Ulp=>`uR0tzWvref?9Wq78nCZd@#XH99P9bjh-r$JG#qN#uu7Svq8*P1nF zEm>%T31KXGD0N(1L z)M=)iS#2$KhRO{PC5dawJ>{w0uA(cQ@@5%MgY;j}WW7^NR#yvd#pIgxRCtAoQ8jC* z*{jUQtWjS!`PF9a6|Lx$ucEok+Qv|{TNTpflNqa_*#t$1LQJy0tS3^| zJxr2yQ;3fs?LUWoIzIfl$9zCRV}Rl&sjYQy|D^9j$b|RrZjpOkdSo(<}qZw zfW_?Dy4%KEh-o04g(4_FC}qv#w%P(CQfIhQz$@4reuN#-<~LFsSri_+huxJ1BVMZ z8F^ZyJ0X8$ZGV9k?_xBK5n^VUB)*5K7ctt5QQ6r>rkWWjgAD_99WG8X#^e3CJCYG- z0!f%;351lz;{#Ym)GosN*wYjmQ1%)We*@=5JY!WCv&L7ZvU)-^CVi_(D=GdBKoMVn z+X?h^Bjs$&J2vK-jSv?d^?65oo@s{|_|DaI*5SD}H8Yhb>=ne&VnzOey*Z~0oio?^#9-rDxS;kq_4GckMq{mHqLSKrMuZMkY*HvuWwp_P&v2Nep^M$(o#rA&C^dMyi6REdj zzUhbUAGiNFvov_DFnDafeX(QcQ$By@^-lwfo;P#Mvb%lWvcTUtxe!=%?*=9T^5%sz zclO_P{pirbD~s;EQ04{3pO59W*yw;d=t+(jSG7$#b*?|=ywJB(txZ_SzKkCfb$({t_O z%*8qS{cp_iWme=N(_OSVXZ^?mtlQE!io9y`(Yd%@0iWoUr8B zQgCcpcxkEUaG~dL-f_5KI`Vrx)IQRx?MhBs@15WIh=Nbafa%lawEvEKssC`H|1g&L zO9rog`vZ%;q=zr?t5!$Jh$%Q0N6CaKGv#oXESR!Vt!&ALDLd8DUUFc{Nwu~wwd^Rg z?6|`fTK1G&Sm>tAj$CH;)LiH6o51|)=fev-=0`tv-0|HJ?;f~w@kj95Uo4pRKk{Jf zo-;J8%;~#fy>2b(F$42xXuXlWo-G+k7WA&}YsY7fmrNuJBD9>fkgSygRcq{etYjlu zJLPG((R010&NGJ7QCBFK2lakc^gshI=>OT2}<>( ztU&=b?*%@;)JV#jK#zap?DeyyW|D1DK$Tiawr$mhw3AYn+Pw9P4ou3-JUR3-=9i4X zS;s)Ekts--1Rxrhi5Q^+;cDF!8p8gGPxk)C-i*LshyfPCsDD?hl?S+}XD-J_w&PIg+kL(J}a3Y!>ElZOGj^fP5qRpz$Op0C&34pFZmJ;`neD6v>Bet73 z{PZbSl|5v9B5SEOdi2?XIlrhaDJx2Zn$dscu!cEh$h%qRk%cM@>* zOO81ZJpX0KEMJ3a-9XJ@)x$#{2k>Qw^#V!j-KSgz&=;iOZOFs-LM3P#K(|m6Qq7*S z_g|HQLItHDJGL7MFL>1*jj*SM4Bx{7l}4>9x$8)>W&h4-qL(1Y%0|4*M?&BM34Sn1 z_7E6<6}OJwD%U=?8z?Zq#;06ShLoLKuqRc?pINQay*K6 z3XH>n%ZXSXAtEH@eUwL5n~@Q5@%VU&{&8guTm`I==TZg+zmWhB8$O1A#1PqlWOKxR243=#io z0ckT~JwfnAS-++^zs#l79IrAd99gzYX(9;`RZ;kis)H;ix;XV$E9Cb&K!}9Dg~o$4 zaKFf9gN+0n7`?qkS7Z7AvUks-cQ43ZUc>ZpkiE>-B~wSi)UnKTEHa(TO#Pg7k?DG* z*P0E}FQG_bFEYL&V_RN)aQLmyG(1pP_(_#44B?%Sc0NS} zK!~0LwHeZ`X9ALve_Rz*Eh*%>NzXr1-zwTwWvUU7>6NxT69jqUL}4?yP6q`e5K7ZW z`t;%NujJlDmE$G7<*M?&$ZrcTx#X*=N0wIUi_T?IeN`QBjqvLSr55$G!e3#;V)T8C 
zKE$XABNQQtQ#SEsk=oVQ`>Oogwu7>R*ctVAZL$dY5q83tB-`Xre<(jVW-qyf{669D zp$wiGC2BMjRnpQl{fMDx+h>&TKdJiPP%r)~_2Oq#=NB3+t@(n{(VEXGi2h*KKx7~h zMIWU9pn>EU259pK=)Cnas{h|j-q|-7O>NVL5;WU87cNoonU_Bc^5+is2n$Q*7irpA z1PKitT(l(9wdIFrKR&Bu2KW9X`=hL!d7w8v(qog-Aq}N5mo$6nrXpjzW}UH?w2*$F zr1g*nkr7<7Vl%Ol5p#^1Ghq%z*xy?q1ro9X=kZ_A2Q{>L_S{n4?n2$}#}wvDU)O;! zSFCF(wz0)cTZ$XI@qfpr&s&evw4pp}%#_E*3=D3vlW7@AwlX@&!tnCcEF^1H1ymc! z+SOVI$vP>!yW}ECH|6q{JS6F*8e2*}lB}a#bto#&r3=o^QaveaP+-iRF4#Ltjijt; XMOiZ`YoiR7D`($3`?wvm1o!Oeo1y@$8zlWA=ybb4?BuemgZ2hWKmRxBPX&) z8+sccRufd>Vo}k3tQBB^XdAasiU4uY7V9k*SOfjxgxM7_(4t<{MUg)`cKXqu_S`$1 z;fN-sU7!Q$^1jZw=bn4-dE9&c&1y9h@J;DHn(*!=i2pzZ`SX-APaf(CVwqrwQGy{E z%@jFG;$KZd8`F?PLWfg5P7R|*67|wfQ3=zi3FmcF<^(-TCoH2DT-Hxn6Sh$s&Ksue z3CE}d^2Rth*(MKN{$;4Ri!sGrV+O|jfo`;pp`mPne=Gc>He1{{xue`)&M}(nMuHeK zGj@RT#4VFOa!dJ_aeP30Rw|A*)ac`cG0^xr#`S?t>Fur2yDm;o0v{_s#$BV{Q@tK$ zTdx|;Ut_d}>d=h0M!l~_y}w3%phkUT_4+Xj)1;z0pdnyQ2P%0%C_HDHYpi)ojrw4X z`qmotZ8hrg-eB5mw06{}@2pYZRinPUdi|K2*`eZ6)Bn@jhH9*df!V1Vt$Euu*;Sot zrl&^z?i%&IHR`uslN!8-oz=rj|YonfL}oC$Kt;MfEgyapJ8)A2Zy3Zho%pO~0VP9?|Z zdd=S=QFA^-iVd;k>@*jNzc&+SxybZvBEltMW~q=y)NoUxfnz7QaAXGG^SXho9(FB;f5H_J}PG%B&D zg*^Iy@>58b37(iFRjs8gr$9{Vo^AcC1i^#qy$&k-I&|U5>i`2OsONbNhep8Pq(RP= ze|e3JVbWBtm2SU@hn8Ob=;%7# zEYQt)I+)W1p8ys@B+rSIw2fx2iq5HIEINg^l|)ZUbd?B`wpk<*=b|xK73D)Kig&;V zkbxbbyV=g`CY9!L=4L~A9jo`?wRg$SN_*v1*h|v-^BP_YCJ)*fFfQ61^LkF9fYtUU>a8B^@u@>kY(u$sEc^jLV#q4Ckk|II|wjO@i29$VFf)=0kK_cqUA=anR zzJomgA6tj2op91;X1Ta@yx_2(W1FDf5C5q*A%WGg5vD+n>MBtFb!v}5?a5JtkL=-s zrFq@5U$E@IZ+fcLILt*tV=@;lgw4L>UAUBM>Rk)ni+;XaXgVfXj-^j-SlmmmXV2s< zUFnkr+L0b%k?FeW2r3J}LJK;A40*L5$PaVMQJho`Aap5xJjr8bDu+j5soVn&AW`kW z$q@6xdljf)$h(@)5KZvSQ=B4-c}f|bQelMST=|uI@)SDm z^Cne)bd)PT*ufG7CcZ?&U9C(3spd&F>Pg;!s8?t+tH!Fn4D%KhWObhE z6rNgDeQg!ffgtjuKu$UrW3^5%`k>d1y_v|7T$)}!irOaJRV7W9MlnWAmGlk zXz#;O)Fq;+tD-)M?2w<6EIMG{qOpEXie|JCQ(SaBo(fUYVK&dO$#FKCh``A;MA_){ zc$^JFZ@kS#LkuJ%=uLeJv|L=&;yBiYG9Gg|(tjV#l6j(H+A)O_4RL0DL%p*ic83av*yQWTUy(aNfN)eQv|+zhk^@EKqGLZwu7kg4LgW 
zOR(-J*z8Y@I-en@^E@>ZzRq0d!MwL0N*h-D;+qR^<{Eb7t)X1c5y5&S{o;ns@Zss3 zr*qb(6{}#~y?X85xd+3-;Dz7lEEVKi7Wb zzFBBLn-2`5}Gkyp=*o)qn>LgeM46Qc9g7luYoMpV_J=b0+hGZp}b%O6teSUggY z{t@CapgLw%kFp{z8dW{3hzmH#UilEtH}Oq;0}_$*&D%ox6{r@zh4*2oph}nqK9(Yf{GTK;tSa(8<@c77p1N_~Scyk20T5z7vd~OK<|o zj}NHEs3LMfiF6{oOxcR;Qb9OM=v6^?|BCM@sS z1CSRjNPL$Be~LxM40{MAL6merBATOIGBFX0L>Y!fmtdigXy=j~cn*q$WuHSG!DR$3 z@SbRsenHl<6HI(hML8^5O_RI&=pGgV>H3QjLr?hy{0 z<|He6HWd}Xs?}k!uA(<4H&CTgL9|69OuGaL`Y-;T=vJ2$1l4N8?2BkE+EH=}C8tqx z1|?@vG7O1mlQfvLBA~yiao{}`!KRWfotVb@U!;|CBqbV@e)qD| zfb37;KeZ&O-$A0@yZmF#Z(!Yh;uf_5J-ajA z*{+p=)oG!9Q1B1ln-lz}A8_BeUIQH|xhw^XH#?B)IxGYZ3zoz8-V!XQA6x?;&||)3 zdP)(Xj(6PNu@YXnvPRyEuKSPOI$N>}R!pm>bN#1{)&Do;lz6!jkEs-J2cy=!YBaou3~0pd>C9~a!m?~e-ZmkREVr+Tf^1-A*Fwp_>lT>psBF(SA}idw?iR-_2ej@-_}xg!^a zofif7MO3@vseu6F0Py8}J6GQnd%;zWlru{MpJTC^I_TYU8c*cAl!*1H*&852~KgUWGTAl;$k) zBT{vtVILzA7G4V%5)^OcExfsWNm0#<=B=_S18W}{EbxJtFZ)fJ+LSPgg+FiSXbBd^ zmTA}m5~xF7y`nZ{SgZg`u5yibolGyNOmZ6x)!WM$G`waT4AtAq7!15&8w}Ok%NUHj zaT^TP+shbCylEQ@)!P*eV{YI>c?p!9v?F;P_=3bN346)5&0ex8Yh&YSbalF2f2o%Y z#TfsF`U{doo+5Rg?FKr&#dn~{K9nxebl5i&O~^EOH_FbW%5K^O5d;f>s#HnuAlNE; zDP#nH@k$gNH{>R<=+;TJPsiWmA~VT%!B0$%MfQqXxcSC%Tr@}~L&)*R%M47Se4}!L z#RMS_Q>3p0%ytt06fbJ?|H=b+ao@Spk@tXYbF!tP3+D%hNm zIGmt7Ts`Zqy@G4+T3Emu!{uG>$@FC3$@Ha%AJX;d6Ax+k;`0m7!!;1yv|<$K?$z+u zbWcv#lcRec=^C?4_JW}61W&L)c?*=Y;BG0n0tH9IhOcGa*C+V;*66%%@OwL@H#~I{ z7W=I^NtX%s)|COc{{^+B!s1-CFW9rBK>0B`TY<7IS{JNK(LBPg&btegdz}ghR3JN) zr&#Pu%95u*dDf{$fojaYlBZfKOVB^CPPGZ(--h!P%x-gO%dv~0 zCcUA!lW+x=TQV&xPQlq;sB0=Tw63~y4Z9I@+$|g4=5=qc;O$+#k@r4V@OQ2@2>v}A z%^mB_`-SHHYj52@B^(&eH;+6ygpdsqdK!DB>@`@+7`j!LKgDr8jx~$QR4=(oa!2&5 zx>s{=(^Gk(nD?l9R&z1SJ?puWRshNF0jXUtoAuc3dkOSeg9;ZquN5D-U3IcsGVfLO zHCJ?mB40O7a!U1Nl{+T5TPyKk-dCm!#&F2>a98yeP~}&RRL$Yi@eRCQc87TXCZ{gI zyTGXfmsa}t0PmEs&j^4?AJ zZ03D@^JJYo&+_lqYaq*Oa1lXnUI#5c@B8?lbn3fK^=_8AXoVTs6iS|ZE%|Nr#YZD?o;;I&RntK2%bx>WY1s?>r43{{D-8Faj7UdgaLYP71x`N8BJ)J@@LZ|nEE|h&^6|{j zHBy?l7G6j3t4F0Cttd-s)l+eMD!q)eKZfD{6aG^_#Txc$FLE@j8EZD0cLdXCHo%ih 
zWD+Yw`TDN(OBJ4+$De(E^`(0k*F7)ZG9gz)^6M_-9i6xI2=Yqi%8D^N_l4$5^XKMo z>YuwsZ}?kQ$d!TL`g;^_tKeu_G39!Ogy4|i7`lH}aGZal)%a|;&OD|FM{_lotF!yl z3!hv7MYIyi^$rPbLxLLtx)ETPE9Ynff2lcm=fZ91)3Gv_+czw94-4+$2b17XL7!&V zk{PBf$ocyOTOZ)I z?s;fy$aV|1RzPiS1Jrgmpticf>$7?n&o7+M9$63c3W47Bz#$=UC~rCZ-?RZPb`KEs zr%ImO57GX(iINXc@*yOfUaHWWD)!Y*fcC(?nt|6>wRttz3p3X6(IgL!Qh8hyQ+`)5 zfzM^a-kC<_ogv#2e^vc2fRH7Y3ahbK%wGu$s>R@plI$I?OIt* zV*hDt3}n?2${3LUv^9q6?PUyF@RYa4P`$m30Uq18!BD-ujDf7W4h;H*_&(;u@AU>@4a)vsJdS zbRqF30z#;y=P@iOQ|!B_7U5>!gCvC*sgiP%gfoHTP?BoP(B2LLEwv9zwW5n~y1-j- zHm^H71ZT&}D}u8-eYW6gUUzi}E>w1Pr-wIe&ZS*hE^q4q_kym^HVAZZW$0_VGpFm! z(VZI>*W&QPaF$-VkhkoKpxR5gO zdN@KQVK`57=tUEMlhs}lv^>QrX7Qv;?pyx7YmglgxNbAttb>b~YDOo`8^=__m^Y+5 zwiy{ifFLKTt%Pb+!}fUKst!0grr9)?`tls)oC>Rz*TZEC@gA9i@NE!nRSaF!K5ye~ ztV6{izl^K=J8f6BSK;xzT?3bO@J{rHbK0@ZoHbR~;19>|WQcheufGXgm~1T1M9$5- zrmY-`Xu%(f$l`U%E^u+JN()5OK&uO`taQ8qu0b8V^RDr;lEVpnE!BqmW28KEuXS5P%n)n7#&4h$v1H-jLzMOOIFRxPsJ2t6D8` z3t}A%O#B+e6-aJ}x#|THzHe?vD6pX4eCZr=iaLl)NJu`1X(9zTM&r|QM%1C3Bld3* z7<&B0peK9`%Y3l z$lhT88WNE_jRzBr*pQDT1+N+eba zqAn&lB_0b2QVJN2Vl@(&$P{u$o9sau9eb>8LYGS3xEwk2}uLdKG#npVcXIQiwd z&(8sNw;??O$ie(Rk!v3Kj?gq4l;A(8q6XmYL?3u;Ai4&B5?{XhDTM zJA*5oYxVy;`->Z2-T1~ioIbl@cQ4*pxRD*WbM*GnwXR?3e@*|A{>DC%KK;8W99D;0fUS(g0SF6FLF8xXtiXq32Ei!X(&mB?PGmJfx4ZBo9gV+7IYfnYKK=o6 z&je;RDf@DE8f|49qY3e|Y@C?|QLqLu8MHysnTOCE2VwCPJoSb8=7O)Y;BJ)o^VDjz z7>a~ZZ@{bwk>^8gqA?O-lCelcI_UdQpl|QyK;K>|(6?6!|3>ZLN3*3UAw-acAnxu z^sv&U|1?5qoK8j{wv54YP%_5Rienjq*NNSMipcbrL=H8G*%?2|{wvhMB7&HK=@zvl zNj^4cN!=4OL0bQdIQCoO*zbt*|4!`q9dYV+1h~uwvbE6AxXfqxq88=%?8Cng_0$tR zl#4DG=`0w{MJ@b3v^k4<_yup+8z>r44*dSQq6y_73@fni?GwCxYaN33;1e2E?j$tS zhyI)X#~N_gpJ)kO1W1vX@;TTEx9q% zSTv!MnXow)FD+au(x_xXHw06esqd^%Dq%qN$4Ldn^W;Sm=z$mRsfDTHE?n#-JPpeu znUP{2F7A;>2;<^j^f-PgbE&uw7xxohKjzZ`Ts%lLHs2Y%J@{QeDwf8?E#S^;>}k)3 zO*fnVtfiF;#?eAfOlh5c<5?CPaj1+DgZAXo<)9vFB)-?B3ipXwR~bJnsCuf zfW)66CkHy z@bBRAgrpJGPKmSe;0yMY38W@I&fPx@h+Hm zX{uVtKZKDHnR>{>tV?q+q7p1^gS51$R552%Whu?^J1dk*RHOPoC5iU#$MzwTJXoNt 
ziZFP8=^{4=YfzCjyZp1|j?OqekRt$yfESooS(TZzrgw=(XFbe_dEl==> w#nT!)X($*iAM!Wh^gv!27VfT80iIf+gpc_NB{r; literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/deepep_ht_prepare_finalize.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/deepep_ht_prepare_finalize.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c094127de3d63e1b2f643ca97401fed18748dd4 GIT binary patch literal 13810 zcmdTrTWlQHb$9kTyF2^f?(+Rwz9cT+lw`fF*cK(rmPJ3Lz$vR{yxbX*OYMW19m*oT zG!(-qmvRb8ZBj}NTGRQlR0*|$6o~uP6bMih{n-(f?sVb+3Syw}SJ$CjyXi-I&YhVh zMJ{ReE9eDv_dd=&_ug~QJ+C|ZSFhJiASG?@C4PB`kl$d%Om?HN_H!sK6M+a!lBAe1 zhQYEWX&JN7U+b8a{@POZxP>7p2hE){XUAL=+M0ByJYybOwk5fgcg&mejrnNZp7f^z zV*y%rB&$-vu^=rwlc7|2ES#zyt4`I7)ud|2YEyM%b*cKX`c%VML#lDCks%iHJQ3J; ziQp1l(+AC^um9e$>gBQKIZM?28CIf<#vYF)ldo ziOHx%Yd)8mIsLkrn4FTIO$)Dy!fafGPB}3-o0-jOjTO2_#H2VGlSM#UpUQ~*1yM?i zN&Y*CH!&~riS*2@jMGb**|fmV&O}+QS{Tpp@mWa{(=tCh9+Tr!d_vGX*es@Tje=H# zl}S{XJC6h|&_#SJ6IYq5u3n(`8*|~IFi(=ngLQD#L z43=dKo{%yriqb$$(d-OLQ5p-0ub%-SVWYD4BVejUU?!1_F#-VwV#chZO|Xb|!74hY zeH#bqWx*!eCme$PUE3X_bBq-naM&i!4MPMc;9=pP3&yZ{GQkM$ceybS#bFUV@7l&V zD0?R?0taQE=!aThx_tu*qX-b*4KoH2gW#KY5&}Jb7#o6?%Ih_u0ss{TDA#n)R{UjL zRRC4+$MBvGgs#}?Oofn;|_Lh)2CB^H&9hSfxz@gh1CUwZdg&rA819D=oJv6 zu%b?(XPbTXZqxpMr5#eH5AO4eTd5-SLm%+0aqj}c0IX=&A9O`{gM7jutYh~db{&Eb z)-eR@=+2W%%vkg+i4JQGBceES;+0e9UXjF^m?VM0DY-*Taw5Xq7wXNhc>0EA%g9RvS@ z1!|SC0R+&%I+_6M0&u`AU=FYeSxn)2B^DcOQV4z|R6<(ShTZVvbF!G#e3vFfiI2&k zLUBwMH-bd_-Nc}rni-r)%=F7K2~>LjrP%|A4qiAQ4ki=hgEMpTR3<%mF_}yarZNK9 z4&o&-J}YOW!DMVsl(K^pXshsGTnq|uK;jHPMXdrnB7VX+pn;h=tu+;!8y9(MXw1&= zbTq$Vo%4scKj49{8QTV%$)1AjGARW}bN8*Tj|OfJC`|_oY`t3Fl=rEgrWmP)mVp`LwCC(6#?#-)htI2<4BgocM6`gHgr5r@-!616)LNjL%L0)ii&4HpL5x z>u|gl78BIAG)XMV+I1X4$Phk0yNnkS`l#`w&f%$%fFvuq~ z%^@?L1M3rK`wY>3hJBv}Fss8Xg4r{eUB>JhW-E|IU79^R3)|IL9<0ZN%=1z+v}=J) zG~%UJXqF<_=Z&FNhqfYeh31?Bqb>&P)7NAvSW@QkVd?L$B)aMZ9`*v8)7HCv{ 
zYsjp0L0i{RCK67(@rF#7?*eofX&M*1DV-(KQ_vK(YL?UmtXJuf#m?Ui7Vyq0yPf*|;x3~rR3{0Oz$$Oc6>aQyLdlY|9(Z5&m?=7%<>5fMq z;|tsXs()%pG%mc14H9IPhGL3WvZ;tyYa zYw4{5+pu-@NK8u5Kc%a$e9@q_0!LkBBxa7tWxkp&OHx2(q-mE~TK}y)#~tJPn-sFs zYb~L~0j&WFk{4$*$8)+~384pV>J{*8(a+yP!!Y%;H$lx0?m)##173#(WcNV^m(b%c zdLoJ^a?4iiKCZx@=eQcG{ls5$_3fp%i~eE7KYXvVxaWCg&-1JPQw8?a|4({LXb?y* zAmxKdXb47(C(`0}bPiFnJq|QK@O5N<9RB>1Q+Y`758VqFhfgZQCs+Mof;>;DtU2^+ zu5t92&&Fe@DZNxb>eoZaYE|piH)%X5t9ejF=!QsEV`IbmNR6YR9%Ik(vH<|=^us@T zWqfcc7RjQ0(Xr@U^e+Y$s}_Tcp~W!hD+s5}GFL*f!PFa#m313-iBuBGm9CL_ITqM$ z##Fjy)@cu{bW<{oCNQE{ff1OymOF+ybRK>eTnp|6&jNSFpJT3smqL;aOe@QT1fMAMs!L&k~y+JFX#T6Rdz2>A5MHtIXkk(YD%@E-{jaAu`(SG@0-m1UB#5 z?ii~&4+7~*_)684;FZ9F_ljSx-VDFs%Xwjj8nb=X?RjUONxBIkE1#PC)ZEh4&F{xS0LE3kqMI7V^GkCeh7H8Xw!amnOu{X-@g8~%2lac4gG5@ za;*y2y2`bwT&>FWt6Zzfg;lQZQOI5ArUEhQmGI_BcqqE3;f%*-z%LiI$JHk|lwf*l zwqzzfsj;TMluqNgLCkQ65}txIgc+L1(P~Nv>1Awq1v6?P;koOy!V}R6M)~F2Smhyu z=(XVvn$F+)IZ1CI)Sn=fGqxuNvGYsP6gDL=Lk~c@fLRi=6l7Vv&~Kn4^Tr$W2TIS- zq#pyvZ=mq}?k7Bo7-Aj-zWx1wgI63THSM>;h3fvi`;nWt_B>$sK4MA3&^_BvJwNso zhmI>l$5-n|@;>mNy;WCFFP$!Oy$aW>R<{?ccPrJqSF59HU5DBjRqI>S&S=R)ynSoL z;dOuRb-3L0;5t=qm&!H4`IdqNJSq3Td#crj$Vy+KelY(cw3&z_3Kvm*bzpR;k-ciy z(`s|S+J5x0sp@-54${&JAjMFh66#wG4bZupi}k%qeeeDHXaD~6PtFvEp2fAa5^viY z3BX$XPFxExS0h~!FjNqh^QfDIx>tkIyi<+r%G<7TOWd{B6}I_Pw*3Ly`K8Sra6clD zm5AG=bDe%*qB0QMysb8*4OTPxc(VY0Sgg(~Mxj5JDx` zc^G3pQ7zcy@=mAG2;Y<{>AajH=dPsdK%W`L_X3x5O$Rrjf@=0C?ifaw_1ikk9is?4 zIRLk2w_I!Xfre%Q9|kQpCBY8u=y-2RcO7M0!Jsza@PMAT<(w0C(0;TlXui z`#)~G-+Cl}`bOK8mwyHtsPTqSY)VQ_dg1fmPoOApVGXq*nC`RJ`O7B z-qcSsKhBh_q-BIzBbF97_@5vn15HK$z5Ds{UGZomTy=E5rBwouBlMsNp93 zi}V#E2bIV{wY$Gue&liV1(iVeHTK2;%EW4wYf!lsU1XI!4irNmQMdYBKpkDspsR}U z8tEVuDhCaG5Tl#PNOG2^M%sLI2l_t_`W50VhX^!FY*-qHrm``Z)!fF}r(!c&Q*34? 
zIfp@A8Sa5@a-ac7-<2P8H&F3XG5L48m~==-z^sPy<|HudZUP$jDAuyh7a@je3W0YR zH0LXcQxLL9oK4BZM}zWA&RI!-LZ}~CNE2XpPL~Cr*$TqdSxE>&E1D9U60}w@`0rXD z6FuOjAl-!kf0V0-6m&ND$q}18L z=L#DLLnnx96d;g5AlgT%7@R+{(`iQ%*+?dxoQotC?l~9N>yN zpTN|LauEK-XBZ2_mclSf`YvWvo>AFEh1y%#hLTNs2QtleF(JMw%|bEC>5?vweaO1f z5@uA$UBS|On4uA@*>Gu^H8%Vm%>|hu+q~dB&t$T)VKk%OTbF4OJu0htjbOHMzq)t0 zUynil4`9pwCuEn&=V8KitD(l_r>{S~QnecD$d5b>)!rEX^Ft+K@%q%jnKdi?UONPq zP%wNgxtzJ4SsDLu>Vv7(nn9&%F#p^`Pi@iDbKleRY4d(H5L7`Z1-g_#m)g`_Y}%(Z z?Nd94ik%}$=SaB)q)&@)X*7TQT7Ah!0`&!d=dJpWT5q?mMh|}+yC40A+SK*_=*`hv z!?$9qO+(jQYExwC%)?+fe_X9=%D+Gn4J%E<`7;kOx*Spb5j9k!*0-w-?P^O8+KAQd zYJFG9PXb_+u_(@1Q=DzIP09iRA82W8yn-;mi=MLrOWMJ%aGET+z*n{WU6qwR*3XZO-W1w%VrR}6J2p)Qs4fhR-#8Srhu zChdmmnx753^X%tA5~zD`9v+^IsE-~s*FyShnjvA0#u`Je;-@fr1fIt`22M^!XTQpV zb_%SUMP|{v=!0->nYJd?7FZAi0I@mcSPT{{xNkC-4PgO7 z(jk{m*%VBYTlSb^&5;X^oMXB|+y$a8#)+@TbQUafIl5wwM~k_Vh?UEhWu@~!H$-=B;U7E@U#Bb#>_*%9p!Hz*-13W%+ZA#{T!A5l&@uD^p z#O#B3k?pSCydCF3&0g^?c&_;5j?IujHK*jtd-J|LGhr1R@3LTxbMPX)1*b&4jwNQg z%fzw%yW|y|cUkjv83NCVu2KD2+AiGyIO!&4>%#XvsLuDE0JAnuwE}%IevHW@;}gkD zOdj6f7n#h+k&~b)&La+Vqjf|AmcE5q<){rRepI7t--GN2R*2ecQN@J8$ashY)QLx^UBU438&;k5qxUJlBXBTg4kV!_SDsEit_TRw9ANFmgt zhU<&rekI)hAUya~YS z4m4=)1blaqg}1i2DB)v@gaSmfQ{zDo=1{FZf~~B1?&|>_DT5tOg96ehVnAz5LTMqb zKo<4uGJZQ&ll}@Je~6ik8D%159gX27^hcx*Fq?ubiv*9nQ5OCfdC?Vst4zP0y%+Nv zho_OG$07S2JSvCF0f(yU<)-URHzt*;U3rJ0I^fy47~y z)BB0PzR)oAV9%Kc{?SjOCs(7V3V~f}`@v%SF{S;OTHC009#ESh0@J4UzW@=Kj&DE& zro&g@TJaSdV>+t8z3A^&{M~m?d^CD{^uGTv*b)#D@f|80K2tn=RylmOaOhlt`zDwc zE_cCy_>rSh5D0vx_usntRQ$-?s*N7|PR#*uBTzmG~?4uguYJr#YUwcCTWpjCtmTE}2xxh8StaTmVUnp>$ zg;3{1E_n5&rI&6r-0RH0^ng16)9B96#v50*7LLMsJ&CmdX0b7T!I^U!y37f-!a+a) z&+H$)*y#Y z4c6S^jvru{K!1T0K*d5S;PPJeErG2Bp}I{JsQSWh+K$m4{Nf}XONl(Mxq0~DNSIAx znd9+u=A_Z%fav~0qiK&brO~PC>+M+uYjaWT}*$bHUW7YzhW*g6Bk`kVc z#?D9?{H$k=24y6)Ql)1wdk(XckZBJ1`Wl~XPh$(pDa~;Xp6D}@gvv-laZOzkjXQx~ zN52fA{vuVlvxcq|RgB_}Pi zBv@T?(W09)N51gTVjpAgS_!QY_$fIUdw69Zmcua6;VW4f`@WSu#g1c2$Fbi+^FEwR 
zZ>GSvLbv4G4aNY*-UwfAo&MnTy-uZbKQ=a&99H`a%*y14=?~KPWM$WpHG&N#x7B`% zS$X}#Z-4OZdxFw+7@JQ)^DB(q{V-6CnyzHW3M`|xX*qd4xpGdaiI$wSjU{!>%em{h zl8aW|q^a%wqc@M1JhaM@n);HL7JURXYTxy~m9P>7KZv#kNNrE4iWY;!9b601qF_0W zQ~PU*{tm_8QL{G%|eQf_n?jJZMJW_H| zC^vzSSJTa=TQ;S>ujHX^`g&;5NBV~DS#KXI`DrykS~}i8fAjpU5v6&kR7Kl@k1aVw z+rqfyYFexzZm#HRR$R?%wY1?yIHK^1%{YQy*aL*?K!}&@P%cxiyu$k zpA1WzCc`FuGA!wnVF@QQHX6cu&o}IKWzK*y%%&5s{lZDOMa{eNT{|{g(YN#Ya_TftUYH5ZI}(p8)n1W`fONRpABpJY*^Yj8`cBJ zZdF+7soKEiP~LzZYwPH-wvHZaI(jT^M2~eq@B6X}N>n9Lc@k~W{P9dO3C2Ere3c!D zjmLFaNG0wF3a7pJLFvHbACwO0ACwN1KPbii*Xb878UyI1RZA~dsdBAKP(Ev=dkpbhZJt31W)eQ11W zb+qr|@W7iH3BFw((En!+_!7GLO)X>(08)2Cx0Nw2Nj(Z`b!!dgmvM&5+3*Fmt_%(# zJZb|&rytdo>@n$Ipbb)b}T4$(CqYk|jI-j=y3#PHfAL?KFkpc1v?t5+#0hmvk(J z$~i#}p_=q8wJ4fCynd{nAtoZa#D@F84L)Oy0z1|u!haK{v1Y*` znx_tEZz@y4`IvfF_4;v}z`bD^Yc-K)iQs;N2p-Wfb)=G_mcMV-e#hEpOd;>*SO_sH z`%E|%3*U%|qfl@~6VWsuOHNFP5)=bxGqNaLOpd&e2`AFeBopJ&38-$lmQ0<0NsLZR zrcWh=E25Bzh-bu@I1x^ZA(Pq^Pm27sC?&)g|6S}DofY|LB9%$2?TMrm567_LMmQar z6a`+6gkz%2%ah@h7&5DM!i^*!$w-ozNb?!mh>r@Y3#-Kh@+qhdScr#bZiu{;N$`>w zxyer^lhdEyhAF9>cq=7JY5o>Jo;sv@%O!0f)h3DQjFgC&aQ^yZg3Q-xlVzs#<*e3bjsY4-b<-m-hA7OLpoM+YYbd5!qi%|dxlz)7jowr4 zN9(baH3_D)$EY_pd(I4;KBl!`F1KQ`&~6quuG+ThT#WTFc)ChLmf1dM5S}Y^iGzY@ zY`+jpiwQX?1*3`JcxvA;Z6)CreEz@ie2&WBdgAeyhR3~QBCQj zYE37l$fRl?k0rzDeZ#8dW>kDxZ8$SBa(U!3|H6x>Mz8UwFI;-&Jb(4Mv5}BP!r4o> zFsdt&iSwF3$f~<68OV`rfd~SDz2RmT|IeAPu|s2tlwFv->cN`U9BI! 
zeW~bgFZjC@f7i;XRsXKr=hte3OP3Ze{e0y8i|<`rtsO4Z9#m=%7TLhUm4{7x*UhFn z+wF@b8)?{8sNb#B@4kEVBhyExR_l)gDXwYxlEU>BTtkX$=)s=C@H5KrGpoZRtFE(o z%h`3{h(s_k9>1Ph7eHX%1U)N~F-9PuTbVKQ1nAj+V8$$>RWOS-!6MqmErRt8iyUVg_^plJzIO0Tlt|?gAjmLd$+a% zEnmI9YVEkG2}t1RLhWNltrMGpqd$wP!70)nXN7v;w*_e6>NG&URcNfvqwttfgyzSz zZ-KRIeat($rb3(O6pR_P!Hyi;x?p720nJ>$Sr292Mp^TwfPJflab38E?m}32>`FN zx-pqOM$M`IV$i4#X5?sMBB=EX#*#1h#{i!tA~Qh^o5=&TTZoMkz#LHma3e}MOnjKs zV$z67D<(lSpOaBR5EDF_V#y0z9I1p!4yd3Yssx!Ol7>`!rYfP9UN^_2%kU^$0gBRP zNR&o;^Z{VQ%mR1!_0i6dJLZUKf<#Q;0!|U&WX-7I@E2M_4olN8>Z0jwzXjUOnT@^% zcsoPB<$}JJtR+jPuY=uX-gaayi2tt=3rWK}?GXT`{e%cGhG0ReKOsM}7_?qzusu?L znS`vP5}LC=a_8t81+T&y1!wmKqjGRU5@F$`V0to~fJ6+&!?&XGOgt!OQmLer7KNa; zut8kdpi~2NLN=)re$@J_FJ8WU>B_Z{GyJ*d&YT$;rJ&}jYL1Bs)eOj8wMyZ{gs3{d z5sqcV5lKo)A*-|l`?~q^0Q{t^`Vjm)Qo}@5=9Y~6fWRjxwPCLg9atwb2_XaE1wjp8 zvGKD<9xsUlf1)ocefdL3Zj-NUq-kHF@j0dOxt#m6miFbj)t3G|+xV&153ia#@*RT@ zI+c!n51aO{Tg`6Izc=k)I8`!}n)c-QQ_<7Bd~~_LM3~RWS`w_U}q13Ak>HdqYq z#ox9POZtZ%bl*J&FS%lHP^)y3uHKyGS8Rub_*->r@LhKjs(XpQhK5V99ou?|%nSh! zue@E*MSrBWba|tyd4L-}B7_uh6U(hp<YW`31#gIG)i4l%{dIj7 ztlDuvwZ@VVUVxB=RgNSfzQC$R4>+@Q2D;F~aC#&{6Lp&w15urVcq22xCle7-Is~hy zaxaIa1aP2LVUz{WOClT>c|K&44nu`>6cQO%)zGQlR8mfBhlepzUMj6e$O-eOJ-Beesm7oOAg`=eod@y4?PZ# z7k|Z637xx&t9x`^+tc!#iVB6Vm(x0zDdl|(SXCEkd zyA^MDp6%Ys95m6G!_%@vW|(`X(QEgJ^ehxB7*K1HnKVSAc!)GeG;k%%?%8HDwzPf~ z=&hq#r3`K5qr6BM*sIj%)l|V0GP~)yN+i^25DGKJr`1Ly0(Yu>9?oJ2EvaC~f>gEx1dYAE-rj zx#_9Cw)b_owy}oQILq28Z)KMTV$D@Ye$AJ)-lnnYGy;42WUX1du}T|z6tZZ}3))F4 zteOkr)l8x51{l$4EkG~fTxhUT!WB@RQSdS4bQsMeJrK5QG|nsGYDp+55=tQzR|*nT zmnN=s7&U;n8BH%e@7iU71kNZ4IWW_iJ8)&>iKJ|)nX z4;)1Iq^;oVReZg7Yd-Pyt<^P^?8IGLauQ#|!{(t+yhCg49gD67X5kPxBg?%DujG9@ zfJR4f{!(sa;n1hPmgSL^L#w_$1>d0J8+w^Jf=MFFW&ru2rr($9!5pSmfMAh$nNM z3fD<9-3r%DGkX+nPq}8@WovNGyK+qK5CougO6OPnD`WY<-c@cW$E>*;3a&22)s=S* z76UE$Ku@0Qf%Macryfjx>{@NOkUO^q?pANEtgjw160nGmT50JysF9w>WV12w=-5u? 
z!}3fbG8@`vn;vcYk4WwUCKS?}!b2)YRqV_3&PrBUP2mc%E;ss3Ks=mMTf?bTY=)P> z@Plk>9RTZT_g4{=P2)kuMb+!SfXaK5yve+2f7AP>Z;8xTbqqFO{T!QR=c~E~H;LTv zJc51}V-D5Z&YP1S=|Q*$yMwj9#xZ^Y*FA z+36i$YpY&A<=UWZy(gl)))=0;%w4U zP=ScNHa!(B1D%qKZaSttFrGgu+mlTFs90;Vxu>V zVxB?`x#=FTU@2=eRldoF~gcpK2z@fzg+&2Tr#) z=gZmQZp8|BCoCH2kXthVQa3c2-4#qE(|A*-pDJ+kVo^CAOpaH^sbmQ^k>26W9@dVg z5m+pHV>H@DZ+>N@N42+%bm(nwaA^3@+1Zm2nS?0QBaumPFhx1hlQuXgyNT%ffz9Zr z_Cj|%XOC`oWYOD9CSo(eXgn1Y<8Z4C(*ZmT&SPr&>7i?Yv+a6_M&+oY!ArWRJgNG$ zi{g#z8!7f#THJ$Lh*#H3s(lo0MWI>GH z6ea1~&`cVG1Y&&X7^*e8>e>L!DyvuBqBc|Wzz|s6#z!-L?6FulenSW!-!_<+(l0IP zstg>XuC#m{GFsoSfu`I834~u3&Mb{Cjuw55MPFyp*IMxHRD3&&KK~NC$U^xmm#t27 zC_^3%8tjCA5^7M2G3dNhD@;O+0ah5+j5~!Gbhb2vq}EnQN;%4;T4Kq>gyv45%N458 zkOQucgbu1SjS1odDTYZL6GRhQkklR?4#Niqx76CnuolPDu!t7YP%S7Xs!a=gsSK}L z@IwO$gWFOD5*Z=pcfLb`g_Z_`Cr@gt?0#ecXM4A?Uyro>27O-mDt z6U*TrPUT!hZ}YNHXg{pL|K7tiKB^F0@-gN$xk-qVLs%zg-BI-Gg$;jf$|(7Wmn zNu&cbQKDp%Q_ZLD6>}Zi(HVJd`;Cd#L%^-o1co+)WyGC~!Rr*R#rn z?!NE|HvsW81V6n>pf?|Y7y<=9xbX#FpW^FV_4Pjh6g0e6*IaTFN81;EQrindIh;jq z6ox|9dcI%@*9%Bu>FLF%S87%RUAZ%BfrjOwAHd`&Qx_>y3x@&P_-hwpOUcFL%8mCY z-mA5>}vbEo0Op<};%<-=DB2VYVSzEtEqMXvVF#rcbG!Ns<(z-dsYp;+5ks0}H# zA>g{XquAD2sw0QK$*dFCA*aIb&#`&9O8>^)7amwjjij?DXSu`8a|{?xQE-4FNMj{c%oBJL7Vw0y9XvYfx25$me2Y;< zLBM4Q`G6^7cstOrfkyxV-*GD_a?>aTp-?*2x-FW4kEAjovosEGrBQ5BJ3?az*Jup; zqL>at+%FtrwFC6%fux~j0j9J?^^SzAY;ZNJjCzt}$baT;P??N0EIVQYPj z%kBBl(T}cuyg%=KHqSmwcd9yW!}#$wTVsNlIcyIuZ`snKa|O1> zJdH;~tFqu-QM>WT(xW3rPsK@K4?->FGm*8WVQIC;kZlT383Q0yg}UrUOu=Y(i;>|{ zE^7zBo>T+0Sr)xggprzWoHft6vR0{XBg#+>v}QNFZCAPs7L74sJW>rTR`uRZSxYs5 zXwb^KvQ7cM%1&07H>lQ3;z$Yqj)~%znC$z8df{1{k5gINJ!d3)n zXcL3yLHMjL1ui+lFT8q;B;z-t2~kIC183p$s1&S+>VO+v`0g&5nvuSXWi#w=MB9&z zl;R!KNf2MvqLiBFM%_1bwX~yt6LPZ$wvo||&!Dz|isw*nafsVwMT_!=T+($UjoMX1 z3@Aks+IlVI;7BGQ$QFyjt+op*@zP2SRAL2v($Xxv0RTc{^xwm#Z(%Zp$uuSu8oZ6U z?_+|oXO#m%E`MMsVf0t6)#*Xh0-}7nPbvKkzH*eC($J%!0<~Wa6*$VX-(DR83ZoWs6A*_>u3F-2&NuIUIDGM;cT_i}Xduy)j>4s;Zsk91*zv=;Pa5{#WpiAyrf#Wq 
zv2}Suso9mY6}@$NZ)cwEgmkT=FVD9A#?|v#SMU2L-aGN&*lO3wJQsxPS4Z$6+gWUG zf4A?QzC!b1rTK7?>!>iR!6N4`HnkV)TZ(O=qOZPGLt2ik6Gw}4-V6T4P%c`kC%&e4 z_T;_0^6V~62kYJn%!Cj`epC_Qz2cMTV1a>q)saJ_ z>RRmo3)X^O+L=esO7B2t>BpG-1d^>YWexVXy;?OIeH&-D8wk8Jz%?gYaQG8Uffa znG{qr1k|MgthWpB!3!^@RNHCozTI*onT*lfVGX@>7*`{gMJW!^8V$_IFDAf2#Ike- z$j}QX={yo-QxaSOO6eKt625o=ldG7ZTduho=*vmpf&@M?&49ZoA&k=i(ERM1+q;dAL41 z0=4woO7kV?#~?Hr!WaJmjQrsZ@`cG{Wxw>1_E9EpX zeDH|Ud{l8AEm@%nj238I>Raq9*|EfuhTSCx=A6Xo&1LRn=d;T+SbF!2(zZA6-K#i; z)?L`(8iPDrbe_*JR_7X5bLYbRg_0EuK)9iGDYh6Z*=Z4UYU9$L#XThlEjmekQ^`fM z9I0zwy0Ca*WxrC}Q*zTf4_v#Jyfo`WmTG9$k38L4_P^WlPDkF`t2jdI0s7*C$zwgo ztoa*ogq6CL%=_8*vJYPT<@FD*=X*{mou?K5>5>)O09&n{?;d;SSjkR{23sq&ihpOx zLF>TD)0V_ViySiGrdbbhas@}5;%Hm<(u!cMwYfZP$XCo7O9u6l154VRF}G>TSf$Mv zbJ~P42lF+C3t)NAPHWGauWpH3_l^t+)eAbQZYaE-Z1lpl-~0 zNM~0ect8mrD0#8$BY}oe4d(RaZ3L61ICiWDux^y8fsHlBgnY%ASlSp9OB-WiNgES$ zm1APrr){J$QVrJag5>{WAkXiSyyjrJB_UaW z2et2shS%;XsbW=+K2eVfl*Mnj859qyyPt?cCLN8*^dgF0E~$;Q*Uh1Umy;5p#sTfm zOdz&cfiEePL_;Q>#H1N#*jVjv49=apJVI>-y%tlO%T{U2cbb|Ke;&z|!LrI8)2!ri z>7U^Rm|Y?-LIUv{hWXk?n4P~Po_{0Uf0C2GCMSPOx_(Q}e{C``rr+DlkgyiU^aa6$ zBhI>oZ>~DpZ`(lDTiWxaq1d`F|I~$i%f&otEZMxwvrJJ_H6=4G3?2Ao>xZpcVa@7% f^ZIMo*RA;GtL|4VjB~|b=sd1;9{-wPiL(5E{qS9y literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/flashinfer_cutlass_moe.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/flashinfer_cutlass_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c3fe0d67a5048dc38e482e2887ca17a11d6f487 GIT binary patch literal 11265 zcmdTqTWlNGm3R1lzeGxuL`h@GFUqoHNq)tT#I9r6@k4H7ISB+K6N)pGDDjnFy=uKd?|qJ!N41*9JnC2~T(?MaH!^ zlQzT+3`rYlZlbw)+`?e3A!SY5#%&ZfrtE3QxP!u`lr!xbcTw1!a;H7x9tvAh-n4Jr zm-dhQ8Db!UO$gj)w7JKdc&k9BcUMTNzIodgc>5N3hY(8Gc;`*ic(Z|=A-wA*;oX8| zy1zPR1-ft4pz*MsRQ~vmzZfN+EhzMEf%gdx-YKDkbeba|mX6r(pOKdmb&2Z|UjUI3Jd3 zDDu3p*0;GfVZ6qOuYLU= z!*sYALCgrLehSZ@sWv=)Jtv5=#PUfg27~iQ*cTl@Hj|av*tJ+Pg^l}1=3jl$NV9_j 
zvqNk;FUjnLz{*)MKGmO3X5_={?nxMDcJTPTxsRP4IzE3a+Q)7<+Ip`OMG<_*#Qv8Y|d6R%jbJFGZ(VIn`tWixR>G4t|V4qlW3#Y{<1tmTo* zz~Di}Rv9*G6!A2RPR#I(NqCW9PyT0OKu+fda>?9YIVMgD^4{zDL$4g3IV23Ek`n{D zIe99Z8Mu~8r3cbkUPy7mbpbAMRvbvh<^)k1(C_*{11^II=+Dh5JCYK24P*)bLQx=w zfut6s?dOo)AkRI-)%TTi?-QG6&DL79wU%60o*NBzhiW0d5M*}eN48tGAD>=6zjXeQ zt*hj8e{}fP;UDLhKUn(Uk+Zwx46ivmi_XqZ@}Dl;UHGN&Mf&sfKVSLHyZ`j=>dvvE zbL@AOfmH+JZYj02E?-}|etUYwyfX3W)ZM9-ag{LphDt-Ptqq+n4xL^d9QoCRYBspf zG0%-AAj1JJtV(T_k>31pn%$+%&IGTOy6f^k|GHb->ju{Pd!XLWdx0(=Jbrir@HD{_geL_3wr}poH&f+}G&`VG zi$={>aKcDw_3>ets|nPgv7w37wgtX@3w&e?e8(2}&Moj=ppJ39>=yOfTDQHhw)=&( zJuj@?{=(W9@4yZ?OO(DuC%+T8+q_7wGvh6nNp!d3-z5GNdp0j~n!s_rkJbYhr{;rV z6>^*y%gj&!MYv*>D;NMM#sang$LEv?eG5WP5`-Bpkvq&yq_XiDPKw7;LL3EMC7A%A zYW@)bKVd-8S8B*Ddx88dDE5M(s$xJzVUN1RGqSGS8{RPjs*y<6b8y=#H{l0N!B`-i zZa5T7P|`;!fXd^*_|#It0ww+K78oe$YbaQuq#F+f8$7Tr2A0P`zusv-_^!v6GdA|rMJPDE)XvY2hdEP`1#W<8Ka9g0a3QV9*+f#^n^ z^!AFKM&7y>tz-KY#*gCU(y)Yta_I6Zy)v zv*c}G^X@8ocYSu|msi#foGl(W`!Cy9_gpG^FR8=`@))G}@pU6WtAMeQaMz;gZ(OC8 z&IYg6Za{iU%@KGxq3zx`m_^gS*`gxK9MKI~T(>TvfL0|vlG6ax9>{5|sY{KHWAzBX zY>$NDGP%c$MvaOgJ)?vx*89o0oV-R)!bnz3$7IEJS%$L`6Zw0L*b9_MNHibGS~zMaeLQXyNa$|Yp#7o*S;q%Pi;{(8+{JSf6BWJY=Z$b zw&4X`Cg7eJMn~=uaS)2NbT*Zbu>{ry+HPDrmYd(Pna~E!-o~Nu#@-+$Z}8B?;^n{kZzDC-kp!VyH{*jug`k`EEABH#)M(P z#xo0cl$>~}>Jx96*$wRQLv4dACYlRI-Z&5Yp3%l~fRwct90?=nFjK4tCZlcvik?&1 zbS^K0u?|j$G!@GU5({^M7G)s~-bPHEW2v}Bf0qSa2hAlmtGOxiY?hrA(rHN_51UqR z44cf*p{N0jja6E!Bf9I7LKlG*GTHnj%${YZvMGK86)YJD%{(eJ<<7Jvo>vC8_y+Nht8T0THbbgg*3iv9cxK{Ti+&h!j7$;xmuvZ7Qw%4k-GzHof9A`fiB9j{qJMY`aSIE;2cwBx`h(=Z&IFMW9|<##L& zn?ko$cTj;cxV;lFJ}!)2&?*g4M#Q@%?ttuTgrWu!E}f?RqfV+$#3P6}irF#Dj$`&J zX0Ks(0y4#+uXC^l9#UKy>p1P!p~S(hFDFey`Sh;kR`cDPp#HhU^AW50jSg z9sljO79FKf>!RanxOL_Awe90%3G@~Z!fyQ zbM&|0Iro`s)q7yoc0dz-7LC?AFIjP+AS{bsNwLvVl9v=mb*mIdRcR^q*x>t`Nvqgm zLuJURFJN$3d>uI!iRE&sIZlL0A>dY)?AZ+1Z6JH%wQ(FjG^e67VM5W#GZ1+uX=cI4 zkbw1?}`4RjlXzG)b6cUguC-+^bTf*fhP@4me|#p_keA{ zl}W?K(~_~l+1YS@WR#FJ6$}N-H2N@gsb&XJg4QK{2hD}1LJ+7xL%l8)0tE-upwV5I 
z0NddE4Pf`xHH0EyhZsIcg9yLK$yp2p<^{Cy`)7nX zX?OHE)j#vcW(PHKNo}#&vh36>4-Cx7Tp2 z{?v%*Z;9ykiSIzBFr0*vsbW@ad}~IAnQGhlmQBZ|pH}=3x=SXoe{PU(wiEktun$5# zYr$wS7%fG5*CP9hk$un1#$Aq8TMvW^d~K^8hl}3BDuGaR$=kL#znETazqs0adad_d zvG?5L-V3YI(N))_b&OT7LPHmE``28NqAT)=d5s+|vcr#D!=+$&EqLHj@Bj?y4=sC^ zJSAUn^GmgxBc8E8Zn%iWd-p7oqmyTlO)YItRwdlEG^xX4pez)TmL#@H6g$ls0QSoXfau%SGk!To$YnfPD z;5fy>!52p^XNvZ-|80iabz`NS>SN5Im9cdKFV$i&2Uqs5 z6L_gM#vFYZSQ|W396W=yQPt9IKKRt>QO)oMs$E{yijWNpam^&y zx_oTu*vj@|(@wR8;=<*@R)WRA4z-oy+DLQz^3|oQE3X$ryVZ7zi)x=-&%uYXu9Y5Nf%)R4_wpW97#&_t7hcwBR(5g0p~;D+7EZFhTU+szsb0nC_R| z3Wz+JA-c)X*g1S}MN3np0dur~?>V+skRM>nIxUH7zh|bRK?zlYzZX&7#KiOT!?%d* z)A)(9ChFRH0!qbJv%oh2h*{{Cl7vjI)uNgXZL1n9T;)d`@O2kH~-zGATNiLLkHj<0NopTNi4=AO0YXt6np;<+jO+zMY%*Ui?D<;KNl0ph}X z_<)Q5p=~99YsuGAYU(Jp@5Ow_$>$-b%c2se*`ggI7G}~cE^22{iv-!c!FSe=%N?kB z`QuXXKG1(B^6ke3j)kk>E_mRGmHlr0(D>lc_!=G>W6hy4jfx9EALyLwiD3azzr4Rf z-1o7i|4-cMnvXmixh8%H!%BFHYB!$3+U_?u-hJ^Vbb;T!5h)1(?ELl(JHPWQ+bi`u zA8apG{-3+8cmy2u2!#LGBM>!@h;PEIU;i(p(UGV_D~f2%jgC~cb`=_lc4!~LhVkbN z{TqMI(69ZRp~euW$~`7>+RscB&&Kxfs|@=42gR->ir^ zXeD|vyN;QJ83t;Uu8J)Ct{|o%`lZ=1(v=KYL|E1Y+8nk=yNSvcdd#$MHuUoxen?bo z$LIzLuZl}hL`zNbg0xnR48yG32;=@Qa^l~~iQkf*-;(p+7z~Wz8w+H=w-_12GY4^m zZol)`-g(2KTEdJE1QM!4)rjzp-cPUIy{f^7j(t)1yg=cnX8XDs>YpEc-@rKT7}ml^ zis2*A36|7rX3&c8IkD8qmUi}*qW#ZWwlNlcSb(3}+#k7afz}5Yl)l|VOI9q|Dr(%0 zC9pz*AEj=kR40~P1QfIzArJ8cR4+ncIt5ifLIKhmQJWA75`QyV6?f#KuSX3bs+sic zQjrWbOf1eHUH$OtdMlRRV(MruH3e(_!_T%wYe)xJCmrA#(y>$}9ZOZxv80iXkVZN} z8tDjWq$5;EI-)ev5egHJUu{LG4WV{~A_#RL)QM0RLM-@LDiW@CmlylI2TD|1{2%0L BApHOU literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/flashinfer_cutlass_prepare_finalize.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/flashinfer_cutlass_prepare_finalize.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..878d05b76e258df064f5d71c6ac8be070a96183f GIT binary patch literal 13006 zcmb_i>u(#`b)VsL$Qiyw%915nq9v^@%C_W(Y_Hb4j-T>d>)rM4W)(LC%^BI0_}UrD 
zwiK#$3Zs#e6t)8t_M!!%GzDVU3&be$VgG`nK|hpSdrM~Epf0jNk#F3{iw5~n^mp!X zNQ$Oh1?jbQdG4M2zW1E-JLlei^?2MILelnw#6OL4+;1phhW0A4@)^%@_c)o8`6M@G z{PHPF+`@AyD~oL`wof^D`f5o!Q^J&xa!t8d+M0By#3_-{wxpEuOnDe>PkK|nDIcR9 zNq;IZ6=1Y8*_H}U1sN?QL#g(ucF?Yy+|&jOcY>4MH#u2U95W;4K5$xY(>~S5M$!F>Iz39yv>tt6u#Q>Da$5>2GDbJSc<`s#FcEP5%KiO)v0cq~bua*CQ% zVpo;8x%l*J!5{7M$^@c&Tn;DG2#>12GgP7}%-EYUkTlD`$35gHBUaUcJYB@7FRQxMk}bs(z6e_6>9YlnbBM-JDN>ocjRI!s_eKvw}1Tg z+5O6BGI41%JD1U8-@|WpVo!w2KP+teIkZS;`XAr zy(FF~if5jP=L)uSsvnf@`xhadxPg&XFYLhLBO|~j_*{0a0 z9ZTFaFWYZ=rW}e>b|}IOb=?%EEwb~b&0O9oH_F*%VZq69_~OFB6_F#&&{&&7t&|(3 zyi^Z0M9@CaHqa7iKNwn?>2Iovr0iWI-?v7-A2V~v0$3%0cH5S?>-<#kJuVW|w@lDZ zoT8mLYHS2f`gvn3aIPn+o3U_^m?RvhoeG0T!u4ulR0vl$FC0tDVRHk+nXID5a*0e@ z8_^xudK@R+m!3;S<%E`v<>FUxCiEWmK~b`rqRd98v#&R@M%>mgdEy1e4^trih6hK~ zKHjqBxxdEAH@AZ4@|HXot?@^mM~dpQAah97TFzUMs&UOcr%EUkap-n!4&&E_DEKpx zi$-;S)6kkUkDM#AG$+Ta&u}rUK1>6i)gmU`#>}VP#8q|}CNgR&meYmzayTpywGVl<7j-r2In0(r55#JvNg$*6!t3i?oR?}wsBID% zH@Gi-fs${y=o>Ei_7r`43c?xVA*H^CsBxSfso3=>oeE>~ zqc|^2UM?J5=V;fczXScO&=I-8m3=`&m^E$^zN~wUZMjT#7Drf4#39$^+e` zUaqg-ngIDSFHZLgSm4$ul99f^7ZF>ul5eo+ z8!QNeFAtQ)#Ey7%hxv$Qa-#JH2QqUxsO@M%i>ge+%j-UfFV!T5Vth~I3$bD8me?*` zVyk?OAqhAY%_ejo1XkXrp3GZknhy-V z$^4^g?xHv5gSA>K!k2f%_XC{C703d?CeJzOxmb0063MM=3PXQ7}af5S)&<|Xbx9}(WdCD z2C{H$ou~Dct&F2j?F^zD+*f{18Y&07@4a>Rt>wUz;K0(ca`|jJzM}BtlCnukTc6=d?R7AAFPS`_>I)-K^A|X;_=~zmE zB6CN9!Q?q0&!{d%38E&eU!2jXoQWgbb}5rd>Q*3ArkivjtCBX)&8q}Q)B%dNP=pi2 z>2_ee_!aea%GgE`(|KgG=#C4}iWyaXgA$&q0*oukWHbuF--lc#P!Ab;kmR&VQ73)) z9MSJ@a=)^jthl-G=JJl+U%4dvDgL=XRIw9*CFt5zaWYciwr=~HeXjIy4sXT6+sBr7 zmj(_N2M+#@Lt@2hW>*}%J@g>>Aos9q1z$oey?p#s_qN~NUU5;%jZsxZq9o4k{cjH_ zwl8dwam=q42(eOFIgiud5}=0G0;qtD&Ka}5(F%)Y=NkFK8u>23SN9sabJHInLJPBe2(c|Jp6wkfod*^Tgc-UK}nr%Db2LjrhW z$z(2r2nuLAqx(SZrc}zXp^e#-M%9C|aur|$CbYRc;Qvkd!n7X=)<)F3v%cJiU$rka z*n5^*B3Zc#_TJyH+*jB=`BZqjylFGR-7fHDc#x=`vgo^g;QI%j4UGQuU}@lZao~7q z;Qivj`%A~Zkir!!Cv_M+xmqXaA^{BLI*C0^cN+wzo1HYYg~UvIaumQJpoTG^c;lwf z!qbMW`~v*U;80cdBCi&VP0u?X(ZV6XZsA%1ZS5oESIZbBjaWOG16LXWZ8=*V3*?%{ 
zuUQno$Zc^nkV^=vDYPDw#C^&?s;d8su(TG1MOXUUd45rlIZz+v$g*L=Vu zTmJ?8sIJYTm@_?bv=S{OOy0fbJgs0LF>lK|Td&op)<@NKxCnN{fv?Sf(Yk1%(dUJ{ z3yg+#rtOd!w#kRQ-VW$T{u;wmqg_iZEF=|~Db)zi%({q&DK~=w$ox}Xh;&zMSM)Ng zrHjT%VfaH2l92!_#jqI+|Bx04;@LS;=-KG3Mk_@8Y6inPb5@Q0 zMuYX$tXr)xh32%c5Z&M^AQcQTnr5}VtX;Poe^Wj&{#cZWcSZ!AI?4)ec^HTx83#LvEKct z(#CacykmtCeAo5D!M!&2WN>dm+FTCx+&g#o-16opp~0o&_z>7p4t177L&eZgITTsl zVGnvJhc=c&eMIcqTiW$*ao4*q+QmTIS60pgqx!;APv3IqgEv15ewL{8a{l0*Q}=Vr z-+YiS_zo0=0|wf67&d_(H2h#yiBiqnP(}DKimSFpl(22Q#@|>sqM1zjXAoFg49>>axUL( z>6TPzA2MA!>H%~T@vCp*M}3Q;gA{#(qC*rNrieh3dX%DL6w#?-uwD;T9e?jC+FUYs z0j>h|B0lQD3orY+RfVb@r)UCEBZ@ysnG~I3A0XQqia*?l;y*A^{0DUu57I>OAbWP& z->l(m&3)?r*;*-IXeK(CDsr;Kvs~xma;hN-#mbi&AbA~kuLy7yVm&TW`JMB2qZ-y&_6?lSyKSu_X22(xMXVGvww~JKX#-vjsV6$ zhGA2XJ+d2szYXK?Fu?DH@($@ktAOQ+SPt&ymWY2{z4A-vrR%BnReU|?5<1zv`%0$OG2h2Y$u3|Th-Otw-o%T^`4Y6YPwIAd530U{qD=QcwCGkjk;1hLT|lk^)fcEt z4Oy%2QSyC?K0u^9;GR>`vPvgc7s({O%y|8(0i@~3tREn&-=uawVoe#%M3V^(w-rW0 zky~S8rY4(k!7C)rNEHy%PHG=Q^uH1iJF0;Ae!kSVzu33`v;Cj<9VrLf@8s@ZSbqJ% z#X@krAdNrw1xvo2pZj(`+qAFjZ!7zIO8zZH{}x6J75zhu*je=Ntmh1m{CwNPZOa+J zwz=P*3))JfXaO12w(K?Bdq_x`tYMK z*Hve9f6>3qr~(9k$Msz7D2W3_abWq{=iy&pO8*fApmD8yK&ztp)5>>zI!~ z-YP&%GKWdLe+1EZ#h!QMYW)4EN@5b1HsaLX2{@)S+^5AAm4vvV(HKMcjC%_*_j0&Ls4|7j>a7D3Yc#7W z5H?)21p5}&ueMAG{2Nr$$OOe*WgF-3Ds=69y8F~q-|2#Iy5KuqJ!;STM}9g|>OWNM zKLp_WAn=RPC%ccrLo;xgU*RkU>~*XKdwtEY_Yk^!HR4sLsR&_Q^)f|kW7-6Lrs(hS zr~MBiU|OKUs$1G&KW_k9BET@@mS&^@Xc_6>*8pfiqnPP9rc@o!Qpy0dL^T0g$}#{g zQ3jwTsu9pqI?MYS04->AX98ME)d4M~>VTF~2B0OX5ztcF0JNYCn88-%3@r`W_BwxE zw_MiLU6jG>D`rqJ@IhP8WO%$$hm(VW@!#Kp$MEcm$Ac;^#V@*aH?OrVH#Z6buF=T$Hn=@t_d-C>HxO>sNhNidSmW>=) zB;bYChr*zMJ3N>6gE}rSutzI|oEKU- z%8}b+wA2{O_D>zFaF1m%fZ5SaO;bRBKFlrLl#L|KticV?x^`e)wF4MsE6netyGwOP zZouq4AE>QMAiWt_@CXX$(&?+XJ5c_xtq$Jnmjc%^>MUNj z#FcP5lMBbL#uCZcr6k^$vFou31Mnm<>PVa3S9f8Ikhgp!r{?f_Dr)-r)bF62{u*x3 z=I}7e%#GH%U@ub=?6s7v+vug1?#9QY%w9|BRy;3IFQEz6wZ0=o1E=R#wWhAtUR_;F z;PDo2y3vUNb@a|ENNMOU$L{iU+tq|}jd?j$2F>bNEZCw8-CrNC(SXV@S)1XcA;F5Mh`lV+s*4yZCWFy 
ztoe9qgrVg=q_I%+HU6~w45EGA$=S!Bi(Mu0P*FVeR6K$US-Ybw?8Sww-BCbV;C#I$ zZ-3F-f8#{iyQ$2BoaQ6v6X>hO@94u@*QV0${2_C(1_CLj~&)T+?B@cY9XtX2^7NtQ(Y%faN z>pA^ffdqr@Th1kZX$-R7(_WH>inxxZn&=z@F)Gj~Rz!!#y~LxKzq91qR`j6*47Nov zzkA6BO@fZ=(Ct}p^s((paPO^Sn7==?WW#*d(HUCVczg5552K8e9qt%OjrUviOR5idP#k;eXcP=e`qbS=#wUF3`P_MQR*M+?b z?iSFEnnx6|sox`di6W*_X{NZHTciw<5$c~K(z{!5AKk(+kLX`fvA>|`1|mFzI72jB zw_1~oL0E&$uOQSom8R$zctIolc!T@e`+qTngTt#( z-vlvS^beQ4LD&uzA1Ccx;RMN1;RL(G*j&qfh(MhIv<{%OXIB?9w{*C91wZh~(0N2u{F+ORz!@V3Ou34vdU-Oqw=e75gN@JK;A@@4SAXA=+- z1?lLQ;-0dvy)bqj_b{~n5Nd4wBm6|fIx#U3>1R%3eGk3x9a;Utcf@$%J5qb$OW)^` zF}3;*TKBFluRDwix~o=$6=|qqxTO88Gu(OQ)Wjvcgfi@Wm6k$3*!*pz#m%!eatSY2 zqbz`ANQXr^fWx`$bA`tZ71n9C)SnsLwA z&ML5sk@`_!Dwa7w^e*!WX4ScWxc;h_y+WxUJKaXU4#Q$PhUut>C}OIRu|J(AY!ovZR1#vUF!saj~SE^M!6F zZzU@tOG>6&-@}qnHol6Fk$$eNy%Jz#8yDPC2{JOo4QyRuy;r_%3Gog#2K?5pEkI)g zXvImCfJRnaMD2#bf^mLy9 literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/flashinfer_trtllm_moe.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/flashinfer_trtllm_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec4469a3b1bd083963442f7d8abd35df29316731 GIT binary patch literal 6560 zcmeHKTW=fJ5kAX1xy$=altfXKD9V&&De^tF;@q4x2^=GCf*??XE@ov2?hfMF2(hS+W8D2e@CT%m3rXbTW&(G5Q#{fLRRqUP`J8d#lazV zD$crV#l^5oao0U79){hDx9(f*Jej*+EpNs`Y;qm?t_k0XIk2;7PkjCYR9EAuK`ulG`*NZmpqYC#`_J+&S zIPzb`&f3F?NAHT18)BavV$Tn;PYtn853v`9*k>Thk7HjE#*uxqa)R;h0Pt>D+z&`4 zh+DaO`=a#EV8a&iiG>&G*8{s!bzyf|`+D=pkXc&@~m(O)&&daY4? 
zRo2&<(i@jrG8i-UrYvsAYC~4UciQEKUVC2_YmHW0$HCRIvf5V4x-9B71w?a0ZfK%Z ztK(oxRxOKIRh#XW^|@GSoy0+@rpl|jsLIuvro)S?ZB1|1#b)cDxEw<0(`E?1!u5zs zvs%+NQ|Rj#YGuvjE}DV&mX3(;$+ha5ZYFFjYOpL^NK+b{K&MCvlqr!)JImMhVdzuaMtzhEH-1XkN+nKFlL=8wYmZ2uW_qU3Cmt zypqTMM(>e^5W``FBrleh8xFQ2@-{p-W1Dd!z77Rx|8=8hSoiNAff;(H&4hy(QE9wJ zB%eV(~_nA7tDJfJ_)X$fO~F zOc_CtX(Ix1M8f$qMi}I%H1;X?NA7h}%AQlv8KCA&MJa1*wMGT$tLlnUN6w19y<^mZ zGjRv|mOGgG?`(mn5%~HJYzR@&OBF}W;958hzn7t`CYxT?l+K?yX)El%zss)`MfnM5|)#1kv!RlTX2;eE$i z#SY1gw>9AWTDwwF;1lZrzIdIK+-CA!2-gCfT18v1!>Eqq3Xr!Zr<;M*CS;LnyV=%C zNjq99k1q)*f+#{L!YCpr&?Pm7V%Hrti7GB%O{2iIsYpxqd#1B2zGJ$|qE<0IC`Dbv z4<;XV{ky!V*ISFNTI-NrR;#jp=wkcClP5P$$cswt!eZ+ZWW>fII=5JFN{|ZWi}Gq4 zBDAQKFUhL5Sb>m8BJg6-cDv6Wur9PNnbQL!yRTccvJw`sOg#oGL|3)*z@;tnph(BWVBFRqCSwNH|+@5<7S+XaYVj^Y9bRNJJqT>l>*Lp9t z^>ut2jvKx4*U5G6+;s=m3UzhO49NACeo0hn8*(Z7vrt+?Qar?5^(a`)13!XTaG;@x zFT?9E!OafB7P(&}Li83tw!@EIefB0l(Q!?v$6=VUu;enmPzW34y39bO-B>mKIE`4V zw-nQ*m#dnZ#|7}LAFvXUI*k*|pg9RC1^R>~Yd~XnN6jCqYX6dNBa zU4B-y0<389tc(e)bOl-I3bE1^W~D2_N>`MXu9y*m8RD#HC0J2RvSOCfF>SKTaI9nY z+;>SFW<(=k_-|k&;DUM5MzmM-3=WYpT}9VImwj21pW%7tbMj9tAQ)J%fU6E=%vwiwvfB+;A(s~dDM7r1Y2`$~_! 
zHnuFbB3z{zx1zYG)SH~hR-$38dgu@{gLRTi1L3tx#6!`=LU9hJs=}?!0JXOWog?$h zorAyJNzeZIe8-jO4wQ`Vv&Xx>WY-t}Ya(+iF}stP?T+W~``r_s9oJ~bmA)E%5F+mL z+;{x3A4sZ~71i zM`R^=L{^eVWF>h-R+2{qN&X3%MPwy;M3Cf<(mbLmGV=^arvctCkj&w(khvw~c7z;M zK4I$G3tyhP?{s7WbOzs@!Y@B``%AY{_08iN_`SUDgHAPO2d5i z2+ml98ECS^#}mdX0I$JhU%=fj0CRQ$c=aYf*Ky5R1pwJqa;s0lLp=?`6zXl(#EQ@z zY3kExe+EQ}Q+@bITSxF9Yyb9kNDEenw6MFp1?c}OwW^AdH|KuT7p$RVN8LRAm=AjY zPs@6(*?6c46|G5FGC1)QCxQ0g`kM8yDfYG1^&o;O%d zG@bBg(R3;BN7r<-B*h9WtH!4DjdIl-X~?pqiIv(#Y{7N4Cdpz`m1Na)U1&BHcGy+i zI~B8@wMDG-`<5ZtZACtP{E$P9Nnw-9+Zf+3mB`3HWv_ePw=op1J{35Ag**FfJR^i<*x_Q&)s- zf%;Jkkg>_n=RcdLJZb`oBtJGjGH4LB5E;#VKK|J_4WkwznQnh*@S5dQFMa^~p?ILQHi-^!gie3!32in~Q!i{!?rLM5p4IS?W!XKAD zD%}dq?*!&M{)GpAH|L)ak|B?1icA&hG@=5I&LAqn?@@G>9_W*xgxZRO#URNY+$90v zy)gj8Knzf`V}P0+1Jvvopk~JaH9H2V*)c%PiUA@k28iq!pk~JaH7f>)tQa7&Vt~ks q0U|2~h^!bOvSR?+-NTG2C%VG1ZYPGQ{|5mQ`~m!k1VvKf$COBYELsw2{V2_+X*Y?bY9gD0p3`l@c3sb)IqbUJXPG}(4$ zch9*G4*;PDQSN8j?(_}b#X0AG-@W(Ud+s^s-uL?kgO-A8M*hxJAWBhxj)-vedoIdK)lRgSA97)myyp4E(N2rQq`&g#Z>1Xj%G zXAR?qS>w2o;K~`(ta;o#YZhWp`6|A1_ z6$)7V4`_nU>HZ=vKmlz9x~>9UUx99@KsQ#Pn<~)F73dbA4+y0${s*iT=(Y-Udj)z` z1-hdG-C2QNU4dRBPM>TFxIQm`KmS1Ebro8#QYmU7Rk2q6q^(_7A)WdP^o9!be`&p@ z5y~mnYupv+6UtzG{KTf3=3t$)&9&|D#%N@GE#-qBJaoqz4U=97*! 
z|I&Kd&OmEPFKf_Hfqj8(LCN&~;Y7yk<(JDymxPk!=ef-QjoxOc%_sZMvp^B*YY zD?kCh?nIt7Fc_>*qZMkBuOU)*llH(6)Ke#U`aSUUd$)M{t}9aS{>7S+g>W$7KI@N6 zoQMAtbBEm*f^;Z2)BPhX@0eG@*+>xZIkn&qMGnq|Ca2B;*%hI$_|63*zPb6x)U1ET zH#-;fP4K=_uJ+8_{IOSpQ|Hb{hC_iDgMo#K;Gy75@SHyqU3RiSq^XUBqIb*HmbJA;L9;=4i+s3Hhh+g(v(o!LToU-ajAY97X?9 zIy`~t96h%%?>kG+`2!RFaKtO+q>&j;8KI{lb0JO>1|Sgh`7ac}vllq6Kn({=;Mo$m z80KtAn6u$hdWci;nHPWyoa(vbCl4O;oe+RA0XW9#C+241S%bdF6?h^wG@0+xZvSph zlZSc+INN-X_W36wzR28#V2H1ziCUr0!M_G>d`BLBLxJ3GL?ZZ2O9iO-ACNGP_w)2< zEmK2xFbYP-=oTUFA_QYh0pOPd)Xz!zH=?5}l9N?cL zK=Dr!bweKJe~hOKb!<@%d7&oUTZB{;Le`grR0?%k{AcV;)uc31AWXx`ru_OqUT9fQ zL?`$LBqb^91#0o1krsP|5dRPrm%Kg#?b1{dSFFt`sBzL;gj!~5%W7E4G%*g+;;>%3 z0ByYpy`;32qRm_6A>q^fIcXt}$I{%CUJLZ<_L4M>#b*^DrkQD(wg_19pK%K9!FWDU z2>mGP34IHC^j8W!`i(-5z9>9hw3TUPY@{!5E6H1=&aO5v+Ze6z4s7nR(JlJR)jEi) z7vc(Kh4;-SQ_!vs#&fkE=}&IGsE6^Krl8FR3T;+hXfqq*d8*i+t8UEc6Xf?) z>3Wr8G!DS$gl-43lYG_>vxs6Yaz04kl5mOuP6sfN$@~)&UJZ?JEvNL;{?NG~C!c{? zjZ-WJ>A5hcnglG;*UKsLo)dwoH#k{fE&?;&oVN{P`Q{^Zm{XqR|0TybP4F^IDG}dP zfYbT;DHZ)V-Ps~SV<)0b=$O&LcZIg1V?)P|P8B$ua&B@m?3>O1!=y*>vms82kK~)B z9T?P!PBl8n|IsdVa0;XA(5XkK0i1j)7dU-MQQ#r-7^g2OZkSW^S%o3fJRP2AVq#%- zVa6YsqdCb1PU7bzXF1gcUpV5YBl*Hz2#4DsGxyV7fBX85$n5-%`KkFXNCViSuFDI% z26tcB72Gj1b#}-6mB{(I(2h4|W@dNH&IN)qzToBH!~*2MW5$0aNQZY!A|Hl4R2~v| z8K88}U*S~90L{)`pkb<{u!_POn9P@`M@J=;(R}UI-5H+M~~{YqpWv7Tf2Yx_{U1h=-JpD z_`ak1`ni>JiJ3LWdib8>;O`FKj@%w)9miv%7}B|B|MtZ-C+i(thh#?B+L74tLX@;) z`|>DA%%uce3tb*cc=NDek>;W zq0WBaRrl_;8{3kW+GJ=tKb=G>oGvGbIrLv z#P%F!n+~U4BXQM+x%TVyyKmfhBk5nYe2rN>_ig{WeeE3Ey`OE|f7^M-{@*X&ak9gw z*v3<=`P7OIQfl;@q$Ol!c-MN21W@feLljA#=B ztm*{_6>Wxzv2407Vbz;9PufMsAld>GDDf>}nhK0O<0*@0Boi$>L$oyl6CQstEx^#I zU}>OcG>o1xFc!wj*pLl{soBkV7$-1M)l3aj&olt*E@xa!Ei3`@3zKLm6D0n;g&K;s z0Sj9GlqR7}Oj}uL&4N#~ooPnDl4mRrtHfGFhBN9dVPnMD>vmwQydOvfmQO-mp}aDF zqV`H?H`Iw&PUc~{6<{)0bBZtw^FeP1ry!Foryz4IrN8O%}#LwGDnbEv2y%4BSn>jeJ-W#Mn@SYvNr1 zvT8f|IkM{7hleH2Hs#W>N5>_U$#QLKc`6}E>`EJ(mqx`)wrjorcP;C?*uI0er*MKd zoFvcv3&Zh9e015DRISQ?)xWA@-8_?PhZY&|Z*TO%y|w^}aBm?ji56v8*d8!Wld4!uuu 
zWh=pjtR$b7E#77o<1UP*tvHhL3o`=YXxs*Gdn1k`Fov`-gw2n785mD4yp}Hu3RTDj zV_P%Rz6gugLW&)cqOTjqNbe@!?J#CHm5kEuCF3$KY?_$Dc*}U1;%Fv;QLu@z2%{U6 z##(^Au(X8IV%T zdpq1pxYclL;MT&ehuZ+R32rmoR=90&+u^Q)+X1%|?rOMe;O>Om1$Qmnb#T|i-2itZ z+-|r%E0SMReu*OEy<8aYIl2GxRQN~OtA*8LQ+$qJ3k7~_aO?X>Z!u0Wg`w!Z;B<{RC5(W2%JZ)aFTM22%SgxWoSqhA^09UXxH3+jYZ= zA=z`!+`4q+J}fhKt?WwGcCKDwYj>w@gG+ZS2m^Gl;YRN1l)Px8o`mTehg9YZl)847vm zZN+6J+p#a}YF*W>O|$I>Sl5Bru?O<%9JNgWiwi?e2LDjvoC5wKqQ;n0A*6;WHK#!c zCfBN*4k10IvE>X184)reWTrIQoCP5(rLpI12-y*;LdZdBEIB7aE=;c$p*n=>5o$oF z5g|819)y|@YDTC9p;m;pVQy^*wIk$3r~{$x2z4UVg-|y_I}qB5P!B@=l*XKU2BF;u z4I(sz&>n>LYJ56GVbwvrQ8J^j8*{eI9PD0%e_hJQDcP=MV7$CNJY~qhA=a_fDGkAQ42tF z0$Bl45Xi>JMB^)kGDNW4@lQF{byja;Zh z5tGOVl=uunyr_avzIBLEPK!?0gbjKId_^W}!4Bm~xnTS--)hQfoEOTjfGPmiPr?Ex zs$>*zf$m4NENYTg?NN$REU5y@uTet&&?6LL&$%gt@~y!qJ!80|f^RWbqEKQ8A$fAWR1C3o8l~lsZkdaCh^^BoEr-B-BI8-glp2<#0cA+e82E7k ze2WwbwhT}NW)~RUUsLizmw@K$vM)&pi>e)?Ux#o%02VO`UPc4mZsEjFJxPDs+8$GZo*?#Y;@rD4H)dA-cN=!(C@HXS zW@p?K>q%+*9?Aes>2_pWJKuZ#=IbRC!V7MSy0+^(dq3QJyXS7-A@NiWJ(kKLUxfSC_4E#f}Qj^3jbN_qz=@8dfFgnhvB_!j_LhksQvL7FsOt@b#0e zs=NM~mCq!H(^cEhqY}2P-6+H)X*`hEBwAQ`V~+A7ORLGr;C}=D74VNus?Dho0+!a5 z(;&pNv&%%D;u%?lfRWYaYzP7SYR)+ksz#^=Az)+;xmtvPebru5FRSC0cXYW% zqyQURq}-+Jj1*vNi^r9+#LJF|IMamAO z@N6(by~q&vAp{JuE%yvU0|@Ox2v}oVZV(}0jcvI-2<@kgH7SF8S(3c4=D8WlJ&Q!( zk~dR_B}hI5%y3oiFoBU_b^^nUbYz(8K0ZQV5Up9Np~u4S(J7=5ws`_x=)#2(RlFYF zfmi$s!0iSpunVWgr2)yfEFcZa12VYfa4UlHNp*~xlmwKY*N-cL%76-NuY~w~D#V?X z2Gk{%eHsWS4`|Sm4t%tqmyc_KqMODMt?Vj<(m^N##)BAofQ}sDW}gV$@~HxbrmSb#XP%@kVAZ(7|~>9R%yNk*N@|A6xvs7H5TFHyxY| z(!tO~&^^xw+A*0A;+~lChr{m3d4B{#&C%!lArMU2KJ32i-Yn1o!m)0*`^@<%NPgOS>!01a*Qc3xwLI99WQa+4JF3|Q+zQSS)bVQV|kk1wBmSoJU1m0l* ztPD7{543Mn5ucBr-M;|w>1A*@b6I(P5Jw&k&P1m372cq7RanjifT(w}1IXn@W zdcz-?nhTAPuEiPiPj{FYZiH`1=xY!yjQQtPEfz2k%*XNZy=!rMc@h5jd91s^!)9_z@<)04JXJ^5Sz79@lk)4}g6Cav6 zVwYX)sa)_1QGSBOM{&jyl{MKirY6?Zlu|eGLqFD1=|x%$JMVSS@$vQlc;T#(x%msw za|2ToU~fvyaTYr(m+SG+Kf@y7OF@eCcuBBSq$E#QgI|K+#^vqT&l7+?C;8I zsf9JQq|_~smkpBT%a*St$t~&F2>o@DB5;t)fo{Y{^v#9x 
zmc)8uS)8|Fo>1fS-qXKiw`s#eZO9?D*aUWFIJn zxYOH<{sJWQ?r2p>dYyuA@t>)BQtU#h9gHI&#e8YW7heMN8Vf3uqfRig(J{`wkQ@VR z6G#g=m$FMJi3|LRth^d(NTff%^;O39);93lS(X5xv=c03pEmuPkb<#AYeHa&9j%F= zISILl8IWkskY~RTs`w97D^^n0~#QEZE)8^K71`am1rp-Ew3DMc+0|=r-wvR zQekWIvqS`qYRDzF#|v~E)CX#zufc8%Bb1QX&rd2~MWY#`FF{-)ydzS-#XV1&<#2H^ z7n-@^o|>JX3C;#X5!m(vydW%J<}d8)xh18WAvI1J2)+SK4H_4NRF7tW=9Ti=q^kUc zijyqA=J8F;FL>SbuVFEw7+Vg;wLThqJD-{YETAUN)A-_XGFTwdAD|a<`J9fQbP4ao zuUhoLB?xfY85*CBV0ugNB&DfQ!Dl zOJQhu1==36;ms9AX0a2+IpE96QSFE~}@7tM>F$~xpFf3+z`Wr z)bhwil_yix#a4C2Mjn{#*WO%yGizu_$}{dh*4?)@p3)!4TC1~Qaxl0u2*$@|ZA^}q zvDFz}%UxYd*0n9;8e&~TS+ghEpJ^Uon+Mjl@I1A~*fFq|b~V0Rd!sh_>F+AOXZXl) z=j;!rzCU%Z=2IXGwKZmJ?X0amW!st3?*unnQy)9J;cQ>+|JLBGLDtzH(`Ow`sn!A3 zF%Z)|uz6QIGab*e9nYq12V&|6H4O=7_0yS}!Pt@e_L}QQR*odjuX@>rzIDqz`<{&b zAZtIEwjYiizHh6}*tW5@ZOMiAqBo=8yqdP{iXDQrR{Y}ISF+~nthF{V!&*CH2eYl- zl)gSO!Rni`R(sqRU%39}%9~$~rmU^WAZzV}5T@FUsf{(YrA&R$qA6Qj*4dnK_TP2( zXWMqHnbumrv;D*EzuuK@_~}F|n1i=|!xDd;H8&?ugB-THHhwi_-WI=_#md`x7wW8Q=UUZT z%j$UiXd;|EoIH~}oLEfTJ2nOe31uuf@!r(UsWr*XndDfiu{Uk++t}vaoO;^aoZCaW zhCrNYcBM?7Kill{cL@IwqeKZ2-tl$Xj5tz$coMO_;||G zw{~W|KV^D0rGA!Q1S$EY5@+Lg59D9jJkgO~-}pm;{Hn``WB!B+-eTcG7#9i;k%=m` z;bW8$csKb=n0n}{61!h&8YL`cSAktog3=RpLuD68i=@Cn$r;555@8}I3M=WDS3&;) zgrhHXoZatZT9<6T3s9RWd zOV(DCb@yd!TC-L4S?}STj?%RLgi>m>r1iYjDE12ZXM;W;r}crNDX=huu-=FB-%Q>| z{5KiL#K7U?XXoZ-=$i;C z&%oq0N7LBsoFy*>eK8nbn2DT$?L_qdM7r(FS$`$IIUZ>c(QC*u;z(0(@b&WX{LNIpK=QId`(#rZ1 z9YQWjY0OC_ilJn0rg=Zxy#L1(AP;2%Ij58;npbVBkz1~Z;M0t_J$d-OV>gejwXiJ% zNVFrdHEBb*72&F6Kf+ZxZH5B08Ok77b_CTyjo6AooXYQ`1)YSd zMAxB%RP;at{RikTcvu6l5n#=9qQR+#_0BfV zR(e(d&yt51e_e|M`*4Zw2G26!XT(yi4Gx@e1_3UD1mFA$16M()p&sguJ9Xa4{9Ei=)Gz$DA zf>3}M@0I48FM^n0>nURvgmA^DdCHg~LcO=jON;uZ0tZ9%O}J(w7wpB6j&ZWFSC8D% z&|d@+nJxLHm!Win*Y2Wm31^s_nD@L+D_id4!EL}#H0UIRZoi999!5d7&! 
z=-`8JN_@1yJdsFox=VBP1+X5N2=?@G+5*x`7GGflgvWucaPg}ob0HircM31Ih&5Gt zsINiWgs}yPbjRw*SXx;N?jm-y!7c&VFnlP}vWsomwZ4sQIdI#PavZ|V1Fnq2%R0QP zBdnu$Z8%jm0OnVgI*4ViO^mNPQl_0L^-j{noFoGF19;ZY1U8Rw;pj}>)h3uZD>ao^ z1LcLW_==NDZ1#+T6$6fo<(Jre8Fo|%D>g&gL0N*;Uv?Gect(od2ac=QB3#5IBmuLd zsLDuK&-3XK-8SrBLJ1}P>*$aLWIIB?1rBG(OG!bBit9!kOlf@MXq1k^cGqhN|1COy z2M#Bj?CT9<9bzZSO9TT!BG6r|73(&6&kwM-gdkpCJI~KzT?NVBeWUH#?DA|(merXu zx;j=@m(g|I)pf1eGd<6-JiAbO5d_kBz^tr>cX1; zJLf(;mu}q8+V-dP`}x5F#Ri-qFN_5_12Bl3^imJpy+vUJr!CGeM6}Ck{5==*;#al5 zH!p_%DWpaJ44qYQCd}Xnml^=lHGdN-2)d@|o(Ew#-K0l0<~15dv5pH`GMSjxwx~TO znzAjz5Pbw%^IN_K+O4fp{2KL3GNGivM2e?fIN>OuAX_V~C3_((g0P5a-dNDv*dkaz zU^XRMP_;WdOk1k|)k@GbsNqqx27qNf?{Td21cSuy8(2sP~~ ziv%bP1)#q}9hsmh!{WgFLY}5j5*)C^V$1k&;`x@P0Xj+0uGvAm=A3SRyd(iBSR25r zj@Kf*1yM(+i7y>vFDqR>*QZMb%GS!KTI#~KTR4bdx~-JP!=Nk0A+GEa+D_1blZDmO zwoMO|p`$E8K%dwy@xkL=$MBlh5ckkQhCR#_Lg!DRiC7>SzpeB%L;>d);sqqZEsTnZ#Lj@m?0B_96 zCl&&Jt`=zvG+13mU>hhMge|la{&}#agCJz0A>vO?J{1Yh(zyOB&R3lJlAr`X zB48gDo}UW`X&G<;C2C0ZrTiG}ttpWfQy55s4jlGeDtjie2{ZK`W++Er1vt~lvTR*nsOe^R@c3&zoAdQ zzBce%J-_8&+sn2dxb3|&!ajF8U41655d2?URo)!u(=#=k6f9-6YL;tw@96OF+F&i ztv(%BZ0OC`PA;EJ99@;IzVxk6-TKtp#CN7XoJw~cxTha}>=|mWM^~aL=gyRQC%8{2 z9+ttC7`UN@9JYB^4}a_At&{7+>lf3V!`|Tg0lR+no&WGqAv%*VJ{um%oA1{E>K#A7jN9fO?^H=En zH9CKa&d2EdHahCU z2=EU=b5IQq5&+7X zOBDkMpUEo;q>8RJYo=$E?HPRtJ6yVQYLlX8Lubq>;2$#An{q0I;AA3GPJ;v$+&&?`WjDAh$GOqlS$1$RsN7Op1C zE`iYlrnBH9wpJ9=fC_Qx&jGWbD?~2{VF!!5GQ5DmTkEbj2o~U_KotX=DAc=T{SVhUfA?o#i4^Lu=G#C67 zj0IT_F_u}#LtusBBpKL_ibwQ@C0}fX{^o%`0mr$Pm{FpfL99;6CVio7!coCdD=`eT zmi9tSkC6%?-~_S9`W*C_LKtL6qc+BtXRK`?Btt)&Xc@@Xdck1ti$5pFMGxtFUey@= zCZxn$&ov?RIy(6Lgt_(F`DqJ2_(9QXbl`Iz6s<*v@U~YmBnm-zv$3}j!oh{p67w|Q z%oLnhjY|(=sK(p8Eg|Yfhs>`!Ow0!;WHyV`d310Xqn+r5yd<%@qv6varIu;`1w1&} zu7gEL&-jdS0y|G4v5(ul-lrx&%X89jH-!MHQiISe4zN2_Ler&KE}3>-D`gV z<|2oaY>$2J%i|#2GdW^clWp&H-t1iS+->RuOD&By>uyWA`(l^S)GKA&4^~bZEh;_+ z_myD4W4ijXdc(Dm<&k)Ad@-$WN{+6+li#@N6b zVIS_Q;-0ZPCi|hTA!}<$*|w$h+rZ7%)+MG_Jt zAMU$j`9byftMA!ggrYjyGLA0R(Uo@W07ETO%;)q}TkJYi^`!#i``fkXj 
zz9&;Zz}63>>vyNYbPMd*Y7)cmd^T%yVS(#glBZJb$3AFYonCMIUe`xmcO>l4(R9Z# zAf{T6vGvDSMmFj_@sayIN3-^-xIa;w=t$JYL#(|uYjS)F*VqGC%GGn%+?#8p ztgc)WWwzxTf=uZX(3=aFAn$m19y-~-*)HJV{SUNVa8}UB|Kr&#R1sT)N;V3-twAMA z1>U!zLK%7AN>&V&>=&MFYv3Wa22EgVP~0ur%-QoctY^qK0S@y!>lS}gizDW+o2SAS zPyB|$|0nhauzmX9><#kc4sM46UK@EIenWwD6ZT|V1UMJ;&mkS&F#iEUe*q5J?T5Xm z)Oes?td|w62Zm%2?H0uS**~lGfT=zyxeJmcH3*YDSyMx)QMRx*_FtK00sSMW)x|c^ z+Ar#Q@>^xYN1+|)zr=<-49?$IKTjkji`zu=ENJcp%YoCFbucjQM{BeN$*GAap zq4iU@Js-WCsyYlzzqLN`*)=8GIIu2BS%$>4`@A{$lZEpY29pj_F z0ZM$aCzM?!zSc4?Kudh>kNZx_-w>FCqK;lfSOq$IY8u)0vI~SuxFJIMzMS`X4caJb ziCUw!s2w!J;%%l8gfonV^edh@k&Cvkcvx{GmfEty0SY|}gkj~Ekbl$>sTcgj%PO=s zBU-wGFCG_fg&Gppc|xM_|AF>>9KPX#{jQ>=DQO{*_QM*YEfF_K0vs`2f3!OEjV;p_ z={b+3EfNOOj%=B>i1_TWv_--|+9O-0Ez)ZrOIxJgAnod+$S|N2&j&`0!U*y?Ny#p3 zJ%1|QWKlp{Q{tQdPyj!6mV|LZ9}pU<5A=L3!^O@3J&dE2Py6^%LGuM9uTEIPNc;8CBzpC z`OnneV6kB+5p*xwpC>zx-FT3rdv+=eJ02#^FII_B-C*9~+2>w#^PYG-H*TDmT?j|q zXM=7yEI2ZCZeeaA%x^nyIt_xBaorPWW?V?%2CWpF*yah=!nvQ{yav|gYbozh%CQo zUO30uTmMgE%|bK~WKr=V=7>uzP#nz7`!1jXHGY&5KLyGkd&D2oiTnd+gAM*cdNvrC zg6_h9zq3b-*vr{3J|46ej4FHc3XL%G7R8l|2vlMs;}TuW5F42}T*2s|MPLQ9jnbP1 z^U>{hVRyF8ow2pDw$|j?G;Dl6#oD^pn!eNaVcWXr*E{ao_NMe_Qv1)O!L{w(*c|r7 zZ@qHsm9SWQ97dnhDB83iQwrScpn$U_38bgYO$8I7jZzb#uAWTS zb8OdhS$$2S7tR3P4G++*W!3Oet=RrV59lmv@KLPSE-hb*UwmgVaq8`>DcCuG6^mL< z+i=!qob9Z$9X4{qw)3^uGtWHFKJ$Fqc_gNPVA-DK4T|1@gY$`H5du|NeRWpv0y~&Y zbvs+#{>Y%RYGZgVroJYlYhZN^zufv>_s#CRx}Epo^H_Us>{&jZHMlYcH*0YJ^5OSR z+&ppD(1Tt>H-?_pi?>U^YuHst@`1H3)i}7`l^Ht04xLC1j-~2PrmZie^e@2H`^HW< zir=&TnLDRagD2AsFQnA9_Zzma9$mMrkKGvtOQ7$Mq;{W9H=F?v&_)$Ip!crMo7FdF z^sTJEHR->nZ+}ptm;(Kl#+x3udwc>)k{F#~# zwx%Ofvx}|Sm91^e)OND9otfG_Y%P4mZ8M4+l3OzF-K=|ep+wVijsTcaf%4F%-44A84AtrqQ;*V~s z=@VUrBkxym9Q;LeeuT~^GJJAi0+nr^TG1PwJ{+w627sc?lQD1e1pm-3X!C>+Z}WtZ z`=8S0$(*-&f`9D0VDp3!Z}Wr@Z}Wr@Z}Wr@Z}WuEV>VAne0rNF4Bc{ATC!Oh5h#aX z%|y_mH4{OL)=UH~S~C%}Xw5`W-kOPkyfqU6%dD9Qt<0K<(8{ct2(4(%M9`u&6G0!d zW+GJHnh8N(z)5<;JJ83x-Mlp(vL{?0em}B%^Y_;m0QlAzz0iHX|+6AtRCCf$BmX60}$?1T9tz 
zL5tNw&<>%NwiC2dK)VRqEucFHx>G=V2-+*4eFW_n&}RrbAk@+>g6$_btTHJoph zlRV+YLK>%e!kGf$C7$rIK=|=1gph9(Ql!twsaEgDo)8o-tI~ggd5g)(5ZTBh3PfOZ z09!HiQxQ}x%gBOqJs~wvu7+GA0o|0<5uaG8Nt|IVo`>Xt z9-Wn)mP)iqX~w;eb?SE83k0lV}@l`o!>{7Z;R?+pa3IaSjBpsGXw29tK?LM}4-%lvS$ZYWc005;n Ar~m)} literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/fused_marlin_moe.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/fused_marlin_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..77a99d36d6bec6242a61b6d944171817894148a6 GIT binary patch literal 30989 zcmeHw32+?Od1lXj--7`#gTXXz42b*S0g)oXQ>H{pJY>0GI+G!~K?0n?fIR~qa6^W( z+60J_5R{V;T01Mqc1qC7o27PLC3&ynZc$2lhD69<(j6bzd?ucjLFO+tBs=W zPz*IrF&ajj&?L3vS`G5Lgl=3%e)UPixKWmwlI98UPg)4I5^5W_6JLG8k#vqb32#Wa zlJ0SL(lhQMZezlm^o{!nZ%X)+HRCnOz<7YT&57D%-FRKHe!QNzEs0<Rl(MP6S;9%NgY5F23{*fu6yC5%*_MhF?H5`wNG zZ>l24FVxIx*(&~(V_kx8GO4YHkIA%D4#7wzZQa(lZCl^=ZGAho_3hl&H@dBF*S5a9 zw)O4a*0*O{-)ikds>xo^3TNf`lZTkTR}ACrOg}TQZHv({9%xJblm7oN+s$C*x0f}o zTB{nW5^EUR5Vo`t`29uOz;0+i75li#(o0&k^+?}#`_}#sEsP8r@^}74S{SyW?OWNN zZClx1Xk{H1YUlr$qg7kjzREhm{4$SN+qb;(mh=xgT`+PeS!@5_jT*n0PLDF9pwpdN z>IB6+^$Nx8XYJExi;6F^@8~3Eyo-c903n}dyHx2e3&k8`4zf<>5NP)ow#J_x-zt-9 z#tAuI*y=lJWX7OmeI&obkl&G_{Q7U|qQ~dM$LG>4L!U|=qvtYHiK)yKo2DmI9DU-2 z^JjWL#x$cE!F(j1NW?ED*wf(fkth)3%v^jXb0jr0Idv(j6&fzaGZU9tCYDUGv3O$Y z(oF1PA~kU(mY$kt1%D+eBvudUyuh)uagH6EVUADDK+JjeGzcNd(D~HtvoEnzmo8_( z^DN8EO|VDV1bZo-VTIZXo~KjKV?0NUV=>p*L>x*axR|Ll7CtsLGnqS!K=+PDlKh$`YB(+E(^H8#!IHi{bty5Iicihg zy*QJeo1IN@8Bowd<&Dzw!@czLGl`ot#-QUf46Q~g3YVrqr)V{W-l$P9WK!J3WkHvk zO$*xC5%@rmKnxOyCFc@?M@>Z)SwA07%(2HfF2zN4oCk`_d6D>lr19&}H2u>HeVOEJ z-|Wx5vVGIslruN2wXe7R(A=N5_*TAr-_rb1v44Bc+`e{geeQ?mk&m2pd1w3D z{8~b!G(%$=ppG%kBc0A=6)By?`Uvu&xB!szg5&ZO!>}{4bOu@mROa;o_!|@q7pLOs z0qJK@Fkc^tVH0418CM1NLWq| z8%xb#^M%G@WpB)~Tn1X-tY9znFtMcIP7_^$1g}8RrWj~*a#-X8QeNPdj?Jdhnb<6} zSe6l7WfdzpE+tYI;|Zz21EtPE!FUN$zA+&G4hjy;6pF@7O|bClB~Gad=27ZblO2@9imro=n@XITY8yM0}{BNMxO#t+<<&nCMwDu(R 
zYZ?fF;me+PWcLN=5*8v0bOyU0@)S4Zg(g0@7|5vI38ZCK!WDiQwPRTb@}b2;g{D7k zFN9J!KEj*lA-9Z6kuVy*2_-zObWrG;#FAIY4Ww8HWzTnX%DB1^;u{tRo~#Uf)1>z2 z%%vfNeFAy-kn&E-n@kpJq`6W!o-PW}QWT;=NeN4))W$+Xrbf-{XQ&e;z6Q0VLY-v@ zeugPeV;cH9Zg|YHhJ5_iashz;PG?l$gm=B+$qPEi?<9Plt8USx=1tY^= zgDy!hFlc^o#0$si^T(rh!HL#bq3^_jaW-~EaKkW2x?{D2HbAe#&I;y>v8z(wY$SG7 zQjmVZk|FLZ+yO{haKcCyE0{xq3*D2mLxW(Cq3tHvAi4S3*kmFV&wTuU;Q1I$fT&l{ zL;smX69BqM2Z%WADY(N(P{#xVo1D$u6m-e>4ekg^QCB%9dg(966O)2|VvdOmH8A|f zq!$OX%T1?|gkGKCoLc+%B<6$n)A17<~rpnq$#64z+=6MuK0KYemwA zq#a2I5@-k%7e&&AWEYZdBt1xak@O+yM?!3d0ptde3?UgtGJ<3`l08WFBH4#z6vd>E)C9O6zv3TZ4p*eN?d95y(?1A`1PNK`X~i-R0rDgE1^ zMqp|j*HHHQOy7J8@9+-k1mcaUVUJ0T|2w(-7t4Ajy*KF zv!=QS{+{*ZX7Ax#@8Nr68~)=^%uD4iE3nhJcw8K%?B2yw91UuDoO=N%7-}%NnzH7 zX`boSiVJ#YblDWt(c># z5quM4orj(k|B6zu6{SE;tL9~)niPLhR=eUNW!)*XqbRg?$F!YAp{Z$qp#%z-Q4PlF z3MOHZZ%~O*_?^@$S_k#uQhD8Y)yyYwHU{bo@kz4Bp=mQoy7CINAykn8% zlR(LH8sgi4f{_mt3`(Gl zyc=kQZwE^A9YCA-PN2uOF1{9M zH(v*|hpz|P%Ljq>@gbo743=|%Z}^(#cQlm@S;ot^@hyt!yU+!uZ!7eDjtp89C_~BjDQPix)X58C`}L= zdM*va3Z1#0qSI_94WUSCGKSGjUmqYC)b+t`@(aUG&lOr4i|8afaXCISl}`3rNu(Fz zTs+AFtdSli?jE`XNTE9kLe+a7Knj4IMxn?oO<)f?Lqv2YMQ1>Ucr7Vhubd1NqEg(+ zDK4GCV&l6>gB>jwT#Y$cF=m>bNFlPeZB!ZG*^`DppsO|sF!+0pCMslb;&ALT3-A*q-~Tbpkk+O3H>>#M#+Z+j1XECVE;*WCX-g)0suyWsg*`m zs`>d@eB*e6#xV433ShPLwo3Vnrll$>nwA%*Ah*tosZ>H%OAuZJJpr{OUQ;t=4U#5i z>eRgaoygZaOBM#y&Z4+?kU9rh*X7t|` z$25|DB!_@p`tv`-=kE@ZS;Q|f0065?Iz&ly@Xg<*qBR^Mrh;jbOC_Pl;_#aYhB;93 zVZnkwdoK`h6XvD9}s|m!{td1%qt3DcSLJqiK#2Bpo3Xebk~zq>qU%@h5$E#gh8?k z35%ovq)eid04_f#q=IOQgsWUa4pAJz18s7cj7o7%Ua%vJ6WCNaTw3M6guyh&1U>dw zm9QScijzpGL8h_bINUtpBSA0jEy&9fB=fJ~FMS_u&qeB!mo=2xm3P%Fy__`O)HvLt&3xMoA1_vnl@am>zeznXx4l@ zyX*J{P**f>bKKgqyk}+L)zMY$^1;>keOn}}AI&z6ZUD7Kiv2s+;;;AQY@O?4_ia5{ z{WICVXEuP^dI&bvp0l-oXzR@CpUZYVw*l1F36VE#p`0!Bp)H)%AIV0JYyh=|tIGl7 zYRuUhzdiESfj19)x9z>&cYCwVV>#PcRzH@t;VZ6O16(Yt50hkSB}nYsHE)IA4Bxl) zX7#;UTkj(?Ro}g2&)e)vndQAJ@#X!i-B7r7--fOGM|KZDK=s{-wggwt-Zw?^;l?a{ z$)}zN_-Ca%OAp>X`ADbTZ_MfUX7&D+>1^{}2y72zZB1Ey6VL~q=1(cwYy455MbyFX 
zU)vm_0e(L*Qg)|kLe5ON{GtUpD{?mE?36bsI*@ZB=R(d+1)4<X_Vpv1_ZtU4 z40Y#gnwO7%R1@5+Y0K5LttB^VhL?^%G&vW)^1$W4b#wXVYVeK7>ydTu`uV#98}%da z58P}2(Dk{j`E!5m3arep?cQ*8WzAg#z7jl=VID)PK^kc%if6Zo+C%H79{pPhecEn` zuMROOz*8`M+4GL<{yOysMW_`Y0xT;?uq>St0(|-}WV_kOyE{VOT;66D&&)Se9X- zar?R_2m&mx63k0o#48jX=Ce)8+_lm-1Vl=y`DR|bLx>hAi#i`K7g-pt zgx#o!&2+I&%E@I4z7-bhijW=6_GOh(FKK}~QiOq(qG%Pgno&_S#zt_SCKZ7zM#OX# z^LHq<$u}|%6@e?HP>g$NcdScSQC;eG4DBuo-N<+p+>5a;v@+g>wgR#Q3w~|9<~u$G zE^<>RyaVxTFKIOt0F1<7Jb7Cn9X79|K-7eUB&5rnLpAY?rRA?qawSsy{j z`Uyfd0R5ns5AO6XGa7=S4I-WfOUc2?F%aX1wk&Hbh9X$hcx5aq8BeFxxJL@Htihv2Qz;cSx*X4dcrL-Hh|Kk=%w-aP z%W|I#Rh>>{0K&nD7gL$bw6eRe+(adq*A^jcSek;VBAd8L@&F8o{#Dp@)mw25_E!l! zM6eoIku5@KU}qMIu0~CkFW>&EvcU^rIouIe`t~K*nuHk?-U{=>;*@VyEFgpg(ZT&q z-J}#~%JxcuA4>U%@i*i#p=+2VtqKGuaCty=hnr7xWc{;8p1aV-oy_>!^ z6^Ad4Tijp{2WgUhfl^V6@{SgK6?Bnoq?CkK$!p#uH0lu zsj!W}tIb|H01$6ls&G= zYg9o}zwqs&P&Ses$hO#nay!P46_;Q=RX6F&P?BVP2Hpv`y-B+sA?rEVv3rxg2HMSF zK_z8D@P>SGQY58GZ{G?45deT_ZUV70C|i-TD$P@UH3&iK0=_@=UZg>C0NGuGoqc^$ zNtA{&R^D1jfr$Q*CWE~hpwB3na=xkHe)1R}kF9IPNwToTCqq&k$o6xp6C?QR9Mrl_~?lY(P)--P};k;2u76anQ3fW=+G zyb)#N5=dr{q>#)aNdti`&VInO<>^EW0N@yTf5BSV^}{V==rJU?jP-NudLft$L1HRx zefc@j88o!Xb|UT?Z~$|iMF9d-1n}anBZu<=?go;ZNam4z83~VM9LWNbBS_G!R-rqz_a& zMd%+1C?&yD{}==QI+9A?q7yvQ2z23~@??($N!2NxMEMZU-G`f3^1)rOVXG9ff{j~%sJz~h_cSgUADbzE{hiUFAPWwP!Qf zn+x`Smwhk!ZW6X1`HhRum7!dY#!sn;)0hvmYz8`WflfeM&U_f?6LlaYkQe;QkQZ_i zx;`@ zT=irdy4T|yj{e2t4{aWKr_}_sdG*2N2Yenq4ZXl`z`iJ!8rQ(ldTrroALC}=P$#2f z^_8$8MnevU>$mkaZ0l>>*4M-uC+$nHb(}H3;utrvX2!x=Cbf+96}>Y19=F25)yk!_ zSCKoa$emT>7S_f>zS4h_2HcD~X(dPVp=>ua;|`b^dIZlYvfuKUwEy66Jk6dK?9#?H zu)yGmm0-C(Fcc%F56EExi3_O0Sn-msPy@#eNQe?K8CNIN{rgZ7=({CwaUGc7)4=97 z%`|Z3Pud4^Ga_ zC~n!rQ@;(?39oqvlnfs{tQ6Irp6@s(oeh#Fe#sk?v%|6L!?3-VhPe^0f=RBZMcRg3 z6hPXiTnNJb9;D0tK9V~~R*{e`w+1*q#Bzc;1_wN*GO^g(X4s?5CMLPx#UNFqSz@t? 
z1Wf5;u{45@@_4&QMpW((Kr$cP^0j-FJpLz+sc6Cc1O`<<<%Y$PUH4tP*7?o8llS{h zij>x3&D%#{1;Ap>x=z9f6Kq-1|CTdfN5hXJ-@boI|Dm}h-!_WB&H4If^o2t~%ch*U zZnfcu=5_)*bGVOdLatR{v1(;qz^bTrmZXvvaKY_23{I;MEM_A;=cuAM6@b~wao*OP zjvA%f|2m{2*@0;N30hbJMdR;n4fh88r152Fsm9Sxn~!X*@se$!R?yls+SE|%R#+F< zbkR8%z3J+^@9KN3qa5{{b~B@P!Ha$H#PmgEvj$$pYf>-4QrJ)v|xbN zthP%jH>8YuByYi3SOS1Hty4hh1rx;FuB>{pkuJvTE2giOpi2I#KLK&OPEtFi0|yDK zy^ER_EO-Q~KnKwaCY?(B`kYqm1y$U)ZO?1hW7hA`?o^Coo13hqnCNokv7n#9e zWP;`ZE@~wWko!ZFHX~_4f^C2!lN4M?=Y9pr1td6UNn;04n3l+^;fPDX>>p0<7qxv=JaEAVFETXpMaz8TJ%d$X?IhhQ$B zSU#~b_Ug$cOz3clV>;V{Y#pMu>QPzdH1V(SFZl%frq}(oddTIta&zk zEoL~z}H6CJg*(h z+D8l7tR?QY-o1KvBHKE;VIN&NSd`S#*h5$CEqH1;Ztt(}BU2Y~X;CMRT(WmU|20!Le*e06uaZ+EZx;hczQL6MbeYJbcArhPq=qlZKritH5K z_6Gktzs}?$gQ5dPPRd{RV@zG_q>N5cYcyC_)1Shg29WD=rJVta4FA8@lY2)wPsfIkM_oomd;ny4rJ=j?Wz6L%sxgb8IH+qpYbt zqM@^{VqtguGFjMQUmGR+dYstWmG*FhzYVUXvaxmu?O-jH))gyDrOT2azP-qPcgcA= zAWb)1?&D!x@aKj<5Bz!I&j){gNU>F|SCx@xAXp9ugsPEpsK!s~i0$rX0+5%FsjXt~ z*Hw|%SCI#+$U{}+4OQe}7Oy1oFpX9GrP)?wTVHxx-=-?Q%~j;v*Ln-+N+Z(>by>r- zRY{>*9m4@lxZI*0;s(eZsul*UPQg(!KAhg(N-voPp@9l*jC#}y1gAkbrb!Q(f`IV` z$4b#` zJ#Vke+iUW6f8O4lw>JU*V~0sLF}IrU5LF1kDi&i1XVJPJzv||^kyvs*At&#}@cuOU~;~-Fns%fH_vghR5(r~z`WlOOl zlk)u^%AtR}nn5}`8kf_@{NXFjbSho1 zQBaj9jWMZJDnch{lUIbgW-n9pUP4&ofMrLMfN#v@-R5`)!1 z2%Mg$ORHyVh-a`y5TXVxvr-P%roAI)@7T2W=j{Ct?9O7RXwbQ(dmpw|86;4uY214t zsjO-qRgWev&&^zcbi(Bbq7~Zt{bzizO1=V`e0-QrwQMr z8A%J0P9zgZ)I}q~5+ArKUkhT6D;I>=)KU%%aUWoWT|g#q!AE)2+4FtqV5;F1(pDAN zngTCb{>a-{P#4I<1mW-&d{x*w0$n^%=;na}UR7!P8~6ZrRSBN}AHcCH;b-6jNLHn? 
zfRCBMV`DJ?0%|4OVQdh#0wx0fcCx|6L5`0($)*_>gtaSgQuu|iZtzi&v=Sc=ZvlTV zIjH6%+iLt|iw$n&DXwWb5`+za4}RCO2YhP52ce|02Yl+l2S*dn+JZI5o_YK@lj7=Oyk3uE?iUgWLL}V27xK z-@kUju4MRy^%S>PGy#XxtlB%>uXT$SbiqtVa@o*jrvkw{UwZ9Jq61w{#pMDQtgnP3 zq6axI6>JxM$oZ)NY$?XD0r1@ncHa+ni?!%eM|B<7i1o+@DSA*0A=g0F!72*|Z=}La zZ#@4`o);tNqN$qtJI}rLoY;h+nlV%ha;=m&ypV zsN_B`JqXCa(+^CsZB(gKKwDFR+=^{XoPThrY=F8T%1JJwl7<9K8}vY$NMn%sno=D= zqy)}l{2^)q>dzwez-+r^TehtnTiyLbbF`S(cfA^DF;{sj=hFRgf-C!1GJ!FbKY;To;) zqn8f$op3n}N!r)D$o)$o#gM^wQHTUCxueoA0jUfbxD8eU8NhEjWPqYF$N)uUkO7Ly zAOjRhkO6Y78iRlB{38l}M5D$Kz3bf^IFTDTfs!cXDFX@*EzTuw)76!8b*&%X>^_m} zK9O~u$XTBG)KG>qV0y!HdSOHVLz`1Hz%L|acZnwCV8PcVT9AVkO|NJ}&Q67C(Se+k zYKVv~lfhKGegVWk@xn$ zyPvT4j(&y`sd*4}`(02s!3j1NOidKaa{@F^eT=}7wtg0vo!WW=7ggw%oOm{$fm6&XKQ(2ZrU&dtyxLHZ{8euWYXiQA-%y!>SqEX zsE>+X{T-yW1A3)nU=^4#u!c&ejEvG4xAiq`>ucWD*TR}6-DI8E3hTsXSP?c!DATwF zcxxpi2K*6mva=2d;e;^C+p4cmQI(}CisdXqvD|W=cwsxDS8&NR{K3WSc%?0%S2_kR zSBERqt=phjSPr<{nyHmhtSY>UV%0%jKBm4(%>=8+LsjGrRpj9+^2RFi2%uQ_zH}9T z3B_vK*0*_E-f&_sF6z>q%XBrs)&l?4{f$plqGt6)*bTtKYAwkTer#|3-2XQ5Oy zOJ9U$J%IGNzXGBx?5Kk6ODm!KIEw+ZM0yq_>jW)_`EaEgQt`0wV)uh)`A?w#2~@|! 
z9wQzGqe!F&7rR6GNbi#Vhvo*Uv-U&$5_}5WQk|hag9ur5+=5s(^PLq7sR9-S@a#{a zyamXjBvbH_GR3J-@nE2I_ezJZ;qaT513kosnjfxMkR9-^KgUY6L7HIdxw zudAUTfL|E_m#T<7z&(bV8AKXT1Iix2umHg#D3^3Qr;3#T1g?bv8QjVQ1rRS=X=q*x zGCRuTI}3G_x&;SsgUfUrh>CIl39^GJ4E8D<=}rhZ=6q6cj6@V{MZlkwN@1#ubL})@ zA4=CF{Yw20PA=x#Dt;*ye3Buw2l$~DAUw>O{bcHgn-Z5?`9Q~#E#KI@**K7E9LP5A zd)@PXX7k|L+`+R;&U|h2+OOnlhwi3wwa5M>{lUNo@$A`i-)H}LAscuRq6M1PPUixH zcVoH0(R=Z1%`*t6g;!s`>&itA-t%NVN0y9vcW8BdJ&<+vXU+YRf;<6!fC|t3DUu%` zDb&XdcOViTv9%?As^>}1$F6+ z2p$;0>pG@j)*<0z%aBYF#M+9^HYDvxI*?pM!XQBnD1yd_rVx#KQb5d3c6>{R2L3Z9 z_7_MBS&>|-VPISq0#}Ev$Si+8Sg}bG?CO7DJM34nE7;u7lQqAT?R#khsH^9JtK(Cu zRz|Df_hDoIX5-=ejfX`8xMa)kz4`M=ppMR0ufQ61>&ETBK{1@~_@~`lV;(8R~HTLo?9zMUJ?_pg?G{7%F4Z*NzLe5Op zc8M0`tR;wm4Mp$`i?)3@P~VK^u%U^wV1U^wX7g5jVLF`UnufRhGE zRIR92x)GGzRgb%yl;=FyZr{WJ1A-|QV^R~azsWU`N+jSg8JvYo_r@G$`Krl)X z1dAGj1VK1Qjv`?_yPhbs39?0FWG$BDebpBTtkF9Q4G0#UrLnN(clc zJoQ9z$rWJIwP57mUXa%OrTGv}dWEhhN(TQ0leBa-G){Yj(QRUtT((6b_R2t*lnR;@ z(pA)G7zmOs;J;=d-wz}mMnls;>hivWTm%W}RtZENN9Pif+dzbf0@b}h?qg%+iO!2N zFvi3EY%JLTMnF43I|Em1pCWtkxI89+m>lUhrO5}4PK2BU^M0aS?9<#&!HE;Q^#20z z4-Q#rG%)aKTK|@6{Ts^uw^ZYQr4Icib?C3CXFt>GG}<4VwHobbM&Ld(Q<~3fKGW(o z+D|ATpIIr*ZqaDhSo3b5sKeiIGeJfPKe*?;?svP1{Gq|}$N>K0Aq`a!sR=ng~;HjQIX6>WOlgC=2l`Ez-HofDoz)1x)t!*L!4ej&3%OR zQ=X7mL)ZY-(jnFowvO@y5b|ElBHZNMnDkzidVW85wx>n@S?5LvGb zx33Q8>=Ch#$oflU14K4h5_gEmhD&54M7CS5wAJ%DXOp;x$o7`V_7T}A<*va!r>jiP z)hs?mWc#TQ{YLle-Ru5butz*VWKUDU$Quo>H>~S(aIMinA_I+))(E6Rj}dji<415K zkE9!T<&M$z8yCKM;n6Y4cScWj_2nDse4s(nwIe6;?Oi)-Tp)-_=dTB)gMaN4?h==k z4jrn}p`%FYh_g)Th^$QMh^(M=#92@};&hi(v4_aKiX!@m)32y$4ROLySu55OwhooC zp0GixzCjETwgGi1OxVVqbSXlDwNlQ&${a4(tiAZwg*Pu`Vg7kE8`zt3?0f%c&T&X= zBf;8B6uyJVI!nqQC9*QrB(m<3FWN(7WvWSJu)tchV|Rea%2bochD%g#gviQNlgRd# zsNp^$8!b`8r-*ESi5eatvZqns4ia{YMiMC=i1UI*5=XH{62S`wStG+OKbt1*`(knL zhmMD^NQTR426PlDB5|6P&d@@fWjaV?_L9!fL1YC*Bu=-YWgg-Lv$yCw_=pV7Io66b zgq6BM!q%bY)e|-dnzzE-wu&L*1Vt?BAj3ok)0?7nBSZ#AAd6&8MAlrQFD*pYS|V#B zGMEAt#qA)nGMyx{t`dFOMP%J2vK}JqEzy@gBI_^FmjNOhEYX)CA{$1186oWMlA`V* 
zvc0G*p>v7gmwtl3cak11q` H8T@|%qIwW7 literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/fused_moe.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/fused_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea2eff93138eb5f518d386197795d4ae70c0a953 GIT binary patch literal 68741 zcmeFa34B}Ec_#{hAOK<`0FnR+Zr~#Bi%3zVcA45&%GxZOv4cRoAVrDp*ui>^rMC`W51F4b)%+!6HAvj zW**RKHDi{3YbwSzX79JB{P|-J7VBiLi@62;g)BCAv}nxT?_qw!Xz^G{e+ly&N4;aE z{iV!r8Z8?u?=NTmywQrW%KpkRU%zjxs=tbbnMbR~YWi!&YWr*1yJd94SY3bJSbcvz zd$*1@j5YQ*GQVxKX{@=wnfdLbEn}_yt<0Z4+BVkS-#)gnf8$t3f5%v7ehEHH=ji6K?*4A(ca8RpZRy{_`~{<1$F}uvWB$U??PELocQAj^=+3cS{kz6?_wQ!! z?$Ia4di#5s-!r;rY;XVGv3>pf*t>3Ye|j8rqX)(gvQawJf0)GFE1I`cul=XiNK+czHS#N&JhG1~ zTO+)Djqr*!!YkJZ_pK3LwMKaL8sRl-gx9VSzG02Pe6WIRdPSd} zjWcV^MzeZuu4Rqzf9#{%x<CHuHmq;A7sBmNrj*N(s4!DoX{vs}_xJh4ViQfx1$8*JnD zypr4h+#2hJTC*4R|3I!a^ULj9BYZ#d99SdnAmR?;?=a%kc13EDi#wu#9Y1$;jXoS( zBm6kZ?+f;GC)SAXRSzGua!?E zFf%T)F9q|DmUIT?N@bo+`+70sPp4;`#iBh=y^@>W(FRgb@pK9*&SAEnMt{T8TsIv&UUnBYb#`@DYq(FIuX;Bc=L+ zIkZLMRLJzP;E@1>mfl--8lJeTf{BnG7$H=|RyX-^yZR#;H zR(31=c8-*qR+{vW`|ihXzdX_rjQAw}p2v#6931D?>enY4d1++~7IH5rR`djL{42mo zFZE9_jPqlS&#kGqb#m{qU#)dAy|~8g2y28#*9cz^4pmnXVCp4BfBEz*{rV@KrL|@y zd-m2`+3NjE&(Bq?fw0`p$NzHQTeBB0tJlJPevR<8_PVuZ^=AO1uV8n`#sGdceOeJN z@N6mF2^UZY3KTpTJ%wm3pVzS3=t#|IXern?>kLCZGyMQVz zq#)+g>~L=R zrv`J%HT=IO+^weDei}$y8Q^NalU%$w{>!!(uekV7-Sg9h%`(306Jy2kHMl^mE12RYyI@%_Ft{BUGE zJbe?AZyxZocSJs;qQ1EEM`PO|L4+0*;?`H!F2 z??2mj@@#M4nZx~f-+Q9(z~O_*0@ggV?&Ktr?;Q^f4i6=Z{6oQre}KLCxyY4?pg$bp z8+FMB|MP*-VGfOpj0VD}ZFGF#qCY$k7!7j%A%1*vf_{TzW63&xGNhhtkjBFwd|@KU zN5Ux6kt&l51}CJv&a~G*HXig3PHaIB3(1ShheC3#!&5Y1C20GTd~hPb2YW-@f#DF+ zOa=Q8coV~NW_;r4x!~~7g$N=~1-Z!qM4foDQJ=K=`Cu3+{R4r43&Es2Fc2AjJ`fom z5BV<-M=p#{M*L%wqse@3I80N6A)Ocv36-M$i}=7i3}nu@?p> zy0QbhDd5E3C;i8t_Mb+jW7PigvA~t{LBCwh5FePh;74yF!_*cea12cHd@vO8qyEU? 
zI6sCl(?v#;h6n>Eq;O;2GeZ{KGPcoAV~lhq+%O*;h@h)O!(jrRG*l%$$P^jp0W8$y zQU5sa4+Sp+m~=_|fpPwFfad@Wk>CrFWC1|~YNjxP3JOHERtly$qj?`tt}_Dwn`oCG zg1HY#fk=watkI05XnF+s@*@}p)1XesyQP&Weo?cKBbcUevsC0v;em24`sH+jNzOBc z_ozkC2wEYRgJiNt%=^`depR?rC+ilMmGF;S~vSe++3rJ%ovmQSG{QJbPo)X%I( zunK0ub}L8TW3-wpnnr!!O>HulEhL^s0B`!SK=@*#o`)!fCYd`GpBn#TtRX)-ScLrF9H@l)wZ>z-pL z_8#@0#(MW3U-j0Pv<{4q0cL}KTHQ%YU|?Vp;2uE5Nh2$6Ae1z)>d~$NR$iF4Lf@+A zCuiFuV-xKY!xOEM01tR*ePOa^>z0c>!S>PN^X(H?u#iISv^TYnjdQ_KEc+n#j&Z(y zG;k%zhufvi081+4Yn!-|EV>NX4*J=!pksclMl_Zu1hW@^;W;?dnulIZLDBVwYYp?q z7F%MaJB4ZB7A?&*UGn``cw1@E_d7bA-Y z#i}l`s7thUEnT_Y^G?^Dlc(P45%-=JdrphC)6)kMHqTeJ^Ih{s(dJt?xVZ1TRmf7? zwRB0W-6GnyP9M0ZH+xkNK2VRu@?(XEEzuuf;jXtt zIqfZ-jO<726$?ORQ3@8JOkR}HhIP{B3FYaujY6f+NQ*VvB-9H{Q&=d`W}#WAWMM54 zAbipzGzcwIXthw8&f6-qusm3hLhBS0Bo%BWsp zzi5xJf!~aSs7uI~!v&TjQyQ1cN^_*+H)oaTlw+b>RPv%foPknAw}xtk zI==exwFw`sp4zY;E!T0z=r*A?oQraEI8$_6s6eRYoruX7vRa8=WLmOaF}^Mt7BasC z2ZK5V{HvhbxAM|6up{D;^Kx26$!a@}SxC>EoPVEYPWRjijL%NN8!pNoK^)OLg$g#p zfIPNlc5&wD?vS3#iSEMRZbgl1Yp+Mxlm#8x5b;P2ggkKOTNW8Hh(7W7I<9U>uk2oo zz3FQ2)vZ%`D8tIokY1ojT|%J@CD}70xYIFNkSA0KJJWHc3<`_n_voI-w@KI|>=Je> z`mk5nyUyBFndNl9_5!YW^ljpi`FW&`(S3p*{q@K#&F*h%J(Z;6_X)+aKe~UNUhU87 z)dAta)$NM0#`~w}<$=%z^m7oaiTm{Z6#l___-T5C{%pS5`^ihVdVtRFt9xmN)Jnm5 zA)my(p{`F=)3%RZlLAVL<-I5Kd-aK{y~2)Lw)FblA@pD^NjpDIdbaD^ge^kFs=b|J zw#@0ic!*+B`>-5#O0N;BcqKliw%6neo$M6b1S~AGdmI#Y2nSd1K{T4t9m0-vMs&x= zKY!V&c8HzpS*N1B#&F&rT781aG10@1?3Gz5g*@7U*{*g(F$dnv*vmfQFnWFDmR+6$ z!X_H?fAo2D+)Ve9!($VpxK)QDVPE7z(8n$pKIx*vZuUN0VtoPM@!=ODlYG!OKIq#o zks*8yAdfWqCd0#_AyyK*Mfn0D&L{q@$>nzMmD-(3#^^U{Ek53Y?oM1pyVK))Jk_8~!^b0mQAN{J6Era% zj`$`59EVAhN|Ae#>170ioEd@4=u%2d75W z0-*U35Mi*Tlz=pjfRYAXYI#e#XL1wR82DIY*=PrNMr7c8=dZAmQt*wEf-g|O6(1;R z=~6)C1H}!%5gC_9U&`?K<|*5S)~4#9)kou$E}h+x+kA&3K91JW*l-AaxqM+5fSSs} zmfZ`!L4JJ9w+HKBd~y^m4`QXFO(-v26dQk59tOqND>e4a1`Mg#&!XkBATw$ z68v2D&K5aKCpz~q&kv6dN3I|P>#vfmLpukJ7djdW0u;gl z9`ibm{R)Ht{Glh<2|)V`4WMTspEN|$u(HCkh9WQkN)%N)IEW1?6p=E7eG{NSqjZdV z3U0#K`%=x920KmJ3cN5J?tA~g<9Q$Zy5@aC+25y=?fq#ud>x!0( 
z9{+wm{2-3yfE>l+R;jsZmk4BLR1eB1DBN69KY%ew<^Wid`qA+K$2U^}y2yAk7hUe| zVpJl&i2{v-ND$f7naq_4YdPF_gl|Diq_vD(*ib;#g={THag0pF?_v$#e>~}+loDlKTE=NQ zUm!#B3tUWxo)2cciqH_>h7$R9D%PB$8J-V=@|5;pRQN$Fl*6YKX{Hourb7H~D)9+& zy2v4#6;C8Meh)c&$=OHFesYK!#UCK&AUTKNBy&KfOy*GsiC&5B$lj3DZ3-O@NkrtN zNurSry*l`tQpc7N??S~F@7VJ?p7b;q%A`d1AR`sd_fV-?WW6X6U5yw043?AHKvH|2 z5;OLQRIV}g#+niKU-U=#NzA1TYT=}wyh$_h7$kt>w@}l!lCup?_$VC04E}*0emlxb zmZZtBGG}9GB67(exO|EK3gXXF#d~nm2g&tWoyKB&t#!6_{^VVA<@DZr4%c<#HRHT) z{#e}6IDJ@t*Us;VI~t}BCprY}t7yJGG-F~4s5P{QS%e^zugP9MEj;GVm( zuwC@DEm`9QJExB&te*M3vFc8-taE8pEZrko_fGFm*c`Ef8qrp>Xcetp)B7JfG!?!# z&fPc{+ps%c{zOdgO&F~+f@my_Rc^jx+?+7k6NM%7$Hc;hTb_r>2kJa~53pT=T?AT9<5M$h3c-?U^|9I@=Dbapv`rtiV{%g<7J~MY|z9nv}C+f5-VRC(`V~(5a5lyA@lcK41 z;fiS5xYWH|`v=s{{9)0);e#Blt?qt) z(U$`Ag>yr*R~M?#8WdsQ_(87D*73lg$#>6jU+tY6673ZWC8E9Q17nV@`9TgE|G=to zmwmN&J~Dssnt$PvSk$oCD;BoS=DnNed87(^;g?R%>*vnRjxXei_C_kb>9M7MVAoh( z*FuSc!Wq-M7AN`Kv!-|R@~`zJiYpiN-v}%i#o~s=4zajpNh^9fW~>kGt2qHb;D2xG z`%`yaM*v`;`FDKf{Pm%0L-T=!vbei##nwKf#c()$iK5a>FBUr%1+jU%SifB?+CFn2 z;Vzjyc)zHGeBPOZ?^*4y9h^Owu(|*nQYO(^yJ%R9eD~zy<(v7-Rm*|fE#j^|aa*6* z&?h=iBwR%gje47TM*GmL!So^b+<~7zF{6FgQj&1G=7z6TENl`Tbu+yf4O=nI%MsCD zNApsD-(Hj%d#`A3rcrA_I;ZDLm*#8dE?>)EXjyWKSSF6nr71DL_jbQ%KS49}q?V?r z=%JNHgaEf?@u}sWyRN-6#)PZndegP0*H69Cf1^L{YM3$phz)OP)il*lQJipRRWe`u4eflr=&D}m7hUbto{b;5Jy{Ky+4q3~0GMzUT(7-W z`xXBB)U~NN3O{@Yb5gl8W%i@j$v%UATlRYiET9-VcbIXcjhv?X`{A}ED zWTy9?yZHLxwZZw53)+Pj@47e5SkSz@{J9paELW3gYML<+SXfZx*ea zUp;UiVWSJri0(Gg+V<*!2UoQkEYUqWjj7~bk>~okYv*F+txIOHeA{hr?DUztMQ3Ns z33t`Pk#AjEs$IM+*7k_*o*B!1SNZj}Yi(EnuD0o;s2uyxPk~9s@1*m_D4pja*#lN3 zv97@ACr;Itq%FnBO1(OPyZZ+&1VRv-ka+G%)1lte{ym3#PY)3GCqHSX*8dCg(#K+I zH7f`YY9wZBhQu6gStqS|6=SLj3@IYaw1Jh?#+X{|Aiz|yu#F0)R*SHaF}12>5_Ysh z*vRtW;03D(E+fNqD#CzXugmPods`-@b%7njh~7e1nyHlvQS=BSm;HdLl_s#O5tkzA z4C^wrluQ^kVwX|aOubCExm)NKY>YtPqiCaAPLo6?lXHIZc7oxuMU|;lAQXa~Wk!pL z@K-5pSjE)ZBrz%#tyCY$G}H2vw~$6iFf*ps)^+RirQvBCm|EG)fLg`?U_?w|+iC({ zhIz%{ZaY{^TNI2WmO8`G5*Wiu*qT*_OC|%dGIj_Je3Ouiax#Sc>>OzVcXX$sZ)!8X 
zQ=aSSE}<^06YBY270Nm7MchaDTA?iEs|4eqTafyhVRY;kwt~S+>*tBA_B@dxu}61? zUJ)Akvso$rVOp%(s&#t2Tj=3(Zy99r=W6nAv^UhcPJVrQ#Cq3Bi8)%e4tEJnd?Q#+ zTLiB>qZq+b+5Zt3M_UCmtz%{Tqk9#tP-7Wlm95J%G^XKu-^XAXT7>kvkn@9OsC|wD zM6Aay+KyRR%^J>LITm5R;A3NhJ%D=6n170yiylz)Tn)B&NUKq?z8)!a)kq%1%2Enj zs4*}OTs^3^=U^X8oe)>|-7?9r72Oy5SE*IClwMWOr&rabtEnDF4+)3B&Zm`h`0;(Z zx*O#C3|n#)BuCeQ$E!P$H_gzK+Z)|=b)TZnHTo>iuh6@C)@XhNFngs?g+v&!XZD5q zz_2S>zx1pz+w^f-G$qwk^6~Qs+XY{0f2K0FgDv8dc3@WKjydfYhrnEF;;mQrUEQ12 zN1PsDr`D{?crpmx?4;=d<0-x8M2}qECmeZnBvE1mUxeCr(YQVy&M>-%Q?S*QhAmGT zwp>^Dt((6QHT669)Mj%JNM+RXAD zdt?{MN-6D^j4gIt(Izj7&3c4mn3LlWo*;oPJd7=7=8wS3h_XBpjr#cG^nQw*)8y2V z^AtG^j{3CMQi|Cd5e@3N%)Ck@k zA4$4M6TgAey%Li|dh&!GX3!j|HyO-~L?egs9xhsptrK+2j z<-K?FpP1201iXxJ^ZF!BG(#dYep({sg8N(A#jZu;o0Ur)OP7{+h&>0zO^}y9_)hq) z_4JJPN0yR%-m*99ZqzL}7kb~?_vXIEvUqLx^3l8AL$gQUwUyj>E|Kq^cfR`K{N+XG zS6@tcOXo*LZ_}JH;jLOYCwg1vj8s7Vjrv5PZ=q+gU##g73wxGFVo#oan4>Fo%^A^| z(sD`xI;UrOo7k~mtll5@9+)#?MqlUOc>c!o3xP%F*M!BP?*^6&mxjc)U1H^~+n#p{ z|Mb#39{N#WL{r2Jm{5(aaxVZK{U60i-q61 zwCH*BxajVfvAk<6{x}r6ca22}Pc??e)3V~;D7rT;fmpq3*79y%(T(!NswLsw@J8E> zHVnGAd-f;+t4A!|xYQ$-Zd>v061}^Yg}C<^V%$~NL)StJ&&J)IAUK+Ssx&`sZ&JG_ zsok42_DX0((zqw((eELqmsQDTO`6ynWFV4`Y-VfHoME)G132jf6Wb3h6cV10*k;gK zkW`s0k{mu0wn-ZHFl;7i;DeBp2@SY$V98G*JcK`e_yag!(0-x(x<)We>jew1JEHl# zRxrHm77Qb)o06P~emSjt9|6Sys8m==Z&FR07Nb|Djim54;LUnv$&z2t%xZyS=75f2 zAYB=VRhuDXZxw8UUC0-lz%h&7PTih?`Dq$+`rfBzGSsipku*qL-ee9`$)NQUdLDWr z{zuO43d+$BC#h!|3ZG$Kh(C-aOC+%iri*ns1VsYUE`lZz7XUAQ9NHR&$q5e9E=eyb z1ToenOC^nbY98#FaMBNBd`0kv$0r!p&{)BLiyH7YIk(99RdRlfoNtq}M9z1}`7SxX zP7cGpzDK_O|Z^CKAG*-UPJqD+Gf?-Jf?^1Q-)Zls?rf&WHwC2G^jkP#tsS%rZ#PmC2mK_O; zCuXT!*s)U6E!K1|U5eLiT|RlIW>-wVD`wf1u#~P?szponLPy-PVbOWV(iGD-#Vk$t z^u`r^rKqn2N1Yx^wjNwC=O?S#%0}>GOC?Ofp*bVIGGIbJ`N@NaEx-#Ht@Vlb@}i5Huq1K?Bat8& z86r>d4m=bwxNCSSzliFiM~evX_LIF0$z#`9+s@aN>vGT{Fe zIsclRe?v};9J*6Nf<)5I<&tJB3qM`@f{~>50v$(T2&5&o!%+PrUD|Ly(r4{2(xuBI ztL;G_5F(|KD=C2uK7kxR#zcp=V(-V@%Pos(Xyaz@^xpgVuDL1jMm_Cf{zk}OI1A^^ 
z3nOCjM$y@^bVMYXhx^9-xovlh$7@f|K0P-zH+rl3+f6r{?%Fm(oeq5Z&A;&Mj5Z}f5-V<8JhT)QTlR^?`)*$n zix0&ehi7sL1FM<~UTcKZ!>r*!uEy?#+(oXDRPi7YG50h$&_%>h1yX71`9byXHGaSG zcJJ+}__kA`=@iL@*o#%0_jg8pu0h)K(t=!)bY;@EYQsxf=-`lofSr(zV(LxOrL?Wu zF$b{uq#p}>Y>WFACg}^7FIc}|1D2st!xZQwq~}%Ww2q|l11XH$4Av3Nov{pRIo&I! zs6hZVlc{5Q!Rx_BkA-a%v?Izkq>7NskD53=HcaCVgtI&V`Dh-O3x6JkAr_k;`7*r9 z5w(pd&083uUS>Ule#Dv;gxB;2Vn+O9`}nGLjTE_Z8L!%AtTVPj9cQ3%h|mIJk6Yj!BaidVgo;|Y0Q&|X9G&?kH)kBe;*RLjc{?7|p4Qyd^|1}2QOBndFeqT_Ayyt>l(aDI-M(pQ@tbyx2FA{Ffn8Ir z3aAT1B|D|c%M`-fc0egVFdE(kkCfblu?-iap#M6pdENa+>5Wn>P5t7tV#Bt0$#$`L zrv&yUc~6mugjDmY9@DYFdRzw2FY2yqW_6ifvEvtGLcQYeBf(7|qWq8Gk@iB`8pAZl zzGWByV(2;rX!qn24S1#&^3dUQMl{c5K7o-|A9#iPTJ5PpuL4P5Yfh*Pt@l>Z%sG}`%h@xv#{`Ac%{ zk@N4!*-y?pa8_>tO!3;Km>%hl6HdB9kUoZ(E&P&Da|HKm# zKA9|46q`NkJkU3doyG?UoaQIMGwqr@&x)x`G?m3o71O;5t9Qj(DOxKRba89V^!~rJ z?3~_r-{_ggp1&bc+6Y=jX-~{loG5QvDc>lTZ;Y3B#!TKsJGrsaX3^AgzodR~yI9g4 z)4LOmP1Aj`f(@d+F453DeLUu>74;k5v$8ek^Wlft;0SYhSXhY}_Yq+!wd) zhuE64V#Qe_I%^j8#GUmdXD7e+#+`K#wzC|W-gi%5I=@fUS0zlggsCWjHkcsnW@oN< z#Z)Dlsurr^rdo)F{by%huHk_j;yTmEB_P)B=j$;514JKP^f5( zKb#5;(@>|Dg|r_QQvK5X(myy18y(b0{=XwxH!Xx;f(MuoD2z2Lu1?X_8FzK9xORxH z9m}V0*Th{1rjOn?+U81EoYkVUdSMeFQZ&{l+%+KGRMuWQwRmu8|58t^Y5R2Fk6h*7 zJiSE~jW&&Kqhl|a=f^%7Kn$#ZcPu8;=WQ|OBlXk`F}Qo0sk zWDxYco~Q#ux)8p?5zWm|6L5wTA5u#4Cr7l*_)%FE%HOJRcbC zN;zi4C;KwLf>o|BY7=ZDHJKRM$LWD@5<-9neY1^hko-zlhRc1L*X*z559ADM26WG1 zRv8-DG@jiG((#fTK?Ewf5xz6y zz93Z$0u7~LT}mlEK?5-08ybE-2*n?|e6qrQ=fluxLs7tn!a*pyDWaeuL@}&Y64{Rx zcRuKw7-mX>q@zSBpcpnjFibjbtJ~=txPU@Y$~d%*h}6hZ%i~3(-Ip{<;~So8I!g+d zSp_g%uMN_u(^xZ0F>Oh^WbVlSJoGIwq*Ldlyj*ZFfV(rZuO$`R=6h07^$LUrg1#}b z{X~-yo;=^mW;iff<=e|9*hhzkFEk#(3{Rq8n4k^e#v+~;EIUS$`e5k!aI*NRW5{!>XEtoG<90+9@~iC~CJ8ew;d4x^-r4+chA0+U}l zqx?xj5DHQeWE@A@q(QRbmDCT12FFnh&R2=-@JDC`RGK2ev2Zd6noUU~>@Sf5V51}Z zLl?z>;?jp>P0 zQC+uEwM(o5S+ycztcn?1?=>`i+j!IXZR<^I!dkaz|1d|_XpWgSkYF~@OrL1-#j1AQ zHSK<{vg(b?H!i;++z{?b6p`2RXY*gzyE!ssyk6&l^4vf2x(|5Ff3z_N{y+8@ z$Su!3P?yL52`!k51g{2WQVrTr>=!C#1piQigRLT{LP^%*$Y9#oKpOc)Ho{0SznTw! 
z2@`xB_hdl>)Y*VDmwt7_I0FQ2*acjMk$F=zj$E8dxt!s3-DyY_!Y}!Zpk5e3?llXB zi#RoSoZHNvn3)lWEg(EfJ}XKyO`%dQhsj-=rYuA!pb(sbshR{6E5n*0G=l4rxdPy{ z6f8s@7OafOYzO_t#7v^(F)B=<;vD&5Z6m{hD4b?3phnG_C^ZOUgC^hiOYWki^b zm89crNDE$*G_H;IK85n%Z)23+_mT4i98fPd<459t2mIlSz9ryE{GkNb?%I{lB z=chzV+5m((ogKWPZ;bYemB0hSrc19DC8xym;x3W20I~*+R}8M@_=vogcX4sJdTTzgQ)fHnUpoS;VexL440C;u*Ho}G?F$iJD?;?WvC&*#2ORWymwnoPr zJJ$Hy6iLpX5>V}d2S*uH21}%~CT^h#)LJ0u4DGl-{!!7-|C)kjalpeHWx26 zKjrf;iq<9u84vPuQ}k;7k5N`qPYm+`B{_o77&cJfK;xAIB@YVrBKF-zNGM__eLWYn zVk351?5S+`Y6Q=_9(%EYGcp>gfz#732a*P7%G@&5c-C?aAZO&^R*F68bz{^R8bR4s z?C4LTnyM4`Skv1yoOozg;vE^{31DBj$3b7raXkehBLl#-25h`wA2z)_4@ zs;zY90KC{Pij&L873@HZ*od0B!c;iegPCxyNcq<2?vucb0o;?w%hNUsVoMkKGdnDS z4!;*p(j;q4Njxk9*!OA1*@g?k%bFRP5;o9DNCdDA5!~IOAQ@vdE6fLyKV>?QC$19T z00*GYN~QYWFNA{?!UEE*dLIicC5DM_XG&)QIziBwSv6$OmcLCw>?p7>7*?RQyC`ZC zWlxigq|<^HmV|Ho7``x;EL&o<%noVPdvm;khIn@_t!G zBK5yK;VJ&WVsM#f2orL+XKeS4);aSXqj&zrSY%VP=|Bms%#X zBHfrCNh*UqC77>VI}`==6zqjqZfg zhs9kjfoi#?pmf$aqn+uP2>`GQJfsfo!YS)6fBp1==dH3g%YL~czWMOo@*`r=(U}8S zy#(S9b2RRTMJ=?&153rrRo^LH4&FZd&e3?$)6f*pfV?Elma>DBEM#UqWZsV)Y8wK9 zDKwO_Zkx2g-U{Op;exb^6ih3Ht&S0Rln1#Q#zRLJt}i+AaS7Z;&(ndXS%aIP%Seb2 z1{5V+NIQs272!a~QRA7Lx?~Os@)$3|L^nLx@_&Hvl*C?md@wQ=cp;e+o*ctIPb-)| zN-%(L4r}=1;gXNQ5)jZnhXsc+)jLnnNC5PCH^%0tRFYE{8R7~jZq^XOR(_` zIteUpvtAEQh6ea&5Py=oOq!6MI< z4zHh-^rh{$rHr7B4ToVTYTyD;K8A$-HR|M_Mh%T-#!lzoM?f++I5rWvB8f&(3TB0g zOfE6oOANr-(nuEKvXnkS{J{_z#pW8V5;vV~6Aa&@y@Ky2hn*bkP+)t?cc^;WQ(%u? 
zxo`SeMajJMq6(9!3EOE>j&yLCrOGK8PwW2~T6W+IVA%oRvzDw_%0)~0d?0S|EgWB| z?G|fE7_)XOF4-~tQ?cz&#o=1E-nSG>DdU#Pm-kKgPG5Sjp!oXsYugj1yqT`AOkR&( zi!MI5(h55oy`bS08m5muG-w<}ue~_?B9bIb)XsP|4En`@w>ta!dSR?`!+ z^u+W%(*EMXR~`mncpfM-*b|I#!NjFkZ8hn$Xb~fatwu{{QuZSWx`4~ARTt|~SQ9ru zs5*_WWy-;|GtK1|bZS@}m*$yIAsaS^G(k~b4!BTSY00enei*Lf$%g-Eu!Ozj9u7 zlAENH9EZ|LOdGo`-prBI$tVEiT4s5WT{zG1Pa>$%3Ry%#*^@^3fY(1i9K4*&n+%07 zO$LLIMPXMu+6355Eos2Agb_FX3{^n_HPWW=St`wc6EOcV4$+*XnWVE*jb%s%Z4K(T!2Ha5XJE|psP{!$^gl!%w&;sDtazG4Pt#&2 z>`#$}Dw}u3RxR497xg_goP45k}OJTQIHra z3&dwp@R5-8+QP`unqtUCAJ#3fQM^iz8mh#1Wo~0?2fn;kc^^p767i%|0KtWkOWY7e zD9o&D8MB4Bq8`UDlp(N>q*{i%phivbFtbZwFG}wQa*K)HC$Ez<(xaE#L`olAE+bJD zXC$Z9Oq_s{s95jOJlJH1WDd*eM7_>TeX>uk18MD%^u7Q&9!TzdJuN?k7UVs?)gU^V zKuXJtT5wA>jTFh{XMQt1WcwtDXCrDN2iVWJw1C41VWU|OhR9mW?5fYBfdLRH%B?qNP)25gw5_rsdAg2!ECO*$%N$>bpcIUS% z@>OSL!mC0+B45Z?6Mo>V#Dd8vU?Y{8KE0~ z)`QV=rOPh}m3`dO8CqKqbwM1;4Q2cSb-3j0xANs$*qK}a@~h)w4^E?8IR)yA1k00|*TR!rGF2^1}U zc!)kbCSlKDI7-&dB;z|~tTi!f`+NCS32#09cNQgzYZA5F6XmT5PgSC1L!!FpVS(9UexNZk)0H3nqGT+X zyK=|qyXSOYFTYkkA6=@x>)bqRNaPjGm)yy#X0pF^qO&gUY?w8?XD^PGZe4C!u85T! zirWvzOo#7F;t=z_U$(=JsIekuB)gHNRWtbsrzhs^Sn+Nbz1x?2&x4K?C zI(u|JFIKZ@=~UdhWyQKvwC;pm##1rt&bal^J5JGhbj5l?w4R8ac`9Z-5x1U$rg&Zv z$bYti?7UC_-f`O%v+jsn_pMluA}grd(0a)rdXA#&=4E$tB5W&!38$e1DafKkJ^$ zgQWH1Q{Vo~&Ce{m-uB0ycv5VAGM;zxukswTo;h%5<9U@2jGEkhwAfXg5~c2c%m1c7 zzM(r-+x>&w<-y;v#hnL8oreZ2g^Uu9b?N+V@7;o9@0=71PRy9@d+Js^tzbP988hzL zHfw#as_rfCn_ieJov&Tkvp9vzX0;ha{~7Dt1<2VvmfU6}X0Wja$ZV_*wp_O}Huq}m zb<4N6-`xJa$=`bMJ1@qY_r@%HWBR=edj+lALDD|TT>>PBFs)Q#y(b+h+?9+y``I1D z1DPq>joP)9Z6aX{XQ)b(KbwebR z>nB2^SJI~Nefyt0wkIW$+RG26K1xA-VYfZ@Ie{}5f@lp}JBp@Dg;C1vl_q^?LD>yj zf0^|1a?4tWQZ~6Fh=Qia3ybv8+^ZJr>b&wWuyvftsicHU8B!rW`kY~E5G1i)#W+y! 
z(=ADL3d155dsD~%HP(hi>Lj)Y!>T~nX9N6=NHZ+v9ts20j1;*xKQk{JnBAGA8$pKb*1n3J6Ny)Z7e4wJTMyb)50?ff^qgPKG1k!=x9*`cxB8(58ln&bFcrp(m5G8CPQtlt*5ZpYI3(3!UZ7-H3oW$xJrTQo4-Z;e@MWBS?$8#HD&L-H9OtU&)~4EHkC zxIRHOWr}JZ8lh{ReQS_-88BipI+@g6?!aGX(wzCYbFvH~`lDAQk~5#O7UPGwJo`iu zz8f(xfsZ%iB1WHQPP&Fh$Ik=LRBC+7)@KOcWP&X~L?P%;axKi43t<J8cKe3Qk8-sDc91MMX6DDB< zx8Qj@h4wH-<1;|CQx|gS?86~}?`7RAmaf0V&*5*9li{{~i(YB{tD01~MG-d$+JBXN zzedit$yp-jJLG(qoL?vBd*qOI0{jNEh59%nB(KrWl8*cJsf3+=B$| z&!Rxwbh|VVS|wFcGy4x&0SEEfNNu#1i4Gvj&HlqvB5RZ)4gN#N6#pfELBf=e{~vDh zS`800-jsahfO2+dX4vesz@wZUHR^W;Nl%Fh0LdVJ2`m^9(^NX_fBT~ig;#Yb{DT4L z*8Cod_&SXT(Hy1maIRQNMMx$ek28ZYY*nyvo67BM>uKA_{~t&Sc^_~InA#-sFcyGy zY*MoIf)6_53n)|p0~AP@28P0`2E&Uo*kBxhM>!Z7Q2mI2ni^tRngZW6mq!e1aX)Q zlIp_Z|20x?z@?fgDotzdS$v|UX`veq>@P`(Ub6;edgQxz*pQ(|s7pA}m7I-> z)DSH%W_6)7E#b6!^)gYXI1;x~} zo;G9{mn6BZwtYhn$ij!z*KREFV>-qU|zliZ5CN;a6aMpuj z#Hhdn@{}N^PMKQrBgTuE2Gy8S#5AeJ&<@;K-gf||Q~n*|c{Q|>whf-)mTHJ&23;hW zfEL3SSC^APF*AtjnpORe#b6~Q%FUAyf`lTIwj@szz?1z01v;GR6)j+%_9mWoM70Au zSwJ7W!JP%v7Kn04pS)2Z)T`0xk2cZ_i_Z-AU8?#o$oaSA5b6-7X0JZ#r4!z^YBT5G zMTX6^kADWo3a*+Jj382a&$9_s|ANYuf)=r$W%}3$E=?YN@@⪻f{5~JzLFP+lITa z7To%d>zz|~8=t(NUmDA=kD2P>Ci0sXC&m1(n5iq4-*qqFN1Uj9C;bNls;E3sSW3*O z$_{dis#aj&tElO5p2qzT%kzn6m7iz+0DQ2G?^=uRS#h(q&*|n4&iBU}_unQ4ht14L zMCpdy%io-^m!_(Iv?A7+hxsNd*#vG_p~lg5H@|7IC!W9QZ#5+j^Sj>q2RZotQNHs* zE`A};>u^0VlFy{^);-80pIPIscwix)Ra0F3z(zj1rs;t8K|Z_>97woQ(s`$(^MRAz zU7FJR6>rxaZ`XqYdN0(JHLR3&-zn{WP(<(U%)5u)i!d1{3S5z2m~wp5E&+@AdTFkV)T2 z?@gKaW_oYYlx$cj-gu{Yysk<+{AnD_e9shrarHbR<2xVeZ z7}618Z24iUkRLV+j<88^h7E!%tQQLKIhjIO=mDrpPlg2dd2NPlB_I51!p_kWNQ7u6 zwNbB7!gmN>PDjK$Xd6N)DuX6K|3sre14`6bJ#sDb5xGy39TF`K={Q}qRLD_d0m-RB z^i!flsh-KPY0{!n{z;WpBU}Ei3J0CT|b^2xS@8bJvu(4t?`Uf049S)CrH%6>HmrpR=R z(ip{_ z2<8f1U^(gU(&ddLeE(0{-l8TlRhTn00Uc+me@~xhm@A$ePfekSqw(T#xdeUljDs=p`YDzeJ{FKKs~FT8Wl}U1kU% zNE`zC$onq>&%{_VsR_yeWmKXIz}hxGBo7G{$Zp7f+^WNp(ux_O6p1uR-I=~dBk8YA zOh%H1)Q2Rjl8shBGf|mzOS*A>x}eLviJ^%}{;yFt|0`6R0op21o8#Z2kiUeJ%nM&0 
z9vYn-4-8KgE5&d#Ov!CNwEJ%;t;G42ss`zU!uSW|d_b|TM{Cd2%zua^;7PB}!;?gl zl!)S48`=E`1zFiKAvhF=(@Pgr;*(_kr+$BaP2gme7C2}$*c=u(m(j%SUelE5M6k0e;6=`NzA zmpi)Y^A?h@E@;TlLqrFBWIrXtCq^LIi_2#E&6%FpbPy{g1u%S&n*`?)TG33LIhivs zF_~1YW{iIh9c*k}n_~7qn2g1y|G%(7CjF}5MA9-2(J~O-X(!@;NCQeMimCM{oy@-U zDxP>=YH1Eb1e+~Ncr3CVbnLhM|4s1mU7BQBnvge4*&t)jOV-Zfz3VUb-6y#sdW3i@V zak!R-1T4;No!vTj>gRXP_ujFTL+&UUBcj_P%#e61>t1=~$ELn3-xC+@9=&QcmG<}qj0?S1!Y1mqQ)BAs9 zw88qw8wYP3TPUo|)O($! z^Ph=18WWz{6;HG1X--%5(5Nr8KhWrHb_j=+)GVA8i<@R_(Bq|=n_#br)m-y1HwSfE za#z>M3>myvT(MHzE*7^F-P42rwF##iv`(1iD>0)em&Q}R;_ejPoe38t&x#Typb)l* zB`t}XO$kqF!daXs@PIDqG2=5&u!apgWp^x<_q;wyM|kmK+`Db|C;?#i9b1)J`1U)t zIu_oE@3k#$k9&8|9(~VM7OU)8HZKXW@_ljF{+M+?#Cb9;DSxCTP#;jV1mB>j+^|xy zO|00KaFwsPszq1z!nVc9rSoyu)19-J-V}p9@QPO47#G zX(CITsqsZ+_zKuVXC9>|TJDBL9SpI}98KgG%@@A<5^O}SI9fzUOTxQh1qRx@O^e%> zCLtB+?M?Vv5~WbtsQtUVT$qluYwBB9>h_3rd(@Mb)BSI6Bu6J-sU)f#)uS_usWe5#}H^Diaxt7E2W$YYw^ z65BjQGQnP~A)9~_VRjTALa^7jy&-5=21rgp zmPhV|2snMDOlk-ad`A>*>CldLktNfOHX+bDC738M{^5G)b zaaa%*z7;epv@-k2a3nY#>4f6Osg^3OOH9il{HHMLk|WcbK(V*J92Tsc9*bh#TGlHc ztpH-gYPpVDlzagzXFv@b*Xzf+H82fL7+6JbSv>=*>&ftCAR=I0O&R>Ob+h`KQJ;Ce zlIY6|>-7X}Q*v2y)MieeC2(z2=2ebm?IKe>RyhQ-s%q&;j%QpOoBYPOHYw__62iE& ziFRotWx4VdxzufPtd~pG=uU*VbL>$Qgiyp1gF5R`#Ht84tm4|Jj3VcFI~59Uo$C6L zd(7<2d84J^?&#F!BeNQ#<>33-X`@aO0^cXHZrLg;_K~tx(xB`E>y}M&G+EChWviq?*~Jke1h5A}R;p`e z_35FQt2Q}j<~Jkd4SK$HPW!R}HYn4azY!Hqjv`itPo;=e;k;cSWOh?zt6V<>_8jVc zNW+>IwlN)w96D$TsP;OYLx&vB&^K1~LxnTP#=9r7Q`r^>r>T~m;YLKI_GEIQJs%;L z)TXCbwP}}JbLLmIO~s$6P3z~9+GKwm<`Ll;1_WNcjZHkv(BJXq2R`827LW|Jq)kNEbHnkH*-f@|-Ry)qE_7rC~2i%D!T#?#^ z%8O_X-zGF8W*>Zw0`UafQD&XcK)&_PE@`&|->^QFub%DZ?Q-9uwcvJ3t=tH;Gs)Kx z*_$4bHfX9;gR}3xRU)5Jti7GlO_BZSR2#uL^AciAA3T*erBIJ-Pz?-9~oe(K(6w1kbI~w^K4A7!Lzi#^9zQa#${F{R(@UNwERADM)qZXKf-v(7*<1Q8l^ua=Y6E~r{$Q;@5d_rBuej>^Rm*nD)wz+ z;$=OGo_#he3a=l%?liPFgz^<9OC|dXoA2vs3saCB%RqhuG_=$pzbCpy0na^DBrQpTz^Ous5h#6YNM;Z2qDw7bOdC!)IbJwZg%YhkDeEkz16s*I&d#@a~1)k-0|ZA02^#F~K=QJzyG{I~+98LFSv?axN@pN-m|UCOap8-qFXvr}i^9l5fP+JGlvw^o=+};a%t( 
zP5~ZXh4x1tI4#8mm!HA?3|DyO$bvI+c`x{ho$FVoUZD=-{Zmeu59^tE{kvA&?N6yE zPG`cB?=HB!5AvO|>S*(-l}RxN}MK{YX?t&iagg2l2* zAxMMGisz=nP?B}zMnSYNGUr@u?A9B*Zy#RVeNx|j5|=j0paQuVEu=88XbI_-(0jx0 zkKGu%V`;rxTzVbalBXQ#B8%2?-CF)V^9RnG&O6pNI$gYWaa|{&f^5tDtBYOx^{)NV zEeE2d2Nzt2qJ>o8TnY&-O})*97fPTqIk05$>y}Ek)6%<8wcu#d9Zdpg*B$LCz_28{ zDA?nT%xJSuP&f1ry=}*Wr)P2}#A=hhcRdxWIZXAUr@r^+fIK_q@EBP2IdFoZ+;|Q^+03as_J+AV^F80+kLFm@746zP z|5mhmAHMgON&C?rZdp|gg=2^7>)fV2(Jhvc(lLgtnKMT~(_AdmsOzvGOsho|> zhodD=PvU$+<6PucI9k4s#}k@%#$1&#e>wHzRd(Qw-q;NF7xmG?vYF=|6e&B@<0^D# z%yiAR>yTr2;))T*vmGk@bVI5=(;ThuropGm7_#SZiFY98@S|KYM>s?426Xc^#vE|+ z^7<40a4MjziGLLvn7S|Yu&o%^k8qnhA?bIx7airg1C|Ci#_G2%)<31!KXpsH-LX)= z=il>G=RS2QDf?f?=fjjwo(A}WKC5&Gdv9)>(kI%TtK>a#a-h2AgZi8G(WdUjrXIbi z=XUk&frX~th3fsVH;!WmkLRiTjeAkH4nJ7bzuV6n#Z zw&PU(uz4tNxM0|_+Rp6i)H5c|kk7#A)!QlA(>*VDw4TD308&A2<^gWiiSK$ z%QNI1_O7<4Kjd43?q7o*Sc4v1gI>G_y=2%g%3HdIyDV#HUDvX8UCY;S4X;68zt$_j zM}5i^U0@zxT0GbcOKV9CeDrRxq!2*6XV)XFfAQKt4lpxu=9VV7B6A5DCIPT zdlbI|2}xSz9?UK$4565yH~u`LjKkpV$k@n4e?PR7=R?cg8IeTX+TTAYF7Edy}z8K8PQ+qOoIY_m}DJ8>sWsab(DT|H)wfvt7>qQ^ZZow_<@HPt~GXF9FXo|pF zKC2zeUhVqcuOEV%uoAXl;-*($iI*gn-+OwXWcLkmsAR7=8XPY?Hi6TPIO+j?{0EWK zaa4U$F@-n1R;S*3oFpEo+x42;WIWX-=PCh+}Q}a0N%XJ)1i(HIkXY&ycpCFejXf??tkV%H^jx5u7-BN-awl+p_caE4=?D8OmJ~C?0XwFSr|{g^jYMb44!~|6KP= z9m(Qaw;ctDdlN-Xtd@CFyRUr^k)QOg=#$3o*%j~?iVWGvwRLl@onrQFJO~9FNrS^38So}? zJbiV=ghi~Ds+}ehKRG%yl1egs)O&d0}0JoyI2Q51RA%e z^3>6(eUoHv7!78gU!as;bcNr8b7o@{>ieXU5X&Vuq*`##eO4RJ*6_*Jkn^nJGqVjP zLagD_mJ=b+HGHljon6D{Dl%fx-c~PvW=*c2SN7$cXN?^AbG!jv!zag&oM#Q6+`Pu3KGos1)l=xOVio;M5SfERt)V~8OBBL@x}GLCO-tzwPL zd?yxxN(%h7#3Bu`NK-8GwOFKPF|tvQY+Q^ysYjm7K%dehPi3G_>yf9|Msqyx)+4*A z`+`ZN;@_r6wylkJ>5;As*Bo^Hu-$s3J0t89dgO^m(7J~mCDsNp05%nKrU%{^A@W|PhCVIw(j*8nH+H-XF!(-ep=dZ_k4`}>w@@CQtbQdQ;3yy<+PJ`y& z2~8VdgwDhSbZ~fd6q8d`I|Qnjg6)tq(R~ce*9T{$2w|tmZCDA}6t~x~6M{+FO2bLWrIbeuH=!b! 
zsa-7Z)XO{PhxGF8hKEENqMbZHeQc&?`WsVk&6;Ni=Q?L!y6K(|%#YvNHGkpbNYuSU zxAv^~=)0mBMTK&E$2Esx2Fwkn@bydAE*TbLg^JUA_0ZHI!%7VEO^MlwVH2|a`slS$ z!%57g;O+#}{Tgm!MM|K=@DTDc-#$WqB@nuP=-Q!KT=-}(0we`tB=>svweHzk-B)iE zlT?z3XV$NK>x@#8LQ1gg`sr(@jWS~8iD-;4v5K53h>%#RG_-tK1xS9j$=|pOR~Ru# zOWx$mTK=RaN=MOhGe5zreR3bw1(sIvkv<-TL6aYKJjsW;swwdj`fR{&ytEk1NU2@9k6|D3?^@-iIjoSxM^%G z4jY0=T-$)S<5poVcX;fLaVZJyd>r$+G_OlG_d|6j_#AfuA09<39EMUmj8z5F-Y$iu z5a80Vukex)L^hjzCZ=&kCGD}gMdJMfBeaQvtCG_0qFqp-jQuQZvtbiPi8rs4$rpuz z(vh(@2DA~}l@``k;w|e0Nae9z$`hqBEHT`30TbW&cwrOT{7_bn(?X#jE57Nm!lqOM zvZm6?x8mC#D>TCN!`9i^v7{zJURUG{GQQ=p!sCKA1liC~%gcuh%U&D*+GB;ykXL3b z7&RYbSdU5RIzK-#GCHnta;i}^pT^0jkbSRP`oxtFu_w_kqG*>_Ib?L4mwuJ-UHvjV zXNK|Z>(sz1=_ihJ$Ot&yjhC(#fpq8cDDD#OQ1Uv#eUfo$TV7ZdZcmJc(z7^4z{zgh zFG2`vd#cfBRM{2}W;jVboHB4F9JMO?C=x#U2XU?a+^g(e@p4{BhLl`uhq0t#7;hdN zPK_5(oSPyh4n5EopZ4DzhGaevme+m6W$EDy6IwNss!1mWU79Gzi|26~LtKy6UgTg_ z#0c_U9+}X(>D7u(H$H(?zoz{z!+*%v2tA5vl$avr!xF%R#?O|#i2VyhAx=eLy^GaSFIa#xIJSjD6Nwen|5(N7ioPZjGurS_f@{F&nTT!}=L$j_CM&y^kjtn9d_)Z9~=@qTDBsirRr z@>LUr`+#hDcopLHun-Op3zDxd3(Tr%MZxQ#LxJq+o^tG-a{QjM>z>l`xzcz~+4{M1 z;+|6RxzZL@+CEoy-&2}Cv-+pMv0$yZT!2ZPsqmf9mC&-ONZo*$s>xvxf^IiSH%!BM zKyl5<_Rgg%myA+@hLUKRKw$_Tl2uOlc0ob`@4HpS;}bK-J(AF?D!zc21n!pvw44Ot zpd=hh5e`cNOaLc*JR=FustSbcr>9OEMs+5GpjP#f!99J>Fx!OI|ZJdyVuczWT zDGA?Du`w#9tiiBctUv|kkjlqx|u zC<&Q`5I#_dM5&HQA1FkEa7+?Vhy>w;B%lz9NKZ-v3Xv%HbCQ5UB+C7~B%lz9IKC+f zFQf?llJG56@dr7zCVl@>E;Q;t-7`kH}I2F%v<1D|8Asi$Dt%myax+*{wS(4XdEpvS@Zeb13$rtF2S5 zGk)C`Hk^W%Tvy2TfLqXtc!U92G&};um1(&U@d?zgzzP}HGXa4Hm2ma@hi)7)iUnE% z0lHBtzz~;NWdbZ$oStdl^x$;KRG$$RqzYDUK4ZT|!G@fo)}jaPCd^R>M@I z7R0bT%){>z`rez@-ZadBxkw8M%tEYC@%RlZAsaiS^_tbN6LXO6B;->3#fF9G`5w^1}3xZVugdg3O8t7$$nyVd_zlbsEo0wv0SMSps*>0zz`A1dD{R zRxU#0W*8&|*e+q51__`s3%ZLeaZ4Di@S_C_x>xZ9S9}ugH5G#G!BQiSzYWducU!mo z-L-dxbg1d9Yg%^iRlzIPDgjXm1dy#lyI~O^&d%5ks{m~Z+LB#>j$}1C1?tLbr6N|e zlyv>9c!W1>wIESv7-_nE)4OJT)4fyYj6C2=8Mqm^;Dccy#LW!B5i2p=6t^38La3IE zp20}bOf!;nld?lCQtM;VkdToFcqsv!0V5F{Bx=3UYFLQjc$?TvFs#I2G%sk_39&&= 
zZs7-G#utTR4Rd(KNqQrUuP^7BRj8aTG5EV=c1{jXZ4(qbb_FeX;q7-h2e-1aYX$WWw%DxPr$RN~&Tt^|8*a%f2J3 zx;vc%fu*t_utaGkEXyll8+(Oa!g3IjFvjwr;g)a_YTxik*qbQJ+-^O*(U1uvLn*e2 zI*QusbaVavfC#c;XP#{WemG=O)B~)MWMQW#qIFvq>kjF4hkl|SMs1UAy{hU*vlZWs zJR(rGB4#sYA{q55&_ZPI{q7sx299!wF(-?Mf|eZU?mLNc;OJ8glW@ZHJ)<_4eJyHT zij{y<)d)CIiG*Yo60#+kYbS<{wvgc@B!>?|MXX8>AulVEDXdXrD*?K_gvnbt*3=#= zt%!vpv9e13Ro2ETYGM_YvHIp%Z6p7x>i8#NylWcpx7@KA<5052173oX7{?U$3jrhh zI2uFDLJWsR{N)s zp%(gYj5?C)vq%jjHD(2DBB?ox)Iw5g)|l2tQhU}sX9Gzavq-F69Ph&Czmw3#--J;i zwI#(Uz$r!n&R`U=G^2>68AVJoijZU!A;~C0l2L>tqXIj zn5%IaGmx6utv|81(A=yYAI7UAQzMJ6jk;@N)Vb-Q$EKQ)@2}n&J2N^X;1rJmXYiO< zn#aV_JSHZ2Oi1#WkmNBT$zwv2$Al!038i^VO!AnJV%2^tb)S5Liwvm*>GLkkX`(OvKoc|f9&`hNAzol z^i79XNJRAoLa|^uKd18JxuG*wUKQKCZMp1u6$|6!w~|PT>qIixF3>dJ1)65OKpl$9 zYd8fcKl}nL;>Xh?Krb2U6JT=DD?fh$K?~yM#vCS{twyoHOO#;9C>3BxY2Rds%%U)f z5iLxY=`MnnGsFoR#<=3^<=4uww(!&%ZGx85 zV+1W{loPZ~S=$hT6UOFbW7s0RTq+1+jg5s&9)6cF`k3(xrX zsp0aqtvW z*zgI^&!IISK-n+^Si)hpRDhvm>B|IK&beqttu||tY#pyjj&L2J#5uT9X}vnHh*1TCjM2->Es!B;RKqdm~egVzd=s%pdZ&P9LM z9e>vYg_r?p!x8lawsfQz6HdAnFK67;2=**nsdO{vKl z8qUtn?(ELa{C3uVOC+KMJ}vNDbtgf{m)K~2L66z_0GKVJ5Jk|4E))d;d7tLf{Y5|L z{aTY-wY%Ys^$9xjIUNHN0OAuX!MiZMN2jPrJvme9M4-2(9u8luSuf@`gq z@{tQf3125lgmyphd(s1|{(ST+Y~%&ITP~RvlgpM)ttCSNN%(bH)8vvy3($zERn?L- zV{wtPywC2QubWhP%Q*kKQLU(pc5k!&v$|ZhTp_59G{tXIrq?Zcex*j4W&WZHKiIaj zEd@qvGNZ3nm3g%)Yw8*Wqdj~!;NRkFdQCQI*D8B3QVt@t}KMlRx-9Fgm-%UL)Qvx)i1mQ0Pe_b#x{k%>k3hoR%@H&byaBRHd zQ=-=c#RNK8iLG@J0+KjLl1c(3KV$_FnyQ49ZWw#u2d+Irdpz3#CE2#ZUYdq=b=*s9 zt*JKs4A;j!9cYvGwWK|5(ta9@kp2EClRs$py|B&~p7*(u-b0>&bLyjUr62wVfaktf z4z{=~aD#h{b#-&}pf_rWW|d(&sEq8ZFM_2}&>Vx`@qINX_U%2iZ!ex#2S3o<9`cUb z72iQ{C6C4_=ONUiCW`pMI$mQT`IO)bsV zRsqnevsX1upVbY8Y7$+c<+^3CSxsK0%$%(tm`PyvDJQuwD3t(nW@@YK0GL`EeBMhl z&9!Gi!5jke-3Iw*II-Ea9Ul05INKOMd?WC?aO3Q`8-ag>hju`L*g+6RaDucTqJ>(1PE!VmTFY(IIyyAN{Y2wqva6J>U{|LG5Fg- zuxme}4+NG3(S5=0Eg06DdQGD`1vAQd#tfG4V!hDI`jHGE!Kh)lD>ewk?x8DiJxJH6 zy0~OXvSl&T1cU?s^N(dvsk_W5uGrNw*G4-yY`4k?3^Vb%+#uhiNpkFAc$D?wR3S#K 
zI;)nwD-1;L#{)0`{1#ai@;><`P>6bq5Z~crOz|?SQ&Fvm){<(9Je1Tm8679O$11{s zo8qLm8M7>FR60H>)?izdRa?_l3?_o#r!87FfpyhIu5OA)=*;5GF>(IwIq|B@R0vly z4^3G!sc2YBlwDIHbfT@vF5_A}^X8IK*A%e?npY`f{N%;bs%X|rJhowR){<=DGJ&D3 zIrdtys8&sj%8F>bU2h(TRmEm29Am#I{9>kO&VtTF<20?M4@@F&$RRWsQU7{rl=_26)y zn-?fGadOf15m)Qzi+zlpz)5mIHpmy@m;aTVx;OX9sr#osz5Mx!i`yqIewMuSFnq}k z|J1V!Ew|7U#0e}c;M&%OuV$PXZu3Tqcdy>40}U@a|M zqFk#nqXv;}QE`##x~MiW!%ZcGMu@m7kGN--oA zJAQvM>JWb<%ALIHazjV(Alw5t8De`g8u-%NqQTC>ts_++4fr*(>-Jik?y!fw)$v4| zfEaEA#9bK!I__dI5C8}Xhjum&Ej9t9J(+tfqqYnetfH17sriMmcBsRWo>yDInef4~ zK^`T0U6-{Y4aE3ymwBrY?)wV!ofnaSo4#g1=`YcmR$3D~*x#k`HcY+KG@bq)rrC=y zVh`Jp+r~BUKuwd&w4?1^8kcb{_4KB9mF%8H%_zy5^t8jYqi!t#KvlE*%aDnNE}><$x(KmV zExR|rSvFWxVrTfu6bV6^3*iB0FuNa$22i6=Krsfn3^$)tQf1T5K~ih0d{)?xcmF{r zIHzveQ5daW<=Jh=Xcgan{JiXxw=&I;73r!yg-!IiNZ3W3G7|Ko~!DMG=keo&G6C_xUvR9CtL-Hz;*O0u9f--#0D}{NeYE`WcI)(fQI|f2KDnS*h)WPkN9hVSBG#5A--gUePWtedof~>q` z`>Itt^cEjs`=MsEgFL^Ph&q9iVQB0S4vZk-7h4#)X(U*@F?>PWVZ?cOm#?xPp==Jx zN$dz+=7kT#{I>n@-vf5A0wDlqtU8pB3<1JUc=dRh!WAmz1rGTffX;FEu{h3$1S?AO zF94h0T_=vuAAD7~U3fz9*NKu)+VMq#V~>*Qn=6|uH{aiU-w8t736Z{oP8V`vdu{e{ex~IDO%-6Wh~;9k90m_M*Y) zqwJg$gkLa`%CwAjp(ISA`JD)I1AIzVd z8)FlVshP%$CmT6vPBl)RX-pq)WW`2qwDH0mHcy=X`e^?%E`>JOh#$ClcJr*$g{?3d z7}{FDweCc)6(y-&Cx%>{^ktm{a@}Nb#7QETBAJ2P6Spop2e8#c28TZi+?jHEv6Uu6 zqLV?ckASZhZY_K?x1FAJ`cX7M#-{GQad*Tyh^;K?&pLz1<%oF58A5ItHuiDuZtl~- z_UN25f+CSj9(6{M8w0z}IC2xXszb;fCWGRK$L<`vm)_1!JCi6nvTKe!il)H8hwFFN z?&1i_`Qa4z>t}=3@;a|F`nR<@B0pKe-q7MA?H1{8N!&0MImnoJbig(f$uUG z5yNL)n0lE}kRhI%h{TRx5QHaTB8+@NCcY%`f0OZVeSX3BZ5YUtuut%P4Vjhs! 
W!yBP*LcbJ*v4;nbJ|W0(NB<8e(N)O+ literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/fused_moe_modular_method.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/fused_moe_modular_method.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf726340969595fad50a2c165feb4216dded8171 GIT binary patch literal 6844 zcmcIJS!^50mDRjYk)jStBqdRoCEC_u$(Ci0XM9C=e2n6Wvx|X{MT26OWXjDW)y=Ub zv=Q&Xz}i?WCi+;&#$sX5S3K(hGN1FA|NM~SS+LRY0$ybIBY!eZ`P*Q<9`^LOEJfIyL5|Cq1$67qX&*u~pGcK;2K9U>7)kV!#U5d_2?vSY;o zwNrK$Tq`b4bII<4XT`&Dx9lzWR(u@y$o@iLCBSj794v%ZLWS^3SRf8UU9{tcV6C(g zfwpHO-B`D!l6*FggIp$({{fK#G(bHsTo%)d2JcGerXv&l1GX}P;k%fX@YXOk7t0~(c4M{$jC(&jdK4#4!1zPw!IDZ7F!4_(?Ne#77JEibD?68BzqzL>+ z;lH!}j$rSx2dAf{J=&n12&>1SrUJNHZB&pNgG~zw@jI; zsLifNPL2GNn-ERdXn3Ax7a+Ee60jd zZ1f$Zz}X!ZO(2^j<6JgkQeHvVD_Rm33)%8kDZci%_5unS`N@t*iGjcD15d*YfM z^*WSW5C^5k#VE^$hf%e{in*rN!xWf(a9g_p2v}Mp6F>`7TWv+y0ING#(1vWi)10;i z*p!>(3t>6qWFZ*F&KIv ziPs-sgD@`FJT)}Xj<`gw0*Wt8Sj5pNwfp3Cq^lkou0@9Hk-1uAt{z#c zMV4MhE?0e*e|TtIkRKXzp?5j7eK3b*@8LanP?lF&2Dj64m?O~M8f|}=eIw5R<8C*A z#yQ4nkumEWRo~qI2u4!xrH#v*o7=k$jXi7$m-c?yvyhHQrw^R{* zy`o@X1DA}&GR=-0qqt*>OW7K0f3^F$egv~9=y$+eV zPhhAj8OPp{6r9W6Wdef6Uh2Ouu5Hb>-|?GA`~)UeFu>g>riXw(kDzW=ePh;D3wMn$ zgXjvk2ca*oDux@ZRJ?5j87-Dr!FrHH;TXi#hj%#O0L9giqXS&CW%T8wt;Nf9K=Z@4k`z1R0P1l zKZC-FtNE;Aq##?6H~Fk^o`xzwC)h_tH3Bd?|4Ejw6Qo5e*#N6IqK%!&rK}`=lBai! 
zILZPRg|mv~Tbl~UAtNz*><$QCGEiX)m7UagZYfgf+xFPIby3u&?Yj;rKTLF;PN-S=m(tv2JGyaF{PlCJ$wBEl@s!4klEumt-lOTgR0 zzidRdC@b0NCRhZ+do`l1RE(utiKQFYFQoHIx>!=vjp#z9^je9f4PiRdZG>$X45=(t z*n2Py`v3t(AND?ivk1;1xQGB<7h6Jb34jr@gEy|nt|0vaf~yFwA-Immv z%{N?u;lDH2KpBQcp7$Gpypk^}YPOi8;QJ~Nb~4F5$IMP6BNz3al)2FNM9t621=&XE zj0=IwgbVzDteawHe}fDCEgmM;sv3+c6~dRI#15Ni45!UiE4bl z8l4BI$EU0D)79u{fO`CBHNIGd2SR-Dzj`xz_pBaI*5gNO@uT(lLM^^fkDsl@&sKZS z{Z=rYLeE8E*XfMMOycZ}!82mvz50nOwG&tLL{d*o=mS%FEUx#B>hWR3$KKQX$Munu z92uzh&(`{9_w_>8&@nxA1enbXS-2|flAeV?%{TWj^s8Id^qe_OV!idw(OTyyt`LDJ zI8=)a0rJ@|AN>+1?-|g$`|90?YTbu$QJ5{Wqz{eOhZepWTG&>wPLdgg{0=Db4Co>0$r^9pz?jH-!mVSNh^J`y?)y5Y7NRWVBjXs#$Lgyc=*g4WXqbIuWI`Vv*C&qXBV+pEX?^~ro`U9K{p5T4)SRA3>#1RV z_Bb|Aociv_4Z)q*wU^bPE2+nVX;%>apZnL&<*NN>UnbF_mb zhs+2^qok+r@!0lNvy(StBsuuZ_2jVG#hcw^AZ_+=w3onPu5Vv|cD&X-Va7QPPH_17 zk3JtV`*|}#;t4a!(G*FKn*$sjgk%0X^?B+mS8e#XImBscGI7Kl=I97)z#Qf1818VK zqlZW`{q*RQqtCl*i79h}(fmCn?^Mly>d#X+lAAKu<4nwmVQCjcAT-d`%Kp z;fD(DMU45sY693c{*WQw=D%cEu7^7n3{DKK+3+#Co>$;i!KL$s^{%vnlkW^4Pd{Lr zz={4q`8xm*6a?XW9}&iXPeT7qM*oA%eeZAxj_*AH{_JrIj_<&Jdw%utOaI_~4`{5T p>waHdh}DExwd>GJVd77YYYrh%O-+C0+a-vCjXEa3<5Yf8{|Cc4T=oC} literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/gpt_oss_triton_kernels_moe.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/gpt_oss_triton_kernels_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76a843f5a6644c33d3307c7025f057b8088fcbda GIT binary patch literal 12376 zcmbt4X>c3ob-Oqhz~TlE@B~Fk6hw&>DeAB#TcYlRqHIaN5G5E8yC6a0pm&#+$Se{% zRmNm@AGljyunzX&U-D_0=#!|tsArsY(Q^=*{bKe zxlPr}t5aioKP*v<;o7emLc4U-Ns8NijpDZOrqQ9*wbP(ma6&MJdP&W#P_u*YEmL<- z((M%A#&3%0Hd8`#sGawXo>p5_=A4ypzokXfDv&s;oyEPU?#@$fT)Pe0Q?c^!VB5`1tUM)Su!m@Z3a{A55Qr3i{JY3B@D{ zaVedG!8{`oF|1YQ7QgXM2e?ViZvOLk`sw=dKl~3 zE~F=5)h6F>122)EAOb&uxEP40ljFeLP=fCY92bPN z@OWU0sF;V6;b=M`D(sPg^GDByFT$*a&pm>jjl%IOCksHh=qrDO1)Ha&lCYwv(b58z 
zYXZR0nlQ_lWwUIYGE$W-43%|Nn4@K*wiTv;lwm4Hs9xWs)z4UDhSPDnufs2yrzXkHfYp}7VwrCTLmYk^_hc*(2hWcm08X> z1>$C zmuylgIA>&z+!cK|w2@YyvR#|I(wq~~A-mPtCbLHQnZ~-gP~aU2>bzc%-RszHOi; zbwSJ2yVd}try|iY5f141SUAp!fp`keY9vrT&t0TaG|lk=fsgS5pNjI26WWL*jf;? zsAq4_-o4R1(OrFf_xPkVl1`P{YND!p2nHSL=HepsKQtlnT-W$y`#|sBKEj89CwK6J zKS2V)XGliGF~yjcMtI?a0F-=yM;xS*A#yb?25AA!qo7s5jjLG4h4ioxNy3{ericJ+ z^NJynj&O=08WAPMGy&7GyH_!)MaC=8DX&-rB5A{EK{1L_THu2gQIk8|OqVoAqhA`hh%;Fw}AQ3=zEIiBL;QC?K^ zT>K?PFYrl4pHA@#9ajukQK7FY^rYe_vl1Sc1W~MkVQ`_>rQgT8rQ~?`czk@P6cL7b zY3J36eUI%Q+sAh&;zQb6yp%{JyOXf~3E-cPPQVLyCnA#|nY&}CmBPt1-#t7oh123% z9fL)%RElF9_SmY55cUB_o8TwT!rDwz4_hd^^ZL2jbJ@0gY}54df~zLW=3Sd#7v7kB zZF0%IG;pt@Kkw?FK7HTinYY}qWZSZr@42>5pDsA+zE5ZSvgW)auy7#n=vwmM>HDes z&c1xl(Y)j6^vMESlQrepmLJj!eGBHiZQJ7MJGT6eqwie$dCxn~*cY1%}coG_0-E$XacTOKg;K=Nb>7)0p?DdY>j=8q)U6_C2#tZkX zt??~@78Dn^(=8q`|`m9cVhYAv0pLAKdhr1O)Ib>lwu!= za~z)vi&8{_t!5tvrJEWKgVGt%y%WU1R_S$rAlWo#KXW0y+O8KgG_5|@^BvQ1`XyKDlT&d3(o zdRupkRY$l;1@&m1F=8?tiwmL@Gzw2Zk?RMqnD^t>2`S4!|voJ=jD*+3TB3i&*!$>|ECBE!gZHFHfW z8w#UU3>;MBqV5d!HCkpyQTne*F9)mBKVzgS2F@cRl{Qx8U}U5!^#9jL^k@|mhogxZ zsjFH>Ow0;i^;K(qS`TIOs5Gxt#~}yNnQ>*@U_jD1o}iPR8CG&@b0VQOB#ARfs1!*W zt<|8_=V<*ZtU;VnGi~rF$8?-Y!!sV)Ah5Dwv_Y%8`b^OrmGMf=8ik03s`~;33+#** zt>)@ew!xctu_xIBW?gAdWqcLA`am!-qgB=#dNn~wV6`_5!d^KlMT03vf!U_@S|ith zaIy>vI4vLi4;VWFzLsKA4;NTIM)iCd$-(*njiR7Iv1y_s1+v36F7VMfdQ_T-HVsFT zU@$2zRcft@a%jqyB@MI{o`e2`=P~QUp4|c-HpR$EljFQ%1fw82qA)`-FrK8Iy^8rt z59oX7*i0y3LYc4hsuZI*5&>zCb0VN@R~(NBD^a+BL`z9##8Qd{UV$evLO_2(VWIEx zivZTUiUn(h0qhrLu5bdglbD^t3{{2@!faLQoc4p5*^`tsu6dKxJ-|VW(^}ES~!lQVwQlPI71Af;wFmm<;;!u>vqqu zMH9vPmu)S1Tg&TP-spO*>#l8A&af+I+f}gDE!$f2w$_}jWAWf}=b?P(q2slfOOzSe?&%d)>W@9+J!f6weoclrwcriX4zqbX;o zdss^~ZYeO0s?J(%1UjqSsBZVHbf>mpt`fTcsNNKEvqbzLQ%I$Eiz!Md`9akYOdUx zhOv>GHR)CfzYIKN^ce$3XN;UKW0LSJl2lb(>EzWog?ZJYs-`kPPficujV zMMuUX9MKNLP!6g1k3k2-0n@@|KolcLuAaTZ7ZEmyV7LpoIl#|FU(yI@S`-365ymk? z4^2P?Eo{OJS1{-#XF^CKA%$57WD0|a0*}cmi^Sg*wqU7YSOC}Bx$2P;t`+YZ0=~u? 
zDJ`NosEN$dQ#Eeg7%L=R1v(~ZAPX|o{X6)Ht*RWgQm*>zU!MK)5B9uz;O2pQ&W`C* z1#j#8p&N%@-}Adm&YsPh z7TA02&YZ0q{7Qpot}kz>FEH)`<0>$Y0^>vg#86G!$4ncA=@pP#JQV_? zCZywNULVuVP%)a*U1LKAPR|>`z%oE+-2||Cw93hbawC*mI1_KJX2Nmi4d@KdLZ5cd zQoX!7wH)rZ!kCU3>M9*_!dbE@z9SG-KHq=(5(zgSSN%)k#YM})7v2^RHvU#NL-rRm zxRmfz)2vRWz6A#s4s}T&)6^vOCOwFzmN)?>jKYM$mxU;P_)R^WDLxS+11OeoIGTt+ zG$bry-+{ois`9@E$W(JR#=5kQ6sRKNtp0AAy3g2W&3UFi+w@CjtE!ZaKuNU34S;GQ z{G+UzrzYt)b%U4Q1TKYZKoGHNKs_~HU>O7OVo7(Qk3F@mDl=7Tz6Twxp!Axi3Qn)O zg(~^LIVF6c*;DD5YwG4AB}_w+fEr7fg{+dd)>u46f(crXAd=$3r63zz8ziRss`B|6 ztsdYgxVNXN_nlksIk)AQZL7MyLXHt7juWUQd{Oc?O#oEB4e=7nN~-iu5K~0?6Eu)H zUn3+%>=(tCx(Xhc_BFz5s-x=5t*`h#^dzEt1YSE!x!M-$7Z2thJ9E!@HpiSLFBWv+ zmP|#Gd^oIF!w`PsCK3p<;V?vI5^9NnuP5MUR18DubVB%Z1a-+V#d~S}eUk7ktbG}) zm@cXQydG2unEVst30}C41@_X0kMfBG90pqWI#9_1A69q%SFjJvevJiFkp1o&Rn*bO zj@*|0A5-uwnrNeM;nE7gzM@HI3@&+=drsziPOeaZ6d99ofPUbqo8NI`N70A~n5=b8 z^NAaYqM4u!)zn(F5YS3loimv%|7zl9V)6J}18)xGocr^Z$3C%PvAPA2nJE0qN%QYk z;L`9@g&=PPF=eVQ{HjycYa3N?YUcq2q@G7_6@{e@ge41tk>yYbTHt4epACL2^oVZW zn#9@3oig%)Egk9z8^I+Z>EfIyB;nrLIT~2gA)uUl1G;Aex_1M*Zv%SG2J~9qP2RU| zgL3s<>p!*B|EZ-78zevZ-RAe;hJGCJw)MnLE*G0uK$Cw!T1|08?f)!uLC8T z&*05tspM<)swGWS5}0j|s-&Q9aLRwNB;5n)RAMrayc!$t3t#Dr^z4QETPK9+!xbdq zW$2p-0O8L7s-mR)YhR|Up#B>hY=I0E)B|VZva|E9vopt>%I!RL4{~Sc=h0eaZGo>} zsdKAEz^b`Ot6KW=wutF8X{{H&2cs(Wk}B$mK5fq1o{VKrEB$+XEi>8 z@l>cRPPMO_s`ZsHvM%m`3?#P|!uQK;N1p9iW_RV;UH4f>C0aDAE_9#IC-98HwZ4=Y%*Q3Vl|MSszBk zt$M(fVS7(0Y%it9KtqI+yc8Miik2c*&_-pOXw(28xbbB$EHex^pOzAg7up!VLsv)# zHWs{@%xUEgHk2Bsb2OrP|51I$n zje*AIiJ++>pjRb~cL}a@*c603;U+qY2LY~hCZHiPUDPn#lYlD5XnK4y9IR3I+2>fK z!`MlArG&pn^exQD;^05>0WK)J6d&j8p%!CRPDMgN$5_n z#KrGJx3CX4Q*3R{)>f!%TiBMb>jba1spZv^H%~6~|J51rTno151=q^jV&AmpSEF7fpryFe}Q$7|M~)3S77U*`+~oD z*}pyS-(CoI6KY5Rtq&U=4b~Y)F+f?I*V$Qi?n_IXXV_md`^e4@`h+>0fg6~8 z2Q%C>swK+6O+4NKP6}D1p@PsdmJd0M!J`iqLg<)t?s3IIx@aP`Y=4Z~GD=`5s0M3D%K9LhVvN+6${ytk9I4 zV0dNn-J(+S=+#Y^dTApJ;TVtJL5p3hpA?d7AN2wVWxrCprkB#)HwmtjD|QlpTsN3{ zyG){#ilbkGPVn0x@@=4EJ59beR1q33sCfKjP|=U#l`~AJD1iVL)G#MX3nEukG4o?Y 
z&mvz&a(AVw2qKkaq_%2?<5O@EgafLvrDswgz7QvVE|Dy%WrmiND#e+Dszd&ykOLY> z97;@qDO}XkG`+%5wEaJ+!@r^qzelxyqSMp5j~N}U`@}@i!S|@WzqR=0p1)^lnKu1a zU-JunT~TK>TK}Ed{2S&%p1JTF=D9rc+2iX9=)LJi1SBSKA&aMp~Z7Rps~U7nS*T;92h-%3?CFVX3yqq+diU2LM z`%6mKC4b+QoqRS|$yEhT1ji=Fg5#6pMnmXXE^~6+H!(SYza3n)&oTRJ$#jZ~tJ%iC zb{l`!HvV<+UsZBV{&Ja%Sq1}Fub*nh&NaMdo;>Y4JvoN)(TJ8lGkJ#9ya_FPcJi!T z^R;A-ic76|7HZxcI3GMQ`3!1pLam=cUM+!VgU?Mq$MR|oJRh8#oMdiy;Dz9elP|J- z+k7v*Yn1Yv{2bS|4U8tP9WZ*I65fIEmraHv2Cnlp1LyH&zMCQEI+bmpf7}??<-5Rj zb)LPy)o;dl z=g7Mj$sZ%_?^at*zb*8AXH2a3t=v(-J@@xo;xUvs&syrZ4<)0e=Kszub;370`|r25 zr3+nv51c+m)j`XLJNVw^js+7yB5Kxc{H>T#BbL8 z6d&6~8)|j872Sn>dU&K4bUkr=ZxhpFBk2A0Igt zn40zl5qvhnPepteue;5nol8>3ImAX{6fU%jqv`+TnHf7DjA*f@j--0?HN8FLbKAh zSfNJXeP|@#aELqX53w#p?&Yj1QpZK+=1+Lhqf=gPDl&yYrwq?oe{dl%6`7-+E?@xr z7pEfrxsU`meCz#T?@VASe90f0!4L>9%+JqZRD~{jF9zl=Oa;8*8Ol!?1M|oY%^C#! z=4sAY|C)B~2FJHQ)3C`m-xAb=}8@ zyQG>2=MbMfJoDE@m-iFei{Fjn4-y>Nc-=5OavGdv%Z|tRFrYK+Y}%qdsTuKIs=*Lx zLD_2Yh*zKtS7{O)oC?zw6IyzGQnrm5Y3=+6bP5hn!2!^O=_B` zIcl^SqDF=KSTaY=v$?5p=qsXn&E~02)uEJzx;;IuQ%#~el(aLZvj%tO#9y$0_q+%A zcDT;HV-hnse^AW2?DNgT!Be}svqcLA?DQouW9mX!%m_|h6D{FOQ=LtPFaRQby>1IXj2uMsE0ib;$$% z8x^X0Hl4jUjwxQp*37F5Yuu)@@5Zr&t>}$CH}`za@^;Q!Ih(eI)$`wa?i?l^;$zRRvC`tM%E@gzz5-Lc0IExa>WpY`ZmCIHr;unFFmNJ*JmaNgt*~+wJ_)Mn6 ztgF@u$G%R%LZ1M0&DPWkRBTq}Oz?*gMVgWP2svpQ{^)P<{D@ql>D-d zoHCrC409Gj;a3)XzD1u`%#vNAnHFXKB*NX!!{6zIgo(p?=4A7QVamn@gMZqcPiP%seWg z6;xY&6cqEg3SMs-e1=IQXP7i`#>ouMG->8CCM}$KGV{8{oheqyYqalKwkV$^6ns`9 z7JZzUDbed<*7T*RP{BXgPA`1)KLnrthfRa^$^({X`pY*zhzc5YZ*9?i= z;v0t(d8G_@18>_4UzO_3B8pkqhOqqwIryp2WwAh}aBn-JY6Pt-ZEc9AWqTX3 zjN;8OY7~|WlTGncFyvVmrJ+7mU+7&)^B?U~ae z;PZ|}B`UI`8C)ijF#LX$m$?YCDMAYndk6vrQBnCzIhwL{NF2@>wSL-?quJ3M{IgQN zm4;9RaAYw!PCva2z=XE?#8Tv1FWH`++h>H>wg=FLY9b)xq=&OcGrnrzvLh;S6;X*W z1yO(1@O87|{*sZ)Sh7bg{2Lm$bVx7M(M&BP_4|zR6YDFFge5y_WarETXEd8L)2PFD zM$}Yk(3DspgRl9CW^2NWGy$?RQmXhx)!K|0-pHLcT{KLaUc#7J%8gp1xdioX)9cx0 z_%7vsTDWI`hwkJkV{IuLIb}!fG&5+~r<|@D+_tF}49^@w_~4vO(_YsXc3qmg>IzPU 
zuDj&Dw<~;UZXv+AF8Ewi;jsT=$j7-Nb1vW1^d%Qz0T)&}*XSu1wv;TZY3yJrpDE-r zKaahzYlfc-x+GLVDS_0c0&?yy>=1p|JT7_r?gCa|nQ++V^8O3h$a5~=6<;WD-8D5$ z%M>dz>~RePK;9Re!ag1)AXy4!LFS@L%lK^2b?%bSMVQSs zH{(K51vZ7Z;$rAnn~dr$C1WihCzChru6gYFd;rxS(#uWFhd@Zpc`G`V3W>}`&|<)z z7koUysPv3n0NdPp)O@!^z*1&XuuH|yE(Jf!ZFGk1(lyN#S(p#_UI0PT;W{}_6wFlM z1-2*goMW7*mtGol4ZBz@*EE<#QlBK7nd;do^}1KHQn*@?rCWk&Yt@vYrq+sbp$7wq zx=3kjYX4-_DMF{*JjC;GwdzBRVgWIOp4yXbsJ4{Uy0tqx+)p3W(rDh26Zl2cDq&Y= z51TDY|Apov>BIIZ6{E_@o#UFA3;A4rj09;(cKO4uo^6{UeEFx{4z6|@jk;?&f_ggK zTbdv_FkGo^pmele$8O05IL6h@nlKEesUNgbYNbD06H)B*;C;E~2)|st=|$@Zzd)}m^ty_dXukla zlS-0hOW2k3@nSy1*xI=(=Ez8fn19(}aNi>FN;Qhn3i*D_~avP zHi|gnsiSfV2sD5BUqI|H;x8NrxeaF9QKJEDh0|XiikVB6hZ0$Nx9YwWO*rdUN7fo+ zj;@%s>rtkmsClhaDD1v5Mgh+X&JM5+I{McA8y)*s?5nSAl(oM1V$59pB-2n(`ew!L zidc15tg2_d_jk^J|Ap_pa4-MZjpGlpT&qVnvN{q)l`8?EsO`?#_0qfU&CU^_XkZiGu;<1ct~-`*+upG~$uJEYbCz>%?OwSaYu_W(?h&ke0ZUH9YH%aF?|xO? z+hcEyfuxQVx2%=^j^+Ed?@^lSoS3!ziPhlteB1s`dWLS~7b|Ssur?uvqx_M{nCnb9 z-7!a7tZnaoN9nz?u1!aGEU)L0#gyCkIK%A7d1NqW=RC3-3QKRC_%N&deo5tpv?Lox5LA z`*!dsu!$j%- zUbp>lZ63c2r#|!MA0p&UEzPAztJA( zG-7xc<nW>JN?uRH(A&`rpebmHB_a;wSqd}~`G;j( zrU-S`%=2MNg~YTd$Y|7ciHTW2O@iDc85BjC17n3@Zy84uVrGbX(bVBe8Arsv?!({X zOL(0y7}+lJr8a}%MU0~*BiPo)S!G9$7|NscKK3NoQ$9V$*E4>__@>clc)i1HxNiJs z8CQ*N)1s+!ABhmn5D^9;kOk`_?-R|l zb69^e7X1)evDqr-C2K&k9+E$Rz6S$~iGEV6RK9IJ_+LeN34)xsXg3T`Dh$9~FWi1% zwPKAI>UuVdd)Iq};(;3ziK3d7F9=1R8{-L^qMA6hvv#+HIGZZlF#`U|R@LP*S2QdpaQF3(WCTUYQ(7 zkzdiUVvLCq3?AYwLoW(?Qp8m?fv-x4Cz_!$2QY;AU%)V;RB4Yg(6|h?Xr)7dqCOvVmP<7*+jK%WKi{vXlV4YYLRJX*WZxNLfJgc*5 zRiF&luo8%El=f)KFG~x;CoA8cQohQ$mCAQ#O}&a4$q>*HKX!gDCYp0e{U`0xhzf+L znKm2fwFPjOh^^Y;V)U15VP1~RI-6w#LOJB4E!h-*T^IXmM#rFQjAVgw9-Xu;RI#Bt zZU$W=5EZ+K{3CjeP`?BwV)(B}@FXcy?HC($F;*ahpZUuo<!Z5GNFRmnx! 
z%Waok+E~W!i(g6NQogRM{>UZQ*xZ?65BA9Zu-XtrydfH~@|0cfG>AuP zi{Q1H`F>YQ)v!$sLt6$-1C)(|QnDGpcKB5PL9o1iAyC>ZNZRBiH;`vYNREka(|Zpd?%5$$ogMG#)_79TF7Ki9=f;N5p5>b+AF)jYIxzdQr_Z02V9S4=L@ZC!r`$?Hx_O@axF) zKjAO@KjU@7@Nu`vlKGLNtf`mzPSa?;xluZ_{+kd6@S@2i{82d#wl2mT#he` zv*FKMBrOg}!XA$d@C>Eyh=g&&HLaY@Jm@j#Y*-%Mr5z zzm@o_LT=Ugtxc-p`m)e6{hC_UtG)vo-v(~SYlg{2ZYO$j*EVfWvevW*TC4gaXV7P!v2sJ`&1UGV znzxNdW$BH)y)6Acq;G+y?p2n@{%ztC%-g|%e4zQ71%26JXydkW+vPzpenLJoIov_O z<7V*uH{`<|V(^Rrp0-cS>uL3(d~&!^aGRH}31^fc5Y_f$tt)?@tBllNf0< zN~p9`zCMm=dUK7?p337+Yi3jb&!{|xPuZN# z=X?VU!|TplN&A^${6@sfCKvkPJo;b<@*2<|fh?AZdj_#~a?he|pVN=WJ-?0r+IXZh#d6lI#`ocDlNUQXA?JI?D*UD1pc5N+%PNSxhOOHC>5ZZ{*quY9) z#aBLK(ZTs%v#9IoJ~lH<+zk4AKX*|-KdgqA5Psk@_aRXvIxE#@tBo{2h@zXxuRrC<_=8EMSO#JHQr@Jk^Xmu4g6o!}Q$zb#Mo!A}(yEiGzUZfZIfzBo{F{Szb`b!nlJ|(lwA+ z>XOi%)q!coE0t6<`G_IHlp|Qfx%@#$&*_K<;|l3=pEN@8LD$4nrTi)n1M1~os32^O zEpb}F{h-VC+Z$(<*x4VYB_Xa(Hi+oNJd#cv$wwDNXR;vqC{hkP8NxJH7z0Pl zib&cAOs*_u;mi_K56py=V#a?>ER_$bfOp1@#3&O)w1;u<1al7h1EFcLn8HyiDWoz_ z@KEP?u@s{V(R>hl-~biVob_JtPld%Qh_(XPi6e)BB5SmI=Ry!TpzCppM&-bYd0EF@=VF{ zqLv#)N!I_Yj?SD??vf>HLD{7B$)2RHoEpOwbs>gId`4Zf6-w;n@2jQy{8`3`Vcf0 z;j4;SB5ozcEToa?PqaSTfI4)f`$Ol{tcaD6TT++M7%d}>LHMaeV$_H)+S{-vT8S^U zt)f-cbiM$EpP%u{hK zHAR~^8xuPYLVp({@S2gQBQMzyw2nag-N+Z=OD+drxkk(#y*k+qOU=>daH$60O)I4& ze>uC-YQV3#OHDU{XCpo(T)+PCol#;Wf8ndsf>GqC*MIOmmrSd;?&Q5I<9Bw%re&#h z$sKLMDCt*XAl+rfo&4n-(N;w)%-TaQjx*{`#?iEjEt!wv;_^|BejRnj1FPdDC2jH- zdFZrAB&ftdct~+4f0h1t7n@yrU#mP}8zA&q#CyHM8syI4Y zfUm=9SRuZSs9#0+Qt?Q%7+-X%iamho%ZfYs%M~(w1#GPcaafDTij)xmd{XaL4iT^iZ-~Q#DkJkn^GhAt1BXU`+rj6B!6e$Pr4OXcnpU@tH2_e zS?a`Esn!xX{Js(=`5XDSiaXjlV^V5_`7S8_T(tr(4v=puzO(lfSMpbZJ0*2ZGBqGL znvAWu_(S@T&HkAZBl#Qd)W8M2qgSKZL^8hO(laXq4Mf-1)jlDrR&PGBc@|ajN&eD) zO3yG2e0An4SNFBdCC`5wyj(p-iv@@7JzN7#R1?uvIz2l3zbBzoTwD{^c+=oFMm^s& z{v)HjQ=L~s2~)nJd|fxVnk}i7c$TTVaGa^&rbwL>rZpD(MkTF&s3Z}4 z1RO6-C}Wy!&48)>LwnOIGd#SaRc3a0#UfI}Nmm+#0zWCXES8Pn>zB27E>&=oO3!9$bTcc~|@ z7Q4>T`qeBqn2@ls;V5ZF^0Xx2xcJ;m1ePBb51gc9F7lU4 
z=E1oXt{efK9Cg(Ep}9~eYi4vOvHm>B%6~C9=jYrxJgsbEzC1#_FuDVSCtx#+IWm8l zELr3ei%rr7mLoEqFPuIP$?CUxfQy|gVTr>$tq=U~(TnMY^}(esDR3Li_$$QU<8$C* z{j!M}b1>XA&r(?0^J)WUx@q2xx?t&Zc2vrsN~D z2Z#88lqKUgdJ%fH5g4-OiAuvT{W^Zc z?74ZEtpaQT(Mf8VTCl~_pH_`hqkb=)91MH5dXd^De;U-a(+1?PaXcW$P~}=4OE?PS4wv9?#T*@py82t@U&>3=HpFYYh1%|T?XXZg{9fbx znVYpIZ`l)$%9x|!uS~}LXN(UDOW)jgd*ABmjl$*+8xFqL{|Ebjuz$1R?Cr5carv8{ zyZt$wwkmE-4SCqG=kB@hd%x%1Y#6^acE7NCC2+4}-@U^9G5h{l;r>KnZOmRPY3&;p zvtIPHzR0z^4?`8VXq+sbp{ozi?^1`u;@MOt|6bsIC?cmIGqyAlMq5Z4O=2sDlA#OA zu2^k+Z21q(U0vMy88SU|eq>lpSi)9WnhH4DnM9|HJAlGz{Y6OHLP>B@@oZ2kYTOd~ zz3RcbM2J?B@YHUR@G!g!DkcKsbaNq|b`fH(Y`%wCf#TUgKTs}(M^2qPJwC-_6;zu} zY}zu_o}C!(*^wS?>x`75sWY-5%M)%MhKCj~^~sclG(bhOFLWg=)-!5m zeOaYto%}jxC zkX9SkK71a<*rb-SGx|cAp2U9P2{ZCe3e3^i(<{Pg2t&1;%($qO9u`Ac`c*;k&B3Y5 zKG8uWB{Rn-Px@b@3?mewkmQPVFoqUIWjXvq3jQN{{V~008%f85aM(mjFBTAr@#A1V z`d9HQ0se^ir_&Ae@vlBLjbO+OHzAG^c)tTz^Tg@6zioy=eEz z)8?B0m-PBRz22i2ZI=1pr5A0L`F}&N|CL@a-@#U1dgv|h)Irfg6@*X;Re&U4p?6JvF{)d!5EnsY=5DV03Cr_R8 zj*UJuI>yu9Ni1anp|SHLqu#OM=SR<+#f-!8yLng+3&Rv&K(cY^^<9|?@c*9D zS=f<%(S%c7nPJezWMm98nfFtmmDWs{HoVCHIYB@)nb`elmr6FVkOBV}l=c9nEri;} zF+~-^$4N()8!3d26wnq-%$4zFn2E{7Y_Aul&B9@?m%mKe{D>mwN!BG)Lq^5S+##gL zidqBdFoZ?bTb2?_GJSV9|B!NGD3wNybRvP+FcNOqb|A1`U;~D@dyKAT>+>g!5Jt(e z+4l#Oj8V5gAy?YG{}b|K!%0C*J`86WYyBw&#_098^kP$mWaebSk&c=xRwsQNPt!#+ zG5!yHiUrh>zH8Hg1(@Q;vkF@FnDGDNxldX~ZPH#p&kBXik#H?st5**$B;6T89gjN5uQY`ypG1#x?o zV6Tc*AGv2gns9e6k9=TnequGW^}k!SJi^XXfLw744xELC@wk$>(pr*H{S1y!X6M7RwmDYQb9?7^qQCWp z_|8+p&Qr_Q2aUTZmC*AN9nsZ(coeJiCXqsm2h>_cQqB+Nv=wqTrcfT*h>@kGWOpo9W+T) zx#CqFLRCk+YL`&8>pgR+gz9~FuWVKy-Yhs0vmZ(9Ie@rvYmZ>ec!_r90EQF;)uQG&@>+=0aT1u^*K7|J@= zjT@ysj|`^#f<)0M4hB(34?^;*;?8!#*&cU}3eHhLp4*IGaJtvVHypb$JW86^&c%v* zHj0nOb{u{`U)XW{#|BezX979q3uV1O&cMe5rB`BQ9jIG=^G9}+c)zIP&57F+k4&cW zG2@dAb8!iZ>)9PE>inVId4KnKqO9s|`&)M9r=ca@Feo$(CfW{RKE&OJ1^3|udF~e) z_CLwZ8q5>yJ7V@~I$VzoW z^~q;P{L$QW(jTS?BqPK+T+$f!P=K7Ygl`ZhC=@ng9#=JFx6UUsFW#lGvRQp z_J3>7H}(jQuJxV`M}N%PA9M7p!CeR58~KB=AB+iIrzl-VqNam>E7b_{tid7SxXra; 
zb0rEJVufAn$7B7+WA@{*!sE)kN>nt`uuinF;oGPni$v2PWmvEJcO0c}w;cTebu(? ztJ?ph*wzlT^}6Fxg`wI_4O0HdT$t^;*D&yZ8}R$YW2mT!mv;;0Kw7Wo-LI~TSML_8 zcN5kbeKtP&k}&$xy;1Mi&c>@dh3d|j=fJ({VKUNZt+;P1j+M0DX}V|YzF%JXcKKW7 ztAlG-Hp_P`A5S>SV~(af4QsP;&pyGk@9ry`p5e{5gR#TU32o0MTus=p=KipIc(eNu z1L=XI@Qp9t{9@eEx#8&i*o=QPj~Msc0sth?8^H@5$5tl=C?U=-yMI&CAQfHQ#^ zr3DN}Q2fStVY^V+ey_0eeu+C#G>&eiP+$iPS6ZzqH*A%wBk{%op>ZJIcxa>X&=aSj zusKoCx>gi-4+`$VxO+r!kHp>Mf_psHI`PP0JZU^)#7{c#Wdq z79%l1?bjt#cO~kZvvxNZF$T1_F)1xUE^RHOFmTf~^Y%K>Bk&GMMc- z7uEXJZ^0j=>O5RNX&XPw)4n_^(cna(0*yW6B_@(S}&bK z?88EyVo1RXj|}`h#w8idh{Ds1B0#mmpvNlHW4kHh7?|rYzIri|i89zTo2|H#zc9*X zoRx)qNneAmiW(SRs$yHxp7FcHO2h+^lX@C{R%MJyVf4lce%%`2T)#p0guI+@)R1I7 zUN=BUVVYRXZ3;6+K~uQ3DeQLJ#43rI#&%N@^(Afl*j|Pa-eRS6!j*QH(1d1|%)K+R z9drIM0E87H--U&Xj9_DgvS{*i{CdMdX=b?Ak&HS%fw&$BC>;tP{Q|a!Ro$eZBGzh=@2SA;+02*%A=c=$1#i( zw#t~T1q;y|V>e;1r#4pC6R#T*>W1QVBSPKCd)%KKd_VHXM=?(^hcgXzJL0ZE!8M4P ze2a^hvUTp7|-d^OXfkrmSw*E7RhG}0z zKmYQU_Q5SfT8Pkt6zw~I4-la%bRoiBM_P#FXm@|+5J^m=^d1Xu=`jNOcYgNJz>Le* z(M+C;4t2gsT(S26OnA$5^9T-4%O*iQj0JQ6`$;dvuWo#WFXvcK(@Zd1}E=9Hl{vZKglLGvfMDL>pTM^w}!Lc`H-TU`b0~q{}*4nMj zKe}xVl8OF=2J#_9+zKs1q=n-(IJR3jWF=Y5noqy5!0Z}CpHQDIVX0_!O?&hXD?r?6 znh-6aE3lh_J7hFWD4F|hlZYZ_s)?~f#Tx2q&5bL#&V%s?w`;KD)L<(Y8 z032emOhayd++HWx>(+{5_D*c;bN93DLzCcaiaUD*XU{rJ3+}-dKX?BFY?vLl9dYMA z!MX2I7EJJMZF?dW*rH8LHLW2#efp9tNhu2cXs3(`n1DysgG_{T7;@4q$N;Az5nkfY zrr|iTAQ@R1OpBE%%^>BZ1SY8JrL5`?9VNFS@zPeIv~{iHu4U7)FJ|4hwLa=t`Wmax zryncVjNyT8$Cl2>LHueKU)>R)K>3l(fMeX@Zb3jmdy}*W_sN(hB>f$WMD-8~<7pyE zY+M>4>1a-LFoZJioxZdXx(t!TVp&StVF6P@?o-1ZN1^JNA(Jn0JGNW{G-!?+8B0o{ zmk5o}fH`iY5d#*>$}8YoZa9ywo{Kjf6`GE2Gj^y>Z0u0;fNX_fxrxJ)MfWWo4{kaW z$HBgld`3#s@mm8Q68vDuO7J_2w|2C;;rk=B)OJWi)(!sDl<0r~8!ws?>_OM*bgq7f z!dGYG2VWh13;2?_FYUQwRL-Cg=)qp&DzsLfN1Ge5 z=ed^gyaBceEfcsQhLN;H5J?tcuaN61y_V_q6?$#p<#sR+F_ZIMfssE+LZ_D&_kkKD zF+?`Ga3!dL&!HC+yE-Yqzoz`mk|ZWoV8Lt-l0-pPb6A+igQ=_8Ei;9&n)hiLt8C8qgK5c@j3L1} zblvGGX zdR+ZL+fEMFbwvvk%}AH}WJ?lcc&C^nBqYHQ#pmF_Ts7(K3~{>7$QhxXS%6>Z<^Sm^^J^1V6ac8Dvc$1-8c=f_Q 
zvYFkdmL+xdvlIhYZW^SUN}h*H%waWR*RDbI$X#QIhz0;2X$(|Lw~sOXGNuir9QdTu zqQ1pu6)itC-hZXBaRSu=lFlQ$lHjo}=RB|#-iku>gSEa4yylJ4mfLk}r4IB362f4*7MH{)*iN+rC-muq!ZJk%c5G8CMv)-ZWnrPdV(i!TCKQDDghF)jL zhG}ti#VmDP+5@zN?x0H~qRA^JpiHmFKLc_C1oU8%lj+ghssh#+oh8PBd--HNbpOzJ zP@W8j7ueZ%b`Jh(PkJzPD{-1$(kD^#QqqzR7|!WJH&C`hABL&{JGBgHtdvH!u%1-( znt;tvS}VoO&=hP|U&ZBBIEL?qmdiu<@|-$4*f^mE6?3Eprh~=IPOx@x)JmNW6$ywz zsnnjn)Zx*ZHaxbC=|I~vlc^XWC5qwEa-q0=IVVxp6ff%#$~qGE;#-TWM}*S$wMj6V z%5#=;F>t~sUg%~E(q&*KQ!2gTL+ct#hc!Z8Z7CJqN%OgAwd%R!e9wkx4(sMv-8yjT15QZPj+WylV?rZ4%X zFVDlsZA6(40GQ%noWrG>DbUMGX0;3|R|`-pkTwvzfFKt_2b<%KaDH}F>!Mu zQC_EwOff5^n85JO5-F@DOw~zy31w?0Ef!MUNvmG$LSz@Fs_Cgy6<5>{zQ2>9nW{+V zHczk)X8p&{qp`uiy@(&4C%23{S;w$0O6{F3NV7)j7g|x}>!wnBBe_|Adc;xcgCkNO zKolkQ0dx}L_G$rg=->zT4%{d88C#p5LzPnSq`HXW9`b6$xMU02>I%!!rej1_{LBL zQZT)7SqFQYR47Z63VG`Y#ggWsX}E#DXG+i-((EM~FBVC!?DtR&wifd<@MYu4ZIzbc z?S>NT9R3obhiRuon1>!lsIjrQs9s{~QL)GKG)#U2(b>w1X}AC8{3aG15+Xb^NWP4m9O;i8wZBC$SfE~+mLP1Oqs+}uZE}ns6L4Y!KWHa zcYLWQ#Ml@B>w`eTc8ka^Cfm{Uw~W}6{XI;Pi0smIH4S3c&q`0ff^e;Bm$XjwUZIYD z0g)vVsv%z7DipV_fr#o`k9;@!n^B?o&~nZv^{3U4T@<%g2-b>~-K$e;6`R)nxOGsl z4&L4Sf%TX~b33Fp9ux($8G~C;7^T9t5lka;ItO4V55W)643GN(H##s=4^!i^ER}l3 zCdf)el{_J>^05v_gmPgIx&V36!!8fVS+a%lG|Y@Gp;3#xg=)nR(L)MGT-*k<5Y4`m znZ*4-J9?Acrj(5i&PL8zSW}3hV3Q!yzCkX^Ip48r^4%`3DQd$Cl%b$ z#p~osUBy_pGD1VllxPY54Dw^UWqEM2A8FIrp(Q;?{qd>f$AB*R@kz;U(s2K`fC0n3 z)ll9XFKZRbTGumUWv!cKJC}14j((gkMpMX`)nkcPPolXeQSVMPc0-_@Tk*tT&g})x%f%{73X&`5Kd?8Xt+J~} z6HT6kyDQPsn`mgGWR;YxgOcSF(L)jX6K+qUrALYIF^iC^(mZC=OL=J2v-#5u2j-7a z4*_D~BkCnXVh%_`b<%&pYV#1u()RuQ6oQmPEy_;6KeSr(s-f4g4N; z0CCLF0h#Ka9?m)s-NLhBt=>w%5n^X`k?A+ELD*tlg82iG(jUx8Y4c=CYKt(2ND@2f z@7h!7K81b*64s%**>u5lQfG!j(6I@igPOMxbT+ANE`k?yZLuhAFf-pDP*WX2?a)*( zPp*(^&U9j#jj@jn7ItY5R?aAtpJMj*SYdlAT_-iHcC8~fYuM-Dz#!4(1K+?ME4Z0c zH%wxK=}pB(0_EnXKKcz1W$PX=)_pp;d}C z+UtOSyGD~F9Q?0nRv&{dS*zcypMkN?+tD@Jl~aZ@WP`sxbDhY*w1?%Q!*pl45dSyr z0aM+mr+1~NTNO1ia$8r|u5@>|Stu-6gn$^a5{eA)et^5DXD)V4%^777Z#ekK=$O2D?BBtrW 
zmz2FEDj`icGaPB!UotJ6WKe$vlTxBW2)*z$N4GFJ*qM~Y617ETK!_st@&5r0_n4?X zG+cqfDXWN)hxTCzU@JWQKl;q&eT=O5Hwe~26p&JJn!T)`-^N(q4%6_f`t##yNLgM# z{{dz(u_61x#iqKXJ(7KLx`0_v%tr#NpEKclt*oCjNp^{K>0XsShXJS2Pbfzp3MSUT z%}JjK1z@oPmJFgUJ_(j+CU4>_XHf^XwCUhaJ>01$mc%8!mZco@fvS(1Y{qDgLiI0M zk~cIVu1B;N{ygVgB95e%jUj^*wNPU%6{1y&80A|KE&QZ%LoJA1jDBd~ zEK5Zgb9GAHlE0vn3$OzyL}-1|S8+v)?vyBo=&(9#W=u*wN;UN;j-rtY6FZgJ&oG#5pSV7R7$%OJ`CE$QGd zMcir*#z>puR>bW5bCFKPPaiWWB`lT7ZJ8S_r@jqxJ3~Pn~sQb~vpNd@9&#sVrK?o4E#pDU`uAN;p4kR`CB) zd9<9bM81c%<^f2l#+Exx$z8K{&U4MIre6J-!nLqjU!mXjy3_?r0a)~A3)r_ymC=f5 zCBt@2h&8GMG2t#c&!L06?zC=G3pcBU3#pFio5L{MqA*EBQM+mUJsF))sYu{G{TO{Czqm*FqlpK}y-Dl6fdDy|OjX z5(h8s(MvHyQZwRke&%*gSsl8PtIM74KoVWI7@ zk2IQp7iphKt3EzCC&RG600GAHI-#n`WzI%(m+M=^d#N@;!T|Ol*OwG`@>jt%>Pi5U z-qBh%`|C74SI24B^?J5))#=eTSh;ll<&~=ePly7rF?4DVUp>6gpX4`$ zeyUhNOFd6RZExDNRDIRp9+=?&34j*OWCTCSeLp8Up<*z1RihMrSS-=#IMc~~`36eS z#%$!vQj_9w*__VAmBaJ>${v=d764=I|H2RdAIY5|Ne~%>pCpKjZHkVCE4UDjxGwly zFSK0kY3XRW+Dos#mX}-?a0$ajs4$2|4--H!Q~Vb%F3iZRc(D-N5Y0EUi!U-Y{tLK# z0mh>)T!$S8Z$G}0!f&uf8CO(~jY@)JF&DYAJ1geraD^sgxBiIsHvDzG#2m7vB3+<- z@uN$BzWmSs`r*FCJ(?zgZJrtEi!jqbP<+D2lx((7sVI!VOa(lygL3s2Ev_pt&W0vR z@(q|`n}zXR!TnB?$TNN}(`&LStuN-uTmwmn)hpVy-(VWEDyQY|OjmXK(f|9mfBUy7 z9A=h+ag(BG8YYloG855E*BFSIb2BqxUqsR#zl<19Yg`IntYXUeiX||mMdm{6*4HG| zm!{@nPedAHkSHfR>>7bFOudnm@{s)!#y*2-j<1t@jb5~j@h?+!rsnYH^hx$8Bs(Dg zfL!DBYCwQ!gsmH}#v-@~cy1xgjNtIZ92OO}5ZM}I;%SLbSB{ioM!*+hMo@SgMK!}} zUxZ0VMH`Ig_;^3NCWI$MCW){}=p|Y(&yW?ZxBM3gfLF=461w7HMZ_d`BCG^OkWGlE z(}%pDUUc4%pP{64RHTJTIQRwfohM&1Xvv4@MU4X6EifDzfeNL6Is)b+bn7@~dI)C1 za6JmYgmmu4bm7WZC^y#PD$j~?2==gdub3~Lr;tpIc^SXVna(BW>27;8g9dHnk~X{? zFFYfbl99;~@39f@xs%Lb(+Dh1jvO23SE%^6==H1gVq>5BTFhn|aKsklze?X42`;qD zrhTG~9cd+tLKr-SIt|LEc2WyeEMq)2MQcm3{(>t(aIh6hS3!-BbdJ{O6HA^QJ$B^i zIq%s+!(?gd{HaqX&zu__QN+irIczL7uBf4Aw@JOpym%VEV!q@f%MVD!FWiXUVzS3i zcu8s!KSJ#77={M0&cgXtnPEtr)~XGd7buc$XO~XJZr1I6uXLmC=mSUbN~PdvOq4dp zOWTCfwzVFiwDXoZ;VgbL=XTDo_N@3;BKX@SI2+^6Ho@7pw)X?)?g#17)4x-bubyA! 
zR?n~OXBIHnk(bVU&I2D-x^MNz?0q;rZtr8(Ejop&&Un>Mp=xKm>VQxMvmxb)%0@cf zTT~V=8WxI%V+YS{7M)$repp$*+8=M|7aICwJC45Jv{`uy(e9U2uW<32E}^C?)^qIr z{>_q8%Qn*Is{S#od+fdM>|Gz)bngFg20k7*t5+*y&W;~j@PRn_^|7WsIEVw`Rdxxq zd+o(5rZ4aR#@-)RwcQ$s+56Yccrp7RlDUS|!}1w)SUv-X<*kP5wzp5db#jwxLe^6t zrd=L~P&QWCDcCy6L`vas1|S zS)BXQJ$rjf>(Ytm)-_A4>+t)QP1gkOA^s4CEXvB4k3FcUeY@?gw$qxV;9#}$17~+4zkKCIA>WNYx|I!0^f&Wv=OwC}5;ZM}`mTfvDsry3$KD#l z_oG}xe(e)OR(`=pP6HZF?cOIj22{$j`b8WBD%%IKc2WDCo0+3513(v{qJS-ViyzFE|e*s zYxu>yL{s;AxzIFpH}d`H_o716iRBY3*#en7!Lco-fp&SlI^HuP^o+nP%02t&4;wpg z`QDuU$Fu7L-yQtT!MlE;?)|IYD7E^Ki;|w!dm@vb_ zgqL;WP`BC1MJk!ClzJhvh-=g#$hyfAg0^{X!kj_ySJu6X-Cp?%+7UT8lUbG9ZLTE8{;jlp)O8o%aMd^b_qqh67}tg=037;;%fYs{Tp_a zS-cx%7MGyR#@?0wSkcgW1TR4CEL#~89Fl#Ty@GSE&ip`=R-+9aX2jmxgoM+hcoN?3roG|q4`qWz8w_DzA zi48p)b2i7^&)qL^B}#f@eaB+G$74k&l2%hN(bETP#mWniCO#lLMeh9RkKlpmHgFE2o6U~L4ipZxf=(a`q1k?DNIGYVfQF~drt&wue!Bs>j9u=)^t zkEr3tBsM%@Fm5a4yN1(lH*85|g5-T*U?vzx3@`n;oGj1LufL zNU@9*pj?(Rm$H_u5Mn_q76KRJ>kta*D!W0aR$)X#B$%Zue%Q9;B=ch_GeWdGdtjs` znJ0(ddiVx&h;{KnzjMbxU_9$NO`uNdDn z8V#?1*=%_IH_Xt%{bw0CXO}s#c=}~#E?FiUp=dG>EvJ{&?K6xbU-5Iy%C>aUK<;8( zZi#pq^f~O=*fwt%pz}J!|6=VJiK01J5oc#8Bmi7Z@>vELp~;pE zkG@J7fY$(PjIc37X4yqM?+eadfq_oC%TL;G5%J1w&hYJs3RQ-OyOWzWM}b)g4Gjp@15kIU z{a$Ts*Rl8WV>?c4R*wM@mDR4AS7$fNdTv<~Fp$MmmE}bdgqTE>H4q(E+4&H$*Q6-s)8-q3FvfYWY`b1eB*@JVu<#>{f#SA1uWz%L+<45g=VVITr$bzc;gf-un z4wfFW7=O2-9dFu!st-07k|F?|*GoHuo}tE49a~s*#-y4nXM7FHfFr8N;9a>KNo|%h zT>_?|@HaBvFi&SNcBfy1TQZ_1V2PKZcFgHZm{bzCsG<72-47p zzsDS23W9#ARc;X0RJ2o0f57QiniVpNr)grysHY8R$H>r*>2I8A7J5B&!Y3yj!7-{Z zUC+Sw)JfI}xLCJQzG!Asl$b`T+C#sDz(tQb@yGAm7fjp+0utMZgao4bERGBE_b8ZHzU)A~Iyw{1 z60#uhP0P1(zLB%m`|EidP7f&HH&5O^8Ef2euW083>0|Ziy`oOwmE6KNPTV}PVvd)2 zgfh>%=>vOT!qxI)%KVYRnw|5L#|AhVrSaRto`bEKVlMH3WJ|H!wKN0LDJNZt6e0BS z_yAteFT%tpK0c1r<1IbVrB(dU&axjQg>pQVm|-VZ-^)>$DQA2a!J4BR11|X_3+foZ zxJ`wfIfXHG2A+JWGNobnQ^OxDBo2X;5>dfqf^BIICkVq~+!W33cfRU#q3c+_^0Fwe z%WBzlw+L>YokKz@omLv+e)9flc02IP^5|zP&=tS$>dUDIUAs_8zMEboq*sRDzoSu> z8kDXHod>=wGl2}F=BR%=blvhwbd&@xW;{2F1$ 
z^6p-t=~&XJUkYZTeq(+258>EOH%#wf%Z+vrL`akV=pI~A@tA&^R9ksWe;nxO&yh8M zl;&#%&9WKl)GfC1(rtI}ZX{T2Qt{N&OzgcxEl9#Lrpv4Fhlt-3vkl0*DdMKYoy1|f zdv4QtkhUf=hv5Ue2b*#G4oTlm9o5KAoevZEU~Q@mg&#f2;!66edhM$x>sFMYGgvK? z4YZ{ZsMR!fnv&+7B`s1PRon$2U@Spvp-;I5BJSuB96d2>&(^xpa?Qre%c$jcxTfGXX?)X4_so7bP0i%yMP-?CwodU!&N_7D-o49ptbRn9ER%c2SF zLg;dM=_}1#NvVu!G{%9L9nVEqIxb3&K^QYkn=7Cpnt=@)j)v7pyk%Hu8IHA#C+r1a zG=twUcXMtzgV`Of6l|4oTf1OukK1-^*mfkUs#n7C+8&{{XT3k(cSz_v6ssH|!{;gf zCxpHeu}T=)d{9*zukr|09x}dtKN(L;Ttc&(D-Tw zanrtY{=1XEIT;_A5C$eT2Ts87UJsMG&F!Bes%k1BH`Qo=Zi=yfp2$XKbtY+= zW?EHCMHuZ77$W)-LqudSD_{s3mx_a?C72<@w{&MyDb!hsKf)fp$wu9HDrEJae`|5av7Cih7Y+LiVeBac5>WsTnSn zhjvD@ZWJJ&VwMjb+_K%tQPZGJmr$Rj%1{I1WGQj(*c48Onyc;Af=>f?)XrB$^W=7M z$iGhMH+i!=l(h1qj;Ir`*cDioDv*Df(s#+L8F zHO@d7>0^W=(=c#JV;weAK+=WD2nNxVxS6E$vT|n~uBs!dl}uz@@~Kp)G$}xR65`QN zx@Xuqhwz{pQ6ZUIk?Cbw9t~=jefc@K2=^&lHmE`1UL3bhVq940djW=BrZ0J<`?n~I z?Y>@eZ!P~E>Lu?8JCy78SouUx6AX$csbeglImjr81dpbARGYT8Lx_S17yOWgUvFz0 zbUk|s_Y^8ODUp#3`PLZ#iDIR*A813A=fXjplhG z--Fw(m^DP%A`9JMbD94?F`TiD_z|sp|A7{-TqrGIG+n&TIBe2MBEp&z?kq7EJ7q8r zy#NM{ESgyk?2em%ff&;DHt&(^cj-m?F-q2tD1YYr5xFQ$N|5TyFf_P4=iR{y!3g|A}7DQ!f9R zTpz(DHmaB9oH=>^+_8xxxKd~Q{Mhh0c1h;BGsEXbk37%UQB-HxkIN;|j_iOJ+KeBk zyk4ffI>^N+4Dvs9tE-X{g>oojr@Iby+^Mf(2KPxTh)G_KEXFn z-w8@9-5*i_Yn3bHO4e}*pQ4ix0`;uY`J$aoae2z|9*R`N4uvU`U1|M-tr#FowNWsK z?ytlH!JahWn^rhJ;gmh8HsDJ{%KR8jcf;^k$Bh};IUnOh{~06X*ks^-M5u&VgKU1U z@Ef+e`%MG!ro%$hVUkeNe-DW|tKGP>q#5^F)Gv>{)e50wUQME~I#%e8+1+?c*Eq!N zUAWBQ!=@feS6o5HUg}^wzX_=mt^1!ivYLmDEQgg=NVuF;G02ZA8{QszYiRAom7&eb z!Q0kEZeuLB`$L?CmP&8kA8#EIT1TYpk}x0#h$fq25KJN;*MQ(0z}*;>xZ}g2Q8}>$ z0t0;yK;N}Iw$dsz?t(PEd9OAK7`JlOkS#7i*$UV#>kn-u+Dax0+hT>d+hYgbiNe-c zA%ye&ct0%NLBd^ByLEq}c}Qxvvb^P^RN*ov9IL|Z2kTX_p3#_fH0Bt6P+0lq(CwkG zExi5tw?4mF*uHGV9yb>!S0P)kU9H=2HY9K(i?bu)tdBW859{_MYFiRDZHX!ml~U7= zHmOJDYEVFZbIe`?x#r4qv6kI22d;P6y>+W-#@J#M+ONT}-BwYi1LREz-u6gajC$Zq9uwJUeYa$Tmqj;d;4lKbH=@z3aja*wT)ZA^>b2s zTHP)#z@ipg%P1H|lW zKDLO?&3jy}XMDKYjQ~*ZQtPD!R;pK$PF$e&rqlxV6 
z16^1AxE2}(qs0ON6__KZ&NG`_jFC5Ts>1~l90Qs_qOJ`<%?@{*vW`mGr6gPU9I+Aq zzuvw)HjXRJud2I??E5C0_s!kOV|X>gkwnUtOp)4ZYRBS`u#8M# za&`u$5DZLhc3~P=tSM{|D~W;SW+#YeGZ}jq3rv-$LAN_n%$YdIB7ZDOqP?1#Ajj`} zRn^tqDpU3*n+93+z3<(xUcGwX_r3$1MKB}TKpKPi3c7BLgUJZ0ALVpw;Cl^2m=5A= z`(H4LRr$nc2x`S86;N|cLdI1c;i+7%PpX=vGPT%j=D)TNh%YoBp~;1jZF_230jwU~ zv~39o7fiZ?P{|&Ho6yhPX*0FN~$fb$kv63h{8sSYUe2 z=#%vV_yX}NV_uxUUZpvY%y=*s@qf{-AcV4Y+XMd_h%DO!QPnL5W}E&X)E5wpOL#9$ zyN}G!!~SX+is)(7pk{D;Rqr#Mva9o+t2-IOS{(f>)GhTqyA>Lh+@tDBoL*6p2YJ;2 z9mOK|CJm{n8ILK(j$g9EQ#;b-{uWh5t48At&}^M0o1$NqpOmmJ!q(`N4fHd89q?P? zr}F00Og=N#-cnv-ww_)h}+rum-TVMX`_uM zy?6&*fs08-`HFN3`osYQy-_>=5|ousUS84OJJbaRTYrPwW<2ktZirs~= zQo=Oc4L6WSv9mYNt{=EtP`?p=zwCC|CV#hSV7s_HQ9L3Sk9_#TR`K!Gykw|&`2<@x z+L~PINh+~UQ~Z&X^(w@+8TT|#We708I4QT@N+N;PZxy(1s%uN#lAJ-R`sZHOMY6Sd zkdHAzA?!>?{j!O_rFr7fb{i^=S(^K{Mwn=%h+eTeJQ<`&c0@G|5Sd8C0D*rB@y^mV zxwLIVgjYyLgus6r@lmD9_Peh3`(@?t9KLlp39-dMaCIaRu9L%cQo|lOyhrlGD_Ui; zuo#iQKMqk!i*;gyU(cMU*XH@PP{kZdPPi9hopM>3hHw3Xe zR^3uX$0tSg>oZ$L-H@S{i$;^-2n08!l4iPwn;%*&MXH;!hjtDL&u&G!?sP*OMH)Ct z*Y2ZoWK8mpq3V|QMDt#`c`v%Xwgf@F7HsbumQSn=fp7^_tXpL79Ew!>T*RCqF>+g0e4q6u_5#(TP#2FSq(h7gYD@ z9nuV&q$gCCXo4{p$wXO(aF9MB2>m-W9U`cm%?29t8kpx=AHLJN<%7Wbe%9txkZCGp z?MyUW7IQjP`pP--2~C@7Fb=Xe$QLqKBU)ov$NPYNlY5hY)AFYEP0yQNtaGNvrz$;! 
zICMMrmyXMgK-g>zG+!A>c~Sl5nAc)jlBwc+R!L@U%6}4RNF1VdyT)ggZZaCX=U%bw z9&ePsyR^|tdOP`j55~)6ckxESS1!RjmAC9!=9ewY-eoIxEqPOx?>Uxi#;Jo&kwUgo zFgt;`3=V$oZz&J$GTD`w9nmf2BdrNXT`V-zOc$xVvfNs5V68m^)Mx(W(zYj;><0II zzFIE{mQV8xE2jFM0}`Zjni!M==1*WualTg10fU73wEM}B?sR9x!Z;)CDbp>>%9?z6IthtWwG+co`naNK^BM?3mOj>ZDJcORYEkVzdf ztfQLL!Oqxkf)!RmmyNEcA#I%r?Xm`$<}(xh&}TGuR#C1d zV(77$jizl`%**P3zInYlsqC{w=B44iF>N=@37JYT!+8&O9`pm%g!!TE@$F_@s4q#I zukQG>dO_05j1I`cI$=P=rPh{tg{cvebk;^2yfUY0Dj_=+`e`y#&8Vl95TvP{NN;jd z3arUW1+2+B1+0lM0k9_PRNM`GNoFrqc-95H$(jXh#gnOltun>dm}2Wpv9y!P`Jr6Q z@!TJ<&AB*^12$%gB?f-Z4{1AcPzVA`rwKVftVFaf;J4&j35vb*X)`U0uK7MS##k{m zsy1}FUym{F=~ZCB!08ZHLv4P;XcWvwU9_OK(B(lr-nhT9*Br0W#PvDr)Jv%o$c^+mh$xT&LNc=lwOYyI<148v=Y7-do&*?Hx zmu0%_en$0EO7b(hWD6=Xs{>6Hs^kqx%7}+45l!xtjXZQ1`r0W0N)9vEOeP(xq{WSIhc-mwFQFT?ekhbv9rc*^kxT}$H zCNab;qsI5OW#yVDqKR0mzlLa1F{$Ult@juXgf_mrIUu!<{N*{R{aLweboqoCZkwE( zx5}YbDKsFt2XJL>&$BSlH6&WBiJ44om3^SbRCn*mNNFMhrRL5=q*spgZbkNNcHWH) zN&X=zGIXEmbCL32V=U2lP;NXZ9U8y;{0q{d7vx5$q1Bt>;kj=+ojx;-zn*kFqne_0 z=a#<}CytM-e4s*s%CVPpMk+`g^eHLwxVnMO(#_`0(!V^nSs~Yg7h9FA?SBI0u1dJ0 zP&!#(`VeP%(R=RENP@E{Jmka7OB^M(P4G25-Bv0d{M3;;D{Iu1pfCruTzu_aeiRe^`nPXB6Kh`Mab@ zS90)#><1U8R;u5V4Ao1aP82P*49nqRDctbOP#v3K^^;YN`8} z96p{1KPQKuOBS_k)?qjc#-ZM4f%1)=E9kj%>_dmtbM&t}rJmz*_(UQ+E{DgFa^nC* z>^CPU`2-I=MkGg%y1%U8jO2RhAue3cI1+53R~KGhEXS*Vn?HhOzM7U7wgeM>Bo@<} zz<6}d=_^;~uTRcg!l4*j7-?&jazW$}qU-0FQ=%fWAgfx9ojk|RJtj#a@#-~InleQN z{)R3@9f`lC{Eq2aoHi1#L;M~gUniu8EudHrX$g(PMI0?k#!4o#g{qF$H0y==*HYys z)n1B2#s(H{#dxZOWR!KuNLJ(|*?S<@yeTJY*IKME>R6dm?u^BLQY@u0D5DF>VTll4 z>BBLz=<~D~R-)Md2^JF0?zLr0^7TsgUR;^%CM{%7 zVl-FT-G>vHQr*Yy`i@EVW0DVg`w;h3*FNS;`-uc_Ev_dZd(s=?mahv;zeL{&x$nfs z=ePRK-1VK2>}Mq3nT)cYmh`ixwnWo_+%zB!9^Yy@A=#raWv#OEkscpTi*BG_9Yrm%Z)JOP@k-CdT;XG$wd7j zx&9Cg-)rYob$4BL_hXQrXx*59fARJr_>8Ss9-0tc*qA7Y$px{EGg}4i%g*}^9rxXV z)oN&!OFbvI+^7EDYH4&r1%CPH>LOmD(uU<@FggHB08#~MLQ8|+Kw-t+<)e^2Ev;C4 zbz^XozrAm(WRRs>JG#EGRn)b7G#LoLU3s%I5onVGZ8)GL#Tk^a+4P0g5+ohtKt(cK 
zxn8_E^YLYQ-;2P)D_=gM6pk(hXoOOe$=OYYNT+Z++%5HvOQ8wLJ@L3ePp4)0 z>JvWp38O`hEW({wkc=VtFYJsV=zA(7P-840U;3vLzdSJ(b|IZ1`kE$~kusg%0F0wG zolta>byqJd*=eNHA@UB)4hT(%L~?(FyuBbbZLE|IR*F~8Yuq&-!&OMhn~{&|L8NgA zXGS)@S^^RWY%=jOdS+6GO1_|c8u6T;K7R1^k!NN<*1SZWHZg>>)kYc}V&ceFom~a?lc{|(l{~}K zEd7Vgq3`<)o2`29z$~#}HX)nEdUO2cCcVwZ9j!GXruA`~br_ad)3rJl9mKL*@rjM| zM9{FG9&2hr73UDB$Aq}nM~#iLbBR*v1!mmMETgVa{g&6s2B=zkniMxOqBIk771%)| zjSc}jq{GG?*nLJwhrl%9F7MYv#vM!(?(#t+tV6(#=&*4I_J|SEAz(*!*ti2bW`uMI z*yB2E+<`r5gmehl(>iS2fql*h=@76JI&9p5eZdIn5U?-muyF_WB_pIm@F}3Nt6(O; zD*h0uOjbl~oe{*B=fUVxf0w_dL&p8`IRn-qlxVV6=(Cy4T2pU5ucv(sExCym2t8r! zOX!EU^@&x*Z0*^-X4qH{AcadUGxKV!G)wN+HO9|u7t)^3Or^qaM9uNqA$!F5Hi0!Y zrRUdsqOnZV?42oF#`s>;QycdwVfT25iTH5j0{Mncat|9;`5%<4 z#|^nLX@}8ZJ<%jniIFhlG>JvP{NMAM{sR^JGrHt3oS5Lm6A~jfq-19w^w$5#LJ}il z-8?8F^_O1SO*T%89L@zPf6h-IWKFJvc&0dWCV`+-z9urpXd)n~O{wCkIf!RKIs=yC zR2l~UIhbUb%9^37t8Qq59$qa5RD^cDp z!_v)>&5^{=^YYO1TjejT9!W;ZS3QhqZ0b=Z2%CsX{9C&G3i(Af$j^!(K4-y z%r`TUd9v*XbAGCB{P^Vf#B+xaoPeRfV~0;pPK-WxczpEK@e%m#g?Qz33KA@&9!;X? 
z)90?h;v$@hXaTX23MQXxVkga!e@8*0Ma2dhKX3f1=8K0BkBrI`X(OwSxm_Psl@KkV z5ueja=LLIaoTCnm8TAG;lcd&}m*N1Ua_`O`sWo{db@-K*?D(h(j87%cmF7<{M(8LQFO7%M-#Nqzry zQaJa&b!z2Q!rd&pn>TviAG$qs&)xr7pzNQOL;u|2d*tT4;ZJ;x_xnfw!kg$nCHJ37 zhN~gjQkUpDCU=nmu7M!57GRxg!-Jk1w%s(g;x9w7hgL4sv<;>kbRjecK2LJ{{*l>> zt(}S1lXC0Huf%8UWPHYAmCx7*K4S*oPh%^hTd>3iJBcN?N;FTSJ&BgXa?9ag6d%F( zHnk=i_sWfXrJ++>ji=WJ#K)oEOmja!4rV@Q)a^Kj;XYTf$gwR(GlQM%Pso1QZ zia);g-r~E93anjL{8;$+?mu_`J%M;w#APBuCCS&e9d6iw^~B~+!ZlKD-<{u;YX`Q% zgW!=C1s@->U=FOjpp03b+6eN+c`q^|;l+)Nd{PvZ8V0wD_DaL2q{!*Vr_^#DK4;+& z6CK!!3_c!LagSf%X?*13BfotNvrXRc@(aI>)3^C&y@yNBIIVx<51t+t{-)+&9|AwG zIl$p2#q#MoES$-A$QCHMLV75 z!E##uqLZ$FM4G^ki?*BAA5_9tSUYS8*_ND3jwQQhMMw+5j*$IiuK8`ZszVoVBOA~Q zn!9;~x+WLwdK5DchP?`rFR9r1YnNuGm@Yc(8taC9ufB3UPP#0rS3IT#NnZ09f49j6 zmhDVj9HDQ_VY(cl%fCTs;zv|um>9C@60}-Vku-*R+B3>DOnv=6o=Tj^iZn-7_y1UO z<=m^&y2ctaxf`eUkfpSR+VqC$%M-m$+ob=9lJPwYa#bxbyed~7lRn$tgd+wF@e^KD~JMmGe7+7^|2MQz{Phd^S0vaWM(@wsv z*^6_eRr^|t;^V@VcoRvcvSv&UjbRZDRMH|EERjmP?hMGK!&1q<&2t~RHkUre>BmWU zFi!f59(nRne-H&JA%gs>p+b?OIA9F{;}EqeT?JV&~b-x z3!rj;%toK71R-Lq<_0(rV>36wffxZXU{~h;5R-=(6G93x&>T0DBtZ;W(la4@5#vCN zi8KkU4F;Kk5R*8u5*i0&^N+?R=Z6x>d7(zcXGW6a7A7ZvH+qU3AJWz|=ZD-L#1!SA z5k!plj|4qGB;kE07ebnWYX&ny4G)*h*SUWzv;3d@ToOp$qFCShA#mErSWTqA|LmI77 ztSvHx4bvD5w!3Hm86nifrpk7w!7IHzBOW;6PARa^+A-+cr5C-klp`2x{* zZ#DS@q@9dZJqUZcd!Z5upM-nl>OE3bKM(^2Bm@7_&6ie>BUIg@rn*#xvgt9qr% zJ|$rH^wZMfREpSnLDgOC8r>c#6eWWmLvq8A6dm6D=Eprhf99{Ak;Z;U3Y?YQXO|Cb z+Z~b*CIS;xopM#DRM~Z>{lmJC+JDk69e7T9Zd~$A$o7e4e%t3?wyPvw>B%D#M^3UI z(+oc;-lI|dr&t(H9wBjznQIeA7{Pu-y-`w(H+`{BiuG8}oBY6J~pr zs@44&D*OsAsr<>wsksZ2lVUNdVuy}QLmG}L=H_O^8cJdX1trotl@cz(Jl4EO1e9v7 zjY(`MR9HK*r|kIp4TcOGZ1g$;4SmGDeq^bYVp9I6>IC;0*IL8HOkG`MFm4uTdJ(6i~hJl5Ks%zQ6cSDCnW;oCMKg zC(>HToIi@7pg2*Yh!kBO(1nrR#P4Uus~7>U+L5_*K{dTE{trsVbje6Vgo(I|0fHE< zOk^7*S!Q6ZOiO>9;!YFVGYPU05f`SRU^WSN+?6OQj(<+0`TjN4OBZOzLUse++FyT< zduR&@cBPNQDO_b8_SebkXtJRNFG{kj4>Mf*D=bQO?@5-`CmWiRd-f(J9B^vH$nN-+zUaKVDr`I{w5DGQeA!EzR?(1VpMSj7dxN;Ly(xSHmTk#{SVS{AJ1 
zA|*;a1EXA3z0$zIMy{sey|Q=9?%3q&9;J!lnz@D+rGo(Soq-+Ht4;=X zaaFZSHv@aPP(XYgLA}3wS#Ss?tv$cqyY>f8FMKFRb z7wuDqS(1Hvj%Qd9??grQde5zwzEB^~&b-qhkRR1N5@r#?>%x2XckS=_-u2z7ldJkb zz6;?Us~(G|sHKC&^jB`a^7h=#xec3K7&A-v5ao5G^LDVj|G#kU>3ZAK^$w(Q18Lm8 zG;ZH6I6-h~=>UOmkmsd-Jd@5#vzx{RvFqVS+>^-_i$y3{2QvgWEf=6i_Mi|{8vR1o zcCbJZa7Q`*g125$3svbS6cPZS1N$Dpp^hvD}f-j z1cHnbD5SQ8ATJm-J5+*lgr~wh0bz_!mqr1tI0hPn$bvK^ii?46Du-oO`nhBLY5rlT zO^7A;A5yHiZy!0T2)LuHL;NWJQS2u%C7;4}?jZjx|54db$`l8Mo!ow&I8X$;xx@Sz z|IzWE99KLP_Hxf0D)OI1X?F(m2N&y&DtIszIsVh1`PMSe3f-tTT#X}HQad?Wj zJ*M;>98@lYq8=Y2<}O06-tksPVe&c>Y=sJ+J*IQDIAr(4YH&0 zkq?pd&;p8uk{!pVGO%{&F^8KfyUF(A#(|LqJMt`9Fd`lpJ7*q1ra5-~2y+QcMpO1B zJC}yMmCfuhQ&WtvnJGY$`htGgsp;6>wy}e`RA6?F4l%D@n$gTY$5ld%$dTI_cE?ml z0X|(sk}QZ5Y5p1dkZa1Axe?P6iswGQ%P4z!WENUv5l8x zSO1NX3}M8mWQ#(RnqkUhSJ~P(H%@H11|{1d8{w?cq>*H!8Os?>(-BNtB-Tg4qSCTa zv=W@t?57r|a>24xg-4lbHof~7!L}5qk?CjDwF13sa6`nj%i$i$)hyYX)d@jE0JCWM zb5~W4Hvz<8Nx83(B|P|%gI%9E{G|~ul$|EppmGq;>gGSvq|wA!zUrhAzlE-dWCJ+& znFmRbS05eq98{b^y2RIf^fYRctNH6adXl@ze35p4^f0$elr#H1B_%`~7j2}a$&6Er zQ|v+J%n}&ae+5>ers$lXeBP0{YJj-|6nS!hCfO{yib8)yqeM~=>;(x%5`k(N4wqWD z0v$KT?)%~1r%Lu$t(R~4+ix6Ax(ZgWORg%(R;A7_8s`bj-tVFSmG{#WPx&s)!SEA! zcQDY=cJBNI#+hM^3)OQXUf!8Y=h^9d%0`0TA|0?$2WBtKgQpW0(I%WvlYJ^!$78$? 
zGQO&^Lc$n95jIve`SXx!EOK@hR{D?~b)K6Mzkz0vy~8xZPKaPSDP#9D&I}z5q#Vo& z;PmY8#Z}1!9}7=HxBvp-b64Xc9r~nP46E9+R{1l;Q%;35&}~#O8NCuo5Ip5)Q4lSF z4R?4bg@;h$Co_oyF-TB|QdQ-qYJ*3V^X!6id=i0Yrz&ba`aIVcL3;ny)l_XB>`TH5 zl}I9}sd84ZY6fWfwdo7j(Eqjzu-G(r1-%l9&zx#8OT)~E#SOEEZCa=4P)RJI3u88w z5%f7d2^>gqsqU{-6q*C$hvKAaLrWx?Q;`HrQv+WqJ5-FWXb(o6@0i&9Qud>QerGrj!fG&IK zGK5Pik6hqhy_Ry)B6Ag!|6DxfPd^$RGsxPIGZmkhW=1xZ4GBNO@oA9+OjE;Ow*YOL zY5Yu*L`og_N@ZM|)gBub)7%D$`%#X{pWY#8f-X~ZxkwkrpvLL7rYHDAmV|UD^<|*sMxtCK#v(wX4@kuy9A}trJ8dKAgkaxlhO$)6^>owy7Lo@K& z)vI&xtHRt9kVgo5o+RR{uFL(bt|9iaP45%ekdZz-@dWl2ik0X2$99fyf56o|;0hja z<)3nGGS~LsIPZVr+@Erl5?A?v>;11>(F3meQ*Q4AuK%$`;4M2GE?>Age$S`e$OEqS z0oRp{{|5iqVnq_W1(!UIpWwf+Fi;D}KIJ+dTWq`q4oQ&0iNlj^YqewzC9Ku5wfdg5 z_EXzY$#xXxg>IZkMq`q#Hd);u*(wy9T_{QhBX7rU#uO`H@ zb&}OVA!in`1`a|I5wVO|*#Q@!2t0y@XL(fgfP0oT7Wr+ZtVt!Zwd9_)>_6M!#k5qi zg}<;FeE|epH_n397q=W0H}Vu)K0mzP``*yILm-&=VI}C~kMKHz;LnR}e0aSF&R6&_ z*rJ06`D9%^c;gQ}FYtVS(&JaG1Z)Ek2tcQb%anWuqDOla2Lqj4WsTxupqtpeEH~(d zJlAsfsz7QcYx58dS%3b$v+th0Ga}dbORj!d7})W#JfbDY=WW!JfRMrg-0a?=o6_8a zLn-j3t+*2d`4qHsB^`=`AeGD}$j#+Bc02^l!c0ZzMp1$f z%6xEj?-pPAxn;bB4=lg7!_gh2iv?aO2~=i!?EKzjsOasXn?rb_m!c%IO&cggVj=?CJFnlm;p^IO$ldA zcE%(}+ZV1p-l7CiMY?pr>C&m{>?SSuzBRNmv{vv=*{w3kRWA!st%3P`b-D<^=^_AY qMG!;{G%Ese5DB2>;t;PwZ*IMLbHiMF_YHf3E0npyUvMq##s5EC4#AxO literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/modular_kernel.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/modular_kernel.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7576121085d14ce50779faf2680ae2548686b5d GIT binary patch literal 45735 zcmeIbdvse@b{~B4BJm;tKEU_mlA=UHA}Nu2Sdt~%dfKui`X$*OS`!!|E+t4HKwp4* zaKITmaXO@(iKUrIMP=I0bdoO9%g#5-*JZe`W&qO@BB-;%Hq)k)b~TWx z)akTc?QfrRUjQ#3c1CT}l`HY!+;h)!pV!{|?7hzqDk|Ipo|ygh=!tg);ZNyBeO!uX z`HWQ%ZV3?~Vu=Z3mSM}7b=Yd5cWcZxW*@dQzb)n%D;q9jetWEZ%sK32en-qT<{ox4 ze_6~k<{kDje|fB8xB_{U$12Bs!#)<~jQPi^hO3z06{{Yr8Lk;?J<8b3x({R&R^KkQ6%W%tB>u{?@unLa~5$~%)q(XAOYvXMgZj;JbcqPJp 
z@1i~Qpuy}JZfD>8_*NxZv^1jBAyr&DszfsUn1@EnC(8vvYFBy-Rr_!cE1?-B zw5Tu}4oY4Y+KSLNp={JRBQBNX_wyq!c-w zIQG;;IG#F^h>u1u1a0|>kSrymlTv6TJaSRW*N-G7;;E2yZCsL5A>L6$B=lPO6ygu5vGdQ^%sj>0x+hKLf#$=Uzg0*o@*|Z+tOwvX{~EEeHKQzIXK~6S4#_FGXwFB< zBzL4dB8-+roUhuoS>w5G3%c^P9BgOUm_03}!ZbG~2U(w~uOdbp`aBxHAo5|}$A-8l zCniz|4eBH|67>0AcEFxWrj!5}3SxQaF@eyTlW$iD6OH1(b zYjAD}DRo>6myEGkR6$4sj48!02$$@dSM$7QRhN+|Yf9}5A#J^+ZdF=LA|dA;(Rga-pxj9rSW}6hP2NnADMOYbNe#ygltujJE62i@rO>FH7z+{9 zCN~{pBfq!{`->wzhlI%^a9$SfIfafbZ+qYJ=DeNryMNd5JGw(ZHb?YVR1`$scvdluTB%5^?)&tdbs3xdt%W+OW2l^c+qOan}&86Y>4vmH*p z>z!`BYaWL+gYF6(Du9ytF^q7GH4@*O&O7V1QO;9!IOLv z&Myjg?R=&+wBAbJOfT3s<(kDe58OU5-<56dU$FNtIjRa)m&2VCyXKyG%T=(Ef9D?d zyX0#B#DVu^n^0DLmtyh4sGJe}X-|dXTmB7r^!dqWXT&NwE_t=MnupK9h<%l>j#a|T zRtYbc$|FwmZ=>Fb%P^Onn9J^b&6&jb$>*f#g^Q^}@yOFsWMV{`e3nh+Lvc~v62P9T1gXBC!&coz90w+nJpn9n@-@%|a*=W+w{p?eeQqk6dY2G6E zIy24=K3iR;v(=86Y|6^J;LF?3ClWFF7zLJ0!v`shoCyAsaX9*hS5{lFRyaDB{DE8E zo8H+}*1xIXKtQ2PXxsGWx!dOo<@Dwh>RWEbZpI2OdUFdk4Yzu4_7*(!<`ufP6)MPA zDY$)aw9K@8;=>zF(%QR}IiI9dfuc!TjR1O)iaLzL>-*Ro4HN(K;KBdp(%E}*9yZe;PD+P6Bxw<+?N2ly6e8@+W0!JSbr zka}r6$=0uklpK+x=dq}T&nK=ZD`OEFMlfJ2GOZoAgzJ`|H4N%p(3Zaw@n|Z_swb{q zb1^BU2z5+irpPaS&PbuHd0bF>(HqG?LPnTB1?3 zv?~Bsx|Q+LFNUvxu$dUSsFlv^BgIFeQc?sBCXRse!lsEm1yGUYXaxR?*cn!?^+Rmw z3|i%*fKR?+lu;3UZ%ih!N(t&(?i)`eQ`q;SsStoMF+RC<^?LW0#GOSxNkUp*7O-Nv zd~A8lTf+nT29oy>Ii?hxcPqQ|Sa^JhZ7q2(h$;}DBaEu$8?O@?s?VAmZ+JU8Ujx>Y zl1vmJ2Gw4n>uK6k&$V7$M^BwA2r3dckQpO<7Y zF{+I}zW0k4Vu|zk&Y>)#wIc?sqE872H9x*zJftEPcpE2(D2bx+2uL3cDd6=0jZ~bc z1|N~D70@hKXV>iCJqM-w8rGFLF&BI0>o1va6S z#-VeyIy}uNMqqX#=2A=o6!vT1`qa808@(d5$Qf($#5iHQfTlL;N+J)0k^r_;G^Dgd z$4qE25wj~5#uP^{Ul^a*4l+^ce7@Q;%uWm^#q-e%^g|%lhnTTzlR!FC7g+>?3Nd8} zGshxLlfr;}iiQk09|{>KqZi_6EmkiHEGyBNu_*x5`HD2kC}Oke#8JK4eWKR#Rbun8 zqV<7%7)#4LPPQz{r{LF@D)}_M5`o8-Ciw~YlC&6c_^1nJ`8Z$CCp*_#D0!T>C<_l9 z;K7er42bXuEo{c8WG|b%q}BKYO-La4fc-^q5ozAbj1*w&7N?&y(kO|6%C_fFIC(uj zLOZOM(WtUr^G`a&kF%}f9Seg=k}9SW_{oX!@r0a8@)VQ7H5yzT_4km;va&I_1%H5T 
z3?my~*ni^q887S_5YA(uMiLBuSHmEpXafKrbt0Bhmb?T5@zsQk9ieE7ect^btjyQ_ z`cUQbmz5nJ(5i+lXOK^A^Qbf}lM?1~M5K-9f;2YPFN#k~FHJ;2K+?1#UW=9;R_F;V zf0RmDN%&+1x_~^A#I}v|_Mb{nh=-;30cc-^_jtN_lwGAkp#OOvxs=5A}#V^_#`a& zT0So-tI~^LT!6R1tAl~a)6n20EEfOAXvdBb=ZLS{z<4FJf$FMdMkAVzAt?Sxf|waV z#A#omO(>G!xC~BRM4H564`IHvYac!QM6b#-(ejd|QQ!9D8T?J4kK+_r7c;03XaKcTTuku$(}W~prV-noU&7P-gCyCSLbr& z>v;3Q9FE}pinx>^+DJ!e<5His36Sfcq_l1_5+E3t=c92+9FI)^7i3&P71k(C1o=qV z6J`P{mmyKWqcF0y+=>uPg%_yr*?md~rwWQVg&5>=G=;^T7=rA?54*(aiDZg+g*~Tx z#gjoo!pDm!16G=Pc*wKkSx>}yy@}^I5$8|>h0J)a>e9#(YZVt&tXOAKGRl^Urr~E^ z0X8rd5b=^#cwt{d#R-N{S}0a$ry8N7j&qf20u#7|2-^VJaXcJLu0nShT5AG2V@-H7 zO2-)JFEDsjO#jSEez~Cl4Y_0cx1KjgWy?7#yBhCS{sKlaqtTV=#?Nk?Dl#~^>9}&b z{NxHYMZX&9r<8}n&h0m1u^XQ$U8`Yq_h+VvpC8XNj{d)$Sai0c4>>dL#{WGNOPg~~ zYec+;GW{QcMXB@gZ#EhQ)J8#^)b?=tvWL&MUXfu?Dt3Sub-N)%2gpb?B3?%p_@H&z z<6%QhfIYLKBiyec5bB~}-`GOMFagafIKHg7Il)a4gU7wt+l=?3WW;;1!1V^nPD zNh3B?#D`cD*O`oK4Ln5LFrYB0>p(FWbAoVgLd%u)l5>Vs^uWNBQ~0eqmeZ{uZQUY2 zjMuxA1A;`cj4X*NJ+lxmWwLjT4Qxg1g9Hd=C#y8>WYl)`SrZQ7N_^HpaR!Z6HqPS; zdqq4Hiw)v$1(W}R6qkqz3N}9IZxARY2ewRNLj{_Ozz|rkgOX>An0_#%f$*!`ej*Y; zaxsRTRwG&G2N363;pXzZlLLi%Fe=b?<2bXB49r-*?lEG5=rj~fWE}qD33f0+ zX;N~O9V;7n1?F8<&6~h05MmXW>|3L+Rz^Q==q&%|>#SqO&Uz*RGIgW9Ma@=(!}`V3 zQkb}>y&9BRuMi+H6TmIU0Z<(M3UU}{D69*a9FqtN8;s!Sn8bTdOK#SKyfo0ylad(i z=Zu@N@U`gJ#MlaEs+JMzQ8Lb0_o}th%hyu;O#8SV6*R~ihOj7v!D<0R6`IJz^)e8_w0^*A{+1oBnNG<}+}X3s#BO@5j_o~4ftcC;dc zq7d@lrxnHpi}O6iVL~G3$oF#;!>MjoTwL-+`ml|h^W;Rxxk%0>a!TY)9-}aFzK*}- z6;S&mZ(?&em)+$KU!fXuAq8tYgigFwZ=9MrRdB!$c@ss#q@3QIs)UJ)-rPc6W5Gi{ zuMlXub?WA+xn0@X?m`6xRg(CLk9>ZierutMeAR-x`I8#>m=FpptZ>-ka0KS;%L0B1 zmHmd|;WrddZ+h|c#)~IkN%0iKizgp1o_r|YH+^lkCF9$ib#0k%$+~tg*U*O-Ev)Tz zn!ruL0lz`khTe2p8+y}aZRm~5+K{hO=-5^8kB7l$t4@vf51up1(}o=YT8S@J;M$UNM+O1Nv4aH>Ux`j0CHjJhLk)Upy{ z#df5k8Cdc}JWNW)S|rH?nIOckk}8;Ju^Iz%O6Njj}||8;Ea|FKXF9e52%1 z>jvT*Ws;!$!bZzl{ncoIPx}VqLp?-;r?`Rm&4&0LY=+TTYcYg&YN4%$&`qd0#cac< z?~1gqGUivA3#0akh;*BgGgNsZo%q{?zb^c3#@`nFbxT|Lj6pe0ns1RFMcQl?|EP_v 
z3I0gXP|NPfR;eelZIyZ4yGpn{;(?+~A8Htk^uH<$Z$%xpp{?5y(u+E;mfwz5^82jo z&cG`9thOFe8wqZMtE6LdKeBVpue;X#x_gzcdsYe8d+!0MCPMHOBYO?~(s$hx+$YDd zL{9!3!v@zAyL*!3W_?^TB-y74uW^Ap2)!w?b}To8yd4vOC=$8@{=D1m~IHT5E9H{v&52-(66qmXYwt8^(K z#vb9qV{`zdFiLd(1VIAQH2RS#-dUP%c2KcV{z$c+qrx9a8fdXqpt3T}HNZ!f00|Bg zl9S0>uMXgEkcowl5bmu$Bpq(U-}0;I6^sh&C8REW)Ri@2x!i+nYtpv$VqIT2&6n%P**LTS5cK6W*qRm#!6Q!dG^j|=PN z^DQCbSTC<{34hK^GDnnp;U6>$MlcavA4)Gj*nyBa$k`9psF~jvGLa!>4q@z(8r?3i#NRen$ zPC51Eu1~x=$jeb=eb$YyvI_OHPA+h&wI4bk(tfP43<@f%Rs#YZ1?$qDygp32LthrO zoH%rtqy{nL6X&l)6B9|CFP}&;iCVBC*=cPA;s^jYR=}%}S|&bYbQD5t;C*UYMJ5n~ z*ox;<5R;51lM^K2M$(1Y9-y2>s*WO;$_0=~^g>xeQ@hk)J+COqX0q?ZR*g{`q4J5Y zB1w!Hw$;_P8VPcXe7&TW1DB9IQwEmgs zSQOQbUDp|eN#Zps)fbULJPFxpUPYzLS%nykNETUz++}Gzg&`%5fpcpRHsM{!xWC26 z!QdrBV1xu`FtkRXQH9=kEOAvrAd|IG_EoKo3VT=#^Il;B@{oN|Rf5xrWOgqH80{0}@op>10xXVNZq929?Q7Y`m3e?rRu zX-uQbpos+jXV7Di&%8ouu98CvNqNh$yk#hF3FR#(^On)xvS^@jPUeyug* z?9cf6bJexCx^H%WBREq&ZJ8dJ4(Gg;Z#+8l=#9g#pPaTVc`Dx6GqdOQy*FNZ?ch>P z^R0t756)FB)NGoQ-@f|R)%llxee%PafsFV0I~RX@?6<}~IJB_mcxLybAEds2_=k`F zvqv*$&i>%>?;ptQe15_Ee5Pg~=dZq1ezSbm@(uUQp{1I@tvxsQd}HtIOJ6^j@rpMN z<`DA0%?D-&zOjF<>gx|@y!-WH=3;lc?!0toB-6Ej!Mp#)!-lM;4=wrX-$>7-U;px4 z)rX|E)tT{i<`6bDGd26vg0FqrzU1@Y=(zFJtYvmr#@C*4wlfrA$Q)(rM~?v`5Qw){ z`k3jD9EoIS2Xe_#kUWhTliSwoQ9};@5dFAZi&QTQOTL;#pP2QD zi@x5huQ%iDWv#eBjbOUBAtrLOq7koes1b%7ejAODXOQatRk((@_*ne80Rhr_kEd2t z{A*P4o$Ep0B&Xk@DYc>(c4d9LGR|H1uTg%g3b5`FOdOcE=nZPox+_F!Hs3RC&Op{T zkZ}&&zvehqsp1tKbCa5(i^;8q@+(5gzh^opf>~cM;|$)<>MN4YE2{s^ja6Ss{`)i= zh&O-#s_FuyFED}a6?Of)R99Unesxgu^gl3#_13I!YsR_tel}omwFcbYSOe7bKO|`1 zy06f7UR4I>3SfVW>bmZ@R+BH>|AFDWy4o22`x~pOl6>+0SJeTqOpdPr>6=u~&Fih< zEbYHC9lKkyzAYK&misq$9e}Y(#eZ#M6{oa+LeQN<{QFnQazN;OG@M)k-`}8Wc5eW_ zl=@FiA>5Po^<#&yo zKTu`w?=t4PFt}P}iJh~7%Bt!AVgr>WqXH|8QjBOM-yv}{7QVnmfk5AZHeOwkOjYAD z*j3W<8M!zSCmS+4ufkm3qDsvKxE4<_Ik4U&Sdru&wfpkgBg(YhItTi5wOe7!Kq*#d z#hN#>MyoSVLu$3_W5gzj{u`RA%A~msQWizj2q00a?>_mG)-|7{sYajpi$I@ zPN>zxi2o1%Vs&~_I$|jmk+<#ZwY(&O8x2)P1B4s=jTvVHYqI<*ijrR;=NHKNML79t 
zuJFx7u=prU=Bt=qZBbCQGD5i~E>i{S{|wrg|JiO}rXGRD6=wX{dDxMKz2#MUj1!X>BQ!QI(csoHnRYv9d}15eF=? z&WZxZjM;Ah+m>;w44tSRrf?1{B~_&$NgW+CKviqjs-^XcHY8xDFU6HEbv39HWDTH* zM|^t!3Sm8cz3dZ*_KQy^u8KY29F8Pn6Jv3*N}w_-0VAcwCS2i@`whnIIA>FmmHn!|zn+gWu#C-EF~s&O+T1Hri3*?h=#$%z zJ!#Bh<&aznInt9-K|=a#WD}0{ha#A%0l>ifGkE!tnt_YTWgm^laz^&P*5v~``N-y) zdi%vOMR^J=-U~2j5Kl~8fDU{DN_#QVT+~M(IFC$TjYhi!R<>SyrmU+~7d(D-l^FaP z6h%|fFom?~Vkl2JZ)6j;v{}+MB<6=9$}HtmmrWC%(AaUD6|FTuT?7=V;vI_Mu--yC z2c^RTZgE4;NfZx5Ula2^Ax8ixnxckAsHg+Mnabibn$KTt#%K)`8N|t`0JplJ_837% zACih80;cmyUpO?P`bH0{1*qr_y^d_>4=UCIOcr4IAJJ0$kIDIMI41gwehpBZ1DIT& z#Z;QYPdkDK*~5syDWmXy-mL?SsWp-TKP*TiSg*B*{KSA?{$*^AtDVk9Ebr3Ri6>&6 z#M!R->8u`$oec8RS=c0G=ce0Is3w2R>>OUA9l7K&wQ*=mTUD!u7U4SCJhXrA{0V3! zPuiNcmE_OPIpxn&f7Ljjp^Vb}Q$F=0&PXjdBh}8MVLGmSNS?%J`Acx<7coA+Lb+_GTx^N)WZoJ&0vm_Oj;S4vO~mqE z%2s=;(z4fr+|Pz%6B4%vXXmFc|AaDSdWF9OUy=qAEw1cvMW&AiLF(0bUz&B+y)=k_ z6^U??0^^Q#Z+vOyOS6Noe`T&J>+778-oEtKr90KX9{-TBfiu2cIZt4=?n6)e$B?W& zF>`{XYZtwnv);{f()_bO^6tu2)X(nuu%ct-_tAMt`TpRC6826Q<_-xa8!0Tavx{pM*>9n<4x?-Y($Pai zA3f^%hAnTG;#ZRS;IpSroemuvI&}EdvC#3e&m21%I{n14(4iw|PCR?)%!wz4LPs7u zJ9P5I(4+EiBKZo$+sCc+>V($1*k(zBtFQl&wBTN=zZ?})Z)Wuvk#wL zc=)-3&8nE%ECP*&X|E*0*b$C7&D5?+x2q->f&Cd(uO}Z~4_jU*gOk)ACu`5gy1ahA zf}h!k_?bO7`7(r9pd$ue;+pa@ah~%YZSfu0nRjb1gUlk#h#yhZXAJ-ZE#Wty8LRD1 z#Y#J5vH%D)H?sqyqnJ8UBnnvq1M(0SPe-pYBMR7&k8y*-T6R5qyfpJ5bFkfW1p>VI=_0DZgcQ&3a| z&gMDxu;UjT$LiL!Sb?jmqSW)>^st`3ObwO@LTZP3Q=x-@KYY0^_ zMrs5%RuCS=xMbbI0Bh(F5};XO^K2ymcfFR}yGBgO4X~m8Fu%;=xmD|-dldT89a1C|VK%lO>Ptj@b5oV3g zH~_Vk5VTS0^gD!#%4Anh1-w>60|j9m_$qd8N>3@23L2zPP&DAyAfGl~n_@D_2!s`u zay3(V29AlL@A!&*rpQ+L7lUG-(WrwVf3xU{#Yj>G{TpPOZEd3&WTubEKWV6i0OFFHtk-!Ofmc*JM3tH>UPTSC<%=kjo?2%W<+Pli`CpPl=z;tPbJ@F zTjYF)oKAASOU^!W7*X-h$oC*QKOl$TRH}JL0`5%8JqkoC`DbXLqIp+<=3Q{9F1YN# zFDYAbr7dV)Ex9T)zV1x$P{wyC>pJ`iA`~fbG3BhFL)4V@;0G;IQO#8M%IVFizPad4 z*CwMkFKLb?ZycGenH`zinelDOy1JJu=nLt>RTiukN8jw(H($8@!u+mm>vmG^>f@?o z^o5kWlsISRHr@Whd~LS1Z?!nP=g!cl-CFY9TJpiUF8Va6eHzq0ZJpixX7BA@fWL<|osX=U|#*}Wz;&nV>$u+eWY|PW$`*!-RwDNM`(DzP#_Y{9w 
za#bulD9R@-Hix_5hAw1g;OH{CNY#nlZO{RQA8q27E)>e?O`o;&=2qvhhu*xZGHwOE zRjSIkK6>*DfrdgA`Kk#DHRP);Qp*icP@PcQcGadzPX(Vg;N)vbx3O7%UM8C8`)1zo$SkvG}q)CUz+63Qq z9BB>7Q}~WsaU9^8N-w||e zd7PD*^2VH}>Wu?42VQ?@_Nfm&ZMmAp>C;O!jkBBQY?+#_jJGT2;Z~zZkV<=qfTw4f zHaDYQD>TY*Tad)8>2sYn=$Dn5TGc1(`Ls2cJkW40yF}_sB@eXL%Aup?hOKi~KJW}X z6#{Mz8^BN%I6N^rD#=Cs544nno${ia#EMj*6fssyzU(|pTFSU#TlZv}X)&6T(l-d2 zE@c0*kgIE&wtvmT&psWDZ{5$YW!Z-Bd{AJE)E;6%u|Y9}fk5Y==ydG(T%AI{vL>yu zeW&agG}NY&=anbS!`EVRrR|rg=7X!GG+_j%t*my9tE5negQ0F|8>Nt2Qyw+Y>>iY~ zo!GJZ6+7r7ZDLr7RcF#-^rXE>FdI^7%P(W11Z~{3y1W%7gY`<4lk?>Kh@36tuudWT zEAPOOXzF_2ekCeh4LUgP`LBxNwn|q%*l+mJ?E<-p6enLdtLy~^@R_8JG8)# ztgmD4*$;i&v=EYxo;&uTFZko8_BXfP-Zppn{Vm^jeAF~Fee{=3=6rqADaOJ*FncoN z>sj>mWqp0~6B*y`jB_^|rb7_oC+%Fr%{&ZWWmxK7`oIuW9bUUAY>6D=NAhJ*BwtQ- zzD~`31aGejp8%(7E|hs4dvbxM*`0H?*}XSIw43b7_3!+4?420*A9?KeAN;2eGC#0G zy3peuHkv&=bIP}9*=a!lJFJp48+c1N5Qcf@H22*Y+OZoyI)R-^xdm8sp^ z8<8C;0n_4)E6#Rd`Ju}@T)0oieW{KLD}GFs?;TA2y=+<-wcxfLx?|9dg|HG=Mc^6< zQi>n;D}K7xqbgEKS6oOnQnggAsQcR^P<9`#jrdV2joXq3mtwdt^_B$UHG&*)KVNmE z7JmW!)#0xme+~F+lec1xS6Qkpf=l#94VEk)w*Mq{Hbl7t9(=2tF#(DLvJ}caLZ>jA!FWEd)G1O2Q>A@&!jRX&zsf_bKX4MG?cFm$^|q7=vmQ)HH!a> z>|S*&-|zk!#K>0rZ2kW7tkUAo3yHeO&Q~479n-=!%Wx-l`#t%9${Ib*?_WI0u3rrQ zCNK%chhnEY+zh{B%ajC6)I^FMqeJ>_Ibd~hYtY1wWjPpPo1dc+4;YGUOvw>8$Jn6} z>{gQ?lwubfEHgS+0`nK>swT}OzBo#}SoQjq7j?^V5WC=?pz(&AB$Xo~XmO@sLLPY$ z&kiiuv2Z+mflip{QWedzoXYN181$?STq@**(lkZhi)uyJn?lkeF#%Z_W+V-!=QIX! 
z7-A>!3#c)>`;ukE&K6XuAu5&!s5+!{^Cy!&gK0Cm5l78Uxt4=~0Nv9#=W+8}c@){b zqOD#<{FpwNfx-ZFfr(&)Ed%>cHOsg~9pBHfF-{UBi}l_pSB7@~96ZLmAypK?h=A&7 zxZ$f>FT=f(hOd-2d#)=4hMgftc^B_Xu+9w}$}@;*FrCOr4$WBN8g0@Dkna$_)q!kY z7CvqrTxi_|)NknH_FW6@d!~hcvu zaFld;*!@()=yL(ofOF`5A3<3E{Ajy!^3vqHPIh6M+hZ`aFa6^9F7>^&$B#R_r zh?HWzLv{s!#bP|qp8@&vHQ?SVtZ|M{wk@>Y02v{KH0Y;oc+W)t3Fd1v~@$ z5L4&PTl(cRO2g%k4oIuuYAS~7{{%s-U*tcBFYgbP+#e;sND+uzop*&oY^ex^*x*rV zrQ#gqzd%%tMtYgiND3*fuD8YGTDAA_do;4%LE^uH43$@KRnC-Woedv3n{wigY5Pac z)?DjW`fbX!_0n(ilGD5BY|T1bKXSG&d0Q5}{aJ7Soxa7vr?Z1kXP!Be9X$J?_iVvd z?s9_}<%b?A>C7$q2D84wOy$ly136#&qK~NoRu10j%K2Ir!7%o1S@dns`nG2(cigeC zFnYUV;fV_6wu;@$<;e1nyl2^rD&jcM@MFVw2p(eeT*7bZV`jPnF}1;}$A4F$gdt=_ zhFj8X@g>p|T#rNyH<7WmR~k)`dN)ZIOpGh{qbc{1otJP&11u)c)sfm22S5iC;)NEA)|H$@F2u8Hn07eiquU&*aJYUD6Z+F(Wd%^cW#`(a? z5%{x#Ok9J^MlE78bb*e6xjh9fJi8^zF&|_AL1JW}JKZh5nWa2vRU*2_mREXvbBgA5{`*g!_D#Q6%h zsz@&_6h{z=|#% z*E8w2&@(57j}Z!dnoM|H)ny^T0VC;NF8!7yQxcGfI0FP0nBg_LZek!v$--9>(Fnxs z8Tt%7aZ-{KMHN9wbVQ9m*f~~&ggM4drhU2O@rp*Ox9=9`#yIF~OAMChZKwvHvA+p$ z<24$Wf1iAfOzcmW_tTpxSK>PmD0G9sG(Pff{bSy(x3*XuGw)g}sdwu~-#*W$Hg+R}=o2)_%yf!iGo(un~FXQaSL2S;~IJ+NqLo?3J zY+mGDm{D9QhH**r<)?Y2^JS+)qZ6^1uG~WRl8=Wjmq=HScu-;WalylPLo@}ev>hC7 z08Msa>@uHXI>_`WB(0fDI)~I#o+)pN@?wv=iceLT@0)36aB`{=U%@3TeN07~buPPa zl~l%-DOm!^bikxhIgA>)((ZR@4Z*|AUHPY~ro5jaC;O^7d8fQn)#*wz#PO1kmuPg; zejvg=ou58cgS2KSs#)5qlC-sHk0LtWm8wxoN&DYb@Df8+H_#lhIP&!{*rN>gDrR5Zb?LEk$#gRsH9LNnwk*9 zb2P1+#JFLlS)nYXO^E%fK{!%NBf?vgw_=N_Qj}tSNzbtl7}*t^!R=>@CHGTmS~@qD z!!wd3zNkf2R20A`AY}~2G9bI8M$J{0#$dLd^fge7W+Y3m(Mk;)u}8~(U?=V-(iC5M zl{RCIUZG-#St+ms7Jee>mH!I`P(RB5o*ae-eTRJCCFc*wSt93$K)!z1gPT#ij$IJhVt$%;pE z-fGiS9Z1#6Oe!UC@Lu1xSQpIJ1sCi3vUPpPdT#X269t=4-@hza>)q2H)TXX=G0>e2 zbT0vk+b-jo~n-`A$y>+uyuRZ+tQkkyZ;Ai}9*ZW^t*#E?b?N5B% zuzBwCorf11j?A1`YG_|<2xc3Ciw%9*hQ6G?eZK5Y6d5&AVU3m3CkS_Lo_&1YmC+5p zuu+$HDZ6dtn#{)0r|4sIvx!97U)AmX1J_?pr|2Pr(rkEtie|y>v zvRyl+f*C>6J+UZn+7+>rrX8dVGzG<#bQ$xNr|5u^J=mGbCgEEzH*I}(Lf8Pi83`K% z9bD9an*#G|vqRCynk*BqB!TULDrAZ+?7;LdQC85!U@JQ?o^_~lJgJOdc 
zHox+gWZojNV{~nH@kv#y$S?54HN+PoU8SX%7%+?bI8Ci@qFR_*>Uhf4v}C+3OTLCT zzC82g+3*6+P{9P~nz!H}HRF6f(|7Rw==T!eO=J!ZeSb7__Swvt=Q7)#U%(e%Fjv)d ztLYgskHEx<4g}BmmWzHLzt;~A2Oj|zo`fgZm3}yYDb0^*&dTZ#; zzygM_ch5993c%E62d2NAakg@ltDf*GmFhFdvgRu}gjBh5c?iuY8lBY7XrHkBbvQ%? z^U+Dv$IJM8#=xM&PHF>2MX~WrIU;Ptn9(&6VZs7Ko#in9=`f>lA~0C~o?RO;WdWXP0t69H7p`1e3eS86>BZLY;iHcxl{Jki!;vYIMG4M1>C_ z*BnOqdXvBqu1(H2krw011oX{B`L9(OHk=!&WX{yDNt0DK7GOHo%o7m;n7oFH8wOpb zZFi;(Hs7{pyj$#wW^H$8oXuFJHg&^P+md(FqIX}`yYGYAjQ1GU zG1sQM5D(je@7?BqiJxwe5^=e~Omo8Mg!Y>Fz7 z{4WT!50S$JW*>pC=-NaLJ$GU$`8-UN-`UA!DZG8Ote-YV6%b4Ace zyRj(P;+C{~%9i%PZ-w78WmU)*Vjz%RR_o-KH0I42;Ha?9$S$i^E`Q<{ldsB7&fk#pcjo*}Ie%{sJTQMJ zHljuU_N;$95WK3&yA@>>l{tS+&fm!5iGSqpy4N67bbaLA%&^Dp*{1D_O$W102RD*o zbLI5Odo6;vb$;7-CO&FEc+>O2(eH1|9C;S3iHhdwqc?g81FMB4%8zT>zIhNW2yC0* z^-=APk84|h<>bGvZTRN#yJ$$`H{2^Kmh*QZt+*r8{=m(>nS-aNol7k{GA(;&PGuf? zn%2euWnFXQM5cY<&ffRWWqcTGZK1E5d87 zuQf+3DIksfiCA^Ep&qLHRKcO?=INGc>vSvb9k9I$X2DF0qL004rj`3RqXnCO4gPA^ z7RW<8Ec!M+6^L(Vxg5o?RM&PI64Zl;y$r^O_xTs?y3yF@(rk|*5+IPtRh-o^cD`NRGWS?dd3E6OncuedspcRi||Xg?Ls<$^&9%?4E63t+M+WP8kV#;E=S<}iv5-_ zQ>IWeH4*EKV{KZ7c*|*F#>OZNi_VEdx7&!qn9(sp3?G#jZJ;2);%9UMBN57Z6b%Trja;CC{$(CXdIzX*rd(ai~T_s1y$7luGr~(r*V>{sA zp)`!ovX^{ua<;?CyO|7Yh-^a3c7o^@3Ux3j$<-9#A?Fw+_(O7tCJMIkZL)J7xL@^!h_`{vA1NU~0XduhlSkBiZ8_0jHTHVnFm6 z1_m1wbYLioFY=9brTfSukMN;VsbryyfXS{^_S~XzvOt63162FJ0&f?E&Mk`_JF^`- z7dwt-JC5c+(N*7gdbR=hdMB72ZD4lnxiN4fJWE>;#U03Y4BXkW(6Ns`x^FyQusdBX z@6{9p{Cp?yo#gK_HH=T`m4%@QZ}p9*UVAWC)5;A%cf!zfQ`_{(Tx09>;~zJ)Domf6 zJCA+Ta1fjh#It<;p^UfV#zVP*-HQW{W(OW+dwtC<_f0q3@9T2@D)v9XcJ>aq%K^_I zsM7xFle0~8U%1yKG_}u8{$b3>ng_`X%N2d=>zqGWedvVjT?51NZBcKnA zP`y!+^COq0WlvdGCG1uMHP^g3w`n)rkDI#ZU3UiVgcq9j ze%!caF16UbFWbFuvHNhg`|t;cK6q)N@kFkro5g=1*LR}OE>vv(OTk}J318DT+_`47 z#YMBvoZcB_^iG7xT=Vsp@PLv>#i^6$P4IPfUo1&Oua{P%dDfzMJS#I&%YRqF4j5fS zWOT{%${n;xJ3Q1s!lDXBHa%12@&9L)cAM_&pwIrAb;Ye7g4yxKD})uTQ&$5`!n#!fZVhx(gr9#{y01I*JZ{}^`43~8I%X}>0rW1A?Yn}Y2TV1CZ 
zHKht)e0Fk5O{wKqN>BCTZqw`W5k^*VQ7lzNfy6^%VZ$+p37K^cLxXQJKOoUD^%0R^_?os1Z|5&Z%5YKvFIgJ zGdVw&OprmGXfl#ZhqL~}3;v_`D$23G_!%YE2|GA+i-6d*DDKRPJMWxc5D(w1Si*|f zyd&GZW3l<+Y%`4Yuao6+MR`l*^eJexk${3}rZ*~Z=3Cn~8+-5AM>T_V#@o7tquf)O zK=5w4N>zE+p{oA1xAvDFU-I}CJ#AS}TgJ0RQSA~lHN6>cFWg*>IJYHJ)06RX^R@YM zphz)Dtrnf*(y~BH;xY;4fg~!E38h9r>GN`hjv1;GCJC7Zux0}KvT1fEYtVA?KopKx zQzegX*b3*WPR>;)mI_;3XbhD+z(Q%+;wQ7pqRB`yuw6isIhZz;+2Ug{6Z!*bQK>Pi z=)Bj<$Qj>zws)00ru=}3#sJbQllFl4;dODVi#O18d-$&~Zne3*iuoBc8#ChSwq>;J z=yXzwUbvWoWQ1m82I~WOHQeIh1#V<^Od0`0CYl^u*ZxeAaUOG{F$}v_?Mpy5gHAY2 zEs^{mV;*LOnG9N@iQle5PqTmkNoHyev^#OAs)1Q#fh#Pns2uwh*>A`aj4}hop7V)B z48~F*g+oRgddbitSxaPP($((Vibyox$7Za~LlecL(r9=h23teuh2!B^(jdn7x0Ys@ zr6$PdM{p>gG<2nAW`!HdQTI;8!Ym)oKO0LkYYWDE7ZTbtDhOP6D-H8D$2{bO&cH_!x6to!V$p(6eSzHJ8 z)GGSlxH%VS&DD40+6QuN+sP9=27jSh2=puql>zs(3$(4jmX24MygyD-@6Zg_w?^Pk=c6c<)Ir#XIs$&3-(hmk1)9BuB*o3UTWG^aNw6%R1Xx&$>$Ur z+un5EcE0Jm?VIn)HuV1`Mf_x|!*SSBsI)uw&YdBf6MM-t#lE>+WSU}Mp|aYsW%ikZ zfSa@=J&o8TBg@R&ttG#vQ*P>>%p53798|~!m8?Sd2f0P1@l&t*T=lTnrjMG%*(AeFmEjxu?R44on~~Qp1n7a{gw&3-3nG(&fPvY zzx&$<-Z_x*J&<+nU2bGi&RB)A>VnnhfT4m0GWk%j!M{{fS8(7LwS~Qia`NGrp`qX+ zpPOprAs^X?Ar0eCD)2`39C%K!@j$*dhdWnWU$EhKsbly1_H4(2f&(we6j1iw1_58db#;AGfmfvn`tXS5jK%S|WeNAr!%#}W!QPOwdJjE+*RO)- z?`|@B;o<(0>)?4HzYw1L>E865v*Hc!j92-&k;*!obIz7o{BA1#RB+i{b$4xL?y}`7 zo;+rK!QyCKs;Q$>-!CpDAnU`QqBWcz#E4>-J z9h*P)?NjfZ%CzmxHtfr~_AOVjSkGE9a&k>#uCpt*wLjN4c(?v(3r3eZ25?d)Otd0Q4Dgs%pHJkBiBPHMqH7#Jl zKAf7KACZX~WCpGBKA1=+i{-e)G1*U_=+q_eA=wG0Dcp5#%&@#2cj#VZ z0)Y84$Wj7Lkcr;oE4b@QEH>6phF7^5_BQrDmWW6sBO{GW00HS|7yHKe@AzXO!2UPmCqK2?Emmwe7VBTS z1xxp*)-w16IQPz3T$Zh~se1x`Ff6Q|Pp#D!Y}9bDidj7cdcC)`#?m>v>7Ia}!chw* zXU^{W>Zw;wDL*%MW$pDjd&4Y#+l#|kH1-`UkE0PQPr#ZHs))1FFl&VIL$ zxmV8JIGS}dEISb4(>lKe8!eo|7QtG6&-$X%;+{SB=3}=Xd-I9gPb>@cPVFqe%RaFo G|33jl9gI2v literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/moe_align_block_size.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/moe_align_block_size.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..099063bdc51c7b3cba04ea756647bfc9db8b14dd GIT binary patch literal 8126 zcmb_hTWlQF8J_iCy!P5@E|`$`!l?qnOLo`sZmuRlO(+2Z1X{vHv8?TQ=QtjFc4wKH zO>DCfX~hGm55N=Qr348HP>>p=RwNP<&=-(+=(>r7yMhX-^Z}_4u9Bxd_505`Gkfs{ z2hp{6cJ|CU|M~C#|DFA^zrR<*%ZmKo{PVD;{gb}#{2reg*B_%96sA$7u7(us~R#w zOYn^)&{)Fg*=RN5^m#-u}Tklevli@D9dLmA1#*r8<$UK!cS?*u>O}GP3vWN9k$FA3Cj3HC@y>{-xV&g>brL%ys#x@B=I#fmo1>%?2W;qJ^VnVyLWi-J(WF1e<` z9hYG!;8AzkX)H9DUcy^SN6&?Hm|8A7cG)p?k6ViwLFWhu%Mc$bn5mQsLU0)_E;BvI z;>=40DK$_)VYcou9m`6#2fhR$qEM(f4ljA^p(meaj$WE);3qxKTwBn8+SfL*J=@c*ywp~Trsn6Tz)2Z=KrXX{J!iQf zMDBy<>3OAluw|G9uuC4m^};Nk!lR@>X(^saTNCUA7#89(;W8DM8(6J|ZsI(&U7E!7 z6P_d^_@&`#F2qnq@^@|I#E#N7k(-h>MfPmAHVMTS0(oZ~yC^KFRPC1a0!5A(LtcjV z#R4WDK&?$q5m1p#G2h!s^^TEYkI32TDI7rhRx=5u>VQ%!3D^sYg#=}?69~B6Bj(cM zzB*dV0;y!i(lM1RhaknOGqW5-96?DH5Mf5)^%gIk@@CcU?yT@&WCTz(U_tDp`!O$~ zx)Hi58T-E$RVZ-8cKS^wLU4OJ;$8Y1;)hC^bSTL!{7f$tY{$TqKpoN$rb8J4#-Jrq zo4)+Y9T34ydo@W<`qNz(c_&z^3>rAK5Xz@V{v%^0U$$Kj35+z+*)!!lrq3W2h04Hr zDqtOPSaC>P!#r)e=Co;<-XfdEe^BmKK#SZEWGd_!BnTA(Y9o=NL>wxQDsfJ8ZH5$x zICaI|A~PYEA=ANO)aFR-Z97x+Jqh86L&2&@OIT+}u4OcL3|;A!hFQcqlmWS|$jr!r z$d(0MSlZ1+@(O^&A+zWKMUqq(Umsn~|@ zuzYt))P$yxr$Ij{KFeWfRbJSvewwrKtn@dRjY^v#3vwAG_lUfB5Az_Dn-pYs`xU5R zQc)e-4L}NPn1JQf9=<~zcAE@N76I31M1-+%Ikk~qTV4C8>9}5s;@Lw|$6T4@X)|AM zDWqxokQb|_f&(EmY6s!9GoJS*20gJ`-#LT zq_L+dwxu|fSArmz1W$WY?10>6mt+thV$&4chdW3>-|)XD#rD$|#X5QD2`V@)##Cu8 zrTGH-87j!5N*HM^s@w>2)O{*E!TuEfZe`j<;iWYl-ML9ofr45hf{3(NB^8n)r=Mv= zS+H?Sv@;VqVVJ!bIJD8R|ydK6C1Er)vWdepO> z!fZXti)C-I9yLo|F6$5mbEpBYM+%7fdKk!~ep{!eE7|K%L+Jp2X%02&4q8Xeq2|0k zP}CRl%_&f1zk09A>x5P|`csOsk!!#ddb*`~vC5w=ktSox*MEOA<`v6hWwV_2(D86D zy-?YAVE_C+K4zKIW93COPo=TbmQ@@p+6K4K9q>X0GLKpMBATAD8R{Q>QL&v7cS3zF@B}AJofnV(Brkpw!e1|e9cB{ z$@@R;y~-Y~#fQJ%bo(2FuMe*DjMU=ngXkX_$wxW8{q& zUVovsWB=uQuWo&?7P&20v#k~%d}q(Q_q~1JrG<}P{P4xbnnCu~&Np{nc>YfV9}Rvu z_~~<>=ReC|+x~Pdvh}-2_g9gj<;c)VeB*07&o8|x z4-zwWb;68B=%PZBXT#O5IfV&nQ~z}~QVll{6VOY590^?lJ(9PHS4V6}=rKT*op53` 
z(u@=d3oc>45t<)`fxC`rzYB>maCWwPoxKPf-uhngAEsu6e*~?bbr3GJ2GrM%`NVgD z@*;W+4lPoQETv>qVP)pDA4LY0 z=MG{LrT3Jo0HTEiSwX1bU_s}US$Cyq|9ziT`9e?=1zTiqU>&r*MIBhBfEE!DuGA4s zc5J4QpOGI?`b4iuU{_}b5-|~ha2A?-i}V6^!XK?j;P&q4kCn*U^bv8I_j2QM=1bK{^R&6aMK#SCFaJOVzQal zfT=Td5+V73?HH=vP5QD95qC>Wv+-cBHlzfK3UbIJ7oi$go(ky&uL3lzTq>ZTphpHr z02FL=$f1rFL?;G2Iz^X`BS2fZy>!PGXz&f5k#R8FCJC67oXSHWsGi#RT|TW{O-W3Mg7VTv!JjIXx3_Hp%QdI6GT*lNm3M0 z2|nBHB-t<7%ZdtK^Mu3P6O2h5ZMz{){CxjpN1pM(B#&|eN>rE;3Jj@+VDrn>p6n)CxGP)3=d?7<^0DO)xA04Vm$D$0+ z7qC?37bfEN6fYLpJagIu=;T@AV1mgG{i@<@q04P_xgD4KkVsGx zJ+ApR0>JoAnkj%Wqv@d|lHMIOOh39f+vze$mmyr*kgbECS2bsce|e5({1q2~&m&!0 z&-Sn4Bg^rT^Uq(6|NPwHmHv^h`tM%uzx&)#z`Nm*cjC3=q038G*|FuJV?dDUeEjq$lUF8dzk2rCrsvKbTS>4B6U&M8rA>cJWNVRZEsn!41xymg|1LTe+L z+VHNAqn{+MB)&L$ZPQcdj(xrT&KC6f%(d;0`N;FG<;Yztae&XwEAegOikrb3{oO#H zo3*~%o6x2+0-2Is$B!mA2qo7e1lAowl~T)_=zB)~=9$OuPL?ZACI-Z~7|7rxW>!5D zhZ-^2g4V<6l`_qJh0j= zZsQs>-}idod2e~+j@2G9)~gL{TNUe9`?Of%xB7OqmdGtf scVF)pvrl$CLpyCBUJZ+zof&oWz{8&({Oq9o@@=&DW@K(eX_*lAe-|3nt^fc4 literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/moe_pallas.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/moe_pallas.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fdd42b0029953513fc056d86c714750616e8d6ee GIT binary patch literal 4926 zcmbtXO>7(25#A-2@NL7F9D`VQ7buEI!W#i%%cn&r~?G3d`gc2df5#Fklh-nf%f8_7$~P6+Rj^YNm`-O z0)2ws&&<4eGxKKWef(1(;3J@k)Tcs%A&9@>h20{}$qr2t#0J3=JSh^_8zY%;^-4B*9 z@E_QN--GBqEe-R%AG@x5?~w7(OYGb=(tE?QvzCAQaD42>#PyH9Z z>-+hOZhT=yr`qx{P47Aame@M@BMXocyr+OEUHyJSKW_S)_IJ@jq#+@9x5p4i+=`Tg z(3~2j5s#bE>!zc_n=X;Hb-hQ#y@k6z7TTLq1}+t`>M~-N1x1yXWG=%>i!7IAg{=D8 zG^?VlBFPE%eUO+JvMl14kiv2@VtKSEWD(EK-@)50b&wmdlV>44*qfY~N6I97OHh|t zMMXLGhwN0s5-S|Ca*5646_uSw_&imT7nT!^BYE{s4kZd>Seq>(B1M5?z${5-(T90X zge~}-gxL%BsfbT0Kt|aGWCj~Yr&I}O9O#vCYAm#}4i@-6@#nsC6}k<4U(Gnfveh61 zn^cX&S($ESK}!U(^ohfkg19=by~JA{k*1RuGI59L&1V%b3l)&*RPsQPaRSL+=oD-a z%l`9cNi~y8=7ijFm6Ml{di-|&)af@?PNAeI%qMeq)MY7~yeW#AWJcnVm_oPFLLTCk 
z6uCP{R+5W(1-#El2>2+K<3y2D61h7j({R2KodTO`-EZQYC@pZJaz4>15yU3Np(_z} zHxXj1!MG8OZ;e!flll)X8o`Uz-Wj8J=F1Vicc#)itJAYPAWX8_R|j`onDecV2*oyf z)_XRU@|#`Q)d+|KQOF75{0SIxUB>KY?X|=TSDLsGN!v({XDG{-@2r5~m8;rIeIc zaowdWK}VJjY%rQZ4 z2$$GbN-2KriZpx*ogWGqh+EOyoz7GJnCshvOY+nKB8~5FSF=la(XWx+&f}eyUPg0t zoAVpucO5YSJ>3emU-Q{T zxCljg4;iK<;{=(&t^mLi9;Y6gn#P&Et3C0wGz7I^+D0cGQX0im@TfHJ>jsR)U@@e5 z-?(qs0lJ*IJq-PN=hK!`rLnSc!tiZS(=EtHDX;0SRTRWr;}~l>={o1T=BJ{76aK} zddlXuM=OV$4{gI!PZC?klCgVPA(8 z4Y_NDW9`Dv3JRN*R94C7auyn57Xc?VumC$J%_L-8Bwqd-+Dlw&3U6aP_qK%<*ei2$ z>=>jv6*idWX4(g_WZkZlMT2uiW%27DabS;V!r^+lo8y6aR zKc;#XMNS29YkB~5VT@$DIWTAmnIReF5T~ZBs}>P=6In1r#^uy>N;$=Jf_0{IDU&hD z%O<4=V&3#X(6C7|{?y|8WL(GOA>19r9WGi83vf}AalMfbL1%hf?wUSH7M4;C$NU|c zV*1*bVfs7rzzlsS8#_Y5;9E=GLc-EAfP}_M$nt==6fWs(b}x@q3}%g`@<~HcBut-9Ln(+HeD^ z4gG~mz6c$xh7KE{!=8+a0D!*5r*F*7YC~1U}mCzIxjc$yu zk8e(`9|O7l2OiQ7=yLc0qf>pgz=3LD#0ZR(F4vg;Dl=>_!=;%T6Wfd$%uqRO_z#so z{K`LGC!8*K-9dU|wHRAIVZ@H=edBc}>7CdCgy_~m!bkLtR-dbSe1P@1bn zM|M*pY&9}&M8>z?{XGANyPw_FBjc6Go2A*>$nomPv@tUM^u+eN`p9%; z{@D|*?K@N!t79jPv6I`=e^m9cla;YqW8dsOrWP6AxUhZ!uw*1&ntc%&u11a;k)zhC zHgL2$kTeF8m4PXp+Fu(zSsguZjGnKIUeu}K=h6O++4b4-fv1Y?_Z>0lu_~P~=){%`u^Xt+Go?!}Xz#t*Dl=*@qh(oV zMk~yOK~Iz}!H)Z`RoB|+J=dp!#uAKPo2bx3u<|pdA6xJ{?znb2?rT(vi1jfhvf0F9 zKD(d_QkD}cEv+H3g~WUd^8&UC`(XO#w%nwKQe%IimjrL~ds^f~y&mKV`^`5MUM zr%U-9I>;rG+@T3F@K<73Cx+|pAbDmDZHVh)oq*|9#2TKyW)0h~e{t^FxgE@?pCH)7 zwf#figx(+nwaEUBbL;2oPME)F{OG8=U=Bat;8zR$1gU%QiYDOlv78sLe1yNJ?#DBR p=pU>H@T>>_tYOxfdM{oDiEylr_1D9M+yAQ@AKmyi0xQd_{{RH9Ukv~N literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/moe_permute_unpermute.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/moe_permute_unpermute.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e4e74b67a0a4ab9eeec02764e462ce8e3c1e2ad GIT binary patch literal 9456 zcmc&ZTWk|qmQ{ASTz2^#llPE(B&O*N0h*?PnI6&IYmBR$*eO4{TqaGh zr7&t~h~@+MBJ9ix(2mq;(b8uA*!>wkS?$NVaF6UM@qskGTJ@jV(9&vucF(<4<=Bo5 
zJ-r`$A>~{5aqcw{%NE?HL^@%;Y=yo>AmSba(DpA+w-_f^tOUVZTg)wysyQ$&If#d{eX>|ANZ2E?72<`1K%TMITs{VHWbNbluAf%}hc{F>$tLfdSV6 z!VZS>8>_KC&&W<=U5>~xIeNuJ5Rc&xwLFm{Z!$U37)J&pPr^DhV#~2n%S9r{to)sw zgLYDo;WlGQfs+>nE|ZdEfe*>4CAd_$w{cFtBj|I{l!O;8#I6ZRE-j_5ib8@zk}-)v zod-Z!Sp%fW1*gdc7f$jV7K;oQjzm%tA4@K9a*Dgsc-|Ea()Z{lB}C385IR>n%!l-q zfCmy+GapNQ4L^zS7eT8^Asi$Mma7&yB}EohuOy^}upEjdfjUtLUlUYoSXw|pR7*q* zC(;tS%Aiw1_b4F&lh6%GHuOMFNUSD9Gynx;VZ?G+hxo9H&G?Z3lQaw0vIj-@ALKwOX~zQ`Utd}!&Q zFeS#WPNlEQi>c()HBn4VB~rW~f>H^QED%2>hOY}!W-6M^z(s-@!@PhGgsTZ7nJXVQHC*kccc>;Fr(*Dc z9o^{O{P^kMx89R5=JwytujLDU6Givr#*IyT!F_b~RD~uwM@tu{)e{0 zyGNf~e%4uFFBGT?TR<0~np0U>WovAg)Q5&yY^j!o*utW$_J-4G@p?#trGgaF216-O zh-5k|N2)de_Ejy@f58h_CUY~p^BCq(`xFE^ZvpE>Hd|F=zpOdaO)F@ti8n`0y7rRs zs>RZvp(jrw+f=nTWRA)N0D3Kk`eMMW$$qd`Bu^Rp4eLS@GNKE&cr&{Nw#2YCByrso zw5%*+1MqW0y0Dpu#*$%?lY~rGl(7ZCTm`iUm!R3qz9xJ{yr^ zsU!#gk#VZCDY~KfA86+$B^MwYv`B66qf*ym!m>&wvSJ8sJDpLjNX#?YM39jJus|9@ z4|MC&JLoYC50(5tr4VK+c_xEMHWX1BK(k#~Q>w}n0(%z1=n2vk%+J7ICI=5tn|8w8 zU2^p+uKuEnyE(UdVpT3PZ6&5tVLI1+MW%Q4MA_;tSbNItgAYeH-g^idwdWRF_Vtx~ z0mT=1IP*~6_^9ZcT0Qe;c4T{NPpK=YbOkpCH!f^?i(PxaW%j+GtiL{U^UNJ~{oa2Ij4CXenM@&-pyQ(W00+$ZHSTpUar_s-=ZPA!0g0x?Q8Qxgdu3HYi-seY< z#Wb25I!Mg6{u7Kbs6$gXT&E7;V&Ce{*_<69H4$F}5^~PGX$4F;DalysPGn zX17Jbox{;Jr}&#-(u!7HCiCu^Q(94?%V5n(KY^=xPt8fq=t1F~=_j6HP~!!S?AXbL zH$yd?N51y-&Ye)=MxI|Y6-UOnp+UH6$~m>rJcmMXCg;w1bDo^*u^#%8D9oJ(cZW=L zLo+mnCznow?)HN-^~8~ZY-on8>jAdSpaSd#IBVVYqEH&njt@0#hF9a9oC^ol_V6RP z6zsh&Eyf^X&L%Yv2BsE#21LIQ2|S{Qpch=muz;Nv6?cBGPWESXJZGvF&B z4$okC91g#QaAZ*r$@Q2!EQJ#&gwGto%O-R$sb^%|D2|6GF9Dd86gYdx0b^sqZ6bA}G!~5qsDV$s~ z4%|*m-#|Tg!r&n!uxXUhXxjiTABzaawZX+nDJ{0qa$sXwCLNAIvZV1Jd*u2VAPwD& z22_^;j!VOZi=rTMnZ>Zg8wfYL@EWv%W$3Q(3T^>7o#qyVM1sQ!H_qNL*LgumqjCM( znnVCP^U;ifDBKt@F#(|hrWga#!-jgify(8Sv=oNpK?6?0S#t=7Y%-Hgr*Y27O-$Fp z((z~>bkva7D3gY;auB>jw60%~#wQfi%}2R&sic4-&-yjoGNSCNz(piT9R*C9T&u+z zkPEmzQB}8A2-e~S>^GLxD1G(rjGD6S~%; zL6BIT$-1~lV-n=4So1=uXfz|pJCs%*);}zR 
zv2j;HCl^2%B@`b9eOk)q5TKF-b{Ll#B@}_F79o+AuSLYWjiFDUCzY1RD1mWJA$l8B?M9HAT`zShtX>t$LekS4-(u z9aV53o2sW#f-@+_M0#G|aIPvryhGr}9CR}O3}FmprRPbaYj>%0Oz9jec8)95uGP6) zzbvzDw~wtITlW^(p4F3OhyPB#)G@7eOcy)$DvrIIlENN-^3iklc!e-o=rXf!%M8B- zc&CXj?%_N4T?J}a+1gRC4#9{wQ1*4B5{y^rpHX~!UzthozL&nhD~rjC=G*+QtY)vj zVkbP@D<|#bDlWo3@XBp<4phv9t-InUnD%wwbI6Rl25-5_*1m#u_{HGvuX-N!l%4$# znSvu&_6?ML!-{XX+{TsKMwGUZ*H$WEFEIT8))6SNyA^hKksU2e9Dm9b#^;~2^A*Aj zxZC#>X5M@H%fj@z!k!Py_O^n5tmL0k{8L5$bfImgVkT`s9jq*{^RMkh@8E;k`?F=n zcI)7sOKZn}zY8I;`B`cIyt02D3SGVQf6&gG3%8cnsavTc-CwZ~maf+f;cdTXziThs zx(iHindy1qYP-F(wp4camfS;%d+4>z;-y#TUpojOi+#kFfncw~bpL+%t9Kv0yO}5* zIISEw4e$r)0@e4LCj4FZj@&(Bz;oYuXU)|Frk`Ft3otwT?%lY1qiplsdFPhzTRR@bP`zx|0Uuu%6HLDLaE&f9p5PhD2Oo(`_!-1k39khl&0o}&-{3qrb zC}gcoH*~Ly?Uq*UuW{v?13SMX-GDG_heNFky)e{?s$ZzD9_`p_7``TPQmqv1*tavA z(BuU>bxZ~(%W%Vu7IgK>3t!AJG4mc^)4#80Z_V>~uP%14{7}t0Is9@n-kjDO9I*Rrqw~n2&^d9t7 zXI-cjv}(0kk>|rJB&5)bceV*p*&ZCv+@kjVGO0Z*uqqD^JDs1oiM@4o3 zT$_8ieyUoL1djMF5j(0}a*8(go<;BjoBt9L+SeIMpx!uW>|j zLo-awMdEAnT##yH>_AP(vmMJq0bI|tz9z6))fjjYU`yzM>}IA19;?I))^q#t+Tjx0 zqp&?C_Nc-hEwaZ7)G@pd%IqVs?A$r+M?#bZy;{&lp$}3Lvcp;wb{M@+)_|R2tGs{md+&n-zuQ(x)(nim{nJa90o^kmMFakj{T6hJG-aNYf8AL7pHhbb!1_ zmOJ|%IPNywa_hX}4pb~?1l%ro^4Yty6)S#ah_2p>4R`G*W^mkftjmgL zsKVkg2jTKooVe>EX!h3r+p}x4>$@KG-tXO@l&V zV1J=lKHhTUJ*P<@87SLaw|m!mD`x1wgd=+I_Es#=2UGw(aZOymq__qvRy4#Az7D*= zhTiPN&`5ATQs_^=_>^-U#b-Ic&ChsY0yxY z$QKJ`DiU1MNwJ`*dzktNHY=pG3gzrAKTsaHUtEXhA+peBPt_-?f%D}GmjX~Vc!8Ez zxDm9;yq{T5kNIlI;?9ewcshPXi$7P}jVeJ@;f!Not6`9{uR61=kQIE&cZF4Cc`m?s zdl(O3f@hont8ixTC~ZnF>z@fd4-qS}Iq)*V5)|%PFV0=asT!#cSbmbQn1%eZg~BPm zXMWf3p2#c!>9)ZM11tQL>#~6Ged4uz)2TMCAzq`(7iu^KiT3exr2;a z{4o`?1}u4N8nki9EWYPO=>KO|9WsaAD4R)_>c1^yMcpr{CQe{S`WTn85(=;IpTccP zpYm%w%g@cbI_zpKXmD3+4YMpy`8_X(gca-OU?r@fj#bhcta+?tb@2C}S%X$`526A6 zNI=%$xu+%I=JFxR2KIgorD6SotQ*%m6yr{H^7Zs@uDsq8K||i_;qK3Y@ogH zCPl2MFaz=ifYVpf^A}mqpS5=5g55c;`f*Q(D!dfP*xw1U< zC~A><6?3c~k!u#$`}vBh+hb6v~)-Asp5S>k=w#tN1?k8~U1AEbLIP 
zsguRr?Hf=O6MQ;$(8Qr^_5mlR6tm~ywMPg6^{K&W7-S|ytzY}ybu76_aJM11VQ zYzZ5Bg!MCF?Mp*CQP=a`B*YWtq9IlQQ{TcRgNi4LLFJdWq?@+H266sAR@XOR_zoIa z!>cA%dgC~@rj}qryD_>$-JQ{@vO|N%)rC3VBj>5utGTg7BQE4P3b#WA*gH&zRx1QX zym{Ka!5W2~zjN1r@;S3u&Xv{jj9DP-*qqs^EM9waYZ2!(btPBcH8=E9Zd=oexuULM z4ZZ}gR$w=BT45IxBezyD;QNX?20eya!8YYpIIi%2+QzfxT^j5m$Sh>SN^krf=28u{ zCsA~)DNj1`WbM5ckkKh8I#pkBqS@N}E%|a+RsXQQV$0d4oO9${L!M*M_~G=y^y9gM z*8rCs`BUJFKt1|}WD7$rDbGxfbMf7XcKtle()CH7C>Zp3ea zD3)%qOLVF3r=R?>-ld?JG0QpEIGqV`%)`5c6jsPX8nyuEL&ST^2@gWQfqVy5YFhL z9Y`LHT(*Vb=ZVqB@Bbq)y&r1D6OZ-|miL2wB9VD=#YtS<7h5BlC!@~D)n@9Nle+fx z%D0nt>RKc9tNqAxDg5xA2k$((SAVx5&F*v02L>N*KiGa$_#&s4h-VR)9iI;6L(;8od zS{{wRVZV9X8NUs5dlU_4+n13v))XfkaiacSL(JJi?geZsH2m_dY>==Y3ypP-!fdHD zyH+W!nyL=xp7Ny<4fT1##ys_X1ns30>Vcl3!iuhIorCgcL~@`eGYk{B0jIB`;h&R5 zVA)^I_#7T>565vY0*DL$6OGwutQ|+`$yV~xzvZjkPg=3$;f;eEZ4b~#f$&4=fz1B@F)(L|g1Z%2_&{N3G8 M?*0%1u9NKl0Pn}2rT_o{ literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/pplx_prepare_finalize.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/pplx_prepare_finalize.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b1e3e81383e55dbdb37251f05f2853d11b4a9fe GIT binary patch literal 12401 zcmc&aTX0*)b&K}}aDfXDAOY|RkQ7CV6sZUGrew(vIguz@R_HjPox%|Ar34BO`T~?i z8mwhUsQ@{t#AIAj&2&U{I%6r)O!Vn~bec{-nwkCpq*L77Dl^O^osRyHP#$O6>7?iE zy_W<*P*L)e9ddDY@9x>NXLrwQ_um~38-WxzejFqHg!~E%T5_nsYO{`zdBPK37bgkb zm`;bVKCT}#&|l-2k^Y(z=BQpr5*C^>G`Eh~Xlp~U;Cp+yJ;psP|h&hpM{#<5mCd6n?oHNv|D+r+U-%nJO#p!|*nr}aJV zCs^pw$;?ZUcsz1GE{s6I9!thD;dtu81wn+O=j2pc;LoI9d3`F9%)FFJj>j%Qb<=2S z^7NZR?83#&i6nnc;HRR(%R*eZ5XlHVdbvJ)ITDZYPza}^kvQZRBa?z0NTh`DB|%II z@$kEdFE%5DW68;>4D>NZ`PgNd5mQr1K0GxkI}?#B;p9{zoJn00l4;p?F~;*kGR(&k zG82Q}%rv01Q#@+HGbxDSNHn8#QHGH1=i{m9C53{hS(!i0O^_o$R?jIrMK^2C8a{kK zqplVqSwlvzwVyC*(E9HK^=n#xihQKY8YU{StIc@*rEWsh7F)EL0ZL{q6=N~$V`xw< z+_Oy9C|a_n50&wmS!;49lzdqeugASHYs=WR{#jktrqMfV&)TyL9o3rjX0e|ezUym^ zq0q(~W~~|RWRa{TYu0d8)LPeT$Ms($J;sqIE*Sm^@>4`s^;l$EEFDXxGm&IekWHDC 
z7`-T)l!ats86lB|4Ix{`L9r2|1up&-GWHCcBL?>n(W;ldwg57o@DZAFU zNcjA8Mo7z6*prDcMO&wSR5qmY*{1Fd9M*<=jh54C94J_k{wFe!NlXq*#wK@WBH{%h zv-8T-fkOu`9S{cMvGW6y)0vB@4JTZ_+@vysvD?)S%<~Ir|T7`Few zshri(HTnTh8zJ9wkcj|GV zWntppj^)6<+$ky4doNZD9e`G4GimRI-xU~wDRmzz!e7%N7(!}kEijEw&BW?1xI>Hk zmYJT%jH}@8SbSrd=`EWKthr1KCi5yxj6^ZhQk@E{UWHS79b~yc#&kRx)AKsPFj3=- z@Or_-8{pq4n8ukL8Q1ZqYwVa=u<&Mqnb7cQ_~Cb4&s(k;AE-5BHfU$!8I-c{#R?MD z4mIpVV?`GgTx(^6R!-~(J?sE;0J{KY0d}jf1K>K|37EQSVB)r@cWqJc1{&+O_(o~x zJ%Wq(3SJn!UTA=sgO03D<5WV7yZCyb&CfS%fzP)^y?={(W&D7~MWBG0*i$_=aM#8+ z!nlop7M|3m$i+7SMNNG37Bg$vqW+IvJzSes;L5?xTmf!1j63-@prhr_O9#&Y9YLU@ zHAk-K#@axdhvbH{AiB;fk_(jnNmX{qhRB`>mCv#pMJQ<72ns;Z#hJ9sVHF=sPewA) ziy#T3sCBi3+LvnJ%K&_$n!%D>Q{>n6AWr04mpEVvVctK~1%6p|6qqx{)4(tWR8${hG~q zBASU^j$~q~c>!F*HugvV*8neHt(wW;On zh|3GvRq|`jUE+2Wxg90$P>};YbcoJA+6ZbGd)|-zq#o*kaV<|u9C-`_hvSqEig^bApH5by#o+d z6G~6kOxG5aRUOZ_kgN)hb{6R=*-RL;Q}(_QfO#^lyRRP^y}xO2*`1oofU6RYrNbh1 zc=#>1=BAc^0jO6|?9k=;Z)F;#N=2c?;nHf0UlmR zxLP*?dGCj@ymtbf;HsqYURjS>C$ws+Y68WIRv@j!hOATZ;GuQP=GMHCH){Uy4KK85 ztw zRF3N_duFz4BTYaIqdMiu1j^{DltJT%rinfcTK~=39$K{+)QyHe1wV{h!<&KH-Rq-h zkhY3=Ub5D4L*7&CRBfQi2fN2#yIn0mM%hK;Kf$Wjh~|lPd0$PL#AqjZmP@`0ebMNu zN}_=`!qI;Z;QwkE z#|^XYXRmSAU9n$K<5eXXw}#ZM*h79;uX>=lMyzwXa^c^hwk z$Uaczua4wAId85$*O2q&{5g^{j5ECB8Z%p$tyAj)5K}Ngq?MuZEl%+qp8Out%-f?E zh2Yt`vB#vCx*X#LKKRbMntdnu zUMwCDo)>};(@KG<(Jy*{_L(Ek(!Rl1I+#kvr-QM?B!s+#WCoCe0UboGesuQ}6puZI zs<_z`H12!!VgvlhW?r}q_TT8q`&PLgG`wm7REA*Lis~zJU1dp$^U6kkGJHk0VXoF# zRI^gHoTri9glyqc;WTiF0-_MT9EIW6k{uwN?&>gPpD^G=93FOMl^CNR3k}D*+{F19xa&I)gya1 zo(lEi$9OT7x+IR_(CC)Y(^a)?#PjfhF6KMIApC7e$CacUCdY=eGSm4yCyGbpZ-C7B zJ#3Y-mo)h2-<*4M;q@Pk*E8xD}sI&7~I4e|qum#pQ;9qI)3s;!1-rXP26` zmzs_&H659Y{G{&ReE;J6srz3WbM{{c_DHN#st=Xw_bt`$%a49GR626Dc;u|)?=1Ou z75%%UU|%VCycj&bR`b+uZnWi{IYaK7PhG^?DzV(Hz`}5;d8pVt^jW-g?CZs2Uw>-Q zTWv6xU4sQCxMKGf>_Mq{+nv{LzqZ`G^O3vQyeD@CD0T&qJs*Ft7+T^we(vimxH@mm zV#AT!BlmXSi!3+qx@BG2zU$Ku?tbt{EN(wsU^`blZ6(j1CC{Ff=53|szG8FVy|+H= zU2Z-JEjfPyB3NJjbmZ>HqsUU{0pO9>`?SQmN^EA9M&P5NjhS5UcG=zs+U)ErBd& 
zEH@H|`%AqESY%>cwlZO?ih`+io;O5$QShVIUcCw#`22%dPkaMD}hOfb&(sA^)SbqnsQ-{E-&Ne=-6su&tu zOM%{EpcjQK>z7!s#QG$*L1Me1xXLk9kQ#Fre^W=Sj_;a(&iOu`0r{c?)y!ZKLxfQF zDA&E}@~f2%YP>80&l{BZVHj=osLCuLfRHt2Uj)`cpiFmN55Y3abufb~o+X&qLs#1N zPGew6r?K@T*!njtS>psAxvHeKfuPkhzS2Y1eTqCA#}DE8$+PkNko=uN8xcYgx>{qd zhuXlAtKsfl0t+v_4MH1#pf^xstc~Hhs4!_xd0txcGVtr;Ud{-;D0JgJw zTyDVem{@y4vA!sowZk@nR%TU#5SiAWfw|QP!%d(aUzNZCV!#>F=Pf+LTOZoA0|_p! zC8y8nz`13>y*0yyH3OoM9hrGSdCL<-7c&;6S2jV|s6o0>@d76Z_U$B?p21i$m;uK% zno69HC552!nx}up7(9A3NN>^r90Cx@5>G(}Hz#OUgI@{HKhR}VP0mrMlr3w~XZ5g5 zE4Ja+vG<#hMd5!fA*-p|@)ZC+F@xJuQ%BVF4fWJF)aBurP`^+;cA8(91B{wqXi+Bz zvK`$(@jIAZ!>kuFHEbT^)8Q$&OL+;Zs8m-Rz6z)2z!q{xd-S1%?>GB;uc1Kd2ee?S zYQEm~qHnw8YmvO*;svGp?sYGZBmVBEgi*ab5TO_@67d*`$UEiMir)TMQ%ShpiN|2c zP_v471Fbj?-vg+}st8{?Pda0qnc_WApW&~tNcr*C0Htv)-hl*S_xN!|;yO3ZY|D#@ zjm_F|y89vfKe*_uBd*3mVU=OMG_X96Gwh9W5|N z3*6D)mXUJn{0M&_Tj!C58WTSHM{IPm}A#WGC$n$Zdzw* zjB&&ofOQ6A%&VAPmA2OgC4vIQ0RO8P6nh?x{XiAEvlh_J7EN61$n@IP=Pkh0!#vLh z`3PFl0EniJm94>OYrf^Zx7OgBxE*!^7W&G`>sO2!iLH~^HccagE^c3SSkNf{4LCIJ z?BgQ)70P-F{CVF&_&#j86F}fL~sb9F34o#`BW+{BBR9I;?m4Pfc;Ge#u{aUe zLT1mYw#q2HTBO#F>^`f!r8p;~r{Wn}A3;Rl#`d1k%?~U^Tn6#`_}QVpE{F>8csL9q z76UnHU>#A;qS%2=F+;M_cfg$Z-8E9yJ50M)9PXRn%73eD0vNV%pt)>8h#}2wWh+89 z(%ABa9idiYc7mg8I=r~2)N!oXaqPDQ3abXKx@^{&I_`PyW$rhw!j}##H!r^O>FK+t zA9WTx_hV%@8TfBUIjL)@EdxQZH0B;9L^7yO;`ZZe9iOmxp>(^iwyDj z=lkaR%2rymk%mCoPC=H`H_i{w4KEHBz1zzUTIVF&d&?XJUBtJe?51EHv2nR83w3we zZnwd6xT3XZ)k8ldjhyK#x)nI5L$`;@CM>}C>VOtnWJsX(4ttwj%oP2-WhlSM{8(DYk0k{^LY4Tj4EUX2x&fjVG~KZft>Yl>6c^wn zNGh4`kDQOHX9y3j`hGYq{tfPMs@V48!^{4S4=?+bhnM|p4=?fi6b7yoiv^w+Z+NIF zo0Sd_(9)i2(k2yM6fe+D2UFsRq7i72d!K7~^`Q6ho16x!+T|V3g<&ca zi>GPGQbeU7*VA6KwjZA3TnsA>P?ghnw&D*E;xSvm4DB}Akj{wW-y&p;#{do@*r+}6 zQDs=gdoV@x(3cA#XsXKpAw?J;6CXhx%Bl3ewzkt8Z7V} zj{5XaNhk<5_UqwNq!1}Z3sIX7=wc~Wh;hV6t`H@7mngi-Z=UeB+=`!6WhyGAWfm2Aqr}`C6 z4Q$$eJg82{nosfF4Lo**1s?Q5ihny~j|A=p3Ia&No0*qqA=lFif>spg!CE|E5pWD} z2yk2xVD`x)W{3O8BYosiuuJSSX4|X8)R+=ildyYW^VH!T*OE#CEE-UfeO5{Jk^jGz 
zq8$TZ!_N=fpm2(REv}@&j!ey;%dim$lV4M{B2yKDVo58@Ivnf@2pLaRt4hm7ILO|E z7=x~R4oHJoy@5?%5B9)jpa*;4I(fvc zmkb30RlTJatCm6Yx?EGKnP0=ikRX!t73Z6!HTR2WD>asqdv-``wjtZ6`?_cLkTQn8 ziE(;|e3wY=C8k@6>Al2!D>1(-%y-cPD=+yHCAKfoO$|~bGz#Dmp$T|7A52pK*>Fj| zC6%is$uhRovME(mDp9$-wLRD0x;|6Tye`(PJEWaR(=?!0u!#Ma%RwonvcQ}2PA>jdNwc)CIA zJiSu)Z=%&u(!p7lxt+Rh9)xRb65}%V0q} zeJFZ?3G$p|7G+&!QRk@aF_y-(hoa+D0jCGE=uO=yZb{~bTv2mTir+(jh9rw*90_(J zI)P*o$rKP4Ktq^cp8J%AfjHZlSbnnmk5ha?Zl&BF@1IhxN*3mue*|)e{3}Dm*W2mi zjm4eCX6i|L;@+ia>ESQuzBu0@zF54Syw>%@`@#7RPhyGBuiU@#V7c+p&PUDnzJC9! z_n&0WJQZiZ8#&t+<8ATSo;cnT$89j#5+`ji(-LQTl5UhAjogpl^WXbzCrOg&2OoTC zHCG=l>?ZQNLf*+F-pMzXc6xxDF0OFMO%iB2Zt6;SbVbYv@9AZS0upF_y}YRNRJi9k>u{Ck=-vpShpYU;8+~K)k=3)Aw^=PyUBN>}- zUjF*(S69ECY)zi)2~*89;uIVBy{j zTjjtMJABybSK7>JlTNr?c%>P3e?C-C?A)}oluf_VGF3~{O{XD@F1i2b+;BZ`QFkKH zzMl1>k_m4+33QQO1FvG8GT#8w@pBy46$lsmCwb?ue0DCzc$?`mkd=ms5P9Y3@p*#IG!u zk)^EeQU6exCOqueZ9z{RB~@*jQHwP^pj zzL_OOilKaWXh+(&Z{EzjdGmH=-fxEgWVc%pq=ey*;^+`UzrvYPYAiA@-_Rj+AF+s~ z5-3T{P!#5M30+b@qbIyRVVE(1%#bi9O*1AUGbYSQdWKF~W-Juap$mvLeTrB!M?KVQ zdS|ReN`ur=A+@pkD5FE1p0z%*J*2d{7KH!UUY6nPtQ{T)JWhCA@VGe#TL*6s=bUx1 z^`9Ca77d?qY3+)w`&ch$Sm-DgR=il>0dhZRZQz`2BfOj732=36GxXMSKz;p+86mb6 zYPNBn+I4D6XLZ=wvpzQXsd1*hWHsA=pvCRA^{^cW$U6^^A38wZ1taZ-k%tZ_*GAgI zSy+v0t6gulo(;o@yvzFV;cv0W!<1^im=eZPxg@Zb1fPqtIFU#Wf+<`g!rkU#IU&stCZfw6 zpB2PEomxj?$pkCM{aPbFM0hCQ+x=6wSE)tWMagv4Cf0 zRbr_}y3&5TY7s39us>BNtWnc8r(^Yx3=fO_-hlV2b=4-|NkLMnUTIf8kM1%t-5iSP zegJw`85jkFt=iQ6vBiK^?I1DpT_Qb4m0GUA!hxI#zgHa^m+Y$Oq3lhj)VBN<9lyW#L>t666hQxn?}MjhRoeG`oTBp& z{ywf%x9DEL%cClZ#*#NEB-E7(@0i~~>|T8QF(o=h%d9SHS%z;&!L!ONp|EL6HN8CG+>fX1?Np)wrMueDMSD|l-i_gu&M&c~#ED0RN zR>gc4f0y8Q5|6huwGsQT7!CIintuz*!|-H(2Ge^D6&jGYOA3w3^`mQ(yAI?F-2Y(n z1I6Nh;Fa2kcg+T$X>F=tMQw-kt;2HbuwrpYuHe>D z+0`S_;q8wKdenC0CDOH-WJ6%>(!Hi#6FPocIz1^}zAm4>{&e)cCk@||N8fwgE;WX? 
z&pl~8ERDV=8N5=%_r5kW&+Sd?(^A{u<1V>vRA!DB^tyocIn(sQs0#$vouI4Ee{X5a z@!0$??&Is{l=k8E=?CX_nAR71RQLKWjqL6}^Gg2EcFPm@=#yIQy-oKgHYc{){?7RK zwqM%58h=`Ub?u#vkJcu?X>9#G@LAx1;U_ne+VGo9lEDY#551o6J|lOZsWJZ0Sr~r^ z#^1R%eeXjECq_@+&>$Ncq{hQf4Fk_?ZIUe{8Qzn^?>zx(3n|S5j~!CrEL8dLUk{?j z_o*!2=l`*d8Pge5`+S^bIfyMnRN%6zeGcMYYAyn?CzBIY1D=VhjptHnK1o&$#32YF zT2PA2L5#|%Rx({Rp24vOE11otVyd?o!?H1Gi-pk`EI8HAX(1^Bb$JegZ8{aX6^~}s z2CQ2P{re_&RyFaQkmFM^axMVtal$-_Rue#Rv|1qI{e-1(JYKazX^qGQaRh@Vk`}O6 zs!|EF^M5H?SZeV+EZd(8dl7RD?_0W3ogY)>5)!4DRlCR>Y?EkRyaB5v06O%qs`uw` zf)TAijX$g0BN-Nn)+QX^>_rauwik=a1A$0d+X@r*jHp`yKSGQ(Ju;Uf7O~#3>J;q< z=(E=x)vNWnR-LKKg1uzdLXEg1T16dThrO7%fKgUe5^ay@Qbb;L{ek-SB2HQ9K79_6 zzB5`eBQQv;N`M*Y1%H(gvLdyB(O*?6MLR$YF3}(wMZIW#WO-PeK@>`bMH6dX?16b; z2F^w}-8%^8Qh8qDjkhX+6=jX4{W>dJzG_AT?kdh`!;hFB*=KcE5!Q*{VP=XPGFEa2 zMblJTsEnio!OICQn&pBJBy&6$6y~E^dCf?H!!c|K362fk;${JwLVXb79RqI#11l{x zyUpe@nKUm1SI$Cs#|SpKG#`)6V{I%4r?X@{g*7e*OE~vdRDd%aTg1El!CN^YI0uHW zFeL@4^wI!<5ByrcM+I$F4_P3?jZ*R&oZ8FogMafLZ+L!`)d%J9JUmO31_T}l5GJWzX zeOh4-ZGE)E^nBy&l$?VFWUyEZI-Pysr5-qt>{`*lo0LrVN_5kMDaF~IcZOtVXq%Fq zz1!T5^9X68IQpdiGx`4Wa{u{%xFYvokow*(Al*gE|J>hvZ|=d3f>CGhE9yJ~;kEU( z$EL?#sbv(@DvhCS)3#S?99(xOP5t?%VYz8ovAZQtSKiYnd-}G|$)177^|I%v#2owb z96*o&h9Cj!hI5^{7ljDy@5=jnWnV9}x_2h;3CSL8p{IBIn(P^pnBgxeXyosMMtm<)$t~lS@L=_C#{TxwUtjvl^wcxGPCxTE-~VLuldZAsk*(7JB<#Pm ze>MBme|?>KZf=##z0a`!dch|O)2J|=d8S)ty0=#Ip%ZfG#1rPE!ZavMy}~prj89?w zO2_Dbx-H;WA96K6Sl)5=Y)^c7ai@PwqQ@lX7}3@#)15@y{+6JqWArz;@oodM_`mbj zo;%yO&PdK7i5^mH{s#?`tz9y-lT)p#TN;9An13H;1wR8swLGDFKiIxkuSM^npTr4kzLNpA97Z(`%;k!D*Eijj+AJ z#>%^dy})1VFq(2!l-JlW;ee`NsO%TI#Yup$silRzfK`$pRNDAg^`hZ-w5|~d_gWi7 z^BtXNPQeWkKp47KK;)N16Q6^!8o`Z^gYDk3RVD6WkzsXO=)n-AB!lCZe!*9&S$?wy z3>R4AUJFyXg@B7lTBsGQAUFaD0Z)LH__m`efwZ+s1X+BrWz87ch!zO>Mc{B13Bgj` zT9>8^hf1a8_o@wg=q~3=9N$!-3ua1nI#3sm6IJQX!~tL64fr}3CR)ngs_lg}683BR zLMyAV7Kfm!RNLCr_Ybfu1oD9@6_Div6}{j{Rh16trTc(hDm{!NU{$KE4Lo-A|8L&z zL2>)7z{FiNrFA)i@kn)`>`1Q_P3A-xLG+nok}bROZW+wo*n}5 
z#HE5aU$qb4#L(&*pX0{i2HG+k8-i0?G{IqZxX8MQG?`8d^JNJXR!a2Ip<}8Ka8pFP zL&3Muu%m0bY zF~K2;$9ukN{5YD(ajFw*N#35#j6{}3z~{Ud7uv}9F*Jpq@wr8HVH4ndABoLt{qy*k z!+UUSw?E!#Kl+qz$kQ!5 zbj#C|m+~hs%O@`b+%ozKfU$!2&u*UmtE0a-{THX7c!t&{6sQ00CmWyq^yud&KRfxv z8CrY$d9YUs9^35w^8DH*Xkqc@>7E_B=WC`1a8#b@+F`l?FuI!a&NkWE_5={ml^tg{ zpi7HYVcvYHhqnZI6RFZJLwSen?AmtkID3ngCX1DN0ra_A^Uhwtt^{m36bAkYJywn3Hd5#cg z)E)ler{b* zJpKnmfBJf{-W5{s9=@8xp2bIlYR`i28oR)Kinl%Q?UlW~uVR{ym%U72Y*_yvF#RAE z&ctpA~pG7UnVujvU;`GOPOSN~KR}KOCOne*$gcREM(g%Q!0NDT?}y2~myz zirRjKn14sT|E1GYx?RRj`R;{w5xjPT2FiYKX&1q(;6M${5~_b;LOR>Mp{JA=B%R$# zQ=3xff8idY-k=m;V?mFZ=fD^-17DqS7R-dDk;7fE5Y~!W8(|p~2o~&wbs%p;!AV#b zVqCu_I$!FM&R5V8DNmu6R>kL68rl`;`N*hpsOyDyoT54uPs9CFo2LqToOxc%7;y$H zusI55!qTYDSFjM)iu_Fl8(|sbt}oaL>p&h~!AV#bYHlyM30sF+J3qhi*^TYCFva7S;CbBx@>E1TTb^QfmK%pK?{>hCyH+H?an5>Wg1@9NUPyhe` literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/routing_simulator.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/routing_simulator.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..23102d233706da33e237f83d70d3048028c0262f GIT binary patch literal 12370 zcmeG?TWlLwcEjNe-_(Po-jY4Gr9_nVaBQz-tBRjeTqln5u$^v|1YC+Ul1Ab~J2SK+ zmfji*7tppJ8-Lm9?jq_>tH4I<{b}8=rawi2{ozDekR5mdxBDpaQ_z-xp!?IFbMMUX zVN%YbKP}KJ@y?w)_n!MW=bq>NSup5hASJm!lYbRum|xjL6J0qC;ep zjubo3rkwLm2fjO#T#BFPNtsK!=3P+7C*3K}yvMCcK-;YI1r> zxFV;r$+(tLe7>v8vMQu9Vm2uWYf9$2EJ~^n7Z#OxO1hCzR)xikBB-=Ic8#Y+L6cMh zMwAy9B}GbWKFu0Pq5~=BK&qz1HEBr(mNJ3^&pfgg1;Wsn0QJjF3RL6an zG(pX-tz{HV6)YIZ%#xfC5?NKtq~u&&lQU^D3pt%gW^uOCNAc7eOkc)41=AsIR9{kF zm4s|shLKZ(0u#@`gHCMR_Z99|lr@~^&R;NE;tM#_gqD)D<%}40Ky`Y34xWBFEo(8D z!jhyU=myig#eCWMYbe}hBxc?LxMxMjyi?-5fF;o>@hgGqF0?FiP3n2cExO=aTZ2Aq z&dne_cpLX^T)W8j0H{Tu=()|`H-XOk*4e1fXf-L}3dPTQ?&$frDhY{XT&0+LF+#`Z zjNq~?ic&hJ!j4O-5yZ`v6x=OYQw?6rti@K1fSeZPgrvr_^)<{c_iDdAFjEvuQ80-dk$ zema|qQK+MKdpRe4SOchLQB&jyNs2(%SPKHdnNzC;K{jGan9HUXpeqo+*$UU9N=Yf9 z39ul#z?!kEw{A+9iED8IcafYW3PUiPRJP(>vcYw`p=I%az{tZem(|z}NnTpk)KM4` 
z-eggY+QZI$#zc98gJ~@)UciTftWWV{hQzEQ;s{!P`{9I^TANst*T%HCvLtC^A7v*` zom`!iCX({P#M(NXr1Zq~WHL2DPB(0gl*j_lnMlUhB}JWB%&L+YOJ$@9++yG@F_XeA3l_H-9Hz=Am46G2H{?&5ds!rY&6d zV#&uqtJOQJn=^Xna2dM+0AK4y_J{c&FORekstnZ{u2G=L8BLu{@+uVFBfxQ>p zX8k>4JM?c8Lyh5!&H8a-82YvU55|o^{}8|rFS23>lq2h|Xt&WlBZCx^7vP{|(#X$6 z^U%acEN)ueVr&7M)Cor5jYMLv3pZqKSx98ki}F%di7x>A5Uapj%$KMScotDNu=`SB zn_M@!8;WScu_@zh4_M;l(qG`L4^xh1r-hiZRrV?gh$OJ z4Wd4tG{IFi!<1?DYKx@}2y1Z-1Xmgsg{;j8RIk(sSHH#9;-DF+2~*i3RET9{kUJkh zc9+RJR$#m9lO=%jOw9h8XW`9;lxJ3uht(ykpM%Q9-&mN$iCzNTM2;sxWYuhwg;aNoaz>T*7y;Y71}baS!saqUTohr;GwFB|XvQ8ZElHvzJ~ssEZm%sR!Gf%9m=cbR zABm2_i4m*nUV%Cr%3PE)_+;jWq$mQ=gmM5fV<2`N@GXM66>G2oQKy2+W2L={Y85Pm z$S9PBM4OG}4$M>?)v&5)R!B1 z&%{*?9a9YsnN@{?#tpZ5ehe2SnBVqZOPd@Hcyu+cr-WSJxNwDXzq)Vglwfd3X#Z}l z)o=odnQ!GYKvRS`3?ED_E6plOMxlh|QzgwhqAZ~zBTPuGhR#$Kxs^#z${>_;Cz_Me zUZi>%`l^2l*)8VD{^70R!oFh#Pu~-NYti4Y`}+%nle&NMaj3r(8r`~D96PO#oi2=? zDKn0 zBj=Za>H1p!q5AW{@0EEb5GCY0*GTDAP>ZTDS&&f^!u~fX5S9*+%3p z66@@yE6?3((`+Hoh){2GK!^=QL!QsO@=g#psP)yQJa^wTv2MDdH;Sja1ihQ|wY&>A zJ$X;wy~ti+qR#knn4-;_-X+GgE#I4o_pFLq0dQZ78`qKRo}%gh4LD%jc6y-!C)`*D zS4^!9JVxz1)7qO7{(zc&w%S$0R|K_@bT>hOi8y7tY%GhDwg-$ZD&SW^Bv*4On^Y7e z?XUg|Qq*Pe>T(j8KdWE18Bf+MM~SE5UnbRPT n?F=HvCWgT?POC$9=-5Q=d$2Tn4cj=`Myx^zcY2(0R|l zd2Q>>;^SRwpJbueLhe;cTGQcZM*Bejlh%EzI)#; zv*IJcUdwX*qA%d=76a+VrG_>eA834h6c{J zya!hy@;UHo*kc2mTS0TaF6H?L_UKm_4b6O#L}aMeOVtxJ64sjS_Z?AB?!*=9moz;! 
zmuA$;$=ib|DlY?2a%{j6;{*-R;FiH%13np0^c6IG37Qp9ZJ4i61ByCmZ4}F1s{N58 z=0-aV{TKVKQwH`NSZdi!R;`iia15#10V9}ARpmHJUnu!sbw+W4P3Lu+%lM>vu&!dvJmF(8jcciotz)aNp+e7r_xY zq6c3p9+=e+%$C}E@6B$WfP;Ckr4ZN$dIo+usUJ95IQ7Q%fy+==>K)zsBfWR3*mG9z zIlJ9+e&hAWJ^McG`=oDk{$cyWH$Fe`)3>&JW;R}b5+1<0Z00|Hbvt~#5IDZ;V!{K3 z{ihxc7N_3Or`{->x?C7|b2~Iw2+UD^nuA4I$=0X;32IR7$g_(qQ4g#ZRZsI&PXe{t z>P6+VsebTHzB-BO;?itC?MgLQF~JZ8gv&c0aQDqw!a1=|v52rM1KOp!bK=SkndgKE zk_!FQ$P;y((7Q4MV3bkFqO5WT%gFI^L)FT169`j6l&@GuLt9Tm3F%K(ihu)PL_Rh2@MkGSH= zOZv%6g~`_oZEpYtqpr*bTL@(tZu5ZFI5QBDM6=C7;ROPtql(#9lPUDXly_L{5Xi#T zUx5LC6S8pT4$cv80D3pfiJ~q4{*h7Pl@HNtwIorR$czS`H2?wgn2a zMff)s(1$OY`lVp1$hGphnv}+gR0gI~D}R-vFte)|Mik|g;a-)bH7Mkc>`9SyFu_!A z!J7nB>8TRiTqX_CH1}dM6W5L%RglRTPKZ!~EuT$MQgt)!te?bQd65pHFh+LnZ!x-(U3?;|YiacAG%)SVQ{S2u_9l_S>qHG>tzn$4b5XioHkm-lN6dNxgS+yZ6*n-r42@ zr;FD|1PRQCsH_p_8SPLy;Gtj~Na@0?8#44lDGF|%GKAS6W~gB(y_of3Mh?~hmPmXJ z<+_4uAfbN+d9Z>cYj6-v)|5ZQ_qrI|eyqdn2>jG{V0&&ePgxi5dD_;_k8NHlGw}H5 zD?9jj_EM0ahF~Fv<__^M*`YZo!_uwL9KO-e9G2=rb6C|lG`ADPZ?8KB`L`TTB3)%3 z3jnsWr|c$03X>FlOnc|uz@0$ZPhJCN_^%u!uPqSz`(ghN`gdB%>t#m^-%{!wC_C}_ zQj|WAo&3A$pH7qV6K@M?&cpYz;N@GkhW=*sv(YjG#Yf-zhv~nc-oawIv)6=!PlSU{ z3l5ZRI8ZL1WilKXSAGF;ueaG6Pk0Fw|mZzS29U=6jWQxj(e zVJ^rq&PJR7X1vuN0Y|Z5(d#CFt1DJiR!+m7rW2KT7F<$Nz60}&It^zcsVZ?OD+`d- z#(vrrbK=$}t>+Hb6W;i9ZT=JCAX^845MrXQY&eUa{kmuWCijKqdm^rJltfnIpg?x_ z_n~l?(d=)(yHLGv_65P}$bHY=A(XI5zyfYz{CZqYlIV0D{jLkR*H)OCq7H8!)m@Ca z{2Ux>?`eN~P}pA%2PK^*j# zhF8#tunhxirC^$(?*#mh)t-weA2(oAY>b->*}Qm$wg(0|XV6x|Nfkn53mTPzy(Bk|L5a9H4>t-FT@2IGfV`yI^CgQYBCwhWo!qSsZ8 zz>T<)hFQllFd)&Yqpm7gE7-x08G3H1%&0RGjW$cxTQW9x;5j~4-vl{}`V#;NLSuvp z^n58CDGD<>{QGCji0+45Z$T(`8eFqoUv>`{yQlT;>0j`Wd#Q0u8;Ay7Cn8sr?1Si!3j{OgT1?6-s{_KVIqUY@T4A|d=%8fuWfirtzE^| zLA`ab)cZ32hq_8VLuD`1HnhX=Xt_Wys>z~kYizO*LxOHEhI*8F%rMYuw8mobG@yvy zGKSlPexiNRtHX+f?FcSpY#bmeP{RD!)Z|HHuR3#v;1{!B!%zJhvOh7+nNFtfSgGgW zR`}teT_@Y;d+H(;2ezCK!?eQhnRFGnu6vPtiOn;GeMj}4V>^t8cY$x*)m~;j{3~b< z!Qv_gN(fkZ0Py<{cFniE$dYSi?%qxvfDBCs^l+lLj 
zE4#5oZ5=Gt*-u#2*nWBzgl{-z=qU>&iYnZRXp3o1#upL_iZOa+(*?IPfzGLciF<7L z@!sh;JqHRBHML5USVAOH?2;X>-F3q=O>c>wRdBO=EcLi>)zOEFl%CoQ~4;%j;Z$PxA#&+QZSk zM;8j-w3kCXx`b3=7O7tBedclb)4mxl>HC^)t|obBU=rJkjSb5rO2JgIt`svQqwBD^ zV-j6eEUkJ{_jtb33ah- zsdVsvlo4o=(ZOW+L2|GK8d5tzxBYEyA9@emk0b3d?V&lo2ZYz(BRm^QOLJ(NOGn31 zQgR~W`3xAfVa@WtgEEYdf~}HnTe6vx zEhbCG)4*d}BTUrgoSZR$P5&wUQ_e!k(6SlZB(i476Ie4QW$H>y)(!#nW7y$LR2BqH zU|BP4Omv*N4NP}5JI^?COh)4vRkMm2JDYDgkKVwtanaT4suOX~QtHY&!eqwAP6Wg# z=53IlDHgGs6ttc0t44x`@EQR)|qVT13^CA;6Ld!?xb9 zj#W{KYTneO!ox1mkTQdko~vKei$6ulpyRblU~5NI6wT0VMR6o$*SmILBQv361sC-R zCs;7Ck~NK-Ht8IsPo2{fE~?m8(&C6Y-r#H)8RnYxl3M$1YT37gl38Uf6%D{I#<3s~gozH#R*$@TzOyde?BZ zYq%Ehc1AWmR7Ql!7^p@Eo?Kjs92Os#ydqwFA;3f^{{O8#IS4`<`gg;c9D=D;d+K!M zu3g$w-#kePY4!!36MgH6}*v6>)%R>V)%}s$xU2 zY`C7~gk8akZcb{pm92Ljz>W}2E$kk@13S}n+1_(3t-a@b_6fjSgp6OZN$?ONIXn z|E%u!cZG_lBINjS^h1<4f#a@+(tmdHJ$E?r-aGXEz#Tf;-=U!!q;Enm0uFr`K4gHd z#qHp9B5scdtsblEgxbLAE0ip|-u>;&@CtYqGy@i)mLRm+IR2bTAor?CAB*$tbDUs3 zq=Cl?xET|cghoh9a&}X~5dfCEcR6fb@;S=zPnaNu)`BP+TbKG*rTz`6tDX;})CP^W z6V=3t=f~C(=NE%Owzp?JHnbWW`bYfST72w9=W6`IM*P5fe5@KDdl^6fy6>YG!)tw4 z?tcogx)^*NPb~YFrHz9};jiXJJx8{XrzbLZxz>-O`yZTfqvCB38XSIn>G8ED-QY=} z8;n##eRH2a2-HHTvv(!>6G++TQvPOyAC;H@x5#YkSa<4>k@ReiP{N?tc|eJQSD3htjh2WS|-!sd)hirA>-0 zMZA>1pIAx15F^lz+C-ax z6s)pvVzJ^3Gu+PhV#IAPMz*^N1|YQeWP%C&P(QQCgi`$H&Fus literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/topk_weight_and_reduce.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/topk_weight_and_reduce.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e621e7ea7958f9ea2b223ce85e2ea409cd793914 GIT binary patch literal 7970 zcmdT}O>7&-72f6lpG3*}v6aa3hO%Wdk;N#M6I-<$JBi)IP82(K(LhPMSaC?Hn~&WhUF?|r9To(M8Y7f8BykPIi&=W*Bn=**}mVV{}p^219 z&fG{6Y8v0u;0NnWfs#n7BtbM&G8J7DWStm-rkeuDOzAWsTxS*tQZlJ3aa%^xWPzeR zID#peOquX)4aZ5cOo?GIb2LI^L(r!LNmXt8^h6SLOsYhcU?3rY3Hx4$OfEXumR z-8&>SAH@_hAECQ#-yE*@&2u%+2$CVd6isvhN`nV)pp}z~Mub^HHKN+ndj-6Q0$W-m 
z*p8GmVe@JY372(YN{TD00?TJR(40nJz&;qP*SXnSaXqEV0&ET#zvUidQ#Ev^<=YNw zums(YI^)y24r-_M>w>8>4dwk)_6xCR9Ea^&O)e-QbgV}pTi{H-(v#uMs@(l@=@NI3 zACEe$fMO_`VM_-AmQ<35O^IG1=HVNumyREs zeTl?WWipnWGpBVec1=|iv4k!YRU|h^JZ0)Mrb=_5B{qd8NKEJ?X6ngV(VmP5TSsIE zfD9$)ERQIXtD=~WY~#!!TjnXS%4mb^Cighpvv}aqGbdKVV|m}$I<#??M^Bl_lsRs7 z*xx1Wz>0RO6{<7Auzakh$cEL6`)^L9AVz4#L|qdV4Rl!^O3W11;vW1$Iia_i$M-;? z-s3WVzwG@V@=?cl`qE_vDym8-9x=2hZW0;RwzAT&C*ku{Pf`VTV6*+yuv3PjycK#U zrI|{CoS{^w_dL`OX3$p5!kD3hXgg-RAT!W%ggQZ>K`2^oHsAC!+qs*G?8VU#2Ep4K zZCv>2l7nw~VcqEr2TGjNAD}JJb%n2um%eR4j)?@oGv^%ROFEl=ZkM zQj8Cl8+J&1+VWf`nxRw6HL2?=-HqBfrj*&qwk6BZYD0$XBku2xklXi7FYi8C@&w(1 zuiLs7LU%)pW}&UW4*{`Q4*ilGYb%lb0{QHC8ILztGX5r{XG zNb;O8Ngx!~M=;pC)#ycJ8;mX^Vr~Xj-0TvB!NYp;zXvRe8#wBqKq5Gb7)O1Ol@ZF` zD!8lCeOpYqj2*)`XcKlETMZx2`;OZyj>l)b+4-Rz(6j0Iqdn~Se+7jFF2mn+WE_{^ zccu!x&{0nJ%fA`NcD=KQz_{o+;FQ~I`3xB3_L@H9e(tJ#^M|MnEp12{hwS{qRS5|^ zSbm9%y2sNO>`fpT(^8T&0#*Wf1B+($8>bAe_K+a~L-7@0F}9+ybk`t;Ng3h-Qm3-; z*k!~p4Ip+>)#I~@b_D{G%K!>vx#M~gpcp}?5w{pXr1x&M$PBSYaNKpsZgQnvTxeJJ zWU*r)9~uB4XzRZH;jIrpna>MH)`VjP;n=D$`Y=3N;#~d!0$X3PZ~y1BpU!@kSQJYf ze>hT%99@fy79yjokrVl6#!7BS%ggIdkj}%X0Ot?i4&4gnyACaz+0Y~3iDI}b+hT7e zj0#l13=z`ap={q`_8k0G!ORI@7U12ZB~Pcjdo!3pLmil5tp;XTtAQETY%s%84a}fs z8-SnvzXr2ri~gz(A@E8{PZ?>+hO}=}u|`w^LJqK0zkMsX$20)Xw0HiY1du=AD}%!M zDC_xe_|_))isFW74qyA5?Cjbk%OG|Uch4peJGmNuIq!STna)ZJ zfTOFqg%v<-07thG0eSR9L&`Xt+bQ$1yJF}4GT0-0-W)QKgRwNjo)9WE`xh+qP2s$n zT#um@V^2f+!c+Oq{0&zG12BgitFp)2^G=rqEA!QsHz9C%r#-IEP z(i2?mbOi5_jM>&7u4-%mWEawbT6B1Vu7ewgM<&;lIH104&Lzn|yQ*3$aR>)S19Sky zEte)M3Cnd&A=fQeBBhEH`Kje5iKID4pNBHV5N|p4q+xmC${`x5MAW}MIkxzO<B6LH}Nz5RxrzFj}=4v)QU(Q*TMGe$;Vw!ExdR4y<+>WJE!k>io2g(+dW*^JzVUL zu64%>-LVa?tGhLO7D7)YnC9AZr*oc%;eit8@CS;m{ke-veC}-S*wX0o)URiMIs2=` zvX~E@S$9HrK0L7Dlz)r%0P)Mg=6&!Q76N+$bDn66wXezk9ha1Ar2J@-zWP58qIj%q z-e(L7lgJ)L<2qr8jAqF9{&TwMYe{-B2N_lz`+TcQfXO)%f?*e!o zRW!wf2ZcjB%o$>;EjbU0*vY@i6+8E2T|e=&9k9tXyb&_f-i1Q_oxj;D%MG6PU|g8N zQ4J}>HETuuzEqx|)4uAA!9X^>EeONF&eOwpm7Ac(I0#ww&E(W-`1CiNqb*PjwckE> 
z>)hi0&!eA43!(lJ9@GDrNg$nJo_P$*2aD#IfriE;^DII+&T()laW%V?$c{=xo%eta z)Yy>fB#?t#1)zk)%mO$69~!PFMO5JGhGdnhib*j`Ib6Oc%f@ZnU?BWwF18gX8dXJxXfi* zKX%CQ`>|un*#l$d!x^`6Bom&&N<+#xGa+oAdAgwksxYF7?`x|gdn>c3*n#VnX+eFN zF!~H@C*b$TKXpMFuE_rJC#aGlOwmEiVwk-ESvmk1SHAqxA!vXrt`i7Lw3Qw~8Kf{} z!jdLbH|hwBdrDD(%^;{*Zh*uq#Bxg7oaLQTCDSCD)dFwBnkmAC8HduNs0weo(Y77P zO~7V0v+&GjwPP8s+Y&y~vj1aP{CO)hrBK5xw^%_{s#W~9n5f8BnBD#1o(?pLDT4>R z)j`;>A1REm+O1v%m#f zI&Qyt>&?aXAHV%|xXXSm$UU`mF&BOqj#UZH#BzJ??R@{p^0}33zy0txAAUK%qUS^7 zWrh<2y0Is^H1dn%_l_5Kk7Pr|?noZkR`-k9;9qO8vEIacT#TkU)~d**S+p0$pFLdQTZwc2qw`&x0=ft<2DSQt3@ z!2O`}f%|t?^8>FJcD<2(^^YB~Vq3?8@2+pLH>VeRUR>+>PNC;JE3f|c_20by;Nq9( zziMCYIsd5b&0=pPcl6V@mR>FN#+GBhC?wvS#L{-PAL{e&Ph$As;bu>8@_Oo{c zOrebZGCvAftv)7Mz7zH{`$>xI2rpB{4SqT}|~ z2Xc1aym|9x=gpfpv%e1p{REoi{7IIN5%O2;)SIh@Y%Bn>NCYBK5=q%lDo1B%isT#? zbXw4ra#J`;OP-uJ<+X5!SkKk|GKh3va(yO6S$L3i-+GRMKIv=R~nM zF6MG?-zti-su&!9BQ0eGUI02j1p`LwP71F~r;8%U3|^D7Y9U{nnxu{WQ=-b}W_ge> zmCuQJm7grge5R0Bvs0x)N#UjRT%n{ItyyJr0bU%-9pO>C%;sAV(o^ef2s9x-2Gfcp zMF}KD`-sphP@+SiMdzfmOeQJ8aW|N9iEhCudPMI`WUCk(-<>*1cfjx@Ex7JFAJ!(N z{2;{%H1}O+Dp12cfHQ!5XF6Y$4ZL0$4+%bz75wlIz@LGC5dI;bN@tqr`vuK1+G(v=e;uvS=zijkT9yU2_+p|>=lw>F`-HKDgRp?8Qe>wP<$jN9+} z+Dk{fUOL*{WVELV{pD7V!yfkt39w~LncSjMtyf8MkI`m3^ksSHsbD_DQ-hui$HrjO zl2o|4>A{)Wk+b##ar|#^6d}K$7Kz%}rh5*J{0styMm>ZvY!p%J0b~nRBYZ=mM&F|3 zA&O{Z#%s^n3TLdqw3R~OZR9qKpV>~wp32r~b$_4j2Ym?NkPYi_b)Ls7uL!;ZjNvYz!~YXY*N==Vdqt#0aWZG~q=n63M8&!9ZBxCyQhJ z&9U^D5x6cDGP8F0kx^qgh`@=QTq9uB1SFk?QxT;}OOVBA_;@}er4@ze6`YeuB<}qC z`$KB3I8@9Q536Yz!q?$jr4uK|W>1JiQug{#aZa5s@iGhpH6~v*M`kQ(Haj-ZihheGtL=(>%tlOXkxRkp9{&k0Z=_kI>J&*2fU+P-- z?*Ci&!3U?-y2r}Sb#JHXCc&;}gbrS$s$8tX_3B)2g&Wkl!DkL6K1E_e=Mw9E7uNbF zbZ)}*5MOZJ+hvBx$cf7E`PJd`KfSzkuF`c_?>fB344MuY|D(6NI&r0rZh#)daLb}v zlJl9GuOba@pPG077LY~sN~G>+KO6!&a_(p|X!RX!cL}?3qNghK1@a^6)umr0GJ1T{ zA)|d|ycvv0I-_Q9q}6O8kJnQ!ttv@MZiOKQ-!vh&BWo9c`Gd`N?L`rw98ul_aEF+F z5)4g=f1?bF%53VT9lPdsx_+?eTd`urwM2ggh=f&JUU zzTzE~G4Suc2|tICW}9%kiJ5NG8eF3#2onlKD2#8%lLOQ 
zcNh+Pu>h$BpP4S@XS4aKPaKxD4Tmr#8nh}Z80_FM+J?t_e?HmF@Gl4(0d{2qz#UQz zMJl1adT4LiQH2c%#`R#lOjo(+g1Qh`<&G~YmH4P0A6<(dhXcgfUhdX;H~_aD8V!6eb!UsusX4&?soO5i zX+7PVOO9!7Iifi|?tR2=gbJIn&qZaTX>kzXs6k5P$Z@>4m+O$T2-~ zY&~+~$+@!Uvy*QuL@MDCJv_1=KK3M5c2`@vmd~xU^q2kBSm$!zN^F1G_m@D+FO$`1 z=hEbAw9h1tV92DYaH5(xur&7|^ytQ;Y12W4&rll<7+L`qKVhOPfgYGrzVwuBSz&uu zyuARcY)d(4g$0AUXi%37>RrRB7K*b*WJ)SrPfL8hl&fVPiV=o9Wd|uzJ4R|@R?I7W zPE^yE1~ZMc3*Oxaf6MLv0Z22qm(Q!nW3_seR)SlTNDUmWUIWT_5xeXBo6njv?m9Mu zdhBQ#g;F)A=F;4n=OJ!BG;nLOTLSvuNU*PGx1Gr0$}*~J-PQ(ovmnnZ#dJorQ$EGI zzms0uXAdIZAp`>m4kH*uFofU;00pgUxoE>nhk-KR-^@#QuJ=9)`3L~q>79h(DuMWF zApU1&&pOk$-hcY>hfjK+ytm#z{xsTFiT3N!{*~zH3Ns3@TBlT^C-vybDi^N*VpVQW zmFoq#!FppMlX$@=V+XG< z(>R&g-No|(pl>~1usXKeVJ%#9|L{z}rfr4WU0w5PKFzNMG)4<*A&q@ltDh(`SIaE7 zx-MQ^TM|u|Wus)o_E$_)K!9lL##L#Y;As^wBs{|*!k{eo^NS74=hR<0JwV}>FBPs59sX!aD8|9!RObQ zzE$S(djHtt?oa7Y-(2gzjMYNx7hW=Oa-pvhIj|Zz@K~svzM`MLQub8Y))jWoigypd zr(X7>;Jsj#iBy>mNV=?FdzFb*nbs=PW5E`y+g7LaKaX>@G%QJ-PqtyLm`~?Ko;UnF zR9iwxLY(1wD9j~$L`Ls1oYxBl$*Q>wZ&5DbN3A)-rIz5PHR#-|(O&-!cD7LkoVSVq zE8~|j`;_;hpy;(i@!cIau#C5y;mGDyBlfmcre3qc`9+B4LK+H4j4!ZSLC#DY?rRX} z3$o!*R9Qyz8^K!rnGq$4=b`9m1FBVYoUu0v{ZU2$Um1z=IR(|=T>>>h@BS}5VVD1D zYtnSV6KcTDp65t0_Y*fqZKeH5u29oLVJZxHC1{d--Zr-6{^f+sA=u%-twFJeB#{Dh5| z0mK;6oiKxlg-BQ2WD(;?cf8UyqIZov67;Uu%`g%p#Lt#X3s;x+E`0ai50+ianFk}w z)4yRKwLDTDzxC+W@8T=$8QphwBZ}M?s3g2K*RsgmXG|C1phkQ5qIO?1J=pV-VC18T zdlRM)dr)@Sy#V$Y($=*o-IvTD_Ch4uzBq7yz+|zqT>@z!%)Ja-jTm0bu2j*+o>#B({yRp}^ z!v=9ACdh$9U-bfJ#c^xDlQH=+Y`HOlUm6Cte`y%BzcdUszBJ%?NlMH7tSIM2$p~%d zH{A9F!`GN$v22+lU% zO)9BbNf~TNx4!^k&S1prhx&>r=io-N&!+Nz9*#fuEgOGAYY6#WGO@Z3+Rcc`yu2StC$T&5}i()CKu8NKJs KbAmlf`hNiKK%!0n literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/trtllm_moe.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/trtllm_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..640da890d9628d4e5bd76fd695a285e9d7d8053b GIT binary patch literal 6222 
zcma(VTWlQF_0B$L-|KzZ>s`NMldvSQV<*Hs0!eteg5W|D3K**CWM|f1d-lQ1EV0d4 zq*baqDvv~{vZGeQR8$b5O6eb!Dz*J7mHOi)v|e5Dk*cat{sv4G#7}$9o!zywwjo!V zGv{?5bM8HlJO2m-yabx!_+|c3gpmIr(P+*(vR(#ciAY4^6f$AGxq>Zg<4D2Ipo2l@ zgo{I|O>r0a3BKT&@G#o0cniJ>A4hCtj7X02L~_cm>5j%U8}EAtkl}si$iA>lePPR)n*^zTBk{l=5&-*7}m6Aht%ltH0rnPO)&DoNm zn6pXF^Nx?|T_(IRiVt|M^NtCB9d`pB0Gyvrw~oyE^B@mN9yu&|;qk%ahbI6}5S|e9 z+qJcy6rOhzurRFI4O(s1YIeeNcDB~A6oI)SQdEvk-@VP4u^ssF9r%eI`0YFJJ9gkF zRn*L5{*ACL|9r)jE^&Z&cZmAb+nOG#V+(i2+lIf%2)2e<#DU8X* z$7jp3s%z9 z0gkfO3_DPDCjvbOP>!a|2~@+SiQxju)sW4RE8KX-O`Tc=7D;(Q$QSdvAgCVDpuS1D zP#6|Or9354uSJ}Zb&C{Y z@|-?ZDh|D>D21UyNs<*oo|Ur|y`&B);+(8%Lpcl`0+@VAS9MTBbg(?924Rxyd3&du+sO17m~L+025;j)+-Z-T?G#b{Fha&o zZG{=y(FmZ&vwHqjQO}o(Pn6Vxs8jxwE*2$Gm9B7V6gp_Q@?JF#kVeF&wvc1$+I;^O z3vN~F!*TWj08@HNAhhP+Q}ypz^B<`C4}9(qZYE8qJ?u4!-Q!gg(BU?_0c|-3nq-O0 zaaU~Pr>_td{O*VlP^xr@7l7;EjstWDC_7Q=SVFSy@T1+9#B?$ z5wxx#+8kYGJ}(r+^8CQgYuM1*kHcYL-@1V2u|dPYv1uNuMw+Q_v5FtDWl24`d{I=u z9fh+jK?<r zQhA0(CY90|Q4xxj0(hqt1KX5RwuKd`TR|_)$VE*kKztb=%wjBPv}R;PdqX+GE%5Mo zAOhNRwgs=j8y>@(vqRXmjb}I&4aW8-<6}oz-GdCwaB2nt?zp-a0iI12O;5dNOX`eP z7PIp3CVYgtN)^3a(b@ObLleF%zX&%w3EGrcmbLl5EyiqBIS8|90|3quGeHuai{4s1 z2_O=^`0T~RqN^6}Tz>LH*J@~Jl^lNTiyS_v;`Y+CSS~Abf(lKw-i)^43Dwuy3o;%xnv7hGSa|dvlD4LGCmF8g297 z77J0swh%MC3vna<(}!)uh)-|My!D0Z%`-QR*{Y>AW}n7z+R`S85u3)u($cVS?j*nE zB<>RTd)He8=66kFhG}V=X5OaYY|Jb<=E25>^Ls{!wG0kqV}$SdY78(ftE0J%$~icl(h+;sXh~eMOw2kXf%2o zS=VXQkvP)r1tDeKN1EY`b#IcJyN|Gi(3lmShb<9N~2azt4G;VE~m*lOD#0c0;t61 zLF?rpDwk9m(({Te)bpK`FVJYiypa5+N|K<@m1XLaNu=!*wdb1K3lz0(}W9V^n>_w1eI;|v+S(2Sh z^$=qBA~=iy53q_Ay_NItN9+Lv4Q17CnmhOIc?DE*k@n1i=$|qZ>?p^OrU7lL&9;|i`t`0v^?S5pHPhF4n zEV^rv6aZgj)z@819KATXq+C+ozjv+waJB#NwZ!OJ;%GH-bT#q9U)n$E`fJzUwbf@% zt{y)HjMe9!|1|N!;@I_M-@B>HsrR4z;HCFox|STida{}v{rJeAkN)xK=l-_0$1jYp z+igd@UwDHT2HvQ={n~}su6a{mI&I1HV!-4{IELS_%8EvSP;uWzo3`$Vx8_eQ2TVI) z*ZrN#kC{%uOcx2Hm!CJ?i1Q?{Yh~2*AP)D6^zup5hqxd4hdy}dy@$*I(m@hPE)TrB z@A5t~gmjn$I+qWvrS?`+d(AfFL@bS=%R^=q=@<$0uEf^*Myq|JW*j*QYy5V^J2u8o 
zBHg($ehTR{3G}Z#W_BUoO#*j+7&Chi@3r`Si0^_mtfuyx{YVTz0a$BK)jGT37Fp{* zjPR~Zt*5`%KTu2c)Dr2>`**MCYkQ7V_Z<0Hyw-p0s|TUqqU+`{j(Co9|K&&i9f)6) z-xR(Cn+#U{qgP#@`bTehNU#g65l$=yZUr{*Z#Ar4{`RnEtj+#On{zDX&UE0FzbF=D zL7-j%?#NOFLX$xK0v>n8lBihVsSX5Kmav?kgGf;+E3%5elTd?lSN;*~{$o^fowhv8uIZ{;Moysq z&mb!+iXcGAk2Mdw3Lax_rD7$=0H!eQ3&>{QoF}F&=p4QtNSRJ}!+~nQ=cXI6Ug8Rx zHkZ?T-4k3?*FyJHL-&06$lAWK>b|kn&{);;gRh(*UAH&%%rNJStsGh>@G@P9pKjur ztC6+gvFh*`ax$hX=sa-U7sM$Kz?A-w2~!?`6AYLKF)s;4Odn!?($#AQ5DSuYj~PNN zOuBm3(g&;QgIA?$`T?^InGxa*Emkg`TJFF2!iCpYoGaN653WqT7rGk1s(t+E)!9Gv ztcH$NJ;&FhsNH*lb0)5b<4gWae$$BrtfQlA$+%>gZbrh%%m5>UB-*jG?-Fj@Q;ba4hcdeu+1+B09>(n@d-mSyL)5x4 z;r3^O*6BIOQpfH8hz6~HM1#$LM1$<$i>fdqt3_F%!Q1tz%j!Tq%^pmpqBOfOAE0~h zXfo^P--cJw2k)pGt3tNi;#K^YM|&DScAkOa1TJlsOKHy)!B3I4Jfas=Eeo;#I{XpK z-yc?g4gxIsv_Sx-o#VK5o^XLL$gzKsW4CN}&UTZxakj6$uX9}DqPo_8f3^Mon*=E~ G-v0pvvGYv; literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/unquantized_fused_moe_method.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/unquantized_fused_moe_method.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..62df1b79f9e3498700a7fb2cb9136efba969edda GIT binary patch literal 22021 zcmeHvX>1#5mRRu;ui_z*lqgXLB~b@;Tb3pH=ytnp$w$kwyKVP)m_Z1dRgx%^l%`0w zErr_gEINq1JAthLfn)@Wb|XL9j-9kS*`z(2WYA=?RbprfhxnzW3elyWUm&j~0uGf+wo|?MU4ZDC$o!B0u^(u>CU? 
zMXgc{#i*iGOchY65LQRk0X2DRV%o4;Ma6W1ihPPbW(XJvtce<9rhtjS+Ne232WSH8 zqL!F7U?p%x)E2V`?6Jx~WvnVt6>|g}1h0=eW3GUUz=mjbtR_$sa|hh9+CVMA8KZSE zPryT9Q?x$T5NIH^0zDyUCq1o<7KtX&0JrK9f z2BcVN$W+RC)YNH;srr~=9IT7AKGI6L69!UnHbJNo-M{=hU$+_5tdB0YyAHQ;G>{Rglg|Xn3$@j0En7n*8 z0P&L-Cdbd77Omr{1k0SC8#_4{pNdS24mtjQDilxVku}#MA;=Ys&9T8BBR&`=xuK;h zIsbV!IWxz+3;FyS(Jf<7gp-kvLdnQne0+|Jg_2^EoN|%nVyPrMc5|NPl8GyEcwtSbz5#b`dJ13I>X(X#p}HKY4jTUZ;00=A+AYlNI8M!yLAIbVYTYA`cKz?xWk zwyv}j5TnfPnj4y)azg>A-(hUbZA=< zKc$78q7AtA)gLXRpVAUaI)Dz2C9lbN%aiF5BQc=GhPP z-1zI6+DW4P4*&maM|QzXHT`;~{tC=gH_Vjx*E983V5WLtrkX)UXqg?~*;|asd>&W9 zHV1sm)J;{O;}S|XEwktL%-vL$dt%Sr-ePXHu?YM7#o8-HCRS$1B@Tbl3&MgoM!0?N zjR-LPsiZg0@DfJ9Pt*WI9afjLYyemTf7>5`jPrL@t5mX(cw3jI{DupZPHXQFEK%BY8KlldpIkj0_jy3t5^c5HLZpch5F)p21*sw4j~nEg zXB%ax_?AbP9QOHZBLs_Yjm1Hc5{XZH$5A727BvFilUFWJoVav}9B!aP@a}s4 zK_hz5OC%$qD1g-pN(DIm{5p{?Rg{SOfHygpFYApY`$S!IE=QbTH)|yqj|NJSOIFEAUiMW8Hw~99)F^18gxqx`%xs!2E(EG(6vZ3l3WlQm8ZDS z)e1@LCK^GtG#8wUhgsAH#ly+DxoCoG!Y+F;KqV`OrzqEofe!;bO8ph0Ho-=xNPndv z77B^RU@%NHRlx)v8eV*E{m+^HWNf~FJ~H2%3~|uw-kYhB!=u+n*#2nbTL1h4T*~qO zkD}37e{7C{VY4?`(0I*p{n5|@%O&~?6zx~kRRyKny>EVj^FyPHExT@pK6#GuluZ0S zDAE4n7PV=x+}863_nKqF(D1T-DBFIPZ$G=<{@${7!_X?2ZCP{sb94I(dVH(r)bgpX zX#ZBv!1AdL+W*QxS$fy$zHcb4<%DX}?#$W`@b&}WXfV2Ecdi;%3|V^*Z|}+45Aybd z-{>$QS3y;}N|T0o`_NMrZ$JEv9@C-L%I2(nn70qF+ee>P^7f-of6Uw8$(bpmWy8>& zvr@D@Ywmk)?%SlRvUEL9*9-OwuQY0l_1g-{()DNb_%FH+2=?~PhQorp{u9GJL)JaS zyN9ywW4!y=|268BMc-{j0(9h@WkWdl+wJvoXI)|c;`UYIm$anpPqQ`eCwsX zI&1gxc5hCtcAdiEds=cv7~{8;uCgjeQ&rWg?iF{ovX`&y%~l@bD-S(YJ+C~xDVGzR z7lgXzoDPb-qBKy%Ky~+%er&@uQep+iCR1EIoL6Ndjy!~g!_B-(EvO!ye^v ztAM$MJ4a@^E&;tJ{Ti+tigBp?6g{!f!ZkJ+3tfj(N>X?Sr(|`vn$VUTg^UTT(|e0h zm>UOp@zCyc-J{MqtP77(*r$b^C^%|fI@~KGykmIXF`A)A$@oQkd57Sd!1E)FbdVpM zF+!J*US#05njC1xMlS}u?*F!!@&Id}2R0F`IyVqAub zY)X;2SI#+GV1MPZD3Ht|Pf_;BGpRiLkJXRzw7EpRq0buSo|N>tz~CsiB{STYD8F`c zu|~;m;cZKREyfxP-3)>mB$%AL&c+j>4Phpdm?xGFF2Oa!7#Hn33>;X}EZXtz+`-0F 
zh6?taAX-{NNe&4wTJz244JI72y`qj`ucfA?YYpWi4nPU7y(O9lnp0?(`+mjjKt=ni(R{~`0~6dtZo&8TU5?O!98O0@it$^Hjpt4K)6NM zXX!qk?%T9=WNpK|ZTK6F$};i_4tvL)iQ5yI+QBEMzC82AnGO1_O-F~&)cVQIdpEO9 z!+g`Q;2p?%-{HOQtb5-T8k(~WgM7o_E4|if&B3i@eMJ}Q=zehd^Q#Z9ZqUP<4)1+u zwt1Luh7C57p+}?*X2zMw+mJ-_OoUl1%ahPJ|-DKW12`K zOA|{3ndHJe8!kw-Fk;*)xJkChftF(*wn=`|Gn&U*dC4ysV9Qy`%YclkOB>`oAl(+^^t2&eQN$~65~YpJ zUHM+)QqTgBap9=)n}5fsjJu4gB5f@8zGPI!(oub{o+{4($l$xqfN7V$>eD90zM4w= zx{n$7@OzC*c?P6)r8EQPT}EX{mq@ZDGhi+q)dICZ`HhobN4b!3lVKkYG#v6k&QV$^F{3b~-v4T7Vr)e-02Lp5}D(Zkg56_6| zcw97I#LdmZ2_Wj&*gVJ}RUA7VNuar(?0Zb&a3hEYG;EWmbc-xEm+;flp^E1z_dW(6 zU~mb8%MgfF?9C(>+F?SM`ktDHOCiSn5OW4F_-hz|c8E&g5fA5b;gJq#YnWs4bds#` ziN(&{kLR5^Z)2S|fn9*Zxe8eCJEONpv*woP<`$v0F4J&~uRV7A+*a-A^0}9;y49;I zSF^4T-qo>5SKfK=_IrZeBiQROtZsQ_)LE>+@Or#C8&%!8O&O|9%OF>TH&VS^AOv}d7j(Q$Mpm*5O+Kg{_!!WY5qt%|(nU$GrbthllS?s9i>BNTptklsO zKXkOlx_r8{qd$D2-=Igaqd&;BkA6iT&RHnBLAre^q>0<1sKtu}x6pQ!aC1ojrAP6? z`1>Wted!gHFC+6x+#T3}3YP^_Q28=y!sq>sUGiyUUI)_gYAbu>cJS(Aw2ROaMn~$q z#8kYY1nlZ1b+T~j%v#H;hG<#^7mymuVsG!tl>Td!b)u>CVz176C1zxUL zzuaq3{i=Y6gCpjD!$b5~U;iaQ5m1;?lGciX$q)^w;Yot;O`qS)^+G+Oi)cSSxN!13 zQS4kiaq86B$mNXVZ6i`}~)0Iy~Kmr!%!cObY$q0nEMrQ4s= z?VEOQ*51k6J2&iIg1vgxvSL}gwr+2}|Ne7(yHHuP+OX2F#;sSj-2X7!Hp;h+uD2cG zD~~)4^Y#j`=q1aNRhdqT?bvq7`{s7+OO?q}a&MIE+%S3{*ab~# zOWF$DII))%aAhN*PeRpBiUT%qe^J$bF?OD@1q8O=LjsL&zZ;8Z{^^;fZ`1yo1eX zPa@jV@O9Ci*XRU+TjP?!2!q06cs^D11qOr@xUBH?Afgk~(cpEKi?dOGx1YRJ0(NaR)JpaC*A% zObTp}nlLzGn}DB8My6A9sf5HbRshR*b3P?{Q1^s3qafJ?oo4xSWUK%$SpJ6)ER=R@J^EF_uCjYKg)s#OI=U7SF=>*Qs~r$HlthY-+%Vx=@3%#j};X#B|t zAvKSYY!Ar(6omg0Aa9QN4M<)b*`4{{b2a}bD4h69*hodgVebQW!!QJ9t>(6@cbNAM zKS^arPV*zDvm+Dy$i%w${IV`!%)8^+Z#7KhqImIeCN1ecS-iMti6r5x7|;D_LEP4lI=LkcO2c;sQp%+Zd;yM z>jpWx?ZEPwWDne+n}ycC<*_>#ZeQ4-y`)k!7{07+{KRw5bN@=V?GWE~C=2pP+mWXy zo*mez9lL#Q`P5zCW^?;zwV&2LIQ7K$;MmjmH=2)U=tjXeob{dHeJ9p^Crg@mY4(63 zgg3Wrn0Q>{)mA{E@ro z!IdXtPY-|P9z*!@lcA^XuiWpx?CkyA_|W({{gBRfp5r^u3GRlhyOVc!;>=WMoNcfy z>U*=E1H9+Jy60fo${3k^?wQOuCo`T&!C9Mi`go@=>+Is4U77A<8_u@`w|}#v=d*=R 
z7d}gWn$C8dRKpX`Jrfz{M8-1#wXa@SxsY}G zd8a?qb!5YNROlMXcAezAPCgr3?>cvP>?>!-7NSn^U8kO1S?_vJLbV^vwjbx)k3Z9` zw~yb|ePwUWy+v6&B$bQ`HaG|VbmXv~jwBPP6`T)V7tQc}3z}Q-D;Uv^x6L%6NVB>v=@yEc}&%GbnaRxN^C%~wZS9-AeYaJHDU6x5aN9)DGkmG#K-{7 zAe@X5U8HABFae$-1PR*9IZmtr`vhbImx(#}3WpSw6q5CDZbcVJtuUVaH#=~hfK@gH z*H6)s1gZz(#X{`G^7Tn>yWmF=_J&vk4i*t;6vi5ypIJx{7(4Eg6Tom91uS$S6;Fl> z(i$8s=!1t`<@O8&U=69{=yX-y9O=lvo%uI43woCF2jxoiTLU?)0<0H?v=O*fbDAa` zt37Q44z()nNIM^^9_4vcQ0Ay7{Z8&7RK+2e%KZZjkOk)+WAG0#_(vFgj=>WQzQEug zWAINf_+1SCDF$C+@Xs)Kiowq@_&p5%IRwB+=;3z05#pHAYes7N`?N|LBuy){n%M0= z$Xo7oWa1Z_cE1rw1~X7Qfw%H4>X&^KT_ZSJGmicY-4CJQ=*l>TGxRWof}=Cz7|PH? z5Q1mGo%HRrV6_QVH9}nnhVE9Op;xGB#<1xC0=}a{ZJW?Cf&fJQWBEJ~Z5he=DSQ1k z<+7V{l*?#(UB!b;6j?Hh0PM?v<}K<&m4yVr8lcj$M;7eCW?W#BFHzhWWb;=@99I}% z4ihESJZj~MKaoH0B7n~_HDC-iQO*IFQHjW<`?3aVB19gCs~`M<3p)?gt)L~5WxgfX zjuZ_;6;i5!m{>#E1N&7+_N%m8<(*2+;FS?)lBiE7Pr5Ey zknZFy!s*XZj1iAge88ooE?qmNzC@+#;M6ruiZ(QY<>e8)?}&sZ+Dox$F!#Gei)?OB zf|Qbwj%ac)7tiV95v_1Ai*zK;fTv(0h!skQGdaX}p3u_i{;#03+`ooEbmQ$T?aRar zeBH{(!js>?{Ts}nzHtEl21S}&>3IQAG>}+?Nk|t(;ZpGHNh$8%VFQ1N!N14gKVVSE zG!6y>tXnCUsLwk|N!gn*8#(+7x6yLHvxfYiXC^o4|0WREmB>q zDqU+}9Zp4)=ue051S)My#HVslGON=03mv{uWJ|EPv>%Q0Kub6s}F_7L~5h(#x~9AzM4Z z*AC$QSxAL@6|UL(ZYr-E~NH_p_S zwm-_>7H~3ZCdU{1$H>R(UeqFaK^zFh!@$LR!MPm$$P3$*IGBU_q7T$sj>);?S@66f zhC`Ma<2YdPQ62NJM5popc$2+}*)$in!F+15?K|w2`P65)Sz<5@U}XjM37GBXN@O%EWk$1br$Y0{&yn?jy18G$df2%d%wRFfa}TfrwEJ8ES8)ZX z$e9Q{=2y@`?zb@b&k%g)DFJQbcbO8%Cg79=pzeK6iI=x|mFL5E6b=%5GaMvHf8(C? 
zZ-mx?{u)lw(Hnj$z4iIGIsdl7ewFq3be?A^-LVBX1=z9H9eY_@vX||Wb3(#&j!N7A z9>rArj=F@Ou`qg~H8%sFV@O*_x~3$3?=!<#Y$sVbF$njj1=^N9%G@rIbWz5wcx)3%wM@O-9ia@{aVn)f}>Jhyd24Mvk- z^<~F{kHMRNp2Qq23~&*FYX1x(7N4#KAGzH;TV$iMuFdamB zI+{<>2ZPL9I2h#cuqEHihz8JjgG>N^d)jO85fJwdrkun8#XW8u13dG&AOvE?c~UWF zLCi(OXutwPOcmTT$#5AAF|c25US#i%WSjQlX z!5?Ap3k;AYAm$fXHKJ9%I^(dc!Orp9nB!y2Vab1W6lSB*U=Z#BG-HzwEZ&j6KtREa z!=Min(lGSD_?Y^IwkxMSsOw*w{FZ`O4%BHC)|^_UbFL-7q2RTxkrA+>c2Cmk4rh8# z|DliXg^D1OqpNhITXjP@9lW8W-F4^Y?VGp)a(cuVD3gENh*0OC?#LD_4mnu|aVSt- zo6{p?K*)%YiR$XjnGvEX@BzHOa(&x^u^E*`SN%YT`sW-4;8WKh)c+i`MF8d+-_o5{ zJvsE{=oh139{=Kaj>7n}s280@6V4u~0TK`1LHxK2S5ap#hx^vA4wLrVMR)l=m zt2Tt%!5{OJo_jqHoP1qxt^-k>JL>QwstX$UBz-UafZ>}4b4ZR{4`s65slQ#nrdwk_ ziQbDo82fzU;Y6l!ly@KIjfb~;G1HXFs2dO@$I#W_N-(E^_?EGLEy){Ow{;MK4z#p? z=Xv;%N>{gIA`nqB6C`RtLK_Jv%>w~xd4en?YDEgz2w0pB5-rXLiAobfz_J-3xZ;$M zs5B=8lqQ9MWwSzX(zFmznim2}6GOnVnIX8csUf(sxgoeWP7cX6q1g}Z`^)wK#7iiT zF^TX9Npwf3NOp`#R7a>tbc_{gjxmYk2$fMBQAL7dOrkeJ61fp7qc)->Vk0Ec8X<|) z2$fM9QDuZiR2iKS^+qydCQh{RHQP-Bq}3Rq%y{eRK}P@ zWrWJ8jHn`&F(y$NA&JTel~EZ{5|t5>sEm+AWrWJ8jHogyBdUzbh#q%y`NDkD@zWkeOJj4_GI2uV~%sEo>plBkT3 zL}i2|DkD@zWki)x8Bt|aM${XrjF~{?VQ9+eYqI)AUf;N(_X@SXpIHT?H)E{L8oj*H zyJ3V619rh})wcc2N>nX`9f4miv4!X0Q*UC2O!S4Wg$c97A-g3yh!0C&JPzI^l8G}E zm3S~y34+6{M&eI`ADK&zA~q5){^X+MYGWo?>HBXE9dN}4B|ntBeRXc0L-T^zt(2$S z?WGTVPvG|$rLV>bYe_ge)ES6JzEcH7IYj!LJ9rI#n~^hP1qZ)V1%%-Zae3z?@}Zx^ zq#_q9)+*H@x!RB~Q>-Li{z=BXkCA&PvX6Y`HU;14O0GOEl1wV;D}Hq^eerKWry)>@ z2EqD#ofRvk;a!t{t`UF2kwg0{SAzj+StKp-|G=)Aq~8D_!q^rj{Z|aom?awI4{ONs zk-jEAf~ki|>M@Dwf188fk3lbdD)GAzRSV)YWq`){u8PxLpA*w<$ghRy`WlO zP=kN2)~VFsZ=q6uORH7tKd;cL)W4)4_|`(<_%5j z3+>3CXh$;Iku9hD*7+?>9Z9U?HFX=B`v0OmyjA1LXq~?>I`3XxH#Xj?fWt-YxaH2O ls(4jZ#?iH|>VBoZWKgw(=Jm;$Z3;nPxYa$sBv`UX{tv{*L974( literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/__pycache__/utils.cpython-312.pyc b/model_executor/layers/fused_moe/__pycache__/utils.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..d549eba8427d3eef31d8bfe39b3b8d2f944abdee GIT binary patch literal 13285 zcmd5iZEPD?a=ZLo@k10PQeT$km1WDKtdCf>EXP)yD30x%EZcGHoKx8eL2;K7Eq-}- zCC56dQPL|y!Pif>vxc)<=e~) zN2}{$;ZzdNDd-4zt{-2WY|p^c+qPG2r~ zmK=mWgzqArq?~5#`t}yg5nEIBrjlz1*}FMc`4#0lHoIe;75AfN;J+X-_dp5x45Ag)G196dxw^5lqZf??!ni5k$vUS z`w@-x2i*L=;9jL?|1=LBYovf|kOaRJ4gpb}FW?c!{UINS3fu~WFe)$SpF$G92%`|Z z<8JIX;0_D!5PyXit%NOZ2|58KIg7g0Hi zs#P#*%FSGE{%Pte{J9LL1YBlNMLj27jqqRm7B0|aUC<*=$ojA}&I@0l(zwh5&ZTSu zdmIxyLARfi4IaT08slaCG%tijSr4YPw^!B^q=2%@xF<$C=ZL)sfhvDL;q5|@__-umNIV_(X)Kc-uA)-9f2I3Isy z*|GBC1Lrd_(@-4yH^Q;gYYkg&*4?N}Y+HW)=c5lA&cud3Z*02R zexp5caCu9rbzsGF-=1zf60>D0?enKom8~&thH=IlQ%vieHS44*_hsuTre^-as*zn| zOxKw?CTVY5+LE&G`@7K(efNATXMZ~JFU+%9Eyc8~!xT_5lUr}uj`I`d@Dl54zmGdL|oQD8P zv?y{#bv_!8AFu;JzmM-Ls)=ei-6S2=zpuZm?ol+fI}2XMV*xK?LPI%%YRrOLVPS1X zd4ZL}@DKaA$a+N4KNjLSMDZT)IIFB#w}~`-0h~>cm%u&_lIJcUQyd7}PPWkQbHF4l zxDK)kz=Cgx_(Nj~FVY8c_DdpZAMlG38}_N4h}|mFM>K|Q2e$@HPbuOSTCn~QjKeRo z;O{1K%?mDsZ9g56{Na!%aG_YdSAZ2e$Vx=kf{4WX0Y}d7C~}Z7MPT9r0Y1Qr;~s%iXY)m30P0lc#ev2mL^8NErDoH~ z5*MzDeZl_jBI!z5N75B`{^(kwJbtOLoTvuVPn-kn1TTa@FmQJw+qDNrCGZh} z7kN+-e`v}R@S`#)s|=fxf#auOegxD=KLrMJ%B3lIf7qe0LauLmpSrul|BQ_+}s{`Toxr~kh1<3k@E`bw*5FeG)(EJIng&OWzhcg(-M`0B!| zY3I&_l(u)y4z1bi9@=-M?7NmsY5U&Uq0eN8Ar zrmkhlk*w>=)NK9Q#^?=M2XId6nuLCkP?E$X=9(m80>zbml#pAgHh4wur*Vmi020eOl>D7NaK;=wB@#_b7lEV!V+)uO5?KYs=F+%I*r z>^bxve#P8z%U2A5SRk>P5O}*F&-pzP&*ozS)(ftYSN+wrhHanLyV!xJSebU6a_I$} zaA6P%2)D*P5neXJDMSK^WF7Vm27{|99D_o{<;*Fo)eUwBw(lnFg<+@~LmCkZh$>8! 
zrD~qO)<5rA99tNB+rMhwmZda$L&j{0wZ>k2M>l`wZDZ_eTp#z`9J?_V|D(j{lKq3a zJ9T#(mdwk0l1$&a7RZy(%s^Ro#r9l(a_-4%hvu*T#1Z$rTbDGqJT$kZ%xz0_(%g~M zbr8jq=|R~z=oVrB=4B_Ewa+bH@lQ_&AUuLtDLg5aY~i@^;`rB_p(s&FP_n4z{e0LB zi>EnthZar)oq(abODmj2_!Jh16_HQgl5OtLRPhN;coFH43_XrAJ*^q0BG#Ymcp9wJ zVjz>F48n>hPLEeoJceu@1+P8fCX3@OR_(m>Iwb(n&TcEHWVyftHsz90U4-?Vp=Q7^ zqLB4EOGPz;W{4^dNTS-iz#A`4gGUjO)JL_P7RLd19o}d35Mbm@1-R*RK}4h9=2$HNo%k%II~RoIiBkz$Kt(j4<(FgTl>=S4~FjyuT-Zy2UcwZ zNoF8v8~6;Ig3=wafbv@YUoZd&v$eBZqMAoSXjMybtT~O%>E0vSjh0HTZ=v0|E=rqn z+Fhjkm-3pPYk>#&oJ=L!{aDJf+I^X*Tu!?&5%v34@9DmBy}%A7PDUbOK@t^fV8a6I z9ruJFIP1FN7kS}4jDDl$>jj)AVFU{deTbe57Z8H%3H693SGuw9d=^ab0MTC};VmdY zm75e*{>0XlWL`=(zm#TPip{J!T}v%#XQ!&kRlkEK_bk)P{b}c+-$9e*ni5~GjHN`q zQ5o^K4&Ifh_Zx}}E~C&hpxhjVj`|LLO28xG9bJ)o7zG4z4>!#Y7wexr^{7?m#m3+Otdds5&g~#k z?dHQcO9>n_5-@Wo@K9`!*FognkSi6&f$(QY+zo{ybWQB(q^>dc^e49MN!|9Od3(lO zn;*xTxjBA0ZEi{GS{{`iCx}dXCjjVE9+j4(i&BsIiFavF$&4?u-_7q0_CgNKtR{gm z6DHs>eDeV*Z3?RZ=}79I;J3aDibqz#LRMkv-~(p=W7pw-VI;l5y}|^F{*CejWkLSS z(BxmSSqKXf_6GnWVHbuzWPi&?UtTo$+=Vci{$bE^n0}3vtD(w;E$9k`%0iZegfZHv z?k)}#@CDxYU5OJz{0YSh7iGCtB{E&wu~f>SC+p28WRmi5H)mD>T?gineZ=3!V;mq|E` z#8w~VUcukuI|^Tj1P-zeJuq>+N(76pgP5X#>MNiQz@hgf!Xy$#4G{8XIE#t^6QuJCS5gKyBQ(TCr`-2EHJ%D>?Z6 z2e$J`=KMzd?!))ZY1?p;8P3%}cvQ6`F_^fLeB#Lc3u)V#By$GqEUOhe)@-f+j#sccuRUgDyUVl(U6Gcq6=A{-j$jhp6iI)6gPs1JIdz z71>VtOrp9MMQKi}yyPs$rMxs?J!)&P9?;Sp2RKmarVo&CW{?Hn8Jv`q2p!(dN&XNd ztj0YMafgKo5rcLNGlT4+2LL#+F%)BcXlzayL6;sFTN0kzW4Fd0 zw(U=~?O$nGZ99~)Rb8K&oBGMonRnxI z2T-AEZMZlZQqLL%i2(&JOIbQhq~xRC0uwI3pru@3XDmA8Xh}waOgRbgT(s~fz(Yi_ z)v%;C1Sd2|*g*zC8MSi475)@RoIocI129WvjMk*FA!D{*ADA1sb~t`!)!dwMZdtsr za3Nt&45poJv16DasGdCu8U5NOQc<1w{_P95E-Y97wL9(XiyedJvnQ23$^~3Y%nxHy zvKg76Q!>A^Fmga{ykK5o-h|8L=PN(^11DWA=MUyiu{uaS2aA?ju|i14 z@*W8TcU}nao+(~gehpcE0Vlu2-kJ+aFzS&YO~sWe!!>BojLLwP3L63#VI`CR#kHecN#@nyPj;%QxA38fz&d#OpFJHLtdEk8hq4Q+Qc`{SA1$Ge{(o40KpgR!# zqVDK(y0c|zbh+)r-S>8{jQ+GI?R++N?6bV~$h1c`dK3{$qWHWLttm=G&(c>RS`gde zxY`DZ48U%@@(M%Z4zU{Zn&mwHyLrQYlkgP$<>@%`SsN^}YTU*BDRm@{sNSSCd26G% 
z4zYhTSrk5)DScc&P_KJjFHo<4&yX8G8I^WcNi01^`r`Pn>=>2zPFjxKbI!@9nb|~^ zD2dU;+ResFMyY$3dQV#{DZc+Exg4D3X&w92Q|z>!#l$V#;9(CW1_z-^_#PJc+6+>x zKSKD=uo%UHq;ME~GvJkYPDFDp3Xhy_@)`qDxVaD-v$DcGP<5jv{00DWTwrXouVm6(2^)v&!PwOWYO)5qFObT;2xBGAP zFYUQ?aMjkeyz9fRdtLYGRPVF*J*&OPAs1O;DyB|WvI$+{%yn0YQg-;rDQIKR$PteoD}-d`i$=oG$HFz9zsFC4oqGQal{{8b_L@b@g5_5>d|cO38jDJJ zcqhFzQ(1F8Fc-*b5$@?#?#~?k2g}baKbKO^oF$0Q^rp8izHxC~4@B7)XolWB|I*^c zg^O7Vzy!Z^WVvSP`CDP~xU%!(&W}3Ru_jw?^d>fIM2Mjb=2#yFuyN<#b{s5Qq2ZYKWp$+^IwHbe`J52an>6*O! z0GILy1n_Tc&KeQgz#maX{2fX2)|9?^-HiBl!n=arp63m4DQ`fiyb;>K8&O5Pi|5{q zbu8guStL8arThV*@<(U`e?%4W&&!VZj^88iPT-x{-u9)nfo{xC4Zx+M0|G>^u8r&Q zi-459HQ`O!U0EaI8C;GEgiIysWJZ()6&HDDD>o?Ck}lQInKXB%^j+&VBsgHCx6C^i zn--d~6o84Nw})>H6L9&Z4=>)kxQ^JY!$4Oj`fd;08d%_8`D K{}K^o2mWu-&b283 literal 0 HcmV?d00001 diff --git a/model_executor/layers/fused_moe/all2all_utils.py b/model_executor/layers/fused_moe/all2all_utils.py new file mode 100644 index 0000000..2dd6250 --- /dev/null +++ b/model_executor/layers/fused_moe/all2all_utils.py @@ -0,0 +1,160 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import torch + +from vllm.distributed import ( + get_ep_group, +) +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEConfig, + FusedMoEParallelConfig, + FusedMoEQuantConfig, +) +from vllm.model_executor.layers.fused_moe.modular_kernel import ( + FusedMoEPrepareAndFinalize, +) +from vllm.platforms import current_platform +from vllm.utils.import_utils import has_deep_ep, has_pplx + +if current_platform.is_cuda_alike(): + if has_pplx(): + from .pplx_prepare_finalize import ( + PplxPrepareAndFinalize, + pplx_hidden_dim_scale_bytes, + ) + if has_deep_ep(): + from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize + from .deepep_ll_prepare_finalize import ( + DEEPEP_QUANT_BLOCK_SHAPE, + DeepEPLLPrepareAndFinalize, + ) + + +def maybe_roundup_layer_hidden_size( + hidden_size: int, + act_dtype: torch.dtype, + 
def maybe_make_prepare_finalize(
    moe: FusedMoEConfig,
    quant_config: FusedMoEQuantConfig | None,
) -> FusedMoEPrepareAndFinalize | None:
    """
    Build the all2all prepare/finalize object that matches the MoE config.

    Args:
        moe: MoE layer configuration; selects the all2all backend.
        quant_config: Quantization configuration. Must not be None for the
            pplx and DeepEP low-latency backends.

    Return:
        A PplxPrepareAndFinalize, DeepEPHTPrepareAndFinalize or
        DeepEPLLPrepareAndFinalize instance, or None when all2all kernels are
        disabled or none of those three backends is selected.
    """
    if not moe.moe_parallel_config.use_all2all_kernels:
        return None

    manager = get_ep_group().device_communicator.all2all_manager
    assert manager is not None

    # TODO: could allow this now
    assert not moe.use_flashinfer_cutlass_kernels, "Must be created in modelopt.py"

    if moe.use_pplx_kernels:
        assert quant_config is not None

        dim_bytes, scale_bytes = pplx_hidden_dim_scale_bytes(
            moe.max_num_tokens,
            moe.hidden_dim,
            moe.in_dtype,
            quant_config.quant_dtype,
            per_act_token_quant=quant_config.per_act_token_quant,
            block_shape=quant_config.block_shape,
        )

        pplx_args = {
            "max_num_tokens": moe.max_num_tokens,
            "num_experts": moe.num_experts,
            "experts_per_token": moe.experts_per_token,  # topk
            "rank": manager.rank,
            "world_size": manager.world_size,
            # dp_size actually means tp_size, bug in pplx kernels
            "dp_size": manager.tp_group.world_size,
            "hidden_dim": moe.hidden_dim,
            "hidden_dim_bytes": dim_bytes,
            "hidden_dim_scale_bytes": scale_bytes,
        }

        dispatchers = manager.world_size // manager.tp_group.world_size

        # Intranode pplx a2a takes a group name while internode does not.
        if not manager.internode:
            pplx_args["group_name"] = manager.cpu_group.group_name

        return PplxPrepareAndFinalize(
            manager.get_handle(pplx_args),
            max_num_tokens=moe.max_num_tokens,
            num_local_experts=moe.num_local_experts,
            num_dispatchers=dispatchers,
        )

    if moe.use_deepep_ht_kernels:
        assert moe.dp_size == manager.dp_world_size

        # The high-throughput handle is requested with no arguments.
        return DeepEPHTPrepareAndFinalize(
            manager.get_handle({}),
            num_dispatchers=manager.world_size,
            dp_size=manager.dp_world_size,
            rank_expert_offset=manager.rank * moe.num_local_experts,
        )

    if moe.use_deepep_ll_kernels:
        assert quant_config is not None

        ll_args = {
            "max_num_tokens_per_dp_rank": moe.max_num_tokens,
            "token_hidden_size": moe.hidden_dim,
            "num_ep_ranks": manager.world_size,
            "num_global_experts": moe.num_experts,
            "num_local_experts": moe.num_experts // manager.world_size,
        }
        ll_handle = manager.get_handle(ll_args)

        # Note: We may want to use FP8 dispatch just to reduce
        # data movement.
        fp8_dispatch = (
            quant_config.quant_dtype == current_platform.fp8_dtype()
            and quant_config.block_shape == DEEPEP_QUANT_BLOCK_SHAPE
        )

        return DeepEPLLPrepareAndFinalize(
            ll_handle,
            max_tokens_per_rank=moe.max_num_tokens,
            num_dispatchers=manager.world_size,
            use_fp8_dispatch=fp8_dispatch,
        )

    # all2all is enabled but no known backend flag is set.
    return None
cdiv(G, 4), 1, T) + return shape, strides, torch.int32 + + +@triton.jit +def _silu_mul_fp8_quant_deep_gemm( + # Pointers ------------------------------------------------------------ + input_ptr, # 16-bit activations (E, T, 2*H) + y_q_ptr, # fp8 quantized activations (E, T, H) + y_s_ptr, # 16-bit scales (E, T, G) + counts_ptr, # int32 num tokens per expert (E) + # Sizes --------------------------------------------------------------- + H: tl.constexpr, # hidden dimension (per output) + GROUP_SIZE: tl.constexpr, # elements per group (usually 128) + # Strides for input (elements) --------------------------------------- + stride_i_e, + stride_i_t, + stride_i_h, + # Strides for y_q (elements) ----------------------------------------- + stride_yq_e, + stride_yq_t, + stride_yq_h, + # Strides for y_s (elements) ----------------------------------------- + stride_ys_e, + stride_ys_t, + stride_ys_g, + # Stride for counts (elements) + stride_counts_e, + # Numeric params ------------------------------------------------------ + eps: tl.constexpr, + fp8_min: tl.constexpr, + fp8_max: tl.constexpr, + ceil_ue8m0: tl.constexpr, + # Meta --------------------------------------------------------------- + BLOCK: tl.constexpr, + NUM_STAGES: tl.constexpr, +): + G = H // GROUP_SIZE + + # map program id -> (e, g) + pid = tl.program_id(0) + e = pid // G + g = pid % G + + e = e.to(tl.int64) + g = g.to(tl.int64) + + # number of valid tokens for this expert + n_tokens = tl.load(counts_ptr + e * stride_counts_e).to(tl.int64) + + cols = tl.arange(0, BLOCK).to(tl.int64) + mask = cols < BLOCK + + base_input_offset = e * stride_i_e + g * GROUP_SIZE * stride_i_h + base_gate_offset = base_input_offset + cols * stride_i_h + base_up_offset = base_input_offset + H * stride_i_h + cols * stride_i_h + base_yq_offset = e * stride_yq_e + g * GROUP_SIZE * stride_yq_h + cols * stride_yq_h + base_ys_offset = e * stride_ys_e + g * stride_ys_g + + for t in tl.range(0, n_tokens, num_stages=NUM_STAGES): + gate = 
def persistent_masked_m_silu_mul_quant(
    y: torch.Tensor,  # (E, T, 2*H)
    tokens_per_expert: torch.Tensor,  # (E,) number of valid tokens per expert
    num_parallel_tokens=16,
    group_size: int = 128,
    quant_scale_fmt: DeepGemmQuantScaleFMT = DeepGemmQuantScaleFMT.FLOAT32,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales

    y has shape (E, T, 2*H). The first half of the last dimension is
    silu-activated, multiplied by the second half, then quantized into FP8.
    We launch a fixed grid of threads to accommodate CUDA graphs. Let `P2`
    be a parallelization factor for persistent_masked_m_silu_mul_quant over the
    hidden dimension.

    Let `expert_offsets = [0] + [num_tokens.cumsum()]` and
    `total_tokens = expert_offsets[-1]`.
    persistent_masked_m_silu_mul_quant launches `total_tokens x P2` number of
    thread blocks. Each thread block contains `NUM_WARPS` warps.

    Every thread block needs to find its corresponding expert by warp-parallel
    scanning over the `expert_offsets` array.

    The i-th warp in the first thread block processes
    `[i * warp_chunk_size, (i + 1) * warp_chunk_size]` groups
    sequentially, where `warp_chunk_size = ((H / GROUP_SIZE) / P2) / NUM_WARPS`,
    pipelining loads and computes.

    The shared memory layout for 4 warps with a 2-stage pipeline for SiLU V2
    can be visualized like so:

                         stage0                              stage1
    ┌─────┬───┬─────┬───┬─────┬───┬─────┬───┬─────┬───┬─────┬───┬─────┬───┬─────┬───┐
    │gate0│up0│gate1│up1│gate2│up2│gate3│up3│gate0│up0│gate1│up1│gate2│up2│gate3│up3│
    └─────┴───┴─────┴───┴─────┴───┴─────┴───┴─────┴───┴─────┴───┴─────┴───┴─────┴───┘

    with the main difference between V1 and V2 being the global load
    stride between warps, and between half-warps. Regarding the latter stride,
    we assign the first half warp of every warp for `gate` loads and the second
    half-warp to `up` loads.

    Let NUM_WARPS be the number of warps in a single thread block and
    `GROUP_SIZE = 128` be the size of the quantization group.

    Args:
        y: Activations of shape (E, T, 2*H). H must be divisible by 8.
        tokens_per_expert: 1-D tensor of length E with the number of valid
            tokens per expert. It is converted to int32 on `y.device`.
        num_parallel_tokens: Not referenced by this wrapper.
        group_size: Quantization group size. Only 128 is accepted.
        quant_scale_fmt: Requested format of the returned scales.

    Returns `(y_q, y_s)` where
    * `y_q`: torch.float8_e4m3fn tensor, contiguous, shape (E, T, H).
    * `y_s` is allocated from `scales_shape_stride_dtype` and depends on
      quant_scale_fmt. With `G = ceil(H / group_size)`:
        - quant_scale_fmt in (FLOAT32, FLOAT32_CEIL_UE8M0)
           `y_s`: FP32 tensor, shape (E, T, G), strides (T*G, 1, T)
        - quant_scale_fmt == UE8M0
           `y_s`: Int32 tensor, shape (E, T, ceil(G / 4)),
                  strides (T*ceil(G / 4), 1, T)
      Scales are rounded up to a power of two for FLOAT32_CEIL_UE8M0 and
      UE8M0.

    On devices with capability < 80 a Triton fallback kernel is used; it only
    supports float32 scales.
    """
    assert y.ndim == 3, "y must be (E, T, 2*H)"
    E, T, H2 = y.shape
    assert H2 % 2 == 0, "last dim of y must be even (2*H)"
    H = H2 // 2
    G = (H + group_size - 1) // group_size
    assert H % 8 == 0, "H must be divisible by 8"
    assert group_size == 128, "group_size must be 128"
    assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E

    tokens_per_expert = tokens_per_expert.to(device=y.device, dtype=torch.int32)

    fp8_dtype = torch.float8_e4m3fn
    y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device)

    ys_shape, ys_strides, ys_dtype = scales_shape_stride_dtype(E, T, G, quant_scale_fmt)
    y_s = torch.empty_strided(
        ys_shape,
        ys_strides,
        dtype=ys_dtype,
        device=y.device,
    )

    # Both UE8M0-flavoured formats need power-of-two scales.
    ceil_ue8m0 = quant_scale_fmt in [
        DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0,
        DeepGemmQuantScaleFMT.UE8M0,
    ]

    cuda_arch = current_platform.get_device_capability(
        device_id=y.device.index
    ).to_int()

    if cuda_arch >= 80:
        torch.ops._C.persistent_masked_m_silu_mul_quant(
            y, tokens_per_expert, y_q, y_s, ceil_ue8m0
        )
    else:
        stride_cnt_e = tokens_per_expert.stride()[0]

        # Static grid over experts and H-groups.
        # A loop inside the kernel handles the token dim
        # NOTE(review): the Triton kernel derives its group count as
        # H // GROUP_SIZE while G here is a ceiling division; the two only
        # agree when H is a multiple of group_size - confirm callers
        # guarantee that on this path.
        grid = (E * G,)
        # strides (elements)
        stride_i_e, stride_i_t, stride_i_h = y.stride()
        stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride()

        f_info = torch.finfo(fp8_dtype)
        fp8_max = f_info.max
        fp8_min = f_info.min
        eps: float = 1e-10
        assert y_s.dtype == torch.float32, (
            "_silu_mul_fp8_quant_deep_gemm does "
            f"not support {y_s.dtype} scales. Only torch.float32 supported."
        )
        _silu_mul_fp8_quant_deep_gemm[grid](
            y,
            y_q,
            y_s,
            tokens_per_expert,
            H,
            group_size,
            stride_i_e,
            stride_i_t,
            stride_i_h,
            stride_yq_e,
            stride_yq_t,
            stride_yq_h,
            ys_strides[0],
            ys_strides[1],
            ys_strides[2],
            stride_cnt_e,
            eps,
            fp8_min,
            fp8_max,
            ceil_ue8m0,
            BLOCK=group_size,
            NUM_STAGES=4,
            num_warps=1,
        )

    return y_q, y_s
def estimate_expected_m(
    self, global_num_experts: int, max_tokens_per_expert: int, topk: int
) -> int:
    """
    Estimate the number of tokens each expert is expected to receive.

    The estimate assumes tokens are spread evenly over all experts: the
    token count summed across DP ranks is multiplied by `topk`, divided by
    `global_num_experts`, rounded up to a multiple of 16 and clamped to
    `[16, max_tokens_per_expert]` (the upper bound wins if it is below 16).
    When no DP metadata is available, `max_tokens_per_expert` is returned.
    """
    dp_meta = None
    if is_forward_context_available():
        dp_meta = get_forward_context().dp_metadata

    if dp_meta is None:
        logger.warning_once(
            f"DPMetadata unavailable. Defaulting expected_m to {max_tokens_per_expert}.",
            scope="local",
        )
        return max_tokens_per_expert

    # Every token is replicated once per selected expert.
    replicated_tokens = dp_meta.num_tokens_across_dp_cpu.sum().item() * topk

    # Assume even load balancing
    assert global_num_experts != 0
    per_expert = round_up(int(replicated_tokens // global_num_experts), 16)

    # clamp estimate
    return min(max_tokens_per_expert, max(per_expert, 16))
class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
    """
    Batched-format experts that dispatch to BatchedDeepGemmExperts when
    DeepGEMM is allowed and to BatchedTritonExperts otherwise.

    The Triton implementation is always constructed. The DeepGEMM one is only
    constructed when `allow_deep_gemm` is requested, the quant config is fp8
    w8a8 and the block shape equals the DeepGEMM contiguous-layout alignment.
    """

    def __init__(
        self,
        max_num_tokens: int,
        num_dispatchers: int,
        quant_config: FusedMoEQuantConfig,
        allow_deep_gemm: bool = False,
    ):
        super().__init__(quant_config)

        self.batched_triton_experts = BatchedTritonExperts(
            max_num_tokens=max_num_tokens,
            num_dispatchers=num_dispatchers,
            quant_config=self.quant_config,
        )

        self.allow_deep_gemm = (
            allow_deep_gemm
            and self.quant_config.use_fp8_w8a8
            and self.block_shape == get_mk_alignment_for_contiguous_layout()
        )

        if self.allow_deep_gemm:
            self.batched_deep_gemm_experts = BatchedDeepGemmExperts(
                max_num_tokens=max_num_tokens,
                num_dispatchers=num_dispatchers,
                quant_config=self.quant_config,
            )
        else:
            self.batched_deep_gemm_experts = None

        assert (
            self.batched_deep_gemm_experts is not None
            or self.batched_triton_experts is not None
        )

    @property
    def activation_formats(
        self,
    ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
        """Activation formats of the wrapped experts (both must agree)."""
        if self.batched_triton_experts is None:
            assert self.batched_deep_gemm_experts is not None
            return self.batched_deep_gemm_experts.activation_formats

        assert (
            self.batched_deep_gemm_experts is None
            or self.batched_deep_gemm_experts.activation_formats
            == self.batched_triton_experts.activation_formats
        )
        return self.batched_triton_experts.activation_formats

    def supports_chunking(self) -> bool:
        """True only if every constructed implementation supports chunking."""
        candidates = (self.batched_deep_gemm_experts, self.batched_triton_experts)
        for experts in candidates:
            if experts is not None and not experts.supports_chunking():
                return False
        return True

    def supports_expert_map(self) -> bool:
        """True only if every constructed implementation supports expert maps."""
        candidates = (self.batched_deep_gemm_experts, self.batched_triton_experts)
        for experts in candidates:
            if experts is not None and not experts.supports_expert_map():
                return False
        return True

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        """
        Return the weight-and-reduce implementation, preferring the DeepGEMM
        one; when both implementations exist they must compare equal.
        """
        deep_gemm = self.batched_deep_gemm_experts
        triton_experts = self.batched_triton_experts

        bdge_war = None
        if deep_gemm:
            bdge_war = deep_gemm.finalize_weight_and_reduce_impl()

        bte_war = None
        if triton_experts:
            bte_war = triton_experts.finalize_weight_and_reduce_impl()

        if bdge_war is not None and bte_war is not None:
            assert bdge_war == bte_war, (
                "Both implementations should agree on WeightAndReduce impls. "
                f"Got bdge_war: {bdge_war}, and bte_war: {bte_war}"
            )

        if bdge_war is not None:
            return bdge_war

        assert bte_war is not None
        return bte_war

    def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype:
        """Workspaces use the activation dtype unchanged."""
        return act_dtype

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_metadata: mk.ExpertTokensMetadata | None,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        """Delegate workspace sizing to the implementation selected for use."""
        # Note: the deep gemm workspaces are strictly larger than the triton
        # workspaces so we can be pessimistic here and allocate for DeepGemm
        # even if we fall back to triton later, e.g. if expert maps are set.
        if self.allow_deep_gemm:
            chosen = self.batched_deep_gemm_experts
        else:
            chosen = self.batched_triton_experts
        assert chosen is not None

        return chosen.workspace_shapes(
            M,
            N,
            K,
            topk,
            global_num_experts,
            local_num_experts,
            expert_tokens_metadata,
        )

    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: str,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ):
        """Forward all arguments unchanged to the selected implementation."""
        if self.allow_deep_gemm:
            chosen = self.batched_deep_gemm_experts
        else:
            chosen = self.batched_triton_experts
        assert chosen is not None

        chosen.apply(
            output,
            hidden_states,
            w1,
            w2,
            topk_weights,
            topk_ids,
            activation,
            global_num_experts,
            expert_map,
            a1q_scale,
            a2_scale,
            workspace13,
            workspace2,
            expert_tokens_meta,
            apply_router_weight_on_input,
        )
import ( + OCP_MX_DTYPES, + OCP_MX_Scheme, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape +from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe +from vllm.utils.import_utils import has_triton_kernels +from vllm.utils.math_utils import cdiv + +logger = init_logger(__name__) + +if has_triton_kernels(): + try: + from triton_kernels.matmul_ogs import PrecisionConfig + except ImportError: + logger.error( + "Failed to import Triton kernels. Please make sure your triton " + "version is compatible." + ) + + +def _get_config_dtype_str( + dtype: torch.dtype, + use_fp8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + ocp_mx_scheme: str | None = None, +) -> str | None: + """ + Return a string used to construct the filename that contains the + tuning info for a particular quantization scheme. See + try_get_optimal_moe_config in fused_moe.py. + """ + if use_fp8_w8a8: + return "fp8_w8a8" + elif use_int8_w8a16: + return "int8_w8a16" + elif use_int4_w4a16: + return "int4_w4a16" + elif ocp_mx_scheme is not None: + # The output of this function is passed to `try_get_optimal_moe_config`, + # and as we only simulate OCP MX execution in fused_moe for now, + # we will NOT look for `*,dtype=w_mxfp4_a_mxfp4.json` for now. + return None + elif dtype == torch.float: + # avoiding cases where kernel fails when float32 MoE + # use fp16/bfloat16 configs + return "float32" + return None + + +def _quant_flags_to_group_shape( + quant_dtype: torch.dtype | str | None, + per_act_token_quant: bool, + per_out_ch_quant: bool, + block_shape: list[int] | None, +) -> tuple[GroupShape | None, GroupShape | None]: + """ + Convert MoE quantization flags into more generic GroupShapes. 
# The type of method in top-K routing
# Please keep this in sync with the counterpart defined in https://github.com/flashinfer-ai/flashinfer/blob/main/include/flashinfer/trtllm/fused_moe/runner.h
class RoutingMethodType(IntEnum):
    """
    Identifier of the top-K routing method.

    Members are plain integers 0..6. They were previously spelled as
    1-tuples and a float and only became ints through IntEnum's value
    coercion; the numeric values are unchanged.
    """

    # Default: Softmax -> TopK
    Default = 0
    # Renormalize: TopK -> Softmax
    Renormalize = 1
    # DeepSeekV3: Sigmoid -> RoutingBiasAdd -> Top2 in group -> Top4 groups
    # -> Top8 experts from the Top4 groups
    DeepSeekV3 = 2
    # Llama4: Top1 -> Sigmoid
    Llama4 = 3
    # RenormalizeNaive: Softmax -> TopK -> Renormalize
    RenormalizeNaive = 4
    # TopK: TopK (no softmax)
    TopK = 5
    # Unspecified
    Unspecified = 6
per-token-per-group) + shape: GroupShape | None = None + + # Quantization scales. + # TODO(bnell): maybe put PrecisionConfigs in subclass of QuantDesc? + scale: Union[torch.Tensor, "PrecisionConfig", None] = None + + # Quantization alphas or gscales, used for nvfp4 types. + # TODO(bnell): put some of these in subclasses + alpha_or_gscale: torch.Tensor | None = None + + # Zero points for int4/int8 types + zp: torch.Tensor | None = None + + # Biases for GPT triton MoE + bias: torch.Tensor | None = None + + +# TODO(bnell): have subclasses for specific moe methods? +# e.g. for specific arguments bias, precision, etc. +@dataclass +class FusedMoEQuantConfig: + """ + The FusedMoEQuantConfig contains all the quantization parameters for + a single FusedMoEMethodBase operation. It consists of four + FusedMoEQuantDescs, one for each activation and set of weights. + + Each FusedMoEMethodBase must implement a get_fused_moe_quant_config + method to construct a FusedMoEQuantConfig for use with that class. + + FusedMoEQuant configs are only used for modular kernels, fused_experts + (from fused_moe.py), cutlass_moe_fp[48], rocm_aiter_fused_experts and + triton_kernel_moe_forward. Other MoE methods can ignore the + FusedMoEQuantConfig (for now) and hardcode it to None. + + There are currently some restrictions on what can be expressed: + - Most MoE ops only support similar quantization strategies for + each parameter, e.g. both weights must have the same GroupShape + and both activations must share the same GroupShape. One exception to + this is the cutlass moe which allows per channel quantization on the + outputs. Note: this restrictions are not always rigorously checked. + - Not all fused MoE functions support all the parameters, e.g. zero points, + global scales, alphas and biases are not universally supported. + - Fully general GroupShapes are not allowed. Activations only support + per token, per tensor or K-blocked. 
+ - Weights are not required to have a GroupShape since they have already + been quantized. + + Other notes: + - PrecisionConfigs are specific to GPT OSS Triton. + - As a follow up it would probably make sense to subclass FusedMoEQuantDesc + or FusedMoEQuantConfig for particular FusedMoEMethodBase subclasses + so that only the required quantization parameters are used/stored. + """ + + # TODO(bnell) make sure a1_scales/a2_scales don't interfere with chunking + _a1: FusedMoEQuantDesc + _a2: FusedMoEQuantDesc + _w1: FusedMoEQuantDesc + _w2: FusedMoEQuantDesc + + def __post_init__(self): + assert not self.per_act_token_quant or self.block_shape is None, ( + "illegal quantization" + ) + + # + # Convenience accessors for various properties. + # + + @property + def quant_dtype(self) -> torch.dtype | str | None: + return self._a1.dtype + + @property + def is_quantized(self) -> bool: + return self.quant_dtype is not None + + @property + def is_per_act_token(self) -> bool: + return self._a1.shape == GroupShape.PER_TOKEN + + @property + def per_act_token_quant(self) -> bool: + return self._a1.shape == GroupShape.PER_TOKEN + + @property + def per_out_ch_quant(self) -> bool: + return self._w1.shape == GroupShape.PER_TOKEN + + @property + def is_per_tensor(self) -> bool: + return self._a1.shape == GroupShape.PER_TENSOR + + @property + def block_shape(self) -> list[int] | None: + if ( + self._a1.shape is not None + and self._a1.shape != GroupShape.PER_TENSOR + and self._a1.shape != GroupShape.PER_TOKEN + ): + return [self._a1.shape.row, self._a1.shape.col] + else: + return None + + @property + def is_block_quantized(self) -> bool: + return self.block_shape is not None + + @property + def a1_scale(self) -> torch.Tensor | None: + assert self._a1.scale is None or isinstance(self._a1.scale, torch.Tensor) + return self._a1.scale + + @property + def a1_gscale(self) -> torch.Tensor | None: + return self._a1.alpha_or_gscale + + @property + def a2_scale(self) -> torch.Tensor | None: + 
assert self._a2.scale is None or isinstance(self._a2.scale, torch.Tensor) + return self._a2.scale + + @property + def a2_gscale(self) -> torch.Tensor | None: + return self._a2.alpha_or_gscale + + @property + def w1_scale(self) -> torch.Tensor | None: + assert self._w1.scale is None or isinstance(self._w1.scale, torch.Tensor) + return self._w1.scale + + @property + def w1_zp(self) -> torch.Tensor | None: + return self._w1.zp + + @property + def w1_bias(self) -> torch.Tensor | None: + return self._w1.bias + + @property + def w1_precision(self) -> Optional["PrecisionConfig"]: + assert self._w1.scale is None or isinstance(self._w1.scale, PrecisionConfig) + return self._w1.scale + + @property + def g1_alphas(self) -> torch.Tensor | None: + return self._w1.alpha_or_gscale + + @property + def w2_scale(self) -> torch.Tensor | None: + assert self._w2.scale is None or isinstance(self._w2.scale, torch.Tensor) + return self._w2.scale + + @property + def w2_zp(self) -> torch.Tensor | None: + return self._w2.zp + + @property + def w2_bias(self) -> torch.Tensor | None: + return self._w2.bias + + @property + def w2_precision(self) -> Optional["PrecisionConfig"]: + assert self._w2.scale is None or isinstance(self._w2.scale, PrecisionConfig) + return self._w2.scale + + @property + def g2_alphas(self) -> torch.Tensor | None: + return self._w2.alpha_or_gscale + + @property + def use_fp8_w8a8(self) -> bool: + return self.quant_dtype == torch.float8_e4m3fn + + @property + def use_int8_w8a8(self) -> bool: + return self.quant_dtype == torch.int8 + + @property + def use_int8_w8a16(self) -> bool: + return self._a1.dtype is None and self._w1.dtype == torch.int8 + + @property + def use_int4_w4a16(self) -> bool: + return self._a1.dtype is None and self._w1.dtype == "int4" + + @property + def ocp_mx_scheme(self) -> str | None: + if not hasattr(self, "_ocp_mx_scheme"): + if (self._a1.dtype is not None and not isinstance(self._a1.dtype, str)) or ( + self._w1.dtype is not None and not 
isinstance(self._w1.dtype, str) + ): + self._ocp_mx_scheme = None + else: + ocp_mx_scheme = OCP_MX_Scheme.from_quant_dtype( + self._a1.dtype, self._w1.dtype + ) + + if ocp_mx_scheme is not None: + ocp_mx_scheme = ocp_mx_scheme.value + + self._ocp_mx_scheme = ocp_mx_scheme + + return self._ocp_mx_scheme + + @property + def use_mxfp4_w4a16(self) -> bool: + return self._a1.dtype is None and self._w1.dtype == "mxfp4" + + @property + def use_nvfp4_w4a4(self) -> bool: + return self.quant_dtype == "nvfp4" + + def config_name(self, dtype: torch.dtype) -> str | None: + """ + Return a string used to construct the filename that contains the + tuning info for a particular quantization scheme. See + try_get_optimal_moe_config in fused_moe.py. + """ + return _get_config_dtype_str( + use_fp8_w8a8=self.use_fp8_w8a8, + use_int8_w8a16=self.use_int8_w8a16, + use_int4_w4a16=self.use_int4_w4a16, + ocp_mx_scheme=self.ocp_mx_scheme, + dtype=dtype, + ) + + def scale_shape( + self, + max_tokens: int, + hidden_dim: int, + ) -> tuple[int, int] | None: + """ + Construct the proper activation scale shape for this + config. + """ + if self.is_quantized: + if self.is_block_quantized: + assert self.block_shape is not None + _, block_k = self.block_shape + k_tiles = cdiv(hidden_dim, block_k) + return (max_tokens, k_tiles) + elif self.is_per_act_token: + return (max_tokens, 1) + else: + return (1, 1) + else: + return None + + def batched_scale_shape( + self, + num_experts: int, + max_tokens: int, + hidden_dim: int, + ) -> tuple[int, int, int] | None: + """ + Construct the proper activation batched scale shape for this + config, e.g. (num experts, *scale_shape). 
+ """ + if self.is_quantized: + scale_shape = self.scale_shape(max_tokens, hidden_dim) + assert scale_shape is not None + return (num_experts, *scale_shape) + else: + return None + + @staticmethod + def make( + quant_dtype: torch.dtype | str | None = None, + per_act_token_quant: bool = False, + per_out_ch_quant: bool = False, + block_shape: list[int] | None = None, + w1_scale: Union[torch.Tensor, "PrecisionConfig", None] = None, + w2_scale: Union[torch.Tensor, "PrecisionConfig", None] = None, + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, + g1_alphas: torch.Tensor | None = None, + g2_alphas: torch.Tensor | None = None, + a1_gscale: torch.Tensor | None = None, + a2_gscale: torch.Tensor | None = None, + w1_bias: torch.Tensor | None = None, + w2_bias: torch.Tensor | None = None, + w1_zp: torch.Tensor | None = None, + w2_zp: torch.Tensor | None = None, + weight_dtype: torch.dtype | str | None = None, + ) -> "FusedMoEQuantConfig": + """ + General builder function for a FusedMoEQuantConfig. + - quant_dtype: Optional quantization type. None if activations are + unquantized or quantized prior to calling. Note: "nvfp4", "mxfp4", + "mxfp6_e3m2", "mxfp6_e2m3" are the only valid string values + for quant_dtype. + - per_act_token_quant: Activations have per token quantization. + - per_out_ch_quant: Outputs have per channel quantization. (only + for cutlass). + - block_shape: Optional block size for block-wise quantization. + Incompatible with per_act_token and per_out_ch quant. + - w1_scale: Optional scale to be used for w1. + - w2_scale: Optional scale to be used for w2. + - a1_scale: Optional scale to be used for a1. + - a2_scale: Optional scale to be used for a2. + - g1_alphas: Optional global quantization scales for w1 (for nvfp4). + - g2_alphas: Optional global quantization scales for w2 (for nvfp4). + - a1_gscale: Optional global quantization scales for a1 (for nvfp4). + - a2_gscale: Optional global quantization scales for a2 (for nvfp4). 
+ - w1_bias: Optional biases for w1 (GPT OSS Triton). + - w2_bias: Optional biases for w2 (GPT OSS Triton). + - w1_zp: Optional w1 zero points for int4/int8 quantization. + - w2_zp: Optional w2 zero points for int4/int8 quantization. + """ + assert not isinstance(quant_dtype, str) or quant_dtype in { + "nvfp4", + "mxfp4", + "mxfp6_e3m2", + "mxfp6_e2m3", + } + assert not isinstance(weight_dtype, str) or weight_dtype in { + "nvfp4", + "mxfp4", + "mxfp6_e3m2", + "mxfp6_e2m3", + } + + if weight_dtype is None: + weight_dtype = quant_dtype + + a_shape, w_shape = _quant_flags_to_group_shape( + quant_dtype, per_act_token_quant, per_out_ch_quant, block_shape + ) + quant_config = FusedMoEQuantConfig( + _a1=FusedMoEQuantDesc(quant_dtype, a_shape, a1_scale, a1_gscale), + _a2=FusedMoEQuantDesc(quant_dtype, a_shape, a2_scale, a2_gscale), + _w1=FusedMoEQuantDesc( + weight_dtype, w_shape, w1_scale, g1_alphas, w1_zp, w1_bias + ), + _w2=FusedMoEQuantDesc( + weight_dtype, w_shape, w2_scale, g2_alphas, w2_zp, w2_bias + ), + ) + assert quant_config.per_act_token_quant == per_act_token_quant + assert quant_config.per_out_ch_quant == per_out_ch_quant + assert quant_config.block_shape == block_shape + return quant_config + + +def fp8_w8a8_moe_quant_config( + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, + per_act_token_quant: bool = False, + per_out_ch_quant: bool = False, + block_shape: list[int] | None = None, + a1_gscale: torch.Tensor | None = None, + a2_gscale: torch.Tensor | None = None, + g1_alphas: torch.Tensor | None = None, + g2_alphas: torch.Tensor | None = None, +) -> FusedMoEQuantConfig: + """ + Construct a quant config for fp8 activations and fp8 weights.
+ """ + return FusedMoEQuantConfig.make( + torch.float8_e4m3fn, + w1_scale=w1_scale, + g1_alphas=g1_alphas, + w2_scale=w2_scale, + g2_alphas=g2_alphas, + a1_scale=a1_scale, + a1_gscale=a1_gscale, + a2_scale=a2_scale, + a2_gscale=a2_gscale, + per_act_token_quant=per_act_token_quant, + per_out_ch_quant=per_out_ch_quant, + block_shape=block_shape, + ) + + +def int8_w8a8_moe_quant_config( + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a1_scale: torch.Tensor | None, + a2_scale: torch.Tensor | None, + per_act_token_quant: bool = False, +) -> FusedMoEQuantConfig: + """ + Construct a quant config for int8 activations and int8 weights. + """ + return FusedMoEQuantConfig.make( + torch.int8, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + per_act_token_quant=per_act_token_quant, + per_out_ch_quant=False, + block_shape=None, + ) + + +def mxfp4_w4a16_moe_quant_config( + w1_scale: Union[torch.Tensor, "PrecisionConfig"], + w2_scale: Union[torch.Tensor, "PrecisionConfig"], + w1_bias: torch.Tensor | None = None, + w2_bias: torch.Tensor | None = None, +) -> FusedMoEQuantConfig: + """ + Construct a quant config for unquantized activations and mxfp4 weights. + """ + return FusedMoEQuantConfig( + _a1=FusedMoEQuantDesc(), + _a2=FusedMoEQuantDesc(), + _w1=FusedMoEQuantDesc("mxfp4", None, w1_scale, None, None, w1_bias), + _w2=FusedMoEQuantDesc("mxfp4", None, w2_scale, None, None, w2_bias), + ) + + +def mxfp4_mxfp8_moe_quant_config( + w1_scale: Union[torch.Tensor, "PrecisionConfig"], + w2_scale: Union[torch.Tensor, "PrecisionConfig"], + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, + w1_bias: torch.Tensor | None = None, + w2_bias: torch.Tensor | None = None, + block_shape: list[int] | None = None, +) -> FusedMoEQuantConfig: + """ + Construct a quant config for mxfp8 activations and mxfp4 weights.
+ """ + return FusedMoEQuantConfig( + _a1=FusedMoEQuantDesc("mxfp8"), + _a2=FusedMoEQuantDesc("mxfp8"), + _w1=FusedMoEQuantDesc("mxfp4", None, w1_scale, None, None, w1_bias), + _w2=FusedMoEQuantDesc("mxfp4", None, w2_scale, None, None, w2_bias), + ) + + +def ocp_mx_moe_quant_config( + quant_dtype: str, + w1_scale: Union[torch.Tensor, "PrecisionConfig"], + w2_scale: Union[torch.Tensor, "PrecisionConfig"], + weight_dtype: str | None = None, + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, + w1_bias: torch.Tensor | None = None, + w2_bias: torch.Tensor | None = None, + block_shape: list[int] | None = None, +) -> FusedMoEQuantConfig: + """ + Construct a quant config for OCP MX activations and OCP MX weights. + """ + assert quant_dtype in OCP_MX_DTYPES + return FusedMoEQuantConfig.make( + quant_dtype=quant_dtype, + weight_dtype=weight_dtype, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + w1_bias=w1_bias, + w2_bias=w2_bias, + per_act_token_quant=False, + per_out_ch_quant=False, + block_shape=block_shape, + ) + + +def nvfp4_moe_quant_config( + g1_alphas: torch.Tensor, + g2_alphas: torch.Tensor, + a1_gscale: torch.Tensor, + a2_gscale: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, +) -> FusedMoEQuantConfig: + """ + Construct a quant config for nvfp4 activations and nvfp4 weights. + """ + return FusedMoEQuantConfig.make( + "nvfp4", + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_gscale=a1_gscale, + a2_gscale=a2_gscale, + g1_alphas=g1_alphas, + g2_alphas=g2_alphas, + per_act_token_quant=False, + per_out_ch_quant=False, + block_shape=None, + ) + + +def int4_w4a16_moe_quant_config( + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + w1_zp: torch.Tensor | None, + w2_zp: torch.Tensor | None, + block_shape: list[int] | None = None, +) -> FusedMoEQuantConfig: + """ + Construct a quant config for 16-bit float activations and int4 weights. + Note: Activations are pre-quantized.
+ """ + group_shape = GroupShape(*block_shape) if block_shape is not None else None + return FusedMoEQuantConfig( + _a1=FusedMoEQuantDesc(shape=group_shape), + _a2=FusedMoEQuantDesc(shape=group_shape), + _w1=FusedMoEQuantDesc("int4", group_shape, w1_scale, None, w1_zp), + _w2=FusedMoEQuantDesc("int4", group_shape, w2_scale, None, w2_zp), + ) + + +def int8_w8a16_moe_quant_config( + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + w1_zp: torch.Tensor | None, + w2_zp: torch.Tensor | None, + block_shape: list[int] | None = None, +) -> FusedMoEQuantConfig: + """ + Construct a quant config for 16-bit float activations and int8 weights. + Note: Activations are pre-quantized. + """ + group_shape = GroupShape(*block_shape) if block_shape is not None else None + return FusedMoEQuantConfig( + _a1=FusedMoEQuantDesc(shape=group_shape), + _a2=FusedMoEQuantDesc(shape=group_shape), + _w1=FusedMoEQuantDesc(torch.int8, group_shape, w1_scale, None, w1_zp), + _w2=FusedMoEQuantDesc(torch.int8, group_shape, w2_scale, None, w2_zp), + ) + + +def biased_moe_quant_config( + w1_bias: torch.Tensor | None, + w2_bias: torch.Tensor | None, +) -> FusedMoEQuantConfig: + """ + Construct a quant config for unquantized activations with biases. + """ + return FusedMoEQuantConfig( + _a1=FusedMoEQuantDesc(), + _a2=FusedMoEQuantDesc(), + _w1=FusedMoEQuantDesc(bias=w1_bias), + _w2=FusedMoEQuantDesc(bias=w2_bias), + ) + + +# A FusedMoEQuantConfig constant for an unquantized MoE op. 
+FUSED_MOE_UNQUANTIZED_CONFIG: FusedMoEQuantConfig = FusedMoEQuantConfig.make() + + +@dataclass +class FusedMoEParallelConfig: + tp_size: int + dp_size: int + ep_size: int + tp_rank: int + dp_rank: int + ep_rank: int + + use_ep: bool # whether to use EP or not + all2all_backend: str # all2all backend for MoE communication + + @property + def use_all2all_kernels(self): + return self.dp_size > 1 and self.use_ep + + @property + def use_pplx_kernels(self): + return self.use_all2all_kernels and self.all2all_backend == "pplx" + + @property + def use_deepep_ht_kernels(self): + return ( + self.use_all2all_kernels + and self.all2all_backend == "deepep_high_throughput" + ) + + @property + def use_deepep_ll_kernels(self): + return self.use_all2all_kernels and self.all2all_backend == "deepep_low_latency" + + @staticmethod + def flatten_tp_across_dp( + tp_size: int, dp_size: int, dp_rank: int + ) -> tuple[int, int]: + tp_rank = 0 if tp_size == 1 else get_tensor_model_parallel_rank() + # There are actually dp_size * tp_size devices. Update tp_size + # and tp_rank so we shard across all devices. + flatten_tp_size = dp_size * tp_size + flatten_tp_rank = dp_rank * tp_size + tp_rank + return flatten_tp_size, flatten_tp_rank + + @staticmethod + def make( + tp_size_: int, dp_size_: int, vllm_parallel_config: ParallelConfig + ) -> "FusedMoEParallelConfig": + """ + Determine MoE parallel configuration. Based on the input `tp_size_`, + `dp_size_` and vllm's parallel config, determine what + levels of parallelism to use in the fused moe layer. + + Args: + tp_size_ (int): `tp_size` passed into the FusedMoE constructor. + dp_size_ (int): `dp_size` passed into the FusedMoE constructor. + vllm_parallel_config (ParallelConfig): vLLM's parallel config + object which contains the `enable_expert_parallel` flag. + + Examples: + When there is no parallelism requested, + i.e. `tp_size_` = `dp_size_` = 1, we simply return the sizes + unaltered and the ranks set to 0.
+ + Expert Parallelism is considered only when either `dp_size_` or + `tp_size_` is non-trivial. + + When TP = 2, DP = 1 and EP = False, the configuration on different + devices: + + - device 0 : TP = {2, 0} DP = {1, 0} EP = {1, 0} // + legend : {size, rank} + - device 1 : TP = {2, 1} DP = {1, 0} EP = {1, 0} + - Comment : Tensors are sharded across 2 devices. + + When TP = 1, DP = 2 and EP = False, the configuration on different + devices: + + - device 0 : TP = {2, 0} DP = {2, 0} EP = {1, 0} + - device 1 : TP = {2, 1} DP = {2, 1} EP = {1, 0} + - Comment: There are 2 engine instances and the tensors are sharded + across 2 devices. + + When TP = 2, DP = 2 and EP = False, the configuration on different + devices: + + - device 0: TP = {4, 0} DP = {2, 0} EP = {1, 0} + - device 1: TP = {4, 1} DP = {2, 0} EP = {1, 0} + - device 2: TP = {4, 2} DP = {2, 1} EP = {1, 0} + - device 3: TP = {4, 3} DP = {2, 1} EP = {1, 0} + - Comment: There are 2 engine instances and the tensors are sharded + across 4 devices. + + When TP = 2, DP = 1 and EP = True, the configuration on different + devices: + + - device 0: TP = {1, 0} DP = {1, 0} EP = {2, 0} + - device 1: TP = {1, 0} DP = {1, 0} EP = {2, 1} + - Comment: The experts are split between the 2 devices. + + When TP = 1, DP = 2 and EP = True, the configuration on different + devices: + + - device 0: TP = {1, 0} DP = {2, 0} EP = {2, 0} + - device 1: TP = {1, 0} DP = {2, 1} EP = {2, 1} + - Comment: There are 2 engine instances and the experts are split + between the 2 devices. + + When TP = 2, DP = 2 and EP = True, the configuration on different + devices: + + - device 0: TP = {1, 0} DP = {2, 0} EP = {4, 0} + - device 1: TP = {1, 0} DP = {2, 0} EP = {4, 1} + - device 2: TP = {1, 0} DP = {2, 1} EP = {4, 2} + - device 3: TP = {1, 0} DP = {2, 1} EP = {4, 3} + - Comment: There are 2 engine instances and the experts are split + between the 4 devices.
+ """ + + use_ep = dp_size_ * tp_size_ > 1 and vllm_parallel_config.enable_expert_parallel + + dp_size = dp_size_ + dp_rank = get_dp_group().rank_in_group if dp_size > 1 else 0 + tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp( + tp_size_, dp_size_, dp_rank + ) + + if not use_ep: + return FusedMoEParallelConfig( + tp_size=tp_size, + tp_rank=tp_rank, + dp_size=dp_size, + dp_rank=dp_rank, + ep_size=1, + ep_rank=0, + use_ep=False, + all2all_backend=vllm_parallel_config.all2all_backend, + ) + # DP + EP / TP + EP / DP + TP + EP + assert use_ep + # In EP, each device owns a set of experts fully. There is no tensor + # parallelism; update tp_size, tp_rank, ep_size and ep_rank to reflect that. + ep_size = tp_size + ep_rank = tp_rank + return FusedMoEParallelConfig( + tp_size=1, + tp_rank=0, + dp_size=dp_size, + dp_rank=dp_rank, + ep_size=ep_size, + ep_rank=ep_rank, + use_ep=True, + all2all_backend=vllm_parallel_config.all2all_backend, + ) + + +# Adapted from pplx-kernels tests/all_to_all_utils.py +@dataclass +class FusedMoEConfig: + num_experts: int + experts_per_token: int + hidden_dim: int + + num_local_experts: int + moe_parallel_config: FusedMoEParallelConfig + + # The activation type.
+ in_dtype: torch.dtype + + max_num_tokens: int = envs.VLLM_MOE_DP_CHUNK_SIZE + + has_bias: bool = False + + is_act_and_mul: bool = True + + is_lora_enabled: bool = False + + def __post_init__(self): + if self.dp_size > 1: + logger.debug_once( + "Using FusedMoEConfig::max_num_tokens=%d", self.max_num_tokens + ) + + assert self.max_num_tokens > 0 + + @property + def tp_size(self): + return self.moe_parallel_config.tp_size + + @property + def dp_size(self): + return self.moe_parallel_config.dp_size + + @property + def ep_size(self): + return self.moe_parallel_config.ep_size + + @property + def tp_rank(self): + return self.moe_parallel_config.tp_rank + + @property + def dp_rank(self): + return self.moe_parallel_config.dp_rank + + @property + def ep_rank(self): + return self.moe_parallel_config.ep_rank + + @property + def use_ep(self): + return self.moe_parallel_config.use_ep + + @property + def use_pplx_kernels(self): + return self.moe_parallel_config.use_pplx_kernels + + @property + def use_deepep_ht_kernels(self): + return self.moe_parallel_config.use_deepep_ht_kernels + + @property + def use_deepep_ll_kernels(self): + return self.moe_parallel_config.use_deepep_ll_kernels + + @property + def use_flashinfer_cutlass_kernels(self): + """ + Whether to use FlashInfer cutlass kernels for NVFP4 MoE. 
+ """ + return ( + envs.VLLM_USE_FLASHINFER_MOE_FP4 + and has_flashinfer_cutlass_fused_moe() + and envs.VLLM_FLASHINFER_MOE_BACKEND == "throughput" + ) diff --git a/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json new file mode 100644 index 0000000..56c1a4e --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000..d3677be --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + 
"num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + 
"num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json new file mode 100644 index 0000000..265768f --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 
+ }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "5120": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "9216": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "13312": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "17408": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "25600": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "33792": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "41984": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + 
"50176": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "58368": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000..d3be23d --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "5120": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "9216": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "13312": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "17408": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "25600": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "33792": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "41984": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "50176": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "58368": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json new file mode 100644 index 0000000..99501df --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } + } \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json new file mode 100644 index 0000000..589f5d3 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, 
+ "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 
64, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "5120": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "9216": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "13312": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "17408": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "25600": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "33792": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "41984": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "50176": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "58368": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json 
b/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json new file mode 100644 index 0000000..2e0dd7a --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 
128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..4ea8634 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + 
"GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "5120": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "9216": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 
16, + "num_warps": 8, + "num_stages": 3 + }, + "13312": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "17408": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "25600": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "33792": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "41984": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "50176": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "58368": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json b/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json new file mode 100644 index 0000000..f3f1a56 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 
4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } 
+} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json new file mode 100644 index 0000000..2003567 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "5120": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "9216": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "13312": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "17408": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "25600": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "33792": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "41984": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "50176": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "58368": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000..e076615 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "5120": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "9216": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "13312": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "17408": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "25600": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "33792": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "41984": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "50176": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "58368": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json new file mode 100644 index 0000000..19046fc --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json new file mode 100644 index 0000000..ee89655 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, 
+ "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 
32, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "5120": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "9216": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "13312": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "17408": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "25600": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "33792": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "41984": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "50176": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "58368": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 
100644 index 0000000..05aed8b --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "5120": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "9216": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "13312": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "17408": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "25600": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "33792": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "41984": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "50176": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "58368": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ 
No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json new file mode 100644 index 0000000..5f9422f --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000..555d173 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + 
"GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + 
"waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json b/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000..e539335 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + 
"3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json new file mode 100644 index 0000000..555d173 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, 
+ "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json new file mode 100644 index 0000000..600bd44 --- /dev/null +++ 
b/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json @@ -0,0 +1,123 @@ +{ + "triton_version": "3.4.0", + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8192": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "16384": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000..86b4912 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, 
+ "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json b/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json new file mode 100644 index 0000000..ea1ce9a --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..ee8a28b --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.5.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json b/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json new file mode 100644 index 0000000..09d3fa5 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.5.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { 
+ "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + } +} diff --git 
a/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000..e1c4cac --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + 
"num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..5de5605 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json b/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json new file 
mode 100644 index 0000000..b506820 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json b/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json new file mode 100644 index 0000000..2221e99 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json b/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json new file mode 100644 index 0000000..74374c5 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000..63de4bf --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,122 @@ +{ + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 
128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "8192": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..c275cec --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, 
+ "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..e505935 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..db1b6e9 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + 
"GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..b34b6e4 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..60ccde1 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json b/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json new file mode 100644 index 0000000..b0139b9 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json b/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json new file mode 100644 index 0000000..ab169a0 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..324ad7b --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json b/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json new file mode 100644 index 0000000..ab6e155 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "16": 
{ + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git 
a/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..249359f --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json new file mode 100644 index 0000000..b962d19 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json 
b/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000..6efcc02 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,114 @@ +{ + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "8192": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16384": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..9942546 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + 
"num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json b/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json new file mode 100755 index 0000000..f5990fc --- /dev/null +++ 
b/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json @@ -0,0 +1,213 @@ +{ + "triton_version": "3.4.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 
16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "768": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + 
"matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..1bbb8aa --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..8fb4947 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + 
"num_warps": 8, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..b4efc9b --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..3559f33 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json b/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json new file mode 100644 index 0000000..03dfc73 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..9c07695 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, 
+ "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git 
a/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json b/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json new file mode 100644 index 0000000..beaac7f --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 
+ }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json b/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json new file mode 100644 index 0000000..d613de3 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json @@ -0,0 +1,82 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000..592b60c --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,82 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..fc6454e --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.5.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, 
+ "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "3072": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json b/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json new file mode 100644 index 0000000..4899764 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.5.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + 
"GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 
8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json b/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json new file mode 100644 index 0000000..ebff99e --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json b/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000..f10e394 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + 
"BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json new file mode 100644 index 0000000..beeb5a6 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json @@ -0,0 +1,147 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + 
"num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} + diff --git a/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json b/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json new file mode 100644 index 0000000..1fa444b --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json b/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json new file mode 100644 index 0000000..0442038 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git 
a/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000..2a626ac --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json b/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json new file mode 100644 index 0000000..371e87f --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json 
b/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json new file mode 100644 index 0000000..9262a74 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, 
+ "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000..d251f9b --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..0ecf814 --- /dev/null +++ 
b/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json new file mode 100644 index 0000000..51ad5b2 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + 
"GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000..ee51191 --- /dev/null +++ 
b/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json new file mode 100644 index 0000000..6d0cdfd --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 
128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json new file mode 100644 index 0000000..68793c7 --- /dev/null +++ 
b/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, 
+ "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "5120": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "9216": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "13312": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "17408": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "25600": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "33792": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "41984": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "50176": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "58368": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git 
a/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000..6129107 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + 
"num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "5120": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "9216": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "13312": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "17408": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "25600": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "33792": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "41984": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "50176": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 
16, + "num_warps": 4, + "num_stages": 3 + }, + "58368": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json new file mode 100644 index 0000000..de8eec3 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } + } \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..80fce79 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + 
"8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 
128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } + } \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000..8b94452 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json b/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json new file mode 100644 index 0000000..48f19df --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 
+ }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end 
of file diff --git a/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000..039a10e --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..3793fca --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "24": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json 
b/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json new file mode 100644 index 0000000..51d03d8 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json new file mode 100644 index 0000000..54d3bf1 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json 
b/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json new file mode 100644 index 0000000..6a40181 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json b/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json new file mode 100644 index 0000000..4f500d4 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json 
b/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000..cd0cdbe --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,130 @@ +{ + "3328": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 4 + }, + "768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 4 + }, + "1792": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2560": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "2816": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3584": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 4 + 
}, + "3840": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1280": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "2304": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json new file mode 100644 index 0000000..64be6e6 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + 
"48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000..0a6a6a7 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,218 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 
128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "5120": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "9216": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "13312": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "17408": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "25600": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "33792": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "41984": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "50176": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "58368": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json 
b/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json new file mode 100644 index 0000000..ed8afa6 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000..ba9041d --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,130 @@ +{ + "3840": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 4 + }, + "1792": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 4 + }, + "3584": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + 
"2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "2816": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1280": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "768": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 4 + }, + "3328": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "2560": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 4 + }, + "2304": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json new file mode 100644 index 0000000..7a7508a --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json @@ -0,0 
+1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000..dbf9a2d --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json new file mode 100644 index 0000000..5fea55a --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json new file mode 100644 index 0000000..1e3f46e --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + 
"GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000..5705545 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,130 @@ +{ + "2048": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 
32, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 4 + }, + "1792": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 4 + }, + "3328": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2560": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 4 + }, + "768": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "2816": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2304": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2 + }, + "1280": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3840": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3584": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json b/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000..38034fe --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,201 @@ +{ + "triton_version": "3.4.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 
0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "3072": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 
32, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json new file mode 100644 index 0000000..eb4d11c --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + 
"waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json new file mode 100644 index 0000000..0611620 --- /dev/null +++ 
b/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json b/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json new file mode 100644 index 0000000..f2ed716 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, 
+ "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json b/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json new file mode 100644 index 0000000..52f2a82 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + 
"2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000..8239492 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json new file mode 
100644 index 0000000..c2f79b9 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json new file mode 100644 index 0000000..c1ca100 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..bdbaf38 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + 
}, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..6e17bcd --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { 
+ "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..aa7610c --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { 
+ "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..df920e8 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + 
}, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 
64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..e8fe8ea --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 
32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..0baf13c --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..c799871 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json new file mode 100644 index 0000000..43c249d --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 
0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + 
"num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..43c249d --- /dev/null +++ 
b/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "96": { 
+ "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..4dd00d1 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + 
"num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json b/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json new file mode 100644 index 0000000..48f9697 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git 
a/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..a8c0571 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json b/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json new file mode 100644 index 0000000..f1244c6 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + 
"num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + 
"num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..2e692a1 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..857d11e --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..a2ee05d --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..63e1187 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + 
"GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + 
"waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..e676960 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + 
"kpack": 1 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + 
"waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..e676960 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { 
+ "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + 
"matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..fc573cd --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + 
"num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git 
a/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..3e0ad0d --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..c6d7e96 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + 
"16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at 
end of file diff --git a/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..9264ca1 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..6fcf408 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + 
"num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 
4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..c6eabea --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 
8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..381eb5d --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.5.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..e676960 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 
16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + 
"num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..cc85394 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json new file mode 100644 index 0000000..21f6022 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json b/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json new file mode 100644 index 0000000..8ed3ad3 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.4.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, 
+ "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..bf97f67 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.4.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..24f13cd --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.4.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 
128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..b4e736b --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..bb71005 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..ac53df1 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 
128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..f1ed617 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, 
+ "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..e72282d --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json new file mode 100644 index 0000000..7ffa2ac --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.4.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + 
"num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..4fc4868 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..d70adca --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..0f5867f --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + 
"num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + 
"num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000..c7df36e --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.4.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json b/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json new file mode 100644 index 0000000..d104aa5 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { 
+ "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json 
b/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json new file mode 100644 index 0000000..22e3d09 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json @@ -0,0 +1,147 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + } +} + diff --git a/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..94408e2 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json b/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json new file mode 100644 index 0000000..9f4c3cb --- /dev/null +++ 
b/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json b/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json new file mode 100644 index 0000000..20146f5 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json b/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json new file mode 100644 index 0000000..d014025 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 
32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json new file mode 100644 index 0000000..8bac7af --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..b0bf1bf --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.4.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, 
+ "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..cc1427c --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 
16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json b/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json new file mode 100644 index 0000000..6864939 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json b/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json new file mode 100644 index 0000000..2f0b450 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 
+ }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json b/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json new file mode 100644 index 0000000..5d69efe --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json 
b/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json new file mode 100644 index 0000000..5910027 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..564ff49 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 
8, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json b/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json new file mode 100644 index 0000000..a68c831 --- /dev/null +++ 
b/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json b/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json new file mode 100644 index 0000000..e55df46 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000..6825378 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.4.0", + "1": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + 
"num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json b/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json new file mode 100644 index 0000000..a0855a9 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + 
"96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json b/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json new file mode 100644 index 0000000..5dd1a8e --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 
4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json b/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json new file mode 100644 index 0000000..d5b6d02 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + 
"256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json b/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000..d09508b --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "8": { + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 32, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 32, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + 
"kpack": 1 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json b/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000..746463a --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + 
"matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + 
"num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "3072": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json b/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000..bbdb9ad --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2048": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "3072": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json b/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000..43584b1 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 
2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json b/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000..40d86ff --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 
16, + "kpack": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + 
"waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json b/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000..6014d82 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + 
"num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..147a836 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json b/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000..3622659 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + 
"waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, 
+ "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..a01e9c3 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 
32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000..8cc6c64 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 
5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json new file mode 100644 index 0000000..39a9912 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000..05b5463 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + 
"num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + 
"num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..d4c9ddd --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000..c17a4ec --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git 
a/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json b/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json new file mode 100644 index 0000000..170ae7f --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json b/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json new file mode 100644 index 0000000..9952f80 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.4.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 
16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json 
b/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json new file mode 100644 index 0000000..298a361 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000..1d9d352 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json 
b/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000..9ad5b31 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json b/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json new file mode 100644 index 0000000..2883dfd --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + 
"num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json new file mode 100644 index 
0000000..0e210cb --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json b/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json new file mode 100644 index 0000000..e4fa1e2 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + 
"num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000..8abfd84 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..2fc18a5 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, 
+ "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000..be8d4a7 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 
4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json b/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json new file mode 100644 index 0000000..71fdd88 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + 
"num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json new file mode 100644 index 0000000..082456d --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, 
+ "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json b/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json new file mode 100644 index 0000000..c3b2e7f --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 
16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000..b2799ed --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + 
"num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json new file mode 100644 index 0000000..c02de2f --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json new file mode 100644 index 0000000..3e0bc75 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git 
a/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000..9f7ed67 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, 
+ "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..b8d3be2 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
32, + "num_warps": 8, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json 
b/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000..21b7255 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json b/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json new file mode 100644 index 0000000..eaf32f6 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + 
"num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..2c897db --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.4.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + 
"num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json new file mode 100644 index 0000000..bba1d21 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 
16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json b/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json new file mode 100644 
index 0000000..de1c413 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json b/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json new file mode 100644 index 0000000..5a9910a --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json b/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json new file mode 100644 index 0000000..fd675df --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json @@ -0,0 +1,82 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000..e410671 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,82 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, 
+ "num_warps": 8, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json b/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000..311d2e8 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "3072": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json b/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000..91c4b91 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + 
"4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + 
"matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..a7cfd17 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 
1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json b/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000..8fee30e --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, 
+ "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..3caae02 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + 
"num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000..b6f1d01 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, 
+ "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json b/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000..4bf7753 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + 
"waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 32, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json new file mode 100644 index 0000000..f245285 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + 
"GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json 
b/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json new file mode 100644 index 0000000..3918c93 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + 
"num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 32, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git 
a/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000..3f3ccda --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,138 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 
128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000..841044a --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, 
+ "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json b/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json new file mode 100644 index 0000000..59be497 
--- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 
256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000..0e5fd1e --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + 
"num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 
+ } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json b/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000..d6ad635 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, 
+ "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + 
"waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json new file mode 100644 index 0000000..16e0a91 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + 
"GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json b/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json new file mode 100644 index 0000000..d766fc0 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 
+ }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000..8323f51 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + 
"waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json b/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000..1b46cb5 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + 
"kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json new file mode 100644 index 0000000..6d5b1ae --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + 
}, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json b/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json new file mode 100644 index 0000000..ffc1b23 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "48": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + 
"matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json b/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json new file mode 100644 index 0000000..f4c0f84 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000..5c8185c --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..97c9f44 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 
128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000..e4110a5 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git 
a/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json b/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json new file mode 100644 index 0000000..0883ef4 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + 
"512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000..81bb765 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, 
+ "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json b/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000..811c77a --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + 
"48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 32, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + 
"waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json new file mode 100644 index 0000000..2758e48 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json b/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json new file mode 100644 index 0000000..fc31215 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + 
}, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 32, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 32, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000..0bb423b --- /dev/null +++ 
b/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000..5557187 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..26bcbf2 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + 
"1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..d677d69 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,154 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000..1a0aa33 --- /dev/null +++ 
b/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 
256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json b/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json new file mode 100644 index 0000000..9952be6 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000..379ca10 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json 
@@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000..5a3f415 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + 
"num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 32, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 
128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json new file mode 100644 index 0000000..6cb80f4 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json new file mode 100644 index 0000000..de9d0ab --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + 
"num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 32, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json new file mode 100644 index 0000000..b41f9d4 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, 
+ "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000..edf2a38 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 
16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json new file mode 100644 index 0000000..32bbadb --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + 
}, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000..673bae2 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 
+ }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..b2100ce --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000..e6f753c --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of 
file diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json new file mode 100644 index 0000000..53f3394 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, 
+ "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json new file mode 100644 index 0000000..d720deb --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json @@ -0,0 +1,173 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 7 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 128, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_ctas": 
1, + "num_stages": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 2, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2 + }, + "192": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 128, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 8 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2 + }, + "6144": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + 
"BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 2 + }, + "8192": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 16, + "num_ctas": 1, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000..48bb5f2 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json b/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000..a64d06c --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + 
"matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 32, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json new file mode 100644 index 0000000..2c49f35 --- /dev/null +++ 
b/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 
8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json b/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json new file mode 100644 index 0000000..c7db6c0 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + 
}, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + 
"matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000..dbc6247 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + 
"num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} 
diff --git a/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000..cc614e6 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..32c0c9d --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + 
"num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json 
b/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000..4dd475c --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json b/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json new file mode 100644 index 0000000..2ed15f3 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + 
"num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file 
mode 100644 index 0000000..bd2c6fb --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json b/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000..8d7b780 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + 
"waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json new file mode 100644 index 0000000..7a07bbf --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + 
"2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json b/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json new file mode 100644 index 0000000..3a3268c --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + 
"num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 32, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json b/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000..f578c8d --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000..918f683 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + 
"num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + 
"2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json b/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000..e341a67 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000..eb81726 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 
5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": 
{ + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json b/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json new file mode 100644 index 0000000..0c7062a --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000..cd4fb8f --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json b/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000..cf66868 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + 
"2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json new file mode 100644 index 0000000..c27ca0a --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git 
a/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json b/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json new file mode 100644 index 0000000..da477b1 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, 
+ "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 0000000..34b916e --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + 
}, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000..96cbc11 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git 
a/model_executor/layers/fused_moe/configs/README b/model_executor/layers/fused_moe/configs/README new file mode 100644 index 0000000..85970e2 --- /dev/null +++ b/model_executor/layers/fused_moe/configs/README @@ -0,0 +1,12 @@ +This directory contains tuned configurations for different settings of the fused_moe kernel. +For different settings of +- E (number of experts) +- N (intermediate size) +- device_name (torch.cuda.get_device_name()) +the JSON file contains a mapping from M (batch size) to the chosen configuration. + +The example configurations provided are for the Mixtral model for TP2 on H100 +and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have +N = 7168 and for TP4 we have N = 3584. + +See `benchmark/kernels/benchmark_moe.py` on how to generate these config files. diff --git a/model_executor/layers/fused_moe/cpu_fused_moe.py b/model_executor/layers/fused_moe/cpu_fused_moe.py new file mode 100644 index 0000000..23ace34 --- /dev/null +++ b/model_executor/layers/fused_moe/cpu_fused_moe.py @@ -0,0 +1,354 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable + +import torch +from torch.nn import functional as F + +from vllm import _custom_ops as ops +from vllm import envs + + +def silu_and_mul(x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + return F.silu(x[..., :d]) * x[..., d:] + + +def swigluoai_and_mul( + x: torch.Tensor, alpha: float = 1.702, limit: float = 7.0 +) -> torch.Tensor: + d = x.shape[-1] // 2 + gate, up = x[..., :d], x[..., d:] + gate = gate.clamp(max=limit) + up = up.clamp(min=-limit, max=limit) + glu = gate * torch.sigmoid(alpha * gate) + return (up + 1) * glu + + +def grouped_topk( + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + num_expert_group: int = 0, + topk_group: int = 0, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + 
e_score_correction_bias: torch.Tensor | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch" + + gating_output = gating_output.float() + if scoring_func == "softmax": + scores = torch.softmax(gating_output, dim=-1) + elif scoring_func == "sigmoid": + scores = gating_output.sigmoid() + else: + raise ValueError(f"Unsupported scoring function: {scoring_func}") + + num_token = scores.shape[0] + if e_score_correction_bias is not None: + original_scores = scores + scores = scores + e_score_correction_bias.unsqueeze(0) + group_scores = ( + scores.view(num_token, num_expert_group, -1).topk(2, dim=-1)[0].sum(dim=-1) + ) + else: + group_scores = ( + scores.view(num_token, num_expert_group, -1).max(dim=-1).values + ) # [n, n_group] + group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[ + 1 + ] # [n, top_k_group] + group_mask = torch.zeros_like(group_scores) # [n, n_group] + group_mask.scatter_(1, group_idx, 1) # [n, n_group] + score_mask = ( + group_mask.unsqueeze(-1) + .expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group) + .reshape(num_token, -1) + ) # [n, e] + tmp_scores = scores.masked_fill(~score_mask.bool(), float("-inf")) # [n, e] + + if e_score_correction_bias is not None: + topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)[1] + topk_weights = original_scores.gather(1, topk_ids) + else: + topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False) + + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + + if routed_scaling_factor != 1.0: + topk_weights = topk_weights * routed_scaling_factor + return topk_weights, topk_ids.to(torch.int32) + + +def select_experts( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + use_grouped_topk: bool, + renormalize: bool, + topk_group: int | None = None, + num_expert_group: int | None = None, + 
custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + if use_grouped_topk: + assert topk_group is not None + assert num_expert_group is not None + return grouped_topk( + hidden_states=hidden_states, + gating_output=router_logits, + topk=top_k, + renormalize=renormalize, + num_expert_group=num_expert_group, + topk_group=topk_group, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + ) + elif custom_routing_function is None: + assert scoring_func == "softmax" + topk_logit_vals, topk_idx = torch.topk( + router_logits, k=top_k, dim=-1, sorted=False + ) + if renormalize: + topk_vals = torch.softmax(topk_logit_vals, dim=-1) + else: + logZ = torch.logsumexp(router_logits, dim=-1, keepdim=True) + topk_vals = (topk_logit_vals - logZ).exp() + return topk_vals.to(torch.float32), topk_idx.to(torch.int32) + else: + return custom_routing_function( + hidden_states=hidden_states, + gating_output=router_logits, + topk=top_k, + renormalize=renormalize, + ) + + +class IPEXFusedMOE: + def __init__(self, layer: torch.nn.Module) -> None: + import intel_extension_for_pytorch as ipex + + layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE( + layer.w13_weight, + layer.w2_weight, + use_prepack=envs.VLLM_CPU_MOE_PREPACK, + ) + + def __call__( + self, + layer: torch.nn.Module, + x: torch.Tensor, + use_grouped_topk: bool, + top_k: int, + router_logits: torch.Tensor, + renormalize: bool, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + 
activation: str = "silu", + ) -> torch.Tensor: + assert activation == "silu", f"{activation} is not supported." + assert not apply_router_weight_on_input + assert routed_scaling_factor == 1.0, ( + f"routed_scaling_factor {routed_scaling_factor} is not supported." + ) + return layer.ipex_fusion( + x, + use_grouped_topk, + top_k, + router_logits, + renormalize, + topk_group, + num_expert_group, + custom_routing_function, + scoring_func, + e_score_correction_bias, + ) + + +class SGLFusedMOE: + def __init__(self, layer: torch.nn.Module) -> None: + pass + + def __call__( + self, + layer: torch.nn.Module, + x: torch.Tensor, + use_grouped_topk: bool, + top_k: int, + router_logits: torch.Tensor, + renormalize: bool, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + ) -> torch.Tensor: + assert activation == "silu", f"{activation} is not supported." 
+ assert not apply_router_weight_on_input + topk_weights, topk_ids = select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + ) + + torch.ops._C.fused_experts_cpu( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights, + topk_ids, + True, + False, + False, + None, + None, + None, + None, + None, + True, + ) + return x + + +class CPUFusedMOE: + def __init__(self, layer: torch.nn.Module) -> None: + use_onednn_mm = ops._supports_onednn and ops.is_onednn_acl_supported() + + num_experts = layer.w13_weight.size(0) + has_w13_bias = hasattr(layer, "w13_bias") + has_w2_bias = hasattr(layer, "w2_bias") + + layer.gate_up_linear = [] + layer.down_linear = [] + + for i in range(num_experts): + layer_w13_weight = layer.w13_weight[i] + layer_w13_bias = layer.w13_bias[i] if has_w13_bias else None + layer_w2_weight = layer.w2_weight[i] + layer_w2_bias = layer.w2_bias[i] if has_w2_bias else None + if use_onednn_mm: + gate_up_handle = ops.create_onednn_mm(layer_w13_weight.t(), 32) + layer.gate_up_linear.append( + lambda x, handle=gate_up_handle, bias=layer_w13_bias: ops.onednn_mm( + handle, x, bias + ) + ) + down_handle = ops.create_onednn_mm(layer_w2_weight.t(), 32) + layer.down_linear.append( + lambda x, handle=down_handle, bias=layer_w2_bias: ops.onednn_mm( + handle, x, bias + ) + ) + else: + layer.gate_up_linear.append( + lambda x, w=layer_w13_weight, b=layer_w13_bias: F.linear(x, w, b) + ) + layer.down_linear.append( + lambda x, w=layer_w2_weight, b=layer_w2_bias: F.linear(x, w, b) + ) + if use_onednn_mm: # remove weight + layer.w13_weight = torch.nn.Parameter(torch.empty(0), requires_grad=False) + layer.w2_weight = torch.nn.Parameter(torch.empty(0), 
requires_grad=False) + + def __call__( + self, + layer: torch.nn.Module, + x: torch.Tensor, + use_grouped_topk: bool, + top_k: int, + router_logits: torch.Tensor, + renormalize: bool, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + ) -> torch.Tensor: + assert activation in {"silu", "swigluoai"}, f"{activation} is not supported." + assert not apply_router_weight_on_input + topk_weights, topk_ids = select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + ) + + # Ref code from https://github.com/sgl-project/sglang/blob/716e682721397df103f347d22da8bd46c6016dab/python/sglang/srt/layers/moe/fused_moe_native.py#L53 + len_experts = global_num_experts + + cnts = topk_ids.new_zeros((topk_ids.shape[0], len_experts)) + cnts.scatter_(1, topk_ids.to(torch.int64), 1) + tokens_per_expert = cnts.sum(dim=0) + idxs = topk_ids.view(-1).argsort() + + sorted_tokens = x[idxs // topk_ids.shape[1]] + tokens_per_expert = tokens_per_expert.cpu().numpy() + + outputs = [] + start_idx = 0 + + for i, num_tokens in enumerate(tokens_per_expert): + end_idx = start_idx + num_tokens + if num_tokens == 0: + continue + tokens_for_this_expert = sorted_tokens[start_idx:end_idx] + + gate_up = layer.gate_up_linear[i](tokens_for_this_expert) + if activation == "swigluoai": + gate_up = swigluoai_and_mul(gate_up) + else: + gate_up = 
silu_and_mul(gate_up) + expert_out = layer.down_linear[i](gate_up) + outputs.append(expert_out) + start_idx = end_idx + + outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0) + new_x = torch.empty_like(outs) + + new_x[idxs] = outs + final_out = ( + new_x.view(*topk_ids.shape, -1) + .type(topk_weights.dtype) + .mul_(topk_weights.unsqueeze(dim=-1)) + .sum(dim=1) + .type(new_x.dtype) + ) + return final_out diff --git a/model_executor/layers/fused_moe/cutlass_moe.py b/model_executor/layers/fused_moe/cutlass_moe.py new file mode 100644 index 0000000..6753a19 --- /dev/null +++ b/model_executor/layers/fused_moe/cutlass_moe.py @@ -0,0 +1,1052 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""CUTLASS based Fused MoE kernels.""" + +from collections.abc import Callable + +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm import _custom_ops as ops +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig +from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( + moe_permute, + moe_unpermute, +) +from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP, +) +from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( + TopKWeightAndReduceDelegate, + TopKWeightAndReduceNoOP, +) +from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize, _resize_cache +from vllm.scalar_type import scalar_types + +logger = init_logger(__name__) + + +def run_cutlass_moe_fp8( + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_ids: torch.Tensor, + activation_callable: Callable, + global_num_experts: int, + expert_map: torch.Tensor | None, + w1_scale: torch.Tensor | None, + w2_scale: torch.Tensor | None, + a1q_scale: torch.Tensor | None, + a2_scale: torch.Tensor | None, + ab_strides1: 
torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_num_tokens: torch.Tensor | None, + out_dtype: torch.dtype, + per_act_token: bool, + per_out_ch: bool, + use_batched_format: bool, + topk_weights: torch.Tensor | None, +): + a1q = hidden_states + + assert w1_scale is not None + assert w2_scale is not None + assert w1.dtype == torch.float8_e4m3fn + assert w2.dtype == torch.float8_e4m3fn + assert a1q.size(-1) == w1.size(2), "Hidden size mismatch w1" + assert w1.size(1) == w2.size(2) * 2, "Hidden size mismatch w2" + assert ( + w1_scale.dim() == 1 or w1_scale.size(1) == 1 or w1_scale.shape[1] == w1.size(1) + ), "W1 scale shape mismatch" + assert ( + w2_scale.dim() == 1 or w2_scale.size(1) == 1 or w2_scale.shape[1] == w2.size(1) + ), "W2 scale shape mismatch" + assert w1.size(0) == w2.size(0), "Expert number mismatch" + assert ( + a1q_scale is None + or a1q_scale.dim() == 0 + or a1q_scale.size(0) == 1 + or a1q_scale.size(0) == a1q.shape[0] + ), "Input scale shape mismatch" + assert w1.size(0) == w2.size(0), "Weights expert number mismatch" + assert w1.size(0) == w1_scale.size(0), "w1 scales expert number mismatch" + assert w1.size(0) == w2_scale.size(0), "w2 scales expert number mismatch" + assert ( + a2_scale is None + or a2_scale.dim() == 0 + or a2_scale.size(0) == 1 + or a2_scale.size(0) == a1q.shape[0] + ), "Intermediate scale shape mismatch" + assert out_dtype in [torch.half, torch.bfloat16], "Invalid output dtype" + if expert_map is not None: + assert expert_num_tokens is None + + # We have two modes: batched experts and non-batched experts. + # In the non-batched mode, the input tokens are not padded: thus, the shape + # of the input is [total_num_tokens, hidden_size]. The input and output + # require shuffling by a_map and c_map such that the tokens assigned to + # each expert are contiguous. 
+ # In the batched mode, the input tokens are padded per expert to ensure that + # the batched dispatch and combine functions work correctly: thus, the shape + # of the input is [num_experts, max_num_tokens_per_expert, hidden_size]. + # The batched input and output require no shuffling by a_map and c_map since + # their tokens are already contiguous for each expert as a result of + # the dispatch function. + + M = a1q.size(0) # non batched expert M + padded_M = a1q.size(1) # batched expert M + _, K, N = w2.shape + device = a1q.device + + assert w1.size(2) == K + assert global_num_experts != -1 + assert a1q_scale is not None + + if expert_map is not None: + "Translate info from expert_map to topk_ids" + local_topk_ids = torch.where( + expert_map[topk_ids] != -1, expert_map[topk_ids], -1 + ) + else: + local_topk_ids = topk_ids + + topk = local_topk_ids.size(1) + local_E = w1.size(0) + + if use_batched_format: + mm1_out = _resize_cache(workspace13, (local_E * padded_M, N * 2)) + act_out = _resize_cache(workspace2, (local_E * padded_M, N)) + quant_out = _resize_cache( + workspace13.view(dtype=torch.float8_e4m3fn), (local_E * padded_M, N) + ) + mm2_out = _resize_cache(workspace2, (local_E * padded_M, K)) + else: + a1q_perm = _resize_cache( + workspace2.view(dtype=torch.float8_e4m3fn), (M * topk, K) + ) + mm1_out = _resize_cache(workspace13, (M * topk, N * 2)) + act_out = _resize_cache(workspace2, (M * topk, N)) + # original workspace are based on input hidden_states dtype (bf16) + quant_out = _resize_cache( + workspace13.view(dtype=torch.float8_e4m3fn), (M * topk, N) + ) + mm2_out = _resize_cache(workspace2, (M * topk, K)) + + if use_batched_format: + assert expert_num_tokens is not None + + expert_offsets = torch.empty((local_E), dtype=torch.int32, device=device) + problem_sizes1 = torch.empty((local_E, 3), dtype=torch.int32, device=device) + problem_sizes2 = torch.empty((local_E, 3), dtype=torch.int32, device=device) + + ops.get_cutlass_pplx_moe_mm_data( + 
expert_offsets, + problem_sizes1, + problem_sizes2, + expert_num_tokens, + local_E, + padded_M, + N, + K, + ) + + w1_scale = w1_scale.reshape(w1_scale.size(0), -1) + w2_scale = w2_scale.reshape(w2_scale.size(0), -1) + a1q = a1q.reshape(-1, a1q.size(2)) + a1q_scale = a1q_scale.reshape(-1, a1q_scale.size(2)).contiguous() + # c3x get_group_gemm_starts expects int64 to avoid overflow + # during offset calculations + expert_offsets = expert_offsets.to(torch.int64) + else: + problem_sizes1 = torch.empty( + (global_num_experts, 3), dtype=torch.int32, device=device + ) + problem_sizes2 = torch.empty( + (global_num_experts, 3), dtype=torch.int32, device=device + ) + + num_expert = global_num_experts if expert_map is None else expert_map.size(0) + # permuted a1q reuses workspace2 + a1q, a1q_scale, expert_offsets, inv_perm, _ = moe_permute( + a1q, + a1q_scale, + topk_ids, + num_expert, + local_E, + expert_map, + permuted_hidden_states=a1q_perm, + ) + expert_offsets = expert_offsets[:-1] + + ops.get_cutlass_moe_mm_problem_sizes( + local_topk_ids, problem_sizes1, problem_sizes2, global_num_experts, N, K + ) + + if not per_act_token and (expert_map is not None or use_batched_format): + # this is necessary to avoid imprecise scale calculation caused by + # random data in the unused workspace. The workspace is unused when + # this rank handles only partial tokens, or when it is batched . 
+ mm1_out.fill_(0) + + ops.cutlass_moe_mm( + mm1_out, + a1q, + w1, + a1q_scale, + w1_scale, + expert_offsets, + problem_sizes1, + ab_strides1, + ab_strides1, + c_strides1, + per_act_token, + per_out_ch, + ) + + activation_callable(act_out, mm1_out) + + a2q, a2q_scale = ops.scaled_fp8_quant( + act_out, a2_scale, use_per_token_if_dynamic=per_act_token, output=quant_out + ) + + if expert_map is not None: + mm2_out.fill_(0) + + ops.cutlass_moe_mm( + mm2_out, + a2q, + w2, + a2q_scale, + w2_scale, + expert_offsets, + problem_sizes2, + ab_strides2, + ab_strides2, + c_strides2, + per_act_token, + per_out_ch, + ) + + if use_batched_format: + output.copy_(mm2_out.reshape(local_E, padded_M, K), non_blocking=True) + else: + # for non-chunking mode the output is resized from workspace13 + # so we need to make sure mm2_out uses workspace2. + moe_unpermute( + out=output, + permuted_hidden_states=mm2_out, + topk_weights=topk_weights, + inv_permuted_idx=inv_perm, + ) + + +class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): + def __init__( + self, + out_dtype: torch.dtype | None, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, + quant_config: FusedMoEQuantConfig, + ): + assert quant_config.use_fp8_w8a8 + super().__init__(quant_config) + self.out_dtype = out_dtype + self.ab_strides1 = ab_strides1 + self.ab_strides2 = ab_strides2 + self.c_strides1 = c_strides1 + self.c_strides2 = c_strides2 + + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + # Let PrepareAndFinalize::finalize() decide the impl. 
+ return TopKWeightAndReduceDelegate() + + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: torch.Tensor | None, + a1q_scale: torch.Tensor | None, + a2_scale: torch.Tensor | None, + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + apply_router_weight_on_input: bool, + ): + assert self.w1_zp is None, "w1_zp is not supported in CUTLASS MoE" + assert self.w2_zp is None, "w2_zp is not supported in CUTLASS MoE" + + expert_num_tokens = None + if expert_tokens_meta is not None: + expert_num_tokens = expert_tokens_meta.expert_num_tokens + + activation_callable = lambda o, i: self.activation(activation, o, i) + + use_batched_format = ( + self.activation_formats[0] == mk.FusedMoEActivationFormat.BatchedExperts + ) + + in_dtype = hidden_states.dtype + run_cutlass_moe_fp8( + output, + hidden_states, + w1, + w2, + topk_ids, + activation_callable, + global_num_experts, + expert_map, + self.w1_scale, + self.w2_scale, + a1q_scale, + a2_scale, + self.ab_strides1, + self.ab_strides2, + self.c_strides1, + self.c_strides2, + workspace13, + workspace2, + expert_num_tokens, + self.out_dtype if self.out_dtype is not None else in_dtype, + self.per_act_token_quant, + self.per_out_ch_quant, + use_batched_format, + topk_weights, + ) + + +class CutlassExpertsFp8(CutlassExpertsFp8Base): + def __init__( + self, + out_dtype: torch.dtype | None, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, + quant_config: FusedMoEQuantConfig, + ): + super().__init__( + out_dtype, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, + quant_config, + ) + + @property + def activation_formats( + self, + ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: + return ( + 
mk.FusedMoEActivationFormat.Standard, + mk.FusedMoEActivationFormat.Standard, + ) + + def supports_chunking(self) -> bool: + return True + + def supports_expert_map(self) -> bool: + return True + + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + # topk weights and reduction are fused in moe_unpermute cuda kernel + return TopKWeightAndReduceNoOP() + + def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype: + return self.out_dtype if self.out_dtype is not None else act_dtype + + def workspace_shapes( + self, + M: int, + N: int, + K: int, + topk: int, + global_num_experts: int, + local_num_experts: int, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: + workspace1 = (M * topk, max(N, K)) + workspace2 = (M * topk, max(N // 2, K)) + output = (M, K) + return (workspace1, workspace2, output) + + +class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base): + def __init__( + self, + max_experts_per_worker: int, + num_dispatchers: int, + out_dtype: torch.dtype | None, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, + quant_config: FusedMoEQuantConfig, + ): + super().__init__( + out_dtype, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, + quant_config, + ) + assert max_experts_per_worker > 0 + self.max_experts_per_worker = max_experts_per_worker + self.num_dispatchers = num_dispatchers + + @property + def activation_formats( + self, + ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: + return ( + mk.FusedMoEActivationFormat.BatchedExperts, + mk.FusedMoEActivationFormat.BatchedExperts, + ) + + def supports_chunking(self) -> bool: + return False + + def supports_expert_map(self) -> bool: + return False + + def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype: + return self.out_dtype if self.out_dtype is not None else act_dtype + + def workspace_shapes( + self, + M: int, + N: 
int, + K: int, + topk: int, + global_num_experts: int, + local_num_experts: int, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: + num_dp = self.num_dispatchers + assert num_dp is not None + workspace1 = (self.max_experts_per_worker, M * num_dp, max(N, K)) + workspace2 = (self.max_experts_per_worker, M * num_dp, max(N // 2, K)) + output = (self.max_experts_per_worker, M, K) + return (workspace1, workspace2, output) + + +def cutlass_moe_fp8( + a: torch.Tensor, + w1_q: torch.Tensor, + w2_q: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, + quant_config: FusedMoEQuantConfig, + activation: str = "silu", + expert_map: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + global_num_experts: int = -1, +) -> torch.Tensor: + """ + This function computes a a8w8-quantized Mixture of Experts (MoE) layer + using two sets of quantized weights, w1_q and w2_q, and top-k gating + mechanism. The matrix multiplications are implemented with CUTLASS + grouped gemm. + + Parameters: + - a (torch.Tensor): The input tensor to the MoE layer. + Shape: [M, K] + - w1_q (torch.Tensor): The first set of fp8-quantized expert weights. + Shape: [num_experts, K, 2N] (the weights are passed transposed) + - w2_q (torch.Tensor): The second set of fp8-quantized expert weights. + Shape: [num_experts, N, K] (the weights are passed transposed) + - topk_weights (torch.Tensor): The weights of each token->expert mapping. + - topk_ids (torch.Tensor): The token->expert mappings. + - w1_scale (torch.Tensor): The fp32 scale to dequantize w1_q. + Shape: [num_experts] or [num_experts, 2N] + - w2_scale (torch.Tensor): The fp32 scale to dequantize w2_q. + Shape: [num_experts] or [num_experts, K] + - ab_strides1 (torch.Tensor): The input/weight strides for the first gemm. 
+ Shape: [num_experts] + - ab_strides2 (torch.Tensor): The input/weight strides for the second gemm. + Shape: [num_experts] + - c_strides1 (torch.Tensor): The output strides for the first gemm. + Shape: [num_experts] + - c_strides2 (torch.Tensor): The output strides for the second gemm. + Shape: [num_experts] + - per_act_token (Optional[bool]): Whether the scale is per-token or + per-tensor. + - activation (str): The activation function to use. + - a1_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize a. + Shape: scalar or [M] + - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to + quantize the intermediate result between the gemms. + Shape: scalar or [M] + - expert_map (Optional[torch.Tensor]): In the case of Expert parallel, + every Rank is responsible for a subset of experts. expert_map is a + mapping from global expert-id to local expert-id. When expert_map[i] + is -1, it means that this Rank is not responsible for global + expert-id i. + - apply_router_weight_on_input (bool): When true, the topk weights are + applied directly on the inputs. This is only applicable when topk is 1. + - global_num_experts (int): The total number of experts. + + Returns: + - torch.Tensor: The fp16 output tensor after applying the MoE layer. 
+ """ + assert quant_config is not None + + if quant_config.a1_scale is not None: + assert quant_config.per_act_token_quant == (quant_config.a1_scale.numel() != 1) + if quant_config.a2_scale is not None: + assert quant_config.per_act_token_quant == (quant_config.a2_scale.numel() != 1) + + if quant_config.w1_scale is not None: + if quant_config.per_out_ch_quant: + assert quant_config.w1_scale.dim() > 1 and quant_config.w1_scale.size( + 1 + ) == w1_q.size(1) + else: + assert ( + quant_config.w1_scale.dim() == 1 or quant_config.w1_scale.size(1) == 1 + ) + + num_experts = global_num_experts if global_num_experts != -1 else w1_q.size(0) + + fn = mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(), + CutlassExpertsFp8( + out_dtype=a.dtype, + ab_strides1=ab_strides1, + ab_strides2=ab_strides2, + c_strides1=c_strides1, + c_strides2=c_strides2, + quant_config=quant_config, + ), + ) + + return fn( + a, + w1_q, + w2_q, + topk_weights, + topk_ids, + activation=activation, + global_num_experts=num_experts, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input, + ) + + +FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() +FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max + + +def run_cutlass_moe_fp4( + output: torch.Tensor, + a: torch.Tensor, + a1_gscale: torch.Tensor, + w1_fp4: torch.Tensor, + w1_blockscale: torch.Tensor, + w1_alphas: torch.Tensor, + a2_gscale: torch.Tensor, + w2_fp4: torch.Tensor, + w2_blockscale: torch.Tensor, + w2_alphas: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + workspace13: torch.Tensor, + workspace2: torch.Tensor, + m: int, + n: int, + k: int, + e: int, + device: torch.device, + apply_router_weight_on_input: bool = False, +) -> None: + """ + MoE implementation for FP4 Inputs + + # Gemm 1 + a: Input tensor: [m, k] (half/bfloat16) + a1_gscale: Activation scale per expert: [e] (float32) + w1(gate up) (not an argument to cutlass_moe_fp4): [e, 2 * n, k] + w1_fp4: [e, 2 * n, k // 2], dtype: 
torch.uint8 (stacked fp4: E2M1) + (Note: `n` is the up projection output dim, `k` is the input dim in + full precision) + w1_blockscale: [e, 2 * n, k // block_size] (float8_e4m3) + (Block size = 16 for NVFP4) + + # Gemm 2 + a2_gscale: Activation scale per expert: [e] + w2(down projection) (not an argument to cutlass_moe_fp4): [e, k, n] + w2_fp4: [e, k, n // 2], dtype: torch.uint8 (stacked E2M1) + w2_blockscale: [e, k, n // block_size], dtype: float8_e4m3 + + topk_weights: [m, topk] dtype: float8 + topk_ids: [m, topk] dtype: float8 + + m, n, k: Unquantized weight shapes, dtype: int + e: number of experts, dtype: int + + assumes that topk < k < n to satisfy - up/down projection expectations. + """ + assert topk_weights.shape == topk_ids.shape, "topk shape mismatch" + assert w1_fp4.dtype == torch.uint8, "weight 1 must be uint8" + assert w2_fp4.dtype == torch.uint8, "weight 2 must be uint8" + assert ( + w1_fp4.ndim == 3 + and w2_fp4.ndim == 3 + and w1_blockscale.ndim == 3 + and w2_blockscale.ndim == 3 + ), "All Weights must be of rank 3 for cutlass_moe_fp4" + m_a, k_a = a.shape + e_w1, nx2_w1, half_k_w1 = w1_fp4.shape + e_w2, k_w2, half_n_w2 = w2_fp4.shape + + assert e_w1 == e_w2 and e_w1 == e, ( + "Number of experts must match", + f" between weights. 
{e_w1}, {e_w2}, {e}", + ) + assert k_a == half_k_w1 * 2 and k == k_w2, ( + "Hidden size mismatch between a, w1 and w2" + ) + assert nx2_w1 == n * 2 and half_n_w2 * 2 == n, "mismatch in expected `n`" + assert m == m_a, "input shape mismatch" + assert 2 * half_k_w1 == k_w2, "Hidden size mismatch w2 and w1" + assert a.dtype in [torch.half, torch.bfloat16], "Invalid input dtype" + assert topk_weights.size(0) == m and topk_ids.size(0) == m, ( + "topk must be provided for each row of a" + ) + topk = topk_ids.size(1) + out_dtype = a.dtype + num_topk = topk_ids.size(1) + + expert_offsets = torch.empty((e + 1), dtype=torch.int32, device=device) + blockscale_offsets = torch.empty((e + 1), dtype=torch.int32, device=device) + # Problem size: (num_experts, (m,2n,k)) + problem_sizes1 = torch.empty((e, 3), dtype=torch.int32, device=device) + # Problem size: (num_experts, (m,n,k)) + problem_sizes2 = torch.empty((e, 3), dtype=torch.int32, device=device) + + a_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device) + c_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device) + + if apply_router_weight_on_input: + # TODO: this only works for topK=1, will need to update for topK>1 + assert num_topk == 1, ( + "apply_router_weight_on_input is only implemented for topk=1" + ) + a.mul_(topk_weights.to(out_dtype)) + + # problem shapes should have [m, n, k] + # Note that problem sizes are based on logical number of elements. 
+ ops.get_cutlass_moe_mm_data( + topk_ids, + expert_offsets, + problem_sizes1, + problem_sizes2, + a_map, + c_map, + e, + n, + k, + blockscale_offsets, + ) + + a = ops.shuffle_rows(a, a_map) + rep_a_fp4, rep_a_blockscale = ops.scaled_fp4_experts_quant( + a, + a1_gscale, + expert_offsets, + blockscale_offsets, + num_topk, + ) + c1 = _resize_cache(workspace13, (m * topk, n * 2)) + c2 = _resize_cache(workspace2, (m * topk, n)) + c3 = _resize_cache(workspace13, (m * topk, k)) + ops.cutlass_fp4_moe_mm( + c1, + rep_a_fp4, + w1_fp4, + rep_a_blockscale, + w1_blockscale, + w1_alphas, + problem_sizes1, + expert_offsets[:-1], + blockscale_offsets[:-1], + ) + del rep_a_fp4, rep_a_blockscale + torch.ops._C.silu_and_mul(c2, c1) + int_fp4, int_blockscale = ops.scaled_fp4_experts_quant( + c2, a2_gscale, expert_offsets, blockscale_offsets, num_topk + ) + + ops.cutlass_fp4_moe_mm( + c3, + int_fp4, + w2_fp4, + int_blockscale, + w2_blockscale, + w2_alphas, + problem_sizes2, + expert_offsets[:-1], + blockscale_offsets[:-1], + ) + del int_fp4, int_blockscale + + c3 = ops.shuffle_rows(c3, c_map) + + assert output.dtype == out_dtype + if not apply_router_weight_on_input: + output.copy_( + ( + c3.view(m, num_topk, k) + * topk_weights.view(m, num_topk, 1).to(out_dtype) + ).sum(dim=1), + non_blocking=True, + ) + else: + output.copy_(c3.view(m, num_topk, k).sum(dim=1), non_blocking=True) + return + + +# Split into batched and non-batched +class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute): + def __init__( + self, + max_experts_per_worker: int, + out_dtype: torch.dtype, + quant_config: FusedMoEQuantConfig, + use_batched_format: bool = False, + ): + super().__init__(quant_config) + self.max_experts_per_worker = max_experts_per_worker + self.out_dtype = out_dtype + self.use_batched_format = use_batched_format + + @property + def activation_formats( + self, + ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: + if self.use_batched_format: + return ( + 
mk.FusedMoEActivationFormat.BatchedExperts, + mk.FusedMoEActivationFormat.BatchedExperts, + ) + else: + return ( + mk.FusedMoEActivationFormat.Standard, + mk.FusedMoEActivationFormat.Standard, + ) + + def supports_expert_map(self) -> bool: + return False + + def supports_chunking(self) -> bool: + return True + + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + return TopKWeightAndReduceNoOP() + + def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype: + return self.out_dtype if self.out_dtype is not None else act_dtype + + def workspace_shapes( + self, + M: int, + N: int, + K: int, + topk: int, + global_num_experts: int, + local_num_experts: int, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: + workspace1: tuple[int, ...] = () + workspace2: tuple[int, ...] = () + output: tuple[int, ...] = () + if self.use_batched_format: + workspace1 = (self.max_experts_per_worker, M, max(N, K)) + workspace2 = (self.max_experts_per_worker, M, (N // 2)) + output = (self.max_experts_per_worker, M, K) + else: + workspace1 = (M * topk, max(2 * N, K)) + workspace2 = (M * topk, N) + output = (M, K) + return (workspace1, workspace2, output) + + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: torch.Tensor | None, + a1q_scale: torch.Tensor | None, # unused + a2_scale: torch.Tensor | None, # unused + workspace13: torch.Tensor | None, + workspace2: torch.Tensor | None, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + apply_router_weight_on_input: bool, + ): + e, m, n, k, _ = self.moe_problem_size(hidden_states, w1, w2, topk_ids) + n = w2.shape[2] * 2 + + run_cutlass_moe_fp4( + output=output, + a=hidden_states, + a1_gscale=self.a1_gscale, + w1_fp4=w1, + w1_blockscale=self.w1_scale, + 
w1_alphas=self.g1_alphas, + a2_gscale=self.a2_gscale, + w2_fp4=w2, + w2_blockscale=self.w2_scale, + w2_alphas=self.g2_alphas, + topk_weights=topk_weights, + topk_ids=topk_ids, + workspace13=workspace13, + workspace2=workspace2, + m=m, + n=n, + k=k, + e=e, + device=hidden_states.device, + apply_router_weight_on_input=apply_router_weight_on_input, + ) + + +def cutlass_moe_fp4( + a: torch.Tensor, + w1_fp4: torch.Tensor, + w2_fp4: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + quant_config: FusedMoEQuantConfig, + m: int, + n: int, + k: int, + e: int, + expert_map: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, +) -> torch.Tensor: + assert expert_map is None, ( + "Expert Parallelism / expert_map " + "is currently not supported for " + "ModelOptNvFp4FusedMoE's cutlass_moe_fp4." + ) + + # TODO(bnell): this feels a bit hacky + # NVFP4 requires two levels of quantization, which involves + # computing some scaling factors dynamically. This makes it + # incompatible with the typical prepare -> MoE -> finalize + # pipeline. Move the quantization logic into the MoE body. 
+ quant_config = FusedMoEQuantConfig.make( + quant_dtype=None, # skip quantization in prepare/finalize + per_act_token_quant=quant_config.per_act_token_quant, + per_out_ch_quant=quant_config.per_out_ch_quant, + block_shape=quant_config.block_shape, + g1_alphas=quant_config.g1_alphas, + g2_alphas=quant_config.g2_alphas, + a1_gscale=quant_config.a1_gscale, + a2_gscale=quant_config.a2_gscale, + w1_scale=quant_config.w1_scale, + w2_scale=quant_config.w2_scale, + ) + + fn = mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(), + CutlassExpertsFp4( + max_experts_per_worker=e, + out_dtype=a.dtype, + quant_config=quant_config, + use_batched_format=False, + ), + ) + + return fn( + hidden_states=a, + w1=w1_fp4, + w2=w2_fp4, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=False, + activation="silu", + global_num_experts=e, + expert_map=None, + apply_router_weight_on_input=apply_router_weight_on_input, + ) + + +def _valid_cutlass_block_scaled_grouped_gemm( + w1: torch.Tensor, + w2: torch.Tensor, + inplace: bool, + activation: str, + apply_router_weight_on_input: bool, + expert_map: torch.Tensor | None, +) -> bool: + def _valid_cutlass_block_scaled_grouped_gemm_shape(N: int, K: int): + return N % 128 == 0 and K % 128 == 0 + + _, K, N = w2.size() + if not _valid_cutlass_block_scaled_grouped_gemm_shape(N, K): + logger.debug_once( + "CutlassBlockScaledGroupedGemm disabled: unaligned problem size. " + "N: %s, K: %s", + N, + K, + ) + return False + + if w1.dtype != torch.float8_e4m3fn or w2.dtype != torch.float8_e4m3fn: + logger.debug_once( + "CutlassBlockScaledGroupedGemm disabled: invalid weight dtype(s). " + "w1.dtype: %s, w2.dtype: %s", + w1.dtype, + w2.dtype, + ) + return False + + if expert_map is not None: + logger.debug_once( + "CutlassBlockScaledGroupedGemm disabled: expert_parallel is not supported." + ) + return False + + if activation != "silu": + logger.debug_once( + "CutlassBlockScaledGroupedGemm disabled: only activation silu is supported." 
+ ) + return False + + if apply_router_weight_on_input: + logger.debug_once( + "CutlassBlockScaledGroupedGemm disabled:" + " apply_router_weight_on_input is not supported." + ) + return False + + if inplace: + logger.debug_once( + "CutlassBlockScaledGroupedGemm disabled: inplace is not supported." + ) + return False + + return True + + +# TODO(bnell): would be nice combine/integrate with regular cutlass_fp8. +def run_cutlass_block_scaled_fused_experts( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, +) -> torch.Tensor: + w1_q = w1.transpose(1, 2) + w2_q = w2.transpose(1, 2) + w1_scale = w1_scale.transpose(1, 2) + w2_scale = w2_scale.transpose(1, 2) + + assert topk_weights.shape == topk_ids.shape, "topk shape mismatch" + assert a.shape[0] == topk_ids.shape[0], ( + "a and topk_ids must have the same batch size" + ) + assert w1_q.dtype == torch.float8_e4m3fn, "w1_q must be float8_e4m3fn" + assert w2_q.dtype == torch.float8_e4m3fn, "w2_q must be float8_e4m3fn" + assert a.shape[1] == w1_q.shape[1], "Hidden size mismatch w1" + assert w1_q.shape[2] == w2_q.shape[1] * 2, "Hidden size mismatch w2" + assert w1_q.shape[0] == w2_q.shape[0], "Expert number mismatch" + assert w1_q.shape[0] == w1_scale.shape[0], "w1_scale expert number mismatch" + assert w1_q.shape[0] == w2_scale.shape[0], "w2_scale expert number mismatch" + assert a.dtype in [torch.half, torch.bfloat16], "Invalid output dtype" + + out_dtype = a.dtype + num_experts = w1_q.size(0) + m = a.size(0) + k = w1_q.size(1) + n = w2_q.size(1) + + topk = topk_ids.size(1) + + a_q, a1_scale = _fp8_quantize( + a, A_scale=None, per_act_token=False, block_shape=[128, 128] + ) + device = a_q.device + + expert_offsets = torch.empty((num_experts + 1,), dtype=torch.int32, device=device) + problem_sizes1 = torch.empty((num_experts, 3), dtype=torch.int32, device=device) + problem_sizes2 = torch.empty((num_experts, 3), 
dtype=torch.int32, device=device) + + a_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device) + c_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device) + + ops.get_cutlass_moe_mm_data( + topk_ids, + expert_offsets, + problem_sizes1, + problem_sizes2, + a_map, + c_map, + num_experts, + n, + k, + ) + + rep_a_q = a_q.view(dtype=torch.uint8)[a_map].view(dtype=a_q.dtype) + rep_a1_scales = a1_scale[a_map] + + c1 = torch.empty((m * topk, n * 2), dtype=out_dtype, device=device) + c2 = torch.empty((m * topk, k), dtype=out_dtype, device=device) + + ops.cutlass_blockwise_scaled_grouped_mm( + c1, + rep_a_q, + w1_q, + rep_a1_scales, + w1_scale, + problem_sizes1, + expert_offsets[:-1], + ) + + intermediate = torch.empty((m * topk, n), dtype=out_dtype, device=device) + torch.ops._C.silu_and_mul(intermediate, c1) + + intermediate_q, a2_scale = _fp8_quantize( + intermediate, A_scale=None, per_act_token=False, block_shape=[128, 128] + ) + + ops.cutlass_blockwise_scaled_grouped_mm( + c2, + intermediate_q, + w2_q, + a2_scale, + w2_scale, + problem_sizes2, + expert_offsets[:-1], + ) + + return ( + c2[c_map].view(m, topk, k) * topk_weights.view(m, topk, 1).to(out_dtype) + ).sum(dim=1) diff --git a/model_executor/layers/fused_moe/deep_gemm_moe.py b/model_executor/layers/fused_moe/deep_gemm_moe.py new file mode 100644 index 0000000..86cdd25 --- /dev/null +++ b/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -0,0 +1,387 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch +from tqdm import tqdm + +import vllm.envs as env +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEQuantConfig, + fp8_w8a8_moe_quant_config, +) +from vllm.model_executor.layers.fused_moe.deep_gemm_utils import ( + compute_aligned_M, + deepgemm_moe_permute, + deepgemm_unpermute_and_reduce, 
+) +from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP, +) +from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( + TopKWeightAndReduceNoOP, +) +from vllm.model_executor.layers.fused_moe.utils import _resize_cache +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + per_token_group_quant_fp8, +) +from vllm.utils.deep_gemm import ( + get_mk_alignment_for_contiguous_layout, + m_grouped_fp8_gemm_nt_contiguous, +) +from vllm.utils.func_utils import run_once +from vllm.utils.import_utils import has_deep_gemm + +logger = init_logger(__name__) + + +def _valid_deep_gemm_shape(M: int, N: int, K: int) -> bool: + align = get_mk_alignment_for_contiguous_layout()[0] + return align <= M and N % align == 0 and K % align == 0 + + +def _valid_deep_gemm( + hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor +) -> bool: + """ + Check if the given problem size is supported by the DeepGemm grouped + gemm kernel. All of M, N, K and the quantization block_shape must be + aligned by `dg.get_m_alignment_for_contiguous_layout()`. + """ + if not has_deep_gemm(): + logger.debug_once("DeepGemm disabled: deep_gemm not available.") + return False + + M = hidden_states.size(0) + _, K, N = w2.size() + + align = get_mk_alignment_for_contiguous_layout()[0] + + if not _valid_deep_gemm_shape(M, N, K): + logger.debug_once( + "DeepGemm disabled due to unaligned problem size. " + "M: %s, N: %s, K: %s. M should >= %s " + "and N and K must be multiples of %s. " + "This is not an error and we will fall back to triton.", + M, + N, + K, + align, + align, + ) + return False + elif N <= 512: + logger.debug_once( + "DeepGemm disabled for N <= 512. M: %s, N: %s, K: %s. 
" + "This means we will fallback to triton " + "for this specific shape for further speed up.", + M, + N, + K, + ) + return False + + if w1.dtype != torch.float8_e4m3fn or w2.dtype != torch.float8_e4m3fn: + logger.debug_once( + "DeepGemm disabled: invalid weight dtype(s). w1.dtype: %s, w2.dtype: %s", + w1.dtype, + w2.dtype, + ) + return False + + if ( + not hidden_states.is_contiguous() + or not w1.is_contiguous() + or not w2.is_contiguous() + ): + logger.debug_once( + "DeepGemm disabled: weights or activations not contiguous. " + "hidden_states.is_contiguous(): %s, w1.is_contiguous(): %s, " + "w2.is_contiguous(): %s", + hidden_states.is_contiguous(), + w1.is_contiguous(), + w2.is_contiguous(), + ) + return False + + return True + + +@run_once +def warmup_deepgemm_gg_contiguous_kernels( + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + num_topk: int, +): + """ + DeepGemm JITs the grouped-gemm kernels. The JIT'ing happens based on the + input tensor shapes. In this function, we construct all possible input + tensor shapes so all the kernels are JIT'ed and cached. + Note that this warmup is expected to happen during the model profile + call and not during actual model inference. + """ + + assert w1.size(0) == w2.size(0), "w1 and w2 must have the same number of experts" + + block_m = get_mk_alignment_for_contiguous_layout()[0] + num_experts = w1.size(0) + device = w1.device + + # This is the maximum GroupedGemm M size that we expect to run + # the grouped_gemm with. + MAX_M = compute_aligned_M( + env.VLLM_FUSED_MOE_CHUNK_SIZE, + num_topk, + num_experts, + block_m, + expert_tokens_meta=None, + ) + # Distribute expert-ids evenly. 
+ MAX_BLOCKS = MAX_M // block_m + expert_ids_block = torch.randint( + low=0, high=num_experts, size=(MAX_BLOCKS,), device=device, dtype=torch.int32 + ) + expert_ids = torch.repeat_interleave(expert_ids_block, block_m, dim=0) + + def _warmup(w: torch.Tensor, w_scale: torch.Tensor): + _, n, k = w.size() + a1q = torch.empty((MAX_M, k), device=device).to(torch.float8_e4m3fn) + a1q_scales = torch.empty( + (MAX_M, k // block_m), device=device, dtype=torch.float32 + ) + out = torch.empty((MAX_M, n), device=device, dtype=torch.bfloat16) + + pbar = tqdm( + total=MAX_BLOCKS, desc=f"DeepGemmExperts GEMM warmup (MAX_M={MAX_M})" + ) + num_tokens = MAX_M + while num_tokens > 0: + m_grouped_fp8_gemm_nt_contiguous( + (a1q[:num_tokens], a1q_scales[:num_tokens]), + (w, w_scale), + out[:num_tokens], + expert_ids[:num_tokens], + ) + pbar.update(1) + num_tokens = num_tokens - block_m + + _warmup(w1, w1_scale) + _warmup(w2, w2_scale) + + +class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): + def __init__(self, quant_config: FusedMoEQuantConfig): + super().__init__(quant_config) + assert quant_config.block_shape == get_mk_alignment_for_contiguous_layout() + assert quant_config.quant_dtype == torch.float8_e4m3fn + assert not quant_config.per_act_token_quant + assert not quant_config.per_out_ch_quant + + @property + def activation_formats( + self, + ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: + return ( + mk.FusedMoEActivationFormat.Standard, + mk.FusedMoEActivationFormat.Standard, + ) + + def supports_chunking(self) -> bool: + return True + + def supports_expert_map(self) -> bool: + return True + + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + return TopKWeightAndReduceNoOP() + + def workspace_shapes( + self, + M: int, + N: int, + K: int, + topk: int, + global_num_experts: int, + local_num_experts: int, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: + assert 
def apply(
    self,
    output: torch.Tensor,
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    activation: str,
    global_num_experts: int,
    expert_map: torch.Tensor | None,
    a1q_scale: torch.Tensor | None,
    a2_scale: torch.Tensor | None,
    workspace13: torch.Tensor,
    workspace2: torch.Tensor,
    expert_tokens_meta: mk.ExpertTokensMetadata | None,
    apply_router_weight_on_input: bool,
):
    """
    Run the expert computation: permute, gemm, activation, quantize, gemm,
    then un-permute and reduce into `output`.

    `hidden_states` is used as the already-quantized activation and
    `a1q_scale` as its scale (it must not be None). `a2_scale` must be None;
    the intermediate activation is re-quantized inside this method.

    `workspace13` and `workspace2` are scratch buffers that are reused for
    several intermediates, so the statements below must stay in this order.
    Nothing is returned; the result is written to `output`.
    """
    assert a1q_scale is not None
    assert a2_scale is None
    assert self.block_shape is not None
    assert self.w1_scale is not None
    assert self.w2_scale is not None

    a1q = hidden_states
    # N and K are taken from w1's last two dimensions.
    _, N, K = w1.size()

    local_num_experts = w1.size(0)
    # NOTE(review): global_num_experts is defaulted here but not read again
    # in this method.
    if global_num_experts == -1:
        global_num_experts = local_num_experts

    assert w2.size(1) == K

    # Number of rows in the permuted, per-expert-aligned activation matrix.
    M_sum = compute_aligned_M(
        M=topk_ids.size(0),
        num_topk=topk_ids.size(1),
        local_num_experts=local_num_experts,
        alignment=get_mk_alignment_for_contiguous_layout()[0],
        expert_tokens_meta=expert_tokens_meta,
    )

    # Scratch layout: a1q_perm, act_out and mm2_out are all carved out of
    # workspace2; mm1_out and quant_out are both carved out of workspace13.
    # Presumably _resize_cache returns a view of the buffer it is given, so
    # buffers sharing a workspace alias each other - TODO confirm.
    a1q_perm = _resize_cache(workspace2.view(dtype=torch.float8_e4m3fn), (M_sum, K))
    mm1_out = _resize_cache(workspace13, (M_sum, N))
    act_out = _resize_cache(workspace2, (M_sum, N // 2))
    quant_out = _resize_cache(
        workspace13.view(dtype=torch.float8_e4m3fn), (M_sum, N // 2)
    )
    mm2_out = _resize_cache(workspace2, (M_sum, K))

    # Group rows by expert. inv_perm is handed back to the un-permute step
    # at the end of this method.
    a1q, a1q_scale, expert_ids, inv_perm = deepgemm_moe_permute(
        aq=a1q,
        aq_scale=a1q_scale,
        topk_ids=topk_ids,
        local_num_experts=local_num_experts,
        expert_map=expert_map,
        expert_tokens_meta=expert_tokens_meta,
        aq_out=a1q_perm,
    )
    assert a1q.size(0) == M_sum

    # First grouped gemm: permuted activations x w1 -> mm1_out.
    m_grouped_fp8_gemm_nt_contiguous(
        (a1q, a1q_scale), (w1, self.w1_scale), mm1_out, expert_ids
    )

    # Activation reads mm1_out (N columns) and writes act_out (N // 2 columns).
    self.activation(activation, act_out, mm1_out.view(-1, N))

    # Re-quantize the activation output to fp8, writing into quant_out.
    a2q_scale: torch.Tensor | None = None
    a2q, a2q_scale = per_token_group_quant_fp8(
        act_out, self.block_shape[1], column_major_scales=True, out_q=quant_out
    )

    # Second grouped gemm: quantized activations x w2 -> mm2_out.
    m_grouped_fp8_gemm_nt_contiguous(
        (a2q, a2q_scale), (w2, self.w2_scale), mm2_out, expert_ids
    )

    # Presumably the router weights were already multiplied into the input
    # in this mode, so the reduction below uses unit weights - TODO confirm.
    if apply_router_weight_on_input:
        topk_weights = torch.ones_like(topk_weights)

    # Undo the permutation and combine the top-k rows of each token into
    # `output`.
    deepgemm_unpermute_and_reduce(
        a=mm2_out,
        topk_ids=topk_ids,
        topk_weights=topk_weights,
        inv_perm=inv_perm,
        expert_map=expert_map,
        output=output,
    )
+ Shape: [num_experts] or [num_experts, K] + - topk_weights (torch.Tensor): The weights of each token->expert mapping. + - topk_ids (torch.Tensor): The token->expert mapping for topk_weights. + - inplace (bool): If True, perform the operation in-place. + Defaults to False. + - activation (str): The activation function to apply after the first + MoE layer. + - global_num_experts (int): The total number of experts in the global + expert space. + - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices + from the global expert space to the local expert space of the expert + parallel shard. + - a1_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize a. + Shape: scalar or [M] + - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to + quantize the intermediate result between the gemms. + Shape: scalar or [M] + + Returns: + - torch.Tensor: The bfloat16 output tensor after applying the MoE layer. + """ + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=get_mk_alignment_for_contiguous_layout(), + ) + + fn = mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(), + DeepGemmExperts(quant_config), + ) + return fn( + hidden_states, + w1, + w2, + topk_weights, + topk_ids, + inplace=inplace, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input, + ) diff --git a/model_executor/layers/fused_moe/deep_gemm_utils.py b/model_executor/layers/fused_moe/deep_gemm_utils.py new file mode 100644 index 0000000..6cca954 --- /dev/null +++ b/model_executor/layers/fused_moe/deep_gemm_utils.py @@ -0,0 +1,416 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Taken from https://github.com/ModelTC/LightLLM/blob/8ed97c74c18f11505b048b1ba00ba5c0cef8bff6/lightllm/common/fused_moe/deepep_scatter_gather.py +and 
def expert_num_tokens_round_up_and_sum(
    expert_num_tokens: torch.Tensor, alignment: int
) -> int:
    """
    Pad every per-expert token count up to a multiple of `alignment` and
    return the total of the padded counts as a Python scalar.
    """
    # Counts are widened to int64 before any arithmetic is done on them.
    counts = expert_num_tokens.to(torch.int64)
    # Ceil-divide into whole blocks, then scale back up to a row count.
    blocks_per_expert = (counts + (alignment - 1)) // alignment
    padded = blocks_per_expert * alignment
    return padded.sum().item()
@triton.jit
def _fwd_kernel_ep_scatter_1(
    num_recv_tokens_per_expert,
    expert_start_loc,
    m_indices,
    num_experts: tl.constexpr,
    BLOCK_E: tl.constexpr,
    BLOCK_EXPERT_NUM: tl.constexpr,
):
    # First scatter stage. It fills two outputs:
    #   expert_start_loc[e] - start row of expert e in the padded layout
    #   m_indices[row]      - expert id owning each row of expert e's region
    # The program id along axis 0 is used as the expert handled by this
    # program.
    cur_expert = tl.program_id(0)

    # Load all per-expert counts; lanes beyond num_experts read as 0.
    offset_cumsum = tl.arange(0, BLOCK_EXPERT_NUM)
    tokens_per_expert = tl.load(
        num_recv_tokens_per_expert + offset_cumsum,
        mask=offset_cumsum < num_experts,
        other=0,
    )
    # Pad each count to a multiple of 128, then take the exclusive prefix
    # sum (inclusive cumsum minus the element itself) to get start offsets.
    tokens_per_expert = round_up_128(tokens_per_expert)
    cumsum = tl.cumsum(tokens_per_expert) - tokens_per_expert
    # Every program computes and stores the whole offset table; the store is
    # not restricted to cur_expert.
    tl.store(expert_start_loc + offset_cumsum, cumsum, mask=offset_cumsum < num_experts)

    cur_expert_start = tl.load(expert_start_loc + cur_expert)
    cur_expert_token_num = tl.load(num_recv_tokens_per_expert + cur_expert)

    m_indices_start_ptr = m_indices + cur_expert_start
    off_expert = tl.arange(0, BLOCK_E)

    # Write cur_expert over this expert's rows in chunks of BLOCK_E. The
    # store is unmasked, so the last chunk is written in full even when the
    # unpadded count is not a multiple of BLOCK_E.
    # NOTE(review): the padding above is hard-coded to 128 while the chunk
    # size is BLOCK_E; these agree only when BLOCK_E == 128 - confirm.
    for start_m in tl.range(0, cur_expert_token_num, BLOCK_E, num_stages=4):
        tl.store(
            m_indices_start_ptr + start_m + off_expert,
            cur_expert,
        )
@torch.no_grad()
def ep_scatter(
    recv_x: torch.Tensor,
    recv_x_scale: torch.Tensor,
    recv_topk: torch.Tensor,
    num_recv_tokens_per_expert: torch.Tensor,
    expert_map: torch.Tensor | None,
    expert_start_loc: torch.Tensor,
    output_tensor: torch.Tensor,
    output_tensor_scale: torch.Tensor,
    m_indices: torch.Tensor,
    output_index: torch.Tensor,
):
    """
    Launch the two scatter kernels.

    The first launch fills `expert_start_loc` and `m_indices` from
    `num_recv_tokens_per_expert`. The second launch copies the rows of
    `recv_x` / `recv_x_scale` into `output_tensor` / `output_tensor_scale`
    and records each destination row in `output_index`. All results are
    written into the tensors passed in; nothing is returned.
    """
    rows_per_expert_block = 128  # token num of per expert is aligned to 128
    quant_group_size = 128  # block size of quantization
    warps = 8

    expert_count = num_recv_tokens_per_expert.shape[0]
    hidden = recv_x.shape[1]

    assert m_indices.shape[0] % rows_per_expert_block == 0

    # Stage 1: one program per expert.
    stage_one = _fwd_kernel_ep_scatter_1[(expert_count,)]
    stage_one(
        num_recv_tokens_per_expert,
        expert_start_loc,
        m_indices,
        num_experts=expert_count,
        num_warps=warps,
        BLOCK_E=rows_per_expert_block,
        BLOCK_EXPERT_NUM=triton.next_power_of_2(expert_count),
    )

    # Stage 2: at most 8192 programs, each looping over tokens.
    token_count = recv_topk.shape[0]
    scale_width = hidden // quant_group_size
    stage_two = _fwd_kernel_ep_scatter_2[(min(token_count, 1024 * 8),)]
    stage_two(
        token_count,
        expert_start_loc,
        recv_x,
        recv_x.stride(0),
        recv_x.stride(1),
        recv_x_scale,
        recv_x_scale.stride(0),
        recv_x_scale.stride(1),
        recv_topk,
        recv_topk.stride(0),
        recv_topk.stride(1),
        output_tensor,
        output_tensor.stride(0),
        output_tensor.stride(1),
        output_tensor_scale,
        output_tensor_scale.stride(0),
        output_tensor_scale.stride(1),
        output_index,
        output_index.stride(0),
        output_index.stride(1),
        topk_num=recv_topk.shape[1],
        expert_map=expert_map,
        HAS_EXPERT_MAP=expert_map is not None,
        num_warps=warps,
        HIDDEN_SIZE=hidden,
        HIDDEN_SIZE_PAD=triton.next_power_of_2(hidden),
        SCALE_HIDDEN_SIZE=scale_width,
        SCALE_HIDDEN_SIZE_PAD=triton.next_power_of_2(scale_width),
    )
@torch.no_grad()
def ep_gather(
    input_tensor: torch.Tensor,
    recv_topk_ids: torch.Tensor,
    recv_topk_weight: torch.Tensor,
    input_index: torch.Tensor,
    expert_map: torch.Tensor | None,
    output_tensor: torch.Tensor,
):
    """
    Launch the gather kernel that writes the weighted combination of each
    token's top-k rows of `input_tensor` into `output_tensor`.

    `input_index` supplies, per (token, top-k slot), the row of
    `input_tensor` to read. The result is written in place; nothing is
    returned.
    """
    warps = 2
    token_count = output_tensor.shape[0]
    hidden = input_tensor.shape[1]

    # Columns are processed in tiles of at most 1024; the hidden size has
    # to split into whole tiles.
    tile = min(hidden, 1024)
    assert hidden % tile == 0

    # Axis 0 walks column tiles, axis 1 walks tokens (capped at 1024).
    launch_grid = (triton.cdiv(hidden, tile), min(token_count, 1024))
    gather_kernel = _fwd_kernel_ep_gather[launch_grid]
    gather_kernel(
        token_count,
        input_tensor,
        input_tensor.stride(0),
        input_tensor.stride(1),
        recv_topk_ids,
        recv_topk_ids.stride(0),
        recv_topk_ids.stride(1),
        recv_topk_weight,
        recv_topk_weight.stride(0),
        recv_topk_weight.stride(1),
        input_index,
        input_index.stride(0),
        input_index.stride(1),
        output_tensor,
        output_tensor.stride(0),
        output_tensor.stride(1),
        topk_num=recv_topk_ids.shape[1],
        expert_map=expert_map,
        HAS_EXPERT_MAP=expert_map is not None,
        num_warps=warps,
        BLOCK_D=tile,
    )
def deepgemm_unpermute_and_reduce(
    a: torch.Tensor,  # Grouped gemm output
    topk_ids: torch.Tensor,
    topk_weights: torch.Tensor,
    inv_perm: torch.Tensor,
    expert_map: torch.Tensor | None,
    output: torch.Tensor,
):
    """
    Forward the grouped-gemm output to `ep_gather`, translating this
    module's argument names to the gather routine's names. Whatever
    `ep_gather` returns is returned unchanged.
    """
    gathered = ep_gather(
        output_tensor=output,
        input_tensor=a,
        input_index=inv_perm,
        recv_topk_ids=topk_ids,
        recv_topk_weight=topk_weights,
        expert_map=expert_map,
    )
    return gathered
a/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py new file mode 100644 index 0000000..b3093a3 --- /dev/null +++ b/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -0,0 +1,420 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable + +import deep_ep +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig +from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( + TopKWeightAndReduceContiguous, + TopKWeightAndReduceDelegate, +) +from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input +from vllm.utils.math_utils import round_up +from vllm.v1.worker.ubatching import ( + dbo_current_ubatch_id, + dbo_enabled, + dbo_get_previous_event, + dbo_switch_to_comm, + dbo_switch_to_compute, + dbo_switch_to_compute_sync, + dbo_yield_and_switch_from_comm_to_compute, + dbo_yield_and_switch_from_compute_to_comm, +) + + +class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): + """ + Prepare/Finalize using DeepEP High-Throughput kernels. + """ + + @staticmethod + def maybe_roundup_layer_hidden_size(hidden_size: int, dtype: torch.dtype) -> int: + # Round up hidden size so it is compatible with DeepEP High Throughput + # kernels. + # DeepEP intranode kernels make copies in units of, + # 32(warp-size) int4 elements. Round up hidden size to respect this. + # For example, an input hidden size of 2880 with dtype torch.bfloat16 + # will be rounded up to 3072. 
+ hidden_size_bytes = hidden_size * dtype.itemsize + xfer_atom_size = 512 # 32 * 16 (size(int4)) + if hidden_size_bytes % xfer_atom_size == 0: + return hidden_size + + hidden_size_bytes = round_up(hidden_size_bytes, xfer_atom_size) + return hidden_size_bytes // dtype.itemsize + + def __init__( + self, + buffer: deep_ep.Buffer, + num_dispatchers: int, + dp_size: int, + rank_expert_offset: int, + ): + super().__init__() + self.buffer = buffer + self.num_dispatchers_ = num_dispatchers + self.dp_size = dp_size + self.rank_expert_offset = rank_expert_offset + self.async_prepare = True + + # The dispatch function returns a handle that the combine function + # requires. Under DBO microbatching we must track one handle per + # micro-batch to avoid races between threads. + self.handles = [None, None] + + # From https://github.com/deepseek-ai/DeepEP/blob/9fe9021f29c9083cd1808ab36b740208524d9f63/deep_ep/buffer.py#L164 + self.available_rank_configs = [2, 4, 8, 16, 24, 32, 64, 128, 144, 160] + + def num_dispatchers(self) -> int: + return self.num_dispatchers_ + + def output_is_reduced(self) -> bool: + return True + + @property + def activation_format(self) -> mk.FusedMoEActivationFormat: + return mk.FusedMoEActivationFormat.Standard + + def max_num_tokens_per_rank(self) -> int | None: + return None + + def topk_indices_dtype(self) -> torch.dtype | None: + return torch.int64 + + def _get_dispatch_config(self) -> deep_ep.Config | None: + if self.num_dispatchers_ not in self.available_rank_configs: + return None + return deep_ep.Buffer.get_dispatch_config(self.num_dispatchers_) + + def _get_combine_config(self) -> deep_ep.Config | None: + if self.num_dispatchers_ not in self.available_rank_configs: + return None + return deep_ep.Buffer.get_combine_config(self.num_dispatchers_) + + def _do_dispatch( + self, + tokens: torch.Tensor, + token_scales: torch.Tensor | None, + rank_topk_ids: torch.Tensor, + rank_topk_weights: torch.Tensor, + num_experts: int, + a1_scale: torch.Tensor | 
None, + quant_config: FusedMoEQuantConfig, + ) -> Callable: + has_scales = token_scales is not None + + # We yield before launching the dispatch kernel since the dispatch + # kernel will block the CPU so we want to queue up all the compute + # for the other ubatch before the dispatch kernel starts. + dbo_yield_and_switch_from_compute_to_comm() + + # capture a DeepEP event and pass it as previous_event so + # DeepEP honors the dependency internally. + previous_event = dbo_get_previous_event(self.buffer.capture) + + ( + num_tokens_per_rank, + num_tokens_per_rdma_rank, + dispatch_expert_num_tokens, + is_token_in_rank, + event, + ) = self.buffer.get_dispatch_layout( + topk_idx=rank_topk_ids.long(), + num_experts=num_experts, + previous_event=previous_event, + async_finish=False, + allocate_on_comm_stream=False, + ) + + token_data = tokens + if has_scales: + token_data = (tokens, token_scales) + + ( + token_data, + expert_topk_ids, + expert_topk_weights, + expert_num_tokens_per_expert_list, + handle, + event, + ) = self.buffer.dispatch( + x=token_data, + handle=None, + num_tokens_per_rank=num_tokens_per_rank, + num_tokens_per_rdma_rank=num_tokens_per_rdma_rank, + is_token_in_rank=is_token_in_rank, + num_tokens_per_expert=dispatch_expert_num_tokens, + topk_idx=rank_topk_ids.long(), + topk_weights=rank_topk_weights, + # expert_alignment rounds the number of tokens per expert + # to this value. 
def _receiver(
    self,
    event: deep_ep.EventOverlap,
    has_scales: bool,
    token_data: tuple[torch.Tensor, torch.Tensor] | torch.Tensor,
    expert_topk_ids: torch.Tensor | None,
    num_experts: int,
    expert_num_tokens_per_expert_list: list[int],
    expert_topk_weights: torch.Tensor | None,
    a1_scale: torch.Tensor | None,
    quant_config: FusedMoEQuantConfig,
) -> mk.PrepareResultType:
    """
    Second half of prepare: turn the dispatch results into the tuple the
    modular kernel expects.

    It waits on `event` when one was recorded, unpacks the dispatched
    activations, moves the top-k ids into the global expert space, builds
    the per-expert token metadata and, when the config is not
    block-quantized, quantizes the activations.

    Returns:
        (expert_x, expert_x_scale, expert_tokens_meta, expert_topk_ids,
        expert_topk_weights)
    """
    # Only wait when the dispatch actually recorded an event.
    if event.event is not None:
        event.current_stream_wait()

    # token_data is an (activations, scales) pair only when scales were
    # dispatched; otherwise it is the activation tensor itself.
    if has_scales:
        expert_x, expert_x_scale = token_data
    else:
        expert_x, expert_x_scale = token_data, None

    # The existing MOE kernels assume that all entries of topk_ids are
    # valid. To that effect, set the -1s in expert_topk_ids to some expert
    # outside this rank so the expert_map can remap it to -1 when safe.
    # With Expert Parallel, the experts are divided amongst the rank
    # sequentially. For rank 0, set it to num_experts - 1 and for all other
    # ranks set it to 0 as we know that expert_map will have a -1 in those
    # regions for those ranks.
    #
    # DeepEP's topk_ids output refers to the local experts directly. Offset
    # the topk_ids to move it back to the global experts space so it aligns
    # with existing vLLM interfaces.
    assert expert_topk_ids is not None
    expert_topk_ids = torch.where(
        expert_topk_ids == -1,
        num_experts - 1 if self.rank_expert_offset == 0 else 0,
        expert_topk_ids + self.rank_expert_offset,
    )

    # Makes a GPU-CPU copy.
    # TODO (varun): Maybe it is better to re-compute the expert_num_tokens
    # on GPU.
    # NOTE(review): the input here is a Python list and `device` is the
    # activation's device, so this looks like a host-to-device transfer -
    # confirm against ExpertTokensMetadata.make_from_list.
    expert_tokens_meta = mk.ExpertTokensMetadata.make_from_list(
        expert_num_tokens_per_expert_list, device=expert_x.device
    )

    # Dispatch and Quant
    # DeepEP kernels only support dispatching block-quantized
    # activation scales.
    # Dispatch in bfloat16 and quantize afterwards
    if not quant_config.is_block_quantized:
        # Quantize after dispatch.
        expert_x_scale = None
        # Quantization is skipped when no tokens were received; the scale
        # then stays None.
        if expert_x.numel() != 0:
            expert_x, expert_x_scale = moe_kernel_quantize_input(
                expert_x,
                a1_scale,
                quant_dtype=quant_config.quant_dtype,
                per_act_token_quant=False,
                block_shape=quant_config.block_shape,
            )

    return (
        expert_x,
        expert_x_scale,
        expert_tokens_meta,
        expert_topk_ids,
        expert_topk_weights,
    )
def _finalize(
    self,
    output: torch.Tensor,
    fused_expert_output: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    apply_router_weight_on_input: bool,
    weight_and_reduce_impl: mk.TopKWeightAndReduce,
    do_async: bool,
) -> Callable | None:
    """
    Shared implementation of `finalize` and `finalize_async`.

    Applies the top-k weight-and-reduce step to `fused_expert_output`
    (when it holds any tokens), runs the DeepEP combine using the handle
    saved for the current micro-batch, and copies the combined result into
    `output`.

    Returns:
        When `do_async` is True, a zero-argument callable that waits for
        the combine and performs the copy into `output`. When `do_async`
        is False, the copy is issued here and None is returned.
    """
    # Fetch the handle stored for this micro-batch by the dispatch step.
    a2a_idx = dbo_current_ubatch_id()
    handle = self.handles[a2a_idx]
    assert handle is not None

    # fused_expert_output can have 0 tokens - This happens when none of the
    # tokens from the all2all reach this EP rank.
    if fused_expert_output.numel() != 0:
        # A delegate placeholder is replaced by the contiguous
        # implementation before it is applied.
        if isinstance(weight_and_reduce_impl, TopKWeightAndReduceDelegate):
            weight_and_reduce_impl = TopKWeightAndReduceContiguous()
        # output=None: the reduced tensor is taken from the return value
        # rather than from a caller-provided buffer.
        fused_expert_output = weight_and_reduce_impl.apply(
            output=None,
            fused_expert_output=fused_expert_output,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            apply_router_weight_on_input=apply_router_weight_on_input,
        )
    dbo_yield_and_switch_from_compute_to_comm()
    assert fused_expert_output.dtype == torch.bfloat16, (
        f"Expected fused_expert_output bfloat16, got {fused_expert_output.dtype}"
    )
    previous_event = dbo_get_previous_event(self.buffer.capture)
    # The combine's async_finish is requested only for the async path and
    # only when DBO is not enabled.
    combined_x, _, event = self.buffer.combine(
        # HT combine only supports BF16
        x=fused_expert_output,
        handle=handle,
        topk_weights=None,
        config=self._get_combine_config(),
        previous_event=previous_event,
        async_finish=do_async and not dbo_enabled(),
        allocate_on_comm_stream=False,
    )

    dbo_switch_to_compute()

    if do_async:

        def _receiver():
            # Deferred completion: wait for the combine (if it recorded an
            # event), then copy the result into the caller's tensor.
            if event.event is not None:
                event.current_stream_wait()
            dbo_switch_to_comm()
            # Respect inplace outputs.
            output.copy_(combined_x, non_blocking=True)

            # TODO(lucas): refactor the modular kernel so this will be
            # handled there
            dbo_yield_and_switch_from_comm_to_compute()

        return _receiver
    else:
        # TODO(lucas): support this case with the refactored modular kernel
        assert not dbo_enabled()
        # Respect inplace outputs.
        output.copy_(combined_x, non_blocking=True)
        return None
# DeepEP kernels quantize dispatch inputs in 128 element chunks.
DEEPEP_QUANT_BLOCK_SIZE = 128
DEEPEP_QUANT_BLOCK_SHAPE = [DEEPEP_QUANT_BLOCK_SIZE, DEEPEP_QUANT_BLOCK_SIZE]


def dequant_fp8(
    expert_x_fp8: torch.Tensor, expert_x_scales: torch.Tensor
) -> torch.Tensor:
    """
    Multiply each DEEPEP_QUANT_BLOCK_SIZE-wide chunk of ``expert_x_fp8`` by
    its scale and return the result as float32 in the input's shape.

    ``expert_x_fp8`` must be contiguous; ``expert_x_scales`` is made
    contiguous here and must hold exactly one scale per chunk.
    """
    # TODO (varun) : Optimize leverage num_tokens_per_expert counts
    assert expert_x_fp8.is_contiguous()
    original_shape = expert_x_fp8.size()
    experts = original_shape[0]

    # One row per quantization chunk, with a matching (experts, chunks, 1)
    # scale column that broadcasts across the chunk.
    chunks = expert_x_fp8.to(torch.float32).view(
        experts, -1, DEEPEP_QUANT_BLOCK_SIZE
    )
    chunk_scales = expert_x_scales.contiguous().view(experts, -1, 1)

    scaled = chunks * chunk_scales
    return scaled.view(original_shape)
def post_init_setup(self, fused_experts: mk.FusedMoEPermuteExpertsUnpermute):
    """
    Decide, once the experts implementation is known, whether activation
    scales should be dispatched in packed ue8m0 form.

    ``self.use_ue8m0_dispatch`` is switched on only when the experts report
    support for packed ue8m0 activation scales and fp8 dispatch is enabled.
    If the experts support it but fp8 dispatch is off, a warning is logged
    and the flag is left untouched.
    """
    if not fused_experts.supports_packed_ue8m0_act_scales():
        # Nothing to configure for experts without packed-scale support.
        return

    if not self.use_fp8_dispatch:
        logger.warning_once(
            "DeepEPLLPrepareAndFinalize is setup to dispatch raw/unquantized "
            f"activations despite ({fused_experts.__class__.__name__}) being able "
            "to support quantized activations.",
            scope="local",
        )
        return

    logger.debug_once(
        "Update DeepEPLLPrepareFinalize to do packed ue8m0 scales dispatch."
    )
    self.use_ue8m0_dispatch = True
def prepare_async(
    self,
    a1: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    num_experts: int,
    expert_map: torch.Tensor | None,
    apply_router_weight_on_input: bool,
    quant_config: FusedMoEQuantConfig,
) -> tuple[Callable, mk.ReceiverType]:
    """
    Start a DeepEP low-latency dispatch of ``a1`` and return the pieces
    needed to complete it.

    Args:
        a1: Activations; ``a1.size(1)`` must be one of
            ``SUPPORTED_HIDDEN_SIZES``.
        topk_weights: Router weights. Only read when
            ``apply_router_weight_on_input`` is True.
        topk_ids: Selected expert ids; converted with ``.long()`` for the
            dispatch call.
        num_experts: Expert count forwarded to the dispatch call.
        expert_map: Not used by this implementation.
        apply_router_weight_on_input: Multiply ``a1`` by the router weights
            before dispatch (top-1 routing only).
        quant_config: Source of the activation scales and quantization
            settings used by the receiver.

    Returns:
        ``(hook, receiver)``: the receive hook returned by the dispatch
        call, and a zero-argument callable that runs ``self._receiver`` on
        the dispatched tensors.

    Raises:
        AssertionError: for an unsupported hidden size, per-token scales,
            or ``apply_router_weight_on_input`` with top-k other than 1.
    """
    hidden_size = a1.size(1)
    assert hidden_size in self.SUPPORTED_HIDDEN_SIZES, (
        # Trailing space added: the message used to read "...hidden sizes[2048, ...".
        f"Hidden Size {hidden_size} not in supported list of hidden sizes "
        f"{self.SUPPORTED_HIDDEN_SIZES}"
    )

    a2a_idx = dbo_current_ubatch_id()

    if self.use_fp8_dispatch:
        # Use the module constant rather than a second hard-coded 128.
        assert hidden_size % DEEPEP_QUANT_BLOCK_SIZE == 0, (
            "DeepEP kernels quantize the inputs in blocks of shape "
            f"{DEEPEP_QUANT_BLOCK_SIZE}"
        )

    # Same selection as before: inspect a1_scale when present, otherwise
    # fall back to a2_scale; no scale at all means no per-token scales.
    scale = (
        quant_config.a1_scale
        if quant_config.a1_scale is not None
        else quant_config.a2_scale
    )
    has_per_token_scales = scale is not None and scale.numel() != 1
    assert not has_per_token_scales, (
        "low_latency kernels doesn't support dispatching per-token scales"
    )

    if apply_router_weight_on_input:
        topk = topk_ids.size(1)
        # TODO: this only works for topK=1, will need to update for topK>1
        assert topk == 1, (
            "apply_router_weight_on_input is only implemented for topk=1"
        )
        # Out-of-place multiply, so the caller's tensor is left untouched.
        a1 = a1 * topk_weights.to(a1.dtype)

    # Dispatch
    expert_x, expert_num_tokens, handle, _, hook = self.buffer.low_latency_dispatch(
        a1,
        topk_ids.long(),
        self.max_tokens_per_rank,
        num_experts,
        use_fp8=self.use_fp8_dispatch,
        # NOTE: the ue8m0 kwargs are intentionally left disabled here, so
        # self.use_ue8m0_dispatch currently does not reach the kernel.
        # round_scale needs to be set to dispatch in ue8m0
        # round_scale=self.use_ue8m0_dispatch,
        # use_ue8m0=self.use_ue8m0_dispatch,
        async_finish=False,
        return_recv_hook=True,
    )
    # Keep the handle for the combine step of the same micro-batch.
    self.handles[a2a_idx] = handle

    return (
        hook,
        lambda: self._receiver(
            expert_x,
            expert_num_tokens,
            quant_config.a1_scale,
            a1.dtype,
            quant_config,
        ),
    )
def finalize_async(
    self,
    output: torch.Tensor,
    fused_expert_output: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    apply_router_weight_on_input: bool,
    weight_and_reduce_impl: mk.TopKWeightAndReduce,
) -> tuple[Callable, Callable]:
    """
    Asynchronous finalize: forward every argument to ``_finalize`` with
    ``do_async=True`` and return its result unchanged.
    """
    return self._finalize(
        output=output,
        fused_expert_output=fused_expert_output,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        apply_router_weight_on_input=apply_router_weight_on_input,
        weight_and_reduce_impl=weight_and_reduce_impl,
        do_async=True,
    )
def is_valid_flashinfer_cutlass_fused_moe(
    hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor
) -> bool:
    """
    Report whether the FlashInfer CUTLASS MoE kernel can be used for these
    tensors.

    Returns False (after a one-time debug log) when the kernel is not
    available, when either weight tensor is not ``torch.uint8``, or when
    ``hidden_states`` is not float32/float16/bfloat16.
    """
    if not has_flashinfer_cutlass_fused_moe():
        logger.debug_once(
            "FlashInferExperts disabled: flashinfer_cutlass_fused_moe not available."
        )
        return False

    # Data type checks
    weights_are_uint8 = w1.dtype == torch.uint8 and w2.dtype == torch.uint8
    activations_supported = hidden_states.dtype in [
        torch.float32,
        torch.float16,
        torch.bfloat16,
    ]
    if weights_are_uint8 and activations_supported:
        return True

    logger.debug_once(
        "FlashInferExperts disabled: w1/w2 must be torch.uint8 "
        f"(got w1={w1.dtype}, w2={w2.dtype}), hidden_states must be "
        f"float32, float16, or bfloat16 (got {hidden_states.dtype})."
    )
    return False
def workspace_shapes(
    self,
    M: int,
    N: int,
    K: int,
    topk: int,
    global_num_experts: int,
    local_num_experts: int,
    expert_tokens_meta: mk.ExpertTokensMetadata | None,
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
    """
    Return the shapes of the scratch and output buffers for ``apply``.

    Only ``M`` and ``K`` influence the result; the remaining parameters are
    accepted for interface compatibility.

    Returns a 3-tuple:
    - workspace13 shape: ``(M, K)``.
    - workspace2 shape: ``(0,)`` - no activation scratch is requested.
    - output shape: ``(M, K)``, or ``(M, 2 * K)`` when the quant dtype is
      "nvfp4" and ``use_dp`` is set.
    - Note: the first dimension of the workspace13 and output shapes is
      the token count ``M``.
    """
    # For TP, the quantization is fused with fused_moe call.
    # NOTE(review): the doubling presumably compensates for K describing
    # packed fp4 input on the DP path - confirm against the caller.
    doubled_width = self.quant_dtype == "nvfp4" and self.use_dp
    out_cols = K * 2 if doubled_width else K

    # The workspace is determined by `aq`, since it comes after any
    # potential communication op and is involved in the expert computation.
    return ((M, K), (0,), (M, out_cols))
+ quant_scales = [ + self.a1_gscale, + self.w1_scale.view(torch.int32), + self.g1_alphas, + self.a2_gscale, + self.w2_scale.view(torch.int32), + self.g2_alphas, + ] + # FlashInfer API requires weight to be long for nvfp4 + fc1_expert_weights = w1.view(torch.long) + fc2_expert_weights = w2.view(torch.long) + elif self.use_deepseek_fp8_block_scale: + # FP8 block-scale path: provide block-scale weights, omit a1q_scale + quant_scales = [ + self.w1_scale, + self.w2_scale, + ] + a1q_scale = None + fc1_expert_weights = w1 + fc2_expert_weights = w2 + else: + quant_scales = None + a1q_scale = None + fc1_expert_weights = w1 + fc2_expert_weights = w2 + + _ = flashinfer_cutlass_fused_moe( + input=hidden_states, + token_selected_experts=topk_ids.to(torch.int), + token_final_scales=topk_weights, + fc1_expert_weights=fc1_expert_weights, + fc2_expert_weights=fc2_expert_weights, + output_dtype=self.out_dtype, + quant_scales=quant_scales, + input_sf=a1q_scale, + tp_size=self.tp_size, + tp_rank=self.tp_rank, + ep_size=self.ep_size, + ep_rank=self.ep_rank, + output=output, + activation_type=activation_str_to_value_map[activation], + # Informs FlashInfer to use the block-scale decoding path when True + use_deepseek_fp8_block_scale=self.use_deepseek_fp8_block_scale, + ) + + +def flashinfer_cutlass_moe_fp4( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + quant_config: FusedMoEQuantConfig, + inplace: bool = False, + activation: str = "silu", + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, +) -> torch.Tensor: + fused_experts = mk.FusedMoEModularKernel( + create_flashinfer_prepare_finalize(use_dp=False), + FlashInferExperts( + out_dtype=hidden_states.dtype, + quant_config=quant_config, + use_dp=False, + ), + ) + + return fused_experts( + hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + 
def flashinfer_cutlass_moe(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    quant_config: FusedMoEQuantConfig,
    inplace: bool = False,
    activation: str = "silu",
    global_num_experts: int = -1,
    expert_map: torch.Tensor | None = None,
    apply_router_weight_on_input: bool = False,
    tp_rank: int = 0,
    tp_size: int = 1,
    ep_rank: int = 0,
    ep_size: int = 1,
    use_dp: bool = False,
) -> torch.Tensor:
    """
    Build a modular FlashInfer CUTLASS MoE kernel and run it once.

    Args:
        hidden_states: Input activations; their dtype is used as the
            experts' output dtype.
        w1, w2: Expert weights for the two GEMMs.
        topk_weights, topk_ids: Router weights and selected expert ids.
        quant_config: Quantization configuration given to the experts.
        inplace, activation, global_num_experts, expert_map,
            apply_router_weight_on_input: Forwarded unchanged to the
            modular kernel call.
        tp_rank, tp_size, ep_rank, ep_size: Parallel layout forwarded to
            ``FlashInferExperts``.
        use_dp: Data-parallel mode. It is forwarded to both the
            prepare/finalize object and the experts so the two halves of
            the kernel agree.

    Returns:
        The tensor returned by the modular kernel.
    """
    fused_experts = mk.FusedMoEModularKernel(
        create_flashinfer_prepare_finalize(use_dp=use_dp),
        FlashInferExperts(
            out_dtype=hidden_states.dtype,
            quant_config=quant_config,
            tp_rank=tp_rank,
            tp_size=tp_size,
            ep_rank=ep_rank,
            ep_size=ep_size,
            # Previously omitted, which left the experts at use_dp=False
            # even when prepare/finalize ran in DP mode;
            # FlashInferExperts.workspace_shapes reads this flag when
            # sizing the nvfp4 output.
            use_dp=use_dp,
        ),
    )

    return fused_experts(
        hidden_states=hidden_states,
        w1=w1,
        w2=w2,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        inplace=inplace,
        activation=activation,
        global_num_experts=global_num_experts,
        expert_map=expert_map,
        apply_router_weight_on_input=apply_router_weight_on_input,
    )
class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
    """
    Shared state and capability answers for the FlashInfer CUTLASS MoE
    prepare/finalize implementations.
    """

    def __init__(
        self,
        use_dp: bool,
        num_dispatchers: int = 1,
        use_deepseek_fp8_block_scale: bool = False,
    ):
        super().__init__()
        self.use_dp = use_dp
        self.num_dispatchers_ = num_dispatchers
        # Toggle for DeepSeek-style FP8 block-scale path where activations are
        # not quantized here and weight block scales are consumed by the kernel.
        self.use_deepseek_fp8_block_scale = use_deepseek_fp8_block_scale
        self.local_tokens = None

    @property
    def activation_format(self) -> mk.FusedMoEActivationFormat:
        """Activations use the Standard format."""
        return mk.FusedMoEActivationFormat.Standard

    def max_num_tokens_per_rank(self) -> int | None:
        """No per-rank token limit is reported."""
        return None

    def topk_indices_dtype(self) -> torch.dtype | None:
        """No particular dtype is requested for the top-k indices."""
        return None

    def num_dispatchers(self) -> int:
        """Return the dispatcher count given at construction."""
        return self.num_dispatchers_

    def output_is_reduced(self) -> bool:
        """The output is reported as not reduced."""
        return False

    def _apply_router_weight_on_input(
        self,
        a1: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
    ) -> None:
        """
        Scale ``a1`` in place by the router weights when requested.

        Only top-1 routing is supported; ``a1`` itself is modified.
        """
        if not apply_router_weight_on_input:
            return

        num_selected = topk_ids.size(1)
        assert num_selected == 1, (
            "apply_router_weight_on_input is only implemented for topk=1"
        )
        scale = topk_weights.to(a1.dtype)
        a1.mul_(scale)
def prepare(
    self,
    a1: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    num_experts: int,
    expert_map: torch.Tensor | None,
    apply_router_weight_on_input: bool,
    quant_config: FusedMoEQuantConfig,
) -> mk.PrepareResultType:
    """
    Prepare activations for the experts, dispatching them with FlashInfer
    AllToAll when data parallelism is enabled.

    Router weights are first applied to ``a1`` in place if requested. Then:
    - DP: ``flashinfer_alltoall_dispatch`` produces the routed ids,
      weights, activations and scales; its alltoall info is kept on
      ``self.alltoall_info`` for ``finalize``.
    - non-DP, block-scale path: activations pass through with no scale.
    - non-DP otherwise: activations are quantized locally.

    Returns ``(a1q, a1q_scale, None, topk_ids, topk_weights)``.
    """
    self._apply_router_weight_on_input(
        a1, topk_weights, topk_ids, apply_router_weight_on_input
    )

    if self.use_dp:
        # DP case: use FlashInfer AllToAll
        tokens_per_dp_rank = get_local_sizes()
        num_selected = topk_ids.size(1)
        dispatched = flashinfer_alltoall_dispatch(
            self.all2all_manager,
            tokens_per_dp_rank,
            a1,
            quant_config.a1_gscale,
            topk_ids,
            topk_weights,
            num_selected,
            num_experts,
            quant_config,
            use_deepseek_fp8_block_scale=self.use_deepseek_fp8_block_scale,
        )
        self.alltoall_info, topk_ids, topk_weights, a1q, a1q_scale = dispatched
    elif self.use_deepseek_fp8_block_scale:
        # Block-scale path: the activations are handed over unquantized.
        a1q, a1q_scale = a1, None
    else:
        # Non-DP case: quantize activations here. use_dp is falsy on this
        # branch, so the swizzle flag is always True.
        a1q, a1q_scale = moe_kernel_quantize_input(
            a1,
            quant_config.a1_gscale,
            quant_config.quant_dtype,
            quant_config.per_act_token_quant,
            quant_config.block_shape,
            is_fp4_scale_swizzled=True,
        )

    return a1q, a1q_scale, None, topk_ids, topk_weights
def prepare(
    self,
    a1: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    num_experts: int,
    expert_map: torch.Tensor | None,
    apply_router_weight_on_input: bool,
    quant_config: FusedMoEQuantConfig,
) -> mk.PrepareResultType:
    """
    Prepare activations for the experts, all-gathering them across the DP
    group when data parallelism is enabled.

    Router weights are first applied to ``a1`` in place if requested.
    Without DP and with nvfp4 quantization the inputs are returned as-is.
    Otherwise the activations are quantized (or passed through on the
    block-scale path) and, under DP, gathered together with the routing
    tensors; gathered nvfp4 scales are interleaved afterwards.

    Returns ``(a1q, a1q_scale, None, topk_ids, topk_weights)``.
    """
    self._apply_router_weight_on_input(
        a1, topk_weights, topk_ids, apply_router_weight_on_input
    )

    is_nvfp4 = quant_config.quant_dtype == "nvfp4"
    if not self.use_dp and is_nvfp4:
        return a1, None, None, topk_ids, topk_weights

    if self.use_deepseek_fp8_block_scale:
        # Block-scale path: pass activations through, omit per-token scales
        a1q, a1q_scale = a1, None
    else:
        a1q, a1q_scale = moe_kernel_quantize_input(
            a1,
            quant_config.a1_gscale,
            quant_config.quant_dtype,
            quant_config.per_act_token_quant,
            quant_config.block_shape,
            is_fp4_scale_swizzled=not self.use_dp,
        )

    if self.use_dp:
        # The scale tensor joins the gather only when it exists
        # (it is None on the block-scale path).
        gather_scale = a1q_scale is not None
        to_gather = [topk_weights, topk_ids, a1q]
        if gather_scale:
            to_gather.append(a1q_scale)

        gathered = get_dp_group().all_gatherv(
            to_gather,
            dim=0,
            sizes=get_local_sizes(),
        )

        if gather_scale:
            topk_weights, topk_ids, a1q, a1q_scale = gathered
        else:
            topk_weights, topk_ids, a1q = gathered

    # Only reachable with a scale under DP: the non-DP nvfp4 case returned
    # early above.
    if is_nvfp4 and a1q_scale is not None:
        a1q_scale = nvfp4_block_scale_interleave(a1q_scale)

    return a1q, a1q_scale, None, topk_ids, topk_weights
def flashinfer_alltoall_dispatch(
    all2all_manager: All2AllManagerBase,
    global_num_tokens_cpu: list[int],
    x: torch.Tensor,
    gs: torch.Tensor,
    topk_ids: torch.Tensor,
    topk_weights: torch.Tensor,
    top_k: int,
    num_experts: int,
    quant_config: FusedMoEQuantConfig,
    use_deepseek_fp8_block_scale: bool = False,
):
    """
    Route tokens to their expert-parallel ranks with FlashInfer's MNNVL
    all-to-all.

    Args:
        all2all_manager: Provides rank, world size and workspace tensors.
        global_num_tokens_cpu: Token counts per rank; the maximum is passed
            to the prepare call. When None, ``x.shape[0]`` is used.
        x: Activations to dispatch.
        gs: Scale passed to ``moe_kernel_quantize_input``.
        topk_ids, topk_weights: Routing tensors; both are replaced by the
            values returned from the prepare call.
        top_k: Number of experts selected per token.
        num_experts: Expert count (passed twice to the prepare call).
        quant_config: Quantization settings for the activations.
        use_deepseek_fp8_block_scale: When True, activations are dispatched
            unquantized and no scale is returned.

    Returns:
        ``(alltoall_info, topk_ids, topk_weights, x, x_sf)``.

    Raises:
        AssertionError: if the all-to-all workspace is not available.
    """
    from flashinfer.comm.trtllm_alltoall import MnnvlMoe

    # Call outside the assert: `python -O` strips assert statements, which
    # would otherwise drop this call as well. NOTE(review): the "ensure_"
    # name suggests the call initializes the workspace - confirm.
    workspace_ready = all2all_manager.ensure_alltoall_workspace_initialized()
    assert workspace_ready, "FlashInfer AllToAll workspace not available"

    ep_rank = all2all_manager.rank
    ep_size = all2all_manager.world_size
    max_num_token = (
        max(global_num_tokens_cpu) if global_num_tokens_cpu is not None else x.shape[0]
    )
    orig_topk_weights_dtype = topk_weights.dtype
    alltoall_info, topk_ids, topk_weights, _ = (
        MnnvlMoe.mnnvl_moe_alltoallv_prepare_without_allgather(
            topk_ids,
            topk_weights,
            None,
            all2all_manager.prepare_workspace_tensor,
            max_num_token,
            ep_rank,
            ep_size,
            num_experts,
            num_experts,
            top_k,
        )
    )
    # Reinterpret the returned weights with the dtype the caller passed in.
    topk_weights = topk_weights.view(dtype=orig_topk_weights_dtype)

    if use_deepseek_fp8_block_scale:
        # Block-scale path: pass activations through without quantization
        x_sf = None
    else:
        x, x_sf = moe_kernel_quantize_input(
            x,
            gs,
            quant_config.quant_dtype,
            quant_config.per_act_token_quant,
            quant_config.block_shape,
            is_fp4_scale_swizzled=False,  # delay swizzle to after comm
        )

    # The activations are exchanged on both paths.
    x = MnnvlMoe.mnnvl_moe_alltoallv(
        x,
        alltoall_info,
        all2all_manager.workspace_tensor,
        ep_rank,
        ep_size,
    )

    # Exchange scales only when quantization produced some; a None scale
    # used to be handed to the all-to-all on the unquantized path.
    if x_sf is not None:
        x_sf = MnnvlMoe.mnnvl_moe_alltoallv(
            x_sf,
            alltoall_info,
            all2all_manager.workspace_tensor,
            ep_rank,
            ep_size,
        )
        if quant_config.quant_dtype == "nvfp4":
            x_sf = nvfp4_block_scale_interleave(x_sf)

    return alltoall_info, topk_ids, topk_weights, x, x_sf


def flashinfer_alltoall_combine(
    all2all_manager: All2AllManagerBase,
    output: torch.Tensor,
    top_k: int,
    token_count: int,
    alltoall_info,
):
    """
    Combine expert outputs back onto their source ranks with FlashInfer's
    MNNVL all-to-all.

    Args:
        all2all_manager: Provides rank, world size and the workspace tensor.
        output: Expert output to combine.
        top_k: Number of experts selected per token.
        token_count: Token count forwarded to the combine call.
        alltoall_info: Info object returned by the matching dispatch.

    Returns:
        The result of ``MnnvlMoe.mnnvl_moe_alltoallv_combine``.

    Raises:
        AssertionError: if the all-to-all workspace is not available.
    """
    from flashinfer.comm.trtllm_alltoall import MnnvlMoe

    # Same reasoning as in dispatch: keep the call alive under `python -O`.
    workspace_ready = all2all_manager.ensure_alltoall_workspace_initialized()
    assert workspace_ready, "FlashInfer AllToAll workspace not available"

    return MnnvlMoe.mnnvl_moe_alltoallv_combine(
        output,
        alltoall_info,
        all2all_manager.workspace_tensor,
        ep_rank=all2all_manager.rank,
        ep_size=all2all_manager.world_size,
        top_k=top_k,
        token_count=token_count,
    )
vllm.model_executor.layers.fused_moe.config import RoutingMethodType +from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input +from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( + calculate_tile_tokens_dim, +) +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + per_token_group_quant_fp8, +) +from vllm.utils.torch_utils import direct_register_custom_op + + +def flashinfer_fused_moe_blockscale_fp8( + routing_logits: torch.Tensor, + routing_bias: torch.Tensor, + x: torch.Tensor, + w13_weight: torch.Tensor, + w13_weight_scale_inv: torch.Tensor, + w2_weight: torch.Tensor, + w2_weight_scale_inv: torch.Tensor, + global_num_experts: int, + top_k: int, + num_expert_group: int | None, + topk_group: int | None, + intermediate_size: int, + expert_offset: int, + local_num_experts: int, + block_shape: list[int], + routing_method_type: int = RoutingMethodType.DeepSeekV3, + routed_scaling: float | None = 1.0, +) -> torch.Tensor: + from vllm.utils.flashinfer import flashinfer_trtllm_fp8_block_scale_moe + + topk_group = topk_group if topk_group is not None else 0 + assert top_k <= global_num_experts + assert top_k <= 10 + assert global_num_experts % 4 == 0 + assert block_shape == [128, 128] + # Routing kernel expects #experts <= #threads 512 + assert global_num_experts <= 512 + + a_q, a_sf = per_token_group_quant_fp8(x, block_shape[1]) + # NOTE: scales of hidden states have to be transposed! 
+ a_sf_t = a_sf.t().contiguous() + return flashinfer_trtllm_fp8_block_scale_moe( + routing_logits=routing_logits, + routing_bias=routing_bias, + hidden_states=a_q, + hidden_states_scale=a_sf_t, + gemm1_weights=w13_weight, + gemm1_weights_scale=w13_weight_scale_inv, + gemm2_weights=w2_weight, + gemm2_weights_scale=w2_weight_scale_inv, + num_experts=global_num_experts, + top_k=top_k, + n_group=num_expert_group, + topk_group=topk_group, + intermediate_size=intermediate_size, + local_expert_offset=expert_offset, + local_num_experts=local_num_experts, + routed_scaling_factor=routed_scaling, + tile_tokens_dim=None, + routing_method_type=routing_method_type, + use_shuffled_weight=False, + ) + + +def flashinfer_fused_moe_blockscale_fp8_fake( + routing_logits: torch.Tensor, + routing_bias: torch.Tensor, + x: torch.Tensor, + w13_weight: torch.Tensor, + w13_weight_scale_inv: torch.Tensor, + w2_weight: torch.Tensor, + w2_weight_scale_inv: torch.Tensor, + global_num_experts: int, + top_k: int, + num_expert_group: int, + topk_group: int, + intermediate_size: int, + expert_offset: int, + local_num_experts: int, + block_shape: list[int], + routing_method_type: int, + routed_scaling: float = 1.0, +) -> torch.Tensor: + return torch.empty_like(x) + + +# TODO(bnell): Does this really need to be a torch.op? 
+direct_register_custom_op( + op_name="flashinfer_fused_moe_blockscale_fp8", + op_func=flashinfer_fused_moe_blockscale_fp8, + fake_impl=flashinfer_fused_moe_blockscale_fp8_fake, + tags=(torch.Tag.needs_fixed_stride_order,), +) + + +def flashinfer_fused_moe_per_tensor_scale_fp8( + routing_logits: torch.Tensor, + routing_bias: torch.Tensor | None, + hidden_states: torch.Tensor, + input_scale: torch.Tensor, + gemm1_weights: torch.Tensor, + gemm2_weights: torch.Tensor, + output1_scales_scalar: torch.Tensor, + output1_scales_gate_scalar: torch.Tensor, + output2_scales_scalar: torch.Tensor, + num_experts: int, + top_k: int, + num_expert_group: int | None, + topk_group: int | None, + intermediate_size: int, + local_expert_offset: int, + local_num_experts: int, + use_routing_scales_on_input: bool, + routing_method_type: int, + routed_scaling_factor: float = 1.0, +) -> torch.Tensor: + num_expert_group = num_expert_group if num_expert_group is not None else 0 + topk_group = topk_group if topk_group is not None else 0 + + quant_hidden_states, _ = moe_kernel_quantize_input( + hidden_states, + input_scale, + quant_dtype=torch.float8_e4m3fn, + per_act_token_quant=False, + ) + + from vllm.utils.flashinfer import flashinfer_trtllm_fp8_per_tensor_scale_moe + + return flashinfer_trtllm_fp8_per_tensor_scale_moe( + routing_logits=routing_logits, + routing_bias=routing_bias, + hidden_states=quant_hidden_states, + gemm1_weights=gemm1_weights, + output1_scales_scalar=output1_scales_scalar, + output1_scales_gate_scalar=output1_scales_gate_scalar, + gemm2_weights=gemm2_weights, + output2_scales_scalar=output2_scales_scalar, + num_experts=num_experts, + top_k=top_k, + n_group=num_expert_group, + topk_group=topk_group, + intermediate_size=intermediate_size, + local_expert_offset=local_expert_offset, + local_num_experts=local_num_experts, + routed_scaling_factor=routed_scaling_factor, + use_routing_scales_on_input=use_routing_scales_on_input, + tile_tokens_dim=calculate_tile_tokens_dim( + 
hidden_states.shape[0], top_k, num_experts + ), + routing_method_type=routing_method_type, + ) + + +def flashinfer_fused_moe_per_tensor_scale_fp8_fake( + routing_logits: torch.Tensor, + routing_bias: torch.Tensor | None, + hidden_states: torch.Tensor, + input_scale: torch.Tensor, + gemm1_weights: torch.Tensor, + gemm2_weights: torch.Tensor, + output1_scales_scalar: torch.Tensor, + output1_scales_gate_scalar: torch.Tensor, + output2_scales_scalar: torch.Tensor, + num_experts: int, + top_k: int, + num_expert_group: int | None, + topk_group: int | None, + intermediate_size: int, + local_expert_offset: int, + local_num_experts: int, + use_routing_scales_on_input: bool, + routing_method_type: int, + routed_scaling_factor: float = 1.0, +) -> torch.Tensor: + return torch.empty_like(hidden_states) + + +# TODO(bnell): Does this really need to be a torch.op? +direct_register_custom_op( + op_name="flashinfer_fused_moe_per_tensor_scale_fp8", + op_func=flashinfer_fused_moe_per_tensor_scale_fp8, + mutates_args=["hidden_states"], + fake_impl=flashinfer_fused_moe_per_tensor_scale_fp8_fake, + tags=(torch.Tag.needs_fixed_stride_order,), +) diff --git a/model_executor/layers/fused_moe/fused_batched_moe.py b/model_executor/layers/fused_moe/fused_batched_moe.py new file mode 100644 index 0000000..7fd8511 --- /dev/null +++ b/model_executor/layers/fused_moe/fused_batched_moe.py @@ -0,0 +1,1012 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Fused batched MoE kernel.""" + +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig +from vllm.model_executor.layers.fused_moe.fused_moe import try_get_optimal_moe_config +from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( + TopKWeightAndReduceDelegate, + TopKWeightAndReduceNaiveBatched, +) +from vllm.model_executor.layers.fused_moe.utils import ( + 
_resize_cache, + moe_kernel_quantize_input, + normalize_batched_scales_shape, + normalize_scales_shape, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import group_broadcast +from vllm.triton_utils import tl, triton + + +@triton.jit +def moe_mmk( + a_ptrs, + b_ptrs, + K, + expert_id, + a_scale_ptr, + b_scale_ptr, + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). + stride_ak: tl.int64, + stride_bk: tl.int64, + stride_ase: tl.int64, + stride_asm: tl.int64, + stride_ask: tl.int64, + stride_bse: tl.int64, + stride_bsk: tl.int64, + stride_bsn: tl.int64, + # Offsets and masks + offs_m, + offs_n, + offs_bn, + mask_m, + # Block size for block-wise quantization + group_n: tl.constexpr, + group_k: tl.constexpr, + # Meta-parameters + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + compute_type: tl.constexpr, + use_w8a8: tl.constexpr, + use_w8a16: tl.constexpr, + per_act_token_quant: tl.constexpr, +): + offs_k = tl.arange(0, BLOCK_K) + + if use_w8a16: + b_scale_ptrs = ( + b_scale_ptr + expert_id * stride_bse + offs_n[None, :] * stride_bsn + ) + b_scale = tl.load(b_scale_ptrs) + + if use_w8a8: + # block-wise + if group_k > 0 and group_n > 0: + a_scale_ptrs = a_scale_ptr + offs_m * stride_asm + offs_bsn = offs_bn // group_n + b_scale_ptrs = b_scale_ptr + offs_bsn * stride_bsn + + # per act token + elif per_act_token_quant: + # Load per-token scale for activations + a_scale_ptrs = a_scale_ptr + offs_m * stride_asm + a_scale = tl.load(a_scale_ptrs, mask=mask_m, other=0.0)[:, None] + + b_scale_ptrs = b_scale_ptr + offs_bn[None, :] * stride_bsn + b_scale = tl.load(b_scale_ptrs) + + # tensor-wise + else: + a_scale = tl.load(a_scale_ptr) + b_scale = tl.load(b_scale_ptr) + + # ----------------------------------------------------------- + # Iterate to compute a 
block of the C matrix. + # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block + # of fp32 values for higher accuracy. + # `accumulator` will be converted back to fp16 after the loop. + accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_K)): + # Load the next block of A and B, generate a mask by checking the + # K dimension. + a = tl.load( + a_ptrs, + mask=mask_m[:, None] & (offs_k[None, :] < K - k * BLOCK_K), + other=0.0, + ) + b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_K, other=0.0) + # We accumulate along the K dimension. + if use_w8a16: + accumulator = tl.dot(a, b.to(compute_type), acc=accumulator) + elif use_w8a8: + if group_k > 0 and group_n > 0: + k_start = k * BLOCK_K + offs_ks = k_start // group_k + a_scale = tl.load( + a_scale_ptrs + offs_ks * stride_ask, mask=mask_m, other=0.0 + ) + b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk) + + accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :] + else: + # acc used to enable fp8_fast_accum + accumulator = tl.dot(a, b, acc=accumulator) + else: + accumulator += tl.dot(a, b) + + # Advance the ptrs to the next K block. 
+ a_ptrs += BLOCK_K * stride_ak + b_ptrs += BLOCK_K * stride_bk + + if use_w8a16: + accumulator = (accumulator * b_scale).to(compute_type) + elif use_w8a8: + if group_k > 0 and group_n > 0: + accumulator = accumulator.to(compute_type) + else: + accumulator = (accumulator * a_scale * b_scale).to(compute_type) + else: + accumulator = accumulator.to(compute_type) + + return accumulator + + +@triton.jit +def expert_triton_kernel( + a_ptr, # [max_tokens, K] + b_ptr, # [K, N] + c_ptr, # [max_tokens, N] + expert_id, + compute_type: tl.constexpr, + # Dimensions + M, + N, + K, + # Quantization data + a_scale_ptr, + b_scale_ptr, + b_zp_ptr, + # strides + stride_am: tl.int64, + stride_ak: tl.int64, + stride_bk: tl.int64, + stride_bn: tl.int64, + stride_cm: tl.int64, + stride_cn: tl.int64, + stride_ase: tl.int64, + stride_asm: tl.int64, + stride_ask: tl.int64, + stride_bse: tl.int64, + stride_bsk: tl.int64, + stride_bsn: tl.int64, + # offsets + offs_bn, + # Blockwise quantization data + group_n, + group_k, + # Quantization schemes + use_fp8_w8a8: tl.constexpr, + use_int8_w8a16: tl.constexpr, + per_act_token_quant: tl.constexpr, + # Kernel config + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, +): + offs_m = tl.arange(0, BLOCK_M) + offs_n = tl.arange(0, BLOCK_N) % N + offs_k = tl.arange(0, BLOCK_K) + mask_m = offs_m < M + + # Make grids of a + b pointers + a_ptrs = a_ptr + offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak + b_ptrs = b_ptr + offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn + + accumulator = moe_mmk( + a_ptrs, + b_ptrs, + K, + expert_id, + a_scale_ptr, + b_scale_ptr, + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). 
+ stride_ak, + stride_bk, + stride_ase, + stride_asm, + stride_ask, + stride_bse, + stride_bsk, + stride_bsn, + # Offsets and masks + offs_m, + offs_n, + offs_bn, + mask_m, + # Block size for block-wise quantization + group_n, + group_k, + # Meta-parameters + BLOCK_M, + BLOCK_N, + BLOCK_K, + compute_type, + use_fp8_w8a8, + use_int8_w8a16, + per_act_token_quant, + ) + + # store in C + offs_cn = tl.arange(0, BLOCK_N) + c_ptrs = c_ptr + offs_m[:, None] * stride_cm + offs_cn[None, :] * stride_cn + c_mask = mask_m[:, None] & (offs_cn[None, :] < N) + tl.store(c_ptrs, accumulator, mask=c_mask) + + +@triton.jit +def batched_triton_kernel( + a_ptr, # [E, max_num_tokens, K] + b_ptr, # [E, K, N] + c_ptr, # [E, max_num_tokens, N] + expert_num_tokens, # [E] + compute_type: tl.constexpr, + # Dimensions + max_num_tokens, + K, + N, + # Quantization data + a_scale_ptr, + b_scale_ptr, + b_zp_ptr, + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). 
+ stride_ae: tl.int64, + stride_am: tl.int64, + stride_ak: tl.int64, + stride_be: tl.int64, + stride_bk: tl.int64, + stride_bn: tl.int64, + stride_ce: tl.int64, + stride_cm: tl.int64, + stride_cn: tl.int64, + stride_ase: tl.int64, + stride_asm: tl.int64, + stride_ask: tl.int64, + stride_bse: tl.int64, + stride_bsk: tl.int64, + stride_bsn: tl.int64, + # Blockwise quantization data + group_n: tl.constexpr, + group_k: tl.constexpr, + # Quantization schemes + use_fp8_w8a8: tl.constexpr, + use_int8_w8a16: tl.constexpr, + per_act_token_quant: tl.constexpr, + # Kernel config + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, +): + expert_id = tl.program_id(axis=0) + e_num_tokens = tl.load(expert_num_tokens + expert_id) + if e_num_tokens == 0: + # Early exit + return + + # axis 1 is M_blocks * N_blocks + pid_mn = tl.program_id(axis=1) + # num_pid_m = tl.cdiv(max_num_tokens, BLOCK_M) + num_pid_n = tl.cdiv(N, BLOCK_N) + pid_m = pid_mn // num_pid_n + pid_n = pid_mn % num_pid_n + + cta_m_start = pid_m * BLOCK_M + cta_n_start = pid_n * BLOCK_N + if cta_m_start >= e_num_tokens: + # Early exit + return + + cta_m_size = min(BLOCK_M, e_num_tokens - cta_m_start) + cta_n_size = min(BLOCK_N, N - cta_n_start) + + a_ptr = a_ptr + expert_id * stride_ae + cta_m_start * stride_am + b_ptr = b_ptr + expert_id * stride_be + cta_n_start * stride_bn + c_ptr = ( + c_ptr + + expert_id * stride_ce + + cta_m_start * stride_cm + + cta_n_start * stride_cn + ) + + offs_bn = (pid_n * BLOCK_N + tl.arange(0, BLOCK_N).to(tl.int64)) % N + + if use_fp8_w8a8: + a_scale_ptr = a_scale_ptr + expert_id * stride_ase + b_scale_ptr = b_scale_ptr + expert_id * stride_bse + + # block-wise + if group_k > 0 and group_n > 0 or per_act_token_quant: + a_scale_ptr = a_scale_ptr + cta_m_start * stride_asm + + expert_triton_kernel( + a_ptr, + b_ptr, + c_ptr, + expert_id, + compute_type, + cta_m_size, # M + cta_n_size, # N + K, # K + a_scale_ptr, + b_scale_ptr, + b_zp_ptr, + # Strides + stride_am, + 
stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_ase, + stride_asm, + stride_ask, + stride_bse, + stride_bsk, + stride_bsn, + # offsets + offs_bn, + # Blockwise quantization data + group_n, + group_k, + # Quantization schemes + use_fp8_w8a8, + use_int8_w8a16, + per_act_token_quant, + # Kernel config + BLOCK_M, + BLOCK_N, + BLOCK_K, + ) + + +def invoke_moe_batched_triton_kernel( + A: torch.Tensor, # [E, max_tokens, K] + B: torch.Tensor, # [E, N, K] + C: torch.Tensor, # [E, max_tokens, N] + expert_num_tokens: torch.Tensor, # [E] + compute_type: tl.dtype, + # Quantization data + A_scale: torch.Tensor | None, + B_scale: torch.Tensor | None, + B_zp: torch.Tensor, + # Quantization schemes + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + use_int4_w4a16: bool, + config: dict[str, int], + per_act_token_quant: bool, + block_shape: list[int] | None = None, +): + assert not use_int4_w4a16 + max_num_tokens = A.size(1) + K = A.size(2) + N = C.size(2) + + BLOCK_M = config["BLOCK_SIZE_M"] + BLOCK_N = config["BLOCK_SIZE_N"] + BLOCK_K = config["BLOCK_SIZE_K"] + + grid = ( + expert_num_tokens.size(0), + triton.cdiv(max_num_tokens, BLOCK_M) * triton.cdiv(B.size(1), BLOCK_N), + ) + + A_scale = normalize_batched_scales_shape(A_scale, expert_num_tokens.shape[0]) + + if B_scale is not None and B_scale.ndim == 1: + assert B_scale.numel() == expert_num_tokens.shape[0] + B_scale = B_scale.view(-1, 1, 1) + + assert A_scale is None or A_scale.ndim == 3, ( + f"{0 if A_scale is None else A_scale.shape}" + ) + assert B_scale is None or B_scale.ndim == 1 or B_scale.ndim == 3, ( + f"{0 if B_scale is None else B_scale.shape}" + ) + + if B_scale is not None: + if B_scale.ndim == 1: + stride_bse = 1 + stride_bsk = 0 + stride_bsn = 0 + else: + stride_bse = B_scale.stride(0) + stride_bsk = B_scale.stride(2) + stride_bsn = B_scale.stride(1) + + else: + stride_bse = 0 + stride_bsk = 0 + stride_bsn = 0 + + if A_scale is not None: + stride_ase = A_scale.stride(0) + stride_asm = 
A_scale.stride(1) + stride_ask = A_scale.stride(2) + else: + stride_ase = 0 + stride_asm = 0 + stride_ask = 0 + + batched_triton_kernel[grid]( + A, + B, + C, + expert_num_tokens, + compute_type, + # Dimensions + max_num_tokens, + K, + N, + # Quantization data + A_scale, + B_scale, + B_zp, + # Strides + A.stride(0), + A.stride(1), + A.stride(2), + B.stride(0), + B.stride(2), + B.stride(1), + C.stride(0), + C.stride(1), + C.stride(2), + stride_ase, + stride_asm, + stride_ask, + stride_bse, + stride_bsk, + stride_bsn, + # Blockwise quantization data + 0 if block_shape is None else block_shape[0], + 0 if block_shape is None else block_shape[1], + # Quantization schemes + use_fp8_w8a8, + use_int8_w8a16, + per_act_token_quant, + # Kernel config + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + BLOCK_K=BLOCK_K, + ) + + +class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): + """ + A reference prepare/finalize class that reorganizes the tokens into + expert batched format, i.e. E x max_num_tokens x K. This is the format + that the PPLX dispatch/combine kernels use. 
+ """ + + def __init__( + self, + max_num_tokens: int, + num_local_experts: int, + num_dispatchers: int, + rank: int, + ): + super().__init__() + self.max_num_tokens = max_num_tokens + self.num_local_experts = num_local_experts + self.rank = rank + self.num_dispatchers_ = num_dispatchers + + @property + def activation_format(self) -> mk.FusedMoEActivationFormat: + return mk.FusedMoEActivationFormat.BatchedExperts + + def max_num_tokens_per_rank(self) -> int | None: + return self.max_num_tokens + + def topk_indices_dtype(self) -> torch.dtype | None: + return None + + def num_dispatchers(self) -> int: + return self.num_dispatchers_ + + def output_is_reduced(self) -> bool: + return False + + def prepare( + self, + a1: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: torch.Tensor | None, + apply_router_weight_on_input: bool, + quant_config: FusedMoEQuantConfig, + ) -> mk.PrepareResultType: + assert a1.dim() == 2 + assert topk_ids.dim() == 2 + assert topk_ids.size(0) == a1.size(0) + + if apply_router_weight_on_input: + topk = topk_ids.size(1) + # TODO: this only works for topK=1, will need to update for topK>1 + assert topk == 1, ( + "apply_router_weight_on_input is only implemented for topk=1" + ) + a1.mul_(topk_weights.to(a1.dtype)) + + num_tokens, hidden_dim = a1.size() + topk = topk_ids.size(1) + + tokens_per_expert = torch.zeros(num_experts, dtype=torch.int, device=a1.device) + + num_local_experts = self.num_local_experts + + if quant_config.quant_dtype is None: + b_type = a1.dtype + else: + b_type = quant_config.quant_dtype + + b_a1 = torch.zeros( + (num_local_experts, self.max_num_tokens, hidden_dim), + dtype=b_type, + device=a1.device, + ) + + if quant_config.is_quantized: + scale_shape = quant_config.batched_scale_shape( + num_local_experts, self.max_num_tokens, hidden_dim + ) + + b_a1_scale = torch.empty(scale_shape, dtype=torch.float32, device=a1.device) + else: + assert quant_config.a1_scale is None + 
b_a1_scale = None + + first_expert = num_local_experts * self.rank + last_expert = first_expert + num_local_experts + + a1_scale = normalize_scales_shape(quant_config.a1_scale) + + for expert_id in range(first_expert, last_expert): + topks = torch.any(topk_ids == expert_id, dim=1).flatten() + rows = torch.count_nonzero(topks.flatten()) + if rows == 0: + continue + idx = expert_id - first_expert + tokens_per_expert[idx] = rows + rhs = a1[: topks.numel()][topks] + if quant_config.quant_dtype is not None: + if a1_scale is not None: + if quant_config.is_per_act_token: + rhs_a1_scale = a1_scale[: topks.numel()][topks] + else: + rhs_a1_scale = a1_scale + else: + rhs_a1_scale = None + b_a1[idx, :rows, :], b_s = moe_kernel_quantize_input( + rhs, + rhs_a1_scale, + quant_config.quant_dtype, + quant_config.per_act_token_quant, + quant_config.block_shape, + ) + assert b_s is not None + if quant_config.is_per_act_token: + b_a1_scale[idx, :rows] = b_s[:rows] + else: + b_a1_scale[idx, : b_s.shape[0]] = b_s + else: + b_a1[idx, :rows, :] = rhs + + assert b_a1_scale is None or b_a1_scale.ndim == 3 + + expert_tokens_meta = mk.ExpertTokensMetadata( + expert_num_tokens=tokens_per_expert, expert_num_tokens_cpu=None + ) + + return b_a1, b_a1_scale, expert_tokens_meta, None, None + + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> None: + if isinstance(weight_and_reduce_impl, TopKWeightAndReduceDelegate): + weight_and_reduce_impl = TopKWeightAndReduceNaiveBatched(self.rank) + weight_and_reduce_impl.apply( + output=output, + fused_expert_output=fused_expert_output, + topk_weights=topk_weights, + topk_ids=topk_ids, + apply_router_weight_on_input=apply_router_weight_on_input, + ) + + +class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute): + """ + A reference MoE expert class that operates on expert 
batched format, + i.e. E x max_num_tokens x K. This is the format that the pplx + dispatch/combine kernels use. + """ + + def __init__( + self, + max_num_tokens: int, + num_dispatchers: int, + quant_config: FusedMoEQuantConfig, + ): + super().__init__(quant_config) + assert not self.quant_config.use_int8_w8a8, "NYI" + assert not self.quant_config.use_int8_w8a16, "NYI" + assert not self.quant_config.use_int4_w4a16, "NYI" + assert self.quant_config.ocp_mx_scheme is None, "NYI" + self.max_num_tokens = max_num_tokens + self.num_dispatchers = num_dispatchers + + @property + def activation_formats( + self, + ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: + return ( + mk.FusedMoEActivationFormat.BatchedExperts, + mk.FusedMoEActivationFormat.BatchedExperts, + ) + + def supports_chunking(self) -> bool: + return False + + def supports_expert_map(self) -> bool: + return False + + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + # Let PrepareAndFinalize::finalize() decide the impl. 
+ return TopKWeightAndReduceDelegate() + + def workspace_shapes( + self, + M: int, + N: int, + K: int, + topk: int, + global_num_experts: int, + local_num_experts: int, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: + num_dp = self.num_dispatchers + num_experts = local_num_experts + workspace13 = (num_experts, self.max_num_tokens * num_dp, K) + workspace2 = (self.max_num_tokens * num_dp, N) + output = workspace13 + return (workspace13, workspace2, output) + + def dequant(self, t: torch.Tensor, scale: torch.Tensor) -> torch.Tensor: + assert self.quant_config.is_quantized + f32 = torch.float32 + if self.quant_config.is_per_act_token or self.quant_config.is_per_tensor: + return t.to(f32) * scale + else: + return t.to(f32) * group_broadcast(scale, t.shape) + + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: torch.Tensor | None, + a1q_scale: torch.Tensor | None, + a2_scale: torch.Tensor | None, + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + apply_router_weight_on_input: bool, + ): + assert hidden_states.dim() == 3 + assert expert_tokens_meta is not None + expert_num_tokens = expert_tokens_meta.expert_num_tokens + + num_local_experts = w1.size(0) + assert num_local_experts == w1.size(0), f"{num_local_experts} == {w1.size(0)}" + + N = w1.size(1) // 2 + + for expert in range(num_local_experts): + # Indexing expert_num_tokens doesn't work w/cudagraphs or inductor + if ( + torch.compiler.is_compiling() + or torch.cuda.is_current_stream_capturing() + ): + num = hidden_states.shape[1] + else: + num = int(expert_num_tokens[expert].item()) + + if num == 0: + continue + + tmp = _resize_cache(workspace2, (num, N)) + + if self.quant_config.is_quantized: + assert 
a1q_scale is not None and self.w1_scale is not None + input = self.dequant(hidden_states[expert, :, :], a1q_scale[expert]) + w1_dq = self.dequant(w1[expert], self.w1_scale[expert]) + input = input[:num] @ w1_dq.transpose(0, 1) + else: + input = hidden_states[expert, :num, :] @ w1[expert].transpose(0, 1) + + self.activation(activation, tmp, input.to(tmp.dtype)) + + if self.quant_config.is_quantized: + assert self.w2_scale is not None + w2_dq = self.dequant(w2[expert], self.w2_scale[expert]) + else: + w2_dq = w2[expert] + + output[expert, :num, :] = tmp @ w2_dq.transpose(0, 1).to(tmp.dtype) + + +def batched_moe_kernel_quantize_input( + A: torch.Tensor, + A_scale: torch.Tensor | None, + num_tokens: int, + E: int, + N: int, + expert_num_tokens: torch.Tensor, + qtype: torch.dtype | None, + per_act_token_quant: bool, + block_shape: list[int] | None = None, +) -> tuple[torch.Tensor, torch.Tensor | None]: + if torch.compiler.is_compiling() or torch.cuda.is_current_stream_capturing(): + # Note: this does a bunch of extra work because expert_num_tokens is + # ignored but it does support torch.compile + cudagraphs. 
+ hidden_dim = A.size(-1) + assert A_scale is None or A_scale.ndim <= 2, ( + f"{A_scale.shape if A_scale is not None else None}" + ) + A_q, A_q_scale = moe_kernel_quantize_input( + A.view(-1, hidden_dim), A_scale, qtype, per_act_token_quant, block_shape + ) + A_q = A_q.view(E, -1, hidden_dim) + A_q_scale = normalize_batched_scales_shape(A_q_scale, E) + + return A_q, A_q_scale + elif qtype is None: + return A, normalize_batched_scales_shape(A_scale, E) + else: + A_q = torch.empty_like(A, dtype=qtype) + + if per_act_token_quant: + assert block_shape is None + scale_shape = (E, num_tokens, 1) + elif block_shape is not None: + _, block_k = block_shape + k_tiles = (A.shape[-1] + block_k - 1) // block_k + scale_shape = (E, num_tokens, k_tiles) + else: + scale_shape = (E, 1, 1) + + A_q_scale = torch.zeros(scale_shape, dtype=torch.float32, device=A.device) + + num_experts = expert_num_tokens.numel() + + A_scale = normalize_batched_scales_shape(A_scale, num_experts) + + for e in range(E): + num_tokens = int(expert_num_tokens[e].item()) + if num_tokens > 0: + if A_scale is not None: + scales = A_scale[e, : min(num_tokens, A_scale.shape[1])] + else: + scales = None + A_q[e, :num_tokens], tmp_scale = moe_kernel_quantize_input( + A[e, :num_tokens], + scales, + qtype, + per_act_token_quant, + block_shape, + ) + assert tmp_scale is not None + A_q_scale[e, : tmp_scale.shape[0]] = tmp_scale + + return A_q, A_q_scale + + +class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): + """ + A Triton based MoE expert class that operates on expert batched format, + i.e. E x max_num_tokens x K. This is the format that the pplx + dispatch/combine kernels use. 
+ """ + + def __init__( + self, + max_num_tokens: int, + num_dispatchers: int, + quant_config: FusedMoEQuantConfig, + ): + super().__init__(quant_config) + assert not self.quant_config.use_int8_w8a8, "NYI" + assert not self.quant_config.use_int8_w8a16, "NYI" + assert not self.quant_config.use_int4_w4a16, "NYI" + assert self.quant_config.ocp_mx_scheme is None, "NYI" + assert max_num_tokens > 0 + assert num_dispatchers > 0 + self.max_num_tokens = max_num_tokens + self.num_dispatchers = num_dispatchers + + @property + def activation_formats( + self, + ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: + return ( + mk.FusedMoEActivationFormat.BatchedExperts, + mk.FusedMoEActivationFormat.BatchedExperts, + ) + + def supports_chunking(self) -> bool: + return False + + def supports_expert_map(self) -> bool: + return False + + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + # Let PrepareAndFinalize::finalize() decide the impl. + return TopKWeightAndReduceDelegate() + + def workspace_shapes( + self, + M: int, + N: int, + K: int, + topk: int, + global_num_experts: int, + local_num_experts: int, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: + num_dp = self.num_dispatchers + num_experts = local_num_experts + max_num_tokens = self.max_num_tokens + workspace13 = (num_experts, max_num_tokens * num_dp, max(K, N)) + workspace2 = (num_experts, max_num_tokens * num_dp, (N // 2)) + output = (num_experts, max_num_tokens * num_dp, K) + return (workspace13, workspace2, output) + + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: torch.Tensor | None, + a1q_scale: torch.Tensor | None, + a2_scale: torch.Tensor | None, + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: 
mk.ExpertTokensMetadata | None, + apply_router_weight_on_input: bool, + ): + # Check constraints. + if self.quant_config.use_int4_w4a16: + assert hidden_states.size(-1) // 2 == w1.size(2), "Hidden size mismatch" + else: + assert hidden_states.size(-1) == w1.size(2), ( + f"Hidden size mismatch {hidden_states.size(-1)} != {w1.size(2)}" + ) + + assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" + assert w1.stride(-1) == 1, "Stride of last dimension must be 1" + assert w2.stride(-1) == 1, "Stride of last dimension must be 1" + assert hidden_states.dtype in [ + torch.float32, + torch.float16, + torch.bfloat16, + torch.float8_e4m3fn, + ] + assert expert_tokens_meta is not None + + expert_num_tokens = expert_tokens_meta.expert_num_tokens + + E, max_num_tokens, N, K, top_k_num = self.moe_problem_size( + hidden_states, w1, w2, topk_ids + ) + + assert w1.size(0) == E + assert w2.size(0) == E + + config_dtype = self.quant_config.config_name(hidden_states.dtype) + + config = try_get_optimal_moe_config( + w1.size(), + w2.size(), + top_k_num, + config_dtype, + max_num_tokens, + block_shape=self.block_shape, + ) + + if hidden_states.dtype == torch.bfloat16: + compute_type = tl.bfloat16 + elif hidden_states.dtype == torch.float16: + compute_type = tl.float16 + elif hidden_states.dtype == torch.float32: + compute_type = tl.float32 + elif hidden_states.dtype == torch.float8_e4m3fn: + compute_type = tl.bfloat16 + else: + raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}") + + # We can reuse the memory between these because by the time we need + # cache3, we're done with cache1 + intermediate_cache1 = _resize_cache(workspace13, (E, max_num_tokens, N)) + intermediate_cache2 = _resize_cache(workspace2, (E, max_num_tokens, N // 2)) + + # TODO(bnell): should this be done for any quantized type? 
+ if self.quant_config.use_fp8_w8a8: + intermediate_cache1.fill_(0) + + a1q_scale = normalize_batched_scales_shape(a1q_scale, E) + + # MM1 + invoke_moe_batched_triton_kernel( + A=hidden_states, + B=w1, + C=intermediate_cache1, + expert_num_tokens=expert_num_tokens, + compute_type=compute_type, + A_scale=a1q_scale, + B_scale=self.w1_scale, + B_zp=self.w1_zp, + use_fp8_w8a8=self.quant_config.use_fp8_w8a8, + use_int8_w8a16=self.quant_config.use_int8_w8a16, + use_int4_w4a16=self.quant_config.use_int4_w4a16, + config=config, + per_act_token_quant=self.per_act_token_quant, + block_shape=self.block_shape, + ) + + intermediate_cache2.fill_(0) + + # TODO (bnell): use triton utility from batched deep gemm. + self.activation( + activation, + intermediate_cache2.view(-1, N // 2), + intermediate_cache1.view(-1, N), + ) + + qintermediate_cache2, a2q_scale = batched_moe_kernel_quantize_input( + intermediate_cache2, + a2_scale, + max_num_tokens, + E, + N, + expert_num_tokens, + self.quant_dtype, + self.per_act_token_quant, + self.block_shape, + ) + + invoke_moe_batched_triton_kernel( + A=qintermediate_cache2, + B=w2, + C=output, + expert_num_tokens=expert_num_tokens, + compute_type=compute_type, + A_scale=a2q_scale, + B_scale=self.w2_scale, + B_zp=self.w2_zp, + use_fp8_w8a8=self.quant_config.use_fp8_w8a8, + use_int8_w8a16=self.quant_config.use_int8_w8a16, + use_int4_w4a16=self.quant_config.use_int4_w4a16, + config=config, + per_act_token_quant=self.per_act_token_quant, + block_shape=self.block_shape, + ) diff --git a/model_executor/layers/fused_moe/fused_marlin_moe.py b/model_executor/layers/fused_moe/fused_marlin_moe.py new file mode 100644 index 0000000..0b0f59f --- /dev/null +++ b/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -0,0 +1,792 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Fused MoE utilities for GPTQ.""" + +from collections.abc import Callable + +import torch + +import vllm._custom_ops as 
def default_activation_func(
    activation: str, output: torch.Tensor, input: torch.Tensor
) -> None:
    """Apply the named gated activation to ``input``, writing into ``output``.

    Dispatches to the ``torch.ops._C`` custom op that matches ``activation``.

    Raises:
        ValueError: if ``activation`` is neither ``"silu"`` nor
            ``"swigluoai"``.
    """
    if activation == "silu":
        torch.ops._C.silu_and_mul(output, input)
    elif activation == "swigluoai":
        # alpha = 1.702, limit = 7.0
        torch.ops._C.swigluoai_and_mul(output, input)
    else:
        raise ValueError(
            f"Unsupported activation: {activation}. "
            "Only silu and swigluoai activations are supported."
        )


def _fused_marlin_moe(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    bias1: torch.Tensor | None,
    bias2: torch.Tensor | None,
    w1_scale: torch.Tensor,
    w2_scale: torch.Tensor,
    topk_weights: torch.Tensor,
    num_topk: int,
    quant_type: ScalarType,
    apply_router_weight_on_input: bool,
    expert_map: torch.Tensor | None,
    block_size_m: int,
    sorted_token_ids: torch.Tensor,
    expert_ids: torch.Tensor,
    num_tokens_post_padded: torch.Tensor,
    activation: str = "silu",
    activation_func: Callable[
        [str, torch.Tensor, torch.Tensor], None
    ] = default_activation_func,
    global_scale1: torch.Tensor | None = None,
    global_scale2: torch.Tensor | None = None,
    g_idx1: torch.Tensor | None = None,
    g_idx2: torch.Tensor | None = None,
    sort_indices1: torch.Tensor | None = None,
    sort_indices2: torch.Tensor | None = None,
    w1_zeros: torch.Tensor | None = None,
    w2_zeros: torch.Tensor | None = None,
    workspace: torch.Tensor | None = None,
    intermediate_cache13: torch.Tensor | None = None,
    intermediate_cache2: torch.Tensor | None = None,
    output: torch.Tensor | None = None,
    is_k_full: bool = True,
) -> torch.Tensor:
    """Run GEMM1 -> activation -> GEMM2 on 2-D hidden states.

    Shared core of ``fused_marlin_moe`` and ``batched_fused_marlin_moe``.
    The token-to-expert alignment (``sorted_token_ids``, ``expert_ids``,
    ``num_tokens_post_padded``) must already have been computed by the caller.

    Any of ``workspace``, ``intermediate_cache13`` and ``intermediate_cache2``
    that is ``None`` is allocated here. ``intermediate_cache13`` backs both the
    first GEMM result (viewed as ``(M * num_topk, 2 * N)``) and, when
    ``output`` is ``None``, the second GEMM result (viewed as
    ``(M * num_topk, K)``).

    Returns:
        The tensor returned by the second ``ops.moe_wna16_marlin_gemm`` call,
        i.e. the un-reduced per-(token, expert) outputs.
    """
    assert hidden_states.ndim == 2
    M, K = hidden_states.size()
    N = marlin_moe_intermediate_size(w1, w2)

    if workspace is None:
        workspace = marlin_make_workspace_new(hidden_states.device, 4)

    if intermediate_cache13 is None:
        # Flat buffer big enough for either of the two views taken below.
        intermediate_cache13 = torch.empty(
            (M * num_topk * max(2 * N, K),),
            device=hidden_states.device,
            dtype=hidden_states.dtype,
        )

    if intermediate_cache2 is None:
        intermediate_cache2 = torch.empty(
            (M * num_topk, N),
            device=hidden_states.device,
            dtype=hidden_states.dtype,
        )

    intermediate_cache1 = _resize_cache(intermediate_cache13, (M * num_topk, 2 * N))

    intermediate_cache3 = _resize_cache(intermediate_cache13, (M * num_topk, K))

    intermediate_cache2 = _resize_cache(intermediate_cache2, (M * num_topk, N))

    maybe_warn_marlin_atomic_add(hidden_states.device, hidden_states.dtype)
    use_atomic_add = (
        hidden_states.dtype == torch.half
        or torch.cuda.get_device_capability(hidden_states.device)[0] >= 9
    )

    # GEMM1: hidden_states (M, K) x w1 -> (M * num_topk, 2 * N).
    intermediate_cache1 = ops.moe_wna16_marlin_gemm(
        hidden_states,
        intermediate_cache1,
        w1,
        bias1,
        w1_scale,
        global_scale1,
        w1_zeros,
        g_idx1,
        sort_indices1,
        workspace,
        sorted_token_ids,
        expert_ids,
        num_tokens_post_padded,
        topk_weights,
        moe_block_size=block_size_m,
        top_k=num_topk,
        mul_topk_weights=apply_router_weight_on_input,
        is_ep=expert_map is not None,
        b_q_type=quant_type,
        size_m=M,
        size_n=2 * N,
        size_k=K,
        is_k_full=is_k_full,
        use_atomic_add=use_atomic_add,
        use_fp32_reduce=True,
        is_zp_float=False,
    )

    # Gated activation halves the last dimension: (.., 2 * N) -> (.., N).
    activation_func(
        activation, intermediate_cache2, intermediate_cache1.view(-1, 2 * N)
    )

    if output is None:
        output = intermediate_cache3

    if expert_map is not None:
        # With an expert map, clear the buffer before the second GEMM.
        output.zero_()

    # GEMM2: every (token, expert) row is now its own row, hence top_k=1 and
    # size_m = M * num_topk. Router weights are applied here unless they were
    # already applied on the input in GEMM1.
    output = ops.moe_wna16_marlin_gemm(
        intermediate_cache2,
        output,
        w2,
        bias2,
        w2_scale,
        global_scale2,
        w2_zeros,
        g_idx2,
        sort_indices2,
        workspace,
        sorted_token_ids,
        expert_ids,
        num_tokens_post_padded,
        topk_weights,
        moe_block_size=block_size_m,
        top_k=1,
        mul_topk_weights=not apply_router_weight_on_input,
        is_ep=expert_map is not None,
        b_q_type=quant_type,
        size_m=M * num_topk,
        size_n=K,
        size_k=N,
        is_k_full=is_k_full,
        use_atomic_add=use_atomic_add,
        use_fp32_reduce=True,
        is_zp_float=False,
    )

    return output


def fused_marlin_moe(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    bias1: torch.Tensor | None,
    bias2: torch.Tensor | None,
    w1_scale: torch.Tensor,
    w2_scale: torch.Tensor,
    gating_output: torch.Tensor | None,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    quant_type_id: int,
    apply_router_weight_on_input: bool = False,
    global_num_experts: int = -1,
    activation: str = "silu",
    activation_func: Callable[
        [str, torch.Tensor, torch.Tensor], None
    ] = default_activation_func,
    moe_sum: Callable[[torch.Tensor, torch.Tensor], None] | None = None,
    expert_map: torch.Tensor | None = None,
    global_scale1: torch.Tensor | None = None,
    global_scale2: torch.Tensor | None = None,
    g_idx1: torch.Tensor | None = None,
    g_idx2: torch.Tensor | None = None,
    sort_indices1: torch.Tensor | None = None,
    sort_indices2: torch.Tensor | None = None,
    w1_zeros: torch.Tensor | None = None,
    w2_zeros: torch.Tensor | None = None,
    workspace: torch.Tensor | None = None,
    intermediate_cache13: torch.Tensor | None = None,
    intermediate_cache2: torch.Tensor | None = None,
    is_k_full: bool = True,
    output: torch.Tensor | None = None,
    inplace: bool = False,
) -> torch.Tensor:
    """
    This function computes a Mixture of Experts (MoE) layer using two sets of
    weights, w1 and w2, and top-k gating mechanism.

    Parameters:
    - hidden_states (torch.Tensor): The input tensor to the MoE layer.
    - w1 (torch.Tensor): The first set of expert weights.
    - w2 (torch.Tensor): The second set of expert weights.
    - bias1 (torch.Tensor|None): Optional bias passed to the first GEMM.
    - bias2 (torch.Tensor|None): Optional bias passed to the second GEMM.
    - w1_scale (torch.Tensor): Scale to be used for w1.
    - w2_scale (torch.Tensor): Scale to be used for w2.
    - gating_output (torch.Tensor|None): The output of the gating
        operation (before softmax). Only used for a shape check.
    - topk_weights (torch.Tensor): Top-k weights (must be float32).
    - topk_ids (torch.Tensor): Indices of top-k elements.
    - quant_type_id (int): Id of the weight ``ScalarType``. It must decode to
        one of uint4, uint8b128, uint4b8, float8_e4m3fn or float4_e2m1f; the
        weight bit width (4 or 8) is derived from it.
    - apply_router_weight_on_input (bool): Multiply by the top-k weights in
        the first GEMM instead of the second.
    - global_num_experts (int): Number of experts used for block alignment;
        ``-1`` means ``w1.size(0)``.
    - activation (str): Name forwarded to ``activation_func``.
    - activation_func (Callable): Called as
        ``activation_func(activation, output, input)`` between the GEMMs.
    - moe_sum (Callable|None): Optional reduction called as
        ``moe_sum(moe_output, output)``; ``torch.sum`` over the top-k
        dimension is used when it is None.
    - expert_map (torch.Tensor|None): Optional expert map forwarded to the
        block alignment.
    - global_scale1 / global_scale2 (torch.Tensor|None): Optional global
        scales forwarded to the first / second GEMM.
    - g_idx1 (torch.Tensor|None): The first set of act_order indices.
    - g_idx2 (torch.Tensor|None): The second set of act_order indices.
    - sort_indices1 (torch.Tensor|None): The first act_order input
        permutation.
    - sort_indices2 (torch.Tensor|None): The second act_order input
        permutation.
    - w1_zeros (torch.Tensor|None): Optional zero points to be used for w1.
    - w2_zeros (torch.Tensor|None): Optional zero points to be used for w2.
    - workspace, intermediate_cache13, intermediate_cache2
        (torch.Tensor|None): Optional pre-allocated buffers; allocated on
        demand when None.
    - is_k_full (bool): Forwarded to the Marlin GEMM.
    - output (torch.Tensor|None): Optional buffer receiving the reduced
        result. Must be None when ``inplace`` is True.
    - inplace (bool): Write the result into ``hidden_states`` (unless
        ``disable_inplace()`` reports that in-place updates are disabled).

    Returns:
    - torch.Tensor: The output tensor after applying the MoE layer.
    """

    if inplace:
        assert output is None, "Conflicting request"

    quant_type = ScalarType.from_id(quant_type_id)
    assert quant_type in [
        scalar_types.uint4,
        scalar_types.uint8b128,
        scalar_types.uint4b8,
        scalar_types.float8_e4m3fn,
        scalar_types.float4_e2m1f,
    ]

    bit4_scalar_types = [
        scalar_types.uint4,
        scalar_types.uint4b8,
        scalar_types.float4_e2m1f,
    ]
    num_bits = 4 if quant_type in bit4_scalar_types else 8

    M, K = hidden_states.size()
    E = w1.size(0)
    topk = topk_ids.size(1)

    # Check constraints.
    if gating_output is not None:
        assert gating_output.size(0) == M, "Number of tokens mismatch"
    assert w1.size(1) * 16 == K, "Hidden size mismatch w1"
    assert w2.size(2) // (num_bits // 2) == K, "Hidden size mismatch w2"
    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
    assert w1.is_contiguous(), "Expert weights1 must be contiguous"
    assert w2.is_contiguous(), "Expert weights2 must be contiguous"
    assert hidden_states.dtype in [torch.float16, torch.bfloat16]
    assert num_bits in [4, 8]
    assert topk_weights.dtype == torch.float32

    # M block size selection logic
    # TODO: tune this further for specific models
    # Take the first candidate for which the average number of rows per
    # expert fills less than 90% of a block; if none does, the loop ends with
    # the largest candidate (64) selected.
    for block_size_m in [8, 16, 32, 48, 64]:
        if M * topk / E / block_size_m < 0.9:
            break

    if global_num_experts == -1:
        global_num_experts = E
    sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
        topk_ids, block_size_m, global_num_experts, expert_map
    )

    assert activation is not None
    # `output=None` on purpose: the un-reduced result lives in
    # intermediate_cache13 and is reduced into `output` below.
    moe_output = _fused_marlin_moe(
        hidden_states=hidden_states,
        w1=w1,
        w2=w2,
        bias1=bias1,
        bias2=bias2,
        w1_scale=w1_scale,
        w2_scale=w2_scale,
        topk_weights=topk_weights,
        num_topk=topk,
        quant_type=quant_type,
        apply_router_weight_on_input=apply_router_weight_on_input,
        expert_map=expert_map,
        block_size_m=block_size_m,
        sorted_token_ids=sorted_token_ids,
        expert_ids=expert_ids,
        num_tokens_post_padded=num_tokens_post_padded,
        activation=activation,
        activation_func=activation_func,
        global_scale1=global_scale1,
        global_scale2=global_scale2,
        g_idx1=g_idx1,
        g_idx2=g_idx2,
        sort_indices1=sort_indices1,
        sort_indices2=sort_indices2,
        w1_zeros=w1_zeros,
        w2_zeros=w2_zeros,
        workspace=workspace,
        intermediate_cache13=intermediate_cache13,
        intermediate_cache2=intermediate_cache2,
        output=None,
        is_k_full=is_k_full,
    ).view(-1, topk, K)

    if output is None:
        if inplace and not disable_inplace():
            output = hidden_states
        else:
            output = torch.empty_like(hidden_states)

    if moe_sum is None:
        return torch.sum(moe_output, dim=1, out=output)

    # `moe_sum` is declared as returning None and reduces into `output` in
    # place. Return the buffer in that case so the declared tensor return
    # type holds; a callable that does return something is still honoured.
    reduced = moe_sum(moe_output, output)
    return output if reduced is None else reduced


def batched_fused_marlin_moe(
    hidden_states: torch.Tensor,
    expert_num_tokens: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    bias1: torch.Tensor | None,
    bias2: torch.Tensor | None,
    w1_scale: torch.Tensor,
    w2_scale: torch.Tensor,
    gating_output: torch.Tensor | None,
    quant_type_id: int,
    apply_router_weight_on_input: bool = False,
    global_num_experts: int = -1,
    activation: str | None = "silu",
    expert_map: torch.Tensor | None = None,
    global_scale1: torch.Tensor | None = None,
    global_scale2: torch.Tensor | None = None,
    g_idx1: torch.Tensor | None = None,
    g_idx2: torch.Tensor | None = None,
    sort_indices1: torch.Tensor | None = None,
    sort_indices2: torch.Tensor | None = None,
    w1_zeros: torch.Tensor | None = None,
    w2_zeros: torch.Tensor | None = None,
    workspace: torch.Tensor | None = None,
    intermediate_cache13: torch.Tensor | None = None,
    intermediate_cache2: torch.Tensor | None = None,
    is_k_full: bool = True,
    output: torch.Tensor | None = None,
    inplace: bool = False,
) -> torch.Tensor:
    """
    This function massages the inputs so the batched hidden_states can be
    presented as a 2D contiguous tensor that could be used with
    _fused_marlin_moe.

    Note that both batched_fused_marlin_moe and fused_marlin_moe ultimately
    use `ops.moe_wna16_marlin_gemm` for the gemm operation and
    `ops.moe_wna16_marlin_gemm` supports only 2D contiguous hidden_states.
    Note that the moe_align_block_size function indicates,
        - What rows of the A matrix (hidden_states) to access during the
        matmul, via sorted_ids output.
        - What expert_id to use for each block matmul, via expert_ids output.

    In the batched version, the tokens are already grouped/batched by experts
    they subscribe to. Due to this, we can represent the batched hidden_states
    tensor of shape [B, MAX_TOKENS_PER_BATCH, K] as a 2D tensor of shape,
    [B * MAX_TOKENS_PER_BATCH, K]. We may treat this as a 2D contiguous tensor
    with topk=1 as each token (row in the tensor) subscribes to exactly one
    expert_id (which is the batch_id). With the expert_num_tokens tensor, that
    indicates how many tokens are actually valid in each batch, the
    batched_moe_align_block_size function constructs the sorted_ids and
    expert_ids tensors, so only relevant/valid rows of A (hidden_states)
    are accessed and are processed with the correct expert_ids.

    ``gating_output`` and ``global_num_experts`` are accepted for signature
    parity with ``fused_marlin_moe`` but are not read here.

    Returns:
    - torch.Tensor: The result viewed as [B, MAX_TOKENS_PER_BATCH, K].
    """

    assert hidden_states.ndim == 3, (
        "hidden states must be batched. e.g. [B, MAX_TOKENS, K]. "
        f"But got {hidden_states.size()}"
    )
    if inplace:
        assert output is None, "Conflicting request."

    quant_type = ScalarType.from_id(quant_type_id)
    assert quant_type in [
        scalar_types.uint4,
        scalar_types.uint8b128,
        scalar_types.uint4b8,
        scalar_types.float8_e4m3fn,
        scalar_types.float4_e2m1f,
    ]

    bit4_scalar_types = [
        scalar_types.uint4,
        scalar_types.uint4b8,
        scalar_types.float4_e2m1f,
    ]
    num_bits = 4 if quant_type in bit4_scalar_types else 8

    B, BATCH_TOKENS_MAX, K = hidden_states.size()
    M = hidden_states.view(-1, K).size(0)
    E = w1.size(0)

    # Check constraints.
    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
    assert hidden_states.dtype in [torch.float16, torch.bfloat16]
    assert expert_num_tokens.size(0) == E
    assert B == E, (
        "Batch must be as big as number of experts as the tokens "
        "are sorted into the batch/expert they belong to"
    )
    assert w1.size(1) * 16 == K, "Hidden size mismatch w1"
    assert w2.size(2) // (num_bits // 2) == K, "Hidden size mismatch w2"
    assert w1.is_contiguous(), "Expert weights1 must be contiguous"
    assert w2.is_contiguous(), "Expert weights2 must be contiguous"
    assert num_bits in [4, 8]

    # Technically, the tokens are already separated by their expert ids.
    # Hidden-States can just be squeezed to have just 2 dimensions,
    # [B * MAX_TOKENS, K] and top_k can be interpreted as just 1.
    topk = 1

    # TODO(varun) : Choose a decent block size like in fused_marlin_moe
    block_size_m = 64

    sorted_token_ids, expert_ids, num_tokens_post_padded = batched_moe_align_block_size(
        max_tokens_per_batch=BATCH_TOKENS_MAX,
        block_size=block_size_m,
        expert_num_tokens=expert_num_tokens,
    )

    if output is None and inplace:
        output = hidden_states

    # TODO (varun): This can be avoided by plumbing the marlin kernel to
    # ignore topk_weights when topk_weights_ptr is a nullptr.
    topk_weights = torch.ones(
        (M, topk), device=hidden_states.device, dtype=torch.float32
    )

    assert activation is not None
    output = _fused_marlin_moe(
        hidden_states=hidden_states.view(-1, K),
        w1=w1,
        w2=w2,
        bias1=bias1,
        bias2=bias2,
        w1_scale=w1_scale,
        w2_scale=w2_scale,
        topk_weights=topk_weights,
        num_topk=topk,
        quant_type=quant_type,
        apply_router_weight_on_input=apply_router_weight_on_input,
        activation=activation,
        expert_map=expert_map,
        block_size_m=block_size_m,
        sorted_token_ids=sorted_token_ids,
        expert_ids=expert_ids,
        num_tokens_post_padded=num_tokens_post_padded,
        global_scale1=global_scale1,
        global_scale2=global_scale2,
        g_idx1=g_idx1,
        g_idx2=g_idx2,
        sort_indices1=sort_indices1,
        sort_indices2=sort_indices2,
        w1_zeros=w1_zeros,
        w2_zeros=w2_zeros,
        workspace=workspace,
        intermediate_cache13=intermediate_cache13,
        intermediate_cache2=intermediate_cache2,
        output=output.view(-1, K) if output is not None else output,
        is_k_full=is_k_full,
    )

    output = output.view(B, BATCH_TOKENS_MAX, K)

    return output


class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute):
    """State and helpers shared by the Marlin expert implementations.

    Stores the optional act_order tensors and ``is_k_full`` that subclasses
    forward to the Marlin MoE entry points.
    """

    def __init__(
        self,
        quant_config: FusedMoEQuantConfig,
        w13_g_idx: torch.Tensor | None = None,
        w2_g_idx: torch.Tensor | None = None,
        w13_g_idx_sort_indices: torch.Tensor | None = None,
        w2_g_idx_sort_indices: torch.Tensor | None = None,
        is_k_full: bool = True,
    ):
        # TODO (varun) : Enable activation quantization
        assert quant_config.use_mxfp4_w4a16 or quant_config.use_int4_w4a16, (
            "Supports only mxfp4_w4a16 or int4_w4a16"
        )
        self.w13_g_idx = w13_g_idx
        self.w2_g_idx = w2_g_idx
        self.w13_g_idx_sort_indices = w13_g_idx_sort_indices
        self.w2_g_idx_sort_indices = w2_g_idx_sort_indices
        self.is_k_full = is_k_full
        super().__init__(quant_config)

    @property
    def quant_type_id(self) -> int:
        """Id of the weight scalar type implied by the quant config."""
        # uint4b8 will be set for int4 weight and float4_e2m1f will be used for mxfp4
        return (
            scalar_types.uint4b8.id
            if self.quant_config.use_int4_w4a16
            else scalar_types.float4_e2m1f.id
        )

    def moe_problem_size(
        self,
        a1: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_ids: torch.Tensor,
    ) -> tuple[int, int, int, int, int]:
        """Return ``(E, M, N, K, topk)`` for the given activations/weights.

        For 2-D ``a1``, ``M`` is the number of rows. For 3-D ``a1``, the
        leading dimension must equal the number of experts and ``M`` is the
        second dimension (the per-expert maximum number of tokens).
        """
        assert w1.dim() == 3 and w2.dim() == 3

        E = w1.size(0)
        K = a1.size(-1)
        N = marlin_moe_intermediate_size(w1, w2)

        if a1.dim() == 2:
            # Make sure we are using the correct a1 (pre-permute).
            assert topk_ids.size(0) == a1.size(0), f"{topk_ids.size(0)} != {a1.size(0)}"
            M = a1.size(0)
        else:
            assert a1.dim() == 3
            assert a1.size(0) == E, f"{a1.size(0)} != {E}"
            M = a1.size(1)  # This is max_num_tokens

        assert topk_ids.dim() == 2
        topk = topk_ids.size(1)

        return E, M, N, K, topk


class MarlinExperts(MarlinExpertsBase):
    """Marlin experts operating on the standard activation format."""

    def __init__(
        self,
        quant_config: FusedMoEQuantConfig,
        w13_g_idx: torch.Tensor | None = None,
        w2_g_idx: torch.Tensor | None = None,
        w13_g_idx_sort_indices: torch.Tensor | None = None,
        w2_g_idx_sort_indices: torch.Tensor | None = None,
        is_k_full: bool = True,
    ):
        super().__init__(
            quant_config,
            w13_g_idx,
            w2_g_idx,
            w13_g_idx_sort_indices,
            w2_g_idx_sort_indices,
            is_k_full,
        )

    def supports_expert_map(self) -> bool:
        return True

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        return TopKWeightAndReduceNoOP()

    @property
    def activation_formats(
        self,
    ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
        return (
            mk.FusedMoEActivationFormat.Standard,
            mk.FusedMoEActivationFormat.Standard,
        )

    def supports_chunking(self) -> bool:
        return True

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        """Return the ``(workspace1, workspace2, output)`` shapes."""
        # Modular Kernel provisions output buffer from workspace1. However in
        # the fused_marlin_moe() function, the final torch.sum(), is defined
        # essentially as,
        # `torch.sum(workspace1, dim=1, out=output)`
        # Having overlapping input and output tensors for torch.sum seems
        # error prone and depends on how the torch.sum is implemented.
        # For this reason we swap the workspaces and let the output buffer be
        # provisioned from workspace2.

        # Workspace/IntermediateCache allocation matching fused_marlin_moe()
        # workspace1 = (M * topk * max(2 * N, K),)
        # workspace2 = (M * topk, N)

        # Workspace/IntermediateCache allocation accounting for output buffer
        # provisioning
        workspace1 = (M * topk, max(N, K))
        workspace2 = (M * topk * max(2 * N, K),)
        output = (M, K)

        return (workspace1, workspace2, output)

    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: str,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ):
        """Run ``fused_marlin_moe`` with this object's weights metadata,
        reducing into ``output`` through ``self.moe_sum``."""
        assert self.w1_scale is not None
        assert self.w2_scale is not None
        return fused_marlin_moe(
            hidden_states=hidden_states,
            w1=w1,
            w2=w2,
            bias1=self.w1_bias,
            bias2=self.w2_bias,
            w1_scale=self.w1_scale,
            w2_scale=self.w2_scale,
            gating_output=None,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            quant_type_id=self.quant_type_id,
            apply_router_weight_on_input=apply_router_weight_on_input,
            global_num_experts=global_num_experts,
            activation=activation,
            activation_func=self.activation,
            moe_sum=self.moe_sum,
            expert_map=expert_map,
            output=output,
            # Workspaces are swapped in workspace_shapes() to account for proper
            # output buffer allocation. Please refer to workspace_shapes().
            intermediate_cache13=workspace2,
            intermediate_cache2=workspace13,
            g_idx1=self.w13_g_idx,
            g_idx2=self.w2_g_idx,
            sort_indices1=self.w13_g_idx_sort_indices,
            sort_indices2=self.w2_g_idx_sort_indices,
            is_k_full=self.is_k_full,
        )

    def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
        """Reduce ``input`` into ``output`` via ``ops.moe_sum``."""
        ops.moe_sum(input, output)


def modular_marlin_fused_moe(
    quant_config: FusedMoEQuantConfig, shared_experts: torch.nn.Module | None = None
) -> mk.FusedMoEModularKernel:
    """Build a modular kernel pairing ``MoEPrepareAndFinalizeNoEP`` with
    ``MarlinExperts`` for the given quant config."""
    return mk.FusedMoEModularKernel(
        MoEPrepareAndFinalizeNoEP(),
        MarlinExperts(quant_config),
        shared_experts,
    )


class BatchedMarlinExperts(MarlinExpertsBase):
    """Marlin experts operating on the batched-experts activation format."""

    def __init__(
        self,
        max_num_tokens: int,
        num_dispatchers: int,
        quant_config: FusedMoEQuantConfig,
        w13_g_idx: torch.Tensor | None = None,
        w2_g_idx: torch.Tensor | None = None,
        w13_g_idx_sort_indices: torch.Tensor | None = None,
        w2_g_idx_sort_indices: torch.Tensor | None = None,
        is_k_full: bool = True,
    ):
        super().__init__(
            quant_config,
            w13_g_idx,
            w2_g_idx,
            w13_g_idx_sort_indices,
            w2_g_idx_sort_indices,
            is_k_full,
        )
        self.max_num_tokens = max_num_tokens
        self.num_dispatchers = num_dispatchers

    def supports_expert_map(self) -> bool:
        return True

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        return TopKWeightAndReduceDelegate()

    @property
    def activation_formats(
        self,
    ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
        return (
            mk.FusedMoEActivationFormat.BatchedExperts,
            mk.FusedMoEActivationFormat.BatchedExperts,
        )

    def supports_chunking(self) -> bool:
        return False

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        """Return the ``(workspace13, workspace2, output)`` shapes, sized by
        local experts x max tokens x dispatchers."""
        num_dispatchers = self.num_dispatchers
        num_experts = local_num_experts
        # Fall back to M when no per-expert token cap was configured.
        max_num_tokens = M if self.max_num_tokens is None else self.max_num_tokens
        workspace13 = (num_experts * max_num_tokens * num_dispatchers, max(K, N * 2))
        workspace2 = (num_experts * max_num_tokens * num_dispatchers, N)
        output = (num_experts, max_num_tokens * num_dispatchers, K)
        return (workspace13, workspace2, output)

    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: str,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ):
        """Run ``batched_fused_marlin_moe`` with this object's weights
        metadata, writing into ``output``."""
        assert expert_tokens_meta is not None, "Num valid tokens per batch is required"
        # Same precondition as MarlinExperts.apply: the Marlin entry point
        # takes the weight scales as required tensors.
        assert self.w1_scale is not None
        assert self.w2_scale is not None
        return batched_fused_marlin_moe(
            hidden_states=hidden_states,
            expert_num_tokens=expert_tokens_meta.expert_num_tokens,
            w1=w1,
            w2=w2,
            bias1=self.w1_bias,
            bias2=self.w2_bias,
            w1_scale=self.w1_scale,
            w2_scale=self.w2_scale,
            gating_output=None,
            quant_type_id=self.quant_type_id,
            apply_router_weight_on_input=apply_router_weight_on_input,
            global_num_experts=global_num_experts,
            activation=activation,
            expert_map=expert_map,
            output=output,
            intermediate_cache13=workspace13,
            intermediate_cache2=workspace2,
            g_idx1=self.w13_g_idx,
            g_idx2=self.w2_g_idx,
            sort_indices1=self.w13_g_idx_sort_indices,
            sort_indices2=self.w2_g_idx_sort_indices,
            is_k_full=self.is_k_full,
        )
collections.abc import Callable +from typing import Any + +import torch +import torch.nn.functional as F + +import vllm.envs as envs +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm import _custom_ops as ops +from vllm._aiter_ops import rocm_aiter_ops +from vllm.logger import init_logger +from vllm.model_executor.layers.batch_invariant import ( + vllm_is_batch_invariant, +) +from vllm.model_executor.layers.fused_moe.config import ( + FUSED_MOE_UNQUANTIZED_CONFIG, + FusedMoEQuantConfig, + _get_config_dtype_str, +) +from vllm.model_executor.layers.fused_moe.cutlass_moe import ( + _valid_cutlass_block_scaled_grouped_gemm, + run_cutlass_block_scaled_fused_experts, +) +from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( + _valid_deep_gemm, + deep_gemm_moe_fp8, +) +from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( + moe_align_block_size, +) +from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP, +) +from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( + TopKWeightAndReduceNoOP, +) +from vllm.model_executor.layers.fused_moe.utils import ( + _resize_cache, + activation_without_mul, + disable_inplace, + moe_kernel_quantize_input, +) +from vllm.model_executor.layers.quantization.utils.mxfp4_utils import dequant_mxfp4 +from vllm.model_executor.layers.quantization.utils.mxfp6_utils import dequant_mxfp6 +from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import OCP_MX_Scheme +from vllm.model_executor.utils import maybe_disable_graph_partition +from vllm.platforms import current_platform +from vllm.triton_utils import tl, triton +from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used +from vllm.utils.torch_utils import direct_register_custom_op, is_torch_equal_or_newer +import vllm._custom_ops as ops +import ixformer.inference.functions as ixfops +from vllm.forward_context import ForwardContext, get_forward_context +from vllm.distributed 
@triton.jit
def write_zeros_to_output(
    c_ptr,
    stride_cm,
    stride_cn,
    pid_n,
    N,
    offs_token,
    token_mask,
    BLOCK_SIZE_M,
    BLOCK_SIZE_N,
    compute_type,
):
    """Store a zero tile into C for the rows in ``offs_token``.

    Writes a ``[BLOCK_SIZE_M, BLOCK_SIZE_N]`` block of zeros of dtype
    ``compute_type`` at column block ``pid_n``. Rows are masked by
    ``token_mask`` and columns by ``< N``.
    """
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=compute_type)
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
    c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
    tl.store(c_ptrs, accumulator, mask=c_mask)


@triton.jit
def fused_moe_kernel_gptq_awq(
    # Pointers to matrices
    a_ptr,
    b_ptr,
    c_ptr,
    b_scale_ptr,
    b_zp_ptr,
    topk_weights_ptr,
    sorted_token_ids_ptr,
    expert_ids_ptr,
    num_tokens_post_padded_ptr,
    # Matrix dimensions
    N: tl.constexpr,
    K: tl.constexpr,
    EM,
    num_valid_tokens,
    # The stride variables represent how much to increase the ptr by when
    # moving by 1 element in a particular dimension. E.g. `stride_am` is
    # how much to increase `a_ptr` by to get the element one row down
    # (A has M rows).
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_bse,
    stride_bsk,
    stride_bsn,
    stride_bze,
    stride_bzk,
    stride_bzn,
    block_k_diviable: tl.constexpr,
    group_size: tl.constexpr,
    # Meta-parameters
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    SPLIT_K: tl.constexpr,
    MUL_ROUTED_WEIGHT: tl.constexpr,
    top_k: tl.constexpr,
    compute_type: tl.constexpr,
    has_zp: tl.constexpr,
    use_int4_w4a16: tl.constexpr,
    use_int8_w8a16: tl.constexpr,
):
    """
    Implements the fused computation for a Mixture of Experts (MOE) using
    token and expert matrices.

    This variant handles weight-only quantized B (int4 or int8 weights):
    each B tile is dequantized in-kernel as ``(b - zero_point) * scale``,
    with one scale / zero point per ``group_size`` elements along K, before
    being fed to the dot product.

    Key Parameters:
    - A: The input tensor representing tokens with shape (*, K), where '*' can
        be any shape representing batches and K is the feature dimension of
        each token.
    - B: The stacked MOE weight tensor with shape (E, N, K), where E is
        the number of experts, K is the input feature dimension, and N is
        the output feature dimension.
    - C: The output cache tensor with shape (M, topk, N), where M is the
        total number of tokens post padding, topk is the number of times
        each token is repeated, and N is the output feature dimension.
    - sorted_token_ids: A tensor containing the sorted indices of tokens,
        repeated topk times and arranged by the expert index they are
        assigned to.
    - expert_ids: A tensor containing the indices of the expert for each
        block. It determines which expert matrix from B should be used for
        each block in A.
    This kernel performs the multiplication of a token by its corresponding
    expert matrix as determined by `expert_ids`. The sorting of
    `sorted_token_ids` by expert index and padding ensures divisibility by
    BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
    multiplication across different blocks processed by the same expert.
    """
    # NOTE(review): SPLIT_K is accepted but not referenced in this body.
    # -----------------------------------------------------------
    # Map program ids `pid` to the block of C it should compute.
    # This is done in a grouped ordering to promote L2 data reuse.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # ----------------------------------------------------------
    # Create pointers for the first blocks of A and B.
    # We will advance this pointer as we move in the K direction
    # and accumulate
    # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers
    # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers
    num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)
    # Row blocks that start at or beyond the padded token count have no work.
    if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded:
        return
    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
    offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
    # Entries >= num_valid_tokens are masked out of every load/store below.
    token_mask = offs_token < num_valid_tokens

    off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64)
    if off_experts == -1:
        # -----------------------------------------------------------
        # Write back zeros to the output when the expert is not
        # in the current expert parallel rank.
        write_zeros_to_output(
            c_ptr,
            stride_cm,
            stride_cn,
            pid_n,
            N,
            offs_token,
            token_mask,
            BLOCK_SIZE_M,
            BLOCK_SIZE_N,
            compute_type,
        )
        return

    # Column offsets wrap modulo N so the B loads stay in range; the final
    # store uses un-wrapped offsets with an explicit `< N` mask instead.
    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    # `offs_token // top_k` maps a sorted (token, expert) slot to its A row.
    a_ptrs = a_ptr + (
        offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak
    )

    if use_int4_w4a16:
        # Two 4-bit values share one stored element along K: index with
        # k // 2 and pick the nibble with a shift of 0 or 4 bits.
        b_ptrs = (
            b_ptr
            + off_experts * stride_be
            + (offs_k[:, None] // 2) * stride_bk
            + offs_bn[None, :] * stride_bn
        )
        b_shifter = (offs_k[:, None] % 2) * 4
    elif use_int8_w8a16:
        b_ptrs = (
            b_ptr
            + off_experts * stride_be
            + offs_k[:, None] * stride_bk
            + offs_bn[None, :] * stride_bn
        )

    # Without explicit zero points a fixed offset is subtracted instead
    # (8 for int4, 128 for int8).
    if not has_zp and use_int4_w4a16:
        b_zp_num = 8
    if not has_zp and use_int8_w8a16:
        b_zp_num = 128
    elif has_zp and use_int4_w4a16:
        # int4 zero points are indexed with n // 2 below; this selects the
        # nibble for each output column.
        b_zp_shifter = (offs_bn[None, :] % 2) * 4

    # -----------------------------------------------------------
    # Iterate to compute a block of the C matrix.
    # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
    # of fp32 values for higher accuracy.
    # `accumulator` will be converted back to fp16 after the loop.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        # Load the next block of A and B, generate a mask by checking the
        # K dimension.

        # The K mask is only needed for the scale / zero-point loads when K
        # is not a multiple of BLOCK_SIZE_K.
        if not block_k_diviable:
            k_mask = offs_k[:, None] < K - k * BLOCK_SIZE_K
            k_other = 0.0
        else:
            k_mask = None
            k_other = None

        a = tl.load(
            a_ptrs,
            mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
            other=0.0,
        )
        # NOTE(review): B is loaded without a mask; presumably harmless
        # because the matching A columns and the scales are masked to zero
        # on the K tail -- confirm.
        b = tl.load(b_ptrs)
        if use_int4_w4a16:
            b = (b >> b_shifter) & 0xF

        # One scale per `group_size` elements along K.
        b_scale_ptrs = (
            b_scale_ptr
            + off_experts * stride_bse
            + offs_bn[None, :] * stride_bsn
            + ((offs_k[:, None] + BLOCK_SIZE_K * k) // group_size) * stride_bsk
        )
        b_scale = tl.load(b_scale_ptrs, mask=k_mask, other=k_other)
        b_scale = b_scale.to(tl.float32)

        if has_zp and use_int4_w4a16:
            offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size
            b_zp_ptrs = (
                b_zp_ptr
                + off_experts * stride_bze
                + (offs_bn[None, :] // 2) * stride_bzn
                + offs_k_true * stride_bzk
            )
            b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other)
            b_zp = (b_zp >> b_zp_shifter) & 0xF
            b_zp = b_zp.to(tl.float32)
        elif has_zp and use_int8_w8a16:
            offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size
            b_zp_ptrs = (
                b_zp_ptr
                + off_experts * stride_bze
                + offs_bn[None, :] * stride_bzn
                + offs_k_true * stride_bzk
            )
            b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other)
            b_zp = b_zp.to(tl.float32)

        # We accumulate along the K dimension.
        # Dequantize in fp32, then cast to the compute dtype for the dot.
        if has_zp:
            b = ((b.to(tl.float32) - b_zp) * b_scale).to(compute_type)
        else:
            b = ((b.to(tl.float32) - b_zp_num) * b_scale).to(compute_type)
        accumulator = tl.dot(a, b, acc=accumulator)

        # Advance the ptrs to the next K block.
        a_ptrs += BLOCK_SIZE_K * stride_ak
        if use_int4_w4a16:
            # Two int4 values per stored element, so advance half as far.
            b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk
        else:
            b_ptrs += BLOCK_SIZE_K * stride_bk

    if MUL_ROUTED_WEIGHT:
        # Scale each output row by the weight stored at its sorted slot.
        moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
        accumulator = accumulator * moe_weight[:, None]

    accumulator = accumulator.to(compute_type)
    # -----------------------------------------------------------
    # Write back the block of the output
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
    c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
    tl.store(c_ptrs, accumulator, mask=c_mask)
+ stride_am, + stride_ak, + stride_be, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_asm, + stride_ask, + stride_bse, + stride_bsk, + stride_bsn, + stride_bbe, # bias expert stride + stride_bbn, # bias N stride + # Block size for block-wise quantization + group_n: tl.constexpr, + group_k: tl.constexpr, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + SPLIT_K: tl.constexpr, + MUL_ROUTED_WEIGHT: tl.constexpr, + top_k: tl.constexpr, + compute_type: tl.constexpr, + use_fp8_w8a8: tl.constexpr, + use_int8_w8a8: tl.constexpr, + use_int8_w8a16: tl.constexpr, + per_channel_quant: tl.constexpr, + HAS_BIAS: tl.constexpr, +): + """ + Implements the fused computation for a Mixture of Experts (MOE) using + token and expert matrices. + + Key Parameters: + - A: The input tensor representing tokens with shape (*, K), where '*' can + be any shape representing batches and K is the feature dimension of + each token. + - B: The stacked MOE weight tensor with shape (E, N, K), where E is + the number of experts, K is the input feature dimension, and N is + the output feature dimension. + - C: The output cache tensor with shape (M, topk, N), where M is the + total number of tokens post padding, topk is the number of times + each token is repeated, and N is the output feature dimension. + - sorted_token_ids: A tensor containing the sorted indices of tokens, + repeated topk times and arranged by the expert index they are + assigned to. + - expert_ids: A tensor containing the indices of the expert for each + block. It determines which expert matrix from B should be used for + each block in A. + This kernel performs the multiplication of a token by its corresponding + expert matrix as determined by `expert_ids`. 
The sorting of + `sorted_token_ids` by expert index and padding ensures divisibility by + BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix + multiplication across different blocks processed by the same expert. + """ + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse. + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + # ---------------------------------------------------------- + # Create pointers for the first blocks of A and B. + # We will advance this pointer as we move in the K direction + # and accumulate + # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers + # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers + num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr) + if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded: + return + offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64) + offs_token = tl.load(sorted_token_ids_ptr + offs_token_id) + token_mask = offs_token < num_valid_tokens + + off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64) + if off_experts == -1: + # ----------------------------------------------------------- + # Write back zeros to the output when the expert is not + # in the current expert parallel rank. 
+ write_zeros_to_output( + c_ptr, + stride_cm, + stride_cn, + pid_n, + N, + offs_token, + token_mask, + BLOCK_SIZE_M, + BLOCK_SIZE_N, + compute_type, + ) + return + + offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a_ptr + ( + offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak + ) + + b_ptrs = ( + b_ptr + + off_experts * stride_be + + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) + ) + if use_int8_w8a16: + b_scale_ptrs = ( + b_scale_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn + ) + b_scale = tl.load(b_scale_ptrs) + + if use_fp8_w8a8 or use_int8_w8a8: + # block-wise + if group_k > 0 and group_n > 0: + a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm + offs_bsn = offs_bn // group_n + b_scale_ptrs = ( + b_scale_ptr + off_experts * stride_bse + offs_bsn * stride_bsn + ) + # channel-wise + elif per_channel_quant: + b_scale_ptrs = ( + b_scale_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn + ) + b_scale = tl.load(b_scale_ptrs) + # Load per-token scale for activations + a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm + a_scale = tl.load(a_scale_ptrs, mask=token_mask, other=0.0)[:, None] + # tensor-wise + else: + a_scale = tl.load(a_scale_ptr) + b_scale = tl.load(b_scale_ptr + off_experts) + if HAS_BIAS: + # bias shape: [num_experts, N] + bias_ptrs = b_bias_ptr + off_experts * stride_bbe + offs_bn * stride_bbn + bias = tl.load(bias_ptrs, mask=(offs_bn < N), other=0.0) + # ----------------------------------------------------------- + # Iterate to compute a block of the C matrix. + # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block + # of fp32 values for higher accuracy. + # `accumulator` will be converted back to fp16 after the loop. 
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + # Load the next block of A and B, generate a mask by checking the + # K dimension. + a = tl.load( + a_ptrs, + mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K), + other=0.0, + ) + b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + # We accumulate along the K dimension. + if use_int8_w8a16: + accumulator = tl.dot(a, b.to(compute_type), acc=accumulator) + elif use_fp8_w8a8 or use_int8_w8a8: + if group_k > 0 and group_n > 0: + k_start = k * BLOCK_SIZE_K + offs_ks = k_start // group_k + a_scale = tl.load( + a_scale_ptrs + offs_ks * stride_ask, mask=token_mask, other=0.0 + ) + b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk) + + accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :] + else: + if use_fp8_w8a8: + # acc used to enable fp8_fast_accum + accumulator = tl.dot(a, b, acc=accumulator) + else: + accumulator += tl.dot(a, b) + else: + accumulator += tl.dot(a, b) + # Advance the ptrs to the next K block. 
+ a_ptrs += BLOCK_SIZE_K * stride_ak + b_ptrs += BLOCK_SIZE_K * stride_bk + if HAS_BIAS: + accumulator = accumulator + bias[None, :] + if MUL_ROUTED_WEIGHT: + moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0) + accumulator = accumulator * moe_weight[:, None] + if use_int8_w8a16: + accumulator = (accumulator * b_scale).to(compute_type) + elif use_fp8_w8a8 or use_int8_w8a8: + if group_k > 0 and group_n > 0: + accumulator = accumulator.to(compute_type) + else: + accumulator = (accumulator * a_scale * b_scale).to(compute_type) + else: + accumulator = accumulator.to(compute_type) + + # ----------------------------------------------------------- + # Write back the block of the output + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :] + c_mask = token_mask[:, None] & (offs_cn[None, :] < N) + tl.store(c_ptrs, accumulator, mask=c_mask) + + +def invoke_fused_moe_kernel( + A: torch.Tensor, + B: torch.Tensor, + C: torch.Tensor, + A_scale: torch.Tensor | None, + B_scale: torch.Tensor | None, + B_zp: torch.Tensor | None, + topk_weights: torch.Tensor | None, + topk_ids: torch.Tensor, + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_padded: torch.Tensor, + mul_routed_weight: bool, + top_k: int, + config: dict[str, Any], + compute_type: tl.dtype, + use_fp8_w8a8: bool, + use_int8_w8a8: bool, + use_int8_w8a16: bool, + use_int4_w4a16: bool, + per_channel_quant: bool, + block_shape: list[int] | None = None, + B_bias: torch.Tensor | None = None, +) -> None: + assert topk_weights is not None or not mul_routed_weight + assert topk_weights is None or topk_weights.stride(1) == 1 + assert sorted_token_ids.stride(0) == 1 + ops.invoke_fused_moe_kernel(A,B,C,A_scale,B_scale,topk_weights,topk_ids,sorted_token_ids,expert_ids,num_tokens_post_padded,mul_routed_weight,top_k,config,compute_type,use_fp8_w8a8,use_int8_w8a16,block_shape,B_bias) + return 
+ + if use_fp8_w8a8 or use_int8_w8a8: + assert B_scale is not None + assert block_shape is None or triton.cdiv( + B.size(-2), block_shape[0] + ) == B_scale.size(-2) + assert block_shape is None or triton.cdiv( + B.size(-1), block_shape[1] + ) == B_scale.size(-1) + + elif use_int8_w8a16 or use_int4_w4a16: + assert B_scale is not None + assert block_shape is None or block_shape[0] == 0 + else: + assert A_scale is None + assert B_scale is None + + M = A.size(0) + num_tokens = M * top_k + + EM = sorted_token_ids.size(0) + if A.size(0) < config["BLOCK_SIZE_M"]: + # optimize for small batch_size. + # We assume that top_ids of each token is unique, + # so num_valid_experts <= batch_size <= BLOCK_SIZE_M, + # and we can skip some invalid blocks. + EM = min(sorted_token_ids.size(0), A.size(0) * top_k * config["BLOCK_SIZE_M"]) + grid = lambda META: ( + triton.cdiv(EM, META["BLOCK_SIZE_M"]) + * triton.cdiv(B.size(1), META["BLOCK_SIZE_N"]), + ) + HAS_BIAS = B_bias is not None + if ( + (use_int8_w8a16 or use_int4_w4a16) + and block_shape is not None + and block_shape[1] > 0 + ): + assert B_scale is not None and B_scale.ndim == 3 + assert B_zp is None or B_zp.ndim == 3 + + use_moe_wna16_cuda = should_moe_wna16_use_cuda( + num_valid_tokens=num_tokens, + group_size=block_shape[1], + num_experts=B.size(0), + bit=4 if use_int4_w4a16 else 8, + ) + config = config.copy() + config.update( + get_moe_wna16_block_config( + config=config, + use_moe_wna16_cuda=use_moe_wna16_cuda, + num_valid_tokens=num_tokens, + size_k=A.size(1), + size_n=B.size(1), + num_experts=B.size(1), + group_size=block_shape[1], + real_top_k=top_k, + block_size_m=config["BLOCK_SIZE_M"], + ) + ) + + if use_moe_wna16_cuda: + bit = 4 if use_int4_w4a16 else 8 + ops.moe_wna16_gemm( + A, + C, + B, + B_scale, + B_zp, + topk_weights if mul_routed_weight else None, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + top_k, + config["BLOCK_SIZE_M"], + config["BLOCK_SIZE_N"], + config["BLOCK_SIZE_K"], + bit, + ) + 
return + fused_moe_kernel_gptq_awq[grid]( + A, + B, + C, + B_scale, + B_zp, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + B.size(1), + A.size(1), + EM, + num_tokens, + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(2), + B.stride(1), + C.stride(1), + C.stride(2), + B_scale.stride(0), + B_scale.stride(2), + B_scale.stride(1), + B_zp.stride(0) if B_zp is not None else 0, + B_zp.stride(2) if B_zp is not None else 0, + B_zp.stride(1) if B_zp is not None else 0, + block_k_diviable=A.size(1) % config["BLOCK_SIZE_K"] == 0, + group_size=block_shape[1], + MUL_ROUTED_WEIGHT=mul_routed_weight, + top_k=top_k, + compute_type=compute_type, + has_zp=B_zp is not None, + use_int4_w4a16=use_int4_w4a16, + use_int8_w8a16=use_int8_w8a16, + **config, + ) + else: + config = config.copy() + config["SPLIT_K"] = 1 + BLOCK_SIZE_K = config.pop("BLOCK_SIZE_K") + if block_shape is not None: + BLOCK_SIZE_K = min(BLOCK_SIZE_K, min(block_shape[0], block_shape[1])) + fused_moe_kernel[grid]( + A, + B, + C, + B_bias, + A_scale, + B_scale, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + B.size(1), + B.size(2), + EM, + num_tokens, + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(2), + B.stride(1), + C.stride(1), + C.stride(2), + A_scale.stride(0) if A_scale is not None and A_scale.ndim == 2 else 0, + A_scale.stride(1) if A_scale is not None and A_scale.ndim == 2 else 0, + B_scale.stride(0) if B_scale is not None and B_scale.ndim >= 2 else 0, + B_scale.stride(2) if B_scale is not None and B_scale.ndim == 3 else 0, + B_scale.stride(1) if B_scale is not None and B_scale.ndim >= 2 else 0, + B_bias.stride(0) if B_bias is not None else 0, + B_bias.stride(1) if B_bias is not None else 0, + 0 if block_shape is None else block_shape[0], + 0 if block_shape is None else block_shape[1], + MUL_ROUTED_WEIGHT=mul_routed_weight, + top_k=top_k, + compute_type=compute_type, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + 
use_int8_w8a16=use_int8_w8a16, + per_channel_quant=per_channel_quant, + HAS_BIAS=HAS_BIAS, + BLOCK_SIZE_K=BLOCK_SIZE_K, + **config, + ) + + +@triton.jit +def compute_identity_kernel( + top_k: int, + hidden_states_ptr: tl.tensor, + expert_scales_ptr: tl.tensor, + num_tokens: int, + output_ptr: tl.tensor, + hidden_dim: int, + scales_stride: int, + BLOCK_SIZE: tl.constexpr, +) -> None: + pid = tl.program_id(0) + + batch_id = pid // (hidden_dim // BLOCK_SIZE) + dim_offset = pid % (hidden_dim // BLOCK_SIZE) * BLOCK_SIZE + + if batch_id >= num_tokens or dim_offset >= hidden_dim: + return + + h = tl.load( + hidden_states_ptr + + batch_id * hidden_dim + + dim_offset + + tl.arange(0, BLOCK_SIZE), + mask=(dim_offset + tl.arange(0, BLOCK_SIZE)) < hidden_dim, + ) + + result = tl.zeros([BLOCK_SIZE], dtype=tl.float32) + for i in range(top_k): + scale = tl.load(expert_scales_ptr + batch_id * scales_stride + i) + result += h * scale + + tl.store( + output_ptr + batch_id * hidden_dim + dim_offset + tl.arange(0, BLOCK_SIZE), + result, + mask=(dim_offset + tl.arange(0, BLOCK_SIZE)) < hidden_dim, + ) + + +def zero_experts_compute_triton( + expert_indices: torch.Tensor, + expert_scales: torch.Tensor, + num_experts: int, + zero_expert_type: str, + hidden_states: torch.Tensor, +) -> torch.Tensor: + N = expert_indices.numel() + top_k = expert_indices.size(-1) + grid = lambda meta: (triton.cdiv(N, meta["BLOCK_SIZE"]),) + + if zero_expert_type == "identity": + zero_expert_mask = expert_indices < num_experts + zero_expert_scales = expert_scales.clone() + zero_expert_scales[zero_expert_mask] = 0.0 + + normal_expert_mask = expert_indices >= num_experts + expert_indices[normal_expert_mask] = 0 + expert_scales[normal_expert_mask] = 0.0 + + output = torch.zeros_like(hidden_states).to(hidden_states.device) + hidden_dim = hidden_states.size(-1) + num_tokens = hidden_states.size(0) + + grid = lambda meta: (num_tokens * (hidden_dim // meta["BLOCK_SIZE"]),) + compute_identity_kernel[grid]( + top_k, + 
hidden_states, + zero_expert_scales, + num_tokens, + output, + hidden_dim, + zero_expert_scales.stride(0), + BLOCK_SIZE=256, + ) + + return output + + +# Adapted from: https://github.com/sgl-project/sglang/pull/2628 +def get_config_file_name( + E: int, N: int, dtype: str | None, block_shape: list[int] | None = None +) -> str: + device_name = current_platform.get_device_name().replace(" ", "_") + # Set device_name to H200 if a device from the H200 family is detected + if "H200" in device_name.split("_"): + device_name = "NVIDIA_H200" + dtype_selector = "" if not dtype else f",dtype={dtype}" + block_shape_selector = ( + "" if not block_shape or not all(block_shape) else f",block_shape={block_shape}" + ).replace(" ", "") + return f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}.json" # noqa: E501 + + +# Adapted from: https://github.com/sgl-project/sglang/pull/2628 +@functools.lru_cache +def get_moe_configs( + E: int, + N: int, + dtype: str | None, + block_n: int | None = None, + block_k: int | None = None, +) -> dict[int, Any] | None: + """ + Return optimized configurations for the fused MoE kernel. + + The return value will be a dictionary that maps an irregular grid of + batch sizes to configurations of the fused_moe kernel. To evaluate the + kernel on a given batch size bs, the closest batch size in the grid should + be picked and the associated configuration chosen to invoke the kernel. + """ + + # Avoid optimizing for the batch invariant case. 
Use default config + if vllm_is_batch_invariant(): + return None + + # First look up if an optimized configuration is available in the configs + # directory + block_shape = [block_n, block_k] if block_n and block_k else None + json_file_name = get_config_file_name(E, N, dtype, block_shape) + + config_file_paths = [] + + # note that we prioritize user defined config + user_defined_config_folder = envs.VLLM_TUNED_CONFIG_FOLDER + if user_defined_config_folder is not None: + user_defined_config_file_path = os.path.join( + user_defined_config_folder, json_file_name + ) + config_file_paths.append(user_defined_config_file_path) + + default_config_file_path = os.path.join( + os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name + ) + config_file_paths.append(default_config_file_path) + + for config_file_path in config_file_paths: + if os.path.exists(config_file_path): + with open(config_file_path) as f: + logger.info( + "Using configuration from %s for MoE layer.", config_file_path + ) + # If a configuration has been found, return it + tuned_config = json.load(f) + # Delete triton_version from tuned_config + tuned_config.pop("triton_version", None) + return {int(key): val for key, val in tuned_config.items()} + + # If no optimized configuration is available, we will use the default + # configuration + logger.warning( + ( + "Using default MoE config. Performance might be sub-optimal! 
" + "Config file not found at %s" + ), + config_file_paths, + ) + return None + + +def get_moe_wna16_block_config( + config: dict[str, int], + use_moe_wna16_cuda: bool, + num_valid_tokens: int, + size_k: int, + size_n: int, + num_experts: int, + group_size: int, + real_top_k: int, + block_size_m: int, +): + if "BLOCK_SIZE_N" in config and "BLOCK_SIZE_K" in config: + # optimal block config is set + return {} + if not use_moe_wna16_cuda: + # triton moe wna16 kernel + if num_valid_tokens // real_top_k == 1: + # if bs=1, use a smaller BLOCK_SIZE_N + return {"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64} + else: + return {"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32} + else: + # cuda moe wna16 kernel + # set default block_size 128, and increase them when num_blocks + # is too large. + block_size_n = 128 + block_size_k = 128 + if block_size_k <= group_size: + block_size_k = group_size + + num_n_blocks = size_k // block_size_k + num_k_blocks = size_n // block_size_k + num_m_blocks = ( + num_valid_tokens + block_size_m - 1 + ) / block_size_m + num_experts + if num_valid_tokens // real_top_k <= block_size_m: + num_m_blocks = min(num_m_blocks, num_valid_tokens) + num_blocks = num_m_blocks * num_n_blocks * num_k_blocks + + if size_k % 256 == 0 and num_blocks >= 256 and block_size_k < 256: + block_size_k = 256 + num_blocks = num_blocks // (256 // block_size_k) + + if ( + num_m_blocks <= 16 + and size_k % (block_size_k * 2) == 0 + and size_k % (block_size_k * 2) == 0 + and block_size_k <= 512 + and num_blocks >= 512 + ): + block_size_k = block_size_k * 2 + num_blocks = num_blocks // 2 + + if num_blocks > 1024: + block_size_n = 256 + num_n_blocks = num_n_blocks // 2 + num_blocks = num_blocks // 2 + + if size_n <= 1024 and num_blocks >= 1024: + # The kernel performance got much better with BLOCK_SIZE_N=1024 + # when num_blocks is large, event when N is small. + # Not sure why, maybe it force the CUDA SM process only one block + # at the same time. 
+ block_size_n = 1024 + + return {"BLOCK_SIZE_N": block_size_n, "BLOCK_SIZE_K": block_size_k} + + +def should_moe_wna16_use_cuda( + num_valid_tokens: int, group_size: int, num_experts: int, bit: int +): + return ( + current_platform.is_cuda() + and bit == 4 + and group_size in [32, 64, 128] + and num_valid_tokens / num_experts <= 6 + ) + + +def get_default_config( + M: int, + E: int, + N: int, + K: int, + topk: int, + dtype: str | None, + block_shape: list[int] | None = None, +) -> dict[str, int]: + if vllm_is_batch_invariant(): + config = { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + "SPLIT_K": 1, + } + return config + + if dtype == "fp8_w8a8" and block_shape is not None: + # Block-wise quant: BLOCK_SIZE_N must be divisible by block_shape[0] + # BLOCK_SIZE_K must be divisible by block_shape[1] + # num_stages=3 can cause triton.runtime.errors.OutOfResources + # on ROCm, set it to 2 instead. + config = { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": block_shape[0], + "BLOCK_SIZE_K": block_shape[1], + "GROUP_SIZE_M": 32, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 3 if not current_platform.is_rocm() else 2, + } + elif dtype in ["int4_w4a16", "int8_w8a16"] and block_shape is not None: + # moe wna16 kernels + # only set BLOCK_SIZE_M + # BLOCK_SIZE_N and BLOCK_SIZE_K would be set later + bit = 4 if dtype == "int4_w4a16" else 8 + use_moe_wna16_cuda = should_moe_wna16_use_cuda(M * topk, block_shape[1], E, bit) + if use_moe_wna16_cuda: + config = {"BLOCK_SIZE_M": min(16, M), "SPLIT_K": 1} + elif M <= 20: + config = {"BLOCK_SIZE_M": 16, "GROUP_SIZE_M": 1, "SPLIT_K": 1} + elif M <= 40: + config = {"BLOCK_SIZE_M": 32, "GROUP_SIZE_M": 1, "SPLIT_K": 1} + else: + config = {"BLOCK_SIZE_M": 64, "GROUP_SIZE_M": 1, "SPLIT_K": 1} + elif M <= E: + config = { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + } + else: + config = { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + "SPLIT_K": 1, + } + numel = M * topk + if numel <= 64: + config['BLOCK_SIZE_M'] = 32 + elif numel <= 1024: + config['BLOCK_SIZE_M'] = 64 + else: + config['BLOCK_SIZE_M'] = 256 + return config + + +def try_get_optimal_moe_config( + w1_shape: tuple[int, ...], + w2_shape: tuple[int, ...], + top_k: int, + dtype: str | None, + M: int, + block_shape: list[int] | None = None, +) -> dict[str, int]: + from vllm.model_executor.layers.fused_moe import get_config + + override_config = get_config() + if override_config: + config = override_config + else: + # First try to load optimal config from the file + E, _, N = w2_shape + # block_n = block_shape[0] if block_shape else 0 + # block_k = block_shape[1] if block_shape else 0 + # configs = get_moe_configs(E, N, dtype, block_n, block_k) + + configs = None + + if configs: + # If an optimal configuration map has been found, look up the + # optimal config + config = configs[min(configs.keys(), key=lambda x: abs(x - M))] + else: + # Else use the default config + config = get_default_config(M, E, N, w1_shape[2], top_k, dtype, block_shape) + return config + + +def vllm_topk_softmax( + topk_weights: torch.Tensor, + topk_indices: torch.Tensor, + token_expert_indices: torch.Tensor, + gating_output: torch.Tensor, + renormalize: bool, +) -> tuple[torch.Tensor, ...]: + ops.topk_softmax( + topk_weights, + topk_indices, + token_expert_indices, + gating_output, + ) + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + + return topk_weights, topk_indices + + +def dispatch_topk_func( + use_rocm_aiter: bool = False, +) -> Callable[..., tuple[torch.Tensor, ...]]: + if use_rocm_aiter: + return rocm_aiter_ops.topk_softmax + return vllm_topk_softmax + + +def fused_topk( + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + indices_type: torch.dtype | None = None, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + assert 
hidden_states.size(0) == gating_output.size(0), "Number of tokens mismatch" + + M, _ = hidden_states.size() + + topk_weights = torch.empty( + M, topk, dtype=torch.float32, device=hidden_states.device + ) + topk_ids = torch.empty( + M, + topk, + dtype=torch.int32 if indices_type is None else indices_type, + device=hidden_states.device, + ) + token_expert_indices = torch.empty( + M, topk, dtype=torch.int32, device=hidden_states.device + ) + + gating_output_float = gating_output.float() # TODO(woosuk): Optimize this. + + topk_func = dispatch_topk_func(use_rocm_aiter=rocm_aiter_ops.is_fused_moe_enabled()) + topk_weights, topk_ids = topk_func( + topk_weights, topk_ids, token_expert_indices, gating_output_float, renormalize + ) + + return topk_weights, topk_ids, token_expert_indices + + +def fused_topk_bias( + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + e_score_correction_bias: torch.Tensor, + topk: int, + renormalize: bool, +): + n_routed_experts = gating_output.shape[-1] + scores = gating_output.softmax(dim=-1) + scores_for_choice = scores.view( + -1, n_routed_experts + ) + e_score_correction_bias.unsqueeze(0) + + # For batch invariance, use sorted=True to ensure deterministic expert selection + use_sorted = vllm_is_batch_invariant() + topk_indices = torch.topk(scores_for_choice, k=topk, dim=-1, sorted=use_sorted)[1] + topk_weights = scores.gather(1, topk_indices) + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + return topk_weights.to(torch.float32), topk_indices.to(torch.int32) + + +# This is used by the Deepseek-V2 and Deepseek-V3 model +@torch.compile( + dynamic=True, + backend=current_platform.simple_compile_backend, + options=maybe_disable_graph_partition(current_platform.simple_compile_backend), +) +def grouped_topk( + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + num_expert_group: int = 0, + topk_group: int = 0, + scoring_func: str = "softmax", + 
routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + if ( + envs.VLLM_USE_FUSED_MOE_GROUPED_TOPK + and current_platform.is_cuda() + and num_expert_group <= 32 + and topk <= 32 + and e_score_correction_bias is not None + ): + return fused_grouped_topk( + hidden_states=hidden_states, + gating_output=gating_output, + topk=topk, + renormalize=renormalize, + e_score_correction_bias=e_score_correction_bias, + num_expert_group=num_expert_group, + topk_group=topk_group, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + ) + + assert hidden_states.size(0) == gating_output.size(0), "Number of tokens mismatch" + + if scoring_func == "softmax": + scores = torch.softmax(gating_output, dim=-1) + elif scoring_func == "sigmoid": + scores = gating_output.sigmoid() + else: + raise ValueError(f"Unsupported scoring function: {scoring_func}") + + num_token = scores.size(0) + if e_score_correction_bias is not None: + # Store original scores before applying correction bias. 
We use biased + # scores for expert selection but original scores for routing weights + original_scores = scores + scores = scores + e_score_correction_bias.unsqueeze(0) + group_scores = ( + scores.view(num_token, num_expert_group, -1).topk(2, dim=-1)[0].sum(dim=-1) + ) + else: + group_scores = ( + scores.view(num_token, num_expert_group, -1).max(dim=-1).values + ) # [n, n_group] + + # For batch invariance, use sorted=True to ensure deterministic expert selection + use_sorted = vllm_is_batch_invariant() + group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=use_sorted)[ + 1 + ] # [n, top_k_group] + group_mask = torch.zeros_like(group_scores) # [n, n_group] + group_mask.scatter_(1, group_idx, 1) # [n, n_group] + score_mask = ( + group_mask.unsqueeze(-1) + .expand(num_token, num_expert_group, scores.size(-1) // num_expert_group) + .reshape(num_token, -1) + ) # [n, e] + tmp_scores = scores.masked_fill(~score_mask.bool(), float("-inf")) # [n, e] + + if e_score_correction_bias is not None: + topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=use_sorted)[1] + # Use original unbiased scores for the routing weights + topk_weights = original_scores.gather(1, topk_ids) + else: + topk_weights, topk_ids = torch.topk( + tmp_scores, k=topk, dim=-1, sorted=use_sorted + ) + + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + + if routed_scaling_factor != 1.0: + topk_weights = topk_weights * routed_scaling_factor + return topk_weights.to(torch.float32), topk_ids.to(torch.int32) +from ixformer.inference.functions import moe_grouped_topk as grouped_topk + + +@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) +def eplb_map_to_physical_and_record( + topk_ids: torch.Tensor, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + indices_type: torch.dtype | None = None, +) -> torch.Tensor: + """ + Map the logical expert ids to physical expert 
ids + and record the expert load metrics. + + This will select a pseudo-random replica for each logical expert. + Only used for EPLB. + + Args: + topk_ids: The logical expert ids. + expert_load_view: The expert load view. + logical_to_physical_map: The logical to physical map. + logical_replica_count: The logical replica count. + indices_type: The indices type. + + Returns: + The physical expert ids. + """ + + # 1. Convert the logical expert ids to physical expert ids + # Directly select a random replica for each logical expert + + # In case `indices_type` is not `torch.long` or `torch.int`, + # e.g. `torch.uint32` as required by dispatch/combine kernels + topk_ids_long = topk_ids.long() + # Use (token position) modulo (replica count) + # to deterministically choose a replica + replica_count = logical_replica_count[topk_ids_long] + # Flatten-position based index, reshaped back to `topk_ids` shape + pos_indices = torch.arange( + topk_ids.numel(), device=topk_ids.device, dtype=torch.long + ).reshape_as(topk_ids) + # Compute pseudo-random indices by modulo + replica_indices = (pos_indices % replica_count).unsqueeze(-1) + physical_ids = ( + logical_to_physical_map[topk_ids_long].gather(-1, replica_indices).squeeze(-1) + ) + + topk_ids = physical_ids + + # 2. Record expert load metrics. + + # TODO(bowen): When using `FusedMoEModularKernel`, this + # can be done in a more unified way, since + # `FusedMoEPrepareAndFinalize` will return the expert + # token count, in some cases directly from the kernel. + # However, now there are many code paths not using + # the modular kernel, e.g. calling `fused_experts`, + # so we decide to keep the logic here. + # + # If later refactor moved all the MoE kernel calls + # to the modular kernel, we can move this logic there + # to achieve better efficiency. + + # `expert_load_view`: (num_physical_experts,) + + # `torch.bincount` is not compilable, so use `scatter_add_` instead. 
+ topk_ids_flatten = topk_ids.flatten() + expert_load_view.scatter_add_( + dim=0, + index=topk_ids_flatten.long(), + src=torch.ones_like(topk_ids_flatten).to(expert_load_view), + ) + + if indices_type is not None: + topk_ids = topk_ids.to(dtype=indices_type) + return topk_ids + + +def fused_grouped_topk( + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + e_score_correction_bias: torch.Tensor, + num_expert_group: int = 0, + topk_group: int = 0, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, +) -> tuple[torch.Tensor, torch.Tensor]: + assert hidden_states.size(0) == gating_output.size(0), "Number of tokens mismatch" + + if scoring_func == "sigmoid": + # Fully fused kernel path for sigmoid + topk_values, topk_indices = ops.grouped_topk( + gating_output, # raw logits + num_expert_group, + topk_group, + topk, + renormalize, + routed_scaling_factor, + e_score_correction_bias.to(gating_output.dtype), + 1, # scoring_func=1 for sigmoid + ) + elif scoring_func == "softmax": + # Apply softmax in Python, then use fused kernel + # TODO: Add support for softmax in kernel + scores = torch.softmax(gating_output, dim=-1) + topk_values, topk_indices = ops.grouped_topk( + scores, # pre-computed scores + num_expert_group, + topk_group, + topk, + renormalize, + routed_scaling_factor, + e_score_correction_bias.to(gating_output.dtype), + 0, # scoring_func=0 (no activation, scores already computed) + ) + else: + raise ValueError(f"Unsupported scoring function: {scoring_func}") + + # Fused kernel outputs float32 values and int32 indices directly + return topk_values, topk_indices + + +def inplace_fused_experts( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str = "silu", + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: 
bool = False, + ocp_mx_scheme: str | None = None, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + w1_scale: torch.Tensor | None = None, + w2_scale: torch.Tensor | None = None, + w1_zp: torch.Tensor | None = None, + w2_zp: torch.Tensor | None = None, + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, + block_shape: list[int] | None = None, + w1_bias: torch.Tensor | None = None, + w2_bias: torch.Tensor | None = None, +) -> None: + return fused_experts_impl( + hidden_states, + w1, + w2, + topk_weights, + topk_ids, + True, + activation, + apply_router_weight_on_input, + use_fp8_w8a8, + use_int8_w8a8, + use_int8_w8a16, + use_int4_w4a16, + ocp_mx_scheme, + per_channel_quant, + global_num_experts, + expert_map, + w1_scale, + w2_scale, + w1_zp, + w2_zp, + a1_scale, + a2_scale, + block_shape, + w1_bias, + w2_bias, + ) + + +def inplace_fused_experts_fake( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str = "silu", + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + ocp_mx_scheme: str | None = None, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + w1_scale: torch.Tensor | None = None, + w2_scale: torch.Tensor | None = None, + w1_zp: torch.Tensor | None = None, + w2_zp: torch.Tensor | None = None, + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, + block_shape: list[int] | None = None, + w1_bias: torch.Tensor | None = None, + w2_bias: torch.Tensor | None = None, +) -> torch.Tensor: + return torch.empty_like(hidden_states) + + +direct_register_custom_op( + op_name="inplace_fused_experts", + op_func=inplace_fused_experts, + mutates_args=["hidden_states"], + 
fake_impl=inplace_fused_experts_fake, + tags=( + () + if is_torch_equal_or_newer("2.7.0") + else (torch.Tag.needs_fixed_stride_order,) + ), +) + + +def outplace_fused_experts( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str = "silu", + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + ocp_mx_scheme: str | None = None, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + w1_scale: torch.Tensor | None = None, + w2_scale: torch.Tensor | None = None, + w1_zp: torch.Tensor | None = None, + w2_zp: torch.Tensor | None = None, + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, + block_shape: list[int] | None = None, + w1_bias: torch.Tensor | None = None, + w2_bias: torch.Tensor | None = None, +) -> torch.Tensor: + return fused_experts_impl_opt( + hidden_states, + w1, + w2, + topk_weights, + topk_ids, + activation, + apply_router_weight_on_input, + use_fp8_w8a8, + use_int8_w8a8, + use_int8_w8a16, + use_int4_w4a16, + ocp_mx_scheme, + per_channel_quant, + global_num_experts, + expert_map, + w1_scale, + w2_scale, + w1_zp, + w2_zp, + a1_scale, + a2_scale, + block_shape, + w1_bias, + w2_bias, + ) + + +def outplace_fused_experts_fake( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str = "silu", + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + ocp_mx_scheme: str | None = None, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + w1_scale: torch.Tensor | None = None, + w2_scale: torch.Tensor | None = None, + w1_zp: torch.Tensor | None = None, + w2_zp: 
torch.Tensor | None = None, + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, + block_shape: list[int] | None = None, + w1_bias: torch.Tensor | None = None, + w2_bias: torch.Tensor | None = None, +) -> torch.Tensor: + return torch.empty_like(hidden_states) + + +direct_register_custom_op( + op_name="outplace_fused_experts", + op_func=outplace_fused_experts, + fake_impl=outplace_fused_experts_fake, + tags=( + () + if is_torch_equal_or_newer("2.7.0") + else (torch.Tag.needs_fixed_stride_order,) + ), +) + + +def torch_vllm_inplace_fused_experts(**kwargs) -> torch.Tensor: + return inplace_fused_experts(**kwargs) + + +def torch_vllm_outplace_fused_experts(**kwargs) -> torch.Tensor: + return outplace_fused_experts(**kwargs) + + +def dispatch_fused_experts_func(inplace: bool) -> Callable[..., torch.Tensor]: + if inplace and not disable_inplace(): + return torch_vllm_inplace_fused_experts + return torch_vllm_outplace_fused_experts + + +# TODO (bnell): replace this with modular op. Can get rid of inplace/outplace +# torch ops. +def fused_experts( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + inplace: bool = False, + activation: str = "silu", + apply_router_weight_on_input: bool = False, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + quant_config: FusedMoEQuantConfig | None = None, + allow_deep_gemm: bool = False, + allow_cutlass_block_scaled_grouped_gemm: bool = False, +) -> torch.Tensor: + if quant_config is None: + quant_config = FUSED_MOE_UNQUANTIZED_CONFIG + use_fp8_w8a8 = quant_config.use_fp8_w8a8 + + # For now, disable DeepGemm for small N (<= 512) until better + # permute/unpermute ops are available. + # However, on B200, we use DeepGemm for all cases because they only support + # E8M0 scale, which means we requantize the weight and input to the specific + # scale. 
Fallen back to cutlass or triton for some cases would cause + # accuracy issue. + if ( + allow_deep_gemm + and quant_config.use_fp8_w8a8 + and (is_deep_gemm_e8m0_used() or _valid_deep_gemm(hidden_states, w1, w2)) + ): + assert quant_config is not None + assert apply_router_weight_on_input is False + return deep_gemm_moe_fp8( + hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=inplace, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=quant_config.w1_scale, + w2_scale=quant_config.w2_scale, + a1_scale=quant_config.a1_scale, + a2_scale=quant_config.a2_scale, + apply_router_weight_on_input=apply_router_weight_on_input, + ) + elif ( + allow_cutlass_block_scaled_grouped_gemm + and use_fp8_w8a8 + and _valid_cutlass_block_scaled_grouped_gemm( + w1, w2, inplace, activation, apply_router_weight_on_input, expert_map + ) + ): + assert quant_config is not None + return run_cutlass_block_scaled_fused_experts( + a=hidden_states, + w1=w1, + w2=w2, + w1_scale=quant_config.w1_scale, + w2_scale=quant_config.w2_scale, + topk_weights=topk_weights, + topk_ids=topk_ids, + ) + else: + return dispatch_fused_experts_func(inplace)( + hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + use_fp8_w8a8=quant_config.use_fp8_w8a8, + use_int8_w8a8=quant_config.use_int8_w8a8, + use_int8_w8a16=quant_config.use_int8_w8a16, + use_int4_w4a16=quant_config.use_int4_w4a16, + ocp_mx_scheme=quant_config.ocp_mx_scheme, + per_channel_quant=quant_config.per_act_token_quant, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=quant_config.w1_scale, + w2_scale=quant_config.w2_scale, + w1_zp=quant_config.w1_zp, + w2_zp=quant_config.w2_zp, + a1_scale=quant_config.a1_scale, + a2_scale=quant_config.a2_scale, + block_shape=quant_config.block_shape, + 
w1_bias=quant_config.w1_bias, + w2_bias=quant_config.w2_bias, + ) + + +SILU_NO_MUL: str = activation_without_mul("silu") +GELU_NO_MUL: str = activation_without_mul("gelu") +RELU2_NO_MUL: str = activation_without_mul("relu2") + +# 这个方法参考了compressed_tensors_moe实现 +def fused_experts_impl_opt( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str = "silu", + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + ocp_mx_scheme: str | None = None, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + w1_scale: torch.Tensor | None = None, + w2_scale: torch.Tensor | None = None, + w1_zp: torch.Tensor | None = None, + w2_zp: torch.Tensor | None = None, + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, + block_shape: torch.Tensor | None = None, + w1_bias: torch.Tensor | None = None, + w2_bias: torch.Tensor | None = None, + output: torch.Tensor | None = None +) -> torch.Tensor: + # check constraints + if use_fp8_w8a8 or use_int8_w8a8 or use_int8_w8a16 or use_int4_w4a16 or w1_scale or \ + w2_scale or w1_zp or w2_zp or a1_scale or a2_scale: + raise ValueError("Quantized MoE is not supported") + + attn_metadata = get_forward_context().attn_metadata + use_ep = expert_map is not None + + # unsupported ep now + if attn_metadata: + only_decode = (use_ep == False and all(t.num_decodes > 0 and t.num_prefills ==0 for t in list(attn_metadata.values()))) + else: + only_decode = False + + assert topk_weights.size() == topk_ids.size(), "topk shape mismatch" + assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" + assert w1.stride(-1) == 1, "Stride of last dimension must be 1" + assert w2.stride(-1) == 1, "Stride of last dimension must be 1" + assert hidden_states.dtype in [ + 
torch.float32, torch.float16, torch.bfloat16 + ] + + num_tokens = hidden_states.size(0) + num_experts = w1.size(0) + top_k = topk_weights.size(1) + + if use_ep: + local_num_experts = w1.size(0) + start_eid = get_ep_group().device_group.rank() * local_num_experts + end_eid = min((get_ep_group().device_group.rank() + 1) * local_num_experts, global_num_experts) + hidden_size = hidden_states.shape[1] + ( + src_to_dst, + sorted_token_ids, + expert_sizes_gpu, + expert_sizes_cpu, + expand_tokens, + ) = ixfops.moe_compute_token_index_ep( + topk_ids=topk_ids, + num_experts=global_num_experts, + start_expert_id=start_eid, + end_expert_id=end_eid, + ) + if expert_sizes_cpu.sum() == 0: + return torch.zeros( + (num_tokens, hidden_size), + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + else: + expand_tokens = num_tokens * top_k + ( + src_to_dst, + sorted_token_ids, + expert_sizes_gpu, + expert_sizes_cpu, + ) = ixfops.moe_compute_token_index( + topk_ids=topk_ids, + num_experts=num_experts, + ) + + if only_decode: + # expand + reorder + hidden_states = ixfops.moe_expand_input( + hidden_states=hidden_states, + dst_to_src=sorted_token_ids, + dst_tokens=expand_tokens, + topk=top_k, + src_to_dst=src_to_dst, + ) + + # group gemm 1 + pt_output_1 = ixfops.moe_w16a16_group_gemv( + input=hidden_states, + weight=w1, + output_dtype=hidden_states.dtype, + tokens_per_experts_gpu=expert_sizes_gpu, + dst_to_src=None, + bias=w1_bias, + format="TN", + ) + + # act + if activation == "silu": + pt_output_2 = ixfops.silu_and_mul(pt_output_1) + elif activation == "gelu": + pt_output_2 = ixfops.gelu_and_mul(pt_output_1) + elif activation == "swigluoai": + pt_output_2 = ixfops.swigluoai_and_mul(pt_output_1) + else: + raise ValueError(f"Unsupported activation: {activation}") + + # group gemm 2 + reorder + pt_output_3 = ixfops.moe_w16a16_group_gemv( + input=pt_output_2, + weight=w2, + output_dtype=hidden_states.dtype, + tokens_per_experts_gpu=expert_sizes_gpu, + 
dst_to_src=sorted_token_ids, + bias=w2_bias, + format="TN", + ) + + # mul + reduce_sum + final_hidden_states = ixfops.moe_output_reduce_sum( + input=pt_output_3.view(num_tokens, top_k, -1), + topk_weight=topk_weights, + ) + + else: + expert_sizes_cpu = expert_sizes_gpu.cpu() + # expand + reorder + hidden_states = ixfops.moe_expand_input( + hidden_states=hidden_states, + dst_to_src=sorted_token_ids, + dst_tokens=expand_tokens, + topk=top_k, + src_to_dst=src_to_dst, + ) + # group gemm 1 + pt_output_1 = ixfops.moe_w16a16_group_gemm( + input=hidden_states, + weight=w1, + output_dtype=hidden_states.dtype, + tokens_per_experts=expert_sizes_cpu, + dst_to_src=None, + bias=w1_bias, + format="TN", + ) + + # act + if activation == "silu": + pt_output_2 = ixfops.silu_and_mul(pt_output_1) + elif activation == "gelu": + pt_output_2 = ixfops.gelu_and_mul(pt_output_1) + elif activation == "swigluoai": + pt_output_2 = ixfops.swigluoai_and_mul(pt_output_1) + else: + raise ValueError(f"Unsupported activation: {activation}") + + if use_ep: + pt_output_3 = torch.empty( + (num_tokens * top_k, hidden_size), + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + # group gemm 2 + reorder + pt_output_3 = ixfops.moe_w16a16_group_gemm( + input=pt_output_2, + weight=w2, + output_dtype=hidden_states.dtype, + tokens_per_experts=expert_sizes_cpu, + dst_to_src=sorted_token_ids, + format="TN", + bias=w2_bias, + output=pt_output_3, + ) + + # mul + reduce_sum + reduce_mask = src_to_dst == -1 + if output != None: + ixfops.moe_output_reduce_sum( + input=pt_output_3.view(num_tokens, top_k, -1), + topk_weight=topk_weights, + output=output, + mask=reduce_mask, + ) + else: + final_hidden_states = ixfops.moe_output_reduce_sum( + input=pt_output_3.view(num_tokens, top_k, -1), + topk_weight=topk_weights, + mask=reduce_mask, + ) + else: + # group gemm 2 + reorder + pt_output_3 = ixfops.moe_w16a16_group_gemm( + input=pt_output_2, + weight=w2, + output_dtype=hidden_states.dtype, + 
tokens_per_experts=expert_sizes_cpu, + dst_to_src=sorted_token_ids, + bias=w2_bias, + format="TN", + ) + + # mul + reduce_sum + final_hidden_states = ixfops.moe_output_reduce_sum( + input=pt_output_3.view(num_tokens, top_k, -1), + topk_weight=topk_weights, + ) + + if output == None: + return final_hidden_states + + +def _get_config_quant_dtype( + use_fp8_w8a8: bool, + use_int8_w8a8: bool, + ocp_mx_scheme: str | None, +) -> None | torch.dtype | str: + """ + Get the quantization type based on the quantization strategy flags. + We don't have a quant_config at this point so we need to work backwards. + A return type of None means no quantization is required because the + input is unquantized or has been quantized prior to calling + fused_experts_impl. + """ + if use_fp8_w8a8: + return torch.float8_e4m3fn + elif use_int8_w8a8: + return torch.int8 + elif ocp_mx_scheme == "w_mxfp4_a_mxfp4": + return "mxfp4" + elif ocp_mx_scheme in {"w_mxfp4_a_mxfp6_e3m2", "w_mxfp6_e3m2_a_mxfp6_e3m2"}: + return "mxfp6_e3m2" + elif ocp_mx_scheme in {"w_mxfp4_a_mxfp6_e2m3", "w_mxfp6_e2m3_a_mxfp6_e2m3"}: + return "mxfp6_e2m3" + return None + + +def fused_experts_impl( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + inplace: bool = False, + activation: str = "silu", + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + ocp_mx_scheme: str | None = None, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + w1_scale: torch.Tensor | None = None, + w2_scale: torch.Tensor | None = None, + w1_zp: torch.Tensor | None = None, + w2_zp: torch.Tensor | None = None, + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, + block_shape: list[int] | None = None, + w1_bias: torch.Tensor | None = None, + w2_bias: torch.Tensor | None 
= None, +) -> torch.Tensor: + # Check constraints. + if use_int4_w4a16: + assert hidden_states.size(1) // 2 == w1.size(2), "Hidden size mismatch" + elif ocp_mx_scheme is not None: + if ocp_mx_scheme in { + "w_mxfp4_a_mxfp4", + "w_mxfp4_a_mxfp6_e3m2", + "w_mxfp4_a_mxfp6_e2m3", + }: + # 16bit activation and fp4x2 packed weight + assert hidden_states.size(1) == w1.size(2) * 2, "hidden size mismatch" + elif ocp_mx_scheme in { + "w_mxfp6_e3m2_a_mxfp6_e3m2", + "w_mxfp6_e2m3_a_mxfp6_e2m3", + }: + assert hidden_states.size(1) == (w1.size(2) * 4) // 3, ( + "hidden size mismatch" + ) + else: + raise NotImplementedError(f"Unsupported ocp_mx_scheme={ocp_mx_scheme}") + else: + assert hidden_states.size(1) == w1.size(2), ( + f"Hidden size mismatch {hidden_states.size(1)} != {w1.size(2)}" + ) + + assert topk_weights.size() == topk_ids.size(), "topk shape mismatch" + assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" + assert w1.stride(-1) == 1, "Stride of last dimension must be 1" + assert w2.stride(-1) == 1, "Stride of last dimension must be 1" + assert hidden_states.dtype in [torch.float32, torch.float16, torch.bfloat16] + + num_tokens = hidden_states.size(0) + E, N, _ = w1.size() + K = w2.size(1) + if global_num_experts == -1: + global_num_experts = E + top_k_num = topk_ids.size(1) + # We execute the fused_moe kernel in chunks to circumvent this issue: + # https://github.com/vllm-project/vllm/issues/5938 + CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE + M = min(num_tokens, CHUNK_SIZE) + + config_dtype = _get_config_dtype_str( + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + ocp_mx_scheme=ocp_mx_scheme, + dtype=hidden_states.dtype, + ) + + # Note: for use_int8_w8a16 or use_int4_w4a16, the activations are + # quantized prior to calling fused_experts. 
+ quant_dtype = _get_config_quant_dtype( + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + ocp_mx_scheme=ocp_mx_scheme, + ) + + get_config_func = functools.partial( + try_get_optimal_moe_config, + w1.size(), + w2.size(), + top_k_num, + config_dtype, + block_shape=block_shape, + ) + + config = get_config_func(M) + + # We can reuse the memory between these because by the time we need + # cache3, we're done with cache1 + cache13 = torch.empty( + M * top_k_num * max(N, K), + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + intermediate_cache1 = cache13[: M * top_k_num * N].view(M, top_k_num, N) + intermediate_cache3 = cache13[: M * top_k_num * K].view(M, top_k_num, K) + + # This needs separate memory since it's used concurrently with cache1 + intermediate_cache2 = torch.empty( + (M * top_k_num, N // 2), device=hidden_states.device, dtype=hidden_states.dtype + ) + + if hidden_states.dtype == torch.bfloat16: + compute_type = tl.bfloat16 + elif hidden_states.dtype == torch.float16: + compute_type = tl.float16 + elif hidden_states.dtype == torch.float32: + compute_type = tl.float32 + else: + raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}") + + + out_hidden_states = torch.empty_like(hidden_states) + + if ocp_mx_scheme is not None: + # TODO: On platforms for which `current_platform.supports_mx()` is True + # and for which we have a native OCP mx fused MOE kernel, + # this dequantization step should not be done. + if ocp_mx_scheme in { + OCP_MX_Scheme.w_mxfp4_a_mxfp4, + OCP_MX_Scheme.w_mxfp4_a_mxfp6_e3m2, + OCP_MX_Scheme.w_mxfp4_a_mxfp6_e2m3, + }: + # Weight has to be dequantized for mxfp4 emulation. 
+ w1 = dequant_mxfp4(w1, w1_scale, hidden_states.dtype) + w1_scale = None + w2 = dequant_mxfp4(w2, w2_scale, hidden_states.dtype) + w2_scale = None + elif ocp_mx_scheme == OCP_MX_Scheme.w_mxfp6_e3m2_a_mxfp6_e3m2: + w1 = dequant_mxfp6( + w1, w1_scale, quant_dtype="fp6_e3m2", float_dtype=hidden_states.dtype + ) + w1_scale = None + w2 = dequant_mxfp6( + w2, w2_scale, quant_dtype="fp6_e3m2", float_dtype=hidden_states.dtype + ) + w2_scale = None + elif ocp_mx_scheme == OCP_MX_Scheme.w_mxfp6_e2m3_a_mxfp6_e2m3: + w1 = dequant_mxfp6( + w1, w1_scale, quant_dtype="fp6_e2m3", float_dtype=hidden_states.dtype + ) + w1_scale = None + w2 = dequant_mxfp6( + w2, w2_scale, quant_dtype="fp6_e2m3", float_dtype=hidden_states.dtype + ) + w2_scale = None + else: + raise NotImplementedError(f"Unsupported ocp_mx_scheme={ocp_mx_scheme}") + + for chunk in range((num_tokens // CHUNK_SIZE) + 1): + begin_chunk_idx, end_chunk_idx = ( + chunk * CHUNK_SIZE, + min((chunk + 1) * CHUNK_SIZE, num_tokens), + ) + curr_hidden_states = hidden_states[begin_chunk_idx:end_chunk_idx] + tokens_in_chunk, _ = curr_hidden_states.size() + + if tokens_in_chunk == 0: + break + + if tokens_in_chunk < CHUNK_SIZE and chunk > 0: + # Adjust the intermediate cache size and config for the last + # chunk. Note that in most cases we only have one chunk + # so the cache size and config are already set correctly and + # do not need to be adjusted. 
+ intermediate_cache1 = intermediate_cache1[:tokens_in_chunk] + intermediate_cache2 = intermediate_cache2[ + : tokens_in_chunk * topk_ids.size(1) + ] + intermediate_cache3 = intermediate_cache3[:tokens_in_chunk] + config = get_config_func(tokens_in_chunk) + + curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx] + curr_topk_weights = topk_weights[begin_chunk_idx:end_chunk_idx] + qcurr_hidden_states, a1q_scale = moe_kernel_quantize_input( + A=curr_hidden_states, + A_scale=a1_scale, + quant_dtype=quant_dtype, + per_act_token_quant=per_channel_quant, + block_shape=block_shape, + ) + + sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size( + curr_topk_ids, config["BLOCK_SIZE_M"], global_num_experts, expert_map + ) + + invoke_fused_moe_kernel( + qcurr_hidden_states, + w1, + intermediate_cache1, + a1q_scale, + w1_scale, + w1_zp, + curr_topk_weights, + curr_topk_ids, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + apply_router_weight_on_input, + top_k_num, + config, + compute_type=compute_type, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + per_channel_quant=per_channel_quant, + block_shape=block_shape, + B_bias=w1_bias, + ) + + if activation == "silu": + ops.silu_and_mul(intermediate_cache2, + intermediate_cache1.view(-1, N)) + elif activation == "gelu": + ops.gelu_and_mul(intermediate_cache2, + intermediate_cache1.view(-1, N)) + elif activation == "swigluoai": + # alpha = 1.702, limit = 7.0 + ops.swigluoai_and_mul(intermediate_cache2, + intermediate_cache1.view(-1, N)) + # Activation function without multiplication + elif activation == SILU_NO_MUL: + intermediate_cache2 = F.silu(intermediate_cache1.view(-1, N)) + elif activation == GELU_NO_MUL: + intermediate_cache2 = F.gelu(intermediate_cache1.view(-1, N)) + elif activation == RELU2_NO_MUL: + intermediate_cache2 = torch.square(F.relu(intermediate_cache1.view(-1, N))) + else: + raise 
class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
    """Modular-kernel experts implementation backed by `fused_experts_impl_opt`.

    Both activation formats are `Standard`; chunking and expert maps are
    reported as supported, and weight application/reduction is declared a
    no-op via `TopKWeightAndReduceNoOP`.
    """

    def __init__(
        self,
        quant_config: FusedMoEQuantConfig,
    ):
        super().__init__(quant_config)

    @property
    def activation_formats(
        self,
    ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
        """Return the pair of activation formats, both `Standard`."""
        standard = mk.FusedMoEActivationFormat.Standard
        return (standard, standard)

    def supports_chunking(self) -> bool:
        """Report that chunked execution is supported."""
        return True

    def supports_expert_map(self) -> bool:
        """Report that an expert map is supported."""
        return True

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        """Return the no-op top-k weight-and-reduce implementation."""
        return TopKWeightAndReduceNoOP()

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        """Return the `(workspace1, workspace2, output)` shapes for the given sizes."""
        first_workspace = (M, topk, max(N // 2, K))
        second_workspace = (M, topk, max(N, K))
        output_shape = (M, K)
        return (first_workspace, second_workspace, output_shape)

    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: str,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ):
        """Delegate to `fused_experts_impl_opt`, passing `output` as destination.

        Quantization settings, scales, zero points, block shape and biases
        are taken from `self.quant_config`. The `a1q_scale`, `a2_scale`,
        `workspace13`, `workspace2` and `expert_tokens_meta` arguments are
        not used by this method.
        """
        cfg = self.quant_config
        fused_experts_impl_opt(
            hidden_states=hidden_states,
            w1=w1,
            w2=w2,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            activation=activation,
            apply_router_weight_on_input=apply_router_weight_on_input,
            use_fp8_w8a8=cfg.use_fp8_w8a8,
            use_int8_w8a8=cfg.use_int8_w8a8,
            use_int8_w8a16=cfg.use_int8_w8a16,
            use_int4_w4a16=cfg.use_int4_w4a16,
            ocp_mx_scheme=cfg.ocp_mx_scheme,
            per_channel_quant=cfg.per_act_token_quant,
            global_num_experts=global_num_experts,
            expert_map=expert_map,
            w1_scale=cfg.w1_scale,
            w2_scale=cfg.w2_scale,
            w1_zp=cfg.w1_zp,
            w2_zp=cfg.w2_zp,
            a1_scale=cfg.a1_scale,
            a2_scale=cfg.a2_scale,
            block_shape=cfg.block_shape,
            w1_bias=cfg.w1_bias,
            w2_bias=cfg.w2_bias,
            output=output,
        )
import ( + FusedMoEPermuteExpertsUnpermute, + FusedMoEPrepareAndFinalize, +) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizeMethodBase, +) + +logger = init_logger(__name__) + + +class FusedMoEMethodBase(QuantizeMethodBase): + def __init__(self, moe: FusedMoEConfig): + super().__init__() + self.moe: FusedMoEConfig = moe + self.moe_quant_config: FusedMoEQuantConfig | None = None + + @abstractmethod + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + raise NotImplementedError + + def uses_weight_scale_2_pattern(self) -> bool: + """ + Returns True if this quantization method uses 'weight_scale_2' pattern + for per-tensor weight scales (e.g., FP4 variants), False otherwise. + + This method should be overridden by subclasses that use the + 'weight_scale_2' pattern instead of the standard 'weight_scale' pattern. + """ + return False + + def maybe_make_prepare_finalize(self) -> FusedMoEPrepareAndFinalize | None: + from .all2all_utils import maybe_make_prepare_finalize + + return maybe_make_prepare_finalize(self.moe, self.moe_quant_config) + + def select_gemm_impl( + self, + prepare_finalize: FusedMoEPrepareAndFinalize, + layer: torch.nn.Module, + ) -> FusedMoEPermuteExpertsUnpermute: + # based on the all2all implementation, select the appropriate + # gemm implementation + raise NotImplementedError( + f"{self.__class__.__name__} must select appropriate gemm " + "implementation based on the prepare_finalize" + ) + + @abstractmethod + def get_fused_moe_quant_config( + self, layer: torch.nn.Module + ) -> FusedMoEQuantConfig | None: + raise NotImplementedError + + @property + def topk_indices_dtype(self) -> torch.dtype | None: + return None + + @property + def supports_eplb(self) -> bool: + return False + + @property + def allow_inplace(self) -> bool: + return False + + @abstractmethod + def apply( + self, 
@CustomOp.register("modular_fused_moe")
class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
    """MoE method that runs experts through a ``FusedMoEModularKernel``.

    It wraps an existing quantization method: the MoE config and quant
    config are taken from the wrapped method, and ``supports_eplb`` /
    ``allow_inplace`` are delegated to it.
    """

    def __init__(
        self, old_quant_method: FusedMoEMethodBase, experts: FusedMoEModularKernel
    ):
        super().__init__(old_quant_method.moe)
        self.moe_quant_config = old_quant_method.moe_quant_config
        self.fused_experts = experts
        # The wrapped method may override this; otherwise the expert map is
        # disabled exactly when the kernel does not support one.
        fallback_disable = not self.fused_experts.supports_expert_map()
        self.disable_expert_map = getattr(
            old_quant_method, "disable_expert_map", fallback_disable
        )
        self.old_quant_method = old_quant_method
        logger.debug("Swapping out %s", self.old_quant_method.__class__.__name__)

    @staticmethod
    def make(
        moe_layer: torch.nn.Module,
        old_quant_method: FusedMoEMethodBase,
        prepare_finalize: FusedMoEPrepareAndFinalize,
        shared_experts: torch.nn.Module | None,
    ) -> "FusedMoEModularMethod":
        """Build a modular method around ``old_quant_method``.

        The expert implementation is obtained from
        ``old_quant_method.select_gemm_impl`` and combined with
        ``prepare_finalize`` and ``shared_experts`` into a modular kernel.
        """
        gemm_impl = old_quant_method.select_gemm_impl(prepare_finalize, moe_layer)
        kernel = FusedMoEModularKernel(prepare_finalize, gemm_impl, shared_experts)
        return FusedMoEModularMethod(old_quant_method, kernel)

    @property
    def topk_indices_dtype(self) -> torch.dtype | None:
        """Dtype for top-k indices as reported by the prepare/finalize stage."""
        return self.fused_experts.prepare_finalize.topk_indices_dtype()

    @property
    def supports_eplb(self) -> bool:
        """Whether the wrapped quantization method supports EPLB."""
        return self.old_quant_method.supports_eplb

    @property
    def allow_inplace(self) -> bool:
        """Whether the wrapped quantization method allows in-place output."""
        return self.old_quant_method.allow_inplace

    def create_weights(
        self,
        layer: torch.nn.Module,
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        """Not implemented for this wrapper; always raises."""
        raise NotImplementedError

    def get_fused_moe_quant_config(
        self, layer: torch.nn.Module
    ) -> FusedMoEQuantConfig | None:
        """Return the quant config copied from the wrapped method."""
        return self.moe_quant_config

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
        renormalize: bool,
        use_grouped_topk: bool = False,
        topk_group: int | None = None,
        num_expert_group: int | None = None,
        global_num_experts: int = -1,
        expert_map: torch.Tensor | None = None,
        custom_routing_function: Callable | None = None,
        scoring_func: str = "softmax",
        routed_scaling_factor: float = 1.0,
        e_score_correction_bias: torch.Tensor | None = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
        enable_eplb: bool = False,
        expert_load_view: torch.Tensor | None = None,
        logical_to_physical_map: torch.Tensor | None = None,
        logical_replica_count: torch.Tensor | None = None,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        """Select experts for ``x`` and run them through the modular kernel.

        Returns the kernel result, or ``(result, zero_expert_result)`` when
        the layer declares zero experts (non-zero ``zero_expert_num`` and a
        non-None ``zero_expert_type``).

        Raises:
            NotImplementedError: if ``enable_eplb`` is set but the wrapped
                quantization method does not support EPLB.
        """
        # Is getattr needed?
        zero_expert_num = getattr(layer, "zero_expert_num", 0)
        zero_expert_type = getattr(layer, "zero_expert_type", None)

        if enable_eplb:
            if not self.supports_eplb:
                raise NotImplementedError(
                    "EPLB is not supported for "
                    f"{self.old_quant_method.__class__.__name__}."
                )
            assert expert_load_view is not None
            assert logical_to_physical_map is not None
            assert logical_replica_count is not None

        topk_weights, topk_ids, zero_expert_result = layer.select_experts(
            hidden_states=x,
            router_logits=router_logits,
            use_grouped_topk=use_grouped_topk,
            top_k=top_k,
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
            custom_routing_function=custom_routing_function,
            scoring_func=scoring_func,
            routed_scaling_factor=routed_scaling_factor,
            e_score_correction_bias=e_score_correction_bias,
            indices_type=self.topk_indices_dtype,
            enable_eplb=enable_eplb,
            expert_map=expert_map,
            expert_load_view=expert_load_view,
            logical_to_physical_map=logical_to_physical_map,
            logical_replica_count=logical_replica_count,
            global_num_experts=global_num_experts,
            zero_expert_num=zero_expert_num,
            zero_expert_type=zero_expert_type,
        )

        # The expert map is withheld from the kernel when it is disabled.
        kernel_expert_map = None if self.disable_expert_map else expert_map
        result = self.fused_experts(
            hidden_states=x,
            w1=layer.w13_weight,
            w2=layer.w2_weight,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            inplace=self.allow_inplace,
            activation=activation,
            global_num_experts=global_num_experts,
            apply_router_weight_on_input=apply_router_weight_on_input,
            expert_map=kernel_expert_map,
        )

        uses_zero_experts = zero_expert_num != 0 and zero_expert_type is not None
        if not uses_zero_experts:
            return result

        assert not isinstance(result, tuple), (
            "Shared + zero experts are mutually exclusive not yet supported"
        )
        return result, zero_expert_result
+ code reference: + https://github.com/triton-lang/triton/blob/dd1bbc52b34d202dfe5ffea1e04fb16166c5c04e/python/triton_kernels/bench/distributed.py#L264 + """ + pid_m = tl.program_id(0) + offsets_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offsets_k = tl.arange(0, BLOCK_SIZE_K) + offsets = offsets_m[:, None] * n_expts_act + offsets_k[None, :] + mask = (offsets_m < n_rows)[:, None] & (offsets_k < n_expts_act)[None, :] + indices = tl.load(topk_ids + offsets, mask=mask, other=-1) + div = indices // 32 + rem = indices % 32 + one = tl.cast(1, tl.uint32) + + # Iterate through all the relevant bitmatrix columns. + for i in range(bm_cols): + # When BLOCK_SIZE_K=32, offs is just the column index. + offs = tl.arange(0, BLOCK_SIZE_K // 32) + i * (BLOCK_SIZE_K // 32) + # All topks that need to go into this column has the correct bit set. + # Other bits are 0. x is a 2D tensor. + x = tl.where( + div[:, :, None] == offs[None, None, :], (one << rem)[:, :, None], 0 + ) + # Reduce x to get a single int32_t bitpack. 
# This is a triton implementation of the fused_experts function
def triton_kernel_fused_experts(
    output_tensor: torch.Tensor | None,
    hidden_states: torch.Tensor,
    w1,  # Tensor or triton_kernels.Tensor
    w2,  # Tensor or triton_kernels.Tensor
    routing_data,  # RoutingData
    gather_indx,  # GatherIndx
    scatter_indx,  # ScatterIndx
    activation: str = "silu",
    quant_config: FusedMoEQuantConfig | None = None,
    swiglu_alpha: float = 1.702,
    swiglu_limit: float = 7.0,
    apply_router_weight_on_input: bool = False,
    global_num_experts: int = -1,
    expert_map: torch.Tensor | None = None,
    a1q_scale: torch.Tensor | None = None,
) -> torch.Tensor:
    """Run both expert matmuls through ``matmul_ogs`` and return the result.

    The first ``matmul_ogs`` call multiplies ``hidden_states`` by ``w1``
    (gathering with ``gather_indx``) with a fused swiglu activation; the
    second multiplies that result by ``w2`` (scattering with
    ``scatter_indx``) and is handed ``output_tensor`` as ``y``.

    Args:
        output_tensor: Passed to the second ``matmul_ogs`` call as ``y``.
            May be ``None``.
        hidden_states: Input activations; must be ``torch.bfloat16``.
        w1, w2: Expert weights for the first and second matmul.
        routing_data: Routing information; its ``gate_scal`` supplies the
            router weights (``gammas``). A falsy value disables them.
        gather_indx, scatter_indx: Forwarded to the first / second matmul.
        activation: Not consulted by this function; the fused activation is
            always the swiglu function built below.
        quant_config: Source of biases and precision configs. Defaults to
            ``FUSED_MOE_UNQUANTIZED_CONFIG`` when ``None``.
        swiglu_alpha, swiglu_limit: Values bound to the swiglu function's
            ``alpha`` and ``limit`` arguments.
        apply_router_weight_on_input: When true the router weights are
            applied in the first matmul, otherwise in the second.
        global_num_experts: Defaulted from ``w1`` when ``-1`` but not read
            afterwards.
        expert_map, a1q_scale: Accepted but not used in this function.

    Returns:
        The value returned by the second ``matmul_ogs`` call.
    """
    if quant_config is None:
        quant_config = FUSED_MOE_UNQUANTIZED_CONFIG

    # type check, uint8 means mxfp4
    assert hidden_states.dtype == torch.bfloat16
    # Biases, when present, must be float32.
    assert quant_config.w1_bias is None or quant_config.w1_bias.dtype == torch.float32
    assert quant_config.w2_bias is None or quant_config.w2_bias.dtype == torch.float32

    # Shape check, only check non-mxfp4
    assert hidden_states.shape[-1] == w1.shape[-2]
    assert w2.shape[-1] == w1.shape[1]

    # Unpacking also requires w1.shape to have exactly three entries.
    # NOTE(review): N is never read and E only feeds the default below.
    E, _, N = w1.shape

    if global_num_experts == -1:
        global_num_experts = E

    # swiglu is always used here, independent of the `activation` argument.
    # NOTE(review): the meaning of the trailing `2` is defined by
    # triton_kernels' FusedActivation - confirm against that API.
    act = FusedActivation(
        FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")),
        (swiglu_alpha, swiglu_limit),
        2,
    )
    gammas = routing_data.gate_scal if routing_data else None

    # Router weights go to exactly one of the two matmuls, selected by
    # apply_router_weight_on_input.
    intermediate_cache1 = matmul_ogs(
        hidden_states,
        w1,
        quant_config.w1_bias,
        routing_data,
        gather_indx=gather_indx,
        precision_config=quant_config.w1_precision,
        gammas=gammas if apply_router_weight_on_input else None,
        fused_activation=act,
    )

    intermediate_cache3 = matmul_ogs(
        intermediate_cache1,
        w2,
        quant_config.w2_bias,
        routing_data,
        scatter_indx=scatter_indx,
        precision_config=quant_config.w2_precision,
        gammas=None if apply_router_weight_on_input else gammas,
        y=output_tensor,
    )
    return intermediate_cache3
class OAITritonExperts(BaseOAITritonExperts):
    """Standard-format experts backed by ``triton_kernel_fused_experts``.

    Construction asserts that the quant config uses mxfp4_w4a16.
    """

    def __init__(self, quant_config: FusedMoEQuantConfig):
        # TODO (varun) : Enable activation quantization
        assert quant_config.use_mxfp4_w4a16, "Supports only mxfp4_w4a16"
        super().__init__(quant_config)

    @property
    def activation_formats(
        self,
    ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
        """Return the (input, output) activation formats; both are Standard."""
        standard = mk.FusedMoEActivationFormat.Standard
        return (standard, standard)

    def supports_chunking(self) -> bool:
        """Report that this implementation may be run chunk by chunk."""
        return True

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        """Return the two workspace shapes and the output shape.

        Only ``M`` and ``K`` are used.
        """
        # workspace are allocated inside the kernel
        return ((M, K), (0, 0), (M, K))

    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: str,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ):
        """Compute the expert outputs and copy them into ``output``.

        ``topk_ids`` is first translated through ``expert_map`` when one is
        given. Routing data is then built for the local experts
        (``w1.size(0)``) and the fused-experts kernel is called with
        ``apply_router_weight_on_input=False`` and no expert map,
        regardless of the arguments received here.
        """
        local_ids = topk_ids if expert_map is None else expert_map[topk_ids]
        num_local = w1.size(0)

        routing, gather, scatter = self._make_routing_data(
            local_ids, topk_weights, num_local
        )

        kernel_out = triton_kernel_fused_experts(
            None,
            hidden_states,
            w1,
            w2,
            routing,
            gather,
            scatter,
            activation=activation,
            quant_config=self.quant_config,
            apply_router_weight_on_input=False,
            global_num_experts=num_local,
            expert_map=None,  # applied already
            a1q_scale=a1q_scale,
        )

        output.copy_(kernel_out, non_blocking=True)
import init_logger +from vllm.model_executor.custom_op import CustomOp +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEConfig, + FusedMoEParallelConfig, + FusedMoEQuantConfig, + RoutingMethodType, +) +from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton +from vllm.model_executor.layers.fused_moe.modular_kernel import ( + FusedMoEPermuteExpertsUnpermute, + FusedMoEPrepareAndFinalize, +) +from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + init_aiter_topK_meta_data, +) +from vllm.model_executor.layers.fused_moe.routing_simulator import RoutingSimulator +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, +) +from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( + is_flashinfer_supporting_global_sf, +) +from vllm.platforms import current_platform +from vllm.utils.math_utils import cdiv, round_up +from vllm.utils.torch_utils import ( + aux_stream, + current_stream, + direct_register_custom_op, +) +from vllm.v1.worker.ubatching import dbo_current_ubatch_id + +if current_platform.is_cuda_alike(): + from .fused_moe import eplb_map_to_physical_and_record, fused_experts +else: + fused_experts = None # type: ignore + FusedMoEPermuteExpertsUnpermute = object # type: ignore + FusedMoEPrepareAndFinalize = object # type: ignore + + def _eplb_map_to_physical_and_record( + topk_ids: torch.Tensor, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + indices_type: torch.dtype | None, + ) -> torch.Tensor: + # CPU fallback: no EPLB so just return as is + return topk_ids + + eplb_map_to_physical_and_record = _eplb_map_to_physical_and_record +from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk +from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501 + rocm_aiter_grouped_topk, +) + +if current_platform.is_tpu(): + from .moe_pallas import 
def weight_quant_l1(loaded_weight: torch.Tensor):
    """Quantize a 2-D weight to symmetric per-row int8.

    Each row is scaled by ``max(|row|) / 127``, rounded and clamped to
    ``[-127, 127]``.

    Args:
        loaded_weight: Weight of shape ``(rows, cols)``.

    Returns:
        A tuple ``(quantized, scale)`` where ``quantized`` is an int8 tensor
        with the shape of ``loaded_weight`` and ``scale`` is a float32 tensor
        of shape ``(rows, 1)``. An all-zero row yields a zero scale and
        all-zero quantized values.
    """
    qmax = 127.0
    abs_max = torch.abs(loaded_weight).max(dim=1, keepdim=True)[0]  # [rows, 1]
    scale = abs_max / qmax  # [rows, 1]
    assert scale.shape == (loaded_weight.shape[0], 1)
    # An all-zero row has scale 0, and 0 / 0 would produce NaN (whose int8
    # cast is undefined). Divide such rows by 1 instead; the returned scale
    # is left untouched so dequantization still gives zeros.
    divisor = torch.where(scale == 0, torch.ones_like(scale), scale)
    quantized = torch.round(loaded_weight / divisor)
    quantized = torch.clamp(quantized, -qmax, qmax)
    return quantized.to(torch.int8), scale.to(torch.float32)
Any remainder is assigned one extra expert at a time to the + lowest ranks. + + Args: + ep_size: The size of the expert parallel group + ep_rank: The rank of the current process in the expert parallel + group + global_num_experts: The total number of experts in the model. + expert_placement_strategy: The expert placement strategy. + + Returns: + tuple[int, torch.Tensor | None, torch.Tensor | None]: A tuple containing: + - local_num_experts (int): The number of experts assigned + to the current rank. + - expert_map (Optional[torch.Tensor]): A tensor of shape + (global_num_experts,) mapping from global to local index. + Contains -1 for experts not assigned to the current rank. + Returns None if ep_size is 1. + - expert_mask (Optional[torch.Tensor]): A tensor of shape + (global_num_experts + num_fused_shared_experts + 1,) + containing 1 for experts assigned to the current rank + and 0 for sentinel. + Returns None if ep_size is 1. + Used only when AITER MOE is enabled. + """ + assert ep_size > 0 + if ep_size == 1: + return (global_num_experts, None, None) + + # Distribute experts as evenly as possible to each rank. 
def get_compressed_expert_map(expert_map: torch.Tensor) -> str:
    """
    Compresses the expert map by removing any -1 entries.

    Args:
        expert_map (torch.Tensor): A tensor of shape (global_num_experts,)
            mapping from global to local index. Contains -1 for experts not
            assigned to the current rank.

    Returns:
        str: Comma-separated ``local->global`` pairs, ordered by global
            index. Empty when no expert is assigned.
            Using str to support hashing for logging once only.
    """
    global_indices = torch.where(expert_map != -1)[0]
    local_indices = expert_map[global_indices]
    # Convert each tensor to a Python list once instead of calling .item()
    # on every element, which costs one scalar conversion per element.
    return ", ".join(
        f"{local_index}->{global_index}"
        for local_index, global_index in zip(
            local_indices.tolist(), global_indices.tolist()
        )
    )
+ """ + from vllm.model_executor.layers.fused_moe.all2all_utils import ( + maybe_roundup_layer_hidden_size, + ) + + hidden_size = maybe_roundup_layer_hidden_size( + hidden_size, act_dtype, moe_parallel_config + ) + + # we are padding globally so EP buffer allocation works + if quant_config and quant_config.get_name() == "mxfp4": + from vllm.model_executor.layers.quantization.mxfp4 import ( + Mxfp4Backend, + get_mxfp4_backend, + ) + + current_mxfp4_backend = get_mxfp4_backend(is_lora_enabled) + if ( + current_mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16 + or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS + ): + hidden_size = round_up(hidden_size, 128) + elif ( + current_platform.is_rocm() + or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM + or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16 + ): + hidden_size = round_up(hidden_size, 256) + + return hidden_size + + +@CustomOp.register("fused_moe") +class FusedMoE(CustomOp): + """FusedMoE layer for MoE models. + + This layer contains both MergedColumnParallel weights (gate_up_proj / + w13) and RowParallelLinear weights (down_proj/ w2). + + Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We + copy that naming convention here and handle any remapping in the + load_weights function in each model implementation. + + Args: + num_experts: Number of experts in the model + top_k: Number of experts selected for each token + hidden_size: Input hidden state size of the transformer + intermediate_size: Intermediate size of the experts + params_dtype: Data type for the parameters. + reduce_results: Whether to all_reduce on the output of the layer + renormalize: Whether to renormalize the logits in the fused_moe kernel + quant_config: Quantization configure. + enable_eplb: Whether to enable expert parallelism load balancer. 
+ """ + + def __init__( + self, + num_experts: int, # Global number of experts + top_k: int, + hidden_size: int, + intermediate_size: int, + params_dtype: torch.dtype | None = None, + reduce_results: bool = False, + renormalize: bool = True, + use_grouped_topk: bool = False, + num_expert_group: int | None = None, + topk_group: int | None = None, + quant_config: QuantizationConfig | None = None, + tp_size: int | None = None, + ep_size: int | None = None, + dp_size: int | None = None, + prefix: str = "", + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + is_act_and_mul: bool = True, + enable_eplb: bool = False, + num_redundant_experts: int = 0, + has_bias: bool = False, + is_sequence_parallel=False, + zero_expert_num: int | None = 0, + zero_expert_type: str | None = None, + expert_mapping: list[tuple[str, str, int, str]] | None = None, + n_shared_experts: int | None = None, + routing_method_type: int | None = None, + ): + super().__init__() + + # Allow disabling of the separate shared experts stream for + # debug purposes. + # TODO: Remove this after more extensive testings with TP/DP + # and other execution modes + if envs.VLLM_DISABLE_SHARED_EXPERTS_STREAM: + logger.info_once("Disabling MoE shared_experts cuda stream") + self.shared_experts_stream = None + else: + # TODO(rob): enable shared expert overlap with non-cuda. + # aux_stream() returns None on non-cuda platforms. 
+ self.shared_experts_stream = aux_stream() + if self.shared_experts_stream is not None: + logger.info_once("Enabled separate cuda stream for MoE shared_experts") + + if params_dtype is None: + params_dtype = torch.get_default_dtype() + self.params_dtype = params_dtype + + vllm_config = get_current_vllm_config() + self.vllm_config = vllm_config + + # FIXME (varun): We should have a better way of inferring the activation + # datatype. This works for now as the tensor datatype entering the MoE + # operation is typically unquantized (i.e. float16/bfloat16). + if vllm_config.model_config is not None: + moe_in_dtype = vllm_config.model_config.dtype + else: + # TODO (bnell): This is a hack to get test_mixtral_moe to work + # since model_config is not set in the pytest test. + moe_in_dtype = params_dtype + + tp_size_ = ( + tp_size if tp_size is not None else get_tensor_model_parallel_world_size() + ) + dp_size_ = dp_size if dp_size is not None else get_dp_group().world_size + + self.is_sequence_parallel = is_sequence_parallel + self.sp_size = tp_size_ if is_sequence_parallel else 1 + + self.moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make( + tp_size_=tp_size_, + dp_size_=dp_size_, + vllm_parallel_config=vllm_config.parallel_config, + ) + + self.global_num_experts = num_experts + num_redundant_experts + self.logical_num_experts = num_experts + self.zero_expert_num = zero_expert_num + self.zero_expert_type = zero_expert_type + + # Expert mapping used in self.load_weights + self.expert_mapping = expert_mapping + + # Round up hidden size if needed. 
+ hidden_size = maybe_roundup_hidden_size( + hidden_size, + moe_in_dtype, + quant_config, + self.moe_parallel_config, + is_lora_enabled=self.vllm_config.lora_config is not None, + ) + + # For smuggling this layer into the fused moe custom op + compilation_config = vllm_config.compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError("Duplicate layer name: {}".format(prefix)) + compilation_config.static_forward_context[prefix] = self + self.layer_name = prefix + + self.enable_eplb = enable_eplb + self.expert_load_view: torch.Tensor | None = None + self.logical_to_physical_map: torch.Tensor | None = None + self.logical_replica_count: torch.Tensor | None = None + + # ROCm aiter shared experts fusion + self.rocm_aiter_fmoe_enabled = rocm_aiter_ops.is_fused_moe_enabled() + self.aiter_fmoe_shared_expert_enabled = ( + rocm_aiter_ops.is_fusion_moe_shared_experts_enabled() + ) + + self.num_fused_shared_experts = ( + n_shared_experts + if n_shared_experts is not None and self.aiter_fmoe_shared_expert_enabled + else 0 + ) + if ( + not self.aiter_fmoe_shared_expert_enabled + and self.num_fused_shared_experts != 0 + ): + raise ValueError( + "n_shared_experts is only supported on ROCm aiter when " + "VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS is enabled" + ) + + # Determine expert maps + if self.use_ep: + if self.enable_eplb: + assert self.global_num_experts % self.ep_size == 0, ( + "EPLB currently only supports even distribution of " + "experts across ranks." + ) + else: + assert num_redundant_experts == 0, ( + "Redundant experts are only supported with EPLB." + ) + + expert_placement_strategy = ( + vllm_config.parallel_config.expert_placement_strategy + ) + if expert_placement_strategy == "round_robin": + # TODO(Bruce): will support round robin expert placement with + # EPLB enabled in the future. 
+ round_robin_supported = ( + (num_expert_group is not None and num_expert_group > 1) + and num_redundant_experts == 0 + and not self.enable_eplb + ) + + if not round_robin_supported: + logger.warning( + "Round-robin expert placement is only supported for " + "models with multiple expert groups and no redundant " + "experts. Falling back to linear expert placement." + ) + expert_placement_strategy = "linear" + + self.expert_map: torch.Tensor | None + local_num_experts, expert_map, expert_mask = determine_expert_map( + ep_size=self.ep_size, + ep_rank=self.ep_rank, + global_num_experts=self.global_num_experts, + expert_placement_strategy=expert_placement_strategy, + num_fused_shared_experts=self.num_fused_shared_experts, + return_expert_mask=self.rocm_aiter_fmoe_enabled, + ) + self.local_num_experts = local_num_experts + self.register_buffer("expert_map", expert_map) + self.register_buffer("expert_mask", expert_mask) + logger.info_once( + "[EP Rank %s/%s] Expert parallelism is enabled. Expert " + "placement strategy: %s. Local/global" + " number of experts: %s/%s. 
Experts local to global index map:" + " %s.", + self.ep_rank, + self.ep_size, + expert_placement_strategy, + self.local_num_experts, + self.global_num_experts, + get_compressed_expert_map(self.expert_map), + ) + else: + self.local_num_experts, self.expert_map, self.expert_mask = ( + self.global_num_experts, + None, + None, + ) + + self.top_k = top_k + + self._init_aiter_shared_experts_topK_buffer( + vllm_config=vllm_config, dp_size=dp_size_ + ) + + self.hidden_size = hidden_size + self.num_experts = num_experts + assert intermediate_size % self.tp_size == 0 + self.hidden_size = hidden_size + self.intermediate_size_per_partition = intermediate_size // self.tp_size + self.reduce_results = reduce_results + self.renormalize = renormalize + self.use_grouped_topk = use_grouped_topk + if self.use_grouped_topk: + assert num_expert_group is not None and topk_group is not None + self.num_expert_group = num_expert_group + self.topk_group = topk_group + self.custom_routing_function = custom_routing_function + self.scoring_func = scoring_func + self.routed_scaling_factor = routed_scaling_factor + self.e_score_correction_bias = e_score_correction_bias + self.apply_router_weight_on_input = apply_router_weight_on_input + self.activation = activation + + if self.scoring_func != "softmax" and not self.use_grouped_topk: + raise ValueError( + "Only softmax scoring function is supported for non-grouped topk." 
+ ) + + # ToDo: Better logic to determine the routing method type + if routing_method_type is not None: + self.routing_method_type = routing_method_type + else: + if scoring_func == "sigmoid": + if self.use_grouped_topk: + self.routing_method_type = RoutingMethodType.DeepSeekV3 + elif self.top_k == 1: + self.routing_method_type = RoutingMethodType.Llama4 + elif self.scoring_func == "softmax": + self.routing_method_type = ( + RoutingMethodType.Renormalize + if not self.renormalize + else RoutingMethodType.RenormalizeNaive + ) + else: + self.routing_method_type = RoutingMethodType.TopK + + self.moe_config: FusedMoEConfig = FusedMoEConfig( + num_experts=self.global_num_experts, + experts_per_token=top_k, + hidden_dim=hidden_size, + num_local_experts=self.local_num_experts, + moe_parallel_config=self.moe_parallel_config, + in_dtype=moe_in_dtype, + max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE, + has_bias=has_bias, + is_act_and_mul=is_act_and_mul, + is_lora_enabled=vllm_config.lora_config is not None, + ) + + self.quant_config = quant_config + + def _get_quant_method() -> FusedMoEMethodBase: + """ + Helper method to ensure self.quant_method is never None and + of the proper type. + """ + quant_method = None + if self.quant_config is not None: + self.opt_level = 0 + quant_method = self.quant_config.get_quant_method(self, prefix) + if quant_method is None: + from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import ( + CompressedTensorsL1OptMoEMethod, CompressedTensorsL2OptMoEMethod) + if self.opt_level == 1: + quant_method = CompressedTensorsL1OptMoEMethod(self.moe_config) + elif self.opt_level == 2: + quant_method = CompressedTensorsL2OptMoEMethod(self.moe_config) + else: + quant_method = UnquantizedFusedMoEMethod(self.moe_config) + assert isinstance(quant_method, FusedMoEMethodBase) + return quant_method + + # Note: get_quant_method will look at the layer's local_num_experts + # for heuristic purposes, so it must be initialized first. 
+ self.opt_level = envs.VLLM_MOE_OPT_LEVEL + opt_exclude_layers = envs.VLLM_OPT_EXCLUDE_LAYERS + opt_exclude_layers = ast.literal_eval(opt_exclude_layers) if opt_exclude_layers.strip() else "" + if isinstance(opt_exclude_layers, tuple): + layer_info = re.search(r'\.(\d+)', prefix) + if layer_info is not None and int(layer_info.group(1)) in opt_exclude_layers: + self.opt_flag = False + + self.quant_method: FusedMoEMethodBase = _get_quant_method() + + if not self.moe_config.is_act_and_mul: + # Avoid circular import + from vllm.model_executor.layers.quantization.modelopt import ( + ModelOptFp8MoEMethod, + ) + + if not isinstance( + self.quant_method, (UnquantizedFusedMoEMethod, ModelOptFp8MoEMethod) + ): + raise NotImplementedError( + "is_act_and_mul=False is supported only for unquantized " + "and ModelOpt FP8 moe for now" + ) + if not current_platform.is_cuda(): + raise NotImplementedError( + "is_act_and_mul=False is supported only for CUDA for now" + ) + + if self.enable_eplb and not self.quant_method.supports_eplb: + # TODO: Add support for additional quantization methods. + # The implementation for other quantization methods does not + # contain essential differences, but the current quant API + # design causes duplicated work when extending to new + # quantization methods, so I'm leaving it for now. + # If you plan to add support for more quantization methods, + # please refer to the implementation in `Fp8MoEMethod`. + raise NotImplementedError( + f"EPLB is not supported {self.quant_method.__class__.__name__}. " + "EPLB is only supported for FP8 quantization for now." 
+ ) + + moe_quant_params = { + "num_experts": self.local_num_experts, + "hidden_size": hidden_size, + "intermediate_size_per_partition": self.intermediate_size_per_partition, + "params_dtype": params_dtype, + "weight_loader": self.weight_loader, + "global_num_experts": self.global_num_experts, + } + # need full intermediate size pre-sharding for WNA16 act order + if self.quant_method.__class__.__name__ in ( + "GPTQMarlinMoEMethod", + "CompressedTensorsWNA16MarlinMoEMethod", + "CompressedTensorsWNA16MoEMethod", + ): + moe_quant_params["intermediate_size_full"] = intermediate_size + + self.quant_method.create_weights(layer=self, **moe_quant_params) + + # Chunked all2all staging tensor + self.batched_hidden_states: torch.Tensor | None = None + self.batched_router_logits: torch.Tensor | None = None + + # Note: maybe_init_modular_kernel should only be called by + # prepare_communication_buffer_for_model. + # This is called after all weight loading and post-processing, so it + # should be safe to swap out the quant_method. 
def maybe_init_modular_kernel(self) -> None:
    """Wrap the quant method in a modular kernel when one is available.

    Makes sure the MoE quant config exists, asks the current quant method
    for a prepare/finalize object and, if it returns one, replaces
    ``self.quant_method`` with the ``FusedMoEModularMethod`` built from it.
    """
    self.ensure_moe_quant_config_init()
    prepare_finalize = self.quant_method.maybe_make_prepare_finalize()
    if prepare_finalize is not None:
        logger.debug(
            "%s for %s(%s)", prepare_finalize.__class__.__name__, self, id(self)
        )
        self.quant_method = FusedMoEModularMethod.make(
            self, self.quant_method, prepare_finalize, self.shared_experts
        )


@property
def shared_experts(self) -> torch.nn.Module | None:
    """Shared-experts module of this layer; this implementation has none."""
    return None


@property
def gate(self) -> torch.nn.Module | None:
    """Router/gate module of this layer; this implementation has none."""
    return None


@property
def tp_size(self):
    """Tensor-parallel size taken from ``moe_parallel_config``."""
    return self.moe_parallel_config.tp_size


@property
def dp_size(self):
    """Data-parallel size taken from ``moe_parallel_config``."""
    return self.moe_parallel_config.dp_size


@property
def ep_size(self):
    """Expert-parallel size taken from ``moe_parallel_config``."""
    return self.moe_parallel_config.ep_size


@property
def tp_rank(self):
    """Tensor-parallel rank taken from ``moe_parallel_config``."""
    return self.moe_parallel_config.tp_rank


@property
def dp_rank(self):
    """Data-parallel rank taken from ``moe_parallel_config``."""
    return self.moe_parallel_config.dp_rank


@property
def ep_rank(self):
    """Expert-parallel rank taken from ``moe_parallel_config``."""
    return self.moe_parallel_config.ep_rank


@property
def use_ep(self):
    """``use_ep`` flag of ``moe_parallel_config``."""
    return self.moe_parallel_config.use_ep


@property
def use_pplx_kernels(self):
    """``use_pplx_kernels`` flag of ``moe_parallel_config``."""
    return self.moe_parallel_config.use_pplx_kernels


@property
def use_deepep_ht_kernels(self):
    """``use_deepep_ht_kernels`` flag of ``moe_parallel_config``."""
    return self.moe_parallel_config.use_deepep_ht_kernels


@property
def use_deepep_ll_kernels(self):
    """``use_deepep_ll_kernels`` flag of ``moe_parallel_config``."""
    return self.moe_parallel_config.use_deepep_ll_kernels


@property
def use_flashinfer_cutlass_kernels(self):
    """Whether the FlashInfer CUTLASS path applies to this layer.

    True only when a MoE quant config exists, its ``quant_dtype`` is
    ``"nvfp4"`` and ``moe_config.use_flashinfer_cutlass_kernels`` is set.
    """
    return (
        self.moe_quant_config is not None
        and self.moe_quant_config.quant_dtype == "nvfp4"
        and self.moe_config.use_flashinfer_cutlass_kernels
    )


@property
def use_marlin_kernels(self):
    """``use_marlin`` attribute of the quant method, ``False`` if absent."""
    return getattr(self.quant_method, "use_marlin", False)


@property
def use_dp_chunking(self) -> bool:
    """Whether DP chunking is used.

    True for pplx kernels, for DeepEP low-latency kernels, or for the
    FlashInfer CUTLASS path when ``dp_size > 1``.
    """
    return (
        self.moe_parallel_config.use_pplx_kernels
        or self.moe_parallel_config.use_deepep_ll_kernels
        or (self.dp_size > 1 and self.use_flashinfer_cutlass_kernels)
    )


@property
def is_internal_router(self) -> bool:
    """Whether the router runs inside this layer; ``False`` here."""
    # By default, router/gate is called before FusedMoE forward pass
    return False


def update_expert_map(self):
    """Recompute the expert map and mask and re-register them as buffers.

    Uses the current ``ep_size``/``ep_rank``; requires an existing
    ``expert_map``. When aiter shared-expert fusion is enabled the aiter
    top-k buffer is initialised again as well.
    """
    # ep_size and ep_rank should already be updated
    assert self.expert_map is not None
    # Build the replacement tensors on the device of the current map.
    with self.expert_map.device:
        # NOTE(review): no expert_placement_strategy is passed here --
        # confirm that is intended.
        local_num_experts, expert_map, expert_mask = determine_expert_map(
            ep_size=self.ep_size,
            ep_rank=self.ep_rank,
            global_num_experts=self.global_num_experts,
            num_fused_shared_experts=self.num_fused_shared_experts,
            return_expert_mask=self.rocm_aiter_fmoe_enabled,
        )
        self.local_num_experts = local_num_experts
        self.register_buffer("expert_map", expert_map)
        self.register_buffer("expert_mask", expert_mask)
        if self.aiter_fmoe_shared_expert_enabled:
            self._init_aiter_shared_experts_topK_buffer(
                vllm_config=get_current_vllm_config(),
                dp_size=get_dp_group().world_size,
            )


def _load_per_tensor_weight_scale(
    self,
    shard_id: str,
    param: torch.nn.Parameter,
    loaded_weight: torch.Tensor,
    expert_id: int,
):
    """Store a per-tensor weight scale for one expert.

    ``w1``/``w3`` scales go to slots 0/1 of ``param.data[expert_id]``;
    a ``w2`` scale replaces ``param.data[expert_id]``. Any other
    ``shard_id`` is ignored.
    """
    param_data = param.data
    # for per tensor weight quantization
    if shard_id in ("w1", "w3"):
        # We have to keep the weight scales of w1 and w3 because
        # we need to re-quantize w1/w3 weights after weight loading.
        idx = 0 if shard_id == "w1" else 1
        param_data[expert_id][idx] = loaded_weight
    # If we are in the row parallel case (down_proj)
    elif shard_id == "w2":
        param_data[expert_id] = loaded_weight


def _load_combined_w13_weight_scale(
    self,
    shard_dim: int,
    loaded_weight: torch.Tensor,
    param: torch.Tensor,
    tp_rank: int,
):
    """
    Load w13 weight scales assuming that w1 weight scales and w3 weight
    scales are stored in the same loaded_weight tensor.

    The slice belonging to ``tp_rank`` (of length ``param.shape[shard_dim]``
    along ``shard_dim``) is copied into ``param``.
    """
    shard_size = param.shape[shard_dim]
    loaded_weight = loaded_weight.narrow(
        shard_dim, shard_size * tp_rank, shard_size
    )
    param.copy_(loaded_weight)


def _load_model_weight_or_group_weight_scale(
    self,
    shard_dim: int,
    expert_data: torch.Tensor,
    shard_id: str,
    loaded_weight: torch.Tensor,
    tp_rank: int,
    load_full_w2: bool = False,
):
    """
    Load grouped weight scales for group quantization or model weights
    :param shard_dim: dimension to shard
    :param expert_data: parameter for a particular expert
    :param shard_id: either w1, w2, or w3
    :param loaded_weight: checkpoint weight to load into the param
    :param tp_rank: tensor parallel rank
    :param load_full_w2: whether or not the w2 loaded should be sharded.
    """
    if shard_id == "w2":
        # In the case where we have actorder/g_idx, we do not partition the
        # w2 scales, as indicated by `load_full` argument, for all tp cases
        self._load_w2(
            shard_dim=shard_dim,
            loaded_weight=loaded_weight,
            expert_data=expert_data,
            tp_rank=tp_rank,
            load_full=load_full_w2,
        )
    elif shard_id in ("w1", "w3"):
        self._load_w13(
            shard_id=shard_id,
            shard_dim=shard_dim,
            loaded_weight=loaded_weight,
            expert_data=expert_data,
            tp_rank=tp_rank,
        )


def _load_per_channel_weight_scale(
    self,
    expert_data: torch.Tensor,
    shard_dim: int,
    shard_id: str,
    loaded_weight: torch.Tensor,
    tp_rank: int,
):
    """Load a per-channel weight scale.

    A ``w2`` scale is copied as-is; ``w1``/``w3`` scales are sharded and
    placed through ``_load_w13``. Any other ``shard_id`` is ignored.
    """
    # for per channel weight quantization
    if shard_id == "w2":
        expert_data.copy_(loaded_weight)
    elif shard_id in ("w1", "w3"):
        self._load_w13(
            shard_id=shard_id,
            shard_dim=shard_dim,
            loaded_weight=loaded_weight,
            expert_data=expert_data,
            tp_rank=tp_rank,
        )


def _load_w13(
    self,
    expert_data: torch.Tensor,
    shard_dim: int,
    shard_id: str,
    loaded_weight: torch.Tensor,
    tp_rank: int,
    load_full: bool = False,
):
    """Load a w1 or w3 tensor into its part of the merged w13 parameter.

    :param expert_data: w13 parameter data to write into
    :param shard_dim: dimension to shard
    :param shard_id: ``"w1"`` or ``"w3"``
    :param loaded_weight: checkpoint tensor
    :param tp_rank: tensor parallel rank
    :param load_full: if True, ``loaded_weight`` is used without slicing
    """
    # Index the loaded weight for tp sharding.
    # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim
    if self.moe_config.is_act_and_mul:
        shard_size = expert_data.shape[shard_dim] // 2
    else:
        shard_size = expert_data.shape[shard_dim]
    if not load_full:
        loaded_weight = loaded_weight.narrow(
            shard_dim, shard_size * tp_rank, shard_size
        )
    # Narrow parameter and load.
    # w1, gate_proj: Load into first logical weight of w13.
    if shard_id == "w1":
        expert_data = expert_data.narrow(shard_dim, 0, shard_size)
    # w3, up_proj: Load into second logical weight of w13.
    else:
        assert shard_id == "w3"
        expert_data = expert_data.narrow(shard_dim, shard_size, shard_size)
    expert_data.copy_(loaded_weight)


def _load_w2(
    self,
    expert_data: torch.Tensor,
    shard_dim: int,
    loaded_weight: torch.Tensor,
    tp_rank: int,
    load_full: bool = False,
):
    """Load a w2 (down_proj) tensor into the start of ``expert_data``.

    :param expert_data: w2 parameter data to write into
    :param shard_dim: dimension to shard
    :param loaded_weight: checkpoint tensor
    :param tp_rank: tensor parallel rank
    :param load_full: if True, ``loaded_weight`` is copied without slicing
    """
    # Index the loaded weight for tp sharding.
    # down_proj: "RowParallel" so tp sharding on input_dim
    if not load_full:
        shard_size = loaded_weight.shape[shard_dim] // self.tp_size
        loaded_weight = loaded_weight.narrow(
            shard_dim, shard_size * tp_rank, shard_size
        )
    # Narrow parameter and load.
    # The destination view must match what is actually copied: one shard
    # normally, the whole tensor when load_full is set. Sizing it from
    # the tp shard in both cases made the full load fail for tp_size > 1.
    copy_size = loaded_weight.shape[shard_dim]
    # w2, down_proj: Load into only logical weight of w2.
    expert_data.narrow(shard_dim, 0, copy_size).copy_(loaded_weight)


def _load_model_opt_weight_or_group_weight_scale(
    self,
    shard_dim: int,
    shard_dim_scale: int,
    expert_data: torch.Tensor,
    scale_data: torch.Tensor,
    shard_id: str,
    loaded_weight: torch.Tensor,
    tp_rank: int,
    opt_level: int,
    load_full_w2: bool = False,
):
    """
    Quantize a checkpoint weight while loading and store weight and scale.

    The weight is moved to CUDA and quantized with ``weight_quant_l1``
    (``opt_level`` 1) or ``weight_quant_l2`` (``opt_level`` 2). The
    quantized weight is loaded like a regular model weight; for w1/w3 the
    scale is sharded through ``_load_w13``, for w2 it is copied whole.
    Any other ``shard_id`` loads nothing.

    :param shard_dim: dimension to shard the weight
    :param shard_dim_scale: dimension to shard the w1/w3 scale
    :param expert_data: weight parameter for a particular expert
    :param scale_data: scale parameter for a particular expert
    :param shard_id: either w1, w2, or w3
    :param loaded_weight: checkpoint weight to quantize and load
    :param tp_rank: tensor parallel rank
    :param opt_level: quantization variant, 1 or 2
    :param load_full_w2: whether or not the w2 loaded should be sharded.
    """

    loaded_weight = loaded_weight.to(device="cuda")
    assert opt_level in [1, 2]
    if opt_level == 1:
        weight, scale = weight_quant_l1(loaded_weight)
    else:
        weight, scale = weight_quant_l2(loaded_weight)

    if shard_id == "w2":
        # In the case where we have actorder/g_idx, we do not partition the
        # w2 scales, as indicated by `load_full` argument, for all tp cases
        self._load_w2(
            shard_dim=shard_dim,
            loaded_weight=weight,
            expert_data=expert_data,
            tp_rank=tp_rank,
            load_full=load_full_w2,
        )
        scale_data.copy_(scale)
    elif shard_id in ("w1", "w3"):
        self._load_w13(
            shard_id=shard_id,
            shard_dim=shard_dim,
            loaded_weight=weight,
            expert_data=expert_data,
            tp_rank=tp_rank,
        )
        self._load_w13(
            shard_id=shard_id,
            shard_dim=shard_dim_scale,
            loaded_weight=scale,
            expert_data=scale_data,
            tp_rank=tp_rank,
        )


def _load_single_value(
    self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, expert_id: int
):
    """Write ``loaded_weight`` into ``param.data[expert_id]``."""
    param_data = param.data

    # Input scales can be loaded directly and should be equal.
    param_data[expert_id] = loaded_weight


def _load_g_idx(
    self,
    shard_id: str,
    expert_data: torch.Tensor,
    shard_dim: int,
    loaded_weight: torch.Tensor,
    tp_rank: int,
):
    """Load a ``g_idx`` tensor.

    For ``w2`` it is sharded through ``_load_w2``; for ``w1``/``w3`` it is
    copied whole.
    """
    if shard_id == "w2":
        self._load_w2(
            shard_dim=shard_dim,
            loaded_weight=loaded_weight,
            expert_data=expert_data,
            tp_rank=tp_rank,
        )
    else:
        assert shard_id in ("w1", "w3")
        expert_data.copy_(loaded_weight)


def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int:
    """Translate an expert id through ``expert_map``.

    Returns ``expert_id`` unchanged when there is no ``expert_map``.
    """
    if self.expert_map is None:
        return expert_id
    return self.expert_map[expert_id].item()


def _init_aiter_shared_experts_topK_buffer(
    self, vllm_config: VllmConfig, dp_size: int
):
    """Initialise the aiter top-k metadata for fused shared experts.

    Does nothing unless ``num_fused_shared_experts > 0``. Otherwise it
    calls ``init_aiter_topK_meta_data`` and adds the fused shared experts
    to ``local_num_experts``.
    """
    if self.num_fused_shared_experts > 0:
        init_aiter_topK_meta_data(
            n_routed_experts=self.global_num_experts,
            n_shared_experts=self.num_fused_shared_experts,
            top_k=self.top_k,
            tp_rank=self.ep_rank if self.use_ep else self.tp_rank,
            tp_size=self.ep_size if self.use_ep else self.tp_size,
            shared_experts_score=1.0,
            max_num_tokens=vllm_config.scheduler_config.max_num_batched_tokens
            * dp_size,
            is_EP=self.use_ep,
        )
        self.local_num_experts += self.num_fused_shared_experts


@overload
def weight_loader(
    self,
    param: torch.nn.Parameter,
    loaded_weight: torch.Tensor,
    weight_name: str,
    shard_id: str,
    expert_id: int,
    return_success: Literal[False],
) -> None: ...


@overload
def weight_loader(
    self,
    param: torch.nn.Parameter,
    loaded_weight: torch.Tensor,
    weight_name: str,
    shard_id: str,
    expert_id: int,
    return_success: Literal[True],
) -> bool: ...
+ + def weight_loader( + self, + param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + weight_name: str, + shard_id: str, + expert_id: int, + return_success: bool = False, + ) -> bool | None: + if self.quant_config and self.quant_config.get_name() == "mxfp4": + # (FIXME) for gpt-oss all experts are combined + if "bias" in weight_name: + dim1 = loaded_weight.shape[1] + param.data[:, :dim1].copy_(loaded_weight) + else: + dim1 = loaded_weight.shape[1] + dim2 = loaded_weight.shape[2] + param.data[:, :dim1, :dim2].copy_(loaded_weight) + return True if return_success else None + + quant_method_name = self.quant_method.__class__.__name__ + global_expert_id = expert_id + expert_id = self._map_global_expert_id_to_local_expert_id(global_expert_id) + + allow_flashinfer = getattr(self.quant_method, "allow_flashinfer", False) + moe_backend = getattr(self.quant_method, "flashinfer_moe_backend", None) + + use_global_sf = ( + allow_flashinfer + and is_flashinfer_supporting_global_sf(moe_backend) + and "input_scale" in weight_name + and quant_method_name == "ModelOptNvFp4FusedMoE" + ) + + if expert_id == -1 and not use_global_sf: + # Failed to load this param since it's not local to this rank + return False if return_success else None + # Hereafter, `expert_id` is local physical id + + # compressed-tensors checkpoints with packed weights are stored flipped + # TODO (mgoin): check self.quant_method.quant_config.quant_format + # against known CompressionFormat enum values that have this quality + if self.quant_method.__class__.__name__ in ( + "CompressedTensorsWNA16MarlinMoEMethod", + "CompressedTensorsWNA16MoEMethod", + ): + loaded_weight = loaded_weight.t().contiguous() + + if shard_id not in ("w1", "w2", "w3"): + raise ValueError(f"shard_id must be ['w1','w2','w3'] but got {shard_id}.") + + # Fetch the dim to shard the parameter/loaded weight + # based on the shard id. This will be whatever + # dimension intermediate_size_per_partition is used. 
+ SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0} + + is_gguf_weight = getattr(param, "is_gguf_weight", False) + is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) + if is_gguf_weight_type: + param.weight_type = loaded_weight.item() + param.data.copy_(loaded_weight) + return True if return_success else None + + # Case for BitsAndBytes + use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + if use_bitsandbytes_4bit: + shard_dim = 0 + + expert_data = param.data[expert_id] + if shard_id == "w2": + expert_data.copy_(loaded_weight) + elif shard_id in ("w1", "w3"): + # BNB inflight quantization has already sharded the weights + full_load = True + self._load_w13( + shard_id=shard_id, + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=self.tp_rank, + load_full=full_load, + ) + return True if return_success else None + + # is_transposed: if the dim to shard the weight + # should be flipped. Required by GPTQ, compressed-tensors + # should be whatever dimension intermediate_size_per_partition is + is_transposed = getattr(param, "is_transposed", False) + shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id] + if is_transposed: + shard_dim = int(not shard_dim) + shard_dim_force = getattr(param, "shard_dim", None) + shard_dim = shard_dim_force if shard_dim_force is not None else shard_dim + + full_load = len(loaded_weight.shape) == 3 + if full_load: + shard_dim += 1 + + # Materialize GGUF UninitializedParameter + if is_gguf_weight and isinstance(param, UninitializedParameter): + final_shape = list(loaded_weight.shape) + if shard_id in ["w1", "w3"]: + final_shape[1] *= 2 + final_shape[shard_dim] = final_shape[shard_dim] // self.tp_size + param.materialize(final_shape, dtype=loaded_weight.dtype) + + expert_data = param.data if full_load else param.data[expert_id] + + # Case input scale: input_scale loading is only supported for fp8 + if "input_scale" in weight_name: + # this is needed for compressed-tensors 
only + loaded_weight = loaded_weight.to(param.data.device) + + if ( + "compressed" in quant_method_name.lower() + and param.data[expert_id] != 1 + and (param.data[expert_id] - loaded_weight).abs() > 1e-5 + ): + raise ValueError( + "input_scales of w1 and w3 of a layer " + f"must be equal. But got {param.data[expert_id]} " + f"vs. {loaded_weight}" + ) + + self._load_single_value( + param=param, + loaded_weight=loaded_weight, + expert_id=global_expert_id if use_global_sf else expert_id, + ) + return True if return_success else None + + # Case g_idx + if "g_idx" in weight_name: + self._load_g_idx( + shard_dim=0, + shard_id=shard_id, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=self.tp_rank, + ) + return True if return_success else None + + # TODO @dsikka: ModelOpt should follow the proper MoE loading pattern + if "ModelOpt" in quant_method_name: + # Determine per-tensor weight scale patterns based on variant + # Use the dedicated method instead of brittle string matching + uses_weight_scale_2 = self.quant_method.uses_weight_scale_2_pattern() + + # Call _load_per_tensor_weight_scale() to load per-tensor (scalar) + # weights scales. + # Input scales are always per-tensor. + # Weight scales: FP4 uses "weight_scale_2" and FP8 uses + # "weight_scale" for per-tensor scales. + is_per_tensor = ( + "weight_scale_2" in weight_name + if uses_weight_scale_2 + else "weight_scale" in weight_name + ) or "input_scale" in weight_name + if is_per_tensor: + self._load_per_tensor_weight_scale( + shard_id=shard_id, + param=param, + loaded_weight=loaded_weight, + expert_id=expert_id, + ) + return True if return_success else None + + # If the weight is w13_weight_scale and w13_weight_scales are + # combined into single loaded_weight, call + # _load_combined_w13_weight_scale() to load it. + # This is checked by comparing the hidden_out dims of the + # loaded_weight and the param. 
+ if "w13_weight_scale" in weight_name: + loaded_weight_hidden_out = loaded_weight.shape[-2] + param_hidden_out = param.data.shape[-2] * self.tp_size + if loaded_weight_hidden_out == param_hidden_out: + self._load_combined_w13_weight_scale( + shard_dim=shard_dim, + loaded_weight=loaded_weight, + param=param, + tp_rank=self.tp_rank, + ) + return True if return_success else None + + # For other weights, call _load_model_weight_or_group_weight_scale() + # to load it. + if "weight" in weight_name: + self._load_model_weight_or_group_weight_scale( + shard_id=shard_id, + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=self.tp_rank, + ) + return True if return_success else None + + # Case weight scales, zero_points and offset, weight/input global scales + if "scale" in weight_name or "zero" in weight_name or "offset" in weight_name: + # load the weight scales and zp based on the quantization scheme + # supported weight scales/zp can be found in + # FusedMoeWeightScaleSupported + # TODO @dsikka: once hardened, refactor to use vLLM Parameters + # specific to each case + quant_method = getattr(param, "quant_method", None) + if quant_method == FusedMoeWeightScaleSupported.CHANNEL.value: + self._load_per_channel_weight_scale( + shard_id=shard_id, + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=self.tp_rank, + ) + elif quant_method in [ + FusedMoeWeightScaleSupported.GROUP.value, + FusedMoeWeightScaleSupported.BLOCK.value, + ]: + self._load_model_weight_or_group_weight_scale( + shard_id=shard_id, + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=self.tp_rank, + load_full_w2=getattr(param, "load_full_w2", False), + ) + elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value: + self._load_per_tensor_weight_scale( + shard_id=shard_id, + param=param, + loaded_weight=loaded_weight, + expert_id=expert_id, + ) + else: + WEIGHT_SCALE_SUPPORTED = [e.value 
for e in FusedMoeWeightScaleSupported] + raise ValueError( + f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}" + ) + return True if return_success else None + + # Case weight_shape + if "weight_shape" in weight_name: + # only required by compressed-tensors + self._load_single_value( + param=param, loaded_weight=loaded_weight, expert_id=expert_id + ) + return True if return_success else None + + # Case model weights + if "weight" in weight_name: + if self.opt_level != 0: + scale_name = weight_name.split('.')[-1] + "_scale" + params_dict = dict(self.named_parameters()) + scale_param = params_dict[scale_name] + shard_dim_scale = getattr(scale_param, "shard_dim", None) + scale_expert_data = scale_param.data if full_load else scale_param.data[expert_id] + self._load_model_opt_weight_or_group_weight_scale( + shard_id=shard_id, + shard_dim=shard_dim, + shard_dim_scale=shard_dim_scale, + loaded_weight=loaded_weight, + expert_data=expert_data, + scale_data=scale_expert_data, + opt_level=self.opt_level, + tp_rank=self.tp_rank) + else: + self._load_model_weight_or_group_weight_scale( + shard_id=shard_id, + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=self.tp_rank) + return True if return_success else None + + return False if return_success else None + + def load_weights( + self, weights: Iterable[tuple[str, torch.Tensor]] + ) -> Iterable[str]: + if (expert_mapping := self.expert_mapping) is None: + raise ValueError( + "`self.expert_mapping` must be provided to " + "load weights using `self.load_weights`." 
+ ) + for expert_name, loaded_weight in weights: + qual_name = f"{self.layer_name}.{expert_name}" + for param_name, weight_name, expert_id, shard_id in expert_mapping: + if weight_name not in qual_name: + continue + weight_name = qual_name.replace(weight_name, param_name) + param_name = weight_name.removeprefix(f"{self.layer_name}.") + param = getattr(self, param_name) + success = self.weight_loader( + param=param, + loaded_weight=loaded_weight, + weight_name=weight_name, + shard_id=shard_id, + expert_id=expert_id, + return_success=True, + ) + if success: + logger.debug( + "Loaded %s for expert %d into %s", + param_name, + expert_id, + self.layer_name, + ) + yield param_name + + def get_expert_weights(self) -> Iterable[torch.Tensor]: + weights = list(self.named_parameters()) + assert all( + weight.is_contiguous() + for name, weight in weights + if not name.startswith("_shared_experts.") + ) + + # Filter out the non-expert weights. + # `e_score_correction_bias` is a bias for each logical expert, + # with shape (num_logical_experts,), not an expert weight. + NON_EXPERT_WEIGHTS = { + "e_score_correction_bias", + } + + return [ + weight.view(self.local_num_experts, -1) + for name, weight in weights + if name not in NON_EXPERT_WEIGHTS + and weight.shape != torch.Size([]) + and not name.startswith("_shared_experts.") + # exclude parameters from non-expert submodules (e.g. gate/shared) + and not name.startswith("_gate.") + ] + + def set_eplb_state( + self, + moe_layer_idx: int, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + """ + Register the EPLB state in this layer. + + This is used later in forward pass, where we get the expert mapping + and record the load metrics in `expert_load_view`. 
+ """ + self.expert_load_view = expert_load_view[moe_layer_idx] + self.logical_to_physical_map = logical_to_physical_map[moe_layer_idx] + self.logical_replica_count = logical_replica_count[moe_layer_idx] + + def ensure_moe_quant_config_init(self): + if self.quant_method.moe_quant_config is None: + # Note: the moe_quant_config can't be constructed until after + # weight loading post processing. + self.quant_method.moe_quant_config = ( + self.quant_method.get_fused_moe_quant_config(self) + ) + + @property + def moe_quant_config(self) -> FusedMoEQuantConfig | None: + self.ensure_moe_quant_config_init() + return self.quant_method.moe_quant_config + + def ensure_dp_chunking_init(self): + if not self.use_dp_chunking or self.batched_hidden_states is not None: + return + + states_shape: tuple[int, ...] + logits_shape: tuple[int, ...] + + moe = self.moe_config + + if self.vllm_config.parallel_config.enable_dbo: + states_shape = (2, moe.max_num_tokens, self.hidden_size) + logits_shape = (2, moe.max_num_tokens, self.logical_num_experts) + else: + states_shape = (moe.max_num_tokens, self.hidden_size) + logits_shape = (moe.max_num_tokens, self.logical_num_experts) + + self.batched_hidden_states = torch.zeros( + states_shape, dtype=moe.in_dtype, device=torch.cuda.current_device() + ) + + self.batched_router_logits = torch.zeros( + logits_shape, dtype=moe.in_dtype, device=torch.cuda.current_device() + ) + + @staticmethod + def select_experts( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + use_grouped_topk: bool, + renormalize: bool, + topk_group: int | None = None, + num_expert_group: int | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + indices_type: torch.dtype | None = None, + enable_eplb: bool = False, + expert_map: torch.Tensor | None = None, + expert_load_view: torch.Tensor | None = None, + 
logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + global_num_experts: int | None = None, + zero_expert_num: int | None = None, + zero_expert_type: str | None = None, + num_fused_shared_experts: int = 0, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Route the input hidden states to the top-k experts based on the + router logits. + + Returns: + (topk_weights, topk_ids, zero_expert_result) + (tuple[torch.Tensor, torch.Tensor, torch.Tensor]): + The weights, expert ids, and zero expert computation result. + + **Compatibility**: When EPLB is not enabled, the returned ids are + equivalent to global logical ids, so should be compatible with + plain MoE implementations without redundant experts. + """ + from vllm.model_executor.layers.fused_moe.fused_moe import ( + fused_topk, + fused_topk_bias, + ) + from ixformer.inference.functions import moe_grouped_topk as grouped_topk + + # Check if we should use a routing simulation strategy + routing_strategy = envs.VLLM_MOE_ROUTING_SIMULATION_STRATEGY + if routing_strategy != "": + topk_weights, topk_ids = RoutingSimulator.simulate_routing( + hidden_states=hidden_states, + router_logits=router_logits, + strategy_name=routing_strategy, + top_k=top_k, + indices_type=indices_type, + ) + + # DeepSeekv2 uses grouped_top_k + elif use_grouped_topk: + assert topk_group is not None + assert num_expert_group is not None + topk_weights, topk_ids = grouped_topk( + gating_output=router_logits, + topk=top_k, + renormalize=renormalize, + num_expert_group=num_expert_group, + topk_group=topk_group, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias) + if indices_type is not None: + topk_ids = topk_ids.to(dtype=indices_type) + elif e_score_correction_bias is not None: + topk_weights, topk_ids = fused_topk_bias( + hidden_states=hidden_states, + gating_output=router_logits, + e_score_correction_bias=e_score_correction_bias.data, + topk=top_k, + 
renormalize=renormalize, + ) + if routed_scaling_factor is not None: + topk_weights *= routed_scaling_factor + elif custom_routing_function is None: + topk_weights, topk_ids, token_expert_indices = fused_topk( + hidden_states=hidden_states, + gating_output=router_logits, + topk=top_k, + renormalize=renormalize, + indices_type=indices_type, + ) + else: + topk_weights, topk_ids = custom_routing_function( + hidden_states=hidden_states, + gating_output=router_logits, + topk=top_k, + renormalize=renormalize, + ) + if indices_type is not None: + topk_ids = topk_ids.to(dtype=indices_type) + + if enable_eplb: + assert expert_load_view is not None + assert logical_to_physical_map is not None + assert logical_replica_count is not None + + topk_ids = eplb_map_to_physical_and_record( + topk_ids=topk_ids, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + indices_type=indices_type, + ) + + assert topk_ids.dtype == indices_type or indices_type is None + + topk_ids = topk_ids.to(torch.int32) + + # Compute zero expert result if needed + if ( + zero_expert_num is not None + and zero_expert_num > 0 + and zero_expert_type is not None + and global_num_experts is not None + ): + zero_expert_result = zero_experts_compute_triton( + expert_indices=topk_ids, + expert_scales=topk_weights, + num_experts=global_num_experts, + zero_expert_type=zero_expert_type, + hidden_states=hidden_states, + ) + else: + zero_expert_result = None + return topk_weights, topk_ids, zero_expert_result + + def must_reduce_shared_expert_outputs(self) -> bool: + """ + The shared_experts are typically computed using the RowParallelLinear + layer. The result of this function is typically used as + the reduce_results argument to the module. + When just tensor-parallel is used, it is not required to reduce + the shared_experts results immediately. Instead we reduce at the + once at the end of the MoE op. 
(Refer to DeepSeekV2MoE module) + With EP and all2all kernels - this is no longer viable as all + GPU ranks in DP, produce the complete set of hidden_states. + Therefore it is required that we reduce the shared_experts output + early. + """ + assert self.quant_method is not None + return ( + isinstance(self.quant_method, FusedMoEModularMethod) + and self.quant_method.fused_experts.output_is_reduced() + ) + + def maybe_all_reduce_tensor_model_parallel(self, final_hidden_states: torch.Tensor): + """ + Some combine kernels reduce across GPU ranks by default. + """ + if self.must_reduce_shared_expert_outputs(): + return final_hidden_states + else: + return tensor_model_parallel_all_reduce(final_hidden_states) + + def forward_native( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + og_hidden_states = hidden_states.shape[-1] + if self.hidden_size != og_hidden_states: + hidden_states = F.pad( + hidden_states, + (0, self.hidden_size - og_hidden_states), + mode="constant", + value=0.0, + ) + + def reduce_output(states: torch.Tensor) -> torch.Tensor: + if ( + not self.is_sequence_parallel + and not self.use_dp_chunking + and self.reduce_results + and (self.tp_size > 1 or self.ep_size > 1) + ): + states = self.maybe_all_reduce_tensor_model_parallel(states) + return states + + if self.shared_experts is None: + # if current_platform.is_tpu(): + # # TODO: Once the OOM issue for the TPU backend is resolved, we + # # will switch to using the moe_forward custom op. 
+ fused_output = self.forward_impl(hidden_states, router_logits) + assert not isinstance(fused_output, tuple) + # else: + # fused_output = torch.ops.vllm.moe_forward( + # hidden_states, router_logits, self.layer_name + # ) + if self.zero_expert_num is not None and self.zero_expert_num > 0: + assert isinstance(fused_output, tuple) + fused_output, zero_expert_result = fused_output + return (reduce_output(fused_output) + zero_expert_result)[ + ..., :og_hidden_states + ] + else: + return reduce_output(fused_output)[..., :og_hidden_states] + else: + # if current_platform.is_tpu(): + # # TODO: Once the OOM issue for the TPU backend is resolved, we + # # will switch to using the moe_forward custom op. + shared_output, fused_output = self.forward_impl( + hidden_states, router_logits + ) + # else: + # shared_output, fused_output = torch.ops.vllm.moe_forward_shared( + # hidden_states, router_logits, self.layer_name + # ) + return ( + reduce_output(shared_output)[..., :og_hidden_states], + reduce_output(fused_output)[..., :og_hidden_states], + ) + + def forward_cuda( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + return self.forward_native(hidden_states, router_logits) + + def forward_impl_chunked( + self, + full_hidden_states: torch.Tensor, + full_router_logits: torch.Tensor, + has_separate_shared_experts: bool, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + assert self.batched_hidden_states is not None + assert self.batched_router_logits is not None + assert self.batched_hidden_states.dtype == full_hidden_states.dtype + assert self.batched_router_logits.dtype == full_router_logits.dtype + # Check size compatibility. 
+ assert self.batched_hidden_states.size(-1) == full_hidden_states.size(-1) + assert self.batched_router_logits.size(-1) == full_router_logits.size(-1) + + full_fused_final_hidden_states = torch.empty_like(full_hidden_states) + if self.shared_experts is not None: + full_shared_final_hidden_states = torch.empty_like(full_hidden_states) + + def process_chunk(chunk_start, chunk_end, skip_result_store=False): + chunk_size = chunk_end - chunk_start + hidden_states = full_hidden_states[chunk_start:chunk_end, :] + router_logits = full_router_logits[chunk_start:chunk_end, :] + + assert self.batched_hidden_states is not None + assert self.batched_router_logits is not None + # This is only true when DBO has been enabled in the config. + # Both tensors will have an outer dimension for the ubatch id + if self.batched_hidden_states.dim() == 3: + assert self.batched_router_logits.dim() == 3 + batch_buffer_idx = dbo_current_ubatch_id() + batched_hidden_states = self.batched_hidden_states[batch_buffer_idx, :] + batched_router_logits = self.batched_router_logits[batch_buffer_idx, :] + else: + batched_hidden_states = self.batched_hidden_states + batched_router_logits = self.batched_router_logits + + assert ( + batched_hidden_states.size(0) # type: ignore + >= chunk_size + ) + assert ( + batched_router_logits.size(0) # type: ignore + >= chunk_size + ) + staged_hidden_states = batched_hidden_states[:chunk_size, :] # type: ignore + staged_router_logits = batched_router_logits[:chunk_size, :] # type: ignore + staged_hidden_states.copy_(hidden_states, non_blocking=True) + staged_router_logits.copy_(router_logits, non_blocking=True) + + # Matrix multiply. 
+ final_hidden_states = self.quant_method.apply( + layer=self, + x=staged_hidden_states, + router_logits=staged_router_logits, + top_k=self.top_k, + renormalize=self.renormalize, + use_grouped_topk=self.use_grouped_topk, + global_num_experts=self.global_num_experts, + expert_map=self.expert_map + if not self.rocm_aiter_fmoe_enabled + else self.expert_mask, + topk_group=self.topk_group, + num_expert_group=self.num_expert_group, + custom_routing_function=self.custom_routing_function, + scoring_func=self.scoring_func, + routed_scaling_factor=self.routed_scaling_factor, + e_score_correction_bias=self.e_score_correction_bias, + activation=self.activation, + enable_eplb=self.enable_eplb, + expert_load_view=self.expert_load_view, + logical_to_physical_map=self.logical_to_physical_map, + logical_replica_count=self.logical_replica_count, + ) + + if has_separate_shared_experts: + assert not isinstance(final_hidden_states, tuple) + assert self.shared_experts is not None + + shared_output = self.shared_experts(staged_hidden_states) + + final_hidden_states = ( + shared_output, + final_hidden_states, + ) + + if self.zero_expert_num is not None and self.zero_expert_num > 0: + assert isinstance(final_hidden_states, tuple) + assert self.shared_experts is None + final_hidden_states, zero_expert_result = final_hidden_states + if zero_expert_result is not None: + final_hidden_states += zero_expert_result + + if not skip_result_store: + if self.shared_experts is None: + full_fused_final_hidden_states[chunk_start:chunk_end, :].copy_( + final_hidden_states, non_blocking=True + ) + else: + full_shared_final_hidden_states[chunk_start:chunk_end, :].copy_( + final_hidden_states[0], non_blocking=True + ) + full_fused_final_hidden_states[chunk_start:chunk_end, :].copy_( + final_hidden_states[1], non_blocking=True + ) + + ctx = get_forward_context() + # flashinfer_cutlass_kernels can handle: optional DP + TP/EP + max_tokens_across_dispatchers = ctx.dp_metadata.max_tokens_across_dp_cpu + 
moe_dp_chunk_size_per_rank = self.moe_config.max_num_tokens + + # If the input to the MoE is sequence parallel then divide by sp_size + # to find the maximum number of tokens for any individual dispatcher. + if self.is_sequence_parallel: + max_tokens_across_dispatchers = cdiv( + max_tokens_across_dispatchers, self.sp_size + ) + + num_tokens = full_hidden_states.size(0) + for chunk_idx, chunk_start_ in enumerate( + range(0, max_tokens_across_dispatchers, moe_dp_chunk_size_per_rank) + ): + chunk_start = chunk_start_ + chunk_end = min( + chunk_start + moe_dp_chunk_size_per_rank, max_tokens_across_dispatchers + ) + # clamp start and end + chunk_start = min(chunk_start, num_tokens - 1) + chunk_end = min(chunk_end, num_tokens) + with ctx.dp_metadata.chunked_sizes( + self.sp_size, moe_dp_chunk_size_per_rank, chunk_idx + ): + process_chunk( + chunk_start, chunk_end, skip_result_store=chunk_start_ >= num_tokens + ) + + if self.shared_experts is None: + return full_fused_final_hidden_states + else: + return (full_shared_final_hidden_states, full_fused_final_hidden_states) + + def forward_impl( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + assert self.quant_method is not None + + self.ensure_moe_quant_config_init() + self.ensure_dp_chunking_init() + + has_separate_shared_experts = ( + not isinstance(self.quant_method, FusedMoEModularMethod) + and self.shared_experts is not None + ) + + use_chunked_impl = self.use_dp_chunking + + use_shared_experts_stream = ( + has_separate_shared_experts + and not use_chunked_impl + and self.shared_experts_stream is not None + and ( + hidden_states.shape[0] + <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD + ) + ) + + if use_shared_experts_stream: + assert self.shared_experts_stream is not None + + # Clone BEFORE switching streams to avoid race condition + # where routed_expert kernel may mutate hidden_states. 
+ hidden_states_clone = hidden_states.clone() + + # Record that the clone will be used by shared_experts_stream + # to avoid gc issue from deallocation of hidden_states_clone + # For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501 + # NOTE: We dont need shared_output.record_stream(current_stream()) + # because we synch the streams before using shared_output. + hidden_states_clone.record_stream(self.shared_experts_stream) + + # Mark sync start point for the separate shared experts + # stream here since we want to run in parallel with the + # router/gate (next op below) + assert self.shared_experts_stream is not None + self.shared_experts_stream.wait_stream(current_stream()) + + # If router/gate provided, then apply it here. + # (Note: This code runs only when "overlapped mode" is on to allow + # parallel execution of shared experts with the FusedMoE via + # separate cuda stream) + if self.gate is not None: + router_logits, _ = self.gate(hidden_states) + + if use_chunked_impl: + return self.forward_impl_chunked( + hidden_states, router_logits, has_separate_shared_experts + ) + + do_naive_dispatch_combine: bool = self.dp_size > 1 and not isinstance( + self.quant_method, FusedMoEModularMethod + ) + + ctx = get_forward_context() + sp_ctx = ( + ctx.dp_metadata.sp_local_sizes(self.sp_size) + if ctx.dp_metadata + else nullcontext() + ) + + with sp_ctx: + if do_naive_dispatch_combine: + hidden_states_combined, router_logits = get_ep_group().dispatch( + hidden_states, router_logits, self.is_sequence_parallel + ) + + # Matrix multiply. 
+ final_hidden_states = self.quant_method.apply( + layer=self, + x=hidden_states_combined + if do_naive_dispatch_combine + else hidden_states, + router_logits=router_logits, + top_k=self.top_k, + renormalize=self.renormalize, + use_grouped_topk=self.use_grouped_topk, + global_num_experts=self.global_num_experts, + expert_map=self.expert_map + if not self.rocm_aiter_fmoe_enabled + else self.expert_mask, + topk_group=self.topk_group, + num_expert_group=self.num_expert_group, + custom_routing_function=self.custom_routing_function, + scoring_func=self.scoring_func, + routed_scaling_factor=self.routed_scaling_factor, + e_score_correction_bias=self.e_score_correction_bias, + activation=self.activation, + apply_router_weight_on_input=self.apply_router_weight_on_input, + enable_eplb=self.enable_eplb, + expert_load_view=self.expert_load_view, + logical_to_physical_map=self.logical_to_physical_map, + logical_replica_count=self.logical_replica_count, + ) + + if has_separate_shared_experts: + assert self.shared_experts is not None + + if use_shared_experts_stream: + # Run shared experts in parallel on a separate stream + # NOTE: We start the separate stream here and mark the + # sync end point immediately after it is done. This is + # important to avoid excessive stream allocations by the cuda + # graph replay later. 
+ with torch.cuda.stream(self.shared_experts_stream): + # Note that hidden_states clone() is necessary here to avoid + # conflict with the main stream + shared_output = self.shared_experts(hidden_states_clone) + current_stream().wait_stream(self.shared_experts_stream) + else: + shared_output = self.shared_experts(hidden_states) + + final_hidden_states = ( + shared_output, + final_hidden_states, + ) + elif self.zero_expert_num is not None and self.zero_expert_num > 0: + assert isinstance(final_hidden_states, tuple) + final_hidden_states, zero_expert_result = final_hidden_states + + def combine_output(states: torch.Tensor) -> torch.Tensor: + if do_naive_dispatch_combine: + states = get_ep_group().combine(states, self.is_sequence_parallel) + return states + + if self.shared_experts is not None: + return ( + final_hidden_states[0], + combine_output(final_hidden_states[1]), + ) + elif self.zero_expert_num is not None and self.zero_expert_num > 0: + assert isinstance(final_hidden_states, torch.Tensor) + return (combine_output(final_hidden_states), zero_expert_result) + else: + return combine_output(final_hidden_states) + + @classmethod + def make_expert_params_mapping( + cls, + ckpt_gate_proj_name: str, + ckpt_down_proj_name: str, + ckpt_up_proj_name: str, + num_experts: int, + num_redundant_experts: int = 0, + ) -> list[tuple[str, str, int, str]]: + num_physical_experts = num_experts + num_redundant_experts + + # In the returned mapping: + # - `expert_id` is the physical expert id + # - `weight_name` contains the weight name of the logical expert + # So that we should map the expert id to logical in `weight_name` + physical_to_logical_map = ( + EplbState.build_initial_global_physical_to_logical_map( + num_experts, num_redundant_experts + ) + ) + + return [ + # (param_name, weight_name, expert_id, shard_id) + ( + "experts.w13_" + if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name] + else "experts.w2_", + 
f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.", + expert_id, + shard_id, + ) + for expert_id in range(num_physical_experts) + for shard_id, weight_name in [ + ("w1", ckpt_gate_proj_name), + ("w2", ckpt_down_proj_name), + ("w3", ckpt_up_proj_name), + ] + ] + + def extra_repr(self) -> str: + s = ( + f"global_num_experts={self.global_num_experts}, " + f"local_num_experts={self.local_num_experts}, " + f"top_k={self.top_k}, " + f"intermediate_size_per_partition={self.intermediate_size_per_partition}, " # noqa: E501 + f"tp_size={self.tp_size},\n" + f"ep_size={self.ep_size}, " + f"reduce_results={self.reduce_results}, " + f"renormalize={self.renormalize}, " + f"use_grouped_topk={self.use_grouped_topk}" + ) + + if self.use_grouped_topk: + s += f", num_expert_group={self.num_expert_group}, topk_group={self.topk_group}" # noqa: E501 + + s += f", scoring_func='{self.scoring_func}', activation='{self.activation}'" # noqa: E501 + + return s + + +def moe_forward( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + layer_name: str, +) -> torch.Tensor: + forward_context: ForwardContext = get_forward_context() + self = forward_context.no_compile_layers[layer_name] + assert self.shared_experts is None + return self.forward_impl(hidden_states, router_logits) + + +def moe_forward_fake( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + layer_name: str, +) -> torch.Tensor: + return torch.empty_like(hidden_states) + + +direct_register_custom_op( + op_name="moe_forward", + op_func=moe_forward, + mutates_args=["hidden_states"], + fake_impl=moe_forward_fake, + tags=(torch.Tag.needs_fixed_stride_order,), +) + + +def moe_forward_shared( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + layer_name: str, +) -> tuple[torch.Tensor, torch.Tensor]: + forward_context: ForwardContext = get_forward_context() + self = forward_context.no_compile_layers[layer_name] + assert self.shared_experts is not None + return self.forward_impl(hidden_states, 
router_logits) + + +def moe_forward_shared_fake( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + layer_name: str, +) -> tuple[torch.Tensor, torch.Tensor]: + shared_out = torch.empty_like(hidden_states) + fused_out = torch.empty_like(hidden_states) + return shared_out, fused_out + + +direct_register_custom_op( + op_name="moe_forward_shared", + op_func=moe_forward_shared, + mutates_args=["hidden_states"], + fake_impl=moe_forward_shared_fake, + tags=(torch.Tag.needs_fixed_stride_order,), +) + +# Mark the FusedMoE weight_loader as supporting MoE-specific parameters +# to avoid expensive runtime reflection in model loading code +FusedMoE.weight_loader.supports_moe_loading = True # type: ignore[attr-defined] diff --git a/model_executor/layers/fused_moe/modular_kernel.py b/model_executor/layers/fused_moe/modular_kernel.py new file mode 100644 index 0000000..9984b89 --- /dev/null +++ b/model_executor/layers/fused_moe/modular_kernel.py @@ -0,0 +1,1222 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod +from collections.abc import Callable +from dataclasses import dataclass +from enum import Enum +from math import prod +from typing import final + +import torch + +import vllm.envs as envs +from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig +from vllm.model_executor.layers.fused_moe.utils import ( + _resize_cache, + count_expert_num_tokens, + disable_inplace, +) +from vllm.utils.math_utils import cdiv +from vllm.v1.worker.ubatching import ( + dbo_current_ubatch_id, + dbo_enabled, + dbo_maybe_run_recv_hook, + dbo_register_recv_hook, + dbo_yield, +) + +# +# This file defines a set of base classes used to make MoE kernels more modular. +# The goal is to be able to utilize different communication mechanisms with +# any fused MoE kernel without needing to have combinatoric implementations. 
+# +# The fused moe kernels are broken down into the following components: +# +# [Router] → [Quantize-Dispatch] → [Permute-Experts-Unpermute] → [Combine] +# +# Each component will be independent of (but may inform) the others except for +# [Quantize-Dispatch] and `[Combine] (see below). The components can then be +# mixed and matched with so that DP+EP can be supported easily for multiple +# MoE kernel implementations. +# +# The following main classes are defined: +# * FusedMoEPrepareAndFinalize - an abstract base class for preparation of MoE +# inputs (e.g. quantization, distribution) and finalization of Moe outputs. +# The prepare method must take care of any needed quantization and the +# finalize method, informed by the FusedMoEPermuteExpertsUnpermute method, +# may apply weights and/or do the final reduction of the output. +# * FusedMoEPermuteExpertsUnpermute - an abstract base class for the main fused +# MoE operation, i.e matmul + act_mul + optionally quant + matmul. +# Some FusedMoEPermuteExpertsUnpermute implementations may choose to do +# the weight application and/or reduction. The class communicates this +# to [Finalize] via a TopKWeightAndReduce object. +# * FusedMoEModularKernel - an interface class that combines a +# FusedMoEPrepareAndFinalize and a FusedMoEPermuteExpertsUnpermute to +# provide the standard fused MoE kernel interface. +# * TopKWeightAndReduce - A TopKWeightAndReduce implementation chosen +# by the FusedMoEPermuteExpertsUnpermute implementation that is passed +# on to [Finalize]. +# +# [Quantize-Prepare] and [Finalize] functionality are bundled into a single +# class `FusedMoEPrepareAndFinalize` since they could use collective +# communication mechanisms that need to be consistent. +# + + +class FusedMoEActivationFormat(Enum): + """ + The standard activation format (num_tokens, hidden dim). 
+ """ + + Standard = ("standard",) + """ + The batched experts format (num experts, max tokens per expert, hidden dim) + """ + BatchedExperts = ("batched_experts",) + + +@dataclass +class ExpertTokensMetadata: + """ + Metadata regarding expert-token routing. + """ + + expert_num_tokens: torch.Tensor + expert_num_tokens_cpu: torch.Tensor | None + + @staticmethod + def make_from_list( + expert_num_tokens_list: list[int], device: str + ) -> "ExpertTokensMetadata": + expert_num_tokens_cpu = torch.tensor( + expert_num_tokens_list, device="cpu", dtype=torch.int32 + ) + return ExpertTokensMetadata( + expert_num_tokens=expert_num_tokens_cpu.to(device, non_blocking=True), + expert_num_tokens_cpu=expert_num_tokens_cpu, + ) + + +class TopKWeightAndReduce(ABC): + """ + An abstract base class for weight application and reduction implementations. + """ + + @abstractmethod + def apply( + self, + output: torch.Tensor | None, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + ) -> torch.Tensor: + """ + Apply topk_weights to the fused_experts_outputs and/or reduce. + If an output tensor is not passed, it will be created in the + function. + """ + raise NotImplementedError + + +# +# PrepareResultType is a tuple of: +# - quantized + dispatched a. +# - quantized + dispatched a1_scales. +# - Optional ExpertTokensMetadata containing gpu/cpu tensors +# as big as the number of local experts with the information about the +# number of tokens assigned to each local expert. +# - Optional dispatched expert topk IDs +# - Optional dispatched expert topk weight +# +# See `prepare` method below. +# +PrepareResultType = tuple[ + torch.Tensor, + torch.Tensor | None, + ExpertTokensMetadata | None, + torch.Tensor | None, + torch.Tensor | None, +] + +ReceiverType = Callable[[], PrepareResultType] + + +# TODO: pass FusedMoEParallelConfig in as ctor parameter? 
+class FusedMoEPrepareAndFinalize(ABC): + """ + An abstract base class for the [Quantize-Prepare] and [Finalize] steps + described above. + """ + + def post_init_setup(self, fused_experts: "FusedMoEPermuteExpertsUnpermute"): + """ + Initialize FusedMoEPrepareAndFinalize settings that depend on + FusedMoEPermuteExpertsUnpermute experts object. + The FusedMoEPrepareAndFinalize implementations that have such + dependencies may choose to override this function. + """ + return + + @abstractmethod + def prepare( + self, + a1: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: torch.Tensor | None, + apply_router_weight_on_input: bool, + quant_config: FusedMoEQuantConfig, + ) -> PrepareResultType: + """ + Perform any quantization (and/or) dispatching needed for this kernel. + - a1: The (unquantized) input to the MoE layer. + - topk_ids: The topk ids. + - topk_weights: The topk weights. + - num_experts: The total number of experts in the global expert space. + - expert_map: A tensor mapping expert indices from the global expert + space to the local expert space of the expert parallel shard. + - apply_router_weight_on_input: When True, apply the weights to the + activations, before quantization + dispatching. + - quant_config: Quantization info provided by the fused experts. + + Returns a tuple of: + - quantized + dispatched a. + - Optional quantized + dispatched a1_scales. + - Optional ExpertTokensMetadata containing gpu/cpu tensors + as big as the number of local experts with the information about the + number of tokens assigned to each local expert. + - Optional dispatched expert topk IDs + - Optional dispatched expert topk weight + """ + raise NotImplementedError + + def supports_async(self) -> bool: + """ + Indicates whether or not this class implements prepare_async and + finalize_async. 
+ """ + return False + + def prepare_async( + self, + a1: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: torch.Tensor | None, + apply_router_weight_on_input: bool, + quant_config: FusedMoEQuantConfig, + ) -> tuple[Callable, ReceiverType] | ReceiverType: + """ + Perform any quantization (and/or) dispatching needed for this kernel + but do not wait for results from other workers. + - a1: The (unquantized) input to the MoE layer. + - a1_scale: Optional scales for a1 + - a2_scale: Optional scales for the second MoE gemm. Required to make + sure the quantization is consistent for both gemms. + - topk_ids: The topk ids. + - topk_weights: The topk weights. + - num_experts: The total number of experts in the global expert space. + - expert_map: A tensor mapping expert indices from the global expert + space to the local expert space of the expert parallel shard. + - apply_router_weight_on_input: When True, apply the weights to the + activations, before quantization + dispatching. + + Returns a callback or a hook callback pair that when invoked waits for + results from other workers and has the same return signature as + `prepare`, if a hook is returned this is more lightweight check that + the recv is complete without doing extra work (used by DBO, will be + refactored in the very near future) + + e.g. + + ret = obj.prepare_async(...) + + if isinstance(ret, tuple): + hook, receiver = ret + hook() + + if hook is not None: + a, a_scales, expert_meta, topk_ids, topk_weights = receiver() + + is equivalent to: + + a, a_scales, expert_meta, topk_ids, topk_weights = obj.prepare(...) 
+ """ + raise NotImplementedError + + @abstractmethod + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: TopKWeightAndReduce, + ) -> None: + """ + Perform any combine plus apply weights and perform a reduction on the + fused experts output. + - output: The output tensor, written in place. Must be (M, K) shape. + - fused_expert_output: The unweighted, unreduced output of the fused + experts, it will have (M, topk, K) shape. + - topk_weights: The weights to be applied to the fused_experts_output. + - topk_ids: The topk_ids. + - apply_router_weight_on_input: When False, apply the weights to + fused_expert_output. + - weight_and_reduce_impl: An optional TopKWeightAndReduce + implementation. + """ + raise NotImplementedError + + def finalize_async( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: TopKWeightAndReduce, + ) -> tuple[Callable, Callable] | Callable: + """ + Perform any combine plus apply weights and perform a reduction on the + fused experts output but do not wait for results from other workers. + - output: The output tensor, written in place. Must be (M, K) shape. + - fused_expert_output: The unweighted, unreduced output of the fused + experts, it will have (M, topk, K) shape. + - topk_weights: The weights to be applied to the fused_experts_output. + - topk_ids: The topk_ids. + - apply_router_weight_on_input: When False, apply the weights to + fused_expert_output. + - weight_and_reduce_impl: An optional TopKWeightAndReduce + implementation. 
+ + Returns a callback or a hook callback pair that when invoked waits for + results from other workers and has the same return signature as + `finalize`, if a hook is returned this is more lightweight check that + the recv is complete without doing extra work (used by DBO, will be + refactored in the very near future) + + ret = obj.finalize_async(output, ...) + ... output not valid yet ... + if isinstance(ret, tuple): + hook, receiver = ret + hook() + receiver() + ... output valid here ... + + is equivalent to: + + obj.finalize(output, ...) + """ + raise NotImplementedError + + @property + @abstractmethod + def activation_format(self) -> FusedMoEActivationFormat: + """ + A property indicating the output format of the activations for the + 'prepare' method. + """ + raise NotImplementedError + + @abstractmethod + def topk_indices_dtype(self) -> torch.dtype | None: + """ + The PrepareFinalize All2All implementations generally constrain the + dtype of the topk_ids they support. This function returns the + required topk indices dtype so it can be respected. + Return None if there are no such restrictions. + """ + raise NotImplementedError + + @abstractmethod + def max_num_tokens_per_rank(self) -> int | None: + """ + Some PrepareFinalize All2All implementations are batched. Meaning, + they can process only as set of tokens at a time. This + function returns the batch size i.e the maximum number of tokens + the implementation can process at a time. + Return None if there are no such restrictions. + """ + raise NotImplementedError + + @abstractmethod + def num_dispatchers(self) -> int: + raise NotImplementedError + + @abstractmethod + def output_is_reduced(self) -> bool: + """ + Indicates whether or not the output of finalize is reduced across all + ranks. 
# TODO: add supported activations method (return string)
class FusedMoEPermuteExpertsUnpermute(ABC):
    """
    An abstract base class for the [Permute-Experts-Unpermute] step of a
    modular fused MoE kernel.

    Subclasses implement `apply` (the expert computation) and describe their
    capabilities and scratch-space requirements through the remaining
    abstract methods. Quantization parameters are exposed as read-only
    properties that forward to `quant_config`.
    """

    def __init__(
        self,
        quant_config: FusedMoEQuantConfig,
    ):
        """
        quant_config: Quantization parameters for this experts instance.
        """
        self.quant_config = quant_config

    @property
    @abstractmethod
    def activation_formats(
        self,
    ) -> tuple[FusedMoEActivationFormat, FusedMoEActivationFormat]:
        """
        A property which is a tuple of the input and output activation formats
        for the 'apply' method.
        """
        raise NotImplementedError

    def moe_problem_size(
        self,
        a1: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_ids: torch.Tensor,
    ) -> tuple[int, int, int, int, int]:
        """
        Extract the MoE problem size from the given tensor arguments:
        - a1: The hidden states, input to the MoE layer (2-D or 3-D).
        - w1: The first set of expert weights (3-D).
        - w2: The second set of expert weights (3-D).
        - topk_ids: The topk ids (2-D).

        Returns (E, M, N, K, topk) where E and N are the first two sizes of
        w1, K is the trailing size of a1, M is a1.size(0) for a 2-D a1 or
        a1.size(1) for a 3-D a1, and topk is topk_ids.size(1).

        Note: extracting the problem shape from the weight and activation
        tensors is not obvious. It needs to be done this way specifically
        due to subtle issues with particular kernels, e.g. the int4 kernels
        divide the trailing dimension by two, so it's not "correct" to
        extract N or K from the trailing dimension of w1 or w2. Similarly,
        some kernels transpose the weights, so this needs to be kept in mind.

        Note: This implementation covers most cases. However, if experts
        require a specialized implementation, like MarlinExperts, they are free
        to override this function.
        """
        assert w1.dim() == 3 and w2.dim() == 3
        E, N, _ = w1.size()
        K = a1.size(-1)

        if a1.dim() == 2:
            # Make sure we are using the correct a1 (pre-permute).
            assert topk_ids.size(0) == a1.size(0), f"{topk_ids.size(0)} != {a1.size(0)}"
            M = a1.size(0)
        else:
            assert a1.dim() == 3
            # The message reports the mismatch that made the check fail.
            assert a1.size(0) == E, f"{a1.size(0)} != {E}"
            M = a1.size(1)  # This is max_num_tokens

        assert topk_ids.dim() == 2
        topk = topk_ids.size(1)

        return E, M, N, K, topk

    #
    # Various helpers for accessing quantization parameters from the
    # quant_config.
    #

    @property
    def quant_dtype(self) -> torch.dtype | None:
        return self.quant_config.quant_dtype

    @property
    def block_shape(self) -> list[int] | None:
        return self.quant_config.block_shape

    @property
    def per_act_token_quant(self) -> bool:
        return self.quant_config.per_act_token_quant

    @property
    def per_out_ch_quant(self) -> bool:
        return self.quant_config.per_out_ch_quant

    @property
    def a1_scale(self) -> torch.Tensor | None:
        return self.quant_config.a1_scale

    @property
    def a2_scale(self) -> torch.Tensor | None:
        return self.quant_config.a2_scale

    @property
    def a1_gscale(self) -> torch.Tensor | None:
        return self.quant_config.a1_gscale

    @property
    def a2_gscale(self) -> torch.Tensor | None:
        return self.quant_config.a2_gscale

    @property
    def w1_scale(self) -> torch.Tensor | None:
        return self.quant_config.w1_scale

    @property
    def w2_scale(self) -> torch.Tensor | None:
        return self.quant_config.w2_scale

    @property
    def w1_zp(self) -> torch.Tensor | None:
        return self.quant_config.w1_zp

    @property
    def w2_zp(self) -> torch.Tensor | None:
        return self.quant_config.w2_zp

    @property
    def w1_bias(self) -> torch.Tensor | None:
        return self.quant_config.w1_bias

    @property
    def w2_bias(self) -> torch.Tensor | None:
        return self.quant_config.w2_bias

    @property
    def g1_alphas(self) -> torch.Tensor | None:
        return self.quant_config.g1_alphas

    @property
    def g2_alphas(self) -> torch.Tensor | None:
        return self.quant_config.g2_alphas

    # TODO (bnell): make this return a CHUNK_SIZE or None instead?
    @abstractmethod
    def supports_chunking(self) -> bool:
        """
        A flag indicating whether or not this class supports activation
        chunking.
        """
        raise NotImplementedError

    @abstractmethod
    def supports_expert_map(self) -> bool:
        """
        A flag indicating whether or not this class supports expert maps
        """
        raise NotImplementedError

    def supports_packed_ue8m0_act_scales(self) -> bool:
        """
        A flag indicating whether or not this class can process packed ue8m0
        activation scales. The base implementation returns False.
        """
        return False

    def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype:
        """
        Workspace type: The dtype to use for the workspace tensors. The base
        implementation returns the activation dtype unchanged.
        """
        return act_dtype

    @abstractmethod
    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: ExpertTokensMetadata | None,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        """
        Compute the shapes for the temporary and final outputs of the two gemms
        and activation in the fused expert function. Since the gemms are
        independent, the workspace for the first gemm can be shared with the
        workspace for the last gemm.

        Inputs:
        - M: number of tokens.
        - N: Row (or column) dimension of expert weights.
        - K: hidden dimension
        - topk: The number of top-k experts to select.
        - global_num_experts: global number of experts.
        - local_num_experts: local number of experts due to DP/EP.
        - expert_tokens_meta: number of tokens per expert metadata for batched
                              format.

        Returns a tuple of:
        - workspace13 shape tuple: must be large enough to hold the
          result of either expert gemm.
        - workspace2 shape tuple: must be large enough to hold the
          result of the activation function.
        - output shape tuple: must be exact size of the final gemm output.
        - Note: workspace shapes can be 0 if the workspace is not needed.
          But in order for activation chunking to work, the first dimension
          of each tuple must be the number of tokens when the shape is
          not 0.
        """
        raise NotImplementedError

    def activation(
        self, activation: str, output: torch.Tensor, input: torch.Tensor
    ) -> None:
        """
        Run the gated activation named by `activation` ("silu", "gelu" or
        "swigluoai") through the matching `torch.ops._C.*_and_mul` op.

        The trailing dimension of `input` must be twice that of `output`.

        Raises ValueError for any other activation name.
        """
        assert output.size(-1) * 2 == input.size(-1)
        if activation == "silu":
            # NOTE(review): arguments are passed as (input, output) here but
            # as (output, input) in the other two branches. Left unchanged
            # because the op signature is not visible from this file --
            # confirm against the registered op.
            torch.ops._C.silu_and_mul(input, output)
        elif activation == "gelu":
            torch.ops._C.gelu_and_mul(output, input)
        elif activation == "swigluoai":
            # alpha = 1.702, limit = 7.0
            torch.ops._C.swigluoai_and_mul(output, input)
        else:
            raise ValueError(f"Unsupported FusedMoe activation: {activation}")

    def enable_chunking(self):
        """
        True only when chunking is both enabled through
        VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING and supported by this class.
        """
        return (
            envs.VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING and self.supports_chunking()
        )

    def finalize_weight_and_reduce_impl(self) -> TopKWeightAndReduce:
        """
        Return the TopKWeightAndReduce implementation to hand to finalize.
        Not abstract, but the base implementation raises NotImplementedError.
        """
        raise NotImplementedError

    @abstractmethod
    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: str,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ) -> None:
        """
        This function computes the intermediate result of a Mixture of Experts
        (MoE) layer using two sets of weights, w1 and w2.

        Parameters:
        - output: (torch.Tensor): The unweighted, unreduced output tensor.
        - hidden_states: (torch.Tensor): The (quantized) input tensor to the MoE
          layer.
        - w1 (torch.Tensor): The first set of expert weights.
        - w2 (torch.Tensor): The second set of expert weights.
        - topk_weights: A map of row to expert weights. Some implementations
          choose to do weight application.
        - topk_ids (torch.Tensor): A map of row to expert id.
        - activation (str): The activation function to apply after the first
          MoE layer.
        - global_num_experts (int): The total number of experts in the global
          expert space.
        - expert_map (Optional[torch.Tensor]):  A tensor mapping expert indices
          from the global expert space to the local expert space of the expert
          parallel shard.
        - a1q_scale (Optional[torch.Tensor]): Optional quantized scale to be
          used for a1.  Result of quantization from prepare/finalize and not
          from the FusedMoEQuantConfig.
        - a2_scale (Optional[torch.Tensor]): Optional scale for the second
          activation.
        - workspace13 (torch.Tensor): A scratch tensor used for gemm outputs
          must be large enough to hold output of either MoE gemm.
        - workspace2 (torch.Tensor): A scratch tensor used for the activation
          function.
        - expert_tokens_meta (Optional[ExpertTokensMetadata]) - An optional
          ExpertTokensMetadata object containing gpu/cpu tensors
          as big as the number of local experts with the information about the
          number of tokens assigned to each local expert.
        - apply_router_weight_on_input: True if router weights are already
          applied on the input. This is relevant if the implementation
          chooses to do weight application.
        """
        raise NotImplementedError
+ """ + raise NotImplementedError + + +def _slice_scales( + scales: torch.Tensor | None, start: int, end: int +) -> torch.Tensor | None: + if scales is not None: + if scales.numel() == 1: + return scales + else: + return scales[start:end] + return None + + +class SharedResizableBuffer: + def __init__(self): + self.buffer = None + + def get( + self, shape: tuple[int, ...], device: torch.device, dtype: torch.dtype + ) -> torch.Tensor: + assert shape != () + shape_numel = prod(shape) + if ( + self.buffer is None + or self.buffer.numel() < shape_numel + or self.buffer.device != device + or self.buffer.dtype != dtype + ): + self.buffer = torch.empty(shape_numel, device=device, dtype=dtype) + return self.buffer[:shape_numel].view(*shape) + + +@final +class FusedMoEModularKernel(torch.nn.Module): + """ + This class combines a FusedMoEPrepareAndFinalize instance and + a FusedMoEPermuteExpertsUnpermute to provide an interface that + is compatible with the `fused_experts` function in fused_moe.py. + + It takes care of managing any required scratch space. + + Note: Instances of this class should only be used for a single model + layer due to any layer specific state that may be used by the component + objects. + """ + + class SharedBuffers: + def __init__(self) -> None: + self.fused_out = SharedResizableBuffer() + self.workspace13 = SharedResizableBuffer() + self.workspace2 = SharedResizableBuffer() + + # Persistent buffers that are shared across `FusedMoEModularKernel` + # instances (layers), to save memory and allocattions. + # + # We have two sets of buffers to support dual batch overlap (DBO) where each + # microbatch (ubatch) should use its own set of buffers to avoid + # cross-ubatch contimination. + # NOTE that memory is lazily allocated for these buffers, meaning that if + # DBO isn't being used, the second SharedBuffers will be empty. 
+ shared_buffers: list[SharedBuffers] = [SharedBuffers(), SharedBuffers()] + + def __init__( + self, + prepare_finalize: FusedMoEPrepareAndFinalize, + fused_experts: FusedMoEPermuteExpertsUnpermute, + shared_experts: torch.nn.Module | None = None, + ): + super().__init__() + self.prepare_finalize = prepare_finalize + self.fused_experts = fused_experts + self.shared_experts = shared_experts + + self._post_init_setup() + assert ( + prepare_finalize.activation_format == fused_experts.activation_formats[0] + ), ( + f"{prepare_finalize.__class__.__name__}." + f"{prepare_finalize.activation_format} == " + f"{fused_experts.__class__.__name__}." + f"{fused_experts.activation_formats[0]}" + ) + + def _post_init_setup(self): + """ + Resolve any leftover setup dependencies between self.prepare_finalize + and self.fused_experts here. + """ + self.prepare_finalize.post_init_setup(self.fused_experts) + + def supports_expert_map(self) -> bool: + """ + A flag indicating whether or not this class supports expert maps. + """ + return self.fused_experts.supports_expert_map() + + def output_is_reduced(self) -> bool: + """ + Indicates whether or not the output of fused MoE kernel + is reduced across all ranks. + """ + return self.prepare_finalize.output_is_reduced() + + def _chunk_info(self, M: int) -> tuple[int, int]: + """ + Compute number of chunks and chunk size for given M. + If chunking is not supported, set the CHUNK_SIZE to M so we + get num_chunks == 1. Take max(M, 1) to avoid divide by zero. + If there are no tokens to process, the number of chunks will be zero. + """ + CHUNK_SIZE = max( + 1, + ( + M + if not self.fused_experts.supports_chunking() + else min(M, envs.VLLM_FUSED_MOE_CHUNK_SIZE) + ), + ) + num_chunks = cdiv(M, CHUNK_SIZE) + # If there are no tokens, then there should be no loop iterations. 
+ assert M > 0 or num_chunks == 0 + return num_chunks, CHUNK_SIZE + + def _allocate_buffers( + self, + out_dtype: torch.dtype, + device: torch.device, + M_chunk: int, + M_full: int, + N: int, + K: int, + top_k: int, + global_num_experts: int, + local_num_experts: int, + expert_tokens_meta: ExpertTokensMetadata | None, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Allocate temporary and output buffers for the fused experts op. + Inputs: + - out_dtype: output type of workspace and output tensors. + - device: the device of the workspace and output tensors. + See `workspace_shapes` for a description of the remainder of arguments. + Returns a tuple of (workspace13, workspace2, output) tensors. + """ + assert M_full > 0 and M_chunk > 0 + + num_chunks, _ = self._chunk_info(M_full) + + # select per-ubatch buffers to avoid cross-ubatch reuse under DBO + ubatch_idx = dbo_current_ubatch_id() + buffers = self.shared_buffers[ubatch_idx] + workspace_dtype = self.fused_experts.workspace_dtype(out_dtype) + + # Get intermediate workspace shapes based off the chunked M size. + workspace13_shape, workspace2_shape, _ = self.fused_experts.workspace_shapes( + M_chunk, + N, + K, + top_k, + global_num_experts, + local_num_experts, + expert_tokens_meta, + ) + + # Get final output shape based on the full M size. + _, _, fused_out_shape = self.fused_experts.workspace_shapes( + M_full, + N, + K, + top_k, + global_num_experts, + local_num_experts, + expert_tokens_meta, + ) + + # We can reuse the memory between cache1 and cache3 because by the + # time we need cache3, we're done with cache1. + workspace13 = buffers.workspace13.get( + workspace13_shape, device=device, dtype=workspace_dtype + ) + workspace2 = buffers.workspace2.get( + workspace2_shape, device=device, dtype=workspace_dtype + ) + + # Construct the entire output that can then be processed in chunks. + # Reuse workspace13 for the output in the non-chunked case as long + # as it is large enough. 
This will not always be the case for standard + # format experts and with experts that have empty workspaces. + if num_chunks == 1 and prod(workspace13_shape) >= prod(fused_out_shape): + fused_out = _resize_cache(workspace13, fused_out_shape) + else: + fused_out = buffers.fused_out.get( + fused_out_shape, device=device, dtype=out_dtype + ) + + return workspace13, workspace2, fused_out + + @staticmethod + def _slice_output_tensor( + fused_out: torch.Tensor, + chunk_idx: int, + num_chunks: int, + CHUNK_SIZE: int, + M: int, + ) -> torch.Tensor: + if num_chunks == 1: + return fused_out + + assert fused_out.size(0) % M == 0, f"fused_out shape {fused_out.shape} vs M {M}" + factor = fused_out.size(0) // M + out_chunk_size = CHUNK_SIZE * factor + s = chunk_idx * out_chunk_size + e = min(s + out_chunk_size, fused_out.size(0)) + return fused_out[s:e] + + @staticmethod + def _slice_expert_tokens_metadata( + num_chunks: int, + full_expert_tokens_meta: ExpertTokensMetadata | None, + chunk_topk_ids: torch.Tensor, + local_num_experts: int, + expert_map: torch.Tensor | None, + ) -> ExpertTokensMetadata | None: + if num_chunks == 1 or full_expert_tokens_meta is None: + return full_expert_tokens_meta + + # The existing expert_num_tokens is for the entire a1q + # input. Chunking forces recomputation of the number + # of tokens assigned to each expert. 
+ c_expert_num_tokens = count_expert_num_tokens( + chunk_topk_ids, local_num_experts, expert_map + ) + + c_expert_num_tokens_cpu = None + need_expert_num_tokens_cpu = ( + full_expert_tokens_meta.expert_num_tokens_cpu is not None + ) + if need_expert_num_tokens_cpu: + # This is blocking as some implementations need the count + # on the CPU to determine appropriate input/out fused-moe + # buffers + c_expert_num_tokens_cpu = c_expert_num_tokens.to("cpu", non_blocking=False) + + return ExpertTokensMetadata( + expert_num_tokens=c_expert_num_tokens, + expert_num_tokens_cpu=c_expert_num_tokens_cpu, + ) + + def _prepare( + self, + hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + global_num_experts: int, + expert_map: torch.Tensor | None, + apply_router_weight_on_input: bool, + ) -> tuple[ + torch.Tensor, + torch.Tensor | None, + ExpertTokensMetadata | None, + torch.Tensor, + torch.Tensor, + ]: + """ + The _prepare method is a wrapper around self.prepare_finalize.prepare + that handles DBO and async. + """ + if not self.prepare_finalize.supports_async(): + # We shouldn't be running an a2a kernel that doesn't + # support async prepare/finalize + # TODO(lucas): enable in follow-up + assert not dbo_enabled() + + ( + a1q, + a1q_scale, + expert_tokens_meta, + _expert_topk_ids, + _expert_topk_weights, + ) = self.prepare_finalize.prepare( + hidden_states, + topk_weights, + topk_ids, + global_num_experts, + expert_map, + apply_router_weight_on_input, + self.fused_experts.quant_config, + ) + else: + # Overlap shared expert compute with all2all dispatch. 
+ dbo_maybe_run_recv_hook() + prepare_ret = self.prepare_finalize.prepare_async( + hidden_states, + topk_weights, + topk_ids, + global_num_experts, + expert_map, + apply_router_weight_on_input, + self.fused_experts.quant_config, + ) + + # TODO(lucas): refactor this in the alternative schedules followup + # currently unpack if we have hook + receiver pair or just + # receiver (see finalize_async docstring) + hook, receiver = ( + prepare_ret if isinstance(prepare_ret, tuple) else (None, prepare_ret) + ) + + if hook is not None: + if dbo_enabled(): + # If DBO is being used, register the hook with the ubatch + # context and call it in dbo_maybe_run_recv_hook instead of + # passing it to the receiver. + dbo_register_recv_hook(hook) + dbo_yield() + else: + hook() + + ( + a1q, + a1q_scale, + expert_tokens_meta, + _expert_topk_ids, + _expert_topk_weights, + ) = receiver() + + # Maybe prepare gathered topk_ids and topk_weights from other EP ranks. + topk_ids = topk_ids if _expert_topk_ids is None else _expert_topk_ids + topk_weights = ( + topk_weights if _expert_topk_weights is None else _expert_topk_weights + ) + + return a1q, a1q_scale, expert_tokens_meta, topk_ids, topk_weights + + def _fused_experts( + self, + in_dtype: torch.dtype, + a1q: torch.Tensor, + a1q_scale: torch.Tensor | None, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + local_num_experts: int, + expert_map: torch.Tensor | None, + apply_router_weight_on_input: bool, + expert_tokens_meta: ExpertTokensMetadata | None, + ) -> torch.Tensor: + _, M_full, N, K, top_k = self.fused_experts.moe_problem_size( + a1q, w1, w2, topk_ids + ) + + num_chunks, CHUNK_SIZE = self._chunk_info(M_full) + + def input_chunk_range(chunk_idx: int) -> tuple[int, int]: + if num_chunks == 1: + # Use a1q.size(0) here since batched format does not + # keep M in the first dimension. 
+ return 0, a1q.size(0) + else: + s = chunk_idx * CHUNK_SIZE + e = min(s + CHUNK_SIZE, M_full) + return s, e + + # This happens when none of the tokens from the all2all reach this + # EP rank. Also, note that this is only relevant for CUDAGraph + # incompatible all2all kernels like the DeepEP high-throughput + # kernels. CUDAGraph compatible all2all kernels like the pplx + # kernels and the DeepEP low-latency kernels are always batched + # and can never run into the tensor.numel() == 0 case. + if M_full == 0: + assert num_chunks == 0 + workspace13 = None + workspace2 = None + fused_out = torch.empty_like(a1q, dtype=in_dtype) + else: + assert num_chunks > 0 + workspace13, workspace2, fused_out = self._allocate_buffers( + in_dtype, + a1q.device, + CHUNK_SIZE, + M_full, + N, + K, + top_k, + global_num_experts, + local_num_experts, + expert_tokens_meta, + ) + + for chunk_idx in range(num_chunks): + s, e = input_chunk_range(chunk_idx) + + c_expert_tokens_meta = self._slice_expert_tokens_metadata( + num_chunks, + expert_tokens_meta, + topk_ids[s:e], + local_num_experts, + expert_map, + ) + + c_fused_out = self._slice_output_tensor( + fused_out, chunk_idx, num_chunks, CHUNK_SIZE, M_full + ) + + self.fused_experts.apply( + output=c_fused_out, + hidden_states=a1q[s:e], + w1=w1, + w2=w2, + topk_weights=topk_weights[s:e], + topk_ids=topk_ids[s:e], + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + a1q_scale=_slice_scales(a1q_scale, s, e), + a2_scale=_slice_scales(self.fused_experts.a2_scale, s, e), + workspace13=workspace13, + workspace2=workspace2, + expert_tokens_meta=c_expert_tokens_meta, + apply_router_weight_on_input=apply_router_weight_on_input, + ) + + return fused_out + + def _finalize( + self, + output: torch.Tensor, + fused_out: torch.Tensor, + hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + 
""" + The _finalize method is a wrapper around self.prepare_finalize.finalize + that handles DBO, async and shared expert overlap. + """ + shared_output: torch.Tensor | None = None + + if not self.prepare_finalize.supports_async(): + assert not dbo_enabled() + + self.prepare_finalize.finalize( + output, + fused_out, + topk_weights, + topk_ids, + apply_router_weight_on_input, + self.fused_experts.finalize_weight_and_reduce_impl(), + ) + if self.shared_experts is not None: + shared_output = self.shared_experts(hidden_states) + else: + finalize_ret = self.prepare_finalize.finalize_async( + output, + fused_out, + topk_weights, + topk_ids, + apply_router_weight_on_input, + self.fused_experts.finalize_weight_and_reduce_impl(), + ) + + if self.shared_experts is not None: + shared_output = self.shared_experts(hidden_states) + + # TODO(lucas): refactor this in the alternative schedules followup + # currently unpack if we have hook + receiver pair or just + # receiver (see finalize_async docstring) + hook, receiver = ( + finalize_ret + if isinstance(finalize_ret, tuple) + else (None, finalize_ret) + ) + + if hook is not None: + if dbo_enabled(): + # If DBO is being used, register the hook with the ubatch + # context and call it in dbo_maybe_run_recv_hook instead of + # passing it to the receiver. 
+ dbo_register_recv_hook(hook) + dbo_yield() + else: + hook() + + receiver() + + if self.shared_experts is None: + return output + else: + assert shared_output is not None + return shared_output, output + + def forward( + self, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + inplace: bool = False, + activation: str = "silu", + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + """ + This function computes a Mixture of Experts (MoE) layer using two sets + of weights, w1 and w2, and top-k gating mechanism. + + Parameters: + - hidden_states: (torch.Tensor): The input tensor to the MoE layer. + - w1 (torch.Tensor): The first set of expert weights. + - w2 (torch.Tensor): The second set of expert weights. + - topk_weights (torch.Tensor): The topk weights applied at the end of + the layer. + - topk_ids (torch.Tensor): A map of row to expert id. + - inplace (bool): If True, perform the operation in-place. + Defaults to False. + - activation (str): The activation function to apply after the first + MoE layer. + - global_num_experts (int): The total number of experts in the global + expert space. + - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices + from the global expert space to the local expert space of the expert + parallel shard. + - apply_router_weight_on_input (bool): When true, the topk weights are + applied directly on the inputs. This is only applicable when topk is + 1. + + Returns: + - torch.Tensor: The output tensor after applying the MoE layer. 
def moe_align_block_size(
    topk_ids: torch.Tensor,
    block_size: int,
    num_experts: int,
    expert_map: torch.Tensor | None = None,
    pad_sorted_ids: bool = False,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Aligns the token distribution across experts to be compatible with block
    size for matrix multiplication.

    This wrapper only sizes and allocates the three int32 output tensors and
    then delegates to `ops.moe_align_block_size`, which fills them. When an
    `expert_map` is given, the per-block expert ids are remapped through it
    afterwards, so `num_experts` must be the global expert count in the
    expert-parallel case.

    Parameters:
    - topk_ids: A tensor of shape [total_tokens, top_k] representing the
        top-k expert indices for each token.
    - block_size: The block size used in block matrix multiplication.
    - num_experts: The total number of experts.
    - expert_map: A tensor of shape [num_experts] that maps the expert index
        from the global space to the local index space of the current
        expert parallel shard. If the expert is not in the current expert
        parallel shard, the mapping is set to -1.
    - pad_sorted_ids: A flag indicating whether the sorted_token_ids length
        should be padded to a multiple of block_size.

    Returns:
    - sorted_token_ids: A tensor containing the sorted token indices according
        to their allocated expert.
    - expert_ids: A tensor indicating the assigned expert index for each block.
    - num_tokens_post_padded: The total number of tokens after padding,
        ensuring divisibility by block_size.

    The sorted-ids capacity is the worst case in which every expert needs
    `block_size - 1` padding entries.
    """
    # All outputs are int32 tensors living next to topk_ids.
    out_kwargs = {"dtype": torch.int32, "device": topk_ids.device}

    capacity = topk_ids.numel() + num_experts * (block_size - 1)
    if pad_sorted_ids:
        capacity = round_up(capacity, block_size)
    num_blocks = triton.cdiv(capacity, block_size)

    sorted_ids = torch.empty((capacity,), **out_kwargs)
    expert_ids = torch.empty((num_blocks,), **out_kwargs)
    num_tokens_post_pad = torch.empty(1, **out_kwargs)

    ops.moe_align_block_size(
        topk_ids, num_experts, block_size, sorted_ids, expert_ids, num_tokens_post_pad
    )

    if expert_map is None:
        return sorted_ids, expert_ids, num_tokens_post_pad
    # Translate global expert ids into this shard's local ids.
    return sorted_ids, expert_map[expert_ids], num_tokens_post_pad
def batched_moe_align_block_size(
    max_tokens_per_batch: int, block_size: int, expert_num_tokens: torch.Tensor
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Batched-format counterpart of `moe_align_block_size`.

    Allocates sorted_token_ids, expert_ids and num_tokens_post_pad (all
    int32, on the device of `expert_num_tokens`) and lets
    `ops.batched_moe_align_block_size` fill them. The outputs are intended
    to carry the same semantics as those of `moe_align_block_size`.

    Parameters:
    - max_tokens_per_batch (int): Number of tokens in each batch (both
        valid and invalid).
    - block_size (int): block_size to align the data to.
    - expert_num_tokens (torch.Tensor): expert_num_tokens[i], indicates
        the number of valid tokens in batch i.

    Returns:
    - sorted_token_ids (torch.Tensor): Tensor of size
        num_batches * round_up(max_tokens_per_batch, block_size) holding the
        token indices for each block.
    - expert_ids (torch.Tensor): Tensor with one entry per block
        (the size above divided by block_size) naming the expert to use
        for that block.
    - num_tokens_post_pad (torch.Tensor): Tensor of size 1 indicating the
        number of valid blocks with actual data to process, expressed in
        terms of num tokens.
    """
    num_batches = expert_num_tokens.size(0)
    out_kwargs = {"dtype": torch.int32, "device": expert_num_tokens.device}

    # Round up so each batch can be split to blocks evenly.
    padded_len = num_batches * round_up(max_tokens_per_batch, block_size)
    num_blocks, leftover = divmod(padded_len, block_size)
    assert leftover == 0

    sorted_ids = torch.empty((padded_len,), **out_kwargs)
    expert_ids = torch.empty((num_blocks,), **out_kwargs)
    num_tokens_post_pad = torch.empty(1, **out_kwargs)

    ops.batched_moe_align_block_size(
        max_tokens_per_batch,
        block_size,
        expert_num_tokens,
        sorted_ids,
        expert_ids,
        num_tokens_post_pad,
    )

    return sorted_ids, expert_ids, num_tokens_post_pad
+ + def searchsorted( + sorted_sequence: torch.Tensor, values_to_search: torch.Tensor + ) -> torch.Tensor: + return (sorted_sequence.unsqueeze(1) == values_to_search).sum(dim=1) + + bin_edges = torch.linspace(min, max, max - min + 1, dtype=input.dtype).to( + input.device + ) + return searchsorted(bin_edges, input).to(torch.int32) + + +def fused_moe( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + global_num_experts: int, + expert_map: torch.Tensor = None, + renormalize: bool = False, +) -> torch.Tensor: + """ + Args: + hidden_states: [*, hidden_size] + w1: [num_experts, intermediate_size * 2, hidden_size] + w2: [num_experts, hidden_size, intermediate_size] + gating_output: [*, num_experts] + """ + assert expert_map is None, "expert_map is not supported for pallas MoE." + import torch_xla.experimental.custom_kernel # noqa: F401 + + orig_shape = hidden_states.shape + hidden_size = hidden_states.shape[-1] + num_tokens = hidden_states.shape[:-1].numel() + num_experts = w1.shape[0] + intermediate_size = w2.shape[-1] + device = hidden_states.device + dtype = hidden_states.dtype + assert (num_tokens * topk) % 16 == 0, ( + "The Pallas GMM kernel requires num_tokens * topk to be a multiple of " + f"16 but got {num_tokens * topk}" + ) + + hidden_states = hidden_states.view(num_tokens, hidden_size) + gating_output = gating_output.view(num_tokens, num_experts) + topk_weights = gating_output.softmax(dim=-1, dtype=torch.float) + topk_weights, topk_indices = topk_weights.topk(topk, dim=-1) + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + topk_weights = topk_weights.to(dtype) + + topk_indices = topk_indices.flatten() + topk_argsort_indices = topk_indices.argsort() + topk_argsort_revert_indices = topk_argsort_indices.argsort() + token_indices = torch.arange(num_tokens, device=device).repeat_interleave(topk) + token_indices = token_indices[topk_argsort_indices] + 
def _moe_permute(
    curr_hidden_states: torch.Tensor,
    a1q_scale: torch.Tensor | None,
    curr_topk_ids: torch.Tensor,
    global_num_experts: int,
    expert_map: torch.Tensor | None,
    block_m: int,
) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Compute the block-aligned token ordering for this chunk and gather the
    hidden states (and, when given, their scales) into that order.

    Args:
        curr_hidden_states: activations of the chunk; dim 0 is the token dim.
        a1q_scale: optional scales, indexed by the same rows as the
            activations when present.
        curr_topk_ids: selected expert ids; dim 1 is the top-k dim.
        global_num_experts: forwarded to `moe_align_block_size`.
        expert_map: forwarded to `moe_align_block_size`.
        block_m: block size used for alignment and for expanding the
            per-block expert ids.

    Returns:
        Tuple of (permuted hidden states, permuted scales or None,
        clamped sorted token ids, per-row expert ids, inverse permutation).
    """
    n_rows = curr_hidden_states.size(0)
    top_k = curr_topk_ids.size(1)
    n_routed = top_k * n_rows

    # The third return value of the alignment helper is not needed here.
    sorted_token_ids, block_expert_ids, _ = moe_align_block_size(
        curr_topk_ids, block_m, global_num_experts, expert_map, pad_sorted_ids=True
    )

    # Inverse permutation is taken from the unclamped ids and cut down to the
    # number of (token, expert) pairs.
    inv_perm = torch.argsort(sorted_token_ids)[:n_routed]

    # One expert id per block -> one expert id per row of the block.
    expert_ids = torch.repeat_interleave(block_expert_ids, block_m, dim=0)

    # Ids above the last valid flat index are clamped to it, so every entry
    # indexes an existing row (presumably these are padding slots - verify
    # against moe_align_block_size).
    sorted_token_ids = sorted_token_ids.clamp(max=n_routed - 1)

    # Flat (token, k) index -> source token row.
    source_rows = sorted_token_ids // top_k

    curr_hidden_states = _fp8_perm(curr_hidden_states, source_rows)
    if a1q_scale is not None:
        a1q_scale = a1q_scale[source_rows]

    return (curr_hidden_states, a1q_scale, sorted_token_ids, expert_ids, inv_perm)
+ Parameters: + - hidden_states (torch.Tensor): The input tensor to the MoE layer. + - a1q_scale (Optional[torch.Tensor]): quant scale for hidden_states + - topk_ids (torch.Tensor): topk expert route id for each token. + - n_expert (int): The number of expert. + - n_local_expert (int): The number of expert in current EP rank. + - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices + from the global expert space to the local expert space of the expert + parallel shard. + - align_block_size (Optional[int]): align group gemm block size for deepgemm + - fill_invalid_expert(int): fill expert id in m_indices for invalid expert + to workaround DeepGemm unsupported -1 in m_indices + - permuted_hidden_states (Optional[torch.Tensor]): Optional output tensor. + If None, the output tensor will be created in this function. + Returns: + - permuted_hidden_states (torch.Tensor): permuted activation. + - a1q_scale (Optional[torch.Tensor]): permuted quant scale for hidden_states + if original scale not per-tensor scaling + - expert_first_token_offset (torch.Tensor): offset of the first token + of each expert for standard grouped gemm. if enable 'align_block_size' + expert_first_token_offset will align up to 'align_block_size'. + - inv_permuted_idx (torch.Tensor): idx map for moe_unpermute. + - permuted_idx (torch.Tensor): idx map from hidden to permuted_hidden. 
+ - m_indices: m_indices for grouped gemm in deepgemm,`m_indices[i]` records + the group which the j-th row of the LHS belong to.` + """ + n_token, n_hidden = hidden_states.size() + topk = topk_ids.size(1) + assert (n_hidden * hidden_states.element_size()) % 16 == 0, ( + "permue kernel need hidden dim align to 16B" + ) + permuted_row_size = n_token * topk + if align_block_size is not None: + permuted_row_size = ( + ( + permuted_row_size + + n_expert * (align_block_size - 1) + + align_block_size + - 1 + ) + // align_block_size + * align_block_size + ) + if n_local_expert == -1: + n_local_expert = n_expert + if permuted_hidden_states is None: + permuted_hidden_states = torch.empty( + (permuted_row_size, n_hidden), + dtype=hidden_states.dtype, + device=hidden_states.device, + ) + assert permuted_hidden_states.size() == (permuted_row_size, n_hidden), ( + f"Expected permuted hidden states to be {(permuted_row_size, n_hidden)}" + f" but got {permuted_hidden_states.size()}" + ) + + token_expert_indices = torch.arange( + 0, n_token * topk, dtype=torch.int32, device=hidden_states.device + ).reshape((n_token, topk)) + + m_indices = torch.full( + (permuted_row_size,), + fill_invalid_expert, + dtype=torch.int32, + device=hidden_states.device, + ) + expert_first_token_offset = torch.empty( + n_local_expert + 1, dtype=torch.int64, device=hidden_states.device + ) + permuted_idx = torch.full( + (permuted_row_size,), + n_token * topk, + dtype=torch.int32, + device=hidden_states.device, + ) + inv_permuted_idx = torch.empty( + (n_token, topk), dtype=torch.int32, device=hidden_states.device + ) + topk_ids = topk_ids.to(torch.int32) + torch.ops._moe_C.moe_permute( + hidden_states, + topk_ids, + token_expert_indices, + expert_map, + n_expert, + n_local_expert, + topk, + align_block_size, + permuted_hidden_states, + expert_first_token_offset, + inv_permuted_idx, + permuted_idx, + m_indices, + ) + + if a1q_scale is not None and a1q_scale.dim() > 1: + a1q_scale = 
a1q_scale[permuted_idx.clamp(max=n_token * topk - 1) // topk] + return ( + permuted_hidden_states, + a1q_scale, + expert_first_token_offset, + inv_permuted_idx.flatten(), + m_indices, + ) + + +def moe_unpermute( + out: torch.Tensor, + permuted_hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + inv_permuted_idx: torch.Tensor, + expert_first_token_offset: torch.Tensor | None = None, +) -> None: + """ + This function expands and permutes activation to gathering uncontinuous + tokens for each expert. + Parameters: + - out (torch.Tensor): output tensor + - permuted_hidden_states (torch.Tensor): permuted activation. + - topk_weights (torch.Tensor): topk expert route weight for each token. + - inv_permuted_idx (torch.Tensor): row idx map for moe_unpermute. + - expert_first_token_offset (Optional[torch.Tensor]): offset of the first + token of each expert for grouped gemm. + Returns: + - hidden_states (torch.Tensor): The reduced and unpermuted activation + tensor. + """ + topk = topk_weights.size(1) + n_hidden = permuted_hidden_states.size(-1) + assert (n_hidden * permuted_hidden_states.element_size()) % 16 == 0, ( + "unpermue kernel need hidden dim align to 16B" + ) + + torch.ops._moe_C.moe_unpermute( + permuted_hidden_states, + topk_weights, + inv_permuted_idx, + expert_first_token_offset, + topk, + out, + ) + + +def moe_permute_unpermute_supported(): + return torch.ops._moe_C.moe_permute_unpermute_supported() diff --git a/model_executor/layers/fused_moe/moe_torch_iterative.py b/model_executor/layers/fused_moe/moe_torch_iterative.py new file mode 100644 index 0000000..f721d00 --- /dev/null +++ b/model_executor/layers/fused_moe/moe_torch_iterative.py @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch +import torch.nn.functional as F + + +def fused_moe( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + 
global_num_experts: int, + expert_map: torch.Tensor = None, + renormalize: bool = False, +) -> torch.Tensor: + """ + Args: + hidden_states: [*, hidden_size] + w1: [num_experts, intermediate_size * 2, hidden_size] + w2: [num_experts, hidden_size, intermediate_size] + gating_output: [*, num_experts] + expert_map: [num_experts] + """ + orig_shape = hidden_states.shape + hidden_size = hidden_states.shape[-1] + num_tokens = hidden_states.shape[:-1].numel() + num_experts = w1.shape[0] + intermediate_size = w2.shape[-1] + dtype = hidden_states.dtype + + hidden_states = hidden_states.view(num_tokens, hidden_size) + gating_output = gating_output.view(num_tokens, global_num_experts) + topk_weights = gating_output.softmax(dim=-1, dtype=torch.float) + topk_weights, selected_experts = topk_weights.topk(topk, dim=-1) + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + topk_weights = topk_weights.to(dtype) + + if expert_map is not None: + selected_experts = expert_map[selected_experts] + + final_hidden_states = None + for expert_idx in range(num_experts): + expert_w1 = w1[expert_idx] + expert_w2 = w2[expert_idx] + expert_mask = selected_experts == expert_idx + expert_weights = (topk_weights * expert_mask).sum(dim=-1, keepdim=True) + x = F.linear(hidden_states, expert_w1) + gate = F.silu(x[:, :intermediate_size]) + x = x[:, intermediate_size:] * gate + x = F.linear(x, expert_w2) + current_hidden_states = x * expert_weights + if final_hidden_states is None: + final_hidden_states = current_hidden_states + else: + final_hidden_states = final_hidden_states + current_hidden_states + + return final_hidden_states.view(orig_shape) # type: ignore diff --git a/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/model_executor/layers/fused_moe/pplx_prepare_finalize.py new file mode 100644 index 0000000..2766a2c --- /dev/null +++ b/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -0,0 +1,362 @@ +# SPDX-License-Identifier: Apache-2.0 +# 
def pplx_hidden_dim_scale_bytes(
    max_num_tokens: int,
    hidden_dim: int,
    in_dtype: torch.dtype,
    quant_dtype: torch.dtype | str | None,
    per_act_token_quant: bool,
    block_shape: list[int] | None,
):
    """
    Return the per-token byte sizes of the hidden vector and of its scales,
    each passed through `round_up(..., 16)`.

    Args:
        max_num_tokens: not used by this function.
        hidden_dim: number of elements in one hidden vector.
        in_dtype: dtype of the activations when no quantization is applied.
        quant_dtype: quantized dtype, or None for the unquantized case. When
            set it must be a `torch.dtype` with an item size of one byte.
        per_act_token_quant: whether one scale is used per token.
        block_shape: quantization block shape; only `block_shape[1]` is read.

    Returns:
        Tuple ``(hidden_dim_bytes, hidden_scale_bytes)``.
    """
    # All pplx byte sizes must be 16-byte aligned.
    align = 16

    if quant_dtype is None:
        # Unquantized activations carry no scales.
        return (
            round_up(hidden_dim * in_dtype.itemsize, align),
            round_up(0, align),
        )

    assert isinstance(quant_dtype, torch.dtype)
    assert quant_dtype.itemsize == 1

    # Scales are stored as float32.
    scale_elem_bytes = torch.float32.itemsize

    if per_act_token_quant:
        # per-token (M x 1): a single scale per token.
        assert block_shape is None
        num_scales = 1
    elif block_shape is not None:
        # per-group (M x K_tiles): one scale per block along the hidden dim.
        num_scales = cdiv(hidden_dim, block_shape[1])
    else:
        # per-tensor (1 x 1)
        num_scales = 1

    return (
        round_up(hidden_dim * quant_dtype.itemsize, align),
        round_up(num_scales * scale_elem_bytes, align),
    )
super().__init__() + assert max_num_tokens > 0 + assert num_local_experts > 0 + self.a2a = a2a + self.max_num_tokens = max_num_tokens + self.num_local_experts = num_local_experts + self.num_dispatchers_ = num_dispatchers + + @property + def activation_format(self) -> mk.FusedMoEActivationFormat: + return mk.FusedMoEActivationFormat.BatchedExperts + + def max_num_tokens_per_rank(self) -> int | None: + return self.max_num_tokens + + def topk_indices_dtype(self) -> torch.dtype | None: + return torch.uint32 + + def num_dispatchers(self) -> int: + return self.num_dispatchers_ + + def output_is_reduced(self) -> bool: + return True + + def supports_async(self) -> bool: + return True + + def prepare_async( + self, + a1: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: torch.Tensor | None, + apply_router_weight_on_input: bool, + quant_config: FusedMoEQuantConfig, + ) -> tuple[Callable, mk.ReceiverType]: + num_tokens = a1.size(0) # M + hidden_dim = a1.size(-1) # K + + assert topk_ids.size(0) == num_tokens + # expert_map should be None because with expert map, -1 id is used for + # non-local token; this causes error when casting ids to the + # topk_indices_dtype() int32 + # + if expert_map is not None: + logger.warning_once( + "The PPLX backend does not support expert mapping. " + "The provided `expert_map` will be ignored." + ) + expert_map = None # noqa: F841 + + # Is this always going to be a1.device? + device = a1.device + + if apply_router_weight_on_input: + topk = topk_ids.size(1) + # TODO: this only works for topK=1, will need to update for topK>1 + assert topk == 1, ( + "apply_router_weight_on_input is only implemented for topk=1" + ) + a1 = a1 * topk_weights.to(a1.dtype) + + repeat_cols = 4 + repeat_rows = 1 if quant_config.per_act_token_quant else a1.size(0) + # TODO(bnell): always pass quant_config.a1_scale? 
+ a1q, a1q_scale = moe_kernel_quantize_input( + a1, + (None if quant_config.per_act_token_quant else quant_config.a1_scale), + quant_dtype=quant_config.quant_dtype, + per_act_token_quant=quant_config.per_act_token_quant, + block_shape=quant_config.block_shape, + ) + + _validate_scale_shape( + a1q, a1q_scale, quant_config.per_act_token_quant, quant_config.block_shape + ) + + orig_a_scale_block_shape: int | None = None + + if a1q_scale is not None: + scalar_scales = a1q_scale.numel() == 1 + + # pplx requires 2-d scales even for scalar scales + if a1q_scale.dim() <= 1: + assert scalar_scales + a1q_scale = a1q_scale.view(1, 1) + + orig_a_scale_block_shape = a1q_scale.shape[-1] + + if not quant_config.is_block_quantized: + # TODO (bnell): use group_broadcast instead? + a1q_scale = a1q_scale.repeat(repeat_rows, repeat_cols) + + assert a1q_scale is None or a1q_scale.ndim == 2, ( + f"{0 if a1q_scale is None else (a1q_scale.ndim, a1q_scale.shape)}" + ) + + expert_num_tokens = torch.empty( + self.num_local_experts, + dtype=torch.int32, + device=device, + ) + + expert_x = torch.empty( + ( + self.num_local_experts, + self.max_num_tokens * self.num_dispatchers(), + hidden_dim, + ), + dtype=a1q.dtype, + device=device, + ) + + expert_x_scale: torch.Tensor | None = None + if a1q.dtype.itemsize == 1: + if quant_config.is_per_act_token: + # (M x 1) -> (E x M x K) + final_dim = expert_x.size(2) + elif quant_config.is_per_tensor: + # (1 x 1) -> (E x 1 x 1) + final_dim = 1 + else: + # (M x K_tiles) -> (E x M x K_tiles) + assert quant_config.block_shape is not None + num_blocks = cdiv(expert_x.size(2), quant_config.block_shape[1]) + final_dim = num_blocks + + expert_x_scale_shape = ( + self.num_local_experts, + expert_x.size(1), + round_up(final_dim, 4), # round up for alignment + ) + + expert_x_scale = torch.empty( + expert_x_scale_shape, + dtype=torch.float32, + device=expert_x.device, + ) + + # This argument is optional, defaults to indices.size(0) + # There's not much point setting 
this unless it is != indices.size(0) + bound_m: torch.Tensor | None = None + + self.a2a.dispatch( + out_expert_num_tokens=expert_num_tokens, + out_expert_x=expert_x, + out_expert_x_scale=expert_x_scale, + dp_x=a1q, + dp_x_scale=a1q_scale, + indices=topk_ids, + bound_m=bound_m, + do_send=True, + do_recv=False, + ) + + hook = lambda: self.a2a.dispatch( + out_expert_num_tokens=expert_num_tokens, + out_expert_x=expert_x, + out_expert_x_scale=expert_x_scale, + dp_x=a1q, + dp_x_scale=a1q_scale, + indices=topk_ids, + bound_m=bound_m, + do_send=False, + do_recv=True, + ) + + return ( + hook, + lambda: self._receiver( + expert_num_tokens, + expert_x, + expert_x_scale, + orig_a_scale_block_shape, + ), + ) + + def _receiver( + self, + expert_num_tokens: torch.Tensor, + expert_x: torch.Tensor, + expert_x_scale: torch.Tensor | None, + orig_a_scale_block_shape: int | None, + ) -> mk.PrepareResultType: + if expert_x_scale is not None: + expert_x_scale = expert_x_scale[:, :, :orig_a_scale_block_shape] + assert expert_x_scale.ndim == 3 + + expert_tokens_meta = mk.ExpertTokensMetadata( + expert_num_tokens=expert_num_tokens, expert_num_tokens_cpu=None + ) + + return expert_x, expert_x_scale, expert_tokens_meta, None, None + + def prepare( + self, + a1: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: torch.Tensor | None, + apply_router_weight_on_input: bool, + quant_config: FusedMoEQuantConfig, + ) -> mk.PrepareResultType: + hook, receiver = self.prepare_async( + a1, + topk_weights, + topk_ids, + num_experts, + expert_map, + apply_router_weight_on_input, + quant_config, + ) + hook() + return receiver() + + def finalize_async( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> Callable: + assert isinstance(weight_and_reduce_impl, TopKWeightAndReduceDelegate), ( + 
"Weight application and reduction happens in the combine kernel." + ) + + # This argument is optional + # There's not much point setting this unless it is != topk_ids.size(0) + bound_m: torch.Tensor | None = None + + # TODO (bnell): fails in test_pplx_moe.py, figure out what's going on + # num_tokens = output.size(0) # M + # assert topk_ids.size(0) == num_tokens, ( + # f"{topk_ids.size(0)} == {num_tokens}") + assert topk_ids.size() == topk_weights.size(), ( + f"{topk_ids.size()} == {topk_weights.size()}" + ) + assert output.size(0) <= self.max_num_tokens, ( + f"{output.size(0)} <= {self.max_num_tokens}" + ) + assert output.size(1) == fused_expert_output.size(-1) + + # Set weights to 1 if we did them in dispatch. This is hacky. + if apply_router_weight_on_input: + topk_weights = torch.ones_like(topk_weights) + + topk_ids_u32 = topk_ids.view(dtype=torch.uint32) + + self.a2a.combine( + out_tokens=output, + indices=topk_ids_u32, + weights=topk_weights, + expert_y=fused_expert_output, + bound_m=bound_m, + do_send=True, + do_recv=False, + ) + + return lambda: self.a2a.combine( + out_tokens=output, + indices=topk_ids_u32, + weights=topk_weights, + expert_y=fused_expert_output, + bound_m=bound_m, + do_send=False, + do_recv=True, + ) + + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> None: + receiver = self.finalize_async( + output, + fused_expert_output, + topk_weights, + topk_ids, + apply_router_weight_on_input, + weight_and_reduce_impl, + ) + receiver() diff --git a/model_executor/layers/fused_moe/prepare_finalize.py b/model_executor/layers/fused_moe/prepare_finalize.py new file mode 100644 index 0000000..9bb976f --- /dev/null +++ b/model_executor/layers/fused_moe/prepare_finalize.py @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors 
class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
    """
    Prepare/finalize implementation without any dispatch/combine step:
    `prepare` only (optionally) applies the router weights and quantizes the
    activations, and `finalize` only applies the top-k weights and reduces.
    """

    @property
    def activation_format(self) -> mk.FusedMoEActivationFormat:
        return mk.FusedMoEActivationFormat.Standard

    def max_num_tokens_per_rank(self) -> int | None:
        return None

    def topk_indices_dtype(self) -> torch.dtype | None:
        return None

    def num_dispatchers(self) -> int:
        return 1

    def output_is_reduced(self) -> bool:
        return False

    def prepare(
        self,
        a1: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        num_experts: int,
        expert_map: torch.Tensor | None,
        apply_router_weight_on_input: bool,
        quant_config: FusedMoEQuantConfig,
    ) -> mk.PrepareResultType:
        """
        Optionally scale `a1` by the router weights, then quantize it.

        Args:
            a1: input activations. Not modified by this method.
            topk_weights: router weights; only used when
                `apply_router_weight_on_input` is True.
            topk_ids: router ids; only its top-k dimension is inspected.
            num_experts: unused here.
            expert_map: unused here.
            apply_router_weight_on_input: multiply the activations by the
                router weights before quantization (top-k must be 1).
            quant_config: supplies scale, dtype and granularity for
                `moe_kernel_quantize_input`.

        Returns:
            ``(a1q, a1q_scale, None, None, None)``.
        """
        if apply_router_weight_on_input:
            topk = topk_ids.size(1)
            # TODO: this only works for topK=1, will need to update for topK>1
            assert topk == 1, (
                "apply_router_weight_on_input is only implemented for topk=1"
            )
            # Out-of-place on purpose: the caller's tensor may still be read
            # elsewhere, so it must not be scaled in place.
            a1 = a1 * topk_weights.to(a1.dtype)

        a1q, a1q_scale = moe_kernel_quantize_input(
            a1,
            quant_config.a1_scale,
            quant_config.quant_dtype,
            quant_config.per_act_token_quant,
            quant_config.block_shape,
        )

        return a1q, a1q_scale, None, None, None

    def finalize(
        self,
        output: torch.Tensor,
        fused_expert_output: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
        weight_and_reduce_impl: mk.TopKWeightAndReduce,
    ) -> None:
        """
        Apply the top-k weights and reduce `fused_expert_output` into
        `output` via `weight_and_reduce_impl`.

        A `TopKWeightAndReduceDelegate` is replaced by
        `TopKWeightAndReduceContiguous` before it is applied.
        """
        if isinstance(weight_and_reduce_impl, TopKWeightAndReduceDelegate):
            weight_and_reduce_impl = TopKWeightAndReduceContiguous()
        weight_and_reduce_impl.apply(
            output=output,
            fused_expert_output=fused_expert_output,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            apply_router_weight_on_input=apply_router_weight_on_input,
        )
@lru_cache(maxsize=1)
def init_aiter_topK_meta_data(
    n_routed_experts: int,
    n_shared_experts: int,
    top_k: int,
    tp_rank: int,
    tp_size: int,
    shared_experts_score: float = 1.0,
    max_num_tokens: int = 32768,
    is_EP: bool = False,
) -> None:
    """
    Allocate the module-level ``aiter_topK_meta_data`` buffers and pre-fill
    the columns that belong to the shared experts.

    Two CUDA tensors of shape
    ``(max_num_tokens, top_k + n_shared_experts + is_EP)`` are created: int32
    ids and float32 weights. The first ``top_k`` columns are left
    uninitialized here; the remaining columns receive the shared-expert ids
    and ``shared_experts_score`` respectively.

    The function asserts that the global is still ``None``, so it can only
    populate the buffers once; because of ``lru_cache(maxsize=1)`` an
    immediately repeated call with the same arguments returns the cached
    ``None`` without running the body again.

    Args:
        n_routed_experts: number of routed experts; shared-expert ids start
            at this value.
        n_shared_experts: number of shared experts.
        top_k: number of leading columns left for the routed selection.
        tp_rank: first row (and row offset) that receives real shared-expert
            ids when ``is_EP`` is set.
        tp_size: row stride used together with ``tp_rank`` when ``is_EP`` is
            set.
        shared_experts_score: value written into the shared-expert weight
            columns.
        max_num_tokens: number of rows of both buffers.
        is_EP: adds one extra column (bool added as 0/1) and switches to the
            strided fill described below.
    """
    global aiter_topK_meta_data
    # First id past the routed + shared experts; used as the filler id below.
    fake_expertid = n_routed_experts + n_shared_experts

    # all layers reuse same buffer
    # This extra element when EP is enabled is used as a sentinel
    # to mask out shared expert processing for tokens not owned by
    # the current EP rank. This is necessary to avoid double-processing
    # of shared experts.
    total_topk_ids = torch.empty(
        (max_num_tokens, top_k + n_shared_experts + is_EP),
        dtype=torch.int32,
        device="cuda",
    )
    # Split into routed columns (unused in this function) and shared columns.
    ns_topk_ids, s_topk_ids = total_topk_ids.split(
        [top_k, n_shared_experts + is_EP], dim=1
    )
    # With is_EP the last entry equals fake_expertid.
    shared_expert_ids = [n_routed_experts + i for i in range(n_shared_experts + is_EP)]
    if is_EP:
        # Every row starts as all-filler; rows tp_rank, tp_rank + tp_size, ...
        # are then replaced by the real shared-expert ids. Rows are rebound,
        # not mutated, so the aliasing from list multiplication is harmless.
        s_topk_ids_list = [
            [fake_expertid] * (n_shared_experts + is_EP)
        ] * max_num_tokens
        for i in range(tp_rank, max_num_tokens, tp_size):
            s_topk_ids_list[i] = shared_expert_ids
    else:
        # Without EP every row carries all shared-expert ids.
        s_topk_ids_list = [
            list(range(n_routed_experts, fake_expertid))
        ] * max_num_tokens
    s_topk_ids[:] = torch.tensor(s_topk_ids_list, dtype=torch.int32, device="cuda")

    total_topk_weights = torch.empty(
        (max_num_tokens, top_k + n_shared_experts + is_EP),
        dtype=torch.float32,
        device="cuda",
    )
    ns_topk_weights, s_topk_weights = total_topk_weights.split(
        [top_k, n_shared_experts + is_EP], dim=1
    )
    s_topk_weights.fill_(shared_experts_score)
    assert aiter_topK_meta_data is None, "AITER topK meta data is already initialized"
    aiter_topK_meta_data = (total_topk_weights, total_topk_ids)
def rocm_aiter_fused_experts(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    activation: str = "silu",
    apply_router_weight_on_input: bool = False,
    expert_map: torch.Tensor | None = None,
    quant_config: FusedMoEQuantConfig | None = None,
) -> torch.Tensor:
    """
    Run the experts through the AITER fused-MoE kernels, selecting the kernel
    and quantization method from `quant_config`.

    Args:
        hidden_states: input activations.
        w1: first expert weight tensor.
        w2: second expert weight tensor.
        topk_weights: routing weights; cast to float32 before use.
        topk_ids: routing ids; cast to int32 before use.
        activation: "silu" selects SILU; any other value selects GELU.
        apply_router_weight_on_input: apply the routing weights in the first
            stage; requires `topk_weights` of shape (num_tokens, 1).
        expert_map: passed to the kernels as `expert_mask`.
        quant_config: quantization settings; defaults to
            `FUSED_MOE_UNQUANTIZED_CONFIG`.

    Returns:
        The tensor returned by the selected AITER kernel.
    """
    if quant_config is None:
        quant_config = FUSED_MOE_UNQUANTIZED_CONFIG

    # NOTE(review): every activation other than "silu" is mapped to GELU
    # without validation - confirm callers never pass anything else.
    activation_method = (
        ActivationMethod.SILU if activation == "silu" else ActivationMethod.GELU
    )
    # All AITER Fused MoE kernels are expecting the following datatypes
    topk_weights = topk_weights.to(torch.float32)
    topk_ids = topk_ids.to(torch.int32)

    expert_mask = expert_map

    # w8a8 per-channel quantization
    if (
        quant_config.per_act_token_quant
        and apply_router_weight_on_input
        and quant_config.use_fp8_w8a8
    ):
        # AITER tkw1 kernel for FP8 models with `apply_router_weight_on_input`
        # This applies topk_weights on the GEMM output of the first FC layer
        # rather than the second FC.
        assert topk_weights.dim() == 2, (
            "`topk_weights` should be in shape (num_tokens, topk)"
        )
        assert topk_weights.shape[-1] == 1, (
            "Only support topk=1 when `apply_router_weight_on_input` is True"
        )

        return rocm_aiter_ops.asm_moe_tkw1(
            hidden_states,
            w1,
            w2,
            topk_weights,
            topk_ids,
            fc1_scale=quant_config.w1_scale,
            fc2_scale=quant_config.w2_scale,
            fc1_smooth_scale=None,
            fc2_smooth_scale=None,
            a16=False,
            per_tensor_quant_scale=None,
            expert_mask=expert_mask,
            activation_method=activation_method,
        )

    quant_method = QuantMethod.NO.value
    # quark moe for mxfp4 w_dtype
    if quant_config.use_mxfp4_w4a16:
        quant_method = QuantMethod.BLOCK_1X32.value
    # w8a8 block-scaled
    if quant_config.block_shape is not None and quant_config.use_fp8_w8a8:
        # Adjacent literals instead of a backslash continuation, which would
        # embed the source indentation in the message.
        assert not apply_router_weight_on_input, (
            "apply_router_weight_on_input is "
            "not supported for block scaled moe"
        )
        assert quant_config.w1_scale is not None
        assert quant_config.w2_scale is not None
        quant_method = QuantMethod.BLOCK_128x128.value
    elif quant_config.use_fp8_w8a8 and quant_config.per_out_ch_quant:
        quant_method = QuantMethod.PER_TOKEN.value
    elif quant_config.use_fp8_w8a8:
        # Currently only per tensor quantization method is enabled.
        quant_method = QuantMethod.PER_TENSOR.value

    if apply_router_weight_on_input:
        assert topk_weights.dim() == 2, (
            "`topk_weights` should be in shape (num_tokens, topk)"
        )
        _, topk = topk_weights.shape
        assert topk == 1, (
            "Only support topk=1 when `apply_router_weight_on_input` is True"
        )

    return rocm_aiter_ops.fused_moe(
        hidden_states,
        w1,
        w2,
        topk_weights,
        topk_ids,
        expert_mask=expert_mask,
        quant_method=quant_method,
        activation_method=activation_method,
        w1_scale=quant_config.w1_scale,
        w2_scale=quant_config.w2_scale,
        a1_scale=quant_config.a1_scale,
        a2_scale=quant_config.a2_scale,
        doweight_stage1=apply_router_weight_on_input,
    )
+ + Args: + hidden_states: Input hidden states [num_tokens, hidden_size] + router_logits: Router logits [num_tokens, num_experts] + top_k: Number of experts to select per token + indices_type: Data type for expert indices + + Returns: + tuple of (topk_weights, topk_ids) + """ + pass + + +class DistributionBasedRouting(RoutingStrategy): + """ + Distribution-based random routing strategy with configurable distributions. + + This routing strategy randomly selects experts for each token based on + different probability distributions. Currently supports uniform and normal + distributions for testing different routing patterns. + """ + + def __init__(self, distribution: str = "uniform", **distribution_params: Any): + """ + Initialize distribution-based routing. + + Args: + distribution: Type of distribution to use for sampling + - "uniform": Uniform distribution (default) + - "normal": Normal/Gaussian distribution + **distribution_params: Parameters specific to the + chosen distribution + For "uniform": No additional parameters needed + For "normal": mean (default: 0.0), std (default: 1.0) + """ + self.distribution = distribution.lower() + self.distribution_params = distribution_params + + # Validate distribution and parameters + self._validate_distribution_params() + + def _validate_distribution_params(self): + """Validate distribution type and parameters.""" + valid_distributions = ["uniform", "normal"] + + if self.distribution not in valid_distributions: + raise ValueError( + f"Unsupported distribution: {self.distribution}. 
" + f"Supported distributions: {valid_distributions}" + ) + + # Set default parameters if not provided + if self.distribution == "normal": + self.distribution_params.setdefault("mean", 0.0) + self.distribution_params.setdefault("std", 1.0) + + def route_tokens( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + indices_type: torch.dtype | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Randomly select experts for each token using the specified distribution. + + Args: + hidden_states: Input hidden states [num_tokens, hidden_size] + router_logits: Router logits [num_tokens, num_experts] + top_k: Number of experts to select per token + indices_type: Data type for expert indices + + Returns: + tuple of (topk_weights, topk_ids) where: + - topk_weights: Weights based on distribution sampling + - topk_ids: Expert indices sampled from the distribution + """ + num_tokens = hidden_states.shape[0] + num_experts = router_logits.shape[-1] + + if indices_type is None: + indices_type = torch.long + + # Generate expert IDs based on the specified distribution + topk_ids = self._sample_expert_ids( + num_tokens, num_experts, top_k, hidden_states.device, indices_type + ) + + # Generate weights based on the distribution + topk_weights = self._generate_weights(num_tokens, top_k, hidden_states.device) + + return topk_weights, topk_ids + + def _sample_expert_ids( + self, + num_tokens: int, + num_experts: int, + top_k: int, + device: torch.device, + indices_type: torch.dtype, + ) -> torch.Tensor: + """Sample expert IDs based on the specified distribution.""" + + if self.distribution == "uniform": + # Uniform random sampling + return torch.randint( + low=0, + high=num_experts, + size=(num_tokens, top_k), + dtype=indices_type, + device=device, + ) + + elif self.distribution == "normal": + # For normal distribution, sample continuous values and map to + # expert IDs + continuous_samples = self._sample_continuous_distribution( + num_tokens, 
top_k, device + ) + + # Map continuous samples to expert indices + # Normalize to [0, 1] range and scale to [0, num_experts) + normalized_samples = self._normalize_samples(continuous_samples) + expert_ids = (normalized_samples * num_experts).long() + expert_ids = torch.clamp(expert_ids, 0, num_experts - 1) + + return expert_ids.to(dtype=indices_type) + + else: + raise ValueError(f"Unsupported distribution: {self.distribution}") + + def _sample_continuous_distribution( + self, num_tokens: int, top_k: int, device: torch.device + ) -> torch.Tensor: + """Sample from continuous distributions.""" + shape = (num_tokens, top_k) + + if self.distribution == "normal": + mean = self.distribution_params["mean"] + std = self.distribution_params["std"] + return torch.normal(mean, std, size=shape, device=device) + + else: + raise ValueError( + f"Unsupported continuous distribution: {self.distribution}" + ) + + def _normalize_samples(self, samples: torch.Tensor) -> torch.Tensor: + """Normalize samples to [0, 1] range.""" + if self.distribution == "normal": + # Use sigmoid to map normal distribution to [0, 1] + return torch.sigmoid(samples) + + else: + raise ValueError( + f"Unsupported distribution for normalization: {self.distribution}" + ) + + def _generate_weights( + self, num_tokens: int, top_k: int, device: torch.device + ) -> torch.Tensor: + """Generate weights based on the distribution.""" + if self.distribution == "uniform": + # All-ones weights for uniform distribution + return torch.ones( + (num_tokens, top_k), + dtype=torch.float32, + device=device, + ) + + elif self.distribution == "normal": + # For normal distribution, generate weights from the same + # distribution + continuous_weights = self._sample_continuous_distribution( + num_tokens, top_k, device + ) + # Normalize to positive values and sum to 1 + weights = torch.abs(continuous_weights) + weights = weights / weights.sum(dim=-1, keepdim=True) + return weights + + else: + raise ValueError( + f"Unsupported 
distribution for weight generation: {self.distribution}" + ) + + def get_distribution_info(self) -> dict: + """Get information about the current distribution configuration.""" + return { + "distribution": self.distribution, + "parameters": self.distribution_params.copy(), + } + + +class RoutingSimulator: + """ + Token-to-Expert Routing Simulator. + + This class provides a framework for testing and comparing different + routing strategies for MoE models. It can simulate routing behavior + and collect statistics for analysis. + """ + + # Class-level registry of routing strategies + _routing_strategies: dict[str, RoutingStrategy] = { + # Basic routing strategies + "uniform_random": DistributionBasedRouting( + distribution="uniform", mean=0.0, std=1.0 + ), + "normal_routing": DistributionBasedRouting( + distribution="normal", mean=0.0, std=1.0 + ), + } + + @classmethod + def register_strategy(cls, name: str, strategy: RoutingStrategy): + """ + Register a custom routing strategy. + + Args: + name: Name of the strategy + strategy: RoutingStrategy instance + """ + cls._routing_strategies[name] = strategy + + @classmethod + def get_available_strategies(cls) -> list[str]: + """ + Get list of available routing strategy names. + + Returns: + List of available strategy names + """ + return list(cls._routing_strategies.keys()) + + @staticmethod + def simulate_routing( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + strategy_name: str, + top_k: int, + indices_type: torch.dtype | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Simulate token-to-expert routing using the specified strategy. 
+ + Args: + hidden_states: Input hidden states [num_tokens, hidden_size] + router_logits: Router logits [num_tokens, num_experts] + strategy_name: Name of the routing strategy to use + top_k: Number of experts to select per token + indices_type: Data type for expert indices + + Returns: + tuple of (topk_weights, topk_ids) + """ + if strategy_name not in RoutingSimulator._routing_strategies: + raise ValueError( + f"Unknown routing strategy: {strategy_name}. " + f"Available strategies: " + f"{list(RoutingSimulator._routing_strategies.keys())}" + ) + logger.warning_once( + "Simulating MoE routing using a %s strategy. " + "This should only be used for performance testing. " + "Model outputs will not be valid.", + strategy_name, + ) + + strategy = RoutingSimulator._routing_strategies[strategy_name] + return strategy.route_tokens( + hidden_states=hidden_states, + router_logits=router_logits, + top_k=top_k, + indices_type=indices_type, + ) diff --git a/model_executor/layers/fused_moe/shared_fused_moe.py b/model_executor/layers/fused_moe/shared_fused_moe.py new file mode 100644 index 0000000..6ec8b33 --- /dev/null +++ b/model_executor/layers/fused_moe/shared_fused_moe.py @@ -0,0 +1,97 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +from vllm.distributed import ( + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_reduce, +) +from vllm.model_executor.layers.fused_moe.layer import FusedMoE + + +# TODO(bnell): Add shared + fused combo function? e.g. + +class SharedFusedMoE(FusedMoE): + """ + A FusedMoE operation that also computes the results of shared experts. + If an all2all communicator is being used the shared expert computation + can be interleaved with the fused all2all dispatch communication step. 
class SharedFusedMoE(FusedMoE):
    """
    A FusedMoE operation that also computes the results of shared experts.
    If an all2all communicator is being used the shared expert computation
    can be interleaved with the fused all2all dispatch communication step.
    """

    def __init__(
        self,
        shared_experts: torch.nn.Module | None,
        gate: torch.nn.Module | None = None,
        use_overlapped: bool = True,
        **kwargs,
    ):
        """
        Args:
            shared_experts: Module computing the shared-expert output, or None.
            gate: Optional router module; only exposed through ``gate`` when
                overlapping is active.
            use_overlapped: Request overlapped shared-expert execution. The
                request is dropped under the conditions listed below.
            **kwargs: Forwarded unchanged to ``FusedMoE.__init__``.
        """
        super().__init__(**kwargs)
        self._shared_experts = shared_experts

        # Shared expert overlap is turned off when any of these hold:
        # - eplb is in use, because of correctness issues
        #   TODO(wentao): find the root cause and remove this condition
        # - flashinfer is used together with DP, since there is nothing to gain
        # - marlin kernels are in use
        # - there are no shared experts at all
        self.use_overlapped = (
            use_overlapped
            and not self.enable_eplb
            and not (
                self.moe_config.use_flashinfer_cutlass_kernels and self.dp_size > 1
            )
            and not self.use_marlin_kernels
            and self._shared_experts is not None
        )

        self._gate = gate

    @property
    def shared_experts(self) -> torch.nn.Module | None:
        """Shared-expert module, visible only while overlapping is active."""
        if self.use_overlapped:
            return self._shared_experts
        return None

    @property
    def gate(self) -> torch.nn.Module | None:
        """Router module, visible only while overlapping is active."""
        if self.use_overlapped:
            return self._gate
        return None

    @property
    def is_internal_router(self) -> bool:
        """True when ``gate`` is exposed (i.e. it is not None)."""
        return self.gate is not None

    def forward(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Compute the shared-expert output and the fused MoE output.

        Returns:
            ``(shared_out, fused_out)``; ``shared_out`` is None when there are
            no shared experts on the non-overlapped path.
        """
        if self.use_overlapped:
            # The parent forward yields both outputs on this path.
            shared_out, fused_out = super().forward(
                hidden_states=hidden_states,
                router_logits=router_logits,
            )
            # ensure early TP reduction of shared expert outputs when required
            if (
                shared_out is not None
                and self.reduce_results
                and get_tensor_model_parallel_world_size() > 1
                and self.must_reduce_shared_expert_outputs()
            ):
                shared_out = tensor_model_parallel_all_reduce(shared_out)
            return shared_out, fused_out

        # Non-overlapped path: run the shared experts first, then the parent.
        shared_out = None
        if self._shared_experts is not None:
            shared_out = self._shared_experts(hidden_states)

            # Reduce shared expert outputs if necessary, since the MLP
            # should have been created with reduce_results=False.
            if (
                self.reduce_results
                and get_tensor_model_parallel_world_size() > 1
                and self.must_reduce_shared_expert_outputs()
            ):
                shared_out = tensor_model_parallel_all_reduce(shared_out)

        fused_out = super().forward(
            hidden_states=hidden_states,
            router_logits=router_logits,
        )
        return shared_out, fused_out
class TopKWeightAndReduceDelegate(mk.TopKWeightAndReduce):
    """
    Useful in the case when some FusedMoEPermuteExpertsUnpermute
    implementation does not perform weight application and reduction
    but cannot address the needs of all the compatible PrepareAndFinalize
    implementations.
    For example, BatchedTritonExperts is compatible with both
    PplxPrepareAndFinalize and BatchedPrepareAndFinalize. PplxPrepareAndFinalize
    does the weight-application + reduction as part of the pplx combine kernel.
    But the BatchedPrepareAndFinalize needs an implementation. To facilitate
    this case, the BatchedTritonExperts could use TopKWeightAndReduceDelegate
    so the PrepareAndFinalize implementations could choose how to
    weight + reduce.
    """

    def __eq__(self, other):
        return isinstance(other, TopKWeightAndReduceDelegate)

    def __hash__(self):
        # Defining __eq__ alone would set __hash__ to None; all instances
        # compare equal, so they share one hash.
        return hash(TopKWeightAndReduceDelegate)

    def apply(
        self,
        output: torch.Tensor | None,
        fused_expert_output: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
    ) -> torch.Tensor:
        """Always raises: the delegate is a placeholder, not an implementation."""
        raise RuntimeError(
            "The caller is expected to choose an appropriate "
            "TopKWeightAndReduce implementation."
        )


class TopKWeightAndReduceNoOP(mk.TopKWeightAndReduce):
    """
    The fused_experts outputs have already been weight applied and reduced.
    This implementation is a no-op.
    """

    def __eq__(self, other):
        return isinstance(other, TopKWeightAndReduceNoOP)

    def __hash__(self):
        # Keep instances hashable and consistent with __eq__.
        return hash(TopKWeightAndReduceNoOP)

    def apply(
        self,
        output: torch.Tensor | None,
        fused_expert_output: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
    ) -> torch.Tensor:
        """
        Return ``fused_expert_output``, copied into ``output`` when one is given.

        ``topk_weights``, ``topk_ids`` and ``apply_router_weight_on_input``
        are not used.
        """
        # Weight application and reduction operations are already done.
        if output is None:
            return fused_expert_output

        # MoEPrepareAndFinalizeNoEP needs the output to be in the `output`
        # tensor.
        assert output.size() == fused_expert_output.size(), (
            "output shape is expected to match the fused_expert_output shape. "
            f"But got output={output.size()}, "
            f"fused_expert_output={fused_expert_output.size()}"
        )
        output.copy_(fused_expert_output, non_blocking=True)
        return output


class TopKWeightAndReduceContiguous(mk.TopKWeightAndReduce):
    """
    TopKWeightAndReduce implementation for a fused_experts output
    of shape (m, topk, K)
    """

    def __eq__(self, other):
        return isinstance(other, TopKWeightAndReduceContiguous)

    def __hash__(self):
        # Keep instances hashable and consistent with __eq__.
        return hash(TopKWeightAndReduceContiguous)

    def apply(
        self,
        output: torch.Tensor | None,
        fused_expert_output: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
    ) -> torch.Tensor:
        """
        Weight each token's top-k expert outputs and sum them into ``output``.

        ``fused_expert_output`` is scaled in place unless the router weights
        were already applied on the input. A 2-D input is viewed as
        (m, topk, K) first. ``output`` is allocated as (m, K) when None.
        """
        m, num_topk = topk_ids.size()
        k = fused_expert_output.size(-1)
        if fused_expert_output.ndim == 2:
            fused_expert_output = fused_expert_output.view(m, num_topk, k)

        assert fused_expert_output.size() == (m, num_topk, k), (
            f"Expected fused_expert_output size {(m, num_topk, k)}. But got "
            f"{fused_expert_output.size()}"
        )

        if not apply_router_weight_on_input:
            fused_expert_output.mul_(topk_weights.view(m, -1, 1))

        if output is None:
            output = torch.empty(
                (m, k),
                device=fused_expert_output.device,
                dtype=fused_expert_output.dtype,
            )
        assert output.size() == (m, k), (
            f"Expected output size {(m, k)}. But got {output.size()}"
        )

        ops.moe_sum(fused_expert_output, output)
        return output


class TopKWeightAndReduceNaiveBatched(mk.TopKWeightAndReduce):
    """
    TopKWeightAndReduce implementation for a fused_experts output
    of shape (num_experts, batch_size, K)
    """

    def __init__(self, rank: int):
        # rank selects which contiguous slice of global expert ids is local:
        # [rank * num_local_experts, (rank + 1) * num_local_experts).
        self.rank = rank

    def __eq__(self, other):
        return isinstance(other, TopKWeightAndReduceNaiveBatched) and (
            other.rank == self.rank
        )

    def __hash__(self):
        # Equal instances share a rank, so hash on it.
        return hash((TopKWeightAndReduceNaiveBatched, self.rank))

    def apply(
        self,
        output: torch.Tensor | None,
        fused_expert_output: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
    ) -> torch.Tensor:
        """
        Accumulate per-expert batched outputs back into per-token rows.

        ``output`` is zeroed (or allocated as zeros) and then, for each local
        expert, the rows produced for that expert are weighted in place and
        added to the tokens that selected it.
        """
        assert fused_expert_output.ndim == 3
        num_tokens = topk_ids.size(0)
        num_local_experts = fused_expert_output.size(0)
        K = fused_expert_output.size(-1)

        if output is None:
            output = torch.zeros(
                (num_tokens, K),
                device=fused_expert_output.device,
                dtype=fused_expert_output.dtype,
            )
        else:
            output.fill_(0)

        assert output.size() == (num_tokens, K), (
            f"Expected output size {(num_tokens, K)}, but got {output.size()}"
        )

        first_expert = num_local_experts * self.rank
        last_expert = first_expert + num_local_experts

        for expert_id in range(first_expert, last_expert):
            matching_tokens = topk_ids == expert_id
            # Boolean mask over tokens that routed to this expert.
            topks = torch.any(matching_tokens, dim=1).flatten()
            rows = torch.count_nonzero(topks)
            # Only the first `rows` entries of the expert's batch are used.
            rhs = fused_expert_output[expert_id - first_expert, :rows, :]
            if not apply_router_weight_on_input:
                rhs.mul_(topk_weights[matching_tokens].view(rhs.size(0), 1))
            output[topks] = output[topks] + rhs

        return output
class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
    """
    Experts implementation that dispatches to DeepGemm when it is allowed and
    applicable, and to the Triton experts otherwise.
    """

    def __init__(
        self,
        quant_config: FusedMoEQuantConfig,
        allow_deep_gemm: bool = False,
    ):
        """
        Args:
            quant_config: Quantization config shared by both backends.
            allow_deep_gemm: Request DeepGemm; honoured only for fp8 w8a8 with
                a block shape equal to the DeepGemm contiguous-layout alignment.
        """
        super().__init__(quant_config)

        # The Triton backend is always built; it is the fallback.
        self.triton_expert = TritonExperts(quant_config)

        deep_gemm_usable = (
            allow_deep_gemm
            and self.quant_config.use_fp8_w8a8
            and self.block_shape == get_mk_alignment_for_contiguous_layout()
        )
        self.allow_deep_gemm = deep_gemm_usable

        if self.allow_deep_gemm:
            self.deep_gemm_expert = DeepGemmExperts(self.quant_config)
        else:
            self.deep_gemm_expert = None

    @property
    def activation_formats(
        self,
    ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
        """Activation formats of the Triton backend (DeepGemm must agree)."""
        assert (
            self.deep_gemm_expert is None
            or self.triton_expert.activation_formats
            == self.deep_gemm_expert.activation_formats
        )
        return self.triton_expert.activation_formats

    def supports_chunking(self) -> bool:
        """True only if every constructed backend supports chunking."""
        for backend in (self.deep_gemm_expert, self.triton_expert):
            if backend is not None and not backend.supports_chunking():
                return False
        return True

    def supports_expert_map(self) -> bool:
        """True only if every constructed backend supports expert maps."""
        for backend in (self.deep_gemm_expert, self.triton_expert):
            if backend is not None and not backend.supports_expert_map():
                return False
        return True

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        """
        Return the weight-and-reduce implementation, preferring DeepGemm's.

        When both backends provide one they are asserted to be equal.
        """
        deep_gemm = self.deep_gemm_expert
        triton = self.triton_expert

        deep_gemm_impl = None
        if deep_gemm:
            deep_gemm_impl = deep_gemm.finalize_weight_and_reduce_impl()
        triton_impl = None
        if triton:
            triton_impl = triton.finalize_weight_and_reduce_impl()

        if deep_gemm_impl is not None and triton_impl is not None:
            assert deep_gemm_impl == triton_impl, (
                "Both implementations should agree on WeightAndReduce impls. "
                f"Got dge_war: {deep_gemm_impl}, and te_war: {triton_impl}"
            )

        if deep_gemm_impl is not None:
            return deep_gemm_impl

        assert triton_impl is not None
        return triton_impl

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        """Workspace shapes of whichever backend the problem shape selects."""
        # Note: the deep gemm workspaces are strictly larger than the triton
        # workspaces so we can be pessimistic here and allocate for DeepGemm
        # even if we fall back to triton later, e.g. if expert maps are set.
        size_for_deep_gemm = self.allow_deep_gemm and (
            is_deep_gemm_e8m0_used() or _valid_deep_gemm_shape(M, N, K)
        )
        if size_for_deep_gemm:
            assert self.deep_gemm_expert is not None
            backend = self.deep_gemm_expert
        else:
            backend = self.triton_expert

        return backend.workspace_shapes(
            M,
            N,
            K,
            topk,
            global_num_experts,
            local_num_experts,
            expert_tokens_meta,
        )

    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: str,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ):
        """Run the selected backend; all arguments are forwarded unchanged."""
        run_deep_gemm = self.allow_deep_gemm and (
            is_deep_gemm_e8m0_used() or _valid_deep_gemm(hidden_states, w1, w2)
        )

        if run_deep_gemm:
            backend = self.deep_gemm_expert
        else:
            backend = self.triton_expert
        assert backend is not None

        backend.apply(
            output,
            hidden_states,
            w1,
            w2,
            topk_weights,
            topk_ids,
            activation,
            global_num_experts,
            expert_map,
            a1q_scale,
            a2_scale,
            workspace13,
            workspace2,
            expert_tokens_meta,
            apply_router_weight_on_input,
        )
class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute):
    """Experts implementation backed by flashinfer's TRT-LLM routed FP4 MoE."""

    def __init__(
        self,
        moe: FusedMoEConfig,
        quant_config: FusedMoEQuantConfig,
        gemm1_alpha,
        gemm1_beta,
        gemm1_clamp_limit,
        max_capture_size,
    ):
        """Store the MoE config and the values later forwarded to the kernel."""
        super().__init__(quant_config)
        self.moe = moe
        self.gemm1_alpha = gemm1_alpha
        self.gemm1_beta = gemm1_beta
        self.gemm1_clamp_limit = gemm1_clamp_limit
        self.max_capture_size = max_capture_size

    @property
    def activation_formats(
        self,
    ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
        """Standard activation format on both sides."""
        standard = mk.FusedMoEActivationFormat.Standard
        return (standard, standard)

    def supports_chunking(self) -> bool:
        return True

    def supports_expert_map(self) -> bool:
        return True

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        """No-op reducer; ``apply`` asks the kernel to finalize the output."""
        return TopKWeightAndReduceNoOP()

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        """Return empty workspaces and an (M, K) output shape."""
        # The workspaces for this implementation are managed by flashinfer.
        return ((0,), (0,), (M, K))

    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: str,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ):
        """
        Call ``trtllm_fp4_block_scale_routed_moe`` and return ``output``.

        ``activation``, ``expert_map``, ``a2_scale``, both workspaces,
        ``expert_tokens_meta`` and ``apply_router_weight_on_input`` are not
        read here.
        """
        num_local = w1.size(0)
        top_k = topk_ids.size(-1)
        inter_size = w2.size(1)
        # Global id of this rank's first expert.
        first_local_expert = self.moe.ep_rank * num_local

        act = hidden_states
        act_scale = a1q_scale
        if act_scale is not None:
            # Reinterpret the scale bytes as fp8 and match the activation's
            # leading dimensions.
            act_scale = act_scale.view(torch.float8_e4m3fn).reshape(
                *act.shape[:-1], -1
            )

        # One int32 per (token, slot): expert id in the upper 16 bits, the
        # bfloat16 bit pattern of the weight in the lower 16.
        # NOTE(review): the int16 view is signed, so a weight with its sign
        # bit set would sign-extend into the id bits - confirm the weights
        # are never negative.
        ids_hi = topk_ids.to(torch.int32) << 16
        weights_lo = topk_weights.to(torch.bfloat16).view(torch.int16)
        packed_routing = ids_hi | weights_lo

        assert self.w1_scale is not None
        assert self.w2_scale is not None
        call_args = dict(
            topk_ids=packed_routing,
            routing_bias=None,
            hidden_states=act,
            hidden_states_scale=act_scale,
            gemm1_weights=w1,
            gemm1_weights_scale=self.w1_scale,
            gemm1_bias=self.w1_bias,
            gemm1_alpha=self.gemm1_alpha,
            gemm1_beta=self.gemm1_beta,
            gemm1_clamp_limit=self.gemm1_clamp_limit,
            gemm2_weights=w2,
            gemm2_weights_scale=self.w2_scale,
            gemm2_bias=self.w2_bias,
            output1_scale_scalar=None,
            output1_scale_gate_scalar=None,
            output2_scale_scalar=None,
            num_experts=global_num_experts,
            top_k=top_k,
            n_group=None,
            topk_group=None,
            intermediate_size=inter_size,
            local_expert_offset=first_local_expert,
            local_num_experts=num_local,
            routed_scaling_factor=None,
            tile_tokens_dim=None,
            routing_method_type=1,
            do_finalize=True,
            output=output,
            tune_max_num_tokens=max(self.max_capture_size, 1),
        )

        from flashinfer import trtllm_fp4_block_scale_routed_moe

        from vllm.utils.flashinfer import autotune

        with autotune(False):
            # Enable autotune when,
            # https://github.com/flashinfer-ai/flashinfer/issues/2023 is
            # resolved.
            trtllm_fp4_block_scale_routed_moe(**call_args)

        return output
@CustomOp.register("unquantized_fused_moe")
class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
    """MoE method without quantization."""

    def __init__(self, moe: FusedMoEConfig):
        """Detect which optional backends (ROCm AITER, FlashInfer CUTLASS) apply."""
        super().__init__(moe)

        self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
        if self.rocm_aiter_moe_enabled:
            # Imported lazily so the module is only needed when enabled.
            from .rocm_aiter_fused_moe import rocm_aiter_fused_experts

            self.rocm_aiter_fused_experts = rocm_aiter_fused_experts
        else:
            self.rocm_aiter_fused_experts = None  # type: ignore

        # FlashInfer CUTLASS MoE is only supported on Hopper and later GPUs
        self.flashinfer_cutlass_moe_enabled = (
            has_flashinfer_cutlass_fused_moe()
            and envs.VLLM_USE_FLASHINFER_MOE_FP16
            and self.moe.moe_parallel_config.use_ep
            and self.moe.moe_parallel_config.dp_size == 1
            and current_platform.get_device_capability()[0] >= 9
        )
        if self.flashinfer_cutlass_moe_enabled:
            logger.info_once(
                "Enabling FlashInfer CUTLASS MoE for UnquantizedFusedMoEMethod"
            )
            from functools import partial

            from .flashinfer_cutlass_moe import flashinfer_cutlass_moe

            # Pre-bind the parallel layout so call sites only pass tensors.
            self.flashinfer_cutlass_moe = partial(
                flashinfer_cutlass_moe,
                quant_config=FUSED_MOE_UNQUANTIZED_CONFIG,
                tp_rank=self.moe.moe_parallel_config.tp_rank,
                tp_size=self.moe.moe_parallel_config.tp_size,
                ep_rank=self.moe.moe_parallel_config.ep_rank,
                ep_size=self.moe.moe_parallel_config.ep_size,
            )
        else:
            # NOTE(review): this hint is emitted whenever EP is on with
            # dp_size == 1, including when flashinfer is missing or the
            # device capability is below 9 - confirm that is intended.
            if (
                self.moe.moe_parallel_config.use_ep
                and self.moe.moe_parallel_config.dp_size == 1
            ):
                logger.info_once(
                    "FlashInfer CUTLASS MoE is available for EP"
                    " but not enabled, consider setting"
                    " VLLM_USE_FLASHINFER_MOE_FP16=1 to enable it.",
                    scope="local",
                )
            elif self.moe.moe_parallel_config.dp_size > 1:
                logger.info_once(
                    "FlashInfer CUTLASS MoE is currently not available for DP.",
                    scope="local",
                )
            self.flashinfer_cutlass_moe = None  # type: ignore

    @property
    def supports_eplb(self) -> bool:
        return True

    @property
    def allow_inplace(self) -> bool:
        return True

    def maybe_make_prepare_finalize(self) -> FusedMoEPrepareAndFinalize | None:
        """Return None when ROCm AITER MoE is enabled, else defer to the base."""
        if self.rocm_aiter_moe_enabled:
            return None
        else:
            return super().maybe_make_prepare_finalize()

    def select_gemm_impl(
        self,
        prepare_finalize: FusedMoEPrepareAndFinalize,
        layer: torch.nn.Module,
    ) -> FusedMoEPermuteExpertsUnpermute:
        """Pick batched or standard Triton experts from the activation format."""
        assert self.moe_quant_config is not None
        if (
            prepare_finalize.activation_format
            == FusedMoEActivationFormat.BatchedExperts
        ):
            logger.debug("BatchedTritonExperts %s", self.moe)
            return BatchedTritonExperts(
                max_num_tokens=self.moe.max_num_tokens,
                num_dispatchers=prepare_finalize.num_dispatchers(),
                quant_config=self.moe_quant_config,
            )
        else:
            logger.debug("TritonExperts %s", self.moe)
            return TritonExperts(self.moe_quant_config)

    def create_weights(
        self,
        layer: torch.nn.Module,
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        """
        Register ``w13_weight`` and ``w2_weight`` (plus ``w13_bias`` and
        ``w2_bias`` when the MoE has bias) on ``layer``.

        Weights are allocated uninitialized, biases as zeros; none require
        grad. ``extra_weight_attrs`` is attached to every parameter.
        """
        # The fused w13 projection is twice as tall for act-and-mul MoEs.
        if self.moe.is_act_and_mul:
            w13_up_dim = 2 * intermediate_size_per_partition
        else:
            w13_up_dim = intermediate_size_per_partition
        # Fused gate_up_proj (column parallel)
        w13_weight = torch.nn.Parameter(
            torch.empty(
                num_experts,
                w13_up_dim,
                hidden_size,
                dtype=params_dtype,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_weight", w13_weight)
        set_weight_attrs(w13_weight, extra_weight_attrs)
        if self.moe.has_bias:
            w13_bias = torch.nn.Parameter(
                torch.zeros(num_experts, w13_up_dim, dtype=params_dtype),
                requires_grad=False,
            )
            layer.register_parameter("w13_bias", w13_bias)
            set_weight_attrs(w13_bias, extra_weight_attrs)
        # down_proj (row parallel)
        w2_weight = torch.nn.Parameter(
            torch.empty(
                num_experts,
                hidden_size,
                intermediate_size_per_partition,
                dtype=params_dtype,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w2_weight", w2_weight)
        set_weight_attrs(w2_weight, extra_weight_attrs)
        if self.moe.has_bias:
            w2_bias = torch.nn.Parameter(
                torch.zeros(num_experts, hidden_size, dtype=params_dtype),
                requires_grad=False,
            )
            layer.register_parameter("w2_bias", w2_bias)
            set_weight_attrs(w2_bias, extra_weight_attrs)

    def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor:
        """Return ``weight``, re-laid-out with padded rows on ROCm when enabled."""
        # Pad the weight tensor. This is an optimization on ROCm platform, which
        # can benefit from tensors located far enough from one another in memory
        if (
            envs.VLLM_ROCM_MOE_PADDING
            and current_platform.is_rocm()
            and weight.stride(-1) == 1
            and (weight.stride(-2) * weight.element_size()) % 512 == 0
        ):
            # 256 bytes of padding, expressed in elements.
            num_pad = 256 // weight.element_size()
            # Pad then slice the padding off again: the logical shape is
            # unchanged but the result is a view into the padded storage.
            weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
            torch.cuda.empty_cache()

        return weight

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Apply the platform-specific weight layout changes after loading."""
        super().process_weights_after_loading(layer)

        # Padding the weight for better performance on ROCm
        layer.w13_weight.data = self._maybe_pad_weight(layer.w13_weight.data)
        layer.w2_weight.data = self._maybe_pad_weight(layer.w2_weight.data)

        if self.rocm_aiter_moe_enabled:
            shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
                layer.w13_weight.data, layer.w2_weight.data
            )

            layer.w13_weight.data = shuffled_w13
            layer.w2_weight.data = shuffled_w2

        if self.flashinfer_cutlass_moe_enabled:
            # Swap halves to arrange as [w3; w1] (kernel expectation)
            w1_w, w3_w = torch.chunk(layer.w13_weight.data, 2, dim=1)
            w13_weight_swapped = torch.cat([w3_w, w1_w], dim=1)
            layer.w13_weight.data = w13_weight_swapped.contiguous()

        if current_platform.is_xpu():
            import intel_extension_for_pytorch as ipex

            # Global id of the first expert owned by this EP rank.
            ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts
            layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
                layer.w13_weight,
                layer.w2_weight,
                use_prepack=True,
                experts_start_id=ep_rank_start,
            )
        elif current_platform.is_cpu():
            from vllm.model_executor.layers.fused_moe import cpu_fused_moe

            if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
                from vllm.model_executor.layers.utils import check_cpu_sgl_kernel

                dtype_w13 = layer.w13_weight.dtype
                _, n_w13, k_w13 = layer.w13_weight.size()
                dtype_w2 = layer.w2_weight.dtype
                _, n_w2, k_w2 = layer.w2_weight.size()
                # The SGL kernel is used only if enabled and both weights
                # pass its shape/dtype check.
                if (
                    envs.VLLM_CPU_SGL_KERNEL
                    and check_cpu_sgl_kernel(n_w13, k_w13, dtype_w13)
                    and check_cpu_sgl_kernel(n_w2, k_w2, dtype_w2)
                ):
                    # Packed weights are copied back in place, so the
                    # parameters keep their shape.
                    packed_w13_weight = torch.ops._C.convert_weight_packed(
                        layer.w13_weight
                    )
                    assert packed_w13_weight.size() == layer.w13_weight.size()
                    layer.w13_weight.copy_(packed_w13_weight)
                    del packed_w13_weight
                    packed_w2_weight = torch.ops._C.convert_weight_packed(
                        layer.w2_weight
                    )
                    assert packed_w2_weight.size() == layer.w2_weight.size()
                    layer.w2_weight.copy_(packed_w2_weight)
                    layer.cpu_fused_moe = cpu_fused_moe.SGLFusedMOE(layer)
                else:
                    layer.cpu_fused_moe = cpu_fused_moe.IPEXFusedMOE(layer)
            else:
                layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
        renormalize: bool,
        use_grouped_topk: bool = False,
        topk_group: int | None = None,
        num_expert_group: int | None = None,
        global_num_experts: int = -1,
        expert_map: torch.Tensor | None = None,
        custom_routing_function: Callable | None = None,
        scoring_func: str = "softmax",
        routed_scaling_factor: float = 1.0,
        e_score_correction_bias: torch.Tensor | None = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
        enable_eplb: bool = False,
        expert_load_view: torch.Tensor | None = None,
        logical_to_physical_map: torch.Tensor | None = None,
        logical_replica_count: torch.Tensor | None = None,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        """
        Check the EPLB arguments and forward everything to ``self.forward``.

        With ``enable_eplb`` set, ``expert_load_view``,
        ``logical_to_physical_map`` and ``logical_replica_count`` must all
        be provided.
        """
        if enable_eplb:
            assert expert_load_view is not None
            assert logical_to_physical_map is not None
            assert logical_replica_count is not None

        return self.forward(
            x=x,
            layer=layer,
            router_logits=router_logits,
            top_k=top_k,
            renormalize=renormalize,
            use_grouped_topk=use_grouped_topk,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
            global_num_experts=global_num_experts,
            expert_map=expert_map,
            custom_routing_function=custom_routing_function,
            scoring_func=scoring_func,
            routed_scaling_factor=routed_scaling_factor,
            e_score_correction_bias=e_score_correction_bias,
            activation=activation,
            apply_router_weight_on_input=apply_router_weight_on_input,
            enable_eplb=enable_eplb,
            expert_load_view=expert_load_view,
            logical_to_physical_map=logical_to_physical_map,
            logical_replica_count=logical_replica_count,
        )

    def get_fused_moe_quant_config(
        self, layer: torch.nn.Module
    ) -> FusedMoEQuantConfig | None:
        """Return a biased config built from the layer's biases, or the shared
        unquantized config when the MoE has no bias."""
        if self.moe.has_bias:
            return biased_moe_quant_config(
                layer.w13_bias,
                layer.w2_bias,
            )
        else:
            return FUSED_MOE_UNQUANTIZED_CONFIG
use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, + enable_eplb=enable_eplb, + expert_map=expert_map, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + global_num_experts=global_num_experts, + zero_expert_num=zero_expert_num, + zero_expert_type=zero_expert_type, + num_fused_shared_experts=layer.num_fused_shared_experts, + ) + + if self.rocm_aiter_moe_enabled: + result = self.rocm_aiter_fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + expert_map=expert_map, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + ) + elif self.flashinfer_cutlass_moe_enabled: + return self.flashinfer_cutlass_moe( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + ) + else: + result = fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=False, + activation=activation, + quant_config=self.moe_quant_config, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=expert_map, + ) + + if zero_expert_num != 0 and zero_expert_type is not None: + assert not isinstance(result, tuple), ( + "Shared + zero experts are mutually exclusive not yet supported" + ) + return result, zero_expert_result + else: + return result + + def forward_cpu( + self, + layer: torch.nn.Module, + x: torch.Tensor, + 
use_grouped_topk: bool, + top_k: int, + router_logits: torch.Tensor, + renormalize: bool, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if ( + enable_eplb is not False + or expert_load_view is not None + or logical_to_physical_map is not None + or logical_replica_count is not None + ): + raise NotImplementedError("Expert load balancing is not supported for CPU.") + return layer.cpu_fused_moe( + layer, + x, + use_grouped_topk, + top_k, + router_logits, + renormalize, + topk_group, + num_expert_group, + global_num_experts, + expert_map, + custom_routing_function, + scoring_func, + routed_scaling_factor, + e_score_correction_bias, + apply_router_weight_on_input, + activation, + ) + + def forward_xpu( + self, + layer: torch.nn.Module, + x: torch.Tensor, + use_grouped_topk: bool, + top_k: int, + router_logits: torch.Tensor, + renormalize: bool, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + 
logical_replica_count: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if ( + enable_eplb is not False + or expert_load_view is not None + or logical_to_physical_map is not None + or logical_replica_count is not None + ): + raise NotImplementedError("Expert load balancing is not supported for XPU.") + return layer.ipex_fusion( + x, + use_grouped_topk, + top_k, + router_logits, + renormalize, + topk_group, + num_expert_group, + custom_routing_function=custom_routing_function, + ) + + def forward_tpu( + self, + layer: torch.nn.Module, + x: torch.Tensor, + use_grouped_topk: bool, + top_k: int, + router_logits: torch.Tensor, + renormalize: bool, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + assert not use_grouped_topk + assert num_expert_group is None + assert topk_group is None + assert custom_routing_function is None + assert apply_router_weight_on_input is False + if scoring_func != "softmax": + raise NotImplementedError( + "Only softmax scoring function is supported for TPU." + ) + if e_score_correction_bias is not None: + raise NotImplementedError( + "Expert score correction bias is not supported for TPU." + ) + assert activation == "silu", f"{activation} is not supported for TPU." + assert routed_scaling_factor == 1.0, ( + f"routed_scaling_factor {routed_scaling_factor} is not supported for TPU." 
+ ) + if ( + enable_eplb is not False + or expert_load_view is not None + or logical_to_physical_map is not None + or logical_replica_count is not None + ): + raise NotImplementedError("Expert load balancing is not supported for TPU.") + return fused_moe_pallas( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk=top_k, + gating_output=router_logits, + global_num_experts=global_num_experts, + expert_map=expert_map, + renormalize=renormalize, + ) + + if current_platform.is_tpu(): + forward_native = forward_tpu + elif current_platform.is_cpu(): + forward_native = forward_cpu + elif current_platform.is_xpu(): + forward_native = forward_xpu + else: + forward_native = forward_cuda diff --git a/model_executor/layers/fused_moe/utils.py b/model_executor/layers/fused_moe/utils.py new file mode 100644 index 0000000..1f946d6 --- /dev/null +++ b/model_executor/layers/fused_moe/utils.py @@ -0,0 +1,332 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import functools +from math import prod + +import torch + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + per_token_group_quant_fp8, +) +from vllm.model_executor.layers.quantization.utils.int8_utils import ( + per_token_group_quant_int8, + per_token_quant_int8, +) +from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( + quant_dequant_mxfp4, +) +from vllm.model_executor.layers.quantization.utils.mxfp6_utils import ( + quant_dequant_mxfp6, +) +from vllm.model_executor.layers.quantization.utils.mxfp8_utils import ( + mxfp8_e4m3_quantize, +) +from vllm.triton_utils import tl, triton +from vllm.utils.flashinfer import flashinfer_fp4_quantize +from vllm.utils.math_utils import cdiv +from vllm.utils.torch_utils import is_torch_equal_or_newer + + +@triton.jit +def _count_expert_num_tokens( + topk_ids_ptr, + expert_num_tokens_ptr, + num_experts, + topk_numel, + expert_map, + 
HAS_EXPERT_MAP: tl.constexpr, + BLOCK_SIZE: tl.constexpr, +): + curr_expert = tl.program_id(0) + + offsets = tl.arange(0, BLOCK_SIZE) + topk_ids_ptrs = topk_ids_ptr + offsets + + acc = tl.zeros((BLOCK_SIZE,), dtype=tl.int32) + for x in range(tl.cdiv(topk_numel, BLOCK_SIZE)): + mask = offsets < (topk_numel - x * BLOCK_SIZE) + expert_ids = tl.load(topk_ids_ptrs, mask=mask, other=-1) + if HAS_EXPERT_MAP: + expert_map_ptrs = expert_map + expert_ids + expert_map_mask = expert_ids >= 0 + expert_ids = tl.load(expert_map_ptrs, mask=expert_map_mask, other=-1) + + has_curr_expert = tl.where(expert_ids == curr_expert, 1, 0) + acc = acc + has_curr_expert + topk_ids_ptrs += BLOCK_SIZE + + if curr_expert < num_experts: + tl.store(expert_num_tokens_ptr + curr_expert, tl.sum(acc)) + + +def count_expert_num_tokens( + topk_ids: torch.Tensor, num_local_experts: int, expert_map: torch.Tensor | None +) -> torch.Tensor: + """ + Count the number to tokens assigned to each expert. + + Parameters: + - topk_ids (torch.Tensor): Tensor mapping each token to its + list of experts. + - num_local_experts (int): Number of experts in this rank. + - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices + from the global expert space to the local expert space of the expert + parallel shard. + + Returns: + A tensor of size num_local_experts, where tensor[i] holds the number + of tokens assigned to the ith expert. 
+ """ + assert topk_ids.dtype.is_signed, "The kernel uses -1 to represent invalid topk_ids" + expert_num_tokens = torch.empty( + (num_local_experts), device=topk_ids.device, dtype=torch.int32 + ) + + grid = num_local_experts + BLOCK_SIZE = min(topk_ids.numel(), 1024) + BLOCK_SIZE = triton.next_power_of_2(BLOCK_SIZE) + + _count_expert_num_tokens[(grid,)]( + topk_ids, + expert_num_tokens, + num_local_experts, + topk_ids.numel(), + expert_map, + HAS_EXPERT_MAP=expert_map is not None, + BLOCK_SIZE=BLOCK_SIZE, + ) + + return expert_num_tokens + + +def _resize_cache(x: torch.Tensor, v: tuple[int, ...]) -> torch.Tensor: + """ + Shrink the given tensor and apply the given view to it. This is + used to resize the intermediate fused_moe caches. + """ + assert prod(v) <= x.numel(), ( + f"{v} ({prod(v)}) <= {x.shape} ({x.numel()})" + ) # CUDAGRAPH unfriendly? + return x.flatten()[: prod(v)].view(*v) + + +def _nvfp4_quantize( + A: torch.Tensor, + A_scale: torch.Tensor | None, + is_sf_swizzled_layout: bool, +) -> tuple[torch.Tensor, torch.Tensor]: + return flashinfer_fp4_quantize( + A, A_scale, is_sf_swizzled_layout=is_sf_swizzled_layout + ) + + +def _fp8_quantize( + A: torch.Tensor, + A_scale: torch.Tensor | None, + per_act_token: bool, + block_shape: list[int] | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Perform fp8 quantization on the inputs. If a block_shape + is provided, the output will be blocked. 
+ """ + if block_shape is None: + # TODO(luka): use QuantFP8 custom op + # https://github.com/vllm-project/vllm/issues/20711 + A, A_scale = ops.scaled_fp8_quant( + A, A_scale, use_per_token_if_dynamic=per_act_token + ) + else: + assert not per_act_token + assert len(block_shape) == 2 + _, block_k = block_shape[0], block_shape[1] + A, A_scale = per_token_group_quant_fp8(A, block_k) + assert cdiv(A.size(-1), block_k) == A_scale.size(-1) + + return A, A_scale + + +def _int8_quantize( + A: torch.Tensor, + A_scale: torch.Tensor | None, + per_act_token: bool, + block_shape: list[int] | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Perform int8 quantization on the inputs. If a block_shape + is provided, the output will be blocked. + """ + + # If weights are per-channel (per_channel_quant=True), then + # activations apply per-token quantization. Otherwise, assume + # activation tensor-wise fp8/int8 quantization, dynamic or static + if block_shape is None: + assert per_act_token, "int8 quantization only supports block or channel-wise" + A, A_scale = per_token_quant_int8(A) + else: + assert not per_act_token + assert len(block_shape) == 2 + _, block_k = block_shape[0], block_shape[1] + A, A_scale = per_token_group_quant_int8(A, block_k) + assert cdiv(A.size(-1), block_k) == A_scale.size(-1) + + return A, A_scale + + +def _mxfp4_quantize( + A: torch.Tensor, + A_scale: torch.Tensor | None, + per_act_token_quant: bool, + block_shape: list[int] | None = None, +) -> tuple[torch.Tensor, None]: + assert block_shape is None + # TODO: native mxfp4 is currently not integrated in vllm, + # so simulating even on devices supporting this data type natively. + # Once integrated, `current_platform.supports_mx()` should be used to + # control quantize+dequantize, or simply quantize here down to mxfp4. 
+ A = quant_dequant_mxfp4(A) + + return A, None + + +def _mxfp8_e4m3_quantize( + A: torch.Tensor, + A_scale: torch.Tensor | None, + per_act_token_quant: bool, + block_shape: list[int] | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + assert A_scale is None + assert not per_act_token_quant + assert block_shape is None + return mxfp8_e4m3_quantize(A) + + +def _mxfp6_e3m2_quantize( + A: torch.Tensor, + A_scale: torch.Tensor | None, + per_act_token_quant: bool, + block_shape: list[int] | None = None, +) -> tuple[torch.Tensor, None]: + assert block_shape is None + + # TODO: native mxfp6 is currently not integrated in vllm, + # so simulating even on devices supporting this data type natively. + # Eventually, there should be a check based on + # `current_platform.supports_mx()` here. + A = quant_dequant_mxfp6(A, quant_dtype="fp6_e3m2") + + return A, None + + +def _mxfp6_e2m3_quantize( + A: torch.Tensor, + A_scale: torch.Tensor | None, + per_act_token_quant: bool, + block_shape: list[int] | None = None, +) -> tuple[torch.Tensor, None]: + assert block_shape is None + + # TODO: native mxfp6 is currently not integrated in vllm, + # so simulating even on devices supporting this data type natively. + # Eventually, there should be a check based on + # `current_platform.supports_mx()` here. 
+ A = quant_dequant_mxfp6(A, quant_dtype="fp6_e2m3") + + return A, None + + +def moe_kernel_quantize_input( + A: torch.Tensor, + A_scale: torch.Tensor | None, + quant_dtype: None | torch.dtype | str, + per_act_token_quant: bool, + block_shape: list[int] | None = None, + is_fp4_scale_swizzled: bool = True, +) -> tuple[torch.Tensor, torch.Tensor | None]: + if quant_dtype == torch.float8_e4m3fn: + return _fp8_quantize(A, A_scale, per_act_token_quant, block_shape) + elif quant_dtype == torch.int8: + return _int8_quantize(A, A_scale, per_act_token_quant, block_shape) + elif quant_dtype == "nvfp4": + return _nvfp4_quantize(A, A_scale, is_sf_swizzled_layout=is_fp4_scale_swizzled) + elif quant_dtype == "mxfp4": + return _mxfp4_quantize(A, A_scale, per_act_token_quant, block_shape) + elif quant_dtype == "mxfp8": + # TODO: `quant_dtype == "mxfp8"` is ambiguous, + # should be fp8_e4m3. OCP MX also defines `fp8_e5m2`. + return _mxfp8_e4m3_quantize(A, A_scale, per_act_token_quant, block_shape) + elif quant_dtype == "mxfp6_e3m2": + return _mxfp6_e3m2_quantize(A, A_scale, per_act_token_quant, block_shape) + elif quant_dtype == "mxfp6_e2m3": + return _mxfp6_e2m3_quantize(A, A_scale, per_act_token_quant, block_shape) + else: + return A, A_scale + + +def _fp8_perm(m: torch.Tensor, idx: torch.Tensor) -> torch.Tensor: + """ + A permutation routine that works on fp8 types. + """ + if torch.is_floating_point(m) and m.dtype.itemsize == 1: + return m.view(dtype=torch.uint8)[idx, ...].view(dtype=m.dtype) + else: + return m[idx, ...] 
+ + +def normalize_scales_shape(scales: torch.Tensor | None) -> torch.Tensor | None: + if scales is not None: + if scales.numel() == 1: + scales = scales.view(1, 1) + else: + scales = scales.view(-1, scales.size(-1)) + return scales + + +def normalize_batched_scales_shape( + scales: torch.Tensor | None, + num_experts: int, +) -> torch.Tensor | None: + if scales is not None and scales.ndim < 3: + if scales.numel() == 1: + scales = scales.view(1) + scales = torch.repeat_interleave(scales, num_experts, dim=0).view( + num_experts, 1, 1 + ) + else: + scales = scales.view(num_experts, -1, scales.size(-1)) + + return scales + + +def _validate_scale_shape( + a: torch.Tensor, + a_scale: torch.Tensor | None, + per_act_token_quant: bool, + block_shape: list[int] | None, +) -> None: + if a_scale is None: + return + + if not per_act_token_quant and block_shape is None: + assert a_scale.numel() == 1, f"{a_scale.shape}" + elif per_act_token_quant: + assert a_scale.shape[0] == a.shape[0] and a_scale.shape[1] == 1, ( + f"{a_scale.shape[0]} == {a.shape[0]} and {a_scale.shape[1]} == 1" + ) + else: + assert block_shape is not None + expected = (a.shape[0], cdiv(a.shape[1], block_shape[1])) + assert a_scale.shape == expected, f"{a_scale.shape} == {expected}" + + +def activation_without_mul(activation: str) -> str: + return activation + "_no_mul" + + +# Torch custom ops can't deal with outputs aliasing inputs so we need to +# disable inplace for torch >= 2.9. 
+# See https://github.com/vllm-project/vllm/issues/26378 +@functools.cache +def disable_inplace() -> bool: + return is_torch_equal_or_newer("2.9") diff --git a/model_executor/layers/kda.py b/model_executor/layers/kda.py new file mode 100644 index 0000000..2e7500b --- /dev/null +++ b/model_executor/layers/kda.py @@ -0,0 +1,448 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch +from einops import rearrange +from torch import nn + +from vllm.attention import AttentionBackend +from vllm.attention.backends.abstract import AttentionMetadata +from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config +from vllm.distributed import ( + divide, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) +from vllm.forward_context import ForwardContext, get_forward_context +from vllm.logger import init_logger +from vllm.model_executor.model_loader.weight_utils import sharded_weight_loader +from vllm.model_executor.utils import set_weight_attrs +from vllm.utils.torch_utils import direct_register_custom_op +from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata + +from .fla.ops.kda import ( + FusedRMSNormGated, + chunk_kda, + fused_kda_gate, + fused_recurrent_kda, +) +from .linear import ( + ColumnParallelLinear, + ReplicatedLinear, + RowParallelLinear, +) +from .mamba.abstract import MambaBase +from .mamba.mamba_utils import MambaStateDtypeCalculator, MambaStateShapeCalculator +from .mamba.ops.causal_conv1d import causal_conv1d_fn, causal_conv1d_update +from .quantization.base_config import QuantizationConfig + +logger = init_logger(__name__) + + +def kda_attention( + q_proj_states: torch.Tensor, + k_proj_states: torch.Tensor, + v_proj_states: torch.Tensor, + g1: torch.Tensor, + beta: torch.Tensor, + core_attn_out: torch.Tensor, + layer_name: str, +) -> None: + forward_context: ForwardContext = get_forward_context() + self = 
forward_context.no_compile_layers[layer_name] + self._forward( + q_proj_states=q_proj_states, + k_proj_states=k_proj_states, + v_proj_states=v_proj_states, + g1=g1, + beta=beta, + core_attn_out=core_attn_out, + ) + + +def kda_attention_fake( + q_proj_states: torch.Tensor, + k_proj_states: torch.Tensor, + v_proj_states: torch.Tensor, + g1: torch.Tensor, + beta: torch.Tensor, + core_attn_out: torch.Tensor, + layer_name: str, +) -> None: + return + + +direct_register_custom_op( + op_name="kda_attention", + op_func=kda_attention, + mutates_args=["core_attn_out"], + fake_impl=kda_attention_fake, +) + + +class KimiDeltaAttention(nn.Module, MambaBase): + @property + def mamba_type(self) -> str: + return "linear_attention" + + def get_attn_backend(self) -> type["AttentionBackend"]: + from vllm.v1.attention.backends.gdn_attn import GDNAttentionBackend + + return GDNAttentionBackend + + def get_state_dtype( + self, + ) -> tuple[torch.dtype, torch.dtype, torch.dtype, torch.dtype]: + if self.model_config is None or self.cache_config is None: + raise ValueError("model_config and cache_config must be set") + return MambaStateDtypeCalculator.kda_state_dtype( + self.model_config.dtype, self.cache_config.mamba_cache_dtype + ) + + def get_state_shape( + self, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], tuple[int, ...]]: + return MambaStateShapeCalculator.kda_state_shape( + self.tp_size, self.num_heads, self.head_dim, conv_kernel_size=self.conv_size + ) + + def __init__( + self, + layer_idx: int, + hidden_size: int, + quant_config: QuantizationConfig | None = None, + cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, + rms_norm_eps: float = 1e-5, + prefix: str = "", + **kwargs, + ) -> None: + super().__init__() + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + self.hidden_size = hidden_size + self.model_config = model_config + self.cache_config = cache_config + if 
model_config is None: + raise ValueError("model_config must be provided") + kda_config = model_config.linear_attn_config + self.head_dim = kda_config["head_dim"] + self.num_heads = kda_config["num_heads"] + self.layer_idx = layer_idx + self.prefix = prefix + assert self.num_heads % self.tp_size == 0 + self.local_num_heads = divide(self.num_heads, self.tp_size) + + projection_size = self.head_dim * self.num_heads + self.conv_size = kda_config["short_conv_kernel_size"] + + self.q_proj = ColumnParallelLinear( + self.hidden_size, + projection_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.q_proj", + ) + self.k_proj = ColumnParallelLinear( + self.hidden_size, + projection_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.k_proj", + ) + self.v_proj = ColumnParallelLinear( + self.hidden_size, + projection_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.v_proj", + ) + + self.f_a_proj = ReplicatedLinear( + self.hidden_size, + self.head_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.f_a_proj", + ) + + self.f_b_proj = ColumnParallelLinear( + self.head_dim, + projection_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.f_b_proj", + ) + self.dt_bias = nn.Parameter( + torch.empty(divide(projection_size, self.tp_size), dtype=torch.float32) + ) + + set_weight_attrs(self.dt_bias, {"weight_loader": sharded_weight_loader(0)}) + + self.b_proj = ColumnParallelLinear( + self.hidden_size, + self.num_heads, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.b_proj", + ) + + self.q_conv1d = ColumnParallelLinear( + input_size=self.conv_size, + output_size=projection_size, + bias=False, + params_dtype=torch.float32, + prefix=f"{prefix}.q_conv1d", + ) + self.k_conv1d = ColumnParallelLinear( + input_size=self.conv_size, + output_size=projection_size, + bias=False, + params_dtype=torch.float32, + prefix=f"{prefix}.k_conv1d", + ) + self.v_conv1d = ColumnParallelLinear( + 
input_size=self.conv_size, + output_size=projection_size, + bias=False, + params_dtype=torch.float32, + prefix=f"{prefix}.v_conv1d", + ) + # unsqueeze to fit conv1d weights shape into the linear weights shape. + # Can't do this in `weight_loader` since it already exists in + # `ColumnParallelLinear` and `set_weight_attrs` + # doesn't allow to override it + self.q_conv1d.weight.data = self.q_conv1d.weight.data.unsqueeze(1) + self.k_conv1d.weight.data = self.k_conv1d.weight.data.unsqueeze(1) + self.v_conv1d.weight.data = self.v_conv1d.weight.data.unsqueeze(1) + + self.A_log = nn.Parameter( + torch.empty(1, 1, self.local_num_heads, 1, dtype=torch.float32) + ) + set_weight_attrs(self.A_log, {"weight_loader": sharded_weight_loader(2)}) + + self.g_a_proj = ReplicatedLinear( + self.hidden_size, + self.head_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.g_a_proj", + ) + self.g_b_proj = ColumnParallelLinear( + self.head_dim, + projection_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.g_b_proj", + ) + self.o_norm = FusedRMSNormGated( + self.head_dim, eps=rms_norm_eps, activation="sigmoid" + ) + self.o_proj = RowParallelLinear( + projection_size, + self.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + + def forward( + self, + hidden_states: torch.Tensor, + positions: torch.Tensor, + output: torch.Tensor, + ) -> None: + num_tokens = hidden_states.size(0) + q = self.q_proj(hidden_states)[0] + k = self.k_proj(hidden_states)[0] + v = self.v_proj(hidden_states)[0] + + beta = self.b_proj(hidden_states)[0].float().sigmoid() + g1 = self.f_b_proj(self.f_a_proj(hidden_states)[0])[0] + g1 = fused_kda_gate(g1, self.A_log, self.head_dim, g_bias=self.dt_bias) + beta = 
def _forward(
    self,
    q_proj_states: torch.Tensor,
    k_proj_states: torch.Tensor,
    v_proj_states: torch.Tensor,
    g1: torch.Tensor,
    beta: torch.Tensor,
    core_attn_out: torch.Tensor,
) -> None:
    """Run the short convolutions and the KDA recurrence, writing into
    ``core_attn_out`` in place.

    The method reads the per-layer attention metadata from the forward
    context, applies a causal 1-D convolution (with SiLU) to the q/k/v
    projections while updating the cached convolution state, and then runs
    either the chunked kernel (when the batch contains prefills) or the
    fused recurrent kernel (decode only), updating the cached recurrent
    state.

    Args:
        q_proj_states: Projected queries; indexed as ``[token, h * d]``
            by the ``rearrange`` below.
        k_proj_states: Projected keys, same layout as ``q_proj_states``.
        v_proj_states: Projected values, same layout as ``q_proj_states``.
        g1: Gate tensor passed to the KDA kernels as ``g``.
        beta: Beta tensor passed to the KDA kernels.
        core_attn_out: Pre-allocated output buffer; rows
            ``[0, :num_actual_tokens]`` are overwritten.

    Returns:
        None. Nothing is written when no attention metadata is available.
    """
    forward_context = get_forward_context()
    attn_metadata: AttentionMetadata = forward_context.attn_metadata

    if attn_metadata is None:
        # V1 profile run: no metadata, so there is nothing to compute.
        return

    # Metadata is keyed per layer; pick this layer's entry by its prefix.
    assert isinstance(attn_metadata, dict)
    attn_metadata = attn_metadata[self.prefix]
    assert isinstance(attn_metadata, GDNAttentionMetadata)
    has_initial_state = attn_metadata.has_initial_state
    non_spec_query_start_loc = attn_metadata.non_spec_query_start_loc
    non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor  # noqa: E501
    num_actual_tokens = attn_metadata.num_actual_tokens
    constant_caches = self.kv_cache[forward_context.virtual_engine]

    # Drop any rows beyond the real tokens (presumably padding -- TODO confirm).
    q_proj_states = q_proj_states[:num_actual_tokens]
    k_proj_states = k_proj_states[:num_actual_tokens]
    v_proj_states = v_proj_states[:num_actual_tokens]
    g1 = g1[:num_actual_tokens]
    beta = beta[:num_actual_tokens]

    (conv_state_q, conv_state_k, conv_state_v, recurrent_state) = constant_caches
    # deal with strides: the conv kernels get the last two cache dims swapped
    conv_state_q = conv_state_q.transpose(-1, -2)
    conv_state_k = conv_state_k.transpose(-1, -2)
    conv_state_v = conv_state_v.transpose(-1, -2)

    # Collapse the conv weights to 2-D using dims 0 and 2
    # (assumes dim 1 has size 1 -- TODO confirm).
    q_conv_weights = self.q_conv1d.weight.view(
        self.q_conv1d.weight.size(0), self.q_conv1d.weight.size(2)
    )
    k_conv_weights = self.k_conv1d.weight.view(
        self.k_conv1d.weight.size(0), self.k_conv1d.weight.size(2)
    )
    v_conv_weights = self.v_conv1d.weight.view(
        self.v_conv1d.weight.size(0), self.v_conv1d.weight.size(2)
    )
    if attn_metadata.num_prefills > 0:
        # Prefill path: causal_conv1d_fn is fed the transposed inputs and
        # its result is transposed back to the original layout.
        q_proj_states = q_proj_states.transpose(0, 1)
        k_proj_states = k_proj_states.transpose(0, 1)
        v_proj_states = v_proj_states.transpose(0, 1)
        q = causal_conv1d_fn(
            q_proj_states,
            q_conv_weights,
            self.q_conv1d.bias,
            activation="silu",
            conv_states=conv_state_q,
            has_initial_state=has_initial_state,
            cache_indices=non_spec_state_indices_tensor,
            query_start_loc=non_spec_query_start_loc,
            metadata=attn_metadata,
        ).transpose(0, 1)
        k = causal_conv1d_fn(
            k_proj_states,
            k_conv_weights,
            self.k_conv1d.bias,
            activation="silu",
            conv_states=conv_state_k,
            has_initial_state=has_initial_state,
            cache_indices=non_spec_state_indices_tensor,
            query_start_loc=non_spec_query_start_loc,
            metadata=attn_metadata,
        ).transpose(0, 1)
        v = causal_conv1d_fn(
            v_proj_states,
            v_conv_weights,
            self.v_conv1d.bias,
            activation="silu",
            conv_states=conv_state_v,
            has_initial_state=has_initial_state,
            cache_indices=non_spec_state_indices_tensor,
            query_start_loc=non_spec_query_start_loc,
            metadata=attn_metadata,
        ).transpose(0, 1)
    else:
        # Decode-only path: single-step convolution update against the
        # cached conv state, one state index per token.
        decode_conv_indices = non_spec_state_indices_tensor[
            : attn_metadata.num_actual_tokens
        ]
        q = causal_conv1d_update(
            q_proj_states,
            conv_state_q,
            q_conv_weights,
            self.q_conv1d.bias,
            activation="silu",
            conv_state_indices=decode_conv_indices,
            validate_data=True,
        )
        k = causal_conv1d_update(
            k_proj_states,
            conv_state_k,
            k_conv_weights,
            self.k_conv1d.bias,
            activation="silu",
            conv_state_indices=decode_conv_indices,
            validate_data=True,
        )
        v = causal_conv1d_update(
            v_proj_states,
            conv_state_v,
            v_conv_weights,
            self.v_conv1d.bias,
            activation="silu",
            conv_state_indices=decode_conv_indices,
            validate_data=True,
        )

    # Split the fused (h d) axis into heads and add a leading batch dim of 1.
    q, k, v = map(
        lambda x: rearrange(x, "n (h d) -> 1 n h d", d=self.head_dim), (q, k, v)
    )

    if attn_metadata.num_prefills > 0:
        # Sequences without an initial state must start from a zeroed
        # recurrent state, so clear their cache slots first.
        zero_idx = non_spec_state_indices_tensor[~has_initial_state]
        recurrent_state[zero_idx] = 0
        initial_state = recurrent_state[non_spec_state_indices_tensor].contiguous()
        (
            core_attn_out_non_spec,
            last_recurrent_state,
        ) = chunk_kda(
            q=q,
            k=k,
            v=v,
            g=g1,
            beta=beta,
            initial_state=initial_state,
            output_final_state=True,
            use_qk_l2norm_in_kernel=True,
            cu_seqlens=non_spec_query_start_loc,
        )
        # Init cache: write the final states back to the gathered slots.
        recurrent_state[non_spec_state_indices_tensor] = last_recurrent_state
    else:
        # The returned state is not written back here; the kernel is given
        # the whole cache plus slot indices (presumably updating it in
        # place -- TODO confirm against fused_recurrent_kda).
        (
            core_attn_out_non_spec,
            last_recurrent_state,
        ) = fused_recurrent_kda(
            q=q,
            k=k,
            v=v,
            g=g1,
            beta=beta,
            initial_state=recurrent_state,
            use_qk_l2norm_in_kernel=True,
            cu_seqlens=non_spec_query_start_loc[: attn_metadata.num_decodes + 1],
            ssm_state_indices=non_spec_state_indices_tensor,
        )
    # Copy the result for the real tokens into the caller's buffer.
    core_attn_out[0, :num_actual_tokens] = core_attn_out_non_spec[
        0, :num_actual_tokens
    ]
@CustomOp.register("rms_norm_qk")
class RMSNormQK(CustomOp):
    """
    Root Mean Square Normalization for Query/Key tensors.

    Computes:
        q -> w_q * q / sqrt(E[q^2] + eps)
        k -> w_k * k / sqrt(E[k^2] + eps)

    The layer owns no parameters: the weights are supplied by the caller on
    every forward call.
    """

    def __init__(
        self,
        hidden_size_q: int,
        hidden_size_k: int,
        eps: float = 1e-6,
    ) -> None:
        """Store the expected last-dimension sizes and the epsilon.

        Args:
            hidden_size_q: Required size of the last dimension of ``input_q``.
            hidden_size_k: Required size of the last dimension of ``input_k``.
            eps: Value added to the mean square before the reciprocal sqrt.
        """
        super().__init__()

        self.hidden_size_q = hidden_size_q
        self.hidden_size_k = hidden_size_k
        self.variance_epsilon = eps

    def forward_native(
        self,
        input_q: torch.Tensor,
        input_k: torch.Tensor,
        weight_q: torch.Tensor,
        weight_k: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """PyTorch-native implementation.

        The statistics and the weight multiplication are computed in
        float32; the results are cast back to each input's dtype.

        Raises:
            ValueError: If the last dimension of an input does not match
                the configured hidden size.
            TypeError: If ``input_q`` and ``input_k`` have different dtypes.
        """
        if input_q.shape[-1] != self.hidden_size_q:
            raise ValueError(
                f"[RMSNormQK] Expected input_q last dim = {self.hidden_size_q}, "
                f"but got {input_q.shape[-1]}"
            )
        if input_k.shape[-1] != self.hidden_size_k:
            raise ValueError(
                f"[RMSNormQK] Expected input_k last dim = {self.hidden_size_k}, "
                f"but got {input_k.shape[-1]}"
            )
        if input_q.dtype != input_k.dtype:
            raise TypeError(
                f"[RMSNormQK] Expected input_q and input_k have same dtype, "
                f"but got {input_q.dtype} vs {input_k.dtype}"
            )

        xq = input_q.to(torch.float32)
        xk = input_k.to(torch.float32)

        var_q = xq.pow(2).mean(dim=-1, keepdim=True)
        var_k = xk.pow(2).mean(dim=-1, keepdim=True)

        out_q = xq * torch.rsqrt(var_q + self.variance_epsilon)
        out_k = xk * torch.rsqrt(var_k + self.variance_epsilon)

        out_q = out_q * weight_q
        out_k = out_k * weight_k

        return out_q.to(input_q.dtype), out_k.to(input_k.dtype)

    def forward_cuda(
        self,
        input_q: torch.Tensor,
        input_k: torch.Tensor,
        weight_q: torch.Tensor,
        weight_k: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """CUDA path: validate the inputs, then call the fused custom op.

        Raises:
            ValueError: If the last dimension of an input does not match
                the configured hidden size.
            TypeError: If ``input_q`` and ``input_k`` have different dtypes.
        """
        if input_q.shape[-1] != self.hidden_size_q:
            raise ValueError(
                f"[RMSNormQK] CUDA path: Expected input_q last dim = {self.hidden_size_q}, "
                f"but got {input_q.shape[-1]}"
            )
        if input_k.shape[-1] != self.hidden_size_k:
            raise ValueError(
                f"[RMSNormQK] CUDA path: Expected input_k last dim = {self.hidden_size_k}, "
                f"but got {input_k.shape[-1]}"
            )
        if input_q.dtype != input_k.dtype:
            raise TypeError(
                f"[RMSNormQK] Expected input_q and input_k to have same dtype, "
                f"but got {input_q.dtype} vs {input_k.dtype}"
            )
        return rms_norm_qk(
            input_q,
            input_k,
            weight_q,
            weight_k,
            self.variance_epsilon,
        )

    def forward_xpu(
        self,
        input_q: torch.Tensor,
        input_k: torch.Tensor,
        weight_q: torch.Tensor,
        weight_k: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """XPU path: normalise q and k with two separate IPEX rms_norm calls.

        Unlike the native and CUDA paths, no shape or dtype validation is
        performed here.
        """
        from vllm._ipex_ops import ipex_ops as ops

        out_q = ops.rms_norm(
            input_q,
            weight_q,
            self.variance_epsilon,
        )
        out_k = ops.rms_norm(
            input_k,
            weight_k,
            self.variance_epsilon,
        )

        return out_q, out_k

    def extra_repr(self) -> str:
        """Return the configuration fragment shown inside ``repr(module)``.

        ``nn.Module.__repr__`` already wraps this string as
        ``ClassName(<extra_repr>)``, so only the key/value pairs are
        returned, without the class name, an opening parenthesis or a
        trailing separator.
        """
        return (
            f"hidden_size_q={self.hidden_size_q}, "
            f"hidden_size_k={self.hidden_size_k}, "
            f"eps={self.variance_epsilon}"
        )
forward_static( + x: torch.Tensor, + variance_epsilon: float, + hidden_size: int, + orig_dtype: torch.dtype, + weight: torch.Tensor | None = None, + residual: torch.Tensor | None = None, + variance_size_override: int | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + """PyTorch-native implementation equivalent to forward().""" + x = x.to(torch.float32) + if residual is not None: + # residual promoted f16->f32 automatically, + # otherwise Inductor eliminates the casts to and from f16, + # increasing memory usage (and complicating pattern matching) + x = x + residual + residual = x.to(orig_dtype) + + if x.shape[-1] != hidden_size: + raise ValueError( + f"Expected hidden_size to be {hidden_size}, but found: {x.shape[-1]}" + ) + + if variance_size_override is None: + x_var = x + else: + if hidden_size < variance_size_override: + raise ValueError( + "Expected hidden_size to be at least " + f"{variance_size_override}, but found: {hidden_size}" + ) + + x_var = x[:, :, :variance_size_override] + + variance = x_var.pow(2).mean(dim=-1, keepdim=True) + + x = x * torch.rsqrt(variance + variance_epsilon) + x = x.to(orig_dtype) + if weight is not None: + x = x * weight + if residual is None: + return x + else: + return x, residual + + def forward_native( + self, + x: torch.Tensor, + residual: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + """PyTorch-native implementation equivalent to forward().""" + + return self.forward_static( + x, + self.variance_epsilon, + self.hidden_size, + x.dtype, + self.weight.data if self.has_weight else None, + residual, + self.variance_size_override, + ) + + def forward_cuda( + self, + x: torch.Tensor, + residual: torch.Tensor | None = None, + residual_alpha: float = 1.0, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if self.variance_size_override is not None: + return self.forward_native(x, residual) + + add_residual = residual is not None + if add_residual: + return 
@CustomOp.register("gemma_rms_norm")
class GemmaRMSNorm(CustomOp):
    """Gemma-flavoured RMS normalization.

    It departs from the plain RMSNorm in two ways:
        1. the scale applied is (1 + w) rather than w;
        2. the scaled result is cast back to the input dtype afterwards,
           i.e. (x * w).to(orig_dtype) rather than x.to(orig_dtype) * w.
    """

    def __init__(
        self,
        hidden_size: int,
        eps: float = 1e-6,
    ) -> None:
        super().__init__()
        # Zero-initialised because the effective scale is (1 + weight).
        self.weight = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    @staticmethod
    def forward_static(
        weight: torch.Tensor,
        variance_epsilon: float,
        x: torch.Tensor,
        residual: torch.Tensor | None,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        """Stateless PyTorch implementation shared by every forward path.

        Returns the normalised tensor, or ``(normalised, new_residual)`` when
        a residual is supplied (the new residual is ``x + residual``).
        """
        input_dtype = x.dtype
        if residual is not None:
            # float16 inputs are summed in float32; other dtypes add as-is.
            if input_dtype == torch.float16:
                x = x.float() + residual.float()
            else:
                x = x + residual
            residual = x

        x_fp32 = x.float()
        mean_square = x_fp32.pow(2).mean(dim=-1, keepdim=True)
        normed = x_fp32 * torch.rsqrt(mean_square + variance_epsilon)
        # Llama does x.to(float16) * w whilst Gemma is (x * w).to(float16)
        # See https://github.com/huggingface/transformers/pull/29402
        scaled = normed * (1.0 + weight.float())
        result = scaled.to(input_dtype)
        if residual is None:
            return result
        return result, residual

    def forward_native(
        self,
        x: torch.Tensor,
        residual: torch.Tensor | None = None,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        """Apply ``forward_static`` with this module's weight and epsilon."""
        return self.forward_static(self.weight.data, self.variance_epsilon, x, residual)

    def forward_cuda(
        self,
        x: torch.Tensor,
        residual: torch.Tensor | None = None,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        """CUDA path: lazily wrap ``forward_static`` in ``torch.compile``.

        While an outer compilation is already tracing, the native path is
        used directly.
        """
        if torch.compiler.is_compiling():
            return self.forward_native(x, residual)

        already_compiled = getattr(self, "_is_compiled", False)
        if not already_compiled:
            # Shadow the static method with a compiled version on this
            # instance, once.
            self.forward_static = torch.compile(  # type: ignore
                self.forward_static
            )
            self._is_compiled = True
        return self.forward_native(x, residual)
+ + This is a native PyTorch implementation that supports: + - Standard RMS normalization + - Group RMS normalization + - Optional gating with SiLU activation + """ + + def __init__( + self, + hidden_size: int, + eps: float = 1e-5, + group_size: int | None = None, + norm_before_gate: bool = False, + device: torch.device | None = None, + dtype: torch.dtype | None = None, + ): + """Initialize RMSNormGated. + + Args: + hidden_size: Size of the hidden dimension + eps: Epsilon for numerical stability + group_size: If not None, do GroupNorm with each group + having group_size elements. + group_size=None is equivalent to group_size=hidden_size + (i.e. there's only 1 group). + norm_before_gate: If True and z is provided: out = norm(x) * silu(z) + If False and z is provided: out = norm(x * silu(z)) + device: Device to create parameters on + dtype: Data type for parameters + """ + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + self.register_parameter("bias", None) + self.group_size = group_size + self.norm_before_gate = norm_before_gate + self.reset_parameters() + + def reset_parameters(self): + torch.nn.init.ones_(self.weight) + + def forward_native( + self, x: torch.Tensor, z: torch.Tensor | None = None + ) -> torch.Tensor: + """ + Native PyTorch implementation of RMS normalization with gating. 
class LayerNorm(nn.Module):
    """
    Layer normalization over the last dimension, evaluated in float32.

    The affine ``weight`` and ``bias`` are float32 parameters; the result is
    cast back to the dtype of the input.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.dim = dim
        self.eps = eps
        # Registration order (weight, then bias) is kept for state_dict.
        self.weight = nn.Parameter(torch.ones(dim, dtype=torch.float32))
        self.bias = nn.Parameter(torch.zeros(dim, dtype=torch.float32))

    def forward(self, x: torch.Tensor):
        x_fp32 = x.float()
        normalized = F.layer_norm(
            x_fp32,
            normalized_shape=(self.dim,),
            weight=self.weight,
            bias=self.bias,
            eps=self.eps,
        )
        return normalized.type_as(x)
@triton.jit
def _fwd_kv_parallel(
    K,
    V,
    K_decay,
    KV,
    b: tl.constexpr,
    h: tl.constexpr,
    n,
    d: tl.constexpr,
    e: tl.constexpr,
    BLOCK: tl.constexpr,
    NUM_BLOCK,
    D_FBLOCK: tl.constexpr,
    E_FBLOCK: tl.constexpr,
    NUM_FBLOCK: tl.constexpr,
    CBLOCK: tl.constexpr,
    NUM_CBLOCK: tl.constexpr,
):
    # This kernel computes the key-value outer
    # products for each block in parallel
    #
    # One program handles one (batch-head, block) pair: it accumulates
    # sum_t decay[t] * k[t]^T v[t] over the rows of that block, walking the
    # block in sub-blocks of CBLOCK rows, and writes a [D_FBLOCK, E_FBLOCK]
    # tile into KV.
    #
    #   K, V     input pointers, addressed as off_bh * n * d (resp. * e)
    #   K_decay  per-head decay table, addressed as off_h * BLOCK + position
    #   KV       output pointer, addressed as
    #            off_bh * NUM_BLOCK * d * e + off_block * d * e
    #   b, NUM_FBLOCK are accepted but not read in this kernel body.
    off_bh = tl.program_id(0)  # batch-head index
    off_block = tl.program_id(1)  # block index

    off_h = off_bh % h  # head index

    block_offset = off_block * BLOCK

    # Calculate offsets for the current block
    k_block_offset = block_offset * d
    v_block_offset = block_offset * e
    kv_block_offset = off_block * d * e

    # Calculate base offsets for the current batch and head
    k_offset = off_bh * n * d
    v_offset = off_bh * n * e
    kv_offset = off_bh * NUM_BLOCK * d * e

    # Calculate pointers to the key, value, and key-value tensors
    # (K is addressed transposed: feature index on rows, position on columns)
    K_trans_block_ptr = (
        K
        + k_offset
        + k_block_offset
        + tl.arange(0, CBLOCK)[None, :] * d
        + tl.arange(0, D_FBLOCK)[:, None]
    )
    V_block_ptr = (
        V
        + v_offset
        + v_block_offset
        + tl.arange(0, CBLOCK)[:, None] * e
        + tl.arange(0, E_FBLOCK)[None, :]
    )
    KV_block_ptr = (
        KV
        + kv_offset
        + kv_block_offset
        + tl.arange(0, D_FBLOCK)[:, None] * e
        + tl.arange(0, E_FBLOCK)[None, :]
    )

    # Load the decay factors for the current head and block
    k_decay_ptr = K_decay + off_h * BLOCK + tl.arange(0, CBLOCK)[None, :]

    kv_index = tl.arange(0, CBLOCK)

    # Initialize the key-value outer product accumulator
    kv = tl.zeros([D_FBLOCK, E_FBLOCK], dtype=tl.float32)

    # Handle the last block which might be smaller than BLOCK
    # split_n    : number of valid rows in this block
    # left_shift : rows needed to round split_n up to a multiple of CBLOCK;
    #              the loads below are shifted back by this many rows
    # num_blocks : number of CBLOCK-sized sub-blocks to visit
    split_n = n - (NUM_BLOCK - 1) * BLOCK if off_block == NUM_BLOCK - 1 else BLOCK
    left_shift = tl.cdiv(split_n, CBLOCK) * CBLOCK - split_n
    num_blocks = min(tl.cdiv(split_n, CBLOCK), NUM_CBLOCK)
    # Skip the leading decay entries so the last visited sub-block lines up
    # with the end of the decay table.
    k_decay_ptr += (NUM_CBLOCK - num_blocks) * CBLOCK

    # Process all sub-blocks in the current block
    for j in range(num_blocks):
        # Only the first sub-block (j == 0) has a positive bound; for j >= 1
        # the bound is <= 0, so the mask keeps every row.
        left_bound = (1 - j) * left_shift
        # Load key and value, handling boundary conditions
        k_trans = tl.load(
            K_trans_block_ptr - left_shift * d,
            mask=kv_index[None, :] >= left_bound,
            other=0.0,
        )
        v = tl.load(
            V_block_ptr - left_shift * e,
            mask=kv_index[:, None] >= left_bound,
            other=0.0,
        )

        # Load decay factor and compute weighted key-value outer product
        k_decay = tl.load(k_decay_ptr)
        kv += tl.dot(k_trans * k_decay, v)

        # Move to the next sub-block
        K_trans_block_ptr += CBLOCK * d
        V_block_ptr += CBLOCK * e
        k_decay_ptr += CBLOCK

    # Store the result
    tl.store(KV_block_ptr, kv.to(KV_block_ptr.dtype.element_ty))
@triton.jit
def _fwd_none_diag_kernel(
    Q,
    Out,
    S,
    KV,
    b: tl.constexpr,
    h: tl.constexpr,
    n,
    d: tl.constexpr,
    e: tl.constexpr,
    BLOCK: tl.constexpr,
    NUM_BLOCK,
    E_FBLOCK: tl.constexpr,
    CBLOCK: tl.constexpr,
    NUM_CBLOCK: tl.constexpr,
):
    # This kernel computes the non-diagonal blocks of the attention matrix
    # Each non-diagonal block represents attention
    # where queries attend to keys in different blocks
    #
    # One program handles CBLOCK query rows of one block for one
    # batch-head: it multiplies them by the [d, E_FBLOCK] tile read from KV
    # for that block, scales each row by exp(-s * position_in_block), and
    # adds the product to the values already present in Out.
    #
    #   Q    query pointer, addressed as off_bh * n * d
    #   Out  output pointer, read and then overwritten (read-modify-write)
    #   S    per-head decay rate, read at S + off_h
    #   KV   per-block state, addressed as
    #        off_bh * NUM_BLOCK * d * e + off_n * d * e
    #   b is accepted but not read in this kernel body.
    off_bh = tl.program_id(0)  # batch-head index
    off_h = off_bh % h  # head index

    # Axis 1 of the grid enumerates (block, sub-block) pairs.
    off_nc = tl.program_id(1)
    off_n = off_nc // NUM_CBLOCK  # block index
    off_c = off_nc % NUM_CBLOCK  # sub-block index
    off_e = tl.program_id(2)  # output feature block index

    n_offset = off_n * BLOCK
    c_offset = off_c * CBLOCK
    e_offset = off_e * E_FBLOCK
    block_offset = n_offset + c_offset

    # Calculate offsets for the current batch, head, and block
    q_offset = off_bh * n * d + (n_offset + c_offset) * d
    o_offset = off_bh * n * e + (n_offset + c_offset) * e + e_offset
    kv_offset = off_bh * NUM_BLOCK * d * e + off_n * d * e + e_offset

    # Calculate pointers to the query, output, and key-value tensors
    Q_block_ptr = (
        Q + q_offset + tl.arange(0, CBLOCK)[:, None] * d + tl.arange(0, d)[None, :]
    )
    O_block_ptr = (
        Out
        + o_offset
        + tl.arange(0, CBLOCK)[:, None] * e
        + tl.arange(0, E_FBLOCK)[None, :]
    )
    KV_block_ptr = (
        KV + kv_offset + tl.arange(0, d)[:, None] * e + tl.arange(0, E_FBLOCK)[None, :]
    )

    # Load the decay rate for the current head
    S_block_ptr = S + off_h
    s = tl.load(S_block_ptr)

    c_array = tl.arange(0, CBLOCK)

    # Load the key-value outer product for the current block
    kv = tl.load(KV_block_ptr).to(tl.float32)
    # Absolute sequence positions of the rows handled here; used to mask
    # rows at or beyond n.
    q_index = block_offset + tl.arange(0, CBLOCK)

    # Load query values
    q = tl.load(Q_block_ptr, mask=q_index[:, None] < n, other=0.0).to(tl.float32)

    # Compute decay factors for the current sub-block
    # (the exponent uses the row's position inside its block)
    q_decay = tl.exp(-s.to(tl.float32) * (off_c * CBLOCK + c_array[:, None]))

    # Compute non-diagonal attention output
    qkv_none_diag = tl.dot(q, kv) * q_decay

    # Load diagonal attention output (computed by _fwd_diag_kernel)
    qkv_diag = tl.load(O_block_ptr, mask=q_index[:, None] < n, other=0.0).to(tl.float32)

    # Combine diagonal and non-diagonal attention outputs
    qkv = qkv_diag + qkv_none_diag

    # Store the result
    tl.store(
        O_block_ptr, qkv.to(O_block_ptr.dtype.element_ty), mask=q_index[:, None] < n
    )
def lightning_attention(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    ed: torch.Tensor,
    block_size: int = 256,
    kv_history: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Run lightning attention, splitting the q/k feature dimension into chunks.

    The last dimension of ``q`` and ``k`` is cut into slices of 128 (or 64
    when it is smaller than 128); every slice is passed to the attention
    function together with the full ``v`` and the partial outputs are summed.

    Args:
        q: Query tensor of shape [batch, heads, seq_len, dim]
        k: Key tensor of shape [batch, heads, seq_len, dim]
        v: Value tensor of shape [batch, heads, seq_len, dim_v]
        ed: Decay rate tensor; a 1-D tensor is viewed as [1, heads, 1, 1]
        block_size: Accepted for API compatibility; not read by this function
        kv_history: Optional key-value history from previous computations.
            It is cloned before use; zeros are used when it is None.

    Returns:
        output: Sum of the per-slice attention outputs
        kv: Key-value tensor returned by the attention call on the last slice
    """
    qk_dim = q.shape[-1]
    value_dim = v.shape[-1]

    if ed.dim() == 1:
        ed = ed.view(1, -1, 1, 1)

    # Slice boundaries along the q/k feature dimension.
    step = 64 if qk_dim < 128 else 128
    assert qk_dim % step == 0, f"Dimension d ({qk_dim}) must be divisible by m ({step})"
    bounds = [step * idx for idx in range(qk_dim // step + 1)]
    if bounds[-1] != qk_dim:
        bounds.append(qk_dim)

    # Work on a private history tensor so the caller's tensor is untouched.
    if kv_history is not None:
        kv_history = kv_history.clone().contiguous()
    else:
        kv_history = torch.zeros(
            (q.shape[0], q.shape[1], qk_dim, value_dim),
            dtype=torch.float32,
            device=q.device,
        )

    output = 0
    for start, stop in zip(bounds[:-1], bounds[1:]):
        q_slice = q[..., start:stop]
        k_slice = k[..., start:stop]
        o, kv = lightning_attention_(q_slice, k_slice, v, ed, kv_history)
        output = output + o
    return output, kv
def linear_decode_forward_triton(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    kv_caches: torch.Tensor,
    slope_rate: torch.Tensor,
    slot_idx: torch.Tensor,
    BLOCK_SIZE: int = 32,
) -> torch.Tensor:
    """
    Perform linear attention decoding using Triton kernels.

    Args:
        q: Query tensor of shape [B, H, 1, D]
        k: Key tensor of shape [B, H, 1, D]
        v: Value tensor of shape [B, H, 1, D]
        kv_caches: Key-value cache tensor; its first four strides are passed
            to the kernel, which updates the cache in place
        slope_rate: Decay rate tensor, read per head by the kernel
        slot_idx: Slot indices for batches; the kernel returns early for an
            entry equal to -1
        BLOCK_SIZE: Number of value-dimension columns handled per program

    Returns:
        output: Attention output tensor of shape [B, H * D]
    """
    B, H, _, D = q.shape
    assert k.shape == (B, H, 1, D)
    assert v.shape == (B, H, 1, D)

    # Initialize output tensor
    output = torch.empty_like(q)

    # Set grid dimensions for the kernel.
    # Ceil-division is required: floor division launches no program at all
    # when D < BLOCK_SIZE and drops the tail when D is not a multiple of
    # BLOCK_SIZE, leaving part of `output` uninitialised. The kernel masks
    # columns with `v_d_offsets < D`, so the extra partial block is safe.
    num_d_blocks = (D + BLOCK_SIZE - 1) // BLOCK_SIZE
    grid = (B, H, num_d_blocks)

    # Calculate strides for tensors (q's strides are reused for k, v and
    # the output inside the kernel)
    qkv_b_stride = q.stride(0)
    qkv_h_stride = q.stride(1)

    cache_b_stride = kv_caches.stride(0)
    cache_h_stride = kv_caches.stride(1)
    cache_d0_stride = kv_caches.stride(2)
    cache_d1_stride = kv_caches.stride(3)

    # Launch the kernel
    _linear_attn_decode_kernel[grid](
        q,
        k,
        v,
        kv_caches,
        slope_rate,
        slot_idx,
        output,
        D,
        qkv_b_stride,
        qkv_h_stride,
        cache_b_stride,
        cache_h_stride,
        cache_d0_stride,
        cache_d1_stride,
        BLOCK_SIZE=BLOCK_SIZE,
    )

    # Reshape output and return
    output = rearrange(output, "b h n d -> b n (h d)")
    return output.squeeze(1).contiguous()
get_tensor_model_parallel_world_size, + split_tensor_along_last_dim, + tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce, +) +import vllm.envs as envs +from vllm.logger import init_logger +from vllm.model_executor.custom_op import CustomOp +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, + QuantizeMethodBase, +) +from vllm.model_executor.layers.utils import dispatch_unquantized_gemm +from vllm.model_executor.parameter import ( + BasevLLMParameter, + BlockQuantScaleParameter, + ModelWeightParameter, + PackedColumnParameter, + PackedvLLMParameter, + PerTensorScaleParameter, + RowvLLMParameter, +) +from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform +from compressed_tensors.quantization import QuantizationStrategy + +logger = init_logger(__name__) + +WEIGHT_LOADER_V2_SUPPORTED = [ + "CompressedTensorsLinearMethod", + "CompressedTensorsLinearTransformMethod", + "BitBLASLinearMethod", + "GPTQBitBLASLinearMethod", + "AWQMarlinLinearMethod", + "AWQLinearMethod", + "GPTQMarlinLinearMethod", + "Fp8LinearMethod", + "MarlinLinearMethod", + "GPTQMarlin24LinearMethod", + "TPUInt8LinearMethod", + "GPTQLinearMethod", + "FBGEMMFp8LinearMethod", + "ModelOptFp8LinearMethod", + "IPEXAWQLinearMethod", + "IPEXGPTQLinearMethod", + "HQQMarlinMethod", + "QuarkLinearMethod", + "ModelOptNvFp4LinearMethod", + "PetitNvFp4LinearMethod", +] + +LINEAR_OPT_SUPPORTED = [ + "ColumnParallelLinear", + "ReplicatedLinear", + "RowParallelLinear", + "QKVParallelLinear" +] + +def weight_quant(input): + assert input.dim() == 2 + qmax = 127.0 + abs_max = torch.abs(input).max(dim=1, keepdim=True)[0] + scale = abs_max / qmax + assert scale.shape == (input.shape[0], 1) + quantized = torch.round(input / scale) + quantized = torch.clamp(quantized, -qmax, qmax) + return quantized.to(torch.int8), scale.to(torch.float32) + +def adjust_bitblas_shard(param, shard_size, shard_offset): + bitblas_tile_size = 
getattr(param, "bitblas_tile_size", None) + if bitblas_tile_size is not None: + return (shard_size // bitblas_tile_size, shard_offset // bitblas_tile_size) + + return shard_size, shard_offset + + +def adjust_marlin_shard(param, shard_size, shard_offset): + marlin_tile_size = getattr(param, "marlin_tile_size", None) + if marlin_tile_size is None: + return shard_size, shard_offset + + return shard_size * marlin_tile_size, shard_offset * marlin_tile_size + + +def adjust_bitsandbytes_4bit_shard( + param: Parameter, shard_offsets: dict[str, tuple[int, int]], loaded_shard_id: str +) -> tuple[int, int]: + """Adjust the quantization offsets and sizes for BitsAndBytes sharding.""" + + total, _ = shard_offsets["total"] + orig_offset, orig_size = shard_offsets[loaded_shard_id] + + quantized_total = param.data.shape[0] + quantized_offset = orig_offset * quantized_total // total + quantized_size = orig_size * quantized_total // total + + return quantized_size, quantized_offset + + +def adjust_scalar_to_fused_array(param, loaded_weight, shard_id): + """For fused modules (QKV and MLP) we have an array of length + N that holds 1 scale for each "logical" matrix. So the param + is an array of length N. The loaded_weight corresponds to + one of the shards on disk. Here, we slice the param based on + the shard_id for loading. + """ + qkv_idxs = {"q": 0, "k": 1, "v": 2} + + if isinstance(shard_id, str): + shard_id = qkv_idxs[shard_id] + elif not isinstance(shard_id, int): + raise ValueError(f"Unknown Shard Id {shard_id}") + + # AutoFP8 scales do not have a shape + # compressed-tensors scales do have a shape + if len(loaded_weight.shape) != 0: + assert loaded_weight.shape[0] == 1 + loaded_weight = loaded_weight[0] + + return param[shard_id], loaded_weight + + +# TODO(Isotr0py): We might need a more flexible structure to handle +# bitsandbytes shard offsets. +def left_shift_bitsandbytes_4bit_shard(bnb_weight_attrs: dict[str, Any]): + """ + Separate the BitsAndBytes 4-bit shard. 
+ + For example, given bnb weight attributes as below: + { + 'bnb_shard_offsets': array([0, 4, 8, 16]), + 'bnb_quant_state': {0: ..., 1: ..., 2: ...}, + } + + The function will return: + { + 'bnb_shard_offsets': array([0, 4]), + 'bnb_quant_state': {0: ...}, + } + and + { + 'bnb_shard_offsets': array([0, 4, 12]), + 'bnb_quant_state': {0: ..., 1: ...}, + } + """ + shard_offsets = bnb_weight_attrs["bnb_shard_offsets"] + offset_l = shard_offsets[:2] + offset_r = shard_offsets[1:] - shard_offsets[1] + quant_state_l = {0: bnb_weight_attrs["bnb_quant_state"][0]} + quant_state_r = { + i - 1: bnb_weight_attrs["bnb_quant_state"][i] + for i in range(1, len(shard_offsets) - 1) + } + left = dict(bnb_shard_offsets=offset_l, bnb_quant_state=quant_state_l) + right = dict(bnb_shard_offsets=offset_r, bnb_quant_state=quant_state_r) + return left, right + + +class LinearMethodBase(QuantizeMethodBase): + """Base class for different (maybe quantized) linear methods.""" + + @abstractmethod + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + """Create weights for a linear layer. + The weights will be set as attributes of the layer. + + Args: + layer: The layer that is using the LinearMethodBase factory. + input_size_per_partition: Size of the weight input dim on rank X. + output_partition_sizes: Sizes of the output dim of each logical + weight on rank X. E.g., output_partition_sizes for QKVLinear + is a list contains the width of Wq, Wk, Wv on rank X. + input_size: Size of the input dim of the weight across all ranks. + output_size: Size of the output dim of the weight across all ranks. + params_dtype: Datatype of the parameters. 
+ """ + raise NotImplementedError + + @abstractmethod + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + """Apply the weights in layer to the input tensor. + Expects create_weights to have been called before on the layer.""" + raise NotImplementedError + + +class UnquantizedLinearMethod(LinearMethodBase): + """Linear method without quantization.""" + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + # This method creates unquantized linear weights. + # The weights are not quantized, and they are not sharded. + # The amount of memory allocated for the weights is + # sum(output_partition_sizes) * input_size_per_partition. + weight = Parameter(torch.empty(sum(output_partition_sizes), + input_size_per_partition, + dtype=params_dtype), + requires_grad=False) + set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + + layer.register_parameter("weight", weight) + set_weight_attrs(weight, extra_weight_attrs) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + if current_platform.is_cpu(): + from vllm.model_executor.layers.utils import dispatch_cpu_unquantized_gemm + + dispatch_cpu_unquantized_gemm(layer, remove_weight=True) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + return dispatch_unquantized_gemm()(layer, x, layer.weight, bias) + + +class LinearBase(CustomOp): + """Base linear layer. + + Args: + input_size: input dimension of the linear layer. + output_size: output dimension of the linear layer. + skip_bias_add: If true, skip adding bias but instead return it. + params_dtype: Data type for the parameters. + quant_config: Quantization configure. + prefix: Prefix for parameter names. 
+ return_bias: If true, return bias together with outputs in forward pass. + disable_tp: If true, tensor parallelism will be disabled for this layer. + """ + + def __init__( + self, + input_size: int, + output_size: int, + skip_bias_add: bool = False, + params_dtype: torch.dtype | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + *, + return_bias: bool = True, + disable_tp: bool = False, + ): + super().__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + self.skip_bias_add = skip_bias_add + if params_dtype is None: + params_dtype = torch.get_default_dtype() + self.params_dtype = params_dtype + self.quant_config = quant_config + self.prefix = prefix + self.opt_flag = quant_config is None and envs.VLLM_LINEAR_OPT_LEVEL == 1 and \ + self.__class__.__name__ in LINEAR_OPT_SUPPORTED + + opt_exclude_layers = envs.VLLM_OPT_EXCLUDE_LAYERS + opt_exclude_layers = ast.literal_eval(opt_exclude_layers) if opt_exclude_layers.strip() else "" + if isinstance(opt_exclude_layers, tuple): + layer_info = re.search(r'\.(\d+)', prefix) + if layer_info is not None and int(layer_info.group(1)) in opt_exclude_layers: + self.opt_flag = False + + if self.opt_flag: + from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import CompressedTensorsLinearMethod + from vllm.model_executor.layers.quantization.compressed_tensors.schemes import CompressedTensorsW8A8Int8 + self.quant_method: Optional[ + QuantizeMethodBase] = CompressedTensorsLinearMethod(None) + self.scheme = CompressedTensorsW8A8Int8(QuantizationStrategy.CHANNEL, False, True) + elif quant_config is None: + self.quant_method: QuantizeMethodBase | None = UnquantizedLinearMethod() + else: + self.quant_method = quant_config.get_quant_method(self, prefix=prefix) + self.return_bias = return_bias + self.output_padding_size = 0 + self.disable_tp = disable_tp + self.tp_rank = get_tensor_model_parallel_rank() if not disable_tp else 
0 + self.tp_size = get_tensor_model_parallel_world_size() if not disable_tp else 1 + + def update_param_tp_status(self): + for param in self.parameters(): + if isinstance(param, BasevLLMParameter): + param.tp_rank = self.tp_rank + param.tp_size = self.tp_size + + +@CustomOp.register("replicated_linear") +class ReplicatedLinear(LinearBase): + """Replicated linear layer. + + Args: + input_size: input dimension of the linear layer. + output_size: output dimension of the linear layer. + bias: If true, add bias. + skip_bias_add: If true, skip adding bias but instead return it. + params_dtype: Data type for the parameters. + quant_config: Quantization configure. + prefix: The name of the layer in the state dict, including all parents + (e.g. model.layers.0.qkv_proj) + return_bias: If true, return bias together with outputs in forward pass. + disable_tp: Take no effect for replicated linear layers. + """ + + def __init__( + self, + input_size: int, + output_size: int, + bias: bool = True, + skip_bias_add: bool = False, + params_dtype: torch.dtype | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + *, + return_bias: bool = True, + disable_tp: bool = False, + ): + # If MergedReplicatedLinear, use output size of each partition. + if hasattr(self, "output_sizes"): + self.output_partition_sizes = self.output_sizes + else: + self.output_partition_sizes = [output_size] + + super().__init__( + input_size, + output_size, + skip_bias_add, + params_dtype, + quant_config, + prefix=prefix, + return_bias=return_bias, + disable_tp=disable_tp, + ) + + # All the linear layer supports quant method. 
+ assert self.quant_method is not None + self.quant_method.create_weights( + self, + self.input_size, + self.output_partition_sizes, + self.input_size, + self.output_size, + self.params_dtype, + weight_loader=self.weight_loader, + ) + + if bias: + self.bias = Parameter( + torch.empty(self.output_size, dtype=self.params_dtype) + ) + set_weight_attrs( + self.bias, + { + "output_dim": 0, + "weight_loader": self.weight_loader, + }, + ) + else: + self.register_parameter("bias", None) + + def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): + # If the weight on disk does not have a shape, give it one + # (such scales for AutoFp8). + # Special case for GGUF + + is_gguf_weight = getattr(param, "is_gguf_weight", False) + is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) + if is_gguf_weight_type: + param.weight_type = loaded_weight.item() + + # Materialize GGUF UninitializedParameter + if is_gguf_weight and isinstance(param, UninitializedParameter): + param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype) + + if len(loaded_weight.shape) == 0: + loaded_weight = loaded_weight.reshape(1) + + assert param.size() == loaded_weight.size(), ( + f"Tried to load weights of size {loaded_weight.size()}" + f"to a parameter of size {param.size()}" + ) + if self.opt_flag: + loaded_weight, scale = weight_quant(loaded_weight) + + param.data.copy_(loaded_weight) + if self.opt_flag: + params_dict = dict(self.named_parameters()) + scale_param = params_dict["weight_scale"] + scale_param.data.copy_(scale) + + def forward( + self, + x: torch.Tensor, + ) -> torch.Tensor | tuple[torch.Tensor, Parameter | None]: + bias = self.bias if not self.skip_bias_add else None + assert self.quant_method is not None + + output = self.quant_method.apply(self, x, bias) + output_bias = self.bias if self.skip_bias_add else None + + if not self.return_bias: + return output + return output, output_bias + + def extra_repr(self) -> str: + s = 
f"in_features={self.input_size}" + s += f", output_features={self.output_size}" + s += f", bias={self.bias is not None}" + return s + + +@CustomOp.register("column_parallel_linear") +class ColumnParallelLinear(LinearBase): + """Linear layer with column parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its second dimension as A = [A_1, ..., A_p]. + + Args: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. + bias: If true, add bias. + gather_output: If true, call all-gather on output and make Y available + to all GPUs, otherwise, every GPU will have its output + which is Y_i = XA_i + skip_bias_add: This was added to enable performance optimizations where + bias can be fused with other element-wise operations. we + skip adding bias but instead return it. + params_dtype: Data type for the parameters. + quant_config: Quantization configure. + output_sizes: list of output sizes packed into one output, like for QKV + the list would be size 3. + prefix: The name of the layer in the state dict, including all parents + (e.g. model.layers.0.qkv_proj) + return_bias: If true, return bias together with outputs in forward pass. + disable_tp: If true, weights matrix won't be sharded through tp rank. + """ + + def __init__( + self, + input_size: int, + output_size: int, + bias: bool = True, + gather_output: bool = False, + skip_bias_add: bool = False, + params_dtype: torch.dtype | None = None, + quant_config: QuantizationConfig | None = None, + output_sizes: list[int] | None = None, + prefix: str = "", + *, + return_bias: bool = True, + disable_tp: bool = False, + ): + # Divide the weight matrix along the last dimension. 
+ self.tp_rank = get_tensor_model_parallel_rank() if not disable_tp else 0 + self.tp_size = get_tensor_model_parallel_world_size() if not disable_tp else 1 + self.input_size_per_partition = input_size + self.output_size_per_partition = divide(output_size, self.tp_size) + self.output_partition_sizes = [self.output_size_per_partition] + # If QKV or MergedColumn, use output size of each partition. + if hasattr(self, "output_sizes"): + self.output_partition_sizes = [ + divide(output_size, self.tp_size) for output_size in self.output_sizes + ] + + super().__init__( + input_size, + output_size, + skip_bias_add, + params_dtype, + quant_config, + prefix, + return_bias=return_bias, + disable_tp=disable_tp, + ) + + self.gather_output = gather_output + + if output_sizes is None: + output_sizes = [output_size] + + assert self.quant_method is not None + self.quant_method.create_weights( + layer=self, + input_size_per_partition=self.input_size_per_partition, + output_partition_sizes=self.output_partition_sizes, + input_size=self.input_size, + output_size=self.output_size, + params_dtype=self.params_dtype, + weight_loader=( + self.weight_loader_v2 + if self.quant_method.__class__.__name__ in WEIGHT_LOADER_V2_SUPPORTED + else self.weight_loader + ), + ) + if bias: + self.bias = Parameter( + torch.empty(self.output_size_per_partition, dtype=params_dtype) + ) + set_weight_attrs( + self.bias, + { + "output_dim": 0, + "weight_loader": self.weight_loader, + }, + ) + else: + self.register_parameter("bias", None) + self.update_param_tp_status() + + def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): + output_dim = getattr(param, "output_dim", None) + + is_sharded_weight = getattr(param, "is_sharded_weight", False) + use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit + + # Special case for GGUF + 
is_gguf_weight = getattr(param, "is_gguf_weight", False) + is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) + if is_gguf_weight_type: + param.weight_type = loaded_weight.item() + + # Materialize GGUF UninitializedParameter + if is_gguf_weight and isinstance(param, UninitializedParameter): + final_shape = list(loaded_weight.shape) + if output_dim is not None: + assert final_shape[output_dim] % self.tp_size == 0 + final_shape[output_dim] = final_shape[output_dim] // self.tp_size + param.materialize(final_shape, dtype=loaded_weight.dtype) + + param_data = param.data + if output_dim is not None and not is_sharded_weight: + shard_size = param_data.shape[output_dim] + start_idx = self.tp_rank * shard_size + loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + + # Special case for loading scales off disk, which often do not + # have a shape (such as in the case of AutoFP8). + if len(loaded_weight.shape) == 0: + loaded_weight = loaded_weight.reshape(1) + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + def weight_loader_v2(self, param: BasevLLMParameter, loaded_weight: torch.Tensor): + # Special case for loading scales off disk, which often do not + # have a shape (such as in the case of AutoFP8). + if len(loaded_weight.shape) == 0: + assert loaded_weight.numel() == 1 + loaded_weight = loaded_weight.reshape(1) + + if self.opt_flag: + loaded_weight, scale = weight_quant(loaded_weight) + + param.load_column_parallel_weight(loaded_weight=loaded_weight) + if self.opt_flag: + params_dict = dict(self.named_parameters()) + scale_param = params_dict["weight_scale"] + scale_param.load_column_parallel_weight(loaded_weight=scale) + + def forward( + self, + input_, + ) -> torch.Tensor | tuple[torch.Tensor, Parameter | None]: + bias = self.bias if not self.skip_bias_add else None + + # Matrix multiply. 
+ assert self.quant_method is not None + output_parallel = self.quant_method.apply(self, input_, bias) + + if self.gather_output and self.tp_size > 1: + # All-gather across the partitions. + output = tensor_model_parallel_all_gather(output_parallel) + else: + output = output_parallel + output_bias = self.bias if self.skip_bias_add else None + if not self.return_bias: + return output + return output, output_bias + + def extra_repr(self) -> str: + s = f"in_features={self.input_size}" + s += f", output_features={self.output_size_per_partition}" + s += f", bias={self.bias is not None}" + s += f", tp_size={self.tp_size}" + s += f", gather_output={self.gather_output}" + return s + + +class MergedColumnParallelLinear(ColumnParallelLinear): + """Packed linear layers with column parallelism. + + Similar to ColumnParallelLinear, but the weight matrix is concatenated + along the output dimension. When the weight matrix is loaded, the + different partitions are sharded separately. + + Args: + input_size: input dimension of the linear layer. + output_sizes: list of output dimensions of the linear layer. + bias: If true, add bias. + gather_output: If true, call all-gather on output and make the output + available to all GPUs, otherwise, every GPU will have + its own output. + skip_bias_add: This was added to enable performance optimizations where + bias can be fused with other element-wise operations. we + skip adding bias but instead return it. + params_dtype: Data type for the parameters. + quant_config: Quantization configure. + prefix: The name of the layer in the state dict, including all parents + (e.g. model.layers.0.qkv_proj) + return_bias: If true, return bias together with outputs in forward pass. + disable_tp: If true, all weights matrix won't be sharded, this layer + will be treated as a "Replicated" MergedLinear. 
+ """ + + def __init__( + self, + input_size: int, + output_sizes: list[int], + bias: bool = True, + gather_output: bool = False, + skip_bias_add: bool = False, + params_dtype: torch.dtype | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + *, + return_bias: bool = True, + disable_tp: bool = False, + ): + self.output_sizes = output_sizes + self.tp_size = get_tensor_model_parallel_world_size() if not disable_tp else 1 + self.tp_rank = get_tensor_model_parallel_rank() if not disable_tp else 0 + + assert all(output_size % self.tp_size == 0 for output_size in output_sizes) + super().__init__( + input_size=input_size, + output_size=sum(output_sizes), + bias=bias, + gather_output=gather_output, + skip_bias_add=skip_bias_add, + params_dtype=params_dtype, + quant_config=quant_config, + prefix=prefix, + return_bias=return_bias, + disable_tp=disable_tp, + ) + + def weight_loader( + self, + param: Parameter, + loaded_weight: torch.Tensor, + loaded_shard_id: int | None = None, + ): + # Special case for GGUF + # initialize GGUF param after we know the quantize type + is_gguf_weight = getattr(param, "is_gguf_weight", False) + is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) + if is_gguf_weight_type: + if loaded_shard_id is not None: + param.data[loaded_shard_id].copy_(loaded_weight) + param.shard_weight_type[loaded_shard_id] = loaded_weight.item() + else: + param.shard_weight_type = { + i: loaded_weight.item() for i, _ in enumerate(self.output_sizes) + } + return + + if is_gguf_weight: + output_dim = getattr(param, "output_dim", None) + shard_size = loaded_weight.size(output_dim) // self.tp_size + start_idx = self.tp_rank * shard_size + + if loaded_shard_id is not None: + loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + param.shard_id.append(loaded_shard_id) + param.shard_id_map[loaded_shard_id] = len(param.data_container) + param.data_container.append(loaded_weight) + return + + param_data = 
param.data + output_dim = getattr(param, "output_dim", None) + # Special case for per-tensor scale to load scalar into fused array. + needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False) + + if loaded_shard_id is None: + # Loaded weight is already fused on disk (mlp). + # (e.g., Phi-3's gate_up_proj). + if output_dim is None: + if needs_scalar_to_array: + param_data, loaded_weight = adjust_scalar_to_fused_array( + param_data, loaded_weight, 0 + ) + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + return + current_shard_offset = 0 + use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + shard_offsets: list[tuple[int, int, int]] = [] + for i, output_size in enumerate(self.output_sizes): + shard_offsets.append((i, current_shard_offset, output_size)) + current_shard_offset += output_size + packed_dim = getattr(param, "packed_dim", None) + for shard_id, shard_offset, shard_size in shard_offsets: + # Special case for Quantization. + # If quantized, we need to adjust the offset and size to account + # for the packing. + if packed_dim == output_dim: + shard_size = shard_size // param.packed_factor + shard_offset = shard_offset // param.packed_factor + # Special case for Marlin. 
+ shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset + ) + + shard_size, shard_offset = adjust_bitblas_shard( + param, shard_size, shard_offset + ) + + if use_bitsandbytes_4bit: + index = list(itertools.accumulate([0] + self.output_sizes)) + orig_offsets = { + str(i): (index[i], size) + for i, size in enumerate(self.output_sizes) + } + orig_offsets["total"] = (self.output_size, 0) + shard_size, shard_offset = adjust_bitsandbytes_4bit_shard( + param, orig_offsets, str(shard_id) + ) + + loaded_weight_shard = loaded_weight.narrow( + output_dim, shard_offset, shard_size + ) + self.weight_loader(param, loaded_weight_shard, shard_id) + return + + assert loaded_shard_id < len(self.output_sizes) + if output_dim is not None: + shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size + shard_size = self.output_sizes[loaded_shard_id] // self.tp_size + # Special case for quantization. + # If quantized, we need to adjust the offset and size to account + # for the packing. + packed_dim = getattr(param, "packed_dim", None) + if packed_dim == output_dim: + shard_size = shard_size // param.packed_factor + shard_offset = shard_offset // param.packed_factor + # Special case for Marlin. 
+ shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset + ) + shard_size, shard_offset = adjust_bitblas_shard( + param, shard_size, shard_offset + ) + + use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit + + if use_bitsandbytes_4bit: + shard_size = loaded_weight.shape[output_dim] + shard_offset = loaded_weight.shape[output_dim] * loaded_shard_id + + param_data = param_data.narrow(output_dim, shard_offset, shard_size) + start_idx = self.tp_rank * shard_size + if not is_sharded_weight: + loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + # Special case for per-tensor scales in fused case. + elif needs_scalar_to_array: + param_data, loaded_weight = adjust_scalar_to_fused_array( + param_data, loaded_weight, loaded_shard_id + ) + + else: + ignore_warning = getattr(param, "ignore_warning", False) + if not ignore_warning: + logger.warning( + "Loading a weight without `output_dim` attribute in " + "MergedColumnParallelLinear, assume the weight is " + "the same for all partitions." + ) + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + def _load_fused_module_from_checkpoint( + self, param: BasevLLMParameter, loaded_weight: torch.Tensor + ): + """ + Handle special case for models where MLP layers are already + fused on disk. In this case, we have no shard id. This function + determines the shard id by splitting these layers and then calls + the weight loader using the shard id. 
+ + An example of a model with these fused layers: + https://huggingface.co/microsoft/Phi-3-mini-4k-instruct + """ + + current_shard_offset = 0 + shard_offsets: list[tuple[int, int, int]] = [] + for i, output_size in enumerate(self.output_sizes): + shard_offsets.append((i, current_shard_offset, output_size)) + current_shard_offset += output_size + + for shard_id, shard_offset, shard_size in shard_offsets: + # Special case for Quantization. + # If quantized, we need to adjust the offset and size to account + # for the packing. + if ( + isinstance(param, (PackedColumnParameter, PackedvLLMParameter)) + and param.packed_dim == param.output_dim + ): + shard_size, shard_offset = param.adjust_shard_indexes_for_packing( + shard_size=shard_size, shard_offset=shard_offset + ) + + loaded_weight_shard = loaded_weight.narrow( + param.output_dim, shard_offset, shard_size + ) + self.weight_loader_v2(param, loaded_weight_shard, shard_id) + + def weight_loader_v2( + self, + param: BasevLLMParameter, + loaded_weight: torch.Tensor, + loaded_shard_id: int | None = None, + ): + if loaded_shard_id is None: + if isinstance(param, PerTensorScaleParameter): + param.load_merged_column_weight(loaded_weight=loaded_weight, shard_id=0) + return + elif type(param) in (RowvLLMParameter, BasevLLMParameter): + param.load_merged_column_weight(loaded_weight=loaded_weight) + return + # TODO: @dsikka - move to parameter.py + self._load_fused_module_from_checkpoint(param, loaded_weight) + return + + assert loaded_shard_id < len(self.output_sizes) + + if isinstance(param, BlockQuantScaleParameter): + assert self.quant_method is not None + # Assume the weight block size has been set by quant method + assert hasattr(self, "weight_block_size") + weight_block_size = self.weight_block_size + assert weight_block_size is not None + block_n, _ = weight_block_size[0], weight_block_size[1] + shard_offset = ( + (sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) // block_n + ) // self.tp_size + shard_size = ( 
+ (self.output_sizes[loaded_shard_id] + block_n - 1) + // block_n + // self.tp_size + ) + else: + shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size + shard_size = self.output_sizes[loaded_shard_id] // self.tp_size + + param.load_merged_column_weight( + loaded_weight=loaded_weight, + shard_id=loaded_shard_id, + shard_offset=shard_offset, + shard_size=shard_size, + tp_rank=self.tp_rank, + ) + + +class QKVParallelLinear(ColumnParallelLinear): + """Linear layers for the attention's QKV transformation. + + Linear layers for the linear transformation of the query, key, and value + vectors in the attention layer. The weight matrix is concatenated along + the output dimension. The layer is parallelized along the head dimension. + When the number of key/value heads is smaller than the number of query + heads (e.g., multi-query/grouped-query attention), the key/value head may + be replicated while the query heads are partitioned. + + Args: + hidden_size: input hidden state size of the transformer. + head_size: size of each attention head. + total_num_heads: total number of attention query heads. + total_num_kv_heads: total number of attention key/value heads. If + None, assume total_num_kv_heads = total_num_heads. + bias: If true, add bias. + skip_bias_add: This was added to enable performance optimizations where + bias can be fused with other element-wise operations. we + skip adding bias but instead return it. + params_dtype: Data type for the parameters. + quant_config: Quantization configure. + prefix: The name of the layer in the state dict, including all parents + (e.g. model.layers.0.qkv_proj) + return_bias: If true, return bias together with outputs in forward pass. + disable_tp: If true, weights matrix won't be sharded through tp rank. 
+ """ + + def __init__( + self, + hidden_size: int, + head_size: int, + total_num_heads: int, + total_num_kv_heads: int | None = None, + bias: bool = True, + skip_bias_add: bool = False, + params_dtype: torch.dtype | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + *, + return_bias: bool = True, + disable_tp: bool = False, + ): + self.hidden_size = hidden_size + self.head_size = head_size + self.total_num_heads = total_num_heads + if total_num_kv_heads is None: + total_num_kv_heads = total_num_heads + self.total_num_kv_heads = total_num_kv_heads + # Divide the weight matrix along the last dimension. + tp_size = get_tensor_model_parallel_world_size() if not disable_tp else 1 + self.num_heads = divide(self.total_num_heads, tp_size) + if tp_size >= self.total_num_kv_heads: + self.num_kv_heads = 1 + self.num_kv_head_replicas = divide(tp_size, self.total_num_kv_heads) + else: + self.num_kv_heads = divide(self.total_num_kv_heads, tp_size) + self.num_kv_head_replicas = 1 + input_size = self.hidden_size + output_size = ( + (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size + ) + self.output_sizes = [ + self.num_heads * self.head_size * tp_size, # q_proj + self.num_kv_heads * self.head_size * tp_size, # k_proj + self.num_kv_heads * self.head_size * tp_size, # v_proj + ] + + super().__init__( + input_size=input_size, + output_size=output_size, + bias=bias, + gather_output=False, + skip_bias_add=skip_bias_add, + params_dtype=params_dtype, + quant_config=quant_config, + prefix=prefix, + return_bias=return_bias, + disable_tp=disable_tp, + ) + + def _get_shard_offset_mapping(self, loaded_shard_id: str): + shard_offset_mapping = { + "q": 0, + "k": self.num_heads * self.head_size, + "v": (self.num_heads + self.num_kv_heads) * self.head_size, + "total": (self.num_heads + 2 * self.num_kv_heads) * self.head_size, + } + return shard_offset_mapping.get(loaded_shard_id) + + def _get_shard_size_mapping(self, loaded_shard_id: str): + 
shard_size_mapping = { + "q": self.num_heads * self.head_size, + "k": self.num_kv_heads * self.head_size, + "v": self.num_kv_heads * self.head_size, + } + return shard_size_mapping.get(loaded_shard_id) + + def _load_fused_module_from_checkpoint( + self, param: BasevLLMParameter, loaded_weight: torch.Tensor + ): + """ + Handle special case for models where QKV layers are already + fused on disk. In this case, we have no shard id. This function + determines the shard id by splitting these layers and then calls + the weight loader using the shard id. + + An example of a model with these fused layers: + https://huggingface.co/microsoft/Phi-3-mini-4k-instruct + """ + shard_offsets = [ + # (shard_id, shard_offset, shard_size) + ("q", 0, self.total_num_heads * self.head_size), + ( + "k", + self.total_num_heads * self.head_size, + self.total_num_kv_heads * self.head_size, + ), + ( + "v", + (self.total_num_heads + self.total_num_kv_heads) * self.head_size, + self.total_num_kv_heads * self.head_size, + ), + ] + + for shard_id, shard_offset, shard_size in shard_offsets: + # Special case for Quantization. + # If quantized, we need to adjust the offset and size to account + # for the packing. 
+ if ( + isinstance(param, (PackedColumnParameter, PackedvLLMParameter)) + and param.packed_dim == param.output_dim + ): + shard_size, shard_offset = param.adjust_shard_indexes_for_packing( + shard_size=shard_size, shard_offset=shard_offset + ) + + loaded_weight_shard = loaded_weight.narrow( + param.output_dim, shard_offset, shard_size + ) + self.weight_loader_v2(param, loaded_weight_shard, shard_id) + + def weight_loader_v2( + self, + param: BasevLLMParameter, + loaded_weight: torch.Tensor, + loaded_shard_id: str | None = None, + ): + if self.opt_flag: + loaded_weight, scale = weight_quant(loaded_weight) + params_dict = dict(self.named_parameters()) + scale_param = params_dict["weight_scale"] + if loaded_shard_id is None: # special case for certain models + if isinstance(param, PerTensorScaleParameter): + param.load_qkv_weight( + loaded_weight=loaded_weight, shard_id=0, tp_rank=self.tp_rank + ) + if self.opt_flag: + scale_param.load_qkv_weight( + loaded_weight=loaded_weight, shard_id=0, tp_rank=self.tp_rank + ) + return + elif type(param) in (RowvLLMParameter, BasevLLMParameter): + param.load_qkv_weight(loaded_weight=loaded_weight, tp_rank=self.tp_rank) + if self.opt_flag: + scale_param.load_qkv_weight(loaded_weight=scale, tp_rank=self.tp_rank) + return + + # TODO: @dsikka - move to parameter.py + self._load_fused_module_from_checkpoint(param, loaded_weight) + if self.opt_flag: + self._load_fused_module_from_checkpoint(scale_param, scale) + return + + assert loaded_shard_id in ["q", "k", "v"] + + shard_offset = self._get_shard_offset_mapping(loaded_shard_id) + shard_size = self._get_shard_size_mapping(loaded_shard_id) + + # Note(simon): This is needed for Qwen3's fp8 quantization. 
+ if isinstance(param, BlockQuantScaleParameter): + assert self.quant_method is not None + # Assume the weight block size has been set by quant method + assert hasattr(self, "weight_block_size") + weight_block_size = self.weight_block_size + assert weight_block_size is not None + block_n, _ = weight_block_size[0], weight_block_size[1] + shard_offset = (shard_offset + block_n - 1) // block_n + shard_size = (shard_size + block_n - 1) // block_n + + param.load_qkv_weight( + loaded_weight=loaded_weight, + num_heads=self.num_kv_head_replicas, + shard_id=loaded_shard_id, + shard_offset=shard_offset, + shard_size=shard_size, + tp_rank=self.tp_rank, + ) + + if self.opt_flag: + scale_param.load_qkv_weight(loaded_weight=scale, + num_heads=self.num_kv_head_replicas, + shard_id=loaded_shard_id, + shard_offset=shard_offset, + shard_size=shard_size, + tp_rank=self.tp_rank) + + def weight_loader( + self, + param: Parameter, + loaded_weight: torch.Tensor, + loaded_shard_id: str | None = None, + ): + # Special case for GGUF + # initialize GGUF param after we know the quantize type + is_gguf_weight = getattr(param, "is_gguf_weight", False) + is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) + if is_gguf_weight_type: + idx_map = {"q": 0, "k": 1, "v": 2} + if loaded_shard_id is not None: + param.data[idx_map[loaded_shard_id]].copy_(loaded_weight) + param.shard_weight_type[loaded_shard_id] = loaded_weight.item() + else: + param.shard_weight_type = {k: loaded_weight.item() for k in idx_map} + return + + if is_gguf_weight: + output_dim = getattr(param, "output_dim", None) + shard_size = loaded_weight.size(output_dim) // self.tp_size + start_idx = self.tp_rank * shard_size + + if loaded_shard_id is not None: + loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + param.shard_id.append(loaded_shard_id) + param.shard_id_map[loaded_shard_id] = len(param.data_container) + param.data_container.append(loaded_weight) + return + + param_data = param.data + 
output_dim = getattr(param, "output_dim", None) + + # Special case for per-tensor scales in fused case. + needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False) + + if loaded_shard_id is None: + # Loaded weight is already fused on disk (qkv). + # (e.g., Phi-3's qkv_proj). + if output_dim is None: + if needs_scalar_to_array: + param_data, loaded_weight = adjust_scalar_to_fused_array( + param_data, loaded_weight, 0 + ) + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + return + shard_offsets = [ + # (shard_id, shard_offset, shard_size) + ("q", 0, self.total_num_heads * self.head_size), + ( + "k", + self.total_num_heads * self.head_size, + self.total_num_kv_heads * self.head_size, + ), + ( + "v", + (self.total_num_heads + self.total_num_kv_heads) * self.head_size, + self.total_num_kv_heads * self.head_size, + ), + ] + use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + + packed_dim = getattr(param, "packed_dim", None) + for shard_id, shard_offset, shard_size in shard_offsets: + # Special case for Quantized Weights. + # If quantized, we need to adjust the offset and size to account + # for the packing. + if packed_dim == output_dim: + shard_size = shard_size // param.packed_factor + shard_offset = shard_offset // param.packed_factor + + # Special case for Marlin. 
+ shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset + ) + + if use_bitsandbytes_4bit: + orig_qkv_offsets = { + "q": (0, self.total_num_heads * self.head_size), + "k": ( + self.total_num_heads * self.head_size, + self.total_num_kv_heads * self.head_size, + ), + "v": ( + (self.total_num_heads + self.total_num_kv_heads) + * self.head_size, + self.total_num_kv_heads * self.head_size, + ), + "total": ( + (self.total_num_heads + 2 * self.total_num_kv_heads) + * self.head_size, + 0, + ), + } + + shard_size, shard_offset = adjust_bitsandbytes_4bit_shard( + param, orig_qkv_offsets, shard_id + ) + + loaded_weight_shard = loaded_weight.narrow( + output_dim, shard_offset, shard_size + ) + self.weight_loader(param, loaded_weight_shard, shard_id) + return + + assert loaded_shard_id in ["q", "k", "v"] + + # If output dim is defined, use the default loading process. + if output_dim is not None: + if loaded_shard_id == "q": + shard_offset = 0 + shard_size = self.num_heads * self.head_size + elif loaded_shard_id == "k": + shard_offset = self.num_heads * self.head_size + shard_size = self.num_kv_heads * self.head_size + elif loaded_shard_id == "v": + shard_offset = (self.num_heads + self.num_kv_heads) * self.head_size + shard_size = self.num_kv_heads * self.head_size + # Special case for Quantized Weights. + # If quantized, we need to adjust the offset and size to account + # for the packing. + packed_dim = getattr(param, "packed_dim", None) + if packed_dim == output_dim: + shard_size = shard_size // param.packed_factor + shard_offset = shard_offset // param.packed_factor + + # Special case for Marlin. 
+ shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset + ) + + use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit + + if use_bitsandbytes_4bit: + orig_qkv_offsets = { + "q": (0, self.num_heads * self.head_size), + "k": ( + self.num_heads * self.head_size, + self.num_kv_heads * self.head_size, + ), + "v": ( + (self.num_heads + self.num_kv_heads) * self.head_size, + self.num_kv_heads * self.head_size, + ), + "total": ( + (self.num_heads + 2 * self.num_kv_heads) * self.head_size, + 0, + ), + } + shard_size, shard_offset = adjust_bitsandbytes_4bit_shard( + param, orig_qkv_offsets, loaded_shard_id + ) + + param_data = param_data.narrow(output_dim, shard_offset, shard_size) + if loaded_shard_id == "q": + shard_rank = self.tp_rank + else: + shard_rank = self.tp_rank // self.num_kv_head_replicas + start_idx = shard_rank * shard_size + + if not is_sharded_weight: + loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + + # Special case for per-tensor scales in fused case. + elif needs_scalar_to_array: + param_data, loaded_weight = adjust_scalar_to_fused_array( + param_data, loaded_weight, loaded_shard_id + ) + else: + ignore_warning = getattr(param, "ignore_warning", False) + if not ignore_warning: + logger.warning( + "Loading a weight without `output_dim` attribute in " + "QKVParallelLinear, assume the weight is the same " + "for all partitions." + ) + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + +@CustomOp.register("row_parallel_linear") +class RowParallelLinear(LinearBase): + """Linear layer with row parallelism. + + The linear layer is defined as Y = XA + b. 
A is parallelized along + its first dimension and X along its second dimension as: + - - + | A_1 | + | . | + A = | . | X = [X_1, ..., X_p] + | . | + | A_p | + - - + Arguments: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. + bias: If true, add bias. Note that bias is not parallelized. + input_is_parallel: If true, we assume that the input is already + split across the GPUs and we do not split + again. + skip_bias_add: This was added to enable performance optimization where + bias can be fused with other element-wise operations. + We skip adding bias but instead return it. + params_dtype: Data type for the parameters. + reduce_results: If true, call all-reduce on output and make Y available + to all GPUs, otherwise, every GPU will have its output + which is Y = X_iA_i + quant_config: Quantization configure. + prefix: The name of the layer in the state dict, including all parents + (e.g. model.layers.0.down_proj) + return_bias: If true, return bias together with outputs in forward pass. + disable_tp: If true, weights matrix won't be sharded through tp rank. + """ + + def __init__( + self, + input_size: int, + output_size: int, + bias: bool = True, + input_is_parallel: bool = True, + skip_bias_add: bool = False, + params_dtype: torch.dtype | None = None, + reduce_results: bool = True, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + *, + return_bias: bool = True, + disable_tp: bool = False, + ): + # Divide the weight matrix along the first dimension. 
+ self.tp_rank = get_tensor_model_parallel_rank() if not disable_tp else 0 + self.tp_size = get_tensor_model_parallel_world_size() if not disable_tp else 1 + self.input_size_per_partition = divide(input_size, self.tp_size) + self.output_size_per_partition = output_size + self.output_partition_sizes = [output_size] + + super().__init__( + input_size, + output_size, + skip_bias_add, + params_dtype, + quant_config, + prefix, + return_bias=return_bias, + disable_tp=disable_tp, + ) + + self.input_is_parallel = input_is_parallel + self.reduce_results = reduce_results + + assert self.quant_method is not None + self.quant_method.create_weights( + layer=self, + input_size_per_partition=self.input_size_per_partition, + output_partition_sizes=self.output_partition_sizes, + input_size=self.input_size, + output_size=self.output_size, + params_dtype=self.params_dtype, + weight_loader=( + self.weight_loader_v2 + if self.quant_method.__class__.__name__ in WEIGHT_LOADER_V2_SUPPORTED + else self.weight_loader + ), + ) + if not reduce_results and (bias and not skip_bias_add): + raise ValueError( + "When not reduce the results, adding bias to the " + "results can lead to incorrect results" + ) + + if bias: + self.bias = Parameter(torch.empty(self.output_size, dtype=params_dtype)) + set_weight_attrs( + self.bias, + { + "output_dim": 0, + "weight_loader": self.weight_loader, + }, + ) + else: + self.register_parameter("bias", None) + self.update_param_tp_status() + + def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): + input_dim = getattr(param, "input_dim", None) + use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit + + # Special case for GGUF + is_gguf_weight = getattr(param, "is_gguf_weight", False) + is_gguf_weight_type = getattr(param, 
"is_gguf_weight_type", False) + if is_gguf_weight_type: + param.weight_type = loaded_weight.item() + + # Materialize GGUF UninitializedParameter + if is_gguf_weight and isinstance(param, UninitializedParameter): + weight_shape = list(loaded_weight.shape) + if input_dim: + weight_shape[input_dim] = weight_shape[input_dim] // self.tp_size + param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype) + + param_data = param.data + if input_dim is not None and not is_sharded_weight: + shard_size = param_data.shape[input_dim] + start_idx = self.tp_rank * shard_size + loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size) + + # Special case for loading scales off disk, which often do not + # have a shape (such as in the case of AutoFP8). + if len(loaded_weight.shape) == 0: + loaded_weight = loaded_weight.reshape(1) + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + def weight_loader_v2(self, param: BasevLLMParameter, loaded_weight: torch.Tensor): + # Special case for loading scales off disk, which often do not + # have a shape (such as in the case of AutoFP8). + if len(loaded_weight.shape) == 0: + assert loaded_weight.numel() == 1 + loaded_weight = loaded_weight.reshape(1) + + if self.opt_flag: + loaded_weight, scale = weight_quant(loaded_weight) + + param.load_row_parallel_weight(loaded_weight=loaded_weight) + if self.opt_flag: + params_dict = dict(self.named_parameters()) + scale_param = params_dict["weight_scale"] + scale_param.load_row_parallel_weight(loaded_weight=scale) + + def forward( + self, + input_, + ) -> torch.Tensor | tuple[torch.Tensor, Parameter | None]: + if self.input_is_parallel: + input_parallel = input_ + else: + splitted_input = split_tensor_along_last_dim( + input_, num_partitions=self.tp_size + ) + input_parallel = splitted_input[self.tp_rank].contiguous() + + # Matrix multiply. 
+ assert self.quant_method is not None + # Only fuse bias add into GEMM for rank 0 (this ensures that + # bias will not get added more than once in TP>1 case) + bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias + output_parallel = self.quant_method.apply(self, input_parallel, bias_) + + if self.reduce_results and self.tp_size > 1: + output = tensor_model_parallel_all_reduce(output_parallel) + else: + output = output_parallel + + output_bias = self.bias if self.skip_bias_add else None + + if not self.return_bias: + return output + return output, output_bias + + def extra_repr(self) -> str: + s = f"in_features={self.input_size_per_partition}" + s += f", output_features={self.output_size}" + s += f", bias={self.bias is not None}" + s += f", tp_size={self.tp_size}" + s += f", reduce_results={self.reduce_results}" + return s diff --git a/model_executor/layers/logits_processor.py b/model_executor/layers/logits_processor.py new file mode 100644 index 0000000..14ae9b3 --- /dev/null +++ b/model_executor/layers/logits_processor.py @@ -0,0 +1,109 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""A layer that compute logits from hidden_stats.""" + +import torch + +from vllm.distributed import ( + tensor_model_parallel_all_gather, + tensor_model_parallel_gather, +) +from vllm.model_executor.custom_op import CustomOp +from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding +from vllm.platforms import current_platform + + +@CustomOp.register("logits_processor") +class LogitsProcessor(CustomOp): + """Process logits and apply logits processors from sampling metadata. + + This layer does the following: + 1. Gather logits from model hidden_states. + 2. Scale logits if needed. + 3. Apply logits processors (if any). 
+ """ + + def __init__( + self, + vocab_size: int, + org_vocab_size: int | None = None, + scale: float = 1.0, + logits_as_input: bool = False, + soft_cap: float | None = None, + ) -> None: + """ + Args: + scale: A scaling factor to apply to the logits. + """ + super().__init__() + self.scale = scale + self.vocab_size = vocab_size + # Whether the input is logits (default is hidden states). + self.logits_as_input = logits_as_input + # original vocabulary size (without LoRA). + self.org_vocab_size = org_vocab_size or vocab_size + # Soft cap the logits. Used in Gemma 2. + self.soft_cap = soft_cap + # Whether to use gather or all-gather to gather the logits. + self.use_all_gather = current_platform.use_all_gather() + + def forward( + self, + lm_head: VocabParallelEmbedding, + hidden_states: torch.Tensor, + embedding_bias: torch.Tensor | None = None, + ) -> torch.Tensor | None: + if self.logits_as_input: + logits = hidden_states + else: + # Get the logits for the next tokens. + if hidden_states.shape[0] > 0: + logits = self._get_logits(hidden_states, lm_head, embedding_bias) + else: + logits = torch.empty([0, lm_head.weight.shape[0]], device=hidden_states.device, dtype=hidden_states.dtype) + if logits is not None: + if self.soft_cap is not None: + logits = logits / self.soft_cap + logits = torch.tanh(logits) + logits = logits * self.soft_cap + + if self.scale != 1.0: + logits *= self.scale + return logits + + def _gather_logits(self, logits: torch.Tensor) -> torch.Tensor: + """gather/all-gather the logits tensor across model parallel group.""" + if self.use_all_gather: + # Gather is not supported for some devices such as TPUs. + # Use all-gather instead. + # NOTE(woosuk): Here, the outputs of every device should not be None + # because XLA requires strict SPMD among all devices. Every device + # should execute the same operations after gathering the logits. 
+ logits = tensor_model_parallel_all_gather(logits) + else: + # None may be returned for rank > 0 + logits = tensor_model_parallel_gather(logits) + return logits + + def _get_logits( + self, + hidden_states: torch.Tensor, + lm_head: VocabParallelEmbedding, + embedding_bias: torch.Tensor | None, + ) -> torch.Tensor | None: + # Get the logits for the next tokens. + logits = lm_head.quant_method.apply(lm_head, hidden_states, bias=embedding_bias) + + # Gather logits for TP + logits = self._gather_logits(logits) + + # Remove paddings in vocab (if any). + if logits is not None: + logits = logits[..., : self.org_vocab_size] + return logits + + def extra_repr(self) -> str: + s = f"vocab_size={self.vocab_size}" + s += f", org_vocab_size={self.org_vocab_size}" + s += f", scale={self.scale}, logits_as_input={self.logits_as_input}" + return s diff --git a/model_executor/layers/mamba/__init__.py b/model_executor/layers/mamba/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/model_executor/layers/mamba/__pycache__/__init__.cpython-312.pyc b/model_executor/layers/mamba/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4bd53083ff9f9f3592781b37a8c02b3ac97f8798 GIT binary patch literal 177 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJV1?ZRL7U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?E|R?rWO_J x=O*SRCF;k=XXa&=#K-FuRNmsS0jey`Nwq6t1zNxe#Kj=SM`lJw#v*1Q3jjaeEmQyi literal 0 HcmV?d00001 diff --git a/model_executor/layers/mamba/__pycache__/abstract.cpython-312.pyc b/model_executor/layers/mamba/__pycache__/abstract.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f412e682e5febd705b3086e0873b693dd38e8478 GIT binary patch literal 3273 zcmaJDOKcm*b#|9r@z1p=OHP1LY4 zDOYW(z)!nVUZ$DhG}X;|xn_>TQ*PcXGz(s_S(FG(GnCD^C9m8pOGGBuiJiSi>>SNK zRK)#Ft(U6h|AR+&fj=@(X`t1R)<<~_GhNIY9a{o+MfLD zrnNzR+Y&a3T}_bPuL0O1lr$xqH07UzqnZj$(e!#@Qe42c+>)OGkWSgk75rxG6wQKk 
zZp=Y$OSaSZQV+!u(#+EWmn#BY-KX#IOPpJ_r;@XoeVL}ld1%eaMBT(^d?)D5PMn#B z093%P89U#f0&7xs;a;kFyqB*P69=K|`uB1=EChASHN#ME1x)AuTX3BXs=MgqP`}-F zthOF@JDq?8rp$@-77IM!J7J>EY3aS7t2V?u=>)b>@ZJRlFGI2BU272;F%|Gn9WyiTLzbQU6$t+ zcKr~m3MPb*CgcM$Dv@k)7eMmV7=ZaJOam!)3^wrwRsk^1aZ8M-*P;EH{TuhGr$A~q zxlD!2*l|>YUF@khV$?5Pry~8#aTm~6MOjazFOMs~I7zbaF;qh*fFj>Wbo8IyH}^2F zhBjhta%-PzC}DFbccgPBVq$X-9T^kOs)LdcJNO6Wi%fxE8|N_r49HSwhAWl~%<;+q zz-#1pkWO3VuCzYpOniU60QAXI32z+>>(E*s_r?jjEp19HAHkRr%a$J|b^03#H1k0; z=85l7{qsHfdt^%o&m#7o5@CMvb(gGTf*o4p4hme%qeE+tE8kT>V)hUl(4#%Ysz=k; zPkdLy$Ek;juWk`rnZ#;oKEU3&P5q08PwzziOCm3CI}x-~m{48UjGRrX+tdnd$6wQ- zANfHvZnf0bdo-#uj9E>MRo+*^Smg;4#<^8Dur`d)>C;&2m}``SM#r>mYBMx7p7p!X z#`bSA_>mB%!1N?lq*PoGy(Pi2?N*X&P}IEXV9Lgol^|LkH*{(*F&416z}xp&Toctk zVcx8ue8))4B{qZG^SB*ta}>bj4r?hkjWR#L?Hq2{NkYtB?8>719K*sLPZR#|e)V8g zA7EZU4WGaUMOK&_7G4?@UfM3ayrWfz+S!42cBow#Xcvas)q!^PufH5^=}<*lJM8)po|H$7??an3LxUR@XAf?>P^^DW^_8HOELKu+_S2A_ag?O1&a{0mVm z8;0pa{y|a(erOmBTM9!D$BN@eJWz3_!-5WFQ7=~DM^1z+g_*%{jKpc4oFbf97Ythy zYv9IPBJcKC$PE9j_{K-bXMp{d{8v`e8JI!knLjLT-Td(8{U2`6oF0_Ue4aYN^=`^WEh|NQQs-hH&VJ%4%d;w$^=xc-rPJe~WfIU?}+ z%Z<-Y4X&&_L0F75d|&NYteC47cVt|)0@tM$cWxc2+_Nh94oaV4uVM_y3B!#cY$;V{npXBPl X6zy^9wI}M!Qf^y2`7MDj_tyUbKkGS) literal 0 HcmV?d00001 diff --git a/model_executor/layers/mamba/__pycache__/linear_attn.cpython-312.pyc b/model_executor/layers/mamba/__pycache__/linear_attn.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8c5a5a8d5b8c8a4beec86d3b9672295c58762541 GIT binary patch literal 20236 zcmcJ1YjhjedEgA*?-z*&DUbw3N+d{%6sh-vk|t!IkAmc!9tuN1qu&p0NN4@ zY?Rd9fVHzCn{-KS+KOtkYpT|+={9MYb9y%NzLK6jXM+*61u}ZuqxI5&IdqN(PH^sbh zU&t442sMzrIo258652x2mY6@@6l#h$hnh*=8f%FMLV6T)q3tB^h;_s}L!Bh;j0NLep)Qhk#k%7?p`K6Af{J>UV(Q+cn0j{0yJ~SdLpw;R z8%jOz!hB)~^^!6#l=)cw)P#2Z7Vx$J-rXvS?P3D#E~asUW_RChl}18S*XZEZpJC@mgS23M>D3obhEF_q z^2F&=qt8NtVIy^SbaezAfdMp+Xj}a)q6oA*Il3X~B;NhtV7m39n z0dbgPndvb$s1lm7xiTDd>$1?kdHownF2;mY(HX>JdnU=f5#bo8N2Hk%{IO^PrXPNy<(?6wTxr6TU@3= 
z$7zcCDo`Po3egM|Qo)~^u`)EPVN|S^Q8SuxU5*;38SR_akdDM9*_oTq|z!7~y~L`Nc5UxJ;sv;W1B^P@>F{?ZGA`BIc&*aX?A zf|{L52|A8VPjiXUu?h(z^J0J)^N)cZQh!JE`UxYzu<;3fgjz8`YoC;^ zKA}`$3xzEbL`DTT(%~?N5P~5+6(%Ht!{&k>vPe-l z?6hFSuQkrXrz2?L8yveNXp#vwC8(o`w4i%~Mam#p#b-E{j4%M`cs;^JBZ)CK42u|z z0XfhJniLxw=b8W|*G%w+!(*{XDisc=kiP^1f$P6I-mcBCU5kfj}0r}rJ$ ze|aC<7mHr(o4S_1luYy?fT3xZpdI)4Xk6^X@5l_d&|C=J74W=40R5 z^UlHB2k(xpbl=nb!1{re9~de0juzU^7CbKinE7FQXR&=R-@Z4exnZ6)KQOeG0&O|X z14C2EV0#R2q694x`nX^vv&MwQIUK8SW`v+{E0V4LSCD{>UfC|y4W3n{<;^T@@T@vR zPgZy~B9Q_^<2RX6&A=P7nlyZyV#ug7ni+UgM$M>Z5Q5|+2_dV6t%jf6hA=5_&}s#2 zbuGR_+Rngdsi1n4!y)|)`ySMh9VhNHZ3<~P?8#Vkj1@GW}cFrj5oD&ckm+Mt;mf`_028q|Ra{WyHf>0JYYdMf#bpoz1Qgt&k4 zV-d7m>I#?UdLYepK@`+*J1~6+D@{o*IuRzg1%pICCDJJFZlK#0Ic;-#RsnbhP^NA| zbe$?2D35pH<@uL$rjo^aV|aG>8}5as`KE%UB}bR)>KFFR?^$$yegDGX{NU1yEA@Z< zx$j^2-i5irLfz?{p=7q-7@Qqk?7wgJuX*>a8VcT_d&55%`Cx?ip36N42*06SFwdJ8 zpIb^7I}Y>k-+#E^800O3fZS@Et6$hMzhyRDv;=rdV5zZS>49>)BWDo53 zu)nD!mkvm84S>9x0Pzqc#@-;!odP65b#+m==b<2lJ&_0utgt94&opmBWGeONu_aP1 zSO62{@a63-dBY3&))xv8+gnQZx*OTq?BclsmbZ|}2+|W@`@i2=`sBZ_9N}7_WzdO? 
zIswc#92QJrR1T(Nn6`w&S76F2H5z=f0F=2rK8d1B|#& zbuSTpgOE!yYT)hDQ!$o1gJ`T3);z|7{t>=IbG=ZS!U;bnQdvJD!U(r=>c2ts%QvZ# zx$XM!BaK08Upi5y;IYh-$7=fLm@C&)I#*d`(%MUQ=Z(y4rmTkaL$7}!GoLAIAzju{ zCfmn)NIh0duw|cF>wIW-+&DaYxU7XV0@-i)XZ?@$me5dAfil8PWVA!4+0xn>i1?Iz{AP0IDGjxoR=O@wx%GSs*O)B&yZ-Gua@ z_F6!TsujOQ73eg}CD_=gKwlB)%L4tnV7&Z#cnme!6tLY`a*73!FwF|4i?QU`Wl=&m z5~(X3Wv+59F!C{3pnwdr9;A7X!M=cButw>7O(T^hK}lSZiLwU1_FCMPHqxjV`f?|b zeM82;?E*+8rdK8K$ZVeuL1$FI1_C5&r4@e-7<1N`HD%3NOU9D5X4I3i{9WlXJQ#Jx zl(A&2k||OqiJfu=HcrBO3P(PPxO1}UV{I;rXvE=)4qQfCa2R=)9Dn7KEQ3(1wwU&hDlJ_D#}fT%Y>&mc077@ zX4j^lbnm8(u&=3qf(SU2hw=m#F3ev5rDpK4-LwMlWJSwhi)g!eqH5r0!5+t3E7!MT&3> zq~3xEm?7{c2i3amk=>^~MwcoKOpO_tQ(*?iW^oecrYBhgRo}31c>ZwNNU|o%-MDaO z{!H0SvKGqiD_cp@M%8;4>*o)a$qUM^3VZvw4r>Yje5Asp*iY=@I-Z7lIEKf+1h!sl z)MC|W&>B}Usv0wt7Rq#t8dW=}(SU(T4LLR0GVlUIY z3AUC^$^)B}RtnU1HuxVHunaUkivPCc}@in|$KK33Z; zlp{#DP*ejq5e@DkDo$r<;998ARZI|I@%4z}jyy!kzONFtqg|=@_5NP48;Qf{y(k+0 zQ@zyzCb+Bcil9o=5ip&?msiXWf&*>oXm|=&+Rp5F>Nq!o|4T$@hUhx=q2b&^`?kDc z8yWMu;o|toa7C*Sh$B$2F$K z(FqpOkOvGG-6a+xaaPP9h#I~YxJ`GK4cb*+5x~Abw z&#zsU8tC6eY%fD}odUhBXb5?^jXVHF=x1TIo97RhLZ)qx8+IUM_ zO}3A>^gY(99i}p+Hk!x^_pV!F-0@_I6OgDi{3%T}EC6na4FO7(msP=-cbAsTIz)2@ zn};zm9S>h(BTS0egHgm1446j&CaC35oZED9h#19v1wc5QJMK-4p0Xwc{x@C|0*!1o z1u`eBNp-$GjSGqS#4__<>`v_7OFy{q!3B^baFSJ(NggEzXu(-=G(Rb{6{y zTQYJrJe5pE(JCMco(?h@qH}^MJzNGylqKMxii1B-+`*#QClN$hKN57JoHkbLM!;nt zOZ*40cpKapj*V`Nj2X=N@>iD8)jBcI-W3KMVn91t7`G1zpfFHjc^i;AS(PF#0(mo} zWd%YK;Gw0g*rsLiy{1;S{=rtO9hvlQWh)%?MpW5S5{3HB+rkV$uVl3f-YfU8@=|y} zsob+_DY0-BZDmz0V}M@BHp+|8*sR#x22IIlK|sX=u3u{`=Q@} zLtg(hTIQdEgE8DSN^~CjkJ2CsX*8IUFu6}kRiDrwg_V?)Pf&36jOjA03}*(V)?8T+ zv_O%N@Ia{vz7c?$0X%3?#wUn|P`?ox-nGBdu;u}!zM5<$38kCv2PJrTKeXJuRU^J-5@tUGa@^y-{rB6!b4453gN&T^`aH9!T zxn0J37#e}!#wz-4xCmsQnKAZW0quK=OHKlF&|Hj0Qrz!D1@|^YGnQUac@0khYX(rr z5zvW8mwuZ2&M!YY0=88g7}4e@xc?f;n5gp68P71-9HV2#^L8E7^LwLTcMyyfcu)`u%<42aXN}eEr`WTJggxa1z6G$ z-bt!B-0XsDEE%7Q!qEm`G!+IScto?UWayU;WC*6uM`F|LNsdc$L8mBvq7EmHS)LOv 
zhh<;HXci-xO^J(zlgYh>5iXn9v44xHUWigi!q%A`Aah+b_~KH`YJa z@uWTpP)3{NOMTd|75t*MlDh>SrqY(}=nVChDy6;vlv+zIomlEGwGQC3dChyM;2ne- zOV$7S5Jj*y-rlyvyc4?}yO#GYyzekU>+|2T z-Lx&&zt?o9iT8m^7GBlRbgSuRQ>Cl{%WNe-d3}TY^!TCJ{E@GxWbqR5Z7Xltx^#}W zY%e)IrG{-KkALo!#aHun9Z$@<1{30aVx#jFqWK*HDow4qBzi*25Q z)ZJZ$bksl5s@?cHBls>oU`RR1yHDO@evtSeQE-0_j5p>ji&yTO1EuE9<>&b3J-Okp zoqgEc2}YZ_Q~B2Zl_9=$4`08RckErQ=N*HO)T$O+Zusr9V2*%kvEQ`AyKbDBJ+rv` z*5FNW=5#I(qq%&wpYJ(bad&r*l+~&R(?6Q(XS?3&DQhXC6$Wphn)ei%_7tZ z7F+uHmi|J^uKV^~dBd)}eb)w$^aDeO?2L9Ms)x8ls;I1aAbJ0)YJtkB;7Akwrs4Nh z)#C?BT8N681&tHCQv)8gQ=V5SuCNm>A=% z!{HM7Bd88yn7W}JQ%z7~steEt9rvI}sIet>eO~ppMFVwN^?8VCl6r$!UR#nUx6Nu`j=E zKOfkiZ$1DP={ar5*EFYH8`!r>f4j5bJ@kI(^0`Irl6L9JZ9~D=xzfpd59J+){@;D? zA!6KGu@9*<9GM~9KyWbNTkzoi9YlgDnu60>X?6lnV1a`d1ORcaxIf2g*H|o>VpAY+ z$o%&d%wjKsfe%n62bzi}=f7Q8Kzd8gw%5x-t^yPcb{K=R3p_lWXS01Tp_p5YSt#-Ny zzW@Pq2>K$ri!>D{cSZg>ffWoy;M!k;hoB=eSc=2RNFh531S*d0MaazT{O3;3M~LSb zL@*UkG*+YQNi*g0F8JsDKel-C-p+z0_-*%ho8E5%YlI6prgOF;`jr7cFnH5&ED$8}F-*`0)#%JQ!)rsKlqBV6x07FA%uVtMX{iLj_SHUt)B@eMQhtiI~ z-SYNc4W+yr7!^4X)(L&sQpGH#MD9WS+B(UropnKPP`lX>fIYgTwFS$*J~fzbkw7K` zQmXm{czUv}nwL`Gqpk>s;|S3%l~!UwwihLUq?5_4R3U znAK>VkeBnQdQ?So=YNn+Yvy2M2%?E`;77V>0-B~pdaGsQQ|W8mpJ1bJV}ul0(1B$XRQzC_ zsIR8^f z+WbRjVe9Fl|4i;o$-d=}QcHuywqtzTu|mtxefyB$aFfz}+rC1}{`>a*Sb#@Yck}Mu zxo1dnfOik9>|H&>?|P1RpMt`=K(THoU$=9`v)aS=Kl4K;-~T)hXZ?mtaIkgn${D`< z81EU%JzH|R=JqbO&ovjFLEafeuYJMUS9Biaod*lf!@1+10YIR0*}KxV?B@e}^OmNM zeEmh=Y2J6bN1@6Fz_ooYBkKc)-~w%jRjep~BqjLeQK z28)fod}D8+ap&J!`pbP((+FMONm&|-<}TjcwS4ZrxwqsG6#c!tzZZ4k{<+Aa4<>K5 z>9JbXWXp}f>{n2=E?#(G>Hdk`y#`0g4$U80qL=#L*?W8M^432)xcH@&E8o5P{?&U| ze(=TzZ#<|wo41_($mv0bzc{|!`oP&!bnfGw`wGqjIUSrxP-5(O2npHs9H>;E4XW*drj zVaJ>9`@ZiQULb@lItE4f-q4A#3zo1cxW^wnh{sWBuGe$qe z=)XXe!rsaBLNuKHH5AQsJzb!zrKnqBH>6N51tr5`&D;9stVKs3@94{0b`~uMc?+C& zDOd)JmgjiOa|O%uaIogH1#@lBcN|}CDh7}9!Q*+?iJX4TxkEad^U7lSR`zCgd18fw z55{H6>1qX6K_Z`olBc)?3_~zVp0w1M2J%us>5y#yDM-}b;8C(HC|M8`?FK{>1vvwB z(TzbmZ&x+kXhp82PQpDS4Lbihv{?|#sB|#9fCV@V?g2!SDkU-ia#VK1Rf7$O3wkT3 
zwBUXwmFfZvpdP?kCfg0;tnr3z)>brh@L-s+G?Xl^lEsZSx6`wye{R%Pyp`)_497%f z?C;=%uk{GaAG-!lSt^(x-~xyC%Of@Nzcww)=8}X`am8(N)dK~?tacp+)JsU%h(XEq zmeDG?2T{Jy>JoO4wG>=Y;$@XAI64(iHWG5#C~G?89Hbuf9Q~y1?5cDS9%)(jRdU%# zvS8L&1J_t>Eu|*?QmXo#l;zKAS;i=1G-XVat<@SS1-;yK#4z5i5?uAksK(_>6j@W% z8LKF}ngEA_^g02~J+#n(WGY|ccnx+Go>v8JNc-jg0PQ-f@03!ot8njb2)HLIYh;7< z?b%Qb*{nT{$Kyxxn}Z>~Nw=BMl#b=M@XPeYF zzfw^;wO5GNX%w|?RKU%ZF5ox?n|N^|orGJjaBE&LSERmVB6f{PkAeoB=P5Li$TH-o z6QWd!v-0cQv6Xq)i?W9f$UKofwt#Nbm$R3;4;H(h%P;?`}gKXNbQ3i!w>3C<}D}lbtfM>nu?B{ykqCeU~$)rJp6aO z_()Azo8g*GYi}{Iix2DqA0tuf9y-Zwv&C$F$I*Q6(R|CXf^%qAms97?l^k^ox_R9< zrWdmF*@9zRPP+zLlyBZwbhh)(_KM!Qt>D}VYLwH9H^~;qZbffKmua}bGyLAjosrc6 zbR;^Dg2oIM*Y5f5#qp)|JK5XWrQ|BjZ$GfgJ#ZbL)r&{b#j%#WTkw5L;o`OvJpA_# z-#f>9pM}e8Z99u?C;2uQWZL2YCMhsb;m6_Z@=j zCePeSsBYc9tiNraJN3}Ft?29JeZBdGEN7I2{#8+P*zyG!-%xhspUi?8H64y|4Q)u7Qe zuY>M3HWwSZ`G#(V`Z-h`T%Lu7`G%sijd!*!JyURY!#cY>Mb}o|1=FAJKan3ek#8F= zxK6?nYI2b^=q)r@-2L^IrM+O25Vr-E3>LTa^02#|eL3C3&Yt(IcdRSph0a0Vuq~&{ z+go9+@`kOqheT(AZgjLlVNqTd%r)nGfvk++=RS(jS*$g{DG<2tnZ72dQ)!NS9m_Q2 z3Mfht+)Ee{dP(Tlb<74a`d1jCO30z;&iOEE!{|kf&SP{7BlJ1cki9dQC1me&nEE_K z@WA8$Hd0X5skPge+VdR;_^k&YQ;;c} zbXwDrdujTe-?{xeeCw{q6y(bewOF`(ex+?CwS0kxbv&Q%IL5aQVJlPFPxb69b@!D9 z_QJi8QlPcuX?xUkhSmS-^ z40bW?ptdxZonqQWdABYN^PbLfomfyW7r4a&59JLm@8&&yWv^J^!`FdH2s%`5L{wYE zw4ZX;m&J~jo8fxl&s)TDV7Yi3Q4Nyf`9|Z_C@+eC>@#wgjQ0=z(qYSdT~0G$tli^a4nyL!{5UoKYTC!iZeI~!mTcT^LpU;m>Zz49;p zaHwqyK_wXSwcrJhn%_4{h?o9c)v;_qEH}d686n37#dCpAdjq+n&?~Qp8iK)>6i=P{kQ)6&rKVM%K!cO?htzs^$D2$&R%&O@lO9Ss5xP4$b`UH-t z&_~M03_ZRGW$D9cN8Ns{H7*U^= z3h?kP9YtNDSc;`nRGOZp)4Ew5O>s1Df;3%9pEk@IXw28AjI%~~8&bx!Y1Ty2jH#xy zdDcu|Q>r;_nYE;?v(~h2)<)8rQucJqY)jfP>mYG+%9(b}y3+1hH;Ff=Jn7ciRsvg6 zZE5eUm%!GPFYTZ8rvtNrbo*>ONwcLo(!tpvf$gbKIy@Urcg}W_cuR^&M`k1GuGuaU zcci-0J+nRO-q~IfccwO_H_vV+uq)M`wKk2WAIoN=I?6i(gd-r}}0+*J&lD zqYhGR>jxCu#s#_7>jowDdIfcMh~#@A-^b~$>y_l$80+VZi(jiC)x6li6VltCklw+K zutE5R;1}j%^DS)W2Zq^EmU#kx>gUaDHT+QP#tav5^Iu z+j^r{9T%E9qmOR-74|r)lkIQ(&TISQ(=Y9xe&ywvgR*5Nk>=PpR+dv-l$M>7xg3|t 
zC6_W&iMdl;hMj>#T_yuz^J@tnP;(p~)yvlD#N0`4dMPuXT#&6tmRK%TiTD<{TzqbY z=YdT8-Bcp5=f}%Yh~S zy#&ufjXCagPWE8Ac_j^N!$fGXd3q(ATS~vW3~-17;C7r?EVAY3{ z87{%g?MFC%fdi5a@?5Vky|)n^hvP^heIhZH$YP5EByud5$Z`8}XO_9?L~3p&mB=me zwOPkbCTg>woO3R*l1-$r(eIA3@%fD0x-n&CnFX4F>B@2`ZZ4O6my2iT5*bAG)+Axg z@d`4`VipGEJubO$G8a$ea(our8(@<>5Qy{KLNW`J2y=v^6JNsN@sakAYLkD2%O#-V zgltV?FXQRtX^x+(Su#|p5_#AUBI`RKv^Z*(W~o^nOV8??soH;Rgw=Bf)&M^v{7mp` zf}fc)vd!?bz|RUl8)ste@NVInSl!tsieep{xdJ)4=K0_%HBYmy5A3rR*3DTL{k6?e zU|@cGM@vQNLc104s?Uhzk%JSTy_f7?Rz`BzTTV zW}q$zS#B959LOnBwlB+Z3~3wGu*~ty+oQkB9K$uYlwrnVmvr5RPag#D@3Jych7?qo6x1(^hMiaW!w$@zH>#3yw1 z1ebe{<1&?I;c#FNQ$%Djxn!Cnl_xXrCir9`lVgr;RR#@cVQZA$Kx7U@9ZQh=`?a?9 z_t%bVHcIkmWb4T!3!_4$t=y8$rj;R7;!YG*@lzb10jZ7hSav0GP+$*9q6{KQE!Z?D z#Vsp>-M*6Lh>VM$NG7tf8KVTUA;hP%@eIgj*`q@7lZk9ae#tg1kz3Bnj$}5TTH+IN zE|WL`U6O6eMgpzEoyaza+frr6kxejD^U2e5&YCp=a)m;L;Nfl~MSYk4j^`4U)3VRE z3e+D%y2|e~B^U)N4iZjz1RcaG>I6jzW{6c(4~SVI24yRlHi((Dm>yzSGkF99#LQ|e z=TRX+4>_hAw5qTG7Eu{7U!WFSRlMqZ(OZR8sF7S;>`-&7Z&r7`qD(zM?YK=XpHt;O zCD4LlfoAn*q3=S|4MR=q9U68-g`kHl542X-Cn2e!O3H>A>1`6KVj4XbI3T07Jf zipoV*cyUC9s&5zz*ID2#L08l6E!BJ#x?xktv^rz)b~X38Td++nQGKf|Q0b~~7~!Cb zmD{C4z$Y)LaP`gF&wG;e^~xHdsSWsBt9UBJ+6CtZ{94|ig(jHkmYSIktK|eU-;&#- z#f*YQXujc4M!7ZedpbRLvSx1s-~#W zyA<@~h#F&^g5%Gq?`f;?_i2G@TuyLIK%p9*f@5U;vp}hH_`G=YDATlcBiJFPajnn- zF>NJ5f2jC?TH_fi>Y4#{+Cp|yMebXdR&ublDiH%p_+O!n`4uupl;)AQ54;i85kR(Q zPbHVhHV{v+tZXFPx@=D^C0N+C743Dd!d9eG6sIUGAO0l-AhwY-dDLmh>vEcK*3v-q zE^0zrEkRL>nmDee33Nlu<|>8?0j4IJ)ie-hG%7}d9v3xB-_WUxLLjP^Ix%n>qoVp5 z9=mppmi25h9X0V*i18?1Wdl*XjdM%OXX3IkyPQhq;DtwrsDZ~t%x_1urd(xnMF4l* zEcOz0slPimoJ%hcFDI9WatS!pZ?LyK1j^I~M!~5ijB*@+@6wmR zpl}|zyvi_(wQPmO;ejdDc5fvGLzRux$+J@Gq~bJQqj3coVd&;g%6jhfvTU4(@ySK? 
z)rsP#fJ_$ol3~vAlkk=ulj`BDGMIx;9n3oHGf1xsf?uDber5|??)|v;ljuj$^&_Rh zJ>uY=;^1VlYpM{L`WxFcKMiPbyfdx zvPn7e$|i+Xc-&>==9NtLofVEd%kj9w%T38le3@TbxtUP zTs6YuvX|X>=19W%9*zd8vgyThMslv+4k26KN~Bh}{X7r00Fz?;kjJ@wpf@qs;0PTF9$DK(T%<8a_0Y)W*-;4o{GLMp)M*+>hh5;GqjU@S(F=Hq*NE|th;^G%}Jf5+S| zg$Gv+cg!8yJJ^TufYjczYWPPEsWXPrpu~*eJ0wN6;5&TZ+b{VxOa7?j3O{Nx_$+{I zExW1i&7TZ>G*If=Dt2uxb?p$lc9goN#jfewz~u*44If(1TTAA4(cFH=+8Yl&?>+g(O@1<*%(L#L%#48S}1ob;{Q|b z>f)8fLUiY?1u?q67(P&R9YlotX8+~ZtKKW#wc(pD79x{(%u{!3lJ*oL(|64K?!ypb zH+_;TAceagH5&s+-u|GSat0q+DSIn4Kx$>Aj@TDif7wR4y=4d0Y==pDz&snt|6E9q zTB-1OF|@4|dRYv;TmZhXx(aQZOKrPF_;1~P*BvZ$j^CWW`PD*bchSA4VA~^2>;=4% zc~mry-rOXbciazzuLiFK*G?1z(HrBRPF$O~r7I5XD)=X`L-$n1Qs3U-s^O2VMCQr# zY1stF^c6m{VP`s&VRqp4HvZtGe=W=v!W&H{{`LyQp40R7p_$cOczalTZQA z=?+n-oi1vpuPTm~*Wi?@K?)HT&3Z>y@+zI?h%a7RT@Xh zS639Gf_~#XE~4&N_pk;yWdo<-lQ#(_JpIn=!3t(n)W2W-UwC*FrN8=nfTDIqg1-p~ zvIWgp$%U1rl`Prg_!l9yZkOfpGJ}s}@O2DO$&#CMU`)?0FM(mYiJ!;hlbCElLk`ig zvw))7XJmRxrl(~$o6OF^dg0<)P8PYcvMVTROG_L{5b9v=N}pAX-o%g>+nDtk6wSir z4`_U46J>Ljtero#cHRqwu6A7MSUdX3@sEyw|7*p-)|*Gg!1nWpw2VIsuhU|nfA!Ga z_RgyVR|X`TSF*cGc1E-_Ydxi|aj|RsX0EjTkhuL&!3Gl54I;F|a^A7pd|~uLqHG~` z1%GM{N*-Uy(<6F%3Z9XIZ3IG+(k*(r3!dSEZ5YA^DgI!|9~J%4^=PSox7feC*uS^v zpIm)ea<-P7jOb+6T%QC#3VuIabdIb}J|Q&}xqAG{@%3migdGfRFNG$>(B$o|+b4>O zBy>d9M%NMr$L4~0vm&nSs>zMmDrJ*$j+(0#La3Y)(Uk$9)@ap8nQeHE(Zmc8s61f2 zLGjW+?5M&5)m(l*phu00s-<579npZN2OiJnL~psKR~T&t6W)cOqVF;a<1|(7X3n~6 z6p2b&0kKiFfy05>{$ca^<_}xWx0K9XqPa`51tgnWvN>`4gw3kiLPl%Dus}&=SPTGM zQX6}X;?cUau^BCuD&rYY)ZXPb@bAW~5|+gQKSxPs+vAm@C?a$5130!dJc;T-uvHd5 z;A&4EMG>~N?$JbKgS4prCCJ^%NRy4*4b!)D1RRx}IEEICa#G->KtgW8@vn1@&+ zDPx7&HFK6a3eY1Pq$3O0J$W#JETRHl_XxBm@SofQ1FP3s&?2-fcGnSsC>S#hV9cy= z$8mV)9eF3z+4DGVBctM?W}d0y{|2S;j8OM%FaKn%4rD{xt zgf^jdK?e$Dle$L`9%k~9XHq+)(x|?lK&>m^4YePx;;B%+2hwNMSoNLn&DdBv&%iGN zzb^Q7!>-(EFD6Z9@3{CZR{@79v8g zV7y_zZdY>io1x5CRLtsI*!;XaO_cP$d^Fz=lzyj5L4~x{Gmv{*O%pc1Zx#9g|G1h` zeGAcN)hzTs6N75xJ+p{*g}O&=oe&fT*k&Ot*n}py_#q>8!*acH5XcV-n-=Hlib7Nv 
zR97}!!1*rF{67X>O$eQf+~c^cQD7Dq8l|f=R9Uo`d>U4M=!x{Z{=d+VWeh?m){>9q zPN^*sIvS3-jy0;Se&6!jbE?oObP3E2>-CCu@T@xX!%yJe;f5s}NMypjBl*$%mS=Kr zRRX2dQK`P4JwECz=f`qOYB^X1jaynXuEI#ezSLqgUPq$>_6ZMOb&n8wCPiEeLqbd# zgDbs?qyP>ddseyp7_5_b>Z*e%s}sglK7ot3tcoFQQb&a~p6UaY+$d~P_Q;I4Mn>mU z{OTL<^i|y?zZH6l>U`b9j|*EHwz-BAp1R{B-9|h9R;}Gc!^)3mdY**&torLI3FC0> zObh-F>I@yVubcz*-J;q&Uth@(zY20X85(7d9XrCn?Fol|7BP#4b`1O(vRZ5iA1yWt z?j_61sWXJm0@%yhp(BT1Q$01%hFx(a$t@8SKJlL7h7!|AY>6?iflZcVR|;?}IFf)t z99=C*fxb#^EXEw*fB-s-@LYB!g&r@J+KHpu z&@tjoLwxs`KIn6lIRW=Kr?OQVTcJrD4`-uvg^c3Y2dz27{NmdeaG?4$XPy7>PojON z6(5qyz(Q@%5q1Ue(U<{VdEk=+jRCieEO^}z^Lb5e<4~J2P--VDMUvUtfj9w&hfw^7KsF1nT0xkJ9bRS8jh?fz| z&4I@vR0d9w;HU#$M9{T4E{jb&0X{q6{({#eEHnM)zDbRO;wO`3UB zL`f~kd2o-gz|{=P%L;SOfu|t@4o3-`Z>*Gb1xx1=aQy}yCKV8u7p2WwD>k$}I;bFm zi(nS%sZGQ#DHnl2yv8>wbw;C8@jpuPj565DI6w=q=WwXTR&mf(WLA!uPa<Psitzd=_q}!9|E&0q3DOnN2xdMSx3@$MfN;tfoFB zBzrO|=+{bo)+%JOvPYqVcd0vx7ZPLS>&dZC!C>T%RUyngm$I63_VOu zbb`n8zHA3C(M%ki>WJ4TxsZ~bY2x}>ZJpevI9XKU@n{v3bl-)Q3 z%iuTw)2Q%wmPaLBHqM<~$(-U*)#p){2U}H=dk^l_U|RSzrv5IbD&>_0uJA!tb|FoL zn3`lG)hK9LIYMsZE8_@LTv=>!SQwRZvU`IiFFrMmZG9|_l})%wK%Vku!I_s#(Z*4M z{Kn~Q)g&y3>ezQ9TM}-OBUPM+YXB;0YrxXbL9~1me+}h^5p@RHp;(D=O%wkO9?dNL zECzV;;PKEXn-a_DT}e84lH>VfnAoIV@1lK2wvap8cq$1psGXDnKH~VCtWR+nSr1Z$ z*u$gS>a2^hLp9f-^n-1MzW}BAi%7z|kvUaALH@gl@u!Gk#~aqTdIL-B>=p4MH*3z0 z{AI+yg80sg?9yCGWoP|-$_AWG@IJ)Fs$6Ji`R^fxe}=(TBx9{IR~B3>L5x+nN4aoU z*iKf z4VMfHk`Fu^R*FoDk;#HB47W%?ZsN+sy1m%3?dCgT$Bt6RwAe9yd+H~;V#h0_JENt@ zb}_R37JZ8^M79?rQ=&Nx8JBzR`8qCrf0ms?YT`C`=%ZeHv(JPOxXejD|4@NxYW5*?A%%GoG5_f zlE-)H#fxwy4=vawMt0qrx~(flri-3^w~vaRg9Y2cd!hbP=tVIE4qIZ1NDtY_XM@8=-Ky91WU1D4RsznOM3ZqkEa0*J@@7yav ztv06w!)yynHa~<9?M;lOU=Eg@RCr4%v`-A}!^%RVRoK;Aat(^E!Bvw~14hP6k;7u- zFff2r1OMC_x$kVdWW8vWS|g>_J!0z~sf}41EVK?w{s7#&x!e!*l>G=@uniQ34ukuI z%d27U6lNp`U-&K^{-lgZuQ;|L?vgJugAIS6weYY}bz zz{_jhYguSwxT_Qz6+@%N(Aeq>xOub>KpDvykeoeOyXm4ya(XV=FWN7kSnDb}d+TEa z<&y29?N3@Hrx)EYI=iuP&_rlo$AH$vN8N7HxJ{JZudwLlvv+L$zp%Fb%+V_i?=KA> 
z7KaZ@t!)M0wvunJ=-YeSRrF0mi$~ml>keGpd^z$b{beKN8iH1j9;8bnN5qjM1b=(U zH!1oiZ%2y0eSkk&#eV?!G2rl#19SnrR7MUIoJ02mLom7}=OFm4tRKBe7oA&3m+M9w zI`6pXkh*s$BmBtK+}`q~n{xQiCoj{NNB?N4?4j(wvj4Xj>u`4|G$e+G3bDPV*dZ}? z=qIB;NfcwR6+=f?XYSg717I4|2|T(EE$xSCDLG@JGq(QD%}CL?wK_*IF{=TrhGRD)KMjwqzH*uV!D$POByx`^)UhfW)JG&JqVuU{G#1N!o5`eqvYj~4tW~gOoF{e z&8~Sjx?u*)26a5a)uj=pc8gi9NWGCbR=Klos&TMG4-nZ+4mG2d1CinuiMJB6OF8W* zXGxMSw<$@ey3{0Jqlq?}#gVPtZ#(e&Lq**nCC|1W!K0J1ZP(@f0Lo?Op)6c2KfXyn zmU7q8S90tU9lJo=twCXfa{$y{ zUN@SK>SSvsed-C*UNJ5MQSE^#wrCLzfG=DON(4M%bTd(dYzG(S>KC5GOB6Icav&n# z6DcMD#Zm#j(q1TCH6j^zJAfyFQMndgkxxngF{|6Mw&BiRhbJ}NNlG{W~6) zEphyQ8%$^b+v4$e;IvxFX^O|$r8!8}gRxmQoLE{)5pw|FjGaOeA?vfy+gCAq00X=w z1s8h!{+vha0{;yReh-5p1}L5RpJDKyA&|{*76ixh+!_8C7&Q>b_Wz36I6|_$;@LdM zrBd-Yd~}3>fGw0iSauN5NqKx_7XjUr zucPcCAUMf(mD>pDrF_A%kAUq|d$`;|z#!%8TH7aj`^q5_4{PyG5@#q^>ldVj<-LYB zV;^`lqYH6a5AeO_merLHh4aGYH;F?U@t`d@wu{X#JTyXnIc79k*4{*KJP=n#%dLuc zGl1n5+Gt z+0+C?+tyY-`Nl`z5F=yYEDhw`CM9wG*v;d z$;H%C*3m})<>~(gRrw$4)r7K1Z``!jQ`ig+;5#2u5Gh;q#@4kq^v1hs5F=Z^gv3@w z_~Go%!B2ZoHsJH{-$GFRQvFSe(>GE~;2U%rJujY^t9xp${;9c!r{)@;nrp(`GfiMp zcE6^WB=?b9iDUSU(9{jcMSdq2G!N)h=T<)vsAf|iN}?lXd00wG5Asq zeu_c{olm|+N&e7P+Z9{U&{G3}qx<3Whl>U{k7`gba`o^PNb8dn=#HUJnI=N@7Lxu9 z1GLR4HZwGr5z`xwH}-QT2-F7!2$A&gS5ROk;fq=0SxCy8p7CuDZg`Yn`j$@l#vNP{ z^u&dw>Jtr;J>Qf!KZ6eaTy&cAmS@n>L2KZp8V_x3jWutpYG3fJaT9*S z2#yri8%EWs2sUohOiZ~!*#Wf3H)Sx>A0hnb7xA8~*PchyRc4O*RvJ z5n?nctgQLH$`x6NS15m4?sA>VYxHt3|RlXerztM)S?(nH74eB(o z3qIt_LQtr6yw)I?Y;wWl$}ExlswC{$%2$45cLk?j4VC{pAi@6tgLw#uN=c?vnaVI` zU;tm!QCYmb2iv2C+Pt&4Y1g?|C2M=hIw)EP*JnlR3+MJp26M^KCK}q-j@;S=)BHDv zNitL$1_g;Js~G^zcb7;XX!pj3ZFJ?W+)d$4C?B6sz{%k6frJHHhF;FVWERS(dO#4$6JT^g)Qdi)1UV_hw4rOo8U~+L6OtTpyrrInh&2%y|@wPVRZqmoMF&}CKjl6;a7BfL;)c@SIvUoZ$Enpyo@ zJc?FIHpH)B?*GJK4TA?5kfnzVUbivJ<_Sd&-_3hKzQTVhy8vNXPt)|5W{MvAx!(D? 
zVe*%h?Q^R2bE@xesh36SCdSxpHnY=x_*u&Zg@slf*|a*-doxa{D6+ zUS)R|ZIVD{r!J<-dVud)93KvzA1oUI2Bo@#xjJ}du-t?(GiuPy2w5nPw`@g7(W((@ zp-ldV4ulTTP4pBkdHi^yDeEzEPfWRx9T8~%4;LmIQ~&?~ literal 0 HcmV?d00001 diff --git a/model_executor/layers/mamba/__pycache__/mamba_mixer2.cpython-312.pyc b/model_executor/layers/mamba/__pycache__/mamba_mixer2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc340601bdfffb6e9201a5b84c98285743dc675c GIT binary patch literal 28575 zcmchAYjj)Jb>;;DkRToeNP+-Jf+YAB_$EcYsRu>s{jy@kc4#{=#0yHK0Fb@_WsAO8 zl3S;y8Yd>xbVMcTh{|+gs&<#1x!OS#7%JM{!NHK<-i|?9`psv(|J$kz8ly z$IN{DTwDO8KuMmsb4i|ip8GuRIeYK3_dbXJ*5*PIQpo&;!J?31 z#1JYTDGnJ&j4ZAwXbPD}%puE&B~&s}60(k1L$(ncOEUyZLuDgnq4JS(7A_9jLlq+x z%x?@jLe3Fq$Ti{$RgP4$I8(4HR6SD7{N|uLR5MZ&svW6iVN0+sPPBXxFpyR zY8+`~ervEP)I8G6{I+0AsCA^3`AdTvLf#QC^OptNLhU2%p^lM`Q0GV|iz^Ryg}O(& zncp7l3H6TjGJi#|FSK!FqlPQs0u>Wy)xpfZdB?jNB~-v2;CSa79PbKLysK5>-c4s5 z*~C`{HV3u@w&G{on6?t}^v72Pw(`~Za|hbT41CQSx{>Yc)>ONyraIK*2~?vl-I}^~ z6m$FrzCNIvcr{&p&WmqYC%!STlW$rlO*>PBwz_%kn2~ROLqD?XpS6Y-)UX?)(972b zcJU2oG=bgkwW__;a2K>*@2AuwuST|<`r1?bd_xEK4IMdj`~V_sgVAUp937tw@AZ$K z3xxUOh%5+);g)8Sjs>EA-XHaQwX$i*KYBJWG#MTnKO>utP4ag7Uy{QNi{kZUOms*6ktz7Q4lP5A|XFbI#}51*5p)=z(BQV8JI~c=rsDt`_B7? zV1TAY*~w`?Cd`bVCJjap69HpWa&*$q2Za68VE~>C%MKKrnv9^}D}nJdXQRF#OP4Dm zXEEq&qXE_y!T6{1`lC@Hg1Ab4TtGv8Lg37JBpMJfz35~3CZ~|-WFy!2&?J>BLgTLn zgg&LaKDGCwIcu6LP6zJ)6FhGMy9c-t4bP1f6eEc@@tS~^FTkHRpyPD`{g`2v8`JRm zH!LHCd|{wy!j=n6@$*IbiVVdP$ z)r^>*=DcRP>=+xUuN-^&IHtz|njNxjY#Qt6>^RQ{!pd^9u$Zv#j4(Mp6_L%;kw9ip zWo=+8GOEuR4b;RDF2C;q)%;z}6)u`dysQ_wZy~RW<276~Q&No+IbWI*a9mXVN|_Q! 
z&6E%|2om$=s zpaKS;Z#+C6_4x#9tXveGViQ`RkH$hoa|G(FY-E#LneTR#W#nkpDllXgfDM>%+#end z_%Qd!gOg#w3lIWLOu3kleJB7_CF_C0N6*T-$#5VdbW)Cbay*o+EQ8RfXDq+BP}Tu0 zkFoX$7%ff#+eZ0|eZJA4KN9izA~eDtdM^F1Q{Bpb$ZLTt>?A` zx`X4VyQeNh&rXKBi4b=yv^MZ+V00R->JIuZ1cXR8!}W^mQ&48t)CHj%70gsWj2XJr z(cF#Js;$gexw&TqGG>cMMf`ojaDDJ7;xoneRF`Ejzc6FX`!-)!i}FB&*t% zLQ>TZ#95O~{kL>KF@I#1nuaN{KIz)9H2nUF8z&?eWw5Mx8kIus4fL)|nyd+v`(u;) zlPdSMj;kHNKmE?k^_iPZiM}IJ-;w3kqsvvtW{2N8a@Xu$ICbs$tIsd(zB6z#{*|Yf z2TsM?zWP_@XHtcnr8Hq~l+2Bb2bcIe=8Z{fdBWNxS(_H4?}*n$$-438%O6{}rnrJ) zW70f$UyHY7-FhGW^==oM;Q_MOEz6V7a`>H~(+y{A`kOiY)>oT5%`RSXm)U%jX= z4|e6*u7Qn1p{N%NXRz>NMd60Wwl0H})wThHQR@mbtt-M;B{agGytumvE%l@`y5`PodBz<5-$E_Xk00 z#)H#x0f+%+5{Tg#xbx`7K857;6NDa$*G^5oB5NblA%Tzx=qhbPWL+rW56gNX^0E*W zHc@5^IenBxcYZwZim;Kwz2p$#^I8R>8Nx1dcEgcPXa*yrSw|S4rj@8$z%ceV=L0kKc_`uPEYF}=gg(qAF6HK&t}AEe z&&-;VX3KXAu2|+R30s3?Ygp`mXUp|1?`?X2=Z&4qjoX%O+a>e%Sq-B?o{z1bWV!uH z|9t;~X}P>9Zf;sJS>N7ydFR65$EF(Od0V_JF8DvT)~(dkUi<3RuP*j}|G8@~Tzz5b z^sT0a7nW;w&l&I5?z%OzTzh!VwBmAKGha0?zWmNB*I!u@uD>R^`sVb>>XxMnsk&?4 z09xp-xwh%*rp40lZ@sqj>dvLIo2M3bE>~}#GbC-L-`#iR$o!E-!%}~ueUF4+&z@!b zpky1oJh)Qd{7%jFnw$0y-50vY^W?39W&4gf%}VPAs#!8O z%|5@-maJ-8(!FoJVU?=3%pOU)Y8N!u3a=K%U56J^eMqDCzWE ziOt6rPcA!K6V49F*|GGMTlToKW7)ZD_F&Rg|NHyiIdc8TO~bAJ#Ez#V{PsMx+;mcM zokUq%?OgGdlKGOjZ7YD=wl0l)Y}-0J_&3(-q|2Rf^-8YZo5LR-{ottN+Jg~sR4?#} z+AR`(9b4wKNqfcI=)$JO@`Y`4VaeW%o?zr(yZqY1$!n*so?2{QuIZAjU2#K~u#;v= zp%92p3*l8ffdL7OC-@QO!y|YCZX&}IphWtK90yg=siAzlbY3h`Fp;4|YFX@^GW>Zy z`ma(T^_>qwmKRX45Aa+begLs~_y9(wG(@E$*8JtLBS5M^yNx`Y0`2BhTJ|&-b*lYh zJ2jUY%D(ebe@-j_baytF>f^QV>E0#o4G*xY{y0OI!XSeCy*gQUdfXq8H8TQZy=_c# zayo!DA7By%V6*+mnPvKg0~9<+&LMILqY8(~$skhUMG6v&!VsdYBVigtonGA<+$fwx zt_aNn!b}QIGOGw@bei|i)I9Bt=^e#2M z-+H6<$ITyhe$ctxwrANnm~akD&f(k7FFQ|4h1>j7?7k0|JjxkuB-X)Mx;L z4&M!pu!9DS25VH@H7PybK&7tgR1x_M zoU=MrOg;R>v!=Fh{6;a5* z8Le--FS{4?3xR9FtHGsx?;pK!G+w(^a&41}w|!oWOl;&>V`U?gtob5Sfl)@i6$1Qq zG=CL}qKVH?)JyQZ$-Sw0vjEBsO;${k4>j=xk4Xo6JXSc%%@)jR#A=LFHsP 
zVx;BZ7XqHqNl2JSqrnTFv?S3Jnaq^9;DOB2b2{LOOa;fI9!!r(&#N7ty&ayRF3+K; z#~+MLGDxecg5;Q0r%1Sz8X{AZVSYS(#sdg`&)I;VSL*Yiol`;oXn<#MqLbKoL*(pv zWjuP;Gal_?v9g}+>tze;t11E&=vy2g&BgrmG0O3G|Igt3u9iN;3&evL#I%>q!`zGD zEndLKis@qdSfQx9T*5f?BC${?;0wZ<%NE)Nh((MSErvv@nDL#)D9vW}h((ZE>1T}K zF^kDnv1-f&TuS=@6P2i<-Cl0H)Y1jOp)AKY3ZTmDM5fE8%~4A$EIVo${m=&@l~yDGeoZZhT6%sR82qeB(TJ@lpS|fG|nbGxn6Y+=qC< zOP4Mwc(;(17iPK?srdOmUtZyL*8t;SBfGkk{O=Ob58*&SveL9cGStr+=DP0}a(x4m zxpVg5Tp(W4nJVL|8Wx|GDm!Km-z(uNz4N_GXJ&_i0;=m4tE8%SFiUk^H>;((fwz2&jmrvX~p+O_l*jOQC z=ggDDQ) zA8-0_;DZ6?y*>On#iU-)aOSd0#}szUtiV?URWy$>Pkq{VzYP;ESD4RhMzkc*U;D$? zrv>{uUJn6(A^wU2dftG)V)%{tGvUu1DC8}HBEAIi)_{R8U`jn(pg8R-4H(nDvOv+8 zhp7n4p&~Frh0ws;1LnMJNyR$xj&hKII)T6`W5A&X3Mkbif3={@L zfJvsA{5HZekkSi97196#_EK7r2$lsn3i?Y@w?$s+BCx}ZoUxrS2o)cg!oC!$Q&x6E zBoOeh#-Rpm;62axzUX;cA%&ieU7Nc4JPn7Sq7<+P21pP{&*YeAuz?jO%KFOL3!XvH zs&H%6lhO8hl`auA%+}qxrsgP7V*L6&=K>cXU>_R`K%)U&NE)3&xYPtLeTysRh3WmB#usw5H# zr8SmJ5o{ynrw<4{3+LSx8^SGBUp2Ktb{`?p!VoPltR&-6OJr zUL;L`oI%x?Lc%W_{iG^ogUpzI*{q1u0h4s#=223JQw=Nx?iAvQbf{1fQH+*mraZ^d zW(_2ryI%|68U>l_*Hw_kO2H1SuZCg5Tc(C7AR=MXv?jn6(cPpMx$}aQP!c4Im)_6 z_n0=OZCUYiJ}kaozccJr8FG{nYqhH0yh=ih4;Ztx53zJ8n-SiNpi{`1XFE$d(>0M) zi#_YLFTF8`kW&aoRk}&4OsF>V!OkChMl0%hPPBa=qUhf%c+c=IT0jrlo0;i3&^3Vm zh2W!By8+?cU#ynq3 zSxRnObL+^oHUHQmHjg41EH|<6>(7+*`W(~7$}v+kya_%#^5+@HbZSM^n;9o=qCxd% z-}!{zyjh*$P~HoAVb28ZRM$LFQa!L|JsPQwcXYd&b7GI`%f8>UsQI!z z@nvx76Sr_U3sLp0*Dl)ht97Yw;MzvD-2cZqGjmTgqXo^G7UbfB{c3FX&6}gg)DW*x z`><+dpHS2D<>X7=$cio;t1!=AwOsbin?=Ve*txAELG7C730ikjEdpww5es>1niAPS ziAq&rGDakWQ=?zaDM9?kVKtn6i^liL-qk5#lxLOiCnm_aGp|P~FDBGx!p=quO5kCoG zV|X%SMnro7MY7aEdz-K)8=;sR1xWVn>Ft`CT-%_u-785Yr>CYS1(?z?2`AeQ(ALkx zHVqNaWEgqIAKffwEQ^+*H%fE51e)bDMP0Oy>E&TjYg9C-Y=>i>oQ{$@l>!mrCFB(5 z$eE`yhOY6jkL0Lcz3@ATkWD$c*JR!=kZu|haazHnX*3Nl$I&EZa!Q)Tu<6UB<&_$8 z6KRH!imHz~mDR;>g6`#p=D1g@i0ie1SEqytWQGcx6va#+pB60{27=h9^Zs2kEjb(Z zj6qUX{-);pW3(?qY}6(0#}yw|eo%R<_a|FE+VbzWE%)w|diF7a(y5u^E)0b-Haa#E zGPaL2kmHezU2=9L(XKW?oN)}RQb-T8qw2Y!+;xAHq(Un1P?iNGL9BOaG~y%9kl%) 
zA6;vLEgPTl2d4x31OXP&g;8b?C>ze5_pxoa@NKH?ZE|R~dn*-f+8Y#+9d$O~vlMrk z95&!==-Eeji{jXaVV%fA%q$`%^OOtILT5XBD__M2qva^H1o%3uB@b$}g$WfhxvYNG z#90v)3pc28rhpzgfFvAP!euMJ!skgTmsy&qdU(nfE)nbBJuVFOoMna*NsT^jt@+IiD;>*T+wNpad@ zW2!)7>A0_jFK*ohwOEN`K}QA$lBH=$^D)$e{QJQh!CU^1yLR6mhz~uR(&|yDP-EGn zxnBft()=t+)pe3-Uv<_#u!&3qSL*vHx;E+ZkRf2QvH`D>m2}8dKF5>1STZ#%`ad?c zJ}_{O?nK2Asp1HUx|_&05Pfiz%_4_uskGi)zd;XLI5=*sc=<0(@iz?5)~+GLGa0pp50 z+a5U0Zh-mr(FYA&bvN3ObbGEHyLv2H(XhHYQ(?L~8|UspbxC*UP2Go<4=j><=WX4e zm;7mo`I<$SRO($imFV0f;n%i@hIeq@ zg5EZj#LXUbxMj;N&8~q6pN0P8NNzhG9 z-Z5{6btbgGS4{J!gv}$_JS+CbJIzBY?%pJt?Ct^V$|?Y^tf9%2bk*Ir8d@AFt+BN1 zf!<;%c~F}Jv_-PFELrcEx8?#RJDV>E6g{`Osa0H8 zj?N!V*jpug>x!%Oj`zTdXH$}X``K?NK3q*LJ{(5i=+&c1cjNtXV_Q{9Yp!rS&|6E( zQf*vg>pSh&+Y=4_QbT{DVVl&jEzvL}H4NPzTyEGut9#pY*_1F;NrtLBhC0mXGW!+x zy!)=XlvL+;ZH~DMi%suzT<=)6b>DSVUkS~J9_Y1}lKWbHnGv(K5QD80`9a{?#MOzV z%A5R$!4HCxckgoT&~hm(Y~L!p>#V#Y&Wo^}Ieh)_(uG^YKRNo*QK|dja?_z@C(U_t zGHq)6iNUpCzvj5=SnR&D@!(HOmp2}X*B`xOICi(H=dQVav3}XyB(?0rDyufmTA`LI zu2l2i+#RnUx?>o=%RsoVy77Uw9slXe%iB)ITfcJ0@bo7xZ_*6A0B5qaG6`jRWpmQu zo_lWLYw_|{*aB2icPtOexQbR}gIda-@NSp9+i#m6XbUPRu^7WsyJ@**bE4*uRC6e9 zs7jhj;}xxmid_1xomgbygNCt2l*0)UdhmVvqds&`=qk= z8XC8Fdb!dId#r&Q1F&0a-w}5WQ1vUS$+7x&xMy|WGASx3izRGTe5bj2pkw7>l3t3u0@bLn0M8BXM%w1%-y$UZn4CX?v)bl^nJvsz<>GqP2t+YV z`x&t~MJNYNnwcxMuokYWo`j~X`YCvF&tc@_*`C-k8J0QN%oqT2%>#R(bmwv6OzA3E zPR2oKCm}}HktMqDI!ZB8E!=`{b)R8|s2snVDbLgIzo8Ddp%5q>I$ZL0@#W&TOD>lr z42_bZF=?($n#)swcox} za2P`L{Au|F-?74V^5z-SSdplI{m-(Nu%bR|*QE*}BCLLqT9*+PD?K5XxQdv56$G1_ zx*7t?XGD`~4u~9e`9z2eDl89p4f*m~bHiRhg&!+MP8};_6!jCXEOgZeYXp5hF*|ID z_4&loF$>~r)zaBFEN=A^o@`k4#jJ?WBchDi5Z|Qc%D$mS(dV-nj+J4Qh)Z1ah#8Rv zv)(9&=vDi=9^y?cv&<-uwyOmuXot1t5z7IUHV-w;JrsBMZ74%(MZQ+;dZrWNlt>M*J*pM!&J^&PoEe94jy1-b9#h`= z$nwpxmPePrXv&nY$y0x8Y=c-NoOv`1jLXqAgMuJ>lniik`UwVN7g#+zO5LMaf>5Jz}5eLND@RJ+kIo z^C0I-YoZXmSbWheZV)@g4zW$_f)Z1I(e_^PyJ>3uxYl5f)>Uz_CWaZBRNTcd!^T*D zY!klTX3_lm*DhLwXT=S%Ets=C;)aRQhugvH#I}h*zIYX?ic2QWJStagOU^eq`*2PX 
zvyA6pF-Z4faAS<}Uo?tahywr?SHI-u=sC5lxG^8S5bM62`KHz=wg6sGtuy-;``4{K zoXw^B0An4hjPq^jzKNT@Y=~p}izTReN^M2<%@a2uWW%cOB2?uUOD~p*n_;|RpgOYD+Vp}l>{}A(Bu~yv{eMPAac(F#U z8MdvMp|R~)726qSxC5(0@ePOr;to=G$99T4)8`E}Z@I97wu?K&fy`N|UE=n1s+QNp zc85)v-(`r?QCWh?XwYtPx44abN_=b&Rv@bf=K--piVcc`3f&S67x!EoRN6cPaV&Cf zy|_o*Bkn|bU@U?mZuf4g70_|wR$hD$KeG4_a!7mI&=>?^j4?V|U+ zl6TYlt5_q$q>buKWSDecY=7(kX8adg@3Dg@kq4h-u%A`Aw1NYCi{-kKAAlem(re`*Lya#@3BK-^OLu3z4oasi8Z68c`#9?hV*!R zk-n=GDRvmJ^J2d=rTKMW9})NG!-9N)cn~SSku!8@udHBPO3(I-2iAph;EACe z5clPWg8qLAyjSL1?Tdax?H6eCKCxLmB%%l65%3#_dE>>Qi^F11zLtQ`e++GaLfoRj z)(~wPQtQdS#cr`e;k`58_lb5dB6te1p_+@?p&TwtpW7n%pZ;&0hGjZmXfKBm!4#&e zRG1_yV=B!51P`;jOcOzD=hE9cBxD|uyR6A46hkZt*48|1Q?AJ;7W?w@V^gkqFT-NM zep!25puP(~fuoAr^z7h^qGBXAMfg2RERaiAi7tJ6eM3xB7NMSd((0w}!9UZMNA!kk zcX6T!i)+zq-Wz@$^q#`{*^uZCB`kSBqq8!NIbQ8y%8uGduD{x&=isB-yq< z90TD$4kj46((=!&hCLefKp0a*{wb=}~W8>j;D22zP8Sv~F zNjVt?j4HO?X)qDFT!BMcctG~S;`>ZkHo!QXU7aOcpAbbY)>~b-V8-;NKEE16ItGbxJyiB$~H0@wAAeW=wsnxSFBK##)P@MjL z$W*#!9#GVp4~Z!I!(S4;TBH_}NL~0-a^56|t@3|KzTYP28abWhJOzgi`r~YreiYZ# z1TevHt=agQ>B;GcutZtjp)6*a%xuJh!H8T+AwDpQ+u4-$5s}MsGN^GiYZlGw#lU8Q zT%XO8cjcj*6?qmC+VZccao;3|gjB+RB!|StiunTR>YQE*HY=1VLupsfG*eTcp^2N+Y)3i44%q zaRt7;_#0viPWVSEZ_3Vsh>&72G~?5=1h*|9piB~F!r+JmEgyzZfe9ZXHaI#6mF$sI3SG{)N2Cyb>gRT5xcTjR^elS&B6TnGEB*y>;z>Q)=&n3cwCMt;g=X3 z;XaL!9ixMzEx61A#$U`zhtMWXhlgSTIe+{Vvwf1Ss|RpO*6zh((GKAQ+@Y$<)H5%b zH!2#nC>Qi8m=xEvrHM&Yz{QI~oLc$kYlZN zsp;5}Xg?^mADp$|LJ1tgm#iKb*~ASUaFf+_iRuoix+77&Q>xw>H@K4Z&58OxslG2! 
ze^9DF7>6;8$rU#>-7T+K2)%dePI+J4+!rtJOSYdKvJLUW0=6&(< zeQ8UPrS3%AK?%R^L$k+LDse<#s`Mr*H%gToZ|;TSyK=|u;iRj1(J#3+;0BGxo3^8negnEJW#pUtn%kTgiTIeBRj~x zbGde#WT-}O7wT6`6>(F2vab7Pn{w9mla}^(&Rsv3G`sWil60raqWvBBb+=^eB!d!M z_@P)xNVcvlANjy)Xf;AHJgbYB)Gd~!?3~Me<+b_Olv@uH&MwK>mARjwRx;P7ZwDJ( zHn-ry64MP+V#9W6!}i36-O`5Lw>8Tf_QL4FWt=tNHyT=uaYF+@nk(Zb54u&_kg&B& zw)UmLrI(j&y$RbE$+l(Lwk=^Bl58;axQ*CDzbJ8Cp7>^vPUqI%E99DY$9J=pUNvQ&cC?0bGf4T^8VSOxzc30WA-qrb2TTNy^^zc z)|_-SCLEh2$0l4jQ`45H*(udv4pk)`4T~owM+;&rJd3^azFA|^O^1oy193wotSI`o zeYoR;9bnhvwLS5=p18R;X>rA?I^t#wY3YL^T|dUICyq~4)AIndZh-UUEzRUmH%8eCaO3UtPImB87G_a&MRNcc6^uXJH4wI>y~8!hcfDsE3djYHE0 zm^Ptxp5|+>U43l@clbOn*?P!8s&>8>_xTK5ACOABZ}xw<^Mjo*kxG$O4lL)&J&Cer zsjN8(PkF1vOj|sdFK#qFGd636FK0}CS;Co}xJRh8DPikmGrV-J7c;N4WOfkdHjNuG zib>mQODZ6dYrC)RPP$u?4V#m-{mGt_$?g-$hQVahHe9xc6`S<*KB%!amV92zSzVXM z7c>jK-<*us_QvaaQ+3ux^u6w01DI`1eCV0w4bMvE=GhYqPcJrNz-k+2k0-5-acj5K ze<0bsBW`Q>S-Cgq-JS66m%RJ&k>b_c6V<~~_3-VJ%hdR5+<}$N&%)XbNAtHh;`WW87n`5aC=r_x(Fg0E z$}RD2&q=Q5Fo!Fekpb5w_I~W_fRQ9dpkpuPPP(?lw>}ehJ(JREDxcHb(`q|VcmoQ% z>K5z2BPLr~ubNg`TH|fIZoTr;(vMz~+KxyqM;1(g(7ySWS!x@;{W9o8+}?@`D!Z1> zNUqKCZHFY+A*!_xjdyLr4T;XJD1ctOx7`|%+y~I=%Br}l9c3D@UfbhcgLKp53sTj# zds*34CT&#@ElkW!>%2ADxJ_A) z4@!$0VLw@S&yjzvfkfeA9!z-i4YTCzpFIc)NR#_JFE2DLYUe~Oh##K(1I<$Z;s{XD zFG`*Bj)j7SVKND&>$-MH?YovscmJZae7@`l&DT1wcFvV8mu^TEaTXiCklpn?&G&j2 zc-;L^-k2zFlgitc`j!L*gDsa2&JHIXRb*fL!`?-HX=rgka`Yq|bUD_o0+xR zB8`0Qedpl{YGH z_TF&ca!KC7<+8nxPp<7u)b>fWeMwsx>_}lCyU4#2ydIS7eagK<_$qZJvnPO3Eg1Jo zpbMY@Eb{Cxt>8LZt4({78}?;Z=}ptEuiUo(dCi~JEZa|fR$T&=y@5zHhl0^Fz%ifXrVU^L;=}J+Hzce(BrA9#Tf$r;nQIh&FegY?q~ruQO6H9q zS#*PyE^TVqwQ3$n8-=C;3nnlef(?AQ{Wyd_R*Q(Ns-kmyH#(oa>`}( z+`x@GxZmPU)W~jWUIQ%W5C{J4#Co%V!B}rLC>79bID$$paoI0j+mNX3muma(R_$7@ z0w2lPZ9;+J`LG&tRwtZolCzCnAlNCnIyPi|1HOVO%T4=UpB)2D>NuLff#7b;6Z z_z-cz+D;gixlfmL&Ny;sb2dIkO(&d!%Y>{>1?Ld_RdbE72&=ga9=b&e|C<8poI){yUzD^LDZ`JM~JS|%D;q&C>J30`A7&WY(tlE z!xODe=2vtG>7!aUT82zkn=U)!&h6IP)YqwVCP2Ex{oHVJ#o8PJeyC3pM0)lfxvc=g3EPKf)iA!;?eGU4ihH(1lJ5{cvOhByX6pI1#mlf;x7= 
z>JCZ{QAW}r=UyiFn-ob7NeCk*d<9%4cTZ=~8&XYNWmU4$ope=^HGGwaj&!tl({JDA zq{DTuF|5_stW>yCdh&o4Rk~9}%xmDPYg5I{YvkN@DHHR8*K?;V%v-{hJ5yHXwQ=Q+ zR4Mb8;S#A-IrG{nwt{&boYS3hGOvq6ZI#ShMfcfNGcPVRZCD(Z92-(KEL@uj*RimN z!!?5alA}9S&%zB{sr@t7_|(}VuD$zSaiQL_;&jsygp?kBbOoC@^Sn7#L?HuLQANir z7W;9^;yz_gxmYn1uE$f0!Jku%LTidqR0At^L9gGZ$rNOvoT5s2O<^VSys1cUT&!4} ze&@C8uif1D;gJuH#9Mbu^?UAfNJy0xE2&FQ-)vlZ?)t@B`dg#7H{Lq?ku~0W0HT`v z95R48EG)qhDZOi9=$9N`_qA$7s!*$MS!{~8;ta_4&p8B?)y$?yM!<~waOrw>cKIPY z7_iEyKf_#*^;s@ac^6@N)-zuh**}871={+2hcj^^SR-nI?y(-}ArpQ=_*x;;F4h)A z5Sf6`carJ9#-VIcZj)yV1{$Ai{vlcubh1JCpX4xVUI?Eef>R`FuTu&+Z=&KzFFee; zzjga^%Z^JYlBTMJsY5b#ER9H}t(S(AIzvKNA?Yd>kKJnd=}25x@mIR%*ch%FBS2A* zA)(34xeJDXag$k|2Cg}Vkmga8o5p-&bYt8B^yNF$M8X{G#D!it6N39fizcVo$6_A3 zV;L6&O)K(sT+1JkiwS~nJTw(#=RdrAaE$T76!cO84YurIXg3ee^`8d5f?!>sZYVg4 zzP>gQ0xpUJD#c>4D~*u1S5Q1nZ8$04IPio zuaL{>=y)(95Cf`k8hwu~hl43ytNv|{qGn>Mq5dZ-%{rR>?>g*~SGntniYko;-^(;j za_5Uft4evUFrYFp$uj%4Q3rV9x7d# zue6f0p3)sGokeu9|9|0A5*rA@b}S;Hk+K=o|9_D_862d!%7`r^*+i{{OXU0}IgB#z zp~A0Ikk+EYP3@%M036vcfG$9%zf15T6~A2M-@%3KL8JN1z-j9LMr-@IZt!n8^Upc^ z-*LNs!5xygL!TEEY6?E%;Cx=fY4&M;&h>p>pwkrGAlmY;J2Kj%osQBZ!#mC%$)nzDF#+p?zp0j%3K#s$}XaAq1KC_+I; zibbW4Yqx3WQrx+(E^J)XE`D`kw^RX1az~2dSFml@;|=OnQkp6vpMfj1-!CTL4F1Jh zrkc53%bJD<1!G1{%ba)Nx%(XXsGi0Yi%gAFaINk4iVd2tX_DoxD+BWbDJ=z7aL(}P z<)bM*1&B)8Q$@_nq_QB<3sK3|m|_JXLMSxeH?ibe2Ch(J%0PoZ0}TE&Eb^@ZMNtYQ e`0kZ)nyRE3zpi9i)m&Awr0$=J^_l{P)&DOEd~$FA literal 0 HcmV?d00001 diff --git a/model_executor/layers/mamba/__pycache__/mamba_utils.cpython-312.pyc b/model_executor/layers/mamba/__pycache__/mamba_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7703a82e01f68f681a4ae9596f37077b488ec4d GIT binary patch literal 8024 zcmcH;O;8)xc~{a(yZS)_@sEJT*uerDTpQbD;w{PG3-h1Ew_w+ZP&&|MivpY|Pt?bRS>@z5re+B|b? 
z*!>(kjD%U;{cbdQD|-IQ+_X%e(vqB@Pb;kMl;Zc|k{q__jbm3vBj>OD@cj#sEAPQ~ zbm)@ts5jo0Gm)u#kytb~DMvDC5}S-j8K4$3mS8}S`LHFy${sN?GtNrPxJ|Z8tn84v zJ6>yN1D|&>zysMO*=4?R56LlMYl6-bPKkTqcx;jw7hZ{~^OY#`lFLHb{hdC24vrGgZCHOcM^tYZ>F8^M$MQCaT$EnP^5nj|w>y zO~hsrQBYCclSDN)^i;1-q6L@=OjTER5;-$NQoS*|MK94(@p)?yz#}GWlh~=l3`0B_ z78>X!*&dtt&)7_9Y}S@#i`KVb&)O&0F(zyu(OnaXbTo6~_^ z`%`e;B3%C(>&qmk`=;a5M>A0ZW_xsY=Je|WQ>W#=MEq9Y^ju~#o$9-nNF@6V_Q_co|a-RhBZTXtK( zU0`g2o76+6Z@*oFo|QBsz^N;4=lK7$owD9tNVpBbuH~YnsC0$_dBkj6SD76-vJg9{ z{W-Ia1upBzauc>OCSyiEhRND-2itGLwh`SC1$_?NsMQdNFHG23Np^Z5mQLNXZceYw z$Vo83Xu{g6IDrJ3mMBswlNNZM-$$&wmFGB&P%Z)Zn0Z;ng!y>8%AbD8F$d46XFkw6N8#N@KrFp6_*wEjK)J+E5gzvB zr|Fld*jHG}q3|xn?gR=uG@)LZ1iuv7$*}ojdnNyS_+mlzUr_lAbkif@fX+uEsc2G;M09r~0+DAXfw(sk zxjPe0m~T9>L=-%_3?WJ)xJl9jK<8+TJS@9RaXn z_#DGxF)=3ykR8guAr3rX@?4-`6S%gGy7t_m-}QXnqtzWPaL`b2GNIQBF2s1o?O&Yv z<^50YFAb^wc1;jB1!&*0GtO!{xKK|qo^zLzfd-QdVmu>u7X-xIy9s)r-6V)TYX3h3 zz1%?g3ZcPVA!5b~5v#O9Xg61gJwkMaPOh_@AT*c+5v#;GXgA4VkAO+0dncLh5}9r* znG3(Y{MlvI-=ztMHw7oxT_V+8B6X47N$Vm@iEX-^+(nj-cws}7swLHk7-&T4!dQaW zRvuJjX8{9ciUZ0NhXkbzd>r_hpy5Y?phQa)9tyMV62%Egtqc0|@Dt$YwvIJM7o0j8pbB0WkrLKnn^|`<*20SuFP~qiOcj;sUg03#rsfF5DaNJUNoPEI2j08mWj6$1giwlmXp%4`la!o@;Z}+lI&{jZj{w;#+GRn-=fP7? z*cQ0TIU$Dl1!tC%SQIbZ4YPs%SuX3u@V{aod5)<*!7P#B62dNn>u0dxO$28Vd>=qr z7D!pMD+{Ep3MP_NDn+@JC;POMH7gt|SFGN$WS{n)1d7TB03eVCnAT3UvwtOFMQ)Sy%(N22`muPG%A%h`3n;98kHW~lUCz!!Pbg`j6wcb`VSLt! 
zkhbz!mt;eMSqfo(dnAZTu)J|>zoEjiFSxTp){R1gQu#TR7v@4x{)Xh)42hZ>h(Nte zjWPm^Ba%WeB@cUc3iTqi5^7m!R2CWYR437Zhw#OD01DR8rabXxE#QhZ%&W8f&{|%!ttEO$EL3<&?2#}3C>mq%3kzg>;QY4J+Fp6D80v+)y zq>h;!7)BLS+pISCEZIypT9Cp4K!S&XM*M2;$mh?KGj!QmDhl8BkK7>&i!GpP(7 z<+zJ_MC9p2JQl?R5*(8zl1z$ylhe`E93_jICn=unpctfJhn zypQkEN0U(mSo@GG2(BV1*}^_9Wsh>xFbzw&xzjW%mz#9h3e2r!0f0Q~ncB9TM+^2W zN3~$zYWI`-+VNp6IK0Sz?X7t@uyk|X+mrW(mhNg^F*l%jdkUOA{MQBnq7m?r5vhVpwTIz=C-uheTSM<>vZZcwiyaPmqB9H^R=IBixi z$1FM&dN)ForDCW3<}DTb*bcY4U9r=ye+7!l9|2H(fx|>kTfVt-@p9f%r+QlRO&yEF z8?~)DUaLL4DCFxqa>H7E@1p1H(1G0R%aDQdb#vjEp=-FJe6T2%aAkjC73KLD zz|FzkoE|YA$-*&6r-fIzB~57Ggj0~>;Q)3pPC*2&WA#LVf!kVe6L023fEfobHB3|U z3tT9j4d-{_94^F+*${K%#>~wEW^T5nt+?4>{U*}bxeG7T*_H1&lovbljZH7ATj7ic z_XKyC1=#08%wQ>ED6pmePsa-$eDcDaO9ww4F8J`Niow|s3n06ZRz=pLSXb*-l_x{1 zvtP8Y9eQeiI;M_ZedzhFsOIP~B2AH9^sIy_OaadN3f9>>wx0#6F?xla% z8V6@(?ew1?dcvj$Xn!fLWJvrLBt8<&t@V1^Z2bRY{M}1&dYoHHbv zc<(HcpbJJ~1$z|iEX!{34D0(RbNKK0?fk}OXKmkjFSG1L_60+4-+W{{Z)4q?3_`lQ F{{SrErX&CW literal 0 HcmV?d00001 diff --git a/model_executor/layers/mamba/__pycache__/short_conv.cpython-312.pyc b/model_executor/layers/mamba/__pycache__/short_conv.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e474fbd4fb853e73130441a52ff61f6d762511c GIT binary patch literal 9617 zcmcIKTWlLwc6a!G4Br$*N|a1JEQx+dma}%^N9@F6Ofdyj22G&0<(y#7+e>jy34h9ZZ!ER9){b4GnS+qdgbMA0R zN@kpWpks69+;i_a_ug~QJ?GqW^-nIBgTRwB{W!C4l#pLzL4Ry5VDsk;A*+NV9Frq? 
zCc!X-X9NqB4LL*Jm@qO}XUv%rCit2FXHJ+Y&YZL4tqE)1maygR2|LAEa*n(+;iRxN z$L3uLSKgg))3Pn+$$JxC3fptOyg%X32NHpNFcG9UN3Jg)N`xrv%=PEPiEuuWh|n^d z8^{kP2J_KGl$Kq&q5N=Sn8NPdj(jW;V~Bz9-t3${!OoZS+=n^hV<4{)&ieu3e0+%a z-ZyF3``U*jMmaxk%D&nvc73?OSKx#EIM>G+7i?VU15;w>A3@&_^b;_HVa~(v^;AiO6;kuQjuVqq?ozQPx{^MLfV zk>_|Z#ihhl+^9O|Q|Zh6e6g^QSyY|piX5M775W!>F`2Fi0x(Km&E@jRG;IZ(#<2=I zC>Mof9!Zj=l#t5h0C=w`%vJJ*Z?*WH%@p{Qpbnhlg+(6t^r-WFyZGML=D^Z^E|tHOnoE^&7!g{y zAf`nAq_|Y#=To_KC6^M5LU+}L%c<@vp!23vm2xVFBfUDyB^L^6a4V)#0x9y)U4+XB zd|FHj{9>jo@&d>WXOt|KpmN~C<)R>h46Z)20IH*mb+m_cmwFLrtFZZ9P}Cp6cH~Kd z;Yh+@C-{#Gat7YW8R2K*O&qgqC4@8c<`!h(EennsSztKp2d;#bv+=g9zni@VbN1f& z1rz50<#YT2bWWgi{sD9>(6OA0cfp9FJx1zYFhl^oVBkE^(nDLiVD>G1tIi8`Ufu__ z7HH}Fz|`Kze!vAdf2&;p+68Om8k6X|K;nI>y}d%`RU?9-$*jquytFc9)L%qkJvRX1!B`H7hjeN+sQ7>OjIo%TpZ zC?PFIxG(Z#7TXw$LPzTnbY*>dn-|S!q)*;TQo9Ex1cd|VnaY*L_^D!(k zoM{t*(}_EENMG~pgpTQat5(S(S+i3ej1FBT zf?M3JmpB`)#Lo=6bbbKZ)04`}t+-v}PaUCe@KClqw>%{;IQB5!(gRrA(?U3=aFhNjr8Sl?jom)SJ8zBE(Hl=g}#aI)gwYn2xdy>#V8ekPZ>G*emKOl zOUPg2(-j!s3|)ZoOddV$OvjH+mzIP{=waF4wyM+mqKpH+{3fWtZ?BW5eqwjuu*$aZ zBin!yiq=ej?o=Y9@Np~QLHM|o{-H+yF1ddf6j&uV&B-EWar2BJN8CwpB&rQh#in)2O6>Ca_smonf2IQ z&Ge!DhP`16(?JaVrhiuHA5aE{ZuPDCZb!E8y;u)S0N*h3WuLMnvJ0DT>-_C{U=kWd zu#xMjn+zXVA3E3=IwKFAslyzdzIxweqwkmu|DDGk`=j+82OcgwyjUMPw(dV(XOAmK zUx9WF+pKJxeK;Z84n2+xe-ynPUAwd%iPyusk?lrws%Cm*izxaRCHE8OCPnDX}8vV2T9QdUMEclv3mrxLtFX>Tdeh80HE7DX4`%cDq2sr?f-NPA=z$k z+d_jMMf=?Lbe-1sTXTninFy}{K5iBc!bde0OJ&uBdM)%rneHcH4xp_wx3=cD?0aU) z-^IqL>7dInS2ya?hwdBhhHXT)jVNqHVf_m0{qXf0uYX}R*&WZy!U@E5EkfYMK^ABm z>7#p#s!=lbl1tSjnZCayqAcknnZ$M+aK9zTvf-NqJOo(tUUESLUo~yv(DvxtIP{Jc zjJ=`_%GE4E9Ewo3?FY3{;+r#ocW;fbYLQI6;!^O3rd}~c72>BXF4ne3VsuaT4sfp8 zs}9MY#Q|)4I1^`hVAgwo2P9v0R@tfxxVAl9hon~=n%&W<*P-8ox*bpGv>=c6*#AgA z7F`DV{HkurbbVTKN-m6lvSXb-b*Qan9*LDa*s2PVKz65I-}!Enuoooky+B}hRQ|YC!UF%$1YSdrM60UC*T-tVW+t|GpJQ?&~q~q3W;{6ha`geY&JNU&-eroOKy? 
zEJtwe2o>ObSlzKmsxhEjkit@wu6wUN2ez*N48BnJN zN!0-nw&T2j-8}hUc%J+Zyo7Nm#u*y!J;5pgw|3m3I#OwH&NS9koiv1OVlZHUSmTzT*R1d_q!V^Ma{M1c~E!>+qR4*dYpUVm|?VjUqows#JpHntq#oeYF}LI_T*Qvv#zw}u`D-eZ zRGB%InO7OeO3>xd4kl&QFn3MuYjFj^!rUPRM?*jXgA~=-nQ2-0Gm1fxwj`lCu%HP- z6ACC2u1`xS#Ggq`ny1N< zJx3b5Ps_Vc*IbG-Tz8H?4h-GOKX~g=V1J$6Uk~h8qVc=qzu0|mw;VlKJ4b6D?S1W0 z;B=imT@RdAqOp(8-9C4Brm^d^4F98NYUh;5#M--ZB;JU;AV*$!IQNNRJ#wV>O=WCf zW9*bXcB;0dA z1X@~IYV=Ra{gaLUJ#zmZj5p8RIMZOGG8=8MBQiU(HnPr+uW=t|?qnJh2jz)_jfo@j z#F0-%ezb2~5@5sg3^eM*pnbKU-s!&{!jMKn@+KIhCPZjiHz2p_lRC6B=E6 zQw~i4wlB6eyOOLql;~uA*HJlov~G()@EnY+9$7h}u&%m0R`FSTlh0)ND21e8e2Kj2u#U= zsk^?rC)NYgwUdv1BZ@buc%$%MwXfJ$-79WoaEB6|P(}|b!`}d{QDU=C0}j9Y%OLTD zZ)9#Uw`PA_tPjuDcg!~X9R3$-bImaE1#ezjvDHo}xZYzjJGOS>5j*jjv#;H4Tn@z7 z1G}-yowUn=XL}tx^weebyMd#(7ss`CAF=VL76Qjn_BLDJd0;(!u;yx7NPO=vT=!fL z^XpS5W!ui0rS2K2dq?WF(K{#TaYwZXyjT$m)9Lov1~#4vei)@~{yn@_F*gU_?ay`X z{6c!2f3&*J=_T7f{dBnW=GyBl1T9;hyh9C<48&C1AKl|$Xq_c3U8xofw7}}M^Z6}j z$>EKIYSunJnk>M)9-`+aA0n(HtRdvK;A{ux(WeP?hn_eVr(tK*JN0 zVe@$Q)NOm<{Ve9OXR6^jE_;rHnP{*>GCQQHOjmJ6W@k3dMz5nujQWwad+G4VNtX_n ztoz8?ON5;T(vKP->fTG$ZS?(DbDi_gUVex17Q}Hh2wR-^W&F5z^4S@Iyq@@bwpnwLvl)^h#V3Zer1d zizECsg05EbHO=R8$s`2thXG5K?G+lc2$-#)FN`b8aQ#hw`vGaX$)0`6?inQ%{@gQb z9s+yo_BE{t*+{UzX-CLG`bXDJ%Au*I6N_xS=)$5Ktoav|c7ZXQJsbT|Oqw>$00TE) zXq8=IH?3H5+s%%4HNaifSPJDC^C_m?fR?(NXyxx3EoR4B-&*D4@7?*{!}a*IiAvR>-PPf83O@kpZOJyG##04<2Y_yKr5mKL%CJfFz7g|mS=>b19=La5 zz&xSq!h^X*G*E_4`ZR9T&PKjM5%T_c2f!z_d^uGhFiq_V2y4JX;PHi%wh^k?U_L&Vhz=kL=uYHz7M;x_(kI*&3!k+0?go?vshSsqbG* znjhXG6yUMN)M(C3Ksy8D-a4?Y*-%f{ZeB=%nf*D?{5SkMIbX9=+k+<6S}f7+M}dV( zAq{t_6?!U3!bL+_wIhdQCSS_MC*i7Nsgx-!3b2$p>zw_bTnx3BdD zylzxSdbQD(c{)e`azVg*BzjS#-4Df|LqTo#bbE?;5v%U`+V+~ouZWpkS-=oP^LhKf zmJZV2(_8+UK|9ydqil#$Q=ZfGCMv1%??e87g)cnKt2X`59W?XxQ>^$~d{Lc6)u1}1 z?OJ+|U^}W=;bX$zK`mH0Qho}rrjcQoFKmPv`ON73)O6z4g#A}?>{sNBOwN2tPW+0T z{FLnf(qLi?pA&d};ULT_%$EiuWBAerFQk0=Q}QO*BSYYNxWV{k#((qZIum+caYVty{2ZrJ#*i9Gi9u s2F;AKO$2b82H+M6LK+1?pL+>2ps- literal 0 HcmV?d00001 diff --git 
a/model_executor/layers/mamba/abstract.py b/model_executor/layers/mamba/abstract.py new file mode 100644 index 0000000..e68b09b --- /dev/null +++ b/model_executor/layers/mamba/abstract.py @@ -0,0 +1,71 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import abstractmethod +from collections.abc import Iterable +from typing import TYPE_CHECKING + +import torch + +from vllm.config import VllmConfig +from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase +from vllm.v1.kv_cache_interface import KVCacheSpec, MambaSpec + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend + + +class MambaBase(AttentionLayerBase): + """ + Base class for Mamba-like layers which support the v1 engine. + Inherit from this class if you implement a custom layer. + """ + + # Contains the KV cache (mamba state) for the layer + # in the shape specified by `self.get_state_shape`. + kv_cache: tuple[torch.Tensor, ...] + + @abstractmethod + def get_state_shape(self) -> Iterable[tuple[int, ...]]: + """ + Defines the shape of the state. + For mamba layers this is usually a (conv_state, ssm_state) tuple. + In this case, returns (conv_state_shape, ssm_state_shape). + """ + pass + + @property + @abstractmethod + def mamba_type(self) -> str: + pass + + @abstractmethod + def get_attn_backend(self) -> type["AttentionBackend"]: + """Get the attention backend class for this Mamba layer.""" + pass + + @abstractmethod + def get_state_dtype(self) -> tuple[torch.dtype, ...]: + pass + + def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec | None: + if ( + vllm_config.speculative_config is not None + and vllm_config.model_config.hf_config.model_type not in ["qwen3_next"] + ): + raise NotImplementedError( + "Mamba with speculative decoding is not supported yet." 
+ ) + mamba_block_size = vllm_config.cache_config.mamba_block_size + page_size_padded = vllm_config.cache_config.mamba_page_size_padded + return MambaSpec( + shapes=self.get_state_shape(), + dtypes=self.get_state_dtype(), + block_size=mamba_block_size, + page_size_padded=page_size_padded, + mamba_type=self.mamba_type, + num_speculative_blocks=( + vllm_config.speculative_config.num_speculative_tokens + if vllm_config.speculative_config + else 0 + ), + ) diff --git a/model_executor/layers/mamba/linear_attn.py b/model_executor/layers/mamba/linear_attn.py new file mode 100644 index 0000000..0a2742f --- /dev/null +++ b/model_executor/layers/mamba/linear_attn.py @@ -0,0 +1,402 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import math +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend + +from typing import TYPE_CHECKING + +import torch +import torch.nn.functional as F +from einops import rearrange +from torch import nn + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config +from vllm.distributed.communication_op import tensor_model_parallel_all_reduce +from vllm.distributed.parallel_state import ( + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) +from vllm.forward_context import ForwardContext, get_forward_context +from vllm.model_executor.custom_op import CustomOp +from vllm.model_executor.layers.lightning_attn import ( + lightning_attention, + linear_decode_forward_triton, +) +from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear +from vllm.model_executor.layers.mamba.abstract import MambaBase +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateDtypeCalculator, + MambaStateShapeCalculator, +) +from vllm.model_executor.layers.quantization import QuantizationConfig +from 
vllm.utils.torch_utils import direct_register_custom_op +from vllm.v1.attention.backends.linear_attn import LinearAttentionMetadata + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend + + +class MiniMaxText01RMSNormTP(CustomOp): + name = "MiniMaxText01RMSNormTP" + + def __init__(self, hidden_size: int, eps: float = 1e-6) -> None: + super().__init__() + self.tp_world = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + self.weight = nn.Parameter(torch.ones(int(hidden_size / self.tp_world))) + + self.weight.weight_loader = self.weight_loader + self.variance_epsilon = eps + return + + @staticmethod + def weight_loader( + param: nn.Parameter, + loaded_weight: torch.Tensor, + ) -> None: + tp_world = get_tensor_model_parallel_world_size() + tp_rank = get_tensor_model_parallel_rank() + + shard_size = loaded_weight.shape[0] // tp_world + shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size) + param.data.copy_(loaded_weight[shard]) + return + + def _forward( + self, + x: torch.Tensor, + ) -> torch.Tensor: + orig_dtype = x.dtype + x = x.to(torch.float32) + variance = x.pow(2).mean(dim=-1, keepdim=True, dtype=torch.float32) + if self.tp_world > 1: + variance = tensor_model_parallel_all_reduce(variance) / self.tp_world + x = x * torch.rsqrt(variance + self.variance_epsilon) + x = (x * self.weight).to(orig_dtype) + return x + + def forward( + self, + x: torch.Tensor, + residual: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + assert residual is None, "RMSNorm does not support residual connection." 
+ return self._forward(x) + + +class MiniMaxText01LinearKernel: + @staticmethod + def jit_linear_forward_prefix( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + kv_caches: torch.Tensor, + slope_rate: torch.Tensor, + block_size: int, + layer_idx: int | None = None, + **kwargs, + ) -> torch.Tensor: + slope_rate = slope_rate.to(torch.float32) + should_pad_dim = q.dim() == 3 + if should_pad_dim: + q = q.unsqueeze(0) + k = k.unsqueeze(0) + v = v.unsqueeze(0) + b, h, n, d = q.shape + e = d + kv_history = kv_caches.reshape(1, h, d, e).contiguous() + output, kv_history = lightning_attention( + q, k, v, slope_rate, block_size=block_size, kv_history=kv_history + ) + kv_caches.copy_(kv_history[:, :, -1, :, :].reshape(h, d, e)) + assert output.shape[0] == 1, "batch size must be 1" + return rearrange(output.squeeze(0), "h n d -> n (h d)") + + +class MiniMaxText01LinearAttention(nn.Module, MambaBase): + @property + def mamba_type(self) -> str: + return "linear_attention" + + def get_attn_backend(self) -> type["AttentionBackend"]: + from vllm.v1.attention.backends.linear_attn import LinearAttentionBackend + + return LinearAttentionBackend + + def get_state_dtype(self) -> tuple[torch.dtype]: + assert self.model_config is not None + assert self.cache_config is not None + return MambaStateDtypeCalculator.linear_attention_state_dtype( + self.model_config.dtype, + self.cache_config.mamba_cache_dtype, + ) + + def get_state_shape(self) -> tuple[tuple[int, int, int], ...]: + return MambaStateShapeCalculator.linear_attention_state_shape( + num_heads=self.num_heads, tp_size=self.tp_size, head_dim=self.head_dim + ) + + def __init__( + self, + hidden_size: int, + hidden_inner_size: int, + num_heads: int, + head_dim: int, + max_position: int, + block_size: int, + num_hidden_layer: int, + model_config: ModelConfig | None = None, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + layer_idx: int = 0, + linear_layer_idx: int = 0, + prefix: 
str = "linear_attn", + ) -> None: + super().__init__() + + self.layer_idx = layer_idx + self.BLOCK = block_size + self.hidden_size = hidden_size + self.num_heads = num_heads + self.head_dim = head_dim + self.total_num_heads = num_heads + self.hidden_inner_size = hidden_inner_size + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + + assert self.total_num_heads % self.tp_size == 0 + self.tp_heads = self.total_num_heads // self.tp_size + self.qkv_size = self.num_heads * self.head_dim + self.tp_hidden = self.head_dim * self.tp_heads + self.model_config = model_config + self.cache_config = cache_config + self.prefix = prefix + + self.qkv_proj = ColumnParallelLinear( + hidden_size, + self.hidden_inner_size * 3, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.output_gate = ColumnParallelLinear( + hidden_size, + self.hidden_inner_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.output_gate", + ) + self.out_proj = RowParallelLinear( + self.hidden_inner_size, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.out_proj", + ) + self.norm = MiniMaxText01RMSNormTP( + self.hidden_inner_size, + eps=1e-5, + ) + + slope_rate = MiniMaxText01LinearAttention._build_slope_tensor(self.num_heads) + if num_hidden_layer <= 1: + self.slope_rate = slope_rate * (1 + 1e-5) + else: + self.slope_rate = slope_rate * ( + 1 - layer_idx / (num_hidden_layer - 1) + 1e-5 + ) + self.tp_slope = self.slope_rate[ + self.tp_rank * self.tp_heads : (self.tp_rank + 1) * self.tp_heads + ].contiguous() + + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + + @staticmethod + def weight_direct_load(param: torch.Tensor, loaded_weight: torch.Tensor) -> None: + assert param.size() == 
loaded_weight.size() + param.data.copy_(loaded_weight) + return + + @staticmethod + def _build_slope_tensor(n_attention_heads: int): + def get_slopes(n): + def get_slopes_power_of_2(n): + start = 2 ** (-(2 ** -(math.log2(n) - 3))) + ratio = start + return [start * ratio**i for i in range(n)] + + if math.log2(n).is_integer(): + return get_slopes_power_of_2(n) + else: + closest_power_of_2 = 2 ** math.floor(math.log2(n)) + return ( + get_slopes_power_of_2(closest_power_of_2) + + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2] + ) + + slopes = torch.tensor( + get_slopes(n_attention_heads), dtype=torch.float32 + ).reshape(n_attention_heads, 1, 1) + return slopes + + def _prefill_and_mix_infer( + self, q, k, v, kv_cache, state_indices_tensor, attn_metadata + ): + hidden = [] + for _prefill_idx in range(getattr(attn_metadata, "num_prefills", 0)): + if _prefill_idx >= len(attn_metadata.query_start_loc): + break + if _prefill_idx >= len(state_indices_tensor): + break + offset = attn_metadata.num_decode_tokens + _start = attn_metadata.query_start_loc[offset + _prefill_idx] + _end = attn_metadata.query_start_loc[offset + _prefill_idx + 1] + slot_id = state_indices_tensor[offset + _prefill_idx] + qs = q[_start:_end].transpose(0, 1).contiguous() + ks = k[_start:_end].transpose(0, 1).contiguous() + vs = v[_start:_end].transpose(0, 1).contiguous() + slice_layer_cache = kv_cache[slot_id, ...] 
def _decode_infer(self, q, k, v, kv_cache, state_indices_tensor, attn_metadata):
    """Run the decode kernel on the leading decode tokens of the batch.

    The first ``num_decode_tokens`` rows of ``q``/``k``/``v`` get an extra
    axis at position 2 and are made contiguous; the first ``num_decodes``
    entries of ``state_indices_tensor`` select the cache slots.
    """
    n_tokens = attn_metadata.num_decode_tokens
    q_dec, k_dec, v_dec = (
        tensor[:n_tokens].unsqueeze(2).contiguous() for tensor in (q, k, v)
    )
    slots = state_indices_tensor[: attn_metadata.num_decodes]
    # NOTE(review): the trailing 32 is passed positionally to the kernel;
    # presumably a block size - confirm against the kernel signature.
    return linear_decode_forward_triton(
        q_dec, k_dec, v_dec, kv_cache, self.tp_slope, slots, 32
    )
def linear_attention(
    hidden_states: torch.Tensor,
    output: torch.Tensor,
    positions: torch.Tensor,
    layer_name: str,
) -> None:
    """Custom-op entry point: dispatch to the layer registered as ``layer_name``.

    The layer is looked up in the current forward context's
    ``no_compile_layers`` mapping and its ``_forward`` is invoked.  Nothing
    is returned; ``output`` is handed to the layer as the destination.
    """
    layer = get_forward_context().no_compile_layers[layer_name]
    layer._forward(
        hidden_states=hidden_states,
        output=output,
        positions=positions,
    )
op_name="linear_attention", + op_func=linear_attention, + mutates_args=["output"], + fake_impl=linear_attention_fake, +) diff --git a/model_executor/layers/mamba/mamba_mixer.py b/model_executor/layers/mamba/mamba_mixer.py new file mode 100644 index 0000000..b6345b8 --- /dev/null +++ b/model_executor/layers/mamba/mamba_mixer.py @@ -0,0 +1,535 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import TYPE_CHECKING, NamedTuple + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend + +import torch +from torch import nn +from torch.nn.parameter import Parameter + +from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config +from vllm.distributed.parallel_state import ( + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) +from vllm.forward_context import ForwardContext, get_forward_context +from vllm.model_executor.custom_op import CustomOp +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + MergedColumnParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.mamba.abstract import MambaBase +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateDtypeCalculator, + MambaStateShapeCalculator, +) +from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( + causal_conv1d_fn, + causal_conv1d_update, +) +from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( + selective_scan_fn, + selective_state_update, +) +from vllm.model_executor.utils import set_weight_attrs +from vllm.utils.torch_utils import direct_register_custom_op +from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionMetadata + + +# Adapted from transformers.models.mamba.modeling_mamba.MambaMixer +@CustomOp.register("mamba_mixer") +class MambaMixer(MambaBase, CustomOp): + """ + Compute ∆, A, B, C, and D the state space parameters and 
compute + the `contextualized_states`. A, D are input independent + (see Mamba paper [1] Section 3.5.2 "Interpretation of A" + for why A isn't selective) ∆, B, C are input-dependent + (this is a key difference between Mamba and the linear time + invariant S4, and is why Mamba is called + **selective** state spaces) + """ + + def __init__( + self, + hidden_size: int, + ssm_state_size: int, + conv_kernel_size: int, + intermediate_size: int, + time_step_rank: int, + use_conv_bias: bool, + use_bias: bool, + use_rms_norm: bool, + rms_norm_has_weight: bool = True, + rms_norm_eps: float = 1e-5, + activation="silu", + is_lora_enabled: bool = False, + model_config: ModelConfig | None = None, + cache_config: CacheConfig | None = None, + prefix: str = "", + ): + super().__init__() + self.time_step_rank = time_step_rank + self.ssm_state_size = ssm_state_size + self.use_rms_norm = use_rms_norm + self.activation = activation + self.is_lora_enabled = is_lora_enabled + self.conv_kernel_size = conv_kernel_size + self.intermediate_size = intermediate_size + + self.conv1d = ColumnParallelLinear( + input_size=conv_kernel_size, + output_size=intermediate_size, + bias=use_conv_bias, + ) + # unsqueeze to fit conv1d weights shape into the linear weights shape. + # Can't do this in `weight_loader` since it already exists in + # `ColumnParallelLinear` and `set_weight_attrs` + # doesn't allow to override it + self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1) + + self.in_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, bias=use_bias + ) + + # selective projection used to make dt, B and C input dependent + self.x_proj = RowParallelLinear( + intermediate_size, + time_step_rank + ssm_state_size * 2, + bias=False, + ) + # time step projection (discretization) - + # In the forward we need to apply dt_proj without the bias, + # as the bias is added in the selective scan kernel. 
+ self.dt_proj = ColumnParallelLinear( + time_step_rank, intermediate_size, bias=True, skip_bias_add=True + ) + + def weight_loader(param: Parameter, loaded_weight: torch.Tensor): + tp_rank = get_tensor_model_parallel_rank() + tp_size = get_tensor_model_parallel_world_size() + param.data.copy_( + loaded_weight.data.split(loaded_weight.shape[0] // tp_size, dim=0)[ + tp_rank + ] + ) + + def A_weight_loader(param: Parameter, loaded_weight: torch.Tensor): + weight_loader(param, -torch.exp(loaded_weight.float())) + + tp_size = get_tensor_model_parallel_world_size() + self.A = nn.Parameter( + torch.empty( + intermediate_size // tp_size, + ssm_state_size, + dtype=torch.float32, + ) + ) + self.D = nn.Parameter(torch.ones(intermediate_size // tp_size)) + + set_weight_attrs(self.D, {"weight_loader": weight_loader}) + set_weight_attrs(self.A, {"weight_loader": A_weight_loader}) + + self.out_proj = RowParallelLinear( + intermediate_size, + hidden_size, + bias=use_bias, + input_is_parallel=True, + ) + + self.dt_layernorm = ( + RMSNorm( + time_step_rank, + eps=rms_norm_eps, + has_weight=rms_norm_has_weight, + ) + if use_rms_norm + else None + ) + + self.b_layernorm = ( + RMSNorm( + ssm_state_size, + eps=rms_norm_eps, + has_weight=rms_norm_has_weight, + ) + if use_rms_norm + else None + ) + + self.c_layernorm = ( + RMSNorm( + ssm_state_size, + eps=rms_norm_eps, + has_weight=rms_norm_has_weight, + ) + if use_rms_norm + else None + ) + + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + # The inner tuple is (conv_state, ssm_state) + self.kv_cache = (torch.tensor([]), torch.tensor([])) + + self.model_config = model_config + self.cache_config = cache_config + self.prefix = prefix + + def _ssm_transform( + self, x: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + if 
self.is_lora_enabled: + # Lora kernel requires contiguous tensor. + ssm_params = self.x_proj(x.contiguous())[0] + else: + ssm_params = self.x_proj(x)[0] + time_step, B, C = torch.split( + ssm_params, + [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], + dim=-1, + ) + if self.use_rms_norm: + assert self.dt_layernorm is not None + assert self.b_layernorm is not None + assert self.c_layernorm is not None + time_step = self.dt_layernorm(time_step.contiguous()) + B = self.b_layernorm(B.contiguous()) + C = self.c_layernorm(C.contiguous()) + discrete_time_step = self.dt_proj(time_step)[0].transpose(-2, -1) + return discrete_time_step, B, C + + def forward(self, hidden_states: torch.Tensor, output: torch.Tensor): + torch.ops.vllm.mamba_mixer( + hidden_states, + output, + self.prefix, + ) + + def forward_native(self, hidden_states: torch.Tensor, output: torch.Tensor): + pass + + def forward_cuda(self, hidden_states: torch.Tensor, output: torch.Tensor): + """ + Run the Mamba-1 SSM pipeline. + + Steps + ----- + 1. Apply the gated-MLP linear projection to the raw input. + 2. Pass the projected sequence through the convolutional mixing layer. + 3. Feed the result into the State-Space Model (SSM) blocks. + 4. Perform the recurrence y ← SSM(A, B, C, Δ)(x) + to produce contextual representations. + 5. Project the contextualised sequence back + to the output embedding dimension. + + Batch handling + -------------- + Prefill and decode tokens are processed by dedicated CUDA + kernels for both the convolutional (conv1d) and SSM stages. + In the case of a mixed batch (containing both prefill and + decode tokens), both sets of kernels are executed independently + and their outputs are concatenated before the final output projection. 
+ """ + + forward_context: ForwardContext = get_forward_context() + attn_metadata = forward_context.attn_metadata + + assert self.cache_config is not None + mamba_block_size = self.cache_config.mamba_block_size + prefix_caching_enabled = self.cache_config.enable_prefix_caching + + if attn_metadata is not None: + assert isinstance(attn_metadata, dict) + attn_metadata = attn_metadata[self.prefix] + assert isinstance(attn_metadata, Mamba1AttentionMetadata) + query_start_loc_p = attn_metadata.query_start_loc_p + state_indices_tensor = attn_metadata.state_indices_tensor + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + conv_state = self_kv_cache[0].transpose(-1, -2) + ssm_state = self_kv_cache[1] + has_initial_states_p = attn_metadata.has_initial_states_p + num_padded_decodes = attn_metadata.num_padded_decodes + + # 1. Gated MLP's linear projection + projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1) + hidden_states_BC, gate = projected_states.chunk(2, dim=-2) + + conv_weights = self.conv1d.weight.view( + self.conv1d.weight.size(0), self.conv1d.weight.size(2) + ) + + if attn_metadata is None: + # V1 profile run + hidden_states_BC = hidden_states_BC.contiguous() + return self.out_proj(hidden_states_BC.transpose(-2, -1))[0] + + num_prefill_tokens = attn_metadata.num_prefill_tokens # token count + num_decode_tokens = attn_metadata.num_decode_tokens + num_prefills = attn_metadata.num_prefills # request count + num_decodes = attn_metadata.num_decode_tokens # token count (=request) + has_prefill = num_prefill_tokens > 0 + has_decode = num_decode_tokens > 0 + num_actual_tokens = num_prefill_tokens + num_decode_tokens + + prefill_decode_split = split_batch_to_prefill_and_decode( + hidden_states_BC, + gate, + state_indices_tensor, + num_prefill_tokens, + num_prefills, + num_padded_decodes, + ) + hidden_states_BC_p = prefill_decode_split.hidden_states_BC_p + hidden_states_BC_d = prefill_decode_split.hidden_states_BC_d + gate_p = 
prefill_decode_split.gate_p + gate_d = prefill_decode_split.gate_d + state_indices_tensor_p = prefill_decode_split.state_indices_tensor_p + state_indices_tensor_d = prefill_decode_split.state_indices_tensor_d + + if prefix_caching_enabled: + block_idx_last_computed_token_d, block_idx_last_computed_token_p = ( + torch.split( + attn_metadata.block_idx_last_computed_token, + [num_decodes, num_prefills], + dim=0, + ) + ) + block_idx_last_scheduled_token_d, block_idx_last_scheduled_token_p = ( + torch.split( + attn_metadata.block_idx_last_scheduled_token, + [num_decodes, num_prefills], + dim=0, + ) + ) + + block_idx_first_scheduled_token_p = ( + attn_metadata.block_idx_first_scheduled_token_p + ) + num_computed_tokens_p = attn_metadata.num_computed_tokens_p + else: + block_idx_last_computed_token_d = None + block_idx_last_computed_token_p = None + block_idx_last_scheduled_token_d = None + block_idx_last_scheduled_token_p = None + block_idx_first_scheduled_token_p = None + num_computed_tokens_p = None + + ssm_outputs = [] + + if has_prefill: + # 2. Convolution sequence transformation + conv_out_p = causal_conv1d_fn( + hidden_states_BC_p, + conv_weights, + self.conv1d.bias, + activation=self.activation, + conv_states=conv_state, + has_initial_state=has_initial_states_p, + cache_indices=state_indices_tensor_p, + query_start_loc=query_start_loc_p, + block_idx_first_scheduled_token=block_idx_first_scheduled_token_p, + block_idx_last_scheduled_token=block_idx_last_scheduled_token_p, + initial_state_idx=block_idx_last_computed_token_p, + num_computed_tokens=num_computed_tokens_p, + block_size_to_align=mamba_block_size, + ) + # 3. State Space Model sequence transformations. + discrete_time_step_p, B_p, C_p = self._ssm_transform( + conv_out_p.transpose(-2, -1) + ) + time_proj_bias = self._time_proj_bias() + + # 4. 
Perform the recurrence y ← SSM(A, B, C, Δ)(x) + scan_out_p = selective_scan_fn( + conv_out_p, + ssm_state, + discrete_time_step_p, + self.A, + B_p.transpose(-2, -1), + C_p.transpose(-2, -1), + self.D.float(), + gate_p, + time_proj_bias, + delta_softplus=True, + cache_indices=state_indices_tensor_p, + has_initial_state=has_initial_states_p, + query_start_loc=query_start_loc_p, + block_size=mamba_block_size, + block_idx_first_scheduled_token=block_idx_first_scheduled_token_p, + block_idx_last_scheduled_token=block_idx_last_scheduled_token_p, + initial_state_idx=block_idx_last_computed_token_p, + ) + ssm_outputs.append(scan_out_p) + + if has_decode: + if prefix_caching_enabled: + state_indices_tensor_d_input = state_indices_tensor_d.gather( + 1, block_idx_last_computed_token_d.unsqueeze(1) + ).squeeze(1) + state_indices_tensor_d_output = state_indices_tensor_d.gather( + 1, block_idx_last_scheduled_token_d.unsqueeze(1) + ).squeeze(1) + else: + state_indices_tensor_d_input = state_indices_tensor_d + state_indices_tensor_d_output = state_indices_tensor_d + # 2. Convolution sequence transformation + conv_out_d = causal_conv1d_update( + hidden_states_BC_d.transpose(0, 1), + conv_state, + conv_weights, + self.conv1d.bias, + self.activation, + conv_state_indices=state_indices_tensor_d, + block_idx_last_scheduled_token=block_idx_last_scheduled_token_d, + initial_state_idx=block_idx_last_computed_token_d, + ).transpose(0, 1) + + # 3. State Space Model sequence transformation. + discrete_time_step_d, B_d, C_d = self._ssm_transform( + conv_out_d.transpose(-2, -1) + ) + time_proj_bias = self._time_proj_bias() + + # 4. 
Perform the recurrence y ← SSM(A, B, C, Δ)(x) + scan_outputs_d = torch.empty_like(hidden_states_BC_d.transpose(0, 1)) + selective_state_update( + ssm_state, + conv_out_d.transpose(0, 1), + discrete_time_step_d.transpose(0, 1), + self.A, + B_d, + C_d, + self.D, + gate_d.transpose(0, 1), + time_proj_bias, + dt_softplus=True, + state_batch_indices=state_indices_tensor_d_input, + dst_state_batch_indices=state_indices_tensor_d_output, + out=scan_outputs_d, + ) + scan_outputs_d = scan_outputs_d.transpose(0, 1) + + ssm_outputs.insert(0, scan_outputs_d) + + scan_outputs_combined = ( + ssm_outputs[0] if len(ssm_outputs) == 1 else torch.cat(ssm_outputs, dim=-1) + ) + + # 5. Final output projection + if self.is_lora_enabled: # Lora kernel requires contiguous tensor. + scan_outputs_combined = scan_outputs_combined.transpose(-2, -1).contiguous() + out = self.out_proj(scan_outputs_combined)[0] + else: + out = self.out_proj(scan_outputs_combined.transpose(-2, -1))[0] + + output[:num_actual_tokens] = out + + def get_state_dtype(self) -> tuple[torch.dtype]: + assert self.model_config is not None + assert self.cache_config is not None + return MambaStateDtypeCalculator.mamba1_state_dtype( + self.model_config.dtype, + self.cache_config.mamba_cache_dtype, + self.cache_config.mamba_ssm_cache_dtype, + ) + + def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: + return MambaStateShapeCalculator.mamba1_state_shape( + tp_world_size=get_tensor_model_parallel_world_size(), + intermediate_size=self.intermediate_size, + state_size=self.ssm_state_size, + conv_kernel=self.conv_kernel_size, + ) + + @property + def mamba_type(self) -> str: + return "mamba1" + + def get_attn_backend(self) -> type["AttentionBackend"]: + from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionBackend + + return Mamba1AttentionBackend + + def _time_proj_bias(self) -> torch.Tensor | None: + if hasattr(self.dt_proj, "bias") and self.dt_proj.bias is not None: + return self.dt_proj.bias.float() 
class PrefillDecodeSplit(NamedTuple):
    """Prefill (``_p``) and decode (``_d``) parts of a batch after splitting."""

    hidden_states_BC_p: torch.Tensor
    hidden_states_BC_d: torch.Tensor
    gate_p: torch.Tensor
    gate_d: torch.Tensor
    state_indices_tensor_p: torch.Tensor
    state_indices_tensor_d: torch.Tensor


def split_batch_to_prefill_and_decode(
    hidden_states_BC: torch.Tensor,
    gate: torch.Tensor,
    state_indices_tensor: torch.Tensor,
    num_prefill_tokens: int,
    num_prefills: int,
    num_padded_decodes: int,
) -> PrefillDecodeSplit:
    """Split token tensors and state indices into decode and prefill parts.

    ``hidden_states_BC`` and ``gate`` are split along their last dimension,
    ``state_indices_tensor`` along dimension 0.  In each case the decode
    part comes first and has length ``num_padded_decodes``.
    """
    # In v1, decode tokens come first, then prefill tokens.
    token_sizes = [num_padded_decodes, num_prefill_tokens]
    token_total = num_prefill_tokens + num_padded_decodes

    bc_decode, bc_prefill = torch.split(
        hidden_states_BC[..., :token_total], token_sizes, dim=-1
    )
    gate_decode, gate_prefill = torch.split(
        gate[..., :token_total], token_sizes, dim=-1
    )

    # num_padded_decodes accounts for CUDA graph padding when applicable
    request_sizes = [num_padded_decodes, num_prefills]
    request_total = num_padded_decodes + num_prefills
    idx_decode, idx_prefill = torch.split(
        state_indices_tensor[:request_total], request_sizes, dim=0
    )

    return PrefillDecodeSplit(
        hidden_states_BC_p=bc_prefill,
        hidden_states_BC_d=bc_decode,
        gate_p=gate_prefill,
        gate_d=gate_decode,
        state_indices_tensor_p=idx_prefill,
        state_indices_tensor_d=idx_decode,
    )
b/model_executor/layers/mamba/mamba_mixer2.py new file mode 100644 index 0000000..fb45afa --- /dev/null +++ b/model_executor/layers/mamba/mamba_mixer2.py @@ -0,0 +1,928 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend + +import torch +from torch import nn + +from vllm.attention.backends.abstract import AttentionMetadata +from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config +from vllm.distributed import ( + divide, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce, +) +from vllm.forward_context import ForwardContext, get_forward_context +from vllm.model_executor.custom_op import CustomOp +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + MergedColumnParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.mamba.abstract import MambaBase +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateDtypeCalculator, + MambaStateShapeCalculator, +) +from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( + causal_conv1d_fn, + causal_conv1d_update, +) +from vllm.model_executor.layers.mamba.ops.layernorm_gated import rms_norm_gated +from vllm.model_executor.layers.mamba.ops.mamba_ssm import selective_state_update +from vllm.model_executor.layers.mamba.ops.ssd_combined import ( + mamba_chunk_scan_combined_varlen, +) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.model_loader.weight_utils import ( + LoaderFunction, + composed_weight_loader, + sharded_weight_loader, +) +from vllm.model_executor.utils import set_weight_attrs +from vllm.utils.torch_utils import direct_register_custom_op +from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata + +# Added 
def __init__(
    self,
    full_hidden_size: int,
    full_n_groups: int,
    use_rms_norm: bool = True,
    eps: float = 1e-6,
):
    """Set up the gated RMS norm for this tensor-parallel rank.

    Args:
        full_hidden_size: Hidden size before tensor-parallel sharding.
        full_n_groups: Number of groups before sharding; fixes the
            per-group size ``full_hidden_size // full_n_groups``.
        use_rms_norm: When false, no weight parameter is created.
        eps: Constant stored as ``variance_epsilon``.

    Raises:
        AssertionError: If the tensor-parallel world size does not divide
            ``full_hidden_size``.
    """
    super().__init__()
    self.tp_size = get_tensor_model_parallel_world_size()
    self.tp_rank = get_tensor_model_parallel_rank()
    self.full_hidden_size = full_hidden_size
    # Validate before deriving the per-rank size, so a bad configuration is
    # rejected before a truncated size is used to allocate the weight.
    assert self.full_hidden_size % self.tp_size == 0, (
        "Tensor parallel world size must divide hidden size."
    )
    self.group_size = full_hidden_size // full_n_groups
    self.per_rank_hidden_size = full_hidden_size // self.tp_size
    self.n_groups = full_hidden_size // self.group_size

    self.variance_epsilon = eps
    self.use_rms_norm = use_rms_norm
    if self.use_rms_norm:
        # Register norm weight only if we're actually applying RMSNorm
        self.weight = nn.Parameter(torch.ones(self.per_rank_hidden_size))
        set_weight_attrs(self.weight, {"weight_loader": sharded_weight_loader(0)})
    else:
        # Avoid checkpoint mismatch by skipping unused parameter
        self.register_parameter("weight", None)
def mamba_v2_sharded_weight_loader(
    shard_spec: list[tuple[int, int, bool]],
    tp_size: int,
    tp_rank: int,
) -> "LoaderFunction":
    """Create a weight loader for mamba v2.

    This ensures that the projections are correctly sharded so that they can
    be split into x, B, C. It also ensures that all the groups corresponding
    to a head shard are placed together with it.

    Args:
        shard_spec: One ``(full_dim, extra, duplicate_groups)`` entry per
            concatenated section of the weight. ``full_dim`` is the section
            size before TP, ``extra`` is how many of those rows are not
            present in the checkpoint, and ``duplicate_groups`` makes every
            rank read from shard 0 of the section.
        tp_size: Tensor-parallel world size.
        tp_rank: Rank of the current process.

    Returns:
        A ``loader(param, loaded_weight)`` callable that copies this rank's
        rows into ``param`` in place, always sharding on dim 0.
    """

    def loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
        # - track boundary of (sharded) param, and loaded_weight, respectively
        boundary, loaded_boundary = 0, 0

        for full_dim, extra, duplicate_groups in shard_spec:
            # - size of the shard this rank owns in the parameter
            shard_size = full_dim // tp_size

            # - with duplication, every TP rank reads from shard 0
            # NOTE: currently we only support duplication
            #       in the case where num_groups == 1
            rank = 0 if duplicate_groups else tp_rank

            # - leftmost boundary index into loaded weight
            loaded_skip = rank * shard_size
            loaded_start_idx = loaded_boundary + loaded_skip

            # - never read past the rows that exist in the checkpoint
            take = min(shard_size, full_dim - extra - loaded_skip)

            # - always shard on dim 0
            # - the ignore is for a mypy limitation with slices:
            #   https://github.com/python/mypy/issues/2410
            param.data[boundary : boundary + take, ...] = loaded_weight[  # type: ignore[misc]
                loaded_start_idx : loaded_start_idx + take
            ]

            # - move indexing boundaries
            boundary += shard_size
            loaded_boundary += full_dim - extra

    return loader
+ # - NOTE: currently for the world size DOES NOT divide groups + # case, we only support the case when n_groups == 1 + self.tp_size = get_tensor_model_parallel_world_size() + tp_rank = get_tensor_model_parallel_rank() + + assert num_heads % self.tp_size == 0, ( + "Tensor parallel world size must divide num heads." + ) + + assert (n_groups % self.tp_size) == 0 or n_groups == 1, ( + "If tensor parallel world size does not divide num_groups, " + "then num_groups must equal 1." + ) + + assert ( + (n_groups % self.tp_size == 0) or self.tp_size == 1 or quant_config is None + ), ( + "Tensor parallel currently supported for quantized models only " + "if tensor parallel world size divides num groups." + ) + + self.ssm_state_size = ssm_state_size + self.conv_kernel_size = conv_kernel_size + self.activation = activation + + self.intermediate_size = intermediate_size + self.head_dim = head_dim + self.num_heads = num_heads + + self.n_groups = n_groups + if n_groups % self.tp_size != 0: + # - for TP we shard conv_dim by sharding on n_groups, + # - but if n_groups cannot divide tp_size, we need to + # extend some extra groups + groups = MambaStateShapeCalculator.extra_groups_for_head_shards( + n_groups, self.tp_size + ) + self.n_groups = n_groups + groups + + self.groups_ssm_state_size = self.n_groups * self.ssm_state_size + self.conv_dim = intermediate_size + 2 * self.groups_ssm_state_size + + if n_groups % self.tp_size == 0: + self.conv1d = MergedColumnParallelLinear( + input_size=conv_kernel_size, + output_sizes=[ + intermediate_size, + self.groups_ssm_state_size, + self.groups_ssm_state_size, + ], + bias=use_conv_bias, + quant_config=None, + prefix=f"{prefix}.conv1d", + ) + + self.in_proj = MergedColumnParallelLinear( + input_size=hidden_size, + output_sizes=[ + intermediate_size, + intermediate_size, + self.groups_ssm_state_size, + self.groups_ssm_state_size, + self.num_heads, + ], + bias=use_bias, + quant_config=quant_config, + prefix=f"{prefix}.in_proj", + ) + else: + # 
This is the n_groups == 1 case, + # where we need to duplicate groups if TP>1. + + self.conv1d = ColumnParallelLinear( + input_size=conv_kernel_size, + output_size=self.conv_dim, + bias=use_conv_bias, + quant_config=None, + prefix=f"{prefix}.conv1d", + ) + + self.in_proj = ColumnParallelLinear( + input_size=hidden_size, + output_size=intermediate_size + self.conv_dim + self.num_heads, + bias=use_bias, + quant_config=quant_config, + prefix=f"{prefix}.in_proj", + ) + + # - because in_proj is a concatenation of 3 weights, we + # need to interleave them before sharding + # - use the custom weight loader mamba_v2_sharded_weight_loader + # for conv1d.bias, covn1d.weight and in_proj.weight + # - need to set these settings, to assign the groups + # to the head shards + group_shard_settings = ( + self.groups_ssm_state_size, # expected model size + (self.n_groups - n_groups) * self.ssm_state_size, # extra dims assigned + n_groups == 1, # if there was only one group + ) + intermediate_settings = (intermediate_size, 0, False) + head_settings = (self.num_heads, 0, False) + + # - the weight already has a "weight_loader" attribute + # which set_weight_attrs will raise if we do not + # delete before trying to override it + # - ditto for the other two weights below + delattr(self.conv1d.bias, "weight_loader") + set_weight_attrs( + self.conv1d.bias, + { + "weight_loader": mamba_v2_sharded_weight_loader( + [ + intermediate_settings, + group_shard_settings, + group_shard_settings, + ], + self.tp_size, + tp_rank, + ) + }, + ) + + delattr(self.conv1d.weight, "weight_loader") + set_weight_attrs( + self.conv1d.weight, + { + "weight_loader": mamba_v2_sharded_weight_loader( + [ + intermediate_settings, + group_shard_settings, + group_shard_settings, + ], + self.tp_size, + tp_rank, + ) + }, + ) + + if quant_config is None: + # - quant layers do not have a weight loader + delattr(self.in_proj.weight, "weight_loader") + set_weight_attrs( + self.in_proj.weight, + { + "weight_loader": 
mamba_v2_sharded_weight_loader( + [ + intermediate_settings, # for gate + intermediate_settings, + group_shard_settings, + group_shard_settings, + head_settings, # for dt + ], + self.tp_size, + tp_rank, + ) + }, + ) + + # unsqueeze to fit conv1d weights shape into the linear weights shape. + # Can't do this in `weight_loader` since it already exists in + # `ColumnParallelLinear` and `MergedColumnParallelLinear`, + # and `set_weight_attrs` doesn't allow to override it + self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1) + + # - these are TPed by heads to reduce the size of the + # temporal shape + self.A = nn.Parameter( + torch.empty( + divide(num_heads, self.tp_size), + dtype=torch.float32, + ) + ) + self.D = nn.Parameter(torch.ones(num_heads // self.tp_size)) + self.dt_bias = nn.Parameter(torch.ones(num_heads // self.tp_size)) + self.use_rms_norm = use_rms_norm + + set_weight_attrs(self.D, {"weight_loader": sharded_weight_loader(0)}) + a_weight_loader = composed_weight_loader( + sharded_weight_loader(0), lambda x: -torch.exp(x.float()) + ) + set_weight_attrs(self.A, {"weight_loader": a_weight_loader}) + set_weight_attrs(self.dt_bias, {"weight_loader": sharded_weight_loader(0)}) + + self.out_proj = RowParallelLinear( + intermediate_size, + hidden_size, + bias=use_bias, + input_is_parallel=True, + quant_config=quant_config, + prefix=f"{prefix}.out_proj", + ) + + self.norm = Mixer2RMSNormGated( + intermediate_size, n_groups, self.use_rms_norm, eps=rms_norm_eps + ) + + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + # The tuple is (conv_state, ssm_state) + self.kv_cache = (torch.tensor([]), torch.tensor([])) + + self.model_config = model_config + self.cache_config = cache_config + self.prefix = prefix + + def forward_native( + self, + hidden_states: torch.Tensor, + 
output: torch.Tensor, + mup_vector: torch.Tensor | None = None, + ): + pass + + def forward( + self, + hidden_states: torch.Tensor, + output: torch.Tensor, + mup_vector: torch.Tensor | None = None, + ): + torch.ops.vllm.mamba_mixer2( + hidden_states, + output, + self.prefix, + mup_vector, + ) + + def forward_cuda( + self, + hidden_states: torch.Tensor, + output: torch.Tensor, + mup_vector: torch.Tensor | None = None, + ): + forward_context = get_forward_context() + # attn_metadata contains metadata necessary for the mamba2 triton + # kernels to operate in continuous batching and in chunked prefill + # modes; they are computed at top-level model forward since they + # stay the same and reused for all mamba layers in the same iteration + attn_metadata: AttentionMetadata = forward_context.attn_metadata + + assert self.cache_config is not None + mamba_block_size = self.cache_config.mamba_block_size + prefix_caching_enabled = self.cache_config.enable_prefix_caching + if attn_metadata is not None: + assert isinstance(attn_metadata, dict) + attn_metadata = attn_metadata[self.prefix] + assert isinstance(attn_metadata, Mamba2AttentionMetadata) + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + # conv_state = (..., dim, width-1) yet contiguous along 'dim' + conv_state = self_kv_cache[0].transpose(-1, -2) + ssm_state = self_kv_cache[1] + state_indices_tensor = attn_metadata.state_indices_tensor + has_initial_states_p = attn_metadata.has_initial_states_p + prep_initial_states = attn_metadata.prep_initial_states + chunk_size = attn_metadata.chunk_size + seq_idx_p = attn_metadata.seq_idx_p + query_start_loc_p = attn_metadata.query_start_loc_p + cu_chunk_seqlen_p = attn_metadata.cu_chunk_seqlen_p + last_chunk_indices_p = attn_metadata.last_chunk_indices_p + + # 1. 
Gated MLP's linear projection + projected_states, _ = self.in_proj(hidden_states) + + if mup_vector is not None: + projected_states = projected_states * mup_vector + + gate, hidden_states_B_C, dt = torch.split( + projected_states, + [ + self.intermediate_size // self.tp_size, + self.conv_dim // self.tp_size, + self.num_heads // self.tp_size, + ], + dim=-1, + ) + + conv_weights = self.conv1d.weight.view( + self.conv1d.weight.size(0), self.conv1d.weight.size(2) + ) + + # - get hidden_states, B and C after depthwise convolution. + split_hidden_states_B_C_fn = lambda hidden_states_B_C: torch.split( + hidden_states_B_C, + [ + self.intermediate_size // self.tp_size, + self.groups_ssm_state_size // self.tp_size, + self.groups_ssm_state_size // self.tp_size, + ], + dim=-1, + ) + + if attn_metadata is None: + # profile run + hidden_states_B_C = ( + hidden_states_B_C.transpose(0, 1).clone().transpose(0, 1) + ).contiguous() + hidden_states, _B, _C = split_hidden_states_B_C_fn(hidden_states_B_C) + hidden_states = self.norm(hidden_states, gate) + out, _ = self.out_proj(hidden_states) + return out + + # NOTE: V0 put prefill before decode, v1 puts decode before prefill + num_prefills = attn_metadata.num_prefills # request count + num_decodes = attn_metadata.num_decode_tokens # token count (=request) + num_prefill_tokens = attn_metadata.num_prefill_tokens # token count + has_prefill = num_prefills > 0 + has_decode = num_decodes > 0 + num_actual_tokens = num_prefill_tokens + num_decodes + + # Separate prefill and decode by splitting varlen input + # Split along token dimension + hidden_states_B_C_d, hidden_states_B_C_p = torch.split( + hidden_states_B_C[:num_actual_tokens], + [num_decodes, num_prefill_tokens], + dim=0, + ) + dt_d, dt_p = torch.split( + dt[:num_actual_tokens], + [num_decodes, num_prefill_tokens], + dim=0, + ) + # Split along batch dimension + state_indices_tensor_d, state_indices_tensor_p = torch.split( + state_indices_tensor[:num_actual_tokens], + [num_decodes, 
num_prefills], + dim=0, + ) + + if prefix_caching_enabled: + # If prefix caching is enabled, retrieve the relevant variables + # for prefill and decode + block_idx_last_computed_token_d, block_idx_last_computed_token_p = ( + torch.split( + attn_metadata.block_idx_last_computed_token, + [num_decodes, num_prefills], + dim=0, + ) + ) + block_idx_last_scheduled_token_d, block_idx_last_scheduled_token_p = ( + torch.split( + attn_metadata.block_idx_last_scheduled_token, + [num_decodes, num_prefills], + dim=0, + ) + ) + # Prefill-only variables: + block_idx_first_scheduled_token_p = ( + attn_metadata.block_idx_first_scheduled_token_p + ) + num_computed_tokens_p = attn_metadata.num_computed_tokens_p + else: + block_idx_last_computed_token_d = None + block_idx_last_computed_token_p = None + block_idx_last_scheduled_token_d = None + block_idx_last_scheduled_token_p = None + block_idx_first_scheduled_token_p = None + num_computed_tokens_p = None + + # Preallocate output tensor to avoid memcpy cost for merging prefill + # and decode outputs + preallocated_ssm_out = torch.empty( + [ + num_prefill_tokens + num_decodes, + (self.num_heads // self.tp_size) * self.head_dim, + ], + dtype=hidden_states.dtype, + device=hidden_states.device, + ) + preallocated_ssm_out_d, preallocated_ssm_out_p = torch.split( + preallocated_ssm_out, + [num_decodes, num_prefill_tokens], + dim=0, + ) + + # Process prefill requests + if has_prefill: + # 2. Convolution sequence transformation + # - It will read the initial states for every sequence, + # that has "has_initial_states_p" == True, + # from "cache_indices", using "state_indices_tensor_p". + # - It updates the "conv_state" cache in positions pointed + # to by "state_indices_tensor_p". + # In particular, it will always write the state at the + # sequence end. 
+ # In addition, "block_idx_first_scheduled_token_p" and + # "block_idx_last_scheduled_token_p" + # are provided (which are pointers into + # "state_indices_tensor_p"), it will write additional cache + # states aligned at "block_size_to_align". + x = hidden_states_B_C_p.transpose( + 0, 1 + ) # this is the form that causal-conv see + hidden_states_B_C_p = causal_conv1d_fn( + x, + conv_weights, + self.conv1d.bias, + activation=self.activation, + conv_states=conv_state, + has_initial_state=has_initial_states_p, + cache_indices=state_indices_tensor_p, + block_idx_first_scheduled_token=block_idx_first_scheduled_token_p, + block_idx_last_scheduled_token=block_idx_last_scheduled_token_p, + initial_state_idx=block_idx_last_computed_token_p, + num_computed_tokens=num_computed_tokens_p, + block_size_to_align=mamba_block_size, + metadata=attn_metadata, + query_start_loc=query_start_loc_p, + ).transpose(0, 1)[:num_prefill_tokens] + + hidden_states_p, B_p, C_p = split_hidden_states_B_C_fn(hidden_states_B_C_p) + + # 3. 
State Space Model sequence transformation + initial_states = None + if has_initial_states_p is not None and prep_initial_states: + kernel_ssm_indices = state_indices_tensor_p + if prefix_caching_enabled: + kernel_ssm_indices = state_indices_tensor_p.gather( + 1, block_idx_last_computed_token_p.unsqueeze(1) + ).squeeze(1) + initial_states = torch.where( + has_initial_states_p[:, None, None, None], + ssm_state[kernel_ssm_indices], + 0, + ) + + # NOTE: final output is an in-place update of out tensor + varlen_states = mamba_chunk_scan_combined_varlen( + hidden_states_p.view( + num_prefill_tokens, self.num_heads // self.tp_size, self.head_dim + ), + dt_p, + self.A, + B_p.view(num_prefill_tokens, self.n_groups // self.tp_size, -1), + C_p.view(num_prefill_tokens, self.n_groups // self.tp_size, -1), + chunk_size=chunk_size, + D=self.D, + z=None, + dt_bias=self.dt_bias, + seq_idx=seq_idx_p, + cu_seqlens=query_start_loc_p, + cu_chunk_seqlens=cu_chunk_seqlen_p, + last_chunk_indices=last_chunk_indices_p, + initial_states=initial_states, + return_intermediate_states=prefix_caching_enabled, + dt_softplus=True, + dt_limit=(0.0, float("inf")), + out=preallocated_ssm_out_p.view(num_prefill_tokens, -1, self.head_dim), + state_dtype=ssm_state.dtype, + ) + + if prefix_caching_enabled: + # The chunk_stride is the number of chunks per mamba block + # e.g., if mamba_block_size = 512 and chunk_size = 256, + # then chunk_stride = 2 + chunk_stride = mamba_block_size // chunk_size + + # Save state for sequences with more than just final state + for seq_idx in range(num_prefills): + # Block index for the first scheduled token + block_idx_first_scheduled_token = block_idx_first_scheduled_token_p[ + seq_idx + ] + + # Block index for the last scheduled token + block_idx_last_scheduled_token = block_idx_last_scheduled_token_p[ + seq_idx + ] + + # Number of blocks that need to be written + n_blocks_to_fill = ( + block_idx_last_scheduled_token - block_idx_first_scheduled_token + ) + + # Skip 
sequences that don't have any blocks to fill + if n_blocks_to_fill == 0: + continue + + # Look up the state indices + cache_blocks_to_fill = state_indices_tensor_p[ + seq_idx, + block_idx_first_scheduled_token:block_idx_last_scheduled_token, + ] + + # First chunk index for this sequence + if seq_idx == 0: + first_chunk = 0 + else: + first_chunk = 1 + last_chunk_indices_p[seq_idx - 1] + + # First chunk that is aligned on the mamba block boundary + first_aligned_chunk = first_chunk + chunk_stride - 1 + + # Calculate the number of computed tokens that were not + # already cached + num_unaligned_computed_tokens = ( + num_computed_tokens_p[seq_idx] % mamba_block_size + ) + + if num_unaligned_computed_tokens > 0: + # If the number of computed tokens is not block aligned, + # then we need to shift the index accordingly + first_aligned_chunk -= ( + num_unaligned_computed_tokens // chunk_size + ) + + # Get states to write + from_where = varlen_states[ + first_aligned_chunk : first_aligned_chunk + + n_blocks_to_fill * chunk_stride : chunk_stride + ] + + # Write the states + ssm_state[cache_blocks_to_fill] = from_where + + # For all seqs, store the last state (note: might be partial): + ssm_state[ + state_indices_tensor_p.gather( + 1, block_idx_last_scheduled_token_p.unsqueeze(1) + ).squeeze(1) + ] = varlen_states[last_chunk_indices_p] + + else: + # update ssm states + # - varlen state is a (num_prefills, nheads, headdim, dstate) + # tensor + ssm_state[state_indices_tensor_p] = varlen_states + + # Process decode requests + if has_decode: + if prefix_caching_enabled: + state_indices_tensor_d_input = state_indices_tensor_d.gather( + 1, block_idx_last_computed_token_d.unsqueeze(1) + ).squeeze(1) + state_indices_tensor_d_output = state_indices_tensor_d.gather( + 1, block_idx_last_scheduled_token_d.unsqueeze(1) + ).squeeze(1) + # for decode: + # block_idx_first_scheduled_token_d == + # block_idx_last_scheduled_token_d + # at block boundaries: + # block_idx_first_scheduled_token_d 
> + # block_idx_last_computed_token_d + else: + # Without caching, read and write in-place to the same blocks: + state_indices_tensor_d_input = state_indices_tensor_d + state_indices_tensor_d_output = state_indices_tensor_d + + # 2. Convolution sequence transformation + hidden_states_B_C_d = causal_conv1d_update( + hidden_states_B_C_d, + conv_state, + conv_weights, + self.conv1d.bias, + self.activation, + conv_state_indices=state_indices_tensor_d, + block_idx_last_scheduled_token=block_idx_last_scheduled_token_d, + initial_state_idx=block_idx_last_computed_token_d, + ) + + hidden_states_d, B_d, C_d = split_hidden_states_B_C_fn(hidden_states_B_C_d) + + # 3. State Space Model sequence transformation + n_groups = self.n_groups // self.tp_size + A_d = ( + self.A[:, None, ...][:, :, None] + .expand(-1, self.head_dim, self.ssm_state_size) + .to(dtype=torch.float32) + ) + dt_d = dt_d[:, :, None].expand(-1, -1, self.head_dim) + dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim) + D_d = self.D[:, None, ...].expand(-1, self.head_dim) + B_d = B_d.view(-1, n_groups, B_d.shape[1] // n_groups) + C_d = C_d.view(-1, n_groups, C_d.shape[1] // n_groups) + hidden_states_d = hidden_states_d.view( + -1, self.num_heads // self.tp_size, self.head_dim + ) + + # - the hidden is reshaped into (bs, num_heads, head_dim) + # - mamba_cache_params.ssm_state's slots will be selected + # using state_indices_tensor_d + # NOTE: final output is an in-place update of out tensor + selective_state_update( + ssm_state, + hidden_states_d, + dt_d, + A_d, + B_d, + C_d, + D_d, + z=None, + dt_bias=dt_bias, + dt_softplus=True, + state_batch_indices=state_indices_tensor_d_input, + dst_state_batch_indices=state_indices_tensor_d_output, + out=preallocated_ssm_out_d.view(num_decodes, -1, self.head_dim), + ) + + # 4. 
gated MLP + # GatedRMSNorm internally applying SiLU to the gate + # SiLU is applied internally before normalization, unlike standard + # norm usage + hidden_states = self.norm(preallocated_ssm_out, gate[:num_actual_tokens]) + + # 5. Final linear projection + output[:num_actual_tokens], _ = self.out_proj(hidden_states) + + def get_state_dtype(self) -> tuple[torch.dtype, torch.dtype]: + assert self.model_config is not None + assert self.cache_config is not None + return MambaStateDtypeCalculator.mamba2_state_dtype( + self.model_config.dtype, + self.cache_config.mamba_cache_dtype, + self.cache_config.mamba_ssm_cache_dtype, + ) + + def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: + return MambaStateShapeCalculator.mamba2_state_shape( + intermediate_size=self.intermediate_size, + tp_world_size=get_tensor_model_parallel_world_size(), + n_groups=self.n_groups, + num_heads=self.num_heads, + head_dim=self.head_dim, + state_size=self.ssm_state_size, + conv_kernel=self.conv_kernel_size, + ) + + @property + def mamba_type(self) -> str: + return "mamba2" + + def get_attn_backend(self) -> type["AttentionBackend"]: + from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionBackend + + return Mamba2AttentionBackend + + +def mamba_mixer2( + hidden_states: torch.Tensor, + output: torch.Tensor, + layer_name: str, + mup_vector: torch.Tensor | None = None, +) -> None: + forward_context: ForwardContext = get_forward_context() + self = forward_context.no_compile_layers[layer_name] + self.forward_cuda(hidden_states=hidden_states, output=output, mup_vector=mup_vector) + + +def mamba_mixer2_fake( + hidden_states: torch.Tensor, + output: torch.Tensor, + layer_name: str, + mup_vector: torch.Tensor | None = None, +) -> None: + return + + +direct_register_custom_op( + op_name="mamba_mixer2", + op_func=mamba_mixer2, + mutates_args=["output"], + fake_impl=mamba_mixer2_fake, +) diff --git a/model_executor/layers/mamba/mamba_utils.py 
class MambaStateDtypeCalculator:
    """Work out the dtypes of the state tensors cached by each layer family.

    Every classmethod returns a tuple holding one ``torch.dtype`` per cached
    state tensor of the corresponding layer type.
    """

    @classmethod
    def linear_attention_state_dtype(
        cls,
        model_dtype: ModelDType | torch.dtype,
        mamba_cache_dtype: MambaDType,
    ) -> tuple[torch.dtype, ...]:
        """Return the one-element dtype tuple for linear-attention state.

        Raises:
            ValueError: if ``mamba_cache_dtype`` is ``"float32"``.
        """
        # TODO (tdoublep) requires testing
        if mamba_cache_dtype == "float32":
            raise ValueError("fp32 state for minimax is not yet supported")
        state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype)
        return (state_dtype,)

    @classmethod
    def mamba1_state_dtype(
        cls,
        model_dtype: ModelDType | torch.dtype,
        mamba_cache_dtype: MambaDType,
        mamba_ssm_cache_dtype: MambaDType,
    ) -> tuple[torch.dtype, ...]:
        """Return ``(conv_state_dtype, temporal_state_dtype)`` for Mamba1."""
        return cls._mamba_state_dtype(
            model_dtype, mamba_cache_dtype, mamba_ssm_cache_dtype
        )

    @classmethod
    def mamba2_state_dtype(
        cls,
        model_dtype: ModelDType | torch.dtype,
        mamba_cache_dtype: MambaDType,
        mamba_ssm_cache_dtype: MambaDType,
    ) -> tuple[torch.dtype, ...]:
        """Return ``(conv_state_dtype, temporal_state_dtype)`` for Mamba2."""
        return cls._mamba_state_dtype(
            model_dtype, mamba_cache_dtype, mamba_ssm_cache_dtype
        )

    @classmethod
    def _mamba_state_dtype(
        cls,
        model_dtype: ModelDType | torch.dtype,
        mamba_cache_dtype: MambaDType,
        mamba_ssm_cache_dtype: MambaDType,
    ) -> tuple[torch.dtype, ...]:
        """Shared implementation behind the Mamba1 and Mamba2 dtype lookups.

        The conv-state dtype is resolved by ``get_kv_cache_torch_dtype``.
        The temporal-state dtype is looked up in ``STR_DTYPE_TO_TORCH_DTYPE``
        unless ``mamba_ssm_cache_dtype`` is ``"auto"``.
        """
        conv_state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype)
        if mamba_ssm_cache_dtype == "auto":
            # "auto": the temporal state reuses the conv-state dtype.
            return (conv_state_dtype, conv_state_dtype)
        return (conv_state_dtype, STR_DTYPE_TO_TORCH_DTYPE[mamba_ssm_cache_dtype])

    @classmethod
    def short_conv_state_dtype(
        cls,
        model_dtype: ModelDType | torch.dtype,
        mamba_cache_dtype: MambaDType,
    ) -> tuple[torch.dtype, ...]:
        """Return the one-element dtype tuple for a short-conv layer."""
        conv_state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype)
        return (conv_state_dtype,)

    @classmethod
    def gated_delta_net_state_dtype(
        cls,
        model_dtype: ModelDType | torch.dtype,
        mamba_cache_dtype: MambaDType,
    ) -> tuple[torch.dtype, torch.dtype]:
        """Return two identical dtypes, one per gated-delta-net state tensor."""
        state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype)
        return (state_dtype, state_dtype)

    @classmethod
    def kda_state_dtype(
        cls,
        model_dtype: ModelDType | torch.dtype,
        mamba_cache_dtype: MambaDType,
    ) -> tuple[torch.dtype, torch.dtype, torch.dtype, torch.dtype]:
        """Return the four KDA state dtypes.

        The first three entries share the resolved cache dtype; the fourth is
        always ``torch.float32`` regardless of the arguments.
        """
        state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype)
        return (state_dtype, state_dtype, state_dtype, torch.float32)


class MambaStateShapeCalculator:
    """Compute the shapes of the state tensors cached by each layer family.

    Sharded dimensions are divided by the tensor-parallel world size. Conv
    states are always laid out as ``(width, dim)`` so that ``dim`` is the
    innermost axis.
    """

    @classmethod
    def linear_attention_state_shape(
        cls,
        num_heads: int,
        tp_size: int,
        head_dim: int,
    ) -> tuple[tuple[int, int, int], ...]:
        """Return ``((num_heads // tp_size, head_dim, head_dim),)``."""
        # NOTE: plain floor division here, not `divide` as in the other methods.
        state_shape = (num_heads // tp_size, head_dim, head_dim)
        return (state_shape,)

    @classmethod
    def mamba1_state_shape(
        cls,
        tp_world_size: int,
        intermediate_size: int,
        state_size: int,
        conv_kernel: int,
    ) -> tuple[tuple[int, int], tuple[int, int]]:
        """Return ``(conv_state_shape, temporal_state_shape)`` for Mamba1."""
        sharded_dim = divide(intermediate_size, tp_world_size)

        # conv state is (width, dim); temporal state is (dim, state_size)
        conv_state_shape = (conv_kernel - 1, sharded_dim)
        temporal_state_shape = (sharded_dim, state_size)
        return conv_state_shape, temporal_state_shape

    @classmethod
    def mamba2_state_shape(
        cls,
        tp_world_size: int,
        intermediate_size: int,
        n_groups: int,
        num_heads: int,
        head_dim: int,
        state_size: int,
        conv_kernel: int,
    ) -> tuple[tuple[int, int], tuple[int, int, int]]:
        """Return ``(conv_state_shape, temporal_state_shape)`` for Mamba2."""
        # if n_groups is not divisible by world_size, need to extend the shards
        # to ensure all groups needed by a head is sharded along with it
        n_groups = n_groups + cls.extra_groups_for_head_shards(n_groups, tp_world_size)
        # heads and n_groups are TP-ed
        conv_dim = intermediate_size + 2 * n_groups * state_size

        # contiguous along 'dim' axis
        conv_state_shape = (conv_kernel - 1, divide(conv_dim, tp_world_size))

        # Only the head axis is split across TP ranks; head_dim and
        # state_size are kept whole.
        # e.g., (h_heads, head_dim, state_size) = (128, 64, 128) before TP
        temporal_state_shape = (divide(num_heads, tp_world_size), head_dim, state_size)
        return conv_state_shape, temporal_state_shape

    @classmethod
    def short_conv_state_shape(
        cls,
        tp_world_size: int,
        intermediate_size: int,
        conv_kernel: int,
    ) -> tuple[tuple[int, int]]:
        """Return the one-element tuple with the short-conv state shape."""
        conv_dim = divide(intermediate_size, tp_world_size)
        conv_state_shape = (conv_kernel - 1, conv_dim)
        return (conv_state_shape,)

    @classmethod
    def extra_groups_for_head_shards(cls, ngroups: int, tp_size: int) -> int:
        """Compute the increase in group numbers to account for
        replication in order to accompany the head shards.

        Returns 0 when ``tp_size`` divides ``ngroups``, otherwise
        ``tp_size - ngroups``. The second form is only meaningful for
        ``ngroups == 1``; other non-divisible inputs are not validated here.
        """
        # in the case ngroups % tp_size == 0, this will be zero
        if ngroups % tp_size == 0:
            return 0

        # for n_groups == 1, this is exactly tp_size - n_groups
        return tp_size - ngroups

    @classmethod
    def gated_delta_net_state_shape(
        cls,
        tp_world_size: int,
        num_k_heads: int,
        num_v_heads: int,
        head_k_dim: int,
        head_v_dim: int,
        conv_kernel_size: int,
        num_spec: int = 0,
    ) -> tuple[tuple[int, int], tuple[int, int, int]]:
        """Return ``(conv_state_shape, temporal_state_shape)`` for GDN layers.

        ``num_spec`` is added to the conv-state width.
        """
        conv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads

        # conv state is (width, dim)
        conv_state_shape = (
            conv_kernel_size - 1 + num_spec,
            divide(conv_dim, tp_world_size),
        )
        temporal_state_shape = (
            divide(num_v_heads, tp_world_size),
            head_k_dim,
            head_v_dim,
        )
        return conv_state_shape, temporal_state_shape

    @classmethod
    def kda_state_shape(
        cls,
        tp_world_size: int,
        num_heads: int,
        head_dim: int,
        num_k_heads: int | None = None,
        head_k_dim: int | None = None,
        conv_kernel_size: int = 4,
        num_spec: int = 0,
    ) -> tuple[
        tuple[int, int], tuple[int, int], tuple[int, int], tuple[int, int, int]
    ]:
        """Return the four KDA state shapes.

        The result is ``(conv, conv_k, conv_k, recurrent)``: the key conv
        shape appears twice. ``num_k_heads`` and ``head_k_dim`` fall back to
        ``num_heads`` and ``head_dim`` when ``None``.

        NOTE: ``num_spec`` is accepted but does not enter the computation.
        """
        if num_k_heads is None:
            num_k_heads = num_heads
        if head_k_dim is None:
            head_k_dim = head_dim

        proj_size = num_heads * head_dim
        proj_k_size = num_k_heads * head_k_dim
        conv_width = conv_kernel_size - 1

        # conv states are (width, dim)
        conv_state_shape = (conv_width, divide(proj_size, tp_world_size))
        conv_state_k_shape = (conv_width, divide(proj_k_size, tp_world_size))
        recurrent_state_shape = (divide(num_heads, tp_world_size), head_dim, head_dim)

        return (
            conv_state_shape,
            conv_state_k_shape,
            conv_state_k_shape,
            recurrent_state_shape,
        )
zzF#3uL-%4We82kl(jvNr)?-ejJl#y|_-FsV9Wbc{ekkH|(|eM8?iof#A8n)Z=)MG} z(KYZ^C)CqS*YnT*{j?Y`HVE$r=q8S`t5V9h1NPEXI_`jR0Z+rkUEtD#^ubjzHAwHL z8#rl0+XamJ0l$K)!14Y=Ad7NIuNN=%`v-QwZeZz--JD)shna+C-HHKjVYLx zNb!%*!ypZTim9dOVZaL{QfllCSl{rnDh(E-_>U%Go@xFbp^qlXt7A#IcW9^ocv9X> zkW*)wc5E{*d3TW6k6V8|2eZm^nP-*M*3-wiv6x0J?440aYj~8BkD=mNQd>;`BJJXL zWW>pQl41Xa9;J`bWBlwO7OhG=UtPjE=ui^;9qFEVUQKO1;jrVhhTO@lmVpi@V1Cvj z9e^HO&J0x9fxnU|lZgvX&|^-?y9&O|{*ySm$a?7%y)PAKz~U69aI2aO_>S;7GL8A#lT`vu1llw3)lW*J9*#c*_tShDXw_93?U#Im@8=0Bo6vgNNzcxx; z8_{LdK{|(SCjC6lug&~tl4hw)>y2^x46HZu`FVARm1gOq^D^HL{2O=ctGe9~o(CLD&g6}PQFW*apAcrpO2TAaSrL+Et zr5F4!FAe$~Np!RFWfFI~JW=`rEZzMV0b6c|r*7$@Q%rVEZZEwEu+A-gA*s(Mli+6R zkIAJA?-slhPw_iE^Mszb@7gB7l@ho^kU9l%T&<3!m!7$<1BBvOS8s;8b_sP!NnMAb z{#uCR_NHu0=byFyrHLiS(#wQ%Vb{_>@zsAW5eZbyU2^ybJ%s`_q)S8oL5Lgjs06I@ zNZeun5X2v*E2z??38y%T-j#{|805`jyLP#;m^sK`fQ_fMdmng>!p5x^BW@A#f$Jf;n%X3f;&m~tZcgF3B zNg%E*CcWXJX1!DmmQ}i@T{A9^Z^CyymhW_3oWC@2(LqsAV@&Gx%~GzIbOF$E#bi@Z zi*I*(Ov=H7LoJ8TJvkAF?pmC)PsOB*6LUT&xj=66i*ARPJn7vYx6kdEo*-vjT%0cD z)QV&?U55M_vKn5QcTv}|Fy)(ob1RZrana$Myv&s&M_Ut`DL3WyO?ctR&N)Bra>ftC$`cu;9U0PfDaZh27h?u;_BT5-H}B)Yc%kpGJ_^sW zC_GN0aMyxTU;wKi5e3XBqSP4RHa>;>(3A!PoJlAh2B;lC88AQ{AF2=o{Gz1H7$E;q zxLZV7Fet$Q=a`sga@KP#nXINn1dx)P2ohQp@@IuYesa{9AsUYxGeiT8V+!t!63;+o zNL4(nR40=?rXa+2Iegxj5mM(}o=G^h_EV*G#()mFrA)`6{gVK4M(Y@S!%9O9Hh$YxYc!5BXXnw2Wl}U#~Ojciz!(V!B|l;nzQr1L<3SN$M3u2pcb(RVP1|+asg^gKI4FM{Y9WA|FjqrUx`5_1Z5~8BY0qK zL;`rdQ!ps4zmn_kUugBs%(c$B=bAanHZRV1@9DYP?P{HNUu>Pb?z=qeX}vZ*J<|%4 zh--Snwdk6h2a(=7?YQouysa~inTw9r**S0Pq+{L-!WpUD=4_d}9xDdzhNP&8sRif6 zRTt%PO;fK!15tE&8zEd4ZGS;5GTWEOqGc_N#1buSW+cUY*b2{Ea^vbNth{JDS0pX? 
zL@UzhZe3crwCetLiU$~pX)C{isqA9&yOs}cmDs~a*pl7Lhkqd}{6s0Lu)TTd&LyUH=mQyB zI~FNF!N^MnQa`7RE;ol>$Y6f zI3T`%mTfr3$m^mNHB9{xw&DmQFO6EPOjS2)>Au&+7WW4VqP0Eu_OrE6sA@}996Y(9 zDvve{eUQgCj56vvfK$@Uv>swDhXNWPhOO!?*H-0Nm54@E2ko0h|@!G|IR2*&=H!_VF0uNv zz^b});w?jIkY){y;qmXbh0n42PT*Nx7w~Mp0eDt#3-z7c-^1Sp6`{7*TE+m(yIqrY9W zR`mMxwnCI&vt1z4S)+zVre%ON3}BfdY!Fk?!5TWahUAGfhOhMpE`6ks)R}=^IxABN z0=7F=pjkCSv^2+J{z#EmXJ*t@+ged!b;!*cTfeJYA6z%HhCTOQ_`vXki4U9`<4Cl9 z(Dt)|!H4RCO?4@&E`6Xbhv!?vE5l4-Q~2^d$P3y__{4_=)cRAj|-!5v<`TLBZr~VJiS&C$cw7X!L|)m zX|$q|QI`q}pvLgo_0t~|J}`_02A?ELMPt~p-omyVWGfDS?r=U$Wm}JF*Av0{%rb53 zMft_SHV{Z5O+?iS-LGpxNexS)w)L~DelLsz-M%e-35vv0j=2?Uil8lcKB8>^$ym_{ zQ-vC*3Mlmj*1(SueIJ(G|M)oqM!AkPHGHQ(?1QysVF#?R27aK4YK>oWtjb_m_26V= zDhM2gil1pRs6q5=p>;-U$i^C~Kpx`UAceU>nL0P9aBk54n#~Ol)P28@X`&i)(9UYA z0+Oh)ER?%87EnAi72UqHb}8r#7vHOTVA{WF>Ss;;II$dEISK;KtXxrn_<@zh<+aNp zS@6d9_3vBWvpg{E3#hiVreJBffGzG~wOz|2K%4?|(6BPGd^jpItiG}#D~jr17zV$@ zmNc`vmO$P^_|h=1nS-W~DeQfq+YRG%#|%#l>4`IHi-QAen&pv)g_hvn&BD5XB3e)! z$lWp&2jo$MIUq+gE7mGPr9T6t`-SDJ!59>OtqZ<1R)?cjdwAqt0o!z#wH{s_c?h2} z3wIVm*VnH zVV-|dyrn4$?qfA|FwXMruuf2DK2eJls$23E`Kl>s2z77D8$X&6i%j(&!#6W*)q362 z^M_+%*z;O&xi4MzQHvO@N^Ed=W16(x&RBlRPH9Z{$S!G2>6rAnufcxotcMx`fYf0O zj$kkh!DRA*6&{=rh5LHgpM16P)uvaQZzriLff9ZE&?2E=Oi5@NmE#oqWzL+}8bRGK zu2KMygv+mmc-;F;yXf3Zx%opEzsQHXq~yASTcv3iEo|!hb#%eaI=bKrZlk7Mem$)R zyb7*lQ1JB|AU?VM@v?rS=N2uwf;+Bh7n}l-D^qKnxdP^ebm41_PKn=?)PfA=nY3w2 z>=RUje7G^pLCn@>h6=no^LPqyY7kj+$&zvn!f^~9 zyqF(pJrBe1lKAG1ycX=@d&+X1t>EIAUG)q5&;%##gD0{ePedQ(SBACHYk z483iqHK0a=ytrfg;*^jqbG;yWYzd;Q6ff~aBid`)iQ7~9Bj;u#o%5s^01xIoo|m!z z;@C<7vh+zI(`AtNv7;mja^B4fIxkaho{#9VDRT55$0T=Ic2wk4@~Oc2nMb?8J;@Y; zHe050GE284oXlRVLzPrV{!JsD4=VH^KPe3yTGyg+61*dY6p&Y)l(*KOPZumpocUi7 zz1Hl>br#?~&=_b6G*9I^HD8hY8)!qO*#-JK!<=&!92`(xP$JWep0R{5UJbPIDM}fA zc790H4Jc(mX*#ty zC74fW`BWL5* zjy71Hc<1NlW+|WbnuCH$NqlXy%^#98~SzFD7R8bUB0^;&1Atd8kvE0pkB-5#s& zvct#KC6wilJiS&2#X7 z%3+nXB!$$LmX;>Oon){!TifhbH!3a&NX*YUNsVxbIqjHqS>03Clw~Wy%`@w>f||lL 
zu$6TRC0}y;tn*%%_2rcOFC)fqUI+~aV0fO25%@x~Az2a_l135A@U-A4PADs!9ln%O z!AtH77f3lM((HyN0r-n9#KdWJAgPly7PvlrWZexQptr`U`2-oTMytaEAf|jSitDWe zo*6LkxS&3+YVT$Dl&_ie7DC`Dy5RO*h8~-qcX_$~tXr%ThqqpKQliKUkle<$X4Wx9+<>)$#ZMOb3Fr9%nC(X zpo?8|aF`AayLjEYt3@d9o)R7xfp9N%Hd))a{}-(Lnllb2t|5ILiID364APrrMrJ!k# zKq3Wrq6r0F;;34mhNiu_&Dz(;k&^2`1N)VwyO@vx~E{(?XLHc!696>Qh$l z<=OdZ7&Fil+?ll1=KxgjJ+3NZWz1>zP-7q2xwD$^;IL7(nPu!3S2q0>;1bL%9>H05$QuY&}NYlJ;I za+5d)_02%(X~Hq>zT`^Yp;WuDV@4a70g5T)?D{X^UsqHu5S#-X88r^Zh8 zT3>!dM81q{h6@yAOgD%mTs*+4Z`wNxt0!Td<9+!?m{!8XcT$wgGnNRcQ_@sRN6;!oMyOL6c zUA9CPpNfB`rSHTs;O`I?)xGRNCc3ElzUL!dD;< zUo85n4OF1P5gdNmJZ{f12}@z85fzxJ^O;hdk{bZeFBc%A)@a&A%XuuWpeAYBSR0U?Pp> z!Qvd2+SDFQl!Nplis~^LEKE^&$D}y*PEaT@V$#Xkx$6)G>W!0^V@mw&od7_f$|wbG zcE@$91#(fhA+RfAlIMoT`(rX#twW98ST1~S$D}jU;D9Nn<4Xal{I_RJh{rej=X}Xi1mVLN1nD*{(Ny>H&Mz!~cAc&SlQN1BB zuw^s_hN8tKfumcdveoB9#*q7N1zWg_F|-8+xAdml^=tKSG~8}oYmMlu0s{|?rrRTH zBj31)ib3z@h6mqKM9SOO2P4KkfuV<`H6iCe8(WjF=B}QOYAu^u8>_WFG!_S|-Z=U< zxj8yL995Myfo_l1xO$A$ST{9rz-o_Z8sD8A+iV_Xn@17Bb8GpV+GFjDfyl_zbJxjY6v| z=nU<-Gs_xV!V9di8$?)W799wTY!z3AI>L&(2iW4>>&H>M`6!;7 z8-wmU6(DxPjjX1N(Hx3aH;0|;d)}U9tM}jgB3pfIRYIgrw5|7E$9=^Q4zO*>4#L1>{FKgWU!($JO!;s1px3R|dO=B-=eRf5Rhc=DFta13yOCA`{Y#GdHu=i{j z_VFM?jogx=1faQuO_2dwAq z5{_>qV2t7A43>}^yb=hk%gf8$d?vXB?hJ2~XP?M0{XJw2+_nF^WzjFx=Ab0#jHs&~ zs`a;ytsD#H2FFA8I};Ig^R_fstK1glDwITA0y_9n1m3% zx#aTN3u8+7Jn_IA7#V?K2yEDpuPbh{|9z}P>6)4IT?a3Xu+8v&h)9gGm;?;#reYGb z@rkb?U;|)xJ6R-vr$^X*aDwq0`J&_Kr@`IAiMiPY7X@ZQ6JS1r>ky8HuLEp*3-#Mr z#Vrg_t%F*{U=4$>Veke9zk>lW@fyOXR7H)Lf!jAo`8oyT6PM2cyX%fvK5PTKai4vH zylQ7*OW*AQ(+zMuiM#Q!T==j{YbfQ+sbHRNkF-!(ku^dnTv?_zr+}oD{e_vB!QPU^UK-%@`$`*x&NV3v#GSON=vXU zqAXh;h{`mKtR!kI-89y+##+$B(iEX;TH@Vi-?SZM;lI3mOK;oM*RuNBh`xSP-^l74 z!$T2$dq5V|n4nMz7-SUv(pY@^=-Sa}ei@^#_;$(nhu%N--mwqFer5) z0an(%Ww~p~lBZJ1$jd=bsfg307r*hvkb^bXh5P?t`0e5KlkXh6XM1~$HSGg4okG*? 
zb8F{rjxG-bildssTl5Njb9lLbc|NKw42stjpkQVw-Za?3G$d?~7&?}ZZy72#4Rx%c zE-a218p0=8L(B4U&=I@cv(^*re`7Da0MO^xo)0?ScxibwYAC+lx7HWZMGRG#RC4>m z+J#VCq_76kii#OYVLY^yFcNdrWCo-PG|NY#GCdBanoGSn(IN2(%iVAZH!|dd}*_+g)M6dzrvQaZD`xJ zWfFyQJ0~Z<9jH@QzggPCmO^5rw2hI2nNnR^M7e#ld>2~|6-CN-qp6I>8tUKB)G+cI zMpF}&DPLW^Xm8E0%)h=6w7o_{)d4!HDQ7edjJyFtc;3`h zvYN_>raCO%(AXKdozd8_(BjHsQ2hEAgN_YNMX2qqp1VEa{%`GFw{28G3wAS_-BFc> zQI&vV9Cp|ny&>J#4lwG<)dMgU1*_k*-?0PQO&f+&>NaYM&8J18h;M-kPkA}!H9e?uoP95GpYu#fVzKjqZOw2x^Nw9Y+HA17`o&2 z9E;b}4P9T=07GjF2d?>JiB)0#>%(G`!u+?pBvK`CoLsjlGqW;K`h2tJPS1v{3iM2G zA748j(xKXX#Ml61+}6Em>t$`d_u78c^MjsFD0=0VI@^tas`-}Mdj+K+z^!) zMuD}!zDceF1F0w&dC6-@Oi5I(+LEhCycx{mqH8@30; z&C!(x%%6dY%CwrHUxT2#r@b~8oYQFG_LamL&qEwqMVLDUp2v~6k4Kt33rAHP5f zo)?SMnp<5fU5vgmqN-vfRbNjd4wiTAi^FtO*Qj-x1k4ueq$z>1g5gq-+C!`3t( z!|)XyQ-C+%v;C*g*{>EJPoEeXm>3)y7#kcJJ(Ahk@81{T*U{rwHBQ0NEzxgM5@!@U zI;~AO`F+GaF0OIXE^v{N>K?Zi9MdK{nialM5(Q~FU4vgg{CA7LE~y=v^2^fl*L%Q8 zzrPOt>USt90WAkJ?~ni--6hqU=^Mo-Ael**Qoz?vSAla@J6(dBBY_)PkCvIoQ(AsI`rt69TS&Iafre??-h0^7$CQ8l2D(mqPv~cp|PK{LLOC zP+pCc2fdjFcmjOwW;Xo7I^o*GVf5+7uiJA#u? 
ze@|NPCG`>ClWESYfE&TtmJM{Wb39^>J2^}J-ygs@wwHCl_ALdUu_eyn;3QnTq zIlv_kaBM2}d>XC()Ga%(L(BGq%c6t}=YvVP)zGN43-@*KN^$)OGhO-x&Y$+u2YJUa z{{AGWnW&pO3)Fu`a`6yw-J#4f86LJ!M1EyW{0Nqdb@|1zS zvr^*ttCrL@$iwvy?Le_XdIDP*5n8khnRY}4f`tS~I9Y(pFB}DlOyZ8e z0F&HjfOHHXakAYCkfhH5={P{*q&q^&dj?3)0VGboBP7`~KpLg{{9{7@ok;2>!P+Ci zDzuvCDB`5-e?#A&TsrJOwKU>Cy|mjuo>W>mkxeRHmQorP=l(O$4$fhied!Fiz=f3@ zXpeU<9R+w}OD8!wN=D7uB-ojx%h{z9;3PD@tLPtJ6~o#GIRMv&?L2HMk3jAq=@N+L z>i&nNW6xTj=RZJgoKw)fLTysMHqXBk>XMSWI{o9GpF$jm-;b7#KXd)R2laCfJ%=*X z|Jxbr|B6t*l+@n^INg9at`+l3&pk6vrvWE9>4DJB2tLHoO4VY3yX;go28|Hd3y9x0su?48VbFvD?wnGs7<6I) z8vLNbk3kOxyD`{7RbT+iykw7PsrwC76jOhWL1MF*Y?q@pN|Hyk z7`~#cr2Y~E>}$?JS}8uAz~B@F$qv$p(=6&7CSAlNU9yrb>}-H1vbgOfdIIAjt?Y|f_6IbNI)P&4mLwlc0Gsn2mFhR?ek&_K30sv<;x_3QGVMd~^l{|Q z5vIBu{GU|!M@kQEmJYF{LqEOv$CrP6IZ`?X8`}?g_eh=h%=g=VP{LLYMJ$ImEhDUD z&VIx z#@HCX#2Pyz>aI=o9#*~Qhm}97|3Q63J#gQ_s==j*NP#}Gs$eI$0`7_@a1*5z?Dw|} zRZMLkdV8up!y3-O=>u5~ zFYu7wc)NHFTvr6ABf7@rqp*Gb2$7UFrg{&n+yh4GieA`dR~K@=wZhKt_5<~mUEl^n zf!pHwX11dBjtv}MsP`e+RA*x9{D3cju3c10@bLqOC*X>!WZNiG+Cm3dMSD~YZj@Mc zMQ9KB>iUjj-N?ZM7hc`q!fVyLav(VPO_f9|1 z^)DaYQW{pLSY;_3sO9TdpJ(&SLdD_ptZfgQ-wVFJ^1zfn;QQLiz`}|qcodxCls2sw zuqC{s8p|*%A4cFRP-0RVS0`BoH~?T3<=+&CIzlp5ULRF~5kIRc2~M%9>Tf#2#t^i; z1+4ZJtywWe-)34pGRf*SaKGdQg{bA>hmyQaiIJ5UA4p8UEHsmoE@$ZIx;#?2XL%I7 zOu%RZFLJ4)nmIJi%JHPb@~Ag7bDXiu7;7i1>O_=~c**|H@i!9z`DU2J%S6 zXW_Tx5uiIqJg(X_uOzPxYa!qN!I23}ziEz4{5c|1Ejd5@@E_nJ3)AGKr*J$- zoFu1xGQ{#{9E%g&k{i@WCgF=wxbQm&zmh&)h7*@pt6r@J%}F(7p%rlp_l#PJiIYBV z|AVvD_8}Z{(Gs78f9Cf$DA4b7=FWq9ol^o<_hP@qBc~-d!NS-ny~f*}i!xakfPHml zy+CoM-28rxUj)|2;(5`cm@=T=;1v~*QR4ni{E^!~@E6bpln!zx(}&L?SQa4MWz3af zclkU};I*pPs{I-dcpr#UPopS6~M}rL-@(m*Q#l`Ujx?rnQDa9wOAn+$PF>A<0KQ2B4Y4i zfN^;ww9yK@JM*0daW4iaH;h-X+O*4=kQ+wGfx9qi7oE?y2XvJ+j<3V!jS@e|!9Z{X zy^_cM7{txaCc!LLu(_R@ot~auKwCeeE9vd!Oz)emFTX@|Ou+EcJ#*n@FtCps))IR? 
zv=K+GW$;)`Y`& zw(~}##VkG;h_QUSBL>of#10IYFQU5yY&SH7{Hiy!1uRfGYf>S`y1$QexPbl?2#l=G z7#^`GPSnOvx;q(7j!Y5rUbH*rEN>Ga(5%p5oksucU>Hp-%)zK|Hg3L2dXY4S;D~I2 z?gxUlzHy>qTf{P7Cn1|U%zk1%H_i{Q?9U7&_b>@^)|L7v`Q zR_`SEvj8vmpO0;AV$U4e+Jfv;F%0&{-EIDTvz{nMhUn#D}d^ePui{WXLZx5M(cb^9*8H z>)@TiOgX@R$1KziW5Y+9gt~G_F9A1fJdFM{M-;BC&ILG(QTrXvNPD zb0jT~2O&N{bI;7raODKdKt6%@j|?4@WW}FaHI7?!LhuN?A_kw<69EF_4gR0R8v*7| z^+@n28jrXour{^tvNk0(DB+d>;SuHp+@H!d0SAESMy9wYAg1A*4-+yU*wN8L(mOxV zaq)@Sh|3JA8Li4kiDBh9v7?~=B~0bSaGd(@_ykKQ(TD#BzRqq8$L+}Br+Gj^b8__( zaaF|0_9?KuY=${1&6Q4l+=<5Yb>LMrg6NBh?UTvtDm>o7)zQbj5d3b=ZLm{GI(0(? z(yq+*Db&A!Y}ES@*mH?dO5*Uy3yV)QM)pUBw;F z1sF}mLzyma#}TcthsVL2|JvDg--NbhsHT_!GXPBsBX40eEsvUayttWP1{=;1 zdHHgGR9g}1V6=4`T2P$-gQ2&F-Z`@V%G<}6M}98TZmIK$EmO|xV*zngWR_&p7D z0uM(!0cpUyc6du;V>Hc-ycxo%wsKQj$7<11Lfa6Kf?dO{sgdvimF%N)6lmRra^~?)H7_04uRBA75?RBJ|wq z?0WzC_n!aWi>$mopkAF0jYd0r-hc7E7r%Et9&>jT0My&xTD-gXtsAVoI#3W)N6kfG zby3&w){VP2zC|ZTZXGdMt%iW0O#;9=UkVe@ z+;!?3ILLn=f>;^;d{B$S2j2wvrPgxMF$q5DoZgmspL^QNna$})9+)Y7vz`eq9=F{% zo4MR8ZeL6Riv+I^rXh;@DW<%Rpz}CSR^Sl;elRW^14d| z=RRN%vfV~r+xgVpV=(3$&qqn=sYY+drhrIUUaZ%}=((S$Eb)N`+n5gMa z({>+u-OoclKwb}udC>i05^+d~7~mq%F2s1dbggvz2p4gb9BxmqOm83KB94oBn9p$$ zqu`C2YwH*naY6ukl8ZPc6gkaBj0+KGxQMf2QGxc>`IYnA=eP*qF)re-Umy`5=ZbO) z;*=jR5H+-bWqq`)Dr&Bb?mZCg-H)%WyTKAHny-uM%^%s$NW|4qnfg`wCcT}DFED}M zI=*szTZ%6zLbSJj=btc`oe7kc{GY@8{ayA_x$=GThp^$Na9SQ znufQI-#xzG!B%%~n@L5J^yMauMIpGHImmRKDC*=w(zN~ z-3uoB!zM62tgE$zs^jjiCZ1DTrRXkg@$t2s%G zj4xZ-!rSUK^|lmWfRXd{gs)}fRi47v3K9puxKp_5T~h|f!R5v_m~KEZwWzrKP4yjh z$j25nZiC7-#1$m-*tUkmY2$++G{_d#Z|g{$9!%u74dl^?++9c>(>R&LrEu~m#TZBA G@BalTveI+_ literal 0 HcmV?d00001 diff --git a/model_executor/layers/mamba/ops/__pycache__/layernorm_gated.cpython-312.pyc b/model_executor/layers/mamba/ops/__pycache__/layernorm_gated.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..651df9e8aae8b0d5a7e3d88da7dc2936748c95bb GIT binary patch literal 7212 
zcmcgxZ)_XKmEYkm$>o1i)E|rbM@n*R+LaZ@K3hp-rs8wqGtNdLKwpJuwS^Fi-0+nDFGDiadqf!Bhu%0H%co}y<$NGb8&jcIF=vdd zZq{0?9ex@57`o{gy7?Ho#S%jvL)RZew;e;bSEf%@GmiJD`v>Zde#dDO zQT$`780ULv++-pcH{*hmc^Ubuzra0*{W?`AL{x*og@t%brLPA@gFOR*(aD1~(ZuT@ zC*k+R3E2iA!aCPOi6IuzDB;%)tElH5aSy3fh@0kl+|&EFcRGdWeCK?4zC!>iD|93l zFTdC|cbV;sgeN-ZmxY;ltn*eR677t}88#AR6YS)o5a&80p=FliJENiKM5r@9&ld=> zI2R30hXj^6H@~dX-4SSJLRUGw;1>AtHpo)M_Sna5_ild5WIKAE0D(~TctYdPIGb?} z?^ezPSz$IDR-x(AGOVR42rni*6twj)8}NDv zrL~&HD|I9>rSp}n`UXy2@gyeQA|Y0Z#ud;cx>>s2 zri$&9=;CQn4>HY~Oe{|?8op9Dh`N$a+^!St94(Sxp^;@Xp^i)#stgis_YZ85tS;jP zdmT+M(`r07v0AjuK_7;EK;6gIg*1z|z>y@B>Xp1u$$~XtiMK>2|K>Mnr@pHFfxbBI z|AX#Yu|}*d(_6tSinYa&YU=>ID_H|;6D#Xg!KaBe40>QFju7?*oQq^#Ss&~mjwD#^ zD$Txi#eS%K$CmHwgPz5efLYY7fTw53;(8=cl-)7@zZ-pw zu1p_9&Z9}J7p>SXrFFoz^py3fWJ|+=Ej@>}ELtnsP}aB7KGz-Ww1)3wk2S)nEVpPS zjpTQ#u}SkCHMg}rA$ssS$0*n%S}EDAd5LDR5z}usVar16K|R?5r*TYlawFehqvC1& zZfsO5w)`*7VN2NE8;3dU^uECHU>bf}f(TDVL;Reoj|(#_ z=SN%v(ET`CC&Ikn!eI%jd7g_;bD?N3%&3MC7m7`@Diw)`7=Q97dSj3CL6uhg1DD7hjxLsmXYRS5ZPmD=;@rbYddJ zspKulC=P4Eb5zliiY8QjA~+KgRJ6={f$05)q=kC5F{r_ipawgpmYCrA`4GQ{$TfzOQ&Eapx5hICW1r_o~=;)~LY@(sn9vv^az4{7t7 zQ8LO%?p>!90Y;YC=6C4}d-SC&ed!n6uU0-;$X!owwSIl;B8;kQ z*ZRFZYiHKl`3v2z%%7NZ)|b-=tGw~EstwP&M+yA2DQj!a(#_hyA39uXx1?KgND1va zyj$n9j`lQ#2SH1;T&^*eJ$sB9(e7}wJB*z9#eYP z-`QH&wfnaNS$l^>K0eZAH0SUi%bq6l7V8Jx+LE*+^UCP@a?at)T6|y+%JBa6)$8do z`I0jeXzn!NVBH|`W2DhAp9N$bxTJD9ObJ>tWLYn2)h|-d!e428qDeGQ>B^*{id^QY z^6{c)(5hQKE$W_y|6AzSuQrJKihLExDRF5sD8Y1XJRJ6)Q;>@U2-(qJuGF-2gQq$BChf zrKA&fv}H<{bd_1!Q{XDkXYqaWNUCi8*~TMmP%X~mp9p~CEyZ@xg@*`NB0XI4ba0Wg zine1;bm>}o%sxA8r&e9ZobnP6>9Z4@&S?v=73ggK@1={c^_=?xdJH#iI&ZuTKIxj2-BjGu~PFk631zFrHkf~NS z!baJc5ah!vtV%}1F%F+M)e&P8LU2C5#B#y-RPY7WIccUHO=uv+U&WoroefKDczQ-qsfloi=Q^;m7ciZSFAAzz162imP7Ux(uv38RwC5Jp z66S+kl-H0@8CWyMv|c=S0Smi?h1rJ!ZwFs}bF{BFI65%iS1?QuZlR{(NrcCk?;pWe z0dF~u;4SWLU><@We-q9}irB9uthTkw(&g3OR3NpGH@Y&$`g~1o+6>`5y&_#zZtPf^ zGgNcN(ww(Dv)svnxxx}+|-=T5hBe#g?7p&B!m#=MbECG!ra1~YVbL}g*e;my$A 
zjKhnm01TA#N=SL`kZN(QbxYk!K)JYMX~|G68B0qZXjh~wN>$F%xP^8sz6|BdSbTY- zC1b41JKSqOl76JTaOd`xZ^zM=q1!T!HcVZSRzURFj>DIseHn-Ek z_`%7KI_`D+8hu9phW!2_km@NF1dfl z*znNq$u!)^*+(+;$YZDbXV*aTg-!c5%2l7)4&7MyrKOn6P|)DfBRKEJ!K0+{I0p&zqNK8K;qA2O z%-}VRm@S3$L$eQi60o>eD6uCJS~PACCc2v>z*MM7XOXzzCi0Onm-y(A6b)g;oD<}DhF zVG!Q2^~2m1u&Nd<#=x;-56O+-Y!qi>kooo8HTbATj@3Ln2Vo!X&XaNQ#M6uMMPBn; zezWFvxi@gPf5w?6N^`p+s|M z02z2j15xGPs9Uc~4L+hxYbLlxw9284>Gf%4Vtp=0x57=+NWY(0O~|^Ri1NaY@dQ3n zuS>7r8HW2>(HUa5cV@@Yo}t?_j&{s3APvY9itp~J%~M;gn`d*@=Yai)N6kB38`gE} zPwnYI-sawe+n%jCXY;0BJ#>V(rQ11&7vIO4;Hr|R?Qnz5)0Vv1l6N=d>;3ua6Hi?x zhheoJJNC4%;OX&AbsX-g8Op2OT3~U8hsR#^Q^V8$_NQNNefggsUmYHUw~Icof$GBl zMmSga4@PiN2uFDBO`{Mu%`@yG7v_cVB(IXQVF8Cn;nGFb1dGNCY+|0%BBi!)nr#bj z{}@{p>`!;YKPVO>>{X5d8h%>g?U3)22%#rd0@?mZoQ6fvxl^@!s49!9aJTkuTS@rDey3xsfC*iKyIK6&)--T;c#hM$}s*AN6T&pE$ zGhWC2I$S$ZtkvUMBLTPLccpjtn{cg}fGcrCitM-GnwNmPI|Q(OAFj0$b@dyO^~io3 zuC)_Y)fZ`B5`o6EayM9&Ff414o1YCaIJ>MUnAc+5g3i4H& z$vmgE1o41i2!`|%gXml1Cnq$68WQETe(i*AP>=IEzhS~SXvBHFpPDcYnkLMH<_XK7 zg(SSDX_9Rw2@T=ZjF-!uWWT+Z@rrCo&V5Kqxs6&e2ASi=b%Gcy@D{K|AgNMrpZ#Wx z?+_nmii3p#=LTGqnhSchj8|u6$T`NunBO6Mq}=z0d=|#~QhYY1;HCKNOyNuMIT)v! zZ-cQGdClXyMl{U!cl3j$x5>I4&k(V%juefzye#Jn z1|Vy34~Mx>aKas&h8aCzW-96s0sN+Lx}Z-f`J+R z!KMRE2Zl3|kNct^g;_{E{~Jgi5Fw>*T!ZU#5*}vi4VX(Ig*vWNvPx#$sAQGQ7A;H) zqmtnXc@8j|8KBA~pd)?Znk9JRDp9BH`xmKv==3wBeVv{~3W)ltF)!;CbrYV@n5gv! 
zM@6l7dh#(TYM2m*80p^+G>0Z8nJ2qchmRa>yK%(Z?Dq{dPu>oV1q02u z{QimNi6Gxi;czI(Hv2ud!4=IDo{1q(Gq^y?xjAm4Y4Wy6alw($q(98DMbM2C zKHPCgBE;`(_W9Riug8ZHUH8X-UHw7rquLMlKidD%&~HW`kN)Py;~P(^KMa0G3Qexn z>W>!S_5qv>POgCOG+)(Uyn28pN# zkF-p(c0?$IL~;p>tcA^n6LdG{S|Q1AyT(V2f1~+#hTj^N*}pcgRv&+IWz}{%V=}fF znu!)P6T14{9p_zFbL@wr8^=BB`vF7}LvDi{Lxf3)DNUbfVM6X9pNDf#hS)F05saMRd><#{r76mzhT>O5|vlBE`U9)$24Ynj)e- z;GGV+CxcU7)*T#iw{GmbeZbD4TSSET^{utqcA>WY3G-Ro$AK?udnM&95V}_{o$0@P zap0;*^`Gx{pX)l`D_T`!>K#4;668y{o$FwxZ8aQ*{754YW%kJCw|+*e;%(c4SEu4V z${%L$@fN;q2EugKqH2juzwY8KvOZfpMDpB>6xO5=3AJ)V4{rNH?{vpr=x&rehltJPGYeGvg~$#)qVm zyawvFW$G$&>NI>SOYvHew*%7=zn0fQxl$?Xcs;bQ%CvXPz-^MR&XJ<$4ZyKGhht9; zhk-xD7Vs2s)n&NqbLuJH1a%FWy2hM36MuwnK@^pD@IoRtP#BVL`7rZ>g8kaQCI2fj z|LjSyZ96P5h(FRW#E1q>XucS0a}rSzJwq)~bVIRoia-9n+MNb}we#h&tXUWMsGZ-g zlshthY3Fw;v`+pY_{s>rsl_zpD3RQNq(&Vmmq@L@Z@U!k1?zqRvCO8NW?+SqTPcv-JaA)?N}(`&W|?gi9b&)f;# zdIRoSGw`^a?NNCy-yPx9To0rF$e_IRUD%v|&ILMRJ=CqXn8WC;WZfmt{@IKC0q%cp z(Pzhx+-EOD#PPY_Eqd+zaeKx0mR`P@wS>@18NTLv6+3LDP1kpgh9Aj4eJU%Vce!;J z-#6ErJJ(VCDx`V9~*8`i| zf36$uRrn6_BgM<*E&4n2Blq_z++Ef0OL<~!?(B~{lV`8M-FvG(&;7`KUWNN)(cI2=||-dDDK%AnQ4x@+zh}+{$XR#s7v^5g9Vr?Gen-x{mCO#Hr|R6m+{L*os1DD}0r1Oqpj}nPpbUnpdf>0h z^ANWLeP;($E%NDqer`bar+O5(i8`u}MJfSEr1MPsIMxD%XNWQ?_gJSvAPILv)CWTV znbeUiS_{=#B^bp#$p%MR&xG5@h&q4J!vLZb6!pG9=#@jF!NYn2qh8TC0yUw7t4h1jf7`h7+}nZ-Parn^-JS z+<{Rx7@p+ZtS96PicSsyGsdfyEjn^&NRC*TLjsv%Q7#4RCKhCf)6(Krv?*mQ#i|sM z1be20&R$2oAv&*fatnLx%q+VB#fD@vc{qrI4I;}!1TY5M zfFi09B~2(PLkR+K*lLvQK?z!%i~30)MS0N8*wJ&k{zSp7Am{{!_a>m`@6B#%1cRVd#b!DW!;sm zJ|tKVEr*k*E(=F43)ahDYc<=$HUr%+GO?7)rwm_1ohFHrSM*Ot5fwrct8V|Vh@6`}Ep zKwSYJISr8>uwH4+OY)Z*B7MMLP!zwG@CzmNf~_99D=3cNTC@pejY2_V!E!3n^`)^gMOowa#p6O@lRz~^b!gh0jh&76 zEcP!KePKNu>HZyMo;SvfaYN$T;_$+Cpi+m|s5XIW`^5Qa#m5z^R9D3HrKJeD=(X5u z@z)k#S?YXrVwwBGc6`l--1iCl>CDG7tG1q~=1WUy%3APe)ema#*CrakVZ)EFe^&H` zvwO{XPH>+4FV6o~@t-PIo&AEfKiUDzUpclVs(-m7+L6%<29?sR;#g@|HW*LRsYGA`N7!Wy{5FDFqyvvw-(M{ zi(QMm5}Nl~5=`RAg600DRom{E25q|_^{?G?ui179wq3v0d|-ZLUbXFCr}h3-=iW7E 
zo#3qdsQNeckLy>RhXw25yxPXQ@Aa)&cM8^>e=+oMd|`amS{EgeoiZxiKXdOVQH|Qb zBMa{3&M&C8HR`xP9si``)1HreR;jZQ*HEOqMtJZT-@@4t{sjemJh#vZdBYGrO5g&Tbk?2a4-M_J7s#?DJr1iJAA{{BK zBi<3~NjMXpUy+N47L5zz3%dmCzDW1e?-zbL!`*Xt&P9Kc7+5O)dtK94X6yXk*j~7i zI1-0`_&@cgn}u+suXN3~yN z#J(QqB)j~Bt_g$ZXNJ*kjC?7;_$0gl-z~jccDEeu`iv|4TQVbn1DP!Bb;3&k>j19I zj$@VwUiKAW0QMotP334aa>y;@0C@XSax`1W(eT=rlB3NlM*#s4IQ&v_ba~}u?E|3l zOUco1AqSx6JG*(qrm{2{TO0Dq%8t#z8$p(FQ(3a#jCo~c$7kdzkVS1OOV*3ZD=VuP z#oIQOBkN?#D<`Xy4Wc#=)NmbWvjy*#zgsqIhp5fN!RXLJ?A_A*wXxDzd8}*%Yr$!H zh@YdF%_Aj$Yvh|uD{a13NH%ZJk*>?f$6(NYwlG%C7tX*tBBR5SjXeaM+R02(&@6A; zSSeq)t`B{_KJvF#jlgiH2x2kM*s+DMzcXVs-q{Y3mWOjoD^bt~m9OOseVFe!d2?RQ z(h+RtOX+Rd*~c3jh02rN3@Lb9ctDAv#b@yjt*q3yBHg)) zd4qZdE*;SZNQRnq@^;3APN%RM#vCYxk|k5J2JBF>Wl9AB3QG1&sSw&at^*!2TQpk? z_sx8oI%POUiZeKEcA6wM;4ONAw;p&kFDbVec&V4+oh=EWCmO!S^K6if!-moN3!*L0 zM#s4NDVb>^kKr%@Kp0vr;){2nJHseEHSd!=2_Fr7DEMgMql1qgJ_h(0;bVi389o;H zSnrduQuvg^rwqz=l!uuMR)-Ricf3k;Y{;R+$A%nAglx#6oU;QCa!Wn(bEx&m&!N^M zepmN>4qU)1sLOIosyFY{;SX zvmu9iJSN~OfltwW@-GccF<|CEHYxO((Y-c~1x+xi-!1&%3A!TH@c;Ld?4)7m$P zDYCA_a}!>n%?MI*v7_M$ZyDj&uApTSsAIKS?Bj_)|Jsu8oElc?3ia%l6?~|V&5kV?Y@T@~&@$9`+26dF3-*w2z}FJsjs{8fd>a#L=+lKuf?M4VZf%+>RZPl%rK`TcYt7@~)~H0)2nQ86Shhkpb@$2;wrnDl#(t<#{Lep97# z1KrXqZ{>*c1)x_@d)UKy8`SzLYGibvWhFi9Y#<$HqYu!Y07D;Cgag)k zjK0izJ$`?17ywX)h7+VTP$$D7I^+#-LAHr@aj=Err8y5u#ypc=+QZS)O^Og?Wcqxd zankP@_RS~2M=7LZ>C0y4tfGk5F-OJzFR)dH{|!yL$~S7Mo7*X zIoQ4;&@r{=aFCz)bM02g%3>P%G^KoRh>fm43kiqbJ?Tg!l0ES_vrh#ZV|-m2ZPN*4K0kL7GSZ36YFO5DJ3zqyju3xQ`f&Tg z_Qk7kKJsTm{mIpBr#>tB+#r-oM>FfR*ecoN89GEzr+G%h*E6I8dk+)XcTxUN5cM=B zJVP&F5+Q;KIFAz;A7W(x6q0BE7p`aj2NKZ~2v4}DJS-g9HsOJarf`6}8TNW-ya3N5 z---rl|3}or?iuX<=!V0L2e!z(x8OV^+I3^R(-&~sWr{gU@pG^UBcRhAJQFYG^=Z80Gv$3>IY9g}<$HS6^lK0CNUGME! 
zrv>}p9lx2-vA1ZT~n3((4_F=aI0ow_si_U*Xq=RZpo zIOZo}6A9O9L1k3?bZ7O$Hy7R%bQKX-oj(#g5)UUv7Kf9i z^(&V8q^>?`sZUvK^KG%V_{|4X_or4Yl}TM?(o&hqu-{5>i)WIh`&TUcle+y$%l=e> zeSS1H8u!G;BWF_;y(xS3nth*O-?ucdbZ*6dC`lbk+7G3S zl}TfL%D#8a-XPc;mP?kY6?e^uV%e3{?Mhm9 z{j=p#q$_p&#FO^8<3aiT@vGZ@zzHspU6qkK2}A9|8z};QFnzV_~+rG6DX8U|atYXbnA($!>Ef3ok+8&-Jzi?NFd$GuDGT145Wrqm%yy_&KgQ5w{gMnq`> zg|Veyy>}tqj3@`xlomv3RZ|Wk$|1CGiCp2icQJh!QC?9~jvz{#nsO9TjuCCAl1F-? zqw|4SAo1#I!LG%!4=Nv3E|aTOt%B`f^75hkyS>imV_2kelVA{nTS}&&ckwin|QhU@hKNcH{-@JE2inc}9 z`>?B7`^1yJgxM}b(^9ybTui@;%U6{0Rb1{@$^*E34a$Y{J+U4T*_(b1(_YV^y@6?i zIkY!1?K;rP({JJYPat2CcH{iNg8a5`JUBl@5@qx^!zlm1w1j#8nyFqe)&HJD5X*lp zIahG4LHiF4`fG=cJ_6nsL<8#$h1tNc`j`$nLWJJMo>xG+(Y^ztpb?A)9gk%T^Y1*! z2q)}2}*FcQ-(hqu&_6 z&m$`@g0IMegVp2c9e3R&*E164Uc{Rt4&cm(5Gw2GS=|=0FjCEetUDXY(tnuQlLA@# zm)0vkDv{MozlAKdUioEZ_0orw-|OYW9^rUB>_EX`b@WpK*c*Xk<7W5)h!u{(+xP3kQLvn*B?0kU?Q&*%A)Zpf)W6DCFu4PV@h( zYX#)M!$EikcKjyeR?wptA|!L9DkDQeLx4#NA6S#Iw^4YoAvdfofe9~%=>2p{9YQz` zwkihg2l!$*JQ4Q8$-!H&Lia-304&6%faU1m2u2KKItoLjErLEWs8<*O3~(US!U_*{ z5mZDq0mR{*^g;y#z(;6|K9F7f=b#+xw5HoZAS@!A`Z1EQ@67`ZE%5)9hL$65)n&nw z=_b7LY{#~2IU*S#2eOfezkcYh3`QYoZ>-G#)TQxiX@_d+4YvJ;DYj&}f_Cu7#z6Ma2qZ>4w$=bc0IwEJXNy;6W*Jz=di^5Qm|g^UVNasfJPemAFE6 z=q};$`$iEc3GPHu2e=^ua{wF-VgpEme9W|a#K)pD(ugPQ_cHEK@P;=ahgw4&JUc?S z0a1MMmNpp#8cZvEC(5x+1C4)Jk9)`B6`QLk6DSGL;t1fhpdH*0G61I({8|0{o{}wEE z0|D4!NC4oxs39oR-RV2ian0L&{N{?W9Kk*SYDg%i^)7#hk9VvZ%MvXs#vK5Et`*Wk zAuYq70Hdkdsf{IVR)s;z@`dun>i2gnIaapSuM?_ZtOtr5**-Z1OqLcU>6G{IEkLhv_DSiB>Ek#W%=z3ky2lKyl8!XBhpqfWq| zF@eXxN%rr6hE|!J6}E)ZT9PE67Z9ZFABmm+Ow{}%arnOzhXmr#Q*-f(`C6heQTwoQ zp>fHuw(pp*@7QX^@h7uSraqnhc=q$xR! 
za{vh%thw|w;sCm)C{nscIt9`h?-gn;C5tYvlCP#nr$Rj|?CeYyb*+-!TT-7=RbLb2 z?&7pot2aE?>NOfbAhR8$^yNZDZ&KIyT&sa91EHbrR@|vbYlbvrb9#&j|SJWdKCB z5GH%7d`D^rohsZ8r@f!=d|gAD(_J!yy(m>wo+>U+IZIQe6^I8YyZ5DPYN7b`9*!g( TfRGw2cdy^M{%tKP;>iDhGNos= literal 0 HcmV?d00001 diff --git a/model_executor/layers/mamba/ops/__pycache__/ssd_bmm.cpython-312.pyc b/model_executor/layers/mamba/ops/__pycache__/ssd_bmm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6fe182abfcce63a2523c8f204c36c7118a6abcd7 GIT binary patch literal 8508 zcmcgRTTmNUmbdkOwIl>cAdnF+gLr9h>_9vY6XG~F7(3%}vbGsJ8bLQgpa*WXjG>wt zc{f`|$TsCrOMAQht3V+L7h_bbsQ?YlHwqtuj!(o(uRYD!NT?ioJ;?YI=>*8;yu z;x|*~=ipaE4=={AeIb7B3+p$%pnjR(LXpogUK4mTH{R#t*XH@nO8p^Y2vOF5Anw^d zQRQX?=GBs`DIsO2Rby5qC0D0PiqIWJ=%qln7tm8ygkD}mkFyBBs|dZKh@Q$K{O%(3 zsv`92BKkc==!c5XYl_glMd-DK=_3tP-Fuo(b{s8#2aQw%RsS9_XqqB?ja#_&yiX;l z*odHGnJ^oVZKZ&=g$MwzLCvQYEUyo|b@FWBV&CAYz_~rx`90X#Exe_-@RQB(}VmMT3DB1H9y*3)ApuGBzGagr{kNI4cn4GU`v4I6dec#67=;59TUhrVU!M1f>B1t^Q=Sr zS%(69Sck?1`;JsR$p$3r^1_%RahrEpqJjd6G3D4mBNb-@6gxFRGuS7L5d(ZO4!Z!z z1yix#o_t@2k6RWTpDSZK+fl~xmm=<|KH1?I!w%e6*gQT4+V&S&a>m8EIQ=x}pQ(Ua z1=K;;G?+G1sp!AJ;m&Efa`~8;aVvNW*{a^Pm2*qIRlSe|j9{jkb1*ipnk&Z^a#a*D z4TIv!FgAV_+(}F+Jyzh_;nZ9OmdTIZqp;x_{JEa;QinLic{qb)MI~pH`~_O4Nw5)d z1hkoV+AMq82(Bt;t)kaLhpU<^ll_XVovD$>e4t!@J|sEsf&5OnpP5=1Qw?0}=xa8tLp9~cgl(&X@k$&DIZ_(Ecb}Z(6Kt1cTRnG(^GK_#L1ATqD}^iL z%12a+wOru3xTndFqmjjjf>bEALUW<@tK;(Hm*w9;*Zv7cbyy);XkFAj<*SBqSoJwi zD14>k4s%Y)U!VnZOv8SqwD)v!z*A`7a;5o|)^va!^-uZ!sUErg^B`IJf_5}>jS5Tf ziMy|GOFY35@z@?QTwb%djpEa!NfyMFd7I`5Gce2Z8*lhd{od3Q{fweCK87$Ru^P~u@Eh2r)egh5cDHJ zWnVfXXmeyW+=NV%q>GNwQ98y3*eOAqVB-w!YZbJ?zy!+(+9Bzq$M-A7%Z8J=n|J53 zPDoT;?vmdP1Q_essgK7*s7#zZ&1_Lh!wHVZ4fbVFg9)rENv!I&@Q_!)X zmA<*pXfB4qlvs`u&3$5mV<&czVhy1A`spA#&*8r;5j_1grJiK zpU6u>Bov5*qhVH12Zx6RVq74C0x=}0!8?Ls$3=-mE76_fkpO*@9!|nE`6Iz8no0Ph!RS!XAD>A0 z6A3CX6peOFObO+QIh(2OMB|npY-+(vpsWyH`ioOt@M7}w_45DzxvSpDk%S| zvTE72efrjeIDd4P(bcrpa)NiAfKh#AvdlH7n==F1wzX3KXUyl*pH6>adOdY= 
zTZgQrna&S~7DDr(#bCC2-P!T)q<@ycUOt^Z4G!FJRD4vj_{O4Z-oMsxly@B6u$5=1 zk9sqq^iQ%S*$XSxeB&{`_84zF_E=3gx*j>ow~boT^o6E+M*2o3xLEQ5#~Z3<)f)zD zrY&2^J6d={%dBn>n%0Rm$od_z4v+gqC+(4moI9L8oN+C9<~?7Q`1lgvx~U~~azkUD zO|H3`cw5s-E&&`yrwsGdc)>iY+cd)b<;E)Y+uqgC-Jd*g^Zv6uIq+Y4jaK&r z_8S^j#{T*IA_g(@ZPjhdZE^u|Xbdy%D_3Ek&gkAJXY?FVsJ*b%INTKpRX_S`0^0F~ zv9C~se1}r1@h&)S8;bC$_wm_*4{Y9FRiYndfS)RgW$!;D$s zg#vNlm`xu4?ziwQN_35R)nbU4hxn(Qc}*ph?1aDEK3YzD+nj>t#Yo9QYoWFiZ&^_crHZ**5RaQ&yXI z+?Q`3I-q^`$?UinBJ#YPGHMRMb65iO#3bfKYiHZ8{^4LU5sZNBi1%$KNw;|+^pANV zvixDV%{z9@I}&HSVQ-Ij%sUy5M7%?^H^>ZyStiI#wRz*Q$kbE3Ctd~SAVcR(!NUwC zac;oUH05RESj%vnVd!DDL*n`Y%_f=Hu2Z1Iag$CIG5U2B!1~7Mdru( z9-QoI{2ftUgvZYFyZMGs#o(tgR4T>11r@OuJ9UX>do2HnF||c%UoBwD-*;4`a`p4P2*WJ zdp7ij+c$6B%&7i`%Upk`uUNdWe0k||Hu(2HThz+QZbVaub&O0J=3&!JUhK+IC8)9U%G$O{rgw8 z)he?tby74T=oqa1kY0$*#~zv+R)}Bg@9IB0_Ib~zJrA3Ex7EmKo-@C177fT=GB=r? zywkYczSRB@;^xj@cirv!jQ%|OY4l<1N#KNJq5u8Wys^ z0#n>+_m@UDZ*(tqE_W|=FZV3<{F||5)8JX8cticysN87!lcQ8r!}rhd)x!6&4wYAk zdR#N0x+9_y*G#C)Et+x7f-34n64$KAW*2R^W=G}KVhPkl2l5T+jww#$s#qAEA6>lu;kf9+lnSL(C8oHQQdO8zt(5X$${}>9c6o4V zaD}*YO{~EbFYH57aE`kg6o*yj^pv=8>U%RyRkc$qkQp zwf+IaTlzO#6&p_1wq65nFd?H;Zm(G_eK5pZ`iiw5B=23Vd#3!s`VW$KJoo``IaAEO zn$;I?zeseDy070QkkPjtBsx_pk%B+)4Y%F5+@k6NLFhy~f=y+aw}>SY>Ocn5ZSEE) zmP%+Dez>`szABbWs1rFW7B0_U7F`lrfy$f<-SgdIrG&bX-7&|dIk8GYt7T@7gdUQa zYb4YwGuKLJ9fDUF*ezncgf_^sjS_koIm#9)<}1V|32nw`^?bGHlh8Ith&wqDCsq6IycA2!ScvqpBXlCaur^13aMrK>z>% literal 0 HcmV?d00001 diff --git a/model_executor/layers/mamba/ops/__pycache__/ssd_chunk_scan.cpython-312.pyc b/model_executor/layers/mamba/ops/__pycache__/ssd_chunk_scan.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2e72d7d8bed54014689d7f69bc921e1a76e82e86 GIT binary patch literal 17232 zcmdUWX>1!wmS!gJ`z9q(x2*f3CF{Oy9kwjVvdfli`79PKQnDC-XkuOvKMXOa2aLq|Rp7{@51o2O}!d>sD3f~RL3E~0a zCfwv4F;8A5Nn94qiRQ(Z#q*NO5|R)R9`Yt5iR;Y%dL%bB`w?9Gl4LQVUT!hRF?*FD zF3W*Wa?|jAISg+4KJJ#drEZy9{<+w#a4Wx3eJM_lC%Ku4Uk3bYhF{~>yam4mMtC=V 
z**o#eeh>NVeBVxgEdO2j@oe8^zIe7Qzt*jL3wx?Tn|I-trTH~{`P1rY!ma;;{L1j9 zh+T2;8%qZNFnYu{Z5eQr0B#n*Edsa|@azooZ31|f0G=%%KSzMyE`a9>$jKAn&lkW8 z1mqM7@D~Z-#R76l1o#gL;H3g`$^`h!1@H<1Ih6u)POSicod8}hfI9@_ z9~R(m5WpJ+@FN29n*{J?0lY;3Zxz7X1n_nNyh8x*6u`R#@NNOTCo?{E)qV8Gk}p#m zp8LJ5RTB8m-RJK8F?m^c_gGc`OUzS6Cgrz1KL5;uw@Q>01?G~nfNv(S;9Ze6);HE4 zez^urFA2CwzC4E6#Ys)ysWbf}&at7($DO15YNz+rMqc7O-Ae`TB}%Hix8|MxfNR?0 zPbzVF+2vdGzbwKm5=^9+;7C~|NvbAq-15#k{WB|`q~wNsW+}_V=D2 z>pcZB<&z8EshMeiQatOq>%+0tXT=0Z)1<|F!{c)Mo!)8R!mUNW)8`7zEF|@Qn4R0> zoV@O2#z^Y+0L-5>?$syp_|Ez7L$hfycbtCD(wxVe)TX-KaWYbPRjfTX>dHa(mHq5c z#q6983%3GX2TlTpm=X2?{4v~t12Y3)Uk}W{Sk{!%+1(33suMRfi!uEUurXaNsje`4 z=ZyPKsxzlkKc_WkhW`B;^gC0lY39V>wdGXc#$y)T&-Ofs;R&a8U^}NoW~W5XcxS+P zY}0jDVDd&XD?PpY)tB~fT7bv<4oh&WGD3EE-VXgtnUL*td#Z~vU@4Rd*H%EY6)kY^ zZJ6P)fl^bQm^W>WV>}&xfZt8->(xt<)G^AM(!;h-1Co+}js>Z7EZ><tJ;Rm;r>R-hR#?>njDf#;YU0r(c}< zJuNrN(>}AM+uX`oJFL2hYV(p*Yg*IOlswo6kNCu&irmDMC;4IIq z)fwK186LBP!)cFwkK)GuEw4|8^$ngms7df7Ug6+DO>`Jtfym5X`_;2lhyO_0yX$#l zWm?ZRyq=Ui(<5W1$3@=Sp7SmSj|YeLTPJvevkt56o;8DmT+8YhPp1s?r^YB|rh9!U zW~S7Eu_$fE`JbcCW%B>GpJRH(&hhlj@Ms%68$8GPPw>6{zJuqwIua(mxxcrKa=Bd! zQBwz=no8&l5ZC?*WRLT7%V5K|_aSN=$Eu8$d^b;O{R2E{;tjJIJLpZ6E|}K)pb!4> z{USKR)AtRY!QA^tf@i2!rgSDamMV>X@;@X*aO8WQ15sp^FF$h}+lyT1DdJ$fpv>6+ zsPp^xCS~DvAQQPRz)6AO;O#T!-0(enX6@@Y0jG%bR1t@*UWAjw`Mvwm>LsedFAARY z8Uy*92U9)lDTO^F1W$6iX7zG_@8y^IULGc|{q0-$`GxPzow*wZ56)d9?B|)H#Sh=% z(h=ec&kr)~!b@No-pKqo{RfPCmEw<^@vgH!8c|pGdjj6^+#FW0zwzZ|n#aXI$~>-7 zSEy_3_*8?h?%+xW#}BR|_FH_KTYaBr_X5t#_I|1NgIC;gE@G};30}!$ZHj!`yB=q& zH`-}})p&151y8dand+9!e+q0a7D+&oG%+&l3$jp2? 
z_yguY3G<)i=~lEChqw|o$@vUp3FZz@iNQj-DGTN8vyhk)Q@NQebkIIje!3qFwd&?+ zQNoSA_e;4!(|5Gd-is`fxJy*2Prt;8AjsxQTz6*tJ~LE&@DM`0{6BC(u2eGb^3Nuv z3xOLR-!wNguuI*6yNezl&i+)9zT^1EGyba5pJ-F9ecO zwug9r#+#HbgEG+DKQleQ0Ha9#fd!wZ%9E5qdUi43OG@vsKP$|S949|9EQ1s#{?+0v z<*j)?WRr1K&1EB*uKf(AN~Hi9Xn=uqI5o(FEt9v{{ADU@>Bn3Lv&k|x&+D_pqPm{ed=iAfbE)tJ;^Qj19)CiR#&Fgc7# z1161_9KoaslV(g>Fo8q~;cLUB9g_}Ba5mD1bB4YyOu8}IpMv!DVC*O+$ABcIi!*NL zB>QuN&Cg>5mI(jMvp?RXDV3LJa^5(*>I)UZ>tJ=KJ%PGAw_4lVW?MauxtZ&Z#k+wU3tq?Vxw(1A{DRvv=k(n1 zOx}WR>zH%hh1W=qdDr}Pmt$ek@9_JhF(+Mk7m4HO0>~rASlym*@WK;$K|c)zq&Yf3DG`8=+<}J&RRGghUzF7DgRN z+YmdxJ&Kx!kah@3hW3pkjpg+D4qrvu;=kqf8qYqAbcbPZZNm#4bgDzTx>r(>ww@Lf z%4}Li6LPakJxZe1Es{&B{|%-&+3*qb-t$jAdVmzE(a` z#)@NCcPsmlv2SfSA=vU-Mp&~R3~dZ06v}u}Lu_oPs12#}w&%VRi>$J>QJB2WoKWaK zS_+r0FNgFI2U1jRjc%9zg50h^wSAABsCFDF#$Sm=I@#I@&`xi>KNXq^Pkg+vHkgoS zNBVc zFc!fW5zA*wdt-n|WAh7B;TL4IDJn;%%6LsTGIhg(8+%~+wE3@9L~j1Wip`3Ip(tTH z^hIwp5FJFe+O6Bj)(lHx`Nh)XtZyAp22jU)$a*$m%XuxAL{u@x@$kSKJqJYP;7^@5%sRFG`jq=rC90KGOB1ry4H`xw1Uv)C3KeX_-75_ ziO@p4i06;FpY=upkwK&@jaRfIT|0Kn4(w(*pr+W4$*OC=v!+!j|{h z*{CEkv9S=ZY(uuTgt;hc`C=)Wh0Il3Ct_zG4I^_47{b=}0<>33jk@*TR~o`n5?zkf z?o{+VFDQC=WAjGBS{&_)TMlnaf3E&k{jBXYYC8?KEi`SYVQmYGA5L#hCoH82d&SSl zt)?wGvLBAMZ1;ZCf$ZIhto(@ki{8leMsUl5vg%+oyW@pDA4V%Qh1IY&h1Czeo8GPS zUtf82?CP09L+geEVlL7G~m z$)jrs(_y-nP-@nff3g%Q4KE{oAyO2si+=^-^-|~(7^ikuTLHmHXW_YR++KqeHCt1$ z@xO0~O*~qNA3KZc&mzTH@LRoXZ7{Q&pNKX6y)oAM=*;#!svSX!k(8GMx854Q9o3^O z2husf&#fiV3-S76sO%WB9$P#8N=6uRUf~t{mDUh0ixh8EMJQyfiPv@SYI|roAyvUr zLa=KHdm>k#X{*=C=b-;YXd=IGn80utnq& zo$6n3KnRt5a_;`+(B+ToY4{{U4ZbGuhs~(^IR)-uXo=iS9oqJ|Fk4)$B zh8zwJ!zMSY*5%;EO7$n#*?r5p5wezT$zzhgcE#if{K?w3t7}J!_I1&7*x$>cjTe?<_Fmu*^$zyDB8bktX%I)fH}^G&WD>HbZ&G$ z1D7yY?Sh#^LXjl4gi70xscrkhePki@phi-?jM~XjQ zMT)$2aYCUFcg0JNATy4?>vHg#aBZ{@nGPex;dL3)47WjZ15z~LR$HVtUe$^0ok-F7 zk&bR9bTxDbY|D2Jz5giJ{8DXAZ((ah8r7ogI%KGe$&jH1X5#+#TKwF3*nURcuXTpd z^-oO?>>Kuvy|1K%RtMV=tcH&X(x`Mxi;9|2cJr>G<(Z)!8QTA$UlJWh`YM=@sfN3mWYAT`>beBsr(>fd;`(Vf(ib5` 
z(YpABL><=`$4i@$9;>WJZD)}F43eCI-LAJK6giRMFP63-u%kS-Gi>9GXl%hS3@Ko`ZTDFDek_M!|q zP;mFp5N;(!{FHR7C=#feQUTRaa-dpD1XM?ff$Av*Py;0aYNVtht9=<+B0z7I;tP%z9qvw`U6j&I+`{ps8~U1|FZ}1u!Hc^KyQTOp0{wB%9!iz!c{G6Ymdq8v zpH4DYrvKuh^jws4ALCxZfu!C*`ZFk~K?G*y{u%t&5G2h9POp>(Zp`?LX66^!G&TN4 z*jwcDEZv%cgj`W7Z^AiGrmB@#~xo-qWl{6_CJwD#cB(IZl{BbM(j8*Ps z(r}QhWIu`a;|@h@aQXfWN=Zf693<`BuAY_R0|~nNuDJzBkNSI{z_R=Ztl)2lUcXx- zo-1_s)gg6wI4X(0kBXXh70u68hWj<4n(%Za5Iv0wkL;?N;8dw>excCAqrxktG)LwO zLvLRnCfI^iV#c~Wg3oYO5*~mJ#8xlj(j`nTV{!!(c`zMa<{KCeM`Ze0EUCYwKgFrAO{_b+kIbT5hjh^SJ)E0i0qMRw8_#^2?@!wB z3!1%d9L}!br&#)*WAg8S`0=Zb40-WmaIwy^MD)BsQilOQ?AQ{Bu|L zc6Zpw-_LF}eBJh_E!O*GCo;4?nOPg64TQ!XSC`RhLYw(`H<3m!&APhFT5SNN@sTfLh2~hwKzB~E#^dVd=CE(S9a#I;C z{#-+sVWgbMwm-PLaW`uGl%gvzQi&6+Rhw1O-p_04DvVU~ks6HD65xZ1O-0oBxtgxS zNIj92{ouyNjmXlcv$O*vhxteYMjDBnl4$W~8u|#XHGv_bm@PPNAlVF8Ek+g z^2(m&)}Y*)t=^}(_4IKDK0y+>rO)iu$X>nG@YG&M4>It{gWN+5JWLV=We?xqd>=%B z!ABVI6pXfKs!;|zP4Z21hQZ$3$9I;&&XGj^p@*ZJqg%yK^J?ia1{{aMOCEM_c5fM< z=GM^X8Suh>aDo9ZfZH>5o(iL<6j-Fu}Z>E`NasBlin7-iisb>XEsGuo!VYi@b z`}$LRFFlW2|Jh3twnBP=>A47NT)s7qvKn43F#w!yq-y3y4`-Frr>pcy<)UkH)-^o? 
zksqHRlScdsv7Y@;Gv}?q%$%S3=zzULNf%u{zbC17I#d6d=5+cV;$8|DylM>G^1=_eL2L{@gfmIWDS2wIRwwQ=o*G4tc=NHZkV{-*v# zv63I`|7Q7r?Ea?5=aA;4(EJKx@7_L5wrOR*@sWhG<@GJHQzWAAz`yt_KD6Jn)1p%n zQbrfx+^%LrLl-jDBK*#;HdIR&Gu0BD13eWwMIU0SaN{0jst`j{TK19*009($s~cD8eine}nc_Ik0uVP- zGZ|z7h?uD^Lo5KX67EEB2Z1oj2!%MAn(ZhHK#&9u&8CJv!vYW?Q^q^X0uUZk!;Y~4 zgvOK+&a(i7#niA9EC3-9UQA|hW3qUx8muqm^ zfN03bfN%(J%L4FUA}Oc( z5erahj#U=;6J{w-ZJhc|kO96E6C%xbcbToQ!m{`jgj}7FsS@U_*Ew?}$;RIQ3!aC^ A&j0`b literal 0 HcmV?d00001 diff --git a/model_executor/layers/mamba/ops/__pycache__/ssd_chunk_state.cpython-312.pyc b/model_executor/layers/mamba/ops/__pycache__/ssd_chunk_state.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..021bd33392eb5144905f7b18f8d34f7abd563cb7 GIT binary patch literal 27541 zcmeHwd2k!qnO`^V`yvUF-~rwwctN}miIhkk6sgO)HJY)x6hTsyct{OU*ET#Hue~D1 z6B8WomO$533FCEksLWKB>)I-F99OK$rZ(fGQiT?^Ar_@SP8NA?7I|Kl^zB*X`B~B_$f94EMefLwPEi(laTd8Ui@ZcyK3U}{ z{jTD}M1u;y7xk|^)}5X*kKR-Mp4?O6(R@_-p`1@L>!{a*zK-atJ?fXxcY2(!sIPw| zef=w^uY1Mx1^p_I>Lu!{gVyBR=jHVENqx0A{VDAf<*EKLz2kFRBRxJmwOQoZp69wO z`Yw=5J-78))|m()qgPk&Z0s zcV>}yNy{gjJ>3%Xh2anL#SZfw&R(xF-{I5$D$oBP<_l@Nlkq>}e%HI5%<~z@zbo!I~Jo_c)L8FKE9Pk{3 z+0f)UBv}5FvPrpT=)3eqvzGFVcn(9z0-+?GclEqnfA@k zeTsgAi;ii$^OL@7GYj6BVesgQ{YOSm4_`budiYZ`!#`CZ(I7!v{L?Zd`+&rB#-M%zE=H-R9lyH8*w@ndcq--%!Y$7COwFJ$pK)KRj@H6a<6A1E*saQN!&&eD?T}@j_@L;nG3?t<}#z zQKBd4OZaOPbtU2H2#WaYp+Axd{dxuTGcR9Ka7C32CIdoDf8h?f$Wyua9U@)q>Wqt3 zvW0g*mQ1_o<@7!Bv{=O~#TK#pJ5Z-PMvC%Nteh=mElFO^DuB;P@(R|$<|8M{jd6j(cu?lfWI^|I3%y>4*c)4luD%fJS=nk}ozfx3NslIb=OJ98zM%}jh zRig9`j7*Q>Bc}$%unyq51(|eYTx+*r0 z5x1EHs=|Ulbkzal&I(;iF1AXj7laair@u;6JBjxL`V~Vj`(_RM3Nc~nQ&jVRZF+1kH8dJ{uE>AF1yu5wi z1DdIUu*<;UFpBA}G4oT**zD-E2j0x{G3E4}ud_X-n3;!np=yj7o12=5sV9N;wYA2S zHzBDB2B-a)a(rfN_F7Eo_02Qz0v=zO^)Af1T;x@I&Bw%)1Ect(hgU6mb0VVy24932 zHhSR2Mj{b5cYJ|=X(nFGULsZTZ)wI1IWjnE864h+B=zL+IVHZce7GCw&9Z|=nJ zcuWn|f+~4qDpF@JsdS>AUZ^xwlvKx?Obcw5(V6MlXCHuAc6zT@&t^=}Ycww|$rA=iE1DW@g>9^PY*B 
z(TQ6V;|tJt?wPUM6O7kAJ2rc1%sqe2>-Kt~MR?oFH|CpYzIHoSkPsv~Z1SdO^y&mN zH!;H;fkL8T=FNfev`9U_NK;1h^0C0N(DlcL@-0IZXQ=wgsShr^e__+mym;`5!IZqY zc7FT8hi`+Bfy>1MzjYUVU`Yn|&= zKd0AgAMB0v9pf5~ahhY#u<>}*WPPqy=u}^5D7E&j zOYhj09RbIm%*C~oQvFOvX>y*}a_`ly)J6^Yp_)i#C+Fzo44oV29!`Ed8aZ?RITTp; zTb&_L@y@B`i-C*ZZ;mS|tpy6C)h|uGI~K|fPI0DEPE)!h`vKWY8M`?*}EQ!NfYS3$Y&9)9a^RFf0c=(0D!#zn4Sh%+AkA8LhC z^;`$VT~-IwK~?Bncye`geg9)k_m*Zir`i3B!C#JiJhG`7S{#T%oyHP{xe{!#Bs6Yu-Ke*Sw?5 zm~VPMW=?n@w*)6haC&a>zKn>)v<3A`nd+B#_0)a75J|U(6v>>lAtc^arVMk- zO%yJnmq=2`17=#@JrWDhH)2j;rN4pXB#?2O>xC=n{=5U`!|%!cDHXl`6DEOB9Q-Aw zLc$I6J4L!u29BC7PMQ^EB3-F;6)CrhExQ96{_2HAEs!gTnZv^wZlDL*lqVh;A#DX(lJSh{w5fZt8E17so{LMZz z8_5N|+r*X#4#oj0NPqK9=wr7iKdJiHZR;J?RUEaMz0<_HvHWmUm@4l9dfPzXUVlql&h43*1)!6FAC$7 zZNo9|Z)aQB_6+9`d=h^P+alEhEOkY~RlI_JO2#GR11U9&O6JH=1D4&zwg{t|%y-S` zN{WE3I@Xm0HW27?vGu?cD2wRVvyH&>&oJI^4ASp>M?ta8k~NUMat%6I8O*-~XoC5d zz)3j2ki2ET@A((Xk~yamh;Q4oln!upZi~e~>zVUx@(J{JiFyrv&WwxgWINcdk2J!p zLsPe@2ioJrYiBAznZZ{-@#p~(S9*N6uT3yjD5^$M2gGG2PCJ8+Aybc}0Z2?S?wP*9 zG$P-Gq#1=uFp1{9F(vQ8%O`GKBN#wTH!(9YJ2B@Q_1%uiJ@Y=M1+-n=G371da4Rp7 zA6>%w0%5EUM4UBI49PhTz@;`IT9#rW=72ZP!vzU15fd7-~W=caPyvzYqkdKXz$LOmu`c{m-1l~IF=dZ>TV2lF3;vFBG0RV(= zv6w#9a*)mVC}J>PY`B+oX5k_}@<~wtMOo zbx0L9Y+9NYhsEA`_%$xmVGj>;h9+=>Ud?NdDzs6JBQy#waa3=K0bOyCCzZF64m zM%c{dyE&74ap;LP=U)CweyC)ve*OA`=E$Dooa;DOczn}(V#|7pvz~gQ(Jku&I&gy% zfPlR+d@jA%XNXe;L&hnB**r)K>5#Yv@2If5ja>z}f27&aEH#dHMR#gVD&|6OmJ=xxJ^k zrqi7DOw^VKaDq{{M1xNa&i=OOk*fCf>s(bgXX^$K;lO_^e>n8<=qES0zO$V5oDdtp z2)$7RDQGFJ4${lXO5`OK0D0 z33`HEfp2k|q9rgvZRh`?4ibpYd7aW^x5eR1?1OUlh&EQJk_06OOzH zN))hw0;yXB5sRu909$h^|v1DD9~fR{k#1*<^S zg1P7yyLeLRis6HL0v|MpQr|fS!Uv6_7`RlBaRGc#hvr}g9MHrzC7eqF1^Ao6sci;W z0E})P767AGPe1~G#THT^1|@<7f*!we>PoGP5)BGWz-E437t+WyN9AyjhY?&7FhVmA zA3!_;CnU{Sa9J4*s|I;FafQ*0R_5bOTancd{znlf$90(V2)S6RJ>$Al6YcQ zQaYa+As=umDAt|swj`jJmJDbi1+!q;TA>$qWoYn*J&7lR7E;g+#%qH%Z()-rwZC1| zCQ@TU4sK znO?W3MiRgS+nrV)MY1P)MDe6R4Olf!12rV-Bmy;h(?E?bwnlI-2&nP5&=b2w`AOBQ 
zb=$ZgYp`tQxaeYci$RTT_j?le!S2rlB>a1JC_@{cnrJkMa@#ikv{zvs!aTOcdO;b! z!pP~{q4b?Fu0-Wc14X|4cm@@;H8@cc}Nd^Wq0`MB7nG%o6 zKfu;{O8^4RoSg|gvu*4X=$gERppyQ& z9r8vR6qN=%XJ$au)>3niJf;87H0$qBf;R`Z)s>(9GT(X0Ip=G7y7!A(M)FB{b3pv0 zz?0F-?q~Or*2YoR%l1k>5vo^yA`Y-B=y$TT0oF`OvOh5w)$iM)fwE+dy;3aEwsGf7 zmJ^?YgWF>5(2M>;|dZtlB-B?D-JV*WEKg*--ehu49_D;;%&*@BE0W{n`TU~ONQSPLpGE!*C0a7 zgkDJscFIKPUILtZF%b8sSgN~7mXHLHyp80Kkfh+e?<4yTl7E5Z2S_qNyk(H_Xx$}( zp`Y2`u+e%p#Jvfbg&%xQFu#bDlBgcFb}Oz(B!iBwKs*A_?;yjQA%= zz6-=V1SAu-`;t75_Bqnfz8hl<>{J#u+`I>w<-)7h``=+kg$*}@axvhSxx?m5B77#x z@>F0dI2T^v?2T*JIs2|nQ_JE|)R-H*9(X-;ku%gHE(gG#F{tBS4&7Ti8@$MUeF$Gv z0ULIV5sM3fwu6!Oqg>-rPIDB13#AHn$vo3g8atqh+Wb&;q@ss&^l+Ms17D zH0=aAin0jXIdd&0?t;YmjY@$xvnKw9Asvx4A|h$}Jt2~s6;0Hb6ZD6#bNO|g5zyC2 z=b=xmT;~YaG{PBoF>Xwvo*xsc33i#zV!19&w z%K;hz%;Pkr%L$%^nIW(Ck+wEwa6Bz6UNZgKToQGMK5RR1-vf>++l z*&!o3Zy)_soW=lWAAkXuHwbAcs%#NR!$jqlgcibgIA_bFoR%lmwI7^+|9rHdI$Bf} zURb-s)%I{jJsSgDQD0=>H0L-CsTQ80KbOmE>|dy;{KAz>@7eAs-;N&vWF zW8>7yRMb$M9dF;b!8IM^tcQ50JYOdPmFGS;q;TxxTz=~&4`J(<{y21&Ggok$3c#d| zmjBQ2wUP5k6c%vgqc^s5YPiwMoFEe@j;3C z>xu-v_mTWVu#EBYYkm)TXqWU$hDn)6@m-ByCn_1#G4r}2?w`a>=^9FXYeGQSI$*z~ z&n%?P>PVZGPQy1RSn7wgN5#@W)vN-jhSdPovRa@zRt{9ps(~5;2B1b(2Gqo=_erL}ss-p+v*;wxNr8?H#??;gW9PLoHxEOHv4?)M-yvl>QHB z_*x116A&7jg4GSpr0g_RNwrX3`lgJgUW0LylHZcQM%|=cPX4R}*=+$wBUGM=8`I)hfQ~;h`eq0Ghvzk2?Qq_k-Q*H#U3^kHdoPX2Th-{480D<6X*jEJz2EP9&)v9X-h6 zgZ3p(Sm3;rxp4#iY5tf+8H65CE(5;=7H_VK-C6?FMy zEU+ts`5Vyu-{_s4gTDEni&Wf3*$eJnSh?`l@x?<+hoXAJa#x@$cq2IdNbig&oDsb< zsy0Q`j;Ptb><{?E_OSMm*%i^aB4$_ARJvuVf<@QxNzPQeR?e9kV5@-9w)}eFbu`-4 zA;;R_UkyJP{>9l}e)HpR!a6x_F2H4O`#t@NURuSfS{w$IymR#8u_sy+o(I^{IykK( z)Ox>Xwdelc)xCeOZFr(73zu*j_h(e0R`(A%d2u=X{@RukSHkZz6;)UkS7S&+6_m!c z7}8NiRdGFr3{*vH+=wACp&W5DhAbGeV#r1nRK#;26wjq{i$eqN?vCdnXBW79KjMKMJOqzii+>ot=5HG-fM`Lprn-Q?2B|94y!*f zy>D8({m!qhHDzE*(_`Y#%>~YzycqNLe(nQrL zs-c{9TcyohX)`2P+7hotVI5WG+H$%%r+b}#>}-v@P*|TKYyjc2MyjwX-h{WCq0z!V zuAm|A2KMU?STOWB`viDgF2)~e1x<=RxV8{zjM)buIMe!G1nUa4Zqw#NR z59o&amfdk+9h2^wPGBUu@|~*lVfWpf4n>CRPty_Y=0FU%6yl 
zGA)^xERzb494sz_s7w+of|6LUWG+`snSGdA9U&H1(l%iY#Nw&|i%UUlEy3=>_qUqZ zR2pJyX^9P{3z$Hz4;XeY7JWf`=V_s!hbi{d;XVDSmC29XDC2T*CCG5V+gxz-^hMcsahYI61dYKh{dw*+JP`F+8}VtfPH7jF(eg(@#z znD|Rjap{;3P;(6l8k79`$^#6Ak^CzpA0qi4l8=D6tTB3!Uko^ivY#UP*GRqt1V0Hv zHj7QqjZR)`Yvs*FdX%>`)rq5YF+)O&Y}Ae^&~zld$82CiDccd}Q<3dhR<;i=-hQ-X zb|CXhPDuP=7{9C zKhn2H6zvgxdsLshyf?5nbm{)()yt3cH4#NkL|^k%ZH%Z30b^Ze11v$=WATe#f^6cvBQZyeWzyFhw2lJP3gqT2Z&w^{Bix zo{ww+RZ@EY@akc(I*-H)QQ{C=okb`~w?k1u{U=S8dASzAP4N80<&1 zR_|Kx+IFy5ZEstp?Y@LXs`cyOy$w%O=9Rdwf7>FB@1%jS`hX?Pnw7SCZTQuWG`%lr^-9&A)dp+xk{GC$_hNplL>8fty%FI|17SY6)0@LQey=u{xkRWURt#1;V*tKfkbzK_@SZPO_ar z$DTze+18+w56|IFc@<>Q5$9EyMMs>M1LlOC@+!)rBhCx$XQ_0=d6j^@Dy1X#FG{oM zi1R}GSt=cIUS;s)Na=|4D$k-L&Z{DejySK%EIP^O1NyXzHSAcP>MT0q^1!}2$#lf! zsm-DzE>9f`U!}um1{!%iEty}{{(%O^e7Ax9tFq}9!0YVNp{J2 zYcmX4^T+qlYIP~dDZUKymAnsmCdj;l_ko}^S>QRvfbNIYt5#$ z4vs-(cX`S+^=8`RQC1B&8Lsit^cu5W%b}-FNlnJ1AN?G(f z`%Cy%72$UH)X`^DO~SW=U*cQAFZfpAQK>PcL2zG-Astm(AJ=2ZKo!@-jTkba4`#-Y zg(|L(TQOuqe=7$=yl;geJ3{~Y5CXSK@UjY#b%=eTB9x^2R46GGdr(f4l!;@NqohKt zQ;Cu)u}(EgYQ&yVElTRdevk_#^*bJg7&*2mSm;~gmM%n){=u$!uD->U89YI`?Y9@qB8dr;V$ zwpOtlMSIeG!#))Cr-}BWXdj*CFAmTs9Hgn5JsSg$s{7;niQoWDRqx(tc~sRGKS%_J zQi35OI80NueH$kq)$ENA6Tt`#-M^m86?Hs2f&$(l{07Z?gePH(P(wVy@jl_-QHOet z$>sl9t~_Q`!`G%VGhn=?-^_!Xoam4JRxKV9X#VwT19=sLNls(VU$z*_J$TW8y zZ#m+!b2P1j)iyYGST)332-{Ay-}~&M*?qFNoUn;p8(sD6rYi z&i5s}y4G_ZUg8YH+3xQYzihqYh4FVzf2a7iho?Bh;cVq|uD|;I({!^@_1U**O6z() zMz=rhvRL8Z5Q(GAZ(|QQ5Rvmg)!Vi*j^!{-GtpEjNv}Q!d#pfrUNt)z&xA~ z?B@kA5A%Z04iW)BbSFs5Px2}kU^HjWJ%1RyQiJ=2gO`!P;I#{bH=nSnd6BRVVb&}n zELbW0;4L9+nJ_NO2}?#R4&F+_RteK&HDPN6a{vGU literal 0 HcmV?d00001 diff --git a/model_executor/layers/mamba/ops/__pycache__/ssd_combined.cpython-312.pyc b/model_executor/layers/mamba/ops/__pycache__/ssd_combined.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc8fa0c3cc3655596707b5aa2f6b286f5b5f1ac9 GIT binary patch literal 5410 zcmbtYO>7&-6`m!Rup7g6k@zQp8h}`FmlQ4j 
zd3R~a!V-|$0F@D-QqY5CB!JbM3<;=%9-^GuV~ZB(iZYPJB1Q$YIpn6q>ZO)(Ph``C=m_xM z8h^^d6Dc>(d`yf7kL3a#DT?<2U&{j`=Kl_xb0bFRPEp`Qk;|tAXgKZ*qLeA*fwaqF zMlR&P!qpr98|2Y9B(Oy%3&p-gl56Dyal^k zvRp-=^GF$`SqUlOl(m76I7&D01yYULEJtz$QpN^XXC#m0Ins`X`KkFfg*sC5NUt;=^NH|xoW6!Bq@L3MojgFlUDb)Nq*vr?O^Ju=qb5h67 zNhyx`Q?SRcGp7&uj`O=i@yy|yX{+r7+#=U}RB;`324rV#2Nh4gc4D_yknhO!+{e@xm9LN3L)8F zqm)+OSqsWo0<3m7jJoQhd4flZTWOlNC{1}9&KZt^2geuY1<2yHswI+iLwvTXmhJ(h8Q_!8(URDXw+zn%m?7P@zSO z(s~mC2ZL`O019(sw%bHTDKjguxsoKa69Nn2T}TTe3n3orLW&)(gg)%#MK?0lv z2xRYbVphn5NU<1&wSb zmyvH_q&C16qe~)Hd}DYvFgQ5K7TG*IaGu3vFVHbi<;~N666xRpu3WV zyqrmw3ShDo79`?hE5a#7B0iKN-c8+R%q-%wsM`Qalasn5l`U}c8^a<6ov{--F{4{} zStl;_LB1bU6M1Up~1f>B~8*2dxrgQ{VY-YN=mNz7x=6~vstXK+4Zx=Msx5`mUI zN9#VIt4dAR>rGislB<20JfBGll8Dchh|hpdn+c6!IxN?a(e3$3f#bok_>0fvbRwZU z@@cV9DoVPYH$-$Ne<5L}OV~|dlcZZNU6imz>@h{ecR}w+7zswrRGf&_atxeQi9Z1* zydTo{aHZ$a?iu6`FT0-8T`GA^?Y_1JHQl9kbUsNxPH&QJbC(w`FI~|htxrOaLz`sF z+=aR6CA&uZm#%EmZOiv6G^>)VO0yd6UL1KivUGoWdeyGB#4B`MCF3d`*J#h;xrgVL z?JIUQ)Kj5*RI*2G6-QWedls)fy!OeB z1>%^YZOXYwJ*1YJK5;J-6)LjKJ(+wwxpMEbsnz!?9fR9-CBV7hfra!w?p+!Ftba9H zY3+MPd`W#lZTSB{Z``Y#9c$n=mbmPH5`G+BIr-U%RevRNrqLogsM1lDys38G+=7~p zZaadiBdRg&Uo%~sOxL<~_39e45#8+l@y4A$eema$+HPi~M zAjU)gtDMLd;U{f2TS!7QxMk9+v~CjI}OX+W}3_SU4p@|?`E zI2*tz3>d*mQjRsajOD*?z-s+?o0&`%SzmLxvqSa6wU<5>#ln5a8h9372@WSoG51j@ zFUH|w8*nb3#nce4hHG#O(j6+<0rkHE!$#mVgFZx1GxXCtOdx3A@gdrcqbNk{P`B3g6HGL>Ymu1NHeezH z)Ian9>)iF7e&mj*WY`F*v5*$!4cV+n;z2)t7a25PkrE|2 zla)lAazma8Pa*J_WTtpt<6*?i-ynZ6Qaqh$=)^fuJyF#Cc*s1X)#oE4chqe~PLu?l zP9&;NSc!!0y!HOOw|;&jF>F4t=r$vdG38q;KMzm0y<_l_G|EF&* l2Y>sv>R~ls;APK63vp&=g4`!{fDM%R0=qqz2_j-F<$n);F;V~k literal 0 HcmV?d00001 diff --git a/model_executor/layers/mamba/ops/__pycache__/ssd_state_passing.cpython-312.pyc b/model_executor/layers/mamba/ops/__pycache__/ssd_state_passing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2947522eb9b0cbcf224ab52a6bd327fad058f5a9 GIT binary patch literal 6480 
zcmds5YfN0n6`s3yA9tUy%f8tcFT-MB0l&dGIF5r&Y)1vxacG^@n!CWlvfz6c4Ctb+ zt4M~lDr}<)P%VL_NXD|I5~))78L2;#s+GFe5J9f^LDWi>|6K=5Nux^bnY+7qZL`Kz z`>TUE_ndFe%$YeebLPzcNw3!+C{g(@BEK~w^j9ooOL>Ob`9X%z9HJ3TL{S(1%A&-e ztV=dXc98^+IuJsDGTm1i`kBz>xqV#{|xWK+=&xXsqfU ziuFW#S%Dk~j|-aazLD5Kkc~`)g(4REq{Bfr9*T$Apj0E6o(KEFAzCm#k3qjeVFnr+ ziDw!X?q3hMpfl29ri=%XAfqMdF#!UJu2)`vmLD6;j8R zmvKADWh?%+Ymnh;wk-2?atEdW0qsCj+?<%bc1fCLo+~CaD&zNE$MZ z01d-#3xacUb}Z#sg%Q74Xp*yMgT@3@P8NgZVosOsd(s8{7IOu=vYWFpZ5#=<%vlWT zEm?ljlVeZ$qBR(`2eg#~;451GQ1JkJ5=y^1{mrHEU7>%E;JxcrZ|p#h#XW)^XYV0i z-Pc29)&rAXnw<2-3$ms;Ro2d1XqLEk9{f|1V{Nbbx&FuVE8WN5?*95Od|0-R4LMiB zS+gsg^~hv7=i|x;jv>U1z@E4Wd!n2x%4+?;?hmdU_Q$g`zsKdsm1RA|d9gRRQt4cI zb6T0Q`|H`4b93J8&UvWHo(lqTE>0tzZx=)WW9Fp0IlaWWXNg~4kfU+rr!GClgYpxC zJTw+z861mxv%N#imk);60YNbo?+Y`aq@d&D!(l(ccyJ>PgO8!08D@rhnb05vLO~f~ zLb2YkAde0~xR=F;1XT~P@k5P*Vj|28v4SG4K!(SL1qB-)V#59kL6eEa!*NDX;hO+6 z>U7}7tRW{}V_BPxyI3%HkEG*o5YT8i#^Tltjtqh-CS3)rpiEaW_?$Bk7!iZh83rF% z2CoHU#02}7DZ&I&NyJz%v0`Gw#EuCL>5LN-SRy1Sh9h*aFZ~%5ltVo|Y;X{Um%a#L z9IQZP=EDjF&(8vnO41W_!%TP-j~?G80?{pKV&R*a+Z23`uQ@@Lsb(v{uJ3iq57z?m z!QsGgWVkjSVtT{z+Od&iCyoys3kRZ+>w)3%c;8SgFdB^x1_p=da5NYm3wMveiUgve z@i4;%21A3_LxG`THo&rUdN{%15X(kly>-Lmg5!B8*mIK(4uqLlILf>MW5f<-Pr`;u zpr=;|q%WAcIDK*c#=5Q~adt~*$dR_?hS`dgvxYa-@ba2O`=(q!Ye_k(d1Li*|7yei zSgN&yukGNC9lX3Faeh;-of?__8DDUKmmf&9ZR<_5UA*4=N$EZRUH|gw<%zY@CSKo^ zcze_2njcM7o#4GEc+-i*JDUa8dFxUg@2KYs>J#U_QCYTi&P_v6%38x4YMzjUzINMS zPT3?$A=B52BvRX->X6AiXG_}Vz00Gk*(x;6sgkOB^;%)+MqveCSnjL+-xpdR&PWg`U)?+X!%kgb%amsU;w;q0?kXeq1N@R1VyppUU zK_-#W2a|NV7Htc*CGX1U=j5l{mxtCJ?`}9cct^*U**e#rY~LYePE%6(l){?!1^ZId zQtj`V&n7;dSa-ZRtK7DErgU521Z&bdulU%#WaW*O%dm!rr<7ZiW`>%kX6^Gwmz66; z_jO-UN4BY=l&zYls;9J@lrd$g#t2$DshQGE>3d|f>O*Q$8{f?W_j)4kl|}45o4p>A zS-VHz`4PvVSSp+ruMq8vAe!RP#{{kB)Ic>H0aVM$f$BICsGgGnHB9S(7H~?Sg&cWH z#!)Z}TyK;lRYo$wxL)Z!Xi=0$XV-(EW6qqXG9aO!<+S@Y=R+Op-*$+cyNoxItdZ8`JkBt^8EQZ*aZ^U1_hE0Ou`%*}bd&IxTzdR3D3GNn>F{WzJIwfCKvD|BxPW_%52K$< z1DKb%?%gB|5PT@{eZSKLTWj(S^V3FZ{mqfSQQilsxRCdyyFOY2TXhq^;i_7pH_ty|EB 
z#kt4#33SK0SiDfY)VW-~%<rhSN(SC?J`Dx3wZXI2<2m&VV(X^5CBez(pf;NKiy#ba+fk7yJh1 zI98CdzL7z81b!NLF|lJ>u!fWjMnchSf~3X?mXw&LXC+MtPfZ}9MzG?n>zO*V!UJFh zc`tm^&Nv6kEY4SQ)2i0i^dgyFUK-{q=tgm)DD10$B#>B$T&0UA7f$kWd!lXX%53MR z(K+W$dY4SRu^ciQt$F5+={M#_mu{|TQl92V+UAtJIi+ph)SG9nO<$XDr~9>TWjt|yOH(j&aQfiv%_ZeB30ca@mCvqydhJX9y5@4?j7TA!Go`J7`&(n* zPFBzNI78d3mkn$2@gqTW=LrNKp(IzOIc( zytSn^%s8i=8)_%7cFrGNY*}d8D6Zp+>%LM4wy2UNxQi>lMGm#*Ynxjn;ror*CMw|j z1kwnfsKOkDT-Bl)a~kCEiCWC*P-&y6$D9Ef&0+!O3X#Pn8i5l{$WxJW*C&szguWMx zFk(grYEu=*lM}0eSTJHmC6)K|clFDmbzi+`!;l^M4lGVCOfLUu-5U@c7;++S#p2+? z;PUBpPn}qdAs6zLEOsq)EfaUHiEfN|klVM|zR zFCIm@f<*gw$DZ*&u=wrr2mhHalTnjWPH%#*ZGBPNTHLnRe(j9Z@pj3f>7L z(eNoI2o*SvvR?odNrE7DOo-6`9aTYgDKjRlx4pN#+mvNWIisJ}&vHv0DQndl<=@PK zI#X8v8dd$DLzh$5>NV>D>Q#CroJCE+lE6S0=L+-IC}*j^agXffBuv=tFkr9GB$85=ksYHpg6FvQI3N z#ByY{&s|Mk6)Pn10D@HiLh^!GDT!6M?O1Y5^h;uOw&#PASc65~f=;ZJ#5&~hEnZ!? zDh4F69=W}X=N8V14U*W1Tpk>0#6yyJ7!|u0YZhw6Ba(PLYxjgCHf1LGZ8H{kNF>wk Y*sx#hQguv*%=S%#>8ZYfD3BKV-x0%O;s5{u literal 0 HcmV?d00001 diff --git a/model_executor/layers/mamba/ops/causal_conv1d.py b/model_executor/layers/mamba/ops/causal_conv1d.py new file mode 100644 index 0000000..83c2c5f --- /dev/null +++ b/model_executor/layers/mamba/ops/causal_conv1d.py @@ -0,0 +1,1240 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright (c) 2024, Tri Dao. 
@triton.jit()
def _causal_conv1d_fwd_kernel(  # continuous batching
    # Pointers to matrices
    x_ptr,  # (dim, cu_seqlen) holding `batch` of actual sequences + padded sequences
    w_ptr,  # (dim, width)
    bias_ptr,
    initial_states_ptr,  # conv_states_ptr
    cache_indices_ptr,  # (batch, n_blocks + padding) The second dimension contains
    # the block indices relevant for each sequence
    # plus potential 0-padding at the beginning and at the end
    has_initial_states_ptr,
    query_start_loc_ptr,
    batch_ptr,  # per-program sequence index (read at program_id(0))
    token_chunk_offset_ptr,  # per-program chunk index within that sequence
    block_idx_first_scheduled_token,  # (batch,)
    block_idx_last_scheduled_token,  # (batch,)
    initial_state_idx,  # (batch,)
    num_computed_tokens,  # (batch,)
    o_ptr,  # output, addressed with stride_o_dim / stride_o_token
    # Matrix dimensions
    dim: tl.constexpr,
    seqlen: tl.int32,  # cu_seqlen on entry; overwritten below with the per-sequence length
    num_cache_lines: tl.constexpr,  # added to support vLLM larger cache lines
    # Strides
    stride_x_dim: tl.constexpr,  # stride to get to next feature-value,
    stride_x_token: tl.constexpr,  # stride to get to next token (same feature-index, same sequence-index)
    stride_w_dim: tl.constexpr,  # stride to get to next dim-axis value
    stride_w_width: tl.constexpr,  # stride to get to next width-axis value
    stride_istate_seq: tl.constexpr,
    stride_istate_dim: tl.constexpr,
    stride_istate_token: tl.constexpr,
    stride_cache_indices: tl.constexpr,
    stride_o_dim: tl.constexpr,
    stride_o_token: tl.constexpr,
    stride_block_m: tl.constexpr,  # Stride block to align divided by BLOCK_M
    # others
    pad_slot_id: tl.constexpr,
    # Meta-parameters
    HAS_BIAS: tl.constexpr,
    KERNEL_WIDTH: tl.constexpr,
    SILU_ACTIVATION: tl.constexpr,
    IS_APC_ENABLED: tl.constexpr,
    USE_PAD_SLOT: tl.constexpr,
    NP2_STATELEN: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
):
    # Causal 1-D convolution forward pass over a batch of variable-length
    # sequences that are concatenated along the token axis of `x`.
    #
    # Work decomposition:
    #   * program_id(0) selects an entry of batch_ptr / token_chunk_offset_ptr,
    #     i.e. one chunk of up to BLOCK_M tokens inside one sequence;
    #   * program_id(1) selects a tile of BLOCK_N feature channels.
    #
    # Besides writing the convolution output to o_ptr, the program that owns
    # chunk 0 of a sequence refreshes that sequence's conv-state cache line, and
    # (when IS_APC_ENABLED) programs with chunk_offset > 0 may store additional
    # intermediate states.

    # Local aliases for the conv-state cache arguments.
    conv_states_ptr = initial_states_ptr
    conv_state_indices_ptr = cache_indices_ptr
    stride_conv_state_seq = stride_istate_seq
    stride_conv_state_dim = stride_istate_dim
    stride_conv_state_tok = stride_istate_token
    state_len = (
        KERNEL_WIDTH - 1
    )  # can be passed via argument if it's not the same as this value

    # One program handles one chunk of a single sequence rather than mixing
    # sequences, so that updating initial states across sequences stays efficient.

    # single-sequence id
    idx_seq = tl.load(batch_ptr + tl.program_id(0)).to(tl.int64)
    chunk_offset = tl.load(token_chunk_offset_ptr + tl.program_id(0))

    # BLOCK_N elements along the feature-dimension (channel)
    idx_feats = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)

    # Program slots whose batch_ptr entry equals pad_slot_id carry no work.
    if idx_seq == pad_slot_id:
        return

    sequence_start_index = tl.load(query_start_loc_ptr + idx_seq)
    sequence_end_index = tl.load(query_start_loc_ptr + idx_seq + 1)
    # find the actual sequence length
    seqlen = sequence_end_index - sequence_start_index

    # Alignment block size expressed in tokens.
    B_size: tl.constexpr = stride_block_m * BLOCK_M

    if IS_APC_ENABLED:
        # Handle the case where prefix caching is enabled.
        # In that case the program writes additional cache states to the cache
        # lines listed in "cache_indices_ptr".

        # Get the length of the completed sequence so far and compute the offset.
        current_first_index = tl.load(block_idx_first_scheduled_token + idx_seq)
        current_last_index = tl.load(block_idx_last_scheduled_token + idx_seq)
        sequence_completed_index = tl.load(num_computed_tokens + idx_seq)

        # Compute the offset of the first B_size-aligned full block.
        # Value in "token-space"
        sequence_completed_offset_token = sequence_completed_index % B_size
        seq_completed_offset = B_size - sequence_completed_offset_token
        seq_end_offset = (seqlen - seq_completed_offset) % B_size
        last_full_block_token_index = sequence_end_index - seq_end_offset
        # If the remainder of the sequence is exactly B_size-aligned, the last
        # full block is the second-to-last one.
        if seq_end_offset == 0:
            last_full_block_token_index = last_full_block_token_index - B_size

        # Get the number of blocks to be filled for the current sequence
        # If n_block_to_fill = 0, then only the state at the sequence end is stored
        n_block_to_fill = current_last_index - current_first_index

        # Get the index of the init block
        conv_state_init_index = tl.load(initial_state_idx + idx_seq)
    else:
        n_block_to_fill = 0
        current_last_index = 0
        conv_state_init_index = 0
        current_first_index = 0
        last_full_block_token_index = 0

    # First token (relative to the sequence start) handled by this program and
    # the number of tokens it actually owns (the final chunk may be short).
    token_offset = BLOCK_M * chunk_offset
    segment_len = min(BLOCK_M, seqlen - token_offset)

    # base of the sequence
    x_base = (
        x_ptr + sequence_start_index * stride_x_token + idx_feats * stride_x_dim
    )  # [BLOCK_N,]

    # cache_idx: cache line holding this sequence's initial state
    conv_states_input_coord = tl.load(
        conv_state_indices_ptr + idx_seq * stride_cache_indices + conv_state_init_index
    ).to(tl.int64)

    if USE_PAD_SLOT:  # noqa
        if conv_states_input_coord == pad_slot_id:
            # not processing as this is not the actual sequence
            return
    conv_states_base = (
        conv_states_ptr
        + (conv_states_input_coord * stride_conv_state_seq)
        + (idx_feats * stride_conv_state_dim)
    )  # [BLOCK_N,]

    w_base = w_ptr + (idx_feats * stride_w_dim)  # [BLOCK_N,]

    # Does 2 things:
    # 1. READ the prior tokens (col0..colK) needed by the first output of this
    #    chunk - done by every Triton program
    # 2. update conv_state with new data - only by the Triton program that
    #    handles chunk_offset=0
    if chunk_offset == 0:
        # read from conv_states
        load_init_state = tl.load(has_initial_states_ptr + idx_seq).to(tl.int1)
        if load_init_state:
            # load from conv_states, starting at the most recent cached token
            # (index state_len - 1) and walking backwards
            prior_tokens = conv_states_base + (state_len - 1) * stride_conv_state_tok
            mask_w = idx_feats < dim
            if KERNEL_WIDTH == 2:
                conv_states_ptrs = prior_tokens  # [BLOCK_N]
                col0 = tl.load(conv_states_ptrs, mask_w, 0.0)
            if KERNEL_WIDTH == 3:
                conv_states_ptrs = prior_tokens  # [BLOCK_N]
                col1 = tl.load(conv_states_ptrs, mask_w, 0.0)
                conv_states_ptrs = prior_tokens - 1 * stride_conv_state_tok  # [BLOCK_N]
                col0 = tl.load(conv_states_ptrs, mask_w, 0.0)
            if KERNEL_WIDTH == 4:
                conv_states_ptrs = prior_tokens  # [BLOCK_N]
                col2 = tl.load(conv_states_ptrs, mask_w, 0.0)
                conv_states_ptrs = prior_tokens - 1 * stride_conv_state_tok  # [BLOCK_N]
                col1 = tl.load(conv_states_ptrs, mask_w, 0.0)
                conv_states_ptrs = prior_tokens - 2 * stride_conv_state_tok  # [BLOCK_N]
                col0 = tl.load(conv_states_ptrs, mask_w, 0.0)
            if KERNEL_WIDTH == 5:
                # NOTE(review): col3 is loaded here, but the weight preload and
                # the accumulation loop further down only have branches for
                # KERNEL_WIDTH 2..4 - confirm whether width 5 is supported.
                conv_states_ptrs = prior_tokens  # [BLOCK_N]
                col3 = tl.load(conv_states_ptrs, mask_w, 0.0)
                conv_states_ptrs = prior_tokens - 1 * stride_conv_state_tok  # [BLOCK_N]
                col2 = tl.load(conv_states_ptrs, mask_w, 0.0)
                conv_states_ptrs = prior_tokens - 2 * stride_conv_state_tok  # [BLOCK_N]
                col1 = tl.load(conv_states_ptrs, mask_w, 0.0)
                conv_states_ptrs = prior_tokens - 3 * stride_conv_state_tok  # [BLOCK_N]
                col0 = tl.load(conv_states_ptrs, mask_w, 0.0)
        else:
            # prior-tokens are zeros
            if KERNEL_WIDTH >= 2:  # STRATEGY1
                # first chunk and does not have prior-token, so just set to 0
                col0 = tl.zeros((BLOCK_N,), dtype=x_ptr.dtype.element_ty)
            if KERNEL_WIDTH >= 3:  # STRATEGY1
                col1 = tl.zeros((BLOCK_N,), dtype=x_ptr.dtype.element_ty)
            if KERNEL_WIDTH >= 4:  # STRATEGY1
                col2 = tl.zeros((BLOCK_N,), dtype=x_ptr.dtype.element_ty)
            if KERNEL_WIDTH >= 5:  # STRATEGY1
                col3 = tl.zeros((BLOCK_N,), dtype=x_ptr.dtype.element_ty)

        # STEP 2:
        # here prepare data for updating conv_state
        if (
            state_len <= seqlen
        ):  # SMALL_CACHE=True (only move part of 'x' into conv_state cache)
            # The sequence is at least state_len tokens long, so the new state
            # is simply the last state_len tokens of 'x'.
            idx_tokens_last = (seqlen - state_len) + tl.arange(
                0, NP2_STATELEN
            )  # [NP2_STATELEN]
            x_ptrs = (
                x_ptr
                + ((sequence_start_index + idx_tokens_last) * stride_x_token)[:, None]
                + (idx_feats * stride_x_dim)[None, :]
            )  # [NP2_STATELEN, BLOCK_N]
            mask_x = (
                (idx_tokens_last >= 0)[:, None]
                & (idx_tokens_last < seqlen)[:, None]
                & (idx_feats < dim)[None, :]
            )  # token-index  # token-index  # feature-index
            loaded_x = tl.load(x_ptrs, mask_x, 0.0)
            idx_tokens_conv = tl.arange(0, NP2_STATELEN)  # [NP2_STATELEN]

            # Compute the offset where the last block should be written in the conv_states
            conv_states_output_coord = tl.load(
                conv_state_indices_ptr
                + idx_seq * stride_cache_indices
                + current_last_index
            ).to(tl.int64)

            conv_states_ptrs_target = (
                conv_states_ptr
                + (conv_states_output_coord * stride_conv_state_seq)  # Offset from seq
                + (idx_feats * stride_conv_state_dim)
            )[None, :] + (  # [BLOCK_N,]
                idx_tokens_conv * stride_conv_state_tok
            )[:, None]

            mask = (idx_tokens_conv < state_len)[:, None] & (idx_feats < dim)[None, :]
            tl.debug_barrier()  # NOTE: use this due to bug in Triton compiler
            tl.store(conv_states_ptrs_target, loaded_x, mask)

        else:
            if load_init_state:
                # update conv_state by shifting left, i.e. take last few cols from conv_state + cols from 'x'
                idx_tokens_conv = tl.arange(0, NP2_STATELEN)  # [NP2_STATELEN]

                conv_states_ptrs_source = (
                    conv_states_ptr
                    + (conv_states_input_coord * stride_conv_state_seq)
                    + (idx_feats * stride_conv_state_dim)[None, :]
                    + ((idx_tokens_conv + seqlen) * stride_conv_state_tok)[:, None]
                )  # [NP2_STATELEN, BLOCK_N]
                mask = (
                    (conv_states_input_coord < num_cache_lines)
                    & ((idx_tokens_conv + seqlen) < state_len)[:, None]
                    & (idx_feats < dim)[None, :]
                )
                conv_state = tl.load(conv_states_ptrs_source, mask, other=0.0)

                # Number of old state columns that survive the shift.
                VAL = state_len - seqlen

                x_ptrs = (
                    x_base[None, :]
                    + ((idx_tokens_conv - VAL) * stride_x_token)[:, None]
                )  # [NP2_STATELEN, BLOCK_N]

                mask_x = (
                    (idx_tokens_conv - VAL >= 0)[:, None]
                    & (idx_tokens_conv - VAL < seqlen)[:, None]
                    & (idx_feats < dim)[None, :]
                )  # token-index  # token-index  # feature-index
                loaded_x = tl.load(x_ptrs, mask_x, 0.0)

                tl.debug_barrier()  # need this due to the bug in tl.where not enforcing this when data is the result of another tl.load
                new_conv_state = tl.where(
                    mask, conv_state, loaded_x
                )  # BUG in 'tl.where' which requires a barrier before this
                conv_states_ptrs_target = (
                    conv_states_base
                    + (idx_tokens_conv * stride_conv_state_tok)[:, None]
                )  # [NP2_STATELEN, BLOCK_N]
                mask = (idx_tokens_conv < state_len)[:, None] & (idx_feats < dim)[
                    None, :
                ]
                tl.store(conv_states_ptrs_target, new_conv_state, mask)
            else:  # load_init_state == False
                # update conv_state by shifting left, BUT
                # set cols prior to 'x' as zeros + cols from 'x'
                idx_tokens_conv = tl.arange(0, NP2_STATELEN)  # [NP2_STATELEN]

                VAL = state_len - seqlen

                x_ptrs = (
                    x_base[None, :]
                    + ((idx_tokens_conv - VAL) * stride_x_token)[:, None]
                )  # [NP2_STATELEN, BLOCK_N]

                mask_x = (
                    (idx_tokens_conv - VAL >= 0)[:, None]
                    & (idx_tokens_conv - VAL < seqlen)[:, None]
                    & (idx_feats < dim)[None, :]
                )  # token-index  # token-index  # feature-index
                new_conv_state = tl.load(x_ptrs, mask_x, 0.0)

                conv_states_ptrs_target = (
                    conv_states_base
                    + (idx_tokens_conv * stride_conv_state_tok)[:, None]
                )  # [NP2_STATELEN, BLOCK_N]
                mask = (idx_tokens_conv < state_len)[:, None] & (idx_feats < dim)[
                    None, :
                ]
                tl.store(conv_states_ptrs_target, new_conv_state, mask)

    else:  # chunk_offset > 0
        # read prior-token data from `x` (the tokens just before this chunk)
        load_init_state = True
        prior_tokens = x_base + (token_offset - 1) * stride_x_token
        mask_w = idx_feats < dim
        if KERNEL_WIDTH == 2:
            conv_states_ptrs = prior_tokens  # [BLOCK_N]
            col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
        if KERNEL_WIDTH == 3:
            conv_states_ptrs = prior_tokens  # [BLOCK_N]
            col1 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
            conv_states_ptrs = prior_tokens - 1 * stride_x_token  # [BLOCK_N]
            col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
        if KERNEL_WIDTH == 4:
            conv_states_ptrs = prior_tokens  # [BLOCK_N]
            col2 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
            conv_states_ptrs = prior_tokens - 1 * stride_x_token  # [BLOCK_N]
            col1 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
            conv_states_ptrs = prior_tokens - 2 * stride_x_token  # [BLOCK_N]
            col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
        if KERNEL_WIDTH == 5:
            # ruff: noqa: F841
            conv_states_ptrs = prior_tokens  # [BLOCK_N]
            col3 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
            conv_states_ptrs = prior_tokens - 1 * stride_x_token  # [BLOCK_N]
            col2 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
            conv_states_ptrs = prior_tokens - 2 * stride_x_token  # [BLOCK_N]
            col1 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
            conv_states_ptrs = prior_tokens - 3 * stride_x_token  # [BLOCK_N]
            col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")

        # Store intermediate states aligned with stride_block_m
        # The additional states are cached starting from the last stride_block_m.
        # For example:
        # If n_block_to_fill = 0, then only the state at the sequence end is cached and the process below is not involved.
        # If n_block_to_fill > 0, then the states at the sequence end and at the n_block_to_fill-last
        # stride_block_m are cached.
        # For example chunk_offset = n_block_to_fill stores the state at last_full_block
        if (chunk_offset - 1) < n_block_to_fill:
            # Store the states at the chunk boundaries from the start of the sequence
            idx_tokens_last = (
                last_full_block_token_index
                - (n_block_to_fill - chunk_offset) * B_size
                - state_len
            ) + tl.arange(0, NP2_STATELEN)  # [NP2_STATELEN]
            x_ptrs = (
                x_ptr
                + (idx_tokens_last * stride_x_token)[:, None]
                + (idx_feats * stride_x_dim)[None, :]
            )  # [NP2_STATELEN, BLOCK_N]

            mask_x = (idx_tokens_last >= 0)[:, None] & (idx_feats < dim)[
                None, :
            ]  # token-index  # token-index  # feature-index
            loaded_x = tl.load(x_ptrs, mask_x, 0.0)
            idx_tokens_conv = tl.arange(0, NP2_STATELEN)  # [NP2_STATELEN]

            # cache_idx: cache line receiving this intermediate state
            conv_states_output_coord = tl.load(
                conv_state_indices_ptr
                + idx_seq * stride_cache_indices
                + current_first_index
                + (chunk_offset - 1)
            ).to(tl.int64)

            conv_states_ptrs_target = (
                conv_states_ptr
                + (conv_states_output_coord * stride_conv_state_seq)  # Offset from seq
                + (idx_feats * stride_conv_state_dim)
            )[None, :] + (  # [BLOCK_N,]
                idx_tokens_conv * stride_conv_state_tok
            )[:, None]

            mask = (idx_tokens_conv < state_len)[:, None] & (idx_feats < dim)[None, :]
            tl.debug_barrier()  # NOTE: use this due to bug in Triton compiler
            tl.store(conv_states_ptrs_target, loaded_x, mask)

    # The accumulator starts from the bias (in float32) or from zero.
    if HAS_BIAS:
        bias = bias_ptr + idx_feats
        mask_bias = idx_feats < dim
        acc_preload = tl.load(bias, mask=mask_bias, other=0.0).to(
            tl.float32
        )  # [BLOCK_N]
    else:
        acc_preload = tl.zeros((BLOCK_N,), dtype=tl.float32)

    x_base_1d = x_base + token_offset * stride_x_token  # starting of chunk

    # PRE-LOAD WEIGHTS (one [BLOCK_N] column per kernel tap)
    mask_w = idx_feats < dim
    if KERNEL_WIDTH >= 2:
        w_ptrs = w_base + (0 * stride_w_width)  # [BLOCK_N] tensor
        w_col0 = tl.load(w_ptrs, mask_w, other=0.0)
        w_ptrs = w_base + (1 * stride_w_width)  # [BLOCK_N] tensor
        w_col1 = tl.load(w_ptrs, mask_w, other=0.0)
    if KERNEL_WIDTH >= 3:
        w_ptrs = w_base + (2 * stride_w_width)  # [BLOCK_N] tensor
        w_col2 = tl.load(w_ptrs, mask_w, other=0.0)
    if KERNEL_WIDTH >= 4:
        w_ptrs = w_base + (3 * stride_w_width)  # [BLOCK_N] tensor
        w_col3 = tl.load(w_ptrs, mask_w, other=0.0)
    mask_x_1d = idx_feats < dim
    # Walk the chunk token by token; col0..colK act as a sliding window over the
    # previous KERNEL_WIDTH - 1 inputs, and the last tap loads the current token.
    for idx_token in range(segment_len):
        acc = acc_preload

        matrix_w = w_col0
        matrix_x = col0
        for j in tl.static_range(KERNEL_WIDTH):
            if KERNEL_WIDTH == 2:
                if j == 1:  # KERNEL_WIDTH-1:
                    matrix_w = w_col1
                    x_ptrs_1d = x_base_1d + idx_token * stride_x_token  # [BLOCK_N]
                    matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)
            elif KERNEL_WIDTH == 3:
                if j == 1:
                    matrix_w = w_col1
                    matrix_x = col1
                elif j == 2:
                    matrix_w = w_col2
                    x_ptrs_1d = x_base_1d + idx_token * stride_x_token  # [BLOCK_N]
                    matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)
            elif KERNEL_WIDTH == 4:
                if j == 1:
                    matrix_w = w_col1
                    matrix_x = col1
                elif j == 2:
                    matrix_w = w_col2
                    matrix_x = col2
                elif j == 3:
                    matrix_w = w_col3
                    x_ptrs_1d = x_base_1d + idx_token * stride_x_token  # [BLOCK_N]
                    matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)

            acc += matrix_x * matrix_w  # [BLOCK_N]

        # Slide the window: drop the oldest column, append the current token
        # (matrix_x holds the current token after the last tap).
        if KERNEL_WIDTH == 2:
            col0 = matrix_x
        elif KERNEL_WIDTH == 3:
            col0 = col1
            col1 = matrix_x
        elif KERNEL_WIDTH == 4:
            col0 = col1
            col1 = col2
            col2 = matrix_x

        if SILU_ACTIVATION:
            # SiLU / swish: acc * sigmoid(acc)
            acc = acc / (1 + tl.exp(-acc))
        mask_1d = (idx_token < segment_len) & (
            idx_feats < dim
        )  # token-index  # feature-index
        o_ptrs = (
            o_ptr
            + (sequence_start_index + token_offset + idx_token) * stride_o_token
            + (idx_feats * stride_o_dim)
        )

        tl.store(o_ptrs, acc, mask=mask_1d)
def causal_conv1d_fn(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor | None,
    conv_states: torch.Tensor,
    query_start_loc: torch.Tensor,
    cache_indices: torch.Tensor | None = None,
    has_initial_state: torch.Tensor | None = None,
    activation: str | None = "silu",
    pad_slot_id: int = PAD_SLOT_ID,
    block_idx_first_scheduled_token: torch.Tensor | None = None,
    block_idx_last_scheduled_token: torch.Tensor | None = None,
    initial_state_idx: torch.Tensor | None = None,
    num_computed_tokens: torch.Tensor | None = None,
    block_size_to_align: int | None = 0,
    metadata=None,
    validate_data: bool = False,
) -> torch.Tensor:
    """Run the varlen causal conv1d forward kernel (continuous batching).

    All sequences of the batch are concatenated along the token axis of the
    2-D input ``x``. The convolution state of every sequence is read from and
    written back to ``conv_states`` in place.

    Args:
        x: ``(dim, cu_seq_len)`` where ``cu_seq_len`` is the total number of
            tokens of all sequences, concatenated left to right. It is cast to
            ``conv_states.dtype`` for the computation.
        weight: ``(dim, width)`` convolution weights.
        bias: ``(dim,)`` or ``None``.
        conv_states: ``(num_cache_lines, dim, >= width - 1)`` state cache with
            unit stride along ``dim``; updated in place. The line used for
            sequence ``i`` is selected through ``cache_indices``.
        query_start_loc: ``(batch + 1,)`` cumulative sequence lengths, starting
            with 0. For sequence lengths ``[5, 1, 1, 1]`` this is
            ``[0, 5, 6, 7, 8]``.
        cache_indices: per-sequence index into ``conv_states``
            (``conv_states[cache_indices[i]]`` is the state of sequence ``i``).
            Entries equal to ``pad_slot_id`` are skipped by the kernel.
            NOTE(review): the kernel reads this for every sequence, so ``None``
            looks unsupported in practice - confirm with callers.
        has_initial_state: ``(batch,)`` bool; whether the cached state is used
            as the initial state of the corresponding sequence.
            NOTE(review): the kernel reads it for the first chunk of every
            sequence, so ``None`` looks unsupported in practice - confirm.
        activation: ``None``, ``"silu"``, ``"swish"`` or ``True`` (same as
            ``"silu"``). Any other value results in no activation.
        pad_slot_id: sentinel marking padded entries that are not processed.
        block_idx_first_scheduled_token: ``(batch,)`` position inside
            ``cache_indices`` of the first cache block to be filled.
        block_idx_last_scheduled_token: ``(batch,)`` position inside
            ``cache_indices`` of the last cache block to be filled. Passing a
            tensor here enables the prefix-caching path of the kernel.
        initial_state_idx: ``(batch,)`` position inside ``cache_indices`` of
            the cache block holding the initial state.
        num_computed_tokens: ``(batch,)`` number of tokens already processed
            for each sequence.
        block_size_to_align: block size (in tokens) cached states are aligned
            to; must be a multiple of the kernel's ``BLOCK_M`` (8). ``None`` or
            a non-positive value falls back to ``BLOCK_M``.
        metadata: optional object providing precomputed ``nums_dict``,
            ``batch_ptr`` and ``token_chunk_offset_ptr``; when ``None`` the
            program layout is derived from ``query_start_loc`` on the host.
        validate_data: run additional shape/layout assertions.

    Returns:
        Tensor with the shape of ``x`` and the dtype ``x`` had on entry.
    """
    # `True` historically means "silu"; `False` behaves like no activation.
    if isinstance(activation, bool):
        activation = "silu" if activation else None

    if validate_data:
        # Checked first: everything below already relies on these properties.
        assert x.dim() == 2
        assert query_start_loc is not None
        assert query_start_loc.dim() == 1
        assert activation in (None, "silu", "swish"), (
            "activation must be None, 'silu' or 'swish'"
        )

    # Compute in the cache dtype; the result is cast back before returning.
    original_x_dtype = x.dtype
    x = x.to(conv_states.dtype)
    out = torch.empty_like(x)

    if metadata is not None:
        args = metadata.nums_dict
        batch_ptr = metadata.batch_ptr
        token_chunk_offset_ptr = metadata.token_chunk_offset_ptr
    else:
        # Per-sequence lengths on the host, used to lay out the programs.
        args = query_start_loc.diff().to("cpu")
        MAX_NUM_PROGRAMS = 1024

        batch_ptr = torch.full(
            (MAX_NUM_PROGRAMS,), PAD_SLOT_ID, dtype=torch.int32, device=x.device
        )  # tracking which seq-idx the Triton program is handling
        token_chunk_offset_ptr = torch.full(
            (MAX_NUM_PROGRAMS,), PAD_SLOT_ID, dtype=torch.int32, device=x.device
        )  # tracking BLOCK_M-based index in the sequence the Triton program is handling

    dim, cu_seqlen = x.shape
    _, width = weight.shape
    state_len = width - 1
    np2_statelen = triton.next_power_of_2(state_len)
    padded_batch = query_start_loc.size(0) - 1
    BLOCK_M = 8

    stride_x_dim = x.stride(0)
    stride_x_token = x.stride(1)
    stride_w_dim = weight.stride(0)
    stride_w_width = weight.stride(1)

    # extensions to support vLLM:
    # 1. conv_states is used to replace initial_states
    # 2. conv_states serves as a cache whose number of lines can be larger
    #    than the batch size
    # 3. sequence x[idx] maps to the cache line given by cache_indices[idx]
    # 4. computation can be skipped if cache_indices[idx] == pad_slot_id
    num_cache_lines = conv_states.size(0)
    assert dim == conv_states.shape[1] and width - 1 <= conv_states.shape[2]
    stride_istate_seq = conv_states.stride(0)
    stride_istate_dim = conv_states.stride(1)
    stride_istate_token = conv_states.stride(2)
    assert stride_istate_dim == 1

    # `out` has the same 2-D layout as `x`.
    stride_o_dim = out.stride(0)
    stride_o_token = out.stride(1)
    stride_cache_indices = cache_indices.stride(0) if cache_indices is not None else 0

    if validate_data:
        assert x.stride(0) == 1 or x.stride(1) == 1
        if bias is not None:
            assert bias.dim() == 1
            assert dim == bias.size(0)
        if cache_indices is not None:
            assert cache_indices.dim() == 1
            assert padded_batch == cache_indices.size(0)
        if has_initial_state is not None:
            assert has_initial_state.size() == (padded_batch,)
        assert weight.stride(1) == 1
        assert (dim, width) == weight.shape
        is_channel_last = x.stride(0) == 1 and x.stride(1) > 1
        assert is_channel_last, "Need to run in channel-last layout"

    if block_size_to_align is not None and block_size_to_align > 0:
        assert (block_size_to_align % BLOCK_M) == 0, (
            "The mamba block size needs to be divisible by the BLOCK_M"
        )
    else:
        block_size_to_align = BLOCK_M

    if metadata is None:

        def num_program(META, seqlens):
            # Number of BLOCK_M-sized chunks per sequence (ceil division).
            chunks_per_seq = (-(-seqlens // META["BLOCK_M"])).numpy()
            tot = int(chunks_per_seq.sum())

            # One program per chunk: the owning sequence index ...
            seq_ids = np.repeat(np.arange(len(chunks_per_seq)), chunks_per_seq)
            # ... and the chunk index within that sequence (0, 1, ... per seq).
            first_chunk = np.cumsum(chunks_per_seq) - chunks_per_seq
            chunk_ids = np.arange(tot) - np.repeat(first_chunk, chunks_per_seq)

            if META["batch_ptr"].nelement() < tot:
                # Grow the scheduling buffers; unused tail stays PAD_SLOT_ID.
                newlen = tot + 1
                META["batch_ptr"].resize_(newlen).fill_(PAD_SLOT_ID)
                META["token_chunk_offset_ptr"].resize_(newlen).fill_(PAD_SLOT_ID)

            META["batch_ptr"][0:tot].copy_(torch.from_numpy(seq_ids))
            META["token_chunk_offset_ptr"][0:tot].copy_(torch.from_numpy(chunk_ids))

            META["batch_ptr"] = META["batch_ptr"].to(META["x_ptr"].device)
            META["token_chunk_offset_ptr"] = META["token_chunk_offset_ptr"].to(
                META["x_ptr"].device
            )
            return tot
    else:

        def num_program(META, nums_dict):
            # Precomputed layout for the BLOCK_M this launch uses.
            entry = nums_dict[META["BLOCK_M"]]

            if entry["batch_ptr"] is not None:
                META["batch_ptr"] = entry["batch_ptr"]
                META["token_chunk_offset_ptr"] = entry["token_chunk_offset_ptr"]
            else:
                mlist_len = entry["mlist_len"]
                if META["batch_ptr"].nelement() < mlist_len:
                    newlen = mlist_len + 1
                    META["batch_ptr"].resize_(newlen).fill_(PAD_SLOT_ID)
                    META["token_chunk_offset_ptr"].resize_(newlen).fill_(PAD_SLOT_ID)

                META["batch_ptr"][0:mlist_len].copy_(entry["mlist"])
                META["token_chunk_offset_ptr"][0:mlist_len].copy_(entry["offsetlist"])
            return entry["tot"]

    def grid(META):
        return (
            num_program(META, args),
            triton.cdiv(dim, META["BLOCK_N"]),
        )

    if batch_ptr.device != x.device:
        batch_ptr = batch_ptr.to(x.device)
        token_chunk_offset_ptr = token_chunk_offset_ptr.to(x.device)

    _causal_conv1d_fwd_kernel[grid](
        # Pointers to matrices
        x,
        weight,
        bias,
        conv_states,
        cache_indices,
        has_initial_state,
        query_start_loc,
        batch_ptr,
        token_chunk_offset_ptr,
        block_idx_first_scheduled_token,
        block_idx_last_scheduled_token,
        initial_state_idx,
        num_computed_tokens,
        out,
        # Matrix dimensions
        dim,
        cu_seqlen,
        num_cache_lines,
        # stride
        stride_x_dim,
        stride_x_token,
        stride_w_dim,
        stride_w_width,
        stride_istate_seq,
        stride_istate_dim,
        stride_istate_token,
        stride_cache_indices,
        stride_o_dim,
        stride_o_token,
        block_size_to_align // BLOCK_M,
        # others
        pad_slot_id,
        # META
        HAS_BIAS=bias is not None,
        KERNEL_WIDTH=width,
        SILU_ACTIVATION=activation in ["silu", "swish"],
        IS_APC_ENABLED=block_idx_last_scheduled_token is not None,
        USE_PAD_SLOT=pad_slot_id is not None,
        NP2_STATELEN=np2_statelen,
        # launch_cooperative_grid=True
        BLOCK_M=BLOCK_M,
        BLOCK_N=256,
        num_stages=2,
    )
    return out.to(original_x_dtype)
[BLOCK_N,] elements along the feature-dimension (channel) + idx_feats = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N) + + if IS_APC_ENABLED: + # Get the state from the initial_state_idx + conv_state_init = tl.load(initial_state_idx + idx_seq) + current_last_index = tl.load(block_idx_last_scheduled_token + idx_seq) + else: + conv_state_init = 0 + current_last_index = 0 + + # cache_idx + conv_states_input_coord = tl.load( + conv_state_indices_ptr + idx_seq * stride_state_indices + conv_state_init + ).to(tl.int64) + + if USE_PAD_SLOT: # noqa + if conv_states_input_coord == pad_slot_id: + # not processing as this is not the actual sequence + return + + if IS_VARLEN: + query_start_index = tl.load(query_start_loc_ptr + idx_seq).to(tl.int64) + query_end_index = tl.load(query_start_loc_ptr + (idx_seq + 1)).to(tl.int64) + # revise state_len and seqlen + state_len = state_len - (seqlen - (query_end_index - query_start_index)) + seqlen = query_end_index - query_start_index + x_offset = query_start_index * stride_x_token + o_offset = query_start_index * stride_o_token + else: + query_start_index = idx_seq * seqlen + query_end_index = query_start_index + seqlen + x_offset = idx_seq * stride_x_seq + o_offset = idx_seq * stride_o_seq + + if query_start_index == query_end_index: + return + + if IS_SPEC_DECODING: + # The rolling of conv state: + # + # Before forward, the conv_state is: + # [history1, history2, ..., historyM]. + # + # After forward, the conv_state becomes: + # [history2, ..., historyM, draft1, draft2, ..., draftN]. + # + # After acceptance, it becomes: + # + # - accept 1 tokens: [history2, ..., historyM, draft1] + # - accept 2 tokens: [history3, ..., historyM, draft1, draft2] + # - and so on. 
+ conv_state_token_offset = ( + tl.load(num_accepted_tokens_ptr + idx_seq).to(tl.int64) - 1 + ) + else: + conv_state_token_offset = 0 + + # STEP 1: READ init_state data + conv_states_base = ( + conv_state_ptr + + (conv_states_input_coord * stride_conv_state_seq) + + (idx_feats * stride_conv_state_dim) + ) + mask_w = idx_feats < dim + + prior_tokens = conv_states_base + conv_state_token_offset * stride_conv_state_tok + if KERNEL_WIDTH >= 2: + conv_states_ptrs = prior_tokens # [BLOCK_N] + col0 = tl.load(conv_states_ptrs, mask_w, 0.0) + if KERNEL_WIDTH >= 3: + conv_states_ptrs = prior_tokens + 1 * stride_conv_state_tok # [BLOCK_N] + col1 = tl.load(conv_states_ptrs, mask_w, 0.0) + if KERNEL_WIDTH >= 4: + conv_states_ptrs = prior_tokens + 2 * stride_conv_state_tok # [BLOCK_N] + col2 = tl.load(conv_states_ptrs, mask_w, 0.0) + if KERNEL_WIDTH >= 5: + conv_states_ptrs = prior_tokens + 3 * stride_conv_state_tok # [BLOCK_N] + col3 = tl.load(conv_states_ptrs, mask_w, 0.0) + if KERNEL_WIDTH >= 6: + conv_states_ptrs = prior_tokens + 4 * stride_conv_state_tok # [BLOCK_N] + col4 = tl.load(conv_states_ptrs, mask_w, 0.0) + + # STEP 2: assume state_len > seqlen + idx_tokens = tl.arange(0, NP2_STATELEN) # [BLOCK_M] + + # With speculative decoding, the conv_state updates works in a sliding + # window manner, at each forward pass, the tokens are shift by 1, so we + # load since idx_tokens + 1. 
+ conv_state_ptrs_source = ( + conv_state_ptr + + (conv_states_input_coord * stride_conv_state_seq) + + conv_state_token_offset * stride_conv_state_tok + + (idx_feats * stride_conv_state_dim)[None, :] + + ((idx_tokens + (1 if IS_SPEC_DECODING else seqlen)) * stride_conv_state_tok)[ + :, None + ] + ) # [BLOCK_M, BLOCK_N] + mask = ( + (conv_states_input_coord < num_cache_lines) + & ((idx_tokens + seqlen) < state_len)[:, None] + & (idx_feats < dim)[None, :] + ) + conv_state = tl.load(conv_state_ptrs_source, mask, other=0.0) + + VAL = state_len - seqlen + x_base = x_ptr + x_offset + (idx_feats * stride_x_dim) # [BLOCK_N] + + x_ptrs = ( + x_base[None, :] + ((idx_tokens - VAL) * stride_x_token)[:, None] + ) # [BLOCK_M, BLOCK_N] + + mask_x = ( + (idx_tokens - VAL >= 0)[:, None] + & (idx_tokens - VAL < seqlen)[:, None] + & (idx_feats < dim)[None, :] + ) # token-index # token-index # feature-index + loaded_x = tl.load(x_ptrs, mask_x, 0.0) + tl.debug_barrier() + + new_conv_state = tl.where(mask, conv_state, loaded_x) + + # Get the state from the initial_state_idx + # cache_idx + conv_states_offset = tl.load( + conv_state_indices_ptr + idx_seq * stride_state_indices + current_last_index + ).to(tl.int64) + conv_state_ptrs_target = ( + conv_state_ptr + + (conv_states_offset * stride_conv_state_seq) # Offset from seq + + (idx_feats * stride_conv_state_dim) + )[None, :] + ( # [BLOCK_N,] + idx_tokens * stride_conv_state_tok + )[:, None] + mask = (idx_tokens < state_len)[:, None] & (idx_feats < dim)[None, :] + tl.store(conv_state_ptrs_target, new_conv_state, mask) + + # STEP 3: init accumulator + if HAS_BIAS: + bias = bias_ptr + idx_feats + mask_bias = idx_feats < dim + acc_preload = tl.load(bias, mask=mask_bias, other=0.0).to( + tl.float32 + ) # [BLOCK_N] + else: + acc_preload = tl.zeros((BLOCK_N,), dtype=tl.float32) + + # STEP 4: + # PRE-LOAD WEIGHTS + # first kernel column, configured for weights to handle BLOCK_N features in range + w_base = w_ptr + (idx_feats * stride_w_dim) # 
[BLOCK_N,] + mask_w = idx_feats < dim + if KERNEL_WIDTH >= 2: + w_ptrs = w_base + (0 * stride_w_width) # [BLOCK_N] tensor + w_col0 = tl.load(w_ptrs, mask_w, other=0.0) + w_ptrs = w_base + (1 * stride_w_width) # [BLOCK_N] tensor + w_col1 = tl.load(w_ptrs, mask_w, other=0.0) + if KERNEL_WIDTH >= 3: + w_ptrs = w_base + (2 * stride_w_width) # [BLOCK_N] tensor + w_col2 = tl.load(w_ptrs, mask_w, other=0.0) + if KERNEL_WIDTH >= 4: + w_ptrs = w_base + (3 * stride_w_width) # [BLOCK_N] tensor + w_col3 = tl.load(w_ptrs, mask_w, other=0.0) + if KERNEL_WIDTH >= 5: + w_ptrs = w_base + (4 * stride_w_width) # [BLOCK_N] tensor + w_col4 = tl.load(w_ptrs, mask_w, other=0.0) + if KERNEL_WIDTH >= 6: + w_ptrs = w_base + (5 * stride_w_width) # [BLOCK_N] tensor + w_col5 = tl.load(w_ptrs, mask_w, other=0.0) + + x_base_1d = x_base # starting of chunk [BLOCK_N] + mask_x_1d = idx_feats < dim + + # STEP 5: compute each token + for idx_token in tl.range(seqlen): + acc = acc_preload + + matrix_w = w_col0 + matrix_x = col0 + for j in tl.static_range(KERNEL_WIDTH): + if KERNEL_WIDTH == 2: + if j == 1: # KERNEL_WIDTH-1: + matrix_w = w_col1 + x_ptrs_1d = x_base_1d + idx_token * stride_x_token # [BLOCK_N] + matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d) + elif KERNEL_WIDTH == 3: + if j == 1: + matrix_w = w_col1 + matrix_x = col1 + elif j == 2: + matrix_w = w_col2 + x_ptrs_1d = x_base_1d + idx_token * stride_x_token # [BLOCK_N] + matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d) + elif KERNEL_WIDTH == 4: + if j == 1: + matrix_w = w_col1 + matrix_x = col1 + elif j == 2: + matrix_w = w_col2 + matrix_x = col2 + elif j == 3: + matrix_w = w_col3 + x_ptrs_1d = x_base_1d + idx_token * stride_x_token # [BLOCK_N] + matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d) + elif KERNEL_WIDTH == 5: + if j == 1: + matrix_w = w_col1 + matrix_x = col1 + elif j == 2: + matrix_w = w_col2 + matrix_x = col2 + elif j == 3: + matrix_w = w_col3 + matrix_x = col3 + elif j == 4: + matrix_w = w_col4 + x_ptrs_1d = x_base_1d + idx_token * 
stride_x_token # [BLOCK_N] + matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d) + elif KERNEL_WIDTH == 6: + if j == 1: + matrix_w = w_col1 + matrix_x = col1 + elif j == 2: + matrix_w = w_col2 + matrix_x = col2 + elif j == 3: + matrix_w = w_col3 + matrix_x = col3 + elif j == 4: + matrix_w = w_col4 + matrix_x = col4 + elif j == 5: + matrix_w = w_col5 + x_ptrs_1d = x_base_1d + idx_token * stride_x_token # [BLOCK_N] + matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d) + + acc += matrix_x * matrix_w # [BLOCK_N] + + if KERNEL_WIDTH == 2: + col0 = matrix_x + elif KERNEL_WIDTH == 3: + col0 = col1 + col1 = matrix_x + elif KERNEL_WIDTH == 4: + col0 = col1 + col1 = col2 + col2 = matrix_x + elif KERNEL_WIDTH == 5: + col0 = col1 + col1 = col2 + col2 = col3 + col3 = matrix_x + elif KERNEL_WIDTH == 6: + col0 = col1 + col1 = col2 + col2 = col3 + col3 = col4 + col4 = matrix_x + + if SILU_ACTIVATION: + acc = acc / (1 + tl.exp(-acc)) + mask_1d = (idx_token < seqlen) & ( + idx_feats < dim + ) # token-index # feature-index + o_ptrs = ( + o_ptr + o_offset + idx_token * stride_o_token + (idx_feats * stride_o_dim) + ) + + tl.store(o_ptrs, acc, mask=mask_1d) + + +def causal_conv1d_update( + x: torch.Tensor, + conv_state: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor | None = None, + activation: bool | str | None = None, + conv_state_indices: torch.Tensor | None = None, + num_accepted_tokens: torch.Tensor | None = None, + query_start_loc: torch.Tensor | None = None, + max_query_len: int = -1, + pad_slot_id: int = PAD_SLOT_ID, + block_idx_last_scheduled_token: torch.Tensor | None = None, + initial_state_idx: torch.Tensor | None = None, + validate_data=False, +): + """ + x: Input tensor which can take the following shapes: + + - `[batch, dim]` - single token prediction + - `[batch, dim, seqlen]` - single or multiple tokens prediction + - `[num_tokens, dim]` - continuous batching, where num_tokens is + the total tokens of all sequences in that batch + + conv_state: (..., dim, state_len), 
where state_len >= width - 1 + weight: (dim, width) + bias: (dim,) + conv_state_indices: (batch,), dtype int32 + If not None, the conv_state is a larger tensor along the batch dim, + and we are selecting the batch coords specified by conv_state_indices. + Useful for a continuous batching scenario. + block_idx_last_scheduled_token: (batch,), dtype int32 + The pointer into conv_state_indices, where the last cache block to be filled is located. + initial_state_idx: (batch,), dtype int32 + The pointer into conv_state_indices, where the cache block containing the initial state is located. + num_accepted_tokens: (batch,), dtype int32 + If not None, it indicates the number of accepted tokens for each + sequence in the batch. + This is used in speculative decoding, where the conv_state is updated + in a sliding window manner. + query_start_loc: (batch + 1,) int32 + If not None, the input is given in a varlen fashion and this indicates + the starting index of each sequence in the batch. + max_query_len: int + If query_start_loc is not None, this indicates the maximum query + length in the batch.
+ pad_slot_id: int + if conv_state_indices is passed, lets the kernel identify padded + entries that will not be processed, + for example: conv_state_indices = [pad_slot_id, 1 ,20 ,pad_slot_id] + in this case, the kernel will not process entries at + indices 0 and 3 + out: (batch, dim) or (batch, dim, seqlen) or (num_tokens, dim), same shape as `x` + """ + if validate_data: + assert pad_slot_id is not None + assert x.stride(1) == 1 + if isinstance(activation, bool): + activation = "silu" if activation is True else None + elif activation is not None: + assert activation in ["silu", "swish"] + + original_x_dtype = x.dtype + x = x.to(conv_state.dtype) + unsqueeze = query_start_loc is None and x.dim() == 2 + if unsqueeze: + # make it (batch, dim, seqlen) with seqlen == 1 + x = x.unsqueeze(-1) + if query_start_loc is None: + batch, dim, seqlen = x.shape + else: + assert conv_state_indices is not None + batch = conv_state_indices.size(0) + dim = x.size(1) + seqlen = max_query_len + _, width = weight.shape + # conv_state: (..., dim, state_len), where state_len >= width - 1 + num_cache_lines, _, state_len = conv_state.size() + + if validate_data: + assert dim == weight.size(0) + assert conv_state.stride(-2) == 1, ( + f"ERROR: expect contiguous along feat-dim of conv_state (currently stride={conv_state.stride()})" + ) + assert state_len >= width - 1 + # when above happens, we don't shift-left to keep any records in conv_state + assert dim == conv_state.size(1) + if conv_state_indices is None: + assert conv_state.size(0) >= batch + else: + assert (batch,) == conv_state_indices.shape + + assert num_cache_lines >= batch + assert weight.stride(1) == 1 # Need this + + # adopt the strategy in vLLM that overwrite on 'x' directly, rather than creating a new tensor 'o' + out = x + stride_w_dim, stride_w_width = weight.stride() + + if query_start_loc is None: + # X (batch, dim, seqlen) + stride_x_seq, stride_x_dim, stride_x_token = x.stride() + stride_o_seq, stride_o_dim, 
stride_o_token = out.stride() + else: + # X (dim, cu_seqlen) + stride_x_token, stride_x_dim = x.stride() + stride_x_seq = 0 + stride_o_token, stride_o_dim = out.stride() + stride_o_seq = 0 + + stride_istate_seq, stride_istate_dim, stride_istate_token = conv_state.stride() + stride_state_indices = ( + conv_state_indices.stride(0) if conv_state_indices is not None else 0 + ) + if num_accepted_tokens is not None: + state_len = width - 1 + (seqlen - 1) # effective state_len needed + else: + state_len = width - 1 + np2_statelen = triton.next_power_of_2(state_len) + + def grid(META): + return ( + batch, + triton.cdiv(dim, META["BLOCK_N"]), + ) + + _causal_conv1d_update_kernel[grid]( + # Pointers to matrices + x, + weight, + bias, + conv_state, + conv_state_indices, + num_accepted_tokens, + query_start_loc, + block_idx_last_scheduled_token, + initial_state_idx, + out, + # Matrix dimensions + batch, + dim, + seqlen, + state_len, + num_cache_lines, + # stride + stride_x_seq, + stride_x_dim, + stride_x_token, + stride_w_dim, + stride_w_width, + stride_istate_seq, + stride_istate_dim, + stride_istate_token, + stride_state_indices, + stride_o_seq, + stride_o_dim, + stride_o_token, + # others + pad_slot_id, + # META + HAS_BIAS=bias is not None, + KERNEL_WIDTH=width, + SILU_ACTIVATION=activation in ["silu", "swish"], + IS_VARLEN=query_start_loc is not None, + IS_APC_ENABLED=block_idx_last_scheduled_token is not None, + IS_SPEC_DECODING=num_accepted_tokens is not None, + NP2_STATELEN=np2_statelen, + USE_PAD_SLOT=pad_slot_id is not None, + BLOCK_N=256, + ) + if unsqueeze: + out = out.squeeze(-1) + return out.to(original_x_dtype) diff --git a/model_executor/layers/mamba/ops/layernorm_gated.py b/model_executor/layers/mamba/ops/layernorm_gated.py new file mode 100644 index 0000000..b592906 --- /dev/null +++ b/model_executor/layers/mamba/ops/layernorm_gated.py @@ -0,0 +1,172 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project 
# Copyright (c) 2024, Tri Dao.
# Adapted from https://github.com/state-spaces/mamba/blob/60dadf2e0ee730ac337035d5533de10bc26e4847/mamba_ssm/ops/triton/layernorm_gated.py

import torch

from vllm.triton_utils import tl, triton


# Single-pass gated LayerNorm / RMSNorm forward kernel.
#
# Launch grid is (rows, groups): program (row, group) normalizes the N
# contiguous features of one group in one row.  When Z is given the result is
# gated with z * sigmoid(z), either before the normalization
# (NORM_BEFORE_GATE == False) or after it (NORM_BEFORE_GATE == True).
@triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
@triton.heuristics({"HAS_Z": lambda args: args["Z"] is not None})
@triton.jit
def _layer_norm_fwd_1pass_kernel(
    X,  # pointer to the input
    Y,  # pointer to the output
    W,  # pointer to the weights
    B,  # pointer to the biases
    Z,  # pointer to the other branch
    Mean,  # pointer to the mean
    Rstd,  # pointer to the 1/std
    stride_x_row: tl.int64,
    stride_y_row: tl.int64,
    stride_z_row: tl.int64,
    M: tl.int64,  # number of rows in X
    N: tl.int64,  # number of columns in X
    eps,  # epsilon to avoid division by zero
    BLOCK_N: tl.constexpr,
    HAS_BIAS: tl.constexpr,
    HAS_Z: tl.constexpr,
    NORM_BEFORE_GATE: tl.constexpr,
    IS_RMS_NORM: tl.constexpr,
):
    # Map the program id to the row of X and Y it should compute.
    row = tl.program_id(0)
    group = tl.program_id(1)
    X += row * stride_x_row + group * N
    Y += row * stride_y_row + group * N
    if HAS_Z:
        Z += row * stride_z_row + group * N
    # Mean / Rstd are laid out as [group, row]; Mean only exists for LayerNorm.
    if not IS_RMS_NORM:
        Mean += group * M
    Rstd += group * M
    W += group * N
    if HAS_BIAS:
        B += group * N
    # Compute mean and variance
    cols = tl.arange(0, BLOCK_N)
    x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
    if HAS_Z and not NORM_BEFORE_GATE:
        # `other=0.0` keeps the padded lanes (cols >= N) at exactly zero.
        # Without it the masked-out values are undefined and would be
        # multiplied into `x`, which is summed unmasked for the mean below.
        z = tl.load(Z + cols, mask=cols < N, other=0.0).to(tl.float32)
        x *= z * tl.sigmoid(z)
    if not IS_RMS_NORM:
        mean = tl.sum(x, axis=0) / N
        tl.store(Mean + row, mean)
        xbar = tl.where(cols < N, x - mean, 0.0)
        var = tl.sum(xbar * xbar, axis=0) / N
    else:
        xbar = tl.where(cols < N, x, 0.0)
        var = tl.sum(xbar * xbar, axis=0) / N
    rstd = 1 / tl.sqrt(var + eps)
    tl.store(Rstd + row, rstd)
    # Normalize and apply linear transformation
    mask = cols < N
    w = tl.load(W + cols, mask=mask).to(tl.float32)
    if HAS_BIAS:
        b = tl.load(B + cols, mask=mask).to(tl.float32)
    x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
    y = x_hat * w + b if HAS_BIAS else x_hat * w
    if HAS_Z and NORM_BEFORE_GATE:
        # Padded lanes are never stored, so their value does not matter here.
        z = tl.load(Z + cols, mask=mask).to(tl.float32)
        y *= z * tl.sigmoid(z)
    # Write output
    tl.store(Y + cols, y, mask=mask)


def _layer_norm_fwd(
    x,
    weight,
    bias,
    eps,
    z=None,
    out=None,
    group_size=None,
    norm_before_gate=True,
    is_rms_norm=False,
):
    """Launch the fused (optionally gated, optionally grouped) norm kernel.

    Args:
        x: 2D input of shape (M, N), unit stride along the last dim.
        weight: (N,) scale, unit stride.
        bias: optional (N,) shift, unit stride.
        eps: value added to the variance before the reciprocal square root.
        z: optional (M, N) gate branch, unit stride along the last dim.
        out: optional preallocated output with the same shape as ``x``;
            allocated with ``torch.empty_like(x)`` when omitted.
        group_size: features per normalization group; defaults to ``N`` and
            must divide ``N``.
        norm_before_gate: normalize first and gate afterwards when True,
            gate first and normalize afterwards when False.
        is_rms_norm: use RMSNorm (no mean subtraction) instead of LayerNorm.

    Returns:
        ``(out, mean, rstd)``. ``mean`` is ``None`` for RMSNorm; ``mean`` and
        ``rstd`` are float32 buffers of ``ngroups * M`` elements.

    Raises:
        RuntimeError: if one group does not fit in a single 64KB block.
    """
    M, N = x.shape
    if group_size is None:
        group_size = N
    assert N % group_size == 0
    ngroups = N // group_size

    # Layout checks: the kernel addresses features with unit stride.
    assert x.stride(-1) == 1
    if z is not None:
        assert z.stride(-1) == 1
        assert z.shape == (M, N)
    assert weight.shape == (N,)
    assert weight.stride(-1) == 1
    if bias is not None:
        assert bias.stride(-1) == 1
        assert bias.shape == (N,)

    # allocate output
    if out is None:
        out = torch.empty_like(x)
    else:
        assert out.shape == x.shape
    assert out.stride(-1) == 1

    stats_shape = (ngroups * M,)
    mean = None
    if not is_rms_norm:
        mean = torch.empty(stats_shape, dtype=torch.float32, device=x.device)
    rstd = torch.empty(stats_shape, dtype=torch.float32, device=x.device)

    # Less than 64KB per feature: enqueue fused kernel
    MAX_FUSED_SIZE = 65536 // x.element_size()
    BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size))
    if group_size > BLOCK_N:
        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")

    # heuristics for number of warps
    num_warps = min(max(BLOCK_N // 256, 1), 8)
    grid = (M, ngroups)
    z_row_stride = z.stride(0) if z is not None else 0
    with torch.cuda.device(x.device.index):
        _layer_norm_fwd_1pass_kernel[grid](
            x,
            out,
            weight,
            bias,
            z,
            mean,
            rstd,
            x.stride(0),
            out.stride(0),
            z_row_stride,
            M,
            group_size,
            eps,
            BLOCK_N=BLOCK_N,
            NORM_BEFORE_GATE=norm_before_gate,
            IS_RMS_NORM=is_rms_norm,
            num_warps=num_warps,
        )
    return out, mean, rstd


def rms_norm_gated(
    x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True
):
    """Gated RMSNorm over the last dimension of ``x``.

    ``x`` (and ``z`` when given, which must have the same shape) is flattened
    to 2D, made contiguous along the last dim if necessary, normalized by
    ``_layer_norm_fwd`` with ``is_rms_norm=True``, and reshaped back.

    Returns:
        Tensor with the original shape of ``x``.
    """
    x_shape_og = x.shape
    # reshape input data into 2D tensor
    x = x.reshape(-1, x.shape[-1])
    if x.stride(-1) != 1:
        x = x.contiguous()
    if z is not None:
        assert z.shape == x_shape_og
        z = z.reshape(-1, z.shape[-1])
        if z.stride(-1) != 1:
            z = z.contiguous()
    weight = weight.contiguous()
    if bias is not None:
        bias = bias.contiguous()

    y, _, _ = _layer_norm_fwd(
        x,
        weight,
        bias,
        eps,
        z=z,
        group_size=group_size,
        norm_before_gate=norm_before_gate,
        is_rms_norm=True,
    )
    return y.reshape(x_shape_og)


# ---------------------------------------------------------------------------
# Patch boundary: git diff header of the next file, kept verbatim as comments.
# diff --git a/model_executor/layers/mamba/ops/mamba_ssm.py b/model_executor/layers/mamba/ops/mamba_ssm.py
# new file mode 100644
# index 0000000..53fd5d5
# --- /dev/null
# +++ b/model_executor/layers/mamba/ops/mamba_ssm.py
# @@ -0,0 +1,478 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# Copyright (c) 2024, Tri Dao, Albert Gu.
# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/selective_state_update.py

import torch
from packaging import version

from vllm import _custom_ops as ops
from vllm.attention.backends.utils import PAD_SLOT_ID
from vllm.triton_utils import HAS_TRITON, tl, triton

TRITON3 = HAS_TRITON and (version.parse(triton.__version__) >= version.parse("3.0.0"))

# softplus(dt) = log(1 + exp(dt)); inputs above 20 are passed through unchanged.
# The two variants differ only in which Triton math entry point they call.
if TRITON3:

    @triton.jit
    def softplus(dt):
        dt = tl.where(dt <= 20.0, tl.math.log(tl.math.exp(dt) + 1), dt)
        return dt
else:

    @triton.jit
    def softplus(dt):
        dt = tl.where(dt <= 20.0, tl.math.log1p(tl.exp(dt)), dt)
        return dt


# Single-step selective-scan (SSM) state update.
#
# Launch grid is (cdiv(dim, BLOCK_SIZE_M), batch, nheads).  Each program reads
# a [BLOCK_SIZE_M, BLOCK_SIZE_DSTATE] tile of the state, applies
#     state = state * exp(A * dt) + (B * dt) * x
# stores it to the destination state slot and writes out = sum(state * C)
# (plus the optional D skip term and z gate).
@triton.heuristics({"HAS_DT_BIAS": lambda args: args["dt_bias_ptr"] is not None})
@triton.heuristics({"HAS_D": lambda args: args["D_ptr"] is not None})
@triton.heuristics({"HAS_Z": lambda args: args["z_ptr"] is not None})
@triton.heuristics(
    {
        "HAS_STATE_BATCH_INDICES": lambda args: args["state_batch_indices_ptr"]
        is not None
    }
)
@triton.heuristics(
    {"BLOCK_SIZE_DSTATE": lambda args: triton.next_power_of_2(args["dstate"])}
)
@triton.jit
def _selective_scan_update_kernel(
    # Pointers to matrices
    state_ptr,
    x_ptr,
    dt_ptr,
    dt_bias_ptr,
    A_ptr,
    B_ptr,
    C_ptr,
    D_ptr,
    z_ptr,
    out_ptr,
    state_batch_indices_ptr,
    dst_state_batch_indices_ptr,
    pad_slot_id,
    # Matrix dimensions
    batch,
    nheads,
    dim,
    dstate,
    nheads_ngroups_ratio,
    # Strides
    stride_state_batch,
    stride_state_head,
    stride_state_dim,
    stride_state_dstate,
    stride_x_batch,
    stride_x_head,
    stride_x_dim,
    stride_dt_batch,
    stride_dt_head,
    stride_dt_dim,
    stride_dt_bias_head,
    stride_dt_bias_dim,
    stride_A_head,
    stride_A_dim,
    stride_A_dstate,
    stride_B_batch,
    stride_B_group,
    stride_B_dstate,
    stride_C_batch,
    stride_C_group,
    stride_C_dstate,
    stride_D_head,
    stride_D_dim,
    stride_z_batch,
    stride_z_head,
    stride_z_dim,
    stride_out_batch,
    stride_out_head,
    stride_out_dim,
    # Meta-parameters
    DT_SOFTPLUS: tl.constexpr,
    TIE_HDIM: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr,
    HAS_DT_BIAS: tl.constexpr,
    HAS_D: tl.constexpr,
    HAS_Z: tl.constexpr,
    HAS_STATE_BATCH_INDICES: tl.constexpr,
    BLOCK_SIZE_DSTATE: tl.constexpr,
):
    pid_m = tl.program_id(axis=0)
    pid_b = tl.program_id(axis=1)
    pid_h = tl.program_id(axis=2)

    # If HAS_STATE_BATCH_INDICES is true, then the ssm state's batch coordinate
    # is taken from the state_batch_indices_ptr. Otherwise, the state coordinate
    # is the same as the batch id.
    if HAS_STATE_BATCH_INDICES:
        dst_state_batch_indices_ptr += pid_b
        dst_state_batch_idx = tl.load(dst_state_batch_indices_ptr).to(tl.int64)
        dst_state_ptr = state_ptr + (
            dst_state_batch_idx * stride_state_batch + pid_h * stride_state_head
        )
        state_batch_indices_ptr += pid_b
        state_batch_idx = tl.load(state_batch_indices_ptr).to(tl.int64)
        state_ptr += state_batch_idx * stride_state_batch + pid_h * stride_state_head
    else:
        dst_state_ptr = (
            state_ptr + pid_b * stride_state_batch + pid_h * stride_state_head
        )
        state_ptr += pid_b * stride_state_batch + pid_h * stride_state_head

    # Advance every operand to this program's (batch, head) slice.
    x_ptr += pid_b * stride_x_batch + pid_h * stride_x_head
    dt_ptr += pid_b * stride_dt_batch + pid_h * stride_dt_head
    if HAS_DT_BIAS:
        dt_bias_ptr += pid_h * stride_dt_bias_head
    A_ptr += pid_h * stride_A_head
    # B and C are shared by all heads of a group.
    B_ptr += pid_b * stride_B_batch + (pid_h // nheads_ngroups_ratio) * stride_B_group
    C_ptr += pid_b * stride_C_batch + (pid_h // nheads_ngroups_ratio) * stride_C_group
    if HAS_Z:
        z_ptr += pid_b * stride_z_batch + pid_h * stride_z_head
    out_ptr += pid_b * stride_out_batch + pid_h * stride_out_head

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = tl.arange(0, BLOCK_SIZE_DSTATE)
    state_ptrs = state_ptr + (
        offs_m[:, None] * stride_state_dim + offs_n[None, :] * stride_state_dstate
    )
    dst_state_ptrs = dst_state_ptr + (
        offs_m[:, None] * stride_state_dim + offs_n[None, :] * stride_state_dstate
    )
    x_ptrs = x_ptr + offs_m * stride_x_dim
    dt_ptrs = dt_ptr + offs_m * stride_dt_dim
    if HAS_DT_BIAS:
        dt_bias_ptrs = dt_bias_ptr + offs_m * stride_dt_bias_dim
    if HAS_D:
        D_ptr += pid_h * stride_D_head
    A_ptrs = A_ptr + (
        offs_m[:, None] * stride_A_dim + offs_n[None, :] * stride_A_dstate
    )
    B_ptrs = B_ptr + offs_n * stride_B_dstate
    C_ptrs = C_ptr + offs_n * stride_C_dstate
    if HAS_D:
        D_ptrs = D_ptr + offs_m * stride_D_dim
    if HAS_Z:
        z_ptrs = z_ptr + offs_m * stride_z_dim
    out_ptrs = out_ptr + offs_m * stride_out_dim

    # Padded batch entries neither read nor write the state.
    mask = (offs_m[:, None] < dim) & (offs_n[None, :] < dstate)
    if HAS_STATE_BATCH_INDICES:
        mask &= state_batch_idx != pad_slot_id
    state = tl.load(state_ptrs, mask=mask, other=0.0)

    x = tl.load(x_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
    if not TIE_HDIM:
        dt = tl.load(dt_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
        if HAS_DT_BIAS:
            dt += tl.load(dt_bias_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
        if DT_SOFTPLUS:
            dt = softplus(dt)
        A = tl.load(
            A_ptrs, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate), other=0.0
        ).to(tl.float32)
        dA = tl.exp(A * dt[:, None])
    else:
        # A, dt and dt_bias are broadcast over the head dim: load one scalar.
        dt = tl.load(dt_ptr).to(tl.float32)
        if HAS_DT_BIAS:
            dt += tl.load(dt_bias_ptr).to(tl.float32)
        if DT_SOFTPLUS:
            dt = softplus(dt)
        A = tl.load(A_ptr).to(tl.float32)
        dA = tl.exp(A * dt)  # scalar, not a matrix

    B = tl.load(B_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32)
    C = tl.load(C_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32)
    if HAS_D:
        D = tl.load(D_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
    if HAS_Z:
        z = tl.load(z_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)

    dB = B[None, :] * dt[:, None] if not TIE_HDIM else B * dt
    state = state * dA + dB * x[:, None]

    mask = (offs_m[:, None] < dim) & (offs_n[None, :] < dstate)
    if HAS_STATE_BATCH_INDICES:
        mask &= state_batch_idx != pad_slot_id
    tl.store(dst_state_ptrs, state, mask=mask)
    out = tl.sum(state * C[None, :], axis=1)
    if HAS_D:
        out += x * D
    if HAS_Z:
        out *= z * tl.sigmoid(z)
    tl.store(out_ptrs, out, mask=offs_m < dim)


def selective_state_update(
    state,
    x,
    dt,
    A,
    B,
    C,
    D=None,
    z=None,
    dt_bias=None,
    dt_softplus=False,
    state_batch_indices=None,
    dst_state_batch_indices=None,
    pad_slot_id=PAD_SLOT_ID,
    out=None,
):
    """Advance the SSM state by one token and compute that token's output.

    Argument:
        state: (batch, dim, dstate) or (batch, nheads, dim, dstate),
            updated in place (or written to the ``dst_state_batch_indices``
            slots when those are given)
        x: (batch, dim) or (batch, nheads, dim)
        dt: (batch, dim) or (batch, nheads, dim)
        A: (dim, dstate) or (nheads, dim, dstate)
        B: (batch, dstate) or (batch, ngroups, dstate)
        C: (batch, dstate) or (batch, ngroups, dstate)
        D: (dim,) or (nheads, dim)
        z: (batch, dim) or (batch, nheads, dim)
        dt_bias: (dim,) or (nheads, dim)
        dt_softplus: apply softplus to ``dt`` (after adding ``dt_bias``)
        state_batch_indices: (batch,) optional state slot to read for each
            batch entry
        dst_state_batch_indices: (batch,) optional state slot to write;
            defaults to ``state_batch_indices`` (in-place update)
        pad_slot_id: int
            if state_batch_indices is passed, lets the kernel identify padded
            entries whose state will not be read or written,
            for example: state_batch_indices = [pad_slot_id, 1, 20, pad_slot_id]
            in this case, the state of entries 0 and 3 is left untouched
        out: Preallocated ssm output tensor with the same shape as x,
            updated in place. When omitted, a tensor like ``x`` is allocated.

    Return:
        The output tensor (``out`` itself when it was supplied), in the shape
        it had before the internal head dimension was added.
    """
    # `out` defaults to None in the signature; dereferencing it directly would
    # raise AttributeError, so allocate a matching buffer instead.
    if out is None:
        out = torch.empty_like(x)
    result = out

    # Normalize every operand to the multi-head layout used by the kernel.
    if state.dim() == 3:
        state = state.unsqueeze(1)
    if x.dim() == 2:
        x = x.unsqueeze(1)
    if dt.dim() == 2:
        dt = dt.unsqueeze(1)
    if A.dim() == 2:
        A = A.unsqueeze(0)
    if B.dim() == 2:
        B = B.unsqueeze(1)
    if C.dim() == 2:
        C = C.unsqueeze(1)
    if D is not None and D.dim() == 1:
        D = D.unsqueeze(0)
    if z is not None and z.dim() == 2:
        z = z.unsqueeze(1)
    if dt_bias is not None and dt_bias.dim() == 1:
        dt_bias = dt_bias.unsqueeze(0)
    if out.dim() == 2:
        out = out.unsqueeze(1)

    _, nheads, dim, dstate = state.shape
    batch = x.shape[0]

    assert x.shape == (batch, nheads, dim)
    assert dt.shape == x.shape
    assert A.shape == (nheads, dim, dstate)
    ngroups = B.shape[1]
    assert nheads % ngroups == 0, "nheads must be divisible by ngroups"
    assert B.shape == (batch, ngroups, dstate)
    assert C.shape == B.shape
    if D is not None:
        assert D.shape == (nheads, dim)
    if z is not None:
        assert z.shape == x.shape
    if dt_bias is not None:
        assert dt_bias.shape == (nheads, dim)
    if state_batch_indices is not None:
        assert state_batch_indices.shape == (batch,)
    if dst_state_batch_indices is not None:
        assert dst_state_batch_indices.shape == (batch,)
    else:
        # revert to the default behavior of in-place state updates
        dst_state_batch_indices = state_batch_indices
    assert out.shape == x.shape

    def grid(META):
        return (triton.cdiv(dim, META["BLOCK_SIZE_M"]), batch, nheads)

    # Optional operands contribute zero strides when absent.  The fallbacks
    # must be tuples of the right arity: star-unpacking a bare 0 raises
    # TypeError and would also shift every following positional argument.
    dt_bias_strides = (
        (dt_bias.stride(0), dt_bias.stride(1)) if dt_bias is not None else (0, 0)
    )
    D_strides = (D.stride(0), D.stride(1)) if D is not None else (0, 0)
    z_strides = (z.stride(0), z.stride(1), z.stride(2)) if z is not None else (0, 0, 0)

    # We don't want autotune since it will overwrite the state
    # We instead tune by hand.
    if dstate <= 16:
        BLOCK_SIZE_M, num_warps = 32, 4
    elif dstate <= 32:
        BLOCK_SIZE_M, num_warps = 16, 4
    elif dstate <= 64:
        BLOCK_SIZE_M, num_warps = 8, 4
    elif dstate <= 128:
        BLOCK_SIZE_M, num_warps = 4, 4
    else:
        BLOCK_SIZE_M, num_warps = 4, 8

    # The scalar fast path needs A, dt and (if present) dt_bias broadcast over
    # the head dim.  A missing dt_bias imposes no constraint.
    tie_hdim = (
        A.stride(-1) == 0
        and A.stride(-2) == 0
        and dt.stride(-1) == 0
        and (dt_bias is None or dt_bias.stride(-1) == 0)
    )
    with torch.cuda.device(x.device.index):
        _selective_scan_update_kernel[grid](
            state,
            x,
            dt,
            dt_bias,
            A,
            B,
            C,
            D,
            z,
            out,
            state_batch_indices,
            dst_state_batch_indices,
            pad_slot_id,
            batch,
            nheads,
            dim,
            dstate,
            nheads // ngroups,
            state.stride(0),
            state.stride(1),
            state.stride(2),
            state.stride(3),
            x.stride(0),
            x.stride(1),
            x.stride(2),
            dt.stride(0),
            dt.stride(1),
            dt.stride(2),
            *dt_bias_strides,
            A.stride(0),
            A.stride(1),
            A.stride(2),
            B.stride(0),
            B.stride(1),
            B.stride(2),
            C.stride(0),
            C.stride(1),
            C.stride(2),
            *D_strides,
            z_strides[0],
            z_strides[1],
            z_strides[2],
            out.stride(0),
            out.stride(1),
            out.stride(2),
            dt_softplus,
            tie_hdim,
            BLOCK_SIZE_M,
            num_warps=num_warps,
        )
    return result


def selective_scan_fn(
    u,
    ssm_states,
    delta,
    A,
    B,
    C,
    D=None,
    z=None,
    delta_bias=None,
    delta_softplus=False,
    query_start_loc=None,
    cache_indices=None,
    has_initial_state=None,
    pad_slot_id=PAD_SLOT_ID,
    block_size=1024,
    block_idx_first_scheduled_token=None,
    block_idx_last_scheduled_token=None,
    initial_state_idx=None,
) -> torch.Tensor:
    """Run the selective scan through the ``ops.selective_scan_fwd`` custom op.

    u: (dim, total_length) for varlen or (batch, dim, seqlen)
        applies changes in place.
    ssm_states: (batch, dim, dstate) or (batch, nheads, dim, dstate)
        applies changes in place.
    delta: (dim, total_length) for varlen or (batch, dim, seqlen)
    A: (dim, dstate)
    B: (ngroups, dstate, total_length) for varlen or
        (batch,ngroups,dstate,seqlen)
    C: (ngroups, dstate, total_length) for varlen or
        (batch,ngroups,dstate,seqlen)
    D: (dim,)
    z: (dim, total_length) for varlen or (batch, dim, seqlen)
    delta_bias: (dim,)
    delta_softplus: bool, forwarded unchanged to the custom op
    query_start_loc: (batch + 1) int32
        The cumulative sequence lengths of the sequences in
        the batch, used to index into sequence. prepended with 0.
        for example: query_start_loc = torch.Tensor([0,10,16,17]),
        x.shape=(dim,17)
    cache_indices: (batch) int32
        For each sequence, the index of the ssm_state used as input and output
        - Without APC: (batch,) - single state index per batch item
        - With APC: (batch, max_positions) - cache block indices for read/write
          Each non-zero value indicates a cache block to load from and/or write to.
    has_initial_state: (batch) bool
        A tensor populated with ones and zeros,
        indicating whether the ssm_state at the corresponding index should be
        used as initial state. Not providing this argument assumes
        there's no initial state
    pad_slot_id: int
        if cache_indices is passed, lets the kernel identify padding entries
        that will not be processed,
        for example: cache_indices = [pad_slot_id, 1 ,20 ,pad_slot_id]
        in this case, the kernel will not process entries at indices 0 and 3
    block_size: int
        The block size to align the cached states to
    block_idx_first_scheduled_token: (batch,), dtype int32
        The pointer into cache_indices, where the first
        cache block to be filled is located.
    block_idx_last_scheduled_token: (batch,), dtype int32
        The pointer into cache_indices, where the last cache block
        to be filled is located.
    initial_state_idx: (batch,), dtype int32
        The pointer into cache_indices, where the cache block
        containing the initial state is located.
    returns
        output: (dim, total_length) for varlen or (batch, dim, seqlen)
            The result is written in place: into ``z`` when ``z`` is given,
            otherwise into ``delta``; that tensor is returned.
    """
    # The custom op is handed tensors with unit stride along the last dim.
    if u.stride(-1) != 1:
        u = u.contiguous()
    if delta.stride(-1) != 1:
        delta = delta.contiguous()
    if D is not None:
        D = D.contiguous()
    if B.stride(-1) != 1:
        B = B.contiguous()
    if C.stride(-1) != 1:
        C = C.contiguous()
    if z is not None and z.stride(-1) != 1:
        z = z.contiguous()

    # Insert the missing group dimension (ngroups == 1) for B and C.
    is_varlen = query_start_loc is not None
    if B.dim() == 3 and not is_varlen:
        B = B.unsqueeze(1)
    if B.dim() == 2 and is_varlen:
        B = B.unsqueeze(0)
    if C.dim() == 3 and not is_varlen:
        C = C.unsqueeze(1)
    if C.dim() == 2 and is_varlen:
        C = C.unsqueeze(0)

    ops.selective_scan_fwd(
        u,
        delta,
        A,
        B,
        C,
        D,
        z,
        delta_bias,
        delta_softplus,
        query_start_loc,
        cache_indices,
        has_initial_state,
        ssm_states,
        pad_slot_id,
        block_size,
        block_idx_first_scheduled_token,
        block_idx_last_scheduled_token,
        initial_state_idx,
    )

    if z is None:
        return delta  # output written inplace to delta
    return z  # output written inplace to z


# ---------------------------------------------------------------------------
# Patch boundary: git diff header of the next file, kept verbatim as comments.
# diff --git a/model_executor/layers/mamba/ops/ssd_bmm.py b/model_executor/layers/mamba/ops/ssd_bmm.py
# new file mode 100644
# index 0000000..ac5ffc1
# --- /dev/null
# +++ b/model_executor/layers/mamba/ops/ssd_bmm.py
# @@ -0,0 +1,211 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# Copyright (c) 2024, Tri Dao, Albert Gu.
+# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_bmm.py + +# ruff: noqa: E501,SIM102 + +import torch + +from vllm.triton_utils import tl, triton + + +@triton.autotune( + configs=[ + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64}, + num_stages=3, + num_warps=8, + ), + triton.Config( + {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32}, + num_stages=5, + num_warps=2, + ), + triton.Config( + {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32}, + num_stages=5, + num_warps=2, + ), + triton.Config( + {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=2, + ), + ], + key=["chunk_size", "K", "IS_CAUSAL"], +) +@triton.jit +def _bmm_chunk_fwd_kernel( + # Pointers to matrices + a_ptr, + b_ptr, + out_ptr, + cu_chunk_seqlens_ptr, + # Matrix dimensions + seqlen, + chunk_size: tl.constexpr, + K: tl.constexpr, + ngroups: tl.constexpr, + stride_a_seqlen: tl.int64, + stride_a_head: tl.int64, + stride_ak: tl.constexpr, + stride_b_seqlen: tl.int64, + stride_b_head: tl.int64, + stride_bk: tl.constexpr, + stride_out_chunk: tl.int64, + stride_out_head: tl.int64, + stride_outm: tl.int64, + stride_outn: tl.constexpr, + # Meta-parameters + IS_CAUSAL: tl.constexpr, + dot_dtype: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, 
+): + pid_ch = tl.program_id(axis=1).to(tl.int64) + pid_c = pid_ch // ngroups + pid_h = pid_ch - pid_c * ngroups + num_pid_n = tl.cdiv(chunk_size, BLOCK_SIZE_N) + pid_m = tl.program_id(axis=0) // num_pid_n + pid_n = tl.program_id(axis=0) % num_pid_n + if IS_CAUSAL: + if pid_n * BLOCK_SIZE_N >= (pid_m + 1) * BLOCK_SIZE_M: + return + + chunk_seqlen_start = tl.load(cu_chunk_seqlens_ptr + pid_c) + chunk_seqlen_end = tl.load(cu_chunk_seqlens_ptr + pid_c + 1) + + a_ptr += chunk_seqlen_start * stride_a_seqlen + pid_h * stride_a_head + b_ptr += chunk_seqlen_start * stride_b_seqlen + pid_h * stride_b_head + + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a_ptr + (offs_m[:, None] * stride_a_seqlen + offs_k[None, :] * stride_ak) + b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_n[None, :] * stride_b_seqlen) + chunk_size_limit = chunk_seqlen_end - chunk_seqlen_start + + acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # compute a * b.T + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + a = tl.load( + a_ptrs, + mask=(offs_m[:, None] < chunk_size_limit) + & (offs_k[None, :] < K - k * BLOCK_SIZE_K), + other=0.0, + ).to(dot_dtype) + b = tl.load( + b_ptrs, + mask=(offs_k[:, None] < K - k * BLOCK_SIZE_K) + & (offs_n[None, :] < chunk_size_limit), + other=0.0, + ).to(dot_dtype) + acc += tl.dot(a, b) + a_ptrs += BLOCK_SIZE_K * stride_ak + b_ptrs += BLOCK_SIZE_K * stride_bk + + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + + out = acc.to(out_ptr.dtype.element_ty) + out_ptr += pid_c * stride_out_chunk + pid_h * stride_out_head + out_ptrs = out_ptr + (stride_outm * offs_m[:, None] + offs_n[None, :] * stride_outn) + tl.store( + out_ptrs, + out, + mask=(offs_m[:, None] < chunk_size) & (offs_n[None, :] < chunk_size), + ) + + +def _bmm_chunk_fwd(a, b, chunk_size, 
cu_chunk_seqlens, causal=False, output_dtype=None): + """ + Argument: + a: (seqlen, ngroups, k) + b: (seqlen, ngroups, k) + chunk_size: int + cu_chunk_seqlens: (nchunks+1,) + causal: if True, then out[i, j] for i < j will be arbitrary, only out[i, j] for i >= j are + guaranteed to be correct. + Return: + out: (nchunks, ngroups, chunk_size, chunk_size) + """ + seqlen, ngroups, k = a.shape + assert b.shape == a.shape + if a.stride(-1) != 1 and a.stride(0) != 1: + a = a.contiguous() + if b.stride(-1) != 1 and b.stride(0) != 1: + b = b.contiguous() + + nchunks = len(cu_chunk_seqlens) - 1 + # Allocates output. + out_dtype = a.dtype if output_dtype is None else output_dtype + out = torch.empty( + (nchunks, ngroups, chunk_size, chunk_size), device=a.device, dtype=out_dtype + ) + dot_dtype = ( + tl.bfloat16 + if a.dtype == torch.bfloat16 or b.dtype == torch.bfloat16 + else ( + tl.float16 + if a.dtype == torch.float16 or b.dtype == torch.float16 + else tl.float32 + ) + ) + grid = lambda META: ( + triton.cdiv(chunk_size, META["BLOCK_SIZE_M"]) + * triton.cdiv(chunk_size, META["BLOCK_SIZE_N"]), + nchunks * ngroups, + ) + with torch.cuda.device(a.device.index): + _bmm_chunk_fwd_kernel[grid]( + a_ptr=a, + b_ptr=b, + out_ptr=out, + cu_chunk_seqlens_ptr=cu_chunk_seqlens, + seqlen=seqlen, + chunk_size=chunk_size, + K=k, + ngroups=ngroups, + stride_a_seqlen=a.stride(0), + stride_a_head=a.stride(1), + stride_ak=a.stride(2), + stride_b_seqlen=b.stride(0), + stride_b_head=b.stride(1), + stride_bk=b.stride(2), + stride_out_chunk=out.stride(0), + stride_out_head=out.stride(1), + stride_outm=out.stride(-2), + stride_outn=out.stride(-1), + IS_CAUSAL=causal, + dot_dtype=dot_dtype, + ) + return out diff --git a/model_executor/layers/mamba/ops/ssd_chunk_scan.py b/model_executor/layers/mamba/ops/ssd_chunk_scan.py new file mode 100644 index 0000000..661c884 --- /dev/null +++ b/model_executor/layers/mamba/ops/ssd_chunk_scan.py @@ -0,0 +1,456 @@ +# SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright (c) 2024, Tri Dao, Albert Gu. +# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_chunk_scan.py + +# ruff: noqa: E501,SIM102 + +from packaging import version + +from vllm.triton_utils import tl, triton + +TRITON_22 = version.parse(triton.__version__) >= version.parse("2.2.0") + + +@triton.autotune( + configs=[ + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64}, + num_stages=3, + num_warps=8, + ), + triton.Config( + {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32}, + num_stages=5, + num_warps=2, + ), + triton.Config( + {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32}, + num_stages=5, + num_warps=2, + ), + triton.Config( + {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=2, + ), + ], + key=["chunk_size", "hdim", "dstate", "IS_CAUSAL"], +) +@triton.jit +def _chunk_scan_fwd_kernel( + # Pointers to matrices + cb_ptr, + x_ptr, + z_ptr, + out_ptr, + dt_ptr, + dA_cumsum_ptr, + seq_idx_ptr, + C_ptr, + states_ptr, + D_ptr, + initstates_ptr, + 
cu_chunk_seqlens_ptr, + # Matrix dimensions + chunk_size: tl.constexpr, + hdim: tl.constexpr, + dstate: tl.constexpr, + seqlen, + nheads_ngroups_ratio: tl.constexpr, + # Strides + stride_cb_chunk: tl.int64, + stride_cb_head: tl.int64, + stride_cb_csize_m: tl.int64, + stride_cb_csize_k: tl.constexpr, + stride_x_seqlen: tl.int64, + stride_x_head: tl.int64, + stride_x_hdim: tl.constexpr, + stride_z_seqlen: tl.int64, + stride_z_head: tl.int64, + stride_z_hdim: tl.constexpr, + stride_out_seqlen: tl.int64, + stride_out_head: tl.int64, + stride_out_hdim: tl.constexpr, + stride_dt_chunk: tl.int64, + stride_dt_head: tl.int64, + stride_dt_csize: tl.constexpr, + stride_dA_cs_chunk: tl.int64, + stride_dA_cs_head: tl.int64, + stride_dA_cs_csize: tl.constexpr, + stride_seq_idx_chunk: tl.constexpr, + stride_C_seqlen: tl.int64, + stride_C_head: tl.int64, + stride_C_dstate: tl.constexpr, + stride_states_chunk: tl.int64, + stride_states_head: tl.int64, + stride_states_hdim: tl.int64, + stride_states_dstate: tl.constexpr, + stride_init_states_batch: tl.int64, + stride_init_states_head: tl.int64, + stride_init_states_hdim: tl.int64, + stride_init_states_dstate: tl.constexpr, + stride_D_head: tl.constexpr, + # Meta-parameters + IS_CAUSAL: tl.constexpr, + HAS_D: tl.constexpr, + D_HAS_HDIM: tl.constexpr, + HAS_Z: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + BLOCK_SIZE_DSTATE: tl.constexpr, + IS_TRITON_22: tl.constexpr, + HAS_INITSTATES: tl.constexpr, +): + pid_c = tl.program_id(axis=1).to(tl.int64) + pid_h = tl.program_id(axis=2) + num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N) + pid_m = tl.program_id(axis=0) // num_pid_n + pid_n = tl.program_id(axis=0) % num_pid_n + cb_ptr += pid_c * stride_cb_chunk + (pid_h // nheads_ngroups_ratio) * stride_cb_head + chunk_seqlen_start = tl.load(cu_chunk_seqlens_ptr + pid_c) + chunk_seqlen_end = tl.load(cu_chunk_seqlens_ptr + pid_c + 1) + x_ptr += chunk_seqlen_start * stride_x_seqlen + pid_h * 
stride_x_head + dt_ptr += pid_c * stride_dt_chunk + pid_h * stride_dt_head + dA_cumsum_ptr += pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head + C_ptr += ( + chunk_seqlen_start * stride_C_seqlen + + (pid_h // nheads_ngroups_ratio) * stride_C_head + ) + + # M-block offsets and prev states + # - logic in next block may override these if there is an active offset + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + + seq_idx_ptr += pid_c * stride_seq_idx_chunk + seq_idx = tl.load(seq_idx_ptr) + seq_idx_prev = tl.load( + seq_idx_ptr - stride_seq_idx_chunk, mask=pid_c >= 1, other=-1 + ) + + if HAS_INITSTATES and (seq_idx != seq_idx_prev): + prev_states_ptr = ( + initstates_ptr + + seq_idx * stride_init_states_batch + + pid_h * stride_init_states_head + ) + prev_states_hdim = stride_init_states_hdim + prev_states_dstate = stride_init_states_dstate + else: + prev_states_ptr = ( + states_ptr + (pid_c - 1) * stride_states_chunk + pid_h * stride_states_head + ) + prev_states_hdim = stride_states_hdim + prev_states_dstate = stride_states_dstate + + chunk_size_limit = chunk_seqlen_end - chunk_seqlen_start + + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + dA_cs_m = tl.load( + dA_cumsum_ptr + offs_m * stride_dA_cs_csize, mask=offs_m < chunk_size, other=0.0 + ).to(tl.float32) + + acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + offs_out_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_out_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + + # Faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128 + offs_k_dstate = tl.arange( + 0, BLOCK_SIZE_DSTATE if BLOCK_SIZE_DSTATE <= 128 else BLOCK_SIZE_K + ) + C_ptrs = C_ptr + ( + offs_m[:, None] * stride_C_seqlen + offs_k_dstate[None, :] * stride_C_dstate + ) + + scale_m = tl.exp(dA_cs_m) + if BLOCK_SIZE_DSTATE <= 128: + C = tl.load( + C_ptrs, + mask=(offs_m[:, None] < chunk_size_limit) + & (offs_k_dstate[None, :] < dstate), + other=0.0, + ) + + if not 
HAS_INITSTATES and (seq_idx != seq_idx_prev): + # if no init states AND starting a new sequence, we need zeros + prev_states = tl.zeros( + (BLOCK_SIZE_DSTATE, BLOCK_SIZE_N), dtype=C_ptr.dtype.element_ty + ) + else: + # otherwise read the previous state + prev_states_ptrs = ( + prev_states_ptr + + offs_n[None, :] * prev_states_hdim + + offs_k_dstate[:, None] * prev_states_dstate + ) + prev_states = tl.load( + prev_states_ptrs, + mask=(offs_k_dstate[:, None] < dstate) & (offs_n[None, :] < hdim), + other=0.0, + ) + prev_states = prev_states.to(C_ptr.dtype.element_ty) + + acc = tl.dot(C, prev_states) * scale_m[:, None] + + else: + prev_states_ptrs = ( + prev_states_ptr + + offs_n[None, :] * prev_states_hdim + + offs_k_dstate[:, None] * prev_states_dstate + ) + for k in range(0, dstate, BLOCK_SIZE_K): + C = tl.load( + C_ptrs, + mask=(offs_m[:, None] < chunk_size_limit) + & (offs_k_dstate[None, :] < dstate - k), + other=0.0, + ) + if not HAS_INITSTATES and (seq_idx != seq_idx_prev): + prev_states = tl.zeros( + (BLOCK_SIZE_K, BLOCK_SIZE_N), dtype=C_ptr.dtype.element_ty + ) + else: + prev_states = tl.load( + prev_states_ptrs, + mask=(offs_k_dstate[:, None] < dstate - k) + & (offs_n[None, :] < hdim), + other=0.0, + ) + prev_states = prev_states.to(C_ptr.dtype.element_ty) + acc += tl.dot(C, prev_states) + C_ptrs += BLOCK_SIZE_K + prev_states_ptrs += BLOCK_SIZE_K + acc *= scale_m[:, None] + + offs_k = tl.arange(0, BLOCK_SIZE_K) + cb_ptrs = cb_ptr + ( + offs_m[:, None] * stride_cb_csize_m + offs_k[None, :] * stride_cb_csize_k + ) + x_ptrs = x_ptr + ( + offs_k[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim + ) + dt_ptrs = dt_ptr + offs_k * stride_dt_csize + dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize + K_MAX = ( + chunk_size_limit + if not IS_CAUSAL + else min((pid_m + 1) * BLOCK_SIZE_M, chunk_size_limit) + ) + for k in range(0, K_MAX, BLOCK_SIZE_K): + cb = tl.load( + cb_ptrs, + mask=(offs_m[:, None] < chunk_size) & (offs_k[None, :] < chunk_size - 
k), + other=0.0, + ).to(tl.float32) + dA_cs_k = tl.load(dA_cumsum_ptrs, mask=offs_k < chunk_size - k, other=0.0).to( + tl.float32 + ) + # If there's seq_idx, we already set cb[i, j] = 0 for seq_idx[i] != seq_idx[j]. + # So we don't need masking wrt seq_idx here. + cb *= tl.exp(dA_cs_m[:, None] - dA_cs_k[None, :]) + dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size - k, other=0.0).to(tl.float32) + cb *= dt_k + if IS_CAUSAL: + mask = offs_m[:, None] >= k + offs_k[None, :] + cb = tl.where(mask, cb, 0.0) + cb = cb.to(x_ptr.dtype.element_ty) + x = tl.load( + x_ptrs, + mask=(offs_k[:, None] < chunk_size_limit - k) & (offs_n[None, :] < hdim), + other=0.0, + ) + acc += tl.dot(cb, x) + cb_ptrs += BLOCK_SIZE_K * stride_cb_csize_k + x_ptrs += BLOCK_SIZE_K * stride_x_seqlen + dt_ptrs += BLOCK_SIZE_K * stride_dt_csize + dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize + + offs_out_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_out_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + + if HAS_D: + if D_HAS_HDIM: + D = tl.load( + D_ptr + pid_h * stride_D_head + offs_n, mask=offs_n < hdim, other=0.0 + ).to(tl.float32) + else: + D = tl.load(D_ptr + pid_h * stride_D_head).to(tl.float32) + x_residual = tl.load( + x_ptr + + (offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim), + mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim), + other=0.0, + ).to(tl.float32) + acc += x_residual * D + + if HAS_Z: + z_ptr += chunk_seqlen_start * stride_z_seqlen + pid_h * stride_z_head + z_ptrs = z_ptr + ( + stride_z_seqlen * offs_out_m[:, None] + stride_z_hdim * offs_out_n[None, :] + ) + z = tl.load( + z_ptrs, + mask=(offs_out_m[:, None] < chunk_size_limit) + & (offs_out_n[None, :] < hdim), + other=0.0, + ).to(tl.float32) + acc *= z * tl.sigmoid(z) + + out_ptr += chunk_seqlen_start * stride_out_seqlen + pid_h * stride_out_head + out_ptrs = out_ptr + ( + stride_out_seqlen * offs_out_m[:, None] + offs_out_n[None, :] * stride_out_hdim + ) + tl.store( 
+ out_ptrs, + acc, + mask=(offs_out_m[:, None] < chunk_size_limit) & (offs_out_n[None, :] < hdim), + ) + + +def _chunk_scan_fwd( + cb, + x, + dt, + dA_cumsum, + C, + states, + cu_chunk_seqlens, + out, + seq_idx, + D=None, + z=None, + initial_states=None, +): + assert seq_idx is not None, "this implementation requires seq_idx" + + seqlen, nheads, headdim = x.shape + _, nchunks, chunk_size = dt.shape + _, ngroups, dstate = C.shape + assert nheads % ngroups == 0 + assert C.shape == (seqlen, ngroups, dstate) + assert cb.shape == (nchunks, ngroups, chunk_size, chunk_size) + if D is not None: + assert D.shape == (nheads, headdim) or D.shape == (nheads,) + if z is not None: + assert z.shape == x.shape + assert dt.shape == (nheads, nchunks, chunk_size) + assert dA_cumsum.shape == (nheads, nchunks, chunk_size) + assert states.shape == (nchunks, nheads, headdim, dstate) + assert seq_idx.shape == (nchunks,) + + grid = lambda META: ( + triton.cdiv(chunk_size, META["BLOCK_SIZE_M"]) + * triton.cdiv(headdim, META["BLOCK_SIZE_N"]), + nchunks, + nheads, + ) + + z_strides = (z.stride(0), z.stride(1), z.stride(2)) if z is not None else (0, 0, 0) + initial_states_strides = ( + ( + initial_states.stride(0), + initial_states.stride(1), + initial_states.stride(2), + initial_states.stride(3), + ) + if initial_states is not None + else (0, 0, 0, 0) + ) + + _chunk_scan_fwd_kernel[grid]( + cb_ptr=cb, + x_ptr=x, + z_ptr=z, + out_ptr=out, + dt_ptr=dt, + dA_cumsum_ptr=dA_cumsum, + seq_idx_ptr=seq_idx, + C_ptr=C, + states_ptr=states, + D_ptr=D, + initstates_ptr=initial_states, + cu_chunk_seqlens_ptr=cu_chunk_seqlens, + chunk_size=chunk_size, + hdim=headdim, + dstate=dstate, + seqlen=seqlen, + nheads_ngroups_ratio=nheads // ngroups, + stride_cb_chunk=cb.stride(0), + stride_cb_head=cb.stride(1), + stride_cb_csize_m=cb.stride(2), + stride_cb_csize_k=cb.stride(3), + stride_x_seqlen=x.stride(0), + stride_x_head=x.stride(1), + stride_x_hdim=x.stride(2), + stride_z_seqlen=z_strides[0], + 
stride_z_head=z_strides[1], + stride_z_hdim=z_strides[2], + stride_out_seqlen=out.stride(0), + stride_out_head=out.stride(1), + stride_out_hdim=out.stride(2), + stride_dt_chunk=dt.stride(1), + stride_dt_head=dt.stride(0), + stride_dt_csize=dt.stride(2), + stride_dA_cs_chunk=dA_cumsum.stride(1), + stride_dA_cs_head=dA_cumsum.stride(0), + stride_dA_cs_csize=dA_cumsum.stride(2), + stride_seq_idx_chunk=seq_idx.stride(0), + stride_C_seqlen=C.stride(0), + stride_C_head=C.stride(1), + stride_C_dstate=C.stride(2), + stride_states_chunk=states.stride(0), + stride_states_head=states.stride(1), + stride_states_hdim=states.stride(2), + stride_states_dstate=states.stride(3), + stride_init_states_batch=initial_states_strides[0], + stride_init_states_head=initial_states_strides[1], + stride_init_states_hdim=initial_states_strides[2], + stride_init_states_dstate=initial_states_strides[3], + stride_D_head=D.stride(0) if D is not None else 0, + IS_CAUSAL=True, + HAS_D=D is not None, + D_HAS_HDIM=D.dim() == 2 if D is not None else True, + HAS_Z=z is not None, + BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16), + IS_TRITON_22=TRITON_22, + HAS_INITSTATES=initial_states is not None, + ) + return diff --git a/model_executor/layers/mamba/ops/ssd_chunk_state.py b/model_executor/layers/mamba/ops/ssd_chunk_state.py new file mode 100644 index 0000000..11cc125 --- /dev/null +++ b/model_executor/layers/mamba/ops/ssd_chunk_state.py @@ -0,0 +1,700 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright (c) 2024, Tri Dao, Albert Gu. 
+# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_chunk_state.py + +# ruff: noqa: E501 + +import torch + +from vllm.triton_utils import tl, triton + +from .mamba_ssm import softplus + + +@triton.autotune( + configs=[ + triton.Config({"BLOCK_SIZE_H": 2}), + triton.Config({"BLOCK_SIZE_H": 4}), + triton.Config({"BLOCK_SIZE_H": 8}), + triton.Config({"BLOCK_SIZE_H": 16}), + triton.Config({"BLOCK_SIZE_H": 32}), + triton.Config({"BLOCK_SIZE_H": 64}), + ], + key=["chunk_size", "nheads"], +) +@triton.jit +def _chunk_cumsum_fwd_kernel( + # Pointers to matrices + dt_ptr, + A_ptr, + dt_bias_ptr, + dt_out_ptr, + dA_cumsum_ptr, + cu_chunk_seqlens_ptr, + # Matrix dimension + seqlen, + nheads: tl.constexpr, + chunk_size: tl.constexpr, + dt_min: tl.constexpr, + dt_max: tl.constexpr, + # Strides + stride_dt_seqlen: tl.int64, + stride_dt_head: tl.constexpr, + stride_A_head: tl.constexpr, + stride_dt_bias_head: tl.constexpr, + stride_dt_out_head: tl.int64, + stride_dt_out_chunk: tl.int64, + stride_dt_out_csize: tl.constexpr, + stride_dA_cs_head: tl.int64, + stride_dA_cs_chunk: tl.int64, + stride_dA_cs_csize: tl.constexpr, + # Meta-parameters + DT_SOFTPLUS: tl.constexpr, + HAS_DT_BIAS: tl.constexpr, + BLOCK_SIZE_H: tl.constexpr, + BLOCK_SIZE_CHUNK: tl.constexpr, +): + # if dt is long, may cause problems, so use 64 bit + # https://github.com/triton-lang/triton/issues/1058 + pid_c = tl.program_id(axis=0).to(tl.int64) + pid_h = tl.program_id(axis=1) + + chunk_seqlen_start = tl.load(cu_chunk_seqlens_ptr + pid_c) + chunk_seqlen_end = tl.load(cu_chunk_seqlens_ptr + pid_c + 1) + + dt_ptr += chunk_seqlen_start * stride_dt_seqlen + dt_out_ptr += pid_c * stride_dt_out_chunk + dA_cumsum_ptr += pid_c * stride_dA_cs_chunk + + offs_h = pid_h * BLOCK_SIZE_H + tl.arange(0, BLOCK_SIZE_H) + offs_c = tl.arange(0, BLOCK_SIZE_CHUNK) + dt_ptrs = dt_ptr + ( + offs_h[:, None] * stride_dt_head + offs_c[None, :] * stride_dt_seqlen + ) + A_ptrs = A_ptr + offs_h * 
stride_A_head + dt_out_ptrs = dt_out_ptr + ( + offs_h[:, None] * stride_dt_out_head + offs_c[None, :] * stride_dt_out_csize + ) + dA_cs_ptrs = dA_cumsum_ptr + ( + offs_h[:, None] * stride_dA_cs_head + offs_c[None, :] * stride_dA_cs_csize + ) + chunk_size_limit = chunk_seqlen_end - chunk_seqlen_start + + dt = tl.load( + dt_ptrs, + mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), + other=0.0, + ).to(tl.float32) + if HAS_DT_BIAS: + dt_bias = tl.load( + dt_bias_ptr + offs_h * stride_dt_bias_head, mask=offs_h < nheads, other=0.0 + ).to(tl.float32) + dt += dt_bias[:, None] + if DT_SOFTPLUS: + dt = tl.where(dt <= 20.0, softplus(dt), dt) + + dt = tl.clamp(dt, dt_min, dt_max) + dt = tl.where( + (offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), dt, 0.0 + ) + tl.store( + dt_out_ptrs, + dt, + mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size), + ) + A = tl.load(A_ptrs, mask=offs_h < nheads, other=0.0).to(tl.float32) + dA = dt * A[:, None] + dA_cs = tl.cumsum(dA, axis=1) + tl.store( + dA_cs_ptrs, + dA_cs, + mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size), + ) + + +@triton.autotune( + configs=[ + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64}, + num_stages=3, + num_warps=8, + ), + triton.Config( + {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32}, + num_stages=5, + num_warps=2, + ), + 
triton.Config( + {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32}, + num_stages=5, + num_warps=2, + ), + triton.Config( + {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=2, + ), + ], + key=["hdim", "dstate", "chunk_size"], +) +@triton.jit +def _chunk_state_fwd_kernel( + # Pointers to matrices + x_ptr, + b_ptr, + states_ptr, + dt_ptr, + dA_cumsum_ptr, + cu_chunk_seqlens_ptr, + # Matrix dimensions + hdim: tl.constexpr, + dstate: tl.constexpr, + chunk_size: tl.constexpr, + seqlen, + nheads_ngroups_ratio: tl.constexpr, + # Strides + stride_x_seqlen: tl.int64, + stride_x_head: tl.int64, + stride_x_hdim: tl.constexpr, + stride_b_seqlen: tl.int64, + stride_b_head: tl.int64, + stride_b_dstate: tl.constexpr, + stride_states_chunk: tl.int64, + stride_states_head: tl.int64, + stride_states_hdim: tl.int64, + stride_states_dstate: tl.constexpr, + stride_dt_head: tl.int64, + stride_dt_chunk: tl.int64, + stride_dt_csize: tl.constexpr, + stride_dA_cs_head: tl.int64, + stride_dA_cs_chunk: tl.int64, + stride_dA_cs_csize: tl.constexpr, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, +): + pid_c = tl.program_id(axis=1).to(tl.int64) + pid_h = tl.program_id(axis=2) + num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N) + pid_m = tl.program_id(axis=0) // num_pid_n + pid_n = tl.program_id(axis=0) % num_pid_n + chunk_seqlen_start = tl.load(cu_chunk_seqlens_ptr + pid_c) + chunk_seqlen_end = tl.load(cu_chunk_seqlens_ptr + pid_c + 1) + b_ptr += ( + chunk_seqlen_start * stride_b_seqlen + + (pid_h // nheads_ngroups_ratio) * stride_b_head + ) + x_ptr += chunk_seqlen_start * stride_x_seqlen + pid_h * stride_x_head + dt_ptr += pid_c * stride_dt_chunk + pid_h * stride_dt_head + dA_cumsum_ptr += pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head + + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + offs_k = tl.arange(0, 
BLOCK_SIZE_K) + x_ptrs = x_ptr + ( + offs_m[:, None] * stride_x_hdim + offs_k[None, :] * stride_x_seqlen + ) + b_ptrs = b_ptr + ( + offs_n[None, :] * stride_b_dstate + offs_k[:, None] * stride_b_seqlen + ) + dt_ptrs = dt_ptr + offs_k * stride_dt_csize + dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to( + tl.float32 + ) + dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize + + chunk_size_limit = chunk_seqlen_end - chunk_seqlen_start + + acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, chunk_size_limit, BLOCK_SIZE_K): + x = tl.load( + x_ptrs, + mask=(offs_m[:, None] < hdim) & (offs_k[None, :] < chunk_size_limit - k), + other=0.0, + ) + b = tl.load( + b_ptrs, + mask=(offs_k[:, None] < chunk_size_limit - k) & (offs_n[None, :] < dstate), + other=0.0, + ).to(tl.float32) + dA_cs_k = tl.load( + dA_cumsum_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0 + ).to(tl.float32) + dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0).to( + tl.float32 + ) + scale = tl.exp(dA_cs_last - dA_cs_k) * dt_k + b *= scale[:, None] + b = b.to(x_ptr.dtype.element_ty) + acc += tl.dot(x, b) + + x_ptrs += BLOCK_SIZE_K * stride_x_seqlen + b_ptrs += BLOCK_SIZE_K * stride_b_seqlen + dt_ptrs += BLOCK_SIZE_K * stride_dt_csize + dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize + + states = acc.to(states_ptr.dtype.element_ty) + + states_ptr += pid_c * stride_states_chunk + pid_h * stride_states_head + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + states_ptrs = states_ptr + ( + offs_m[:, None] * stride_states_hdim + offs_n[None, :] * stride_states_dstate + ) + c_mask = (offs_m[:, None] < hdim) & (offs_n[None, :] < dstate) + tl.store(states_ptrs, states, mask=c_mask) + + +@triton.autotune( + configs=[ + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64}, + num_stages=3, + num_warps=8, + ), + triton.Config( 
+ {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32}, + num_stages=5, + num_warps=2, + ), + triton.Config( + {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32}, + num_stages=5, + num_warps=2, + ), + triton.Config( + {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32}, + num_stages=4, + num_warps=2, + ), + ], + key=["hdim", "dstate", "chunk_size"], +) +@triton.jit +def _chunk_state_varlen_kernel( + # Pointers to matrices + x_ptr, + b_ptr, + dt_ptr, + dA_cumsum_ptr, + chunk_states_ptr, + cu_seqlens_ptr, + states_ptr, + initstates_ptr, + # Matrix dimensions + hdim: tl.constexpr, + dstate: tl.constexpr, + chunk_size: tl.constexpr, + nheads_ngroups_ratio: tl.constexpr, + # Strides + stride_x_seqlen: tl.int64, + stride_x_head: tl.int64, + stride_x_hdim: tl.constexpr, + stride_b_seqlen: tl.int64, + stride_b_head: tl.int64, + stride_b_dstate: tl.constexpr, + stride_dt_head: tl.int64, + stride_dt_chunk: tl.int64, + stride_dt_csize: tl.constexpr, + stride_dA_cs_head: tl.int64, + stride_dA_cs_chunk: tl.int64, + stride_dA_cs_csize: tl.constexpr, + stride_chunk_states_chunk: tl.int64, + stride_chunk_states_head: tl.int64, + stride_chunk_states_hdim: tl.int64, + stride_chunk_states_dstate: tl.constexpr, + stride_states_batch: tl.int64, + stride_states_head: tl.int64, + stride_states_hdim: tl.int64, + stride_states_dstate: tl.constexpr, + stride_init_states_batch: 
tl.int64, + stride_init_states_head: tl.int64, + stride_init_states_hdim: tl.int64, + stride_init_states_dstate: tl.constexpr, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + HAS_INITSTATES: tl.constexpr, +): + pid_b = tl.program_id(axis=1) + pid_h = tl.program_id(axis=2) + num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N) + pid_m = tl.program_id(axis=0) // num_pid_n + pid_n = tl.program_id(axis=0) % num_pid_n + end_idx = tl.load(cu_seqlens_ptr + pid_b + 1) + pid_c = (end_idx - 1) // chunk_size + b_ptr += ( + pid_c * chunk_size * stride_b_seqlen + + (pid_h // nheads_ngroups_ratio) * stride_b_head + ) + x_ptr += pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head + dt_ptr += pid_c * stride_dt_chunk + pid_h * stride_dt_head + dA_cumsum_ptr += pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head + chunk_states_ptr += ( + pid_c * stride_chunk_states_chunk + pid_h * stride_chunk_states_head + ) + + if HAS_INITSTATES: + # if there are init states provided, we differentiate between states (which + # are boundary conditions at a chunk boundary) and initstates (which are boundary + # conditions when a new example in a cont batch starts) + initstates_ptr += pid_h * stride_init_states_head + + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + offs_k = tl.arange(0, BLOCK_SIZE_K) + x_ptrs = x_ptr + ( + offs_m[:, None] * stride_x_hdim + offs_k[None, :] * stride_x_seqlen + ) + b_ptrs = b_ptr + ( + offs_n[None, :] * stride_b_dstate + offs_k[:, None] * stride_b_seqlen + ) + dt_ptrs = dt_ptr + offs_k * stride_dt_csize + dA_cs_last = tl.load( + dA_cumsum_ptr + (end_idx - pid_c * chunk_size - 1) * stride_dA_cs_csize + ).to(tl.float32) + dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize + + chunk_size_limit = end_idx - pid_c * chunk_size + start_idx = tl.load(cu_seqlens_ptr + pid_b) + start_idx_cur = tl.maximum(start_idx - pid_c * 
chunk_size, 0) + + acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, chunk_size_limit, BLOCK_SIZE_K): + x = tl.load( + x_ptrs, + mask=(offs_m[:, None] < hdim) + & (offs_k[None, :] < chunk_size_limit - k) + & (offs_k[None, :] >= start_idx_cur - k), + other=0.0, + ) + b = tl.load( + b_ptrs, + mask=(offs_k[:, None] < chunk_size_limit - k) + & (offs_n[None, :] < dstate) + & (offs_k[:, None] >= start_idx_cur - k), + other=0.0, + ).to(tl.float32) + dA_cs_k = tl.load( + dA_cumsum_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0 + ).to(tl.float32) + dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0).to( + tl.float32 + ) + scale = tl.where( + (offs_k >= start_idx_cur - k) & (offs_k < chunk_size_limit - k), + tl.exp(dA_cs_last - dA_cs_k) * dt_k, + 0.0, + ) + b *= scale[:, None] + b = b.to(x_ptr.dtype.element_ty) + acc += tl.dot(x, b) + x_ptrs += BLOCK_SIZE_K * stride_x_seqlen + b_ptrs += BLOCK_SIZE_K * stride_b_seqlen + dt_ptrs += BLOCK_SIZE_K * stride_dt_csize + dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize + + # If the sequence starts after the last chunk idx, we don't need to add the contribution from the last chunk + # If HAS_INITSTATES==True need to consider two possibilities + # - if start_idx < pid_c * chunk_size, then we need to take the past_states_ptrs + # - if start_idx >= pid_c * chunk_size, then we need to insert initstates + if ( + (start_idx < pid_c * chunk_size) # first chunk + or (HAS_INITSTATES) + ): + dA_cs_boundary = 0.0 # default + + if not HAS_INITSTATES: + past_states_ptrs = chunk_states_ptr + ( + offs_m[:, None] * stride_chunk_states_hdim + + offs_n[None, :] * stride_chunk_states_dstate + ) + else: + # - this seems repetitive, but it's to help the compiler + if start_idx < pid_c * chunk_size: + past_states_ptrs = chunk_states_ptr + ( + offs_m[:, None] * stride_chunk_states_hdim + + offs_n[None, :] * stride_chunk_states_dstate + ) + else: + past_states_ptrs = initstates_ptr + ( + pid_b * 
stride_init_states_batch + + offs_m[:, None] * stride_init_states_hdim + + offs_n[None, :] * stride_init_states_dstate + ) + + # need to adjust the boundary + if start_idx > pid_c * chunk_size: + dA_cs_boundary = tl.load( + dA_cumsum_ptr + + (start_idx - pid_c * chunk_size - 1) * stride_dA_cs_csize + ).to(tl.float32) + + past_states = tl.load( + past_states_ptrs, + mask=(offs_m[:, None] < hdim) & (offs_n[None, :] < dstate), + other=0.0, + ).to(tl.float32) + + scale = tl.exp(dA_cs_last - dA_cs_boundary) + acc += past_states * scale + + states = acc.to(states_ptr.dtype.element_ty) + + states_ptr += pid_b * stride_states_batch + pid_h * stride_states_head + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + states_ptrs = states_ptr + ( + offs_m[:, None] * stride_states_hdim + offs_n[None, :] * stride_states_dstate + ) + c_mask = (offs_m[:, None] < hdim) & (offs_n[None, :] < dstate) + tl.store(states_ptrs, states, mask=c_mask) + + +def _chunk_cumsum_fwd( + dt, + A, + chunk_size, + cu_chunk_seqlens, + dt_bias=None, + dt_softplus=False, + dt_limit=(0.0, float("inf")), +): + seqlen, nheads = dt.shape + assert A.shape == (nheads,) + if dt_bias is not None: + assert dt_bias.shape == (nheads,) + nchunks = cu_chunk_seqlens.shape[0] - 1 + dt_out = torch.empty( + nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32 + ) + dA_cumsum = torch.empty( + nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32 + ) + grid_chunk_cs = lambda META: (nchunks, triton.cdiv(nheads, META["BLOCK_SIZE_H"])) + with torch.cuda.device(dt.device.index): + _chunk_cumsum_fwd_kernel[grid_chunk_cs]( + dt_ptr=dt, + A_ptr=A, + dt_bias_ptr=dt_bias, + dt_out_ptr=dt_out, + dA_cumsum_ptr=dA_cumsum, + cu_chunk_seqlens_ptr=cu_chunk_seqlens, + seqlen=seqlen, + nheads=nheads, + chunk_size=chunk_size, + dt_min=dt_limit[0], + dt_max=dt_limit[1], + stride_dt_seqlen=dt.stride(0), + stride_dt_head=dt.stride(1), + 
stride_A_head=A.stride(0), + stride_dt_bias_head=dt_bias.stride(0) if dt_bias is not None else 0, + stride_dt_out_head=dt_out.stride(0), + stride_dt_out_chunk=dt_out.stride(1), + stride_dt_out_csize=dt_out.stride(2), + stride_dA_cs_head=dA_cumsum.stride(0), + stride_dA_cs_chunk=dA_cumsum.stride(1), + stride_dA_cs_csize=dA_cumsum.stride(2), + DT_SOFTPLUS=dt_softplus, + HAS_DT_BIAS=dt_bias is not None, + BLOCK_SIZE_CHUNK=triton.next_power_of_2(chunk_size), + ) + return dA_cumsum, dt_out + + +def _chunk_state_fwd( + B, x, dt, dA_cumsum, cu_chunk_seqlens, states=None, states_in_fp32=True +): + seqlen, nheads, headdim = x.shape + _, nchunks, chunk_size = dt.shape + _, ngroups, dstate = B.shape + assert nheads % ngroups == 0 + assert B.shape == (seqlen, ngroups, dstate) + assert dt.shape == (nheads, nchunks, chunk_size) + assert dA_cumsum.shape == dt.shape + + if states is not None: + assert states.shape == (nchunks, nheads, headdim, dstate) + else: + states_dtype = torch.float32 if states_in_fp32 else B.dtype + states = torch.empty( + (nchunks, nheads, headdim, dstate), device=x.device, dtype=states_dtype + ) + + grid = lambda META: ( + triton.cdiv(headdim, META["BLOCK_SIZE_M"]) + * triton.cdiv(dstate, META["BLOCK_SIZE_N"]), + nchunks, + nheads, + ) + with torch.cuda.device(x.device.index): + _chunk_state_fwd_kernel[grid]( + x_ptr=x, + b_ptr=B, + states_ptr=states, + dt_ptr=dt, + dA_cumsum_ptr=dA_cumsum, + cu_chunk_seqlens_ptr=cu_chunk_seqlens, + hdim=headdim, + dstate=dstate, + chunk_size=chunk_size, + seqlen=seqlen, + nheads_ngroups_ratio=nheads // ngroups, + stride_x_seqlen=x.stride(0), + stride_x_head=x.stride(1), + stride_x_hdim=x.stride(2), + stride_b_seqlen=B.stride(0), + stride_b_head=B.stride(1), + stride_b_dstate=B.stride(2), + stride_states_chunk=states.stride(0), + stride_states_head=states.stride(1), + stride_states_hdim=states.stride(2), + stride_states_dstate=states.stride(3), + stride_dt_head=dt.stride(0), + stride_dt_chunk=dt.stride(1), + 
def chunk_state_varlen(
    B, x, dt, dA_cumsum, cu_seqlens, chunk_states, initial_states=None
):
    """Launch ``_chunk_state_varlen_kernel`` and return a per-sequence state tensor.

    Args:
        B: tensor of shape ``(total_seqlen, ngroups, dstate)``.
        x: tensor of shape ``(total_seqlen, nheads, headdim)``.
        dt: tensor of shape ``(nheads, nchunks, chunk_size)``.
        dA_cumsum: tensor with the same shape as ``dt``.
        cu_seqlens: 1-D tensor with ``batch + 1`` entries; made contiguous
            before the launch.
        chunk_states: tensor of shape ``(nchunks, nheads, headdim, dstate)``.
        initial_states: optional tensor of shape
            ``(batch, nheads, headdim, dstate)``.

    Returns:
        Tensor of shape ``(batch, nheads, headdim, dstate)`` with the dtype and
        device of ``chunk_states``, written by the kernel.
    """
    total_seqlen, nheads, headdim = x.shape
    _, nchunks, chunk_size = dt.shape
    _, ngroups, dstate = B.shape
    batch = cu_seqlens.shape[0] - 1
    cu_seqlens = cu_seqlens.contiguous()

    state_shape = (nheads, headdim, dstate)
    assert nheads % ngroups == 0
    assert B.shape == (total_seqlen, ngroups, dstate)
    assert dt.shape == (nheads, nchunks, chunk_size)
    assert dA_cumsum.shape == dt.shape
    assert chunk_states.shape == (nchunks, *state_shape)

    has_init = initial_states is not None
    if has_init:
        assert initial_states.shape == (batch, *state_shape)

    states = torch.empty(
        (batch, *state_shape),
        dtype=chunk_states.dtype,
        device=chunk_states.device,
    )

    # The kernel always takes four init-state strides; pass zeros when unused.
    if has_init:
        init_strides = tuple(initial_states.stride(d) for d in range(4))
    else:
        init_strides = (0, 0, 0, 0)

    def launch_grid(meta):
        tiles = triton.cdiv(headdim, meta["BLOCK_SIZE_M"]) * triton.cdiv(
            dstate, meta["BLOCK_SIZE_N"]
        )
        return (tiles, batch, nheads)

    with torch.cuda.device(x.device.index):
        _chunk_state_varlen_kernel[launch_grid](
            x_ptr=x,
            b_ptr=B,
            dt_ptr=dt,
            dA_cumsum_ptr=dA_cumsum,
            chunk_states_ptr=chunk_states,
            cu_seqlens_ptr=cu_seqlens,
            states_ptr=states,
            initstates_ptr=initial_states,
            hdim=headdim,
            dstate=dstate,
            chunk_size=chunk_size,
            nheads_ngroups_ratio=nheads // ngroups,
            stride_x_seqlen=x.stride(0),
            stride_x_head=x.stride(1),
            stride_x_hdim=x.stride(2),
            stride_b_seqlen=B.stride(0),
            stride_b_head=B.stride(1),
            stride_b_dstate=B.stride(2),
            stride_dt_head=dt.stride(0),
            stride_dt_chunk=dt.stride(1),
            stride_dt_csize=dt.stride(2),
            stride_dA_cs_head=dA_cumsum.stride(0),
            stride_dA_cs_chunk=dA_cumsum.stride(1),
            stride_dA_cs_csize=dA_cumsum.stride(2),
            stride_chunk_states_chunk=chunk_states.stride(0),
            stride_chunk_states_head=chunk_states.stride(1),
            stride_chunk_states_hdim=chunk_states.stride(2),
            stride_chunk_states_dstate=chunk_states.stride(3),
            stride_states_batch=states.stride(0),
            stride_states_head=states.stride(1),
            stride_states_hdim=states.stride(2),
            stride_states_dstate=states.stride(3),
            stride_init_states_batch=init_strides[0],
            stride_init_states_head=init_strides[1],
            stride_init_states_hdim=init_strides[2],
            stride_init_states_dstate=init_strides[3],
            HAS_INITSTATES=has_init,
        )
    return states
def is_int_pow_2(n):
    """Return True iff *n* is an ``int`` that is a positive power of two."""
    if not isinstance(n, int):
        return False
    if n <= 0:
        return False
    # A power of two has exactly one bit set, so clearing the lowest set bit
    # (n & (n - 1)) leaves zero.
    return n & (n - 1) == 0
is not None: + assert initial_states.shape == (len(cu_seqlens) - 1, nheads, headdim, dstate) + + # This function executes 5 sub-functions for computing mamba + # - a good resource is the blog https://goombalab.github.io/blog/2024/mamba2-part3-algorithm/ + # which has a minimal implementation to understand the below operations + # - as explained by the blog, mamba is a special case of causal attention + # - the idea is to chunk the attention matrix and compute each + # submatrix separately using different optimizations. + # - see the blog and paper for a visualization of the submatrices + # which we refer to in the comments below + + # 1. Compute chunked cumsum of A * dt + # - here dt may go through a softplus activation + dA_cumsum, dt = _chunk_cumsum_fwd( + dt, + A, + chunk_size, + cu_chunk_seqlens, + dt_bias=dt_bias, + dt_softplus=dt_softplus, + dt_limit=dt_limit, + ) + + # 2. Compute the state for each intra-chunk + # (right term of low-rank factorization of off-diagonal blocks; B terms) + states = _chunk_state_fwd( + B, x, dt, dA_cumsum, cu_chunk_seqlens, states_in_fp32=True + ) + + # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries + # (middle term of factorization of off-diag blocks; A terms) + # - for handling chunked prefill, this requires i) initial_states and + # ii) seq_idx to be all specified. + # - When a new seq_idx is detected, we will stop passing the prev_state + # and switch accordingly to the init_state corresponding to the new seq_idx. + states = _state_passing_fwd( + rearrange(states, "... p n -> ... (p n)"), + dA_cumsum, # (nheads, nchunks, chunk_size) + cu_chunk_seqlens, + initial_states=rearrange(initial_states, "... p n -> ... (p n)") + if initial_states is not None + else None, # (batch, nheads, headdim*dstate) + seq_idx=seq_idx, + out_dtype=state_dtype if state_dtype is not None else C.dtype, + ) + states = rearrange(states, "... (p n) -> ... p n", n=dstate) + + # 4. 
def mamba_chunk_scan_combined_varlen(
    x,
    dt,
    A,
    B,
    C,
    chunk_size,
    cu_seqlens,
    cu_chunk_seqlens,
    last_chunk_indices,
    seq_idx,
    out,
    D=None,
    z=None,
    dt_bias=None,
    initial_states=None,
    dt_softplus=False,
    dt_limit=(0.0, float("inf")),
    return_intermediate_states=False,
    state_dtype=None,
):
    """
    Public varlen entry point: validates the varlen-only arguments and forwards
    everything to `_mamba_chunk_scan_combined_fwd`.

    Argument:
        x: (seqlen, nheads, headdim)
        dt: (seqlen, nheads)
        A: (nheads)
        B: (seqlen, ngroups, dstate)
        C: (seqlen, ngroups, dstate)
        chunk_size: int; the forward helper asserts it is a power of 2
        cu_seqlens: (batch + 1,); required
        cu_chunk_seqlens: (nchunks + 1,)
        last_chunk_indices: (batch,); used to index the per-chunk states when
            return_intermediate_states is False
        seq_idx: (nchunks,); required
        out: (seqlen, nheads, headdim) preallocated output tensor, updated
            in place
        D: (nheads, headdim) or (nheads,)
        z: (seqlen, nheads, headdim)
        dt_bias: (nheads,)
        initial_states: (batch, nheads, headdim, dstate)
        dt_softplus: Whether to apply softplus to dt
        dt_limit: (min, max) pair forwarded to the chunk-cumsum kernel as
            dt_min / dt_max (presumably clamping bounds for dt; confirm
            against the kernel)
        return_intermediate_states: if True, return the states of every chunk
            instead of only those selected by last_chunk_indices
        state_dtype: The data type of the ssm state; when None the forward
            helper uses C.dtype
    Return:
        varlen_states: (batch, nheads, headdim, dstate), or
            (nchunks, nheads, headdim, dstate) when
            return_intermediate_states is True
    """

    assert cu_seqlens is not None, "cu_seqlens must be provided assuming varlen input"
    assert seq_idx is not None

    varlen_states = _mamba_chunk_scan_combined_fwd(
        x,
        dt,
        A,
        B,
        C,
        chunk_size,
        out,
        D=D,
        z=z,
        dt_bias=dt_bias,
        initial_states=initial_states,
        return_intermediate_states=return_intermediate_states,
        seq_idx=seq_idx,
        cu_seqlens=cu_seqlens,
        cu_chunk_seqlens=cu_chunk_seqlens,
        last_chunk_indices=last_chunk_indices,
        dt_softplus=dt_softplus,
        dt_limit=dt_limit,
        state_dtype=state_dtype,
    )

    return varlen_states
+ stride_initstates_batch: tl.int64, + stride_initstates_head: tl.int64, + stride_initstates_dim: tl.constexpr, + stride_seq_idx_chunk: tl.constexpr, + # Meta-parameters + HAS_INITSTATES: tl.constexpr, + BLOCK_SIZE: tl.constexpr, +): + pid_h = tl.program_id(axis=1) + pid_m = tl.program_id(axis=0) + + states_ptr += pid_h * stride_states_head + dA_cs_ptr += pid_h * stride_dA_cs_head + (chunk_size - 1) * stride_dA_cs_csize + out_ptr += pid_h * stride_out_head + + offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + states_ptrs = states_ptr + offs_m * stride_states_dim + out_ptrs = out_ptr + offs_m * stride_out_dim + + if HAS_INITSTATES: + initstates_ptrs = ( + initstates_ptr + + pid_h * stride_initstates_head + + offs_m * stride_initstates_dim + ) + + states = tl.load(initstates_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32) + else: + states = tl.zeros((BLOCK_SIZE,), dtype=tl.float32) + + prev_seq_idx = 0 + for c in range(nchunks): + new_states = tl.load(states_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32) + dA_cs = tl.load(dA_cs_ptr).to(tl.float32) + seq_idx = tl.load(seq_idx_ptr + c * stride_seq_idx_chunk) + # we have started a new sequence + if prev_seq_idx != seq_idx: + if HAS_INITSTATES: + initstates_ptrs = ( + initstates_ptr + + seq_idx * stride_initstates_batch + + pid_h * stride_initstates_head + + offs_m * stride_initstates_dim + ) + states = tl.load(initstates_ptrs, mask=offs_m < dim, other=0.0).to( + tl.float32 + ) + else: + states = tl.zeros((BLOCK_SIZE,), dtype=tl.float32) + + prev_seq_idx = seq_idx + states = tl.exp(dA_cs) * states + new_states + tl.store(out_ptrs, states, mask=offs_m < dim) + + states_ptrs += stride_states_chunk + dA_cs_ptr += stride_dA_cs_chunk + out_ptrs += stride_out_chunk + + +def _state_passing_fwd( + states, + dA_cumsum, + cu_chunk_seqlens, + seq_idx, + initial_states=None, + out_dtype=None, +): + nchunks, nheads, dim = states.shape + chunk_size = dA_cumsum.shape[-1] + assert dA_cumsum.shape == (nheads, nchunks, 
chunk_size) + seqlen = seq_idx.shape[-1] + out_dtype = states.dtype if out_dtype is None else out_dtype + out = torch.empty((nchunks, nheads, dim), device=states.device, dtype=out_dtype) + + initial_states_strides = ( + (initial_states.stride(0), initial_states.stride(1), initial_states.stride(2)) + if initial_states is not None + else (0, 0, 0) + ) + + grid = lambda META: (triton.cdiv(dim, META["BLOCK_SIZE"]), nheads) + with torch.cuda.device(states.device.index): + _state_passing_fwd_kernel[grid]( + states_ptr=states, + out_ptr=out, + dA_cs_ptr=dA_cumsum, + initstates_ptr=initial_states, + seq_idx_ptr=seq_idx, + cu_chunk_seqlens_ptr=cu_chunk_seqlens, + dim=dim, + nchunks=nchunks, + seqlen=seqlen if seq_idx is not None else 0, + chunk_size=chunk_size if seq_idx is not None else 0, + stride_states_chunk=states.stride(0), + stride_states_head=states.stride(1), + stride_states_dim=states.stride(2), + stride_out_chunk=out.stride(0), + stride_out_head=out.stride(1), + stride_out_dim=out.stride(2), + stride_dA_cs_head=dA_cumsum.stride(0), + stride_dA_cs_chunk=dA_cumsum.stride(1), + stride_dA_cs_csize=dA_cumsum.stride(2), + stride_initstates_batch=initial_states_strides[0], + stride_initstates_head=initial_states_strides[1], + stride_initstates_dim=initial_states_strides[2], + stride_seq_idx_chunk=seq_idx.stride(0), + HAS_INITSTATES=initial_states is not None, + ) + return out diff --git a/model_executor/layers/mamba/short_conv.py b/model_executor/layers/mamba/short_conv.py new file mode 100644 index 0000000..04efa8a --- /dev/null +++ b/model_executor/layers/mamba/short_conv.py @@ -0,0 +1,264 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend + +import torch + +from vllm.attention.backends.abstract import AttentionMetadata +from vllm.config import CacheConfig, ModelConfig, 
@CustomOp.register("short_conv")
class ShortConv(MambaBase, CustomOp):
    """Gated short-convolution layer with a cached convolution state.

    The layer projects the input into three equal chunks ``B``, ``C`` and
    ``x``, runs a causal 1-D convolution over ``B * x`` (prefill and decode
    tokens use different kernels), gates the result with ``C`` and applies an
    output projection. ``forward`` dispatches through the
    ``torch.ops.vllm.short_conv`` custom op, which looks the layer up by
    ``prefix`` and calls ``forward_cuda``.
    """

    def __init__(
        self,
        config,
        dim: int,
        layer_idx: int,
        model_config: ModelConfig | None = None,
        cache_config: CacheConfig | None = None,
        prefix: str = "",
    ):
        """Build the projections and register the layer under ``prefix``.

        Args:
            config: object providing ``conv_L_cache`` and ``conv_bias``.
            dim: input/output width of every projection.
            layer_idx: stored on the instance; not otherwise used here.
            model_config: needed later by ``get_state_dtype``.
            cache_config: needed later by ``get_state_dtype``.
            prefix: unique layer name; also the key into the forward context.

        Raises:
            ValueError: if ``prefix`` is already registered in the
                compilation config's static forward context.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.conv_dim = dim
        self.L_cache = config.conv_L_cache
        self.bias = config.conv_bias

        self.conv = ColumnParallelLinear(
            input_size=self.L_cache,
            output_size=dim,
            bias=self.bias,
            prefix=f"{prefix}.conv1d",
        )
        # unsqueeze to fit conv1d weights shape into the linear weights shape.
        # Can't do this in `weight_loader` since it already exists in
        # `ColumnParallelLinear` and `set_weight_attrs`
        # doesn't allow to override it
        self.conv.weight.data = self.conv.weight.data.unsqueeze(1)

        self.in_proj = MergedColumnParallelLinear(
            input_size=dim,
            output_sizes=[dim] * 3,
            bias=self.bias,
            prefix=f"{prefix}.in_proj",
        )
        self.out_proj = RowParallelLinear(
            input_size=dim,
            output_size=dim,
            bias=self.bias,
            prefix=f"{prefix}.out_proj",
        )

        # Register this layer by name so the custom op can find it at runtime;
        # names must be unique.
        compilation_config = get_current_vllm_config().compilation_config
        if prefix in compilation_config.static_forward_context:
            raise ValueError(f"Duplicate layer name: {prefix}")
        compilation_config.static_forward_context[prefix] = self
        # Placeholder; forward_cuda indexes this by virtual engine, so the real
        # cache is presumably bound by the runner before use (TODO confirm).
        self.kv_cache = (torch.tensor([]),)

        self.model_config = model_config
        self.cache_config = cache_config
        self.prefix = prefix

    def forward_native(
        self,
        hidden_states: torch.Tensor,
        output: torch.Tensor,
    ):
        """No-op: there is no native implementation; ``output`` is untouched."""
        return

    def forward(
        self,
        hidden_states: torch.Tensor,
        output: torch.Tensor,
    ):
        """Run the layer through the registered custom op, writing into ``output``."""
        torch.ops.vllm.short_conv(
            hidden_states,
            output,
            self.prefix,
        )

    def forward_cuda(
        self,
        hidden_states: torch.Tensor,
        output: torch.Tensor,
    ):
        """Compute the gated short convolution and write it into ``output``.

        Decode tokens are expected first in the batch, followed by prefill
        tokens. Only the first ``num_decodes + num_prefill_tokens`` rows of
        ``output`` are written.

        NOTE(review): when no attention metadata is available (profile run)
        the result is *returned* instead of written into ``output``; the
        ``short_conv`` op wrapper ignores return values. Confirm this is
        intended.
        """
        forward_context = get_forward_context()
        # ShortConvAttentionMetadata contains metadata necessary for the
        # short_conv triton kernels to operate in continuous batching and in
        # chunked prefill modes; they are computed at top-level model forward
        # since they stay the same and reused for all mamba layers in the same
        # iteration.
        attn_metadata: AttentionMetadata = forward_context.attn_metadata
        if attn_metadata is not None:
            # The context holds one metadata object per layer name.
            assert isinstance(attn_metadata, dict)
            attn_metadata = attn_metadata[self.prefix]
            assert isinstance(attn_metadata, ShortConvAttentionMetadata)
            self_kv_cache = self.kv_cache[forward_context.virtual_engine]
            # The kernels take the cached state with its last two dims swapped.
            conv_state = self_kv_cache[0].transpose(-1, -2)
            state_indices_tensor = attn_metadata.state_indices_tensor
            has_initial_states_p = attn_metadata.has_initial_states_p

        BCx, _ = self.in_proj(hidden_states)

        # in_proj emits three equally sized chunks along the last dim.
        B, C, x = BCx.chunk(3, dim=-1)

        # Drop the singleton dim added by the unsqueeze(1) in __init__.
        conv_weights = self.conv.weight.view(
            self.conv.weight.size(0), self.conv.weight.size(2)
        )

        if attn_metadata is None:
            # V1 profile run
            Bx = (B * x).contiguous()
            hidden_states = C * Bx
            contextualized_states, _ = self.out_proj(hidden_states)
            return contextualized_states

        num_prefills = attn_metadata.num_prefills  # request count
        num_decodes = attn_metadata.num_decode_tokens  # token count (=request)
        num_prefill_tokens = attn_metadata.num_prefill_tokens  # token count
        has_prefill = num_prefills > 0
        has_decode = num_decodes > 0
        num_actual_tokens = num_decodes + num_prefill_tokens

        # NOTE: V1 puts decode before prefill
        # Separate prefill and decode by splitting varlen input
        # Split along token dimension
        B_d, B_p = torch.split(
            B[:num_actual_tokens],
            [num_decodes, num_prefill_tokens],
            dim=0,
        )
        C_d, C_p = torch.split(
            C[:num_actual_tokens],
            [num_decodes, num_prefill_tokens],
            dim=0,
        )
        x_d, x_p = torch.split(
            x[:num_actual_tokens],
            [num_decodes, num_prefill_tokens],
            dim=0,
        )
        # Split along batch dimension
        state_indices_tensor_d, state_indices_tensor_p = torch.split(
            state_indices_tensor,
            [num_decodes, num_prefills],
            dim=0,
        )
        # Keep the last num_prefills + 1 offsets and rebase them so the first
        # prefill token is at offset 0 (each decode request is one token).
        query_start_loc_p = (
            attn_metadata.query_start_loc[-num_prefills - 1 :] - num_decodes
            if has_prefill
            else None
        )

        conv_output_list = []

        if has_prefill:
            # causal_conv1d_fn takes the token dimension last, hence the
            # transposes around the call.
            Bx_p = (B_p * x_p).transpose(0, 1)
            Bx = causal_conv1d_fn(
                Bx_p,
                conv_weights,
                self.conv.bias,
                activation=None,
                conv_states=conv_state,
                has_initial_state=has_initial_states_p,
                cache_indices=state_indices_tensor_p,
                metadata=attn_metadata,
                query_start_loc=query_start_loc_p,
            ).transpose(0, 1)[:num_prefill_tokens]

            y = C_p * Bx
            conv_output_list.append(y)

        if has_decode:
            Bx_d = (B_d * x_d).contiguous()
            Bx = causal_conv1d_update(
                Bx_d,
                conv_state,
                conv_weights,
                self.conv.bias,
                activation=None,
                conv_state_indices=state_indices_tensor_d,
            )
            y = C_d * Bx
            # Decode rows go first, restoring the original token order.
            conv_output_list.insert(0, y)

        # Merge prefill and decode outputs before passing to gated MLP
        hidden_states = torch.vstack(conv_output_list)

        # Final linear projection
        output[:num_actual_tokens], _ = self.out_proj(hidden_states)

    def get_state_dtype(self) -> tuple[torch.dtype, ...]:
        """Return the cache state dtypes; requires model and cache configs."""
        assert self.model_config is not None
        assert self.cache_config is not None
        return MambaStateDtypeCalculator.short_conv_state_dtype(
            self.model_config.dtype,
            self.cache_config.mamba_cache_dtype,
        )

    def get_state_shape(self) -> tuple[tuple[int, ...]]:
        """Return the cache state shapes for the current tensor-parallel size."""
        return MambaStateShapeCalculator.short_conv_state_shape(
            tp_world_size=get_tensor_model_parallel_world_size(),
            intermediate_size=self.conv_dim,
            conv_kernel=self.L_cache,
        )

    @property
    def mamba_type(self) -> str:
        """Identifier string for this layer type."""
        return "short_conv"

    def get_attn_backend(self) -> type["AttentionBackend"]:
        """Return the attention backend class paired with this layer."""
        # Imported lazily inside the method rather than at module scope.
        from vllm.v1.attention.backends.short_conv_attn import ShortConvAttentionBackend

        return ShortConvAttentionBackend
@CustomOp.register("multi_head_latent_attention")
class MultiHeadLatentAttentionWrapper(CustomOp):
    """MLA layer registered as CustomOp to allow OOT backends to add
    custom implementations of the outer MLA layer (including rope & o_proj).
    Note that currently MLA ignores the enable/disable mechanism of CustomOp
    because there is only one in-tree implementation in forward_native.
    TODO: implement this with a new PluggableLayer mechanism.

    This class takes positions and hidden_states as input.
    The input tensors can either contain prefill tokens or decode tokens.
    The class does the following:

    1. MLA Preprocess.
    2. Perform multi-head attention to prefill tokens and
       multi-query attention to decode tokens separately.
    3. Return the output tensor.
    """

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        scale: float,
        qk_nope_head_dim: int,
        qk_rope_head_dim: int,
        v_head_dim: int,
        q_lora_rank: int | None,
        kv_lora_rank: int,
        mla_modules: MLAModules,
        cache_config: CacheConfig | None = None,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ) -> None:
        """Store the MLA sub-modules and build the inner ``MLAAttention``.

        Args:
            hidden_size: stored on the instance.
            num_heads: number of attention heads used to reshape queries.
            scale: attention scale forwarded to ``MLAAttention``.
            qk_nope_head_dim: non-rotary part of the query/key head dim.
            qk_rope_head_dim: rotary part of the query/key head dim.
            v_head_dim: value head dim.
            q_lora_rank: rank of the low-rank query projection, or ``None``
                to use the direct ``q_proj`` path.
            kv_lora_rank: width of the latent KV slice.
            mla_modules: container with the projection / norm / rotary
                modules and the optional sparse indexer.
            cache_config: forwarded to ``MLAAttention``.
            quant_config: forwarded to ``MLAAttention``.
            prefix: layer name; the attention layer is named
                ``f"{prefix}.attn"``.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.qk_nope_head_dim = qk_nope_head_dim
        self.qk_rope_head_dim = qk_rope_head_dim
        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.q_lora_rank = q_lora_rank
        self.kv_lora_rank = kv_lora_rank
        self.num_heads = num_heads
        self.q_a_proj = mla_modules.q_a_proj
        self.kv_a_proj_with_mqa = mla_modules.kv_a_proj_with_mqa
        self.q_a_layernorm = mla_modules.q_a_layernorm
        self.q_b_proj = mla_modules.q_b_proj
        self.q_proj = mla_modules.q_proj
        self.kv_a_layernorm = mla_modules.kv_a_layernorm
        self.kv_b_proj = mla_modules.kv_b_proj
        self.rotary_emb = mla_modules.rotary_emb
        self.o_proj = mla_modules.o_proj
        self.indexer = mla_modules.indexer
        self.is_sparse = mla_modules.is_sparse

        if self.indexer is not None:
            assert hasattr(self.indexer, "topk_tokens")
            self.topk_tokens = self.indexer.topk_tokens
            self.topk_indices_buffer = mla_modules.topk_indices_buffer

        self.mla_attn = MLAAttention(
            num_heads=self.num_heads,
            scale=scale,
            qk_nope_head_dim=self.qk_nope_head_dim,
            qk_rope_head_dim=self.qk_rope_head_dim,
            v_head_dim=self.v_head_dim,
            q_lora_rank=self.q_lora_rank,
            kv_lora_rank=self.kv_lora_rank,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.attn",
            kv_b_proj=self.kv_b_proj,
            use_sparse=self.is_sparse,
            indexer=self.indexer,
            rotary_emb=self.rotary_emb,
        )

        self.prefix = prefix

    def forward_native(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """Run MLA with separate query and KV down-projections.

        Args:
            positions: token positions, handed to the attention impl via
                ``forward_prepare``.
            hidden_states: input activations; projections are split along
                dim 1.

        Returns:
            The first element of ``o_proj``'s output.
        """
        if self.q_lora_rank is not None:
            # Low-rank query path: down-project, normalise, up-project.
            q = self.q_a_proj(hidden_states)[0]
            kv_a, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split(
                [self.kv_lora_rank, self.qk_rope_head_dim], dim=1
            )
            q = self.q_a_layernorm(q)
            q = self.q_b_proj(q)[0].view(-1, self.num_heads, self.qk_head_dim)
            kv_a = self.kv_a_layernorm(kv_a)
        else:
            q = self.q_proj(hidden_states)[0].view(-1, self.num_heads, self.qk_head_dim)
            latent_kpe = self.kv_a_proj_with_mqa(hidden_states)[0]
            kv_a, k_pe = latent_kpe.split(
                [self.kv_lora_rank, self.qk_rope_head_dim], dim=1
            )
            kv_a = self.kv_a_layernorm(kv_a)

        # NOTE attention data do not have position, pass it here
        self.mla_attn.impl.forward_prepare(positions)
        attn_out = self.mla_attn(q, kv_a, k_pe)
        return self.o_proj(attn_out)[0]

    def forward_cuda(self, *args, **kwargs):
        """CUDA path; identical to ``forward_native``."""
        return self.forward_native(*args, **kwargs)

    def forward_opt(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """Run MLA using a fused ``q_a_proj`` and the optional sparse indexer.

        On the low-rank path ``q_a_proj`` output is split into
        ``[q | kv_a | k_pe | padding]``, so it is expected to be a fused
        projection exposing ``output_padding_size``
        (NOTE(review): set up outside this class; confirm against the model).

        Args:
            positions: token positions, handed to the attention impl via
                ``forward_prepare`` and to the indexer.
            hidden_states: input activations; projections are split along
                dim 1.

        Returns:
            The first element of ``o_proj``'s output.
        """
        # Must be bound on both branches: the indexer call below reads it even
        # when the low-rank query path is disabled (previously this raised
        # UnboundLocalError).
        q_c = None

        if self.q_lora_rank is not None:
            q_latent_kpe = self.q_a_proj(hidden_states)[0]
            q, kv_a, k_pe, _ = q_latent_kpe.split(
                [
                    self.q_lora_rank,
                    self.kv_lora_rank,
                    self.qk_rope_head_dim,
                    self.q_a_proj.output_padding_size,
                ],
                dim=1,
            )
            q_c = self.q_a_layernorm(q)
            q = self.q_b_proj(q_c)[0].view(-1, self.num_heads, self.qk_head_dim)
            kv_a = self.kv_a_layernorm(kv_a)
        else:
            q = self.q_proj(hidden_states)[0].view(-1, self.num_heads, self.qk_head_dim)
            latent_kpe = self.kv_a_proj_with_mqa(hidden_states)[0]
            kv_a, k_pe = latent_kpe.split(
                [self.kv_lora_rank, self.qk_rope_head_dim], dim=1
            )
            kv_a = self.kv_a_layernorm(kv_a)

        if self.indexer is not None and self.is_sparse:
            # The return value is unused here; the call is kept for whatever
            # state the indexer updates internally.
            _topk_indices = self.indexer(
                hidden_states, q_c, positions, self.rotary_emb
            )

        # NOTE attention data do not have position, pass it here
        self.mla_attn.impl.forward_prepare(positions)
        attn_out = self.mla_attn(q, kv_a, k_pe)
        return self.o_proj(attn_out)[0]
class PoolingType(IntEnum):
    """Enumeration for different types of pooling methods.

    Members are looked up by *name* from the pooler config
    (``PoolingType[pooler_config.pooling_type]``), so the member names are part
    of the configuration contract.
    """

    # Hidden state at each request's last token (handled by LastPool).
    LAST = 0
    # Every token's hidden state, split per request (handled by AllPool).
    ALL = 1
    # Hidden state at each request's first token (handled by CLSPool).
    CLS = 2
    # Not handled by PoolingMethod.from_pooling_type, which raises
    # NotImplementedError for it.
    STEP = 3
    # Mean of each request's token hidden states (handled by MeanPool).
    MEAN = 4
def get_cross_encoder_activation_function(config: PretrainedConfig):
    """Pick the pooler activation for a cross-encoder from its HF config.

    The activation's qualified name is read from
    ``config.sentence_transformers["activation_fn"]`` if present, otherwise
    from ``config.sbert_ce_default_activation_function``. When neither yields
    a name, ``PoolerClassify()`` is returned.

    Args:
        config: model configuration, possibly carrying one of the attributes
            above.

    Returns:
        A pooler activation module.

    Raises:
        ValueError: if the configured name is not under ``torch.nn.modules.``.
    """
    function_name: str | None = None
    if (
        hasattr(config, "sentence_transformers")
        and "activation_fn" in config.sentence_transformers
    ):
        function_name = config.sentence_transformers["activation_fn"]
    elif (
        hasattr(config, "sbert_ce_default_activation_function")
        and config.sbert_ce_default_activation_function is not None
    ):
        function_name = config.sbert_ce_default_activation_function

    if function_name is None:
        return PoolerClassify()

    # The name comes from a model config file and is resolved and instantiated
    # dynamically, so the allow-list must be a real check: an `assert` would
    # disappear under `python -O`.
    if not function_name.startswith("torch.nn.modules."):
        raise ValueError(
            "Loading of activation functions is restricted to "
            "torch.nn.modules for security reasons"
        )

    fn = resolve_obj_by_qualname(function_name)()
    return PoolerActivation.wraps(fn)
self, + hidden_states: torch.Tensor, + pooling_cursor: PoolingCursor, + ) -> list[torch.Tensor] | torch.Tensor: + return hidden_states[pooling_cursor.last_token_indices_gpu] + + +class AllPool(PoolingMethod): + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"token_embed", "token_classify"} + + def forward_all( + self, + hidden_states: torch.Tensor, + pooling_cursor: PoolingCursor, + ) -> list[torch.Tensor] | torch.Tensor: + assert not pooling_cursor.is_partial_prefill(), ( + "partial prefill not supported with ALL pooling" + ) + + hidden_states_lst = list( + hidden_states.split(pooling_cursor.num_scheduled_tokens_cpu.tolist()) + ) + return [hidden_states_lst[i] for i in pooling_cursor.index] + + +class MeanPool(PoolingMethod): + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"token_embed", "token_classify", "embed", "classify", "score"} + + def forward_all( + self, + hidden_states: torch.Tensor, + pooling_cursor: PoolingCursor, + ) -> list[torch.Tensor] | torch.Tensor: + assert not pooling_cursor.is_partial_prefill(), ( + "partial prefill not supported with MEAN pooling" + ) + + prompt_lens = pooling_cursor.prompt_lens_cpu.to( + hidden_states.device, non_blocking=True + ) + + # Use float32 for torch.cumsum in MeanPool, + # otherwise precision will be lost significantly. 
+ cumsum = torch.cumsum(hidden_states, dim=0, dtype=torch.float32) + + start_indices = pooling_cursor.first_token_indices_gpu + end_indices = pooling_cursor.last_token_indices_gpu + return ( + cumsum[end_indices] - cumsum[start_indices] + hidden_states[start_indices] + ) / prompt_lens.unsqueeze(1) + + +_T = TypeVar("_T", torch.Tensor, list[torch.Tensor]) + + +class BasePoolerActivation(nn.Module, ABC): + @abstractmethod + def forward(self, pooled_data: _T) -> _T: + # shape: + # classify (& score) -> (batch_size, num_classes) + # embed -> (batch_size, embedding_dim) or list(embedding_dim) + # (batch_size, dimensions) or list(dimensions) if using MRL + raise NotImplementedError + + +class PoolerActivation(BasePoolerActivation): + @staticmethod + def wraps(module: nn.Module): + if isinstance(module, nn.Identity): + return PoolerIdentity() + if isinstance(module, (nn.Sigmoid, nn.Softmax)): + return PoolerClassify() + + return LambdaPoolerActivation(module) + + @abstractmethod + def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor: + raise NotImplementedError + + def forward(self, pooled_data: _T) -> _T: + if isinstance(pooled_data, list): + return [self.forward_chunk(data) for data in pooled_data] + + return self.forward_chunk(pooled_data) + + +class PoolerIdentity(PoolerActivation): + def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor: + return pooled_data + + +class PoolerNormalize(PoolerActivation): + def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor: + return F.normalize(pooled_data, p=2, dim=-1) + + +class PoolerMultiLabelClassify(PoolerActivation): + def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor: + return F.sigmoid(pooled_data) + + +class PoolerClassify(PoolerActivation): + def __init__(self, *, static_num_labels: bool = True) -> None: + super().__init__() + + if static_num_labels: + vllm_config = get_current_vllm_config() + self.num_labels = getattr( + vllm_config.model_config.hf_config, 
"num_labels", 0 + ) + if self.num_labels == 0: + logger.warning( + "num_labels should be > 0 for classification" + "models, falling back to softmax. " + "Please check if the configuration is correct." + ) + else: + self.num_labels = None + + def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor: + num_labels = ( + self.num_labels if self.num_labels is not None else pooled_data.shape[-1] + ) + + if num_labels < 2: + return F.sigmoid(pooled_data) + + return F.softmax(pooled_data, dim=-1) + + +class LambdaPoolerActivation(PoolerActivation): + def __init__(self, fn: Callable[[torch.Tensor], torch.Tensor]): + super().__init__() + + self.fn = fn + + def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor: + return self.fn(pooled_data) + + +class Pooler(nn.Module, ABC): + """The interface required for all poolers used in pooling models in vLLM.""" + + @staticmethod + def for_token_embed(pooler_config: PoolerConfig): + head = TokenEmbeddingPoolerHead() + + if pooler_config.pooling_type == "STEP": + return StepPooler(head=head) + + return AllPooler(head=head) + + @staticmethod + def for_token_classify( + pooler_config: PoolerConfig, + classifier: ClassifierFn | None = None, + act_fn: PoolerActivation | str | None = None, + ): + head = TokenClassifierPoolerHead(classifier=classifier, act_fn=act_fn) + + if pooler_config.pooling_type == "STEP": + return StepPooler(head=head) + + return AllPooler(head=head) + + @staticmethod + def for_embed(pooler_config: PoolerConfig): + resolved_config = ResolvedPoolingConfig.from_config( + task="embed", + pooler_config=pooler_config, + ) + + pooling = PoolingMethod.from_pooling_type(resolved_config.pooling_type) + head = EmbeddingPoolerHead() + + return SimplePooler(pooling=pooling, head=head) + + @staticmethod + def for_classify( + pooler_config: PoolerConfig, + classifier: ClassifierFn | None, + act_fn: PoolerActivation | str | None = None, + ): + resolved_config = ResolvedPoolingConfig.from_config( + 
task="classify", + pooler_config=pooler_config, + ) + + pooling = PoolingMethod.from_pooling_type(resolved_config.pooling_type) + + return ClassifierPooler( + pooling=pooling, + classifier=classifier, + act_fn=act_fn, + ) + + @abstractmethod + def get_supported_tasks(self) -> Set[PoolingTask]: + """Determine which pooling tasks are supported.""" + raise NotImplementedError + + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: + """ + Construct the updated pooling parameters to use for a supported task. + """ + return PoolingParamsUpdate() + + @abstractmethod + def forward( + self, + hidden_states: list[torch.Tensor] | torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> PoolerOutput: + raise NotImplementedError + + +class DummyPooler(Pooler): + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"plugin", "score"} + + def forward( + self, + hidden_states: list[torch.Tensor] | torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> PoolerOutput: + return hidden_states + + +class PoolerHead(nn.Module): + def __init__(self, activation: PoolerActivation) -> None: + super().__init__() + self.activation = activation + + def forward( + self, + pooled_data: list[torch.Tensor] | torch.Tensor, + pooling_metadata: PoolingMetadata, + ): + return self.activation(pooled_data) + + +class EmbeddingPoolerHead(PoolerHead): + def __init__(self) -> None: + super().__init__(activation=PoolerNormalize()) + + # Load ST projector if available + vllm_config = get_current_vllm_config() + self.projector: nn.Module | None = ( + _load_st_projector(vllm_config.model_config) if vllm_config else None + ) + self.head_dtype = vllm_config.model_config.head_dtype + + def forward( + self, + pooled_data: list[torch.Tensor] | torch.Tensor, + pooling_metadata: PoolingMetadata, + ): + if isinstance(pooled_data, list): + pooled_data = torch.stack(pooled_data) + # pooled_data shape: [batchsize, hidden_dimension] + + pooled_data = pooled_data.to(self.head_dtype) + + # 
Apply ST projector + if self.projector is not None: + pooled_data = self.projector(pooled_data) + # pooled_data shape: [batchsize, embedding_dimension] + + pooling_params = get_pooling_params(pooling_metadata) + + # for matryoshka representation + dimensions_list = [pooling_param.dimensions for pooling_param in pooling_params] + if any(d is not None for d in dimensions_list): + # change the output dimension + assert len(pooled_data) == len(dimensions_list) + if len(set(dimensions_list)) == 1 and not isinstance(pooled_data, list): + # if all dimensions are the same + d = dimensions_list[0] + pooled_data = pooled_data[..., :d] + else: + pooled_data = [ + vecs if d is None else vecs[..., :d] + for vecs, d in zip(pooled_data, dimensions_list) + ] + + # for normalize + flags = [p.normalize for p in pooling_params] + if len(set(flags)) == 1: + if flags[0]: + pooled_data = self.activation(pooled_data) + else: + pooled_data = [ + self.activation(vecs) if f else vecs + for vecs, f in zip(pooled_data, flags) + ] + + # pooled_data shape: [batchsize, embedding_dimension] + return pooled_data + + +class SimplePooler(Pooler): + """A layer that pools specific information from hidden states. + + This layer does the following: + 1. Extracts specific tokens or aggregates data based on pooling method. + 2. Normalizes output if specified. + 3. Returns structured results as `PoolerOutput`. 
+ """ + + def __init__(self, pooling: PoolingMethod, head: PoolerHead) -> None: + super().__init__() + + self.pooling = pooling + self.head = head + + def get_supported_tasks(self) -> Set[PoolingTask]: + return self.pooling.get_supported_tasks() + + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: + return self.pooling.get_pooling_updates(task) + + def forward( + self, + hidden_states: torch.Tensor | list[torch.Tensor], + pooling_metadata: PoolingMetadata, + ) -> PoolerOutput: + pooled_data = self.pooling(hidden_states, pooling_metadata) + pooled_data = self.head(pooled_data, pooling_metadata) + return pooled_data + + +class ClassifierPooler(Pooler): + """A pooling layer for classification tasks. + + This layer does the following: + 1. Applies a classification layer to the hidden states. + 2. Optionally applies a pooler layer. + 3. Applies an activation function to the output. + """ + + @staticmethod + def act_fn_for_seq_cls(model_config: ModelConfig): + return get_classification_activation_function(model_config.hf_config) + + @staticmethod + def act_fn_for_cross_encoder(model_config: ModelConfig): + return get_cross_encoder_activation_function(model_config.hf_config) + + @staticmethod + def resolve_act_fn( + model_config: ModelConfig, + static_num_labels: bool = True, + act_fn: PoolerActivation | str | None = None, + ): + if isinstance(act_fn, str): + if act_fn == "classify": + return ClassifierPooler.act_fn_for_seq_cls(model_config) + elif act_fn == "score": + return ClassifierPooler.act_fn_for_cross_encoder(model_config) + else: + raise ValueError(f"act_fn [{act_fn=}] not supported.") + elif act_fn is None: + return PoolerClassify(static_num_labels=static_num_labels) + else: + assert callable(act_fn) + return act_fn + + def __init__( + self, + pooling: PoolingFn, + classifier: ClassifierFn | None, + act_fn: PoolerActivation | str | None = None, + ) -> None: + super().__init__() + + vllm_config = get_current_vllm_config() + self.pooling = 
pooling + self.classifier = classifier + self.act_fn = self.resolve_act_fn( + vllm_config.model_config, static_num_labels=True, act_fn=act_fn + ) + self.logit_bias: float | None = ( + vllm_config.model_config.pooler_config.logit_bias + ) + self.head_dtype = vllm_config.model_config.head_dtype + + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"classify", "score"} + + def forward( + self, + hidden_states: torch.Tensor | list[torch.Tensor], + pooling_metadata: PoolingMetadata, + ) -> PoolerOutput: + pooled_data = self.pooling(hidden_states, pooling_metadata) + if isinstance(pooled_data, list): + pooled_data = torch.stack(pooled_data) + # pooled_data shape: [batchsize, hidden_size] + + pooled_data = pooled_data.to(self.head_dtype) + + if self.classifier is not None: + pooled_data = self.classifier(pooled_data) + # pooled_data shape: [batchsize, num_labels] + + if self.logit_bias is not None: + pooled_data -= self.logit_bias + + pooling_params = get_pooling_params(pooling_metadata) + flags = [p.use_activation for p in pooling_params] + + if len(set(flags)) == 1: + scores = self.act_fn(pooled_data) if flags[0] else pooled_data + else: + scores = [ + self.act_fn(vecs) if f else vecs for vecs, f in zip(pooled_data, flags) + ] + + # scores shape: [batchsize, num_labels] + return scores + + +class TokenEmbeddingPoolerHead(EmbeddingPoolerHead): + def forward( + self, pooled_data: torch.Tensor, pooling_param: PoolingParams + ) -> torch.Tensor: + pooled_data = pooled_data.to(self.head_dtype) + # pooled_data shape: [n_tokens, hidden_dimension] + + # Apply ST projector + if self.projector is not None: + pooled_data = self.projector(pooled_data) + # pooled_data shape: [n_tokens, embedding_dimension] + + # for matryoshka representation + pooled_data = pooled_data[..., : pooling_param.dimensions] + + # for normalize + if pooling_param.normalize: + pooled_data = self.activation(pooled_data) + + # pooled_data shape: [n_tokens, embedding_dimension] + return pooled_data + 
+ +class TokenClassifierPoolerHead(nn.Module): + def __init__( + self, + classifier: ClassifierFn | None, + act_fn: PoolerActivation | str | None = None, + ) -> None: + super().__init__() + vllm_config = get_current_vllm_config() + + self.classifier = classifier + self.act_fn = ClassifierPooler.resolve_act_fn( + vllm_config.model_config, static_num_labels=False, act_fn=act_fn + ) + self.logit_bias: float | None = ( + vllm_config.model_config.pooler_config.logit_bias + ) + self.head_dtype = vllm_config.model_config.head_dtype + + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"token_classify"} + + def forward( + self, + hidden_states: torch.Tensor, + pooling_param: PoolingParams, + ) -> torch.Tensor: + hidden_states = hidden_states.to(self.head_dtype) + # hidden_states shape: [n_token, hidden_size] + + if self.classifier is not None: + scores = self.classifier(hidden_states) + else: + scores = hidden_states + # scores shape: [n_token, num_labels] + + if self.logit_bias is not None: + scores -= self.logit_bias + + if pooling_param.use_activation: + scores = self.act_fn(scores) + + # scores shape: [n_token, num_labels] + return scores + + +class AllPooler(Pooler): + def __init__(self, head: nn.Module | PoolerHead) -> None: + super().__init__() + + self.pooling = AllPool() + self.head = head + + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"token_embed", "token_classify"} + + def forward( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> PoolerOutput: + pooled_data = self.pooling(hidden_states, pooling_metadata) + pooling_params = get_pooling_params(pooling_metadata) + assert len(pooled_data) == len(pooling_params) + + pooled_data = [self.head(d, p) for d, p in zip(pooled_data, pooling_params)] + return pooled_data + + +class StepPooler(Pooler): + def __init__(self, head: nn.Module | PoolerHead) -> None: + super().__init__() + + self.pooling = AllPool() + self.head = head + + def extract_states( + self, + 
hidden_states: torch.Tensor | list[torch.Tensor], + pooling_metadata: PoolingMetadata, + ) -> torch.Tensor | list[torch.Tensor]: + pooled_data_lst = self.pooling(hidden_states, pooling_metadata) + prompt_token_ids = get_prompt_token_ids(pooling_metadata) + + pooled_data = list[torch.Tensor]() + + pooling_params = get_pooling_params(pooling_metadata) + + for data, token_id, pooling_param in zip( + pooled_data_lst, prompt_token_ids, pooling_params + ): + step_tag_id = pooling_param.step_tag_id + returned_token_ids = pooling_param.returned_token_ids + + if returned_token_ids is not None and len(returned_token_ids) > 0: + data = data[:, returned_token_ids] + + if step_tag_id is not None: + data = data[token_id == step_tag_id] + pooled_data.append(data) + + return pooled_data + + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"token_embed", "token_classify"} + + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: + return PoolingParamsUpdate(requires_token_ids=True) + + def forward( + self, + hidden_states: torch.Tensor | list[torch.Tensor], + pooling_metadata: PoolingMetadata, + ) -> PoolerOutput: + pooled_data = self.extract_states(hidden_states, pooling_metadata) + pooling_params = get_pooling_params(pooling_metadata) + assert len(pooled_data) == len(pooling_params) + + pooled_data = [self.head(d, p) for d, p in zip(pooled_data, pooling_params)] + return pooled_data + + +class DispatchPooler(Pooler): + """Dispatches calls to a sub-pooler based on the pooling task.""" + + def __init__(self, poolers_by_task: Mapping[PoolingTask, Pooler]) -> None: + super().__init__() + + for task, pooler in poolers_by_task.items(): + if task not in pooler.get_supported_tasks(): + raise ValueError( + f"{pooler=} does not support {task=}. 
" + f"Supported tasks: {pooler.get_supported_tasks()}" + ) + + self.poolers_by_task = poolers_by_task + + def get_supported_tasks(self) -> Set[PoolingTask]: + return set(self.poolers_by_task) + + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: + return self.poolers_by_task[task].get_pooling_updates(task) + + def forward( + self, + hidden_states: torch.Tensor | list[torch.Tensor], + pooling_metadata: PoolingMetadata, + ) -> PoolerOutput: + poolers_by_task = self.poolers_by_task + + outputs = list[torch.Tensor]() + offset = 0 + for task, group in groupby(get_tasks(pooling_metadata)): + if not (pooler := poolers_by_task.get(task)): + raise ValueError( + f"Unsupported task: {task} " + f"Supported tasks: {self.get_supported_tasks()}" + ) + + num_items = len(list(group)) + group_output: PoolerOutput = pooler( + hidden_states, + pooling_metadata[offset : offset + num_items], + ) + + outputs.extend(group_output) + offset += num_items + + return outputs + + def extra_repr(self) -> str: + s = f"supported_task={self.get_supported_tasks()}" + return s diff --git a/model_executor/layers/quantization/__init__.py b/model_executor/layers/quantization/__init__.py new file mode 100644 index 0000000..ffaad80 --- /dev/null +++ b/model_executor/layers/quantization/__init__.py @@ -0,0 +1,177 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Literal, get_args + +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.base_config import QuantizationConfig + +logger = init_logger(__name__) + +QuantizationMethods = Literal[ + "awq", + "deepspeedfp", + "tpu_int8", + "fp8", + "ptpc_fp8", + "fbgemm_fp8", + "fp_quant", + "modelopt", + "modelopt_fp4", + "bitblas", + "gguf", + "gptq_marlin_24", + "gptq_marlin", + "gptq_bitblas", + "awq_marlin", + "gptq", + "compressed-tensors", + "bitsandbytes", + "hqq", + "experts_int8", + "ipex", + "quark", + "moe_wna16", + 
"torchao", + "auto-round", + "rtn", + "inc", + "mxfp4", + "petit_nvfp4", + "w8a16" +] +QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods)) + +# The customized quantization methods which will be added to this dict. +_CUSTOMIZED_METHOD_TO_QUANT_CONFIG = {} + + +def register_quantization_config(quantization: str): + """Register a customized vllm quantization config. + + When a quantization method is not supported by vllm, you can register a customized + quantization config to support it. + + Args: + quantization (str): The quantization method name. + + Examples: + >>> from vllm.model_executor.layers.quantization import ( + ... register_quantization_config, + ... ) + >>> from vllm.model_executor.layers.quantization import get_quantization_config + >>> from vllm.model_executor.layers.quantization.base_config import ( + ... QuantizationConfig, + ... ) + >>> + >>> @register_quantization_config("my_quant") + ... class MyQuantConfig(QuantizationConfig): + ... pass + >>> + >>> get_quantization_config("my_quant") + + """ # noqa: E501 + + def _wrapper(quant_config_cls): + if quantization in QUANTIZATION_METHODS: + logger.warning( + "The quantization method '%s' already exists and will be " + "overwritten by the quantization config %s.", + quantization, + quant_config_cls, + ) + else: + QUANTIZATION_METHODS.append(quantization) + + if not issubclass(quant_config_cls, QuantizationConfig): + raise ValueError( + "The quantization config must be a subclass of `QuantizationConfig`." 
+ ) + _CUSTOMIZED_METHOD_TO_QUANT_CONFIG[quantization] = quant_config_cls + return quant_config_cls + + return _wrapper + + +def get_quantization_config(quantization: str) -> type[QuantizationConfig]: + if quantization not in QUANTIZATION_METHODS: + raise ValueError(f"Invalid quantization method: {quantization}") + + # lazy import to avoid triggering `torch.compile` too early + from vllm.model_executor.layers.quantization.quark.quark import QuarkConfig + + from .auto_round import AutoRoundConfig + from .awq import AWQConfig + from .awq_marlin import AWQMarlinConfig + from .bitblas import BitBLASConfig + from .bitsandbytes import BitsAndBytesConfig + from .compressed_tensors.compressed_tensors import ( + CompressedTensorsConfig, + ) + from .deepspeedfp import DeepSpeedFPConfig + from .experts_int8 import ExpertsInt8Config + from .fbgemm_fp8 import FBGEMMFp8Config + from .fp8 import Fp8Config + from .fp_quant import FPQuantConfig + from .gguf import GGUFConfig + from .gptq import GPTQConfig + from .gptq_bitblas import GPTQBitBLASConfig + from .gptq_marlin import GPTQMarlinConfig + from .gptq_marlin_24 import GPTQMarlin24Config + from .hqq_marlin import HQQMarlinConfig + from .inc import INCConfig + from .ipex_quant import IPEXConfig + from .modelopt import ModelOptFp8Config, ModelOptNvFp4Config + from .moe_wna16 import MoeWNA16Config + from .mxfp4 import Mxfp4Config + from .petit import PetitNvFp4Config + from .ptpc_fp8 import PTPCFp8Config + from .rtn import RTNConfig + from .torchao import TorchAOConfig + from .tpu_int8 import Int8TpuConfig + from .w8a16 import W8a16Config + + method_to_config: dict[str, type[QuantizationConfig]] = { + "awq": AWQConfig, + "deepspeedfp": DeepSpeedFPConfig, + "tpu_int8": Int8TpuConfig, + "fp8": Fp8Config, + "fbgemm_fp8": FBGEMMFp8Config, + "fp_quant": FPQuantConfig, + "modelopt": ModelOptFp8Config, + "modelopt_fp4": ModelOptNvFp4Config, + "bitblas": BitBLASConfig, + "gguf": GGUFConfig, + "gptq_marlin_24": GPTQMarlin24Config, + 
"gptq_marlin": GPTQMarlinConfig, + "gptq_bitblas": GPTQBitBLASConfig, + "awq_marlin": AWQMarlinConfig, + "gptq": GPTQConfig, + "compressed-tensors": CompressedTensorsConfig, + "bitsandbytes": BitsAndBytesConfig, + "ptpc_fp8": PTPCFp8Config, + "hqq": HQQMarlinConfig, + "experts_int8": ExpertsInt8Config, + "ipex": IPEXConfig, + "quark": QuarkConfig, + "moe_wna16": MoeWNA16Config, + "torchao": TorchAOConfig, + "auto-round": AutoRoundConfig, + "rtn": RTNConfig, + "inc": INCConfig, + "mxfp4": Mxfp4Config, + "petit_nvfp4": PetitNvFp4Config, + "w8a16": W8a16Config, + } + # Update the `method_to_config` with customized quantization methods. + method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG) + + return method_to_config[quantization] + + +__all__ = [ + "QuantizationConfig", + "QuantizationMethods", + "get_quantization_config", + "QUANTIZATION_METHODS", +] diff --git a/model_executor/layers/quantization/__pycache__/__init__.cpython-312.pyc b/model_executor/layers/quantization/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca49f34f3019d01cd4b9eadbaba97fe4d2e71e28 GIT binary patch literal 5204 zcmbVQTW=f36`m!R_p3N&F{CHaORi{;tx zoS8Xu=FH5QGyF#+5+v{_{I7GKMne9LO!e_pz}}|-J|;JaL?lijD!0U`t|b?Tyi0Mb z{1VTwTk)vgB`-%@MCN7RW6o*6#Zvkd003|{Ax#U2#wEA5N$w4`cwd9E88HI0- zEJY=+9A0%vz7P1N7-GpU$7L_pTIHnxNbA;!6#SH1N^FHv;jchWQZCzfC1=PqtJwZE z*$}gIO;2%lD3{L}qEc8}lPR!?tHo^I$lc2txk7%TkYCNMr8;eQcH^!cl4M!aHCdKc zHQR4!MKPB*Mr`-0He&lVLt7CM1y`5XWL0I@zp9CMQJ?Ks3zDo9G{X*85!5(s`<8RY zvXa$per>I|YDd>J8@g0p1QXrrIC~CRJ)eV`%n$Vx^#JRMvG_8ZzX( zUZA=ihU$7YFD-8wvTnO?-@R*x zJ6rfZm@OKGAzCQrCEHDnyzS2AS8R{EiROni*#Ph5*OBpTj9|<9t2f5y7H5{m7iZ?@ z#M#Nk%k!76+hOL1ieFbMO91BuRs|t$`0XWt{0bJ-hh&wLxDWgmXL8qJ-m;TWd`(`< z=`edjR#++OMnTQplOieW?_X*SvnXL;JJBQ&O?Jk6;#={U66#FF60Y_pcgf* zKn+l}yv4K)3R{Juu#(LS^kqFVwTJXM9STOFs!PZjmF~x3g`H(B(3nH526Tg_&I*gS z<-_udb@{9+SCmX{W>rm*4=5TN8xvM(L1p%)*>V!)O?d_8iKdn8mQ3~Zq0VwDj!_t> 
znUYSY(W2TAokd!i9MKuk;Gx>78uw#+$1Qizs!BIMT5ozetIHK7!YfC3^@ZL1nsFVn zx4ENxHH-Fel_G*1@M))2b@VB-p(3=sh${%oe6qp3! zgM04Y@~ieyf1lnjWECoBr7c0;gb1Pw5DkTmoT3QJvQSu;=?2Xi5H2wQ7|(CV%I5CV z)AuG`=%Z4oLO4QQSqK2jj`Itv!mSr){Z=~Vr+r|y?RAbD+qaRW`CNX@_GUFrh5#GP z=~X4vi#3Bk%_>EClF|aTyTyqc*B9q!XO<=}RpP3+I4`nrD^ASMP0dWhAs4@kOTMx( z#1%!SM?sN}z7_6&e=}pKT1LxhLk66GvN5z-JoDDboilPq$t`EJEr^Zz3`W(=;kc4< z;!1`ouLO#W2uBD+e=)6X*?w_@;vnd~^y@ZGdQ>T_WEFiZU9JBUv~C5+|L&8WP}6o> z=1C~?ML6-Bxd(G)7gu-gS-gEGE_@g8M55nDN#w}q;r=J#{zu{M@EiY$wUkvVTR*MtJ-Eo7_L*=)ebAla6;)+yQ~<)NkvePSaF8ym2S1is$;l8=~erz1jCKW zQMKPnGTfvLs41(S;b!HSdfaMI2dzfNwCakCESpcw}`WX6D|O&(~*i~>Dj`hcD^y&rLA ztC=tZX1&RoO=i34GTY3g*=)9$ozn0hB7fnmF0`H&cwV}Q$z@I2G*L@K7o9A$R#bNhGU)4=K~F903n!g?bOw^? znlLtXae8ugc1qh91r9QisRb5bYYl?a(>JCLa0`q3W2u`4?s?6)_G z<*UzE49v_;)XIZ13zP3wIG0^Fi<$nsX4K5JlU32&`job>H#%F8Z_bUMe6yx0G>chv zjg2p0c73=}WMOe(qNW;W?b_m8twm%Jv-a`%eF1uPQ7hI2p_`bbJ1oUxn);F8flTov zreP#_{!%<_DTXf^M^c9bb99PX1Fc8WfCTP`gf<~*29k=<5Mo@Uv=vDkl6E8=NIH>p zArX)qLDG$+2T3oG)OmJH(jmb1DLjpSK$eRsFTLYXdInpvn~TV9F7z$b&4zmx5wrVk zL~LlIh|qX?4v5{ie~}U^7pe4%B07c*nD@>jVq?94h`Dea5%cIpM9iZTh?pNQA!3f3 zM8upqg^2lf8WD5#Wk7Ziu5Fd89-YA==CgNL5nkV%OqAkD>P*@dM6*bk$LA0+7taH- zJ?zp(7f{Nk>M9}u$u%Twg03TCt6~umP7A#O#P$|7DGL{|G%zW_IY`mnj*FRhnPZtx z(VdQ;n2RnT%N)ae!rZ|;!JNRx&xVepc1DVWbOyFRCN>-#i8BOdJu{ry%*uXM7uXAupO>Gn0hj9g$}wc(B+yi9K3=XvCP4y;hv&*k#PuW)dl)P?AZ{c9$h-#2 zO~^EpKx3(?zubaMD{G!Aw;|K+bl8Eo(`nv?SRjFpQs*1xBgk~K7RSmx$n=ti?wwfY zPIGD}-nvuQzSGdNlN5Fu`*vc@_=2RVALuvdpmvF`FJI({r{QzH?Frwu-9EO>pMMr= z`{c@#P^!eISW=eaolH8(&WlRwI?Mh9(AjLVSTmPO+FoNzgJhh#VR|VK#1668<0R`8 z??6918XdTuI{o`a5pVUPNK>d^L~XnRHxlPc;09pFL@}F(za`k8PdZe^O~&$0n+M}$ zX0PpoT8aXtr?7Mw3CmvnqmZ)~6?u&2KnBZ`=)VL~c5@u}Uq9jcN~HHo(z8ph{VP!a z@ZG0@&imeHjU8oIhbR2CyQ|4nBFV2WaHOaI&zq$a6HoguJ?)+>k@|B?-BTv-{WRD?%1QAGQscPdP`?!o(#XU$GC6ab$LRrpZl1R{|A#} Boc;g+ literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/__pycache__/auto_round.cpython-312.pyc b/model_executor/layers/quantization/__pycache__/auto_round.cpython-312.pyc 
new file mode 100644 index 0000000000000000000000000000000000000000..6e6615a01264502dfd545ea11682506f9036772f GIT binary patch literal 17005 zcmeHvX>c3YnP4~WivUP~00s0f9jW5oTK|I%S7KM~ z@qMow2VqEdvbDAIW1qm+@A%&Lj(+dE-q-q1B_%ovLO}6{{+|z0)UPp<2}&vP_~($g zN--2e2dE%DLDQI)1!O__ggmI2P>{Slpqx-axjdkrP{UIZ&;+#;T2iMB=z{tQeb6vr zAbC}wBxsy422B$tl2-@J!P1FRlGX&uf|dyjNoxbvVEIHjO|fzgaq9xMpnbwlQ!4<)us1L_JG$f>EB5O{kz$1*K{&mHWrkGa6RIXyH%CY8gGNgLTxg@T`Ci zmedr*s7YMHVtL7w49lf5BS4$r&&;T&%9g1qnkl_lGGS!QtZquilwDLzm{>F9N+D;t zsJJc-d7_Mz>RHQ7i&T{NK>Pr2&4Vu|@ERyB&nvY_Sjb4B&X!kahdTR)5t|umO2Iht zU>r;Z8K)D*SwU98%xVFq67o)_fvJLLCDX`Mi?f89nv2SbstqNZprjfhpnn&@*D$p( ztAV`fNXeN!Uo9i;6igkot6iqf(GzuJl)FhV4K9S|UYJ{$WyCGDbWD7@CIh}u$OT)) zm4)ckxgfm2PO-3+CfV(-Q{nJ@XuHSb~hTW8mnE!|rQ{D1-m z^O+1IJTRY;tOino+&>%s(&~c&csU%wc9NW)55KWwDJ=D!ePaOj2lb^6Qrm+*F5sX2 zQjf4wMvy~=pul#50?P%3f1W)js6^ONL4THPZ|_NeI3(z%p~rb|2+o-x4=n}-mw9{~l->{ptK&=G0|SE(SJp}ER1t^o-Q z9ie{&C$Bi{kEkCjGF0nLI-&+BT=IfYfKeV%OBjp57GP8WMncgyqAb9uh$tkCMPLgs zY5=2QBR0$xU{pplfKgQ(wg4lnqc^)=5m+g2c3u%V7s_2#nTi%I4r*c+J@y%-UO;$%Pv;DMk~aGd%)C zhW=8Elv_KxRB?PZv@kzE$AwwOmA9?8yOx~oE{W6!7F~0*5J`|O*Ag(7T(j)Xg+^8R z6&nu%)RyPhO1m72E8AU9?Mk5AKXbuc+*UyeU9y~@@p=i@^9shXa6H0R>!$)5@G6)UIaz#Ufdk^g&-8aV7Kxf=0jIdj- z79v6C5BX<9VIPnxIOAashB6=Uhq*e0sK=-QBV2P152v8Vse7k@ea~@jouCM@fvIAc zi(wG7UN0e;UT+9@5>iSME_`{y6AsRM=Kb>>VIMcmhC9wJ^lj@u-N$+Y{*#{hMd0tV zo-=_!&=Z_v*npQk$4&wj<2(W1BFlw5673;;-;<+@o%4%a8}zi~7~Dw0J2Ok&4z-~f z;Ml*tKs~WhTH_@(uW{VbI8%DtDwC|~=HaLBzE@V2Dr<`!itS2M^tQHCnez_$&= zTX(1Bbm<`dST461)0Di#nAT8+^3{FGs%{>BQd4)#9jk<HiS+0 z6*P6OS(CMWJpA;1q}R^acjBD`&~~@j?I7Pa7;i1w*0S1=bawOble%q=sn-^vX}8#P zh;JK;x9-Jm_vd%3qpF&h6?Zg_l(S)3aoLoz*x;c_)wE-2QxL z30m^V!mg!~c2|z!?OIZ`yMX8I5@@$uUa-O4G`9tM58)JCTpz@EbUAjO^LmTzG_mFt zY=Z4BQ1^|kvEF3c4!&(iymcqMpIy$Bv*stBcRcZm)>zZJ^+U%8j&yV7#G zR>$>2%MFVLd2!K@4PK=dX}4^2Nr!vCgGdVRQ}5G)d@>N)FitpIgkgVmoS7OQpbMe8 z0pm1Pezjj*e)6)0NWVEQJyYbxx_-{#oy&Z{By(543b~sjk@AHp2J;S9Z}$zd8!s zYGf41o$-WB)3B%Eg*);opCod1&V=S>ix1C=!xIHF@035lqU;u0YA80qLZ~1PQ4bO9 
z3v+AUatSICbTa?_BWEWA-UCCw3OZs0{vn+$3=tY9ljGx0fsuU!82JVm?7bjBUy5m( zQD(GK7*TO36{DV(G1S+>X#m^TLumm@_qDk6i~;&A$-vz(UY8^>c+sX&L4h)P0sH1` zFhRm;1#3nC_vVE1P#y=z9Z%S}U_dtNm1Gpo3FU500f8g3K`{=F`a15QDQ+(4JB;}xva{VR#6Sj$@)INzK_&($9oRO9nXEV4{95*)>)mb=-?|lVpHqiiEkbL zNRg;GlCo7KZLPemHMVvA(E6@hjS1UMa?ZY6C;(2qq&7XAjhrf++LyizQL{rJ{U}vt5!E zuQ@|;8aRIPOR|jcJx1kJqzE0Bwi&2bQhx?2ssLPBVIM{+1+FGsl1VSiz`UTis3{J) zFRQF&dL=lKh!zCyBBPAZGg(z2S3Q$e0rJYSVt%eHqZYzUUoXA}`>3~-ld@@QQubOE zMMagMp5TEf2)PyFw76oRq9`(Jaz%EzMnOeY&{oB@<yL7B~se zf>9$Ftb0}|Vl(wbQTj~3e27i$~Q=B<$p)<_rCMs#^A z4m=~OEV7;@0IQEGi>{?UvnrXVw3h0KUR=vb8NG2mb!0vD5#2@wSAnbt_I~b7j=?UV z;V-H@n@MT2DJXN70I@~BC}=Yl4Q2ytLunOv;p6`dC-(>RTZ*gHC1o*rHPjWMIRn&X z&v9=3W`XTVwMEVZ)XJc?1lpomq##Vm$3Wi2IP#R9;Ut8Lk10aoPK?STvYD)EpF__m zo;~8sn|?}bILBnZPnprzv{pgu(&97Dv=nEap9`ZzLPE~C@CSwC)T|G$2BxYLX5R@lq(%*sv4#dCZ4Lj{=d{_mcRdMXTbOAIq8<`jem zuxQZK3F`9yI7^+ApPb&i~E(=z8#Ic`sxN< z3>kS3{5@`h=m+#6iUyfCLcIp8H44WZ$?p&8t29kr)+xX~^%wH9G&h9kQHKEaN7mUT z(367Yh5aM@U-ph2+rRhV;9-8s$9vghWxfu)f`5eg+SQP>cL ztj?Wap7jMn14I`pEXyMLA>a=oU;6ETMB?ZdqWzAw{e$T*LU^LT zXDp@NMp35RXiF00&^OqRAX56nY%s)i13b4GA}~46E($V^6;vU(zyTdkJvkSg2h~GR zvS`n8t2os9xt9p=zy(>hLeLYx|N7y|){B zZ*d9hb09;K^25J45o`FF=g&PK4JE8cGo}0AUtH^bum4(q!n!xFFBP?WFTJ7YiGiwb ziZ}Ho>iXk`O(|DH($&qox|6Ox-qrVyu5DKgE9zCtJ$vn1^NpT6cF$jh9!qZ-gx(!x zPbgVG&ZxXPtZPDN-976cPG{xKJ@&hCqDjSqUxng@%|4MW4%A?|MUJ^Orq)#%*5q< zL@6qqU`lcvpwlL*cKgEyssO-}9}NaUSdAlgwlx5c&5j8Gz-Y5RsV4Y!0ku&8q3&{==@6c~?Q1cMI$C1^20%qoH& zlj4GfLU?WSE!Ve@ijg4FbD}=y6YZtkRpMT<)P#8FL%a!yLjO}FT@Pi=TI0K8X$l@U z#y`hLdYq;V=A^#rw!Ui384D*pdwBTO?zwBJUY$u;+;M{&47wHdYg=w~CLG)1)@{q$ zl-`uoH{I4Z-E&j|X{a)$`gbP#$8PtJ5uF)|DvZkq!M>Y^dEsa0-#?%9jPRb3q~{3l zIg;=kUgYFR2m;tb*$_I5_@&`mECLG?^rq?Dyh=S)!wy=kIikV zs*bdhDsjNEt~5T>QRd2f7TX_(R(B-q%?V3OZ1A?FBURb7_QpE{ajW~D&6NU@s#}YI z$+UNnhPZqP?bvo8%5osehV1xKE$zvcy?o2wWXnOm<={WI9OBIlE4tMKDNFU5^R}gJ zU6<_I!*}huW=@1ZMS-_ZI3bQ2S2VK_-Nb8p?KZ#dv@o$J*(eNRJJAT?d$Yy zdv|J6Ypna7@7#8D+;cXhoGmeD%m72*(wFMnvwj*(ycK(K5|uWXM7b?pO1Tfy539iB 
zzS_FxPnf#mnyxP%?SKXP;!z!R|HUH}RsSL#DuaFS_dT`4eTsi5sTkg=`iFM&2(8+n zso)7m@!>IUJ`hMi=Qs^z#B#`;fHG13kdXsGY(N_yBjWFyA_y zGFy=enA>1qw;7lB-7z%7@g!%M*H^{$Eh!WdcJi*BH^-B^M)_T%3D?o(eIFaDVYeIH z#Vm|xfrP0nR!)~wt*%S8WANg_HYr2RWKB1j z67Zu@z#TYUOkF5uf&rfa4VXV9r3dCJXOw922`S+AT@Fh4(J%_iB+R&^MBXE>wl1$& zN;2wqWlJt-vG&B{*>G*^sq8NA>wDlQVX3=7=8Jg69{s1qC~z$*g)* zKPzK2QGG&IRk&gkg;M%%bqf>{m`#PMx8gFH^F+Nrce+v;{;O&&6EV3!OExwc9${J2cuz3 z?<%9#h-GG5K~u;^teJy5v!h6D-tGgVTu#~lx;<3}qXuK@`I7Ud4E`~g`T5fGWw|`~ zADBmpbOjf18X^^ec=f(au!-bSbhgP|V+ot2au6~$)`diU?#r%M8bhzS#=!?|yQ?wO z?izxt05LU!E;_C;aM2O-V~asd5VJm44KoMwgJLj;s_-yqnx?^*oP0^6LO^}*fkRYWv~h=`Dz|EApKqQr3q zLH-3s#K8pDKW_~SYz(*~y~J7c$lx(S2XE~u(=8Rd0zVQI{Lu+6e^R+hafIZa3bZ}o z#6^NArI(AG7gtzpPPWccV{ryu0QPIZ9r`86>ldg`G=sm9!DT)O(jcEU@8p|z-aLM5 zIMIBF*EB5aS0>ghDU&5>s^?AhYlrTby6%}ORvUlXdSf{!Ll-i+JGiD!n=At zwQNdS8hK0Oy-ebvQeI_TJ`9d4l})j#xN{5W(M}`3#JgP&92^bKEhtrmC9(wx*14Ka{dotZoO8jauF892{?4&bcMgL}@OT>dSZZ zO?OQdckT7@hArz0$-WW3ZzN$q5H}zA#PBl6HnmMDM}4ZMB~{nBI-RO?J=Cjg#^vF( zk+Rk$EgifCuJsa@ElJBTZyCO|05^7)<8j0BPxa+1?7K5JI^)&b?&!CZF52TA0}1=? 
zxOw*jd(+xd!oE2U_lhQqD9+|fLug1xLG^Lf|CffIB@K<|$vZAsuY!W(7b5hX_O;-N+{p2ETRxZ*#T`pp@jMh*2sT=z1k~#r$Q8`E}hGKG&2z{gE zI=a?Cc&cQy?{|`nj9JOZBuYlmkNKaiTvRH`MciAk0Nf>vE@On&c}ZH46{9}x^Vw4je)I@kV*)M1@(SNdgbHC8@O_?{kglAah4aEBw82xLA zg!bPB2ltoQ4o#HYpJ4Q-7-i)F?x&Fby+nW#d>L68fcrBo-VX8JH(}72*6KzHH$9t?{;gpuQw&^+miO@csty?RIF|VVFhnu5WcZnYB%KHV%kc7vJqw6Gg5r|4MmpeqW6pn3?tGifC`WEd=>^}5e#Tr$zsTPiI5K)ir@mj zkF1B&n=~llYLVFh+pruww*Tb-@u5t ze~Zz-!>AUc4v5^kLQN|-3l{@rin=)Bf6-h@M4?k1e-HLk90b-|b%>W%kY5=BtM_X*G zB+HlDS9_C{ojm+XI|*!U?Ay1N@?hIz!{V|3)aFXdXuBun=}CHqc+XJ06W!>`i5q?9 z`awro9Yj0eP_fnSG-DKm40NbI=UrIL zhcF9gg7F#(kU*AE;C#3;sLdMYa!~nHRH7^9Tx})=-fl)hbBf>xs)e}GDpn8*aN`G5 zPs9C`kEisZLQcJOK^3DvXf2h_pv9-i3Y(3vk6boQ{B!>*V|b zd7hhJ5N#*7u$mnEUtsEeh~RbtZyv~I6s;`dkR`4hc)H2CCbC%ZJ4aCf5^3Mh@%TQ5 z))%O|`qC8!oEvW2no~7($r{g{8qbREzO`np={ESnK6i8Y*4A4s@$H8aU=zr?LFjl} zPtw-M+xl**Zpjk1z2N>}J9w{hGkg?s+uEDfQk%NqFqc#(4fVXC{*IyXo@rCk)Wn;b z?tnXm$?|sOQe^eaZByfY^w#L)EuFE2b5tsw3q7p6m;uMEYK9k_D5l7v_k{MhNx-wMX1Ncsw|a zaPp!51~tFENIjIvl$u8x%2sz_H07vXIU844Qx%?g_i)@X5?7Qzl4+c(xWbmMqRJ~$ zHNB~AyHcH-Q8KhwKeTixx7;sxq?Paldk&=3n9@+jva}XcI;x^7t;ZA?lp5BUSkE>8 zHPdwzjkhi_MhPo-V~y8C6!E#(vUPN@x7M98<5; z3Z-tf{xJm)K&&bSYOkyU|EEV3JRi$5B`_qhu03Xru|GTg{%PL59bwwDFzs2Gvb7$p zD$7=(la17TZ9GSvz$NuACto@7@0hpAsfWrCsB+vU?pyjL&oc zhU<#;z_qf{wch&}ACFZkj&a0@j$}}1eg-fhTmB_tqvy2P zBOZ@u>cD5fP41+3Ui$w4yln6!%P=?tJ+?HG&NBZeKuDy)W>Of+{pSFp>E2arV1&wt z#QOq5EJSsksPNBNYx$hQ1nKI3 E16H$YbN~PV literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/__pycache__/awq.cpython-312.pyc b/model_executor/layers/quantization/__pycache__/awq.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d1788d8033a42a42f3826ce9666a6659e6dc9bd7 GIT binary patch literal 12290 zcmd5iTW}lKb-Mr-Z-5~90w0o=;!7Y!P}GZh*m5X}dW)7tDz-66HXwFMfx?5jyOc>7 ztQ}XKP>oYbopwY|otkPhV`}WFnV&|U=`_lpXZiz}$^hA+rt&16j{60f@;K>7+jH(> z0g!@YyX|y(3EsW;+{d~1?z!i=|KxJn8Awsnk0O8D#W26Yik`SaVf_gdZZjeyvQZ|+ z2HBV)XowkuMiya)s3~ZIr!mUK%s~rd7>^|4BYsvj3)nmgC5I56k(ESD 
zNKDIEjl?605KT-@N(8Fb<1?}(o=XhFjP|pUxD+BsL$aiLwDLJgxs(toq~T)x%1kJ( zM3RzN-sm@|-WRlnkOJ6;67h-1WDzEnrT}!Fh%7`ya}p8c%aQ46Nrch96C^P+O{W+Q zhoaK?5DCR3MIvg=`B3<>Bwjsx_FNGNlWd!m6hRJ6K%cmrAhIxxBV+*vg~X5&f-W`X zE7Degn|Hk^MJ6vP^0^S;CSmQcwcJsttbYsM%nyOKBqqp;Owb^*L8FZkeWFn^i6;2x zBu+F-X35eBLqtQ;0=Y#rOxPEg30AaTa|NxUMY2p7MB6n}&?YvDc6izWW|kaNzH+2m z8RvoC4(R2Q+|bViHRm8T5pKyVS)`f?16`3tWF|~vRau`} z(F2&fpT<13^Jy4RoCUBoh~CovCd^_Dtj;GkiM8--gwv>lUiI)#_Ye3Qu5rPp1!k5F zHjgs?X4N+M;tLunB+qD1J{%3nGCz?ZJhTPub{^6NX+k1WJS-jJFDc5je5kK4L}nvb z0|_$O7aEuQcJJA>Kd@`p!GQy`|A^{%g}y^!Jfg^|eG+L$kbx*wdr~3^VLB0sD{6f# zA5WroDVIv>I9V;S$=x(eA}vbPxpo?5!jrI;T>yZ$Cv zCdH;0!JxON3{WfFS%yg&p=K(!nPyqpz%r8zK#TZN%v7<5-V^ADy*$7$q)~C{FcD_1 z)bp=^tNp~NbB{Mz(Nwu#64nTKfi|@?$N0ICPZ4$U+zrg|pcZmJx6$$KItveZb08lC&1-bA^Y9y%o8_bb6>Y)tiA`23Wh z^?4&(eQ@1qta0ZVqsyJQGA&z|YO}5V8Td8#FPMI1^{oSXCR`94SY~YSPf2$g3Lp-F zIHwT4DFM+SITgva)Qsd*nSh`LNehpAQ-T)mCpNg`=8auAO}8X3}cu zaz2}E+{stIpU~-dmyM1Xa z+p#axu`k`eKhLlOO}VD6@ASRZmu~EaYR{%>XRfn5+c}u&987l{z2BO*7+u3`-pJYP z1gT)d3xK-z0`O+tX69JGVMH|oE!|~^A3z({i()GdqOE+rI_xNRMlSI>ldGzI8WFxb z`lhjCIg6_FR!a=55(}9$j52rG5fJzsT!!!^)jB>AO@x&G{i^9w2yWo!h5M+y)%kz` z*HsXexoJsGZZE$6z?bzI#*Nj{jjeg=Zo6-}f6BdUzhh5V^`xymPj3`@K6Rp>o&g}~ zg$!i4F&vf4kJXEqW0ANJ4o!!~BhiR5mu%U%u8plHVICP7(xchD4)LY%MNT8!Ajhq7o8=oZPx` zv76h^!Em`2GP+zB|H!#LZ!vl7dB#+rNY$*33FEr;gcp|*%-=)dHpqgha>h~Um14EM zy;m4Hk8GlHKci@xG!*+G)2Q5!n`DcVnNdlb+N#e{dg{BAzpwB* zbgE4+m7by{WdJE>Eh$=n!v=GVr6;QQ<#!)y4 zg5r&wLZ}VT=aqPFwYH$BwnCZ8=YU&QqK9 z^kh6er9x}R=Qhsce&S?&eX9+-(w<#wHQSf=uGZ{GJ9n(P8Wy>?XCJtB{Ht^Dp|k&? 
z^UNnTyI1zzYyU;xhkXxfPOo|Dv!1q$r!7~%WpQHhB~UFp4?HRBuXux=KELnW%mqa_6)7o97{Wo<*I5lg{UYK;1E%6P}%W4C{Ser zo|_YC1zv!?HYMOPn;=-#`3M-P#nw6C7GxJxL5OL_B9MM`PKZWi1wd`kqE=s;K%*Yb zI-q!TtJ)Jk|JTFjU674Qe-9X`LTCI&1JL#?CP%`CCa}FKey9 zt;<>~NaIQ4ys;#MQHH!3x|JO`4g=z@UN7m{cYjXFmcO~?)Y z-0)_jiJ3RU2-6g@u1!go_Bj6yOV~&;9Aud9z`L49322$B-ZTWNA_q!gf$nD6c`M-+ zuih(VMZ78FRhHC!gH;}eRoW1vf;I?EpdZ+hz)mVtU>ICBVZ%K43gl2S&RehywyvKg zNU16tASd8SaGfeg4I#gSk}K&)kfmTuk=>=*p4|jD0`4by8VnSbizre|R*k^0)MjCH z@c8i9@W|-%7eujnS;G_mJ;7{MdGFoc>7IaP^XAlI{E)*m=l$`>~r)Z(YA_!t43}7R=EKTEyM3h8e zxM~FLh@>-8JU1!N^(j zwanfFj~!la_oF&ro`Wa6H7HrIWM%3a^ENEmnbwY_(L1)~W8nD8WB0t1vAaKaVQGh% zvoE%-Gw@jVuv}f<%yK)I+m~lPc=hh9neM~u4Afy-b9LTe;JizFK4ajqZqxyQ$L!4; zIL^M*z9irAf5t!&n>8SaRvc;J&a+FkOX1}`OPB6!%`;eEIrABP7Un`w-rLL7=IWYm zzjW)R+rlj&Z^U|_pFSVi-07-;jP1a^y&2nc>l`$#e~&eB?#E6Sca&WY=NWw78@<2x z!{8UROoawLyD*$xF(F-HT7QPhi1MI=XrO@rBRJeS2t1glidK;}1oJ6RsjQbEKw*Ob zgGzWJFirAhO*`hbd!Re53(5=^2QLK^Q~;jR*r`N(6&(YJFlx!KNaFc7ni3 zX@u%1^r2A{XCatI1zWfnKL9MKUO3=?eGInxW$H{NcVr>D(NI|ttTbZY8aJd&V1yb{ z)+8uBqJhRo93`g&C?s%UFulPl#Z{(?(0Ln}rWF;>N=}O}!C@-l0EGqjzA1^E-ecZg za%(6>g{ac?ggt$el>uof#fv{xREJ6ndEc!2(Lhau$lcf9b;|y}^}T{hIq#rn4cO+L zUwbFc|Cc*)l~ld2coVpO^TzXoGqLIJt9dsVG-v>CN}@gG(A89o2v+RlNV&eUk2B>i z^`VZEPyZ7VRV|YWG9-FSdAyjq8zU5rOvWJ^hmI6pfi9GbVNhO=;K`Rl(^3HV88|+| zp*TM-!2_fGJUBj-M1qfEw2aq+=R98bXjF<)B!T1%V%7M;{~=n{Rfrxz974qJ5;RQ* z^=ym_Vo+bTX&g(8#8i8MVbOBITY_a*p{zv|$pQ{Hk?Zgve~Q_UFhi(nxuPLd%~vom zC97r`KRuBBPRf`ypH3_GG|DqEvKLt=7n5r5B z2JH_@{wiv=YCe%4;NUkf`(w{!q_2Jy}Of z#?g{>bY&b}%ey~z^nwnaZS2l8c4r&+Wg7QgKmDk&ZQ=AIN88d=#<6Q<@MFh58uK(% z?^rjQFu-YYp^v3wM|#JR)%NGo*4CV(eM!l79msSY$aWpez|V2)Blcs*siy~mzS!84 z^#wA%!18OWzULNPYu=8v>iXNwx0-XF9XXGGc`Vy|B-48&+j}|#KhNp4`sUl0Z(V+3 zF?rpffj0PZ+OKJAwy7u6)RS!*$TSV)n!29UIcn?+mV6UaA6RM1)E!>1|F*ilIC<0m zbaJnK!Th9&@pf;RGFQKSa;V!!#3HwS$X_}-G7g> zUxDm(X3e+d9qU`xtgk!c>t5dX!LxUt&GtN->3Q~}i)ml?s&90Gd(*Yn&~khB)@-)H zpK0(fcYLt(?#^uYVKB=++LvzduQt2@ot$5z(~q5$NL6+gqk17ZjI^mI)G;ZY+JS;l 
zk)G09%AWHkfEOsN42{Pt2i%v>pz8wjHqkI|PuWTTSLiPgElrsr5C(rGJx)Py1Wi-ui`bMv z<08EGv%POBttDmrL3PS8h5Tq!DrxGVK{+a&RjL?j zf=DnoLVgG{9H*)gs0WDG@7694N;^&Np?yVVHGGPMN|0)S+eZsV`1nzEV^l$eOR0Uu zsEKiNETqI{qN)`eC#6_S6TGI7C<>qrwBeL^5y?+5`)kZl=u#Q0VciZ@)dqrE&~Vuc zNJU{*-FEzZ|9;`z;Mlo~XN8l)7cZO|9XmB73_U+`?9|w)=iyrs7)PB;B`K}DvN`Il zrg7&D;+cZ~1CYx91{p|YCsX6Q-FvHd!IA?x*mA37!CWfTH)%oXZ2g{0{T>KM*Vf%W zaqGn5(Bg%gXO~XAd+yG;mF=1KefJ!h_Mvp!aHe*6!IrD5zy0#9mlw%W|GNWs29|f< zIlNlew_pbkOV6&fwd>c8?lo^+*4v)(wlBT7Jh3uyFLb~9KKSg8FAS}D8*cBswR17K zyl2(hcYWx$-nv44VCDF|Kq-yKTTWAqtHPq#mx6!H{ybJqAc7a{l_wxfkJMi=8e)b%E69)DB$~t}Ky-bJyvCYSIZa@rm!0W8O@yzvS@)oRF z8CY2KdUSCt<8IC408oGIRX19$w>)t`O*_%^ur+^&YKBjGFwBVTSMwR77}YLGb=lXp z0s7GpL2f`UL8h88*h0_-Cx3%=2i5~xG?5@fSGy@hiv?~F2ocGhC1kn66(RPJ6sE)YMvu4f{J>! zx~0;1>PbN(fLaICbWpd_j-c$vW$?+zAbKTeQ)*r%Elki=X|}?7m2ZYRo>jh5Z+h5r zKwBcp0qw?&VTpQi@vUzndDZ$141>>RkB|?c6*NUgz6=?Bo5-?XSQ)lA!|ePG<9x^r z{R`9f5P#2oVKA|V#|&g&*o~~=bLjOV`_Iga51G@S7(EY7XC9i4d}6D;`O>Ow>+9wx f=0O8{ludiKKVfL;$w?Do+0Jx*=jRL-C`JAk@t5r1 literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/__pycache__/awq_marlin.cpython-312.pyc b/model_executor/layers/quantization/__pycache__/awq_marlin.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d718abfff8792187b9aeda4ce93bf43373f4b23 GIT binary patch literal 27763 zcmd6Qd30M>df&rJtR%PrBnXn=PH=%XO4LG0*1l7+CEH`fHU=UdC5Qk)KY*4*gSC=6 zr&KaImYS1@I+>}cChgFXT65ZIN7JT>Cv6j_X<9r}Il#OlcQhwHX?;%H16y=v(tq0D zci(#eNIl8cOwyBQ`r^C${%!ZWSAXhs+Bkf1Pbm zxr8pH(-EwX>l22MAz=&|6Q+=fg&E@JkQs5tctyezva&E!+?KG1>`J^N+?A*fRVQjfHOy~`*Cy&hbquz~>l5yfJK+g=nBNv}NHm5T8ElVx6HTF}gfHYv zG>4iKEuj_`?ufT0{2_m$E!381549&cLLDsJ8ShMVg}NAA8ShT?gnAOap%iU$&X zp*{w?;{Az%&_H4^G?)m6f{AsZb&2(%^@$Ck4T+7RjfqX6O^Knm%5!%B0ilHxDHUwHfrHlccWZ4&q z$0KLse83<%&%AQ#KzQHb1N)9089gK!_Do)uEGMVZvE*bVK8nQVFr_#R$a*RwL=t?O z7vQmn$EH*1WFnlLN(J;%<-w$IF(O3wB`4GTrLVJMZGMSbGk;^qs$-;?<5RXlU6G=W`lq_mesYZoP@agl( z=-x<*mwZYX|1uvtcRqc3EE4BWPftxHg)|?P8eg1zgDQy4@X^(oQLycJY?6-%tQeOJ 
zmlfXL8n8VtKd*E`FV(O<#bzQjv`T-~Fuqg+vQ-QGR6H`q zho_3;?UX!`sj2woaOC0}>NsF5&`zn5g3?JfbcPp_;i+V7GM$p#W9Rv?2{k5F?0=J1 zcq$x^T;>I-Rf$aUYci?{JnB3|CipO)m`Yy`p9{yLm!!t>h>J;KA~i+5nB*^(qEPA- zFC?bZd^kmul9FnbTx#&ySR^IYuL)GrHe>GD5NP+;^CmCE#xJjFHzwSt8IcO7&PS$r zT3|7ybK%rPY-);+BCL{Ddpyp^kDoZqN1|A*4Tpr}^c0&$wxEjJL8?6!8JpmvZyrB> zq8LcUj!g>!Kba0=I@8z;34}SQD-j`_zC6X#8fm4Cl8TIDTTP~tLQ0m@R4Ic(b}>!9+tl$>Qrj1D~oH4)}wVpWzAB+3tJu!i@JF` z`qL2gu#(Kt2K3s8(i_WLfEIeoYG{u7qfJQH5^Vz}w?~`tKO0xHvgI99sCACJqzm~^ zbAfiL3Ml0S!-FyknQ4-L{bTV+D&@x_^CN{{P7=_6YD619wZM_=3UWFd17@(Dqoo{9 z0Ru|563UROe8-IN}c?Bmn%IC%+kn*}~SEliGK zH55OBiB>)OJ*>jt(B0tD#i*+l8SdA>ji~DYG*Fk(&FDEU!-e6>Uq%m~LhTS^fRFma zzKjt*m7n1y=zoC2OW2r3z$Yf^&@fy)UQZI{FBp(FfAtkPXCN2=r5Aj@`!+57)D0Pd_Iy2 z&rAg>B@+q&m9d1wj1>q=b`Xh)@OWe_ofKGNVJmV7+sL6ECTu5X2OP=5c$E`-PD!?x zBJpYdfFL9V$vDAZ#)@VumZ4Rl2Z;j~$(Z8f<8m3)h_Z4^MOwpQwsXSa6b*x4{;vF= zSJtHyQ|qQ;Q-kS<02DoVX?l3u)`?+$T|9Pn-PC1p8k6hZjK>q}us!&An7_o2O``|v z7!ggaQ}z!d_UlSSJ2-V&*n@gz8rJONpgJBW5S#ilI6u9@t<-SV%BvNkrQyECn{R4f zXcC(S=8X3(4S8?pobek~>QCE73T??Zb<45cBOi6|{XkEi4mI$hy((vK675Y3L*L$c zd#7k$|DeW~uL&%6=K8jaecQ8v9R*G|(wpxcc;ES+GuOLa?A^ZHJCg0$Sup6TcRw;1 zYAXwz!C6_baZRna25$~78gso{Mf`cUQENW2wB{YIcTZkDxoCX9>b<5x9mW%6{9mvYv!i6w6u24I0yVC+{wHp^tp4UZovN-2HiQ+IN;YtqB|vBAYc*> zqU~#!J}uFJPB?_06pfo-I0BdU4Y)%g9M)Pva?Au}`2{#vxQCUP-uKtNw=Nslj>+BO z&3jvJt-HA{+t>$h{~B*wzO5_QHY~OcXZ>3dzpXpp-Iwd$BX;k}cJ2M3tzb1e59kU8 zv(+XXMft*dIIAWVetA-9R&H>YbpidTWGgNI_qg|Tl3^^KTGb`ufF>bWJl-WWh6qQB zkUhFo)x;`aRWB_MwwU&^dP{UhYvGi|YAl{}`qOCSC?q2jw)5C|$#^~zAD1j=$K%OJ zdczjsd1_19j5II?PUWq`z+>SkWA0{p*Q}%9lTu!yB8k3S;XZNI->_Y`ee=Mr(VL^$ zs{X8{|MBi3C|f6+EiDld4xr< zCS2Xws-CQ+hc!|%%Sp$|kp{JqHTEHZ8=#vP%8?Pm=*#G+6L*w+vxIoGQ|Zf65zW_5 zWhhHU2#M-n^PxuLIbGCr*R0apSwc|SDJzhYE2cB+(F<@$25>?q@HV(Ookt!6tbP$q5MU)?pOvlrVv_i0P(kzS%5at;vb*=J-lR}_EcmWL;!sIZTtU*sx z#Aqq>Q^r+xR-`-`k0I4yS>!v&nfeVlm`ZnJ&fO!rdlt=0?j_5zd)u5nU*C|c?-uL3 z7v5ZQEdIUa`dxE2%`jhU&bMCltzUBA3*2k^kk9(oFZ<49EsgoM&RpAOv2AnSQIoA7 z%+>{$jE@ZZc3alc@|l@))#e?Zyu;1@n)8l!BzxqnaM=o6g@WtG%BO@d5nD{_G4W4c 
zOe$M9Ysl!M+%%Z@ltYIpi|QtNIr2fp2A`gMv&M`;crjxXPDc%z)6ZQfGP|f<`BWwn zOXexD#RY9j7a70{R<(@MZ^opy80~s$iKoGg3tqI{NNpGDpI&aXtXwdY=26Kki}%7w zR6eu&%z2*ZBOmw2QhqEK5-(!VPWjVG|68;E=(G@@*-k_7bf|5 zP%?nP%>2%NNYG=@?)cN^BWXBlnPZVj%Im)vOP}|bNYQ}*eDWfXs{EIe)Be;HKNcI0 zA$K|BwiC(y{H3XQY%G?JUmozMk|be`QT>RbrtpttP_z$B6dRAEpe^&qCdYZ`0mk@% zNos(qq0|cCzQh!mMD$iNlZGIY7EWVOB(tm|lZ?ol44B#elN{&AWx*^wHhxaBK`AE4 ze#wqr5+0?YlWI}Ln~``d8ZJv=NTklCX!_+XD(_HV5*dV8lvm|F#;=m@0gAMlfwgG( zX{mn`MW=4Uxxy7JoX4AUZxr1dmqwP|&&}B%I^6S}Sx3u9j+YktmMmiXRzUZTemEeG zyi_o7R^Mk9&fyW8f(4_ly7Nb!eYX!TZhC*@y^%Y+?rr*mk?)WE+g;0@M++v-=~}UH z&f0g6T|G8$xbN`gYuz_Su8;isuB@Z&$9DJp_Qm>-nt~4;HI$;DH&l177);<;9h|F9 zCJrW9%VnMt`0>~!!HX8J+S@clY;XS;wztzo)Sl6$iv&g8Q7PZ5cq8(%PmLFDVILLu zoEnQA^Xuv2{v#n*OCf{MqG(sj8P(m@-%;ujZ0RaBM#TiDxpcJ}MnbWcIH|TSbs$}* zrqvQrA5WFfrxzPG}99mZ_9j$ItQ)G&{5L!O^?P?0O zO@ce!rG|kN>aO{ImI7mkzW)8EsT>LC;aEr-7( z=AOMw8WmZg67(O0N|3hySqQC!pQb01(3K}hlgtz^GEPbb19g%u21J!gN3ib&8fT%8 z946NW2qF$&a+CB+x@|?tl7KP~SLuDYD7 zUv%}CfVQqLtR`3Gii2xdx9nM;b*=xTHn=#w6#ZWEJIVXC2OqfHIai11>d4of&)0cZ zOor+jV1|aaoM%Av3@o0>4ek^NcP@K&eTPT0WCCh4V^hpkLc-Hth}dR_Uy=c_K2Q6 z%btC6j(lxvuC_z0?N~UtT-!fq$=A2#>bu1Hu7&t={U%~^SNpMDa#cNIRnPsZK)$x& zq1`jT`6Ig@AgXcHW$jJ#GuhS+OQG!MLoB{_e$%aynl2JVVZJ5tnA1u03@a*nd-nYJSCQ&7}n zNeU<+H2~)dmnY3w_eb{beBX{--$AkOU=a}kW%O6h?f-K}57-okB3}YslFE-{Rmmdo zZvxK)BbSc?Z3#AZzb_*M&C$RX;YuvU1%;?1XYnhG)nFpZVl}qlIm4`>Yy*PW-YrSC z*uwj{cTHpZ^W2#JH4nrOQ^s_`S**ASMD*Ft33(VOoaf;eHV|BOR{U^9C9 z7q=`O7yF-E?%p$J{BujkUw>Z3)g06P^&>qJq=*y$L8WhBo$&{|ckOF512c^vh-p9p zUnO`0ISiAvk(!p!LpUti!U}H(*bycC9F7KnDUg9PCO#n{;yhDpravYZx-#%dAmmjJ~J6y<|~H_HlNA%u)d*S!Y^hQPz6C2uD-EgCCJA4 z`~^Ef4z8_h;q-0m;(pP;zTl*wO3v+_@0d@`_g;U!P(>jw&Q|$FH9;d4Cfj_+BMv`> z8qQq(xn5`TJu(0)5QNHPXkf8!k$->Uy$La}{c}!d8Yre2D5j}d*hEn^#i*KMRPVx> z#g5yrP;_rGx>t#=GT9dD7RDAgEu6pIQsCgt4juTMewCzv3hk^R9#h>zYjw7!N3`~S zZh}uq0jRLC%T$-I_uly0^{?FsUk?`yb4SPLfaFhQ$aE~X!h|unob8g&F$ifS`8jeOL&4xm&7H{SI;qP zo788Fv$is|kOhh%W5#yZpsIVIf+-4A=_0SDwdau2@`SoIb|YC`q=~?>PfKA`vIYR3 
zW{$#+LUSZE%FbBtR;cF^vv#!0Mzzg4zH(0-eWDnATmymlH}#`&db?3v24G*B&+>_yNrUAVZ8G7k8P_#4VH$Ef5~Oonjs z8Rdmf|1UlmNH&)3yVcuP_~+<{utW~lKaNpl4up`6@Gcxk=ti<>ldSSKj>Zy_O(~7R ziZYB~rvfX6A>XC!M&TY6^?h>w4LKAlRlFfLQ!>9nNAW4ioMM)RfhurJ(+S3F%Xp9) zCG?U*f*<3NBonh>6!ua{CNNxJ*!u~t5%_ZuL1nB2Rt}Kdy?;%yHqsd|JAhQ6s!U_X z2H<@voQ=YF2>KVSI2gqWkyQqw0+qzNAS)jJ9eMu)IloWNAHYeGfJNB9?320m$}xc_ zK3(p5`H|@=JiI!Ui?H!kIM7Sfan6PZojuuJXx4XUEp7RRzFb34YzQvCz1*;S&IxA5 z;mX;4qTQFX_dtWNxbY+VAQj!O*ba4PpniJYsjX=(~y_{ zye&Czzv%7Hd51;saNgUqQg5%d%~ceZ6EEewlc?++x&6%P5tLa-+ zWZvDrx@l-gwUyEq+_1~ZS^T2KztFgN=)PqrZ*eOAhQ%%SEgRSR9|rc!neJPlVc18% zJ^7}tIn$q88W=HLMVW~;XOwv!#D*xdmLf&VtU(JUM|Dhe);xpI(%a2It^gV+Y6hcv zXd~%-LPPzabvnV2J836Lz+CfX3^OPtY9P{hO(%T(#8k)^)sdKv96%j#(H>|tNvEk% zE=#E^GxO6PF%9Ab9cZoE$P0BURQg4I8tf?Ap*9Peq2v1Gs^O> zX&k&-{m4}OE!3?cerVPWv+O7JTkT1@L+u9)PRyC2u?bkp@)#)RR6YEf7)z$8p=So6 z)=@^#W4Ndy>%$-=Ttgj_ISPH~7#|cEH7gRbK!+Sa^cyCiWOgr;WTLw=A>2YywMD)^ z8l{si;_ZbGDa}hPCnNAO9sWZKW#p15rOaut`kKhQNDl2tp&gE7EnY{E47jiYm2v4F z1+3<91}7Y3>6k3SOlP_%GvgSV32LKglYG&khW&(LuAhb#D-<%g#!vwZSGxB>hBuLv z(23*$M~O3l6zyw91$Y#k#P0N=TwJDV41VN5+W*BK{AvF|$cq*n4WG*NNxQW0^n# z>|6K#;Cq9)z7er+B-giB?A!an2C?tJhsNCDSH#1wCMAkRa2}tY0P_AQ0 z?11~6h`;*h5NZc!(kI&c7DJ+a_}+$P`$&GnmfVI@;)YY%)31peUdw(xB5sIe$EU>g zQ`yw4xPCV47|iz``S3vQ_=JeR-if&b_Z?lNV0rEOYq|P>SRa@>NIJ{@oBerTTh6y# z^li`kc79No+w+3B=Y`y!*G2sKUVqTu^X>lI{rQf-iq+KQojV5YA+j~yY+~64MBl)d zX7hUIj%sGhqws;-1G)B1V*93C`!=zCn_Lm)fHG`McV3q}n-tF`b7#}y+4LODAhOM` z&-1yKonp({#*ZdmdvYyvO$Lsm$4*ap>2M#$}&NZUu%KApKo)OWp^8s|L{K6|s_WOzbhiG{TG!%2iri}{%) zIaP~sq1x7bIG50CF&DV8r}N^7QZr6wt}2K<8Q0CA#97Gfnx`l@eR*;Ts(dzUEtzv@ zY0WZLkb1K8!`%tZeia#wQs-bSqKv?-4M(@t8QX>WQmhJPih2;pnl`N-0R3ZT=)(c6 zVO026fN(5A=n+RC4A(O(%nb^_a-t-bl-US`pfgD=g(eyb>Zc2 zzjphzrHx|OmU~TN*ZyqhA+i3@oRt#2a{ZNgVPV6!hi?xrZoIu?xqclan9AyR<5%PJ z+ZQe_neUnNzMiH1xuFB%(1G00DRJo3ia}TPf^N|`+L>>R=BsMn zee3F5^V^oIy0VroS)azv9bCev1VP?<8mN6>GR^PA)l=@$No zoR8s1<}>Urn(&tta*CXzKO@VuLkLagh zKCCl!Kdh>I_tMo%@4kKY?ShE{AX;{G7b+NNQS|!-EA!guFu=}02j^}qI2l;Ud3+=( 
z7OI%H2I~GtwG4b!e@tiEwY2$rTfejQd%M50yTFnEgY+ZzTR5xdx&ohBTh}xKeiCo1 z-yOL+Qm7!Gg+n80HqMBZuydy%8)vI}*LT&oVuw#20P6b~{x5z(1$y+4ftOa_v*Qh< zAs;ugdvFHagEQd<-B;I;mu}jTrrfM0o{Za~R_M0v&s450YKO+2?&3L~Eg$cU+DXTa z+@w*jv~o}TFWUZ0rIWV3r8ufZfN9&c!d*Bmpqqm=_`jA;4w&}6t_alwazCL=Ijx7U zR&q7)HMpHui<{NjZIIJ9p(Y*tW7%>cfmE=rX4EZ!en#R7j472 zLp{4c=mzHQ5N0vBjPsTZF#1|$v_J%y5n)nTt+}oTBX_#42a7$043JV1X9h)b43bn% z(3L6FB2%z_2H6noB~>@Fwr&38 zWc;#f&6n~IfwhNy-^JjWfK_-7sb$2-C>9AI!hUjSTMM_zIS7Xt+A3DPP(d{@jQf#N>ihV|0zu4Z0hZ<`P%!IW_CVuAGYdbjUubv zdPRHh;vT5oa8oPWyl=T?f7Y@8fww#79T2^^2u2scTIP(uRwbjSG|H9iq?v@xG7N!J zlKp%v8s#VDd!DV(ZsO`SAB~aXkO?}g4qcd{a!g?tP?XG;v7?i(WA4G0nYHcO$_!Dy z>KeecVs*|TX$Lz)x~)-Otuy;g5xx#*o&5>DU3IKdHh1>&-af;61XKOQHsC-- zbHr^KM`>is+Ti#Z+hG0wxF?)X=##ztD5|Va&Y!VQG_Km~$F*j;!TU4sa5{X`+7piQ zL$k8pRF?M!c{Q}LTAI(P%I%cxVSrd$Bj5UzM;<$6YKKmExpmGBuUNnxcX@5v!tD67mK5kz_` zrhXItGoXO0OvwA66!pKz`QPLa$07Vja#+R{KvHGth%JmZOU`otM&S!e{a57tgq)v} zLlvfINXw)^2Fi4q1RPlqEYjuG=VKC4=OjiOf*`J5H{~pCqNVL&Th{~sNY1}k^zY62 z4~zc8uw!yo;$jIq49o0ihK;ltwa;oR^U5r3{T>T#D< zzL3)7U#YTqY;$(1Nu`E;&zz1X@w*E%BN z&o!b|u-i6A7Er~4eNf0)wo}30Y}fF8%hrdDopgPBo7lT8*SlZDU*rC{%6!A{y`kKW zRospII=S!>oq^?;hXx_f?+S0KPGWQ*4 z^4?Zh@YmbstPsSCO>SC@p`Fy^#%w1J)rPg2@hpS=42@$G+5A-po3*@bupevr!)kY~ zx?QYp&s7J+>cE`&fp^o=!Fzj_y}Ra2D*HBq<+<8Q2LUoiPHYvfQ!D0$E%Gk;69me8 zjP1%r@>R4Gs<)ezueAB^0Mo=@(Wm)k8(q>nVwZmdvd}n)GiX8*o_3`E}THBHqaC3 z&X|(_9K8&KTklih3CWK8igX<@PA|l`P`PaFfSXE)pO9?i%0ZH=BylJqHRs~Vvk_Rx zmx&@aStcR-eaWLd9Y!_NDbM)y+A=w2V$U!f(d# zT9pTvFJrJFp-Tr-=P#!iObxGBBd{rODKvs6;8yBbIS)a6Jd>jT5PK0aQ#B-AkQMnW znUmqE7ngkVH&t;Ir{Z^%6bQHBuCQ*lK24>tPmR-(uxH&Fw-8GgrQ(b`O*lZ#he@YT z*pR80Z|Yvs;pw(ADGR2XT*-hb<9?b!zbdcoFPiwPA+w&zzOowp&<(h?WS6LUVn5w` z7Vu0UQfNqGNYM*5x@QtMi)THVdSN%XtU9?DLwcc^aCB2Fn zpY_f*&H6G1t&?YVg`#=31>=&$fr4MoRqjSB$lhbNwREa`p|d<*hBHl>mbV+;ZoF$# zEe~FWd_U`-ZJTYM?U?P%bX@3pJeRr(YEK@0$4fi7fw8$PhGpS9c8tSKA!HG=eOf; zXohR=C>p9~nPK38@0RVtQyTqI_R-rv^In`SpC_#md)9SCBlh#Jq+aZbx;u8c$Dk(u zXf>l8eP#3ebF8XN-`#o@H_Gd(tlYmlqtJf0v~J4h>F4PA+Wi!8kIwWRICXsQnp^So 
z(BRk0na{8LFZ1aj-du~tr)SzvlGgon?D&iR64ymn>xqyR7Ro4Ol19rulHBOasrXNHS-kcwU38-=EY zY09Jg7%p?BBnR=T>`jPxoZhq52iHrw^eR5wI&zMm1g|LUn&~Rj`)hWx@JoFII@W4tb2W2h54{BIm)Tsu%;w^l!_U~<$KNl&OpTN9pxzaM zB%@qA2?iXJQ#~O_<4_;M`t-fosfKRk-oO{LqSsWfrn zDPc^7CvY|`8D%L1!4!)`>`J(#9~})?$&8yAezPJhS}rQOFZnd5;$m1y z6C`&!Nkub@cDcFGuuVtRiim9N~{3Xa} z!Uxo_m&o~3a*8WP@;o7vg#Eqik`f*jAKuxW7t>QC`EcQif@5gH)o63?P|CfX!AkRTQP^aFOkv zH6b)KPGwr;GNn4jy9y7%+A*yT@P=~F#&XXlsdhYuSIWa{@+wn8%RkkR5iTipz7&$R zK7WZ!Gc}Ln_+KW3{x2bAT;V>RyEGQ^ifqu)^a4+V+t^Q~Qr^*7)C9PK=$|L!JW zevpa#u|#(AEv(6;ZXeMM`hjXZsXL&@S|kbrAJV}cI;6jDf7sHwXu6sFs3o;DnA>++?h%7%w$$biDyzs>TR7nM(-`v+<>=6mrS|!heZ5&4`q-1!m|6-xkEp+_n_OjOnBvt2v(Lwhv}yVY~EXQAz8AM7kExoS_&wLx@k$TzmZUMlYoO}<-O zZ*Em;!DSD69>!Q*14GN^p2d367o0l*Gfk=$SIL8Nt@PGRk*ciw+ z`7tNChF-CuH{aBnYZ|~DKB{(^UdwI!Z*YD)=n6w-a2{nWWKdSJw4bVw(iKcuA`FscTll4C>FC2 z$hGeh+jr&NZ8>+h=+Aq-PTW!BMJcuCK>u-w}VNA#Q@rxBaP z)=kT;o3jqzm-n(`)0cO&-p)GeTia2rU6=boOUJE=n-esXcrc}B5!1Q(vFU8Bxn@`9 zk3BD>Jup4PPaT`4ZVb;C9tW0%k82Gq?iE{t*X$3v2Xfte#qPa%Z&%LSCwlu9w=YfK zJG<=NllKqgJGSN8f@v2{;iGe=O z-&JU4U<A&)=;{xk{OTmf_oQ0&N=x#4KqQsu%+sCyTHN8Np{*Mb<(>-LHS7_^%d}-{LV^{310LMmVh@zDuL8^@+n&#%LUnt z{?QbAdyW3p6oHgLGHY&U@$w^(0_lgP{;yI`P`>;xZ&ccV(&n#}O=)QHA6OCS;;Gd4 zmGZK_GOk~!ruLIwU#O-0ZBNM$$qD~@iTp4gou9}r-gzk7(|XEIzm$JOC5V$s_6(l1 z=_wVh%5fI|IE?broj|H`_Wq+Z_*JvxxGJ6!NcAsmf2KS^<)l$Az(+MVlSAhL@-u&a zg4)PwC#Qp)PI9`)=_ZGi^6dJuwC&j{myZnPx=CyQ)bV-~Qr%hhuaM9`eu97C~ghK$r7sn@JUL3j?KjaNp4pD1r+)1n-&(#tX}91R6Oq$ zW{p)>^EzsCk5 tUNOI|N10i7?+VADmHp=Z23>d7H?YDHurg{sX4Z9NYrDSS2w-^j{{aJ!>Ouej literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/__pycache__/awq_triton.cpython-312.pyc b/model_executor/layers/quantization/__pycache__/awq_triton.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a4ef815eb964d4e0684e6018eaf4b1f67bf05a20 GIT binary patch literal 13303 zcmcgydu$uYdEe#xEs+vMy=;koNTRKW<(I5Gon_gw9=2>*=eOf4OKT}v67^bAwxwGI zbaH`7Us_4mq+VZ(hBi_heKv6PA;2-008#EQH!V<(>Gp`l0TtQ;f&Wut2Waz8`^_wu 
zTGEv5yF3=saK4#uzWL^xZ|0l%?f$dbY@p!rsoo!98Yt?&Fd;of>C7{Wf}(Cw48_ns z>H_{%_~>!P1;x1Xf|90&DMophzM%3dM~N({x%kVdyoym%b{VD^b^h`p6{C4ibwAzU zf_5FL+Wf5<-8$v^b;=F-%ZGG~QKp4XPw6S9oH2Q+QCps|nAa({tW#d#H86$nZ_Qsn zWMXXZsV^9Duh4G=QePsArVXm`}z+Zb)W3LaM*n=i#k832L|cUKt>7w&r(|K zZvk4W7zO{!eHi+I;(nTAUjGVSrHt3W893dXnv%;`@J#Xanoc6Wa zX|vjyI5lU4HA8(*!B9hrGLVI(a3)-aOVeez4>M=vG-J@~yoG~yBv-&$vRiWnIW55o z^M&7VR90zJjOz8qCh}#C(8j1ZH5rpNTT@bI%h48(vxu{%$63tTIJGE~jMmN-XUpL1 zIqmUiOJwtny(Z`3D9()Mj4Nc+_pwL8Gw+bK(TQd9lyh(nX;gm}-n>4?Xe1pN;%nJs zmyIBQ>x}1R+YsAg)F>aVm}gX(6;#TViapMk$$H7NZ*yhYI&nI2RWPa{g`|6yj65-c zE5ohCS(mqhIVhWh9f8r!;aPS_KFulTU`@#M%0GIg)IV9T<&tN=svVcH8fjMn>$N<` zet5lBWRIKhR&wQ>M$~Q$ui6b9v?jTY+=gr!+{T>tc!X6l-SYXdf~&;+a8)utu9mgQ z=f~AtwPc6+KL>9xaBwBk9!yiSluLh{#bNaxEvP)# zM*L5~9@JCp$iJ3^JjL}-@fv@MRdpFzyoU>hDRy$0^^Chm7(wG@Q5$u+@}&6A-enNqz-UlUAeez6DLh-d-7P|A9*cv-*K?YTPbo@5y>>kfD){FAJp z@n0Sp3it&JHi~;qmuwm&%j?sWq=i@~>0lt7q6TVWYN3EwUnV`L;F@60;*dzGgLJLb zgHdD_yU>U;>#u{EseaAp_vf1e<5NvjBU23l4?FA)G+di$Y27#0;%)McTxyz{4P2g_ zXu9I_jW>-?GG3qCd(AsI1Af)y^UQ)Bo2F+x6M>OAPhbT2uua9oa^uvjV8=nN zV_tT`>thdtdN^GByI~OospsXCxnSW~_!vL^#N-Sf{>o&LQL7rtTEd%9;g%n>AI;vG zjag%T$=VK7*b(ea724OUv6S#-cbg+jqy?32N0#l;k+|w}Ph5){4xi=E0<88^t z9^~jrsESjLN>UZ|AVIs_?j@pmu=N7*%bmY+E_b6{eMr~0s-)CKt2(NrCaOdB9l_%%`_`x~)_l)|?7M*~a^7`D zPDN@_(YEMc#5=!m#=B8NCo1X;_C427g|>yO;j4@9M$RHfebgH}nJj1x_Iy=fU6>Eg z^XDShk_9_nO+PnK2Gc@WxNNa`(er*4-;DH?Jae~$A4a;G2ewbj?v=%wW1jm}pIhUd zaR+jBAYI3QXw27#L!R%BuBs{ZhE)?~sfp6a>{|G4?U4$7{0Y8 zS{Y+L?}!Z}*M4N(A3Op^hv9XHyBB*SCRAJ>9gS~Jn)a_b@z#hvX{voaxX)r(=fTA0LrGI7@jOe>;!xNfQ6kf3LW8^cP|3FFC@R_a^PRtF{b}p+z$5x+?Wp8P zus;O`BDDjkr12-V&&nQ_#hc^4M+0d8@nmy9D(MHF;!1vcsV~z0X>*K;wV=&!qT)B> zlaH9kr%pc_M*Gj8;xoZxsp8F%%du^!rVSOhNu>3T4;AkO%2sukj_i(Tk*zNJR=oK0 zX>f(@9mv)ZJPKB^6fT?%pIy8VX+ZX-SVep=X+9|T1z)Uhxf$*5Ol&=zGuAGP0Z|FU8as@Rh(*cZs?7Ijq6o;=ald&+%u(2N_V9~`F14ShGnwy$+VH_J{hIK1aPj52es zu*f*`ob%!IWyqXw3pg`ZfbRxy|HU~tOU+w3#b};0)o7lx^1MyfSH63So-+(7INJkN zrpF>#i#(?=hK3VcdVGuqpSZG5f3Ix4cK;*TKn5`Yd*wT~G1`pXis9x=dYre*ddYLD 
z%e_$p=3s!kA81YV)yi&Ua!Hn)DFc$TEh*`4X-qeQ$CA zG9c6Dm2%R&S<>Jx%W1?BjbYy?`y{6!_{$h(n#n3WW17X;dY2;JncVtjsVdF zPnYSKch$izo8Ftqyt*>p;mV!|BAs9$vblg>=C^S!@IKNPnZfPc_Ux7~pd@$(>p3O2 zjr7AT{Z| zh|o?zYSOgi_gH6Aw`o!F{SzvREBYg8zxx};!9CwFs_f+yup5x*FCy{SJHQY3WP1}C zFRs~}#}`|2xy6W_%PU4N;tl&`-c*@qj#{|g+`b1UX=f(rk6A>2SiC>6_(+Yg>n@Th=|2&Ku`@b zBUji8$g)@~whbqCoRs1O`=_8L@DYpm6xM+g>|=tGnGCQSAm?fo)EgZPK7 z#;jGCl|ClW#|8Q*>%>LfIO)MjFHVj?A}9g)5n-P>L60FK3`%ar;vrI5=YQe$gb z&sz2pKy7&FenCSl;TPyJ37yJhJOb^J3NH!tCBY~u>z|W}os(3A!Zg4&ClCaP0Bn9T zQ)^uVo+geH=0DBu7eqKG&3_0Rh8`60Gxy-Ql$j7TWWGS@K{t`n3Y@Qr_q@o)cH({t zFjh0{9Uqran(I*NgMU8<+X$dE)s)e^&=79m2c8(Jf`=rO#+fu#1`j{iQP!ec+9fSd z^KHq(x?t~$sbH}cfRy0j6y#jt{@~%2(hWZ{+%|k!+JH(M5-R)iLaNAqt8S@|?~B$a zDq0flC!W{_;QIJS03Zx62Ozjd)2RIfY8ptKJe6oXoiv}xfguvrZAtT6 zv1HS6WIdi{j#zKQ?RL{QZSg8K)`!Cit&2xKoQ`baucG2wR9G8z#X3KC#=6nAHdNRK zNSf8Q=vgY^Tkj4;RRAAVG@!zUm?6IV3v0Xu?Kp%A55bh0ZHwoTd1J6EmC3z=iHQ|zNAtMH!9K3Vs4FB%O z%~2-Wf~t0*;$5-P<>D`b(}Cz!R0B3> z0gKo-M#_+VTgdQLp^dB@L(~yhJ}GPqsd0&U$;^8rr;~--Uym+WDu}3~rkF2Ldnj4h z8B(tl+j-;Njz}QVjY{h=HZ>RD@&z5QMNJ1$@xia9Ye{va6_wP7j9!frl?%97Yy!6X-8;HSH9&EpaER zeG{46U>{Ox2)=b@aezPb@2V|q7v7(>=~p-`jtXKZFr*Q6VU8^&7{M~!Bx*D?%^PZ9HKj?P<4KJu8?ysYgyEs0zLXldcq zBs~U^hgHLImE`=2Bh+{3m$igf*96EJgBKjEZ5>Xc9US9;CoNHtcN9dIEQxkF$gde8 zlVj2n8Sz z)gjR51p2%muczu@P&`BF+I*hzON?j#Ty4&quk<^yHdwG8pMQU&#HfXV`Wr#U{{*S8 zbe08U*tiI?tZ)f^a=8b+DSnThw;J7g4#Pi6_{mj;DKSAlh<-C6Z;*U&ulX6cQN@M zPR`-v3{LQ+0PisP9kHMqW=9zIeO&&1Nc<*9)()8m4v417eHADgg#7_9KgF$Jho=5F zNUd5Zec^&RY+m#rT`6qLU=vf=v^Wqh0K!^&Ys1n8ei~Vwkux7%ymK+;yyHgJ-N7R( zMpJ0|<{sEFtsk6DsLDg_DWi3vE!@V_;dVHdym5Q-v9Tdm^I84F`uOz2rsaXhJG&C9 zu7t5GWh`8HGyLY_ba+3+nr*k;kB#*)`)3soE8+tWH!e3nqJO#fadU4%)tfN(ri^yp zdb2&AmmFFb1g+?`PMC5(OKJhbqw@V8RBg3LJNd3c?st4tM?EPOls?Ubc> z!58*D*Q+fWIBLEy_B9E?j^`?+NfYdZgSNVy--1+CAQY^V(wS2_8_7U4)TiuD`1@Uv zL9JOWrA)TqF)@0mM<%9b0)l2*d5mS1Wpn8|3EaTu*NPaFo_mCfBx3 z;7ZF~(=@CKs~8$1S{UH~tKiY9p-ef}A_X#_RI;$9=X7!^C&=MikRe(&WD1!>sv#u_ z3x=16JkZ1xHV-L5bGc7}OW_BCPrl%#k+7C|qb%qcGC>Ci8wHk&tq!4G!)e2+2ip7T 
z00qIvnD;Hy+S@rJXN0|9cbs51B2WWUO|duz7z@ ztimSvdq{o{0jZcWS#f;0-Wlu&PQS2|zkXEWrz6cD&LmV@62>hlqix}>@LRkU87uhd z$HwZ&*^l12^G>W8ZQTRNf#;_skGDdQX-^nm8OI(ue|hHd&Z7y{(S-5n*9dy0{IO^U zs%${U#+dW5aaY{@$ohESk%a0 z9f`8W*wAObhrUO9{VAqP#nvQF51i~wmBrfGzUMw&Z=Mt>=RtCHPnUQvIq`q-_`v z1Nyf`gnp)^6efxH01)ZMS7>@G_@&6ZYA0lgL^=prDv@P`EXUttTnu0QZ3QONa!Jhr z6;)P|DlPwwv5~d`tb@5$m5^Tnx9YWJ~LzL8ZAo2f~`A2Fr2Y)b0y>fMIxQ3ArFFq!rf>w1;C6*=`m%XiXE#)oHOBW b%s6MkxdNO6+m)|DjmS6ZqN6_pR>?6Y>vyiGJy@v-or9oFgVNC7V?7 znX;vNs+y|HRawG!*;eZ5YPz1OX85~g_f&geT(NuV*=klIG|h_ep0;!Kd^OK$GInph zP%TKrNzr08pCSi{*>jedS(>^mi!WFE%pC2feRspFnWqCa#q2$+R9A7@0?-x{w1cx! zx$j@NaI%~V(rliO&Bg7d-=$IZQ`l@6Ne5?9snw5wk!oH$APx= zTvK;^tEKyv>m24QJQyy1om$7IeebZ|Xi#Pd$wV6~Xh=c@QBsvmQcamsRi=uWqUjTb zB)RCH=(~NamNMnD%H?pPn!U5trUG9Y?7~$s(+RHmJCSGwo*6iTUf`AKwi8n^d*GO| zKvS6QDd&R0P^(mE^a_mRc`OFsl1hJuS|Se?eC{mr6dbhQDLeOBAOq+g7Kj>QP_S zJyrL8W=%DH>Zxu`^?^_G}uj4V%_MT59fR%w=UIP&{ha7#?=eD?Y2~?6?;U z(iyNnEgAy8o~Hm6%J6x0+^;vr8&+eR55&~pHq+ek$j*~H=(uf7jW=e&WS#L-wp|~u zyC$_YIzx@7@3L_&jW-_Ziks{B6lhd4T&HFouQXRAal zmWB&$&$iZdQ4OED2Z$n#5kWM!G%QTg%pNc z2br~T%yr#ibZra!{Xxghy-417ZG|hAkO=GCDAPu+Nm}bZ;$Qz+{A834ZU_B5;otNxiIt2 z8yDUXg!_O0%x|ArM8b93fTs(LP1sOnI#VQ^u_G&N4b1<-xY&ho*iEf2gM zTCdw=>T&B7_&1@Hiu!ekkl_lr4^y*hT|WsIMfIBC_RxjYnvQ``g>##RxFy`DUNz&lK`B`b)g=j1(e*(x4=)pcL5p4 z-&;-l0Y@kK6ryj#no=!e78BIND^T~*5?_%u`x7hfg53;&cJ5!M8hAbN;O%4PDmnmM zMnYLL=3s)?TC>Dw2f|s!90KsTE=O|2Tw|_={(5s5uz$+D3;OrGMP{Vxy>Kn>j-e&D z=?RW~T4PUiLQC+mF{XzQf%wR)bhsI`TkdA|K9?snj=31Hx%hrB7Ow<Hss~Mu!0^$Z5h8YMKaB zXM{I@7+AN~hQTPY)k?&D0th^e>YtqtKy^2^4XF#ptU|i}5^0h4^ANA*cv!ZZtg~V#mY_ z`}V3QDRZdp2^2wtK`5rcV0@I&g6>LWgi?t$VucCoyjzhj<&4M(he<9Dm%uAT$f}fd9tUvyb{b)w3|e z1_SJ3O*{z*Z>g$>@Q7oghHE(jMWL|)eyZy}LL~PTWz52Sf?puW9+-;I#;RH19lCvD z+qPIDE{1S!syFR6ZOqIdkA*Y@`JQH$4+$%N#%oS-^dk~BI%CiV0x{i=c^MWnnstCt z4rD#?=p;w#fT;F>MuNh04O0LItf+Q8$fL00y%S2U3pYi@WO$UC5VSSk<$P9JV8NH8 zBxQ&U3<>j7PSRPg!}wzGjplKV%t`ymD@8(H2Z%gJPJuK1L3R!xGGY`XC6S{HL~6OC>+%d4fjqC1?=|$kbpgl4aPtiKr7+OWk;absoa4M)oCtkZTvF*0?`}D 
zrveRR(-}Fp_S4dRe=KdkR@#2$$@lktRNA{x*!$V1{deCaQf}?_(#ZEF+cNY%_pk`} z?bU0@`QAHHg9hAI9x;;B=50mB zy-;=96<>caH$kyWJr)%eJDEhuU&j(5x>!?dLIR4BD?(7THIl#pfr2{ea9}{$5q${T4kN+=hZUh0kz2AN)Q^^G=n!Hw56X3d?K=w%nGnpo~nZt-}9Cng-N%=@O{i z)zFzfmb@k&f(xJ-_q^yyp~XT$A;NAm)mBkuFp_Yd; z#5S|hHUa-tVVq%|q^z=0>~F+o44eC~!PJviBv}@F81MqchOCQWB{)d)oQXY#{ay}; zgeM4OKxb?(4i`ivDH?~BYy!sF*RaW8^CUJ;W5eSKMm%2lWSg^@rAHS=w%sJ~ zTohGM=(V%a+{Lkr&0oEC>9uQL-Q9&|)pabW_M}JWx8Ef2;2f}7)mGrOJxDTxoCJ8z zkIj4UmY0|y*iK~gSnJC#naO-cc; zSN!)U-kZf5?%EHw^T)!`7jNQgxEOlvKCZJuM|HU^<9p=+{=AV}d)~a38@-u^L0kyx zhhBS~lpdO&XcKr`oVYS}>FGu6wllry;`I&pFUs(2k0|NlMdLqERTejtAq*JOg<@;D3fLSy&j_s;H%JMvqeww>$c8T1EUft&()2+}s21pO#be#a^Z9 zTeioq4l|KG+G(I*$stgtT-WC1O#zcsoW}xzfSTDyfdzQ<$CwRcV3x$gToBNUBrRr% z)c;qq@vr2*e~`kT$9gI1gRk@k%UN*dRek4OEzhddXln5+H1q-gF(!Y0)+=X04Wg* zdbN(Sq4s=dF_)a^tT(3jE_HO-TgOdOwYpTUPWiDLr&37)qBejhN-a0*tBR}g12Vbg z&88~J_w~%+K|%65l}w3^etg~i_4L1&w7zTp3cruQIMms6%Q}r;2M5+K`sy)p1=yAJQibApLt2 zWJ;JrW>VJ1EeUJLny`gz346$%aD*JBUKe*JTp<_9>*MZ(C*(;qg_;uHkT>BA`AEGX z?oTv_nrTWwaau0$F)d;nYEe+9Db{#{VojXsW2IEvN@~qeYvBx)+Fc8Zp!Ev`2`U8T zU~)mwU7XFtQpresEJzF5d5%v*39`mmGL{L)Q`6HN53q&{u_PDahazcCu!;FmE^|4> zVhyxAUYv_0GqG$02YWu1oQh2g&e}Q=IvrFAzM-=dLl*|e!xvweI6HbaG#nlo9-MgT z((t(8mz&04df|nOmnMc!g~whR4MXj?;C{4i{MAvRY4q&a*-;UQjQaBMrSY>DaW$3~ zB77ts=i(Pe&u|eIrnV!t^A|3RVoQR{aJ=Au{&FOlBTNY5%m23ExxSvCA=vRaB_wVX-H=_)xrXO`*=PzR7x zmL+P6W=%ItAq#8d%u@>1d_xtoayBU0p=7zC`dC6CcH`EopnqkEH z9!8d$Qn8+zkppB_ySRG5sHvu{h4sRkp21brKQkxm%UEIoGr)mi{RDpv+YIlkZHc-@ zhuX)fV2fZLie-@SM5511zf3e9NvD~q6wgTQeFg&q&n0e(6dzuK7uNi%segV-xSXn#|lP?$R=Z%v|yMnDOk-FpjKYNY8RCm`NUB-O*!h1+bBw=A)3n3IV!A|$IU6AR5^b+ zC6p?2=2TEJRUp++GFM6(C{8Zvw25^J96lJnJH;2u=YaH{f_Wvu1j zMPtqY@5?3k0NnP<`RXq~c;r&fn4_~m15npgsl%PCyaO2HpxjdZ1qi=fTA(_pOl5C( zQ+(^9E~i`6*X)Ecr-yf-s--PfU4830{o4wbTGYLzyGmcBt|?!ouF^s47?12Ydl1gx zBph}|x;vRznz_oSlGA<63vmwa8zz=aXW$-UB*2p=vM)~X3$f%hlSwgi0Fb%N$&Jh; zUMxwrmzkW)FwxXpoMn=!3=>PtrufWN;1mRUDix2XuHq0J&!_kSCOh`H=>Q(s!}QL6 zz-<_c0ff@r9k$ 
zRwlx8jMRr?`?Bqi9Z+OgIF?x&3-}il1u7~DJ7iCh{~Ut9tszLxQ#9P9IqG$wk3~91 z&(uN^@am~g)9<))^qXaAJ^U6n*;zI-17aWtiW+ z+@DFz_QOlsmx=JxTxQ?3xueIPzH*f7kH;qaXBU90Ci~~(@kDE2J+RCbCz!@0oZC1d{a0z6vM@ zIHmC-`ft>4uT%N<^B*PF?04S$+l9Yg_?vJ4#et6!|09h&^lkSpdPTd74mJx~X-Xc2 zTR}Y=;X#5khQkuIhQmCvFTr@05ZEwMlVEr`5})J5yc%{6#3Y@J6pyp>c)0jxOahp+ zV1jIpM?T3TixbS_6BjQHo*oXLnt1hvVL=I3n4p?T#gc+K4X-CM&4t;_!Yn5^0q*%T z;d*eV3P5B$s1KOOza=mT5V zmQwMQ5PH_mb)zYst&NkGbRtz>l#M=*C80sqG;C zwnBH4t!Qj67@O~RoG2JiJa7a`j$?Or7YEK22F~S=oi9`L1-kP==X3czgODGh+saCX zy=_aWa$3ri%4{i{V2F2*-#mW5^-#fhhzxLeed4FD{p7XvQPQLJLF=J>%i;Blup0X? zt_Hg*p{s%NwG_R33*Nm;<_Aswm4hD~eeY-iDSo5<{6_Qnjiw7rrp=bUh^xQU(N*j? zUg$WEC1$g^YwdWU`3M%gB{we80IO-<^+Er8{Y$Dpw*Geeb(*p@e@SWey``@1ylFR} z2Cvq>ue5JC-#b#K6sPIn=fUUl-Gg`SWv$YF3Q$u6Y6i;HTQ;iAz5lbUhR(lB<3;rM z2Cc&k^)p64qE-GZ=o?Wef38qNzK(05JV7L???VQZOSr{mAh+{@dkAEa8Du^?9%uo% zq>;>pcNerYkh`D-RtIP+=N{!yI1!LD=G+05l4CUP0)B+t;jPb}aKB7LN1!Xjz zu3G_I4&fxIl{;4;*DpZNG_qk>fz`{Of?OSi;GUD|@Ls!3Wgr!wM$}N#irDRVrWNB9 zkJ4IDDX0kx6;zia@hL$!ITcSuG6#wT+gV2*eUF1%!~xxq zT&F&_xo;bm4et+sF!tV9-rAGb^*p*SbnsWNcx`Qfy0tVw>!sKO01SMjJbKeO(%=UTyf8LJOHJbrdg$IDTBB8I5-u+sVi{EcpHzWse?l{ z+!#(Bb{2A;|HvWmC9}zrdK}e(pBiU+^ zgtESRN~mf2M=(M9N07iN@c4?Ju7am)O|$M<*KK%?Eg4I0Z_(XZaCffGubbB1y2Ed{ zhn5T_r>E%LU2yJReR*wW?GNrewc#9G(v|G4qP@LfZ(lvKHoSIpeR#t@u%sz9`HM~6 zg{JN`&w5|}z~DyH(2}VX*i{VdF9i0ld+r49G=IwF1N%1u6M3EQGpnOyYFhEHzOpvI zp34uv3^JI_05XBzAktv%4k14$G}tN9U_%9jrDYYERiJs|8l48aZ)0VJRkErIwJO1& zLCqn&3E&m1TIvm!XDE@e7FA&Qcqymi$5~B7+M#9;#~shdQW@y&39S`8)ESAo!01yk znpB6(RSXn>*Aqt^2c-){4#6J1%tf!vf;s@)BE=^n8FH}-b`U6lbWx#}>PH0C^lawM z?2!r3k_nGO4Li;8u}B;&UM%RY%(oZO(guU{F)yW2D~5_I91mvQzMxuA0#OSZX_m8q zwV)C0+GtmsN(D89#0AskDbXkvj!sPr22jj-vCL0mA17c295cp-t1W^uk)BTDs}Tub zB>O-Llq){Qa96Skj6$X|L21wE^!D67d5a=zLb^$@Q7>GhiXMj z6DS8Qr|C_9XVJg6;NP3?IeAyT;U59ksrQ4tYHnKDoj0}KGrhRFcU@CxKLWWsdq33| z23{;HDShCfjxx0r+76c-=ZcOO3yv2{Hs@{AvguDPtIEGLeQ5e~3(V%UEIkkUhuQh= z`J3lgl=n>mIDfaFS$^hEPv%W+n^vcIYpIFOD`;kUZYp+7a7s(9^%vA#VR&_r!^1-c 
zB%TU`%+Z+&qnC*=9jI%^zT}bH`6eK}3Yp5Sz`Kl@3d!au`5+=8iV~7qFvNgpz(xaB zSssN2-iis)XH$?1Dv7kpC$KgNNg8p69T&Ho@XT7_H``jj#mG}@+kc8f^g(i+f_G~2 z+~VF3-8XhWfZW{s$@`{#>${5w&)hqB=5uG~YG%XPn>Y2Aq*pAW+=1dUN*Up>Uimd*%jqSEtYSp6a<C_agi|Dwf@YK~hR5yYkt_tWq;5yKcsv}23+p+kC6eoNWCM7#7LtU)hW`LW z^xGTMLxobU1M+uzZ|_^)cl~_H<-b1qnZIpy&sxid|7c#-1Y(7w<@&j@YLHg%-E@1) zYRo{b+q<%R`CM5`iaN^KwBlOsDeFnmK-t~5GRwxYkrYi7Xr`x@XUb+$v`_)2Y$aJ6 zWw89lPO>8kt=h2Cu|>g8*+FUSplPaod0+3B6#Q-})PnemC`|FVz5xLpaK?h9(Jlz1pzqT8*w9TD|;XYng)L`o%Bt zN16d6gz!sAvO5TG@N$ECIZ!_tP<)SwKZZM{At=5 zSA%*r)FX}UcnEyO4qa_zH8FL_4E{I^_?>j>^X4ZLw4mGi^8&pX2-F=g_ zxnUK^adtec=P~s*;@Pw|ZUwJk(+zdV58k|H@X|H2kAu^&e!xDkU<rLF#CqVT%3uaOJDG$<}%n+sUbjMf35Zqj1mlX7fb*d zNu)(10>2AJAFJ~);+>4D@0Jb((A(5os<+fRElVTSgMS7UA;|4~8VR_vC<~r1OQs^s zf)!0u?;(85hz%r$MSV_xUBRlrM^u#@l(OWoXvpbV)s>_0{{5hU>9a5b2f>mJZ6XVAy#K1#S8M~OH zsEkw2HR5qFdIJtOgPXDAk>5589@T~zRs3$%j;#I@^T(3B##&f4Yk5pxbSCV09)nrM zr&l`NS@- zHm749?<*F~P;VjQSu$-h#t`@5WYsY`XZ{QNPv}KU(vs<@X@j@Ky53jFt&e$QyX79$ zU)D`#iv8YuZ$htZ!@X>PiO6D7z^93VnU*3<@Y=#lYHBVSMb-_lz`}{LoMyn~ z%ZT9~kY`X52Z#v2fQG(I1Rz0quFP7w0Y#5Mx|c~~fE4`VFvQtb0l`QYzy!7Ix-l43 z?PJ%N1eh;lv+-Dz;My4L`fC^0QN3L1uOsmq#(RcIO%dxQfn+MTQ(px!9H#NzHNac| zodpLF+pE1992wpx_LZM1D&pjs6gc$>U_A=mb4l1BHdx=g=4O0ccL|Bc{cA7Ns$?SL zhTF|XG7$zHsxmsH3&yymnQrhibI}YKOCaK45&{m;soEzggn(&fp_fP`P>t+^GYme4 zPBJiSg;pvQ+THc0NG@H$-paT*j7U37cbAxf<|-LMEg`3_I%FcTG`9nns%LtDIXeZx z3pGN<8)z89V2L;C*7pVjq#3KBk~*viW=Kasyfp+Mjl{<0!8R}%hxm*ILT1DlMt1#? 
zQQ%0vL0*UC3V5SR0fJfbse$i><@oHO7v%WEnr%zW0Wt2}95D`L8WuIiw*&tXV@~{M zQA`twuO9F2Y)25znmAG&_EpT8?yMr%Hv;N*27Gkyz^H;5d={|EaApoU09ubDX|R$- z7Jw-E*APM#ons^5P13O3d<^U)Z;@_1_vT!T=hES6KEi&1=9@467etK&wSo%iQP7L0 zhK(fzgLDK)9v%4{8^*j@%1ds2`=n(3%YY+=$e8?}z_1|kqM4XX3;7kyV?F;*#76w# z-@^noGBD{82vR|set=EyV?suwt71K_k+qpe7W_kKlM7B5I3vjeZ;jr5vz~ z%q0Y$wB2(&SFv0I&jWrZn1{y(hb|0=N0NHyVcUnKb_aPP7V18ga$bSNn(-HD{FEe-aKrTxkE5We;P~Py=%`FyvLW!o5rS*I6>{E}8a}e0#v3 zUG(=B{JkZ+r{ruZ+1;XFqiF9d*!#*>O5e3j>GXzeFXilfVE5c^UTy}XPSMs;uyqt| z#|pM%8@A)%NCV%u=us(pju$+~ORXK-dbI;1Gn%|5@gGjrQ}Fi`{YMM_qp)a~x9IFF zI6I5ZgZG>V!AcCX*AIMW=$`G_O}B5;-?iOlcN&(oi7 z%0KraWN(gq?%oHx;jxsO18ej{CFb^>B)>}|WiVbtj7p$j3>1xB1tY|L+%q0|;Ocq+ z)4sO+TFJ$fT>I8t#REfy14G3FV+Hu<8Y?xokrynk`q_g2*;0Ggwo&78lr2HI%d_kcKW{Va7hQ0 zK+4{=BlmR&tL3J(J-^WHC%Sr_9<2jL1AH){_I3C7-1dt81&YoL`i_V^cvj=v%Wa}>3I!iWx$p*(U@X)MtS%9RRdw|j*N9oKKAPQ$o$ouBtPg;I75+dqptd#vB@9%6k5S;`ltn#pxk=hJ zv{xR{^}lFfm5WCBLw&u0tz!o1XU7BjYGYW#4nQa_qAuTP+(vc`QK#-=)qs(D5x!T* zX^}1#t&bU_fpvr5i)#V9iqtdNUe$9+MD_29OFccp8PEf@)1V1kbGn>?)swTKU=2yE zYrMh+He`rbHs#F3A#VXYj9GkNi#8YmBV%?vIa{vc>IGvTo?`NVi_|1W7sEFtrcokH zJWm2KM#!m|5L9#EI3-RRejU1qrbRW>9Xbe+kMx*8pBJ=)QrM*STT(_3m$#HPMxxQV zgk+q13Wg9&7+f62n;IoR11X6T5V`pEqypta{vYCuC~gEz#A(a_3+!)^zt16#rE&4_ zVpmZzAt9n@Pawxw^gHbeqiZb1hoy5VOG2Vbo8tL@hLhOIiAO`@6lldDW~GUtQ7#av zg};R(FJnTmLy?vLSC|mo%<{;hd+J|f!{1<1#qY17D40bVOp@r*C|uP_CnPT#T7Lvh z*?mJ|gl8T9c&s>h9a)JTlIzrOJd{4T=?xUUJq2&i+J%Dm7|13~0f%cvN^A!)8?V6ZN?;^9R`p3WN(h(>p9FSCqe=LOO8Atb!aN!> z2E)fdC-?w!*UvPG`P!7#K`c-eM$^bt3s~suuy3v%LCn{$P{Pmv7#hC@Lu1YC`f&&* zC2KQ6Pt(`*G&SsL`sO`@W{F8Ez-ugOT)qf|-iKMRdO-MMaz4h!B4IIXAL9D$$Zb$9 zorJG4z>$%;EV!rRsmVw@EOQ(PE`-385dhYR-3VjJ+NFceTm(%l0@ej4Ye8d!s4g>o zeSKe`(goaFl}XKB$+ln%SXcucI zcCl@R5LEhBAEZDHnAh1*X&Rjld%j_F~Wle-|*x9%{&cpM0nM zbL7Gpt970FrS8Npb?1J?9N1tE<#jEe`3`RQ4lh|sMy6=oQ!wt?0CfU%5(v0yE1C`$ zOovOhJ*#hRD^WAlQU3m-?_j}qaQ(=JZvcDw^2WB$S~^xo)=%bNeEELMx0c2}vj$2X zhu2?TKXYd+-**0%<$=wETEoiNeH)02twr0Bg6+tip}g(LecQ7ze9;yx*n(@F>wWia 
zgQWa)!S-~?6@K4aKh^g?;V1S^W*k!6_7eO^2-401-doK~YOk!TApd4K*rO2?3TdLGun7M$o)TzhcF&AxR?>fZZ0iSxvUF672iv4*z%D+)K~}HrLQtp>Dr*`1Z5Q&#s=z zI}fdm6r6|hrbEAvph{jp#D`r))b>CE@g8Z+XE2$=WF8aLvjh|T?*oPsK}=F|@^2uMz5y=^!`M_1#^$12`>tOoxxCj$H+ANB%{R@r7Qix8 z%~)<)Rt#8UxM{d`R?du+!K9_RK32BT>fq|@TM!TyENd<()CV@Lu6M89y!P&QZhoh% z#tN|db##`sB&(y^_NAo9O!gt^2d$FjJ1WNt!Bqn;IKr0Al+YfN-oII)YbQ5O+6 z7ebO#JcT)O0LekAKRkl&88Hg>B>xcL;OtQ8C?sHNr|GXW6y5e~%Jg5TmcOGqeo3AC zN};3`U+L7eVvB<0Hzta{NPnfMEg$`r-gWDh4Snl%&9>$b6|^hw>fEL(-!`RZ^N%)RN3}*0t2EF3Q?1%3ifsuXWeoRzQ>o_*UJNHQSnB_kc{fcGGU%|2r4= z0-!+E?$Y*1JUHh&k9(fq`CjMzuHA0o;G(*}6FK)R$NeEi6vt8!Ssm4I++{Azh50Bq z#gFnlg*8#lly+1*r5n|;xHhUE)dQ}J8m5e+Mh4SIO;hGk^OR-O!s3Rgb;>qso3fAE zS=<ffRH1}haH%C)!qtL) z@)5awsUP;N1Fu;J?p+7&3v0(M;aaq#^>64~b@*1B86zarh3kdZuy=wN+63R^!P5T8 z;cE?V>g5p6q5Y5DpJVcH1=!L1VWVAad>b*oO@dt=-@rL-J12B^aH4UvN9cK(4{Jwz zp6@xspW)7GpX1K(f#x4mWdU9??GHtxp%YOdppmr0)8{1pGt-fo=@BFuPYYrUVWc%$pP3$yOi0yA{HeLnbX*1^n@IsnZYA`%5I;E+-WQ4q0i9$y8krVCBE=+^ z94@9yzGtRiqUs{^Lbxayr@Qe_dI2jLy;&CyCPr2EbBlDp+TBx+D6ed?3LqA+& zbD}6r$Ah!cP<(twoC2&a91(@Fcu*83BC)t22FK=N@tLXM%P1J|zG8$D*NF%#RuLmuB*p z&ia7G;)i=$7#A?*W5O=~$#{G=wrg-OB%Y0&9+(j)216%egPXT(+&ZvnXv?M{mVZRD zMQ1|cU}QSD<3uE0j15uDdt&-TkV3(+nW@>ixDX7-&&>)YaJFuPSV3ZFLXfJ}{4$(n zu2X^7O2P2V9QIt0rAz*3bPAu2?+6Ohv=xKnvs*R?$7bh(GvnjbE~!?{aWWJ`@}_Nt z{UFt-DX~+L+2GVncrGf$B)1BVpA?1I$(d+aGKfNaPMjXImQ0QgVZ!698Z7W{@Rzx` znz&$1aDNwgN?CL~m*5jz5a*KY5*ow`X8^|~w1`n{%q4V)6>uEV^oSYMGy`H-__9kF z5i_YV6JlmHWXbJJ@=V&tKJD!=2&CSlvh;bq8S7)zms(1MVc1e$1f+(4Sm_w|Qpjd<2 zL@#-SKg3${s3~GSc@5;zwiTPmYlbITJ{yY836F~6j2N&} z)Jhwn;LiW#`N8~+Wr+PEypPXw z_v$&bW5Jj))!s7I=Nfxbx?83?^|xy`#oKerZ~Hci`*VSP^xKu&_$2-I=XxHY-_Bg; zuA7~Y+&Dpzwp`l~{kGh(Is#a6& z$a5OIBd_IbJ66#$Zmh5lD9qXM$M(f9ApklID}R#UYuq=1#F3)ziDEKRup=hb9Z{Bo z-7%@|hq4sxlu30rl%-(TR9?Yh=Dj7Q4+ISBuV~r`S9!4$KQWpGzt|0*c9ba0 zioxJIYfDZXrc!h8&U1Gh1IzL3;3JvAN74hsd5+)5H|84s*~b1%V}H6~0Fc41Tvu9$8VrF9W9MmXenSNmvR>vV(8UNm+aq~dp&t2c}@84^wsIK zb5Gi|=l&5w{`)59$3&uwgYba+wPVp(>BP9{`%|C`#zM296Om{ner~>Voo1G2JBA9? 
z$*Pi@0+8xxp|i=N@x6>g?ieR-Mv%2c_GoR%Tfsxk7-PhDa6A$PoiHWD=DXKv$l8qH zGPxKXxwLMoQfAf|$slJNE5%-PZm>pmpm{{_hCPfdSOYf~;LbI0O9Aeh z;3v3}`mF_c74o}F;5H4nKy-}`^os#>iYZ|R?O{#Wu2sKTpgDkdYevRsz!XQDz8Zh4 zpMysPm6==6P10PiaS4!9Zx)6=sZD6VqDf#=B{avlb}p_?30OEi2rtc;<~ghng?(e| z1Wb^fR>imozU7o~E+%&169K)r7eA7YAd)>SjECl;@!)BqFC;zt5qBX2TbV>1oR=4v zgYC1>xFBbp-&SF1u`kJsW?*-8W-Jtq?HwpD_j#)NVR$&`^BSGKGgsS?tqo*q1Iu04 zJ63A9U)1OPooWB3i*sq$f$K?lAK$KS`heq+)z@*;)sfdC_>-6mb^J_C19ydQ;J#@M zXqWkbmK`7m1H8B&iHyKHE>l=T6lY{sD}{*GlC1XiXv}ePOt$?>6e*^M&Zej3^Y0sv&Y5?y9ycO`iT@1cG?b-3J9`HWEdjRj)%;1A* z&&Ib1-T}BD@Irg~(w_bsPXpcxcoX2weJp<O8jdva~tb1j?h)mUAYJZCMO(UP7~vXWs|6viWG0~(QVom5jgv=DH}VuSm( zG{cF%g#tKL)y1&Mx}6~aL+CIi&+Y1-WzX_lcJSfM;KM7`kECsn zSX`vDM|_Opn(1lD@HmqNh_oEVrwD2<@P}hUG#U(o;O+vH5j&5t`4Bynl{_NIW3LcK zd4;>H(dtck2iM%5YiiAP4d%M~bL~C3-of0aZMnceZu7QWNB3P{quz1HTc6kCH*es4 z4S6GlOk8V6-b^72g{%~^am}rHJB1uvM^D~KAs4q{FkeL>H`m^sucnZP^LH%mzjAJQ zDAO{SuOX0^Yii5Wr}DL&#qnVsg?5|tmc{l}4nHVjsLE@2eRI0LA7_fbc~z@G&?q@= z?{dfT+;_it^^2L_hbYfpHP2o(k9%ngCArlkHzm2LYWGsdQtXX@nk)kgB?t7DCC}2> z@|LBOZ#3sQMAJi$eL%m3nh?sr$eZ<^TuuGu&t3Z5<=~}YUQ6+TP3-rM*^zec%9tOz z5zd(ReWXVcTa@zL&=z95v5aUVhLItt|AfG+hRYC_RH8V-yn;?%GsG!Bb14I)j+$3Y z3I2d8>QV9~HAQ5`@@Zc&#fgq&u86!A3`z;*w?bXoVqKto3g|4UE7k%+OFIv^k`~sT z!pcwT6B=0zl+c6R(vL{|KpDS9J)zyQN~`%B{1J`^l1^}+$EYOv1izp;&HW;OndiB$ zT692tzp6dMmz>~QNi(ofvdVHZwy`A(F~T#`kQm2eGP)|zNao!W!nAO9R@^(^T?tPX zi6S-Qx6vlXrF{HHj3nYTKEwN44= zuslhKpes%A0>sippU(#FO|f z>$eyIH4OtbLqCN*}82=Ds;cK zgBOYF4d@|Hn+2OCX+zLPXd(KS^kQflVpT2Jo`8`ZBWw=<5zd_eW(|&=6vj?T4$=}7 zr%tM)ctvr9WRqzCxuD!(8g4&x1zb1+fJojV{$ek|19f$$%6GZ(Qsd&-N>$tW!#I6B zzSY8%LZsv~AmTBcKYGu?dFxhcpM6LF&e*%Xzd3X3k2|d@?b~>LZ^k#gQoZl3P^S9O`J;ENzMIzW#ocLLx6CsZb_yE6akNs{P9Fi3?Q;e$ z_r%AGySox=pHP=ZCE}!{Ea8gxGQeahtTbXjrQ8{(o z1m&83)bY383Kv@8{9A8D6u)RvUgDJ2uymo;H7sr?cT zVMUU2RX@Uxy?MjNfuU^!o96cueo=HQ{(=VHABp+Th%?g@1O6wY0<`CT#P@Bybc`)<3N@)}K5@2XY@b&(0Decr6I`~EDi zMdD9lByoCqSU2qC-gcUY1KPK1ZNnYfw>$I*GnQAXjZ8yd9E`!FAqX8|5CasCki~-J 
zotcYMk_us&V&KTt5|Xu$hl+z{35lVpm@FzEQ;Bdifi?j#@2meB-mBcJnpd^28eX-M z)(RFwupWBN%5y1eN|Vw;ma7AXHziFhW*FCob%>c4Dr%}KZ6uMTuBJEfC2~p2G)ZU8 z#W-XSBnepK!uo_okw5UI5)MnkScF6WYLK@B$WqH^po4vq^?9tcHmaQoSl>b58@kJ| zxLM$f(vgaq4t1+8sM~l!6IW}k)X!_YBHxDiR^8lck&3#wu;!XpDV5~nBrstv0Shhs zgEuhN3RB#;Hl^Id0A)A9zTp^n|Scsot#0e4BvdBT0nvE0PJvSxk zNDa<5t*oymf&d>hz9ie?h$M&+e~!FY$?JqCi`de?ZAp>Fz4yJUElJ0@0H$rMxECU zF+^Kewsmu+b#u1$NT&5juDSi5)8uoc4&8He_NJW6cc&GKWM}|8a{ZeSxN9|3x$o)C zP!)SPPeW?texiz&2V0V`q~pG!EfovXwIwxIAp43+k+A-nL0P^yiK~># z5@0<^Ngg(aP2bWfIY@JX1#T{!aeIj_V5`+nD2-0F>6y4c6pc(w!^GlDB!1E#M;1mB zkQJC;UIhNw$xGo!x6%apH!JX-8x zkV=>tk`Z78CX$5$BkD!A)D4nVrcT%vku33?NeiZ zI%}V5`eXWR!K~tT{or-mO8uVHkvnypetmy>$H>ai6PdavQipTSy2T?IXZP}^o6f%5 zUVqL*_78OWX|S?QoUifn$xA1*-kyxN=QdQS)=SpK!^;~oja#q3^p5_wt#4azxsGC= z=TdyBqsos^g?YdJ5C?odLAtqXb4Y414yGTzZbyJ!oQ_EJO z9FhjGAao?GaY zJZfQ{qzXxAyj3ZXuuT$su*NAA=mGA6YfLl2jcGoQUM5`$SJ+53RGJaF?n|1ldUeR3 zbg5%d6{q8zxycSCRQydi6ICUn*;PzYLg-765-$E;Grd_@T}gMCPgch_C~5J2C6ur! ztx$dsa2NT{$Iur~!j|wnxG%|?gaaf5z8%veWTcSzeNgDC>2SiH@a)FNm?Zvsdz`BzWtxP*JM(mGRBOi6db zB;wu!@}akD9c+b-7%P1}_Z=QN9*8wB@xN;Lc@FhyzoJjTXXEHjSkCg5FckI(#(7K= zRO?K*k0H)8+;N5rSPQ%>>~bCt#i!F?4(16`zv@NY^ z@jH~gifosWM+}oaIkX}%P+|bdv@QFPWRsf+6Lx%7{05aJS{Z`wxG)uy3FEB<{WbDl zB2T1r&HjCo_SpV?^6^hQOERk#=Q7uLfq+&>)j^%XKze#YG7&dAONw#x+;r@vIYEGY zL%aX%K1p{vBAf~MO2|ExFc8&2knA&JWFj&RTAW$dJJhH=5T3$_1ViI=vqSC~sHpMa zSw;p*Mr9ZZ!bjF=FxjPJ5MzTNnZZ#9G0Z{e2usIHg>RCy$RgR5J0_9ouy9tKq5&e( zRMJliXM$&AgiDFqtl(sc*FQpra{I?JGUtAhitK%2UQj3JxsM#2dHbE(y4Q_Yj46Ar zuKso36<^AcYw1Xv8go|Lg>4JlE)Ku8E9a`ZY`$b(Y+DSyK7M6<>FIAyuDAlJVOYDS zZT_6obs@eG|N5hsM=p)5INMXYJ8s|QzDs>6W6oWdb#KVHHz4S1&iV#2zJZi2SKE@U z?aS2m;hQy$*_!T5O?S$2$5V6p(4|9*!cN zQqM0|EqozsZ_U_Sms(fs9qAoMGq%UUmDE^LW~kMmPTQ?Gq_ zsrNenox%6pH~xc%FWMJF8J8ax49gSOLsutnY=3LhTQ9x6^#^;?U55*aw_NSH?k(4c z-}rL2duOJ5=Z)6i==njodg5f{=J$0z1jY!GW}1f3B4fbVO-p~Ez`Lz+xbMM z^9is_Rn0k1&0Q^6tslADa)U!^?!G%7bUx$h$a)4co`LJtH$6kS#=vrGrm-*ExaVf$ zo;w?dE58vRw_vqD;wC|y} 
z_N1SBI?#rpo>&|$(Z+f;A0X-Q{4_d2qAnobSZ9nkVQ>gqFZeBq{RDE}FrbNA+t@|@FsVSHiy;^$W!?J4aKtt~ls{l)gn zJ(qgY?!Aj6%UhQBr5gtkzFz(P+V9ogX#2Iso9?}-{V?lKTN`pV=f$CgBZ!)uSyTN@ zQ$6O?)bgIG{*Kd~vi#`>r+Cgj!ZU9CH@$U7t)?=ri8vZY(%%3w2G^vS1YoU8>HsHX zy~ZW=Alrz%FOYGi$zhF3!qYd_IFKKjih@y9AX!4f>)1CcNpn;gP9^1>LUbjKl?}ka z7?>tiCZO0rko=}fECERVilY!_x7U)^c#%LfPZGtr#+8udwi1%umav5ljEt=?!gAdz zNv;yZNqbS8T8Z5N5o?b>aCFd@3hc-=qe_xH;zbgUjD0N$hmr<4mSeKQe1NP57A6Pw z#lVOX$E2SD$~+JybWK;wh!V$x*Fk|OaWJ9;J|jvT35QB**g$HSMh=SC@d@#J@QxSd z!QwwbJW$2R1o59zhJQxh_sP3K-Vewtrw_z`L9l;L9v%7OzasA!$)mGga>&fQa{o#E zw*Us56;J5jqC&q--n-Xgn7vqc1E=?@Q zAS=o_s(vbHRK}WhD$@Kn+v*mrghEz2Bvpr9Jppj+ws+-?EC_?c&ZT2-nDb^9wQx1Hc`FOrV3U)#v!H`y zo~JIIT6MDMh=%J7+%=o^50zDoIBK?4UD&;_J8z_zi3pSELUi$X#?hLm0?;Q|oEI7w z8dt3pYi1RV@cQbqng~!*6^pK^i@}POvFJ&;UH&z-Azo4$#n#kCAf>8%R>YWOOCDoJ zPZL`p??>8z_lp))xz`6sE|q)H9neaLpL{SDl?_GpcS2QwD5&xP_xYnhm`3oxvSh}B zkF0S8sSoKIb!5*B!6eH|zhP}rR#8b>M$3zwQoqVWYYRNI0k=6!Wjr*+iHA0Pb%aa= z13Fa;X)W*sgk%`c20hsTDi^clG0dqlLk!_cJSiJ0jWR32Fkyzusw_f`Mq9;yvf9j9 zp!Zd*`#$o#M%e8a5P+oF!RZ5cs_U}VflPJa{85mY=O4dqcYl4$#l%W=$A7eUrudw< zZE4e`nXGq1#=Bv8IO82iS?^l7hY#~Pn>}^<;>2R;(&WX76G{#?-N3zgx{Yy6%^q zqMb6KMYIyG!IfNYg5+Jn;rveJ4Z}yEQourG09BT1S3tuM>9-r^#hQKYp?y2 zofIzl>&JEMW|Zy~+o%CHP{vBOQI*IovX#Q402WwDSzsk)tR!v4+ZD`8>QmKg73^Uf zSv^6=Z-J@au1avxyB~Og!y>l~PYNiZ8ic&- zlx4lZg8@~AQnh%fpiZd9yBd|-Am?wuml~0?3BSz`%-Qn5RR050t(8;pR6@nJRfft> zDzLfg4Y$Jjs^(M8#V1=qZCC@|zlM6-QLpb))ocB$j1E?D2fkGMH}a)Ue5sB-Ls1Wf zbvL`nGCzU)Kz|cPa7=h6X>o0)^1Y*vvCr1L&47F2^fp66HCW>n133+h&j>j(S4r!U z&{onwDW~&;%NC6hPJwxYnF3Bt*;O!*Q(M6d;h!I4oG4gs4K3 z$;{|412W&SujGx3;zB8J#(T6}4kJ706Zcwew67uS+nDifTzKq`5==ezzR8udRpo3x z_J2dpR(scqw*{cfcDAnCl~)EBojy`#Ba4r$M`S^{p!PV1&5G{Gv}HZJGz<}Z7`N9H z3w7BhcYb5#{;gl)k4tDV1LrE-Rr(ct4mN8Lj6?n&ota=J^KlSlrz9($<-rq7u=#<^ z-$jam3FM=ThDbj}Ha4@TWRT=4uHpj3JrSKb0kMBE0S1%uI)|8lR`L~IaiTAi`99Hq z%-Yfx8-tRd0Fi1~%fi7Jrkep}BRL;71hUcHMZ6cq3c_YC6q6cd5lEpw1xvP>X*~A= z3r@*`(vj2b9V*FMc)&-Pjh>KPg_f}wH%PG|`DiTgQS>!9d-7b2g{k2+N&=p=hyscx 
z%uUC~YG{57&VgT0qulCV40#DphS%B>Rb>Wasz?zAV=4}=coNV)GqC&ZB=Ew`cIfRK z*qaqNGMIoBF!&2-Mbew}C2NsW#p+5kp=qaZ<@B3ir%_5zk@vMyOa?Lp4! zPm&Y3q6V0vE>RC%D(I1f3wn9Fq0dzkX2NWZuoxW9ccaSR*~i&NDB#NZEss8VbRRQH zQ{Sgx=I<|7+!rtU&k6AXJlJDbP%L9K#Q%k_hsqiNzw}m}JBPjD~P^_s^)JpOANt zybsA^r=eBqpf`O6%HQ-Ekl*wfkoyiKAOgX1QuWA89M;-VVG38@gs^O~P0T4>t%}lX zY08n}mYYy$vwTUlh?c8Ml~rLFuHsEMsmuU~fPlHy;-5f65ogIPA{)x$D6&B<4rQ^o z6$Vuv5H>V_P6P6P$os$K{RMf{Wzj$$-c7{Cx1gjwvuO*xwjf{FQ?P%9U->||hFZ#B zYbhQG3}NJ{Dq{G-uMcqML;Me|-rMfZoa<3=)`jPXV1C!xpY3=k)A3NQz9CoNoonjP zxqZ3X_MESk!hqw7J1(?Speb9oF;lnkUY)(uk}1@%hHJXNm2~UzdX0% z-MVlnwf~YES97!8{*1SOSy=H75$vM-eP<(m)9KDRtLP&&O*v03eKznYuFq3|J~)u6 z8@T7Q(N%hz^4v}7*-f~D(Fa4=!KJ3JMad6jJ=!o&hOP-6^Nn;FENLpaOjdzTsAPOr zG@xAvpP9m%l3FX_F|8uh59_X}me|bdNu_1cqk@Bdhgng~z;2x+)yx6UsAjNh;J~Rc zgq8cLN$3jw){bi-xgz7(l5#}ll#GX^q~$-Y@v!_A$HQ9ES1YhAuI9>AS;c?? zbHZA2?5w3@2WgvSMAl`T!qCd*z2dhJlq}ef1zW-6pzD7`$(P9c8l~C`$cQW(KMol1LYjSK;YiNakaIP|9N@OMf#l%# z%z7B@+PDUP-pOoyO+E`Mbr;Q38<)R$%k&VLr#7U!cHA=UL=eR4otEBQQ%|nBCEYfh zX&%1gX~}uETn}YD+jE|lyiwOi2KW9Axj_Hj8mrxLPwxO#>*wk_)7=L%bqC*lC{z1c zX!GT}@UjKL?UE+K9KT`{y*;?@@3o{X7z1-%&gNz9*qv(*uoh4w*86yKu}=-llX6EE z>!DzH&$L4pVwNH|477~gXpxjYsP(dZha7h37UY3YSf#bs)KsZW52}f9CT`S%-Yi`5 zgI$-iKo5cYVK0g@1#UsHAo*yHmAikS&Sd5#6cw8=m0~q{-^XyvY6sB$(8R&Ri%-SH z$8uIt5+h`A*Fm0+#pM8mz+Y)L3{Q`Oa?jJguz@=twjTu{G z*4B}+bu2x-T(x2Yzp6s}GPb@|z19i60iNh#1ui_X@I=61*4ejW0}avtK(12OSi}q2+;ygrmw$NVARhRPMCXua1O)HX09I7666?0Umi2i`C7Fas zn^63pEQTn00$%B3Fh>YT-VNl7 z-9%s6V=zo7>w)&k+mqMh7e|ky@k1kp`aF6|ncfp|Btz|rWDBMkAv1+6TyKBgN+Fw~ ziM3O#PrK$}6vXeC8yDjlb2}D_-h*43ZQZ%n&Rq9~Tw6!3cObWAd#(}D-rV+wa{Zfg z_5NIAYi{EZMYry}yJ7d579oxnDLP$>nYh}f%ZW>gyqRJaJS&yAQV8ne`n;V&4z8gk z@1&3m+T_KKONaAS6mxS8%}cr~z4>a2dAKHjzJ@|x4(Hj?OGlT6U^kWb5eUuhT|V$e zOTLa`^_;Ih-$0>8&fl4DqEItB_A8BVG+x(bT8Hv21oCs;8}h9bY9lRnJB2!^Rh<;- z!fE|_-<7^)Po};<-%X&NvN{3;>ct0MPh3eXhchjk^E4d!zJkVjQNJj>9=#G>e)PLX zuO3Y|@5t2d%$Rqs_EV+=-o@1KESpRM6lRkJ<>_QWc|KXNIH4?hSYy_=l~sdyp%x0s zl~72ogF?kBDCTwQePxg3Ab!W}NX=bHEF>16dHsbeFQlD;jJbDJkCc3yR^L{Z9r23U 
zKf-!yW0-?2@<6ZT9Gi&}n~UqYv4PNuF}8~&16XnhiR8zX2%j>e2gyv=D?_kCosjQ7 zZ}=p9_JAyA6WP|LeJk~TqGF6+mtWYTLr7|UKq*MH@WeWM&5ONoCK6l2h+L`r0mT@X zUbvwz5}(7~qLLh}i1KiHOst~Mdl?WSwrFgCJ^vYGVbM*+m~jY0J9oA&BY=+cA8GWwW|f2Ykqtn~ z*Y^k9?t6x_Jnu=@ZMcWm$azoxwAR7*ELG<@{M zs+B?m2HvVxfv{Qy!X;HutXKub0+i>yOoVzPm)0wT=oT=IT zq1B)_0G)y8f9}$Am%ni73o`ut$UWPrmTyZp4t&T_;N!d*^fjKi;$1n5^ZXH9ct=pa d@cx_iI}S8+?=;z|GX z`}S~$0|hzl^j~*?y?uQ9?cVP9-uvf3z)K*d`ERGPgM|DBD`pbvg^gc8VTH&<=F%j? zMY)V4>fo^MNIRoW_T)1{RH(PPGC~6SGw!I!TAXQ5#vAprGN1Nk{84`<5Dl=pkZ#Ka zqrprl8e(-pj&R^Z>&nnub zoQy5d-|>a{cvedl;yC*0TsE1S(L3#J2DA!TL#m?1t|+OQOIj?hX;e)#Rz`@sUfI~g z5%MkAW`#sKnM55j7j=4whsd3>L*Zp7{CQbOx|T?ilU=U`qJqpTuA~DtiI2J!5o#W& ziLdc**HK2j(CSfqic9e)9sSS>7)X+ry*8MD+$H+~LmR^oR6?^m?aMRDI2LdRY~wl@ zJP)mHw$@Ii3)T@-T=fw_TVJ;v0_@`tW1o#Y+$OgxF2K|$`xqvd+yQIsUm{nz=)eUM z?$+B+DN6oAUQy)J=Z*a@d|*7KWR6N!uXOsn^flwG6dIA9(DV~+# z`9#F)l^{JuXVfFCgthvaT})}}5lOqGNLrlED4L0m9leCTXko&n4;%!S)T= z6zd!R3fXtL6{4|b%$A)+@*U_mDS{(Ku1I1I6QJmTnuSXf&GuO)6wI3OwDtO%d$2Zx zm`qwGMKa4j3?p-wIohwan>{j&X$ALvB0GQJd>i-y)2rM93G?T|jt^Z>`w-Vyxb*Zq zrAk&yFN{mMY6mg=NzB7yCe!f+5M`4lt)Y;bl!0MXWSC?=l3$=*xRC=E zpO{<}G2d?+^s5=jKD|yhy~NYD>@JIa_r!tfw%tqoJ+ZGkFtWt|W$^ys@SEqpckZ2M zesuQs+0wQntnYy@Sn+Ku`?jr3m3`s+9fLIo*M5q7(Ait*+*R(}Rq7a7J6#hT?c*Cx zzO$`H_&{6DN&I6QFd<3Qj~>eeH-nM)5h#Eb0s@}J_ALpB2Fb}H?MCdfhC{*PV( z_^lT!6UWLE$4cYJYlJ&7P#xI*=H&M#OZ{UtCl?s6Ih`Ia9Rld49e}#A1Gp6{WPuAi z&gnkeao#4kIo+8^t4+%TQ5DO^GfJ!FnSBQUqly9qmKPjkQ|qn-Q)dbLR$7N}ku3o> z+I~dPhN;NO+zj`2ecU1*M{6jD4a(^NX#sM``Hs^Z35Ug>(|MGvjKAm(Kn{vDfU8c@ zTqZWBEU00g?uK@{2Q!NoP2Q=Wk&l5JFH{(M==dYH?kT{dJ`NddMNfaFXSCciddqdE z=Z?7E^W>7R+TB;_-d*nAy>|JI|JJqj?&C||Uj;i>zka81FL<&fo-{VE*EWaUY-l=A zBxgp_eNX^-ZS7K#m{RWo94VM2bOIo#1eyJ4+i@e(%s`1{m{n5PWt-PfgOP`Tjv^Y| zsq5r^aQB+F9vm-;;}6pt9w1xZNhsLWNh|JZD4SNIe8zEs&`CfOcG7Nm==`NPNWC5? 
z(#IgPzf~VDxUQTPITlZ7F_{U@!dUBX6($!|m5E8d&lH=aT9R0{4biF+S@ zSAcwow?4#;q9d4XU1JBXF_X&167hWeVk(`|778P+SJw>nG+aWMgP7nMx}t64b)( z){EW>_bi}MQB$yG2c)WhXH9U1yftD(F}VzLI}mHQ32cFzlcm0G9%H1zZXm zgtD=!ZKxh_t79?U8;gN@o=;=hAH&t8>pkvROwJ{s*_p~}Oe5<~Riktm@aR5f4j8MZ z9>hF~&AgmSXcS+PqH>}8SYtW`;~0ktjA`gW>_eHR`(ewpRHBaP32bs?v%2dnTNA}Y zqQ|f`P`5x5N;(~jf$(7_01BtivCX7t_pz)K3ThAZeEJ&s$RP+~t%D2>RR@MXatDRB z2i<)&0iHnUP{Nj)NW5*Ic(AmS7rd*xHV8awe&T8e!$24+^^bo{;JM*6TVO2qIfbXW zwXU_qt%GZq-r5dS2ldjSGas`;9U9v>2-^0IiHP*hVE_qpXCY7t;o~#lYcqM-N>*aHX8!hA?p8M&*BGs$Z1=W9C$=K zUjT6@x3%s~igFNE(YC;c+Zn?YR6|=z2AMY7=S?nLD^JZ}UPE{pu9}U>o85rA5M`d4 z?zazYGI%U$EW!J(7;cmG<}#QpG2{!DOabMfXQ8gU43iS10h3GErXsewL^AB|hJSkc zyxmN&tnWX?VdyDcC)L3{XjAuBdj^*H@Aw#Ton!J&?@ML#^Pn`rw~T>v63fS_mO^7+ z@5#+;*k!daAn-vNBXnOK28Zi@fQ)BUL**tcPY_msqSgohjsJw~TkaKdO}NI}WVz+3 zEqXK*ODr{H`7Dm#l8T<9Bk3@0ouUgYMf;3(=yy0XaI@&ua0M(Cy)rP0%)j9Iv?(4V#C|-+ z9?GeV?yYYjlqqVBCK^I@hNkPrzd(kL`LgiMlklog+nN@6Xty@6(Q7*HKyOx~1Mq;! z%z?}8_*?GpT`5v@{vG#M9Bcz(vEUko1eoHD@r&p|S*o@H1)FF&@Qqe;1?FLK#H4p{ zVdc0M*8?(8jlwvI5fW6*N=)Yy^Kv}w(gj9ddH@_`8H_uFM|2(nDyr_9yTYEq98Jw2 z#l|x+19ROwV}q!Qgd}0QZs_(-gL|8R3QyZOzlpOFREcX!eP%UsG@VPt)9Ud^3nbTI zA~g=#r-rpQR0{5??i~H$*jr<-2dhInD?|IrL;Kc;4!kb>wf9${p;cw=tEJGXJ1^c1 zo%-=hkX6V3=*M6BY4j)2QYeftt7ohJ_KJUd*}r{tv9e=-dB^_s9S48uKV)bM)DmGg zsQ&qk-ep_=dE5#`Q)u0Gz5x5n$dRIrsCQ6h2GqX6_Xc3tZ?lHZ0}%ZJW?#hY0%nY& zaX!66xvJ54Q(#i>w@HL%fx@Vp$}mtxD}Wt?B)x)9<1pHq_qWNw{ETzBh!~e3!>#Ti zfxfE0=l;lOY0vTXkrO3x_ku<8k_$r={Z)U*gQ4x!?K?gaoPoAY!R7JR+8}gM5v8&yt@YiSx+fmkIQ(}m)VL-=EuSm#dbt~B!i!!FyAr=gE#Fhgmia|;cosh^V&&*#$% zO}=o`@v!c1!(bJovT!JV($Tvzcyn;c1-o1EOJ%>b?%%n@-4BLtTwT7p%B}8SmDhv2 zO5!eqnkQjgI7pAfgQ7b|KZn^0_TxtntvL99hN@>uvIk0^{Z zA%!|11yVh_e6;4qnn+;M^m2Oj#d6zl4F>>rZ}7(8^5CWqYR2nKz}k@?tz@dpjXTiv zu$9;ex5v$(fg5(LoQNtY~uRVjj%$j_p$vL%7li5Cc7*$-CZqaeIstNA_?g4vV{ zVmLsPFhQZr>?@lnGmS6^y)prhQcYqG4n)@Dnv}}q(@F+Hx)_#!ViH303`1R!7@>CA z3}8m2m)UiSDzTt01Uvz4Dr=hq!n+xW3^N>O&VNZAXMz4iJS$yPAm(oBCmCQSAhJJ^ zgc$ObOG;MCLVOgEU0h%Re->3WqWYlfQ9G%Z;$Uc5c0p^=Pyj<=1+Q-4A}382`G#54 
zDn>X6Bz>zeBdcZFPaeuZ4&!rpVtQZ2al zMfM+x94H*K7)PpKa7rmvxZ8wS{H&4s1V?RBogk@J=O?{ znwZ&@a7F8j#p0;Hork#06I8`wTPzcwQy@Tjk%=umKtYV8sG`PZXk4cAi0do@`t5Vn#RiMXggr*Y znt@s(gf8GnT#D`lOOHmeio~rVFQb9Ql3`^*z$mL+fh&i>Q;q9S%S;6Ck@fxnM{ft0 z1=V-J)BA7!!QTdZZY(Y@u70I5bf7$RU_E&7A7|dnyqo#iv+Gl5?*`A_?-{OkUD$9s zdOK>w(cbZiiv&(=wvkZxjju0%{XuA;5|YXxX+5-aEnV4ttPH=Q7_3V1^xh1}Ssbu=RzZx99bF^}Bx_of@-r)0w8$7yyiLZ$LWwHOBI8q%wXs-o3 zmwcaYj*`%}kBQ*vz27lZb2!?=#wC+Tx_WCKKG6GL5VP_0{$534^XgOHQxbV!@|-&2 zd_U|zb=dj-VFAjzOJn9olShYgni+osu(^*i{#r%JWo~W{9P1FsS0w8#NwegCD>rx! zE+tgQG+cy&--d?&mek7i1xQP#Ho#{ssn%$N6}7+#<1LBSfNd!SEjQehNi1efzY4=Z zbhy42!+V^tPj|`62=JI=yf5H%P+*}pJ7=vRkY#nATyc~m2W}0 zDWdWx&`=m|$q;s5^k+CO0~v%=g5AgsmEdqWIK1}K-QeCkuJ`=!`rj8m@c!I;ci(hL zoGt~YA8ti`tMu+E_wK3m9xC@9stF`8 zx

KH=F_n$%XoL-Z{gM0F`GY%a4F>U67PoI_|AVsTBd$$!qixSS<3gX16JqR>zOT zYLQ$E!*bR#A{rQ5&yC+3v@$eWk=Hv8Qzat_6<+q5vhR!Rs|P7A87wJ53{Q*0Eqr!{ zS!s5A&IksmQZ5Nypye!Mv+->PBhA|m-34+Q{9U>X?-2GHa`!KwOws$OzvM1b38!J1W7Ea&Y99>uxY|=g@mk zzWd~RC*D2rzVgBB&u8x*JYN#emxAYkMk_(79F!`-z2)Fupwk=AEkC!K#H;4x<%#2! ziRtphG~BI+#=$2-qI8&d%qYpsWJ$KJ|L^38*V2})XTCmZDer(F>(ZJLOy(y0#Z17{dNjBZk_`z zLlI2(%)qJ)PG^ycKgTK>dlXd)MFytd#cTw#M)VvMIVNRt_NNHE7Lcp zYXX!ZBw)l2YHqBFq<7DT2TP|pH{3HF{^8NLj&2Yre$)Zkt!IAp!tEF8by4vCc>brw zpAraC&j7Z&?o`A@qCKnkw;D39rP3-3 zY8oM;3AQpzUb4O3atOM3)VNeVPTvG*kXl522{QPQhU0!G5^nf6#Q$s3{qJN)iR}0_ z8Tt=0{X2(~b9@4Qk8ir3;<&C--`FO>!sfG%qufYodt{TqbMqQ^isQVi2i`pTy`!HH IEVJ$ZZwBP4-~a#s literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/__pycache__/experts_int8.cpython-312.pyc b/model_executor/layers/quantization/__pycache__/experts_int8.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bce4ad16bae59e300fc44296b9ff9d15f6a3b730 GIT binary patch literal 10786 zcmb_CTW}lKb$0#atjl)>vz*E!M{BE~Pycj)gg5AxDYizDy*K z+z=ko10Un?D5ncg#}y?$ zrpQr?ZvAF@S{Gi(YRODGu8cy~WXWhY$iu=~7|CQVj;k?RRhS*9Z?5oW zw#jw8ak^iHwO^8xXU}P3T+^tUs7aU*hgn&(aD==88%ZWHP9iaj#Ko*q(;OM+B@%b#qe8!CD2`IT5TUYT#jbz<9oHyqk^pK&!p3)oW5|GG{pI`e18F zg6x-@O|X)Xy?cq%JXO??i&o*~%WPU9I#C~;MdjAv*BJ0r3&Fk*f>_Jsy@JC(IbnofW`s|61L z{;zET@FQ-KXw|NHE>C_R4s%7O9GNE9INY?nIth&Ef+!}_Nlg@?$)?K6xb751#)&AZ z7N8TE{qJuLX{qc`HksX}#VPFNT~m|$4(va_PaaZ|V?)_#*sAHFi;9vOGIqZ_B_}4e z3>{M9(=t_u>Q^viE(_kO?H|leQym=tV)ERtz1Y8ll%^{=}IOV_OlMO9IH5!wY%7w~YV<0-ij6O}#`tcp5umfZDq z({8}ly&DC^-u)CH_41#b7%n zWt{dvZ~YS5kxlrV*$mZWNsK2nQPQTfvYP9^U#<@tiyK@G0hlEp`9q6M3r&B<|J3!i ztKi#N5O&_bWEgp$xIcao8fgTW`whM^?I@uVW-hgWmk8#A(T+#s@oDh+o*^PH10quY&PiTW>H z-4A}ol!fH0C$OHqsD*BU0hAHOIT3SHy2oiIW%8NpzMs3g?maL~bpn_r|Kn;3Oz6_229A?xO#_*tpW zY1NP|R`MjzO)zhx(qcR-V^4!xGB&+kcO}(iTGis|giO6KjN;*^0suW&CkZg0h9S~j zX#R|k5Vg_{96gMHNd}h~sfg8&U!Q3sF~tmQZc81-r+&Jb@prGypFiY;)iKq3d z{Nvbd_l`R){kK{M3Z8*G-nJ$F$5Xex(A2kLTai}>4;2Rw-D-KF;CZ6tYhj-QzBHb# 
z(P%qrT-s@tX?af&U=zT(eeMs(k}1!4>)4 z^ViN7qmSX3f$Ern>X^XtUhE1~y8?AxXI8ep{py3dL(9SC#LC{~b8l}d1BFmw!zeFFFIhBASAVMCF4*(urK-vR#u%xmEEjJwb^iMhWNU6<^U z+&R!&;MjOzw%17>_9t!mQq;Y`DQ<@i1$^+v?LJtYU-n7O@E-u2aVE7?u^?cf2h26b zw#t5UR-4=`waWo1{Lt|o@TN`}*#*z;hmPFz(B91t?R7Qom0KRr-cUM*6Ux3-zZAL5 z$AaKW_FS&&+&WisD}4k+u6qo1W~e{i3aW6OmPhr6|7)lWH@JpY!!^uXuA91_11+5= zVpaQBbh4GU!)&ID=uyC;HbcuIPM<}<6a}gmP_?s$u3&y$z3Hkd?5b<`O_-pfz6RIs zo{Kn}m~*$(a$D#wb?un5eJF&DLx}2Jcc&*)qHH+Dy8B#GlH|0gLQJD?28}1vlq@CV zn#`I+aBVRN(~=n1=pKy6QmWwxXm~uE^og`RLk?^`+!Lt zE^@Zr84Mno&U6nB$(a>37CGX1H>)I>LWa4Y$7&x^(kXQEu zcy;96bv)I0b$@NV2C56KA~osOeYXZ08sv`W4az#XdmgYBA@8YMi!l-4SCDT*p7YkP z#I5%M+&c5#8n@FV>KfI(X6z3EiX_vIQFlZ+dI%mzAidzF@`cGHl~u@n#3kKzX?U-h zEYO{o_S8^lRugeW)&*!~Sk!HJHa3hC4OQ1}#*xYN*-FO2cq@GhUZ*=>#6vEFCeV2~ zmDQ&80F}>zi)5$*ql==LbUr83jH)|WShaUgw5iT(d>#hSV+f8TIDz8>@{~s7m6x(K z2e_%gn4wYV3pglriN>>8h`-iJgq}hwrxBb%@DczP&Vr__D@H8Zl%O)IFoS}c+gj(H z*3{?vLC+zb1}9Mn{(auZ7OUTDf&hV!brh4QflvCd7{|<_?-7nWMnmR zq8K?*@-$!l#+udA<0=T<;9w0s!xe;%r6-EQPH=O94tPkbp}u0M?~bSW%CY%lC4Xnh z-&_dpE&BH&-Ws{*)C~8SA*M&!R_^=f%?!!qM<$@8gk7!>xp)*B!LnV4HvdOdRy4{Q83(y z|br^Ew=QU{4Kec zzxCJr&%Ez@Z+niEIyRSi65PHW9<&JGNH#xXZ}Uq&%E)Nm_4c*1YNE7h$4Sk&a-i9R>@+bfgrzJBfXn`a6qUxE+f zca@37>0$!YxhYT=nMpjZO6F`*Q!)LK70>H#DyK6v6<5&K`++?pkqTTw|9Rbo&}h>A zH6Ouj37l0jV{t{Sb*Q?_@FK)iJgc`QCRGh$ZG1B-mhs7S!tfc?M21#c^bq4rVnSw( zkYveFy$#$A7>3~|r81i+jwR!&-UX^anPyaBJ*Gy;q#^5-oz!#}%uZfp89m(%IS5EB zi*i;O)BPrAsMo}cN%@lAhOz+F2x}Fy=cZMLBjce;2Zfo?3M@>fwFGXvO2NQ`ZMa_p zvPkmH)LnBSB(CE2eV>D@r&ZxnmYEP3|Edeq$Oz>vV6yFCwE4UvFXY{MPu`nv%9BRH z{X4enrknC@;9{nArfsGjG9e+L!8LSXxJIE;zBS(_@p*sV2dBNFq3oKtYf}kVS9-0R^CYl4*#(!31JT?o3qBJ#538I$jr8Eh&LHg3Y1CA?txQ zo2m#>2+|19*N!^%meGv%Tq>)`DH(iT=`f|R3%6nJb#Q&|b;L z^TKAWI@1Az1(Qu9l~6TXy8pi0g>X%O&~c>7gi@0S6HiSjwM8@#kFid+;z=ZR5y2${ zQwSK-Gl;zoK<7jiUG6%=xgIslD9;Yxx_oZUda}OX1tRL70GK783=&VQuJ_X^DrV| zpfOks58i9@pknlxigBF{aRJZlNM8ly4e|yg`z5Z<$E(I}&~286R>V>_r#>Ds=SWr! 
zcM2<#O~bbuIA6oZU2|lPW0x(S8A~40)^PN6o`qVvmw9FtUr2YG8W7W4x}ctmQ%Oup zy3?piTIBS0kf2W3MGDK}XqHIK)dF?jxgC zabRa0HBDEc+DPNg2%xRp=4eJD_4OsxDg^V6=FOsiRgr}m`PWi6nqOzC^eC`M{cz~I(M`b-n1G%R16=Qd;U%^ywtb+e6eHjy|x>}f7yA{Rowmj zt>Ce_!*@bGKm5*WsJ|HMpF2|OimrAI6uSmiUcS{eGI!#xuWf1M7rxDR!Ut~b`hfrW zg%5<{V<&HgPhGW_;I7QK;9K=?EBd!Bzxp2cUjKWZLhr$%{~N3Rr;GllZyx#i@CWe^ zM*imbhyIgyo5M?Ie$m|X1qiuroV?lb{&xzyj}>~475&Fo{illlQ>9?X9}O?Xmqr$j ze`>Wf2O#rlKJRSI}6HLyiH$W zJK_xc*#mtND)b2e5N%X<2mJ{&sA#TUd(8+j=x;zJx9dxN{l7&b=#Ko?EGgT%oTs=u z^&wi>icKGNM&3I3=D{+L6$mmry2}oR3Z$*0>}05mw0Hie8`1JX;vXsqgT|tKM@C0K z{$D7gqq;-QjBBa*6nZpJ%1?|Qfot)kG6`eg@`mE?Naz6sk0C& z%tIZzTf=1sVgh1L#9X9*SJ{o2hqQGqo?ke>=Ea)E@w{u@9SYR%I=hy%qI2sS50!Ek z+0s|)=`Ho`EN$6V>K`oa-B;>@YJX|pS5Im82v#3C@X5{tK%kx_r~^yj zEFGLbSax7dAno0Y`GtJhi8UATx0Kz8c}QzV*^5{c>D*lQA?7Ec_N8qL&zGCA79gFQ zmThnLms_wFB;Ap62(eboL>yl@zC2P4_LbYPshw=;UwP*3&E+uGI&kZBBGyGBz2$Di zHo?Mv+VytVd$wZFNO?0hMM&Syat~r#;JxLoh;2izdJ)?W35T}^-h_LvV#ls>A2#h+ zMq~hj1WfVtwfVg~Dy5_r<_$B!*vY8*kE3Uvj#EkR2;77!zP6wJ%mWHJrE z;&~Yc!i7wHO1H+xRItj)iz57d2cmQdo^Yj`%8H;Z{~U+eF2XqaV?dd@1dD21qWlP2 zD1u*O7Y3W~BN(^;%H#8o|Ka3fZXtKey=B$Cz3ARP`^+70;L5A>uR{Cu!t`?Mt+pL2 z@f*S0-a|Jo6usZP`ApIK?Cg=dHrE@+zIW`Z{kE;G1o(;Xow)phiCi7I+Pf6IZQFF$ z-8_53(2bjPUm^p)WMiofbud1bU>ZxeLof?@5aUW2cayPnKZ`^qyq8XbYa>gBSA(7m z{T>4R%!b0mm_pzAMIzTh<8sT;pXhMtJr9|tu;0DaemS7`J!G)yaj;AbyII%we!mEU*{30)Kft8ou{r0tQmkFRZiGwMi+=u8|Z!-ML)7YfDO+Z%p8WSfz)PUWj=wK_HAu+sv11me(b)PwXsi&Ch@4D{T!t&! 
zGBytTHrbxBLv5EGMQ6sza1Pm3bZ6YG?UebVC*xslm+US2GQOfe<1Yp>0fuwS!D1*A zVr^ay7ke^2#YiT?`X0Hr*q7-m_GkK8-z!Ip1DSzhEE8jWpFCKMXX3@7%uq3rNidvW z-c=mV3>QZ-Bdi~gM~h>bF^<@XyY)kK}J- z_b%F!k#CSBY18aKEiG#Ng^F4zm$LHtB&WH~mBj^Ff|lpSEX@`rRie=G6-otFkjpbO z(6wvcvxSnBrB7!SNek=kbCNn+78z#XmD0}_vL&^!B#G9KFi-DG`iCs8s1H$qkBAlh zOu3XV%(QV*XJ()r$W|)yqL8ml3&kvz3nc;mffX%Lp;9GFCH6_qE=p9$m#IMn1HI4E z@w&T)L=8jVERXGOWPuIvMAGqPJ^RW#+{^h`@X7k{$b%B<0hMGhiE9^#bn=53IgOk;~ z9qv9QpK_W#Co2k$8l~d^H6C^^lt3ANmuK_K<3OjNg}FxVg@))}Xg#nlz6ZAM)6lTK zqq+7=fe%LMys|lDb4Qu6T?8i2gU)ioY3F7&{&HR}XVpiKYHp)JQHZQNh#{Zwa4Kc0 zN}`a>se-63RwQLjaNQ#4^Uj-H+TD$}2Wg&yx!UL6^Ib0R78w^POe6&ip!A5i2dne5E}0U98F?GvwZYWr@?Goz2INY4tY@42=j zHnDI&WRCCj%+?*w^s}f}^_f`HL|h~)GR8`kn5zCFVd_0;r}&n^95LG@>JEyc1@fS{ zMbO-pN})6Z`rlhn3MEC&mU0qB7HAIwybCQ_$>!#f*tl(D5Jgj?^|vqL0iVoHcjNdI z2$(S8F@gTqHD4HSpDA(yTtRT~mJCDMHnpjp6z5>PauUEbvh5=N=m*lTG8?|B&Cp15 zU{`ZwPh%uqA4xZdcQ=NQ)rXIL>vn{L+g=h&uMZrog%57_PObH=_oiz8R5R4eZZ|oI z)(&M+5Hzn~Y!BLg0eO>+FK$5)%Q-+hm|eFkD%G4!Au@ZT`Iv>%w?Xl8=m7+20Gg9& zLwX3?_CiUeNAMFWbx_u%0i5ss zF;a9}7(n4swgLR$*&vUCM*LrU~-y+z)yYMZmTM^>wY1ueuqBeNoj|A#1 zyNPMJj8=NBZ%z7_`A_HT$tRE|-KI&mX(FqK@l&MzDPs9_X>HG~*SdeUJI`>deXF^( z!>hBmhFb*twZ~5Vku{9T5ZChhoPEvy_{ys{UtJMy3N1VKQ;)Lxo~Nf4nW%dv?>hm@ zE|VQY4o;w5{6>p8w^r9yvV&u316R)hj;(Xfq-zf(WG-mw7){Z^TvLz4Xj^gWrSqErGEBH@WMmJP$6e zGZS3&C41;8m{RlWLijLqavkWvoN(YdwwX}z-LfKbDn3|Nx+%0fHdFn<_%>xniFG@E zQ?bDaGeSXSW+fJ@Fc=dQjK-2YeFeCp5d>iX5P6<@S-5oJ z%qjHDAkfnzJY7(GP~C`9KD!_rE{x_WgZs|p7fcsZW;maMYTZy{y1zNxRROlW&Y$6+ z!vL<4W?~Ax%fZd~-sY~||F*C3#QFLY=eO;)A@8!|-4Lj?_()^$V14l5`XKnD?*;Xj zjXG3|8Q}y*hv26R5R*dC2+^(<1wJv-Xk!Egh!jxhdH@0)&1cZy7tIfp*&;ae=&9w} zCk(4#8#UNg4R&cYXXcGSG~%9X-#t(or4wOX<==L^?X(y=yJ?WFdaCv+*NIuc1ah4h zQT0HtV;)`5Ynw+kWUiy4QDS8d?{!CkIczFkH7ni35?-Uhy=WI5RsN3iw(%yfdaK+V z20A;6=&E|n7t9eAWuKLYbJA`9BB&h;cU^?KBP)zIx5?{OpJla#$w8-n_^jsn&kc7* zn60W0(o$pyq`T|h`Kr&d_s+?Vn&Uh5+@4O`Y+1M-H*uXBT%Czd&un#z-5E@2*d--C7Cu7Mp}eQGmwVR zHHDsUqZnlwvE1Q9;KLM3`Lbq*z=yt!v!T3cE>$WiWvc5=jQ*1WizkO1(!WH2wt&8Y 
zAPYc?NLN&v6^s+esw!33q0lmt%pxctz!??vR>2kQ6x}Hd8hJ5&5^H>9^(<2lQsK!f zhu{XmE$kzny&L>QGq$f0OVwkkwRhHIPb~+U{^*Ck&GE_F)QR=+r)vDzy^$l!&J8|s zFZolfCz|`7!FsYeG>LT#6meu?vnRSTbaSW~PBz0+o6(_0G+B=(x806Nu;nEKu_pY* zC%1b%5%03A1xp7T{AisYU5%|B-QW+w9I->j9KXudc2BPlAG`Z#ZP$~Zzw*1+fBX7I z&qZSbw1h#@P?Y8}a@1`2I%xiF*8r&yW1>$=^P?;eV+aA89#B{J<8m z$GywJ7EeY~%g+Dg6S{Rl89e_$Ac`ITpb0Edx1EUu0$yXipFw@qcGZ5>QFZ)P z#Ig^^j=i?+sfEVAgQnYg*yh@{t+N}G+a2ROXF$5y!ETn12GzE8tG4rjm|c9szlh$I ztAJ z|4FAsqqNszBNTjlOfTpvVONEgbCRMMibWB!c~ohrz9DK|I^5m$dg!2E!V1dg03hGA zYhrbwF?p;$d8{#cqCR=z3u$B5B~$_N&4JO?{!bFO5{r?cEr7ZrS;+{~lucXo!#7#?O>X#-n+MNA47zrzao|M#z=HY>b^-KDFT={u&mHeiFMC`=|KbBO3#tnBMg_ zeW8XgQTHWarT_3vZu+Oz`ZxUPd*S|ic)aB%{wXHH8h1r=TrOmloCCIH7R(6aJK1`H zNikSk^NXeK_Uh?Li}!+((M8Gt%Uh-~nt;WWz>WggE+H0Ry98N~V&ZNa5_A!hA#I!(H#YS$sfTd=_Vl}dO>JbcNE~s*5L`F5KNXcxv&U|=dCiGS0GO!X)biKg_2Rv z(`zta(oO#ws?Gq4VF7#cdWxM9`7P3dhhriC8|*S$%1m$ux|);yW~I->EVaWTSMT6} zYXB5{6ev=9%HVIIXGxwtU|HFG|BPe)5x_Ol@)Ljfffv}juNfU^4kT8pH>(iBG26SWKXno))YaFQ$9BK9sZio23V2kj9AehK--}~>}c&8B>uZPB)k=|x+ zY~}dPLZL4rID_B* z;5up9oX*3m!WMz56+Gxn-3#@-f91v%+*iv9eGv9N6D>Ekcrvzk_2Mm0%Y!{H>5sL1 z*z%L;U@L&FAc2#dzd64Z!k)A%A$O9vo}o09eo)dOvX^vF(GNermX*bU%`3S^t!g zhSDNtH^st5{J~cD5PkUDOiKUFmZGd`V+`})uQe(C*P4_OWTUpAsFL)g;p9U`u-Lw^ z#InaMpQauDfz+wX@WZi@XQ!xhSi)92@k56%s0CR`8NdIs4jMJeL=ml*cI?N^q-QDE zL@WcrQmWeYkDKJJ>Fl(i&%sg|sQs9U)pm88sB>B9?XxgJ3o=6U7hr|`p2uWIa~apj zB!@ltB#vqefyN)#lfEY?+D-IZlnQ{Bo#VLsJmJQEPyAnzp|8l)f0I4;Z4S=%2Lj+* zFX5i!?%V8~?LOZD=I8rfz?}Jy@J;T>r+pb5rzFKr@n?Sw&1~vRiwCSUrFuM?giy4m?A)l6hP7spcWQbwtHL| zxu&X1n(7(J>FTjuQ$40*&zN;mGo_qPX4Kv38Bb^W1dpT}sMmB_bw-_}l5_%>RPD;- z%uK%jzLn6UY`H4uOiluc_uY5jUH-fO+x_o@QdXAovG6)j{CRtqWsLV=TRny z<8E^yE@X;uQB%NVB7aUKCz>0`W&YeqUep{gGk;zrKUxqdh*|=cXknm`g_$GPfE8)- zBetkLU}s?kk)mjEpjiIyh&lsK*IC>;-BziP(GEP6a}JbEHAXQF2VXIYpxaxQv4a6bB6;5qhQ5qUoPLg0nyi-8xTfj}VomB3e` zF9lwTel_sb=+^>YV{s}Y!RX6@m!so>@#uxX1r}BnnTUo0A?B}+grk#zN#?JKT#Q}{ zTw?y($W(MXFde-dxXj+`B9Uk`5M_Q}WF|Trm^E?X0zMxjp*}JfeI@V;zBPmzFXjaJ 
z94=fJj=gV^##|tt!yV#6O>c0a=5X%&xpL@ys3m+Q)Eb_j%nP->kr%kiV)+rPJv^+& zy2e5~5ZW2us)k-?p5{5r- zWM)2!XTjV!ADXv^YIbHkGKRSMCuT$Q zEOp_jaeh1+j)!>!+kzAGvG{B>I6D`^YcW4N5e<${;iDYVbTJ$c#=|qQSw0w@4TU2? z6bew_xjM^7Lc!S70>NaPnwg3RBeNGTvV6<0pxJ{{vEYUA_{62))XbG}erkLs?#~x3 zqw}$F=)~-Z=v93CCgM|9#tGQbSw1=*7fY2dC&KYdv!VUtv9MUDgq;fW(fN3I;6d+Vr5b?`1IX~cnb7DIst*9gYWZ9E`S8@mOYyT4#czhU*5e~jWk%JQ~Hrl{BNi_>b z$N9+A3`Ia##rYW)3qXXFGNk-as`dEPOn97U)pJUIU4_eaN%bF*tBZj-wZPg;N)q$u zh^6{qxe`iPSj!n1lu((+o;xr;aY_E*FBILQk@482shP?B?w*)7ZPN~#I)<>l5Drwy5 zzEg8wkRht(!xyJwG)5_uKtLx2C&wq^vwTdf=V#;NaR8+OOps92tE}^5Vs$JW2~VKs zWa#Mo#c&h@HaZs(9kHw9bHS^>lfYpUO8VFm3Ww(yAak>^cyJE0F^t9|@Z`7` z2nMk+I)42^IQaBP^_wLR#^OA{zIa`1Fo+`0bv2In3Ys^hRFjn-j9nU^3uFA2%R_^j zR~q!_2@GhlTJ39mii}@JS549a!Xk-2kP|WmazigNB*d}M5NzSLH>xz%_SlPO+%@1uGzC0A}iA-JSnY)fnZl;H}%pPfv4POgmoX+w+Y}m(o^&+{`AN`E01N3BeMj{<1w+gx1zHcg$7^?P4_=l@I>kGd%9 z7cwa*R?PD`F>{od{6CCS7qj2AMe%7*{(Eto@}|P^X1pjnY|)f3#fz2DX{X}To|n;< zd~>{137PgNKJ6LGRp2sl@iHZR+N=1q=UFbn@xITgrP?06EncOh4dp3i{Fdo+pfM{k zH$YRZU*$O3s-!hd?hD4cg++7F{EpCpAt)ZF6QWJ)}VzeKD8%uLIsOCiJZ@E zSBp~qhMF`f`DstoL>mt+TDaoF7N>ou*^%|*PorN>#ygZ?1=3iVuC2S2ui7(Is6jZ> z6Yo*JDCs}9hE^@TZscrH^3k3f`s!)3ZKtjeo;ut3R@C|PMz+lmzCH~u#RvGFK1b|^ zIrS^)wdYyRU$k)9>cj6W!_0i~OJQIMQ$QH5`$kU<`#>)EWb}_k8=H-XxBD1+&=s2t zPfSftP535(22iA!54*W97N3ele6thSC4J*@A3u-1JR0t1$$uU~;Xe<+S?E0<1NP_} zr5*1Gp}uN)(w?9c>%$J~JA0yc%a(=TGKd_%1bpSr3x*6uf@VJTbx$z5A5yIs1!ZTN5VijxZ^Jho0n;Vewhmj3W3kpx` znP@*ZGIsXlnKS2tgdpTg6s}kZDrq)2J2MfM_F-yO(fZtYWIjB?^RqlpV@vc%SW`9- zm8|%4BWKSAM^BzP04Pok2W7;8;3p!S|0-orssc(#h=gL4u`n7OQ~@=dKf<8BI?m4! zHN-$WgdqMa2hrT8ope| zU+njR2)xEUF6Ig=QWl?J@vT`J9@?u@_Flo>yWF^DA6$3Tr5yc&qks9#nq&B(-I=m? 
zuGl-*?X_!m-^0?*)za=81!-I9T3LIlY)B{@TCokS+e=gSYQbKetU0x2KfUhqrCfc2 zt1oHq`nBL-I4V&UnVElQ}-y>Twa>t%q69d?Oc&dN&noM{dtyt ztKiz2v~PWE;fiWf_CCShx7@dCA52;Xf5DjwJJxNEl&xK`wP(zn*@0LVZ>pe9D5zU2 zs84$;Z&-g?P_k|*PFcKy#ha|yv1Zx1?x;>V8U;sV($e@ipR?Fg1+^;$wLk1nHSS(% z-2I0;5aHXevbxj>_PV6+qU)gMa{Yj?pw3(X8`&Imp^Iuqi9`Qu5W0|pW4eG zYA~Ezvz%uz>=qomla}3p!E8(AxrID;(lfM{H@t51rfhYBtuC2Y$B?}Vbea0I$-YN- z(1(9J3n4G$q<^9EP;NLclovLK%;Eg7xdMqoISU1F^YLDwdoKv(Oj>Vnlctd6jiP`h zTo@_{TPJfug>U33n_|GmLi5A+=}Hw2)kAq9zEu~$n8hzZs7)8@kaJEGvmIaeaA}qubJzs(G@^~66 z6AIUzT7l3gMgsUSUjCj5L;!`d-<#$H_Z`Gl`q{)KObITiA}fg;4GFtq;)~;jO3IKa zk@LIU@2TkhZ{Zy&)p8TLKtI8MzXs3LNSyw?vUOfj1-A6*V{hmw0jMV&*rmxr+Q zd37oBjN<(+7c&2ziqsP{p?sFop-WjnFo!G~;^hH8l^nUw`3uJuYR}K86tz$51E16f zzJ=rGfZZ@;pzG=sHhw)u;G3O^0C$o273k`@;jWn4qdu*#*&vviz1q#6LZuf*Da#G{ zNXQ0pXMMAiKB=31Q80Ubm&UJzeSN+Vu%Ve4c&@QdwxM6dhQ46a1@p%#=ks42Sef=W zs4{Kk09IF!>&aN%Ao)Tj)CevJaS^>EKgJ~ zpCD(EqU1)xGyFH;1BW>e?7c~rQj5U8iG@M1iHIK=jM13FUgz_@#s#C68 zf#44i9uwgFO4{y<-z>mJux6=z(0TYq-kQb#z*fE1G$_~x*GsFDb-R+gPbPPrO4giS zEj^R8pGnvEQLNf@ZRbjD-*OJURHbdzNn6`bT|I0MGkMw{<>t~JmWw^i!Z}?@SJSGa zd8uT@(Jsg6Snudg_Uv8l*q3bIf4?J>Yx0~lX{j+ORcmsU`}uNv6)U(#BJ`n$)7VO=3)%;K?J;K5_2aa zu^6S0aRdwD2*gy;x11983F5_QLchj6aJZi=jaCU(+J1QStBMh8L0faC3An7^#hkMM zZwx#)Gq@%bmqg2j$;j+@ymwH{yEKl$a03bSfO`o@b&>`#NrNHAGh1lThQMZVeny!Q zQ{Xl3q0@7__*U@`&F@;@u_hfINlV9*6(wM^oAu{3ZCSHE0b41R8-)Zx!&?`sH$t`X z>t7)&wV_L%ET1}zl}}^-HkQ=09RKZ2i+5A;l1u}vdQ2;HZ)~AM&-mRy=71}+m(MH&AxMvy9qZ;6yb_j#d8tNjxn{~AeSgg3kf_8V}?TESME zw6&)jwx$|(3k|#P#Z!CE2z$<~Hk`e2@MC*jCYQ5IgG9`insGwkPU>#^zaUH2PT4aQ z;R#d7bkX#_oNh5E1lGzGZZQ|_WE?wJ7aKd4VH=x5c^AQ$RpgJ0*xNUWXWkGmKVgo! 
zH;tRWA+9+GJ)g);A9nCRT_8l2_*1^k;-%zsYK_sOCCCaY97uGvlImL`)!N0WocR;!OEEfwjC>QqISP{D!^B?k|G)U;Z0@zD20svRszj zej>STEP3kOYV~=f_^G2NUE@pD^a?e-X=hooqBrU7TR!+GH>bAv#@MI%*jdv4k#ucO zy2=mtaiQ611>sa^WwTW@Gn`P&hjeCg>Kg9@@DuvEsE*7l7-P`HFhTnQniz|uePIga zk|xGtUNmnJ8WjIw?RRavmwCxjZXV-c#dnY6ecf@qxtD?^&HPflpI=UtndmnHhr3Y5L)Wq-%G=VJc_ zN_ma@Wj}>Z9}TIU#RU*SJkDex`nE4H3YM-%zDEL z4vM|(X88MoHCr1vCh78;bX{Yru1~1zV^OyWwcCDKkmoE$#@?RQ@-0c{mY=$s?@q6} zdXx6v^`i1S=I>uyD*`szaCiIC{HkjpX&+$0cdoA$`PW^IcMq;I21DB6lD1|2q$d1~ z-XHz*Cnj}VCB~*{>Sz6wX~36aN~vU>772u4jxwMz&~3^WOi^_~P`>j+I+ASjDQdI~ zsA@TwqU!WhU_-g(YPq_Jv;i7bW}`xL;5Pt`N0Z|KrKwh+(4N|~j@K!n*c$bVgRafK z(BhCMEe`zeBR~Ei9MPRh5^Pg|04?i$K!VO~n{(^|Xsh+(ot(>-YPGN@R3^snT@QHm(0wPc!Y0 zkY1i?4{cQ`TZ3S0ShF>yUF9iPr{L;bI=?)+>e`XC?^w5ao^eL4WAgN`y4sTVHf=_I z?sV*3_N03E32aACI>F4VyPJ16yz1&qg3U=2bF&#J<`Gi80qvvKVe_z`%|nbCJhOkA z+>|c^%4fR5VfGm!%%R*+o-&>xf&Bd3%umfNKUDBcxmli?TVcrhxw)xQs7-5V4;5*- zTrc)J#+%V(OuR$G>;SRFd{-|4nS@LSj*ltf_^9O}AA(jIq%#x@-UMD5Kjm`I72gD@ zM1fTUy{4F&+0=}0Vm1oICBlwe~Y7C;YHh5kM=c#X@Ha(3r;cfGmmPTiW#zch8PU+CV0mFV`) zTRW4!!F!HX*Wsl7FzCVGY@}qx;F@hnEvO$(thNDytn)FMj6{~49&pb0j&H{ zIR0w>Jo%p^=Xp3{K`bgt;`7B&T&7HGv72Z84c-u0C6lCS9eRp%;+IBZCRCQ!1UB=E?i+@=}zJn$% zza*1mGJBF0oe&S1J&$sg5M(QT>s)d#g+ILf-etkRo#J$=aXQsFrFXZ|r&9G(sqWLc zrN(z&%=-K&KgWE|1c0dwhuM19eRpDM>)lK5)MhxmE+6{^J>=}*%QRG*-H=b+e&N;& zw}ZEWnOu7B?q$yhg(b<-7NM~9kr|7AnDsf&uY%wVb6efL0y!(-{da?^W9 zKO-+g8yONmfu2x*3Lf{!jc&=->k8*VK$8oR26Vf^`9|cP5g`U)Qy~OQ)^MRDy1^R> zlnRl9O4zC8bCY%w8U^eq!4k5C>=2@m2&kx#3za-msl~|O07nZsHVL;gL72_>>!c;* ze8U`YT(|p6`5yz?@gIXxBhh9)X*#h*nnXBkNk6M_`@ymd(Q1w_I#z$oM@t#>8A!uU z;pDTm&Nn_2VmZj1_Z~UU2AKo)pKfVu*%yh;YN7?DFqwLTSby|X?_kgV(cVEHqw}5w zK`w?BnNf1g7FdUL$KRy+mq6UDtVmnLI+wD846P5TNHjp>wjdGCCeApMC#ecAcyQPI3gk@A| z2dM}8vNB`L<6&9F(+BO>aUXGqeM)A47sm`_fFNIC5Mg6|qLj8v@%UV9dr!|rbm9Dk zZeS6l?nl6=Pe^Jm^z?=Mhc9g1Hn?SLe{b)2fA7S=mcD`TDk-CXWEty1;Wt_g!ihg(D}0ZNOJmX3*aNYXzH<#D40;j1wlyehMXquUI3b-xs4R zk%Dky_S5J!BW99@6{|XHDs#zbjX};Ty{gOEt(2xc73g`0xHqIysme`bs4OQVfwz(h 
zd8&|VRG{vw$_@jKt%0TbrW;hR>Or0*$_r8%#0YWo)LSYwy-D6JN-o+H!qhe}pShsg zR>mdpb(&sXh}g|WP%t6Bg25wCb$y@NhZHC$`v)ZZNo~8bc)JpBdW+)Io*VM@=;HQi zaTK3EE+A-9!nG$KG=_}NIm$s~Y*^3N*thtAl8e?d#_@lfL?a|l$oe4^yGZsAp$7Zb z2l?;Ia|d-I0-yb^bMbA8AL$HQ*tle{>E)762Ej@K=hE1O|1}j! z8AwQJ3E}dg@Z|V>L>A|Xg&-g#$sXf>@N*QIrV{@OIakTy$#IZF2n|ylla4VkeG16G zBH(dRaJNCU+J9`UU-TW&rd?X zPqZ>GQ#9i{=$nU~F=8@&>4XG3SHV;NNt{dxLWQiMhR~BWk~G_!#lcZ!+N9wf`&~E? zSyi`@NV;OZV|2BH3AEeRD{4~}TZD=&s}+!4f15auCG{!ipx_)_=2xBDlXgOTi+a`_ zu9Tx&aCAd+s<7zW4#b1flC4p&HLls3)3uE^4u1ROy2PZBULLj&r`k^l?I%{-Pg1F^ z>or?fYX-<$`JlczRo^Gn_oeC&3iSsmtd?<2dxfgrd+ub_E(-Fdt9_ENBvrjfsNQpb zSg1ZmA1avat$xK;zuw%lY+CM5Z9O7vJ@Qe#u=V(A^9ho?H2?*7dsD8a6<5=mfA^Yy z@4A0$%D+qS@4A`yv9skNz3mqKyKm;LIa?l(EU#+CR+TQVg7{_y;&gjk($&WJO-yBr zaaxnEfp!0`^~SEHc&cYa=ov{ij;34MQ!P7%mYwO&Eve4qLg#T%DXEU5LdQ{XecOu( zm=-X8>)O*wrPU;l&AD}m@ST$q&iw=GaIbv0h|Zq?&7lKBIFN`S@WjAY zmyRPLg=kY~XnL_TMgfj;5IQqKM4Iv=>dc0}NB@Rx=$cBgvK>LHqi*Y9N6Opzq9su< ztul(0%%CNvsyI}odB@f55$%qLq=J?1Y(U zm-A&oqEr7yXMX8au*zlhm-D+QpWN7dRLsRzEfzzWdJ1%6@aj}3eksQPls;GJmaDAQ z&N|CaU6gn8Z%_p;kh7H>h6#L~eBI>iBS&IXg`;!O`z}?FjIsrgj$-oEGyHyX4v@1- z&R%i|`|{g$LJR_o7s>exa@NWD1ddo5zJ{}u8xH32{~LX!7ULgM6fThmpo^z^11TIgA)ZFM9|*|4{%`_he( zHGAW_y(#VVq@9&%XI0wSmUgzJo$i#gMR2w}E-EN0&bYxAP1&ji8}NW_E4FRxo-Jvo zKiP5M{#T)eP*(irD9K0L*TJ{?%B`=Y-Cb#SXS%ZGaY0@gDW+9br}4kG<8eu0S@Dfg zWLF5qYtk;i>s=aLv-E*Kwy}T*1aaevt8v}ix9;vpyW1&8w>Ryse`L$AD7|UMK3h_M z*Rxc;{EcMW=$i8oYLIfa3C^~q+IyaloqHaTXtr~usx#^AOqO@D?Y>JW?@E?$OFFl~ zWmzLrIqQ_WT_FFa1w2p+Ncwl&JHO^U0K5auR-TuT_aZbw5~i2)X8tA0BIvSy zMx=afKoh+h?Lfw`gt!k>-3eB5K1$A?R(E!-UnKqP4qG~zD z4(WDy!XHdV1i!YwLvt%n)3H)^@R!hM)Lu-3%cx)6iCh_@$(>fk#Tpc9y`Wk3^m^+w zOY){0v>X&)CD2yCYd zEf0b%Ud)Y-U*oBP_@9wO0Pw^f;J3hmM%o+X{{bAF<|dT?GSo162xwSD1_xy?7Q73Rtx9oSqlJ*PM{-$EV?z=I1^Ej6Ik}_-h?>wUb-=w zuJWa-x`e8(8^^IPxV^Xc{M|k4{m-S{m1(yx?XF0<+XQ#py1VMm$lYDbO{?yqM}_8r zQo-4KGyjhL?q$e*6ml?K)sn30OS=2urW?B6?RlqXwPEYcV`)zn30S@txG{=DHs1QX zW$#wKQ+01-4Jv%*Z`;>vTJ9!Nt;0g=@JHsgnlaiAXW71H--U%f>8wxNOVwswxA!cYKei8&u4pw|@kP@$ 
z$m*uXV-tDkZsk`94_oxVg;4$JY-K^XY2puWveX|(xIQmAPnB>oSo96`d_c|_E$;{= zoG0stPy^&Wk^6y3As?8wg1UTY$)`&+zv9!KKQ&mgl}XQ*bOTs^jU3Gh^R&tXR-n<2 z!GvOAiF6Pf1M+%h`+%!c$VpVEi6<;Wql45D!^cySBL*-4Y96BX69o$md zLWahyL8fa%1JZNar{$vfrW+NX_5_KQKi#aoD?Vi$f?*(UC3GHsx<&b}J^6#W1^ZdY zRJ)ds;=@SmRQ%c#p+q2L{V@LW-j^8-3_hbR_&?qG!WnVCe*<%sn>ON-Z#IL?3WEg7 z-1%dEvuI|-pR{wb4UuhN{}FAkqdW_E6ZRG@l2{0{QkP>;01d;a2yK5e^7ArJN&Wd* z3HT&ssAlc2lvueJD3lS2DfmP)Q`^^yVF|<>2;zuJ3t|3GD2jD*egM0!uvEUr`2ZukR-x&a80~4alg*G=&j}`i-6c4-`t( z5v?-K4D^9v4WdmBxG>o}_=KV=58Y~xX`FAM5?SRMF=%7ZG3G!VIyrme$Iy)<4sG2E zsg;6oGqC?85!R$)7Rd*@p_)AcQ>`mI9!*0irVW^tIKxgE#GIS6#~0EV!CiU9C&KE3VE54V_B~p<&0ZV;Lt`(VX(OK|^EJ+rBiu z;_U%bI#t~*RCh1EB2@P-2T}tg!obM=abe)_O7&rEu&uqzI~4}+>ek~z%kj5ta}pS!m8X#+ zaVwFoX-L)d2sJ(F%DT6E-s(Ycs(er=A7q{Zp?o0isd((LxQjEKMJ0F{oA?RZnbD?Q zf~za#qBF=wg;du|%xq+eCU*Cui7DrNGaae^lhA+A1lsj4AR)2;q49XQ|Pi-G7Vkk z;=3?Pmb z`R2|{0eiJ@fGF~2+HkrNwS_G#)+ z4$e^LhRsH(kP#v@na9pi=e=Qv8Km5BfvgTgC#p#i#%x5HPFm@tb->2rTSGP|!Qfmp zlwz#mA|v5o$VSR5lR2!UB9!F7dFkS3ff6}8C-dn*=1^IsV=l!Ihk(!47ENJBrdciky}~6kb9GG58A&h>TrdeswP|+sts3#>Yf_khmz_MvjNY>r^al0>euF{ezh8Z4OedxZrEzlIeRv$ z+Cwc+eX3!n*lU3ZwDJE8O*}r26@g5+{QufdUrzgJ-BW|VC}h2x-~=^~?UWA3A%D~)3@!wzz0 zuwB-y7m6nHq zEpP1Suc45C*SA3-QNiQMVkZ;J`*!W}eWkZ=xYGyctDQbFxB%^zTpXqms}-e^SU3!R zA9#t9?gwKof~KMTB~0y~l8+{|G_^H3*foTSHkMSflPXyz`YJ^w@GLAR zpi#m{$O%x~0@;cbPb&pa9mtcCuUL|m$K(vNFC;o(dTaKoPA%po%B58bRZMQ`suYsc zVZb*fbEnFQA;wx|b^9igD<_6tlhDi85yVIz^|XFqzZTEWn?o|6Rq5T+X zeYB>FN>W8NLQ%~RYf|<5g!+9eMf+HcQK5bG#=$kjc!FkIZd{HnSFV)rU6e(0Hjv0cI=2GD z%cAYlR0tMQ+1M8AaBKi}n8TqdG8M}R_YJBlOkO&nuHxuRTAgWiee`{BF`+>U!AlUY z(vPW(phM+1QW-%X%Wn{}Xgd!RA?SA=gH_9DRpFg+l@n>Sk}!dfDw zXyGc~)ZiQ7RUv%xl;UE6agHGqCmk**Yl}KHiq*QI*))-;4f zm!mNpa!+o6qiB;>E>76Bp5G*nBcYxhgu2l$bmjksYD6RYRSWS~VBQNW66PrCAdT@? 
z3w7az6s{}Zxc+%kpKPfA{{#JiQT+tWY=zM3@#W#&tiPOJpubA={e>A~w8pErr80ie z_N9ca#U$sDV3I~sgelWZ)rNEi*rRbeg=S$&dPa3Kz=~ng ztX{2!UB80p!i)6b&(LmeYB&C`H%Vat4NS7BSaryw)Cai#ue0+0=GKbhP1b?3M6tG3 z=-RDV-){ej)WNYy9m*39!#X(hbqHBOJE=SKbzGeToJ6Hz*qchRX29&H*@L-t0^`CKj;7Tl=I_5c%_SbY%oj)VgS9=JG`|PQuSa zs3+OeGTBNRYESu-be=rDrJff;i5XkY;; zWxk$DXdtJNoF;PAQe(tCkhNyiheT+Sw1Zjv(g@9sYM6JhWyjvdp3Dw{_ntD5-bYuu znrT&5s81s;OPy*YX|Iq)m>b6>vXPD&JRw8M(!VU*zFM~P#-Rs_*5%z-1gBrs!DMeZ zj8Rr|;}FS@dX~!t$MC&6DeUWe#O3&kllE$)sB3!r(p#5OwOfSREozDn z!XHN8i>^6$el&vkb-jp>^BgBpRuN1aHwpG8)F-*^$eR6V+FqiBz+%A1_QMa6V>S$F zcU{`;Nmg#VcSxu_01;^Ap^pv;6=wzaxwOZ3cS!KGFP#=V-OEFQd;2fO|`SI+Lh{RwTXvKivs2U6l%E_9pWDm2m zY_Gl(Om^>Gv+pCppQ@wWE!eu3+g5Dbb-?XgaqW|>Xm)EBHlertrTZJbyJkNOcoY~O z)-=5R!dov$lIBf@PVE*99kScDVdxAje`C!)N?Cm^*|lfQz8Brx+DYa-8=+v<>?bwt z`gCI}J$(;aI@lLUayfxzZKKLCaVm(ydo%<7w_bb9NHtj1f@#!FprV zlfN-Y?TbaaF>KB15v_OCv4Deq8|Q-2e6eT6QF~TbR#8?Qb-vuppZQmt>?(swn`+OS zIk8i^-DFt%qWF|PS}fM|PN%mkU$y5mjHqJmXam+`zmmr$xqiXfnb7BIe)hSkgJY)x zk@iIGc582nPglxbB}{ulGhn~=rublUK;G-JdQjc9;)j&D#-%Fr=pgsi??Y$W5kI2D zpFXPiwC6MQiz5Vn$2V~tirh)2ORbbe^Aeg2Suv=gkpDrgG{u*2(6`Tp?2GnoR=LEG z*#Mb&6UdG$@Mup7vTy%`JWkhp?b>(6XQZY5lpXQe*Y8btMdKasW*+|+UuMrk{}{CL z|6BI5n|U)YZ?c-HE2q+9;A0mse(e8%IvZfHuiFt`t;JS+ui4_nv17lGdaoLIvZs|$ zpGaGU(iOd;#(w(vei!qz=V?fYNAgxtY#|C*etyE+C$YH86b(RGgLSz{B|q)Sm&B)(Hyn0XZNRhR+R%)7xtJLxDIQ}|F-Zysi(xreU%_8Y zERigP$*O|v`fm&AM$*;QOl}90SAUHB#eC_KZf1LI;ppl4@tODmX1WmfTg4RFnqI;v zT|PC*L~pRjC0~ptTY6>&C&wVk>Ry_6+at;9cW2oj-?61*04`Xz>dM;9J+-hQ?zg}>7;)^vqL6d z(4Yy$_$?HdI#^QwOpuROTbNprE{dO=nP1?Erjz7fWD{GGdND~@3%!!mjV~vM=mY*| z!4oyn46Na||4md;Chu&J*7Lf8J)ngchj>#c*!^QocJ!qDl| z&~w7jbE~b-C!M&Br~d7eZ=Fn4(-k{63)3wfsg^xBRG4ZxDB!=d@c|tX>=J6aQZ;=- zO&?A@wsxjk_X(~0Qmvyx>!_B8E_KpLH--BAOO^KraCa$*XR6ozeQ9UY^2p=d9N2_I zNDB!S+m;INd)8_X-57!Kyf)n9)3`i<7=9L`84XP`%XZuh_poz&s`IGOd6bztXcF8_ zDYsv6`Psrp>be2l05T#vTi0l&LCyU+pa z*{P~6Le&=bIv`XHq^hxe?R$jwJ@>D#Rh+(Ybe#@f()eBSd|3Bhon%Px{)=ng zlQ)i#8g|FKFTC?Ys(DCg9+E|TtfM*vcZV!YO;~X%m=6NmBCZ7oZ402R 
zjRDbsT2}e)np3_Z!8i2hmBTmlAG*sWyB0JHwh8`ilF7n*SN`agKdrx)5d6ni-6v@3 z9KqDNGx&qvAB^AGNyd&)g}p-i-uq=j`(c3R@haJ9z~7tvqwzm2xfc=q$5!3PDU(B{ zn}v5;eo+5|(|0-rXDgLS1`_XGS@Dl%3c0FnkofBb=gaLMjU?-OSL@D5BOIrMRT24e z#YfG_+AXWKr)X$1HTU*aqhWkE{O;^Kv%is{`}ZB0Vm5Srg2R_`^a+l>_4?$*#!0aib05dtID8l?)6S(?+?2dwIi*g$Z%*w!C+t0^M&5zG z-?KD^xEt1uX<8a+-N4d7>jrA-eb1Zn?-2Yu(r&D00JME40BvW0lA#xOW75^V-r4)% z;r9-wI(G}5yVKs*Wom=2J&4#uC%vhnHsKhH;U{*Np-53lQkxTzc_d@%`(<@aa|W8B9gb8Pm;?l$)0MyDtiEtg-jZ_pQS4(d3q+f4VQZitj$s@<9Ks=LPe6T8N5aJv1D<6(2Vw(O;ywM*3cw4&w6 zE1JBOvR#&`P4%4> z`p&Mn&aS%$mZ$Ig*WAa`ZG9g)-gDeEr<^{)>05I)D@Xk8bUP3h)a}gFPBY9C3C<=g zVD>8hWgtRi4_E!=!AFveM}LG+IvWXkOZ}*MVt#}Z(w_B;)gYYm$oj=q8qr8Ig3)It zX3Xm8X$`o;EbTW?swz*@ie~5dRBnzIZg7szAXe5nz7i!p$}*r@jUmwZN{gnrDseTk zo`aJrhCF>E>8HXqA*vF$e5pLzH$feLXP3rL#Svul7+;E>V=G! zx{69k#E?clcqu+3vY4J^6GfrpZ;TERt)cLR`HR6>y0h+^^ocm${A+N;s_6Lj3t`+> zcNvy?arF#HMi}pcwGe11`g0i-B094_NOzvlkn<^*5uYO<+cse5Ps#b;Gu#~Me}&5? z=?quv3Oh&k-B-W$s=S+tX9=3Ghp%)%A9QuZ3bUUMguZ0b)n?G~DLrHbL!MT(X6i`fxw zFoF029A(W_ga^!!yTMdanVtp{B~IF;>}NDF*aUqOgWIFT(w<7Gi}{Is^$3}PkcWp| z;u%;d28&%4dl?21-YUG4m1Er$MOrLaKou6Wx4SKsQ-VH!5GXr&1&2 zV$h13n3x$_qN$bj(+&w69T8@2`N9Sx`kh%B;MKTngmym}7v}A0?%}s+(Y0-_6sC7^ zsK@d}TT!<_S+GDoz0V2(*%$4d&H6^(Hdx=#&AkR~k5Ge=^?_<58jGb(QuW70+GYFM z9ciJ1;qcrcoY5VTPkp-O@N@iBe0D|&Etg(T@>$Urww)N~VJL~64LEc1K-9PI$hna- zDlbhn^Y3GK+~B-$AO3vT!}0EgJv-Ci#bUW%)E=Q$(`v4^90N?+$jodbleRCmk+I$GD+B@K4zdlA?S-VvBJF@|S3HX- z{z8eVM$5Bw9zClPFBANBiDwlHPDSS;VuAb?D@1-OUy3oLU`I0?!F71-JUENJP_tqG z%l_=ffREio8Lx4lS)u>?DMw1cWYUrBKrZ#CI!0GIMl)Pap*3wk1fr%8_p^`6=qH&} zN;rkp?OlTQfZ8?SQDeY=cx(1W> zLAdw`qyFu?SK4>Onq~6f3qW*>DnP>&xxRb+&Esk3>9n&t(4c}&W%Jj78HY}ivJbv0O2C} zi=IUB>s5*3X@>%r_JmMZrAo9=MT|_gw-@PHtif^xH3^MSH^3N;6z_~^k;La1J7Tze zT_V>AdL0Bl4SL0=)dEM>i=n`2+C+feq@>WEh})vQDL#a@DSqvlC?xY5pmj_}s_+Ok zqCE^v%(A2ZhF{RbTeK1tU!p8g1od%3?eGJwWRB$|Z1RZQDut4gD(X%xpup*X3Nj&A z_(26_K=M(%P(CPJd3#;-ZunB5*`r^q(6->|K_#{J47E;ipn`I~XA0p~AGk2H0oRtu zgKJNi;T9zd;1(w=a2*L7TpaL+izEDSOB2O#U8wW0maF2!7k8ort|wuITb3w=3q={Y 
z-h>-&MWPUHe!>g4AW;Fg5`91t;;i5IOuubftV~ogwYRFp>P2G8k+w|Mj~~K3s*2P_ zeT$GOgQ~54%2+ZQHC2h~gfrnt;CQv6QgpS}8#hfe1Bi7n3k1CKas3*QzzPtm`Mn&4 zj)uafrZ~O%NH>6v6rRyOho3AFp+^lua}%gf{J8REdQ9E4b&!~dh{~Y(eXip)}u2gFnSszozbqvIyQRho_+L$npn=8jv%zyO9pUp z(_ecs^n>R2<>L~Ik8Y$`Ip&^Q9rfZbQ#O7E>uZc&wJF3iW7IWy`0zY?)n!5L(9#jd;)1|V5Kt|$D4 zA+VZa3SBe66Ri9-a<0M=OP(++B^Jr3!0hB?EF9r#r5>`Y2`6fszU5y|)o z=vG38#UjOe6)t56$9Tfj_#5OH0KvaXxjSG5dRDf6MU=JZ)CbUoYW#O71D4el^0BJS zlaHz*RqE^HgJoKdUnJ)na76pm3@**a-4iU6*C~8~oZlknZ;|r`IRu&LB|J_-+)4Wb z6_Z*c<~NC6-AQ^Wit;UAv@={#R_qZi>}@I(<6onkseV$C#JgCiWqM8;Zw7n@pUlSr zqTCJorAXpQ5WLVwkR<$hbhp zk0rRYED1u?uxLZI&iRX2lS6Z?7k_>H_oyDfL(Xk-Slw728!IUBK&lXBVfR_qh`tUV zb*8GRMY>+m`8X84BG(Z(5t)zSo?X@~;7uu)2jgBy((m*4^8XkQp3p5F=OjX}#4i!I ze-S27x6ZU)oR3E$(IA*Vd_Gm=k14CaL%#2j?{6s6JjzsT)TPqMIYu=ys*-F8^5qoY zO02oFFfaJrR!RNJXa+LPs)F!bkeF=J%wkNTMk@CIq-=_1MvOeN#8PP_$pcur7XB7N z%J^5?zEI7d7`9Y#y6lSoDmhoFaMEeg6s7nPdiCS0n1k_01CD2$stWSGM9v*@nA#e3 zG`|au-=|Xq`wx@@o2%M1WwTbBtJ?50n6=WNXG4t5*sRI=eJYf?Ow7lvGPseH|7{BS zE`=1yZ76?GN3B?@Us2?*tR1K_F&`lIea=tP?t%z}TPWQ?#xtsG2D@+U_u;(8ecH|0 z%Sr0u>Pgyr;Od!MY0`}7Pulz8uDfdPgn#(LnyWi$?}i+oT~g8_*jm;U%Z97AV`*2- z-Le%|EBI|i#pzN=WS3t`I`@O^R#cosu%tjlrWXFfc^Yrto@DuUum(JxDR;NvhN1Tl zufBJ6%{}x0#^wvSk~XO3K&^^tUO{(??73gRb^Z1?Zha$Fencog@}L|7tv#vkqeA!5 zmGYy=si`;BxK(J}`s0zGocPg+wZ{DqDq7O@!(?@xtkRRc!?eFUE#+EAJ|Cs4 zCtm~Q)kwZ3n4!otldpxV>CCi}uZ>dp;mfo`St`|hV5Rv$rUUOciZWZc!2>42(s#pt z^HS2+Ck?q3TgxwVFrHAqvhK9Agx!}$f_}Pf$Z^Y&uJNZU>Y-GX?m9xQ%0P8l%Zk4> znknOoOTQcZR!|wMN&9n4N0)Xcht4Pc&#l^@!#G{{fLZ%w#!feQqO3T&>W+CNzHSY8 zylJO*y|z77J1o==r)qZywL7T%#sLCBx$f*DjM^V_64Nw7lsgdmgP>pGHgab=y4 zIJD54qrdcWMaN7(^E9Mu>zIaAqu>J18w~YZd*sIT;)Xt!cw-Z*Y~TlA`?>cD?!Av| zP-k4>*8m-hJL4<1I*5FeO?%hN>+iOF(7eW=#!BHlB`*=gg?CpbtGb}VleBj|a8=wm z!PsAZ$LQGD&;KKSV`IQ6W+&qiY+S@jh+k&U*eL&>VXZ@yF@N3WcO0-?^G}dMy58V* z^8Fz>Y-9Ze`A8vF%!yv+iS@ysL#8}2iTDuthzY`9hEKFWOP0)^Ug!Tgh0W4eo*V}` zL}2s(0*+{A_K?`FBVCs8FX_|2fq0=*|Qy?o9Ajo2_?W 
z%y4)t&;R7wkFNdX)gQh3OM1=hJdtDWeBkh8%;Z7C;ZlSG=EcmY$P_ZKm22lcO}r1E=!N+ZP9sTH>&~^Hk96~bPJI%!pD*H2L zJW-a@qh8!lKp_^+>VH&7zJ#gNYz5nv3;^8@-U?=N@&2H&`c7OZYFNTJl z*?lLDE2PctN4ZJ>a^Ccln`>x=(cW}xN4lXY-QJzvI+U))YkPVKbFD93>4Va4dP_gO z4s83h;~P2V$_*DjPyh;0E_`6G*n?{5#*a)Pd$n>-SH{M?xMZR-Q^dT*Tvc7h!MspF z_TFi_bu?4LUQ4;E+PisgwP#%H)y>uTG9KnFg9g^^UXM_^Mv~TOdSjJajhMhdgg7QYcLv_w}~KZX5JPk zh`!zVR_BsisO-wLvaq%d8TnaQJLzdA-byTmgu1>AYobi2e80sV^PTY9k+&jCBOe}r z?|2d`nRlB|xb3qpN-6Q6X^c+bPyNwIr{5#uni*YtAzv_&iwh>q>^gva5R-ZQhlsv# z#7bDq+4;G-Sr`)b$rsc4E??=I!0CuEJCi}OSc#P!^G))zQQzpPVV`CdE!NHQV6xT? z!X^TY{vI{K$uezE%0b}f>yP4G=cEf#a`@K}pPAT}6?G&r?l%z{qx@9eoY(QPPyyWP z*mDO6TKT5dAQ;$F@nEnY1u)}cP{vVhQCEL#X^~d5g3T8x%9uA?CaBR^%k@y^gbA}{ zA_vQa#bj>1`{HMqOs$`iBXt)g@Fzsa#B79ws^GoHy2md}u=PpI3xkB@fdta24dUs~ z&PHO)(6^Wmf({MO=Tm{;jB~t&99lk!WaK99t$CQI|N?Vj)BM!!uW6 z{4~99{X!%_bT%{}8RvsIGctoqn2PZpj)1fc!+7{CkIR~`sHA*LIoGcVt0C9c_v?r*zB8qHd!QfcnBnnSTK4SY* zK)*m51a#nIoS(wkBk9&DMtCqXu!LYB-nz7luuX*o5Mswu@>DC=EeZRjeTS$wo`inl zz!yp@=fdt}IoiS=8C}@3=EaMaF zRXTx0iy_aHCEDQ=hn}%89HEj*>7aZ_wjfx_(96*cXSNt>#yU$CavYqdeH5uN~wgraiki zTsqT6hrPraqs+BcXfGw%N6vnVU11bkg7*M@FJ|8%l#P=*O-wpXCoK`gLBX2DxN;*D z*UjQe6%T=(!n9TRQ7Vgg^rA=iU1QAfhv{3n?i;jWu=B~CeT2UH*w>A#UN?Jwa?Xk-#*^ZHQNtbBZ$yVC%Zhi&pCH`}{T1i^OK$Ix>l`3DUq@ho^j!!thn)kpbT<_0vouB0Geo$6>w=n5xPjaO{D|Fv{VYRUK zwfx5b*m3jVV}R_)xObX-ch6-wJeH!5=<)a)rsr}@?xeHfF-M-qeBO(BCTp^+cT1}NnSMy_zJdY2S<(pcQZ;EE7|-3*Z0%@nTVD*Ool))ubha#ff6 zzMg$)S=nIcM^beI`lkEqp6>p-zy4m``)@Xzm4YW~cr~)Km!f`!nZo1ACzj?76m^l} zDV~l}F?x`uF|CX026gb(NA)qopn>2FQDe+BXo{H!%`s+>iCG3M1aFL5W9%Rs;|95y zZO}$=rl>vU7<7=dIqHnn4Azh|6LrPhgKmB+1{-Kf zM;)Yi?mWfYgqkaQZM=i43Dyo+hrkuFjlAJ{*f zE8v?}z&Ede_f*FZb9@WX($2R6Eo%rZUZEXmS@Qt2JnW47s*m9_uRmi5Pc2HeU<69)G>V5!yc?*ipQgs?!G>d$zgD#!c$zGswsw;Z+S?mO zrt`wWc=P3f0o8pn#D`)bkq?fDiLqo*0@hP)Q0FY-_|!yFQ0;{*FfeeH>Pkw&7@x>< z6=5EARV=}eMFlAcToO;Dg5gB`j3A~|lPILd#CX_LMh5Cc{9Ae%z<212RH|4wV@gxs z1dUTz9-2ziX)35IR7mR}m)EHjmDWQJ>;Uaa8z7h0R#0YyT(Q@*L1a^oLQS5&0&IrQ 
z%k%{0*9}Z|KYbrb-iYJ{O1#ooGMNxl0#9b{%#kDgenvF{RuEMt7>vXtsbEkR6QbX&8YCe)ymDnrFh~|D7?eN|QC{+#``s6NQn6%DGLqbo z3W*~^YQy;0_D6P{+Aj1&BSSsO30ReQ&zWd6)&r{`M1#V(03s$K_C!Myf++QruT;-) zGDz})%{F)8dv&&@ z8{duDwk`N>&35$RyZyh|=3A~Nx%u!7*AbaJlG9UM%M!Flh4Yh-X^MqE!VF)B1Q>L% z+@pxEcwUG74JAcJC>28qu)%1N4VIMvONANiKBJ;px+p>x2 ztg=&fPddAiElQhZ_hd~8qwSa7leOL6`?t*Q$%bxk^6ifrPi#TFQ^gwyC1 zy#SFBMIk8$gZHndhTn`I3;{St-L`MM=E-b&O4;<3yzx+uq7T!}*=Ap+xkqX4k()LG zW>ZVHr7hF4Nom<6H}?W!b6d8pBh&VX()Nh#+X;wW9odeqOvhfOW3Rk+-@Cq?#bA4a z&go4StB9PlY>gqWtuZdrMQVbEEvGUCMtp~Q2NY#EDlH=j*gO~qm03wp0pABrNx0jA zAVfASNcX>dH>kw3q%!NJ1 zBJp52lnf0;qLI|ZWI@tv8i0&xd5!(hQgIqx_m81Cjkt?0QX|wcN<@`ZeFz>L0*HRF z7=pu*sDKM7RqP`&h{`%o;xJI;;cC`QTgX6ElU6NUzU#3PS=THiE>f_IaflU9n&#;d z`bz%0DU=-5Yw5~rp%kfJOJ7zCB}(;LhO$~HU8>hImen%pVEmxc^q{(oX;Z<~0_kHK z_=MfB+HjhOMKGgI2@{gvEc$_daVvtA^iY{T$%_n@G@#A!WW#daEh|C|AiV|vR;I<5 zY3Wf~dajw|M+W2_M{l$|ErZ;zZ_L#Dm3seT$F)c0-p6m$@1M418=5i=T}ngO;x>89 zetGi~HyRF1+q2EyOmjeK4lJILcN~$o_upt9kU@j^)?|FWim&(DHo5P(yzAK;zUO47 z<(CdumTS#&4Own=mTSp!ZA+}lY0XguA)^|Jf>TXNQ5cSlhl_>*4A%gEgiqdr#49?& zDUhmsUg}I0zna#8)kvRbVC^caL4JSHC@B~ZqBT`CHwp&JF)D=|N}e>GDr%SlFM=6@ z8c!>y3N0O@h#9g>Xd(e9wU-_P@E!UvMH6;^0gQ|(dPY~By$TKfHGL5xBHySpK*K-O zpQXv31#_Gc;=FV=k~#@)gD$X9HE9Y+#NdO36XErq(=>{uZ$yX-<4JMP?M=%lgaS{zcRDf!U)oN9PY@xOSzt|Gy<9 z@@tJPwBMvzOZrfJLc|GDtq~~_mr|j4SP(6UaOX#{%!knvO(;GB70PM&ibjL8a;TFn zpG?Y^p&e-e01nzjan20ep|Bk?+xrt~35m$|Zp-xcU+?W-(nIbZrQ2b>XQDjoZ@9Z< zuKNyKdqH?(@Fv@N%iXb%(vVq)n@BUi@vawIR*m4jCAX;N(2x|2g~ksI_*bL)Mo}3I zs@7moTM|ffL7eAkzQi01@`*5@^^tf=)k`T+H4G&ZQPmIy&x)9G;u`FLB_^b{HX;WY z5&a5r1L>UTDzOI<5L%Pnk#I`HBu&($+oP^ws&SXZ~xHdH1^ZCo4q+B zCSa!P8*^roWT=LwoP{KwNnX z1+ScoGS%dCv~jiE(ESkw?Ep-0VGDBk%{y>QUp>Wao7 z3!&2Lc$NfY?A1#lnCUog9JEic{u=RdXl62`2?wvn{L5Tjs}-H8z2bm%{x7}%%a#RtJb4JJck zDuU4+byZ?4g+)aSsU(4%EqLV zHx`ulLw#*J)d@7|dGiz#-}4Z?GWqW;@LgNkj94GMjn}E~>Pi%+3c)ifRWv!PZTnPB zVFJ>{`AS1DY!Vo-Z1s?PE1s*x6ARvU)m|`VrYtE8c#tP;;T^mV{#f4mh9PanZCj{^ zF`Q8hh*Ug$P1;hb^ZBSqGoP6XYZ(;~s6#5!%tKJ&N?RYE3f@sR;}|EZcwol!&dahH 
zXUoRQ@^0P}p&xLrYtw9@MVflhx&Dl!yrHgbw_$XJ{H$~ICG_?;=)3gH5 z+sbIOJrr$qY1_lEz(b6>l}NgseQsmAyjFxV%%2tmYn_lDxUmtegiQ79t^W9#w6S@NCu1bi%_1#lPbt=~01 z8!yqlEM~aVdl!iZ+u#{^9RN4YCJLv=;NV1psC~lTMxu$KP*gjQQk@vn zT861B47V5pg@Y(DGOpGSL3bqhSyAG*5r0Drz{tgB1Ze7%MJLelQ;k9_nVJA!W-O)} z$H1-GA!1-mbrnyGhz)|rW~!M4B%^}io=_61QbU=?%l6A_JgBsv1Yj);GJ5mGUPQMwn<@| zGHjQ^b}f!vXSZe>8fOk=Z8aHNi(+fZ*t!*4_cil%+jdfL$JLZ|ZCKkkP+;)@cz09*ZBX=6ttkhf+ zG*_Q*u1Rj~x#p2uw#kniR$30<-`r+->k~Jb19#fi=bOs{12}z()0c5>Rh(NZv#Z*c zEkHZ;)taA_cDZBQ&kXX~-ST5!P}Y9o{=Rm~eaCJx$M3Xvf2O|HKwsW`Uu)!b+kfVf zJNC$Xk0~9;?(b`ty!+W-FwYTBcR;1bA*c}vNg{foVH6UD2OSRLUEP8N92pi(U2q@h zYB7DlmVcjb2e4J#1He{OlzPBdaU4J?7d-I7=-^$=761DR#<<|ghu8fUwWg@! zf_d?~veY8Nl^i^xMeaA0iO6w4h!X%vtPLfaCd!PVWHLHY4E!#WV5I2R_zec~;P?-u zjW5gDvasX(XZrDVWabFGGTd5)TT2pc3fGq9oFwg6I6tH_T(iP8XSi;K>n_W#SGe^{ zZYu_It@+a;Ej;NEpTxmFh2Ri^!w5)JsR>gU!4i)lIF8^5f};pfZHi|BsHWpY!Ha3c ztV7^K;6;ED0s>A#Li#5F=cx~M1|yT>Xrq7Og(ZmR_;aQdZTupA+u@lVpBbP1>daSj zMuLI8-QJNiYiTAQkIGrJyp^i0&#_vXqgFNKY+Bk*)it3kTyki6h&8VTG2}E_mvsPn zl+-n|eKUO+2eU%N3~OY*C@W=k%r?z5-Gh*rcE(1CsUCtrmjXlHX?hY; zJUt0)3~4Yk;BY}VAo74E^8-g-bV#L#%A*h}6BZ&-uqV!lFG0QkfjxPtE%jwox`Tk_lE#;UaFoNQ*?yoCqD=43Ntq;T10#;}R*DqXlC zM@9<e3^gvqMOd^aq_1OQ~I=? 
zH;_fq;i5nq@Bm>-ri^I=Z!9Y_k}@MHGnLLMs>-$M{TDAfeCqg2*jEbvw5O_+ta@U8 z)qKUEeKTeLlrh2?S6L6$C2JKqT-F1VHm8|k9k2y6Ks*MGlhM?^dkz5(MH~X~j*f^0 z)x-;DB4NRAg1Z;M44M#0VlA}H#8&~p1s6Q*-4PUa#5NAc$NZKG_EHUE;;f_^!ilJa zCN|H5xlK@o3@I^>p9~7Uu}vyTKwAFAyPeQ(4A7%g(SD_P%PLH@(STVi)$lZig!M z>wmKKr@MZ<>)kW&PrWw<8roqxcMx_6NFef+`@)y68Z(`JN@w4@&%OV`doSdxfRN(c z7Y@oLF+jGN->8}sNphY+62oKhu*${8QeZesLGZ#P)q)fUBe7&u<@kt{45h*+@hDS7 zWu?+*Ro%Fx>R`r=p=k1CD2(Ew@Zgk^efcgVDn&<${~)kMU)2>H?$PoV@-3x`c0e`V zUkI<2wW$BYw3s%Oo%8AkR1@hYFztG$iHJG zlO+BPR(Ku38wkD&fE?o!R?|4mS$SJfd<$wdBab}|36O$z3XWG93;e&uxGwb0kKbfE zGt35s*>F|J1okO`eK(lLKcOhg*x*E#Gr@_h(WHsc@`)~rP#j|UQdKk8M(`*fcW=c5 zx4G%n)c5G><{VFMHP6vMERR1I7q-+Cv|UB8@cK7&uhDSz!!U67_wZ;E!!6F9rm+8{0fcRux8b2fc9aP$hTvVY;uN}F;?v)LD|6iB> zN$b+D!19XI2xbtVE){W0z!Cj7FkRdiWeU_*P@r34n}_4e^e3{E_!@#)1Q!s@A@~-8 zZv%J`Eg3(fZKoeY{m1^8wd5|PbJx2E-tT{}e`PI+8)%tMLraKk+!ukg%8gs5f&M@` zt(rC@cPMcj6%TJf;~lJ0unY^@v}{CwAf5PmN0c^{X>+cE1jt^=s|4+4)ti737ZALS zfCq3FwF6wFLJ>;zuJ}Dn5njKBsdo?r5L6TX1yu_%VeMkR_!Fr5JTmHEKtfZszDia5 zpBCGU=Nm%C?gJ<9hNX? zvwzb1S^Qg^_{EiGzo2@Hv#-gDe}MWiSOW11e&|Dht3skEsx2S+)*|C&QD6*}XrW+q zdBbRK7M`N%c@j3mN-iOGhR-N&=!%CJkT4>ArYvyEs3&K^Ln(uXgRiRCF4ZiTQMT|! z>7BNuO=)vM%b$QTPFbhe&(OB5)Mg=-wt@x0f>CY2%ZlibPQnNZhU^p<9|LNPpP3pA zxK})c8p9uenwpA+A^WM+*i&fRkcT&=ndRs0aA}hp^%TlWZ(_k#xhw+~teIG_T(zq- zycq>7TClHCQ(W5kk}(Z%64n569C38-{x8_35Ck!UZ-A#f;sN+VbA0i5Nem%CQRe4V zD_mAdMMlOFV-g;jz)y38I1yv2K9K|$7Jnr`+%)105L-^$737eH2t5>aaDGDqhY@W0 zrwF15VhBn*q!@7|{<6wwL7?Di)s|1?&zbK=z(Gq!fc*1ot;wzc1|^_)9!+w7S)UpKejYFqcV|FZw>z~w-uZJ*M%FUz@P z&-%p|;fSK$sxYn7%!Li$!rEN3Pt80vJ1{el;V|r&*rk;ILOY2-W<6!vM_Y%)ODuo+}^C2mCbdaO{R~}Sf=UQEO+inBH#R7 z;7v{>;b>30%(nq1R&|jZ?SXhaP&`Z%QQoLd{8ED;y6o8Md6#XRq0riCss3RXx+diPye?ax! 
zGaaO9kL>BVM`7aLaoy88+A6Q^x<_H+qhSM0H{}>T{5&Kd@XzTp|>oqjh>K#;@mV-)y?nB(rN3W5<#ai$3Bwy6M9fiiS(vaGSe1TjS2!*JNvanVNu7 z6Zn`l(Yl<=Mw_x`?v?aQ>6{+YxASQuq{*?`EH}gD%$S4oH#^B$FlQ~q%~{NGl!<-i z#7ifZY>@lNOVeMXKV(SHaDBXq^xR(Av!32pP9bDU6hh|Uk@wT@!Jx3(id61d%zq+ zt!~9N%vmsJ#UqU9Tr|gGj-w!e(LK{W?@_GHIUAzvR7?AtCoY}HIWXtMlaLxrxhPLv R&W$Ngeo^k#LXHTk{{tJrIRpRz literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/__pycache__/gguf.cpython-312.pyc b/model_executor/layers/quantization/__pycache__/gguf.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8a892f048c698ecde6f70e1b12b8a64524d8f3f GIT binary patch literal 27392 zcmdUX32+-%c3?N|qj7@(Nq{7Hi3CMb2PNy4B}>#vQIbX3GJL=gyGelpNU8zqpuw^1 zcrsw(jmdG%lG;&La5A-~VojOeP1Vqq*&0vPR-$Ax6F{UxxWn0zH%`rZw<-&Va^~8q z-S>V24UlTeb~dv|OZ<5MzTbb>|NqfsGEneERKFYkxz#qKGO3%BU)!B7S8=9Z*BKGNK7+$gei4>s8QHR39*q6sm|iY78(0R!2-x zbHE(61T4g_iCCkyfSthFh$HF@I0>walto{lO-fza1vLw{Q~G$|YUJF=m#%ZC1*Z0Lk+=rq<92fU zMq$@*$IuNNU*+#A_yZ@{GEOsa^Wjoh*Baq&?j&0-AJ?a2D_|X-`e!b)63RTS$DQG4 zJ=`ho^c`5J;ipc*@s81m31&aO)!45GHXnUt8>kfW7y``)PZOIuhi^^ zngjXqXyKmW0xQPjnZUCIFRur@+{#rIR%75f650l#?Odf4`aB8U0HGVX=cLdVNa!XA z-OO2}(BL&?J;j}Apm=SdhwFKjW|e`SXM3*DSE#GXXQ(Um|4yub%cnRAep=8S2t^{H z^AXOk5S04|uL-)7!}0LYU?|cJvChMXPjsCg2@S@>qah6Lxi*X`^`}C7D9Xh-UT~cq z3=f9m;ZP(z%CS;7WHttSM`H1zXmDsahF(k-j12Ynb37DbJ~R^J*b_sY2Zjdw!u^7y z;3w6IfsjmB2o~%G=mZzPFvLC)igA9WVCV`Dav>glf>nfz@q+v8;3cv399tR*&2f}= zOUx33`s}5_`KA@ZxMImrfa|HD-q3mMa3sP-I-}<~mJJW~LyR@X#e-M4aQ}sPFcgpT z*!v1L%yYf*AkX!OV=!*TvFR<{4zLTzw`_%T{~JK-a8!V1sepo|14`D$DmfLW#*>~^ zkR7996@98C)km}H*Gxd|Ky06a)xaLs0;~gA3$Pww6=xW5d3Yo+b?e8HblbbHw<5cg+939awHPn0NvmsLGCKoI}#t_H$+0$ zI6k%^zqrZP+|b`Y($_wGjjw^iM;*n1YA>`2Cn4qm=NH$hC7o$po6(go=qj`IkKL=^ zJy(}heW>#-Lp0T!Cpr-(9~mSSfK${p+OOyyWs1c&`)|V-@IHu*VLl&U2QH2T&kggz z;HsvGfvuPthL^fdWi3^FJwRVrsi49h@{N#|Ux)ciYlFcH4gyMAQClH!1Ewz2R`zgh z*gew47Xc_;1q0GNLqz^7u8{#8kyT$wtY%ddc>9%SD832+ze-TUJoa7?R2M>#K0$ZB zFESL0Z+cA75};%ALtQM#-oPpw9^&I18w>$5VdH>~W25!O$!K3I(PpTW%souKwQR~T 
zX}H;WtNTXxoz8bp+&+=E>`d!+K0GkkgVl5Y>23%lvv8Xhl)aHy={(r6{%CkG*c%!S zoexLC@oS@1tD3hmdMh?eoSXX=+rv%3NhVFe;hciqmmfiK9duysl6Bz3`g_TV66^~{ zI9w93(VA5)Se;-8mV&7FIt6V=>PagEjhJi&#=!MJT4M!Z3MkQpgds}Xd4-a71SAB)FXQ*rW9cB~^;aGSu77q>f zay(Wml!aqKGKj(0#qjVj>>`3u5LNRwOkl+sDJ2y6G7KijR65~BDu6d(j0rOU5hI2P zhUgHNpIyI_phSKb7LHpchI8X9ECeeqRD!;PQ9{p*dP-Kq>d5_=A9?ZX)B_F0xF)!p zfdymplBFT_)J)G}TW6-NbHQ?G$=yEh-k7#-TyoT>4$eFLX~w@~a!;!lYPK$z9$Rv( zOU36Mt!bu}gx?#4o4`;^H?VY$Ru*I0k|; zJQ(K@Bk)^YpjoGR10J4b>4uIh9A$?0fy^>qC*1wT0r zt#+j=+tA}e&#sw_Eg6|K!AN&SqLbJ>Y1s3>zTEp z%U~9IXF5_B-m1<)31zcgn6gYNs4S;3s-4-g%Bgdc=ca;_!JHEP?VHH&eZ4JhUzgE0 z{ag*9;_Q)b6J(7Gd<7blPz6p0l9|PAONdGOHC=)}D519E_J*^6#W{^@8gcM49?0LK zk5e=hmY|;3QPdcnpvM)LseegN(KPkCK?OVWyUHsxSycpk19_xq-zXSG5{;~}(w%Z$ zBqa2^`?*2x>M*}|v{`olw(pJrB_4_GZI@D>g4V>a|G)SzNbID|U7792XFSQ5ru!Bw z4FLZ#hC}dMW*ZH(xiU0(4KTJeR0LzcC}M2>aY#p;lUOFA11NCGXr3o!f;(^QN~^la zkPGxxL3>H0`~@Zt1o4pOk?dN5t?{@a*!mc_FlLG(<#3te-GJpJoDC`SA4_AWSj9zH z_`HHu4$`b@OgZTKQmNIDS~I2uT)%e7^hF>N`NPGtLGE05R0)r23FL!yvHF8h!|E`K zx}$vuX9&JCI5yHr0%Kz#XMkCZEp5WWn=5!9G^O!KHd ztP!Wf6_?PALJ0}=C}7<_#hA8aynMyF3J~;2(XU8o{{Y5KZpJ6+?`nRJ8q>k3>IT-B zM{y+M^-`IN0umZi#1W5^kJTp>-==iz4EJuvx5?4O}I#3DQD%w9lX=$cLmP?IvlD209Ps)6+eI^~8y%POm6L2|^OV(cd49 z1fwJH7m5cjbG^aPcENJu#LA?W6{#SF>0oC`dw4(A3zW=>u6zw=pzvw! 
z(?mppA|1ieJrqA09gc8OZVjN1PK`nG>`a+_}mffL&=ZSO+tcQgG?ynk4?dRT&LjB)HxU2 zEg5Y~w!(L->PA)SQl`Q`bMRip#>xG&=g0N;Wr9OVJ!aH@nl9yf?ojCpK%S#sL z8!wH&G`;E8wj0}0n{Mn(U3q)#t+CnDK<&(3`oWd=uiU-#S6Bc1>VoB5T6gZg*0JDj z&S;yn7VE^esU4F$#>dh+?|rRKD)X6p+UnHqg^mN6`U6XhWs#}RF!ia!GpA>nx&9wS z-;dtipV`|z&zwxsIUmKCt{=%&Q3lt|nvA}gq_4>^H7V_kZKi9k=?87^x80>PJCDsX z$02>CnBI-)t1|j^OTLCh-`0$8>ue!P9OBq{jYTMg;-r6&3 z%hY$wUdz<)`(S6L{zTf^om4N`oQt-GjIAN{^h_je>zX?Y&IkMd?C=i{-?MckHCe_q zp_$T6>c+dEVl%1ufto6>OSR6sHzb)Qb8Wie$X)MUbGq*7c{2_UIpS8FVYr{j88!hx zDG?!e!T>JFV{}L?ed37*xM%G+S>8k*v)oY>F!34cwo=3dd zZqpC8y}#}5roY<$=i3*IJ!w^sNTlJy2RcYQG)(9p0`!dx_6mBeBN&D~DJZUNg1^lI z6CWDBm=`R@1Re2(*_dD|r8G7R1`rK|FB6$iZ|Ql9XDFGiUjY{Xh8Fawbdlzeo3TDc zQVWtr#cRx%j#aWM)U%H1=@Md54!almx-mmS6PM2R%hUwTyAtYEAtsnf>2!w{=08?h z0T8Pul+Lf^(~wimIL5&0ktyLh=sblR9i;_Ay6EDah=s`jBEt*b?@HBcV>16PASWojgZ9=7dsp38z$x^9xFH z0wp4rq)8~&?iZ^`7!wveuZuM$oC8R|uKFik5fYMolwIk6w;9vMZv?}DNDx6Ohj2q6T@g1b);d|s< z%9Vn1se+k$NO_QY zeww1l{x2i0Ys?LMtPB^U+!|+r)|B5XELBO3Lasu|W92M0R#8&BE8#BG4z)g&ueEZ_ z11rEoFo-vf6c_mtEz#%{$4%}oGX>EKl*4KSD@DOqX9D`bH>Ane3kQEDUGdiT*aygx7u2i zZpv|za`QeQNg<1!gmp$0W*YDhNa~>m^*^}=lqatG9VgnEjl6cdTmr4*ZSw;(m@ zc;n*u#p%|W(7XM&`{zPGxbXgk1@qygGHWcGcHJ}9ECKCxbo}TWUE^JgOk;*=1Yz2y z1*R=$psbFhA*ZLz<tXeawZvA7Rp#Xg`N2e}9d>mA{F z;J8+@o1!qZ-FFBy%b-#Aoe%Y1^u?jD$j}uZQiOy{BLrT%&rg$T{VtIK)%0+Ku_0b) zN3CJ|O08kLs5NZQ^It<;5#ry3rV9qby#^y;XkipPguI-%Xxt*xi4zKf>te?RqZ9$0 zsFvr%^`Sy0R#06Ib5{f{4p?j?$|FX}v!P@Rwam!IYpsI0W)~LC+12 zL^(bbhqX2u9v18mGmU~O#IiwujYz&<$ELo5&a3GB7COkZ@~@$TOO^j7I;5jESiZpv zVV30vgP6U(~;mq(LOV z4IrH4_&9-G#ZrR9L$P>p7~06O{3KK@C?StPUjZpMdlg09!-6wE|0~)8wEh)A5e~x` zhy{-f3&uRi4qX`(Ood!x(^21xI}pAXiTki4vND!c+me4Da();7V$Fb~VLRrW2DRh$ zkv9_KiO(qjKd+?JJ@hB#Rg2}#neyft+kE-@v{Jut>shZNi(po)>>dw%gPtaHfG8; z&U)v|_9hJVt-?3d>Qu&z+l1|kB?uDds$!rtzSXSn#0Ph<|#*a*#9zRa9yQa%BOm)71 z92P(@PD_fLdGg(Jx6jRnZa+J3-mz%jn=$W&Iv1I$3{y2dFw;0|n`b(pxmYtaQqR0$ z95+rJNLREh=+@^>(G*jaR(W$LY0BVUsBFpTTV@gqkDttJIGO#@c^jlK!vl5ca? 
z>7MqaqR``7`_JsI$%YC3&E}k%vVlg2ayF+O8?x)SXPnLVoz08Rwv4lFCiK=P~Jl-3)dGAdO$7~$SWcuN8u6qfR(!$m&Q|$gj*%VrUp>= zy6Sr;?;q0wnMKQqmGh#e5>Yl1cQM5=>Q%a=?Q%i@5P~GMcZ$TPu1L7aDhzZ$)+!TP z*;WvNJg_wRksZ^cSV7J-3k{9rN&vCCQWT+sx*a(X@+Bl8IFQ_^Z6Y75dPqR7?j{5S zk&=VKo01Jj1$B6k<*xE9VsJVoIze+3*tb2W%%3u+>$7PNVZbqobb zg#}*_&Xom2!^4qlgvbmQ+xn-Fv;)_?0010oW+F{Sxh{%>UmkyX`pOJFZ)r*ETC)1G zM~PbPQx%gHZ+bvpYN?#=zZJa^o!OtMZlCpMs&~x|WU7zcvm8n5j-)L|z^Z1^T9>ib zrMAt|bFKyJJ~-X%WsCO4jJ+|-*d}(TJTraoMs7#uHr*bY8v(53FeJ5riLBn_6+Cj& zmd(?rZ#{kEX#i3e?^!mdb(=qN*M5;ewseO|0$J2Od@&}HW61Km1ek7l#;jDt%D)8F z_*JV#t5`V*T-TawiDk%3SSw{GQN&(%p{QdxlZ^lX_5wlmY8mUUSq|jdM}Bx@!FVFA zIw4{+TmilCCw&D0;c|rFt`%mL3Qh%vjY=>zMB_t^+@6yY4YF!Z&uTaWr^ijmYQd+2 zKRuKnssE(ark5Hy1tM!%Biuv` zP_6Q@Z~?#uhSSz-CclH<2ct0htY}u_6PcT$xu(H@n|eRrAKOVF^pTyPx1-tV1Aa2TGYh4%aAv_Ng#5Jl6-=A*^lcJu@msK-e$C-tPmIBE(jCKfzC&^8BWw7unE8r@NqDg9vT!}d5&5N zA&G#WDCQ811+E*FYz(9fMPs7bUvFL<2z5{tDp8H_xBMzu-5^jb>1FY3ODGGp2Pj|R zi~?1zzNqSFY1rG4AHOj}Tq-|dK;eHv&#L>u{4NY)Sv=iX&CL>VZE5O+ws6j)(dGdn-yrD4Nyfw1 zke_3O88}jb!OTlp%wX?=@{_#IM=-OYcnDW{JOoznU~LS{HqQd%3G$pg1e)VvJ`KI( z{}P=ap@Ziwm_^|^3wEcDm4y=X6v3D`S4TCjU=s@~4KjncV~FR1c^;i`)^MC-MPTbS z5)~?Od&pk};1L@NcFFRctYW-7z~dkn4-Y{J_L_ov1Z=>z`x$XB;ub3D$&bjvd71Qo?-IfL2)~wF7sPkoX zzEs7`;RW56PuxC~X&oOwep8wDZk{ji`2N-(?0SFK0&_T9S@S@xEH@;X2RbMLxYqkk z>$5e!Y-1DD_?cdf`Mbm}}Z z{kL&3J`d%?QRMb6y8Riqf6=`)pWT<*rbIloD^Gw-?x($z55lQIWS}Ou) zUv^a}AzWG&(D?i}rYgb03OQBtReDTX0W6KGVo))EEulV0y`kwsA#trXACi)FUaQpgiOb9mlBRsf1i$sgVAid`p3UloSdjr<7DyVGWiVP653M z%H@Y`EXQ*Ig>Jd05&F|1n0l0l9K7W8Puujgb8yE zAjQAxV|9g*9EF%MRKKpBR@(Vv)u@P5Vf{K#Kf9q6qK>!rDM8K3p#SwC zsL5nk+d$K*=76`*Z=e{8Nj4NgfUDP;{eNU zDC8~vvP{BOOmj_pO^X-Zdlh4vE0o_fy6XTr%zPqV@`)!p`0|(cBr4o@f`jWv`73fF z;%g`L%4qFDAUS*z$>F;Yiut19s~qbSFQ4r6tqLX?ld$ubth{}$S| zjO$J`7_Qp)UiHlW>Cmmd8-1zMx5DpL|8g1o_Nu3f{!MilI*2r6H~;tG@;Pw+`Mde= zpqu<>SpGkt69VTq)gAr;*6`yRYk&=kj1LYAwI_cXNdFL zOGsX96JKl$wgYZMz*Z!!5sZ04BNk>kxIqCLGq4vGJ^2`t_8+m7_b{&|3~U12%92i;KU6Wa0lS#ErIomHqZ_aHjqo9JpU)cTcrV 
zwk5S$clDxsea5{WC=*BJqN6_Js88y^us-EptlgHW-Ilg&OMY$MxP5NZym9wOtvlzP zyKZ-<4o>t;1>OwIbSFF0&Rt6m*Tl%1J5#oAK29i=?;Ok8J7zm)x23JSfPQH-FC+aj z@i?eu$|`O;r@3!?=ADhn&QB_lD3GX6d3hCP{@z4JBu(v0_S+%*%* zY+d_IEK}DpVOc8oWIU&4zLwtGJ@+y!%X!bK`SMc}>a5E%JvbB0RPR}=ej-!-#0Q(^ zt2;BUPRQ$SObun+k1e`)W!$^w_RqWbPN=dQ_d@oKCnt>4ovHfiqp6V@Rca*dZOvHQ zvi|m2^WjiTyyk$2vhbl?1PR{dU%~XRMH$LcgM4>bQD z(1YLu!*E%CV+^DdK3YVosoWGy}#$ zhl{^Ox)J0hYLK_+f${(o9Hcx5g(K07#XJRi0^|ueYVZ{(9x&&Ls;DA$DJSnpXjtVP zNs2MX3~rDgg)#LSeFU5N)gMSpB)*FDLuwgPc(2$JjI7C$BxwQ}5CcX>>TZF)1iX+A zmVrQn8vHUj-NBwh6}lRZ0l!aB9zneXZ6T|~|0i$+4aoQrP5mKy42Z(Q{UbvoF%j`9 z0rl~OGy4w2`%NWC^cgHd0~LxG9V@_V;IWZNoM1&>iE;2~5Cn_ofyn1gLc$n!x`_Dz z68SYXQ8v=<+nGy`SRS= z*OwFdn zngf{{jI22_0X&^+$~0+Oa;^ha^7OXU))_}?$4vjs<(d9;{nm`^Uu9h6h&4Q9k9L^iLm}u`Srwj~}=%HOS*z^t5F>ZHt~A8P5)65F1jb zXSATBx9H* ziOwl>$mw_#q2uWM44wam4jwf8x6rZSL?J^U)`H9kzpvoDh=cwi_+FzvQ>fIs97C)9 zsQ_Ng^XD`h)Q{f>rO(yztEl+Nslg9Am-=-%EkZh~wlQ_)EqzXpJ_A)&o--oEP_Bxc z2_ZA(_D)@#ytr&Z-+5ZE-Z;DUd)wdJ{=LWFdwiJ!Z_edY@0~3}J^S}+-m6)rz?)RzU@z7rY1lmg<1|ZBk5Y5yAL~e%y&c7~Fyl7!1 zqI~J9x1{~~d&EAKkl+{O3W(lJp4FIkEa<9av_VF+4-;* zLcOH@`8I%}s@Q%|yMn?-!y0ZX3u&SLlK3ytc4I=S2l@wg_>UYtLrLj|FEM&>^QG7_ zAhzU2k6EDv1EeT^zmd^1mbBma1)~S#>=(A(v`XP+?j~l+B9BcdJ0i>I0W*LF=oXL1 zv#5VrM2PUS#w(jTg88dU{NF$@kcUecOQ^Y@0wh>Vl8A~b!WES16T5RFh=2%%Dm|2uU48l7?st3U@0sCW-J@=7B@vy6I?B`zX} zm1?4Y#LVZw0m{%$RRBGe_8$QP@9z4v`>7=JiEHEB$bzdgX(XB<_bScMXGRUGYV@G0 zaiXfmlU}!dLARr*s;QbaF6efD;%UgMf~M*3l)deF%L5kBb7z5R^%}s85{(l<3FOT- zG|$=YvKi(KXosLk)D5L8{OR?37Ib^1GSjsk>8+g$x(h+~Ke(87KM4&j+c3v1 zlpRbONJCv;(9p_sKs+^60KoG_;_D#d`vZA}|1O9r6}1VKXyYtdN#~#G z0LBB}DI0BGIm>IO+ya;@|ELTa`Q#sEK%N8?!=R)jTAu!JJjVYKh8e*Qe29;dXVe5M z+>iyO@esTz#ez}tFqj~d+kPZSuoe#yF_B>Jj|`m;MS{hMqFFMCxCD3pB{r-VO|0>T zA+bSVV!cCrAw(!6?P7y5cus)?g1J7}lDpwtf@Juz30SL>j6vk~1+Qp*mG4bnUpzDj zk|r>DAU4-vxIBoj%LTdN$a%q9PKes8Wvb=2)C@rp(>n} zE6f3L*{u4|`~i$0RKV10fvpF{LBa%T1lfBmplr#XaYVpSHU>A9iWnYBqUFv=2$eY* z6J-f}=>hw_>XRevAtg%jR(-N5t?%f>WI)_$x<}V{p6Yr6ZZE_O*B3zAkI$>UxMJzQ z=)1 
z6KPsxsl~_AibodN&N~oqAvt`V`mCK|+)MVx^tvS$8eM5MI%k zt=mR`s>O4jTD#NyvcyrN*UJ;SsU zZ*7LDU1ZuaOdDPXu<5Oa90i9aIUD?zL*3nc5<1r1EofsyeQ}WB_k$O{+~eJc$V!GJ z-G2Al@(ZN9*A`+N$Jn2tgS(fn!ihoo1qernh9W#7tGr%-Jo3i>8Uu)I<1|7~f)j&0 z)IM>Fp1?qKeg=WD&%l9qi}E54joN?T<;$tz7fu(m_j4^m*4^q(dWFCN10VrNGqeQM zA*d&yfof^X841V~gd8T~P0-3!PYr?pzTP_>&*GPe+wkO(iB1IX(LD+E&hp z5Qt(bb0&n$l&2ui8?3AZErMl6QbD$55aD2Hkgj^K7mee)bmD-YV zHs#zH1kG-lIrvsht^$3Pl)EzLLC8z_8go?$RYS+#_P*tvRb^_oVa_)gn}f z0-}0^8n9K32(3d^LFvmVVdV8jPM@GNn=WUQkq-t;t^unL?71+V$I}o($kx-ER$Q>zi2MRtX zDtDZK-KcDMAFy2@wb8L(r4G2@4ALI*4CyruTyWSAO&)zgdQI&&BS9#lTn|*YLX3b1 zIw2cSKDm}gVoX_Nm}GSZKmrt)q{2N8C4Uu46zD_z2k=`EEEHRUu?4|`Pu$^{P<$Uz z8jlm?HMn-knEfS0$6iPWCOtOSly}mb)GS%7;-eSDFD{& z<{GaW$#ofCj#9`eMnr>gFxEl21o`5I0z@b!G$fayX294q&lLmJ0dIBS0vFjpt*8rl z76Zs(irkps6p2k@4=Ig~c&sAd2Arl3Gm!Bnx&)H{`w+Iu;G77TM>iocvwGAYX5*GkZ(P0g@{O1A ziMw?mzux^GdF^i92c3V`^}{aElB(C`bXv8cDB8|Z;Qb)}0Dr`aunwFg>BHrP-|}PN z6tU1*ic<#Q!C##9+bavIqkmAu9$B{wNz}Km)v+DYZth0Q&?3VY2 zH{)Elpjs#Pk)$R{j28t7K22$@2`P!h0%hD?aT$;}v~FyS3o%p}?6h;j?0$>9nu+~ehOYl-)8jpSah zcx9HyeI~5`Qa61??v*F>3xnBOzEn1114$@j9_Lc1eMBx`8PT=@wVJ#Y9UJCodPI)= zQ&eQ?$QzMD$CpgqejXnDMso&W$$1A>hN{a)X1|QjbrbUje6PFYwh;1^LWNw#z$`~% zqWQ9m#FKz_GGRe667BHBReK8|qU1sxAuiL$v1sL?j?Lh}+rFwNz$F%>g6i~^;6~zZ z0(UEcw+0d3PT=hTJJ0m&?>@Ny$%89i3R9il96S!`I*7Z4xLb+)7`V*wlBd7~_0iLt zf+xhEGlKRgWIcE8jG!R^yj+IAB)o%!gTDj(5WOWRCfgG1?&3M@&k#BT=tR*W=!j_U zuAw)9&L4s!nD@hr=i&1p-ykmt2x_qN2C>2WnBY&)>A>YrDCIvz?_G3uVc0%|onOVMrsH+yuSLn z=7FY(cBXCh4=DJ}^*}u*=u|zt&yWSB?r>!5Usr%-E2WXk#U&W39{<%w=PHd(AqQlg!qW*H1soO#QXsf!L uR{m(G;>2#{k9H&cW0ktws{FBD(QQ=z*hpZ9y1PdC<8noJrSiv>2>&m~35QDn literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/__pycache__/gptq.cpython-312.pyc b/model_executor/layers/quantization/__pycache__/gptq.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..56500fdb7b20d7302b2c2b9f5481898ebca0af45 GIT binary patch literal 15138 zcmch8Yj7J!df*J+j{)%_K=3^jpAbnSHWrHqX-( 
zb(`WSj!sZveCrZak`B`}=Jg3(QXkfnydhyo8pFnq(hO zG$b3tjU?|#G$otE&B>NS_{ulI`Jkl6NLLlAYmBn&ORu188$4 zx{}@DZc^t?^dy7fAWfxo{N8Z4j^aIh?}xAg_=Nj()ESENyhU+dKJcNw1nno#T7dd^ zx(p2ws2`wpyuS<`n4^P%FR5r|f_m9;>DA{?MNU3@>g2hz z<7Z_3GpRY*_+knxY1tIP+6l;6pN|UBBroy;6zq}MY(`8cBk7q87UQY77)hk3rbw;j z^lXObE~HOEmFs*w#Ycq;ym%$eorq@mpicI@FdI#Y@%boHa59~mj8Bzed=(}w*G=)z zFX2J}1nFxc7ESPF0_DINAw4^zRpWa;8oSDKC)0`9WUA7@ ztAMYaKYyVDbX`opz6}D?X+?rE(Mec`R3fD?EsOmkFN$I)TEmBN2q6Q}15 zoB{qDc_U|n|7PCAEB_6=ao$W(le&87NJ%*hZ|1B#qzL15SkKu2X6JO1o<(Ys=9sq} zVJm0n&67IL@s=TMjK?{v@LdGo0KK@XpzcyH zgc_)GSJio-&O_$nEzxf0s7V9ot-`70YJq~iRYRAu(|u3EfJb1Sd>!Yn%ze_t)xjJC zTr(GdcRkm_)dN-o{3mMz<2Alz3^y!N*XeNM1Ql$RZD*dp^n$Wi=g%u|HkOEHGVEkp zV6ie}v$618=jX*$s8OQhzi%^*FtGwY9M+!GcYtXxGyv~Iy^K?@W*BQ z>truPF2}`;Y@5QZ8_C4ydD+78nOFoERdy$mkt=+ZQPJwo?hBgA8naYHry^u&D3zoYMP+qRGL-A|P*P<* zl{G-gq}GW}HOD~@(`C2{g%L|*Kr4$1)yCv>*Jz<$RJjRG4sEcjFVBkXR9a-`O)T!YkPyV)C#!lFukh>$>2_*Hd@VA?CzI?|UI1}h z?gl%*JUcZt7h<1J@F3XP%z;0{uRkuC$jgn~}l2t@IMY>B}B zgmWVzurR*Rgh?|dC|iV9OmOb9eFkK9WD@uwEy%W)qKR4llpsLfq?|Z1^LkWB#Zyy} zG_akp2OG9w(hf<`CL1z*Vp6kvwv0*|A--&lLWMW&zGuJGvwywufP!jw=IxD=y>a!hWFL6wX)5Yy_dsD_ zI6rVq8aS2<9lviX>S_10^rl|#ttnD^M@^BT7?;v&<+TTP_95fRGp!QSifxb6TYAK? 
zQT=D>2YuteK7r!|izaH%Fd+kKF!zb2WfP`C#Y&0|<|$-$+~J%50}@d9ko6$9^8p40 z0Ww`;{OVe$L8dN?mP#Em^{(s=qH4M z5t$AK&AadV@`ex60@-T#WmsVtQe;elpAjOF9UD^t3V@RNLr88=k7_zsxqR2tQrFYD z&e0-8?{6qHw7fU)&OokyFBE$Ng+SALj&~fl-h@(jTcNEp-!>|>jpo?>060)rsB65P zUC!Q0L8-H^(6=w&cR}jAkn0@>z_;ja4errJhsklMsJB>c!U&ANZ8Jex*-W?_Zd0K3 z>c(XQa@hye2ehn@B{G@nDWV<+=O@2yiYjmc`pDoJ1ygkTo*WGKBH6ho$Y7O&QEQi0 z)oSe2DcuCnHx7EZ5ky7oifp+&nMg;)!KY=zl_{{&CiYR+wRzM&fc7*_sN+8<`c{M#bE#n2Qk@6oCn8E zg4Pj>&O|TA6LE2FzIg{(+pGT=y2*e#3Iu`FaCx320q0H@8+jJR#1st-ryHNIEeTXU z5+gQ5XgZTlY137UbSYSo$#{atFQ1uj-9gz-g#U=N*$HirTwOV5ch1sH=#xzfYOGo) zq2M5N&H%-?;nbf-%6B|j8tIQyAL&#JZ9$jS-O$CU6rI&Uo|?u!c0ABZKSis$&jJcP zZ7f3-779RZEMpZG3PWu=O;a>yuA&o#qc)}mFs)UXC@i%xZGc%lchG-!!L;w_)2f3N z&FVp4G=dJL%bK&6to6f^7>E79j1w&aOe-_TPex}GqHI9rrlL{E=8~X2O|%1yYhC-p`zLaqu656uMROt0y5d~kv-a|$`H{aq@9&oU z-D{@1{=1fS|A9q&!57H;x+EXAW7ghS_Z?ld6}?;(%qsow*2`nydT)WLD=_{7Q(It~3k(z??SXXDZFSj- zl(i)BWg}6gWYdhmPsXp~XTt@ob} ze-rqa7SP9poTi~F08~Rw;J_N^Oz#>O3|akjS;vP#OHbCIS_ji*J4dYpD+%~trDB2- z-D(Z&dZ-aSYWc&`Ap(0ItOcA5?D?7tY5wK$X9k}>NhG0g0Kk_{gO`LMGP2|~5zYgc zFodkn%q4{j*w}^+i+Cy{Mp1c29Y?k%(|qLhRCMrZ;bm-5(ac}KYQiep`gI}JB}jrs zqD%-E5io%X=>VCAoTo&SCQ63z9e~X@R_%Xex9VQexXkd7fc|Tu82>H)-B&j3UH`-k z7A;g;H)t05K#vsYDYWc)Fa1s$REa_(``B*s*DRhc)=;&3^4@;Q+rKus?tLonJuP`p z-=F<86kg0RFMjUqS{2t`ynA}%@cDJ;g`DMra_*G(0}P9MhU}aF02w0B;l1MM6NN6k^fF(?LgL~_VBa0w;mhzkiosGG^|E0bu!qiqpM5|2>^DP>C{ z#jUics-ADETG%IQQQyuCKyrgBpheIp+52+#{e^wc8Gfe?z?bc!wIP$qrMC6ffo(S`nOF6>TQPsmHEeu}IW{a#rkctdTV&lyz7!x<6#Kk42+WS|yo zIHFkb)!4N6Zw$N~%F4bD)RMeq!TdTEG=a-mxD5TtMxvt1bVzs!iUMxlAfp_V=thwZ z=r9$IW7b?!e}!i%rQspLk0oPV{5^4Z83 z`m4d8m6-wMGiX=DQ4|_0gqSn4mvJoFfL+NovnfU7C@QpUQT@2GjW~9Ri2%4M@RM;! 
zK}nt}7Cq4o74JYru*u$JbnY@AnVkWDu&SEQw^Z??M*Xj#@62x?Q91H~z3-v3ex)Pd za7b!6^jDWQoF~AP?&!(4k3DD~+cFxS@_g1lwqpJ}wtL0!rIGU0KVrL!_%5`MeTmh# z3>&UNXyfpH-q8A9|2zF3`F_#zLCatH{<`@un{)MN7Hb}QhwdKywPnNm!lDHZ>dyZ6 z$M1}9bc}2njeBgLb&N47S;j(_1gIcWNytw&Fenu2;^9M)o-q#y&4T?O2iikxgh0~N(jBqG})5yd!$JQKn5 zBT=d`MnJAG{PG`7UayBvryi;q7`#C zs);SyF~?9I|E<`{$f|yIVr}G3c-1L2?#uc2N$$a-0}*Pdrq-epb1tg8cWwAi{B8%B z@I^O*JXndN6>BM5&2M~|JE5~0Z7c1Y6ugRl%H#&$kI|p2@B5O1_oiNj0AWg9-&*Y& z|BI_1T$O?c5vQ+=)2HAVXrpI!7^^&+Mr?$Da`T?mOKa_SUd8I3a?74lb%W8iTDuxs z8(zI~r=>_i@$UIA@m0cvTyekKI8GOQ4Yyxee&u##Ia1UUKxk0OJhIm0oTHL;|Gg2( zdTi5(m784#W6fix)_80!R;1u{Z{q&QUxt5+S#t1^Zxn-XRDMv5;j@V@kxHBfVt9J3 zvqY8cVdK|gn7$Unj2;e|Idwgeh$f>GBFLKYphcJkN`z~eybg(M;33{88!$p3n_e7$ zapKgN?09kf?D*MBXP-HLHVj4M#Z%9Wy{eeaBiL{llkcm^-$5pW{^7T%g2TUf>E^3T zum14cH%=F7n{SQXKD&JOC+BWlC@{Xov72X?&i?S+_f9E`g&k~LEcJlgJWP~=>Yx^< z<1s{}hfs_Wf?B5OvP&E~DLvDuWA1n$_+^FQmyIV!9E8BE5a_Z(pvyFk$Hk6klHqJH z0`q%~VCNVJ24O(V0bw`y@71;j=s?wZ$HO^yfiom5XV**Y3O;X8t8P%D0R6Tg&e~59bo6y9!5|U>D2da#i6i5h)-~ zwhUdcW+9-b_2sn8TrEHNF5{LBUJ_Q**cn=LfKVIKU$+ZI29^HGd%>2{Wep29u`D;V zq{V_AoJY2KpkjgHs09aPE!xvWR(k(c| zI<;0rorgTr4Qj6P-gRGS7%DjHtW`wChCEtw=_8Zsk_2-C0^z&NQKR>)8YRn{-RhqpD$F~-*bIiN?tef;wn;H^P4oY@-HM1|10HjXRn`dS=!w)SgrJXvoQ zjy+39%G$)7V&KC50`sa7=1O#i55egSEn?9Wdzpt9-lSqL&p}i@%_cDH%qlmJSUh492?)sJ zAffqL9921k{yz@;#lOP?HVrY6UV@JnG=Y%?_n>5}ayD}Dq--l4jU9u)YNM~eI2Ct1}i0&J#6VL~xC?ez*LI{drE6yf`e#}`xljeme1nCsz_)id<+*J5u z%%Nyk92iDEIU~XqIxsuIZ-KEk*`$0k*(gL)a77ONK7tn$JTPRBz)yioLQy%utc9HH z;1$I_0)APM5BjRysz1VpKgQ%HCbuwIhD7%8*F_;3DQ(H9C<+;}hH`zCZn9mqQKBP% zj;+09F(Og8Mue-Q+&~KK0G2URghKSNmU2M2d2c=tk^-T%H`fEl799n9 zI|isnrLNI@*9i&!>?iI=AK1^3yAL`ySeFL8=U{NuY}u&Jo?Oq-^^RjXOIyL-u`1@f z_e>P`Gx)$6e8~3Y*KYZ8d~xV zy;4JOzF|~q7%en(Z~5$A+oE}^fpWKDmrct};FND^&$S=gXg<7Y)Vn>OH6LD~!B6b- zuhdr0dA;_jLi6Frh_Yp&>YB)ab`uO(g9Ad}UYpytXxs`=V6dNCK34$pUBR_W`TnC) z|IvK^a}xZyo~zL6Lt0yUA3Mx-wnh7vXIs01e-BKjf}(_~Toluhv-dvq4M@I=_Xcym zi@D=3Kz8x;Lm%8!F#2l>jZOI1vM1j%AhitSTMkJrhaQ`u2AwP|kFAhfI=$th{GlSH 
zBlv6dV?E~jkCOM|X;>0}ut-4&qR;@*2(1pSPzlsShowk`3vTx0Ev#f=SL@f#Y*

*J)4IB+sq1g=tE3Bo)>Q5l&Uy?d?0CLG zUuDX-^`)V`euKVn)yM~}lS5O{fbhU2r>%0D;+b8gh%8}|Xd}lV zC!6skhHIa)4)m2dtXsw8E+#*KMApN_eVHNtm&}#f$;kvyqP(iR1sx*Q?iRUuMo7o_ zOh!GKBhg7zSWxeUi!O0pZQK z^Y#|W-tzOl{GP+op2O?*!v&{z+bP=K^Zt=LNACJQZv18Adi#-l`*ErL_^*6_TmLup z>+R=m8E@N{?Vq^V&jIhyokNA%0Nk3g-m%{8{J8g*z4@UrX=rRquj}wAm>Vt@PJ;FU zh4jT!8%*m%#+hfDB&KQQ#rMAZ&Ue>N{o>pQ=kC?!4?HIwcrJh7CF#IRpD-^M-4t^~ ziL%ny@%~CQ6D!Lx_#`O3P)2V4UsyZvz3EV8-TA-|Oy~x*k+iI@oybt;Xmm47(({l) zxKuY?mUb0nxSOdx7HO-V1`zHn`<%3-HeB7&qza%Kav7i+vKF9aFHeO&fNDtI%B2Y{ zDXaJ-wLHbZ-x^X^%Lk)gKw2tZ25~`3{|)M7D`8x`m8}Q}qFy$n>hh~A!s_7rqjyHvhVC3% z_YExCKxy>&ZZ|K(l_0nwfb-XeC7(h79c_%TWIMqJv~X11I|UrjryeLzwIp`tDO7@BiigkB|NGSdqf= zeQ}e#iy^8r_}E%!3~n13$}lc`k;6-ei)JiYC?G7clvueW)wC6{0cgB7=gp?2rY$>^ zl=Aa+=Bpydoo@tPuzw*|8i_enlXlTVCEvAj%CF_5OZHiS~yk(H8^Z zW?lg(cct3CqCtZEa7}Pv$HQNNSMmgrpTmVN!%rznS5*a+bmT&ty#JdvpB3YYOo+$_ zk`+)c%FSBMD~m%M5~!RiUWr=F5ywRfX$M|~R0e-31ixa4C<7>gA>~#^BSBK(;FBW1 zEL0GMIvl=RYxvTtR!^Qp-epz`n|zW?NSPBJSV9()SxoRMlfsAJ#~k6I?R+I$4lCCL zjtUZ>fN-KRC>!C&4m7=Kp=jn)%J(VN@()y>MD_h61;vw}Qa%4fwSS6#$3CTw{#K`_ zb-%IbXx(p3Cdg5k)KK&}`crD?Grj9m!@0jV9DUT#oHNvYX05&T%DT1XhH1-`)X~Rs j?)EJTGh2UP)YE}mBU==_wwNnAI+&~L{tbm0!WI7yTu%>= literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/__pycache__/gptq_bitblas.cpython-312.pyc b/model_executor/layers/quantization/__pycache__/gptq_bitblas.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..23d313af845fefadd9e90af8ad3981a2d71d333e GIT binary patch literal 17169 zcmbVzS#TUjmRMEa_X#u_jRsI4&L)8dc!?)?iQp+hln801*6fs;T?MegKA@@wN%S;E zlvZnl_UuxO-8G>djt6qgc%T&>1KDE_#~Y5N{V|>h|8xnoyH)1coLJdm{WS&1+Fgf_ z_p-YB02&;PGk~l-UcP*p`7-mpm#_bW-EN^EBy|5U{^c=>`fJRnL0?X+F*HTpr8tVC z6I7Cp&`C{1Lt|c(&_=ZI)h2XFeMC>nbO}S!7%`HxK4D6lBW98|BrHj5#G154Y)N~> zPRfi4N75N_CS4I1$(s`Hq$lD@dLv$vHz%5s%@H3-TOi#M@sqR_(yfu!WFQhqwnf^K z!AOvl+Y;@`jz|Yd+Y_BhCc-4UB3;RCk!{KDNO!U)(nIPTiQZ&iq>rSXiT>n3WPqkL z6mRB3AH!HNM7C?F(-h}=kK)|C^J8tfbO$N*K&h9vRZ0ivG@+(15G16LwZ~I)vgyT{ 
zOgx>6CdNawY`n$`BIF=xz8DpvNj}31kg>*7@eG?tPfbBqD_dTOr}(IFA}aE-OG#hg zGgs1_QX=vh_BtP*x{_g|nT#MpSDu$6MEYUwnCD>?Aw>$^{lmFF|ed=GsfI zUkK^smJ4Uc&t6afN%L37UcPkp#c_Gt<#^_DA}X>1Kf^~etQd6;hm?nMn&7o|FABdGX1)%{4%Z17|al;1e%gIKxM|kX{Zveh8JTB`zW!aoLT!nz-xaWb{0=XXpV|#I69)`b<<==);6Fh zD7Dp>V2;z=(?pECY1*^7E@I)d5i6&Q*nqF?ya~8fqd-L*oIc{@47`go!oP`kb7uIr z@E*>ZH3C)}5I(wyTxTg1!RdC#PVbH1mGH1W-l^Fhw_p6=sv$B`CN>IYhO zZKkCaiU8hS10NvpE-3ZXl(v4CYG+*d|<_Y+k@GI-% zsm$(OvJO7`W%FCgp#XcwT0SpeSBbJ^3fZ3(<5^xdaln&oG?tMKagi10lCq7oV&Q0l z17}=zCz9+H*ivj2nL*$)vqCBcEK&*BJsyo~GZ6n7eU~!BCBab%YTlSBADI+2jl^yU zIpefpLlI=v!}r;3GuSH$D*RPoZNZbVs^y%f3iEyH2O1UbUD{0Lw1P3G;S5&?ppAwz z&TBL3hNNy;bJQPo0Uc#TG?k-s6l+%T<}{G2Y|oq)a+N-FI>=QvX-*G0d!?NLa+O|k zRHF@>=}@7s(E^jHwB<~V;lEFD=4u`E~gw>qhqn1>^FgNa+`FG}M7`ZE_15VPZRJ;p((WtG07rTMp z+Eko#R5z8W@J$aTbj+J_rg?KsFWQ_LdePPLEvK)pXwLkuhNI?9-!)yQuTwX)uT$6Q zkb7K!X`r&Z%C{y{mNS^R$XplFsi`n?F~Ng4!NgNy1};jb40vQT`_hCk7f($wnKUyC z0GTViTFG3-%P_?aGM8sFOe{T{;Fwf8!^D$#BVUhat`O+SbRvx zANX&c&e7AgkOaJXqKNeOnsW3%qVLi)^}SXdHAnq3?REMinuv*Uk&92oGa#*L!Xq*r z6LEDJ=I`DZ&Ln4sVUmV2QDKVD4BeRBcVPe3ef)4DetCFi4vy^9@U=uDIh;&$e1hd~ z@UhuUS{P2CSSJpb&u}6o4o}Tw-cp6B@XVZSK0L*z_!~3Ak?hc>t0>H}L`G%9hZAX# zXT&4n3ix+{I1yR=e^7sOlgfL~OPz-nyB@stS98CZ`#0bD)8ml+TM>`ycbhxuMPnzO z?RzCvSp=q*DBv=bT@NA)ilbS>AOo{L8VbtBGQ$d}i^%$!r~tCDm1WC}&a#3VE39V; zZ;l}&%9dB7iCJDr>!CkhkWK0^gdPAFdNJz52+t=0E&@ux(@E~c2|zh_3+Y!5YykiQ z@W#4{7hgVpdW=0e@%qIv*)GCzM5lO`Lvc{n!bL6Xrql71+(IxufBxbGd#SF8bq2N} zI|+9*j7guunUO7#v6o-G#9nyu<*8a$msfL>Z9;U)!h_Ym&PADpe zBF$7-3B@f@1FQ{PQ}VdYga6qU_}uBcJ%4L{@%IYtBU1aws&nVhPXGMECl?;O`q#9Y zy^b#_jmxoapjx(-G<5U8ai7D{ zU{ftLwNM?~3hjrb_QOa`;PaM1(dM{)`qpX5dwwxf=sfgk=b=^a`BmF_m?FPr!3HgY z1Ive`w&#>FHsjzdFp9R$9}a&oj01jRYgzQ=t<19Sr}mHRl65bkX#az@2lj`r{_51P z_UBy_fDK8|00|~+I`qNNa@)#eeqdAz99hu)k@IiXlQiXO`;s!62aEj!d0P))fpu{Y z6^F+1gD?}C({$+b(6Ri$@rUk`QR_Yl@1YCv3TB#Pp- z3pEYIQK$hiZn`3FR#8HY;RjZb-;Lwhj!mcLK+Ux@nzauyRmpoaYaL`%;d?Y|3Mm)P zXx2HXoQP!!d{m}GA^nzH_aj<32h@mo(=cS);TZ?&wZO8C_nuOB8ymd_(M{@!V{mz* 
zFf=L+jphfB052X56ayW|DEZdyklo=g`h!2Te_((AZOHZeioU?zt9P!x|ND^Z4iRgWB35_>s)Yg@@Xfa{P1v?xG-?c`uYm$(a~j98Apl*5DVj6hH>q~Zd7}nK z3QDaOlvjPukTZTTfuYOD@srWn1d6S8T(C(Y zon)`_bE533%d%-9WD@Z35jc!`FxpBKBZ8wUS||scl^|BuF5AAD32LgB0kw!b2iD%# zTJZHtzW!yyif_fV>N~JtEjIfL&3#gH-_o@e+wyl;n~yA5ie6vA+aq~ww@L18OV2HjE$>@(?_MwzTLOia0jXtR*|!qT?>xTRa$><&40aZRBT{f= z#rGiepzT+DJ~*-(oXDG6KXZDDaOVb=UR%DllFN_13b(7v0ym`DqU@R4g~F4JFyRO+ z^43hK>Y(#luwziOU`i5gV5tD}A=)$`2ZngaX)rgh1AE6SIh}Bc(>7!ZY8rdp5LEjI zFw!$>D;q(dsa&4Z6~ozMx5be@yR%}uT^(Ou>k`* zS0WyZXA*Ou?xfMo7egAb4(Z`HH&N9#ViKH;itvg6YV#xyrso)s#%~ZNYH5M9GdTlV z9}HVIpjkSa5nhMckqye50~!n`(;)-dl(Oy0q+;!3W0O;|1*`&sl9#P8T0Dnw4rMQN zaxI#Ob8HQSHYrYt$OOukRW@ue4Gn}i$E!v}qJWkKoNRdW4t^H=&j7jjAw-}wm?+qU zzCDs}&&sz}eJ2*IAmiFw7JKrx4p0>AgGFcS;_H&L?@KLYoYMhd(;}@sS6+|LX8B4*%tm)t+-DJ!N;Tn<%^Y_W4`q7qyRU zK{y@m9=>zH#A^4y4u(hezW@mkS1Lz+j?PprD|Oe2!Mb{EOM+T2w7_1g>`S#4j$CV| zvPW|ji|Zw@L@45oUbe(xGl0Dam`cEVQ9$v7D9FNhAQjRQ*;AOq(kw(G(hLi)tU5XL z6)jXTCTF*8UTM4-YGEWq6Ncy}1zMiX_db6w^2j>y7*ezMQjcsyD?No>XFlC^=5z1% z<9oda)iQnV^AeIT5bcC`}s3fjlWg{7hZyPwep;oh<&W84HH#9 z+q-#i4ZvhXg$7_0RJ{e~u;d)hn}!uuDVyFKNF|jE;m?%|p`mGsqbnDJrm^vfs~EuH z%4m#3Tfw2*0-|F2oYz0wJg;6JB0ZQks@mhXV3W)n=8ZW6XUM|b<&4OlXljM6;H<){DYe~IXh(WRV-g9Cw+={3 zU^FJiUG06Ny3s&ZGK~w9&`yzNrZCk9(Wx)c*r{@ki}c)e#4*u&iuh(egH%z_g9Uvn)}rF|sU|j)5{Bgjtb|vta7ma~WRc zFtPvg&RzR!*4tQC9c8O>4iRIa@H{jWj*`~E3Ha78+{B_&7#+uQ-Q{#T0pd8+9LB|)2plMFI>vf5^n3475v=Y9J#e|V!*hI(0V;KRDZy|xv4J_28QnKL!8JvK( zihzb**)$`h!IYYr6W+lx8*L6avr~T%+lI|?6Kl-HEU(#a&N+D?^NfYI5DVa&qLb;nuR+6-V@UVFD z&UDF6vJNW9l$<2#qAZTD+$8y`#;CU}cCS(JDS0S^8_Zk!j(qFjmlS;0v}y@-r_>ED zH!br&z53BrDRdBF1}iXw6&TObZmjZDsyvmdfu)J%?hjwb>VZl%+Ak5iS#MctT8b_2 zUb^yOM~Q-Le$VMI@mmIkRLMs`1Nx>X=HOyRGIxKehfEm&QlYb+eyDx0 z|C7|06l6(f>l#RrdyOnSvRNglFQ98zfD@{253hiA039D9dT>=RsE0B*HW+vl9=5!h z(;%C3THeCxI6XZ28{j$K3hoQGX}sq*glXJ)8v?jFIKj=q#XAurI6F8wEBxC4!?q2e zSdYCpI6L&>`JH-EsB=I&FT8~RZf%`F3utT`1LxYJTu+=ebd75#P2g^!k2KHeLtY}a zWv?n?C!-t?o4g9b77Jbm$Au~JAW30P5z!REY8;sEQOl}dXwjHs`l>K$in!WX 
zYIc8vOX_~b7nak4Umn`L8_jtG>^a+ghiY}6cV%{{z5X`(e=9xZGYV+2-LLrX&bu>1 z->?TKxEUF0#$~$iQn_>9lNnaA?CmQ9C&}m>0Hx~=s%`mGdFDF|m^PG%Cs3!Oo2j~Hyl zvS^3`A&P*Lv#A&g8vqMJ2Uvo$fD!z-7{xyn(hRDV01*}DP%O+u0WvjJFICp|4)$;2 z5k5%J5WkiRFnYfNC1|}y>v)xNG-k(dFiCiMilZT&(At>ldZ8A_Q5A>k1t>+xMahUs zPZD!Kfn+N3lp^0E3Ji^C~Um2gW;9N8lWzeHYofFaof5RX$1MrGbejrIRm+^H4UdF>^vj$LFS;v=&^KgxB zfjt~=%qe8zt1GQ|Q$Sz+Zz;+C&|n2GqEWa9>XigkR~ZRP2}AYQ5fhDz{03U8USkKD zvyN5fwWn?{{oB{jdNi0BlT0j*-Ksi!C#Y;f_LqMfKXn{!f2>cY7HWmF>`AwBLH z{~D>VDty3#N3bZn)vbp2C@B*D5`aXUGCYSi2j#KO5hD^od8GBeHMP%_;d(eHY$>rH zr?Acv9-O> zx?O7BUTEDbweG!j?n!I+!nr5b?xktTIf!p11~zu24y@?LikvAkrU_{Goz>kL2G2ZyFn{Gev(p z_{J$tCI#P!(F^q(O&z|Sh6$>6$<|)5^+~qAWnE$5 zfHZKRFmO^DIQcLQuNpQBSlRn`t{ux+sq27b>wuA>FDDcoco31gPQzn5yn%cB z9=m;a+wQa#T?5M}3xlK5;Amm+tOS3qvoJ}IeQoF!xNg*W(7m(8UsV1A9fiOSDX^mu z*e3<{6$AY+Ctk~falM0bcU4iU&GlMn4$u1FnA^W@H@Yng`t>Ht)Arcr#uxUYYa8@c z^lgKFh-YjY15h5zf??f7nOloCPe}t6MNdyj3*RU0dx{;MUn1-}_?cSPeH0kJ&)+HI zLAS>v($JB@&^ZbIT<26OjJjsxcDv)L-PmGTu&#UR+9~~4LE2%BTftav>&{!ZKW-kD znqPjfGvEAj{@6>9TsZZ(8T=^qK1Z=Fh=1VC+c7M43>P{MNgaot8leV;?Cp4JhSaT7 zFcM!F9D@k_GW}GGsi7m}yKo9t))y*K1}F=(HAZNLN>~CwXob|msS-?$qhMkr6SLI1 zeEN}TPtjyA=UbM0ADKp~dH?crk4(EZ=6~bsDgr~a^%UE>i@}~^M{lvKx6rjq>e>a< z4>JT#)1@ZLY`<;2Wi6NnCDUNh)>E|g7F&CZZePhvnTNij^s1|q1`Z=Npr(rdYj~M# z^-kZ=VX5-kjiy?~WkDnSd%z;M!xL4E7e)0nW22LJJD^kossmhV=KzXWQ0wfrt5BNV zTl45yckn$E=YPU;unF3OG1p{MW?`pf+PTvEA5Hs}$9?*S@NWSi28U%OV44h`gT(JP z_?$*nC(*O(9d}OG$V01Yhy>+Tr@A|6m1$~?&dem{>K%oY+ey~Jh0KE6ab==}M?fJN zfUJdEPek)Bxl5GmLr+lEIpft$9;yBeCrPn*gFC#_d;99Gs|Cky$+3H7?7_}e$0$4m zTb=OATeLZEpSgAB{fY057oDDk@+%VYx%DaX!)>JIeT?p4gsuPrxstW;<7h>?ScEd! 
z$-Mc*4O}B5{x^esMTW^>2SOCX;cVEd!k|wIF7ciJhW|Fnw)25XqZIrLI zWG6`n)e^)bxa1^R4?JS7c}a5IK$dY9c8`jPi7J3Tu#mWRIdmrBY!cyrLpWq&(S*zhMe1zG(tlp5-o5LR{`DziJ9A zziJ9AJgayp;bR3666OA9v?!~TgewGm+aS;6^3iYDB&-ORVmKMU0jdIccEv!21LtMJ zhbZsLhrTJQ*-ShkhE?I4c3 cr7d~?_H_yq>#rGjE$z*BgukLNL0I|!0b`j7PXGV_ literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/__pycache__/gptq_marlin.cpython-312.pyc b/model_executor/layers/quantization/__pycache__/gptq_marlin.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a9cd5133fea0ed46ca3336af928b111b35215600 GIT binary patch literal 36625 zcmd7532<9yekb^FgE)wb0C*F;3Gt99>Y#4Rx@C#hWw)DlHyYx5q#yw#^#EED4Yt`m z-80n0iKQg7qAJdsp6c4BW7U|h>fKf+sck1)HF0*b6+Dq0VBV-V>S@nxxu$j(vefBJ zYHN3Y|L@@eAoVEAJ#kf@s1M(LzwiJ5{@43=PN$8-6F2@=Z2Ucr`y~a`r=s9_WHfNx zEsp1SU7SnkhIKmf>*M-_Vc5X@hPW|d8a5@&!)6vX#x27Zh(DLu zKfIs$jq&Fb2S)TdE^%=9kdD)Hf?GKJflgV2;Ujzx9~3MXZ>Wn?`sJ&IV|)$%*J`Ja zTKT%~nTC(^y-%%Sz0zjB9;IL4`}hX!`uWB$Y`+OLoZvS;wTAWE525r)zL{^~ThW7e zN+VZ&7yo3NHZdNJ z4Y_>&Wd4or-X#>Qly z8a_Q09ZxBdDCHidY>`A%jK{_!i3uTWl&xx3*{k|a3aJYd{PWSIAlDb-gs%y)u?wlQ zBhk2Uc4~5RLQDy~T=UZSWojTcE%2)gqfhP=v2h_PmR06X3aQ8yR#7CHN{KWOAM0Uk zI!c39@+MK(TOM5)lo}_LszgrT$x}*0C!*H`v5+-5a$#a3DMS*J5sVbQh+GoHaUqTd zJ)$rfkB$hD$*34j2r0~;L9QA@&-iQO(L`({GV!_~iZNc0gA`jR&&C-+e$0&QQnN%x z*nEdAa$w|wFmg$q+GKHxnhNm~=!`P$g|uWOP92tO)YXVy5+YY7#7oIZ>fN|-rIdui zlY*F-N(qr9tw2)t6-uem=VQ^N++2vn2!&{>kTo(f9>I2th|%#&NjZqI#YV5K>5IUJ zO>*sl3(;}R5u16o(5KWHlBvUP_Lj z6@?%5=?Pk;xr-FL=6MY{W(+gN856Hd>!y)&#+){$&C@3MEW93Z$ieHU0TpMgsbc*X zjcY1GFl|U%(xy>E3biQDMYHNF{h}Y%Qb_gPF{tOtOhwAB#`5S%TCb)YvwqNw9b(DU2Gq0zA9~4GE<3Me` zQo8EnjVhc&M3YJSXA5HnSocBU&S zK3&0Ec+2db!cjwHe+mjEQGxs8aNnIws0TIjl&ABV=5VX_4%IA3!#l0LzFo(8XZYZ zh&`d9Z-ipwse#RfGNJ$Y&wp3RsU{5;QhHczS7JDurc$Bv;zX1mi6&Fj>V>H>foj3& zA@H$iN+=XbhPtA!PsI3S=*1ULhWl)+hW+ALawqfASk$*@X9&F+xzM+&FB7Ao8;6dmV7 z3ARe=sL!y|plsmEnhAW_z}1cA+f%ac1=*O0CNIfWb&#?y zCfkx1Vxy_Z>(O{pHZy-xHnAy}br+L#Jg`M5IsY)cF_oCyI2oJlO+`hV&b?Qsw(l6c 
zv|ZR3kDcE*c?~;peBf*0Zu;i@n)g|TjAFfX~0LDK@z-#9jzx?BKAeUsN@ zuhO;1rO5ecYUBbCo|O0^`fvdMlR-@JESIn60*$x6dGnj!8JgWcd-aAV=dGHz-t3!w zA!l{p*drk{INOP15e$prkhO*-i6bPy2buwPw$-hfwwm6ZIirh3#XU8or~Kf zZ~xLc$+IKt-Xpp9{HgHMi$A%z>^?SUdTiyq!5dSz-njY3LjTQg|Ba_(ZhvlI@UG{h z!MUTqdwSk8fBLpRQ{A&r2j`EX8UMz6j!k(3=k0uG=A7Pq9ap(A-^e+tW?y`S`Qb*2 z-~<5zC1DxhXr1x~cUnhA%z1|S2-D)fx%y{KLe z(K+?ujcdS#6*U!so0<#9vFQVP&G5`rq$>a{2xM3o$Z#t0-j4hKhH^%oy~4^|z0c{AqcWZ%Hvt*Dam;%bNX!NrI$0Wd_D8(Ys=1W%pOMmU{f~GB?Y<`JNM}QoRlpiDX!m}f*7VcAzEMVclzRftEaNO{^OI?^E%W+`e zAD{|)zN=%YSbgB;h628BP3vc?N*HkphmrE6b?RZD{?6!A7By813&)GkU1anxHJ9>x z$EY54Gy1f(tPU;gTlYnEuw%HQtQL)940jMy+Jd`8@!-AaUfm}OtJ7X`+9RZGX@lFDK`VP_^sUilScBBpLUZE?_j1#lxNIPo@@2)8LTxFvS zX!PF>OuSO-2Mx}b=14znY80Au)!&`0qsIqYnvW4RAlD;>nLk*@3_ zU8&faESr_vk#a240WF)R1#tq{D7$~q(#(B9S z#$Synh_PM4nU#Q%P-pQZs!V#}lppGfUpd;{%Cwf@wU?3mZ7Mc}R|k)s$dVSCA!JM72+`Y30A6VQOOZEM# z>UaF}SO1`CxhgzYao_E~<+$mX|N7$P53jy|b;Uh6d+dSJ^ZSQxow#{op?azL$6X(F ztyJxjoV#WZ-*ELU!vGvulo=7og= znX0b2id>+6{-uS1OrUel{;8|#?ez8Z{4oHH@P~cx_ucgXTAaQsE^l}~yWyC$;n*k5 zpX~pH&zyZ}dBe-guCHZX!;))w<<)OxUmcZR9bI;f%^B}|0yk2(rf*I!w5aH_!~6CN z*I$@7&YuOM|MG3eqJ3%q`;}7lwx2n+KP1#S@8rCj@?Os2oju9m(JE399q9}Nfs*@l z5X4A9IC`H3f+&?h5J)p$Ip8Vc*fV+%gOqkHsUlWf0&z2hThoiV<^kSML?o+vkb=Ba z0)bi}rGhu5^;$InZ>E1w$DCPfr9A493_xV#84(CROIZ)9)^r!aG>t?H2p8vq-8VGG zy0(YF+-+$)9k~}l4qE&0mOB;dQLF`C%0Nc#+OnlX93(IY4sToZ`crQQtIqJ9#bcQr z#$R+$GpMkdkaM++kWr#0JxR&v2?^1W3q@MYrr!Rop)MS}TgnJADqtul=4|OD$WkET ztRm|uB%>}~7>!QFQ+o!>Dk>h#J3}hNLjg`iVky=|@GCa1^h-IK6_QxtD)R*C<_jrR zfOloktqiV--^8*hHwSi+SPikn2vR|2Ht?}TxKg=Ae2WScVV4-CAl)Zq1BR`DDxTs5 zau_f(Bmo}xQ*Z=M*jWNp;uyuQhEC!Iii^RK9qP14sOn1;#UK=$Le* z3fw6aSKy9#6`B4G)iQO8%Ij+!B7SAsS`m=#SDFBMd-Tik{AD>wc9DQWad(5-P04zPV^9OFa;Kdck z_xms1dhOA&cl-P@W+<{NKR)?omqCCW%nCb_53!evN`MQ zkenThn={UiWoKX3*)KW!f7Jft?vJ{coqJ~wKd^fW@If{!d>+|-r3>?`x5}z0cKS#-#q$?ZS&fhTq z@^V$%?8#3pz)#+(Ua_=bUpkv}?uJ}l^JA02S(&$jNcee$(dkG2s@}zyvb}qx-aX5{ zy&1>eU)VjLI_l<6E_M9OvEvu6hWSJ>-L*Gk-J7$!6kGxg79U^1_vmvh_P@ZQ3*4}d 
z=Z5vXZrC6gwH64m8W%-g|BimxQo->d-Y8gk6aJe8Fe>ohivJaYO|Y~72u%ZKD;EjH z6dG-_P<}5$h?74HRl&JwckcEG>vM->N=C^kdY>SC!?dX5~G~ z7*O6{mR7?D(7P>b`v%BP*OE04#2V}vWj$Zb`flcHF!Bb}UR&M+v`|;pLLNRZLf2G5Dg$U)jUXkX0 z8?T|dC3H9|L`xc*nw>%5`OO%BMsktDIRP{cbdCts#vYtpT%>rmsk9lym8dbGtwf5{ z44gz%V?mAuj@ols)*VVg8T;{CZ~;m!2My70|}au zW?7|P+8}7H_B9d3C*@>-R~JPT$m5!E+V8jL9IiKSb3P;oTocp8ZSx@ zgv$9OxW^O3*~yq^d#24jAuRtM*%Kwk%EZ(-A1RQHWiIT-wRH+4OxYU2azj!iBEE&r zi~oY0e@V_Ea^}dnPR?(U^V@J_D@mM0PVPS?+g^^wr-VbII3db5<#Z*!miRlA%>b^C zcmuw$O*STl_^48MsYccctCqD)bt@$+B7lr|4jN@5^l1mFIN3Yjd@Y?{2)k zF;lYv!S3o@b^W`}+s+$rBGlQOYi`drZu#lQrf-ZR)ZUZp*_7=$ zDfOJpbPs(J&D%}R)4IIDQek^?H=uE4H_(xJi@T-^>j5FkxTf!ey(}AmZmt>}3C$2f zrZG4*=_n134qXh+RsUqmw6oYge4e#jf&f~3t}K;0y5fa_VAbWZ5tEr2{bC3XDgfZ2V)8Vz-tZ%)nCunp;gr4O-1trXO)Ar7xcZY5d zWnA4EYxk3*r5>}Hv$-LBcYwb{;5 z|LkZRo5J0Zadl>_ovfD(IbX$Krt@3bA1K4PHPrbvPg+Mqh;es{vPs~J=x6mYZd{kv z!_Qr$I@UZWWf&{oZh%j$m&;g|i%vG}x=dxch()cP%UqU=PCV_=SjKd`wQMxNK-S4w zft=;zw>$+^bB$kAN`&udseV>x_p8LqiNq<3eRA&`e|_p@5}BO*RnDXDJ+)hZ#gA zSxefBqawsZFA3L@5dduGs+e$v_#T>2u2&^YKnq&aS(zk+stA~sdzPJ5uWXhl&-%Wy z)K@TV$sB6H2J-r|-VVvzvGDp*#o`+uKcDe-EPId4S#s4)^RAmMi(i|wJP6ce1D#T! 
zbJ4sMSh6k$cFfsx{_3p1L-JEO$Ktn_{d?zZIbR^_Ym-qwWY#7AXigvCH2mg!9lyYKC}TwRg906@6!7tjz`nq`<(^iRHllIXh5wXK=nP<7m9+_}apTC9Bjj2;be~pY%$*zlL|S zqW+?^$(P{mNR6hvYN$$?UeqO71K zhK=k_QbzEkUL*SLq2eR%~$=#X@G|Z3Azk=IH#|}9D9ghtLj~DM~bw@VXF9rLT z+OnIUmo`7Y96XQ>Lh$EYuBv8!8}7T+Ufg#*ULxyvEgBbvrI%OsoLcss&NxorhX_z= z@uj81E4$AuyUu2;XW6?YJCv^CC;|y%6%JVhodqF9ofa-dFO<4HwSb`t_JB=lmoL6D z0KM8diZmbqUhN!3pfv-gu4XUI>MJ&5F3E;y-L=*JCUw_{Zf3o-9b1e1hVMX_@H>#_ z`3}SiXITui1PDhiV2N1Z;sc5na+FECYvv+h&VpQt5lqYiwSZxp>ar+mE#ghoEPhB1 zLkyS5N7SLi;*a6JuPOi}D2zXjlBWQvc9zmJeervX?11fRseBl54Zkjge1~@apQ8})&0uRb*eL}&7c1}TmxH^q!TnNj zKQ{23Bj;<%`dTGl>%!q>UpG)ke{-R4pqG`k*~&(#vT=TDxw0Kn3&s4$ zmwg-Ote?6d;?^m-I#*oboUi(UJvhJZp1m1f05L~Z#$Go+ooVV{I=l4h$44^<&ru#n z)s2z4eG9EW?0K(8viBkCQ&)Q~SW9$(g;y4n3olB+Eg#oDHs}en)T@|fVTmvz1*>G3 zCGjWM6NvjxGd3jUTJvDrtcjtZx=BHXET&!qD@XhRf?>TvrL#pRC*#+cP7R`naw2vOFkVw3ouD7yPgt$G4&8~vn^xClb`v)|% z-u#IN%Ghq2bEd(H>(ZXxgWTJOw~Zr)G00+n!w$(0L)rk5I3s+lemp^UMl(R7KMe#}A zL`WZQ8q-RA7~vy@K8{zjelPh3*(20Ir}uaSI$-*Qbc1(FTTS zFI{@YZgRdw4hD(yOr=fz52et+W^n_BOtF-ZNGfMwUF7Wk!-vit8an&pnX{2|FXHda zfuoUwL`R3NSrRveRM@Tn`emw}dLdhqQ|C$g4VXNtBB@^mLMl`)*{bTc$u_2^$hNrn zCsdsEzW`4al!j4j*a50I3K9<$!E5J*$kZfMIg_TL2@+Ak1X^rJK7hb%Z{#G+ZJ@G!vBy%g<$aZ_Ak{h_kcwIGi&(2 zK59ZI|LY?!lGtqiPwoT#=09)Rdtkd+ITl-pl|tuCBqG})(48e-8vKq3UA5wc6iXz+ zPmF-99A;}gMbb0p2cfeA&*1s~&4UVXOi`-w{KQ0DHu5nr^9&G!6bTbj_8;*;#CIr@ zov}&^@R_*D>>9{6DBB3)4=L`C$hkw#kI4BmII@0xTsEI%-4yBiA`%`c5^NRkk@Fwn z$j$;UW<&s=Fajl|+lXaE&gT?ZOzca>!~8WR{2eyN-+zz$_r?yOiB+MjuT%1MGC&!Y zeBp;CgWEiN6v{a}_dfLPHXVl!O?BQx9xN!l7Us2bzF@wBc_G2!&)b>T!PSTIPUfxT zyn!1d^IH}S3uhO%yf?hylIk{P0-GdHf8ND%xVb>>eCvF2zU$^U^BxxE#Fo?|q$;yNb!gnWEbLCfh>Q!pP#l!iD!5 z^BjViEr&m+-$GXS@_j68wW;bsMg4qAs%ZV(gg_w!eEBUd(>YzvUwiA7o3Gr8+>GQ6 zETXSp@jR&TfY>HgbUrdse7=R%7ckkD&fRUjo4nig(KkQm5G>RP-{+f*rpkwoD$~Bj zkvxZ=yJtV%@{{3Tlb3 z6F)(1(=p`+2`xs_MPDmTpsD0je)-xNBI+7}fr9+ODq2;6$*c&*$#n&_c}R{yop78_ zM$QX#9VROR3RX{;+t8}}91tA^;Yz~;go`-;@F~w1p;=lSOKmhu>sD1lX>=r+q#qLh 
z3eDn@%`PA<@R!|S6r`3Vm7}o=|GB9ICd=>ZMK1JzAP7nF?0}HXOpVZn@F|vao2cXDHNyQkJ38EmS z60*IJhv~FK#1GaCk_wY-q$n$a9#-H#&`|Et{{#54O6sh1C$^F-8QnplQfJY+Nbp7N zm>F9c5{7#!?xxUkVK-EQGHV{KTPsP1XswzAQL~o#@H#bNm(jy&(wYU0YXEOzw1nWZ&+&|N zT%R^VYrc%2r@1~FbN=kDq$^bMZqdq<){immWPwLarQF{nk9f;RGd+li(=qiQvnQ^B&)yBW5|6kZ2^qL;@ z#dgQ0_HM@W1@*d^P`uWDS!ZUxupb4|0>tv|@91Z|&(OZPG@FG|zI5eme~R#8_V9iP z9$I6%wC@MH-_gxfrReshra+(T*=L`3zGFyxSuOtY%_$-=mFh9`mG9__Z4`GFTBkMs zGOf7YF{-VA*g|k`&BMFX?y`4ZJC7&r`NBNjbk!H;@umHxJm4xj(sterJjKU*?o@qH zVEqB|7BMKm!@!X2EXWl?>H>OThhewI!$brN(TY)mKnd!sidTganQ;RviDWQwFe2y4 zcGA;|OitjQlawniM8VC9Kv`mkN>8Zh-QstG%WyL;?LX|?eV6R3AyRJm0 zC(AQCx2rj$u$>xITq(B+3;xUyV3W4}{ z@^w$u(t&${IY!1){DvafJ;&9v9+lrR3mRvrVc9t6k|z*}e-Z z3lM>xHO*^jt9Vx@5RxQXXr--iWV^Cn5g4yX${ux>(^|49v4V2i;3R44%SbC_UWLk1 zIU>T=5|~KLtko(RoAN#0w>sOWRCNuGRWNI+xc0707r2=FGS=o?ZCj@6IjQ!!Imje# z$X54B)qRU^E?4iHbAH;gHP;kg+?8qE{=nUpbBFULLmSDg*ED3S!%}rPTfIf9-U7Pz z8niQ4-H;1_?@BTjSzMz7n;!e^t&n$UdR)r|`!aopmIH_9><{dLti4IHH!Y;Hokt}6 z*^d<1rJ3qIci+hFJ1gxwyW%*PbGWmP9?8*@b!?Ixo0jUb{fDIfLn{ta9Y>nIUb5F` z?VXanb8+)MdoM^f*_sVf&4z5vR;gy|_2Um}TIY@fUv~7es)$gNb#zFMjzwd(Ylqae zBinUQ>N@!G1PJ2}8>mK|-5Ez~uDN@0T58@QIT|tO&Mp?U#bcW@VGVH)f8pi}Id=;hT0EES-7EF( z&GsIb@aI0R7PT1t)N*6@L#HKZo3lUmt}2%cw4igPEDczr8q!m2&Db~G_ivQ^XYTfA z{AV)HorZVr@O?i74^4r}TwOi=HMV3MH%g5gvyHo?#$69BNI^e+jSnl}yMFjFs3v{j zGO+xMx`zhx_3mZAbB8g1fpDHPBd)eiyTS@2qW7rK4&U72yc^=TS!+nLh8AiTkE~d? 
zf3US?T_tdOpNij?Fzx6fB@kg zEaQaGIkp}pU+eNx(tD(9p~~g(&^gdBDNc+CkVP)MzLDrC9exBJanA+EYaVT}%i5|t zBJ4$S(_7ZLBR;iC9wSOtp;BBNt&X?t*X>zrzhv!SYX6yakd2usg^}1-bhY*n!E_uL z+#{l(e7WHRQ!Z-fpItoxPK6;;82&1L^s8t`gf+}H{Ak(4ij<3htvcvf>KF}^uq_CE zkbnhQ-pRVUBv;p>e@VBrW!W{DQOElOWD8d*Q0!A`^FNXEpW%Qk3L$d_+5QU!T@jq~ zkQ89H0wF)n2$Sr>t^m0IlM4P{?7C8)`sKY2kS~S^wj+u}XTPTL$tTN|A>tPepf-t@ttjIn9Kw{Z5o>Sbep-e>{Uu6=8s!|&a;NA#2Tep_d< zJ%Cp8)$3Q^e)Ia9c@st8P;BkUTbS3%HMgUNihKnN+K4J@XI=*vsL4B-w~`Ci(?OMY zv7i@J%tt=vea@&e?O59Sb2(4l1>3(&qjJoq^$&!o<)lwIBMe^AjvttRJ{r* zk4S*dFIP^eBMhyX76d^g)Qo~(`^vQQR$rp!YQ+j!6>5zLu7pn#fGi534y6Y z4Xt7%)7i_;-+v3w8ht3RaY-KvB2+jz=mEus=$=LJh1;xANhxme^7&n0-I}mDd z#kF6E@w_muDCIR_PKpK=HORT_YBW+#kfBAkN7P-12R)PAytnj*)}KF;czLY+0iWBj*uGeCje02rAaH|ROBX5|o;(vK*E$F0K8f!%S*EJ8wE`TrZ6kjUHdlGiX9xVu_ zMf@!pO1fC9mM#4f^0v;L)|YarK3W5i3yoy;qO2!H!l#;#<3OM*97#}SD2j!FNUpos zT*{^TV4TQNA~>a5)p!t|ipFBUlChIYht_^tXY6UiZ!~J>I-|CyotmR|e#N6E@-th* zv}v7ns$8eXj&!AFk1N089@|LFaGk!o*6FJ=?b7V4YgJ!Md(FjHuCp67fNQL%JoJ_9 z%hC#csr~sI?N0YP^XDojIG4SX?p5>mkAEJ~T7UJU_N+5%cR2~YY}B4rqb}_l&p)AE z^TnrA0fk$2CVSVJ2~XOq`TlrU&BQezUcxT4xl# za;8n$uB`fsNAa7jg@2tM)3K86u1l! 
z+QUZ}{A!L>(cvoDFgY=)@N2GP0N`LHVa=M2c#Z}{moeFNwi(zKw8X zXMsaP*R}ouFs{^@QZ6;@BMg{pO5QA6A)p90$JjNcsR62G3prcK*-p+NIrw@F7)|8h z+chwcMGo~VNt`O&?6e-`vPU$!a)I&EWp6d>$$s$Ea`Wko zwedlT3O*<`4rXcxKj-wdww$9ov+2?27f=r(jlGxOqXb!+ReD zJeV}hxxw0~n@?q%Hc3sJvQ4`s{JD3-GGnG{KynYLY=#O&udObWcX7cmWFCpvfXOOS z>l>ErE7sljt=02&nU28~>yA$qF~Y3BQ}TC~x3+Ol(xQ@X*;!IJ^c*bUrJ@SAC{UzhSa8#>BQjz1A+!YKI1FWdkl- zlrJ;j`|EX~xCbmo=um=o0^waYzZTY98E5E;=I;BdZw=lY%=+3TU;C=PrVgRJ&Z-X3 z)~jrg2`{U&cJY-J>o!`m>P+ia@-}z;u=>5~Z1aHBJWyVzW^i+-J%eg9H5)Vi2Ue^H?}yrdXnoI`4Q-Y}o6GCnsMXiz0o2=!dK*^NJDAz|;)?YY ztG6$+>G>7wfd`&|qPL#)ge6b-x|z|LR>@HkxrorHN5s-vm*1mtH5OM{W-d|t$}Gp} zwB+*CoUnUMq$&0=87nnV_NzsI9D_BQf2}L~HjSY~b%ZqssI#jQOt_GF)XKD20YET{ zNUNHU2w9p0mFiLs)t4#~b+ia7cS;tojb$SxLW5dC+3&NARLkDe2ZcITtKHyQUu+$- zU{M>wh&4zw#T7A^^^@e%)M917&(hBs%c>Qzq>B=7&#)q)vc?MDuV6*2Wg{h0;**VX z&oa`du1LkIS+JHQ6PeU-#nANb5@)xFg%HWRTuV7rv|rR0nv4thR$2)5A45>}u`-os z1$8Cr`e*beE80e9DG16O_Zh7`6q3eR5+O-Cau^=Pa4SNl!k#i5(l<1bI+j37^qm}m zXI8n`w*eG{>?&1sh#Yz!81^VvFkCktgU}2^)tKxMF?5;y4?c-C0s9GKQxj83k+n&v zT-a8^ALS~BBNwD&)K3_!j)jpVA8%x{(e(;rsuhjtFrsgNc+!Z_kUB`59JA1eR*F07-;PQJ>{|1 z%(pKzWHudKu^xhYkvGV`3ggc<_e;(FOIwIxn;keJ4IKHaslR^XFW*@4ygFyTA0W#m zMU?ZX)P3~cx%f&Tkqumx0#}y<({pyE;`+rmR;;@|ZR{ypoos~F$vW^bGTnRcrn37^ zN&8M^_q`_Vd+pxXo9I>3e)OuYP&l)Zpbl9Ak`t z!;VYAr~^qc;N}PsG$z+EMg|q})uqYECD{(+8)RA|jvJ2bCf*b?w1}=C{gP}WSBa9{ zCEiaV#yb|DI3I=TX&F<?eAAu>v$bQx{p50Md zB>VQ49HbBZO^Bqn$87gQCjj5E!zcdg6?u)qcnYr;5}DweY(weT>r6aCw!`QX%!fqq zwZrqWyU;Us1!kX-3c`>`3<7VY(K~tJT9Wyx<9;;(?UX|aXwjc(P zUqf*IJLNNGRRKL(Ei!7g+y{ib?eyV{G||L~2(ab>Pp8l>`)pzLOwCL! 
ztc7SGR!pC_Zl)fqF#&7t8ab2?=FK#e?1E^iW~FP=^=}5=3_?`Buy(Hj*3C4|G|hx& znrB+l%~~JZP}2cs?lX^O?4?9&UTG zj&z&4e`ngUhkjJHf`b~l3v2M^Z@i8(9cBH|nz44w00{}Mc@Rsob8DW$jO|?$jo`D- zSh@)_)`2dsA9<2~<$V1MDgn(0b+ zm0Rh#_)ljR%J)tAI{i0{{ElCp2W8w)WSH(hbn3+O<@h+vbkP?!zomgQd@FQKNWmyr zG8UhjZh4W!TF%B!yc8+{OR_*zv_7O*2AKYaMopppWE3N$K;Mu8eIX|U;TS0EJ!!@W z9U3LnMio55Wul<9*(X~GUuGcP0-TpT(A9+ta6_8AgPU zhBC^s>|$TWDTu^C+e3CjMYI%DEI;WP`Im`dg2sQD>h6XEQ|}YhgpJ2Hdx3%^#SzsH zMN@o}M?t|5O{vHPjfN|iW&O|)>$37SbB^1`ucc|od;e=a6vRf)!gh}?vX4vb^rb;@oE#^`SHc)GtlkwRl`tJZW7U;p<|UMmFPURkNAWRnxt6vV*_)%! zOSAc)+`?60j*1EjQyw{wx=Tw56%Fk|Fhe^zCA%pVb5B|s%34)v?PKNY%I>D@=F5b^ zXln5VXuPXTsUtBJSJ)oKD=K3&$bPD@pjaXb{1iT!O<&j8htlD2=?<%WVuNk{(ynJO zL22tpXzsp6j>bzM3Y0KP&KNlt$cd3dB3oeB;MyQeLg{au9J?Q=ncw>Ep4)pC`(f;R$&}r6M8aR~k<77ImjkcO9r=a56HUPm zgr}Pt>meK7J&$2Su}&9^tqz3{k_ z^8^*UNV%HkTzzLQ6ejJ73X`Yuq0{WF%(-iG?vVP|kgw$2p{%P_ay7ReUuA z?ZI~Ss`kJR2!T5_ATM^OIj%?F5sc9QcNw6~)Y|D89m=8#thd?i{e_X`{ z8?u3ZDbSzucEC=S!fu0g3&loPE?A#y+6en|*{U|Fs%@=;Q?M@Q3+7q}pt!@-YU){S zVJQ$^>|PFRo3oP75WF>Za}2|-tHkU& zV5cVQ=#(6tIcTT^>lK?~xthjzn{GGd>OyFWww&v>D_7T)t?R)C%hqj|>bB?Vy3w~> zuqD^I{Z~N;SR?gZUHhV5s>Pz+Z|Tam?3G&fqS9=jLke^(Ov5l=%W_~3gjI7*P~8e) z1zRgKj;8!J&W-)v@;J=ZHD_zLO0`>mv#xY*$E2`9VN->l-adw}G;UKGXH1E(>+q2B zfr6nSSS^|uO(mjNiEfD}zi;TUNOt>>Whh)N{t|KGr{w%6a{e zx|hM69zwm09&;<{E<6lQT7(9XN*+v!DK zYTBG|5RJ}v0*zxOLD#?tx z$q%&cpW|mpiPI~A@XN9bACx5Q3-r2VU-bM4+qd+Y$;oTV*O1ImnFb1|*m`1&0m80i zPck|RW!rHe0Leb~5i`+(cu}Urwx<{~W%mmro#N~8u*)Y}sepmdIy)<+mr8iOD`b@gR2gyPcd4ayqG8B`Y@>kEY1_byBe~>SmF2 zi;99yNS~r1Y*cP!(M)QLGQ%vT(m!=86LCbYW0_dL5wE_ag@JTaj(WNeQSrW_VeUSa zY7d(R#Yp0-DYSfCCka3?sQ1UHLV;H)^1VpTFgcgWVeGk*dLjd4MmBMxFj42j|x|D;y5aLvnuYHKQ7i zoJvmD^K&lzbME-hxy^se^=7!I`Ip{(?e=SV4&J4+?B`?uU)29u{UeHbcvx4j^Uj;zb=-F3 zIe3>0>?iXQ3>G7piH>HY2`QR*M4^X!H|V@K*k^6?9K4GM*v}`Xzjpkk;}ON=U(;3V z+&QR@8}R$U?tXjl`rzC9uJ6m65C)8Ny7LzDSz)yBQ3ZUDx4mM}?aKtik2&%@e%<`4 dUbiFD(EFGp&*QXlzeU%YsqOg{M;-<){(t7ZYYhMZ literal 0 HcmV?d00001 diff --git 
a/model_executor/layers/quantization/__pycache__/gptq_marlin_24.cpython-312.pyc b/model_executor/layers/quantization/__pycache__/gptq_marlin_24.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d26dad767f2f85e97c82934276b29b7d058f37ea GIT binary patch literal 12232 zcmb6+!1mR*cx-j*$^9dgJ z@{+tQ=8OA7{&*l1h*yWI<29igg15(N<8`4rnle$ML#%%QvfvYH5FDXK!5M0Tf3v`b zTEx1UQN64*1?*U9UWcbmAx>}!PI$MOs56w{zC{Th(euEpmxuNetQW98(N(~ zXGjt>Y35^zMa_18UWv{n!m)`Ut=TVxrEpwSL1AY`-!>4RCHPkl;U)1IcU**SmpZo z_`8MHwZ;?I!ij_!BNZ-3!ZEQ}R)0pCTbQp@61WhK%!)!8E0CO(fk9Z}mBo2ce&&VF zKf8&^QB3|1o(KSirz#I^AApJa3s^QH6`}zAn;bg1Dn6%1(pGZ!W=z8jWe(Zv2?efpD)`kRZy!m`Xw%}Ja= zFrYcVBSxpMDg4!_B5RImoMB#$CPmFEiOPbMn26*THW2~m2=m)dVUB)CuTe@tvK32; z`WQC8G4eE}Sd79Ms{s|?C8f3^)_?^1Q`3yo$Q9oy+StZR=9DQ#@deH)^Y?5iv*I>r zuhUYk;xlrBDP{f!3=ZOO0c^S`l4xXF;^CyE! ze132~I^VB^B^cHI8w&^D8ks#P4#uKa2j>^zU`h;LkHzAH@i{?^@!}0JvH+bQjD;6P zNggD_ls~Knr{|UT`8d&CFc$;!i_#uwC`meOXe?co4H$EblF48eP`|rLy>L)=_ezy& ztKYOWe(7n-c-mD@`%?ruRZr)0#$4<8%1SxiEA7j(PaKW8D#{+nIjE-2Wy|+HIV;81 zrkyS8jhkTf9TPnM}0YpK0|js?RqEOc@E%lx-Dlm`P#P4mVi4)SzK=5 z!Fq2WSvj(~_q6Ifov%6k&qx00Ncxao<(1Xhq6p2c>(!s_xxXjVc1VR^)1hU{rmY@i zYitG~-7lsyFwEP-rP%Lw8cfUKcq+48Y#8%BkSs-oUn)bItjVnlVod!F)1mJK~4D}Brp`U4Vl_*wYGc1`mpw)?Mdxh%g$^~ zeWs>Mt?637{*c}H?vt9aWydc){!hNX?s($qP1}0)>BZqO9z1i%aQ!a;FkBTim{2ee z1=DN~MWK->Pzzul%Pb8;BA5{iIq*0kXCX9}lv#Q&Wsxom%uH#E`IceG4S&{<1>T04 z`|46-0+tx1V5rX6OV}7~6bw<2>y_b?+y_hO4S*sZ;NFv?iD^X4iW2yL13^YYttuVB z_o@mKaF_oa63`wtYEN&bbwF($NDm%= z(t6_a57gF6pxi*Mxa#h7q}kTT?3ML>53TCn0{}hRmp*$%J#-~!rtHnnZ4|plZ9kCp zzmxG_RQ(sT-oP5W%KpH;ZvKqD&;Feoj+=mc`2-x#uE6bcE9dT*H`(TFpmuF^b@T^g zX|`?46VP=6LrfseIxmS+(HmNQX_rd<`Z5Qspfpgug8Nla87W#RC>LYZNlvA(a#NJy z>fj2JjzD!dhaFLvc3BB0BBEpil!QlC&`i!^>05{hK_cU>D9nr(HHSXu`PpbM75#*+ zV|Nna;LGTyfSrbx)+gepp-pGc(?IXW$tQuKG&_{d4@|HTkABp=Jg+%;9@g#x*vY)c z^4M%FUsA>M!dwKTDoV$(>=Y(v z5Ni>l5kPZL8K{>(5&wgDI^?!=}!^s%A3OxF&5)_uwV8AmYl>#iMo`~iz7)p z7IeZ){q7d^OG`)2GQhNK)zs%0c*CLts0tw)<#7Mjj?lMhGvm0^zD>akY)h*zXQCM{ 
z-O%?N3f|jh0|S-yvgVEI4e=jle=@5E4ofvCxcv{Y@ysPp z_rHb3J%yS*`C>EEw_d#-*?4pP+WpoXh}4%JKKmPd=Scy|3+ZL5x9rV#6xH7T8wN6Y z0)TQI3{#!0X&uNc6R%|XU30^?SA49|Ca6bAC; z*bu~2%n%77@t~PP44_b(nn7Q1$0IPJU9duE&`zQ@Rgkm6-!57!dEiqlMub8vMx0)| z1*gF3k)V*v4gtYGsID8@Dfsn8(IHw?8P8&DNpMy!ow5nuTTIBa$OipnlP4p(j^Xr8 zQXUm@I8YHElcwcE1j1Zi%ah%Gh{Hq0u=(|(yoTm<`(S|7)7a8lW!4WWE8J(>7+}HU z9O4g!P8te|mg0a5nsj}Hu#ENcXN2w@wv{TMQoUL55&s(NVbld&gyx=Q%P(vV`XWOG zr`1Oj^9x`yqmK?QFgnCa6l14a?c9QbMFk8Yf+$(BhUU!EkdPUR&U9SX4ekgUtHy&z zKB}SZ5x}ld-(^x&0*$L~$(FLd+U}nPg(kU+iX0+Y)*n$nHW}*tBU&l61QuWZKv=S; zAfhvaI(^48Q;0tm3zBg9f@y>;b(OSaN!cNSg^{k;@IqSO`Yyi#iC>+^X{ zsS6R&;^ghpMs<dh~Zx{oiv{w~QsGYrnp0Q<8u)9cKVQmYJ*&fhJCc&Gi-*tv7)I-^q zT)$_g=)bYrBm*|r~isx@+90!TS=K0 zX0YZ{lTo_sx~imoaUE9P?R$+Dif^2OC9l$HU`qOk!O$H~%3C&DFqS(X=sljp@W31e z4A)`F2M%_xjxSAx+LY6v0avv$OStL;&64!T@|@0piqVm^l?`g6mA-BDqF_F8v$dyImflQ+ZlI;~KOM?8!Zg0NLqS^3=G+7Ad11kylcXDsVMvF04 zAbdtTyZ=-;a*e~d{(?fTgY`hHK1Px#gLlo%B{&7fPzHC4o1S0LERh8v3_(V#AYPAx zS2ztVK0~7-SOwluSP5%(y&pm}t~v6v)z<>CBAhc_RXDBO2Q3&2o9)d9BM z2M80yl8(Tal!8RF9EW=V&3YW&{;)>Nn*BR-(yR>JMH0UY)=Ec_t|~v;Tj|sWUcmpB$V1bhGqD*7N_`I^HG2leLxu3j-4oG;hCgHg*4Pt`YU1O4> zG$%{wlxTiQoCZ%)w-#aK@ZR2E;xx};t$9q|#{{Dfk_?IF7jGz1n9ncbu%bvZ>AKdi z>w?t$iG?_iLnrIY2BuT1dS7RuS;)w+`jn6*AZxxnd%S%YdAOF5H7kK-&5u_;g)t=d ztaJ+mlkreTE5H2F<9_Z4Ts$daUg{m;&@zj=oNx^p|2HTX2)a~LuKH~KzKtVl{nwXW zS!dm+U%Pks?%~G}0l<5f(MNy$Xd>-BN1`VXtarKLGN-dA-81&2<4D@pmUVWlE1B*Q zwR&&$6SKIby+Q!tju{3w&(fi2ncqucQ@NhxC{@0U_rCh!Y+oG~98Ma4ddp3?gW(Ts~ zo{iy5@2J{4n&~~G!q0o=i=oHf@h@=%OWNSFTt|i*Qn{fFH>z@@={<)ZT|}9Wmdo6_ zH`6kpwhUxi4yr8&;nKutvRh@lH+(9)@44C3>PXufAr!!Nrn}!#oo_w5_}DoHy-K%S zeB!^9W-mcBC(!lOSG(4<+JqvW%=8^r`wnON&ZzK15moPa)*HI5}+ITSA(q{BvL~R+#Hg>;g@dO;pRWCXyZ+nSM$5e0sL!av1|1hk24}vfoh{@Kq z812{@XooFnrzzXgiu@W}UdOT}2mGot9!~Xe8Bee3>D`bXdxoByDVKZ87u=Z4^pC0i zW10T9RrvYd#*4+87j6pT9p_fhWxdVV{koQH?cNu5rUn~o?S1aCwZPCeaSB-A%M47LTHZJk?he_yTa*a9h<&wII4Spc-sS0=B)D{uYm;00dF&gV6FVNc0Z*2fAq 
z-)ry!J5PVW@-a`GhVQ9dlKB2Y6b^2FS_%sgf=3eRdzg?z^v@9b2ou!lM%VnCQo9>*tcjoSyd*gSe~fK5M?;tZUiNJ2&gj0-k-^*uH-1 zv-9`QXF5jIj*%~z&Blpk=3|c@JEw2pZB|&06b2j`#sCXhW?zzJ28#<2JD(*>ZMKvtrzlC3R9+RDNBk;SyLENso*%xRC4d_DQC*@Ab)SQ#HQFvJ>R9@a1q-`L5)gd zLL{D|lOV0(@GrUGLoQ#+HB*!CM9Hfd-U8Rd1(%wZ31Sfw(_TU@L8rbQ$R;4$0e6CC zxgHh21K$T-3(t$189uAj0_B(d{InR)+l=T$X_l}c@IiNpNp}~}(%)cm8 z*}CQQq@7JrmE4(GHrbXfPtCGRKT*hKl~=;}u=H(A!kApegj}+qk1id>B#H@cdt#?c zQwX64BmEGQJCJDBNiiYMNz!MCIfThDCiuw-35=8EW5~&mA-P38H(3~4j-{Dky7S;R zT#*EG*6-5HTU#*NH&$-k{&3~P9D@ijT-v*GRRpwAZJq0v@7r^Bl6BxEkCT8bRojqr z5ztN5HKUcY?IGEt7KS-bKm6LyM}9W)^CLeylA|#Hxw1{(xmv0-_}spisV!?5@@Sm- zZjY{v=BhAfqd>6OO6<<0>Tb(n1vuCpp4-hU%`cpg)4NOV`8qT!d>R5eU%&A}?MH@D z^OG-31`>$@;#;GCL(Ur5Nbo6_gz8ZGTSzn~<_GkPVF`mwS{uQXf9y4&f9y37BVR7- z7mcV)wC-0BlbeJ5rGkXYgWTh3EtRSir9@Y)#C;QAmbvA5o|Zq?SO zsX(^1Elo8UDF^UY&v2$^RP7mk6w8cFs$-Ms?)TDEN496*#)UqX}UVy(EEZy W;Kig_Hqo`|J%hgmgMijo!v6zeIj~#+ literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/__pycache__/hqq_marlin.cpython-312.pyc b/model_executor/layers/quantization/__pycache__/hqq_marlin.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..def2b4be934f9be7492c5c99a306b40c355dbbdc GIT binary patch literal 16942 zcmeG@Yit`wdb@ls$)!j=C{a&SwqCR?DSjoE&vxQhoH({Feq9pQ20?RIG9{9tXDQoK zsK%Ew8tN{o<7=;$+q9N@*G7iZ#s&(+D2g^Hu4wMAXvN3}G7A-OaaW-J$D9=S+8=Gd zZ}ve-S~8J_02C1DNOD9jSKC%6zt%htFf;S4zwu8=FihjcEi4h>Rh zJwO{omkwP&W(qbw#McH*vUyK(Oy<6tO2Dag^lsCdW)kl?v2);=fBEkEs z%E9|ak^`|p9VV7$4q9dZ!Iw`Q4IkNa?C^{I@OU-+@}6UR4j(>ncv%ayo_OKdfj#@f z{c>F*LgKMxI1w2Z!(t+p9t#hKW5PwbT5U*)Br%#6!)IaPF_=o93qxfX7Nrt@{Q1L&kGvcqk%X8Q3E)IVMpN|19mi=(E!8i>AW>M- z9VX=QZ4nYqkEKK@T22%WwX(1qXzeXHEg}Nme%T$e3pUXB2IKb%?UNfWy@{#1tAxOT19@Sp&S9FjPpW& zXIcLW+TQ|DR~gi!&O$MPHomOQ3vF((QnZOx115^8O<)ErLPc4hYQZDg0J|E1djVG; zoECm)<4TN%dRD^Jv3{Y7;%gJCVTSrFbCC@-9A|<)+4I8Dqsn$F)HdFzJOj~qM3Mpn zBP0Ml0!o*D*>ypT4W3N{futqbIf$DYmVmrrzlLL}aAHIl1saiUL`;v8WYkiI5%K{1 zU-(PNzQbN)(t6{hCBwX9Wf*PcStbL!Io!oClKl|uo(6xD&9H)5hwD3q@C*|+X%l8l 
zP*Zsh?7KC?jzb&3bbQj37i0e+UK>?~c& z1Rb&!-U-Kw3x_E`3KOJI(g>OC!c!O?h(yyPgw!F#IvPu+x5`fX0(zoI6Ha55Es_`? zD2=6H*%1!Y%@2nq*b^oI>C%6^)|*bGdQ-7fcRE4_#dP<@(P#R$4?iRJ#$#uCQ)9sM zlD+5S@kB4sfEW*p7sco(V9*b4ITN0jkyM>K}%Ct*79SnegR&|L{)Ky zWd;|667WtafRF}~8$$T9^ant&(4rG#sTo?#e`tkcXXzhrT=sjrN}IE5+`TbSI$FDA zHW)OK4w#2-aF8XPP?iw200}~lWI)6e35SiAQEAzYL$LED=0QbprtQPtTfMWv9Yu!y zN<*Qc+UX^%?>A7551Qx0MwNQKr+6@jIlvezifuhyv5vNWpgwx zEkO#XB%B2PzY-~}Wk1YAX$+9^)U13Vt|49Vz4V18hGDe2Wvz(cvNUKq4oLQcbg}|f zM$gKYvyu3K%$*sCk3`a&o+g`MKsow;L{dN5VK`13J=8M zB9ehL-e!c{@?Q9RB^P8s6W)$lPv*2@BOTj{J;6NnNJ3zm#kQzBu3Z2dB?oM8n`xmp#K3NVGL zkR)4^$k^#QR(LQgyK$`s2xyqY;+Pb)knzkYPbG78-C6!KPu06$nRd<`p7Zq0a((JHrHD8XySQYV zkf13|y$OZ4OxKtUYqaD7tJx3VT!n=S%qV-2B`qwI#;vC*YXXhVbeS_|y8CslLYo^l z%iJ%sJxh2SVz&kU7m&|>hkc1*f%Y@ZX*dKEpj}Lw&NIKqUSnD2ZKs79WB$y1fhF6Z zE2V$nDyE*NWS64;P^Kq4qa%sbND^efB;n*BL=|T3*f}UB#fvGjdwk3C0}+$LR9=Qr zn@Wj7&(8P==)uzN9(_;*=8{m*|NP%jB!Z;d=KQSh(9DMHOH(`U^X<95L;o(}dViy$ zl0|kWrGPUilHD9hjs?wBe2|0Cg&e`j6zrZ;u=6fQVH}m3at*O(d5=>#$pOd!od>?g zYp1TBnqD{O>&QBbHpW|%_XKjDz>HIKBP(6=k!cj(i;Lm;EKIRUiU zNu{C1a-40;-@8@kWv5v6;eOl88)r+1?^;$4j>7gOtdNR{A;8@oM~Or)0E8P zOj|f$6?dyhJ)7Rj7hPil#V2$BEjM0C}udQOr2JYHEp$zVGmq#b+y%F>&3IL zFbJ@dz-(N(+RJ;0I0;fJs6#e}uYFg1FLdA4Rq#~jJ#9Hp+w|A&dp6G3^~}{hHS2w9 zzIyHS*177gSsr`PwN-vaBPtEZmcwD$8CDf{DDz>&C$6^G!(m}03Scv6kg_EnlhU$T zN)xKHk#4|3c2QkX!JeR2O%OTqG`3l(#3X$P*ohfxA+kk?MbiX#l{`=DXGTWibPZ)w zGAY}RP$PoiR+Fz`2e+z_M#XqM9KOYpE`U;5tq?6iwoEfraHJj}il1L*KC`qJd1igj z=MJwm_`v5cTHy&42_-w0IHs;~YRlEwbX%^rqlle~PR3dBg$qk<7OQh=-2wv-n8a3D zG_h9SY&}pgJQvIw1g2BkHqO+{h#wB$8qNiGV4sb8pN)E-s_8A*RHZjn=}leJCuY{& zd<~ns^yV(L*=pS}T{9h>*)o0hW=oNQ>g?77Kf|Zm6-vblyVW@@6&ZNUq!;i}tg>5c z3ciMGuU>uiTKHP6Spllh1+HQg{`I6CsD(Iy%I$3M%Aj(v zEP-un0_)N$u&cJ4q22bFb_=xI$E-n{TnSv~0D34&o&q;$YJw<)6}h@Zz91&3Gb|v{ zfM%*3Rmz)hsr>FNw8*yM3t+uy3|T#d^*C=hA%a;WfYK5lO(fNKjBi+hMbE08&*A%# z+gxG_98Z^Y1&&Dc1~`Ix-w7=Qag=b?HL>oeZzCfYs2UKDgJGro`tkLvU}n%46=RaX 
zjB51St^$CjRo0E5Z6hJV0Sy}u#pc}bc@2ryRgfrkxd7u!@9qzBRQ&BAuXDXV=AeJyg-Y=n+beX|(>Ao}iQRZ%D0S=G5Xv4brQT%Sa zRG&Hd;i+4v7O`ESbeu%CQq794IcWyKlFh>3O*hLAg>pGlQc>PS^Jl!he;4l8 zsKX&4FHgqarIHFUgp6ca0&{{L($=}GO)w+poG_>Htuz^NN5PUYj{|P2%t1|yK@)hc z*deDjPwDxAO*1tn&=9eYgNzf{anLz376FWrMwnON5DI3ASzw@Ufpv5AlK@OI-hMKv zo3a6_Z|DxDt^~NZ182m*)*wOF@Q`MxSKTs^Fg#?NAfAs!#fL>~^*{zT3UxOrG9H~v z+d<7d8%c?>m3o!QZ(=X2!qmo~tW0E)(qc+BgCj|IEU7HGVl+v;R$IY33Xg6dN5-;j zw|ka$`3B4?VFuh5Ri8CE&zk8?pL*J6xg)b3N9G{+w4v4Qy6VbW3!bVgW0PZ3m6MsQ zWxk^N%JAgy)TXKQ_3hKg-+%4qYcuC=o|h3r^Fd6?~yUec;hFfiKt`gPr^y``aw|M9~web8nz)16l;Afn-Mdgcbw3q7=!5 z^_foM!|;y%kdW0;UWd=3fVYEM$>Am z%&TSE0;JZ7^WYGY#f;84jHNEfN`~+yMM&UVsR6v4d^CUJJ2(o}O0ZVuIXCqtPBqUs z@9fWU`|dseDYviC*rB)z8`s>}aec?M@P6`UGS|56o+ZEiP!9eY4=L?Ufg6XeADT|T z|HjQXa!t=rx1l0nP>?T?Cw2fRf)+L!HyjpP-*RYEQ${y{(dZm9Yw*&OdL0dzlV3dxbdcYIvTP?J-FosZtU1yol9W;-l1Qqtr%q4`k)g(L%Tm&w zNQp8K8BkqRy1;Yq(uy(@UJI4$Cwr_LG~nu(12pNn84t8jO&6LH)iPb821J3V}U+)YPfKqfLmoB z$efT!MwW&>P}^^bhM=p2xY|@hSj9ySOj}n1p{tq%MFOiWAc1i{ny3T@dA>pdoBPST z(1S|gphe-FsH&ADD>0ybDoA?`f7I%}k$W($)C(D9k7cs+qxE;!-#dD@d(QtHus}r} zd#vokgNR$aa0ar~muZznQZI)z`3B2`^U*|Qi+j0jftv!VoY0k8xkyO%7QXLH-J7Sr6HgB@#Xyx;1pK@>b#=s1&6oT8-oX%87{lK483ZT^`Fg+aI$q zCEocluE&#-%nDxTSe@4)0hc+M44K;$_QC8U#9{|z$lOe(GCqanB4;Y`DJWuH=TlJp zoPkW?Q#jcYKBXx&3(tWPSMFSeFh7%sK}5P6W=QJ`b7*PEtCdPmaM#c&ZXMttAj%r@jJbkU2C@#4HPOQ0{&7@Rd)nJ8)kHuIf0ZIap>S9s{WCSIuCE^$9~n z5Bho2F^mb6YgIc~htNPPh`K6`kX@RsOhd4!zfn@c6sTXUKU!`L;sq5|2b1uV%YB*| zv!+hN&j)H$5)GnK z59i}Y1~51Zug205nHzzi=mhc^J9;)_$wAqr++5QaYSD!h2PedZCWSDT$(uM;HAGN@ zTSvX>Rs&=zRgdoZn0P@&$W6-;DNLiXfaTI3OMiGJTs)5^x0GR$8~7$O+?Ugv z6Uwxeh{!66p%jiFPD;EgwOr`s6Zr=I;5Hm<1yA;t&B|ho zutM=TAqS_gCDT~CiP;}x_Fc$ebuWVRO1&!!N788`(LI#y=M++qt++xEEvw)sSH=dD zuuMwG58fL|vV}%Yq5KC|R#{SlH7cS+#Eny?lg3*>vr=T8DutG||07HzT?KN5(199e zefMnlzPaFj^ksGwx}L*l`vd>_yuT;s@0oey`}~~$NEU9?7*~vQ?a#p< zzrSc^9PazvnnjMOZlABLz1DcOu>i3i-tL^Y`%X=M)85>sz4=Z3Ir#JT)0Uo`w`ab# z@!Ig!;YGWpsse7?>-+`fuc0O1up!s5A>Z&!uHhN#Z42i7!MuN4&cChD(79O0S39%z 
z#U`e*qx@|+C!M@A=j|-iu3b7YjH`51I@V<@dA=>jw*iv#-q!DpzW@5o z*Y9ll?r(m4`o8!07gn>o;&U_fH1^$qm}VBf4LNEiLr__sjKtT3sbu;Y-4F<6HjmL= zLB>=P4REK03Ik0>!rK@tyr(b}I*gIh3Vn5ftGx@NvJFK3ZM1G64b>^n^1Kw%Ghs=i z_Dd7sXCPjObb+e%17+D<@_PO+VBb=jSu1Jc^-Hj~F2O#ajV?WZt+BVI*J+rRb|k}w z^&MWNU03zs_L2?jd!MmE4FhOt%0L`{SXbrJXd5bURuX&7ocV01zl^h8NMKrtLTR^O7J@d zZG$LPBnVoXip;`S$tDn(s0x!A%)X5omCWyB$pe{eRX*LRqK`0yPo&{n01)Bu`9*~~ zzb0pRwXTR8w^~!Bj{R&zLg7Tdk5i*rNu_SkUXpRL8Loig6OZzDIMrD$gwM&GHlys& z-jtG1fmJ1zBCVPsKmxFnG;tM<44`O56$7-1@$Tiq>#^O}ff$qUYdN@_f&1LpWGr9N zlB;ORnjtFA-+b-$tFKRQ`ryEaFW!3bUi5z5-Yh?lp{yHnz74>Xue><;kjH7z7f~kQKl~-Pxd@0Yj=J?j> z*6E{jeCK1Q>&o$6aLZHUeB>-m=VBa{aOdNL_f}pxGwPZpU`-I8O@-zTd^QwX=(F*suBvypO?BLN zwN8ug58WJ^?btcrvguAcI_KtEc4ZGBYVUj%WPazaSyw9+Rx*2$63mioZ9KSCX#F`@ zB1@^iKq#C#-7&wCq@Ej_!*%1(=b|R zLK-d%^Z1b(%YeWT>n}0dnz8*7qit!OK^l2^Fo*zF%)pWbF>Qr5V6jF*5}*ctnv7-0 zK(wU;+KLUW)k>u&tSoBR%aRhO526+*WqAT<$8s2mXz+-^zv;FrOeVM|+S$qsXSBK* zyS{r|h67K%MPcYCo)6X#d;_5j87YT=SC}FHfZ2~QqjuZVP?GK1$0L7_@V`Tlx17HZ ziirgIA-16#i2MLM*rh0mq@!o)r&**O;csGw)(c!>c4GM$X7@2eo(!J>XkS{Azd_(% zQ{V`R4bqRfWQ`SrX(XYON4FCczH5!7Qt`2IyofkLKe|$l0}6f~`KVv934jr`>6~z$ zZ-c)DyT#|sIv@tNsxDvImaA;b+6rD@-n%yET?=tFHNI;HuO7@g9=K}Wv)*uDcYo?? zFL*1jS*}`s_vE$HS5MD**JbwcAuU%a5S8iDHmX(gzC4fIdY<>^`Ia2t0)wv{oILpM zp2gyTaSHir0-Hqn!&C_FZ^_#MX9(d|OlULlg+hAFP z-sMi^5bg$Qo65Br(gZn!SrjvRpLPmMlw$rCOLLeJ%;qseO+|S@5KAYu3`t4)3^EL3 z?krj?5W!sAzF>uCkw4B_E8ugfD;Fm(UU_5kjiMC+V5Y6>DB5X}V_Ms%kKc3@9klAi zkY^Vy@=R@g(M^jWCYsO|D|%?P3PPV3s%de^)N0)gH+S3bZvW`nyU!LGsD2DF#qcQh zvrK#Nb4RzeVF?CM2LwQd?VQ|Mv}2890K@oXeCkB5qP2(vFuK}vrD?Kh(FHZ-SK|*3 zZjf#8VHKzVN=&!~U`kl11-~aGsP!vn%Y}8gWWsN+2}%sPl|q(&aoD4Lao7W27s8j> zN`O8(DU|^I&c}?P?u^(t{Q#4OvddeZuy@ZH_~Kdp;EABgM88~+cRcCv(R3^>^{BcN zttd=~vLDp;6uk@;LiySM`J~~r2*LZv-Q>PsY!=1SqhN+|Qf@XHPaV+cxFDzr5Fci! 
zu*+WB8x>CVlWv9ky@G8vbwlYn8pI*=e4NM9Ma<9@qa2N;$3^CLDj(eJCJ&(t@GwR~ z{s^Bivg{*{VOxK~@c+uJ`)6j|Ba?+Sea=Akg_B`lU>}*xtmzSFgF1s*g$Xhn!@dkH zDz#ZO0c37U;?NZzN_n A4*&oF literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/__pycache__/inc.cpython-312.pyc b/model_executor/layers/quantization/__pycache__/inc.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d058b82fe6bd61265b0f1369f7bd56f5750fd48d GIT binary patch literal 2568 zcma)8&2JM&6rbI-*WXU!Bz!dq3q^oYVv9yC^aCo20HKz&B|<`bS#5U5VZ(m7v+F9h zI3ScmrC!R(J#YxOpd8x&qdk~{RJx*4BzozkN^7b}J@viWwPOQC9mz9q-u&jhH*bFP zygw8QSpv;Yd~N-hCgd4TdMz0XT^EElF^MVI#1T}%5miyZxoAtO1fygpoTQp`WL4(+ zgq?EIYTC)D89qaKkeJmi@$Xsm?yqeH!i^ z07mK4k7Xi95|%^0T2T$`fll~Sx481{n<_{0D{3-~0>(L62GL|ivTlhhKXJb)>hXm4h~ zP%N$}FY6%dd5CMw%y4hAnO#e$B_7o{_|6U3ouHF|OwoKCSAJk^PqR=ihMCI?=c7cm z=A%(DY~A;jn#YujpPg13zU3|}m)(Hc$^vaL-B!+fPMuNT_gE#A84ViDHG~)9L{@YY z(6vcZD2ofBWVyxz@<0eB!}fiIEB{=r22QL0Dr(;Wr3K0c7! zWNT!i3j!I69K+&~9|56F0tE23BKD-+aamY|^B}jA5R-8uOfS`JPY+JM!;l%#8QBDg z;XVw&Z`A7^3#h5-MxdENvrhfi;o%@(g*k!7MxeV*ejA-^7w;B3BL_R_gKQ6)mJLpG zV=%z}F^Yf~JjNY??E@9uLh+pMQ--hS0%IO4i)l7Ju2^&Jo($UcK^2|@obo3X^Q zL9ecn-b-8@-Xo(?Z2dixK_=1=a!Xj@6|*;w#?9F0B54wyn6eyZEg-xf=&nH-mIj6= zW9XcrhcN4SG|op^3KKrHYwR$tz6qU=2U2>u8^v4a9Mv3N-&zND6ZqoVTe$sg=x&qe zGRg1xhTc;j=Z-%eKk;aMwv(TITG;z-^4{v>0;py;uKhfB=Hc9#N8@KZ`Lj<(#(BzL zP38aK0K;nY5>ZPSmSee^q1W{#%eI1MYic;?;nnxi$|3BKjRySsth=EGZik8}z%I4{kmzzlRv5k5T%V>Gg?qV`F~(#{HRXfJ}e? 
zF(Rk?$mwl4m7I7od7yp$?)A2ISKF3wUOCCft;|@bbo627ST_mU?NaP`dVT&SD?~0~ zbfxtp>;C;RE|<_r=}%Ph>*0qikA^bTKlo>flE`z$Q+)!` zg*t_c(%Wt=^#M^x04ach$hCp>q5;+cQY1eM^bf~$!Ra7D1r$Y*{$ijMUF?sx=gx3Q zTBeOQ=w4fMU+13tI_Ep*9{z*bY``ETRDT+~(uHCFf+}*-)C+sRk7L*dMqw15z?P&O zPbwk`97`&fR5)r;B-BaGl2$A$6S|~+NiUXF2}9DjWE9KlgehrWGUJ#6qs{Rid75Va zuwGoU0tFT)nmuxfwmyWlptNKcx%j18>sRK&C@4+JNZaGDwf0KoCD;FAv{oE_sLkjO zN=LgWeO`@Wc_XZ)6S(Q)qc2iY26?>->?}qZ?_iXP)<0BAl=4cKx>oRj`8(7Y3gCi~ zjO4OxI!UH8S*Yqxi?ziJlnrN@bS`uGdN@Nv%@)Zq44q=hOd`xi(@gT3#5eJlT==?P zsLj?3x8D-~v8awR-%&3ae{<`KYA6fLV0!7Ef84rNl=Y5k$y~&i@g>V;EZ`7y@1QH3zc`iu za{~cMtj6**5ldxqEWy&LY?>k1Gyx>U!3@m|OHD(BSWe%dQ$qw$pxQMgQGuYy=gRe1 zmXO7b$mTMcG{a_zt7-N+k!8bdEJB1+6hW<|!bvELf)vX`dpadC2niaAPB4mm$t)68 z!Jf(`Nfu2@X2KK|OI;J3k#r)LOp(cOJk5~VNH{@f1${0{lR0`SIU13{2T2Q37xcHc z2BP`b_y)$JRx#bx6tJIx8p^7KV+Fi`kqWuLpnzIk-7u`6gqo_+r-B-cl~O?swZ^Ok zMcgX)G;?_tpw!8&1uSlp%gvnWF5rxI0U%Qz^G7)FY@-OkON3xFc12lGupKg&xJNEE z^Ojz3vsEr#SKP$1iUkbj!Wr*#yxLITIEZ0cv_?hi8e3*)Gd~&wE4Iipt|$Yl@E^dn zi1LGEaXJ%b!pXY0Ov_>e!q!e|_K#{2n(61nix7JVQJjc_7w0aKR~9eKg@y>RI2XFS zcxkA?3@rmD3^P|_EE8r{h^vWog#+#GN*})VTxd{)1W^fumlxLf;xeO z6X__C&at9`rGX%pZSX=9BJPcbI`cL%K26NS(xlzA>JR9n#9LAY(BuX)#9iV1_Q^Cw=Z`I-z9uRV^(pK6PE%+C`QlpF7z_ohf*QzZ zM$nODEEQu(l0nkT7|{or5XOu?NInEpEK33@DQ;GB1=IDTM`UX3pzUnX`-NS zT&-YeUajCkz<_v3AQFp`^&JJYf+|ZVqAy>Co+KlQa5hVlSuj@^I!uDN{p;TwVUwAW zOe{0ZhM8+LJG`8mJT`S>k{(IKu8w3@*z4)k$jw9|IfBkCLDI`~B*&(ikwkcfX0ju# z=Q1L~uCy;?SA&@q#tw7k?eElaIw+qO@_Mp=2cKWv#(p?}=`DA)oX+!1=jA(kimE>| z^4+f$RnK%DzJDC$K7L>VpU=KEc3HUMy0|IVA zCuU^ss-R(LHpiqQ4Uh#wbnGI0{|(^82Ea!g4d0iTvjxRR`v8+tP|5^&nycVr7F2P> zXJ0~KA8$Z$vlChypxSI@WgiEvp#fpd{zs~Zb#rhTD<~)|hq$)^cM&)xm{orCW_z0K zD73eMd)6h-7jFQ)+=5|Hp)Xn+XZkec^KMkh?<}dD(X_`4QgmI!- zmrIW{%@YV{L%;U7K^AZk!iihqm8^{D46?T1j_N`5$s8L?WCJS34Q(P82x^L50o6t4 z{T&DZ7Enq+(T6CIi3!*lCuG7fQn%pq>K2sLE!e7VL2(Xd0FCKtfj)%`}tY*SYHc6Rzr)TX%*>tXa;9SLY zZtcXkCQ##^s&j&KPVhE6-`%%0yE)7I4_5uhIsfr*ItbqD+X`+5Yevj9xrb?7mKvrp zTWU7U-chvte1cRBXid$)~o#npUMZ)G(FJ@Ph%fIjhzI&N{%G%)H6Y`wmonM>yXR-rdU& 
z2C9RnxWQ9D7&SJa*XRulLcI2H5Co8TDI9?JvL6mWWE-9wFs-Ko{t!tOq!y)Zg&(O? zRIR#iBq?nh8c}H*K45(yWNZUs+QCXgX_4vMml{f|Xew<%ar@t4FF$UcXaRqfXBtO# zXJ4X}0BrR$`T%rwRkf-HI7K?QFHuUFE{^nfUwXA?D34rK6_g+SF7R(aQlGMfLTA;RR!$kN1qzJ4f>nsB$wpfqCmr{rl4uN zmIE?%bKMW?@1M5o@yqm$mBSmVU6ACXGi*be+Ec)a0y4K|PHAP%c`(9ItFLBN%f2B~ z#fRilGcRZ;E&5l`MpcxK(m$+gDB5{J5mms`*AQaRc9}u=eD~jw!0af&Lq}OMy!-%V zv6N_#1YIPZg2!lz{Wp1vK&NDAIwVjBr?c*nrr-&ClWu1g7_>h@8NQljP`tt5)2F-iDj5H_)jwV`49u3pfc@E5{1j z`rA5-&`PvJ@R3ZBMYcK8PV=2F7$d2Jgpt`VL84!2j(BHh)fwcR!JQ8)&atv-Y}e-g z|Fts67TSv^cq^EiKF-=Fd$3QflYDp2*3r$QA5U)0Y|iYAmu4#6!FAo5dhJ8r)4OHe zwC*S>o;RwVAm<4_%Kd!x!D_{GVol9EyUWhk`JVo*3!4{q&Xq1ydXBEo?RvVl)SK#$ zwOhtbNJO&1btDXtYGx2%um#e=326JobUAOm>pK$KMs%waI4Lut>S9T3m zT<42(H63QP-J|YOe|2=@#QKSfrGI;_>OaEyk5v2}3UAxDPj25T_l`XFKDk_RoGhN+wK~@3iutmxOES52rGpD?@xXV%fP%@z z+j-z3fQjVKe<&O|JgZ94XKdwKC(A<)(LYdi0Eu?}&;~{Uf-P-(5wxix_@cZq39?V1 z65_S0VUar}rgo;XfK$qXCaR!Rlo~)-h6)IL#P#yHW=`QRn_rH_t^(}D0q*xF8G_?& zj5~!>+DAGW-Y6wHr&k&0Rb4^HHr^A+xUA~oRNA}}m>ITVO4_{>T6)_AXDHIH8CU&hJV>VomQ4|E2VHbh{ zMfJq@U{)|7t4LF*otYQ;E=yfBC1}8|BMWMfB48vFz6AknDHg3;mZH*>pk0>yg)CZ@ zY*)oxG_K51XwLUFENXk_;-^5A{amuF8myyp+eIQgV;Xu+3csY zT*vt1cesv;Cr7@T{bH7LFCfCcZU1K@pN??uv7%wu=q`Dm8VAsbX|7`!OoPq7p?Pd^#s>?f-B)13Wu#XeWm@b2zUu5<3!U;#$!y*GdQ<~LowE$^oH{@iB^ zpDuuT%|FOLIaC>$tN7f*&koL(?E@A2TSW~#T}wAP=U`FCJG>jg z_27Naqp6DHNYydPIVQg}R2-*^+FeK2#?bmuiP@g6ID$nj-`HYrDY8AYy|6t~_650) zk>XjtPMWVc4#7Zs=Z0_Hx2>qy2VlYVnqlYg&Z%wd4(5bn*tL{NC z<=azg?W)wRV&=#Xl_Te4<4SC`0q1fFgu~ zpb-6ppFn7m6bvK@3YANsvWY}@pQ!g}Ns>xOpjmY_olZ#MO=T>_ilISCNiD4;eZJeUPol`k?1mcd+kOO0}-$ zzyia(Kfw17esAtqU&ME9u9_MZAfDGj)U;wzhxPPryEdn5da-K2dDq3`uRH3*;W-toJ~YicM1H=Fxj;clTsan=JhEo#zXP7k`uZ5QVl ztf3J#17@%mb0XclTOVwGP__m*{h>W0>YH!y4ye5i-ca7p8#O)0dryy=ejD#s_wn|Q zjh^+Mni9&pddrJ*{U>a`PrOF)zye)2027ej`{PU`9;A9p>`Cyil%aT5u+t=9!HB8u@|ARpkwH3u`Ek7;)y)Kq|*^2 zSGwbpOuxz80!k!t*+=lHDRCU%(_y&zpRv9&*7vX2z_-}=x7fvRu{YpqECx;qBe_?96ffW1M%qjCF!YD4e%FRoubhj{Kl9iZXL4{ ze?`m|w~tj4zbWR3JI9=H*O-fh&9SO@^;k9WTVn2b%~(ymcC0q;8S{`hYpgEr9rMP0 
zV?GkL#r*NWSb+HLvHEz!SVO#VtdWE(W5IaSSQGI(Vq4E#P zkF|rpYOF&`Jx?*!S188KdhhDQ{*Q%7Yz@TLvQ0{CCyDhyY#kd=V!KGJ7h-*EqY~Rq zV*L;sV4X^A&x|fq{{g-vq!CQdg=4YsnHU?=3c7t$GlJ<@G967!g=3?TT5*BpQV;^K z@o0jXhHM(ad_2sB<7}GcuzWPhUZNAp6qK(zFr8wVqluBDZ2D}1*&j}^LO_aRUuL5d zXVWJm;TU^zI+;vxX_kTV)pD5=)8VP~bBU?(=tM{-SdK)e*f2+`aEkt-NND^ zSa^oz=+wDrGD-U6OtESDBIz$3PNzA1nKLrYaqLu@PR7FN@dOv2HEiqM(YrMwEq3y7 zNrCk_SozqWn&h-w3f)MX@fn3hI{H32D_uY1L4|&AM3wTQjaD{jf9CxSpvg z%2O-0P3#wxuPsXRFdk^#)}m%f-pOrG#=-6}Kudg#m$co;_@L+hdFqm8EO3$v1q92X z<0G$#Yhm^)hel5g1*IiNR#q@VR%$R33#U@S=~Q%TB8UZoFtx$-SvL6G@t68uIsQ`c zOgM6monpEzL2%KjNNk$Hys-4TgSeb}Nx}y(-xQ|>a0j+w_~Lm%KaotI9~F$S&tNMF zR%tn%iKbJ6Wr9mgC+QR{f1x%`wq=S=C+Mj}nud14o=poThD}B2a3n3bVsZK`8)j%_ z%LqnTw5gf6VC2~JG&dD76^##KU&KJ`$DvKXuem{`mBg#2IqG*HU0xF!YECmp(b#O_ z=Clx!uop$m=^&(6()5=!DXoT@pdePp@YI|ELPjNz5kjzXMK@OgA*E#uylGO&Dd(Z6 zS$I3-Gs`jRxftyQ4d+eQ$hjD3n_6vu0CePgy1O_V;J%LURC*xBA!H0?wL!bp@|D*% zLxn0vKL|n%AaISDSpnLt(SNqcOWM49x>`8~Y2{zqx zX?k$y(78djFBU!1mz;s+Kh<|37K``6N@rsUcYLp%lTR$=*YG0 zez$G^11-ii=jwtGXU{cl!(f9_qJGhxZR+RY-``IPKe9TqRzGj`FK*|pZEID5yjD}~ z&-oj({x06%mGN~i?aS*luD*4huDUW$>Fkwx3uSY@vFqxtg#>Tyfg-u8{SOBIa@W7x z^`PhB`Hb_#x=xE(wUAXu*?QNZ3sgi}t>~w4_mJIt8a%)|;R?q1G6yRY_Ufdva|_9k zf_p<{)sxj+0l3jHI$Kp3)D96t?U}WA7vU%j1`B9vkDv*KbY<%#q~XX!VEcnyKR{fT z9Gm25x@_9S$Wh3W+6mxw>QQBI(fyA9wtvx{r!>7K!wL~SzMQY&R^QFOjJI>CDR0o) zd-6KH*}@?uC>byaivxx|zCq1sLfTP5kL&0Tbw?xUBC%AexPh=j=_%lTN*X9fz65oo z5V=7E9W_tB49A^LrcQTcjnsrNvge{I0n@ZY?D@RUN*~YTAmS+e<`IH;L>r~!I*@HRTmb^jdw7|xf)}>$+%S4K|B_d0*WFEsRyaA+c z5?5oHo6|6w3C&%p+$82Nb27=ktew*`+H)Pym#=JC7qFQVTDi8%hz``>X<&>+`Sg%a zA?$_ML)$7QfQgmGkIO*vH^^s#eCB%=Ij4=fY}7)JfvwQZ>E{fLnK929?@A29$&JTC zUl@x`T5l}MwQ<|6rER(lOvwhVx0U4tW@&?*cF0*-=x1>+fju$-<5SejkYiM^iOW^Q zv|K$5u3)%G17{%U2}5W*PZw})fsPB7S(Z!C$wYK2eE?*T3WgmIPsh?ig~SpPLNDlm z6+8uq#E$EFoJ+*%bL>n?FvJoUfm3jn#L`Ihxu>B8+(86t+yn9qBKOb?z7ldTp?NEL zi@4fnYl}8cQT$mfI}8gCaH-3kb#38YTNZaLjb^%bt+;k)UHf_0{uS5oyfJ5WU3=+U z&*psfSzj0L>smUn{PO+r%)rQs@4);EId35A?c{+B>{uRsa3r(+#ESRD`NO%^j%@2r 
zzIA8LS)1|pXX>^s4+C*+tIWW*G*VEPw}A^VVEVUACr;uW4Jn zuxwlU+Dgqc^OjtoF&o&*2evNP-VfdP!+Qc-R|2OpCU35~He21!S7QynW!nQbQ{BE& zePrI0!?9@PU9F4bOVdje_tP0yD;b8LJF0TFs%tMS`j%c?uFLE?@^BnFg!n}()4gK6 zfFymKb0781Q ziUr%*aZxa$BjXc-1r!*Z7$#Mwu#?EAikLr;0NIL*GOR315m}HH_vLJR(L&xV<2Rtr z)CmB_{tJLOe?m8;Bit?bn^-XoAcbSDEG2?gi_!Y5 zzi9a&&Mi*-E|zBjz>+aiw%Tj#n`5iij+~?Vjjvw)>H_<&qj{~mIp_8*3}*v_d|>dt zJG=7~zw^{%oz`6iMFQJb`~w;Hz?!FXX<)^(HRIeW7ao7t(X!_0S{h#Q^k?&}fdgRHG!=Y)xLQ8`JejAMqj7cdKG|eO;5M6_vpijKCZ}b#>bmY{*W5e{(eaC^s zq&YzcWJu7*qA4U>iA0RZXWUcJE5S-+Ly5lOJd!=iJ%ecm!lQEgFm?nZY%qE0Xhh2W zJVxo6Xe7d7^1`OsBwsC?wC&Kuwf z1fWBtlbC6?#0N` z_QkWe8}bwcmyi4kA5ul&%MX#9R)hPIIk1rC&CS0uKtRd>zP!(7aOY}#H(t5^$_@HD zo!4Qwx1T&8nO&Kx7T(oSqvLurefyq&vOhwiOh$>f^vU*Sw>Oi$;VDzkU5^-Uf zn{<{%K#&2>78H*aP|j?o80!}*3#!;j8W2hT`@>0Jh3Sd}u|ahW-?m zlPII8Y9ZoUIUZD$E`J2s*O3yuPUQj}^ZK8eyyA=!4A}+=qa~6L;%3sUw_#snvnC;; z+@)*~uMn+5>J&&Em$fi=@Qi6M>k4E_l|<=ONtDDBZC*BQliw;6opaD0uDU?N7;PaB zs#l5?jAl@is;$6DoQpE2wyHpyQZJA6tjtX!TU6#Q8#gGAIIzr3qSCR@3dZf3=0hy&pX%C}nRlVJacgx(I5!A#ecLmE5AUKKZrp(=|xQmnk++`E24v=-r z-DGEZ4D5Lm9t6y3Xl3L!cw_IAWjS1hDhCf_6`G-^s5~fhs&mI;EBJ5=IOvs32O zdkV2nmt}+?#O_v&RU4~kpUlZ<|n4g0fK5XR#+pGd!H=9n90nj z&;R}>^K)9p0wZs|XS-_^Q(;z>RX9U6Q)z_`w1DpWvc71q>Pz;jp(JGdo>N{D)yobe8$4wb|X2o8wd7fGMM(`k~q>iEcu^!|OPo;ye%dXCTM!6+7TbIb-a{H$I6@aht&CK^F@-1fDZzLibx0{V=)uY-v2sMw zf=xVX5o-=view#}6K;?+0EA`kw=np11XmFJHUey3$^rnyA<2mXc72jdMA%eH8V;D` zaWFHWL9vaernB4B1g+jp%PirmOj^QqG__@$2HtHNn4f-Q_FJ>};cO?bcbGkYW83k* zqxOxpx_j;v>p z_Y7t|yLr!U5aYd|XF}yv*84Q?eLCws%6pH_SHAD?F0_cstb1dtj-iZcDB~E)Rn=yz zn)#~cY}FyY>QK(xnyYQh`9rxv-P?K3_Pm*@KCA%&q1rNE0gU>V_IK)U*F(qorh(-b z`KFxrKLbS?phNwn9SI*~Wu(AAA$+=fI z@9)m~2YLVCT77%Aeh**2=l(cfKa%U{&UWnNJ9cI}_P*P(_vZA2@w|t!zYN{5k)tki z<@!X`}~J2y1%j*z%OVn5u6f{7zoHp zQ3m)CM)c3X1B|FvO&ty=ZCW)(l+hsZl3Ab2dLWosUzwZJ%Y+3OI)hsM8`KUAt`EsM zqPa3Ru9<}xx!#i$_Z$QPb60RI!5!R92y<^C_!pRBEZW0=gs}vvE1NqT1-cHF$Gyar@%_h22X}WxMwAUHdYf&+-1}Rvg0_)37K( zpo9YQ(KtM~2?S>mL=pH9Ad5nj30w+&L@Dq#_yprApwJ19yNA)G$2>&ZA^?Xd^;G~@ 
zsGsZi=JgswXmM;E1m;lQxD909r+zf_!=WEN`@?7R6a*in*YS~0YN*!Ghh~#uMDxhu ze&f>BOK-e(^|ibKBS5^ix!%})b$7mk1Wgp=h+T~>oZ>5+@}vx`TZ`k3z}3KGD`pY< z$v`6_@Kc;3`~EkGdeUI%A0zz|)1Zngw*OZoU}BnMgX23JheI&?)DvvU=MjSm*)0(> zfi!%k^vlgaCJwjLlR|F*7k-&WP?3567R%w%*;LX+36da~?*rN?Gr2K~CO31MdkW8u zW;bdoSt1LoAOb}e297nR2U%F+ih&o$H2exGPWVIu!`nnQjy5yTA<}>vMkENYUL3JlsQ#d7G+_z@~Aj2b850E$imHXHjo`# zWPjnAZj=2oK~Q92wfBbP+=Zti3#-(wCe_|EmB_+~2+Q0g+F+D9#w^dDdOy@;-B}=O zAZH=9BM!!LK=v1&s^hCB;UY3FbCY;(Q|301KR1$ZQK4Ms)N4>36|FKV8a0U?q+UD% z`J6LTSPPST%bEm1#)1R6L2dh=%nvfJ752I9p8YQJ)!?3xc@>S4B=Z*bq@iR_8cRYI zpCR*Bf|;;*#({{GyNsX{!PgMHhTwGs{|td2K|KN#1G#Si2pL3?0GW+~u)=YLZZR=Sc$(VXpO;11irx$^O|40#NnD-9< zT1B8rMFiT;H*H_ux6-r&Psa}PI}c}fp621d=`@jv{s}~%P2?VyuOC*Edul|v$674+ zZ2xu1J%0>xPZ#yD%Y4YKdpO_+iRZ5z2JmwV!~g%ScZ1^o7)zrXmmG(L>|b2i;J%Ng?;%JdmLRyhOO+%u|6<- z7I8dgne{)}6h(sQgB6lOwz`1L>lOk(Gqr6hQRtxnX?F%Z#A*0EAG$F3eI)c0d421T)>)5l_(w=P@;#-F9*Zer}r-6s3R$E>K9o_u% z*S_?=zh&{|`y;FV{qrN=IkMK&xwQMi^lH=5`H@vy1K23ckNnJ5pSM!ZXT(*bj_q-f z$)dRro{bJ%)fNrWYAKTng(at&*N|4GR&6P%9mG!Bl)x)138<0{=&8N!k3l?z@)FrGH(P0lCyT+UPO?d(0(C% zFDkyUwIUP!4EjPMjYYR?-h46VyM@)SJoGwb-s z`w0E98fr`QT@Wr(cVVbZ+l5iGvg>0jgv4Iq+avH(9LfUd*7pO3DJMOQmes2+9h+FF z7>?@+nbHB7s(^Avu8BnWisPyB%5{D70AiF_jy{JD|5t*1@>oR|S63GNxMbxxrVnL0K_x zE$fs@c~}=*sHHz0~Ffr}F0IixrPF5F4_vEk+;$${KUtkjiOFSXpJRr@G zNeo)ri1IaI9uUg{<|*bLB($USLSY>a!LDP1YOYy`fc8XNeJxV4a$d!`&*l z8wM8)t!JYQ!%m4e3>)!ym5Z}XG@NEh6b-p(I8seV@sh1z18+E<67P~k6zmQ!MlyZB!yrBsmkXUwx|ONwe# zZ-+DrCGD_mFq#!}=EA6LkjFGYG%on%GUS}bi7I|KWk|;)ShoN|=n)~FzAu`_W^3B{n)Wp~HGJXf3*_>N zE92hIJGZ01cFSY40rW*S${)--sM?No%2aEax4>)6m044eHw701OS7Q=TQk)y_%f}7 ztEQp%-QF7quOH01+jw_dam8@4rLVlons%tN#xid%uF}60g?g|`Z>D9(s%a2zGX$D$ z4c#2d`g?eP4_12k>fvi&UEFu?r5}9x&X-qhyBU8 z`nq{vw^HI>>koSF^sL&39vUFOzX$UBE%OdwB;^ienzm=28d)_R$eHY77c))UmXEEP zh98-HSyK~lYRcLDtMxmx^?UjHz1jK$eEorp?RhYWH(P)OZED5gtp_`ZdBe|40g>ly zgJ~``LPgAt*a_?@zL+3=O_O@KxhQ=XszrBMo0hdIsy8J7x2WMil|#BYZ9xnN0sOOS zb&BsVhH6KDCAI3)iukX_CQsHB*CzUl+M$M(HEKtBMQ>3#yn~}91!orCr%p$Jk5D** 
zbd#JL##Ds@$o$$c_2oBVUJQlfJheleFEB6oeH!W;aQupJT!%+H^1D!70@S=L$8N|0 z^&|6E=H?8T_L(jFymMRj-X~gy*Dg!iF0~AIN|_T|_MJb}Q$@3<(4sPu@{3CFQMzy0+wZp>6r{t1_ zr}~P5Se4n_V0CEJ%xfX21HAH)vH{=$n+Q7!|1&~GN; zAs89wa7@X2@XAQ4k2~J5Ap}{7W zOM`J}s6mv@{tdiXyx(U=<&AhZ2ssSFLih@>bv(z4H2f!+l@M~G^1v=}KgXOd;7!6Y zx=7;T1Wf!T0wnr^feTMfu-ulXt=fSJ1b;srpJvLEo zUGKbd`;~0#Am2K;2Bw3S>z14w3?;jGch{P)@m9~xo}9lm>qq;*#}#N!X`vch@^-3i z2vkdLm8i$u^4JNy!lEG?9OQ$8|Is&e&G^3ClXZ9SZg^`pw383*%!Z!fL(km5@RRdD zZMr|lhmNeckAA45T!%E*EDJl{>VGS|u$y;wU>ElCoqHeD@|_2vd2lmVs;1ER`{O?e z|I~Fq#)n>5aUa1-p4VJ6FSNbY^wx=mZr<6B?-bwmZs-6QUHW%EuB3vkw=Uhhlx^(i z8~c|#AC6=i`&SxY#O4Rq8k%pNy?Hjz6k zKHl3$JiWZPx735TnR9i4TzzNWMb+!>=iijGIR0KpGDMqLvEn^6pBaz~Vay+0cfo!PNfiHiN zg2ZrL2oR+(sTKMo$xCocfCw^z1xiORkc-8F6}~M2pJt=kWbBNL5{WT1eId$T6g;uS zL=-U@eETVRb|yvq*zg)T0kCNdVxbAsQ|U-CbEYu117wnc7{Jh%J5aJ4L1_c*2`Ds7 zb5>w4CBi@y?O^#qK`D5wL&;E0VQq-#$)7+h_dgMQis0`M{4WIm8^QlU@b?I?5>A5v zJ`hE5It1`>Ke)Jwz=#08>ql`W1n^Zq@iQbc>MMB`spD(PmmL}Qe}%=d&#=OJDKJwe zCoG_0&ylT^vw3qiPtI10!IoXHSDa0-SMVCWS-M6aavT^P<%Xe((NUoyl^9RQ!vjxrnpQZK-jS!^frkO`$d77t zW%oKEEZ+fLH$Wg?2ZkBBrq*10S8hv7uCq6{eJ~e*U}tV{SFUGU&KJxDnsQqQF!Tk?lpUayu1Q&Lz^H%hM)v+&cM_(o7Z_GQ;=LEFB&~p87 z-i4tm%HOc4zuB3u#*mw;59VvoS4)BXedPL)#R1;kp7&rBG`n+Y_;zF7iymlz) zUjVRwz8-xIXmJv_9az?b&3(QRqk>d>SH1~-TkuA3Gx}PvRjuf2Ln*lXX7`et_x0r4 zF{-1aju1w5B6*p+Ik&{{job1#9Qkg^NciI(2#GQ*&UmO-a6}TZn22>!z2P$vvT_6? 
zTvY)SCEk<aC0@ovw2@a8vsMzWHJo$UY+d^bx2u+)m5O1Z3xBF2} z7Vqx2Z&HFB|4O%gIApgu7XV?iW)&Cdg|Dq~DC>Ow62QH31Rc1sSoh~_TQ7dmjYC8( z?EJh%fOy4Hy`;G`@pCAL3tqMIVmi^oiIXdTt};cKZqd*VMRL(RK7zhKK|rt#$`j%$ z><6D<+AVUy&u|TZec+!;-2wm~J=17DGEth=Ur@ncQnvS~`u8aRdsN4Nr8+;>>NHwR zJEHkmtJi2hqyT(uks=RjenGwX9(CwF>hycmj`ygpN7cRz<$P?M)o9!qSMy^EJ&$K~ z`!$*oO~&8#m?FN%hqc2Pv(WdLBA&;i+B!|`LgQl!9*=h%)@m%7+K!JX^pFnxUq!2H A%m4rY literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/__pycache__/kv_cache.cpython-312.pyc b/model_executor/layers/quantization/__pycache__/kv_cache.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..19f4fc56622172eef12a9a968c36b1074f06cebd GIT binary patch literal 6894 zcmcf`TWlN0@$SeYPb5WAlC1~zIzMEKvPi~uWUGcBNgUgW9m{t7YJ~vj6YnTdIv(k} zqhb+M;RgATn-rGNqLR@DR*C?T>i~9u0CCZe_%lHBA&Lx09Q5I$=|_GFCUQ`qU!C2% zBPE%Q9i+&D+S|wM?Ck8!?9AwIwY4q+ie!H$_EHN$`~!8YLYs}%A!y7I0wItRF@(O9 zM8>HhibQRjWQ*H}>~VUC#(lfwh&zXzBta1(B{FwO1IthiMVuf6`fWmRi1b~XIqI4u z1J18eXD~qO?wB0YI4LnQA}Rq&Z#b9YWi6KGwOB$vnvkQhk@A>$TGU1p!b`j=!a`pp zr6{7TaY>2Sq6sA)G4LC;-R!L9p)p5@#1JVELzF-c*#t_oM=8Pfwta{e?4o1LyLO?` z9`ijp25V?oQ}aXCz`-43o2uZl3r@i0{$aQTCTbUIV4Y`@3Aprn#EDavj`ERFlQsN1 zzd0Bw&M7gVJN^Cf#i7KllV1%c;wMD~O{uA5GNEX!l;DM!JOZEzRb&xT)Qj>FEupAvR7u2Hper(-OvGeO4PkT_ zMn!f!Z0^av8IvS-SY%VGC;;XI7z7a?WUm@hU_eLE0a>6!i2-%PB3w3X$WIu%i*b`+ z7Q?MUB8X2%6BV_{Xc-Zv8qRK_!nh73d4-P~6mbzOkp~ceSU#Cl5=kY-YofvKbyADh z>K;sixhz7h5d_pm8nM~{)f|x_GXz(jo+JV!D2Gx5BcK#^Rg|K*qoQRj`=85UEuIV~ zW67Y#D?hoWdMG<9m}-;WM?e% z<0sqEY2Co2JwT_0va%gjM5G2=-xOmbquN%shaczJtA=gqV@;cbO^&Jjf$~z&W|(04 z^_Ir47swcl7+aP;I}S#_Rso%kDWb}aD7-KjV00R!Ix?zLvaHvfMVcyVqM|#%Y^e!F zcbYWlPLl@RX%e8jKzoM`3hz*g8+K9ZpfWW~@jzSmM)0{f<5+5%eNrdZLv$hxzlZ8N z@z_VScHU-gF@=`iY)kJ){r3*uIrzYy+jZ#ki%TuNxt8(msf3s%$P&;T_p zD_w@Ds;Fi40&uhZ0i1>RC`%pRA_LT5x;+F|QkrDd(L_oTSUI5~gAZ?oBnqK`P4~Q> zg6AbJ9#a&!ts0JtNW7|Y9I`_)+$F~;en7wOQfQ1a-TdTT;E6s2&q8&bSn>Mjh1>Ei zxzO34?d)Ib*tfLr^`*A+Iq!uf<^pEM;5sfK@y1;E7pN98%40CFu?Xb&eu9{xln~5U 
zF=8{6x&>yca7duWUj{L|4zp#|tzb82uU20>hTeh=MX*`i9z*iGq1-k5Q&jc+egsCF zR%>lm&3m$TR{oly1bZ4@IH191wrukK>L-Nr12>gT)u}aSX@Q;sKWxe-|K@*U0UoQ= zN#ZS1tGB4E4qJUE>k*ljj4;VQ-PZEeIDj7V}>AYe-LV8SAiGm9OBm@Q_HWkeS?2#yl6jGH9Z$Um)xtYeNnXSFKdiq2%{ zvAvaXt7Ta_nalv!*2(Ad7EI+^RX)pS#F|k7F1LhQEMYs~&ai~3G;q+6wf3jTlQ%=d zt6Z0HK<#^P4F@vh70{{N@YxKZbF4&Ru{xdRq z&CL)nDJnt3(b^FIvB;^5CI{G4myA#;CO{;`UfqBo#GV&5 z4aH?K4TX26_ZDI=jK)+pfryABj}b+hu+{Oi18gN!9p)qBq72b)PZYd~(I5cIEFuW? zVsaD`4I$kw_Q&_6XD+H}6N}6ZRIs~4&sj&|#OLFCL+q)_Xj zh=4){=r=yTD&=eHz>W&MBdrr%#xG3rMF_pu!a2FiT7l5lzUQ-bk*6_rc zgAyMf7Wl*I7oYU|fTs}=+(y$)=lCc{2IQ1*f+BQKN`S+u9)^a2Vgk^r$j`k_{BCTa z{)1z=rrvJ}Tl0>Uy4LwEf4q46jazTz>R^cS?I`-{=bW?7-`32ZoArKcgONwR`hss; z*0(L^>n=HA74{?3ho4O|OvMOVnF)bC7e;}IZUc2mw;AG?l2JBgSb>mAu%ak7-H76+ zFvu~2Hv$(n{VT|#Xroxy$`O>QC2@Wdln;_CEFXbf86TMhsZ~>=8e-2%BBYK$g~lO~ zk^-r`iiy}D{wj+EmY?8b5q8eup&UlN_qi|IcCCmtr?@Qv?3~yz$&}Y z&i0rl#&tJW;eUXs1deMC*np4BX$ek9C8byd7=dZW2=xXyEwVb2s3l}$CzS#GDA+~C z5crPo73y^O2WmkWHc-1Ewz*D}T!g2-;O@%0yB7B4+=2Vug*~TodrlSWHr<@MF*UC) z*KJ)$eK__0)O~e%=YdE5uG_&|!Ggax>+fCc&-r&3`~zA4K+b<)###Kn$(6=!g~o8U zF`RGQUEI_*e|7e?nGuZcU++?L?@D)%2~B@i`WAKxqht4lH=ljpBG_l1^r&n!Mb3=)s>KjT7;oDs1 z*NU&nFbXA#+Wac{*k%U^7Xe4~&Uy>J?OEUUh0A$gu(&fcL+5?l;gq!vHwSMF7Cda$ z!!9)EJ%OU9e&fit-sxj`4_mapbxrfze*MdZor~xH6uN)uVSnN1J}W+4%H{mAbA%-L7oiu6$jv=xd(eJ?jBJ^#rGned+0b)YLkC zqUiP&+-+HR+k9l%y`|XNS!fMpTLaT4*Dv+Y^Rsocu1Bry(jsm^23JQr0Yk{6@ z-EkT(Qc$uK+Ye<6Y8`{>4uLnoimo8l#})dQgNg!&w~22khGt5eNIJ08y?>Q}Z^_}I zFOv($K78fQ1TDZKSptORZDW-L$60-bCqU8g-gZly0W+WcF)u1$uzo;WldPX*XYIzo8?gNz>i# z{{H9O7oba7j-7lv-?vxd!MW$2_w#@L&;R6d*-d!DdA~Y6>@l1EmR^*{B75$AD#v7c z*%UMd&0$l-JZ?6VKPQ|M$sNy)}!6wI8riR5-A-oW$y*yvPk)OIrCe?6_LvE%1G6C zRit{nI#M%U6R91qW#4S!x=8(aJ@ebczDUD(L!@!MG14^N#KIim=19wUOQdzYHPSZT z#=@N8_K1Jn&-|`%N2GJSGtxEQ#oi0U-I1R09_Dw4dn231H%0o!`y&10{VdEA-W(Yi zABb!j-xAq6zBRIKd|PC2d@!RcV@tu)f%-teBt-toPWedGHg zL*qk{{p0&tcv1L3-a~46~W3gIpZG%hz(QLQ+NR6**-N?p_8(A%{AS5|U?Loz zcrG0B=Sr?qPn{SJ95_6D;ISj4hos!0*-Mgj?0jr`ZgwI(ipcrL=YsQyYL;v#CWMJd 
zC>9d@Ia2B5d@MW>jRvBV6X8%WFm-+)5Q#`0-RmdjCuUKB+7k3Wp|R0p68oLXk*dI&wbjw@W1>YUao1 zLi;Bs&xK}#Qp?2o^WjVSg3lw%SZFpnC-6yunhYA&Z!qxC`c>%~qmM}0RhU$l^1mD;*FKzQK<&A6T95xx$|?= zvsmz0WYLf?&GxULRC@0Dz$ER0@+lBuhlJz~U7QTh2Sb<+bYexNd?6H@7iK5(bhV~K znf~s*2DK_TDI;h$#m#XO&Qtk`=ip5~)J>*%?nQGHpw}4_Le&!D zW}z&`z(Gnh<5JRUzd^GCJeWBJfT}eW8n(^3##fY-}Rzo0tvyCg{1=d)CBVM3V2o`{1w4_n<-c05}kz(y?)h!Sk;!}G#VKhhhEobNq9 zeZD(3Az+8@zBs>a`{224q2BQHbG_#;;S8VceLftH^kQg2;XvqO2>a)p(93q9Xs^7f zu#=HTnL8irIe$qwh$0rMl*yq^pdO_aPvd*^5S*X9V7l)#*$OWgh}Mc#YgM}I@KWBY z)t_$JLBEaZ`kp)W12-G#rRIB%wl!zb=SMG(uIHG&`^-Pe&2<&7=bD`D_mGuoQU>nG z&mJ~kXoz1nT{2_5Br9tM>UhI+!z|@ahNDqR;}ed+m8^6|%>q&~s=4wljF_wkUohQu zR|-e*no%yzI#%v}c=WS$MxBrNAV1n1o>6C_^37yAlXDW)9K{Z6L7z{am8{QAh36(> zeOslxvlAHNQN-0Zpn@u?9u)x`j)5TCq!&6K9Ey$No~KMp;k;ma$5Z-p;nl(~Szfol zW>2^~6V}cL8%*`h?&u#pg+SprIoWkAqH-gE=_V)6Pdqmro{n8wXg020W-RKfdo*>a zZ00j&bRQOJ$=Rt4sIzA#IyY-vxe}@zhfM)?g+nwMq6@9Y)ys}| zkt*om74)beWUKgAf zOlgNZ<){}O^$AC3x@jQQv{!7}dpj?+@07UjRI=&x(!sZ#K3czByr1+fruD%3DIWv- zGIsPA%$LmQmFzW+Xu+Id?i+F+8}*uUE-dE8bAx&3wqaHxub?Gve%>T>$Kj(Ba4|2Y zT>4G%yjVVh*rVhul*jW7OV8IzU$g`Z;uhp;zLd9NZ8PHQ%Cy92=4O_ek)PkXaFQQ5 zzUe8PBRD>k@xZY`cQRk}EIU4)3;F2! 
ze}PK)Jvc9z)(cI}iiD#+U0lANZ>nfcmA8rIZ7YSha+2jcQ{_Wq`B1X_z>*UmofX%1 z+;OzTOHM*4&lXy57uNb$6$0{i)h+v9^09d~1KQcJCE?y13%n zQL(r+UEPB6JSFQkQ&nrK(l1u}SGrJA<<6y}l)mFjldJBImHAtCv3GyEyy339n5_#b zpO5aOejlN^%GSr1*!l=!ZD8J=#|~kzLNLd%9V#<n#urq&x zIm+(xS#vyRMuEYV&-e{uO(`j9g=M6+yl3isCB|aj2Kg7LF)XtK*a6HbzXMJ>-viriNxp~1CAz-pC$}^Q~IxjFNvWM?3`I3u{ z;wfP+5;zyS6cr+-4%M2=x((LBDi@d!!SV4T1Ez~L-M&Dq43;bip)wuIu^YvDH!t`_&LmH844A4!&dG~xLuwv1GDr&!&&a_m-PvU>N@ zvF{bvvk0Tfvay6`EZw{*)jS|J4qyo0h;==Q-hIisq03|IR#Q#m zt2?gmSPtLZpQP<4-Q4l|(btY9dJiU>hgp*5S7)xztW+fH`j^J;I?7-9r0D2aE2>Es z4SelmH;=EIa`&6t;CkEc6y(q z!gFxdGt&VdHNRvd|MRA3E4B=EJ5zF`ZA)3Uh+9u_jW-r07IHGbv>8{woIJWrb7_MZN=c=cQA5!pE*gx`SB0+y;F#i89tvkDB~2ZhpA<`ktGMoyC<9fV&U(!$}|p z&-mKun$57GJnN{R-4mnp&*6ln2ujQzyPDrAf(`g_?!!$|%NtmQ?k+IoO1>}D85d^FC+{mc~cV^NazlA}(LN5keDmza| zMj($u&G%uHkEg-mzX@Id>vcZG!?2)pGg%Mz>5l26k>YQEefL6Ap-GH*5xC%rF_Lz!VQNZ$(Sc~x-tS=z$=+`(-%&lY^H@$(n(Q`7R zKJnam-c*hPiU^LFm&XFiXM+^6A|-~J${C}pFMDu=t02=!T&HeWMg@YvfHJiW#eguU zLt1hkKQ^R$dme!B+$^qD1Qk2wH@2?)4ylChQauw&+8#azzluDCm&jp<9F;Hp3OVI) zr1Il*SaK|!o-iR*=ssi+7X)gRjMe4oK(pZl2`r66l=BifG{Xh8a1|>g&vFl)zZ~f2 z8_9wSC}`(mcAURLmQl+01=El7b8Ytfc9XO0O6ZmGRY!Z;U7T{ai0+ovog9@J9~v`Aqpw8#)I?Ge$<&x#emCeg%U#nX;(R*Ux&<~kMj!j|f`^?pr z(v{qG6Mk;y-J_rN9mSTCbZOPgAHDj~mjhP=>$&va)5m`A+PsOPR?*h>V+%srbYfji zb4h)ivhQ96`aY8&5zGN00#_rT<}E<16PiAMrch0(dS*PTSM>x7LbjkaWDnU0F%h)k z4gR15Ih3i9RL_)?5fhv!!77&sLc+KU zw2Zqi<@t+*E68l(D*A};UK!FAc?g7O? 
zhiQ-c=qy)~bE=<@QKJaTVW14as_Ez1ErWjm^fC`RLrA&l_##9&W_Pe)5O@=+Uhoq3 zgy~6^p5+#m!K`DbFI9DJHoAj-?d$Q4>8?#apwqfJRW+kAzTi9v0dx2djBbxYg^xs( zN^sDoKPxka_Zj2 znpc~TyHu)$oFQ6hAWFngoLlTN*N1F{Za9+t#PAb=Q^TVt$DWYvQy?>O;;f9?WU=+g zOiqwc`vWJK{w!6=5++R1Xan*G?y{zxXbzQYRW0e-hHq4*wvUS2NAKt6RNI&Ge$CA{ z1Ug5his{+&^T4}f`UWuQ0TAtD(+oB!18_^9)DRXa3T&C0LUPD)D7oZB%!x>pW9=vD zI8Yw)6J&L7KSt$c)2H%2Wr?A({E4e~nMG^7zz7AZ5w8|)fRoG_#4)puIA+B+8hiEt znzJi@?RQ4m^|T<9;EAbwo6(LoYmVnH+GE8^5)GQ0DN(|--?$a`F+1iC%|JY~tHo_^ zTHcUrw&=iZji95fCzv01D3Bb8fRA4LAIE5;WzWNF7OPg`X??D1xxwzKTLHE=3zXU| zI(6mLDS2wY56|-rdE{q&S_;J%cLu?sR*e;F*hAL1Q>g=>`CoLaT|c*0yB^wVTim6! znh|g9qmqjs5@2we08aOHz@_E5o)EV*C;Y=zA_GT2cW^xVc}lfU#(5PRa$D&0nay z7f?ln8|3^Oa^4{4O*m3f=weKm2*}ODHAaXEe?ng~n#}>=e@o$Cr&JbxY|{2XN0s*Q z+jQZi5F*wco@*wtEn|<)j7L-<2JzxHU1Guo!LiodmT2FTY~Gu&Hl!;$QWZU7MbFBo zlNEcHT?%nruO|GJpv;K~={;s!n zy})EE#T}-obdHlRcrrUqJY(ij+&HX`yI#jcdGl>0B?Tm>c_rRoM_pZY}obf;qM&(*74QSC(?DD z_bs_Kg-gfoTTS)Nub#Vp?#-63bl>Ps)(xiWc8PVnmX2NbE{&y~-juUhbT%)4{N~fE z&i(I{bUr-G(!!--w4tK*#;8u|m0 z<)ra|M0;W?oxr%J#eTf51mi(@uAQXkf%S$SHOf&qBtaU{kF_#a%v6*et|qJcBQM9^;&v_+c>1~DlY z+@?Z5(hC2M9Lhy9$Anw>_2=m}3`!XtJ%e0ni2oPz{T-=4#DSe!2{JvUahoJyq5vmUSh|dX^k%Pie~2 zD0&)~L#ft5v32m4S8UxWdUmGV`$YG?Rrmh2#+``tvh%9*S}0YwNvzv+vnAC(BKD8m z%gt#j6rJ@;k6kMRimswzx$O0d*DC&}s#{xD%dwNZt2O){@3@S^;KF_ipqR}NLCv5Ld?9re9^yN>d+|8o9IbBo?gz3w>uI@@# z)}<;t#mdffMQy6WFIM=|-qLl4$xoAPaT4yMx`#_tinK=t|8)GfS0rl%=OQ^u-;R(&_gv=OhbQ`7I4_#s z&9Pam>t%Tsd!l9Qx(UCx2JX?%y0gc!^RBz(^A|5)r1q^_@Qy>Jsdc@8d{$FK%ks(B zZ0k09vztmQ)*a+?n#wBIUF0h?mDjv{?&`UFZhCvxY_ps)-$WVodn7!oiR3Tll0Xoy@bZL39QCve0&x&f zA+ywL3P zQK|Y#@Y)1UpBxU192+`$_{iu8aG%G=@OolU7^Irf5DWE`lau2_pEwmj8r<#YLQ)|} z(S%h68ax0}y)aDifYg|p3(OI}UcrIWr;zVS$?`mL)+jVWY8P)IRZ%z$N2L+Iz`-4Y zn=$}7mC&eSREa1R6%p?zVDz>tjNaEuIs4nHwFC5*rM#Ey3cB@2IXO1_yH*oG=ayZZ z1N^`t`fXaHt90ibN9XFM{i~Y}u5}ztc8n|?Ty?hH#U0+$DSA3rdk>^~Pl&xI?s!h5 zD{5Y~Ubm(ydhb;9zT+uNd3+$3gQ8Ws^L}oww+OfF-UH@T&rz}G=#{)xPv<*MT>NW9 zCs<^fh!DS5Y~Gt{9u~n*(=>AXEI4z1l$!%4nMzaNuo(=k3?9m0{ibaOK#+-1gJ|=@ 
zg1~F`6I0pvj5ad1)XkUPJFU85Y5B9XL%NHvAqG_0Di_T$6_aapi6xdo!;7cEMyO%e z;^4ZDsW{!s*o=P-2zzBCPC;bC3h$KR^XCj!$(4{t>=xH2+@6IcoZ>o)^oI=gN5QD&xGE8&l_> zA>Wiao@<0iF}esSdE)5KW!KN2TKQ&-SyCW*Xg}1$8A2P{rp*!I@933wCpI~5$cPE* z8n$~$1x&;y7R|hbLg1B)O{y}03oV9|AUoPG)5z0Z5d&{$*h9LNCfu76)=hBNih9yc z&*u+c23F0JayE$0hI^LWV*647HXNHfWvvvg;PhQSqhJa4iLLuqtwU*d*-{}JRmm|s zAAzI|NJ+7%&SQZQ7`No5f^Ag-Mg&szW|`ajTACWXD!ktd+0$Krpgv)9-zh2UeqpmPZ6Cqw)%}2vkZ% zbiYQUam6f*zmJ}@#B(6$n6v1d1w$2B5a83`R9#;hFcW}$WYvwl3L9z_L@(0@iU&Px zQ+jXF^~fdUK>G4$E2D6OUbXAVI~&At#0!IPfYvGYD&-Z+(97@?gr*c*oJ0uB=Mb?h-3^ zT|RoZa&YPBJEeY=ZU)j>RoYXXE^EARvw(Ky)M#hL_A>jDeZ5Shoz<)?tXg-xQ_`+a zjvRW8lb70;Z2IKYE7PmiK|Sd$>DbN=YeKcA+^u)qpa5>U`H6&oyXa}XD|-{3*0uVr zL`)mMI-V}+NSBnaRn@=ReZBjBL0(xAGN|tYp^t-X9+Mh%xq7+qX6`q(d~^37?_MoE z!U=ixg-gVyPsBtb)+JiDCAJ?~bskk|Z9oR!as#>p7S>gta<+)hmbABiwPAOv;h@-X zFx7BWY&e?mJO)yus}MJ*_bv_qM%@;<`i6C?D_k1Qs1vV&u1Z^Oe(G)K$U0Q27!d#o zn4qoy9`Ptbv=x(wKiDwTJB<yePA=kDI z(i$APLC)u=q_pFxH#m5W5JS;|qYyz-%7%!RVM}Iem0D`QLED=S1xzzK>>!S1JgN_= z{{m*gq9xX#l=41u$R(n#QXfcjM3EB&B^L{{+=8lj7QRp5yOmIQ#%09wDZh9D{#h2_ z`{kTqL9pPL0Ti)t!(NipSXoWv`MhXD83bPD@vq9PAwux`EqNE}v1n&C0{M8c;3ZH1 zUz*R;ucN%I^`VlmXInE?h{0@qgcYLZma!64ejuSAEoww1v!2gnmTeH@5z401iai-+ zt6Vw4R+QbW#L<2~(~;3`gBV70TUifTy{YzEMjceJkkz48D_ikp)xjXf`>2C@DZm=X zK9T)=<};7J_Jy8Ur;^@yE*g!Cvc`;4==cH+>lf=%N;Fb4!thVNSf3KdxD13E?C{FU z!ahwf7<(XpA)gRH5f}K~3ooDGoCJ*bmoUIGM}d!V6sVlROwvlzJEG$f)&z(2yt+{m zXX=*mv_KYGbzCt7+^Ol+^56@7B38E1AvMk+1ZKbL%x0=a2BL11QiB`n&Sw#uqN>;tB7&ZZ9#zL zKT*Ca#OSYLEF2u%{6qT0U``K1T|=`|GzMYIx${v0=cY-JC<0DRlT-kuJ;dzAp-?Eh zBc3ke%GH6*MCz9E8U8pbSp-sEl2tlK_tBKs;rwVV2`zB^4juT+%4bCvGNuy1^}NF+ z?*j-B$;NvjFdam#NTy4&qH;oz0&KxgDAx;fAnHIl%7EdQgUUqGU+6c){5+KXTfF z|JAyMp}GOtIg77sNmO;c?dT@HlfxqYKLu#h3!F7XyjoM94$;$*^mN@cCp~@XlFC#` zhgi~)D%m8KY`WEefYS<|@?7VGGG=Kwbf2!e#*l^?u$$qqSaLLnD^A53j$E`lGdG~Fv*t|dCX;^D$ zU7k<1Z4=wJrP_9jZM$zxh;94+bHmV;Bk7vjE5mEe{*{J5IDF0e*Uj70H4UknF0rO- zTGzlRDRLpPkK6UZP6tJO9q<_9jlJco0in3 
zgW{%xiA^Wgnz~qV&7!9{9-XPFqrx3Lko-D3-(}9Y=mTdG^&SQzHr8>mY{lYb{L@(RdzfzqDpZ<3IlR0 zLRG@9mc~f!lLAX1g$h4tDK-ox1VPq=xJIxu-H2q!4A}47(GkB*vR>$uaUHfZeGu?J zk1ZI1{c2#p8aScGgzpU7Cnf8I8WTP_u$WFDq65a}pa)Psm>!_Q5Hlm&=%p&oiY#Xj z9zlk42E#H`;1xtaF&_#+T!&5=sg%c*%jB9RCj{-_k%}@gF~G_^uLx=^Ap^ajhO8)% zw3m=}xD3qyLa*UI%@hR~CfJQx8vR!&0q6~^EMp?gJMMPSAXBBCVreI1AlxdJZB3Q! z5zF=@%l5HP9b#z*`_w0v^`*+Ti)GuBWjixJb&6%3EW|IC`dQ>2v8)H1-~D1cF{9ZP zW;Ei81M4OFjPXF8nFf|0`kl@P&PfiaUm(q3OxvReO#)*kWMI%0_hIvR!r2aF$B1JG@ zfdakh@ccs4u&Sqp-rIcIT=qQ&zL7IvJpt1+?O>8iA=8l>(x}i#K(bR=X|zs+b_5Ac z53+3z9K)uD_hcy+nA>| zRk=y5+;qRv)nXT|4NJ}|XP3=jZO%B;N5qbiJC2cbac!buyI8y(mmstUnj))F__eO} zT(cK)<_Jp^ZwH88(v$Xh0am9yHEB-;n8>M$sBTt8e^g;|oi^VsE>9J=i^c5;reOf? zUl;p$dJamH2!Z>m8`{6pm5@KGp=@d!QnmeJZNHXlxs#gjba7Ju4L9@?{NVq=Gdc>G zYHliq`>ya~jNs@9Th91A>aXEkFaMfm;>+Y*BWE``C&|%x)(_%U%88tlszwMN!pNWC zI;4kYgCnvQX@SUVQoR}h`~-kRXrBPP!RcB2(7W*8Dd#Ve^9nhCP7d9b1p>8&2AYwp zvaxG`Q56TRMj@I zYTLR6Z&}$;2(odeY@MR5^PY{~t3G1395QFH$g>b61-E?x_gl$t<9@rTqkG-K{UBy> zK71E{7jjNO8N$siUmyJH;Mezlb?-WwY3aHhyT^XlgE=0H9k8ONcL48-nuB{5ysY=* zvYu{eNw;;T8=KP|J?YKc(lvPPNN?Mj?(R=l`EVIdZyKQ2E!*Gi{J0sNrWYlk0D^gC z27mTyB_Xi*)%d!Ny@GqCc-_Ig5GklycQJ3Fsk(mM&Ac9n99(O@dUV~(UW-iCb<26z zJJyTYYl*4Ww_eJ;WhOM{*wtgp17bQy^_6FnaZoytC_dPs)VO-bRvnXky-Gs$mQBwwkJauXbIB@(Ho3d%cZ? zwP$4HXJH*E=hgW2_)1W$?_XzKwB994G+eV>3%we?9$p##%CQ^A5_N-O#dguQ{m0#u zQaGy2&PjOGM*-RHjbqMcOFBT=LYX4%5afyxG~LtrArK$Hhhb;(3RY;IhO`2dJlL}| zNjq3*e#mJA64BnDrDM8~#V^RxF71|cQ8Z6O)3gGpLe`+1v`jn#C8d3vloYEjT{?P(bZ&8H&%7VqHb&uLts>e|CG*4R@EMaZR50)YajmN!0 zDpp{#(dS3b4;)~MbBvgQJHO}f>oh?hHG`!WM?I+I{-uL2jiuXm z5Myw2x~ZS5nDv^R9c)!kqUIS78|gd*DJr6!5n|$=J5S2X2Mo0YGi^T$hV>^@1nCg^ zbE-cLE$J$*eXwM8jy=mtj=|-R;q6rUXh>1lr8T&_o&iTCQv(daJO(VBLT18ga=t;% zC>(Y!%jobY@xIVyG&IySl@VX3TKc=xRH83usf&JdIMuaR?An`f?@L(sJ=g_E!Izn? 
z^M7yxfyRCQ6uxMRn05O?4^=EvQ}kb;O2Rm0MH_j>kkjz9A-BS#D2W!Hrntw+$?lJ5 z5V+93QGakT08LZ$9V+3~gGys`)psLD z2asWBnSg*Ap6pgc5WtQYC_gU0-irkmONq3C8V4%*@tBXxJmDarIeqb$FD z;UgRAZHAExSplj->3e^=o4J0|vltNJ7&+7A%#gE_oX0UhWKd%OoAfyf2$K^bXO^5f zIFbeOHBb?89)4}FmNx|2UNrW2{e*d#hGhwJCraG_#H;fcqG7q}+6*pd>1sc8Fg%Ke zWlj)D4jvVIkEY986jjSiacqXxg95aI&GmK6dfG9Vp5=cA3M%_3LB#(-^epGSkQ)S} z&TNOK$O#(H?Pvgd6jHnZE@#9l*N8H~0-aWZJOz|ts8G2#Hq;WWGVd(r$MaFXZLwgs zURQp;rt(Htem-J3Z6=K~JFaPQF4#a{wBw9P$zjo|E5Z7py2ofr*EXlFZE9_^+O~m` z*q~kmws9P--g)t#Z~q&YueRtz?0@5uT<@d*^>gb{`af3_Ey~i$U8sG5bfH_5E`(NY z57WwRi`#@gdHs^sf2K!dG3N}_e=*ltE|92`rh;6!NUsFyFLW5_ z7l%UNC|9Egb$mow>SDNZb|d(Khf+0Z7cedIEN&mM5Ew_~==&&UbCzu&Ejm*KEJr04 zKjD+)#L1zf4|;qvuPF;h`CUmbj_Ff0pfC-?Q*&WhGeCWzt~?YDGR?zA>dbfez^<}L z1!)KAr|6ZPuH6Bk%lONCmeu|U?m zeUv5Lyd<~W7VsUh+LL}Kv_dM3aCPx=aFh^u8Hi(!H+})Z3-v~ju1@*+Pw8B*#Bqpo z9e84{{2XYen)*joe}4D9)Sn}_w#}*5tzzre#I|F}*5inX? z`D-Uvz1!2RyAkkCnJ-^kyljQ@6vMoULmTm*0x4ek6!8lD&pe~zaNY?IL8cn-v z6K#y38-L=s-53G7OrE5E{+Sl_eUf)^+lq)402Cf{1hK>48RrmVs#$?^_(irJ= zpL;X5T08)??dh@|-x#>N0$5h$$r6XW~ z`XN}Nd}~$B>8jn|2;Sb4-19_Y_sK-XDU@Gbb{9xbOHL__(IC=4-#=!4|Ed139}b|p z(PpfpKilOvTxA^-lgmaFRE5d3;)B=k*^lN^}3E@qZkgeq( zgc4f?vNHUh7t|CSk~I&9mR9ir>7^>wMoB|p>@-86zSVDA_<~*8mVWWt^v{!0ZedoZ zOJAwWDMsWRmCPNIxo08gY5etcjNm%eizYGrm+(vQf$0%P*F~v-m=r)Rm24z5#8mSu z0`;GvXgLA|?&Z=X8%qcbma5q+*6h8Vd;4^<=CLcb zba5%^TY?7et#23ie-BH=VXElD{w6Dv=R!|as6vJ?vO1rM%>AG$dCW-_>OoTZ(Nx7l zkU*V-P&WVI?ZgkJ3mvpX*G+mpkUDu?i9o?ViaL2+iU;o(LKW%=VSY_&kCe(Av4LS# z(AsD9hvlx%!2=Qb&*Mi^(ayEPGnj*zU$}nw%8VmhuSrX|N|6|$yfh<{Li-cGN2SQv zTVi8idy-Zjj3mIsD7QBH0Ak=b5nnUhrbi4ALXHb;a~0{oL=*J45M5RZzitODhx1#l zj3i4pFWJ`|rLvOv@3|{g4e%RlhVNQAl`QU0IQt((Gd!|PmshV9w?kR^e5!Y^*t_?3 zX|fo{?;%xByjl1d=BPlG+<@GAow|oWJFXLqQAQ8a>+g_5(mI+Fcuq1URO+K@f7j7) z35mrr+tt47@LD$CZSPvQ;1|$FUBmLg_3`xrdQ+6W zuiNO2YZD`%ljx}~^0mrZ-RoqH?{%{JbhCxnX}QMtA#;|>_aUXg3YjUhf+XHQ~K z)JI;Yrh7#0%BBMn4KTzsPRZKIa-qiz8qHu;wEzrBd2CTG9AQV@13H?{J>SjDqcZsd zb`N39cd)cdrem`7o$K^>XMhn@J^H)X3@3|zMcO#9EoaD8$n=36johP{K#3v)fl+WM 
zKUHG}P5XrgWk-S1#tiEdDh=l~B#P2)?qw*_}uSv;zu*UQR)c9ky0LVh2!tG&bb zvA!*-g4tQ|NHh!g|5u{w}IcA0cCwt312Bl)Wrwrb#(0!j&1sN~S)F#BbVZ&eqE zXw{e<{F!}4L!+v3egjtweVordoa5F|8CP}uzN)cTShY*X|Ycw6(0FGLK@Cbt6` zavs`W!Cb~M3pI%u@ku_27zjW za`k2lN&TANF)HQF&dr7viZoSZ-LUmchQ6f&#gor%+5gm`i?mR!QCAa;8p=#R^cY!2Y2Fj}4C=6t>~3K!>`} zLC%ZhFudL{`Tm-m*WqLkROy(~EF&>6SPz4%r{PUHtNuG8YnCPMme(%jL4`2sHERTOTgxAT|wRc}YSzKy+*UMaum@~2z_qHEyhGpnux zYt_KPZ6jNx2Va8e!G=}R8ufUhZU60o@9g^4uG>9{mdBF_C~jI)Qm;I9$F((G-ON+8 zuNHSCoTn2#Be!GUiGM48dp6N^I*EYd4$|@fJ+are-f=X*h9I$2ZU_E_I7Y#C0TDJ* zY*YaQ-q*G+H^1KXTGw*j%BK^Rd*7A`|NMfuLLGR6oPR?OEfB7-NPi6^KdqsAzlRRv zb}lf75s>{DaZ14SDD8*r(R2N4=ylC%1GAK&Ufz^c&>o=LDw8c`s5ug^x~3))=zX)M zCK76Pau)!)%jSjk^-%XMCVik4r233n0`*G-I~wb-}G+Ku2*~%BYqL~JiOkqdL^DV zZn~QJl&{*aW?=)hQf+}OI&tYXs+BXt8pbcWR;~?d<h(f@_r|1D?b|}Gj?3=qoT2nvzOCk1w61}k8$id}pdI~&{n|Cr*RBj?|d!}hU1A>T=IzCq4r za{d$!m?=QT8q~B03E!kox5*)bjo+=4AlPU}d?pHR{Hz zf~b^#bsB&&`Ia({rA#$I-=)LpkB}?KI^38BlM3Z#G4|eZ@0Bk8_U{#;6^J}NqNnF( zgXr13R#r_q(GAoLF%um+HACD)$9c?quq|0T_-)H~T;Fo7dPm8&hb6b% zzT^M`MYZiwd(*LM-GzizeGgAqWnUtWbtPe`MF4TM2^)>?spxz4pC6C_<3{Oyb562u* z9|dYPGA^MJuxG4l0@d>drVl+ZjzQ-)dvGdd>ibbX5E9hj>@hNM@FCPFS68FltQx62 zle##DhE0rXOzH}=a#W!iEl1oD4D(z-PyOsYomH+nCMZk26{sCk8Kur=ssDGd$XmmO5%*$V#Ip!B z6KIFQ$u-lg)KB{bON()vjp7!7^@ubFL8H%1o0f;-)Ab43pZzGaJZ@D8Va(XO1yiu!o|8c`B`ziO07WN=vDm2lp2ya< z;MHAuV?qqHJudwSBZkql^HWn0YzDPYn7$a8kczWHqEms1AH0A%_$&Ac{$2Q`9LQBO z2;w&0xM~(U*rz!LWau~eu zhx9#<v~9l1pD^KL<3ja{M~w=nkEqhE(=^C;)ZlXt-1g zOWF$yVMwa$A*!eyku~^4r*UPQfX7Av}w6}_nn z()aPTEnnaL)!oU8p(PiYx*AqOjZsr6|U8{uhiY1U#&k*DQey+14{wyZINo`WlPF8AmYDs;Px<7b^s`_yyU>; zx1l5LD!fvf@-~Y2?`ljrTSaHO6u z-MsuHEG1PBEe-$Lv3Huc-keWuJ0fm7l59Sja6+R@N1}6I;>79Ih9}kdr(b{iwWm`p zTg8^Gw?nJd$0#Mo-v1@0l8}ItVK<2;7=bbOeh7CXwt5+bHm2hRC9!i%rR)cxDnOup zL#TSVW`t7dSr6hG05%9M%!-It*j>tbL(cgm)XW*FB!L>f9HZkr$Q0%wUxQX`9RExX zsOl=xfC=9l9T9Z_!mjKkH$@15=5xK^;Fxjs@>gpPH=7 z;5DN@24nmIbF6!?#P`V2_h1TL9SNW@7+I);*l30t)ByYo`9O0SBL$isEFWCV*jecj znvGM%TKIQDbSZIzrYdOcj;Y{J+zLJ-+qC)D!Bab1X`D}xfqYS}GQ+Rr8MppADM-xy 
zj0Gwsau*Q1Xa}HXpXpFyXuomGuY=RWUogr}XHXOG;G51E89PaB7v_X>(euQPlh3rJ z&&t2&*iOO@aNz=-gzPY#htFTe{Zy%}?1YynCcBld(AQVUVdp6A1Hy~+m7V8w(sH!G zFOlyy`$`T&1#BfB0XxFK!q*JNXX+j&Ik`$5ZU|U?V@J1Sxk#dElJjDKk)pWDpMppv zFd>|YN=_Nq&E(LeT;yY{_$-MUWbu1uUgAX6ufC|P${ zv{o-!mf|=jAtL_C%b$EDL^d<;xcv#MKjHSLAq^eB9OsAamrlR_vDZF!M^VzBc2_Im zsg^;pW$=!BFku}`xCfDHgX1}2-Gf7!az$AjOYx=Q*N?w;{EoXXVeLz}`@sF+DgwCh z`D2%lrJWTFaHtcVbrkg2<;M`T?kXTl01#2(Cp*+8i;l+S-K!3$N2q^w*Y#a*=B_|L z*N#N{u0-|jrDJPF9V>?tMMrNvEfyWUd;m%a7$es9JH=fIXIG-QYptklr94s8w{!qF z`h=r8T~vB4?`p@=fwa4PDN2&_*J={ro|UQYTolmQ5}rnNXOsAMX@=5Wc0)^eTmb;h zBQ864lzd0XA%Zb;ejlfD^lbzYf!+!NeMyC7$ed^yTsM*D)~S2!cYWZn*|Il7ew_l) zp=1k>c{O8r?3MGcFfTWU$Gn<7Job7jXS3xvGl)mOWDt*j*99}Ph}UOn%P0}=C{PiY zr?)JNcod{r#KRj0OsKJwXrlTM1&N+}&!hE5{2^pn4VLZljbH+!Q-k;}r>v^2J6q5a zvKyiKfx5?l$$TeP@SQk=whdywuhNl+!3htmcy`Eb!**N9%_YzwUH*RZb%wmKjOPs& z;?IrVXy)CfcVQuq*{TbAUo?%EyszA;&$w)Q>HA9uyK=lA^Mb`_U-?EB@_2Z#{e9(L zf_BoaE9-$BJmc^(^i6rl9jw4#CH$;r!79~P4PVU$X?fh*P^FxzE>r~zc-63gXIvJK z-++=D5wi)u%@2*)^3boX5B&<=2uA%7+Mq4Qu`}%CwZTqaE#tGQ16<%2X0a?6ID-c% ziaj2Rot-1PvH%W`0pH{E6SFbJCYPPAs1Gx2#|a!PjgM<1U`aTg;QJU$PT(AZZ#3h2 zqT*q%p*F;e#SGAyBIdsx2<H0X2`93f#^#$e(pT#DQUiS2V&NLNqK-6AdHI2hgxAs@b`UAdIC$MIu)o zLHY+}0+f(OfeTHFML0H(GEf)_Pi>FCNZR`lMN@hznQRR1!NO>^XzX}=nk}IM!eMg0 z07vr5EE!4#LqSsT$JjOG+N-29$pZs;a~A>{mPoRbZb8n=!SwP zOtqLmw*+R2i?cucEnfWv9AOltPuk?CI!pPyY*x=G0S_Gc^8yD84kBQo&bZqGC~`U$ z2)u|XA0>j@PhK$n$O%jc6gAljIrCsKFer+hIH=D7^Mw6-UjIt-S9)&rq&kMgj-jM? 
zKdeBvb*I|)ifwyIQ$%yx*P8Ne6Mft0trm)NQdON|RVNa=)2$uMvRr!PF#jb^4Bs7x zPiI>WS|&EO3b$@DA2RpA_4fRTDWI^JoDG~BgWgadTEzebO!uBe=*D*8qJGR{`Rc8v zg-57ExVjT)Uv!vi8Hu2M;}>3HJ>MYDkH2T0>WbHLr3}wEAK# z7sbcy%NgtjM*E9`ZKr(^_T^M60<$l-MQ>kD_^-&{UnqQ$*3c{D{1*i1bWZ;by%L6i zPu^kj{T(^#;^Is1?onMD_I_?JHxPEm z*#L8J585vTtbdnT5PKlPV`A@PcVHMK-{gDT{BtuUrygB@fl6Z6rwq*2LULJ_6-sh3P{k=5j~_R;1e-QS6XX9x zluSEvOi@~vViNT%$hIB#Rhk4(BM)4s4Q1yc5tBPEEYX-Ulsj~BUZdYZ+YUyO7EiXk2+#SKhubnY)}VpyhN)5 z?vX{K0!hhmHb$oU_dfbB%jlt|OqknK`6r5UhMYfw16BE|NkoM!QSxTlI}G$|y7m3# z+1!fk1bmY+JV}m5j5tTHo5>+YMb4qfbV&T)^h$uZWZ8dg?7(BppdmATDBPw`|3VIf ziD;HHc-mj8aSaFtXD;Vp{5GPaMS31fu9W%9ln?8nACd1L$@vL6tT%q2I`I$SNG`gk z%6jdiQ&NTEg%yHOK&R+l>yKzDvLUeMfk(#}*=!8HXEH_p2OQ&3Gj>()bT<#Zm{erdXje9WHZg zinuOhnq#cK>*>AOCwc~qb74jmb-8e(itkkSs(gzu$f&DNELFcrtlyNX-znn1XJ@*s z-JotY_Jp+prY1Fx$jyl@!>iVjcd8T~OtKQ`VU-(5Z9XV&KA74(CgQ(m%s9(x`;t2& z$NrmRtJZ^etyL*&19U~FUDd0#+f%iB#oE28+F`MFIN=-tO9n$Nz+j@c?B(jK)z?0@ z(!e=KR=q=<>$47ZtpX81tLv7;wj-<7qwmygR#Sb=@^#l&!6~|jr>RAn%8WDviLHlL zt%o0|&OWhrU#fOQtQ~5a`8dR!VMxnt3`$JybrrYtdz{Q zxX+jtJ^1tDuLys|8UzYOR@jZ{IcU0)^UN47=Ow^FlXRH@{sz`6m}1}esIvhUSt$iK z1e#rV7E9xvnM&^0yB|UsrAG1%ai@{p38h|Ck(qjZJ-Gi3IY^0THa(an2xU>I z0vD!ZXL(BDGjwmbkCA3FWH7qvW>Y(J2^d|7D8vCLF0!qlalmUwX*KmGD1P zq%V_0ELu7-uV0~805veWK@MZ7&IhLy$rgVWAyR%Y^gL|xNOq`(j!mDLpPPqqFPNpA zJco-KAw(tDnNTFsC!;~5{8h1hAC+qPSH{y74TPs>LlXjVdj&{?f&5vuvzGmt<2{{J zGkysqpVpSkR6%b>@r;qIe1kH1T(xdmT!RXb**0CXSe0ca_@GpoQ2@0@{lt=G6rp`% z$<(qkl9{I^ch*FZij)9w+sow3DB)QixCrt9b{&-bP!n-}0%pxHMzQHhWmp10bq6?? 
zKWhpo!3%qhFO83!!43ec(QmT*BuvWKcKtXf*Jj7H(RRX|uBycy2KF#JQZ+qdP0yRq zQ?y^~-Jh&Ekg9o1tO4&>`898%wnwb&x#_s|32_r~iSxS#c%=53Q;i43#)E6bs$MTT z>+!M4{`Yq8<+jX(g1Bi2-yC)r!?gLJ`sbkqRwr7w!3JpSuJ=(EOVcm5?#?KtwLM{N zOgDoKts&jgruuOYx$?=Bw`pkzD*Wn4h?d3aU};aoGDP`c>XK|^zS;7X&KsRc&(^f3 z{$4>|ec{p=y3N&=a(0VOnBnAcWu9~r{Y!cqm-nWcw&0$G%U5dasJL}B*);Yp?EgG& z{(Eor)vDjD0X@Z4_%2x1kLZhSTQS4bDwit6&Cc;vN7Azk#io6YkSA;`OgQV%Yp&Lm zvrBY#eb3XCF6mqu5=(l%Us`$f)GIBoc3$sHmbS8cJqaZwdWKUy$Hbmv$%^CeT5?Ov zehd>bzN_QE`Sd+UzPIRpt`(IiG1c$WSK=4f_5=E2f5C14M{4^z=u}0`_rZPJ+DROm ziN42@EyplH-r+oxiHg3Yv;X@p@8wg!)siY|5sO-quGa6nN-lro%Ey))FhRw}ZMUl5 z$`Q98O}ZXiFEH5}A&2m@WY(GJ-FF*{ZY1eC#PXqm#`LbQLDMhi1TCCd>!ES-c^sv7(bD(OZlc~w znXKS@mc5v#>xaCoHmcYS<6f3=khUJ|)Fs{_KHAuOV&;s$Ps4Bg%upt@zPuzL>8Y71+EaoX*eQ z4_TwC-YB&)p!+AE$=&|UHO?x75YFt8$J~H+^8cAJhcbPPxj_vxr9ly=M`@Mz`=O4E zx_UB3rr<*z85M72yr!?1g{O_zacn?o^~_eqr~MKa6w>ZN+U;7X;zQplUfnN3cj#Va zFG6K3j{X}$cj@1`-#CYvy${CJG~Xc2gP%GZaco&})ICy{k0UFNx?}3%m}mNx z+_hiyE#^V?^P#Sh7A1E`&)4Fw4u60yp3PxgeaDoq+OJR&18$Z-dL55Q0~+SD75FQ~ zUnL~AAJ^g;%|T^c7M;3&aAx(D1{dgaGnl(-ZZ2JJuB_Zt9$#G?!&PGh9cATJtF#VO zS^MY@{sk5`1#n!OWJz$UI2Q*$0!%{&eX^fRgEPg=KEg!kiI@F7zM;vWNd`$BrHM9UmJ8k2)iM zl881Zelij(RhT6oF5pDtzd-oH=8f+E zbFjgz@U=w)6I0x-`$P~U^hb2(_hUbVxH`$gwTb0zsq!wdybB_4Ln}stcCnp z6-2mX{^Q;QFiM-K+jR3ePzE;@;;y-LEK$`3Ug(;Z<)?3#u2vr+s&WOBq2B!#M5s^F zdmV_p5Qh8Ir*<(inS20H# zkXE-Y#g1NGQ>u20Si9v9CvJIf2F2Q)sNPyrSE76GTl1?;r>QU>5EhR>uXGYQ)LWMF zw1}P-exXl#I=Q}{H|M{ycw;f?*`4wn6g>xTpH2-wDGooG^nB!9sQ#%zLI@PrZ5Hb` zr|Pzeb=z(YCF-^%>-MDThQ+$!6l4JF4*zeZe_8t!ff0kMrnIXjQQs%J`clp< zqLakfBVUaqod+S12t7U^iCSKAiA476MVBw-q+Wu2;4`m1lXPyry#?LU#)S2o zfRs=x-^{QP?V_hWN%(o&%!i!0qAG!PDZw(@D>hRQYODo;tBl ztnFJGWi_k57E0A^5%J%(CFR^MI=6qL<(u7q+?{kD&Z=H(FIBHjbk(PvO`@}j3mW1k ze7gnJYh~4Ir28rGJq<}C2I8dW1eg}Q(2KN3^z2~@l6=H4AR8$rg-NZVr!(f|H|V~vXuQ{0+H{zt=__S9Tu2|_H?8@&}c;Nedv ziMpe!&d1*A9mJPg)QC-mB=g^e`L}8NgJBfUf2$PndRe?q#CtHzZu0rlm|EUW(c8Ha z`pVpmInj%`G$eY4zTNVj?r(LY8`5=67(7?vh@TEc&xRcKiY22)3C528nH%O2X9$Y zgU7|eS^M8m$F1s-DM)C_kEvkrD2I@$-{L+B%80T8!H 
zNvHiHRK1~TphpU?q1o+3K+ zc*5_J?>~{_gOj1mvjG8F#_m*JZX!QSfgLFrAh95J5(c>a$DOEc^f?sA7?ClN#y&U3 z^rv-*?vB--{i&WYv1ja#dkif7@4)O#sjU2-c{;>WNK#k7Ja=_Y`$?7ciDi9wPnB&F z%QoG26_7oX0-K%94uOe@H&P3l$Z000g`8G$+Q?zNRethykkd&{7dhSJ^pMj_j)CF% zN%caDCJ{8kCZ7q@5R=aE zK2?3GbQ@dU&B|7sCTSj8wd-)mYM=W1W^cXVZfqxQZ8ZDM>^I-c&dtnz^V@H}Sqi{i z=bZ(u73c$=;3kfcUgqYI55*`ZcygcP_AK+S-xDpUVh%qvo#G-C!;>On`1xOoNt(`L+|Q2WSI6!#{K6!VJIZ<{fLT71@^PJGv( z%#O&r!OXep78uy^Z?5dme)^XQrGk;ljP zZS~vBfsN-ZSI5uAUdbRJTGz# z1utO+yAAvJ%;fB35BBXJK&$~rUg4X*v3jTTh~U-cXnRm~iXaFGfiJr0#RajCRGOU# zRPT$^Q&agl{#hTJ&bU2=_-lS=GoVnq189Oj2z-11qo96*SjTiO_@Zg!mKLHslrKB$hY^4d#FUKO~hnIq&8da z^Pioc%FirhXLt`5uaN*+ai|V5VBzgU&B?X$ztBxuLaCK{hgWO%=kuqIptC(`?Z4wWmdK-9n{^OVijRcK1xYU#I>p$d(SlkMhoK<-y0Z7WD7{*fwSu8zLs zx$>uQ!sy&d@iV&Vp?-GNrF zYW>y;iuz_`YZ4z~n@qq|4{e6g`&F(qUPa!8__X&XJ(M|M>M6<_iJo?$1uYf3|34k> zyb9(sLCXm}%iV$#f{c^{+)G=Ogk88LWMQJT}i!et+HpYfE5l-!LbeXsMKPB}d&r3aN{`t981oSf{JlKo033;aRxw<*D-5>LtTgHrq; zYDpf?u8hb%k4X4SJi>4FOL3gVn^s~8CDDf(YQqBa?i1Eo9Q|Rxz+CQIBSPF;e|A_q zATVF!*|Ep1t~TY?rL7iw^#aNXO2ZAf=8Nf75dkT&o zgr9gbN+hayk=`Pq+Ft+pJ*#`DvoztUo8@qq6z)>|VJcKFIx|{qbWpL5I_E}=)jcBs zLb^-pN8Y&N@p1=y^mTz;@~y0{+$|L%Gov>>-t(!|)P?5NG@AQ-=W3_yNlEY^CrI!! 
z_k`rhLeeG<*%OyMaoN)*dD_>zq0eh2NRAM=cL>8>jh))>69l>`#fJ z@53Uk`$TjDCLW~V3yK~h>K{a1q~K>1{RvS&rr`G!@J~(j9Y!RZXR(=Bt`SA}jeaoW zD5Z^2Kogkt@riSEn}X#EOkux(zRAyytAJ}dhu`MB`B1xN!xdxA8PckV;_e0ENAHc5 z*H<(39mnJk+FB=`<_}n>4#l*RXU( ztJP@-Ll=M*}5|a`|wX?TB!t_k&&U@A_cR`+GFT6Mnb2t`pi=h1*sG zQ1bl5>csinYECOh{H8s0VNtTTtlJRLx>!7=G$)l*hZ0XH>8#R?@s4O(=^IcoUGT1> zgqoG@JrwQjztu5chBk%EMB~YAlz^u&yzKF4HS>qbjihGhQ98|8tK}qI`vtT*PU3jW zsOI9Nn>F|^Bv!|?dLH$#VC1cei)qcvqdpdnX$_owh~Z(zSI6J#!Q`to@-#n-r`Psh zj%rOj8eok9Ey&3bizT%%CnI>ww?mghS1P3D9xck#Vl366HFGkK3TQ2yOi&GzoNQ&m z*rjb3x2^f4Kt@aPw6-mR+Id02xU@DUMY}ZKMA|l1<9Z|Xdgy}fLjKas#hJB{ z_r~8DFGO}p{(i~ce>+1t^)W}Cw;kTWkK?d^L6>Y9&-}tcOH&ZYBav zVIsk)dPh-P8`JPExY&cg|DRqb4#%Fmr%(2vXNejxn{YVY^3 zs>z+iDa!d796C>GFqR{EC|>iTTB+-G;=fRPesdGEJ{wfh$fWNOB7J~RYk8?y*S&yx z(HdRtc%XXdki0(g@R@@2Ytco4o22wUkd%^|Jt@M(p74H=^j$UmKzaF->PoYhiuF-e z%cVj|*VWDVMS|T$!k6`PeN*=+th9{aD`X)|~B*d&z5QX>%_AO@&<_tK%a zWIuVjwAX@QICba&>sWGhwn>$z#Spb?h^oL%9wH=g@EZ#R4luKzdwGhH%w`JW6p-qx z8%ZRIYNa4WK^uavTcYkWEyqja3rT{+_2c3?h}uRn`@h~(@CMdJiWw4;rGQXzZX6-1 zMD3uU3xOK8l5URSZer{qMmJ~FAqBpo7x|K`fhY4P$eO>1NQkz60v;jOYRsyX(%M#U z>we1cCIurDtWYlBBTwgs_YNN#$vyGZ-bat-_KqKX?1=a<-9Jac4=LcP!75R+H0Jp$ z`7lKZkkz+kRDX-lC;34!idQta*g_XVty~a->~CZm{4D*7iB`f6Rvd z#D+d*`){z8+hvtP*#@J)$%G@q?Xn7?>=r|CyT+h4YM5{ccgn#8iH~hzss$&z6ZmaH z**ZgT+hJ132V;I=Lb$=&P5IbSP*H!QV(=5Y@7%;yd*roBy#CI;P;f?GtA%QJ?xh0r z6)Pvo1Yg0QE;1sDrz@TjgqlJmTVzBOUoQKOO>h>1nIa>iI8-}b4t|(_Fd~W*Rh2@j h5ZzwH1Ez|er^*qFAx1>;RON~tnM2tPMg(uue*=Tx{TBcL literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/__pycache__/moe_wna16.cpython-312.pyc b/model_executor/layers/quantization/__pycache__/moe_wna16.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8308243337739aad0599beca19b0ad7cbef4fe97 GIT binary patch literal 21307 zcmcJ13v?UTmDmh^9Q=s?j{r&VPmmNp5+zX|OQQ9&C5y5w$#&=@FvJ;>pa7B{fcntj zMki5fs_SjUq*+mEwuaK26DsyL%%1ct+jH7F+im0RZnqdhDaKRfth#YdR{cVjTxYXA zZSQ@90YC`U+Sz0zzPayz-<$XDz3<-p@TX?8fr2Nd`R!=rG)4UsBg&&K01y8+05>R> 
zV(Azar$=cTVO2~OSC6XWno$jjt7F!>wu8?}-2nwUNA7KIJNK-26D8-tuQfw{feOoP$Wi-fIxF*)hHFHfqNE+9$ zwyT=a7E;y@WgVQgShn@DI^?{M6+^Vd9E`+bk#jLFq>|Lb)0ZXYBI~lj?b`MEIb?Gp#lJWWcs{hBKfo734Ug7Hl&suN9Gb7dt&DBskz8>^5D$$ zcyvPYN2im6;fsTjfgR8Y7k&-%!egb(q433_YMCL8#3&6)wF*>d)vm-(aLLIT_UTB1 zlLCb_?jwW z2u_5Ynt$%B!|@0oi%y3VG6sooEOMFSA={dOS-nV53`del zK7natb3D(%yv)WT$?+LJJ|=%zIkXlc50f-Sy#b4sqef|#8db6MsG3!A8dl9|Sq=PY zIUTEmzZ$NF)8JBInfV$B^&G=#IQ_Vaq#0Od+`34O)2#8TY1F_OSp#PpSFxt6nzsvG zjhabn4OctWP)I6!kwvD0Wmqb2CFQlyN^M!Ht@4CUk}rG{96>QGcl12K(~$I>Zksu)vJDQX^SLpg=+QYkIObV_~-YK0La z@}z1YR>YOXzV%8Ts|G3z6n6nBu$tAzOmXvqp1`^oP~8OsU`tKaD&de>8;G)f-z;hqGI*amXvX-qm-{e z#d4+;GZj)&OYdU7Ib}+j&%;de_GGt`&l*zZe?+mylnFx9?`f5Be34G+r?x1iO7BfZ zYAkEfT-Ju}3NwLo;!#?-Or>-i7(~^GprBm__frtenCkPcRCRvX@ ze`Z6Pq?wpaz9y+77hk*o|KOQFcV?2?uxEnuCJ7?*xHt`*BG?^-(!ppV2q+hoOUHILz0<1n_O>;_k~z2-Yc_M?h-dP2jzPSzex0MARo{UpBqKZwZp)>)ox6t4@8tSo z(Q|#Xmq9L>?!(#c+xTJo!RjOi|7u(%P0n$(r0pQ+XeU^+>Qn6#ip(JZV+2` z;5%5V*oYN>#ytBI=Ea{dFW)m-a>f?H*pj}mZrma^^yV6#5gML(&!25LyLjk#PKst* z&fF%L+g4ob=H9!FeYwWNLgV3V(ru`XHgJ=n6TDGrSw%>IJ^D33A`(F1`nXY|#ir(KXcJIw}J+*c!U!!&% zpdYHW9($hB+U$7~)d=Vu5gLxX`+BzFG@|o?J&>tCn6)3um=4`DHf4+iyzhUtEMv^bs(a~x8Q+()$d!x&T`$PYo{d{Y%%HqwsIL9yZ-+aB)~ zKN#s1*%KV|=m{F+IMv9!v#{@gNRcy0?^I%Dx+*G?6Q@8f6&{bqIQ-~|`IaiURThdM zc18$b1z6iNmW~Y5L2#6Ga?!E!FO4D&!Sx(qeFKFU#J=K5(Kv>;3-uO2G}@$;s;m@< zYnzl(mz4rBZVLOac%Vnk1kGyS(J6<=1#L=I6s9(7nJKkwq84-$)I!bl5Y*EX!Krgo zj0~lE3m|4}f{_t0qKf#`OdFF$!ICwVgclXeB8aYP5K(m?aMq;gw+lihtOE0hWFo5| zT#&ytD0ma*E!7kx?IUn1t6{nE$XqPRqhuzj6PM$B7ea=4j-Lt7&H!8E3EM$g3$~yc z*ULCR6Az!~E+-`GhG-Z>fL&OQ;g7?OQa6fyP{*UkLQCy4ON${`a^{pg$WplVN1SUtSDGwa)vaqU?=BDQr1re@I`Odno1hpIFUB|39$yM(q~qSc-84P?Aq zR}Vc@tJ-U!-Xn%`)EBAwf3es_Q=@3|h$gpa0=Qm;rcL&T7RGAGQ;gmq@Bi{Oj4;_# zX8_oIPbn*AL7h_bE?A{BS*6zM>rWBKPg9pE{vb3AdI1l}Epdf=mkUMDV7`WYY{@h^ zt|&gnCy3}*H~~t=u?UDnGt+o>OK6}9esK9?2WB9VkFo`wNkJn~ShMrp6<_~Jg|9%n z30$OCs60b?>T>P@!9B3rlXdT3G`??iFYjLQulol@hx^9frM+Lh;xSpQw5XO|gWgL=2{MfL+B?>`R-{!l_IP 
zr!wtS@qAe-Tr3C56t*U&jq4U_;A~klg<@dEQ!LBC;vr|lOEeFs(v-#o$^#y>h$%Iz zzd$YMA>V+D3sON}2&u+GObanxiaGyTidrzF^q1&_3Q`O}s7xa~$`TUAzlbk}1_FF`=N&@4#Ru23s*`#Q?c( z$S6}L786Ln%=BD5Olac-rtQVxDGW%ReE>-oawI58>ryRB(xo`zf(j8+mNp-Ts`Kp? zD|nNQbPdWT{tNByNY>H4 z$duy8vW~t*<^z9s&Oa>phqL~J8OD3h-kh;_U3)sye<0U?ROmmN?SB>oRDIn?45dFt z->YxW)$bDOcV+AMWSF{pb?qzdLS6rQ-OFp<)z8A~-O0=gFK3RtylA;+@-811OidqB zD)W$N^)3%DnLktme9zjHY1=7Scjjv-y$57gYva15Ki!*Q`sKZ)B}V~CH_LP5(M!Ao z204a1N_jK@kRA3{!1h5%;JbZUb zH-$&wiU&#uE0Q3p?CmLB=oJs>j*7J9H0YInBdt~+p{!B5w8;pElWhvBsNb4uS7N2N zGV3rKn%@ZC>fc1GNu)I7VHxGie39O;u8{PQr#M1)vZ#v!p^UDkepSjMV-4v?%19ed zB#2@L)EgZn0z2OUNrdFdWR3`5UdavQgKVd8;(T;=mMiE=WdRK}=#n9th)ySxk?Apx zM@n8s{2AyQzuw8=X$k7$Ze9^{P>l+??z zB|$QvNFnpwPz}Ea%gtagivgj&h^^c|cAdbjT_CeWdf+cW!hB-|iA9OD9G7ol)l)!L zK$oDSOwH-`mDXI>VWI2r9pjO^j;^euCu8clYxXQ_zd8S7bJx#ICq7cC%(WlsDD(3) z0YRX!I&+o|!P2p^W8E?!Iy^V_e|5iz)OM%f-zj^wuw5AocQ;%H;;~Z+GWxA;A_} zIlFG#TIlLd$J3(Iw>Ma(Ufjo(dIgbgwCO?V^_wsOA($zepC?R zu`wK$3}G~)fyy3WQy4!(tdLU^4zn|3kgQH5c}b1hGR?V}nV6)BMH2wCQDFJV3y5Y2 zL`JkZ$VV66hRM%ha2x_jOAhn=2qqv;=U>2pti~~fknu{Y>1j!Kg5bjA>4~fr$y`u( zjd5T;2;Zjp{g6de2M6R8<;9{Hpoo!JgjMs4tJM3N19?rab{H6DFt0@bX5a10*AS4Q zT;9B%fCkFu${Pu2q5{FZnSixaQ(OAYeE`D=oJ^gW0fbmBd1}S8GT+(_<^!(v!Cu@)Sft$Mqq;3q=9S50Xkct@EBf zu$&b1EgxzjQYZjW-bxC%wF9ezYwESrYs~leeMmvHPz0cSf2Ys-|?{)Qi(J{K8;Ku@KCxz!%!3?&>V&q#F8jM?11!DCPg1Antc?~ zPBbPPR97gCAhCRi@6tDDntIcq0b2h1>Wegg42lv3 z3mlPEz5SB0unlna6MPfcYA|?(EHg>JcY>SdF3s}$=0nwKdhg!Y3>bG4`+AE-{~TIK zpmO~ee}#(COzlZwXk^8|`1B}KiX|56=UQw^g$1Xkr zFbVzxwuy=^Xz;GFHcQ%sFl#)S(VUczoRC|56tm*PFG6$G&9tnB)v~&A9hvkRFurP8 zhSOEESxs4s?G^zrV4{k{&@5L!>C|o&O33guNkQgmKBFr^4_ei(*B?iwH#%$o`NVFlJ(QS$3d3CD6I3 z`7z^@sW6IZk`WKiK*La}BN->7EX#o>2wLZME^c3sSZ$eGL zyZ8-LdWhi)AW!L}I%QDOO7AILL=_KcQFPUGH7Ntq-*1+!9afjpT~KfXZ4Yvg1!J;E zd#4-~O+yscn^8HhESSJ95B2s!da_83r%=sO@l0XU6;BeG3wf%YqCsDzN@?bCwX0J# zIIm?rRH3Ku)Vy8zTCCpa#H4$t}Ol6J|?1)|J6rR zyUAz_soFBEOQWg%wT-4(nW55q!xv(RpNn>ruVC2(Z)3_*6>rOL7H{$uo{X_|6O2tM zYgLS`k6Le?3fGr#KYv28L7=9I1KEZE9t;nzPhg%S`t 
z`m2t~zR8$sQ}(K3vX_r(lP`b~DT}J{xGNtNIM4oGF)5egjQ)mCxXt<9IRx zJ(pwTJ_<4Sv4{IX2#&y+gy`MPaPCQhi)~_d1{`W7bHP0j4Erp)kn&%G!f=c+^W1B5 zQ678%C-?{pUW5bN!mr6?CBwz7WiVtU&>c1*=^?!U8c2DxWy;B~5l}XE>M?I39Az&_ z4AdYHbd0EmTtuMa`vI11rmm95VwjCyP#m6Wp@5`4`ohVx&r8}1=;Ir+%aV2uM5iIi&T|u>6_I5oxde}VRMN`mBWEOD zLXsBBZ`&Gjm#NQ|F!QSzd<}zt2!UkhF2P(Yh!Bt5(%ATKV-W`QGJy8PW&Rr&bzS5m zM8mid`EO#AzlXs;#^5as(iq%=K(a@tVGg1!@u4nQh}B>xmb9|KhqMnQ(JGI8EUq2n zITRV?J`?lxifXSQrf$r>3#}zYkfFe6Vy7CKbM?JKeea@{_z5-U{5^udC+B}k@IM8n z5Ygq64Zk_pfZ!Sct8cI=!_?;ul%;pouwJ`+?fiSk*7v+Z=7E`|46Sj*MecjAIw`QPv{}Vcnph1m@$2AjtL4(FkQcDyu<9h%XpXlnf9SO z%r58z>Vi%t=jsq#9p#-sKYf{P2LmucN`huKM7#R7sc01jj8#+JLjhPS*oy*XdE;Oo8%PC~{dqv&iDovw^`r{LU~ z*>hZQ9*30lZo#?bZe8PB-8Z{oII?3`D}0N(9_Y5NcJUZ=YqsZ1t%9jF{n^#scTD^4 zngYwCnb4j)roA6{>)&E-GC6OT;O#2!r!mvAO>k}#o!eID1?O)&D%n2W;tGna?!sM(;I;C+BDt9IZJ=_qwB7v^J+x ztFL6*hXw0!#@hU!J)!IByBpAiPjGzccv|z$;qM*4ef*Ad=shji9T6Rs&wgDCt=rr= zYo}oCTyd;$cdUbVt--Vl(D|`-&jTan?aG@e(;ivsNT`MjXH05Kt-Y*Ay#tPXik`$7WmzO z%pnQqBfL`5LK?W?Yk?KQ%Xkj-kLSaQ$;d1hGRWMJIPnnPqlcz>9Aq^%Kp197TgH-# z6{T8PO%gsg7mWd*C*Ey@9mouzj<^%y4V%aA(aeV`n8D^n&%okH+yLx|HQnQvy?}D= zZGw9nxB%H)Ia{+}YnH7ni#pL#w;WlwG^Y2gHm$d8%P`wAmTj;^t$vdH5qNZVeqie% z2JQ!7;BEkgb+k39J=`mR+ zlNg)fl@!TEFk{IgA&JDlTk?Q92a2KD4({@hir_mDNk{<@7Gjr4t1zB|%rG+zUeU92 zNyz}UqZi2S5XlHGhTu;f=4NB(Bx?aPl={LKqTEHvgL4659A5dDoxGeNFe2_!GI*#7 zser=VbaJeyUVzVyyNFPS{|f+62ODoJ*#KBtQKx+o900)VPsddYE>NvmSq<6YHODAc z`!>|Xr`+H>g;b~FA%1r{*z4$b$E4tT0w@*WVo9}~559$ko&Q76?hm}WaU;DNyv!wgWw96J z?QydE6uq7%5sl5wH=o370;i+Toeh@0BwkJ^I@$JyOp;mA0VY9)OYr|1ri}k14Bo}y zhZy`D4E`8{A7Stw27iLVzlT6FMW?|Gh}V6=!<6`I~sgp=RzYA?$p&s;7CeK}a zTc-U?)_yi)I{P!`$UTd1`NefhONMzi)Ano@LQBit)}CDJ9-(!QSmzh(I>h=dqTM5U zTSQM2!jSH36PtGupg!m87kvGXeC9R-$i$1L>yv5vK?7w4pGUK3vgJ(m0{TA!5D-iO ztmonA3?SZo}?7Jbk}ui)!_8sS= zhZ}_uYV;IqVAV;;m!E2qLp2G3)s)pKN&-qNaF8yO30Pf8K45E-g&m5w7L!GJVBEB* zUeqj7i!>Og^jFQ`C+sf11ENby4r^f40AJg&k~sy4%J21X0UXC7(;zorG(>ZR3WF-^B`cbnt8xwftrBn{1&~S zf!3;w2E7**eT3XfN+a8k7SMC6N(~U{irQaNruZ4iz)4U|v#ziPs6)$MkbFZCe?9bV 
z__Dfi`P4`>+GS%a<;|11O@Uwrt(cJU@BwJ0P+uOw!*^1M+=svEf`#44S!P-K}9+%AkNi_u(3y&4#vdC~G zovc3rk4s<-Af0HCXUiH5oF2FcMQ&2_Kg9@rMlyi7myAx#&CDf0ApD=0YAjr~Mj8({ z%%GKisU|TQ9Y-~Yo9DnE7$hEaup}4DX80;%q8*w=XJS%JZf|~X1tEJ(@lvv=KVJKv zAi;y%>Ok=FPRivcrx*HbBa4Sch`h4&%C}wL@!#@iU0W89JOUq?z>U~aEIpilExqFh zu0QmD-~aB5Kldd;jAPrqT?GPA_b@1sbrc!lV|MO|Zsu zP{kvkari@<)TnZrV717cWW5KNka5NgD(ad6*jAC@F3{5-v2ZD%EKjX4W61k?b^Qe7 zNSokM4RiTU)ekqRQ4Mp0V(u>BJyac!sq(X9VZM~FGUch5K!_&MF@ikROxdNcHGRG? zC$B)ie}{HH#fZtb;vMjPjBpFAux_g9hG6ak)n|h>5)wvFqduyNR*9@aRQ^;vaK+;g zP%QX;4qOKb4QnFn0eh)pl7}%>ZKJemSTm_VQGM>9ey{=oh_YxhZ>f@QROUhvIa4*n z{Xq{`Q)JVf2hP(~f8HRa0sR}pTG8bFW!1Q9FEl^x-n_I@78!M zquK;}hl2fPqgF;GGpkgMEI4j99vneAPOw_c=&WA2L#Tbn`F7zf15C^{B58?5>=jii&$3$Iv_`~Z+a<~Jy)%Dnj_Eb}o2=mtfs zWeKEPrStX`c=!X5BzaIRKz`6c(TU0S&H3R^&b2pbLO`bdPhoNX0%TSzW%Vp~3zoL` zgT2@1GS;EhLlB4#Z_d#sINEZKo^?mhs(0-bVc?k!MLuvfeeLr(SC@c-*)2N!%dZQLuJ>DatS-D8&9*+jbV76m(ni76EvHRpTTd>XxYs)H zos+i!5Wz5DYY}WMIonRbwsURAx^1|!dAH!%pRw-Gc=jvJi*of`>pr1%U$*r?uJy3c zdiamIAD#c<`E2V8r9PpbT@b;l3p{_yqhzn*P9c>P3W^N9WXod@2n&vu^7b)FJB zPu(=%>)if5%WcbAlhC;bVsdxaPk`wU4H?~nwRdS_3dQmM+oPE?bdS4u9{&?GtNT*A{a7PYL@^WnMU)-G3(Q zJ1aW<57eqUyDZ3qvo#na#r=n`AHH#H>Dcn+OxK>3V{1*fPXdr>+_Uc7d#{wa(zQn4 z?h{Ck;N1Iu21TP<(O!5#zzD zuLchsAe)U}f#UPQzingP!hM74o4>h2eW+>G*M0aLt<%@#15|71f%zD%x8HSmZwxIB z$y+C{#-tDY!MFC_+?&^81gxWV_4yhC;!V>$T&06l1LgJSjRZ7N^-Uj>w(=Av}ffDt_T{Ub;CEC5hsB;@5U1IQy|=3mDkg#n5o zveRd22EUAHWYWHb&_BZfPelB`!~pJU%CpJiNsGrBmbBzI6u{o7mAUHn)mhz2df=VgRCD;?6zdmaSr4Pz*GQ{eu|Y zzUx8vPH3PUFNgzLU|HXLZEwB?V+`f3zmZx><@FddP*z9Yh>!`capcVi)l&Y(yagdE z+y_~1T{@PxVa!hX8`7GaU3mw_oD|$3aUtZUz;^W9(sSuS!P$}bV3L<=?pir?t1<7x zSRJk`KSBX2*p{zHr~wA{oxrWYszzuU%r|0Ekm~5pHzCxF_fT38YDKKt5NZdj|64sb z;i`yGwBwZO7=nR_dCGX|vTx!ccW z8ioY#E|YzBTyj~I#)y^(WcWC4Pj z=<+OBmE`+As7fHTK(Z1QJqw;MeDoX`##kZ;5V7EiKOa<7-}k~P8GesXUio;^A~s5? 
z>yxPd<=-MAx_{Ye8}-LLGIpuy3H9ZlI+O3(;aNoLctT;~T3)#0PHr`Q@}DX?2fzMT zxGcw`bWSePO1nQ{XT4zTizRvsKi5Yh4L6YKw!ff)Kc!4Rr=I;-rJ+?1 zC8Sza literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/__pycache__/mxfp4.cpython-312.pyc b/model_executor/layers/quantization/__pycache__/mxfp4.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a67723967fe287c9fc3f03469d9d4bfb93d70446 GIT binary patch literal 47430 zcmeIbYj7Laxggj80TKWK0t5kq4}b(okOcUClOpw&NQrt-Z(E_0AP^g*NC70>04<3I zVol1ap*P75xyfBgH@Raf{a%b_s13- z$?Me2?#_PSX*3#yniS>7otdgMiPNXg^E=;pecyM!bI!jt7<3do0oC^=|F3^eQGbUY z;-gMk9zRh~)J=+}XhnbuDn=Cwv?~M3V9sa`vF8L-LG`Gb*j0ht(OmFX2l9fNQ4Mj+ z4QPY9Q5~`81@eRXQ9ZG10)}A0XaTWn1ID0f)D$cnEex7R&BRRyZbhROV$Tm)gTyD?B5tQoBd zx<*~(+Z3n`){WK?dtsnH=pJfOu{dJ0V_;cRLNx2@REBpt?m2~Cns?kHV z(|?$JtLQ5D9wafUAx4egosKa?TwUN=>o=!ej}X^7aIN31q{9soNPR|9=?bK{|a%S2W7=aJ%X&>VY`on$(L-?m(3BlVo;SYPq<`~959rnHw z2n4-jGt=Xf6X2CUIXxNn24*HEAVv60^qe1T{?M1t~W@%yAde0hdG=nEkX z1;H75F5qLlSNzPhKj2aEMPj%!bH3?tDzZIzZG5)Ni)NoUIOF%ejGied_>}L}E?;}k z)4sNuu~~@ulm|d-Ip_kQ=B-%Kvp$-hoSyLd0+SQdL8x*p++-+~X6VY~?5v+gw-5xr>Ytpr9QOLc zVFp`@No*yv0bdxh$M&TUjrp*O!`Em17^dK|FXSB$_(GQ_r^iV& znVY7)bF*N#LH^+xX6&-p4@eAnp->SfepH+rZ0g5BkQg_@`+`36Vd>q&_A#?4%#7(7<8z z<6JOCJUU+E^-e=S@p^fk7qAYE80~s5w#-1vM>TS4WRNdEce1^$%{w&gJ$Z5HbQdxE zyywrJKXKwDZ1xhD^tJuy7O=bk@1eE!tP zUFtD_N&Uz3t>NHo>+IxgbJ)i~)iz(7>)q9NrPtpYn7q_FdmTFBbSw7fRw#i#;Pqeg zkIey^TS@Z?wL)2@!>MN3N?J(E>~+QiNfToaSVFB}yiO(Z^*0VDbEz6vOjVgc)0rqM zkEv{l(z2MU7<>whH-^aT9}O`BVhVgN;}6d<)0?UhF^<2-KL+C_6&97G%=DO_=>vDB9}Q9~yD?b#HhD4!A;d}eg?hobK|Rq?dOMe2!{*nl=hto+%Q>Ty zH9BJ&=OdMlQ(0J*C1xF7R~_5Pw{iIuY<@*dRYCIN3)2M=ho!M}T|?UGcJKo9yqi>n zVsuhl5bGW46~mEdD%}D^7m53z1p?voEX_QxM}b-C;stq5G6ft6*xw z1!A13LeVC@X=O(HGK=n07SSfXH#Kmri|R}qO0_Nj@(;ia(`BBsb#THA>}XHb`D3#y5sg&`8323TWVib zV(!wL&dq>WYN|?g z3$r>g2O;0-_B9g~CW)Acgjl zL@Vs`YufdCd1bm98Q=#^snB&60owF{TTWH z8ib@odV+7&Jk*hnHdW*NN9N!%prJ%keqQcgr+Kdil$=DfbZ8Z*dqvc zgZdMn5OAbo{h_^Wb-P2omce;C@g(Dt8ZVeUe|YTm@3vy&JB}=Gxe^S 
zdLQY{oZimr?J>vkb^VErqDro)gDvWa={ux&HS7AGjClLkO#3t9U0Bz@K;rFYi+0EK zySI$jw`S^lgc%I5MFTPYz=pxW89G=)$7<)gq4$H4_1z~nL~xSIoI?AP9JQg~iJr1J zxuQ|Sn z*KJ^wZr0cxH}>9Fu*UxTXV#2+H;QW4i|RLu8diGOiy)IUQaac5U8HbzY*AfIU$>3d zhPIT`I$5oAU0c1ODd03EtfnMZIs|WRFhy;thQ=hTeI%&Uw7}$2;91^ zY0C`!2+3YMn{SV)?1bzcgBnji*jMm*{QrRQk29Ip0$S6v;o55CoY4Tz9(Uss89%cz#P>cBNG!!@6{=J@qfGE^7l2H`as;R{mX z2_)dB&G0kQI!MO}k+PmC8M=ec_p1O(Gp#4NvW zq6Q|-&mbF51Xy0?JR||`s*pc0PCj_8*EQW1=X)L0m2ux20& z${7oU_*|IkK@ni^L4Oi4r^vItW8d5%mVWjnNU3K|36|9P(=XDLt~+U z6dWisAA*z&7VuG(FlaCjl(}rK718xB1{d=yemXzQo8;#LOJ-Y=lQ&E# zQWXPrq9zh4UUO+YFyjlugq96h34)N420%*&>cUGCDcHO#8%1)V{wV@(1mgzv$Yi}) zuv8E;R>m}y+Y|x$#hcUwmDzfp0?UG62~Ej6J{bVQCg=~%JF`K`4)Gp>g98?+w*@T$ zgIAOO$>)Lua(wa{Qx5)Ph)wZ9#1rg(0CK46H!drMZ5GCc@TiNQgnT0$PCybIcb=z&nR&Q z7kr)asEs7VlkJ4rJ^IvOcs zAhiR26fFb!+YHow00K=AniJ$<&MgGxE%T>@6hYW5R6q0eGS21u3S8!2)2!2B#WG9(1Sh380 zG)|y_e7Ha}p8^|i9tHxN^fF*61o=I$!_^F-r||`w{X8lLsY51!2Ej6d6~a*wWRI!g z--Lacl+ha`3oG;X?0%f-`@dicAAoU#`ZSl)c0S6tF8Y5ux}NV&7>hV#^_sDIy}mbL zb0jKjxXKQ;vLjJZ%~kATD|UUFr*ap7w6CN!ZflE~+MZ}ALlLJJ=rl!PRA6w^0t7;LxHKfShYa1(;9jy2Y;8{Hc=&q^)V(9bsX$87^K)4(HJ?XsV%>tXA9 z;aDb+`>{oBw*E6UxC%9ZWgLTF z`VnO-kN*!C!#7d03V%+DepSA%$E6s*T5g1%N=tmwX;QSxuR-dT&V^qd#7oOGq%h+; zq*+IGK%G*f`OlR)P_T49p|4f69?~$-27kenjHK^5Q!SWsO7Mxc@m#t9@+brW5%V5gnk*9q!p|=r+`%Q#kkLeM26zI0`i`a^%ziuK)s}?KQTp$`U$9} z{AG>^)prz=2)9A>2YMuDWZs3dUl2J1adFK3)SA+s`DqVK{IHZfJ>wjk0U=*_5(IfE z0VfLgQtN6?w4dVN?;2Uz0|DafW#qhI4b|@rfo1`3B7PYzL>|Q#Lnr{=3p{UdF6;-Umj=;Y2*@`= zVtfbTCG!|I+0C4;z*D9vmNc`QfRG{Nc>gFdySIwgHI5BO%SwoAJ;=5mj5`i3sE`9L ztKiC-*s`X-)^lx#SokjkkMA2F>1*QpT29}`>ianTIaYrznL}v}kO-ORIb#)Tta|I~ zTx}0q+p}iuNm44UE>QsDCa!ac?Hr1A97$5jqly7A4FivJz&8e=bkq^WumXAR8eyhF z5{l0Ubr)FT1{8pjNB-qWn)ZV}1R|{xNCH5NpPuvqZqNx41&R<%coGrM>%r;^g2WTK zx3SE|ij{=j;PH>4OTi)~FeG016rOO3m4PM0bf*>lC@Pxw41W!+h-xFMs1CYUW}af| zY367O3C*krJ38}1INf0)Dp-O{!4u73f2Pt*kCcGe=@>&se9)mt$IqG8Z}LgA6w#cB zI->BRsDV6y4~2w}NDh3ZVU6hED?NlvfBei+$_*k=%8=ukF=a$u1V5!2r2wKNT|Qlg 
zdrpRX&h-EA%)De+Hq2`WP|6S;GEhpjuo+6pJ!krm#NJLhQ&o}iO4n~qxJGmq-Jv_j?TNMNyDXw38z_1ReICfh+Yf>!iiSVC!&w!X1Ggz`b&p9 zq9t%^wt(BP1>CxbAp<@MZo@we+!-~A%vja~Wx+WGo=tU7utiz&BL%W$Dfnv30&P?G zj8dB-A3Ld*}bUB=;t-) za%s{R;H5Dok959fll=&X!CPdRCqOPD&@_U2FH?nqE`#AQ zXULfdq*B%h32*ja3j^_*sp<*KsDk1rp@{f=f){~2;`39)ZD3~X3h#Ley3M4O*EcSJ zMe|xvJQb8qnVSgPEi`VU@fT?P1Pvlm!5IZ5V4ELdnYCby@gor@fESu~5NVi1y>9mY zdq^qtGZ>$4P)RdouSa6FWW#2^`RdZE3;K=X@|#zdt}NsyEtIWvA^&05xtOLsk#FSk z9c;eiEjw4$#a4B#<##Z zN09zVkUMeKde&OMp)dU5@i&eqOyvnvVa&3JHSGcWt(RGoGf`aqLNZqcBX-YDZk5zTXx(C-aLy$qN)Dvsmv##0usHo&--%=k}RLd6C%H)7; zX?kh;Ngk|a1AhXr8z@WNV=B+0ThJs;66|Fw7uPjC2zyDarW>r#kN`JFmyU9lI@VH` znKqzj*ZniBX>T@Gtn~o5Rkxtc0DHN7WfD+*QSK!&`?whXeiPx zP|E&19OF=f~#T7eQx`F29mI{O@H5seN=t4V4r zqK!6St0O6To+Cv=3OaE`K}-(bkRq^Pm|{qIjQS2r4`tX6IR;V&TXHc!FSd4?*3PlD z(-gPCRz&p_wmLG9nX>KxTXTSCxtN~^Tc@5QEfHHXEg{{ib_$n>C;4`o-Kvo6&U$u$ zCoG#>OgTL5q-;A)Yn!skw1J+ig2CsuWit?5W7*_l%9ZU|;|q=kJ55W(lYG6U#+USH zfP(a_X9s0F0r1Gh{5)l=dycfkvTf16)1{Id3;MH)37=6Azsj~hxv;qqR&J(|3!4XF z<@%XiSPg`gt6Q0{Q>cQL^=wb0h$~?r8keuJFE-1KifBG4=jBJ#bXs&0RTG!I z43|9OqNUR!tf+Q^it6EKfL{UpjPNtTuaGW`n#23Vg3HgtVo48Ei#gwT8S1tO(gz{x z)P6CZ^p503ig3=HIwZM?whVk87Tu(`m`fzRkvM9J=%oQlcUy%K@LLzff5!#zGzKYWwE6GGO)BG zY-v*^I4R;Ann1DPWHKK5V?skZqjK8eZTN7U)2Lj}I)K+XNfTd2%5QVo14>N`Ahj4e~rJ zSyZH^WGFO&vKAM(o-y~H72%cMB8*Y>GfGpC4yk!YNTV2X=jDb{%au)*pa0TIoGZy< zB5)kTCW%s!h@p%y>Z~wnWq^z@pd+{`9V8vdlMyC2D@@tbVV;3u)AM6kq6o8cPwRS1 z5I!UK!Y?uRw5q}mWS!?lJV|dNgL$^B^93n}Xv3I`8m3;h)ajZPLxarcPcls^tL6Mj zrYSX#l@-DDo+URk{Yj>oZu_)|Cwa{O zNv4^`lxPt-So%*gO%WyVe&*QuRme1VDtoj=Y&0BI_+2HMwBfrs!q<2XStSzv^iTUm zOQR(j^wh_vw9PSz-o~(Qtfyc{4P0F639kB&{&-@;A5UhdL zU|X~a_R=@X?J0^jgCwR2{64qrS*$5pp0!!@jkZ8N%B_dV)njX8I;|tJTWSpQ0{fWYA{eMQ&u!WrayY3v)w2f>^XoO^Y_^eN7Q6 z2HePppOwOiwrF3Zk0}W=;`dZov`KHee72kX70`AEgKW(>~_Mkl_&X=U}2TH$2IG6N1IQpIHPOgPbvg0uZ9}|M7o@ zXOkW&>V+bX9-CloVJkh-C~Rb-6<2^3WeTK?NRf2;B8quPBC5zZ(J8o?dGq9+#)PrB@z_47G!bPRJ1a)d+BrbDo;lFx$;6vRt49+;ek)4Ube zc$M$kWQh5nFqjt3Z1$1>S*KVOg|7N$z5ZAH%yn=0>Wr6}xf+@u&Qjf 
zNQ5-!e*=~+n=`5irx-E_mVU626Q&xD=}El_lT8zm8|>}F&H@i~2jvtfUS&V1uuQ{F zyiMV@&nF{&$erX0xk1|EP>y*)Lrfh%BlTpa8FDAR8@4fH`(oY$gU?Hi8gQ7G;wqn) zYH+;aYQ`8thY%*%KO$@zsmSn3Z6}9wu*un9q2R8z`SYKzC2pVUU&B}uiUJtWC@U)8 zoVY!)cqHMfeS7%MaH6DgS#evpd}QUoonx!E`-Sh7-GBML$_MSRfugo}@g%gm+78aO zi*@a~Uv>Y?1I2^(xNC6nWWv=hxNc6M>dvXv7w?~WZ}dUcd*76juzO+_H)(=!mQ~(R zz<=#f_!@6K&G^UQL7g zu$9U{fbaAAyMmqL)6!lrI2+Cvh6tDErmv6bU6^O-=419-g1&HY z4$jk`2r>T>1N|#BzJno6cof0xY~Y%BBtLIO7uxTi^#f7@qO)+`XR&aQCY-W*__X)X z(F-HTz2}BU2N~Sc%v*5392^iPv@7u#gCd;avmNt7aHyJZ!q4ko@CD}lgA6mnc=l#& zP{Zmcb<7(=5^PZmw4r+cKCG)n+Y3<)$Ll14}_ zjS#O4K^mb>I3FIGjj*#Yl?Z9gyl#e>oDk>;-YSL3C>9h2Y6IF`s!lRSVw(9SHU-_) zPO+-_V#%HvkZkT0S_5R0uHB4mq%H%;RfgaQ5^|deUcv#S4jeWCN6pTLgx#VzGD3o4 zfnZ+hfIBe2O_NS}FTk~VOXKr?3A34FTS95R>3JrzZzJWj1Z&MONy+Sc+|W0zT==!V zbJNDct#&BJjf20}pZ-XH?vc~80p}U&m-M&%w_aQhbM?J&A{OiD{b24F(f6b4rjwlM zENeO&H=R#Z)I3q=R27iTk#M+}6m#s>h2`^HZ8r=5j_waG{Nfw$e`DQroHLzfO{e3g zvqDTKiD`$J^=)FFJBwLQV%ES`MMvcV&bWoe0`OR4g$eh+E_~)P_CBF)iBq4gzY`DW;wEz_er+* zWbE`Aw)f1M<;+%rFpUrQu)P=8EEm(Td`*Dmrg6#0nQB;5P25zwQn&`&C@rNoPc5C| zEFRY4S*ctOLnB+-C|nA0Q~l z^EWwDHEXJln_Me^a1WMbWNC!6xLJ#P1<>8N66RX>v90^oEc@g=2iexa4;5_d@S0^f zN%n%0BaUH5Xkqb9!;*nBRk5b3xT$6t%HhT|PA;9~EcL9Veg#U=utIY!d)bz~YnHv+ zdZyFf2T40?xyo*~vYV^yXDj<-dOP$DuA-Bz=;SK;*owY|Q;E_FuGGVpdbrXSwzOs8 zSfa%7rt!9sD`{d&nihr=uqFE2OW)>f4Xmw!vo*7}=6n8kgLi{*+wKL}C0c$nxD@1y z-E6U&D{f?q8}ALid-Cqdcya&2k;j?>L-Qw2%Gnv~JoMmfyz(eO@SD=6mC^g=cgr(+jo~p*YXheKBbEZz#1db%= zZ>^VNaf?JM!phJ?L^v$wPM_E^V3u49<(7>?VIeWK1OE&EJEmAP)0 z{lxw_Ki69PBu4{4SSa^!CJ^>pXO@n0mRi061>>mQDx&yybuU%AG55OUD+NCqcKsyoc*Q#`YhJ*B?jBpWGhvHOmK} zqB%<=^g(EP$L}7GTl%)gJocxOhAl9^+RAktWjl_>?Zew3esj6YZyi`VCd>tKgr$YG zw5%|z?Qu)@=HjREPGUTb_vKTZtA};3x8M7A+%<$~KPsX9|3Z2n9~nTwY!KlXFQ9=6yMFKmDf_6Elj3sqY2X8!H`m}`H$!ir7aZ?*->SIlP_buG6VRqN>haGX#h(HNczb(esB^xO6KVcTOFDZheH!2+T_1}szxVJHJU$BHGiUN{+py~^l&JKNO&Wqw=kF} zcmAyK&Y@-B@}a*fTrLH&VCBL;YhU%P()SBj>3?xxwV!R?yLfnEXrsFRZO{5Ty0=U3l&&0Jb*=2ae`dXEUrb-| zP^9~iEE7h#d)@Euxw{9Z`%Q2x7vJ8qQv1&otL>{@07%zr-rbh1A%IgpURru7Rs})t 
zpIx`^e`9Fj;G%M)()H%`+t*jBS1zrTtTOAB{aoc9wsOx0{$EVLKOL_;3Dgt-B!D4I z_nTn&Jh;e&3W?P3fe8!6>?tzI(f~=gvy!NV;oa37YsSf==E9;HuSxL$@!`2| zeUhVWFr!Ny$oQ)pV~qnJcCS~RSycbHV8acpll#6Jf_NavuT8bU?U{?Y!Wy=)=B)@< zKg8A##S4f2pwbq4lEqX-E9You9qoTz`}4;48sm<=oZ}GdIP|Xve|7wq$K#IE0Ilc9 zhq}1uT+CGWu)3bB?q#ccx$50)_3lMu!rsW)TUdL`y}5T^z58n1KET=cv-bV}+WM>V zUzW%1Cl|H9aSuJX9(SLKnQ9-^xZm!()5q0xvo+lt(C%=dA-sJ4R$x`JZs~YX3#SCR zdLC0s7b%<*3s?Vk$(<6es*|k(;?}h~2tlg5AV@U?`Edb+XlQ-6bdc$73 zJaPZRy8RG*olF)`71eKEynT^#^stVeHOIyKu3xy{cXRz0+5U?QBfl{^HmYj5s@-hW z?gu$*RfldL_^@zE`_S%K)IMt2oiJHb%Qm%Kbst;Zci#g|r?r69H!d8zrCIg>Mw0T)laL#%5v!4C2(-+n&Ul1rBq;NR* zF4nz^bMIx{dq4EASDXj(FNvmRJC*JU}uC%EcmT??Ley{8bD ziO%5~8@bCydP$$2h#)4DAZY#F6V>%d69 zuJpmz~p~zQSom;&j>g}ifeMepVviARJ!s(OpH0_8r1QyIST93Oiqy3nS=VOI zd$I(ZP$Yq~@w^7uJ5~G6K1^6=DHV}bQD_h!1EGSa~35ca2 zNeM@{WltJwhMt72oU^sDwl}eJw6Hdv%@Nv{M9ZhuxDqJi2Kqxs?J6UXuE8#$X)mw@`gA1|bvATn{ zVwJ9tmB+bQCp*ZaK`e#zCfKuvQBQ{g!t5a2^D}QBp3c0702i7?9SNs%%SY6bK&9ad+%}LksW3>2F%{60_#(I+eOS8ZiLB&HIv#RmV31}a z$_ET~f15mmQ!1#){C-AW4|t^dRx%^ewEuWy4s=85>w*183aD8h62(kf9%@Ke%@@&G7cI zwS8;GzC=Sm*Kn3?I2&&`A2Zb_Y%MWcf6UYmCYj5SrH$G8Vx~SYH*AfGn*M~VOPE(c zzMDYq1-TAbRwyyU9S4PGxH8iQr;&E7c5$79Z0BICV<<@}jwsr|w6;CY$+1DW91sKR zC|G|eahC#*1hMG{M<70V(t@|%iJp}{%vPJ z>;WSNi>-xLiCykz>-*OV`!{SZT%he|Yg0Ep$p->0J0YXEZEwu9_ffHDbFm*>j#*9t zGlm6L7MjVtxJlm)+K0gK;PjTdvhcicZK8rSo6M^M4RF=i3~GYS!ri_8)LmK9>A;(O zzlN;>NkP2tu1Mgw{%81vv!Z9<4h;sb^q820t0dqWz*+AVKA-W!g_S`vO$*916A%ux zCty^Vy~69zgu|u5i8o0%QKVeV6M>maz5p3S#49Rvf(0)Z<85PepcoSb#mG6l;uuJ} zY3RdvGeroLoNeM%f*Aedoz8(eCEx>2Bc!+^pKxl05ITHX%o9f9j4RVYvD zNN2cc1n@H#=5^9A!siQDmw5fNflIt81sUpIdQqQ=w;?V-{}RgToxOZLMC=H-RrCNt zCIGGg!rXLtEOWO*l~{rQ0XRqCSPS81ap;S}L#OY^A=7f2Zx#LFMAI}`vn4U}W9dG1 zS)LWB{zz^_1t&Mt3lhUgFIy8i+HCi0CWn7K01@8gGiVQgdt};9- zxLTxgh&FJQ;T^$MhOhv*t&w8fp%zXP4|1e}ETLe86PQd99q>EIPQU}C64nKI03V18 z9bls%Vg!tsBIZa@#PUvxyAS2T%{}S6W_M6GGUNv#i}3u2?Q7K~!X&*h=AweRNyDii zfuf)#3jELKO^0y-$s|KOgqy_tJ>a@O z%LzWgRla}9^)A`|iM@@cpzo;2o0>W(o6BKY*P*9fQ>P0ZecFeluZEp5I@lSbrt`zc 
z#W2v)PKb8tP3y#+DEIX5q!qPluA^1c*PhuPx9A}P(n5C~(f{E={W)=dUa`2I%-N45VJ~DF@_~3aD zGV>JpU|-!Yl}NOKPbIXiaq(MvPhFPmq76bjw+JmG7lhDNTZ9gXDN1h$UA;wUnRyOE z*X(?*`jO(bDik9{ZIS97%u&-4RH6-M8m0$Ec2}S;xGP#iXs8;zicgl_kQHYEC5`mI z7OyOm+oc*Y-m4X7BA}DgyHn{p2ro0MX6=1+kIYvNo<4C~n^@lhNoyqXUfjw$vTljGx@pEdh!bYdXp0*@75rjv)f37V%Q>vM>{dyR^O8 zJ3B!{1e=Bmu#A1;P9Fad*$71Ta-9d?qVU`_-g}uTMFVHMw2uYuyA(ELr57_iI%Y3I zy;Ce9g2;wY*oE{Tpy|NE4%yHm00ykC$TjmC!v1@fv+wmVnVMVg#!72$ncv5cOvU2N%%o>2n;{);;E~Z za)mEZo|xL4P?!NFvh@%RoYFw1ob;iqlM{it)NZHz)EC+ABzlpZPJCW?R=5@uEkw+h z2OECKMVV+p7v?aO2`XswZX(?H;Dz=KQaL(RJG}BL?B%!$Ef2PRphO~702N9@gceQO zA2vOv5)INOKLl(EBAne!9lCxO9Xx3Cp@AeJUyu^xrO?OVcr0ww%G&#L3q8ovS`Eh2 zlYLZOUVlfheVkW9KX70GWS)f8wU5wt5VMV-4Yw1MeO1gMbX&(y9u3@XDC`DXL)*Vc zLz2q=6a4%d#NZ3b*3?Yd4)ZWoV0rlW!#VIk6X z84lw^a|+{LHn_?)3>TlzOi%DSe{eQ@9k#4p@iVU?0N+951%&%WG%lh+HiGRV8Nmhi z{s1&UKXk7drX}FJ4&v&TbRTP(m<@YpV9)L5U1M0$%qS*Ri^lIE505Kjj}}VFgnlUv zTx1-RMlY$>WyUd~5s)!y7iQ3p#sC_-!Qk~ol$P4X#cRmdBpu=vUQ%m(-Zf#IK?~_3 zq~iF()BrUB*S-qPP(Z6`+;;_U#KmqV1V4?rXvQZ~axlGc|Cu0dW=OHNDu48$+`b!k8|U7~y7$4( z(L_aKqN*Kq@Duj>M0qXR!QJ6W)D3{;iB0bTHI~|i6Sv&U2cFm|&;vFVrgdUcYA^VL z?G5;XO$fvn_m>b0Zm&p~90`*>VJb;fRKx#9MFUsS%U1Mq75!{Qf1-IfQPYiim)Ab2 zG2;rR*--GrO_{0_u-QOgESfb5J^Z^9`V!F*KC7OT8cez*WzgkIT_Up__A) z?(MewiuP<$L`z+WmavuFyt;Jtt=hL+?zF7g+G3`*n5`{QR=$`AHxe)9ZIVczV(U+> z6`uOY)cw1f_SLG@3-Ou(*zCQScgvP2J9Ml5&8FK;Z_RP_DBg~j4RB?6iE*s#g$HMT z_2MsI{P5g|mmYi_K4a%Dtd+g6nD;v;=oC2w&7-%hZ&%!@0PW<2$&xVH$bU*HDHGSJ zDiHR75D(mKPAv5-EGq~qO5|IplvHg&n!sY`D_0`1>!%T?d}*a#R&0`zfR8lL1FnAt?JLcT<+ynBKK%yVo`<(ZAOp zd$g053eD7-pV3T{_D!oSs8$$9wZc@*oWI`@ui2Zb8`O~Nztb;njCt_#ude;_+Ir3D zn8}%;7TFCt?x1$aS$4CQ-LP5grT1ZP#xSVI!7dtMn-XW~Vl7>;OMqRs zHwU^^Y<8;zw7p$4h%?2Pe0)XepFgL&;3w8U+Y{|TW zj%XmI6Pg4L>yNKfNu@zOthhDuDTVK3#Ra9h=b_Q^!)tF``{8SEyp~j>1N4#N(qta7 zYS5}BRvp#QoXjUySX;H@#h#B1`1yxv)vIdtaYb@Izgb-X^>_2)(#4zJC2ukZz8`As zx5BKp=CK++lHXJ`tIZF~>|9wdTh^NdH#4pXThkEap^Jvn={ap9t8IL&#qVz_s9e}c zs8CyOh5tao>v4|g0Bydy9@c*o^^LH#4ab$v-h>@K8xp+(iRO+(nKNOpO0;$1XZNm8 z8b@;g)Juks%}_0e5W#5 
zDtwnwwz6coV7J3I#H2&8SD=!z{f>QA1=pu0D+NakLoH;wB^+8Rh(_grafY2CPr&VOAsny;m{ z`=g!Dm98G(lo2h{hArF+pln9Jfi}Uf5bU~nP#&etXkX3C|lKy!rv~1DO|+MnEskn-(=)2euBv!qGP`!b=$UF$Zw?8akkz z83iMC)XfcHT*diezI=Gre=Um-@dC3m8TNVyR1n1mq5gP-+Rzx@$YbFS*0ObtD^ccJ zQ2oFt^bV8}5+*j|n(99Tw=q0;MSM_27$>F2C=Vs&gW+Cy-hd;|egHW^nQ-wPL~rA2 z&?n19K_D#U!X;N*mK5hO-hxL!-~{CrMP{HJ>Ct5Hpx=XFS->%O(2d|k1GW_FmO>0B zJ6|P(n9Vmv??d!-Ls&o$-1@97Km=7J^o0q%jr=zPck@Ypo>m8Row4c<2DF)|+O`GN zuQgQy@}5@OM|lQm3CEy*Z{oGDQ~*`j6>_3QIhB36gT%sENb<-nwuxeM`8*`-gTG!( zM|ywhFv#SQ0fVd{S9T#+KBUa3ip#EeFq@nhUrqsI3fEDy9(ml!V+>r&vZBK$xY}i1 zD`Z`rPrHInE2^xg>Kd-&O(}Cw)b;e?ZRg?4Hod5}4*mQU_>xXWc}3Df2hnHhY#UL> z2ac=>`qYyxfLNZ@xodNWbZmCr+O{HJEX#&uvWRr(p2;lQ4XtRy%-@ zsRhrg(1SOmgp>m0n-$^-@jKrp)6KU%H03Pz`oDp^Z&3f6rsE+TsNig4oo$@6n{{?` z&VJU}|3UXJ2HqctJC7`=IZX)+n(G=z0`~5`cKbE1vXias!-#mKzD6mDWy@|8;vGzU~ zY&gB0MafFx4^O=z9Nw@=1s#dlZ!Uc^bwWhKTAFY)Bnv6WuE&(tp<7V@T2m^ivwd}f zx=mVM=^+kr*Zi7F4}U`1Xv&_G zsH9b(MWviFNZ~{q9AuZ8vgQH)ab-I1V(7RsO74+jqDPh^Xp)h0I8#u->XwbxfZmAg zYSDLCP)cAly#{BO^Sk%|>MV5v_NTjA_`KMk2IfTt^zC>m?AY$1u z?Djv0L_jaNg))`hdTGt*1~pWzE}=gPI;mP+%;<*ggXJ~LU0ls>wr2PJv3S|R1tT6w zpjpxgr{ZuXl-{i9`E5!kI0 z`cgtn5Xq^n2KsJk1N7Ze-GTu)#?rdwu~_5K%6YDNh;1GMYpiaFEjhxKoMlVS#!Jq_ zuwk^vGQ)T#`KoPVL6&|W!ypI8;P}VaZxfiP15D17g?t7HT)uOMp<6D&I&p0E5IZzw0>@AtyDGG=!C0q@5Yu0s6ct3EB}tN=}SlDhG++Uxoe! 
z#2@w&gkgRShL~z}G!aK!N%S#ZRJCAgz?q@3(p?z9MgovzT3~<543O7CnziVgxdIH^ zeJ-c`a3bc5Gh&h+mGZ2|42U>*KibGb`9ZXiG5ZYK&Y^)qEkPL3kG26cc7wrdb_>+^ zUgli@6aJ~t{|QDiN1;%BnoB8aKccK3QRRO}=|7^}zejuPM^w#cN|i$S2?fTdI!ZC5 z_)Mu*C?8W`e5RLtl;90XoK*alI`a{AC5!Le%)%X$R`G{&G z_J)tB{@-aWix=bCiW|94GzNunvFiy1ucT9}s9gR!%!!K1WXXPo!m(mWQutnNdQ9F= shLn8@OU&$gLc#mVD~fL!6vq`YYyA_7*q#iPpN8PKDn6y~O>pu51IxcyApigX literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/__pycache__/petit.cpython-312.pyc b/model_executor/layers/quantization/__pycache__/petit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8bc19a421205003d1aec8ce563c774ab3c1cff88 GIT binary patch literal 14343 zcmc&bZEzdMb$0*`-(Lj3K=JD+N+cnXAVo^DY%7vw$+Bfzv>eHn1IIxj+<^pj`0(9P z5>YT#J#I=RPHi=9D9?DtbSItAV`U<@nXxnN4|S%QrtOacCNqRD%9NeBo%$baGUK$r z`rh6Ljv%Di?R2^l_V(lL?%UnB@4bC*`AJ28KD)|nS_|g@G>t##*@yb<+zZW znu4rVaX*>P@(J+ayO3UdS#|!8A^whL$ zERoWFh8NHASt%!;O(q21sK8E=v!cjn<#=95$S`&$scx1Ud$i2PKN70ssTj>sF$+h> ztS(CZ$8~X5-p1K@hO-nL6vZ*TT}#<{2hVgs7h2xISyFD!`3e(ra^0Nk6fn7dU0RXpgZ%-##{?fB@OCKmno5IOF9ZXq^O@?Jpw7>Sc!qCI zSqO#>j!N0MfC;9BwD+jEL;Ikqt`+KY2%t12MtO45Z+^~)xOge0**3M>I(PoCKyom6^&*np~&7xTF^P>V{(5)f!Y zz>}mjU%V7gBHhQy>53tNI+xRv_i&T-C(KrfZjiA#BO%>@P6*nM?wAd=!bg&tq#rnX}I){R`R%mNL7cG!8 zFe%y~XV+WL0SlTeyl?`zO%hHXOXkkj79+}@5qLQGY&IwJtUR5NA>wfbGmuP6EPo{r z3zdOWmJrxPmSd&a3p3CcE9an_M0SdgMjT4Jy1w!0gveb+uArTDu^kXm>?(Ux98i@_ zXQxD@1~H7$4u~R*Vv~3wMS#^}0)TZA&WpwHU*|^UOnx+<&X33maf+8muFT$d|DlWb z@uNce!f1XDxMX(pk|1P8VV8Iz&R^k^vvN)xC45F2)i@_%kfVg7Mf1R!<8i`)X|?+N zRVUTCd%^Zp^Ti2+>t)(@%N3);#@XPyecC_nlBz)naYx z+^{mu{wl=)AzCT#fY=YPTZj(wD!)QTvPjL*z$p|DS!s?d=XtTU(-r|5`#i<4@UJUhH%_WItwx>pSX|~((+uPzaes4 zfxZfzNhmphP1&hnXgeZEZ3Gb~BtTK8r|7dpK(tJ!G})IVI%0H2%4Hj4*MiuoQ1Mh+ z;E@)j!tRFn)q|Z!3_S$H_257$Fj#U95-b(F3Ys+YI}~CB+dWWvH-TEDaTf1t?dB~- z%M7-u3!Ft4p7Yik*DP9PLs05Hh?b&N^2jc|1J0_qe3>p#eWQ5?;X^^A6M4j_vc_GPYB1 z4P!gzop%|-HAU%o%)5%NtJW!M-klvYjpLGoI@UTQs5#Ag87R(F(RG5-S5O~Sv=&{^ 
zlX(|Bdfm%3=PlYUQS%-Ps8~f$(OI+?-8X&jYRm!X**~dx&9Xvq*VMYgeW%ls1Ud+^ z>2&rIXn-6$SgRj|!mWH>I}k)?DI96kx~3=&vO}O?MzD`Y2P#UG8^x9YwWKgH*g)$X zWHYmp%w9mbBgrD-nNG2JF?T7=@m!RsOIw&YY5)`P+c*r-6BJF($qOKM=INRiQKV(# zTt$LTg%02n=1#-vNOB^ZIxyi2j56Eifv|GU5JcXkuyKtB=3_ z;+y?z-kx_n|0&^SzPzK0hE=Llz0E5mq$T-z!0`ok2zE1gDk zh+FX?2O~&c}BYY8DK?F}}c)S_0bc=SmlJgUKDCK56X(CCan^ajOa zTJJ*Fy>{byXfN#nApmmO7hLo8lzlxnE-m{@gZHiY4lP*LJG7Fxd+n(^zV;iQADmnD?YSv^s*x28KH=V^EMsmMym!yFr#{A6XkD&H#%?`Xiab~H(gB74Z4%dt;$fdL_yLdQt96z@kMoYiJpTC-;ANJLsKnq zbg?GAtRP_;!5GpDxOppH{~AIWl1k(Gwno=To!5f2vT@8MN({n9)sS{NR+Pb$&jNpl z*e6BQH$`MJ;%SH!3Y6OunDCx{>ck^YK6U0?9FCmg)N>K0ru&MyWJWxRO&`SwtcP&r zBZo-w*4SifN^yh2EvkBm_yl%t!<%^R3aHY1R}O4TBJe{MGF>1yI_;d0*#3@v`I{?4M<925=UU*2a^Q)T!082B#SP}j=4BQ zQ6OZuk%V{_zzI!6EVbPRCQP^`vL3Rh;@2QW_&c&Obw@kkf|NIz7?Zh79yaxYz!&yx z+s?bf{t0&c7(~e3{uf&pXny_jwU|C23w5JfnmMdvmWlfVt@t(;A-pSQ`zNR2K$p#2&ALZV+iOjC0V$Z&haF{#keg5~Q`h2B%J zME9IK36~-V7-@wBh;oa_E3VWmSVt0aavF{w3z&=FzzF4w;(!GQgLGDLiTo6QB@R-x z{@gXe%Bv<@O*(OfosD*9E6fe(T|()4mHK7-z`~=IaPM1zHv-GnWFH-jOaizo=UY>07gK86~+ z*nr0YJP`9%V3t%PW@*s0O~xA&37{t7y@&*TJH!duqnCl5&DbU>1pWH-n%dV4Bf4u! zyFT7oN^a7ji01CUC)$iFz72HTvjTxiBjRrkU1MMO9{?Jx;fksr* z*q%@p8P=f??;tV$5d}Q@398PZC=hFI z52VFjjOy1Yq!VVX`v&%o?LdD8ok;@_!TNXtuN}R1wB*^T*86Db-LSegz;Sv}&ZTpGT*TVgud2FuaYp?*aW&7)5G^2zS0& zRZ2b#>{|x?c;9v^umK>M#(qecHB#71P3k~}5!4C6-Cd(3jebFDp8^>5x=}YZQGl8T zwn=>obsXhy09wL}%vI_RNYfK*Ekos&p$}W4%MY&v;ASyO<`A)+oSdA5Vev9J8DY^B zkH;e(SKM*U(*kL49Jz#`)i~mDE|-L2+l5?CP;YD~^Htn&ABi1R@eogtdIJ;PSd40? 
zI0liz5QCpMfdvOKI)tS*E}fJ`Tu1R~jEH^i1x!t0wI!QX?BJjT_m7ARF6J>3F~S?I zco`#HCY2BLW2z6MHa$X0l}fPauTUS^c7sqGJ5bpZt@Q1#43B;42{NPWL&H@DzHn9P z>08?WMy%?V z;D%tjOC7_XQSjZc>Lt*#S~tAhvdsVZ;yV}1kp~cF*nk<dNFrghm0*EOc~1Mc?3 zPtzp%^Aj6bQuPOzmP+f6#pkX+w-~=3uUau5-ABIbuBK9GpzIplV4$?RlZHui%M)*> zH(+L=s@=x)E*;;X;8XR|%=q$IEFZ7hX=V%-F!@f$CgjINve~%LN%$E7NeRQo5JZ0j z2h;3<<}Bb*wJcC6nzO#*jWG+<6*^`IHfs~J0A4udM)mlt>M@sCFllqhVJ}`9A&yEG zWd#|YY5m4L2umQ?;!)=5uw866$ z8*iWStDxq124`0n;GuyF-0N=Mg|ZRc+%C=uKNs|XbE^v}FFrAddB4;M9?lC+eRwz! z^SLOl=}Wcs10G#YcDhy zRc#s{*GK@1h)HE zu(U#NO+_#~d;+B&3cqztuQ?CW9#v1(35S;-@F&iyjsV3EFT(KAQ2cV5lcy!|+gROU;(=RPsCsYn5!rbt z&hO7mq_QHOdxhaMd3g@}b>OKxq<|%13jA~`_XVjBDY2Q14DS>ahbDW;g)I_PMFFu^ z;qc;jFnSfE?_%`F5Gf)4iYzAL+O)u|CQ9O)SWDJNpRRhJ{63a1Vsrx|V)S?eQbfDe zrK+m-fasDUYeCxdj7MN9M*Lvg_eg`Uh zj^lT*;^|+yQ1%QhAN$ZVLcDoTnY?*kD%i3XsExX5rF@|^Z|{fR-u0$#u-v*@H@#G7 z;7(K9V(0bF^> z;c7LWRRP-9^=RlPHF<2l_!jBkL+7V;@+4T3q_G2&u z@y{_LhaYiOn1qmrH!m;R)E6( zyC|ynY&8J8E+KNDy@tEj!jW<~vKAgGheuYzqif;ua(MhF2j2h6dtX@zA6xKMI(pVR zM#>!{YaRQ_9s5=~Ce}I*mOBpqWcK~}_vTkR9sx&x*Y2gv?E|Y_r;u56-02-!?zug? z+IyPtk~?j^6>rm;x4Z1^USihR{bl&?-GBQLFv7!eWnT4w!__sg#IHJs0T7_n6Y?-% z2*$m1LQ3M%&;$A-$T2}yy~W0fM2ym{EhqxBy{$3oDfk0K`(=zC!-yD_QST7{0;8J{DR%W{F8(E! 
z?7;{XJJn!=76M`>_y7tnk*lW-R7pdA$(ah2HK+9`au3bR`f0e-k zFhTYYR2?Mgr22L*oqgL?b&;$a%&1ilNqVWaj;fC&{ZxAw%A$<`$v$gwGd;HsynpDu zL+>Ab?`V~R>}`1iKUJ369r@JdXWUyxg**)EZF=qSwZl~h=A0B@C0rA3oGbhLs@MSJ zgFEnA*R`%q59HLj;xr?>6+74=z;>b9osnP_hB#owB&1cfC!`t`MI>vbkCYJY(0Kl?x&Wo9K>lt=MA8+J+Eo|d;5}NH)mdUgS0=t7bo2uJNr85; z7xBJIJhsZ%7j3RPP(+j%%Kcxo*{qxvq-dVJBp_+^upt9hUvYiOo>cFo+7r4)mF`BE z)OH0mQdQsd6F~hzyF}i?sAq?`!HE45?Tm!b>rQ~;JPbPseug9BZ2&=GE#dj8T4|d8 z+)dGazotUJq`bePhW?G}|J-7uEuT^l{RXNYqd&J;A?c*(ryyyrI#2%#b^ce>-cRk6 zyY=w;c_{3nEug+7u#3$C3+z4=MIHm}3%Sxl+<}d{-L&YwF)JHh>{I`9 zW`?2!W#RDQT>f+Zf6nEym?>N$7o*7pATe%tnJf6`EWMO+I}sPk7lF!ST@G`0j(t;&&F9h zsI}(XvTdv#(%SQhY$D&0?Z|g#I~gsk?aFs$yYk)HZq|=z$^7o@Zq|-!d-6Tm9%#pu zmKi?V%Of1deUCVM_1V2Vx`dSYJ*2ea#3PSG6&SS*sO`A3LG54QQ;9#&DJkCajOYti zaH43c1zpm{Q=Ao=l!%naCMM8{h;qp=3wg0nG-y}VRa4XoGcz#CTj8s!jwNzlGO*S4 zzWz~3(oMCD6`Onwo3jN4xDxN$0}}m+O{K!x9Wu5_R7%QYg~p1*qXj*u&RDI`ymDhy zl4qSMRvai>gu@yxkwU3>eO4-Bt5r2bOMeG$q5=1m$5x9v zqZbHP?3*wwKfz{+=(3}fY^~JjXn*5tDE@(afJ{e`5t`%Kj4g$GWJbhv%@xelRsNrF zP2B!1jI5*Z?#wOx9+&zPZjn>GHxD61n&ta8$IYNCb=k9sKIw!D_Ahda{NsRYR2*7B zso;2dplB9lF;^TG9AjWdBdCU8l#0ayF|i^PbZr3&;k}8`eEJW64Wi4}z6aaLwF|;! 
z-7r;MHic`KhXw|OOOx*#f}|^gj&Eag;m=d5Ii z3cjVvSd^urG_7i?xnTL=U5^K0FFk_+|smsCSOpnCgOQ4m&^jmFmGUFnlB^c zo=dv8K>FbeV6dMT?zgsa)|Lvt+u~kiN60$Ft!}12D@vi zbgeH_+mosXP$>BVc|+k1_#ToQ)vI;!%h{L@c=ENx$P?hI#>@k!##5N z?(i_H+16nO{lP4L23)6WF?T#X$XnzWJ&!#uFE}ah_@*kP_~y11qpQym2u3&Yf?>68 z>52uC@{$yMh#a9sLrYyHa*ig#1lU_GC%0aF)6iLvVY~?i98k~R^`2wZo@1ZqDyQD5 zoV>EybF~so){;Hz$z# z;G8;7A|Q8*JM;p9UGXoV0T=|1ieivE9u93fyhYAzI5+pe$PqIHEbIqIiRKv2aVd7) zm>V`@?uhG9uOl<=(iV~H@POKIv*6|$*8}dcZKv7b`aI)=-aqJKXv8!jzCxr=g{%&0 zlWEDNq}e%;A(&}}RYL_YCh0OJ)V-2cT2Q}2=$5R6b2rW5p3n`zHB-wZ1GE^V1(mn$ znwfGis}}*v01PSjZM|!%Jk8@HRK_(Z?xGid6dL*_()lU==j<97ZhY^G|8wv9p|jOP zXV>EAo^+*Gy9O$)15XmKub8Weqm}5KuR`az8W zbfWQ8g8DsbmZE5dMG;(8Nu%wkNSC5HBLPuV3NnyAhDj{1rb5*Z{8ogyMSDAhUNkvO z$zDa3O+wX&WN6<9eovmYcwM*r*Vrcr)h|MA-ikTSRmPeoieGSKA5fX=c#F9XvWrfo z#a=q_Z&3W{9{Qd4oq7J5)e0Y*CJ@qNIpbgamq53S7pYhNAF{+APfWVJJylIr1z zucOwv`@x5wefU89OssooKRw9mXQ8%A`|H)v!IwUuvIA%9mtpCRpKBx={|lt;FizpI zH^(!u=D8Qm`T)%N=UN)Pjm8|+&}}~edVYXx0PC_L0E3b5#T$h0P|eu(Q$pWC_uJ5N z=h0kH3E%T&BMk0E!IMVGNf0dk9oVI9LxQb3TY(;f4b5P0Y;t%D&`XCQ0t*uqvpB%P z5VV4%sQQeH901dt9~(R=u-GeXA_qgbt(vng&R}vj;i6#304VgCv@lg*0)?^3on#yp zN&r5DJftClQN*&ETVRsRf&gCp7AB@(&f;d~+gP2MH5-cr2vcC|vv)EDlbspP3WOZ2 z3c&AeRnvrNEKp?PfE_`~K^|m(5BTg2+S~xR2xcF_N;=G@TvhWb%_R!DafT5F(_0Qs z$icyx!y-POAIj;avY<$&B$x|DEGQ)`z%uDhw1yNwZA=WDKeOpHKx|0^WS!3ohh|N) zXq?GpX5ffR(`mVo&rm=~!4S3o%(PaR&J4=~CkBoyr*cEN=>d$7%Ws?l(&>}Z33*7C zurfU`IQYitoRpEkXJ-r|XGW%=p~EI_;f!2Tq|A9p4$^Y5xbN!V!12M9z0(mg<7|;C zhwVr0n0iK_2h1pH*#^J%fPMN_w0ZIttkDR9G}ZbZIYHnZnhQ8K1i1i}!+y&U5I=#4 zvB*7cWB`yr1A?Dx;CXj0LSiH2ZH7t3V>bN2PS*d}>ju|FB)UF^W&*zPa(ajOMe_iL z{vAtV$$*^)q&PALiy4Egx)s#XiA_#w+PO)Yq1moz&~!HyX84=*0v{1*LBj+SLukfG zXs{+UzC%ub@jY>B;>yKwD@@@$7#`&9DUTgV>4)t=K%)rYZ95cDY2}_Bw$VVyBFr#u zKmiCD!ej#<1N04rYteIngG1p;JoOwRY70ZWD_SMky%rpLcKQcP-t}Oz8cePQPd?dw zpw|1^zaL&dGhRJ2{@law2`_m+iQCSc#d(_0j&D(y1~mOMM=_HL6YSqk3FKVka@|=} zE#?|{YLmK?YA95J9C*9PiS&@z@6{#jgv**%9Camb)-)Y8-T>A1G$FKCUhz)fdj#i1 
zfB%FO@4tAO45oqU=4d*%?YHwD6?oxz?02F3kKkEvPP9emjZ)dV?5X`Pg@?p!*eaj?VGGqnp+0_>rJ08$iIasURX-3CZ;3In;`2fUiL&&{F$l0-Bh)y^YS*U}zJr>%_gFD&%Uu2o1 z-}rARz!!A&JedD%e!VMI?Mi)q;Acm_IJ$oLZ1wQjuTNCEQmb9>F8TgG`YhgYe{p$n zJuXz^kXpZ9jlcf*z|YcOq<<41wjD@|x;B@u=*GHUuIlA={o|_s@k@jX+!+6@T|u^d zle@&%;n|3Ln^tf0H)s|6fOmY)@be(*-@h&#uL{R&(YE_nm#?mPe`SJ?=?_$;WjV#nbY|w`TmN7%U{kLO&%!8>lOLCaL(*O}<&fFu%g#q5<$5VdG$5eq# z0yDdG@*%X~Np4d?#L_hR5fDL9uFx@`R4i%>n;gvwM1Dx+(Ais+hx`~wjqJHOLrXJh ziHQfgu-_2#EpaP41*|gnigkhL6dVkQK2D}J&&vq(>@Y{ylX}hagTnz-Z0B=S9oY7)cH52Vdb;So&f9*(OEYXjwSY`PVMSQR zZNJbH`g(2kGYW&Qjdw9S5jWB{4zTlYmUSQe3zM5c9m_whp;w%po`$!5(Z+%7^}{;y zs+{2V2zB=?^(PdKS*Kr>w`8iCk+uORYcnv!9vZCS?L6MTh_&DZ2C&z4wy$a^MM}cp3I^{0o@)F8352 ydA6%>B~`aU38&Bn%LCG2~}q8^~s}lq^XhKO10S7b!%w#=eupiT^q? z2I7t@Z$$;B4+LqeWYwQ_+m~!vsXSHmsXQX}#e-LhHKh{Lu2k{nUD&GHKD6iFnQ=@A z%Su(R>^tY)d(OG%-nr*|=gwb!J`V*Y&ip*)-cC`UQ<6rfu-F$L$Gbnjw8m+>vljI}@&H7fp$-g+@!&#uGV2+d|Vu+oF(D^dJ}f*%yqv zVax-6PINASrJD=#ZrwlIh+a~+87d5;7aFv>Fa8qbIYTB9; z1w|S)x57;5MY^15+f~Q3f5{fw@fceaqIK6~Oc5m^J_R&)SrXD=MT#k@WXPu5l(_CR zTDn_~3UM)6llo>)gU64s7;8iNgH-F*Ueb}b_{it+N?T_RaooIZF`AC;5muyR!?pR?8y#n z0ql~)vLx41r;$Vrb8^cDyV^olx{cJ^ElX%>$sTeGr(m%j9?H9-AW1@UPLz2?Ov)*V zM}i{ovr;O-!-4Hd2n*1sqp!Q2Yw4s2B>sbI)-qlH3(RV&fq=K8GJk-aw~#FLa{|+4sI@< ziU@HmCf6F6U&5%XdySV)j-ITuQW(W9g3(jF`9w>>x&CZq9phIC<8A)mKBs3@ck&G#06On zXW*d7z3C;r*+93j;kl-ZOWj}*MtJ!&yx~0c^fk&KxS1#aI%n*&uR6=+#Kpfz^o$106GEUKw}6=_X- z@)MOB^~j4yysZxsjDgx?Nn!M;0PTJT7I@~M+* zZU5Nwk*7Fx=fK^pc6hAZGEri-J#1=TIdJR1>de~s-8QZJM7e1+KLJ{T9jjTbDFmhl z+g2}X!ToC|OGn1E?lCPmo}c`Jab7$1!Kn({v^=A+yGvdD8rxrCgI_!B4(AitS2$kA zh8tl*cS@p?k&;mpxiJTY4?9797F_L``l`e3h_+tfq(|6_~N|7 zaHc_i3}@cfUVvc`;mwrB+=I(8Wu71CiNq9MjEmS<+0gnfal^WVrnbkZPTz)gu<&QI zBC@v<#=Pvo(JN^&ssM{Q2QSC6BFU;m{tzEoQUIw3crZc^Wp_Peb0Q-vkXvBrR$_CR zR7QTB2bIz`s6saBkVQWeFEF?U;s0a?s$VcKyC)Qwg3a>2x~vjBW^*an3$(>KYlAaL zPk|9Q=>~iSR17(E=AF^=!vu;6DC=BWKuAPk0>4%R1Fej$qUB zU<89}*+PiZz2++k<5!~lVlr52Z8c;cI0+#OlwQY1x|8DNu$V|IOA-!@1RZteQcS$8 
zJK$hO=XFO0lHGxjPwK_v9k}YmlHMe_0FnjCxCI+xEKk<$aF#cc8J(GfOX9{BO8r>L z5vXKrcr7(J4#n8I))#J=bOI#a$D*AOEB>9Qsyiuf%R1Mkab0EZm9^auxPw*7=5SWn z{wH>LmZ0qP7 zxpFX+pL`f>HQQ&)!B_K>m8Q;OP;2VKINn(d5JZ3E*}m@KH4k4rxc1}H?l;Sxq5KJ$ zvt?(AX|A$VXP3sb<;Ryj(AU1NnAO@2-MM&gTY*InS0_Ge z>?$++ib!J)R@mlswpC+Wf3@%9gC89%v#(TGFZppd*#i6Fww3lf}N8TWLN25 z+J0RFS&|fNam^-4fenBjr{|=owz>p>sq44bfxQDDo`8MMsX7d>&$(18#{y`=hXs5I zp0$xg#x_pD-0W6q0M>z}Z=1==0``+l1v!xmvB+)lc=-`mI_IbghdcJ+%Lq6g6&p|ma@Nt~w^Tt5{l;9%}2p0hcc|a?OGUlaR=7l8U zg>*W;1Y-nt@HM*@G%=|0X+eQ2c+$M8pA*H6oBID9l8x(=v5ZZS47Ym{V)^~2y7>zm z_fqrF5Au6RRNcexKeKcm$5*_N`4A7^B=#6V4~EW@Z*=@3tWS)GzB7Dp%>DY|1D2P9 z9fuANgu3}dM>hH{bbl{o*k(WhcEoJRw2`+U+A_7ywr+kdrSKalZCsUccpxF;G4|WX z|A8W5xR!7h8FG`Dt-J8O5z-jl^S%(zh@+B}lB7Y*h2w}c1QlFyMJbVi_lk2=-L1e4 zT27~AF~mt@m}wkWH3*WvkDJGFg~5ctitaSvS$CSin}0=C9WSN8k%ojEL>dOS(Mi%@9r-9hSq%}ns4OZ+57bUL-&QU?+i!> zcC2*X>MYud!$oQBP&v@E9yqE6j^3s3LgTIVz^E1&y%)K^=YFIdIGZ1T#BE#WIyA1M z%MyGYm$(d2A|8sRJPQW_jl!Y`M^Y~4o=aW7obB|$PW2R(rA>Afa!lRG&1CPI|x z_Jx?DJI|93a=IN{U3ZgjH{zAFL~;&tMvRkyUxGA>E1ca}xfFZ=$i&5C(hs2zLJKAT z4XUc0rs=PDQgr+0l=tt{zJE~rVTWw&wYCr1s#G2?p4cgym&{WA34N6I zuZ{k3@{`Fb1?1iL{xb9DnJ1Y1N&yphg6@;_Y5Gh1KANvI?O5r))m(=yR|*bHq6;^a;#wiMWQ# z`pSmgeQx$`jd+H=eO~6bMSR2MedWUyeHHB69;qCz>Z=;A?yDZI>8oL3j!5lrU0)sZ zJ0tbO4Sfy6jeU*mTZl9bH}^F&zboP&Zs}_oZtZJj-(`_C!)yE2@|=Ma1+ndIUhQsQ zyPZO~pHkEa5dMZ%cUXD_q}JjQQ5L^NoE0NK(-ARXP>g#=CKTHj#$tn` zBcTW)?OzN@pX&`3-T zL@v(&B?}IRq{!e%aCnsJv1v&apXNI%#?Fq0pA1FCfKhQA85|Kq5`8Hi)nABLJW(+g zydVyqIU5UxVlgR-TFP0agOed@x|%kMihRYv`F3hyVxdSB;FZL&NT^>7j_Ex%DAl2{ zvB*SFilO0Z`_Tt9OsUnn6FMgbFN{j(qGObIM7)5aglK;#5|V>Wje>hx!gE#*LeH^ylY}y{!3-4mNn0&_0t2FqVyh?-h@&dMWNn&DYkqLXHlFFae`Pb+QbTVFQ9$gfGO-MN>dpw zLkl|ZZ^2Oa{b6CZXhTnG!X8$eCG17-YA4JApJK=0s`ETqslNUFkx(@19~hPVi0f9I z`IU2eFcwuDXQa{bv0!v?QdBIG7#o*Hdi(S6h1F|@OW#WX)0m4mALoL4gW?8!|T^W;oY@+Nxp}t-K+d z<}19>YCV2}D@v>3szOrqsyJ=H+WPFV zL@C-++?5`Hh3+xH%V3ZJ#^9t+UwqwK*fvxg{dG9gT+YEYteY`?XsgNAt(`Hw>b~Q2 zXPou2vp#YDmeZd+nQ7Z9w{5-Qz16nswyz;);Jw}1?oFBQ$K>wE7P}t5@p8_{d!OQ$ 
zjK=b^9A|WumRs$+UpqJ(}5jO5S@a|Gk7} zaQ*sv#fQGS_uvDx$ttS~YY!`ZD380T6zxHb@q6YIT)@;jS*7>BqiBmgI_c{47k1gB z3UH61Ff##OT8F}-)GEJ(wad|X0v1cb72~OfJxz(*;Y@RP%GPRQwzziN!tl-gKmEc7 zUs&9Ca`EJ;Md3?1qrvWw)+6b%F5_F>(6o#%b3lcLUd4#1yv|+c6=Q!Sx~wf&Bf$}1 zz~Z^p!j7VZC=DUn;;y*ADjF{aJDy}!#o($>sgW#-rIxLD&KORhdA(TKW~|2kvx@EX zKx8x&+qg|Joeg2BZ$yUThSyLd^^Fz`^~Zu?hL0v&^=ax}EzL11tP;*N_pzt)a@l;@ zo94G1*Bp!P4U4u7_ji;U%Q{*#rgy)HK3R<_MY^$vp==nsvBP7g)~V-u4N%UiWP z_7qCd2ikjo{j_*k{RGu6b7!~{oV1l{E?IpARo~BOmf*l(M5O7BPBtxXz{>dNs0oCa zX|BC9cGd{Rq9*Gv-WfCotT8Xa1B}5974JBN(G%yx{25-OTBm4tl}=?SN(Jm6=YVsJ zT0ZPMW3OT&h{+%&Xb%^4Za^9x4xSSyq5+e%3AIU&lcNLFC^Zq#Fu&iNEVU~vhrfQvBwJv2@s9Lb4t9H&fvz0ZO%67T3J^Av2kovWB4xX$l%)`pKe#*WR&NO$=!UZ%uAVmv<}*9k*T83G;U^e&|AAbJCO)RUDBn-@Yhp&$`Rm;tkXhq@~3l z3@VNwDD&`mg#1E~ni9#!ScAdvXg`9DK(vZYCuAK33RViBF^Vx7lcX;4IT^*Jc0}?Z zMA}TjrpO?Y5JFVU3>Hb$35igT^cYJJ9_)`vvD1G7 z>S=t*%@pO+qkKhCCsS*#Jy$x~Xzol_B>PjFlV`6r0o9v37Y=_)uY5u9FAF}7%t5rXgQlfj6|ueLCi(T zaFb{Wo5L0`xx{E#!C-I@TiD?e)8Tl9)7cGY*ttp>l;0EClq$_q->>2<~dV0b^u#WNpV}=Ju&LE%Y z?;jnT2>R7U+RaiZrf?`0Qq07?iHe|ZM-?ne%^0HIep;H^qa%0XPdn0UMEDv1{tJzLtq=-5KI+-bt*TQ|Xr^kjT(x=O{H>}T^WGbw8Ov>VO;+$+Iy8GIBQ(fDL*hiTEw$}h z&%(AFUy?WPOAGrc`kC2hGD4#)G$#6!TT|_6VIxA-kR_|hHr-)( zmZ8^@qVx@l&W~Q?t0zAzU$35)@F7cmUkUayN@i| zj;IS?Cnk`QTFpkngV4{j@TZ7DDls0V7X6A%Vkja!c#T{Mn`k3V8RJH2ZHy=g_6!M{ zukZ%JXlC=lhnYArZkWV;BGgEs1G-=U1kl5kqvcnHlLoCt*d~@0xPI=exm{j!dDr}| z#CY=9m5Zs$)cNZ*3kQL%ZydPU@}s90n+~MQ4=xG^KVFusU2cA4q3s6$UdLk7?sWN{ zMPUzHK#Gg72LzcaZ@Y}-*LRk>o{9N6CF!e?*GXubB9Ig{FFz3#Q~!84R9HZaeaWvahM861 zn_oszRIXUht0S)ndFcYf(<@fe1*9xqNvy1IAuvj-834oInH6d?LX#{sr3JqZH#0&& z76L_|9kS2?VA|r4ZH+V5xdw<@?5<1pS^FjTtUF=-(B{wDgiDTD2YqL3jYyV=e3WO7 z11{+Zs*{eA(@V}5$YG?fMP1`6-AaTr!xvHnKE*aBjbdBHCe)2XomollGFVE^F#e)L za9-g)F&NCYoXup3ThLA>^_@lXJy%6v1vQ2&oRSY1vD+)MVTWYEq9>2})HA93^S7I#>RVP$}{=z^KLxXT=de7;rxkk0jj;_4kWWXgs?8 zUl#o%BBY$Lvk2DcSCT^${mGJR1nJmWXgXMNBu4G@M@Rh^K%$0F@(6S<{o+{6k2Wxs z2cukxiX9jq=_hT6zk`W{y7R!3EqS$Vp^>nkX)snJj#a{!?YsK`8ljOTU?=W(c@PWw 
zY|jFue3MWQlVQAL&;SmTMH~cPC;C-!N#gnOK>+*U87UNoBE}3^lqf0SCZQt<1~tJ1 zOKdrJ0m9g*Vh;ux=NJrXyLwvX@dyh{R$xB#nF8U<%1I(7S_9{=i5RGuy)gT=L^M@# zHz2x8Nq#}))4v!F@7i9HCIU4j}zsJ1`#VmYob zX)29oR%x_pBw_Pc=wF^tlYSjhDxsF26Pme6@3J|eC4HUJ`QczfK6cl9xAIE$m4;;O zt;uVX3tKXq_sg62r`-n@Z3o!I1cX%qr8J2`B%1jmQ2AB*WNXHiUn~7$BoYi>=hX(% z@I0ZenJG$2&K3McX#xB-K}r)KrNEo_^NFoj_gvZYOK|7=0Rv4%u#@~6>?A+IPP`fH z#2c8vQU-O9`5+|m@ZMuyb)8Xt%V48M$t~?c)9#0iP@)+D!OVbkmXa8kq$ew$uvxT+ zEuurT6Xh4S;tLQkY)1)IE0W@KKp-_;pYXYAaE1j{J=f=g`rZ4v@|K|;4fnSLD!bC* z!lV_waF&h*g!e#+>7Eb*KIv(cI{BO`bjvgDngXfY!GPypK#^U{N8DI31Ha3E5-=C5 zbvYzdx%)g6Bx-gU6eNn*h`%M0RIX(4HCVtiB1YRLR7NpBqcZF@sJK{IsI%>VK*@>O z1yfkerZYLAQZqO*HV!F12~Svh!J zO={bRwoSL8(iVIy>4 z8WAJWZniwxYV8f!3LLdk28B6F(N<<5nM6QFDPshwZn~sc!s5$={bF9;scs~;Gimj+ z;#bHyl}2!@#Oi1@1VaO~XY)MGWYh&65uJ9L0TN;Wk zljIZrWoSMH&oa5F0aIAKBoR*!LQSf#*e#gRl%}~$;xWFotgqA}i5uVH-_BzjFk1%9 zwqj<0SDo4kG)A%O6Dqain?W}f9{vEISRWM>s!ypzT~^o3^5mKTkHOP7DG}vg&gUp* z##=H@XBo(2!Aq3>5F9`{f%B}(`pUBqwC$PSldWyKT7RYfc2#p0kg2*uuI|WGZGL^qzDN!k z5iq1CeUE(KCx`JE#Imduvph{<va1pxZ4Y0p`i*T%K9B~M(l=j`m$L3&3g^9o#5ZO+BKWn6Uwf!rlG`+SMFL*%mc z{T=V^c>nSD9!KBJoi}4k^vX4HZGlhhZm2SgYsEJz1tQopyC-L*FB^yW$ZRBWQZ8%B zQ3il=hx<~)Y{NY#zEq$~9qh-S{&eSkOK)J=FBj#AMiSEum~^qtKZI$q=h+p@5Us-% z&wzvM4Evm)i`@X}qbUtPUp~9V=F>5?->!s?8{ytM#yhcc z#WHWmQSQ^y!fs=lY6y@+%$)`p$uGT4&acDK0ApS@La7;8)NMg@j?V&AuM4bfu34*W zt4Wl9XlpFN|2c^&6)%!RII1|$4u-?xh-#|Z%*2AjVt9~f&-)ofM(*|(Qy(@oiD;H3 zQb>FrYHEcV%L?R%Vpcw-5F(UY+c7@1Btv`HJUT0AduUb=A{6M4K`nxzF@p`vqvo*X z9jnIdOxc05b-Jw3_AuB;Ee&Y8hh_c>HCk2u^=B}FdYizsmK=$9>>B+D=CyQ79mT9$ zj3_wvfUUF?dQiIb2hNS@;q!dP+?0UmWUKaoS#hq?S7+Q==+nc`3hGkl!}QSe;Vj+z zqWK`Eywu7OmNDgp`4EcdgE7@Bl&KO~+eq_p4ocsK3zBW)<{%^QRo>b`-zqz)*e+~R zqmXJ-J}}@^2{uNY6EZ1q)Y4`8yh0A~(^7&Q#yOiOMQJp;j7$FK6iMWb^e#EyBIk8- z2Fdv@Iq#A4LvsEFIlm8Q6+W2JRFm}}`3oR#Ip<4EZ}f+Z^93)sDoYA7WfeZSCb1<` z-znF3X6m=g_1kX=+jTw|WL1;S2ZNXOE>>)2%xr76sug@Nvy=u$?6y@pX6(hRXnkt% zL)(szD_e9vxR8C}W!dv+)&oLSWr*!&FvJ2^-cVqOz4j`{jH8$#Zb(gjXxjybI8nFQ 
zwjE`!-l5J^EYtIZ-19`H=dj#!_?B?Es6!Qvg$^a2UtGWE=8gjGyztLnLZm z@f1l1%&=IxifDCHvh7Kml5Nl4fTIX7B$uZm0LAvNx|6Tb=NoWv&Y~bUV-SUH_vIx= zNh+I{d#TEm?;~Jx{j%NuK<>VmHb23oQrx}2J1cne_DcL+AF z_PRt{QrlU?<>$9;6O{ShOJA7%LgLibuUz>`>UrdAhQ_{r#y+=0%cm`8%9r0IBpt|9 zuam3SWvaX7>h6zhJ#39Hlm4T%D)~qR1r7=nnUrhN{{4t4mi8-kWUHG$?*`l7VhzOP zx@E()dcv1ztZ2lUiqMm{-h2}Q(5gZJ%0w(@=08I^`Z5jw==!C^a9 zk_l54#Gg|f+xgEcwT45c$Vrz)N4 zklU!Fg(#J*WnqS_SRk{EQ>sCnA{kjh_v z$UXQnJIbdxRr}5$PUoFgJo%Q9j4Al?pm;&4#&N_!7;K`i!LhRwQRb(HS85RugGCUE zCXA27`gQR+#*_9U?aZYS6@GbPQx2c{gmojWuk*3|{)Gt)Ert~jqjD^9Q`{W4g4%J$ zg}5tT2Fjwu{)WCh4{_*}=2YcW)l_vs)qs*pTBMhTQw7S1%)r9BQg>N6PTFbE2c?uy zQ6RJg%mFP`8LvVgN>FvB)Qm%Tl(mT2Sz!#4^=^w$V zFa4YflW)FMyis2APl&PZfS|ZE{wD^OCo262MM^&*=f~vyOLG1dIsclR56Jl=a{diD ze+)+v21j6D4vPdj1_R0pR2Ex`R!~;t378d^x?w3na4;fweiW%}{J= z55iG}4>GP**M6+1tO5`a}m2LN>xsCpLBCV^=)6D4>i3u~~f0PRIZwyr5tw?VGk zkg3}y*KNzx?Ud_wW&;PZ%^T4a2rKV3dq@!J$qOO_?t{I(jF|83Rjkp`f!J{9;u-W2 zPww&FgE)dX7#T;J-az$AX(ba>5=Xwew34)MMS@B@eUkEsi7X++9)*UApptX~j$&qq z!GI9Pu`VT=2I)7+`7Jo?R9N09;t&OrgG&Z*1O-Aq7)9h@J_eCyj>Bu=E5D>y&h{j4 z-my%^NC6NsLTG7aUK@Gs%=FFJ3r{QLsG z@7Nm>G1&{O$>|7xh#dncdl)h|SO(UrVa(4Oxt5KeFk`Kbzx}1xAWn6u8&P_fKWVMO4^UvhG z^yTB~o06t0>vQGwRlzm-bCu+);&225CU?m#a>cq_H3gyB>r?x$HRo#StB$L#%hi*w zf%CWJ8p+p$j=j}ztzp3=w`|EZQ;;7gIdd)KYb6=+8uG2BR<)6@9s0bhomX(=M6T<~ zt)rlhWn}~?Xgw;p8ov@xh2`c=IU0^!XI|)=Fek*Tkt>nZfp?EwKeE`gL$28=+jst| z3yDqloiLvt*ijBwt{a6_@np!4WBl3h+-_jevhs#tZC6)df1*^)E$lN*V1#Z#l zaFmpP!n#0i3MWhB{3LeClqqh=%aU4=3yXH;lQ)en9mfo$nS98=Bp8I5S1n`8Y$ro1 z>fefIm0YSZHe}%%mIYs6*RqyW`B;FfNj62}hetK%p0XC@D#eqO&Sm?IT-G67rlrwIT)56{T=a0`ikA zoz?o#z04mvc+l9g`CVm~u-0iS{vrBnjJ9G->hB-XBKQDfmd{Ax0444iz{%-=5E?|c zkE!<%i2jgYg>(LKoa#K|hk(6{8Grb3z9%-=&y?*Hp`Pz4XO|&*1E+;E8H?v7rL5d@+c2FhdFT0vPb$DmQCmuW0c6YO>RDY6v;kh$`C# zFbSP`9g!cFtHKo_UD1~E$`$LUkK#;A)?1zNw#eR=WJ}t+cE)n2%sp2#dv5wEqI9Yf zzFW?Ey6wSzNthMpj^U_6JJAiv^H+LgTOieU%eMWFtNhXnvo9oCW`nRIGP`pQ-fX+$ zt-jne-;}hxC0r9y$F8{->TVoQd!Lwo<~A+>z}cI``c%up$qzldZft~9bo$V0!8m|$ 
zX#P-Q@apiD;nzn~6Yox4pL+Xi>59jvkAl=r?30D&|K3(us5Bhv@6?)r>-jqF~NJowg;Ye&-VjT!eA*}diWnORBN zy?6TQth+knZk64wY4_Ucr|($Za~r=tg_FQ`;Z8-vYscQ;lN*y;lA+|5M4w#Ik=iR) zbS-rJc;k;lAC%wRc(eS*PI=RTpBd#%hvo7kGq!AbeWrYkT)qa@QJ0R)9!Yp1s`}E^ z7p}aJYPk|z;N`}R3)|k`^WL7D{Ckf>cEyCc7>$NZdAnTRj>FtDw!g%jYsDn@rpwz= zET+UY?@ENy9{-H_jo|6eZA2Shq4=CElc=Ng_emd8QJ?8xy9OrXZAs5au)L$y#!K?B`y50%gH66{l&8Am)g; z!zdFG5w=W%jQxibHF|4NCJ~Tf8;BL=#vOQ@h-BG=HoW?z35FLK!O}f3h6}1u9;5Wx zsq95_ri-4ax9Wp&)a_)6v|Xh{a~$)kjnfbb##cN#vFOD9@Pc4$UXcKwMZ^k7UXUS< z$sg<|v`e=keO@Xg`VS;Sm}XI>Q}RU5g~8a_Vw#7JyXQ$AR6ju?~>J9i+2}rP(#mQ@qCncDe98; zLzVE{U@-2Hf=`MSj3?=&VW->SP- zm)iUNM%mjv{Y-XEfX>np7CSJrb8hHPMNx`d72UJv7YO%gs64Bpovh6Z3|=;4|Jdod zw0m}U)?PMePwZN>wX63|+-(18*9TqD3JR9#{Sa-~JcO+B#}emYmsw$K9(%vU=?%Z27cmAK&%$dV3JPJ!jc)>?CBs4W*6;hk?Wxzinh?TUF~8@@oKz=U#0waj$;SRNNn3a)Q#=p>#`Wq#ZZx z=Pp_M4QD_kyoeE+vH}g+;uZ$@QOCUDRl$U~DQi(}0o3*#)TSfwAtEYPJOy+$Rb;(W zD&-k2Y=1-7uK*0&hMX%3!)Mr`wIIxex!*9v?W!eMDSHeZ9NP-okCNIBn^xwO1M^+1 z@6r2<@p}a0!UL$6AZdy;L`69+S&Bx~g7so8F{1iSjhMepCR&d|QE4mZj|T@_qW-WG z^S7)vT4`*(SZIyt^H*FO<`0%a&`b>&Phf6&Hoq4%pQYFr5PI+WQ4Rz3VAqJ+DqID? ze0NEW#LQamLZwYs#Vk=+%XJr9C7*{7$zAHjU0OOQZjPc_$)C@~5DW#}C9#>rqrkf) zOu9=Fle@KW0(R-I@lq_%C5YhhEI7qMZ^OtKOpqlhG@;{@m&J&Bw8X(KuL2`CHi4UD zOyKZ`rN5&>%;epOd)`JPz$B5;{&TqeZS;b|4=MZ@4qziTh%$-sgQhhihF%sGCks`v zr{Z7;l}?EaE8c=1MtCDc;`>eJ7L|xBi(pHvimxpUD>4 zvu4R|t|^-{Lc$H~L{Rm3a?O=zQ$}dZLkn%!2j#}?A9;3s;zBAw?W%@*9?n;t#|5}* z1Sd1VvR9Td_Rz0gCZ#|Z7ISzM$PqrjO*VS7Gc7VW<^P5#dPod8Jb5#3`@{u{;kQfsEEML!m2iV;E#~95&-R%q4<*)sn*&&32O!<|Rvv`g zVa_l|$#ggrSDX&`E#KC!uGUs6`5(EhRGvOhrS31%u@+&#()}4RVqAX8|itd^2@Zj`;7ASbXxhVm;TTr088A`BS2Oaw0>b*L99CS4d^!@vz1b6 zMd`Tt0c%MsnaV{Oh0&xuV2exZ9r7Fo|aV>;_x z5cGh)LM~_lpxr4R8R#vb^V4-1wz$A(Qb4vzv|`HnY6Ga4(wZ59G`a@vfjfGP!5{Y!gX+&)JB3e=vMNqRCeJcGg z^0ksOnd>2!gnZ~Bo&JJJX4(iZ}Rga3V%+qtI7mOL=;mD7tSg?>}f85 z?!@iz{CP%~u0}Vy*%40Yi}Lr+OI_&Z28>BGf_;Ho0}6uW=>vCcWtW_@&bhtcD4X7! 
zUG@RG@zmwM`MxCoofl-3>JM0I9uu1l_2*IWps-QKjVNzGL@TQ+w-(ULfpwlyQf z>Y3a7^t zZX_+V&G2_t7LD`|TXZPtof}RIYcoO*Gzqu@uegfSZ?1XsT)K7BV%6rfu;quxe$4-5 z_u|%rvhdU_f2YuqH`jdFz9ZebbFpeyTG;*Ly+1zx;gcs9_nef4Qz*@5r|X{PwkBE= zp}Fp~ttDgYlx>}9TQ?1p7TJ=BT?r&lLTF;TzH=eG=-ZvP?I|SaN!wZrUt2M%l>At# z;@Y!|J`~!6NQZNJU)JHev|)Aw4ey(6Z*91?;rm^SRga_{8-LjH!(%@&FKmzHrzji?O($!eA zdzny28$A(xgf}g$$q3!D(9MSFbFo_tzi}7?SyTVo-q+59=3bjT zmRy^72Dh{EzuofN$L9Ds)7-{6)6Du=*Z=E8`9)CZM!PJuzd26V)TD(?8R1b`cofPb ztWl?+_DMAm#_rP}ozVXxLW}h(rOZn}#+&q07>iRl`=U+ z>D1B{4~#MMEW;FEATgKe26h4ILLdh1029BcE=UiEwlqWtEdlyoN+9##Hfd-AViI-_ zA!!^KlILID10>HwR&KiwHMF1$4B+4Wd-4&~hxs~_F*Eo)iU2wh9oxLA0H#&=Y?R)j zNQ-)ZgbHz~`{2-3{=6!GSbhg!H`KN;Cf>XWUTG7$*GJ&?caR!aOu2lQo}YallGsc2 zv-Ob9HYFk^ig$L=BE;EyE8AW) zaU-~x2mR6&Y2d1G6R(z=-FyMHDCVqE8@afpq#E#nBde_{?G2fNt+a~sbI(1y1~tq$ z!EoX796M|ZTfsxp!dUSXNmHP5{frb=CSOo zObb>6^Y$jV4T_Q!GN~|ac?h$h2}ema0^pZ_7z~t>R4q9J1n_296uk6mcDlWY^`(k3F z>}pJ0xH@%ZDs@b5?g4Sqym{fv@4xuoi#IpQTMytiR!9ObZ7VVDcN~QttFvrTSYgz0OXyIAn)t1jHK_OFHJcVU@4I>G zXNS{ko+bTV@7xi(7~b}AbysR6)BU*I{rDf7f9m|eneIN4u09GqRF+N^^~lvdS({s> zuIY&Px~pARx>CGcw?1`Ru6tym74_Fxko*(C@!D+thD`ltxqkD)-gNyo)SIndld0Yy zS8qt|O;>l%2zT%!F)u7S1B<4>{e%b+7K8zo*24kBhpRr*uENO`Kr3DG9pWbN4_An2 zLjDxeOSIwfb1ZM*6~*LpDr%IjQ5-q5v@B?U;buRpXVF@7yP_*KumBz0j#)c!QvQnZ zz@v&`G^)5EVW-m&uy>1ghfeo1YjqeHPmDoOu3qv_avO!%tg#6Rzh;-e70RdHW;uf2 z*3o{6i=A*)ob(Hn5Q8ztq#M{!&l3ehRBQe}y~YMTv9_%GQUVe&L56F>MTQgHX_WKIT=7&t( zeH!%_$bBPQ;OyrDK%qI5}*&Hjod9IDgjpY4RN+hqPeol`nrzzHxF$!Os*g zQVT6K{H7r%`N=83;v^-xM$RMT&|Xq$L)#wpr!gLr4kHKtgZYMwA7kSAB?rel{+wI; z7o7hWobS&#;m^5_|H7^LmBGjxerelr+Gd0NWr`QFYOg` z&!+87(-s)T@g^egDrYt^#rRi}C(_Qqv`vdBtDZT2>ABhG7OU5#D$}l>X$L?OZ@MFR zrtSAE+xdz`PwPDn?_b6E0N?ouc6H~yXL%ogke}OmkE8d!^(T4Wu~^l9k0Z~$5yNu^ ze3!S~uj5U%O6yZ-tfx-*?Q8nXOEbUwO6RO2eYFO*XeLnGutV z@m+kSdc9_jgLlFFzVM#Fyf-U<+VDZc5`}*H5?{w}`(zile#_epGS@zBp8o1$Mf<0g aPTuqBS9m+``?SW-uU|UI8~L4VHvTVL{7Rw# literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/__pycache__/schema.cpython-312.pyc 
b/model_executor/layers/quantization/__pycache__/schema.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6fe6b7fd5f75f242b73a98998eb08dc4645d6a9f GIT binary patch literal 4340 zcma)9O>7&-6`tkpk`(o8{aTW3jr=FkmBd*7lV#U+BRO$oS$1p31%xz<6?Z7Ey?1K(PkzKZlj9y*TRMXMp)Q%@vN(Gj*MHf^{PiJj02nrhYriC}_4bEUqVXrVXK5o7IS+s>P_0RyC$k-Q-q*5hIJ4~NXI(9@gHL7OkQw-GOhWuIGNm5nQbTnNxKrV{p z8=)p+s0uflwHc~$lByHgFn5Gr2RRe%pwlQF%~14v~BBiaExh_{_x5~D#{(|Z#%Ajuc|h?j4pN~6KOM{&rj-c$8EX+{2F)Q8{kW~nfqB;@J7>C+{2+q zH^BTS`2T<~GAjU}18)*0GD#E8!YGqjkpDu7KsWsND9i?MHb7Wd6CX(r1^5GxR<8x$ zlOB41$ZXv==0hx^$r?#WTDXR<{#*F!YWNx!Bhd!8_0o+AJc}P7>aX!|aAnO|%rLjf zcbDS1%azpx1i)v4E1?_Qpv9c5WyXDLxtS7B49@i#AP-2M+~mVsi6-0{B*gBRB3fW) zjz2)Ce^88_r)6kE?RcbAz>DM?+^n5lTM%I)*8z=ZL}fRG1%lmmT+v{rg~)Sd-C-%) zl|k@4i-xUTFlcok=|R$kWH%7*2-NOIxCRA`vRy&9!!Swx$7)5+CNFBhama7y`d=7W zRh9#=%#Hy5A+%aM9}ImsbT9Px`qpyi-hy809K82>nGO^O&y?udV#D4iZJqa>wXP$j zt|O~$-}&^;r=AwsL*##KqEyi>pW9D84dPA;KkLQYcSe`r`CozYkHl z>xuwj+1ONsHDNM1v^Fqa8W>+2IA0n#|JA^&rIrKtLxr9vT|I^I-~Oz4YFYOyC zc8`?WW2;i^pK^O);7fV`vw$qOuZtvjpn_jdn@F&OPgDYJa{KbR3V~PgC8 zfDH?35^otp@Fj1whrvgoDz6r1by^*7{~|SQU1U*=*1IjdpSy~;7O$zZpaNoB)1I-? 
z(DXyH=BH+7)Ht(e+=$n>@V+m6YeC`txm6EqtOg`JBjc%WNeOsdFP#!@BImXTZhPdR zpBA925Pv?97k93fynEOU2_V2bf){XMYS`KU5_zjEhzdGzmK z7`Zi91GrG&^$SPfUZUxV1YBs~vc=s*xd>P2_fv!`LQlwbM5XO10yW#%7hxLk>QjrC z12#RL>)(D_e&e^vas|bxK($qP&4ld6wPebBAy8r9_hd+sZ94q#ZuJmKoIN?`~|YF)qt%8A`owHR|vd{V-T0{_Tvq{JOwCrCK`4_ilU|CkXXVQ|AKEJ=D6)% zLgD#^Tg&ainAo(@4WEVmLHaspjl%@A1H>Kz0_ji?o(Ag$>AyWhm=;#arRTv>p}o-k UoWSe(O`%n2zrXk&0xvG@|AN+0>Hq)$ literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/__pycache__/torchao.cpython-312.pyc b/model_executor/layers/quantization/__pycache__/torchao.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..275ccf081c7d91269848d2be7755dc49fa8929da GIT binary patch literal 15833 zcmb_@YfxO*wdOwEr{4|TKtuD^5nzc1)PUZSB{?90B}+n1L>!yMX|?*ChDQCs`!tdu zjU&5!CupKX=spHC@h#)2TXRv_GsZPDW!;*We3PpDm_OYH1$-`^tEu?@$iKEw#wpK_ znYH$LbR)Hto2oe!`|Pvdd+oi}`qtXJ|3_JwlfV<_e;WJwUP6A0ANpg}BAfq~BV?Y4 zMC9URM16B39Q!tnnBZ-Sn-ly9&taZ9Zb?{2tO?tQjeT=*`-p>;b&fdM+m&!fOdLs+ zjg+$zeB6`pj(Azz60b;9j#RR^HC~nQjrdsH7Ozg!jMOCjBmP9~NG(gV$LkXHBlRrq zh&Ln}M;a4NBTekv8E;N(8`;JY6Onw9@Bz#KpONjN>m4%EBDx`L70V!O6U(LNM31y1 z%8TB2_>rAr1$BXKt`Krm0z z+VT>Tflc~>R#JAQwFSU*ec5DFhMXs93shv!xW3Kx1bAmi#%Sd{xnmkq?4gh>#iOB6 zfLCl|VHt;{*dyV1JbW=ODVFrqgd{6Ybqd4q0h8sUlgWr;PF)0@SwYaEEk>hv5zdQxO zHIlcI#^x+vyYBJbeDlVev*ry~<=no7*88sJjjH;EeT%ItRXb)+Y*aNYocPOQ>s>ur zZ`DSR2PCi^kb%fNP-?t*e?S^OAE++^yC6Ww4`9pBm|$g#Sg^3-l)F*2buSd} z0AN;)g>~7+ksF;nnId<%Az(mDEG;EuZD~MGu%*H69^k0}FS;E9#evD8B#1WJ3g7Zp z>Q{%TBm)acTUP*i7wxu6t%3b9*;DHFJ{-vHPC(4R_Vt zz`~)`9f$6=-LtIh7`*R(@hb~yJjm#$cr^YDX_8j4QYk%2lciZJ58V~ZG1%f7iRl>! 
zDG+fSXYtsRHY8XnMi@61pir&Qy57sPdnicHOFsbea#JP;nKDnATS?kru(Y;hgp9ij zJ?XKisf|!GM_z+^SIAZKYvc+S;D;2OMy27wfQ{lvDNaT~jslhAh$SXcG!4R0v4e06 zi{W%wah+m`1C*wyQhq9VIUJ9PujuNGc0zp_X{WJS?PKcz+9VX0!g87d-XY(@(n$zV zKziruR&5P=lhsiN%Y1X@`pmnptu=M$n!4AT4&<5++;yxp^{;r2|LV%GXFi=-_io!X zn@)1&4~fZJ4!m01n72XfD>rdh{Na&>R5ZT(y|t!2xu!i?&)$cGbJTraRhu`%`x_Z$ z$j|!heRbs1y1Kr7=1=!>5GvLQDn(;g6}NmTHW30PFd3J&YCc04qgX^%1a|WytD!)> z)D8p$Xf?%&ri|$q0L=c{3^Ag{E(2X|_%Rn)$sFLri$EqP3Z(C(E>6m60XvwLl9t^7K}oIb~=>Cj)EdsbeYCN`Z2Z zH3O{-hU2nImtlnq4&EUFGu;OZPWMBgaI%c70>f4{iK}Rl=<|?s2L9!P5L_dhK2lyk z%dNXA=Dz(epZ~ndzg|{8`^Ic!u44A!LdC-Vh4#hmb2Fd0+8`_MfUm`;mA!aZwB(qc1>q1pb-uzD7RnRdo}j7>|w| zHh&16`~(`82-+rKmvAFy2XO!z7tIncSs0o(O+$H+2h?p5O;PJCiE^U#ow5;|Xpt;Y zlV}5UZHKr6;&zBTA`BEG7#{NfKp2 zPKT3XIG##MLL`-#h{aVdN0AweBC!~SLK2K59s>4CN>?P>-m*J*FxV3mrOPb=RV-r# zd{S~yN?wl9RPy^Rubeq^HZ(kNemHb~xNqo0->g8x}OTcNg1av)j;@n{%WFmE062eh1 zF@YA6x<>?2nvjyBl#EQlyoJdLQ0|h5M1;v2Z9Ic0Mj0F}K0u`d!Jd|Y03;TYsr3I3 za-KMKzOVm`LB`PWzT+qVI5D3|yOK>Z%Bj;3grR^%vB;CKh0x4k7I{c1*Lep-F__Ja zy=Xsda45{9ZAu5>qmZmPLSP00TY}|^paC1d#`FapfY0d~ZDmys4(k2HAWz-}!5^-X zNA1K>e%+R{*RI;@*PFX$`Bi(vdSm-%jlrdU{HR;+d3lz(66&YZUNS zfSZ0x;|xJeNpf;$8YJ@@lQ9aSSrlWcYY?;&N*5stnn~^wu1Lb921HSQTu^g^<8mr_ zL5R{+LKvGI9R;-z4M(KI!UcwNLt3e^s200mh-!6D9)e*q%Bk?xhx1eGgs@-So$_Ox z%~t90GJg*3$mp8@SBaB&D`y?7RmG|n+~UH(9nIEdFUXpCgoR)u#Yuvj0lWZmMu(A5 zOn)+*Xc^^1UUx#w@EKm@C%LN}MOU{q!(S#dmZ#RWZK-R`Sg+Y$BBEW#E^l*b96Eb4 z)(ECNG8TOW&lB)ipqyk6cqA&o zL4`YmJ)+@y%$VN>9W!Ho(|m<9ycM45?&DMfkiswwgNg>XASZ;A!dO&b9H6CBRBPzV z);BH`V}8Zy*Iz~#ge1h000`r89W@_23KmySut&HY3kw&(PyDn37Z4`_XX+KVAlKlK z;%39s9KuW+C{8sAMc{N}Auc4)^1mnvcv29H#w77~xD^$fC`H4Q@$~e;fvb>%&@7dV zPeC9F;L#LzYWX{1h-&X4Y+69KDIPM=DWwKzf&L3L8UcXzf#K!4s$9073iR@bii+H=14rJCig<>q_7 z72n{j1C)8z)0VZL$nH3?0->jE!&5nT=%;TkR($4ZS)5uvmD_nVPfQNyx_b~DTMj3v zIPhxKY+IPhRR?CBpZnStM^=12vkvriHP27qn$ETyzT3azf8jI#z`fnGZb+|am@mIo zzHnv9wBqfYwQP9m7M}YX->PTFy02z-;6Zi${A;&f%Wm7ZOjoK8&pI)`<8Q=OPv?4F z!|Y2BgpM_#Cnxm$M%bNmH_i^s9fD!!Ji@xSan0MB^R_My-1m0mEoA$iuk6HIHFszs 
zp6xob;yIhOpZ&w57dhf>cu07Mf1|P~Z{jM>a+_vz6*}j9{=Cgx=Knu=Gvt0FgUgJ( zZ9RUJ{PL*dgx~xttNTQy`B#+|h$}92nV2rxqC*g*Fdec1kz#KxoEB;Fdmnn@OwabY zg1#}08&U_F(jHyn6(5I`^5_Z}a z({m7k+rbl4pxa}SJ`fR?*&&vi{5^Kecbsv0`zQ@3C*xrXIOyoh68%o8N?9pY1j!Ir zxTs={g3$m4^)1I_Rsu3*+-vHjRRQ8Hk`ioZPB%YQim?1&Vk;LQ06VURlvmDmu9h_~ zZeMqo-5k6=IM;jQ)OvNz{42L!`HNR)2RG`Q-|M*DvG~eLeb?;iuWh7!8z8Ehmc>&` zKgzZ~pYJa(tQqj2f&H;Sc=Ktpmb1h8e?#NRMK zerr6ty?3Q%_pEEZdiygrgNo)Sa7fHejcPE0tE@kU=cq)P$K?-|9-E z4a}a5wKVl1wNWoqc_h}+ZOz88omv9DZ0cc+G>hVW8NUc&#`Hy!|2nRQ z`Txm&_)AyK+~tMz+P0qDww_g2??zR}N>x|Z-Swcl0Z^Z-dQM!c3FK-5+0G-YuA}R% zJJ(wG=UVr#wVuwkp3b@f@cCx@);)FErrw;VcfH}cwT7NtL(g*cYQup%F;`Rqx~u*c zqPvR9tp5Ol-oR3Q&bRj#_uHNmSzp(B!=ANS8gDmdcO1L> z#$B4-(SPry`*r6YH1=eBPvsh4THrpfZOWIC-T^M}BXzshYI}0EJbdlqzdh$~Uus+HIG5`<2UxkIGV9#|RH*IDcJ{6K`?KEuFa5z4KU~P@ z`4YhO-W5-K*53Z4w+P6mDnpcy^Kfoad@lBcON%*5WVm#s`LmwtD0#e$8~Tya!+}1% z3e8Ode!{^(GkTb@olr<|hV(rYQrsb2?6{U=gX=IUxa*?u@tB-e%%Cq7{$eT>XRc(5 z$Bh&Z{pk<}FGHYMn6r%zW8B63(CY0Z#SG^_^i@oD181aT5jCA69H8SE@ED-Jqca#> zgFrDQlZsX4eTtf&{tSbgm{q3vxg%0M4%cxwdJEr9JP*zQ%c}7y#+4Ucx~UxCQyyA%P*kpTiR1kwC!#2XS-! 
zNo^0ezYGm#e+drzeqkK_z0JY37!h=;2s6SGpgpLM1 z=u>+v4n=nU`;zo4n!uzH|D$0CmadycRU1#7fG9dI;opo4X?j@+fl#r%-*-;-HDaNP(7b+(cZ~;L^ zr}Y)Ml?vlB+CpWgy36$Hsuxe4C@ZET40kcC2j`tb>rVpdhj)*2`#TEW!*0m>ZGF$AA?8NE0*?7HiE_&b51YfmH^H*uCB$MrHCzUw!}c z=D&a5a-6qC;M;gG6Q9i^OpX-a_Yh4I4>v8$aBIPWw>(PndI=m4cOD#~RdPxW#*a4m zvctavS}1|9qi($CG2(_pgo;0@7dNzr_Y@y{#ImQPyVz}&^3u6cJKPDejFkULBPbVL z+C6vAlq*n4uftHL@2WddP&M(EYaFGxne8%-+YbxF`hk8prSlep#hRC>3&e?PUIVL# z({Mu~0#}>mEu6{j1fcV{6nCS09Ij$Wmyx9nCJ(X(bU(N1x!Iy@(3oS&rV=7}PY&*Y z7-XMPZyy0xK+_{&QZY8hwVT>1a2SAu>BbVM+&C>ifQ*IqC;+R*c03BNE11GcZ7ex4 z35Ss2fC#~1KHQw5X?D)0)TAcUm}R7}LU4h+P(yKPWw5&9W*jZ6N0^b~!zzrUKG6%D1eQ4OkaKJ=Lxgty1;&8{KaCr2zdL`^@UDJWR zXlUb%UF5-*wZ3cM0MKw;N&DCawk0k)JfQ81T7Y9RBYlbl?C|3VIuE4=?{G?)b^XArZSP;TAAevkTeAx}yRcZhw13sUd&6G4P?O!h=e~U};zdlDz+gKDObQ`;6*!^^?mb2%xN)XJSvkb6{K2ou!XmiW zrh7|i(q~q|E4}gt^a1MC<=G-|*zaH3SF1oo)qsd;+qeAPPk!+64^~?H!F%*&L)(V8 z|K7e|ANlmiiudL9=GOP(x8rNg2XoB_A6d+e<+B4f&OCCGvX;f?{yX(ZY1un;p-jpmC0hU=gT=uVDWWyS_Jaeddt2C@UtTjS$OD^x4<{JD_X%Zf-yVU z{@mjE_Z@i$emUX9Iq$-lo7B|h%P>|>{EhRMZe7~+;Mcb}@?796ht<-(rBV2XMuEOZ zt{=(U@XJo1)A;rH!f>v9dmbx5>#8lX>B28>vt`fnzE2K*eDISOK7Juj;PYPk5x&$p z;`9PNvnNVA<(2KpOu}@s8MVmGq0So=bu?Rh?n^_U{4XlQG{tqU8(9wMwueG zVS5(TC;QDP`_-W;W0*YoD=hG@FxZO$+pNz+OxdZ~tU(X}=@i=bLG_w7_OJb9Dj1%K zQ6``m3;YsWU;>Km;jQ~xu^&N8lDflHFqUZUVeCX zX-h1XxPgJAkQR=vcaVa%s0RU=?dPcXFFzmT}PBEPFd``+{(rI%GCUiC{o4*0%Z{S1~ zlH_EPw8>nOcM-WuwkdX*m+e#b3Yp?$;kGX+$h_j1vdNCy_M{VVQE|<7w@+%~1MmSV zCs4u30i6$O;%)o=CTY^I1lVYMlhW1J*99ZoN>H&Yp(z{o0}o`%E_>Q&!m?NChVuvh zlpfj7W^u>?Sg*H2=D1|vB@zo7zOy;~#N|TKJh)N7u9*3LOn=5(ZI&1A(t|{sis~E0Sik(R+rQyb+(OZ9fE1_o#i9)7uM3?Baq92(n9)ISBMPN=p5iiVBl%G=XwyTP_ zSnq>AZ34j8x5!hGxcrOGn%MtH9ISgHk47hIo{5c6q`K$1>Z#YN&t0tUetj+UgR1uj z_2Kb~{Znyh6GoDB^EeS_gN1(<;7ARz#m-0MBaX5)aZHq=Tp1#4HU;XV>}0al z9tA25t`4pZI^(>Nw_Qljy64G zIH%OSq#vCy#H&q=4gsCk#(zKZX^Bh}a3UfF*80fGHm=Pz?xqK%?_?D9+o)UKEVHOuFs+DyKv%F$e2 zk6u-xX-QQu0kC74$I&O2A|on)$SAX)&CGXdDx)eWOo6PK_IS5l|XdrHp$AMtQ#rQBBCRt}2^j)A{CS@{f9Xm8M 
zOPQ0oY3_bg>=tY`pex%RSQZDYCO>JK1Frcoe8~G0&1X~BlzA=2(=nKg9!7w6qiG+S zy5eDkCVQoU?fYx%isQh20AO2@J?oJpwaAfsQ`O_=suQoSMb1~nf!_u~)qN9>0>`W3 zadXoGvm=J1K$WS?oDl{~jq|rV%`{#}tDyF<3o;L6OA5gNTYA?z3hL2Y;RPH?OG+=1 zdGk_1-`p83lhbr5msTiV2ptCir-9ajeh0x(0NYQa3y+T3y%acn@ammD8EQP@{DbS_pj<};low$;d&s zq|J7Eh)p18(mKVnrU?Xik`#}h{sjQT*YqW6MO9NNU>wZ?Q`GM$760i+AN{Cy z@L6n30Ieg~uyMlM-0HC}pyh3Dlee0%H-x>R)zMY$ZVU-R*l#vA2it@btKF;Vd&gF1 z?hZ8wR3E^Gig>jw^1Ui{QM+Te-zq$#^--;DlDu26(X0-1yOGF6bMo^ZI_$>P59YuGw z8p?djzL#xf2ROgx4EWS3iw}hRXu6;*xXRot^0K4Ij3wLNDW}?<05x!C3_702m=LCk`&cbDJf>vQikpzGbcfCpbpYMv|L^)SG@IL-}2zn z;LjqTW&59@u($1-m%n8{Z8Y{BG}fNL_3@K0gBtm$a=nz)P6B2) zWaUN%-c1E;{%dRzn25)OHqWJDXC6Tg0L!N^hh#dV-36LYE1K5wMk#3uogn&KP~ps{q=!>rQ{5oGLrqrOu(;YRPC zRmVf+QSZr$@KX;HP=lK@>{m0AmbL>gt%B#roZ0^XvP>YvhQzE&;QF~vLdl$obuFz) zKwzA@<+5hc#qTJc!t;(U1>z)dBxJUOYk0t0@se2Pe=L*%mS7z)%;F`omu$VYN5N|^ z2wU-*Y#H&g@SaU33&K0XP3|U{d<2@F;tIt7??xy;5_k6umkR zfxW8cZ-N9w^%;|d7FjY$i%NQQM%N44cM}PC`)YAaO~$26Ldii!QdcxR0a2C?LsRgg z#bhcbC66>$VL9LUjGo2yqO_Qc-Z+2$LKKozAZT%j?_Po}#$3jz#Z6q9gP=HrDJe;f zr;D1-GA1;qsD=l!RgkYWldUX#!7QKYU-&$D10J?x^t3{MIhB?4Y*A$y(&kESigE+T z-UMK{tkuD(P-VZ(j{q@sbq-)m~L~!+OgmyPZaL8&1STGQ4l~(p^`>g*7+n>2G)t^OD|y zh7U16>5D91Te`Lxz*@XZ*i-M_wX80w4Ic2vE`OzXr@UBRdE-}CKD<&5#A>dCn*#JS zMxDZQ4<{M~J|D}Q_-PE1eX-ABrSRiG_no=LxjXMJzKa}%@kWAoPA;BAuEM|-^s9^N z%H^7W7+DJgYwp0E$YSIvuxBQm%zRhaa6sx9l*UZ=pv^MHSY)CYhaWgPeu#7$*kib_$^Sw_p)-D$`c4-ml;iovES z9+<=1alM9k(oCYAqVGd5KH;=i05o`x>+_F$ nBP*F|&sdd&o;pUk?rQh`rv$!FC%KCp7p@K-{E{HTtp5K4YlW|% literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/__pycache__/w8a16.cpython-312.pyc b/model_executor/layers/quantization/__pycache__/w8a16.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1fdd539fc0fda553d62e4e4963475da3ccce1304 GIT binary patch literal 5386 zcmb6dOKcm*b(Z_#^8b-2Oa3TwA{(NkQ1B;~^8Yw?(#BTO+Nl#Z!D7u_O3N%inO!E9 zOaX4;gMu6qou~i-BB2M%aDY0eTzl!M7YTL{Vxb}~(j0s#=t@C>9@@S)OHwOZa@zs+ z&704g_h#n(X8#n41PK&V_{8Xo5b^^y+C^+38%u!HiAFTeBtkN>)|Snl 
zVsEyWBR=vf(E`_q7Sv-mc~_QfiuHz|H>^iGy?ryjbmSgRNed4DQfbB!UNLgEBb+rX z+mYX>*haaenv-eH3B0LNwW!-Vg+@rpRV}+*RLT`A?Q?=>jgqd?msLx5;x2wpx2MY* z_Q3SyFRE(EHfB{^?o_#yH>QB9|5aM9R@l(_oNDSF22RhLYVLhq`|#}9bDd6DCT{7r z@{w*#P1}lU+tkXrn{%PR)!6tOAa$aXET@sIPvf$DfCPw^(1dGn?wU^*^FB=kTx#JG z;C{emJy7WNF6rU}PyvM>W`$r@7;qWzh#tjLzS$zk#`HLxctRKTWZs7~nolEnK?_|I zI&*s9^aF*F7Kx`S9Pu<-s20&h;Mb=`nFNv+1Fn0RoMKC+IP~<-@ad!zx;UX8J?Wn9 z>_N94%9*NV4du&ph>d1~Lr@rSB&yq0TAIvpU5rs-HUt25GDB`~lQ#f3f~A{z3#ShK z^TL>2tc+ER$`iIqr*!*?%hmCxCf*;{$4ukWSY-z8y)^crX%@$dWlc8~{j#2`+GRRs zsxvyZ#y;@q!q`VBb*3`o$ckc=3|moVdp$eJw0rSdEWF{Xq$!a=WCKw0P}#Y(Z<5Ag z7DZZ`0^n|d?g13}QN2PR_wM)b zi}gLo&j;h|W?lxQPHcc~Dd0#~@r>d{c9kjLc|v=jGtE09+)HlS5vEl$@5q<(W?8k5 zo_yf2`cX`)TB($&t!s*!vlY#rsp!`1u=lcdoAn4Vv+x40lJ$719$W~1ar(>2&nH)7 zkFUy)Ke$;G*sF$kmG@qU&Yh1oiF}JjNy(`d^^##4_RQ>Hw_MxDABDx**BeW?5+K)h zK}}~8NPyJI6looqD*#g{R0&s=dXayQc{L?(m^$9FHS3z3YlxYhW*mYAS}J`vmY5GP zDICdNDCfOa#F;dx&UNtbwT{>E$^|Z!&zTlYRuaD26Xzhaf#zcJ@5YW?(cd2o9XG=F*7!BF& zP>EY=%{>o(r>&ns!!k;it(I~+McL>WfHY705Pt$e27rYe!Lzx$jpJc6C!;{Pf*>@z z-@E1RQC!8^3*ajGYbf=Z{zdjqXk@)-cSR10q(*zn~ue}ZW)z12A}mu0N2QUpHGyVGA~B%f_oRCZU#B=k;TV02vkjp z6Zg3I9>hmihF7Y;zjEWs+L32)!YFnomrfy`ga{!e@B0AT-~nqQhJ6XdQcJx{xs~Hf z)60WRn4G$G?(bOPOn1B~31aV(etmj*`g(b}+$7L!92{=)Sf`J!9y`63KC^+XW)yj9 zE610OP2|h;gl#SdTk4c>3)_I44{XAlm_RTWAf4bLjEkC}OND5Ms)H2ppt_1C>H$sC zgL(iDRr5m&k{2xiE1+p~m3&wW0*5dpLGk~KOBmKe)95OYm_&CTVbbg}mX{%c5wo$G zP&&cvcy`crBSWq_u{`A?mJbBo93|{p@xwe~5=5($8doDqN65DO;OI-x2L{8u1>#<> z$Sh)Z?*s0vlX5(V1eoMQ3b213cQ5S>NeAHZLou42ffq=G z>K{}Ms$0qwRkeF?D}-V&gU}mruU9jQPOx>mh)eE!#gUkIx6*!a!qpMrJl%+otoQA!7ZwVeejyTV2Fbpm&4@o4oR>D^BrwnjCmRO_8-tJB7x`3l zlNSQ9%`i#s>#UH#Mfx7ynR>CWF>s)Mf92#I`PjPL zzu32W@c141L_?13IP|T2g4z0{!(DcSONMH(sGIAU59|_63{7tXp5M#|6`;{w<^)i` zPU5pQIw#hA1@CIDAy9p8FR2MTj+x& ze~fm^t!w_8P!l!&E8%8K#yPph|3(A@<{5XnChx?(Cbiwtd6E_<@BJIfy?+64d>8N^ zf&@>|qH3q5E!}fOYg(=7jvtJQAw2w2aKa}0p`fsEkPdj1|5qR(y$HbZf8d%HQ(-3n zdak%D1Y1jK%4LU#ZVUI?*6mi3D5{l;Inzppx9Kz+m;wsxw*cU|DU+W5`sl*wyuXp? 
ztM6afKQA>>ef5hA7Z=YjUi$3arHj|!TYm4>v9&`dSBEFoQWNt59DirwokhBI^!oVn z_{y>6XKts)=7a0e#COrbwdmmD?8^A9Gk-X7C;Hkv-w4H4L;D)Z)S|F(Wd2kmmi+X} z$5$4gx*a>TDj#xV4tuY~>60K5J&9l(!2|;K#>MoG`4jp)VlNKjShen!y=mBivxo@s~ePx5SE^s7s?7lxF9&V)e*56)u zyRIxKO&;2J-Lr0rfCEwLz~Wn9zWe#RYrRLBI32pipZ&)C+T6sJdq(VSu*Z?$WoneB z+{6^s*F6K4kq$xANwP<8rc}zXz>B}Mm_K#|{A)omFjCAWoP*fTZ2yaqasOguTm^L> zC1^p;@rO+I{_@CNf`4MI#~4M&>m1mLa!WK!A1OvZXJDs^Roj3!9|jIr6YvzeF4d8r zb)N^%(-Mqg)UsX#(BwIe`<{&6CZj))@Sn-ZkL0oM$ None: + super().__init__() + if weight_bits not in self.SUPPORTED_BITS: + raise ValueError( + f"Unsupported weight_bits: {weight_bits}, " + f"currently only support {self.SUPPORTED_BITS}" + ) + if data_type not in self.SUPPORTED_DTYPES: + raise ValueError( + f"Unsupported data_type: {data_type}," + f" currently only support {self.SUPPORTED_DTYPES}" + ) + if packing_format not in self.SUPPORTED_FORMATS: + raise ValueError( + f"Unsupported packing_format: {packing_format}, " + f"currently only support {self.SUPPORTED_FORMATS}" + ) + if backend not in self.SUPPORTED_BACKENDS: + raise ValueError( + f"Unsupported backend: {backend}, " + f"currently only support {self.SUPPORTED_BACKENDS}" + ) + + self.weight_bits = weight_bits + self.group_size = group_size + self.sym = sym + self.packing_format = packing_format + self.block_name_to_quantize = ( + block_name_to_quantize.split(",") + if isinstance(block_name_to_quantize, str) + else block_name_to_quantize + ) + self.extra_config = extra_config + self.data_type = data_type + self.backend = backend + self.pack_factor = Fraction(32, weight_bits) + + def __repr__(self) -> str: + return ( + f"AutoRoundConfig(weight_bits={self.weight_bits}, " + f"group_size={self.group_size}, sym={self.sym})" + ) + + @classmethod + def get_name(cls) -> QuantizationMethods: + return "auto-round" + + @classmethod + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.half, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + return 60 + + 
@classmethod + def get_config_filenames(cls) -> list[str]: + return ["quantization_config.json"] + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "AutoRoundConfig": + return cls( + weight_bits=cls.get_from_keys(config, ["bits"]), + group_size=cls.get_from_keys(config, ["group_size"]), + sym=cls.get_from_keys(config, ["sym"]), + packing_format=cls.get_from_keys_or( + config, ["packing_format"], "auto_round:auto_gptq" + ), + block_name_to_quantize=cls.get_from_keys_or( + config, ["block_name_to_quantize", "to_quant_block_names"], None + ), + extra_config=cls.get_from_keys_or(config, ["extra_config"], None), + data_type=cls.get_from_keys_or(config, ["data_type"], "int"), + backend=cls.get_from_keys_or(config, ["backend", "vllm_backend"], "auto"), + ) + + def get_layer_config(self, layer, layer_name: str): + def get_config(name: str, quantized: bool = True): + if not self.extra_config: + return ( + self.weight_bits if quantized else 16, + self.group_size if quantized else -1, + self.sym if quantized else True, + ) + + # exact match first + if name in self.extra_config: + cfg = self.extra_config[name] + return ( + cfg.get("bits", self.weight_bits if quantized else 16), + cfg.get("group_size", self.group_size if quantized else -1), + cfg.get("sym", self.sym if quantized else True), + ) + + REGEX_SPECIAL_CHARS = set(r"*+?^$()[]{}|\\") + for pattern, cfg in self.extra_config.items(): + if not isinstance(pattern, str) or not any( + c in REGEX_SPECIAL_CHARS for c in pattern + ): + continue + + try: + if re.search(re.compile(pattern), name) is not None: + return ( + cfg.get("bits", self.weight_bits if quantized else 16), + cfg.get("group_size", self.group_size if quantized else -1), + cfg.get("sym", self.sym if quantized else True), + ) + except re.error: + # Invalid regex, ignore. + continue + + return ( + self.weight_bits if quantized else 16, + self.group_size if quantized else -1, + self.sym if quantized else True, + ) + + # 1. 
Exact match from config + if self.extra_config and layer_name in self.extra_config: + return get_config(layer_name) + + # 2. Determine whether layer should be quantized + quantized = not isinstance(layer, ParallelLMHead) + if self.block_name_to_quantize: + quantized = any( + layer_name.startswith(name) for name in self.block_name_to_quantize + ) + + # 3. Handle fused MoE + if self.extra_config and "fusedmoe" in layer.__class__.__name__.lower(): + moe_configs = [ + get_config(name, quantized) + for name in self.extra_config + if name.startswith(layer_name) + ] + if moe_configs: + if len(set(moe_configs)) == 1: + return moe_configs[0] + raise ValueError( + f"Fused MoE layer '{layer_name}' requires " + f"consistent quant config for all sub-layers" + ) + + # 4. Handle fused QKV or other patterns + if self.extra_config: + for fusion_key, sub_keys in self.packed_modules_mapping.items(): + if fusion_key in layer_name and layer_name.count(fusion_key) == 1: + sub_names = [ + layer_name.replace(fusion_key, sub_key) for sub_key in sub_keys + ] + sub_configs = [get_config(name, quantized) for name in sub_names] + if len(set(sub_configs)) == 1: + return sub_configs[0] + raise ValueError( + f"Fused module '{layer_name}' requires " + f"consistent quant config for {sub_names}" + ) + + # 5. 
Fallback or try a regular expression match + return get_config(layer_name, quantized) + + def check_quantized(self, weight_bits: int) -> bool: + return weight_bits < 16 + + def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): + if self.block_name_to_quantize is not None: + self.block_name_to_quantize = hf_to_vllm_mapper.apply_list( + self.block_name_to_quantize + ) + if self.extra_config is not None: + self.extra_config = hf_to_vllm_mapper.apply_dict(self.extra_config) + + def apply_awq_quant_layer(self, layer, prefix: str, backend: str = "auto"): + from vllm.model_executor.layers.fused_moe import FusedMoE + from vllm.model_executor.layers.quantization.utils.marlin_utils import ( + check_marlin_supported, + check_moe_marlin_supports_layer, + ) + + weight_bits, group_size, sym = self.get_layer_config(layer, prefix) + if not self.check_quantized(weight_bits): + if isinstance(layer, (LinearBase, ParallelLMHead)): + return UnquantizedLinearMethod() + else: + return None + + logger.debug( + "[%s] Type: %s, Bits: %s, Group Size: %s, Sym: %s", + prefix, + layer.__class__.__name__, + weight_bits, + group_size, + sym, + ) + if backend == "auto" or "marlin" in backend: + AWQ_TYPE_MAP = { + 4: scalar_types.uint4, + 8: scalar_types.uint8, + } + use_marlin = (weight_bits in AWQ_TYPE_MAP) and check_marlin_supported( + AWQ_TYPE_MAP[weight_bits], group_size, not sym + ) + + if isinstance(layer, FusedMoE): + use_marlin = use_marlin and check_moe_marlin_supports_layer( + layer, group_size + ) + + else: + use_marlin = False + if use_marlin: + from vllm.model_executor.layers.quantization.awq_marlin import ( + AWQMarlinConfig, + AWQMarlinLinearMethod, + AWQMoEMethod, + ) + + quant_args_marlin = AWQMarlinConfig( + weight_bits=weight_bits, + group_size=group_size, + zero_point=not sym, + lm_head_quantized=False, + full_config={}, + modules_to_not_convert=[], + ) + else: + from vllm.model_executor.layers.quantization.awq import ( + AWQConfig, + AWQLinearMethod, + ) + + 
quant_args = AWQConfig( + weight_bits=weight_bits, + group_size=group_size, + zero_point=not sym, + ) + + if isinstance(layer, FusedMoE): + if use_marlin: + return AWQMoEMethod(quant_args_marlin, layer.moe_config) + from vllm.model_executor.layers.quantization.moe_wna16 import MoeWNA16Config + + config = { + "quant_method": "awq", + "bits": weight_bits, + "group_size": group_size, + "zero_point": not sym, + "lm_head": False, + } + return MoeWNA16Config.from_config(config).get_quant_method(layer, prefix) + + if isinstance(layer, (LinearBase, ParallelLMHead)): + if use_marlin: + return AWQMarlinLinearMethod(quant_args_marlin) + else: + return AWQLinearMethod(quant_args) + return None + + def apply_gptq_quant_layer(self, layer, prefix: str, backend: str = "auto"): + from vllm.model_executor.layers.fused_moe import FusedMoE + from vllm.model_executor.layers.quantization.utils.marlin_utils import ( + check_marlin_supported, + check_moe_marlin_supports_layer, + ) + + weight_bits, group_size, sym = self.get_layer_config(layer, prefix) + if not self.check_quantized(weight_bits): + if isinstance(layer, (LinearBase, ParallelLMHead)): + return UnquantizedLinearMethod() + else: + return None + + logger.debug( + "[%s] Type: %s, Bits: %s, Group Size: %s, Sym: %s", + prefix, + layer.__class__.__name__, + weight_bits, + group_size, + sym, + ) + if backend == "auto" or "marlin" in backend: + GPTQ_TYPE_MAP = { + (4, True): scalar_types.uint4b8, + (8, True): scalar_types.uint8b128, + } + use_marlin = (weight_bits, sym) in GPTQ_TYPE_MAP and check_marlin_supported( + GPTQ_TYPE_MAP[(weight_bits, sym)], group_size, has_zp=not sym + ) + if isinstance(layer, FusedMoE): + use_marlin = use_marlin and check_moe_marlin_supports_layer( + layer, group_size + ) + else: + use_marlin = False + if use_marlin: + from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQMarlinConfig, + GPTQMarlinLinearMethod, + GPTQMarlinMoEMethod, + ) + + quant_args_marlin = GPTQMarlinConfig( + 
weight_bits=weight_bits, + group_size=group_size, + is_sym=sym, + lm_head_quantized=False, + desc_act=False, + dynamic={}, + full_config={}, + ) + else: + from vllm.model_executor.layers.quantization.gptq import ( + GPTQConfig, + GPTQLinearMethod, + ) + + quant_args = GPTQConfig( + weight_bits=weight_bits, + group_size=group_size, + lm_head_quantized=False, + desc_act=False, + dynamic={}, + ) + + if isinstance(layer, FusedMoE): + if use_marlin: + return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe_config) + else: + from vllm.model_executor.layers.quantization.moe_wna16 import ( + MoeWNA16Config, + ) + + config = { + "quant_method": "gptq", + "bits": weight_bits, + "group_size": group_size, + "sym": sym, + "lm_head": False, + } + return MoeWNA16Config.from_config(config).get_quant_method( + layer, prefix + ) + + if isinstance(layer, (LinearBase, ParallelLMHead)): + if use_marlin: + return GPTQMarlinLinearMethod(quant_args_marlin) + else: + return GPTQLinearMethod(quant_args) + + return None + + def apply_ipex_quant_layer(self, layer, prefix: str): + weight_bits, group_size, sym = self.get_layer_config(layer, prefix) + if not self.check_quantized(weight_bits): + if isinstance(layer, (LinearBase, ParallelLMHead)): + return UnquantizedLinearMethod() + else: + return None + from vllm.model_executor.layers.quantization.ipex_quant import ( + IPEXAWQLinearMethod, + IPEXConfig, + IPEXGPTQLinearMethod, + ) + + if isinstance(layer, (LinearBase, ParallelLMHead)): + if "awq" in self.packing_format: + config = IPEXConfig( + method="awq", weight_bits=weight_bits, group_size=group_size + ) + return IPEXAWQLinearMethod(config) + elif "gptq" in self.packing_format: + config = IPEXConfig( + method="gptq", weight_bits=weight_bits, group_size=group_size + ) + return IPEXGPTQLinearMethod(config) + else: + raise ValueError( + f"ipex backend only supports awq " + f"and gtpq format,but got {self.packing_format}" + ) + else: + return None + + def get_quant_method(self, layer: 
torch.nn.Module, prefix: str): + if prefix and self.extra_config: + for layer_name in self.extra_config: + if ( + layer_name == prefix or layer_name == f"model.{prefix}" + ) and self.extra_config[layer_name].get("bits", 16) >= 16: + return UnquantizedLinearMethod() + if ( + current_platform.is_cpu() + or current_platform.is_xpu() + or self.backend == "ipex" + ): + return self.apply_ipex_quant_layer(layer, prefix) + if "gptq" in self.packing_format or "gptq" in self.backend: + return self.apply_gptq_quant_layer(layer, prefix) + if "awq" in self.packing_format or "awq" in self.backend: + return self.apply_awq_quant_layer(layer, prefix) diff --git a/model_executor/layers/quantization/awq.py b/model_executor/layers/quantization/awq.py new file mode 100644 index 0000000..1545c72 --- /dev/null +++ b/model_executor/layers/quantization/awq.py @@ -0,0 +1,278 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import TYPE_CHECKING, Any, Union + +import torch +from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE + +from vllm import _custom_ops as ops +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.layer import FusedMoE +from vllm.model_executor.layers.linear import ( + LinearBase, + LinearMethodBase, + UnquantizedLinearMethod, +) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, + QuantizeMethodBase, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped +from vllm.model_executor.parameter import GroupQuantScaleParameter, PackedvLLMParameter +from vllm.transformers_utils.config import get_safetensors_params_metadata + +if TYPE_CHECKING: + from vllm.model_executor.layers.quantization import QuantizationMethods + from vllm.model_executor.models.utils import WeightsMapper + +logger = init_logger(__name__) + + +class AWQConfig(QuantizationConfig): + """Config class for AWQ. 
+ + Reference: https://arxiv.org/abs/2306.00978 + """ + + def __init__( + self, + weight_bits: int, + group_size: int, + zero_point: bool, + modules_to_not_convert: list[str] | None = None, + ) -> None: + super().__init__() + self.weight_bits = weight_bits + self.group_size = group_size + self.zero_point = zero_point + self.modules_to_not_convert = modules_to_not_convert or [] + + if self.weight_bits != 4: + raise ValueError( + "Currently, only 4-bit weight quantization is supported for " + f"AWQ, but got {self.weight_bits} bits." + ) + self.pack_factor = 32 // self.weight_bits + + def __repr__(self) -> str: + return ( + f"AWQConfig(weight_bits={self.weight_bits}, " + f"group_size={self.group_size}, " + f"zero_point={self.zero_point}, " + f"modules_to_not_convert={self.modules_to_not_convert})" + ) + + def get_name(self) -> "QuantizationMethods": + return "awq" + + def get_supported_act_dtypes(self) -> list[torch.dtype]: + return [torch.bfloat16, torch.half] + + @classmethod + def get_min_capability(cls) -> int: + # The AWQ kernel only supports Turing or newer GPUs. 
+ return 75 + + @staticmethod + def get_config_filenames() -> list[str]: + return [ + "quant_config.json", # E.g., casperhansen/vicuna-7b-v1.5-awq + # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq + "quantize_config.json", + ] + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "AWQConfig": + weight_bits = cls.get_from_keys(config, ["w_bit", "bits"]) + group_size = cls.get_from_keys(config, ["q_group_size", "group_size"]) + zero_point = cls.get_from_keys(config, ["zero_point"]) + modules_to_not_convert = cls.get_from_keys_or( + config, ["modules_to_not_convert"], None + ) + return cls(weight_bits, group_size, zero_point, modules_to_not_convert) + + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Union["LinearMethodBase", "QuantizeMethodBase"] | None: + if isinstance(layer, LinearBase): + if is_layer_skipped( + prefix, + self.modules_to_not_convert, + self.packed_modules_mapping, + skip_with_substr=True, + ): + return UnquantizedLinearMethod() + return AWQLinearMethod(self) + elif isinstance(layer, FusedMoE): + # Lazy import to avoid circular import. + from .awq_marlin import AWQMarlinConfig, AWQMoEMethod + from .moe_wna16 import MoeWNA16Config + from .utils.marlin_utils import check_moe_marlin_supports_layer + + if not check_moe_marlin_supports_layer(layer, self.group_size): + logger.warning_once( + f"Layer '{prefix}' is not supported by AWQMoeMarlin. " + "Falling back to Moe WNA16 kernels." 
+ ) + config = { + "quant_method": "awq", + "bits": self.weight_bits, + "group_size": self.group_size, + "zero_point": self.zero_point, + "lm_head": False, + } + return MoeWNA16Config.from_config(config).get_quant_method( + layer, prefix + ) + marlin_compatible_config_dict = { + "quant_method": "awq", + "bits": self.weight_bits, + "group_size": self.group_size, + "zero_point": self.zero_point, + "lm_head": False, + "modules_to_not_convert": self.modules_to_not_convert, + } + awq_marlin_config = AWQMarlinConfig.from_config( + marlin_compatible_config_dict + ) + return AWQMoEMethod(awq_marlin_config, layer.moe_config) + return None + + def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): + if self.modules_to_not_convert: + self.modules_to_not_convert = hf_to_vllm_mapper.apply_list( + self.modules_to_not_convert + ) + + def maybe_update_config(self, model_name: str, revision: str | None = None): + if self.modules_to_not_convert: + return + + unquant_dtypes = [torch.float16, torch.bfloat16, torch.float32] + metadata = get_safetensors_params_metadata(model_name, revision=revision) + layers = {param_name.rsplit(".", 1)[0] for param_name in metadata} + quant_layers: set[str] = { + param_name.rsplit(".", 1)[0] + for param_name, info in metadata.items() + if (dtype := info.get("dtype", None)) + and _SAFETENSORS_TO_TORCH_DTYPE[dtype] not in unquant_dtypes + } + self.modules_to_not_convert = list(layers - quant_layers) + + +class AWQLinearMethod(LinearMethodBase): + """Linear method for AWQ. + + Args: + quant_config: The AWQ quantization config. 
class AWQLinearMethod(LinearMethodBase):
    """Linear method for AWQ.

    Creates the packed ``qweight``/``qzeros`` and the ``scales`` parameters
    of a linear layer and runs the forward pass through ``ops.awq_gemm``.

    Args:
        quant_config: The AWQ quantization config.
    """

    def __init__(self, quant_config: AWQConfig):
        self.quant_config = quant_config

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: list[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        """Register ``qweight``, ``qzeros`` and ``scales`` on ``layer``.

        Raises:
            ValueError: if the per-partition input size is not a multiple of
                the group size, or the per-partition output size is not a
                multiple of the pack factor.
        """
        configured_group = self.quant_config.group_size
        # A configured group size of -1 is replaced by the full input size.
        group_size = input_size if configured_group == -1 else configured_group

        if input_size_per_partition % group_size != 0:
            raise ValueError(
                "The input size is not aligned with the quantized "
                "weight shape. This can be caused by too large "
                "tensor parallel size."
            )

        pack_factor = self.quant_config.pack_factor
        output_size_per_partition = sum(output_partition_sizes)
        if output_size_per_partition % pack_factor != 0:
            raise ValueError(
                "The output size is not aligned with the quantized "
                "weight shape. This can be caused by too large "
                "tensor parallel size."
            )

        weight_loader = extra_weight_attrs.get("weight_loader")
        packed_cols = output_size_per_partition // pack_factor
        num_groups = input_size_per_partition // group_size

        # Attributes shared by the two int32-packed parameters.
        packed_attrs = dict(
            input_dim=0,
            output_dim=1,
            packed_dim=1,
            packed_factor=pack_factor,
            weight_loader=weight_loader,
        )

        qweight = PackedvLLMParameter(
            data=torch.empty(
                input_size_per_partition, packed_cols, dtype=torch.int32
            ),
            **packed_attrs,
        )
        qzeros = PackedvLLMParameter(
            data=torch.empty(num_groups, packed_cols, dtype=torch.int32),
            **packed_attrs,
        )
        scales = GroupQuantScaleParameter(
            data=torch.empty(
                num_groups, output_size_per_partition, dtype=params_dtype
            ),
            input_dim=0,
            output_dim=1,
            weight_loader=weight_loader,
        )

        for name, param in (
            ("qweight", qweight),
            ("qzeros", qzeros),
            ("scales", scales),
        ):
            layer.register_parameter(name, param)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Re-wrap the loaded tensors as plain non-trainable Parameters."""
        for name in ("qweight", "qzeros", "scales"):
            loaded = getattr(layer, name)
            setattr(
                layer,
                name,
                torch.nn.Parameter(loaded.data, requires_grad=False),
            )

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Compute the layer output for ``x`` and add ``bias`` if given.

        The result keeps the leading dimensions of ``x``; its last dimension
        is ``qweight.shape[-1] * pack_factor``.
        """
        qweight, scales, qzeros = layer.qweight, layer.scales, layer.qzeros
        pack_factor = self.quant_config.pack_factor

        out_shape = (*x.shape[:-1], qweight.shape[-1] * pack_factor)
        flat_x = x.reshape(-1, x.shape[-1])

        # The dequantize-then-matmul path is switched off here; the disabled
        # heuristic was: x.shape[:-1].numel() >= 256 (num_tokens >= threshold)
        use_fp16_matmul = False
        if use_fp16_matmul:
            dense_weight = ops.awq_dequantize(qweight, scales, qzeros, 0, 0, 0)
            out = torch.matmul(flat_x, dense_weight)
        else:
            out = ops.awq_gemm(
                flat_x,
                qweight,
                scales,
                qzeros,
                pack_factor,
                group_size=self.quant_config.group_size,
            )

        if bias is not None:
            out.add_(bias)
        return out.reshape(out_shape)
class AWQMarlinConfig(QuantizationConfig):
    """Config class for AWQ Marlin"""

    # num_bits -> type
    TYPE_MAP = {
        4: scalar_types.uint4,
        8: scalar_types.uint8,
    }

    def __init__(
        self,
        weight_bits: int,
        group_size: int,
        zero_point: bool,
        lm_head_quantized: bool,
        modules_to_not_convert: list[str] | None,
        full_config: dict[str, Any],
    ) -> None:
        """Store the settings and validate them with ``verify_marlin_supported``.

        Raises:
            ValueError: if ``weight_bits`` is not a key of ``TYPE_MAP``.
        """
        super().__init__()
        self.pack_factor = 32 // weight_bits  # packed into int32
        self.weight_bits = weight_bits
        self.group_size = group_size
        self.zero_point = zero_point
        self.lm_head_quantized = lm_head_quantized
        self.modules_to_not_convert = modules_to_not_convert or []
        self.full_config = full_config

        quant_type = self.TYPE_MAP.get(self.weight_bits)
        if quant_type is None:
            raise ValueError(
                f"Unsupported num_bits = {self.weight_bits}. "
                f"Supported num_bits = {self.TYPE_MAP.keys()}"
            )
        self.quant_type = quant_type

        verify_marlin_supported(
            self.quant_type, group_size=self.group_size, has_zp=self.zero_point
        )

    def __repr__(self) -> str:
        fields = (
            ("quant_type", self.quant_type),
            ("group_size", self.group_size),
            ("zero_point", self.zero_point),
            ("lm_head_quantized", self.lm_head_quantized),
            ("modules_to_not_convert", self.modules_to_not_convert),
        )
        rendered = ", ".join(f"{key}={value}" for key, value in fields)
        return f"AWQMarlinConfig({rendered})"

    @classmethod
    def get_name(cls) -> "QuantizationMethods":
        return "awq_marlin"

    @classmethod
    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
        return [torch.half, torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        return 80

    @classmethod
    def get_config_filenames(cls) -> list[str]:
        return ["quantize_config.json"]

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "AWQMarlinConfig":
        """Build the config from a quantization dict.

        ``lm_head`` defaults to False and ``modules_to_not_convert`` to None;
        the dict itself is kept as ``full_config``.
        """
        return cls(
            cls.get_from_keys(config, ["bits"]),
            cls.get_from_keys(config, ["group_size"]),
            cls.get_from_keys(config, ["zero_point"]),
            cls.get_from_keys_or(config, ["lm_head"], default=False),
            cls.get_from_keys_or(config, ["modules_to_not_convert"], None),
            config,
        )

    @classmethod
    def override_quantization_method(
        cls, hf_quant_cfg, user_quant
    ) -> Optional["QuantizationMethods"]:
        """Return this method's name when it should replace plain AWQ.

        That happens when the checkpoint config is AWQ-Marlin compatible and
        the user asked for nothing, ``marlin`` or ``awq_marlin``. An explicit
        ``awq`` request on a compatible model is only logged.
        """
        if not cls.is_awq_marlin_compatible(hf_quant_cfg):
            return None

        if user_quant in (None, "marlin", "awq_marlin"):
            name = cls.get_name()
            logger.info(
                f"The model is convertible to {name} during runtime."
                f" Using {name} kernel."
            )
            return name

        if user_quant == "awq":
            logger.info(
                "Detected that the model can run with awq_marlin"
                ", however you specified quantization=awq explicitly,"
                " so forcing awq. Use quantization=awq_marlin for"
                " faster inference"
            )
        return None

    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> Optional["QuantizeMethodBase"]:
        """Select the quantization method object for ``layer``.

        Linear layers (and the LM head when ``lm_head_quantized``) get
        ``AWQMarlinLinearMethod``, ``UnquantizedLinearMethod`` when skipped,
        or the plain AWQ method when ``check_marlin_supports_layer`` rejects
        them. ``FusedMoE`` layers always get ``AWQMoEMethod``. Anything else
        yields ``None``.
        """
        is_linear_like = isinstance(layer, LinearBase) or (
            isinstance(layer, ParallelLMHead) and self.lm_head_quantized
        )
        if is_linear_like:
            if is_layer_skipped(
                prefix,
                self.modules_to_not_convert,
                self.packed_modules_mapping,
                skip_with_substr=True,
            ):
                return UnquantizedLinearMethod()

            # Check if the layer is supported by AWQMarlin.
            if check_marlin_supports_layer(layer, self.group_size):
                return AWQMarlinLinearMethod(self)

            logger.warning_once(
                "Layer '%s' is not supported by AWQMarlin. Falling back to unoptimized AWQ kernels.",  # noqa: E501
                prefix,
            )
            plain_awq = AWQConfig.from_config(self.full_config)
            return plain_awq.get_quant_method(layer, prefix)

        if isinstance(layer, FusedMoE):
            # NOTE: no skip-list check and no MoE WNA16 fallback is performed
            # for MoE layers here.
            return AWQMoEMethod(self, layer.moe_config)

        return None

    @classmethod
    def is_awq_marlin_compatible(cls, quant_config: dict[str, Any]):
        """Tell whether ``quant_config`` describes an AWQ model Marlin can run.

        Requires a CUDA platform, ``quant_method == "awq"``, the ``bits``,
        ``group_size`` and ``zero_point`` entries, and a bit width listed in
        ``TYPE_MAP``; the final verdict comes from ``check_marlin_supported``.
        """
        # Extract data from quant config.
        quant_method = quant_config.get("quant_method", "").lower()
        num_bits = quant_config.get("bits")
        group_size = quant_config.get("group_size")
        zero_point = quant_config.get("zero_point")

        if not current_platform.is_cuda() or quant_method != "awq":
            return False

        # If we cannot find the info needed in the config, cannot convert.
        if None in (num_bits, group_size, zero_point):
            return False

        if num_bits not in cls.TYPE_MAP:
            return False

        return check_marlin_supported(
            quant_type=cls.TYPE_MAP[num_bits],
            group_size=group_size,
            has_zp=zero_point,
        )

    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
        """Translate the skip list through ``hf_to_vllm_mapper`` when non-empty."""
        if not self.modules_to_not_convert:
            return
        self.modules_to_not_convert = hf_to_vllm_mapper.apply_list(
            self.modules_to_not_convert
        )

    def maybe_update_config(self, model_name: str, revision: str | None = None):
        """Derive ``modules_to_not_convert`` from safetensors metadata.

        Does nothing when a skip list is already set. Otherwise the skip list
        becomes every parameter-name prefix (text before the last ``.``) that
        owns no parameter with a dtype outside fp16/bf16/fp32.
        """
        if self.modules_to_not_convert:
            return

        unquant_dtypes = [torch.float16, torch.bfloat16, torch.float32]
        metadata = get_safetensors_params_metadata(model_name, revision=revision)

        layers = {param_name.rsplit(".", 1)[0] for param_name in metadata}
        quant_layers: set[str] = set()
        for param_name, info in metadata.items():
            dtype_name = info.get("dtype", None)
            if not dtype_name:
                continue
            if _SAFETENSORS_TO_TORCH_DTYPE[dtype_name] in unquant_dtypes:
                continue
            quant_layers.add(param_name.rsplit(".", 1)[0])

        self.modules_to_not_convert = list(layers - quant_layers)
class AWQMarlinLinearMethod(LinearMethodBase):
    """Linear method for AWQ Marlin.

    In this port the weights are kept in the AWQ checkpoint layout and the
    forward pass goes through ``ops.awq_gemm``; no Marlin repacking happens.

    Args:
        quant_config: The AWQ Marlin quantization config.
    """

    def __init__(self, quant_config: AWQMarlinConfig) -> None:
        self.quant_config = quant_config

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: list[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ) -> None:
        """Register ``qweight``, ``qzeros`` and ``scales`` on ``layer``.

        Also records ``input_size_per_partition``,
        ``output_size_per_partition`` and ``num_groups`` on the layer for use
        after loading. Shape validity is delegated to
        ``verify_marlin_supports_shape``.
        """
        del output_size  # unused
        output_size_per_partition = sum(output_partition_sizes)
        weight_loader = extra_weight_attrs.get("weight_loader")
        pack_factor = self.quant_config.pack_factor

        # A configured group size of -1 is replaced by the full input size.
        if self.quant_config.group_size != -1:
            group_size = self.quant_config.group_size
        else:
            group_size = input_size

        verify_marlin_supports_shape(
            output_size_per_partition=output_size_per_partition,
            input_size_per_partition=input_size_per_partition,
            input_size=input_size,
            group_size=group_size,
        )

        num_groups = input_size_per_partition // group_size
        packed_cols = output_size_per_partition // pack_factor

        qweight = PackedvLLMParameter(
            data=torch.empty(
                input_size_per_partition, packed_cols, dtype=torch.int32
            ),
            input_dim=0,
            output_dim=1,
            packed_dim=1,
            packed_factor=pack_factor,
            weight_loader=weight_loader,
        )
        qzeros = PackedvLLMParameter(
            data=torch.empty(num_groups, packed_cols, dtype=torch.int32),
            input_dim=0,
            output_dim=1,
            packed_dim=1,
            packed_factor=pack_factor,
            weight_loader=weight_loader,
        )
        scales = GroupQuantScaleParameter(
            data=torch.empty(
                num_groups, output_size_per_partition, dtype=params_dtype
            ),
            input_dim=0,
            output_dim=1,
            weight_loader=weight_loader,
        )

        layer.register_parameter("qweight", qweight)
        layer.register_parameter("qzeros", qzeros)
        layer.register_parameter("scales", scales)

        layer.input_size_per_partition = input_size_per_partition
        layer.output_size_per_partition = output_size_per_partition
        layer.num_groups = num_groups

    @staticmethod
    def _zero_pad_last_dim(tensor: torch.Tensor, width: int) -> torch.Tensor:
        """Return a copy of ``tensor`` zero-padded to ``width`` columns."""
        padded = tensor.new_zeros((*tensor.shape[:-1], width))
        padded[..., : tensor.shape[-1]] = tensor
        return padded

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Zero-pad the loaded tensors along the output dimension if needed.

        The output width (``qweight`` columns times the pack factor) is
        rounded up to a multiple of ``512 // weight_bits``. When padding is
        required, ``qweight``, ``qzeros`` and ``scales`` are replaced by
        zero-padded copies and the number of added columns is stored in
        ``layer.output_padding_size``; otherwise the layer is left untouched
        (and ``output_padding_size`` is not set).

        TODO(gyf): Marlin format is not supported for now, so the weights are
        not repacked into the Marlin layout here.
        """
        pack_factor = self.quant_config.pack_factor
        out_size = layer.qweight.data.shape[1] * pack_factor
        layer.output_size_per_partition = out_size

        # Number of output columns whose packed bits add up to 64 bytes.
        align_bits = 64 * 8
        align_size = align_bits // self.quant_config.weight_bits
        if out_size % align_size == 0:
            return

        padded_out_size = (out_size + align_size - 1) // align_size * align_size
        layer.output_padding_size = padded_out_size - out_size

        # Build all padded copies first, then swap them in.
        padded_qweight = self._zero_pad_last_dim(
            layer.qweight.data, padded_out_size // pack_factor
        )
        padded_qzeros = self._zero_pad_last_dim(
            layer.qzeros.data, padded_out_size // pack_factor
        )
        padded_scales = self._zero_pad_last_dim(
            layer.scales.data, padded_out_size
        )
        replace_parameter(layer, "qweight", padded_qweight)
        replace_parameter(layer, "qzeros", padded_qzeros)
        replace_parameter(layer, "scales", padded_scales)

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Compute the layer output for ``x`` and add ``bias`` if given.

        TODO: the AWQ kernel is used temporarily instead of
        ``apply_awq_marlin_linear``.
        """
        qweight = layer.qweight
        scales = layer.scales
        qzeros = layer.qzeros
        pack_factor = self.quant_config.pack_factor

        # NOTE(review): the output width follows the current qweight, which
        # may have been padded after loading -- confirm callers expect that.
        out_shape = x.shape[:-1] + (qweight.shape[-1] * pack_factor,)
        reshaped_x = x.reshape(-1, x.shape[-1])

        out = ops.awq_gemm(
            reshaped_x,
            qweight,
            scales,
            qzeros,
            pack_factor,
            group_size=self.quant_config.group_size,
        )
        if bias is not None:
            out.add_(bias)
        return out.reshape(out_shape)
class AWQMoEMethod(FusedMoEMethodBase):
    """Fused-MoE method for 4-bit AWQ weights.

    Expert weights stay in the AWQ checkpoint layout and the forward pass is
    built from ``ixformer`` (``ixfops``) w4a16 grouped GEMM/GEMV ops.
    """

    def __init__(
        self,
        quant_config: AWQMarlinConfig,
        moe: FusedMoEConfig,
    ):
        """Store the config.

        Raises:
            ValueError: if ``quant_config.weight_bits`` is not 4.
        """
        super().__init__(moe)
        self.quant_config = quant_config
        if self.quant_config.weight_bits != 4:
            raise ValueError("AWQMoEMethod only supports 4bit now.")
        self.quant_type = scalar_types.uint4
        self.use_marlin = True

    def create_weights(
        self,
        layer: torch.nn.Module,
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        """Register the packed expert weights, scales and zero points.

        ``w13_*`` parameters hold w1 and w3 side by side (output width
        ``2 * intermediate_size_per_partition``); ``w2_*`` parameters map the
        intermediate size back to ``hidden_size``. Every parameter receives
        ``extra_weight_attrs`` plus ``is_transposed`` and the group-wise
        ``quant_method`` marker.
        """
        extra_weight_attrs.update(
            {
                "is_transposed": True,
                "quant_method": FusedMoeWeightScaleSupported.GROUP.value,
            }
        )
        pack_factor = self.quant_config.pack_factor
        group_size = self.quant_config.group_size

        def register(name: str, *shape: int, dtype: torch.dtype) -> None:
            """Create an uninitialised frozen parameter and attach it."""
            param = Parameter(
                torch.empty(*shape, dtype=dtype), requires_grad=False
            )
            layer.register_parameter(name, param)
            set_weight_attrs(param, extra_weight_attrs)

        # Packed int32 weights.
        register(
            "w13_qweight",
            num_experts,
            hidden_size,
            2 * intermediate_size_per_partition // pack_factor,
            dtype=torch.int32,
        )
        register(
            "w2_qweight",
            num_experts,
            intermediate_size_per_partition,
            hidden_size // pack_factor,
            dtype=torch.int32,
        )

        num_groups_w13 = hidden_size // group_size
        num_groups_w2 = intermediate_size_per_partition // group_size

        # WEIGHT_SCALES
        # Allocate 2 scales for w1 and w3 respectively.
        register(
            "w13_scales",
            num_experts,
            num_groups_w13,
            intermediate_size_per_partition * 2,
            dtype=params_dtype,
        )
        register(
            "w2_scales",
            num_experts,
            num_groups_w2,
            hidden_size,
            dtype=params_dtype,
        )

        # WEIGHT_ZERO_POINT
        # Allocate 2 zero points for w1 and w3 respectively.
        register(
            "w13_qzeros",
            num_experts,
            num_groups_w13,
            2 * intermediate_size_per_partition // pack_factor,
            dtype=torch.int32,
        )
        register(
            "w2_qzeros",
            num_experts,
            num_groups_w2,
            hidden_size // pack_factor,
            dtype=torch.int32,
        )

        device = layer.w13_qweight.device
        layer.workspace = marlin_make_workspace_new(device, 4)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Leave the loaded weights as they are.

        TODO(gyf): Marlin format is not supported for now, so no repacking of
        weights, scales or zero points is performed.
        """
        return

    def get_fused_moe_quant_config(
        self, layer: torch.nn.Module
    ) -> FusedMoEQuantConfig | None:
        return None

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
        renormalize: bool,
        use_grouped_topk: bool = False,
        topk_group: int | None = None,
        num_expert_group: int | None = None,
        global_num_experts: int = -1,
        expert_map: torch.Tensor | None = None,
        custom_routing_function: Callable | None = None,
        scoring_func: str = "softmax",
        routed_scaling_factor: float = 1.0,
        e_score_correction_bias: torch.Tensor | None = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
        enable_eplb: bool = False,
        expert_load_view: torch.Tensor | None = None,
        logical_to_physical_map: torch.Tensor | None = None,
        logical_replica_count: torch.Tensor | None = None,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        """Run the routed experts on ``x`` and return the combined output.

        Steps: select experts, compute the token-to-expert ordering, expand
        and reorder the inputs, apply w13 -> SiLU-and-mul -> w2, then reduce
        the ``top_k`` expert outputs per token with the routing weights.
        Expert parallelism is considered active when ``expert_map`` is given.
        A GEMV path is used when expert parallelism is off and the attention
        metadata reports decodes and no prefills; otherwise the grouped GEMM
        path is used.

        Raises:
            NotImplementedError: if ``enable_eplb`` or
                ``apply_router_weight_on_input`` is set.
        """
        if enable_eplb:
            raise NotImplementedError("EPLB not supported for `AWQMoEMethod` yet.")

        assert activation == "silu", "Only SiLU activation is supported."
        use_ep = expert_map is not None

        # Decode-only batches (without expert parallelism) take the GEMV path.
        only_decode = False
        attn_metadata = get_forward_context().attn_metadata
        if attn_metadata and not use_ep:
            if isinstance(attn_metadata, dict):
                metadata_items = attn_metadata.values()
            else:
                metadata_items = (attn_metadata,)
            only_decode = all(
                meta.num_decodes > 0 and meta.num_prefills == 0
                for meta in metadata_items
            )

        if use_ep:
            # Range of global expert ids owned by this EP rank.
            start_eid = layer.ep_rank * layer.local_num_experts
            end_eid = min(
                (layer.ep_rank + 1) * layer.local_num_experts, global_num_experts
            )

        # Nothing below applies the router weights to the input, so the flag
        # is rejected rather than silently ignored.
        if apply_router_weight_on_input:
            raise NotImplementedError(
                "Apply router weight on input is not supported for "
                "fused Marlin MoE method."
            )

        topk_weights, topk_ids, _ = FusedMoE.select_experts(
            hidden_states=x,
            router_logits=router_logits,
            use_grouped_topk=use_grouped_topk,
            top_k=top_k,
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
            custom_routing_function=custom_routing_function,
            scoring_func=scoring_func,
            routed_scaling_factor=routed_scaling_factor,
            e_score_correction_bias=e_score_correction_bias,
            indices_type=self.topk_indices_dtype,
        )

        num_tokens, num_experts = router_logits.shape
        if use_ep:
            hidden_size = x.shape[1]
            (
                src_to_dst,
                sorted_token_ids,
                expert_sizes_gpu,
                expert_sizes_cpu,
                expand_tokens,
            ) = ixfops.moe_compute_token_index_ep(
                topk_ids=topk_ids,
                num_experts=num_experts,
                start_expert_id=start_eid,
                end_expert_id=end_eid,
            )
            # No token was routed to an expert in the local range.
            if expert_sizes_cpu.sum() == 0:
                return torch.zeros(
                    (num_tokens, hidden_size),
                    device=x.device,
                    dtype=x.dtype,
                )
        else:
            expand_tokens = num_tokens * top_k
            (
                src_to_dst,
                sorted_token_ids,
                expert_sizes_gpu,
                expert_sizes_cpu,
            ) = ixfops.moe_compute_token_index(
                topk_ids=topk_ids,
                num_experts=num_experts,
            )

        # expand + reorder
        # TODO use kernel
        expand_hidden_states = ixfops.moe_expand_input(
            hidden_states=x,
            dst_to_src=sorted_token_ids,
            dst_tokens=expand_tokens,
            topk=top_k,
            src_to_dst=src_to_dst,
        )

        group_size = self.quant_config.group_size

        if only_decode:
            # w4a16 group gemv 1
            # pt_output_1: (expand_tokens, 2n) dtype
            pt_output_1 = ixfops.moe_w4a16_group_gemv(
                input=expand_hidden_states,
                weight=layer.w13_qweight,
                w_scales=layer.w13_scales,
                quant_type="awq",
                w_zeros=layer.w13_qzeros,
                group_size=group_size,
                dst_to_src=None,
                format="NN",
                tokens_per_experts_gpu=expert_sizes_gpu,
            )

            # act
            pt_output_2 = ixfops.silu_and_mul(pt_output_1)

            # w4a16 group gemv 2 + reorder
            pt_output_3 = ixfops.moe_w4a16_group_gemv(
                input=pt_output_2,
                weight=layer.w2_qweight,
                w_scales=layer.w2_scales,
                quant_type="awq",
                w_zeros=layer.w2_qzeros,
                group_size=group_size,
                dst_to_src=sorted_token_ids,
                format="NN",
                tokens_per_experts_gpu=expert_sizes_gpu,
            )

            # mul + reduce_sum
            # final_hidden_states: (num_tokens, k)
            return ixfops.moe_output_reduce_sum(
                input=pt_output_3.view(num_tokens, top_k, -1),
                topk_weight=topk_weights,
                scaling_factor=routed_scaling_factor,
            )

        # Host copy of the per-expert token counts for the GEMM ops.
        expert_sizes_cpu = expert_sizes_gpu.cpu()

        # w4a16 group gemm 1
        # pt_output_1: (expand_tokens, 2n) dtype
        pt_output_1 = ixfops.moe_w4a16_group_gemm(
            input=expand_hidden_states,
            weight=layer.w13_qweight,
            w_scales=layer.w13_scales,
            quant_type="awq",
            tokens_per_experts=expert_sizes_cpu,
            w_zeros=layer.w13_qzeros,
            group_size=group_size,
            dst_to_src=None,
            format="NN",
            tokens_per_experts_gpu=expert_sizes_gpu,
        )

        # act
        pt_output_2 = ixfops.silu_and_mul(pt_output_1)

        # w4a16 group gemm 2 + reorder
        # pt_output_3: (expand_tokens, k) dtype
        if use_ep:
            # The output buffer is allocated here and passed to the op.
            pt_output_3 = torch.empty(
                (num_tokens * top_k, hidden_size),
                device=x.device,
                dtype=x.dtype,
            )
            ixfops.moe_w4a16_group_gemm(
                input=pt_output_2,
                weight=layer.w2_qweight,
                w_scales=layer.w2_scales,
                quant_type="awq",
                tokens_per_experts=expert_sizes_cpu,
                w_zeros=layer.w2_qzeros,
                group_size=group_size,
                dst_to_src=sorted_token_ids,
                format="NN",
                output=pt_output_3,
                tokens_per_experts_gpu=expert_sizes_gpu,
            )

            # Entries of src_to_dst equal to -1 are passed as the reduce mask.
            reduce_mask = src_to_dst == -1
            return ixfops.moe_output_reduce_sum(
                input=pt_output_3.view(num_tokens, top_k, -1),
                topk_weight=topk_weights,
                scaling_factor=routed_scaling_factor,
                mask=reduce_mask,
            )

        pt_output_3 = ixfops.moe_w4a16_group_gemm(
            input=pt_output_2,
            weight=layer.w2_qweight,
            w_scales=layer.w2_scales,
            quant_type="awq",
            tokens_per_experts=expert_sizes_cpu,
            w_zeros=layer.w2_qzeros,
            group_size=group_size,
            dst_to_src=sorted_token_ids,
            format="NN",
            tokens_per_experts_gpu=expert_sizes_gpu,
        )

        # mul + reduce_sum
        # final_hidden_states: (num_tokens, k)
        return ixfops.moe_output_reduce_sum(
            input=pt_output_3.view(num_tokens, top_k, -1),
            topk_weight=topk_weights,
            scaling_factor=routed_scaling_factor,
        )
# Dequantize a packed AWQ weight matrix.
#
# Each program (pid_x, pid_y) reads a BLOCK_SIZE_Y x BLOCK_SIZE_X tile of
# packed words from qweight and writes a BLOCK_SIZE_Y x (8 * BLOCK_SIZE_X)
# tile of the result: every packed word is split into eight 4-bit values,
# which are combined as (value - zero) * scale and cast to the element type
# of result_ptr.
@triton.jit
def awq_dequantize_kernel(
    qweight_ptr,  # quantized matrix
    scales_ptr,  # scales, per group
    zeros_ptr,  # zeros, per group
    group_size,  # Should always be one of the supported group sizes
    result_ptr,  # Output matrix
    num_cols,  # input num cols in qweight
    num_rows,  # input num rows in qweight
    BLOCK_SIZE_X: tl.constexpr,
    BLOCK_SIZE_Y: tl.constexpr,
):
    # Set up the pids.
    pid_x = tl.program_id(axis=0)
    pid_y = tl.program_id(axis=1)

    # Compute offsets and masks for qweight_ptr.
    # Rows are addressed with a stride of num_cols packed words.
    offsets_y = pid_y * BLOCK_SIZE_Y + tl.arange(0, BLOCK_SIZE_Y)
    offsets_x = pid_x * BLOCK_SIZE_X + tl.arange(0, BLOCK_SIZE_X)
    offsets = num_cols * offsets_y[:, None] + offsets_x[None, :]

    masks_y = offsets_y < num_rows
    masks_x = offsets_x < num_cols

    masks = masks_y[:, None] & masks_x[None, :]

    # Compute offsets and masks for result output ptr.
    # The result has 8 columns per packed input column, hence the factor 8
    # in both the column offsets and the row stride.
    result_offsets_y = pid_y * BLOCK_SIZE_Y + tl.arange(0, BLOCK_SIZE_Y)
    result_offsets_x = pid_x * BLOCK_SIZE_X * 8 + tl.arange(0, BLOCK_SIZE_X * 8)
    result_offsets = (
        8 * num_cols * result_offsets_y[:, None] + result_offsets_x[None, :]
    )

    result_masks_y = result_offsets_y < num_rows
    result_masks_x = result_offsets_x < num_cols * 8
    result_masks = result_masks_y[:, None] & result_masks_x[None, :]

    # Load the weights.
    # Three interleaves repeat every packed word 8 times along the columns,
    # giving a (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8) tile.
    iweights = tl.load(qweight_ptr + offsets, masks, 0.0)
    iweights = tl.interleave(iweights, iweights)
    iweights = tl.interleave(iweights, iweights)
    iweights = tl.interleave(iweights, iweights)

    # Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7]
    # that will map given indices to the correct order.
    reverse_awq_order_tensor = (
        (tl.arange(0, 2) * 4)[None, :] + tl.arange(0, 4)[:, None]
    ).reshape(8)

    # Use this to compute a set of shifts that can be used to unpack and
    # reorder the values in iweights and zeros.
    # Each entry is a bit offset (a multiple of 4); the 8-entry pattern is
    # tiled across the whole output tile.
    shifts = reverse_awq_order_tensor * 4
    shifts = tl.broadcast_to(shifts[None, :], (BLOCK_SIZE_Y * BLOCK_SIZE_X, 8))
    shifts = tl.reshape(shifts, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8))

    # Unpack and reorder: shift out the correct 4-bit value and mask.
    iweights = (iweights >> shifts) & 0xF

    # Compute zero offsets and masks.
    # Only one row of zeros is addressed per program (tl.arange(0, 1)), taken
    # from the group of the tile's first row.
    # NOTE(review): this presumably relies on all rows of a tile belonging to
    # the same group -- confirm against the launcher's block sizes.
    zero_offsets_y = pid_y * BLOCK_SIZE_Y // group_size + tl.arange(0, 1)
    zero_offsets_x = pid_x * BLOCK_SIZE_X + tl.arange(0, BLOCK_SIZE_X)
    zero_offsets = num_cols * zero_offsets_y[:, None] + zero_offsets_x[None, :]

    zero_masks_y = zero_offsets_y < num_rows // group_size
    zero_masks_x = zero_offsets_x < num_cols
    zero_masks = zero_masks_y[:, None] & zero_masks_x[None, :]

    # Load the zeros.
    # Same 8x column repetition as for the weights, then the single row is
    # broadcast over all rows of the tile.
    zeros = tl.load(zeros_ptr + zero_offsets, zero_masks, 0.0)
    zeros = tl.interleave(zeros, zeros)
    zeros = tl.interleave(zeros, zeros)
    zeros = tl.interleave(zeros, zeros)
    zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8))

    # Unpack and reorder: shift out the correct 4-bit value and mask.
    zeros = (zeros >> shifts) & 0xF

    # Compute scale offsets and masks.
    # Scales are addressed with num_cols * 8 columns per group row, i.e. one
    # scale per output column.
    scale_offsets_y = pid_y * BLOCK_SIZE_Y // group_size + tl.arange(0, 1)
    scale_offsets_x = pid_x * BLOCK_SIZE_X * 8 + tl.arange(0, BLOCK_SIZE_X * 8)
    scale_offsets = num_cols * 8 * scale_offsets_y[:, None] + scale_offsets_x[None, :]
    scale_masks_y = scale_offsets_y < num_rows // group_size
    scale_masks_x = scale_offsets_x < num_cols * 8
    scale_masks = scale_masks_y[:, None] & scale_masks_x[None, :]

    # Load the scales.
    scales = tl.load(scales_ptr + scale_offsets, scale_masks, 0.0)
    scales = tl.broadcast_to(scales, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8))

    # Dequantize.
    iweights = (iweights - zeros) * scales
    iweights = iweights.to(result_ptr.type.element_ty)

    # Finally, store.
    tl.store(result_ptr + result_offsets, iweights, result_masks)
+ offsets_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + masks_am = offsets_am < M + + offsets_bn = pid_n * (BLOCK_SIZE_N // 8) + tl.arange(0, BLOCK_SIZE_N // 8) + masks_bn = offsets_bn < N // 8 + + offsets_zn = pid_n * (BLOCK_SIZE_N // 8) + tl.arange(0, BLOCK_SIZE_N // 8) + masks_zn = offsets_zn < N // 8 + + offsets_sn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + masks_sn = offsets_sn < N + + offsets_k = pid_z * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K) + offsets_a = K * offsets_am[:, None] + offsets_k[None, :] + offsets_b = (N // 8) * offsets_k[:, None] + offsets_bn[None, :] + + a_ptrs = a_ptr + offsets_a + b_ptrs = b_ptr + offsets_b + + # NOTE: Use this in TRITON_INTERPRET=1 mode instead of tl.cdiv + # block_offset = BLOCK_SIZE_K * SPLIT_K + # for k in range(0, (K + block_offset - 1) // (block_offset)): + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)): + masks_k = offsets_k < K + masks_a = masks_am[:, None] & masks_k[None, :] + a = tl.load(a_ptrs, mask=masks_a, other=0.0) + + masks_b = masks_k[:, None] & masks_bn[None, :] + b = tl.load(b_ptrs, mask=masks_b, other=0.0) + b = tl.interleave(b, b) + b = tl.interleave(b, b) + b = tl.interleave(b, b) + + # Dequantize b. 
+ offsets_szk = ( + BLOCK_SIZE_K * SPLIT_K * k + pid_z * BLOCK_SIZE_K + ) // group_size + tl.arange(0, 1) + offsets_z = (N // 8) * offsets_szk[:, None] + offsets_zn[None, :] + masks_zk = offsets_szk < K // group_size + masks_z = masks_zk[:, None] & masks_zn[None, :] + zeros_ptrs = zeros_ptr + offsets_z + zeros = tl.load(zeros_ptrs, mask=masks_z, other=0.0) + zeros = tl.interleave(zeros, zeros) + zeros = tl.interleave(zeros, zeros) + zeros = tl.interleave(zeros, zeros) + zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_K, BLOCK_SIZE_N)) + + offsets_s = N * offsets_szk[:, None] + offsets_sn[None, :] + masks_sk = offsets_szk < K // group_size + masks_s = masks_sk[:, None] & masks_sn[None, :] + scales_ptrs = scales_ptr + offsets_s + scales = tl.load(scales_ptrs, mask=masks_s, other=0.0) + scales = tl.broadcast_to(scales, (BLOCK_SIZE_K, BLOCK_SIZE_N)) + + b = (b >> shifts) & 0xF + zeros = (zeros >> shifts) & 0xF + b = (b - zeros) * scales + b = b.to(c_ptr.type.element_ty) + + # Accumulate results. 
+ accumulator = tl.dot(a, b, accumulator, out_dtype=accumulator_dtype) + + offsets_k += BLOCK_SIZE_K * SPLIT_K + a_ptrs += BLOCK_SIZE_K * SPLIT_K + b_ptrs += BLOCK_SIZE_K * SPLIT_K * (N // 8) + + c = accumulator.to(c_ptr.type.element_ty) + offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = c_ptr + pid_z * N * M + N * offs_cm[:, None] + offs_cn[None, :] + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) + tl.store(c_ptrs, c, mask=c_mask) + + +# qweights - [K , M // 8], int32 +# scales - [K // G, M ], float16 +# zeros - [K // G, M // 8], int32 +def awq_dequantize_triton( + qweight: torch.Tensor, + scales: torch.Tensor, + zeros: torch.Tensor, + block_size_x: int = 32, + block_size_y: int = 32, +) -> torch.Tensor: + K = qweight.shape[0] + M = scales.shape[1] + group_size = qweight.shape[0] // scales.shape[0] + + assert K > 0 and M > 0 + assert scales.shape[0] == K // group_size and scales.shape[1] == M + assert zeros.shape[0] == K // group_size and zeros.shape[1] == M // 8 + assert group_size <= K + assert group_size in AWQ_TRITON_SUPPORTED_GROUP_SIZES or group_size == K + + # Result tensor: + # number of rows = same as input tensor + # number of cols = 8 x input tensor num cols + result = torch.empty( + qweight.shape[0], + qweight.shape[1] * 8, + device=qweight.device, + dtype=scales.dtype, + ) + + Y = qweight.shape[0] # num rows + X = qweight.shape[1] # num cols + + grid = lambda META: ( + triton.cdiv(X, META["BLOCK_SIZE_X"]), + triton.cdiv(Y, META["BLOCK_SIZE_Y"]), + ) + awq_dequantize_kernel[grid]( + qweight, + scales, + zeros, + group_size, + result, + X, + Y, + BLOCK_SIZE_X=block_size_x, + BLOCK_SIZE_Y=block_size_y, + ) + + return result + + +# input - [M, K] +# qweight - [K, N // 8] +# qzeros - [K // G, N // 8] +# scales - [K // G, N] +# split_k_iters - parallelism along K-dimension, int, power of 2. 
+def awq_gemm_triton( + input: torch.Tensor, + qweight: torch.Tensor, + scales: torch.Tensor, + qzeros: torch.Tensor, + split_k_iters: int, + block_size_m: int = 32, + block_size_n: int = 32, + block_size_k: int = 32, +) -> torch.Tensor: + M, K = input.shape + N = qweight.shape[1] * 8 + group_size = qweight.shape[0] // qzeros.shape[0] + + assert N > 0 and K > 0 and M > 0 + assert qweight.shape[0] == K and qweight.shape[1] == N // 8 + assert qzeros.shape[0] == K // group_size and qzeros.shape[1] == N // 8 + assert scales.shape[0] == K // group_size and scales.shape[1] == N + assert split_k_iters & (split_k_iters - 1) == 0 and split_k_iters != 0 + assert split_k_iters <= 32 + assert group_size <= K + assert group_size in AWQ_TRITON_SUPPORTED_GROUP_SIZES or group_size == K + + grid = lambda META: ( + triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]), + split_k_iters, + ) + + result = torch.zeros((split_k_iters, M, N), dtype=scales.dtype, device=input.device) + + # A = input, B = qweight, C = result + # A = M x K, B = K x N, C = M x N + awq_gemm_kernel[grid]( + input, + qweight, + result, + qzeros, + scales, + M, + N, + K, + group_size, + BLOCK_SIZE_M=block_size_m, + BLOCK_SIZE_N=block_size_n, + BLOCK_SIZE_K=block_size_k, + SPLIT_K=split_k_iters, + ) + + result = result.sum(0) + + return result diff --git a/model_executor/layers/quantization/base_config.py b/model_executor/layers/quantization/base_config.py new file mode 100644 index 0000000..c8a8424 --- /dev/null +++ b/model_executor/layers/quantization/base_config.py @@ -0,0 +1,170 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import inspect +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any + +import torch +from torch import nn + +if TYPE_CHECKING: + from vllm.model_executor.layers.quantization import QuantizationMethods + from vllm.model_executor.models.utils import WeightsMapper +else: + 
QuantizationMethods = str + + +class QuantizeMethodBase(ABC): + """Base class for different quantized methods.""" + + @abstractmethod + def create_weights( + self, layer: torch.nn.Module, *weight_args, **extra_weight_attrs + ): + """Create weights for a layer. + + The weights will be set as attributes of the layer.""" + raise NotImplementedError + + @abstractmethod + def apply(self, layer: torch.nn.Module, *args, **kwargs) -> torch.Tensor: + """Apply the weights in layer to the input tensor. + + Expects create_weights to have been called before on the layer.""" + raise NotImplementedError + + # Not required functions + def embedding(self, layer: torch.nn.Module, *args, **kwargs) -> torch.Tensor: + """Gather embeddings in the layer based on indices in the input tensor. + + Expects create_weights to have been called before on the layer.""" + raise NotImplementedError + + def process_weights_after_loading(self, layer: nn.Module) -> None: + """Process the weight after loading. + + This can be used for example, to transpose weights for computation. + """ + return + + +def method_has_implemented_embedding(method_class: type[QuantizeMethodBase]) -> bool: + """ + Not all quant methods have embedding implemented, so we need to check that + it exists for our given method. We check this by making sure the function + has been changed from the base implementation. 
+ """ + base_embedding = inspect.getattr_static(QuantizeMethodBase, "embedding", None) + class_embedding = inspect.getattr_static(method_class, "embedding", None) + + return class_embedding is not None and class_embedding is not base_embedding + + +class QuantizationConfig(ABC): + """Base class for quantization configs.""" + + def __init__(self): + super().__init__() + # mapping is updated by models as they initialize + self.packed_modules_mapping: dict[str, list[str]] = dict() + + @abstractmethod + def get_name(self) -> QuantizationMethods: + """Name of the quantization method.""" + raise NotImplementedError + + @abstractmethod + def get_supported_act_dtypes(self) -> list[torch.dtype]: + """List of supported activation dtypes.""" + raise NotImplementedError + + @classmethod + @abstractmethod + def get_min_capability(cls) -> int: + """Minimum GPU capability to support the quantization method. + + E.g., 70 for Volta, 75 for Turing, 80 for Ampere. + This requirement is due to the custom CUDA kernels used by the + quantization method. 
+ """ + raise NotImplementedError + + @staticmethod + @abstractmethod + def get_config_filenames() -> list[str]: + """List of filenames to search for in the model directory.""" + raise NotImplementedError + + @classmethod + @abstractmethod + def from_config(cls, config: dict[str, Any]) -> "QuantizationConfig": + """Create a config class from the model's quantization config.""" + raise NotImplementedError + + @classmethod + def override_quantization_method( + cls, hf_quant_cfg, user_quant + ) -> QuantizationMethods | None: + """ + Detects if this quantization method can support a given checkpoint + format by overriding the user specified quantization method -- + this method should only be overwritten by subclasses in exceptional + circumstances + """ + return None + + @staticmethod + def get_from_keys(config: dict[str, Any], keys: list[str]) -> Any: + """Get a value from the model's quantization config.""" + for key in keys: + if key in config: + return config[key] + raise ValueError( + f"Cannot find any of {keys} in the model's quantization config." + ) + + @staticmethod + def get_from_keys_or(config: dict[str, Any], keys: list[str], default: Any) -> Any: + """Get an optional value from the model's quantization config.""" + try: + return QuantizationConfig.get_from_keys(config, keys) + except ValueError: + return default + + @abstractmethod + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> QuantizeMethodBase | None: + """Get the quantize method to use for the quantized layer. + + Args: + layer: The layer for the quant method. + prefix: The full name of the layer in the state dict + Returns: + The quantize method. None if the given layer doesn't support quant + method. 
+ """ + raise NotImplementedError + + def get_cache_scale(self, name: str) -> str | None: + return None + + def apply_vllm_mapper( # noqa: B027 + self, hf_to_vllm_mapper: "WeightsMapper" + ): + """ + Interface for models to update module names referenced in + quantization configs in order to reflect the vllm model structure + + :param hf_to_vllm_mapper: maps from hf model structure (the assumed + structure of the qconfig) to vllm model structure + """ + # TODO (@kylesayrs): add implementations for all subclasses + pass + + def maybe_update_config(self, model_name: str): # noqa: B027 + """ + Interface to update values after config initialization. + """ + pass diff --git a/model_executor/layers/quantization/bitblas.py b/model_executor/layers/quantization/bitblas.py new file mode 100644 index 0000000..be15f20 --- /dev/null +++ b/model_executor/layers/quantization/bitblas.py @@ -0,0 +1,502 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any, Optional + +import torch +from packaging import version + +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.quantization import ( + QuantizationConfig, + QuantizationMethods, +) +from vllm.model_executor.layers.quantization.utils.bitblas_utils import ( + BITBLAS_OPTIMIZE_FEATURES, + BITBLAS_SUPPORTED_NUM_BITS, + BITBLAS_SUPPORTED_SYM, + MINIMUM_BITBLAS_VERSION, +) +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.parameter import ( + BasevLLMParameter, + ChannelQuantScaleParameter, + GroupQuantScaleParameter, + PackedvLLMParameter, +) +from vllm.model_executor.utils import set_weight_attrs + +logger = init_logger(__name__) + + +class BitBLASConfig(QuantizationConfig): + """Config class for BitBLAS. 
+ + Reference: https://github.com/Microsoft/BitBLAS + """ + + TORCH_DTYPE = torch.float16 + STORAGE_DTYPE = "int8" # assume int8 storage + TORCH_STORAGE_DTYPE = getattr(torch, STORAGE_DTYPE) + # "original" or "rescale" or "quantized", + # gptq_with_bitblas prefer "quantized implementation" + ZEROS_MODE = "quantized" + + def __init__( + self, + weight_bits: int, + group_size: int | None, + desc_act: bool | None, + is_sym: bool | None, + quant_method: str | None, + lm_head_quantized: bool, + ) -> None: + try: + import bitblas + + if version.parse(bitblas.__version__) < version.parse( + MINIMUM_BITBLAS_VERSION + ): + raise ImportError( + "bitblas version is wrong. Please " + f"install bitblas>={MINIMUM_BITBLAS_VERSION}" + ) + except ImportError as e: + bitblas_import_exception = e + raise ValueError( + "Trying to use the bitblas backend, but could not import" + f"with the following error: {bitblas_import_exception}. " + "Please install bitblas through the following command: " + f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`" + ) from bitblas_import_exception + + if desc_act and group_size == -1: + # In this case, act_order == True is the same as act_order == False + # (since we have only one group per output channel) + desc_act = False + + super().__init__() + self.weight_bits = weight_bits + self.group_size = group_size + self.desc_act = desc_act + self.is_sym = is_sym + self.quant_method = quant_method + self.lm_head_quantized = lm_head_quantized + + # Verify + if self.weight_bits not in BITBLAS_SUPPORTED_NUM_BITS: + raise ValueError( + f"BitBLAS does not support weight_bits = {self.weight_bits}. " + f"Only weight_bits = {BITBLAS_SUPPORTED_NUM_BITS} " + "are supported." + ) + + if self.is_sym not in BITBLAS_SUPPORTED_SYM: + raise ValueError( + f"BitBLAS does not support is_sym = {self.is_sym}. " + f"Only sym = {BITBLAS_SUPPORTED_SYM} are supported." 
+ ) + + storage_dtype = self.STORAGE_DTYPE + storage_nbit = int("".join(c for c in storage_dtype if c.isdigit())) + + self.storage_dtype = storage_dtype + self.storage_torch_dtype = self.TORCH_STORAGE_DTYPE + # 4 Bits packed into 32 bit datatype. + self.pack_factor = storage_nbit // weight_bits + self.nbits = weight_bits + + # Zeros type for the quantized weights. + self.zeros_mode = self.ZEROS_MODE + + def __repr__(self) -> str: + return ( + f"BitBLASConfig(weight_bits={self.weight_bits}, " + f"group_size={self.group_size}, " + f"desc_act={self.desc_act}, " + f"is_sym={self.is_sym}, " + f"quant_method={self.quant_method})" + ) + + @classmethod + def get_name(cls) -> QuantizationMethods: + return "bitblas" + + @classmethod + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.half, torch.bfloat16] + + @classmethod + # Need to figure it out + def get_min_capability(cls) -> int: + return 70 + + @classmethod + def get_config_filenames(cls) -> list[str]: + return ["quantize_config.json"] + + @staticmethod + def get_from_keys( + config: dict[str, Any], keys: list[str], default: Any = None + ) -> Any: + """Get a value from the model's quantization config.""" + for key in keys: + if key in config: + return config[key] + return default + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "BitBLASConfig": + weight_bits = cls.get_from_keys(config, ["bits"]) + group_size = cls.get_from_keys(config, ["group_size"], -1) + desc_act = cls.get_from_keys(config, ["desc_act"], False) + is_sym = cls.get_from_keys(config, ["sym"], False) + quant_method = cls.get_from_keys(config, ["quant_method"]) + lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) + return cls( + weight_bits, group_size, desc_act, is_sym, quant_method, lm_head_quantized + ) + + @classmethod + def override_quantization_method( + cls, hf_quant_cfg, user_quant + ) -> QuantizationMethods | None: + # compat: autogptq >=0.8.0 use checkpoint_format: str + # 
compat: autogptq <=0.7.1 is_bitblas_format: bool + is_bitblas_format = hf_quant_cfg.get( + "checkpoint_format" + ) == "bitblas" or hf_quant_cfg.get("is_bitblas_format", False) + + is_valid_user_quant = ( + user_quant is None or user_quant == "gptq" or user_quant == "bitblas" + ) + + if is_bitblas_format and is_valid_user_quant: + msg = "The model is serialized in {} format. Using {} kernel.".format( + cls.get_name(), cls.get_name() + ) + logger.info(msg) + return cls.get_name() + + return None + + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional["BitBLASLinearMethod"]: + if isinstance(layer, LinearBase) or ( + isinstance(layer, ParallelLMHead) and self.lm_head_quantized + ): + return BitBLASLinearMethod(self) + return None + + +class BitBLASLinearMethod(LinearMethodBase): + """Linear method for BitBLAS. + + Args: + quant_config: The BitBLAS quantization config. + """ + + # USE BITBLAS_OPTIMIZE_FEATURES_CONTIGUOUS + # Instead of BITBLAS_OPTIMIZE_FEATURES + # If you want to high contiguous batching + # performance + OPT_FEATURES = BITBLAS_OPTIMIZE_FEATURES + ENABLE_TUNING = True + BITBLAS_DTYPES = { + torch.float32: "float32", + torch.float16: "float16", + torch.bfloat16: "bfloat16", + torch.half: "float16", + torch.int8: "int8", + } + + def __init__(self, quant_config: BitBLASConfig): + self.quant_config = quant_config + + def create_weights_gptq( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ) -> None: + """Creates quantized weights for use in linear operations. + + The function initializes and returns a dictionary containing quantized + weights, scales, and zeros + for performing quantized matrix multiplication operations. + + Args: + input_size_per_partition: The size of the input partition. + output_partition_sizes: List of output partition sizes. 
+ input_size: The total size of the input (unused). + output_size: The total size of the output (unused). + params_dtype: + The data type of the parameters (expected to be torch.float16). + + Returns: + A dictionary containing the quantized weights ('qweight'), + scales ('scales'), and zeros ('zeros'). + + Raises: + ValueError: If `params_dtype` is not `torch.float16` or if the input + size per partition is not divisible by the group size + in `quant_config`. + """ + del input_size, output_size # Unused arguments. + weight_loader = extra_weight_attrs["weight_loader"] + + if params_dtype not in self.quant_config.get_supported_act_dtypes(): + raise ValueError( + f"Parameter data type must be torch.float16, but got {params_dtype}" + ) + group_size = self.quant_config.group_size + if group_size is None: + group_size = -1 + # Validate output_size_per_partition + output_size_per_partition = sum(output_partition_sizes) + if group_size != -1 and input_size_per_partition % group_size != 0: + raise ValueError( + f"Input size per partition ({input_size_per_partition}) must " + f"be divisible by group size ({group_size})." + ) + + # Initialize or retrieve the BitBLAS matrix multiplication operator. + self._configure_bitblas_matmul( + input_size_per_partition, + output_size_per_partition, + params_dtype=params_dtype, + enable_tuning=self.ENABLE_TUNING, + bias=False, + layout="nt", + bits=self.quant_config.weight_bits, + ) + + # Initialize quantized weights with dimensions + # Quantized 4Bit weights packed. 
+ qweight = PackedvLLMParameter( + data=torch.empty( + self.bitblas_matmul.retrieve_weight_shape(), + device="cuda", + dtype=self.quant_config.storage_torch_dtype, + requires_grad=False, + ), + input_dim=1, + output_dim=0, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + bitblas_tile_size=( + self.bitblas_matmul.retrieve_weight_shape()[-2] + if self.bitblas_matmul.propagate_b + else None + ), + weight_loader=weight_loader, + ) + + # Compute the number of input groups for channel-wise quantization. + input_groups = 1 if group_size == -1 else input_size_per_partition // group_size + + # Initialize scales and zeros for the quantized weights. + weight_scale_args = { + "data": torch.empty( + output_size_per_partition, + input_groups, + device="cuda", + dtype=params_dtype, + ), + "weight_loader": weight_loader, + } + if input_groups == 1: + scales = ChannelQuantScaleParameter(output_dim=0, **weight_scale_args) + else: + scales = GroupQuantScaleParameter( + output_dim=0, input_dim=1, **weight_scale_args + ) + + if self.quant_config.zeros_mode == "quantized": + zeros = PackedvLLMParameter( + data=torch.empty( + input_groups, + output_size_per_partition // self.quant_config.pack_factor, + device="cuda", + dtype=self.quant_config.storage_torch_dtype, + requires_grad=False, + ), + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + weight_loader=weight_loader, + ) + + else: + zeros = BasevLLMParameter( + torch.empty( + output_size_per_partition, + input_groups, + device="cuda", + dtype=params_dtype, + ), + weight_loader=weight_loader, + ) + # Set attributes to indicate how scales and zeros are applied. 
+ set_weight_attrs( + zeros, + { + "input_dim": None if input_groups == 1 else 1, + "output_dim": 0, + }, + ) + + layer.register_parameter("qweight", qweight) + layer.register_parameter("scales", scales) + layer.register_parameter("zeros", zeros) + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + if self.quant_config.quant_method == "gptq": + return self.create_weights_gptq( + layer, + input_size_per_partition, + output_partition_sizes, + input_size, + output_size, + params_dtype, + **extra_weight_attrs, + ) + else: + raise ValueError( + f"Unsupported quant_method {self.quant_config.quant_method}" + ) + + def _configure_bitblas_matmul( + self, + infeatures, + outfeatures, + params_dtype, + enable_tuning, + bias, + layout, + bits, + out_dtype="float16", + ): + from bitblas import MatmulConfig + + bitblas_dtype = self.BITBLAS_DTYPES[params_dtype] + + with_scaling = False + with_zeros = False + group_size = self.quant_config.group_size + zeros_mode = self.quant_config.zeros_mode + if self.quant_config.quant_method == "gptq": + with_scaling = True + with_zeros = True + W_dtype = f"uint{bits}" + if self.quant_config.is_sym: + with_zeros = False + W_dtype = f"int{bits}" + else: + raise ValueError( + f"Unsupported quant_method {self.quant_config.quant_method}" + ) + + matmul_config = MatmulConfig( + N=outfeatures, + K=infeatures, + A_dtype=bitblas_dtype, + W_dtype=W_dtype, + out_dtype=out_dtype, + accum_dtype="int32" if bitblas_dtype == "int8" else bitblas_dtype, + storage_dtype=self.quant_config.STORAGE_DTYPE, + with_scaling=with_scaling, + with_zeros=with_zeros, + group_size=group_size, + with_bias=bias, + layout=layout, + zeros_mode=zeros_mode, + ) + self.bitblas_matmul = self._get_or_create_bitblas_operator( + matmul_config, enable_tuning + ) + + def _get_or_create_bitblas_operator(self, config, 
enable_tuning): + from bitblas import Matmul, auto_detect_nvidia_target + from bitblas.cache import get_database_path, global_operator_cache + + BITBLAS_DATABASE_PATH = get_database_path() + BITBLAS_TARGET = auto_detect_nvidia_target() + if global_operator_cache.size() == 0: + global_operator_cache.load_from_database( + BITBLAS_DATABASE_PATH, BITBLAS_TARGET + ) + + bitblas_matmul = global_operator_cache.get(config) + if bitblas_matmul is None: + bitblas_matmul = Matmul(config, target=BITBLAS_TARGET, enable_tuning=False) + if enable_tuning: + TUNING_MESSAGE = f"BitBLAS Operator {config} is tuning ..." + logger.info(TUNING_MESSAGE) + bitblas_matmul.hardware_aware_finetune(topk=20) + global_operator_cache.add(config, bitblas_matmul) + global_operator_cache.save_into_database( + BITBLAS_DATABASE_PATH, BITBLAS_TARGET + ) + TUNED_MESSAGE = ( + f"BitBLAS Operator {config} tuned and saved to database." + ) + logger.info(TUNED_MESSAGE) + else: + _message = f"BitBLAS Operator {config} created." + logger.info(_message) + else: + _message = f"BitBLAS Operator {config} found in global_operator_cache." 
+ logger.info(_message) + return bitblas_matmul + + def apply_gptq( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + qweight = layer.qweight + scales = layer.scales + qzeros = layer.zeros + + x_2d = x.view(-1, x.shape[-1]) + + if self.quant_config.is_sym: + output_2d = self.bitblas_matmul(x_2d, qweight, scales) + else: + output_2d = self.bitblas_matmul(x_2d, qweight, scales, qzeros) + + output = output_2d.view(x.shape[:-1] + (output_2d.shape[1],)) + + if bias is not None: + output.add_(bias) # In-place add + + return output + + def apply( + self, + *args: Any, + **kwargs: Any, + ) -> torch.Tensor: + if self.quant_config.quant_method == "gptq": + return self.apply_gptq(*args, **kwargs) + else: + raise ValueError( + f"Unsupported quant_method {self.quant_config.quant_method}" + ) diff --git a/model_executor/layers/quantization/bitsandbytes.py b/model_executor/layers/quantization/bitsandbytes.py new file mode 100644 index 0000000..e5a741e --- /dev/null +++ b/model_executor/layers/quantization/bitsandbytes.py @@ -0,0 +1,658 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable +from typing import Any, Union + +import torch +from packaging import version + +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEConfig, + FusedMoEQuantConfig, +) +from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase +from vllm.model_executor.layers.linear import ( + LinearBase, + LinearMethodBase, + UnquantizedLinearMethod, + set_weight_attrs, +) +from vllm.model_executor.layers.quantization import ( + QuantizationConfig, + QuantizationMethods, +) +from vllm.platforms import current_platform +from vllm.utils.torch_utils import direct_register_custom_op + + +class BitsAndBytesConfig(QuantizationConfig): + """Config class for BitsAndBytes Quantization. 
+ + Reference: https://arxiv.org/abs/2305.14314 + """ + + def __init__( + self, + load_in_8bit: bool = False, + load_in_4bit: bool = True, + bnb_4bit_compute_dtype: str = "float32", + bnb_4bit_quant_storage: str = "uint8", + bnb_4bit_quant_type: str = "fp4", + bnb_4bit_use_double_quant: bool = False, + llm_int8_enable_fp32_cpu_offload: bool = False, + llm_int8_has_fp16_weight: bool = False, + llm_int8_skip_modules: list[str] | None = None, + llm_int8_threshold: float = 6.0, + ) -> None: + super().__init__() + self.load_in_8bit = load_in_8bit + self.load_in_4bit = load_in_4bit + self.bnb_4bit_compute_dtype = bnb_4bit_compute_dtype + self.bnb_4bit_quant_storage = bnb_4bit_quant_storage + self.bnb_4bit_quant_type = bnb_4bit_quant_type + self.bnb_4bit_use_double_quant = bnb_4bit_use_double_quant + self.llm_int8_enable_fp32_cpu_offload = llm_int8_enable_fp32_cpu_offload + self.llm_int8_has_fp16_weight = llm_int8_has_fp16_weight + self.llm_int8_skip_modules = llm_int8_skip_modules or [] + self.llm_int8_threshold = llm_int8_threshold + + if self.bnb_4bit_quant_storage not in ["uint8"]: + raise ValueError( + f"Unsupported bnb_4bit_quant_storage: {self.bnb_4bit_quant_storage}" + ) + + def __repr__(self) -> str: + return ( + f"BitsAndBytesConfig(load_in_8bit={self.load_in_8bit}, " + f"load_in_4bit={self.load_in_4bit}, " + f"bnb_4bit_compute_dtype={self.bnb_4bit_compute_dtype}, " + f"bnb_4bit_quant_storage={self.bnb_4bit_quant_storage}, " + f"bnb_4bit_quant_type={self.bnb_4bit_quant_type}, " + f"llm_int8_skip_modules={self.llm_int8_skip_modules})" + ) + + @classmethod + def get_name(self) -> QuantizationMethods: + return "bitsandbytes" + + @classmethod + def get_supported_act_dtypes(self) -> list[torch.dtype]: + return [torch.float32, torch.float16, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + return 70 + + @staticmethod + def get_config_filenames() -> list[str]: + return [] + + @classmethod + def from_config(cls, config: dict[str, Any]) -> 
"BitsAndBytesConfig": + def get_safe_value(config, keys, default_value=None): + try: + value = cls.get_from_keys(config, keys) + return value if value is not None else default_value + except ValueError: + return default_value + + load_in_8bit = get_safe_value(config, ["load_in_8bit"], default_value=False) + load_in_4bit = get_safe_value(config, ["load_in_4bit"], default_value=True) + bnb_4bit_compute_dtype = get_safe_value( + config, ["bnb_4bit_compute_dtype"], default_value="float32" + ) + bnb_4bit_quant_storage = get_safe_value( + config, ["bnb_4bit_quant_storage"], default_value="uint8" + ) + bnb_4bit_quant_type = get_safe_value( + config, ["bnb_4bit_quant_type"], default_value="fp4" + ) + bnb_4bit_use_double_quant = get_safe_value( + config, ["bnb_4bit_use_double_quant"], default_value=False + ) + llm_int8_enable_fp32_cpu_offload = get_safe_value( + config, ["llm_int8_enable_fp32_cpu_offload"], default_value=False + ) + llm_int8_has_fp16_weight = get_safe_value( + config, ["llm_int8_has_fp16_weight"], default_value=False + ) + llm_int8_skip_modules = get_safe_value( + config, ["llm_int8_skip_modules"], default_value=[] + ) + llm_int8_threshold = get_safe_value( + config, ["llm_int8_threshold"], default_value=6.0 + ) + + return cls( + load_in_8bit=load_in_8bit, + load_in_4bit=load_in_4bit, + bnb_4bit_compute_dtype=bnb_4bit_compute_dtype, + bnb_4bit_quant_storage=bnb_4bit_quant_storage, + bnb_4bit_quant_type=bnb_4bit_quant_type, + bnb_4bit_use_double_quant=bnb_4bit_use_double_quant, + llm_int8_enable_fp32_cpu_offload=llm_int8_enable_fp32_cpu_offload, + llm_int8_has_fp16_weight=llm_int8_has_fp16_weight, + llm_int8_skip_modules=llm_int8_skip_modules, + llm_int8_threshold=llm_int8_threshold, + ) + + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Union["LinearMethodBase", "BitsAndBytesMoEMethod"] | None: + if isinstance(layer, LinearBase): + if is_layer_skipped_bnb(prefix, self.llm_int8_skip_modules): + return UnquantizedLinearMethod() + 
return BitsAndBytesLinearMethod(self) + elif isinstance(layer, FusedMoE): + return BitsAndBytesMoEMethod(self, layer.moe_config) + return None + + +def is_layer_skipped_bnb(prefix: str, llm_int8_skip_modules: list[str]): + # Split the prefix into its dot-separated components + components = prefix.split(".") + + # Check if any of the skip modules exactly matches any component + substr_check = any( + module_name in components for module_name in llm_int8_skip_modules + ) + + # Allow certain layers to not be quantized + set_components = set(".".join(components[: i + 1]) for i in range(len(components))) + set_llm_int8_skip_modules = set(llm_int8_skip_modules) + prefix_check = len(set_llm_int8_skip_modules & set_components) != 0 + + return substr_check or prefix_check + + +def calculate_quant_ratio(dtype): + if dtype.is_floating_point: + return torch.finfo(dtype).bits // torch.iinfo(torch.uint8).bits + else: + return torch.iinfo(dtype).bits // torch.iinfo(torch.uint8).bits + + +class BitsAndBytesLinearMethod(LinearMethodBase): + """Linear method for BitsAndBytes. + + Args: + quant_config: The BitsAndBytes quantization config. + """ + + def __init__(self, quant_config: BitsAndBytesConfig): + try: + import bitsandbytes + + if version.parse(bitsandbytes.__version__) < version.parse("0.46.1"): + raise ImportError( + "bitsandbytes version is wrong. Please " + "install bitsandbytes>=0.46.1." + ) + except ImportError as err: + raise ImportError( + "Please install bitsandbytes>=0.46.1 via " + "`pip install bitsandbytes>=0.46.1` to use " + "bitsandbytes quantizer." 
+ ) from err + + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + from bitsandbytes.nn import Int8Params + + def create_qweight_for_8bit(): + qweight = Int8Params( + data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition, + dtype=torch.int8, + ), + has_fp16_weights=self.quant_config.llm_int8_has_fp16_weight, + requires_grad=False, + ) + set_weight_attrs( + qweight, + { + "input_dim": 0, + "output_dim": 0, + "pack_factor": 1, + "use_bitsandbytes_8bit": True, + "generation": 0, + }, + ) + return qweight + + def create_qweight_for_4bit(): + quant_ratio = calculate_quant_ratio(params_dtype) + + total_size = input_size_per_partition * sum(output_partition_sizes) + if total_size % quant_ratio != 0: + raise ValueError( + "The input size is not aligned with the quantized weight shape." + ) + + qweight = torch.nn.Parameter( + torch.empty(total_size // quant_ratio, 1, dtype=torch.uint8), + requires_grad=False, + ) + set_weight_attrs( + qweight, + { + "input_dim": 0, + "output_dim": 0, + "pack_factor": quant_ratio, + "use_bitsandbytes_4bit": True, + }, + ) + return qweight + + if self.quant_config.load_in_8bit: + qweight = create_qweight_for_8bit() + else: + qweight = create_qweight_for_4bit() + # Enable parameters to have the same name as in the BNB + # checkpoint format. 
+ layer.register_parameter("weight", qweight) + set_weight_attrs(qweight, extra_weight_attrs) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + if self.quant_config.load_in_8bit: + return self._apply_8bit_weight(layer, x, bias) + else: + return self._apply_4bit_weight(layer, x, bias) + + def _apply_8bit_weight( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + # only load the bitsandbytes module when needed + from bitsandbytes import MatmulLtState, matmul + + original_type = x.dtype + original_shape = x.shape + reshape_after_matmul = False + if x.ndim > 2: + x = x.reshape(-1, x.size(-1)) + reshape_after_matmul = True + bf_x = x.to(torch.bfloat16) + + qweight = layer.weight + offsets = qweight.bnb_shard_offsets + quant_states = qweight.bnb_quant_state + matmul_states = qweight.matmul_state + generation = qweight.generation + + out_dim_0 = x.shape[0] + out_dim_1 = sum( + [quant_state[1].shape[0] for quant_state in quant_states.items()] + ) + out = torch.empty(out_dim_0, out_dim_1, dtype=torch.float16, device=x.device) + + current_index = 0 + for i in range(len(quant_states)): + output_size = quant_states[i].shape[0] + + # in profile_run or the first generation of inference, + # create new matmul_states + if generation == 0 or generation == 1: + matmul_states[i] = MatmulLtState() + matmul_states[i].CB = qweight[offsets[i] : offsets[i + 1]] + matmul_states[i].SCB = quant_states[i].to(x.device) + matmul_states[i].threshold = self.quant_config.llm_int8_threshold + matmul_states[ + i + ].has_fp16_weights = self.quant_config.llm_int8_has_fp16_weight + matmul_states[i].is_training = False + if ( + matmul_states[i].threshold > 0.0 + and not matmul_states[i].has_fp16_weights + ): + matmul_states[i].use_pool = True + + new_x = bf_x.unsqueeze(0) + + out[:, current_index : current_index + output_size] = matmul( + new_x, qweight[offsets[i] : 
offsets[i + 1]], state=matmul_states[i] + ) + + current_index += output_size + + # only update the matmul_states if it is not profile_run + if ( + generation > 0 + and not self.quant_config.llm_int8_has_fp16_weight + and matmul_states[i].CB is not None + and matmul_states[i].CxB is not None + ): + del matmul_states[i].CB + qweight[offsets[i] : offsets[i + 1]] = matmul_states[i].CxB + + out = out.to(original_type) + + if reshape_after_matmul: + out = out.view(*original_shape[:-1], out.size(-1)) + + if bias is not None: + out += bias + + qweight.generation += 1 + + return out + + def _apply_4bit_weight( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + original_type = x.dtype + original_shape = x.shape + reshape_after_matmul = False + if x.ndim > 2: + x = x.reshape(-1, x.size(-1)) + reshape_after_matmul = True + bf_x = x.to(torch.bfloat16) + + qweight = layer.weight + quant_states = qweight.bnb_quant_state + offsets = qweight.bnb_shard_offsets + + out_dim_0 = x.shape[0] + out_dim_1 = sum( + [quant_state[1].shape[0] for quant_state in quant_states.items()] + ) + out = torch.empty(out_dim_0, out_dim_1, dtype=torch.bfloat16, device=x.device) + apply_bnb_4bit(bf_x, qweight, offsets, out) + out = out.to(original_type) + + if reshape_after_matmul: + out = out.view(*original_shape[:-1], out.size(-1)) + + if bias is not None: + out += bias + + return out + + +def _apply_bnb_4bit( + x: torch.Tensor, + weight: torch.Tensor, + offsets: torch.Tensor, + out: torch.Tensor, +) -> None: + # only load the bitsandbytes module when needed + from bitsandbytes import matmul_4bit + + quant_states = weight.bnb_quant_state + current_index = 0 + for i in range(len(quant_states)): + output_size = quant_states[i].shape[0] + # It is more efficient to use out kwarg like + # matmul_4bit(..., out = ...). Infeasible now due to the bug + # https://github.com/TimDettmers/bitsandbytes/issues/1235. + # Need to change after the bug is fixed. 
+ out[:, current_index : current_index + output_size] = matmul_4bit( + x, weight[offsets[i] : offsets[i + 1]].t(), quant_states[i] + ) + current_index += output_size + + +def _apply_bnb_4bit_fake( + x: torch.Tensor, + weight: torch.Tensor, + offsets: torch.Tensor, + out: torch.Tensor, +) -> None: + return + + +try: + direct_register_custom_op( + op_name="apply_bnb_4bit", + op_func=_apply_bnb_4bit, + mutates_args=["out"], + fake_impl=_apply_bnb_4bit_fake, + dispatch_key=current_platform.dispatch_key, + ) + apply_bnb_4bit = torch.ops.vllm.apply_bnb_4bit + +except AttributeError as error: + raise error + + +class BitsAndBytesMoEMethod(FusedMoEMethodBase): + """MoE method for BitsAndBytes. + + Args: + quant_config: The BitsAndBytes quantization config. + """ + + def __init__( + self, + quant_config: BitsAndBytesConfig, + moe: FusedMoEConfig, + ): + super().__init__(moe) + try: + import bitsandbytes + + if version.parse(bitsandbytes.__version__) < version.parse("0.46.1"): + raise ImportError( + "bitsandbytes version is wrong. Please " + "install bitsandbytes>=0.46.1." + ) + except ImportError as err: + raise ImportError( + "Please install bitsandbytes>=0.46.1 via " + "`pip install bitsandbytes>=0.46.1` to use " + "bitsandbytes quantizer." 
+ ) from err + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + if self.quant_config.load_in_8bit: + call_fun = self._create_weights_8bit + else: + call_fun = self._create_weights_4bit + call_fun( + layer, + num_experts, + hidden_size, + intermediate_size_per_partition, + params_dtype, + **extra_weight_attrs, + ) + + def get_fused_moe_quant_config( + self, layer: torch.nn.Module + ) -> FusedMoEQuantConfig | None: + return None + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + from vllm.model_executor.layers.fused_moe import fused_experts + + if enable_eplb: + raise NotImplementedError( + "EPLB not supported for `BitsAndBytesMoEMethod` yet." 
+ ) + + topk_weights, topk_ids, _ = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, + ) + if self.quant_config.load_in_8bit: + w13, w2 = self._apply_8bit_dequant(layer) + else: + w13, w2 = self._apply_4bit_dequnt(layer) + return fused_experts( + hidden_states=x, + w1=w13, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=expert_map, + quant_config=self.moe_quant_config, + ) + + def _create_weights_4bit( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + quant_ratio = calculate_quant_ratio(params_dtype) + # Fused gate_up_proj (column parallel) + w13_total_size = ( + hidden_size * 2 * intermediate_size_per_partition + ) // quant_ratio + w13_qweight = torch.nn.Parameter( + torch.empty( + num_experts, + w13_total_size, + 1, + dtype=torch.uint8, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight", w13_qweight) + set_weight_attrs(w13_qweight, extra_weight_attrs) + set_weight_attrs( + w13_qweight, + { + "num_experts": num_experts, + "input_dim": hidden_size, + "output_dim": 2 * intermediate_size_per_partition, + "experts_shape": ( + num_experts, + intermediate_size_per_partition * 2, + hidden_size, + ), + "pack_factor": quant_ratio, + "use_bitsandbytes_4bit": True, + }, + ) + # down_proj (row parallel) + w2_total_size = (hidden_size * intermediate_size_per_partition) // quant_ratio + 
w2_qweight = torch.nn.Parameter( + torch.empty( + num_experts, + w2_total_size, + 1, + dtype=torch.uint8, + ), + requires_grad=False, + ) + set_weight_attrs( + w2_qweight, + { + "num_experts": num_experts, + "input_dim": intermediate_size_per_partition, + "output_dim": hidden_size, + "experts_shape": ( + num_experts, + hidden_size, + intermediate_size_per_partition, + ), + "pack_factor": quant_ratio, + "use_bitsandbytes_4bit": True, + }, + ) + layer.register_parameter("w2_weight", w2_qweight) + set_weight_attrs(w2_qweight, extra_weight_attrs) + + def _create_weights_8bit( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + raise NotImplementedError + + def _apply_4bit_dequnt( + self, layer: torch.nn.Module + ) -> tuple[torch.Tensor, torch.Tensor]: + from bitsandbytes.functional import dequantize_4bit + + w13 = dequantize_4bit( + layer.w13_weight.reshape(-1, 1), + layer.w13_weight.bnb_quant_state, + ) + w2 = dequantize_4bit( + layer.w2_weight.reshape(-1, 1), + layer.w2_weight.bnb_quant_state, + ) + w13 = w13.reshape(layer.w13_weight.experts_shape) + w2 = w2.reshape(layer.w2_weight.experts_shape) + return w13, w2 + + def _apply_8bit_dequant( + self, layer: torch.nn.Module + ) -> tuple[torch.Tensor, torch.Tensor]: + raise NotImplementedError diff --git a/model_executor/layers/quantization/compressed_tensors/__init__.py b/model_executor/layers/quantization/compressed_tensors/__init__.py new file mode 100644 index 0000000..6655f89 --- /dev/null +++ b/model_executor/layers/quantization/compressed_tensors/__init__.py @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + diff --git a/model_executor/layers/quantization/compressed_tensors/__pycache__/__init__.cpython-312.pyc b/model_executor/layers/quantization/compressed_tensors/__pycache__/__init__.cpython-312.pyc new file mode 
100644 index 0000000000000000000000000000000000000000..1816ed7ad0cd0a25bf6fda8e25e35d28667f5dd3 GIT binary patch literal 203 zcmZ8a!41MN3`}T|5K;%>QSt;N#0abqh1P^1_m4F&rC>nA>q3Iw*b#6>e_G-wouW*$Q5F?bDppK6JTHD%^)^p#zCTq@U T5H?}V)siO;Dgs(>X8Ft28 zVOPu@cC$F!Tv4nzTpaU+Juz?C%fjq)C9%?Qsgk!W<_r6nzhJIBRuQgXe#cy8tSVf^ z{DpJXv6^sAtTtR5^N0N`%sE#Vs}I*RziX}`76=Dojp0W2?4H{ZYYI0pf6-iXtR>tM zYYn%?+QMzI_HcVF7!JmEhIhtxg?Gg|!X2^BaAzzO4#m2{U9s+PcWiffH_KZ**Awdv z_s06deeBsY*B=`Q55)F__rwOngDlKDw>LHv9%BBIxqY$y;r+~CI(Hy;FnrL&@fN|3 zzA2kK6gwO~Y~suuU%?N*i(aNHd?f0-UjHyr%sF?lPtsYH>85ovk6|9nv)!7uOwy{=HatW zPsJ1CL5ox}vJhi=5k9sc#HJEb^~9AaAwHY9krg$vFh4VUK`K$BCKAHc>BLom2MF&| zSEuF^vx`#%dsw&-mweeFNII3^FWksYH+ka|2erPd^9@}K+3W!Bxi{S z%~EN0u+l^|jMpIpK#kfLd~}kZk1q&u)JRSdERy`s_6_eIm^e55^u%aSUu5E$bLY-J zJvn+ba^%G1gyeg69I^SKWm_Qj^huRl0w<;~@-bej+7f{x`^K*H8OIEcT^Zaa=EOYS zuo1M~0|c(#IEuY@9yK(m$lx$5&HyN4P~4zVd4uQ@s$|91@~F4)a$2{<$S+mH7`gi; z{{7~Zmcw}liF?)?6ZA8Hv`GK$z z!=NmFapCG*G%|Z(enH?Pb5l1MK9&%!ojN}<1#k)in56RQtBJX(csw$5r7uFGaY0D% zQQ(bt`l=xC^NGlnxv2yJMp)6a{OpB`iTIhRD_3|ys@tZWO7lE%Wi&>n)XZCWD{oUt6L06O zj7V8|Yt+W#3Y0jz7FUpy$3)~T>M#f^+!{9HUFK#?QRk~gU^N3#_p8=$Vblf4&Px?) 
zrS&`2WX2pVLMk_3gmfUG(c)LF?*=$@z-_tIzyptb5yRNtj`)u-R1 zrD&&%FOQb7vg}bGAXK2N-1fMl+>A9^o>g+?)--%oc7Lh8CHEe3S7ha`HppFrG?iIt zYFUew_klK6WySgN?gpdPd~vjfuLC=%LmTR598v$Pws3v4j&F$8^9_L4fYt=Eo&$Jp zRG(QMd$bW@JMzMIAgn1btO;Swd11{6Ysm|1L0D^ESS!NX^1|8>)}9yEj<8@}SP)@5 z^TKu_Y!~`quNJ-w?@`7?SF{5?(TSeuU@3kYYzXO4fD;`GPILiQXSADPic$*tb~ow} zVtr_i_Q2o8cWX70Yl3&6)Tx&*P;1BDQ{SsL4TpCxbJtDbo(V2EWI*+0UNkP1$mB|) zW*}l$B)d$HBsZg35sg@>R6-_vEU8Z;9V!i*jVC0}{>G6$*Mis@ubSH z+>#Yfb}i0^Cy*KWS}MSkLwj-vwF#BbahH5*xPD)9sB#+exzz9*kcV@h-!Y8`-I5Jd z8^qWVi7@daBJ?ANFhEWpIeW?3L(U*ML*#hiNX4;*C_fjuHZ?aJg`6YoLtxM;S>ycN zjGR@6q~~Yg7!XP?6-FY{jDJMpB;^Ox>(<{Ux)QM~T~}tWbjqmEdHw31y@Qwc@LhAW zFLhnH0qJMH>)PC0tV^y3f1L*}S`fMzzlwKdbGEK&Mb;xeC`&O0L7^)*ghQyrVqgnn zlF>4x_R%;}#d$bCxy3!M;0lXhD-a!38;+WEg@4)lx-0E(U$$;Isy7>Z=w6?02+`f2 zZW^F_Al*Dj_eLf84_vhmntM~tN5tkM$&rcm=E>!w-*q=`Hn+3;BNW6*c7cc3IX|t4 z;1s5#W)&WQm|ENtr=q|FCmcXz&?I!>_Kryy#w|_>1HuSgsU%B^h%uRRF}Pj(w479;vRT9RwvgzJxNE8a1ALBq>UCej>{T-m$dPs$6m0g z4Yo=)V8ir9sbFSqVJgwvqj+`?NRF3S(C&eEOp;}KE}q>TRn$gJkReei5=}s4k1uvV zsaK40Tt>CxwQ$fm-tsq#-YELY)vw-s>*jm>H|F1&PkIg{9S62{6}5$t8*2NT+znIE zJpQ9G_}}3KhId)-u9PYpo1F()yE66C>>N~=#hp*8W^USfK#da#y2WjJNXyU zoDnjq;{c>j%Y*zH;njg4zX3vJ<)<33YuoZQBCVzbYfwr>`5H2BTRt;%=y}PPm0znf zYU<8SMNh9ZZd)&d)LC$~R^Q-mDAb}razaL*yAg@bPA3>KR|Dq2No5w1OVfj9Mp>ki zi!+hLLWBr2v0|{6IK9b$N^NAYS<%~5n=MJN0#N)-IJdZTV^6B_fY^B8ZZvgpQam`h z-uR46YAf1O6+L1_&syKN_kDBUdc`3|kem8aO^3v$Lut1+*QFZOr{5v%SSg2EmkDmEU?QhymbA7C zu0)-hQ!hI=b)sHPq4tnak!V!IF6~f#`h5vKs-~U5kO*4G1$qV0U<-#6pm}8O0lt3* zVUm-*ZARyUrIL>jJ2a&x3b9Bb!V=ELXXoRIsrhMMC}jbR(=csaXhQ%Kprnc`Q`46r zra-8*P^7@D2GAds9V*<~4*v2w{WHGe~BrC|4runyfm#{PwjwiBvn60AOB(f&!^mAS>1kBjvQ4>G4OW2}`U+@)8Cn$R0ccsl zd$xC#wnBZhk25|b*=0=-60W9J1PiuTK=ed*fv{Av5=)aTU~7^EI4U*g8pU+AGQx(L zqBB-RNbiIysj8oHIL6Rf0F|qv>px7B(itYHJ_bX?l~urE({`gHE78{p&Od;2i~GejLo%T1R z{6|Iq(f7xaex#d7I;zusds2PlV&8bWtQrih|E%c|XYO~3j@?N|ZL)4Rc%gsCSBKsj zO12KJ*X~U^s(#=pP1koJdAhnK9q3Ir_LIM^^RdU_b!Iq6p>r#S6OUsU&M^!(7Fu*{ zSMlMx>83gE);XlgkuJbw28yqZflCHL*pi0LH_c1tO9o=rbyM64{!6M1yHw~$xi49! 
z&9mIJ`Ln3^O$%_*{AKg&1yG=Fv~mdO zG!+ZT$TSp~yvPS$%+_QTTj+}c63l2uL%0zjgivyXF2xt-bzri}I20HTB(7eW;{z|x zCN2gNFE0f6IoQ_c6LBpmmA^ZLgjsM&d;{wV$ohAH@v;C3^zlGqA&|Jp&k86OR^IrP zh50B_=0GEed4nS-zK2Sqp%++~$%idlZ2}nB8VD^fMpzuLNZ1WWvNHLU-luSwo(tr< z#)Z#O5RJc}J!?RAQyRORMk-;oQ;k8yXqkm0PI4DjUJ+RZk~$Yd+l@wJGH@4p;4tn$kPcY1CQCEax&*7T-oj)*l!?j2gMnOOF0 z_6(#E$s3Hk-gC~&OF_82OBpn0)^I;*SW3BsFWzOM!)@hBQpq?~**r5CtLbELi z>OcVVpm=tDBJ(i4t1yHt=3&gHvRIN@l9tE1WHn+qYPzG`Z4*f;8bh%gGD4j>6)Y7T z{=>R>oV`m3e{Wh+@=Y%re;ID8Rx36`a>=Wf zq~8-H$Fj@F40*X7L(ug4AD-mCX!@CYS*w{^L#;-vS6~036&Uck#fot2?p#{yIH(Et;(1NuyzPO60~=yIt@W)YDK!ot-nap4Aq z6(|5;K9{BRtn5LTa2;uw$Rm)wg;7z-h6(amTp$C1aG0DR9LY)+FM$+9$u@TS?C@mJ zokc~NFvASQHz|cFqMGF8DzUlDzU5pw^sS@6H@052Z@DPl(2;8B z5gU4-J-l{yd3bp#U9lro(JEH7uH0C!*u8x0Kh`wfwx>HnNpIsytLP0r*mEqk=e)S* zJamJUuUYgpuROo*>jJ?ZELGF221RdQ#=#vijhWQ2Yh|J+<~?0>(z&E+X1AiK3N%jY~jiW)@C+*2R^JGy1Re9`go>* zD{0IWb1l15&3$5XU$X!Bdh-cD{GhDqe%s(jPR`%C>F->f5&ivZv!efS%0DXlM;|mO zY~zu|R5y^ZSgLC>X852Z)pe%)`$Ye~bf6;@=oSOr$=yfS1EXkJT|HP2s@E!3?N0WL z-me;axR0ydk+E@s{>*U`=WR?n09 zj5j9X$>xoQ_C_TaR3Ymt=fJ@k{CHkL97Y~tnVi?i`MU(dt5-O}#AC^+he(B5!x;i8l!_54sWJkq&Gcn;a#BmV z*tH#UZj+&i@RQ^j&?@Lw9jV%3v37X9b_A-`rmHsTYD-rKQ`OyKb@zI8&+^F+JT+;5 z>uN~!58f{Ppsecc>IXHg>8g=*)!}q)Fj=!Rx%2S7ntSf|hm%#~j|!}{6&cQ2UXiKc ze052;KjY!*+6;2n?7KUj+;{5z9^^{bgjN?<=k7KptM+F>aLN!i%c|rti(V?g-Hy8rcWAmxR)}ZT@v$Fy0sTir zS>JJD=^iHCX?2`HI=AGW937uH`*h^k(`TPKC)qHme3i!-yb*(~P?(*ToJ`J%#Ag?I z;T2?H!in%j%IcWE8jHL%n~1CS%io~*-=wH=EOwITsAM@wL030|bChvXA1p!$ zn<0#l93&Ouv5%EtV6*W=DDjoPslLekHQX25wxt2N3EOGxW2Gp@KFu*yC0z|G`;tuq z>9Q(37geX+jiS48qh)WZWlU@tOSPO9@pGSk$eD`HnbO|cl($**HgB}=Pqm*A+fSt0 z&x-hY&w>jV){~sjoT}^;D?9I39$VX$>NzI%99woPpqrpTt_Y$#c;9_;^>Qk7R16(` z-zP%v64z=&svn+?{INBjV@YLnZHH1qVcLV57M&)jTBP=N+OVE0C+aG35)2 zzToOpqAzsccY1AKs`r@Kd+hz)V(;nYLMnX6L(W=Mm-f~@wBWw!t$t|39Wf=14-3fW z;7aN;F1unJ7v7z)|#K6a+DDN_W_b2 zeogD@WU_kq{pu5Im+ws`2Tm*(Z@R0tLw-WfdLWC6+sp3F+O||}k67EY_L5jT2)>y{ 
zTvbr43T|{BNOg{g_*IQSuqWwU!%Z0%Yaenp!o`+{7Th<1lQ!H57i%6CkPo=n_^^3}rlSRl!OlK4i?Drp?z??qI|EYlc7bX0i@4OeTEKyL8iw5jQmAOEMLY z6NfYz39MNV4RCrph$P+u+ho4`ddPf*fQ45#XrR8@yq)sggd z-1i+{dm+_-Te{8Q9&%YYN!Lns?;!asvgvQ3^nH99VQgF+2QnXpDt z-=wq-;C^D^GCz-L>3BpLo7;>jC00%!`Q$ORHGPc6)W1Zz|4+u$efM)Ku2i5;4D@Xb zo=y!uEe<}N8hlQ~FYp}k!4fd4n$}cxr&!&YEbqKuK1yQVsMs^Q?9>L+b2OMbQacZe zI}g)f3L6e4O|bgf69$vap4g-p`XK*Olo(Qip3Dx&_LFyQ#||)&_KRj)HpU=+(QJ!U zD1#xw*w*#~NLHMn0x(ue%!>rLPhwtJ3`4>llk3QIuysJuFp>up8(M!rkNR*@7J2AQ zi}W9bS;<yQgBx32E&~7l7neu zq{R-=doZP-k0VUV7^pRRptQG$GudA2uuU1AoLg` zh0rald4n=%ZGO^lpSdEZ`ujA%{|4>H0C(hBKz`K-5iS>P0u!2QhGKEOi z_7cJ9+L!7&Ds~-x;0~nRZKAu4dD=yHJM*-P?$%6!^#tb9Icv8w>FD^#%@x&u;BETA z8~DK62h%4;vAZ(ms=x25PnT8VsVS7&u}9poC+TQN!{4+=Y}&K2?`&${c@e*+^Oyxm zds^>%Mv}W=(jD0<#cGUqQ5oyNTraMh*0)WHC&GSY{tF7=NhW_Doi`nvk`G^i{H!5R)Zq30X~Oyb?o&bux>i zI`VWvCYZ_`HIJF*+f;-N%c;M_^~A??%JP=^`$GDjI50q#!p~~ zpU4WerD9K`wU|EOE}`l97tN7HE$?YFV|cA(B;zh+tBXn|S`e+SFM$!jHvXe9VnD0S z`os`c_I??G!k@u;$C76>ixW_YJ+j18q7glkbK|im*z5Rxpt!e*?My|mt_uRg^+WfP2++C8w?@+st(-2*?Q*LfS@bOA=5y@3JSZP; z^xBDy6B;P5U*73x*NwS(wk0NeZ$S+R&id?iWPV{DM{GdyRmEjCH7NWZs=NhhDhX*` z9-qP-h&+L)SpA;!&PNlw)EF|y(G~Y5I%FTmT?(l_{f=?4TYXaVU<*T!8g{8)_38JhR=4bJ zf>A5B#DI9v#V{01Jpan6EsqxM%};?<(6l3kU6=N&KK-uNF=~IWK$V_JF%wSPBF5` zvH_d&rKrUTb(NudOi>`Zfc-65D<84ik)3c97(YLDt`F-Sug=fYu9z(nB0CBTCrO1S zu~D?*;#3@lZ(0?1VQ%52sW~hboSNguuuJVVlnWh`L{q5(@_=HqlM~B}FA>cwgqxI9 zsWbT{kCIAV7nRdsYLGz5gdf0J>=|Fc_8j?)Gd`;B)n`GNdVUGpfs~b=FHh0(^Q-gG zP_SJ-_+yoT1jr#XgHTTn>oZm>tz(71q=*OP5Py~mV^adoql)zO$;J}e5g{~EOfl_h zl~-S31y@w4p%|k5Y`wYgkI6?QQ?l~&*W!}@{OQwYBF{{WMp&muMkXVpXP!AdJbB{m z_=HeL3GMPZX_EWpDS_TcWPvu;GNqJh@5E+=NpjfKM>F|I_7%QGP7^tQNY1|?hj^z{ zHolNJ5yLhq+8M=1bC!;)K2LDGU%WyLz%!H11riTD$y z^$pXzAjii$q3=OZEXHQMWNqgg6>EK|-lJmg(PcXsD6v^RAi4r8{?*TKxb{6L>sT)f zCEcM-mnY?F6kUzjzv|j>?SD|V3w!ob)giGuqV$I?E-oshI zhehA;hn1Z=5DjeoPT3aB48a6eOM}J`e0xdhV=w1FaQFOr?NNx^lB*Vba#Nlr(bKeY zdTrN+=iq~~_KmWg??qFgv-d-1AJq1)WA@3{|DmrYfv9{DwR_NCi9QtdS5^uX7V@*NU=ht_?=@0Ua65026%%OJK2w7{6> 
z3!;d0%kE8o+kOAYnk&^kB6g1?J4PNC*!)<`YePAuC@0<6nQ9ym8wb+82UERcV((b8 zYwWSjLTOMJ^cvZd8by!%x;^{T!M^*!b9Y}z9XKZ*IG5autp=?RD!a8FN$(m)cIp%h z+tN(gcZ!~!tNZWPZg@s>lXj(>yHd@2#pb;^FY1YTy0_005R?fiOMqNvSn`sMs4)!h_De;q zvN=vv+tCI+&Nzi8XQFM!DYX4sFw`3!FY%;y=|@cu6O zHP)SAzgKz|z2KVQnz-FKb?9YSfS$)YH*v53`E|?l+{=&#r~F`+PIZZbfh45t+aPIg zK5%t@mUgFM-yQ92hCDPkx9~E_c`r}h7;NoDXE|pB9^0xdU~A0OeBdQMFrFJ5 zc=_V&^u+*=!{;3%nw?M5f!qRrfxphS@&FPY?5V^o2swbziuw7_8FmH<+wvXKyWl9F;HML45)KRF z1*(w-hc*o{Ns_t|$CIdA$FZSN1CK*(Sq~dD8qLkxO|EP*4=4wiDzG#-*n*Ce=BOt0 zzP3o?6YOA>03D&rnmao`!{a>CX+ESufRx=2WRr?!D3yBB$~d+dy-2}!`3M$yJL7*v zi14fAe1RO20wI`dVxgoTRw~PQk}ZmJ=J=phDpVQ@saH>4C7)TRA|*ERD?8m5uMJO6 zqgmJ+fk0MsU}vWWLqU)D*6mp@-N@>Mz@&W2T_`X%H3yOWLf|C`;MAkr9Tl9_7}mPQ z%4cw1%>|{rp+JkgMYSa!tP*xo7f{y;QF8jofe{PR`(^${T;85fh6o(0f!!N4-yvA& z(RrS>buhsihs?@{#lq04oFEIWfeF_`l%9@F6Mjg}ep>HS@-Adz|su1k@`+3^8Xs^rzqDo~FQ_-_Q_BXW2++w51J zM%3btE!a20POJ^-GW-n^+-VaOc6zmP?uN9x;!fhLpMUG~cbii~&xk|Md~7khX&1kb zRvEPJOge(g?%SVFGpYT)tMNe{Wb&4Dr9WNePgRA)s!*z`SFGw?uj)_N??~0}73=pt zF0|DbFONepsBMtN__cb7%gd+IwXJkY&gxzWuQ>9irsb<$Z*^h+>3YpUM3x8MjJy$9 zd2YQtv^)lbu|JTi?Gn9b83Cs=Qk) zC#iVt^Y3FbELX7`!g)n8`o+x^7Ns1`qN91`e5&Q3*m7{a^_|8J|T9WczgF88 zK&+ew%cB)IG@oZY<^3*Di7ugnHi*KMT8fmEFYk(4Ald=<=z@Xplc8bl))S!FqnZH_ zCu3pr=u`_TqEXZl+7_vC2BORIaVFGCg^uIw5XN5Vq-jRpE_Yo6*95f&sey$=-{(%7 zLbsSNLV*)AfB-&&A3?%#?tBN>8l8<9z$UN&ZAZoSG3fAGwX!O~U%vvD0iFPIc@0Je zovkTn22fRpvZ}2C8Bz~ZVWfmeV-{FCspQo8>~;_bWCf@K$YN1QIWCK}bFmECF~^DO z2Wzk$s;^hV3~4YAo^Ytyj0~M5&g1xueiE#$9KKY5ljBHem>k2bBe6-z&UQ8`Jd#<{ zNE;LG!GS!%-Vbpb#=3GBR+N!NkuBVVtRgSni!bilj`J8K{vo}s(*Tas6G7=JI>d^O zR7I~?(fhrM{^hZBZ6h7+v@*M1+r4~x)9V+#?b&9QROd0V^VquY*bgdeGeunG0Qiiz zTvn}#RpzMHM2Q#`dl7}wbTeC(V7O{6YigL@ao%P04%pPN<;ictC|_0%cV01gG|F+& z3}si{-D+r-Ub154DW*c$nmz6oR!N|xP^w;$eWrw0Ey+l!H7YYJhfmF`-;HuyFhi~9 zZn0@VlrP^oqXXub?AJ`R#v`i?16A9ojI10+)0ze&1+}96jM{Xl-iD zkjEf2Z>JpFn=GHBMXUFw_5jT^N-cAtS>hACak$4sMW{!J;nOE*Q*0`J zSyo{fP5)g=_8vKm&a;)HG>L+3>Y8FA5apC=F_QqkFdNlR6Vz%TmB=xwD1;BN;B=q- 
z^irdS3+i4DrZQr3iS4q-%a(LBQzoYJQ*7PcE_{sYZO>$sGP6~<3}Z52+z<(%=ims7 z9mbCyEL}EAaYpIB3$*Mn{BtYLcGT zq@xvX+J#l%G|7_0&U@F1ZDG=q@|3$nbawz|2Vk}XCaoZ6rwO+m6J5uUcT?SAx#~)_ z?-$$mf6p^~&m?+=la688WlR>Ee#Rl#38REjU4%zwAx!=d^G}Y+tDky>ALChaMwAm| zBRHKYLNjl3vX(5L0wqvJ5`*M}@@QoUJNjNYPQH_HpkuQ`pw5xs#m?xLk8PDK*nuF= ztd&#Z^W?lhsm$~9l3lhL31nLr=E=E6Q8sqK3OhMPvPmacYjJ@e?|_Qtz0MH_sC)MeQbJ? z*$<{E?EjDhYax!DkSAW{RxwP+DnR*b1hBx&MItyX{|HjEGtCbRBb2dMb!f&rehvc5 zPhRCPA=Wd18J={{z(-|Owjd6rU9qf8tT^5}n6cp*dU8qm?TI%EGX?bI;3{h}h2(Q` z^*dIYl5K~RI}VF=!x}3 z&%IH+;uF0snGy;vh30=JzA|#>`dgvZJ}mYAnCkPW&t&V(*iE*BtG-qK8<*d?ECz=j zad<|}>?N4OvX!p%Vw04u^pQmkp!ChjnodgB{K)RKoi}Bw8g0H!kMB#@U%URLFTD1J zj1AA!IA5lKd=Ac8nk;P<3)>zQ(({;!0V}aNS9~ketGz21->%Pacue-4c}RDqH26R( zs3N7dJ?Zl5J56`uciP|hm5c??n}x;8iQ6N|>Yk*pNA&crP2O$#=JQF$!Nj*Nxw`*1ce?wKK<5=eIn#68pDwc+dzO3FO@PRSky zi9#~X9yI~;Py+I>1mwx=s8GPtouYAPvAActci4`~;F*V_$;0BwgR{BHmF#q_RhsBd86Tl$(sBq3? zd(O03(VDR_AFvMo0@;tfbiUisH=G$c6^50w>5)s0`xOg!*mM-{@dcBuB3)kd=5ud6 zN9L}K9MIE$*ZR$!^226fNwRd8SlIE%Mu{_v=3<*~WhBGlwmP!b{LaZoZ zVIf5kUY0&KlUG)#h|UnX(k@kDc@zNY(!XmJHSrdFE6I%S@mQl4-e$Dk$Y>2v6kof- zH+}5L=lH4e74U`lR+N)3q^YlHA)esJSAlFsg8qz~eeueT?*w^&s#1yo&p-)?7H<=7 zW1kqb=WB9|Z4g#uHI=j=GPdC>4O*A0-$F16HRyBI zdP4qe>xulunC(8o7M{_i^WGOWjjL$+Xw4tZi551~{g zm>MQkD5f(lgq;KxS5Kp-9NKC0?7_{9bJ&hdLd-NtBkD!cJ4yAAs4wx(3!ZFX#j!hv zJKBwmNWdTcxN-+p(JJNQ9l#W{`1_j9@l0Hv1KZM_%B&SWsJAdZi{r7P@{!Fnaj0mr zn8KMBctmNyNSB@G>6Ox9(5&5RKJd6rJO{wRqcga&Kt6x~JI)tNw zRP<~T9Zl)d%5-I8y5&H+X)n!BV3wk$4F~BJHh-)gZ!VdxLt34kisvk!C-IsF@FQv{ z5f`Ju3;UAwmPO36fi-9vmDesBV{|_szX?fKhw(<6;l`r5O ze4+A*Pg!g*lNhAV5`>-X^MqJf0z!8YKo>v5&`~jh#RCeT3|zj}sn1%GHE@P4G6vO^ zPvg&Dcgvt)mTUSl+rl50mmvtRq6lM_Pr5yQZ4g#cywLp1m?^!hq{QMUtD56ASfhU( z1}P(vz$#TS>k`;WB=6&h@^}9Z0t%CG)X52|udEdVk+?wMmW*G7s0PzUIUkLs?2mB@ZF2>_4`GU0*ZMvZv(~C>#mdVyv__2!ui(|QCFzolosGe4xQh{GH9xG(dL1+% zIdvx!x^iylfkCtYV&VY?sUvW4;bk6XeoQJ)^Rw5$Y_x=n1xiY0(#2`RY?aMooY}RN z5(79gLs@K(4nps1v_a|Ne9E9r9AnLwIyn_iv{`tJ)9jTviM?7$=Z=1 zKXt*!l_o2tsM`yM0?-HI)YX3j2RxLt#L5j!l>M_Obh} 
z+7-v@@YjkSl(x_ckCx4r&Xtp^7uNW-XR&L(*0Y*z6cwh1JhovM(zG_VW7~ z9m5hOvd-#kUMh%0q6^cJNEW}ZK+ok{X{k4bN^%&BsKBWn-^{d?c$#)%r+Q&W?$PWaXP}6A_%0l1;2M7^KV| zrSf+P*lbEMLG~M@*S_5=f0ee7eSwX3kq94B78iS#H;V`~87noj5P1Qf`q5bVLv8Vp zvJ!BCXJ1K^8lR991Dl@`*r(itGHMa&wo=;@a^$p(Z0Az%r%WAs3A&oHLQtrn!YV2I zQ=c}wLAPxU!h``Ye;4k^&jDAa)Nz6AL-IBswv^9LkF;?|-b~a$z5qFm zzvkROHd{^Rk2p9#cB(#n*w3{0KXFyx=bFCH?bTCI_;J&Z%@&jSV}}_z?8R_6)%lqD z>Jj^6M>eRlrjN}?{jr@hb^O5M{l4|!4+?#^pIa}izh&Qa?Mym%-747ZJ(}cvw=X2? zyTpp_b?#^yGcO~UBbiS2jwC}PV({p_gc!s$;wiD=bdvKwwokiEM@-53?#CSSJw9F8 bZQ6_N^p82*AJ3RNOvTBTfsZ-d872L{!{}s? literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors_moe.cpython-312.pyc b/model_executor/layers/quantization/compressed_tensors/__pycache__/compressed_tensors_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..503c191b8e974acda41e891c0c830a97972bedce GIT binary patch literal 127709 zcmeFa33MAtdL{^v1PKxV0T3iX5)*rK4KKt}Zl1KoTi}WGBFq z!~(6d-Q7d=Rg#a9WRIs*w#P+vY255xds^P{j#{v>jg;6=8hLdJwwIJpEq6-EgdRle(Sh5 zS~gV1{Q2YM(TbspsBg%}?hD2%qg6vy%x@d7j@As-MEyhlXzfsKv~H*_T0c}DZ5V1` z&+X%ZXyZ_0v}vd*+C0=84GsmPEkiBQ)}dDR+%eu3Z69i9{=)H&Xy;HT^E=1aMZ1Q& zqTNH?(e*>?S(t0QC)zvI8{IIpf!({uH%9x0`k23Hd{cDu&}QcMjBkl<9oib*Hnc6e zeP}xiD<0nw-8r-~x@%|`yDu5v9o;juC%SiNZ?u1?pM{lK}{Sj4nFlZL7dxpoyhtG^h;Lquwn2zGty=O8y#YJMV(aDK@lU#H-9?TMpc8$bG z-yV)5>?jwGaHA9F#NyYchbQ8rGc5E(oEwfu&Rs?Zw$R9QEIt_xO-{vdTgXk0L_@=) z@dzh}G@OgXL-EK&Y?2E_C&Q8P(9|#oLg2YL$&H6YvC)|b3N1wO7l*m-La7_1V#LVv%xx zBoaSA8Qwh{i-ga3zCs+Nst^&r(TR9Z=wi=s*M^l3EAViwgzTA7 z!qSPi&rbEM$dDQ)78#F>#6uCOVh9P0#zJookB^2!XUB(Q=SL^bqRmF8v0J6}re8Jp_)8Opb%Ru}My>%axuDQiO&7UPQE6+3n$_j6$gWPqBRtn7>-6l zA+aEY4iKIOlS86CL_@&1{3Itre8?G|j!%lY$NP`%+H**>9Pi(?_l-NICq%^GU+RoU zr#h!br`o0Wp#9SH#!Y<}Hby$fN6&OlUB=)w(fRiHc(hX*S0k4q==_shC+iQfPS!i+ z5xa9l9_Ll&#SDU|vvy2f=K4@PbG`^qtP;*wO({p|+{yRexcbI7zxeLHl)Yqb?|TDR z2flge=li)`N_14cLiwOX!YA3FAH(1*4YXqIif3Qw+ts&ks!xkjXBcGw!=V-@urdyf 
zGg2F9(P~zSrjk>=GD6R;o+FbHnOp-}dUo}t(UR8F^1ueVo*ajx^|VNK*c{FZXGe0z z*i4qiQJpcn>Qy~q%uvtfhAm&sQF~+F<=kK%w+B%~d)jxzyP^{Ex5pw4GPztNKFv*F z=AvChC{2T&d=(RDOqBRfL;_Q#tlu=xW`|9)If?AOrZ0WPWSX_a zDGzofEQwX}7*nR`N>uzRl|rexx?tIRH~XU;$vc~s$a%NYn6|O}l_?Wh%7kz>=Uc}U zIWLp)LNj{4`CDH(X!-?nj>$NOD_P|z+=zm3j!HqUaoWV{X)S8nJfyW5)Gs%Y_mSL= zoS4Wp>N=OqFC8*v(h1Wua?RJ}+N$L8>|D{V?MiIzH=%FWl6a@`WEC3aMb8Q9on1;? z?N^^$tO0S>YtIy4TAU)p=~bR+zxp^_X?&ydhp_HnOn4HiccSh+Zx}h zgpX}keA+K)qVEp7o=sDnhR)}V3)=dX*xGLzZKd)3%9H0Pdq9aBE=c75x+!c+7tBHXpqP7jbRsg0dF_mgi)`*_2~S>}zyx-3tfNH^IfvPW5>m+GOi#)8!5pzb zng!DALd=!s9Acg{afpcyWp+L_MT@ySX=)Y|-I%UU$3w&FdN1~){|DF11~x~>x70rx z^G{61{qoc`682vlji2`;0h>o}JI75!lVl3UJa~$u(L&6*5V;%^OEIy& zz}7P~6^qeh#)oXoFhws->n{${LX6cWG;)4;Vj?mw7D%_WKILez;OI>f^AMXBxuP{D zuYx)Ho{EO}GOT>Xt&~1!63^X`NV<5!AJA9j2Qsu@sHl zODRh!NNy}laYk74#YI-*SHY1JxXj`}vZ_xj<(M?y4J3mrMyA3AlSKlBAm zgha#nlQvJ|=p_;>42tZ*f*31Yj@}E=eta5og{Z`uU_N&Uf!q;#QkcFB6zx(cmS(qN zp|XOInq0K$o{DyP?ZKu>Br~91&ThE3kadh^4}Rt1cC*rGwLuMl z=~2fi$M539{H|%)WwIAvO9W;E%1Q%Gb?vhf`IxbAx|X(|XPJxV9%% z(~|P}Q-wMOTk49g6)qQu!Jtd7zrrmV`+Wm8rOIq+4P z{B6q(rm_Z!w1o-Y+b?(rc<;c+!H3?HbM{neolx4tm-Z~Y#h30CN)Pa*2U7MLWacfH zvn^Xq_A#vi7C27NKk%U$$*7rZ6=34$5?{1K|p#1L<rM2?Po_UMib+)LNzbd6V8cP!xsF{#s55{ zGnf#cEeKn`nln`JLiy&03(n?*ZAfPe=SS>D)8TNz3_cq&<#teB7{8%c< zOK_9vDwfgTc9^hW(w00~rbHGd$yT+|RF9E_LrFKAoyazNGZUr|O`zC?7%s)H{UU~1 z7GfB@rHG*l1j5<4Qzb2zO~IT&j#kZ(FqVZOI))Hjlak9b%O6pxL7TXP#AzMr;Z72@Tc#Zev*}yU+~mw$}hZ{$6L#8dKRr!pSXkb4d3gy(;>9% z;#+n-bnl+aO*OR(O*{Ce9dkL0)`pb7QSfi%{Tu17DplbZDq8u9*14Q-Ia3vtb2;C3 zN<%)I^4TC0RkkQZ+xgLO7*kZ5fQvPlFhUR>36Ih$h=NEkPWu7zQQD{y?X;~F#q^fM z$0IcPRxaupHvE6`JLp9>sPCFX$}7c=IWvfp zfGBMS-7uV`E5ktc2l1DjV>s(>jv|nl&4VyN73e4>A~%aB62e=nigprtiCiVE{%w9$ zwMt|sa-{k^qc*vUFfXk()>UegrwOXkYGeI;YooOhMm2-ch1yM5PXn>oUE_M@uToEI zB42AAh|9cMiz>cweq0r>8O2y!lqIC9DT~qvQBeepikL8U*0^E zGxR~nmBtzSDx9$=>_0=!*gs#+gw4<7jZv?0til^d!tpcYjpOs>%`-V;)N2Y?;Y?wo z@Mp-GLOo|NXLSz#h=|>X4Q2=F8H_?l2ja=$uxMcuoI7Ujm+@S5U+h}1Y_v;4PCWS) zdg8j+o&EqJT3oitE> 
zoBK8><*t$Q4mk-TJpUL?5u55!430KH*d!Pt z?(xZCXw68<31aT_R2VZw&O;G~DPIeeU8W;q9*eiWTg*M#KX~Hk@t~9YCdHxor$o

tl9rW0jmUVYSO(4`Xh0C? zRdF;x3_xdB5qf*c_?dYFwAF$L$THN!2~7IM%LwSI_}XibP+I66;fB`8ZQj z7Ybl29MVE>$%)Lc8e4g%QYjlwC3JwEtOx1v2G6inhl0U5?TBKuGy&Fn|R2o-r zSsKs@^EAw=G@w`JX`HLlR267UtI~i*nJ=eiRhrrYP2H+A&H{~VRT|Lqu&yK>S_k7~ z!f{H{T;u;Asi$U1exzVz8de=mQ(?p#SEXqx(5zXN2DJD53~;YX(_ElgyDAN6`T26z ztxD5UplMx|##NwcTa~6gPh&2(nv3hcWFGoF~$O)RmP2`LT8|qGCaAYeiUm~IAu(pQU9M_W|^GDnNnVWuo@`z8zDu|D)zAl4&IN*6yZ2c-3P)%M&9&0L4k~A6paYw@lz=T8YSPj5J zcy=uVO8I0=(B~64gwbL)#9B&9DB?%4rXnt*AU-98?%37>CgTSnB8YVt(WMe32RaMz zR=x$Xb;VMb(*R&mAgTfkX?WPr0Y2qMo!$+wi95D?qZ^P48A0<64Z&G4oD^fVm|C{# zlJ?3_5emxyjBSU945Vzt5W<4oX7C=Ou*P+whw$J-0;cugD*|B;QHXj7do#s#VF=}o zhbVq<-55fV;2{CbwqU3UJp|%7l*~L;1aRWEV#GG|NI9e-1Zvuaac~w&$rw@}hFZ}> z@Ir=^v@$@1t>7xble95E7mjdyF|6nOGJ3TG-aIz$6{I?Y9s)<)e)LF{*i7yKdI!N% zOb_r26EuiBi%G9xk__GSNER7UQI6vGL0SYmwgR4!p_%qopqXfR1u!!XCHy@WbpZGm z0yCDM$F9eI$|oo8>1zqTuw%jbh3j+I-!&{)g@dmO=F`id1+(ME?&;kMFjINs?DSbw zOsjMHI#FMj)z_z>AU7#!`~2&<-Xmh~k!F{D%qy6GQc@To9L}7+LDV;-b@%jb zB+tgo%bCN%)`71EbH{w*F`qDSUN}D@m;)vG43xv2v$u-&Rz!mAAGUt9VQ$06lb>9> zc}=L>BI>t5Nr>yjGQou|q5f#rIv^MZ%0oI!m36bbXLf(Q?UP+McipS(oc7K4Ck`z& zZpbwP^3yG90BuT|pqAvxH{Zy&@O0*wup=mpg@j;OFprnC40_}M;{|=a(6m+518%g? 
z|GD;_eh#D;_y3{N$2&7!FNbmbF~}9+6|e#+1f6^g^p#J{v2~qWDee)_^Knzyp-7g#6ejJ zuCQn{O&v|qLPL*O+XHaTqO}&VgC}~8RhKvjD!8GEfD4PRrgYWZhAG|K6TfUuo}9Wo zZCk8glkT3|nbC-@jsIHlJ#&4TD$%uRW%)lm{vF$*5!#tGw){Wa1$n4{c$a~`!JXba zcQn%?)_3NI?$p#3(NOzuny^&mnjM%KNUPpE@xWlYVV*W8Pp0Tp|C}~W&l%G{7Hc=o zzkJ`YdtpK}JpbQyrt9of&(y?EM`zn++EM^zy|?kB=v*|t_k$~7<5Rspar_C`@i=Y_ zOb+tf47qXcrCbi-5_-Q^OyP&W+w_{j}p&N49BOuBlIK z>Raf^Ha+)5U4W(gru&irys+&t0w47DJ3vs zbhqUP1=25IB5XO#q^!LSXzXD32>x=+#>L(`bLFNS{Ab$#$i2T}ioRkQH8A z4oiX9GN=D{Ooi5h6-NNij#fC<06MYe4x>n-fpWbr`&E>7!`D`WRBHAeg->q6ZF?V{ zjS!pOAr9;lQH~nyJMJcXHyZzr`68o{PC|MdgYL^C!61s3gd9#VCnH4m0>KOOacJ(h z(R~>XxBR43|0O?n)qOq~9vTG!Q!dAyHv``29xN0$&P|4(wDTY#vM9EK)#NCIWJ))j zc24>`-3NR_7fFjqIiw{Z{DHhP9tKn7y!>}z7m+(ngt8A#!F*%TK=+Hw1jKwh z+#?~k&rJ*|v5--}f4nr?aXA|rZK90qQyN7w5JGaQOAc9OgVC7nMRJCVbO z?c`7tMkExxO3p@E=ph^khEWzLrA1^3v;#i8EG(+!(@8jTWc^npEg1)oS<-@#>kmq% zmvLOkktGo&(M%Oi5id$=-}wj^#8PkyOL{Qigd^psq=wogO*rI_$RglgNc9*boqVn= z(Ss6w91iH8r}G4hiNLOJo|5wrXb^W*XjW0@Fu(3moO>C=i-9&1`zFE9Ynx9j#eMd(P1%I=bdf z3;kKgbBTcmAe|haKAzf;>Ar8?^c&xN!(VTipIGSrZ1S)A<{L6$(fnNEz&9q_jgIM# zP@{K3Tf8Y+C=v}&?rpUKKJELqM*C?xUa=M zG||Xol`r_l#)GgQfd1`9lL|;B_BLckh}Dh|YIi_>$onbgkEz{V?pVE>1PNx29K4S!YmHl91tn+f;Jx^Etd>l*y_)R!P>;M_t=Sj1Kt;ooIW5~ zj`zRf9Xv7UJ$>NRi5Db;l2H!fMFgorE`Y@Xz1Vfh8VyH%K``pLKn#;48&nS?SB#)X zBsw5z$uLL^mYHv01cf_f6GbO841+TKTpB0_kY7}S2mHeB0G++6I&pl-V7#H8R==ao z8EQdY$r@aV{%@;I5&t@8>bFDaIML?Hi2fE z@n>CIrv?^nuGHDAt>ZyOwcy;GbM}hPUQqPA7yMc0{`(aN7Arsr6)Re1Y~NZa#~wJ4 zwl;wdt~E^o{Lz?tA>AanHw&8w1kH+F}#&k z?!Fc-d|;89KXu=N8(YYBi7VP>DvS*z>;eN-sERsGxusuVy>O1ZN{gW&I33lM+wCB> zSWO|s(g-NCM7vpqg4l}!1+=!nR&_koLiSC~pezp}nCV*2K1n0Drh~ zfOREG3*@*gH%UwJ|l_6_cF3w1EKRrB86LfES4wB z(A+JkVHj5x4kYfvwi}?9rrOeNLY?Ox;}J9-f$@N#=5VI8ADQM%aJ_{HZjJ_7l*`#W zM0-cpzAz{?9`9IfJ)xY zq0I${QZ3od{|JFu>H-D3F?39JaAcDY!(P=669Nxf%IxHoK*HINoB*)8AoXs|Z*Giy_qtxdhMHaLp-0kz@c#ECYfqDfmWmxMep z`H-1<4DvQab}|h_$f!7(l8BUZ1UT<0p(I2kog|Trok&T@38jjgfe1qRv)=wBfQI^bR!-Vn=Yjo_^R3&@G5>Jh90W@1AV%qXyG 
z>i)$mQ}m+2ku%hZ@W1n5)^!N{j6KzznwZ;`?*3q>(0NdF9s1gE7+IuqXP{ zmovCU_`mIVws{czc{<&k_T6mFH2;}L*mhiO9{k#HLe9!m3pMKnX1$w#OATL8et z$aDu=d3=%@@Rq1rX#3CcS#*$@c z1knv{iz;;#mR;d)5!43D?%j`}wt|*a&VmN+3g;I-+%*NBPAoWTpPlCd2!_8d=3_#= zkkjx3bDEarbQMK)mMM2{l`mjZN;|<)$rQlFx5BX{!ZXh()Wd}lmTggZCVz7p&zDxX z+xd48=L_~YWo>Q>cWGhFOa(S$yi9p4gTxqIHcZDQ0cUYA)AJ{AgoBWSHVOuP85=@c zITzt3gp#ls!eIz=(a0qc%0hZ<4N8f)&I*T>TitMzeTRH3dM!?l1wH0GWAcrU17qT_ zrvz+)OR8EQ$N8>ulh}Y?pojR1qz?K+C@^302W5fTL`;vNvhk}>+NVfkM7HJ)j(|*# zybXkKc<+|fApJ^O@VsH}Ka#RQ0G1gaJuYboFKmYRh#9=1Jgca{g!jq2FX5ZW`@R6m zw~+TC0|eez-78`rNHtzt-P>1VL9c8U~(vY{yPF~G2%?K$$_uh=>?WylI z@61;Bp$#_l?(N2W?5`WL)t%F(Z>+UKU0<%QU##oDyFObtkhLBY495V(Awq=X!xz%7 z54=L{rYzI>n~C|3Ok8C4 z>hJ4p2U^S$9VyCTl8FFnu$O_sKimihTlLT~5gYRn*+T>0uki8F5>0721w7!Xa^M#%Q0R>#@=$JD zATXE0be<`QLkR#?s|MwTsUiEijA{N2R6x{O5Qpz#xG3m;iLZCi`*ZNfcaT?(tOeHQ zZ=mDanuN>dM06lh!*7B7*SKfV|z zIGed~Wco;w`*2Gt_Q8)pG0U1a{^sPoDRWjd@0zBUis{0d&Dr{H!P%2FZ~4o9p%;#l z4~ym_D2fn^Qhlyk=tJ@}Wm)=V>veH4t9g9qt zzaY~Eh8!YRIx=O{plgAB3fX&9yc)z&@JC(H->1+BL&1!O`VzUn|ojZY!R$@MgFPigv&# zA+2z{?RH)!h*uPWCa@F5cBqtgz)EP!Vh!+klb) zn;dckqFfQ)E*S|lirW~6UFC4vk}ZoPivpWPkbST+L7;~F&*+gwQCK6A<}z#*lAb{I z5HR2?+{77{bQ4|-lQku2FCZ2p84(;J`-{ZL$tC(CBLlT0Mq*0#BFT|oXizfdLe)`X zigEA@7I11)+!7;r9#*==DPeipmX?bA|lyzuhfm%lolt$jt*R3`?KRf~>V__>A+i2*2} zvNe4=*Sb?|-I;at!I48<`=X&T`C7Uy(~;|VPV9K@E(50+E}fupA~hUEWQOO57Xn$w zk;DM1vvq=NV}_j{5NyxGFIHF_?{7(7d2i1@TAG1dIGf;B#^`KlCX_z;$tyQs$vQiy z3`zUj3@RLJQTeS(oFF{Iqk4#P&_Pd)r1yH+@Q~Nb-N1S9SLl%gp$h!nj!xu$K8~;z z4%vugf9eN1$Si{|kE8A-zuO_t$W|=s9D3vEy@}pC=%vv^FhJ(lS22WAiKLT{#N-)^ zAMX|=bGQ~vHSC5T5t$77pX2@sUg3*Uk-r7+k(#FIZ*>&C{r8mR_muf}l=+{j_J5`} z{S(#sox7fXf&OkU9A(oxzB{ZrN7L@__Np7{itnDMwez`q$?j&@OZS_L|gFY797(*Qe(K$Ek8Rp$`@u^z$><&pg%w%_CO} ZT~){g;iU@KBP~4f3sybRLx^2E`ovx#}7>_t3X{6yJ zcZPa6WU38B_pANf0(DxTKO!tZ%pk_P*cOHVXxTv*S)l1T zcR18zWp9fD9f9|rd(OG9bI()%5)KCme1`BRdcKX2U*f?1I%~kQZk~`kL?tR`kPJ7< zaR_@1Plg}m8O$5Pr~o*@@Me6YK8Eue{){*(GT3JXGQrUxgZ)M*BaKR#@MxGL9-{fQ 
zhA%j0r_n}L{FIDF0%YeuwNDLbQ8lPFsr+?6A!?`&Nt!UV*FE?~U*ijFTn%b?DPJS3 z9?_cBDEyjU${BkpwfSY#@piT4Q}1Yt+6s4VRoh;`+x`N42jEj1H*_Jjb7%f>QH_5p zjJE$j?9&DNbOean|H8d?fm~m7PM_KhQtA9Z%t>a$r7ofTLf#&g} zT{YTnty7C{qxH=5eMH;UPpE&iTkHM=^kuaBgYH>wmdx?*lUeTN)a0HgxJ{C?#WRXw zC}V~OFn>Bbe-ZkSp4Bbc$W2UW6o!#>fl@7N$$3Mu#&a|SIG;szE0;}r><(3@TH2DS zHldrAM&)$Dv~n3am$%z=Q?_z6Jt=D+78FCyQ8}y4YBXKDB_4tJ;D_U9|L9>>%YmhW z)uufh$#_a=uW&bm&j=-f!OUh$-i*KG%ZMdE!+FtOOK7{C>O*@i1vsLUQV@P2D~{XX zH`V3zo7*pH1A1~3U-xDX2Uh*A)zdvzuS-`&7i)G}yIt(m9v9l^)qu;%kz$}E!7rTs zVAGyEc!T=cy5L?xxL0$j!MPiZLq#vA@0OZ32|+wCOLM)wS~RgI{U8vP4TDN^L8 zjySt*Zi^9@7w$`@j%}tnu-c@?+;f&lk^8MY9LUTEKYx)Fz4w~kxkYK*qmyFrr`(@% zC9usJ59jV*3^RI&(YYdW2(ILd+&F)UBwMZ@pVjnesIuS^|tFk(nFMA5CO( z+F+t!YAVCMbNa0sep;hh&7de~w7}q7*>i?A_l9CAN#5qqy?fMdNawOM8U?#l=5qzh zj?PjgUo$UD%bGbfRi8ryD0~SlRMph;jQhzbEPUO2ICUa5e0}KxJ4!B_n9Sw#8cpbC zf)=t_Jv)&AB(vDWNP;mBCag&yrxiFeyN-ceRxUB7B~;x+$Ds};W+(OZWFn)?*BD_@ z^b<-p4P4p;RaC{&bJg)nkV)T)C8149z(` zjlk*QPJvujo*174@)L}}fbnPMWMxh_lOEav7(0+HWaL?e=FKN6j4#82VQ@ll9uu8Y z*0Lj7UI))s&}6ePX6c6ZbxbDfJVdEG$7Zm9l6}I z&_?9xe<@GOQ4J#E`N(qfBeC^te(mEV(8nW(e}EtH=9Z#$%!qxSU;u zJm`@Ik)}g}a${bn`(Rl(d=4mNqjmu}~&~(554rQP?)I{rmRXkd?^~<@NkQOQ92* zb#Sh5{M8fF$F(6t9~;WgTa&r$5Q=yxlT$TA*5hRdM&%QrDU^T$YCK?_QOTeH|HC_OC_`{hQdZCdSKRyz23UFL2L! 
zArPzvN&B7~Ql+i`rt~$e5a=xjdRGH||LNfa!Eb^jaQJas{|)J@)bV@M<+lBQU%Dwh z-hXIq|E2Q&OO@u9Do+kv=AIGHfnZr2xFM~y4pc=Fj;%?j%F?Myd&k}Pfh9Zw9N~ti zJPGV$JkNtfY+v*Dl>I%c{@&HSudMBTyS(@9e?GIi_ua?4dMkCi{K);QW$A}2XMZ2sg|n4V+fwIGy6zTM zLW3*9;4_dtu_NG872pHDXKi9YqNG6xC9;S63#5=C+AHD@02WHQ zVi0;r9*ZG~KrrSn+I}-PZe^4?I{;=dVH9$T{xg!4=m|IjeHELN*qp*<1e@<+^BOk4 zgU$D`IgQQh*qnjJ#7k|qWQr+?-30zK3xOq{&%v`f#`Y!R%7zuz^Uei*Xz_;xIy_ zuJtHFO*l7(P&1Kwt8J3xs}jlUqaZhuhF|@wKgJ&c>JG8$gejzK`=*3#2pCRb7}@qs zVc6LAx%tr)tKGK}Rc?k*kLtMu6%Ra@fZ30)7UZD}B2N~8Kk1`TK10NUG{w?0n(b4m z4#|-1W16bjP2e{nmN{jL<3?>kfy#sa0Oql)J1^9NjCn;>QP~wkpU7q)E1IaDJW^;2CGjBUI4whrdEU&PARI>^+yQTi|0v9uhJOFp#KM9cAvkC}nAIOIB z;@dvRSh=FdtrahkK4R5h2P(p7Nw9EL_Njb5a}_@EsH7x(ENoPXKNBvpoD+oZ$;@5X zFHdTTypo<)CbYygC8HjM;6kUbu?!JvIH>egOxiFRyl~|pq@|G1LQFF2nVk*+84AjT zZY5@QYcg>S;#$H*ZY-Ee3U(7)JH-;fR4xC#0%x{EZ)MPa&QhAAwy1vS7*f(pF9DTF zm0m`OG4mRwCf=4=qw~lOka9Q)fMbGx0OTC}OteKXL7(3ndn6{&Xc|5{b@SA%4;S~` zjXw+>ToDfb>UkrP+SZBJ8~eJU1A|%TGhS#6f=xzZ)qoI){kqBn@mmuk=qJ87`AOgy z|5ND@f8TqI-}16(9qeWQ7yz&@%vssDlYLEKT~mKWmlOVH1p3fP{t_yU9jROgu&SQx zAc5Q5U;Bz4;M>6}@9`Vj0yajmBz}Tc4v8}s#b8h@|AS}ovInufS|)GJL+UiCKn%Ku zto*hIF!6FAc1P&3twNW_A@!pwaC{6u^ZU>^b`?d5SHyicdOsVuIdH4**5Mn2E8@OI z?sNZ=|E^~#bXQqwEQ|Z@hkkzgpWgh3H=pw!^k5#>uv(o=3j$EO0RNAGpeLgarZ@X6 zlp~XT7Jw}_g@M8IBAaq+z#S}#-_fFYW>Ngd{Dwu*AHtSNfxZtN(@=_~mmT=9uE&BS zMy<|p6(AFjj4JF1vsrfZcdH&x0uTCX6P?<2Js!C^vdFE(haZLxuLy_#w`$x;{ycD$ z|0`*T{~Pa7ev2?skW83^0Q@#JE)v(>fHUVVf_f=1%MLPak_~-Y0~E7kyuXQ$=H9tu zr#m}pG!1NjhWuFM)zoM$1ko?Pc=}c4Inc7*1kkrj)TXMbW^&!{+l4Q7FEm zbK?*zik#|+f^`zRVQZP>)L>DDtQBr7VRs^3E5kb(JTpkyA&R&GtIR~1-AtD0Uc_Ne zZwq6&oMH1&581xU?0*6_4+RoMzsF)6Gj@vpkeMb$jiM-BYCvN*oVMU7V~`>`&esdh z74k@oOxpvgI09BzFtpd`9|9F!n>hrpS>-v7`^HDOJ&#E1FNyT;r0)?q@QC#Pg1q;y zf#zFR9tOG>eCxjB+`;M$N4onekdgNdRQB$#3>~Q)IbKPoD($_M*51nCAsh}Ld)m>- z`6{iQRUZ1sk=QNucChLN7$R<4=i(c;r>cG&i6q*3N4hO7TIEPzHGrrf>E636EXman zjwIOa^Q%i&m)|JwI#dlKs)6(kR2vbBkXUOqick}YHdkW^HIufkJH~C}?&Wf8vf6^E z)-7waA*!8p^?p9GG_u@Zj;EgCVb@Q30UwdLRK+j-xcH-Dl}8xZ&M%#J;Nv5I_tD>e 
z#Nfx?;E%8V=;|{s@Kn2DW_=RE?j(fWNdVU;0SqTTdh#i3ail&I;e&^8`6G2sfU60{ z$#Xq*PK1#Y`Z_1T)h>~Xd?oCvM0zTb?n=|1Z!og~fwpukT&RjJaZe@EUuinv65~B( zq3u`I7!h073CBe$!N!HRAzT9ZYmnw|V!t&Hf#N)937;J%5BkDjZ4ZXMXTQfkc*S$} Y1pnX!!e0{aIf?($>pAD=zw{&g-*T#JIRF3v literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/utils/__pycache__/mxfp6_utils.cpython-312.pyc b/model_executor/layers/quantization/utils/__pycache__/mxfp6_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0015877cc66a6ecfe6905c9d7792ab0fbe50906 GIT binary patch literal 5059 zcmcgwU2Igx6`uRMclYkDcWq(=Hn7CFW(n+;mhe-IBm5JZm=Gc$q6X=*>>Xh3{bTN3 zu+6$PG*WQ$VDbd0sj#X(fS@XUsG5g9RD?*a`cOMYVsk~MNL3zqTVkR_8ENO9nKS3inVECGbL_tb0zLvQEx(icLzs|%;-K0%2L|)M5ed0UOkxUYGKgO> zEo8(&Q9xQsOBs1k7KoRaO{PrAP*tv~2fn64Ra1qZXL$w`JGRZ+-J2xV2zK zdzkt_yI!yTJ33<{oUvh4XeZRafiTZtfCerJrZ5;d7C0}QCnM5N$oc=ljjni%S8^aK zIN{fO4;rr>HG1~H-aB9%dS$R5CRdp$Mw7O|=;@SYQ)VO!mYvHOxx6EcIQrQ_B5NC_ zJ({OZvz1JwsR3QGkWSdCT-M0sOzNnN+69(PN);!;j|YA@p8th_aRNRpK!lKtFfKua z2oND+MjDqh^0)$HW!xi@qEaNbABC|MJ+~|R#x=Xaog-u@RHYWab=+krteo$b=@TaP9PV( zMd3DR#x}0Qj8r5=eQ1?CQ{NZK(CYfJOINiOJ#{xB+|h3>8ecUxde1 zTd(Re<(dxll4z1~^{48BuSER^7@7wUzv)zBm>T?jQ)ARPmEQ)x?M5D4;55q>^2Yg; zea7HV9u*xQJx8+!%x6s}%&2uHk*9{8Gft*+$zkhR>hKvF$tRM-iPJQ4B9Sq-fMPa$ zB9gKqjGirkkC_oW7b#dY^4ifYFCW|s!7kKk~HdP=so-d4C9g;Q@5E!8DmYjiHAR@F#_ku zeFVc%xC2;dRHUvwI}x`t`8d4M7CXUCQ+vxuVaKkW!#ij^ojMuMkJ@K)+4#A1Iuqyb zY|s&!EWi=s>BJ~yR=oc7aen%EW(56@)3N-hv%>XF!vyz14B~9o0fu!yVV?c45}bHUvc`yK_Grc#rs_mmsn>)z?kxs{vl zitqP52=2YOZ=&z=`jXx-)iv35O`OwPC;G}7(bi5kf8^ihb?aoIg6z{{#&9VM+6rw7`@$sa*+QUHR_HIc;?*5PsCO zp|oP%Q(g6{6MNCbz{HQge7cADTRtO-()1{}sw|3u)$@`Jnl%z?Dtl#b(^q8)n9nQ( zolC)qyZU*8iu`93#pjiPVn69-Vslvn|a!WW6 z71>JQL6~Rq_Nb9g4bxlF61TBXIrsqEfGbxm_dEt-3flzyZSb=YSSQF+AMuAD>aDYS z>$Sae`q~+J?L06DYlqI!Po;B-N`^^RvNv)UG6-9mL7=XxWe~_7m^n;9+B-@Oc|AqX zPz@2eHr^Ep&j3|?hWe@~3ZTE zb!)T6~ls(4VfBZA(7~V3&lnn@luEJu_InDJTSwRC(%;LBL zW5H1iS=`Ez&!y;j*Kd^qK0w!fg-3 
z?X%(b>Ag1#GvW5R@QV|#e&P>Y>A1S--A(TuxpD0JvAM?R!^X|CjhpWr`Q5QwFx>Ti z?`-2s55v8);od(F%!Lom`VRusXvDv6!}2g#QC%!U!^94l>~i$oPBO$h|gi&vxlvhreelK#c_^nzL zVKrL3qybV?Y&b-s>H+SEkI|OLIPDflBKtnlYoUwEkOr~fsrVYlNY1$>FXvwtszT3k zou+BA)3_Mf3rmi{&oZGy&mn=})V|4mS9&G~uC1SZ{o2`CJu_Iy77AJ^x5m{ZoV&~Z==vT``R8;l?Y~(b#xpYxZh2wd0c_W+1P)?jGWRs3RQ?MbXEd#Q<<#>6mn1Ui> zQO3ct0&i^oD0GlQ{DNrQq}02r)25bJ~mQQ5tFDVMh(?xC(=eK5_U z&@IYK?lkg7ko7!tl?*hsYjW3A_hk1&eao!ArD6IIsM3te8jz*z@l4O-s=ZJ z-~acZMXCBX>mgEqRQ4)t#Rjn3k6jORiyh0IN9H@oYljYwszS0|IBMst-g#5F`QqId z-oG%X56s8|{H#%d!9O-gH00jukMKfciwm`wTd2kA3pKY)jpdSgsK(q%%`HxwxdIog zd7WBe%CL!}9;R6<#~jJB8L!`Xf#8N0f+xdRaDsbno25<`Z0Z)U3KXSK`{GOTgWe+C zayO*3E-`iXRZm)N&ky1yIg4<)6)LHbY=`+d@fKBWH*358iZ literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/utils/__pycache__/mxfp8_utils.cpython-312.pyc b/model_executor/layers/quantization/utils/__pycache__/mxfp8_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ace1ead9e0d07869d5bbdf7cff031b6d9c2c820d GIT binary patch literal 1144 zcmZuwO=uKJ6t3#O>CR+gbP{z?LrYe4AW6giU>1^PqpKnq5%i#8q0>&+WGbotNmaEo z2{RI9VFOS-p^1%wdg8h#@I@Ne?d5v^@BE(>D?xWI)3OeLp8peId95J&(?NPYlYH>VU=MqsvC zu~VdVx(d?19Qm@hDCI!QdaBc;f+STR`0YLYQ zsIkS}HmhJZ{eQ0uJ2P=poi3ujTvdwFSuhB=(#iP>(CsyeCy1NkZcei*U6ou=lP*hT zSOyjO0w}}0ewrqr#TY(W#5a;nYN{~|a_Q*u?)B8RUu#!tH{qYD+kUxV?5mz!U6L{f8?v&ktFD9IY~4_d z?1#E6g74Wb(f6u0VSqKmm>87Tc63h+$ZG?tu=9GX3Py!eoKA?2v>pjfUP7$fw(oez zwqvdRcC>TIXJQE|k3xc)#O{NJxC>}sGh<8)DkA6yx=;KjOJzR_RhJ=z4IRMDe_-$* zqwo6pm&U-7R@X8xck{d7#%_((fiPE({eIxl&>usO)hFF+-Os;1^W|BnDz#Prc-Ba^M*KkP_V(sSr3m;@ zRaM*focXY!X17w+Bl(^4`JOXp-uHPw{EuF*hk%lxZpU^83F4nn!z_3N8ZSO33E~dH z5geHyuArwaK_+ciY$P#FaQ5GjSM0nkPMgcB&p1=9o3#f<<%-ir@I>5a_E(?4lllz_ zV}WX)S$pVsVAJbY+??}E;);iJ!OZREc=b7DoTf;3f+_Z^~d_GKStT-7>M1_cSYnB(ocFjK_ z@X?vb)NF5L;+DjVS|i8b$V5|8Y=MtR=?I4=YJP#AillC$S$Hr^|BHDCM&;Q_qq1dI z{>`iqr;H)mtOU$T^Qmu1&4KrrG;SR8bpo}+@5N#0y+g=E+}y6_DOoGxR86;0oq2~K zMB5ng3ldf%ey^2lusUnY+F{iF+7L8N`N>i;#gPjjnVf9_=4Bnwwxf2|DWMbM5;tYo 
z?0~22lpP=)R8^LKa37cK;+%+Y+*`xMsMfq`9Z7xwt5C_(z{B;w;8927Ego(O^iwbO zP6D^pWR6##4{2+?NA}3>oiJ2$J9YF%ra3p~d0^b&f&)&^$V?n9e67ef+0HR&hN+Wv z&qY3D%`<#NguUlCw}8(kl8a40BIu4l{Ow#Uxrs1j)1`MfIsRsBlAneal&#+hNEkMe zWC>_p*ejsh(i~D+n7pnzGO?7@uQ|l)QP2w4R3a)#dt-U6>`JCNJ`v&P`N<5tX;&h8ix~HO{{TcGDh#{am+SkAzcaHk^BvPxq}qy1o6);C ztPCnQo_T%CpXNTjeN`E)u)+ML+;}0XGOfyR#S_T)<_;-C-}{@EKhJ$$7+Py899tiJ z;_oc`kE;HoC4awi=9za}fyzb7Y)EB8UvK+n*Zo~3_MkFS2{adaSNvsvhwAV6dgPn2 z`(q`4mvZKNwt4wN?!sF4H~shfmoDgYy}^8cE^;?>@AK8q*N<%MePl1S4yfJ%W%L=- zuzVtSqR>{jxYn?Gxx{ponf)rWfBkZi*M4XE)EF^etvKEh! zKw&KVBUAv(NWc@Q0iFPAKpo)_1u|~OTHpYAcT~rrj>esmq2quLNH$+9GH{bA*~tMm zarP;vsVP}k&AQq%8S4QrEK8RK1d3aTR^0&OA7~%KGju}Pj!tClK!YYbW%8FcptfJy zrff*eK?7aqHx;M}+Yo64vKayrb_(eDHN9v@VHc21;7#DsuwB@V>id9bu3274W+XoB z7J5+U+o+SA(L580^yExL1T&?%k`a(iuwCSo=9)3;b`X8dp3X=j`T&+z2;GRmuE})j zCND_V7el5l0ZJAXgh7~qb`kSHK!*K+d{1t!&;TMRs=jvRbcGJ)FXnv8AR6kT@cb#1C+c^E`PaZcs9xT2+TJoG$hRi{GaZgvt^PVzP@pzZt z&ApqSUbC+x*M7EsQL zQ&|KN`F}K(P9z{8uJD<%w1n;y7n!IrPIb(nB`9v$T^G2WGOY`_>@dwi)-989gubtp znw@^ggpm@GH(X>A#1{BJFoZu#0%s(Wuaz3k_1?iyKH0Vau4Wbtoa~A3c#Q=#Ih$#M z&@H{`ZLJ$Y2CvRpWV^`))X2WjJ#6`7A?Z3F0v{{{Hb+bgp$RZLAt@a~wjwku@RKnS zAh}alr~iTyX0dJ2)u{zeDC|K5T?R5uN^bjPY}t7x(o?_}*mCR0?Z?5JHpP znUr`V!}AL~SS*jeJ@Oefw^0|TGI7XJZ=LTq^mc6*uGtuniH9RH3vP^1nNnFEzz zOF7uC2HO`u1bZ`Ip^?tA%HY$6(Auttmw)@&x1W_7jwxf6hTY|c4z;0UZE)?z`q=%S zKk}3shM+&N^Nw%DcQ;dRJ*>7KE(Lm&vrhvD*5BWVK5BS0RtlU}&Q`X!-bt(^zCQBJ z@%zV1+q*Z;soPH}7q*;msDi)T+^sfuZ?u=1kEqO%vgd^AIq~Sm_SfJ!q`nRXl%1@eJ{XxmGd7OclGVbLU zWY~FLw41vN#L0iC$>%4^Zr!(x< zJfJcu@ZM6BJk}D;sgDVJfwSh+rP6Q#F*D5$%ry5Uh>Au8AuWJUNzRMtWFg(O_`uJK zAYGgl(x3^#$1pSuKQRVGA)dM#@@-#TzH@Em+7nm%Q^U9EUL#kJg7^&FAK0)z>i%8d zU-v!f7<&BC-*^B1XC?o5ksg2QZp;tfb=(#6@)P&o%JvvYKzd@d{&*5gpwW7HQb1&?XhsPLh#C`E{vh{3K6?AOGW?YGF1vH?eA{inzDoN(PXRAtu-Zb0btfEhTEhZEFB#qXV4NWjRy40~ay*LK(!jhEhS1#y6jfB`N zmh;mLy~wwYn*R#*J+ZCJ|e0N2S;b)+OK>NjT~ z7Y93owRwIEkF2coea)2+Lsi|v+B#{@KyPE# z!P)&k+@9>hH$&HayLvbCN)Du6786N~X(O|42}MP?$ZqqTaF@uXd#zL*(J8Q|Vdgw` 
z356k;b_t*aodp6Vo}Yzm2sWg##{ko@Q~wDrqfN6Xqw_#wDFc<7AM2>(uW=>;9SMy} zq4n6M+y`ypGekZ`B!mdEe42ms+=anQy%CfNiHr|^qB$lL(d4W!ie@0d(p;uz>CrK6 z42@V$qX4c6%(=-=M$oYw*t^tNi0i=8>%OK&<4!Izuem1^vDsO0G3Q03f>zY0qosi0 z0+Xvy5f8#Uqa4MKGP6fz_CWSv&;32$F`Y%Kv&eL!ECq7RF8}gT`z87V@GE_d%duSSuH&A2)xFmCYj(YTgL~pT_GnP`ohbW&4*u=F zzdQ8%LnYr&A(mvD3#Y$M{bi)Ie?)B=QQ1*tl{ z)#dWGchqg~l-O{YJ*ct=4fmBj28$Zo*7iN@eAH4pFsinXmKskh=brfk%TiAIYUIxN z%6Q4&zIILZcPnR}2DaT{R+z$>HEw;V6nL*3=vM>%;LHw|0wdNKUsZ=?ld$kB?_di*jqi-lA}{ZY~6ZQ;RDy-qo-YuXCgDE$%Wmtv(@8bkzw&J@vA-Rs90z zzzPYG&cBJXsl|1*yqY~_zHooxk!|rhcA#!RY=yybD{EVQBNHRCtYn3mGArBbaRqv4PjUl&NKyHCB14Q#o z3+c?Po;R}3N0J(l84FMJe>)uWIJRZyBRI*VLk=|`;ch_w77tN#(9mO4l&7fT46f;c zK$DP38ULAB=~7WxfawYNiT?@&;#MCKY=R&YG<7aFw|GVwT57B`Hr?r4=~K?VbP~G{ zthW^poqW8jwCkKo?NCOR1|X{R2A9VnQQA=u*WzmHdm9Hz>~TPe#@2FUSZxfix0f0Z z72o@b+V~UY9AHQO93)4}Oju>Y>mBPN zA{vWhuHFcLIN3d=+m}Cu3BpB0kPQ+#5TOwHDI$Z2V3|PH2OAh2_0he5OXv`W@&+0` z06FhWf&(84eG-@$J8t^fc4 literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/utils/__pycache__/nvfp4_moe_support.cpython-312.pyc b/model_executor/layers/quantization/utils/__pycache__/nvfp4_moe_support.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..67cf9a45f99bd6f306e50311c3057923b093a2c2 GIT binary patch literal 2252 zcmb6aO>Y}TbY^$`wcaF7)6`DV&{Zh~x3XQNh*DHlP>4&bBt%V8%_Qr=s-05s9C`B~5~ARz+$>IdF58L`a;N8GB7qdg&bAyf^R7dvD&? 
z{L@lX#>6mSLD$-BUgZzvs^ zjjltb0eiqUaLVq(0~O8gzZ)qHt|{5nUxJ^7LBh6}RVJ42&u}W~xDG?aTUx@DlW#jd zs&L-2<5n<5mD&U<*O?GTRSzR;#d3(XNH7;o3sKImT9i00^6RylM;W#`YdE|k_&r9D zu`$DChTIi-kL&C50%K;SSeuyZsJOv3L@U&5V7J_TlibuUv3rgq_W=eaXoHe6&m&(_ zv4$hIa*=Z+ruHyRU~7?VlqNXRC|t*WoiMZPxy*80OwEc%&6$N#oyP^@ zEEZ~OY}s=QD}+=F+zuuPuVU`7M+?MS!_+Tui!O5-Qf;Bm9O4)L7yf*0O$JATW8A8r z=HM&vOuO(jJ+`flwT_K$Yoo2y(H-q56#{`S3rM~)h*Hez)Gc>wm9qg5-68U|dx@Xk z2du{ntMm?R!c|Ch2w0d`(Bmmhg^n>zC=%-?@4c6o4Vz#U`U>wDCF}qQ+%qg5O5T*c z8&aK}jvmn3cbF$ATYZ&QTf3t+p`T*|;f>H!nz18y)B`$SBm`Y^uHjmjLzCK~U4A{)o75sUG1t(x6hJXpIGUV?UR+M6_M7F%wZ^FA+M zvnYD`s_13&7N#yHzP$66pKs_h3!>(w^arnytGm5!@7^&z*DTj5#V+f)k}uw9oG$W) z5Y`s0@-35j=Cm9Mrcf^&>R8S<@W(G*Hf<04Ca?P5gE~GPEZV#|OR(i*lfTcrOt^*H zm=N>I?7VOC&c&`6M+e@S&#Khm(WD~1w4Z~lPDNU%sL3GKopnKiPdE>GZW#v}c31ph zY+-6@8qLp5qT(}R-+kV z=CE;JRI$ky6+J~obxLr8BmM&p*1_{kZaY)h)P|p?-`Pqp?Bs52=N2}#qt9aLC-H&D z@sUUIk(QDFQQtau?I*IYDhYir4vf^}b^6I`b{ri>Q75 zZ5q&wdV6(I;d zlqZLWRK7E$@`v&*hgANhPl$g$o%~&TJvH6WN$M28RVQ^9UQda14@n)el6VmzUhxP~ zF%amO0OE5ARG-mKiigFSn3OW@LHuIp7q~>v@G~(C{R#(d6+*Zl1@PpbAoDwTb6-)Q zvL6jWn`im-cVBGk@2$t4>BIMyxAhb2u@_OGB=4Qtf$6`M2pnz$4t77%Q6rR~#um97yHclW(wWT776(tTypn=kWNJ3OmoCp^+tSYS*&&1hwcfHK6 zn>g|zhg=W`dP@)7xTMO3W4VFE0f~#99aayb^!ti>cD_9gbZ^!XYeu)0S7RTg?;Fg4nScX#c*E;xUpI!7k$c%9CU?6vuaQs*G$b(^h&vm$y15? 
z=Ga5rkg>UD&A6hmshQ5K8)N313Z62iyLj4|^6{*KXFB^7zhO+x@iYcufk=djC>168 z>8@_FAJDB`fPoAVkU0cp9&xfjlL{e07EwqJ;|LOQ6ot3K0H8<*tqCZ)BkTtOm#q(Ba&(&>2&%1UCoxviX+)o z%c>9uBS|x*GeaY(j^wOk$*36EbS83rbYR-Da z%-`2_BX3j?))l;oOI4>r^152X#Lje*RKVEO8)R z;^J3@FA8mrOHH*!kQ!|D4nB*E$)0u($n+o6$AetljVOw#8dy=>n4-`ER&|CG3af_h zd;YtL@sh6Ewt;C2kjvW+Ap2EWS-fja5zc2nY^MicE;4u%x)L+c+-@(EcXc_{hzt4n?j2maO zFyW6EVc%lHx<&e!FBP=){hQ14A1cKg@*-jHa{G#(tgI}5d~0z*2^hirUS>zV9A0Ip zo0l7T$1m0#8KDmL|6*UGqRm4HpNjxy{{UBhjbJn97l X@4*J>dnOLS{s;39GtU79UhsbaQp3b$ literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/utils/__pycache__/petit_utils.cpython-312.pyc b/model_executor/layers/quantization/utils/__pycache__/petit_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e8c2c6552836f17c31d5482d7ac013966f48b2f6 GIT binary patch literal 4228 zcmbUkTWk~A_0HHce%ncG2PeFSB`g^7SlF;(fl^Tx*g!U$E$K=dtvXCRH_15TvAHt_ z967=uq!O)E;HOFZQ;;fc5Jcj&j@-m*b~+j?Q`A zGv&b?5drt;-kdNc0G(Gv%&XoW5P*HEa1!3CIOSKjss7vEsQ?a41=Ro!sX-i8`2`=W z`fvnCfx>1pJl#1Xsv+QxZRS*YSP8>QhZ@12>>CX&+g{newn+n3eX!4~Mqyt(fYg*4 zgS`h{!`HC|y>knC*B10R*p`4Nxy5?Z?$>HD)1h|X7N(LVKHc+zDw*c($mAyzXXP{R zpFMMNbo{*S|FB?bc|+01)0`drAg>m6JXtDWA^^J`kqg+;EO{Oi1M3SHCg@DkOv%W@ z%P=iP*ReV%P3TxLag!ne&o#}OlP(vuLW{^G4Y-mn585KZR*@K)cHenn;(=$wfMFR~ zt+kutOZ*Z7$McXv+#)I?rH$(n+KVhIgFzo6ogA{L9Skh7i9Loyq@Z%Q_-p98=M!{| zOM4U%$UzC0?UT6qFqrLJbovwD77(WO^{B3)I8 zrcCSDRpnA~+Qe6i*s$1FO$Y>h(DgNwq38rFYH2{vD=Jo{9E-T5Y6NF2z2piRCp@?( z?IqM@cDP+fJ2c7+KTAlS*j@6(*~!sKd35~Jk zw?9wy-HEuQe?JR#{}XwHV?T%}*86R5|5|k5%i*ej#PI>~N7D~tejVyXzv~_L*kPuX zoWr2KY6oXYzF3e=Z2@Pf+nPXemdp|$%aE!nH$Q?9;Yxg&yD64A)zg@JOF~*4w}tVK z&rJ-c`Ev{RCRm+iiiAKQ^^%k~U@(gXnsVlGi7`s;zA7qQy7*TCS#Q(fMtMgIS4}q zIU@2W?p^!Fx69#u9YF`;+g0`Ld~onf{+lUyJ?tz6%%cyw1xTGf%h~#<`Ebw=-Kv>ufk8WYui7nc~h0q zJEgyd67^WB8tbe2`(S()O|A5-MfXngN@j>EkTTlugjm&RgcP z%?m~wPuW}M%bq9P<3`LDg|=^XwJ|o(EcMSOC<`k8#M8`(GLm1$pUZCB#0*ogJM|hW zCtL@-&%aMa`ra;WL$#hmt7Bga)t*CZJ>!+LfAMxV3ZDg%^npRlCw8>m$?_4F5G*xp@3f|v 
z8S1kp&?03mqHC)b;e*s~6-RR$y3#-FWatF z<1=M{nWLp%_P?&dsC=`DKtm8z4$7GxSrIqd0_)&_nx(POfs`7O^0M#zLvPHM> zP3i%tjV8FKQA#Nk^j8lfT{Eyk$T?vDB|PSDV1S!d0woTx0(suqy}a}O&K2_D=;tRM zo>(Ohf4bKBcI867Be@*9A6hxH^7Cqk^x)iscWWK}l~FjJqrK~q{k6#cdN5HBcGQF2 z_2k}qtmp2Hr5h_()?$0=u_XN^x}8oGJ+OfSQL%E)Ng#i?>f2WLNAHG~LigVL*5C6y z)Oq*R(y5j7TBxt;?PEy~XEMzf_CJQucAf6FMXOZMj9EfMO6cK5-h=`A7yRKdT~2o6 zjmffNz+W78`7vdgP#?Jwipq`7>UIm;V_L-aK<5J~#expena<~Rb{!z}&?fZ0VvFt{ zE7n_VaS|J5-t{D*msF;&>v#W2_d48e*&h?dWEw*nB^#eiph=cyX#W#X+?wB)(y56ir9kG*3)w`08hrai|xte%* zoZ-)f;GN-n@q5<&-fC!jP1vy^0EctH%k{0A4uaQH&r|hJL56;9ZBU96jdLGztI01? zk5XUkeze;`bpF-k275cl(Vl(Z$98f1>YAZ6JMWGyjX46~pkq9>+<(8{ z@zI$dC3~00?vFVEIt!vmhZCY$m|_u%MG;8qU+P~;)Pg%5DjM#4V(3?&-u(23PM9(O F{TCTu0tNs8 literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/utils/__pycache__/quant_utils.cpython-312.pyc b/model_executor/layers/quantization/utils/__pycache__/quant_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af075be5718f6dd0026a00e480eab8bdf1f5f6cc GIT binary patch literal 30167 zcmeHw32+cq$HjqMN&6)(UNFgrX@>`*YtrvY)~S_OASyGY0#k^ zXH3|h2%JquQ0$DLY<5k>%9`5omX)fhi9D07@$6iHpdHePGEr(gHLKmN9m;aeCfPam z`(L905CSD-CfS;;+Aitg^?Uz2`W^rM9{(sWPvGzj=-%vm?`e+vOL|eCykz8_$;ff% zIf?7$Bwo@C@PnFe4Nr0HfOb&Vtz&WBfPT=>ZD4WzfN{{&Z5lLpo7uZzKKlRu(r8*aq$0b`}=~9E17Y`GW=B1w5zWy!QTDwPWeu_ma_W zr&sK6OsA`{?qUu1I46m(agxPreow2UcbBl#JfvE^C8^X>v{EYBB-?u$1mk_ zdG<)Rv^9#Z25wP)$yRBx)2HC8XGaV4wK zDPfK<8+NHQH7@GxtzzYhy{o(xSISbgt(M9$R$b|K)R?r)TiUCWmcOR!u9a52#&y>{ z2vVv*El1U|>0im^ZOC1J?$lmQDGK+Rq)N2gC>a2iV;*}oQdREqQuPw)_!`MDjvXnj z1jH||r-HQVHGTIQGbcSEtzM$-8gIT->uu^aNOgd1eni-Mz^=ud8TvSNhxGDNiPZ28 zFEx5wq&41FZwrl<)a2E8ThupY1f^zg8^*q^SCH1eX6#-kwO|IENn*qwy^%SqwGd+LH|g4)u=I~%4$6Q zdtB4Kr{+KygqU2<%8?nYT3qkrfZh3s_3j01$0Ne-1MFkc{w4b0%AJleIe>Dz*y__r z2NB;rW~e(f-gxvxpU>6XH{f;Qchu*VT)o4xtJ&}M`F$?WkmPDU<{j!eG3b#``tHzl z*YOGC6P}TgzM+uYDCHLT<+c!EgGA#Rj?%t7gRB6wsEPIFi?vVkHAAK7{ zrNR+Fdt_=NVWE;9+3m-RuP41AIWCD~xqDSeJU@@=xrY8 
zJJvih=07n!)O>niV6Yjrdk5U!vtA6%u-rW08S~1%X0$Wp?;B^;Hjn!I29il`7H=9E zldWim1;vQ?EC{~B&6_P>-izS1y)vb;?A&AY1!QNAk;He?C~<_#yn2a^QO3KL#!S+A z4Z!+(tR`bJZb00Wj2jU*kD2Ps2_ZeKKPbC8JCa7xW_f<9Xy5sV#~H?cyMrdNGafR)sA@OHGc1qZ&+@0 zvH0*w?@*mmQ$jej=PCEmJzYnhd@5mP@h1=L=}H)6uYXh?>PfOc_(~2Oo3fbhJ%-46 zF2MCuYStt1C&AR6G#{5^B`SGXux>QmSf&GjiPgV^|7pVE~4l*5;nM@>!a zQ@ZK|*+DJkQ&4~)VRXBt;U2eJHc*n2l61$0hX>?pdZ%fZ>nJFqfbDNuH!|`0ELet! zZx@2EaCbFYy=lS1IZDIJV)oT>-RgN;ao7^Gt%~bbE$DcCOW1#p!_R_&*RKsr6kkj6 z=CJfR;>vIccHnV6P>W)ZI;iDHb1G34Sk zA*N$!Mh`a0402^mmPghaZr%MaYEc1m*U6dVp6I zEFclp*y180=Ax);xR+reg7vZn6fCnv#WpO%)z|AfDvx>_UD6nEKada-Iz0nEr3jmO zH^W+6GEvz5l_&J90||rDNH9xu2*}*3(b)D(gXwgbcGDB0+C+k`}qN zH?kgllWvHY^bCT3oBO67sL0Tj5YnhKVP992`Nk*g?9RrqN5T}j-Ht&1Am?r}$I#Wp3A zItcc41e4r?fwLFR*(zeTif?b3tKJl=-gLvZ>8_T`D_t;gg(Y(Z)v>?*2>rW47j~b?q!a zUtV#s{TuDUvorgm)(x|sYvSz0f|k$g3D`v((}7Jf9VRC0fOl;CW)^bk zlfZ%Y^^Ot#ATCA46`2TT5k^U7fSpd^76qLUVgWXxpCsoqf+0@lwIb8oGdv`%8StL= z4lDsVHaO_@%Y8inR-;?3`g0&0)Ji?bdXlI>kK7b;NrOIlHnObR1n(IaQsY!tXOwrKS%x6wO%&Tq z0YeMhDYg?q5;r6SrFOyxnJ7Z0{WES?+fuQE2dTa@2*Av>Zuqd_KQx5Sg#8y^y6{qX zaHcipY`We$CET_azoWTmx?q|sX^fRL&K!Ne`*L@zWK*VTN2tVH@RA@D%?0HX_{uc)5voZ1)G#VpI>y6X9&vdMi@ z{SNHTP1MnEDG)+Kf^L^}SbfWGRV2NAQ$ir|*)fLwEiGY`?-(5)%dXNzd2>d_gDP z?u!#>UhICTlrzl z)tYNnR~zQbRxQd4S{L+Mi(!G&>J7JB+f=Z0B3lWI#xaEw0*njsCn-IL5ml-FZtjKe za@_Oy*a^)9QS8ilKriVDDC4O;iKXD<-zt3&6y`{QY79KEwsD{ki3uV;&3^7xUPE(2 zTvOH)00%UVG_}9EP|xwDNBamFrLORO#*p zG-_+A(s+V<>wO*i#zZY;6;bWbPMEzzqk~@ArmU}H@~}th@c?BVLH1Yi-}lD|Cb?Vo((tN3 zx4VM6DSoPDUbLLs7~B|Y32q5H-rNyBb@9xFGm%pl#%B({{iXSW!qBPnXQt1DPfd?c zZ;u?lQLs8{Ssg7{4SB;+65`K`(_*+I!e7`Ew^#kxQ9NA)uEg;qKVMpQvG_u9q&U(N zIdy4$w6rdn-q1M9e`vgFyvAP@F7HStZrlF-XTJB$^}`=KK6?84Gtq5Ne8wvIYe&g+ zaoCZp-%)kf$mLlVjGUvAuuk5jq7X@l%C2I}olG)kkIHr+&VgxA5XTCP(s?dp!PAmY zPVEFA(BztC+38O$z+B5LpivhgnCdiS4yu=iWLqi+`-#M4J(5AKVH_(iS1YL3 zl~o9@xm&v;r23nC=4i$i5l?kS9%a^Yu7;dzTm7a9OTf}!oul`uoz}nV{@m2ei2XH7 z=k}APCF@a3G7>L#-64d)=sZKtcL zKG-(|7T`I+Bknm{=VYWk;efiVcVHN5&Y|P(kzuHp84H%s_x26-4kz?|>_<05RWrPA 
zl)a3fNEjjT4vvBxJV|vLp`#c=y$K_NzQNJKgw}J+m(WB1IyfRfPrZJE0@eYIe>fpP z0P*)79~~YA6zf4kki9*9Bl0i=m%6+(w<43#PJWSE(H|Q)=^j*mh7yLeZoEqgi}Fi~ z%t25gbi{qyGr)SnB03NH#Q3*_`7Epe18$$k?;~?P5vA0Vq)v>LmK0V9Jdx}J+_&)G z7XvRc$$c(x1p>@x2#gYY%*U z>c-kbQQe`acnH+){Kn~x;g;zwk@A_tmsZ3Inr9s!7GEvC=D1pR?WK=fzV~vh<%y~F zlY8d#tmmE$J{uYhpPZ?@Ff`+xeeSyB>WlHb-IJZ@Phr{l?I4$xk;60mOiR3A&E&z` z1r>7zHL-%4*~(}^O}t>;2_)?KQba>mV#KQ`Ng#UW={AAV`3<-*G|FI{W7{Bqp9 zT}gL@Izp$ym2va3EgJ#bV20wR-(Pmb6bO3!(v=qIVU#9#OAozGQ0l6_N(o` zy=|`TK&Ad;h1{fMD-6 zuUHmQYK@AeCLkT=qSOlYlwHV8(P36I3#2smcwjF0fZVlfnewbRpqUi?#8D`<>Qt=t zD)P{Ts(@Q9qI#Mef?zw?AlH)UjJ!n;%qAdC*JM_eV{_uivV^dJ{WzYA~-5G6(U~agU~f&GmyEqWw%Q;Z)J#uuE9~C-*wFE z0=7O4#g*h5hRRAl)8~VYmyEiD#}w4=I&-40=Y$K?12!^PD_#C!SBmjxZ9p@%m)d+T z-KNKN+DrO#*BLMDY{R1i5;c`7Mhdb%Kh;EPvfj~w0hjkII_W3n_wZ0k!JcZ=S7&C# zPW~bqVq_>`?eO`$GTBe}$nvnvNE?$wj)24&(8adaI-N{Hy*x?*X<_Bl2(pA4 zN0MwvUdHVc;UF3CPYCiLBOD+Vrh=$(#jT>&*|uw9ylD57c-ttve)g5Kp_W%C=B@$ ziR_c9>K0LdcXQ{oJ(}ZOkLCr4M-Y>=0ZmeKU=C=fHE&sT$eYkYZr4mh=6*v+qv9z6 zEFy+pdm^Lb4V4|Ep1M$LKe^zr=LEDCLF!DVZNz#n_Q%rp>BpEPeAkiFl z8O&Q@alpa*tF%OXzeQ z<28;}C(LiHgr>xtQ?HUDyhpGic(y9ea8Ls;(tgvDYs3G0I}v=HKfv+CS#U2vLY&}( zn*LPRk)3?fO0eJL&+|O@RU`?j3^4M&3T&M*Akk>Po~dk9vJ>NCCqXw6!*E9Uykxx4kJ<=x3<&+gI3eYt!Ef5(MLS{=zmm&+Q(Okj|_r~7)m zO_#-lUiJ(fhd<1KcSsR4^nP;Uk&mG?q#M#r9s^-X=mz?rFk+SkWhgO`eYj$#P3Q(azLN>-koSyYQB^>+5On0PpcWqy z`jnL5+Nq49>tRDDzl~ydseCPF5+en3o8=Jyj`JN~r0w0!mo~rs^3Bqwc-~{N(xyJYO+g5wo}=9Wim$B3{3kg#n1YO;%BIpKwqKS$wkkSco@_$iLK3&A&teiV*R{T`3quZ zH|~Ssa+vQXhrBiXhqagal_mF3xTjh%w-V~C<2&e4<{~2$ubFXW0l7uJuLwO`k z+RmL$Wky4-KgGnt<6QTeZUP>tX_!YY317glTlVc&zN~IAf%W-6PUleE;)ixk}pxf ztUw@QsfXfZJ-A2YkGQ}9e;P(O1$Pu74Lnl>KdgABi9g{`)|s2+V5};C&x!E{=ycg? 
zbJMCE@k(jzaUXCsRvA-;l4GYcydr9=iJEHex69JfVjXNmN!z9+vw`u| zkJgbIY85+#(CaN*ew@x)dMW&!N(R7M^r9O$j5U1ZYWo zizYB@sgiRJEmh33{9SwlbJxQ72+9g1zx7P_T+>z<1dd6b?c-HT4~`16mj;GV#v~UW zo7o?$P(=Sl-AS#dR=|B-laivKeS;LG_345Ti%)YyV3;!2mOaPX(y_HAzfT_#p@496 zQq41|$W98qu6{tzRnclcb-f-HC@TqCngZb`D_TPW-0SK#Pj!a+!JA*}|F|={{i(R+ zsZUn4CRbNsF<)nrH&Xl46f_`!3&+SnpFaWH3r;BXK;Q8N64aRL3f{4O1F*u#-vMBq zHH{_YZ&7jj2KnDnKq`fVz%T@m#ZV7C7Uki7cH|&oV~fX~POz#8M8mwkgn&;@A`_g3 zpp^z-M&Im|!xGf|LS@ys6~dRKdp^>Je$nN^KlM@MC@Q}QS;@Z($YnHqT?l>#xyj1w zYrkC$uOof!2masLb#;tIe{hr>h-(*2ynZMD&Jjx6$$zl&1K-s;CH?~)0qo@O8n~*~ z>1EN4dnflp26Ggjw@h2YN8k@(Rv%1-rtVBg6m2vSGfueL>Hei!r5T0t^~OWmCcrm(@=l{G_68rtz9JUh#O; zvh234G-|7yU%qBqn0Hi0y5o-KTSczi-?|Aq0IseZ~=C{u1n!Q2{VirKRuK@T>~Tqmk}nnySBMhykJt(GO>aU zr&Zh;BDwq#s(PB%04;xzCNVE$g1xEZhPXVc+ZCjvA>P6NQ(D(9brY@H$T5`fzV)degSAQe>t)S z39kX&m6SmVv-dCv<3l0L4}<8{;qZnQDmSLd!q1!N2Fwim@WbMYBjaz->%XSp*AXO4 z&~iznkdseP8krj48~%F~|9uL6K*4`PkPwIEzT<91KbA0@bdv@w!Moup?(UP$Dtaw5 z)y7scQ<|t~RdutnP(!fKyK%Yoi?eF55^g2J#c zy<Sxr?24M`^44kd@94j8{hsyu>G-Cu$$hsij!+%kF6S&YQ0m7mbud&5 zc~gDga7+!vgz`CIbxc?t7i!500LX6~4mHLs6?2yQn58~$X=J8!v-O-QXbOu{rnqS( z^iJmtK||=|lp$`anm6Sw%5$8v1TEnMQlLCqwkW?)yWV#20b&jdLo>#`E@3MVBlm-`oCnK$fjIFOq|@KXE0dZbm~UIm*Z z=?JgMnxOp+O;Gd7jv!(uQJ1_0HHr7PFM-JlI(k#0)&$|ulr13*FTp)pGN!aY1eK#U zYQHzNpC=w>BY+^v+?EJ~b?1D1`noq-tbtS9g&b(nJF9(CM+9r;|&U8#H_qYorv zWId8_Jmuk<%Y6%Dm19Mu>_Z|1W((%(Z?K-ej-HffRgYJ-C%%FQe~DxupXH><9Q1;0 zL;WPNW<8R4#iF+V;2IT&v5fCukyRC5Gjd(`=YoETREyf4`kSj?0Vobsg+eY@Ep;3w zEJOb)qg8VA&U*ZnY64L{j0@Ous71^}whX(}N);yko7vNhI!pl}Rfq74)S-UOgqYFa z9AgN|yJ(ai^z~RLnt%m|Ck$k#p2$#n-jHPeI*Djt+pSKLG%d$*?Et#E_H%FKR%6CG(R*58nwpY9u6DJ`$G zVT}HHwUA?Cu94DTN%z$fus#fCi5bnP1!JXyGR{Igkc7FV-#S~@_}0#(AEAr+5iUL% zIp`U4of(!-GL^*%kB_M>j(NTC2mw#-mByb|6%mYIcMhU!y2OJp>bidqwV>^Oe+#9vPR z$De+>UGB#FB58>@0HzBIBl2}9Xn~u-T4g^$$VfqL7FA5{piEe}5|bmby0@09O@q2p zR56zmnYz}A`eH_K9t38Tq)mWGhfLoJ4Vs7I$0#5!6y7Z{ivIus4s4O7^T5a@%9}*;rFGLX(^zOh}}GZYN6Jew>yZP3~!0ZGaI61t#NzX 
zREL^a89p50;h&#Q*$@FxecawKXWtOBZ;0DBA+MltDtX1gA5y=Im{J1PU+-~fKi;Ho zXr_R%;4e@t{h?$qW=^OHgJR-V{7?-?*~xvzh$qaX$binmE6Ke8he>*PbV!oDRFrsz z1nyqwO_;EDA<}wail$H*!hB^2ZxASrqa3H;CIx?l08WL(%@d<3k5J+;ON3J@%+chj zY9^OeC8~uDnzS@y$Gq+{?(q@DqW$|+>W373LO}w74^}#Ekv;_iz|N#`+MYc<@*O~* zr3rZ!>+I*GRN*W|bD}FIx^9Y9P(|ns^QKN{8uW&!SOt}dy*%6>U9k?z1-$|8M&>$r z1ySBQs660@T031!LW1MGaoRY!Z{AsS{+a1#Ug?6g(0N<5%!w;v;)aO=&2jV;(2HJ6rBq*1a_?+|mrWBAMu+GZt~n zige0&q%~T!I&P_%;-M6=6`t!0_J#RyOGLV9ZJ4vR#;mQAkKeKub0(CJUaqWR)V#WVJ1@tWEC z51Xzw#akb{e)vXfXVlUeE$EytC<{LmD_A*~-w?}hnAJt|;V``Znn2P}eAC`o{@$rw zpW2;tKINS~7x!J*7dbwwy)+nKwk~d8KWA@`+1sz}yFM1TAD**69kV|jw?9K?c8fo^ za0Mmj>!#~&*$YCC&+NPA_)gb_u^;IF!2FRpR=7WIKd@v5X^gB>yHEqXcgdpf30F@a zj|(g2gsPZO6&F@sJ$YHUekvyHgKa-)_3we!t1m9Bo!oU7TC$ZhLbP@}$%ck$LkPVi z_bhwOO#S<7F0Z*}j@LhS{Yb2SZ_K_osK4zfymaz{FncQI*gR+77PD`=))%+${kSb= z@0!wMG0YjuV#czY#$}M=rpC{GDfp$BX~mQ_Yi0bcPG@$aZCgoL`{vM;_7gfCgt?p2 zlgHf2;K^`n_;k22Zmm(~J#Jk$rA6(Srg>+@oUbzBO#xv!oB}w$OaqXfMCR+ldGQ=Cimb17I6JtOgoS(CVwGR!v3#g)G%Cq@UI!M+IDu0uP-uxtU%PBk!pvpFZS1vafs35k^{)E?M5e5&syYmW&|7HbF;A{Y?9W1HX*XNK?uG zfu1vUmqOM4gkBk?dx>IyO~Id1@Mj3vWkYEdm#XA)(nzHo*i>dj{tP94NYt(b0V8&f ze4@4R65)8;ijvbZd$fNou*KO zaL2cwe)ri+&(5Cu@a)yIH|;wo_kCit&)Z7Foi}VC)HqW;Z`n6zITW)T0`&?L&CY{~ z0O?U%6_9)Fy4hXVoY#eq_r+_z5HqcWTFw{R6W)AVw1=ugzVM#N=BU^h)ip9?#YTA1 z30&fCX}jIq{s4qDp-KC%=F&)|9TjuA4y0Weme6UVWqRNP1pOEZHMwLXb>qQo3|;mj zjdKbKv1(Y5f+qgjm`+z?0ZoPunSUsR_F)jcTA1(u0JYZ{(h6Z5lai0)t|-6aX3jLj zNhbXF0IDqaKSB&fHbB7#Q-XtJ_5oTYFUfeze+%%xr?vhjBH*-D4eqDl6W~G}amEX3 z<_hX#1@#{|KPB3$R$WrhDqlyZ(oAb^vx^RRY$?hIEDEtFZDvU?yYv!+kp;Mx z8LZd^S*i1*pi9{gQ=#UVesy^~NKu+=5&B65%${8IE~O;k(zG)t-J||v6zE$YTq71K zVSPNHwJAdfC{1$9z~EC%)ENFUF!n$^*L`Kt*1(1`E^=cwyj7~>{3On@CznGtPQZ$4 ze)>0~pGE0ZHKz7Q{mrd*P;=>&Ei6Ps{{V7-seayJx zqiI#iP10?e*(aGXg~MQ5Fb%$+04md%QWPnQmek%5Yoof_s95`Her2RKp5FjTNpyq` z#Kek7%MEd5RJSrJu1uL)ZVP$mYJxRy)}3n(Hpj^fLhSgr*T1{<(pJWKr_3*rcK8a< z8loi|e`oajf$s%wS`JR``82}|H!oBWIvjd2EQVhE*5Rmgb<9ySvuW0Id27tEF=pBr 
z)HAm>Mup}q%V3EKKYj7}3(sF!^?t+UhWPRgam&Uz%a)jB%eD5n<#AYNMEm4HMkf~S zT-ra3MBwt-f9}rJ{y)ES=?gFpzs#Mh5HofECXXiki`}^oGBzkXTmC6l44hLI?`ZjF zNR@w1!T+Y0XezAX_OSJru|X{kWg|m9GPwsVUUIvpPDH^IBKOuk%eHRD`IwaT3FI{QrbLPC zhsd+K{27t1CLuGIb~3m9N-6(dl2EHe7*^4o_Dn{+&DniOYtnO(DYedA_Q(edqiN!l zA#~=j5-BXogX_AYO_6Qf6c*s=6cKI=gD^Z{N)z8jWV{4v|3LswAh`m$W^lMbR{(&e6^ zDRQabPH1W^BDh)ry1lBU83u@_b7FZ+EQgzyVYVSwyDeI={f4-maoh8zeRHNmG1H-W zsC5<6-W%ziISJ{|EI>LW)$Mk8eH1L8%dd{*S5F?e4F%YX(=W!ItLB^yF=s>Exn|C} zF6LbK!P6f;fAx7N$tJsQsg56vulYd#q2;ROrezy24Ne%91fK{WhME_Ll+rASZFxi& zv#p%c&6|qCn_{NwDeHW7&6NHPPs+dE%KYn%I0W>$0CD)y7?CCL+#=kTQl*h;H2W$Y z+NCOp-Iufpp zbk20f#WtwC@8_6OT??V`qsltC*@Ve6a!Fe1$t7&#PIS$;GSK(XM!Lm@xo|wZq;65k zvhR#86~Ck|FJ1pGItH6IK$1!PTP_B_HA>mQw*h$9~fF<#O3d zp`8o~2XYZ~mzj$JI=DMTf#7#bSc%!mWDyf}ho((ARPzjCcStpIhnU|xBpzVhq`_Aq zK_;N_3e~|erFC29f^u#XlydOYerbX!hvD|ccp+kb5_*p;d&UyxJ!gBoBXlNRW(dqf zReqHL$nek}P?#TLZR+&7RxIpcFowwkUI6>3t4w}TgcpMcb1URu2$0OA3x z5vk$rbTIvvv+VqH)6d(sBq7r{EIfCTMn#;`!CUhH0=wM8@ ztFZmhgk(SJNaxZ=sCnedm7Lvt9PrN7ON@#-hPawBHw+s)bu7U2I6N)bdjjpsLmr9! z;e%s1U43#*dfT#77@ddKxs(haUEzvb=;@{l4{p3T;T>}^2Yv=ex4yF4;69IAYj6n$ zF0G-XA;-p8{;sEw9_%=B#MSKT#GgMnv2Uo?E4xm5W%x$;nuKIOWSq|c((tj@HK~Wa zvgWtogb1$ZZBk6-O0lQd{q>8=umX!98TLVNln$1h=^G!%-(E=GCE}uka;%T+#L2+$ z@jiU1a;Lrvz9IFj<$C3|+wnum(zxsiqs6k4->O{a-#0XZuUBpjv9;hv*J(3}2Ii;0YBd7`thYI&vOY>zdt!t=sUEtPzEj=}_*;^)<>0Z=!x*8LDLd7NL%5eVB*Og$GwZM=Eh;bJSF===}dXwJoAO`3=O9-usM#Hak_VJ-JH04Z)Vk>+u>}8E zf`1~xACSpBlQ2C2$|T_unS|+p8Z)RV>dMLK;+%i~u*lV@GC%|I0wSPq%aZrMzO!ns zY)!0e%?)u)%BNv<(x+h!d>ZnL&zDV?F$;g_@T*^%hc(7FZBzVG^w5Ky+d>ymzOf^! 
zI~ZMdFs?f|wc|D%3}AqMHh01z4?9TVPoNnYI?5dlC1#bcD~&v|aepEI->id-i(E)rsq#c;4R0 zP8{ny_j2&%uqWb(bVN?UEOj_;Yo4=hiP^TqZCmGT9Wh%++_nps2*5#uKHXNV+!r1( z&D6$iZBbJj?h8;v57tUaTx^Z%S`{uAbDMey?=u=qOCpm2&*{3FEav0K__P`#7hO)_ zWl)?+jiec}f$}^+hmULFQ%Zs~zX}Yj?dnvxFTG4eHerOz53SLJlg^(g-uCVu&xq$( zAC6l=%dA|3AyaMmlQ8ylmYZEqflKUCz>Bu7Q~i@O`1>;IADe++Q9YLAcW7;PphBGW z6U*ns>X=x4Q(QG~E4)=)GgrJVR=h1F$^Ey) zx|#a8xIU^|&&D4z3@TI(iq>g5JL~qy_4H{r2$%yP2jC`sO`c;tGR+Ham{fh|e9PKe?m)DF_;0MK?!yK)I*hPJcY62}9=dL%lvc;$tI|u(D+JZ|C@w zu~lFRTPOc;jx16=#JnWTy(62HOV$$R&O@8ryUD(s(338I6Q!(EQj|Y8LkLYSr(hWc zO%!aSU<^Sb|Kt(&H(C_(bc6_IqTpob$fiZfV}nTCt^6T{OvpllT$~Qj{1hl%=burh z3dxcIXg}F?8Z-Pj+2j>4QAs&W_bAJMLA~5h!4V1=Qawj89|c4i6jS-z6#Fd-Hc~K6 zK?4Q9OTkAJL@Bs~AYr6YT6YZN#D?owOY0{2e^M@~I+@{{Sp=D6%k-c!X&IQ%%6NZ< zAlXXHx>!G9+N%7~q3!Zcl)yjE_ZL_rxC57`Gjx30Uvbqx;wpc{RsEQ&`Ag37L(cLe zuJEt8_8)N#Kjzlo#=X0Av}1M50?(}4Ni$vjhf^by=Fd$UJ}*@NIftLm1qa_8D*v3r z&)r(Cv^vrnk>e%xlfq5T`AKnQq%!hUyttNL3(=@X46VG$7ymC!8IN;r2<~Y)P00ei z-rbVUEu>ro$y~(lHgQE2;n9l|7ba$;+5L0t_Qlrii#zv6x%^LxR)oi*^;y2a)kfaYe)&ZP@Gq8&$WT8~bO3no2pi0pay z;H85&n8F*bjo+i6&l=5q-a>5+-^I@xEw2Y&2`p&o<+h`IK~E8&d=RVh>GB05do^(e zacVug@z8m3*@b0MaaBye`kt9(KF@Qj>*tqOU3}rf3wO%Jvg^+e{9|U=<)+mt}Ibf7v-F|!x9}8EBH03 zMi6Ih;5XSxilLK*3ucN5G+(LiQc-l*!W%-~^8?cZ3ml>|>)+pcdFwrjF063!j?^F^ ze%tC;(Bl_%=h+vG6f+@q&rGqk#TmeeGhqBCfhnc{Q*7Nb0E2tNA=?El;tUwSNnna8 zz!ckV;Mb+Pfj9%kZxWbd3NXd;3o?KaXTbPP0#i%@rdVSG-xl^>9JqjQM|5WN{lMkG VJ&G@uA9;7a=GL>~=>gYP;kE0CqF zTvC-vzOUy6QlRXuORCOEOX&H&p6>p7y1!R{-Te1XX8{E%z`X0P9HOYdMMO(QD)404 zPEo6rKnZk!3euA_jj$AAEG(=NSi<#>+-fEC&n(NbG&m6a~HKZ8c*RDuCi%p)&t_GAc%>f;wKIdY3vi;y!VIH4Fu6x96{u zzjVfgtzZnb(0_;E&R^^Q8eLYX1G=3;{X5iTm(Z{a_1#A0f%$1xB(%WDyT4^)ZiO))_y;%U-6X>$w840LzD48R3*$Zb4{p5A=VLF7`4DKh zDq)}4BOd%@|Ho!Ul0eIUyCkw5zNGhCHs%f(^WksVm^)$2eL|PmFT5Z&2;J~26lvi= zel4n67*^}~U{Y=E?_>p^*Vl}hSV&el z{ElUE@P^6T|0NFGOUvw#FA(re1;jBxY;TED)E^E3$nn$j(O5Xhha*uy88P%$)N7Ip zhr+>#Kj4c&te<41p+~SIvkqu&tliF&ig{KldtpvTyRPX&qu~D z`ywJ>MOuRh^fVKeg3xo-Ou!et><`U|5+9xR1w?_LiS+TopzONri}J=>Xzx&n0;%~> 
z4Df2fFNxDJUJ_^h(HL}~kNu|kK^n>Et&*Vs4z~=f%~*GX+18`F>sQVG)G)&4O52mG-cvNaaw{h^p_ zhLx0?rsrcY&rv=a@kvopo9SSXkIqLTVJRjGcTKQ9dUHND9|?$Zfl(60squLJ2j3M7 zM!F*YNJq>k0qKtG^N0KUt{fJ-0{*G4$c@;+>V6(2P z$Q}Nhgq6DHWBx$2>sp_$k0)SfopuP&WWlpzOP@txm-eNAFWl~!# znBt~5ebYiw3+A{PtO`BmHOr=0Shj$Qp1v&GW&&Yfth?9CNPD4Y$qSk6nDJc^`K#AN z|Lo+i8!F+*qpgrFQduiiRrVo`z0xVF9PwQG(&uX(u}uOGERA!u`iL-w$WOlpa*_WUW4q< z=(7|}mhKG?Ont5!p)VU$Q_~Ve$k1q!XcovmKvJcLRA@O=dI(oAK>hq)l-z{_wf)ert zqDMRR3crVqk@f%Xzfga-NF`mvsh$%XzW3&DeShiuYrb{*wy=JD-FN3iy3h-6{ySlZ zKWz2VWLRECHV4FzY>A@xC!2woY>|ASS?~mvap9c3sf@~zI~3_CyufLT9!BD5=v|4c zx2`Q+dv|cH`vK=pGVUbjhSE*f@J|ojFaGPwpH+TV{Ii;eJtv@KlN(Ml!wL&I6h>8q zN_GXpvwrwU{+eHiU5;)OGF&ghT}c4G&LuRs$Z*jFdd3ukwo01A{&pk~Ty{qhpr4qu zJ?nx=FfW*6=-rWI^nHSQ11$6Y#V1Ccdz&paB5(YcL-?j*`8?YQ!#bH!d%Bt{>MGQv3aWqHggQ_0rIUo%|GubYN z=7XZ-i-}&FbPP(Qm$0}jcXs*d%f3)Zghh^mV+ekyU$`zix6Qb07DGZ5U(#!rQtZA=QR$K;Yoi+apa+L1$uy`MTKUpx9b8w8+0!0MEERZq~FH z(%so7^65fBoL#WIHwK()j#D$pt>!VQ4BjZrX_w7G-*uHkXJDfkmYu{%_3`4tV9!hl z-upw+DQtkIMsawHa`uwM5_vfZjZhBva?mY*O;ru<`OwUK6gHo*X_*g*uyX=d6sNC5 z!mx*uPGie6n4QH8-e%&;Xa?~e=mg^oBy4UFkg^ih_s!U1cT1Tp+EitrJ)v+;i z@523_dl!FB-@BCReEGAH;YOj+g9aJG=zGYDgJRIXW*Tz!(MCt@-U-=Zf5AY}C?mmmX$s=rul zeZB|?^z(%P6sBr4ebf#3hwOsN?Jh073=gn77? 
zY5Az&PQga^Plx|%^e3YaJuj!&I#?FAMb#Qt{h`P_cue7W*r5}>Ya`3g`(zcEl9?&L zFDg5r3RLBZvDD`gRScR8_R8P>IIUH9Jm?q{>fiQfag z@3JnQ1mDt8ndrSWur#pdNOCPnMp=pc6N-zOJE8d}Z=8~5p&~df4t%iR;ESaI)QsZ7 zX`iX;10$Nn@A%LNRQ_k!3OB>>p%s*)`I%32ShJMLVpU`<*F4~i5?NDBkpPY@9=a}A zX)4CRJCgFbBogS@xK%S!(v|!!iek6~Gcgwzcd0|%WP!N}<5C=M=|tS}c46F-H?pOe zo*u*8%(=928c#5(#RjH9+Gt_9(CuMj82lhfe`EO=B?l6kxp69nArX@1FfOtqZCiGW zE;FR1*E*(HAqZ=RLwwNRbNxC|HeaH(lP*K%b;zX{?&{Qnn$b=vJROuItbmIoSHCtg zGRmJnH#Ev$7$4@x&z>F|yU3p!9vzhiuo7x5Mb7Nk-WrcU^zQ7dG98m}Miix0pjZvC zPQkBbKth+8mS-7o4~~q_*`{shLf#*Y1f&~4_Fuv;dKa=q>bFIdr2{mc-I-=vAF!>z zag@DxCgUi%HL^6aR=jj>?dk)ECl{YWJ8}Ghqb1`iNd(pzq9XkdJcqw1E?cczsl&S8 zS^Cbq*h$7wo^h1I|1C$=yJHzg(UU^9q#$9>0YLCt>_I!Ykb|J;pSY5L!hd$<=Oh z<4I;*nM3G+B+dAcIs7>k1m#rb6B!#ajVmSw0-2F!Ceiv!0f4;OyZv%n< z48Q0F$Ux_pQLfTk7nd&H`rgv_(#~Tk=P_ad*6iztH%{G~cw*SN!S@c`8eAG&v#!(Y z-Rm9hZa=mG8AjJ*bzm(+8B$+d0LCL2a zGX|0@`WG;+TmzYJ(Q_9w$K$Z&Y>{o+1|){8P0~Cil^SOsalMARC4udT5_VazZDWf( zOOG`L*f``{bTxFNu|+-$L;N2I;1E^*C5pPD?^%UGaOWvA#B;Nd6A|eMR`~cr3d3b)GsYv zT5DZzTz_@L`l;itExzUSE2Btv%^%1Dq>f9DP zIKznGyVbGh>k0b#hCSKiY}}4P@i;T5JCj=95Pwxc5;kYSsfDw2A9c zm%J{grgmV1H3f~79hVQGe~cOGA+N(wK@#qv$);8MAwmhvc4#8P*zNcNb&|K>`CbOu znUQ^pgw{*8sey?ouBf(a(E@ZHMGkUeb2}yBkD*mBO2RJy07>ZEAqiVWE%?8ra)|1Sqf+SUU_M=&Kv8pSIs^zjyU+*QV>p;wgxxxXPAiL65H6 zKC<7j|AL4vT*ly4dofFPITI?v=hBq~?rl=-G({R_v=En+7}XB}z-Xc5jhm#l zxVBlc8Ud1V@JUs&Yy~p)h(e7AR|`bzq5G7!%*G@mMwIsbq z?gt)t&ukU8B-xh7EahzYO;PLTjqa z7MQ^T)HoBD{ZY>}-cs=>cSgjh$LE22M4kYt>me6NJo8b1Xx5{$p~pYt(QhYsd?A7K zMATKM9!2tmW0yq(en$NEC%As5Q@&2s9L;{t(lRAR`YO%67x^Xy&5JJ^2M`*)dTC6izcfeL`p>0bzuSc*4XBJgc_-A##Mn2#h9uchXyG~9T8V$%)=p{4E) zLA;wClQ1Uwr2v`qrih-G&8t-Ebt0IO`WOL%9C;<{ylfRo~If|uIE4_sl z<3B(X@Jk$&!)@FdYuTST@heBuma{JHY)Ls=9y;3|wjY1kKD1R>_pq?xVe`Rs^O;oh znP2uiY(AIq9$IF8Rn+>(dq8=%v?oqHbTnmMl%pca)oqoxZG~rn_+ND7D~c}e&6HK8%l4(p_HE3j$_`~Ml(P~}a9X->ry8P5y& ztjn|k$Leu_l)C1jLo-3=#KoynV>212gJ5rR9Ijktd_r?7FILmizNtZSZ%=` zXBHjMLmrs}NzPktT;s@hII8J}D{!Mgssw4~;EQ)8bm}1%j`|Z4IVLSw;6e7 
zX`a)iS*e4$*0bWUEjZ%b*Vh7_QVKeSo6`?Z^Ymv%?X%#FX|4w9pgfL2+!1$bV_tAU zG~X33fVeml=O8w2i`ze8v{*Q}bQbks^&rqAI}LE--b8o|GaqJCm`!6w)QboZ>?LqZ zkrS-O*@Ng2Tn!|rS`tQzWeeUB2*_r^AC%BjBR-zDQ2I-Re~cNSqBE)VFR<)W%#f7K zDr|Z{Q1&Ko;)_4ROm`FI(jD8`_eTBQ#A)SYkD7Wy&$oMAhrnN z0rgL8OoMCbT;k-)sjP!)XxreHPbE%f%c%N&>AJ2|UDsw^_q|tBb-hWpCe!5osN+t@ zkGi0ssrNzS=>1cFefDQ(|MloEkEaGllWaY>s13akH%~j7QjVr%b8oV53<8A>1xdCR z{Mn*CnbK&Lmiwd$7YcC*51ma(i;2;AuDMko+UM}FZlW<;k@aU8T z)HgPs7sAs#FQJz!7rZijo}U;V8$Wj*PBO#cfb!eYFTO(>ED zl5Qi4*$;6+k^sCzJvLb^EKEkn;dJ})RQvHvQ|n{<9!tTan)<8-p5VND_GN7du~d2W z+QF5xPw;(qf~LG3nf+av&I6g|y^o9gpiN=ft@u(rYXKPA6<4e}RvcLyVk}kZUJa}S zvUbD@kO>A>2G+|`r7c+wiyTx%?dqkKOY0|6<$JSEEOJrR^&dEHJJw^Vs{PqQEGnXa z1a3noQm)1<(u7m-g2G$wCHK=(z?6?A--+D5ZAXa8jW}yx0YJ_t;JGsy`@ek35Zw>u zUK=O}w75G`PU~D%uh!C{I16ThUSJ^j298Xe!9E9}2B^E|_n2pwqxT5vUoew+2^>d5 zyae_>1Oq1s9iADudI(JCUxfSYB-k;l4(TO09eEjGCG>F~Ayhpw1Gg~x4zeMl_{5M7QLJ8hOQm&TF z-;{m<-Tqf(&mRB)`U0*6m0;}s{X^;6&QxvZ1Fkd4bSAk@V5tiC>gy}7e^~xe&7GRf z^7h5COby(PsLTw$xHz_aG}+RZD(y=#{TXL%+UZF-Js*~*8+%iYy_?R%aCYsW$~^0P zfC1q?q^%@rtIt$4t`DUu+7cW@0G9{XYm*%8M%xvh+7H9=z9<(?hXVol8wmKjjA*BC zYFa{$QpC?5ftZDF7Ti1uSxx#rX6QPRV-MwMk|QUVid#ioHpQ(wOp3SP-|tlJ?{^X} zO})S0si{HCa|JYT%^5zoT|8}<)lz)VZ0Z&~2> z99%m}VE3b#UBnEd%!+h`5Q<=S1G6}0Z)1ih3Pzj-uJXks#TFA4guJ{%2g~fBa^vZ! 
z^ecD^j7~*=4os3Y(=`2CmZDq!mU4Vf)%_=G&*xP6-%u}oPK|s{z4AG=|8J->pHl}u zrv`!XtLu_fIp{}A9bDc`Gwu}Q-el^(g3Ag$8MZuOyX9PRE{o}kHsIz>cJIdM#;H$7 z?~dMY*=&C)#lG~F87e+vAvRN$W;;`C=Z3V&9)N3jv^(pv(MQ*tpHT3~mf7edYcYh6 zAp8=&K7q(fG-7@00wR5n%c$zk&8jXK(QoSeH|qzIRBhI3q2Zvp7|*P;W`tYzZCv>D z(%nm`mVQ$3$WrjbOK)F#Vu6?0A`^X*ULIZ@Ss8%~7XTw2d}a#@X%AedT5Vrx&zcc_ z;o!aTpV$cuT_luax$+O(x8a6rOUlyv!~(Ul&2G9uCy(&n4vhF`%?O`_{l&lY6L^;{ zpw?GM8G4A$x(eymN9uP#&#hq9AV!K`WkU=Me)+B6U;6#59kBxSkRi(<=Fl)FVlIj+ zy4Akao-IVIh$^kj79&(fSvfKh<%l&qX{$~S;2d&@8RQT%$RTEsL(CwDm_iOAgB)U> zJv7`1P*(|HoixBeTKzT~Vi4ga2Pj!PVg_l%9NM=z5p!u@REXFJ&C(OJP6QF13L-gV z2x`zFs6mLJ1|@<=iQRx4Cl&s5XfJmS~??itQeI zD7q#Cflx8$+A2u9DoLAd)i_f*M01s4*0R8bcwdF%*I-Lm{9s6oT$+%vnW% zbEqO_5Jk+OiI_nWF@qvv3PFSn214vR^b7PseLVrrp@*134>5xtVg^0L40?zu^bj)W zAvQx-)9sncnyeY0FC0{!W9P~9k;R#qUOu!sxH6b@HKZ(_Cl;vn-v&jRE_h5~o_)Zt*lFzu literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/utils/allspark_utils.py b/model_executor/layers/quantization/utils/allspark_utils.py new file mode 100644 index 0000000..4c32468 --- /dev/null +++ b/model_executor/layers/quantization/utils/allspark_utils.py @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +from vllm.platforms import current_platform +from vllm.scalar_type import ScalarType, scalar_types + +ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD = 1024 +ALLSPARK_SUPPORTED_QUANT_TYPES = [scalar_types.uint8b128] +ALLSPARK_AMPERE_N_ALIGN = 16 +ALLSPARK_AMPERE_K_ALIGN = 16 + + +def check_allspark_supported_dtype_shape( + input_size_per_partition: int, + output_size_per_partition: int, + group_size: int, + weight_dtype: ScalarType, + act_dtype: torch.dtype, +): + capability_tuple = current_platform.get_device_capability() + device_capability = -1 if capability_tuple is None else capability_tuple.to_int() + + # For Ampere GPU + if device_capability >= 80 and device_capability < 90: + if group_size != -1: + return ( + False, + "For Ampere 
GPU, AllSpark does not support group_size " + f"= {group_size}. Only group_size = -1 are supported.", + ) + + if weight_dtype not in ALLSPARK_SUPPORTED_QUANT_TYPES: + return ( + False, + "For Ampere GPU, AllSpark does not support " + f"quant type ({weight_dtype}). Only quant type " + f"({ALLSPARK_SUPPORTED_QUANT_TYPES}) are supported.", + ) + + if ( + input_size_per_partition % ALLSPARK_AMPERE_K_ALIGN != 0 + or output_size_per_partition % ALLSPARK_AMPERE_N_ALIGN != 0 + ): + return ( + False, + "AllSpark needs input_size_per_partition % " + f"{ALLSPARK_AMPERE_K_ALIGN} = 0 and " + f"output_size_per_partition % {ALLSPARK_AMPERE_N_ALIGN} = 0 " + "for Ampere GPU optimized kernels.", + ) + + if act_dtype != torch.float16 and act_dtype != torch.bfloat16: + return ( + False, + "AllSpark only supports act_dtype = float16 or bfloat16," + f"for Ampere GPU, but got act_dtype = {act_dtype}.", + ) + else: + return ( + False, + "AllSpark currently does not support " + f"device_capability = {device_capability}.", + ) + + return True, None diff --git a/model_executor/layers/quantization/utils/bitblas_utils.py b/model_executor/layers/quantization/utils/bitblas_utils.py new file mode 100644 index 0000000..62a4f90 --- /dev/null +++ b/model_executor/layers/quantization/utils/bitblas_utils.py @@ -0,0 +1,229 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch +from packaging import version + +from vllm.platforms import current_platform +from vllm.scalar_type import ScalarType, scalar_types + +MINIMUM_BITBLAS_VERSION = "0.1.0" + +BITBLAS_MIN_WEIGHT_SIZE_N = 16 +BITBLAS_MIN_WEIGHT_SIZE_K = 16 +GPTQ_BITBLAS_MAX_PARALLEL = 16 + +BITBLAS_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128] + +# For dynamic shape code generation +BITBLAS_OPTIMIZE_FEATURES = [1, 16, 32, 64, 128, 256, 512, 1024] +# If want to enable high performance for contiguous batching +# Please use the following values +BITBLAS_OPTIMIZE_FEATURES_CONTIGUOUS = 
[16, 32, 64, 128, 256, 512, 1024] + +BITBLAS_SUPPORTED_NUM_BITS = [1, 2, 4, 8] +BITBLAS_SUPPORTED_SYM = [False, True] + + +# Determines the supported quantization types for BitBLAS based on the +# device's capability and whether zero-point (zp) is used. +def query_bitblas_supported_quant_types( + has_zp: bool, device_capability: int | None = None +): + if device_capability is None: + capability_tuple = current_platform.get_device_capability() + device_capability = ( + -1 if capability_tuple is None else capability_tuple.to_int() + ) + + if device_capability < 70: + return [] + + if has_zp: + # AWQ style, unsigned + runtime zero-point + return [scalar_types.uint4, scalar_types.uint8] + else: + # GPTQ style, unsigned + symmetric bias + # TODO: once fp8_bitblas is merged into "gptq_bitblas" we should be able + # to add `scalar_types.float8_e4m3fn` here + return [scalar_types.uint4b8, scalar_types.uint8b128] + + +def _check_bitblas_supported( + quant_type: ScalarType, + group_size: int | None, + has_zp: bool, + device_capability: int | None = None, +) -> tuple[bool, str | None]: + if device_capability is None: + capability_tuple = current_platform.get_device_capability() + device_capability = ( + -1 if capability_tuple is None else capability_tuple.to_int() + ) + + supported_types = query_bitblas_supported_quant_types(has_zp, device_capability) + + if quant_type not in supported_types: + return ( + False, + f"BitBLAS does not support weight_bits = {quant_type}. " + f"Only types = {supported_types} " + f"are supported (for group_size = {group_size}, " + f"device_capability = {device_capability}, zp = {has_zp}).", + ) + if group_size is None or group_size not in BITBLAS_SUPPORTED_GROUP_SIZES: + return ( + False, + f"BitBLAS does not support group_size = {group_size}. 
" + f"Only group_sizes = {BITBLAS_SUPPORTED_GROUP_SIZES} " + "are supported.", + ) + + # Finally, check if bitblas is installed + try: + import bitblas + + if version.parse(bitblas.__version__) < version.parse(MINIMUM_BITBLAS_VERSION): + raise ImportError( + "bitblas version is wrong. Please " + f"install bitblas>={MINIMUM_BITBLAS_VERSION}" + ) + except ImportError: + return False, "BitBLAS is not installed." + + return True, None + + +def check_bitblas_supported( + quant_type: ScalarType, + group_size: int, + has_zp: bool = False, + device_capability: int | None = None, +) -> bool: + cond, _ = _check_bitblas_supported( + quant_type, group_size, has_zp, device_capability + ) + return cond + + +def verify_bitblas_supported( + quant_type: ScalarType, group_size: int, has_zp: bool = False +) -> None: + cond, err_msg = _check_bitblas_supported(quant_type, group_size, has_zp) + if not cond: + assert err_msg is not None + raise ValueError(err_msg) + + +def verify_bitblas_supports_shape( + output_size_per_partition: int, + input_size_per_partition: int, + input_size: int, + group_size: int, +) -> None: + # Validate output_size_per_partition + if output_size_per_partition % BITBLAS_MIN_WEIGHT_SIZE_N != 0: + raise ValueError( + f"Weight output_size_per_partition = " + f"{output_size_per_partition} is not divisible by " + f" min_thread_n = {BITBLAS_MIN_WEIGHT_SIZE_N}. " + "Consider reducing tensor_parallel_size or running " + "with --quantization gptq." + ) + + # Validate input_size_per_partition + if input_size_per_partition % BITBLAS_MIN_WEIGHT_SIZE_K != 0: + raise ValueError( + f"Weight input_size_per_partition = " + f"{input_size_per_partition} is not divisible " + f"by min_thread_k = {BITBLAS_MIN_WEIGHT_SIZE_K}. " + "Consider reducing tensor_parallel_size or running " + "with --quantization gptq." 
+ ) + + if group_size < input_size and input_size_per_partition % group_size != 0: + raise ValueError( + f"Weight input_size_per_partition = {input_size_per_partition}" + f" is not divisible by group_size = {group_size}." + "Consider reducing tensor_parallel_size or running " + "with --quantization gptq." + ) + + +def check_bitblas_supports_shape( + output_size_per_partition: int, + input_size_per_partition: int, + input_size: int, + group_size: int, +) -> tuple[bool, str | None]: + try: + verify_bitblas_supports_shape( + output_size_per_partition, input_size_per_partition, input_size, group_size + ) + except ValueError as e: + return False, e.__str__() + return True, None + + +def bitblas_is_k_full(act_order: bool, is_row_parallel: bool) -> bool: + return (not act_order) or (act_order and not is_row_parallel) + + +def bitblas_repeat_scales_on_all_ranks( + act_order: bool, group_size: int, is_row_parallel: bool +) -> bool: + # Need to repeat scales on every rank if act_ordering or + # channelwise and RowParallelLinear + is_channelwise = group_size == -1 + return act_order or (is_channelwise and is_row_parallel) + + +def bitblas_make_empty_g_idx(device: torch.device) -> torch.Tensor: + return torch.nn.Parameter( + torch.empty(0, dtype=torch.int, device=device), requires_grad=False + ) + + +def bitblas_make_empty_zp(device: torch.device) -> torch.Tensor: + return torch.nn.Parameter( + torch.empty(0, dtype=torch.int, device=device), requires_grad=False + ) + + +def bitblas_sort_g_idx(g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + g_idx_sort_indices = torch.argsort(g_idx).to(torch.int) + return g_idx[g_idx_sort_indices], g_idx_sort_indices + + +def unpack_gptq_qzeros(qzeros, bits, is_gptq_v2=False) -> torch.Tensor: + qzeros = qzeros.view(torch.int32) + elems_per_int32 = 32 // bits + unpacked_zeros = torch.zeros( + (qzeros.shape[0], qzeros.shape[1] * elems_per_int32), + dtype=torch.int8, + device=qzeros.device, + requires_grad=False, + ) + + for col in 
range(unpacked_zeros.shape[1]): + i = col % elems_per_int32 + unpacked_zeros[:, col] = (qzeros[:, col // elems_per_int32] >> (bits * i)) & 0xF + if not is_gptq_v2: + return unpacked_zeros + 1 + return unpacked_zeros + + +def unpack_gptq_qweight(qweight, bits): + qweight = qweight.view(torch.int8) + elems_per_int8 = 8 // bits + unpacked_weight = torch.zeros( + (qweight.shape[0], qweight.shape[1] * elems_per_int8), + dtype=torch.int8, + device=qweight.device, + requires_grad=False, + ) + for col in range(unpacked_weight.shape[1]): + i = col % elems_per_int8 + unpacked_weight[:, col] = qweight[:, col // elems_per_int8] >> (bits * i) + + return torch.bitwise_and(unpacked_weight, 2**bits - 1) diff --git a/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..0ea0225 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..be487f2 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 
4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..e9a50e1 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + 
"matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + 
"matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..119969d --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git 
a/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..119969d --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + 
"BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..3e8ebf3 --- /dev/null +++ 
b/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "1024": { + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..2bb5b45 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..6496a38 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + 
"num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..6e2aeee --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git 
a/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..b0f9442 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..b3bf9ea --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, 
+ "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..7e52ab6 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git 
a/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..7e52ab6 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + 
"BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..bee8d03 --- /dev/null +++ 
b/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..9da876d --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..3618053 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + 
"num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..0a1a252 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git 
a/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..46a982f --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + 
"num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..9696611 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git 
a/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..d6279a1 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..defaacb --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..ecc2fda --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 
128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff 
--git a/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..ecc2fda --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { 
+ "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..3bc0036 --- /dev/null +++ 
b/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..310dff4 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..035ec02 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + 
"num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..206c8a2 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git 
a/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..8b49f27 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 
+ }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..edc2353 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git 
a/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..f81e09e --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, 
+ "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..e073843 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..987c8f6 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + 
"matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..108af31 --- /dev/null +++ 
b/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 
128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..108af31 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + 
"num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + 
"1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..43b5bdb --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..bffa749 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..851bc9f --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, 
+ "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git 
a/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..f96f127 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..d1227c2 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..fe3e18c --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..f74a52f --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..8cab1b0 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..b3ed43a --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 
32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..abd1915 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + 
"BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + 
"BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..abd1915 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + 
"num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..e4d5b2d --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..137b9dd --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + 
"num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..77ba0d7 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..ae244f9 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + 
"num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + 
"4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..38cac46 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..8e6ebe2 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..b2931d6 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + 
}, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..459062e --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 
128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..1225d84 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..03e8235 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + 
"num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + 
"1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..bb61d83 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, 
+ "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 
4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..bb61d83 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + 
"matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..d44e384 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, 
+ "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..c559a69 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": 
{ + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff 
--git a/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..cf35403 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..8ec2005 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..65840aa --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..1a457b9 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + 
"matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + 
"matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..574cf49 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + 
"GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..574cf49 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 
32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..0a5d7bf --- /dev/null +++ 
b/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..4e120d6 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + 
"num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..eccb86a --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + 
"num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..125fe36 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..4415cc9 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 
4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..7bfaf93 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + 
"matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 
16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..7bfaf93 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 
32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..cb91a27 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..88af484 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..5c29874 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..dd06972 --- /dev/null +++ 
b/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, 
+ "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..125fe36 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..7c039b4 --- /dev/null +++ 
b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, 
+ "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..c2bd478 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, 
+ "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + 
"BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..c2bd478 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 
16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + 
"num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..4990268 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + 
"96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..18afdd9 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..7febe3d --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + 
"num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..ad630f0 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..51d10bb --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..1480e09 --- /dev/null +++ 
b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..10b940c --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..f5fdec3 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..6bd350c --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..5c604b9 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + 
"num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..75906ad --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + 
"kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + 
"matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..75906ad --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..94ce6e7 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 
64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..9540df4 --- /dev/null +++ 
b/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..b4d25ae --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + 
"matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + 
"matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..fdc6437 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, 
+ "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..fdc6437 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + 
}, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..9d7658b --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..cd3e078 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, 
+ "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..2b9f0d1 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..9d5a329 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 
32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..7f449db --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": 
{ + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..96f6c30 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..5676757 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 
16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..634c1bf --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + 
"num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + 
"num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..7eaa7d1 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + 
"matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..7eaa7d1 --- /dev/null +++ 
b/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + 
"BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..03dba5a --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..96e1594 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + 
"96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..d979c6b --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + 
"num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + 
}, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..5ffd367 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..be93dfe --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..19452df --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 
8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..3382554 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + 
}, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..3382554 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + 
"matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + 
"matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..9a5ff48 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + } +} diff --git 
a/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..6eb22de --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 
4 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..eabc423 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + 
"num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..84ef35e --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..e6d9107 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 
32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..c9d18c9 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git 
a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..c9d18c9 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + 
"BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..c746e70 --- /dev/null +++ 
b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..0b4746c --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..386928d --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + 
"num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..0894ff2 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, 
+ "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git 
a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..8ec2005 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 
+ }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..202acf2 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..86c68e0 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 
64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..983525f --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,18 @@ +{ + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..11a9bce --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..c298da8 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 
64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..56a766c --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 
128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..56a766c --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + 
"num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..386ee59 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..60df5e3 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..40c01c0 --- /dev/null +++ 
b/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..4f1747b --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + 
"num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..c6fd365 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, 
+ "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..53bbaca --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..cb993c8 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, 
+ "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..f250d3f --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + 
"matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 
16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..f250d3f --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 
8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + 
"kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..ffe67dc --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..2a17e16 --- /dev/null +++ 
b/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..160f12e --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..b259993 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..e5c4a1d --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git 
a/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..a71ab88 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..56d3e1f --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..bbd4df4 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git 
a/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..bbd4df4 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + 
"BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..eda96e7 --- /dev/null +++ 
b/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..bd0767b --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..2bf5eb2 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + 
"num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..29f7651 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git 
a/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..6db1385 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..9cdff13 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 
128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..7bb8e87 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git 
a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..7bb8e87 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + 
"BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..1a47cae --- /dev/null +++ 
b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..8dd5ae5 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..9c908e8 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 
8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..af1a384 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} diff --git 
a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..6d1a8b5 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + 
"num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..e77abaf --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..d381764 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..0cf6a47 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 
64, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..01327b2 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..6f9bd75 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, 
+ "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + 
"BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..f050b75 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + 
"num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + 
"num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..f050b75 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + 
"matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..12eea5f --- /dev/null +++ 
b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..9db9dae --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..f78e706 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 
8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..821ad0c --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git 
a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..365f8d0 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..f080ea5 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..daaf21c --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..0cf6a47 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 
64, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..e9bf044 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..c7122d3 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + 
"8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + 
"BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..4a3ccc0 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + 
"num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + 
"num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..4a3ccc0 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + 
"matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..1d3ce5c --- /dev/null +++ 
b/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..2583b5a --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..c37aced --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..baa64f8 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git 
a/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..d962889 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..3cea21b --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 
16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 
1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..24ef112 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git 
a/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..24ef112 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + 
"BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..3ab5796 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 
+1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..58cdd93 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..d6bef7f --- /dev/null +++ 
b/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..b72e037 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..b4b08ea --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..a8141f5 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..c911a8e --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json 
b/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..c911a8e --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + 
}, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..3cb7eaa --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2": { 
+ "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..8df6e4b --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + 
"64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..293adce --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..9d7edc3 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,26 @@ +{ + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..c9566d7 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 
1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + 
"matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..d86b349 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + 
"GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + 
"GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..d86b349 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git 
a/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..e471687 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + 
"BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..b4c3249 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json @@ 
-0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { 
+ "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000..b4c3249 --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + 
"matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + 
"matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/model_executor/layers/quantization/utils/configs/README.md b/model_executor/layers/quantization/utils/configs/README.md new file mode 100644 index 0000000..1110ced --- /dev/null +++ b/model_executor/layers/quantization/utils/configs/README.md @@ -0,0 +1,3 @@ +# Quantization Kernel Config + +Use scripts under `benchmarks/kernels/` to generate these config files. diff --git a/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py new file mode 100644 index 0000000..fdf3303 --- /dev/null +++ b/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Utility helpers for NVFP4 + FlashInfer fused-MoE path""" + +import torch + +import vllm.envs as envs +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEConfig, + FusedMoEQuantConfig, +) +from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( + FlashInferExperts, +) +from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 + create_flashinfer_prepare_finalize, +) +from vllm.platforms import current_platform +from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe + +__all__ = [ + 
"is_flashinfer_fp4_cutlass_moe_available", + "reorder_w1w3_to_w3w1", + "build_flashinfer_fp4_cutlass_moe_prepare_finalize", +] + + +def is_flashinfer_fp4_cutlass_moe_available() -> bool: + """Return `True` when FlashInfer CUTLASS NV-FP4 kernels can be used.""" + return ( + envs.VLLM_USE_FLASHINFER_MOE_FP4 + and has_flashinfer_cutlass_fused_moe() + and current_platform.is_cuda() + and current_platform.has_device_capability(100) + ) + + +def reorder_w1w3_to_w3w1( + weight: torch.Tensor, scale: torch.Tensor, dim: int = -2 +) -> tuple[torch.Tensor, torch.Tensor]: + """Re-order the concatenated `[w1, w3]` tensors to `[w3, w1]`""" + size = weight.size(dim) + assert size % 2 == 0, f"Expected even size in dim {dim}, got {size}" + half = size // 2 + + w1, w3 = weight.split(half, dim=dim) + s1, s3 = scale.split(half, dim=dim) + + return ( + torch.cat([w3, w1], dim=dim).contiguous(), + torch.cat([s3, s1], dim=dim).contiguous(), + ) + + +def build_flashinfer_fp4_cutlass_moe_prepare_finalize( + moe: FusedMoEConfig, +) -> mk.FusedMoEPrepareAndFinalize: + """Create a FlashInfer CUTLASS fused-MoE prepare finalize kernel""" + use_dp = moe.moe_parallel_config.dp_size > 1 + enable_alltoallv = moe.moe_parallel_config.all2all_backend == "flashinfer_all2allv" + return create_flashinfer_prepare_finalize( + use_dp=use_dp, use_nvfp4=True, enable_alltoallv=enable_alltoallv + ) + + +def select_nvfp4_gemm_impl( + moe: FusedMoEConfig, + moe_quant_config: FusedMoEQuantConfig, + allow_flashinfer: bool, +) -> mk.FusedMoEPermuteExpertsUnpermute: + """Return a GEMM *experts* implementation for NV-FP4 fused-MoE layers""" + + if allow_flashinfer: + return FlashInferExperts( + out_dtype=moe.in_dtype, + quant_config=moe_quant_config, + ep_rank=moe.moe_parallel_config.ep_rank, + ep_size=moe.moe_parallel_config.ep_size, + tp_rank=moe.moe_parallel_config.tp_rank, + tp_size=moe.moe_parallel_config.tp_size, + use_dp=moe.moe_parallel_config.dp_size > 1, + ) + + # native cutlass experts currently don't 
support DP; TP case won't call this + raise ValueError( + "CutlassExpertsFp4 doesn't support DP. Use flashinfer CUTLASS " + "Fused MoE backend instead (set VLLM_USE_FLASHINFER_MOE_FP4=1)" + ) diff --git a/model_executor/layers/quantization/utils/flashinfer_utils.py b/model_executor/layers/quantization/utils/flashinfer_utils.py new file mode 100644 index 0000000..f22e179 --- /dev/null +++ b/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -0,0 +1,298 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from enum import Enum + +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm import envs +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEConfig, + FusedMoEQuantConfig, +) +from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( + FlashInferExperts, +) +from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 + create_flashinfer_prepare_finalize, +) +from vllm.platforms import current_platform + +logger = init_logger(__name__) + + +class FlashinferMoeBackend(Enum): + TENSORRT_LLM = "TensorRT-LLM" + CUTLASS = "CUTLASS" + + +def calculate_tile_tokens_dim(num_tokens, top_k, num_experts): + from flashinfer import next_positive_power_of_2 + + # FlashInfer 0.2.10 has issues with larger tile sizes. Set to 8 for now. + # TODO: Revert this to dynamic calculation once a new version of FlashInfer + # with the necessary kernels is released. + tile_tokens_dim = 8 + + # A factor considering tokens are not perfectly balanced among experts. + imbalance_factor = 1.3 + # Calculate the number of tokens per expert + # assuming perfect distribution. + num_tokens_per_expert = (num_tokens * top_k) // num_experts + # Apply the imbalance factor. + num_tokens_per_expert = int(num_tokens_per_expert * imbalance_factor) + # And pad the number to the next power of 2. 
+ tile_tokens_dim = next_positive_power_of_2(num_tokens_per_expert) + # Cap to 8-max_tile_tokens_dim tokens per CTA tile + # as it's the range supported by the kernel. + tile_tokens_dim = min(max(tile_tokens_dim, 8), 64) + + return tile_tokens_dim + + +def swap_w13_to_w31(x: torch.Tensor) -> torch.Tensor: + return ( + x.reshape(-1, 2, x.shape[-2] // 2, x.shape[-1]).flip(dims=[1]).reshape(x.shape) + ) + + +def rotate_flashinfer_fp8_moe_weights( + gemm1_weights: torch.Tensor, gemm2_weights: torch.Tensor +): + from flashinfer import reorder_rows_for_gated_act_gemm, shuffle_matrix_a + + epilogue_tile_m = 128 + num_experts = gemm1_weights.shape[0] + hidden_size = gemm1_weights.shape[-1] + intermediate_size = gemm1_weights.shape[1] // 2 + + # Reorder rows of W1 for fused gated activation + gemm1_weights_fp8_interleaved = [] + for i in range(num_experts): + gemm1_weights_fp8_interleaved.append( + reorder_rows_for_gated_act_gemm(gemm1_weights[i]) + ) + + # Stack weights and scales for all experts + gemm1_weights_fp8_interleaved = torch.stack(gemm1_weights_fp8_interleaved).reshape( + num_experts, 2 * intermediate_size, hidden_size + ) + + # Shuffle weights and scaling factors for transposed mma output + gemm1_weights_fp8_shuffled = [] + gemm2_weights_fp8_shuffled = [] + for i in range(num_experts): + gemm1_weights_fp8_shuffled.append( + shuffle_matrix_a( + gemm1_weights_fp8_interleaved[i].view(torch.uint8), epilogue_tile_m + ) + ) + + gemm2_weights_fp8_shuffled.append( + shuffle_matrix_a(gemm2_weights[i].view(torch.uint8), epilogue_tile_m) + ) + + # Stack weights for all experts + gemm1_weights.data = torch.stack(gemm1_weights_fp8_shuffled).view( + torch.float8_e4m3fn + ) + gemm2_weights.data = torch.stack(gemm2_weights_fp8_shuffled).view( + torch.float8_e4m3fn + ) + + +def apply_flashinfer_per_tensor_scale_fp8( + layer: torch.nn.Module, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + routing_bias: torch.Tensor | None, + top_k: int, + num_expert_group: int | 
None, + topk_group: int | None, + global_num_experts: int, + apply_router_weight_on_input: bool, +) -> torch.Tensor: + from flashinfer.fused_moe import RoutingMethodType + + import vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe # noqa: E501, F401 + + assert layer.output1_scales_scalar is not None, ( + "Expected output1_scales_scalar to be initialized" + ) + assert layer.output1_scales_scalar is not None, ( + "Expected output1_scales_gate_scalar to be initialized" + ) + assert layer.output1_scales_scalar is not None, ( + "Expected output2_scales_scalar to be initialized" + ) + + from vllm.model_executor.models.llama4 import Llama4MoE + + assert layer.custom_routing_function == Llama4MoE.custom_routing_function, ( + "FusedMoE flashinfer kernels are only supported for Llama4" + ) + return torch.ops.vllm.flashinfer_fused_moe_per_tensor_scale_fp8( + routing_logits=router_logits, + routing_bias=routing_bias, + hidden_states=hidden_states, + input_scale=layer.w13_input_scale, + gemm1_weights=layer.w13_weight, + gemm2_weights=layer.w2_weight, + output1_scales_scalar=layer.output1_scales_scalar, + output1_scales_gate_scalar=layer.output1_scales_gate_scalar, + output2_scales_scalar=layer.output2_scales_scalar, + num_experts=global_num_experts, + top_k=top_k, + num_expert_group=num_expert_group, + topk_group=topk_group, + intermediate_size=layer.intermediate_size_per_partition, + local_expert_offset=layer.ep_rank * layer.local_num_experts, + local_num_experts=layer.local_num_experts, + use_routing_scales_on_input=apply_router_weight_on_input, + routing_method_type=RoutingMethodType.Llama4, + ) + + +def get_moe_scaling_factors( + input_scale: torch.Tensor, + gemm1_weights_scale: torch.Tensor, + activation_scale: torch.Tensor, + gemm2_weights_scale: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + output1_scales_scalar = gemm1_weights_scale * input_scale * (1.0 / activation_scale) + output1_scales_gate_scalar = gemm1_weights_scale * input_scale 
+ output2_scales_scalar = activation_scale * gemm2_weights_scale + + return output1_scales_scalar, output1_scales_gate_scalar, output2_scales_scalar + + +def register_moe_scaling_factors(layer: torch.nn.Module) -> None: + output1_scales, output1_gate_scales, output2_scales = get_moe_scaling_factors( + layer.w13_input_scale, + layer.w13_weight_scale, + layer.w2_input_scale, + layer.w2_weight_scale, + ) + layer.register_parameter( + "output1_scales_scalar", torch.nn.Parameter(output1_scales, requires_grad=False) + ) + layer.register_parameter( + "output1_scales_gate_scalar", + torch.nn.Parameter(output1_gate_scales, requires_grad=False), + ) + layer.register_parameter( + "output2_scales_scalar", torch.nn.Parameter(output2_scales, requires_grad=False) + ) + layer.register_parameter( + "w2_input_scale_inv", + torch.nn.Parameter(1.0 / layer.w2_input_scale, requires_grad=False), + ) + + +def build_flashinfer_fp8_cutlass_moe_prepare_finalize( + moe: FusedMoEConfig | None, use_deepseek_fp8_block_scale: bool = False +) -> mk.FusedMoEPrepareAndFinalize: + """Create a FlashInfer CUTLASS fused-MoE prepare finalize kernel""" + use_dp = moe.moe_parallel_config.dp_size > 1 if moe is not None else False + # Propagate block-scale flag so prepare/finalize can skip act quantization + # and inform the kernel to consume per-block weight scales. 
+ return create_flashinfer_prepare_finalize( + use_dp, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale + ) + + +def select_cutlass_fp8_gemm_impl( + moe: FusedMoEConfig | None, + quant_config: FusedMoEQuantConfig, + out_dtype: torch.dtype | None = None, + use_deepseek_fp8_block_scale: bool = False, +) -> mk.FusedMoEPermuteExpertsUnpermute: + """Return a GEMM *experts* implementation for fused-MoE layers""" + + if moe is not None: + return FlashInferExperts( + out_dtype=moe.in_dtype, + quant_config=quant_config, + ep_rank=moe.moe_parallel_config.ep_rank, + ep_size=moe.moe_parallel_config.ep_size, + tp_rank=moe.moe_parallel_config.tp_rank, + tp_size=moe.moe_parallel_config.tp_size, + use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale, + ) + + assert out_dtype is not None, "If moe config is None, out_dtype must be passed" + return FlashInferExperts( + out_dtype=out_dtype, + quant_config=quant_config, + use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale, + ) + + +def flashinfer_cutlass_moe_fp8( + hidden_states: torch.Tensor, + layer: torch.nn.Module, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + inplace: bool = False, + activation: str = "silu", + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + use_deepseek_fp8_block_scale: bool = False, + moe: FusedMoEConfig | None = None, +) -> torch.Tensor: + quant_config = layer.quant_method.get_fused_moe_quant_config(layer) + assert quant_config is not None + + # Construct modular kernel with block-scale support when requested. 
+ fused_experts = mk.FusedMoEModularKernel( + build_flashinfer_fp8_cutlass_moe_prepare_finalize( + moe=moe, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale + ), + select_cutlass_fp8_gemm_impl( + moe=moe, + quant_config=quant_config, + out_dtype=hidden_states.dtype, + use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale, + ), + ) + + return fused_experts( + hidden_states, + layer.w13_weight, + layer.w2_weight, + topk_weights, + topk_ids, + inplace=inplace, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input, + ) + + +def get_flashinfer_moe_backend() -> FlashinferMoeBackend: + flashinfer_moe_backend = envs.VLLM_FLASHINFER_MOE_BACKEND + # Prefer CUTLASS on SM90 to cover both SM90/SM100 generations + if flashinfer_moe_backend == "throughput" or current_platform.is_device_capability( + 90 + ): + return FlashinferMoeBackend.CUTLASS + elif flashinfer_moe_backend == "latency": + return FlashinferMoeBackend.TENSORRT_LLM + + allowed_backends = ["throughput", "latency"] + raise ValueError( + f"Unknown flashinfer moe backend: {flashinfer_moe_backend}" + f" expected one of {allowed_backends}" + ) + + +def is_flashinfer_supporting_global_sf(backend: FlashinferMoeBackend | None) -> bool: + # TODO(shuw@nvidia): Update when new backends are added. 
+ backends_supporting_global_sf = ( + FlashinferMoeBackend.CUTLASS, + FlashinferMoeBackend.TENSORRT_LLM, + ) + return backend in backends_supporting_global_sf diff --git a/model_executor/layers/quantization/utils/fp8_utils.py b/model_executor/layers/quantization/utils/fp8_utils.py new file mode 100644 index 0000000..1b13438 --- /dev/null +++ b/model_executor/layers/quantization/utils/fp8_utils.py @@ -0,0 +1,1206 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Adapted from https://github.com/sgl-project/sglang/pull/2575 +import functools +import json +import os +from collections.abc import Callable, Sequence +from typing import Any + +import torch + +import vllm.envs as envs +from vllm import _custom_ops as ops +from vllm._aiter_ops import rocm_aiter_ops +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + GroupShape, + group_broadcast, +) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + CUTLASS_BLOCK_FP8_SUPPORTED, +) +from vllm.model_executor.parameter import ( + BlockQuantScaleParameter, + ChannelQuantScaleParameter, + PerTensorScaleParameter, +) +from vllm.platforms import current_platform +from vllm.triton_utils import tl, triton +from vllm.utils.deep_gemm import ( + fp8_gemm_nt, + is_deep_gemm_e8m0_used, + is_deep_gemm_supported, + should_use_deepgemm_for_fp8_linear, + transform_sf_into_required_layout, +) +from vllm.utils.torch_utils import direct_register_custom_op + +logger = init_logger(__name__) + + +def is_fp8(x: torch.dtype | torch.Tensor) -> bool: + if isinstance(x, torch.Tensor): + x = x.dtype + try: + return x == torch.float8_e4m3fn or x == torch.float8_e4m3fnuz + except: + return False + + +# We need to pass in the is_hopper flag as argument because the function +# current_platform.is_device_capability() is not supported 
by Torch compiler. +def cutlass_scaled_mm( + A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + block_size: list[int], + output_dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + return ops.cutlass_scaled_mm( + A, + B.T, + out_dtype=output_dtype, + scale_a=As, + scale_b=Bs.T, + ) + + +# TODO we should be able to change the type of block_size to GroupShape +# after we resolve GroupShape compilation issue +# https://github.com/vllm-project/vllm/issues/25270 +def _w8a8_triton_block_scaled_mm_func( + qx: torch.Tensor, + weight: torch.Tensor, + x_scale: torch.Tensor, + weight_scale: torch.Tensor, + block_size: list[int], + output_dtype: torch.dtype, +) -> torch.Tensor: + return w8a8_triton_block_scaled_mm( + qx, weight, x_scale, weight_scale, block_size, output_dtype + ) + + +def _w8a8_triton_block_scaled_mm_fake( + qx: torch.Tensor, + weight: torch.Tensor, + x_scale: torch.Tensor, + weight_scale: torch.Tensor, + block_size: list[int], + output_dtype: torch.dtype, +) -> torch.Tensor: + return torch.empty( + (qx.size(0), weight.size(0)), dtype=output_dtype, device=qx.device + ) + + +direct_register_custom_op( + "w8a8_triton_block_scaled_mm_func", + _w8a8_triton_block_scaled_mm_func, + fake_impl=_w8a8_triton_block_scaled_mm_fake, +) + + +def _padded_cutlass( + qx: torch.Tensor, + weight: torch.Tensor, + x_scale: torch.Tensor, + weight_scale: torch.Tensor, + block_size: list[int], + output_dtype: torch.dtype, +) -> torch.Tensor: + pad_multiple = 4 + dim = qx.shape[0] + padded = ( + dim if dim % pad_multiple == 0 else dim + pad_multiple - (dim % pad_multiple) + ) + + has_pad = padded > dim + + if has_pad: + padded_shape = [padded, *qx.shape[1:]] + padded_qx = torch.zeros(padded_shape, device=qx.device, dtype=qx.dtype) + padded_qx[0 : qx.shape[0], ...].copy_(qx) + + padded_x_scale_shape = [*x_scale.shape[1:], padded] + padded_x_scale = torch.ones( + padded_x_scale_shape, device=x_scale.device, dtype=x_scale.dtype + ).permute(-1, -2) + 
# TODO fix ROCm->Triton custom path:
#  https://github.com/vllm-project/vllm/issues/14397
class W8A8BlockFp8LinearOp:
    """Run a block-quantized FP8 (W8A8) linear layer.

    One block-scaled GEMM backend is selected once at construction time:
    CUTLASS if requested, otherwise AITER if requested, otherwise the Triton
    kernel. DeepGEMM is considered on every `apply` call instead, because
    the decision depends on the weight, which is not known at init time.
    """

    def __init__(
        self,
        weight_group_shape: GroupShape,
        act_quant_group_shape: GroupShape,
        cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED,
        use_aiter_and_is_supported: bool = False,
    ):
        """Record the group shapes and pick the GEMM / quantization ops.

        Args:
            weight_group_shape: Block shape of the weight scales.
            act_quant_group_shape: Group shape used to quantize activations.
            cutlass_block_fp8_supported: Select the CUTLASS backend.
            use_aiter_and_is_supported: Select the AITER backend (only
                consulted when CUTLASS is not selected).
        """
        self.weight_group_shape = weight_group_shape
        self.act_quant_group_shape = act_quant_group_shape
        self.is_deep_gemm_supported = is_deep_gemm_supported()
        self.is_hopper = current_platform.is_device_capability(90)
        self.use_deep_gemm_e8m0 = is_deep_gemm_e8m0_used()

        # Get the correct blockscale mul and input quant operations.
        # We can't use _dispatch_w8a8_blockscale_op to figure out if we want
        # to use deepgemm because we don't know the shape of weights (and
        # whether deepgemm supports it) at the init time.
        blockscale_op, input_quant_op = self._dispatch_w8a8_blockscale_op(
            cutlass_block_fp8_supported, use_aiter_and_is_supported
        )
        self.w8a8_blockscale_op = blockscale_op
        self.input_quant_op = input_quant_op

        # The DeepGEMM path has its own activation quantizer; it only
        # exists when DeepGEMM is reported as supported.
        if self.is_deep_gemm_supported:
            self.deepgemm_input_quant_op = self._make_input_quant(
                column_major_scales=True, use_ue8m0=self.use_deep_gemm_e8m0
            )
        else:
            self.deepgemm_input_quant_op = None

    def _make_input_quant(
        self, column_major_scales: bool, use_ue8m0: bool
    ) -> QuantFP8:
        """Build a `QuantFP8` op for this layer's activation group shape."""
        # NOTE(review): the leading positional `False` is QuantFP8's first
        # argument; presumably it selects dynamic (non-static) quantization.
        # Confirm against the QuantFP8 signature.
        return QuantFP8(
            False,
            self.act_quant_group_shape,
            column_major_scales=column_major_scales,
            use_ue8m0=use_ue8m0,
        )

    def apply(
        self,
        input: torch.Tensor,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
        input_scale: torch.Tensor | None = None,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Compute the linear layer output for `input`.

        The input is flattened to 2-D, multiplied through DeepGEMM or the
        backend chosen at init, optionally biased, cast back to the input
        dtype and reshaped to `[*input.shape[:-1], weight.shape[0]]`.
        `input_scale` must be None.
        """
        assert input_scale is None
        # View input as 2D matrix for fp8 methods
        flat_input = input.view(-1, input.shape[-1])
        result_shape = [*input.shape[:-1], weight.shape[0]]
        result_dtype = input.dtype

        use_deepgemm = should_use_deepgemm_for_fp8_linear(
            result_dtype, weight, self.is_deep_gemm_supported
        )
        if use_deepgemm:
            result = self._run_deepgemm(flat_input, weight, weight_scale)
        else:
            result = self.w8a8_blockscale_op(
                flat_input, weight, weight_scale, input_scale
            )

        if bias is not None:
            result = result + bias
        return result.to(dtype=result_dtype).view(*result_shape)

    def _run_deepgemm(
        self,
        input_2d: torch.Tensor,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
    ) -> torch.Tensor:
        """Quantize `input_2d` and run the `fp8_gemm_nt_op` custom op.

        The result buffer is always allocated as bfloat16.
        """
        assert self.deepgemm_input_quant_op is not None
        q_input, input_scale = self.deepgemm_input_quant_op(input_2d)
        result = torch.empty(
            (q_input.shape[0], weight.shape[0]),
            dtype=torch.bfloat16,
            device=q_input.device,
        )
        # The op writes into `result` in place and returns nothing.
        torch.ops.vllm.fp8_gemm_nt_op(
            q_input, input_scale, weight, weight_scale, result, self.use_deep_gemm_e8m0
        )
        return result

    def _run_cutlass(
        self,
        input_2d: torch.Tensor,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
        input_scale: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Quantize `input_2d` and run the CUTLASS block-scaled matmul.

        When `self.is_hopper` is set, the padded custom op is used;
        otherwise `cutlass_scaled_mm` is called directly.
        """
        assert input_scale is None
        assert self.input_quant_op is not None
        q_input, input_scale = self.input_quant_op(input_2d)
        if not self.is_hopper:
            return cutlass_scaled_mm(
                q_input,
                weight,
                input_scale,
                weight_scale,
                list(self.weight_group_shape),
                input_2d.dtype,
            )
        return torch.ops.vllm.padded_cutlass(
            q_input,
            weight,
            input_scale,
            weight_scale,
            list(self.weight_group_shape),
            input_2d.dtype,
        )

    def _run_aiter(
        self,
        input_2d: torch.Tensor,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
        input_scale: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Run the AITER block-scaled matmul.

        Requires an activation group shape of `GroupShape(1, 128)`. If
        `input_scale` is given, `input_2d` is used as already quantized;
        otherwise it is quantized here with the quantizer matching the
        selected GEMM kernel.
        """
        assert self.act_quant_group_shape == GroupShape(1, 128)

        weight_n, weight_k = weight.shape

        use_triton = (
            not current_platform.is_fp8_fnuz()
            and rocm_aiter_ops.is_triton_gemm_w8a8_tuned(weight_n, weight_k)
        )
        gemm_op = (
            rocm_aiter_ops.triton_gemm_a8w8_blockscale
            if use_triton
            else rocm_aiter_ops.gemm_a8w8_blockscale
        )

        if input_scale is not None:
            q_input = input_2d
        elif use_triton:
            # MI350 case uses triton kernel
            q_input, input_scale = per_token_group_quant_fp8(
                input_2d,
                self.act_quant_group_shape.col,
                column_major_scales=False,
                use_ue8m0=False,
            )
        else:
            # MI300 uses tuned AITER ASM/C++ kernel
            q_input, input_scale = rocm_aiter_ops.group_fp8_quant(input_2d)

        return gemm_op(
            q_input,
            weight,
            input_scale,
            weight_scale,
            list(self.weight_group_shape),
            output_dtype=input_2d.dtype,
        )

    def _run_triton(
        self,
        input_2d: torch.Tensor,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
        input_scale: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Quantize `input_2d` and run the Triton block-scaled matmul op."""
        assert input_scale is None
        assert self.input_quant_op is not None
        q_input, input_scale = self.input_quant_op(input_2d)
        block_shape = list(self.weight_group_shape)
        return torch.ops.vllm.w8a8_triton_block_scaled_mm_func(
            q_input,
            weight,
            input_scale,
            weight_scale,
            block_shape,
            input_2d.dtype,
        )

    def _dispatch_w8a8_blockscale_op(
        self,
        use_cutlass: bool,
        use_aiter_and_is_supported: bool,
    ) -> tuple[
        Callable[
            [
                torch.Tensor,
                torch.Tensor,
                torch.Tensor,
                torch.Tensor | None,
            ],
            torch.Tensor,
        ],
        QuantFP8 | None,
    ]:
        """Choose the GEMM runner and its matching activation quantizer.

        Returns:
            `(runner, quant_op)`. CUTLASS gets column-major scales, Triton
            gets row-major scales, and AITER gets no quantizer because
            `_run_aiter` quantizes internally.
        """
        if use_cutlass:
            return self._run_cutlass, self._make_input_quant(
                column_major_scales=True, use_ue8m0=False
            )
        if use_aiter_and_is_supported:
            return self._run_aiter, None
        return self._run_triton, self._make_input_quant(
            column_major_scales=False, use_ue8m0=False
        )
@triton.jit
def _per_token_group_quant_fp8(
    # Pointers to the input (y), the quantized output (y_q) and the
    # per-group scale output (y_s)
    y_ptr,
    y_q_ptr,
    y_s_ptr,
    # Number of elements in one quantization group
    group_size,
    # Num columns of y
    y_num_columns,
    # Element offset between the starts of two consecutive rows of y
    y_row_stride,
    # Lower bound applied to the group abs-max, to avoid dividing by zero
    eps,
    # Information for float8
    fp8_min,
    fp8_max,
    # When set, the scale is rounded up to a power of two
    use_ue8m0: tl.constexpr,
    # Meta-parameters
    BLOCK: tl.constexpr,
):
    """A Triton-accelerated function to perform per-token-group
    quantization on a tensor.
    This function converts the tensor values into float8 values.

    Each program instance handles exactly one group: it loads up to `BLOCK`
    elements (masked to `group_size`), computes the group scale
    `max(|y|, eps) / fp8_max`, and stores the clamped, scaled values and
    the scale. The quantized output and the scales are written densely,
    indexed by the flat group id.
    """
    groups_per_row = y_num_columns // group_size

    # Map the program id to the row of X and Y it should compute.
    g_id = tl.program_id(0)
    row = g_id // groups_per_row
    row_g_id = g_id % groups_per_row

    # Ensure offset calculations use int64 to prevent overflow
    y_ptr_offset = (row.to(tl.int64) * y_row_stride) + (
        row_g_id.to(tl.int64) * group_size
    )
    y_ptr += y_ptr_offset

    # The output is addressed by flat group id, not by the input row stride.
    y_q_ptr_offset = g_id.to(tl.int64) * group_size
    y_q_ptr += y_q_ptr_offset
    # One scale per group, stored at the flat group index.
    y_s_ptr += g_id

    cols = tl.arange(0, BLOCK)  # N <= BLOCK
    # Lanes at or beyond group_size are masked out of the load and store.
    mask = cols < group_size

    y = tl.load(y_ptr + cols, mask=mask, other=0.0).to(tl.float32)
    # Quant
    _absmax = tl.maximum(tl.max(tl.abs(y)), eps)
    scale_raw = _absmax / fp8_max
    # With use_ue8m0 the scale becomes 2 ** ceil(log2(scale_raw)).
    y_s = tl.math.exp2(tl.ceil(tl.log2(scale_raw))) if use_ue8m0 else scale_raw
    y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)

    tl.store(y_q_ptr + cols, y_q, mask=mask)
    tl.store(y_s_ptr, y_s)
def per_token_group_quant_fp8(
    x: torch.Tensor,
    group_size: int,
    eps: float = 1e-10,
    dtype: torch.dtype | None = None,
    column_major_scales: bool = False,
    out_q: torch.Tensor | None = None,
    use_ue8m0: bool | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Function to perform per-token-group quantization on an input tensor `x`.
    It converts the tensor values into signed float8 values and returns the
    quantized tensor along with the scaling factor used for quantization.
    Args:
        x: The input tensor with ndim >= 2. Its last dimension must have
            stride 1 and be divisible by `group_size`.
        group_size: The group size used for quantization.
        eps: The minimum to avoid dividing zero.
        dtype: The dtype of output tensor. Defaults to
            `current_platform.fp8_dtype()` when None.
        column_major_scales: Outputs scales in column major.
        out_q: Optional output tensor. If not provided, function will create.
            When provided it must have the same shape as `x`.
        use_ue8m0: Round scales up to powers of two. Defaults to
            `is_deep_gemm_e8m0_used()` when None.
    Returns:
        tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the
        scaling factor.
    """
    if use_ue8m0 is None:
        use_ue8m0 = is_deep_gemm_e8m0_used()
    dtype = current_platform.fp8_dtype() if dtype is None else dtype
    assert x.shape[-1] % group_size == 0, (
        f"the last dimension of `x` {x.shape[-1]} must be divisible "
        f"by `group_size` {group_size}"
    )
    assert x.stride(-1) == 1, "`x` groups must be contiguous"

    finfo = torch.finfo(dtype)
    fp8_min = finfo.min
    fp8_max = finfo.max

    assert out_q is None or out_q.shape == x.shape
    x_q = out_q
    if x_q is None:
        x_q = torch.empty_like(x, device=x.device, dtype=dtype)

    # Allocate the scale tensor in either row- or column-major format.
    if column_major_scales:
        # NOTE(review): `permute(-1, -2)` names exactly two axes, so this
        # branch appears to require a 2-D `x` -- confirm with callers.
        shape = (x.shape[-1] // group_size,) + x.shape[:-1]
        x_s = torch.empty(shape, device=x.device, dtype=torch.float32).permute(-1, -2)
    else:
        shape = x.shape[:-1] + (x.shape[-1] // group_size,)
        x_s = torch.empty(shape, device=x.device, dtype=torch.float32)

    # prefer CUDA kernel if available
    # TODO(bnell): this causes some fp8 moe test to fail.
    if current_platform.is_cuda() and x.is_contiguous():
        torch.ops._C.per_token_group_fp8_quant(
            x, x_q, x_s, group_size, eps, fp8_min, fp8_max, use_ue8m0
        )
        return x_q, x_s

    # TRITON FALLBACK
    # One program per quantization group.
    M = x.numel() // group_size
    N = group_size
    BLOCK = triton.next_power_of_2(N)
    # heuristics for number of warps
    num_warps = min(max(BLOCK // 256, 1), 8)
    num_stages = 1

    # The kernels address `x` as a stack of rows of length `num_columns`
    # separated by `row_stride` elements. Use the trailing dimension and the
    # stride of the second-to-last one: `x.shape[1]` / `x.stride(0)` give the
    # same values for 2-D input but the wrong ones once ndim > 2.
    num_columns = x.shape[-1]
    row_stride = x.stride(-2)

    if column_major_scales:
        _per_token_group_quant_fp8_colmajor[(M,)](
            x,
            x_q,
            x_s,
            group_size,
            num_columns,
            row_stride,
            x_s.stride(1),
            eps,
            fp8_min=fp8_min,
            fp8_max=fp8_max,
            use_ue8m0=use_ue8m0,
            BLOCK=BLOCK,
            num_warps=num_warps,
            num_stages=num_stages,
        )
    else:
        _per_token_group_quant_fp8[(M,)](
            x,
            x_q,
            x_s,
            group_size,
            num_columns,
            row_stride,
            eps,
            fp8_min=fp8_min,
            fp8_max=fp8_max,
            use_ue8m0=use_ue8m0,
            BLOCK=BLOCK,
            num_warps=num_warps,
            num_stages=num_stages,
        )

    return x_q, x_s
@functools.lru_cache
def get_w8a8_block_fp8_configs(
    N: int, K: int, block_n: int, block_k: int
) -> dict[int, Any] | None:
    """Load tuned configurations for the w8a8 block fp8 kernel, if any.

    Looks for a JSON file in the `configs` directory next to this module,
    named after the problem shape, the device name and the block shape.

    Returns:
        A dict mapping batch sizes (converted to int) to kernel
        configurations, or None when no file exists for this combination.
        Callers should pick the entry whose batch size is closest to theirs.
        Results are memoised per argument tuple.
    """
    device_name = current_platform.get_device_name().replace(" ", "_")
    json_file_name = f"N={N},K={K},device_name={device_name},dtype=fp8_w8a8,block_shape=[{block_n},{block_k}].json"  # noqa: E501

    module_dir = os.path.dirname(os.path.realpath(__file__))
    config_file_path = os.path.join(module_dir, "configs", json_file_name)

    # No tuned file: tell the user and let the caller use its defaults.
    if not os.path.exists(config_file_path):
        logger.warning(
            "Using default W8A8 Block FP8 kernel config. Performance might "
            "be sub-optimal! Config file not found at %s",
            config_file_path,
        )
        return None

    with open(config_file_path) as f:
        logger.info(
            "Using configuration from %s for W8A8 Block FP8 kernel.",
            config_file_path,
        )
        raw_configs = json.load(f)

    # JSON object keys are strings; the batch-size grid is keyed by int.
    return {int(batch_size): cfg for batch_size, cfg in raw_configs.items()}
+ block_size: The block size for per-block quantization. It should + be 2-dim, e.g., [128, 128]. + output_dytpe: The dtype of the returned tensor. + Returns: + torch.Tensor: The result of matmul. + """ + assert len(block_size) == 2 + block_n, block_k = block_size[0], block_size[1] + + assert A.shape[-1] == B.shape[-1] + assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous() + assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1] + M = A.numel() // A.shape[-1] + + assert B.ndim == 2 and Bs.ndim == 2 + N, K = B.shape + assert triton.cdiv(N, block_n) == Bs.shape[0] + assert triton.cdiv(K, block_k) == Bs.shape[1] + + C_shape = A.shape[:-1] + (N,) + C = A.new_empty(C_shape, dtype=output_dtype) + + configs = get_w8a8_block_fp8_configs(N, K, block_size[0], block_size[1]) + if configs: + # Get the optimal config if there is one + config = configs[min(configs.keys(), key=lambda x: abs(x - M))] + else: + # Default config + # Block-wise quant: BLOCK_SIZE_N must be divisible by block_size[0] + # BLOCK_SIZE_K must be divisible by block_size[1] + config = { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": block_size[0], + "BLOCK_SIZE_K": block_size[1], + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2, + } + + def grid(META): + return ( + triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]), + ) + + _w8a8_triton_block_scaled_mm[grid]( + A, + B, + C, + As, + Bs, + M, + N, + K, + block_n, + block_k, + A.stride(-2), + A.stride(-1), + B.stride(1), + B.stride(0), + C.stride(-2), + C.stride(-1), + As.stride(-2), + As.stride(-1), + Bs.stride(1), + Bs.stride(0), + **config, + ) + + return C + + +def requant_weight_ue8m0_inplace( + weight: torch.Tensor, + weight_scale: torch.Tensor, + block_size: Sequence[int] = (128, 128), +) -> None: + """Re-quantise *weight* so that its per-block scaling factors are in the + UE8M0 (power-of-two) format expected by the new DeepGEMM kernels inplace. 
+ + Args: + weight: Block-quantised weight tensor stored in `torch.float8_e4m3fn`. + Expected shape `(..., M, K)`. + weight_scale: Corresponding per-block scale tensor (`torch.float32`) + with shape `(..., M // block_size[0], K // block_size[1])`. + block_size: 2-element iterable `[block_m, block_k]` describing the + block quantisation granularity. + """ + if weight.numel() == 0: + return + + if weight.dtype != torch.float8_e4m3fn: + raise ValueError( + f"Expected *weight* to be torch.float8_e4m3fn, got {weight.dtype} instead." + ) + + from vllm.utils.deep_gemm import per_block_cast_to_fp8 + + block_m, block_k = int(block_size[0]), int(block_size[1]) + + # Flatten leading dimensions so we can iterate over the last two dims. + leading_shape = weight.shape[:-2] + if len(leading_shape) == 0: + w_view = weight.unsqueeze(0) + s_view = weight_scale.unsqueeze(0) + else: + w_view = weight.reshape(-1, weight.shape[-2], weight.shape[-1]) + s_view = weight_scale.reshape(-1, *weight_scale.shape[-2:]) + + num_mats = w_view.size(0) + for idx in range(num_mats): + w_q = w_view[idx] + s_old = s_view[idx] + + # De-quantise with the *old* scaling factors (float32). + m_cur, k_cur = w_q.shape + s_float = s_old.to(torch.float32) + # Expand scales along rows and cols by block size, then crop. + s_exp_r = torch.repeat_interleave(s_float, block_m, dim=0) + s_exp = torch.repeat_interleave(s_exp_r, block_k, dim=1) + s_exp = s_exp[:m_cur, :k_cur] + w_dq = w_q.to(torch.float32) * s_exp + # Re-quantise using power-of-two scaling (UE8M0). + w_requant, s_requant = per_block_cast_to_fp8( + w_dq, [block_m, block_k], use_ue8m0=True + ) + + # Write back the results in-place. 
+ w_q.copy_(w_requant) + s_old.copy_(s_requant) + + +def deepgemm_post_process_fp8_weight_block( + wq: torch.Tensor, ws: torch.Tensor, quant_block_shape: tuple[int], use_e8m0: bool +) -> tuple[torch.Tensor, torch.Tensor]: + assert wq.dtype == torch.float8_e4m3fn, ( + "Expected quantized tensor dtype " + f"to be torch.float8_e4m3fn, got {wq.dtype} instead." + ) + assert ws.dtype == torch.float32, ( + f"Expected tensor scales dtype to be torch.float32, got {ws.dtype} instead" + ) + + if use_e8m0: + requant_weight_ue8m0_inplace(wq, ws, block_size=quant_block_shape) + + original_ndim = wq.ndim + if wq.ndim == 2: + assert ws.ndim == 2 + wq = wq.unsqueeze(0) + ws = ws.unsqueeze(0) + + # From https://github.com/deepseek-ai/DeepGEMM/blob/c9f8b34dcdacc20aa746b786f983492c51072870/csrc/utils/layout.hpp#L46 + recipe = (1, 128, 128) + + # Ref : https://github.com/deepseek-ai/DeepGEMM/blob/c9f8b34dcdacc20aa746b786f983492c51072870/csrc/apis/gemm.hpp + # DeepGemm uses the `transform_sf_into_required_layout` function to + # represent scales in the correct format. + dg_ws = transform_sf_into_required_layout( + sf=ws, + mn=wq.size(1), + k=wq.size(2), + recipe=recipe, + num_groups=wq.size(0), + # is the scale factors for A in (Refers to the argument A in A @ B). + # Weights are B. + is_sfa=False, + ) + + if original_ndim == 2: + wq = wq.squeeze(0) + dg_ws = dg_ws.squeeze(0) + + return wq, dg_ws + + +def _maybe_pad_fp8_weight(weight: torch.Tensor) -> torch.Tensor: + """Pad the weight tensor. 
def validate_fp8_block_shape(
    layer: torch.nn.Module,
    input_size: int,
    output_size: int,
    input_size_per_partition: int,
    output_partition_sizes: list[int],
    block_size: list[int],
) -> None:
    """Validate block quantization shapes for tensor parallelism.

    Args:
        layer: Layer being validated. Its `tp_size` attribute is used when
            present; otherwise the tensor-parallel world size is queried.
        input_size: Full (unsharded) input size.
        output_size: Full (unsharded) output size.
        input_size_per_partition: Input size held by this rank.
        output_partition_sizes: Output sizes of the (possibly merged)
            sub-matrices held by this rank.
        block_size: `[block_n, block_k]` weight quantization block shape.

    Raises:
        ValueError: If a sharded input dimension is not divisible by
            `block_k`, or a checked output partition is not divisible by
            `block_n`.
    """
    # Only ask the distributed runtime when the layer does not carry its own
    # tp_size. Passing the call as a getattr default would evaluate it
    # eagerly, even when the attribute exists.
    tp_size = getattr(layer, "tp_size", None)
    if tp_size is None:
        from vllm.distributed import get_tensor_model_parallel_world_size

        tp_size = get_tensor_model_parallel_world_size()
    block_n, block_k = block_size[0], block_size[1]

    # Required by row parallel
    if (
        tp_size > 1
        and input_size // input_size_per_partition == tp_size
        and input_size_per_partition % block_k != 0
    ):
        raise ValueError(
            f"Weight input_size_per_partition = {input_size_per_partition} "
            f"is not divisible by weight quantization block_k = {block_k}."
        )

    # Required by column parallel or enabling merged weights
    is_tp_split = tp_size > 1 and output_size // sum(output_partition_sizes) == tp_size
    is_merged_gemm = len(output_partition_sizes) > 1
    if is_tp_split or is_merged_gemm:
        sizes_to_check = output_partition_sizes
        if not is_tp_split and is_merged_gemm:
            # In case of merged matrices, we allow the last
            # matrix to not be a multiple of block size
            sizes_to_check = output_partition_sizes[:-1]
        for output_partition_size in sizes_to_check:
            if output_partition_size % block_n != 0:
                raise ValueError(
                    f"Weight output_partition_size = "
                    f"{output_partition_size} is not divisible by "
                    f"weight quantization block_n = {block_n}."
                )
def process_fp8_weight_tensor_strategy(
    weight: torch.Tensor,
    weight_scale: torch.Tensor,
    logical_widths: list[int],
    input_scale: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
    """Post-process weights quantized with the tensor-wise strategy.

    Steps, in order: convert to the fnuz FP8 format when the platform
    reports it, requantize with a single max scale across
    `logical_widths`, then apply the optional ROCm weight padding.

    Returns:
        `(weight, weight_scale, input_scale)` after processing.
    """
    from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
        normalize_e4m3fn_to_e4m3fnuz,
        requantize_with_max_scale,
    )

    if current_platform.is_fp8_fnuz():
        normalized = normalize_e4m3fn_to_e4m3fnuz(
            weight=weight, weight_scale=weight_scale, input_scale=input_scale
        )
        weight, weight_scale, input_scale = normalized

    # Requantize with max scale (the helper returns the scale first).
    max_scale, requantized_weight = requantize_with_max_scale(
        weight=weight,
        weight_scale=weight_scale,
        logical_widths=logical_widths,
    )

    padded_weight = _maybe_pad_fp8_weight(requantized_weight)
    return padded_weight, max_scale, input_scale
def _pack_awq_nibbles(values: torch.Tensor, w_bit: int) -> torch.Tensor:
    """Pack every ``32 // w_bit`` int32 columns of *values* into one int32."""
    pack_num = 32 // w_bit
    # Slot ``i`` of each packed word holds source column ``order_map[i]``.
    order_map = (0, 2, 4, 6, 1, 3, 5, 7)
    n_packed = values.shape[1] // pack_num
    grouped = values[:, : n_packed * pack_num].reshape(
        values.shape[0], n_packed, pack_num
    )
    packed = torch.zeros(
        (values.shape[0], n_packed), dtype=torch.int32, device=values.device
    )
    for slot, src in enumerate(order_map):
        packed |= grouped[:, :, src] << (slot * w_bit)
    return packed


def get_awq_format(w, group_size=128, w_bit=4):
    """Quantize ``w`` to 4 bits per group and pack it in AWQ layout.

    ``w`` is indexed as ``(rows, cols)`` and ``cols`` must be a multiple of
    ``group_size``. Each run of ``group_size`` values along ``cols`` gets its
    own min/max-derived scale and zero point.

    Args:
        w: Floating-point weight; must not contain NaN.
        group_size: Number of consecutive values sharing a scale/zero point.
        w_bit: Bit width; only 4 is supported (asserted).

    Returns:
        ``(qweight, qzeros, scales)`` where, with ``R, C = w.shape``:
        ``qweight`` is int32 of shape ``(C, R // 8)``, ``qzeros`` is int32 of
        shape ``(C // group_size, R // 8)`` and ``scales`` has shape
        ``(C // group_size, R)`` in the current default dtype.

    Note:
        The packed width is ``R // 8``. The previous ``R // 32 * w_bit``
        sizing raised ``IndexError`` whenever ``R`` was at least 8 but not a
        multiple of 32. As before, trailing rows of ``w`` beyond a multiple
        of 8 are not packed.
    """
    org_w_shape = w.shape
    ori_w_dtype = torch.get_default_dtype()
    assert w_bit == 4
    assert w.shape[1] % group_size == 0

    in_features = org_w_shape[1]
    w = w.reshape(-1, group_size)
    assert torch.isnan(w).sum() == 0

    max_val = w.amax(dim=1, keepdim=True)
    min_val = w.amin(dim=1, keepdim=True)
    max_int = 2**w_bit - 1
    min_int = 0
    scales = (max_val - min_val).clamp(min=1e-5) / max_int
    zeros = (-torch.round(min_val / scales)).clamp_(min_int, max_int)
    # Snap w onto the quantization grid so the integer recovery is exact.
    w = (
        torch.clamp(torch.round(w / scales) + zeros, min_int, max_int) - zeros
    ) * scales
    zeros = zeros.view(org_w_shape[0], -1)
    scales = scales.view(org_w_shape[0], -1)
    w = w.reshape(org_w_shape)
    assert torch.isnan(scales).sum() == 0
    assert torch.isnan(w).sum() == 0

    scales = scales.t().contiguous()  # input // group, o
    zeros = zeros.t().contiguous()  # input // group, o

    # from auto awq
    scale_zeros = zeros * scales
    scales = scales.clone().to(ori_w_dtype)

    # Recover integer codes for all columns at once: view the transposed
    # weight as (group, position-in-group, row) and broadcast the per-group
    # scale / zero point over the middle axis.
    n_groups = in_features // group_size
    w_grouped = w.t().reshape(n_groups, group_size, org_w_shape[0])
    intweight = torch.round(
        (w_grouped + scale_zeros[:, None, :]) / scales[:, None, :]
    )
    intweight = (
        intweight.to(torch.int32).reshape(in_features, org_w_shape[0]).contiguous()
    )

    qweight = _pack_awq_nibbles(intweight, w_bit)

    zeros = zeros.to(dtype=torch.int32, device=qweight.device)
    qzeros = _pack_awq_nibbles(zeros, w_bit)

    return qweight, qzeros, scales
def dequantize_q2_k(data):
    """Dequantize a buffer of Q2_K blocks to a float32 NumPy array.

    Each block of ``GGML_BLOCK_SIZES["Q2_K"]`` bytes is read as: 16 scale
    bytes, 64 bytes of packed 2-bit values, then two trailing float16
    values (``d`` followed by ``dmin``).

    Returns:
        Array of shape ``(num_blocks, 16, 16)``.
    """
    bytes_per_block = GGML_BLOCK_SIZES["Q2_K"]
    n_blocks = len(data) // bytes_per_block

    as_f16 = np.frombuffer(data, dtype=np.float16).reshape(
        n_blocks, bytes_per_block // 2
    )
    as_u8 = np.frombuffer(data, dtype=np.uint8).reshape(n_blocks, bytes_per_block)

    d = as_f16[:, -2].reshape(n_blocks, 1, 1).astype(np.float32)
    dmin = as_f16[:, -1].reshape(n_blocks, 1, 1).astype(np.float32)
    scales = as_u8[:, :16].reshape(n_blocks, 16, 1)
    qs = as_u8[:, 16:80].reshape(n_blocks, 64)

    # For each 32-byte half, walk the four 2-bit planes; each plane yields
    # two 16-byte slices. This reproduces the original 16-entry stack order.
    planes = [
        qs[:, half + start : half + start + 16] >> shift
        for half in (0, 32)
        for shift in (0, 2, 4, 6)
        for start in (0, 16)
    ]
    quants = np.stack(planes, axis=1) & 3

    # Low nibble of each scale byte multiplies d, high nibble multiplies dmin.
    return d * (scales & 15) * quants - dmin * (scales >> 4)
def dequantize_q4_k(data, device=None):
    """Dequantize a buffer of Q4_K blocks.

    Each block of ``GGML_BLOCK_SIZES["Q4_K"]`` bytes is read as: two float16
    values (scale factor, offset factor), 12 bytes of packed 6-bit
    sub-scales/offsets, then the packed 4-bit values.

    Args:
        data: Raw block bytes.
        device: When given, the result is converted with
            ``torch.from_numpy`` and moved to this device.

    Returns:
        Float32 NumPy array of shape ``(num_blocks, 8, 32)`` when ``device``
        is ``None``, otherwise a torch tensor on ``device``.
    """
    bytes_per_block = GGML_BLOCK_SIZES["Q4_K"]
    n_blocks = len(data) // bytes_per_block
    as_f16 = np.frombuffer(data, dtype=np.float16).reshape(
        n_blocks, bytes_per_block // 2
    )
    as_u8 = np.frombuffer(data, dtype=np.uint8).reshape(n_blocks, bytes_per_block)

    # Casting to float32 because float16 is very slow on CPU
    d = as_f16[:, 0].reshape(n_blocks, 1, 1).astype(np.float32)
    dmin = as_f16[:, 1].reshape(n_blocks, 1, 1).astype(np.float32)

    packed_scales = as_u8[:, 4:16].reshape(n_blocks, 12, 1)
    nibbles = as_u8[:, 16:].reshape(n_blocks, 4, 32)

    head = packed_scales[:, 0:4]
    mid = packed_scales[:, 4:8]
    tail = packed_scales[:, 8:]

    # First four entries are plain 6-bit fields. The last four combine a
    # nibble from `tail` with the top two bits of the matching earlier byte.
    scale_bits = np.concatenate(
        [head & 0b111111, (tail & 15) | ((head >> 6) << 4)], axis=1
    )
    offset_bits = np.concatenate(
        [mid & 0b111111, (tail >> 4) | ((mid >> 6) << 4)], axis=1
    )
    factors = d * scale_bits
    offsets = dmin * offset_bits

    # Interleave low and high nibbles: (n, 4, 2, 32) -> (n, 8, 32)
    quants = np.stack([nibbles & 0xf, nibbles >> 4], axis=2).reshape(
        n_blocks, 8, 32
    )

    weight = factors * quants - offsets
    if device is not None:
        return torch.from_numpy(weight).to(device=device)
    return weight
def dequantize_q6_k(data, device = None):
    """Dequantize a buffer of Q6_K blocks.

    Each block of ``GGML_BLOCK_SIZES["Q6_K"]`` bytes is read as: 128 bytes of
    low nibbles (``ql``), 64 bytes holding the upper two bits (``qh``),
    16 signed int8 sub-scales, and one trailing float16 scale.

    Args:
        data: Raw block bytes.
        device: When given, the result is converted with
            ``torch.from_numpy`` and moved to this device.

    Returns:
        Float32 NumPy array of shape ``(num_blocks, 256)`` when ``device`` is
        ``None``, otherwise a torch tensor on ``device``.
    """
    block_size = GGML_BLOCK_SIZES["Q6_K"]
    num_blocks = len(data) // block_size

    data_f16 = np.frombuffer(data, dtype=np.float16).reshape(
        num_blocks, block_size // 2
    )
    data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)
    data_i8 = np.frombuffer(data, dtype=np.int8).reshape(num_blocks, block_size)

    scales = data_f16[:, -1].reshape(num_blocks, 1).astype(np.float32)
    # int16 so that subtracting 32 below cannot wrap.
    ql = data_u8[:, :128].astype(np.int16)
    qh = data_u8[:, 128:192].astype(np.int16)
    sc = data_i8[:, 192:208, np.newaxis].astype(np.float32)

    # Assemble each 6-bit value as (low nibble | high two bits << 4), then
    # re-centre by 32. The outer parentheses are deliberate: `-` binds
    # tighter than `|`, and the earlier unparenthesised form only produced
    # the same numbers through a two's-complement coincidence.
    q1 = ((ql[:, :32] & 0xF) | (((qh[:, :32] >> 0) & 3) << 4)) - 32
    q2 = ((ql[:, 32:64] & 0xF) | (((qh[:, :32] >> 2) & 3) << 4)) - 32
    q3 = ((ql[:, :32] >> 4) | (((qh[:, :32] >> 4) & 3) << 4)) - 32
    q4 = ((ql[:, 32:64] >> 4) | (((qh[:, :32] >> 6) & 3) << 4)) - 32
    q5 = ((ql[:, 64:96] & 0xF) | (((qh[:, 32:] >> 0) & 3) << 4)) - 32
    q6 = ((ql[:, 96:128] & 0xF) | (((qh[:, 32:] >> 2) & 3) << 4)) - 32
    q7 = ((ql[:, 64:96] >> 4) | (((qh[:, 32:] >> 4) & 3) << 4)) - 32
    q8 = ((ql[:, 96:128] >> 4) | (((qh[:, 32:] >> 6) & 3) << 4)) - 32

    # Each 32-wide q-row is split into two 16-wide halves, and half k is
    # multiplied by sub-scale sc[:, k].
    parts = []
    for pair_idx, q in enumerate((q1, q2, q3, q4, q5, q6, q7, q8)):
        parts.append(sc[:, 2 * pair_idx] * q[:, :16])
        parts.append(sc[:, 2 * pair_idx + 1] * q[:, 16:])
    weight = scales * np.concatenate(parts, axis=1)

    if device is None:
        return weight
    return torch.from_numpy(weight).to(device=device)
def dequant_gguf(data, type, shape):
    """Dequantize raw GGUF tensor bytes and return a torch tensor.

    Args:
        data: Raw tensor bytes, passed unchanged to the dequantizer.
        type: Integer key into ``GGML_DEQUANTIZE`` selecting the dequantizer.
        shape: Target shape applied with ``Tensor.view``.

    Returns:
        A torch tensor viewed as ``shape``.

    Raises:
        KeyError: If ``type`` has no entry in ``GGML_DEQUANTIZE``.
    """
    values = GGML_DEQUANTIZE[type](data)
    # The F32/F16 dequantizers return the bare ``np.frombuffer`` view, which
    # is read-only when ``data`` is immutable. ``torch.from_numpy`` warns on
    # such arrays and writes through the tensor are undefined, so take a
    # private copy in that case.
    if not values.flags.writeable:
        values = values.copy()
    return torch.from_numpy(values).view(shape)
vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + UnquantizedEmbeddingMethod, +) + +if TYPE_CHECKING: + from ..gptq import GPTQConfig + from ..gptq_marlin import GPTQMarlinConfig +else: + GPTQConfig = object + GPTQMarlinConfig = object + + +# Match dynamic rules with module name (prefix) and override quantize +# config if module (prefix) matches a rule +def override_config(config: GPTQConfig | GPTQMarlinConfig, prefix: str): + weight_bits = get_dynamic_override(config, prefix, "bits", config.weight_bits) + if isinstance(weight_bits, int): + config.weight_bits = weight_bits + group_size = get_dynamic_override(config, prefix, "group_size", config.group_size) + if isinstance(group_size, int): + config.group_size = group_size + desc_act = get_dynamic_override(config, prefix, "desc_act", config.desc_act) + if isinstance(desc_act, bool): + config.desc_act = desc_act + + config.pack_factor = Fraction(32, config.weight_bits) # packed into int32 + if config.get_name() == "gptq_marlin": + assert isinstance(config, GPTQMarlinConfig) + is_sym = get_dynamic_override(config, prefix, "sym", config.is_sym) + if isinstance(is_sym, bool): + config.is_sym = is_sym + + if (config.weight_bits, config.is_sym) not in config.TYPE_MAP: + raise ValueError( + "Unsupported quantization config: " + f"bits={config.weight_bits}, sym={config.is_sym}" + ) + + config.quant_type = config.TYPE_MAP[(config.weight_bits, config.is_sym)] + elif config.get_name() == "gptq": + assert isinstance(config, GPTQConfig) + if config.weight_bits not in [2, 3, 4, 8]: + raise ValueError( + "Currently, only 2/3/4/8-bit weight quantization is " + f"supported for GPTQ, but got {config.weight_bits} bits." 
def is_layer_gptq_quantized(
    prefix: str,
    quantized_layers: list[str],
    fused_mapping: Mapping[str, list[str]] = MappingProxyType({}),
) -> bool:
    """Return whether the module named ``prefix`` is listed as quantized.

    A module counts as quantized when any entry of ``quantized_layers`` is a
    substring of its name. Entries may therefore be partial
    (``"self_attn.q_proj"``) or full (``"model.layers.0.self_attn.q_proj"``).

    If the last dotted component of ``prefix`` is a key of ``fused_mapping``
    (e.g. ``qkv_proj``), the check runs on each unfused shard name instead,
    because checkpoints store the shards separately. All shards must agree.

    Raises:
        ValueError: If only some shards of a fused layer are quantized.
    """
    proj_name = prefix.split(".")[-1]

    if proj_name not in fused_mapping:
        return any(layer in prefix for layer in quantized_layers)

    # Swap only the trailing component. ``str.replace`` would also rewrite
    # earlier path segments that happen to contain ``proj_name``.
    stem = prefix[: len(prefix) - len(proj_name)]
    shard_flags = {
        any(layer in stem + shard_name for layer in quantized_layers)
        for shard_name in fused_mapping[proj_name]
    }

    if len(shard_flags) > 1:
        raise ValueError(
            f"Detected some but not all shards of {prefix} "
            "are quantized. All shards of fused layers "
            "must have the same precision."
        )

    # An empty shard list leaves nothing to decide on.
    assert shard_flags, f"No shards listed for fused layer {proj_name}"
    return shard_flags.pop()
def input_to_int8(
    x: torch.Tensor, dtype: torch.dtype = torch.int8
) -> tuple[torch.Tensor, torch.Tensor]:
    """Quantize ``x`` to ``dtype`` with a single tensor-wide scale.

    The scale maps the largest absolute value of ``x`` (floored at 1e-12)
    onto the maximum of ``dtype``. Scaled values are clamped to the range of
    ``dtype`` and then converted with ``Tensor.to``.

    Returns:
        ``(x_q, inv_scale)``: the contiguous quantized tensor and the float32
        reciprocal of the scale that was applied.
    """
    limits = torch.iinfo(dtype)
    lowest, highest = x.aminmax()
    # Largest magnitude in the tensor; the floor avoids division by zero.
    peak = torch.maximum(lowest.abs(), highest.abs()).clamp(min=1e-12)
    quant_scale = limits.max / peak

    saturated = (x * quant_scale).clamp(min=limits.min, max=limits.max)
    x_q = saturated.to(dtype).contiguous()
    inv_scale = quant_scale.float().reciprocal()
    return x_q, inv_scale
@triton.jit
def _per_token_quant_int8(
    x_ptr,
    xq_ptr,
    scale_ptr,
    stride_x,
    stride_xq,
    N,
    BLOCK: tl.constexpr,
):
    """Quantize one row of the input to int8 with a per-row scale.

    Each program instance handles the row selected by ``tl.program_id(0)``.
    It loads up to ``BLOCK`` columns (masked to the first ``N``), derives
    ``scale = absmax / 127`` and writes the rounded int8 values to
    ``xq_ptr`` and the scale to ``scale_ptr[row]``.
    """
    # Adapted from https://github.com/InternLM/lmdeploy/blob/086481ed84b59bee3b8e4274e5fc69620040c048/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py#L282
    row_id = tl.program_id(0)

    # Column offsets for this row; lanes at or beyond N are masked out.
    cols = tl.arange(0, BLOCK)
    mask = cols < N

    # Masked lanes load 0.0, so they cannot raise the row's absmax.
    x = tl.load(x_ptr + row_id * stride_x + cols, mask=mask, other=0.0).to(tl.float32)
    # Floor absmax at 1e-10 so an all-zero row does not divide by zero.
    absmax = tl.maximum(tl.max(tl.abs(x)), 1e-10)
    scale_x = absmax / 127
    x_q = x * (127 / absmax)
    # round_int8 is the platform-specific round-then-cast helper defined
    # earlier in this module.
    x_q = round_int8(x_q)

    tl.store(xq_ptr + row_id * stride_xq + cols, x_q, mask=mask)
    tl.store(scale_ptr + row_id, scale_x)
def per_token_group_quant_int8(
    x: torch.Tensor,
    group_size: int,
    eps: float = 1e-10,
    dtype: torch.dtype = torch.int8,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Function to perform per-token-group quantization on an input tensor `x`.

    It converts the tensor values into signed int8 values and returns the
    quantized tensor along with the scaling factor used for quantization.

    Args:
        x: The input tensor with ndim >= 2. Must be contiguous, and its last
            dimension must be a multiple of `group_size`.
        group_size: The group size used for quantization.
        eps: Lower bound on a group's absolute maximum, to avoid dividing
            by zero.
        dtype: The dtype of the output tensor. Note that only `torch.int8`
            is supported for now.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: The quantized tensor (same shape
        as `x`) and the float32 scaling factors, whose last dimension is
        `x.shape[-1] // group_size`.
    """
    assert x.shape[-1] % group_size == 0, (
        "the last dimension of `x` must be divisible by `group_size`"
    )
    assert x.is_contiguous(), "`x` is not contiguous"

    iinfo = torch.iinfo(dtype)
    int8_max = iinfo.max
    int8_min = iinfo.min

    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
    x_s = torch.empty(
        x.shape[:-1] + (x.shape[-1] // group_size,),
        device=x.device,
        dtype=torch.float32,
    )
    # prefer CUDA kernel if available
    if current_platform.is_cuda():
        torch.ops._C.per_token_group_quant_int8(
            x, x_q, x_s, group_size, eps, float(int8_min), float(int8_max)
        )
        return x_q, x_s

    # Triton fallback: one program per group of `group_size` elements.
    M = x.numel() // group_size
    N = group_size

    BLOCK = triton.next_power_of_2(N)
    # heuristics for number of warps
    num_warps = min(max(BLOCK // 256, 1), 8)
    num_stages = 1
    _per_token_group_quant_int8[(M,)](
        x,
        x_q,
        x_s,
        group_size,
        N,
        eps,
        int8_min=int8_min,
        int8_max=int8_max,
        BLOCK=BLOCK,
        num_warps=num_warps,
        num_stages=num_stages,
    )

    return x_q, x_s
@functools.lru_cache
def get_w8a8_block_int8_configs(
    N: int, K: int, block_n: int, block_k: int
) -> dict[int, Any] | None:
    """Load tuned launch configurations for the W8A8 block INT8 kernel.

    Looks in the ``configs`` directory next to this file for a JSON file
    named after ``N``, ``K``, the current device name and the block shape.
    The file maps batch sizes to kernel configurations. Callers should pick
    the entry whose batch size is closest to theirs.

    Returns:
        The mapping with integer keys, or ``None`` when no such file exists
        (a warning is logged and the caller should use its default config).
        Results are memoized per argument tuple by ``functools.lru_cache``.
    """
    device_tag = current_platform.get_device_name().replace(" ", "_")
    file_name = f"N={N},K={K},device_name={device_tag},dtype=int8_w8a8,block_shape=[{block_n}, {block_k}].json"  # noqa: E501

    here = os.path.dirname(os.path.realpath(__file__))
    config_file_path = os.path.join(here, "configs", file_name)

    if not os.path.exists(config_file_path):
        # No tuned file for this shape/device: the caller uses its defaults.
        logger.warning(
            (
                "Using default W8A8 Block INT8 kernel config. Performance might "
                "be sub-optimal! Config file not found at %s"
            ),
            config_file_path,
        )
        return None

    with open(config_file_path) as f:
        logger.info(
            "Using configuration from %s for W8A8 Block INT8 kernel.",
            config_file_path,
        )
        raw = json.load(f)
        # JSON object keys are strings; callers index by integer batch size.
        return {int(batch): cfg for batch, cfg in raw.items()}
+ block_size: The block size for per-block quantization. It should be + 2-dim, e.g., [128, 128]. + output_dtype: The dtype of the returned tensor. + + Returns: + torch.Tensor: The result of matmul. + """ + assert len(block_size) == 2 + block_n, block_k = block_size[0], block_size[1] + + assert A.shape[-1] == B.shape[-1] + assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous() + assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1] + M = A.numel() // A.shape[-1] + + assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2 + N, K = B.shape + assert triton.cdiv(N, block_n) == Bs.shape[0] + assert triton.cdiv(K, block_k) == Bs.shape[1] + + C_shape = A.shape[:-1] + (N,) + C = A.new_empty(C_shape, dtype=output_dtype) + + configs = get_w8a8_block_int8_configs(N, K, block_size[0], block_size[1]) + if configs: + # If an optimal configuration map has been found, look up the + # optimal config + config = configs[min(configs.keys(), key=lambda x: abs(x - M))] + else: + # Default config + # Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1] + config = { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": block_size[0], + "BLOCK_SIZE_K": block_size[1], + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3, + } + + def grid(META): + return ( + triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]), + ) + + _w8a8_block_int8_matmul[grid]( + A, + B, + C, + As, + Bs, + M, + N, + K, + block_n, + block_k, + A.stride(-2), + A.stride(-1), + B.stride(1), + B.stride(0), + C.stride(-2), + C.stride(-1), + As.stride(-2), + As.stride(-1), + Bs.stride(1), + Bs.stride(0), + **config, + ) + + return C diff --git a/model_executor/layers/quantization/utils/layer_utils.py b/model_executor/layers/quantization/utils/layer_utils.py new file mode 100644 index 0000000..3b8c9a8 --- /dev/null +++ b/model_executor/layers/quantization/utils/layer_utils.py @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project + + +import torch + + +def update_tensor_inplace(dst: torch.Tensor, src: torch.Tensor): + assert dst.dtype == src.dtype, "Tensors must have the same dtype" + + # update tensor shape and stride + dst.as_strided_(src.shape, src.stride()) + + # If not the same underlying storage move tensor data + if dst.data_ptr() != src.data_ptr(): + dst.copy_(src) + del src + + +# Newly generated tensors need to replace existing tensors that are +# already registered as parameters by vLLM (and won't be freed) +def replace_parameter( + mod: torch.nn.Module, name: str, new: torch.Tensor | torch.nn.Parameter +) -> None: + old = getattr(mod, name) + if ( + type(old) is type(new) + and old.dtype == new.dtype + and old.untyped_storage().nbytes() == new.untyped_storage().nbytes() + ): + # If we can just update in-place to avoid re-registering + # can be faster if the underlying storage is the same + update_tensor_inplace(old, new) + else: + # Fallback re-register parameter, convert to Parameter if necessary + # this not only ensures we don't register a tensor as a parameter, but + # also ensures that all parameter subclasses get re-registered as + # parameters for `torch.compile` compatibility + if not isinstance(new, torch.nn.Parameter): + new = torch.nn.Parameter(new, requires_grad=False) + mod.register_parameter(name, torch.nn.Parameter(new, requires_grad=False)) diff --git a/model_executor/layers/quantization/utils/machete_utils.py b/model_executor/layers/quantization/utils/machete_utils.py new file mode 100644 index 0000000..ccfcdac --- /dev/null +++ b/model_executor/layers/quantization/utils/machete_utils.py @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import torch + +from vllm.scalar_type import ScalarType, scalar_types + +MACHETE_PREPACKED_BLOCK_SHAPE = [64, 128] + + +def query_machete_supported_quant_types(zero_points: bool) -> list[ScalarType]: + if zero_points: + return 
[scalar_types.uint4, scalar_types.uint8] + else: + return [scalar_types.uint4b8, scalar_types.uint8b128] + + +def query_machete_supported_act_types(zero_points: bool) -> list[ScalarType]: + return [torch.float16, torch.bfloat16] + + +def query_machete_supported_group_sizes(act_type: torch.dtype) -> list[int]: + """ + Queries the supported group sizes for Machete based on the activation type. + + Args: + act_type: The activation data type (torch.float16, torch.bfloat16). + + Returns: + A list of supported group sizes. The group size must + be divisible by `TileShapeK = 128 * 8 // num_bits(act_type)`. + -1 indicates per-channel quantization. + """ + if act_type in [torch.float16, torch.bfloat16]: + return [-1, 64, 128] + else: + return [-1, 128] + + +def check_machete_supports_shape( + in_features: int, out_featrues: int +) -> tuple[bool, str | None]: + if in_features % MACHETE_PREPACKED_BLOCK_SHAPE[0] != 0: + return ( + False, + "Input features size must be divisible by " + f"{MACHETE_PREPACKED_BLOCK_SHAPE[0]}", + ) + if out_featrues % MACHETE_PREPACKED_BLOCK_SHAPE[1] != 0: + return ( + False, + "Output features size must be divisible by " + f"{MACHETE_PREPACKED_BLOCK_SHAPE[1]}", + ) + return True, None diff --git a/model_executor/layers/quantization/utils/marlin_utils.py b/model_executor/layers/quantization/utils/marlin_utils.py new file mode 100644 index 0000000..071fb4b --- /dev/null +++ b/model_executor/layers/quantization/utils/marlin_utils.py @@ -0,0 +1,575 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import numpy +import torch + +import vllm.envs as envs +from vllm import _custom_ops as ops +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import LinearBase +from vllm.platforms import current_platform +from vllm.scalar_type import ScalarType, scalar_types + +from .quant_utils import pack_cols, unpack_cols + +logger = init_logger(__name__) + +GPTQ_MARLIN_TILE = 
16 +GPTQ_MARLIN_MIN_THREAD_N = 64 +GPTQ_MARLIN_MIN_THREAD_K = 128 +GPTQ_MARLIN_MAX_PARALLEL = 16 + +MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128] + +# In case there is a performance issue with Marlin, the variable below can be +# changed to False, which allows Marlin to perform global reductions in fp16 +# precision (instead of fp32), and therefore, save on some memory movements. +USE_FP32_REDUCE_DEFAULT = True + + +# For binary size and compile time, we don't support the same types for with and +# without runtime zero-point. We support common cases, i.e. AWQ and GPTQ. +# TODO: we may want to move this into the C++ so its closer to the actual impl +def query_marlin_supported_quant_types( + has_zp: bool | None = None, + include_fp_type: bool = True, + device_capability: int | None = None, +): + if device_capability is None: + capability_tuple = current_platform.get_device_capability() + device_capability = ( + -1 if capability_tuple is None else capability_tuple.to_int() + ) + + if device_capability < 80: + return [] + + # - has_zp is True: return quant_types that has zero points + # - has_zp is False: return quant_types that has not zero points + # - has_zp is None: both + if has_zp is None: + types0 = query_marlin_supported_quant_types( + False, include_fp_type, device_capability + ) + types1 = query_marlin_supported_quant_types( + True, include_fp_type, device_capability + ) + return types0 + types1 + + if has_zp: + # AWQ style, unsigned + runtime zero-point + return [scalar_types.uint4] + else: + # GPTQ style, unsigned + symmetric bias + res = [scalar_types.uint4b8, scalar_types.uint8b128] + if include_fp_type: + res += [scalar_types.float8_e4m3fn, scalar_types.float4_e2m1f] + return res + + +def _check_marlin_supported( + quant_type: ScalarType, + group_size: int | None, + has_zp: bool, + device_capability: int | None = None, +) -> tuple[bool, str | None]: + if device_capability is None: + capability_tuple = current_platform.get_device_capability() + 
def check_marlin_supported(
    quant_type: ScalarType,
    group_size: int,
    has_zp: bool = False,
    device_capability: int | None = None,
) -> bool:
    """Return whether Marlin accepts this quant type / group size combination.

    Boolean-only view of ``_check_marlin_supported``; the explanatory
    message is discarded.
    """
    return _check_marlin_supported(quant_type, group_size, has_zp, device_capability)[
        0
    ]


def verify_marlin_supported(
    quant_type: ScalarType, group_size: int, has_zp: bool = False
) -> None:
    """Raise if Marlin does not accept this quant type / group size combination.

    Raises:
        ValueError: carrying the reason reported by ``_check_marlin_supported``.
    """
    supported, reason = _check_marlin_supported(quant_type, group_size, has_zp)
    if supported:
        return
    assert reason is not None
    raise ValueError(reason)
def check_marlin_supports_shape(
    output_size_per_partition: int,
    input_size_per_partition: int,
    input_size: int,
    group_size: int,
) -> tuple[bool, str | None]:
    """Non-raising variant of ``verify_marlin_supports_shape``.

    Returns:
        ``(True, None)`` when verification passes, otherwise ``(False, msg)``
        where ``msg`` is the text of the ``ValueError`` that was raised.
    """
    try:
        verify_marlin_supports_shape(
            output_size_per_partition, input_size_per_partition, input_size, group_size
        )
    except ValueError as err:
        return False, str(err)
    return True, None


def check_marlin_supports_layer(layer: LinearBase, group_size: int) -> bool:
    """Return whether ``layer``'s partition sizes pass the Marlin shape checks.

    The per-partition sizes are used when the layer has them and they are
    truthy; otherwise the layer's full ``output_size`` / ``input_size`` are
    used instead.
    """
    out_size = getattr(layer, "output_size_per_partition", None) or layer.output_size
    in_size = getattr(layer, "input_size_per_partition", None) or layer.input_size

    supported, _ = check_marlin_supports_shape(
        output_size_per_partition=out_size,
        input_size_per_partition=in_size,
        input_size=layer.input_size,
        group_size=group_size,
    )
    return supported
def marlin_moe_intermediate_size(w1_packed: torch.Tensor, w2_packed: torch.Tensor):
    """
    Given Marlin packed weight matrices w1_packed, and w2_packed,
    return the MoE intermediate size N.

    Only ``w2_packed`` is read: the result is its dim-1 size times the
    Marlin tile size (16).
    """
    tile = 16
    packed_rows = w2_packed.size(1)
    return packed_rows * tile


def marlin_make_workspace(
    output_size_per_partition: int, device: torch.device
) -> torch.Tensor:
    """Allocate a zero-filled int workspace sized from the output partition.

    The length is ``(output_size_per_partition // GPTQ_MARLIN_MIN_THREAD_N)
    * GPTQ_MARLIN_MAX_PARALLEL``.
    """
    n_tiles = output_size_per_partition // GPTQ_MARLIN_MIN_THREAD_N
    workspace_len = n_tiles * GPTQ_MARLIN_MAX_PARALLEL
    return torch.zeros(
        workspace_len, dtype=torch.int, device=device, requires_grad=False
    )
def marlin_is_k_full(act_order: bool, is_row_parallel: bool) -> bool:
    """Return False only when act-order and row-parallel are both set."""
    return not (act_order and is_row_parallel)


def marlin_repeat_scales_on_all_ranks(
    act_order: bool, group_size: int, is_row_parallel: bool
) -> bool:
    """Decide whether scales must be replicated on every rank.

    True for act-ordering, or for channelwise quantization
    (``group_size == -1``) on a row-parallel layer.
    """
    if act_order:
        return act_order
    return group_size == -1 and is_row_parallel


def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
    """Return an empty, non-trainable int parameter used as a ``g_idx`` stand-in."""
    placeholder = torch.empty(0, dtype=torch.int, device=device)
    return torch.nn.Parameter(placeholder, requires_grad=False)


def marlin_make_empty_zp(device: torch.device) -> torch.Tensor:
    """Return an empty, non-trainable int parameter used as a zero-point stand-in."""
    placeholder = torch.empty(0, dtype=torch.int, device=device)
    return torch.nn.Parameter(placeholder, requires_grad=False)


def marlin_sort_g_idx(g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    """Sort ``g_idx`` and return ``(sorted_values, sort_indices)``.

    The indices come from ``torch.argsort`` cast to ``torch.int``.
    """
    order = torch.argsort(g_idx).to(torch.int)
    return g_idx[order], order


def get_scale_perms():
    """Build the two index permutations applied to Marlin scales.

    Returns:
        ``(scale_perm, scale_perm_single)``: a permutation of ``range(64)``
        and a permutation of ``range(32)``.
    """
    scale_perm: list[int] = [i + 8 * j for i in range(8) for j in range(8)]
    offsets = (0, 1, 8, 9, 16, 17, 24, 25)
    scale_perm_single: list[int] = [2 * i + j for i in range(4) for j in offsets]
    return scale_perm, scale_perm_single


def marlin_permute_scales(
    s: torch.Tensor, size_k: int, size_n: int, group_size: int
) -> torch.Tensor:
    """Reorder the columns of ``s`` with the appropriate Marlin permutation.

    The 64-entry permutation is used when ``group_size`` is a real group
    size smaller than ``size_k``; otherwise the 32-entry one is used. The
    result is reshaped to ``(-1, size_n)`` and made contiguous.
    """
    grouped_perm, single_perm = get_scale_perms()
    is_grouped = group_size < size_k and group_size != -1
    perm = grouped_perm if is_grouped else single_perm

    permuted = s.reshape((-1, len(perm)))[:, perm]
    return permuted.reshape((-1, size_n)).contiguous()
def marlin_moe_permute_scales(
    s: torch.Tensor,
    size_k: int,
    size_n: int,
    group_size: int,
):
    """Apply ``marlin_permute_scales`` to each expert slice of ``s``.

    ``s`` is indexed along dim 0 per expert. The output is allocated with
    the same three leading dimensions, device and dtype as ``s``, so each
    permuted slice must fit ``(s.shape[1], s.shape[2])``.
    """
    num_experts = s.shape[0]
    output = torch.empty(
        (num_experts, s.shape[1], s.shape[2]),
        device=s.device,
        dtype=s.dtype,
    )

    for e in range(num_experts):
        output[e] = marlin_permute_scales(s[e], size_k, size_n, group_size)
    return output


def marlin_zero_points(
    zp: torch.Tensor, size_k: int, size_n: int, num_bits: int
) -> torch.Tensor:
    """Permute, interleave and pack zero-points for Marlin.

    Args:
        zp: Unpacked zero-points.
        size_k: Forwarded to ``pack_cols``.
        size_n: Column count of the intermediate 2-D layout; also forwarded
            to ``pack_cols``.
        num_bits: Bit width of each zero-point; must be 4 or 8.

    Returns:
        The result of ``pack_cols`` on the permuted, interleaved zero-points.

    Raises:
        ValueError: if ``num_bits`` is neither 4 nor 8.
    """
    # Permute zero-points in a similar way to scales, but do not use the
    # "single" permutation, since zero-points are applied on every MMA
    scale_perm, _ = get_scale_perms()
    zp = zp.reshape((-1, len(scale_perm)))[:, scale_perm]

    # Interleave column dim (for the dequantize code) and pack it to int32
    if num_bits == 4:
        interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
    elif num_bits == 8:
        interleave = numpy.array([0, 2, 1, 3])
    else:
        # ValueError is still an Exception, so existing broad handlers work.
        raise ValueError(f"num_bits must be 4 or 8, got {num_bits}")

    zp = zp.reshape((-1, len(interleave)))[:, interleave].ravel()
    zp = zp.reshape((-1, size_n)).contiguous()
    zp = pack_cols(zp, num_bits, size_k, size_n)

    return zp
def moe_awq_to_marlin_zero_points(
    q_zp_packed: torch.Tensor, size_k: int, size_n: int, num_bits: int
):
    """Convert packed AWQ zero-points to Marlin layout, one expert at a time.

    The result has the same shape, device and dtype as ``q_zp_packed``;
    slice ``i`` is ``awq_to_marlin_zero_points(q_zp_packed[i], ...)``.
    """
    expert_count = q_zp_packed.shape[0]
    converted = torch.empty(
        (expert_count, q_zp_packed.shape[1], q_zp_packed.shape[2]),
        device=q_zp_packed.device,
        dtype=q_zp_packed.dtype,
    )
    for idx in range(expert_count):
        converted[idx] = awq_to_marlin_zero_points(
            q_zp_packed[idx], size_k, size_n, num_bits
        )
    return converted


def maybe_warn_marlin_atomic_add(device, dtype):
    """Log a one-time hint when bf16 is used on a pre-SM90 GPU.

    Does nothing while Dynamo is compiling.
    """
    if torch.compiler.is_dynamo_compiling():
        return
    major = torch.cuda.get_device_capability(device)[0]
    if major >= 9 or dtype != torch.bfloat16:
        return
    logger.info_once(
        "You are running Marlin kernel with bf16 on GPUs before SM90. "
        "You can consider change to fp16 to achieve better performance "
        "if possible."
    )
def should_use_atomic_add_reduce(
    m: int, n: int, k: int, device: torch.device, dtype: torch.dtype
) -> bool:
    """Decide whether the Marlin GEMM should reduce with atomicAdd.

    Returns True only when all of the following hold: ``n < 2048``,
    ``k >= 2048``, the device is CUDA, ``VLLM_MARLIN_USE_ATOMIC_ADD`` is
    enabled, and the dtype is not bfloat16 on a GPU older than SM90.
    ``m`` is accepted but not consulted.
    """
    # the performance of atomicAdd is better than global reduce
    # only when m*n is small and k is large
    shape_is_favourable = n < 2048 and k >= 2048
    if not shape_is_favourable or device.type != "cuda":
        return False

    # disable atomicAdd reduce by default,
    # one can enable it with VLLM_MARLIN_USE_ATOMIC_ADD=1
    if not envs.VLLM_MARLIN_USE_ATOMIC_ADD:
        maybe_warn_marlin_atomic_add_env()
        return False

    # sm8x doesn't support atomicAdd + bfloat16 natively
    major = torch.cuda.get_device_capability(device)[0]
    if major < 9 and dtype == torch.bfloat16:
        maybe_warn_marlin_atomic_add(device, dtype)
        return False

    return True
def apply_rtn_marlin_linear(
    input: torch.Tensor,
    weight: torch.Tensor,
    weight_scale: torch.Tensor,
    workspace: torch.Tensor,
    quant_type: ScalarType,
    output_size_per_partition: int,
    input_size_per_partition: int,
    bias: torch.Tensor | None = None,
    use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
) -> torch.Tensor:
    """Run the Marlin GEMM without zero-points or group-index reordering.

    ``input`` is flattened to 2-D over its last dimension,
    ``ops.gptq_marlin_gemm`` is called with ``None`` for the zero-point,
    ``g_idx`` and permutation arguments, and the result is reshaped to
    ``input.shape[:-1] + (output_size_per_partition,)``.
    """
    x_2d = input.reshape(-1, input.shape[-1])
    result_shape = input.shape[:-1] + (output_size_per_partition,)

    atomic_add = should_use_atomic_add_reduce(
        m=x_2d.size(0),
        n=output_size_per_partition,
        k=x_2d.size(1),
        device=input.device,
        dtype=input.dtype,
    )

    # Positional order must match ops.gptq_marlin_gemm exactly.
    gemm_out = ops.gptq_marlin_gemm(
        x_2d,
        None,
        weight,
        bias,
        weight_scale,
        None,
        None,
        None,
        None,
        workspace,
        quant_type,
        size_m=x_2d.shape[0],
        size_n=output_size_per_partition,
        size_k=input_size_per_partition,
        use_atomic_add=atomic_add,
        use_fp32_reduce=use_fp32_reduce,
        is_zp_float=False,
    )

    return gemm_out.reshape(result_shape)
def nvfp4_marlin_process_global_scale(global_scale):
    """Rescale the NVFP4 global scale by a dtype-dependent power of two.

    The factor is ``2 ** (exponent_bias - 7)`` where
    ``exponent_bias = 2 ** (target_exponent - 1) - 2 ** (fp4_exponent - 1)``,
    with ``fp4_exponent = 2`` and ``target_exponent`` equal to 5 for
    ``torch.half`` and 8 for ``torch.bfloat16``.

    Args:
        global_scale: Tensor of dtype ``torch.half`` or ``torch.bfloat16``.

    Returns:
        ``global_scale`` multiplied by the factor above.

    Raises:
        AssertionError: for any other dtype. The check is explicit rather
            than an ``assert`` statement so it still fires under ``-O``
            instead of failing later on an unbound local.
    """
    fp4_exponent = 2
    if global_scale.dtype == torch.half:
        target_exponent = 5
    elif global_scale.dtype == torch.bfloat16:
        target_exponent = 8
    else:
        raise AssertionError(
            "global_scale must be torch.half or torch.bfloat16, "
            f"got {global_scale.dtype}"
        )
    # exponent_bias_fp16 = 2 ** 4 - 2 ** 1 = 14
    # exponent_bias_bf16 = 2 ** 7 - 2 ** 1 = 126
    exponent_bias = 2 ** (target_exponent - 1) - 2 ** (fp4_exponent - 1)
    return global_scale * (2.0 ** (exponent_bias - 7))
def prepare_fp4_layer_for_marlin(layer: torch.nn.Module) -> None:
    """Convert an FP4-quantized linear layer in place to the Marlin layout.

    Replaces ``layer.weight`` and ``layer.weight_scale`` (plus
    ``layer.weight_scale_2`` when present, and ``layer.bias`` when present
    and not None) with repacked/permuted non-trainable parameters, and
    attaches a freshly allocated ``layer.workspace``.

    The layer is treated as NVFP4 when it has a ``weight_scale_2``
    attribute (group size 16); otherwise the MXFP4 path is taken (group
    size 32).

    Raises:
        AssertionError: if ``layer.weight`` or ``layer.bias`` does not have
            the expected shape.
    """
    logger.warning_once(
        "Your GPU does not have native support for FP4 computation but "
        "FP4 quantization is being used. Weight-only FP4 compression will "
        "be used leveraging the Marlin kernel. This may degrade "
        "performance for compute-heavy workloads."
    )

    # Presence of the second-level (global) scale selects the NVFP4 path.
    is_nvfp4 = hasattr(layer, "weight_scale_2")
    group_size = 16 if is_nvfp4 else 32

    part_size_n = layer.output_size_per_partition
    part_size_k = layer.input_size_per_partition
    param_dtype = layer.params_dtype

    # The K dimension is stored at half width
    # (presumably two 4-bit values per element; confirm against the loader).
    assert layer.weight.shape == (part_size_n, part_size_k // 2)

    device = layer.weight.device

    # WORKSPACE
    layer.workspace = marlin_make_workspace_new(device)

    # WEIGHT
    # Repack weights to marlin format
    # An empty perm is passed to the repack op (no act-order permutation here).
    perm = torch.empty(0, dtype=torch.int, device=device)
    qweight = layer.weight.view(torch.int32).T.contiguous()

    marlin_qweight = ops.gptq_marlin_repack(
        b_q_weight=qweight,
        perm=perm,
        size_k=part_size_k,
        size_n=part_size_n,
        num_bits=4,
    )
    layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)

    # WEIGHT SCALES
    # Permute scales
    weight_scale = layer.weight_scale.T.contiguous()

    if not is_nvfp4:
        # Reinterpret the stored bytes as e8m0 before converting to param_dtype.
        weight_scale = weight_scale.view(torch.float8_e8m0fnu)

    weight_scale = weight_scale.to(param_dtype)
    weight_scale = marlin_permute_scales(
        s=weight_scale, size_k=part_size_k, size_n=part_size_n, group_size=group_size
    )

    if is_nvfp4:
        weight_scale = nvfp4_marlin_process_scales(weight_scale)
        layer.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)

        weight_scale_2 = layer.weight_scale_2.to(param_dtype)
        weight_scale_2 = nvfp4_marlin_process_global_scale(weight_scale_2)
        layer.weight_scale_2 = torch.nn.Parameter(weight_scale_2, requires_grad=False)
    else:
        weight_scale = mxfp4_marlin_process_scales(weight_scale)
        layer.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)

    if hasattr(layer, "bias") and layer.bias is not None:
        assert layer.bias.shape == (part_size_n,)
        bias = marlin_permute_bias(layer.bias)
        layer.bias = torch.nn.Parameter(bias, requires_grad=False)

    return
def rand_marlin_weight_nvfp4_like(weight, group_size):
    """Generate random NVFP4-style packed weights plus a dequantized reference.

    Scales are derived from ``weight``; the packed 4-bit payload itself is
    random and independent of ``weight``'s values.

    Args:
        weight: 2-D tensor of shape ``(size_n, size_k)``; supplies the shape,
            device, dtype and the magnitudes used to derive scales.
        group_size: Number of consecutive K elements sharing one scale;
            must be positive.

    Returns:
        ``(weight_ref.T, marlin_qweight, marlin_scales, global_scale)``: the
        transposed reference weights in ``weight.dtype``, the Marlin-repacked
        weights, the processed per-group scales and the processed global
        scale.
    """
    assert group_size > 0
    size_n, size_k = weight.shape
    device = weight.device

    # Per-group max |w| divided by 6, then a global scale so that the
    # largest per-group scale maps to 448 before the cast to float8_e4m3fn.
    # (6 and 448 are presumably the max magnitudes of FP4-E2M1 and
    # FP8-E4M3 respectively; confirm.)
    scales = weight.view(size_n, -1, group_size).abs().max(-1)[0] / 6
    global_scale = scales.max() / 448
    scales = (scales / global_scale).to(torch.float8_e4m3fn)

    # One random byte per pair of K elements.
    fp4_weight = torch.randint(
        0, 256, (size_n, size_k // 2), dtype=torch.uint8, device=weight.device
    )
    # High nibble: keep bit 7, move bits 6..4 down by two, then reinterpret
    # the byte as float8_e4m3fn and multiply by 2**6.
    # NOTE(review): the 2**6 factor looks like exponent-bias compensation
    # between the two formats; confirm.
    fp4_weight_part_1 = (fp4_weight & 0b10000000) | ((fp4_weight & 0b01110000) >> 2)
    fp4_weight_part_1 = fp4_weight_part_1.view(torch.float8_e4m3fn)
    fp4_weight_part_1 = fp4_weight_part_1.to(weight.dtype) * (2**6)

    # Low nibble: shift it into the high-nibble position and decode the same way.
    fp4_weight2 = fp4_weight << 4
    fp4_weight_part_2 = (fp4_weight2 & 0b10000000) | ((fp4_weight2 & 0b01110000) >> 2)
    fp4_weight_part_2 = fp4_weight_part_2.view(torch.float8_e4m3fn)
    fp4_weight_part_2 = fp4_weight_part_2.to(weight.dtype) * (2**6)

    # Interleave so the low-nibble value precedes the high-nibble value
    # for each byte along K.
    weight_ref = torch.cat(
        [fp4_weight_part_2.unsqueeze(2), fp4_weight_part_1.unsqueeze(2)], 2
    ).view(size_n, size_k)
    # Apply the global scale and the per-group scales (each scale repeated
    # group_size times along K).
    weight_ref = (
        weight_ref
        * global_scale.to(weight.dtype)
        * scales.repeat_interleave(group_size, 1).to(weight.dtype)
    )

    marlin_qweight = ops.gptq_marlin_repack(
        b_q_weight=fp4_weight.view(torch.int32).T.contiguous(),
        perm=torch.empty(0, dtype=torch.int, device=device),
        size_k=size_k,
        size_n=size_n,
        num_bits=4,
    )

    marlin_scales = marlin_permute_scales(
        s=scales.T.to(weight.dtype), size_k=size_k, size_n=size_n, group_size=group_size
    )
    marlin_scales = nvfp4_marlin_process_scales(marlin_scales)

    global_scale = nvfp4_marlin_process_global_scale(global_scale)

    return weight_ref.T, marlin_qweight, marlin_scales, global_scale
((fp4_weight2 & 0b01110000) >> 2) + fp4_weight_part_2 = fp4_weight_part_2.view(torch.float8_e4m3fn) + fp4_weight_part_2 = fp4_weight_part_2.to(weight.dtype) * (2**6) + + weight_ref = torch.cat( + [fp4_weight_part_2.unsqueeze(2), fp4_weight_part_1.unsqueeze(2)], 2 + ).view(size_n, size_k) + weight_ref = weight_ref * scales.repeat_interleave(group_size, 1).to(weight.dtype) + + marlin_qweight = ops.gptq_marlin_repack( + b_q_weight=fp4_weight.view(torch.int32).T.contiguous(), + perm=torch.empty(0, dtype=torch.int, device=device), + size_k=size_k, + size_n=size_n, + num_bits=4, + ) + + marlin_scales = marlin_permute_scales( + s=scales.T.to(weight.dtype), size_k=size_k, size_n=size_n, group_size=group_size + ) + + marlin_scales = mxfp4_marlin_process_scales(marlin_scales) + + return weight_ref.T, marlin_qweight, marlin_scales.to(torch.float8_e8m0fnu) diff --git a/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/model_executor/layers/quantization/utils/marlin_utils_fp8.py new file mode 100644 index 0000000..8c96848 --- /dev/null +++ b/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -0,0 +1,351 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import torch + +import vllm._custom_ops as ops +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.utils.marlin_utils import ( + USE_FP32_REDUCE_DEFAULT, + marlin_make_workspace_new, + marlin_permute_bias, + marlin_permute_scales, + should_use_atomic_add_reduce, +) +from vllm.platforms import current_platform +from vllm.scalar_type import scalar_types + +logger = init_logger(__name__) + + +def is_fp8_marlin_supported(): + return current_platform.has_device_capability(80) + + +def fp8_fused_exponent_bias_into_scales(scales): + fp8_exponent = 4 + if scales.dtype == torch.half: + target_exponent = 5 + elif scales.dtype == torch.bfloat16: + target_exponent = 8 + # exponent_bias_fp16 = 2 ** 4 - 2 ** 3 = 8 + # 
exponent_bias_bf16 = 2 ** 7 - 2 ** 3 = 120 + exponent_bias = 2 ** (target_exponent - 1) - 2 ** (fp8_exponent - 1) + s = torch.ones_like(scales) * 2 + s = s**exponent_bias + return scales * s + + +def apply_fp8_marlin_linear( + input: torch.Tensor, + weight: torch.Tensor, + weight_scale: torch.Tensor, + workspace: torch.Tensor, + size_n: int, + size_k: int, + bias: torch.Tensor | None, + use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT, +) -> torch.Tensor: + # For GPUs that lack FP8 hardware support, we can leverage the + # Marlin kernel for fast weight-only FP8 quantization + + reshaped_x = input.reshape(-1, input.shape[-1]) + out_shape = input.shape[:-1] + (size_n,) + + use_atomic_add = should_use_atomic_add_reduce( + m=reshaped_x.size(0), n=size_n, k=size_k, device=input.device, dtype=input.dtype + ) + + output = ops.gptq_marlin_gemm( + a=reshaped_x, + c=None, + b_q_weight=weight, + b_bias=bias, + b_scales=weight_scale, + global_scale=None, + b_zeros=None, + g_idx=None, + perm=None, + workspace=workspace, + b_q_type=scalar_types.float8_e4m3fn, + size_m=reshaped_x.size(0), + size_n=size_n, + size_k=size_k, + use_atomic_add=use_atomic_add, + use_fp32_reduce=use_fp32_reduce, + ) + + return output.reshape(out_shape) + + +def prepare_fp8_layer_for_marlin( + layer: torch.nn.Module, size_k_first: bool = True +) -> None: + logger.warning_once( + "Your GPU does not have native support for FP8 computation but " + "FP8 quantization is being used. Weight-only FP8 compression will " + "be used leveraging the Marlin kernel. This may degrade " + "performance for compute-heavy workloads." 
+ ) + + part_size_n = layer.output_size_per_partition + part_size_k = layer.input_size_per_partition + weight_block_size = getattr(layer, "weight_block_size", None) + + if size_k_first: + assert layer.weight.shape == (part_size_k, part_size_n) + else: + assert layer.weight.shape == (part_size_n, part_size_k) + + device = layer.weight.device + + # WORKSPACE + layer.workspace = marlin_make_workspace_new(device) + + # WEIGHT + # Repack weights to marlin format + perm = torch.empty(0, dtype=torch.int, device=device) + qweight = pack_fp8_to_int32(layer.weight, size_k_first) + if not size_k_first: + qweight = qweight.T.contiguous() + + marlin_qweight = ops.gptq_marlin_repack( + b_q_weight=qweight, + perm=perm, + size_k=part_size_k, + size_n=part_size_n, + num_bits=8, + ) + layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False) + + # WEIGHT SCALES + # Permute scales + if "weight_scale" in dir(layer): + scales = layer.weight_scale.to(layer.orig_dtype) + elif "weight_scale_inv" in dir(layer): + scales = layer.weight_scale_inv.to(layer.orig_dtype) + del layer.weight_scale_inv + + group_size = -1 if weight_block_size is None else weight_block_size[1] + + # marlin kernel only support channel-wise and group-wise quantization + # we need to convert the scales + if weight_block_size is None: + if scales.nelement() == 1: + # tensor-wise quantization -> channel-wise quantization + # (1, 1) =>(repeat)=> (1, size_n) + scales = scales.view(1, 1).repeat_interleave(part_size_n, 1) + elif scales.nelement() > 1 and scales.nelement() != part_size_n: + assert part_size_n % scales.nelement() == 0 + s_size = scales.nelement() + # tensor-wise quantization (for gate-up proj) + # -> channel-wise quantization + # (1, s_size) =>(repeat)=> (1, size_n) + scales = scales.view(1, s_size) + scales = scales.repeat_interleave(part_size_n // s_size, 1) + else: + # channel-wise quantization + # (1, size_n) + scales = scales.view(1, part_size_n) + else: + # block-wise quantization -> 
group-wise quantization + # (size_k // block_size[1], ceil(size_n / block_size[0])) + # =>(repeat)=> (size_k // block_size[1], size_n) + if not size_k_first: + scales = scales.T.contiguous() + block_n = weight_block_size[0] + scales = scales.repeat_interleave(block_n, 1) + # size_n may not divisible by block_size[0] + scales = scales[:, :part_size_n] + + marlin_scales = marlin_permute_scales( + s=scales, size_k=part_size_k, size_n=part_size_n, group_size=group_size + ) + marlin_scales = fp8_fused_exponent_bias_into_scales(marlin_scales) + layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False) + + if hasattr(layer, "bias") and layer.bias is not None: + assert layer.bias.shape == (part_size_n,) + bias = marlin_permute_bias(layer.bias) + layer.bias = torch.nn.Parameter(bias, requires_grad=False) + + +def prepare_moe_fp8_layer_for_marlin( + layer: torch.nn.Module, size_k_first: bool = True +) -> None: + logger.warning_once( + "Your GPU does not have native support for FP8 computation but " + "FP8 quantization is being used. Weight-only FP8 compression will " + "be used leveraging the Marlin kernel. This may degrade " + "performance for compute-heavy workloads." 
+ ) + + e = layer.num_experts + k = layer.hidden_size + n = layer.intermediate_size_per_partition + weight_block_size = getattr(layer, "weight_block_size", None) + + # WORKSPACE + device = layer.w13_weight.device + layer.workspace = marlin_make_workspace_new(device, 4) + perm = torch.empty(0, dtype=torch.int, device=device) + + # WEIGHT + # Repack weights to marlin format + for name in ["w13_weight", "w2_weight"]: + weight = getattr(layer, name) + tensor_list = [] + if "w13" in name: + size_n, size_k = n * 2, k + else: + size_n, size_k = k, n + + if size_k_first: + assert weight.shape == (e, size_k, size_n) + else: + assert weight.shape == (e, size_n, size_k) + + for i in range(e): + qweight = pack_fp8_to_int32(weight[i], size_k_first) + if not size_k_first: + qweight = qweight.T.contiguous() + + marlin_qweight = ops.gptq_marlin_repack( + b_q_weight=qweight, perm=perm, size_k=size_k, size_n=size_n, num_bits=8 + ) + tensor_list.append(marlin_qweight) + + weight = torch.cat([x.unsqueeze(0) for x in tensor_list], 0) + weight = torch.nn.Parameter(weight, requires_grad=False) + + setattr(layer, name, weight) + + # WEIGHT SCALES + # Permute scales + group_size = -1 if weight_block_size is None else weight_block_size[1] + + for name in ["w13", "w2"]: + if name + "_weight_scale" in dir(layer): + new_name = name + "_weight_scale" + scales = getattr(layer, new_name).to(layer.orig_dtype) + delattr(layer, new_name) + elif name + "_weight_scale_inv" in dir(layer): + new_name = name + "_weight_scale_inv" + scales = getattr(layer, new_name).to(layer.orig_dtype) + delattr(layer, new_name) + + tensor_list = [] + if "w13" in name: + size_n, size_k = n * 2, k + else: + size_n, size_k = k, n + + # marlin kernel only support channel-wise and group-wise quantization + # we need to convert the scales + if weight_block_size is None: + if scales.nelement() == e: + # tensor-wise quantization -> channel-wise quantization + # (e, 1, 1) =>(repeat)=> (e, 1, size_n) + scales = scales.view(e, 1, 
1).repeat_interleave(size_n, 2) + elif scales.nelement() > e and scales.nelement() != e * size_n: + assert (e * size_n) % scales.nelement() == 0 + s_size = scales.nelement() // e + # tensor-wise quantization (for gate-up proj) + # -> channel-wise quantization + # (e, 1, s_size) =>(repeat)=> (e, 1, size_n) + scales = scales.view(e, 1, s_size) + scales = scales.repeat_interleave(size_n // s_size, 2) + else: + # channel-wise quantization + # (e, 1, size_n) + scales = scales.view(e, 1, size_n) + else: + # block-wise quantization -> group-wise quantization + # (e, size_k // block_size[1], ceil(size_n / block_size[0])) + # =>(repeat)=> (e, size_k // block_size[1], size_n) + if not size_k_first: + scales = scales.permute(0, 2, 1) + block_n = weight_block_size[0] + scales = scales.repeat_interleave(block_n, 2) + # size_n may not divisible by block_size[0] + scales = scales[..., :size_n].contiguous() + + for i in range(e): + marlin_scales = marlin_permute_scales( + s=scales[i], size_k=size_k, size_n=size_n, group_size=group_size + ) + tensor_list.append(marlin_scales) + + scales = torch.cat([x.unsqueeze(0) for x in tensor_list], 0) + scales = fp8_fused_exponent_bias_into_scales(scales) + scales = torch.nn.Parameter(scales, requires_grad=False) + + setattr(layer, name + "_weight_scale", scales) + + # BIAS + # Permute bias + for name in ["w13_bias", "w2_bias"]: + if not hasattr(layer, name): + continue + bias = getattr(layer, name).to(layer.orig_dtype) + + tensor_list = [] + for i in range(e): + expert_bias = bias[i] + + tensor_list.append(marlin_permute_bias(expert_bias)) + + bias = torch.cat([x.unsqueeze(0) for x in tensor_list], 0) + bias = torch.nn.Parameter(bias, requires_grad=False) + setattr(layer, name, bias) + + +def pack_fp8_to_int32( + fp8_tensor: torch.Tensor, size_k_first: bool = True +) -> torch.Tensor: + """ + Repack FP8 weights to gptq format (packed int32 elements) + """ + assert fp8_tensor.dtype == torch.float8_e4m3fn + assert fp8_tensor.ndim == 2 + + 
fp8_tensor = fp8_tensor.T if size_k_first else fp8_tensor + fp8_tensor = fp8_tensor.contiguous() + # fp8_tensor is contiguous and have shape (N, K) now + # with `.view(torch.int32)`, it become (N, K // 4) + int32_tensor = fp8_tensor.view(torch.int32) + return int32_tensor.T.contiguous() if size_k_first else int32_tensor + + +def marlin_quant_fp8_torch(weight, group_size): + size_n, size_k = weight.shape + device = weight.device + + if group_size != -1: + scales = weight.view(size_n, -1, group_size).abs().max(-1)[0] / 448 + repeated_scales = scales.repeat_interleave(group_size, 1) + fp8_weight = (weight / repeated_scales).to(torch.float8_e4m3fn) + weight_ref = fp8_weight.to(weight.dtype) * repeated_scales + else: + scales = weight.view(size_n, 1, group_size).abs().max(-1)[0] / 448 + repeated_scales = scales.repeat_interleave(size_k, 1) + fp8_weight = (weight / repeated_scales).to(torch.float8_e4m3fn) + weight_ref = fp8_weight.to(weight.dtype) * repeated_scales + + packed_weight = pack_fp8_to_int32(fp8_weight, False).T.contiguous() + marlin_qweight = ops.gptq_marlin_repack( + b_q_weight=packed_weight, + perm=torch.empty(0, dtype=torch.int, device=device), + size_k=size_k, + size_n=size_n, + num_bits=8, + ) + + marlin_scales = marlin_permute_scales( + s=scales.T, size_k=size_k, size_n=size_n, group_size=group_size + ) + + marlin_scales = fp8_fused_exponent_bias_into_scales(marlin_scales) + + return weight_ref.T, marlin_qweight, marlin_scales diff --git a/model_executor/layers/quantization/utils/marlin_utils_test.py b/model_executor/layers/quantization/utils/marlin_utils_test.py new file mode 100644 index 0000000..89756c4 --- /dev/null +++ b/model_executor/layers/quantization/utils/marlin_utils_test.py @@ -0,0 +1,161 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Utility functions used for tests and benchmarks""" + +import numpy as np +import torch + +from vllm.scalar_type import ScalarType + +from 
.marlin_utils import GPTQ_MARLIN_TILE, marlin_permute_scales, marlin_zero_points +from .quant_utils import ( + get_pack_factor, + gptq_quantize_weights, + quantize_weights, + sort_weights, +) + + +class MarlinWorkspace: + def __init__(self, out_features, min_thread_n, max_parallel): + assert out_features % min_thread_n == 0, ( + "out_features = {} is indivisible by min_thread_n = {}".format( + out_features, min_thread_n + ) + ) + + max_workspace_size = (out_features // min_thread_n) * max_parallel + + self.scratch = torch.zeros(max_workspace_size, dtype=torch.int, device="cuda") + + +def marlin_permute_weights(q_w, size_k, size_n, perm, tile=GPTQ_MARLIN_TILE): + assert q_w.shape == (size_k, size_n) + assert size_k % tile == 0, f"size_k = {size_k}, tile = {tile}" + assert size_n % tile == 0, f"size_k = {size_n}, tile = {tile}" + + # Permute weights to 16x64 marlin tiles + q_w = q_w.reshape((size_k // tile, tile, size_n // tile, tile)) + q_w = q_w.permute((0, 2, 1, 3)) + q_w = q_w.reshape((size_k // tile, size_n * tile)) + + q_w = q_w.reshape((-1, perm.numel()))[:, perm].reshape(q_w.shape) + + return q_w + + +def marlin_weights(q_w, size_k, size_n, num_bits, perm): + # Permute + q_w = marlin_permute_weights(q_w, size_k, size_n, perm) + + # Pack + pack_factor = get_pack_factor(num_bits) + orig_device = q_w.device + + q_w = q_w.cpu().numpy().astype(np.uint32) + + q_packed = np.zeros((q_w.shape[0], q_w.shape[1] // pack_factor), dtype=np.uint32) + for i in range(pack_factor): + q_packed |= q_w[:, i::pack_factor] << num_bits * i + + q_packed = torch.from_numpy(q_packed.astype(np.int32)).to(orig_device) + + return q_packed + + +def get_weight_perm(num_bits: int): + perm_list: list[int] = [] + for i in range(32): + perm1: list[int] = [] + col = i // 4 + for block in [0, 1]: + for row in [ + 2 * (i % 4), + 2 * (i % 4) + 1, + 2 * (i % 4 + 4), + 2 * (i % 4 + 4) + 1, + ]: + perm1.append(16 * row + col + 8 * block) + for j in range(4): + perm_list.extend([p + 256 * j for p in 
perm1]) + + perm = np.array(perm_list) + + if num_bits == 4: + interleave = np.array([0, 2, 4, 6, 1, 3, 5, 7]) + elif num_bits == 8: + interleave = np.array([0, 2, 1, 3]) + else: + raise Exception("num_bits must be 4 or 8, got {}".format(num_bits)) + + perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel() + perm = torch.from_numpy(perm) + return perm + + +def marlin_quantize( + w: torch.Tensor, + quant_type: ScalarType, + group_size: int, + act_order: bool, + test_perm: torch.Tensor | None = None, +): + size_k, size_n = w.shape + num_bits = quant_type.size_bits + + # Normalize group_size + if group_size == -1: + group_size = size_k + assert group_size <= size_k + + # Quantize (and apply act_order if provided) + w_ref, q_w, s, g_idx, rand_perm = gptq_quantize_weights( + w, quant_type, group_size, act_order, test_perm + ) + + # For act_order, sort the "weights" and "g_idx" so that group ids are + # increasing + sort_indices = torch.empty(0, dtype=torch.int, device=w.device) + if act_order: + q_w, g_idx, sort_indices = sort_weights(q_w, g_idx) + + # Reformat to marlin + weight_perm = get_weight_perm(num_bits) + marlin_q_w = marlin_weights(q_w, size_k, size_n, num_bits, weight_perm) + marlin_s = marlin_permute_scales(s, size_k, size_n, group_size) + + # Create result + res_list = [w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm] + for i in range(len(res_list)): + res_list[i] = res_list[i].to(w.device) + + return res_list + + +def awq_marlin_quantize(w: torch.Tensor, quant_type: ScalarType, group_size: int): + size_k, size_n = w.shape + + # Normalize group_size + if group_size == -1: + group_size = size_k + assert group_size <= size_k + + # Detect num groups + assert size_k % group_size == 0 + num_groups = size_k // group_size + + # Quantize with zp + w_ref, q_w, s, zp = quantize_weights(w, quant_type, group_size, zero_points=True) + + # Reformat to marlin + weight_perm = get_weight_perm(quant_type.size_bits) + marlin_q_w = marlin_weights(q_w, 
size_k, size_n, quant_type.size_bits, weight_perm) + marlin_s = marlin_permute_scales(s, size_k, size_n, group_size) + marlin_zp = marlin_zero_points(zp, num_groups, size_n, quant_type.size_bits) + + # Create result + res_list = [w_ref, marlin_q_w, marlin_s, marlin_zp] + for i in range(len(res_list)): + res_list[i] = res_list[i].to(w.device) + + return res_list diff --git a/model_executor/layers/quantization/utils/marlin_utils_test_24.py b/model_executor/layers/quantization/utils/marlin_utils_test_24.py new file mode 100644 index 0000000..90011f1 --- /dev/null +++ b/model_executor/layers/quantization/utils/marlin_utils_test_24.py @@ -0,0 +1,467 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Utility functions used for tests and benchmarks""" + +import random + +import numpy +import torch + +from vllm.scalar_type import ScalarType + +from .marlin_utils_test import marlin_weights +from .quant_utils import gptq_quantize_weights + + +# This is PyTorch implementation of main part of reorder_meta() +# function, from tools/util/include/cutlass/util/host_reorder.h file +# of CUTLASS source tree. Furthermore, CUTLASS template for sparse +# GEMM decides upon layout of this matrix, and at the moment for the +# sparse GEMM executed on tensor cores, this is layout described by +# ColumnMajorInterleaved<2> data structure, in +# include/cutlass/layout/matrix.h of CUTLASS source tree. The +# reordering of meta matrix into meta_reordered matrix calculated +# according to these segments of CUTLASS code is re-implemented here. +# Note that this calculation produces offsets for scattering metadata +# matrix elements into reordered metadata matrix elements (or, +# equivalently, for gathering reordered metadata matrix element back +# into metadata matrix elements). 
+def _calculate_meta_reordering_scatter_offsets(m, meta_ncols, meta_dtype, device): + dst_rows = torch.arange(0, m, device=device)[:, None].repeat(1, meta_ncols) + dst_cols = torch.arange(0, meta_ncols, device=device).repeat(m, 1) + + # Reorder the rows, then swizzle the 2x2 blocks. + group_x = 64 + group_y = 32 if meta_dtype.itemsize == 2 else 16 + + dst_rows = ( + dst_rows // group_x * group_x + + (dst_rows % 2) * 2 + + (dst_rows % 8) // 4 + + ((dst_rows % group_y) % 4) // 2 * 32 + + ((dst_rows % group_x) // 8) * 4 + ) + + topright = ((dst_rows % 2 == 0) & (dst_cols % 2 == 1)).to(torch.int8) + bottomleft = ((dst_rows % 2 == 1) & (dst_cols % 2 == 0)).to(torch.int8) + dst_rows += topright - bottomleft + dst_cols -= topright - bottomleft + + # Assumed that meta tensor is to be stored in CUTLASS + # InterleavedColumnMajor layout, and reverse engineered + # corresponding code to store values into this tensor. + interleave = 2 + cols_maj = dst_cols // interleave + cols_min = dst_cols % interleave + return (cols_maj * m * interleave + dst_rows * interleave + cols_min).view(-1) + + +# This function converts dense matrix into sparse semi-structured +# representation, producing "compressed" matrix, in the layout used by +# CUTLASS backend, and corresponding metadata matrix. 
def sparse_semi_structured_from_dense_cutlass(dense):
    """Compress a dense 2-D tensor into values plus reordered metadata.

    Elements of each row are processed in groups of four (in pairs for
    ``torch.float`` input).  For every group, index metadata is computed
    from the positions of the non-zero elements and the selected elements
    are gathered into a tensor with half as many columns.

    Args:
        dense: 2-D tensor of dtype int8, half, bfloat16, float or int32.

    Returns:
        ``(sparse, meta_reordered)`` where ``sparse`` has shape
        ``(m, k // 2)`` and ``meta_reordered`` has shape ``(m, meta_ncols)``.

    Raises:
        RuntimeError: if ``dense`` is not 2-D, has an unsupported dtype, or
            its row/column counts are not divisible as required.
    """
    if dense.dim() != 2:
        raise RuntimeError(
            f"Expected 2-dimensional dense tensor, got {dense.dim()}-dimensional tensor"  # noqa: E501
        )

    m, k = dense.shape
    device = dense.device

    # The metadata element type is selected from the dense element type.
    if dense.dtype == torch.int8:
        meta_dtype = torch.int32
    elif dense.dtype in [torch.half, torch.bfloat16, torch.float, torch.int32]:
        meta_dtype = torch.int16
    else:
        raise RuntimeError(f"Invalid datatype {dense.dtype} of dense matrix")

    # Each 4-bit group ("quadbit") of a metadata element describes one
    # group of dense elements.
    quadbits_per_meta_elem = meta_dtype.itemsize * 8 // 4
    if quadbits_per_meta_elem not in (4, 8):
        raise RuntimeError("Invalid number of elements per meta element calculated")

    if meta_dtype == torch.int32 and m % 16 != 0:
        raise RuntimeError(
            f"Number of rows of dense matrix {m} must be divisible by 16"
        )
    if meta_dtype != torch.int32 and m % 32 != 0:
        raise RuntimeError(
            f"Number of rows of dense matrix {m} must be divisible by 32"
        )
    if k % (4 * quadbits_per_meta_elem) != 0:
        raise RuntimeError(
            f"Number of columns of dense matrix {k} must be divisible by {4 * quadbits_per_meta_elem}"  # noqa: E501
        )

    # Non-zero flags per group.  For torch.float, each pair element stands
    # in for two consecutive slots of a quadruple.
    is_fp32 = dense.dtype == torch.float
    ksparse = 2 if is_fp32 else 4
    grouped = dense.view(-1, k // ksparse, ksparse)
    nonzero = grouped != 0
    if is_fp32:
        first, second = nonzero.unbind(-1)
        m0 = m1 = first
        m3 = second
    else:
        m0, m1, _, m3 = nonzero.unbind(-1)
    meta_ncols = k // (ksparse * quadbits_per_meta_elem)

    # Encoding quadruples of True/False values as follows:
    #     [True,  True,  False, False] -> 0b0100
    #     [True,  False, True,  False] -> 0b1000
    #     [False, True,  True,  False] -> 0b1001
    #     [True,  False, False, True ] -> 0b1100
    #     [False, True,  False, True ] -> 0b1101
    #     [False, False, True,  True ] -> 0b1110
    # Thus, the lower two bits of the encoding are the index of the True
    # value at the lowest index in the quadruple, and the higher two bits
    # are the index of the other True value in the quadruple.
    # In case there are fewer than two True values, then False value(s) at
    # some index or indices are considered True for the encoding.  In case
    # there are more than two True values, then the excess True value(s)
    # at some indices are considered False for the encoding.  The exact
    # encodings used for these cases are as follows:
    #     [False, False, False, False] -> 0b1110
    #     [False, False, False, True ] -> 0b1110
    #     [False, False, True,  False] -> 0b1110
    #     [False, True,  False, False] -> 0b1001
    #     [False, True,  True,  True ] -> 0b1101
    #     [True,  False, False, False] -> 0b1000
    #     [True,  False, True,  True ] -> 0b1100
    #     [True,  True,  False, True ] -> 0b0100
    #     [True,  True,  True,  False] -> 0b0100
    #     [True,  True,  True,  True ] -> 0b0100
    # These particular encodings are chosen, with the help of the Espresso
    # logic minimizer software, to minimize the Boolean functions that
    # translate non-zero flags into encoding bits.  Note also that the
    # possible choices for the first and last of these encodings were
    # limited to (0b0100, 0b1110), in order to produce valid encodings
    # for the 1:2 sparsity case.

    both_leading = m0 & m1
    only_second = ~m0 & m1
    neither_leading = ~m0 & ~m1
    bit0 = only_second
    bit1 = neither_leading
    bit2 = both_leading | neither_leading | m3
    bit3 = only_second | ~m1
    idxs0 = bit0 | (bit1.to(torch.int64) << 1)
    idxs1 = bit2 | (bit3.to(torch.int64) << 1)

    # Gather the kept values.
    if is_fp32:
        sparse = grouped.gather(-1, idxs0.unsqueeze(-1) // 2).view(m, k // 2)
    else:
        kept_first = grouped.gather(-1, idxs0.unsqueeze(-1))
        kept_second = grouped.gather(-1, idxs1.unsqueeze(-1))
        sparse = torch.stack((kept_first, kept_second), dim=-1).view(m, k // 2)

    # Pack the 4-bit codes, lowest nibble first, into metadata elements.
    meta_4 = idxs0 | (idxs1 << 2)
    meta_n = meta_4.view((-1, meta_ncols, quadbits_per_meta_elem)).to(meta_dtype)
    meta = meta_n[:, :, 0]
    for slot in range(1, quadbits_per_meta_elem):
        meta = meta | (meta_n[:, :, slot] << (4 * slot))

    # Reorder meta tensor elements.
    meta_reordered = meta.new_empty((m * meta_ncols,))
    meta_offsets = _calculate_meta_reordering_scatter_offsets(
        m, meta_ncols, meta_dtype, device
    )
    meta_reordered.scatter_(0, meta_offsets, meta.view(-1))

    return (sparse, meta_reordered.view(m, meta_ncols))


# This function performs reverse of the function above - it
# reconstructs dense matrix from a pair of "compressed" matrix, given
# in the layout used by CUTLASS backend, and accompanying metadata
# matrix.
+def sparse_semi_structured_to_dense_cutlass(sparse, meta_reordered): + if sparse.dim() != 2: + raise RuntimeError( + f"Expected 2-dimensional sparse tensor, got {sparse.dim()}-dimensional tensor" # noqa: E501 + ) + + m, k = sparse.shape + device = sparse.device + + if meta_reordered.dim() != 2: + raise RuntimeError( + f"Expected 2-dimensional meta tensor, got {meta_reordered.dim()}-dimensional tensor" # noqa: E501 + ) + if meta_reordered.device != device: + raise RuntimeError( + f"Expected meta matrix to be on {device} device, got matrix on {meta_reordered.device} device" # noqa: E501 + ) + + meta_dtype = meta_reordered.dtype + if meta_dtype not in (torch.int16, torch.int32): + raise RuntimeError(f"Invalid datatype {meta_dtype} of meta matrix") + quadbits_per_meta_elem = meta_dtype.itemsize * 8 // 4 + + ksparse = 4 if sparse.dtype != torch.float else 2 + + meta_nrows, meta_ncols = meta_reordered.shape + if meta_nrows != m: + raise RuntimeError( + f"Number of rows of meta matrix {meta_nrows} must be equal to number of columns of spase matrix {m}" # noqa: E501 + ) + if meta_ncols * ksparse * quadbits_per_meta_elem != 2 * k: + raise RuntimeError( + f"Number of columns of sparse matrix {k} different from the {meta_ncols * ksparse * quadbits_per_meta_elem // 2}, " # noqa: E501 + "expected according to the number of columns of meta matrix" + ) + + # Undo meta tensor elements reordering. + meta_offsets = _calculate_meta_reordering_scatter_offsets( + m, meta_ncols, meta_dtype, device + ) + meta = torch.gather(meta_reordered.view(-1), 0, meta_offsets).view(m, meta_ncols) + + # Unpack sparse tensor back to original dense tensor, using + # information provided by meta tensor. Note that torch.float + # datatype is handled pretty much the same as + # torch.half/torch.bfloat16, as metadata for a pair of torch.float + # value is encoded as if underlying 8 bytes contain four + # torch.half/torch.bfloat16 values, where either first two or last + # two are zeros. 
+ meta_2 = torch.empty( + (m, meta_ncols, 2 * quadbits_per_meta_elem), + dtype=meta_dtype, + device=device, + ) + if quadbits_per_meta_elem == 4: + meta_2[:, :, 0] = meta & 0b11 + meta_2[:, :, 1] = (meta >> 2) & 0b11 + meta_2[:, :, 2] = (meta >> 4) & 0b11 + meta_2[:, :, 3] = (meta >> 6) & 0b11 + meta_2[:, :, 4] = (meta >> 8) & 0b11 + meta_2[:, :, 5] = (meta >> 10) & 0b11 + meta_2[:, :, 6] = (meta >> 12) & 0b11 + meta_2[:, :, 7] = (meta >> 14) & 0b11 + elif quadbits_per_meta_elem == 8: + meta_2[:, :, 0] = meta & 0b11 + meta_2[:, :, 1] = (meta >> 2) & 0b11 + meta_2[:, :, 2] = (meta >> 4) & 0b11 + meta_2[:, :, 3] = (meta >> 6) & 0b11 + meta_2[:, :, 4] = (meta >> 8) & 0b11 + meta_2[:, :, 5] = (meta >> 10) & 0b11 + meta_2[:, :, 6] = (meta >> 12) & 0b11 + meta_2[:, :, 7] = (meta >> 14) & 0b11 + meta_2[:, :, 8] = (meta >> 16) & 0b11 + meta_2[:, :, 9] = (meta >> 18) & 0b11 + meta_2[:, :, 10] = (meta >> 20) & 0b11 + meta_2[:, :, 11] = (meta >> 22) & 0b11 + meta_2[:, :, 12] = (meta >> 24) & 0b11 + meta_2[:, :, 13] = (meta >> 26) & 0b11 + meta_2[:, :, 14] = (meta >> 28) & 0b11 + meta_2[:, :, 15] = (meta >> 30) & 0b11 + + dense_offsets = meta_2.view(-1) + ( + torch.arange(0, 2 * m * k // ksparse, device=device) * 4 + ).view(-1, 1).repeat(1, 2).view(-1) + + dense = torch.zeros((m * 2 * k,), dtype=sparse.dtype, device=device) + if sparse.dtype != torch.float: + # dense.scatter_(0, dense_offsets, sparse.view(-1)) + dense.scatter_(0, dense_offsets, sparse.reshape(-1)) + else: + dense.view(torch.half).scatter_( + 0, dense_offsets, sparse.view(torch.half).view(-1) + ) + + return dense.view(m, 2 * k) + + +def mask_creator(tensor): + """ + Class for creating N:M sparsity masks. + Masks will be created using the N:M ratio, where for every block of + M weights, N will be pruned based on ranked weight value. Each mask + will correspond to the given tensor. 
+ + :param N: The number of weights in a group to keep + :param M: The size of a weight group + """ + N = 2 + M = 4 + + mask = None + # for i, tensor in enumerate(tensors): + if tensor.numel() % M != 0: + raise ValueError( + f"Tensor of size {tensor.shape} can't be evenly divided into {M} groups" + ) + + num_groups = tensor.numel() // M + + # N:M sparsity for linear layers + tensor_temp = tensor.detach().abs().reshape(num_groups, M) + index = torch.argsort(tensor_temp, dim=1)[:, : int(M - N)] + + w_b = torch.ones(tensor_temp.shape, device=tensor_temp.device) + mask = w_b.scatter_(dim=1, index=index, value=0).reshape(tensor.shape) + + return mask + + +def inject_24(w, size_k, size_n): + assert w.shape == (size_k, size_n) + + mask = mask_creator(w.t()).t().cuda().bool() + + return (mask * w).contiguous(), mask.contiguous() + + +def check_24(w, num_rows_to_sample=50, _verbose=False): + BLOCK_SIZE = 4 + MAX_NON_ZEROS = 2 + + w = w.t().contiguous() + + print("check_24: w.shape = {}".format(w.shape)) + + num_rows, num_cols = w.shape + sampled_row_idxs = random.choices(range(num_rows), k=num_rows_to_sample) + if _verbose: + print(f"Sampled row idxs = {sampled_row_idxs}") + + total_segments = 0 + non_24_segments = 0 + for i in sampled_row_idxs: + for j in range(0, num_cols - BLOCK_SIZE, BLOCK_SIZE): + total_segments += 1 + block = w[i, j : j + BLOCK_SIZE] + num_nonzero = torch.count_nonzero(block) + if num_nonzero > MAX_NON_ZEROS: + print("i = {} j = {} block = {}".format(i, j, block)) + non_24_segments += 1 + + print(f"{non_24_segments} / {total_segments} do not have 2:4 structure.") + + +def compress_quantized_24_weight(q_24, size_k, size_n, wtype: ScalarType): + assert q_24.shape == (size_k, size_n) + + # Remove bias to normalize over 0 + q_24_no_zp = q_24 - wtype.bias + + # Compress + q_24_no_zp = q_24_no_zp.t().contiguous() + q_24_no_zp_comp, meta = sparse_semi_structured_from_dense_cutlass(q_24_no_zp) + q_24_no_zp_comp = q_24_no_zp_comp.t().contiguous() + + # Restore 
bias + q_24_comp = q_24_no_zp_comp + wtype.bias + + # Resize meta to its actual shape (without moving any data) + meta = meta.resize_(meta.shape[1] // 2, meta.shape[0] * 2) + + return q_24_comp, meta + + +def get_scale_perms_24(): + scale_perm: list[int] = [] + for i in range(8): + scale_perm.extend([i * 8 + j for j in [0, 4, 1, 5, 2, 6, 3, 7]]) + scale_perm_single: list[int] = [] + for i in range(8): + scale_perm_single.extend([8 * i + j for j in [0, 1, 2, 3, 4, 5, 6, 7]]) + return scale_perm, scale_perm_single + + +def get_weight_perm_24(num_bits: int): + perm_list: list[int] = [] + for i in range(32): + perm1: list[int] = [] + col = i // 4 + col_o = col // 2 + for block in [0, 1]: + for row in [ + 2 * (i % 4), + 2 * (i % 4) + 1, + 2 * (i % 4 + 4), + 2 * (i % 4 + 4) + 1, + ]: + perm1.append(16 * row + col_o * 256 + 8 * (col % 2) + 4 * block) + for j in range(4): + perm_list.extend([p + 1 * j for p in perm1]) + perm = numpy.array(perm_list) + + if num_bits == 4: + interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7]) + elif num_bits == 8: + interleave = numpy.array([0, 2, 1, 3]) + else: + raise ValueError("num_bits must be 4 or 8, got {}".format(num_bits)) + + perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel() + perm = torch.from_numpy(perm) + return perm + + +def marlin_permute_scales_24( + s: torch.Tensor, size_k: int, size_n: int, group_size: int +) -> torch.Tensor: + scale_perm, scale_perm_single = get_scale_perms_24() + if group_size < size_k and group_size != -1: + s = s.reshape((-1, len(scale_perm)))[:, scale_perm] + else: + s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single] + s = s.reshape((-1, size_n)).contiguous() + + return s + + +def marlin_24_quantize( + w: torch.Tensor, + quant_type: ScalarType, + group_size: int, +): + size_k, size_n = w.shape + + # Normalize group_size + if group_size == -1: + group_size = size_k + assert group_size <= size_k + + # Inject 2:4 sparsity + w_24, mask_24 = inject_24(w, size_k, size_n) + + # 
Quantize + w_24_ref, q_w_24, s, g_idx, rand_perm = gptq_quantize_weights( + w_24, quant_type, group_size, act_order=False + ) + + # Compress quantized weight + q_w_24_comp, meta = compress_quantized_24_weight(q_w_24, size_k, size_n, quant_type) + size_k_comp = size_k // 2 + + # Reformat to marlin + weight_perm = get_weight_perm_24(quant_type.size_bits) + marlin_24_q_w_comp = marlin_weights( + q_w_24_comp, size_k_comp, size_n, quant_type.size_bits, weight_perm + ) + marlin_24_s = marlin_permute_scales_24(s, size_k, size_n, group_size) + + # Create result + res_list = [w_24_ref, marlin_24_q_w_comp, meta, marlin_24_s] + for i in range(len(res_list)): + res_list[i] = res_list[i].to(w.device) + + return res_list diff --git a/model_executor/layers/quantization/utils/mxfp4_utils.py b/model_executor/layers/quantization/utils/mxfp4_utils.py new file mode 100644 index 0000000..45ee3da --- /dev/null +++ b/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -0,0 +1,181 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable +from typing import Any + +import torch + +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.triton_utils import triton +from vllm.utils.torch_utils import direct_register_custom_op, is_torch_equal_or_newer + +logger = init_logger(__name__) + + +def _swizzle_mxfp4(quant_tensor, scale, num_warps): + """weight swizzle for mxfp4 moe, used for OAI mxfp4 kernel""" + import triton_kernels.matmul_ogs_details.opt_flags as opt_flags + from triton_kernels.numerics import InFlexData + from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor + from triton_kernels.tensor_details import layout + from triton_kernels.tensor_details.layout import StridedLayout + + value_layout_opts: dict[str, Any] = {} + scale_layout_opts: dict[str, Any] = {} + + if ( + current_platform.is_cuda() + and 
current_platform.is_device_capability(90) + and not is_torch_equal_or_newer("2.8.1") + ): + logger.warning_once( + "Mxfp4 on hopper is running on torch < 2.8.1, " + "this cause swizling to be disabled, which may " + "cause performance degradation. Please upgrade to torch nightly" + ) + value_layout = StridedLayout + scale_layout = StridedLayout + elif current_platform.is_rocm(): + from triton_kernels.tensor_details.layout import ( + GFX950MXScaleLayout, + StridedLayout, + ) + + from vllm.platforms.rocm import on_gfx950 + + value_layout = StridedLayout + scale_layout = GFX950MXScaleLayout if on_gfx950() else StridedLayout + else: + value_layout, value_layout_opts = layout.make_default_matmul_mxfp4_w_layout( + mx_axis=1 + ) + scale_layout, scale_layout_opts = ( + layout.make_default_matmul_mxfp4_w_scale_layout( + mx_axis=1, num_warps=num_warps + ) + ) + if current_platform.is_cuda() and current_platform.is_device_capability(100): + constraints = { + "is_persistent": True, + "epilogue_subtile": 1, + } + opt_flags.update_opt_flags_constraints(constraints) + # transpose the tensor so that the quantization axis is on dim1 + quant_tensor = quant_tensor.transpose(-2, -1) + scale = scale.transpose(-2, -1) + quant_tensor = convert_layout( + wrap_torch_tensor(quant_tensor, dtype=FP4), value_layout, **value_layout_opts + ) + scale = convert_layout(wrap_torch_tensor(scale), scale_layout, **scale_layout_opts) + return quant_tensor, InFlexData(), scale + + +def _can_support_mxfp4( + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + scoring_func: str = "softmax", + activation: str = "swigluoai", + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | 
None = None, +): + return not ( + use_grouped_topk + or topk_group + or num_expert_group + or custom_routing_function + or e_score_correction_bias + or apply_router_weight_on_input + or scoring_func != "softmax" + or activation != "swigluoai" + or expert_load_view + or logical_to_physical_map + or logical_replica_count + ) + + +def get_padding_alignment(): + return ( + 256 + if triton.runtime.driver.active.get_current_target().arch in ("gfx950",) + else 128 + ) + + +def _dequant_mxfp4( + x: torch.Tensor, scale: torch.Tensor, float_dtype: torch.dtype +) -> torch.Tensor: + try: + from quark.torch.kernel import mx + except ImportError as err: + raise ImportError( + "The package `amd-quark` is required to use " + "MX-FP4 models. Please install it with `pip install " + "amd-quark`." + ) from err + + return mx.dq_mxfp4(x, scale, float_dtype) + + +def _dequant_mxfp4_fake( + x: torch.Tensor, scale: torch.Tensor, float_dtype: torch.dtype +) -> torch.Tensor: + return torch.empty( + (*x.shape[:-1], x.shape[-1] * 2), dtype=float_dtype, device=x.device + ) + + +def _quant_dequant_mxfp4( + x: torch.Tensor, scale_calculation_mode: str = "even" +) -> torch.Tensor: + try: + from quark.torch.kernel import mx + except ImportError as err: + raise ImportError( + "The package `amd-quark` is required to use " + "MX-FP4 models. Please install it with `pip install " + "amd-quark`." + ) from err + + return mx.qdq_mxfp4(x, scale_calculation_mode) + + +def _quant_dequant_mxfp4_fake( + x: torch.Tensor, scale_calculation_mode: str = "even" +) -> torch.Tensor: + return torch.empty_like(x) + + +# Protect these operations into a torch custom op to avoid errors as +# torch._dynamo.exc.Unsupported: Attempted to call function marked as skipped +# Explanation: Dynamo does not know how to trace the builtin +# `kernel_ext.PyCapsule.dq_uint8_mxfp4_to_half.` This function is either a +# Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python +# extension (perhaps created with pybind). 
+# TODO: Make sure there is no way to avoid having these functions +# marked as skipped by dynamo. +try: + direct_register_custom_op( + op_name="dequant_mxfp4", + op_func=_dequant_mxfp4, + fake_impl=_dequant_mxfp4_fake, + ) + dequant_mxfp4 = None +except AttributeError as error: + raise error + +try: + direct_register_custom_op( + op_name="quant_dequant_mxfp4", + op_func=_quant_dequant_mxfp4, + fake_impl=_quant_dequant_mxfp4_fake, + ) + quant_dequant_mxfp4 = None +except AttributeError as error: + raise error diff --git a/model_executor/layers/quantization/utils/mxfp6_utils.py b/model_executor/layers/quantization/utils/mxfp6_utils.py new file mode 100644 index 0000000..2b5659e --- /dev/null +++ b/model_executor/layers/quantization/utils/mxfp6_utils.py @@ -0,0 +1,142 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import torch + +from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import OCP_MX_BLOCK_SIZE +from vllm.utils.torch_utils import direct_register_custom_op + + +def _quant_dequant_mxfp6( + x: torch.Tensor, + quant_dtype: str, + scale_calculation_mode: str = "even", +) -> torch.Tensor: + try: + from quark.torch.kernel.hw_emulation.hw_emulation_interface import ( + fake_quantize_fp4_fp6_per_group_with_scale, + ) + from quark.torch.quantization.utils import even_round, reshape_to_blocks + except ImportError as err: + raise ImportError( + "The package `amd-quark` is required to use " + "MX-FP6 models. Please install it with `pip install " + "amd-quark`." + ) from err + + axis = -1 + block_x = reshape_to_blocks(x, OCP_MX_BLOCK_SIZE, axis) + amax, _ = torch.max(torch.abs(block_x), dim=-1, keepdim=True) + amax = amax.squeeze(-1) + + # TODO: there are other rounding strategies supported in quark and in the + # config.json that we do not check for here! 
+ if scale_calculation_mode != "even": + raise NotImplementedError( + f"Scale calculation mode {scale_calculation_mode} is not yet " + "supported in MX-FP6 quantization" + ) + scale = even_round(amax, quant_dtype) + + # Apply dequantize(quantize(x)). + x = fake_quantize_fp4_fp6_per_group_with_scale( + x, + scale.to(x.device), + axis=axis, + group_size=OCP_MX_BLOCK_SIZE, + quant_dtype=quant_dtype, + ) + + return x + + +def _quant_dequant_mxfp6_fake( + x: torch.Tensor, + quant_dtype: str, + scale_calculation_mode: str = "even", +) -> torch.Tensor: + return torch.empty_like(x) + + +def _dequant_mxfp6( + x: torch.Tensor, scale: torch.Tensor, float_dtype: torch.dtype, quant_dtype: str +) -> torch.Tensor: + try: + from quark.torch.kernel.hw_emulation.hw_emulation_interface import ( + dequantize_fp4_fp6_per_group, + ) + from quark.torch.utils.pack import create_pack_method + except ImportError as e: + raise ImportError( + "The package `amd-quark` is required to use " + "MX-FP6 models. Please install it with `pip install " + "amd-quark`." + ) from e + + pack_method = create_pack_method(None, dtype=quant_dtype) + unpacked_x = pack_method.unpack(x, reorder=False) + + scale = 2 ** (scale.view(torch.uint8).to(torch.int16) - 127).to(float_dtype) + + # TODO: `dequantize_fp4_fp6_per_group` and `prepare_inputs_per_group` + # always return fp32. 
+ return dequantize_fp4_fp6_per_group( + unpacked_x, + scale, + axis=-1, + group_size=OCP_MX_BLOCK_SIZE, + quant_dtype=quant_dtype, + ).to(float_dtype) + + +def _dequant_mxfp6_fake( + x: torch.Tensor, scale: torch.Tensor, float_dtype: torch.dtype, quant_dtype: str +) -> torch.Tensor: + assert (x.shape[-1] * 4) % 3 == 0 + return torch.empty( + (*x.shape[:-1], (x.shape[-1] * 4) // 3), dtype=float_dtype, device=x.device + ) + + +# Protect these operations into a torch custom op to avoid errors as +# torch._dynamo.exc.Unsupported: Attempted to call function marked as skipped +# Explanation: Dynamo does not know how to trace the builtin +# `kernel_ext.PyCapsule.dq_uint8_mxfp4_to_half.` This function is either a +# Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python +# extension (perhaps created with pybind). +# TODO: Make sure there is no way to avoid having these functions +# marked as skipped by dynamo. +try: + direct_register_custom_op( + op_name="quant_dequant_mxfp6", + op_func=_quant_dequant_mxfp6, + mutates_args=[], + fake_impl=_quant_dequant_mxfp6_fake, + ) +except AttributeError as error: + raise error + + +# Expose keyword arguments. 
+def quant_dequant_mxfp6( + x: torch.Tensor, + quant_dtype: str, + scale_calculation_mode: str = "even", +) -> torch.Tensor: + return torch.ops.vllm.quant_dequant_mxfp6(x, quant_dtype, scale_calculation_mode) + + +try: + direct_register_custom_op( + op_name="dequant_mxfp6", + op_func=_dequant_mxfp6, + mutates_args=[], + fake_impl=_dequant_mxfp6_fake, + ) +except AttributeError as error: + raise error + + +def dequant_mxfp6( + x: torch.Tensor, scale: torch.Tensor, float_dtype: torch.dtype, quant_dtype: str +) -> torch.Tensor: + return torch.ops.vllm.dequant_mxfp6(x, scale, float_dtype, quant_dtype) diff --git a/model_executor/layers/quantization/utils/mxfp8_utils.py b/model_executor/layers/quantization/utils/mxfp8_utils.py new file mode 100644 index 0000000..bed771f --- /dev/null +++ b/model_executor/layers/quantization/utils/mxfp8_utils.py @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +def mxfp8_e4m3_quantize(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + try: + from flashinfer import mxfp8_quantize as mxfp8_e4m3_quantize + except ImportError as err: + raise ImportError( + "The package `flashinfer` is required to do " + "MX-FP8 quantization. 
Please install it with" + "`pip install flashinfer`" + ) from err + + x_q, x_scales = mxfp8_e4m3_quantize(x, is_sf_swizzled_layout=False) + if x_scales.ndim == 1: + x_scales = x_scales.view(x.size(0), -1) + return x_q, x_scales diff --git a/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py b/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py new file mode 100644 index 0000000..62b4802 --- /dev/null +++ b/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py @@ -0,0 +1,142 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import torch + +from vllm.scalar_type import scalar_types + +__all__ = [ + "break_fp4_bytes", + "dequantize_to_dtype", + "ref_nvfp4_quant", +] + +FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() + +kE2M1ToFloat = torch.tensor( + [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], dtype=torch.float32 +) + + +def break_fp4_bytes(a, dtype): + assert a.dtype == torch.uint8 + m, n = a.shape + # Vectorized nibble processing + a_flat = a.flatten() + high = (a_flat & 0xF0) >> 4 # Upper nibbles + low = a_flat & 0x0F # Lower nibbles + # Combine nibbles for batch processing + combined = torch.stack((low, high), dim=1).flatten() + # Vectorized sign and magnitude extraction + signs = (combined & 0x08).to(torch.bool) # Sign bits + abs_vals = (combined & 0x07).to(torch.long) + # Device-aware lookup and sign application + kE2M1 = kE2M1ToFloat.to(device=a.device) + values = kE2M1[abs_vals] * torch.where(signs, -1.0, 1.0) + # Reshape to final form + return values.reshape(m, n * 2).to(dtype=dtype) + + +def convert_swizzled_to_linear(a_sf_swizzled: torch.Tensor, m, k, block_size): + m_tiles = (m + 128 - 1) // 128 + f = block_size * 4 + k_tiles = (k + f - 1) // f + tmp = torch.reshape(a_sf_swizzled, (1, m_tiles, k_tiles, 32, 4, 4)) + tmp = torch.permute(tmp, (0, 1, 4, 3, 2, 5)) + out = tmp.reshape(m_tiles * 128, k_tiles * f // block_size) + return out[0:m, 0:k] + + +def 
dequantize_to_dtype( + tensor_fp4, tensor_sf, global_scale, dtype, device, block_size=16 +): + """Dequantize the fp4 tensor back to high precision.""" + # Two fp4 values are packed into one uint8. + assert tensor_fp4.dtype == torch.uint8 + m, packed_k = tensor_fp4.shape + k = packed_k * 2 + tensor_f32 = break_fp4_bytes(tensor_fp4, torch.float32) + tensor_f32 = tensor_f32.reshape(m, k // block_size, block_size) + tensor_sf = tensor_sf.view(torch.float8_e4m3fn) + tensor_sf = convert_swizzled_to_linear(tensor_sf, m, k, block_size) + tensor_sf_dtype = tensor_sf.to(torch.float32) / global_scale + + # scale the tensor + out = (tensor_f32 * tensor_sf_dtype.unsqueeze(-1)).reshape(m, k) + return out.to(dtype) + + +def get_reciprocal(x): + if isinstance(x, torch.Tensor): + return torch.where(x == 0, torch.tensor(0.0, dtype=x.dtype), 1.0 / x) + elif isinstance(x, (float, int)): + return 0.0 if x == 0 else 1.0 / x + else: + raise TypeError("Input must be a float, int, or a torch.Tensor.") + + +def cast_to_fp4(x): + sign = torch.sign(x) + x = torch.abs(x) + x[(x >= 0.0) & (x <= 0.25)] = 0.0 + x[(x > 0.25) & (x < 0.75)] = 0.5 + x[(x >= 0.75) & (x <= 1.25)] = 1.0 + x[(x > 1.25) & (x < 1.75)] = 1.5 + x[(x >= 1.75) & (x <= 2.5)] = 2.0 + x[(x > 2.5) & (x < 3.5)] = 3.0 + x[(x >= 3.5) & (x <= 5.0)] = 4.0 + x[x > 5.0] = 6.0 + return x * sign + + +def ref_nvfp4_quant(x, global_scale, block_size): + assert global_scale.dtype == torch.float32 + assert x.ndim == 2 + m, n = x.shape + x = torch.reshape(x, (m, n // block_size, block_size)) + vec_max = torch.max(torch.abs(x), dim=-1, keepdim=True)[0].to(torch.float32) + scale = global_scale * (vec_max * get_reciprocal(FLOAT4_E2M1_MAX)) + scale = torch.clamp(scale, max=448, min=-448) + scale = scale.to(torch.float8_e4m3fn).to(torch.float32) + output_scale = get_reciprocal(scale * get_reciprocal(global_scale)) + + scaled_x = x.to(torch.float32) * output_scale + clipped_x = torch.clamp(scaled_x, -6.0, 6.0).reshape(m, n) + # both outputs are 
float32 + return cast_to_fp4(clipped_x), scale.squeeze(-1) + + +def run_nvfp4_emulations( + x: torch.Tensor, + input_global_scale: torch.Tensor, + weight: torch.Tensor, + weight_scale_swizzled: torch.Tensor, + weight_global_scale: torch.Tensor, +): + group_size = 16 + x_m, x_k = x.shape + output_dtype = x.dtype + + # quantize input to (FP4 and interleaved block scale) + x_fp4, x_blockscale = ref_nvfp4_quant(x, input_global_scale, group_size) + + # dequantize input + x_fp4 = x_fp4.reshape(x_m, x_k // group_size, group_size) + x_blockscale = x_blockscale.unsqueeze(-1) / input_global_scale + x_dq = (x_fp4 * x_blockscale).reshape(x_m, x_k).to(output_dtype) + del x_fp4, x_blockscale + + # dequantize weight + w_fp4 = weight.data.view(torch.uint8) + w_dq = dequantize_to_dtype( + w_fp4, + weight_scale_swizzled.data, + weight_global_scale, + output_dtype, + x.device, + group_size, + ) + + # matmul + out = torch.matmul(x_dq, w_dq.t()) + del w_dq, x_dq + return out diff --git a/model_executor/layers/quantization/utils/nvfp4_moe_support.py b/model_executor/layers/quantization/utils/nvfp4_moe_support.py new file mode 100644 index 0000000..c3f26cc --- /dev/null +++ b/model_executor/layers/quantization/utils/nvfp4_moe_support.py @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass + +import vllm.envs as envs +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( + is_flashinfer_fp4_cutlass_moe_available, +) +from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( + is_fp4_marlin_supported, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + cutlass_fp4_supported, +) + +__all__ = ["detect_nvfp4_moe_support", "NvFp4Support"] + +_logger = init_logger(__name__) + + +@dataclass(frozen=True) +class NvFp4Support: + """Result container for NV-FP4 capability probing.""" + + 
cutlass_supported: bool + allow_flashinfer: bool + use_marlin: bool + + +def detect_nvfp4_moe_support(class_name: str = "") -> NvFp4Support: + """Detect platform support for NV-FP4 fused-MoE path""" + cutlass_supported = cutlass_fp4_supported() + + allow_flashinfer = cutlass_supported and is_flashinfer_fp4_cutlass_moe_available() + + if allow_flashinfer: + _logger.info_once( + "Using FlashInfer kernels for %s.", class_name or "NVFP4 path" + ) + else: + if envs.VLLM_USE_FLASHINFER_MOE_FP4: + _logger.warning_once( + "FlashInfer kernels unavailable for %s on current platform.", + class_name or "NVFP4 path", + ) + + use_marlin = False + if not cutlass_supported: + if is_fp4_marlin_supported(): + use_marlin = True + _logger.info_once("Falling back to Marlin FP4 MoE kernel.") + else: + raise ValueError( + "Current platform does not support NVFP4 quantization. " + "Please use Blackwell GPUs or enable FlashInfer." + ) + + return NvFp4Support( + cutlass_supported=cutlass_supported, + allow_flashinfer=allow_flashinfer, + use_marlin=use_marlin, + ) diff --git a/model_executor/layers/quantization/utils/ocp_mx_utils.py b/model_executor/layers/quantization/utils/ocp_mx_utils.py new file mode 100644 index 0000000..7752324 --- /dev/null +++ b/model_executor/layers/quantization/utils/ocp_mx_utils.py @@ -0,0 +1,51 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from enum import Enum + +from vllm.logger import init_logger + +logger = init_logger(__name__) + +OCP_MX_BLOCK_SIZE = 32 + +OCP_MX_DTYPES = { + "mxfp4", + "mxfp6_e3m2", + "mxfp6_e2m3", + "mxfp8_e4m3", + "mxfp8_e5m2", + "mxint8", +} +SUPPORTED_OCP_MX_DTYPES = {"mxfp4", "mxfp6_e3m2", "mxfp6_e2m3"} + + +class OCP_MX_Scheme(str, Enum): + w_mxfp4_a_mxfp4 = "w_mxfp4_a_mxfp4" + w_mxfp4_a_mxfp6_e3m2 = "w_mxfp4_a_mxfp6_e3m2" + w_mxfp4_a_mxfp6_e2m3 = "w_mxfp4_a_mxfp6_e2m3" + w_mxfp6_e3m2_a_mxfp6_e3m2 = "w_mxfp6_e3m2_a_mxfp6_e3m2" + w_mxfp6_e2m3_a_mxfp6_e2m3 = 
"w_mxfp6_e2m3_a_mxfp6_e2m3" + + @classmethod + def from_quant_dtype(cls, input_dtype: str | None, weight_dtype: str | None): + if input_dtype not in OCP_MX_DTYPES or weight_dtype not in OCP_MX_DTYPES: + return None + elif input_dtype == "mxfp4" and weight_dtype == "mxfp4": + return cls.w_mxfp4_a_mxfp4 + elif input_dtype == "mxfp6_e3m2" and weight_dtype == "mxfp4": + return cls.w_mxfp4_a_mxfp6_e3m2 + elif input_dtype == "mxfp6_e2m3" and weight_dtype == "mxfp4": + return cls.w_mxfp4_a_mxfp6_e2m3 + elif input_dtype == "mxfp6_e3m2" and weight_dtype == "mxfp6_e3m2": + return cls.w_mxfp6_e3m2_a_mxfp6_e3m2 + elif input_dtype == "mxfp6_e2m3" and weight_dtype == "mxfp6_e2m3": + return cls.w_mxfp6_e2m3_a_mxfp6_e2m3 + else: + logger.warning( + "input_dtype='%s' and" + " weight_dtype='%s' is not supported " + "in OCP_MX_Scheme at the moment.", + input_dtype, + weight_dtype, + ) + return None diff --git a/model_executor/layers/quantization/utils/petit_utils.py b/model_executor/layers/quantization/utils/petit_utils.py new file mode 100644 index 0000000..081f53e --- /dev/null +++ b/model_executor/layers/quantization/utils/petit_utils.py @@ -0,0 +1,124 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import TYPE_CHECKING, Optional + +import torch + +# TYPE_CHECKING is used for static type analysis to prevent circular imports. +if TYPE_CHECKING: + from types import ModuleType + +# 1. Create a global variable as a placeholder for the module +_petit_kernel: Optional["ModuleType"] = None + +_PETIT_INSTALL_MSG = ( + "Petit is not installed. Please install it with `pip install petit-kernel`." +) + + +def _import_petit_kernel() -> "ModuleType": + """ + A helper function to handle the lazy import. + The first time this function is called, it will import the petit_kernel + library and store it in the global _petit_kernel variable. + Subsequent calls will return the already-loaded module directly. 
+ """ + global _petit_kernel + if _petit_kernel is not None: + return _petit_kernel + + try: + import petit_kernel + + _petit_kernel = petit_kernel + return _petit_kernel + except ImportError: + # The 'from None' syntax prevents chaining the original ImportError, + # making the traceback cleaner. + raise ImportError(_PETIT_INSTALL_MSG) from None + + +# The _require_petit function can now be a simple alias for consistency. +_require_petit = _import_petit_kernel + + +def _check_petit_nvfp4_supported( + quant_method: str, group_size: int | None +) -> tuple[bool, str | None]: + if quant_method != "NVFP4": + return ( + False, + ( + "Petit currently only supports: NVFP4 quantizations in sglang. " + "Please check the `hf_quant_config.json` file for your model's " + "quant configuration." + ), + ) + if group_size is not None and group_size != 16: + return ( + False, + "Petit currently only supports: group_size=16 quantizations.", + ) + return (True, None) + + +def verify_petit_nvfp4_supported(quant_method: str, group_size: int | None) -> None: + supported, error_msg = _check_petit_nvfp4_supported(quant_method, group_size) + if not supported: + assert error_msg is not None + raise ValueError(error_msg) + + +def prepare_nvfp4_layer_for_petit(layer: torch.nn.Module) -> None: + # 2. Call _import_petit_kernel() to trigger (or get) the import. + petit_kernel = _import_petit_kernel() + + # Repack weights to petit format + part_size_n = layer.output_size_per_partition + part_size_k = layer.input_size_per_partition + qweight = layer.weight.view(torch.int32).contiguous() + + # 3. Call functions through the imported module variable. 
+ petit_qweight = petit_kernel.repack_nvfp4( + qweight, size_n=part_size_n, size_k=part_size_k + ) + layer.weight = torch.nn.Parameter(petit_qweight, requires_grad=False) + + # Permute scales + weight_scale = petit_kernel.process_nvfp4_scales( + scales=layer.weight_scale, size_k=part_size_k, size_n=part_size_n + ) + layer.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False) + + +def apply_petit_nvfp4_linear( + input: torch.Tensor, + weight: torch.Tensor, + weight_scale: torch.Tensor, + weight_scale_2: torch.Tensor, + size_n: int, + size_k: int, + bias: torch.Tensor | None = None, +) -> torch.Tensor: + # Trigger (or get) the import here as well. + petit_kernel = _import_petit_kernel() + + reshaped_x = input.reshape(-1, input.shape[-1]) + out_shape = input.shape[:-1] + (size_n,) + + # TODO: Use auto-tuning to find the performant solution_id + # Call the function via the module variable. + output = petit_kernel.mul_nvfp4_a16( + a=reshaped_x, + b=weight, + s=weight_scale, + global_scale=weight_scale_2, + size_m=reshaped_x.size(0), + size_n=size_n, + size_k=size_k, + solution_id=-1, + ) + if bias is not None: + output.add_(bias) # In-place add + + return output.reshape(out_shape) diff --git a/model_executor/layers/quantization/utils/quant_utils.py b/model_executor/layers/quantization/utils/quant_utils.py new file mode 100644 index 0000000..05c5047 --- /dev/null +++ b/model_executor/layers/quantization/utils/quant_utils.py @@ -0,0 +1,687 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""This file is used for /tests and /benchmarks""" + +from collections.abc import Mapping +from dataclasses import dataclass +from types import MappingProxyType +from typing import ClassVar, NamedTuple + +import numpy +import torch +from torch import fx + +from vllm._custom_ops import cutlass_scaled_mm_supports_fp4 +from vllm.platforms import current_platform +from vllm.scalar_type import ScalarType, 
scalar_types + +FP8_DTYPE = current_platform.fp8_dtype() +FP4_DTYPE = torch.uint8 + + +# Use proxy as NamedTuple direct subclasses cannot have static members +class _GroupShape(NamedTuple): + row: int + col: int + + +class GroupShape(_GroupShape): + """ + This class describes the quantization group shape. + It includes static members for common shapes (per-tensor, per-token). + """ + + # Aliases for common quantization group shapes + PER_TENSOR: ClassVar["GroupShape"] + PER_TOKEN: ClassVar["GroupShape"] + + def is_per_tensor(self) -> bool: + return self.row == -1 and self.col == -1 + + def is_per_token(self) -> bool: + return self.row == 1 and self.col == -1 + + def is_per_group(self) -> bool: + return self.row == 1 and self.col >= 1 + + +GroupShape.PER_TENSOR = GroupShape(-1, -1) +GroupShape.PER_TOKEN = GroupShape(1, -1) + + +@dataclass(frozen=True) +class ScaleDesc: + """ + Class for describing a single quantization scaling factor. + dtype: data type of the scale + static: static scale if True, dynamic if False + group_shape: group shape of the scale + """ + + dtype: torch.dtype + static: bool + group_shape: GroupShape + + def __str__(self): + group_shape = ( + "per_tensor" + if self.group_shape == GroupShape.PER_TENSOR + else ( + "per_token" + if self.group_shape == GroupShape.PER_TOKEN + else str(self.group_shape) + ) + ) + + return ( + f"{fx.graph.dtype_abbrs[self.dtype]}," + f"{'static' if self.static else 'dynamic'},{group_shape}" + ) + + +@dataclass(frozen=True) +class QuantKey: + """ + Class for identifying the type of quantization. 
+ dtype: quantized data type + scale: scale descriptor + scale2: second-level scale descriptor + symmetric: symmetric if True, asymmetric if False + """ + + dtype: torch.dtype + scale: ScaleDesc + scale2: ScaleDesc | None = None + symmetric: bool = True + + def __str__(self): + scale2_str = f"scale2({self.scale2})," if self.scale2 else "" + return ( + f"QuantKey({fx.graph.dtype_abbrs[self.dtype]}," + f"scale({self.scale}),{scale2_str}" + f"{'a' if not self.symmetric else ''}symmetric)" + ) + + +kStaticTensorScale = ScaleDesc(torch.float32, True, GroupShape.PER_TENSOR) +kFp8StaticTensorSym = QuantKey(FP8_DTYPE, kStaticTensorScale, symmetric=True) + +kDynamicTensorScale = ScaleDesc(torch.float32, False, GroupShape.PER_TENSOR) +kFp8DynamicTensorSym = QuantKey(FP8_DTYPE, kDynamicTensorScale, symmetric=True) + +kDynamicTokenScale = ScaleDesc(torch.float32, False, GroupShape.PER_TOKEN) +kFp8DynamicTokenSym = QuantKey(FP8_DTYPE, kDynamicTokenScale, symmetric=True) + +kNvfp4GroupScale = ScaleDesc(FP8_DTYPE, False, GroupShape(1, 16)) +kNvfp4Quant = QuantKey(FP4_DTYPE, scale=kNvfp4GroupScale, scale2=kStaticTensorScale) + + +# Normalize the group_shape to the full extent for any dims that are -1 +def _normalize_quant_group_shape(x: torch.Tensor, group_shape: GroupShape): + # -1 means full extent + return ( + group_shape[0] if group_shape[0] > 0 else x.shape[-2], + group_shape[1] if group_shape[1] > 0 else x.shape[-1], + ) + + +# Useful when treating N-dimensional group scaling as extended numpy-style +# broadcasting in numpy simply stretches dimensions with an extent of 1 to match +# the target shape by repeating the data along that dimension (broadcasting) +# , we extend these semantics to say if the extent of a dimension in the +# source shape is not 1 and does not match the target shape we repeat each +# element along that dimension src_shape[dim] // target_shape[dim] times +# example if we have: +# a = [[1, 2], and target_shape = (2, 4) +# [3, 4]] +# then we would expand 
a to: +# a = [[1, 1, 2, 2], +# [3, 3, 4, 4]] +# NOTE this function does not explicitly broadcast dimensions +# with an extent of 1, since this can be done implicitly by pytorch +def group_broadcast(t, shape): + for i, s in enumerate(shape): + if t.shape[i] != s and t.shape[i] != 1: + assert s % t.shape[i] == 0 + t = ( + t.unsqueeze(i + 1) + .expand(*t.shape[: i + 1], s // t.shape[i], *t.shape[i + 1 :]) + .flatten(i, i + 1) + ) + return t + + +# Quantize assuming once scale per group of elements with shape group_shape, +# example group shapes: +# * (-1, -1) for per-tensor quantization +# * (1, -1) for per-row quantization +# * (-1, 1) for per-column quantization +# * (128, 128) for 128x128 deepseek style block quantization +# * (1, 128) for deepseek style activation quantization +# (i.e. per-token-per-group) +def scaled_quantize( + x: torch.Tensor, + group_shape: GroupShape, + quant_dtype: torch.dtype, +) -> tuple[torch.Tensor, torch.Tensor]: + group_shape = _normalize_quant_group_shape(x, group_shape) + # assert quant_dtype.is_floating_point, ( + # "currently `scaled_quantize` only supports floating point dtypes " + # "but could be extended to support other dtypes" + # ) + + finfo = torch.finfo(quant_dtype) if quant_dtype.is_floating_point else torch.iinfo(quant_dtype) + + # Reshape (M, N) into (BLK_M, BLOCK_SIZE_M, BLK_N, BLOCK_SIZE_N) + assert x.ndim == 2 + assert x.shape[0] % group_shape[0] == 0 and x.shape[1] % group_shape[1] == 0 + blk_m, blk_n = x.shape[0] // group_shape[0], x.shape[1] // group_shape[1] + x_blkd = x.reshape(blk_m, group_shape[0], blk_n, group_shape[1]) + + # Permute to (BLK_M, BLK_N, BLOCK_SIZE_M, BLOCK_SIZE_N) + x_blkd_permd = x_blkd.permute(0, 2, 1, 3) + # Flatten to (BLK_M, BLK_N, BLOCK_SIZE_M * BLOCK_SIZE_N) + x_blkd_permd = x_blkd_permd.flatten(start_dim=2) + + # Compute scales + min_val, max_val = x_blkd_permd.aminmax(dim=-1) + amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12) + scale = finfo.max / amax + + # Apply 
# inverses `scaled_quantize`
def scaled_dequantize(
    x_q: torch.Tensor,
    x_s: torch.Tensor,
    group_shape: GroupShape | None = None,
    out_dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    """Dequantize ``x_q`` by multiplying it with its (group-wise) scales.

    Args:
        x_q: quantized values.
        x_s: scales; may be 0-D (one scale for everything), 1-D (requires
            ``group_shape`` to decide which dimension it runs along) or
            already shaped like the grid of groups.
        group_shape: shape of one quantization group; non-positive extents
            mean "full dimension".
        out_dtype: dtype of the returned tensor.

    Returns:
        A single tensor ``(x_q * scales)`` computed in float32 and cast to
        ``out_dtype``.

    Raises:
        AssertionError: if ``x_s`` is 1-D and ``group_shape`` is missing, or
            the group shape does not span a full dimension of ``x_q``.
    """
    if group_shape is not None:
        group_shape = _normalize_quant_group_shape(x_q, group_shape)

    if x_s.ndim == 0:  # scalar
        x_s = x_s.unsqueeze(-1).unsqueeze(-1)  # convert to (1, 1) tensor
    if x_s.ndim == 1:
        if group_shape is None:
            raise AssertionError(
                "if x_s is 1D tensor, group_shape must be provided otherwise "
                "its ambiguous which dimension to broadcast x_s to"
            )
        # unsqueeze the scales for the dimension where we want to broadcast
        # across the full extent
        if group_shape[0] == x_q.shape[-2]:
            x_s = x_s.unsqueeze(-2)
        elif group_shape[1] == x_q.shape[-1]:
            x_s = x_s.unsqueeze(-1)
        else:
            raise AssertionError(
                "if x_s is a vector we should be broadcasting it to the full "
                "extent of one of the dimensions"
            )

    if group_shape is not None:
        assert x_s.shape[-1] == x_q.shape[-1] // group_shape[1]
        assert x_s.shape[-2] == x_q.shape[-2] // group_shape[0]
    x_s = group_broadcast(x_s.to(torch.float32), x_q.shape)
    return (x_q.to(torch.float32) * x_s).to(out_dtype)
def unpack_quantized_values_into_int32(
    w_q: torch.Tensor, wtype: ScalarType, packed_dim: int = 0
):
    """Expand int32 words along ``packed_dim`` into one int32 per value.

    Each word holds ``32 // wtype.size_bits`` values; the value stored in
    the lowest bits comes first. The result has the same layout as ``w_q``
    except that ``packed_dim`` is ``32 // wtype.size_bits`` times longer.
    """
    bits = wtype.size_bits
    lanes = 32 // bits
    lane_mask = (1 << bits) - 1

    # Rotate the packed dimension to the end so lanes can be strided into it.
    order = tuple(d for d in range(len(w_q.shape)) if d != packed_dim) + (
        packed_dim,
    )
    restore = tuple(order.index(d) for d in range(len(order)))
    words = w_q.permute(order)

    out_shape = list(words.shape)
    out_shape[-1] = out_shape[-1] * lanes

    unpacked = torch.zeros(out_shape, dtype=torch.int32, device=w_q.device)
    for lane in range(lanes):
        # Masking after the shift also discards sign-extension bits.
        unpacked[..., lane::lanes] = (words >> (bits * lane)) & lane_mask

    return unpacked.permute(restore)
def permute_rows(
    q_w: torch.Tensor,
    w_ref: torch.Tensor,
    group_size: int,
    test_perm: torch.Tensor | None = None,
):
    """Apply one row permutation to ``q_w``, ``w_ref`` and their group index.

    Args:
        q_w: 2-D quantized weights, one row per K index.
        w_ref: reference weights with the same shape as ``q_w``.
        group_size: number of consecutive rows per quantization group.
        test_perm: permutation to use; a random one is drawn when ``None``.

    Returns:
        ``(w_ref, q_w, g_idx, rand_perm)`` -- note that the reference weights
        come first -- all moved to the original device of ``q_w``. ``g_idx``
        holds, for every permuted row, the group it belonged to before the
        permutation.
    """
    assert q_w.shape == w_ref.shape

    orig_device = q_w.device
    k_size, _ = q_w.shape

    # Row i belongs to group i // group_size; computed in one vectorized op
    # instead of writing the tensor element by element.
    g_idx = torch.arange(k_size, dtype=torch.int32) // group_size

    # Simulate act_order by doing a random permutation on K
    rand_perm = test_perm if test_perm is not None else torch.randperm(k_size)

    g_idx = g_idx[rand_perm].contiguous()
    q_w = q_w[rand_perm, :].contiguous()
    w_ref = w_ref[rand_perm, :].contiguous()

    return (
        w_ref.to(device=orig_device),
        q_w.to(device=orig_device),
        g_idx.to(device=orig_device),
        rand_perm.to(device=orig_device),
    )
"Floating point quantization may work but has not been tested" + ) + assert not zero_points or group_size is not None, ( + "to have group zero points, group_size must be provided " + "(-1 group_size is channelwise)" + ) + + orig_device = w.device + orig_type = w.dtype + size_k, size_n = w.shape + + assert w.is_floating_point(), "w must be float" + + if group_size == -1: + group_size = size_k + + # Reshape to [groupsize, -1] + if group_size is not None and group_size < size_k: + w = w.reshape((-1, group_size, size_n)) + w = w.permute(1, 0, 2) + w = w.reshape((group_size, -1)) + + # Compute scale for each group + max_val = torch.max(w, 0, keepdim=True).values + min_val = torch.min(w, 0, keepdim=True).values + + max_q_val = quant_type.max() + min_q_val = quant_type.min() + + w_s = torch.Tensor([1.0]).to(w.device) # unscaled case + maybe_w_zp = None + if group_size is not None: + if zero_points: + assert not quant_type.is_signed() and quant_type.max() > 0 + w_s = (max_val - min_val).clamp(min=1e-5) / quant_type.max() + maybe_w_zp = ( + torch.round(torch.abs(min_val / w_s)).clamp(min_q_val, max_q_val).int() + ) + else: + # If the bias is such that there are no possible negative/positive + # values, set the max value to inf to avoid divide by 0 + w_s = torch.max( + abs(max_val / (max_q_val if max_q_val != 0 else torch.inf)), + abs(min_val / (min_q_val if min_q_val != 0 else torch.inf)), + ) + + # Quantize + w_q = torch.round(w / w_s).int() + (maybe_w_zp if zero_points else 0) + w_q = torch.clamp(w_q, min_q_val, max_q_val) + + # Compute ref (dequantized) + # For some kernels (namely Machete) the zero-points are applied after the + # scales are applied, for this case computing the reference in similar way + # allows us to use tighter error tolerances in our unit tests. 
def gptq_quantize_weights(
    w: torch.Tensor,
    quant_type: ScalarType,
    group_size: int,
    act_order: bool,
    test_perm: torch.Tensor | None = None,
):
    """Quantize ``w`` GPTQ-style and optionally permute its rows.

    Returns:
        ``(w_ref, w_q, w_s, g_idx, rand_perm)``. Without ``act_order`` the
        last two entries are empty integer tensors on the device of ``w``;
        with it they come from `permute_rows`.
    """
    size_k, _ = w.shape

    assert w.is_floating_point(), "w must be float"
    assert quant_type in SUPPORTED_GPTQ_QUANT_TYPES, (
        f"Unsupported gptq type = {quant_type}"
    )
    # A group spanning the whole K dimension is accepted in addition to the
    # fixed list of supported sizes.
    allowed_group_sizes = SUPPORTED_GROUP_SIZES + [size_k]
    assert group_size in allowed_group_sizes, (
        f"Unsupported groupsize = {group_size}"
    )

    w_ref, w_q, w_s, _ = quantize_weights(w, quant_type, group_size)

    if not act_order:
        g_idx = torch.empty(0, dtype=torch.int, device=w.device)
        rand_perm = torch.empty(0, dtype=torch.int, device=w.device)
        return w_ref, w_q, w_s, g_idx, rand_perm

    # Apply act_order
    assert group_size < size_k, (
        f"For act_order, groupsize = {group_size} "
        f"must be less than size_k = {size_k}"
    )
    w_ref, w_q, g_idx, rand_perm = permute_rows(w_q, w_ref, group_size, test_perm)
    return w_ref, w_q, w_s, g_idx, rand_perm
def pack_rows(
    q_w: torch.Tensor,
    num_bits: int,
    size_k: int,
    size_n: int,
):
    """Pack groups of rows of ``q_w`` into int32 words.

    With ``pack_factor = get_pack_factor(num_bits)``, output row ``r``
    combines input rows ``r * pack_factor + i``; row ``i`` of the group is
    shifted left by ``num_bits * i`` bits. The packing itself runs in NumPy
    on the CPU and the result is moved back to the device of ``q_w``.

    Returns:
        An int32 tensor of shape ``(size_k // pack_factor, size_n)``.
    """
    assert q_w.shape == (size_k, size_n)

    rows_per_word = get_pack_factor(num_bits)
    assert size_k % rows_per_word == 0

    target_device = q_w.device
    values = q_w.cpu().numpy().astype(numpy.uint32)
    packed = numpy.zeros((size_k // rows_per_word, size_n), dtype=numpy.uint32)

    for lane in range(rows_per_word):
        packed |= values[lane::rows_per_word, :] << num_bits * lane

    return torch.from_numpy(packed.astype(numpy.int32)).to(target_device)
def awq_pack(
    q_w: torch.Tensor,
    num_bits: int,
    size_k: int,
    size_n: int,
):
    """Interleave the columns of ``q_w`` and pack them into int32 words.

    Columns are reordered within consecutive groups of 8 (4-bit) or 4
    (8-bit) values before `pack_cols` packs them.

    Raises:
        ValueError: if ``num_bits`` is neither 4 nor 8.
    """
    assert q_w.shape == (size_k, size_n)

    # Interleave column dim (for the dequantize code) and pack it to int32
    if num_bits == 4:
        interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
    elif num_bits == 8:
        interleave = numpy.array([0, 2, 1, 3])
    else:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError(f"num_bits must be 4 or 8, got {num_bits}")

    q_w = q_w.reshape((-1, len(interleave)))[:, interleave].ravel()
    q_w = q_w.reshape((-1, size_n)).contiguous()

    return pack_cols(q_w, num_bits, size_k, size_n)
def cutlass_fp4_supported() -> bool:
    """Report whether the CUTLASS FP4 scaled-mm check passes on this device.

    Returns ``False`` immediately on non-CUDA platforms. Otherwise the
    device capability is converted to an integer (``-1`` when it is
    unknown) and handed to ``cutlass_scaled_mm_supports_fp4``.
    """
    if current_platform.is_cuda():
        device_capability = current_platform.get_device_capability()
        if device_capability is None:
            capability = -1
        else:
            capability = device_capability.to_int()
        return cutlass_scaled_mm_supports_fp4(capability)
    return False
def all_close_1d(x: torch.Tensor) -> bool:
    """Return True when every element of the 1-D tensor ``x`` is close to ``x[0]``.

    Closeness uses the default tolerances of ``torch.allclose``. An empty
    tensor yields ``True``.
    """
    assert len(x.shape) == 1
    if x.numel() == 0:
        # Nothing to compare; indexing x[0] would fail.
        return True
    # One vectorized comparison instead of one torch.allclose call per element.
    return torch.allclose(x[0].expand_as(x), x)
def requantize_with_max_scale(
    weight: torch.Tensor, weight_scale: torch.Tensor, logical_widths: list[int]
) -> tuple[torch.Tensor, torch.Tensor]:
    """Bring all shards of a fused weight onto the largest shard scale.

    Each shard occupies ``logical_widths[idx]`` consecutive rows of
    ``weight`` and has its own entry in ``weight_scale``. When the last
    scale is above the fp8 minimum (i.e. it was actually loaded), every
    shard is dequantized with its own scale and re-quantized in place with
    the maximum scale. Otherwise the weight is assumed to be quantized with
    a single scale already and is left untouched.

    Returns:
        ``(max_w_scale, weight)``.
    """
    # Max scale to be used for requantization.
    max_w_scale = weight_scale.max()

    # If QKV / MLP is fused in the on-disk checkpoint, only one scale is
    # loaded and the remaining per-shard scales keep their default value;
    # in that case the weight already uses a single scale.
    # * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8
    if weight_scale[-1] > torch.finfo(torch.float8_e4m3fn).min:
        offset = 0
        for shard_idx, width in enumerate(logical_widths):
            # Skip any component with zero width.
            if width == 0:
                continue
            rows = slice(offset, offset + width)
            shard_dq = per_tensor_dequantize(weight[rows, :], weight_scale[shard_idx])
            weight[rows, :], _ = ops.scaled_fp8_quant(shard_dq, max_w_scale)
            offset += width

    return max_w_scale, weight
def rocm_per_tensor_w8a8_scaled_mm(
    *,
    qinput: torch.Tensor,
    weight: torch.Tensor,
    out_dtype: torch.dtype,
    scale_a: torch.Tensor,
    scale_b: torch.Tensor,
    bias: torch.Tensor,
    output_shape: list,
) -> torch.Tensor:
    """Run the registered ROCm per-tensor scaled-mm op and reshape its output.

    The op result is cut back to the first ``qinput.shape[0]`` rows and then
    viewed as ``output_shape``.
    """
    gemm_out = torch.ops.vllm.rocm_per_tensor_w8a8_scaled_mm_impl(
        qinput, weight, out_dtype, scale_a, scale_b, bias
    )
    num_rows = qinput.shape[0]
    trimmed = torch.narrow(gemm_out, 0, 0, num_rows)
    return trimmed.view(*output_shape)
+ # fp8 rowwise scaling in torch._scaled_mm is introduced in + # https://github.com/pytorch/pytorch/pull/144432 using + # hipBLASLt and ROCm 6.3, which only exists in torch 2.7 and above. + # + # For CUDA platform please validate if the torch._scaled_mm supports + # rowwise scaled GEMM before using it + + # Fused GEMM_DQ Rowwise GEMM + output = torch._scaled_mm( + qinput, + weight, + out_dtype=out_dtype, + scale_a=scale_a, + scale_b=scale_b.t(), + bias=bias, + ) + + output = torch.narrow(output, 0, 0, qinput.shape[0]) + output = output.view(*output_shape) + return output + + +def torch_channelwise_w8a8_scaled_mm( + *, + qinput: torch.Tensor, + weight: torch.Tensor, + out_dtype: torch.dtype, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + bias: torch.Tensor, + output_shape: list, + **kwargs, +) -> torch.Tensor: + # Use unfused DQ due to limitations with scaled_mm + + # Symmetric quantized GEMM by definition computes the following: + # C = (s_x * X) (s_w * W) + bias + # This is equivalent to dequantizing the weights and activations + # before applying a GEMM. + # + # In order to compute quantized operands, a quantized kernel + # will rewrite the above like so: + # C = s_w * s_x * (X * W) + bias + # + # For the scaled_mm fallback case, we break this down, since it + # does not support s_w being a vector. + + # GEMM + # This computes C = (X * W). 
def dispatch_w8a8_scaled_mm(
    preferred_backend: str, per_tensor_weights: bool, per_tensor_activations: bool
) -> Callable[..., torch.Tensor]:
    """Pick the scaled-mm implementation for a backend and scale granularity.

    * Per-tensor weights and activations: the backend-specific kernel for
      "rocm", "flashinfer" or "cutlass", else the torch per-tensor kernel.
    * Otherwise "cutlass" and "flashinfer" both use the CUTLASS kernel.
    * Otherwise the torch row-wise kernel when neither side is per-tensor
      and ``USE_ROWWISE_TORCH_SCALED_MM`` is set, else the unfused
      channel-wise fallback.
    """
    if per_tensor_weights and per_tensor_activations:
        per_tensor_impls = {
            "rocm": rocm_per_tensor_w8a8_scaled_mm,
            "flashinfer": flashinfer_w8a8_scaled_mm,
            "cutlass": cutlass_w8a8_scaled_mm,
        }
        return per_tensor_impls.get(
            preferred_backend, torch_per_tensor_w8a8_scaled_mm
        )

    # cutlass_scaled_mm supports per tensor/channel W and per tensor/token A
    if preferred_backend in ("cutlass", "flashinfer"):
        return cutlass_w8a8_scaled_mm

    # If torch.scaled_mm supports per-channel (weights) per-token (inputs)
    fully_fine_grained = not (per_tensor_weights or per_tensor_activations)
    if fully_fine_grained and USE_ROWWISE_TORCH_SCALED_MM:
        return torch_per_token_w8a8_scaled_mm

    # Normally, torch.scaled_mm supports per tensor weights + activations only
    # so fallback to naive if per channel or per token
    return torch_channelwise_w8a8_scaled_mm
otherwise. + It needs to be a class instead of a method so that config can be read + in the __init__ method, as reading config is not allowed inside forward. + """ + + def __init__( + self, + act_quant_static: bool, + act_quant_group_shape: GroupShape = GroupShape.PER_TENSOR, + pad_output: bool | None = None, + ): + if current_platform.is_rocm(): + self.preferred_backend = "rocm" + elif current_platform.is_cuda() and cutlass_fp8_supported(): + if has_flashinfer() and current_platform.has_device_capability(100): + self.preferred_backend = "flashinfer" + else: + self.preferred_backend = "cutlass" + else: + self.preferred_backend = "torch" + + # Note: we pad the input because torch._scaled_mm is more performant + # for matrices with batch dimension > 16. + # This could change in the future. + # We also don't pad when using torch.compile, + # as it breaks with dynamic shapes. + if pad_output is None: + config = get_current_vllm_config().compilation_config + pad_output = ( + config.mode < CompilationMode.VLLM_COMPILE + and self.preferred_backend == "torch" + ) + + self.output_padding = 17 if pad_output else None + self.act_quant_static = act_quant_static + self.act_quant_group_shape = act_quant_group_shape + self.quant_fp8 = QuantFP8( + static=act_quant_static, + group_shape=act_quant_group_shape, + num_token_padding=self.output_padding, + ) + + def apply( + self, + input: torch.Tensor, + weight: torch.Tensor, + weight_scale: torch.Tensor, + out_dtype: torch.dtype | None = None, + input_scale: torch.Tensor | None = None, + input_scale_ub: torch.Tensor | None = None, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + # ops.scaled_fp8_quant supports both dynamic and static quant. + # If dynamic, layer.input_scale is None and x_scale computed from x. + # If static, layer.input_scale is scalar and x_scale is input_scale. 
+ + # View input as 2D matrix for fp8 methods + input_2d = input.view(-1, input.shape[-1]) + output_shape = [*input.shape[:-1], weight.shape[1]] + + if out_dtype is None: + out_dtype = input.dtype + + # If input not quantized + # TODO(luka) remove this path if not used anymore + if input.dtype != current_platform.fp8_dtype(): + qinput, x_scale = self.quant_fp8( + input_2d, + input_scale, + input_scale_ub, + ) + else: + qinput, x_scale = input_2d, input_scale + + # Must have dim() conditions + # In per-token quant scenario, when the number of token is 1, + # the scale will only have 1 elements. + # Without checking the dim(), + # we cannot distingushes between per-tensor and per-token quant. + # Example: + # When the number of token is 1, per-token scale is [[1]] + # When per-tensor scale is [1] or (). + per_tensor_weights = weight_scale.numel() == 1 + per_tensor_activations = (x_scale.numel() == 1) and x_scale.dim() < 2 + + # TODO(luka) do this dispatch during init (after ScaledMM refactor) + w8a8_scaled_mm_func = dispatch_w8a8_scaled_mm( + self.preferred_backend, per_tensor_weights, per_tensor_activations + ) + + return w8a8_scaled_mm_func( + qinput=qinput, + weight=weight, + out_dtype=out_dtype, + scale_a=x_scale, + scale_b=weight_scale, + bias=bias, + output_shape=output_shape, + ) + + +def normalize_e4m3fn_to_e4m3fnuz( + weight: torch.Tensor, + weight_scale: torch.Tensor, + input_scale: torch.Tensor | None = None, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]: + assert weight.dtype == torch.float8_e4m3fn + # The bits pattern 10000000(-128) represents zero in e4m3fn + # but NaN in e4m3fnuz. So here we set it to 0. 
+ # https://onnx.ai/onnx/technical/float8.html + weight_as_int8 = weight.view(torch.int8) + ROCM_FP8_NAN_AS_INT = -128 + weight_as_int8[weight_as_int8 == ROCM_FP8_NAN_AS_INT] = 0 + weight = weight_as_int8.view(torch.float8_e4m3fnuz) + + # For the same bits representation, e4m3fnuz value is half of + # the e4m3fn value, so we should double the scaling factor to + # get the same dequantized value. + # https://onnx.ai/onnx/technical/float8.html + weight_scale = weight_scale * 2.0 + if input_scale is not None: + input_scale = input_scale * 2.0 + return weight, weight_scale, input_scale diff --git a/model_executor/layers/quantization/w8a16.py b/model_executor/layers/quantization/w8a16.py new file mode 100644 index 0000000..6c42ce7 --- /dev/null +++ b/model_executor/layers/quantization/w8a16.py @@ -0,0 +1,114 @@ +from typing import Any, Dict, List, Optional + +import torch +from torch.nn.parameter import Parameter + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.parameter import (GroupQuantScaleParameter, + PackedvLLMParameter) +from vllm.model_executor.utils import set_weight_attrs + + +class W8a16Config(QuantizationConfig): + """Config class for W8a16. 
class W8a16LinearMethod(LinearMethodBase):
    """Linear method for w8a16.

    Registers an int8 ``weight`` of shape ``(output, input)`` and a
    ``scales`` parameter of shape ``(1, output)`` on the layer, and applies
    them through ``ops.linear_w8a16``.
    """

    def __init__(self, quant_config: W8a16Config):
        self.quant_config = quant_config

    def create_weights(self, layer: torch.nn.Module,
                       input_size_per_partition: int,
                       output_partition_sizes: List[int], input_size: int,
                       output_size: int, params_dtype: torch.dtype,
                       **extra_weight_attrs):
        """Create and register the ``weight`` and ``scales`` parameters."""
        out_features = sum(output_partition_sizes)

        # int8 weight, stored as (output, input).
        weight = Parameter(
            torch.empty(out_features,
                        input_size_per_partition,
                        dtype=torch.int8),
            requires_grad=False,
        )
        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
        layer.register_parameter("weight", weight)
        set_weight_attrs(weight, extra_weight_attrs)

        # One scale per output feature, kept in the layer's parameter dtype.
        scales = Parameter(
            torch.empty(1, out_features, dtype=params_dtype),
            requires_grad=False,
        )
        set_weight_attrs(scales, {"input_dim": None, "output_dim": 1})
        layer.register_parameter("scales", scales)
        set_weight_attrs(scales, extra_weight_attrs)

    def apply(self,
              layer: torch.nn.Module,
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Compute the quantized linear layer for ``x`` and add ``bias`` if given."""
        qweight = layer.weight
        # Leading dims of x are kept; the last dim becomes the output size.
        out_shape = (*x.shape[:-1], qweight.shape[-2])
        flat_x = x.reshape(-1, x.shape[-1])
        result = ops.linear_w8a16(flat_x, qweight, layer.scales, format="TN")
        if bias is not None:
            result = result + bias
        return result.reshape(out_shape)
+ +Example models: Qwen (Qwen-VL), MiniCPM-V 2.0 +""" + +import math +from collections.abc import Callable +from functools import partial + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + +from vllm.model_executor.layers.linear import ReplicatedLinear +from vllm.model_executor.layers.quantization import QuantizationConfig + +DEFAULT_LN = partial(nn.LayerNorm, eps=1e-6) + + +def get_abs_pos(abs_pos: torch.Tensor, tgt_size: torch.Tensor | int) -> torch.Tensor: + # abs_pos: L, C + # tgt_size: (H, W) + # return: M, C + src_size = int(math.sqrt(abs_pos.size(0))) + dtype = abs_pos.dtype + if isinstance(tgt_size, int): + tgt_size = (tgt_size, tgt_size) + if src_size == tgt_size[0] and src_size == tgt_size[1]: + return abs_pos + return ( + F.interpolate( + abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2), + size=(tgt_size[0], tgt_size[1]), + mode="bicubic", + align_corners=False, + ) + .permute(0, 2, 3, 1) + .flatten(0, 2) + .to(dtype=dtype) + ) + + +# sin/cos positional embedding helpers are adapted from: +# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 +def get_1d_sincos_pos_embed_from_grid( + embed_dim: int, pos: np.ndarray, version: tuple[int, int] = (2, 0) +) -> torch.Tensor: + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) / (H, W) + out: (M, D) / (H, W, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2.0 + omega = 1.0 / 10000**omega # (D/2,) + + if version == (2, 0): + pos = pos.reshape(-1) # (M,) + out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + else: + out = np.einsum("hw,d->hwd", pos, omega) # (H, W, D/2), outer product + emb_sin = np.sin(out) # (H, W, D/2) + emb_cos = 
np.cos(out) # (H, W, D/2) + emb = np.concatenate([emb_sin, emb_cos], axis=-1) # (H, W, D) + return emb + + +def get_2d_sincos_pos_embed_from_grid( + embed_dim: int, grid: np.ndarray, version: tuple[int, int] = (2, 0) +) -> torch.Tensor: + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid( + embed_dim // 2, grid[0], version + ) # (H*W, D/2) or (H, W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid( + embed_dim // 2, grid[1], version + ) # (H*W, D/2) or (H, W, D/2) + + if version == (2, 0): + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + else: + emb = np.concatenate([emb_h, emb_w], axis=-1) # (H, W, D) + return emb + + +def get_2d_sincos_pos_embed( + embed_dim: int, + grid_size: int | tuple[int, int], + cls_token: bool = False, + version: tuple[int, int] = (2, 0), +) -> torch.Tensor: + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or + [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + if isinstance(grid_size, int): + grid_h_size, grid_w_size = grid_size, grid_size + else: + grid_h_size, grid_w_size = grid_size[0], grid_size[1] + + grid_h = np.arange(grid_h_size, dtype=np.float32) + grid_w = np.arange(grid_w_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + assert isinstance(grid, np.ndarray) and grid.shape == (2, grid_h_size, grid_w_size) + + if version == (2, 0): + grid = grid.reshape([2, 1, grid_h_size, grid_w_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + else: + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version) + return pos_embed + + +class BaseResampler(nn.Module): + """ + A 2D perceiver-resampler network with one cross attention layers by + (grid_size**2) learnable queries and 2d sincos pos_emb. 
+ Outputs: + A tensor with the shape of (grid_size**2, embed_dim) + """ + + def __init__( + self, + num_queries: int, + embed_dim: int, + num_heads: int, + kv_dim: int | None = None, + norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN, + do_post_projection: bool = True, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + + self.num_queries = num_queries + self.embed_dim = embed_dim + self.num_heads = num_heads + + self.query = nn.Parameter(torch.empty(self.num_queries, embed_dim)) + + if kv_dim is not None and kv_dim != embed_dim: + self.kv_proj = ReplicatedLinear( + kv_dim, + embed_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.kv_proj", + ) + else: + # Maintain the same return value with ReplicatedLinear.forward + self.kv_proj = lambda *args, **kwargs: ( # type: ignore # noqa + nn.Identity()(*args, **kwargs), + None, + ) + self.attn = nn.MultiheadAttention(embed_dim, num_heads) + self.ln_q = norm_layer(embed_dim) + self.ln_kv = norm_layer(embed_dim) + self.do_post_projection = do_post_projection + if self.do_post_projection: + self.ln_post = norm_layer(embed_dim) + data = (embed_dim**-0.5) * torch.empty(embed_dim, embed_dim) + self.proj = nn.Parameter(data=data) + + def _repeat(self, query, N: int): + return query.unsqueeze(1).repeat(1, N, 1) + + +class Resampler2(BaseResampler): + """Resampler-perceiver network to be used for a variety of model types, + e.g., Qwen-vl / Minicpmv 2.0. The main difference is the addition of the + do_post_projection arg, which indicates whether or not there should be + a post layer normalization and projector after the attention. This is + present in minicpmv2.0, but not qwen-vl. 
+ """ + + def __init__( + self, + grid_size: int, + embed_dim: int, + num_heads: int, + kv_dim: int | None = None, + norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN, + adaptive: bool = False, + do_post_projection: bool = True, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__( + grid_size**2, + embed_dim, + num_heads, + kv_dim, + norm_layer, + do_post_projection=do_post_projection, + quant_config=quant_config, + prefix=prefix, + ) + + self.adaptive = adaptive + pos_embed_arr = get_2d_sincos_pos_embed(embed_dim, grid_size, version=(2, 0)) + + self.pos_embed = nn.Parameter( + torch.from_numpy(pos_embed_arr).requires_grad_(False) + ) + + def forward( + self, + x: torch.Tensor, + tgt_sizes: torch.Tensor | None = None, + attn_mask: torch.Tensor | None = None, + ) -> torch.Tensor: + if tgt_sizes is None: + tgt_sizes = int(math.sqrt(x.size(1))) + if self.adaptive: + pos_embed_arr = get_2d_sincos_pos_embed( + self.embed_dim, tgt_sizes, version=(2, 0) + ) + pos_embed = torch.from_numpy(pos_embed_arr).to( + device=x.device, dtype=x.dtype + ) + else: + pos_embed = get_abs_pos(self.pos_embed, tgt_sizes).to( + device=x.device, dtype=x.dtype + ) + + x, _ = self.kv_proj(x) + x = self.ln_kv(x).permute(1, 0, 2) + + N = x.shape[1] + q = self.ln_q(self.query) + out = self.attn( + self._repeat(q, N) + self.pos_embed.unsqueeze(1), + x + pos_embed.unsqueeze(1), + x, + attn_mask=attn_mask, + )[0] + x = out.permute(1, 0, 2) + if self.do_post_projection: + x = self.ln_post(x) + x = x @ self.proj + return x diff --git a/model_executor/layers/rotary_embedding/__init__.py b/model_executor/layers/rotary_embedding/__init__.py new file mode 100644 index 0000000..56c165f --- /dev/null +++ b/model_executor/layers/rotary_embedding/__init__.py @@ -0,0 +1,278 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Rotary Positional Embeddings.""" + +from typing import Any + +import 
torch + +from .base import RotaryEmbedding +from .deepseek_scaling_rope import DeepseekScalingRotaryEmbedding +from .dual_chunk_rope import DualChunkRotaryEmbedding +from .dynamic_ntk_alpha_rope import DynamicNTKAlphaRotaryEmbedding +from .dynamic_ntk_scaling_rope import DynamicNTKScalingRotaryEmbedding +from .linear_scaling_rope import LinearScalingRotaryEmbedding +from .llama3_rope import Llama3RotaryEmbedding +from .llama4_vision_rope import Llama4VisionRotaryEmbedding +from .mrope import MRotaryEmbedding +from .ntk_scaling_rope import NTKScalingRotaryEmbedding +from .phi3_long_rope_scaled_rope import Phi3LongRoPEScaledRotaryEmbedding +from .yarn_scaling_rope import YaRNScalingRotaryEmbedding + +_ROPE_DICT: dict[tuple, RotaryEmbedding] = {} + + +def get_rope( + head_size: int, + rotary_dim: int, + max_position: int, + base: float, + is_neox_style: bool = True, + rope_scaling: dict[str, Any] | None = None, + dtype: torch.dtype | None = None, + partial_rotary_factor: float = 1.0, + dual_chunk_attention_config: dict[str, Any] | None = None, +) -> RotaryEmbedding: + if dtype is None: + dtype = torch.get_default_dtype() + if rope_scaling is not None: + # Transforms every value that is a list into a tuple for caching calls + rope_scaling_tuple = { + k: tuple(v) if isinstance(v, list) else v for k, v in rope_scaling.items() + } + rope_scaling_args = tuple(rope_scaling_tuple.items()) + else: + rope_scaling_args = None + + if dual_chunk_attention_config is not None: + dual_chunk_attention_tuple = { + k: tuple(v) if isinstance(v, list) else v + for k, v in dual_chunk_attention_config.items() + if k != "sparse_attention_config" + } + dual_chunk_attention_args = tuple(dual_chunk_attention_tuple.items()) + else: + dual_chunk_attention_args = None + + if partial_rotary_factor < 1.0: + rotary_dim = int(rotary_dim * partial_rotary_factor) + key = ( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + rope_scaling_args, + dual_chunk_attention_args, + dtype, + ) 
+ if key in _ROPE_DICT: + return _ROPE_DICT[key] + + if dual_chunk_attention_config is not None: + extra_kwargs = { + k: v + for k, v in dual_chunk_attention_config.items() + if k in ("chunk_size", "local_size") + } + rotary_emb = DualChunkRotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + dtype, + **extra_kwargs, + ) + elif not rope_scaling: + rotary_emb = RotaryEmbedding( + head_size, rotary_dim, max_position, base, is_neox_style, dtype + ) + else: + scaling_type = rope_scaling["rope_type"] + + if scaling_type == "llama3": + scaling_factor = rope_scaling["factor"] + low_freq_factor = rope_scaling["low_freq_factor"] + high_freq_factor = rope_scaling["high_freq_factor"] + original_max_position = rope_scaling["original_max_position_embeddings"] + rotary_emb = Llama3RotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + dtype, + scaling_factor, + low_freq_factor, + high_freq_factor, + original_max_position, + ) + elif scaling_type == "mllama4": + rotary_emb = Llama4VisionRotaryEmbedding( + head_size, rotary_dim, max_position, base, is_neox_style, dtype + ) + elif scaling_type == "default": + if "mrope_section" in rope_scaling: + rotary_emb = MRotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + dtype, + mrope_section=rope_scaling["mrope_section"], + mrope_interleaved=rope_scaling.get("mrope_interleaved", False), + ) + else: + rotary_emb = RotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + dtype, + ) + elif scaling_type == "linear": + scaling_factor = rope_scaling["factor"] + rotary_emb = LinearScalingRotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + scaling_factor, + dtype, + ) + elif scaling_type == "ntk": + scaling_factor = rope_scaling["factor"] + mixed_b = rope_scaling.get("mixed_b", None) + rotary_emb = NTKScalingRotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + 
is_neox_style, + scaling_factor, + dtype, + mixed_b, + ) + elif scaling_type == "dynamic": + if "alpha" in rope_scaling: + scaling_alpha = rope_scaling["alpha"] + rotary_emb = DynamicNTKAlphaRotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + scaling_alpha, + dtype, + ) + elif "factor" in rope_scaling: + scaling_factor = rope_scaling["factor"] + rotary_emb = DynamicNTKScalingRotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + scaling_factor, + dtype, + ) + else: + raise ValueError( + "Dynamic rope scaling must contain either 'alpha' or 'factor' field" + ) + elif scaling_type == "yarn": + scaling_factor = rope_scaling["factor"] + original_max_position = rope_scaling["original_max_position_embeddings"] + extra_kwargs = { + k: v + for k, v in rope_scaling.items() + if k + in ( + "extrapolation_factor", + "attn_factor", + "beta_fast", + "beta_slow", + "apply_yarn_scaling", + ) + } + if "mrope_section" in rope_scaling: + extra_kwargs.pop("apply_yarn_scaling", None) + rotary_emb = MRotaryEmbedding( + head_size, + rotary_dim, + original_max_position, + base, + is_neox_style, + dtype, + mrope_section=rope_scaling["mrope_section"], + mrope_interleaved=rope_scaling.get("mrope_interleaved", False), + scaling_factor=scaling_factor, + **extra_kwargs, + ) + else: + rotary_emb = YaRNScalingRotaryEmbedding( + head_size, + rotary_dim, + original_max_position, + base, + is_neox_style, + scaling_factor, + dtype, + **extra_kwargs, + ) + elif scaling_type == "deepseek_yarn": + scaling_factor = rope_scaling["factor"] + original_max_position = rope_scaling["original_max_position_embeddings"] + # assert max_position == original_max_position * scaling_factor + extra_kwargs = { + k: v + for k, v in rope_scaling.items() + if k + in ( + "extrapolation_factor", + "attn_factor", + "beta_fast", + "beta_slow", + "mscale", + "mscale_all_dim", + ) + } + rotary_emb = DeepseekScalingRotaryEmbedding( + head_size, + rotary_dim, + 
original_max_position, + base, + is_neox_style, + scaling_factor, + dtype, + **extra_kwargs, + ) + elif scaling_type == "longrope": + short_factor = rope_scaling["short_factor"] + long_factor = rope_scaling["long_factor"] + original_max_position = rope_scaling["original_max_position_embeddings"] + extra_kwargs = { + k: v + for k, v in rope_scaling.items() + if k in ("short_mscale", "long_mscale") + } + rotary_emb = Phi3LongRoPEScaledRotaryEmbedding( + head_size, + rotary_dim, + max_position, + original_max_position, + base, + is_neox_style, + dtype, + short_factor, + long_factor, + **extra_kwargs, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + _ROPE_DICT[key] = rotary_emb + return rotary_emb diff --git a/model_executor/layers/rotary_embedding/__pycache__/__init__.cpython-312.pyc b/model_executor/layers/rotary_embedding/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4bd0f5076463e26cd443776dc072e60d8d80c31c GIT binary patch literal 6299 zcmb_gTTC3+89sZz@6O&Yu*-7yVgtTmVuNGZ#%X*Bc6_19HtA^03}(&jC9{JK)}T6V z9&l&kiXquZyd%0twPv$3W^ zUWa7^Z%7+hBWvP~SyS2seMZ*Io3oa*g~%4(nzf~EM7HwwtRw9pvW<6UU1=AQ?Yukd zNqdOw;Hj)P?Ip64_htQQKapL0AR9~viR|V>*>E~cWDg(7M$=IuQ+zDjkZ#bS3`#e0 zaW27`#`IALf@@;EgK)QP{;_e^$0b=m{0HDa$ThPeu7wSQRV$}gtyse?3qov!GhBaF zOUc%~IvNGbSRJbdm^;8WzGq6ePa9H+Tk(r|krAeo7xKl7n91iDK6xs8m1EgV?pm?u zOYAqLQw+y*)5Bjvw()~UHQ7{yuEY?>6^a};acPv{!E&cYWN4D%PmWLKCU%Y`hNg2& zHZwYW`RsANFwX4MNN#C7cZSAunH5JztoT7(LpS-Nt#yN(iivr#7_Tx#PH|<5bdJkU(M55Z=M<-qFK~2GEr?=b#pwd41PhEHW*DBX z`WjqNg+3C+gfDE1+oGD*MAL5@k1n; zZwExYJjiujuSr|?2UT&#EScVYLo!`AZH;M?rZa>-uzg@3HDu7J{_R$T-U5F!wi&x* z6Ja*>lI)^YlOI&)6C9#L>yzvcs(5lb$^J9_-F73IabPbsI*u2i(ClZNIX#PJoRan3 zw~Yi zYO$%SlMwXjC4*#?oRV8Ik6C9tl1FrF7B`S!+j^2b_(gUJL3WOOgX}oBWyb(M4U&!2 z$qi9bBucL~V+|tnm^p(a>Q}m-=w`fg`rMdgD*8l!J)N zv>inIW!5Q~APc)hJSDld&1GVpcmVcILv^_3#=2Oywg*1;JgDvqz@y9k?JIMm%Y6bU zD1sugl@T$nO;CfwFPR@ycOdcp8?f{e%iUX+&%x+pk%n<5EH>5mE`_(DzX|^R#Q#Ya 
zDg*h!2m_=l5wW?BkH|9!M_72@RPCy@7U7MeUN4C3h}f=au|aKb{7?sYLIlr`YU9<` z$v1ebZD0T4dh5TsA_gStdZ&hE>&`~D@~K15teOzny|7mR<=0#{lAn!!WYBh2?)PF! zn~P*g^$7JJvuy3Z9Ik(i zu2oZAY~97{A~q^J}dv(;qGU9+x|lJDo}7y6pj<*j|MVqz$HFTok8L!*Wn`;6`8 zw~|xM!%Daa=YNsgamN2cg4%N*3Bn@GJfTvGxk?{AeEudq zCUEc2)edSrb8UQEPckoLu4RB;(zVQm<~F&eV#yM(2Nes;jWLtFsJOB!h2cgq9Z|gM zVA=z>Nu1M>s|s2FnM1bE9#aV&imys6|;F)^51-a1jGl1c14 zS*1tG>|{|)0^bptOfJb~#BokYb`pb5K%VSW?kYIVm5Z&CY+{!( z3;S^B`*CvsHwU3n%vo&3DQ@*nGdxcSqm>6jibGbM#qqo#YEB)vzxn{huMSq7sDtXm zt=N^^L@s|bm&CMoGjVu5BvWC<1T=DVT=8DxM7ml(TBV$(jL2n+iXG^6t|&6OQBE=P z8OVo8oGbuR0~A!iXT5-r76J1@0n<1E-jPVaL`5-xR{rl@q3CWXK6TZp z#Fdbjm4!nL9jtIL^H^=REOVrH1&rWGeyIP}C#(xnl5*bGnrZC*&Ma0o+d!wo); zlwjRWB;c2i;?fMssuc#X5q^afF=0qP|N8g$dc|y^w~#4x!w*47bobO`|KXP=`ng^{ zbG5fHEsp1Ny*GG1+ndd^98Yso+~_1!yO(FCIic8F{Ux!vlY418lgo%S-BXxWtav@( z+zBH9182JUAE@wcR0$!g>-NkF6;E zbaVS;4^;|uFM7HwCggJ0n@x@kz?iLd4T0T;9kq9rx`vj!&Xl{(EOnh*YJ070ZGO_W zs|2m3alW`{X;}$I=3}LffpT?hI9d1_5XU0}b`a&Y4@mFZ*L=twTM4#->fEh`!H0o|Gk@UL4SGCQv7$)Z!r5|o z@2vBQBRv1oqN7!gr|$16#}CX7%ZaYi?!j{6=8YiTiyK~B!uM1AMhGS9n60}kc(g*{N#0m=wQmKv67Aa&0l*Kgd#nT( z&fff$MOVv~i5zNK4t1169ShSB+m=H8v)@^X?yu`aro!u98}Sp^iZ3(|@pZ__J@>y` zPWH{YR^kIYBE`W11cKU!e6jhr%HEXR+hH55>pSU`-uUn{}+X90?z+($?Pk z<&<0Zt@*JE1k$<|1nHiABZd+$X;E*w!cnhzp!iT+P{qsOo*m2}T6)p7r>1JP(OS1h z>wGp^2z1S0ts5=q0+QHETEW}3b&ppKAc?sFiN6Vu#N2nzs7TU_u1>k3^S-;>&_8=l zj&&@|lw*CfXKT08vlTFI$5e{+m;L>9vJZOjDu@kecwirHLV~Rbj}4GYKo!8K@OHd& z1_JQMOHGF#?Rk_a#a~_Woh~^~%c10ZXJ(7#n{cQ`peV?vV@K$`fz8_vW zPuH&-px95>FX|u*PuCB@+(rER>*HO+z2-k!yNCPCpM<@`1Li-4z2|l2zchKzTg-p$ z_MUf{Q>M$q!$Ja-hf^Jjg{Im3C`~J7U>xv;QNUlHgk!iNXF8@0N^p}QX@?;>dN3bQ zJT*r$>BVeA302u44V)3U5)ef7w89iaiPR2m^GL{tU*($`fm01JOHmvuKURA%mr;Bw zx2128kBsU7<~NFokXn`IP#DCvc`)5kVg>k~NE4ohffEg&wj>D5aujbZk5u~!55lBM zF|wIaQ5eFaho+fa4n87cwpyfV;S}yQjq!O#gl~xXJg*pvqM))Od|v++twW21D_GAu z26z$5;+R{SS46@yNv`q&QU5ABy^0P$+1-8HK07oQocENxon_~)CDaXjprJ|b>XDOea%+ct_~@E{#-Izy z9$&?Pg(sB1V!{F}dw-~6A)*yIsEUn?e|WAqe{yc>UeCh8vahq^BK>aU 
z^nNgKXJ9^1cEl?l(nq1j=8BhyKGf2&Fna&M!uT%}58KMkhbjaCYy)#}BZyUm>k#mN E0jx)}p#T5? literal 0 HcmV?d00001 diff --git a/model_executor/layers/rotary_embedding/__pycache__/base.cpython-312.pyc b/model_executor/layers/rotary_embedding/__pycache__/base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff7d2cac40741acd2d05308eff0553bf9c8af2b4 GIT binary patch literal 9571 zcmd5?Yit`=cAg@JG@W2ywZ*dJ}r znHL{Ba$0N)Ip^}9fG`lCdPC8Z#p%1zt`UMa^$6 zn2^qDyRykUyfp(C)fg{po*Tei3#Mg0C+6?)vNE3$HA+zC3!>(gL}gCOO{RQJuTBAt z7}-7t-|GkDGNIU=i?mAoHS}B9AqkaK3EpK5P+d?n@kJ1-8)_cA&jU5&ttXX&nuU8p z^+L^-K|r3gJ(8G>T6CkdBv|mNgwscsJ5=9nXJfoYsa|VO;7pL8nB8UdHA=yy`ULVr zx3wEcLg}%3XL~KGQCiY~+!joUc_QJNoX~=)ybNC}$ET9184=L$FNl(ymK8Cl01_y9 z4(b}s^XXh#;du$QgcQJx#Vm*!@{~Bpw9uR^^3$;6X41K7nAgRp^4Y?iB0|~p7|o^R zHBJ(5;&UYa#@sXvmhi)9{Qv|-N-OzX(*_g;Ud$zLWJE#R5B>TUe#)Q(qj_X8GyOHf zQu-@v49}-fNASGNfCO>!diAN#7VN%quhb z+}Q0*COekR3u1;B?}({6P!(gD(_zU{zHb_RK>}>@V+z$Gt{2L(Ph#*x`F_k+( z%r=jNeoO#iV%FY#Mw{O5ZfxgSdjmH5KlD5>XF1O=0yc{v4VDWo)dl*DD)yf@P#9Nc z#CSS)8@`ShpO(b8=fqrUe)OinzW%&~f_{$KI;jIX;n65V`FoR6GIvwdp|4TXnS4@7 zxTQWAs_y}f2DUu1lyMmVL(*O-wGMj&SpCI8#}9Aq6KmMFV7B}MWOs==OT>27*zm8y zoy+VJyYkAa=O4HqaT^_<4gO^K$HQCUiJ!)PzV~N)E8%NjQ0`#p3pW+>muQ$4?k;gs z9GIl+Zvhosfu4cZWmGZEy)Xb1v^sn>TRD-6tty?q63n(0q`?n!#s1#2y0OvdLFz;D zf%(J*R}*(sGi?d5<5EdW`7O9r_bh6kwqz;21rPM&hH6W8Kh^ZKNZTN&sveb^b_s4k zo+Y1G_5MDtK*Pn8IUx?pG_GrwQE5L+FEZ(o+4zy?;_2DX{|nMOskz}mO!%e4&}%5r z14s>GrYqHhKxtkvos;LXnj83PZeR)&V9sY??hpuMOvai-LX;xx5Y8UE)b}S2dcbL=$TVK1!@7HlyDv(T~}Vd*5CB?n+;69^r(P_`#0%&MeQ>#dXUhWi1h?JS_~q?dq;jo!u- z^3TAK=)wS(6*H}_Fv?N>xN{#_7 zBWNvk)XQg+N@}L{H0E_Kv0;Kat0Yzi10}r5MYUb36e`@MV{CEJP!x%Tn1^D=ii=7X3QQR&kOk!G`rI| zMbfvC=SDuCkq&^I#`56Vt9f4+b8=oXb|CHz=>#?hO-mppW->evM?gQc%BUP)Ft!oe z2NGsz_Q*eg?6>a`|K=I1dj=@~lW13+g0k)2jt(L_1mj!?ptZsMSLe(L-^AhvIJbP-bmo(dI!2K7ls969&8! 
zT4V&j=ymcGe)3{r)@aZ+`FRNOaCst8CG28?!q)er^GI-+~lQLq>~HLgv` zKHWu;Fly53{n~~$9{FwFFL;dT{esWy0^b(Q7Vsub6Pko$W1XCYg8{XG?tvs|9@f*iI zV+96e36CtzeM_9zkG2-aA74ec08YV?wv*jz?d*jeELed=Edb6nHEUb2&|oHX?QJWM zR_R%XGhAq~1+RlF3(SJw`D9-RKsz27Z3*0Wuv%>k7Q7`4n5mvWa~WD{fyEZpBY5qW zpc1f{sI1DUe)zbiMY6yucp&M?NoT|OV5hIOyH{`98ma(rZ?i+-iZ5}H{#Ur|KN>6_8Wn}pu*b19`yb0$hYG!&%cf#y(77#lD-Wt zVYZMFvk>usYCadwPsgY8(mP2>IGh;OeL`s;EUq+!*)U|IlMrI?L7YQ@FbTv+W|9!B zV$(T6gaDS95mR88r)K7Iw=~b~wD^vM$Te(`qd;jsNz|t#I;0EOgP4~tV%FG;XxQn2 z70sWU%R&V57MLuW&$QAT2EP{2O=aGYfiWslUeWvpS%Aof#=w3uNBEG$ei{kvmC-jc zcXy*O+}Dt~4iJ`urAZb0+jo3xzX+V`;lRb@ngwOZ@69 zmCi#Ye=V}N8X2iXM%G^0U^XLXO1@gOcX@nie6?fEb$@&_I=o1i+@;r^bnRZgwREc# zsDZ!WU-GY+jHjK~c1&6}_qdDAeC?=u_HR`^Uc{9+~i;={9>;VUHyOgEM*JzwkI{Q>hn zQ+lZu>0Ca%bl7}q3YdcG7vK z!guu8!8d8<2V5{gt9dXObt8vX2e18ssRLo}kkTa>nV_WaLE!|}b&qkTw8C0O2@qOs zpfYyf&Y&{-{~RV;kq|JMV-UNlK~^y~)K%EUqkGfx>V^Y+{?E!~Qx! zn`ceS>e`3tHNfmnKBe@sf-aPvoYvB5B^@=SaV!%~u@6{zhd2kcI#}S1BOQ-<)M@%< zj5|v&c=UjA)RMMQsxC_t!U&IJ;Q9Yex`caD*P#$w4)7`cu0_9j2}oTapsp4*H(bSl zu7p^sw!0BNZixWnm5u>9TQy2=A_H{YB*f@C9)5xm*FzQh&>FQxkAlC*m=Rb9c(>si*fHoehC9DrdDKDt z9pQZqV1_EDEj2@?DM03=HZ15kC=cL`XIfkOU%}l}KLA`4?l9XSoM$LO%cOdZkj}$H z6O6PQ3Iikfl3LW^-at1so75R@I_WLUbVIgPdj%|T%%ls&-8-F!(D^AakY&hh;2e7t zIHv2R;R-vvc5#b+_Av(6M=R`TGkk2^m^fPImbeCrI2uqCX`o0CvBWD}e6?ro;>NK} z?qr!Zf+o(Y?jJ!S4A8^iovwr_s&mc$po4!g5tMUSZ?S7oH$LHv;Td(Q$oByc={#sqS=W`jZDoh{iuN4hO-#zi$!Zi$Ne#XE|W*;&jEVs;L*1DN66hwjm9 ztYM~SMYx(2q;Y76>#I5NZ484=$NpP5I)&Ly?03tGB+X*=In17itU2QOZEVEsDt3Ga z@B$Ifx+`FYJE1hgok$tsPNa-*CsIbZ6RAeH6Kh7e6RBRp8>zecsoq+&d->ebxw;$Z zCtU2Fx~SGEpy3H{hj4xI`pSh$Fkbgz6Kthjbp|OvMq2|&!K3ypt4nGNG*!v z!y4Tp4h{w(c6`Kk00#`|eD}hOS$u(R;)}I5zS#6szBu4JhQJZ(l+76ECdOE6V~kBt zWsC!k>H_yP2@G@-U#zw9#ipn7#Q`TgRL9Es-w;q3I7e_L0@4`-8p2uQ4Bc)2MaTGG zz#hG|^`d>$wk9PH&h)}1YBrzK-EEDw{$8L5!gUO<6UHO=;$Zrmaf$Gpgl9CWQ27nW z>TZ%Gzwi>|*e?n8*Tm4j63;>oFYb!o?XHrM3K=O!2RF%sU%Dnp@*25vc$?4(K#}WE X-Dv#sRoCk-vZLIU_=-S6f9!t)NHnZj literal 0 HcmV?d00001 diff --git 
a/model_executor/layers/rotary_embedding/__pycache__/common.cpython-312.pyc b/model_executor/layers/rotary_embedding/__pycache__/common.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af875c2b7d2602fdd3a329ad8452e96137ef9b3d GIT binary patch literal 6616 zcmcf_TWnlMb?$3lcOQOd9lOc3ozz(;wyD#kNeL}^K;NcqBA{+yb+h-beeJuC&b=mC zcP+V95nc+)8j!q2NHt$LN+kLT5+4bvP=G)&#W#&V_7L(H>@-TG4s5?G5VB4*qH!jf!In3<6hFgr2n(i= z5@$pPi>8$F%y<~gncf*M%uA*(<)87hF^?HY$uly8y=E{Knh9})C%Rt`U+3K2W+IwT z_s%PgL$q4Wzx#Mn)B91q-!|2|7uwX>ru3C zCNu>o;3-6H2&nGUhIhZcHgccw1NRv}2&a1RK69fedyPwmwL@@<{r~4ZW3Z(>mbC+s-YXAR5NX*{uL*_l*4lXbW&j)&^@ zB26diM$L{PK@E#S(>vCZ=`$#Y3w<8O%9VJ9r-Nlcoa`0x`bAL7t8&Ji>t6%!oEnp3m7BiR|6m~(`L zYF`tmA5a|r%0uv22q$jK86kLZ)8acMG}v*$AD z$#0ovYBH74bTh7B(G!b6cG6UHI<+Qg2G-@`dg_v{X-0Z>GLcE8GU*5heC zbA|4K{rcc<%|Nw6s{X+0(Q?OdacF&XZL~DJc6g(+(lrbtWqG*bA6YrWFpAu|cgTs>JhS)|28X{z|r#+jS)XjvTH2I^#YNv^<5Ly{O4zAd!NN)vXB z@pe-i=Ih)>%8k;LhIm72NMjlG1NG*vqInwO0_5eLo}eK>_z=m{AXZ`Q{tILn6KE&E zAUs=5EI8gID6p-kX*Ucuk<1vGm0^zfcyVG>SV3iHzk4mXUTs zhQ%Zqx9ps$C)!oCX)N!;t$@Yce3N-Vqh2(w!G`&!qBrJ1y~%D$-&g_W#Q)D}@_B&{ z+7M0LGVg0Ztrm=!*90)Q`iU=sDlCi3Qo09bBv6=kH4@pFXUCY77xUZ_@Fy?jd2E*; zgIV@K&m9%(qp;<&7tYh)b{>}Nw9bN1-jnx&3&0aKT|nCkRHwVjy({J$nNk~OOEyeJ z=>W_zHEn6=1liUxI@OH+LG&`5o>x!G0TTCy|E?kWoNgrTKnB$@Q zaX`Up2rJ0n22+37Zim9F*4o*kT0FaYrSS49U&Vpq>EgNKE5&oGxxz&l^2@7A9LKe%p3_wgKy2ccv7~>ai4B&27k(2-nVeLnTA1J-{zJ?iKIBp5Yje6ir&GnwD2tY^ z0IW`~RMi#o(I5#9Zwzgm-yGRo-jcyje!=yVUQJJ`i>5uX zBle&hWFkYajj`u7J`YjbhMrr_L(HD0hoKYW=r}YME~y`DuyKlZ3rH@e6L@9%IY4|0 zSGXsujS*j@?A=?9^sRrGn=8To!TR%J!~PpwT9(>m zh2GQToA-q%Mxc}Physx{`4JV-^Drikjs3=cg9up=eS z3@{lv>xc>6FdYFD5Go@b^U{-u{wP+cp)HA_KrcHY^2@?ljaCcAHQc?s%8#mPcnX|B zGVr&)2^Cmwn8=ZK4oJMn6F(v_UxALxov@$Ju7EF@{k})C~-QvdGsBzeEhjB z-!x1IJuByMI)d*1&CmZtD4p9F+2npYx)mKO%VTA6j0u;15jQ-I)j_Cs7!it!33e3# z^aFWr;SeA~8UV%u`QY+rs(xO$!8cf(XK4gDeb%BOEQ@WTM(1`{$%~NRHq)48DKCML zr0LmzUc2*$=Po_}ICvjCbequXxu{2F+=Qo>siv|qPune-oN$#Rn?_n!X&il%q?bL>_ 
zdF*EQL+_lebU*TobMGGg<API#V^4Ga8jIRnm+5I)gGVy z@mpAa`FMoyM`VGMy+S3JdzNuL3{@5A3Z02So&-e8e?eW@Dhs>aM7;DZ*Cv`e%DQ*C7Me2zFi_{n93`>EZnpTYTIK0{uI*`$+ieV0N0@YUF z>1iYl-BQ@)08>-oehNH}tPHgj_jPaP+{n+A7#{rx8&N&0$1fZ9Tzm%$;zpWnb>qn(Ed@L8X_kRjdcvIgTVVr_BVeLT~z0^uLR z-x`K$h5S8Aq%U)S7Gu?}foizB8jK*&H&X4`^Kr;qk19K42+LgE76Di%IYK7^xJKv& zXznq{p-($ZJgF||^d(q#9oM4sg+SOrB-vHEfls+eOmw`NY&@-|bOtcb5KJxN_W&yn z9~vz9h6Ci|Mk;HvBgS~6W0VksfEI$Ku*b%b8#p3~j0q-;>R?T%ml70>7uI7Yl*yPD z#k)FTAeEsue6^TZv<;J@&vl@xAV<8cW&o(oL-*oO`F28XN)PQ$WJmNv2s%@K^W%|f4QyP;3A7qezPJTae( z?vEKhlRjfO{W^|Cv10Fi-zl(wMN@x_nlOeA#QHUOr<%ZV+_sN!!S~65_sIa%e<8uY zl9Bhx;rGd@I{^~tEq;B=zi-7;^9kIc;xTliH3Hzqmwxu_PoLdJuoe`#{>Bu5jVSaA1dzR^nP(B{LH&}0p#ZcCxngnNZ@cz;Dw{wBfbwg9VZJV3R_NO!b4GJ41F<2=>w zzM25-ZGW&PK@0bgJ8E8pd}QBH4RP2Vwq=Br`cgT>9j=BtY67&k!<{t=T4X_}=0(Vd zMEnQ^kO*SKyVN7E9>wo4uf?}RIHLHt-l{*0`}bD+2HbY|pxaIy{rSSR1qL^n0C;!$ z1a7o+48=CO*^43@tp$7G9`J(97eFom--fSYrwXTP62NV8!JdzFbl3a{1xQanQvexz zK@#q&g%Ao8PhdNO(C2)d+$0NdlPth(WU=RxMW{s+G(7;$*Wi@mX2H+H>yg zY>c%{InJDW&i$La=brDJGx;nYj}jn5cuW6GBIFBHdiz1UnBUlaF)aj7n8v1qTI7;glmg_( zM2lV`T8xSty!Xape38q?Z=*7m=bU6|!8B>bR%(W7Pgl+IZ4O3YAn{_=R?Wqy%jc=4 z>6Mwh;KWSmvZ*ptv*)@>g;p;*`xjNSqD<=*O(|7jzmlz2D~hRBW~h^4V}@R#stH}? zno?G+c}KKLszIG#xzn1|UN9@AfOks91>ktO^x8PsW`^417Z)D+;iL19(1M|#FVq(8*=nV5!7$2&a#f>-LNC(N zf?YKWhPp^itAGYH7ZvIowV+X2vnZWc&#jMuoL~YSYFss2U6wg5BJ~%-Nk1_`A zP0|iY?{dGcw-fu?G5Ov8<;UM2{b209v9-zH7h19W`k7X&a6>5Uz+6&7=F`EBu=5uX zegjse1ih^o(6}P61>n!)f+oHn1fLSXrv&gRL5omPi_#$ak|xp6T+-k2ZmY#%Ook@` z^ONxOfxHi%l$M@8&?M8GmboMsBW#Zp?2*2^)|aLQt^blxjA{RCJfIE0xYQ&Uxndk{ zbI{42?O5?8&xm_Gw?4h6BS$aVw4zZh_nL0c=6<2RIF&0?xI1kk8qEQnF{!58R?e>G zeo8AdIcn!rViaFKo9Q+Uy-6bD%((ocvtyKY9 zh=*yl1is+JRNL+Pqd8cYs}{y*}Hi~H4H`5%T8pLs+wZyb?QVrj)dI< zW%Z&`t6DmHN7uD1M>wxq)QRhsf*4gSdlB#^`;OK*jHua*HEIsvsRxk_Ap?|oj&Ba5 z1i_0;mBPT0g78G(VbDwiz<$dCID-wM;3%U;7_=@l8xhdD@N7gu>jWc08Zpoijf`vr zn? zg+Cp{jSLUMs1V_Z|(bJ8(jyTtKLRi-! 
zje}L4OZEG()dF_co#%X5?3!R>;$Sp~(Ah)R_ziF`q~5!u3)W%RrGO?h0;nxP1%Qr* zZpn1P#;armdvu-B?_)b-!q`K--v_;W=AjC`J)*OJ@?XL4UlNysnrNf%l%4jrI`!{-sa@a#JIOvrq@{g_vDn?ozlBhKvxJVgdLz-VtQ+ z!_YWEjb6}8)RaMCfjZBzfqa}n$`K(lmu8)y%8C{baHBXE9mI)ws}&Y2@xph7Nd`-d zhYW}Lh%VT zzIHOblDLupX{&#DW&X`(uB7?3Vm<3?Hv>pX17-E$QfX zh#xxEjJr`1KeDE-kGGCJ(vlu+%cE;cE%_mwJn8Zz(&vUras&^6Bg5?BiCI#rJt zq6Hwe-JQ)h_&E%+duXGh!)bfO3x-$$R(7Mr-R0k7rMT;aJ>mhT-D#*;uL*wYUkWyY z5COy~!2SL3Du(fTL_aZ?8$XrP=WhQ82)=H3sGI=$591Vss(Aw0I5LK3yl`^@m=j`| z`s5FsAk{0@LfPRaxD9zcs&l_Je2Nd$uJXZ}PqRwvw5Z$d$_s7?Gi_tyT^1!Z5V6;>8KROe+?Ada@gR z3a7<8bx#SZbf~8|a}0Vd#O{+`(UZZ&$gt$K+Q7cOM79IG7Wc+HK@c$FQ zVrkDSHZgKJK}ZtiYQ@B2$V4|bSr%kz4==^5WYL&ciT5_FE1mG;-nE`Gr$9y9TTcUX zd5+_D!i1B)AktUl{;$Z$Hvy3g>=0nvNuGOPb1j*M|TA literal 0 HcmV?d00001 diff --git a/model_executor/layers/rotary_embedding/__pycache__/dual_chunk_rope.cpython-312.pyc b/model_executor/layers/rotary_embedding/__pycache__/dual_chunk_rope.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e20ca4d5980a13f5f96cbadbb9311e7a7e977286 GIT binary patch literal 10549 zcmdT~eQX;?cHbqrBVHgX2u#2Mjd zf*-|GoS8R6J1yhxN);;2(I8`mpN+GRc(TL@$uW1CVJog@gF3$7t}|ld9CuB_cCJ>r zfpbDRVy{f`FDds+U*iju;xCTO&5kdzTt^{y%7^58CPC z{CA0AH}th`#J~k$gyDVNCWpQ65#gW`9Du2F8qJ({@x(c!&71T=l@XdsB*#)?iFhOy z(kE4DBq4-Q%@D2{I+aTCaU|5ESV#E?$0o zQhMR6QzZTz%C(h75+W%QEUW?b5e<+ltww?njgT|xWhTfW-S`nH$Z0zRAzB~@^Qcrt zLyp#SX2|L7OZheactE4d{~Vdp*UNaY%-5*#CXFipYkm1O@fQYdcL9rbO+qBbTQs`- zuhqH+J<3=!wv0W)WE>#dT9(l$&LCPj;}es%Pr;~Fm!m}6E6JHf`zy&=MCO&`Xwk7* zZiWb3htgDZnu{D&T+u`lrc0c?$wnj5QGO8iRNE9UB*&6yvMO%9JgN$mYfO|F4HY8s z1RIS-CZ|e8e!Ykti>G)&Wry7gm7E463e5t4EXA?{@-PB=Edufkf&(Q^l%Q=@s6&Yh z5`{`7gy^VZ=J;D-rh*%CLvMiY}$)8GX(`?zn4OW<#yehKXL>5&m$ z2)h(hl8=qlwvKF;o1j=qqomL`wc)U6!%zpTE!}tAtn9X8WzpaylPsG=dPAF@JO6#X zH#IrcJ2f`74}8-NKDF=m^r0h%Cl2wwv9asDQ!}a2M7;M_EH>FYnc(;s%irds(=Z&p zvB(U(u->gct#^~Zo8o(>W(2fE(mQMIWLK!ct%!4C259pXiRusUZgG96r_8w(9> zS<}y*g~na*=PGo(nKiA^y9&M_WUYmkPLyrbN=riiMkusB4>Zlk=3==kPYymk`uJ$R 
zYe4QASnV2=I?qb&=T-yfLCC(&xbsYh%yg_W;ctwD^(>jKzrT0kthD=$Wk-=Ftd2b0 zEYr` z{-W!%t~K}Im%e{!{kvA#{o`*aqr>@)k@DHHbde$4!K_Q@1SL`ZbwEbQRE%{Myec|G zC!949b}X&*Y6uoqCfby@Jiy(tg$3Csl%^?Dzm08|=>Fbp2e+|ZCwjg&+YQ^;_KLpm z&35B9w*6w@d$Zjn){8;0L2MM8Mks&*=FKnwaBwIR=R$z=Ll_J92>W2d0N6Lek@M^S zh37HQa0U$J}6u*i(#P45&y- zF#t-%NE$0`R-wdxRM)DWU1K~*-egmX2|#p8F-}0@sKnDL#j0WH6hh$`eP9%>Qh>r$ zViO8iSvm?=vlL)>XdV2a_L&iohb;O>c?#e>DxJVNS7jRze-i`A+qGt)QauADLOU9u zT`1X&l07KtfFy~AZxi(2lEG+72!T|(cYFKkxQ`N#60+}L*81G(y%(E}EgX4r=IO=97uTGJv&QEo*md;@ z|8(N<#G3o)@>SVAn0LJ`yWTFieeR;-&(mU2ZyqbuY3*jm*y_z8h*3; z)2>gt)_iZS`2M!_Z(3#F53<&;JoWQkb6pG3+~HMEch>UU8=OBjcWfb*`|+x`Crdxy zTHd_#;l&3R7adEtSDQ~tbi?yt%R~DEd+z+=?bYDntYh6>KX0G2|I(4A3Ke8s!+i5x z^DlQ~&4miGLZD2pH+JP4d*#O7)yBT8tFSp2XwL^am9b4M0NawAg9zNG_e z-Z%2zBeM6%);hkUPzNDkr~@PPyP9cfdwBlA`P?5ZyQDqmzP!HH{KI_nWx4tCRx>SE z&u9msn1S{$KUw;=1=O3`r>2M`#3v}pQSCETfY;2}jF=)~c~ zoUwv_6zAFU02c**wmhO01(ij!NNa(%3~Mhz4~Shm~?rl$SIEy^KV zvll2RT74|4f!10h>Z|}xRbVXyp6w!o;WEXM6(AQP9-?ssMLoDl}qZDxhzL z><441QMR-N3jIh;sY-G>8U?I@RpPL!;#D)KfQ<@p5doI~imyUK=T?b4mclnqxUKjg zB5-Uxk{Y|kr#ot5kIjO&pk>LoG3F>biMpoy)VyuZCb>^7oWai6xag6ZyCJ`L9g?M8 zpZ0vx^UQq;Jg_s6jourb9hIEN7K{&(H)+ouSa3pi(SVYHr$dj2o;ixJpTJpsi}M6@oPoCMk z3Jp6JlDUD8-g)$nyz8LUdQfh7BYUAx-?;Fa8eNh@eNt1OTz_Ep+3m{b9-7UMjE2}Ms<(;QxW}v|M3rt|cVyd%csiKVtH0J$%vcC_@?v$CG zQrqF>BXZk-%$(6>1GcOUWb2#qfdg{jfG(R0FGe2ilbQW_2HJIK+4_a+GuM}-eE7nb zr&gIEy?(tdYlr%+?fD%?oR_=amMdgi!T&dbaA;S)^_bjx3=8j( znLScR|B6-aI4?67bj2YMZhir?A~0{Av#z`S1$Se?-BfV57u>swcC*8Yri}MRofUW2 zN4Vj2X48N$)F@u1<44XCJN`}}YDV!$v?YL}DY?J*AhrqD!~zd>!c@%#nnZ>$3|B*$ zj5fv5^+J00wkrXN{WYjaqWC%lFr5UvV^^E(YRes2pDpcQsD_2v~?@?CY*8Yy#1hbeI?rGJr-F~b}(rC|&K0s^hYfJF-N zaMTY$fNCJ6Dj>nIt1;!(MxffQzv{>;-*Uzh2ltk-h?Z(Grt-HE)!+bU*4hlN(J~}- zgK6v^u~Wo8^6USBBR_VOVF#Sx(bcwk-}XMpD%RXvr{j!VtfRy6VOT& z&=gZ$DR!zjCVJfPuIO+X07iSH>{sq)ZTHyxZ4pHzn&iBvt=js*>_xo49 zU9)u7n0*giN8aA@%-#ZUk}D8hj5rx>YNjJ-zPn z=RF;=r(=;^6eLf_s^>u7b6oZuUw(Jha}rm6d3k$BzWv~{_JhB=`1vvLbN<#`yX@cn 
z*t*oYy0?GDDDOQj`%hs$QXbARLf)w!!V+u}v3_3-lbHUFvE!F7LYcCg^`-}~w8 zPZzGQx_0JWVc8X4+_~!NmgsJESE{@dK`$5YO8*1|zThIpf!g}~TD{nU{b=h0_w6RQ zGa)$y(ku77k|4T|PXlnoGVtv{wL=VOM=MvqB`MoL+V$G#2&Av$N*NBmJE(TEQdt|; z2R)Rw>0e{?rybp)@&_6H>BjC*jc+pg)0XZK`YfYA?d}e3`Z5D+DSwrLv*iyn`ooqA zAX@9X8{%+n&>MwJCeiq6muxovc1nn_uqcIed(Ej|7J3GClXoD2+3pN3_#WWsy@4Wy2n6neMGGcrqHTB4ib)$p!-{rHGDNVs=)fdE z0{^`aXFn{uaMq2g)?w0vNG~RRgw6S%eoVea)=|M6UnJmfiTafJgxNr3anFA061hk{ zWgY_q(Zx7P?Ct)>{wCG94$+_EbH|Gm(9k$AD9(yS3(C<1MEb7HUMpHr&PG5u3PN5v zlUxTl46zu(=E_c^3JX^rUVCs2rh{zl+;E_>3kGb#LY;=rmrS5cg$Y+OrfhTrE!~ z;;M&2zDl9B?>_KLeN3o&!?@_UI{n@h{sgjU4kV95QZ$kzxnUzn$G;K>{+;NCo*KM& lf_c&-lOD;tf0gWeVYpx<>lSWr5a@ryNEi-$gIQSn{{i@HlzIRF literal 0 HcmV?d00001 diff --git a/model_executor/layers/rotary_embedding/__pycache__/dynamic_ntk_alpha_rope.cpython-312.pyc b/model_executor/layers/rotary_embedding/__pycache__/dynamic_ntk_alpha_rope.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5fe01b44e6e155a4751d64b63c9d8657566b63a3 GIT binary patch literal 2145 zcmah}O>7%Q6rTO{uCq>@I8Fm?Q@4OB1GWV%6i`(GD(ZnM6;L5$4x`P^BwlyD>&~u= z?K)}?k+4;v&;vA_=z&A3L@Gy)+_^MrrADhBDglSwT!o5?3%ps|shd{iN&Du_d*7Rx zeSg0t5^)6N$X9G#LFjkB84&Cv+Rnpr4G}~H2Ne+!iR1_su_y{iM5hpumk?2?xF+@B zMX4cV)h)i6-4cLjg>dXsFECl-qskm5#C8{~9<;}k{mphAgsq62Pn2+$YjMPe+;#gR6u3q|8I zlud*)CmXI=v8~y&AAjIfOXjXOH>oSn(>&uV7sQ1*?pE>sXDYm-M|cN&&@}kp{Dl+#lXt=jSLP#jMpI>;f%Yn1h7!Y*m6wY$Jpl>9!*Y{|FH`}rCO-k)f|-7 z0u6HY+Oc=vId_Z}9DA-%Z3HFHEnIM%N}=Kr>R?)@RxR*Y!7&?@`GtNkcS2Vnz0_kj zIM<64W?q%%s|_{|7dH<-zcliLpBn?Pp9S{lGU`sEnb+F#9qq+?TA~x%(55$y^ta7h zFTMB6$zOA~KKcBiB3D1at^9a5kU-l{ z&OpQ0XKRVjgj6&k6HQ1?6JE0@0+Ppn7q!H-{<{~YfM+hEGEd(wY6(F!X1KaMy2}XL zb#bl#FN^ZfNzZC;UTo4})GR73rJQ>DytTe+Q3ZSYIriAlZBtLEk|`PTmx{NNv+aGQZ6{hxqcM)xPC z?oQ-4Ci3mr=2&XE(P^w4y}LKJu{XDAB$v;0&a4DKUi#tE&*K{h-rO*bJdnhF@drpu z#@ljNMU&HaQ`wDFcJ<}ksX|-3XN)hO?wnrPv##76xiNCbINBEeNT#nDSB;e~R}Zd! zaXa~TTiF~? 
HiCq5!WhwlE literal 0 HcmV?d00001 diff --git a/model_executor/layers/rotary_embedding/__pycache__/dynamic_ntk_scaling_rope.cpython-312.pyc b/model_executor/layers/rotary_embedding/__pycache__/dynamic_ntk_scaling_rope.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..919ae68e495b2cf22de99ad7736ccc6087a183c7 GIT binary patch literal 2300 zcmah~OK2NM7@mDeD|y$FEjw{Y8gJY*twL>)Cas$iLZL1tG#K)r6!kEycgFGBt6gPw zm0F5Sb1+mU6gP(wAKgJt%vCQiS@qM>6eYxVZ&+>7_MGi(cIE+m3nx)j6bLGcL?}u`q%x7< zli^dSLJ~AVlCY{YIU8&4u|`rPJ)Lc!X@T@CYQ+>uf!x3zfexG&iMA*gdm3n7C~DVG zUJs3ls%w^PYx4T%*DTY4Ywo*sjqM{CbUvUip~U#w4rXF~0}L2`+rn5nZNT`1QDO(a z5qL&0LyfE43xQGbDf5j&r7-1q*7W%@MKZ6CYTo5y}ob!gbzxelH-t-xcULV{|Uh6zT43Uend zb&rEM=AqoV0Mie`eN<1>5$*~EVhoqmI{Ge!P+h2tb*V1bm3neL(dG)M3UQyzOH2bo zsQ8sKWuc0(?b-px`BW(T)R|^U;4*$914D@MWMn;j;%ZEXDNJE(Ii~Mp>~k6JPCoAZ zJ68xwfzV`UdkA9i>b@cI-FJ4GuD!l{3$l~=94_<#!AX-3$Fre9(vu6crHz2#@ z8t5(vz>{}rF*;bIE^wWL&!|B}4%Q+<4MN60;CDhFpTpOKsYHylz-kRbAKg_1NgmyO zqb{y@JhC7KJTIa#%adc@s0#s4o@mVW#%OnaD6Mz=v>@-XbWjJ8A(f@Ln=L4HWj42K zCr0ZXu#f=$U`*XzyZq#y(+=(OV+6&OgkGdPOmy$3)>Y_7$w|h8zr}Bz7ot+ArR;ZS zjnRvSJ-hWE3}~zp^n$$BE}p1lxD%p+3j46-mCBWXLe1U6(~RB>707;TCQO*jbZ^j5 znRYxg2ouzH{Yoj6EYA-m-*y3+K|aO$!B&{$jmDvFRGXpNwIu|g4@P}9%#Spqc&K|BI0U+xizCt6`0Fbwe#I+_XThldJ4{esX_Py>(LdJgAA2;gab}_k zzy66vax;@#t}azK^}Y?=XzIr5cvC;Uc70>?QWJjqCE#ylMw*$C)ho@+sclKn-+A8K zzw9k}s{=m`|2X{UO!LIKX7BskN$J@5Hj=XA4Y{SFfuW6DzM0FfoqUojG}LEW-}2?9 z%PWT;DnIu;=y|H0YY2a42k&e5w3V;cPON?TB>O=_+3d|N+e`M{xrVd{J_EVbz0}It zRcYn)+Q`%FC`e`o8XDsX1^-CUFX11FVjQM1h9*{VIMy(}SuvdsCy6ofEFep^8?ZOH zv*jtzbJ$UUQQAbkD@6j)mcG&s~sa-^b>w^}KV(kL^~>fuO3L&sV=N4@CC@m7YTEXocp z-(I@i>f^h9l#BcL^sIG?WaVU&HGtIeSq2$QFf^p+RnMi k&H$f3!#@I3OA-WO=RG9of1wk9Cr$~2tKYmputci=1N)8{PXGV_ literal 0 HcmV?d00001 diff --git a/model_executor/layers/rotary_embedding/__pycache__/ernie45_vl_rope.cpython-312.pyc b/model_executor/layers/rotary_embedding/__pycache__/ernie45_vl_rope.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..bc103be178a7cdaddb5476260733d0a01f1e48ee GIT binary patch literal 4306 zcmdT{OKcNK8m{j4+llS?k%!5{17rse6Pz$%nPftSCCM<%EJQpd0#eJkJGc|S=x!4n zwVBb1kvf_K$tmVU=ae8;TAx-qa-T~#VHIzSl_IV7z-5(H*vnpa|LV5E2@aq=?be~| zum4qlef3w>-@i0ByAZUL<_F{_W!x491xYGKt^49c!JMx>>z!D zzV_Qan9ra3VmxB*Pb8NWIFZX{;;9I1mANO!A|U64h&U{AX*QA^PO{wXq!^hSp5quX zc|mv9tOVV9C(rT)otk9}x{YVWJfBHe>bA!QG)|VCzeBn$BZ-))6V^zEm_34!(6Wz| zG^FHr=y!xf{b=^Y3|X^&+cOm2oT5cSGxaf46(qutS|#fYZ2#Vd)Bg9Cr&X^vp)cBs z_V-1!Nbo6`$+6cHti+75p^UK_Q8*e~GG>gH8bXXzM{A25#jnU+f414))>%BUdeZR z7lJe1I&qbhybX5^F;o~lG?W=T3P%sJXPUrkd5W(YXvLPghh`xRFMi+k!mZp7DIJ$HA1RYYTF%GL!??8qG2M{Shhb>?R60~ zLaJQ~?T!fYo@_LKu><&ldX!CCvM>~6Z17!4?V%`=5^3O!W?J9XdXu+dtOcGL=+tHR z0TCY|qe)>2b`>nbHO|}qY4r{$hCi_WuykPh!@1k>Ci6&%YKTrJ@c%n>D2e(Abh@zC z2hr)SL#OK_(CPXBI$cutN1)UB1|3QQe5T3EC~CjJZWXh9A_*p9JTa@= zdDgV05^+(t+~wFg-I-+L4EUL6byu|p!BgT~(3?4d&al~gv>+BzY_vzW2yAMa$2U0d z#2;K-=&npYO^exCHY4cvYEU`_gib+ty0aP!oqS8ev^B1%H`hpPyJwi(EuDj98cIe7 z>%yZs-G;m5+*@4?OPwb?tn-A2;K@`nI*gFPv>B>%y4!fBX%j7{IRJEYmr3U0f}lI$ z4X@1n9OEk zcT=f!ES+W86wTga6M5LtSSns%c_CKauQ!EAj5TVQ6ZG8_M3ZBWLYp(C=Z;&0vbAJ+VbmFt>6Cfh54@bampQ%cJkxp-oQbq>hqE10GBo>mIwA??$V^*cbTk<(h|Z>loo;#x=f zx)vRM)}r>FQ6p!x&N2D?*1=er&<-9ipMUm=HaPw)tzNmReRHkUyLdvLco_;W-&ne# z^sa@J?((Gr?t@OiZ8V6UUHYlN|#h$M44Fqe&zczxiVRfYlly&hlVuY&<;hm zdgO0j+fbl=Il2@rT~n@>Cd>X8{y}-{|2_QH1Ix2Zv$7lG;97E(h82Eo@DHO;MmK!J zvb7TIC>>IJ`jy~n&q@yvYPequ4#>_*C|vqNv8;Mmyjo;HJusk!j?1o>!M-(aJ+Pj5 zHn=%!YtT+Lt0xlcO!ynxA9@OCE53RA~ z%Nt&N?)k4k%AbNy)TujYxL?vNP3tb2hRZ6S!gLc&-^s^QRf(OZnQVfl`4HIXwkz;n zWO;a@^0^diEQcQj!4qUU#)`bTh79Dboj_WoF}Z|EjTwLOC#G> zpto(v7kVfyNZWQiav<24n+rEf<63iM8!Kz)kJO87&-6`tkp@=v5lnUXB|M;bepW5yB**{1UBC{-v!a!*>g=GYV(d-J{FAEMDPf~Io6P~JL#(BJ5w(fB&Cl>%Z3$w+2YG=XHF%&JV* zH{oND4}E}S?jDkP;#*@q{t33gB>k`GVEh#WeUfov*L6!Y3S-$RBFl<)Tk=@FK3*Se zeF?u%*jDC3hO_qd4&i2_GE;R%=VQSC4vy~nN91twx;w7^WnZ=htm zbL5($5z)9Ii7FhY<#b|-L#SmAlLd=tGLiATVohU^0-@~LnbL%V8WGoQ=phGpVx 
zUbU2*N*XLUBTANTm??Zi(Im37eoeoA1&gwnvxos=VgXN!bHvprE13*2hz4rCeHuyz z@jSsA0b`omNV=Ie6%9+GG)-Was~WbZ6%)^kg+W)DqL~&EWvpkgk=HahH8$L!V$z5z zo+XAxRGcFQR&_&kw~<7R3N&|emC;oUyNP&8v|vNaYXJ!4$!J?8O<~aA)_P)7=W}9mc0*k`ck9Ni14eR8?##SvaNV_OS($&8d){o|<{R z)m?bud&7gcfJIHlMK6BMGn?)vw{9&A1IxnjnL(UNrGUszy3`=PwV=@z&0C>Vc5;%c zstXpVV4)#c7}lGD1u9r*3Kled64rJki&V1MkSq@8nvz8-S!_xcwH!!#hlRjPI)u{5 zOV&P)6isk~s_OIRrEfbJwYJ4%YQ{Hgaph@RgF`%euPdl0==rFwx~V|u)bL%_!w8J` zZ2y>0i#41#38Z1hBydk4Lln4H3(&jCvQK8Xo70>zh?ygjrObWvR(TuT1SU^0)$3jb z=rue=uqa8yG$B#v6cK+~&ojZhl!B!Tp4g}O$`7yB(`v*Zkj1Wrbo1*J`7CO?fz$c) ztf=U-6cFZI^PJE$TrR;>io^+f_ezkJtaC6cE(kf@bgzT3lXVU^C7Q&EDyECYq!~c%h#aS>XgCS{@YB+ zZ~JW|G*>NugYSOOM!)Pp$YyMx&DxyJ+kv(EDuZmqpaOc#j3;>~tN_rMmIyHB@EKJX zEh7S293H@r7*0?S6iu-NAsKQwlc*U3ARZaiTq8s~TCt2a+EFnh3Y`-Y1W6T5QxHtb zL(`77yZ_0gt!yrxQ*uL4fNm3OXd!>$-HWpqNLp2<(z${)t!wEyRn4Zex=d7oED$LV zK$}*@0sv~d9*d^uL5w1|(D1aO=SV77Fgjt^;=r4#LaK4D_kh)WAG)vaqU}x;>0Z>o z%x|;}Rr^!xed+Q?)xqzUxsBkF7s2TLKqc5yjdU#**WbGI+tJ?*t^er9+pI4XejP-i z=oaWeQZs5Wpq_GL`cH_^5_o-P_bO$F2D8ohc{8+Mt8Y3Xlz@W9kfI6JYb)2O-sYJ&tzr3 z)TSM=S(&8|h*HD`9!l-^?2z3jOetzd>}Zj4x7q9rWo~J9TO&4bk9_-bgm*YTU+0i@ zxS?YI7&f>^(S!B}oWv?f@=d}iKY_d`wU;=H-YK`)T!R7uZz(9+9z)sdUo-#jk(!Va3g#_KtE&tkJo)NzvJ;6U^P>>WF95`<4MK^gJMV- zoWVnvu`=`OztG_Q$pV%dH0hIZr$dnRY%XsRfSEZVV~{%@qPSo})7IhP{!7!2Uo=GR zHZkbS((x0(tbEpC;o;%10AN6g7GT!2NC8&NGCqU}~h&v4IO>H*~3~G#HR2T1RaQvRI(*}TO z)eDmo(cl3pG^G_Y1`;kt?sUdMs-?T>A2=#hBu#MD7SFyJ+!{{ZBlHm{G(Q1kxr?fa zgPVzDC6Qb`@ih53`8;v19IPf!m%}wb>cizgweR?5-#f$%U7!Lj>Yjx{CN4w-#dGku06cAa(GpI*7Lmc zQhBU;;NVj7Ve*U9<&j_1!SKl|)TrT_e5q}qF8#jf<8UG!HYi3j8N$CuAQ-5#mn0)iXK=@J&zn)xv)C? 
zn@g3*P<3GRHD#iRe{wsB;)%s`n~Aq8iMOA}PCXfU7FfIXr`W~y;KlXW#TT*82Zj5E zt!PEcVY6DS|;_&agddG(Sdx`-Cbhj8B0>Ui@@NI23kU{@{!PPnGeP~vi zR1|yZ(6cuKciuY1@4PV~zD?{7)lX65E_xYDY{rHvv7z4+fQ$%|@zB_Ig0F2OVlk3R2ua#OmlIvl+`#!=AZ0)bnU9sf zqj=YY&+dO#>!70pDBS*4f>I|jKe)<2jXaKQAs}m`4Dz?vd<>sjJ+Yd9Cao4L?_7F~ zV7$e`$~Ikbed*epLE8g3G5(N)akj3W!vxS`ddEx>~&B#L3!gb~hvr z8;Ju_9aRV-QH3MNRJqVoIdZJjOSY|KYsCQx^}uaWib{LxduuzPX`+s1-@G^TX5Rnp z&#_pTz|i<_i+=?Od4WWG2rXp$BS6-PN>oN8)6JEcW>i+?G`7S|a|~h0MWXU|h$>KS zlWX3a=BrFL@Dhosmkhux<92?fTZ&QrtaP2KYOy?%Z?ZOFvPEpy0Qm+aQUX%QG^;X{ zQ(5>p_;|{L6hRHpz+A`M*PFN+gmV$T5PV_akEqc?vPKGw8oLvn7XAAXxZC~Ty9M6o z>j1uRjVv+Kkt-zI=_Wqcl#+64kLqq{mMW@j7MH0THhi;WwODd{OUja5(aoY&)XOq$ zTWY%eb;YD^tZ2$*sxQf=Rn@2~s8+Q?-3~LaXy6aIpyVyxa62`9Q7#ztMteh=EzZpT zYo|*$iZk-t!rXvCtp%ezm5;n-n+P~YWV-<1J7%5OL7T`s47LaqM#mp^2q<=k3Io-O zCLtESpiD#gku zWZMk2Mwb@GJ|3SRqdBd3Jy)q(vwAsqQ`1VhlCDxsrb{%x0DjMDN|hRBt`*Lm;N>(w z17t(5(BqY=5eI?G1N)}M@wREGlzAS&o7=?eCxfFk{x>n*7(9+^rZMyZuJ1Gsoy2wE znHXCQI$~cVnz+Bbb@nX2IJl+<`ZCa$xt4e}h*U%kLz(KZmyO8Ox{>>+=fx zONQNjUY{R>_>Uc+kAUy(|JpKWv;T_+^ZVKb%B;$+1T0Kif6QU#?irBr{5<&HX8)Iu zo1Z{Fkh)LDL5V772YhW=-$w`X*#Yn$cySqYY4_Onx)QX5(5v{VmzXAo4=_QpVWS5Y zw@c3JrOJXuq3&+_ZN#s~-STw#l48xeY^CT5Pzd>1S6G~-2K7rX8#18Un^ozC+Ao-{ zw9};)m77$fWw&d$O|`c~L}WY+m?d2YA)zvDHaFA2mTzLkx2yNI(TkGO0G8iBpuqN; zatI#@LbgMtJ&UK-E;{ki4fWUfsamL!9;~NNIO!9QC+ep!IWW_gwxG%~L-ou_Cv)=g z)%u5*9hjNRTVm>YPyB(kzPPry_Q|7^({r>YHX@z%$e8%s`d?8$reGgA)CZ|wqTw?4Tx z*@(usdWW6pFjV`qXxF{T)yam`x0N1oq>+Y{dN@(fS_bZHUY0$ixX8f?v zeA~Ol zz2hmALaRwppety1c4u~GZ+3RRUH;i_w<0J}!_UGQ8$w^;j$Y_8vH2npw-ARoB8o<} zmKY^ClB1$zj2fi~M50#_XSjhlny2nk+PqQ2G!Za9#GT=X1k?dSai32}tT25tc7^A- zaD4pxtjLEnajk|msAY2=X8aVa@(3)UQIaEgiX-8lf`0>V082Dya*3QdkTp;yX4|v0UX@juFE% zykZs9jb^xTOlgd zK;fX;?1t)>#4RLkvF1!^^a}{oG?6CLRN9cH)5g0-jZL6wbcYxYkP0oPlDwdp873SL zOAHe*D+ZB|jtSU`fYl2YT;a1I)Zhxop<-c}P?Qx#h7qyMKCN8;Cm)nz$zU>^JOsf! 
z&P#`;Qb+pxCXVpIX!uGnIW1jH#DkO3Xe<~@aD0^Er}$7xN(jLyJIxDXuVul=kh|L5j!2yj@$Al8GWJ=;;D<+vSfxo8uKVT5*fUWL;V$oTmfKRQ6jezg3 z3vMjpg=*%|u5Q_B0v{z22M_pX!_&0zs_Z$m%oRM(XD#cVXYzYr%z19NF14&_B<_~2M(o=;xCh%s>ii5`gmXkevL~C-IHdRw4PVdWXy6zJ zK%aznODz)K6tzUKyLOZ`seiZLW(@e8&cL^1KpRl7>Wnc>bL0%n$uzoTXxgrnK>YqR0FnJY-6n!8a9_BY2TJ%H485(z1gAU7g2S)$cUF zO=$@~V@X>g&wNupO;y`f&H^Az>VV5p6W~jZ+S)Hm#aHb8rp^lPzFoP#Z*tQXeLpL- zwC^Uj-Cz3bj5TeABvCUQAk>^I;1nd_K!)1_1?chcUr(Mq36Kw=ej;GU1jz1T0Y`$` z>khcpgrKEvoWvCxGHvLpVq^t2J`Q*ij!Vz?C?1 zKLscwv7retkPsD0495dCYEDzkdOA~T;;9&uWTntmUQ|rs*fHk zt$&meE6dk%6Zt(otHcA>OIg!; z^Pa3}-Z;%V$UO=ST8qNACB@&I?(3 z-C19Bw#m-6T<7iJQgFq$!mid8d`DMV+4sG*x`OlNEWLqc{Ib)(VXM1&X70?d>fdjA zuPN7CXbODTd$0da|H{Qe*U`LpV9ocyc5;&<>#SM2WJJ#TtWCwOVTFd^5vCPtf}alI zW2KiZ5VL6@MqVdRqn}_{(j_X|Naf7(c$%n&`}&biBW7!UMRKSDX(S~22#(VoMNf_q z{WVa*AIEd5fEJ`tq8a^~;D`lynJf^$HNJ!N6>S5&>gyo?i*=CSc^!tYVG|O1m9NlO zJcMr2H;o}`9EHdq??dP^21#0lNowL*ICw6&=u*|*G#oVpmjf&#kZEI_;HU-i=RO0< znDBh?c=cVb2HmE2P^J1j~h3QDQV>W(^NQVa1|}CsLAP!ZU!Vjs~@33E@pv z;Fvfog(vx$1G}NXcG)AaE-|WR|B@9s-PyMw8E4;u1a$DZV%MPDHCXIAE_WSYtN+uU z-|s1O4QCAx%*`8h4aK?+xvnGMIgm@;o?V*#AXDrdkUIzRotIYmPbWT^xWE5?PyX02 z^z&!WR!(pHyoa#qg{5i=ly-T5ex(${HW;fll^^p|Jl|3p9Ve&ti8N; zezhCA_kFlxKl7h0IL>7akgA(}xsKZhmkvJg1|QK>i}kPG;DTYEm_J;ikjp*KzT2>P zIM;ByWvONP+EVLhEr;h$tXz|Aee2#JMhNeh2qpncc)I3aD7yW!+h24aklhECU(34( zvSxTz6g?fXrz1DHVwOAm3ZDL==Y;GzvDQ}b3}r0}tHbw1hQ%k?hrFF3mM=57_Fv}HK)%RJ7;c!p7|3U^oddA>)gG3^#9XyN>>O)yEUf|F-^cX@G0w(l!pHrTd0vMDUGc=7$q?_FLVlAn1l z?|4qO9Nx5Gfo@e3O$WC$0liZb9@<6I-z^}ZZ4>rMfEMrtU-LVDfC@P7RCnD$nizfx uD&_%g1CI%#(8b;rTcIjZ1VLe@|vYsA2Y{`-=SrRRKXnSkhT}83UW;a<~6h*q| zp$DFEGcyyCNCLKkEYvdy)>LHdaIjc}vsoYl7K`=df$0*JWv5ob3L6KB|9UJ9kXbCS zd(N${e$Y+Q6K8gj0(tA)Q|F$0@41h2?m4&r+-|o}@I-ariyVJ}qW%^$>f_G?Prs|7 zs8x!kSUO4#)8wrg*1%g6)lTS!brbqwJx#G%Ru?r)7>A7n)<;bf=3z5|4N=R4b=W## z8@ACDryr|Rt0=x%Bc~f{DCSj&HSK^m?|`@LfVb{|x9xzp?|^sgfOqbIckO_8SH_Q2 zvz`xi_w(H!cCubhGZs*#QGBz$9q?5<;H!7Q`#Cq~sRhPt4d-EN;lGaavh~~!{3~-A 
z(IAHr58LpeZn%nVtW3og>#DCzF=AkwfJ3#0I!Ce1A5v_9Gv3!Ky&m?@(!rL`F>^Vn zku^e8HVAw~h{rys0T-lY@3puP;%8r)c$Z_@NbKh6P=W(=ZD?{bI?M3nBg0L+%dnBe zWJm~)0-L&ThOS+{9u7r;ep_+bIUC|*%uP;UCK5ow$+{RbHHzu@sH_=fWev~DrpXY? z5PTkjZwAD{cjp8jpX8VX7ZxJ%7$a;)jBZEF$c{*YiNpksk8+_~92<5iohPXQQWVBh zn+X@BfZZm~7%b534@=+Af1h`Wl;{%mqH#`7QRE+p=P8i}tYI6L7EP>X4*1QxMGbs+ z7wA2rN7R+%y`r})?-R9U^i^V2+4pL(x-9Q6X}@lJ>tQ<5qE%_h{e17{YeXj>6l+-R z97uxIjdxQN=0rQTaU3?4Iq30xZGl2lpm3B?)QPoX-JFr4oYZ`MtW~tlH;4_u1*Os` z)`+%o?0-GrYLF{n8WEG{4itH;yMV1L=?jk1wg)6vgWSb>mi|PqPU!ijJxY1=rE1Gc zHSbZ%moHUURw}SZsj7-CY}un!b;TC8?orBLv4w47lh{n86d)tfF1CoRB;Q8z9VJ{V z&lj^gQ$RIQ-vI@sY)i@v-?Q|#*@irji^7AnpmR?imfvs-y7p|rPBPHrT-~$ut};+} zhQh4x-jm->@-X}cS`#d3O{FzRiJfFNb(gM-G4r;iimS9^j2K1bUkbZv&mQ&AVi&0= z_%-X?+ImiQV)4VsL(7kM>YvFKJKz*KE)JJ?f#wK2p!X*R1D&*dw;% zel6}7JBapy(U?D2QoF5uWH1LBP8r>UV)rKob%&*qhr+8T&kJbh2lwQqNNiB-FXOc@ z&+9-*yAJKaOPvje#9$e(19@KiO4{|@p1c$ZKPL{9@|vZB#>+e^$ zah?l4FYBk6Nr9L3QQU`wKKipe|3CFEN^cQElhj9cKJ^!%?k2ZV{qfyu~ZAK2<(Tld|b zK78bb@x$CeH1h7iK1IY(HhSFZC_TF3+lQ#hB8kty0at_09+MBiHGLS04YrM?Z1+Q$fkmmX1h{0T__->@yc- zwl0|$&C4yzZ*H2Jet#<6o|=(NU2Cn9srM89$FuilH{2U9KM4Kw;^x5lPknzj^{HPP zxGI^h=CqW)_L=ExmN!sU{?%7jUP*=0HJiSkKRU3+t{s+qgX^s1JNnpU$+zuBYHH=J zO;by{{g3E$mt^W$yS9Gd-W!tX&@Z$apJCw=Oo)FbMkdPTS(;g#S)R@`4@&;Qjowc~ zkDRY93>sga?$-ZcRs@1p*3+JKIuS}b>P>l!HPj9sR4|*oXEU(^^x=1mh*MV`T8ym*o!#lpnMIfo07L<&9r`4>OLxYk1kw# ztTTLc{)6+$*E3Z?$sSxgz7hQ^-Nmf8emBY<$=5T`Lq038HC4kqbIcpqywDEus}% zD+bXnIz*eOV~HmW-U^+L)e<=x@8=ykV3I=F5zTcfs0)1?2I}ez7dv?4QhEbq@`kA9y1$8Bavezm*8gM1;}6#B@}MfbI@O1CiL|w2%lE+5{%k zZ@7lC^#pYZ(1YHOi7hgCkM$@E5TrTd66;Gl2Ro{3reak_9T)O9fi) z&fJ-ibPWqHWvmTZXCVEGNdYl1&}*5t~X+~m)bHTklps;tQY>2gRi)qMF~`>e&SH{{wW zn>`sy9$L7_zlwsOZ-?vSydDpkqwUD#hVK&64`hDM~7)y?Yg zSjy_Yqa9IJ_fB6m$AJ%&)IuAR1MfMnh<-fyPdH(gm@eij)v#s*JDQ0k5>bZr86~&QbY$Wvn8Te9X$i z)_saVt$tsLXb%avH~We z@tLA=Ap0jmx530v%qWad=67*f_b!AmWjn+KV_f_;lMrU39FO~mVqTz8!H;4R!36Di zejJl1CPR>fRnrx>UlaTj<+lLp?)PX=e`b+F+dz51l%<%0A_TLRVhXAgOk0Y=k(g zJOGhr5snN$2!iTGBb}tM&Mp(FPQZdQ}8kahYH&w-z3zl^;M=Q 
z^aIQTJZfPyGNHLZeVd-8z-Ozwd{Zf(d+~FWrJ@!cJC3KU9|7;Iz~hL@hI~Z2Ivfg* za^OHLnLlta01I{+l%#B8AebBx7^XO1z->D{5p>GB1Q#7C&Ov?xn-Ie!j>%V?WU`rI z!qHG7!7zz08>0(Ehx509;au(Z71*z~?JwbT;?E)ZN6_Q#RC8Za_sCSAZGQpZE!p}` zeAi@aTa&u)JJgrexn*sVtW6nf&u4Ybf9PD-ZS6lJ?LV^_IQ!r|DR6DG?t1d<4=z5b zYfPT~{zWhkx=uczrOs2xtL?GLzGy@%(pHtVRcCFkEnAml>jEHas|D}hGrQ4j0heFX zJ|wTpwj9KFQx+?2d}wWa?5$oFw){cKAKdiztc`7)x)*=wJ(01Ukh~|riDYY1w0~jw zK<99X$>JLUfGoa>%VxnT6iO|^(o8^a!{wtr)j8E zc7rc{0#0NcOp03!8iYJ99R_X%1PVw(SfihWD7i5JFfAsZ}m8J+xqX>-13Z)&$i|T!V=el+l zRT(Q9`kUy75|!n308UpK+aQtFHzemb9{4t$FK0|IE5{nMsx2T=0%kowD{IHOS=pEm zhlFvvSD(VR9{?Pf?5^KRgEfD*i`9CbCe}k z>bxGx=eOTtX`Jh51I!ylL#5Cx*vC4!K#`x}#Fu59H|^GD{&T3uQd)xo=lep96}I^z z4c{l=UPc~N43)~OZJ0Nf(1P07mDUvdk$@2}@{ASKm5PA?eU%D}#xeE∨2YW&Wiy zQKf&DS~4$L!rGfuSo03_pUBEQZviE5MhUNIzHie}VePzS-dtK9`twy=X!`%uLKFD& z3dToKIP+G(W7oDlqMp@(-0HTWA*+Uo=QW~+HNZt8s?rTa^$N(m4XG;Au-XEZU1(No z0;9=@ro>o4#T4K7^F5fi?JjTNVIL`*^W{2W@c(gAD zM*S@=5Sf^aauZxkAU1vgJSmY|a3L@zp#LQwXzUK=-L~kAD|g)@rm$fbJ-Ud-r_`|EIh9xQbB2HlEI{}z5(z~lo= zzKuycBr*y7gB?(G@yc{WrYB@85hg>p$Yv!2{y5p9fRkVx%Vv;Vp2CPAem5b2UlX|D zqIk<5#r@^TPcBB8h;z+fGA_yq3}{rK#}iEH0OhJ(dVlf#wELm6H68gl_QTi){jl>$ z#&jg(Jd$;KlHbahnv&moVzOjrIU*%Q&%5ao3dX2mbc@fw_kztZV<+{1fv=`_|D* z($PzqYeVbi&Hb-G>b&vD_lDGY;|>eKRrm7M<*DBp%5>g%=zAkuQ};}-^?R1J5Wn>| ze0?@>WB@kfE8|HkM9o(%;1EBW=GPAW_~gBlo35h}LHE_B+A_`i)4q?Jf7lE_rf$FF zJCHPItLjpNY2C-pA3CMLflU1Ysp??T^4Qn67TNG_gdZH*I`g`8=JieAjimW8ur{xl zQ%BNcnYsh({3F-VE!Szub^5`vP1lvA{)vx-l7Xqiy)?Qwx;&RYm=->M|A+5yI-XBz zv!1HuaHhUDHFTG`!vK(}?v*@!I8N2eiPYJFx{!_C1drhqfHYCCBlNZ*DryK_yP#ayU7cG1dO+ z={~CJP5M_)P54Tn>GYppa9+?^{)^dp!D1=1I`J?~EcpKk=|7*lj1_D`#Xk?WXGNfusWsbz{$ zR!yaPzzWk=#9Mc@WGA3q6+FL+B`e^4D)QvxlHl5@L=9NHbhYgPEbh?T9>7+R-u*m| zSd_MTyJ)LuR}><>qP^nxg2jT`c!z$|cqZXr5cOjPS4**p`3USz{i$HrLw_2I z#Hh~7%Kf^ZUqi5-iv0)Y+4e_~w`wz!(bp91sh~;mk`gZ#iTJ=TFRV;>8ZHn{F!9L* z{~<8szXQoVns}FjRs2Vg1KkXlz4$-CuSraZQxg?Gzlh0iVS>RS62&ZWSShGHGn3Jz$oiznZBI~aQ*VjXTFyjhl{K3cmHqeAwpI`l}+46Nt zzRs+#X7%#QWqe&(xdJaGO8nGquJNqsmg(gKxoWDWc6D}THeJ8z?^~N97qet 
zXELrcPfCpnn{&$+kZgf;?b=K0gB!X(b^O$kZRyx*8I)QE*N3*AJ10GN?wOXhze494 zC>Ll&qOfv)%H`W~wmfvUWL@4ZSEuCaOwVq09h16_ZFZg5n0^rc^w7^w{_JG7yLSt2 z4|N}X+}-zM+dZ4g@-L=;KKrxTN8R5n5Ii$#-N$pjk~*wZQ|ngaeyMSP*4LPtPKV(t zRg-7MvaDZDfE7__jN;`g^~|9EgXqW}05BR|DGr_A=XC0cQT0~Q%GA~GICT?P6`75S zEom)G!pZ5mhMni&wQX=-PCi&X`Usekh-E;X*y!FmPj2 z2+43W7XsCAZ32!+a8Z_U^d__HPOU}(b{;=gDFkC3)_U#NIZ z8e~I}OXuJz#kacCspmcLax{aN1NN!>i4UL|xEFny1ft=7v|hdU97zb^|N9tt@N1ad zgG8p8gbv;)Lq&u&_+v0~A%t9QDa33NbHOjkb0^Xn6z&yZ3wKd#sOtLEf!}@esom(Z z{K7_6H!ai8td!jic45}noN0Yt@;$#{%$&XJ7jmD}@6tQ-8a4PzPJqG#o!5$=^Gc9e7s=ulg#tddlpy!W_H>O)4FUN;l?JeH&Z#_0HII;s7wQ^~p1g|#NB;c&)zIAc5f(DVXXJJ9Qw z;s0{52E#P5&;-XYvV~#b>eMt`17~C#!%R(wqWLdIhGFAjh~)eTrNN^NbnnLFQKHh6 zaLp(91&2{K3=s{-i-7l4{*P@xvHg<3Ig3_*j$YHF zszA&+8mevY&UWqlg>_J0_rzPXdVJ-0P7g423>qvhia8_ZOcb29-d=n=byl(ma%MzX zD35=&VWlChlick&EDznaIFr*@D)su^x9_~Yb{4ct#<^cIA9!j<0-jc@J5W*`z$MjT zZd-MT+EyK+cC8KxKq2INXVjme3*Ey^Nrfc4tww^|RwcphS|_3KPzFypa`ZNnE<1PQNa(1O!6S+Qh_!*bQbb~-srR*GU!y3vH~&tJKl zhF?=8$WLs@1wtjPhTD{EfRNclJVv}vvYY(4SiMU??lj@SN(t|3$G~Z21}&NZH9E;VrP!~WZ8q-I0AJpYE37Wo4!_lK0@{dCYs2ybpdiu@qg5=Bku@wVL-U`5Fqw=F;M&T&7fa5Ps|R`Y(2D$4+cYAU_2pwQebsXbEl&}>sF(F&+&AR!1%&=&lNQ0;q4hx+<`6-IQ&0vG7vU8y@5;D1o-C89B!g0!x5Bm z5iY~~5osqpbrUyy0(>4S5HIx*AB=wLor-KW)sRU5_FPpjLZpqjavgP)<4ABil<_;- zZJ-r@RV(Lq&V{CfnYKDAbD7`;lnPtBM#nz9kd+khddnlKpKe*`rDcN_2qnFr$;M=F zPA-Ee6-aO1k-$)|-mo+n=mnZIjG}%_6fe-dDnEssWxl2*w2H$}t;GDCCy17P29bExL2A-r**=7SkS)*-{)l4krh^koR4 z8dr1G_?o*W)VvFx#x92{=pHwka%(|Y(At4nf?g_8&GKO^7i0tDRKRlURLN<89MYH* z8uLnHDXO(G=IFu=O#^<6vx=na80+i|i>X7`|ILaeB1S7Ek^(uyp>D z=e)}wSO*=Hbur!~7EK442&_8DpsCj0rm=>mHfuF^t_NX#1Fe_G3moC9Q0><^qlp$m zGXh{4WmqmWk-p~I!D=6%y{%os))_Ecs0vtC0bVwORo4Y%FiahqV!_%%+goq>H3eR7 zrG@6g_V+_?fSX}&n;32Q#n<>vUUh7}0eTC!qjj(g5e1cn<_%Mwuj+5|aVI|jlc~hc zn%CSAdH2gGrub2fCC5`{FBNRm4U1=^ZrB^FD9)<+V#%OTb60RqqnDjRvjj-~?4;#^ z2Bt%o;1zYk@<^Ign4p%BQ&h>Ycv>!|JQ{1Aj%mx+GGTF&6;P!7IFaxnoY&ZEb(Ttv zYSg-z%4@)?Z~jlUY;~-MS@|0nl+9{5cB?W|xqtX!`ayc>?bEkUu5^pPb&HSAo8fbH 
z_v2V{cEpSwygRlWd#mny(iXmzo=GpY^{gg(=J&1i51akNi+zjaNA(BQJob^t~=CBguOBSrkRP=7O`q+hZ$xhRi)f%aKDXkzq43yc`)>jV0!e&mNyYZ^jPXJ^Wqz zUfS#!S>w6r5t~P$b~}Kg9bccCJ3D)JK01H$+jDo$-M?(^JFhTFosrDQkWdX_;N{7nmb;MiJFCFUM?6KiyHh1R8`S-Geb_dWSz?Lj5*jkKv z>&#L`fzHb{Fk_|uegiHBpT-y~VErbr7t?6X#R&)OB6pq_%ObVi8w;f_qJKD9^4l^l&BAvIc&Rn&lY}SD~61U^C-8Qpo$5EjD zc_)+lBSLbX+6adG!o%Q$;6F@UW2_M-<{03HP+r7x9_3G36^`7FWrdUd)fd@ g*(>#3z-*r5xb*0ub{^Mlw}IBW_ELP!8`a z(IYfPWWJ)m#EdWwO&0^D;7AZ?zswZb5;ww8$cL1m!e61>Sw}+7c%c-|`Y2S2j6{jX zlSmGnLvm1|3u1lsbtvl-dlxi+3MW<&Ipv5?rbcKEaY*izX~i%5 z;h%y30Q?6PMrM_O%qhV_Z2e>|E%OSeFrC22p#~+u7~gadWMPlWB`PRK$&qv65nhfq zG-EJp;@@Y@`Q`XI|A^r1qN&{iv)yR~E4RXkaQzf*o6jq^-$9;;tWvbQ}qoDe9u`D2wFd<0(<`)cbi}gz|VsS7cFBMPpnMHJl&I ztC^y>VQMj7nk*_MFtd!2*VKMz#o)9wMH!)(iEosd;2HI%iu<^b7F&D|HEl%z#Xl5e^w18i1#JKKwRi`b;F%y%Hgl z0?zMEX41elMwx!#0*yWfII<6?m@IH?gX4hXJkDtMC~r@jc|^hUT~8U>8Nn1x|HNU~ z7m~tWhlvVZ9^Gtl(nxt+VXH?qTbYK__)Y(n`sv_aP{BdYrVgP%y8fo?ozo%MP1p>} z{s~wc)3Sdiq8>B0d1DLPJ*wHt%)0&^O}R&zVec#4MQ_s`d1J}WreR8tpNW|KS)Uo1 z;mwfKhD;uA4+N@wD*@`0v?N6D-im#GgpZ*2-qp)@~y zbovLU?+QPs0Ym_I^z!5+xT!9ZDC0VC|MXu@>Htup2VV{(5rON=dcSyLJg95j`Rih3G0 zBaSL$(Gpp3r`-RIdx5ndV&e9g-1240rU@7(9h`EZ3-$+SsY=;Qc3gpg%)=h_ylw!l z0>U$JbqcmwNpfPhWb-d(ie=>p#u~=NJWDD*#-8{e! 
z-?ni(T-FuH2>>^-27nrH(*s+0BZF1=9FhiCVf#xNW8C)ZuVKTcK?Apt$%!M_3CJ%> zlVwODZaazXGc=n4P|l9qbT(tyQLK!SV_|8uoXaUV-3s$5#hiNp=O`Oa5g}_tN!2`0 zV_m$3EY7 w80ctQQCBS4*zc=AFYi*K5btU%b=HNm90$(RE4E$#F>ZyS<$KM`^$) zO%6=vC-*_HgzReHneqbq+*Owqk`~EDtl|o!(Hdtko*V(Ss zP;%i&HMH$UOUjBJU!#4kks9)aBQ+Lv@2vQL9<@5RLld!fAFBATvpcN8=SaKHYD>887l)Ps_+*(QDLl5?7|a2 zJ81QYv%_EYY`gUG#h0yE{N2OzPt85GaPn8je=EN~{@%DZ*hx6smPlo| z7C=IBIkeU4PFIGkP+~c>#p>#<99!LRlGQ}lN@9OCvELHf*_3iM^H3YJ2BO`(U+w&=T%heC6t1P)qO?9@G-z{Ht@XF5Y*oJze3y>J~3Obn&6Z ziK`DU?>zL`W7oTnF0)CC6;{|(l}%Z~uI1hbKRa2Y5!dq_@^Sncd@w}(!TIoYo7L?A zR*W)MvvoKoN|0sOlas#z0jUW|+MQBZuT__sR5NucG!l<`Co~^W4XCGNA4yLNjTEG* zwQh|@64LC2(7$P@m;LM3-X!S5+Vvf6KHdqvu--IarIR4|Odq7U%%gBK$lwf#76epm#_3Aacn_I@#5bGGtFKErth}ZZ zcm#LH`-wQ-Kr*j?`oMqTh93NRgGOO4Jp7Bf#0v4-VN?{$8foih zmIId6u(!=@W}Rg*>t@=_etaeaiDP}WnSdE65Y*pLuGD#xCiXBB1g%EO+Em@3#jabK zgpLix44Q1tC(|GJ8(zSfoMU77e$buf@_hwy-ytzy`1YSrFnMaGS^NWN9Am=|6Y3GD z?0}M2^>PUUo2J_|L~qhn4}XcZH64j z6!A`wO+WZQ7~gHNcM?YH#2?S1ntnbJXtwzeF=Wn3=xA%=G^?GXcj6+nbn;0DX6cMNWi$CR+N5rGI=?Ax%xNUZX)t+q0y6J`20uxv zr%3fQsh)w#rY7tbCjpzpkTM!l-57$I`X%zxC>IGWNYZQNOtIb*lq6Zpf|$;$h6_&q zQB5nlh((fT+h2sV+sP{JzzHW@x|l{V1~T+A{D^~JAauB%3TKsKQIgUW9t5dQ0>`5+ zDiQO+q#{oX{m0Pr{W)~gM>A|qM7#D_J9=++K20(FYHOm#5CT%A_GB&SkSyB1v&K0j zkM{Q0LJldQXj?7pkP(zj)uIj=g9|zT=IoobxWl%fj_!r~<|b;b4%>zjT?;$sUa7S^ zYyx$p7Ua2Nt;1oHDAB!eV(!#;&Q@wK`UEC#l_&haxYT(8;!#V-d}=PW=�i)W~?iWSmpNrJq!wtF*p~VZo%8hLae4GpnJnLAa zv$e~}-KLX5u?Jl{J%qXx+dbNMH<$6~%wqvJhw!wBtOv1WJ7B^K3!&q~7cXnq+ z@}1duC2NTv4Aqhp$I!4FD0KTKvxcNlsQc17Pza%=weW()V5p(#gWtwon})u$=Zt== zZsp>G=G=4cz31G&^PPM3$4DehfOP+d+73v6L#0kaP1rpJ!Uj=^%IPFURG-T0oZ(CP zIF$W5Z}?MwLr4imAQj+Zt zVlAhu16KJ^P1*e&2tR=}P?F+QlJcosiif$wN^k|L@N)rX>sN#Dh2RsZpoVEcZG%#x z!KIF7r&U>vKwA{PcIe*$UnlhMQoH8|i)5Ztd#*-PqG#U)_C5b=Kkrv#SN*9r^;zvc zwHMl@B3b29k#|X=&l&uo^3LqLuy)w2J$sj*-%}B%t2Q-Ns>UyA_F^0@;|8@CGwO6W z90xkhsH)jk+|I6uGTmy*ruIoUfvJWN=SBQ8=! 
zl`Sn#oiOtvLsm7z=`)m7Ih(OGZwOis7R&KpP%P?1G)p#VW>vQA9K=Ml=B+-wpxU`C zWo!x-i zR@Wm1^3zF}1y}GD_=3M66oT74RV6D0=wyi zWLYycTb2`1$8S-6p5dUZ6ImCsZe$qrRrkFpJ@X7iSx)PUWy!LI2ZO=cpez4UleUpf zX0_}TBnv?Q)auHKmrgF9ph;c3kj&;Fq|M~}x^5(mj7oKxuF~`hgke%wa+F!gS|~LF z$BSjT9wsc4rPJ9Q+Xn;Z$DfU_>H3Osq1Gi}-&`SXoQzBr{X3zd%EU=5M=L`IuQ5a}3u%#%ph&yrgc_r{Kw$By6adhym!x$Df1^xD^*y&qq`+d1wAi5U5) z?Qxih(e+>%yhyQ+^LNI-bL-6QsXK3=B>4T9?m=uc59Gh3qI7=c#t3-F{VA; z-xSYe2_d?Nz%xe09%?-v)b8nrD%3VG@w9mxkq-1@SmBzm7ElEnqt+up3Lw7qj^v^L z^X^I>ZdCBoBKza|*MbGOc?v=7g*sp~=HbA+VG;6i-4rilCsbE05yoi1nvLLyop8w;*%g~YHmQ!6 zKl1+|xVLu_oLCQlT_Kg8{*ANO&)%0huf4VY)@JACnO~gy^jzufU(??#|7v+hI(2*Q zOZf}AEX_UQeX(#+bOU5;ax3@CkvpONmC=b(^wvQ6(5dq1Y0$s!nXX73*Jjsen~HPm z=TJMlein**(xI|+Xtym83$KeszBuQ0kx2A@bbRY{si$Or_Q8!0ZjY9er~h>PFE4-f z@=o*zk9aQf9`^t(C(F|0qaZ(Upcrw(Br?9Gl%~oDPL!pSmFUFQS~+?QJ0Ev>B6hlv zIr@w3EDnoIH?AB|@K`<27we#JZiPl92t5p(JygFm);Jrzc+@J?!MX+HTl>VYwHt|L z2?BaA-Q>1wb6N8>joSk7st>L!uKsXb;|u%}Ue$YOyQaf@T7cVwmme$}kbY|Z9F{`U z3|oK~uhMyVI^Og*))XiN0ItF;8;61NqIP5{J~b29mLC2a2;RCpJY@i@Q_xzy5+_ia zM23)bg5G6&^aUqCHPc!#96p_~*dR9XmS%#g*oj~jtw*qyv*Ch9u(G4*-7*v*> zuq?x?U`0n+lI4plie768%CeeCLp2YnjA5#T2W2LsGw)!$2MfLloWLA4E%+7Id)3=+hMCS|2m|XYVBcINk9@oka`%&ihbqJSD+5E7fx*i7 z!G8=73*iUdy{-Tyq@-BC8$>At$;lN_3X}F8w+$tU^zU;cC`HM@ezzT^4$=|3c6t4> z+lg8a8JTcnDD{%wA-4~ue$vzD4xluM(hy4f$ndC(Bg1^c(I+D)&GZTVp9?O5;@0Hv zkNoz?E{g68q7W+ypGh~Qe-aSA9ro6i2vv6q0mcydP5{!gkuj?`4n8KBchBH%+#pp$ yFYAZCTRoB)b_SYo->gw!F3)k?ZisM^zmw6wlJjM9{&C=Sj*D%LJRw+mF8%|XT25^M literal 0 HcmV?d00001 diff --git a/model_executor/layers/rotary_embedding/base.py b/model_executor/layers/rotary_embedding/base.py new file mode 100644 index 0000000..ce4f406 --- /dev/null +++ b/model_executor/layers/rotary_embedding/base.py @@ -0,0 +1,235 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Rotary Positional Embeddings Base Class.""" + +import torch + +from vllm._aiter_ops import rocm_aiter_ops +from vllm.model_executor.custom_op import 
CustomOp + +from .common import apply_rotary_emb_torch + + +@CustomOp.register("rotary_embedding") +class RotaryEmbeddingBase(CustomOp): + """Original rotary positional embedding.""" + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + dtype: torch.dtype, + ) -> None: + super().__init__() + self.head_size = head_size + self.rotary_dim = rotary_dim + self.max_position_embeddings = max_position_embeddings + self.base = base + self.is_neox_style = is_neox_style + self.dtype = dtype + # TODO(mgoin): disabled for now due to failures + # Flashinfer only supports head_size=64, 128, 256, 512. + # https://github.com/flashinfer-ai/flashinfer/blob/ebfd655efe830048dba5d582aaa61d61d1cf9a87/include/flashinfer/utils.cuh#L174-L202 + # self.use_flashinfer = (self.enabled() + # and dtype in (torch.float16, torch.bfloat16) + # and current_platform.is_cuda() + # and has_flashinfer() + # and self.head_size in [64, 128, 256, 512]) + self.use_flashinfer = False + + cache = self._compute_cos_sin_cache() + if not self.use_flashinfer: + cache = cache.to(dtype) + self.cos_sin_cache: torch.Tensor + self.register_buffer("cos_sin_cache", cache, persistent=False) + self.is_rocm_triton_rotary_embed_enabled = ( + rocm_aiter_ops.is_triton_rotary_embed_enabled() + ) + + def _compute_inv_freq(self, base: float) -> torch.Tensor: + """Compute the inverse frequency.""" + # NOTE(woosuk): To exactly match the HF implementation, we need to + # use CPU to compute the cache and then move it to GPU. However, we + # create the cache on GPU for faster initialization. This may cause + # a slight numerical difference between the HF implementation and ours. 
+ inv_freq = 1.0 / ( + base + ** ( + torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim + ) + ) + return inv_freq + + def _compute_cos_sin_cache(self) -> torch.Tensor: + """Compute the cos and sin cache.""" + inv_freq = self._compute_inv_freq(self.base) + t = torch.arange(self.max_position_embeddings, dtype=torch.float) + + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = freqs.cos() + sin = freqs.sin() + cache = torch.cat((cos, sin), dim=-1) + return cache + + def _match_cos_sin_cache_dtype(self, query: torch.Tensor) -> None: + # __setattr__ in nn.Module (called by `self.cos_sin_cache = ...`) + # is expensive, so avoid calling it if possible + if ( + self.cos_sin_cache.device != query.device + or self.cos_sin_cache.dtype != query.dtype + ): + self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype) + + +class RotaryEmbedding(RotaryEmbeddingBase): + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + dtype: torch.dtype, + ) -> None: + super().__init__( + head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype + ) + + @staticmethod + def forward_static( + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor | None, + head_size: int, + rotary_dim: int, + cos_sin_cache: torch.Tensor, + is_neox_style: bool, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + """A PyTorch-native implementation of forward().""" + positions = positions.flatten() + num_tokens = positions.shape[0] + cos_sin = cos_sin_cache.index_select(0, positions) + cos, sin = cos_sin.chunk(2, dim=-1) + + query_shape = query.shape + query = query.view(num_tokens, -1, head_size) + query_rot = query[..., :rotary_dim] + query_pass = query[..., rotary_dim:] + query_rot = apply_rotary_emb_torch(query_rot, cos, sin, is_neox_style) + query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + + # key may be None in some cases, e.g. 
cross-layer KV sharing + if key is not None: + key_shape = key.shape + key = key.view(num_tokens, -1, head_size) + key_rot = key[..., :rotary_dim] + key_pass = key[..., rotary_dim:] + key_rot = apply_rotary_emb_torch(key_rot, cos, sin, is_neox_style) + key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + return query, key + + def forward_native( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + """A PyTorch-native implementation of forward().""" + return self.forward_static( + positions, + query, + key, + self.head_size, + self.rotary_dim, + self.cos_sin_cache, + self.is_neox_style, + ) + + def forward_cuda( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + if self.use_flashinfer: + torch.ops.vllm.flashinfer_rotary_embedding( + positions, + query, + key, + self.head_size, + self.cos_sin_cache, + self.is_neox_style, + ) + return query, key + + from vllm import _custom_ops as ops + + self._match_cos_sin_cache_dtype(query) + + # ops.rotary_embedding() is an in-place operation + # that updates the query and key tensors. 
+ ops.rotary_embedding( + positions, + query, + key, + self.head_size, + self.cos_sin_cache, + self.is_neox_style, + ) + return query, key + + def forward_hip( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + if self.is_rocm_triton_rotary_embed_enabled: + self._match_cos_sin_cache_dtype(query) + rocm_aiter_ops.triton_rotary_embed( + positions, + query, + key, + self.cos_sin_cache, + self.head_size, + self.rotary_dim, + self.is_neox_style, + ) + return query, key + return self.forward_cuda(positions, query, key) + + def forward_xpu( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + from vllm._ipex_ops import ipex_ops as ops + + self._match_cos_sin_cache_dtype(query) + # ops.rotary_embedding() is an in-place operation + # that updates the query and key tensors. + if key is None: + # XPU kernel doesn't support key=None so fall back to native impl + # TODO(sarckk): add support for optional key in + # ipex.llm.functional.rotary_embedding_batched + return self.forward_native(positions, query, key) + else: + ops.rotary_embedding( + positions, + query, + key, + self.head_size, + self.cos_sin_cache, + self.is_neox_style, + ) + return query, key + + def extra_repr(self) -> str: + s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}" + s += f", max_position_embeddings={self.max_position_embeddings}" + s += f", base={self.base}, is_neox_style={self.is_neox_style}" + return s diff --git a/model_executor/layers/rotary_embedding/common.py b/model_executor/layers/rotary_embedding/common.py new file mode 100644 index 0000000..9b5c069 --- /dev/null +++ b/model_executor/layers/rotary_embedding/common.py @@ -0,0 +1,188 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import math +from collections.abc import Callable +from 
functools import cache +from importlib.util import find_spec + +import torch + +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils.torch_utils import direct_register_custom_op + +# if current_platform.is_cuda(): + # from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb + +logger = init_logger(__name__) + + +# common functions +def rotate_neox(x: torch.Tensor) -> torch.Tensor: + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def rotate_gptj(x: torch.Tensor) -> torch.Tensor: + x1 = x[..., ::2] + x2 = x[..., 1::2] + x = torch.stack((-x2, x1), dim=-1) + return x.flatten(-2) + + +def apply_rotary_emb_torch( + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + is_neox_style: bool, +) -> torch.Tensor: + cos = cos.unsqueeze(-2).to(x.dtype) + sin = sin.unsqueeze(-2).to(x.dtype) + if is_neox_style: + x1, x2 = torch.chunk(x, 2, dim=-1) + else: + x1 = x[..., ::2] + x2 = x[..., 1::2] + o1 = x1 * cos - x2 * sin + o2 = x2 * cos + x1 * sin + if is_neox_style: + return torch.cat((o1, o2), dim=-1) + else: + return torch.stack((o1, o2), dim=-1).flatten(-2) + + +def apply_rotary_emb_dispatch( + x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, is_neox_style: bool +) -> torch.Tensor: + """ + Args: + x: [num_tokens, num_heads, head_size] + cos: [num_tokens, head_size // 2] + sin: [num_tokens, head_size // 2] + is_neox_style: Whether to use the Neox-style or GPT-J-style rotary + positional embeddings. 
+ """ + # if current_platform.is_cuda(): + # return apply_rotary_emb(x.unsqueeze(0), cos, sin, not is_neox_style).squeeze(0) + # else: + return apply_rotary_emb_torch(x, cos, sin, is_neox_style) + + +@cache +def dispatch_rotary_emb_function( + default: Callable[..., torch.Tensor] | None = None, +) -> Callable[..., torch.Tensor]: + # if current_platform.is_cuda(): + # return apply_rotary_emb + + # # if torch compile is not enabled + # # use rotary embedding function from flash_attn package + # # otherwise use the naive pytorch embedding implementation + # # is faster when torch compile is enabled. + # if current_platform.is_rocm() and not torch.compiler.is_compiling(): + # if find_spec("flash_attn") is not None: + # from flash_attn.ops.triton.rotary import apply_rotary + + # return apply_rotary + # else: + # logger.warning( + # "flash_attn is not installed. Falling back to PyTorch " + # "implementation for rotary embeddings." + # ) + if default is not None: + return default + + return apply_rotary_emb_torch + + +# yarn functions +# Inverse dim formula to find dim based on number of rotations +def yarn_find_correction_dim( + num_rotations: int, + dim: int, + base: float = 10000, + max_position_embeddings: int = 2048, +) -> float: + return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / ( + 2 * math.log(base) + ) + + +# Find dim range bounds based on rotations +def yarn_find_correction_range( + low_rot: int, + high_rot: int, + dim: int, + base: float = 10000, + max_position_embeddings: int = 2048, +) -> tuple[int, int]: + low = math.floor( + yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings) + ) + high = math.ceil( + yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings) + ) + return max(low, 0), min(high, dim - 1) # Clamp values just in case + + +def yarn_linear_ramp_mask( + low: float, high: float, dim: int, dtype: torch.dtype +) -> torch.Tensor: + if low == high: + high += 0.001 # Prevent singularity + 
+ linear_func = (torch.arange(dim, dtype=dtype) - low) / (high - low) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + + +def yarn_get_mscale(scale: float = 1) -> float: + if scale <= 1: + return 1.0 + return 0.1 * math.log(scale) + 1.0 + + +def _flashinfer_rotary_embedding( + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + head_size: int, + cos_sin_cache: torch.Tensor, + is_neox: bool, +) -> None: + """Custom op wrapper for flashinfer's rotary embedding. + + This is an in-place operation that modifies query and key tensors directly. + """ + from flashinfer.rope import apply_rope_with_cos_sin_cache_inplace + + apply_rope_with_cos_sin_cache_inplace( + positions=positions, + query=query, + key=key, + head_size=head_size, + cos_sin_cache=cos_sin_cache, + is_neox=is_neox, + ) + + +def _flashinfer_rotary_embedding_fake( + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + head_size: int, + cos_sin_cache: torch.Tensor, + is_neox: bool, +) -> None: + return + + +# Register flashinfer rotary embedding custom op +direct_register_custom_op( + op_name="flashinfer_rotary_embedding", + op_func=_flashinfer_rotary_embedding, + mutates_args=["query", "key"], # These tensors are modified in-place + fake_impl=_flashinfer_rotary_embedding_fake, +) diff --git a/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py b/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py new file mode 100644 index 0000000..b85099b --- /dev/null +++ b/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py @@ -0,0 +1,106 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import math + +import torch + +from vllm.platforms import current_platform + +from .base import RotaryEmbedding +from .common import ( + rotate_gptj, + rotate_neox, + yarn_find_correction_range, + yarn_linear_ramp_mask, +) + + +def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> 
float: + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 + + +class DeepseekScalingRotaryEmbedding(RotaryEmbedding): + """RotaryEmbedding extended with YaRN method. + + Credits to Peng et al. github.com/jquesnelle/yarn + """ + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + scaling_factor: float, + dtype: torch.dtype, + *, + extrapolation_factor: float = 1, + attn_factor: float = 1, + beta_fast: int = 32, + beta_slow: int = 1, + mscale: float = 1, + mscale_all_dim: float = 0, + ) -> None: + self.scaling_factor = scaling_factor + self.extrapolation_factor = extrapolation_factor + self.attn_factor = attn_factor + self.beta_fast = beta_fast + self.beta_slow = beta_slow + # Get n-d magnitude scaling corrected for interpolation. + self.mscale = float( + yarn_get_mscale(self.scaling_factor, float(mscale)) + / yarn_get_mscale(self.scaling_factor, float(mscale_all_dim)) + * attn_factor + ) + super().__init__( + head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype + ) + + def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor: + pos_freqs = self.base ** ( + torch.arange( + 0, + self.rotary_dim, + 2, + dtype=torch.float, + device=current_platform.device_type, + ) + / self.rotary_dim + ) + inv_freq_extrapolation = 1.0 / pos_freqs + inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs) + + low, high = yarn_find_correction_range( + self.beta_fast, + self.beta_slow, + self.rotary_dim, + self.base, + self.max_position_embeddings, + ) + # Get n-d rotational scaling corrected for extrapolation + inv_freq_mask = ( + 1 + - yarn_linear_ramp_mask(low, high, self.rotary_dim // 2, dtype=torch.float) + ) * self.extrapolation_factor + inv_freq = ( + inv_freq_interpolation * (1 - inv_freq_mask) + + inv_freq_extrapolation * inv_freq_mask + ) + return inv_freq + + def _compute_cos_sin_cache(self) -> torch.Tensor: + inv_freq = 
self._compute_inv_freq(self.scaling_factor) + t = torch.arange( + self.max_position_embeddings * self.scaling_factor, + device=current_platform.device_type, + dtype=torch.float32, + ) + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = freqs.cos() * self.mscale + sin = freqs.sin() * self.mscale + cache = torch.cat((cos, sin), dim=-1) + return cache \ No newline at end of file diff --git a/model_executor/layers/rotary_embedding/dual_chunk_rope.py b/model_executor/layers/rotary_embedding/dual_chunk_rope.py new file mode 100644 index 0000000..b5dd94c --- /dev/null +++ b/model_executor/layers/rotary_embedding/dual_chunk_rope.py @@ -0,0 +1,215 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import torch + +from vllm.model_executor.custom_op import CustomOp + +from .common import rotate_gptj, rotate_neox + + +@CustomOp.register("dual_chunk_rotary_embedding") +class DualChunkRotaryEmbedding(CustomOp): + """Rotary positional embedding for Dual Chunk Attention.""" + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + dtype: torch.dtype, + chunk_size: int, + local_size: int, + ) -> None: + super().__init__() + self.head_size = head_size + self.rotary_dim = rotary_dim + self.max_position_embeddings = max_position_embeddings + self.base = base + self.is_neox_style = is_neox_style + self.chunk_size = chunk_size + self.local_size = local_size + self.dtype = dtype + self.device = torch.device(f"cuda:{torch.cuda.current_device()}") + (q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache) = ( + self._compute_cos_sin_cache() + ) + + self.register_buffer("cos_sin_q_cache", q_cache, persistent=False) + self.register_buffer("cos_sin_qc_cache", qc_cache, persistent=False) + self.register_buffer("cos_sin_k_cache", k_cache, persistent=False) + self.register_buffer( + "cos_sin_qc_no_clamp_cache", qc_no_clamp_cache, 
persistent=False + ) + self.register_buffer("cos_sin_q_inter_cache", q_inter_cache, persistent=False) + + def _compute_inv_freq(self, base: float) -> torch.Tensor: + """Compute the inverse frequency.""" + # NOTE(woosuk): The HF implementation uses `torch.arange(...).float()`. + # However, we use `torch.arange(..., dtype=torch.float)` instead to + # avoid numerical issues with large base values (e.g., 10000000). + # This may cause a slight numerical difference between the HF + # implementation and ours. + # NOTE(woosuk): To exactly match the HF implementation, we need to + # use CPU to compute the cache and then move it to GPU. However, we + # create the cache on GPU for faster initialization. This may cause + # a slight numerical difference between the HF implementation and ours. + inv_freq = 1.0 / ( + base + ** ( + torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim + ) + ) + return inv_freq + + def _compute_cos_sin_cache(self) -> torch.Tensor: + """Compute the cos and sin cache.""" + inv_freq = self._compute_inv_freq(self.base) + chunk_len = self.chunk_size - self.local_size + q_t = torch.arange(chunk_len, dtype=torch.float) + qc_t = (torch.arange(chunk_len, dtype=torch.float) + chunk_len).clamp( + max=self.chunk_size + ) + k_t = torch.arange(self.max_position_embeddings, dtype=torch.float) % chunk_len + + # count from chunk_len, no clamp(self.chunk_size) restriction + qc_no_clamp_t = torch.arange(chunk_len, dtype=torch.float) + chunk_len + # count from self.chunk_size for q_inter's rope + q_inter_t = torch.arange(chunk_len, dtype=torch.float) + self.chunk_size + + q_freqs = torch.outer(q_t, inv_freq) + qc_freqs = torch.outer(qc_t, inv_freq) + k_freqs = torch.outer(k_t, inv_freq) + qc_no_clamp_freqs = torch.outer(qc_no_clamp_t, inv_freq) + q_inter_freqs = torch.outer(q_inter_t, inv_freq) + + q_cos = q_freqs.cos() + q_sin = q_freqs.sin() + qc_cos = qc_freqs.cos() + qc_sin = qc_freqs.sin() + k_cos = k_freqs.cos() + k_sin = k_freqs.sin() + + 
qc_no_clamp_cos = qc_no_clamp_freqs.cos() + qc_no_clamp_sin = qc_no_clamp_freqs.sin() + q_inter_cos = q_inter_freqs.cos() + q_inter_sin = q_inter_freqs.sin() + + q_cache = torch.cat((q_cos, q_sin), dim=-1).to( + dtype=self.dtype, device=self.device + ) + qc_cache = torch.cat((qc_cos, qc_sin), dim=-1).to( + dtype=self.dtype, device=self.device + ) + k_cache = torch.cat((k_cos, k_sin), dim=-1).to( + dtype=self.dtype, device=self.device + ) + qc_no_clamp_cache = torch.cat((qc_no_clamp_cos, qc_no_clamp_sin), dim=-1).to( + dtype=self.dtype, device=self.device + ) + q_inter_cache = torch.cat((q_inter_cos, q_inter_sin), dim=-1).to( + dtype=self.dtype, device=self.device + ) + return q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache + + def forward_native( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + query = query.view(*query.shape[:-1], -1, self.head_size) + key = key.view(*key.shape[:-1], -1, self.head_size) + query_rot = query[..., : self.rotary_dim] + key_rot = key[..., : self.rotary_dim] + if self.rotary_dim < self.head_size: + query_pass = query[..., self.rotary_dim :] + key_pass = key[..., self.rotary_dim :] + else: + query_pass = None + key_pass = None + + positions_with_offsets = ( + torch.add(positions, offsets) if offsets is not None else positions + ) + key = self._apply_rotary_embedding( + self.cos_sin_k_cache[positions_with_offsets], key_rot, key_pass + ) + chunk_len = self.chunk_size - self.local_size + query = self._apply_rotary_embedding( + self.cos_sin_q_cache[positions_with_offsets % chunk_len], + query_rot, + query_pass, + ) + query_succ = self._apply_rotary_embedding( + self.cos_sin_qc_cache[positions_with_offsets % chunk_len], + query_rot, + query_pass, + ) + query_inter = self._apply_rotary_embedding( + self.cos_sin_qc_cache[chunk_len - 1].repeat(positions.shape[0], 1), + query_rot, + query_pass, + ) + query_succ_critical = 
self._apply_rotary_embedding( + self.cos_sin_qc_no_clamp_cache[positions_with_offsets % chunk_len], + query_rot, + query_pass, + ) + query_inter_critical = self._apply_rotary_embedding( + self.cos_sin_q_inter_cache[positions_with_offsets % chunk_len], + query_rot, + query_pass, + ) + + # merge query into one tensor to simplify the interfaces + query = torch.cat( + ( + query, + query_succ, + query_inter, + query_succ_critical, + query_inter_critical, + ), + dim=-1, + ) + return query, key + + def forward_cuda( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + return self.forward_native(positions, query, key, offsets) + + def _apply_rotary_embedding(self, cos_sin, hidden_rot, hidden_pass): + cos, sin = cos_sin.chunk(2, dim=-1) + if self.is_neox_style: + # NOTE(woosuk): Here we assume that the positions tensor has the + # shape [batch_size, seq_len]. + cos = cos.repeat(1, 1, 2).unsqueeze(-2) + sin = sin.repeat(1, 1, 2).unsqueeze(-2) + else: + cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2) + sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2) + rotate_fn = rotate_neox if self.is_neox_style else rotate_gptj + hidden_rot = hidden_rot * cos + rotate_fn(hidden_rot) * sin + + if self.rotary_dim < self.head_size: + hidden = torch.cat((hidden_rot, hidden_pass), dim=-1) + else: + hidden = hidden_rot + return hidden.flatten(-2).squeeze(0) + + def extra_repr(self) -> str: + s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}" + s += f", max_position_embeddings={self.max_position_embeddings}" + s += f", base={self.base}, is_neox_style={self.is_neox_style}" + s += f", chunk_size={self.chunk_size}, local_size={self.local_size}" + return s diff --git a/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py b/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py new file mode 100644 index 0000000..dd9d06d --- /dev/null +++ 
b/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +from .base import RotaryEmbedding + + +class DynamicNTKAlphaRotaryEmbedding(RotaryEmbedding): + """RotaryEmbedding extended with Dynamic NTK alpha. + + Based on the original RotaryEmbedding implementation. + """ + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + scaling_alpha: float, + dtype: torch.dtype, + ) -> None: + self.scaling_alpha = scaling_alpha + super().__init__( + head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype + ) + + def _compute_cos_sin_cache(self) -> torch.Tensor: + # For Hunyuan DynamicNTKAlphaRotaryEmbedding + max_len = self.max_position_embeddings + base = self.base * self.scaling_alpha ** ( + self.rotary_dim / (self.rotary_dim - 2) + ) + inv_freq = self._compute_inv_freq(base) + t = torch.arange(max_len, dtype=torch.float) + + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = freqs.cos() + sin = freqs.sin() + cache = torch.cat((cos, sin), dim=-1) + return cache diff --git a/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py b/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py new file mode 100644 index 0000000..28fd87e --- /dev/null +++ b/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py @@ -0,0 +1,68 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. 
It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch + +from .base import RotaryEmbedding + + +class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding): + """RotaryEmbedding extended with Dynamic NTK scaling. + + Credits to the Reddit users /u/bloc97 and /u/emozilla + """ + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + scaling_factor: float, + dtype: torch.dtype, + ) -> None: + self.scaling_factor = scaling_factor + super().__init__( + head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype + ) + + def _compute_cos_sin_cache(self) -> torch.Tensor: + # NOTE(woosuk): self.max_position_embeddings is the original + # maximum length before applying the rope scaling. + # Thus, the maximum length after applying the rope scaling is + # self.max_position_embeddings * self.scaling_factor. 
+ max_len = self.max_position_embeddings * self.scaling_factor + base = self.base * ( + (self.scaling_factor * max_len / self.max_position_embeddings) + - (self.scaling_factor - 1) + ) ** (self.rotary_dim / (self.rotary_dim - 2)) + inv_freq = self._compute_inv_freq(base) + t = torch.arange(max_len, dtype=torch.float) + + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = freqs.cos() + sin = freqs.sin() + cache = torch.cat((cos, sin), dim=-1) + return cache diff --git a/model_executor/layers/rotary_embedding/ernie45_vl_rope.py b/model_executor/layers/rotary_embedding/ernie45_vl_rope.py new file mode 100644 index 0000000..749cdbe --- /dev/null +++ b/model_executor/layers/rotary_embedding/ernie45_vl_rope.py @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import torch + +from .common import apply_rotary_emb_dispatch +from .mrope import MRotaryEmbedding + + +class Ernie4_5_VLRotaryEmbedding(MRotaryEmbedding): + """3D rotary positional embedding. 3D is t:time h:height w:width""" + + def forward_native( # type: ignore[override] + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + assert positions.ndim == 1 or positions.ndim == 2 + assert key is not None + + num_tokens = positions.shape[-1] + cos_sin = self.cos_sin_cache[positions] + cos, sin = cos_sin.chunk(2, dim=-1) + if positions.ndim == 2: + assert self.mrope_section + + section_h = self.mrope_section[0] # 22 + section_w = self.mrope_section[1] # 22 + section_t = self.mrope_section[2] # 20 + assert section_h == section_w + # Split according to [h w h w h w h w... t t t...] 
+ section_cos_t = cos[..., -section_t:] + section_cos_h = cos[..., : section_h + section_w : 2] + section_cos_w = cos[..., 1 : section_h + section_w : 2] + + cos_t, cos_h, cos_w = section_cos_t[0], section_cos_h[1], section_cos_w[2] + cos_hw = torch.stack([cos_h, cos_w], dim=-1).reshape( + cos_h.shape[:-1] + (cos_h.shape[-1] * 2,) + ) + cos = torch.cat([cos_hw, cos_t], dim=-1) + + section_sin_t = sin[..., -section_t:] + section_sin_h = sin[..., : section_h + section_w : 2] + section_sin_w = sin[..., 1 : section_h + section_w : 2] + + sin_t, sin_h, sin_w = section_sin_t[0], section_sin_h[1], section_sin_w[2] + sin_hw = torch.stack([sin_h, sin_w], dim=-1).reshape( + sin_h.shape[:-1] + (sin_h.shape[-1] * 2,) + ) + sin = torch.cat([sin_hw, sin_t], dim=-1) + + query_shape = query.shape + query = query.view(num_tokens, -1, self.head_size) + query_rot = query[..., : self.rotary_dim] + query_pass = query[..., self.rotary_dim :] + query_rot = apply_rotary_emb_dispatch(query_rot, cos, sin, self.is_neox_style) + query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + + key_shape = key.shape + key = key.view(num_tokens, -1, self.head_size) + key_rot = key[..., : self.rotary_dim] + key_pass = key[..., self.rotary_dim :] + key_rot = apply_rotary_emb_dispatch(key_rot, cos, sin, self.is_neox_style) + key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + return query, key + + def forward_cuda( # type: ignore[override] + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + return self.forward_native(positions, query, key) diff --git a/model_executor/layers/rotary_embedding/linear_scaling_rope.py b/model_executor/layers/rotary_embedding/linear_scaling_rope.py new file mode 100644 index 0000000..bb51dcf --- /dev/null +++ b/model_executor/layers/rotary_embedding/linear_scaling_rope.py @@ -0,0 +1,115 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project + + +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch + +from .base import RotaryEmbedding + + +class LinearScalingRotaryEmbedding(RotaryEmbedding): + """RotaryEmbedding extended with linear scaling. + + It supports multiple scaling factors. Since multiple LoRA adapters may have + different scaling factors, we need multiple cos/sin caches. In this way, + instead of running rotary embedding kernel per lora, we can run multiple + lora in a batched way. + + In addition to that, we also keep the cos/sin cache for the scaling factor + of 1 (default) at all times. + + Exemplary for two scaling factors x=1, y and z with embeddings + [[x11, x12, ... x1m], ..., [xn1, xn2, ..., xnm]] and + [[y11, y12, ... y1o], ..., [yn1, yn2, ..., yno]], and + [[z11, z12, ... z1p], ..., [zn1, zn2, ..., znp]], + + we construct the cos/sin cache as follows: + [[x11, x12, ... x1m, y11, y12, ... y1o, z11, z12, ... 
z1p], + ... + [xn1, xn2, ... xnm, yn1, yn2, ... yno, zn1, zn2, ... znp]] + + We then use offsets to index into the cos/sin cache for + the respective scaling factors. + + The offset to cache can be accessed via `scaling_factor_to_offset` API. + + Credits to the Reddit user /u/kaiokendev + """ + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + scaling_factors: list[float] | float, + dtype: torch.dtype, + ) -> None: + if isinstance(scaling_factors, float): + scaling_factors = [scaling_factors] + self.scaling_factors: list[float] = scaling_factors # noqa + super().__init__( + head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype + ) + # Lazy initialized. + self._scaling_factor_to_offset: dict[float, int] + + def _compute_cos_sin_cache(self) -> torch.Tensor: + inv_freq = self._compute_inv_freq(self.base) + cache_list: list[torch.Tensor] = [] + # offsets to the next cache in a tensor. + # Each offset corresponds to the same index in scaling_factors. + offsets: list[int] = [] + for scaling_factor in self.scaling_factors: + # NOTE(woosuk): self.max_position_embeddings is the original + # maximum length before applying the rope scaling. + # Thus, the maximum length after applying the rope scaling is + # self.max_position_embeddings * self.scaling_factor. 
+ max_len = self.max_position_embeddings * scaling_factor + t = torch.arange(max_len, dtype=torch.float) + t = t / scaling_factor + + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = freqs.cos() + sin = freqs.sin() + cache = torch.cat((cos, sin), dim=-1) + if not cache_list: + offset = 0 + else: + last_offset = offsets[-1] + next_max_len = cache_list[-1].shape[0] + offset = last_offset + next_max_len + offsets.append(offset) + cache_list.append(cache) + self._scaling_factor_to_offset = { + float(scaling_factor): offsets[i] + for i, scaling_factor in enumerate(self.scaling_factors) + } + assert len(self.scaling_factors) == len(offsets) + return torch.cat(cache_list, dim=0) + + @property + def scaling_factor_to_offset(self) -> dict[float, int]: + return self._scaling_factor_to_offset diff --git a/model_executor/layers/rotary_embedding/llama3_rope.py b/model_executor/layers/rotary_embedding/llama3_rope.py new file mode 100644 index 0000000..ed9a603 --- /dev/null +++ b/model_executor/layers/rotary_embedding/llama3_rope.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import math + +import torch + +from .base import RotaryEmbedding + + +class Llama3RotaryEmbedding(RotaryEmbedding): + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + dtype: torch.dtype, + scaling_factor: float, + low_freq_factor: float, + high_freq_factor: float, + orig_max_position: int, + ) -> None: + self.scaling_factor = scaling_factor + self.low_freq_factor = low_freq_factor + self.high_freq_factor = high_freq_factor + self.orig_max_position = orig_max_position + super().__init__( + head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype + ) + + def _compute_inv_freq(self, base: float) -> torch.Tensor: + inv_freqs = super()._compute_inv_freq(base) + low_freq_wavelen = self.orig_max_position / self.low_freq_factor 
+ high_freq_wavelen = self.orig_max_position / self.high_freq_factor + + wave_len = 2 * math.pi / inv_freqs + if self.low_freq_factor != self.high_freq_factor: + smooth = (self.orig_max_position / wave_len - self.low_freq_factor) / ( + self.high_freq_factor - self.low_freq_factor + ) + else: + smooth = 0 + new_freqs = torch.where( + wave_len < high_freq_wavelen, + inv_freqs, + torch.where( + wave_len > low_freq_wavelen, + inv_freqs / self.scaling_factor, + (1 - smooth) * inv_freqs / self.scaling_factor + smooth * inv_freqs, + ), + ) + return new_freqs diff --git a/model_executor/layers/rotary_embedding/llama4_vision_rope.py b/model_executor/layers/rotary_embedding/llama4_vision_rope.py new file mode 100644 index 0000000..9fdac30 --- /dev/null +++ b/model_executor/layers/rotary_embedding/llama4_vision_rope.py @@ -0,0 +1,80 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import math + +import torch + +from .base import RotaryEmbeddingBase + + +class Llama4VisionRotaryEmbedding(RotaryEmbeddingBase): + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + dtype: torch.dtype, + ): + super().__init__( + head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype + ) + + def _compute_inv_freq(self, base: float) -> torch.Tensor: + inv_freqs = super()._compute_inv_freq(base) + inv_freqs = inv_freqs[: (self.rotary_dim // 2)] + return inv_freqs + + def _compute_cos_sin_cache(self) -> torch.Tensor: + inv_freq = self._compute_inv_freq(self.base) + + # self.max_position_embeddings here is number of image patches + # i.e. 
(image_size // patch_size) ** 2 + num_patches = self.max_position_embeddings + img_idx = torch.arange(num_patches, dtype=torch.int32).reshape(num_patches, 1) + img_idx = torch.cat([img_idx, img_idx[:1]], dim=0) + img_idx[-1, -1] = -2 # set to ID_CLS_TOKEN + num_patches_single_dim = int(math.sqrt(num_patches)) + frequencies_x = img_idx % num_patches_single_dim + frequencies_y = img_idx // num_patches_single_dim + freqs_x = ( + (frequencies_x + 1)[..., None] * inv_freq[None, None, :] + ).repeat_interleave(2, dim=-1) + freqs_y = ( + (frequencies_y + 1)[..., None] * inv_freq[None, None, :] + ).repeat_interleave(2, dim=-1) + freqs = torch.cat([freqs_x, freqs_y], dim=-1).float().contiguous()[..., ::2] + freqs = freqs.masked_fill(img_idx.reshape(-1, 1, 1) < 0, 0) + cache = torch.view_as_complex( + torch.stack([torch.cos(freqs), torch.sin(freqs)], dim=-1) + ) + return cache + + def forward_native( # type: ignore[override] + self, + query: torch.Tensor, + key: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + assert key is not None + # self.cos_sin_cache here is complex tensor so we cannot cast into + # query's dtype directly with self._match_cos_sin_cache_dtype + self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(query.device) + query_ = torch.view_as_complex(query.float().reshape(*query.shape[:-1], -1, 2)) + key_ = torch.view_as_complex(key.float().reshape(*key.shape[:-1], -1, 2)) + broadcast_shape = [ + d if i == 1 or i == (query_.ndim - 1) else 1 + for i, d in enumerate(query_.shape) + ] + freqs_ci = self.cos_sin_cache.view(*broadcast_shape) + query_out = torch.view_as_real(query_ * freqs_ci).flatten(3) + key_out = torch.view_as_real(key_ * freqs_ci).flatten(3) + return query_out.type_as(query), key_out.type_as(key) + + def forward_cuda( # type: ignore[override] + self, + query: torch.Tensor, + key: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + return self.forward_native(query, key) diff --git 
a/model_executor/layers/rotary_embedding/mrope.py b/model_executor/layers/rotary_embedding/mrope.py new file mode 100644 index 0000000..8699fa7 --- /dev/null +++ b/model_executor/layers/rotary_embedding/mrope.py @@ -0,0 +1,403 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import numpy as np +import torch + +from vllm.triton_utils import tl, triton + +from .base import RotaryEmbeddingBase +from .common import apply_rotary_emb_dispatch +from .yarn_scaling_rope import YaRNScalingRotaryEmbedding, yarn_get_mscale + + +@triton.jit +def _triton_mrope_forward( + q_ptr, + k_ptr, + cos, + sin, + num_tokens, + n_qh: tl.constexpr, + n_kh: tl.constexpr, + hd: tl.constexpr, + rd: tl.constexpr, + pad_n_qh: tl.constexpr, + pad_n_kh: tl.constexpr, + pad_hd: tl.constexpr, + mrope_section_t: tl.constexpr, + mrope_section_h: tl.constexpr, + mrope_section_w: tl.constexpr, + is_interleaved: tl.constexpr, +): + # Adapted from + # https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/qwen2vl_mrope.py + # This version supports flatten input tensors from vllm + # and supports cos and sin cache with shape (3, num_tokens, head_dim // 2) + # instead of (3, bsz, seq_len, head_dim), also supports interleaved rotary + pid = tl.program_id(0) + # locate start address + q_ptr = q_ptr + pid * (n_qh * hd) + k_ptr = k_ptr + pid * (n_kh * hd) + + # #################################################################### + # get the cos(mθ_{i...d/2}) and sin(mθ_{i...d/2}) for token position + # m of this program instance + # #################################################################### + # Note: cos and sin now have shape (3, num_tokens, head_dim // 2) + + # Updated stride calculation for half head_dim + half_rd = rd // 2 + t_cos = cos + pid * half_rd + h_cos = t_cos + num_tokens * half_rd + w_cos = h_cos + num_tokens * half_rd + t_sin = sin + pid * half_rd + h_sin = t_sin + num_tokens * half_rd + w_sin = h_sin + 
num_tokens * half_rd + + # Updated offsets for half head_dim + cos_offsets = tl.arange(0, pad_hd // 2) + if is_interleaved: + h_mask = ((cos_offsets % 3) == 1) & (cos_offsets <= 3 * mrope_section_h) + w_mask = ((cos_offsets % 3) == 2) & (cos_offsets <= 3 * mrope_section_w) + t_mask = ~(h_mask | w_mask) + else: + t_end = mrope_section_t + h_end = t_end + mrope_section_h + t_mask = cos_offsets < mrope_section_t + h_mask = (t_end <= cos_offsets) & (cos_offsets < h_end) + w_mask = (h_end <= cos_offsets) & (cos_offsets < half_rd) + + t_cos_row = tl.load(t_cos + cos_offsets, mask=t_mask, other=0) + h_cos_row = tl.load(h_cos + cos_offsets, mask=h_mask, other=0) + w_cos_row = tl.load(w_cos + cos_offsets, mask=w_mask, other=0) + t_sin_row = tl.load(t_sin + cos_offsets, mask=t_mask, other=0) + h_sin_row = tl.load(h_sin + cos_offsets, mask=h_mask, other=0) + w_sin_row = tl.load(w_sin + cos_offsets, mask=w_mask, other=0) + + cos_row = t_cos_row + h_cos_row + w_cos_row + sin_row = t_sin_row + h_sin_row + w_sin_row + + # #################################################################### + # Load the left and right half of q and k for the current + # program instance (i.e. 
for the current token) separately + # #################################################################### + # left half of the head + first_half_q_offsets = ( + tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :] + ) + first_half_k_offsets = ( + tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :] + ) + first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & ( + tl.arange(0, pad_hd // 2)[None, :] < rd // 2 + ) + first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & ( + tl.arange(0, pad_hd // 2)[None, :] < rd // 2 + ) + + q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to( + sin_row.dtype + ) + k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to( + sin_row.dtype + ) + + # right half of the head + second_half_q_offsets = first_half_q_offsets + (rd // 2) + second_half_k_offsets = first_half_k_offsets + (rd // 2) + second_q_mask = first_q_mask + second_k_mask = first_k_mask + + q_tile_2 = tl.load(q_ptr + second_half_q_offsets, mask=second_q_mask, other=0).to( + sin_row.dtype + ) + k_tile_2 = tl.load(k_ptr + second_half_k_offsets, mask=second_k_mask, other=0).to( + sin_row.dtype + ) + + # y = [x1, x2] * [cos, cos] + [-x2, x1] * [sin, sin] + # Since cos and sin are now half-size, + # we use the same cos_row and sin_row for both halves + new_q_tile_1 = q_tile_1 * cos_row - q_tile_2 * sin_row + tl.store(q_ptr + first_half_q_offsets, new_q_tile_1, mask=first_q_mask) + new_q_tile_2 = q_tile_2 * cos_row + q_tile_1 * sin_row + tl.store(q_ptr + second_half_q_offsets, new_q_tile_2, mask=second_q_mask) + + new_k_tile_1 = k_tile_1 * cos_row - k_tile_2 * sin_row + tl.store(k_ptr + first_half_k_offsets, new_k_tile_1, mask=first_k_mask) + new_k_tile_2 = k_tile_2 * cos_row + k_tile_1 * sin_row + tl.store(k_ptr + second_half_k_offsets, new_k_tile_2, mask=second_k_mask) + + +def triton_mrope( + q: torch.Tensor, + k: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + 
mrope_section: list[int], + head_size: int, + rotary_dim: int, + mrope_interleaved: bool, +) -> tuple[torch.Tensor, torch.Tensor]: + """Qwen2VL mrope kernel. + + Args: + q: [num_tokens, num_heads * head_size] + k: [num_tokens, num_kv_heads * head_size] + cos: [3, num_tokens, head_size //2 ] + (T/H/W positions with multimodal inputs) + sin: [3, num_tokens, head_size //2 ] + (T/H/W positions with multimodal inputs) + mrope_section: [t, h, w] + head_size: int + """ + n_row, n_q_head_head_dim = q.shape + n_q_head = n_q_head_head_dim // head_size + n_kv_head = k.shape[1] // head_size + pad_hd = triton.next_power_of_2(head_size) + pad_n_q_head = triton.next_power_of_2(n_q_head) + pad_n_kv_head = triton.next_power_of_2(n_kv_head) + + # ensure tensors passed into the kernel are contiguous. + # It will be no-op if they are already contiguous + q = q.contiguous() + k = k.contiguous() + cos = cos.contiguous() + sin = sin.contiguous() + + _triton_mrope_forward[(n_row,)]( + q, + k, + cos, + sin, + n_row, + n_q_head, + n_kv_head, + head_size, + rotary_dim, + pad_n_q_head, + pad_n_kv_head, + pad_hd, + mrope_section[0], + mrope_section[1], + mrope_section[2], + mrope_interleaved, + ) + return q, k + + +def apply_interleaved_rope(x: torch.Tensor, mrope_section: list[int]) -> torch.Tensor: + """Apply interleaved MRoPE to 3D rotary embeddings. + Reorganizes frequency layout from chunked [TTT...HHH...WWW] to + interleaved [THTHWHTHW...TT], preserving frequency continuity. 
+ """ + x_t = x[0].clone() + x_t[..., 1 : mrope_section[1] * 3 : 3] = x[1, ..., 1 : mrope_section[1] * 3 : 3] + x_t[..., 2 : mrope_section[2] * 3 : 3] = x[2, ..., 2 : mrope_section[2] * 3 : 3] + return x_t + + +class MRotaryEmbedding(RotaryEmbeddingBase): + """Rotary Embedding with Multimodal Sections.""" + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + dtype: torch.dtype, + mrope_section: list[int] | None = None, + mrope_interleaved: bool = False, + # YaRN parameters. + *, + scaling_factor: float | None = None, + extrapolation_factor: float = 1, + attn_factor: float = 1, + beta_fast: int = 32, + beta_slow: int = 1, + ) -> None: + self.scaling_factor = scaling_factor + self.extrapolation_factor = extrapolation_factor + self.attn_factor = attn_factor + self.beta_fast = beta_fast + self.beta_slow = beta_slow + if self.scaling_factor is not None: + # Get n-d magnitude scaling corrected for interpolation + self.mscale = float(yarn_get_mscale(self.scaling_factor) * attn_factor) + else: + self.mscale = 1.0 + + # In Qwen2.5-VL, the maximum index value is related to the duration of + # the input video. We enlarge max_position_embeddings to 4 times to get + # a larger the cos and sin cache. 
+ self.cache_max_position_num = max_position_embeddings * 4 + super().__init__( + head_size, + rotary_dim, + self.cache_max_position_num, + base, + is_neox_style, + dtype, + ) + + self.mrope_section = mrope_section + self.mrope_interleaved = mrope_interleaved + if self.mrope_section: + assert sum(self.mrope_section) == rotary_dim // 2 + + def _compute_inv_freq(self, base: float) -> torch.Tensor: + if self.scaling_factor is None: + return super()._compute_inv_freq(base) + return YaRNScalingRotaryEmbedding._compute_inv_freq(self, base) + + def _compute_cos_sin_cache(self) -> torch.Tensor: + if self.scaling_factor is None: + return super()._compute_cos_sin_cache() + return YaRNScalingRotaryEmbedding._compute_cos_sin_cache(self) + + def forward_native( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor | None = None, + offsets: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + """PyTorch-native implementation equivalent to forward(). + + Args: + positions: + [num_tokens,] (text only) or + [3, num_tokens] (T/H/W positions with multimodal inputs) + query: [num_tokens, num_heads * head_size] + key: [num_tokens, num_kv_heads * head_size] + """ + assert positions.ndim == 1 or positions.ndim == 2 + assert key is not None + + self._match_cos_sin_cache_dtype(query) + num_tokens = positions.shape[-1] + cos_sin = self.cos_sin_cache[positions] + cos, sin = cos_sin.chunk(2, dim=-1) + if positions.ndim == 2: + assert self.mrope_section + if self.mrope_interleaved: + cos = apply_interleaved_rope(cos, self.mrope_section) + sin = apply_interleaved_rope(sin, self.mrope_section) + else: + cos = torch.cat( + [m[i] for i, m in enumerate(cos.split(self.mrope_section, dim=-1))], + dim=-1, + ) + sin = torch.cat( + [m[i] for i, m in enumerate(sin.split(self.mrope_section, dim=-1))], + dim=-1, + ) + + query_shape = query.shape + query = query.view(num_tokens, -1, self.head_size) + query_rot = query[..., : self.rotary_dim] + query_pass = 
query[..., self.rotary_dim :] + query_rot = apply_rotary_emb_dispatch(query_rot, cos, sin, self.is_neox_style) + query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + + key_shape = key.shape + key = key.view(num_tokens, -1, self.head_size) + key_rot = key[..., : self.rotary_dim] + key_pass = key[..., self.rotary_dim :] + key_rot = apply_rotary_emb_dispatch(key_rot, cos, sin, self.is_neox_style) + key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + return query, key + + def forward_cuda( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor | None = None, + offsets: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + assert positions.ndim == 1 or positions.ndim == 2 + assert key is not None + from vllm import _custom_ops as ops + + self._match_cos_sin_cache_dtype(query) + + if self.mrope_interleaved: + num_tokens = positions.shape[-1] + cos_sin = self.cos_sin_cache[positions] + cos, sin = cos_sin.chunk(2, dim=-1) + query_shape = query.shape + key_shape = key.shape + if positions.ndim == 2: + assert self.mrope_section + q, k = triton_mrope( + query, + key, + cos, + sin, + self.mrope_section, + self.head_size, + self.rotary_dim, + self.mrope_interleaved, + ) + + return q.reshape(query_shape), k.reshape(key_shape) + + if positions.ndim == 1: + ops.rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, self.is_neox_style) + else: + if self.is_neox_style: + ops.m_rotary_embedding(positions.contiguous(), query, key, self.head_size, + self.cos_sin_cache, + torch.tensor(self.mrope_section, dtype=torch.int), + self.is_neox_style) + else: + query, key = self.forward_native( + positions, query, key + ) + + + return query, key + + def forward_cpu( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor | None = None, + offsets: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + return self.forward_native(positions, query, 
key, offsets) + + @staticmethod + def get_next_input_positions( + mrope_position_delta: int, + context_len: int, + seq_len: int, + ) -> list[list[int]]: + return [ + list( + range( + context_len + mrope_position_delta, seq_len + mrope_position_delta + ) + ) + for _ in range(3) + ] + + @staticmethod + def get_next_input_positions_tensor( + out: np.ndarray, + out_offset: int, + mrope_position_delta: int, + context_len: int, + num_new_tokens: int, + ): + values = np.arange( + mrope_position_delta + context_len, + mrope_position_delta + context_len + num_new_tokens, + dtype=out.dtype, + ) + out[:, out_offset : out_offset + num_new_tokens] = values diff --git a/model_executor/layers/rotary_embedding/ntk_scaling_rope.py b/model_executor/layers/rotary_embedding/ntk_scaling_rope.py new file mode 100644 index 0000000..031a12f --- /dev/null +++ b/model_executor/layers/rotary_embedding/ntk_scaling_rope.py @@ -0,0 +1,47 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import torch + +from .base import RotaryEmbedding + + +class NTKScalingRotaryEmbedding(RotaryEmbedding): + """RotaryEmbedding extended with fixed and mixed NTK scaling. 
+ https://kexue.fm/archives/9706""" + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + scaling_factor: float, + dtype: torch.dtype, + mixed_b: float | None = None, + ) -> None: + self.scaling_factor = scaling_factor + self.mixed_b = mixed_b + super().__init__( + head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype + ) + + def _compute_inv_freq(self, base: float) -> torch.Tensor: + base = self.base * (self.scaling_factor if self.mixed_b is None else 1) + inv_freq = super()._compute_inv_freq(base) + + if self.mixed_b is None: + inv_freq = inv_freq / self.scaling_factor ** (2 / self.rotary_dim) + else: + a = ( + torch.tensor(self.scaling_factor).log() + / (self.rotary_dim / 2) ** self.mixed_b + ) + lambda_1_m = ( + a * torch.arange(1, self.rotary_dim // 2 + 1).float() ** self.mixed_b + ).exp() + inv_freq = inv_freq / lambda_1_m + + return inv_freq diff --git a/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py b/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py new file mode 100644 index 0000000..5e519cf --- /dev/null +++ b/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py @@ -0,0 +1,151 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import math + +import torch +import torch.nn as nn + +from vllm.config import get_current_vllm_config +from vllm.logger import init_logger + +from .common import rotate_neox + +logger = init_logger(__name__) + +import ixformer.inference.functions as ixops + +class Phi3LongRoPEScaledRotaryEmbedding(nn.Module): + """Phi3 family of models scaled rotary embedding. + + Based on the original RotaryEmbedding implementation. 
+ """ + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + original_max_position_embeddings: int, + base: float, + is_neox_style: bool, + dtype: torch.dtype, + short_factor: list[float], + long_factor: list[float], + short_mscale: float | None = None, + long_mscale: float | None = None, + ): + super().__init__() + + if is_neox_style is False: + raise ValueError( + "`Phi3LongRoPEScaledRotaryEmbedding` only supports neox_style." + ) + + self.rotary_dim = rotary_dim + self.head_size = head_size + self.max_position_embeddings = max_position_embeddings + self.original_max_position_embeddings = original_max_position_embeddings + self.base = base + self.short_factor = short_factor + self.long_factor = long_factor + + # Force long factors if max_model_len (runtime max length) exceeds + # original_max_position_embeddings to prevent KV cache invalidation when + # sequences cross this threshold during generation + max_model_len = get_current_vllm_config().model_config.max_model_len + self.use_long_rope = max_model_len > original_max_position_embeddings + if self.use_long_rope: + logger.warning_once( + "Using LongRoPE scaling factors. This enables longer " + "contexts (%d tokens vs original %d tokens) at the cost of " + "some performance degradation for shorter sequences. 
If " + "this is not desired, set `max_model_len` to be at most %d.", + max_position_embeddings, + original_max_position_embeddings, + original_max_position_embeddings, + ) + + scale = self.max_position_embeddings / self.original_max_position_embeddings + if scale <= 1.0: + scaling_factor = 1.0 + else: + scaling_factor = math.sqrt( + 1 + math.log(scale) / math.log(self.original_max_position_embeddings) + ) + if short_mscale is None: + short_mscale = scaling_factor + if long_mscale is None: + long_mscale = scaling_factor + + self.short_mscale = short_mscale + self.long_mscale = long_mscale + + short_cache = self._compute_cos_sin_cache( + original_max_position_embeddings, short_factor, short_mscale + ) + short_cache = short_cache.to(dtype) + + long_cache = self._compute_cos_sin_cache( + max_position_embeddings, long_factor, long_mscale + ) + long_cache = long_cache.to(dtype) + + long_short_cache = torch.cat([short_cache, long_cache], dim=0) + self.register_buffer( + "long_short_cos_sin_cache", long_short_cache, persistent=False + ) + + def _compute_inv_freq(self, rescale_factors: list[float]) -> torch.Tensor: + rescale_factors = torch.tensor(rescale_factors, dtype=torch.float32) + inv_freq = 1.0 / ( + rescale_factors + * ( + self.base + ** ( + torch.arange(0, self.rotary_dim, 2, dtype=torch.float) + / self.rotary_dim + ) + ) + ) + return inv_freq + + def _compute_cos_sin_cache( + self, + max_position_embeddings: int, + rescale_factors: list[float], + mscale: float, + ) -> torch.Tensor: + inv_freq = self._compute_inv_freq(rescale_factors) + t = torch.arange(max_position_embeddings, dtype=torch.float) + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = freqs.cos() * mscale + sin = freqs.sin() * mscale + cache = torch.cat((cos, sin), dim=-1) + return cache + + def forward( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor | None = None, + offsets: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + assert key is 
not None + query = query.view(*query.shape[:-1], -1, self.head_size) + key = key.view(*key.shape[:-1], -1, self.head_size) + + k = self.original_max_position_embeddings + long_prompt_offset = torch.any(positions > k) + + ixops.vllm_rotary_embedding_phi( + positions, + query, + key, + self.head_size, + self.long_short_cos_sin_cache, + long_prompt_offset, + k, + offsets + ) + + return query, key diff --git a/model_executor/layers/rotary_embedding/yarn_scaling_rope.py b/model_executor/layers/rotary_embedding/yarn_scaling_rope.py new file mode 100644 index 0000000..ff46ad7 --- /dev/null +++ b/model_executor/layers/rotary_embedding/yarn_scaling_rope.py @@ -0,0 +1,81 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +from .base import RotaryEmbedding +from .common import yarn_find_correction_range, yarn_get_mscale, yarn_linear_ramp_mask + + +class YaRNScalingRotaryEmbedding(RotaryEmbedding): + """RotaryEmbedding extended with YaRN method. + + Credits to Peng et al. 
github.com/jquesnelle/yarn + """ + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + scaling_factor: float, + dtype: torch.dtype, + *, + extrapolation_factor: float = 1, + attn_factor: float = 1, + beta_fast: int = 32, + beta_slow: int = 1, + apply_yarn_scaling: bool = True, + ) -> None: + self.scaling_factor = scaling_factor + self.extrapolation_factor = extrapolation_factor + self.attn_factor = attn_factor + self.beta_fast = beta_fast + self.beta_slow = beta_slow + # Get n-d magnitude scaling corrected for interpolation + self.mscale = ( + float(yarn_get_mscale(self.scaling_factor) * attn_factor) + if apply_yarn_scaling + else float(attn_factor) + ) + super().__init__( + head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype + ) + + def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor: + pos_freqs = self.base ** ( + torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim + ) + inv_freq_extrapolation = 1.0 / pos_freqs + inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs) + + low, high = yarn_find_correction_range( + self.beta_fast, + self.beta_slow, + self.rotary_dim, + self.base, + self.max_position_embeddings, + ) + # Get n-d rotational scaling corrected for extrapolation + inv_freq_mask = ( + 1 + - yarn_linear_ramp_mask(low, high, self.rotary_dim // 2, dtype=torch.float) + ) * self.extrapolation_factor + inv_freq = ( + inv_freq_interpolation * (1 - inv_freq_mask) + + inv_freq_extrapolation * inv_freq_mask + ) + return inv_freq + + def _compute_cos_sin_cache(self) -> torch.Tensor: + inv_freq = self._compute_inv_freq(self.scaling_factor) + t = torch.arange( + self.max_position_embeddings * self.scaling_factor, dtype=torch.float32 + ) + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = freqs.cos() * self.mscale + sin = freqs.sin() * self.mscale + cache = torch.cat((cos, sin), dim=-1) + return cache diff --git 
a/model_executor/layers/shared_fused_moe/__pycache__/shared_fused_moe.cpython-312.pyc b/model_executor/layers/shared_fused_moe/__pycache__/shared_fused_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..96ee8ecebc27cda549ee404201fa3b56ce53f35d GIT binary patch literal 2514 zcma)8O>7fK6rNr0uGfx#0tI51m@N4bIFMMXD*gnNN>Bt6;SefSR;1O&JH{K<-eqRzkj=e7MGE!_O#_WgUic!#9iztSp zOk$&QU7=-TwmJ+1ZkTgsssM&2v0_wQOiZlZ0;SuATVk3;W@F2ZZaB8)7D})AqbG1& zZ2M7p3E!0(#OftLU$t(5Va>L&UI-A$%0W@y1;5R6K)&P_kPAri$GlKSp9+8r&Or6h zH{54{{5n@hD!;X(&I8xBRzq2SY#C(*Pjsq~B~MmWqhh$K>Lu0AkVphfWJOPKuvsJ& z#Z2N!H*Ra>y5mJuwP0$FqpA)Kkd|%suR_i(+d12?M_ky1-I3|)@slTR9LG7+n9SKV zw`5gvw@kB~^Dh#f#)T@>B4=tfOq^U00?Q`cHT)03s9hs*a5vL4-T|D^t#iBui_Q>` zSF`9v3`G+2;ijBw%KeMS*5v+;zLAFTSnl08aI_)(B=6sZjmVP-cB`aPXr~o~bWYQ@ zc?`$`s-r3R?i?fr@^-4XSI43K0KB=eEYFfg60nx#ND@8{wXBfcKv`C2GCNMvS%wT% z*$-qEJvnq_{m`kkL#G<F#g$~Co(|2`E8c+cC&6$fX@ZV0G@sc{uidSf%LN zF}7L~>2fLD*{FxMeYx$1I4=V1Au|AV{#$+}sK6be9;ypPejNSR&I!=C6WZRN4UIjc zM}l(ggAeJUb9uXTUcztk(Cb;V@xrdHI>rq44wkEq8$1D>7f6NlLzi+0%@3iWd*Px* z;PU8Mi6LRzmlq9+4mw`MM?Xx-F3@sl9_j5;W_{35M9l2l0h++_L8|Z-5I}z`in{l# zC;Qiu{Tu1bdU~Xp9(hoHls?_K+>+6*z3bfrYuy76gom-^*mpPA4xiXa^{%JRHdALG zr_TMHIrIJ0qs*nd(O*-UrSMX6O4Yl$d*coT)viN>YYZq)tu;>p$I;Ho_MB7%m7 z?_a!swY_iXUD!92XvPjUE-yyFYNPg0qC@{)Wnxs+^) zz_%on&b*{6tvyIew0KUuz7wlb+s%`WN+< z2y?KK$}lcWIT=Nhjq2z1`TF9A_a>GmR+HIgWcbertUX6SSRPrVS=p1^nhlScfA~QN z+Uur`b`;(sml%`K3v>oWZ Optional[torch.nn.Module]: + return self._shared_experts if self.use_overlapped else None + + def forward( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + if not self.use_overlapped: + shared_out = self._shared_experts(hidden_states) + + # Reduce outputs if necessary, since the MLP should + # have been created with reduce_results=False. 
+ if (self.reduce_results and self.tp_size > 1 + and self.must_reduce_shared_expert_outputs()): + shared_out = tensor_model_parallel_all_reduce(shared_out) + + fused_out = super().forward( + hidden_states=hidden_states, + router_logits=router_logits, + ) + else: + shared_out, fused_out = super().forward( + hidden_states=hidden_states, + router_logits=router_logits, + ) + return shared_out, fused_out diff --git a/model_executor/layers/utils.py b/model_executor/layers/utils.py new file mode 100644 index 0000000..2dd1129 --- /dev/null +++ b/model_executor/layers/utils.py @@ -0,0 +1,253 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Utility methods for model layers.""" + +from collections.abc import Callable + +import torch + +from vllm import _custom_ops as ops +from vllm import envs +from vllm._aiter_ops import rocm_aiter_ops +from vllm.logger import init_logger +from vllm.platforms import CpuArchEnum, current_platform +from vllm.utils.platform_utils import get_cu_count +from vllm.utils.torch_utils import direct_register_custom_op +import ixformer.inference.functions as IXF + +logger = init_logger(__name__) + + +def shuffle_weight(w: torch.Tensor) -> torch.Tensor: + # Shuffle weight along the last dimension so that + # we folded the weights to adjance location + # Example: + # input: + # [[1, 2, 3, 4, 5, 6], + # [7, 8, 9, 10, 11, 12]] + # output: + # [[1, 4, 2, 5, 3, 6], + # [7, 10, 8, 11, 9, 12]] + # This will be used together with triton swiglu kernel + shape = w.shape + N = shape[-1] + first = w[..., : N // 2] + second = w[..., N // 2 :] + + stacked = torch.stack((first, second), dim=-1) + w_shuffled = stacked.reshape(shape) + return w_shuffled + +def get_token_bin_counts_and_mask( + tokens: torch.Tensor, + vocab_size: int, + num_seqs: int, +) -> tuple[torch.Tensor, torch.Tensor]: + # Compute the bin counts for the tokens. + # vocab_size + 1 for padding. 
+ bin_counts = torch.zeros( + (num_seqs, vocab_size + 1), dtype=torch.long, device=tokens.device + ) + bin_counts.scatter_add_(1, tokens, torch.ones_like(tokens)) + bin_counts = bin_counts[:, :vocab_size] + mask = bin_counts > 0 + + return bin_counts, mask + + +def apply_penalties( + logits: torch.Tensor, + prompt_tokens_tensor: torch.Tensor, + output_tokens_tensor: torch.Tensor, + presence_penalties: torch.Tensor, + frequency_penalties: torch.Tensor, + repetition_penalties: torch.Tensor, +) -> torch.Tensor: + """ + Applies penalties in place to the logits tensor + logits : The input logits tensor of shape [num_seqs, vocab_size] + prompt_tokens_tensor: A tensor containing the prompt tokens. The prompts + are padded to the maximum prompt length within the batch using + `vocab_size` as the padding value. The value `vocab_size` is used + for padding because it does not correspond to any valid token ID + in the vocabulary. + output_tokens_tensor: The output tokens tensor. + presence_penalties: The presence penalties of shape (num_seqs, ) + frequency_penalties: The frequency penalties of shape (num_seqs, ) + repetition_penalties: The repetition penalties of shape (num_seqs, ) + """ + num_seqs, vocab_size = logits.shape + _, prompt_mask = get_token_bin_counts_and_mask( + prompt_tokens_tensor, vocab_size, num_seqs + ) + output_bin_counts, output_mask = get_token_bin_counts_and_mask( + output_tokens_tensor, vocab_size, num_seqs + ) + + # Apply repetition penalties as a custom op + from vllm._custom_ops import apply_repetition_penalties + + apply_repetition_penalties(logits, prompt_mask, output_mask, repetition_penalties) + + # We follow the definition in OpenAI API. 
+ # Refer to https://platform.openai.com/docs/api-reference/parameter-details + logits -= frequency_penalties.unsqueeze(dim=1) * output_bin_counts + logits -= presence_penalties.unsqueeze(dim=1) * output_mask + return logits + + +def default_unquantized_gemm( + layer: torch.nn.Module, + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor | None = None, +): + if x.dtype == torch.float32: + return torch.nn.functional.linear(x, weight, bias) + return IXF.linear(x, weight, bias) + + +def use_aiter_triton_gemm(n, m, k, dtype): + if ( + not rocm_aiter_ops.is_triton_gemm_enabled() + # MI300's - fp8nuz=True + or current_platform.is_fp8_fnuz() + or dtype not in [torch.float16, torch.bfloat16] + ): + return False + + # use hipblaslt for the larger GEMMs + if n > 2048 and m > 512: + return False + return ( + (m == 5120 and k == 2880) + or (m == 2880 and k == 4096) + or (m == 128 and k == 2880) + or (m == 640 and k == 2880) + or (m == 2880 and k == 512) + ) + + +def rocm_unquantized_gemm_impl( + x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor | None = None +) -> torch.Tensor: + from vllm.platforms.rocm import on_gfx9 + + n = x.numel() / x.size(-1) + m = weight.shape[0] + k = weight.shape[1] + + if use_aiter_triton_gemm(n, m, k, x.dtype): + from aiter.ops.triton.gemm_a16w16 import gemm_a16w16 + + return gemm_a16w16(x, weight, bias) + + use_skinny = ( + envs.VLLM_ROCM_USE_SKINNY_GEMM + and on_gfx9() + and x.dtype in [torch.float16, torch.bfloat16] + and k % 8 == 0 + ) + + if use_skinny is not True: + return torch.nn.functional.linear(x, weight, bias) + + x_view = x.reshape(-1, x.size(-1)) + if m > 8 and 0 < n <= 4: + cu_count = get_cu_count() + out = ops.wvSplitK(weight, x_view, cu_count, bias) + return out.reshape(*x.shape[:-1], weight.shape[0]) + elif m % 4 == 0 and n == 1 and k <= 8192 and bias is None: + out = ops.LLMM1(weight, x_view, 4) + return out.reshape(*x.shape[:-1], weight.shape[0]) + return torch.nn.functional.linear(x, weight, bias) + + +def 
rocm_unquantized_gemm_fake( + x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor | None = None +) -> torch.Tensor: + return x.new_empty((*x.shape[:-1], weight.shape[0])) + + +def rocm_unquantized_gemm( + layer: torch.nn.Module, + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor | None = None, +) -> torch.Tensor: + return torch.ops.vllm.rocm_unquantized_gemm(x, weight, bias) + + +direct_register_custom_op( + op_name="rocm_unquantized_gemm", + op_func=rocm_unquantized_gemm_impl, + fake_impl=rocm_unquantized_gemm_fake, +) + + +def check_cpu_sgl_kernel(n: int, k: int, dtype: torch.dtype) -> bool: + return ( + torch._C._cpu._is_amx_tile_supported() + and (dtype in (torch.bfloat16, torch.int8)) + and k % 32 == 0 + and n % 16 == 0 + ) + + +def dispatch_cpu_unquantized_gemm( + layer: torch.nn.Module, + remove_weight: bool, +) -> None: + N, K = layer.weight.size() + dtype = layer.weight.dtype + if envs.VLLM_CPU_SGL_KERNEL and check_cpu_sgl_kernel(N, K, dtype): + packed_weight = torch.ops._C.convert_weight_packed(layer.weight) + if getattr(layer, "bias", None) is not None: + bias_f32 = layer.bias.to(torch.float32) + else: + bias_f32 = None + layer.cpu_linear = lambda x, weight, bias: torch.ops._C.weight_packed_linear( + x, packed_weight, bias_f32 if bias is not None else None, True + ) + if remove_weight: + layer.weight = torch.nn.Parameter(torch.empty(0), requires_grad=False) + return + elif ( + ops._supports_onednn + and current_platform.get_cpu_architecture() != CpuArchEnum.POWERPC + ): + try: + origin_weight = layer.weight + handler = ops.create_onednn_mm(origin_weight.t(), 32) + layer.cpu_linear = lambda x, weight, bias: ops.onednn_mm(handler, x, bias) + if remove_weight: + layer.weight = torch.nn.Parameter(torch.empty(0), requires_grad=False) + return + except RuntimeError as e: + logger.warning_once( + "Failed to create oneDNN linear, fallback to torch linear." 
+ f" Exception: {e}" + ) + + # fallback case + layer.cpu_linear = lambda x, weight, bias: torch.nn.functional.linear( + x, weight, bias + ) + + +def cpu_unquantized_gemm( + layer: torch.nn.Module, + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor | None = None, +): + return layer.cpu_linear(x, weight, bias) + + +def dispatch_unquantized_gemm() -> Callable[..., torch.Tensor]: + if current_platform.is_rocm(): + return rocm_unquantized_gemm + elif current_platform.is_cpu(): + return cpu_unquantized_gemm + else: + return default_unquantized_gemm \ No newline at end of file diff --git a/model_executor/layers/vocab_parallel_embedding.py b/model_executor/layers/vocab_parallel_embedding.py new file mode 100644 index 0000000..1abc3ad --- /dev/null +++ b/model_executor/layers/vocab_parallel_embedding.py @@ -0,0 +1,558 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Sequence +from dataclasses import dataclass + +import torch +import torch.nn.functional as F +from torch.nn.parameter import Parameter, UninitializedParameter + +from vllm.distributed import ( + divide, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_reduce, +) +from vllm.model_executor.custom_op import CustomOp +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, + QuantizeMethodBase, + method_has_implemented_embedding, +) +from vllm.model_executor.layers.utils import dispatch_unquantized_gemm +from vllm.model_executor.parameter import BasevLLMParameter +from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform + +DEFAULT_VOCAB_PADDING_SIZE = 64 + + +class UnquantizedEmbeddingMethod(QuantizeMethodBase): + """Unquantized method for embeddings.""" + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + 
input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + """Create weights for embedding layer.""" + weight = Parameter( + torch.empty( + sum(output_partition_sizes), + input_size_per_partition, + dtype=params_dtype, + ), + requires_grad=False, + ) + set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + layer.register_parameter("weight", weight) + set_weight_attrs(weight, extra_weight_attrs) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + if current_platform.is_cpu(): + from vllm.model_executor.layers.utils import dispatch_cpu_unquantized_gemm + + dispatch_cpu_unquantized_gemm(layer, remove_weight=False) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + return dispatch_unquantized_gemm()(layer, x, layer.weight, bias) + + def embedding(self, layer: torch.nn.Module, input_: torch.Tensor) -> torch.Tensor: + return F.embedding(input_, layer.weight) + + +def pad_vocab_size(vocab_size: int, pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int: + """Pad the vocab size to the given value.""" + return ((vocab_size + pad_to - 1) // pad_to) * pad_to + + +def vocab_range_from_per_partition_vocab_size( + per_partition_vocab_size: int, rank: int, offset: int = 0 +) -> Sequence[int]: + index_f = rank * per_partition_vocab_size + index_l = index_f + per_partition_vocab_size + return index_f + offset, index_l + offset + + +def vocab_range_from_global_vocab_size( + global_vocab_size: int, rank: int, world_size: int, offset: int = 0 +) -> Sequence[int]: + per_partition_vocab_size = divide(global_vocab_size, world_size) + return vocab_range_from_per_partition_vocab_size( + per_partition_vocab_size, rank, offset=offset + ) + + +@dataclass +class VocabParallelEmbeddingShardIndices: + """Indices for a shard of a vocab parallel embedding.""" + + padded_org_vocab_start_index: int + padded_org_vocab_end_index: int + 
padded_added_vocab_start_index: int + padded_added_vocab_end_index: int + + org_vocab_start_index: int + org_vocab_end_index: int + added_vocab_start_index: int + added_vocab_end_index: int + + @property + def num_org_elements(self) -> int: + return self.org_vocab_end_index - self.org_vocab_start_index + + @property + def num_added_elements(self) -> int: + return self.added_vocab_end_index - self.added_vocab_start_index + + @property + def num_org_elements_padded(self) -> int: + return self.padded_org_vocab_end_index - self.padded_org_vocab_start_index + + @property + def num_added_elements_padded(self) -> int: + return self.padded_added_vocab_end_index - self.padded_added_vocab_start_index + + @property + def num_org_vocab_padding(self) -> int: + return self.num_org_elements_padded - self.num_org_elements + + @property + def num_added_vocab_padding(self) -> int: + return self.num_added_elements_padded - self.num_added_elements + + @property + def num_elements_padded(self) -> int: + return self.num_org_elements_padded + self.num_added_elements_padded + + def __post_init__(self): + # sanity checks + assert self.padded_org_vocab_start_index <= self.padded_org_vocab_end_index + assert self.padded_added_vocab_start_index <= self.padded_added_vocab_end_index + + assert self.org_vocab_start_index <= self.org_vocab_end_index + assert self.added_vocab_start_index <= self.added_vocab_end_index + + assert self.org_vocab_start_index <= self.padded_org_vocab_start_index + assert self.added_vocab_start_index <= self.padded_added_vocab_start_index + assert self.org_vocab_end_index <= self.padded_org_vocab_end_index + assert self.added_vocab_end_index <= self.padded_added_vocab_end_index + + assert self.num_org_elements <= self.num_org_elements_padded + assert self.num_added_elements <= self.num_added_elements_padded + + +@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) +def get_masked_input_and_mask( + input_: torch.Tensor, + org_vocab_start_index: 
int, + org_vocab_end_index: int, + num_org_vocab_padding: int, + added_vocab_start_index: int, + added_vocab_end_index: int, +) -> tuple[torch.Tensor, torch.Tensor]: + # torch.compile will fuse all of the pointwise ops below + # into a single kernel, making it very fast + org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index) + added_vocab_mask = (input_ >= added_vocab_start_index) & ( + input_ < added_vocab_end_index + ) + added_offset = ( + added_vocab_start_index + - (org_vocab_end_index - org_vocab_start_index) + - num_org_vocab_padding + ) + valid_offset = (org_vocab_start_index * org_vocab_mask) + ( + added_offset * added_vocab_mask + ) + vocab_mask = org_vocab_mask | added_vocab_mask + input_ = vocab_mask * (input_ - valid_offset) + return input_, ~vocab_mask + + +@CustomOp.register("vocab_parallel_embedding") +class VocabParallelEmbedding(CustomOp): + """Embedding parallelized in the vocabulary dimension. + + Adapted from torch.nn.Embedding, note that we pad the vocabulary size to + make sure it is divisible by the number of model parallel GPUs. + + In order to support various loading methods, we ensure that LoRA-added + embeddings are always at the end of TP-sharded tensors. In other words, + we shard base embeddings and LoRA embeddings separately (both padded), + and place them in the same tensor. + In this example, we will have the original vocab size = 1010, + added vocab size = 16 and padding to 64. Therefore, the total + vocab size with padding will be 1088 (because we first pad 1010 to + 1024, add 16, and then pad to 1088). + Therefore, the tensor format looks like the following: + TP1, rank 0 (no sharding): + |< --------BASE-------- >|< -BASE PADDING-- >|< -----LORA------ >|< -LORA PADDING-- >| + corresponding token_id: | 0 | 1 | ... | 1009 | -1 | ... | -1 | 1010 | ... | 1025 | -1 | ... | -1 | + index: | 0 | 1 | ... | 1009 | 1010 | ... | 1023 | 1024 | ... | 1039 | 1040 | ... 
| 1087 | + + TP2, rank 0: + |< --------------------BASE--------------------- >|< -----LORA------ >|< -LORA PADDING- >| + corresponding token_id: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 1010 | ... | 1025 | -1 | ... | -1 | + index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 527 | 528 | ... | 543 | + TP2, rank 1: + |< -----------BASE----------- >|< -BASE PADDING- >|< -----------LORA PADDING----------- >| + corresponding token_id: | 512 | 513 | 514 | ... | 1009 | -1 | ... | -1 | -1 | ... | -1 | -1 | ... | -1 | + index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 527 | 528 | ... | 543 | + + Args: + num_embeddings: vocabulary size. + embedding_dim: size of hidden state. + params_dtype: type of the parameters. + org_num_embeddings: original vocabulary size (without LoRA). + padding_size: padding size for the vocabulary. + quant_config: quant config for the layer + prefix: full name of the layer in the state dict + """ # noqa: E501 + + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + params_dtype: torch.dtype | None = None, + org_num_embeddings: int | None = None, + padding_size: int = DEFAULT_VOCAB_PADDING_SIZE, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ): + super().__init__() + + # Keep the input dimensions. 
+ tp_rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + self.num_embeddings = num_embeddings + self.padding_size = padding_size + self.org_vocab_size = org_num_embeddings or num_embeddings + num_added_embeddings = num_embeddings - self.org_vocab_size + self.org_vocab_size_padded = pad_vocab_size( + self.org_vocab_size, self.padding_size + ) + self.num_embeddings_padded = pad_vocab_size( + self.org_vocab_size_padded + num_added_embeddings, self.padding_size + ) + assert self.org_vocab_size_padded <= self.num_embeddings_padded + + self.shard_indices = self._get_indices( + self.num_embeddings_padded, + self.org_vocab_size_padded, + self.num_embeddings, + self.org_vocab_size, + tp_rank, + self.tp_size, + ) + self.embedding_dim = embedding_dim + + quant_method = None + if quant_config is not None: + quant_method = quant_config.get_quant_method(self, prefix=prefix) + if quant_method is None: + quant_method = UnquantizedEmbeddingMethod() + + # If we are making an embedding layer, then our quantization linear + # method must implement the embedding operation. If we are another + # layer type like ParallelLMHead, this is not important. + is_embedding_layer = type(self) is VocabParallelEmbedding + quant_method_implements_embedding = method_has_implemented_embedding( + type(quant_method) + ) + if is_embedding_layer and not quant_method_implements_embedding: + raise NotImplementedError( + f"The class {type(quant_method).__name__} must implement " + "the 'embedding' method, see UnquantizedEmbeddingMethod." + ) + + self.quant_method: QuantizeMethodBase = quant_method + + if params_dtype is None: + params_dtype = torch.get_default_dtype() + # Divide the weight matrix along the vocabulary dimension. 
+ self.num_added_embeddings = self.num_embeddings - self.org_vocab_size + self.num_embeddings_per_partition = divide( + self.num_embeddings_padded, self.tp_size + ) + assert ( + self.shard_indices.num_elements_padded == self.num_embeddings_per_partition + ) + self.num_org_embeddings_per_partition = ( + self.shard_indices.org_vocab_end_index + - self.shard_indices.org_vocab_start_index + ) + self.num_added_embeddings_per_partition = ( + self.shard_indices.added_vocab_end_index + - self.shard_indices.added_vocab_start_index + ) + + self.quant_method.create_weights( + self, + self.embedding_dim, + [self.num_embeddings_per_partition], + self.embedding_dim, + self.num_embeddings_padded, + params_dtype=params_dtype, + weight_loader=self.weight_loader, + ) + + @classmethod + def _get_indices( + cls, + vocab_size_padded: int, + org_vocab_size_padded: int, + vocab_size: int, + org_vocab_size: int, + tp_rank: int, + tp_size: int, + ) -> VocabParallelEmbeddingShardIndices: + """Get start and end indices for vocab parallel embedding, following the + layout outlined in the class docstring, based on the given tp_rank and + tp_size.""" + num_added_embeddings_padded = vocab_size_padded - org_vocab_size_padded + padded_org_vocab_start_index, padded_org_vocab_end_index = ( + vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank, tp_size) + ) + padded_added_vocab_start_index, padded_added_vocab_end_index = ( + vocab_range_from_global_vocab_size( + num_added_embeddings_padded, tp_rank, tp_size, offset=org_vocab_size + ) + ) + # remove padding + org_vocab_start_index = min(padded_org_vocab_start_index, org_vocab_size) + org_vocab_end_index = min(padded_org_vocab_end_index, org_vocab_size) + added_vocab_start_index = min(padded_added_vocab_start_index, vocab_size) + added_vocab_end_index = min(padded_added_vocab_end_index, vocab_size) + return VocabParallelEmbeddingShardIndices( + padded_org_vocab_start_index, + padded_org_vocab_end_index, + padded_added_vocab_start_index, + 
padded_added_vocab_end_index, + org_vocab_start_index, + org_vocab_end_index, + added_vocab_start_index, + added_vocab_end_index, + ) + + def get_sharded_to_full_mapping(self) -> list[int] | None: + """Get a mapping that can be used to reindex the gathered + logits for sampling. + + During sampling, we gather logits from all ranks. The relationship + of index->token_id will follow the same format as outlined in the class + docstring. However, after the gather, we want to reindex the final + logits tensor to map index->token_id one-to-one (the index is always + equal the token_id it corresponds to). The indices returned by this + method allow us to do that. + """ + if self.tp_size < 2: + return None + + base_embeddings: list[int] = [] + added_embeddings: list[int] = [] + padding: list[int] = [] + for tp_rank in range(self.tp_size): + shard_indices = self._get_indices( + self.num_embeddings_padded, + self.org_vocab_size_padded, + self.num_embeddings, + self.org_vocab_size, + tp_rank, + self.tp_size, + ) + range_start = self.num_embeddings_per_partition * tp_rank + range_end = self.num_embeddings_per_partition * (tp_rank + 1) + base_embeddings.extend( + range(range_start, range_start + shard_indices.num_org_elements) + ) + padding.extend( + range( + range_start + shard_indices.num_org_elements, + range_start + shard_indices.num_org_elements_padded, + ) + ) + added_embeddings.extend( + range( + range_start + shard_indices.num_org_elements_padded, + range_start + + shard_indices.num_org_elements_padded + + shard_indices.num_added_elements, + ) + ) + padding.extend( + range( + range_start + + shard_indices.num_org_elements_padded + + shard_indices.num_added_elements, + range_start + + shard_indices.num_org_elements_padded + + shard_indices.num_added_elements_padded, + ) + ) + assert ( + range_start + + shard_indices.num_org_elements_padded + + shard_indices.num_added_elements_padded + == range_end + ) + ret = base_embeddings + added_embeddings + padding + assert len(ret) 
== self.num_embeddings_padded + return ret + + def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): + """Load this rank's shard of ``loaded_weight`` into ``param``. + + GGUF weight-type parameters and parameters without an ``output_dim`` + are copied whole; otherwise the + ``[org_vocab_start_index, org_vocab_end_index)`` slice along + ``output_dim`` is copied to the front of ``param`` and the remaining + entries are zero-filled. + """ + output_dim = getattr(param, "output_dim", None) + packed_dim = getattr(param, "packed_dim", None) + + # If the parameter is a gguf weight, then load it directly. + if getattr(param, "is_gguf_weight_type", None): + param.data.copy_(loaded_weight) + param.weight_type = loaded_weight.item() + return + elif isinstance(param, UninitializedParameter): + shape = list(loaded_weight.shape) + if output_dim is not None: + shape[output_dim] = self.num_embeddings_per_partition + param.materialize(tuple(shape), dtype=loaded_weight.dtype) + + # If parameter does not have output dim, then it should + # be copied onto all gpus (e.g. g_idx for act_order gptq). + if output_dim is None: + assert param.data.shape == loaded_weight.shape + param.data.copy_(loaded_weight) + return + + # Shard indexes for loading the weight + start_idx = self.shard_indices.org_vocab_start_index + shard_size = self.shard_indices.org_vocab_end_index - start_idx + + # If param packed on the same dim we are sharding on, then + # need to adjust offsets of loaded weight by pack_factor. + if packed_dim is not None and packed_dim == output_dim: + packed_factor = ( + param.packed_factor + if isinstance(param, BasevLLMParameter) + else param.pack_factor + ) + # Check against the local packed_factor: parameters that are not + # BasevLLMParameter expose pack_factor rather than packed_factor. + assert loaded_weight.shape[output_dim] == ( + self.org_vocab_size // packed_factor + ) + start_idx = start_idx // packed_factor + shard_size = shard_size // packed_factor + else: + assert loaded_weight.shape[output_dim] == self.org_vocab_size + + # Copy the data. Select chunk corresponding to current shard. + loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + param[: loaded_weight.shape[0]].data.copy_(loaded_weight) + param[loaded_weight.shape[0] :].data.fill_(0) + + def forward_native(self, input_): + if self.tp_size > 1: + # Build the mask.
+ masked_input, input_mask = get_masked_input_and_mask( + input_, + self.shard_indices.org_vocab_start_index, + self.shard_indices.org_vocab_end_index, + self.shard_indices.num_org_vocab_padding, + self.shard_indices.added_vocab_start_index, + self.shard_indices.added_vocab_end_index, + ) + else: + masked_input = input_ + # Get the embeddings. + output_parallel = self.quant_method.embedding(self, masked_input.long()) + # Mask the output embedding. + if self.tp_size > 1: + output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0) + # Reduce across all the model parallel GPUs. + output = tensor_model_parallel_all_reduce(output_parallel) + return output + + def forward_cuda(self, input_): + return self.forward_native(input_) + + def extra_repr(self) -> str: + s = f"num_embeddings={self.num_embeddings_per_partition}" + s += f", embedding_dim={self.embedding_dim}" + s += f", org_vocab_size={self.org_vocab_size}" + s += f", num_embeddings_padded={self.num_embeddings_padded}" + s += f", tp_size={self.tp_size}" + return s + + +@CustomOp.register("parallel_lm_head") +class ParallelLMHead(VocabParallelEmbedding): + """Parallelized LM head. + + Output logits weight matrices used in the Sampler. The weight and bias + tensors are padded to make sure they are divisible by the number of + model parallel GPUs. + + Args: + num_embeddings: vocabulary size. + embedding_dim: size of hidden state. + bias: whether to use bias. + params_dtype: type of the parameters. + org_num_embeddings: original vocabulary size (without LoRA). + padding_size: padding size for the vocabulary. 
+ """ + + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + bias: bool = False, + params_dtype: torch.dtype | None = None, + org_num_embeddings: int | None = None, + padding_size: int = DEFAULT_VOCAB_PADDING_SIZE, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ): + super().__init__( + num_embeddings, + embedding_dim, + params_dtype, + org_num_embeddings, + padding_size, + quant_config, + prefix, + ) + self.quant_config = quant_config + if bias: + self.bias = Parameter( + torch.empty(self.num_embeddings_per_partition, dtype=params_dtype) + ) + set_weight_attrs( + self.bias, + { + "output_dim": 0, + "weight_loader": self.weight_loader, + }, + ) + else: + self.register_parameter("bias", None) + + def tie_weights(self, embed_tokens: VocabParallelEmbedding): + """Tie the weights with word embeddings.""" + # GGUF quantized embed_tokens. + if self.quant_config and self.quant_config.get_name() == "gguf": + return embed_tokens + else: + self.weight = embed_tokens.weight + return self + + def forward(self, input_): + del input_ + raise RuntimeError("LMHead's weights should be used in the sampler.") diff --git a/model_executor/model_loader/__init__.py b/model_executor/model_loader/__init__.py new file mode 100644 index 0000000..65e139d --- /dev/null +++ b/model_executor/model_loader/__init__.py @@ -0,0 +1,152 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Literal + +from torch import nn + +from vllm.config import ModelConfig, VllmConfig +from vllm.config.load import LoadConfig +from vllm.logger import init_logger +from vllm.model_executor.model_loader.base_loader import BaseModelLoader +from vllm.model_executor.model_loader.bitsandbytes_loader import BitsAndBytesModelLoader +from vllm.model_executor.model_loader.default_loader import DefaultModelLoader +from vllm.model_executor.model_loader.dummy_loader import DummyModelLoader +from 
vllm.model_executor.model_loader.gguf_loader import GGUFModelLoader +from vllm.model_executor.model_loader.runai_streamer_loader import ( + RunaiModelStreamerLoader, +) +from vllm.model_executor.model_loader.sharded_state_loader import ShardedStateLoader +from vllm.model_executor.model_loader.tensorizer_loader import TensorizerLoader +from vllm.model_executor.model_loader.utils import ( + get_architecture_class_name, + get_model_architecture, + get_model_cls, +) +from vllm.model_executor.model_loader.weight_utils import ( + padding_weight_loader +) + + +logger = init_logger(__name__) + +# Reminder: Please update docstring in `LoadConfig` +# if a new load format is added here +LoadFormats = Literal[ + "auto", + "bitsandbytes", + "dummy", + "fastsafetensors", + "gguf", + "mistral", + "npcache", + "pt", + "runai_streamer", + "runai_streamer_sharded", + "safetensors", + "sharded_state", + "tensorizer", +] +_LOAD_FORMAT_TO_MODEL_LOADER: dict[str, type[BaseModelLoader]] = { + "auto": DefaultModelLoader, + "bitsandbytes": BitsAndBytesModelLoader, + "dummy": DummyModelLoader, + "fastsafetensors": DefaultModelLoader, + "gguf": GGUFModelLoader, + "mistral": DefaultModelLoader, + "npcache": DefaultModelLoader, + "pt": DefaultModelLoader, + "runai_streamer": RunaiModelStreamerLoader, + "runai_streamer_sharded": ShardedStateLoader, + "safetensors": DefaultModelLoader, + "sharded_state": ShardedStateLoader, + "tensorizer": TensorizerLoader, +} + + +def register_model_loader(load_format: str): + """Register a customized vllm model loader. + + When a load format is not supported by vllm, you can register a customized + model loader to support it. + + Args: + load_format (str): The model loader format name. + + Examples: + >>> from vllm.config.load import LoadConfig + >>> from vllm.model_executor.model_loader import ( + ... get_model_loader, + ... register_model_loader, + ... 
) + >>> from vllm.model_executor.model_loader.base_loader import BaseModelLoader + >>> + >>> @register_model_loader("my_loader") + ... class MyModelLoader(BaseModelLoader): + ... def download_model(self): + ... pass + ... + ... def load_weights(self): + ... pass + >>> + >>> load_config = LoadConfig(load_format="my_loader") + >>> type(get_model_loader(load_config)) + + """ # noqa: E501 + + def _wrapper(model_loader_cls): + if load_format in _LOAD_FORMAT_TO_MODEL_LOADER: + logger.warning( + "Load format `%s` is already registered, and will be " + "overwritten by the new loader class `%s`.", + load_format, + model_loader_cls, + ) + if not issubclass(model_loader_cls, BaseModelLoader): + raise ValueError( + "The model loader must be a subclass of `BaseModelLoader`." + ) + _LOAD_FORMAT_TO_MODEL_LOADER[load_format] = model_loader_cls + logger.info( + "Registered model loader `%s` with load format `%s`", + model_loader_cls, + load_format, + ) + return model_loader_cls + + return _wrapper + + +def get_model_loader(load_config: LoadConfig) -> BaseModelLoader: + """Get a model loader based on the load format.""" + load_format = load_config.load_format + if load_format not in _LOAD_FORMAT_TO_MODEL_LOADER: + raise ValueError(f"Load format `{load_format}` is not supported") + return _LOAD_FORMAT_TO_MODEL_LOADER[load_format](load_config) + + +def get_model( + *, vllm_config: VllmConfig, model_config: ModelConfig | None = None +) -> nn.Module: + loader = get_model_loader(vllm_config.load_config) + if model_config is None: + model_config = vllm_config.model_config + return loader.load_model(vllm_config=vllm_config, model_config=model_config) + + +__all__ = [ + "get_model", + "get_model_loader", + "get_architecture_class_name", + "get_model_architecture", + "get_model_cls", + "register_model_loader", + "BaseModelLoader", + "BitsAndBytesModelLoader", + "GGUFModelLoader", + "DefaultModelLoader", + "DummyModelLoader", + "RunaiModelStreamerLoader", + "ShardedStateLoader", + 
"TensorizerLoader", +] diff --git a/model_executor/model_loader/__pycache__/__init__.cpython-312.pyc b/model_executor/model_loader/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5699eba792deda0498016bb6f2f707d218bb2c90 GIT binary patch literal 4614 zcma)9OKcR$8LsYmdU{@rpV-Fqngx4^8SfJ84a8WOHIL02oAAnRucDznU1K+N_l&B0 zFt)ssH$vJ)IczQwNZCjzd+~}r>@kNNa?iyUNQ_!oLW-g&w_y!QlvDn{dY;CTDSN8_ zuKHi~)&Kuh{kK>wLZBJ`-|9Eo3Hdv|)LOhsY%_t72jn`@h$a}s6cj-~S~Nt{r}#MS zGyG;i37A18$lv`&$P6oCP6v!OGonN|9W*2}szf;*GGb<2iE}z^B+R6e{&rB&P zPDhM(vqS0Nv}AOeT}l_Hqei#cqx5h(X6!e6m0nKAjXtwq={FB32h0IwfR81NgQl#= z0uc#~(03lgYOpDTRM3+9kvc?^G)_~{cfb$-x@ZsdJ+xO#eiKvPrHAPeIz*$$ofomE z!`i+HcxownBrj_1_xws)>!4w+lScD?tqb0dYTa7TH-fgG2DM%q(7IPbz~4!Iw{C#o zj+PHBffEkkM#f)(T4&F4$Z9uuYBpE}V#C5purnB&cYnab8As zT$PvM_FF3hNbAZFUarK85BYRcR1EV8F9A;PiUmqL%SAf8bJIOhPiMM;-Gm3+Pz!QH zo;NL79e`wpqnT{~1-jo056FGN0m^H42W_bR<<~P!h2-X8 z`zBTs)quLGE!WEi)kb8f$MTYH81g)o3wJ16V!8tr5^IeE6~LlPe6Jw;JRDJ1X657? 
z)xm@khdNX!HS->8Aur$D5uqS@yjB;XptV#f?&Xs1EOLXja)YDk5F3OgyFu@Obwf)k zv*4U@Bf4F)#@di0aziyr^c-Ua=K6IjUw}hA*<3^5T(j9BV6|~0W9NjY;ddW>wi$l^c{u*HbYCh* z&y=J6s`c8zB>`IR*LPTN$}P9a(LSH>};7T(EeY<33hHF0&GANuHGy zPo9>eIPLRb!?0v&=m5v}kv;^S7g|*ulRL{4I{_?rk?rTut&tb01J6?F%~X2r<8mm4 zdmkEq9`AX0dcAKwwlQ{LWBA9#1Ef6k)}5MQylJDxOPRo$gDyx zVX$Vs7d8eaa`58~x7QCpglgM@?g4S?*BaOcoZr!EAbu{c3Tsi|LHQjGwo1HJFLsT| zRkA94FVqSiUW;Y{rEy^3Zl(Pk2@f{((~aOJc!w0=TW7?y$0W1SZ#aXz_-w1=%1*)f z8vLq&z6z4~z{A_m2F5lA#-7B+*Upzi?Hi%Ka`)h~?$OQe(XXOqY2P!ce^csT@B8b- zhSdK=y1e1P%mGcuxjxUgeV~=$C6D33z=NOH8HOKo49hXYD$b(V;b$Dfv|vf-(it}h z#}rg4hJL#NxXt7i8RF;iEvnv*7=|?q;&2jouKCi$5Y;NTSNa$pB<=`jY^_HvH>Ku# ziB}j;>@CuCIj`P8_%dBy{i^DBy_aR+R)F6xH7SI|J+`Moe*dq@AcAkPF8nsAi6H#$ z_?~8Vd?I+U#5cb{H`DGRBj@hRV}D=s-qVy8w9c5^IUL) zmkL_RpgcR=5bzlW%#Y$c)_pH4!xIpdz%|LPz!(-6TZF#i69nPkVIuVXgLM3pjBb&k zEplj!T-+j`Y?0Y5@(Z9}i=rUD4ie$IutnznBl-pLC4ugB1jb)RNTl<3pFC+BSPNFd zB+*q7qk+^*UpyhMNfnvI4wgH9QjQ%ccc;tIzH;In{EzgPyE5g0_sZ>u%bi2z)Zoj$ z_*p^RApH-uM?YQnuV*)hPj4JLy*V(sP2z%B89Xj@JhHb5JSw+@fN)lLbhJY7`2!r7 z_^hPJv~W@Q(H9hcDU>?~DjtzJ`RD1)%o*?PMIiL$r(b-!9YCHRQwd4@BE&KJKZu1L AoB#j- literal 0 HcmV?d00001 diff --git a/model_executor/model_loader/__pycache__/base_loader.cpython-312.pyc b/model_executor/model_loader/__pycache__/base_loader.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3119ea0ad965d75d6d70eb5662a26c47be9d7f66 GIT binary patch literal 2953 zcmahLNo*X)vFF@tmZC_C=31mQl1R%Nhr3WoG0X4M6#i!!1kX2xJ4Y|NFMQ}hUCi)SwdX)l!n4r z)z=ytrzxK9=NdUb-^g=Z^$LEmQRJ}Zm3*UNa9H=s{y<~EKhQY9@tim4A8Z_yh)k$K zhi*&a^v1AEUL#Ka8gU9#x~-(tkyW``{2r02oG7oJn@tAnxhQ6KGxlk`7&@1LUb+}M z)SC^1d3PZxUiLg+z*Q+JUI=Yx7c04e8(Utuus|8e4I=8=p4*|8k1CRh77Lp+imVmt zE-c28WzWZyfrRb2!2;-eCZe(B(0RM<#a0}$=Az}qt1a3T_KG=|BHQZ#+#-}TB!@I) zM}i>ARGpU{<(k^isP3pV=V*>TuR6JF>g`mlk%yf;Ei8?sggwiygzgmfNR*r+HN@Wj za`U=Vf-}pjdeuk{oUlG*ZB55ETcGjumG;cp zGw;mMy64W-TdQ!rL46r+v(D|Y=oQ**L(EfDm{*?z)A%`gqpn6ZNZCj(A;jL6!~k$z8qy#Fl93x9WXa^U#A{4q9+dNl5D6aC4?|MqYB 
z0{WKLhUo@zm^oTA-&}MfGm7oNvAr;$=BvMb-LyR~T#1TiI^3Y;*-g5u03!uu#ZwM~ zV|$F+&g%4ktI`osWO!p$MMML=5g|dlL9>X7fy%P7?Tq(FWbf`#kjvtc-y4q-CR>*0 zd!vE{jrjI=5FgC*B|!SengaB_)sZr+saIZ#?vh6O=yL#69;kD>Hxtm+UOz0a~+kAtZAg4$qt-zc07I-9) zKVH6~yhT=|e)_xrM~{%Is+|{jzGt<(;>L?m&D30QmuX;%HqmBw?1n*9gNKMsfN4B8 zI~A0~+|G(649uy>tkr7Jf6ESCcAGTKG~N>vUCfaUTmV$W!tOfMRDvJz%a>$X3@P<^<~j&;R!mKY(?_hbX2ZFBje&I?wJ~ zN_=k$1fwrtx=MO^QkmQuJGD7>>U#OX=&`NQ>CMsU>!rt&^{vU%o0F&SO}=vd;+A24 zX_%k=X6yXr&GVOg3Mq_0N2(n8tMM0O z!kwS)C~{?JU3;RCp`$xRQXal};l_n~l@~vK=e}|7u`#%1Ol%qxf2JF6-#32t(5QYq z_38MXwfn}!C&OgmY;S}N9=h4N(fOczFMk+FLr2#qo`SOet*6l>ICS@ z;$`?yGHeD6|GHQOrbG^cM8C+T#l7md^*l_U5ZYs-&IV z`1Lk{rKb~pq$kVT@sD5b5m@fL_-|acm5dJegdB=?eB<~>)eoyZ0^l8BhUM?T3`@!k zP_H6d(cTqxbF+^FnmKoi!XVbw#4V$U@hc1hviF@Tah&*oYXuxUhj zgRP0e5dlC~lR@meDY=HJ$77Xpi@ax;q{(xfk>}p--#|^EC pL{2^;C!WZPB>#{n(#zizM%I7#RpH20eMg^?#@Cxa5LmeC{{Z>{z6}5X literal 0 HcmV?d00001 diff --git a/model_executor/model_loader/__pycache__/bitsandbytes_loader.cpython-312.pyc b/model_executor/model_loader/__pycache__/bitsandbytes_loader.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27513ec2fb2bb1e842c8d67ba2603f36b75e00a3 GIT binary patch literal 33235 zcmcJ&33OX$mLB*XHUcCFfCNZ_JGc`R35q+lOBA&)wk)qKyJ3g`B}ybnKY*6R0^2T^ z6H`^rG2PYGrn|bwRJA*%V<*Mx$(gWb>YP?;k||C1%p5#{F2K+78G70|Nza)xp-ee0 zS2C0N?*A456zFBT`$_!Y|Necw|GoFW_u{V$3-dWV6NVp-|NB=s?r-QreT<4{^DlUg zdxHydL4JZ8k>C6X&%SjdI`-C&=-JyaV!&HBVVpFLm?rZ^@_0(GpD<5aMl8(FPvnmj zu(Z|@E8d2Q!b#hRjl~%!?30cW2lJaItdm6}Ma-W!Q9S7!aWcPo!ZlekQo{U}iPFik zkuv7bpKwo>kCabVj8w4if{Dt>s*x(@w@y?~){N9Jf8j*!q-Vr4SvOKQSwB)g*)Y;D z**Mb3;%pO5lg%T|%x|A)nQR?tWq!wmcd~7yZF0xRj>-0s_Q{Tsj!EB$Z?bcwlf@TJ z?40Zx>01I^kvmvK$MALqZ@rC3tyBe<*w%FXQpCL(}8%41PHz#|00SER=63{k;bI8YPs~h)A69CT^V1891gi*c7# zkj1je&U=b~m{Wb~mQ&y7Ij?TE=#_|Pe9RNQ6!Ngfct)qf(ZF~(;!zi#XCiPN zd&HO3JgT$@f17*ZoZ@&iF~&XP;lXV4C85*6zcq^Z@XhTBhh4E-45Ds3r9>rA3VZQ0>l4b8j zfQf6tPffWtQ6ngiK)@RL}b5DORyWwiMZkX2ky5oa&hNFzj$Hg@jpDd`25?~ zlCGV~E 
zp}a9Y#gCbT`7aws@>#Czv`Pt~f-xOSX$lqqWLtw)3JDg%Uzj78jn!hm>`_{u@si64 z8iPi;?obihQ>>J()?q`r&Y&Ib(*+&yyA*#B{3Ymv4dKP`mnwcI{AG&Y1-~0dQTgTG zjA|8c(1Lmm!BV8D+%65urempUyZE3xRL#mU1tiHEI1e1RL?) zJkMR@M_K@Tnx(QKY{Nt0;1F%YGjv|cC(i64_TiB?rYAbZdqF(ec?p6eqcF))@!}rs;qIEW=UlyBwJcO9fP-5;CfRB}_8GQ~KNd3phXK0c-J= zpBuUu_fG)oV!R4BdF~u7lMKP zv)$WFh|iDX78r>?0}v{C&ZAfc>R#l2J}h+M>!{Z%86z_|K_#=APb!}YM=k{fI=t1x zSOJ7m%_zfP{M*G>P6n=D07^kP&;yxm@+qJm%aSuX?GKJe22)^hTO9#06Si$#8jUJS@=4 zmFyGak*J^5!Wu5+jfE!z(a}qi3E$Xb0!@<)MXVZ)grbBql->Hs434tN&=ElZ!CD|3 zrNqa{IYG`zN@E6KVpyq^r?g$NDgaHc(sXIeADAAO@-v!YV~rr1M^0lwnz5+~pu`bc z{pxt)Eu*AGejaMinD=$rFUCxBpN;#J`*qSal0}`8~8sY)ilqYn@%#r#u zL`3Ztv`Zx`(nWBWZ5bbm7wuG9z;n?eH9lsH8DbW-2Xn@7K?d|Qst^IEKuJ1>8XDA7 z9|e1~OpOZ~7%VYn!V=T~)E}s&X|~~EtoneE4N^Ry#nV)-&#V7kVVf_5PcUn2^IaGZ z&l=i%(^269suJi-n{7NC2!s0oxCA%=B5n(OV3_E$!dWEsT7_pR+e73$OwKuS=xh?G z>1@XfPmqu3BB26~q@M_d1==j${EVGWn_S-ReoExPPL=l|cqoiJVrP3feQHZ~U`Ux< z6yYh#n4f0LLEU36= zbIMhda=DYPM$y%nbajcYu9U0lzQtHneBa7>hWM1jv*cYKK|LOuXsx<$ z<~&FE_>spFwU5WGRi75v;`Z7^foJK^or0#j#Wkto{`kO&c>l@Ub#cdOKK*fNDHF4b2Sn>o|P)07)C%sYgbR0lZ zp1MVWEMfb_Ok;Y5z0;zycR-y;1v0kJ@Xjn!g#aKURp#L#%IgMyIo&+XC-fpb2Z9;o zX&>c6GlM#{WkM-HmqvqWMS_Y_Q$&nJQOzRiJR_FSVtMBi^C>g|NFf|Sq;LR^ln)-2 z5RK5T_v!>{EZemaYMnv>NJZce6462&MwxP$9olwRX*=6b+{-kH^*|Z_6g5PS!U4sA z^;%StbkyH*)U)l;BDz|ZdC}FDboGg@zLiUBt^+A|L(=UN-M-~(Ywp4Ivf8B`Vp&Jr z(XsBViCb&fD);6|`bbm|s;D1~s&Nn)i9Q#c^iPEuZ%iNrPI7_E%E%&+OAPTKI$Ib* zl-DeStsoUan?{(V3}%fNYn~D=vYD5Rj0D4KAd)M{!Xk9=JgYz@86?@kI#!VmDrC0h z8;;$dqo~Lq!}%@a%5CWKEw%6We9qx@&&^pY@0RrdT%;VXq{AaRJV{5F=;+!s>fN>* z$AMC~cRU!Ik+@HE^lj!D+_w4SoB5ol^~OW<#}`5yCa$tJS2r}v>bm~wyg>~Ozk;7EG-t>d1r654 z3@_<|x;ew|8?N$Kxoi5TxT`#njM-zMDDVM~y3mPJm1d$Vr?t$ex{$$e&f-+H=c@;D zAOiYlV#0$r6_Ux$@&M#SL!RLHC~Z2TeHeH;J8RnM>+Sb-%^uDwU%AfmjH5(hDtyuB zd3Yj(OC8WKC) zfsj)M9B`{u;P)#X@cX5L6O+?ZLX;6g!X=~<4wExX>C0tq4X(_Be&W@DR#EQwvZ8vk zqe}fVVP$NQPs^eLvEx+yNph}I9eR*zf(FR(jfx|Z1W@E!ZZcT4LwbbLwgre0rGyus; zfTekX!7X;;XleHmE>~cGZT}DUr|izxe*Xu*pDL?Nmi38ceJNMt(o}rl81lG_*PRU; 
zMxDD6I8;$1GJ?4J`b2=g_BPSpma~3-OhE7-1wy8G&RWoz0vJH!&ulub_5WkO z5q_yap^1r4#QJU>yW)FX`rV>5Hp*9ycTUml=CUWeBNI5hRA;?C%V| z$moSxn@_2PF8V;?XVPv|O%MJypN8{eUb`b}0E)%m(B-1@)T%C1^R_%`dQCyGnp6~v zad&ZHW+Xly_^zRFP)1Pnf)|BIrpV0L*!VT!d-%xaRkkB!_Q19)5=;iRwK4`Wji6|k zVVhKJD;)XUjTb*59({cE=)#_)vr%+5CY;SlXQ$}wTrnk`1Bj9@$G-1v=Wz|@Lp~r_ zgx9e4DVvOKYq{BSi=S0^5-xv^M$NW=n=ag=_SR8*U*u9%_3_5ucx4|}{9Su-g0&rP z!u~|u{De%kXCj{3(xEwhMgInO35(~?_&HsS&o!TEBs0h^&3=$^pwH{s<=b{k%y@P` zVXcxWa3L}oxF%U--~-%AhE0XxPy~1RekIzWd}^sA-uJiv6;IZZq-DjHWHUS&OA=Vv z91j{p_dKU8uKcXUB@7@vrim>l^K*V>LA`>|Z@ZuvK>q3l_5V__gxRYL>X_Jnbh&k5 zaPj#yXA}HiZqht_-&Vxas*ga1ASy2;LwX?z^GG7F7E)qB>4P$aW-DlGE53RKZB6|- z3XBl&zzX_IXR6A_>e;Yy#ns88da;D!3uZ;~roG0q!uL_h@A4GzEm;|HGahE}?oqEvm_fAg zkI4DQa2U^cA^_Qh8x-V$qoNE!3K%2j1vrwOsGW@EjVl+K+Q|%4@1A7Y87W_hyArzo zAVwj47sbwYZ42Pi7(+G|#9`bbYsf1TLpIJm z%38keY)U$Jh|V1eXGhZ6FFN~Iu-W#f9Pah<9m(=8vAio$-jgifBbM)3J$mazqWtXq z$<&TPa9I{d?pRwl3OL&lexvm3sY>OyPo#mE23&-I_jk%-_TAz>v~GtN&gLc|4M#@&zhJy*IL&0Vm@YN z%R}4~ws-c>MUmn^WWaSvRAajajZuw;3Yu;djG5*OaD?Q6-$d-$ujv>1bLiXjhwZmK zT-ccZR2MTo&qZ(@hA)f!C6s{w16FbR@`o;$R0sfAVAl5F-mI^~+8)f8xh(HOjP4v= z-k&EbXVxtrU<}6wJ2ow-Bjq#hfy_y;M?;g-8Hl)iQMvV?c4Ck>ufZYC5@r>LG)AEq z5EE?nAWs}0f+v`{0;~XbK*w}3J(~xYfxBVI3Bz<>&1%PWo6-*8iQpu?hFZ`^QxJEm zqaj=&TbOtpkq<&0;-msO#!vEKUZZdmVZu*YM8;4^{1ssdff1TzqBvfZ53^!iT?ydW zWF(zE^9^7{Hd&NL>K-hv2RkWp9<}1EOI1|OpZchzae+@ctCt{<)Ufp29cOE*sdwd^ z*fg+kYKi}Yvl~XPw24slnxwNubhe}%m1~W?$;N|X<3Z3LuGURX=aSj3>?oYA@ny6= zYtH@t@L2Cx#*a<+dd*T<=0fp}&4gu2d4dg@49b6u9!UDp=^4oo41k3frf&ny0=V`^ z@sXzU6hN*3WtDJab0QFXN%#ple?rbbBWIbMB{&hPIZKC4byq#vn4xW^)dMijTOFG*{XM4Qkz?$V@!(6)M`)NHZ6-?PEt^>1A4 zb-V3z`r7b(YiN%axSHd|&GF{pxZ}v}TG8<&PM++5Pil6?yAI#-Cu$yBC`b(q#T_18 z8j6nIZH|P>^pWuW!}*5`vyTJq+l zACv*q=0a357-GZRg6xMfQ%PpuKWJpt+Q9~J8tP7f19RQtMp9kv@3^DUu!2%zi zd7u;vE5)_7435iDN+%A{vIkQJ^Hqd!&Kk4EEVBT%vHV%g;9Oz)J7^@}&O&W5bFd&* z7_$rKUH7WTz|A@5 zD6bQx7QSXvN(Gw3N-dnj?8$Q%RD?;9%Yj_xImAVY9A(etHr1DYW8V4qZECCM3c?-V zWNb5gu>Fe4Zqr;rtl%=SQnoyTK2|X23VRUSxg`ppQIhdx&*g5_mwvym&Fq{j=DOUM 
zj!}K;DhfJgIKhE518PkAt)`iCMR%#8QLGgCxx8ETrQhlr=mY%#6weTI1&e~k?>kjy zJe!x2xl*hV{8LAl=Uym{>2J6RbqedS|4U+dFO;Y#;DYYe8-(_ewAHA7u6VBKs_M7R z6+38$V_&U4)?lbG!eo&g$zvqXUIK;YRS|k(J4HNbwsINA>7BMu+J@q;X5fHD4e{o|HOJtGZ9`uQKSH9vrOy8SbE*on z`d-BahyOm$b^HyIKO}Q zX%?vGbtqZud0Fu%^#>PC_w-ze2MFcL(WGn7hxG>)mYuVd|A*+h@ZX~eOc43+y@%Hx z=wUf}c;tZ|9{E!EB^n_tz}ZGq=I=JW|A2jbDQ6#t9_S->EA=t=4ANnev3P+ivaVc{JrmSo;Z1#JA{%+w5uNc)9)iq9JZuQb}z z?JdtmH?uBDHWG;qO!#r)P;!lPtLEQoKb<8qzahDI<~^l#xL{9=s;}TdLT}G!9g;H>Hq5Vgut+MQ(!R zm=>l;-dGEfY5nwSq)Q5w+TZd%s8IhTCO7N*cIW9Mv>Y$u(EpM#Z;ABZ{b~8WMEQYv z+uibkMES0H+dTtcSc&t+;)0lJ7H>DXmA88zWWRHh!wJ0J=1aB>-Dw+w*o)J4pVK?& zQnj*bAum-?`Fh@ZMJxH>d}=S2Gc8b$W@>|8k}*6$Ik_N1z6m4GK!pZocDFbHDRq0i8j3XBypg415odXJ-mT-?A( z{=QM)RQgH(39+gh;zs3I66MWHS6007<~^(D^(C~r1gL{PbJI?l0W8dp%)*Jhhje}z2 z;N9BJWbGcYb`P4@vkn^ldI(?$kmvHV}IU+iDr5d}EjeEt$y@|&C3uitp zX@CE*pFQ*LGar=f`LOp;%H5P|-Ow(fsSip>BNqRjK_{f$^`Jn&JQJ0^=!{<*yxi@C!s_XAvWG2|)<^ z=1<{*7|D4vtWsv+74gpiXJBWgE>qwurjHWJ#vbkDOr`ZVApB)k1 zt&?-!gNC4{T@}l}Oq*cK6EvwaJc}4ysd2gVqH@lqcIgXVz;Oe1$g4lvZbl<{Cpqvg zmAYTzgXUR4r7p%GjT*kUDz=8VW2Rs|*ia;mZ025Z)sd zS4{;U(tMyipdSW45K6bL02Xdg*=mjV&!2MTIGCPUD*4@!1rTr`Uac9LQv z)J=&H-lQT3lZ7M;#D2p;DPK|11a>GKq*wwg!eMfT;Yj(92PS4@?I+<7h5ok`YGC#f zpi|0}7yYVU6fHbqkQ_$#TGjTzK*SU&GEU1Eh6Nc-lFal$#YpCG=&J0~lj8sPP)?)( zj=CK1mhaihm{{M1Fi=uvctGvmZ0EL^9Sy{T8Gc2m-JijM7+*e!iDXr$Sk;-V>bh}y zet4nzuDyJ*`R%qfd&j@T#aU_6(I7e+mVWQPUe{y8;o8`mY#0z529gcO#D-(H4T**` zap#$Zy>Qms2St~6;rQZ}j~Y7{&8dc#clO@gyJB8#O*9+j8%kV`s`) z4$-cJvysW3w~Ma!q^n1C^{fnk;M#S!sB-b>+sA)Xl-+~>#Cm{RLH8T`|6u=8@SX5a z!*?qhA)i>j6Y`0brI1gA5|xcZQ!4w`D*F}=r|O%Q0ymq10zySB_GWDP#EOup-M#vq zCislP;g=g4p{9uEEa=xT^k*eOz_-#z~$d z(Zl*X_Ii>sgy_Q%@>}xb_%?9`)>nrYN`81MY4wO!PrRV+j=BCp@h|OY_u-KJMn^kl z`m2xjvGjpm1rSL!Yw(Csa6lWZF{5&L znQ(YjrgN#jti#Kc>xyQ};YD)o-?V?)MH&vTbL{XkfXR{PHP23Kr@>(mA3; zZSawJAAhd(jLaR$iTS?-AX5lI|hVJ#=gE1NZ6A3c0Gbdkp}9 z44O%FK*9omjD}?CplGn||G;2n+8c`TX`%vZp?VA>dxE<3#d!v*l>>r7Zb+Z0s+pFv zM%e~!w`Z1Z{1#=KvdY#j=dt%SSg5uc)-&>gX7w@)azJ{a6A&?nu3OOZ9>#$lMiN!U 
z48eS+hDz7ToL`fN4BOL6u0dg4`If;)691#XJG?Ocu zS&J+jG7WSTSxT9O7Mb1iDAZXL=oR$HXrrfviCZd3T1ZuekXsnYf_XDiw)e1&>|t<9 zD$Q+tYu0pzma=E|d=3mZL&qYknG9|AK*_%)ccCr7g`w~7lbo4oys}-+3V(}fVdB@1 z&NgKteyXvW2T<+V!mR2U9G;!Xjr_Bnso?#xWN0Q-R|iR>05rXxsBm4L94O|5VA3p< znaw9GqRmHvF8>F-CEFq3@Z%R|q6&WfAE`qd6q5&hl_B_K6+i|Lk{)3)V*YxBR z66L_6KS}`%#{WO$`x|m-bW{etia1x2I*$Ux8QhWA(>O%7ug~^mjv<@P)67Db?<+Fd zfq6xo*qX2_VC5WT3p0yhpv!lQJ-{W4Tz6gW#gZSLO1fG_S8LqqT`t{|^dI?lwa&n!L-CuOf({QWz2g6|84AKN}H^`uJo zte#%of4e&FdIC_KG;(y%$gy%(2PxXD)eI$ShHpK0r{?(G;wtLP;f&hvnK`R_armvX ze|$Do?M+s9h}9j-;nh&0dU$^LpPU6(t$N4&UKwH405qmU1M1IW#7JKu6ZEP84!}VH zb&yJ+%SiTQa3wV9lZ+e&Egca|>`5O2!v3fRQq!TCU^6!;!|{>^eta54rpw5Mh5_X2 zGC^+IP5_PWjVDXtr!6gZ}V z)Dtk7G#;2B6ptV(yI2rfU}`Un+nI1+Lec8WMEexW0^F7424lrVxWLe#ts1boUXTtwU&Q5Y5 z?!?JF39ybhL4%%xo<4^t_6u_U3mmVajFJYDtXs5iz*dlNha@*Mt(Bwgvpc_SZ+oe4 z%P1XvyKm!ae9M3iz04-m-P#>%?MH9TB-+m;YR^&-Oe2&n8rKSH?s^a0GA6u7=ZDv< z%_%1&Mmz2}JMNY?rb<0;yOu(KR++5p6YKioo_@fq0f>R>N_wD;N8rlh!Wq7rM?&DNy3Q8YI$JqAlwDf(&_&CN?QADBDXkgNP8Her!0 zK01vQEvwBxg!78w6{D=EH^D)a-!Q5B0gTJc8|Gog5OSGjDDP<~8LUp4k%c`f0wmOe z53fNgH8xWg?-?LSb9%@;K?Gcdb0FcY;5_o@Nfrv?|93y=}1{i`eA5bw}=b1`F5P%IR(Elu^MA2$gJSvcyG2wWVQ$R^(B7uvcG{|B) z%h=#c7<*ROvJb%|WH8Ws)WwQfNykoo^P08pYa{Q~fMk-j2!$V_PNMUp_WW%fMy+-&nV8@aPIE@x2j zmoIW(*f`4>{&R!F(tN*>vz9JQygj^D;7!%;NY-|XwcWp|?OiDN(9ybF`GKPkNTsI_ zcg~KgxVaMYT!l?bPp?_Kf!X3)2Hr*TrG^Z=nM!;XOMysE2Ep^O-O|u|4yR@AjTyW$a~!CFF|>QL>q&Na&6%Xkj;H zHuRT3-1BNrIi{$i)me@K+S%=Zf*?j8f;> z1DS9O@+sht-JrwIv@Kk7kWGdxbcLEW{f?F93T3r2xq0NON~czRv65V&T2CA(HAk+k zeL>lTI&OJTc8<`jvU9{gP_`rIOt~(5wp52tb?spP<<3?5HMxp%)>ZN~xr%eoh`bAO z&x;&K`+*(0&=A0A<(O3^9UaV0+!LFy8XMGp=hAwJIn*(jEBl(1MOsQXus?mQ4Ehbw zDWGMTvn6aFk3srfNkf#-mPd=rv1Y(LRCDH-_4f3d`zEYt};fd+7)}z`&bElmE?mKz^{Qz8-X`FX8&>8d?%rn zX{3eFUD)M^jvi1<2>FdaKOUR`ZYtBISl5p7S;f-EOs(`JHw zM`-J?g}03(iw~4C1J=06%F@)0e1%*nt>XvMPZKKFhO_QP?sF^DLDxx}-h0RHz3VLd zQUCmzbzA_I_lV^^^R`rZC7i0dt?~uV>cz3QFReK{p$?j=s+m8Nc0MX=TF|E|YZiq! 
z2NujJXJx#hPjvR-mOfQ6eCx?uCvOMh?uWs;q|168F6&L4r}3@XH)of5zeE@WSl@ZTvr4O@~T zbK_;Fv$C$09b9?rUw!B2-?>wEFzz@gGB;j!aNSXpbTo;MCe$!-a{}risT?8INk^mT z0PEzPr*A&J=IBP6rKWdw+}yDoxanKz`&WB@z6TPb`$uJuX&ekoT``&Rjzw{=$|142KiRxjY~H(iPHa99FFp8e%Pg_YAK7qm zwGB(ph&A2w52dQ>mQIV+J28R2-ubhO2Sn?RR7W>_`xDl-jTd;5_Bg2H95r!sHKS9Q zMt2rvLT4%V5x4nA0B+1UfL=8dk*yI7l3n8~A-(dvr@oP!9!R(8Adc`;_)IvRVS)s_ zB5YQAU`U}5TDy6`VM$Ln2U{T*J0aJVq*^O#dPJdVEKIm09~6oyr*x) zWQ+uN$YO3RLb zPjcn?OU!0u5aR(M3=?6y>}$g>lsWj?ew z$6E$gOV(hp0H$J^`r_95^@_^*Q(#LJmnMsv#G$P$XAPet;;9lu6{~h^%>;eoB!F4 zRDBDi)mrvIF8xDS`9>3043jwRWLkJuw0ALMH??sTir(xy5O=@`&MruVtRHy1`5s6SLCA476`yhdj4&98CfkHWt!i84%ZsnKDf_ zWpHIoQIsb?D*QxEhca#xVXPOXOR2wjAeEk{SqD#ZWFQ;{j!c$y@Sy1;pT1j+X)R+m z0?dGW+P$Y-GP&syc&1H-x3IpT@n&+}MS^ca7e`wjb*y7~%w7Tb7Yx$1gIc8lWKT|- zl3JZkLCEHoC#G>+AmoRYLC;)<#P?+n3(9h01!~I64OtQFL+v$6$(*yiSV88TVa6LD z%**7{-C6m}L2mnz&nywtdmjYl0Lsrwb`qS;fTdCaL1h0}U=*rqj8vPg&j^u4qhXGf z=sXRmD>NCMAyN?)D1%Cwn;n59R3Hk~I2oXMoiZJTUn}GU0{1fEa2#a7Urs@svQ>k{j_NUh6SJ%4i1n6j70 ztNZTQ`%<+7tNCK>;RWkD*LB<|h4dOF^EV$E^#Zdu(@`@b}-k-^cb zPpg3gFP~ZMPE_w-$S2c7#UL9%m&996+-|#F6@TpUc+(S!iYMc)C)Yi7i;=g_FIOd+ z`o-FQ5G1X=$=2Os>+avQ?u8Ngg~N-LcU#+;{?m?R_hGU7aH4hicD~s9$inF*Cltv5 z(7pR0#8+AUmhVm9a@TUh`@+wzzk7YPD$#i~?md>MI1c?dVASnzwl7^xRCFdQdc=yJ zL`5Hd%tG$Q-qnloRx)C??ksy_{~zwZ>uOB7tCwsmt`*ZSovTkLcAXS=o{ZO?f>d-# z>Ag;_zL&xFEU>Pjs8tNCW5Z#{w?3Jc%NSE#0gRdHveTrMigF>gX&VWwGz|cx%gZfQ zhfr%ejp|cDI73qPklE#PA*m|Bfe8@fYo<~CB{D1mq?_d;6(r*y=w7wJ=tmy=$%~wK zXz43kAfM*%!TdzbOvo)MvIw>`$i|(Tg%m$j0Lau}axuF3m?oEqivTW{DH@DxI!yu4 z+WbG~V+DWC0~zNoYd~8a3!t}%RYwE|qQ=b3#X!DHf(Kh3b)2BIqMFYz%^9zfm9g1j z5}cxIT}7tu>!kHOX95uS3tS0#0-mdv@QV^2pU*ctcoZ5hQ&+Rf!L>01uW1>Jpoc7& zd6G4OrJE)vu-$v<5x8Au01coDsj!Q4smF#!8t$;W_c59K;S&ST^d z7YbZ4Qc5nsC3^I_e>8x4vY;i&n*obfI7FIxF2chZFG}M5VB07^I0{flk~fl#$=|4o zy}*^#1oXS2a-3AA$i8HpJ2DPW@Hsbl6{sh(joU#{Dp=Jpc4B8n@MF1vnoJC$RW!FQ zm#kRV%=_4PlV}DvVEO4a^X^a0w$}=76r`+=K<~qn4`kWbo$NR#cAQID-AQYWNERF6 zsS~Yr&~f#_c5@pnkj5+8fEMrUOLiU+JCCe&9$Tw8E_NQfd2DHTx$cj9$p=RL_Q4-80 
z*lfxzAM!9JVLPKQ06ByJY=bKuuCyn+kBQyKJ}@7b5&ssX^R~*;n)!ZJ zD&8+y{dBxf5WNSR>3M#CaB38u3ebcRlH66KIsNp*F*4IqD4<^LAty}E{|pDF0q|qI zA>mmH=pcu6eTIBARth5XC{s6(ktITz<^6qwd_N}VyX3^knIp#xMei)8Eh9V+};bciFp@)hjXCFme!KDB8 zW$r`6srCASc->&!P_toh8NEvdd1rGgWzgv1Bo_ zbmX1WH&4fFcft(oO7E&GUcOJX@4scZb^dn4ttbCAe&yz@XnAPULb;ygb;c+86+M)! z@!l|7jK|p*y*ElLjD72t;!Pvo8-u;d2i_YKe3&=(vN-Z=Jge_8HY`8#v!~vDYJ-D! zwRe+V8^6a_8PD*`3?9-VHRRZgmfP-A4EEg@*CQ6Z|p)>Al+f1$)aIlvbbjpzs)80vL3h@K@xV!Xjlq z2_@MTg;GdcMSOt^qXHpoY;%D?xX#pKXiG^(p!(C-1rYzruCrg7xp)z$GU%U>|I*9_ zfpA0oWF@GB0EM$(P>}apaTG>0A|zc#zt}AEy$Fkw3K)A#mM3NVPjY00_~@u&U_l9n zj;10g9+Yj9;D%K#L_XgI8-+CH;<_on7cKv2CoVmho{&`c6-h~KJs>buQ>Jz#DWVd425k7JMW~t4MzegwRRNUzeK_DWMe7SUlNZ9bre|20$F+=v*+TM^VU;d66P14 zprtcT4uj-Oc9j80h@o-vV%;HFL!^d397dkz!`y6>1KjwCQp)=uwBge+^ zC-{%K17GO!c-cg0{8$)5IT||DUc!{O4Nh0BMzO|@<muEWD!3pZ%a;B#A#GN+NKTs*x zPJZ>fC;U_$|s9!BR=^gevGH@~C$Dhh6Qg4N;c{ z7jkjFnxlm5O$i~TvMHHM=dNEDM9BJ~x5RWj$K4b}DV57|aVQk@4&A5Yl7QPpEdjby z>*I5`vT4Y1QamZhLRQL&5|_&I!fh^@N()?4%w@Rgq~@<8-V##Rr)8;H!X}~h+S&#} z3QZYJz6)A{G{?`)rc;TyENtrxbt7p#6&#%7vN@T{3IZ?iTrwxZc#?uBWD|lG%*5xf z30!qBcz7AKOj05@D`;$HE-j}xd0G@;PO~!inZbm&VSMhmTeGuVBA%EQKHG|~Uekfk zYYq0slb91>8ZtMVj>|BwOhP9cJ#MIEHtzt9y$+{JAkayQnWo@$!5K2eD8Vwr)K<~c zm=euEiPSqL9r8{EDQ4p>Fpv5-aq8s|5_-4qrKect#Gs3g-Er1tTB5sqh zjw?jiuX#s>WE`le=D;i*#TgbI7(ry^>A1`$;#r($`BV~9smvl8X65-=L1GboLWy)E z@;e%4wr#^JOoP~*fs&ouEx=~;CbUg~sW=R$`H^;@Zz!~B)^eb_k*ah_Q>qKX4O|q3_>S+;U`^y=sU!RboWL}Taj*4 zA>nOWrrzo=(gBt3-)QzPS?`&Pv|pu%HrUXTd%3Ylhg3SUad4zSvn!z$>Fw|;uXOD% z(jzK;QH(%?;y#G}9ey}y!9;E@i}>g@0rp0~!%1bY8_Y@TFv{aegoQ?1TkJ#tE`g;< zqQWR}!&Zn9EpGk;MBgRv6EX&)=&mJC`~?X6JZT6%64EnN6``6~H1Q;Fnl>$(WuR0& z|CZI5u}B|sjP5&R!GPK&kDC?G0y5M!rl<(;kW|LqdF*&fy2FE zI?9G^nnjwM6+nn{Ac;WOa$2iSE(jG?8n;tfSs+zG)+p4P=C~S9&k19q2r^J#eZASR zRih*!ofL;*wi>goI)jf4$UWSGd#*ZK9bo6UL^>`3^-E~mSQPGvQAmm-5G}OVIoZaj zPXmIKh3L~e#5&z@*QU_^<@Sek&qhmEspXi`a%_oF{X=WV3oWOXnDw@f<#Z_+Rf5rC zaH!a}f9)lu?bMR{LucE%H=uf-ReiyduV3-?7rXFOi&H3 
zvYBAoF)3?aeZ}y>u5b~Ng|STF1FRoit0>5GVm4uDmrzU~u8E1WA7UWQivER2GH){E z-EC4bA;MfF%rTBi&maow8>QhU8M6?-gILR z?1P0j^VD_IqCI;YU={?EEReF|QY&v;wDIQaB$N*Vl*UjRL+uF4I~M6I3nflmLh%g7 zZ2?F_1wvtnOLiM$;$3+w90}9bk;t1xYu=9PE`VOKXwOsMGUX}wTBH$qmr)Pau>Op_ z2FBJmZ_k^ag59`-z=! zh~(Oikay%6gC2}^z;k!X?FMSzX_WAu8>6r<8H%T-ddU7sjx_rkat(xaKadA!&X>yW}R~9R+%xf#ixxI(Ux0A;tmIXSDplB z=>3t|sQD@h?K|_fq#5)V-@?~k5QW*eC>TN?tDb2BEy=kuP z3Ycb}PQtRJL1<|nNE*hGZi3m2)j^hsK^*vt1&rq5K+!`KWz<{ft|x7FAiV&MNt?j7 z?+{O#iH3o~!14EP7n$?_MUV|$s=KpnA?)o>97JRD{ljk^zJL6!RD)ehm!P=N5z!0BA2jq-fFp&Dqe~NN>xsM1p=zTK8la)j@{6^IJ zP}n`Z_Vu-kg^trj@0kK~2E?Drv@PFUo>rLNrHKt^cflD|I}R>QeCTdj_wQ3ZU8<+C zbVAPn_0P*WK~5hiDyG8sF&MQ=t7HwDc^U zQ<=cuP(QK1ZGX4xXMI2I``O-~?p>SzWnZ!Xxgz_#!aQGK0_$#X!Pi}MhgM!Gy7#EQ zw&mOd-=NytfA7+7JWXXM;qQIy0>S&3AsPegp2KQmui6+?8~vrmkkS|`H4Z6_LuzB- ziQTfx`-CCb)1>NQSHi2YO|yx0DNNuAO|WBRVQi|n`zmz!k<(Rh?=Cvol~WI#J?q|} z>OE9AJYG2T-20sc&jqN3xJxzF83vK4jLV|2Hto z*U8uI_X%LH0Nfd7sseqhWDyCXS|y=UbeEh#O3{u0&^kgLpmoGLK&v8ss~LltzXT&* zG%r$EQP?W3re@HQ-Vt~b3zzXcSaWlPWO|NxmE01*^+)`am8d6Z%z5f5*mZ<*RU}*Q zk=d{1E#fbpQn%hbbRpTbCAPm*&y`(VL_G(7#-;^|;O@z&20;*`XiXNibvGba{4HhmsCWdLNH6f6pb#TU@7tYmF zKfufL8O+ErtT0I@B6S=3Pv}cRl=ta>5MUdB&^@qhQ-eJ}Xi)u~KX!fJwK}%!`WOHH z#}*>A_xBQ7o^S8&BE=y{Y+E=qX1s7`A+fb^XxUx@2DnHrn(mtG3AsGE05$Taq)A`e znq?y=!X$Qdg(=Mx*{4y+g4V3Mr(Wcs8Xgd+X}TLgvmd(-w+^>w#S;t9eEIdMR{j{Z z+zXMuCI$CZWo+v2POOc-*L$z|e#cuKD~-yoo|T&qJrO9VtV*zx40q^S@lmKM{t=3h z{j)=d)pcfM_KBAv(mY6qm0#gA99&o+0vA09Y=*rQn^4L(I(GKd)P+fIV(KC{dH(X) zsnIJUEGwb4)>s?3Du=8KS-3tgA_=H9f}Xv9iime;6w=W(2HZhO@c3YVT-c#2>u58?lPLN+ie5s|ix6oZ zL(euSS8q9m5K!|H(nQ#oVGmI_scYL!Ts=x&*z>>L)1RP*p8>M_{T%}IGJl}d(xgrJ4zLI;l;@(|y4=C<|LiFSV_sR9{Giqa~Mu|PBG#=cvm_yzN z9<~hT16%TSE57bR==ek5iS@o0RR10oChgy&`1h3jBZ_}S4ecw1hLzB;8VIirmj=!# z17|>Y?P+^rv+ijI&G?Cj*fUDjskqh7X~lnf(?(%*SO`x*A8COddFVb`cZ@Yd9#I-c zRCcd!2dMs@6}i;+tkU-^45G97iQN|Rg3<8AON2mW-#OJC&A=E!-U495W&}#kF2&jP z&>4Ef9Q)80+N6nHon^|r>%h;)KPKSw*g=Fw3M1pi?&qM&hSObgMiggct^0xV;QIbU 
zrTu4={b!2%$CgGPGST1SlU{UREznnW{hiTWceqH6|Gjz+g54xiNLT`=XywTVQkGkE2c8F zjs7tz!XgUoD`sx2&fMIHx1r0T3TIsCs>9ZH;CupYNPiCz7SiA`Xdf)K9Z=d1sEohF z1QjNT6P*gvsrs7M1KlfQg@N<$hl+tq>pgo{rwYRpg%>7^JyT#Iw-0X;)^^8|yX+#s zF7^#7zQNU8(RZRmpD55LJ_@kwOyk3*zEV>}fqy2t&iL+)71|?frXq6y*iMI|K)1p{ zYuU4OR&8l7wG1gOLu**o#SMB8K zLF!k5Q+CMwx`#X+wf?%7M(HRC@isddEgU*Pn^z&iHgX-E3l2^lvaEl|;Fg8}4Grxe z1QN&!G_%RDX^T97Ua;J*QUDY_3%DZMX$f`2R(8=<^>SgJu+Uj&6St$ldk#WEw04J3 zX{JPXD0D}i?F+DKPq=I+?7@!-i`{|Q&^C@!U6y}@@xZc}>SownIS^k}O~qY2 z@_kp90R~hJTooG@u*!L=#Z?7i~mlg42+?wTSacG0EwBZcin)1`Cx;2BE@b;0BCRr z2(swnkf-0Z8>Daz5YyWc;5G&O7Il z0NVxMcpNUhd3ePmi|}gZrof*y3_9J(ZK#$ADVI&BvI4q52T$^Xu+Vt{ga8PGsv#-o zayQtX$o?d&O8`l;V8aLuG7Ek`vvA~W4qe&TR=NdGZ}6P}y8BGU)6g>9kD_-C@G>TG zV-_6R$jRQQF_n(-{k(`aTsH+oNNFBqx4?^>xCmb`cO~&p5SYQbd1Vsfw-9I;YvQ`j zVcf?@^ejS#3;6Bx)Vq5fb$1g*w@`%4FwLsJiqh=xekcd4lMt{DFCukY^IpAh;UahO z(inH?g-Pzh*wwKMS`&s{7@rtBb$JVn{rX|2&IB=6#4<09S|K^jZEOL&c|Zfyj^vWb z+Ks>iL{BG@DN&L+aW2c{k~=mF<7h5CLEeyxBAS#&=fH$XrsLN&uhA2F9;Ks7NR!Yd zD~|Q6;5Pucu)7X7MwN`#;jJ7Yy#*2QVrNUq8B&~~Rr>?y(8EI$D%1GHY&z^HTZyA* z$feN|69BjHL*|eg9WF)3l<3&Q=(&e&=auNWx6iGNu6F)lxDY+JnsV{eX#p;@jiBd^BLUT!4vQCzZ`r2Sh3^9rHPLQhf0Gdl))3l!IMkWL%M&R zc9m#9TzacaR0R`;0l^+wYb*^-D)7%ts@~SJjj*>qAt<}!iLMx+`|Z`Z;)_1;UHA*$hz{SE>E-MEv!~KM@YA})`V$ej9-i#c2(d#|a4mG&>Ib7H0q| z0fiB|O{=SyNXJC;k-h>YpMHn<4K)g6)*dKOEgKe3$lc2jm@7(+PROVYj3W#3!l%W}cpt=L1G4Y(+4njkG-CDrC&*^E;g zcJIA|%f4md$La5<3#~vZ1FMs3!v)`A#eQTHMh@8JmrXv)(T!%f=7KK_#^Zf+{_cF) zhH^CF@O@4Tc7)S!wK!IvDHHH{4=Jq{N7-}G;$7+bN%-w>nSkVbPzs;-hd)6dtSjM# z4)=@8u*~8W6v3st&K@;)0ysRpe?f1kBJpbp-M4`hA@C~A&jRPyb^O0ZHSlhG!J(3s z&}}NP?Z}**N=x{@Q*&ZYt=oknq7Kbd1BoDJgGo-K5LjosdL5lHi->t?`2AId!B_cx zIzEp&tCc8`ilD@jMcey@QUZBRbuKB2NT1RMzp#qVUpak>_}Apq@n5305cOkgOW&v{8W?>H#2dVSJ0T*437r>7fC%^n zNp4yR()R)3{D81S{I5jx5z+Enf_X$VJ|YhN8`1rU=zByoJwl1eN5uJ0ObluIgfODV iDD`n0LGJy~-gxiDqCI%W`ox-`NbmCQj|ud}!}uS#5gXC~ literal 0 HcmV?d00001 diff --git a/model_executor/model_loader/__pycache__/dummy_loader.cpython-312.pyc 
b/model_executor/model_loader/__pycache__/dummy_loader.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5fbc5af4fc848216c0f6da027944ecc1ac6c894 GIT binary patch literal 1702 zcmah}&2QX96rb_edcQ)_&?uplm?G4;BD+PZ8cGoYS_DEg2S7cT!{F}BCN5)p8GD=T zsuB^3L@R{E1qBHqg%d>m2l^)#31Gkm75fvxqjlE&h9`Iq_d-L8r|K5AQ znV;+R3X&f;z6d@v5c&g*(PtLQ@Um345JePo6k-o!&^4}wx~D5$=SFCHrqT^=g(a_~ zbd%d**(+nDp|=pVHW4kc(k(sD>s5N#wf_QR*~MA)QcM}Y5J&A`MVgg&W1k*aPrTtL zOa;I;6FWGY4Wb|od>-@}q1`a-ku?^qtfomTPoJ+=VGKW#%IC5QhCED>r%~+bRAWY4 zyMfx6>YFvsqz1F7Nv*a)OPhvQqBgUy9w{=6iGw(cmStS!23p5n^?l@4vdN18WejQm zWo0;A;dathKXuju&Ygs%P6#DC1#Tye9pOhb4xMX0@3N$kRpAtAsh+g7F$8i{2--O0 za`_r>q4dCcQ@f77l%u~ zyL9fQkC#}J2UnV%Ub-4b&1;;8&HU$Vowd4YEDAHn(#(048;wp+)MfVm+@m9Ij4GLs z?&Nj3JlsI{?fT8q9eehhW53$Z?^b8-RZriop1yzN>4Ao)=Xd8%-J3sqXa4Lj&nyje zJaulU>(dhhq}L|~I;y=mlu1xl%dzFX%`Cyvu85Wo5dq>95n#SssE2m&@{XL6n3A?Q zDwhO!<2Yhkifn?$Ymp+AO5LA-td_$8Gmrq@feo~4*Tj>M!&FCG@Ym4?x}N;;rh31FAh$9UgDc;-*^+#Wi)hu+>p?>(@l@WNN0?jtEwj(-6u3A@$+ literal 0 HcmV?d00001 diff --git a/model_executor/model_loader/__pycache__/gguf_loader.cpython-312.pyc b/model_executor/model_loader/__pycache__/gguf_loader.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..26212cc86a9108b3675ca67a27e4f2e4a897d14f GIT binary patch literal 8459 zcmcIJZEPFIa(n#JkvvM2s1J&=tdlI+re#yMWJR{@IJPBOj%z!wQIeOi4lWe$C{aF= z%HGi*1cew60V<;|tfFY+;319MKUI(b)lav-f+8sj^bct%kL=Y3TqHpI{xMJjzvfqG z_IRYEC`}*GE{WUOo!Qy1nc11$fARS|1d{6bljO5*LcYd=m7G;z^B(~CfXGDVR1(G9 zrgAA;)W(^0_NblV98m|$ohet;#c+1DA;m{|20K)D$`kc4*r|F`LR3ikqCQr3sg0?o zXcL1QRDY^D+RR{HZAk^9fmCa>m6hFUTdF$$9+xss6; z^+JYD#IjmUy?iBXGrX*BB9l%e=M2vqs+uxkXyUmHO>3p5^D#|f#8|CRXzNd=lX^0y zCUc6ELe@t2BF)4VO_Oda$+>x5lVS;7p%RWEnVt*VjjlOGm*(cO2}!x7)0m_yX)Qyg zbS$N4MzDr7n@jrbMc^D;gl5>}-E1ta*H|Zkb9KHQ8gP}BL@cZ75-dS{UXrnG+*~U2 z>#i0y{}X^ehFw=kl#@x+CUdaMHpP*!Et3Q%+us+WPT8Tj5;ocSz9ZToyA&Sa27ujI zFY}5=aUs4FILU6ss|Zl`E$pfGuqCU~cw|q)j?@Vk?4u*vggj+8)H-AV%6{3WG%xgT 
zX;=kobS}9O=mK(+(rVW2P7nP3K-acRZgJ7}84_+bnx>~;zp!QFawU%liPea>GYS>; z`Is)oV`)|vv4NOKs)`m7Uz<;ABK)(OlE|uJB10Js?54=JTw}~w`xsS3wnZQhvUoG8 z&zqcLX<5`_H=r(=hMtrv{|rsV^lAs@Fc>Q>ySSL4x>oCEHUo^HZ?)Q?$oS32s0YVOpwm1bfunq$OuO=onmI%SxSwY&(x>WE>d;RLKg z4PKH^+a$^8WOHUZ1*&DLk;xDAH>PHlDM~Z6x)>%OP&|jGs0rEvLp9uz6jx)Qq>_e< zB8qo@m>JSji$jaa#Upx*f)*aRl^r{I;`*2}q$X#F7H{kGne@;NFb+dz=gKW54)%mr zi{|zXF(ozO$l`6<2F!E89nu%E=FtaO?Hqi5cZWRTeRms*eDFcn20u{twwJs|AA65J zY1vb;ael)-swk7+PzHE8>XD$mr zL-Z!ArQDF`eobWlquN-0#f9Bh!?s+*ysj^f4-Msb1T=)45!`S%DC}9P8r;xtQGwuC zXrRhsMoP*Ck2VB1B-j0VnkwJ2+G2ag96*>iY#GgP;FcS%q=sn2qk$yr8d@QPSJE-Gp>r^A*4dL|Q($_1nHkW? zPKrawVb{M;U|H?nst`Op>=>0HJn*xH?U7%{yc_gFPSFPeZFJ#`}@Ih7q=5d|NP$ z+b~UAFn$wr?}UTEsGDJ&GkN>2O5Gv{EQR{L<`Ld9kN>OD4myb3y5!QC|BFkwAnaI3 ztg9(~9rtM!g!OGCgc**8B|gvR*?wuj4<>sz|Ln7lT5n#t z6@ONlBky?^dg>f3$XcT%Auqr>4a5HV^1g*$3t!LWpxkM3V{6&C)Rgxupn=(u@{QI^ zmi&2N-jVl%R0s1-u%b+w?;VHrcFd4{-O{z;if3@fIXAFM!3d6k?=`0YI0DhlE!}XL1&p<1MOoAo<+^lZWNTsA za9fz+oR1me2wNR3lIxAY%p-F#U5O}?7SF&T1Rpp|aaAN-`J4l{C)XT-BNUFI%&aEO zMi`=pwZ)SaPQu912!?}5u~6N|Tw5fONV6yr{K!Sks@~ZSd*7uUtKQnKdq;m+HlA8< z0oEH-ZfOIn;(qt0k{*TtQZd0126B=?uI4&lPhU^Ns73TNOlO}3h2x^(NWo5k^UzkN<&+Zy0$Hu4pN)6BXZYpNV$O@`To0+;c>0~3Y^qCq|!oK_LiCt6q^q$ zyUJ}{rMCUWw*AYV@?NpPcRlmAJqSMb_Le*QmoJr}(EZrEzv3dHK2|JrjupLQ4Eddl z1iK%+QR*Hoc8~t1?xp34m9}zI z>jTevQ(w6wwBqE`}tCMSo)&nmt^G|#&_pjZ(_F(^4zTPLHNGUX442_rl?W>1Nhb|TmT?Dtd zt8w`cp4&-dA9T^)TWT9DwhgW$x34>Tm#52JeIK9s@XYGI)pyssj@`YyJh`&J9285z za4{HO?OzWLEx%kob-p0-K2#IeW12^e+|dHVMi_`c_D&_Ti1`-6sl z(w9Qtg_rFA^pbO$Z=jb@H(XTFvow7*j&87(Fhs!eCVCqXPh#-HQl87(tYd$Me8R!) z90hNRgzZ~G4dh#mzNDBe7nF2q< z{QGU8FbnhT;Ik~+%O~ZElU2Kmo-yKrsg%aOgf z!h(18l)8=gy`2Ru;GM{h1VLNPFz9%f!qUQC3##W3KJQX%wO;b*y%sI3V4nro zbBhu~zPyJ8eghU3?$BHoe#r|DHt@5Q52(q)0gJMpzvr!6YuG<@!C1;eecod|wy@`z z?cBUGurnMOpe25(;pUG+vlC`$wlidV5873OwuY;@M$nCUB_>xrZzG^5;gTssWl4cR zNybML?KQ)GVPrVhP^E#>WEMskZZmwQXKA^UXol;igyK;~(ZCz7?4k@lCdGR~dJaEu z6$7>gUFI7~)zK5o$CawH9LT0?H~DC3vhWsQxFDK=n>0F!ET-^-(r!4}Qwv3nVTOi! 
z0SXJI*}|bRvqcjKy@Q_^e2ga3T-Ba<=nU3+x6~TG>L9AGLq79r8s2%BKaB>Fo!49a zQPUmuFw4dcq+DoMtH*GEjvTIm+=AojBAxxE;IU%x*s||`YXF~lyEb~p%fg<*-pHoi zCVIXlaE%A7e4QnsuPF4Dgm6&^7Y5IM-d{L0y&+uuCN#Wux->FX9GNOi|8hO_=5kZT zL-voB2F8m6;~N8~zjHb|J^wy%y6Ep+ajaY~H-#Q3kDKbi$z{6@yCn&@e+Tk$e((2WrM%;O|bi8;X~o! z+oge%#etLS!SQ9^6JP(sYo&p);=tHfz7t0C-Y3C>Yk#y6Jh$Qmr>mvs8CrJMmGbb~ zhA{k%5MCWv+w*8?L%8zXOBzpB1mbU77XE9q8+v~Z-h0nV_Ydf=S(qO*6w80~9h>TM zeG#%x9kGAW?VIX#esSmkz+d({rw+Nl+~0uk5$DucL%0K9I#`?{86K%Bd+_QZNcjAx zRxu3{M6Gd2qWA{G9vTe0rc?7cMK{1aP8}?)!t=v$>e)qAVX|wuUSpRY?0ltfBi5Eq z8?Gx%&P^G{(}})|C|~upB(A8cB!Tv!zo&uJkhA9DibkB`2koNv1$=(@KKa(>aPk$w z3afM#qW1Eu8nLG$SVC8IOo2#!A*jBc=Z%q?K+1a9pA*P;&M7Y z5BvT${MX?M0q_xEA^T^*LRM+)cD9wfpi9h0&$9fP+k3zBZs&6+)R`_8PAj~J_Csbg z#WSjE-aBZK*le8Qy3&{MgBHQyS-cxbr)iFn&u8c6z;#;3npWE!5?K$t@ bY@q&}?RyX5PJQQk-Nv=8WWFbuvyT27m2+Ir literal 0 HcmV?d00001 diff --git a/model_executor/model_loader/__pycache__/online_quantization.cpython-312.pyc b/model_executor/model_loader/__pycache__/online_quantization.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..68b99e289f34483bc35ab034a4002d42645e8872 GIT binary patch literal 5077 zcmbVQT}&HS7QSQ8*yA5;!x#)9feFx3^J6D!fRZ2DZb{m7fwn4}wzRvm#xsD4f9RcY z8q#4!qdt%}Qi)Uvr0!~6X;)2^NQ?G~l~&q5R@xUENNkO$MAi18Z|;)5?9-k*Kep4* zq}q{Z?z#7#d+xdC-gC}({qMRuH-aZgf0+1t2SVSFiuJfmV*j5&tRWGJR1!@YB{fAs z-IkIeeG7<~VZ}Na*RV*1ecuz26JbiT zrPJ{BjAukC`CKMFlb9vdL^`1e$;|AmgrV9nD$T^QNkwNG%fv*z^~eIw#3fl4-jNcs zbBZj)W)um-sF;{Y&uUDHh}67#A;fjjaa(mI$N{t+-u=_?T|+80U(;!kB6V>HAxteI zh4AW+3U(Ilx6C>YDYVry@3g3DDbgysWzI{XMIh zs$Fzi?bxf@=Y3XBwR}J?wni_ab5$A=T@`7z7tvKJfm9^0mMj&`a~54KZ<*vM`t2Y> zZ^0^-=p{yF=7Uupi-OiBi>{W?+FY$!l<5AW9cBzz^=hfor=fEFDf)ihJqPGsv$jX* z{@04`4zlWl&b2YG>X<)V)y1M<^;co#&Khev)XLt-)b&8k*{JDRVwHAFhRRyLC37nK zb2<<&QJuO4PS(uEir&ildsTE~UH^4I1x)Gu1KQRW`qnOB`Gm6ro@C; zS`=c6f(1Et3Hpy@aAG!*jwLIx1xwS}L{bzkN{e#Xt}*glY(df(QCWn-Axf7LaY+ke zDW1WiB$~r?ks2$&)X5n^m<)58SJyOgC8i`YY0U|`lW8@F5$iPPJ29Ll(X0PbGMS1P 
zu}iuv#j~&qvq}P-gd^q>vPvQ;W8w>%b1o(mX_!O`Oq@##q+=;iXCa26gy<;adeTQW z4-dj9)nR7ovF7ZMBoN(_Su`Y&gPhM&4PBt?^n>FC0uGM7n5i2R*_@@&XKi6STYj^?gky7fmi-B zWWi-rb|W@ebn*o!pF6+h>@TstZ=JrCY`$sq3u(tWcF%!a{*Qdud?oMmUpC&e+g$E4 zY5Ir21824ZXZBbhbg+IRZg7*?agKoa zdu|j6t!DFWXLkG}d3q%887cMjEuYCe`2iz-6k?k$8y*Ed<*yCk83iiQNhTs!B{((F_koOFfj&>5kZKcrRVyM3m z>c1O`Uh{xpsnpv~By<$mj@*d?`vgqAHuCYv%5%H^rq%3v{Eok4H`KPC&BeDvJ*)I? zs69u2{D;!%XI9S^gONfoa#JY;Pi(xp6&&7eJiI=cJGs*s*%;Uy{nz+E#tToLFSF>( zvlOX3IAHG}DL;fzu%pbN=40hH)OT$8>>c`veb^^-Up6N>SkLCa2VxB=CX#`83Mk2n zp0cMVQlm;lwyEe8STPpQ&y*T zocwKeBY4}l=jT>D-}-|^e|y0XMu!dX_vG0g`~;C`;vB;_GzXTHEKbKOm<3q8@ah)& z0KR~CY(Il{K%DG+5ipLG{jX9rR-@)Cb^>|kBie7%mNl*NAS`4_1e{aR3~-l)Yb|k5 z9wc#z77?HiYFd|MaQ%EG%AGJ>PDwTYC5Vo2RS-jO!so=GP5EkTf(D}C4LaElSSak|r|Bxn__W+Fl z2wTpYw)8FeRsSP+Uov?EOZBSn|FF>!YrVuqmj5j^)ZnTHHDI-4>iTevRiQ^YS{Yxj zHduT$Hh2XPO+vRrr^Q(<0XIEXtyvU7>=j97lv(0LyGjX_9Z|v+ZVLFB>WqM$YWP$R zVu^18dw~7Qu#wRCT7&>lhKJ@wUP_a!Eb0iz1T=g+i?Nhel8fgE48l?>b4e1!>_RdT zk3p`#Kht0N)!EAnQd|LW0jMg;o0leD$?($>pGfn{oW#qJzI?`1nLL}6WCNBk2}MfD z5T#^LR?(cY34<7YX_&+Y&GoVcS~NyVEhvjRR>8y*jG#-xrRF68&q8UMpI|{)oz;Lz z%){hT77pSlp(_AJ3rd$0GBJmNjs&8njN!6ZiKT`iFgZgSC29`A0DTU@Kz&ZZgnb@k zZMp=_W8yL$3ptD)EijXCWy^KQnjU# zSom6d=&<&Fm?Yr|7{3Ou{3Glkz?=Z`HWxiz1y9#a*Oupa8QB;|iG6P04rShR{9XVZ z=5tJuA1v^LUov@qaGRglZhfiPI$3C)+-|*)r<-?!&FhWDmcBwu-=>sr>Dz7@-432z z9xrjB$`YtzOK$uc}!|KqDLo1Fd(ZH>~aHDs{S@MMPp3V~2 zo*UZYddi5d^ObE>{nL9~`)VY|7q)xH?gYkv@F4yvz;V%6sB!~pYArT)7aF@u08ZJi0^5~) zYhz%W9ftjCax71PAx=I`klo-eeKhyj&BGh&4n0~LKBKplJS};;MF$LF4ilJxPmz!A za31~2Vf;~o37pc57UODyN!r(RRQ}LwiD>*)qyG&fVN9Tu_V^<&GsZ23I4Ze}5GQ~; z2l?YbbBybOLr3R2LelIIE#L$99CDODNBVM%)6bqZ9Bu@^4qYTh$?w9aY^Nw{pG8#N zH^}=H3Vw$=zaqbhuh7e1yMn8)Z@b!-9c7Ukqo`vgPu)ku*M`e>z0wuk*NOl9&c;x^ p<-mE0I#g=tD%*9UKl-;fK7YfkoO{I}_84}Bd++so3}Mlg`ycD4%25CS literal 0 HcmV?d00001 diff --git a/model_executor/model_loader/__pycache__/runai_streamer_loader.cpython-312.pyc b/model_executor/model_loader/__pycache__/runai_streamer_loader.cpython-312.pyc new file mode 100644 
index 0000000000000000000000000000000000000000..ea94fc102f5d41a1cdc83c95cee762176e67c80f GIT binary patch literal 5329 zcmbVQU2GKB6}~gGKeK<{!8YJO%wP;;9ecgTj!gsv6EI+c4YDCWTP2hA&McnI?5uNV z4A@$$v_h(*suGdfKvY$6`;tUNDwTN1QKD-sCfzWJ5^=j} zg4l?fe$vEBrl4TQ1@o_mMfm89Nc^B+(VWhhVm71GzPQ0Jl$3bI5Q$r0KAX|RGm~Nw ziR_`udP3#`nR*lP&CWV#MTB{m=NSiOeKx@($Icko_qUA*ggJR5bC zIlAKc=dQ?kE7tvJ#TQLL4d>ftZI9cNQtxu)8{O7=E&Gl^NA~}Zy7@h+|Br4Swx9q46_zLeQ^M$aVw*IXjE7qG#vqmWeClCr72p%M&92dK1#Ai{q!ROiYZ7pL)xV7L-B}&q@F!l5>nD<%Wqfl5+y>uxhBfVP^EKVslp&)Amgji<-@;x@mK; zPn#_oc2Lo;sko@y0ib|(B4GOrMay9UjAKVP32dpMZVgQ&qhC^Vd8SAN3YqJM*8{B8 z?2sg7wTxj%l0nWdfgJJXf8Opj3p2el>P(NB!LVe{joG1>kGwae^lIu<@64PzUDSK8 zXRO>qX?0rTNXEZr8hg)E!`F-0VU(&T>8;}_T060I~L@RvkgP_GX z-EDoyA9%dCbJ6#ZZ>vUjRU)b7NUGX;u+lngwGJ;{cpPtAidymR2Q5{hts;n)ATI4$ z7J45vKIPcMb*lmN!a(KFkacM2%h>XvGq-&Yh0bkx8Ys`x06KP}+9*~UJFUjfrMH(G z``0-3Sp9F=JK+z*<<=ofII_wkp?#^%iuXL|T3@(*Sx7zD4?_CuK*;f7W?jlV%Z&pd z<+vy1h$X!Ilt*3Ne{TM~x!iwoIdQ2RJ-D)W-zRe)&y^ELm-ikkN876LL^mC2-aIZ*3EX`!hX`9E&H3+(*K5hrFV$Nbc zTT3tq3xa#nxdwEwJX``bz~2b|aL_&A&-*CXrXo*dJoBwz9Sc60S@4^@ORx|q`3$zi zVYbBRM@;VuD8U7j_aNV;MkwzA-}+d|f0Ze*+t8Ck^33-Id#{%4Edk?LG zGFTC7nUa`;E8Lt`MC!<7I809{6r;KIVNo?iy=Xeuez&Na;&oNi#3@B|b2CLw&d0iv z8&U?y0psdby@(YO6_AeU1`PQ~gv_T-WppAh2kD-e(dRa%8jd2Kde>ERT71`L-c3X? 
z@fGeMMIRKlUo~VEV`67aa_$;fxherbVw@u5F;b8$z>WeYm})^8!5FgI5I#U=x=GPP zrgs6XLXxOtAv*-L$y}wnLqx+kO?m8w4LfxA1rY0Ht#)WSC*_J7WK*^PO*bEl~~xP18Z^1hKW-@ekY^UlzRLwAmU zc)Yx?f4N~`k*~J*l=;?`o!yJ!Cp_BGY_%PJ!lBsSf9^W!)Yt}cR{8> z*13tmC2eA*HD$G??(g}EUv3=*U$4abs?oN)a-}_KwI?g>C#?1pUo}=vUa?MI0dERn z@Mb9NcrogDaf_z_(VRN*5;V4VV&1z2fUNdGy1#`3U1u{0p0MnE>*?C#vW6yVPSlmG zVi*}bH=f|A)onjjkJ>ZAItD%oix^~4*Qj+Kno#2TL)%(&GoeZZwL@_eJ?5hoevifP zxvN#$Q&xNGes|^YS?loGzw_s)o$x83+Z5v@8mW2y1Pw3`n}_<@Z&O^ul#Z2u=aSwBC zay${B8@7Yf83Qf`jL(2rJ4()l8@+fKh8)M#BA=KD6Yo%JgKW=>e0&ai^9Q!|%tpxt zAT`K>;FqS(N>i8B)U_B}iPYadwzT(Qq_5icLZxlcY8zY}tMai5-(vAC)u>R39<-td zmlp18mBH7n!Pl0f7rx`%I8QW^)SRY3k{y;Lh?H6Q{X(*%lJx#;Myt&PB}p!3B?%KB z&=eK#gO=?hzeYGXKzwfd&Dj}EfvSkJ({|u8{bj_l2#E`PniRwUc3_OIME7sUYKeAM z(KJa)FgO7u2EoQ*iVrwW_((zCSjMMN{BR3BVL3m)8s6i-z$~5r4v~8GmHmFP+Hqi& zBX!@vee*9ZwEZ{~Tg-mYyyl1T)$>fyAFl2ZS2oG{ z;lw)h{ zJ6-&;*EPGsGI7BgZl{kKm6vVfPCD8B7tkQxsq27=~Hn5fgrd x4t$HEk5JcB%AjRpwiTy_hLchTkJ(h6RvR=DO=<2glF7C%C>l2qJF$S;T`vqx;@^IpvHZq?1=jljpL0JG9cc? 
zH{GE$j>nq~XcTeGRm3@Y&mCjAwPleCx_*xk zr8qu*B9)woO+wXvHWlG2HAgHNlfv=T`FM*B*%0T0RqWI3ldIHRxej=)*ZJnWykC0*!nK`ixN54r zV`By_P~%PKQP8i}pPz~d9M7GXA`-71bL2D}fSrXz&Ys1`%?qqF6_HpjCh$=yzDOEH zmXAcI*h?v4h8KE7Hi1Qp74b|!7?t|jOH;Av6q$-oMlQyAk&Unu5m6$;+1ZFR#ZIII zHXacsc~%pf&}>AA#N#j#nNEZ`c!958iOiQu0uxD|=R{UYu>v3A*q9`;Af2p;--re2 zhk=;uBxEo_QUc!D=oBBFnN7u#(g1s&=gC~J^9zy0EO0lNnwW^kl6*LpoPb4>QGQU2 z%=2L~BhQ7&s}0O9zD~G~!m3g%ISKDnCLps_tw8EaLGmIYA(EU4y@B&Ien%!R9(|Cn#Nf(&3K{ia=wsk_4qhvHqRP2}g?vy!~QP`~a$Od6pa_cqD09<(V+Sc=3 zP_bXZpLdH&FX{Zfi+)P$@BS9Gh=TN(Y60N_wUb;5y`1D1W`zzFhShn^ zqF9@h>1u4k~pA^L2+y7G(34?5L@` z(Ac9i_S~|qj}(SSmEqBRVR zH>uJ$U47#SDMC`^9_bR6#!~+^x8+&%+A9dX3Uj3mX`^hcb#rvd6{+iuK;bAaR`t*+ zrkz!d<)Yq~rez97yPl+wY0dmi*VX7V$i@kS{x%f4{v8@_vV=vIq-iVz+Y)DxDkc_8 zpz$i5HpNid40jdVt!+u?4|d|OVHYg21xkNW-V5a$Y=Tdughdt%Cs>}jiy|+vsR`U&nM|jcU?a(tW(Nr- z8>mpIX6=dyJAkP#T@P$#u(I8pf@~Fmuvab!G?WupF*Y29C14tl+LrVKff;nPT}7ey zE|_UaXtaoJ1oYQ_#h>8!vGPp#0mk(RR<`+ zFrVZw@zQ}>s%wI5Z5XzytQH0i#Z%EpTs$&RHTEQ)w;!@AsOW88J-Bi(E4*`bZAkHc ztKc0}yn}h~?)8&}kx^x2G(U1m@t(>trvw%T1YN2lngYxcm13!+s5&@)A~F}3Fa}cT zcubT4?@D|^R87gb1Rq!3b4h7&mH>*B5CO2=3$v(Zj-QW3dDR@5orU*NO`x@-Q>ys_ zIXcw{m_;?uV_PDsPQcG#f27#tTxw1XdW23qgBObjpcFD`cv48s&5EiKtPI&Suu+mA z^pT|mUIYeJ6QNP58I_V$Dxy*sRm04XYLMU;f#1a;5t}6H5X9c_s)-+}2hyqqeS_hi@GI+0kofGAEY(MYr#5`I`Kr zH;ewZg1=w!_vifs1^+(9zc259df8a?bgi9y|J-}$)`xT5dlk>Vg6D|hIdb24|CM{` zk38qV?zHVH*(uv;>KCT>nfI6y!sUPVZjvXgw=0w}7k#}2-=N|f{MffU^V}y*?JM*$ zwYDqzf5u6qPOaArd=j5|N(f^# zty%?;ArYi-ELwddD<&@oJ6~<`wm_p8kU;HesTO#wW%9O03jpZOgBh8m(nhHQ6~R2! 
z1c=kRuct2vPuVCNzJj`jfxBzCQ`RY=XP6}X__%?KF3ma$jGx5e$Jpgl+nZMYk&24NI3)djjab*VC|4$I1_$P=CwctG{M z?)olu)w&Pch`_QT@ioW*Q8_%hx~>O~?z@H$EVnHmIBz?1d!K*Md#dQ&nPYZ-+7ZgN z4`jwRyt|5?9Ys$=!NV#Zw%`dVo>0-#QS>ww+3o`Sl)^q$V2>&6vBwT`1E3?b)1}D_ zqsdGS4}%5-s*Tw1lqo3$h@2V=8if~tUNsR8$ZiG=+V_t8~32|~QoSz{N!OcKLWK#T_~Z!1%AS$yygtt2|%-Mz`H?xrmd&!2LHPf08BftcLA!b zxej34eW1X0NZ9t1G#%1QTTjmR6{8)p>kifqkl;Qzrw6ribuKgdy8wlKdUti7R+b%G z6xaU=IJFB+Ri|v1hV?PBU8jU|cT}G{=n-a2z>4BFrFLNGrs27;rquMnQ5eTqRHxQh z4pXXB-))0RlO9n2f&{wf3{0dtzogKYI!~nL;v5U0nVpCwxvFpx)N%Z(XK8;=KTD!d zpvFU@<|$XvL5K^7ndStDAK^e5L=t9G0z@A=mkwa(8wZRuwlQdj@v}U;Q)B_$r63?C zfHB8zK+jIGr@DX1c`hc3cq|E9EY-mPEilJv0k_Uzi|WB{{{RPgQ;1!hgCzxUL=jPU zs%0u7f+-Y)Kf_I@aYo*so`zVM7N;<4VF}GnTEyWnj@1g-R0~)dh36ns?IZ>kCeXHC z3+w{=n1`SU*&mh0xEU1^%mP-^0=5m>YhcDz4*~9#1DB?RaZEtsEH=ymHtnLy;G%A&g^QTvX`JCp zsM^UkhY6z&t*o(Y@K$T7TWeIZF@84imkmlYtv`$P~JV5p*QLqR(n=@ zR{K}_i%eUQ@w|QZ+SvjVP?*5lg>_?&3FMiPO_R}U&sZR+;%m&<2!QB(;Mh@Y@5o9w z%aAKA#_*?9nOc2 z=6%O9_F`+G&^n~F4y`}^aqD3X!2@|q_u7dYfkJagX%1yh7omFo#$cgkP-z*=oY`pq z*4q60@_X{f?ISC;W#jUNqPH=-Yb~m@^yR&M#m-%Y&LO39Xnpt|b#El!d2Ho$$%H!g z0Ce~Gv2T!l<3o4fhR46^T5%N{UMx1Wte#ytTWn}7HUu8qOm+1o200rGOq;^AWnX!~ zbQc-_+hfLdOE|vd zCkWz+&}eu7V`c*rt<58ZG+2j86D07&w8hE>B~ znoKbFA_Pd`ixYg2N<*kYrodPo2Sa28XaQjz&a`E8_aF>9zAm%`W8q$>O+Zy6$Rt;w z0fc!>vU$P)Tu>0sXr=MD3G%1&39Zl4z}r4_{d^bN4Myt{aQ9u{7ka+EVEit+ME#@j z5*4(Jss8eJA_=1Bpd0kq`URrb0l3?6JUpBN4_GzKO1NPHD@FW~A!s2$U1))w6&#SM zbydOh0pczT^SBek2i3l1Az&m1@~Vlb2aSfPKWbu2F^Cw_l`lQH>OvG6YPz&fD!!ip zsrW<4{vAg~K`I$f!O^NXS__UI#nF@7b>c(EiH*jm@9n(bkZ(L$U`{H`$=@-W!z1Matn(rLSnu{HsYkRXZ`KI9KCdBss z5&XeDIo}9GTERKAA?GOdr$5@Yw*L|_@^~wF+zDZtXO7Z$TcGqybLco_`sD}%z z<96Gx%oZpUJD_P$Ya+5h!Yt9Uy@y#4!gv;h5N*B$rnd?m<4>X#d=fPzjA;`_%2zy6 z1^d?eE|50h*xnsHC#1BQqtX`mS*u3X5=2xB3+uD!SRrkLc%&Vob2TEKIxPr%x(eAb zwQ#de3s|Uvk~MyDuJE|d=dMGW)0J6cOSmCVcRXd47>~9h3xmBRS^WZ7IDL6oMBg4g9O+|67VJx zY@*ru0jR3hWI1RkVBjpgjsEGBBD@t@xF2oJPbPmsO1Z&`^b)d}!3%-S!%oVypWf zU<$F9Ac`+s$oACl09H8aGe>IwGg71LhJ8*AL 
z;ouqN;F9=+fDVejc;>yDd8ZXCHK-RsRB$+wI&R`n*L?qCN7p*DAOtG)ng!KDE^^@5y?z{LT1{c&-_~@$I^GVSOa$ z-KW_0Z<>Ij)P^~*qCi(M(Dk{k%j7D$8{S^Hw($1lYnL(Su1{TcCE(QscDCNPEUmPZ zEVyPxUVoNe=_}zefQap`KRBR7-smyxgBAo6@CF83g$3BuwG3bnK){ltYJ{58BVg;O_LHHScdCD!9nh+H z7%vQ5gnzp!f0Gq#cu43QhiL9dcutDNMJ=T1zynz_5I%Y*MeN-1@J~5}WB|N7jW&YG z+P{iSqrpa5xQ4SSoZ;=ejmWLibeHvzc9V_>7zJT{EuMp{WTYr+(~2nXC&=*$vOPjO zA0hv55c3E%{0?en+20ybkEPp3&>$qZmY!(faD;xL%VuO@+ Ga{mJ_(CfJX literal 0 HcmV?d00001 diff --git a/model_executor/model_loader/__pycache__/tensorizer.cpython-312.pyc b/model_executor/model_loader/__pycache__/tensorizer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a42b838e76612970ca0d18600a50989a9703a2e6 GIT binary patch literal 34493 zcmdtLdvqMvc_&!aUHyJH-tWd6O#lrN4-$OJBmsh?NRSpuS%M$!CaORJY&=}uAPEGB zAw`M7n7aZ)X9%tB5uB48!(&ZC&&kB{naqqN%duzXOty;+xm#^$Oiws_Hgjh8fP%bC zw0q9(@4MC2-2j@BlQZ+r7KvN8ZrxYi$9KPX{|AS|%HawNUmu-}a@^n0gLawIk3G>t8GC08myDMWm9w}hTrplbRLSDzaMgJAP<6U|&A4yK$KsZ7?Rec#-FW>_ z{dmJr!+7IRBTKi2o5q`mnpxZyZW;Fv`B~f^-ZkDj)XL(HaNBtMP&f?EB}ZedPc`d&C+MS&#Ut>bX2U8j*0xNiQ~j->G+5s*1RGN^@+Y$ zxS@U>H^7OtuW(|WRQY{!| z4-JZ~Qu)|F)$(TVVztQQ>)Y`4=bx5mL?^c6{b$91bV}@y8Zcsc>l!hNyOHm?r`5yC zQ+q~n5At0QJH=h7%PO6gj!7kBtJz*@u@PMvq9TtndWxZ^T_|};42aF?dF#gW^J1qo zDE6S1&q>E9ADh?o?6UUuBG(I_*spzJ?^D0aEbd477qJ44iU-7l>^lx&)fmOYuNa1g z#3NWs7#sBN8RYx?3}4SlhZ{Kg8$8WnhghJ^*HEX_IX16-UG^>(W^!~rB}Y*vNB2{5 z6lZeuJS9g-Cdb~Vj)pFWrL&=_snLlmej#NHMa0qJXv!jnqM_k%C=y8- zMne)$ALZbV;EJfwe=!7I5pPU#O zr5@SnU08igInRgWP&k}XT(u2cl9em0*Ix!jzh9|FF zkz_Pe{QTr_=(1K-|M+D|6zTKP0j>O0I20Y3l*cLeiLi8Y6r*%hzA`hYWp zhlZuAlVMSk(aY&DeS@`s;tDo==NUb_JIvL1mX|`ZGIvS1ywzQ@tZfSG2 zM9sOQWxqvjiZz;PPp+YseH7GEP=`SKZt7!tTJ_WNXv&CnF)dFFr%xH=w2 z_&siplLzLwvD_R;r%Qv?W9bt|O;tmY*LIA^AmThf$BX*w+)X}G!*fwV%R6S&V%htc zB^%dbbDUP+1#U+08wO|f2Rb@rI?awy^C?}VD{B9h)jh? 
zqbc2FBqd;1Tum9V?jq5MUne(GQDbEIsx&S|s4}1Lvw6^y=h92knE7aoRya>83P1ja{F9q}?dV9oaPzn3_pdjz)slu>{4S zm|2Y>+Rj1Dg4~IQ2k;j;ieQ0z-(*dC3h$NODZ5*d@YE@ux|N=|$G=*uc-j_?Nmo&< zY)I)5^l?k0+Ul`^kqcuana0Nj?pJ? zyRsua&qt{wyXN?4W~Q{WI#L$R%+wgeD7D<`UF7GuZvyZ8wqBdNm$3>iaDM&ZY~|o& z)R!|B00X}0q>o_a0pIL+z^Cn5b(F5%z*Izhp$Rd!5F5bstMTbb)OT6ReNAHCd3*dXe!SHe12m}7>i6!(0U1pk(6~bGCC27h9-ukl)#|28Ldwd zbV_*ue;Ba3I5Q>n%krcQ$Zx>ZPD?2xn|d+5j^sTUg;Y^?9R$^$22q!M81Dx07olli z;MN_USBGv5EgRyF>V*^QwxXqxZ;5NR`gLbz$Rg_IgzLO)5cP9{s5^v0 zI&O}i(~oj z3lS+ik`hLyCx%moD5eBkoK{N8s1A07zQ9M9Ivw6%5Ic4FYEVockip<=?bZ636<0&d+`uNtulpXb1tDb)2I(7u!IU)^1R%j7hqyf${K|AFtiF*?qe+>N zYgUqdf&2_TQt%@FBA^DiSGbRKMnj{Thu&=2`?PBbKyLn51O&|&3PJLvuY8bOgX$N! zoZX*+zH;MlR=S5HOcJmp-O?~ojQFR)4xU5()O&V`d=`>q;ZHu8vK^k(b9=Z5e*=s1 zMYPCsGa6n77$ZtIr=;PiBx1i01G^!NjCQ|7+ZD^K%{M9g1}7&Z-{=TH2f>Ev2;~N# z@!?pGN;jjv@z4a0efgm83IJAU!Z&$&40ZY@M|`y0e08(dIv+s`DI^8}ND6|3lc#_b zos-o)b!vPnoN}cr3#KbenYHeuEEzbG5~xD?39J}9h zbiQ9ubN9#-J?HFF9GxE-Iale1PUmQT)ZDsyCf>Y1(R@s4J{D^_{?>sFgU)d-2jo04 zp-{}x`EkUCM*qTe)USWrZ#dd%qK;%PAsyT7!v|#H&m~UMk7;~{OlFSDH{VamY0oK^ zjy@%)b7#&G{g4?{jz#2O;Ux1;igw6~x3;m;oh3!U&Nl>Y4G%d)!;llyv0F5%;7sz2 zHS8>)@jQ3CP{4-y=lUD`_xM39I=2kuBpSp?2gnqaWMD`sU0`>L zAK!AOSq@zRrE_yiJ~CUCXSoM5ucy(Y2qDA&JNI`BT&(Kkk8ZwMyZn`v+C*KqQrEp! z-SeZHe;=V$@^Wc8pT9C7PNCuG=ymA?I%pE5kQ6t%b?KeVuSL} zFg)6zd4?w*-FyK z;~u=tvnTqvFV?ZCHZM!hNd3J(^ZO!q)OnbK*1O|U|Auw9pRv$f%xppRsI<3xm zorQ~fv>e(<(KukN^t7n9!}+JQQlQn5z0YM@p;7%ujCaXzG~RlKj#QupLW{Ot7;lW| z7l1j<^Yi*S{gyFw?i|AzEVHdA(Cs%=&cKfJ-_4({omlns&D(lIk7NHo8L1mPMoQzM zsN+Il2ngWc(ph(rZE(LWWyx&+lpds5%DZLDgYz(aO%9>}(5bWM`YxU6zknEUJ38uR zLfcY$Ks1Igr3?(Zr9A1J!E?`@8aOq0^vnfdWBTE6B&El3nzE@dFgkfnnn;-^Vds##iQ8k-m_OFUEYMtr?`A8M<2KvlNHskANtmzrQ^xgz@HuZ-k~@3 ziQY3x@0ob(*`?#}6}D{Xxylwgnj1FkTuJ%8@jK&*;%23|c~yuPw=dc@1dFA2-Ba_P zr}h`_A{_kQI&{h5e)a6Fvk7~RVy{`Tt=Zc^d?q~2aZhv7U663sK5*Brd{%Kcz2~k? 
z78EB68kB;DRsKPNKk08v_>ZjlkG$qs)-S)1?AW)gf3LI+z1Z~(df{z}d77V8pf@o` z{l`zdXf5Wc{rHJlee-`K-~YI#@d1jb`Tvc46%+7s+du*Tr$>wXI`r=d^@zXIV(l;B z-sv>-9n`1rP6b82Wlm?{=Fge$db_n0~^Uko6}fL%-Yf z6Fa56{vjjsWMmrJDosHpi>v z)(j^T5#iS}q(UYdPo~z9aFXJuC^&&2LOu5R7E~I_bVCMPQ<7)f^KAfa_XQ4Nb7JEh z38b^%ql$JR04lIv(3B|HtrYA|6m%;E-D?GV-!G|Hz8EWMTX|k7X^XjO5f$!JNt$O- z2CN$Dh)gIh!$rQVqQmkC5@stusUv?#Md(aJyB`TUOZ$eIbC)bSS>=QNa)xosB$SYe zA_o~s1fooMW6F3297ph8Eg4>B%5o+&BguncVak%4eR>M8W|ZQ#^zOe7Rzb>oT%{nu zyH$sl!72UdL^S0J2164QlTpUoh=A;~M6ZJRNcetAI72W^RI4N2FEFvIfJ@eN-QYo` zoauO)oM7N6We3M1Iy$UoXXCP+bR^V~O?3p32#HpHmrJ^f?wRkH@44={R*Jq``km55 zZI@Epb^p>2U--cbiJk$aXCUS}xp4CR(kg}Uw|LdbcePnl+u~&PlVS z(>#(_5~Ko6nncnee33oKDY;9fGDy<) zX67Oj86)ar+eLD%%;3Z9au24hh^Mhuv3S5|6<(J9O+?aKK=pJc|{ zB6=&;N_80RVs%`pe|S@pep4?Al#)&@rBukz%3k4y8pQJ4QG#@_0#dU^u~n?Xy~%=n z^iOOPtMS|{wlmpm3uJyi$uCw?jVMzK$=fcm4)wK)^$6R<288WmBYIShxfaAGl!lxT z&n;4cSkB~tex!AXyO47?_;npJ(U*fMXAWDD`1rZc;;5W*F*!{p6VMt+G)6~ef>%S4 zt0~*W^mvc}4oJL|SIWpug4d*(kwcT(=1m<(sHSkK_pH}$`%Cj7`YmfMM(^Rb*ReOQU>~@$d*Hy zC_8p2|7QkKvixyiRivQ$}?S_8j5e=Xy-ao^;e;*jB^-oN+d4 zeSew%A3PeB+81Q+5fd;uE|?h)d_i^gtrm-zS*{FCKLdu3-!wRDXx#k;QJnqGxry+M zk656@&}32o5)slaCg!NNqrnB8A>m2{{M1qGBQfB+6p^y8rJKXy=?IuKsO-j7X(A)N zA)e})$>)yxLSkr&I7)3yhO@J!oD{$_-(>WvB;Oc~NP*e;^YY~NQS4#gWu$KJBuRv} zmJRq$O^iSiOHJpzp)X>>pqw;n3Cv!!<%3un-Sb65*QAJVN|uHt5pYood|cd-?mA<( zh68Hx$gEQp_hp{{)ujoIzYGd;GJGAZ=InT)rGW3KkBI=crPDa)Jf>R0lGYx5~EM3ehxUbZ6$0D%%211Ae!>4+}}S^sGQQt5NAbz`T`TaORrw0 z;P>gh^?4F@G0_)Eyh!w)vO_#KLEP#|jG5d{c>|Q!gs&l4hJ2fn{wD=(@!P)AInPCG44P6BYWOCmB4V6BK9Sv+Av}?&yG*PlkDcSYL@wk2WhaB(d z=hxlddzL$v<(8GcM8iI%VPD+c`<4)M_iY$Bi(}1PN&+)?!dCsjR=v{w-NS!$c->nG z&bQ|nakvYd5UM$xTk<68_dclKyKeWYR5Iml{YZ~oA6dEj#_u+Nr}?`b-|4vDkm&4J zI{V{wCl>qG>@`VOLBdt{z*UFHtMk77*yej{7oU?YWY1f#xJBr#>uQu%8s{(6MY zJYpb@S-hQT)Jp`XscNMEycDg z*73|+2bGS&q`f3zuU72Uv6_xG`)=*^>g=1JS2|8^f9ow=G$jjb5`~RQVdKvWn{V|k z9$oy(L$_zS;(@y%Sy;6Exd(*}$8ayeJc`12pB`cNbZt<^<79aEO_}1}H3njVg(1!U|jBYE&_ zYqn(#sysC_)Hk^Zc=s64bywB079P>U*zGPum7 
zL!>DdjL_(^!{M9gB{(z$*i$Z&9kJNJ`>K89R+nhpbsz+9i3%_4=^>~^)qMA z`bHzZX;M)ki8i4?gr9{$&@u5gX{AI0;A-1*%c{_f;6%!lKCZPnv!@BEofB4?I|aG9 zX8qZIrZ?2nvOYso)y1YNX)8(C8WdYY%(naf#YE3ZrRQX<=OPZ)R|jtmZs>T=zF+J0 zIDGZo-bzcfZD)S4(iyO2}R;@SHM zPfXH$!a9|4sV40sXiR9erEC9BUkxyccLPo;J@qq{fUz%oW{R^>DMy4P z!udo?S$IlJ(jD+rAVPVzXh)wZ`3!ZIC?<3k*v;&%*_gR(XE!pFLyX^Cpq@Jg+HtJi zwQlKB`mi)1lT>s^-%P5p-GC{b= zMnlgIKl!KhhB}v)n51=fFeb?O31gCZJ49nbh&hdkb4k26dS~?RHN{r@2xHQ9w>@UB zSvR|ugr(u-p7+wLgRKhGnca!ft5%D#!y1w=Rcr0F|k-GH0|WSpa%j3YKl6Gd$x&v)YEa9a?Mci zG6lbbASGO$oD9pa(DUz7aFc@Hqu^^4EK+cb0-}!D86fvk%t}Fq?v+jS_$mcU6wtPn zzd`{qa8kyzOaoB9Ln&?w{(u68UD+r$jUZ(K17j9g7$n*<?FXMBl|ABy5e9amc z{K@*}m{5rI+jnP-N;bE~gwo~u72&mAaiMiv(TbXdlZ(SkeYcRcCfV8n*-|%Zy0C1% zb7ZAYDQJlc9m&qUR71mZWaaqs&DR2}-AZ{!T-clJ?u`lM%Y7>aE5j?LuMMtVR4Vtx zh2Hmj55LtDd+zzQ-p|E^>J?$-!fO4>3*T|xFI0Rzap80C@9ufC=VuLTyU)=lElKyT z-nn{j^3J5(P!3&=Dz^H#&<_#XO3(eqwbEXy7(Gckik771<0}QrCsDlvRWv1S1xu%v zhwlusBJ~@(c7yrDFY(QW_KoNGdPB)d;mYu8*UHuJRBv*4+&sj z4uQVaxoJXdv)Ev8K6VO*&+@CivF<*lwSSYNrw{GCq5MN5Z>XgJDgb&z{fd5r!);Z+ zNw*EFV5nP;fDvM-TkT@E``uKyZqu$eRH*BQmdqCZ0l}bp{B3q4(3Gyeh}BH8?J=Zn zztUQ0UQG*4-I~1Ihm6?PpT{l5kK{rU`HM(n!3Y705QwwS3KV>{*z)v z(`seKrTj4RWG?Nn+oWawkCq)~WsIuq&>)#1-!Vc`W09<)2{J>oWP?1%@`^xPlrBgO z9imlo=HtPB*8|o_(Y!h?)e{yX^u8np$HQa^%rqgUYxY(gFg9?8?VyM=hDU zQA>uWt(D=(Ze(jM1SGdzsD+>`yHE=q0^6<~wG?j4fJf9~(Ngn&QT`t0Z&BpGMQp5$ ztVFAseGyZo{}Yn>5R7LR@|J4x>>a6u$8Woknnx}m>R5|;WC5D;K5H=}5dfx?r*8R6 z(ORv!{H-%0iOct8L?rc)iGVOxuho&gYdd1z5N*^_$C|WQ_C9CO(Cm5RoDsdu^F^Zm z5DIM9I3xmlyY`)?Wd{v2sJCoAxsmV+s!3V1l2G-i|2b0qh4LTcCjUDMUZQMf5|IX> z2hOTV3DU%MSf5?!3LZUvy#K<5;IsWN22b@d#2{6OCM!CL6!9PO)#yvA- zW7ufQBW4WcRK?y1Ga67?j%oaqNE9;U3(l5gNfctxf%UGwm?CIP3}N77rudE}8+sBAroojc;m-Oom-HC?7O2gqdhvN;$V$h{KbaqKdxLXxB zwCxr5{wEG9#ZjQ=?DbD{+T+KJ&VKhuPyb>4PYxSS7)>c7Ge(jbLc8S%&Ora)##}M7 zANTDStdn(b=rxp{9nKjTp3N%?S*Dg~c*B@I4FC`F3l#8AA-*F)CK$1orK_Rqqj*80T0G}e zkyWLD43i09$$TWJa&`~rWCY(ZBg0iQ(*9<~vD6IgTfkjZ4Q<XxS`NZ7{`88`AAgds;yf%go#D@jdBya;Zf%D>!dd^ZJ+HgKB3d2VlK{KR%WXwVf&IK(I6^OF@V< 
zq}e|?HR6+)j9CQpAD==wAgsn}kV3su}4pSFcHIUT%LRVn-4co|^BCJ-D6|h9s zNwO*;({xCwCwvg+OKcD_wZV3rrVVydp$X^j4;t+GnYNr za)v!yr8>eBQuM~8d`%r`y-Ia|3H?$#htY;6RC`(zYW8ojrlRs>IO4lHd4mkU&>qvv zq@{>S4U;)BTf0<(mQvcP9a;~nY5_hfGC3_{2T(@>zCP7lh1m69CPT9?w_(AKVzSsQ zi8GLmkCG|a43?BSL+H>P}6x>vr-AsTrw{%Hb3*F&SK9UCbbP)n*&thx+IA`|dk zyo%N=MuAqIIuzda_DEN|s@g@<)IKWCewijoP4dy; z(|(~&VC7z(9=;|;)q#hSFOwtDCup!@J2r%sdwo=%oFFrQs4PI23zCb#Y%yz9)ho+s zeD)f(tS0~CJFo-qdP*m-3h?Qg8r$p_scr4)ANb9=_+0+hAeHfkM3Ibu*-;R@B2P|F zrR=$BGGjv#p9#R>SGc7NELScetdx~C8l8E*@6SSmi^3@|jd z=e(~-7Suoy!h4iYXD}jzGbaNuq{3t&ObU7LNqz$glBi^Ro(g1O?dZL*YgY4^QI9vL z9$&Veo~KmPkj--VDOoDAS-NwwxH)(6qC;h9kkb1WTAFxigz0SlHG*8}MLtn()^4?r zdsR2AyMnsD2ugNd2la?FhJ$J{*B#w$w*V!Fc^%owiX?K2wcQDH=bUBDC1Qf}nLU}(TtfUu#qWn7X-5mngIhfbzU z0Pn=qm4AXKzd(`o(#NE#*%}17_OYnqnw=C9tWYG|FKl6Th)niUqVST^nK-~G< z4=$}<{L3qEi9hRm@5pBtqu==4^2L?zcu|YuYPo+&aX$B0kF1L>;NtF@n7R7zKeTY3 z&+vc$pPkP|h<~tLaI{MJm!6|__P0$ggnrL&!WAYi3}cnb4=F|#CNk0N@_(gZj)MIN zQkFqzKEVwOG@>>raT~|=4N9Yc7&H;$41Ws8bt@lrL#QD)upzGfw@?7zX`A=Z2HL%K7Sd{V@(Q8xwkFlAup z%N9z5Ry5li#HfB6rJ>RK`RE1h9e7Zn{uvdLXP4v?!m1XN*acPDJ&>zBLF`2Mn2?X5 zXS4rJJxWyVVH~9DNtwnkp!>&|X~3e?^OevbXxfp}fi|3-(B6UEouRtWp@>Y5Ygl*0 z2gf23!+RLADMDuuqFgx)cM|6zhzE`WpCZH8a9)z~1x}Q#QLTnPsXbBc&a+54L!;DU zX*6p)w%O*3Oap}gf;<|H_;N-30qkbdMT$zQ-jR{{rHIH!F%T;5^G(EXsCz#1p0h%eFJ-Qbx7nV$ojw0a;2dKE5KWl@YX5by12I?;cZpCtv~m+ z!$XP9qZ+wYDYmMW;<&AusbUD8MyRoodY#eXd~Dzf0&InCSyjZpW2@>Idi80mst=x+ zzG$Phlf${>vvgbzyetA`Nwo2{D<8j7`o*O4>o}%EznBUlytP zc$WG5m4|ovlrPA`!F)sa{q#XH%_SA|ePP9g?XS`kmq$sB zRRacAI?9M7A%ve{fN2$AXfcx>GP4bh1elHL_jIbHj<(pK-K+U{0SbP4P}h)jU#EUYRr4Mb!Ew)Ka!wUg)p z^CICbxmR_k>R!#F;gP*~X?m@?`#pOP*^All*c0wL#a*|uM{zf<8WJt1m6p?S_vuGv z)vs5)R)IVTcc^=9TABKMrPV5~}_71Gsp{s~4snz(E67r1% zv%!*_?4dP#Z_>AGvH#UGx6b_BUiG+w^QzhuGO&}wgu_SfS}zN4F-acefzbb zJIyiUitR7(j7^tU*KbajHAFy;JcpfHYZmF#jz}`E&mMJfq{U;46WGS3x**7>Cp*W3 z$%k*CYp45_ohcV~FwH0Qs%iF42Ky(l*Dr#R;!sbK;mPqS(7}FQHliQEA(?oa5C&BU z{yfD9fMNC0q_Au$D?1n&U9)xV7BOCQmMNoZJE(%KTyUIiO^U54X6uXz 
zod}bjx`d||-*uyA@!C`{Ta6Sfw`))KRI$AoT#k1S>HSt{15y6=BJ zcK+het1cy7zJ#kyakaf!5p%W0UH!0|u6~}6Rb5J0DiljaqUsXjALBuN{+OvRye&A7 z+4XOGy+`*M-fpl^e4pVMXLXjZC}w^t+JJd65K&@1`Q z!nEAT5G~MJn=_1%=s3?A7tNed`Q&9Fdvk(c0cJ2c#^+uH4i)?#tynrH!}$b$(wT zx8GLN{no+RW#S4F<8-GD4`Ww8g^mEvm5T9`u{WxhbKXTeT&}1 zFcsfJR1m|LQpMZMY~kQbz6KYjc7}|pXWE-o(5emu`4I&kNY*0qf1-4IuH+`ACz}!& zfKVS;^p6*BQ4`%0uR9a51*-ouHOW85lp}0C?&fSnw?rA~b z@Neo3V4!-aqJw0?3tP|7GyP-GX$y=ZB`W3JLk*g-*YJzsvnv0y96qkRi@pb=e{zQ4F zQr;Oa?~axACH!9}F6=}`fw(j&MoHdHGChn|V)ITXgt|umR>Z59|q+#VuqT#sGaGVVb z+_5ZOy7gNNC)YjI%X5jEBTCJYw@Q_oQ*qB}e7U>ot_wa81b0FxR)pd;p>$oae(l+p zo=uwltIxc7{H?|xJ^QorkMuf&5w>dv8@-CT{i_GoguUsUF?Y+Ve@*CqB-E}nt_dyc zLdA0b8no!l4#nK?q>H|QL+~-{Av{4M8lltU?ITBf2O9K0-rF=#YIwKOjQCGW4Fk27 zpH`VD-e4GLH!*JA7J{3I;(GjT7GfQ!I$KdZs#iX?%@#r|FBS_{PF_b>Kw2`W1#;ve zubE1T%DB{LN$fejs*nH2Y2+bCy(EKfXg$t4rR~Q-_*+1d;FL=Ddjl1~0l)^m=XV)F2%QcCzeG2|NPQ7^;0UWSeDwaZ$ZmY>%SQ{3nfbSi3y&VuzrBk;y5IPAt3dG% zE2Zz}DSps!%wT$lGb3fop3G37G$fzK1j}W9<>>O~WxJ&9T8PxxcF~?nzs<8RvGcI^ zMw#0nM%`u@E!k@1;wXe2>NV!kV%dAH>wp=ZNYu-tI~R;TUXeI4^u%5vOV$$Ud|2sRHX{k0`SA*xA7DF=?4jJ$W za4IJ!Sk9#N5M{jZZ8*=7JD@` zaRqYVlEAh~HAKoHGvtr~kt|I4I+m3RY<`Dggqkq`q_GhQ&d3#5S6lf9USNpZ+JV=4 zj)O#mi_D@7@Ssv|D;`wPxNa_3I+rNyRPf(?@&06D?*#?_%@-f*g+k*)Ygxj&Te0qb zY`_x)O$8+%nh;w!16ixNEMaPSU}{-6J70C)aza|_IK{(m9A~(dbA$QNuxeKs4~!EI zxBjFzBdUB2<{o$pi-1l~oM_zUZ==Jx;z&5>Mv8iAWDE#@f}-q#dtHJl?1GICKAEoV zu5`1fbH1}#Bi5kUsmnQ=LCWI?OZ0!l!!0WH4Ht;5%(Tt!R%`n6tyiEYSB^b2Dd5?!l5i_>T`w-cAyj@#V+=K?t-V*j#j<{?z8 zo5ScaX3P2-h*0d`y4D}{y*0n2eWt3T4jb5GxjzFcxMd%w@n64GMEJ?XW zuS}4_6jZK&4NXL3!rW4PhfFr|AM0;iZk;XooD5Dop;GCd5Di+Wvg?;I;{1=lL&~px_}kfFl=zqya@n?D8AhZyyB= zbTNQcLR~Z5KyM^@9k1XyfWfU@ly1aN5KxV9Jg<(I{D4XmLJ%R8olYH1W>=O0uiM91 z-jBK_2)g+YVGc9w#j)b%HwxBl?V#zPn&B!~FDQwX`r`$=V&2x(JsW!70gGupQFVHd zKvvG`O;{QfOT(I_X|-wHTn2IuYONpYOjXWbn{3EZ%9T~TZhg(VviIkuyB1F-3n~%? 
z^-4kgT0vvdUHZsfxb(ukFW&j$O5YktozJejE0QHmiIN_rq$gIq_kQS2^S^cc$Z`Lx zvBCjBRFC&zS@nhicfeBQEACQ?y8uz~Z03q9AGwR~+3wh4m3!{@{_yY*4zIb7CXob* z#PVlffBv=Szxl$NyXlDw*@2LGc7eEhY~jjU;f2!L9J4e%X`qTY1RrbfKYmiLzWX?G z7z6vSkGlK1^zS%2`102W$K_h89{yvbyIFk77O45CLEHcXVE& z*j)rEH{po^I1+{v4LWT^8NbYywyXR)<*KIOHf6Ol5U#Niizyp{U76sxikcFnXF7db z_=0~yHK-JERzo&zqLDJCHDy(>PXmZwDVmN>O-I=QqxObT%+?I_m!HB2%(&l^atMC8 zADz03Nr{|B1cV`!_D#`@)?stTe1%Un|=C7FcCxJ~HS* zOk1}i$q!2_iKs4UNEGZ-3iid!`xZ?xQy+fRWT|<%d(8po#Yu0$y{0=&cUu;Z1MoU) zA62&9FIcPWfzyD8-r9tBx8mKs@NCjlnlLpyFg1K&KK`({R%tjEZ|GBs`xb4FEQLvP z0T_r@W3n7ZCUmdwOw@lyss9WZh*sl6i)+285e!6}A;s2*B`r8Zimf<9AiZPEzsDv# zFB0NyDM*-W6m!jrVa?o>Gf!;mKhsOY0&&QQHPxjL8VIIDD=T@wZ$sXV-x zyXVg%Zyt>A8(0%g;%9ai&OF(JFNryPA3rHXLGa?N#*ZVVn5(z_9=H+yvGYKGfj;dn zxWLd~X-e57_8V3z*Mkt#l$%`XgS{(hz8ANg9E7zqj32=5(>Vd8=#e=?P%?c1;^jF{ zG^h&*#pm-`f>L?-1UaR2aug-=a0_r+2rrFfX@q<#G8&wQ5m@HrpkIYSjb5$a**jA6 zhyanA=Tv~qJ5BebPo@klkW-%_A61b3HgG%eFAFI}0Y5g0AE=x+VYEzRwb`1q7(`9j zMs(q{GUufTj1Xh-`A!RMEJk^pg#;#X!OouF0h{~MFxWEtXP5BPVWUx|56@VBnhP1` z4lmaMA9xB;Xq&@s;bO*YRAq*ongFJ)sWh(u~?QF7zQ6xuaQk2GI}A^dP2 z2c>eV*Gf1jsh0yAXS!(kEjpiSJi>UB^aDx(mT8cIblBn(9=^@`PCFSa8B>f@Q3N8Y ztjr=!5HoQKj+0XuAAdorP;D9hN9kwb(kvX+2A+Y|e}QbOHvk6eiP*!>one3`$5o{K zYm{M;f?E{qrLHkZTgHI@^*sT#>HsNK5d$JwBh$*Dr10V?O%+SOz9_9z7e$knQiYL*D0j5NCJQsp^m|4XF1!|A7(!;Ld#cym`W&W#Widw37yP&^XEHa?E8ssseahvTU5Zb!U1pZ-rI~{%JL` zBQ1{(p$2H@Z8?MipN|{>Ys~g-9S8u}CxQU5jrp=8q{Y%CgJsT=58UVUy+JFHy?@oJ zjbJ|FKy+$vwL8d!onJ^O=(Y>}c70nnZ_oKY`)~FYIb=t6Tpe?ce0m#mnU#f+c8s-e z>jWOPk>1(wXh6$7mLX2G6li(6qggGByfdSXNOrDgblVR~Xn*ogA!EMvlyr@}@r3Z( z&#E{A4sN%|>OgB`P;Fmmi&tA1%|#+G=Rtl=-J94z5i5c32??%bvjV=YB~`;RUk=}^ z9kLAYmRSeMwzO}`seMns*~!O&%RzXi#}0C5VZp7ugLu<%>iT*|4X zspsdcJ@&7-% zR=3146ez0l5*d-%gX}8d$;22h)8p4EAhZJ>%csYu@S}ww1nC#uXlJL~DqWdX=a(66 znc*F(6s1$XKz&=Is$QfR>+0`Q1vYgO)p?+wrU0g*J_xLfCiQ)U(U>Wtnn9&VRafqi zWy$o*Z4o*<7$%YlRFtGRvrJTV;`G$WS5b&usYm`3A`9H6n=_RB7h(UeTrI0*aaY&E z$%nqWm_SZ~S|7nzul0^~x%cPphD8B>z+Qdk)-%h`tl3(V<#j6;((Zf~pH07sRkW_z 
z+P1z**^2*{gYVlt8xGFX2;+WpLBdp_m@1YB*G#QRZ&kuuuaH7XyW(wMy%zT#xMfOO z3KNzp#Zt9AvSw*qhoG4tU^7RTbXHB0$&Xx&pp zKgqhh=WgYqk;J_f#G%Ypnjs>U^hsM4TWieP7Hj)V%zPB|D5z717xXA-L?dS_Pgt83 zYcqJ7cs6rZTheTK)pX1B4S@~B-nhF?wa{B_j=MY8%w11xl+($9zV>!1-p-h(E7k?S zL#LsJVgYkiu{^udAG18W8cejGR@zTLusr*-uYed{3BeL-(aI>?dc|D7G5}qdE%dI5 zv!3C>;#D%`6Nzb0z`Nh3a0v6O%O#7h0(5PbZ^O(oH$s_gpXqT+#F z`gdEp2lnaz%GQ2Tum9;T!$7y`r~3>iIn&SdhLd(v+PBX|{}-5blsr!>_BsXsjRLkm z=p>QJ`>IUHk?cf_emhY`t)UPR2wfhQ36+z9$w38+0>bW8=UPOrf^j}Wzql zvQdnfn)CxuSEQTrYn1R6N^o6O)i@xKkb+mIFRMI#TJ5Py_A4;z_74QrAN>X3Q-4)k zCizy%j^8Dih!D7jB>{8Vm9j9(fgFO!Ny@^M%;^`NvGr6JWJKM{1r!iPo6ewm(UwWZ zmFi+usH`#VmxTkWwL;)J{k%4|aN5CYMtff-IymKICAA;ej>t?aPNikk-;ai_1B;$yt-xArJDoTG3QCI;KProD1g6{9Y#dp!yng<6E?!0S_{pLwJ3}i5LvZXQ%`N zh07HCYYI9kARfN@8>!Dz>^ud8fH4PZs)NQDJ^mpDZ&2{RQjk&UW7_0QJ1%9s$S7u} zV8Z0Y0P4gLWZXSspE2T#0lRIseJS%HOd!laj>zPP1OK^58`dp^D?Gnx=6LrnIL|LQ zFZ*x%1!wsM=lTVQMB6X9o?mjkajy55-2OPX|8FRd{R6K31FrA`?!X6}|G#jhzvNzs zb1!_r)qlV}{{h$f0e9#FuIE>J_Xoo64WWQP$3KDfo^cb#e8ykfIm4b`3g;E!{D?&RJydXVl0;gXwE8 zzVzazfl@b0ckzu$o8#5}xAt%7DehPA4-J-01D-dAcn|MP7Bz0@aeruVE>169SZ;j1 z?X@O>{t*2);EoX~@NJkVX6B4$_5~JtdV%Mv>mHlx zd1KO3wV}tI6~Ud3q>4>?vvG#+=8Y@E8ys%2BZD8&{qHue72$mR5My_ae;>aqekktQ zO*Rp=iIPsGq%+plA1^rp|9p8;%IXtk-3oLodrrm6PQ#^No;==4^_YzX`s3acVCWWB zCki{1!j9PP{&?XD@FR94l-4CmyOh$dSohg@>0n$a+9;sTeHG@{x*IuZ!gV{KIU|emkW}xC zY)hbEV_~2&l5U&@T5ESvtczWs4jdFgKMMaQn~w$h!&%87JCFevTcFrKX64iifAyR@ z9FmggO}E$9eV%jfo%6nz{~ieV2o%NfQR=H!LVkw>t2oP@Edx6Di9{q$A#rSN3YWIU zZJar0kK19+uGrI#xP#3(6ldBMcctBNciI#8usNsVP5a_L)^{oXG#}?#->n4F!FZ7M zJxVCu5N}8~#v9Y&csSh@Z%Q}Eo6{}v7PjtHTGMUuwsd>EosE4;N4hiK$q^foc{%ck zGfy1vnj@0`mt4GC;^m%Mn-sX?i1*saSt14R5Gf=FAKA;0!V(v4_&pA$qnzPAD`#Yy z(6ST;woE2!H~bf~lB~R)&CI5*8o}=;XhKmcW8eFVk~Y@>!8esnNR^R4l}Tx$lD&FW zMrxrm2~}pa5DKPTM#CjJqh@JpS*B(BU6QQIG?h?bPSh%^Vj9n2v@nXhro$;!ybkB6 zB1lGal?+=l2}EjeZ^)^ubDAoqwDJerjMl5NW^N@WXmSo#lG-AbjRxGed?-c;KBr9u zSV^8uEGn7^{7TM=lD4!UpG3wQ4g@otY{qa&^7T|wHe5-@#AI1WOaQ22h_N*b)qRp9 z^R=rCgpfz&c7ps>4!bOD_yA 
zq!tcpKwH)h-dr3xe(c(aJgB6u3@$8bbJ@%w3Tlw?SCntc$wlC5d8jar8LW|>ITKr0 zGD6ouTcm_0iza(%FYMV5f9ji1-6q9`*8BXY{67TmI##*Wp`xF^JHGaVjlS{Uc8_m{ zyB=_BYQFD$zWbf^@P#~o;TtDu?A-!PlElg?-g0N_U69Y;f}~^;=OhxhNgNozO?J%M zR>>?U+3)air{s`bl2dliIwThuo=0-aUfC!6Wq!WXB3Nx_Z5=R>Jd$VDjy*6}FJJ{F zpBw_rhWWlKtkna`et-$Ho!o%wka!q3nPa&b#%S^_Qm{UL12thumE7dwkr@(gs&xjYgIGR% z5v^V@MJ+6-LZA_E?( z!?Zq=<6wjuO;}dthm9zj6|*n!3l3FQX6as-F}$LfR1&Hxie-bpO~FqQx*w|Lj;dcV zoFTR0UPTD%fYLvH%_CldYAScD@#)JYTCo;K^&Ct?qn96|5OhyNZA2z&n4& zO@j+XMjP$dAr#Ioi8oT(TsbV2j~}UB+3_iWeE_iOL97^y@uc8{nM{nJSjII;E%)vg z&o+Q#fULd&)ooI85pSU2>Crts1<#1?8F|cacqWS;-kLk1drmxl>xt)r$#NH!A$(@0 zw*|ZkbEkmjpTk_sX{>OywrSbR455dBpvAgQl&h-I#UuphY8nBtN2Lu2XJbqzgO`( z^DHkmX;^j6R&RD2evN@89k(5~shW;DEl|>CcP*DUxQZa8dkt(z>bYPrL z>^il1k_zBw96b)AnvVL6fSDg#k%l=g6a#I6%5bven!??T&@fK$xQAty6#bLokRb_- zI!(uiWa$`INXc-rja78KjPs@&?1YO6P+@CgRfJ-4VR8AuZUcWM@C1-me*zU)c`NC7 zt11~=-8)1ze)%hBDo`3g`>EY%sPAh~RP zs{k{Pe*b85vU>gEDjtt=h<*?u`gEW zJF52`Wy9WLNXUBxDga%23aVs3AgxNwwqPKsB=e|yJBnp)*3OV{a#7pz*HakEs$S3vWabOW&VW`E#U^hqz!Qlo zyvo}PGqQ}~tm5_mkI_mf!wG7VrWUjc`fQd~j)_qQBcpj(H8u^37=-U-`b5uTkH-r0 zFcu{&%{zq?EXZJ%y z#&u!*v9d0}YeM%XzyD!N{?O?s{F!Hb<1VlMgdbz7v`vdD+1@!YuUBS!c93wb?!kGm zpsiQ6BWJJY86@u71hC);WKrzRas}Qjfm$7#pGOYvC^nL-T;P`>T-3)DD|cqClIPWj zv~-c)f%Q7sU-f#l;sjmC6ktbLacSsTSb+=`Gld<++Jj}dcx7g#-eXM8Rd041ez%36 zgY$guu(-YA-Wk4jj^X?8ZE8ok<%TaT=j|W|_Y87$&ki6M!d_169Cb3vn5i2MvJl7fx zczuaUDTu)-NE2m=UPKZ|#|WAj_|hvb(v)ea%tl$F)`Wm4LpqKX%k=`-W5#tApueQ}4%!Kb0%&8dNFqB|1!-=mZv4D0a!*!%0pKF}LJDA@@l z34EW<_igY)&pqGaFN$6J3SGl`*YN7a&Bpx?y9&`!Jv#cNacr}t^Zw;eFF!a~=zCM| zd-F-l$Y$42@%7Q)M4sDi-L3Z~A?;{Ah6&_2q=>j>x6XI3$qz1X@P~^{?WTB@%U+otr+U>vg1SfdsuUkFP|C)DB zw0OU6bYs8WIkCshEnVrR5Q$3j?48V5a+oRAJ4k?w?C`LYtZimi5Wj#WdbwP#4LsR9Kck4oR z@gu|>gv#{tt@+k{pEqy#W1GSuU6{-tnZOzxyr-)eZY%Z-7CZX?$L;X?m_Ltp;R9PH zk(NbxU=wk>MFso3DE`f2LMgAfMN!HoMUkS)vez<-Or+>LnWa(ucDMm8!VS-`gJCiI zFkGe%v#3UqVbpWy@eb$MGjI6b@LJQF{0rr$O1|TDz3ITiOJ5w#H;w4tV_QyuDDh!u*Rz)1k`r1m zDj)wz$K8%CH;(o@oQ)4hO9Yz7N4Kyk@h<0{FMe1e(BvoI+rqZg;B$^Y==r?p>o=5` 
zIYcy>#;(AxaP00YQ;HGaIPje)Ro`tY6icg^duI6-Gc<&g^^BQzc9N!Gh?=WPP?hW)$8C8C z$3G?gzavmL|2KI{CvQC^`<{}$PszC#HaBN`L9lADbGB~*cJx2J&G$Z7_x9X&{lVer fwdF~q6tr_w+}gwz!S*??aGdYnt#1jo>=geC__*tB literal 0 HcmV?d00001 diff --git a/model_executor/model_loader/__pycache__/tpu.cpython-312.pyc b/model_executor/model_loader/__pycache__/tpu.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..977f8fd9e4735955389719d8c10d092cea2a32d8 GIT binary patch literal 5654 zcmbVQX>1$E6`tjCc`gr$I;q2}Ta;|lI&4|K5+^>Q+LCR=2@*EJg61wI(#s>eOUq(O z#3}MaMh#eMfJ#AH#65t@a8MiQAO8yM{Aq&zAtNC4BaYxV*cLtp*PYSNMJLrzr1Zz~@DAdM1 zL61sPLS4K*Sg+D1p&{NFY>YPrn^fK`G{?O`FM$k*_wX&Z3C+`Bs{x%toaKGQS^4_g zWLeraNBC_2!pxM^N~SWyEjY3Oiyd( z3|T|W^0fRt^N--w(Cz$`knOrojMk}2%Ng|Tnls8ST@IGrI$ddX&3%YKStEYR9=$h9 zd_wB|ISO7z_`6+R0caDkI$g8U#u_TLPUTEl6KA|MhLHG&cd1yjF0Zuu%8sKO<{OqU zIfKH6w;{%F!E8Bm&XP6%+;H7$L^-RX!- zxUChBr(99Y$rPUeT_~pJi=pr(KEe4~6{8%D^9r5f#RwFObONfb(q1Omi#!w`Ua!J> zZ5Hu4qR{VD#Y&;nlB}3G{&F)lwQ*HRu`&VWX0|R>-yo$T;AZC`29O$xm&W*j z5WN^k&B-&#L;%B#fCex4D||Q&u`IKyqhdf#rTwWn#ipiBRm?O$j&}T&u2yhEIsoPI zD%x_Rz(~>mLc#ySs{h!$am_MVYV0gF1`3UVdAj6kDZ2U#uKv7hZ{9MMAD&u;r)%$e zQ`^V%hxE~L{xvM@HeHggX+Yj1)RX?w(%lcyYnGzHhv^?_^=$$<=+YEEw0^ z!|U#bCHtbiR5w+cWcoV_|g?v_wM?5&xd|cm=lqEx%@k;6jqZh%;Q^1r1mthA z?T1Yp>N-z6a=^k0jsxFIL$HG$bdeK7V(=u zM;6D?+b@-IB872gOucMYDFV5Z{jBj)KO88;bp|8rEcFrM49`Crw8^Xy(3eT~q}$Zn%G&V-{U z5!dT0w*_t+U*cHIQ9euNk2d|3Jdu#nP`~9FzFcUd5(9TvsB^-cc4OnT`xqWDNY>MGZT(JQ^70RJg%urj>0xya96;zzZB#DQrmrSVT-e(mtCRR(Z`m$>#>{^gn z)iORg6iQ8uV$!TCrm|7|s&21H9LJ^x;F0NhqLS?lAl?F1$pn{asA@E-ty^kaZ2Mr> zylK6mdAaxF!4C&l8+zyMaI&>_6kCQ0Ekj?mj9fcCe|({*SL~~v zkqu;UI7_vy%NLeU-t1fHxwHG$?&83q!oZ=t@9@1dpSOJO$h%+PBta_=+9YZng=4v` zv)CFavU-VhQqtDbt%3W|3z~CWXWAua5KM~UAg?*ck>4)SNkT{ z8@i!P)xV5uQ)4aErkdJ^6l(0?z2EsK-vggY=FeT&!XnKNJiD)5Y=TC_2uHD~0NI~Nh-Wd| zq_Qk*cz+lmwjb|&2nH`GM$9YLZGm_M>w7EuTohi7=*2XC*`-vRQ|wg|K%gKJC-E5d zdk~n%U+>f)s^L_6F@Ez`jqw9sW2<;JEk_0IrP4sMW=O-;BK}~cs2I?%I?U=Au*_;0 
z(64{0(^wG~*YeE&Ym_!Ke)Tl8-k|!bO8r^{Gz~k}n7|MG6+z9lcpg*Rn?SJ~hKXQv z9Ti#Vz@N3G)9?W3L=c-MM9|+N`fF7G4eH6Go^McJ9`*eb1-?P2zD8YNqp3#*k}!N{ fLByfITk96yShcoaHEo$#f~d{c_kD*jp-%Q6>?Tpv literal 0 HcmV?d00001 diff --git a/model_executor/model_loader/__pycache__/utils.cpython-312.pyc b/model_executor/model_loader/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..068811324ae5eb492881384f03bcc81b944f10b1 GIT binary patch literal 12738 zcmeG?TW}lKb-Tdg4G<&&lHz;$5(SC^MbQ$ew`5APB#W|a$w~yrg+c6+1hoL@-6bV5 zU`(Y>O(kkgw@E}b>6o6$H1gOPGasGGGo4mSC$T&22Ues7WJ^!fvD<0-(Sc26lz2Lw zo^y8>0A!zT1&_ddCw5>nMSX-H#^cg6tN&@DsH+r5 zg(!~ZOadLE$=eh%!P_jD<4lO5v5pZeacjsLw}ouv+a%aSb||+9_P8VDAZ1p;8Fz(T zBySVkaZkuY@^+yl?hScK-XWC6%R*)G@=$r)7xIxZr{IrQgepkhB~-?%LRIk%p$+8Q zEmX&ALN)Q)P;I;}R2Q!g)su3M&=7A7HOAQxOTJ5lrg(FxnWjt>U%|J$MQbArwQ^q0 z1i#WBnz=H*oGa(sI3K@}cZ{AiCZq7KhAd~AvO((S%k*5u`mjSzuJUDTXcJepem%jk zfwzx7Q^0TJxC+iRWap}1W7LvmnCFM_#M*#g)1iR2%8)f_eFOMX-$0HISVAS%6 z6z1cDJcnZplQF7ZC=>Y$5s}jiN})jFUkpbDNiS)X#mR6~p4e$)u2Pk6%_-ANiHxx1Ntq3n@-^6Q)23+#+A&6L;eDx62KJ#n>qDeW#)hRS*FrIU?zJWOWzaEgNm)iqK3_jt_W3g7XdI$mwMI?DRMhl5w4Ao4tm`qJ*GyMR z8EV=#ZRJcU+XN+U-L<;_EF zH0la(fbt^LqvxM1jZrDbkLW+5*PHvOiH5bGwqBrGsBTK4FTnRXSUH-y@(&ly=co(x zgXNR7Rvsvu>OKZcCKeH5Q#>2d_#t~CCXcZ42+t13Cin!aF{E*kFdiv*Qmh4{$+ujxnRhhvh= zi#*2;iOD#d1Wd^xRu*||#IZ4%jY;gdg!LdK68r_0&~F(YS$TY5+#W)TWv2+2Wx?eO%{ zqg3r6v=T6MD8!sveu#(N5hc6vD_WIm)ug0q(_vM|$dFFDh-Y2ILn^jG5^xswW|3H- zfs8zDAnFN8sxu-8VUm?pb6gr$t>>YGVM#*1&gP%b2D;?N!lyu9V&c<;`> z^SykR5F6|on*`aM=)!%|rOh0Fk&lkcNm2hqZG;!Q#zA3`f@70vDNc-bCbg*+kxk$@ zq<7#n%}}{g%I;6wYO__%nW`?Os%zGrwUwuBH7nI!i|(cBy_ZX7U2{+0wUuXVjf$-? 
zZQHzJZDaGy?9X%@ zQ96z+cN|?RJEn9TU8tCAyV~(59qEpvGsn|q$Fes6yi>6?FI-&QrL^z;*wec3+)d{F zvv)jae(7ttzWc_+vhT=!Gga~|t$5Dl?2O|eoikGw`@Is%;(2A@(!gBHa#dh4@&VKL zG2?vY~Q0xs^+r!E=^0Vyb4BM-)y#PP0*z2>l>iLjjYtOcA&9og< z+78ZI)7~b<-n`;q7iw=DdcX3H}n0Mr7 zU;|n8Btt!t?n6a%=e4YqMJ_e^kn<+jP+lVgO6!Yctc7EYUZJJ~^sV(MrAI3boP~FR z`YT5mYVz~)_9wbrwVV`n9^M;|Ob+s43B(lW!*T@8i7<$sa732H*x)z_kT8gz@C80L zJR*aNuA3$Xdxkck*sJwpVloPBUx$;zks;uhU>p!TCg4^dBVuVSdkJcv8hZ&Q6QIuP zj9IrL!~-xk)q}j2m<7P`CDjHD42e^nF$ws#97#lZ5vNG4)b^#(D>7I#mc+-QQnX`& zYogjlA`(Gh;pE9NK425~VjXIK;yz3cV6q(&)e01%m^la^5^iywr;57~gy6;Gt62Rm zBs0`+>?Vu#6FX(ubK~&MJ_GZmN9Kb?c(%gR*_u(z;CPE~RwWjUCISkHbMO-=8kqmn)+xH|J`o z%Gw8U0+yj4PQKllZs`5M)%RCl$-2wa{+{2MP5Yd{t~|bsd#mE!x>%R#?pM0|KX4!V z!~>}AyEMF?Nhr#0m6rE2mcL~z{T_#S915biNsPw@8I+={w;76h63!4y^i`UMeF$1H=;))x{6dE>I`iNfA!ptURt~eCmGkhv;GD4Qw*lLLXy8zb zR@ej2X#R{ADu7P`2g4`Dg!2?%ImJ*8IRD=_;fQ3M3-TEBgI)Bf@humP5A3`N(1fqJ zkUlgXiB+WIEPm;8`TYVXThID>$mqVnCVNN=Uvb}B-WcISTKGTT;vtah@GFi^S>86i zrLWAiEoB?U$#~#dAD^*Gz{_oGR_ZU-hhKNCtp=7g8187i^Z!@bScuX8qpN&@(&XV! 
z%opf@`J`%&j*SPbDkFilsahovfS~t+E&_r;h=4UGt5!|KfK8D19|b%rgK|VI)#Oyc zS`bfRQwBwcaNY7u~jReZ$5k5)_k|b|7~~HUUGLwUz(}OSt)-_#@C_vI_~&7FWY9B*+|yq zezos%->Z8s@0~xq+ZuBX@PI=dPVwcxkD=7t20rqBeB@_MZ8FA=UpOZG*-B=0WSRcKVj18`Is);cn{I z{#MBUthL_*ub+3f9cwlJq6$7*&41TwhN8Sv6IxTKz!Qbv1nE^u)`cdBO=`4QZ9rIf zIC{9Cy3+syO2mWDC~8;_Jm5s5Kt$*mbvLLv1qQ`>4BZe`L;=|WFnP0}fR;0#?}e>y zI!=8P=~xefA?+*Wyd!iy6`Zqxdse3lgb+2FSMm#ep(?{!u|M%SIWJtn6wzl(JXO?3 z-pyHP&)Ez}P&)GN%R+l}$6l&_1P*%v2be+TA=7E<5iJ}=Es%bq&Vl*^$_loIF*!+D z>m%^E3V7taU@xNHRD{v}r7(JmFru4}JP*NG@})3(i!j!I5k@)h{7o^1HpU17r6*NS zvBybzlh!;^3|c-a69I?H)E~D=;1QJAL{jtP#o}WEk8VTaE$U>4z@dX)v_bHzp)2i- z7)eNAeu1wE9F1#$gY4;KoM(q190Kkti3K|j(8}x}&+1VRa2lcyaf}xc1DZ+2K*=!( zh(v@SxU!N7ekyPX99&Tu+JM!hyBEu^b^s$Mn&&b&73jwp43c?4jEn(#-9-rW zLPy|$f<7TO!;8VGdWJ~YnbxO9M+=5;gLK`aKTnW|meN-NcY0;S?%SatC+NL0J#j}<4?GMLh&fHpD4 z#W1?0)`!uxC>O0r7?+6$TN$gJ8YC`~i0oB$>%P8l5-0@UrZ&bfstO_=b=Ar7gX6=b zT14I;A_Ec8K_H^$E+RJ!ST(!!_aLu23dAtD)x`6pRL9J75-QS~cqhVLts*3At%~=i zI0k@!4u9!Kz=&q36@TsgiA>!#rEc5eQ#YSk@*kZUxLetrsobnoZq8KhRVw#p?G@RI zrksVU+P6xXtE`uvnmINrgK24Z&!#foor-tol6UuQ+wQElHsjr>csE|(c-y-*ZQq*q zZq2%?GOlLD)x0ot+ts<^F3-3d6?bFKWGdSR+C*8!J%`0pk}IM7Y{u8A_&R4!WNp5= ziQBgN<>u{+vE}9iEB2C%y;`wXr)v%`+mEdHdlru^`S;B_vMztx#S)zLin~7DaBSIq zJnO5Q=jQf<`tdP#rd8}+cJI5Zqs`jgU`D$u=D6AYsM1tZ&dT|!+YYvmLl+@%l2a{{?>(aOa3kZFDU}|Aozycj;12;-er5=ioaz6 zMiB&Xdjb5Og>qEPeQnv^eAngI30#H6qSnY?2&HNAw;Y!6(efAM(>f9tO4l)qp%fLx z2OcP0M^+XLQ#tRZ0?2h_VX+2a@Xw*V0X9vi=z?<$!hep!SBggA0a7b3Ayb8bB*>6r zNc2RfKt=^jCq+Co0hh`E#~smpJmkQ*#PLJ|oKr+;FO$V_3H@5}B!rctaN5NfGy^q# z99VM^Z}5c&o;q~+MEKw{{YS%(p9>#7c={+gM**|O=gsH&NrMaEVUtihg43w!+3oNl zVzLB(=?@@*W9KZn>)w!YHz@9gEN0picN@vHD(+U2=}_DqB-5g}TUOj<-)*1o%+zdE zYPK$R-Q2cReq_o0M9yP#I9JUMmvyEecB$R9T+*c2npV8c3q6XrJ#BCQxNOt)x}~xm z;6QH-EHhQ}W`(KCx+>C4h4>h1^%nZ<6!u%riIQWSJxt~xVEL4ZU0kJevLhgU&rJ@LTZtaVh*Uq z^}{5JGXpLD4O>lPLgoW9`3KoBKC82kT%~vlt}9U?)3cz`hRi&}TX-vP!40ZiY2j_0 z2{$2JhOzU^+xZB7$U$J80PiZk0t4_2!gH3FsgRokT%3(Zs5MwQJ6;fi3nUJ>K;oIS 
z2b`+=6o#b+KmvnwoqE%WjQ?Ok6pw)HhiGHo8U@);AW0~Why+&i=^B?!f&{}ckkI6c zg~UeK!I&nrA=HTCUq>*=%NKZvyN;ps16iMsjuCYO!ih1FpaQuh$uSuuw2VLy8YRVQ ziPkWzWksOQ1L&-*ZGRF)sYO38{} z*}za&9!Fmj_#VWU0E6a~w}`(FdDQ~3%Q(dK1U{iU_yk0kA;Qgz--5b;1$-2|Fr>Ql z`N4e*0mAV?BN}LlB{=?~9=askT*C3;9x={l7;ex2=hK3mQ#HlADY&NYchFE8hGd4y zwr|d~Lqu&?ru~4@e&9~~pL$kG~M>LTz4*Zzv;PAdwbKNZ0UxT zw&3-tJ8j^~Zkn^qP2Kf3X8qMUi+OkX9Fr;CtdwrPe*AW6?}bK48rol=ONLslwWVuXcl`>f>#O(-c`X`Xumv zTr2SvNK`whf)d_M2r%Nev2+d-^sA^AEo_WL7goTFObOQvP9sJ(rs~$L!2`8$Ba)Jk zQNSwHG-PVFC^cI$H9bmA&z+j>mygf(&sAi-H5qS<;%!;z|G>K`TU9fEcy8);DQa!C zJ3!AW^^>U$lwfchw@etK%3-~G$h*TB78mpyTR05&380waVh|cPgR4qq1h}rCdczPQ z0$W47a1joRxV6Z}6md0Fb1Wf?uVUGEG1-U-vQ#3SWJmW# z7&i-fCQbMYFcB23goU00b4X{u#^y0tBs)0C!UZs+FYt+gM$=PdoN$Nb zg*A8oUSQ*~i?IZ;wg*vKMK}&k11!0pQw36sS*#097XR)Jel zyXo;d=8&IAYZ!$H?iY$AF604UgRdFt9#}7xbLW4wciGjwQrEKZWV-v%O>U{~$+W$C zr5bJnq=QG^XO^l5W=r7YgI(m>lvgOzoA)fc_hwzyX;)iz+kx2=AGo{V_Jez4&P%y9 zH7u}0oX1`Rm~=s+IzgOcG!<@)h#^nFAY4W@?%<#?rNxk2)@x2Sq!fhU_!!;-(hPNCqZ4Z#jd!)}3mO7H z1Sd=c0_56`Am)EHX`o-Kfty&YWo$n=_AVhA?5#&>iB1>^+RgQuc)I6b@W53<-W;Go9^2!v}u)sLUUX8sL zd#w@T2W9O!2NpXiU&8{Ul(*)v9pJIrUpaT_+^Pq^1iGF+OlMu5S9f0CnKP4*R@l}g zbKvlM?pto{Vxby<@kIz57!7 z)ams8)5^{>t5}oUNYgzFt{ergMP~7tzdZNTb3gU2;%Ckd$@TN3>PA#AqD-#)rfS-s mqaeWnxN_vPuYx{EXJPuxB-7J-!}K?fcO5_7e~%Op@%Ud%#@HtS literal 0 HcmV?d00001 diff --git a/model_executor/model_loader/__pycache__/weight_utils.cpython-312.pyc b/model_executor/model_loader/__pycache__/weight_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8f9a9a43cd5899990499c56264641e9654ab1961 GIT binary patch literal 47920 zcmc${d3amrc_(_X?;yZ^C%6+_KvDZHQzEsCvL$(=t-ufuNPz+uAAnkDFlJnLOx0>d z)oDaE$(U|3WBQ5PDBa@r>780~oH)}?A3&f27-pu*t^3@YdFBtwQm$+#bLalv?<@dG zNV40`+#~Uv59fUQ_U`ZZcQ%`u!xhxN8fbchR^>Vz5R|i$S zYWCFhYS>fTt7T7JuZ}(Sy?XXE^cvXH*lT1@Q?Ch6P0&1K>9wdRFKy5|Wb3sJ*?aBm zw=S472YLlwOhLzTUiER88xHRS4b4ORD657qS64Au75viIg--B5jRJqufc4MUB+jVx>p 
zHVwIZ-7IViHV?J*wy>~0*gDkK+s48D$Yc$&I@iB+St2M`lSwT>D|KK7X`NtZR_1Ow7qvbOH~%!G4x39BPveK z`P==S-&2X>(Yq7Bjf4MA9;Wp3ds48si!Xnf>+MFoZuuSMvWMdh`mKB`>UDr`L;DW$ z9=vbIUp=qx*Ym4h*7hFaJN#~bwZ8>*>%hA;c-Mn^3@Pu|QhfhmlwQ!Umg>F^c|OYO zJ50Ti?uT@JGTM;6=BOiTw;nk?hVodi@*9@qmF@+qubJP7xJUR+a*O%Rh)pT#grb|FPHfB$u^<{&9DovM+hu9;$ z8uin#zOnL;AlF{BXk=MFovfyI{}<28$nQjppFv)u%ktXgfA&k(%gA>juj9z;`DJ_Dy`JU$0eJ-ZgBTejf9Pd>?}=sg zq_y+K`%cgIU>y3;USU~2G?VFXk2rR0)qJ_Ujr^l%KhNfW=nLfj}v7OJ4*&y)!m<gf5CC?)hS?6UyUhR>;MIsc|QPN-mTb3LE0z|9T*4TED@>Q|t|*!VOzt-0{`XSlPP zr@1pKDJQFxQ=t-X3yb&BvUnaTUa=BytAA_O5=)dA+wK#iREe=2Woa)M<+9vkhWS8W*sV?(clm-r--)0{3H|!hm(;Mxh zzVJyTQ13gO(sT}=b8Ax8z5TlbLI2Tnp|F1_m2l)M@Id_6g=%`Z>hZS^ZXD!{llk2cv=oodv=Ws_Xh^>(|iy) zD8DKP{9$j{KO7nnysX{cQJ;W@`Ga1;H++hcQ{Q-lBLmbLbt?C~ zQbm->^BM2G=mfDeN8gwr_=m&Z(V#Dko~MdA9}W8Y{3l0(yk9`w#)5vl%sYzKh!qJP z4GVtXkY7mU3uD8+fK;)N^j0XtfTXko17rQDPXVPn#u^Z!KBd+nrR69_GU(qUjEs$@ za{7ItF!h#{zfg%Bgeu%na52WM@1$?Ud)hCA&?c`Bt@R^8AHFM>79qm_?nH0{Uatx( zp9a-RPzjt?31xsgYGCf_369r>_421dqZ~>&j;ihG(nr%~pz zZoJkv79JVG0Q6y?&bfSj;lOEM7&9C5D2!1H^`iqy_U`uXeSD|4yQg#K!EP^p9Xrz5 zb98t25rOL4bIff>sYgO717=A;7#U7kGE?WA92*!w5B6iAXl|xV7|~D|BOelKkvwJD zH#9mTgu4ZSX1&mWcOk;HU7sIo4-bvDj|N8D!ae~LqwVb2hD{q!ZSc1T11H)?&xKEp z47Z;S28Y_k`Q|_C??Zb8=_k!^ztAqOKJOS-ZOAivE|u#arnTnHsHD(>^17(tZMaQx zw+uCJ*L=VEt>&0v?V?(5a{iOG^48AX7Y-+^^)YMxP3w+D3uj#S>0K-5DEov{nVcV5 z9g7+~KMzrdf4!ug`@U(5=C4&-G)u-_9^X4OZC=6X`?+2fC_D8O*RSGLFB^L`Q`}iq zul6X132E&Lgs7_gF)YJAiT*%Bj-yc~`tXhiw?9zPEQ$dBY{jpz67Rew!u>90TSOJ$ zBI@s|eov+1&T(!{&#{z&7js4-y3p7t2HEKKveoIO{UHR9lhUHkLn-~KGwf;ddi#RD zP{`{I(FAt6T$4gG0zwCF;{};D^~hNr!>16fjZY`JWVvffKXoosIBeVMzbR_mGX&Y+~r=+?8UkY2^5m+yo+OyAiR;bagp!G)DYIjAgNIUWX#`jcJR7WAM+{+fP9 z)NTUJ+q&lj&2scPgU3T#rUMt6$(1c~S0ypSb77;%qn!&u4Z)L(#E_JZr{&6O> z5p6^>jtIPt?i0FjksN!lSPrG14{1GZ(95ofI---?!|UHR$l(cnM2}@->^b(9DrNMa z4TQqpky9yEdrH+duJZ&(`c4J@5+jF=T1$iOwx^7MnWHq$DGfk^P=}II#v$J+KVF4W zS~eN5R0NtGDTCk#+Sw1k}4~0`kf~!yAb++JQg#-B@)Caoa`&AIfkQwGkS;de zQ$~WC^htOepPt5F=nrwjs>><9kyH1ry_0(vO3I@Z?eUUTQElOZR{zR@uO0Zh`m*7Y 
z;d0KUoLOVs;lA2+Q@bhYtW7vuV$PPiU03ts&UKUf7aT>G?U(E`&WX*UU~BMXRkav*L1Zf zUa{+g`gzNvlU+$m&g22M0L4zCP8N2ewiq+&Mu3WxUhs#l)=MpR%?z%X-8bkxCE> zGs-GdGExHRm;et9Y3RUSdPYsif(rjGJRzgwVY{3N$V4&PpzI*ob`4&e*@l-m8SMa4 zB4@+>uWij4rOZFpoKd-Tz8|AY%(QHPZ4zefl2B_`_@poF8Wl!P2Y4VqzF`+o@S#AT zi&mcNi~x)t$UG20JpCRs%i`EczYCbml58j@(uYS}J{N5*BZ5yjCuRcDpw$)VcZJ6K z`uw3#|5&ho`mnMS&uXmqFZk{51f ze#$$R9qVkAD7ED200`n`ZE=m981(mrH@iArC(&Ezc&PzP>minkO1wPGHM)jJFafaP^auRB+sbx-VK*`o_TVPM2I`cQ zVdxYQSEDI?KTkB0u$R7@y3h9cM~QSv=|z$zW%8$AG57=YN$?Mioc3eae}Zz}NjIvj zu#56AP`2KF&|)k=qb3EZ;FbHlO>C2wwMh{Akq7;S61e@}gk{}r;vBgb&zwJV@x}8m z&YX?sG||pqohWg~O5AfN=Sw!kw1ty)sEV?rPK~^P-+J*%wXt$l^UD z&)Rn};_sTT8 zD!KP6H*^(fepH}AoRp4%*OXE4pAHaZma@teGK0Q3fq~%>!B5y<*e?u+Qg$rJku%Eg zzD!U8X^e;l8nK%^36mh8FkcR(p8!~zi1uyG_Xxb=n$QocBe?Tg`X|>As3lOVbJ|*N zDQI{RZ$;qX){Xyhm&6H?xE}q&$dKz0)_m}2AC`>^v>$DNPaHgW$ou%wZttPPUEK$d z?mGNvw<{2G9TUdn&EW_G{^B;6xsl03=I6x{T4~ek85#CdwzOffL8i>Gve`bDkz!eG zJdAV&Y1c9a47M)eW` zbQ`1_Rfgr3k}yi@z?B~GQZN68;*;^KiIY(*KSOVnmURlO(k(~Zc zr>S#;=Gq3`PL*K^8BCND;aYdVBc!E>yLg2q+=){b$%G2jQRXt1V#>5l7k3dkp zVI))+3L@I9WhyaIW}}viFy=weFDbcOFHrx0;|FRer2}>{6iOL+U)bjz4GSr~Zxry1 zPicaF5GDkUhzn6Wfrtd1G0 zXRB`->i|)lu3L7;%t#$Me#qk46F(wcX%=X}r1x|`PO zMIBdA4cKDMo9q#(3(IJjEnr5xSwj8Fk0nGsbvTz%`); zlB+(=2|93Jv@|Ke0#GJddNN=_X)wVmh>VD6c^%J{g57i+r1k_-W}9f4Ncn{VEq9>` z1FOz9{PL9S;x6hh8u~N=u1|d&Xg-_D`UyisA2DQOBFO1pQ(q|7qTUmTo8}e{DyTkd zD7y_h(_uLjQHZ1oV_0E63}%5o5hHp|pN1%5g=hgmui_27F`#)D*KMl4dumfpe!iAA{VidZ8yxd#Q+g}RIy!RzY-5!a`gFihyraBfFWO4BzwcI+)M7KePJ#4li+_mowVPcT52GJ`i8 zCgxXPSU8RHh3&Wj^##M-7fz{1!$7@_L;kRjc9DS>^8aMdFUDWqF}{|_4n{+S0^@!c zuxhZ5!L287JOtbd02*XfM}3-pakaTVS8sNWTNRQ7Z^lL9is&2HcoXFbqHA<43}S?B zO%S(;c)Ks$caq6pPWjKd+>Vrv@!kcRG;H%pX+}pzg|k$5Gm&Y5fw7UXPzHG2LovJQ zMpP_#o@jQ;0G<)mKV>~KhBQNdMo|c~7*aYvu_4`d;bltn+w=;EKUmB>V+3LQOWCDu zQ0mZ>mi8Xus}%o5x>1=ZqhBn;fDMWj2DDVl$b?s7OCg8>3tntA70-G^xruC_N;C|j zhOqZUVAvZT@uuYxY_Ah$k?wEt7dj1C04NTiokx4^jOA+myZpTEz~t_vA#eJ0%utbZ z7R^{=&iX0+f~{hvKW?j^-2G9hYs#L?ubJ(bT^-MFnCePeb1&{czkfP>(^`I8!xc1Q 
zlP;@XG;?ihzJKtogKziD+qS&b^G4Tf=j`)SzG>&>@=N8fRmSX9b3K#0W40|H+Z`7N z&JSE1IX^OUDsFF{+#_+KrVX=oH?@rmTH93r^r^Y0u7;znTW@N&-PLnBPD(P3=rsv@ zbBrYtvsv;kY0Q~!1ao=j$(W&THhja-vS2S=F) z+Z|4mV_)PnCOfh8OJ>UAIrWo!KeFXbsgn7nGaZ-qO_`GU!uc0w zHq8cZ=B!%Ct&CRh`=I)RylB;-cy3SB+H>2)+4DcUQ_I=6h6t~|R#Vp9q{pgbI3@-S z0c`kTK3qQjM!lRPUb|kU$bST(k{&pF^V$2X3tZbktg8i*PPkm zUqmCf|0prQ6*?;28o*dxR>nBsu)LcRqwZ@hY34ZKxdN*MDmfeMQxGror@}Vp4H5Oy zo%lYb8#78bL=4MHH_0_)r7N&9-KvNo-R_Li%_~aJJ|`@vKvl53v!Jm71wNu>MRJ+w z-7E}zye*Yuq@O@~%hSK)5N{n&y>0uRR{Ra|3GnvpxdNL|;NPH2PM>}b)~3U9C}Na* zWWp3Ng~r15%9rdl&ss;h-x*cj_>dg3wjFpu)||sc$FXa$Sq`P2gRSYX9145n(A#!} zLO1i!1x?Q#6$gslIDv5=?2u!ppYn>IutY4O<$J&yIo{xEIh1}rr2p5YW6Gfy&Ae6a zJ!K@U5i2lL+bgy{^#Ea}KzM$~7TzGInX*Cdne(zSV(H@Ky2@!m@)V!9T~c2t1Ac0s zu!T3tsUmhc4)0*|xHwEg6?XAX%&08867;W>+Lf{=SY{y+eLK_=p@zKhN9bxD$@-rSjsMrLWo6p@T0Vq0;c9_eovI zda5(x;ESl9d@;L=Z%gm9w>soH^#L9%opsn5(tTc%y-Z-lrRQz9mwj6iN4x}i_tKoQ zm;Itr_EE@vmhzpZRTs{Rd?$wCswZq`R>Zl`DsiUDd%z_oR6L~#4ELq-vS^p2EX2l_ zfN?6fZ)9jx0G&%J91^rH&XS5DSt$qOoruxcoAJTrZwLntjr#i-rx^-Dq+J1~upmI% zKr#fN1FfzTKyq0jkh6z8*|H#hbzo>T=pO=J4JH^eN>ya57vb*S!4TM-1)2FMugAk# zU;itssbnv4;8xVtdLjVTnPL9KIS5HU*Ef5pif&B?3+og{=`Ef3g2Id$Ck)frnMoB% zR6$t50-}BF&Lg<}fl3q%M!4gEm=mh=>OqAigwN8IrpCUZBGKH1BBGT@MD&)bhslnI z`Ouj__~cufl-jc@r8=8Z^@oV$cS%YQDbv;gC~BP@6}FGpDjG}DT=8sW%)`)jj}mDE zY7ruM_$lMkM;o4sZGZ9|=j`)uj=wQ}wffBo{QT?=fyLh{t5h-8Z=GHdz2uBQut$pC(*HwyAuOYop-Ihzb_6qJzlw92teY z8p1DrtPgXN1jpmej}JpFgw%2{&oZZ&l0Pezsxhu*6H(=HR|~W)rL39pQwDSoxatC= z+WqokvSduZlr>{9gi;zrX99LgBfLfV<@Jn&p&}$tdyyg|Si!iI%ekO+^kFxp(*;71 z5Q$t>;_{|>se(X1+bB{#jb6s$WehH8gat=V2!|^R%JYvXqDfiCOdn7BC1U+{(mG1-HNtgDHAktk?l7zX+GbP6K!{Q7LDrG| zOyez8GCG1?ij3EQedDd&oJDZQ97L>R9^a8lqo{E398WBHsH zl8F4z_zOj`PCf;#spp(cbJj#tXRN6+?$|lG54<^Je!}328C;!<`lpBLeyA_ zm_?N{!9?ZSSmoO2`U4*vix(c5Jp6G%`OLG4iZ!u{HCK?qEt1|gh=Gn#d7Lq zz44rNQRBKrBbV<=vYNK5YYrfB7= zc}qt!uRM|07RzgkdUnV2_C&3FKF+OQRI8kORk!lWr&|_vYG>6QjW!RIq}FB!UF>R= zb3Gh=?D2Tj6U(z*H*ZlY`$yKnarPXdQ;`1l|x)hMa)vM zn8WFcAm)<l3A`W2LJXs#hhdH^-_sC#v_ws`o9FHG=tHVxQVaGR7`d 
z)V6CeAAE!<(=C@f;o2T^ZI8PiiE1lugOOV=&ybJviY{-uv?)WO#C`Rtc*XX3-j2zG zh?(z#V6jyHVP4gg@4IW_Y;(o^X6;kIqnh3d9)+$5T3@Al%rv7VYN?204prYU)GpYI zqh(v(Ihfe`cx>zA(JfCzOP-9|pNblvS}>TUc+MM!8pKQ5J#&#n``%dl-l%8a2f4|d z>ZrX28h^IBI~q-nnOFyA40uV)Tt}?9EvhY8FxjH^s(F)ZwsXN*aJl4C$xO#<O4pg&Xw+i(v%0)e$x%cvrYOx$L+Jwawv$$ppaG@-xl z6(e5M`62y4 zX#KM5MMxRsmO;q$;C?JymKZfwGe!Ov;|@8gxm4j7OSdF#+Dk z(w2p5`e0`@|TZZ{~FB?F8zC@v)MIz%f5&{(`gm|R9M`98}| z_&MSqB_#B}Au!2drz&kol(fW3T1XubJXTT}S+V0sS8n$I%5DDR-2BUpml`j(UTU2= z7j51j&pj}CV8LQj6iZs>Ee**W=adPXwb?#(YTi^j<6E+U0&wuI-Z$W$ajta?8O+0KL4P!^6VaUnyng-q{3gu~KByvJ-3I!GI_-ZP|) z;4DIR1m4jSSsO_QmR%78cwtfvAq&I6i&sREPKj+1G!f%~Y64QCVaSWZiqvjU;rYoa zu?8WplJkUgPT6IkccPNtvt3i!KeBNEl{SlVFmOsHtm$0?d09z`GVGGe#m=jPiZ3|f zbb$w?@clBldkW`dP~q2!@#!A1$~+AD3BJ-csl{KYt`Q5#0w?U?@E93~s3@bS-070! z5N|>|>@xQps-HUgC-)KdFB|P&X&3=z%IXPoMxLNumtUkz88N5xd>Xk|-n#{gR zdR}RXFzUEceKp-vl75#w>?{w)E7=V*7CjF>LmN;Mi2vn2?VGpvP`J$Y(z=- z(*u38bEG8dhoy%+;KavRHb=mF2|yLA0)ap{|{!q$9Y6D)Tp=6NeBUxc!O;I&eex{NDDL1QwZOI6UNRpYpc*C&U7!Hx9 zWacJ;u!tWO%YEmSI!;u2JJ~NRA_>9B&=785U994)Ia6PaTPkOrH!L+#ZB5itv*2_k zoXs(3bHceP=G-*7|3gc`qTXq;FE}e+tDM@IEG~UrcSQ$A!8-*X6wm15#Yd+0-O+R9 z&C`a3vYOYoU)etAzWQXmY}>SPA+K=y>}%U*J8tCFB^%mipPugd&{42xLgc&UoX+yf z!LJ>RI#*rw%xgOd4a$w?)j*MoVjDs1L`rV&Hwc79u?%}pKu2B z1Y$!QYf-IsHi6mWtW7$c_bhsw{Wgc&uNO6l^EuI!O6t!;E$I3`>9loqaMvq3^SUcF z?{%!l{YM?9?lR4fHWzglYThr@A^d)&uDi*QGGjsd&obUsUnUJ`L_Q3A6EK8Ju(xG; zw!ptA9=iy0fewx)!vrY{-j>Zc1g0U9L>$n4nWijh$!6q8OK#bfO~V&BKnbua1((Yv zZw5A~U|M9Habf#I%2DvSbOpXpIiQWx*b2%K?uT2P(@L0y&UY3a-z6&mpTtI#wvolY zia`^1;r|_2MtY89;wAT~5FsiGtor{KKT#ljR^&+ex)T1wbdrqIMxeed+Eh`aM4+O; zlku{i5r7FmKoS=K(y>k0#aiqcFI@JKtQNw4PmkLyQoBSQEdwSg9kZxS6&)m1>F(jM za1M-mCXEnT^aM>9EJ)?9{OZ<<@ZmN^BC#cdd?8xXw0nzNcB8UoXAB<9ROKx|*}*?W zvTL-1&me#u9Q2>EX)dbmrtKeF%aX-lg4SY>#0I&HE%__jrS2K9Geu3SKD`TFzPnV?W*9AAcXj+{)}0GEm2#X7v78MQXS)(*)z#*oS+|iv@b=GP6~XfT zJhT>-`(4$}HqFJRPELJ|TeGuScWsRtFRrayjqvqG3SW1tDBP@}aGP%DdPB+qp@9Ol z2;*d6ZwU)c*#NJK8z71juWX|5Z2HKw70QbqtTs7$R#EzcS#2SxA%R6iEDCm|6?DKN 
z(xx?>vu%SBMWD{Bh{Bv$!2(OG=qRM5{zV#FE3Io+Z z?vwAKU+IFla6MFUp}u=jE2%V~rOfJ8f0_enX^d#0vS5MA0=-2~EEis1I3aGDlIU6s z)l79y$}A4B7n+%13e}+`CdCqkQ2IcKaHM5?Bp9tiSf=suyxS;zMk$D8B+!yd}+eSpnb|rJfEtM%d5WG9o3e6R8&1>Ng69eW%2LUyxI6hWW_(wlk8XYJ#)ilKP!+9R1LP%I zbR=mhi)zb=jb0TsRsQA^D_49}6(TX_HCxFpgXa1=)2=-2^&MK=-!tfTIhNWV5YEfc z4jzG5)R%#^^Qu2mk*xvdhx!$DhSG~V&4!s$L+M2gr5Dvor5C967N1v3N-t2$rWqv7 z6g6gwX8eyBw1V+ja17c5tm8qV!F&-4_)Il+*zboUesbS!2bW(5DOoY3WVKVix7Wm}PPpA{dGvQw z(Y#uck`+KoR;&l=ZY{^6`0Nt{ezBe4)si;tf6%sR$GtDWJhpFQv+D&=f_)%J;g@Ff z8qcc!iF%mSl8m181tP8&hQ0km;feMa!Xsfaqe9sA%nSY?T+#UXiLEaHy};DL?3ieE zy#RMNa5#e~$IU4Rm^WUg=>d@uDabAvf8uDe<@S3BkeukOO4tdk8NbBqivl|tbXbu? zsQRHf&@x|k$vJ?J+C*L^u!V~jWm;hxQQ_f-hA{x89#RX3?4F!!Ft5~~l!Rv(P7?ui~b8e5H>P_5H5o0ugWZvxP@_`CD>+JA*XJ$zBhAQF-( zY6!cs=@dtF%yx!G5!XX|JsViorEL-+$4a9Z%X!dZ5x8fa5roYyyV4S_X;nxEvn@l~ zf+V8hb!oICjiOjHc(|}^K`+t;R0O*RIP{7}a8|oZBP?FjZyRNfE@`Z0|v* zwPlz5O|BWJX*2yx|Ay^S7I`4rBF!|QV{mbE;jbaRAZ(3Y5j}5_b6{<1l)nsOtuDVH zD^YA}R+hiKKAEM+by!wL{_+wagHR+C6NcqEzG%SM`Y~h7|E8m=XyPs#)d$zkvb=T? zbH+SwUfHr2EpjjPt0ir^0cg{iS0KIrEF!%t-v8h}@BiXH?`Q7w{&Dd=bM#;ve|aA` zlz=NPQnSq9V26nERip{7;ZcdDEi$B^B=G@1HZ+DP}?GB0C~J1pPxT z0bHRS9iK9v@Ck%^fHNw*f)~ORy;Yw%(UVf0NvVYs?!sksgJ|Bw#MM4Z;G`SDl$6E0 zv-5~|_u(UlI*&1cB?_j`QiQ*wo1J1oV*=erWr^(dzLT&3rA`pWDGmv=Q9-;&nVCi( z`vwE4E%cQU^^724qzS>flnHH>RM(dDh^R>`*`YBNMOdklQ4^tF@K8k-X-qLOr6Zqm zLn$+RFmFvFu0)J8B39Cuh&GkTy{iz=uz_U`_JkVLsFHx|KOi7_PblS_dCVt{^^$ew z={arO(KfkHwox+8!ae@Gp-Y9yr{EhOtz&rWz>h06}7fLE-2IuPLOWLn? 
zNL8Cv&8~ZM(;J(^oPUb`VtyDe6`End50zW5Q)Ql+bILG>-aF_zyrr@fiq z279Htj>R0*1H6yi!f0XRT;9C1?W%LZk$>5A$u!eCr#O061vZ)e4ncA=o?@{^aIy!`B?XJ_4W$KnO6r_2jB=XCQ7Y|mwJ@@O~iTt`)e%aRh7`%vA9L?^=_=$yTgkfs;sh2*4|QJF>o3@T&>)iheSG$$t9yL9Kjf3(@|Ew-A;>u*Ly2S2Nvs*xtbVG) zUcUmG{|^xgZ{RNc_Z8pHDBt>*e?xwcWx#Ly=NrBl{7x@03677~%TP7_DdbuKSYgXc z9#zMUt|k_63ptomLOyOi@PZ(E@f_E$@^palG2ZtmIq~!}#RBP@3viLlLtNXpxx`p$ zz_Tm_#DUg65D4ZQbctXjT``e(RHe$shWp{(6#8hgE7c5h?BB&ygY~tkA0W_wW<)pz zDXRk01^DKdfHx_brL=^QFav0z8o!1AOfNg6T6m!D58C`39Fk;oRA$&K>HiB&lr2eB zQ(Bl>hf-$JGZPJy1){^@IY|NVLMG~BQl!mLPpG$W@c}SAiX@J#w($E2NE76sQx?x45~xLIe!aOybZ3|J+KQ-8uMq& zF=Ne?6?(-3U8~~x9Z^TeLS7{xa=w0%!)?ky`qA}sh8vc4D4|Uc%!KEi zb#tnP3OK2Fv*C?~?=*e$D{x@~z*v8)s5(*97%OUw7r7Hwcg*TeLW|dZ!?tE2uVrrU zeBQc+LKk#&3m(Z(#r>60ZL@>-nOioqVUEA)TyypLWJT4qIayxu`tB>c|NY+Su4H4= zn~^snSG(eko2K{Agm2`NZ;XoiTV>_1Z@#kmn_H)iAJ#weZs`Y4#p|DlI%@7&xZJg< zc5!o}s6AHH9xv)hSUY0Yj*p$C=|VE{NATd4gLC=ua?f;kvb1tmH`5Z&ufMHBA=I+9 zG3T1wo6sBg>}Y1xS@-!ZtNYV?h4>P6uK65A*;ar0e>vA6+HXSr7}b|{?B-P0RHfZT zx@%=+6kf0DuGL@Lum<7l&7HdLdfj{N^x;QlRdZLIK|L zH@u_eyzEM|E7IUH?GOr1f)#L?Hg*DPpa_KD*2w~F$V`X;1!)J+V0@rw&xf!m6tJ#Q zz41#eO^o&ji`ru}oIfZ~_A>~YN-a@Emhz3%Y zn3x#@laVmv+n9Zp7oINBh$WH}iN5s8WJ%h3ES>yC+Rv!6aHPW)@w$0u$CMt{0aG@{ z*LzDP;M5atyiDvXu^0w+h`^MHNiF%v06SK|0WTaQDFr7&J#SfsF67PJWJoTyp&v8` z2b}&;Dp&0HM@L42-I8}@c)J)oF%$?h(37(ILeL13Sihpu5mA^TG$Ok<27(|*P6G+5 zs(|xX$m6G!C+kH4=dJ*HNJX*PC*RpKz47vn zOFL#a&z-rMw;{Uc$tgoL_o;;lNOU&9bwm3JIR6DghSyJ_S#g&S0R!!|su7D!ghl^_O6>#Rgs;Ftt zXQ|w$77ICB^QZSpq*uQQZAN=uYS~qx`eWTLr{+&fcA53pES(;NuWwTA$}wKwtVj4g zNU#qEln|^N(pu7>1T1pCy!K^sX-t}G z%dSDfXqH`ai7SP2;5DZ=)Camq2JcZ9R(Na~*m?$q#+(*Qnntw!V&gP2dPvzgc_aOk zK?dBi8nIE@R&12UjE&MH;0z`1b6wvU@7v5aE9^};@(5yN+*WLT%w%>oIzTa)MTbnN zG6+ni!V+80DI3W$!5Q|y7myld`~@B%eDBXXK@VBGtGj8}2{GlrZ7V#}^Qt z5xR~_&i{W&o3&dfY@tqKdm?RzBCpLdMB*uMicb52J-~2@CKQN#OxYmIp+5HFh=)^t zoY5d1N+5!TlocdC4jf>+KCD?oV?(JDQGN*jJei$D-jh;#WIqOV4w_skOB%5biQ6Y_ zq_SwA_-AY=Q%11}XkZ?qcQnQ+nbH<3GL z<;)c%pl@0zZb}rdj}@tkOC9U}ht1D)8#jQ2T+^VFr6FX|=2AeD{ftg{EdCCGGgZWhn 
zV`bD>d2fR#FRnW{dOfBhyzyw#U1yDX~T-C479hvs^_YUein^;KF5uQlQQ^=&E& z@6hcs8s5_y@CImrZg^yX55XPQfH(m9ex|@GJdJKs_D_VsxZg3QcJx@QNg6}p`G%-xj8*Ns#9>q%xJgvYnAVa-0 zgnlx$6X7Y81Nk%gT7mmoFmM^$JY|DY37f-A8w7K(7xBdHhsT602M9McxhRLpA|lVk<-*&R3n+!7+; z;kWIbD&U$3$rq`$Fkr?3{gZ&1{4KkvFpCgU1`|Sknx@DqX8eN6q!a34a~fpHxxpqn95Q(z8gl~TF&pj9whbb(HRFan}kh3!V_yToKmi(e;< zbVw2?nFoj|CrnTSZV~f@aL$y?_0Aye3(GF0KctZ%r#h$;vDM%TFq`|GwDHk|jhy#^ zt)@w{)sQBOa8oXY4k$!B9vI`oc@nmzHOZiQ?2;=DRo}0Ggq$+DEhgkVroT>s{5N?%M7n8j&h zCMs>Sw{B0wr+X$7ZJ>d%l`r3S>fwr*_*r=f%cZn!NG*P3? zRB_z^J1t4~B|j9%6C1xG^gXhOMIC8HI%u!IkjGxW*~DlF5lQh};)=zda2*Roz+f&B`^ zt$dGw;kyXP=$6rzFcilO#R)@W%+NS*a4*<$6Sm5jtukS&i`nWDw&s|v`MmmLTk)&M zr|`(V`0V*-XS!#ZPULsovt`lJwz$nRxf}Ay+@k5{zf~Kp^u$Zsujc-+=m$lARx_%{|o1qsZJnprV_x{$7XX$%=n#>E5~mbTawN#aILE=gXe%sJ?w^Uxs%-==N2Z7 zl}X$7m5kFo{gfV}%-qVE6SEC-4e#jYo!h7Mi|S%i*`nHJTazp(OBAe%6|90Za9X`s zuD6wedY;*tC~uFIw&^m%h$(?HcV?jDyf+6B*Plp72B*YUfhIVrDZdZ&D6cN zA5?K^C0uwwUTeH$`~&kX7@r(GfADvEZtu|L=|B4<2ethadf@r4PdE#T%`5({X14P? z`dD5=GQZ@Oqwq3BykEB`i_52V3y#8sqc-NKCGPBD^<52oB!C-G+Aw=!uHkCKySn+3 z-P8Kan*V|N*Na8S_ScIIN)3LkRN*(F6POP#?JV81U3I;@qPt#uebX9*-}5-So7L~_ z=v3{=SHG`v>~X8#&n?D>_nS?7oVxedbms4|>V9I-;lod?I=uRcQ$ykW+&vAtpER(r zJ9p0(-A}e?kSvuCZ=u9%Cv*z|(VeS;eWDJWB3t}R{8&!ClQ-iGoHG;nN&pu!M0E!6 z1`*W1Lh5$)keR0xz}5Dbi0F$bt6NEtLg+ReNC_fSL{zuaRxj`tdbi zY@3F{K>Mg@?HBTlm&@BOUZv$M5;KhLvB#JOMGq|TmTf0teTP|&RZCk?ViW*%XTAjBGbOn0QIi~8*L&V>Gsngn7qPk{Rbr$Kb z<(Lp&VkxCC$A64!y`mQ9MCKWE{c1@)*$Rq8L7ubmbh4WPEMbx3R01H4K*E{tKVp%S z9G2>^>;f98czBRl0f;iZFq$8v#kZ10 z4h!zK2Qtawxfv^=a0TyYDUe*u$x$#P+HdI^_^C?CckUyA@8F_%G)DAOW6iV^NS z4zNp~eZ-jN;tB(kNLf(`v(-cENaPbuvZcfL!Q*R}FmtmkrZkr*%^lCfRud{mO`z^f zu6f7Lk7q0!2YmD3)Dgxe7dD~Pad%cGdDdNGGvfCMTXDzv4$C$Hg^Nup$kLub3~XT+ z-MZ+;wg}q%T(0NXE`gi4QHcK9WO+cAh6-npfrVO@pq7aoFo#BG{?^f7ad?0cazAeM zFLoZeqfzIXKiqlb>dC3S7Z0C5JhOk!c+=Vu-Fbw7yaPRwX9h}8T>cQY_0~+T%1X9% ze(B{ymkzylc&a;DRDSt|OE0`Ok+9Z9t#z<2y=@@2walab>=QMLTrA~m^`DYK6ys%! 
zjO*WO+f}W)s@qkd`HPZWcKtOw7}o`w>swX3@{HHF84!L?XF`6A^YnhZg*r3B@FeIN%W-^F;xVA;`rb?W=&u;3KO|92ifsI&@m!98==r2;s(YK*>wu%a3dKPI}bG% zeh=ZG0rGm*=S(R;FESicIsUUaLp0-q8NTiWpO;0_(4AN?OVdmRx<}hhO>A zuf$8ce(;sK^)u^V-}22ZSHBV~?TQz6Paeh@ce5igI@|8URh#3MEk7{N8X7}p{t{jLNo8}r4&0Ay5TNBNlG5j}n zCJXCsQ$o1%(}AAg5MBllXJSEt#D&}$zeJ$0tzB_EiOvqBLPZ{;cqFg)5U4nud!#CbNi&(Fy4617 zQb|+K_2{{y@bz#MR>4rqVMjQMzN9jwy~?cg9Akn>(cB_SLP_b6%rs!+59Jm3bUGP; ziynB;=INxj6MnEZ{fcUf?41cn2YmEMb6#8+IBf<;vcRP=Gv@FOk=Gh>%`^np2JkI) z!cVF(gwmiaS~QZMC-Jk#wRhxb%LYCG?n>Y7p|E!N4sA|*zPKV>8474S%Ll`lc76m*>y{AH4a zaDxafzhvZFG{Y3O!5!zrV>M@YR-aXT`h`$_C}?1bWMgZ{^+Ua~Fg@h2r1&0c-cRoS z9vBaf^vMzF19CzPEnMwO-GoC4vP(5B%li=sqr(29)>$#P3!7gdBvAEU)ucI*4b@}dCiTy4PcMk?8)5Lx#OZg-*{VRv~?#A9NML-B{QDQ zZ*2a~)>+S06`X=N*WA${b`+_IpPpILO>4`i3zqy(K;qkhz~Z1(145rO<-eCYb9Qdx zuH|+%;BkGEWtSF|LdHNUl*o0w%-rYY>A=nhB1o|8lD~^MpaAt)=v!$B8~a)ZCJ@1o zWmh&jBBCL>16Y?jq8&$FfN5r(%tXTiLzLtbU5&kG?}3OIOd2<0=dRf*-wd>i&Dp%ciHV|5Z1t=TpQ>f1? zm>-!u-)vM06a^4dyt01R@Y;U;6xXlfbuU{bJz4$uu3dnsFue51i=OTAWP=G86{038 z191{&2ZwtjfWrjz1_GzyOphv_9vy*d+p}VtGrr{Sr3Oce+s@``> z-rXNBK8#6~1LM27L-Cx=QR8M&y;x%4UP4u{{ZcgrsutURpOaXc1zD>o%pu-os#mL6 zuSLB`8atX7@A^7L(ez^4qTUqeku3ym`$TFm&jb^h7^I6U2GbnTHLHhBo2hC?3zYuPNDGiAjGhk%e!|uR&=tX1+&SxQzK8dW|mm36L)F{nX-|yn+Z|zbLz)@D^T( zV@lgS&N&fhUn++XO^DffY08baD=);Bu9%9-cX3YAiU~C!Pt3^=jX1h>dT8EQmo(bw zY;o~y@GH49-QO&kH`XSNc5&3-r0Qu{eZ-o*jTg)x%5(B`D^$u^QV-#e5%WJ%nQbWZ zGjesU(dB$=?Dfc%2w=1>=Wn|I_TXO~WWf)P{Xg^uK-dZ|K&7l`9ECe_CZ#B1`-1BDKM6KDDr4s)!YxQ5H!bmt9Dkzw8x$5&|$- znB~1T>%msqYzNGt70RLIv~Qy}$S^(OGwM~It)z6!3qQ_?o29={S5l8$b`83gy}(aM zzrIAjW|Ug9GOzoTnx)s|G6(Bc=E7@F0YwyE{NlMWMbd2TZCsgq+SE9kV-2}pq~bo< zvNAQ;lxjHl*2yq6tMhJsMlRXMLD6hXmykYaK!tU7;V;qpFHk;<3tu?r^lk&wVc(~? 
z>>=9RpLYGrh;ir@@$i@}YH&*)jgJ8UER8xe2Cu^>{4OyzoCs(50b(w?e0|U$CxnT7 z>yp+MNO3bEnM1HSg$WLw^#w=Mojecb0`VAv=UN${Sz1o$5If|zGp}7o+hrs~gq|}+T;X-4LxG}&M;aOtn zTpFMBaXoe+M98$l7PXvTK%=!uhsIO1J2}$YGu|y>@>VT6-NzJ1o zSzbulA2>_p)9WRp3n$X>b4DhrZdEGx=Dp>Rn~GB$Jq zjRf<_FP>r*QluutNM%A3ktf3pH-vTAMc~uZBo#X%z!x*IQ=xI~I|rBC(6q)`#Bh@y z@(u_v@+A|1UlMB^Y@0ttxCfpziZBRU=T{LHaXF)H{R~CqP zB#X*nu+hBKCjy}eFe1h*7ctmBqrCo{Zht{vbl%Xp;XVj0J_jY>P^7=a~vh z?`ka2F)Nnxoavq?|H{Q5Be$F@Idfy=C@#_dNUZ&l`Qpy0eMy|uW37%^t7q%xt&Q^W z!|TZWH&NCSD{G18v?g*oVmTdGcizZZe+${bkw~JrDF&T6T@uaOe`WvchprrYyZ-xa zZ?(nCH==pv^@(y%tlTpfy4v;bzG%58Ufz=^e=Ju1SoFw~@$#pr2MS0BDdBL%9In}t zpE+7?;f?*0eI_*9b#>pg{b!Etw_#M5+a_XdBr#4|*u)R9^TU@Q?Iez5xRVPI>3+Bq zoQYxd=n}d!RDNi7F1vUI)-v>1QA$fk1Is!svI05}p{WXoxTAFO%Es?w$1JF2h{X7M zAOq5P7q5pwSvIeY<-B2uMX^OLCH>@$5e@irrf(}Y#B_8kw4F7)1xLoxH`}*0AV2JU z4(}Myf|Zh$=gB*jJ{sh6d0i0391$%PlzftTh#n9(J65L$X&|*9_+$xUWrLo2Xw;_< zf0lc$9kn64d)dY3$+hA086|CeNDmjly#$7O;*g$tby3uv2{j`@kh)i#?GH^MzWN*fa^E{o;z&3QsYLtN5{c~XgZRx;> zeSkjzO?{AT2gLInME(S;2;1x#7T|+SpdWC9ZRq5;0gO)+I0DH6%45&~0t8XM;F1+5 zM8P?O_Bhcq!yy$r$|7A`z#CQ^;0>e?EHlM@vjStprPk^K!$ox5PqLCxfvnd6|KRJn zhvX)BC-`B~J50{0SWcbr%b7lUftAw)%QERGju9a+5CFCzvUd4Q4N3*FvC5@SpvYMG zt|mGR6j^oQL0Ag}E)XB!fCh5<$i{==1dJ&2+rT{l-MXoG^wN ztrxpw+~heW&C~Huy1E}d(!HzmSa%oB3}GI)7|>98XEBfd*S%8KPuH#53g6H_&r%hsHAT0f8X{Z%hj2+XS^zgjj@N zLXN~zvCeiWb;Uf+EKKnQth_}8xy?2Ymcy_|ia-9s!=mos1!#g-h}_87~F4 zD(M_aW1X>P(&;?uiqcpjJ!3)eo}ML~Py#~GWajP=y(H>u;JjFXrwJ90&HXed8N8Z7 z3aFTzUX9}@f<8LTZn)LgZo41s|WsyuUk*9*R!K5cx zt7N+}Zgu%EL(nRCUJ+7S3XL~D>k0XT{j9OidYYa&b^5s53l`U)@`G)?b>o(FJP+pj zv#Z5f=vI%{Ey?0}bV8Q&pBHDOXEWv54kd?%WjWj@58w@104?<*9d;%`<;DtL34GhA$g!hn`Xaa(8;MCDpmoh5jTF3^B zRHkT~BcFNLXNQ5K3x46hQDGLG9RSb6jMRkufLt$oiQLGhoKbS$=`H*WwfJv@fIp4^ z5bzq#XnkekYZKGY%p0l~jP``FEM|lih}KqpSEI*qnGUX`g89*V=4yMqc;}QYX(^ty z#4IiFqo1%=#jI6}YR=iCPCD|FjtX}82zYh*#mSuVsImNGgE>YECT?CkZ&(YZjwvH6 zq$Fl6fgwjueX_o7cK=)u0yo>%UuneiIrB@WjO4Ip1N1LuHqJE9?u%MIi#qxU`{cta zlDR#mx@Bx%tjMX65B&s?>D;dGR9(%DRj-BDzWh?r{T>=v 
zvR|Dnu7M#61Uv}9Gekk*Z94~VfkkyU9CbLQrGCqzi?cc=4=^-)2~tgHFGH$-gg(C? zQvE$dq6>lz#9DsIUr7iJ0Xp{}wk2|)(w1FIawxx##4rnjzC`2wA7K=yd-X$F zN1k4k_;m+aY>Dpz3@tF65zj?3QCHJ=vl}3}59ku(w}`mg{o<7YMZiC4Dt^u);{H+CN4=LzR66J!{Ve9 zW#KLqNFBXQmiu2(^Bn}LsfyUyNHBlVa^CW)ZKfho(i$sijhDdxw&&{OKkWTM@1H&U z?(szTldudB)1<(Y=10<|Hds* zV{x>!dG7G7oXTj;>Ki$$nfXG}zUJ!T=o%u_d%hAqddZh10y{nZzv>-R+WJimyhsXEno2&$d**s1QK zhqGHvmIOG?8eYm+tAEG$`_5>7-Lx0CsJ$*`tWOx5W5(u$aaGK?>M9zyL*xbgfW}lU zgl5ymMS3HJ+sbPgf;wd}DR8>3OrYQ~NESzAW8_6e9&J5K$57YJ( zt<}iCh{Tg@r@w{k@+d@>0oe3ASq4CLP4DUFx(#)5QwTzIWpdOXr%}w zxRG9GI0q8dE5!7{kpY8d-Wq)nVEC`>0e8*p)=mFd4crln8F^@Mi9P3 znb_HuFUTYyli4#tDm+*Zr}T$L=wJ?LWno%`8K0HmdID{W!cXY7o|@B44K;4X6hQ3l z!vDkv{D&7u%zYqXDis{Xs2)-M6Ib#PXa9(^$am96Tn>KzoU8puuKnj+{m;4Lf8q_Teqmy zs-9H+T)Po+Use4_+MGqBPPJ3@kJ_ETvT~}&RlhLhPd^nmRZi-G@L8*-HYO~UF-v9K z;+iymqBp6Ww~IMd-aMCoSFceuPp`ep;c?q)Ryk(#X2WlkE^-LIqj`t_E4ZXV5GGA( z$E}^aZ)+M=)k(W^y6gPlA`824rUt!RxAC2|Kj>xQTRQuc@U`Qg=9{2rWLP zB5bd!HtAWNZ0opf`Tq*Lo}V_3D7=&PvbNV?vx{w5Z(S%qg29jqC?v?3R7oR|R7&}g zsO=$@l2q!?s1=-Y3d#jeEjM$@_GV6Sgi|6;^yaM(EVum=p0hnv$tB+#gB7VQT>O1+ z=Iu8#Z=Yw48PD{Jly7v93JgvaHE>M8R;Ci!8X=vnsFLZaOu6dzwXJX5H7`9%F)ofP z_=4KnT5#2^yY6GJYlK)2gKdj#Os#(%w!?aTOGJf5nUBO>;dgeC$|L~Mn4q1uc&JPI zx%g3tf^U96swBnfCJD4`sBsBBRF$ziqM_%7DcyiUXO1)=gf64A%%YIo=-*h~7+!x6 z#<=9R9Q61^!?-O(+V9Sh$|OjdzprdQC5;UTGzUYvG_i)tlmU)OGybm#NG&I9mO)5HiJqiOFBZGlP&;ZSDZJGZM(aQBjor z0=5PYF8fA0vncjEl5NM>;@ywPV21md#9E5w%HWIQcCm(2A(NM@UDdT(a?;bL!glt- zL@-or=mjy<8UT2Zu}K4>5u_;B^4&R4EXF<$gwoy9(6eHsB?d0VfDkj&#I+ZSvCpjH z^fmN8@nK5~TxtQKmF^*~y-+mL}j*0tks*-i^4+ z)qeNOKRP^noY?ikk+Dw^!hW{95pzVQr425%f&Zn=#n?_3r-?E{9-nt(4fZMPqU0w{ z9M1_Xj8z})O;*3#$xFMH+^C~2Ltx8EI5NKP2wOUyzP&Zr+cha4M}j15 z&ZwNf>k None: + """Download a model so that it can be immediately loaded.""" + raise NotImplementedError + + @abstractmethod + def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None: + """Load weights into a model. 
This standalone API allows + inplace weights loading for an already-initialized model""" + raise NotImplementedError + + def load_model( + self, vllm_config: VllmConfig, model_config: ModelConfig + ) -> nn.Module: + """Load a model with the given configurations.""" + device_config = vllm_config.device_config + load_config = vllm_config.load_config + load_device = ( + device_config.device if load_config.device is None else load_config.device + ) + target_device = torch.device(load_device) + with set_default_torch_dtype(model_config.dtype): + with target_device: + model = initialize_model( + vllm_config=vllm_config, model_config=model_config + ) + + logger.debug("Loading weights on %s ...", load_device) + # Quantization does not happen in `load_weights` but after it + self.load_weights(model, model_config) + process_weights_after_loading(model, model_config, target_device) + return model.eval() diff --git a/model_executor/model_loader/bitsandbytes_loader.py b/model_executor/model_loader/bitsandbytes_loader.py new file mode 100644 index 0000000..97c7a20 --- /dev/null +++ b/model_executor/model_loader/bitsandbytes_loader.py @@ -0,0 +1,822 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: SIM117 +import fnmatch +import glob +import itertools +import math +import os +from collections.abc import Callable, Generator +from typing import Any + +import numpy as np +import torch +from huggingface_hub import HfApi +from packaging import version +from torch import nn +from transformers.utils import SAFE_WEIGHTS_INDEX_NAME + +from vllm.config import ModelConfig +from vllm.config.load import LoadConfig +from vllm.distributed import ( + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.linear import ( + LinearBase, + MergedColumnParallelLinear, + 
QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear, +) +from vllm.model_executor.model_loader.base_loader import BaseModelLoader +from vllm.model_executor.model_loader.utils import ParamMapping +from vllm.model_executor.model_loader.weight_utils import ( + download_safetensors_index_file_from_hf, + download_weights_from_hf, + filter_duplicate_safetensors_files, + filter_files_not_needed_for_inference, + pt_weights_iterator, + safetensors_weights_iterator, +) +from vllm.model_executor.models import is_pooling_model +from vllm.model_executor.utils import ( + get_moe_expert_mapping, + get_packed_modules_mapping, + set_weight_attrs, +) +from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_default_torch_dtype + +logger = init_logger(__name__) + + +def is_moe_model(model: torch.nn.Module) -> bool: + """Checks if the model contains FusedMoE layers.""" + return bool(any(isinstance(module, FusedMoE) for module in model.modules())) + + +class BitsAndBytesModelLoader(BaseModelLoader): + """Model loader to load model weights with BitAndBytes quantization.""" + + possible_config_file_names = ["adapter_config.json"] + + def __init__(self, load_config: LoadConfig): + super().__init__(load_config) + + # Save the module names without sharding. + self.unsharded_weights_modules: list[str] = [] + # Save the module names that are sharded by column. + self.column_sharded_weights_modules: list[str] = [] + # Modules whose weights might have fused on disk + # we need their output_sizes to make shard in flight correctly with TP + self.maybe_fused_weights_modules: dict[str, list[int]] = {} + # Store all module names (from transformers) that support + # BNB quantization. + self.target_modules: list[str] = [] + self.tp_disabled_modules: list[str] = [] + # Store the mapping of expert parameters for MoE models. + self.expert_params_mapping: list[tuple[str, str, int, str]] = [] + # mapping weight names from transformers to vllm. 
+ self.weight_mapper: Callable = lambda name: name + self.pre_quant: bool = False + self.load_8bit: bool = False + self.is_pool_model: bool = False + + def _get_weight_files( + self, + model_name_or_path: str, + allowed_patterns: list[str], + revision: str | None = None, + ) -> tuple[str, list[str], str]: + """Retrieve weight files. Download the files if necessary. + + Return the weight files and the file pattern.""" + is_local = os.path.isdir(model_name_or_path) + + if is_local: + for pattern in allowed_patterns: + weight_files = glob.glob(os.path.join(model_name_or_path, pattern)) + if weight_files: + return model_name_or_path, weight_files, pattern + else: + hf_api = HfApi() + repo_files = hf_api.list_repo_files(repo_id=model_name_or_path) + for pattern in allowed_patterns: + matching_files = fnmatch.filter(repo_files, pattern) + if matching_files: + hf_folder = download_weights_from_hf( + model_name_or_path, + self.load_config.download_dir, + [pattern], + revision, + ignore_patterns=self.load_config.ignore_patterns, + ) + return ( + hf_folder, + glob.glob(os.path.join(hf_folder, pattern)), + pattern, + ) + + raise RuntimeError(f"No model weights found in: `{model_name_or_path}`") + + def _prepare_weights( + self, model_name_or_path: str, revision: str | None + ) -> tuple[list[str], bool]: + """Prepare weight files for the model.""" + + allowed_patterns = ["*.safetensors", "*.bin", "*.pt"] + + hf_folder, hf_weights_files, matched_pattern = self._get_weight_files( + model_name_or_path, allowed_patterns, revision + ) + + use_safetensors = matched_pattern == "*.safetensors" + is_local = os.path.isdir(model_name_or_path) + index_file = SAFE_WEIGHTS_INDEX_NAME + if use_safetensors: + # For models like Mistral-7B-Instruct-v0.3 + # there are both sharded safetensors files and a consolidated + # safetensors file. Using both breaks. + # Here, we download the `model.safetensors.index.json` and filter + # any files not found in the index. 
def _hf_weight_iter(self, hf_weights_files, use_safetensors: bool):
    """Yield ``(original_name, mapped_name, tensor)`` for each checkpoint weight.

    ``mapped_name`` is the checkpoint name passed through ``self.weight_mapper``
    and, for pooling models whose first target module starts with ``model.``,
    prefixed with ``model.`` when that prefix is missing.
    """
    show_progress = self.load_config.use_tqdm_on_load
    if use_safetensors:
        raw_weights = safetensors_weights_iterator(hf_weights_files, show_progress)
    else:
        raw_weights = pt_weights_iterator(
            hf_weights_files,
            show_progress,
            self.load_config.pt_load_map_location,
        )

    def _with_pool_prefix(weight_name: str):
        # For pool model, we need to add the prefix `model.`
        # for the weight name if possible.
        needs_prefix = (
            self.is_pool_model
            and self.target_modules[0].startswith("model.")
            and not weight_name.startswith("model.")
        )
        if needs_prefix:
            return "model." + weight_name
        return weight_name

    for original_name, tensor in raw_weights:
        # Map transformers names to vllm names while keeping the original
        # name available to the caller.
        vllm_name = _with_pool_prefix(self.weight_mapper(original_name))
        yield original_name, vllm_name, tensor
def _is_8bit_weight_name(self, weight_name: str):
    """Return True if the name ends (case-insensitively) with ``.scb`` or
    ``.weight_format``."""
    return weight_name.lower().endswith((".scb", ".weight_format"))


def _is_4bit_weight_name(self, weight_name: str):
    """Return True if the last dotted component of the name contains one of
    the 4-bit quant-state markers."""
    quantized_suffix = {
        "absmax",
        "quant_map",
        "nested_absmax",
        "nested_quant_map",
        "bitsandbytes",
    }
    suffix = weight_name.split(".")[-1]
    return any(q_suffix in suffix for q_suffix in quantized_suffix)


def _quantized_8bit_generator(
    self, hf_weights_files, use_safetensors, quant_state_dict
) -> Generator:
    """Yield ``(original_name, tensor)`` for a pre-quantized 8-bit checkpoint.

    A first pass stores every ``*.SCB`` tensor in ``quant_state_dict`` under
    the name of the weight it belongs to. A second pass yields all tensors
    that are not 8-bit auxiliary tensors and tags those that have an ``SCB``
    entry with ``load_in_8bit``.

    Args:
        hf_weights_files: Checkpoint files to read.
        use_safetensors: Whether the files are safetensors.
        quant_state_dict: Filled in place, keyed by mapped weight name.
    """
    scb_suffix = ".scb"
    for _, mapped_weight_name, weight_tensor in self._hf_weight_iter(
        hf_weights_files, use_safetensors
    ):
        if not mapped_weight_name.lower().endswith(scb_suffix):
            continue
        # Swap only the suffix and keep the original casing of the module
        # path: the lookup in the second pass is case-sensitive, so a fully
        # lower-cased key would never match names such as `...Wqkv.weight`.
        weight_key = mapped_weight_name[: -len(scb_suffix)] + ".weight"
        quant_state_dict[weight_key] = weight_tensor

    for org_weight_name, mapped_weight_name, weight_tensor in self._hf_weight_iter(
        hf_weights_files, use_safetensors
    ):
        if self._is_8bit_weight_name(mapped_weight_name):
            continue

        if mapped_weight_name in quant_state_dict:
            set_weight_attrs(weight_tensor, {"load_in_8bit": True})
        yield org_weight_name, weight_tensor
def _quantized_4bit_generator(
    self, hf_weights_files, use_safetensors, quant_state_dict
) -> Generator:
    """Yield ``(original_name, tensor)`` for a pre-quantized 4-bit checkpoint.

    A first pass collects every quant-state tensor. A second pass yields all
    other tensors and, for each weight that has a
    ``quant_state.bitsandbytes__nf4`` / ``__fp4`` entry, builds its
    ``QuantState`` and stores it in ``quant_state_dict``.

    Args:
        hf_weights_files: Checkpoint files to read.
        use_safetensors: Whether the files are safetensors.
        quant_state_dict: Filled in place, keyed by mapped weight name.
    """
    from bitsandbytes.functional import QuantState

    # First iterate over all quant state weights
    temp_state_dict = {}
    for _, mapped_weight_name, weight_tensor in self._hf_weight_iter(
        hf_weights_files, use_safetensors
    ):
        if not self._is_4bit_weight_name(mapped_weight_name):
            continue
        # bitsandbytes library requires
        # weight.quant_state.bitsandbytes__* in CPU
        if "quant_state.bitsandbytes" in mapped_weight_name:
            temp_state_dict[mapped_weight_name] = weight_tensor.cpu().data
        else:
            temp_state_dict[mapped_weight_name] = weight_tensor

    # Closure to parse quant_state for each prequant weight
    def _parse_quant_state(param_name: str, temp_state_dict: dict) -> QuantState:
        # Match on the prefix, not on a substring: a substring test would
        # also collect tensors of any other parameter whose full name merely
        # contains this one.
        prefix = param_name + "."
        quant_state = {
            key: value
            for key, value in temp_state_dict.items()
            if key.startswith(prefix)
        }
        return QuantState.from_dict(
            quant_state, device=current_platform.device_type
        )

    # Second iterate over all prequant and normal weights
    # pre quantized weights would have a quant_state
    for org_weight_name, mapped_weight_name, weight_tensor in self._hf_weight_iter(
        hf_weights_files, use_safetensors
    ):
        if self._is_4bit_weight_name(mapped_weight_name):
            continue

        if (
            f"{mapped_weight_name}.quant_state.bitsandbytes__nf4" in temp_state_dict
            or f"{mapped_weight_name}.quant_state.bitsandbytes__fp4"
            in temp_state_dict
        ):
            quant_state_dict[mapped_weight_name] = _parse_quant_state(
                mapped_weight_name, temp_state_dict
            )
        yield org_weight_name, weight_tensor
get_tensor_model_parallel_rank() + check_match = ( + lambda weight_name, module_name: weight_name.removesuffix(".weight") + == module_name + ) + for ( + org_weight_name, + mapped_weight_name, + weight_tensor, + ) in self._hf_weight_iter(hf_weights_files, use_safetensors): + # override tp_size and tp_rank if the module has disabled TP + if any( + tp_disabled_module in mapped_weight_name + for tp_disabled_module in self.tp_disabled_modules + ): + tp_size = 1 + tp_rank = 0 + else: + tp_size = global_tp_size + tp_rank = global_tp_rank + + if any( + target_module in mapped_weight_name + for target_module in self.target_modules + ) and mapped_weight_name.endswith(".weight"): + # Without sharding + if any( + check_match(mapped_weight_name, module) + for module in self.unsharded_weights_modules + ): + weight_sub_tensor = weight_tensor + # Shard by column + elif any( + check_match(mapped_weight_name, module) + for module in self.column_sharded_weights_modules + ): + total_size = weight_tensor.size(-1) + start_index = total_size // tp_size * tp_rank + end_index = total_size // tp_size * (tp_rank + 1) + weight_sub_tensor = weight_tensor[..., start_index:end_index] + # Weights have fused on disk. In this case, we assume that the + # weight and module use same name. 
+ elif any( + check_match(mapped_weight_name, module) + for module in self.maybe_fused_weights_modules + ): + # special case for fused weights + # get the size of each shard weight tensor + total_shard_sizes = next( + ( + sizes + for module, sizes in self.maybe_fused_weights_modules.items() # noqa: E501 + if check_match(mapped_weight_name, module) + ) + ) + total_size = weight_tensor.size(0) + assert total_size == sum(total_shard_sizes) + # get the start/end index of each shard weight tensor + total_start_index = list( + itertools.accumulate([0] + total_shard_sizes) + )[:-1] + shard_weights_index = [ + ( + idx + size // tp_size * tp_rank, + idx + size // tp_size * (tp_rank + 1), + ) + for idx, size in zip(total_start_index, total_shard_sizes) + ] + # slice and reorder the weight tensor + weight_tensor = [ + weight_tensor[start_index:end_index, ...] + for start_index, end_index in shard_weights_index + ] + weight_sub_tensor = torch.cat(weight_tensor, dim=0) + # Shard by row + else: + total_size = weight_tensor.size(0) + start_index = total_size // tp_size * tp_rank + end_index = total_size // tp_size * (tp_rank + 1) + weight_sub_tensor = weight_tensor[start_index:end_index, ...] 
def _get_bnb_target_modules(self, model: nn.Module) -> None:
    """Collect the names of all modules that support BitsAndBytes quantization.

    Fills ``self.target_modules`` (and ``self.tp_disabled_modules`` for
    linear modules with ``disable_tp`` set).

    Raises:
        ValueError: For a FusedMoE layer when loading a pre-quantized 8-bit
            checkpoint.
        AssertionError: If no target module was found for the model.
    """
    for name, module in model.named_modules():
        if isinstance(module, LinearBase) and hasattr(
            module.quant_method, "quant_config"
        ):
            if modules_info := self.modules_mapping.get_sub_modules(name):
                # Map vllm's names to transformers's names.
                rep_name, sub_modules = modules_info
                for sub_name in sub_modules:
                    new_name = name.replace(rep_name, sub_name)
                    self.target_modules.append(new_name)
                    if module.disable_tp:
                        self.tp_disabled_modules.append(new_name)
            # Add original module name even if the module has stacked map,
            # in case model has a mixture of disk-merged and disk-split
            # weights with same last name.
            self.target_modules.append(name)
            if module.disable_tp:
                self.tp_disabled_modules.append(name)
        elif isinstance(module, FusedMoE) and hasattr(
            module.quant_method, "quant_config"
        ):
            # TODO: support FusedMoE with prequant and 8bit.
            if self.pre_quant and self.load_8bit:
                raise ValueError(
                    "Prequant BitsAndBytes 8bit models with FusedMoE "
                    "is not supported yet."
                )
            # Get the corresponding weight name using module name and
            # expert_params_mapping.
            for exp in self.expert_params_mapping:
                weight_name = exp[1]
                rep_name = name.replace("experts", "") + weight_name.removesuffix(
                    "."
                )
                self.target_modules.append(rep_name)

    # The model name has to live inside the parentheses: as a separate
    # statement after the assert it is a no-op expression and the message
    # ends at "for".
    assert self.target_modules, (
        "vLLM currently does not support BNB quantization for"
        f" {type(model).__name__}"
    )


def _classify_module_sharding(self, model: nn.Module):
    """Categorize modules based on their weight sharding requirements for
    tensor parallelism.

    Fills ``self.unsharded_weights_modules``,
    ``self.maybe_fused_weights_modules`` and
    ``self.column_sharded_weights_modules``.
    """
    for name, module in model.named_modules():
        # Some modules like `ReplicatedLinear` should not have their weights
        # sharded. The reason for implementing it this way is to avoid new
        # static variable in the model implementation.
        if isinstance(module, (ReplicatedLinear,)):
            self.unsharded_weights_modules.append(name)
        # `QKVParallelLinear` and `MergedColumnParallelLinear` might have
        # fused weights on disk. We need to use the output sizes of these
        # modules to shard the weights correctly.
        elif isinstance(module, (QKVParallelLinear, MergedColumnParallelLinear)):
            self.maybe_fused_weights_modules[name] = module.output_sizes
        # In TP, these weights are partitioned along the column
        # dimension (dim=-1)
        elif isinstance(module, (RowParallelLinear,)):
            self.column_sharded_weights_modules.append(name)
        elif isinstance(module, FusedMoE):
            layer_prefix = name.replace("experts", "")
            for exp in self.expert_params_mapping:
                # Only the `w2` expert weights are recorded as column-sharded.
                if exp[-1] == "w2":
                    rep_name = layer_prefix + exp[1].removesuffix(".")
                    self.column_sharded_weights_modules.append(rep_name)
+ ) + + if not hasattr(model, "packed_modules_mapping"): + raise AttributeError( + f"Model {type(model).__name__} does not support BitsAndBytes " + "quantization yet. No 'packed_modules_mapping' found." + ) + + quant_config = getattr(model_config.hf_config, "quantization_config", None) + if quant_config and (quant_method := quant_config.get("quant_method")): + if quant_method == "bitsandbytes": + self.pre_quant = True + else: + raise ValueError( + f"BitsAndBytes loader does not support {quant_method} quantization" + ) + + # The quant_states in pre_quantized models cannot work with a split + # weight tensor. So TP does not work with pre_quantized bnb models. + if self.pre_quant and get_tensor_model_parallel_world_size() > 1: + raise ValueError( + "Prequant BitsAndBytes models with tensor parallelism is not " + "supported. Please try with pipeline parallelism." + ) + if quant_config and self.pre_quant: + self.load_8bit = quant_config.get("load_in_8bit", False) + + def _initialize_loader_state( + self, model: nn.Module, model_config: ModelConfig + ) -> None: + """ + Initialize the loader's internal state based on the model and + configuration. + """ + self.is_pool_model = is_pooling_model(model) + self.modules_mapping = ParamMapping(get_packed_modules_mapping(model)) + + if is_moe_model(model): + self.expert_params_mapping = get_moe_expert_mapping(model) + if not self.expert_params_mapping: + raise AttributeError( + f"MoE Model {type(model).__name__} does not support " + "BitsAndBytes quantization yet. Ensure this model has " + "'get_expert_mapping' method." + ) + # For some models like Molmo, we need to use hf_to_vllm_mapper + # to ensure correct loading of weights. 
def _dequantize_dq(self, quant_states: Any):
    """Undo BNB double quantization on the given quant state(s) in place.

    When BNB employs Double Quantization, the quantization constants are
    dequantized once during weight loading instead of at inference time.
    This avoids the computational overhead during inference at the cost of
    increased memory usage.

    Accepts either a single state or a dict of states and returns the same
    object that was passed in.
    """
    from bitsandbytes.functional import QuantState, dequantize_blockwise

    def _flatten_nested(state):
        """Replace the nested absmax of one QuantState by a plain tensor."""
        is_nested = isinstance(state, QuantState) and state.nested
        if not is_nested:
            return

        # Copied from: https://github.com/bitsandbytes-foundation/bitsandbytes/blob/0.45.3/bitsandbytes/functional.py#L1352-#L1356
        plain_absmax = dequantize_blockwise(state.absmax, state.state2)
        plain_absmax += state.offset

        # Ensure float32 dtype
        if plain_absmax.dtype != torch.float32:
            plain_absmax = plain_absmax.float()

        state.absmax = plain_absmax
        state.nested = False
        state.offset = None
        state.state2 = None

    if isinstance(quant_states, dict):
        candidates = quant_states.values()
    else:
        candidates = (quant_states,)
    for candidate in candidates:
        _flatten_nested(candidate)
    return quant_states
+ """ + from bitsandbytes.functional import QuantState + + if not self.expert_params_mapping: + return dict() + + expert_mapping = self.expert_params_mapping + expert_qs_dict = {} + for name, module in model.named_modules(): + if not isinstance(module, FusedMoE): + continue + w1_states_lst = [] + w2_states_lst = [] + w3_states_lst = [] + for exp in expert_mapping: + shard_id = exp[-1] + if shard_id not in ("w1", "w2", "w3"): + raise ValueError( + f"shard_id must be ['w1','w2','w3'] but got {shard_id}." + ) + layer_prefix = name.split("experts")[0] + weight_qual_name = layer_prefix + exp[1] + "weight" + quant_state = self._dequantize_dq(quant_states_dict[weight_qual_name]) + if shard_id == "w1": + w1_states_lst.append(quant_state) + elif shard_id == "w2": + w2_states_lst.append(quant_state) + else: + w3_states_lst.append(quant_state) + del quant_states_dict[weight_qual_name] + assert len(w1_states_lst) == len(w2_states_lst) == len(w3_states_lst) + w13_absmax_lst = [] + w2_absmax_lst = [] + w13_total_dim0 = 0 + w2_total_dim0 = 0 + for w1_qs, w2_qs, w3_qs in zip(w1_states_lst, w2_states_lst, w3_states_lst): + assert w1_qs.shape == w3_qs.shape + assert w1_qs.blocksize == w2_qs.blocksize == w3_qs.blocksize + assert w1_qs.dtype == w2_qs.dtype == w3_qs.dtype + # w1 and w3 are interleaved in storage + w13_absmax_lst.append(w1_qs.absmax) + w13_absmax_lst.append(w3_qs.absmax) + w2_absmax_lst.append(w2_qs.absmax) + w13_total_dim0 += w1_qs.shape[0] + w3_qs.shape[0] + w2_total_dim0 += w2_qs.shape[0] + + w13_absmax = torch.cat(w13_absmax_lst) + w2_absmax = torch.cat(w2_absmax_lst) + # Create fused quantization state for w13. + w13_qs = QuantState( + absmax=w13_absmax, + shape=(w13_total_dim0, w1_states_lst[0].shape[1]), + code=w1_states_lst[0].code, + blocksize=w1_states_lst[0].blocksize, + quant_type="nf4", + dtype=w1_states_lst[0].dtype, + ) + # Create fused quantization state for w2. 
def _stack_quantization_states(
    self, model: nn.Module, quant_state_dict: dict
) -> dict[str, dict[int, Any]]:
    """Group quant states by the model parameter they belong to.

    Each name in ``quant_state_dict`` is renamed to its packed parameter name
    when a rename via ``self.modules_mapping.inverse_packed_mapping`` is
    needed and possible. Names that are not parameters of ``model`` after
    that step are dropped.

    Returns:
        ``{param_name: {shard_index: quant_state}}``; ``shard_index`` is 0
        for names that were not renamed.
    """
    stacked_quant_state_dict: dict[str, dict[int, Any]] = {}
    # TODO: Change this lazy import to normal import
    # after the checks are updated to run on a new version
    from vllm.model_executor.models.utils import is_pp_missing_parameter

    param_dict = dict(model.named_parameters())
    for quant_param_name in quant_state_dict:
        if is_pp_missing_parameter(quant_param_name, model):
            continue

        # Keep the checkpoint-side name: it is the key into quant_state_dict
        # even after quant_param_name has been renamed below.
        non_stacked_param_name = quant_param_name

        shard_index = 0
        for shard_name, (
            weight_name,
            index,
        ) in self.modules_mapping.inverse_packed_mapping.items():
            # Some models, such as MiniCPM V2.5/2.6, contain both
            # module names 'kv_proj' and 'qkv_proj'. To prevent 'kv_proj'
            # from being incorrectly identified as being present in
            # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight', the match
            # must start right after a '.'.
            shard_pos = quant_param_name.find(shard_name)
            can_correct_rename = (shard_pos > 0) and (
                quant_param_name[shard_pos - 1] == "."
            )
            # If the quant_param_name is packed, it won't occur in the
            # param_dict before renaming.
            new_quant_param_name = quant_param_name.replace(shard_name, weight_name)
            need_rename = (quant_param_name not in param_dict) and (
                new_quant_param_name in param_dict
            )
            if can_correct_rename and need_rename:
                shard_index = index
                quant_param_name = new_quant_param_name
                break

        # Models like Clip/Siglip may skip some layers in initialization,
        # causing unused quant_param_name in state_dict.
        if quant_param_name not in param_dict:
            continue

        if quant_param_name not in stacked_quant_state_dict:
            stacked_quant_state_dict[quant_param_name] = {}

        stacked_quant_state_dict[quant_param_name][shard_index] = quant_state_dict[
            non_stacked_param_name
        ]
    return stacked_quant_state_dict
def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
    """Load the checkpoint into ``model`` and attach the BNB quant states.

    Raises:
        ValueError: If ``model.load_weights`` reports the set of loaded
            names and some model parameters are missing from it.
    """
    self._verify_model_compatibility(model, model_config)
    self._initialize_loader_state(model, model_config)

    logger.info(
        "Loading weights with BitsAndBytes quantization. May take a while ..."
    )
    # quant_state_dict is filled lazily while the iterator is consumed by
    # model.load_weights below.
    qweight_iterator, quant_state_dict = self._get_quantized_weights_iterator(
        model_config.model,
        model_config.revision,
    )
    weights_to_load = {name for name, _ in model.named_parameters()}
    loaded_weights = model.load_weights(qweight_iterator)
    # Some models may have weights loading tracker unimplemented.
    if loaded_weights is not None:
        weights_not_loaded = weights_to_load - loaded_weights
        if weights_not_loaded:
            raise ValueError(
                "Following weights were not initialized from "
                f"checkpoint: {weights_not_loaded}"
            )
    # Fuse the expert states first: this call removes the per-expert entries
    # it consumes from quant_state_dict, so only the remaining entries are
    # stacked afterwards.
    expert_quant_state_dict = self._fuse_moe_quant_states(model, quant_state_dict)

    stacked_quant_state_dict = self._stack_quantization_states(
        model, quant_state_dict
    )

    # On a key clash the stacked (non-expert) entry wins.
    stacked_quant_state_dict = {
        **expert_quant_state_dict,
        **stacked_quant_state_dict,
    }
    self._bind_quant_states_to_params(model, stacked_quant_state_dict)
    torch.cuda.empty_cache()


def download_model(self, model_config: ModelConfig) -> None:
    """Resolve the model's weight files via ``_prepare_weights`` without
    loading them."""
    self._prepare_weights(model_config.model, model_config.revision)
class DefaultModelLoader(BaseModelLoader):
    """Model loader that can load different file types from disk."""

    # default number of thread when enable multithread weight loading
    DEFAULT_NUM_THREADS = 8

    @dataclasses.dataclass
    class Source:
        """A source for weights."""

        model_or_path: str
        """The model ID or path."""

        revision: str | None
        """The optional model revision."""

        prefix: str = ""
        """A prefix to prepend to all weights."""

        fall_back_to_pt: bool = True
        """Whether .pt weights can be used."""

        allow_patterns_overrides: list[str] | None = None
        """If defined, weights will load exclusively using these patterns."""

    # perf_counter() timestamps around one load; 0.0 means "not started yet".
    counter_before_loading_weights: float = 0.0
    counter_after_loading_weights: float = 0.0

    def __init__(self, load_config: LoadConfig):
        """Validate the extra loader config.

        Raises:
            ValueError: If the extra config has keys other than
                ``enable_multithread_load`` and ``num_threads``.
        """
        super().__init__(load_config)

        extra_config = load_config.model_loader_extra_config
        allowed_keys = {"enable_multithread_load", "num_threads"}
        unexpected_keys = set(extra_config.keys()) - allowed_keys

        if unexpected_keys:
            raise ValueError(
                f"Unexpected extra config keys for load format "
                f"{load_config.load_format}: "
                f"{unexpected_keys}"
            )

    def _prepare_weights(
        self,
        model_name_or_path: str,
        revision: str | None,
        fall_back_to_pt: bool,
        allow_patterns_overrides: list[str] | None,
    ) -> tuple[str, list[str], bool]:
        """Prepare weights for the model.

        If the model is not local, it will be downloaded.

        Returns:
            ``(folder, weight_files, use_safetensors)``.

        Raises:
            ValueError: If the configured load format is unknown.
            RuntimeError: If no weight file is left after filtering.
        """
        model_name_or_path = (
            maybe_download_from_modelscope(model_name_or_path, revision)
            or model_name_or_path
        )

        is_local = os.path.isdir(model_name_or_path)
        load_format = self.load_config.load_format
        use_safetensors = False
        index_file = SAFE_WEIGHTS_INDEX_NAME
        # Some quantized models use .pt files for storing the weights.
        if load_format == "auto":
            allow_patterns = ["*.safetensors", "*.bin"]
        elif load_format in ("safetensors", "fastsafetensors"):
            use_safetensors = True
            allow_patterns = ["*.safetensors"]
        elif load_format == "mistral":
            use_safetensors = True
            allow_patterns = ["consolidated*.safetensors"]
            index_file = "consolidated.safetensors.index.json"
        elif load_format == "pt":
            allow_patterns = ["*.pt"]
        elif load_format == "npcache":
            allow_patterns = ["*.bin"]
        else:
            raise ValueError(f"Unknown load_format: {load_format}")

        if fall_back_to_pt:
            allow_patterns += ["*.pt"]

        if allow_patterns_overrides is not None:
            allow_patterns = allow_patterns_overrides

        if not is_local:
            hf_folder = download_weights_from_hf(
                model_name_or_path,
                self.load_config.download_dir,
                allow_patterns,
                revision,
                ignore_patterns=self.load_config.ignore_patterns,
            )
        else:
            hf_folder = model_name_or_path

        # Patterns are tried in order; the first one with a match wins.
        hf_weights_files: list[str] = []
        for pattern in allow_patterns:
            hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
            if hf_weights_files:
                if pattern == "*.safetensors":
                    use_safetensors = True
                break

        if use_safetensors:
            # For models like Mistral-7B-Instruct-v0.3
            # there are both sharded safetensors files and a consolidated
            # safetensors file. Using both breaks.
            # Here, we download the `model.safetensors.index.json` and filter
            # any files not found in the index.
            if not is_local:
                download_safetensors_index_file_from_hf(
                    model_name_or_path,
                    index_file,
                    self.load_config.download_dir,
                    revision,
                )
            hf_weights_files = filter_duplicate_safetensors_files(
                hf_weights_files, hf_folder, index_file
            )
        else:
            hf_weights_files = filter_files_not_needed_for_inference(hf_weights_files)

        if not hf_weights_files:
            raise RuntimeError(
                f"Cannot find any model weights with `{model_name_or_path}`"
            )

        return hf_folder, hf_weights_files, use_safetensors

    def _get_weights_iterator(
        self, source: "Source"
    ) -> Generator[tuple[str, torch.Tensor], None, None]:
        """Get an iterator for the model weights based on the load format."""
        extra_config = self.load_config.model_loader_extra_config
        hf_folder, hf_weights_files, use_safetensors = self._prepare_weights(
            source.model_or_path,
            source.revision,
            source.fall_back_to_pt,
            source.allow_patterns_overrides,
        )
        if self.load_config.load_format == "npcache":
            # Currently np_cache only support *.bin checkpoints
            assert use_safetensors is False
            weights_iterator = np_cache_weights_iterator(
                source.model_or_path,
                self.load_config.download_dir,
                hf_folder,
                hf_weights_files,
                self.load_config.use_tqdm_on_load,
            )
        elif use_safetensors:
            if self.load_config.load_format == "fastsafetensors":
                weights_iterator = fastsafetensors_weights_iterator(
                    hf_weights_files,
                    self.load_config.use_tqdm_on_load,
                )
            else:
                if extra_config.get("enable_multithread_load"):
                    weights_iterator = multi_thread_safetensors_weights_iterator(
                        hf_weights_files,
                        self.load_config.use_tqdm_on_load,
                        max_workers=extra_config.get(
                            "num_threads", self.DEFAULT_NUM_THREADS
                        ),
                    )
                else:
                    weights_iterator = safetensors_weights_iterator(
                        hf_weights_files,
                        self.load_config.use_tqdm_on_load,
                        self.load_config.safetensors_load_strategy,
                    )
        else:
            if extra_config.get("enable_multithread_load"):
                weights_iterator = multi_thread_pt_weights_iterator(
                    hf_weights_files,
                    self.load_config.use_tqdm_on_load,
                    self.load_config.pt_load_map_location,
                    max_workers=extra_config.get(
                        "num_threads", self.DEFAULT_NUM_THREADS
                    ),
                )
            else:
                weights_iterator = pt_weights_iterator(
                    hf_weights_files,
                    self.load_config.use_tqdm_on_load,
                    self.load_config.pt_load_map_location,
                )

        if current_platform.is_tpu():
            from vllm.platforms.tpu import USE_TPU_INFERENCE

            if not USE_TPU_INFERENCE:
                # In PyTorch XLA, we should call `torch_xla.sync`
                # frequently so that not too many ops are accumulated
                # in the XLA program.
                import torch_xla

                def _xla_weights_iterator(iterator: Generator):
                    for weights in iterator:
                        yield weights
                        torch_xla.sync(wait=False)

                weights_iterator = _xla_weights_iterator(weights_iterator)

        # Only the first source of a load starts the clock.
        if self.counter_before_loading_weights == 0.0:
            self.counter_before_loading_weights = time.perf_counter()
        # Apply the prefix.
        return ((source.prefix + name, tensor) for (name, tensor) in weights_iterator)

    def get_all_weights(
        self,
        model_config: ModelConfig,
        model: nn.Module,
    ) -> Generator[tuple[str, torch.Tensor], None, None]:
        """Yield the primary weights, then any ``secondary_weights`` sources
        declared on the model."""
        primary_weights = DefaultModelLoader.Source(
            model_config.model,
            model_config.revision,
            prefix="",
            fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load", True),
            allow_patterns_overrides=getattr(model, "allow_patterns_overrides", None),
        )
        yield from self._get_weights_iterator(primary_weights)

        secondary_weights = cast(
            Iterable[DefaultModelLoader.Source],
            getattr(model, "secondary_weights", ()),
        )
        for source in secondary_weights:
            yield from self._get_weights_iterator(source)

    def download_model(self, model_config: ModelConfig) -> None:
        """Resolve (and download if needed) the weight files without loading."""
        self._prepare_weights(
            model_config.model,
            model_config.revision,
            fall_back_to_pt=True,
            allow_patterns_overrides=None,
        )

    def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
        """Load the checkpoint weights into ``model`` and log the duration.

        Raises:
            ValueError: For non-quantized models with weight tracking (and
                both opt levels at 0) when some parameters were not loaded.
        """
        # Restart the clock for this call. Without the reset a second load on
        # the same loader would keep the first load's start timestamp and log
        # the time elapsed since then.
        self.counter_before_loading_weights = 0.0

        if model_config.quantization == "torchao" and torchao_version_at_least(
            "0.14.0"
        ):
            self.load_config.safetensors_load_strategy = "torchao"
        weights_to_load = {name for name, _ in model.named_parameters()}

        # if we don't have `model.weight_metadata_and_attr_saved` defined and
        # set to True, it means that this is either offline quantization case
        # or the first run of online quantization
        # see online_quantization.py for detailed notes
        offline_quantization_or_first_run_of_online_quantization = not getattr(
            model, "weight_metadata_and_attr_saved", False
        )

        if (
            model_config.quantization is None
            or offline_quantization_or_first_run_of_online_quantization
        ):
            # model is not quantized, or
            # case 1: offline quantized checkpoint
            # case 2: Step I1 first run of weight loading with
            #         online quantization
            # see online_quantization.py for detailed notes
            loaded_weights = model.load_weights(
                self.get_all_weights(model_config, model)
            )
        else:
            # to avoid circular dependency
            from vllm.model_executor.model_loader.online_quantization import (
                load_weights_and_online_quantize,
            )

            # subsequent runs of weight loading with online
            # quantization
            loaded_weights = load_weights_and_online_quantize(self, model, model_config)

        self.counter_after_loading_weights = time.perf_counter()
        logger.info_once(
            "Loading weights took %.2f seconds",
            self.counter_after_loading_weights - self.counter_before_loading_weights,
            scope="local",
        )
        # We only enable strict check for non-quantized models
        # that have loaded weights tracking currently.
        opt_flag = envs.VLLM_MOE_OPT_LEVEL != 0 or envs.VLLM_LINEAR_OPT_LEVEL != 0
        if (
            model_config.quantization is None
            and loaded_weights is not None
            and not opt_flag
        ):
            weights_not_loaded = weights_to_load - loaded_weights
            if weights_not_loaded:
                raise ValueError(
                    "Following weights were not initialized from "
                    f"checkpoint: {weights_not_loaded}"
                )
+ initialize_dummy_weights(model) diff --git a/model_executor/model_loader/gguf_loader.py b/model_executor/model_loader/gguf_loader.py new file mode 100644 index 0000000..7db1fc1 --- /dev/null +++ b/model_executor/model_loader/gguf_loader.py @@ -0,0 +1,176 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os +from collections.abc import Generator + +import gguf +import torch +import torch.nn as nn +from huggingface_hub import hf_hub_download +from transformers import AutoModelForCausalLM + +from vllm.config import ModelConfig, VllmConfig +from vllm.config.load import LoadConfig +from vllm.model_executor.model_loader.base_loader import BaseModelLoader +from vllm.model_executor.model_loader.utils import ( + initialize_model, + process_weights_after_loading, +) +from vllm.model_executor.model_loader.weight_utils import ( + get_gguf_extra_tensor_names, + get_gguf_weight_type_map, + gguf_quant_weights_iterator, +) +from vllm.utils.torch_utils import set_default_torch_dtype + + +class GGUFModelLoader(BaseModelLoader): + """ + Model loader that can load GGUF files. This is useful for loading models + that are quantized with GGUF and saved in the GGUF format. This loader + supports loading both full models and sharded models. 
+ """ + + def __init__(self, load_config: LoadConfig): + super().__init__(load_config) + if load_config.model_loader_extra_config: + raise ValueError( + f"Model loader extra config is not supported for " + f"load format {load_config.load_format}" + ) + + def _prepare_weights(self, model_name_or_path: str): + if os.path.isfile(model_name_or_path): + return model_name_or_path + # for raw HTTPS link + if model_name_or_path.startswith( + ("http://", "https://") + ) and model_name_or_path.endswith(".gguf"): + return hf_hub_download(url=model_name_or_path) + # repo id/filename.gguf + if "/" in model_name_or_path and model_name_or_path.endswith(".gguf"): + repo_id, filename = model_name_or_path.rsplit("/", 1) + return hf_hub_download(repo_id=repo_id, filename=filename) + else: + raise ValueError( + f"Unrecognised GGUF reference: {model_name_or_path} " + "(expected local file, raw URL, or /.gguf)" + ) + + def _get_gguf_weights_map(self, model_config: ModelConfig): + """ + GGUF uses this naming convention for their tensors from HF checkpoint: + `blk.N.BB.weight` and `blk.N.BB.bias` + where N signifies the block number of a layer, and BB signifies the + attention/mlp layer components. + See "Standardized tensor names" in + https://github.com/ggerganov/ggml/blob/master/docs/gguf.md for details. 
+ """ + config = model_config.hf_config + model_type = config.model_type + gguf_to_hf_name_map = {} + # hack: ggufs have a different name than transformers + if model_type == "cohere": + model_type = "command-r" + if model_type == "gemma3_text": + # Gemma3 models use "gemma3_text" in HuggingFace but + # "gemma3" in GGUF architecture naming + model_type = "gemma3" + if model_type in ("deepseek_v3", "deepseek_v2"): + model_type = "deepseek2" + # GGUF layer map assumes that we will have a merged expert weights + # so we need to map them manually + for idx in range(config.num_hidden_layers): + gguf_to_hf_name_map[f"blk.{idx}.exp_probs_b.bias"] = ( + f"model.layers.{idx}.mlp.gate.e_score_correction_bias" + ) + gguf_to_hf_name_map[f"blk.{idx}.ffn_down_exps.weight"] = ( + f"model.layers.{idx}.mlp.experts.0.down_proj.weight" + ) + gguf_to_hf_name_map[f"blk.{idx}.ffn_gate_exps.weight"] = ( + f"model.layers.{idx}.mlp.experts.0.gate_proj.weight" + ) + gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = ( + f"model.layers.{idx}.mlp.experts.0.up_proj.weight" + ) + if model_type in ("qwen2_moe", "qwen3_moe"): + model_type = model_type.replace("_", "") + # GGUF layer map assumes that we will have a merged expert weights + # so we need to map them manually + for idx in range(config.num_hidden_layers): + gguf_to_hf_name_map[f"blk.{idx}.ffn_down_exps.weight"] = ( + f"model.layers.{idx}.mlp.experts.0.down_proj.weight" + ) + gguf_to_hf_name_map[f"blk.{idx}.ffn_gate_exps.weight"] = ( + f"model.layers.{idx}.mlp.experts.0.gate_proj.weight" + ) + gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = ( + f"model.layers.{idx}.mlp.experts.0.up_proj.weight" + ) + + arch = None + for key, value in gguf.MODEL_ARCH_NAMES.items(): + if value == model_type: + arch = key + break + if arch is None: + raise RuntimeError(f"Unknown gguf model_type: {model_type}") + num_layers = config.num_hidden_layers + name_map = gguf.get_tensor_name_map(arch, num_layers) + with torch.device("meta"): + 
dummy_model = AutoModelForCausalLM.from_config( + config, trust_remote_code=model_config.trust_remote_code + ) + state_dict = dummy_model.state_dict() + + for hf_name in state_dict: + name, suffix = hf_name.rsplit(".", 1) + gguf_name = name_map.get_name(name) + gguf_to_hf_name_map[f"{gguf_name}.{suffix}"] = hf_name + return gguf_to_hf_name_map + + def _get_weights_iterator( + self, model_name_or_path: str, gguf_to_hf_name_map: dict[str, str] + ) -> Generator[tuple[str, torch.Tensor], None, None]: + return gguf_quant_weights_iterator(model_name_or_path, gguf_to_hf_name_map) + + def download_model(self, model_config: ModelConfig) -> None: + self._prepare_weights(model_config.model) + + def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None: + local_model_path = self._prepare_weights(model_config.model) + gguf_weights_map = self._get_gguf_weights_map(model_config) + model.load_weights( + self._get_weights_iterator(local_model_path, gguf_weights_map) + ) + + def load_model( + self, vllm_config: VllmConfig, model_config: ModelConfig + ) -> nn.Module: + device_config = vllm_config.device_config + local_model_path = self._prepare_weights(model_config.model) + gguf_weights_map = self._get_gguf_weights_map(model_config) + # we can only know if tie word embeddings after mapping weights + if "lm_head.weight" in get_gguf_extra_tensor_names( + local_model_path, gguf_weights_map + ): + model_config.hf_config.update({"tie_word_embeddings": True}) + + weight_type_map = get_gguf_weight_type_map(model_config.model, gguf_weights_map) + + # filter out unquantized modules to skip + unquant_names = [ + name.removesuffix(".weight") + for name, weight_type in weight_type_map.items() + if weight_type == "F32" and name.endswith(".weight") + ] + vllm_config.quant_config.unquantized_modules.extend(unquant_names) + + target_device = torch.device(device_config.device) + with set_default_torch_dtype(model_config.dtype): + with target_device: + model = 
initialize_model(vllm_config=vllm_config) + self.load_weights(model, model_config) + + process_weights_after_loading(model, model_config, target_device) + return model diff --git a/model_executor/model_loader/online_quantization.py b/model_executor/model_loader/online_quantization.py new file mode 100644 index 0000000..890dd72 --- /dev/null +++ b/model_executor/model_loader/online_quantization.py @@ -0,0 +1,224 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import types + +import torch +from torch import nn + +from vllm.config import ModelConfig +from vllm.logger import init_logger +from vllm.model_executor.model_loader.default_loader import DefaultModelLoader +from vllm.model_executor.model_loader.utils import process_weights_after_loading + +logger = init_logger(__name__) + +# Notes for Online Quantization +# In terms of state of checkpoints, quantization config and their +# correspondance to online quantization: +# | Use Case | Checkpoints | model_config.quantization | +# | no quant | high precision | None | +# | offline quant | quantized | fp8, torchao etc. | +# | online quant | high precision | torchao etc. | +# +# The process for loading non-quantized checkpoint +# 1. load non-quantized weights (load_weights) +# 2. do any additional post processing (process_weights_after_loading) +# +# The process for loading offline quantized checkpoint +# 1. load offline-quantized weights (load_weights) +# 2. do any additional post processing (process_weights_after_loading) + +# The process for unquantized model reloading +# (repeated run in RL training loop) +# first run +# UI1. load_weights: load bfloat16 weights +# UI2. 
process_weights_after_loading: any additional post processing +# subsequent run +# UC1: load_weights: load bfloat16 weights +# (shouldn't be any issues since we didn't change any attributes +# of the weights) +# UC2: process_weights_after_loading: any additional post processing + +# The process for weight reloading with online quantization +# (repeated run in RL training loop) +# first run +# I1. load_weights: load bfloat16 weights +# I2. process_weights_after_loading: +# record weight metadata and attributes for R1 and R2 +# quantize weights to fp8 +# subsequent run +# (beginning model weight is in fp8) +# load_weights: +# R1. restore bfloat16 model weight metadata +# R2. restore the model weight attributes +# R3. reload bfloat16 weights +# R4. quantize weights (by calling process_weights_after_loading), +# also set `process_weights_after_loading_already_called` to +# True to stop it from running again +# process_weights_after_loading (if called): +# this will be skipped since it's already ran in +# load_weights + + +def maybe_save_metadata_and_attributes_for_weight_reloading( + model: nn.Module, model_config: ModelConfig +): + # following is to support on the fly quantization, currently only supported + # for torchao + if model_config.quantization != "torchao": + return + + if getattr(model, "process_weights_after_loading_already_called", False): + # In case `process_weights_after_loading` is called multiple times + # we'll skip it at later times + logger.warning( + "process_weights_after_loading already called for model %s", model + ) + return + + from vllm.model_executor.model_loader.weight_utils import get_quant_config + + quant_config = get_quant_config(model_config, None) + + # If checkpoint is already torchao serialized, this means it's + # pre-quantized quantization case, we'll skip saving the metadata + # Otherwise, this is Step I2 of initialization steps of + # online quantization + # This step record the weights metadata and weight attributes so we can 
+ # restore the bfloat16 model weights during the relad step (R1 and R2) + # see Notes in online_quantization.py for more details + if not ( + hasattr(quant_config, "is_checkpoint_torchao_serialized") + and not quant_config.is_checkpoint_torchao_serialized + ): + return + + # This is the I2 step of online quantiztion that saves + # metadata and attributes of weights so they can be used in R1 and + # R2 step, note that we only save these during initialization + + # Includes two things + # 1. save floating point metadata (shape, dtype, device) for init + # 2. save weight attributes, e.g. `output_dim`, `weight_loader` for init + + if getattr(model, "weight_metadata_and_attr_saved", False): + return + + # save the dtype, shape and device for model parameter, used for + # restoring the model high precision parameters before + # reloading the weights + assert not hasattr(model, "original_weights_rebuild_keys") + model.original_weights_rebuild_keys = {} + for name, p in model.named_parameters(): + model.original_weights_rebuild_keys[name] = { + "shape": p.shape, + "dtype": p.dtype, + "device": p.device, + } + + # record the weight attributes (loader functions etc.) 
+ # so these can be recovered later when we reload the weights + # structure: {"weight_name": {"weight_attr_key": attr}} + assert not hasattr(model, "recorded_weight_attr") + model.recorded_weight_attr = {} + for name, param in model.named_parameters(): + model.recorded_weight_attr[name] = {} + for key in param.__dict__: + if hasattr(param, key): + attr = getattr(param, key) + if not callable(attr): + model.recorded_weight_attr[name][key] = attr + elif hasattr(attr, "__self__") and param is attr.__self__: + # if attr is a bonded method for an instance, and + # attr.__self__ points to the instance (param) + # we'll record the underlying function object + model.recorded_weight_attr[name][key] = attr.__func__ + else: + model.recorded_weight_attr[name][key] = attr + # mark the metadata and attributes saved so we don't run it again + model.weight_metadata_and_attr_saved = True + + +def _bond_method_to_cls(func, obj): + if hasattr(func, "__self__") or not callable(func): + # If the function is already bound to an instance, return it as is + return func + else: + return types.MethodType(func, obj) + + +def load_weights_and_online_quantize( + model_loader: DefaultModelLoader, model: nn.Module, model_config: ModelConfig +) -> set[str]: + # online quantization, right now only enabled for + # torchao + # R1, R2, R3, R4 in the Notes + + # TODO: Add fp8 support + assert model_config.quantization == "torchao", ( + "online quantization is only enabled for torchao currently" + ) + # TODO: use create_weights to restore the weights to original state + + # Step R1: First restore the quantized weights to original bfloat16 + # weights, with original metadata (shape, dtype, device) + # and attributes, so that bfloat16 weights can be loaded properly + existing_param_names = dict(model.named_parameters(remove_duplicate=False)).keys() + named_modules = dict(model.named_modules(remove_duplicate=False)) + model_device = None + + # Step R2: recover the parameter to the state before first 
loading + for name, d in model.original_weights_rebuild_keys.items(): + _shape = d["shape"] + _dtype = d["dtype"] + _device = d["device"] + if model_device is not None: + assert model_device == _device, ( + "Expecting all weights " + "to be in the same device for now, got both: " + f"{model_device} and {_device}" + ) + else: + model_device = _device + + if name in existing_param_names: + module_name, weight_name = name.rsplit(".", 1) + module = named_modules[module_name] + setattr( + module, + weight_name, + torch.nn.Parameter(torch.empty(_shape, dtype=_dtype, device=_device)), + ) + + # recorded_weight_attr is + # {"weight_name": {"weight_attr_key": attr}} + # e.g. + # { + # { + # "layer.0.weight": { + # "weight_loader": weight_loader_function_object, + # "input_dim": 0, ... + # }, + # "layer.1.weight": ..., + # } + # } + for full_weight_name, weight_attr_dict in model.recorded_weight_attr.items(): + for attr_name, attr in weight_attr_dict.items(): + module_name, weight_name = full_weight_name.rsplit(".", 1) + module = named_modules[module_name] + weight = getattr(module, weight_name) + if not hasattr(weight, attr_name): + setattr(weight, attr_name, _bond_method_to_cls(attr, weight)) + + # Step I1: reload bfloat16 / high precision weights + loaded_weights = model.load_weights( + model_loader.get_all_weights(model_config, model) + ) + + # Step I2: online quantize the weights + # manually process weights after loading + model.process_weights_after_loading_already_called = False + process_weights_after_loading(model, model_config, model_device) + model.process_weights_after_loading_already_called = True + return loaded_weights diff --git a/model_executor/model_loader/runai_streamer_loader.py b/model_executor/model_loader/runai_streamer_loader.py new file mode 100644 index 0000000..93da07c --- /dev/null +++ b/model_executor/model_loader/runai_streamer_loader.py @@ -0,0 +1,116 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors 
to the vLLM project +# ruff: noqa: SIM117 +import os +from collections.abc import Generator + +import torch +from torch import nn +from transformers.utils import SAFE_WEIGHTS_INDEX_NAME + +from vllm.config import ModelConfig +from vllm.config.load import LoadConfig +from vllm.model_executor.model_loader.base_loader import BaseModelLoader +from vllm.model_executor.model_loader.weight_utils import ( + download_safetensors_index_file_from_hf, + download_weights_from_hf, + runai_safetensors_weights_iterator, +) +from vllm.transformers_utils.runai_utils import is_runai_obj_uri, list_safetensors + + +class RunaiModelStreamerLoader(BaseModelLoader): + """ + Model loader that can load safetensors + files from local FS or S3 bucket. + """ + + def __init__(self, load_config: LoadConfig): + super().__init__(load_config) + + self._is_distributed = False + if load_config.model_loader_extra_config: + extra_config = load_config.model_loader_extra_config + + if "distributed" in extra_config and isinstance( + extra_config.get("distributed"), bool + ): + self._is_distributed = extra_config.get("distributed") + + if "concurrency" in extra_config and isinstance( + extra_config.get("concurrency"), int + ): + os.environ["RUNAI_STREAMER_CONCURRENCY"] = str( + extra_config.get("concurrency") + ) + + if "memory_limit" in extra_config and isinstance( + extra_config.get("memory_limit"), int + ): + os.environ["RUNAI_STREAMER_MEMORY_LIMIT"] = str( + extra_config.get("memory_limit") + ) + + runai_streamer_s3_endpoint = os.getenv("RUNAI_STREAMER_S3_ENDPOINT") + aws_endpoint_url = os.getenv("AWS_ENDPOINT_URL") + if runai_streamer_s3_endpoint is None and aws_endpoint_url is not None: + os.environ["RUNAI_STREAMER_S3_ENDPOINT"] = aws_endpoint_url + + def _prepare_weights( + self, model_name_or_path: str, revision: str | None + ) -> list[str]: + """Prepare weights for the model. 
+ + If the model is not local, it will be downloaded.""" + + is_object_storage_path = is_runai_obj_uri(model_name_or_path) + is_local = os.path.isdir(model_name_or_path) + safetensors_pattern = "*.safetensors" + index_file = SAFE_WEIGHTS_INDEX_NAME + + hf_folder = ( + model_name_or_path + if (is_local or is_object_storage_path) + else download_weights_from_hf( + model_name_or_path, + self.load_config.download_dir, + [safetensors_pattern], + revision, + ignore_patterns=self.load_config.ignore_patterns, + ) + ) + hf_weights_files = list_safetensors(path=hf_folder) + + if not is_local and not is_object_storage_path: + download_safetensors_index_file_from_hf( + model_name_or_path, index_file, self.load_config.download_dir, revision + ) + + if not hf_weights_files: + raise RuntimeError( + f"Cannot find any safetensors model weights with `{model_name_or_path}`" + ) + + return hf_weights_files + + def _get_weights_iterator( + self, model_or_path: str, revision: str + ) -> Generator[tuple[str, torch.Tensor], None, None]: + """Get an iterator for the model weights based on the load format.""" + hf_weights_files = self._prepare_weights(model_or_path, revision) + return runai_safetensors_weights_iterator( + hf_weights_files, self.load_config.use_tqdm_on_load, self._is_distributed + ) + + def download_model(self, model_config: ModelConfig) -> None: + """Download model if necessary""" + self._prepare_weights(model_config.model, model_config.revision) + + def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None: + """Load weights into a model.""" + model_weights = model_config.model + if hasattr(model_config, "model_weights"): + model_weights = model_config.model_weights + model.load_weights( + self._get_weights_iterator(model_weights, model_config.revision) + ) diff --git a/model_executor/model_loader/sharded_state_loader.py b/model_executor/model_loader/sharded_state_loader.py new file mode 100644 index 0000000..d94dbd9 --- /dev/null +++ 
b/model_executor/model_loader/sharded_state_loader.py @@ -0,0 +1,206 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import collections +import glob +import os +from collections.abc import Generator +from typing import Any + +import torch +from torch import nn + +from vllm.config import ModelConfig +from vllm.config.load import LoadConfig +from vllm.logger import init_logger +from vllm.model_executor.model_loader.base_loader import BaseModelLoader +from vllm.model_executor.model_loader.weight_utils import ( + download_weights_from_hf, + runai_safetensors_weights_iterator, +) +from vllm.transformers_utils.s3_utils import glob as s3_glob +from vllm.transformers_utils.utils import is_s3 + +logger = init_logger(__name__) + + +class ShardedStateLoader(BaseModelLoader): + """ + Model loader that directly loads each worker's model state dict, which + enables a fast load path for large tensor-parallel models where each worker + only needs to read its own shard rather than the entire checkpoint. See + `examples/offline_inference/save_sharded_state.py` for creating a sharded + checkpoint. + """ + + DEFAULT_PATTERN = "model-rank-{rank}-part-{part}.safetensors" + + def __init__(self, load_config: LoadConfig): + super().__init__(load_config) + + extra_config = ( + {} + if load_config.model_loader_extra_config is None + else load_config.model_loader_extra_config.copy() + ) + self.pattern = extra_config.pop("pattern", self.DEFAULT_PATTERN) + if extra_config: + raise ValueError( + f"Unexpected extra config keys for load format " + f"{load_config.load_format}: " + f"{load_config.model_loader_extra_config.keys()}" + ) + + @staticmethod + def _filter_subtensors( + tensors: dict[str, torch.Tensor], + ) -> dict[str, torch.Tensor]: + """ + Filter out all tensors that share the same memory or a subset of the + memory of another tensor. 
+ """ + same_storage_groups: dict[Any, list[tuple[str, torch.Tensor]]] = ( + collections.defaultdict(list) + ) + for key, tensor in tensors.items(): + if tensor.numel(): + ptr = tensor.untyped_storage().data_ptr() + same_storage_groups[tensor.device, ptr].append((key, tensor)) + + def get_end_ptr(tensor: torch.Tensor) -> int: + return tensor.view(-1)[-1].data_ptr() + tensor.element_size() + + result: dict[str, torch.Tensor] = {} + for group in same_storage_groups.values(): + for k, t in group: + a, b = t.data_ptr(), get_end_ptr(t) + for k2, t2 in group: + if not t2.is_contiguous(): + continue + a2, b2 = t2.data_ptr(), get_end_ptr(t2) + if a < a2 or b2 < b: + continue + if a2 < a or b < b2 or not t.is_contiguous(): + break # t2 covers strictly more memory than t. + if k2 < k: + # Same tensors, keep the one with the smaller key. + break + else: + result[k] = t + return result + + def _prepare_weights(self, model_name_or_path: str, revision: str | None): + if is_s3(model_name_or_path) or os.path.isdir(model_name_or_path): + return model_name_or_path + else: + allow_patterns = ["*.safetensors"] + return download_weights_from_hf( + model_name_or_path, + self.load_config.download_dir, + allow_patterns, + revision, + ignore_patterns=self.load_config.ignore_patterns, + ) + + def download_model(self, model_config: ModelConfig) -> None: + self._prepare_weights(model_config.model, model_config.revision) + + def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None: + from vllm.distributed import get_tensor_model_parallel_rank + + model_weights = model_config.model + if hasattr(model_config, "model_weights"): + model_weights = model_config.model_weights + local_model_path = model_weights + + rank = get_tensor_model_parallel_rank() + pattern = os.path.join( + local_model_path, + self.pattern.format(rank=rank, part="*"), + ) + + filepaths = [] + if is_s3(local_model_path): + file_pattern = f"*{self.pattern.format(rank=rank, part='*')}" + filepaths = 
s3_glob(path=local_model_path, allow_pattern=[file_pattern]) + else: + filepaths = glob.glob(pattern) + if not filepaths: + # TODO: support un-sharded checkpoints too + raise ValueError( + f"Could not find checkpoint files '{pattern}', only " + f"pre-sharded checkpoints are currently supported!" + ) + state_dict = self._filter_subtensors(model.state_dict()) + for key, tensor in self.iterate_over_files(filepaths): + # If loading with LoRA enabled, additional padding may + # be added to certain parameters. We only load into a + # narrowed view of the parameter data. + param_data = state_dict[key].data + param_shape = state_dict[key].shape + for dim, size in enumerate(tensor.shape): + if size < param_shape[dim]: + param_data = param_data.narrow(dim, 0, size) + if tensor.shape != param_shape: + logger.warning( + "loading tensor of shape %s into parameter '%s' of shape %s", + tensor.shape, + key, + param_shape, + ) + param_data.copy_(tensor) + state_dict.pop(key) + if state_dict: + raise ValueError(f"Missing keys {tuple(state_dict)} in loaded state!") + + def iterate_over_files( + self, paths + ) -> Generator[tuple[str, torch.Tensor], None, None]: + if self.load_config.load_format == "runai_streamer_sharded": + yield from runai_safetensors_weights_iterator(paths, True) + else: + from safetensors.torch import safe_open + + for path in paths: + with safe_open(path, framework="pt") as f: + for key in f.keys(): # noqa: SIM118 + tensor = f.get_tensor(key) + yield key, tensor + + @staticmethod + def save_model( + model: torch.nn.Module, + path: str, + pattern: str | None = None, + max_size: int | None = None, + ) -> None: + from safetensors.torch import save_file + + from vllm.distributed import get_tensor_model_parallel_rank + + if pattern is None: + pattern = ShardedStateLoader.DEFAULT_PATTERN + rank = get_tensor_model_parallel_rank() + part_idx = 0 + total_size = 0 + state_dict = ShardedStateLoader._filter_subtensors(model.state_dict()) + state_dict_part: dict[str, 
torch.Tensor] = {} + for key, tensor in state_dict.items(): + param_size = tensor.nelement() * tensor.element_size() + if max_size is not None and total_size + param_size > max_size: + filename = pattern.format(rank=rank, part=part_idx) + save_file( + state_dict_part, + os.path.join(path, filename), + ) + part_idx += 1 + total_size = 0 + state_dict_part = {} + state_dict_part[key] = tensor + total_size += param_size + if len(state_dict_part) > 0: + filename = pattern.format(rank=rank, part=part_idx) + save_file( + state_dict_part, + os.path.join(path, filename), + ) diff --git a/model_executor/model_loader/tensorizer.py b/model_executor/model_loader/tensorizer.py new file mode 100644 index 0000000..e4e530f --- /dev/null +++ b/model_executor/model_loader/tensorizer.py @@ -0,0 +1,790 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import contextlib +import contextvars +import dataclasses +import json +import os +import tempfile +import threading +import time +from collections.abc import Generator, MutableMapping +from dataclasses import asdict, dataclass, field, fields +from typing import TYPE_CHECKING, Any, ClassVar, Optional + +import regex as re +import torch +from huggingface_hub import snapshot_download +from torch import nn +from torch.utils._python_dispatch import TorchDispatchMode +from transformers import PretrainedConfig + +import vllm.envs as envs +from vllm.config import ModelConfig, ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.logger import init_logger +from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding +from vllm.platforms import current_platform +from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.import_utils import PlaceholderModule + +if TYPE_CHECKING: + from vllm.engine.arg_utils import EngineArgs + +try: + from tensorizer import ( + DecryptionParams, + EncryptionParams, + 
TensorDeserializer, + TensorSerializer, + ) + from tensorizer.stream_io import open_stream + from tensorizer.utils import convert_bytes, get_mem_usage, no_init_or_tensor + +except ImportError: + tensorizer = PlaceholderModule("tensorizer") + DecryptionParams = tensorizer.placeholder_attr("DecryptionParams") + EncryptionParams = tensorizer.placeholder_attr("EncryptionParams") + TensorDeserializer = tensorizer.placeholder_attr("TensorDeserializer") + TensorSerializer = tensorizer.placeholder_attr("TensorSerializer") + open_stream = tensorizer.placeholder_attr("stream_io.open_stream") + convert_bytes = tensorizer.placeholder_attr("utils.convert_bytes") + get_mem_usage = tensorizer.placeholder_attr("utils.get_mem_usage") + no_init_or_tensor = tensorizer.placeholder_attr("utils.no_init_or_tensor") + +__all__ = [ + "EncryptionParams", + "DecryptionParams", + "TensorDeserializer", + "TensorSerializer", + "open_stream", + "convert_bytes", + "get_mem_usage", + "no_init_or_tensor", + "TensorizerConfig", +] + +logger = init_logger(__name__) + + +def is_valid_deserialization_uri(uri: str | None) -> bool: + if uri: + scheme = uri.lower().split("://")[0] + return scheme in {"s3", "http", "https"} or os.path.exists(uri) + return False + + +def tensorizer_kwargs_arg(value): + loaded = json.loads(value) + if not isinstance(loaded, dict): + raise argparse.ArgumentTypeError( + f"Not deserializable to dict: {value}. serialization_kwargs and " + f"deserialization_kwargs must be " + f"deserializable from a JSON string to a dictionary. 
" + ) + return loaded + + +class MetaTensorMode(TorchDispatchMode): + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + kwargs = kwargs or {} + + if func._schema.name == "aten::empty" and "device" not in kwargs: + kwargs["device"] = "meta" + + return func(*args, **kwargs) + + +def meta_tensor_mode( + loading_code=None, +): + if loading_code is None: + return _NoInitOrTensorImpl.context_manager() + elif callable(loading_code): + with _NoInitOrTensorImpl.context_manager(): + return loading_code() + else: + raise TypeError( + "expected a callable to evaluate," + " or None if being used as a context manager;" + f' got an object of type "{type(loading_code).__name__}" instead.' + ) + + +class _NoInitOrTensorImpl: + _MODULES = (torch.nn.Linear, torch.nn.Embedding, torch.nn.LayerNorm) + _MODULE_ORIGINALS = tuple((m, m.reset_parameters) for m in _MODULES) + + is_active = contextvars.ContextVar("_NoInitOrTensorImpl.is_active", default=False) + _count_active: int = 0 + _count_active_lock = threading.Lock() + + @classmethod + @contextlib.contextmanager + def context_manager(cls): + if cls.is_active.get(): + yield + return + + with cls._count_active_lock: + cls._count_active += 1 + if cls._count_active == 1: + for mod in cls._MODULES: + mod.reset_parameters = cls._disable(mod.reset_parameters) + + reset_token = cls.is_active.set(True) + + try: + with MetaTensorMode(): + yield + finally: + cls.is_active.reset(reset_token) + with cls._count_active_lock: + cls._count_active -= 1 + if cls._count_active == 0: + for mod, original in cls._MODULE_ORIGINALS: + mod.reset_parameters = original + + @staticmethod + def _disable(func): + def wrapper(*args, **kwargs): + if not _NoInitOrTensorImpl.is_active.get(): + return func(*args, **kwargs) + + return wrapper + + +@dataclass +class TensorizerConfig(MutableMapping): + tensorizer_uri: str | None = None + tensorizer_dir: str | None = None + vllm_tensorized: bool | None = None + verify_hash: bool | None = None + num_readers: 
int | None = None + encryption_keyfile: str | None = None + s3_access_key_id: str | None = None + s3_secret_access_key: str | None = None + s3_endpoint: str | None = None + lora_dir: str | None = None + stream_kwargs: dict[str, Any] | None = None + serialization_kwargs: dict[str, Any] | None = None + deserialization_kwargs: dict[str, Any] | None = None + _extra_serialization_attrs: dict[str, Any] | None = field(init=False, default=None) + model_class: type[torch.nn.Module] | None = field(init=False, default=None) + hf_config: PretrainedConfig | None = field(init=False, default=None) + dtype: str | torch.dtype | None = field(init=False, default=None) + _is_sharded: bool = field(init=False, default=False) + _fields: ClassVar[tuple[str, ...]] + _keys: ClassVar[frozenset[str]] + """Configuration class for Tensorizer settings. + + These settings configure the behavior of model serialization and + deserialization using Tensorizer. + + Attributes: + tensorizer_uri: Path to serialized model tensors. Can be a local file + path or a S3 URI. This is a required field unless lora_dir is + provided and the config is meant to be used for the + `tensorize_lora_adapter` function. Unless a `tensorizer_dir` or + `lora_dir` is passed to this object's initializer, this is + a required argument. + tensorizer_dir: Path to a directory containing serialized model tensors, + and all other potential model artifacts to load the model, such as + configs and tokenizer files. Can be passed instead of + `tensorizer_uri` where the `model.tensors` file will be assumed + to be in this directory. + vllm_tensorized: If True, indicates that the serialized model is a + vLLM model. This is used to determine the behavior of the + TensorDeserializer when loading tensors from a serialized model. + It is far faster to deserialize a vLLM model as it utilizes + tensorizer's optimized GPU loading. Note that this is now + deprecated, as serialized vLLM models are now automatically + inferred as vLLM models. 
def __post_init__(self):
    """Resolve and validate the URI/directory settings after dataclass init.

    Sets ``_is_sharded`` when ``tensorizer_uri`` contains a zero-padded
    integer template such as ``%04d``, derives whichever of
    ``tensorizer_uri`` / ``tensorizer_dir`` can be inferred from the other
    settings, and replaces unset ``serialization_kwargs`` /
    ``deserialization_kwargs`` with empty dicts.

    Raises:
        ValueError: If both ``tensorizer_dir`` and ``lora_dir`` are set, or
            if none of ``tensorizer_uri``, ``tensorizer_dir`` and
            ``lora_dir`` is set.
    """
    # check if the configuration is for a sharded vLLM model
    self._is_sharded = (
        isinstance(self.tensorizer_uri, str)
        and re.search(r"%0\dd", self.tensorizer_uri) is not None
    )

    if self.tensorizer_dir and self.lora_dir:
        raise ValueError(
            "Only one of tensorizer_dir or lora_dir may be specified. "
            "Use lora_dir exclusively when serializing LoRA adapters, "
            "and tensorizer_dir or tensorizer_uri otherwise."
        )
    if self.tensorizer_dir and self.tensorizer_uri:
        logger.warning_once(
            "Provided both tensorizer_dir and tensorizer_uri. "
            "Inferring tensorizer_dir from tensorizer_uri as the "
            "latter takes precedence."
        )
        self.tensorizer_dir = os.path.dirname(self.tensorizer_uri)
    if not self.tensorizer_uri:
        # No explicit URI: fall back to the conventional file name inside
        # whichever directory was provided.
        if self.lora_dir:
            self.tensorizer_uri = f"{self.lora_dir}/adapter_model.tensors"
        elif self.tensorizer_dir:
            self.tensorizer_uri = f"{self.tensorizer_dir}/model.tensors"
        else:
            # The message previously listed "tensorizer_uri" twice; the
            # second accepted option is tensorizer_dir.
            raise ValueError(
                "Unable to resolve tensorizer_uri. "
                "A valid tensorizer_uri or tensorizer_dir "
                "must be provided for deserialization, and a "
                "valid tensorizer_uri, tensorizer_dir, or "
                "lora_dir for serialization."
            )
    else:
        self.tensorizer_dir = os.path.dirname(self.tensorizer_uri)

    if not self.serialization_kwargs:
        self.serialization_kwargs = {}
    if not self.deserialization_kwargs:
        self.deserialization_kwargs = {}
@dataclass
class TensorizerArgs:
    """Stream and deserializer arguments derived from a ``TensorizerConfig``.

    ``__init__`` copies every item of the config onto the instance and then
    builds ``stream_kwargs`` (passed to ``open_stream``) and
    ``deserialization_kwargs`` (passed to ``TensorDeserializer``).
    """

    tensorizer_uri: str | None = None
    tensorizer_dir: str | None = None
    encryption_keyfile: str | None = None

    def __init__(self, tensorizer_config: TensorizerConfig):
        """Populate the instance from ``tensorizer_config``.

        Args:
            tensorizer_config: Source configuration. If it names an
                encryption key file, the file is read through
                ``open_stream`` and turned into ``DecryptionParams``.
        """
        for k, v in tensorizer_config.items():
            setattr(self, k, v)
        self.file_obj = tensorizer_config.tensorizer_uri
        # Explicit config values win; otherwise fall back to the environment.
        self.s3_access_key_id = (
            tensorizer_config.s3_access_key_id or envs.S3_ACCESS_KEY_ID
        )
        self.s3_secret_access_key = (
            tensorizer_config.s3_secret_access_key or envs.S3_SECRET_ACCESS_KEY
        )
        self.s3_endpoint = tensorizer_config.s3_endpoint or envs.S3_ENDPOINT_URL

        # NOTE(review): stream_kwargs is built from the raw config values,
        # not the environment-resolved self.s3_* attributes above. Confirm
        # whether the environment fallback is meant to apply to streams too.
        # User-supplied stream_kwargs are merged last and override these.
        self.stream_kwargs = {
            "s3_access_key_id": tensorizer_config.s3_access_key_id,
            "s3_secret_access_key": tensorizer_config.s3_secret_access_key,
            "s3_endpoint": tensorizer_config.s3_endpoint,
            **(tensorizer_config.stream_kwargs or {}),
        }

        self.deserialization_kwargs = {
            "verify_hash": tensorizer_config.verify_hash,
            "encryption": tensorizer_config.encryption_keyfile,
            "num_readers": tensorizer_config.num_readers,
            **(tensorizer_config.deserialization_kwargs or {}),
        }

        if self.encryption_keyfile:
            # Replace the key file path with the decoded decryption params.
            with open_stream(
                tensorizer_config.encryption_keyfile,
                **self.stream_kwargs,
            ) as stream:
                key = stream.read()
                decryption_params = DecryptionParams.from_key(key)
                self.deserialization_kwargs["encryption"] = decryption_params

    @staticmethod
    def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
        """Register the tensorizer option group on ``parser`` and return it."""

        # Tensorizer options arg group
        group = parser.add_argument_group(
            "tensorizer options",
            description=(
                "Options for configuring the behavior of the"
                " tensorizer deserializer when "
                "load_format=tensorizer is specified when "
                "initializing an LLMEngine, either via the CLI "
                "when running the vLLM OpenAI inference server "
                "with a JSON string passed to "
                "--model-loader-extra-config or as arguments given "
                "to TensorizerConfig when passed to "
                "model_loader_extra_config in the constructor "
                "for LLMEngine."
            ),
        )

        group.add_argument(
            "--tensorizer-uri",
            type=str,
            help="Path to serialized model tensors. Can be a local file path,"
            " or an HTTP(S) or S3 URI.",
        )
        group.add_argument(
            "--verify-hash",
            action="store_true",
            help="If enabled, the hashes of each tensor will be verified"
            " against the hashes stored in the file metadata. An exception"
            " will be raised if any of the hashes do not match.",
        )
        group.add_argument(
            "--encryption-keyfile",
            type=str,
            default=None,
            help="The file path to a binary file containing a binary key to "
            "use for decryption. Can be a file path or S3 network URI.",
        )
        group.add_argument(
            "--num-readers",
            default=None,
            type=int,
            help="Controls how many threads are allowed to read concurrently "
            "from the source file. Default is `None`, which will dynamically "
            "set the number of readers based on the available resources "
            "and model size. This greatly increases performance.",
        )
        group.add_argument(
            "--s3-access-key-id",
            type=str,
            default=None,
            help="The access key for the S3 bucket. Can also be set via the "
            "S3_ACCESS_KEY_ID environment variable.",
        )
        group.add_argument(
            "--s3-secret-access-key",
            type=str,
            default=None,
            help="The secret access key for the S3 bucket. Can also be set via "
            "the S3_SECRET_ACCESS_KEY environment variable.",
        )
        group.add_argument(
            "--s3-endpoint",
            type=str,
            default=None,
            help="The endpoint for the S3 bucket. Can also be set via the "
            "S3_ENDPOINT_URL environment variable.",
        )

        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> "TensorizerArgs":
        """Build a ``TensorizerArgs`` from a parsed argument namespace.

        ``__init__`` only accepts a ``TensorizerConfig``, so the namespace
        attributes that correspond to initializable config fields are first
        collected into a config object. (Passing them to ``cls`` as keyword
        arguments, as was done previously, always raised ``TypeError``.)

        Args:
            args: Namespace produced by a parser set up with ``add_cli_args``.

        Returns:
            A ``TensorizerArgs`` built from the matching attributes of ``args``.
        """
        config_kwargs = {
            config_field.name: getattr(args, config_field.name)
            for config_field in dataclasses.fields(TensorizerConfig)
            if config_field.init and hasattr(args, config_field.name)
        }
        return cls(TensorizerConfig(**config_kwargs))
def _resize_lora_embeddings(model: nn.Module):
    """Grow undersized vocab-parallel embedding weights for adapter tokens.

    Every ``VocabParallelEmbedding`` in ``model`` whose weight has fewer rows
    than its ``num_embeddings_per_partition`` gets a larger weight tensor:
    the existing rows are copied in and the additional rows are zero-filled.
    """
    for layer in model.modules():
        if not isinstance(layer, VocabParallelEmbedding):
            continue

        current_rows = layer.weight.shape[0]
        target_rows = layer.num_embeddings_per_partition
        if current_rows >= target_rows:
            # Already large enough; leave the weight alone.
            continue

        resized = torch.empty(
            target_rows,
            layer.embedding_dim,
            dtype=layer.weight.dtype,
            device=layer.weight.device,
        )
        resized[:current_rows].copy_(layer.weight.data)
        resized[current_rows:].fill_(0)
        layer.weight.data = resized
def is_vllm_tensorized(tensorizer_config: "TensorizerConfig") -> bool:
    """
    Infer if the model is a vLLM model by checking the weights for
    a vLLM tensorized marker.

    The explicit ``vllm_tensorized`` flag is honored before any I/O happens,
    so setting it avoids opening the serialized file at all. Otherwise the
    file is opened lazily, checked for the marker key, and closed again.

    Args:
        tensorizer_config: The TensorizerConfig object containing the
            tensorizer_uri to the serialized model.

    Returns:
        bool: True if the model is a vLLM model, False otherwise.
    """
    if tensorizer_config.vllm_tensorized:
        logger.warning(
            "Please note that newly serialized vLLM models are automatically "
            "inferred as vLLM models, so setting vllm_tensorized=True is "
            "only necessary for models serialized prior to this change."
        )
        return True

    tensorizer_args = tensorizer_config._construct_tensorizer_args()
    # Context managers ensure the stream and deserializer are released once
    # the membership test is done instead of being leaked.
    with (
        open_stream(
            tensorizer_args.tensorizer_uri, **tensorizer_args.stream_kwargs
        ) as stream,
        TensorDeserializer(
            stream,
            **tensorizer_args.deserialization_kwargs,
            lazy_load=True,
        ) as deserializer,
    ):
        return ".vllm_tensorized_marker" in deserializer
def serialize_vllm_model(
    model: nn.Module,
    tensorizer_config: TensorizerConfig,
    model_config: "ModelConfig",
) -> nn.Module:
    """Serialize ``model`` with tensorizer and write its extra artifacts.

    Args:
        model: Module to serialize. A ``vllm_tensorized_marker`` parameter
            is registered on it before writing.
        tensorizer_config: Destination and serialization settings. When
            ``_is_sharded`` is set, ``tensorizer_uri`` is formatted with the
            tensor-parallel rank.
        model_config: Supplies ``served_model_name`` for the extra artifacts.

    Returns:
        The same ``model`` instance.
    """
    # Marker parameter (on the meta device, so it carries no data) that lets
    # a reader recognise the file as a vLLM-serialized model.
    model.register_parameter(
        "vllm_tensorized_marker",
        nn.Parameter(torch.tensor((1,), device="meta"), requires_grad=False),
    )

    tensorizer_args = tensorizer_config._construct_tensorizer_args()

    encryption_params = None
    if (keyfile := tensorizer_config.encryption_keyfile) is not None:
        # Read the key through open_stream, like the rest of this module
        # does, so that non-local (e.g. S3) key files work here as well.
        with open_stream(keyfile, **tensorizer_args.stream_kwargs) as f:
            key = f.read()
        encryption_params = EncryptionParams(key=key)

    output_file = tensorizer_args.tensorizer_uri
    if tensorizer_config._is_sharded:
        from vllm.distributed import get_tensor_model_parallel_rank

        # Fill the rank into the "%0Nd" template of the sharded URI.
        output_file = output_file % get_tensor_model_parallel_rank()

    with open_stream(
        output_file, mode="wb+", **tensorizer_args.stream_kwargs
    ) as stream:
        serializer = TensorSerializer(
            stream,
            encryption=encryption_params,
            **tensorizer_config.serialization_kwargs,
        )
        serializer.write_module(model)
        serializer.close()

    serialize_extra_artifacts(tensorizer_args, model_config.served_model_name)

    logger.info("Successfully serialized model to %s", str(output_file))
    return model
def tensorize_lora_adapter(lora_path: str, tensorizer_config: TensorizerConfig):
    """
    Uses tensorizer to serialize a LoRA adapter. Assumes that the files
    needed to load a LoRA adapter are a safetensors-format (or ``.bin``)
    file whose name starts with ``adapter_model`` and a json config file
    whose name starts with ``adapter_config``.

    Writes ``adapter_config.json`` and ``adapter_model.tensors`` into
    ``tensorizer_config.tensorizer_dir``, falling back to
    ``tensorizer_config.lora_dir`` when no ``tensorizer_dir`` is set.

    Raises:
        ValueError: If no supported adapter weights file is found, or no
            output directory can be determined from the config.
        FileNotFoundError: If no adapter config file is found.
    """
    # Import the submodule explicitly: a bare `import safetensors` does not
    # guarantee that `safetensors.torch` has been loaded.
    from safetensors.torch import load_file as load_safetensors_file

    from vllm.lora.utils import get_adapter_absolute_path

    lora_dir = get_adapter_absolute_path(lora_path)

    tensor_path = config_path = ""

    for file in os.listdir(lora_dir):
        if file.startswith("adapter_model"):
            tensor_path = os.path.join(lora_dir, file)
        if file.startswith("adapter_config"):
            config_path = os.path.join(lora_dir, file)
        if tensor_path and config_path:
            break

    # Validate everything up front so failures name the actual problem.
    if not tensor_path:
        raise ValueError(f"No adapter_model file found in {lora_dir}")
    if not config_path:
        raise FileNotFoundError(f"No adapter_config file found in {lora_dir}")

    # Without this fallback a config built from lora_dir alone would write
    # into a directory literally named "None".
    output_dir = tensorizer_config.tensorizer_dir or tensorizer_config.lora_dir
    if not output_dir:
        raise ValueError(
            "tensorizer_config must provide tensorizer_dir or lora_dir "
            "to serialize a LoRA adapter."
        )

    if tensor_path.endswith(".safetensors"):
        tensors = load_safetensors_file(tensor_path)
    elif tensor_path.endswith(".bin"):
        # weights_only=True restricts unpickling to tensor data, so an
        # untrusted adapter file cannot run arbitrary code while loading.
        tensors = torch.load(tensor_path, weights_only=True)
    else:
        raise ValueError(f"Unsupported file: {tensor_path}")

    with open(config_path) as f:
        config = json.load(f)

    tensorizer_args = tensorizer_config._construct_tensorizer_args()

    with open_stream(
        f"{output_dir}/adapter_config.json",
        mode="wb+",
        **tensorizer_args.stream_kwargs,
    ) as f:
        f.write(json.dumps(config).encode("utf-8"))

    lora_uri = f"{output_dir}/adapter_model.tensors"
    with open_stream(lora_uri, mode="wb+", **tensorizer_args.stream_kwargs) as f:
        serializer = TensorSerializer(f)
        serializer.write_state_dict(tensors)
        serializer.close()

    logger.info(
        "Successfully serialized LoRA files to %s",
        str(output_dir),
    )
class TensorizerLoader(BaseModelLoader):
    """Model loader using CoreWeave's tensorizer library.

    Holds a ``TensorizerConfig`` taken from
    ``load_config.model_loader_extra_config`` and uses it either to
    deserialize a vLLM-tensorized model directly or to feed a weights
    iterator into ``model.load_weights``.
    """

    def __init__(self, load_config: LoadConfig):
        """Store the tensorizer configuration carried by ``load_config``.

        ``model_loader_extra_config`` may already be a ``TensorizerConfig``;
        otherwise it is validated with ``validate_config`` and its
        ``"tensorizer_config"`` entry is expanded into a new config.
        """
        super().__init__(load_config)
        if isinstance(load_config.model_loader_extra_config, TensorizerConfig):
            self.tensorizer_config = load_config.model_loader_extra_config
        else:
            validate_config(load_config.model_loader_extra_config)
            self.tensorizer_config = TensorizerConfig(
                **load_config.model_loader_extra_config["tensorizer_config"]
            )

    def _verify_config(
        self, model_config: ModelConfig, parallel_config: ParallelConfig
    ):
        """Run the config's model and parallel-config consistency checks."""
        self.tensorizer_config.verify_with_model_config(model_config)
        self.tensorizer_config.verify_with_parallel_config(parallel_config)

    def _get_weights_iterator(
        self,
    ) -> Generator[tuple[str, torch.Tensor], None, None]:
        """Return the ``tensorizer_weights_iterator`` for the current config."""
        tensorizer_args = self.tensorizer_config._construct_tensorizer_args()
        return tensorizer_weights_iterator(tensorizer_args)

    def _load_model_serialized_cpu(
        self,
        vllm_config: VllmConfig,
    ) -> nn.Module:
        """Load a serialized model with tensorizer to the CPU.

        This is only necessary when the model isn't vLLM-tensorized (see
        examples/others/tensorize_vllm_model.py) This should still
        be faster than default HuggingFace loading, but will be slower than
        loading a vLLM-tensorized model.
        """
        device_config = vllm_config.device_config
        model_config = vllm_config.model_config
        with set_default_torch_dtype(model_config.dtype):
            with torch.device(device_config.device):
                model = initialize_model(vllm_config=vllm_config)

            model.load_weights(self._get_weights_iterator())
        return model.eval()

    def download_model(self, model_config: ModelConfig) -> None:
        """Verify the config against ``model_config`` and open the stream once.

        The stream is opened and immediately closed; nothing is read from it
        here.
        """
        self.tensorizer_config.verify_with_model_config(model_config)

        with self.tensorizer_config.open_stream():
            pass

    def _patch_tensorizer_config(self, model_config: ModelConfig) -> TensorizerConfig:
        """Return a shallow copy of the config with model details filled in.

        The copy carries the resolved model class, the HF config and the
        dtype from ``model_config``; ``self.tensorizer_config`` itself is
        not modified.
        """
        model_class = get_model_architecture(model_config)[0]
        tensorizer_config = copy.copy(self.tensorizer_config)
        tensorizer_config.model_class = model_class
        tensorizer_config.hf_config = model_config.hf_config
        tensorizer_config.dtype = model_config.dtype
        return tensorizer_config

    def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None:
        """Load serialized model weights with tensorizer.

        A vLLM-tensorized model is deserialized directly into ``model``;
        anything else is loaded through ``model.load_weights`` with the
        tensorizer weights iterator. See the
        examples/others/tensorize_vllm_model.py example script
        for serializing vLLM models."""
        if is_vllm_tensorized(self.tensorizer_config):
            tensorizer_config = self._patch_tensorizer_config(model_config)
            deserialize_tensorizer_model(model, tensorizer_config)
        else:
            model.load_weights(self._get_weights_iterator())

    def load_model(
        self, vllm_config: VllmConfig, model_config: ModelConfig
    ) -> nn.Module:
        """Build the model and load its weights from the tensorizer file.

        With tensor parallelism enabled, ``tensorizer_uri`` is formatted in
        place with this worker's tensor-parallel rank before loading.
        """
        parallel_config = vllm_config.parallel_config
        self._verify_config(model_config, parallel_config)

        if parallel_config.tensor_parallel_size > 1:
            from vllm.distributed import get_tensor_model_parallel_rank

            # NOTE(review): this rewrites the stored URI, so the "%0Nd"
            # template is gone afterwards; confirm load_model is only
            # expected to run once per loader instance.
            self.tensorizer_config.tensorizer_uri = (
                self.tensorizer_config.tensorizer_uri % get_tensor_model_parallel_rank()
            )

        if is_vllm_tensorized(self.tensorizer_config):
            tensorizer_config = self._patch_tensorizer_config(model_config)
            device_config = vllm_config.device_config
            with set_default_torch_dtype(model_config.dtype):
                with torch.device(device_config.device):
                    model = init_tensorizer_model(
                        tensorizer_config=tensorizer_config, vllm_config=vllm_config
                    )
                    self.load_weights(model, model_config)
            return model
        # Not vLLM-tensorized: fall back to the iterator-based loading path.
        return self._load_model_serialized_cpu(vllm_config=vllm_config)

    @staticmethod
    def save_model(
        model: torch.nn.Module,
        tensorizer_config: TensorizerConfig | dict,
        model_config: ModelConfig,
    ) -> None:
        """Serialize ``model`` with tensorizer.

        ``tensorizer_config`` may be given as a plain dict, in which case it
        is expanded into a ``TensorizerConfig`` first.
        """
        if isinstance(tensorizer_config, dict):
            tensorizer_config = TensorizerConfig(**tensorizer_config)
        serialize_vllm_model(
            model=model,
            tensorizer_config=tensorizer_config,
            model_config=model_config,
        )
torch_xla.core.xla_model as xm +import torch_xla.distributed.spmd as xs + +from vllm.config import ModelConfig, VllmConfig +from vllm.distributed.tpu_distributed_utils import get_fqn, shard_model +from vllm.logger import init_logger +from vllm.model_executor.model_loader.default_loader import DefaultModelLoader +from vllm.model_executor.model_loader.utils import ( + initialize_model, + process_weights_after_loading, +) +from vllm.utils.torch_utils import set_default_torch_dtype + +logger = init_logger(__name__) + + +class TPUModelLoader(DefaultModelLoader): + """ + A TPU model loader for model loading under SPMD mode. + """ + + def load_model( + self, + vllm_config: VllmConfig, + model_config: ModelConfig, + mesh: xs.Mesh | None = None, + ) -> nn.Module: + # Initialize model and load weights on CPU. Then, during SPMD partition, + # weights are sharded and transferred to TPUs. + self.counter_before_loading_weights = time.perf_counter() + model_config = vllm_config.model_config + assert model_config.quantization is None, "Quantization not supported" + target_device = torch.device("cpu") + with set_default_torch_dtype(model_config.dtype): + with target_device: + model = initialize_model(vllm_config=vllm_config) + + load_format = vllm_config.load_config.load_format + if load_format != "dummy": + weights_to_load = {name for name, _ in model.named_parameters()} + all_weights = self.get_all_weights(model_config, model) + loaded_weights = model.load_weights(all_weights) + self.counter_after_loading_weights = time.perf_counter() + logger.info( + "Loading weights took %.2f seconds", + self.counter_after_loading_weights + - self.counter_before_loading_weights, + ) + # We only enable strict check for non-quantized models + # that have loaded weights tracking currently. 
def _check_model_is_loaded(self, mesh: xs.Mesh | None, model: nn.Module) -> None:
    """
    Ensure the model is properly loaded.
    1. All model parameters and buffers are on XLA device.
    2. Non-SPMD friendly layers are replaced as expected.

    Args:
        mesh: SPMD mesh, or ``None``. The layer-replacement check only runs
            when a mesh is given.
        model: The model to inspect.

    Raises:
        AssertionError: If a parameter or buffer is on a different device
            type than the XLA device, or if a ``QKVParallelLinear`` module
            is still present while a mesh is given.
    """
    device = xm.xla_device()
    device_type = str(device.type)

    # Raise explicitly rather than via `assert` so the checks still run when
    # Python is started with -O; the exception type is unchanged.
    # Check parameters
    for name, param in model.named_parameters():
        if param.device.type != device_type:
            raise AssertionError(
                f"Parameter {name} is on {param.device.type} instead of {device_type}"
            )

    # Check buffers
    for name, buffer in model.named_buffers():
        if buffer.device.type != device_type:
            raise AssertionError(
                f"Buffer {name} is on {buffer.device.type} instead of {device_type}"
            )

    if mesh is None:
        return

    for module in model.modules():
        if get_fqn(module) == "QKVParallelLinear":
            # Adjacent literals instead of a backslash continuation inside
            # the string, which embedded source indentation in the message.
            raise AssertionError(
                "QKVParallelLinear should be replaced by "
                "XlaQKVParallelLinear under SPMD mode."
            )
+ model_config: ModelConfig | None = None, +) -> nn.Module: + """Initialize a model with the given configurations.""" + if model_config is None: + model_config = vllm_config.model_config + if model_class is None: + model_class, _ = get_model_architecture(model_config) + + if vllm_config.quant_config is not None: + configure_quant_config(vllm_config.quant_config, model_class) + + signatures = inspect.signature(model_class.__init__) + all_params = [param.name for param in signatures.parameters.values()] + if "vllm_config" in all_params and "prefix" in all_params: + # new-style model class + with set_current_vllm_config(vllm_config, check_compile=True, prefix=prefix): + return model_class(vllm_config=vllm_config, prefix=prefix) + + msg = ( + "vLLM model class should accept `vllm_config` and `prefix` as " + "input arguments. Possibly you have an old-style model class" + " registered from out of tree and it is used for new vLLM version. " + "Check https://docs.vllm.ai/en/latest/design/arch_overview.html " + "for the design and update the model class accordingly." 
+ ) + warnings.warn(msg, DeprecationWarning, stacklevel=2) + + logger.warning( + "Trying to guess the arguments for old-style model class %s", + model_class, + ) + # try to be compatible with old-style model class + kwargs = {} + if "prefix" in all_params: + kwargs["prefix"] = prefix + if "config" in all_params: + kwargs["config"] = model_config.hf_config + if "cache_config" in all_params: + kwargs["cache_config"] = vllm_config.cache_config + if "quant_config" in all_params: + kwargs["quant_config"] = vllm_config.quant_config + if "lora_config" in all_params: + kwargs["lora_config"] = vllm_config.lora_config + if "scheduler_config" in all_params: + kwargs["scheduler_config"] = vllm_config.scheduler_config + with set_current_vllm_config(vllm_config, check_compile=True, prefix=prefix): + return model_class(**kwargs) + + +def process_weights_after_loading( + model: nn.Module, model_config: ModelConfig, target_device: torch.device +) -> None: + # to avoid circular dependency + from vllm.model_executor.model_loader.online_quantization import ( + maybe_save_metadata_and_attributes_for_weight_reloading, + ) + + maybe_save_metadata_and_attributes_for_weight_reloading(model, model_config) + + for _, module in model.named_modules(): + quant_method = getattr(module, "quant_method", None) + if isinstance(quant_method, QuantizeMethodBase): + # When quant methods need to process weights after loading + # (for repacking, quantizing, etc), they expect parameters + # to be on the global target device. This scope is for the + # case where cpu offloading is used, where we will move the + # parameters onto device for processing and back off after. + with device_loading_context(module, target_device): + quant_method.process_weights_after_loading(module) + + # Initialize post-load attention weights for both Attention and MLA. + # NOTE: Happens after other modules so we can easily decompress weights. 
@contextmanager
def device_loading_context(module: torch.nn.Module, target_device: torch.device):
    """Temporarily place ``module``'s CPU parameters on ``target_device``.

    If ``target_device`` is a CPU device the module is yielded untouched.
    Otherwise every parameter that currently lives on CPU is moved to
    ``target_device`` for the duration of the ``with`` body and copied back
    to a fresh CPU buffer on exit (pinned when ``is_pin_memory_available()``
    says so).  Parameters that were not on CPU at entry, and parameters
    created inside the ``with`` body, are left where they are.

    Yields:
        The same ``module`` object.
    """
    if target_device.type == "cpu":
        # Nothing to move when the target itself is the CPU.
        yield module
        return

    # Parameter name -> device it lived on before being moved.
    moved_from: dict[str, torch.device] = {}
    for param_name, param in module.named_parameters():
        if param.device.type != "cpu":
            continue
        moved_from[param_name] = param.device
        param.data = param.data.to(target_device)

    try:
        yield module
    finally:
        use_pinned = is_pin_memory_available()
        for param_name, param in module.named_parameters():
            home = moved_from.get(param_name)
            if home is None:
                # Not moved by this context (new, or never on CPU).
                continue
            if home.type != "cpu":
                param.data = param.data.to(home)
                continue
            # `torch.empty_like` does not support `pin_memory` argument
            staged = torch.empty_strided(
                size=param.data.size(),
                stride=param.data.stride(),
                dtype=param.data.dtype,
                layout=param.data.layout,
                device="cpu",
                pin_memory=use_pinned,
            )
            staged.copy_(param.data)
            param.data = staged
def get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], str]:
    """Return ``(model class, architecture name)`` for ``model_config``.

    Results of ``_get_model_architecture`` are memoised in
    ``_MODEL_ARCH_BY_HASH`` under the hash of the config fields listed below.
    """
    fingerprint = (
        model_config.model,
        model_config.convert_type,
        model_config.runner_type,
        model_config.trust_remote_code,
        model_config.model_impl,
        tuple(getattr(model_config.hf_config, "architectures", [])),
    )
    cache_key = hash(fingerprint)

    cached = _MODEL_ARCH_BY_HASH.get(cache_key)
    if cached is None:
        # Cached values are tuples, so None can only mean "not cached yet".
        cached = _get_model_architecture(model_config)
        _MODEL_ARCH_BY_HASH[cache_key] = cached
    return cached


def get_model_cls(model_config: ModelConfig) -> type[nn.Module]:
    """Return only the model class resolved for ``model_config``."""
    model_cls, _ = get_model_architecture(model_config)
    return model_cls


def get_architecture_class_name(model_config: ModelConfig) -> str:
    """Return only the architecture name resolved for ``model_config``."""
    _, arch_name = get_model_architecture(model_config)
    return arch_name
+@dataclass +class ParamMapping: + """ + A class to handle parameter mapping for model weight loading. + It creates a bidirectional mapping between packed parameters and their + constituent parts. + """ + + packed_mapping: dict[str, list[str]] + inverse_packed_mapping: dict[str, tuple[str, int]] = field(default_factory=dict) + + def __post_init__(self): + for packed_name, sub_params in self.packed_mapping.items(): + # Skip self-contained cases (e.g., {"W_pack": ["W_pack"]}) + if len(sub_params) == 1 and sub_params[0] == packed_name: + continue + for index, param_name in enumerate(sub_params): + self.inverse_packed_mapping[param_name] = ( + packed_name, + index, + ) + + def get_sub_modules(self, module_name: str) -> tuple[str, list[str]] | None: + for key, value in self.packed_mapping.items(): + if module_name.endswith(key): + return key, value + return None + + +def configure_quant_config( + quant_config: QuantizationConfig, model_class: type[nn.Module] +): + """ + Pass packed_modules_mapping by reference to quant_config so that + quant_config can properly match fused modules + + Note that model attributes are passed by reference to quant_config, + enabling them to be updated by model_class.__new__ (ex. 
chatglm, qwen) + + Once the `SupportsQuant` mixin has been added to all models, this + function can be removed + """ + if not issubclass(model_class, SupportsQuant): + hf_to_vllm_mapper = getattr(model_class, "hf_to_vllm_mapper", None) + packed_mapping = getattr(model_class, "packed_modules_mapping", None) + + # pass mappings by reference to quant_config + if hf_to_vllm_mapper is not None: + quant_config.apply_vllm_mapper(hf_to_vllm_mapper) + if packed_mapping is not None: + quant_config.packed_modules_mapping = packed_mapping diff --git a/model_executor/model_loader/weight_utils.py b/model_executor/model_loader/weight_utils.py new file mode 100644 index 0000000..d38b91e --- /dev/null +++ b/model_executor/model_loader/weight_utils.py @@ -0,0 +1,1106 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Utilities for downloading and initializing model weights.""" + +import concurrent.futures +import fnmatch +import glob +import hashlib +import json +import os +import tempfile +import time +from collections import defaultdict +from collections.abc import Callable, Generator +from contextlib import contextmanager +from pathlib import Path +from typing import IO, Any + +import filelock +import huggingface_hub.constants +import numpy as np +import torch +from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download +from safetensors.torch import load, load_file, safe_open, save_file +from tqdm.auto import tqdm + +from vllm import envs +from vllm.config import ModelConfig +from vllm.config.load import LoadConfig +from vllm.distributed import get_tensor_model_parallel_rank +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization import ( + QuantizationConfig, + get_quantization_config, +) +from vllm.platforms import current_platform +from vllm.utils.import_utils import PlaceholderModule + +try: + from runai_model_streamer import SafetensorsStreamer +except ImportError: + 
runai_model_streamer = PlaceholderModule("runai_model_streamer") # type: ignore[assignment] + SafetensorsStreamer = runai_model_streamer.placeholder_attr("SafetensorsStreamer") + +try: + import gguf +except ImportError: + gguf = PlaceholderModule("gguf") + +try: + from fastsafetensors import SafeTensorsFileLoader, SingleGroup +except ImportError: + fastsafetensors = PlaceholderModule("fastsafetensors") + SafeTensorsFileLoader = fastsafetensors.placeholder_attr("SafeTensorsFileLoader") + SingleGroup = fastsafetensors.placeholder_attr("SingleGroup") + +from vllm.model_executor.layers.quantization.torchao import torchao_version_at_least + +logger = init_logger(__name__) + +# use system-level temp directory for file locks, so that multiple users +# can share the same lock without error. +# lock files in the temp directory will be automatically deleted when the +# system reboots, so users will not complain about annoying lock files +temp_dir = tempfile.gettempdir() + + +def enable_hf_transfer(): + """automatically activates hf_transfer""" + if "HF_HUB_ENABLE_HF_TRANSFER" not in os.environ: + try: + # enable hf hub transfer if available + import hf_transfer # type: ignore # noqa + + huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True + except ImportError: + pass + + +enable_hf_transfer() + + +class DisabledTqdm(tqdm): + def __init__(self, *args, **kwargs): + kwargs["disable"] = True + super().__init__(*args, **kwargs) + + +def get_lock(model_name_or_path: str | Path, cache_dir: str | None = None): + lock_dir = cache_dir or temp_dir + model_name_or_path = str(model_name_or_path) + os.makedirs(os.path.dirname(lock_dir), exist_ok=True) + model_name = model_name_or_path.replace("/", "-") + hash_name = hashlib.sha256(model_name.encode()).hexdigest() + # add hash to avoid conflict with old users' lock files + lock_file_name = hash_name + model_name + ".lock" + # mode 0o666 is required for the filelock to be shared across users + lock = 
filelock.FileLock(os.path.join(lock_dir, lock_file_name), mode=0o666) + return lock + + +@contextmanager +def atomic_writer( + filepath: str | Path, mode: str = "w", encoding: str | None = None +) -> Generator[IO]: + """ + Context manager that provides an atomic file writing routine. + + The context manager writes to a temporary file and, if successful, + atomically replaces the original file. + + Args: + filepath (str or Path): The path to the file to write. + mode (str): The file mode for the temporary file (e.g., 'w', 'wb'). + encoding (str): The encoding for text mode. + + Yields: + file object: A handle to the temporary file. + """ + # Create a temporary file in the same directory as the target file + # to ensure it's on the same filesystem for an atomic replace. + temp_dir = os.path.dirname(filepath) + temp_fd, temp_path = tempfile.mkstemp(dir=temp_dir) + + try: + # Open the temporary file for writing + with os.fdopen(temp_fd, mode=mode, encoding=encoding) as temp_file: + yield temp_file + + # If the 'with' block completes successfully, + # perform the atomic replace. + os.replace(temp_path, filepath) + + except Exception: + logger.exception( + "Error during atomic write. Original file '%s' not modified", filepath + ) + raise + finally: + # Clean up the temporary file if it still exists. + if os.path.exists(temp_path): + os.remove(temp_path) + + +def maybe_download_from_modelscope( + model: str, + revision: str | None = None, + download_dir: str | None = None, + ignore_patterns: str | list[str] | None = None, + allow_patterns: list[str] | str | None = None, +) -> str | None: + """Download model from ModelScope hub if VLLM_USE_MODELSCOPE is True. + + Returns the path to the downloaded model, or None if the model is not + downloaded from ModelScope.""" + if envs.VLLM_USE_MODELSCOPE: + # download model from ModelScope hub, + # lazy import so that modelscope is not required for normal use. + # pylint: disable=C. 
+ from modelscope.hub.snapshot_download import snapshot_download + + # Use file lock to prevent multiple processes from + # downloading the same model weights at the same time. + with get_lock(model, download_dir): + if not os.path.exists(model): + model_path = snapshot_download( + model_id=model, + cache_dir=download_dir, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + revision=revision, + ignore_file_pattern=ignore_patterns, + allow_patterns=allow_patterns, + ) + else: + model_path = model + return model_path + return None + + +def _shared_pointers(tensors): + ptrs = defaultdict(list) + for k, v in tensors.items(): + ptrs[v.data_ptr()].append(k) + failing = [] + for _, names in ptrs.items(): + if len(names) > 1: + failing.append(names) + return failing + + +def convert_bin_to_safetensor_file( + pt_filename: str, + sf_filename: str, +) -> None: + loaded = torch.load(pt_filename, map_location="cpu", weights_only=True) + if "state_dict" in loaded: + loaded = loaded["state_dict"] + shared = _shared_pointers(loaded) + for shared_weights in shared: + for name in shared_weights[1:]: + loaded.pop(name) + + # For tensors to be contiguous + loaded = {k: v.contiguous() for k, v in loaded.items()} + + dirname = os.path.dirname(sf_filename) + os.makedirs(dirname, exist_ok=True) + save_file(loaded, sf_filename, metadata={"format": "pt"}) + + # check file size + sf_size = os.stat(sf_filename).st_size + pt_size = os.stat(pt_filename).st_size + if (sf_size - pt_size) / pt_size > 0.01: + raise RuntimeError(f"""The file size different is more than 1%: + - {sf_filename}: {sf_size} + - {pt_filename}: {pt_size} + """) + + # check if the tensors are the same + reloaded = load_file(sf_filename) + for k in loaded: + pt_tensor = loaded[k] + sf_tensor = reloaded[k] + if not torch.equal(pt_tensor, sf_tensor): + raise RuntimeError(f"The output tensors do not match for key {k}") + + +# TODO(woosuk): Move this to other place. 
def get_quant_config(
    model_config: ModelConfig, load_config: LoadConfig
) -> QuantizationConfig:
    """Build the quantization config object for ``model_config``.

    Sources are tried in this order:

    1. ``gguf`` / ``inc``: the quantization class is default-constructed.
    2. The HF config: ``quantization_config``, then
       ``text_config.quantization_config``, then ``compression_config``.
    3. ``hf_overrides``: ``quantization_config_file`` or
       ``quantization_config_dict_json``.
    4. ``bitsandbytes`` (in-flight quantization): an empty config.
    5. A JSON file in the model folder whose name ends with one of
       ``quant_cls.get_config_filenames()``; the folder comes from
       ModelScope, a local directory, or a ``*.json`` snapshot download.

    Raises:
        NotImplementedError: An ``hf_overrides`` entry is given but the
            quantization class lacks the matching ``from_config_*`` method.
        ValueError: No matching config file, more than one matching file, or
            a ``modelopt`` config whose producer name is not ``modelopt``.
    """
    quant_cls = get_quantization_config(model_config.quantization)

    # GGUF doesn't have config file
    if model_config.quantization in ("gguf", "inc"):
        return quant_cls()

    # Read the quantization config from the HF model config, if available.
    hf_quant_config = getattr(model_config.hf_config, "quantization_config", None)
    # some vision model may keep quantization_config in their text_config
    hf_text_config = getattr(model_config.hf_config, "text_config", None)
    if hf_quant_config is None and hf_text_config is not None:
        hf_quant_config = getattr(hf_text_config, "quantization_config", None)
    if hf_quant_config is None:
        # compressed-tensors uses a compressions_config
        hf_quant_config = getattr(model_config.hf_config, "compression_config", None)

    if hf_quant_config is not None:
        return quant_cls.from_config(hf_quant_config)

    # if hf_quant_config is None, we will try to get config from
    # hf_overrides
    hf_overrides = model_config.hf_overrides
    quantization_config_file = hf_overrides.get("quantization_config_file", None)
    if quantization_config_file is not None:
        if hasattr(quant_cls, "from_config_file"):
            return quant_cls.from_config_file(quantization_config_file)
        raise NotImplementedError(
            "from_config_file is specified in hf_override config, "
            "but quant_cls.from_config_file is not implemented in "
            f"{quant_cls}"
        )
    quantization_config_json = hf_overrides.get("quantization_config_dict_json", None)
    if quantization_config_json is not None:
        if hasattr(quant_cls, "from_config_dict_json"):
            return quant_cls.from_config_dict_json(quantization_config_json)
        raise NotImplementedError(
            "from_config_dict_json is specified in hf_override config, "
            "but quant_cls.from_config_dict_json is not implemented in "
            f"{quant_cls}"
        )

    # Inflight BNB quantization
    if model_config.quantization == "bitsandbytes":
        return quant_cls.from_config({})

    model_name_or_path = (
        maybe_download_from_modelscope(
            model_config.model,
            revision=model_config.revision,
            download_dir=load_config.download_dir,
            allow_patterns=["*.json"],
        )
        or model_config.model
    )
    if os.path.isdir(model_name_or_path):
        hf_folder = model_name_or_path
    else:
        # Download the config files.
        with get_lock(model_config.model, load_config.download_dir):
            hf_folder = snapshot_download(
                model_config.model,
                revision=model_config.revision,
                allow_patterns="*.json",
                cache_dir=load_config.download_dir,
                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
                tqdm_class=DisabledTqdm,
            )

    possible_config_filenames = quant_cls.get_config_filenames()

    # If the quantization config is not found, use the default config.
    if not possible_config_filenames:
        return quant_cls()

    config_files = glob.glob(os.path.join(hf_folder, "*.json"))

    quant_config_files = [
        f for f in config_files if any(f.endswith(x) for x in possible_config_filenames)
    ]
    if len(quant_config_files) == 0:
        raise ValueError(f"Cannot find the config file for {model_config.quantization}")
    if len(quant_config_files) > 1:
        raise ValueError(
            f"Found multiple config files for {model_config.quantization}: "
            f"{quant_config_files}"
        )

    quant_config_file = quant_config_files[0]
    with open(quant_config_file) as f:
        config = json.load(f)

    # NOTE: "bitsandbytes" cannot reach this point (it returned above), so
    # only the modelopt producer check remains.
    if (
        model_config.quantization == "modelopt"
        and config["producer"]["name"] != "modelopt"
    ):
        # Report the offending path, not the (already closed) file object.
        raise ValueError(
            f"Unsupported quantization config"
            f" found for {model_config.quantization} in {quant_config_file}."
        )

    return quant_cls.from_config(config)
+ """ + assert len(allow_patterns) > 0 + local_only = huggingface_hub.constants.HF_HUB_OFFLINE + if not local_only: + # Attempt to reduce allow_patterns to a single pattern + # so we only have to call snapshot_download once. + try: + fs = HfFileSystem() + file_list = fs.ls(model_name_or_path, detail=False, revision=revision) + + # Use the first pattern found in the HF repo's files. + for pattern in allow_patterns: + matching = fnmatch.filter(file_list, pattern) + if len(matching) > 0: + allow_patterns = [pattern] + break + except Exception as e: + logger.warning( + "Failed to get file list for '%s'. Trying each pattern in " + "allow_patterns individually until weights have been " + "downloaded. Error: %s", + model_name_or_path, + e, + ) + + logger.debug("Using model weights format %s", allow_patterns) + # Use file lock to prevent multiple processes from + # downloading the same model weights at the same time. + with get_lock(model_name_or_path, cache_dir): + start_time = time.perf_counter() + for allow_pattern in allow_patterns: + hf_folder = snapshot_download( + model_name_or_path, + allow_patterns=allow_pattern, + ignore_patterns=ignore_patterns, + cache_dir=cache_dir, + tqdm_class=DisabledTqdm, + revision=revision, + local_files_only=local_only, + ) + # If we have downloaded weights for this allow_pattern, + # we don't need to check the rest. + if any(Path(hf_folder).glob(allow_pattern)): + break + time_taken = time.perf_counter() - start_time + if time_taken > 0.5: + logger.info( + "Time spent downloading weights for %s: %.6f seconds", + model_name_or_path, + time_taken, + ) + return hf_folder + + +def download_safetensors_index_file_from_hf( + model_name_or_path: str, + index_file: str, + cache_dir: str | None, + revision: str | None = None, +) -> None: + """Download hf safetensors index file from Hugging Face Hub. + + Args: + model_name_or_path (str): The model name or path. 
+ index_file (str): The safetensors index file name + cache_dir (Optional[str]): The cache directory to store the model + weights. If None, will use HF defaults. + revision (Optional[str]): The revision of the model. + """ + # Use file lock to prevent multiple processes from + # downloading the same model weights at the same time. + with get_lock(model_name_or_path, cache_dir): + try: + # Download the safetensors index file. + hf_hub_download( + repo_id=model_name_or_path, + filename=index_file, + cache_dir=cache_dir, + revision=revision, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + ) + # If file not found on remote or locally, we should not fail since + # only some models will have index_file. + except huggingface_hub.utils.LocalEntryNotFoundError: + logger.info("No %s found in local cache.", index_file) + except huggingface_hub.utils.EntryNotFoundError: + logger.info("No %s found in remote.", index_file) + + +# For models like Mistral-7B-v0.3, there are both sharded +# safetensors files and a consolidated safetensors file. +# Passing both of these to the weight loader functionality breaks. +# So, we use the index_file to +# look up which safetensors files should be used. +def filter_duplicate_safetensors_files( + hf_weights_files: list[str], hf_folder: str, index_file: str +) -> list[str]: + # model.safetensors.index.json is a mapping from keys in the + # torch state_dict to safetensors file holding that weight. + index_file_name = os.path.join(hf_folder, index_file) + if not os.path.isfile(index_file_name): + return hf_weights_files + + # Iterate through the weight_map (weight_name: safetensors files) + # to identify weights that we should use. + with open(index_file_name) as f: + weight_map = json.load(f)["weight_map"] + weight_files_in_index = set() + for weight_name in weight_map: + weight_files_in_index.add(os.path.join(hf_folder, weight_map[weight_name])) + # Filter out any fields that are not found in the index file. 
+ hf_weights_files = [f for f in hf_weights_files if f in weight_files_in_index] + return hf_weights_files + + +def filter_files_not_needed_for_inference(hf_weights_files: list[str]) -> list[str]: + """ + Exclude files that are not needed for inference. + + See https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233 + """ + blacklist = [ + "training_args.bin", + "optimizer.bin", + "optimizer.pt", + "scheduler.pt", + "scaler.pt", + ] + hf_weights_files = [ + f for f in hf_weights_files if not any(f.endswith(x) for x in blacklist) + ] + return hf_weights_files + + +# explicitly use pure text format, with a newline at the end +# this makes it impossible to see the animation in the progress bar +# but will avoid messing up with ray or multiprocessing, which wraps +# each line of output with some prefix. +_BAR_FORMAT = "{desc}: {percentage:3.0f}% Completed | {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]\n" # noqa: E501 + + +def enable_tqdm(use_tqdm_on_load: bool): + return use_tqdm_on_load and ( + not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0 + ) + + +def np_cache_weights_iterator( + model_name_or_path: str, + cache_dir: str | None, + hf_folder: str, + hf_weights_files: list[str], + use_tqdm_on_load: bool, +) -> Generator[tuple[str, torch.Tensor], None, None]: + """Iterate over the weights in the model np files. + + Will dump the model weights to numpy files if they are not already dumped. + """ + # Convert the model weights from torch tensors to numpy arrays for + # faster loading. + np_folder = os.path.join(hf_folder, "np") + os.makedirs(np_folder, exist_ok=True) + weight_names_file = os.path.join(np_folder, "weight_names.json") + # Use file lock to prevent multiple processes from + # dumping the same model weights to numpy at the same time. 
def safetensors_weights_iterator(
    hf_weights_files: list[str],
    use_tqdm_on_load: bool,
    safetensors_load_strategy: str = "lazy",
) -> Generator[tuple[str, torch.Tensor], None, None]:
    """Iterate over the weights in the model safetensor files.

    Args:
        hf_weights_files: Paths of the safetensors shards, read in order.
        use_tqdm_on_load: Passed to ``enable_tqdm`` to decide whether the
            progress bar is shown.
        safetensors_load_strategy: ``"eager"`` reads each file fully into
            memory and deserialises it with ``load``; ``"torchao"`` collects
            every tensor plus the file metadata and passes them through
            torchao's ``unflatten_tensor_state_dict``; any other value reads
            tensors one at a time through ``safe_open``.

    Yields:
        ``(name, tensor)`` pairs.

    Raises:
        ValueError: ``"torchao"`` was requested but
            ``torchao_version_at_least("0.14.0")`` is false.
    """
    loading_desc = "Loading safetensors checkpoint shards"
    if safetensors_load_strategy == "eager":
        loading_desc += " (eager)"

    for st_file in tqdm(
        hf_weights_files,
        desc=loading_desc,
        disable=not enable_tqdm(use_tqdm_on_load),
        bar_format=_BAR_FORMAT,
    ):
        if safetensors_load_strategy == "eager":
            with open(st_file, "rb") as f:
                state_dict = load(f.read())
            yield from state_dict.items()
        elif safetensors_load_strategy == "torchao":
            if not torchao_version_at_least("0.14.0"):
                # Implicit concatenation keeps the message on one line; a
                # backslash continuation inside the literal would embed the
                # source indentation in the message.
                raise ValueError(
                    "Please use torchao version >= 0.14.0 "
                    "to load torchao safetensors checkpoint"
                )
            from torchao.prototype.safetensors.safetensors_support import (
                unflatten_tensor_state_dict,
            )

            with safe_open(st_file, framework="pt") as f:
                state_dict = {}
                for name in f.keys():  # noqa: SIM118
                    state_dict[name] = f.get_tensor(name)
                metadata = f.metadata()
            updated_state_dict = unflatten_tensor_state_dict(state_dict, metadata)
            yield from updated_state_dict.items()
        else:
            with safe_open(st_file, framework="pt") as f:
                for name in f.keys():  # noqa: SIM118
                    param = f.get_tensor(name)
                    yield name, param
def _init_loader(
    pg: torch.distributed.ProcessGroup,
    device: torch.device,
    f_list: list[str],
    *,
    nogds: bool = False,
):
    """Create a ``SafeTensorsFileLoader`` and register ``f_list`` with it.

    Each file is registered under its index in ``f_list`` as a one-element
    list (presumably the index is the rank that loads it; confirm against
    the fastsafetensors API).
    """
    loader = SafeTensorsFileLoader(pg, device, nogds=nogds)
    rank_file_map = {i: [f] for i, f in enumerate(f_list)}
    loader.add_filenames(rank_file_map)
    return loader


def fastsafetensors_weights_iterator(
    hf_weights_files: list[str],
    use_tqdm_on_load: bool,
) -> Generator[tuple[str, torch.Tensor], None, None]:
    """Iterate over the weights in the model safetensor files
    using fastsafetensor library.

    Files are processed in groups of ``pg.size()``.  If copying a group to
    the device raises a ``RuntimeError`` whose message contains ``"gds"``,
    the loader is rebuilt with ``nogds=True`` and that setting is kept for
    all following groups.
    """
    # Use the WORLD group when torch.distributed is up, otherwise the
    # fastsafetensors single-process group.
    if torch.distributed.is_initialized():
        pg = torch.distributed.group.WORLD
    else:
        pg = SingleGroup()

    device = torch.device(f"cuda:{pg.rank()}")
    # Chunk the file list into consecutive groups of at most pg.size() files.
    weight_files_sub_lists = [
        hf_weights_files[i : i + pg.size()]
        for i in range(0, len(hf_weights_files), pg.size())
    ]

    nogds = False

    for f_list in tqdm(
        weight_files_sub_lists,
        desc="Loading safetensors using Fastsafetensor loader",
        disable=not enable_tqdm(use_tqdm_on_load),
        bar_format=_BAR_FORMAT,
    ):
        loader = _init_loader(pg, device, f_list, nogds=nogds)
        try:
            try:
                fb = loader.copy_files_to_device()
            except RuntimeError as e:
                # Only GDS-related failures are retried; anything else
                # propagates.
                if "gds" not in str(e):
                    raise

                # Close the failed loader before replacing it; the outer
                # `finally` closes whichever loader `loader` refers to then.
                loader.close()
                nogds = True
                logger.warning_once(
                    "GDS not enabled, setting `nogds=True`.\n"
                    "For more information, see: https://github.com/foundation-model-stack/fastsafetensors?tab=readme-ov-file#basic-api-usages"
                )
                loader = _init_loader(pg, device, f_list, nogds=nogds)
                fb = loader.copy_files_to_device()

            try:
                # Snapshot the key list before yielding tensors.
                keys = list(fb.key_to_rank_lidx.keys())
                for k in keys:
                    t = fb.get_tensor(k)
                    yield k, t
            finally:
                fb.close()
        finally:
            loader.close()
hf_weights_files, + desc="Loading pt checkpoint shards", + disable=not enable_tqdm(use_tqdm_on_load), + bar_format=_BAR_FORMAT, + ): + state = torch.load( + bin_file, map_location=pt_load_map_location, weights_only=True + ) + yield from state.items() + del state + + +def multi_thread_pt_weights_iterator( + hf_weights_files: list[str], + use_tqdm_on_load: bool, + pt_load_map_location: str | dict[str, str] = "cpu", + max_workers: int = 4, +) -> Generator[tuple[str, torch.Tensor], None, None]: + """Multi-Thread iterate over the weights in the model bin/pt files.""" + + def _load_file(bin_file: str): + return torch.load( + bin_file, map_location=pt_load_map_location, weights_only=True + ) + + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = [ + executor.submit(_load_file, bin_file) for bin_file in hf_weights_files + ] + futures_iter = tqdm( + concurrent.futures.as_completed(futures), + total=len(hf_weights_files), + desc="Multi-thread loading pt checkpoint shards", + disable=not enable_tqdm(use_tqdm_on_load), + bar_format=_BAR_FORMAT, + ) + + for future in futures_iter: + state = future.result() + yield from state.items() + del state + + +def get_gguf_extra_tensor_names( + gguf_file: str, gguf_to_hf_name_map: dict[str, str] +) -> list[str]: + reader = gguf.GGUFReader(gguf_file) + expected_gguf_keys = set(gguf_to_hf_name_map.keys()) + exact_gguf_keys = set([tensor.name for tensor in reader.tensors]) + extra_keys = expected_gguf_keys - exact_gguf_keys + return [gguf_to_hf_name_map[key] for key in extra_keys] + + +def get_gguf_weight_type_map( + gguf_file: str, gguf_to_hf_name_map: dict[str, str] +) -> dict[str, str]: + """ + Return GGUF mapped weight's name and its quant type + """ + reader = gguf.GGUFReader(gguf_file) + return { + gguf_to_hf_name_map[tensor.name]: tensor.tensor_type.name + for tensor in reader.tensors + if tensor.name in gguf_to_hf_name_map + } + + +def gguf_quant_weights_iterator( + gguf_file: str, 
gguf_to_hf_name_map: dict[str, str] +) -> Generator[tuple[str, torch.Tensor], None, None]: + """ + Iterate over the quant weights in the model gguf files and convert + them to torch tensors + """ + + reader = gguf.GGUFReader(gguf_file) + + for tensor in reader.tensors: + if tensor.name in gguf_to_hf_name_map: + weight_type = tensor.tensor_type + name = gguf_to_hf_name_map[tensor.name] + + if weight_type.name != "F32": + weight_type_name = name.replace("weight", "qweight_type") + weight_type = torch.tensor(weight_type) + yield weight_type_name, weight_type + + for tensor in reader.tensors: + if tensor.name in gguf_to_hf_name_map: + weight = tensor.data + weight_type = tensor.tensor_type + name = gguf_to_hf_name_map[tensor.name] + if weight_type.name != "F32": + name = name.replace("weight", "qweight") + param = torch.tensor(weight) + yield name, param + + +def convert_pyslice_to_tensor(x: Any) -> torch.Tensor: + """convert PySafeSlice object from safetensors to torch.Tensor + + PySafeSlice object supports indexing, which is done before loading the + actual tensor and can reduce the amount of memory being read into the + memory. However, it does not support more advanced functionalities + like `.view()` or `.t()`. Therefore, if we need to modify the loaded + tensor with these more complicated operators, we need to convert to + tensor first. 
+ """ + if not isinstance(x, torch.Tensor): + x = x[:] + return x + + +def default_weight_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None: + """Default weight loader.""" + try: + if param.numel() == 1 and loaded_weight.numel() == 1: + # Sometimes scalar values aren't considered tensors with shapes + # so if both param and loaded_weight are a scalar, + # "broadcast" instead of copy + param.data.fill_(loaded_weight.item()) + else: + assert param.size() == loaded_weight.size(), ( + f"Attempted to load weight ({loaded_weight.size()}) " + f"into parameter ({param.size()})" + ) + + param.data.copy_(loaded_weight) + except Exception: + # NOTE: This exception is added for the purpose of setting breakpoint to + # debug weight loading issues. + raise + + +def row_parallel_weight_loader( + param: torch.Tensor, loaded_weight: torch.Tensor +) -> None: + """Load weights that are row-parallelized.""" + tp_rank = get_tensor_model_parallel_rank() + shard_dim = 0 if param.dim() != 1 else None + + if shard_dim is not None: + shard_size = param.data.shape[shard_dim] + start_idx = tp_rank * shard_size + loaded_weight = loaded_weight.narrow(shard_dim, start_idx, shard_size) + + return default_weight_loader(param, loaded_weight) + + +LoaderFunction = Callable[[torch.Tensor, torch.Tensor], None] + + +def sharded_weight_loader(shard_axis: int) -> LoaderFunction: + """Create a weight loader that shards the weights along the given axis""" + + def loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None: + tp_rank = get_tensor_model_parallel_rank() + + shard_size = param.data.shape[shard_axis] + start_idx = tp_rank * shard_size + loaded_weight = loaded_weight.narrow(shard_axis, start_idx, shard_size) + + return default_weight_loader(param, loaded_weight) + + return loader + + +def composed_weight_loader( + loader: LoaderFunction, fn: Callable[[torch.Tensor], torch.Tensor] +) -> LoaderFunction: + """Create a weight loader that post-processes the weights after 
loading""" + + def composed_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None: + loader(param, loaded_weight) + param.data.copy_(fn(param)) + return + + return composed_loader + + +def initialize_dummy_weights( + model: torch.nn.Module, + low: float = -1e-3, + high: float = 1e-3, + seed: int = 1234, +) -> None: + """Initialize model weights with random values. + + The model weights must be randomly initialized for accurate performance + measurements. Additionally, the model weights should not cause NaNs in the + forward pass. We empirically found that initializing the weights with + values between -1e-3 and 1e-3 works well for most models. + + We use per-parameter random seed, so that dummy weights are consistent, + even if the model is partitioned across multiple devices. When the seed + is fixed, the random values generated by this function only depends on + the parameter's number of elements and its data type. + """ + for param in model.state_dict().values(): + if torch.is_floating_point(param): + if current_platform.is_tpu(): + generator = torch.Generator(device="cpu") + generator.manual_seed(seed) + # Note: The param.uniform_ function cannot be used in this + # context because it demands more TPU HBM than directly copying + # from a CPU tensor. + # Note: We avoid using torch.rank_like as it doesn't currently + # support the generator argument. 
+ param.copy_( + (high - low) + * torch.rand( + param.shape, + generator=generator, + dtype=param.dtype, + layout=param.layout, + requires_grad=param.requires_grad, + device="cpu", + ) + + low + ) + torch._sync(param) + continue + + generator = torch.Generator(device=param.data.device) + generator.manual_seed(seed) + if torch.finfo(param.data.dtype).bits < 16: + # uniform_ doesn't support < 16-bit datatypes (FP8) + dtype = param.data.dtype + tmp_param = param.data.to(torch.float16) + tmp_param = tmp_param.uniform_(low, high, generator=generator).to(dtype) + param.data.copy_(tmp_param) + else: + param.uniform_(low, high, generator=generator) + + +def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> str | None: + """Remap the name of FP8 k/v_scale parameters. + + This function handles the remapping of FP8 k/v_scale parameter names. + It detects if the given name ends with a suffix and attempts to remap + it to the expected name format in the model. If the remapped name is not + found in the params_dict, a warning is printed and None is returned. + + Args: + name (str): The original loaded checkpoint parameter name. + params_dict (dict): Dictionary containing the model's named parameters. + + Returns: + str: The remapped parameter name if successful, or the original name + if no remapping is needed. + None: If the remapped name is not found in params_dict. + """ + if name.endswith(".kv_scale"): + logger.warning_once( + "DEPRECATED. Found kv_scale in the checkpoint. " + "This format is deprecated in favor of separate k_scale and " + "v_scale tensors and will be removed in a future release. " + "Functionally, we will remap kv_scale to k_scale and duplicate " + "k_scale to v_scale" + ) + # NOTE: we remap the deprecated kv_scale to k_scale + remapped_name = name.replace(".kv_scale", ".attn.k_scale") + if remapped_name not in params_dict: + logger.warning_once( + "Found kv_scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). 
kv_scale is not loaded.", # noqa: E501 + name, + remapped_name, + ) + return None + return remapped_name + + if any("mla_attn" in key for key in params_dict): + attn_str = "mla_attn.mla_attn" + logger.debug_once( + f"Found mla_attn with k_scale and v_scale in " + f"the checkpoint, using {attn_str} as attn_str" + ) + else: + attn_str = "attn" + # Define scale name mapping patterns in order of precedence + scale_mapping_patterns = [ + # ModelOpt format: .self_attn.{k,v}_proj.{k,v}_scale -> + # .self_attn.attn.{k,v}_scale + ( + r"\.self_attn\.([kv])_proj\.([kv])_scale$", + rf".self_attn.{attn_str}.\2_scale", + ), + # QKV proj format: .self_attn.qkv_proj.{k,v}_scale -> + # .self_attn.attn.{k,v}_scale + (r"\.self_attn\.qkv_proj\.([kv])_scale$", r".self_attn.attn.\1_scale"), + # Qwen3 MoE format: .self_attn.qkqkv_proj.{k,v}_scale -> + # .self_attn.attn.{k,v}_scale + (r"\.self_attn\.qkqkv_proj\.([kv])_scale$", r".self_attn.attn.\1_scale"), + # Default format: .{k,v}_scale -> .attn.{k,v}_scale + (r"\.([kv])_scale$", r".attn.\1_scale"), + ] + + # Check if name ends with k_scale or v_scale + if name.endswith((".k_scale", ".v_scale")): + import regex as re + + for pattern, replacement in scale_mapping_patterns: + if re.search(pattern, name): + remapped_name = re.sub(pattern, replacement, name) + if remapped_name not in params_dict: + scale_type = name.split(".")[-1] + logger.warning_once( + "Found %s in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). 
%s is not loaded.", # noqa: E501 + scale_type, + name, + remapped_name, + scale_type, + ) + return None + return remapped_name + + # If there were no matches, return the untouched param name + return name + + +def padding_weight_loader(param: torch.Tensor, + loaded_weight: torch.Tensor) -> None: + """weight loader for padding in last dim.""" + if (param.numel() == loaded_weight.numel()) and (param.size() == loaded_weight.size()): + param.data.copy_(loaded_weight) + else: + raw_num_experts, raw_out_feature, raw_in_feature = param.shape + load_num_experts, load_out_feature, load_in_feature = loaded_weight.shape + + assert raw_num_experts == load_num_experts, ( + f"Mismatch in number of experts: param={raw_num_experts}, loaded={load_num_experts}" + ) + assert raw_out_feature == load_out_feature, ( + f"Mismatch in output features: param={raw_out_feature}, loaded={load_out_feature}" + ) + if raw_in_feature < load_in_feature: + raise ValueError( + f"Loaded weight's input feature size ({load_in_feature}) exceeds parameter's input feature size ({raw_in_feature})." 
+ ) + param[:, :, :load_in_feature].data.copy_(loaded_weight) \ No newline at end of file diff --git a/model_executor/models/__init__.py b/model_executor/models/__init__.py new file mode 100644 index 0000000..9f8dd04 --- /dev/null +++ b/model_executor/models/__init__.py @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from .interfaces import ( + HasInnerState, + SupportsLoRA, + SupportsMRoPE, + SupportsMultiModal, + SupportsPP, + SupportsTranscription, + has_inner_state, + supports_lora, + supports_mrope, + supports_multimodal, + supports_pp, + supports_transcription, +) +from .interfaces_base import ( + VllmModelForPooling, + VllmModelForTextGeneration, + is_pooling_model, + is_text_generation_model, +) +from .registry import ModelRegistry + +__all__ = [ + "ModelRegistry", + "VllmModelForPooling", + "is_pooling_model", + "VllmModelForTextGeneration", + "is_text_generation_model", + "HasInnerState", + "has_inner_state", + "SupportsLoRA", + "supports_lora", + "SupportsMultiModal", + "supports_multimodal", + "SupportsMRoPE", + "supports_mrope", + "SupportsPP", + "supports_pp", + "SupportsTranscription", + "supports_transcription", +] diff --git a/model_executor/models/__pycache__/__init__.cpython-312.pyc b/model_executor/models/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2417c56c89df14e019b65406015da46f06a1b919 GIT binary patch literal 854 zcmYk4yN(kv6o&1|B$*^Lxw42`Ni)SV<${EeD7cD6QFcY)#+pq`VquRh+d)Z5%>&TW z@hrT7TSP&Jh|o$@*v@5licdbr=luV%J>Rpe4cy1#&-7;vfFC~Cf6;ESS-8a)FatBl z0EY=UfpAXO= zr6D#wTS_UE(Qm}$sm=D$=|qg5 z+2J8vGDAQzN<&3udy7(I>OHUyCYj!uU?x;)yN8e~A&DJb z#omVppUxpErM-Cw%yp+cwxf59aW@%Z&xINb!DuzNcdjST$;!MSZkRF**QXlGZ5?(_ zgxOmzV_XrQAF8%HIkZ`jo{%}!My>OK8o66kJ&%D$-(%?Eo*Af?N9^I>Uo|{ZkGe<4 zqv?@(BpzLdxA~E6)5;L_zMK-hl?p1MEvSUH&=EX>e9#N>LC;Is-jxo%C^gy68paF* 
z`&n(Gb_WW_u}zFi#xVW{Hf|UEdtDeV3rXd%Db<{qMH5fmTYnwcFHA_aGsJl+u|AV@4_+5h#UM3lgF2Np~u)0@= W(FQpDO#!_BCmwd9Pfc~(_525Y9`fw~ literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/adapters.cpython-312.pyc b/model_executor/models/__pycache__/adapters.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..01e585c449a5d5f1a119c62c423bb34e27f14deb GIT binary patch literal 23022 zcmeHv3v?UTdFBi-c!R`)1Rvl-kobf|QV+@&t%oI96!ju$$CBlku?s_-p#&NPnHf+L zRWM~!wWboMmbP(4Z=9NGa~f)`Y?!m%6Q#S|Xwz<+o^DS8gI>T>c`Khf+h+Id9?DeK zv9{ga@4tfqKnikVr#-uS&dveud+xn+=idMS{{Q}W{;aIb&f$s*UyOY57{~oJMfA&> z3#>eE;JB-t$ccQE8`6A!h)3KIHH;aDj69_oqr#YJ$TVgiGLKn?EMwLoD@*62wjmq4 z*FNSLaKrQ@Dq~?&w0z7pK&^Xs$gMjv~tWhUW?{|Bzp+*T6{@(J9qQj?o_daSDDdg4fI5z*EsR)PT}1 zsqqay7hiXp6U%>_9}0*r(JgvP*CFy3EF3o?h~A6B(0Z`~rFymdi(Ro&YRc8pJYp4n zDBB`dA*~vJkJL0`7Hg2!D%MI3h&P}c(J@SFK0lk{mu2m4bp=cH)^Q4u%dKK6v2Rk-|uaP9x@s#3J!fbo}&bNv6!xQaltMj)z8Ki1|)R za%5y`e@uL8LJY^H++7CML&eUFghnD!DRgQoE-9)nE>DFhS(IXm6dD^BC!><`Bbp{b zgIYn24vdRZv~OH~VthOriJey65hXOCg{iz0ML`EEq%|bdno|9VZix#~_^vHRC ziR;!ka{Zh(#a()S!jupaoIdjv{-wtghEW=tHCMu)XNt9gyGk@(dt564?Wxxz=bquF4ClD>#%H*5e9(EJyMH_^298UyxD*?fI#0;qm@+ahk4dr;=%qyv zn2^UurD2+nfoOON>0Ow~R#`e5QIIyMnn%W?q9oHI42?)(%t99dm38sg)s?cn)gX@V_5vqn`^o1`ZNEiw(hTDFQtsAg?&R44QGwOLOk%^(!i zGb_=gSa?iQXq^PK>)z4L@v(``6OoC|cvwc?JI_z8nSo_1XZ0m5^;v8AvfwfGWIYwL=D(~D&h}@0>*w23 zzD=`zw|v!CpSki()?GjMT*|$P{9j19J5V6wYf1T9=0_I9w68l`wINg0nX2l%Yco}q z&-UGQa+PiKXBW;cCYCCCv+kO#)BEzkrGdHDCFlBVU0bGZQ>t!LvSWX`?y;onvDxz} z*JD|C)zz{qWv`7c)TA4?z2)v+uH_mw-t}{C@5|GdrsqbIu9l3eJ>_a&NMttmr8f5^ zUF~Vtp``WDPi{FX-{*K!&n;_t##)oI)+B2me#^S!w!^h-LGnF?NYL-@-*KRo`+nno z3%oyUZS1ue|H#b4mA4{SrnfhoSXv<^<8B?FpeI-O5uo z*(LI#ISN!ZZQ(fsm#`El)U-9W3AvW2b=p>-V58-QKIox@ZPcfS3x1&{(JI=m+w*<1 ziVkpuU&Zs=6ZVAlEQg+2<9Yf9s+Ol&h1x`C!kDkYfu1@_mrGa^c6wntX8`TCkJ8d# zbBSeQxn5@4iMr?&uDO1BYr;wWlRH=Su0+XIy(>fS%1YOW-Z@wIj^}u8$|hh&yyOtL zDK{u~quvcrY(FS=K#z|$>7jxzx)Or^97MBG&*nVU_FUM)sCJH)1tt}uj%WM(2ilZ? 
zmdNOUBnKwKanOcXmrVTr8o-b!jf5wo@zAPbLBj(%U1PG-h-Yq#&N;Z#~45$KF(M|phf8mdeu&%ja{p0V=hWvljK~zp~-k8 zs>nO2Izkol267ntu!0jP=OXbls;6iGy4cgpkD!=ph>WWy7FR8g5mY?#xM~5h#_-Cl zhpI)ECZgeCiFH-pM0IzN(+MY7UZ5w8d9YrnRzjY{4^$h&pm@|n5tymjtEs&eCv_F3Dqu)*fLRl70kuASSRsq0MDbrSjQ zO>OR7F&g~kvq$fkxynY6_}i}Pj4P0G1?C%pc>Hq5tvqw{N>i~bu&Q(F%$RUW>*E|^)jHMMT*V*8E0^tuC=4`-_z zuKwbcUrbhaE%w~Fkm^2itF<#*zv1;`uO3@im)!X{YHRb)IZ#_|^L*boj(zPIYCD|T zdT^a1Kg zA>*FXj3Irsskh4T2m34Ge&4%)54=C}@Z?sl>wU!ZM?E}T)io|hPHW4I;RCf?mnf9U zk&(#xVOqWVMc9||pu-4U0^Q~MyXl+ggm^QrR6SxFW`~~D4 z2Wc3@;`N3O9UMIPpSx4gU3~;ERBtym9&lo1^IE5m!E*6+O;*uCa~+Y zCbdW*t)N1^q>096nOe<+D*d4f_sLJA*jfCQE;ut>))SaH`Zsl*v#zXfUB=gu@^vhP z(!M>H?8`>3zH8acIjiQ%Q;yc8(7J*_ z=BKo05%4Ug1Q*Y9?Yu+;Y3w)`G!G6Jx0h>*TOkg7m6JG9a|mgG*K}DxfL&*s}K|uTnHiU zn&RXq5h6htwL}%-9*PtlQK*L8OtFSSOqdT{*z-v`)1}}3%cxSJSAa=XTQ}SLrK9qv z5xZ|CvCjT`=~At#fO6GcZC43|9)Vm2RihyVqw4a1S?{mwEO6 z08jFT68bP(RXg5L3>*WJohp!wr5d3=QH4S99QhY;x7-c~%uQ6w*aD6Whe)i8LsX;J zrV24=G!%NN`bxI5{1CnW3wuA&8!A+A5%ns&;mmNiT(xt}uLoZZe*O42KK->%FS)w1 z?#iq7EB1_geagLlzWE!$uLZx+@wJX6_vWnAopCm%oQ-qwx17!FwGIYNDrOFmqEJY+ zhjN-5gq@+#XD7o^Ek}MHjmZ%>*En{cMx7i*vsr$Y^#>iJ6?l<*-@x1ISIkCRi*`c{ zsj8n%QNkS|Svnn20I?)dqu;C6olXK)cH*uVSFJo+ccSq%UKB)=XqE)gBAEzKk6359 z5ni-kEFUt@a_9LW3zoCvUeC`w)B~&aQzuk{edY;S0v}Nwtj{^U5lvE=SchtU;T%Ly zMJ5b8m|xHI+DSwqh_rGB&MSQJBDn;0%a^buL@<#AXvu4agz@YAuk&n*mLQipIW4t# zKqd0*YKjXQ2d^3ANwmniCyyZnrI|^YoRDPp4plEH>7wD->B;bErl*slcu1(>pa3+n zxm$EUF&Nbr0tE{zN+_gI=fIi*Ga3YX9?=W?Kj{GHDb5vCqiloo6A0PmT$%UMAwmVt z!G*dTCsWSBn`hs24&L@RX8c=I{;g?$cgDXn<=+X?bc{`{pj95l-SP|MM9Fc(QH=@^ zCiPf*@Uq-v3`@~yC5tTemK0tyy+*bq(*^ zY_?rss{Ynx3x%wluV$I0*|}=}vV+o`T)FpyGKB7MaI{76OQ;Fm72&ClK}@(DD^N{r zajT|@u&hX`1;l3psw35gx223v%0S1A>JR|~lz0Z7%odb|-3Mg_)w~8ftotc%M*cP2 z!|n?sfU!4NOf6p?zB+zo{4R&!9WUqde{pQ)P*!lfIPl8@Uv9b@yb^rvP^P{!Ro}UA zCSAWH?d@3-9?4po7Iu7V&$T@%OHbBSaaVAfcFnafHSf5~A+%i2IqQ;wpN&9%ji5!g zMrih~^ul?Od(rTs5nCrY)xZ}FV}>snNeDm0%^GHnSlv*UUbKqBB`%&<0OZ*>RIFI< zra_rFI}n*?KU6(WYdWibCe5g2x#$3Cu;kM8hTiup#*a^NuyK*2N7$BdTr>??Y#cU3 
zoRU?tNrGe_twP0XuEHjQSSC3~gjt@R6Hi%w5gR^?hLz))JomQg&|)C4l9`f=;>;yqi>)YcA@|^&)A1iUSEW56b6J4pYDaE<@T(71H48 zNF2Kj^7yc%Ktz{+5hb+2VIo(5zE;+$_x!Rt~!mSGb-1hm(w<=n1 zRo2b*&NshR*|uy#)?GV?gzfV+3)`0}wg5Hc5W}X$k)(V5TkbuO94fbJt2WPL&^BRC z2L5QJfv>^UuFvy2O05_ij~^MEh)QFm9v2VF^0*u{t{EPf_X@~R!l2j)=F@JC&VZ-+ zg5-i3YRDtkUnx#6g@8q4zU){*mnjOUpBp8aYt5y%SZG^EskCb(A2V>5Yy^KI{B=t{ z2RhSH9XJ)$S3tPfhbp;$RT^RE>op}zz1+*@VZ#|n0neeg(`Lv}X6R=P|HSYTbam4v z=vPh0i{yYy_M#{DUb2^Lfv|Uub{q$pzho`QcGH%GrQ|(%w0<0Y;qjj~p94oNU@v-b z0sn6NSNh=`!-OSa|G9dOdlt=2^RxvwO0#&)1%i`HRm?S2Vd@gyS4K} z1{XfRp_yk2z1-r+wXup%)8b&`Nmw>No*RqDQ{-M~=YYDEdMIe~V_RC8qEBuy|N2FK zlFN;CM`5g4w@AApVBoMb1#E=Dr5zj4l7EV>$X|g2k@!p`Dy|xB`K!qI#~9}~QLjRJ z)!aB6H+tV~+x@Bo<1B$(`fBG6U3vV?%7L%P7Y=4RcBeXar#l|KDWuyD{-841Hju6y zc)M+PZs50U<1d(*MyHE0Ys@Dv)5*lgLow+bxD`nrL9_fa+S6W<5yGQdPE$+_v8R@$ zm}&x?iB8GCg;GI>Mq^*0JAM<6TEQ9!VXK`K$ssm`su`+gARx7jsW?R~RnbNh+jgY- zm1n7nOXLvPRERT!RR0!Aj z+?(&i-fRgOMxI8|f(#=Qlaz^VLZv5aZZ9kUxiX$O#E5zvfL6 zaom@Gg#`cQFT(j9{FTqc`EZS(z8%XZWTF1Lbstcn6%J03s07(n z+k7ZIM6n3OdY{i?^z8Y-zMMQ)`#>@)FXB00rw49@b8m+G8)3%>!ehK?`!YvvPIh4l zhYdy23pQ&Wq%WOAO_~5xME>#&MK&a$D`uh%53vT8V8x;hY>1d(Q;dyuR#ece7Ng@* znl90#bJGMLqB+JcCKj2)&w?h~Y4skTJjJB>KsXkV&O-yE?|=murJyY$XbUFB<#<3E zJ0*!^KcIBk*}acV1#-J!9f9~6si;6;e1xT(I|D@m86b>J!~T! 
z)+G)>?E?-WwDu~6V+noE7>;tEho1kEaLH6kyI;|S?}6?vf95wE%NV?^Dp5r znoN^PXKhRz+4-@CysKC^{Clc&01miXw!SS>zdcpIJyZW!s{XOrfow$}S<#VncEEkt z-+sF(m}z<>)%3`X&UDk^Y(rC~p)-a3zOi(}?mH%<$Gc)Q)s^SfU4t?KsB5OR41@6s z#v%R%HBQJn_cW^K{`v78Wd9wNBgF)iTXzgLTRG8gcsH9qAWx&iH<7P8VQH72Td?C{ z3bys7_K$OxPTK1!3@{_PY}1N$8@%b;s8^=S3x#s}E1iIQ?;n2o4{^hP#$V}!^C5Ir zHJr6(&bS228<%(1{(+e@9pOI^?56Ui`iB;W)Ac*=a)_YVhDUBxe6Rky^*2A4UVkzv z_?Cpm+m8CAP=CK(iYBcQ|CK+6^P=%Z;YHJn=3)(QK~Pyk*iG1k;$hxv9234^95a1E zU~z#Nl<){cWQ$tIY+o?v^jgMQVb(NjhR(`O}O-YB9rFCQ^qFGb~NE1Y~I~*XR4{#~Xl0st>c_yG>UkE*Qp9tZ0&-TsbYy%-G)X4O3_EDc zC@uIkS3!L_VqjY@=8MiD8&g}_G2yKDda!;K>9e%|`5=C!nGX(22UfkSk78als~Pj6 z9n)8YkHfr5F)Ljb1!F781j!_=)V|BKiBvF9xLDVW3ZgKMwcnpALPK4W&F0r>;huy^ zBnE58?oeLIblvzyj+ug0=Gp24>MZfP$OJCWx-FNiqcnj3s+-f1Y@DOz63n_uABNfc!lV1jk954&4LmjDdbAysuU5m(7susX4Y z8H?*E!J(OYhseB#ZIF=)Cr9EUkcX{0xx+5*Ur=S{oGrXjE!V(>UO-MFTmCaRL8k^t zY_Nydwg*+1bi_k(AYa{<_0Op;heqQ<3Q(_Vk4=tY7w(hMcMSy*1TI zX?lzS8eP(#q?_4Rq@%b+!bmbT%jzs}Soz;k+MmEFmgpf3<|KLsdk%imb-M}yZ3Mef zNLfq>$lsFjZ%g^NW&FES{@odWf6CvV_7Bb;%ht7K>bg^P-7u6b1mCR5J6LRV*QAZV4?{h>FoD$!}9JGh1+q ze2Y>NaPkm6P^^9WJS=XbG7mHO|Nds_#aC7A4>ao%pSYttDpN!a^wD+ z+ta?oWX6r?Li=KG+V{w8-))y~_H*;C^If;9>#v+kRc~T5G3nlZ*UHs&Ec7n6rK@)) z-8(ffO>YiMmtJ(yc69-{6Y7+&lXHNaZ<0f!rYUqxnnEWpwG^#pnoaxi>sW2n0sjLS z?4P1|8$I+s1fcP`V{X*YKRiPJ@RpggkGI=)Em#+Qi^`3I-|PQwe{u(``;MgAkAA=* z=MFjAquoYLYAxi1(oF`-e!9@0Au{c$zK`k(bW3e1C2KH^h0+zl9+Nx*3sOZUz%GA_ z9HMb+g$|1*bg)9VDZkW65^IGZxDS_R7vNq?4(0by1PAgsMMU6#KnU@#5<-3qA>;u6 zfx&1plR;=d|3jha1EJG|(Smc!x_-VXZEa1q@40J4{O&LC+~z($YjwQryyQ$)ZG}nY zW^+>5uq14Py3M&UDFp8qQ3!U>WLcptFA`CHCh^1wy%UKfI8Q_;5T_xMki8x>M|c{# z*k(I~5@;UG5J)WXGA$jnA+O8Q8F~>FBApz&K*LHv57AmAmgE6bg2WPAxDib=`28_7 zQ?dlJ8&T8C005UzPo0wj8nb6OzF53a2*XR^;WL^_DIi8bTe9ID4n+Ae;yT+$UjKQ zbUm*+isUy1fevX=u!~tk$+TeuTOUGJ&^a(z#56w{BxqIu{|2EvZ&`(dw*QO+u?qIc zL>G>tPGTiLP4QFNJU^0yA+6oT0K_N|fG`&U2urnPPtC*b8qHHC@?`C z(Um)bT_f-Ch`htoi1vR$LDdGLt=$;3F(NImA-=8xg$Kv z>`vG2Sm6*^F>>bh%ajaSyzb$};qOGh9ld!h-T1`J;F3^x+fkboYBef~Ci0g)Symd6 
zRM5jpFf@X3ub@*nXkdAm(bbpcYzNq3($zcAwDK5vei)DSKxjuQx;58F%BXzdSR{yc zG2An($|;6>Fp0*|Q^=&6?3Xaa^8^?NjT}dr(KH>n`l?ZX+|QF$qKWo%<(eWGn3uss zGvz5v4J_igR$;Rt<}OvfxWuY{!fI1zCXllsX&2;LtX}(|2LApMLv+ugVave1a;ALG zZBa-f#3cft+d}8UQ=yzr=|?oKj8v*+lQ@h#fisK@P4>a`j*);g7082AwetAEPlXQj z9}o3EzP~p#aPY+8$9s>fMjSDb{{xl$?{FYrGFgkjsn)N*(zEpWty}umoMq>xL(rLp zCfUj9oFI7Np^utDT@SkLA}tM4m(FmvtJY_#x>8kLnW~;tRZphsaH{HXy6WhxhF3&&(cMa&Ew`Xl-ELu@Jf;rE7X;%d*W|XB}@@*RvG|y30n6=ACADhOu0YfK?}L z|3rqhmn?rsCCAA5G&$YmEbGp%&}dX5hJ8wt=Y2zkX&bG6?;Q$hH?Xqz30%aJ*qZcX zbr$Rlr(s5dwPd8vGthBFK$!dr!5z7)ltu@0wshGN8l7o~Sfw;8r4Cx_>u}m!IA)kI zi^A!=LEN;Zpec?Mjb~TkXmc&%iTq)3l&;Gap;U|?UD^uQRxv=JAz?0c5tT>cEqW?Op;ZqT{KCO?xvF%DQBa(oAr{S^&%JFs>ccdd32keR`7fCJ<#i%w&9rQLxmE0h{Wwx z$3yLyosMYTSpBqv!F30;jyv_z1s|!q3o$)})JOGj!AI)8LQD@`cjk8*oC)XX0X@Co zC!j2*xs`v0vImwcTUD<70i)hqs9O(>_UWO5pC~`g+8sSoNYg`c0$c1F?bkyEU+?M9 z*?!s2+x`VREYDo%#bBXtdZ^ULP^R5QYu_!FW79MTR;E2zt)7HC;nIaFqFK+oajf@K zdW{8Nbm=32Qufi~g&aL3x{FIdB=ZdFXI{&nfW=ibD&boj^a&05Vca7VJC=V82gWOZ z4o`KmZB~uLu0BjgJtpG4vJXz)ZVdE_K8QgaVC|(@k-<#3BEjeE!D@=^!F~zlVRonC zO#?FlQ~C1@3F~I0Uqz7%J;l165If|n>8V!1gc4?SzfZhnJ*vo=cNrSYZ&$R=h8Na< zYvZ+z$(|$WjYnSMQx&ZrGAq;1S2prjEsZ?BP{OROuJLpl2)bXgRe+jTzGl>p$!gkT zN?{_E>WRyf_|yO$5X6Bo?BYlauCQ~c_nb_hojZGbuUxk`)C}{28`bVos*v2a>sg#K zJH_m|=(QE>1?0cSL&OACD`1jJ2eA+d7jD%Op3MO z43az^3NgKxs_ikuh9v)S4ec~Cju_|0SO(9A43m$m-h2=POvfX}V>0$-IT@R-9CVlC zF#xUHOH%NzLJ?8X&PuWabD?3QFjfYQa2V0c}p? 
z%Cb`;;W0K_6fYWrx@yg(X|7s+rgb2%L!)28U+DsHp5aJmb)0`kIDWgLCR5Rzs%Xws zY)w^co$1fsm*B0w+Hs{LAO|t`Ev7@l`Fxy zlj*W%#2w|cXTP{>ZvC4M90{pQI_lo7+>o_aWUP%TYva5l6WEao?6?`#zN&HX;s8|b z$hEY-X=%lGFkpH&Nym}r2bWwVFPC{=K6dF?#@YO)v-!5C>T2DUx{POQ%Cj}?>0Y*Q zHXp3iYT41}OwHC*&DO<^bj{vb(@SMg&ek^E;jq4MRaDQJeq&drqV>&+R;b<@%uwOi zLjUGN(^YL5Uw6vay?F4(j^wAFO8UCfzLPV9KNR-9>uzA50-4j%f6+1E&J5v=qZ&}M;F1u8Qj#ip) zYvGKwK4q=XSl6eZS?rnLn6h?etXopnEx+gcPQ$kw($>9qOoqyR{7gU8iTa4qDZrV7 zbRyv3{NCjDBk9VcGXrl6p35zB^-Dr4ss@S|ycwY?B~)Gh9FkbfpA!7Zx=l+$N47|$ zE|z2SD7#De|0o+Mn|}cq=M$rBlg{;v5$GpI*=8Lv{J(^O$o3=9yZ3hDGS#txTNc}mH^>tewLpu@{9Gh?s8&1=)i3*?|XTmYM zzeKA}9f4h89~YP|V;^7%++T9^u@Y$|Fm!^U?`cBaCD3%KTBcn^c+(}?i}0o!coS$d zk^ekQ4sY5@q4DAmWr8N*i*H*1t1rlQ8_;`W#6)ZX`N7hA6s3nWP@+WX+xOapn|8cQ_=L>sr zT6A{r{~=@muJ4~~`HjzHeE4ct`+^6b?#lRjQof!WzMB?o0Dl7Zf2W*#w3ko2d%t_~ z27kF$0}G3uw>;f9PR<-jx_e7;ed75u{kNSS4dVO*G3*cD_i>(s+8KmDHr5>W3b1v# zi`k)i?|=38*(;DhxAL#yd@PkVfWX?TlwELMMPn$kgI{CSkTAv}NNZP4eT~EOP+uF1 z)z`&JXkrh=tN;S8kW(zbk1Tlw4y+Ff0#TY$3$oA$lrI#mOor@lAvI^Ms!N{#k?tc0 z@;Rr35x_yLDh`m@+m!M)WxQKb-YpsL&Xjj&#(Om7J(~9R&m6l&czM2kp*LN&b*3-t z@GSX)DMv6V99wvL@#Ny+8-vMB$I`;FBvi4Eb@S$wqiw;oa4zlGkrZ}l>eX&`oLv4D zw5Ym<$07K!F9IrE;ZwufVG%O~D_Y6)BDB*agwmLpr%5_i_JO9ZSWIrBTG+=9x^gxg zqzuiaXkYY`ZIn`SLp~>k7QRoYeI>A@p>S^VKG%wa>TG0^Z3$s7SBUhg>=TCaf29!Z zI&1q?MBW%BR?XN6!85{on?a|eZvle3+iW^$&<`RC-ck@GY;zedg?Ie$pbd*sl@ zl%{#UPN8p-L(&qXi84(M_Qu!fs#WW5JRExQsJt14@#B;QxXVVK=kawW-tm`Q`8!5a1i;9$V(%&2L*FZ@G^j;Q5Vn8uYlKxa6 z8ep(HS(~bh)-l+VtWO1_K?av48&ZwY##B?ZDHV!_SeQ52oC-(73@%T$q*|k`46aDF zrP`zI46aOeq&lOWsYo=!{8h=WRClyH)f4Su{_13JsxR7?>W}s_e@${rYHM^GgMG8DdCYG5LMlZ&w5((sGl3a&JMlZ$>Pfm;_E=bPv$z-Ys`!B>ZvC(M(&B(-_ zM>rA@64JOSDn1pvAWTkA$-XJY*Gh4yMtV|+u?EMcMg%k>o{YUPDJ1z=Ix#~PawR4b znOJi2!iBg1Z`o59M}#0~A*4<4E{!p8-vHZ^kj*?3Hdr$(k? 
zm!6NMM@N$J*u+SRa(6!^k4E~`#?w&P;wmo)o9# ze9O+9Ih{zSQTC~k%P7ZFj6OXwH3e@~{KbqgGK!w1*jQqMkH0AS6X_TRC?%(31Cokk zI!PU=kxTJdJT;ZM3{-@q;(THx6E6;6IyQ!^CWXtAiy^W+(~?_m+LRCzML5o3@#UgM2S@*S2XBml`;`;4LqE zqE_A-w~d#VW+=nFO+S3h%-df!M;&wAi-xH4DK6ra?1#oulW|O}Vafd*jeT)4OIEoR zk}Ezz9~WkI^4Zbi$IMD`z@???!Jk_>?%RgzTt+S&?p1r1`xbDOrNY2v)RanCmWve^ zI*!X4#_eii6##}=@c@HuWQt#Y14GRh81D74pP^Nma!+xjaMqL z_8za%UsjZAii$MM3fZcT?(YVkw=f_Qy~3z@a9aPf-$~DfwcE6 z%Ui{_<`k9VC{|26PizfX7I%#ZM)(J)+q(n-kYQa0@7;p zmVBK`DvWSQfwebpyM)#YM-ls^7OFt{X{e5~w#>N_EWMd+Lu$4E6kOEM$eXh!d^`5< z8gDt?Dy~1JHO+iM!R8wi9PdPH&nO|<`_~M-3ni%YZC#8TF;8nD3WPKs#OTx(F=dF+ z=>x=gc{AK{)I(>L|unqMQXo;Ee_N5t}7mL3LY9*77SS8oR1kcAO7+ok;tfNARRFs^> zcvM560hh`^Cr*zNX_W@ql@@wg;QD+e*Ti&6DMYelCP9Mgls61-v1W)8jG3PhA$QH?h zP6zR3kAY?b4ILARHj-?aDY*|_tT>tAl$>%AtR7A|f*LK9Q4%*Fe;#~9u|0N`qBjd8 z6PF|#{0zXHpdNyk0AV*JEuR3T%vz{#8fX@0_Ldq#p#{9 zcU{^U?@uP4?Vq}wxi~q|PkHvsoL>CJ_$U|_LH4HmM~LX|o4PE}c0E&}(BOT_#NC77 z^mfq4Kfl5~EaMzyS8bv_aK~PE*S~kozkkmBsykoboU88@>wD+SckF?DT_{)AE!K4- z%Dc{UKhXM{Wx2qB7#KjDC-1MnK6`C8=Z}c~$eN$<-MV3I-MO}%V%tv2z9CP(17h0& z=DX*r$hm@|E4XOA?P|YU8O&F9E$zs4?-RTCt#<7%aE1fh;8bjTU@}#e6*!Zp?4g^h z@n1iF?Ref@nR7Ra?nVu$@ZGKSU+=xv`_N&jaiT&FU%w>9V6FZ%W)puU}<+-vE} zwG4?ZL*xtPo7-~DJH+N48Rp@N`Ft}$GwBK&HuvGwLxe&Z{J z8uIx#)WhF-+uvF6lRv=u8|I&0cv`G#FVs>%9aqX<=fNc3b5k=8>4t2LhyX*NF{PVoKi6AjPb4)`VEX-SF^+tVv-KWjWV)H2H&vgTV_9W212J#54Ck}GRrEwRqpvbL<1 z$p2Z}gfVM@pVMQ6c{3xNiLlZk`^Kv_6aa75rq&f@==E0CsPz`a50>Gd{T&`*0Er_; zVG=LN!*)-BZ8@3r&q!w`yO5wXtY9Z?C)h$Drcp^NK9LgAl%Z#gLT;cM~eYxgA5r6i^Ip_R| ze4tHU{Fb&Y&8!9X%z5qwf;SyE9E-j9OvLcEKd5=P;hl!n@V2$^6KjEEa=iP7J73qa zJc~uBw(S9DtaZ+L@Wo@9ay5#s##L8We(M3z-8Xl7wI0jN_7#s9J|eoC?uGkT-A#Fy z_qBaj_boKvcGY8re52>}o?Q5x7(SOTZ^6>rQU)C4R^F2Hc8K1NCFk;qHSfVy`$1s> zZHSnLCGKd%MRXX6^K&SgQBm$U}JteS5aMXFwn{Q>OPjS#*NjM#|(?-XsIu71TV+GpZs3(69%(Jva&`| zjQTVbp$8uUD3#?w&~D^S_;1F4BX5bD^mv#t2c&rxNbW4W9seEp?~Ge{SKJo2#~ouv z5`yqXCJA&S&V&E5uMAk;xU=XhkGqOcMcmF;;(rxy91Aj;VKroiZoVe&(c>w3-zMQz z`r%_%-Va%#m&p>#@i7OaGB&MM0sGI&!Ke@ajDrT`!iN`sI`?}&zkfjZ8k!+FiZWRK 
zMVYX5(5H71(qxEG34~Nw_JJ^;!nb=do*9wc3@)+?l0P-_Vr*(MonYKSToF5`B|D{z z;lo9OR60g9J!F^2c$CEX3du|SlFT1DMlzX+81ZAm6aq)RaFoXdfb9?e9;^BW5hs}C z^&(EJZCp&^d>2jVHLT(+;DL zhmuiZQP=<&`#?SLDRu{vf{0VD*Ao0LoV7qtv(~JQcinQAh%!|{^n{JgGu59VzMDNLL;YHmFgB7pV+>p! zF*8Ed`CF2M3-gTA6}-~y^L*EsExT3rR&hnaYzQf7A;|L%GjW@cLD&U3{NB4#1+>Pb zw2WadkoCkzXB>UcVH=tfCdVTtVJlEInZb;$Z<58#IQvKh!{9febH+Y%He!*mYe6h< zS>A+7_bJdgBk8y>kK7~=RF4z!$rm9Extxs8So&xmidcnTrJ{aaDNhQJU_;(6l}f$< z-R0y9vh?EHK#VjCM+l!}3Bs%7(5#bXU)~I=l8mYQtMDyENtIfty49LcQMvGcMkv?pUkbHD}8=b&{kYnD7)ss;h<~x{3sTK8Y ziR8kAVt8=*i(>fT>f!TZ_`KY(oqKaThi~s3UKqX4fFJ5~I}!8i85zF%~2 zVIeHXl7wGat!jJdvid2Z>tQ8V*Ozzub<&n3iLO?4B1s)3DMJ!};{ty(btARx%WWMJ zw+`L$9ihzb`Wke@(I3RCg>SpUcN^RD<<-|c*E|sK+}wX-f4*TbU(-bLpvEwyKuwSW zed}|DN|G`Sz52QP)eWfGYr|KE=_U8cYp1TBVl$^ptnH%iz#pV4ZM(#_U2@l)Tdiuz z`v%PN?DbX z(?TKh(Wh#4~u;BsERo~qXWoTHmjXV0co{H9;6Q0@cg}MXZzIJ zk^D%diL~s6Fv4|q^_P4ISLq%NA95_v8A^RrSrs>~FxDC>#sxAgyMjCG$T~7jO7wV0 zfwXtlqVy$G?K)6{7A;nRvJM4{mF~nWBqJHqupj^h_B>NQ5%u+ptF!MNhEMF4x z)&fIo?jt$(Y0-W9{gMBC;RhGi+~?;Ev}%#aUrmu~-zm25T&X50*IM9-HTSWc`;6#5 zlXrVwd*bR7kVko(a|YC1Sun91eeLzHt#&+~ukv3%cen>bIyfo`oq6-pI&ZVYuzdOc0zkl9k_n-+Q~)NT6ND=$DDQE zkoVTid1MJ!g>VI3D!fF_*U2GjQ?d;mIsCbyBTq^uC~GAPQ`1RiJ^}sgFA);4T>=dd zlX?k^USmW5%M?a~B>W0FufQplZC#~6a%kERbIfF0+9!1aCC_7yCEF-U=i>)C7Bf{lE3&gp$E zcs2Nm13p9jxnk!n((k)#-;jHwl+ZvBo!v*H0x&0LSdS3sN^1le)RFIC)?SNy@~b_$Z8Mg;QkQ zIJ}XX?L+%z=@v~v<)Picg#Qoz814q?7FNg&(k;WdSxKn9!Ce)1QjW_IMZlgc*%lks z)!Di(#>8`K473C<1=q!zwOGLm{#j=S2QO--wQ^lL3vX3(wP%!p=dxyHMMq85wrhrf z*mE}HQuSJE(Az;#OP#%2-k}gje9eXjp}q}4@RDvMYX%2vgS^51R&i%Xy;UxHle!x8 zc3G0_A#Isb4$=k{SbGZrZQn#+$mK)ck+ncdQS1TaqS6f$J2EXwYP3SXk6INB?G4;^ z&8I+GdAD38(i^oj&D?L(vUK_pa*E=dM9qogWX~@kFSV>+Kwkb|V7*k@3-S%mu*{!- z1NqBxh7jy+U`7uiK&d1Grp-k|?X=7Ze-jBH&LATRl7dV`Ji6 z)u@*oT8$!>5~HIX6spB9P<(BPOoPN3k_~1SsF)!95nNoL_uqdHY(;?m6dOaI()Yeswvs z=HEBx&b$1pu9jOiSaYma?^rQEG#MjgT2SH7mG2bGcS8T@sa>tzE4ueR1dHFb)V{jy z!e>|QO%Lr{UGLKQwOW!Jl-;lG$XC@CB?|dK5K4YA&0P&**j 
z4&?pK#J{%g&DYdlk6nwgh#s-Fr^whcR-duo^<^^q4I6=A=;nbN2bRv~dWS^()iYa= z0F#;2L&|4 zdi5AOG}N*9e6C}+*s+^TJf0K}K6$5!S(gOwcJ{0GFN;TWAURuivaTb+MjLu699i<^ zB74Qi-uHaDLubW9XYVwhqsYd)h-%Mu?GwB9z1N<5{G9msxjQ5&`Juf@CWAD#!45to z!9&+NTDnjZ%(i`;vm(abbfr2=<_(|0z|R^dYBs~sl=_Tul5?XnHays3jJFKR@;pn* zAqd(xKAJFqVW?zMN%LL#6vY{6qjP*p}8F%r~{gjDuF zAc645@tSh(7b2b`&>^e20n?P6g2GO=7R1Hg9(3mY*6R0cDY80H?0XJczF@QFnN=lm zdSm+y%T<`9#njxdnsphW66HQH^w=RTrq(2*62@;8OV=A~p*y1r5cNw%s1Cc6HG;h| zfW1xHC`=CcjqrHU=O}VY#T7+)V@D&Irt2FkACNq0BX}cPqA)=Q(kc!wY81+jm z$ypsAFeUck8BZT$IEwalLJnD}p8;h{p8<8u1M1grn=$>BkA%BwylS9{cGU#Zp-3*s zTeL;yEObcZh=J`I;4;{!$1ui`vux^4y_u$*LRavx?OC$#!{O3r`H=%NZQA#%Qp$b% zm})b90Dz44+o)ffmK)H{cVL|>Q;#4Z0R%ffc3FdEY!860SY{~%Vr!TC2pomsw7ybR7Z>jZG_rn0^4p5QpWPTrLTD0au+r-ee6>DzS zNpaW7JAqSRTaMokw66s^!HXQf-+w;u2`-GUx+7$#zhCU$52|kqG<`p`_sFD*NCP+p zXPtpG?M!TUqR(w-AZ*q$QLzbsqP7?3J#`h5LXgbw^!8o7Js0P@F};~$%ECh(IZ+0l zOg8p|)hI2&D96N`7-Y?Ol&YeHG6mM&2vr4W>q2#el#0l7i2E|ujai!!CfwkGELl~U z%D5}bto79jbIFpd_FuJDf&!(>v^|`1;?3VtIjpBhO|0%IN($_XRAO-~iY3~!7A*3X z;R`a>y@T4)iJ3KtPK=fb2ZPuo1`o}h&ilKUPKf@UbI0!2)GrL(JbB|J9ZK0PMs}|ZzI))E z13#)cF?Zz0?cGbqmuKI*xYqtT#zNlM1yPOfqye&r^58eWn)7zu_IBN^YR`xJSDqHj zk7B*3#!((4mM1$&Q+D6jz4+zjV{7&MaWDz2<8#y^dysvUlJnV-4DQlkhvK%O@sd?O zPq$7EK}$OOI*C~>{W>!g${u|IGv*L|*&80cuWgKR^dK9GIsX648QfOMEQiV%@&jtQ zP$1`jzyU47rW|p~%c4V=dfL!w~v3b{O_3rm--hUpOQ@9dn4Plxwo8mN7e(11y${v^;<<0`< za5&k3uOsEuRhNn`9ap{8R<@1Dh?;@C;n5rATV_Qq!Bz)Z3-5qA91bMHR8O*j6k;}e zW~plQ(o`m<8kDi42*p4ZUkU+rOR2i+y8T;9iHtBX9UE!>mywuF>=A*9P3;qpa`B zzCLSu^!^TJ%41VfXm2Gg_AJ8?H*fgz(=fBa+D>LRStE8DbQ3eXvCleYomofL`n(~H zlM#4xg5lM2-ozm62wBtf287hYIL4vnuRus?SN04T$XNBvcX(C(CusQ{D%n9{Sfk{D zl7^v>va(&)m38Uut+Q_IknXGtyQEWDQw*Geo48|!x$vwfYeD;_(blqT*)y2Yv(!ef z(n^%z<*mT_?DmwG)DZ^_O6s@;bu5?bsF~U8@!c79Z{}?opVDix3`4(lN+~6+EYCW# zs58`&#T9q9B3qF*yqeCI&*CbuiKoq6)`NCE{LKEx2(j@r&Kq8JWmFn~Le+ACbA{sT zOIlw#LKGL##z>O^WL>-?6I9x$jEvR}r34%+FpsN-LJB_U534fM4Vba7ehIBIV|{TB zM=UV@T>>K6!_vAU{60Kt*d;ma`=w#B&9Jb-!f*u?*#{Sw7PeY3`(xx_z_%`Z3@m1g zJX57=qcoG^88`Go%F+2gmvck|8 
zYo25t1=}akvLOp~$v#W?cjV9zNTv~CYYU5&TTl+3wU}`+txZ|TO@!;B zWNdAZ^SJmx?#4820^mJ2JhIrpP1P=5aFs5a0n40BX~Y9$p0;8Uyuy8Cw1u6k_C_)Z ztzQ_-)o&5&x2)A~Tk{Rfx$cJBe!C4)ccKV)ilLorq1~(YhWjmDIGHy$G~fG^?I*?Z zt@F->lS_en)lIO*PcGL% zVK;BS2b-w<*Y+>^?^H#W_?2$4?+CE&!c2H@x%y7k4oVVef$2^lGH=O;+H#=*F*Gpm z`Ehx}LYyeRJLUU-TwYV?=XRVj6t;1#k=63hPnrhjpU59RAyz*&e`3+K^u-lBSn8!# z(ccHdVGxX+J*(BNc(-)WS;D1%x^e=C73WXh_xAxFy5m0tw0pshC6^c+oIe8gxUKt* zldqq=8wf2{{SLoaD+aok&gFU^6MO0Ccbjki^oLfisV5(3!12g@pe5fN$%ni19o=LP z>zjY#qjE0LlyB-xZt8r9suC07idT*D^yc8~Jb^e%tbp*to0UAg{BiW?`n`1xgDAm~7m& z8t4`s-48qzHe_&HocX$jf{EVk-EUla{gMpteeB(R@9bmnJxB0?1@VQT9aSG%s#!`e zA6}_i8C~r^Bt{+=TMs|r5L2j*8!Qcr$Cl1553E?2?cxCR!bil`qXmwl-XC~B{exYf zFql!etdB;~PwJxx$sZW;U6-m}WFm`>d|tqj&+izcCfvDb=An2=7;s7nT}%%YF?L+w z=HQ)B)8LRN?~dDxY8ubWWl;xI9Zo$fAv)@*8(zLicm-bx^+y$y6mGtHGaUVJ2(oLC z&cjqcO`E3n>8IxXxM{MIseS^RV92o*`mymcN(>&RdmeL;g(Fjo-i#PI(i1#zaPXNS1NRm%=ADt8q zk4&dWlBZ5%m;b*&7ygbMkcqOsDPQOKKaZ$+xh$CooU8h>VqP%lI(aO zKgXe=7qG*R;fBGZOuxj0>B1$-jaUbj7sIsFv`fQS3^thf{UfwW>@o!6-#0vDwhv7P z$z0gOj=0h?#GVqW&QM~4XXI6j0L?HTxv2SPNC3zlg}2btA_<)(awn6X)(A%mbo7Jl z@`NG}gi=+B8Z;s*rKpr!8n5Y!kZ(Y#q4rijbflrvL+k;!r6Zfu(gIlOxhp}rTM)9Dtb$pK!qrJg<41m|zr-djGt10b>8=mol8v@|f&td$r zM=_OAG;ps@f-A6Gp&Q6_v~k7DGF+L%N*=s=(V_T&aYli)H!z5FU=J|PDzNqjMz`Vv#?uO{y@5f?3wthe(CS%- zB@5GAVJ%Zv9|-@ccjP7Ae56GbjDcNutFP;Av!W2Y$RH>Zqu7X7cxHtKh zTbWoYTgL~ag}*aJjMY z9s-2-$@u|AS?La1+-`M|Jnko_am8+`c+FRgf|#k|hD)}De?l?URGSSKWu=7?imIb1 z{VTVG&l3W>cx?S;WWrO35S}FGE9B60eN+v{`kZ>7t`Zwb!A6HVMEIYSA_=GTZj(}V zoNV4yvID9L)G6#n=|98R{d}ELF8^stIa%?R53(bXwRdskEzol7)Qayt|60#+$SZwx zw$2Hny+j#SA9+;&b_%#bGUPi2#v<6YR^JVs*r~gr{$*=!%OP>gp|#NCq+e^mEl#hU zxOyVz-XXeoKpaljH}!ycZEB|IdV+96(hjk4$BHkv`;3Uc#xpD`-`w#==HY6{jhv@!OQtNK&iW7 zfZ0fAHg||EJ60-gx9quF*|^$te68}ts{6z}mv7Y-&Ub9fbqt9eL!@wQ&36vuI**E- zN6FWg@7ta0ds6Isl6>9y_N}@0$Hn%?$=9;M7a7b&j){?Dq{D14csO^9EbcEaei)49 zr5k!AzcQCWuc(t3thQ96M3A5X*T+H8jgNsT9*5Uy2SEDClFNSspj3*C^_M7{4Ue); zV-LcVX7Wa8@PMmQA6f#v%b2j{IW!CfQuci%cgT+{dU8h4Ik0TZl7T|8JnR=o%(P0v 
zlF%%9)TIIvn-_&Gw1#aZ2Up5tAtm+tG~^W!6$zxc7I0%d6jX55$&h8jFpX2y?2>WZ z--pE}6NhzT0u|nnt85%iUu>F0ce3z8wnW^CjAxqI!d&{PuFv%*eXz6*Kosrfn!9k& ztN+dZABFbduD}67s!gi_^J|gJQ?QwT?q~+>hto6*)IMh6|x}^VWRl z!F>DPf`jukf6Q4tWgptO=DjjoMcUzY5^ECz=>G#E3z)_k?~RY1%sQ)TWuM}NpCJXd zCAL}oHDKYt!ztB5Gk$tT-knN2VS{e^ZR!QuMwtHjwVA6kIoSAlBTJDr@8GI^P;SKA zRKjg?HdjFt-L<&_ohA>?suU*=CN7kpaw80^5y}`-cj@UU9~om363hf18RIee0;+#a z&1wY-j7`hP;?sGMX)Md#%IKbc+_G7`CQN5KuSfVpYB32a zF_0BCxr%nNq8;Z1v#)2Dzr0p)7}g-_;XWp`)Nd;7o9oviWo4j-=+8?1HAu692J475 z0zX|ql1SY6ZxWhO3KQ8#6|e-t3d%00RHIqX{4H>(XXr-#bv?rp7S&9p6T4I#`A5_l zL``8Yt_Uv z=uPrHdKbQ8D6^E^t82R1bEBtV0Su}q-0?>6^3E`mY^ z%XUpJxLuPAZZDAwI!ok&fJ@|pkSqm%e04|Y{?x^Lv!QCU4ggAK!PZ0i}^ZI_mfQD{4s1-nG!NM-bO)8#Mn z*(G+FiF};6p2g5QmPoQ&ru%pt@&uJIDWnB(_Mpy-81(BS^1)&^W$vN4au&yvNI3B< z-ABxBHkMf@V(5#BvAE)QMuZ9q+0K$Mo(^gf`Nmp>MN{uTRPL!(17sz6>!+k;j1)5q zBuS3c{AsA{7fje?Famwh@*OeVpO#`Hd&juGj!#L!e&(Y16DRC|g7j(m7w__6gaO?V zBws%>@G1GrrkSL6Vn6PprDV)%J#|5^UUIqPIvGtq_#~Hru*1| z0{kvXJS~vaMIiB&6#A3|_yL{Nh!7&&_-EnD6=^2iFga|2Y$b@K6GA&V9puoyB6(P; z>G*Tg^iw~w;mZ|@pv@ffO|qV53obia$Bwqk95dtd*-|$|c}$!2nf|;i%!KoaZmNz7M$I2b}MxocjZ=`meZw54in*!5tI1W1kq!2IGeu zoR6KHq4fjq%m>`A^?}C>A8aq2ht_d};W5LU@Y?08mmlI76+924rmq?dzWFDwfAQKEA94ggeBAscgP~)= zNLL{|L z|37;H2(QHUB|S4Z`1hRuTz1cYzW;L0ztq*a8AwU~jYMCFVLrhJGYJOq@YgKE%rg=r zu}LPy4zn!MT#_5+=)WVy5A(FlkrYzSVJD^eWKGI7?4q=gtWCLx-IR7FJt=Wmq;yTP zE>%BVPia@OA>|$RrhLP`RO4_XEvrp7rToKwO1qPRRB$**X-~2_)iT_|G8}V~k;K;- zsZQoc`^<@#|L-_-dbo{p)&pn5CY4>Vz zX*H2aLuvPAS&hP@k|EJlMv{}!u^5RZlR#X{kfan<5>qnpIi%;^TV|{<~lp< zKF@?*T0Nc2`D=-jXD^&MF{C-;bpD$7D!x-Z_az&qS89Cr+;$TCr@Z?(Z{$3sh#p9g-Sg5MfbZ8G`IcYxj~IAQrD6Lt=L#sT$5 zAujh2N|q8aRW^FKl8_`htv|TN6*)?Wt%bfc9%n-E18EKzo75oXjK$UH zNE%j}SLEaf@ogAYYcTp7kz?vOkrmC8NJq^FIz_EE8jUAoiV}?~_<2Hly79-8`_$Cf zzOlsEUNr_=NZosV{LmAFR}abil8HfSl6*CZdqjW;7nZxRdJgm`+-Rrfb0fSY-qYUIWxK7dm}r; z=h)nVqUgPOX6DQS`^Hd?#oFsL*WuxNMs0&P=OVq(blHP)osOkskZX4oQF}azv>1&7 z(j@g7QV*;|giLcNDxs302_Q4^E1L5~iuWXfwRMI}#^q!(8eL||4k%Udvpz%Li6AJ_ 
ztPLxkgzOKmGbJu4bQOc!N)G%#a8&<4_uS|BT-whIzJp@7Mb&UU!cmy!0H!HykK$WZC5e{>yK;bO0bsg@!n6xu+Nm|rrFkx< z#;+VtU6Libk2lC3?si<(hr8f2RKYPw*hV;3EoriYUwiO{QVRgfF9M}DZ)RUQ!r7PG zmyU41VDA{;7{C|bk0uo%j3#JjKO;ods`Lx)jc}xj=2x5>fi(ERbIYDzz^huhS9#2K%S7K>ssF1C|qq)a$f%IPZF_IZ2h?ts4 zQc5%p_C^%?(|DxJ4N2jc3a`E;Z|LGnA+e=u%`HqsaaK^<0w&&f)tQKR-yfVmHFv5g zHWtLryx2*Jt$A^4v9+_%x+~whYw1K`&*A)@!>cV%7yWGoe^1`uvp7-MIhfx$xZ3o@ zdaYnUqR0d~7ETm8BKeNU%8A0kllg-u@A;nv0IbzF0|E)z1=$7}#MPoQ_%k3j$3HvN zlPq+!Lp%vgA#)umDq0F21~k$rD#NN&MoMUf80i=y0pR%zAtNl1qwohfn2e#3snx0^ zmR80xicF4UMZE!pfM052LSE*|V5*={89+**$xvkFof?@cdRuV*RatN|Ye0%~6G5ac3?17Bpqf0o&E`0BE2LDTbM;Q=-GyxGvT?Vju z9DY{%Qv#x&ZjOtI7oQ_C?6kP7fHUwIRQykvWfdwqs!7EHqHQ1KCDUD>iQ_H23D4drj=hHy{3R zTRwO+=lQfPytHGj?EqM44fk8yQFH&5{NB}fudWT9EZ(t#2XW&=HEME3^R{l4F7Xkz$oOcXCgc(@hRz(Aprbo`e=1ag>Xbw~*W2_n&%T*Eh zu)-rMx{EYb1U?YsX;OPk>Jjh9jBYnM%KxUt2Jh5lma1Wsf}VDmbon1jZB_?LZFHm= zSB3tqwSo{$?aW#L@JaTSJcoYIfNlWB>~~Z(51-`s;w&!TvS56mosYUyB7O* zzZ#lq48_t?=(`pVmT}?8WqpzlK7bIkIlolrroUV?g0t5L`3)Co$T)@=121x?c*lY|OY6S`yvk{?!a?hA4DSRxK(BkK>#{rFx*Hzq=|g|MwVyt; zblk4LRln5zeq=e4ZyAKLx}vAy=HZ#c3kPpMee3DGC%p9H`>!m&@@vnN_ZvG(9P8^T z_JrSi<=t0SyAPHa_A7ovF+ntFv9Ba>zMT&pd{aHR|8?~xHxq1|e|hfZ#bfzEFLdty zw5hwOk6DD8)Lg3A5xPBeYY4O$sy-{Ygn^^F8Y6Jj$OLv2o7?BVKlgpJaTWWQ46b&_e_-SF<37ZthsY+QIcQv;v zb>1u$QD!Y0&_+nhf$mzEANmZZ{n>v*f(W_;#2xI<1QcmP{K|OxDoFxO1v*?)HgqV5 zVdcE4u~#*ALcu{%Iw4iqIeO@;ET4vj^ExnBSU4?nEeqS;Zl7`H9J%LK2*Plfs{r4_@QR?5QpvH(m=%>HeS4shK$R6a29)NcA*lm2u&FtqnNiY5 zQc$hf!FW@7{VWLo3uOKtWYrLU-@hF~P>UURgLhpY3abMr)^&)2kh_y<-nuYS*t$Qz zb^pppVc=AL;MBd~X@EV?{ibc@R{Tl^8W~vn$VP8@9_QFu!t{?YN5DKZ{4nfOm7vrz znafqSNM?noOk9dC+PkV9Gc+QCubr1ayY87 zV#N$l`zbPFV~j-A&GPfLbF~ZIw|C##U1&R)Z#%g1{94sDD?EthElTc@O2qLzO`(doa6(lf&#S92X-D6wExr8J40MB@oUifp~^Qscnd!&QE4ByR*Q`VuRn*rlo`knUcD6u zpq~a?&)`B>bqC{XUl>??>h8eZ-Vb{}I`C1~ z>fq47dcRR@9sC_~J#+}oc)scRoydq=?sx548Yt{OoZo#IgJ&1>M=##%`eyFfZ^Y1p zuATJ%zkct7W>ho$L)+KuTm*co=fCsg@8pjEHqf>(@b=3yCrS<`xb1oIui`nBAhuZ_0Op@S9D?MoXU&ZjhkKCBO3qJ(|eh920vnx^zw*0=XN$7pxZ7&J% 
zUjo$`F4Z9AW;#NP-M5}BdGJYOeE!dov$V?z0Wa6M0J${&es($g1(LdM#c|^B zvyuh3D^T>Og>UvE#MF9fje^Lcb_dE8l1lKyiE5GMzy&vcSlMWr^t^4!~7#CNYy5Y1jhz_fp3K+~baUD)z zCoqPgd>K>lo73-p8i>t3*GFEe;7m1F89L|4P3V=(LS|yc92SHjQ*c!H6cx^*Zd2)B z!mz1+jhx+((bK(xxFs)cSxBylyU~L3-aI{XdiK>f&gIw#mglF{==Z{u4JW9Uc>S2g z*OXX^yUf7}6Flk39s=(ewbsiy%3hIKhnKAYbiw+gt~EO9f}Oc4S?5Zm`4Oh8<%^iJ zg7;?MjAS|J{|V?HKG}55v}?LH>&glfECEo5vFgg)CI$61W&i=sIUHF5ck8q}>$Z#j zO?yCDc(QJ|zEx{_cPzs)=@6(4t8Im);g)#SiWizK%DtE)!}TuTbHn+Dxl~ZJmt8W%3gkNUMRXSKtZ~f}dATvyOc&7P zZ7c{v7xAlR-%2zU8yiccFKZr%>c!w%0v*1N-c2;d#~TVT8_g`#XN<>$(12dy(LBam zP(BwNI1wqDkTij^&}k{C)nM0HiEbZF#DRqp4_dk){?`(o6^fyrLa09<>YuIqt#`|Uy!i6Uv-iA*f9nmDe9XXEI4x&d zyA9s^p`A;@(rfpgxv&%+6)zv{kFzkJ#g+Shb z@9}5@0YM7TeS8dpz6;3t*O(1pb_lY{qknY62xcE*1!}$xWfq^p$Z_^s$8MwzA$8fU}iCu~ld^7zV+C?vdMbc^AeD~ws zLX3jw*McL|Ue`m^URouAVNVj5##Oj3g-TQlRu@Oc)dbw#L3O9~(8?6OO>ACO)cb^a zzT-4L=(*d;WriC6;LJyqU zpH!Nb8GRjh;>UD71yg`_Pm{lgM^s)4s-cpDW!cX~hVA);5kF=E|IYM(%pCcc34hEy z^)WN}1;?}8?-!E$+20*S$tPctr=UeAm*BKz!dpp?RESaC2n_OprTz~oyyJa>o Q58+*my0pH5Anpy2`{ZC7Ube!3? 
z)Be759{{%M4l9Q%ssDE3Z!U1$U(tWtY>k3(vUKSjVx|Rn!+YLHzrLfbJ)z%Oi4@18n&{y zIcZDT!}gRT>|pnnWKGH$cCxrN=}Ni7ZWgyCYg3-ECsh}&WB2xCeX1eckn)DT?B0>| zrTk$(i`OI@Q%&I}7I!8CspfEVswLc#Y7MutG*>d1Y74iqxI5XN>IiqRcx|#X)fMhy zaZj>4)f4VX?FjE+_jSooYG-(7syE!r?(37gQhnh*#2aGXWPfTPJdoNQ-p%fP$;VQA z!h2GK;lb2Ucqp|uyf-x*9!~8G?@R3u@7Hiz?yFqP{{|OpjGHHTWs=Lk@9Ly@_;HrA z2{{A5CFc_?XESoP{Fa;t<8GF>6?ubSocCS1OW{MTL>o%9S1%DCi0^*aAou$44MV77 z=IE*Pc$|-?$KrjH>Ew;z(Wy9}oz4VPld*U*I5wG@ie?j+lJVdxiR_i&$?40N6Y0w% z(J>^%6PK@KGyR`ZeIbo#K9!C0QDjBjn8+j(V-(XJP2U*BuQr`V=#Z;FHklqzTn=eP z%h7B$p3WvF(|D#tFWi`-qIDja3#U$;y@-hEnRCyMPV%XcPOLjNnVe3g&&vhQCeraJFS?&T^W4^?XC`0SoP>&; zXD2TwvYGSzH*;;|Us3Ub!R;^WcjWHutboJevqiU}0`spySM@dzJJMW-THuSGIr(PTW5j;3gk zo>OVOxKuorh-Tv#*ra4W)zCZDiuPxvsm`38eCDWVRqoE851B;w(dq2u^U@+XE45Q| z{`|Q_CWF45jov`Jp7`}FA05j^l5`tMq+{{xqBoJDH<6MuvYAiC(F3t36}=jd#8Xq* z8^~Jjll-PKk#STt$=?udEF-HkBig00P4V&Z#Pu<0ZA+nBzFB_`k#B+^#<{QtL{V$u zq<=AcOdHq5bokT9^)W--5YxZ!CkiiF7`mo#xYH=8ExI5GM+>{$~ijtGUHx)rWKD9QPgUbA>W!xU9VE zIWDKkX>*30K4<(sIKmIK?`ai^kmDjUMdP?E)y^*D*0B60Rzimo`ge`;GcHoDR~Iu} z-N$jfGwYCxVj^=ov|{>!?p<0?xW1vuI;B{-cW74~*9|UY9-T4wPftar_{oVo2GN>G zPfcf~8MRGLXO$bi5B2c7DCkEpW9^SkzLI98LPma&(uXK0XB$xRiiYcncyfBicQGxK zl6WkLSCY8K_Tb~enfCr5QJvs*+W)~sCRollKFJ5uldtrLETRFaShwZ~mTMv#iSV^( zK{UyI=j-UsqV@_DDVm&)pWyjPo_EnRBW;cGG-}dk;>mG7L>=5oK`#YtY^?htKIC=c4vcYS)_f&Ewa#Rrm!O9N9kvR5Y4 z1J^(j2PA?QzaAgM{^h0H%s^D4`~6clc)Ubz##3I_{mS@G<3Z+41iyKm+oI0qI_(owqoAp zw5!OFp_lSHS8{sZ%FX2#j(+>KIelJ}f2?G8y>({p%!20Iqj?QIeRb~Dh3Kj?C_i?( z@@D=B%7*l#BO-SpWToXwls^&?tr6mW(_jS=(H@C_9wp@`I*=|=m&x*c1NBaSX>u~j z)&+l@?j7oEId5obNN5;Xv(c@cb9&Ys6szIfb!$$Fxj29G z8dbDb%UNqa^C0%?uW0lJHybnS!5A7>uVy*6-iXdtvILToV>l3JP@ zw}3Smz#5DaYrwBH4weu%$1QQ|xYm!XF)d>uc06;$ZSq}B+#Yk{&lPvbDekx>R*OGR zOgkQ6?4=Iur6yJ%cTNP$Eh@2C!#3%j>gnUgnD-5R*v9A9B|+w&np_ z^NTKd9jb!Yr>-838~v0+=SKJw=$B}eSQjH(q9r|@id=zo0**{kNEy9AnPD=bFIuA6Y&w#K zyvXlHm&R(8`NFRqe~b*A#-`oEAM3w>KiU<-Fk1uOSCuC!e0A2D(;~~ zM03s=@XMIyDt1Lidm21&+k7BJs88mui0g5uXmv!$D#s@`l5<{t5u2@UyQdXxH?X#y 
z)pJ(lV|~mj;uBVM^%58t`una-deKu=X2{knGmwMM8RHa9Y1WlIjv5H12A1 zM=ek()NWJqZmJ!-RjD0$=$*3*wW)C>jkQ7aY|Ax+_lL5o)Pl0a$hTZb-M*~)UaIGc z8D>CD(1t4eK%(LzBm}T4XTqI2-`OrToo2pDt4OW3&Z@5uOU|6LLkcrOax=XvuO_tT zQCi88+`+DG>#foUHf!3PRj%EP+AS55WO)T}6C}fIxgb*-bLRJ~%DaO`&ds_vy;)m% z-CWnq*76Ox+FT8zuy)!>v!0xXx92nyBphwIX5BeQmIwm7CiW#^M-{74Dl+sxe8rX$es?0vd zO`^n>>pf@1ZfsQ^{(q<|sMH1d+%?M6mgk9=ie@76kys)n)=x$GY$BS3(h3RmhODez zmI^-j+22|EUj|Pgp+01iSq3Z1^de^#g_4mn52-CDD4e81lBZWbW9paq0#C0uq~)K( z4`UsoO;R_BEo4MDkO@+7UoD6>SDqacp@D<=n>>Png^Z;4_I?56G;Q&h+ufsFN%xCtL!(_zyw$R87j#}%Erg0k9CyI zJ?rf*%EFYE7bwOimnM!)p=gnTfY^1>l$~NjmS%%QME(*2=zE!nJO>>ST(Gf2-i4sxKK7&n#l?{s!txuu^@g>sd0c)(zw>rKaw@{+`))zjk=F zc7NVfYTZHi0Z7(GXNTbISiCGa``Qg6-PoL!6w-kt2eu@gVeQqN(ksPiH8KB#r5E_E|i3IvOR z-9lhDMEqjwu+TbOsS|#!;OW_Ln*F5gIiNy9A*4?C3f|tO*!!tFDZ%>$QxOBI8u@&| z(*-rsOGRuD_$`j2xmhqbm+ZBRft9msI?mFv!D%g4%wuCqzUF>o+hX@p-%8zTOFfbLZ&i?A%%Q>URqMozzwQ?;0#Mv`RYd#e%0}!(*+d zK3g_?oVR5m_Dp}%T4G}-j|dNymA zHO@l5nUJ}h^h64Ch6z=wE2k^c^@J*sm7m2(cCIPkD>0=-Wtq~x6i0ysg!(v~Y{ zR@!8>S+Eia#?*~|`hRf&4c38|ha7wo<$!NZB@ejYKnimSrllB>rV8nnDn=N^Il?CCpGZ9oQOwjAELXMa@-Fb4|1{nlAx; zqEY(I5X&l)e2Ll<5;5cZd(>VAP441p<^h7&IRFW@4Y!(aHZRPsy8Gsgd0qbLQf>XM z(9O`og?E1M_U|oqzQ60vt`+^y>_4>^0w+G&Gap*59i1}~z><%u5X(FE+xG9*6n(vd zuXpJy%le-|gyzvuZyL6h?t$zHEM=Hm}XSws2#qZRzP%=YWjd zfSe|7CP4llTa2zY?8w_nb-r6?Zk}1Rt=8?Dv*eBQzz&~#$eJHoi&UctW?Z2ng3%~s-sjXkCI?qd5vq5UA3eG|Cg zcbpOkS;vo{EBsd}I7tDmRnc_(1QSnrQciiACDAYr(^Xc~#}Z>%{xl^M$imP_{tU%v zVKDJqG%z!uXp~G>44~v`a`>|poI_BFOg=@4ET9J$u^R}Ptl^s5OZ|IFJ-wwp!vs+V z+Dg3x1XTLFi~fGW-~XujC5>T3^T5@-W?(T$bPfJBlN5&t=UuZ%aln?1Yc?qk;h}NO zA;oJrZ_~oyqHbY730*=%@0wG3;Nsf5*4$FOR>eF$QbHY}p7m0^foo`7^Gb0D743`1 zg@zq#ekq}mK+z^C9-!_>9bRi8V6;_Ae^qNYG(7P5ZXLXNaLs@?2H|VImAjc+Gtr%y zvpU}j%mvmhbQka#tZcmaMaHeYH&HThcp0O$(5)4XeQrZc!e42g$MC0K3b$XshonmU zb&EkxFB{_Ys!7fOJkfs;bF!NeQx~b*)@ZL>T2hj&KWPY1;}}fN4 zLWSPSqnNd(T@^X1EUZ~;B|rskQ*Hh=)!>>G0a?V0dpRUPG_RY3!;ZvJ>oYa*T~8%89!i@Cc`d4PiY9%zcYMd zP9>*UPiFM}h(c}%(YSU}&`ZHC3i>GMry!0XR9^vAvFumq(f1LErYq45`o{A=pd{kO 
z{2x*hha+fv+!uK%!7LyrSRG zX+y*q-QJ>WpWxaDgu>x3_@5B$2RDpd$Ihkh!k#l9U07{@reJPG3h;`CLg>KC-c|q6 zUvb(->o1;1Rr4bYC)Rb^5sfupL(I1;Z+i#l1u^bEh6sbdTWMlRx#)A1ZnLC10TE+adUN0HG*0>=qh!-}eVgu7-lI zZ|Pcb;FK_M3e7cATfTM@;yRz8(A`RK3?j^=Pquo192>1J$y~qD&egS)Yi{W*whRj` z!zFLCBuAp)eYbC+VX<-P;;I|?2>AtEm_Gu0e;r{X<_#Cu*}F7c>^&s(9{O;&c0K!&fE|g9&?+E<_ZX zan!qr03Ol3UYFC(!f01b56Y=9$*eXF6Jhl{Ih{hcm$XoSsY?2Hq?NB6YIC~uf96Nz865ux^1csJhS`8eVJNKY_XR-UR(0zDi z_M*$fRgq4>*I=R|@Q82gW9rM>I ztNFHhv3b=u0CmL~Sh!Mfb{EXuk`AF89cA`s5Y#Qo1XCY~5=eRiQycct?e{3Sg#gOJ z^b{1fdOw?qI1STDPG`6~qyzZ6^!jbZ6!ur0=@9Yq3tK?OL-?5=3ifewulITzu!n z+bKftT26t-g;c)V%fhQkAV+(^WRa2+>QlZpH& zK5k?pxABddum!FU)@rf~+##x_kLzO=h{tvd7jti0*2)|vY}IRs+2JxF<#241LwdSR z58@80Jzi72Y}{ErosFBhb5QAVYpmuCL)Zlu4mWBtR4*eb1XL2UQ5!d~I`lCY#_wV3 zK^^vMt!S6`*g0~rWefR^mT_y*S~gpWHgeUfbamkWB`O}ni)0s(8(Qu3yH4SEHQ&^I zQ~yoFH;tS80D-%1_5-XgpjE?nzhyw)D#HG^^hmd|UTg}!Wt=0MHo+9^ngdQ*_KcV_ zR6}i%gGRxwvcJK*a!NJbE)zses(>eF#1DV4O8F{6HoqfF?}c3;@Tx+fq;VYTZ!F&_ zu{k41In6dM^ja;Kc4k#OtJ3ev=h-%?46Cuj67&R5TwlE_m?Z+WFm6}8J-`o6|#So}rlpU6CMj0{~p(@t* zGXR}&@=$YG!UtqYbVczhwpaAfmXJm^U+;O|gnr>HV-bozRv(hw$<%t(FY9yG6j|b%+TzA&=@Q>_6m)?NOV55dB1(QXzRIa>tTX$_g#DUdk5b= zT@@q6~>`}>cv zDmn#w=dZN}M-8MAZ!@HEW?pq$4p;^AUZ(0SbiUoc)cIa7th06>HQ#;L)_vdCx@as0 z_Xxo~E5_pfGs6Be_k3sb$G-zbqkGl2BY*tOQxAJ~z4z>L$I=Twd~LM{c4M*(4?I_L z1QsR=_K+gvL&@091o&NSZHpGky*m-hNN|`6Ug0GW4Wz$ydc)h=HdRO7XF2{TGGdqf z6hGjqOzQnP;u)${akX_yi&brRMo|zMNABP9^y~C~iG(oc+oCflID?D+RcCL(+{;EV z%H(RXZj-Y=Py-z9fZ>+4l9BdMW~@^FSK0S}M+;!u2&GFQjn(Wgyg#S^x-O@mHKg5O zUzPRAr7QfE4REP4rfZ<{NKS_^&C-^I@e|S?iy1F-mDt>uu-Mq;W-5NFvr&avDz=>C z+)G0+2;)V}0@DMw#Z{9xuaP68sj6uf=1tN*l=|p+7Q@GouC%2XNZ%aRSDR0jS(ZDW zsmZGBig34>?rl{Ed=`wistJ`^l$WX8qO}5rGbufc>8~CKt1K5mVw1RGH z%W>vX6{_Kh6!ZUD`Cxux&%CQ42&c*^K8b3+@OidRiQK1Z6b@Z zviJt=8XO74t^ukd(Gf<*oPmOKM=98M)A1nKSqu&d!J*}^tp<JdP1d>7`D63-P^w&>`B<@GkAVMh)^W8JT{{HVj-}=mpAb5F&-Im$yL*>T zEzf=kP4y|JsovhdWQ11m2Gt`U&0IvhdJf`FR2K=7Rd zw+}47w0v^4=@3prKzV)np{=Ii46fSR7LVSwb(K89LOTrxLxLh(Fb5>6XOa)kZ92E0 
zQiN}S=-uKj!*cs0oJ%NE2CPRShg&X%UMTC1p&L~wgOU$Bu}|KKpd&f`geroUD~7E= zrbn6DZE~nbdNP_fLs;XWNJ)@sB#q>En_Pgdy zdknp}20#MGg`^b1UaF`VU%yg4y_%y)POppr&ZB1j4B_dF=2^-IkaEyxvnqMYej=K`sM}n*9cl|!wDW7+4)BpCq(dg1WJfD|--mMYanV=_H7<_eBeMIcuaOI7%m^eZWAc4|RBD4Us4+(soKu=3Fz;$@IFz{`H6HMCm`<`#(7 zu!h4MVz{_aOzxht{ z^0i-v4wpK2&f6A_-*vUE8Mx-haGI^7`;WTvj{6-u1$!$B_LZESOM1b%tK{;PoZU;E zg0sJ{=hR0RggsBA9k-u3q>?OJK%SQNV#|QgGEi(eBD5U&P`}!8qTo6)Z$*I2@O?Zj z_y*<;_k%r4hNai;J$Z38_}skXLD!yQ*RaqvyxO&Y<>=k6LpRTTc&PC7Gs3ZFVX|ng>?!4#fW1VVP=$w1)p?CLk-SR(P^&XqomptD2gQdE_ zf7^RTa6L9}T{yGkD|uSrHR{=kWO5>fqru{t$h z9sh0r&8I$Nxh`omhFweb?+5M#);L60x_-9nr@PiE3SZm)J&$Z1hGrS4UhEV+-D?IU zU{%%mnG=bL?#x{4p5^00%l*L%dE@ml7X#;g+vJd* zZqs_aUIx$o=z9acqEWSv+x1;)86#-I2)y{Rh7Yp{V7=P-E1-90+#hEm8A>*zWFv## zo50RnMdfVm$RvM^S?$lBV~(!)ssj!@(9sk+@F4w4Ok8wHFaZMv=oAUglgMY#%So`n z{S>c4Gy$4=m7ttD(O!-# zF-3f?Y`@~{9_UJi3zRY2qm)A*tBu}>(|K40bfHEStN_a1E2VDwVnTVu)%&|kDO3JM zt*S)?h`D)GG%&FEj&92mWeXUqn(AN+7bmLdvnA4ZLoQ1-t{nl;CLkX2mf>gqUr~{N zO#v(X3}T{9@=0ZF@kG7&M-7XHU>gVpBQ0#OcF@z>6wdl z37-;CKBXez&L860HdrrpM(Wl&#VWC0k_#)bUd0ezK`B%a|2M>P@!2GT>^HlNd=cU+ z!MqRZ{l)q&p}q@V756>BRF_Pvp4}yH8_t?Ebgy&z26BtAl3Pa^d&QR$2;e30s}kwG z@9kI|Svs}q9o`^Q0U+43;Iqi-4Bd5xaQx<--0fVkb)V3>4-UK5pIVFipAz;zwc2`q zUjI++59MPrp4G-f`7`%BA>zqxEH%N&qKje3ZIDa@J6D@}S>;B~;ePA%-07lySg;R6 zMzlD7X_jDI22Axc1b1{3ra=pu5jcA9fN<&f24Y*U2&PlIZpa|6R$3&4aA74f&4R$M zO-ODN6qm?W@JhA{%=F8+ROKuDg8Z%MDN_}I^Y#tq^>10{tiLqxWU_@yvZ;$ibl8ZV zL!=V7#mMCCOWBmGnvbADIA-CMNuHT>$Qi%d8DJjcI7Xf) zr$e-oaYeG4Y?_sJQsUy#%gOlQwv)5B%H;g^V(m0XM2ueN?$`Sk>fUL*-B|R~5vbMr z-n_Nsa?d}$=)QSm@$|C$hoc4OV8J{nO`uGvQIz9|`>+WlLcM7M>B}^_vYizZp&jLa zg2XOZ2P ze~uiRRx@>*b$ve~%pif^tyeXSu8hhnMhba-{{p4?|Ae6O6*A>uroGA-wiqXN(>Msg zmc9%!GdF|HFE~R>p;hNl!8{~M#y6;qKcV1H5o|I-7?HpR)Q`ya78ZIh0Mz8S2q^+k zwqk1`TCN;mpb6W8Gb;v3`5+H9SfPY7jc2B@%1C7(d5}ifi9*cl$QQUZb91I-ub;nB zXdhm20FA}xW$aK#T)MBeLxvtL|RBH*|n5_-_Hjmj1sJm#N0*B3Zo%D+ zbDFsy=HO)IJ_cK&dh&hgAj?C2reaeYG&K1vK0|(KpWJMjqzPZWDk-|aoUaQgPa!ubmu23@T+Z^XP3qih$P z?Th;=jCmk9qoKeSm0p#!KWJG^CX$oK&ojaFHZGi~*zoy~S 
zcHsYpf_)U+q2Lq+zoFp&P(bvo?7&um8yu$x_>|+9LkvEp+!Xu|l*o{M28cnaWyjr} zCkxG|3i?J&iNCkx>3(F;`HcmAJ?`6k7i|SSY40+w(52!E6q9fTifzUfC}kA5!q9Tp zif-lE74ruJge?qx7$aO^Xw9g_qZ2C~E18uYA3U|r;T9f+tcgb!Go0({Th|Qu#kK}; zVWJp53Zw!T>y&4$PGi_x#x3@iaf`jE>)?}`Qr}~bY*s@+b+SXe0=A&L3fO{fx4;&Z zv>CQocTk$ofFm^C06R2eKpd4y*v5ihumsjklvHCd?Aivrz!L?#Kvbew^jbnkL{lUZ zn;gTpn4Dvi_!c3)M>d(x^hYm^F-QoOa##Vg_z+Wu*#bo)Fea=o>0s3N==IWS67Bfn zUOEHv3A~m@TF&uP(i4yj1-bOuOzH9yR$ARqBF;X%`7b_SeoGWn&1TiLQ0OX}8cClwn zLx?C}CQ~|7{xF;L)wIqp$jihb<~QP|YTLdjEBkOG`(zvY#Der;h2AeJ!3cPT!ol>A zo-ZiDzVIi1*-`qE))&=ZHUz{O<=~04i-TWKzvRP15IFmGrAm8vrcn_>CH7S-E={yV zY5Qf=d^(%J%3@eQL--x6U?%?ZG<_VDL6!6^KCD4}hB2PuNqXanONj=lXpj=xC}^jk zgMv;9x+v(TfKj>~6nlz-+Z2#s#|RVu0mY6`K#UsBR1)Pp%m+cO@Sn?kj9^Wt(P$nG zahj8wzvenW;R2s<4S&ViKjG>=;ky5p8~lVj_X&6CFS&ugVhKH)|_;hy|ltJ7#dHzWAW%xU`nmfQWg){GPm!6OPb8oD*^`Py6loBj@k#k0#hKjY}04gLQCsPAbm literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/arcee.cpython-312.pyc b/model_executor/models/__pycache__/arcee.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0773a56b206a34d7451a65781f58b52d84af885 GIT binary patch literal 15832 zcmb_@X>eOtmfm~V_l4jBk|6oG6I^9#w!5dq!m zj;jo|)iE`b9#IwdnA4M~P$Nw>rz%OSD@nCdm8vMoA3TvAz%O(~RijjD{!Bq8J)WB6 zN6xu#0Z4;(|Hzeiargb6^PO|g`)^LCjRF@ld=hS7Pp7&By5Q} z;?9thgsm}G+#Pb0ur20^dqZ9lw#TaCzK}0o9jYdAN314Z8>%H?XRI#n5BcNup?VT` z#Tw#`p~iS16d-YTtSR0cYKE|f^~PG_Oo)lMhFVFyD%KWn54FcTLLKqWP-na=)D`az zb<>oNdV^wpA5m;IXP!QsY&M=aAK@dh7#AClCb$SMc`qbC*q#Kn+~dis z(UfqWPfl`zkmRAZODlBl6c=Ho+P9OFkt=HA$@mqHWuu9!P_l;QrXsVkRQLlfdi7c= z97{%6j+g4=k+~~enCIe=neg?SVPP^7aK0&M z&v8*+1YtaRVMMa2vGeDHM#(!en@YYz`hjfHkL&#Tb5TKnv5ZINpx?TvfGZW3)50?_ z{Wu4mN!{_tbuP@sXHs+FXj%8+l-z%KDl!S{J}22p!5FqCOv*be`(7ooc?SX?!)4^C z5Y18{9ZQGwR!aWIx>!ADU=8qRPDnOO^@SmD>k>ZUwdYKmsp}x`jmlneq;#Qv(zm*R&ku$fn6PqmYz(-bM3T7*CkYBf1SXnb#>XS)MuLMAlKongWw}IH zh^9HoyKVJIl1<5n1|ia3vZMB-0GfO+{}>E;z5VlXxlwI5CjUbE#QA zF=;8AyA{HCxi|kSct5A_Q7J8R!IGhV2A55}o-~zGt{z2Y=!`C7%osAJe*q-&7rI9} 
z^;%}AutF>-Dut~R2c>CJ-b`xfp@!k1Ny($arFQkK@%llE;@v5yQk7y&89nr3{)PS_ z9$Iiep;K--R2m)htIcbU3R))8mciMX@C=`vhNEnYCT3<+@`~D%vne&gw?R96D|+qV zrEP<3@`Hq2%fxqMdJlS~VjXH($;flD+2OSRazY^_99y=>uQKV5LFUawY>w$B1-qH3 zz?4doLo#-5Fld#GkP27E5{8=+O@+g}1NxB6%IJAq2+69AOR~KkiOq5+c|OVW$jKxV z9E9*x0$MT%Tx^PO#v!(#$Dl`MM<#$%m;xj`t z(V2l%gaVyABrbgE*9o)ag#tLcsVKzMet$_ z&dl+B&}`bP-jhLfZm&SH@F{qIf1BE>rmU_7vuN?JTN<`Zl)t}F*H-Wc3*MSSeP^LI zu=wuMyE$LimfhsHWes;6TOO*RuTa-h@V6^v8iR7NP|nx0iXAe2eO8r**9TGltkZ})L}qzc#!dL z^A>pbs0=l&96%+X{A6hLSiT2!=jnMJl*3v(T&dhV4JEWAUn*8Ap|wO`q#n={K|S9O z#rPrg2Ec=aJs92xo<+e%AI7*@p+;$sw)#rS+mJ25?V@f|1-I|h%tB^qDh&|OY|obSsHUcT=;M)vgT%#Y_}(C;rBw_pkZQ6qAmrUcwmvkgu{|8 z9ERgC3oIurIl^I}NiikI91gR|Nl4ZM9h3Azisy0WK&g_+7&#=8k?>#1By%SI5N11- z^Et`IV&U)unkVN5SN}MfB;N!<;^A@#3*i0zN7PFlZDe5gJ9-K=4TV~Np{4z$x7|4S zyvAQN!f(+;1zL(`3|S~hvtkGiLBrDM-RYtoV-CvgD>^acqP*2bH-B@HWeImDau%06tbZ-M&k@$}xATSGm@dt{!@v0&cpEH@W~d zAPE@dCV@Wi1SO=$FY%Ft0EaNn@eC{HFjr#9$?GLv+0T3sO#won7M8x?{7}#X_{qjKqxyJ_ZcQ85oP9~S3rw!U#Y5`y` z8f8eSQUb6+mS%M-I-WOf%h9uxmSf78ejM1AYbfRNZs=2EURsw_#>5(qKy+I@06AK# z=509uc+{L*v;Y#uwp?o|cixh*%v)3Hfy7zZ9xAir9WWN;KRevKg|)Mem^p5lx1qd{ zu>nJ;Dy|e==}>MPixc{&aB$Vp15hw!%o%;glCc7CFaYy+{X(~`1vl$aOF!~HRGRpgOe!%+K6*8p zh{VG2$gS{9Qi$S(1VpdMB7#)6OGcD}SxGMK5{D6#ju8kYS#E%Ef^inG24pT=jv+hAq7= zXak1tt;>53ik^c-J>{&=)xRb>jsWOv?_KH24WE5_X|3%-&eHOto>{KV1rI;ozg9o; z6{Tyi{ik=Jp~bPKlbd?o7;VeC07cYwX6+mHs>K6Kudds>3Qg^whwq286JMKjR@+Mp z)zDM$)ffDMyuU~E_W(l8*X|K(_iWTN1y60xKd^E$KXh6gIt?v0U~~R9#Je4@q3>-$ zbgRN&#I+l>O;BygzWk4%e4T5nrAmgmTRl{DpwvrqN51)h*nFT+*Oag85dkIFcW!w7 zOSQ`lE0@>2du5<}Y4I4~0Z8>?>;PcAAWo@f9!&I?817>MO@*)M$Zt; zJMU-~9nH&Q`Hq8P$3e1^S;Oxvodn}ZR+TIWl2esg;h1xRWC58o$^s0WET?-ICVcoA z1OQL#Z&zn@^Ln7ru;DvhMyKNHd0nDrw=x;MN-YUymRrLIi| zO{MlUeqU;|d~{fAN(YJo-9%8wPeLJ{1utme@ig;4K<^5AVL6pRcdZ!DL8^dDgI7hq zC5VC3_20fF3XcCXtdR!qHU%)%QJu3l7V0{(rwRckAJ`)X_N+Fo1&%G8gQL~mm+yL2 z?0WU_(WgCYT^B`1d-mMYAviUkyY9PI>z@uRxz_yeW}VO5y7Fzu#I|EkoKJ79wY@7^ zTC&c?TUgL?-?H4a<{!#B3+}+uwVb;vXX%nf0sX`Te;CFA6!=y6;a@`!Aq#&LJzO^e 
z_xWQOBIIxcLw^7s;QrZ}7)S6ve*#lS!4vQ#C>VbX6VZDgGyWVr1>^ewrUS-r?*@$j z(teya+MatFi$)CK<&@#PnM5sW)Jmc@1ofl_Gyz!NNm563USs!jpa0&GyGM#f2;-e> zx|g||DVi~6p&+mSZvS$%Sk+myVv>!rxwErGgu49sd-vbVxr3s$_ZvIp$@m|~jeD#) zRb#RFQ<&mT{Lkt@gQfpW2bwG+XtGS8{`6=ywE!Y+7UX~_lOhGULROFhY}2(`5iJ0^ zu!`xR&9VVvcUURbyK7xD(V^KZw*dMxCo7k5?ovX|-DL!v6I*l=BlCg3fSw@jpjjQ5}qzwb>f%*H0Hm@2kZIxt`ERD=6aH@gEnugg(;w?UHhgUS*tAV$Xa4=9nQ2A!2CZrC zr$#`j6{L`#8X(=K71IJgH7%F`p=wlg!3Z&}-38;chJ&>#pg{@aEQm{K*!H24Qb{Hd z7-*@2vX(zmp?-yH!1ttZ*NFojQejNceg(|GU;=J|s|^lDt<_7oOGsQDcX@uQRjxx( z&~BR=F8yW=Sz}h8HNiF1ePqe#K(>*Up-Nxx!7sF@VRZ0BY?)Qu#wl(oh6E)A=kkppwNiKgx4V;7n86JJd>d7y#FfTvO-f3 z|JOBrZ8Zj5gioZ+gXH=PpyO(XqWrt)vFIT#2nI@$gyfKA1V#H{y*8z)K1y@K2}rh^ z_Y*l0cAiVEEnOC`N)!sgOE9RBfAa{ zH$2MBJZ_BSC32ShP$ADivIG5t=QI+NtXzUsf+jhiNF)3Yv9bZlAb%Bub&=Uy;d1aO zR}y%Z0DPgi_OQgCQw}G>xRU-ArL>xf`J~y()9qn0dLz24NShFPi*Le zh#N!?hyRn;^Y-p%_HKf-yPi3^9vyl3dcJS$S>M(TbNh=+qqvP*%MyCq^PF)jd(BH}VI=8&m!#2^f zpU6&29iI)ZbUfwd{VV86*2Q1^7X+Td0$~=BppN( zo_8~%n^~@3bNA&eeKLNVgiNV=+v6KZ$oO;$*9jk!1o`#|+cb^H-GG8qn?*P^z?s39 zok|g>al|_u@6Q;140^7F2RM|*PfLYL?*j8imYO#ud>}cQ&%nc*+Uif}jGiUnXx<1R zJrdn{(^YD|OhZ`q7)Mp&=g@ZryWw5}RwEVU}Q%Cvyc5^MNOlPNFaYmUa7 zN_9w&${r~_+EN-P%9zy=!ZYP4P!t1{H>2#AF#?w{O{7Oompyi0k`D|MpA}Nf6^?<& z$eU3Xo(VyY108uym$?W%=LHxq{}b?#%FcpP9b|6a zfQh(>{4Mn6(93`a&x12FK#ciWOu@zH{|P4RV_YI=FWn_saQ&y4{}FmDddPz$?@kge zz-tbBOEMC=&f|lUUrWjV~rr(4Ir!VR;Q6c5WA{dbH4tUddl1U zyrw>16BKKLE5WszeMK|nYyhdtA6Pi`+}EA+?JZI|r>o$(2sFp(!nlcQdS$gm42)#Y z73%s{&WLpfv!|Z#8O~YiHyYYEK#8;O-fMSXTQ=mGJtDJbwd2==e>J$~82LNBuFzj|t| z@hCigf}DM0!|uwtnKgUs^2jrLXTith+V;Yna*n+qbLT7pnVgzsqmS*xR3kZ~KzQy^ zSAi`25*{H-qz4WR$iny#R7v=mx2o%s zajEOO>&j?51}0{5|FnV1*m>O|eW&?*$8J)_j-~hSGH%Q}Q5y62p9FRE>db$iu|prT zFm3~oaeMkzGD(B)K6DeQ_%sK$JjsN>q>{|ktN;`p-!(8}FsKje z|G_$_#sS~oAQ=*U177X_8EhaVU00H>my+b_5`8o666`-47Nr}vgF)m9Y%NuSec+r*jU;)i0>Vjf-cFxM!|x}gD>@L$#e^dyTGN0 zuqC9?)CAIkwjGxE*A?ev@T+&BamPc7tGV|O_EHcSS7GO03KIQUw!}vc0FX)!J z%pELPt0+n24h48!B5NzziS3vWCS!6Evo>W=0*DJRL`hAQ=O)?6h=>J(WF(bIXOfc` 
z!Og&wNG%g~z_F5z6p_>H%JPr`7U3G0ZiNA(BK1%inDQ$lO9u`|vA7e`b!7~#Jc9v- zCj2eoYF`-)ZMK}H89;hn&r6aS@(Eyv@MQyvT2aqUV)G+tS$;f5F$h%!h2xAdvv*O-4|S8ANPraCksuji(_9@*T3-fuIyW_UiTePQ*b_`qCdE3EHIsUX0OQX zU37loX~L1No?7=DEseEf-P2vDZ(1~c;i)NhQwPTBm%Wsy_6xFjpUyALFTJzw?p*0u zy)O2SK*9Ek76Qvzs3v-jv^||4YaWno(bye z3tb1lvhM>OIlP5c7p)kwQ8o2TRiD@2ufO|dk>ts0amTp8p zdW+Zq>fzm=1{MN2f1haWTj5qu{(Ahc#-DUNnf=e1|CkZ~;8JeiC2{bwXuZ7Y#PV;` zZ_vi3pu3;lCXEhJM(os4D*oHNJMuRTh2-Y{2DFe~FTR(DzOlW_~R)eigMNK_x-z6PvPgx7m zt~+-rA?JeqY9$ehvzBSUZJ_je&5BgwXDV@du*WLAj8#?PhC60*t=ah(WAlOzX}>xrwg zk*gd-v|qr6`@vW?#a%_A2Y56Z4c#0cKW9UV&Ci0_P@Ef-?F@+Rr(QBC4A`V_U6age zi4LAm!kLg2Z=Db-Dcn&6M5Q-gyUlkYw!>@)aMgC(*vcjzmGCe;c*2us*Z?gPSB7Wc zkrEWbvOTRLghELTQ-9m)wbCy056f_=zLN4hqp$R01T#SO-VRqL{DNYEHK=%yjMNCG zFQzifUrkL8J)qsg4#)2UNdR@VJKS{Zjs%E8N%fYNWlp!1MU~%rI)+ zkivZ?4#rqPX^=5IvZ}ZdTGS9eD8GYhO;tWB?{Sy$_NsMCzfeoV{Gi&lO{7h-xd%KR ze+b3Mag-dO&qOm8*?I z8A#x7p!WuPNEIa`xmB`S0b~>~yhab?Jz4!L_l2lWGAje%Qy8-o9kTpw08z{Fxb~8V zj0L_4P(N;vHOBLh2evch;ai)cJc@q}S+Zglk2!%|>(aE1Q^bwInm zVR5Q<#0764?+uFHVBWh|^zH@KtgL(m6=^NAa`wrACp}NQa>w6VuektfSlbJG9V%f_ z1qv$7Krd0YA^oND$le(T77L16 zckr1z_`Ie4^UVEBzU83Ua`3U?x3=Hd@`v6O552k8a(>b9&m9{Lt;=021CM=c4M($Q zH#z`zfr=MstGt!Tyor(b(ELCI5nI;qeFoUnfb?Wx;Zp@jNiA=}#v+WyM=S^{g* zenR!j_gB#o{V{HDA9O%$9Q=WM>AUGdZ43Ty-Ct-?!os##FIDs&i3MnKIc<2)Vz7+T{0EqHJp$*IOpC6j#m1?&vD z(J&^u@WZ0n6bGN`;AcPSmT%2?dx1a4iKBV*ZR)wVChzSMy?|E#Rpw7KtM9LQPr!k8 z*JPb!s@rCXoswB1TqLuE-<__KCa9!dF3k^jRtBR#hETbhousDa?_0{ZZHI#WFK`y9 zwvbKD(+g?18loGOdG38V%RYIlP$Jwm3Dka*Ng$Ca*PmyQAt+k`S(>Y4SIO&H`YN3O z%Uo7}m4**ZutjjVh0~}m+u_O=AZSeg<8t92u||>sD@hrCFg@^?z`acMv;HM+6WZ#4Ce}n% zwZm5cg}$L*z4qX>eBYU8eP@==+&_{#f2q*Wxzq%2PgS<8NuhIXqPuPRVA+caSU*h! 
zt13xAQtb?}i%`B1&}9BfRzb4xLnNO6fHVCGdboe&web4{pufa|=%GQGumu6dV#os| z!s>fgJzUi5noT)F0|+Ju4t+&2M%#)|#Gl8bU*k`4oTiQ(rwaqGytF%w0Y$ti8X;UJ z*38i}>4{IMkY4ssAJT>y;3%NUPutCEM5U{P|7 zUID_#%8$^UmCRh_m*K(Er*ZNZO&#A+mQ*3{$9C+k^}C9auQ18yRPt9;ecx4>kmoYw zLIBO)@2DX^v!h-KKlv;@rpp-0gQFX$RGrWZ<;%@MMHMxhio#x?W=+_Bd@Xiil&ca7f%sk4yER~_5xu`a5BUi{h906maz*$!hW4UI z_>|=cj0;o}uk-7W2KR>&{v~)tJx$YJ*HU!nmsH?Os`jrb$Cp&~msH39qKwyGKdH+Q_(rFvb>CRP``SX$ga4b_^Nr2~5emGmssY-Y2_Dys3xeB1zt2jpt~(f04g8cXw{Nf7SLi Jg>f>A{|AFc0;2!` literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/arctic.cpython-312.pyc b/model_executor/models/__pycache__/arctic.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7a3e4007bf322bcb90233721865fce39a3d4a14 GIT binary patch literal 25739 zcmc(HdvF^`df(vvCinmekOWEaB@%o}6r~j<>b0V@dU7Aq?yYfJ<0#Hhf&`G<0raB5 zJH^?{k>|ZEcPtVt1KW6&t-}m*`{hvG@Cx~%6MlY90{{{RlFg7`f>`t@0s$HM;K)zD4Tu+)S$1g?4rxMdKWHZXHBV*&2gdf%K~#I5seEPnOyB!A{=9QE}~Tulmm zRJbuMh^eIPRBlsK(^o)hd*#g8(J3*3%GaF{#ESxdWGa3&G5J!yrl(_*!k8#mynO1l zlBDuiglTl&SW4hkqF1J_Zze(k-qTYTW2xjz;?%g1L{A{8^5v^z=;fI)>gjyn8Rd!* zL6|-}EeQOvm-3zKD@r292>~w;4D8f#;?y*~jWVsoGslH7Uaom=Z`G+)4Q7z|LvcvE&ULdG*GyAxwK zF9^|TQJ9F`7*~dj5_zcr+28mfdgC=YRpm22`2yQ4o@7E`XY5qEj1$#H2@(v3WJ^=aXE(v{}6;FXt2 z9G6mim)nZ=GMixwTq$| z&)EXb*7vLm7jUM&WEa}(Q{#nS-gf0VjuUHBel?A^>r^a_s6nES5p*m>V1n=i!1lHzuxi312`Onuh}?wiHkl(2&W?U8IwLc+}i zUY*PfX@L)65X7!Ak+L^5)7%$&k!AxT3&oP50vX0~2vbB~xJdPiL4Z{pJJ?WteMn!*%6J7$H$V%Xf#QmVo14e{qN)bsl;^u zbZojeH3o5=>b-IGxuHE*o)h}xu?zjvH&d6UCi|~pQtDTx9N~sAeih+<$8zbj%D3#kKAn!ec8h=ONU=xH*pTn+wP}!&exZ7muKBU$sNpjD&9Fie|~Os z@ny-~n5(GC)wbtqf(x%No?AHh)MfQK!O#A*jH_xxo_9{qpDtttaeY{$Kf7??smE4D z^tSbK&Rv;x1tnK-$@<8(ZLKnxtL$3do$Vf$x`$W0_OEk>0|N*u1~yElsmUlS%kvFim2f z#$a+r3G`fzQ`{YmLdeIJCt?fL&4iJK5kEB*7X$1*nG)G2DW0ZVPyX{C7l3o3_YL9* zQj_!|4zae0J&3Y^8j<{21b_P$w{GX^g1Jz8ZeVw=sWn#<_}JTSX?;>2T({tN-OBm= z>o$sku5sN#F(>B_FAqvJ{p&8eb#t{1>mG`gaTUH#DEr@x8qAgk)>u{`b)9yp0&N2t 
z&ZfzfH$+V8bn4*vDs3_~vWXN^>g6(xix%OPr&ht9kJ$tVZx@{T3D`YS1<@r2?*fR zAi&honPDe=k7o#k_fF?QWhj=IjtdFED<-c)Qxhs!KQnbSbecd>AImn%grsasO)FT~ zO&k)*D4)6s$-P-(j%#xyTCO34#~5)X2*F;85K`KxOk1_lYD*h$7?RLWQySl*c}znT zmrzwF zHXse$fCmk`l4}OIfHz+0WO>w2n@csQWNCA%QH{R`&UAr$xe`O%g1Z7jG2QZ|+SO;5 zx2dthuiB%Kd61t`4S%35Iv;p4km^?RrVYR|mLjZ_e**kIT>RdQ_V<)3=1ew?2FskKLYPpPA7r=T9Y)p+5TCk=^E zLCF;}sL}%Rw3x@95<=X59v;vh+@oSEs6~=O&>)~l-Zz!Z2*+kDeT0pGo2ag0gaS>6 z5NQO4jfGTb?N(m628)nvVH1q(*=#9j6NiYyUJ6KpP_)>vs^KGH36%_IP_Z8 z*y5rj^~Ro8_j$RhC^?Fm3^t}KvIQoY=oOJZf4M3#CFHFG6Qc0uRoJ&~%5G@RTDt6v z!A_7&jluFIlBy-!@tCR7Oj#m!uv^I$WGhiEAIiMR`hoD1Y(lq*)f6*RM~NhKW%F4w zI~lJhq1+54MTi+_BV{9wG%BJHcq=-=jVF$yhih{$sr_l47f+Joa za)#rTysaZc}Qv=dT{Q0=f8ViYCb`v zjZYfdmkoE%FTa-UeL=!s@P+U1kbarswWO>6a` zZ2f>#Kai;(oI9NhcCsu_yj59m=#e+H<`3n{YjO?YTvKnZF0>d}@-2MjslyT|n;ZGY zacD!e;as^dTON|iL8wNm>v9dv*@ix;p^vh%eATP|4yZ-7$oTdNs72+zcV@pnyH?k? z=BvGP^7hGGZ9}#;EY*gWcPyu}{YNGIbsv40%D(Ug>4h(>243GV8LMjml3V(6&Am{_ z>MIw_zidMlg4;jCVldmiPwL+H;9Pe9S!w^-)xf#Z+(G3275me2uD;=3&)pv4aO(Cc zs215krxfVS1!@;ZvcUlfe}Mrld9A5G+q6$=+Lvo=$u{;&js2)db{4~ah^htrftyxjke%TJ)*3*D4y<@v=d*W+dI+tyKPHKN{-DIq&d}^^kOLXT6 z?6>Wu1fLKA(+Ok?1EsQu*%SG^AyIbaWe4dzOv@jqfuvr*S&EMN`%oKTztS{J4G|Fr zA=y@PiRD0U#hmib)qJo~87>p|l55rqemeMY)|PUqq`*@)hAOabGlN9sPLzg`BW20w=MXXH9j?Ze1Kf@_4{1h zG$c>8Qc6$Ti)zJcxv5tA{auOvB=b$lRnQa++)exo+m!ie#&LZ|pYrOJsmXDDr+QBl z4xSI>@56SHx?a45K=$y^Y9Q#DVO)^S{20tTY$~v@MGDz2O!CR=vD78m4N{dbs><3V zaY|8el>%nGBDIMvG&tE88YP~xGkFOX0Y#m%uz0dq{wkIOCdOy{d8+~B@1}|H4^cw$ zuMj{6!&R>C-E;53$JFYJEiK*$JJdAtN)ZNvY>pru>CDOU_l3bu~$@rX|;7S5I!&^AC6ZQS-w+(yr5U zCl_umH7}i9>iI3hs(**%9w?z5c{x{J{flod*grNIj~dDteb@4vD|_#Md9`K#@4UI{ zKP0&iL$0~~#YJ>I_dA9MyB^r3uEUh&Tg~5kdEqOPyCdiFzO(=9`*ZG!th*70s>PW< zc6Y&`<*rps04LCJ*(wUDtK#G)yAh2p6aH`fbA+%P87||d^wO+F20d#LPc|Y=w=Cq< z#vnb{^wGle5^I!smZXQyXoqfx8F$uNH0DW<(xpvX-#5LNugiI~Xx27spLKv%N1GM@ zj5nu^d5uH|i(dh-5PuDMHBGTl_sdmuI2nK=#X;He4uKtr?zb`f=eY?Ctbj;JVzb{1%7EQ+%6`Yi+bo4sOwJt)I=N)?>?_ zc~%}*LCb=6t^b~lUvaWk5S{n9Weyk1q4S<#_ptM}_wA}KJ?jP!0@fv$Scmk^l0@7p 
zU$f{cvI@AhesibYdLs-oi%A_#FjYB5#WUc&_y9TQNqoR-o<4i_k-$CkSy?x@^OW7?mhqCo*~g%AyDZ*fcMPJqqk*0?J7m z2DA#SQYy-c2TRgG__Roj$}^RsSJPY;3{fiod!=<;BKjTccytyQ7L<*mOYu4yz zVt#bR_uKy8^uru}Z}wMb=U-fD`|Zx(?990XIkzw83qWCLa<1CzX&I$F>uP-DYFrxp z_VByIE6tBucdoTOzk1}gOv@Km-Ay@fOV-;Vc{`S!E2lEvL#y^fY(m^L2a=ql{~Hqs z*xCtxOD`ZY{k0NpC1lo^Hl8KxmJZS_%ykNtcpUt(Rzf9;zf8duCE*CgX)uVtgdj=P z!dIv0m#1-~t%?BOd&xV%_ZdgKb?HD+6OOD(%sW-~2E(Gfrr<*lps7GQ$b z4tm4O`!g*=N)I8KbxPPWIy!nz97QqVD)B}9h%Zs_G6LBajq+3D(I{I85nrKn8ucPA zVz9*!@hrv8Q9z0?Q^Cb)#AK`DcEIqN(pYMV_$mdjAt+uUd7Tm|p!p^F7y(vDJ~kRH zcC2k+b)>t$Xqn`7gQbkElTZXLtgK#0-L6}=v0JQ`1@BzBePwAxs_9&Ju(YCe6LzmG zoUo{}a>Al1qtDX)#9Q%BdOp2wK^&|D4R_MF)9W_6vvW1I>kf)Ju{N{eqF7I><*;G- z)$g3Ye}0{#_ygg4@$berSbY7Y!D4A$diC4q-#z~+hbZfA*35CVP`e!Y*am$OpLKHJ zU7CY~;@CiL4oVCwGZS{^-eARQiw*7$c0GeUR%P_l9s21LR^EwK5+_?FabfCl%U(su zX--G^)eiSK`4{E=Aqw`YP5L8Qec3qs&p>D;>l@JYNAje=<;r<{ z_&(G}Zylw$J~ga9gq3wDOY5D|5GlMVbaN4aX)2}6Ont^TtI)X{; zDng!H^^{I)!CmVml@se#owo`71ZyGiQBnL~@?RP1plKheu*zdgzB+Zt>pVJdmpAja ze3^Ev7&%mwi!!w@Db-t4rVe-g{zIzHo7uG16;k;YepO@)#I3Dape|aeg%u(2*S>;6 z{w?d$QlL>|pw;{CfW6r}aG^ik?-kwwv_s{B9mO-sFgHWw7nRf}-#a$r=zH_ZHF&d4 zT^4sC_1uiDZ%Vm^dlEb&WT!Xm8`0^hWQ-gP;bbtD6l4#qU{_a7|@G7zG5A-IY3LCa>=M>a*RVC~=B zKg!z22uPFkG8If;Y{3td%y60=AAa6kvKK4ys8wMr(ZF;a4 zY+5ob4Q5;SO09bzT$EamW`f6FOOt<4oUcf{y_^2HFb9;Z%;0zzCHWytW-0| zn&_|3`a30m=knlp_TS$x`KhMvCmp@nj-yh?QR3GI!+y4|TdM2EBX15m%k1shC0_D( z5$Q*PaKS+e8p@gZGwePjxkD_{D!E&W-K>6QwW{N(*IrA0Qyy5q{ZIqon?>}%{QKYC zFS)lDIFid%ljd^uh1IHd=n8(y>q6f8R#37cm0Gbr-zC-VTC4CaoVydf9nDm<=js}1 z8G-V6u@F$*NO?x*N7byjBQGynm*|peyQnbLCG6GJs#ess8hO9rU>Hw!s3SV5p!!?* z)O(BOEXt9Q=DlQ6N9dyK-Ull!KNeX^;0jJc*FZf6%H^Hnq}5H z3ulkZ8o)x0^A}7maA!~rMXYY12CW7nR7y;7piJ9TS{UXzJ;Qw3u8Mrd&4HFWfICf! 
z$pgj=6J%xBCjJH(!yiH2Lqb&G>iP+4=Q;daB~ zY^I`j-a2QRdpTEGeJ6Z7ym;>37w>)%8hg+Eo(JadxxedP4Icgez(P1vIXZ8fGtBLn z8-sv&=PUDHS-iR2y!>*;+dprnbQ+7X+cAvb9ZO@Gn$9^F)L>egT5@H4J@bw^>w+QY zt)BC+ukOBSbd;%a_uVCGoT9w}$mDglg1Q!?v^x}>ra)IYyGu_fa8q;NL_~34>)B39 z864MK2Q5yfkl`0flfOk#zHD?0eJt;@=Ac_Ad8!fp`twGorRGUh;Lh;v;dKk*$f>GZ zbZ6(B-gkoY!A~8yW9?&g7)SBirG9T4i%9x$H76d9;uFwu;2dsa+S8AzCBea5SF6&| zp7NfnBf>a*%S7fhrYKM)lel;b?bxI|(Okvk#A!@GCCbxJ;67e(5KC)ISWuYMMKoPk z@`9rk#<;F(b;BTHQx(QJS8ydOaxOQJA(MeA#Z???AYGu&!z#F}ZAo@>%mq+2sK)z0E!~;KtGBoI$B8LWc zrLon4cd(vv0=l}?Wf8PaQ+3!P;3lUOXCJwFM@cB&Hx@wUOhw-X*fg(1lUS8Y3gpot zLSco+uWS;gljn}jc>0R>N5HhzkH?C!cL1?)MG-0`#}maFMLiU$fM$^lHiDQb(dKdR_SrB+$y-a^#OO%@!B zc6ng2^9WT{2Sv&b)i36TV#FH(;U=f|BA<|Y1yL4{dJR;bCsEpk;4`3dljHiDuVdMo z@%7I+a)IWh!*@N)HQDeH34i`058ssh$LHL+U`saGF9rJ_1rM#{eLQM`mMY0^szQ{= zYE>s3HR=f7kgq^<7ZKoTs4W}XA%%9V1~J*dzkRi#Z?&o)q|G3GY;OSI4RtJC%XSP& z9YYVUWe>h09em|+(^+~LT<>xl0dG4+KLccozB?H8_vHh)CAf=saf8Gp=@xc6x_LTCKG&q?o5vEd!=CS zav~EP#(jM!IaDEWND2-;=*t986Ffxsu+K}uJ#%N)g2NBq$OKPOT6p;k zvdoidP{=_^$;(DdURKSdbm@d9@SV#BuBj9pH2hL17Q#^dFbatr#CKk>1NCS^w}7|6O9u%;V>8cLCQQ8XEqmIiZEVbQ zBAvJxAtGLc@S++L18*IJC#W@1oN899s^2E0wx|?^-}fy=I8`$mr#e&$-g?oX&#x#S z+1A*_oWP(~S5Ao+nF+99mD4^WBBa+tR_6_>(XW__z5nKo^M-jqb#yk(zPO;U8%IS6 zF?b=+0ERO&eVB@p!_d@r+$tNtZrxIhZ?b(jK9;z^j~$rVR`jWBK&fw-EvhFEAfnLy z76~b8382&CdIA=U?xpyH-Jh6@LB-7x23H2tlB_b~?@=92AP83%&?pSAw9iM~%Iml^ zp^hcZZYVpi(f({^#1zSsM)6K?v%XRt7Q#qZys7iv14_xV3+~R?kk9U92OQ$^QEUFW zkAdyz=G{diy;$)-5=%1!r=1jS`VpI|VHY_<`5sZA)8UHeGQkr}lZOr+DzjgpGdJ-NxUpjjYCP(ty*eh-DwrR&n`Pn-V% zbop7!WIbdr@hG{9Tplcwj}v#_Ly}Xmi3m5lAeMM0Nx{QFr`7d~Blk|-J+5 zJNiMec_|DW9o#p6X5GZqoHBgmt^b8DXT4pIyj^Qm+m^i#8df`A$W$F)bss07?YQe$ z>dgeU156io>MG;tGiFQ&HD1qn$vT&H-PUHzRmx1kuTww#V?=;*pBz5&*IbRoxiqs* zKM!_(LO<)<4VIqe>hA>a2eBL5((|DGdp+On*`O$#kNO5ahVw@O&>{{%i#-Em$%Ly2 z0ota4l@Y0N5BZBU023>QO|~4ePI(C&GwbGRpj&cuf9gS+0(_~0#bhhX5?i#!On0R&s&`(e{VPP{`u9Uz!|KOD3lnD=d&+pV7) zdv~3@i^1TsEhvs%kP>p86yP(1o0RLob2qpk%@w&`-4E``=N1@x}pyXyP`t{QURqE%hG 
zC27tZ!26a_*}@<%Y%|)IDchh!(#pN~5DdaK1$-=Kks#(M*iHe1?IJTI7jiXm;@^iwosHDy93$-j=AP#ubZZSGmXR$1gz6&-R ze*|vDu$KX#ndZ;P(^9~@LGOq@3I=%JnVFx-dc%@8yd2JW2UqQbMW9#I5YZ-NOG+z~ zuzhNSz_RorGcT=5sZvJ4a4LSM&s+WawgO;0OzTz?yadku4PxUFZ&APSX!AvSb(D z*xw*Uq@jz&nMYf1Xu^vWBViX+e=wdHHsrU;1SrWVr##9I6krm#C4Adlf&hFRl9Zq|hVWRC|y z>7}Q-K=sfj!2)!L>HtKIg-w&5{T|+v$_ZC63bB~^nNiTL%$ETslU*>YP->EC(&!Rv{|>;+`K1k zg8{AVo7hjesnz=Q-aLKz=cHHpSCrDfecH^u{X?jH+KL9g56w(h)Q)^qZO^Q|076vt zch-?AS99p?(&8)C+;nc+2)AH(1DXkQUvbX5(nj>rdEDdMopsN8X3Nr^wDX!lG`#Ij zJN0&q;kjEIp-#|{3@o|$Dz!|}l6I!;jNf2k9~ja&l?O^XoVKT(Ozrljy?P*Nwj3&R zdD@$HrOQ-h+Q1pO$q&u=veANLKY0tbtX20#w7z z*^hc-*8CeYE*ReSZ_zvSti8}bMW@+#`RbzcY)B&FD!f1St>asCTy9pL7Kdr4`sEn# zrM~?I)R!ErYH)HFruBb19528B zcE(@&B>zC!YFzqM&BlCb*><*>&}V!P=;VyzBAT|MQ0uT2fmYfiHO8Jl^rCOZ3GQk& zr|1z9)Gg2ySM$yMs}rfv^;kR}O3~@9U*54jl!#4YW=w`?XFc3lLlfduA_Ujg(2m`` z7h<4h2YDv@LWkqn+BS*9q^=3^n>`^oK`=5L9jR`V=0kgmv(S-KW6JrkW0QOc2Y|9d zdI2jEW2sPTD)c3`HyY=r!seV3zNFM_Y#is2;r!T?pw_7mp2T!)A?%ZcjFj^)Dilop zCrFSj%1%D(*z`0E*5a2a&5SK{DY=}E2H*<^ed4?g)Ptb}9OEjRW0Mn8wCj&;I237d zS?r)7N-S(R)&tA6qS#1xrZJv}TL^<1|A!K4Q0wt4Dt88yhkk>z)svxKXZU}xQl^FPHT@uD^) zNvk6h+op@z+hjMra&<4_Oyg$(Zt-&mA)*9^?=9}Hjg}T{XKMr)+&jFABMXNQtlAss zgu5@^{$jRzK&l>?bAD7^x48EQ)!XKd=G={0ce~_ne^S%1D8Mw{HoWTFpHpx4t@>z( z-nz|<6Cq4i*wD*3Pt%W^1AE@zoWHqv`0HPOQr?m+4@>3YCw0wB?MwdE&f}|r6FK$f zh1CG8ceN<4x(3BnRHC?w*3X;c6UqV4+?Ebt!Jm8g=dv9mQpd;-nvX7=$oaxq-*(Bj z9j>-Zf#vXW{mRMJrh^Y_R|BV@b+JP~s=HK@#-+~Xq2=zC$ZF`&!|-b0^k+#v!v*U^ z)iob`xt3#w^& z7tBwp{CD=>-oNC3ToqpCA9PE7M?tj)KkmU5-{Y#?lqJv#ugO4o!GbdZv!R_*Xy=0G zgL1s;r8icNKQ7<@L3#ChEw}r$VI#=xJ#CPDyB1C^xt3qY%FA+_UFzn2~cJC@CGNLVRuj!E777Q*V(K1_$+!p2(;&55#GPj{rvzszP1i$aMm`i`>BNWIxZNxxBu?`W&h*) z9V`69ZfWpc7q z+^y>{xwUn#^M)dA3X7AY;mQ)vSh;3e5k4@D;QU8Tq=;nJ`aGuQzau)Dj&X?5Rh$$^ zo{-p7O@Xd6?f)ct3bG(c(i96Tbgr@UQP=U+h7+siI%bjfD;8<|kVP6l>jMVMu7X8+ zSHU8^3nOp%kRjK*<71bz$SRFEj1#I=n(m6M(sWy5m8PW4R_P57r3EZ-)5V@j+6QSt z9849{^`cpF1UGDyR3;iM4NJ$DUtQVxz``7ycaBKCN2Ru7>l{7${?6|w|8UQzEUsv_ 
z)C1uY;y=NU>>Z!NLF3qKJ~f%_8@n*Bbf+VRc`}7FHL|P+J947k4+0o4+DL3pL%jGO1AH$iRu?v{OdA53^ z9Ct!?N9LK7&kUs))16pG?AlpAW>Ld_bLHUL{IYXj;e21^INi2q&?-lqlV>HV8*Dqi z{49Dl&9TMdN~%WlGgxCk4Gu(ShySu|HSG8-xrWqg|X*@5*5Pl_Q*^sr1x<(xD@Y^JnrEDgu3 zPbLY}31U(yCjYTwP%!w%y!dgJ8l9_-bJT^T*oNF?L#j3=bZaP&d1{W|CHV?-cj>cQor`8@r!-DaSl+SuR_-7q0>of4m7n}88njBLSj@AB*?SbQ@mi+2AHPng6P literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/aria.cpython-312.pyc b/model_executor/models/__pycache__/aria.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc07b13b37efdfee25fa0cc3b3359813fc9be8db GIT binary patch literal 29070 zcmcJ2d2k%pd1v?B7cc+@$KW22AaM``M1bNYULZkAlt@q(Dajs584WQFV89sw>KTY6 zGT6v2C4n|}3~sU^7&{S^&6c1@wT6;xnXav^uxhtzwaISnv_WOY9jc6!m84v|e+&(% z(}2q`t>`$`@Q+AiVBy2D`xqv=uch}grCrZe%ZCi=2^2K z+!7)}M2rb>F(}4OK~vluG>Zf?#Vm1a&>FV|ZS2_`vj^>fn`4f+Gw5V6OUxB_2i>4AB#I;weh-O9g90-_3?&a1B<(2{&-`s zk;UDyrg(F(nZ-S^mUwHhHQp9%W6u?__IMx|VDZXWM|?+c2a9`SJL8?fP8P3|ybmSZ{n!a8JB1*caa$+{<9TSbuyVIKbkyvB%>3g8Slw!NK@Y za45b%xSzr6VvolU1P`!yee7WTQ1B3oH^dIdj|7jzp9nsIXMdzIb~Ju0c#OrH5I-I~ z!Q#!alkwr;aQsy8RQz=CG=sIoM&f6JXX0mrXX8%>pJcGs*i-Ry!E^DagHOkw2|g1) zA3QG#CgDjT()OkhX_t5NhSGK2O3x>tI#;0vB-@P5Dt zq(LM13k>=gp!=i)(hH6FBqQxbmNtmAA!%Pp+E-12)HC&`g+0LI(pRMyq!&Nfuh+ur zcpP;cDE~FmDZP*1*D4DJZN)6+A_a8in>_VZE78XH9^*RUK7pPrc>k4~tS6u&H8PioJeQ{m*;YyCqv1DkRUQu~zGAv7xvvUb4a$)8SzNYr{ zOl&Sb{j65(d~{k0%WCy=GuMhiDC|8yGZ9TD3R_t7mANpQKOd$B87-|eie7k13P0 z7cPzrpAS8E=E-xHE4v$6ge-m@k^kaJUn`7Y!W@9y0xdt@T~Mx zk>;kT)}BL8M#mC^FGLfCUEWv>zl_mOjL+Z=GP`S1-4nuCEZoPSnz@fgj!9*bH%JeQn#k&SQSJRjc* z?U9xdN;X6jq1o9`Jeo+L1#GIuCG>;p;qzct#alNGJQ6(ded5~JIJeAgv zwo%$irEfTTTxuvBLDpBY~&bAW&&3JT}RzD5&Auk-&@5 zNOH1F2Gui5jSsyZj?GC4)snn1E2*X{(?6x}R>etGyf&)3c%FDTaTVX~2(bkl3aPFT z;h;IJ;E?JGg!r%ODj9q9lLX8(b@gLZ1&kU1+&$ba|zW=TE?ZM;%Mf%cP=N^9Xl!SF66DY zZBbr%M1OP9nkBQ^<>F=8Hf zEDGad#PMbY_&TTLkepZouBm!fYf-=_EG?FZUGhjaij7+$t~V{gibdhN7_7V`^mx=d zn#3Hn6rd2*@fwixtUNRIo>{fM#$u}Ns&?loqpEF;6F(0IemWV_o@{(kt2S8z@tYpg 
zxEt0uD?*xBn=d2y*WxW9$-or6Wl0I&LV8L}nNsEtM1A3jLXt|cYl`Mg$)!{Dln^rB zbVKMdjlO4+N0C>)h@i)+S`%}GkuZ6q)6pcRsA+l{gg-*WIC(>@oSjL4N)f)B1{QtK zs#+3KY+SxTRT%ZC&QNG97EWLeC#Y4ltlyB&P@c;O=G)5lX21T<=a47SgW%_H2wNV( zS^1V-arl?(S7Ub%q#gcyj;FUALVL&CS8reaPJGewv7?#IBh}7{XkyHSvie0Ui}mFw zD8(%S^GEz6alw=%y1}lLFh%Jlm)>9RYxx$;`To_@7Az@oM0n-8AiRt>E?5_A5mUk_`4pn{1mduPa{yp*Yw9{*yC0G@jQEcU4D`76-CTWk3+RYh*_y3q!`Il zzl3T|NJ-T$b525jmCD+I=EBLOEH}`TIUI@fc;v4kE<+R))cQm+Ja$#DB@qJu#Q0P< zv48PIh`p$Saar1%fXwN=st)9B11_fm41=p~t-UOtBLPk^5{gDtE6c)~m7t!bS*~Af zva23?k;d93=695z^@VwU1=S?}8-h24pP8&XTxmxe7C=Mu^8QToV@mU5>&=7f^+Sv9 z`#X01n_Y_)f3ve!@w6|FEbZO!wy*40yj|HU|4$x&MycAj}zc(2&lU6pMvI?}>5{&;+gDlN|RvNm(f%T$gwa!O!0mvTfZz?0qWTdM<5g z0_hq$n639eaJsC08&#b-EABa)(9pBGU#TC++3C?CRM+Mx19H16KXX&e-)wDNd203f zwV^xKzwrFPlO7sT_MTBX&*lWAd^Ge?;=dmHoT8lIQ5&e;#-%Ut%}IcY37%w*nDB4L zKM0*h*lcflf))@g>y)vijk*Mo#IkUZH3!I=osl&MmakK-qA3aHd`cPvTR^Om;Z zC3RLnAaYld%OH5k3O#?stfMK^V%MP{be7SxkP<9cqTz(WRt&6!5atwV7U7mh^t~KL5&0o<=E2>IStZjWZwT!|7j_2A$x)_${Rwq zx&7_qw~w#xU2i@FW?`{n>9wrCR6eA6eXNP0MwE*xvae#b$&u4y+9*Pt%N~(fk6NR z2I)00NN>(>0S0;g-Iw2aIVT{xCVfBlVeE5?a+E=JkKy0A^vS(RjVeYNK|&dp2oT1& zh4H~wpbIOwWhe2+QB%1(7_JM0r61AqKz!)if0-)+#;}c-)Wgt8qVX0@&j&ev};M!z#Y?9Td z)s_gn5WU=$2ynEN2w==+0?^+86lI!)eiwf|bR2mJlXKul0x!|LI?b5#SGp&Y$=Sq_ zfq}`ni3ted5+_hF@g(wxI390rV2ITDmNefyh)s2=NMdLIoVtw!GR%)KLoS{g99Cq;E~7RDRIgWC5)Mb z1(3(y$e~lVdBs)$py#Vw(yZCaRTrhl1I z28B*G>u;KB(PR00vR#i)1@u_{j<4I1f6`+>zeepk3?rnC==+49aG{rowy`VOrN{C} z8tFzK6tuBd&zZk(^S=Ch3;{bipqDCN22dlfo|9TS<!upNWCG*Zz8>?F;BqN zRt+gY-UudR=m#Cg}PAg)@dl#kWA!V^As#$r7}z8Q-c}CFn>SUHl>5NU`gtky7uP-9928Z$o zcxmEi2*9D-cT_M*9Q?_~&YsN9CzYK~E?Vw60@>z{O!I)!JdkOALTP@2VE$}ld!})( z(zrL%IIJ`d6AzQEZ^_j6D)qgY`U6V+0f5}u_Fb9weMiAtt=H#==$!9msri#ip-FLfhC7J0usB|4%=M~;>>C1YmGM*O2)3O}Nv<)h4gKHz- zzxd%r)YXb=K6W&6wo1INI;poIN+zz3QNQUIQDBBF_hZ3=jZf#mZ^ z(CcV#((en76bWFCQJFqX-i=_sCa>FOm`X|n=8puR4i1dZ%z+n*PJ=xl0$+sI0^#@! 
zj26iN6my9vY;yswq7JBH|4;uft{zeTE~@XTgldmC1^JtJQXS7oH_piNjI3I&N;eYn z?-9^BH=O{gk>(}RO3p$GRIB+N@({3BObWj)$xD>gCS9Keg9hy%Vai(o$-hs*Z3I0| z&eVU00Q7FvmFG-UH{nk0168NNrl@v`Mxk4;$YWcub*dRbg6K8lp(xm z#!PLWQroxsl2Utk(YaAmf2;Rq?{aWCny%^2)C?*$gKPWNYaY+k993$LLV&COXn4J5 zbkV-yu1&iGS${{yKcx7FfP6+j9KAcd?mxSDjzRY*{ynRB?$7uiSNxB!UAfcs7kxkI zyZhRI82D&#-9Nf`ZUX|2_qKOcTs?Rvy6j!|Kebq~QCX9zY*#AVS1j*(-|?5>@_bIf zZ8h>4-E#Yd-u?&nCaZ75UG?pwZyn895r<; z$AY@98Cv8rdAF&*puC3V?ZArvl=6HBo7?m%AGP!4M{T<@`H|E{!u3nE-}ObkSk(T1 zrhb7s%|5D{uk9ORlP(IysESZ)@vO7B8?S7|w zb^rH{zIXJ_b1U8JJ5JthPr&p8TOTCx6TeI@?dgsX97nRNn>8g(V zZ9CtN-i~JLI@7xkrfUz~xw2(8wb%SZP4ms3<>BSmerF(O6{@Kb)%CYrH(i;kfKnA$ zY5urs??zQ!0i-q4aY*Snbf@~x@OsDbkE>2(E9#bOe^}94(*DkUna+bs=fOJ`(rbUz zk?uUW-g*8bH0OCVCtFx&x??cYaa8Fzdgm*5#r2L;A6K3Jf_K8cP}#;1Ja~qq=+(Ss zKqkpa{v8U2DL6$zf`V5mI86airofqj?@^2b1^O3dbg9WT^d6A%u1E z=R$30TO0xEo|vQuR-IN!A5#MYI1No%{0fxwyme>?IS%!+ym{;MC^KU-8eL?Wpk;2x z|K`upXC-Ut}XhUC40lQ7ks-oG#9>xu^<$Zh$ zU1+U7MTMjx=zyq{DGg{jy^B~*?=lbeAVO{i;#3oVQ(*{ujmAFU{f2HXuf@U_wG`&( zanxnx&8Gqy$Uo^ZK)?JKm>?XRNjUxk69}xv9inB`dTk(Dwo#H-Vsky3%L7?oF3&sq zwYdgEbR`*gp*PVWTv*W`;D`ffJo&d#O08xBk>;ny2psc%P%BY-Zj3zl67a@LaIcD* zypoDf)e_wHyHuk$J_8Rw$VS@h_ z37M%5%8SNdRqcA^Uj@9ZYX-Ab6`~nK6Z8uz@;?y#ywEEHx*(`^NR{(xYx~V{kGH`}+|wsuyqd8Lsfy;be>qo!|pIM)9LKCH-ZgcdGC8FvvW^uQ^n zIVro$4gMGvXczPdWw~X82PT^vA*+&$Or%C0GxC(-cfal?DT8U7pmCx5JpM1i8wB*j z9Cqu}f+)sWZdrpu0frZl4I&SO@EO6-NCV{?F0z{tL|2e9Bjb?(^XAh{5~zr&7#LEf z%=%SinMi0J!=acoJ&~O3Gpd9pp{L;8&oRHK0@DZ8#(d%Wxhvd}z(qy{^w)>Sp`&1* zM5LyGps-J43Q|rlsG_{$rN_=W3|f?`YkY2+8L7fC)y8eV@_$EyOpFCvQazRelcl~q zIXO~m6ENT;By9hik(c}V;1JKs*-POV zG>WKh8by>ZU3ym$!=^91%|WD;GHh@UWX4fq*WgcKV+K+im^8#-#9AMu0m!@kuNN zASOkkWH$}R0`Sn_hQDwD)5oeh^DyJbC%ft<2T6f9xAU$9Se*|?f z@kfOy2u@;UZGYt)524&HYXw}eRqBmo)S6=cZKEIXSL>?I>lT>7klYu*b$%u1ZkW5zHRKG6exk(A#LC7Wpc+ z65Cul6%3+ykk;#yCYhyXkWxLgXv;S2$TSQn4Fijw zPklQ<)>c~X)ZcO3wXDrRh^r-H#9cY$3@U^GU2av9g1ult&e0>uC56riKO!Ink5TKo z5Me9OVs$+5IIOOfr*i^scg*Z|_wwiT^ne1so~UHedeZnjp{%O`cgQ6b71TY}2{B^B 
zzx}%w_$N6c&hLs57gm~EvT3deo;NGul3Vd5U2-c6TynidF1c0Q)qvwr$TRb|3m0eJ z|GvER^Mxz3pn``MWVuz#d>LK^2`xkzuWI7SIdrJ%WXcILQL1(*B`ETX8jT(6A0OMN zsTc}jgBlDZ=P9bf!>V|wF!V4QfB1`Py~1mQI?Ko~yvXOfG4sbTG1x_74h=8kbO>FR zEPn7B`r=PoSP2A$UdLFNCi3ess@6@B$TW%_Yl@nInzSHP+ij zn$fMuUa;Qa$+&v<-ib>&v*2v}OfWgMdHfQxaZ%6^<^Un$z`ph$0sZIj2VN3i0ZP%e zw1!8p2LcLwgA*3m@3@a}9r!o{!J$>!_4Uc}@F_#c({lnT8#cY`;(}d%28h(&h~x!F z0djRjtYtiZozVY!^EK(Q_qB+o`81_$)0PzU8LUdcAzM^?=J4So#J?39H`}Iu>^Y(tehe3V?1A)c+KXC() zZ)lLoIXMsHyBlQURZ&Sre<5y~ckxF;a*WYNvHwMNIr=s1#T3VY{ym`bPbm11RV4od z#XqIs-y@j!$=D794-P4Qr9V3TdT3mhUYoBg0FKQhLSxv>BSnh9*hi&-*)P#w~ABtZyY-bb)kDImv` zo(j%iFuXHC&`AoS6p(8GGa{+3{9Zn8T6&d|Vid$Fm`0EwDi4I1zqp!n7GSQJ&SS8dkJE(FL7SMjG6|G<*9#8rG_*I;JX z0cF>L^<9VV4F7P~v6~m}9!r1aIps7V?&`J&cu(6hoX^Rbys9-*)uB{%tn}Wi+KWd0 z&6o3Oht>Ahv75(M8t(aeRwH-zDE(*P`TnV|cXj_-?LFTCJZyX!xA#`&_P#@?^_N!n zBUIP$*jnw{udX+oh7-N7Vd*Hrj91_x|B8|>(7fx5z2*NJ-tr*FaF>sWh5quTuJYVI zE7KGy;Q@)7$w>-^DL6#|o7Xg5%FAdQltRHNnlUu*7>8)KI(`O|_D~js7)|>h%eD`t z+xBPM2GgxW51iIE8~MmrqR@!=Gi={-t-!#G+nu4$=$5NIv8@+8QgknPL^+>H@27Se zm%(SA2dL1d`=&rHtq9@NDp|*2Rs=6-!?rIw@sSZ5IjQoETu8H%;v8vC#L3^?eZvM% zcebhK%Gecnv~O0YtWQ{nm+oIn|c_H71#wD1VEB&kz(e9)>r3Vv#4^d?$%bWj>`S z*LRCPC*$owCI30tP{;Nt7kw5@71EP!jv%;&ji}(65G%dPeNcYD#>FTE z**_M48{X;MozN_Aq6)Q2>vYgBzUN#feH+xqG-~qC5hSQzb$j725TD<<%?D%TjcVSe z?`=bF8outuBp<+T#oPTstKuELvl~I$G0d08v&c7E^wGvxx{!p!EwJM^il)=n8Y|ll z%gth?qtifNgKd4Zry08Kcsvw~#<5?0zIU4@m&yA_)N<+sw7jZexpwi_ur=x7*=9iw zHrt5uebH=N5R;UbUEAmrh*>YW^toEP+AM5-LeLLimwyXoR2z4M(nswRzzRoAt)*@u zg*)py=*P*LHQ%*OpOj|&x6~oMs1qGh>(A8gR%&;z*LE*D?|W;O4rhEv6yK4IZ&>jS zulr7~dq>ia5!PJ~E8Ezz`kb|R<_fvU%QT;4>Pjx?|0c!$Hw7YIT59@yKnWBqQ;UWX zc@xB6v=08vh9HOFl(<^IDbSss+E*Lad~0KOMn0TcYgc-PDWyGUGg;4wD`Q&%-TC~Z zI>+#DT)O=QdN5O%Un)sE9Z0szaOZz$c#oB_)+;E@gB|&TU9e=pK6JM46)&cHw4W;a;C!; zEZW-6C#1gBh-|&8%b0|4K{k+z?4p4ugvi)M#wn_mG`S;mCI(KyxSpKfMbrA>{$$i8 zpN-a3;v{Oqq^@hb^~%jxRt~}it*#&5a^Qu|-8`46?N@61*J~eJbY`17x$Py>d_-wJ zvgqFMHZ2FS2Qls7>g7?@4qZQ~`BfK&ZH6DtQDm2;_090bUn26$P+em<(8cA9FAy3{ 
z(mD++yI_YMR46ZiL&USC>>rr*^Ai|GwiFJwMYUd|V+B;( zBs+gVcGF-wXW&aIMK3^CG=Mw=#MeaEB-CsCmw48s2wIv5?q6-V*R=2MF^!%bB`8 zO5L7peP^}-R=ZuO84QMfag17n{IvU51}%jtY^9qh-sD7;gl< z`pe>5!m<+lOf($Iu&bnDRP4$k_LwlQvm_tRKmY^nmOO;18NS6S2RNj|sW#AH=g&LQ z7&o{FlQx}irBdq)Ss7v@IA6vQF|A_#BBd5v)tn2+J4EK`eYrK8UvpnJA8Z90`pzR* z(EG>ne)$OmP%wlk-i$!CGbdyz0w7yHIDSizTWGAQt;R~IW9toH2a`=K0_6hRpf$gD zn-1zPoBgk-+P$b2W;_@C!R|NqXBrPEjR)4Qtv8;y-_Vz7*snC~Uwd)A;npoH5?&_`k9>Ed|M$A%NM6+3i7bL_mL#QdH|j(u_Fk7?vdea4Vs zT_Y!&dz9Ke>$Q79G#httpNPfE4Qxm~eEV<)+fkbjuQxw|lP$J@{o1mP@Cx6ZZs^Lk z?n$@wr3X$ZEho0@_;4ksdFgsyF*oQWJ?Kvy%z;hkltLD`gR^3Wqf6h)IIBi!i`O@`!($#wu zXWyoifVI^qRI@8*#l5r;fgFN=Z_Y_EAj6uaKE(+S4?J>=NKL?UCgP}iIC0{rs0d9G zR;O$8JaEB7)?QQ4f>VlcJPcWT>71j8T4Bg@ zA^#&%!9X{lj+%%cwN^^L@_EZcH9d$W(uf*r*?UM#DY>TlN&t9-zN86d8no~6qZQ3~ zZe$1T?2fdQe_y0^8+e=4q_`P-NuKs%6Cm)b%rAI>tTXQp)<{FBl&PPQX^{QO_J2ysvr z?o-KZji48L*1&|3H;}~cF?~}nbGl?U>%3EAIKVX{@Nl^%$#8oN#OXM z$S_hV3@GjhLLSzKDR!{>l@NcLFg>v&DY z8T#ja9IPpXwCY4DYQCm_6%RFCS0@dC;uQ}&vMpWj;gQMhX;D9c4O5<>$sbVh4H1V@Fqv?E z=h4)76uY8WYf{ILrlRo)x)G2!Cv|*2aB(_zLmNVTIHR2bc<&n88ho4!?j>;lCF2AR zNu;yN*d{NKTk}{f!M;fzM7xSv9v?{K^gEB)!lC)O*ErX5E)ZZ|BzMO{lO2G+HIjLJ$((YA;Ebjxv( z<%IL?eNbR-6GB7{*_A&>1X7_+SM)^OxA9UTaf)3@!;}fLpEmrcpQtjsP-_|Q+j7Q8 zlp<{#y9&&S;uO*FlB;~J`uht!a^WMZQ@jr>dJJ!>(kq0FYqV8|VK;mkD)#mIFP|3@ zG)%F1Pu%1#y>8TD?C92E=;iXgv^$?ekEJX{UuT%5!E{=99=x|!&z;<($KWUlK0r6} zE?ASqs;MoP|2iNF)MSmdt@{acmV+Na36-nWilBMrQ5jc2Vk7wTgWSj;_?z6^>?E!O5TfXl(8X>Ih9$JHm@(4ZdDA9idbDE1XnA)Db-*dm`&UBwtx=*IN&o7S9{`HKf z^M{_!^mCVC_;gm#Yi)vmH}<1{`{G*{Srf?@I^)6NUcG22t9#XnJ*ZnwEBP#c(?f@X zwJndY#MdsZ*B{1tV1a=(%lEw>dhh<~y}%{fy{_5V$W~EgLm*t|#{Xsv!S=)}Vz^FV zOSD+-xFBrm&gU_wOmB!cgaR^0`v%O=ifM!s+!TmipvO_3!Cn0Kk zK1YI9G;u0iP-~}i;^1mao4cUskJ8Dqn&OMe2=hlDNx#0J^Z-q>(|94IIE+o(t@)et zEB$Mo>kS7NJ=wbETfv*b70+s9y>5s$5;vsX?Jxy@>fM>GZl|f$(1AHcvYJKLQjt2M zNLx(>= zraej%bgvEB>bgwz4yAgB?#8-bsoq}}n6nGbP|jMKU7tGyclBGlf7#ORYLs@npyYM> zb2UPBAXBwdsoJ^fNLTG#uiCdZtW-Uo@jjt=pIG-EOFND+9#S`yu=QTFwg^?Qwe>SZ 
zw$qNiVR~A?Nf;vEGeKyM@0tL!SoTEFJWB_=PWcnmOeqUcp0*kmtS}tdQkJjphKa&p zG0`CHidb|OZNdKNO=fv-I@DKo{*nEeW+T?Ct~sZ;ZxnF%~woX)jl3$ zS(q;g=T&g_I1ZLc%14o|+aBmJ)jV#NpQWM|nwf!b%~Lh218U=Er>K=$)ntPv0rg=k zM6J$B6k+mxkDlWi_fc5nS(@9whkj=0yQuo0%FK&?QRFC5AS>1c5ecg*q8t2&TEuzZkK0aZ| zIIVhmrI~4DZv}4D^4K~;a;@!6Q zzlazf7KESQFip^Yr1GoF*w#oER?K13C^OIi)t;ugMy_o|i~-F+jo)R`Fu4dy`~>$B zR-uZegcxhgvT|kqw!CI(_DSlL!@N`Kk1Rzp^+y!?8@clWf<<@M4dmID_0{3vjzDK7 za99Z(zO!dNa5n8|g=1aX(aF)3lOZZ_)MUM#Nw7T~3AMIpCm}!F4XS^x=;R-j7%M~{ zr21zZ98n_770VkZwsU`pnBm}JB?lLVI5b`i#f*y`Y(wk?q0Ex2oFq-hA>`e^7c4+= z+gU~#2MB~{$YMA&hj!QC&|v(e15Uip4+zev3DF=E#|6#*OEk1zJ`6^m@tu>qDI3Ws zIwE?R0IyK+D-^J57~>eE$8Vs*hw%*GgT*|Qww(mfqrvMaGDBm(fLee_ssvAC8b`Kt z9$p#AbRJgd@ARFs2-2R8Y)c^1@|e={ShlJ$TeU0O)`hM8PrUa;X3tX!{@R`bjcnZq zD%V=McmaYzV=Ftqh41akdg|C;AluXh8rVpNPKIbI3p@hZ$CM{?bd)Z`8u;eY3uB9K zD7|p{n+BzYsbdzCe3E*N#8jO_p&rCeIQ9b<#f(WMHe_QDwuIlTpkKe>R+4S|vpDlx z)UBk-0G=T(W4BVhdnNg9>YdcuE9=##u~)`hw^+fJ4LjGl{AY9AlQ%EI-&^Yd#toLA z?q#PsrnJ@_be;(9Xb@E!ogGa-6Cn?vq5O#9YM4NwI%GSljAUkidP(c2F!X0d{lp5+ z_dF627y9sii@IhP>ze!3J686uS9ha39F-YIGtSaqesXmr(|1bgJC$}cf9yETyC{R~ zX!K}Lpwu9Af|^x_U%P-q$(0-2rm63adJJjuBzjDBv%}hPLfzy{q`>EhHaBr+Q|f7% zS(EltjA7(qik+h10_96mjIcC2xk0sEV83`Ef0ZCTlt!ys-bu0Fpg^KW8Y0#~<=s^K zC=UhnD<6sfgn-;sJyvWbtQ1;1(w4@*_3wg!w)|#~vf->;oMR`4JWm^`)8HDMy_;4_%JquYohxVF zed?X3-o5zFMe^C&nX_4~yH@LS0&eO3PXT3EcjfAZqbJ4e-hB_;erxN7&rc^S<*bO~ z_57TroSmMq>l)|XDQ$;xPI`0+{#JH6ctmOF%~4r+a`+l^6%?z~8lJ{JROg;eFTwVk zto5sBzIXn;^EtST)!%vkFJAt^%bOIEJw#9Zc*X8*Giz+7k)71Dd~YR=iKR1K*-BCC2{&kv{yc?az9(uG zON>OJUyoh^>WuI+a&|E=SQGrfi2f5D{cOK78j8Mootc~X`O}?`$U9EIUIf#OBv(`8 zIv$Z#+uB@1y+lfDbTE! 
z9Me3id&`yNoke(4lJ&!R=8{p^I7m6mwqvMXUXaXajO~-j$fpfZeq^w%giaFSenHGu z-a^^xN(%QgbrUn8t4$B38Ghw5dH2d>1(5?3bRbY0%j6%AVhASW9h6?d()E4u3`eSd z24kPp&zVAKjOo{plJ=>KKL}eSo`s#p%>UGrFg%nzLCXHed~j21G=v5(RRDPS~#>7`>>)$IIHb}Ae@ zeS+=3Wt;!x4a!VzrQFww%=>DMEKOiN!02RXkr~EkydbA;XHY_Q9L2}sCv=X>q%Xst zkVqlK@5zYb=Qcq+qzDIpEbRJ;;Q2(T`>`;r2*aNUXFd^5{hhGm6Jh8R;fYU#qn`*T zek?ryiSXnn!r4!R(a%k0(e#-^5Fh*8WD!jd1O%VC1o6PvS zQat633#RV0Sep^M6tOF7Y0f&Ev(;TtU7Ac=uCNK)JXQwZJ^aq$bpOdsfjw`!k@kS6 HvC;W|o_lc0 literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/aya_vision.cpython-312.pyc b/model_executor/models/__pycache__/aya_vision.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..156dbebeef518b756a89252e4a7ddb3fcf9149e1 GIT binary patch literal 22096 zcmb_^3vgT4ncls4zeo@S2?BhA6e$vtNJ^yMlJ&AITb4}OlI(1dWD$t>k^&8Y^aW^% z94JvXPEBRAYiXLS=q5X+TeqRd-U-v~bd|o=NvAXEZfA&dxt#t+;jhP?s@#@e;@p5LxYRMm9+eJ;@%04`)hj8FNc;{|FVhW z?s5Vr@JTMkNBNW~YD$@-W}eDSNlVHawWe%Q8+$e2X%tjnW=*r+7SsipIMW>|Ge|dd2>l zcPE8D(I;#ZPfb~b{#Pu~$Atm$l&~HDRAv(fQFhwIi5)ZFtn@|Ci6_P5;)!>M^e?gR zMDU#*^~WHNi+kU(sF~E&zNsO%VQPo@ycmpFAJC_{5WEFI~R)dGx>enfYWk@yx6cPo9ovNh29gF&EO0Q}9%$kmOXvT0Eb(~aZ!4+{UAp^x# zoJw35lj;h^1E}Q+<2s$dI@89;9A?i{HXKgR9S&c<=!{IDiyF#^%^5O90;qZjf^yJleIxQwMRqwAJ z4nM_KjXquVHP(wNxNC>QpGyeYtDgaOC4D%|C`p(|LwHBkxY#JvPb;1|ny1*y@#MUi zQ7qYoIT3Vd?q@{M75=KiUsFt1(uy0i64T~mBH~gUu~<5u5@RvN6(h`jR+_x3*wjrS^`h);ZXGH2x93N*skza) z#N2Qe6hq7oU!Om4=-`V7#L;Bp%IMre_UdeUl#qCoVQx&kE=~gTCG{~g8efRVUS?Eq zWNty)g(g`tj7;V?k$i*u$mFm($}OC``SytH>MdA$A2xTC91X8|Zh8K~;lF))xp(=} z%C>@Id)dr8hdwf!{kC<^Y_*kL+-AUnBSat_0BN>u5Qy}c{=rR~NJDvU{o8o`YSlIp zc+os%TH>a7!SsqJY7xw$b;bjrnoF?MF9+YVg2&na9`$Uf=UC#d^HJv|E@D?Yjg3?x zJRpTL;v{BQu}w0bG6{LgxQG`L!Z0c8JN#WPYrMK;%5lGg_d1{Cxg4M4Vn*#8KSPL6 za{+?dV^`Lx=W@Kj1D;8HdYO8!H&Z${M_Yt|?8^G|+)T5ctK89cOXW$=)oZ(cuLaX0 zh~y$hi}wM&e&znI`5kTC=##OwRefr&!0NgGkA3218ueBycfs-5xNkKoOIRQQc8nLbMy*{OP z?3o@F+xbLVjH6FUc8~3m+|;HS{V>YLAS+oiVsc9DuQL{#OvW=HqZ!&K;qXoA7 
zlEr~f+rK0Fz|R0Q^9v-uxXC?mG~BYwj^KSqdnwSlWcil6)H_)0JtX%YTC&`CbU)ZR zR`N9GTlTE&FOEMUk3Ufye@-5MZryC!Is)YiVEJe&-k-93KYT)pj(%HD31YVXbz{p6;S&hwRx=sr)+)j6y4oj2XtJRB=0_ z*d!5hcX|_fBcjd7+g~Ab7h7RQlS1%lTsi(7t@H*6+>AlK3;-biCJ0S|sJy4`VL+I2 zrWu25RbT|LhYA__B2fK?`Gy7GGl*mbHfPaW%bAHN-2f?`A=Ik5awZ)P5GJqy%Y<|q zFhBb{T(psdL@Hua8dOOkb9H`dDk)0O0YbWjM6pg@oln0gU8aKPkwmQ03zYv2dfA|F z7*cmK3V&U(OjC|VPm)rYA+BPdnw6j!2#funio6=smr*%0i{vI(3Uw7jJLS;M)rmsr z$WkMo-{^k5d$q9;I)JBO=Nk>LH>{)!!F@{&WgF)Y7n=v<=7E*?>cxEXK%sf}E&CF` zw3h%KuRB&+zT3F!lY={!8Xjs@Q-$E(rG}EXGwA--Jt4IAp$MT>f^sseY;mV?Nxi_$>7x;+rgM8j{`w`kv# zvsLxX3N)$1lNW(HdT*MdV*rTMtXqa(3Ob{@glaV^nE3L2_=g=K6OuR z+SW;#a^;jV^oOcW_*>1O%LW9N7s2mzC zdD>ONzv7cU+g6@mJ+pQS`~gJ15!RkG}W`WmUa{+MUF|ap}_L`b`v7ly_C@fW~1_ zH4Z%6ph;MCO#;sjO|x(!?-JdjN3bn|F{ujplu57ywgK-Nk#`{P6`X)^;omJb2_D=V z@Xw2$DUZ;I_r3*N#H+L!C<)DiY+9PJM*2MZH>ojF)_bCu^uK-r31hyv*IxD5ZPubg z>2&B4_clKPLCQRty{=l#u1;xk8e2`Q5-~2eN-AHH=J8x5P;Kt)2$|1}CcZ&c+mK-E zc>S+kzjeLn9gw{PZ_O6G`}2ad4o@6i3o5!rj@-pfexjx#FftX@}R(6oED9<*Q#tKRUjqNWLn zOh6G6>$&1pya=b|C|A1Po{iELX%dp;*~R*ifXG=>s%ZnsC%cJ%-*w#y~RE zrDDlMDv`}Bj(%FN>s0t%YM4j?dfwEw+_Lo5yhEiJo4QF$!@BuBWVVub-@!Y&n8;BF zGX~tM`iZt4gXbqM}4>3m8zzTcod` zh_pT!k=?2Ib=`O+1qf7=P;XFOqmuLr9%}{D-U_xb6Z^G9_9_fsnXI8QEFSu_RZwwf zi>i0%gBOutJ=(jA?PGHLSfPFIlDFgw7kvY=Z(!xB>>FLOm702rP5p9H|H|i}JS|yE zfq`ORj~v*uH4_u0JTS(Vzj)z7PPf=7K&g$L>D_$+27 zkv3+AhHu~j;Q$GN3=fEoz^G$vUlLy5HR;BT8|Esp4C_5*!h0rg;%*@Za9o&~&tx;~tI65v#AJ91R^N*K7>%Se_-aa&GP> zWAx(;+3^K&Dh@|SEIS*^z=A4?Ran`FsQ_-%ww1h;cdC^8A5g)ol+aR2M2Q&&tG3$7 z+2njGolzWQxRqwFWfT|Xt|VtCU(94kxYg%XBQjn>^eQXw;^3#Dbrs+;+F}OPo7_ex z*VMIqdF}c8-Xlwv2jPRIhS2hnyl1e~;JZC^>o=D#uMEEarMzcn$gh2WRy|9qqOd{f_(qCq}W%iVpsEiUu8dh1RY~rm4 zmcP2r;a0Zs)+hPZKp78D^7*}IsPsvmijS|Jq0-~|1LvvmIF||LN=k=G}EQ{$1l^a;Lk+NQv zZQ@)F8$Qai>0(`n1$KfV#nc@T&o1C4;?o&m+Y92wW#pq6sbrL+n zpghJvhaF|`h^H8b{zVoAWM zs0tezdCktgD5iCu@@>GYdyVaVSo$im2-_+|S=ruGEgy}v?*#I!#>miErK~+VYfSvA z!2WZzlp*m6?4z~&?w9U-Y30D`$wKSMlIvkh@b1|=XNxT(a?41eW%rV^6zC}icFKXB 
zOYVo>_T{MT9n3ohRUt*OYYL{~!m^M<7{qasmO;OW9kV4-kby{G`phLEqtvgvB-Wf+ z@?lx(F1Ielry(`nu)!0+)iaRgtTPV1)0I0)i5S#eSZEu>32mHA5`*dMCZMzhXL5x~ zieE>m;v`R?ln}&B#3ub7!QG{V5fNXkvXJV^S87M>D#<2!N+JZ2exDN7rb}fJOi+`e3F-3eM<3WSS+?Q&pysjauz zHYB$Vl{yAW?UAyR3+w_x3pAF^_9j=EvpZcXMPi!CBf2y#fpnV^k&^FH;-kb*$v>gw zC?yrj{154ok{{wfa}^0`&o=AOs(sD8c4^Ha4;?CVc)mBePPcN9d+a3tk+a$AdgyB@ zTX845M91>@ih22;D*UYCU^IQ9&_v`bw0v@?_ccWQ7LqVtzmy{D5}BF4_b~4FuJ8Xd1XpaDq-d zp>exv;M}4cl-naVFcHilcmUVP{Cr-JwuY_ze5#VD*g&=r9--JZ#9 ztJIrlntstlFjA4m_OEdUHmc4*xBwv|!al8Sy&G1h^J-6eLv*i4p#>{6T{CIAFqqov zZFk6)rh1lc!LYBx-0Jw+1CS81;?!I^hnOL0tL73Qp(c!0W(Y@XF2R&FT#JT;?98_6 zU|BK|uq)@(Nt%8?&AsNDG);4prY`~UZ@6;K8CtuVE9cVT@0h;TYT<6U0qI6vk5>0G z>UuD?dh^*)(-^T*N&B*%#K{8d5NWRjf5_*tcXiidw!fV)_^l~D|ij!zGqPkKUiB?^) zLP3~@;THy6g>m}|#aD$+(q2_A^L5HCQSu5U#M>2HmPHapyebd*XY}Ixl&n(nHi7uG z9>X{=JD;VXB*n979IW?**IHi%)c^v2OhVCw7-B#f=}l^rP0iXqGsah}p z057V{51Rr7R01;c=yc|($#Y=tzksUH$aVF-{-oTwA1=C*$6xZamOKGy1YNE?f+;*) zaO`&W?!ujgV%r{>{*JAkMY7~6wQn!B@0Z*6Ggj2refPyXFRly~0z08rES<@_4?b*a zDK-trO#_9d!4G`hce+;WtATva?t*X62fp?@q2&Yj`wtX+2g`P@p&cH?!^i(k=Do)X z$Dg}@=proE9gTl+=;GSR+YQC0U2@Z|)#v`aX*7T6V%f|!1j`&=e-z}ljja2e$TcB% z`{}Zm^R?b}-f6OT9aby<>9kSSbX9^{^ZoE`>*n;UjYRh>`7gk;ItD@uA zqbrcbwOR(vGgoXll@N}4VweC)T(4CHNeXYx1s9F>m0uO^U5g&v0G4N+P*I!2QYPlX zL^@1)LU=_pkBAh2mx08_&K+X>V=1v^BH%mBunRX2@}McM}nn+B(0a#-B$5Va~<1h3}^B zq*k8!7yJI`$PbU)d+8646#}QoK;(ZA8Yza3$)RKSuKm<94-d;$-#YV(}mzM z_IAG<+P`+K5IPR$ItgqRO<+?T_$VW0N&{$16%Ns$^vkdm{Rv85033|HytQ<)D+_Jn|6WE>Vg@pP)oKY`;r6xUT(@t zVOBS+S%d!r3t^+G;L9`_X2DM!H27FTxNexn=sUD&z@oYJGeF4iSnqPT9QC!9EFl}a zbQEU>(`tc9d!IA>r8i8sob`rT*U4|Hx1}Kd)Y(3mBqP2@`zUTGZK^!!Ig}b=j9$iZ zF#2gdzhS;*oiT)(3PjEf2Cr>Wm~jnZXC_#I(Q`S=ExSG|m~375dKd2_Jli>u25AjX zjE_oxNIAxlnW<(guBrHmfn&$UGO*)mrhcgll@&K4Xka2@iZ&d2|A87Lu2^AviiNCy z(!ZoaVw&Vnm{u*MFwQ~EmwrO8UO~LlwI0>Pj-%ip(MsQ4Uqg5-CS=s&46IQ_iUAzo%CW z{7)%oEEB8EteRi`GOfso6DQR5@H1Mwd4?Bv|GH~prKx`jU;N)pYpxYd&gJ($Ew`P6 zY!c`z21ew-$dbECOsUG2Egt97{9BWsaCHBOr5oK`Lw6|@F17Vx2|#`upfuZv2HP|m 
z9BHbxdNNg+2(a`TC1ll;NF-zIN%}G6Xy-{Sl+ccqIw|?jl%y${MN;jJ{-;z(34vs2 z!kIUE(AxX3$zQhO4sE$PwCrD=TJC^fqHL!khpIr^mXE-`7kL-dZ+}nOiaQ$Y9a=T7zOd?s^P+60A_wR4-S+1p%=vc8&Ry$HDh&3Z zQu9FBihFG(Dspha;j)u*(9xT3?~b$UZqfYj7r=*h!wOTd$wvZy9#XOWdY<1_{p^WP8ehm0Bucm`f&v z4fC%!a{Or{&{ikHXprFjQEgiVDzzkW>H@*<^9VRq%y3UC&dd176OuG5Nh5%O zWSS?y#!c;JSex3cXcR6`e=4L8P<@aFe+?PNGP;Yc<8tfx&52U5w-_9igQLaZNjZ2j zZ)yF|?z=tm*0%fh?GL;?D~>_o&MVsZ(V+O-^w%honsHY%>|l2Bn9sa zdB+P6T>g@4;4NYGSYi9&{J@d?k@I=SGf<-3HmFKk5WM2wGzO?9Sa&v%O_6l8wV2r` zj11wIz`uwx2}qse7Ey-iYNT)vYKD2LpaxhSMGhXahg3k5UhVFeXuespk0zrRPhg++ z_T}gP%-&ssI<;-T?A?FQCwmX)9fwQq?z~0yh@(N0H589(Z&wwnNt76uzA4wI(3Q;= zzo;1pU2r5ufCH2cyL`{`u3g`dQCRq3sk0+a#sRCHU2wB_8XMW?NEo3D_Phq6@q4`B zg;lRfv`<->MbP(3W0WFrnqM)tIbv`8Vx#C4?PAkRpIW;*Rjq_%MZ&k5MLXr-+i8I< z&@Z%#EkdAvJwVrO4_~TVF0`p%(#8@*`>i+`7(hF1^=oK7X$NEwrv*cR3}|y<5js)Y z&iX(^r{UMQ7^>1gHNXBB=*FMi$#@YU7yL zRWUA%L@bM~YD+Z$i+;6OBZXS?ooeA0A8{D1;&9@ul$a*pcUrurojjBN6V_C^N6F`r zOiWf=EGW-N1#SGTEf)HQllXEwbIH^3$aQY?}E z3duJi8WU1&HbaphI3-C3rx@L4fn~`|rRN%**0g>WcEt38tFv@oFa~e*;=a#ZOr{0Y zRC|UfnHqHU(Csg}yJdIx^0oW!!IHo8!^U7fbfnODH19ad1R#aZLco|$=lK*jJM^bX zLjMyZHyAxpQxYTDy#5J3P-D4Am~ZVdo_dl`J++lZ`4}<~zM)<4wAPLOsi)9ZYwEBT zp?`G2{%@{6-UAJuiy3x_972R+B)qVzA}zteSVy%bXJD0dWP-1N<^&g%Uad{ESCQMy z^;Fs!vJcfY?+pGJ9O?@=%W`bp!$3dP-KMl_W4$Rml z6pywy*`CCi)b>YYrbIiB#&Mg=v}Z}Vf+$vf+@^21*mqd&J6!BLE%%*9q*q&Cv29dt8-+I6 zIauu6D|hZKcAk+t5k(n#&^flYzt9OuC8VFz=^Ua1IH6MS7(1!+Vdt=Mkf!7bmIe<# z@b@4-tg+xfutCOk1`U<0qqo?;OK#s)Y(F5kAHew;?}LGp?`RtI8-=fyDFLm`UmX;Nw*xOk1uGtDUCBp`Ck^`_PMXpLZ|Z zxlm{sD0sIQy}M-ZuDoNHN=xn6)N|FFBv)g=7atk0J}5d^Q!hrDWZQZJe%ByA;Q1yx za|I z{HOXu63x>PS^*CG`d0Q9d_%xkZ)?%pCwu!=4y^4hj-Qps&*r^-1@F^&$J43`{t5MD z7|F;cHcq6OA&P_yivWgOxSkcYeh4Y%2;HHp{v6h0MU|jM%&Qsm%J2l!+otO%nuU-; z1GD1v0)l=_IQ2@{q@KL%uF>~u;mxW8z!>tTNmoL8;4vM#&d{1^-_0x@`Rt|GvhvHc zE@SB9P3|KX*Mw7e!PhR_y0EgV*tb{i+q-n3;MrH|ANu~eH_sLOC*=MK9KRz2lUX7au*~qqOMWY7IHq5-#rl6_5!MKY41a>)pinnTZ)@P8$b5sy8e{ zMP>vLDR|S$F}ZDw44u6z!*XCZ?3{G|?tmORP@)5xU9zWZ^*nr~?FZL6{qRq)Q}yAl 
zjTG8WqHbhg5oZG;NA8^~9-WYnP85zldq47A!Fy4TJhyW4HczfpoVrW>&h^EQP>6;C)-y=J(!? zymuP{@ACuC7LaxkSsQfxgWw;rOP zyPBeptmf{<4X1->v_qrOo6wz*lCe$_TDBs~y41u=e+#Hu2}x-tkMd@TcyaG%PvWDs z-lTaWV;Br;UcV{}~EM=-GlEN#rksxWvRDhG0ZpH?!j=hS9N#3R2~S-%?_%t@9Q&dl(zjCRA-Ofkc=8D`hX zC@q_Q;6RN9$|w!0JyP3}lhv$}$RwL(QjmZf=C6UIF;@6lZ*tZ6llxIpHh#BlN6Al z2z_kTCe9PiyLXm4!%NTL&~`DnQx5Jd2KUIpJ&1XxzutYNcDSdw&=^8e@yh`X(0`h6 zvQs_JEm4m)2|cv4*zO`b<5s(?{>K+$RJ3GsL0GK| zzC}IedVbU^9p!8oUp}oP5 z$KnKrn3YKJP&Iod7ww|A3MI5*RevVizr@lN+cWI9g(Ug~Cuti&1u54>D)tk43{xUcK31|CbrEO!SU8hUIJs-JkovsbDBV>a+1iuMn z^Oa2&>tn0q2!*jewx-_hjlb9Oqr*R^l6A8Vf+kP#r}>}LncbDeGTrWtYbSU)TjNJK zE;3BejdXBixNI|7&+#iytvHo*#MgeA(fE z>oYJ|w&ITdwsqc3-bt42^aNL__qEPjoy*o`@r~r`$<@d6-SFw`mYuuTodnS0vR>j# z9ec}W$~<&7EQzlrZzW}C$2#1vm-uqfZ*`S|-DNZG51oB0mt|*U-HM0u4&J(L<%#b< z_2yIGzwqV-I;FCWPN|$=Ctc{C-}@9DS2@9#1KiOQd}$c-=d*S_^at-Ay>ql|MIPO4 z39%u&B&ZbgvWc9f|urqPcREtG52hL*=xcJ5sFQrUJ3 z4z)bQ1fWvcN)PyeHUk#f+4+f`%2a)m7CrK$;>8iqJH3?uO?lM+Ko3!w$kOGx1h z^vitghp5NC@Xk|xs!R2+yQogEx_kt`(2{sr{Xr}BH(R!CdacHJYH~Im{+h>c z>}A+VE~R@DB(svClg3Y0vGd5v-cJV_scO2aDzRc^f4!>u;j@uPdZxc&C(Kkc@rY(7 zrX8*X$hM%Bj8ym~3({A->}6GHbEQh9bVOxYYDhl?CMc~n^+wcqT5?gVr?c7!s{Iit z9mO9|8S@r5>%Y>c>w0h!6OPxMiBKFzfk*fywqw#^w2VJ4vjZu9 znT+Q@ad7-WnLF?SH}Kb-=Y6i_1McYi+|Kv8$KK}-zt3Iz*ks~O9|LIH``q~Z+~^0~ z`470~W$yV;Om^P%35Vq4CXNU6@CV$n|Hhq{x%2OH1D}|zya}X$H*GYv^0pPv28Y|m z?xTG7%Gmc0y?JPZLw4gU{0{ykpYPti!BJ)-%6IUteBZ(Rp=ag3=QcPzZ5(}`N7=T+ z`6CzQZI?DUdMwYG+-P;S%;9!#+dAFKoo2pgbsMY~e9wF8?WcHyCy)@+P@Zoo@`Exz vSh9pm&Tz@M3n4Wo(}wGm3140{{lNLQGe7>sdyDHFD_M8r?MLccMn3*86-g(* literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/baichuan.cpython-312.pyc b/model_executor/models/__pycache__/baichuan.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e7d6275a937c46eaa245400067e3367bd05dd39 GIT binary patch literal 19811 zcmb_^d2kz7nqN17#tniucz_~AQKTSVI%pl%Em4;x(dXDRSa~rJyD5PJ2&w_f76+Vo zC$1PA+bb&BiKwh6rfWADD%O^j{1Z*Gm6=phS#36zMT?pRrb8#Hnn|+GRn(APPo+|o 
z{Jz)d21qwOvzu*+uV26Sy|dr@zTfw~ukj~7pPR##u)Q1qA168PXY`<7&QfIMl7-`L zb0QbvL|(Kc_y~`?HDO6wBUYZ$YzbRZhzKlhN!TNHqzMUo(h+g6G<(9CbVXb&?ntsb^`PL_@MM(#Yc8L?GD| zX<~6-qB+?TX-T$5T9d&@kfl{6+LD_hn^@eRXitVBAr`MrY)*DWI#|3Wu_f6V=}c~o zY-P{2iLPXKq&vASvW-30CBn(=k?n}riw%jMWN)N5*%#?c_DA|zT4Q1$xg)Y8xihje zxht|OxjV8uIT#s4UO;+AY?AC_34Jol_febJ{I)H!*TS9V#Fn=?u~l-6O_cNX*vD2a z7TM1zgP?5t6_n31%1xkb{}q%6Bp;&uXrGCq#5`UP3a$g%jiB%Vl(kH#+}&2=o3k;XIe6cIF}CnqLSaweKd$s<>y)NmX% zSya!7*vJ)0E2O$#O(c?~3aqO4vXqHVOhhlssmTd-6Fs5AbV`o0-bW{5ax9UM5>Yue zepT(6{c=1pd2C#~Fqy!UCYD7w-r%0-O5%ZmiPhp-@^NRrQWE2_jPwed!1QN4UMj}yeOY@g>GP?Vj$vv_PcOWn`t&&Keppazj!kA# zuW5_oyw+|Yo~HMcjHlD+AL~IlblU+SMx$HIB+> zjNCD#tiLL9ZR_(mF3WvfDy`qAQ2#Waap<&TItSMia0h3uY}Kq@UawEIOhGz$II33* zGo`)FabrwwR8)-UJ?qD%QA}GZdf{D5MNgtlv}O4z#42j6`Z67AEOEhUYekLKU#84j zvgOuMcb2=lm*eD5DoXv8W<%uBPk}S$`u}52giM{@Z|WcF#T*(lX0Pb!1~Z1f*r_ix zZ+MI4cVRYj!pEg|JME~bxnA!<`JS~^y)?|EKwT{FxVW-@Qiq+bY zL@F(%!H!dJNOCka8r`MZF2&N4YG*0fo#_O)Sz2{VZ$c16SzaA&URg>)E)ajvwy{?u z(2AvL35816|9);Dlbjfsh)?ukh+vR?Z%*#rzwhc^X&@25G%#@^b0sxCa2+CIKobDc zo6-oFlB_+Z2QJ0pBP8AWCvK=UQR1*M6pTl;uF1#H)+zj@2e8fG;)=~JABZ20-5I-A z|4H-R=KDt$!zUIu4Jpk-d0XR>r~2l>nS({AYnK0=%^n{OG_B4?Omre(Thd zulD9QX1+0-nR|87*ZJ0|CywU1=p#q(QbWt_i?=Qoy}sFF9}IrD|IYsT7xUo*%GLvm ztp^`mQ(BMan~y2=$MUtu74M0q+Q99;TYYG;(VcUn#rBS(chkH-ziUVd4Xs!$?e4s@ zb=A%_wcXC%$`)(u=4z-jgG$?=(meQJ=f6JrN9X?KxmByB$({3p4)rzPYMwjw`zL>J z_M@{4V-I>4J5T3Z&peDhu6ZG6FM4X;y0C(I<3=dZFHb45LR0z<;H<;}#&Qvhi*s?J zOSDK<(TYEtWD^BR5cw&vQPD2hM%_7XlouUu`v7a5Z*vhxj(d}jII-~Esz*mC7tX&h zta`4*MNt}OqFb$5BS}b_tL{=hq>1XiMsgO&g5U?1C6gi}#+F7WX{|9~kw@fv{B16S zyLQdkv)uc{luFM$m%)>EfwfpJgUJ)45G`3-R`^{`w0_@m&-QVNCd)-jOcm_cWt6jI z0q#rm!aDkL?N-sw7^^?y(@TnutQBoIzi-tA$9MRQU(eP1h*k{-e1i+ShUIRMoSO3W zUxuKMPEJ6QrpE4Cu~Fl(w0s7c@>v8^?tU@##yHCb2LT$IYWmuE37<(~2$LDV&d|!U zp{dRNq2~!W5?Lso4wcD9Q*vlL^+tc#1(q}k{-ipi5ccs*G%6F*1AMFKhFnWYu5!cf zS7V7u>7*>DWSO`v_|QluIyxS9pifd_R6dDBnbwwkngXNysw)~DNyO6WXf#cNSz{h= z$w!Imc?47S6>F$p?^6^x>EA~1%eT1IYR*+P<4~MUkDV<`!97K9py=C__qNY(R=it^ 
zHI2pQt;NRR?APaB&)4;=dhBiPob9o*d9{jb=_z{Kh`xd-lCRqaq81RnSG8Qlc^l^% zA9>oBYJb*#4a;Xi)_@cjQFhugZ3UgN1}o?+`U;wcfE+VcT?NJ3nt0U2zs%j`hr?F+ z1(3-vQ}7~!uuXmiH5~Um~*oJCNXJnZ~HCrL_Zy=`nO6zDu zN+hDuyS)50>K;w@QEf2|5j_RJNg2-}_~qN&vX|Svx7gV9)YmF(dD76dEa1Lu=Ys9a z4vINBq`4^O<^th`L8WnE*+Y+BuBQHTD)bl6+JqW53|9FF>NT#KQf%cOB9%;4}A0iqQoZI1z}VnG_nGoj2Xx@ z7*tfZ31T*H(L5D`&yjT+g0j>Dw5P)cF4JtZXF3w|K9CS*VpXesy;wCwC`y}Pc!*le zv^<+>)7v)Qmpki58513hTAcP|-J*lEgs-6SX4>^yL1pq54OZ;S`m&y^w}l(qtd}U? zjUM?>i^*FMolGmO%52fgnD@!NbhZj@YjfuUBysDuCA=Au(CoUwWxDnF2DO;>475fa zdU-Zu)#WEWcF&`0Q)t=Hq0C#;MhrS*h5i-C&W&x?+b`d<_KNxNn$JhrH_Q~QQ&W!q z*I?%uxhkJS4R}gvFiJRf{_OFyOhvn_RsG~gQ$jyD-Z)uV)~F$8K)`?)a=^Tx>fDGz zYa{bVnR&_2f(jydjEKu68s%Y1Aef^8BwFxHMUzJnQ=PoP4drW!#tOeNwFl(bn@8}(hL88SupfS9l4>$a}?odJTFp4D1bvP3^W_=7_q9a6kI)}c~X#ADZrELU@= z-_xOZI_57w^7Jh=4lXwC$(=4Xhfx0TM~9W>ow?_VO+dhRJoA|!eEXwsD}iTnXO>!f z7hWm!9aH*_Ew&!doiDa_FSzgT%eNjZw4P8}Pb{{c%$?s)(iqO2UTWH0 ztggRZb*qYQzFWRxb)eW5rvELWxu*Gse4u;P=V-3V4ZVLBLuv2U=~~-AJap$!v2Amq zZNJjC9}w|&^j37$CA5R~z2||Tf+3<<=u}$!k#gRosruXATiy+uW2j4Y=?EdM`g8>| z16?I8Z^haesf5uaZy6?7MPx%TZh&kC#;vg8Sk1sVEQOY`Meic&off9;(++Sw11F%F z(k0rMTG)tO+K}rm$bTVg*KK*MUDO3yL!Orl5G5>5rEyI=L1jqc3M!{ApIB`!jKT$6 zCk)HmK=j%F#RUtr9R`;1t7*9pX>tNV*e+kgO%+&%#swxRGedz)6mE@Ge2WrD1yHTm zu3ncT6yvX{{8g2|u6l^wL{pQQiOGy=*Y0VO#^nvJfpUWGQ_WxA-X#^|Pm!Ibmfqq3 zz-t?Cx87=pfG=vl46uZHoI6xrq@`Z zu>ALn&?RD0cT0dj^WT_K@~ccbXTf znEFBy=r8qwWe1Boxwg=8Qn_lgR!woY@wZeMHI%JOF1 z2I$H}n*q`W&oVeig+(1XFzp#RjtaAvDK|r_w42vm(2)pzSu2cW_K%$!5qe~3F=gYI zIj=Ys$Mynj*jE#HHubLe|#LA zKgv!5DN${LkcR!7OAW zlvrslEVgbgwDu{jeFWl~mKuU{ug+h*|IA{;{+xTMBfQX+-*xWcD~la3<(=(I&D)Dj z!9vqErD+@NiG{`;O5=`V`_5uubFuRP85wmuBQqh^xHJ>uYCTuqUM{$)v#@E8vT08- z&{ha+Qve$EQm@P&f!(p5K%o=1$x!$F^+NZ4rF;K_>xCmPDMwy<-2O6kGq|*Epy;hF zcsD6HC^%K<+^cl%h5bK_l0R{7)fjJCJUeA69T#Ci)9_FwHlaA;Gg92Q8DycaXZ)4$5ToRM2#Zm5>UD*+~BuN&>0CW?Y~5 zv=#%MxiiIJs1V$t1b5tTTMQnVx$vZGd!g%DrR&)ThaPTQ?0Oj%>fD97eOMnKR^6#8 zHbSrHUKY6K?iJ43?9TZBlecsiI*uqEM}F*k_~v5A*A-`b&Nurem2lp1&bKW#4di@9 
ze{k+f-rtpXc4>IUs-qO&I+P+48)o3d43)_5A{U2?lM@N)E0D?iR0jp0;xGLH0r?W@oZST6TJM69k?6Y=LioPAZx+Rw=Fgvhk z0}3s7gSI6Xl^t~hrU(%?EW;kjD{VllK^lTJfL}hz#sH2@tU}+b7{Ku}+ptgH2^Yxk zNcGSmu5OJbVWTnft%wbeRPr0zw~XKXFUPuF2e^xJZUNwdJ7lRGF9inL@aqpj2)iZwr1rT_@Mt_ z0!~_q^kGCaN;oibX~K>`43Ap}iq(zzrtJ&oAM`%F{<+l>tYSO`7DLV1N&X0w@+<{e z3P^;68_G}%)*OrPPRM0yo#8pv%g#sOYufUZHY-pMvt< zE>Ak^JA-Dl^8n4hdGzSfN@Uh(KIP3c!9^7LKY(Q33d7x#nqZ+Otki@HHM^CX-SAil zbQA&uN?;)8t%yEp43hN>R;WTtkJ8dZ#<-sccQIhprqHQw2_dZrR{g-&$?YHFSw^Sg z?feT14sRh?0&O@NF4@=iyA|g%3=7V6e$cY&mLS(a8rE+U}e1kR&X3RRp0;KA&pq$9* zit!NlIyTX?!@@z$wV0Xg-m9@ya57`O25T-?nyxAf6RJtHeqcb@m+4@NUd+0SwOC@z zB_p9TD}XoJhAa4vr6TT@$(n$d1t-n83>A!=Yvub$lSwek|A>MG3hp9My`Y0LMv1J2 z(rFC3l12+o1IquHV*i8!gWstF<1@?>n06rxyg<7c6V@-# zvAV5T-L&GcbyqPYerMmZm1{i5gCkW3-~D#Mzx|Pa`%+!kldZdop>0nEYfbHnRj7yK zhXCHeOxWb#_4@rYi>-&?fdXWHje6}2YTMsY^1EKM`3)FE1|EDk0He%h3tU3+q+NLX zw13uJ`fAb*yKwn5cAcIFGcS)-`Y$-Hv_7z|vch#v=HK&VEy%fu9MnDSn0De&8>{C! zk34TyFf~g%%dU;sTY{N68fK$XzhzKlt*pMmj7RT0(hMg~v~&#SA%X2s9MPKXwP_D_ z7Y=A)N^7eSFfLAH(JFHC~b5k!hng*6Y7gRk_>!#{v1(-LF z>M3=d(K*vs;7$~ci>km#SeMeoa&=}wcg8SOFEv#1jZaS*XMmb0tIWGd?4tz%bZnnPo(1cO5OHieGu%1yf5d^-EUb89Gta1!I{FLTZiTY zkL$t<;)5Qg|0JlE>b5To-miaLw}*=0=x|7B3eO6~(AGj|w-VYt>-%YS+nhw~;&Jt% zpH??4H*&%ba|0~FQktpg+zHS56h6r+<0W)sk429pTCL21OF7G?#8F*+Gdrqq&=VYMcikUS zHtkz>(WAR`a)B2d*uG3O!Kn z%;-?N0fm;kLFlLuQ6&lHbyS!ve3D;cvIQjr zC|Qk9%xcCu%X6j2#O4j^vf+a>TcnPeI_kl}gK9trEXUyrQ;Wng|fm|2_Vy^9*#Iuq7QV}?xpI(3kNd1x>w&3n|V=a=s&Wz-3ajBWU?b)86=}{3Jevfb(y#ng-elGb`MQ6{Z}T{NUg?qF!(r+Jhrs2^W=L-4m(Wk z6|A4E$IMhgi(3O6m6i(Qd&+ft-7piqOCszSDEmvMsFc0Vi(g^0C>bh#9yqWVI0UPK zw-{uONX<(P9l!|1x@MdqtvYP@{KNL18-S&$>&r$A91<X5ha5GSg_ve($=0!y+%DpawQRL9Eie!BVHXH(MbqkI%mO;K*&XqY9Hnczg4L>oJAYHs_`eaBt&Hq740ckFrKgB6M$ z9@#6MY`_UE8W@ zY78S~!&Iy2tKms}+M;Dnb!{}_wRC?-11Hy94BQEqph8WTQqu(|x9msR`*0090fV98 zwj+~Tvu4c3T74EdmA0w!=b|*CUw5@qM;Q3k$IJ96jF-OTT{B)rHPyM%AS?Bme?>!V zV?$i3>6jl}tl7o}Rq%Hx{*L**l@_{xOC9((2-Y~d8EesP56paeM@o8jmIL^#Ls3aCwECdPD?diD1gGehh5PC$oiJ 
zDt&^uYu$P9V-%a}+wfg#GIXtM`y+a(Bz)M0Z3^E0)~#<90)0xL?|vg0tGqw;1eSuE zKRk2iOd&X+1P2Pi{Yr5EgTBS!8Q8SI&+!4{+~!A~)+KM({NzINC*FfU_imowy3qC$ z?_S2uaNa$PQ-{}SOutD1^Ga(Z2WRN?kjVI;#!%|zUsuzkC1k4ZvB`DOhwbb$ahWE? zTnGI3ltaOHXcEE*paVXsYn|Qxggj;&mIa)3KNako?0H+uaudgN!(VvlB-qY18fVAQ z4okp?R@HSS15*vS15+`RG}Es<2uEVlGZDR zl-4A`6)q4YqrkF&I4ae&$2ptg3a&UPsmdbsZlF6NN7EfwI7FA7K4IU&u1^l$J-Eyv z`cU{&?;m@=ps1!+(%g;U&$wvr=~~%^V69pSRT66D2l&+y?R%n8Tj|^yN+eTk98hc= zDt*z#KCi7%YNlS*6GJ<99}mUGMS?_(YFB9Q@c|_EyGc|HKmW?fXG8c(C&a$p3z2On zbS0J!#X<@Cq&PGJZ7I|}(yfte8qr7SY8h`D3J`H28V+&hmA{986d))rh^@cO&JBoH znYqAeoJ_9Eka%SmohZH!PC{RRGS^*d5f|ey=2W)wfh$qLeKyVktTW|K&ZDg6G%HlV z;**p*OaU2Ca7f9-U|IS5ZOURGlm<+?pRsRbmQc$3Nb|Zpa{?uX-oY-T>`62!noJ**hpIWBdgWX@y?I{J?8mE#}rE&fB ztnm}{33=rjKc-JHR`jo0&Ep~bgod%4PyD5q^L7kooj#8*xX8bOEkgdD;f=|bmvy+j zvY>&LSCRIuW_j~y3y9>i$@*D8Vq^XMPkJvw6vFyha{AaOv(P{mPW|{g!Te?{5SLqf z8GjC{FTLRZMu`;s`GzmpD!Azd<3=wSx2F_nbyCUjQEff-Bd+>KQux6aCYsazu}dRN z%HYHlzd4dgr4niFh^j2ez4$4D@iYK%64;9QF{xGLdO{!OXby|o$1bfbkzivy!M=R% z7vYF2$MD+%Y1v6FGmn-2l0Pn4E!YBAt64#jzM6?&0yGub>6!LT4O4QrG6JUCM)jX7 zD7gUimp`=DzJTuh3R-qj#tz|ep2(@2zlxOoWDWZUn|=19eR|#VRVCKz#Y|o9{)!Up zTl>;)+AueD^{eVHA5%!|mu$4pigtfR{hCt)8T{D~{IFwa`VAb-K0r#E+Gkc=n`jM8 z_m_?lCNpuY46+_E@TU4$!L)R3l71ISCJVLdW+&G8RT_K*LhD^7FfEg{NENj547Z0U zbu$GW6l|g3n-siD!7&7?{Q^Txn%}EL$?sD@W?lwp7{6nC;x4DH@}tOt!-Yn9_&?L*q5s4Ud||QimZuzo&)pn< z=u>Xr+LSZ=r`)MexuaiLti0t5CxXwN9N+(6xg!d9z(iWSMrEP3@6>q|VpZ|>5E@jLN_;Qgr|A6n%oWp&I_#aHE9 VcCK=`uXZ?jHy%FcaAyDl5N|*NoB^OW1B%1~ zT52PEfnBWyRWaVHbi*qFc$>3m+ z`Q3@CWc6S*^Lr9C$5JhLQx`4(;P}umBI2VLE~Zmmzod$y2FZRh9T$cG3cqzUl^7k455Q|2 zABXFIMTn<`q0#X;e{^DeWb^{!97odW_;`ABf&eN~Qh#37k6=KQ&?~P*ty0O6sr1Bm zl5!lIhGZO-Nv+ADOYvAdIhnp18!f0_nYL7H zWM~-ke^qiZzyy_+lHAFmtLNjfNg+NmdSzH%*0SqT0_(p6&kukzc@{* zdDna9LDw92#W3hT!$m!kU)fsPk{^()!)zl+?qOQsSn;$iB==i1_iD(Cr8hMkj|uS< z<}f8$l|ir;G?ES!LT2lKgxH@LZg6QWao#q={TPTqMI8oi#xQ2qk~Pl^7sHVwe}E_~ zU5({pT0TIqLrl$~XeIEzY3B6oyLry zP1+cy18N#?L=K&PVwBL250yDWUJVBDK{a>&SHCq%y&e@#ABs0kW3*6*SA|R0tG*eK 
z8@xq5L;$1pZWD}k6HJMkJKe1MP_wq0)Htehi!sz@ZxgIG1_iN(v6g&D^@9gU>#if+Re+f` zv$=L*uH7FhxC*0HtJc104SRTR!8)(gh4JanMnOJp=cJFQCFtit>>(8*{|jvOQa+^m z5ZkBv^S`_w@yGLF)rXP~sQ&!#y^?~}-lWE-PpLlS()P9*_naLX%EPEWl%R*1jBE}{ zF$ZM@bD-_ot6a2vKp;dX(E7L_m-ggi$pr!|_SU7?I4G~(;8(wIGbJrl5;*+ThKmP0AU;65gR}M{EyLga1@w;ZBn_w7V zRE3r*CWnOd=nx3UiS!UkmdTR8LOQ;+7bT|>&yOb0Mys-E7DlQ zqpgw!lrd;0dkiGgXgU^?To*@qw4TwcQW@n#88RU)XP10wv^x=_BvzgbrHzjg>6M|h zO6g#b9QjOk${Rx;CEr`DvKZPS)6yn%R@hJQeq<-YIiF^s4#p)L$W^pU5IU$Ygcb$b z#gd&Sl4eJ8%Tsz;4*7~EQK0pQAxmNiG0QPbpFli~jgqg+GGPppc6tevnjJn=2Sc}BGA&6 z=c{ND+0v8p6ggtC;lxlX6^o?^yM^WN+TZner<0T2lcSTJs0yRid1Y$PzP*?B#Jdxt z=esAbrY}y6cV7m>(=D?z@hkCR(B6U^Om&}^xiv65@vh0M0`2nYs$x2`OIt?3kNI_J!f?7>`pbGCk;Sig@#p4wr;Ojx0gaS zk4ty2mhPRiJ@J)geT|~8ap{8S>s%>$=#seGh}LE%#+R4u~BG9+7TuG_Z_kRAZyU$Q2ml&X=k=^x7fJ*eyiAcWHs~>rBvm@P1*1cF}!1??+^Na-YGf>$Ua@&^uBtg#8CrNFQ`?y->)P%n zxoRG|YSye=c`#en_^_;T!^D*Y7tOakH$BU{v+ak(_CufgUwm2-xN+|KxuvE%owqwz z&OWNxzh>h+C6Cz=)wEWjZ&_qbNThmuG60kuQq?1Iu&_Z-LTZM+;QLhvF9VtYV~oz`JPrs)Zq%$ z-1y%0@8zoMvsHV4RUqS!9qU-yoa9l_BG678*hR&~u~8n36T}Zw z!uYTv^`Q(LVH3tUUBj-pLF6Z~zeV4S;X@_%9H3@Rv*sDoSYas&JaX%kvF*l|W^!Z#WsWNMsv2IzK zJ|XbD(umn3+*|a`Sk$pZecYHvWaq(VtQv&%IsMZaX?5J5+Y_Bp*YZ)DHAbwUMIczv zDrLsm8o;aowki*!`UrByroztJ0i_YSDh!AyyV^%qngc7!0U3d5;FtdgA0#s-9Jx`i zK(rVR|H;H?S|EB)I8F|orjq&cX#8#AB?=!UhfZQ)fSg|;=L|V#$$14%)G541{y}nn z4US}b>(XV(5R(jVNrp?3;j-j`@F5+)l9rE7$wdcTY+@=sIh9Tko2ndH_z~Wq0&F7_ z!rMavKOKDLxKUDW0xVUFopX&_b8@AX*^*|lqvGK)l2=WRXy|GIoHCgIbU_gUM>78hACYlV zC;sr7g*-UP0uh|4mD|>AENtfj(dC_DW%rtcg`Heg?V5{u-CRlemn{3*uNwoFnkRwK zjhX8+YZmyaoY0NL^~9QuLUzvS`yezQ`pSV28ywcGVYEp5RPn|7w-NQM%#U|!62Hx| zK3VDqi63v{4I^$Q^|LdnpW{7l(7|MNPV7UM@c?N}_aSif=QqC!9lL zh6~XPO%WlD^}mBtB$WgD*esPx(^15}8Kd6OFk`r2fY=gHNGBSy=u@N^MsQUoRqzka zRx35F4jSZtThy>sxiZgeM#+`}evRm0_N6uH&WB12>ZQK358Rw};FmZzxoJ<=1suz< zsY%FjC&mO4f=oNR_=&g208_nyAGVSODp)v3&Ph0s^%rJmguNr5n+_w%3gK~V zWISq<@!e(0+d@v!T(RX(J&@7bHF*;-^0_QO^%~*)dw@*63#XWHejM7JbC+h_b)vg2 z=c&uM>z5itcO$4Bm7XGcaPiGdS^I{|TFc16stqq!-JWyT=5yx3Rq|^vQ??DnO*K(P 
zUeM}+hJTU(o#SFVVwi?g{b+@;RdxlQZ%D!zW=YCOxr)9#sWVGNgtmI}V0j33bWur$ zGhnUD)BB|lzW=q;Y$=g3AJs-dqE0?Fypz`xD2@`4Y_ICe|LU*w5ktXv=#1Sxm&*63ya;--GheFe%L~WHOK6^+52K$xMi{;5 zdpG>eVfDn<c{hLw!Yj*)e*YC8{Hz8=XtrVd0*zBGEgf2 zzJW*S`8|sgwRMnhAMd-=hjrdxpd2vresyL959XG5yrOT`Fz!YQau$C$wu!vcsc9Pf zb=HvnmYRd$W=vSWr2HxYaaUSEHa4kH<2E+RT6mLs;Did5|3xo#vF;YC1igKx+b_1y zGz?KtpRBwq`&}YvbmkwiI%`^QJpwYEFIV*;RtEzjRvQr&lH=}`f2f=%n0KN&*#`o+ zVnFhyh9`u0Y-DPDm?;Nr}h$S6|6^g7LP;?Fgb$}EHNE2dw`s!p{ zGLKC5_8gBInd$=KZ5CMzDKOL(3@koQW;n6<70Bz;Ou8O52tTBPY^jNnG$h~fj9y4i zjPld`=C*~0#(7x%2tu4Sm1Pd6E>27(`0)97cyv76LqJ!70f_ zdJH9*xZ=s!C7GMJjBya6G#D;eK0#>!Z48(?LgP&1BDrZqprB#%B?U-fL$g;jD3U*4 zfjoo4Z&IOtQfe@RB+MmEZ?v_5siA{jc%2;DC=#g%$Tmq>pb%5#C{1SSl=Q?U;X0*w znFfR{7I07JQ8&|w$n#4va@VL2QYp1-^I!L1rNFO6in4m~ih)*Yjmn#X~<#mh@qSDKXS9;*WO)oN;p58918?%VE_zD#)U z!|)sT&wd>HD3&?D9o2Id{$#$hhiJ%3R;4U_L!ni?2WQ zG(HYD-|^h`u(+r2dvAN^`X0G!pO#nNIC=df7%oi>qIgOwa}eg0Hj1T<8x9MJ-`{tD zrFEAr?9SBezklqJ>!rs{Eq6}eKK`pwfK6rwp*<2&V+XSVa3O_AJsnce{0=j zY(_;NoSZ-Tsk;F@Qgc)eJwnLJwRit4dN-PFdr@qAG1GQRPRhE$TQ_9gEf3u-nO9$f zV9?=#f(n^QyHY2*>N2jjCnVcz6#b1$S0DPfubj>9Iw|ftnc4N~Rtc}I*{mKf*s_wh z4eKuQa^`aHS}E7qK?70$#O2GnBBCp@^y))b%VS^V!sSeT-y`3#TvJ=NX+Xq3f5Q{< z9~PSqXZ#J|?3XID4f{p>bMMc%Yad6T>FCV`YIC*GTzylnI=mQMs?P*=KqSzJLHN{N zzvkx}TUm8a1JJ?LXKT8|nyy?hlnrhZgWE9RH`=bZ<-??t+b`Db&((xkWTzPH)D$Cc zWXf8Yx`gRinzr3JdHZCxX`k4%FIU@e>)g$AIYl{B+mNm85o>!kYJEY{qPS(QbC5RQ zIXIi-48&M5#_a*bhLAHDh`TQSm`uPpYeK5VrsZLvm@uiN&DQdTF-@|R&xr$`Pg$}#;E|e6k*Vb9fI9M?-G^E(j*sy$g zi~xgr;9UzT38s<2%GB7XJS1;oL}3le(IKXEl_rI16^q{36S1d`Jw%-50pTB^LQoJi z+mc5i8xqVAmDycD3TnX_0{USH5W+12GeWe?@GMah!X0uLre(81f-@mr5!d0fCW-ZPw^{lv7PnOwdYEFug3n8%0E3tOP`_ zh?RC>&8Geui4ZGAy0ej8BL3Oy=bQ_ta>3?ouuTlME%z*ccQyFJoCo*^!g>Ckz0wIiClH-$}DhGeLIvE z_0Ei)WZDG?Dz~Fgba&15V}Cyhh3ER8*lQNkvc6rk?*sS~S1{vh%0+hF_lS{W5Geu! 
zeQ;p@z+waR$55Hx>A2mIjl3pCUd#C#fkzv?fI*x6jagr-=xbeeuAEx+9nRPf%iH;T z=yudoBn1<$A$Dsi7?A?PTjbD)>dV8vPbuVlioX;A7t6!mBz*x1#K?mdl0=Mo3q)cp zoR^8QpsX%n-gk`EmLrCx*Y2FVeQu2-|4RH15{mdTmRSa=t#+) zYY{wRXLZvrvTce!u&mJ(NHYVvO$k? zzJwMDwsc4>f7U*3O>4a7tlbEG?u-pBGx1i|GBPE#6mZkpVE|sxJ|G#o815Ceqd+J? zKoQH9TTsixA?oHmzaqV^~{WY&OIs&*SW)MClO%7T@} zm3-4A2QtZ4PqMbr2}lK5hc^-Css_1VS?n5heul92e?&H-&bAUx>*}bR4y{8`4 zpC+m~^tdVtJNn2d_v#m+^yuecsmW{;c|kDPt9?KR5M3{2*( zSh$=G^ooJrhpyho!N~l{r@_eF$sIDerDzDgV!IN_@wVaH%cof;!4&_6sX|s zu;?G;t$fA+s_JXZQy>}%v_)F^3N zgg=CnqM@NxRB+x0ijbvx=ugt}c^B|7KyY*J@(hm8K-1ibTqv9kZ5Kn^S87*7FV6Qr zX=%&0>=#@1-#_@tw$+w1P~gq=FYYBBzW1gV=Q$|rH49hOy3U!ZoS+Y!Al9>y7sben z4?Le-S&h6wq=aYT3P|x=_M7&l+SOn;s03eV@nXi;lCih2a|-4i1F}lWU3?f@se6R& zm`1-q8}&`j;h$4Ja=uOd{m1YWs_vn;chx9rd;DVUYFc{=7M#Qs1Q%5PCBd)lFSE32 zYI^u7qoSs#kew?F-srgAu~aUWHm^A-%BgDZT?ol1Gqq|Mf7++w+Vz*v*20sSZaMxG zw;OR&#~e40nAsU@dC!acjn=qTPfvrhSvMY9Dm%1Pb_b-ITjq7@=jB~-2k&5agWOv{ z$f;Y@9cQT{PTupLWzdNhy5jCQ`V;q1pXel|zY!zvMGL*WFYaUM`u$)hFTtZ6f1Kq; z-~7l~!YGAOY@|}DfKuqE`+97J3BQMIl8X#Sm1{%{PY+|z*+)k#?Nx?jF2enZ;RmK4 zn15i|baUt#8wDJXwEc=cJ!^#@ngOfBvHg%ZLplhGz8_lV%{anoAFyxUtT*L=&@c=j zv}0AxwH47YkE>t2QB}5VA{A(S*{6!_^$36zsVXW}$o#KxIQ?vXjZ7$j>nONaplgJO z@^$#YdCQCugpLX1gt-N1{x_VN)-;)$uy2bwFzYFw?l}wU)KpMcIo& z+kkxg>5Gc|3nH*U?UxQq$jgq1X{qGQ*T8&E{>>} zEl-P5>m}!9x(Cb5?in#EIhbWXU1AVUP|1Hp4lxVtZiYUOcZ9kjWR8GpBn zRTS_(ZfslLo#{RCN%LysD@0p|pSl7+I+S&_K6J_U>P-*bP4_|{)@7r852Jlg8~4F> z^M>!bkAA$@y(Cd3acZVvM<&n%adZ=tr-s2TH1(`D_L4|7{IsFvPVjax+pt4y*s;=* z?Ri=3d3m+r6jfOFbnj7Sht(vyo4zz!ApUi8fm+yXeJ4}ayiwxV=M?Qssvlfz`f1m4 z)4leMN^bji3^`9|aV+DGlIi|IvHc(fG221rKeBghK-?3gPPCA`I9R`ACZYPu@$AlT zi95gbD0pJ7@5f*lT2_PG$j7#U2zfA zUkZDT*wd7Z^HQ5c7lhDQ|9%TK6rwZrOdF$i7kxt2jQL&M=Y!O>1em)}6rw;Da2^(t z%Z_`&N6Gh#VP;Rq4dkyTS->%CO>2||O4FfS5UWGCAXX)53)-hr8hzZG&|%HmARw?o zxL|zNi2VH$a{>C+`l};nPI>comLxysO+PiN=kgg2Ml@w;i)L<n844Bt3*v|^05o+) zpB_=4Cql6>ECZ5*+2b->-f3i)Y>H)X)GU<^Uz$wEv~*_Eu0(1UyK-a+gH&>|D5ePq zRl?N2)$%#0sJTh$oQfrfCMW6kBqLT9P%{K)gyWd8rYsncO6jExMJ27kPe+Oo+_NzM 
zofc3#%DToq_SI&6dqv;gT)3V7)wL3P@4D&A2D`;zH_jTrbB(h&oH7EpDQ}@{(wWiv zWz#VT7ti@%z%LkQjd_9%gHP#2A8*1?qb2mIzq6)ijUl1$;`H=gDjorTGxTu_hBq&9 zoT8($<&7qhR|liv-oAndgkF6JJ&Y&RLiB09%n4O_;j1oxMyB_m8YjKGs4 zXdy;o);{Z)b3jAGL zw2$Bp-Wdeb~BKXPqRM% zZ-8Z{MyfgbFDQcQKaOivej%QmxEzo1Q@Bz!%nVtW84cZXp7ugmLY^*pC$mz7E691_ z#Lxclcm5_jeF$a75S9#+!VdgSyCC6K?T>Jcn{G7Y`s8Dm&PAyb%*XI0wMq?_N-kV?C)~p^n{m$zz8tX7_!b3KkU z|MO-DYl$q_BZl{^hM|M41!IC6zH@yGolpG%+|dQvU;4F2{vEmU;Ehw)Pi4!a56h#u z@}_08Sl;n8+;GQr+qK+(|E&i{KH2#w{K|so&v%^?{W}(%i>H=@xw5(?UMy?Nm4|X= zVO*Ii+eTg(Vc-tb(y5hdnCC2*pOgh|9K3#TDe$N)y3F5i7rTzJ41vY(%6XrbwJq;l zDSuS98}UzqjVLG>U9jZB&Dn6T817y0eCDrRj4!>pa^jKy;Aj4dwPtSjX~X(9F1%~u zTe+hzi{*zGUS4u7|LT1^n6%|4F#rpbFsw~(9k_X5*|jo=`_K!0PXk@cXTyY!t$fzFkt`3;>_nECpt537I~{qt0(M5kGFo+% zmZ~{1no7fJ5SSt^E6drYq|;hhBL{eX3&r@yluXWV(t>;n4+-S40;~SUvE35I+N=z( z*~n)naXX9S4F*g1a>dU=cSCC&y!V?w?)a!gxHz6|vMlfz18D^#f}b_zUFm z6tZ*mJu7{$JaPY`lR=h)PLUl z74yr3PTd~HpZ2Nj(fV&=Fg8p4aiIgflH%y?%(0 zt!lw|EU4fDCGPy|u;qZz1P28Sx*@F*t8|M;C2R8LWXPux-vvYz-l^lwDp>>=b@y<2 z5=!moSV(PW{#Td+$xedHqE84RzPu_^|U-JFi9+4bvC$#>+@)DKX7bgH@35h*Nli~i~;Dv&FnmIV6@+kj+Yl4h*- z2Z)zf_8-A_T(T(?qVOLOj0W?lFW%kcak1LvT!Nz;1?^iwutahA)-GuV^h!tnMzL=OK ztn8X->qB2FJr*;4b6Pf~TQL8`{j|O_S6-bhZx+j&nWsf8Z^<=uKCWq9wyro={Be|mg>USg@raEW*S%3D5WVbv` zBCRGSyZvB#emd)mioWP_bkzsbn4Pi=!=b*HlFCcw^d+^lkG7UXldb8Drm=(+0+K{k zCQYkeU8gj1f{PZqCJ0p3Jc@y*wqRwkj(lynR=>-N4DbSVl`s{!051`r)7PhSu-Cbo ziR`}b0VxSnXE!F)4+)y`zyJv!H_f0K5o`tz!LwyQIX$5o<`4~_4>zM>>mv8tDBxXY zc~dLbqM5`~vdWuz)7@yvOE2Hz77re3rDtKMoAmp;8H!d$svgy%+jjQaMvGX|vXs6v zb9-jx&DD~l*p|MEIgh+%{}O=uJuVg;&XB}d=@Iox{RGI+FBs?&Uj9&{9zS0a4`RbK zFl|>kZLs8?)TL2+5P%+bji)d;WLPKb=!n8rM~fOvqeU;*ZW=8{hZlm!>-XZ!r z3iLBDyBdgA6fs0n<)kbSz@dl7Ml^|lK;+3cV3Y(uEo>SL0S}ND@HdSCM_j*skP9gc(C4lFjNw~|&#lJzZ0hGy{-=Jx- zAnH9XXI3m4DKUCjvc+Qj#4sMK!L={ExXtbkrn-jC53|Jq1t)GVrtwluO4bxjq7axu z&`aN4u3< zQY*jKSHmI+UydeN4!rz`yJ12{C?zLTT;rk7 zcuFAVN}!h*B#XSd*s(Mgp?-Gwdlpqcla)-zVqiEnFpZ^P78AW>l literal 0 HcmV?d00001 diff --git 
a/model_executor/models/__pycache__/bamba.cpython-312.pyc b/model_executor/models/__pycache__/bamba.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0955b86649554b4c514fd10dd4af9453f9ed4042 GIT binary patch literal 20895 zcmch9d2k%pnP2zZCkBISFu(v90wBP_O^^f+lM-nj;sux@CE6NWnGDela>$th)IFd` z7_j6-UI9KLGF~sCioJq1}h+(5qGe-Bkjz%;w~07+yfWRAiN>Ru9q}D39Za`o+Tv|2UX^ao?2PYZ z@#=I(rZe7|*%jZ#(lzN=rYqjX;(spAhSEZJ2My` z%*{+fy}}9L7ua4PjN!{6;5anZP#=9K$qV4ty(<( zIfmH?%*bb89%h(Lz-;~u%p>9v@vzYHeM|fa(Zg~@k!#1}bKNgCF8(CT(TW^x<~hVZ zaX=gt`^7!t?)&ayKaSqE#@eSE&t%6$Nz9Ilz2n*R?da!|nJdX?W?T@{eZL~i7_Yj{ z;LSIn*kk4@&rzsMSdlBhvyC?(U(Wm2+?>}-BBA{wZ6W|B9=gqWGg-A<&+np|8D za$+nwiV?l7dKqFmd0UiZ)t5=$z9J?jSer(*1+Ik#r96%FtDu!47w17QEpCqf32wn6 zS_P|U6Zk17#|bvkF4#qf;1KvR&n!2_3(i;kai`#Vg^Rmpxm$ePeTj>CR2SO-!{;xI zsE$##4OHI?G>=&Cv?)~El~htz9a@uAha~1ErR=D^tZgpDiKwif#yotFzsHrR&YXRk z`wnpREzWZ}Bj+@K-BDIbF$yS^x8=FYqg%*1ZJoAE+x{6RSbku6+j_rP)-;zeO3nH8 ztmY+cQ>$LC4dvR)%EeG>5As#&@zPsx+}O`?QaD$w!vv=}Cfe-!f%QJER6PHP&(&(N z(%7IE`XqC=xtM$8S2z>7DSzKptn|d>1XjiPb*w^9DmyWmW3)r{jZfzEge>&{U+Siy zmvVaggz*=%#ca}UfK+c8{uXrACWGos)jyk+wJjhDQ7qBaO}3p6MW=T3MZZ9lmC&Lo zIa)#+8<(Ql@fZ7IKGhB=R=+EOMW4zg5>kLwdo3jhVpeN&aMN}m5(olV)h3 zRZlTDTBCX{C)1PSNl6-)B${c}0cc_@i`{RN#q^le1)w$tY&2*fRCgjVnoi1cA|ca? 
zi_-J*Z=dSVWhVM3QWL$oB&cex_txb8g9mQx7yHwxEBzC0Lhvu>2DQzj>M4sNmccb56xo|IpR&C^A$CHRVGw zB@`<(MDq=UO2goCc+XN&3GXiiYYX9>g}TU{FU>!{T;09lbu@WqZ4X^xl+n=5GWIK> z{z83ozP?MT?<&s_U#{MT3=PQe?TWPu&R3Q9Mig&k!SSBA<55+lP}Q|KlfAjrphqG(oq6Au{d8$O2Sm7F+oMhWqg(?_sd)%P@NA%l|XbMxDsfav(56ey9>U` z*UrtIo9DkZGRp(uwXe;6Z9cgYh!#HwDra5NF=UTfM^34XlqM3{WJXLRR8NA=`^hxL zeTf9MyR;Q8ArPB;MksCjpgdl#jCRNHIiN zaDYAxBlyiL+?tEqv%gT=P^hc_(A#KlTdizdv*W$y;6ky*y-HpGnv;?)E>OMZrkIBd z)~pfgnve5TeC$W;Zx7k*LDnHw{wRtz9@@n8tp5>$Pai%we*DJqLkEv`1kv%M#T=ru z7;}m)!G$BKf*nb297%4$BYLjWkZgGb?>2B&=9o?Jy<&^|*iq)kQC6YWX){c0%n5N6 zT$yy9PSp`c6I3@lViPIh7EW1W+4KX>H%i=GF;ya;QXkqgYS4cqs_>sJ|2{12E$5tb z>p3@#MV&A@)p4?77 zhPIiE2B0Qw!+BqvCzNHJ<9`oj86%0)+%Yoc=*!U3j0GgH44hga%bA2sOty9`R)XKU znYbZJSu9OD6(!o@s((^uYx8D;@nJ4XYO$;)FUYDpJDEvb13xXRE_x+|R7Ukl899-K z*pLt>WYtqT$)$@ZR=Pxi!7BXL-!~?TLZV0+-?EfGy~V3mln|?x_EDB03fLJ-Bc=7k zpP`d50dasR@zz8#E2shO8|y0TMG-)(jn=IS4a4YMvG8oMgGS-hQRyqx zhR-2T?JOIkT&l0UFB0t)HK6fKj0Di2sFmPn6Juiu0}^wwBSh=l(^Rg>IBDdVRu;|| zdS2G3GL6bqi#ho*_8_KAby~#j32;ZkMr95s5V+U)Ejr?8~c^U{z6k*zG*;d z8Ynck=bLvc&AXRQ<@cUa@Yj5bF%JVu!$6_7F<;xQ)ONE-mr~n>GV^s^N?jL&>{4oX z6~Zm~aE}u1VJyThB?QRM;veHSE-zQNZ?sg_5%b{LXy=-C7JO9;kq31TT<`dohekH6 z7M&AuaM8|%oB7UzO6S1`H}g+kRGz%}u;mi5DUnCf&P7Ll*CA!sp?4hlqnDJUmmWr+ znYI1g)x=I0)vq&yv|6xaB~eZZI3Y*NnFJfgiMZkQ-y#63VR^Y`T4N<1%W}-{+1l;c zl4*;+&vkYpXViF|>f7>^*pC?-$}}i=2@Mpb)#`Hn+M$Ds#D-rcp*aY^yD%?{5W_#jx95AINcI~ICZf&;UTg3I@sXU=oy$U?_M zS8TPlBj37TY2E+8@s4Yy^^D?bnf2efRj7~T>tjlNY_WHx{=lri5QxlQTMl$AyE@n! 
zddoI4G9o>NM#UB-*Tu;gZf%jM1+b2*sf(4;bGi5hEpy zASmaxE)bA{=jp>AAs}AM!}Saly88=*L&Ra&P+X^qL)UR8d`jujg*hLXA?j z^AjgP8q-D%9mRj+(T~ve{|%5S(-!#u$h5i4nYl5C#>V~LOrYPqJdMpnsTE)j{fs%R z0CQNWHn3H0sP{h2AWGX3{XUG}CPt80H6k7B9S|qCFoM3Vj9|IKgLKy?#AqfnlbtuV<0rlL#p)S>I_qD7&b*w!ebIdY4*6xzJ1qzyN zYMw$LbaIZC8YFtVF*h4ADg$|qu|!?OfNXgHHDR3@v(6>_Ix?{8^myt0*t|wv=2(Jj z3iJ@|(`f@YjnPOLGd1l-%9NcJ-1^8!ZMjapHRiQoys%!jKGZv1Gce=L?ks1p1&>j> zAGy1>=7k=?o5jvD8?_m4u2;u5$C&n_U#@AV;LY{xnAZpNSm~|rm$!ZQi|ZJp5!S!w zi2r5Jw(G%UPSGM`E$I1a2F%vJ;4ca|6%cR&Wo>BGbLL3x!Jci)HEBap2sHM(UIuoy zUSlPzk&aG0=yj78YSWir91a5N)M~?w&ww`TF01To9RriZ+L8E+WIloAplA${Q(P1jkwjF9Waz6}p{qYr zYbTOYE|pBfx&jU6?V?3N4J329tPVpl&%~Uno5_Vt*NRp!Y$ha@S1C=SWT>T*vZ(rD z*2s$Ew;mITu}@RC2SQ@q#9TJAuH)$XT2d>8pe@LpaWBNtw0mCV{?2!Alf` zC}0d_kPKbA*@_Jq%{Uut%z{KjR(0v)rzyD|+K@@K>{S~Lo2KBqL50$8(-(2|GQf?)5c-Hr*HMZEXJb3QiXI5G-F1uO^B;)ofjlCo`)<3Em zTB$lP>nybF%(om=S`IQ{k|}1}ltA0URVC276nrl*xa=BS4h$B;P5E%I67F3JDd7X8 z?k=?M%C{a?S`SmkmV!5s_qHnD)`hF@d3%X3ug_O^Db-zzUsS61KH%T0KDg{VxLkb@ zO3|9e`DgM?dlmd``t&H$vcNCw&9@y=+73Oqs1bD5H0ylB;F)jm(Xds(tPJ+<7*|T?(3|+e4mR zu5R1#J8LPU8#VsF(onc{DlAJ`tn^Ka2|6Q*F0I$|kFkMRbWVq<8f-K6X`87Hgfmq< zaHeOMDfB9QW1u*t(*e&{AO*}gW}GuFoV5n0t$m{qC#YlEsh^yzZNLUD^=QjOnA0vD zcLto{bs`&Eo@w`{o_f$z59BTD$glnn9@y)xIA>x`>AT1WY6Z=d=wMg*7gYX+%HLGIL@E=SPD3JVvrHW*qiE7m!ml8U zVCn3biamz3ZASeymH9FPru3+)yVrELX?|uU*gNN#waz|WsH(jeyBov4{POExUTpty z&s#kYY(Mk;)VCZt`R?Ey?5vSFCmE4uldG^Cy)^gI{O!dZi%+iv`is^h?99~N6v)f& zh2%=zu32xPrvBc!yXO|XD>XfH?penj*qmx-{hDk~x|~rP0{x*lTZQi-0`-^`>M_fV z6?24Dx%GhpstBF2Y^xr#Ok0grgw-^om1anlr7}r1^)|2LGV31gi7gMz^k(H1`Ra1y z^43u@1DZr#Rk?+&mE(49Jyx5vC)pNBzX7-D^uHT>n`5Oo@?XaHy zzPheeQd@N^_1%TKmO?}4nw<;xtaDaSNI%p@ZJk9`m;aqxE3FrPM^%?JiRQp&aGLsl zhJv#c(2z@v6`^gut(HrAk-oBVBgWTU&vl!=P(V9T{sjUPJ=aBES9GyhQPZWx%T!$& zpiI}L0m_tJ8bH%_X>m>6rNysVeD=CkXt)mFJ-lW|92M0!G3}R=QXb9|cr7v)S@%*Z zQe*ElkP$CKrj%=ws>}_Lww1B3t`WADjFm&5X!gkdxIT;Edp4{X zeO-}ih1S%Thbdl*Op?IXv~eWzIb%J|+053_G_UW08H)v$s5z{YIs2S-&N0ow2aQ%S z9_$0#h|2dd8J$VDIJJmC!)1=KJ!_zi;-;qlVo} 
z9r?kt%HY{|4=RJtDh-!s&;6phxlmbGs0BlOo4gMkrTdJPX&b2TuQ%0dyra+6^>=GJ)F zA5r7j9R8SMtQ(uBO_Rjw^d`RbD4udDyxL^mLNrPL6B+&vljQR2?qsXYv3&E8(ma%J zKB_byg^4ZHnh%kW*et|Mv*@a3a9$rKn*_iQB7-D;ePQkkhRn8Zw-yb)YaWlg z;`^hYaCogzuu;z)Ji#-}cE#7eka{rmVAngH%LmTFN_^0xxb~44TR-3a{l3Nax4SnS z+`t$4fcv?KA^erdW{m6!fC@Zs$61YXil5y(`k?l~6FR+Uljlx9L=r(EBbZ%1fqh$5By zl7&|S-OH|S*8UO3&8jt=K}a2eV*^5~Q`ou9nPx7Ii#PoRc!I(HwH(R00gAT$vC zQKxNRhi}b{JsSi)G-J;VC%qZF0Fg0U0TDto@IauXodrS8s!_@r2Vsk^lu3U z$}yS2T>^M|{~JpCpf81oX%U6=_0j_bF;8i2X*K;j`tjE_8)b> z+5KPY`{qv19-E!~MMrm`p{>x|T?p^kaN0U6n34bW18Y_;+zq4KTl!CI5q4D~+e&5DR<%3)EbFqdPT^e|eMVDIIZa zhYw=8--8ZZ;Qs{fAfI*Y>x)WXoM4%Z&7gmSa|e$_`|IMO#ljS@U~h5~|81{eWwEMh zyP5lk01-f6V%SFZ$KpB!gTcYpvY^JV6!kUOCeW_9kef%HKJ&WBepRXqI40U1!KL?8 zLR+S-th~KBFUF2NfEo5p)N~B&p`C4Y&$P$PN;Tuf_VrGCpb2v8o0sQ!E_>X@{T?m& zm)o}B&q^)G8C!VTBe-*cT%D8S*ycAZG>mS@_oj{?tq`)|-v8#cXXBxc8E z|1NMz+fVLh(hra@U^g__AD6>4ONaD^20KW3-ON#)Whc6e>qYa_3ul; zr*BLWUr_DGaEIlp${(BZ_UXE&Y#)5|jRcdNU_D`easJKsUOcKzNWuNRlH7=e(lrwq z4#Z+kbxPtyIyow;E^svRi>cf-iH<_)KU4531tcA))}$cBd>Tpma{|OifTs)uD@?$4^X*mQ>HlCQz}8eX@mouRNM z$wXIm0!MF+ZS_$_#S1_3RH%&Fm48SQ$vckVW$q)3v(>ZgY9{4#1v2Ki@g)|F3Obimzq%#GT$kpncJ%1bPaU^{aup`S!V&NC^+WnuVtpFRg@j z-?6O*A`8H1*Wu@vT315*?%000@0?P(`;KS+++uy9x*1CQ>aIdfWVM+6+)@J!9LQcB zx_9L6k%iF1>e!<2pj+uX39JubbJ)97^RRjdi9&rFe1hs@ckG4e&U|!_65Vsh|BK3| zd2!+SrPB{9kNl#tcCCpUI?u1Q!en7m+@td1R*mAw>(0LT(!-sh^YxP<&l$8F0N&8 z>4egJV9iZQPf?e|%r0w`7cOl@g_56ApXUecO$(9<1LV6d-a zE}oYp;4^G=v0`OQKkj+igjWI8$Fv4LCz4yP^dBA=f9@xOw43kwGYWnER!7o#xgA;QZ6}|Net$iGIMQ+8TW6dH#66=nK2sr zA+HN&PKjLS)hg!GJy>*-XBKr$>+ubIWg@j2Ec%nP8yuR=y;cK}cfEG8gMp#$wT`A` zHej2iyd9(iDQ02L_zWFq_PDONoEl^&QQ5pQy~$(q{0pLRC(umr}iL3LRew$g0f;E{Cg0k5An+xq_ zEz+-vPdmb!Yw4zj^+t2>7$3*Q^X^6nAhjFwT7QjFppHC%AyobF zBq?27m!1QRIo@bqG#*xr=0M}6rWG%<#QbW2ct3XUT^kc^8!u6Qfr9-MkkWU{o@X

Xf-4lzQ4A;eUsIfs#x;tudZ+_i%!2Bt_CBlc_uTuR%JYEs z{y#=_Fd{Z{jUD-heM-Z=*+8MDC12B{)b!+QhLoD2S$83_GangLB7;lM<@cRc@E19| z>8LrKuj1H^avZ=q#{w}BX77^i1$3v}8Mn^yE8@EWF4NO@QSyfrbW-p! z1vF05zoLNcXI4B>b+r z(zt)wRloVm2POS_@p@JvIzV}v9@Vx$ihKL~gSvM@D?MkR?=Ie$a&h$?Oe0RxmZ3y{ zRNb=RSah#c4?9L6D*5Bx0lhOtt~93U9JDr!UAJJH^DwC*LHJj#(TFmB)nzwELOqKbVF;X?*0Yhh83A0F_ z03y*IgOtHKO?^`x+E&|q2fl)Sx@Hm=O|_53n~LQVx*35-g)-46^w`B!;R08O66}Ci zh?_cVEKU-1$|Vn|AB$9ty7Id&&o-38vi13(_hoVot=;Hz@W$D7Z<%It6qp zm8oK0q~sPY%-<1=#{IK&F#nx!DELEK)l&$dgZZe&R_lRsq@iu$+;aWEhkj?XW7*cQ zR>y_A3)LMT+O1)vYq1tuyBEC6wx(5k<(kE1_pH|LShM2|;^hgV*-7OwADzA;Ury|xc5jjvom$Cq3n~0aMd}I& zh@iRTw`>LX8T>ty0W`bWPKL$Uj)eZ(DMdGczOw7+eyXVbF}Q|pUAxM?<8i3$r?A*> z&M<#_&C9;~aXGefCX8X}eM}A}_xEYiKfBOBC{4#7rMD<7PdUb{VadxJxC<1D|A@0`E*#&fSf zx4|L0F~Psc^PLM-KMKDY-ry*{cGJpL^%VSVpu>%pWiAArwI#G+`6|yh-Kn}4z8hZW tC=M!V39T{k1{weOo`TKs>Y1;fDcFJqTV=skQLy<~y!qn*eP$!{e*uPhMp^&> literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/bee.cpython-312.pyc b/model_executor/models/__pycache__/bee.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d829a277555d461b6d286c0248487f0e586fabe GIT binary patch literal 7195 zcmb6;TWlLwc6Z1jIV6XqD3R2QlIVwI(Q&A?WXE>iPV6||m6UbtWE&;{OLImNhbN5d#}gvF(RH3eYPpve>Vl zbB9Aww2X8w#53ofd+zJp^SW36?(=yF6ovnX><`0){2nV-aW;^x1V_jnA`yvGNRCT# z9O5>`mb9_2J;x_`Mzbr9oHOZUIIp;JLQ-J3LviOkNe{!FiZ>@FMTWZ+U(TQO11?Bz zC6Mb#cI1M|V6HRS$!H#>D;G+J817ZVxkxg?5gR#2B=IJZd~)XlySc_>H>3K28jw5W z?p`1_+j>}A2ebucetz8A-p;ql*=HlNIRDyCi-pNSxm)ge*qo1z*aaho0?(F0zr!!jiT3X8%@<8!?swlbl3;CJsEYv#QORMsk zVlKDz(E>84KUnP*}2*#U-Ydb8=p$)_#0Hk+ZXNnu?p2X{I3&_VG6=TOR^)hsY$! 
zNdz>AyGfFE$tLqNHpzaIPdX%Cc1jNTIpODmp8!9%44CYkvEj+ih>{05g(YuXG=lHR z^6nL9P!LwJ0fI`hg{OLll*zo+((;{es8I`vSjTyh+~=m_Hp8CL77gF5tfl5=QW@*W zcvV(rRGcvO=d)v4ZeeU8yYQ-(roi~>V)4YOlUGj2V@me&*us)FSICcDg99JS6(m_n z$%}HPs1>MLRmU#Na$;eLhG4ML&ANK`lrQPiyMJqJuti;I8+r5 zZ9xMuoL8>HI@Dd_a6p3>ep{~txC6HI5`m!D+KR+wNtydK_cQJyiQA^{1E5{NV{|lD zwqWTJG~m)`4*(TMip6LI3H<;{`(L`X<^WObhyZstTG7n?jp_mT8()15Kvfv0-N0Z3 z3N$;L1wBm7G1W8tO~Uo8q|F&Zej%-8=493A$QN@dRkI~0@INoB1_^G%VVxLojK z8uVw=4RYZ6$g;C+FY_g6Uv>fJV3<(0mAMj#ciCMQ$}Wa^%I>n~${E=F5v{3Z8jj0K znFCH6a^eV*{Xs+U4;nhW>@B;>-V!)zi7!FhvRHOWd|53b7@d;dp55b z4q7PYCFVN}7uaK3)2QL1@`94i$Of-orCK~Q9Y8RMU$u7$KDK_oHUNHoU?7QrMXNtRMGavIEynWlVQ$>!x9 zio5A$=`gM<&?aNpF$NvOXRb3flg@xEHAFT=qo$(MiS1dKKQj7K=(_OaURYA;$ct|3 z<=iMlRR){ALAFA~9op>OS3P)4?>$!W*Lq?r@8~__72js}@M=KsK2Z^CLq}HsMjv{s zDnvKE-POLte|Qs5dZSM|!nX^n1CIvv_&GiD;p2Vl9g|yjTc5x3!@867jI4U}?(wy& zdgNr~Ty5Z$)vIeK?*CLDc;nXefA{ob=lrVw(U3m+p&mK^_~?4a6m;(OSI%SS;gu!5 z`v+?m^vG+_IT~NRu-1M5b3J;hGPM~!y!xdceZ4C5*81aBp$GWGy&ItsJv6c&8m+jW z3=Xekt4BY0>|Y=JNmb~7(l<~YJi0M>N*_G+=+(!a>w}Z)eN&adW=H6b`{(YO*mw6! 
zT|B&6_(piV?_a%o;zzoO8wq+U4j5Yu-k$jTH=aa?R{U$jdhEn{^kik~N&n#8(D z;~b-PC+C=0jcgJ4)SW+Y9EZ*H4b)wT38Xh#cO&K@U6HyMF_H8S)_sWikuQK)2k`_d z>h1R{i?@chgFrDS#_2NfYZl9&ZsGi`>5^=hdD&qmLM(0KS=!`uV-oF<9TK;T!^vu{ z20vc{*8{zT89{R0bR^wNJcuz2s=i0Q=+5OJ(JAINmF|N@WYA$*g$INM#bba#TE)#b z0+z*qUkO5`<=U4ayIZzHsI;ML1$W9%8;)!)Ju5SxSaQ9|DsMx&X$PZ>^QN;p3&R*5 zv{2Tp8eaBIX@x6tUd2_Qd7IYkM`M;SeNJhR#+DB5xh8k z@ptYX{yK3lvD*D@-_dU_RRZ8Be4*QyHo~vy;a6&r{k86cwf^B+U%c)nkt18g5%JgU zt`1L~xZEDoBXq)?aSwePKJ*;~?;=2(GwiBHnHQr!LK@!FDQ=qn6@oTzb_Qt(K8Bw< z2jC{D+jz(5s%y=@c5zM6M^DuW)E{NG@KXc2a&V!wM* z?>bm_AxR+N=-v0L`;JzlNA=M7+9f@7s*at&|9PU@Uck(K;MBW38xI65Tb}~>yB6P8 zAu}9Y2L&I(!Nk}gnQhY^AE-F3}H|6icHoEVxHb7J3s z^ZIA*4$+zu-m;a+SI};m8%K6>_Ic}Jq{$7LDL0QN&{pj0Xw3R(qVrO>){-MLl)?qzX+#woX;aIU`0Ign&_Pwk9i`MWLtazxANV;t+5~?FR0H@)hQm6Tx@O zr3*dZ3Vly{`o0$Li7TIO44=}6Pi+jJ)rZeMKK^abg$n>P!|U`#KV}dUC_luEg0T*@x31Gwe2CH{p(!^02f+EXzzlf?$CBd_{zt8fZ;P; zbR*q=F`GakFhl$qAncK5k8GmMS@CQcBN_6yl@<;#-@T$|+14BbIWSVUm0cWg?W4>O z^K=IIXchqm19}BO<5X!LaJYvwbOo+n@StWMYa=mn%@im+#Ym;Wy`55`(f+qEkoqnF z$VGahU;FO)R&wh-$11)TYr;KYWnjH$ti5eB5QR$xlR5;{w%~#41&J?s!8yZu3Gyen zzFtL|uc40_Sy57{`y9;ymAz`cYbxgFP^SnA*l+;ApKg+En-ARmpwlt1^2u+$_|+G6 z0_d9jFUoI~?+`WpKlXwLA&fqT*a-OSSuz8ZX0e>-){lYI8Wi9~){c3k1Q&jMSoo#m zq02hMB#-aLzvLtjq`h8vB@`v!3F_E0m4DR=m*Ka?A@{yR?_*|BBY{t zhme(o2}_nY&nvqeqf-g8=AOtZ3P7MfGR`!K$OL6)4tYd3hVnd~AR1<`^tbedJei%*da z_eFmSZOksGn{Qv3kfJX`Ro=qL*cKrnX~hT$FIcvRoR#Kt7zwdgTiGjSF56pPjqKK; zLX^R{%nEpQ8@#>JGSt`-<4*e5h%-$VnTV_%oJ2Y*n-ShqXmN*`UpB|03ie={juT?G z*y@Jy_V6-|Y5L+{Ku2`~K#MP4`TS7?F45l#@799RjbL04#y5f!dT^rRd@}sTqYu`H z->>lhDeO0m)ZK2R82pVOynevpu@lO%w{(hYq?nDe%~L!OS_IH)ri(~KP{D=^0KiP2 zi==l1^0kq-Y6r$^N6sMZIr=;xL_OPfPp`AehwDA$J?;$m2V!@e;Z{m@_I)&AejoGS zvHEicO{2y+;sZt?Q&1GhtMGpniS*?ROHV0)AqC~8?nrFb0dF@WXJhI5L&GMI|iN#)tBs>u}Hf%$I~ zCjGV)+z{S`wZq%Q+w?0K4MUrH4?x|{aoisT!X5lQ5ucLqQ!?>CWIw>CWb7$9{vFSA zw&w)EA3X$4=P8M8J5O-Ot36}e1isrE*T;FP(TVD@vwHM{Z34CJ*CX7(-H*Ti?A~YF y1kn08v5i)_&<1x%=ML5QSk2v63ywg5x7oHm9WZKj|A$)yku5LKo--Od?r0|C>CAM|fTFg~ 
zbkBVMy>F|+11UMqoHLKat^4k~`@8qQ-~ZqLzwh7rd~N|(-1-k=uS^QUU(tF1{xBL1C5EMfu=Y~=si%Q37zwcSNf1sDetVhgXv{4P+#X=hp+W3(4yIIU8#B6>@ z%snh-3u3lDB<5Zgvkfsr4~h8*i@5* zsRM^1+XoIuc0?_sUzIDO`-$vaC%iXe9blps zu1Z@69%nHhLCk%+ae2(3<&PrH{^%hkrP@n<25}D{?m^>{qhY)q8}9}+kK+b!wY{crjCt|jl?eDuIBjI z@OUIDJ~S#o^0?!{dpGSUeiGNS(VJ``_gI5jvj zhU{yOjmNJh#-5U+o{fz~heWCJ=`+uj2R$?XYH3i|E(MOohmy(INGvMynn@LXvE;;1 zYWT8zZz*4Q9=fR1JlQZh&5^(_3D#zf5f&*E?DwI&|_n|o{pro`;Ray>;@1Z%V?MT%t zX-0#JPy02_2ceBx9L4uNldjF`&`1lTElLdSH)2k;DR-kG#i#wE1cUxj7Ov3_EvDj& zSd^zqE!ey5RGw?Ucq3|$p>_r_BGxO92!iNIZB|l6Y)bCmFkKhI_WnC2$r7DNO5Ta_ zWQ<6|U@Vdp>77f?p^Hh7vs83)H__M7X61f!NR1p-C{QiFq%=W;9(iHLt0(4qcCm{o`Uns)&JYyey%bA=1P)%jIx>P* z*w|Vk*+FB+E=48R)v@F&SEJF%s8qkU9;`UY{oGLeYV^1$j*E9JXm2z=B5q{m4h{|z ztsfjr()0|4Zv5iJ_EchG`$TMFTWUzeqO$F^tB*Xo@5&?5?eW;f?Gx8Q=*PBS!xFWf zFGtbWqQjt(BEL;;zX)>DGjUz)Mt+lZCB&si>HjV~Oa21Rf4U*ubNX&Nvd-X=v$0Uq zn6KHCt=TkfebaTXZBM~lmG`z}y)ARs@@+e_Z97*i=8${Zy5wwLwF{m5+4C(~@0NuV z`R>En?!$QA$)1PseA7Ypygln}pFffB+?(y(i|3oz^L9LMY<{QtcJus5e$&3}rhRuu z@`ul751(IZxPa0tmYv?bvpMT*E_fU7c`M&Kd-LpEUB2OwY{Mft@4meE@vQgp4|e9f z$5(9@m%Hc`x_a`Pk7qX@r`mNEJpR0AbJnwYVgHAoJ@=}b3xP8zySkz%n0*yRi{Rb6 zg4PTDk}EbgaW%!pdKdwckKHHvuuDpIF`Bw6jt%Q6D!nNBTe%Adv{;6*L^qW=HDyg( zZd76}QoOQD7t4YeGI-KQN}JVuY>0v7r2I--)fo1)C2a?-vFOS*?4Pb!_5?GYP@y@L zZ{Cq@-m%z_Yd$c2Zn?ER-?}&3x_2?1YwgQ8o0dbJ`OyAsX#d^AxzH0CXUlSJduHSQ zZ0-Kcp{KI7PffcEwGFqvJp1MOCvvshap!NyI2**HC{H{AC+s3pH8zxpf>gN&3HM!% zlixcy_{!CxIDaD2GRUTxxDP(b&cTuR1l{=Li9H;R#|H=Rn8e2rnxvs}gmr*ORGK+N zl9LzU{HHgBqPfmiF*o_CfS=+%p{Mu0tJ2oETphf1aQ0x)27l2m)HmOX&&G=mx^oJ_ zmRsrB^n4^+zokf#i*CW~f2;Xs^JgC1@itLe!}wD#W!YW%_Xv6rGKyHDwy3=Uu|~`h zD|itH@Q@v$PVgpn+`E8u+@p1B&Z-CTop|nzxQw4iea7Kb7DG_(Xa&kK%==+!!Ebp` zwx6M_08o~fvkQb#qavfcCHRVFH=EIrgFF;prPLWas^TEE)j~??TEPfeDRZy{L;01| zqXEUI{Q@_rYx~1O6`X+FH8iBX0iU=P1@iG-1D{kmB913Y@VQ7? 
zN|s}1&r1#&xAW!H&KVwY8$GvSp-)C7>xH4Q%aR)`QtTSzJ;XzJsw{haQm{3vnkWK) zRqBHEEJb}1PU(X6NquL&es8vZ?;pMX{jdGO*K+kIv37Xx)z;s7arQ-J(OTS_-*r5@ z>v*pA1fII@Vcpq$d-J!#)7EeL`TXcZ#=|NZ#|^K^t{}O zX7M>Bld6e1Q~}Xo3OrL1G=T^zqX{A2{FiV)*{W;4K{ASytc7z!DEO=2O5aT9{oAwt z?K%I>jB_Vzk7T6+N=7&}hBb&N)fZ`ywptlANnzHmGEOsj$IRD=FH;?<4?}Xpz5@T6 zhLt5Aq(p?>%Nn+5&H49aoO^iFe3U90863Zw!n$qTMk2thjrv!LFuJt%OquXnsZYwT zk{Bkf)h{5PH~=RsxAG$V`c_J|!OPL1$YenG0$B))*ooL^S+MyX_igw5pv)oJtAE&@{C#8VWSBYqfvDd@;iW3=PFZaKPXZ?l)R?s`aHDyVu z>C={!m4||fl%I>Cw53mY%Q|enBn+Ehu0{&$!+^-F`OR7@awh0=o@R^@Y*AU7DGS9K z2E83NPg!4;XN#537I!Qe8%w5!#)d&P;=tZw1T_rX_~44dFOHBy{VLh{NJtV@W(b2% zB^m>hRk}AJ1B*)tPksf?4FM}=xn#iGs zi9`v-r{L%j#WNI04uOf}O*jlu*leAIDDYFPFxh(N&o4A(H||*x;4j)uw!I6UBK&(Z z`_53<-lE-NYnk8k-GlEQToK?cy1llRWnd15G8Xo{fAG#h!1ZkXzM>6JfJd6P6dmMq z%D`jJI``~5FWr7A8{Cq0ZTZZFXPnGZ-G=d}Udl?kLT|DJhB!z>6ER1vBUUD+vAp3M zut7v)H{zlRk{E@{5XH%0fD16dDS2o_$g)DoLtGSOf+Rzf?6Lq!ofxQ+4}WZ8wXekm zz}cTi?qY-yF!)WQDo#*)gC?1wJZ8$4qMESFsGX@wY=2bm4RJtJ29{py#oad?$_}ldxBV84%?$0jvB*T$e|bm=lO8zCsQS8OeQ)0rImT z4U&y>;gw@o6N56*WHm@iE_&CDua|5rNMT_Tlo1uqswk?NOD+}-%;0MruIt#A8{!d4 zzyL~(ZV;&mBr&oPKWn-afLt(=<`rq0)_rm=13okC5 z$u#ZzTbHe}s_3PN_bJVa#a#zR`H6d| zjwB*E*YKCz0|(^uzS(bc&p%NV@N?JlDg6}N?J7Ym+TaHXlsPU3?l`HVd`d{2!ji8L zIoFdqmq7m@e>Q{QS)$g6iE%%WM$0}Bw-Jyq$g1rif)4OoPNPXK^IL94>^vk{7HLTK zmyK~Y?uQk^iwG!Oe)E^t5;=$;N{O7t{D4%^pennJ37aul!x#eDA(HcZW{?{>H)Gal z-O>VO6~Hax8nH~gEDOrADmzvsj1oalc74IJFeAz@xm`AtW!IJERpLS5>gb)01L`@EDC(3Gr_sUh(~d&JM*3gZJoCcbK&HBD)#qpg1%0z(wMuB&rX_6c-=NZ(#z?29Wal4F<{d<*~w^&s1+jvU(!d9!@=oL>p6QdNb=I zd}V>2!lsZRYoiGk*AqQjbX`!p0b_H zHHccj*kG|BgbHjbFBkF>1`xgd0(!s2XqBe zwAXLGK6e0O82{dkb1$EMZz3`Vcq#ReETC?ZZ6rQEloEegB0_V9O`LyDmAVFp_;jyE zMEpSzk=kIk1M%*AV8jAIb zZ=o7`5>tu|hV>xSji^KoR(8XE7L(Ip{>FRO&rsK-ZKVG)EY;@;ksv!H8$a z-WzHVF^=X+WmR2B*=qN|^&e2C4+?Gn#%`G*%KT|KW{xds)h@zJn@ zvl=cEb<27hsqxfMe2@uX5SWyzv;d@{h)KzMF*cOcIFLyjF+XErhvZVziMLQ7R7Hk| zpzPS4S)ex}{vL(dU*T1DAZJoKksMSS z6w*Z6CX<{>D@E!Z$;zs@X2~ukK$0Uh!SZ_GvfZeF0c~YcbTR%CMI}eUd(+Y#&@J7C 
zCg@=-IfM7A_U5YgO*@v`dYS447OcF#BkS*&zm)ZFTde%h-8bBd zy*rhy+l>r8h59DW8!ha9|InR7kQE~o;;yD!Q?pZStD5fb1;c8M2p{*m| zwl~|h7a(6nB`;>GH?R7g4Fp?!t2IJxSmqFv)Kzu2x@Wr!)D2s*4O{e(=h;kk2htMc z`KF7nd{K1#T0}$mkgXwq49~+6%}b!_Q)WnX&4vuudh|rUYM~suFl!A$FIgfc2uI{~ z>Lt`{$~tAk+GMQd!7#F-PzzlUXnH^@`_umi7b$pQHURAYK1#zpn~29!;ynD~yX1Ta zjv^tYu8}O54U%b4GQA?1u1Kb9oQorx$GA8+;t|}i9f`??wF6v+US16CDn5f>htF`wM=4ce7=P-eEUPQj z;&OZzQP7!{^jW`{1TpeOv6_7iR+~YUWUR&*8&#TAB5|Z4#HQG9z*7d(9w>24o3K5> z<#=W+Viuijjq+oc#snHJqoy>;|K#bUhhx`;JGTA{zp(c(ZG}5F$;~yx*fQba;xbO> ze;4IwE19?%w|8_5p-AhgNCTd2R&eDhS-54uOY&udW>aP1sW)eosd=pUYHtLcT=X0dKkyT~@ohviQ}d29WS?R%q0aM0BJL$JO0A zZB#oJtIEy3kuII}jc8$1XI7-r0Y9|9m3Xv>Q5))4rKkXcNBTcE$YEVmiC| zST5LCvJ-#D%mqnzj7bY^`1~@0B$_B9Ik12dk}-HVY1#D%lOp+TCryk4BoAs0#?QgL zIvEnX=RzdrZMWH43N7ul^IEi!|Ix!gZ2mzr^Dnz9m`WI!KzW0L${xm_dMT)RCFZT+z`q)~;n7>Q^AqXSRD^SSB3dO~SUYgVHG(6M0@;9sMFlAT00`KfGl3Gb07 za5EVsQd!a?GWs}19YQY#;^d|_gSnp%?#KprEVkr=kCjY(@n2HWo-zTD$N&Ws8rS3b z=a(GQux4X>2UX}RxZ^?}KH+@<+kQ6R@k>Oit>_@1Q-Bo6MLr#dpj(b1$k$2Q8ccLt zByEkw==bA)5dVz4`~_0E#`AmzPu9%y2gs0h^W5=8p#mhz#^N5XsHus2eBv~!rJw(0 zIKpuL8{mis&427&f06mW4vw&_ha<%Q4owih4+qRLiGn$g`gQV^w&p=|ob8R5E&NZ= z*g1iG%4hc{wD5O5aN!R;XyF&x!rF(NhiPsp7cH`Ep8q|s_u@Z8K_U}}QjwV2`nk>a z_~f3DXZ4rS5#qm~L`TV?sVe>}a)==5b)slAu_1Mux-Nygl4)0^th3~iCm!>1Su6AE zrXIWJbB}quzR;4(>hdhAU_2kcfg0-PW5fhwY$)JGEdSUPGYxP+Z~L}6Li;(`$)P!D zyiY?a)KDmJ%W4CjH!21^5pUFMWaEUD*)SX{vkxnC1yfF_T{m@waq5Ua>W{dYB12%E z82ssarMn}QFp~>Js-l(As;I*_j}bGql>VUnP>tGFf8O|LbxqX4Qd%Q5NLkAeNF5MJ z9oHLzhEX(rfw9W4LbSoaN4zdsu0*d(wrga|!*&;eQ^!!cDT6N7a_9nu1ez9Qmyxsp07Z=nB^^TE1d_@w1##=)jHY1*xj54U z1B0c?9T1a{2)iI9gr1kPC(12u(Fba4AwJMB5P>o=NzO)cPLmUdBRS-hkVG)*s3sR# zXR*zG>K`Hldep<{0DCVg+)2=^K83jDoX~RJhJ0O5wyq~%w=Y{q;wR5XjbVrvw%*yA zYlJxLP5-_6_J!9!82R9ZAH9${_S{na^WcTNoSr<6980#j=#|mm%04&@@Zh)nG86!* z;6~flVm+|~snK<;1E{zXMPfdlU@Iq^CUL;Wo6d%s?h*9>K@qMs{k5B4%lmt>{+@;F zIsc;>=cBAal1nj6OG-5*`}3UBOEO!Y5*}S0pl6Zy`Vfz{Wnm^seS>R?%#~z658-ZF zH8#WqGxr0~QIj<*BA`9iDgO&X6+I9ZBS+De$T>z%6b@5mqoGl9#hKUjF5ZIY<|0M2F 
z%9L};HRVpb()Md6(e$P_Z8tO)bETb%JOg4ud)mfgchL4X5x6N&+G9j2ro12!-n1v} zNV}Ey2Ln@!q4Aii`e_nO!kaHb+mFh@O8e$u+G?C4?HEzDO^^l*J2);_UCSc?W8n8u zJMl;4{8Knez@JjUf2#}lpD5sc3b0AmNNhMI1<3v?!tC&&4<8kil0%G6#D}0R?&EK5 z5F4~)DsCqe@v#W@aoI%sGvp%GL9=}LijqD#m_QsdNv~yTu%}V-Sn`dhF2nXy9u1Ob zf|=G2vWi%#B9c!|$fHPZ7K2%jNgla6ES@uY83u3AE|qL7iumWKPSOX5jb_CWk5Dar ztO-i7lS~vO(>^LrdGIEU^8XnLZU{d!+uGe3XDdegh7Q=g6@rb^zI%0zbGx~MQ?99( z>Cyao2P`(*ru$~L{rR3V*}%>j_uQF2yM)Vc4b4mW_%w7TIQnjFE5^23LGi~Hs%A}*+BQgg{466M}gX6 zy|DMJsn{sg1m_NXSiPxGvmsy8m96Po=v?$=!iRrc^H{M>=-gUtC!PA%T^awbjB^*8 zf+aJLmNho>eiNSc>5($C6tTR{&H;1vl1`KkWoxV9d{Rd(W~c4JRt%UgHjHOdduC=S zVh@DuzT}nIgnS%9Gzsz5U!%ri2#(~{zdFgzt!5xQR0pR>n?d4VP+U73RXFWH9$kFg zt)YH@gSgzvmyZs83IBu!#~DPx+OU0Be*4kv_M;zc&TW4p>+GKPW$L>>scfI$eb=@4 z`cmaF2-8V9ZfiEUb>UntxF5{0vzD9SW?FieoV#@9DG-YFhwIB&V<`|zs@7o%f=^Q2 z1fOZRgZ-pB)wu1Whnh;9G|ZJ?sVy6hp%sW)&R2g(afzC-%?6PM0?e?lP9P@rE;N0k zDUrIu*fQcz=m|L^)FZWUhzqa_!RCy$X1TTN!!5@$EqxhlBdJm}6t?dBna5*$ff$0i zP|?A>PDt&GF6MO$jjeMR--+Fh-EJ%Lh+e^6@u`nJZ8X{TFVw!@e5aYrD)!&q_`~iW zbdzPpeoV}RM@)rnJMVjZwymmBC;TX=rh%DwI_S^>C`cd*v_kBiQ4t}VgB+JpWXNTcYFPH3FnSp05n*xP13s@7{b0GUrJ0QCu z!%y!p0@g)lOKkoV%TqlX$OqHomu*qGB3gif_Dak^FIPN$NIk?%`zs*VBD-NBj_acq0MWw^x838d6Y=9zy6*3DJc=Qm3>XVvPW7->Z;I zWu!Vi3A)qj8@iyR@O_5kv^^**urqbPX^JsM?-Ac4y7&+r9xMe(PW6md?+;@ zg67Y)SR@)!$beakQ_M(c_-ZmWo`B{Ka^S>Xt%*_xXxNCc zi#V+&xu4}l^O0ZSAkQsyAfj>qc{|!843(?;}E=1ylxYUj zu869OTOt8cMektp{Ypyhw`7}4l{ztLkz`{pKqN*b>?)HHN!+AcRBZ%t&Q1*Hs_`W4 zNFv%WIKdI|aOz$h;_zQ6uQd`R98@WAsX9?adi8n@R z&T_DW?-1pKhqJ-Me2d86m2q}u{9Vhn4f)zlSqR^Q9p5{9x9)@bT=!`*P}{sBnE8e~ z$R(7a>bhVBu+WmLdqi6sAE%ZP6fs^Pe-)A@>Nk;L=Pb(6iP*WmFqG7{^G_6eQFZoQtpYW9x(Qq+20k|jD zO)UFk2xY>cu!)Tb}%qF{5mIZ$2$Zm8z0rT;ZN>l_SZ);Q9L=?zS_KK1Cf_fQGCXtE|NG~!yEddDRp)|H95@a1R~X=myahQ^{9j zsmqY2n|pSMKS3T`lY@E;^OTD{Tn8^j$D*%Ih=(V84F;zka`haH7N@W9N(nH!sn)hJs0kBDiEIolr61;3#7K8@rcS@lhGC*Pi$_I}dT znx%&4TA$4|J)3a`Kk~QTt8HHt@9tY1TdF-ieS&lS?#1*{WgkeU9VWlJ_LbUi zIMAF)qc-bT1S^V4)*%Vk^+H2F%XpcoR za@NJ8yvi;uUc~fmRe;1vm|61hGqMc12alL#&1)`9X!`6S#OB7X|Qm@)iL# 
zsDysw6lP(PA(-POK(41`m2^>O?LZmSSuLNmZOpgr$+qoTOyt`7?lrXK8^YO!@WQrS z!@m1A3t5eqQn>q6#n<74>&iG0$uT$>86O@T;O4Q!2ROdI{(@W-P@4eGx=*GWbjWH6nvUxo>saoW&Yhlymgh&jgok-~42ee(vu2lzxhJ zCfgoOB(p~o$?VaIWV}%2RL~xNcpg961gc0kBMyNpbtlroi;@w&z`{XvKM2iS7XWOAjYf| zxbWneL=UOM@FcF*%aw!XkA780wMO0oVo1nKAeC^ew$ zO6$&4HKYWF+SwYl)6V^(9>du}rco79Yel^$dzpc*PYYLk8XeK)hvl5e(qY`)5I%AG-tyn{ z&$+%=`+n1%rn@iZ51q>%I=AHPzvpSXZ!x)`vnbenm~+)l`KnFXs!h47t{Z2TYg_ZR zTe7uV7WU<8cQ5Tbm90H>qkq}z%Uc_>*2cL5OV-WHo`#H-ALK`6sh30ct&rGBFGIk| z3vT(S8Fu=@h@IK4S=n(j>m4ft6%26HPRJt-kDY;HhG953aTn5Q%8nj>9L>42=S~4* za4n(js36l4Dmije9m=$#K8!8YkfV_-5^_wHz0<1;=u#0eF1wV0ixleer%O>$79F=J zhtRw{)sRkQ33chzytMQrXN{0MF?NU7pkkRO=rZbs+@@BqJyUz~D5bb?F0sF%d& z$$0^eWIcKO?6d5|E~bMmx%ufXbg--B;OAZvkYk8s8>LeFetiy-9sMG|FqL&YLSvg0 zdjL9duBmq7fN~Js>kDUcwWNfgKYR`v2vEiDzTG|l`rV49Mmo6;ItdN8uFqbdZ(10= zn_8;xQ&keAN(Kk$NbaP(_g8{%sGF(7RyM%_-6tzV_{uMm^6*PJ<b5kDX1Nn~EZQ z?RrLSV(t1(c-Chgl@;o99DG1ps5DT@F5Q`{6hk2xx>&jx?C>%sH3mhbvBpQ)F7z7f zPK2^1YHOB&XPwB8UVjE2*e^BOXY1T~QyXxEJ>okoNjVLx@mzMJl3| zh-df=vROaURxw#@HJ0m2sp|YwXOH$BKacI*WRy!1hY0MuX`Jw9A)Ne=hl;?HOEvk3 z3LOe1WEQonOiO~2ph%?WwdH$v6 zUglEXr-scdTI$CbKX4biOaLn*Xgl~|nm z>*k<5CQCO9!4TFYU!fTrdNA}@Bo*lbwJ}y(u?UkPSa1NHXE3WglM~GF`^` zV8lE$;mI9Ad>O$>iX;QYm*Ab;WKd~?BT!$V`V&Y2eGJr3Ux$!`Q;RPntYleKUwb2- z=ve?}R7u6-LcJ{g02ES{7=7hxOaw~i3_DORU`8#Xh(mDn3!?ZH1gu+Mc?V!nJ{Fav zBTUbSxbXJH$ddm+#(98OnJs%G$gD*1ns5=-edQTzX}!UkAv1m%u_+4P605<07%XD% z$!SXtEHE1>iINoZfh>$wr!uO^CUJJz}16Ku6B09VDP>1@;LurkAn8Pq|Vg z17H`XlZ)uDy0p6iYGBZsc26R-5qfRN(VKQp`LJBq@HBc?6w~&vPv9BJips99cS3&+ zC$90mv{5Ri>{9y4I6WPUAVMmO@&zNM>aXvn{L^k6HGpCIxk{k{T6VEg{HYct7}|Pr zj>=m`+kI*86{HoP()FA$+Nz|{evQ_u(RS^*;#2#(BJCXAsDx_2&?Y1RUUsD`MpR8# zB?qvi^tQm}PC4GZkDh8PON3kKA9~ZiM(r*8b%HkZa`~lux~2RXa`P)?E5B0#luO{I z?7{~z=p9z41EYlc%P#OAs>bS6BWU21gs|;4c0mt|G$cN zll)sW6U(z38&7lFASr=D*^+%fW`}(}lYF>67f_iS`e%S8zSE z`QUO*FkiDVTeC4!v$GJ~Fu!5`>G{sv{(P`I8|=;ndlozMy~nb>$36@mTMliQIc*Ss zq4M2l=W!y%R=&rY3-&HX@_SEa_n!PPc=D6h4fElJr%{h*bFI5(PA&tTgg^9m+^ead 
zv%cfI?aK$ZWrN%D!Ts6b{#?z08B-z9ln=CL1MTy@bbj$-CC(1-*s~PaTL{$6ZJ71U z*b9LmskhQG6wfTU7B}Oo5V`sT)2Hs?faBL^U;oaY@9uwh|H9>5$0Io$T5)QbpRhi6 z^+SJWp{e7YgSQVZ9GyFuYwDSGE|=WroAzXz_RKh!z4du-Yu4L3hs~zmY+G-xZTFIw zzN9iUbo0z_oG2bM2~~}&Crm=1Rz7y>=#sxD?|(Gwe>A57gRu3d1Py*l(15|l3*nkF z8NAp^v&%*iJ|&0EIId865d)d?bifY}J7BtrqR^N$X7&zKDdb$BVfo)69*Nz}ZiwAm zh0e`|4w#-bX`}fqZB%TntaLu)StO<$Ke&#dl*$# zF9j2=yoR81{*iWCp-W_jJ`KKZ0bNFX#Y>h5I-yTw2VdtPQ%3NBbP%2kW>KWOX!{)I zNz9mv@qK>K-wKJH@_!ym>yaqu{YtlSCa2v*Io9s~g|?P)g{oYLb4=Sp=Z3CCiQR+1 zA$0M2h)zrEQMf3U={C+{L6K!G3F=Z}_Vn~{29WQO63qtxGKL?i0w;%{+KR5Pwwotj zq29(Hy9mFMQ8GP*&N_KO*|^8%R+iCY}G%-8wLEOU@&OiiUi}rfkKg ze8rAz#g1tUWT_3EgBy>s40ynzf`){!*x|EO z*biP`T?fYm8EWFN2KJ$1CA^~>0c`A&;sX;sBB1;-+De)-YZ7j3z^O9uWUo$d!;t3! z??k1RT{=DslJRxMt3uZVU2N!vMC{6##y1~;#Q6mYy925l@@R+MlKzQP_r#`gYZ`xH zG}?6=SL#>?@~%)y)PCvQg|sE&`kq@ky(Qu)*)sR`b8xs9X+#?TB3b!~ec9_63^{rSn7_qwH8VgxUBebTfs~+M#H3upu&m0Qx>2 z!0*^4rpSS>HA2_~iyk|PqaZIJ#+}a)K0(+veK(Qu2Tc&YV%xu=S@KVi4TKvNRd0>n z9L-m>Wh>gIEuZ-7W>RwlOa88V&6^9&ZSOpJ`$@=J-#L8y@V&N8^T|y2!Mjmx>|HS9WdRTATzC|ni$6o1OD2#?{4;2hUieZnDNbRS;CcCFL}oFu z)M1XPqLOGW?XO}2MQo{nl4HcK`LXCb-mqdCRVz=mK|f`MXv2m-d)f*EzT>jqlw-=t ztZ=n&ues8$Q4%PXT@k0k_K7tx^HZmg77gMAXNf`GE+~3^o_Ht0II02;g~^0jp>eMh z7A(88g$kyGZX-rG?ZKBNyu;?pc>gcs4Nc*rUuiFiX@1N6ri*<$!mG3o3%p&tfK69e zq(^*_iikfF__iH_3_IoVmU9?}l*px0?m_RAlLQ3J)X+o14r&VZ1ing9jnB})cnEniR=VSf{{vC09%Oz)`19H>aA4K- zw;r2)Y<}mDt2g4jmG4;QkN&=W-Z^)8whv#+sUz!wn&9*q2F@n~Af7$Ygdfu|HlhtLxbfDy9qZ+7f?kWj_M$cHPdi9CC4Xh;57tOvW-9z!i_Q_4< zRZZcuBT4Y<34UIfl0oOg;<56#tRIV=Wl!HBRdVe5JaJjBt>rPx8rQtubJoUyDAIUQ z*UK_EsaqzA#pE9(Vk3+sHag>y|C4S*aKZsDh@o{sq@`25P7Yb$Np7wMnS@ON-{K)3 zv-*`xukyjl?H{3lLcu)PY3EXqw0pa4R1d>-AAW$!mK~jwsXnf3;D$@ z|Ezz$X8x6V*N+2RaMpZ2upt}RFu!}Daz6ayKzGq2)Na5rZ_RDr+5<5UgdLN!lll6d zY<Q71O8g2Zg3O|MQ=P zh57?OUuhMBPnnq1?0@X_9&d5}ajo}wyZw*bU2rjy&d?w_OU^lRT4^MC$j1gFu_5Iu zghVpSO`vE17z;Uw3X9}?k6!s8;xXMqD})>s!lMT=&piJ#OTf0T(ArgGzrB0-@7X6m zrTb!MBUjqP&)pL%0)C47c3^AU$n5en0Z$kOWF-!xF!fS^`2UB-PPDPj%r75qZbTd3 
zKEBBwsb2RCDBlB3p#1o(Cyx;@HgJ-kRzC3QdL~p}X8fdm__Ic0c6%qr#q-$PgT`U> z7=3OzHWC}=-=H+(opZw+(L!S&fmgv8U{#KR0inq;zKXjtImVh3E7y?`5Nl9&u{|~; z7^~Y>)*0rQO87LDuS}_27R+^Iq@b5zXn@s_bF&!jkYVr7s9wg>k6>A|a9g6kr3`9P zMYXPtYh?9H#+_Vxq5-?q1|_Z0z7=i18|}qur*x%=4P+N$st{iZ6NJ%CnqTo5=NYlV z;zI7}Utz8eE!T+i+xk-nRh9$tAy+>R(+0#7|1&CykBDxsS5B(-f35>0NgJy)2=)6sR-(P! zynVr*tKNREISfu){)UEIXli*Ud^^m(cuwMf=Alog7hok_wIy2xz_2GNM_|H zxp(LKvZ_z&IzMUe%C{fLwjaQm84Hp3$M1~8%%P(SQZamw2AkLk_dNee%SVdw#?HnAOmpk)n4M>EA(IYc%1sZ4^9aOPeYvk{_e*f z-7)ha@*$Ozg`?+0xG~zc2fsTe=hp1fPM}^Z#Q#D}0)TY==JmY43#uy%4LR)u5pj)7 zLTwT45thm%RKCy*DFVeSyEN8|$hLf(6WSF*JZAZGN7njdfcHOAXc11C(jUX1TOQn0 zJ;QaH+DY>aBefEzb(EQqpUee!mmFJJZi>g0vP)Ee@!i?^m!!D#}wb4cE!fAx^+^y1-tq!~M#(fa)hG@ws|SKl>(u=ppiCz{!mdejlJyWE9+wwk1apA65wg!P}Ve zY$SH~(b-4y)g9UDj(l}bwz_9=!xE0p?>+fp^{I^aRHph=p}HBL6U%tJ;NJ7L&p)~_ zw&Xpu%m;0`ks)kAN=$IjZXC6;<^!?}|Ne{`RR)5!U3)gu=soM~ZAf_XfStZKs29np zrIu3185-DZz!wo=V1pkru)$C9gvn&vty$RY)+}sx^Uui|!C?M%*(&)=lmfnW@2gU6 zaOOL+)tibo1dzl)J{yJZbcQ%|TV{wuL7=Ox?5n;h>?;>3Pe6G21C4&%apXB=$Ah7{n0+@^`i5BL?hjOAse98i5I*W5aO82msrNeCyCSmi}Zz+G_O5sJjYV zl7=ZFUWU4T>pZQ9`s5h?bz<=Jfd@WauT5TdT4RN}y;&_WTU#u|7sjtd$Bvai?uVmo z67tf>UHJ>VieD+(HpZVuNLPozfrSlrWq_~=@32dU%T2o7MV%qNCPps7=qg>*KE=BZ z8aKe$>($E$x&M9O@rdI#p$z(#Ay4rwN=w_G;tYJ+nxKc09rBuBFoIPad5P#G)!`vH zlIi61W*d9+jfb+0hY0=^N}(lAYx~^La^9W~?aYRDa#ROMCj&~HS^Ho>xf}?62j7EX zkg|O{1CvFIm9SvR*C*N9`S*wg#owatHwJ+UGBWD*G6Z-7oiDykSskS}`S%X>#D7F( z{SF)>NXKkoj0s+k9?$&)s`V|U)?Wav0WSUj1g+62lt!AvKzm-isj^Fj-B80fJXSaI0+cM5=d>;HR zwS0}h^jD4*zl~xuf^jD?HkWMY=w^Ln8mS1*uT658XU41=EHVMJCjtMV~8I)R2!^Rn$?Dn;%f*D_4i0 z-aE-FNZh`(>jAN^;XIIw>{wX5uefHRC=z&*d@N;B76Y@>BqcWsOTmaKDUt@hRLcUc z?d+kd_uzEdp~w)9E}@NN`XCSvo27HAl41)2#8wLRYoUV|hmujTjY4f#Q!z-%LKJNS zISiyQHO)@CeTJMkIj@rQ203q$Gegd=kwc_~1Ds!w&qNcB@wxqUOUy2}uHHdDVp`a4 z0^{G<#1v0Z*injn1dil9h#`QpH4lq_hzIyDBp){mLeXL}nLcw0rY(OZcs~|uJ{CIu zSK-{p!lAzu4rPTy9}7o677l)9wwlcM1vsC%9dHCVp9TceNz=!|a~})aJ{HIGBC^oWui-qjPPJ*K)jGZD#E0p8UKQ$4bH=GKd|FRlvkuI{eGy-4P> 
ztN5x;%W7{U?yKm7VXFeX#r=Z0{v%WEOf+X|UNxUG;VCO{?$C-rcPkdb%$)k7mG199 zXT~8;a`XnozP=*hhea!z;k&<2dw$sDo~wGN;daCPxfKC7axD0YakEfUmk|O*yVdlf zso-a4O#^q)&9>h6U%G?O_0Y|-&9!19PqD&dYMhULH~wxM>NTdu#jD>>|3P|%ysOPs z=qI_~;9cGFtjTn64!ugh_a}5Uj+#t$nT?OE2;`BgL*DyOp`A*zo-mo*^N)Y`^t-1s o+xtF9tOyh!M<(z60bP!dntJA|zT5CFWEAkSSoh8Fu`%}l0tj-y6951J literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/bert_with_rope.cpython-312.pyc b/model_executor/models/__pycache__/bert_with_rope.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ebf80618b47a57bbe061c376e961b60ed04c3368 GIT binary patch literal 30603 zcmd6Q33MDsdS3V3CuVRB2KRjo2?7K_f){v+rz8>*B}&>INgjik0XgIh0QC%bz<{M) zYoEbbvH*K+f?2HzWjAj@OL@zEFYhfkhgbV@>?Uy>w1&#ewjrCgwXxSP$r~;p%d6x) z=liRVIoKd+TTap>s;jH3s;jH3{{Q=rs`*cDw}ZnGGyPh0ZIt8wj4sr}rbO0WGIHEC zF3g4b7#HV*JWp{$%n&r-Y>XM=#-Nd%HU8SIRA1-s&bV1T8S#x}(_2RE~LS*$z0CAfvf%VRz9-e51{72(R*)_7mAFWw*Q zj}HU~kmd_l#kR$_2e-4hKQBw&!7hek%vMdb35Qe(=frjb$n zjfX?=saR-iTym&sW8=u-!`kp!A7kXCMF}u;d_2!bmWX$@JRejBpi;8)7PpDN5)2`V#&~@Nc8NvWGFTG{%gQ_fTvkk%*2(Bf`@Y6EQrcnsU;`1aDY?~T)=?-`p}zGb9)cX9 z9e zXZ2b>BUPs(-c?#S!-ce3k<(jPmaf9a3x_#Q=qPxOPQHgeM{DP}HO%3;+I6y0ed>9i z8q1ya>qV+Rmq(4gXUcmoriwM;Fip|fq7vIg`(rpWGEz6ZBPkl=}2rU zazqd&1fd#rNREp{24rIBJQodzBjfUg2UIc~(%O)aJTxIfdD$zI6Bm%1CFk-9G&+PnroCyiAZcr=w|ssp;4ONp-_TGXj9W`pFQ21j8FDXMklwR zmq4(#T%OvoYv+X>k=|JJOz-3skk|3vi(m=8GTDz@j*NmX3G!v4_YCF*qr!~R^-Nw7 z22k^KgYJCkQM;@cc@p11@Yk<#t2Xylt7!AzvDIXprDP1)m;>C1hpV-)k`%R8H(;Zt~*23YYvsrXDFCF~A*}fXs zpYi(A-VV{*v200q?H0RsBlo6~xx*j2IeF+kH%r#gOm0}RN5?0@XrkeSRFd~8=-L7vJSB{eQlF{^X1PoSe4ql? 
zXr3!DMa@8G6@bhdXSnnErIk!GygKh^O*5uhGji!ri!U9YF}$`lM=j2)&^+%^GsdJv zz0{=|COxY09jB%e0&0SUWvM8F4vd*51ZEV+L6@&@45F zX-ETTL@q`F0BjhqfC|D+6cBbJkjxCONMh+DSnWG#}G(P z`8y@fji6RHyGY0rD-7h)a+{u%m_zUymvwMe zP1nX3##6qIIcLUSm-csy{_f>3ru;kR+;=Nm<{TNPC+%z(o$V{mzKo}S#nZmp-0`j2 zcWN^gjo+O5*7Q5mx3{Es4~x5p*NujjviTAW&GtaLZJ*e-@AfaJ+J;wL%^8<(rKVSO z_0BzUx3c=$FD(4R(i5r5UR3U>Ua?gR2hfK;Y4+My1nwckZjJ{GWZ>c4PF*bp&>jrcP~Oktk!cW{3tW{H5ON36<~ zEn*uhVF<$hx;tnOJ0cD}Rupz_knYk?2X1h`ZVI{>Zg69OdkS%b2F8bl9e5F`e0(Y% z0#K6?dgxqa1jNKON?;u5oY1`F!bbymQ?5*Bq9X~(E%UI@1q{@)pguDG8_l2fgbRqC zti6t~2!q(~#~|R&3O?Yt939c9lWhjGKcrC}P=fqt;W2c~*rq{Y!u*9U3<|>&+?V+c z^Wj;jPopLqGIRq6Ls4$E!UnyIo$b@Tp2Z{|IBH$N&i{7w@Q|(b%co~ z>R>JO1lo{?Of=+!{CbE@w}oWv5ezA?|3W~^W1!lthq-ZQ_ed?mC&t2f!Gwju0@ax<21^1qIe6w(lA zvQ@p0=W4y9rm^p*)m4zHgC9|YUY{D3Hd@d|YhD|*yva_r<{2LU)OWHWzx`chT%*ku zYAzsB=P*N&LOasV%ISB%KW*=M<-$b>a1-aj$QVtq6BSS}VG_tS79t`clZLWf0!pzo z;YHLX1SxnK!L+#t^kh6>mF!wEA%q(Mt)nBch~%PVt?(BM3tM`CM-!2oX33#2Q4-pn zkgo~(Ft%LyxSL#{Qs}2j2=uWR5NI$7zevFu3Pun}Huh5CXk0SJM=lFtO0&Eozoiwe zVafq3wGg79K;0(U)Q%D8yGag(=7E6=L>43yy{AC(gJdHG0P91C38$A4a;x+MPydRof5p>(x4J%Ey+y3va;r+L-Z^J_ zqa@SPk!~3jTL!6MBf}M4qQ7hTaJu_|i2wcr$mK?bZ_QqvT}-AM2Soh$4BSelcN`OU z9Q(j?Y{hnL#dGZAy2d4bX{3{)+CiK5YN;#rR%nebz3uy zt?9-=v2id{TA8VC&-m)*zp(hlmGaJYkF6R&{S7y|s6;)MIRd!T`yHzb2e{25Zo?j_%TX$OMac{aAsF3Wc zkOl^-N?u9mKN7+rd_{fi?#HJEcIWi4dk*=&1ZqEP1VT6JV)%_Hsm9&#c;bW+b6 z!~A;&m7kEhebzi{nYALf##z)FK>vsySybH4+J@Z7t8s(8yvR9YRrAi;kPG;|_?WS& zxSiEz$D7&#Zj8e}{Tm!47h`6FevP9b=nNEQN#O$GLX3h53MMId1wlZWq6tbM#fW6Y zM3?xG#J?i(7bO0oO8QLVz&|wrW#Ob*is^IjmB1_^m1PrHRP}MKg zFV3b)w_LT%8Rt%A$||n~76OZ>Z@hT@#pSkdci-&3ZTi0Jd#;tbBR}e&52VV5uUhB$ zxxTp(nrN?HeRc85a`W=3l&ANqiPCAfMi-)(Uwum>DPI=^fC~S$lM5%8Hm53juG;4; z^L)lrIVX=ns3nF^4@+F2M0gr)5>8R@3ShhpEctQ zP~Om-wNlK+RkbV~7JXe=J6%Gu2`q0DeZ5&HUAj0=dDcy_63$<@7+(0LtcR|=TxrE8 zRAcrc?=br`z>hfU_SavVS(wRM=_-fyamA)3YwIYQsU7M7yie~ZBmJ%QY#PE3Vc-?^Km>H>|iCmzqUa z0ke5|rF=7(O+AE9`Vm0xoiGP+8osxTVg6TCh%`5^aiB`+$_}xzV`(;3x&5kR&N^QTaB;(V 
z-MMrumPos|A6<)+{r{H?_i5Gi$Wz8WH>Afg_>LPT)}p{r}mS}A7ZKv?Y*a}dFF zQp}}MQa4?e5XGe8*=;6s{nB&adg+~)vK*qfBHxXDC-x~tk3dv(sP53o5ynwWko=ToB zBq~Z)H8+VVYz7H$A(q&HPWg*EQzKFTmI|Fga6gG^S^|mc1`>9zR)OjO2M6+T@E2Cf zJ3w{(gq2`v7Vs)RZ5LBM#^A)TG1wq&a zXeFyRo#B!V@_6*q0js4y8g+~1zC}x5fRBX z2_t?y5n`4CK|sR_l$|65(dS-UM_5EiAkq|~sTu2KKH=cG9IAR(dC`1K1w53m@Fex8 z#ye+VtjSYNnsB9&iJWo}BuH17Q^h!x(uB91K{RI~!F{LFFet%Zt4LO>SLbWgSnjO8 z0|Y&YdV0znll3Y&pcD_x%n;+)2up>WRj_W@OsYEG@ z4*j+!yVPvwH>t7Q8D;8nS86PWd{vZ#_wlKB|3ThIzn691@^gD1f;H@T#s5x@yN2!CE`;P{ACK3b; z!ci0#jv$cSg)TL9c|r(%B}Pf4QldYyVpX9+!q@5M4yvI_zHy(RCy+iuhB-7lWRc`J zl_({rAJw@hi98JyTS5@9%PjA0O3+%I2ff&1iCD_Q5m6g3M0Vs$N2Mw$*5Jl}L(aqkR9^%) z3bdM44Ifk;zteIw)pDEwMWz|ZS z8qy8h#D;CRx5GSIMNr1MZsE!+Nkh2mDt+t8t50UU6|$MXFw!J?n^vnj(^Ugv)c|xg zXjWTjm#i6Wvn(_8k6TlS`!_oXWK z<5~7qn;&oW*0HO{?s^-SD$>mZV)H=ONUjZLv)!@o;_THMWcY+K2nXvha?YkWW2`aWVFo!6$_`5<>aVn1%`73Hc(Za(^R#gIPgSEM~CIzVPL7L~iDYBj@fNB)Xo#yVyo$^|&8 z-y%)5BEXyC|7MHMlM1Bh2=ighvk5IWpC{^3bma8UJjX~NGc5_r$R^yRpcz5HDG-W) zDscqvJ#!Q_C*aT+k!+FiaDu!iC94o&&Jw~qREmT##S#U-((wee<`Ky}Ix%@A1W|`1 zcHws^pur=&hk%(tr5;61u!Q2%K8^kqQ0rz4)5KpQ z189(!EALB{_0L&BXs!hpf^d2uoxx_Ya`T*H)mu4#cF7_7HZ8w!>y?`?-hM*tKAQ3# zn=@yezO=JZbT%%T?l{{sb%AtUpIFzYO=lMA7QNjpLNk5MYN{{!nT_*!=PT#V-LsqO z{c|TF#=ENi{uO;{b+Xybl;Zbmg>2{v zHi`c|`h~d@A))xtbV0nHAtFq2Rz6`+iffaV?rTUkS9eZQWA(!gsX*gMT%NdL(Ge!C zqw+LnvVg$?B{g)UXSUXx#(L9((7B$Ff>aIL<(fk#4&3qQQq&mS2NiP*Gkxgb>adwP zLpqc7YMGod1Z8Y`{S0lYQSDPaml$@@L4*?8WoT0`A_nLpIpLxUk3HBwry~K2JO;Fx zEV;;~S)ua@$;Ph8@63^Ngl3aJ;Kl4_eov;0(a0rPjQS=XA&}343=)nbCK)Ft;Ymea zeUgt1EU}T%NJ#dcRxyzFbQ#+)96`D#8D%NhgDC15f6&(PV;& zD?hzvucMXcuddm1z&*NZ8(+aaAwwh0$lz%}wvO z16i~?L|gNmYyR94kNb^nH=@_0H^#4zryCB54Tmx{on!*KboG*Yf<3abo(!X}F1(s` zbB%}jHO|`TSh3Y+%Q$fPp{qlS%^x^xS6d!iIrMz0L=B~S*(zj-=&ScGq=b41)%EtMs*Pt!!%5L+UPx7iRl;Wx~l;SdNq_&UZ zPdiBAM#mayREo7FnroO_(VD1kVQ>~#3`S;{jy_k{Fz7m51-gzBsWe~j^jK_S^ujRn z#?3h=z$;=RBE$Qa@E*)Vp+Lhy^p#Qn-v>;4n1&;#*#N9l=o`vw2U2a?g%%;IMHVCG zoFl7E6Ocw(Q7rpxUVKANhSB{=6Jl(&I84zHyV%b$KVaIVl1F 
z-iMe4JN|*{*Lnc3)iylsAeSX&k&^5L^#uO+E40 zB={-G@_CZ+i*y;J;AIN7A_$b{DFmnh3wF0g?4Kk<9GncC)bZMRt4O?C%(+P`@m90p$ao+i=Lw9>X+^lV?P z?PX8y7n}Ms9o^}UePYMHOk;bxajV$46;3~y<}I0qw#6?jonG9(?zYyI&6|G920^`U z^WVDMSE-!8hI|l~j;7lN#kRrQN7F;6#GzAns-A&R?ppQvuf-N(OM~gw?PBZp+k@#n zPm6n=zT-OuQ>{~GJPG(h=Pq14L ztQ%%bMKu{zqlrl#$^Qip68?aK6$F?~OsLb|PbYKN1Q%H4IdMe#UIf zUB<%PWvu#cF=4pN>#6BCa<74%$Wh=%b3QbnS=x~OEioDcdj zUF(`U?+ z``5Uy8*|>(3x+qkOsI3#u097$UK7Nx;1193ULuiHDkJt5xfl_ygvLW`F1=#*HN_bm!odihkU2lC51i z5lti|GZai`BeJeuksOt#(L|6e@&&YAjFx>!H5T#F!q!Sw?8B z7UV0+3;bD>F|4uMuM)xd0m>4)Ey?pD%9PfnO9NtQAYD2jmJZBWV5Co1^@>%!bMUIw zqN`=qiybR1gSYuq+3q*ZW& z3%D&YOZ*4d^ussly7L47W6RaIUJwIA@AG2dAjD*um%gz4#pNegYIpvX$5vVf5fzo( zqx#m2Zc?hh>HPW647f~?$nb-{&5uFdmtzWtkXXR`XnwLexf>D2`*1Ek4s$P~KWB{w z8k>9`A2ht-Ho@&oVBv<|zowhLg*TX#3pz$l45TaQ1 zJ`oy^T!Nz{t>8Zj<0T0sQt3!I98wp?vne1jlE$Q!KSz#H<@2x^XhQ;MF{sTHJ4pc}9}>lgoq}J?zy=B zL;P3IFIAA)H09eu+~`N9ocMJRt-IeQLW72a zu{F!1+GIkC34iRZo<{_4W%?3e-K=H29D^SZD?Y$h%~+k5k&-NADcGi=URW=VM z3ijHtqGHxrXV3u~p%C2;(cGSy`yq9D(V-_XU|$YSFoPu;&l+Y8<3Q@OhCE}K?i!nS zm3q}-IBU#F5$ZjxsNsQ?exKu*iA`Rwqfwb$!xCnodo$X=zfm<~K#8YN0`Cvgnr#-c z0PgX)=Yo`>De-cpabHvOLH050nbyME8hrsdW{j-9ZKN6|CO7Mtap*DoStk&lbH)J~ z#jeUkJjZk6&j95;`g6XNdrqF@;|N=ln(&n6rCqhZ-CUcMckLPhjD4g>d=f59RYUWpyX95a z_AcyQs=9;1;oF(IB&hWSOTl|U|=jSUMNwAFD9%}z+@h#HTxC19H8Jh1XVy-a4rR?6 zTkV>4Qj)7+-wLV@^~?KKta<3FsSEnLV6n&r_DzRT1L{e_AD^(^7V^@nXR=S0)*z3xUe~z&8ZRdL~mDmMMSKOc{ zCCpq-Vh405k=SPyx&2OhPN-W1edZ=>gtqWpAbs#>F6evJ#SJ~5IrFYQbJK-J=;TXn zbr#bhGnq#CCh7?|B@3Pj6UVfjJ>>;j-DxJ@hQ7~iRMFUIU(d7}>(gTWpOMBHcl+hs z{nIYR6M;-PD4||WyJYVISr3K9rv-vkV(JW*OAg;OZ#=-yp_%eW*iSL&zU06Q1d-T| zBxP9M^j&-A_+9H}N%~N0^5j z6GjcP75Z6M(vl9A!aB{mA>(Lo9x!Gw2I zh97JT^5Q|$v~KI2WuzA{5WmX|j#yO}ZpqsN_yuh)#wGii8Lh zSn_nG)RU4$-cN>o2w8@tUSvExUx=*8`?Q)2*+-Ly=ssjfbw95a+Y?JK-u8*MC*B|Y z!1e^FEL3qXExeSj?7CCgHFqR~-M5as_4w7t)2^rW`%)pNPln-57@0mHDEJ+E3 zUJ9^3Nw}@}Tl+l(PXcXX6IxnsL`eD>1xeGYe0*OZ=0oUnZ497*auAl!?INe4Z~c}0 z2B2daT{vdUV}{@6uhFR4D6b}P%~;gDZ=56@y>@k)zUV(QUg87R;b||?IV>2{C@pi~ 
zlvVGS@I=9a<_p*{7a$T2ndhd)F9@F^U6t6VeeCut?TOP3`QP^b`@88Y$58)Y(09(b8s_*vtZTh`0w}5@1E0ymSFb=H%~nwH%dadB zrE2>YZ1aZs!(Np?#1>?SmMG zB?sFokL`pBeVqfi87q{?V^-BuQ;)Om7O<~y(GpmxI(imd4{M~y$0MVWL;_n7D%On-Y*FSm8@4^WqkAq(jV zH&Q65oQO%r1o4WFoI~g=pQPupgY9kU#>~*VS5>%}gf<$wO%(JaFGGnpl@)ue6Zaui z-H}jM@)MAp_UCt0zKHMy^$kH&8iphzY!%D4-l|HK?SdJZt^;COV7Zg|46k}Bu3n-Y zSB|AT-7B_k;Q-17N;d`wIRwnWFLi(b>XHwG*{iUZdI;(*qjD6qQ;V0W#VWk0@R9pq zOJAm`_nzHMNWxleAqZT?J;opAf5utOJC~-jbh`oT*p2TqB*pAq|>UBi0f#;nU|-m(Ei#T^x* zB6^QQz`BVwy#UatnyR%9yo)|geVa^=cx5_(OD))psN&i2iA!U#kqeQ$FDf%Bf&@Z2 z0tnB4iV_0$^Tui!3W&`Q3toyAfwnG6qu^=kR?K(}v}s*xHBpyZ^1vD8FL)R7s-uH} zgpI+OjJn^pVrwVVKu6F7T&}z1QdQ`{ssUpGtiC`YN?>}QOjXDrYmBX z?G6lpxdFi341kpYfR#wzC*WoK%)}Xp0wXdC!yfE10Asz)P``k^*x@!o!3tgroCKS& zbJLD^^m0V#A?Q9`BF9yb zt>?Ku4T#5FWzsN zLfxj`)qLi1wL=940Gg~+Q=s9+hoHmB)o`<}uzAJ>;ByLfxMr-=NT=t_uu*fqit-g6 z@LE^OpQnsPhdM@Q-K;*_=T=|I_|um7W zzOkr_aXI5uzrl=wmQ~R+nX0z={=DiJ(QS`It%pie`_H%DUjtD#wKYTZbQ&q_J@?9S z)`v=*eet~*!EcO08Oon;zrQ}UMe2NFjx1oO>ben{Ph$qSqd4J?V#Fp^P^mKK6BfTz z5GT)2dk+~&X}JKRTcIQrMnVU3(Z|^G31+*PF6~itqk^&vdqOs}mNQMN@2FnGqI<@LNIwl0Y?AZubTHuWo`^a=xG3oEwAm;^Qp3MkgnsISZEpA@e zEUR0w_uHPEd(xW^i1=T20HCV0GF{p%mNutLd&Sb;bm^d2Iyh(flWlwMlns67*gGed z4lj4iZ(H;&4Ba|5XBEqa?z*bxcF4+H3i14e5%mxx-9q_(xaB zbH#Mso%V0Wwy@Bf7F{;9EbaQ{=yLNn#+F`KoS1jsEpJ@fBbN72KIihtJ08)$^;W6q zA4vNLMgQQfiz)xUv>!Uy!zusK`Gb#?ZQstR(GTEdgF3s zs=9aHa`*V~ymiqj_tZ-F{`bpLcv0_>yvMCnbZ07R>#&o&v*BJ zXYW7Xf5&s^JNw@mTH3bko3EI^_!}j+_J80xv|>B-^R>r$u6*y$*N*d?_b|^i`adXl zA9q`RxToZJx#a=U#bFHCN0Tm+8uMqe0|>?avIC?9LKm9YAavkTM^vc4Z_ww<3;&o# zIuSrsUdXd1=*tJ5jpc*ms7ES^j*msKXbbD6$ghKG!euGpE4VEy%UR1GA|Py{x1)e) zWa77}iK>7A{bNla)38NRK{DmzJ&%oGtwWX+{G6rcwdd02YSCPsGS_Ae-R8sm^5xs+ z+oSJq|KZRNhF11HCGLD$>^=1}j&7|PIZK_KEBg{}*ian69aS7a^d5&m76+)I1rQnl zj2&wYxRVc|!+g+4x)ta#X**_Y)28S#Ezo1Kt(XCi%@1?00X()rFX@CHRM(kkBj+JF z8O~qHB*QEY7;&1I2FOHKdDfIvzc2v;{PYHPQ=;eN*uV$Ks0U=BBhvw}SddC9&ovE#TrY)sHs(BwFvgV% zTf(NCpKF1KkAA&1xo4;`0!Xj1{i<`zU2=O6pKnt0<<0<9`pZc*jbCmf4&)?jXwjj! 
zeTsgaVe5M~wZ}ilTJ5=7!;ZH#;Ex{Ez7urK&f(LL%C(Wm%*{dzM(0s@9x?ygAYM%P z9SUAXK+9O^hY*;O85?p?R>`KQ(XQa9D6EQ88@t1HZ=!vPp(evUE0fau}pqmyf<5#v;}V+S9^@?J;UMvmeT65M2%0FJ>q^gH^YstNX<2 zzI64FSUt1~FlAYGrOLNw{PoyZx0Y(~WU;SqTiU-B$SUpMC;Iovi!nT1E4Ho`PuIu3 zwvTDS&mOU5Po|=IIsEO3n-h3^YZ)G0zXf{%-#C8#ICJo6Y`yX7^;a{s?dw=vU$*Yx zYBuHF#&)Zo?|D|9egTDR%I=lB$RD{x_D7a^P8J&pLzx`Y#Xr)5cmA>Yk$GerRP70PYl%{(ndAd@nd==%4O= z*b5dp!~JJ^N75|F&TyBnURD-6+f!_xYxY59mpDOlD=S#(L5dvqZE0$4tf7X|9=k2W z?M>kw;I43KGs%6HKvCfY1@sJ%A$m>KRq`OEvm9CjbeA}FKJ3+%1y3vV${nZ_GE@n9 zuHmIKzDD|wm2}P9vC)Iqzs8xp4y@x*HgFOC9kt6_gfJybjB7PJNZX7PX`6A%KEWF|@M&0sg1A2G z;LQU$-;05q@5O-TdvVX{f{(lW10YLnVtGf_j0DIks`mw5<#}JwWs&y| zrV8{XfyX7!=mdU)4nGGsF`nodIWsEba6;&82_|+Q-Q;bRTXHJ< z5hPUS-OvPnB`Wuu9P(cCwXBHzdwC3~VV`_(tr7e%O+p}oFRx_kQ8Wn9Aj*C;l9v?> zLzf^#&tOl4u)J>+VJdl_d{Q7v?JO7jHw+j0R`k;9B4hc zm-3Svpw#?;{Md0Hzl$dj+h?2cvfcM(ZtA15oewC>ew#$mx5yh;Z+<}T+z$Q`_S<*z z#{L5jC?Ws)6)ASu4-IP0Eb?z@ZP6tye(K*+BP8*nRc4^%C+HUT&6{P%ysGL>b-kQkEVWdI-b`gjNdL=&p$r^|Eib zgZT=NVMmUH&`BA(D1#@L8agwQfbTb@T28VhQt~hIyg-?Ml>!nY***_!Eu-v8{u4@L z&MwR*)kWlh37?EGoTIx3DAm#n> zBQEe0?&L?@-aqDgKQ)+n0|*^&_|(Pmr}#hSp8eEd;SHZ~6qIoMG5#a&z(?HkA8}88 zYB2GJdmMsK92~#nBX09Y+?GG*4vXC3kGQ%&=7v7v`qnL-e9huxHy*$K_&SH^`f2_s z&->IAimyLoXyoe``)};JzH6OBbp0@oaQ+N!$*v}h^ZZGEPI&9e)hp{9yI4PC+>K&} z8}{q=bq>+>X}%Q+EjI$!1M3{3*%SB;)~X-#mGhAlU$<^}hDWXa*LE%JT8yu8bOlM; zz=Eo*iLTer8_Iag{7dT`PV1Yx_$~9@>l{w&&+~TPvbblR!)e`rfXAJgb&k&W@SC9q fpYn(y-m!G%ThVu-D}6`TI9#zO+%q7?zRUjs$`(r+ literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/blip.cpython-312.pyc b/model_executor/models/__pycache__/blip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe177875855e34a6d014c243f93f3af72a7547ae GIT binary patch literal 15226 zcmc&*TW}lKdEUjHzyiGAz}t$XM8Og%Q5WlCNwzG>u_B4KYsnW{HVko>6ewJ%T~HQT zFw$h4gle3KY9}St#1);iHC4GWo#~9_ms)-3#7$lRQ!e2hrdG$!gWuqguH8Pg{rBLTt6rA zk|n}NE&Udr($|@2^MNBMFg)Xk&jP(q7pw z`S4dR`v)CT#mlz-rlPV+lr>wpuW?e<%bZj#+lO}>ADDmMuY-5yl 
zpsfE2N|8}EfU@x`DAzDbKPa2Ng0h`aHiNRIG^N~l!Qo&YjU>X4${}fg0=?7FP}w1c&xIvf>l~63K@_WTCCK^;j)as@BqB$GFU6IJ6ja0G zGFovHU1%T?9E=65TFrrYWHcIkJV$yo9Fs$eR{6w{r;3aIB>vLsA{x;XqaloAoK2HC z&RR(x3}L9jm*nu!=|nIR4@t7}DNYD|x%bJ@k&(EPP+48gjR_x;+2AzyNGLIIT2DI^ zIWell2J{8uI13W=N4s!YL^RDMxg_NiJaNQ>emq?z%;( zL_(=Tq*9S6{`VuB648-OBjJ(mL`cDmbe|vHv1{j<9rC6~_|&G6vBc?kY}2_&B)UnT zOZmJ!Fq()fdREt3L*V5i(U%L*8X)ZzY#{zxh8~hFW5`+m$692~S z#y8iYEWoqQ1J?Y9Yo3Cksno70ib_46T8;S)vC*jh9kQx4<5e5})K)|nxuvR&nd+9g z`&O)$YRAPRSqCT7&oreyohe(V(g+fB>_~IDbHy#6LgqUdz0CFVKu3$j%U0Po?&P>Z z%Op1ltbAGMw@X&pG3=v?tAk`KT|Q`&?7%G7BzKNfqSHfealU&fOo#clN#5e13*oQ&mdqF?U zxJAKhBtd~z^#BUNNavrRFk4QP8|A|lMk@bIR2%8x8Y7i|g41Bc!>)XZkuvc6Ms6RW zN00HYphw!YMM1)>lxh*BCBLLp=^Iu9YtgUQ=pfNzq|jE$c`MnL&Di^B%bY8;Ks&W4 zNc=cTKxfWm!MB=a`AT0kyXZyCQyVFi{%$i)IK;vj7qIs!H26>HECy_vCmah7oDRie zazxdH_-G=Z^PG_t6z70ZO#owLg`M*%&907)$cp9)2En@$!JtBCR`Z+=OOhPZYm~`! zAc9gjS^x)@SWGJ;K#$7cM4BDgHgH6A>K@}; zf%=^KF`_SlPcH8zXWG?p+vUGg*F0%^#gl1lTWs8rZrp&3FXI;%{TtGL%9O9leA2e& zTJmc0jYDr9xqjp(f3x?d^lt2(*uutR3vGRqw%e{|q8Ph6mT73rG`3u8z1o^-Xuei{ zwLDW@KmFj92Up7MqMH$}__*fHnfj*b3s)}8Z+rW}w;lvqHnVpP0L<3*#g^`LOZPmV zZt0~8{!iLFU*GuJ#wDTh@{vnNX84(Hi%s3>rtZ1Ko8P$pjRj%fvenXo`X9L37{+SC zNccQ9!nsgn6c9~FP#W5P!NgHO4Je9(#1mYDZE(XTu;ponO(-iUGodM8Hep45+PXyn zeU=p!A)BB{GWs$g~W0{4QEME;Z}Ku3@NMYzsuR-EpD7ie%g1UT4?*P25Yzm%Pn zu_WS}b1)Jj!BHzq;Fzm8n6d(e29)yo5uAHvBW0`up@g!PQZ^) z6Li5T$OeRs$=)zv0DKkThpM?vxKIHgtKdLY2{$2$u%L@0&cV3yQb>`;n@b-4+}npy zPyIJU5Ugs~EY^0VYrE$7bnShUZk)iy`VHy&4U6?#()C*w>vyK>cTNgsaYksk{Me<( zW;aegwjgZ65o>Ty3OJt)oqDyo{R{Oydaaw4H<8&lDco7z=KSuPXK%GH)E}A@mTLXe z&t7?UR+x*V+8(%7dAs(&r2CG)bDptcTHcs)HL~L#a1*Y_0MWsq<_?lrKN_L55DcCj z4Mp@f3W*j99b1N(`YJk(GfhA_f?~~^gV+H%5(x&c^U8LVsw4+Iq+=~HPK6=@U9}g{ zmoIZ!OM~5=YTK0Bav^7g>KYQr)qpuy!a!Agfc>CmAC%>R18!&5Ql^|$f z!SeUf=xRZ*-&)3%agq+(isWV%N`vlQ>SA-2?(57B?74Lb6RPhfxBif0@@I*0 z;f?iGy8)i0`;}q_l5HGcit!s;k!17|k~1hwW|S~ty7bsf_k_|CqeO3{1QrG8Y*vil z{;CBmCbudWfvE!}NQ`6Qt|KV5etN4;wCXC5w@o=W(P6ZD!)!fBYB385I{hr%zCv=X7#&p; 
zWQQSJh#H1jMLabok|0WFSryk0#S`L-Ci#oI#qlZ;98Y}Fq@glPpl3+$_|CiIww|+R z&RvJ7dMX@JmE$N^P9Pfh^o%I+VR?X%0hR^xMW9|;M?^G<3<(taa3z39tEO&DJkbZG zm6?5OE*d6%h~|EZM7u+Z5{FE#4um4IR++CTDe)0#2AVUMP-w?$E(|nCqtvWODYOS+ zN-+ni&_>ami4mp^D0Gaqdb95!kc%uu2_~;Av=ucQdW>DS>x`c^V*V?2)>%C2Rrt=gC6Y+hg1&DHs*pTF|_?7?*HdNdBXp=GgtQ@VZ=GQua- zof)Cxa^IysdK|lS%*a*NPq$rZ%QUXV2TVOPbA0N-6|b`fLi#JNl?twDqd}LeY|uTI zst!O?Xd=RLC<+aG7V7s>M@wDZbJCl!>oMpZk#`~sT}MbY==`K+9a>rx)~AK_n1b%> z-8XCA^}pkXxKfKYzE`Fz1PnU}7ppenfiZVt+92iD3emG|UK-LUN zD*&maK7i7~)^arra2clER20YnZh%55>DYiCaA2#h0xP@FM8K}u&V}Wdl;@C9PExcV zk>&;`O@xO=LrMIJRMHhsFA9?F8Jv()FX3cklA9?F9^?&zd)@dtq>l5xZG;7#~ zWp$PIMW=$z^iPo>^JK}H<_(5*5^QzC0zJx7N{^yhgwccqeT0)}cfvMdhd#g!Q$Ix^ zX@kb%FrX4z2UluklFn6qyU@2wvJG0W687dr zC~MHKvYVm@5cOd}PT{H92TqU1&M1TQVgON~Okb!ty>t^iscgD59k$-<*RDpMoPzKjyi!AmE^=9cgW~SG>Y8m? zXxMbgJL#UDwO}ly~g{k4$jTv9n286rpm>XOJp14x}{HpNU0n}kIFHM`VeW(U{Hz= zz$`ApPE1fN&Q3H*MMeQ zcVy}sJ}X;qci*WHQytsV72C3Qo#Sblyz~HHCZA<9k=gu52cRtcDsA# z#I=)GPi8qJ=MS*Q&4FdgWqls|nmhH))5$BzFYHL`!afawR@)4QgtbiLDirpapkF3g zWGe~#u*m^7Ane;EyTs>2dYS)||8%jvakm zm&OYiS2f183K)qkvg29YnA@Pr9_fKzIxC>X~KGK+aw6Q(f(b$mq?kVUB}r{&*5 zD%NE>3uHR$ja+MJ*O+TAQEO0_ZlQE?3C6hvST_y`@m2_*SL}e)q?9^?OR%)F|EJ$rphCr**N75ND zlrGqP7?#zPEB=xwzm2F^uE<&cK56Nib1bgkm0rJVp=CF-wk=h6WrV6lp*1bEvP5TE z=*(2suHxk$M~;{KxSF+jDzjcm^6$Cy6G3>liY3tgVted762Bu$;F)QTY88c~$vP%T z*D-fio-9wnm4y8}_AWH+n)H?+xl`Vpay9EyHHLPs zdy3#vM@c#;N|K^3iZ)V20C^8syg;uh`T_n_k~ztM;W980rVC(V)=nmj0yLzoj)s)d z(U4Nx?Dm$~6R)3qje%jyy!>|Lt;iQZ!WMn()X@O`%wQbG|4-m!C*)9U0N2dQ(a@M- zkGK~Cnxf(E-vI#==6=JSn2t$2u)|^q9l#`W&nAosDSfL``Ub2dD&|EXv~MD9-gs+{ zg>bBJEaq*}A89PBadO<$NX zM{n8O=7sePhR#|xng3lV3R;9nV(G(eJi#o42v=DEf1~)C6M9rO0Ynf@N2S zWE6+a z?A~<4=1K1zs>2|>unshELuy6~4JL6Bjw_Ss_qRmVKchm5ZsJcRt!@=2SJ)3ahWAMg*-4LX#g{l`sShDRspkdN|z&eQz(&2yl#Gm0ouc4Qj$gu)3afO zcEAYjgb~`!jL`l3jX!CC>x=-xkYE&@uZJ>sDdlr@$S!>Sk(pL?;O z@1sQY;`2RjR(ZcNr0!-Z8j#)_)IA`?0cNpguL^``&Hr0N3C7RK3cG_QgP3mGE<80P z&e7>xzJsVpu!d%OPt&&bq89=yF{Yw0VUL07;NI?@tffH6HVwlExyz0L%Mb*!=OHRj 
zIPwkvjCt5#hUJrvA9AnK9c2zRu2^2Fx1q*_<3dR09k*f=injtCw59+HVyaJ3H)&e?P)?9n;N6$?j`>;7M*S*lZ^U|@bm1_+E_9|Ko zkr+FvPoYcSfVWsI{!PzX%TIK&!uC){Y#Qv2#Lrz|M;cCdJnaVw@!Y@<qXU%w@KCRM!-w?A++!HrMW$F+1UHg8KeZ-b}DlE*ihxSYI{Tr6)-m$xr? zI%WqK*X>KM+xNa_AFj1BRhv`252rT6BFi7-yO+8*&)2-&{8sZ~@4CPPs;?C5bV`=dixc7f}nwNC{ zvrI+ZVnutpqMh`)z3KIP;kMS{n>_e|&<5D==wc}*VV5>;eH+vA)yA#6nzbx&X3iaC zMq>gCn$3laaB+T;Zd3~8%-p*?Q#Z6NV&4$O*uMIEq?I@H0;Xu)PX(lpm9Ti#6E81Z z{hV^uON)O1b%>uc-@iFwp)yw8lbL!4nt7RHeEEz_x14d7CV1xXRp_CSSewVr`x`ApG%t^H7Z8gTb z30nb*m-PK8<$r6oXixHxqy7aKQ5e~TgWTFmI_H`4mEy%>8LVC(9*ZyXuXNs}uIcut zbQB<}W0+W3QOLW@Np@nq<~UtRSJHmwBvzPz<(Z@fW8Rg!JS&@UPk55gMNza-a zw|w|v2mJ(w^`elV3O9z4NEki~iaZht4amwrQlS&SIe;FkD6~g3YetG#d@rq{w+f{xvm?ADN&D z=2fS8a$n1cUAX#GBxq2YosqDCsx&^PMVXO)%`8yoxnH=bg|3RwA=Q(bV?GGZo}wTv3yMxzyY?YkS@O zntN{RPs-tQEN)q7-a1v5scxS2ykET`Q`7wM_9N+v-YNIYk-3IUb?dB@uD%b&X>se6_rr>o8F}{k`A2V8?ESE!He1f!f0WNw zagCih+9k1T&OUeH_CqHY#HUR9+Z9_gLgS*)0wr=r`atN&ZsIyOWH%S+LApX#Q>+6K z^24!|87Xv+ajiTWg3~XiHmnWSyP%E~K{H zp9yqB+4@a6^2`86L~BDZ!sJ* zk<=YCmpLS}RUI&zJvw({e(O#9yTUs{YU{yt_n~y>BUuh_-rM?~`mZ~`prmd?qfZKS zDx`AacK~txfQ-31sXd`n0}9QL?qKR-KN#qV#dPz9z+8%T)0V2=bF&4jq2EyTm=-G? 
zZgelB3ie8Z!9xi{bd)9LnxuOxt!3}9jBs{0etKmXS@O4w3fY3RHTR&6u-~)kSCFKo z=vR$h_o%_VH*>#eWBW$yE=8>mP5)(EPYy*!6XA%e1Dc!NbPfXURE2JJ84|MJWa+<{ zZ~!IX%B#!2bR)ow`pn|PBw@C(6yi|&qL5~wx%TM$dY_U(mRNy$8xcf5p8w3j^Btc% zIDYNNobW5I_9Jf3ueq%safg4!?fJrD<1L?Yh(337{MYzjagY6)Tbp%Q_|Cb3EQiNU zl|62$%aqHyZT#9fE4-xmwKuET<5u-DxA84g%JkTku@w%S z(2kry7uzCpNeG_f5X$9ZD01##S8}H7F0p)9byjk9l^rde-J7B>Orpz`Q+HP@0gjT~ z^ZUDJW@ncJQHoQkOaVRJJ>5M${rLL-|Ni^m|5#q`5pc!qzcA_iv>^OB1+>epdR9|5 zLAWElAVh_zB__lzW0tse%o?|i*(?-ijoIUlF-P1v=8Tt(m9aQm%r)l1Gh56Z_l$X1 zoIU1^`^J3n^0D%G#aKn$Kjvrgj#y>9YOE?=Jyy-a&R9*ncC429%VKr$`muWEcf}gw zjbn|>?~XOao5z}&-xF(z2gU;N*0I)j+gKZm^Tyia!LcCo`(hpO&auvT*H~A)d#pRY zVQfRZXRL=kE06WY`^Ng1zakcjZyejm{QlUcc>h>`d|+$<;mT-LY%sogY%}v$$F{_W z#)jgL~CCaqIF{bTWCYN#&)sT zdc-yuVt0$X#NBgcc)mw06Zb|N_aZGz*@TqMqUZ7)%UzZ%2uRo zdqT>CEM+@V2A`1f5KGyCl$}pV`4mgpg_PY-Ncl8NxdADAo{;h|OWBK*eNRYvgry82 zLBH#Tdf zurjuwj3M*-iNoTExb-b3FX=6{z_Amvw$Sj8C={|OuHz|Did=|^O4+lK>FLSDMa6Yi z{OpXF7#H!_eJGKbN<~s)R4F?(Nl9ah``qlbcqld*Nsfl>itnjNYW&hsF_M~*M8*45 zY;t<|>8ZrT2cqSg7Jw2V8oJu4S=ea1R!jbV*cp`yN^>Z_kL~3#_LXY)) zHP60q>fG^XPai&XD*XJBXO5pe_x#KFuIguJVyVezr=pSA;YccScyc_Y)D_~6PKvRp zS`S0?N!0UV5-k>|*Vh!%QyDzkp)^oU3y;o7Q{!SXIVGVv@IVdX(~l=6rj%-g6ujWshP=GRFtSn#}nwIxEP&8 z%5!1@uT7#|ecXE%jT?_BUNv+nGA;hdLc_qK_^GR2oJ`_%XcRGqj;$?x!;$y}F&agk zlS=j3nd#{%DU~cdRXkeUnKMec=4ZWumaaN9lbU)-oV<7`l{_^SK|Ph`$;4zTd==dr zmc)ybNmN;khU4^J#TSpvUJ%36k~lGWeOw)~?B-GftDk}AjvxwSmZ&gh#o}QZvx)Yo zl_r?zh}sC?M5kDGxl+wxxM6C!qxPsH>Wr4XW#XWi? zI65^R4og@_1jQc?qt1j2)C!0th*2D=DQWzY;^f0iYDe6g!YW+hZ=czmicfEzo}3;? 
zu|Ym?eP+k5omX~IiP$y6{-P##QO5`&iv&oKUCk;KIr4B{Z`p5TSZ_>~J& z36YOi#m5GCSR|B4O8sa(X#kF5nUFS9UzJ{Uz6$Fm#*C>d-gxD@8_jaL0BVbn6S_0ty$E)Mz0hc|!+BEm3RK_N&&Y zU9?3VKoTdBOVKi53HP}~XVfj0MLkjHgl|EZutdGDR)7fgy()~k-9*HdV*##br58ET z3dA=SMX>-ccfua^zv>t(Ul6Wa#wyMVp(>?~eSx}SGK4)QMxuvODG@8%RN|;2ylGXc zCzIiEX)2ivN3;iulLMyW9Op}~4|6w>3iHsY;v1)hRSilf8IEqsev1}*Ct!fIJ%iHq7M{0D z+ zv2oX!>e8a4F7uRM6~1OIj4eamyS1E!JKCJj9+jwb=Bmz>pl#pW z_E2y#8N~7$j0EFErcTF1TH^_q3q*xJ_k))J5uhFvOXwnh>7(x|4bxM}NrDWH%0#<` z84=)$Tu4sEW>VtZh8Ge^PRohW;5x~72Sas=1Ia+OflOd@TUa`V*C<}TM1+ajQEE`H z0vd%`jTFy|M135QB+$M!dUe@Do~t|(iy4At$?-@G6;-T>gyK9knGhqA;s7fm%_=qE zM?fD+VdGnESHxMx!Dt+%45TyOAxWcDxth_b)bTjl4U`&q7hZ?5!_w6B)J#eV6e=?< zO@ty*wbyjCW=8XvftAJqpI>|gRe@>XTN zt+Kau$(arAmhtD^eLwQ9_rSyIK(4xL>1eilyWG7!-L)exSauHQhIeL%Pszil(nHU_ z^IYC$sXAj>wb^Pa@`A0rBJUBpH)gx`$zA&v>#U5wuCx3> zYe%+qo7}qXzCFA1jEuk5GxVT2*U*w}*eo|}Ua6HEwxNQ)_v#w%ynOrR#jDx&A-R2M zWovfih&*y6Q+E_kJ^$I;nrrRKw(gKycf2c@u!y0Wd{z( z0|zoS2a&<`P%ZAKms8ozhvdzNGIdW;LGSer=6u!Z+JWV{RhzZXL$&G7I|W~5*4rz4 zdzW{=>)rOSdPA;y2i5tUyz^Xo#|!Do7m=F2-dbL<<`Np4@66qvTk6Sn9+5kbWExP% zFZ&o>p)ZW+4}lf|f|%7MICmgJ!Gwpe^twp#Dij%?X^UnQ%U-ka>sUB1>q}B0-EE53 zmm6dJZ#%yt+@R%6lb#hGLQ)pB5nH3NG$1CNqd)q8xERX(h@eIak~0V=z0737g>uL@Up8;Qtdj?YXGE53 z1dMjS;WCt^69R=i^R$fXi+>@WNU^Bp8#cWhH`3{3Rw3QITd%dp@Fufbd*&T_Y2JD7 z<+?&Cny*yQaJfNyQn+iVaKj5c@tTokz!UG~mO>8Ar{x5mSXDd;XwM4w+ZKi!s39=H zH7XH_@T30=V0hD_*rJnhh7Zyu#2WCyFU2Y1GvvHN4spcNPB@AaJR*n^#jQ_kr7Sr$ zk%~vIOD|EX3*=m(q#ke|smY5oQ!`0vl41!hr5E9Zd<p90H#b1bFE4H=>z97TVKdDzxdk8N5y&W@5>Ay&orKR zEX(~HnXS)ennxE-J_-cyow$2q=|(27E$wQ0RNtJf-z?W}PH#Dssefw0`(8`%-rl== zm%S@5XSW@bw;g+@`km)9+sv#8lXRqA%%sbCNXg;&x z&%1=K-mmw+(ZBpNna%@gSNnU7t@nI)ed!H*?_Z%Rl|O3SkZl~08wZvnD^(D^d^Xd# zE8BQjZakc6Ji1VxYuvb4o^9MHH*Uo0+vLfWH)qS+@?DnL%r z9m>@=e&FwOJY#uO*_n5c2i0u|EpJ7O=iMyq5gIy|4$Jj@c`pn5gw~FHIrCNsRkeAR z9WpFW#gD5f_Jea4t78L7-*e#ovY?~&QA6O){O$R?1Afpq!428=Epq#om9297jt?mT z7lTms#__LT8g{S#G@^=x9`2|G;tngs9iSX0oJ_3Yc(r^CLJ!et#!NyWVjgcZkDqWx 
zU9Z~5Tui*-2Ksm)xF!LOD&9B?+WPcN3Ic3SVLyW!N+;or7Ol=iBiZo3imkL#afGc@ z9Ffx3RWlk2w3E(cqeS&GkJYdpY@1%Dp>)EDlFPgt#G0&A->?|U*Qq^2`6g=^N~$h- zC4$;|g?j3%)GVY#qlE9`bTZ_X&LA0|1fL?`%j7%{M{z*>mlPHEDT3Tl?9wO|41L21 z_`}F7ou_0qDn(`Fk{HatY3DFb{2=3q_?)301AZF$o_6z&*ZG}$$dWt{2mH=MSGmUR zlyVW1Zq6q_{7V^K)>Fpt?YjMsYg2(@Q;I+VqZiF}{Qt8xk`lZpDtKUFW5hnf}1;G#j`=dV5Z6>aRXxMEk z8s?&9kGM`g+ROriRul<{E7grD6+sVa;@_jJL!=|f!?Bd46{98M3WlvZLras?g?}1E ztMhP5B)uT(izU4UaH3H=Z=1KzJ0N?te9ig|>s#vB1_)7k>?)AYNKYwkNNG2~o<1KS z4RehCt!Gdt(SR%T0VVaHD;tFH22&PmGp!XNrvcfXMug(lOJGAvaq{$`G6PJR6wFbJ zR!Yn!Tt?t3no)Xt>$vqTWJ*$zz^x)V&3jEdz}V)RHqc-DP_903>r;zk>6*SZud^9q ziZ4~HRSFFQ%P-!)djHbf7t?!3AJm@)KzaF?)d^#BTp~nYAD8B>h2;fotT{G8ny&Q) zm_$34XIe5CF%Is32``Y|fD>|*Q0j`8WMX07vWkK7<}pd?Ro03ISZXz*f1wdoRQf)0 zB)7w1sMOGMCviKG^EKro^|_TsWA_^s6gXDVT>dD!?|1OL1f2f9=suld z!YkBEn-qYu0$t|Uo`ME5q0F@98+qJX%2~dQhdc(5Dj~nBCJo9OOd`&ge0qc8QHd_7 zw!9_=l=A?bClXsH<&l$_x>F=+eb_ayJdqtdAP*kM zbU`Nj1D86;2&(i|1%9@o&?>Nc0?zs~;(pMT)1ZZMZ5v|;Bav>!nh>T)N zHiPGa>o$W5I}cGrvIh=CkRX1FNTF=w7P)cDN^7R^KoR95-9dV3k(^ROW|<<%`2qfu zw4Kce8N1_1o4j674h4#Z^k|(FQ33vp7{j+ zN5Re&>35y}$HB^d+$pV7A7<*2+U~BFXiHXi{gltD8<`j>FCx1^^@<}-{>6h?s>6hV{a9qtMEvN#_UsAST zA!iRc>qEy1JtpVR$U}4m&p}Z|vtMUi{t8cumTxdFziNrvAkeWxpyOcrD^BRIa32IW zQ5W=A+);6J2obhhzwxq9B5tb3clk3p6I9*SUa#I81%DqSaI9fzCZHUV4lN z!n~zCB^TzaRf~#ZHCu-VBA$u(AS?JF+goZmNUb9gI4T3tfsm=_i0RGPuk_(#gGD?R z%8xqO8=QDUnSm#xZ)6vq8%UHy8EHKfNS2hog`sk6Zj|FO5JWmErHDIamzvUBl2RL~ z0(d*$DBoH58ZtntQDDFr;YTfZZq7f*6q*YB8;Jvs&eaSSQ9exOFHt|wm8-lU-xLT{ z6yQ;5V7kSOo(fNh*f&t;BXKUD4adYpn6^C_F~5YOq~C?3G?ge+g(oDg1U;)%6ju%6 zu*hTjS8U~nVwRDhsZCAy@;Rn8UWbf-=IQ#=&_oH6iANac8 z>)eg^Elw)Y3yLhI(`t8dr8?Ruv?z5m4rftQH9Y~moObHWN# z+SvY&n7Ej_G;W-Sv{o~U(}~gaF;)pXW{Ty?VYK(}&)czmCVUZFzVkM6$vf-umU?Yi z*^nsloy~uaZ|3+zBm|`4OBC^M$@w?rkRV@aG~xt5o51I)bf40EhtfEbq%W^jKc19t zjM~N{sgSEgS|WXy^8X9UU(biH4sn_Y#m84Dju1+fF$CH4^^{SS$SCXOI>f3ZkVAnA zbgfv)lI?IHQ1Ow}WdGg$%LAE~og_2yHY|3$>ut$x*!Hc+{nq>a_gnwalHPt?-f$xA z>wM&|$@+WW_4npN1KH4CIkY!dyD_)%00c1`_dyV|u{vGZv*r;3+j5n)*~$UAa)5dI 
z<;s5M3CfkhTxHLy+YzW(I0Y1_3}*csWdDYxi0toM-ute9$HVS@x!T5T?IyW)6Z345 zYqu~@w_MwutL^^v=azkMmVfKBzzMl_7mC_|w|1=gu<3Pg_U`O*e|qQfbm&C-%ySRg zpTFhHR|}O8O}DK1wQBXsmAz)w3L2_KIXJ*j=japx`~uB1h9;-r`50(o#cZPVB^P9* z1qmCJNmRK$oyIUO6HLmcZKSY)Qq1l1aUuN&l&*36>L~d4h??tuj2Iexe1JYf1EL?v z!RoekX4^K)ZP*Dum}xtNR)Fr-ozLC=T)Jy}dgrlB!|?@Q&flW7&Od0wNWX_FO21Fe zAHpg5-0vZ3ZsW)KTt0PA(MJy=llsxyJ^yMT>giN^U1OEoc zr?@1MJVr?(3mYVfEbJ3%8}jAM%O#1-%O#1-J8J231RzD^UrZa+0fQVdUq)ewE1Orb zRUOEmv)CMgm2KbN`^~*1lnA_ig!E;C*_>Rl(^Cwjp3xXdl%;OA}@S#J5gh6c$4>VxgcQ52k>Y ztddI;9UK;3cZ^#v0?=M*AQC2JkX*7&8^^qzHf@Ma_?oS-adX@Hr6xOa-Ov^3Q8%4v znZ*+edwLB=21l_G!&a%mka3?zr(Z+v0x{0{F_L>Ko@d#~qEk5Tr8uBRcu`cUNNZS? zbFvbXYy>F|7FcK7hH(w-Wg?1An$yVvlit)R<+Fqgv&c-8a^>U7wY2N}%I#OMpVQg9 zbT!+1K<+*8pyS|bmAky+{Ys(pkmc5KtTt7hx!UGj#{m7c_2$~T7eBT1((=p8C(|uE z|H|vEs>bq%wC_{4RhzdS)b5u(fAfJALAFcsdx1keE`tgbkqSj*EyKxSeu4B|`N01AB<`b$L zZl!3|$~J738@B#&)wYF0xt8{OyYKE^irn3|aOzJ3eYwikT+be`C~IZ*){57j&6f$) z+lg4v)ri0~exweODZu|H`cek)H_7)!a!O^ve?*bwJWrLT-s2lZ!1nS6hlgni;ci5B z3L1c&fI1N5&T z>K_=O37k#X1&Ek@0B0`)XCKC0d9la`I3mVAF*FlSu*u{^pOio|IKAS%ZcKuY4b1># z;y+6+bC_O7R;+W1W}8bWsk)?a1ht8cdg^aa0rL69CozNiQfZUEctA2^AC-Oq8RlGr zs*<$yZ3IK*X80|A1&_F_`E~eI!5S$Fl?t0_j45Smi5eapqueE;ABNvL5Eysei^LqP zzjg4wNmJ@FN?Q-SKd}TFhZPz&mdHXrm9FVob2}Odv-wE?P5hu9a-2d9iX_zzM6mI7 z3m$L{rbh#!76!EI?a3E*ya=eN#FA^?uJt>Ea(&zsl0j++IUp8x4Za}}#1yirGB~13 z0Nno$H|Zfc7s&Y`#j3#lUnqjUjd`RU`{g9$D>i{YPYYyc_+$rxtl{@_v z>cE%q0=C`X*qv<%$=E9IkQ@3Je2)O)#I5#cT6Tb2B}|8!Dil#OExU+&ZEm~Qf44v9 z3qU&nvMXSEOYf>+Z}C7i)hAR!AE1o^9men{hxR9+xa{AW<8DNC z3K~$If*jS!$5EYp{gsaPNA)e)`d+!dH}62G2-PWE&}%}7V>wl&2+Oqrv$`8k3g?%j z793M$nx3Kq8v!=j9N=e?CvfByNLvOiP$&Ej_+79K;Z%uXSREkC1*)lnaeTFY(dc5M z{UAJyN7f$|JZ4;R%C0@vMzY2i+Z5Y)>bg=+HXSZaXh&5E+@O|1-^tX*RNnV55T4^Z z-Nm&WEXw{{)Ki3t-~s(LuU~)dI+sTL#+8hJIPDtd4S5m?MvFd=s4Mn)^!huXm!sB6 zqJV6pii4eX<`N}pXX(GeF;q=ypc?20>%#GPI5rudOeNm0eU8}xee9tUU`pCUV5Z}A&HJ4883F0BKs61A#QNklo9Ri&n`Ksi!74CK~( z#kkw9)YA7-9)1X%=b7tU?_*0-uF#i)Hl;5mu~3g(+mos7TX5(6jah$4_J`805TDcT 
z(m74_m1sb5N{x|be@VgrNe=5Bw)M7`f`6_#Z_*n{8uF^}fz{#|$vZ8Mz;eS%&C2-w z!{5BT(k_P%tqO?CJFSk5OXF(-ZhWLr88j62OH=nKu}MM7G@@N}@M%U$*4Y2DVR|{C zD(J!)b4N)ECre+Zrnj^GJFDmdpIQb5K=-U2MgtmHlQI)1{B*l&_Cs-FjEzstBvQ#@ z6+f+(Y$Q`N-4SLjQ1jxn6E%<8wbJ|W7j)QA^3*qd)`&p?qetdvg=29%H-3qv*Nf$ zMm!YfC1&kI@#r%osko=Eijp)L6_WsE`y>p;@Htw3m04<0K~nMXsmD&@+R$@KRk+Z9 z>{${_ISpKCjU4K3iR4bw{~+go!%-@5fJdFe>`>s}P^6z1tk)t*`X1Q@Q&XqLvWSfTM zrlEz3TwNRS=Sw@5aSU{D!Skov4lHiGH+Xk&x#2wat@;LJY^y-FRMeaphp8L0}76sMq&OQkJfG6R6;hS-k8aK?0A z(F+LliRDBL1LBC=D{SzWtNMZk`$FoHiBsgTazWOTlrnYrFzCC&j4~-6tUO`v=P;;| zL-g^)WiW^#i7Rn`^Va0kn6&h)#I0+fO*HNe`(-Mqf%I^U!b+2yA6b};CKc;+3K3p@dVwA72VL)=0Z9z5 z1Y3E6D|G{U;QpJBqJ#2uWt(PV>77%C*kh7 z@47iC(7}b{xu(u+Q%G(KE%#&q4ksSgwEu>E#j`yBAh`FP?gure)4tR1`D#8ajXkl^cNs1Og5f1Qa51SQ96edSeq16~~8G5znWODw^g-0Uen&PedG;%vs}{R{3%|%FJF`?3f*Z)^nH5 zV{Xrv(E(%Q;pC{bAhI_V*sHH##DO|mE)t7jhj@K2TXy#|DghSMT#%W&{+O+9pVYCLj*Q^3u5i9{fa&1>XLX+!`FU1F?|(?m`c<#n)pN(Eo> z*|f$~=G=fGE~y!NVkwel1VIfOSSlLKFq?JBx&DtHR>i6Fs8DhS^*xi?f*jpbH=^*mZYTwq=*xvMbZFhXe?XJ?Vzt^v2zC!|nwiG)>9_xu)R5 zh7dPbk!cvpHTOV33?mpeCk;-gIyfKS0sL8NOd6RuR&wF82*4!llnB$Y`uvU{ap(gLZ9kf7o(_#(uW?JMb z$U7*?#mNKb;vTuCH}9q>4K9MSwK!r*Q9i*_|KoD<^|?@9O?%#fJId?mTedB~ zxa^gKTj|UcmFKCtRg$ zWDt*GGns7C1vyai@G!(pUYkr^lFlI$aNIT?OBQNcyc*XKbWg`3zJ;W;>`D!3KJY=NrKN~TXBSNMI5kcR8qYLFaiKJ#eAoqwZmZ@m+%Io0_%$DL z={<=%QaK7k0niS-P8=fHp2FTe^ioto$7F}V#}q=ad2Rk%Va zLxIAM8Vuah{-rk8m;gpESE+(&mDD7|6fMOudT9qaOitrdML(<*iD_22&qO5HZ-o_I zu8HUXgEy6w{yXw06>9CZ(w?VCRg;#JaBNFMDO1xi>j1=7!XgTX1Eog)I#6ju_yBPw z8gLS=V$v3Ts?r{hToGYvZaOs!2C86!SUp{z)ObkNF4QCv`z-y#rD19?5f6;&jw;*u z9HLbT8BS{;0Njnjz%d#O3FN8**{YCS70OnP$W-2(I~KyOW7gLp`#Owz z*oM8wFZtGBU!>>jyWZHPC(Zf>WZwV_w#mM>tZ$?2+nB4W0~OoNiVi|QqJ3Acu?uz2 z*7w2Y2zcTpCvBqD>_Vz`mZ}AUr}C<-uU+=FXMG1`-+{bMaF_qU)z8)j#i{bUXiEJm zgU^fq)eqpTzmi4;ZBE`GYo5ju)S_UntdPbwZ@mfq$eUPq=dGU~!HlIfr{wy$Ysz)Y z4Lj7Hh%{#xaMw!cNA_JIPf-!#EVR8pUtZwQV0F|6G*aISnT$r63cwe&{#gIZMhz^t zmD{%6g4bZe6X-EJw@M{!qD{rFis1{BAXQ*88+5S#bS`-A5(ouYITVA*OH(tkXz+p< 
zoJmMz;F+wj4~EJV+w{~lpC2^irT;??5r@oLu2OL*sq%izs-d=TyBF0g)K2k;*QYU6 zar)ts!GEoEvsP&jPe&x!bBiRRWGs4;ER3sSU`DXd?fYmNa?|sdV>FkSi?ImFwB~^(${&$@UzOdk$oKPRKpbQ$De9 zVzK#QO<%6AD_7T%>m9o9kvpEo(5{6^KjP(lg0C^{?Rr?#&yzs%K#ySS5JC!nYr6G7 zrs81Qb&!)mBU;lH33q6yFrtT8I&ST+V4;ny!>RjbY@Z3Tw{hwry(o>5R;@25ichsHtf^hp5aWH1{mUGEE~36_4<_F0TEfI8`3Z zIKS-LXP$>=z0H@K))UOKt|}*4PvhBCdV)sN=M`-}(ejQd!CLAlT3!=Vk&-eSz{P}d zohFDZYBAO5;QGq}mMW577G*k5Ekr9n+YQe8gR(!E_V=e<{cs;Ov@TBE{%NQnkPiHS z>>pT;^5f2L2QvQAH0Xgn$f|gG`?HoT(vBv6pN*Hlf``D7H&)cjj%tB)d$3=vD$rWX~elUQq6JN|k1i^aA1p?ao4V}DV3MV=&7LpcJ zQoyJIAQ{w=Op;B`^a91S)y(D&&9RcJ6qgBR)a~u3Y1E9u(GZ%glQf`|DwUGfHJnO8 zn(ID=3naNJ9A6i8SV@NTv(zHHP+7D{{q9@QZ2fMT{tn-N5zc}a<0|cK%hl9j8zT_R z2DZt8Z7WwY0T?%F$^n}+6AK=yu3IzFoK$MfNNlB?qq}N`f1FxbyJ(|=wfHG`Fh*>j zw?;uInIn6{nFyYIG04pR_Xn@x2OT1Kgf{1`6Ew(;5KC%4db3^-(R)aH30)O|hjj@k zNy<%q#NeSnnB*b$l-i7M7Q!b^i-~BVI41p7-x2JC#knaeE0PSJ-?n9V`?m9;eyTaK zy%-de31i!^1pUFm!9g?vCGHOvHy3+i-XJeg#feZ3jtpyWhbkMjObT5t;hocvC896+ zcR=ex{k0aR`=qxupwP%@4!iV6HjVRfDv#KYt>G>|$~Bl?@Q+01qs4X>o1Gb(=Nr_Zw`h$Ue{x%CH~`nWe-ZhaT5HNR9JnR#zTA>U#2+e3Dy;A zRSZ_%-LKF$^V5UtezUQ}-=!~ZWncWTs&i>;rm7D@BYz$L@}(Ua|E9ES6QgZFZH$IG zN;HR*I^e=Z2mxmK{<<-kgC9I0 zu89i-7y(Vx4HpDDpJhj^q%#!LN$Jm$k3K`XNe=bAMC~H|CLCDD6;l$mjQQ>)8C;df zp=0&QUxSAqQEcr<+nXLaD)UymW8X5fX|!*J-|mmRUHhHgAHcfNzEzvxtYS&Pjz48N zZ23Ug;P5QX<>_{RO#en0lT81y>e&sUT?w&R>@UcmkB7ry_Q z?|o*Ky!keZqigBN*N?q%?CYoBI8AC;U3sU&(Xm`lg>rgO=RD(EHv2 z9J{G$ytDWA-n;{T$TDi1Sg4FbE}?yR<*?kgGw-ILM`&y%f_^C~H*CyPUbH|>Q@)&h z(E4s&^^@cROe`zB3I~f?4{pKlaOA z>_@#MVivgo*~NayYcP@E!^+1@eFF?Jq?3Ht+e2xIqU7i2+4d1IgMRgo9gt;XS*d4W z&!YzmWHi&DQ#P2b1pE}>AWWQ1US(T*QYBuZ^nOHI_CrVNul_On&x)U?)+Ol@?USgL zDbxa{MQBOlntUWTVcYgfT}kFaCLd$_`bzU-&vXrWiS{p~MsjEiQzB-J?fxrGCYdHC zApyK7(LRw<&Yo+BD%fOI8d=;-YBH7_R5iSk?6>N|FdS4T5`Jq{xwG2R)6@lz*Ch3*>y2oMm!W$oU;|=y(s?GvFH4 zq;JSn7?f)5xBN5?97Gm_5JG)EHuu7(xb|Y*#&68+m#*S%_=7D)xcF5Pi{(QXey-$Q zVaJ~cTV!F&p9{Vp3Uwb^9TqDlq{aGSg<#q8Lt*5H!rmVWM}H`6{?KZ-Sl<`m{MaK{ zwtr}K!Y7b}l>2@t4E%|(|Gx?yh7x>&<+Mc>M%SDk%Tt!cQ)>d<*P7jy153U&0k?dk 
z(7rXj{j}V6IxSS*3T1@TYtGXaOY73}Uw`?Hm)8XHuSKnume$4R@4b8%G4QVSbXbCm zPu)9y_xPFs@7gho2MHvySU~S+U2CQc)%Tk2HmMoLESA8cmDJhR@Kc7Cz}lQOge0B! zLU%)J_)W&vwJ}StrCmz`@7g(w+p;m;+)owVxYpQY@ub_freVOn4R1wgZRaqe+P9~7 zoRHg3t_cX`FXPu@`_h)$tfg1B^ych=oI8-K>Vve@YF+chtd{1KX$EMW1Jum?j;Ed5BgqE zG8xdT-MBTl>#eA1Hlz|Kg1eb9HB!dvKMl8&iM?s2ai;AQ*U%QoEqBV9b({L14qVdp zw*8|$=erMp6exQ#$+TDEhwpK|=l#BOe&>AW{u_tGO2HE~{3xY{XjhF^VD7t;syH0Bwih8PoINL&{+28@u$M2#_1z(n$lQFF`^u#mVZ zYK_?fHWD{S?J-BdLE@HZRm>T1#;OC=ByEklVl{!9SZ$y-Ru`xvdA4YMtRc`q;`V4` z%pGvYJONLvDbPgn98qt~7x0mIRkS&_FR+iqoza$9YoImO7HA{s>gfL1fxrP0cSYM{ z9f1yr*RZwGgR#y)XRIsGMbdRqf2=#u9Xk{_6zd7}#Cik0vA#ebl+|FwW4d@8LfZ?`L?U)R5uc1qLDF(A zndIWhNCM01g}K?;1fL8h6MT3&hz%6peDxARmg~klObyzm?#lY*2_S;8?9l7YZkXlX1UZsym;E z&c)&{DNT(<;#`QATrXd}wma*U#I2oK(3W#7F%?M)FY$>mC&0iUtLEjo5KMXjRx}Y; z)E>rOsIB1{dUFv&s2jq^v(>;f3HNtXzVhmu!SE8k*;LL9)j}^VhlHEYD3w-jtll zL^2c&#^++eX_%`Z8F?-_$H&7sSoJYO1P|=iCo(gnlUnHv^rG2#Tr;cie7+NS&LOq(?N?0gIm;>wOIW$WlEPS^$u;Ps2j&AivO^z*57G0SQ^F& z?CS4fO_*e@I&K^vvkz+(lV)w7Y9-zFJiq2#p0Oadq6B||hZbwpwa%an{V?~_bW7D^sHA{l{a z#iuxlfrAQ3{cPfvq>n}7l0FNfLXkM~2l*J=WdgaHfLo|ldH&bM{$y;ne>O7P3sVIC z+WY3*@e@Ngj&uFd$o2l&`Q&sW-hUH#d%w)Nxi`5mFeP423jNnnaP-a2ORgZ{&QL27 zl*h%Nf{xC>pU@9y{5DnaG`+{(pSd^lsP0401JC0#Isf_Gz7f$gk~K7J*sAZGTsm1W zTbAkHO|Hz}ozJ!%%{2}_enV^=%GRF{t50OBo)^u-8&$P;VoR}YMsGLX9@#cikbZsX z_0_i3m!XE(eDu$r`)K%Q!%rL+Z;x&`YVN$Y^xATA^;*u+etUG&mL?7hI8JN>u-qOvsuqMvF=>9<^|Dyexs&wrT15@n%Y$t4rWkn9uz%;>jQsz;jga# z{OYz|=eDNpKnLx4?s`^7|M(mRSt?oowY_O? 
zLZyUxsqHePU1hzYU7JdiL|9E8ZKaGUBaFR*Fu|2yG^LEJ5k=24Xv|5k(kN0@c!#PO zU9_YuDO1YaM9nm-HA=tQh%A-Yk~I;iV@Of@j|F%t%e^UiSeqX$ zFnyt9G7fUk9x7+=FJLvTuHOY>at>ssWDWuVjUIPy3`Umm^1wjQ^B1vElMseE$EPGqFc^-81R)p{uzyv4!tZVVB=&U~ zf`!JN{I5@4iSI(8@GeNfU*D#-t0+s=l1VhXpO~8p-nP8ASM>I#4NuJOLQBW}#e0hw z_Z8f}yt_+ucdd=&d(MjR-+eZ1c-K*I*4|lMT3k)$TaJqG-+ALn0&Wl;| z#jNw+XIg~XIWt~F>PjlYWD|&h#H;bOr&}!9ZEr$#C z>b%`2+I8q-9pQBdZc{}of9?=?p|5_?OffV zB}>}4Oc$K>X$N`Z5A+ig67|52d@3G_alognL9|QEMKNv<1_7N!xjskkoffUuN3MWWWt?#6UH&}yuwxC?WIiUVK zVL;^3S~Q?+hBgtol((s&36RXjiO;c%g(_L-2B*Qo795=R>q}(z-Wq)QhA!B|Y1>BgK*8RKDwl2PMOB`x{${qeXWPQ;BjTh9 z>Nd4!MsuD+D3dmt`&4qMs7Rpl8ow|%$vm8XQ|8pbYqbY=3fV1hLSzMzTqQoOnTZIW zrWbWUhqZP+DUEIsIesPHtWjt!(chvT&=Y<=e-m2cZ()FAfegY`=P~2AAP}%aSY??@ zATL>{)|Oadj~^l(>eHY`qf~Dboo$)6oU?t&fILuVY6=@OVbY9j3<-)o2FA5of8Te{SEL}i*70G_gPyN2D)TICJ`8`_gG?Ep z|DV-iD|w9&gl{OEZ6_k#q4H|%Z332d7kdMh<{Pm$rpyYau^M@uFa!WWS5oU9(W-5% zGFt;npQ>u?W|CSje-AyYue4lUq!rcDKp3fVG+~q)ONJS_OEXksK!GrwJiuz1i3eb( z<7R~gXCJBuL4Ymt1qaCK5ujQy^u?mHe#dUciYHyNjG?i7BEiR`8e%sO#vvYr<|5HV zTyjR@fPR9c1Ph3Km8y33Be?-*!tI6HZy7mmZ7}~uXQi&t>}+%%%y&YB1q^})k~kM8 z;FQ$Xy6~II828RSRmQz>M-lG5Lr1ACZAy)S|0z%kczU1GPf85@vzE5|se7rltLxYD zr>=>ou03h_4p9O&_VpC(HOpg}p7n$4?vGot$6p3pR|{e3XcqGy`q2Ks4$Ob`TE6+1 z*nA91{iL)5O6z>9qnWYw(ee}~j75CD|7Q@+-`w_o)3 zKX&B2r?X}^hyYMDZ9|{hh6;_HOP4kq+tZgI6Y2UyZyyT!rtTs``TDmgy$?*QZV(f` zp0#PwdkhiNu(0$jPZR*as4nb|#F)*!aS^z%<1LPwZ z>=tK>n2Eu^fIk6CcjNS35>3VFvP4q_91&|WPA39xUl~k?co|H`R0*a-O5W`_gdHMJ z`VKMq(K4aRRIRTxv`Ql48VECdFi|2WTtZlu?zGwKDmyN!F5ab*4#e zv-AtxvAUE}W6-BhjhB8){1Qg0DND&Vl>%z*SL3B$Q27mMW>U}U-ZrOnnt55l0B9_M z)@oFGh-)*4xHj{Dpjl`J?Ic?*5K4}_wO6BGO8oU<=$?jz%$AgD~LtK^3xIY=LZ zyqAR?Dj@D@WN%xO&_nsre+ZfQRD`TuDZ}^S{%w(oyY_;tDt!C~AeD(*Qu^EE9x|n8 zX%wuBOi}}-(1Vo>Wk&$HDmW?2hujMHaH{=!g!&B-HH$_a1y*exz+`+K@GLU#It*oO zPn&xMaU(cN?kHe;VqUrN=kwy{jYlNAHbhM)K{0V*B7@_D|#g zJpLc*FQhO0jjt`!1E}41dTAWgu!d3kGpF}QZ{?lcpE|oYY7eeC*PFAQ7jw0jvi3`G zTXo-Z&ywlQxew#TlP_!b$w%3wI3k9X*9)_tD4aRj=+dr@4lfP!cReiqNYmW3t*lGp 
zaUqYoML@%%VOxDnQN#!bN1|kvNCA#gw-`^qTMJu`pvFZb3y0?>#V?d?awGV5QR+q| z*D)0~eWfkGXik~F|7wb2N^R|!vpLzIwx`YA3dhq52UI(r;BKLxL9Vdt zNol5WntJyy3_t~k%Q7(v4pPWLxw&e6Ez=aH#Oh*;9wqLhHVzywPFH{6Nca*UY*Jq1j&FS(lX^y zOAM*NKf)Hqv4vWtk}@l~Gd#}%@*LSjLqhqk%#P%W7pb7!Da-tzyr2h}LL&r#6B?RU z2lL*;qWAE}9a-<;ocB_$;WF6V_jmlXBklN)2f9UjOL}Cvcf)xwlU)1u6X%gawYyN) zk}-&N9q9|3&Q`c;a(02B-`A0Ob!}|DJ=c6@*$PbL{?&U|^S)uxH@s{q)HY{p{e|x1 z`R-F<_o-}MXQ6!{-+oMNKUQ!bTD!Uy$hwCL-u6srttQ*l^M#)F^lmowf`ik*gLt;@ zbhhct7mUtxmfq4EIvSS8;TEgoNWOhoY#&~~@`*M3>b1YF%C&!Id7LzRLTo>geSSRG zKCwLhneRa6!rGZ9CtuF_UP+Jtjoq`_k!gNn?=RRrGOSr;e_=mR9H80{6$h#6W|jTO zvcGz_r5jIkMI$%i_k@v8A_?*$@Zy7g%khsfF^~<^siIziK4$SLoW|}jX z)?R&lbe;Lg{xf^_=!n>RL2Mr_Qc&{A(NBcG9{LKSgmII(gkc)mqe}R#>#*tmq0J9q zfT;O@Zh)w0BH7DyBAY%GBZQ5u2IB=7Ccc6BqDnJgxWIhjB<721V4k&-d<`_3;LnHV zgiv&Bykx}SHGM--DB4K)iz7NP)maxFt~yNzDlq`lTR?<;{)Xz4E~QG+ppY7Yx}#*CE&YIE}(^k2+ML! z+C#LpvXi3OGsaO&W}(incj(iFXX(?j5?%0hp-$`B@OBj32Of>BH+v<9MVjt0q zWmEQ@;icict@~43H?d3~61|7=-eJ)@oF4h{*k@jUp}KCxcGs4#J}6cn)DoRyb>{}S zG>xuZ&NUv}HW_?X>5+HG-~znu@MB~C@Hz4Dxm@cD*njh;>X?TGB-8+DJ7#rp(%(z+uGT4|4!Uv;>jnx~?kP2v0wlKB;@DDbT#etQLb{qlTv|FLxk zST{V@v;%~A88AUbaK}3{Kn8hd=$5wU+$x#?;H@ca6dV|I>RfgFzlN5``BQxkuv5t> zpSvAy2T~P&*EN^q;HxG4E^49Wsmbmn!25pzk3U@Un5mkEyz8Lo0&w|3>iyK?-^jVn z@3?G199?;`*Px`_(vbph<|$P7v4s=btP@wsPV6L%1)8a`n+?$=-i(!v0(6 zli*nXBaD3wiv2m#U@(T`-Gm6B*aD*1dKtyS529H3DfZFKa0$c?mq6?=oPv{Q=|b7m&rQ-9Q%e+zec$0Um~rEs6}pp-~y_t{Oy(Z_9*P zReGjp53ma*D%gdn40g%9$u5O?YY!gD1Ms(ng6AqllVb@6hBn2(fc`>sCBGs-I7&ML ze1su?BnEpHcI~9@_vjTGROAB&YM%NN{Vf_^?}EK->-E7dwGz{`fFE_j?!$5M{}Kaa zdpoz@zsGD0ZbOH{F^K$ElyqB0%G0`JhYOTRbQu2L()U*1dgAOW)V6`5SKE~yDOA-g zkEWwJbJG_lApM`@j^GDf-L!vLG6jQdA{-3z-O!xm3@6}IP2zt{3B72i}5CZw;LH%B-Pd2&r!AFpho1{D0 zL9Xo?wBYy+pOg~kI^vctyU-naR*l_=h5((;XVf5H!YJRI%08w?pFu5Oh@g3ad_AjO zhsgMfT$=P&sxGgE%(@g_7@SK+;N=mMhlA7+IPkiM?~HlWe)}QA6d^{NF+nkCV^>VNp-h^cP(eJxc$Ivi*{3|0U)76=nY=RR{6^jLB2~ ziR%AKXP|XoP!N1+rRY;Tlb7gUQlnq#^iXJ~=$9bX`?u7gk${P^92z zZSKR=gVYvAw-2AD8wzcjy1L3I1%b3jmoA9rn25G~Hq 
zl&*1G7oh28IeQB}*`k}b^pIU7+1m@c0lH_I#}D2z-D|OJ3X;WX+DLl~j{0S8^?auO r1K<0;tglyW=o20NMLiU4GLAI2d|vjK%hq*>RR=|;bDM!8vVi{=LYM+> literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/chameleon.cpython-312.pyc b/model_executor/models/__pycache__/chameleon.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..251b7cd80038eeb95042d973b038000abb4c855d GIT binary patch literal 51176 zcmeFa34C0~c_;Y#K7q#302)98Xxt}3@B;4xBuJ4G3Goso!%`3s-2j{H2B6ms9tdE| zjuL~8YysmW1hYyACf*eoD-oR8OyEp1rZ<_1HZ!~1k4?MVFI3DV+9aCU>_7q(+1c5h z{eShY?&bqYd%d&2{r#p$RM)FlRj?E>Z1+)4J@1& zX^b}YH?eSjq&eEs-x6)@Z;iI~x3M@+q&?cv-@(EKkqyz#{!SJyjBJc{^>?vwQDjqe zbN}Y(mi{g5xj3>lx~+d(w7b8XJ$obDqdWR{uy9FaXLMKpE*370?2hi~-xJ;2zc;$C ze;1zDQ5>VE@7Bq5ebB-u~X`;r_$XBmGC%yYk30(WCuGS-2u{Ec$H! zvn*U0c`kaq|9JF7{|P+%gH@5f==1&0M^E;jjGpR06+PX5n#ET~&P31lpN+oI|3dU! z|GDUk{Vy6gBNwU*z4Q%(H0J%EF>=pv!J5~(U?6n-8zwonpT*W9wk~u?jr}Z(tw(G_ z=xFG(l0EAvy|JAvik`s zM_9`3NV($)DKD^;JCSnN6HR*AQh5$0A|8G>ML$ z(b2K^U_2BQO+BO6MBDK&6&C4hHH+?pgYlsYM?!<~2|grx4qX_Gh9aS{(L-aS!{HIJ zNR2qjj}3)lu`yoss&VH|o$EO)rNjpvJ@I&GG#(xsZ8eInLsTKT6z91}B&wz>I5-#! 
z^-e^i*Pb7z)G;J?kA&g_4qH-f;GF+LI;h=nJq z)voYpI6e>=8yN}lcyyi#M<#kkgC`~;c(R>7ai(vKkD>!APK5Z8Q1H-LWFk6xQZDLv zcr-M~i-o72JC_@EdhAMe6pHg49~%kBV`?`bs_4|jAlf@QNUcztXQ#^3FrZoaYB48{ zh6aOT>AA6?!ShPw;pq8LFo>Z<%93DccyJ;TAGi_i=UQ{YNs|+dHB9_s3YDK9%Bo@&N`4O!=Haa{e7UDTN z9zPu#j|>ilqL?qDTZ(#NJcubO7H2;7hT?kgA?HhI*1y4Y!tIO z8VZJy{Vba_Y(`1JGnj?ZLD3~YT^Jk>eMkrcQ{SuASsjU(SafD$e0+?L$F#SYSaRIS zlVY9{W`mDn%EB?4xzTVehDNf%iiS`lu{k<;DKrp5Yp)H2GwLYM?AXBY;81*wzb3j^ z!pPvY5FZoW(ZOryLs-b6;qcWV85Xb~yZo^FG6FZa5Z7-Aa{a~^K;!hALgt{6uvo|v zH0_~ZYshx7K+ciz1DNFqnuC_0HE8<=Xe4sON=`fFyyj?iiskAWKNcMv37rgI4Mn7m zpImeX;CjOtyBMwBUF@0SHU!R(?g}t~5}*|tXbX%^V2<;FvEe|N%86y99oiM>Q{N6< z7#tl9MPh-LEv*@8F6;^%WpkcN2mzu<#bl(ovMcanI2ga6lR$KjQ?&;!4@M?JG0_~q zHXag<=SM#z(m*s^5DiyEGv?f29~x{Q7#IcU8yFCs0|av>A{2HH47@rq7?I!D1_pv- zLjwam(VwDcU;uR{xJtc%SQ;PEf)O3MAX=sQ#&;p^4z~)Q`={qN#-rmK$HU_r;#gIo z_=c+!+js7`v^}&j50Sl8k#_H(qn8R`bdJ!o#WT|-6)ZH zClH8jf%gmCM@Fl~zEZ%sie@_nXKm74n|2n@9=>sW?)dHY#o$uUQf&E9vhF~_dq8mZ z+&A~|&ncy3O*9K)$*mrNr%kk=;dQRx6f}m+!^WWLb#uQZXr^fvvIi}bHf0*&ofYpe z$wGF7Z3sJo$n0X7I;FBf!jMeAu}{*_(BJ9-csIBy?jnA3-9<`~GCqr>u`+nC zacNKSn^&)2O?esxPvh6elAi8_y_@x|XuZTBXjU7kdtLZj?S*#(RO~vflqnJImHSnm>Ei-Yc*)4*n<$mjK!E}&8z0vIoeYCiFiirqqo8cQ zaAqoDuVj_huj?)(WwSAJt|E?{La$95rVJn`A92d~7V1 zMs%pXoH;9Z)pK+u;zLcj` z@U$lEt*rH}F8&bG@xA07CXZHxXdOF$F*Fos4dBfbV~WLj(FCl<(>mY>$YVpsAPQSc zeH2GtKmKCWgRgTR8x5B2D^`Q$8N*`fDo4NcNO+9_EHeXGRtFaZumX|cHq7=NBzYn{1gJ1M23qxlZcjq z!h%d20T41vkjuKLz-6s$ZSt9>Oc!MaBI7kTZJshu87GmGEisrR_7Whn|Z0 ze!_(kI&FWhFW>AY&4DF+>BPIE<_`7=MkxkAq*-RgMn5m|3D+vR>z3HB4Kk zty9K}$~;wSf#0kFw@sP&LJXj8It8DNg4&z52Mw4UggtWHlx4~~Wt+0UV^XL(235^{ zSwn8QLbQ@-y}*bno;qH1j9m`#d^i}2wOaWLh>_O3Tk6$e+^vC3IW z^H)r^0_O^6b2ufq#n7`e7qZeM;t3J)=gGT7UJ-c>FDma1wm4V9K_84R?DGbbI?wtWY7_^2Aw&%sbrS~ zy@#zq7t*j#B`S9`v6wD8OOb zf)WY1<+v$}mLjf#m$(W*rYs|dchpjXMt~0)^PJ)0L?AJD-8#j6rHTXYnletArp)h{ z6(EC-v-E)pcZ7*69|#9yqH#Qq2v=MZ-w@N)YUU?V4o{1bCoITM!)vuOXd{v4jI066 zh<=7d3nCw}W&mQY$TcF`Rl;aMqJ9R(MkCjFY6JK;5@c9F;vs9%zz~E)Svgv55?s>x 
zzUY)jlvUa>FfbGujDd}dQ7=lc=rxHc_!1&AC`_Hn(fR0yzl=;Vf`&hQjr%B{D=15s zSKTVVS)MBI63V-h<(pIGJB9L{%T3Aho|$LyGF84;DBrt$FD>Toq@X-iP%jkJCkq;9EDy{4^PfpobqV-iwrR%w(Bofl&wozvY)IHQFc>O2 zFrV34iHW6IQyXAK<`px60JVn=*Z~A?aB-D%jH{{isF9$$q~9?r06-%ETMhgk6bFv+ z7x2S3!V_)hNe&zeB9bwRbM)q)!Hdy{upZc25K9>5M_q?WU>K7rgvrg1?_wI(+&p)v z>`HjAacfSlz1 zyHgHn4TIcNxfWm!c}k8DJ8zK)Phf)l5Q({^wTvYSiHsfct-(k*eoeGWk}%Q5Yw}=5 z^?(s&q$sh>Ji!M34R~UqL^#Hf4I2c+3Nv0rlnKhvsQev?A#J049hA=lK@S*MtrBi& zmj5cHpjpn}AdfLOU!~BS@WlMguZ4rkGy5)DBD$sb^i@0+s9cit8HjSF_Yxdt;syz? zNWQe7irIpaA@&3lfyXAhp1hp0b1b2{vHt}gqjgI*&h~0tli)p?*z^p1Kr4ltS`U7e zSI=Kd)$9`RzkK&hep-6oD%5~P-iBuhzI6*#-9p)Rz_-epTRU&=OjYg_Dt9I;cPH$= zw67^q)|_bDC6w)&aj$_`u1=Q+q}T`kjVb?r!M{IUQlIcPEKCXBElX2Nqk^|LUB79~ zVyY@yv)S@MKRXDaIb?*^TAWKdGpdzGTPSW`aZnV{2k7jD()s5EZ_A2{qTHOb^piXaH90Mv z56dd&56xX$u^@~xo%u6E*UMLJ^yITxju{>n*Q{75fS0u`i>AeMi!LFsb;ZWw?3}Z3 zwm4DRo+xY=9386;7MovzdKNXVSn!*hjiT(FZ^Md%LLl~wW;+B&&8IFrN)$gedkBB3 zBLaflY6HAXil0&MhS%*RAcmOO%;;(h;hc;;#N)KZL3?spLLhKs2 ztb95EcfEYqNOJk%e})tDvbOF(dHy2mK^qHfmvR6A0xbvJn*=nB(MA+sM+vM_Hole~ z%;&>{F`42KT^GW^U}%)>c8X@&iDU9xX@*j-5g?I?)ZfM9q$ZGhbBm$df%=hh8A0UDER^`pgh0ovLaXQg- z=Dz*x0~QB5c46Rt_sK-VsqflPv#F9bqBI5!8B-5rj;Eej3nU?@OT#ej9UvHLDICdH zky3}WBxI&P+JvkqAESLnKDzKXGr#}C5B5RwZwig0n?=JB(aHAR254;fcafYYSjvAB z-W^G1F6Jr2KwIdsEa9vTvNH(ygOe31#MXLVe)nAz6*~YA{q8Hj<-X}oyUXXdrYgIH z%C4n`W%J#JHIuR2Ig^L=S>Cj0e%E!!CHS^11^*!Y&9LCxKa-dC_!D+N>u1Pl4xy|J z2`!C0|1ZgVo4iGMnc(ZUDU!VZiC*Nx1Nd4qS}bP_|Cn#FR7ne+5@aoOT9D*cNqCXD z&>bB83%bDGn}R0%ft8m;s!VKZ1$%Bo3V- zpX}pl=>xlG?vm0DqUeCH%Q z$w;r2ZVFm2ZO3qU;JS zN8fX2)}61QD2G}p(Rq$`Z4dMO7|)l|J1agoFgyyW0pm|-!tf+^;=fJaUGiv3u;xh+ zl6s6FBwQwtEXkaCDtP<>634y@58!dFh;!sK0D0eD@xb4mE~`sdv?t0t7GDMLo-Qm& zmp7(M{j)F4zmh0!UvpWjoCGk-*YdfF_H@Ip zVIT(l&Cwu8aB*#CF4@ZGiGrNt000wD4-Or1iRU#x`4+b{D+) z-1X=xS(g9cz0->9Qx^1;cWqgvYud}eyU*6I`MA(cnE`wQC=w{&<Rs<` z#kM4=Yf8zplK3yMmyBDg2>c6`ejn ztXjTO0={a;hN2V1iA2!Iq+YjNojR~ygS<&D-3oHamO(D5IpWP3rK^3c z#8I#4jmfZz(oj>ck~*s_wf@T6rz)#NNgdyyg!F5$PAsH=Q}JrW(K$aQ;q;efaHva&*25P{;^On?$EO 
z2uey^sV5EvxCqlvkw9mW7NG#p9Aa|P);Y$C#I^TY^W`l6FCq5 zo$8;LLA|o7vF5OkQ8T44-lY&DW?4I_iJ~n&&T1%;wzU35K~ZP3FSM8vQ5`@QkI&u%ad}|39h<@k$bKUAc>%IRNN*Mw=KRX6mMNN+$-Lh zaPLeM@04U@XH!*M1^mx?^}t`VU|86is^2Bl?^+%a>JKOVM;MI_+Ik_LYThm2zi&6b zo0l%FxG_36x)6VR`u4O?x)pJ*bXjGptVJkmS={>W-aC7RGOD2)u{WmYrWb-jSsQz< zXuyvr-Mo*AZur30N|ZZ_i_S&U{y@s#CHTA2HT9{QZlR_-lcfJlqPTg@V+YfY3V>Yz z%}-x!7s{Y#`EK-1R4Ch>!v(yUC~gE7P)1o?;1oaMA<=p?S$d3K zJt%EiY)G~37TR_%pZ)eL-+Cp{`dqT~ID1u5M?~<6>nG%vq=_4;1UG_$E2&`5S^-y5 zO^N!h_bF-dbo}~pHehW+c^fe=3jgv#qPQOIC_&z@I2he28eSC*mxhQOQ5+$AMxKmf z_JHG9M}Y}|2NNsTAyy?tw0Fu@0k{sMzB&~0sYHgB?+K+5QJ^C7j9lt&7~^tOl$()4 zMHVIz}T@$cYM zAO2quFhYmc22C6$Ib^WoZ}CQghyNRe{sEq7z8nr+!G<`cQugvbqPHKz%ihb&m$yD& zmEhK7t;!f@01jyrUQW_S;Nvz5!Zw4Q%gcZB*!5$x=jR)kf=t7^8}4jKx_5v#bLS~< zQWfn&Mf>8}cVD^lO47ZX-bq}X;g|Xzc_^CcdGpBiBMEooY|o8n=AM~9G=F;bI35@3 z?ztN&`4dxk3TA6(;xiK843$cmHjnXth+1dOY9eUatUf5sYEEw+YWy+dqfOW#H0kou zy0d)UIjl3YM@&J3wwd-aYDT(a(~wo^&S+?RDMT)-8R?nn>xGeA%BrlljI@M8a-5_? zrlg&=qeN8_r6rxRE0QW!nge}8B0WSrnZBl2jRL=&(rFVLepbw$Hv(M2e9KoV0yCzXk}Vni+}XfR90powI) z=`2ec`e`2jDl)Q7eW-?2%~j1$Ckr=Rx6YVmPNj=VZnVy|&Y!*Y*_)pQlaE1)0?|V4${7nL?LBI4A^WUSqKFP^OvpISkTUuRjfEz z5W8R%D=rpvbJ*}=x@d&Hq0+gM$Kvz3y2cd`3l^xtlS1}fMADOD7W8subql>hY0F9p zdoCq$3abM2-XKpYXR#-ZPK!5PSbD=h=U*{VxUTu_j@un9{Lqmu5Jqc!RX3*QrdDk9gv~b3oBr$mPaSxY1UA%5 zL-2TNmn@{QGYvZz6RNne^7BMZq}m%IyQ)R^iy0U>9{2_5w0^;#_13Jtl}2s%4=P0#MiM zL4)4p!q-&}TFDPy*_2gBSJ1B3hw9)4hnc4wW7oI`S-^0NE&Z`LK6^AWK0w>SljcqYvGK>sSa;9k_tCX?3UWAM2;w2O#L-aR zNlhadB~=%3(pgxAjKH2su<9~gi zw&K)YxnT9p9a%YC4;i&bC@5P&2`d%G?qGw|W>JF}L|y%x!I|a)VI0 zf!I9X188BKTX=bCTe4*5jPpTR_59_9&n?B1WqW4aX;)doRsW6+lxD))y==zDY%A$V z6qcn5whIN@S4>=9d7^x`;NF8xn)TaPL`%zUAcZ*yP3VrjX3l)nmBn!7lds1g-_pjyIyu=RL!43wR7cQrocM8or zmoKOGpBDC?zF%{OI>P^;Wh2^|a#suP>V+ezhV4SbcGln-^LOoy5>nG>GCNq0a1e(f zv(8fxC}F^xs1xvprq{ev(ry#RA=hct^t5rb1b|34&6KKw%;?MY_pZPrL3LRM{Or(- zDx}uFObW|NnvC*Pz?hj*p=(hVauNS^$JmGCS!yRh{6r)WUqGIhJXrtW?nv8SS~9*2 zv3lEHRr(WHrb_22*B4MBro`p;CR{jN&@gjU6-6vnCH?!apLp2RmTKB1H0@g6`(8`3 
z>5Sm6pE)tV1M}oo{>^++46bNiv2f+htDK3c6}Y+j=2YE&p>F@(y!Wmq>s}JDxGy2W8dy3kgqC!rsKj{*I|nB0BHn`rFOZ;D(E}m7pp}f^=$MAYEs@&bI_xc2tP_s z>Qwok@1utPKHKY22#!Np#v@g_W7f@9OC zI0GRaF=BPrr}FNj1OER_<*iHKX{W)@TnUe60^h#>O5Flgz(NF)C#^e4= z(0IIqKu#WcB?NPbMONennM6jN3Zrx9x=Ls)Q6Z(|J%>Va;J!Y3B+?D&aedVIC^D1z z7GzXiCITfCHHkNbteAR-F7nGjY5M1bG6bJ=aziOmLVtWY8VRJw@EyZ^gzqHGSAmco zj?<)wkMy66=l+s%UPaXzp7Yku*w;s0P7Wtyd>w+XWAT+FlHs`%7}chV%J>Sp03W%@8H1DSY<&gKmpnQDg-r|obuI%Lt4R>OFM z8Ojqj#^>9?=UYULeqok>&OxPqj6WqA!ND0)qVKcxKvfwlF&{asJ!JJiE^80AQActL zw$O|ZGnc_~XEFasqvq6<2y{0}_1RB^Qld%wTgVR!UiJ>Z&CI)4TP`!oNi^_CWeX44M zP{p{eiU&A$dSv0qVsFyB37m=JgYwE-{+s@VV5(`S(6n=TYijoiVfTq-c^}^9vt`^a z_}Z~AC5b~tccvC@)+{4&2Q_8&IQ;eZr-CqA5>yW^>PhjxfkMcU#;~3)T#yb~s`E%n zOTID>rj43LsIC%;F=!-p4%pwZLghhTp-;%8tD2#^izJr~6$6i3CdMXW{6A6&%>=lJ zS%e{xi74Mjk4yAu4TYg&9K|_LesDAffl>&Hzx+rvJj#Dc@6FNB;HYQ~0(cHxV4zPq z;kyB4;7Fx>VuDol2Zkd|7zML;jv&qj4Z>7jO!T6x0d~fitaXJ)aex@HMOhh{qB#x| zjm2Ua$AL2yST!3#x`0xqbg&c5#eNADgGOsvy{sS5v-s*#?b4}a+4fY~L80tmvaA<^ zr`ipv+RZ}k=B3R`@znNX!uDgy+Gi8?>NV`M7SV2MqO@ai5L%>5r|z2XyAMCeFMcz8 zJv?tFo57NV=dhUC@*+@ZQLu%TELgM;^wchFO|SP=s*@F5hQEBzrNc=E6PtPA38@=2M zo&VTi6#IT;I&ok~HoBIDO-V9FwOajIcu%6)w38EhcoC3`QDH$73ljyn>;$aI;QpmF6JRfY60G13&rV=e0rNp zBdJPJTe689&2|!7ySn|Ok~&Sagj-QYHmUQ#TS_M^)9$kQ%B9-np1ZF?a+;w=Bk3E+ zco?}kj_3x~ul)!-`AigTU9oc4gr^BVU7PCr?0TIQom z{=3g6&b@fw_YyH^{#%Zlj)e_L;^{IDI$_h`5LKp^yu;+3Cof0Lxc(B|PxU9Sk!nDS zv22N&EzX5QpK$nDahokw3opF=^6i&D#j|`6igFAgvFhY`%dPeyDszRhV^G+ol`NeM zvC2psMH+J3KX8UK?)f?L-+kpDet+T9f3vjj>>cP16R*sS2Fs^56r9HY3nT^YI6Mq9zZtaa zK>`a{GhAz71|p&LCDMU86(@>?6R{8-+kj&51(<_{B~C`LN;t>y4B08}BduQx?R{?I zVdfk?Nx4AbE^w zCp;1$mnpPteG(W^@!2+lhNn;(@LA3#siXm%Zr`40-+%MDyDt+LU!JaITD{dDlvLb! 
zY3`*|NwZMWoGvS0bS-=D2C*Mn+Oo!i7nC6N=kKOU~Yg=qkn1B@~QU>l&bO3 zR9ac?<%YZQN6Lb}&DFW9sj_CFta;J;ZvLHo(9HOb`+-y!=_|hZj-{UE!S@QYzXGz* zU668B3a-lec&esLsOh@r>SF3zjrTl_4;tIve(Cm0Zx7raNPBA*{L7o}o=TiNg)*A9 zp{7kmv*zy>)PeX$x}&#`zWw~|=X26EZbiCAmacXsk8^L5D7-(!@MO_b1XCH_|2=d% zNhjD&CBnH*J84Il{PFf(w6v2&OkSZc+|AZ|Rrz4;%UU@<(Kl7-xlW1PMLljlyH+Yp z%P2$l2m^y-TL8z<;#|jxorkBW;?nexwufm>U=?Q^iqx#IX40h@w17fW zy+mvH&ygQ&WpO|fpM#ZJ*qiii0u5y^d8_1B)y=Af>16pf!M;r*va|#bL2_wN>m*-WwENdvRq4taNzr2QLb7uEtR0f<0^j@w$wVWzq@hz^*&$SREWVVi?4Gqh z^pq(t+2Uz6_8ocj)Fw@kACvYwN!?D~R_duOD1c}`Cs&FSo0T8AYAr<%3mR4|_yw3K zE&s$up(BPuON&AeA}FnADkX~^q;Vtn)0C@giY=KV-viAdi3X%ul?7B(^6x#o$szxA z1nZgPUtASn>JbEbrX@kz@ zoaX;CvWs>)z(u;#JT75T4gzG75Y4(IggOUR0%DYw)IAiL%@8XGh(7R^-#9vVH05m) zyiJ*OLJ8DS7>E3ZV-5Cl%5M1Pd^c+5YBE@8VBexzq$(n6!unHARbEZHF1I{4J=tF@ zC;EjB^2i$vkKx?oX!=uQaQ*Z48t$VL`I#IYdX!uSMm8e_E%Y3hk-~!02=-vQwg#DwHB%1HK*IYNOD1v0iNXs6C_bWykIAFu&V=RsClu!< z3G3`C_cczR9;45=A%h>4FDZ%f904EFhym%Nx(pg9X9H4}&1q0b7Oa1ffY5G~l|cbz zNRe>P@ABN``Dn7N1EO^pZ9F%3ZvI5Fq;1AYC+Dr(er#cS^_-}8Yan2r{A6^Kl zvidYkb=Vi07PsB*gyh=i{5xmi_2${y*;Dg|q_YA9jiZDJh=9b&EIOBtE)Uv#jdI!h#b{8QPG%_5J zZ2?K;*``deNN&zBJD|dD;tH%%Tza`# z!ztKlmnsGn!bb+J`xvGyBp zSHd8U^QQ8F%Va)7$q##X`O|qA*$yS8_WPRQjg7$Oy6-MSv+0d}c-N_=S3@l-^$F%e6|<%sOMP zJ-N>6T73fict-hx_KSl_c|n*KddHWk{lQg*yF>s!?a$vuh>{+NHW#S(2RH?ua*6YLhN-gsJxwQioyb}jingagv$zH zwldBP{IbF}DX%t5Y^t5uRANTirsQP`;bf~O_x?E)uk{W;MUT_uU8V2ZhbDNQ`kDVc zJ=(&!Nau1W!heZU{73YZXqPGyhEALmVp909Gz|O~DBX3WixHktsZb@5$rS4gW>j8y zfykrZgqO_=e^67m(4DH^Bh>Fn*6d|xZE{rOeASXtE2L{Bm6D}1kfwBIs09aFL8fIS zs0gGgx`c|Z84uH_BjdYJYWM)uC8XmwK0o();%BQ=8Mu%5Y#(vZZV-RoYKR8|4PT#t z_C&JzfDky4uveuUcTuJ4XcPOy+=;I}kHij;tMjnMu}P@flq%ai^NjZL@Pp!#8++&W zer^9uFAk(6n)fWfC^R1y$`0dfimx%zvTJ#}&~ixd9hy1zpfa%FP1SA{YPWu`a@(ve zT~KnvJ?DPl-I}h!X{U~KW9OR9TvEDbvKAJ>0-C=OVnA<2%1aiB1@Cs!*Tqq!-9p3e zdp?-ZKtIuOs>NeT-!{4n2B)R(2lg#rO$LsB*L!TGit~4`1vs>0=iJV(?MD5|t7m%O zcow_L_&JvLl+HJ#DmsOV&ZR=3Vl%9lJg7T-_d>Gn`C0QXd(xHl3$NbXJ8LJS1kgxH 
zx&4A0z3@GE-O6^(w`FaYRx%D#J*eDGw)cI3ba^cr;YT08X}@mAkuo%-pfu&F5j-^u zJMVcq)^JR$eR0Qq_tp;zi*HoURer5%#l|`EC^>9&I642GbxK-ubCRGU>hQ>6!K1bJr}5rG4e;VsF}8vf|=MwqebbZ4gS* z7bMPt-dIm}5*P(@JlE+RBh{lj6)ej%7-Xs@vn)-XF@n7jO^*&=eLOw&0Qf#bzYytl zj#?;R@8w?kJkp!lZEGt1s?}r4L^Pcq{XJ!-H$(^1uTmcL3e|l~e0lv6b+jIpGJ?iS zy#PgZPpFel338n*N=e`ajJkUoY6{S6Uohi*o_?>w%$Q7Tfg1dsb;^2NE2(R8TyBpk zbwy475;c+gli96MDMPJ@iz;oW#lemRKsUK{bE))|l#cwT$W^PQRzf+s>XmryH!FoT zu2LNro3u1aNK4E1>&RAz4i})MLi?_}hznD2sbYr4(XQtiFsE$RA03tQjbd3z^}SJiKm z?NMzP>JQyLo~%DLdpuoGl`3cu3K|w({%*l$T*uC!H;W0%5nPh*3p|sQ~nOY z-;vm`4@W??*jvxvd_K{!H|gJ}#pR@gObL3MCvKidwCzs%_fR@}`CBjCdgbOTiS|9o zs=b08YLVggqe1b@tMbFJc;G9+HVuBzd=J{AhhJ{i^3+`Ge^|QedUu=(E%+ zT961z zlyqUePdTHM%}7a1MH9wMbh0E&;Yai^Lk1ZU<`6{Qd{* zeGVO%%^W-^>sUNT@>^15!8x_zqz^2@8+i{ZYHqdMY`L}Z=ElXQRNDce?ZDlp)WOrj z!PAMeFC;6@A(7`nO~b9LH?Q9M{7q2(IJ(d1)!|h2R-t<9(ja8M0T87)+K)vKYSVGu zAzP4U_x(twS28I%8ff}kC21X~Km8N=>Lg^EEAr$(2GwP0PU!Dysd^a9vK8!Tfnvc9 z(3FMF87TMHzJ(6ve}lYVho@*lsrW~03Nkxm=OwEGoD>|wq-oK@odj{&cm}K<2fI8 zr#%+~YW>te(4`*_@a#st3a{n0=8H8dU_M6Ii=U&b!zLiZxpo3F4xI0pJmI8i(Gy^o zy9Ka&6o^W10yq&eg!_p@(v9K)>5`tnFfREEoICYQPak3e2TvZBUv#o^LnzWAgAwXOdK&RRefINDi7lf(-1Wx0PJ4wO~aeUAyXI}=qq-PW7arG8xH#+Uo*bpG{Zh0EY(AYeaL8#!&hY3 zU}R0k1<`a3*d6&GPKe^c%?e=?~9SQr6A0<6IR%t!`M&3qi*1gdVM6$m6 z4FocE{Q+UsvBoRe>FELF_Sy*b8n`UA*9>~P0otqrALpAg&KcjRRQgpRHM08`C{(l_ zJMj$u!xXPI236NkP?~m>L3Co%mygG;gmF=|)bCI-LFbAlbY51+^Iu1EGA^cGXA-ih zE#HvUPW>*y;txC}>AI%3J8yTU>bixx?)!BRpPUowcFcWd_U!zY<=$^U|E=fWJNn%{ zFMRn-qHf1M_c>rYx+Ti!aQ^5o+~=4bz~3tGaXA0j3@>XqNQ}USqXI4Zk~wogv<{=` zIb>v$vmOj)h61KWui*$CN+2hx5-Lxb_8gQrOzB2+Jtp**al~5FK*DyJy1|G;17;i= zu)OYOs1=JM*P&J>xmL9#?ttwrmy(9ejABBm^pX;f+A%^_sfqH78r8EJ0p+qKhPqu* zLZ&OEX7C|(m1yV@?PHPP0FARmG{~)HUm#+gL?7&C7@~E9x&~oq3}~70by#rIR9S=S zB_~011=p)$5v7Lg_zE+0m^FSx`H~XfDovGNF+9f^7yx<&+nLjb>$(PHNyki4$oN&m z4FiM@J~JrB-!WY=NaTirfmK{jW$f${4MSpaI65*AzeJ~^cmnW*KC-Ai(XnSFG#a`( z&hMK%_5{4sd44>!83lIkf#4tOWM3yALYdm5T2wbJfd;hehks9mBIa{t{`a?^S*lGG 
z)TauXg@Wcq!{Vk?!3JUbnIAEG3qS9#G4M+m5hfdGHHzkoW7rBHP9OMTnO^)cJPAKY zgeoNe^!dr|pRC&3sE!+`j-+l;a%214c8DS?^V8e+|EcNj(Lc#coIH){VL?{r0~L}g%J1N~+fsCa#(8v=4KfuU3>>^0r zI;knEtpjD`gqx`5UzTeA!NF(VGyZ97vikUUT9;3~75n;@h1hTHSRA|!A$`|>U%xc? z&zppr9d}!W>f;G}<%-$l$bak53WuMsJpz}O!!^cqt8Pu|;qGU+)b>_53|nl=6M znVTgLGECo`UOWRK1Fm*WmTVCUwj}nS6g;OA_ESIl*u&MFF#PCaNQqAyVzhqWY&mE$ zf2X49pxvA`)Bit4vi};r&(mV$-+`9}-9c+dM>eBdpWOFy!adaY8dCM&_NQvM2(?@8 z1Fw3zfnBqa>b83x>}zxbsdn?fi?3((3&9)|$1ziynebZ=ooetJ&{2i8$SpfPgSra3 zRUnb_OQH(N{X}0@t~0r%}~OMiB=%q z32c6yK|xF)6Oa~1sXXTgVxd8P=)!=MJ@(X#J}2J+8op=o4IJF_c*#iWTSp)Y+jDzQ zs(!0bzcpFko%C-93G_#e-|qNU#{+lK>{hatnksG*ikp(fElGDPAi{1!(K4Q95Fu+^ z)%pIn_|SieafN|Zd?}7U$?0t|Xl9t%mWAF7(mB?kj4L)8fJ9YOb5QD;G7<~>h8sd} z=^{EwW?|rOU{C=dNhTp$aQF3CjQ?K|=KqXF{LjfdPmeT1aHjJtC9}zxNmKR9w!tHo z%BxR7Ve&M1@+{ELcZ6oaIeZzj06M$vbM0>pEi{k;-BjHcp>9jEZW~yJGF!1XJ&i8{46%JN?IR!N@w!fjKJmL zK)pPkN2B>@*Mev&y;Pb|15^o!WYf_2 z1gHg=9=HsP0epgn=U26n;RTAg4-c2H5hX#F)5dZJ$3y$}AzWQ6cXr0D;_KkCoD3&u zIJTkIK*WU=slr;Juy)4EZp-QtO1hTH|G@uE|8gj~saGgDj2&f9`J2<%r{~WlJq-za z1MBU+M{P(SL4g^@quHcm@(Y^!JYi@FVKel=Cg;DSIMxMBpvFWT-=jDJ80<5W9Ol2H zIP$8g;X zryPe6@G!;MEvF4~Bn2TfmvV1cb8ml4ZnLFKPKVHn+hl26EP2;|#}9;WX$dOQ}d9MN*{IxJ+E8(H9OIT8FF;! 
zlt5$S9{LkE?tTY9qU&-b63x6POGOm4uvG%!*YNZ!hA)}EWd4%nOIDx^bx0n)*AdH~ zz(dM3n|NU4CFiPK04PiC zl{m4D%puoSC9M5s^p(2Rpy?XY5v38Mel0S-rtVZjdS~lXr!9Sj04Ozh)erAiAbBuy z*G<=r*9|m%V7ykIo(#hP#ZMqruDx`Es9=z2jYzL35fsKGL+j8O!Ur{ZwbTO9 z#iUzOC&FkNE^=otM629FT55a|mB0>JFflz(W!Wyj${NRSr>C0e;3cxMHXgqQi$ZZ& zJqiWGWPwPg!Ieu(MJnk#R3_^IHr(%1++B*h4lhPPh!|ctX&jlwi%D&hDSHUd(t!C; z#DB`N#P+bzpDJt>3R_c!TZFbf#~{!2HKZ;Q8@S_ED6(YQw`882)^BOyHmw2_ljFSf%a3~ z2Ze2<@>B|irk%ZXb4$wIAh;X;(P+W_?zs4&3b&`SE0GEvJDqe$6$Sa`8{X<%Y)Dfl~=j-<9eBW!)Oijp{ZLr4!-eRI?q3UG2w1q>`t}s z723(pP#0?aU3-UwzoGgg-@7OS2|P=5B>^XPnfa5qzZCw=!2dNmT#?07(V4P%(Id@U z$vS#C7@O3T2&J*hd4G+DkbneeI=XEuAb0``^VBRf&U2w#$T2%)I;<~revAs-q%;6~rpaX|raNw zaD^Wnk4a_LogWL=vB-la!5Z1JC&c}hyVAqlz*N=A5e%9 z#Q!M0KTEMRj{G>i5)dLj0ee=&Da4?$N>_{`l)IHF7Qlc%_ysv7c#27F(@UC9xe2Cd zLiOVd%)V-AoqRgUAh7dKl{L5a+}x9@>=Y_H7pIbydvFgtyUXTcy0}cbs4Z2zODNv8 zyenCJ1lps;m8AR$6Z5B&#a(#zl+Ae9yzFC^Z$+;j#1r$_{EtbdCu3Q*YX~W_*-NTM znM}XJLhFfI`0^?97qDYIZ5hSsoyX7KX-g0qVx#$F>4se|io2BgEHpV4%`bN@<|3=!-IdV*&^Z&O^{DgC5x*A{jG00Jx=vojetT!HNwGhk0moK|}l! 
z{|_`piE`yXq|o1y_xJEbH_|bsFP4m`FoqX#w>O*L49fn9Vn3!VtK_NQ5-kj;vZ$L&3~uiQD%}`%i`NK+~F25uc90B5y#os`R(A?U2bLYY?*MC(k)8t4o12Vy9&gBbd^%V z*Dg5PKgHG9vMDZV0cPt3)L(Q5C!$dzu5hD$YzQ%`Q#tW}h2+_sc)w``JP2;)gZ0Z1 z*xq`}*iUyXfe#1oY{T`})}VuNs@5PlW@R;k<8}qz;NkOdDK_|DnTPj)hj#`GLb&cG zCru~3Z)uLuS{Ky!YEDW-%wFe{5VCRh=q-(a%k`V4yc7ZJ2 zxJqUXAQG2;9qiV57G;|lXK|Q7{~n*>|D3#EBkxaWb-aX7-w8m<&8W8o4IZA)r@qZ+IM!u|RBMe4OpIqk?1^i2rxU zf~J{ilE?|2$#N!AyKend|jfr%twifr8n=UT8eH zic_v?!Bstfk3@iyoJQjIR1QXEa|z$r@yGhE6HhtnkI;3_ zu}1gQQW;PJv&VjV$e?VlMcFqFEyb*N}={Jm!act(!UBz z&l*A27u4HlAfcc>g2NXSsD-V1y-Tn7ji59Nq|}+dp2S8E@*uY=1JUvifI5*}yjBTG zcIK3t;Z#Gt_C~kG78q@jX$-VOm07J*-zKFr?H9RJ+6595d!J~*!U#pM5Y*Wv+8_(0 zTbkLd6J3$9k#Kx~?q&{+NF}Z8Z#Cd{Ns4LpYBC4@2eg(R!V?SG$vn2^6uARqxEXfb ziBfnR27!ElLVrcMevd*H#EHf6aA@EPcB?bYursEQDcwtE*GXO^7O;xN2-3icFqCKb z|A3Md&3Uysl3>S9pRTy}|JI9wx@0KMyj=_?tY|qWk`I;E(UM0Az zRB^zA;+h5Pq9a+n8KH@qKsA z2c=c8uMk~2lPuji<9ZO-xMWRr^$1-(?^WIpyg+rSBICf%UVk>_?iSqLptT%%Dii+# znyP{wgqcv-h2Pa*g||M)4rUl=pEAB?xJHVu7=~P@CoE>WL&UJ;xT^+IVbwIX$^0d$ zvqJb!hf(E)E|K1ZP%ya<;D_&_u!-MG;e+tR0(SKBatJrQ0og(21lKH>#>d8`DMPX* z=^*58$|&0gq>hl7NqR3jp_77}7(&5;3mM@nk8qMT1LK1{&YyzsrJL%)L6T5OoF${H zCVQS@?mfPU@6wcf5tU-fwZ7^blXH^`ol6ZQsc}D~%hhVYWj<`{e7F5hd#Y`}(6)cZ zk*;f(biT0t4-*{yAAqDUtVtJErRz5=Ju6i0$26lIWV6G`&{5VLz;csNPXz)@hJ)W+ zCrfUy)xehg8NAQjIWHmv8Nu1u3F5LDFq+)@uOPC1aJryYU%=`h23 znrGjj@qkl)v3~SxprRrvJ^8q(P0kHuZYJSSA6p}AQxPK60s@_E0GXhLaYD2MD=ih; zOlO>{tWCtzDuJ16COKjAn{8r>g_pxIkdYaXT@suVAeP9i0!cupp^wSUPo509c|N46 z@QbJdrb6ki*;lKx(T z^%)0wbQttUBr|MclDr&?$@B#yBq<~Vlm`aUsw2e)(&5*(G2~NgxHHcnG5VXlqb48`8 z!X}}xNjfUH^h&bu(2N!FxVO_)1v8YlcHP{Ss@x$|?nqT07AghC67YP|O?z zeZgYr>P#J)L~rWASOx^`ELT?$CgET@gPzJ!!c#)@FMJrZPuoWScFG9x=F8GHE*X}C z(KZLlGEfb1?T8*cci^gcxd+*n07miW{kNB=0(XO8iP9gmRq^qxFW)!TJuM?$jkV}uT{E@N2U?|wh{|>sVwScL9i3L!{#FcW4Lxl zVwnR-B${xKM{B;cVNi{7#Zv6S4_#6|9a9t+MHj0|3>TdZ#l<|S%2M@W{6D8^C~;Ef zixp(BJs27ugh3wJ-XM_8AiD(3$;x5}8bp_rUV5;{E)0Urhw&Ni(k)<@>l%Nn5l7zZ+dv4;^v-Tbdm0waf~Nx;6c0U3iw%OOGqL&DduN5sr$E^h 
zmP6QH;C=JwuK!%h(|Hf?wj8<}7q&b{$y+-U?xwWIH{XlRW^iOTcFpZd6*mgSjfvt- z$h^5*@OJ*q&gTX1w%KPFT#GM3X#gK6>m=(Z(hlOHYpFk3d0@8pAqds!x#@)$@0V>@ zYFNG`Y(9om56U_h<4b4nm+hh~{$|wFzhU-px}jCNXnOX<_r0|Xjf+*w_WRzR?|aKv z%DG)94J*~0&#zOQ@{#`gWnJk_+epn3g(g~|DrnqXk??h*(*DiMl|p6j1Akr0zftgS zTrqOBKVyXTn{=QXEM!&nM^3J)@nQYW`ChGd-_NYwg9hB-T{Zvt`(1}oZe!`}iI1#Y zNgY-5zIXfgJ*|turAtC*&x7K&#jQ);`^DYg_xd3^S?Cpft+SQ~f#yZa;^*%lcp(`$ zH=FnUf~xt@!YlU+wxr$VDL2Vb=7ZmJ*RLElat-Y(M~oTNCp1j^TC4a&w2XSmJ4|b( z1m|zSk)FSX-MJ7Bm2*v87qdJa>VMrFeaa~I8Q|xY4x&QCAOjT zvlK_-1c{+KN}(1^KmHfVE2hMoDXxq{OtVwlZmOa_JHQxfL#kI}PR(&Y^c;e@a2Ba5lpnlM+8YG>h!(QMhfxCIQqW$%*o zv%KZ5_glL@rWn%Nl#*cQ=pfExe9V4k%|)eZ-@1fp$na( zqtZPX#mZwRD-&NFjEGJx%$E0OD+syPfDU6A=k4p-@ltG>+={!%g zAWw2DvHfv57{inyVkUx5Y<^4*+1i0rc_IiOTfRo|gxC#RJn1uvTOL!sq^?K!67wvg zNm3+PI?;UM2EFW3M>9;$%OumlL_Can7(gDrmNJ&>Wt0-t(R+{do{a9HI9O$iz4ODI;c}Lm^aDX9UcmW0ej}CJMQNN&r)C&CeK83b zZ=qKt-{7~=dyCW~jOro{M5f@#vrP?#Lg>yb3C8!)+aP(@D5K*z>5KM}mH6MLNCxx& zm_pwrPt%yJ11w?JXE8H@%n~58tih}lFl|hxVIUUqp%JV$Y({E(APnw5OXWRB-f?)M zeUF5#_VF*GZYUgjrU!UW1D_gA z2IHr8c%R?{D1Bpgy6#VnR=nfLL%Dn3=QjL++xMwi`Pip!CG-(sc**QbNk{c-FiY<= z95gJr*EsrJt8g3I64hO69DdiTs|-#YPEPDRDb$@>@OkAf{Ypyf~H4K z1gZftziSyWs(xeu5?`_y0~r}*VR9h$hYThLCXEY>BG(z!FEXm%U{Scisc=J32iOK< oV*0{W!o(f`k+?N2vk8sm*D9!NK3n&ZtZ z9!$4nTH~#mws>2nJ>JgpYSNKRN4$f@L+Q>;SG(YIhz45&)UZ3vI48#XmydfRS?2GS1yisUM4`%kq_h*LULz&_DFw1LBk7N$S4`d#Q zKae>XKbRSfk7mZ=W0^zoLp*2Wp5TO**Epe7bYDsv16%%m*RI9mhgr=w)NKEZnnzg8 z2x@kGM$MzFW+!TPeMZgCi9uF3in@EO>x#qgIy73})ya<^V{N+8rpLOC`E|!x&0f^( z6OW7g#Ub?O(!cz~ye+nO;ne2q;k{I7tVbC^!ey3VrptSClCLMdK!=Ms^`g^C?!z~aaT%CrzR+7%VrUJ&n4$Y zX?#Y?P~ucBCuVc087gU%=VoVTq+B95BTY;v=v^svVN?B&CMTxFM`yB=sVUX>Vmh7C z;wTDCiMhn=Y+_29nVVI6DTNN@87aXCPRu5yWI8RT6R*rjX(1t}7Kjqx^QrXQsjP5* zE{&A?x%1C6dhBZ3c~P1Yg-2)7bD8WjdfRiUteBM4@Uv%M+?n;<%q!Jd=q_|_W-67F zpOI!JL>ZJItM1vkB))P1LpPJvX~!rLwbINiePWH-&G|E8Qc#;-oS8^oG%_E{ToeT% zm7PLgjeV&pB zwa}|4w*Lr`*TD)!4lIz1+q@k8C-?=MXcz3FL*N(O949zLryg^Pu1U`_H^~dG*Mf1k 
z;1)fX>MFa|;(}+Fe6Mx>q)YI<=7{^2xmWqP|9LJJPy^U4=Bgf7T@%`}4NTA)#-e76 zIPfwp2qWdvKh;@55J>~#N!!E7{WgDt%bA5M-bL>VLS&IkKwfB1 z4#S~6Myn+!S{ahXD8w&XnRG02mzY$lD5e_D1@H`kzdXQkvh6AE>-?_u@P72Ca~Q<= zOEuMfQ3^Fo>a1%>0p!o*un1-)BM^rG|fmpM*q*sZkU3!B&%I<0F~)MF!& zYc!}7Y}PF>dKi^=^_ju9`;8X(7Ry<*ngi+sZ3AXo?vfcd@~Fn8mU7C7Ent4pHfW?# ztI3F$fA8DYoJH$ef_(ui3-1US{pI!;F|@G$N()L+*J}M)&nSPX+jvL$_f=jFu>H|n z&NE_*_C?1ZgGqeX_MY=yeFcK%dZRoSGh(~aVL>w+UF(^&q)?~A#i-DC9;h(aQEtej zf*0I^$B^YquBsBR;4@3ySV_Z1tMYGvTk_-{Fw)jDXPqnf*?MyefkhYlKF@o>cgwW4 z8hz#t88P(pJFiF7?KIkzf01joDsB3B)nL3r6|>fEw4CQ+wc`?PFR2Z|l$O8slZD#h zm*+%jK5_;MmG zW~Xw~zv7}C_bYlys|7>s`AT9MQeT$P7gr`H7EztRL=dE&1mtHbmq@7o>68E(8IqA`h}C-P z0ljHBO6y1sn4K}D=*)mx4>W&j^j32ijYhH6zd!&B5;ttY?7* zN4s8TG^uXXBABJR<%wh(ct9e?t%j0b!OGc^ zr7_C$CK40rq%0>AGBKd2@m&4wlOwsz?8t0tb_mlAx`tkzJACxW<-_7gI(2bmc0M;f zlO4GNh&rMHQ1Ml90{BVNlJdv|@bpwVGdw#l9Y)U!jg@d=*qCP8rE&)A>u;}e_i8wA z&5B#`wA}Hu72Ed|+J}_(p=HM%Ps`o92RG}EEW3+ceTA;0O4m`!jTA%mg;0+Y>RFpo zLPHzj+o6Mb&%u1?V6nBM&^n~F4sA3mtw)v}Z`2g~1`B5?gSHk_b z!zb4tEDWAh22U1S`-)AGYkx1_F_f<#z8CUDeQb#D)p1SzW*2K?@0@u1gc86@>I#8Q zCD3X1WpuFmV!pojp5N7k_HX#8dB^IDYhT_N-E2I(?7Q32wdPt6YzUhz$CiT_&yJo# z$BM0ci{Yl#HYMCsZ0kqY?bLM$tEI7%s*kUX8?BH! zw{nin{ywF3AJL?L*Xrl<^*#8kMzsBwcN?n?`@!Ha*mEM#xWUL0DM40Ul9-#5vJ<+x zObs{^1PrRSK)JXd6GWCfMqP{CDZ9YG zXEOlSm+_7z=aOs5joKzCtmwnG=v;IeqLa~v+Nf*7vWmK>x#%|PE_qPP1W^^WJXK%n z#g}@a8Q8~v^?&d{BeO$B#N5*3XafE|n@;7VQ;18CQ1BE5XDN6BLChzeqxf?aJdZ%N zgJr6GLgimp`O7MQMGZow$ceA!5=?ti{lxteGjqAwxm-2xib~H=2kyxk>6N4;EHqSc z9wQf3SQJb)upM$(gi9F>O6izOs5lxo4oWi za%Ebi53`Yy`VeKo0V=qH;BQ~!z|r;}C?0&U*wpf|{|Vmd+p6s-IVpl|+uXf&T4@?6 zxmnu7HOJP+l%|oAm!*ANxZyXfP3bwC&)L4!9J#T0eX-<3oEk@Nq_3w-Zc2GLU+B%q zO5_tSQfwR;Arqj%d`Q^QvrX%!QW+NbxSgrP9;OPD5^Q7YFe$;VN+tL;E)Jy_sw*u} zuj;~zIDhV$g}*>B?DLV(LkyOlbUK=YCLhgBi_ugTD@(KfOh+|V5tWHm?2pb$Gnd4P zTr`!7=4PT}`=(hJdL1l!Kq~+u)DtBvw@fq(3ljqa(VUdb%9O_}B>SVutYG$rm$H7? 
zOQIK(6PG~)`ZwQh_B_n$jcYRB1a%XY`~+%=#;YE|MDI|m9lx_$ZSt+0$x#5gtyC4> z3IE?0-jo4$T@=zB{I?4(tD6)d!yU|%WN_zi`8YQG4 zRZ?PdMnG-nxJ0{Sp+*DFiD?+`5@XZSU}feQ%bkt|qz94BcBFI~G1X&$LWvYb=?LY8 zCMUBALj)0y3^EXC_DmcXXmTQ#y22JqVsb7!p}|GHrtvzb#yc4TG*^SR%SmL>^1iC= zbcV!2+Uhqk*qf*$e-8m+us{U{+ai22th5Y6-UjcvxXvSGxew43>`{U}`9Sa5%SvG2 z(?H-W`TG5k)}0JUwB4)aqWk`MQ3Dm|YJ_#wtpp(Qcf}GFU3{Rk$q5Km^P%NQ|F&O6 z@H+}*+mapP%v{)JU)-YYU0rZ5kwrt-`pV_d^UWGed^*sR<9Y5TKOVD7v#2P&Oo4rXTctRaxpV<$%#$X&7~fu~(dS8*SpN+5Wm-*Fxvif5LeC+k=g>xG zv*)zpiRL{Wt1lO$y@lwQ5*^DQdTKLz7I|BMs||aVhQ0az>XQzT~vQ9*Z`nMXuZsxwYk#5qpm}pz&^EgV;W^jAn+x)x{Uh0ri5|7EKP&|M5&EjWfV z>DgFqSq`ZKIKoKJCnQl$2>_C0lFtIe0H=@zM@Q9JAQQD^a(48fRYlb;Wn`)%(IiMT zznUluv&Bp$2@sV^!x$7|@JYk!tVthgt2Ncssx`X&6JX}R`c+#PDW<8WqB-#am}Anj zH{BJ|u6jMIAx1}n5yGS;L%OuGNd{I+!LYGEO*!}%s3-qB1UqdJTO^^5DS=}!7LZ}! z5I9wAq%iP=GVlZmw7#vr!FRs$r(gMQ|BqXjL%--6HbvUn9Vxg*>?YLhh6fre&klyX8V)W z#I>`@UNi|e-e|erl5g9$86I4pR>BW1yS7Z&ASFRtEVu(RNUtDwhhka*8m6dpnt}@y zSi*;SDxu)d@lXEy2pD{D06x@JD5Fv58J;Pl6d?>@D5ET0p^UP$ri`-aX?~Bh$y7!W zr^dQ6N~y9kij=m4h?)ufGanOv{0XxDuD}n`Pin6YLIMng1Q`e+2w4Fx7V0VzQD*oZ z*0sofe9kV&fcQLP6{`#xyw1OFd);1jPWvf*uws2KRB=v=8r1KO2XcJ_#e(Yn}=zuD2(17FJ5Sh)e6}=Xsvf<1RZ}Ka&+&9Jb5Tg1T!>XKGgG1t9|r(V zrsb1}XjPj)g-inr8T*|hYmcv=yS4xQD<8DK$hO;pdzkTAnb9H<2VnfV3dn@YaD_>F z)kh~jdOP}B)ej2_ylYsN>J_ts9(C!Ju2R3%5Jvg~Y8aYiXD+N711)Q_|HOgq-Wn^454jX9lzjvN{&{q{#ukX9(b{zF79@eU5wf8&2>%H#{>f)g5w!fYHOr^(1fusm+<`Cf#b6{K=vci{=z2iudSKgb zI{?7(foK0`0t?!4j9dH-B9+SyJI#iy9gp??L$0}7%MA=Pe42X+_}>~Lt^^?b z8#NBD%sC9hk(J3x4(7~#2OkDLWg70EC)2)#o*{*iA1>2l#L6cBC4P~|X0ne<{|JTJ z+Vx7bZ6s1xrGHGp4GKubPy;wC#pxTIqmtRn5^)LXk0^Kx0k+^diklctb!y9(ZNh2@ z(X?Zg=|G~77MYAru*Q5`Q)Kn@_s+g`cD9y5^7*Y$B=3o6>m*4ZQuF_c6xe`Vph2*N3)iTG0=R^7GD&K{g%2GO*z?Pl>ki%A zfZs}}K*$-P-e74SZU?@N%p*dov?~&pLn7!K@HoW)pAN zwRO-6#-!42T4T@N3R^K&TUgVvXpu?rqZ6iZ7yn=q)5V zDTXhohaIJsL6|UhQllEs-TKaxU*>O{H9IuZC0R%dfrR5sB z4rF#2#NP+w-x3N7+`do?4Zd}L6iA?6)si3|Cc&hT%_)bt;z@{;b{Wj3mA9OGUJY_+aQWut<*pz zP0~L>7LI6{r1?JeLDr+4z9jdtqBK_SP80XhnUlPYo`CZ1dV+>e$5uE}2*;FgtPmbm 
z!lTQsVsm$)c|>U*S%xQr8Qp4%0POhxs#QB9=v4x}ra{F#&yVNp2kv>DZ9pY&)Id$H zgh|!T9y>fqn)@oPFD?1QPeJOnP&WY}pANFk6av=od?{Z)hik0U zI+x++MQ@Vf^o*SVe~HUDYoC&8?217+mEHkpzPRQ5v@ zNk5=+eVfq8OB)pY3kv=v1^Pb@rfX?t#*mxSM=0b;OQ#1z?Xasmx%LQ7#d9vW| zzwPfY0zicZiqWC#!7bxRXXA^T(T9+(@6Xqd<^!V$i;)8xT}tHC@_FDBcoqcS3cx?6 z*xC(~P-`DdLajcy*Z{lYZR4$(~crh*<^y1|8;Dch2*tS@2~Xu0lqj5#zc zZATsAtC6B(Rc-~>(e)8T1Gh8rf{8Dtki^q0!2gGewlH8NEnYryO3q0)h<4X~)VQ9kv zpG9i+`}BM_+Q05x|Jt30Kff7$(d_+B?PxL3S_pI~FdqmX1bRvjaJ_@2gB5(}rfqyY zR<}bdRGVhCIoDjC%t@CBn_3!@8D}OP5vS-E$sb3$KvjuX!4GO~HZ4m3hBDr#z*2)G zrw56GqcnBDM1&llT$~q92|dL?f5~a@^_A?-PFLR129WUxf13X_7j^p97E1JUYxFnt zQ##G_&e(e64J@hvHPIKdIi7evr;N>_HLOE9WVeLFdzi2 z%Xe!hc$x0$5NgmmD1@MSLXWlXqpW*Q+JxFFjWev)>O1@qxU(~@6CU-rRVQdrhi<3I zn%e~KyOEzZc>-m{R}%8vMfl;xYz#xIwH1f`iQGI~!M=}y)9`hkVss=|w%O9EH6`&g zV9q-Q}nyuqfx6y0)*Vt}oV``wgyot2M9S>TWp{DdtDHxzOVRpR&>{|0Cz@1f1aXdqX&6Sr8DtAr~ zwcn&F^|_Qt*Hx=F);g8AG*K-Pj#S;S&CulwiH1OWhq7wZ?B*$%c(kjhnjWj4+M2{m zY@abbnh2IYpx|i)vIl`KdbfXg{Yw7ux%Y+5{^u#D_rvx)2 z$L9X0p`u2Z)~TJ>zIklrSi!&Vwtru7^l)MHQDyYe!svNr^n79T1!eSw<3O}RI>_QKKm>%Y}zE=tF z#YK_Dw-+~F+6+Gmn@ahdp0Kp_#}Y-A3OUpxU9!`sSkD?5kICZETz|^WA5bxDbSK_L zXdM@|?GoA782VDfFmORiGCQkT%*KY{eT)s=?XDUd)^MTYQv*_oxbrj|w6$So3U{xK zZH5Qf02D&qN~nA7@WxnS^hss($$Y4LGxSv6^AuYIpMsteji~es3J4aJ5vS^4g8MLK zPEjyT0b8g4m0~oP(to1heF`iw=6|LV3jU0UCA$HgQnd6HBf}fEeETT2b}QgzBg{$7 zt;LaJzxKBQQM$OMXvs}6a*8Z@DdywayH*#Kw*4hmCBXS=eiNjEBfRri8GRlrqt9a{ zm$Sm-0daJ!Ioz*06mMkPO<6TI=ULtoI)1`Yk+yT_6DIJ_d;)rG`=^Lh0!D#{-?t6_ z?BB87?|6e9Q`GKulmOUhefSX&y&BX5K(wyGfFO=DEK5ZVCi;XBjws6a*~4m`zSJI{ zkseLX$;tG&^U~w^H0Ih1RAa3O=1#`eEPKsXLK3$dM#)p2O0 z6=e~y+rUIpKRYnOA?1>ze8pXGl#e>CjyM+`#r=8%Rp0Dn{hjRfY} zN|%7Nzu9pGJOk8KzhXH0Oj-nVkIvyAUF6XFWfu47#rY^K6FGqR=!K!7>B*rPbRwk$ z5Y)e5>%Sn~!8F1D%fvdWdpaq@*9#XppPK_28Sybmnvqn;B^hpxPGK%HE0-}+%%!n> zb73$@qz1tM%ok}8gcb>dAe8{5{nKmEgdl%O{NplO0fH2FqK>=%x@!-vK6J-FSd0wj zhab)loyxaAvV87tU1!N=5BI`*zc2RAQ*S@DzPQI?lP9S!Rn}f})eoa&S4oQFdRV;7a~`tj8IqG{Fu)kW z_t4AEcaqqW&4H^lu(nwYoNi|QA2e=z(UQ5-;bv&dfYLHh3^ZPw&v!p~D+u2PW+}y; 
zP}5Q>{S5VCk|Kszc8iz(10_i#v9w2!DMB+rSPFu_reX>{AcFo75eR~<$WWeK`C|Fj zeIGmRnxVm`8yfu1Q#@H3=$A|l^jqpS?FqgLdjjHM?S?&pQWf?DO75^HP!pRE!nlyEyA%zVZ_-F~5UKXwr*7E;sD`TUxnEUwu}izx zLnd48hR)#qTI^I_>gXN)_i4fYh>HGKb+j9Q_v>G`-7dlHs-}744ybG-#;@f$BQX?oDbp8`G3MPr)?`7{ntq z!#3khPFwTOAOjw4_#)i5JSpKXi*fj$lm8cjlAY)Ik9{2f2>(mY{}I>w5f}L-7x;*4 z_=xNLYf7H@IXCii?&*)X;~#NHKCwA?+s7P&-=NJI{v+=3kGO|FvDuN~;rM5eG4v64 z^uKctDcnP!*xbDB6Ar<>h8K9g{o27BN3S2f$5H%V*0!H-T7K@$FRpy?9*5|?ukcUt z$dTThUzxwhQT$%qHp+Kgd+x>;ucHp4r8NEsSo1yGG>>f7*=lB+qtvz?1tnH+??w9% kf0n-{-I%|QjwpWb0^h-R$zu6*0fB*mh literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/clip.cpython-312.pyc b/model_executor/models/__pycache__/clip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f39aff999e4ba0853f8e934103961abd4772663 GIT binary patch literal 39577 zcmd75d2k%pnJ1W87wSaezKEj$;v^0dyl;u(A)cTt9=6JqiXbXM7F7j6Sp|wfgErf( z8G~clB4?+EWVAhk`WS&DZJVCmnWkrU$8t|>w7WZ|qf(%|%DGgS-q!4nXJaBt0yOIB zKX&)`y_Z)N6O?wlJ7SZ>tCug|mGAiO_vN35LVf{P()FE0!<-=e72W8UPm3&HaS6h0 z;fxR$;$l)riBU1-h&ocvs8gggN79vYN8K#$OnOpf(Xx~`>Sgz?q%Z129#_(z3Pb}e z&7BOULeWsFJX)R#N5iR#Xho_rTFIVyl2xhdXf=zMC2LZ((OMSwChJo5(Rvp5B^y$W z(MA^cC!11hqH9vk(PnlZNVcR}qpd6+Otz&W(Flu&lI^LEXh&*obZx3L+R4(&lU=Fq zXm_e7+QaU{$=+07w2#FrlIv3a(S8=MOb(<5qk}A7l^jZ~kFIC&>g0yhaCDf(YmysN zo1&Xio1>dkTcTT1TccZ3+oIdpv)bhL)Q;$m)XwP6)UN0*mR6VCo!S%KliC~In|dbt zOzPR_v#EX2ee79%^10Oh=zbP&NFGQXj2>k1#^j;Y^U>#1hogsa-xObyJd!#ZJ(@Ze zJ;v^vlgCphq9<6qB{`Bh89kYLA^HNlZ%uwB^iMkE{0z7}7Lf1UW(^?g_LB`GR>T{=DPpghvK zc=vAn#(Si%k2~VMZ@8i_vzq!)(>iIdQO7GRwI8Vi(thccqWWX(*&v<`Nqb73ooCP1 z^!x!UdLd7&$k7LC!Ija}#oI{(_uwmU9boZvBFs63e*_Ik$g7&T*D= z2XgNGf}9sw&Rxj4`wMbjVmbF9=iV>KnP54eLC$BtAm?R=Af1Tsdr#z}jl1E?e`y_g z9+%EbuSl`?pVQ}o)xRJ0AF!`qIwq}u&&|uer&Xu_Mw2tHzJtG})IO)`J(7{+(F;jQ zEjuwfHI8h8JikKr;~`hJ}D>Tv2kKYYT^a0qCYGEWjiZC2 z8ag#SH8m+`(yT%xRG&C={Pd9%ClBm99((cN^G8mde(@#D6CceJlkw5yfziz9fy7uw ztz{$3(heo0WSsXytuT|1OiWE@($kEBK~O2)+V ztUi`_3L}mV(!3fyVb9UQdYVe}kx^c)q6gY72II)Y_@r8e`_xqC$V6PadKw*)(8)@k 
z_M$YE937KVSVwA*C!Lv!W8GqG>Wt;WU+lT*M3N6<^^plI>68>tAm?eeXlN}3dGr)E zLuynFXm^)Jr=(xg)fdnR<;7NKXyQj3l$c2fW5eB0owBIA^B6#Zy zk{p>Fk0fZJrY(8Lc1A{wr(>5!Cnlt1I?}tT&ywfT&d6alRaAonFv+X2rl*cZq9=>|z4M@8uEF%>D z=Fs{~YHIydVrnqMcG2L~>1{iKQ)uNG&!;U3Lx%!w2yvESEVtmRhi$W z*N-I=Q$tfT@*Y&ig69xPZ$j`7gijnEx3^Fs1gdThDgO2)S9{)Hee>Y$msebroN)*KoRV|g6L*0+aE7LhN;#H#Nv_F0pA=J+cJcHZi8*YLIvW z`=cpa9%6l(6E5pdaj#!G=8cQ9n4P#|77tDV`HuXWs*r~f04UVbPGm?Tc2Q2mV@YY^ zV&)Rys0&+jT>c7WeGx&L+QW*LNARoG==+yg3Z0l;cY!6Vc)7Ro5*73!M`K8pDTe`q;Cu?o>WxZqqC@Kf+u<4$+) zLV88O4{wDUP+jBG$)xHYOOB?~eJ<5?;^4j!)w%EZan-s1_$k$OeBY_lYGdq$lP8ZK z8F@Z-`lS~R#!jD%fn3{n`rz{~0aI9rgrem@1U_31^ftEzpOC>XzvcvD{!R&Ng^aNg zI|aEy6fzoHK+`@a{Ep)Xj`uW>6&JqccwM;ad`Wm+>~oCF?i!hljB|QE!l?rwlSo>l zEFy_?1Y`{8;CN(wQjS=Axif-=l0*ko?=flSpe%#z_n#$H$KrjWoWeuv9$6csZLYxu ztx=66iRmrMub~htxE~P!ElgF-qj2wHvl8B#^=|#^M}g{31sBrF8ww6F+>&qU%Qg=b zoMLEb+35`XF|8Ffg)*nFrr<=jzey9W{FXN$e5bL~Ifl4#8K{O{!is9%5TZaf2nofw zfmR6Jc;arUO!9(m@yr6za%u(72+jE8lbtJc#E3#6v9I5f36>KI7>N zsa3RpipkufS*ICiv;HjW8@4%qOE0^vpKW!yWxP?*tkJFu;7=o79Q0neExqi}ra5opVnUb`e_#BbXv~~E zjVja%ZRVzQE?cIy0J#a9$};b6HGYqHr&bDkDl zNh%dfCQ^w^dezyo$@`mVHXTHO?pM^`shpq7de?B!Y>-n%hpDr)?^%cc43RHFu-BZJ zLj$^^fOcJ`wOn#t)**@|{hDjeb)6yIj4l!u=A4&l=1ML?)bES$X>Daf`r|p*<#Ii4 z{_1-|&(Bc#J%>p_0%Ot!V5 zH`g$zGz{h%b}0?J=0o{#b2i+Y_4XqCI6RmSw&jAGmEh)la}N+f!7YTgEDO$1`HEKv zZMYxGZ8)lIIJy)(2Ap2uF9;r=|0z)EJUa4)>jA{m(_qzA5k!~M#J_WnIj1?R*cJ{V zj@_K~@NlLBc$7^Qqrke+i9Qz}ei|P6A5!pb1Zi4w#z3p>1P)A^7~D+e5*WucL|Yu6 zr)M{>G7V2Hb(@+RMu7d)+F_lMeB(f_ahKA#YpHQh*1Kokebe(O+;C@l@$5qrFdAD8 zH3-yWw$(WCf~{nLe0t0;9n7D@(!Yk`vquKS+XJBSI!Xih&EtoyUX5&3gPgh5xVw;c z$49z|lI3hwYV>N+l3aD|mS+4u8fMHd#;JQ!VjWSq&xv%`yjXv~`u^C113$Wa zzfI}eM^Bmyo=P{-R75IMjFZX~#0L>WqX$GsrQ+)*Fo-#)dD zHUN_t7l{B>A27|>bUk6o>+0M%J|nfNv0u# zzMA3e(Ua{m`pL}p*$pM-cT*>H`hWohX(5(}t)^l_5QxyPt0`i1x8d+Grq09OvX zJw;epM-4zh_9%>+F1d>KqWhX>&M8*{UD@o~xTvF`2?zkMl@X(Hnfg|8&AI11bFR6v z_no?+k5LD%5&0T&a^}-z0y))h;E1&9o4g{)5R*v6Q#k?eR2R{5Y{{#YjGN%2$pBg! 
zD36gs|CB1GIg$yM$#*IEM-+Sqfg0w}�O)2n1H?;_{YC0AH8)S^YWtPi{(N0WzP>ZRrYqmnSMUjS1IvQDuDswZBWO~lK@(2o z`%6IztsMD#6foHF+Z3y$AVR_06qJCCo0LYuAK)*22>~(8qPuFb?7s8=j^|S24GW<4{ye@NwI8X$baDZ2H=?gdN2Dqq- zJL9hJivSvsKT9qGC=5J!Bmf7=hg81?Ho&)8?wl-_PYT3&OauoPoEOGLaCBjCaRJE_ z4@%`vtwWT;2E>)J~6#8u#dlhmo(^rp^kf2JcoY)luI8(khX!eqOjL zMjJpwRIBy098GnD#z{`f9GqFVD}go!&WO5T)D3=1ucuKo*Tgs>We4aUD;UzMxFTvI zB?nM)9Em`~gc&)X$*4RMm#Jas;$rD3X)FQ#LL#2lv9-_4Qm#Qp(n(Mj>b0>Y%lWQl zx$$tedzAqgD$e%->d=t%tF_l+vUx30UPNcc zbf8OfWPtK<-2AqP*%9T?oI?(noUsXJA+~eKLGXsIs~B@pNC^CeK5#>7)o2{L zSZy7!K}sJ2w?R--{udO`MwZ{FfQAdpWC}m38;l=TksI3U;^uKh=YsT!8?kJ#ZN)EC)ZTdg#lhe(jEY zWK+Jjr4SG*T9$<(mP8HkAz}msTXImlr_#oHL3h|yM4Et`IK<$$&anMUM^VxAlv* z0DFyR5wVJaGDa>tP-AX|_YH9-lQS?%0`kVasE$-Aaj2%{cHF3C7f2s3#WiRn|7#?& zktqf)wTzU&?j0w^cv@qOX9rgu3Zf56vww=Z(`yi17iivV9=31IweMBh_da-isr~tf z;lo+)VLmJ&=v7TvQAFGsLqkOVw}o$u-*(>?ZrDnZ{t*abbJveM=H2rma1}`|gMbG* zoDt&Aj6s%N_M+O7OUImZF2D(G0xpsO$33EYX$=D0UVVbAt41x$A0YT`Cpzf1rx4BV z8MG*|&qmkGp3Dh4HF?c(!(%5%x$HA@>oL&2em!pfA~k5<=`pMsjmp2~yy3bi&N*k% z4|DyfYASpRMo>kW}0bB)Cz-m)l}>9yHt{G zNFq%QaCH4T2S;j{nv+c{hS>v8U340Z&^GJP?j%e|lIgvS47JoE0=qD^Js=q1O|G^_ zsqI-TdpLYxsrKOfp?qUou5n0d99q1()VO{A=;OAvZ_nJF$+hiK+VYKeh5#y>WnE*c@p(l(&|1QNkDd?bpkd)kmKy_V! 
ztWN${bWa zO#b=}A@8msHB0M)^HcEct;9ve$%I6#2!~v-;CecbSrTy zTyWD9k5B~~pJHC2y?4>Mcy=+ML^cproMSFJBH(tjg^^B0jEe zE4V3wimY8LV`*N&UwN}STi2hh>{omP%RZJ`-hg+k>MFSLTUv~gyh6iZ!ACJLE>$-N z6kprt0o?L^Kw~?GKjYHD)-u5mEB8X;3_xw-E&#EkF7Pd|iILoPz)RytAOx{}>RBb7 zb1`N1Pr+4qAt)(hT!j~a(9g9}WVL58Zt`#`agm6GmMme4<$sQV=vGrFc?0tD?_yQz zSVaUKYKP3PIZUjC95#4q&H&hRLX6fqyNXKLfcc_QWqPR@O$X+ce~2WqwMk4QGO(`+ zTuQ{_U^kdfznPx-uaH$9OHjj{-H>$CJhh&LfK!v{1ex5#jC7cQJ(;KrasPs9^|8FM zcp}BOB#j^-nMRXlaW}h$AXo9=59$4^!$wUXLN37bN8ZqlGR52Q(A)I5rYTp`t<-eS z`#x^la(_Ix^?7CM^FM1`YCARWdgyI_)UiDutjYz~DZzD%N0yzAwf=e6&%JGIsg9^V zMw=0B4B?^9yKu0<`O2||23j}JlF&vgT_M35ri!gtNBC zJ;OG^#%v&x4>x4Pkwx1Unv5jZ6ebelaEqxjDn9S_%jOr9Dx-@ z{0B;;0EY*J^h<~UMHC!fxBt#JJ`?a$XcmSxe&VZiH$ARyxV`(v ztAJ5Rl*wAm1bH5AVtAHWLmt$(d0M7DTLke4lCTx;Kv)9tps*-~cz{?Mu}OTm9XX2} z`fE}SJ2KV+^^n63)(J79E!YL&t8E>2MePQTl-e^ z8M@s6WC_^j0tC>75%uc4p(xAsd8OtgUeucvX^NRLha62Lvx! z>bou^M$_^c)XEJ5255IZun@aQE z4~CUB2k2Qt9$2kqP-z*wFM=7Mq^3t5o$n63Gq9M+t>34t-}k}hKiT`!y-OW0(!;jA zH<* zlz87`wB>=GPqG%M=`?Y|gRK4BNbn zBBq7ch z3TYBbl9;Al6Vr%F?K)V+*00Ic_bc`Nx%!Pt{l;AVcBOv%e9%bF2OHixdE?~5!2HRj z;Cc{`4gUEc=&y!OUTksSQvDDw^`PrCTp8 z1Q(Opwp|}oKCIo3F4VRybmlsCC>=WNu{{9xoy~`<|dTX;prCdh?f* zn}W?W7=(rxz1isYXWQ0)Cg8WQHRx_yIQZ`Ica9eXMDL&ZqjNtx_c=v51xcmH@Ml~) zsjxhUBr638COb$<6DS8=WIwpg-=% znLL|Sts8tYrn($!I>b-L8Hd_N9G&L6mS7zEXtEu1t6Km%fDhwiy+;Hg32X4>_qtMu8H z0M>B;YBJ^o#;=5ZI`sO?U!>aLB%~Syt|c#xkPq+Tx4esy4}@p{ChUxEBiV|<@6(~s zbOHxnBNt}q2w3bA4wm9<*G1?u&l}m#?~Ke=K~EWvoHz1Nnxk*%BJaZEpUnD(UVHTl z>B}xddaWQCS}jy&2#7M!XB(wu54JAbzbHqfFe~R${ai{xQYJOPPt~ysEcx9-wBcp4 zap8j7_z0Q6E{xK&(Rx4dBbP$0xLduEgJr=k0C`hq>DD`7L zE(1C68YrAII8WBL7j1jvu=D{!b+(_IBE?mApuvGrc zypuSzmb)zr>7~a08|CwXn_qbxZn|>-QpT(|VhRcGJz;xF$XjNV8|kpQ$=EX#BUmjH zg81?zofB^)QxKz9OQS52bY5Z8wnEt;Q=HT{ei}gRaYZXE!jy{Pf}7GIJ?vZDtkkX7 zv%#K>vO%p=U4L`t*7}75*}h#$=WeBHcOgU%%AxQ1EKIRi#DKf~F{F=>H-7F$oWCsf zVhkNHE*+IG6XQ`#8Zr$G4w-6XA%u%K%!$vuN}gI&6#aF zM2}vG9zjCFhAV(|>{rWZHJmv9!U&@QO;o_yRzo_7ZW;t%4c?}N-~*CvJ2@eUFiv{lv{{k-O&@W=LOj{l0M^)sFfi(MmDFnxN+1K= 
zBd2FwL*rw^khc6sjF>r^EO}#JuoR^dQ>*$l7zu|EY6wB%3OCpqpOEXQ7KZODanjK6 z7Qpeo6D@Hpir7Zo_@)FMGE7K=N^ro62$?esOuT^t{~2wQe~6qA@T>$aC{PpeJZkA# z^yGSX!18jbWhWP$RQE8|POB1ZWsy!L*qN`aHBiy%o6i6f)et7~u9OQkU1ly)39)jv z*}v=MJBL7G5!N8sPk+l-5qwFk@f34@edXbZsy(JKG(^V10LzJ}Ui41xrT{*N$V|)t zP!HDFLHx*vD0rR%JDB>V^)PGcyL03-tx+u(M$ES!1w+&kwA0;2b&KT7^6`8h@TYfy7V z&{AjVmWgXEhL(T7j5JPE+B4WLexlOCV6*L{9q(tJTsV)-Na<40%1G2xc$YC+AaoJk zLaZ%Y*}L)nCGhnLhT62O;ue-*P{bcNU_Irquqb-%f_du8R`+19*Rh3zCXEGHZ9sBU zOlS0l;G`kQLtaDY@%PjhD;)NW%=(AOl>zZ66WEr;+yRO3q^UGuaw<86(`v@kBKS-= zTlE>W$o~n`qx$&NPRc0`mCKXlv?n%!gczz+1LplO+cv5bZCH~x%1n(;lMTpF43ZlO z7hBhm0h@n9yYC;M48dkAd-ABU8z9wyx6y2MFHEgKgf|h)Ee9NLZhd?A-Q5JacPhO* zKk($9J)=B(=HZ&N1i>2~RX5$)dO!Fx&j(k3miZv|u=)&v>PKr@RxR{%Z^IXX=6{Kf zl3bffuhH|;%OvU<*fo}^nQux|0IG`swiDuIR=^LJyObz6Jh`Q1V z6adJtj?8;;#4y*;t2Fd3wk$Pl<3Jv4hC3Kp^*T7U7SEA_vKX5W6AI3Zs#)pSQGqoB-PS zmxuhU@IE-+GCdpd)^@Tp`TLoam}RoXbZXNRor>wFotSKj3u+h^)VU18f`sb{AT0lH z2uiS^_vu^)EPz6w-U+mpT-Q7k;0msJo{oDg9EayUtUJ062yc~*IWB?*eHpa)wK7wu z&M=*e@a|mM?+8C|;)H{Ndv7`3Y;d8>HA8}6#g;R8BC@BigJv9=^$uw>bI)Z*DL^{# zL-d3*zC_)tl}`*U$4>qe%wyAWED)x6z9_L>t$KLW0}d^ znoH5FVgxJyz}#YK$jZP|{~0x;zl}Z;o~p9&5RbVS?Qx(h8yL(-x^t233jPOM9!GX& zgDsEh8c31_)Yj3H?b-gYeaCXxi{lODLdVXVM^-{YWqW>2d%k@z-`aWSYYSgnOe}sq zyJknerR%@?+^|&&BHt%eV%h1hC-GC@zkTY!Et3KM)U{^cfS21;HE_>LF**oTESjQ4 zr(d)n*&>t?Axw5rgzyIkT)-sgc`UyNgqLBEam@{BGc=O=(m`%VT70t{Cl{@CYvs&* zqVQ(K1w%6#OY{`v)w;t4hrEyG!-KOC5)}BFwDT7zFYl)ZRYm=WK_Ea*rhG#ea1ONTU;qBs=TH2ixo>fBsd@YS z2{qi4ujt5C^eGic`gz56=)PO~=sZeizPc+L?4m>R8e@X9t0NF6m+&Qr=}k(Q6N%Ek zN5RvjK7UGC2m?IDgls1UNv7ae_)GsGG6Sn|4`Fo>0_BId6ntyIjsyw0g%GpgHcGdk zHoEl+UA>E)?;O<&0nbq(2!EcG3WezbtclA%tEAY@3U`Yp6e_q8N7ZY43S|_7^)ItT zyW_fZ=IwKL&wwPR7UDSgkAnTP=vy7I$&{RK%cRtW5^(KjWqZ zH9f(g0X=-7`5g|(7-A`taz#0VvbCTjU9bfVu**a_Rs>_RW0&o$CR{+0Bv>lB zidZ|HrmR(C(DZ3SNi0|zKbP*eNbcZ2T)^TFDDMP06A`63QF_FU~6qJ_`=>7nxxgvi6VXk8|Y?HYkp3;FT z$+U&Ws~%N>JK@G!x$uw@9$K7P3h&5zcW|hbr^X2t*`FJ`f+wrX=14*d_D(~J45`{8 zNUKH%HrMv@IYWJ}S!KIM?m5}Wn}*bfz~dWe6Vd=Sa&WRO$Rh~!HAll|mf@JR5>|Wg 
zQ>9twGo6erHZ`o~Es#aWs@rKm&!Tc&u|hU$b#1h?nt%j^#@=PY1r=NfhzMx*_}$}3 z&v&gWxP=yS*3sgJVKwjs8DkGC&BNJ^`D{t!obsn#@15=gJWyf&kHT590yCFx>*+1E#~ct*i`*uvJW*#4QVc;)7*VqDa5hnI8M9c zr+Am^@CxJZz*=vG%&LCPoz1y58v6-{g#$`M-%WQu(v^#BQX-pf zhJI1eaz|Qt`TpUD6?=YBQCp}KwjLJ?H9}*jmiJMlXVJa*&4 zv_!r5qu*tGqoI@#{vsL8D)N~xPYBy-XMq%Kb-AX`GmysyscqE&6D>05h7CbI&$R1x#o@XLeRThAT0(hG zk+GrcddJ_UzPYkjdx5T9b1u2ZWK;e>&rYTOt}{ZMj5RIyWRguBhkP6X)Z8Ttm9Yq< zQk!i(eM!RkyTnCeD-4k-^D1&_G#!~;qlrKVA{St~Gc!3Ip{*d{*hD-s>(%9>eO0!M z4Vm|H(y@e#3Brb!co>;&B_?%XhANE0WFj>uJZUju4mXD@XB&@kJ&ftmtba;g%iMo| zNi9=FPZ!8BchyhqeS;CgmqOrA6t%1(nk!<2h8C|ZHEktE2tEv3+K4J$Y<%F!Z9k!G zKk=}6gmg!ZCXdv!j~jq-Hi?YSIGaT0FxKXpH!IEX@7?$)P!7MIb2sL4;SMF-u@qRl zFrMq)t90-EFt8VvarvqZ+2QB18}5N9Rf0;_g^$D4HxJxCdF$j-P1jPm`zfx-|32SL zzDf&q1_8c`B3((0J>jKX{tY~|o=C7@sv%zF?o-}sPzeuay@PDN?!m)5_ZO|nI@S8s zJUX+ZIM=9e-#}onTIhHvR7wVi*gt;98{`Zutj#ng=x}lQoFct!o|>0 zLUwF#VNKTKy3~&sW1*d;J3GOnwX8A3&Jp<(-WV(iy}eqk9Uf#(diQtD zM_cCKcJ}RN-dJaD#)bbc8W--lRyOCcJ&k3Y>Vji0Tf6t`;+vhTlr_$I$!jwPQbyit z8k*uOZsWj)6DG zIS-Q*1m}Wwa?EQXND4x8!MU=zfX??@Bn4Vo%ejUi4y_u5onFGMMQ@o&3Jg-u(mO+0 z2uZ=8Xp#buEMKLaL)&MzT-zC%o=+8r_@S~_Xh(&aBtRx3RnGHGQ;coEeTb=k=9eZ- z-pu%Bl+ZLx>ftaeBac#Esz$y*0rgHMB-U4M3JBylJ(4IGN1z*{Fwdj%MM}Oz!E+QO zD7Z|)s}v+Dpz71*s99(9`7dMl8G}08LCT~j5tNnB|1oXscGUFc`20t8jdwa}bLW}{ zm8QX^x}o`i$@{~tEuA98$*hMsLV7y4<*>5laCXZJ`HBWGqZKvg!$-|+3$Azl@Awxt z{azRcJ|i2Knm66_<*S<)0v}fQLtyY%|9_Ap43ivT;}dd(x>7lUX&Ft6KxSI7?@^2} zm+Hki51f<8%y3$Q(5!rvo*tv%XB3d&*%UR%6O`o(1PxP^hXQ<*PDn%IW)U>3bN7I` z-?%m3HwbgUPs6x>ToEa_Dczzzcdjazz!P0Afhb24G$i(N37U)&xkTmPRxZIT zTdLsu(`TpBfpRTSBy|W$K||_bTLMytawyh9@FNV6O@EljPXxmPKsz3de*>(IdJlx(pA z29t~L@66%#VgB|V`q3P}7dKSw_?-cfS>^!N`j#3Qw?Z%3g+4Hy2Yv4;qJ$$jEKD?5 z=PqbHD%zp-AkV@2S@|N?3!UJs*O2}m_$|HcK8T<*SPu3gPvNO7f-<(4hZr|SjiyOu zdR<_y)2Y|FFg`ueo%MD<$szt-b(2Brm(eT}@M_dDWw9&mPOSikKi; z##d$qh%*kNYWWZa&r|SJEV4Sr_%GNC4)GcQ$g8 z{7EyX|PrYIUnwDl(R86q$z^LQ3p++Nj2Ak%Q#9dWcObRdnevvabz`|wHmoyF}7nL-mC>bJ=*mb#EkD0%a)YvGK 
zkV6}SW>{^g;RAiqp8zn~wnHvG$#H_h5?MBGp@l@XLSs*i6lMV+Z$)1D4t1KC7*6_f5R#>R#>Ux#B z-uVFYG&Z0#48ZcawKLbcNoj?vhlg-@+1kfFVDh`jU@o#liR_@emXDk8A+UOwbv>$X zTkt?=UA+bFnIT>#C7NGo*$CSa<7xR)^)@^uz7C2sh^nEUYJ8`W$*S9xV7n=_hFh~T zR|m`2ifSSqL0Z`j*Yst7#`>I{Bxtu*0 z`U2+m%W|P#LaU;?U6o(N%ma6`75{q_*KFg9`=s(O%*8c-2x^F}c>S|vrP>LVeQu{H zi}r2f;w4CQKvAx*&{`z9+Ly5JYi!9i(7IZDd5OLwXFa>inC)-iWr~O*H|iynv;2?w zP&%%+6`Aj8hzSpk%MMBFfRi1YGZaon5fEF@xzl(5H1vkU4uZXdIHM-${h8nZBwn$c z0P>dCP&41+DT)y!m4BD!N&BFm@vibcq^jCiRg9UTmuB#QlU`LcrC@Z^3A?pX%p5*_Mbm(7^c;TV{SaDhLcU_)Z`Qw^%2VAo-g-ML2X`VD3xke{qQ;oWCo+eOP2WSrC$MQBg@CM zgfGa)>HG8veB?^frkYtT&u(PLREp>M3(8E>q?2$FlMeR3_4e$o*@dC|olEuGfCU2e z*+3iUZm9j=?SH2~*Rf0K*p=%zq;vo!9D=8l=%eu3d}Uj{vL(NE@cvPt5x`%opbFnw z;QYOHmS|k*PyaYOtk9T>id45);-#1ARhkRmXv9Hk{2}cT&56LjLY$3dF&|P(D4RZ- zVjZS5|1el7vb4+jTQoXE`oO%qzU9vNt#7~~AAKc|gj@@AmT$`zt!8=$_I^YgFp!Sl z(n}YE zq-iMFWlkdhsyS#%%d_8vVcGSt`9a+qbV3ATAoHBZDZ%b3Gf1->0^g}4f@WC1Z^kCS%fp8Y#C3lcg4qN`9PggO>Z zXM=qo29LjgDm#2=Y4G`<4J(7k3r@io0;xm*w>=x^T|AK;*!QTq;np@7=q~O6Rg?{G z`8Zs2<8>@JIz7>`jVb0g;hS<5(A~Rfuemh`szqy%&;o;~rxCdgmkHQgh?QP(?&bsn-{I&TiNTMfI>5yLk%sh-Xf!ohIDs}e8K!Sf@exrgt+Y6Uki`;m zwSteIx$^k=3)?|9GyN`&@(#3ywO+ULW;|E7Q=$I{9-Ku0+B8s?4YcN~Yv+$VZjR)d zw<^tB@BhY9GyJhP<}D`ze7f`av|3Z-ItHhK)01-XC!LI;J&%U5-kra>x*L5f*h6l* z(r`S%2AS$9x%e?zlmBJwdA8$)7f2FAU-dGiG3?v!nM+6pza%9&mFHB0FX9WCaI3|> z=Pt9gPDDu2GXGy#lN6|CsQx1?g}9o$p-o#Rkdc@ z_AgZ($OaDpmyjN#xY>d^&GSrqs+MS zzs&|kTQ&dxj>eVrIAGN(JHWZ%`;l3=r0d<9?I9kh%0KUch#!Zjm{s7ywxw`?*4xj8 zVNvP~(_(a#Xff0ZfPLaaL7Q-7qSj6Ea}ZcQ(@A_xOg{)wiv#r7wijmjm$fvuTMaPh zx$xC}X)*)#1 z%0H!m2BTC7PXlgm=zoizP~fM5j9~kbN(IjPnzOE&$F02|t~;DC!I?if_tSIQCx1nE8@%%# ze&_JJC*L_qXBJ4mzZV}BE~{l~o}rET&c07WxY1sz;HC#2A+q^?Txr{>=d*q+B0xEU zLTg8X%3;uXA3qDz-H^xKzF3FPdAr-Qn-AkddN|wAK%Xkg4-S8VlMZ@2+J`BSfW~SX z>01>AyfSH`+BV{2M6KHkcxTc?6*WysaG}vy2vJOLdO1vYy=CqJ+ana83wXk}4lS9! 
z6ggOkASurQj$fdoz>r z*rlO~3BJaIFg=>kd{dyG<}78>u-NbEZuPTxmBN5f{V~B0=J&0RL>kUCuW&jIGMzl z>ZYgU=HCvYlUkVhq)G3}ME9$Mb|v{ZaTq_O9sQclB(Nx2sEqAkvNGu*MqG@Kf+vw% zDQ}q9Z^%7m(tK$1XigxC!F^_0j^Lr(PC*9+YboeNpw^aDGsHMW{xv)jmh%sjH9e7w zb4Rp-dns2R1?wp2r+~!LYNJh|@dP0fd61rm*mHaVdm46o5(C$2150B=GQ=q;VAo4h zYE&lawY-7K!M`pnx%p5qxKCyt_|mhH8oMwGH}E8~R^5Db8L}nut?J?Aq`VC`9Lo?9 zDDR|T7X=K}yg;!e1rI3r9tB4zU?P_^#copY4h4S!2*A%{v$G3Mx?M{FIo09EvyV~? zz@97Z)hgpF5*qJ_Z}~F(Lu4@rD?*!RLNK zJScu79QsH&`;l<+BjK5kg!N|m&&vhztf&ZQ%%sndwC5vX@R!2g|3i58BVqWL!bbDy zN|j%HPQ0^wMWEl6rYf;FTi28A-LBN_SP^gse-iK<79Clk@y_N2 zXSQp9HnJZdjsS2M9fhhgvF1KLH;tc_ntsurZR}qW@VjzOtjAr;rtIcJO3U*r0`68e zzbGQDbxU^ZVWsuRia@t3Nk_BTbf@a=#=DIx0-`H>dvU+#7YSLVc4v1R@O`Bz;L5M6mrjEF5a<=Zp2W>y45R}P6=k3F;EE>bAIa?UYLNzc7~L5kk)MXu(McH1%b`Xf~QGyl#>GwcM_lTK&S z-~W4f4~HR3Z9h>TzWe-t|LgmI@6SCRCkGcd|7Ps>j&R&BD55!gDX{jck>l=hJQwD8 z18%;`4{vArE%9d(BjFJiZjJ9q>b$erN$u3H@6E$n*Fr0zrb2utljYOmn9`VX~jD*x&wCt1lplne;l^weW4 zwI8Vi!iaFJ0~PhW<1B9j@-_-vYw}JQIidgRopL`~NH`%J6OO;XNo$3*u^DY_sXsnp zOc;O9qSk(|)JXW$f;l+2`0UZyL?W8xhYt_G7?WbN$-v;z*(-u5>`_giX~?G zDE4ud7vh0fVlFNugk&n3qVhAdV&K}DGiO7eP|HD+Vn3A<#OS5CpxDkv=jLL`%ZhzU zczs?-P76V!;y9d4&Z4fsE4DK+DjGi*G$_Vo5}EEJ(bV*n<3cnwFA7TIc~MA-(O6R8 zkIp7%VwX{_xoUWNa{FZ=6`7sGNTcycDi#;`$aFM$Eh?dkzpBc4adujeq*)O;&KKkH zgx*Huk*Fk0&L8++JcieKRuC`KVB+(MyhTOFPyn}>g;oqhtEVV96NDp>f(i$(Nz;V zPQ}jBx=%(^(aG3!N@*>poxUCwFH2bIgoGt;K9z`G7WA31WW~V}r(%nO(p;`cjj7eF zN*hhMoTb-Snkb_b#r~a2&dgSJemo|`OFN?YP%JT*x**KOqtgs56t|l6!W@rXR+=kg zlR_#Qi?dD97yDFlZayU)nUBSJfi|W2R1#2;5cn9XUlfwqDv3Zt4Ngs85fV|wRf=7S z&Iz9wRKRN~PvuzRlECvAx}-Es&CkuviYcj_t2ni^^XJi_Zz^^<9-C7E9xE=ygJ#8d zcs@1zk`TLmB_*AijbdnurxYcqMzk}QjHM#i06`H^xEzx(Xh0o1BZ@l_UAQDf=0ss8 zc4NA9a9_W zfj|naU8350`fy-+JaA5LdiqK^Nx1(9+?M6WQ6?nhFeQLB@?8O$& z)pRE$JNt9y{zpw+1-s`h_iguI*_-c7F841_t!&8IHy2F?$Ka{pvmsmAfP`sp)f-hRs*c4oL6hOldj!@dQo4oVrh0Nq^`rm=d9cUU5p6h{Hz zt)>E?qp3h5Iwu9xo38aJ@seqs%uYfcMoEma6kbObI`C%Xykgm+n%*2HN zKYKkXML~{%=s<_#@!9JFAGov-n7ST2apr|UbUGEg#-Jp`noz9MjEpMQ5@ITGIBXFJ 
zF;RT&n>ga%Hr(S<`r6z!r@3!ntu@}kz@-gWOK3+ggg}~$fMZo1N)TAqxT_lat4Ock z7H?QY*+ulaWI_qXEmu@R5GBwX`qcYLd9@O0Q<}T#)?($~%Ldd!o4k=XrH$X=c=LCS z?^`t92yN-(OEqh?v{@rpA4RH7%dOuJM%=EIDF4=*y*?6a%}6|2y#+34JNF4K?8W1Y zju18*>_%h+WZ!%#7L^nin520z8KH=H48_WYb_5#~i!@Kv3TTGWtBB&$!A6M807Pa% z*;G)Bl)%GvCC2kYGD7H1>59f z!nJ<1Qagec#ViT&8F3P2*c=o`jM#z*A4@2XNMt%5m4N9aY#m2b;ih<)N}h(d7<|$} z3~5uph~m_8@12rcE@m7>H#abt?>{Q{AN}d;a{ozI@#FR#1$S$~F1$SfKJs`UW z3ht)7yH|Gi=G|Lm_twW$|AvR5#A*G!VODf{)f0@sJ)DfIB{Gkw?$U<$N~uc*l&Hi2 zr7{&Ij3_}%UgOe6y$vcsM5)GAl`u|m?;6g5r}C&2vV^|G=TS+#Kpt&O&?LTqAEklO z)QE-^6^{mtBr`}74Hy8x_Fu6<_y8l!FIEdJb&zqWRPuFHmyW=@$rU*-}AifSxM&FcV#>jv%hU$>B+TkS7Vfy$hD6m=IzefyH%R>3XR#S9>J>J z(DCV?VZDJ(^}PUYsN6NwC&wFz@#Bqz^|aN!VOTOU>{o9O>bGK|R)GDCyjf!mm&|E% zy*<+K+N;3iOO_?;l5NS(Td!B zS--cm>oMu}_e!{)%?V?wb-W4M4=}~Nop)$uuj-hzTw0quYv*xeb5*Y)QT@eV9-ftsta6Ez69zjB@vY1`Q%W`5j|UjX4UT{WAmUIZl63;F z92p8s4Fx7&3tT5LJuns6A2_-7C6z~@W-c&lP}BCL#wmb5D7AP!2F5p*oE8PhdVHY7 zMwbWcq{hgJo3L(|)wO$cP$by>@~0eM0LQv?(Ug22dXwwPF(m zHDD&1BA!7B#RjsJn1_Vu1ldW&F3-=-OF=h70-}1Vt}rHEv0Nu6T`}BH3`ubmRmELY z-!QGXrRgZx8 zjG@r#zxU$Z7r*tD+s=$3Gm?qk@fF%SmoMgfcFOo`+nI5H;Ay<$yYs5->(6-xGRB%{ z!R>kL)a_GuqRUNMtp^uYTk>Pi$z#uDpTCfEPoWpRkZ<|$^6M-8E6?ZLTk`H5vKzw~ ztt`;Ad|{<2=iZ1kZ)3*J@EOEm4x6wSa~8+Q8z*lcd34+qlavw}*%e=-wEg5AAdi+; z@sud_v=EO+BJUc+myjwE<~yWr!ci(p-Z{!3q4RHVL2xix?4LTjEY57-jv|NO)uW%$ z&o9RUtu)8M1JXQ|6e=P~P18D6K*Ut~zjg=#M!)e}>*`VL6dgO} zG<-b8i7z7oJfdkjbwlxxcI?VbiSJ@NiOnH$%Qi=)rs4wPHQZG7kV93?e3PmZmkUzr zZGP*2mf0ivk{fX1Ddi=zBeQNcV0PK|UoOlo|1dVL; zidUr_`T)gWqY+k2Qfa0&sVTL%{YjIqsql}f)n0fs?~R#M-n&tzztGAxczN#u*?S;s zKftDau4>*i6aw!xlAv&<0i9P2+ZHsQQ!K0#HXV@;Yz0IrZ8U1gFQmUC-C80Nk0oL$ zX>sS1Mqa1VJ!+h214iH2w%n5W2U&aX7q`fa5^Sjd1cAqK(K`QN_%d#8$@rLDrmAC{ z9rae@mdq*O9o40cX*2OzY18|r_h=E}ux#dYVhkaX4h>+S95#!;PU$o76jvg8LsK+~ z#1;lE>IwKd718U6Zy>4*Rwc%|zYH!!up`7QKshC)biRCX|C6>r1)j%L|845@EIe#R zXK%i9hupa%*EyQ;7JPxcZQ>jeYsXjdJ70m6znk;f$r=-<0={$o`Rx z>jQ85iuHb%92i|a_RxDIYd^wvQ*o(1t6YOyjg(jkhkitK@T9#au9EHf|3(0NZo1hB 
zj8uu6kxowEcT?I_?`Sf9vfiFrzys@g3bifrm#j$F=v6tHTe2Zt*EWw7Wnooa6Y*Y5Wkm|ylgCK`rXpif zbMXQx5}?RnEkii~RcD1(wWJ*_H^f-bGROvP4DE>o$N=+LQU`N_Wfl6p)8g!0BsHsN z(aG1?%iC1!P4a$|Ja!QU#0eCJNNb7#g|Y~tEB zYX_p@Y!q5I=39euYcS(1bO!UCV{+$M#`|+`Z=t#6Ei{+@{!Ppu{zr}{2r3>01e@D?qAv@RBS#(gy$+c{~KPfjq zQ*=?(%{e{m9)#3!(&%V9x^t95Yp)_u&5sIZXr~!9AYC=MQ4@GatAm65YQZHCIM#|1 zRu;3B%A1KhMJ+oxPTMU@*n#_8htgLCXY4Yw>V8defQU}d&L>ln%7B)L3L7GA2SbBt zHc$|+Iw%md+o+Ku5IC+L8muSoSE0^&s1Rz3w2{yg(2(hzcnf1-!X^|)CLAZl9K$uF z%I}mgd}{WqLQ-s5w2?JJ;TPA8@qgv= zS+ph*{|3_4+I#P{yRWV6xqmp<8p=2ywY1+mb@x=hB_y|maxKqf90h-0-oH)uZ_BuF zRF=cCcT3j3g#pSr#U=@_N8&G@*3AU)3gbi!w#z(q96deWPIX=KvVyJy&;yg zN%Vo9)!l4OqDX>eTe9+`Go(?~xU{7n6+>zr8isAQf}1JLld*x|XDYUvu{A2T{|@?5 z9CT|a#&|&rTE%6gh;NfeI70D7%9~l*cVdcIZIB5Pam4SEN8lm;Q}S4wPL(ZS8txMJ ztoI_(1R1<&5aTO%f}O&(c64#%2?wnLXi8NIXjLNpBhb2=^Y>LjPNB8q-phAi&bJ-R!$hwisM>>T;&Mc{fX=V`ffG2gmPZrxV!2lD>SvVU`-tv}y3 zD7Ot3x;6pT7ag2`2nfvIP&C;Zokh;(aH@FeWpo{M*5YW|h6)t?4n^r2Mf8*R1M+H6 z@V6!B(qf@V@PV>A}sswcF>|S>vq|Sr7uEMVx zjII3+a;k3u1ry;_;7tw75O$n-PFX;rVyaU+U z0Udy|rko2}0gK=kJVeY&8i59OcjyIj^L*Lep~@uD4IP7HID#6d(S>?yT3pguWBP@~RsDLe+yrk*HEJ=PD%s1I zcd1q_H?3PeXtSB*-f~PEFLTqz*KiB5#S&cv3wucQsH7RQ#l!+9=Z4*9qjLEmN4I0sogA*SVne z9OGOSYYATx=6%;B*6_qkz2Fm{MW)gOTPi5@pg_{O_zFBs`V*y5#lygiV7NtztYz3Q z$?SwL6$hbEn0qB8k#t6iMT{md3yKNkk1=`*qsnEhTtNg#$*5RWwYA2<{}5^7ee#G= z7JrwD_)4RLreJnHMMg#9`;_jk9C&G5e2Gg?3^BGEs;06=b9__fIsZ8tsnIx8-7u*~ zHuS|sDx#tUYe~9q#BE>Cz}qL}?%lZHE4Z8Ks?hBRpWWlk!rY;W^K@qWNl9Di8u;G6 zxA(2w$aOuF^*wXPpKaX#sB72q==a9n9?N&_lIicrssmo;46vrJHQ%^NZrqe>-13pH z_ioRM?Y=+T_e{<=@{zCeZpZSTha30ge0z&F^swH}b@sg@t?bYBjXdnwvF5RLHGJyk zI(ObVTy&wOdycz~d}B~<3>E^z`M`u6m{@&27dV&?9Fqgb3jLvc|9-iD|7tYXe<du`-59etI+39LnZ{(iY^`rE=>DAM@?I*MDp5=t>9(v>r zWbJ{!`(-OiFiQI;O^4eiEoQ}mZDN=5*fZ1EhdAdJMCg((BZ- zSSX`59eJ3_+Lx5ZxpVdS7$7HICsR?9V5r9NChbT*ZMpKX2feVU2bM9zj8DQ<=`-j; zJ;o&k2QwOx!16E#i5(e>01C38n0eR^ian^HcxNEaM`l1;m~<;CHfCBS@IkZ6D*Yif z`!BV&C1MVU5n|^+X=P^|yD>xde^N=CytuoH6RLYM=fDXV8Ik~pLy~N$b 
z%GuxF^^<)+-naVtzc`TdA0ZV_^9LQFe8&N~1c01wQ`!bg4!90+P!JJZL0@xZ`#Y?g%Wmc5anOh@I?cdH=QV@F*xNy+?x)h z*8#OREl6*uC|C-&jAKd% z=rZYRZRglMpJJy~jl}pHx*CnD5$`}|Gi!FNaN<&Z`_?S9thSWMMcH92GQJdqE>9*sxi)7H-&P6n}WWH_v4hMn7L}}mQ zZri@Yeb=;PDH|Hqv(8LgZacJjRmifxhX#WV@yGaKBvAZ)3NcZJX*#Y_G)7(kp5lT* z3{)meL=85Ge?}SalSf=`>FBG9DkcWQ;!uPQuHv{f8-v;lg0bQdw4nGCs`K0Ml$KcH zGPLMPA#w>fzh5mCZKhPUod~XFnFY1@uc;E9P>BGF@Bj>NN>0THOwC2zmP~&a&psV4 zP?_7vJhKp%LKE>>Fc{JSCNotu2P1z@qO}42EVTCITerxqTkhK)j!foSk7bS*Hg3&t z9FsSWtq$fk9?d-asI&Kb!MCC8BGp)-dvm^fNbVjg^lZuZgyf!3p>s1xRoCbmXYOjq zoCebA-jWSY$XyfJgHv+XROSqX^*gVubU$#({rj>YeJ`(`&NiRSOn%(FIp4fPZr<@= zbhRtjd@SoerivoOI+4XYv+;`o*IHErOR~w*7e&3Pai}{@%o5uORpa#*MM;hl|D3#k zOCDXY1YIgVW?R7U@t;!Ge;}`1Qr}gziGPn=$p!CMe-%4*h&gqini=M0g}!b1zC&`~Az0EBS_1i&t#Zp&=u67MqS@^@Zur6U=N$d6t9~!%=`C~w z3T*?0?rlK#9*DVC!ui$`&i|qoQlv#AgfG&#MM4)Mv2=_iihn{OV%J6D^F=~IBJtN^ zjy#FH6uiny?0-gyo|y z-X}M2DLN?0$wb~2|MF=Vn-yJ@1YM&4GY^H%8=MfBoBN6u{G!4B!TYBBFWz_Ix}#{L zBs=Hy-D%GHw`G0XWXI5&gA&{OQK@ND(SqOFN|a>h+J}n{3PGxFx-%p@`aX9ds=_l( z3inBblN7hA@lrXmX{6{*BQI;eWCXKPqZu~AG#y6lu)|^I>F=1|w`rs-Y$29;FkC*L5rAV1p#&(&_dUp=78-z0(YUcoEaC`UgQNnX|hu*awxJrg7 zH4c^m9jMb)YPlP2bRyp4;GoeR;JfON32%++&XVsDdeyPv-BJA#Y~-ch>Xs0C+1%WG zA2h^$c$HMQ6q{q6b=Fs=AF~KB{Y5_r+6HA)73w%XE8;Z^A0z!`G=4%z;!O{QFD`0! 
zr70LDi=jn-g5I={kvksg02`?m(V@lGIAp+giESs zwMfZqwbXpKn)tX*)ZCgj7wvkhc&#%l#)u3?lEU@U+YONvcCeK=h9AYn9($-pvMfdC zeu$qTUXFOBwK|#?>(d!f*W+*4HE|t25m$|Mjj-!+K1)#XRvJo!qv0Yt4~(lqE<%Wp zIcP!Z=vkP)h-L|A;L2Pg)W%I}YNekxSLj`gyg}oJ%A6K@Bud2`+8~ucjjO9e;UXTb zSc)VX=F+CB-gKc1y_H6c-t;-8`t^E-`aM^*X4c!NKgt2Ewel}`=F-}tUww68#%(>P zxGOZK{YK+y|}L2CCbdBf!WC-SD?Db8!)-Y%)v zrAH~11afhRypPEH5T2%rrzx3>Te5pvw5T zuF3a%R0mEY)%7G--N;HYCvj^=x8x$;E#V?xQY<{ZcB(bs5&tWithxDq?Dnx1h|7|Q z8N11S?8S?rd(Yi{F4wXt=iQw54$0o3tbK?vDvIs8`ZDiJ?l3Q)nd+Hqv%{^Wv7Q+n z^x|B-nZK{&FAzT?Jyx~i{SM{}J zjkX7Dx~gLQABZgOcq*zIs_g7KE!ZF`1J->5E2BBzARyb@n)eRK-hq`p4@UDlPsuw^ zWxWHrvPqg|A;WmPDx-B$su5mZ!~1els@2te+_XIl3oZ?2ejSp#YJ5VJN;N+LZK`WN z1BDsdB?pRB7ZTGJy@WMwt*a6EQe$?lHgx;8l5W2Rz1h;XtHk)%IAT&Y{0{xP-%L7k zo31Ulk+wYQ!;sIb)+?n~i_AtC@3pRov-6j)Xz#cf7ZSi*>hfc+r?B^CdoUh&5wC3M zxh!Amf+Yull<{6RNzc2~mtX4RLh8f`*s1TBQUF!yj{gZ};b-t_l}yV?EeQR(-n;$+ zO&SW!!b%6Qu8vFZb+P?L^|`n@>zD+i_t)oRuzWA8CRK&y39LEp=o!5r$^esaJXlLX zRGy8ad%WRLI}h4bk@tV1jQBP3bR8!%2N(YpRW=9`@j}|nRq?h(yO$_gII_iC9Dlmw zRIl#W=vaLd{XocV=Nen`-hd3XH6fsr+0e-xeDBaBSj8NW+jfw!+rKg_`=80UAacQ) ztz)a)vGx8$u48Ye0V%NEVdC+FP_FH8#*LJ%yYs<)a&X`3>HNewdE#7d;{3zl^EvMY zIr#j_;X4M>!{fE&^WXMngU`#}3t9VxzxxaC1<8sb{c(TC;r(W|Le(~wv@Y!I4yb4M zF$!^>(;%LiCzj45>go>STe%E9zN=TpR8a<&8($}!V6MOk#+!x(uB?2=4I|w-*2S;& zxZZl2(f_~NyyYF^`?^V-s>>`x7y(y}5ip)t;>9K2*#=6+r0NlfsG9Z=7*;@oK zXCoBW+fy3BjLpWcLCY3{WQ2F)nvGPh~ev<=}faJaV;VU44b-wvW9-1z%sGV-q->N3eK!aQIqtk#njud?y!@J$F5m2u;u$XhBr+_RC6T3FrJ5$J5`k5_(grQ zc&xtkv{Z5ZlcvW+`lmWT(~R|?QtAWrch}FWci&yUax!oWrGKz@QkogJUHVjn#*P!# z3|bW@aBWgbMezj<5#+4Ic-wHT0=z9Mez5R(HZC#Ib(XRBu$TdgB`zh&Yw)Eno65I0 z(w7S=S^OGc6Muo87RR0%pzub&s=41KAbJID;PTVSx&3$IKRET!1vPAmi_M(=pl|ca z*RrFhem0uxJD+u{dcZw@(Xyw|7|1pbJ!j@zl!rET*aV(*vQ9mFd8pwV8Tf)->NjxeO0yiwYc}`*5Qjae?rSZDl_P? 
zz0E3PmbDLjapVNQ41Zr_BY)hZi1OP2;$v^Zeql;|Kv9x)QCW9sW7Q*xo5%p#`aArH zpHWzSa;Z{VqPta0snWHmY1)s_lL9u<7gjC;2*!k=l$a{{Y|dE2#;Bn zW?isMZG~woy{E`~=`95;yS)cjt?;t;gKW99N5L^(QXF%$64N+IxYDS6m#o?eNQ#GX z0a6KFPwV;(41>rhTj?_h07gt-2gPGNE8?~8F_Hqapn+syR?EYX+p0J_idORQ zRVGfP%P_1uiNT76iR79zT&JMA7q6mhg*03e5?({<;_%bK;f(q)_K(rGbO7~$$QwC# zAnV#z=niDgJ|cVKZF2jzeEW#pK7xx6`s?3S=!9yTgJl3`!6Ri4)X|MS`cqdLRh6_r z=Zu}Ty$C$+tkqvXjZfE3VrhLEreXDjH-VA(>}9<3z_KyHiLD30w8dGCnq9eFUE^Wr|@5Cb+ydgp=zHRf_q*3cJ#B9|7_OK=TO zZ$pFP-;hTqUA3n>jgVr8niJe}YJmyEBCQs_)G?1QD3#yN;ZYqvb3`H_;|>%v@x|?+N6emLJdXU!cyZ{PH2!;ZqxWMR`pp?&02ho#%OZnAY)v*uP%W5r?AByUNa#?R@uWvs}aW6D<|*2@L=bv<&WHd;Lh%xl!uSW8;=(`%KF*P zpGp5_?-~mi?d_H+L!oPL(L{kq4o^mSD}FmJJG#~^lu{gRwm1vzy+sp#A2~MPZ;>6t zYZgR`TU}_Z;K!#$SYY?bH5Mp#8Y~-Dj{WfD_fP)tx$i$mR^A)P%KH$r5U1bl=t(m7 zK4gHY{lp##NCOa1!1s2EgdXoqnMrR+wovh?%i8-P}Iq__uNb0 zO|S5B+two0#WFT`6g?Db;GEvKx^H)9ao8N&*1VLq*=#v%c+}QSm5LUMpaZqovRQU? ze{Q2R27_#eg3q$|mx>oJ5a>pjtb{|+OVbRP)Qc@bZV;#z=wn;#YcT9XF)D;HNO9vc zT1g2=0pKgCS6~EC%0Okhs&|({(^a2)QQv)@cyhszYP#s9TPw}cX^pCKKt+asZT+5RT zsl-k-+NFz3rM0GBNHxnNMZ5aMg7_n)^9!Z)-4h8(irwV(kVn_j>dj~XA*G{E&6yaM z=d##O`5u-J19$e_H-?jxc9u4e&$CG(l}tw@_Bk(c12w*p>Ny!P$BS-3Qc>C#${M6B ztHi$3#Wc~1g*C(UZz99KLsX8brngfFA2L*58nf3T%wCz@*(y!i$GSAZe^zA2tj5e) zu}`R6^#O4!n!+C^dEn#2L+5Iw<7*<;rk2g9D?h=X4r4&$_|XK zbNF39Wb83G3+B#j*KXOoXWfL@`XR$t43yZF?cO7s_pX~LTD)fDjDuN2OWv?WHf$-F vHx?Wl3%+Ndyfzxwou`e4!-o4uest>HQ`tSIem1_wv6M9zaz9mb8It-xtcvhV literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/commandr.cpython-312.pyc b/model_executor/models/__pycache__/commandr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4cb03614635d922a486770922129755d2ed5ec28 GIT binary patch literal 20204 zcmb_^32+?OnO^tYCuVTpH$Y$r5Ig{$;we&;!~>SZL-t7YWQcB%L(Tzq4|u?UMenkr zfhJ>xEF&5E^^a;j1ViB#m;R3-WT z*V8=%xIxL@bOCQ(zvF-R|Nj4b|Lgza_j@=zaocxeZy(^eU(<{JxC)*}Z&*0)Iwx>p zPT&PgoKIN77M}dpxHVx5+nC=Lw}g*k*Ew;vUhL1Dp4J-W`1A1CQ%!%P1J?!67}JF 
z7UqvPBpSnw%wHC7N;HR?nLiM3NwkJrnLikBOSFgEnZG>Vk?0I}Cc461iSBSW3#*8) zN%Vw!n7=aKn+SzN%wH8>o9GMo!Cx)Z#QPKL!s`+P;eo{Z@OlKCpYPe~(Z+xDle z#H+67)1nkT9T(wu#N>EvL>z|Kl1zpys$a{pX@4p?8at!Y2R`;x-Z@p#CpHXIeDGor9R6(3I|Unpca z5=)9vNiBcz@X3`yFQqOm4+6!&k<^)3T7E%FjfgUeM^MF!<54tiB1+9OTI`}KN~tlb zM$O68Nc6N$>zTx9Q4nIuGpJIvAdW`I)a%Fjs-7wrmP4r z%~gm_+w^qj3QI(Pr(d0&Oc7OgX^iG4c7EIXKG-`wQ?^NN(l&t&;ha|sDRl*#PVFmD zvg?%6)H?Q@^Xb0g@1%XwGJ#i}@&r<&uKXX~(n+pB!?d?1t$InvxR8DLH~$yUaT=HN zqBthR62GCI2su@|kiIe|s+M$0wX?OaI!7_%(}SB-TSAN`RqI&ll4_Ua3sM@h%YQZ| z2x2lKr!h%o)uGL6HFz;9#iGd(%$qSe7EdKrZ!9@Bo{li;(t4Doy7V+M4PP63{^8L2 zbYg7%SZr)vItoURUUzwX>-KHuw~FiIvD53vuB6YVlIt&m>#Wy!ig;NZ!TgrAS9v`c zRU(=cq=B(3YCxObNi3(x=y-BO+JtG+q~#qfAv5<*gS7jf?c^lNC|`%+7|=;xxhvx zu<=gET_GFTxER=f^|^d?$NX`nI&}NQ2d~_EsC*;YDw6*4Fz7mE=aq(I0&gJnBWr>sT-%7v>?GSOjZ^ z8|4N5nmcS2Y=Y$)AGT$<%Y4{QgRQ!c&^jMRKbWt+{|ItF;IDJV{dF2^?t7S#`ewy* zC9~ONwWW;}c8&>EdX2T;NnW3O`tKwcF{n&h@aEKWq|0?5)~(695%_e4o+8HU>zC)Q za3RaE>R-mALmt(ReGELz6#?str6UoE7NzPYKF_$Dw2t0wsiY`NgYZg22Pb4x zZL%02RlQ@BBO;%TVsTh8sa1C*G7^u+#w6yPMzJ@y?Eg^NXE37Hgi$*xvN!+d4D0`>y5(uDZFZ zn~gUb=U=tl;=*R+_LzM9#)L@`SAg9YyQ#|rn zMlHvmaR*5YXaKQBQFfAj*)`UhE_pL zD7-ne?RBJdAW3zhad>nHty;0FvCA!A)igOG#^k{6W2{P0C_G3(^w@(Gx@x2c;TI{4 z8l;W=6BEponi+*NU|zI)2^Mez>z|^RerUOEd!H7oM&F>64rpOisu?BoQVqKH6?Bc( zq$_p0I?qF^3|9Uw?R{)NvS$Wej81O?Al5s z+QlkpvzmGjt8qDBGUk-Qm%fQ)@}I*2UwZ80++{jv%GWe8KGfKrYh0%^t|M+#mv8UM zwQpD2x6@n8gQhKcUohuuQhZH$e|gT|srWmyzOMNTimx|cUX!oy29=qw&z;Iv_C5AG znmv@Q{&5-C(8n@%D87y&g^~>6Y-KMpG$6y9Wk_G!{{48y_C2r0`HU%xYSZLo6aF$h zOo%ji9|8(baSM7Cd8aI^36+x;eTxDId>?&+GEmgH3Z&O0a*) zju-In+R(y~QnP-^Nv|%hyy|mG^}mK}_HvDp615Th8;>qXJeo()N=6D+>RXMK7MQAC zbdG|FLK<>_sk#Iwn5tWFiJo&cMj3`jaIX^XF%KVg2;OV9u#YimKbW*%4QK;r=+491 zGWU&;nq1)1#f3RiI-!VDvWa^i!?1!=J6Ce*qWq=vrztz*-b8^tU(TD6!r*9nF2=o$ zS|Npr%mq%U?{CC)iT|v4ChZegR!C>?Q=~I3%Pe;xuRipo1y!vm0I#4r~Sutg{DwU_QHyTIngN zwPte;p(Zm(a8Ft%ZIC}akQ2P`7iK-`S*z1aZ`6IO(lx1rHnL`QHcIrd5(_I4B^q-s zy}6`B6WXYUd1#lOulS3!rek#uYhvXixNrM)@DcTC)Z^2nd$4DcPw&wEC3C+;4-;sn 
zwd!vud#CO%{>lUCefry^fwN3GFsq){!;8O@4(b-6jMlJJjx;1)t$0eyOh2PXqI6@< z>tSqenD<>5TeHW%TG}Um=aWVpt+cO=^O3MvHm)Q#GhTH;L%w zNL);wNuO0~$v>8oV+79vYAk@Sa*S&uX_z2dhOYvcRK4S}7`bphLa@%k3HQJSV6P;I zOQcjzI0sUi;B8kuQVJN-l?cEsLL-X>Kwyu`qI4GNRX7 zi8eqxgeEJL8-+y#G&MDzmR?4rT1f>K<5;NjY&6qD?*S1P+I4ND>Vpbt^_AeHLM?3#Aw} zgw~eoGRA_bEDmibB;xVLXkc(vPYe(<5JTXDObk^m7dNV9D>|>8a(Oifk041|qE)2Y zXf&Aqt(Iw#4BsO}1}z}J>d+=f$W@{m8N&jw zNO^|xbQS64GRR=0Je8FwPY)`#ELLpGICUjznc9?Ju2Qvj=UTTbtxTnA%GWpN>engt z>+aMl_1j4M$=5aI>Ux#B-h~6X{=EwR>h>bspYQ0+b?j6+c9Krjn)e2B-VVjvF@NTs zcO7K!s=8d|TBUOB!plnK&|Uss<@T&^d$w{rE0&hzs_xy~I* z=Z?E)l+I@sn+`IKEnm}cJvp14ProD~Q_ziQ@Wwt354+oFHhlAUw=@4LEC zXY@-$pVH8WjDZJDo%075hVLF&Y}&`_@Sw3{e#^qHyWNY8Xg_q(2X&#M?g)M8==4$c z-lq84n5R?mb(U(O-^f<>JPx=3E}{D#S8%nxM*8`o_jbLrOYvSDzv zygMJZbWk%INqM{U#-#^!eG3N{>jpE==L6NbK)VuXp9jd?pWU$kUSNNIO@D69Q_7mB zW^DI^ou5=SQw4^nhxK~k?a1_zV!2Sf@z)*7b#GI;w`ofFiEL%(<8pU3)yBP4#X)oT zHs13#K4|F7H@DyX=8bPI9RJ|dty9^ir#`I7?K`RLJNejZX)Mbec=PaM3{DqP<(dbS z=7Br5AN&8(pKaO=FgsTjQmR6a?N&?=a70@j0UKGdKV`d=;5}ZylUInDq_K z?7RN#?6Y(G=U$pQg4g*Da)kF1?kQWPpQtdEkU?md`q%?sQ zWFNu}Ww%H{szEqO5Gx+cRia(sDr^;Lv<*eGn1NDGI`yJYo$dXhp9qnV(+~`|^ z-V191r=wN~kHBzWLl!KCv3M*kO~5aGiyS8S{0aH~n4D|me21KWO3qa{K;1}*k-ksP zCOE1UA`)9O(hP;oQW$?x^^#N*f#ioXQq`gT${s{ANrXAt@dRlH6SXTBQ~@%4h)DTg z!MVyUdAN$2>&>&xb5o1u>!uwUYv#p#MfLU2Y-sNI%~x)`ve5NG|E>PJwx9Zb;>$KY z^UW@8X2HqJ*EYkD)@d{y1`!?TCyy*Txpc4r(jz%8pY zezsZMvJMYR{{(dpm936cQjajWQi_s~q3Ub^2pCWunvnzpRLud*5z0jl4XAt-js{@% zZ^&=jLKvp^1aB`Zf|&G1C}zj}0i~vQ$;sYb2CC_1?;e6RS;o(j7N@;xslsosd01I@ zedp}XB|H3}P}h8Ya&~gbNpCLB6L_m>y6Foy-ZT`Hszm?BLk*?p(J$cq|3E=Sw;7}c zT+}?=y-K)taODxb=JDpCqZU%yM_s>zomOb`_<%S9BPK~ZC1fg5QM+Q|`ZCm%wqPrD z&HsS20yPDmxAJ_;OjLj#ZipCZV@s#4uz>#X(Aq8U7laI?F8DaE+R6>|@6+z_*+dv{dCS5>EFdRZ6 zw3R_hB%AH3+sMb@duV+~PiG6#>u}V7W}CrOA=02ij#V`$O-rISGNx3jBQ^f+Xc}tL zr^q0u;jC1XmTN+v)I%Vz$ouMyb0c;PLt-1O>rM~aZn%t&ml}*n=a&3d4`*W8jjF#|VmwsR^{d3f4+2JV-G@D$+opF|= zofDbv$}p%Y43sXuc?9M}i6u_ScGF-ou!SARiLQ~) zpcL#lWn5Kz#+C0_lk3=`bZogZ_2JpYj;|@M){KAVGJ@Y5cxNDA)0%JSfx_3h@eyb3 
z^kiMF;D^vXp1bkf{HevJ!K|zPVO?7m3TWMq?5<&@ZaCwA7-*V1n+>eVy4Gmy@@;DD z@3~FT4k7)J`oo?cAB&4m*d=aKQgW8?FaHZTD|d-1yQjEG;H7krpx2Tuf?l;9f_(ku z_8wzvfWKsGpf?wXZKHp-f4&MB8wgM2>xsKutzAvUgQ&DN?}%iZRcT-u~YPm zHntDhgaGPZ#?J5pSXJe)X=S#wN6D0y$sn)`IfnjJy<};oAL3ESh|UUV=@S=Z%;L8Q z{Q>{F^>y3p_GMO75PZMNeq<)1zhQ$SOlZ`K=M5Y;o1G)PVV}nC7coxqrom{6d(pJb ztb35c*!_xjSLSIEYJ!yOED)4*k%ddw>QzP_GNM}XKoqrQ?4wXNuDK>Hz{oAwL9O}@ zN@@qF5jx5q_>GAPzj3~mZr4+o*9@T@dWz!jvfb5aPr64>Az&$-v6xdVX{&K+g?-sM zY)lHGLX)Br|BtBjA6QE*v8FB4JQ>zuH=un6598?)9q5L5#&Muq(=^lMpBKsZ5*$sx zzIs*LZB^IKcr?({9Hz{1c{e;-x&h2jnd*$)_kQELe6VKDHy^*d z<#Vg0sf=+E;O1=imsw?)D8cjm zNb}MlZ6Cw-VN?!NU|H%yxLVHeYt4#OtApwVHVo5q_NKaFG+gjH3Uw?>E;bTZq)wvK z`Xy${A*UwP+@{Kybh0Md(0YJmvv=>_QqiW$teSFfrd)YQDG%k! z2bJ={j3Zy$k*i&=)UM9}8!dX9dKswsP&2Z5*w9u)K0j&N1TiS*YgT;Ca~E^18Q z7zlK4!^?c$-!ykF>kBPx$@T40`p6D%0~4Y8*(6cjnw%KJ@duNfhzaTsF+%!(M1%U} zRyexF3@`l~q{ku2)A*788jegPow!Amb#643uT<^hsW(au|$WKnbNQS~tZo!xg3aN!xcI z$xYdl<*UeihFR>CUEro1$+Ai7Rc3H73Fb(96-+#=0@I$R93_g1DIS9|3>J*6g$JR! zJfH{RS7){dxKm)LQ%)d}PAESn=*pYrwh~)XqrC=8E)tc?idzU#60C0(^k3`L3$9AJJNyhb!X!5-D?-3^1$oVlje?`u}g`@g# zyayxQLNYqSmhQp{xr_5n6R7_KMg5eVP2{`_N3}C{qamE6iGeR@Pl;??S;`p-Lh&w| zhaVvWI}-BMeRp@x+jGy`lMggy1HJjSb+i75ZC$yxA*F5T&Nmj@_RRQkXfplW!^)m) z?6$_F zT`%)8KjoATL)mh&rRFNp?OlQ;Z7dsou}rxq9p?=BqgVyKErOLED};oS{c2eqP&JMPN#vA_Mva=THj2%%p|!fx8Ax*pWaJ6$8*W!HYl%O zxdxx5X<@R5YNgkJT5;7f!@p@+rG^w2EY|VM$4{@VUc#zlVayR8^Y6{4(Ha|=l6^R2 zhtmBTYD#C#%;J{ zo$){DT&wt6GY4kY88SpR(4TMV$hE9jTGr=U_9!iTKC~^iJd+JRGvk2+9lpM6{*Y3) ze#ZWwt!Kf$@Y?;oCl=dI&iEg8Z_0IVQM$J*c5l18?_T$=*`pus%D(uLvi}$ub9u|B z4mfk-{HgoFLGbQeuw4nZ&#%27gp6MI>!CwRaKnse?$ClOU)hWsMU|m^RTG`B152Ji zbjPz;i__qT0N;1b?wYT?Um02u?)E7I&mh$Uocj;msk°|gIj0>G{d&Dis8-MO|w zrEPG=|4ZDhK*@KWyC2;3%V70VJ-7KVztjS=0TXhc)NZ&_b>~})wfkpmK>KHQ{<5m+ z6Qc;c6pL8V1%+a$fxh~JNGtrp*SU0z=eqiqj`O(Vp>Hvo5p{4S{_myP86^ih0mp9a zFeFFY_SvRR$08E#L(&2{|CO8v$lSv=|!1Hk5{?E8ZyJvo4 ziGJ=5eojA2J9vBlLiGntx0;qXc<*-qwErjlkI1_eTR+&8;B14mug#+rA*L1_4o1M(}Suy7v6&T{N&10@C(p0T;&3SOnYK7F-vD 
z#v8+RG2Bsc0UUDUb_#%-0yyLaIOG(3qRUM87yRbo0Efx|uy}9*++}8CsqdHUW=!3x z)Pv^Kg>um+1Q>i&u}TUpc9q(RKFSODsPdX!uSwW1RINg56)IW{(9&vxl7t#DAO=N; zIgJt!$}g6S%z_a)Ymu{pfl!s$Tk6%q4S<6wX@7KFj>eB1#m4ZTFr?DY$RT-B`df1T zGaTT+v<+$34M>AhgPM-SR3frqlNpeusq==)YakW0?--)n*D*={u!9scp^NTAm;qOG zZVdp(0H|<^ZHQQ?P2rI>{fEFSn;uqjkYY7EG65w6%F*+1gtAa8@qm^`^Wgx;0H-EP z%(^Enf^qI$ycCRl`a&Z8r9r1r{5{uH^y@z4GPEPWLpHj9Z)ba-YG>F9#?Ba!P@o8{ zNOwR&`b%;M^wD6H%5+Sm`|?7GRis3%mHssx*vHXTckS-E8e}iH#jRg_XFN{orqut9 zoS(qa4+)GaXa{brL^75w9s!gBAcZ4-(+(^G_aT8*)U;#2xqakr&4;y%{m&z+ zhD@3WclVdSb!hrf&bLMJZNV1n_W#_affo#0cmiy(4J}yh2D&@UV$rT%0CtCE9TP?>4ZY6aO_etP8|&P!#0qn{Rq< zc=EpLnJd|jEqDFULyZfchA^-Rsh8=R#1NRS`48}v!Z28zg{6)N<1ssP6%3|8$7)h9 zauOR{@qBf9F#al(47O)wFvmZNo*Cm@Sq6>)*-cznX*thEWoDVnmN)GK(%-{TJ!Igl zfu;p(Ee31g5_p)^Bc*e8d5WLW$R<%L&<>n? zwaf`O&)qneYuT){Y`$}Ov1V_^`%7;PyTZ0ssfJYg!H!!ya(z!LeNTUQbg}xS%t44i z!5Uomb#pZhxoS$34_00G%zARc9wpe53vN<^o9-ONB&jJwY8OkbNj`sra+l2!+FsZk zc^aPObA*n+&Xj)h0W;vsRyNqT>ckNc8CFkJ77WUPmP)seEA8S^1?K0}!D~SR{I#4eOU~D4`}bwr zo(A8m@MIhXHrk;CI_9^Qpq-c}!=b7b!n9g5rm1t-if~i9sLz!fl0)GVwJbeR?pB@2 z6u$Hl6{Mm7{y(X)Us($OJ4#K?_o;#SQV~hvpV@5H9!v;-9k|1i-CFPY)bD9=WNi&g zbzDPlHuO}sZfDk3&7}28Ok9Fmf|iT#pZO8?$jUjYmn;-p=YuIwZ4)#8vBMAY zn(@z^O>s9pa#B!}&ECI?u#XsB*oRk}^3-FYeX27O5mF<#=^hwK;iEbubaO%;h@Ku{ z0|2NVA8JXbQgK-L5325A>%pG3dNAVl`{AnoeI;1lR;r(^_yurDJSi;$g}sq4&gY8|I>lYzqbF2>U_ zj5Gs!NP1x3u2AdEa%(B;somO$>t7s_B@%E|Kg%SG7sf@poE}=S(GM1#- zCA~nIN65KJ&R>v2GyfK+t%9xa;~-SyV!I_=RRrz`HbR~yE6?+vZs7Q)zvsF>=9+%Z z`99{Ve#H&3*ImEj*8ht8+Q;0^FDy3R@+k-Ba}UQq$A8Qn{FvMOF?Vu##MY0wzK^+e z|DD^TaC^S6IC;w#930rsTLM?BUvK_SbGEi8SG!xO-F=_m^C=R9zyU0K`6co`uJZD| zGnVV_S@&ZO-p4Jie7XJ(@8bh}lILq?PLr+VV~+ffPg{HWp1F#f^*8DtbMP*m#OKs% y9|KkAn`b)6hVu~)Wq4vVmf9s2{5WYn$n*VkmYePy?)jZ}8t$I@oTCudUH=aY;%6xTINwySf zwtF#yu#*7e*~O;4K!SCWMb<_)5C=#QBM)Sdx$Sc)}-sfc00V8FC7Y>b!|OeAdzn%@PNxn4fiBv39kaStNGU8qEMyeL7BGn7k28y%rR_NOv zu8Gtx)EX!w#g%e(UmDc0F4P;T%M|PQDaDq538Tia&_K$ZQ0B^(HIgznlzF&@FHIT9 zCQ@1frIotUW>V^f(kfkP3n{IJ(i&YUO-gH_v`$yrN=oaYv_V(eMoJr@v`JUmPD-1h 
zv_)6iK}u;TZPk@_lF~LPZP%4{k_FkL~h@6UWKA1W01Q^qZ^{o@4FDQ`|V)Xll7Y1CO z%K4Z$8;OUx2n?TNr+Gex%Vb}N5i&tCL;;b||MSX#7>N(WL-Bqwz%O%R|KqjOXT~0$ z<_5x{#ew*SxDtyFV6%Y;E-}MB=7MWtj90V5fQrl?-%x7T$-tQaSzR1@rl(DR(JA?l zg0F^w2{wp6rM_{tO19RwkfvUk`6~F~T@Wc&o)g#j===+dVq=(SAi^<>QpPYa#o^uw7oS^){1X2 zr8Yp=582(zTqF`0xywBk$&oF#yU4}J?%svO7gT~;h2O%kn&EZ&@-0Kcuxi$qsA&h4 zFfiIFq&6tN63XGIIn*sxw@@ivS;}mf(y%6Q%wmWQ@mvs3e;~{RG)hoPLjn^FtOJT$E?D6a6Pzvxtt-1SC8(gD$@)V{YR>W*(rrS=W+Ahm& zm!;;*$Y&Q&X5EC{UOVGnM?RbT;d2#|UT0&C1cuqHSV*@FV`L?p;>0Hu!n z63lJFHg6ddMiI%7goJ@LCyd&K3a#8)P3{*;D{?t4m50N^QW_4{`gbfazU-JbC|}9E zQVMIVQj20<=E7@=Wtm%sA15CJ&b7gCkwu0LMHB}gi*t++41`0`W#A8`D_nqOgwQ6Z zlx08(SS0Y6iN}Nxo^Qpx7!Wvq07mx0&f_8E@k}eF3=<3o1c70AJf2F~N{D5-C=&>Z zianDim@A`eksOII*rlM906PV;;guSFUpP>XWnpjv4hY67t^k};o*CT3N5F@L7bZel ziZw%=HZltsxPV8A&LgzmcMjl@MAiy=!+`;F~M>#nVCzp^e>*(q0cKCe#s zreye6IhC~S*R}4|w?6BVddFW~*{PpSH@2l3$K}THw6{Lx?UcQpdyVbSW}eT#y0p_c z`K{ebmnE&QZH>p!wGT$lV}vC8R>!Dama+_4LrO)0U>`u$9ar(LpNa8zxKGx&Xpo!2 zgAiH@1>q)%MZc48KY*@(2txnV#sV=jKyqr>0N(w=I6vbvku$24$78WD$7@{MiCi0w zW@w!Z@kNF^Uxvo-a_5QnnctD;=rrQ&g9y%1ZF{PAP_7+J)t-}U&n3&#_S%%aTef#g z_JQb1wlp9-~o2uG?IbWhk?`kW!H#*avKG_0>=`a67ih@nYb3 zzAb(S-zHaXc7tG4nSea9*j23ZY5o^bTZD@r$G4$XmMdyKQ52LO#i>nDhcaS8*fJOA z(}F`p(SU@6nKfz36Tq896pqvyBXE3^#;p_Pyc!Gih!<`_(4g~RL9Nru*%ILSMkYHC zJnBB%JP3M(yDL=>=B{TpYz7E%MWw$tc+c^Ej8Kap44fZ?l&@UJ#cMP+pJTm@xDn@g zJZOZm^JADggV9-tlyWW#Izy0SxB%!EJgzxEjukH5Wkarv2`KehR57Rn!OmlJ0izL! 
z1k|X3Tj%lXf#Rp{NMb{j!oPqBh^?kARWl&h3?vO zpGqS~ZkH?DQ1O4az}8sGE#ss76{{HwW~ypnEcO*Nd6 z8_p!>4jh!1PF41ASN89 zK$cRYbMoliPVMzQy8F5HrSq5O9eNsgdWSPP^V&flG*FJwV`%>x%#O$C9L`lo`!yD4 zhV~()B0;bZpndOrED{P{1kyjA^4ow$e+WiijMm>rVq4EhbfO*ROae$-Cf2M`4O|Xc zE&>!?OzN|s)W_RcE5Md$SfZZd(0Z8S90E#|#Z~;2VYHjjh%XD1XP9*W{Y9FDBA|EXX!skc8BBw0~p9)97BFilZTNV*n zE(!17Dq+svWL9W}47DI6tlG*NDDi?;xjLn#a=)x2VR`?awG}qNeOA2HR!O2{73G1o;^SE@vg#WCEo-#;{abBW#p_ys30uxQ z%R9sy#mWs-!kn-%S;~Slds2hT{jz1K_5&OW_#}w;v=q$#zLw7YYCF4CnkZfUom{z= zVx4f}Gcp{Ep)^s#oY3lEw#S?nkW*aqiYt4osFDeo8?C~l5GbfTQapbXBE^jA9{({( z3L(&4t57i%`9LVju~}tQ2vIg@kuT6}{yFJ@26a#b+Yt@_xu^h&2ptk=HY5ZV!yN0U zr&a>dWl$|?M66nO!d#SI;$soIQ=r8d#D4m2jOL=);$3uow7_xW^ol6Ph4F!b<&e0t zw&)MWBE$yRALnDMU=mUdf+Ejx0}&Yf0QWe6#ztWvwzLFlH^+pcOB|SAgB;oiall|V z%naDSB1XbWdENjOCuRli69z`REUx(abK+Z0N>eIx$_=x$7A2-BBA1cnB6Y0=u0EREH1t`_J;t))M4|9Qaj#Vncfelc( zqF_>HxH#-0SPz`}WlBjD79h&;iifm}aBK*zp~O;PjiMCF5{VN5NmIpx_JfQ;fs_bD z8dXeyK&jLXWr@c%h(&pLx}kVvg^C00b}_J}@*$QJlNGPh* zfl)6I4z7V=oXZDNtg1m>DFGWq6poSN1l(l%U>m`#t(1j?P!x=*u(OJp4S^ZN1V#>} zWF;UFOGgQqh%rILXjL3HVaHB53t&FZs;Z`VEbc2)#b%^0RbnQB`~t#z^9Ii>av+X# zWDe@V5U|Y#L;}j@C||2VXMXI@Qlw7KJ5t8RaU~%x~Zg-$o-&a{K~sS&r7x@u>Pd$ zd(w?Vd)|hpGtcH;cBO`|$ir85yt75jKD)NYgGLIhQSKhu-IH>k+IF8xyBoCZdD(ql za$g3`q+=k}epYTjn`)nx+h-*Qz1P^H100pzqte)QSn?)k@;a<{d2PxuBs+#)_U}3_ z>^UlaKL7drfzjj<3{dImENJ?h@{)4rK{eIbooX178%C0I=_X&QX-sY!OU{4es(ad( zsvnW-NB+s(UDxYR`}CsLi)w z9*epUZJ>KS3#Ym!<*vz{ri;n>w4*-dXqFw#Qp>^sPr+bQtVuH~CnQ6NK9w@EBG{n)xqW zcpW3>c>4gNA3_ByW=r!K=R?7pMk-@Z<5AvH?1IR}88NWTgjjwSO5U}dojGnUN3t2z z;*YYM?W5hSL$-HF_P%tFAMIwHXgBLWW+S%%Cy&uTN910JG8<{O3}kd?NU2B=>;pEj zPMi4u3%ke9z-02au0)6}iEwiqv2DXgj{eB~`ggdbD8Lo5bCUg{Wjig|d(s^~+^<&L zFW)iy^%p0l^{=eaP`MN)>tR0UJLEDkw5DV(cLoeS^Gc^KabcH|TIIfDbn` zyx7nh7Q)y@=-U}A=YMbsI)9Y-(q2IZH_SW=Y%rN44>rgS7$$kPQcaa65TzBX`sD4) zyqyzK>#qZxVkJ`(1T$n%bkYe<6Yq0qj-;4B2ZX|7h>8eO-*om7!D#ShCtQ$SWsg~D$1e1?ZfG0q#s7T|Ef_5t-)SAayQ} zkYKr>KY2M`NKA{Q3a+q4Wc3UJgxO$%5xn#WWNIbO8_@|pazckaCw?8w9SYs5MY 
zQIW%!Zfrssw=P}uy{cWdw@daD>FyKA$2*XZ_Z`E>e-8cd812J&UbF-w@v4QK!di^}D<&>tL>gex2OP1Q%W?6BAONeH zBECIy*VP5cKVNuVkn{Cj(!o)0qM5hP5@0F@CUe7sC66lhPD)r(5mxo-33y*D#;TgZ zr$xuA#OyoiYB`L#Ah5c(O|;CC*G9{{0xy4Hp^FAW%yL|e!E-+a6W*M-p;}imb~QS< z#s=tRJ`i7_gMm2sp>Z^tc8GP2rI%tn4OTw*iJ+Op?<-O2z$tMph=42;TNC4JB6v`# z1{F8Y1%w#dT9|l%2Okur0*r0(5Li*b%A-NmV3(vv@Vp{D5@V8})T5oGcqRV{OjE-gTTo zef`?!*V67A(ye>HF&RNOFQGg$2H?!)=Z_(gKgM}t^uKV7ItjZX`S zOK6@Nix;>SkX8u2jKg_{%iOT4)#!fVtQ(5D^MPDkj5a>?01XtOjf5%ZDgrHwPziqy zT4pIDHw3;~bZEI%mzK)?f-Ik*R|6GoJO(x^y@NY9_zC31hhy4Va(#e=Z)CFL2e;Ug z&3{#fVk(Ad)pR{`Z*Gp6oWI1}oSmMU{&@Dzw4N~0_DwHt&@8tUSPP4E4-alU;LgZq z(SR)+OqcX>EXL~TmC$bleSc`x5KD)m^!*{9zbxBw68y8qvpF1+&aHNwUZ0!0POHo_ z3pBs^vyWAi>NtIq=axc`fpr8|kfVV8KI5w#-pH8c!H_>I3be2mkH>hCjEEew?1%)g z{{xQ+p?7Z}?5A&naa!PLo+I#q=muaF);NLQh^@g>zy=*#1lF?7`8V%e*;oWaF=#JP z4wLLhV>e6Va)n{BL@@3{_7Rx?-vY z15En3qQDx-iYiT+g%iNK4O1+rCVY|q`*64wZ|30yT9QGd2VWHYVogC1U#kE@)v5j= zq>3p@KC&q~+CP$4!dDEG@C49&N*z>CAb@H<965K6eS1yH-Xq(4B>P~xZ!Fa}Blpdq zRnJFi`nT=W_03JJf z_K9QHJq#wtQrFMzpWD-p`-dhY=uuXxe<XZLD(vd{){bw9$U4F)iy1a`vs4O%wF(g-`yWuidH1P z^SkH#S><=vvvz{T$U0zTo!U75@2`c}t@TyZ=fu3)tJUXz6Xvh7A4B~vP{vl-mR&?8 zhJ-}fDylIBA$PW+eyE@lvbuV6F<+9eoSLsg(LHq}9IT71pkbp7PVC8dd*G)M#>WN$ zJ_&>U#K+u%pq-zsa`B8-m2f0VwYU2%r+7A7UY@XJ%UubVu7xv^{U{7QYPQ@7H%KX& zvqVwRiiAxE?MT=XrNr|l!(lRQOsn`nUl2ZX!OV)3C)^LAHQz*f%AR=foYs7GQcLB2 zSs0f#16Yybc*)4NJPA+2@$h$n39CL(n8OpVuJpwmPAx^oS)@GSc?j#y`v9Mb$xRZ3vcsxX95Z@Pq9Ql?jk1k0Uq#?2u*g}FXgVsO<|M8JL^=*`mZ}aFThd7m zngAw%w#C6~HIL9ykX^D!K&T6ZQSm~HEV=D7G7?BoODJvh(E<2cDS}`B1i~O-2_W(z zY*swUq9VdYR*?heitOl!=t2F&{N)!bHmF{wU2(p0SZ<(^lV)g z8_X;MRv(0k&}6b7CWk6!Tn@!Pb?@%n8p@aFCNifWTk79@cr!pg zwU?%V0ou0otdt;ut zI5~A~dj1mcL|8YdFpPeSlyWS|eewdGE~}m=%;JW~2|Q|5iWgo%SZ+N89?oD-2rP!e zA#sC8F@#5{NAd8S5DTxvFvvH5APy^z01whN>IXmdQ%H zitm7s$?8E=r;g`jR5^UXm9=O_(npgfkJ5sAZqTZsO7GQb*SQG-F|i!$tK{*bP~~^B z4`fF=usDTI;d;Y2jpRcgT=xce42r@wsbm1(5iLhi(Ng_)s^R2C!B(TK z>X7IGG%KI<%zYPcjfT#{_H-`HwXwjtRz^zy{6 z?VR3SJ*VoV>pN5Rr{wxmNf*&=(Dwdp9TgmOxEPR|20owvrs>qnaB6g3hJQ^rlJf^{ 
z$^p;gE!*~%z2>%5^BKAM%&Q;E%~z8*{$l9Di}qhGN{;Tm&Qs}zMuKHrZW@1eey8bL zy1pw_e^RbLxl=!yuI@N2wRAauTSmE>KaYKDr^?$8oYaYaX#0(u2GHN!8%+cg)#vNbRyxlinMj((rc z53ut?Cso^-su`7QM$<4oN2l!QOxJZDT1>TNpF5JJPliCTYv@QsF|;yHj^Hc{SbK^VLad@Y=5yc1}Eyn;#@^ zr2Eb$=UzK}uq7?y|MF7W-6FMqB%Qw}x88$Sy$R>Hl(EH`ya5Y~JJu%K+a!DM^Ki;P zE&Hc`<=XatobEb>nr(Z=#s|-Oo;kz5y9K5IWR(z)+yde*Y- z(p;JBbx)aXd*8kTb=>m0lw&}43`h?6ObztoVc9kOGP>=$v_E_SI~mw_S3dd4bC+Cm zT6T}U0&k+LXhLiTrMSEjEsUT=ds;yw_O$(`)B@d?Q|{@^>mO=7{8t#!809qIe}q|# z(5#nzL#pGtcL%yvQJl$~y-mI?B*^ zgvtv-of;BG)}RS(_$Z(KoD>!6f&dDwc@-~{g0PBrSV72{5BWwg)0^MhBxwE;W`lvK z1Vwsd$>$A%tN2)0D-na6P)U35K1*_aZrC<#zS7NGS#eDLA|DJ_@iAh*BR2owng9|e z-3EmW3pS^4pbafsAafD}7w+C2b@#G(z>hM@a^U(Ki$7FM&`>GMwcZ@M2PQMHq^WqZ zuZQdXD;JG(f&b)a9JRSivPc&Vi9e%bDSzP1Y=MRkKi->#!0!*ZnB zYrBrQv<66SpO=lPp;>uo_E!trLm#KR`}5&1zIIF)v;j7BvGjU;GdQ>3q4A z=I{3XP+Yms@YDgjvk&&S!HEWrt;5VsV5MsT&Mln3qQ#Us+jB~0NP&|$;j4Kq;O3Pg#|x$&5$Hd z#8@-{D%Cmw_(K{&4T1>&YM<*v{;ZHnT*#LKK~9y=QSw%0e|5mBpiTZNhHAhi0^8gYanJ`ChVnY@{)Y74pH$WeZaz-tgGCeUpaGb(?Ii-AA$ z0qQk*rNqzUX@tsNkc#aiSSR>P9Orl=PysS=;lD$CU@{mC|HDcd>c6JEUsLpdp=!UT z+P|hweodYDOKR|I>g-=q_hjncuTAc+&0W7P^*(v9Q`-Eg^|w}&q3v4=qTf2qhAW0| zDU1)D9z&T_+kZ&G?_rO{FeIJ0a7e-Lp}WrDl$wSQDfm4c`?0|=X^bb` literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/dbrx.cpython-312.pyc b/model_executor/models/__pycache__/dbrx.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0060ef446dd242a1fe49b5f6ffad268c402863bb GIT binary patch literal 21474 zcmdUX32+?OnO^tYCk8hL#5K4G4haGv0Fe|0Qldms2TAC@V98TMbb}mn1^|187d(*Z zEjb2kIs$9sCFu1=P&SFcl`4Y0l^Av^v9>BHADdLr0<|;lK($OPC+nnA1)5OgY^suc z|9jm%1E@jDo6WIJ;`Mv4-+lb=|G)o#^QXa}Pr#FOd?WGaj|jpqC}2D;-E-$*n;={h zBta6BLP{JGMe^H{wv>Iy&iwYIW5|Ixd(xS54Y^pHBk4|ghCIygOnOtkAs_R*lKxa+ zD3A&c1zFggEJ=liLaEZB(p1?{8H@8I%TpCY70mBVR;H?ks#4WM)v21H8W!hEhEuge zwao8NZb{V*)iHk{S)XbcYGD3gvN6>()Rc-0MOe5b*_>(_YDu*YwX$$1*_LV_YKOm6 zDobunbqsYde|d6SDmoNRZ6Dg6+A*{v)j8Ce+BvjS6l}tyf>iOkAXUn))2|qVRCvE* 
z=l-E?ma+;dtK|wawui;mAU1rD{JU7nTBO`^kCeSEWgSx1-y>xoOWA;wjrT~|&r&ua zW#k?y?_(*Ok+Q`)rQGq3Lm!jfm+aBjUr`_`YM#e4vJyX$l;L+J(#gcIJP5Dv&8ntQuN8-b$Wd2R_y^u_%^mKMDa8l01#>ZnPm9cZwjE%>Ycrpo(5+6OIwQc_W!kChjV(G*LHOq74Tw0cnj~$NMwX&y=zVMV@ z%CW?#99Oi`XXJ4-JD!mxJ*MoLu?y=Hp_I_Ev6G2R`YB~>SWcr!h$?^jTpYcch*QtZ z&U&a>%GkJ!zLuE99DhuXOIq~{W5e+iM&#kt30aa7qbHHFN|Hz7=aQM&1vzo@R3?@j zi%YVi)u!T?PRKDuPQ}M#XU@mc!||jX8;z%^V-=5&V(?S4ltAO3V*{1`mB{DC_w2dx z@i7Grek$tJ$_}2(jJ?Q8P9NhX*Cf(3)Tu-|jTW$>P01)u>qy1V$T2xJp1BlD6jh*4 zMLISTAI^*^moz_1n2cYNm9!S%4INkHk;KJezEZerlAhl|_-i0OvM?k{!jMf8hwNTK z@=A8uAvt8{=|FKd?w4#6Zb2AvObH{R$H~BCT4At@UHD0z0Ii8Ym{?Ih}s8_U@tmOB;skM1@T(oEbXKi zA27_2Gkp%jsd-{pfr(5kruZq9J2TGqwB{Qb8PyYOq0|^Q#JNYLBzT_iIOkM!`N#I52m{%(7~?pOG*a{oTwnG z$d_A}(Y1bjNPiK|Z!QaKKEYcu?N&YEo1QHzH4iM;?4NR63FMnQa?QPJb8oKsVYT_; zDaTDuW4@;0+G|%|qt7cx1lwIkCqXj|DvDHA*h=6LMIl2+0hrYY9)h@&pX`;Mnc=Wi#NRY0Bb6LpO zHNCm-`o6i;a$P@0Xeu~!HXo{A^3*H4@LjnNPSj6JhNxpKrukwt>E}Q$V_F~91**s}3PE3`(^gXDrieWjKjKyw!@m_TaHTBodTs<>)NUh%X34K~^u{j&& zo`3t5w_aHl;9ZcvpZsp}Q}XhjQTfC8Gaq9s-I+yHF-7$X^iK*%HrdXosz}sxQ`*YE zI%W5WhtXFT=&MI^%ihzKW?iO7@_@cpn?chfd1bfcqcS8v=xl(|*&yg_Q1jD#9%d9W zJ;?W2JM!_(=F*P}okFnk=m21A?-Yi213h$ddI*Z8^C4yn;5ltlv_kAADJ`fZQd-eT z>zXVMsjOWRB~ZLf$f(_<&<@Z*y=)QG7Ie_8f0J5;m^mWYdlmOC7dod+EaK> zV85ajW`7wqehwtpUra->C%|rKeuG?qk6@hEBVFKNxM$fILQ zDxRbiY`(XldgTzDs9)Jiej2JXy3S44P;C#3-7mljoXfAxe8oobY z9?q4ws^zV@@||k=PJDf{T=C{)aE_8=10}$M}N})Zw7ukpfvL5~(vDcynSr-pU==E3i z6eIQld>L)nq#rJ`AHJ-ukblI+a-klKvh^6b&75D`r0l{Nz3~6v7-ij(hp&bWBNf@C zH`XI1>LcYeMz2oTWW2oQc%9v(hQfFa{ofz2(-nLT7rQt%JfB^2#>y*Rb9KTcp)dlM z5Q_Q+l`sm^Y!`NEE=~x3^*`|_wE47vP8yiNsM*f%`W1_|UFePa6r%o`LyCi?C0Yu0 zo8)%QBacey3yI7rWk1EbLD5ge!G|-V%P6xq1Iar*K9-iFUQU0XL`vl;a-JsV8I}=z zI+M_Kl0+40K9r?!nH=?zKTVW@k!W%_FBsID&Bfn|+G2XW?h`RY>0g4DF`8YmYq`94 z%AK#;a&749(3CGN~Hbm;9J3bWplo&X0|d{+pX4iFSLEX^ShmD z?Sc0lKMwp!V5#!>9jCpyd?vW+5qcl|;o0}we^$2mC3V-)8Q-^e&Bo_^v&la?xKy|E zgVL^iSzWHIO)YDi`{K>A{(Nf>Pcqy8L22Yx!}b|p-e3LK*O-Iipr}<=&7AvoeCFtF 
zyX~M@&gwfoziWQt#(|~A`+vA=scztd()}2P^7_qN7TG!DyS{5C_}A6CPhCPq{nhB~ z!P&FF+r8=*${I0r&0T*K|6bX`;rTOaWFOUcHF%3xd^NaQCIq&!L-2-ua8MJXCCdG@ z()N+_06BEnDI}({Wvd*3PqU{p3g_i*qobOa5Eq7SC=XIPiJXVXIS8kimw$vJ$$5f4 zd>IaSd9<)^Pku*tzB+u{U*&vBWZXP?h@Fql_p84%It<2T{c$ z00&i=7~nU66%L|EP9S?02crJfC3|!+zzs3L3wGaSDF%40;{mhy!2bIg`yT-NAJE*i zc>%jkGJuA9vm*fD;?VE9Q)>jH13oI>Mp})ybTa3BVzdGcpcwzmsJgNsOYTOQ;E84>7bX z9VFNBbD1=w9nlR3-UFIN_r zFDt~yC=C)h5Tt@EkqM#!>>(+{O5QOJBIGjJyCU=l@{M)AtmF5gP51*FBpsbuLb?Z?%Tx$61K zTy&or-M3u*05SF-*R+7+=g>>8cDq`;Jzrfv^U~}~OBGw!yv{H|F#*1%2^vxQ;H*Pm zXaw1M`5`<&U&PaRZ+s?gh^JH=9>}1U47)M2ll0x3+d}37nuAp%Sh3?+GX>_@0rtwG z_N>X&WB1zzl^0L}pl3Sk$7RiNJ|SOV;+-;z56T$(T7(k$#+1a#7)z||#z(D!?MPD& zWQbLoXgyH}&Mq*3W+G(FtiGwgi!|wH;Q%wLYROe?Q>(UtB?!0Y!rRsG_W9oV%zN1z z+2!!SRPa`~{!RaN|J?EAaPL$w-`MhI@_KT9|KiT&#>bXCb@@0hn+rzDp{XCXXpz#zl*;#Wn)N^6G&2J(GgUs<4JfI zA3zk`aQ;vfKYqy9>2mfPKoh(M&;&2<7S%kAKl3qm@n=pM9cc1@7HEPPh!sX>O%bT% z2B_q*LL+d>SjT&<<40WVl<~1s#?MZfAkDt1U*a&678oWSlMWGcc;FCvwT?6s6U??+ z2^Q4`(hSLWJ^21PNV8}OLJ2XQC=%>qP2R1@xDM0Ma%T_fw}5!8j&l&-Vu;Bqmjx_T zOtLqr%K{TbE*eFng%;=_QkX5s5RS{!ZZ;BTX+}ymYG!}d#g-OAh7SbL&WEQxR%o=m$DfwC+{ z-H>x7=ew>V1xtmID-$(*CNtm!P9wysjCho5HKxu9oSc&T4CKAE?Fr#`=(w;>2ZjXY zne&Zw=13TIAcgbHE1+kUn|&>+t(w(_Hd>Wal#aD3LpVEofG}9*$U15N7{?x3YK}6} z6MwPcL9E5>aHJ?kzz=(ns>kpb-gnKr#YktJO7c!%e9=C0XJkmWVGnYA?v^4g9hn;* z#9Fo2rtc9S*0R57k2DzB!LkJgk4!kbUOao|eAKSA;X{FjPI$Y(c*Vw*vD42%hvrkD zX;Uu6q(q8zz!zfx2NI+Yhk7rbmbD<%%%k$yMW~!EC1r+9FxY|)H$1m~^q0=AcqTK7 zq5_0KL&tpzn)0-A0bdKXMjJP9h?*M^F(uSOT-QgLd4z{vtU-P%*-#YvP_&V!ypYGJ zQ1c8Y6XP*-nff#iK>Cyn*aAv!v|Tr>rtYS?YA_DnPMK08;hFa%RYN&7F><6K}OP;+;p}nlAooemQ zg$BSyNLrHjhjRWF)!#Cg`oP}@#;r1(tJtnqY@dHst=P91|DfW*rNDzr6%T@mt8AX@ z&$aGT@qgW?mHMVR(fG8OQd^HM*B@i!mapD&ZS?BsT;}b`w6~;;x{=xx$BWu8;*@q7MrLdbkQ!-; z%xrilW!7z^g`Pz?O$gYAC+jhI1zk<=AT@o|F$jYrdnz%id=nYCpeAZ|kZ_>7Qxx@8 zIIv$y##7@lNN})^$|!%EKD8S) zbo?mrgTPY#;h*%*M3>74r`=QH)UK&GHuM``p8oP|cK+!FaXHjG?VxztU5Tp+?7dxc z@#X4mQ+~2W*nfTheBE;0eHPXTGTM71uu(#!MXL~T=5QsQfm|*my!~Fn;dx3!4nYHH 
z@^EOfvnSuXn?aKYMO`o?4@rhC%*tTZ&B723b*R$I!dxz7-Xpevv*A{C-L>S^EuSqkg*nQ!heXR;2oe@n=4N6J$kp0y1mBL1?HC8b*n_ZNmD4b)Bq8bH?bG z4fJb+tf(6%7`*o9f$<6u9)J~tAKM6_nR3WIbXAtJl(Fx@TTE35-yV2tV7a=7 zj?)n{vxtTp#s@Nh_4iC!rwTu>806yI5{?a=C`1fS@iT?l(Jyb zdKWrq?xu`!#E>Bs$9e2b?VzUstuuE3Xq01cBGkpG4C6h>r~ExMYQoV4`=Y3;h-T5O zXeArc5*n)6kyIh(!&<4pEEd_wn6#&x7T96r*hqU;8vBb0PSet!r$KOPu_&Uzi?FikeCsmun7VNjT71!W#pfO7mJ zyp;FIuUmPVghIK2Si^R<*0I)Ep0L;6^U&J}VUqk{ZR3HHe@yVoitRn#dM9_1m1+81J%av^35uwGk!-Uge z8o`$gIbj5Et;=;=$LmJ$9vFv~ZkpF;omcY9UI0~KBc#A4Dfrh->XzBp5uX%%-8tlw zN@TwrkR8?~jo1inqd#a2g0h2rXnP2n?p#FZTjJLoUvqxVwa(xkQrYLh{H;*_uagCx$se&gne%lA;w|LD zue+up*A=K$K{-3^w8HceYk{vCawSSKDPzXOc#)^N&=SKp<(P6#xkg~H`no4;gW{GW znv)qDXoInX)ok8d@vPe z?vtW*Z;c!nHCn^$!I|V4G#4dB(W0!t-<=jX-?A4QXHMIuMY;n5hR=bIc(_)TPB0kR z6A=OVrDe{M(q{6}KyvbX`SN9MP~{nb5Pw37ADCz(8q7A0c~*4|;82ez(+A+;sa_@m zNppkK>pS<)y)ggE;=s~VFML?{A|qZC?yeKW2c+4SJ9+!mFQ)$SZ$3VNyGV94jM4$XI-E=MsCl-mK@FW}k$!P1&%G^XD(^5g1) zg6mNvP-tEltm$5t-Y$|ils}|AT*4U9-24_mT63R@r@;y+nlFJXPbg<}SmsSmQ2KI3 zKAA|5uG#8&D`G3DGtayUPy^vdFFxQHzWzk$&pIX*8<;vHz2DeG^hdKeC34m!Dg+J;7re&N4WoxNU={&KlxGMMUL^kxKz=;=62kV)Av4NPQtTo z-|3ofdv^z}r|4qQ?za!TbzmWLvzoA9KcOQKmVR$^>c~o{dglD>`42-|-`%$`^8GX4 zJ@YSme?0If10U{q>Q-rWt~9EaM(4LKw0&5*2gYAD?Px*Ii()=lzZ7VgJx>_(9lNau zjr-8k$yS`^O^m_jk*5=6G>?@k+>&6yuV7zPV-9};5w5axZu#;5Ja@TcjB zFl-P*K<#A-H;?psxfLRxn+>02W4plA0Kn13>5H@8I zNioPQ2S3qX5?8)15~8j61(PE^;&(9+?8^q-RNUy(!AXiiRzz%#MEj_qCqb24U_ zFDIQDrt2>0ZzC}?W(|bDaUkdK_`u&m+-+!EKC+YSA=`41el^m+@bYrx!K=Z4SJAN) z=zu;WU*EIPq}CstI=)g>2ki6c^`m5zbiW$Cf3g2Z2Yzti&#Io7I{fqc=DE&=;Bx(e z>Eo+*q57!!aj5PaujNA9KL~AKsn|LnT5MSAcyzhqv8BLc8-()$i>j?%uZ^dR37(K_=LN+k*+)gm0pocv9)DmPv0b%YDRba(pR^U$ zfi?6Yx!yI+;mw(K6vRhk_0Uy@L&A*sD%b@k=5Vl`&e;vmS5BcOi+D@xC(|_jqB!YM zj%FQM58E-`0;HCO#bn0Z2gd53^kv;v@V(Jq$xF(1#g=uCI$$vGWu>~YSA1D7;C-S> zcmOFn?+)*SG5J!~WwjF|pD|tvR0fitiF3Gez!@txMt+*1tn0j}h*$h5#bhCQ?d0OS z-)POGD?=NBJ=oJGbA;U6F)}VbGFk6wQ|WG`##-|?>!$z{YDv^j;%T8BG%KD?GDVrvslq0{w8e~NJS3d>^XEzd5N4M 
zIFOCa2+e+BS09U=aHNvsT}QwpmEs!r*m*f7ozpLzb3dKR5P0zrWri}ukXg-`*4UpBp-tF`R1M4}NERKJTXys4gTo!Hzgh=llzpL4I+Xvr_ zA1uX|_W6KX+ke%$*p@G^p83*DYN@pC*G_w|1hROfchxU=8&?Hnf%>B*&r?wg?= zD0OEOYE8$Cjtd>&G=c z3zZ9BTCRCy#*we6nHgAD+=kxjMN(e^wVd^wmHsTyx_VR;+IFlS6NS%7=qvXZBWe^1Us|@2BKtj3(>K zFnVD=#!+2rWmB_=3U22d)*K&etN zm=zi`Lq0O5JQ6>bjwg>DFF;d@DH)p5Khb>RL>b@oPcTLIg319T5rewnvEpE$V6p_G zn8K(CbOPLAERv#96@TFtvt;twFa?A$M!}S|z>jCejLCl!pJ$w_1+xd_Ga;~|6W(sc zAD~TJiW;MPvyOFjnCCwLC4Ee+!6}p${%mOmKZHCcs0Ln;Aczgm>6Q%<2qG83BkTd7 z;~Jc}FO!hT0!!kO9Q~e4n(mv7j%6bF84LOm7TiYxryn5zCXymBB;BR?*;P73mOJVMQ5J7=cg-3RC_g zIW%X=56S5y=P$^i(a=iyuQZT-AOD>jp0in5P!IkSdsP-IOj<6rw zXt+hct5L58>T`hsH86k^+Z!|h3WW)Qedx?Op@=p%+uVEMVbBEL8y{;eudzT4xWdM# zj2(7AMG?hd1ruE-xIVlr%@~mq z*0lrhY}z$TTQUGEM9}e&C{>uZ|BM*6x4_=B{X>;%F22RmR2ev6&ij@OkEH+y$@;IP zOmK~Q(Uh+L{&VUw*|MX{9vDUC%C@RyTVef|eJi{0>T=m5fCWQUQ$gNW;^^1)m8xca z-3L!Gzyxs)CDTJa`Mm3sXf|WEjDAvu)KB^it9AWksV3?-PGm8Ie~Y?Cs4r+kS<77i za@jW48U1eVp5@SvCC?7lFniC4DC6d(^&L@E9Cp z73Uc7B90NS)h^NbK!Ia?pujOcfb}`>keJ`O>$cxl#5KZ?)md4~ct|&einvA!Zr~az zY9rT3abc&k`WBc^x>4$cABA$utyzcat-s@@s1lpAa}&-IUkuI?UcPpy%h5erXm|`4 zH;2ixJKYsOG0d55ZvrTJW^62(=I|M@=9-87pibB5sB(&efue&h&3|&Igj988_RW73 zj$1l2{aqK=5^y6q;`_BymR15XuOv=jgGda)V^`SuuR^u41ngMF4?gir zu!KJ+liq}+E+|sy_u=q+aB^Cq<3)4wT#OztaDfykOohXE56uO69>2(>L@4?Za?X%* zo}4*y-X(_#XSD8a2z(9pz^{1*&`e}Lpt!LEiAtsa1DsX6D2ktW1hM@iq5c;_;3J{x zBcb(|LffAU-JjYVqV2W-=M$eG?pq&xO#DbV@{#b+r#8E2`_u!6O6mHg(DJFxg@6EO ztvoC?%#@SK>6!rV+U_Hw*fq0lO`!MMGqyIdf9k{=iRr|e0PotM_>_nY4_|xy>f>ty z`PW8ljbg{_GjG0p{pB?Q-nECtS|qVu^Ir9?3GlA%l_`%NLH@O;>^)-V%n4?4y5$oA I0X7c*3kHvz2mk;8 literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/deepencoder.cpython-312.pyc b/model_executor/models/__pycache__/deepencoder.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d4300e1bcc508637207c1baf995c98a6f557b24a GIT binary patch literal 31074 zcmeHwdvIIVdFQ=&KQ0LH{Q^bsA&HPEQY0l=mSx%0%aSZnmMmMgA{zk-P^1Kc9@yF~6P6VAqMR)6da94eEx 
zJF~mL@7xDK=+cUtw$qv2EAimmbIjS=IJ>70qh`H16mbp-=L8lvM_fZi=ZaX|5^)cC&Uske8YvzsIah+XEo6_B4waoN8!A6n&T|It zIWFXQlM6Y+){CubS98Dbp|5nDt7JI^UBQCA5Sp-!z@84^`=xVD(k!mkZUb zIjhhw*Y60`qF!Ifi~gX^&^kRrUv=-J-xjJz`G$~B|9Yrlo%F_a(!FbXUAN|Jy*>z` zCiJ0^y_IIXm5q;iD@~zIZ*u3FLz@w9WTjeA${#M*#@8m6x&^6QA5+)nYrMbp0lnHD zKQGyi#=~N;HxfqNIuaD)gTV-*o&(2^zVPy3Y;ZVwU^v=8cmZi8C&nW2!6V^ds5>4H zN8`9gsi($*h>r*9T+3S?3ik(*J#ZyFcwrzOhztiqVetWvR{aLaNwpt}_6;LVD$6x> zXs9S0$Rt6xLpOPKp%?i?TD&KbhSi#9b=?zf*0@nLhwfInl{f;>gJ zYif+fX~+~Z_uG;j@>||?owJ3kVf)20HAn6@WYSOXH-&6?i_Ro>l|LuE$ocJ(?ZojH z_D6>ME=jJidM|;{;E-e;ij1&`G1wQEEMho5CPsVuOu1G#iW9Y1{nrS7lb_+@+Qmsz zf_vM{aqlT9JQv5Ad|b4svD|NhPZ-`a$mawkZuJw7zjJ~BA6EglpvgyY+;jy>_@?n_UE+arU$?IYLX z1H;kw%aO=X`w)gwBoMwD?i-5_i}Go#JroX)gymt@HgZiYN5$jC>MLqf`&)b z@mKql=T4`b6@ToVeio4#F2P--8BlN_RaqtSFCbNW1G#j(A*F`G-xoQt6#4u{k;XQA zsA3gjcMM_;C2T+tqrUlkq8C4swSQQ=5)?z@9_@)JDJ_(>5y5pX<1W2%ZSvY|*{!OZ zRddbn`fvN^yMM1WUEQ8`Z(p))7wO^oUG(y!!J#l-z9SI8%NUDLTnGe4$AS_0idaFd znw~p!{3Xda7>!G&__dL+WI4fdvzH^8@$&ixB+Hp_G&U@X4OGIVjEuf;BoYYR;l)~{ z#^@CJM8&Tssn^-k+AD9Y}aHMWLA(R-$2wI4w~ zg#y38hmD{`rjRjY>bEdzV+OS`hb&=>9%Uk$rI!wBV+FMVy#lqdGiu{J9=sM7dxpiK z&X6QT$A$uZ1HouC9EnNB@JLK7MrV5Z^2Zj9C^jBHLZpDk7(X42iK|qZyU2JpRG4=> zI!^N`$0yUFyqa|=w{FeaDv&Us9j#~AlsVxLiLA3BCXYh<3qcXX6-%G!#}&ym91X`L zi@aD#=JBvN9Ft7FgTa`%9oPLPc@DKxEtDXSL87Zfr{rPCs7qPiE71rH6^lKG00ic~ z&83l-imI8&bmW^m-`#V2&wM1^^i;ZHFQ|-@sLb){BjSu&(HF2 z^(+hZDWQIjpWm@0)Tf2*T86KNXD&`(oGYF?^X@CRU%Bl~SG1*sHe41wRQ7B9?2fsz zSyxKfG+&$&w#=XT;MF^?{^96i^Ig-zL~7filyC@{-JYa8l>Gcp-~OL}Jok(Lw6M?b z5Q*@MO%ya!unB==?gs-GmnRTS7;y^)M4rX16p-YTPm8GPyGs=0`l2VLWu~Bj#9SU9uE3Em;gJ&KPTl#=Fv`+&n6 zICEH}*>i`JY=c7=0? 
zOBP9zknF=_aixfD^wQ-3(PGIF#ab7L(5ffd!AL7n=U6Np2=>MTBcR&sLm~>x7of+%j=0Jja9EoR{rF>{F=2Y2e~*X;4&+y2kVT9`AnuA2gToFzz96pk z^J0mbFZUag$ufGw$TR(#)K*n;o8MkuTRMX}PYM8umXA1%bao`nYK;fE*PaEbEshhb z#*Tz3VM$mM_JkwhOt=z7iQ%`2z~4~yl;L_nVu6RAnwFE791Ce z^DH~egJuj%2Ur|- zbc+{ad$co3Qs=s_1^m2!kMC%7WGs$UkPsiyfVMTcbM!#X{{m46ot!GQQ1kVSL6{bO z!~MR&ycSeucWo;geNhX`jcDSy4@*6y3xE}C%Q0f4mN`h2iP{4RT$@cJR%+H<-#|oU zRK&-q8T#rA2XTL>!`Dloz}lKLTIXwFt1F)0X(D~pEUx(m21B86l)CGS5Bt{ie{Dr7 zdGxjP4i87@HT3(=h-2YaUoaH%1$`07;ZcHkd_+X4veB`yc&*iUDU9H9Ffvx~qHqgK>?~Wb*fjBhi}cA^^7_aIS3x~G3!AScG8mJmDO>1RWpY{H zSPZC}PcD?JId|uNljOV<79m^83#TO(7YBjfodB&7dl1O{G~)vN4#|j7E51P2PEqh8 z1!pLDiGs5fyh6dN2qXua)U4jaeuH?PE)bI~9zp;-gAxdvWXPa= zj^hzZ7Wt`3W-%DO5SGmQ8RTaPjsR^6Nsbr8m|(y}gAvIBMD%iJi1EUbDH;YcXpV;C zflm3S%U>o-mee)zI0f1h6;IM>?jfG0(}E}YGNrA3oFV}r5>!aF@nd+vwb4yj*Gi4j*980BJGv#%e ziu#9kv)7R{eQ2x77IE!e$XoP7XT?W#^|#L7Jb&L)KI5ErX3EN^uFmaARc@QVy0|CR ze(3JCRimM}WYuWeP?S9Ql^!5+mDMvt(?jnJGugPfVruNSyvgqSW!0Iox=c+I{jYAw zpc9CAH)k5VGFw}hc08Bb`rOUu@14qco97)VZ|B01yN=Y3V@tKqKP-2a7bOp6t2kFN ziA&4EhLo^jcI>Z&joDhRWW&P^TzTcpiRlyfg`yirCy!=4)%4$6mnpBoExC%&C0Es~ zJuNgolB24cGC3{CRHSBFt@o7UF?_}Tu$hC%|LXLsv$44Y3#S%G?=>u)KKo($E4b*m zk2Y~BBeQjlziL4q#((v^0>D6g*clpYrq|1G#QiHiI>IXefWr)`a%v=?W#%l{Fc;3a|LCM(H5df9&RL;1jT}kswY0b>u>AiFO zJI^F-E6&oXt3PnoXS_|fPTo8@-#tIN(6DeS?R_G7BIEThds|c9*7slf;I%uirM*uj zPu%y`-8ymeM8*R;uwl-&RPN7|Z^+ba$ZTlB?5HdHg}}LsCwr&L-x|!iI9Ck_MBSEU zZ%4}8LG>>3iydk2)2O@Dw8Ux%w>a+sP+bPJBOFC^*uK?{C2Syg~UDzzCZyDcri@DB?{>7 z#RvsM2>h0uDnc^$MlMNCS`h;f^&vM%LLaS_YtFiqvlt0D^3ayS)&`u9R~C+F1!Ma| zT#U^iAdJnkEO=9bclK0Ts0Zlfbl=!Fxo^q2G2&!USua0;+d-E^RgWouBza`6H>HGcLCxaSzA><~nIMGUzz2(cULwLoUSNWygNTD%iFgFHD}ZE=_&QGVnCJrjxqx*JP-Y^BefhN4 zjs&*CtRPYV{y7W5KhSb>j+O(EF`y7T@_<0VT;Q>+30uM*;xAz~0|;|Ojey7OfW8cQ zaE=BGK2op7m*u#p1RvL^JytU#X+~5!p0GgcLwA7#y`oi<9cVLOVP^81gM-M%o+mxi zM?oI~Bow?ShkeMnj6Nds{~?oq3drRDRg{C!t--htB6ctm^KEY>ye6s*R~oW(5N@Xg0W31m#V^RL*rz9t%=&^BCwhp=k&h!3F)&0!5B?)^#)u6h^k8=$ zc<{sW`Z?n}0VevEH9_nJU_h`zRa0g|duHSIh3+40+&MkI`24-vrKetc=(5T(^v5M! 
zg-pS4F(?G%}LW=*{aqN9Eg2qWIchUnZ#u% z!~cQ81Bm!Rypboso@)dfwJS%6Sw*gMjVZAwOcTa6%Y&|nqKXvfv7}7!5W2aGv@{hQ zxy6U6(6!|!mrdzyO`ZHhB^&>R~G!cNGeAfwDik*1<}16iaJC3!5GyD&{xzI7tW zuQ=Vy&f1i-b~Z=?a%tJj6Vp%3c1-V?9VPptxzU^B$x+}{RW0+y^9Pq(_oiC+rmLQw zJeurIUS2^>fS9I&ZyhhJ>At6Nu468=ylGcz)2>C!-Qv65cP(kpbC64fvZ>KY6o3W6 zlxfB`ZG(MQ&CQy*?yuLSJ)2X)X4I9_?iUIgk~I(r4fh2CGR~Q#r?pnr5GN^36E(yy z(`h*cehRiwu)d5z+#O@_zK2NcLj+9u&zp-N<$LROWC~TF*w*oJWvhAfip!HV;}_xt zgaIqXY$PGrDdr%lz)3M7Unp=<3Pg&WOyH&z56J<=6e}Sypp;@|Bn6aH3}QviY}0IP zwq^R{TqsqxB}>f%7=$2DO|kR5Cuf*|sLJpoDpM7@*FOAdhdTCGzlx+nhTqPq|3WTe z{b5-OLj~BWTJUGZpAGh^NK@Ev17pAKAwz!=W4|3>zpY^PZMvFXhyr@)V7~>h-wvka zcVe~`iNB7njX%w-dAE_O%BQUxG7Am18*I3`s%dJMe5Ff1W397l$xJj+SoNSdqqY*L zZ2=Q)D`cWUW%3wjRXaRkO;|(xI9OL)rMlRWXDy~^jMXdji?T2XVJ1SxH4Lz#G3B_& zmsja7TGj3d?0Y^F&0X~5w<^cQs|m|>>oG25eowjh1wOAeP1Bnz8K*hM4n0vW1gJ7( zC4r9aCuAeDt%OzWok~~HFHLm(m0Pp@60K=#z%|n~liwcvQ@mh}6Z&TrA9Cs{;!7S; zi~h=bQHABs8I%0us{a0kaHK@iaIAb+*1z#gnbtGD|0q+v6aMWR8jHn!z0jeG!kv8r9 zjUQ~ivvnc3xaIDt?`%u$dhT9BYS;7kUP*6xIbC;_^l0~GwcFz|zTE>nUQpmHLV4Dh zn9^fnRQz{Af!0$ood*v*VVE$2t0f*)qtqy?964@K7jQSgNYun=9PyYMrB+Lmjg+PY zq8=3-0Le@+bH%Ev=2}-7MEtC73-XGg^g`o*^&t72k?eWg>4bH{HenB08ISIm06e4z zqqX9tapdnGuG}7>#OZotO0=5uUG@% zD7_~(#fy~*$ET{>aZv?da}R^oVoY2*P@CR$O@X-1bxlp9KK%(;w0)|6Id$93^-KSH zu3!4sAMKZuZ0$(O`c%F96`u;8h2B%Os85~dl_pIJaoV85j*CRH3l2SV*NgQ9nQ$rt zsR;q9VMY3$;GHEH*US<;~MB(zbWn7Kox zeF)he{2v?ztM3D<%WvqRO@MYnivmNz*d>3l_zu#=-$5W*fqRkcErU}ulqAbov=`<` zY&DiDV3HFEh5Lqw0Ca^zs^OJ5M>X80?54|u;VZIIkV%eA#pUQ9i^3v|?6@Q^Tk%oL za6EuzJ~RfSUUWvbqZ4nT4!p*mprpvayV6T`0i-BKyGtuej zTyVbOc6c70q+JE-Mubm!O-4S_U{1=aXO2xD%k@Rx(h~f=rx!MUd&{@BES~zKZFkGR zU-P}1d)8F{z1iR>gURD)-6}Hrm9*$xy_B=-}1dJ_fGx!wp8ayG+SMi zJo?bW6;}{uvg~O}d773z&!#-jQY{@*!MP{#a*Dh8RnAacG}#UA>d~wWhZ#+K0N>Cs zD155L^ZV06`;T2^lV_(+&GNGy^n%l_ru>xNw5uU&-aa=FH(U zJwR$3X~;KG&`iPCC?GgjhG{+>I76~O0{}=N86%_&;LHzA&jDAdsU>Je*6sp{aW|M> z;TaS|5e183QH&4nZ>{u@(|?_rR8KfIC z6Xd|aU2j}tWc5Ja1l$qOKgyr5^jlQ1d;*(i6BeSKr17KQ3Ht;94?XJeknDDDhj05Zg*tvdeSpaBc3%-QM 
z%*6T^Xp=b8g+Ny_?28UZw+&woixF5ql0rnrV`DOgq!?|Id>9RfAt}lt`dU=rDiknm zzbEu9vK}!$dO0cvJKO3z@LEp5S5RLjjVj7WIZ08ku%4b~yR7qU)|I~P z(K;FIhwYV5LypxJeOr8_v{bUUD%qI{R>4VrpIVNLpr0eS3%YGc#RF{byN`6%R|Z3X z)Oy2MTp%{9PmuL8ClKchYsBBjKzRUOi4(tx05~>Vl_wC#&a_}i`~$kgR%8dPX5xbv z#)ijYeka@ID^HkzLpkym%L9^`0p9Y@70Ga9L^7Nmk*vyH#z-5Gw@AuMB%zoh1?Pxd ztRQlo%hqrur8AY&m2Vuo-hF*E*^oT70(;1g+0Wg4Cgs{Pe<>wAdHqmEC&?-#N561w z>dNiu6I)w`=lKe=IE}iY1!76vbCjc+m~%SQ?{K8 zy^CA#es1Z^OKIE7p!H?}bWJFoIz3x5b!5ruT{0<>?|13GZz7Nk16S5;(9r83WHw$mS@2^xh#M&-a9B>F@&0d@rnZNPk2 z=lKL|2J~1)&;)rG93VqAuim`ZXu^7J!;ykU6YS`Vg{UFuA}AxHi}kQ!z$O>$$+RUb zEVNQcDg7q8<4NS6OGW?bDDjm<{via<+>SVP>mwwqyjZBn|fD#EoaDP>k}x1N<(s#ZXmCe_7EPS zEGVXHoQzDKToyKAS8iHpAyY)*`Vq$KNS0CAyw`F`{)OPh z_Alz2cYg$80Pi4p4Du1;bF4`Mtb@vWFa(Wl?tV|n3*&Et9`h2Cw^AD~J%MNZP2|zR z++Y@N(9<`^>1DA)54X`;Hsla5U9`(^u~e-m_lv!dpU__*Zwk@UlgUB;h)gOu5-^;5 z-=d!B-N#y!SKZ#2U^t6sPls-cV8Ybv6Xq!T4%2p8RoS7Z(ZSm?T+}cSH7Cbm&!N(e zN?K)&VBUKMGD_RzLv}SMz3mChW7-4W`S|u2>1H+ydF^3b>9hw%8B8rk0MoMJ_i-f{ z18l?MVY{g94b9!Rx){;2fn~uFJg+Vb&xPZUDkRtJ1M>y@V17?}26Yn`CehlhUNM*y zGKxsFh3$c3YY4W%F>0;spjykE$}>}??Tp`qU7}ogG)JZ7iCdVBYR=pBr_5@s0xG-v zNEh>TtiG1x;CcP)5%EJ;<0AIkfqLgz8L{n4%z@I{$MHxz%Du{%5Q$Pb3NKmUvIkoR zu@!5KE?h@!f`kPs>vhk{pS%72Zre~tnM_9Q4qEUWly6ue6T zV_7ZO`-Xi};`fmP*FeM33((p88D0K!3W&B#0%~P|QV1KagYOA~KQOr1(uD`)rqz`A)w-7JwQsh-&~z2zG(-FofjYw41$ zN$dTZ+FRRhZp#RzH%?BT1cQ6!*2K+;`BSO-_JxwPuwz-+n-caep1RB5?MMrUGR1Y7 z;_8Q1Q?26{HqPmqY)roNmOl%7fhvgRwOf{JI#M+q3ylkrbj^X~n&(qB&l9SYt~oX7 zUa4tFx`AtW%5Nkl6SIfspZH+koqZp=_xwr6VrcP+l>5M6^kr;CQx%i$*-|WU&+(Rz zIfKV?{fJ(n#T|cE{E3CIHdguOT2hr;<}WYucgFwp#k&>vHhr&t>GVtKr(aIBy_~9e z`TEHfXU*(0%eC#P+V*s9N6Oi`Wa^Y9dorXE=P0<1ppT>+^`Lc!DE8`a;blOmmsx5v z#B>v5uAyDYt$W0RVzpB!{DeL4iUn-5M*D%#>)KN>eT{QMm~i2GelL{XV5hnn^v$KQ zzePD4Ej?d=y}eKOYD23WbhKiC(>wueiY@t{b31MO6x z9d_Qm?Hx}(v3u99u8we5*Y;5Vll@)0Iy=KVcWm$M-4Wc;@no=bS68sJZ})b&2yNJk z#M(7z8ScQ?9TX7zJkqR?ct{@wJA&E1XBjDA=(w*12*qe%00P8l;EGJLRf3~Ztbx3W 
zRRX2a?X@(2fNEf7rYe(vbue#luUaEX1;6Af!uq7_93~VZxTZpNfm0=u81}?s=c*yM1&UHFx~YPtWWCAs%zk5Yhufy`~+JrQ9YMfJ*=WfFdI@$ zE!QD~Ess3bHQa&R%kiRKNb{`&m-@EtL+p}renmZJV9SI5P0A8lHUPWO>L4#}Z26*f zO|pasqp`6e$vzg1;h}`b!_bXvNg%i)8F8-^0DCz-fWIrU#Pe^diPx!O zqZk+!Y4ei!M+joH5kY;n+OI4O-Jwhr{5w#GSPBscLoJ-A;*BHM4aPx`02r(&}Q=1S9!jn@y{7m9BjpFF-) z?T3Y}AuV)WKcthCK`vV20DRX2_j%;(#N>(D2G~4w-F$W4IFEf;?Q*XCbbh{zrLQgeJA%(sf>t8WNdPm)r(q#ehV@W`3^Fy*@@`wv z<^C_a>lWl@8$$CGZor_K_KhLXqiLZ5uuPs34!$!FQp+s(F=CL3Sf^OPI(3**ow_Qz zFH_M?6qHjyYsceEr-PJ-f)(oVTZk~zX@l7YRbk8aOyibJlV9GZ`LU}EJ1=3s+%;#M z-8~)1nsE-+9R3mdX1k^arrlX9rP#QVN+Rh~SEuWsUBEPLo?^5 zSgR?gmqS$5%ONTc1M1xxvaHdhW?Wf|OF=DVxB#ErzstX`IST-Bg!4UmY~phWfEqqY zae@%VKSuC?5EHl?DOMb}%3Ix;!!VbcBs&h%a%daOq*^tHj{6hHu#Q0Q*dTV$$AUxR zF>0`m0xtr|tz8L##SHX{KSin$?>R;@41OD(<=ASHsi?v}S*f`Lejo%(mNT0z@;YTg zCiN&ngc0d4A_7O{`~-|ZCg6-1@C;t7e1P-<(GQt&fJt1dLl1t_6*vgeqM?Wbgq*zH zSpjrGWAMKHJ;ex^jHU@n=@E}2mw14JgB1Ke1?-=P7oI zf-@A*G#8Ik@FE3gDL6;Ls}!6@07AB(;&dnScxTisNtbNmf2Y)+Q1B52e~m!$U}+c{ ziRZm!#~`_nl5x-?dKKVDRX1`j`I_>QW2B0a$)4iVR{R^Pm;!;`+-DF0UnFo(wPBAK$P zjQ5F5Sw*I#@nN;A+;aUy*2@XiOD1_=(IZmc2j$px(L`-6*Wa1KUr4i-D0 zJZuHf&qq$I>1GR#kcn*&uz4IocLSP$}~@p$|6y7d%L2 z=&6m|Tqkng4%YQ5Vt7v}M_xtXkJDHK4H%$Vt_ENw;8C+$+b0Xl3Ef@N^2Vx=kpM`w z9rbs@`b*cX^-(K}O-_1XHgt4W#WTu(gLe^itA~0E_=USY%s0TqGy`zx`Bnf?Wc#?a zJUYO)3WPGlRa!j?jyTOLcId&m+I?7Z4*2wp=>Tp3r0zG!elkHbl((S=Zs%OyF^wa! 
zrUeJ?3-kvsKpUyL)HW?U^1Iz94O0_b*o#kLu?H90=yzfw{9UvXYI?rFJ9iswO z;!O}G#z7?%KQGQ6Cg zr_?Ode65RwO2)om96Un}<CCAnO~5NuJBh zKYyMB7+N}2f{0&PR{R1I$D7xi|803*U^Wxc?8Sry(X?^yiW`y1$ZTi2XhYJNsi>P7 zo*teHrYlbko>_Pv@TFnlm-Lj`K#WueUEZ7#C(w==kF0Puc zlzFtdhIDcBkBe)kYu@p~h6?cWU?Z_kyY2Cznp0N!Pzb*Grq0 zgr=M`n;uCR8N3QrAM?Q%!x>xZ7>F->@7AZuSMiD%O~#uoJY*{aQuMunzX?OM7~rNd z2lDgg@#UB?daxmsXA14s)8K=KYo8ck8)tRaCq1|l{0Zz5v{`A&c`wj&ZVGz?17^Mv z?m#g%=^$TIxFa3WPYgL8T?jhlqYvR2fC`Q(xj%A!AV(^&8m!RC6PoX zCO<doXT4H;&Npt6fO&U7Dw+krYjF6h5N7*g=@4q zVLq}{|J2>$56cgr8SEc#UT*A6HFh#5KTRp2Y3|^1^UhTB&a|*=(eYjPcii{*^zLKO zcr-bn@PI2d5|BJKOSWp6^QZ9w#z-8hH^OIWcN-SAF& zADWR-?j*IahRam9_pMf7e&*v~;2}$8Y5E)BG4R;|Gq#uVj7OvmrgF>*AJh0KK|eNx z_VXe8o9=T~@*(r4_HmJO^6!P3yTLc#?64;ca`JTue31YO{smkDFrPqD#O=V>PYNxo(W$gqx`gC4V| zk!IaGnaNpuyg{uOzN0PZqxn~93FR%nt?j@00)Gu(0q7AQpowu?n<8sMvUSoM$7jL- zL{UHkK(di`0w1o4jc+(J5Y8E?DW!bF(a1I50G#fQ*UHFetku^G7s=O#$6(F?V?-*$i7_Jy+Lj_y=P_j1RPRL7C@wxdg}$L_UcjePMLe${9!Ey{96R}t(?g#)~d z6X98D2gyl{SR6qlA2`Eg(c#!snNXb|dWKxuM}WxJ?A18=;8)!YGr<8e4M`m_>V_g7 zTtjrce*IU(99jMvy&o!ho%_gLI(2a715q)H@^)e#s!{WIwbO9RCec^c)X_P~w_^4Jnz#aoi8RK)*TwziTJy_vf%$Q#B z6&jPOt)-IvRsQg3**>dYbsgq2@Ma?M+IqHKioCinYq_8`t3_8`d9L z3N!gddI^L#f|)FWBf?wmo9=l5b}m&1lCDgZce%=+s`AfoOIPhqx*(T=r>xnMs@amM z*__$fx!An4abMQV)oxwojJ1xW8#z4nOSTQ{#lgCr+2F_Y6&H8W>ZhN06NVhLq)ti# zz~IhvC&y{#;BfIqtO-^@D}LBP+e&gyaWx^x3g>QQNv8=pJb(*b;YqXg!1oQFfRj)o zSRF%;wM6xENvwd_jKYsXvXVeGD4zk0jc;0iIOOM>r7;mg_W-nBZH@Ohl%!KrPTvidnws%&HO(0y;y+@bkr zKHLk}w5O9NekfGWZk((8P-xEx)yqO%N~oI+{gu!JyQ=1`Kn^^0OE$&gpUDDcg|Yxm zI+5O=V(SLor0jUh2H+YdEb(hHAj61|COFXd?vS&A$`L7$9q4Sv0}|UQ*h*=w2pGwF zRu;YReNr@q;4^gdL|dTy!D#M!B5U=*caDSW+y(u^&s;cJ@%XZ4N`xy?|NM?rMZ213 zW6ON=?PFOdT@<+DvMgm+)H@VA$a9vWj}5$e+o}<K3J7CQz}~a(%IaZRE1y7Smm8P003x5s&aFdN}v{po8*~Q7o9cR&2@7} z=3iRqS~Qc`v($BvzDCr1ILqP6y{>z)zuf%`in7Ve9)`a~e27N@ad|jG1Q^>!V{O6S zKK5KCJKIVMzo@aC&Ox%VkI1z}ql`jf)xCeg1@1FNv&+Gtrce_ z+g|hx-|gEc?!_R)e=hcy2*J(p{Av})3qRtTf5g@OghQnK$6WmqSN|i<|JPjCkGR7> 
z;!ggQ>w0K$@f}H#J{b6rL-b*@z!%Hk*n7w!`q1~Rf%i`JlJDGy9K|0VHtpy6;^cl( z!#?CF{_vRL6n@lP6weMAs`xMw(eGoncpQt{Zkrapb2sDMe7b;ldgG@7Lqml+095S3l$s s{n=sO!dre;Qps08+|kL`EY){C*fsPq4?(<#o3(tzYlJ_oB#j- literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/deepseek_eagle.cpython-312.pyc b/model_executor/models/__pycache__/deepseek_eagle.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..479c62f59e0e4b474cf91c4bcd21b569db41cfd1 GIT binary patch literal 10569 zcmcgyTWlLwdOkyP$RRl-btNg0lBj!V*^*<)aeR#(%W<;4Bu=u~E)#=Gb4D^{-ZC?? zV=mQo7e*^%v$c%fLQ<2$+7`tsaDerr@{|HaixhpRW~G$OzyY#IiX!L>6Ddu0U)uja zGegQM<;|kN9!O`-{ru;g|NNKl@LxS17lDwr{Ur6PenLLQj2Y}kV#Ci6GEW2|Flmxu zVhn?6OWG2%(62RPi`i(IHO*%1F*{A$(vA!l<7k>qJ2S4Bi>B>qcZQGgH0?-xGTxY% zrn$5)(-3Q*X=l1I(-dpUG{>50-j#02w8mO9{uqz~bGp-kOfVMAw8h$Jo=>-DLa`8} zJ%TqK&UC~&GM%x`OjoRnAr>Nr#K=8Z1BO_)Xt$8lMDX1tLW5|V8rRFW#`mmx?w(N? z>!m#!p+}S0X4b~~u3MtbpJSUSqjAqEqLjFl7SBVbRi4i0bCMEQa#HegJekYnQ)v+@ zT(6|lnUlHfL~0Up{G_PF^ZEFsl$*}u7|%@0qHr#E3et`j&%Ja$CuJb%JDZzKDe?-E{TGW%1%PdRzaLdOsAFjH8C}L zS&64}2|<*!woKysB{43EnM6K*_#YwujYtL>ujC6B?kWcb2@t#C$K2Xrtz?jmG{Rl3bKMdl$2twpwIEkNa4^CAv3kZ_;u*nq2+FKq}umguC}}JfLZ=WoD(lM4S&QrtNeW(xP_q{bB#ou=QUnL zjRo5BikOw(v1t73(}}FIRSp##R7T0eQ45tznq3!CpW}^s22_ykQYs$HTUI|Ac$GLq#4`c?bIyUtmXnyJDnEKresB9r9^g8)EcwXnYhtIUxtKYNwe!2 z&3-nO6%&%iQc5(-L{dT_sPR%pj%UFLh{K{ZR>Y*nUZx)##!-i6lg0Fegz};$uXUoC zl;(`blj(#k$Kx_eya)|9e)qzNlF5(cQ~5n0{lNy=^TzangHK*LAdaL{mqzl}mCL#8 z2x@{6eQU%wVC%q?(X;Z1V3-HG<{r*pm%wx+h2S=IGi+`?-h25kL3ey}gKT<<(|gOI za=~@3z3lHU`NvfMSjm4{^`9>JFR1pt@*&FlkITL+Sa)?m^IKgpShEP zM0?kQa`%UeKP+_}QM-<;w;kQ^I6U4jc+!4s?!=~>G!B+qx*miEm+kilSGrfS>Y=l% z2hXkUJ--%uz8vbhdwB8ie>-fAO%)gP_yYTGSe*fw=S}yfpES45NAEY*3cTVH}U zt_Q8*-`Aogv@m}6%;K3+cvKCKuC%-#dM{KuctJgQVZG(WS=&#Yzx}cw$d*z2{$ta~ zvq4UCrn32I1xx@rY3L}Z)Ac*m@py#uC}~X#fL7~Kkul}|OXM8}>|c5r-myprpk6u* zk&I(TBGMuFXg>Y2S0^lVZReY94bI{Sc*$;%vah-1i>SWHQrntuXq6kRLuyp9VjE4xfxLJtd*Oo zqSdUQ1cLw!0b{#oSZH501zMk5$K0a4UhJAdf}mOc%U#_Q)!k6701G$(7EB+4maI+4~`Dibv(F5~9u5({FT 
zbsQ5p=~_Y(3W4n`V^lm3edND}=mx3qBp5CQ9#aF4Ew`-&MrZlY8vXNaciPIH#@n%5 zu?2q3)4!zrvha(-{XJ`g$5;6i{~&EzElsXjd&N%LI_^3a9ZUANe6yZ%b6}P$x3?777!D^}v~hFQary9XB!E?vC;>{{q3$hle1 z-0NjucpKLpO9JCc%v=?o(q_t?T%b?C;)moZy7de5u@))oaTXY-! z9xuA#V%VpuHm6A=_(x8Yc4%+T^IXwZw4*UH!x#B_cIAu*E|RCn!)0=r7l|PZ$^JcD zA}g%uDd_7hdS8P*r>pX9S(ooytO6+c))d%_uA=W<*FD1nqI4u|q-M99Co$!(t=miq zZhfx=-Z+sD|A2k^ubSE^#9iUUg z1fnYO*8$~7=pDJ&jg;B%CG6K!Z4kefQZC0cuP5jb(gmz_3b|`p+Fg1P$F*tA$t!s! zUZsJ8Cq`kltzOtjTN-5PMQ$_+#W%o4RpdgmlmkQTdK`q}a4LH>J|T&(7x+v%KWwxZ zmXN{%J4}sSi^j>9K?@ryJire*@RS5fpP$d1igz0c$chMWl_-_l9*2?lA^|mSwY6l zlPI6c;h{Oe;!G%tB%vFxSrdW)bWu~`JQnx?l1N@LXQ>jVwHhmZ zm|o3oN7FV4Q-nJ=jiq6=!#2l5U15Xe@0?*P=Ph4SEfl z$%rdeb(4|MRTPgT7+{uM2yfR@TWdue=}Tbu8{|uiy~nl6b#Ath?r14;QjMG}M~3jP zbFdr?-*qp#OTjTUI0hzdH#F=(gEq7v_<-OXa&} z$LIExeSOO|)wdfo)u;QqHFPY9s2A58j(*h8T6u~*ahBP5njw+> zbEnHEzoWJspZm^|d-?k-9N7QMeQIF19E_mJePr>-vip8)Ep&Wt{FCq?tSP)_?o_!i zTIxHX_8piz_fboCIoMGS_QPhCyQAfvp|9NbR{s~Uy7o#JOt9HUf_*>x(^6os8rXaP z#IFPUS4KW~Sv_=*?>Q*nw3G1u`!BB?UJIR`8!rb! zb44hv&Htcp2qs_u-x?l4eb173yk9&tRGT zW$>1Bk_5_#Xsj}wPm9tyNK0r#NS82@vC3n3fk`o)j>o|~KM18VirM3oXC>4dH0r^M ze}m|on`l1;+RB|>U%Cg_ml)V1f287|NsctORGc*FB5mP{-hd}A?^hmL`U1nSyO&#k z6@DjNA&^|@e}DITyEiadvA3|hKWPrmAGvd+!a^E3sEy_vnBz#WW4?H&SaD*`MO?nw z>D$Fy#f9(QeQoizRbNzf?%Hr;-R_GF+p+ZQ@^|l#uCVX(@A0dnj@k zejxwLlV8&`-EO+VBrL8POj~9n51a8Yy8JdHSOHtMi8jE6*_-wl3mCJ#9-0UUY~6C` z#{nM8If>A?V_#SOzJhy)a(;($X8{B|P~xm#U%%8qoLz$FCL43@IOh)Q6X=|<${Jqn z1iZTbUQlcsEv^X*MX5d4ozW(({S+eqRS~)MQ9(jX3|N(7&{cG!ia(oQJNtu(U-_OIt$PSrt{i`Fd|GI$`S zT?eSJ;fFkg0r#78)Sp3s+hwi+(CNWPx2{V$3n(sf1%$0AZvFYD00rM3pjXv?hF<8$ zZVRTJ&5-LP%AJ=Uhx({>8}_C--~r__yva$pYnmG##o%d-KCeyc<5ivlZND3NdrB=B+wHOO>8kJr&S9r)C|_;`p`qyZxO&{)AIK@+~DQXOfO@Lfa# z_l!umZV6dV`VmGG7~wtA8YrDI8o2r+qcn`gUYsD46U|AQz}%yGY8g=zFlyJ0x9TO; ziEYrKMdN7+hLLW-fE04l?Im6HAY&AIca{TH4Dp|__Xb2a$OF!G+j+}b;@VZNePL{! zi+n2_F*HyLjjEy1Qs`+l^mLgIl=wlF9|YAM28${P+TIW5L+|qG)$x}B2k;{z=h_5? 
zG*CtDc7_|BYt8$>HVa}W9*wik1>x?M#VgB4S9;g{$G|p=j65{V@(|3?4b9iN4$yO5 zz1X_cd06c{jAfxR$CtP+mFxPD>(&iM~#fau16fyHqh=jH~@cA-BqFOydeRvJk->)5CQJxC4Aee zyYE30AmM=p+s_V_ng`V8fz_tL)xqai8(sj0Z}wJ*#p4CrtDyytW+A%t{rjFZ-?3Hh z7(JMWSj-0F((Om4*bz4+_0%pfnIV*CEJeT*O-B237>SA?ke}4`L)zgzg#OS$#>rq4 zbcf1|zn%bJ6n|a-zk(=4AF&$`H~j*Jkr88nP2otBztlLOHVy#fSNuis{>8P%le6}+ zuXWa=UoUh%A6m8@3h1(b015o{0sUYfIdz&-6N2fCK}?)+%Z_JEHpwJa&?{BD3;7dlpfYxx0s zJfJc?@|6@KKltMjQ3l4r2Uy+1v4V3BtD&5P=P3OgB4j80tqWby4@`cd9SVcraw*@y zI5J8_pnKE8ItM|tH}!%bhgYdh)=IHrjyQcKE~Ijyw@)rPmk+FkM?W;=o6hcD{pwF6 zVOKFiZji2F)Q{0K7+uGRie5ZZ-H*G2DP%!P^gBdn)I(S>V@Cf33AqH(O;SE_@=Maj z_AC`D_*ogGQ95#x0`*c>XAWui${jJnn#8?m*JlQWDwX@ z!LHyZ)^*Z6Sl7<7{!ap7>d~=~hOu3~+u>W`1)J&&Z#b~X%d`9I`f-~CYV#S6 zd*BHVd~C`u!GNvH~dACXBv#u945 zQchpM>~V~c`!((ePzqSdG3l3(MU^h&#j02thWUykO#i2Z|CqG?2RZsN>G_yE`7t?2 zMr&Wx~a?|+hKRr2g_oGo*IvgYi(fsa3yz@{Z`VLIk6&8P09HVCFStWaE` U#hc^oQ_TL=?kBz?@TGM9FBoGt;{X5v literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/deepseek_mtp.cpython-312.pyc b/model_executor/models/__pycache__/deepseek_mtp.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..679a4e5f53eb4e6d0b00dcc3f7a00c5595f0f852 GIT binary patch literal 15043 zcmb_jdu&_RdB2x0@hMUwMNtwZQBRxJ)0SP^vE3%NEWaH$cG8qdK+wEb7A2Bm?!9s( zhT6O$z?98JBy^kPwMFDLh2_OVv|{K~pg@cP12XI%-bg8#TN`lK^pE^UN6OOmpY8k3 zeUM@(C*5`+oqHbNd7ty0-}jx5|L$;DDR6PaU&bEor>NgzM0I8*u=z8Zq82HJV(2)P zphGl`VO?Ao(vhz|VF(#Wnm%q!m_jBJHpHtE=8&0$jd4rD8nTkGDQ-*HLv|9biaQe3 zp=uH~$DIjR$VI}Icuk@hE)j9-ycOZ|8CaxBzCDC>aI`xTVEgt8teeP2P@$C?Rk z1JE{Bre(X>ws(zkAKq0u7V4kT1)Dy_Sdf;?X9SjuT#d6AA>z8ku>u!~C0XXgWO6ii z4btlPsZ?r`6T-qI7aa>nCljeyoP`YQ3-NeDBeTVlF(DkEympP{u-cI+o@Fjfo`i7K za~CdOoa7P^be^BQ78CeO++>vH`AMX#{q|HODa6tdAvT#*s?$l;mm*vw9%tj{FPvs0 zjO2Y`G8(z6CZ0@OWf>-xycX0+bqqTinTiYH*Vx##G3-KwVL7QG5t+HlhB-D7Nrfk_ zhxuqE&W4kb1a{pOo#Hq)DTGt;h%hR5&3z^bGm&7KSVUm2uu140|0ynX&>(q-SvJM9 z?8FQEhS}&O)PFuQ1C2Bm(=WuPg(;4GcJ$Q4D-X&2!W!&6wVBgiK5oTQ%+~txyg##>(lV2FCn`A!M1OrsuDGKViW+>13!Gp;&!l8ow 
zV-APOu?cfnRI-i57=}%T`B<8j>|BBmCt+`fNhKKK>_}z3jz@tIaXzfG9qTko2A++N za!r_mt&sL4OE?^jM|eIQ=5YoB;BNfk;((Ay4WweJJ|V)vzU-TxI(Ycd#6fl-9=ke_ zni0k(lLOb`bPpt8ugAmeG#j0Q<1a_~0Y*6u;e?RtPt9!;DNFU6RifFqVQzfjX<8b|`TIqG|LRC?|0!|*sSVHRIm1oM15bP2UYoNwi}q#` zXc6r#`G&3s?$%|~ifP?Fuw^mUS?3Jjw`@Yil;lt+O@MRDN0k-FEV94I9>WeUQW%`U?BOsRnz)?9;ii)G zyufibEQteLgP7(});G$=Q?F_Z;*HSz(!* zH44a5h#N-+QRZO9`pc3aI*#YQ%w>!Mo_pfP_o<=c7dRhY5@qh5Nxi#fO-X?3vnJty zN&^$-X0%-70ucTbx#|dj z9K+REgqN(bB*RVv$V^QDs53IcOp29E40}BmWrO;%YYn(F_C|6eIb5M|3|T&Wb!v1J zz-J5B0eLdZtVP5%1t2eWomJ6DEyG1d1!TN%wJ8Y2GB&E7TylYuyBhCg4%eK+MU{*! z-ocV_jC^#-q+~h|NIAkuRqR+e#Z8WLh^Qo+-03imfO`g0s>ULGL=ZU16yw9!hxT#b z!b)+jq0&T3n2}t3bd1G|fKz+qO+=={$*Dy6YDB<1m>FO>ykv|e1*id!gk<9h1Pdd; z;BXU47PyhaSL2h>iJ)1&z>Z*31P^EflWb(#v$a#>G@Vb9scBq&AqZ)s%W$IKOoi* z%-KJw^Dn)SYaS4r2UcIm9XKr>IK5GK28gT=cFw79Se#jyS@!1IhQzj^)ri=3aLt=L zd`>)kF8kbfHtJu5^7c=>-8t_*(Yp^@{KVV5G`bvL_YRTvZO*rj{Kb)cV<6YqBR2NT zoqN!}>rT(@p1i9m=js(*y*bwj(RBi%54=r_6AKgh`liL#7hVS((orqiTMN}x$F8is z^^>Nqyt`q!do{DA*ZBdVY?yspCaQ5TZ?6X=lXurI9$7e&4-7%UMg%a`zgZk4;YT;cfUvau8}h%%TXyOHLwE@B*8{W57;FOVP1*+w{Vk z0RA!D(4D2eM+*oKh#SWGj~${kQBEfES0 z9T({`dW{0^s&S3?7b!~mTH*G}7&7{6I)(-mu8c&h&jj^pH(Rkj;8!MwD&7PyL%CVg zY*nTzV@d;dBF_vQ` z%T$tob&6%vEH{G0r_ehKo}`aP1PEc&H*U0W7nPj+x>ZO>&onXp`fQa_HI(T4y$Nxopa1j1K@0E zTRtZG2BCVVKWp}r({NXJQJ!=gw-@IDFEefkJ>1M3sG(%ePG+6^21F%2a6~dO9O7^8 z5O}2k_$f?84>=+KC*UCf*0mX}%V!G|eAe{jbKmee#tN5cs()X;HSpNsH8wwT`xcKb z94#0j3_bQVkXRMQ%v4+V9s6y2!GbX>Ol^NZSXh5ofm0826K|#yaA>$v5lNBM}Oi*^sj-{=YF)?G? 
zA>F=1x`m)@2W#00w*$(q29#|D%wNkmcc|q_J~{<8{#YGAGol z2|FOP%D9cHBzYlcR5|Q0^_C&3i&0VCD}a(fe$a4wk>3l5HHbrm_(M<*XAD22eynln zALwqj8lZ+*?S_EaC?fJ!t+c)(Cf8R&vugjKCQV)uQ0OHNW;mb(OFrst0cAXD@(lEg zCVhJiv;Y^==6<5EL+r$;d^LpkyVi5ZQ=iQpi!<=HPCYL91jZ z5JAC>9Ab3I1^QV~zam&w^slB$C=pmHGc996fJpIHwapEG=9ZvL;&nxPUNXYF5H?I*rz=EyaVBIf^1GbQVeXf6dc-Bol`SoJpD zlv7@D4Ns6`_o47moiT=6StJQDH|F_k$P*QqBRnH7;* z^?)5s>eI+&MK@<8^UKA7h?_)@9I|l;1*_zPQVF$Phy3D`f{ewPX&(DUj*g-_l)+oa zla@rmZGQzNc|->Ux7i!Awm{w=$ocz3f8R=M-G6BALcVGD%Klu>H%0hs`exSb%RA4{ z32(jr=IcKiS`u=t2gTNd*{9Bmt!LMp&WXaxDL8XwH2 z1Qwshfc2*LJZb&6)82t5_-o+7HUQ2CO4)4-?(r?2TR8WQet9_8c~tB?n*G+ZV&}8# zflH$2+jEwDO-7o`2@VXO!1?f?iG+ z$aih%bMh?=(#GDEy7&Bd{it*9UF&+k_ZPjJ7zBw1aC&pgV@D0Bu~nQ7;S!vVu_8{# zSQ$>oBs=A6DqtlVPTzE5te0wOeOzTVdLO|Z1kk=n$@eDw|#a;}rL zF+26Gn&5Oh^vkznJ5MMtw?ArPY5?s!m|C`)b+U%ay&2UZBf)>@jf*v42;?3&%uNkE z(%YCicy%y?9A#p>kXB1%2RFzL4U$benO8uw59PQ~^ePJr$lS>Vw+ewWVFCEz_UFs5 zugC`=YJduf6Et;5hAw%u1a*hX#qMd(kWdbLL>zEqO=Yd7l|H6|3RZtnG7UMl6!!X$I$>gEY&A^ zO6FNZi$6!=Q{a`MMQ>Zq+b??ibKXOu_YeZJ2X%vaU&p=kYu@|y>%C_{*XBjQh6ml^ z$eCL~zwnW{jhshKyn}iv6t>8UjR9+0q^<#+K>dj9D&Jt0;0Bb!T8P`BR3#8nBJ7ct z5g`&sV%pk)5}t-yHDOPs0=2&;)Rd&v6~k)iT_rF~z48xmqv}e;K%IaqNSwNZc%l#l zm0zQpQ~Z{d^{YAC%a+zZu05rwwD2SpP{Ud|`sPJwu>vg_yCw#zO9^jHJLD2_89xt{TIMxF##&xg?lXGF^|vr&xSZ;bTb3BPKD>iS-yF58p zhv@3ax%P;zJu3&^JAC)>hU);NFFF^TIajyn>Rz6@Gjn?;x9gC&>(EE8!v!bh9Ry&4 zjHp92cPyX#$UG=RgXdwm+|F{~A;=~c3h*s=;~K*$<`lP?3S)JIHOoLu0K!tqkff%I z%CRe#`pcw05lf)PNH$$aznRWCgQ7FI5?pug&zkp>K`0U+f?&x5T35|nDlEwMQnD5s z@o`jRX%N9Mj$sqAAsHp0p>n54WyYdZhMF^VbTk>=m!&WQ$bSHJ6`S5c@N$>#Nk!1w zfP#D%c!02LsfK1$c>`XapIPo(ZNBeNb=d%qn^gt%HYv#fMajTW5A(+@Uq-vT)gdYi zh|n3{8Z_$@4yTjfcE*q-O>EDIK?1DicERC4umq~$h~VJzyhWCCx9Hrx(!AzacaCJu zBV<9PD%ogmyV(pr)R0l(#>1PEq5qDC+dx~iO41&qLt_r$i8v=^u9Eb3&!|2R&*}y3 zX$Z<|sHa$odZ(%U7nF_;HCN38iH;to=IK{nq$u@Khk?hO>l8=dw3iqrRO}&aIzR>J zX3ZJ%4fWl-Pylmi#M&&9Qm-B0k0!I9`dtl1gGOl}8RS`q_ z!9qX2{&@dTqJV~$wYh{jyP=z>Z(3o^s7m-|TthU)Ihp{Ol`}CmvWi$bSR9K`R7*P; 
zdkL<=vjVO-uWBCvg%~R$4wJfWH0;=$FTr?08xeF|3>t!Z$$kwC*}_vPVp1SkNQ_t) z;0^`Oumm5eO3CLC7D6G{DB9bGUj<9oh+NS8suGe#c|G?-ELW=$gkOsZW8uWB5mE(r z3v(^Z`Qnc}sW1G2eu$DvA zEj1=06RgstES&IRFaS=0sjzG@;!1`AT?sH%D&R4o4VUCXyE~>to&n=KFonkBBl*CJ z_%$vDi&A30i}X?z$ASfMl$FeYp?I87?x#?KL$P1Jj!|hQRe_Bq+JA6}Z6$q#VS+U> zgZ~>$@B&DSPN=PG9hT~pVG|*%m~2zxq~8u=?yPhiY)whG)I@oTBx|wTl9d!zmPeUF zEW`Z-dt$>NnJ|`-jD(a7NUBmN@f{!|CXN93wmVtfWyk)gFgNWhy>m-*KE-NUU_;-a zKGPZ7t?;J2RZX=8bAc0L;6y&qgMTf%@D&=~sdK*lqHjN#O|$`FGZ5A{E$z=W4T?>J z>rMO6j>Gn#z3V4kbB+fcd%z-NZg{>g@9bPLh|b=;%lDw+=-QR{U;4#MA9SuaJUe$b z?`+6A+aLo>GIH)g(LK1@xOQdTeSB{C*9XptuD$ctrE@F3NA4DQ&2{g|d;Fjos_TG= zXV0o(?VLDx3epg<#&5+}j;vYNTTaZ|K($oYnr+|zp=W>I+qTjqdiMj>qo(fpVbn^fdliW@+Z!Uo~P%}F56bVyJiN{nUzklzCZ5^luifuKt-t_eR z@FV{&7{0%6{$##0D8KU0U-;P5miIO1eO)lG`L2?%3sKt7ZKJc$8eGLS&?FOmV66(WJjQ4*Wv zV%K8HNIXFFIQ?2x&oE4&U$X)WKp~_6%Q8L?O9nu+OY&F|%LRz=79hrAgbcZaoTyb6 zNiqTWodkA8HZ@ZU$$jLpe^;@WsX&~)&c=(q2r4frBQS?WBN$nqo`STID`|(^aan}y zm+1bqt=RAWv`Nkm(%e0uSB+-0tsvDGnOf;jD0s=O7{!$^H(sjw5;eSD$;NXs9k&Lp zR4}+fc$qG~cLLPj=`uJATZjKKkONDzn>v6~p=DU0Kikru1!>mbmNhrPIhb?&x@q6s zxxBM+iTTjkp6}e9b+zT)E!p-1qWb_u;h4gyw^9C~$9Brq_6W?zzyc*-+q!IB%B<|o zx1Y=#vtnJMPj;&3vyHC#zgGu_F zjlh1;|H)Q1+$_$m=PR`Q-vgqey^SnkA~r79_v2{&XEFz zSsx62!2jE!O%g5~*S8owt3w3}K5J3(xgY%kW3pBOXD>J)Iir*CTd63q9PE!=jdHjY zvSFH;{Ai#*ndDAGnq)8G0KEJwgC?|FClX`~4P_^j{S5rZiHkr%ov~1vWdg4 zCzK#RN>YAlMt)Zz4LyNc{+SJeAp9MgY;cLcIU~Qq;ZOjT`YIKd=Rp1$TED`Dri56W zC-9G8L#dh+RDXTO%Z9@alE|}QK!`d!JYr}}EdTN>e0?9$-EkgBCb~NroPymn=wU_y zgz_BfZzQAKmJiAHVgkNQ!ZJyw3*;Vso?LZV-KnJhg8Qy>hMGA&_n9fLu%v;ou1ZxVFvGW zGe!44r24)v7-`)m1>Wa&iU#7=-&j2JFRoi!ZkV=AHT3TJ+Qo*2hAj$$Tf3j4o0mD% zs%=pa+)C4jY439To#5@@76rk=H2ey)eoM#eXwQ86VsIh26v_rqY=XXv_H61Q?=i_E G1Nt8UwhrI` literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/deepseek_ocr.cpython-312.pyc b/model_executor/models/__pycache__/deepseek_ocr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b6ac06bfd00d59f5df9ac181a7d09b5063e8054 GIT binary patch literal 
26166 zcmeHwd30OZdEdjnk^l)3+(~dJ#6_Y+Eu>aT)IyC!jh315AY)@7K1c!ukev6xkwk-u z>@$g>*vUvIO+_6$8Ooh@I2kpjvouQMG;*58N}INLf*c^P@-cHfIqf9vIT(}a_@qnv z`|iU7Knyrz=cN6sSHyky-TU43-TU2dyI23Ntjx;6h4o(!{r(w_`vJYE&YTUbMYSAv zm*Y7f$7^_PSmV>+sS9f(I-f40_vs@BpFu-u`mix#@|hxLpP9XD!xo=~<*`O=J{yZO zgzb?sUsWJItW^tBqO{CUW8>#cv zMe2R^kp^D_i?@awBTc?07Pf_(BQ3s`h{xxNwE9{jZN4@ZZx6ReI(!|GPG4uF%hwg@ z_I0!PvT#qt>+?po__joPeZ7%BUmuGv5BEm~d;=`(2ycyS^KD~cXLvBO-M5{EE5bV> zL%tyvt_<&t?DFl3?Dp-B?D6f14Eu&7Bfb%q<_hnP?DOqo;i~Zd$N}Gh$U)yh_Ff%6 z6dCo6Mh^Q90Mb3AU#Wf?YB{-Uce2&F>5ZfBul8ZgXV%reg9vldsszd%<+G&>7fwaz` zwq`efd~m5)r}Bkt==AhV zG!PB)vguTaK*NZ&9}Yw(uN(^oqH{t}b{;u3{``v}F*GxMWM=wO=(1NMTVD)^Be^%{ z;eZ$%n~OwloSBUROhm<1r{}`a(CHaI5I!2Y802|WEy|VW=VoVTgs7NLm91*r^Ur(r za>eLebmk?c8>eOhd{B_fvai$uyw-=NLs9?hXnSMR|Ev(a z6uORfx=z1v>caTxGh?Ht{O67yA3uNL+{>t>GGF~zAQ~78O-AMFeB7~6Fw85xlb!kK z=dJ|=^r37bh&uTT!D(?u5K)?IJQBDZ%=Ho@9uc#V&kHk?K@o6Dq=iAA56uVBvfTJk z6IeRRs-dCGC*|_Xm6VWu#ZY3J%*YjZkIY8T1!u$f_>tgrRJJQoFU<1zs2Biu?$eA< z&(1}~!*ii9#uO>#=L3=1Fp7FUAOs>J4c7QH#xfG*L&&RkPa{{VbMVDeFK*9%bAxO- zJbM19|NOY`sBAfM?#%Ok4DwOgI(~Zecs9C9&4;O*?}u%2W^Ov_kA}iQs74vnt&(RDQ zqebx_Xs{@dt&SR&pVSwGz_fU21|1h<$3+?+zZi)4U&kc7h!)yUhi6grRB$pnBmAvt zUx+^TfywN$We-dCWi5@sT`tIB!{B_{7qCV6bU{6@B@`1h@Vb5UGzLvm&TNi?2at<} z*YgJ6h<~Olng?bTMO;$zTRt_v3Hhz~S31A#Q}gS2yNXhCwU$w?8|7Yy+)(gE*-U;u zbUheWCf59)T0L-MAxuL;72*hcr?`I4#SsrfDIO7@xFCLd#4|Am>>zk%E_v9%iUp~& zg{eS!m;;5$7qKAxR3H^D23gruPV`F9GpnwT#jFHV0zdys6&RW0i-PS4UL;crX_BB4n?AHv$#WAO!KYraC+rgY9fDa_1%Kzo+# z%EpL<%L8GSGU2t#X1^aL1^s^6>L;W&7pAb??|*GB5YDEU{C<9B((e~cRFcE*4=7li zh8?j+iZx)iPF|6X%8C(s5%(^)2A})eiLKGd?AF=PY=0C8Eg0>;KDT>#&(+<*t>Mtc zt+O|xS7xTS5`Nvvkg7j;Jva%ZEGVzyRzABY`DZ4Df!Q0v5DI19F$Baf!~0q8L#@eR z&Qx-?%G(2ywKb`4Opeu9G_3cYL-`RJ6-!~f`UU)Q^96Xw6 zKPuH7d#XPsjH|8Nuw-+zo`B^1lC^MHJldd+*WsrR>VrC3y1Zr{BYnxp8{Por#~ZPP zm3UL>rOca4$D;(x8+xDRhQ@1?8z*LfxUt7h90&d_LRAy8b^0=**>sg{*Fw|$%r)i3 zb?pi^>M-^_1!?+2yf~>xD^*9+i>B!seU7_aOcUe$Y8n^Q;*Ibib1~g@jcCWH@J5-6JEV=8gIe`JXt3OqXNih4m*}!42Cb|(^#}f?V+VFoFs6}X|`wk 
z=Q}^C(*r8lXOT&4gLjk5*tv$zxc)1SbVFBM|I}QYZr}B!{ouFSe-E+WHGA0Sm8~;# zQS5Fs6tXeQb_3Zc1VO$`Pv*LT@@T!Vp?M!eaF>hb2SW2+HXP%ovNR=NbM7kQ1Z_-v zzF??mU`xSbwt&a5fq$sy)KV98QL39cUb{)X>YP*o@uZSeguI~NBwva4Qa-c;B`g@C zHnl+Y<1QGZWoleZA2X;mjB%eGK@L-XI-rjx{43c{D!rhG=9ro1cwIdg(_QVs_tSi( zJ7%16sP*KZfMcsi9UBezm3?|HW<)I|wngg6f-ztJV0JQH`f9 zi@uiEFd+;eEYKKz@ON;sE;M}!%TE^$PP17HOgT3T`dAkUTxYXeI0snS0IJ|JVgo$S zrnGFh6rKr0Wj(DtuSGadrM^Jkiv(UiIVXT@k7h|v*)9eHz`uSGC^jlvvv0v^UM{-` zY^_o}qO1wY2Da2iq9SO`Fqd06lYwy9|LF@y0lSPM#J_@flUuiPc1L{v%Lf-Pyyd^= ze`4!PyXxYn->++4oL?UM#+i4|JPb&E2NPQkCAJ(+v>Zv+jm68~uc==g{MzKwmaopN z=u_JcN%(6%l&l$z+n-g}F4q5&e<}LT&)okRse0GL3aNS|ZhgPD;j4yt*|Q4w?bwp< z&nmXXN1s*IEpAylyIiq+;T!&U{gQWIqJDp}>OlNx+EI1;{OybJ`NhvA9ldF1Q_9&Y zIa`-Hm$mo3-*xt7OkCw)#>&-oE$dS~ha~*f9*UR!g{}5k@3xid_x3IK$B+G^p~bSK zt4p$XuR5yZWeR>ji#~g;!YCfXVe*c^qfZ@WUsJe1aRKrODGJBQyF%Ux^4Jz}l0v6c z?-c|@%J2q<25lSq&|)?m(yUh1WDFF@7&&)c#>9eVuDNx|b?-CF`g{I029~L+HMm!+ zY8PF1#xn+l0bb$0YrkX9nCQ*SSsiaS-fCR8;7zHWDxSnI=TwZmMlH;5P%1?kG+fg1 zx;OOdlJ*&~U6~MPzy@Q+7G}V^g*UQ|OV67SYrSFgn&o=7-8>%@KqD7zI`fy0UMHR) zNL19!>p?*X5!c2Ds%rvczC4rRfGB!iRWWss_f=15+A|}7r|?8)JaeGyRT2;DY{nC~ z2-M6<;DfJ+z@aiq$~en-j>$sIMIC0iISouq+tyTWEb@Wt8UYfJ(rHtayU6i5yuP3< zoeL#4{zN5W*j0SFp!)|ZQ_mM*`Iz>m=0@Q@qm2?p$DCTTV2I&UU#-WKbwpM6gX4`c z?N>S8^eZ|Q4}MWofXj6y%h8o6#~jlYmlM-}!BbF{J_@|7IJHe;VM{DAeQ^mlxEolb z6M0hM10Dr1lB%ePtjL7%DN5C%Qn%MATZ7l5Lcq_213J*tp!~(q zG%!D8EJ6r^(W3=NcO!Dx22mqQq5bX=B+>Ev@O<^W2vE|($LY= z(6}@-p0>9x4Xx?4JFN+G15l>jov<~fT{Z7Jup7KR`P8vxdFtWlugyHGY)n_4d_3~F zFY)3_3Fl{!#Z{i+v}NV%M$TEk*!#Bl)X}^A`G2#DqS5^)@YHvx{w*1ye+TQYZ?Yd4oWJMvarfi-q33u-@%QR4s$pk25Qqb;87@uR$UCevBnsI~}|R(d27>g%bfum=yxf@A!ndv00{n zh8Bo#!viy9=CDftA1-5KZW;f@$my?5CfasBZQhkQ{}Pt?GZ1h!t?AnKv}eb7%L3cRcCX?fG6@; zJxd*urEA@USEU*%d=kH$Q#aK$9#Mt#*BhKq2l`5{&|eS-=t-g%H1h^VYcW|J(u~CD z^CsT>H4Se;sue$5&DQYOHSE1BZbDk3z>8!(k6umB&VfXT1xs(;Z3he+FuE8rndcqg*KHps+ z3Czk}vn?IiP{i+1=301!-CS>8zjZz3=#(6tPrLRc9m5IpFdKc@c$JCPH}sNlq7A<_ znml(oEw@QO3u#C|lFRq=k|qRd@omj7YH~dlZURDfW}BH?n?g6Dy;^0FvuQ>JXC)ZI 
z8wk&D`P5G5O8ZxU7hB*_zkA|QrRP18qvxr2zvMXa$PF)HKA}uL+Jz=Ibe`&FpO~mD zs3L*fLg)fEBiX=pWO5kS-y>mMLRt7z*)0c z75|xpxtTTh0_2@$5@a%Mk|NpkUkmZkE3#b?d0t`a8zGL|xn5CkawPr)+sdMdQXCL> z?lUeJc+G;5*Fp@TSJQ>=7!_WNC00>KN$>#y%oyzzeR~ZPab_jrG#OE zwK6HZB8PNBA~_933HU|702%g-Y^6z&4LhdiBFdytR6VkjB9w`djnQ0@HP>YAmF=?j z+IHD>IXrU_a`4wf!E2%-*@U_Jx1_6UKqNObCd{>eX{$;*>K7-UI9iv&E0a&UhSSY0 zOBIVR-XZd{*_*CvS!(>&7SN;3mA9?fvaOLE zWpQ}fDm83R>^dgZA4@uqC(Os=n)vAZmF~OVJKja#@`%*5Gci0aHI64MPbSPK`q9VcyHub#AGm?;K4dhJbtsKvGI^%<&qM?g{Svg5|X= zrYujD@sOWC(5rl5q0E%m^4h`_$mc$wx1 z`s~UlCv^~}UGxf6ClHWdQdHRD{N7KV1D}}rFHxu13-2cP%vQZv^~Bcnws$%Dq;1DD zSL4#2l@m{0`xEy4&m6Uj*Osq6b?i=bZB?97QcyDk&1VJ>CF=`Csnq@bUJ>w8lX$&pP zH7l-_$%kX#oLXs-yrY!Tf=PKob4v3=U5jB~y16yOo`XZm^UxT3u3GGC2Bc>WYjlP^ zE5qy1%&QwjGRMVGk z=!7ue-M7Z+-Q^jbk=R3HmOWHBj$&aw!-T@u(EtS#(rOZHxSvIZcR?e@OrJi9_Yv_46xkisnEw_JE^P$BFUa+DnW55&LsA^c;v7}qv zBe^;SU#`^s;Wfy-H!kjf_Amuqu*G*_pvGs=}bjR?t*_C9GiALOe*zh%Mv{>wZit zO#JvfTPcjXDbH5|yYdEbdOG6t@}xP8VKgyK2rL1`$8t+|!4NZ?-;AH6IalOpE|Oa? zqD4lqB49y4#ZX@zaZ~-2)-hlA3UI<8OLUk z%P0~ORIo=@j#=sdSiKuK#uBq+;Nw|fN5F|UQT9#l!*b5Im_y*%_t4? 
zG?^pPQ<=Q4pc|W$2GLxeDLu=#!2{ut-MexG1c(8s#krrwlu@Y~;_-5^kRvIy4#g6^ z170{^M~R6JB0@K%(pv?!^M(gG^*2;hxyhMV|L2wdQ{@F2vq9dZw*Bt`S3+^moAQPQ zuw~1(2c(Z>LO6svA!{MhOgKm$3o%9g%XocPBkN#D#V90@|2qld5P1jSLHEvf)nf?D zh9EH24R8!sU<4}bG3|Nb_X)C^QcV|`(HJi~E`3J)1 zFOujR{iB`V+57Fik6%w7c`?~cLf3Rv_pfP|2bc9yRsV`csv1n}8vnPgkIz2tkanF( zoI5Y=Ixo2{#LdqdTi<%_-gC?5#P*}9?I)$}Cm#!m?I)Al&m}IrkZgQ0VXk}Ta6PLV zNY(9<>UOPMOV%BH=GmF@?3XpI8D4kGcv%xO>DwAIpiQ z7PQmH0aIf4Du6-frbwn&kSk1Qp zm=`X=634>WcbZ26Ns0wRhXR@9DMH#T53dNF7*CVu)|IW3vC4VUyAxTlIXCGREF@bhTVkBAd%N>z7D)!kT>PwV@iZati+JA$R@E{`8u z$L3eR*!2`D8pp15yEoN7BDIe^+>>k{lboK_nkI!XU;a$8X4h@~YJ2xP!}o_%?W0or zC~ef%`_^=8=Q|De8=f_FfpqUUtO4!bQJHXhfRU=(6R!3|*DlGmD`DTIP`^KiHqikC z*oZyN>iT<8T44s4@x3AW9 zrs{^Ix}kXaGibL59<)Do3?$403CBQ2@38FttsNh6c&vADWzFe2Pr9}%QPZ7n+ygZk zl-3)Wp$V^}SV;7B>M5a6;XZkU&V+B1M|+y^AINJU&rKfTAAuAN!YX+u$Qvh*iHUxb zLgbmKYmX2Bhh(fWw5~c`83Uf+e=6%1U5l3%8>9-bY!qcySR&)%Udh#+u}~Cd09c|W z_u_L>RZqr7QFhMiUMr(eUpe?GS4+l#C(3K>S=KGTxNMU=Lm3lA!P=?fwhOxWL`AP; z>07fzR=%`>A@k{AZV$tJNGcVRd(+npAW|?&}yk< zV8c5hc5LLkHi6uvueo-1qn%AS1=bvFfr_|fI~DSuu>iK9CupJ&#>$JFt#R(U#@7b& zqgQqpptEB$!jZt77zm#_ePO;W3_I01k_<6p{Q<=f(vhny+c_}bSP~{2xDgb@f%*E9 zsna+RGcaFMl!h}`SAqe4K!yp^ysK1QWlE%4xJ`LgN;9k3=Z#7MfnLg+GD>L>dL z<=mF=3}YY=B4#p%iAka#RYzg^t?h%iIiIF3WkGX8o9Cy%7iE|D7d2j0cDEe`&3n$! 
zX*)&@&@nr*FVPDE_OGGfRlUY}0=*$&t zfrSk^Ry2`b?Ov?xcg5_L^7DmH$|C0A0o zO{S1J%3FaWA2hqB6wO{rx(Y#x4yO~0YHsLXIX`W1%z^YQ!L`~%?`8Y2V2-w{P%(2%A0=v+Iq1c$D!cH&rjp?(-=#uCy=n+zxe_C#t_|eL z_f|Jm0$Bqh*Kdqz3DgaUnoq86YMYv#e=b;fdvu5TJ~gC<^3Q_QUHMow6thHWH8VH0 zM-Anl56aXLQu|7p{RJz=qeL(Ha;De}ME9yCOcBy4as@3S@kEh}S=5?hR#N@s-Z;NU z1^vHNer`5R*G7-3b@7hUef=u;D>>N}jE0Y^@a&LFrTUhCZkTsYbyOcz6wJF06&f`X zU0UR(h(axLm^Hci3}fNS3E@`&5#lgeg|8s=0j;bLR>>n=5P~3qyZqsmAH;wANADki zal7qGi06aT3a?l}Hp1$0Rwc?s+Ot3u@>e>h%(gO7uAs@gtOGVsb_!tG=818mdT zKIS2zlRp}`ykW=jWRH}DZ!k;a5f4>fxYuL<`73H8NjeKE@^;AvY+Eqjzd)&Ww$-Wt z{}WNdM-*QXp##WluTvBMJH?k}dz9^nGdln?LKaH4kv37yLMTNxybd#si#VjHoc_IV z=DDL23ZqABT(;10TGk4I@T{9=f}_A|;x zI7WH&;%hitV^pN+vWcC~*#T>k>1o-*4mAhy{ny?7HsXZMIq9}C@)+5er6o$_cN0#;%&1|5k9alIp0#62Fx9snO3Ad!kRi&vsJ3u8n>q#yHkxrQsYpn z@tD+j406F$cO#D5+0w1s>9}3zEqQj+TT{BBHPz5BHT1KC zd8_$SpH*#JX`~ZBmUMIXTQm1&9)zXly#&?peqB$pZVPHCf7aBWu5C;=x1~L8>1I#5 zraoO$mu_fIH?*u*Seju*UcX+&HSJ2fI@5KX&s;4_#^v&)Yd5&)CM%Uyw{GX^yO#J= z*Iub>Z?bM5m71#sfnZs>r8CvCQ)<~sz>Q!sT{Wr7ZmF_6Rk>BF+!{BftE#_d zc*}av`ZhZ|x?=j)+#f9b?!wdd)5+QiscHhvw>jS2e{28Z*b|%Q{nqw(8t*qg2!7-0 zyH_7}CkGBcdRZEnNVcAdk3F?Fzu(sJPT&2$2Xo)}{JWojxI4M^$fJ<7^-Qwu`S{p( z?Je(<+54eKRo`j)cGDwA!ufoG-Mg7<874zocYVs;CAqs;UlTohlCEKNwuN=JAydwk zRix}KlD#EmKPcG`W^|mT?7QYZg(LnVrdp29Lzw9^u(xq*TQDIur;`f~p3ngqz!p1D zBKgB9MCFx&fsRAn13>Xpe0ToN{L;WmN3v!&P)c*hTT}O@65a!k z^vUL(K?1q_O0s<<)qYTFKbUMEeRTFo`!O6E zVG!*=52@NMQtg&h?M|t7XMFsbYam_So~~{M+DdoqUWJ`qT}G#Et6aC4t8ga7j6)<< zoTDkxbTC?rb;QnPQ8Ni77PnU7~j%n$XpB;Qp{tdRMHiuwEC$WQ=vkkbeP!2Olpf*8>;fD zYKWAfibC`fI|V$IV-r*eDhbOrUy|ihMpPfOT*g>z=zqx;@_Lk-9V19pOxd%+c@5;t zXoK)?;QeQ%F{=l$(c`M8ETzJz4&5Y90fX3JWTb2Y0TG!4fnWp_=q|dML^<=NCkjJw zQnWF6eHLpF2UO2f1-Q2fuPP>^0$!Q2+>^5RVIevb2my zEWJuEsKSRDKW_NJbwU23YBpHI778FXf&XK)Rs0YYK>N_>d28a{#PYsm;~o$Scuh6- zOO5@@;l#ev$;Js%D!OX#4&NDGb+&%Jb?NN=b4#6)vnTCrO*y@i)4SZ7boQm3Ly~i7 zPIx2(|DZI6r3 z9F@1X-xlxe{&Fm?Uv)Ly-G68QQq@v)`Fzqf7&oUY-KokBsj_3~!t&5>R4jc~s@##P 
z9FZzV9u7UKcyuILc_MDgwsP^xvMJHDZ6)y3wFfyJHE%B5T39@{ahQlQk;0C`kW%G@Ql?wC|O+Sj!_M$V*D04j`obA=QFw z14`|dF&sW+_6gYMa|lUS3Vr0IXQdQ|kXZmFsms3zAwiNel|v|)O<4yE0n9IQWJFU+ zwa5?>&=KSwd6|T%2q9St(trp^>hekBo2%2hAs@5D__6ZY(AH9D%LcoB3;yM9?%8~t z=wy&ei%e02iyTe~Sl9^wl*pbr3kr-BH&tH*j@OSN)|{*<5382SYthE+)`7DO*>8YR zBUDo4AceC7y=n{4D}AdH{RetA7obNjZCfu$L>6&fb)B zhveL`GP?5GL!j+_KQW%hbrF?|=kL}0(kr+m!p&}kxYwVm@005LlJx^g_tv=W8E%^x zSqj|SuW*schONn(ZE-tD%ChP=Pv1Jd*tcw3@ja@4d_HME`~9-2TQA){yQslcK!K%- zWLdis^UBgmsd`|=_HattcH(h+vTOp{YfA;xhQ#ib+zQh6+Qkvc-nsPJayzkM*0>4h zHmd4q>)N;}td`sq6xy4FeT)H^)FY%iqT^^`tsTJJzsp@FS`5q)L$I2xsh4(x0a%s> z#3_ogtO1yzU_oZ76}5PJVnOOkp($pQhzUY2PZ{Fw6m2Oc3wN`WpgGkoD2F1odGj&U zT7ss`->{(O!7Vbs3WRO~ebT^IgIZF6*^7=6w zgx%yZELwq(Y@~ZUK#P_u7;-1U<)*S+#kfNeGB1A=JglD?&r?|tefJm3xri= zi^4}jd#4~+RaU2BSO_cG*{$5W8TVF#Lk{vH@f{oRWEN4@`wK5!Bb^|VkP%P4!5no1 z*}7?#|AkhH2jDSd4o_Rc+_?G=XFiv9w<^~qBzlIGt7?)xhm-Cjz-lh2=K4}~gHqk# zN^7!i=+9g`<3|9$)DGEB{CK*yV`*~f1*x_-KAx`YTRsoC_{r7gwp8>G;Gd_*#eeS%c@T;d{eigO~g(d}`OYv}^ov$CKyJKJJ!w zol8`mPuJG18+478cg(l}2xfo`xPzo*!&2Aq!;U}HJnWXb4kxOPJg$J@Y-8nZGmI_D zT2uCJT&c5c{;qv6)5%rs%Jgu~j)b{gA%nB0JekyB0|!mw6o&790|6#x!n4@1yH`qJ zGIoDTA@EQPP~#(SlRlm~f|cr)Htxep*mpk}SMaSrE4s1>60jVl2*Nkw8FEyTo( z0n~?Qa8C&?ucZSXjG;S%a8Z8Qk(KzW^Z#SQnJU%DJ9~6Zk;wFJMif8{PdAAc0ykqw zMOy8O7^Xc{(=F9>Ljpr{vtM%e(~TY8RP%nRd4H;zZgTpgx$n$>dp?0Hotn=W3j;a7t-?tqDpvqe4hyFs>y?kXRh$WSIW!L!(94_xEg43emS z?9g5%X~#F}Gv>XUbpGR@{nT~BBVbD%u9Txq!jVRE8=I^fno9GNi9ydHu=y!+K6g$W zbCun{gB>GFAR}YW#Fc$fe-+{4lgs*thkJEg{* zD_4_^hlzi8coORUc@{SL*!6i9<%%8LB#F(SgysW^6lPSf;AU|$D^F!3WQ5NKn<#tC z3*8sMQ!+si&6pxbLNj(nhvvO33ZQG=*pTAX@;MsKlQf!H!g86a z^P_8w#hl${j{G)Vnh|ij06$-N?xk=@v+28OBuT!D)#T*to`kIzV-`QXN>a#PpzE}~ z`u2@P+pb3uh~DdPv`JfMjh6v=I-9V!u&F0pK-)IpN18(nKW2xdwApj1nU=$7>dggO zMKx7U_di48l)YU9GmqgQs#!!Au(L9@LwE@h^PQWlsEuXdnr%)bn?iIQ-KkR6Q_3+Y zIR;nyla9P7)s($pdP5)S^RYg@h(MtTq`>kRtE%LBqCJK;j#kSo$Aq_C7)9=a3+t+J2Qe(UF5XZM6{zY$CqdD<6yfs}`Z{@x{rxW#BA#8j;O;(! 
zNOPbdBSt2YCcl6XB-)ehT?|zTWhhiOo@Up~3eQmub^@UL8H6?py+9$FO6=Y$;T4J_ zRLVw4F3ajICxdiJA(m}5AlYRhhm=Y90|F(F$`St#-j85`YUi3-6Z*QpsO4>|xEA=7-!qgLP>>Lyw0${)Qfzn8s}w)j&6x(NSR4(yLk7HH~*;cVZbGg$H;2PUANlS$Ne_PM7#C>Lp9# zngNNKW11F2$Ev&GZum|(gQL?O=pNo)x${cKL~pn$u>Gx9?!A(+&>IX}Yu0QOI;<%( zw4@t$WpsFEi(NAyoN3k=+Lw;LbK?Gqch1~DLr1sUGe(1R zc`=f%y0tP2wdrZRvLj%i7Zj%4UZ`~3#NzBaL)+4cKzH{F6N&QBXmXG$Xv{*tI~yFCr{ECTGH7#mqt+2wa?0W*Z>}X5kFp<_6t1 z!mc)Ew;l_O6f?6+jt8cvm5FS}HEh$_tGq?UaGH2r#UgG%F=$}RMRpUK$i@uhw_SGN zRPJPnirIvE_pxGEDGsnL@e*!Z7nPMl9I-%zi0t`$dH!6)-p3RAXXJ*A~mF- z1*BNXHZLEGYU%QJ#f;x7!6Rb$6UpV^B{-*(kOXNNlSNM(mY+$nA zK^))kI_iSUO}eNX-Q@L9TU$4)Ja;Zq@Df}tgHzh2 zXbk-#yL?{QLS^+5Obdf~eQ-dz{$!x|o-cun$=GdHvW@zr2=oNS@MRm7VBgTyK?2tk zl4Z#I6un+0@2AOoo4ntICs(Tf-+{_Hz;NYN%Er(v(V@!nI!VCheVD-+&{{~lM3*T3Z2zQ^tONUPUqKjh$jWHo5C zz%Lr@-`F|LwvV*x>qqvFhX`c<_qeU!l~iz z7d3TA>ljJwy&!eGu+HIaea~|mjWw-rNHp%0^t;w|^qRq~KiZyzrYfcBmNeaIeOubn qmaZ6pWrbF|ZoQ_{v@M={>*aedCwfL6o?PQ7V$FuM4_O);(EkGmcmNmx literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/deepseek_v2.cpython-312.pyc b/model_executor/models/__pycache__/deepseek_v2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8c6dd4e033153b74f192e73eecc948c4176e229f GIT binary patch literal 68524 zcmdqK3v`>;btVcB58_Dx1WABz@J;YdilW|<^|T%&ZArH5z;;lI03}iYN&SFiNdavo zbsI8qYO1-Zs7Z58&q|l6vAfKy)qA5%J1g#;-dnewdk4R$6bPzWQKwqd$?ap{NSS1E zXU*JipZ^KyXB@Ygx$Dj^>fk)~dH&}=XYYOX-sjJpPMZ!-%=qQ$qaMBPuP7ire$n$N zZ`SE<>LR+3E~1Z=#PlIOehsmbxFKYS8$(7t#T#R$xH)8Iep5^zFAbG4e@V;|vLM|Y zv&5|-D~l_Q+2Zz)J?;oOSlAMC#>+xw%x{gk;_i@}`E4;zygXFS{PvhPUJQHsOCR7uz4b{f$LUk;@ELIxg%TI+?#b7L0d=x|rV^>yGz?dYHc=)*J5&^~L)`{VZG= z8;B2v2IE7aq4<{2miTaJIKDNsm8Dh1M&jE-+nC=M+aBK$+7aIw+R4KH*sl2Q&~E0h zj_rx>4ee$An%KVhQ=zArzc#i%ejs!pelT>9h3jI6;)g?r<3~bAShzm+bo^-ODDyYO zj>V6Mjx&E_>_mJtG|K#e*jRi#G#-B@^b89(#h#6y44q{D=GdwD>CkEBZ;3q@e?Ihl z{7mQ!3%ABzh`$(mG5(p*XAo|Sw8uj6mqIVaUk<$-e30%pwQk@Q?soBygf8@{`#?b5kL-gb@dU73a 
z@MPpj^!3QWbNcA@=<#L!Oo{Yfl;8KCe$OO?=nKG%KZ-eX>=S0!7Z~JmK%V#n$X{iU zqktUy1jsj{_<~XYIAEXo`LOc5`!$y1S>!kweM$Zvf1SmiLhNbna-yG&{(AJ4=*#b6 zKF|~TBFpg{ay%dXqFl}*i#>zb7gVuJkr$)(sDsU=&tP^BVaBnUg>N*3asSfQlHkkN zx{u9HMY-tgWVCl~Hg+{|C>otlM57nZ4E4+2ErIx4BpU1cA+=#VSdz0GoQTCH&c>oS z%dupXBVSOTGfyXC)05;g?4P}gpOV>G0K4UKh7QinPEDUv#cUB`09(e*O~%6$(D$olQ>9%_88I0|zE1FGOb}DA+X*Z#WUX7*2B2$+_8_=hAFZ6mnK3F3r!+ zamjFUj+;Cmo}7!%PoufUoc-X$oRmUf;o{fgNXnbNmeBp99F*y;7hG!?@G~_-^ z;7j!VDbr)}{H56oNODV)Zay}V#QTjS&U*~6my1Uu)9CGUY<4B6jc9f_esO|cHwG(L zLE-rcybV<>vR6l@x#%Ri7Ckqepl@yRQX)AQ$0vtG<78y|G777fzt}B9$7UnZD>OgY zSC^}KGHEQDoQO;$Cy>#8;tcz!PR~atb8fsO`DIYd4+&$zhvy!fzLLDeMW2~Ed!G#Jx$pQ#&5Jg9p2VXHj=WA1Xhs3wm=N>7~oB^I9J*1QW5q*wOug>G*ss8po%{ zzOMkS9f7mTP7=60*f*ZDs}|0jSy(`G_Vd$`2);kI(phIO#lz?EfhBS_`trh9-I5bI z2lLY>gLj_u#V4+W=jRgB^eN-*ir*ey+$62$;>w(J2;Z_;jahh|IWrfXt zLkd6ocfgXrroX96%85(Plfmm>)jDQevQ=KlQo3Y5n z$)1$s8feXk{z9itmnb=|`+|N`IMjvmM7#(7k{O3wjO+n~+!B;jCMU?Al;gt7IvrQB zS*of%RjfS@ZAcXyYj@HkRS_xC&Vjz75ATU%hL&2T95j|%qeJk3V-!2V0!sEvd0A|Q z5|Vt^FjB}XzhC6lpePW#d0u&ZwV@)0Yj~rm!ztC7G9e_7d8!m4mz0)ll6E79+-#^?lQOm^<{K9(nYWZ9*Iy7wz&i>Sm*55K9&3EK2oCWhH}>PkSsji7my5 znBO-`*g8tAl*&o=NklaoV>_XEnwU;NMq%2C#f)h&IBwJo027ry$DEIm^)4o)To>*iF#sTdG+H? 
zP^_1g7rh$3JQ2GT71zab1yrgkXF$`qmne&c>SoO~Ulf+qQbINei!2q_Pi~Z|9Ve%m zWw{85d2VioYo(xhP6%{SK4K!u)5_WVUnCOp)luY>YP~ZGj?iJKps4E`=as9vTSo_rg=BtyqhvMX6>$wy_vT+ zub#VW?*-qn!k_VW@!qbr7kKaR9sOPJj&iY$clbMcPJpTNG6u;=z3-bd|pAok|MjfK?+@9SdWnpXU{vK@P==+=k+VAkQwI^1uKEsfzP z;|TDM0Q0o)j+U&aDqG$5*jZX_LoVmza-FYJhFTqdch5U}c*g*hAOKoTJ_58qF4NVv zNTmub$W{+zJ+)Mt-CRpRE&x?F5YYJ2xCD#P=+Y<~?k>K%3*huuRk!@&%JXS&%VU?N z5_!L3eS}x5%ULC~0r;kbcK~IHCMHGx7(wZ1uch!KKc>P>+IMjTxP~inY~|L7J!gpM-!GB&&zDihf@#6LfL&%r-bs~GQhHi@ zCB6V_8=wIz?}#c`K&DD1*aZuqY^2ob%*NlXDJE(8kS>f1aF+aB>DLKRJCkr@xrfU&!e%=j^n^gy$|L=PxC( z8nNGm4WP=>z|n`AD;3GzYcj2(e9>ak?*T4x9nN)K-li+Bx>n(!GG|C6IgX|f=O(9<99r~pCdT>5nFVT*dyV3p;?g@A1tTGR zThGN2o1m%w6dPdfX?U4)ffD{LoS(j-`@V5|-mR-|&h`yuJG!&o{n??BY?c3G-yyxp z_MoycZz2y?G*45yWee}#k}qWu7F|_!-paf-U4<`iXI_V{vL^3j-ZGu9c4f<|VP!k5 zQ+!o--o@hGy5_dLhk3D@G_M}wt2*;u7Ov3wf@{NkRe!#cg{yR)iacvbzD8#&`*AIc z9WAk$svda#H+SFIoj1Xc#`)`Srf#J2r4+L0Y_7NJmg;_DMTos9)@ho;@*|;hk4_;< zP2Nd_UBn(Ki5fuK0ht1MXNsC4dPd%vqh^qH#t4pvrpg$pW(BDRlI{&%$ifIL`j{L! 
zr$k_#85$dXcAV{~IZpxA%C>QvxNd=RW!MF{X3@wcunxFr0&Ir_!!|f-<|NG&>7fsZ zR;NdQ4yTC5!FOLs<0KVoL!T-+hf^@Z6M^3wX}thuWdLSnRAHf1GJBHpij*oz;nV^- z*cyX8Cgd@zurZ|$L!|V=HcT~FQeJ%`mXrZ`tluz5YtI+-N$DlX)EIC&xsSlKFjm&+Tm{%6 zPcW2(X~Dj5dPUM&dR_R^8DdmbFau?z=zjxv;uai2iYin{(PM|Mej9D&&W5z3akWX_ z{&rLSjl6zuBJQdsF!#DY;8~m4W+%E*sN=4qc6*s>l{@pRd?-; z_si?DgB9z_aNN zu9xN(+a*gDh#Hk=vQ!jyWqGGxASzv+w_*1%=ZY{_E7z$W2o zGqpUZ^yf|Z#VTCam@g%tMTa;m`E2rvZl|C_S6!cXlFv;`J7ve*z{>80?>~;~jV6z< zU{k%5_%A=wI`rroqKX&nh>aHOsF5wu1}xP^EYiYq4Q|{NmSu3%%vh!^Sh}rRoC4z1 zX~j<&BUW(E>}+9oU}3ixEbMcK#~Hmu=N7_QgUGECK3`gCCe`yxUbX+%(kE7J;D4K~ z+V6?Ofb~(H&&tZIN|ozUR$i>yShppd5G9~y;ZZm}QVE*Ww~BA0k)ui~P5DI~pTKdM zq4lrmk=lUVL^!f1rJre#{K_vNikKt1lo4eKTn_O~qMb6Aqu`xdBRc7w zMr_1&s$=QZ(i} zQe0?}#!N$s8!4fpq7X!iN4s}ODMyONm?6a*F~hAu`pcSSR;CO{uZsBK`U}>F>PU^W zJ|L~OAg#6_t&WXW{WnSg*H8f05NTAELAa8}3`mqjN@ndylUHq}r6@6=J%%&e6)4FE z=q5mGtO7`BM#@tXr1FcD7Nls*0;II6X8&iEJd#hE{hR~i8<1j^-&vgZeO3P(=y!NP ziUn2DKAx+DXaFb%cIb-}Eb-X!uLYp|KJK^R<8G3(0_Vsz6Qf^l8&u>7BSP%Sx%mrW zoNb<;i{vb`a}$@Ygp-rkTzv_4WFL+SXQ|-DXb}hBpb8jS!wM@a%#O~~$A;?GLMb5lw1w{yNI#&3?J^F4@RfcC=SAMxsQ_UJ65=padOHdjgY z5=A^4p_6>EOpf}FHiZlOd|MYBR1sygb8<0q2uoo+Y9^&o3eGc_QXKa#rO_!$&Jl@T zhMbGoHH$3%;iL@U?@N6~0xJUKBeCESWFE9coDxz4|2dUfP2>0k=?1n?lyyKL^uA7w zpp8O!FTYDZ!qIY85=USM-`tldY+?uB+_%YZr@@hM%{!FkI~46jJBr6D=b2>^1UPM# zhA&qR0{Q&a1OzpdI6r|=#1ko%MKR$7xcAANw>TOiJRD)TjAEUlGh^y%Sa_Y0Ts7!e zl|+y#iF==NRZ`3o3=&3yaP>m(l(`%hB|UOwGJ)nvm_kCM8eA~$Pu&rn9OnU3+N0S?Er--(0e&}!gdRfN5h4*g(fb+hu<*Tn{ zd|P-Q1tp$-#uwy$!3|$Gi)-u7wC(2G*m*>KwjlJNbxWpojBg#IvYQ^*JsEo)Z?9W5 z-?g`k6!}P|dpF;`JKeP>uhZ`xg5w!_WH1nMZUAv^(^a-*9pxEE1MdLK9v=C0pkBqD zx0{JO|E3i^c6gUZR`#s5ZrJ2I=__{3{?qRS1 z2Fn%?y>)Eq*!LX0pdAOd3ZZ)lfho|QsT$;~2EmH0?aI`S@Ur+NG}pC+)zY-dlVb06Qi@5A9g-u;KW`Oar3 zy6r(*@ZFc+d3pU%X80tJzqXSU-TZ)#L)v*q`&vaNc!bBF<4E41vpP}XV~eh9dnP!_ z2S+!8;{@KGrE#S*ptQYfEt2UQz1ug6mMk9no}=+Wpgj}V%?Ebhxt!T|hR0vvOxjWZ zu(>~5*PN;Ari(t^Rd z=V%1$#^3%UU9I1it!>QI_T8=R%Nr1S;HkV>{hQT!6Z{yy>c)(}|E|A3UrJ$%&Rd-+ 
z@3~vvlebdX#?rg)`n&RW3OjU-ZJCChcN=!*od|zirmJhpyC}uYAbaond-EPjDc9As z=e^{s&^0vYE8%-wrE6$gJ+tnHE|w&kl41x~K79;Z1k z^4|7*Ey2|3YMNH7)*9|q->W{DuP2yRc78T^*E{&JLD$lbK4m=Jyr=u8`3_3%)Zv94 zd*@iD_3+)+!}%b^b%C{z@psU5z&+O?yw!$QoXOp-A4Xk2&2I$+I8l&Oq0j#!IfNn>@T820AZq;4-4e!= z($64J{K%Xry%r}bqE8v5b0EfxqB4sf&aTL$q-4GngQgX!=+QcXXEGuZ|3n3klO@$T z)Pixg1DKRk1t5BrT2-e{+U1#)b1j@I85w*jJBQN14zC+irJomMW%M)dC+kg?OL1D9 z8AhGgp~V^;h{RBe&b~A#u4^cB!7@wOq(+NAUzIAoF0`mLLfk{G66|2*Q7Lp@r&Wek zF2j;4y@v5*{nvW4gGEKug{wxiPw&mkle_O`i#rwO~hSsia z_;)Tku>@n+vA5o}x8ASYvOcy^cVKbsf!+1i?xo!sd)Hli*Mqvi+oQKevkl#M_F&zr z4?fbB)R!%dVKEQ1zJ2x9)wTMK#%;W#ZgFg7Fsli?IEI~|Zx?UrT6Cs;T}*hPfwwoL z?d@4_0G#`@qZ2NG9Mu^+rF^sP-QIV4@7nvbUSGyb3v<@ZWD@rC9s7AV{#vq5PsSPG zoq@Eohn3QkcJ!pZJy}Qf%4rg7*u&$m9`XWvSdWB)u~!xXH?ERI0S+LrdC4Que1NJu z2~?q_Oh{@OS--G;E!}*8cQm5rw_aX)d1e1yXFWFDcYEIH$uvLDH$RU|au$ou-I8&2 z@UD(E+xqbh*S@r6pTIXS85g+b&SI`PeOc^emh+IWlbj`T=y-+ASMJN?dySko$@vN# zH4pup6iH5=5~6U3hi)StI=JWdvvkDj4v6QfCJLh0HJ$lV=C#Pob1Mtmz%&0b%alKA z*kXE0zxq6(EqNXJ*Q4K#eJl18<`=dtsu!OnQ2J=j9B02jXBRsTXz0p;=8fqVAo&+5 zxln_~>#veD89OfohmQ8Uq9>)3xP$CW(Lh`?tR5znJQiCl&i#{H^n7O@$z;VL=AJV^ zt{Dd^e}xR(LvsEdoCH-NeN{Qv)d>!?7v#3Z_WygNewn_fZg{Ti{=(>bQ0iFLf8koz zRi3q%Emtk=UTJ;1^H%5T*01;T_Tbt<-rk=!_OlV0L>~ER5q*!IMRCOk3eqXCRTuG# z=fDu~WycG2v;a|Gi1^Yuuo(vjW{C8fA<}D(*df~Mh%%vHQRG)Rgm5CQENT~zBwP^w z72@4dr{>}=qO~S}Irqg2;aL(6mSx>2hKUHvF*ZR8k4eJWFJcqoCImv<0bw^e#Enw% zW8{pJ^9(u9!pV6Vdt!PPQWeoe`0S-AD98v@9>eSC`<5P}U}+LQ`U7w$i%!-ni_d5Z zrH64gt~#N?*}7V880Y5nZ6FTi3J;SkHOnF3EXwH&WFA$g;YXK zo?=;E76`&@9$*aA!=eW==veIl=a3RoHiUq^2u}+9U)8yc>+_|@AvGY?&ZM2S zaG@_GWlRM18l}>dU&LzU!-`Xua?3IX;&`D2GM1Q>K3}w1s|2i7tx7GD@BLB<$3j_J zGO>g;%2=nAi^fH3%-DJ-wba6-A(WQNZ)kRha)N|%w8!8dR;RV7P8k#a1x8&S$%0;D z4irB$XXW={E2@z=Z0sp`A6jL9^81i+?Sk>rDr>WMB!5X#tJs^9XS5PaeJXf+bXK62 zZmyPKl~c1#XNFe!m@$0$u<{#mtEh`|l;EKbE_jpsB$$*pxnK5|OFO#swJea9f&6|XIP|fC zMp1NCYe({+1cQ9?td!!|*l4Ulo7ACQf>T@}$}x^!)hUmBS_e*jg@}XD*~+y;8an_9 z_RP*D35#b>p^%Tp26p6U*1n732|=RzHi{)_Nhnvc=(8;PXQhB3@#~*?`>;0&u;Evy 
zljlim>t{BKlpzJ#?3^o47Nmg?oXV7s;W`E$0M1H5;9~0!Hjr zAa`W~aug=Qh-e$Lpa#W#hBAcU1Y1;GaYFRCzai&u;pFVtYo%?L`#XyMdvg8*IVZ@W zZ#?HBGD=_%6L5lyQqm|nY^SB=IA@WEhE{!O`NYvo;?81(a!yKQ8d(&@1;h-(P~;%p zPGgoUe{#rjWh_q4nzPCy$YiXT94gm^@|bLD&H!|gBRrA&B0042F?rQsmB8xNKtYt@ zC>{aULk)z2%Tuu@$%&EkU*P1*2**~=g9M~Dj+6v_1L7Pj{84j~>og{u%uoKG2*e3t?sVT?zLtr3`?I3dw>PTBU(yk6j#;-liH-ha<;_)P5-og7i)+%^kFXK8FN|R?@&1qK`SjJXc zzC`cr1_#+{qZk{b#K^yMZ4P@Q5DCB0@jk48>Ow%BbKlfl(yR@&2L1?YE zU#axxX7r_fK@wJ{W}HCAmMxIR$~fvkTQg5R@2FojW&&Gz{5iG?BJhkGo~|EAdk2xX zg^9{Fq7RMjtC39mHokq^oimv|&+>boz1P6l%(eHcd*$Lt+zZn1E0IjY7QSK2dgR-4 z-3cy$AIx2c zKij!&adf4YceFjMY0jcwz7APVKU>oXVLw4sAN*bb*9SB8y?lKyCf0|hjrt>t{R?#MJbiG)+$j+9C*+u+?eo}@n1~=Eil5Kl#3n;fMUNs97}2w>(E#Cp zlvDg5cX1TLh}A|tG|(vs|ChWkv)P#NzmdfMQ3kD4MNi5oZN*@xYBOhL2%+UOuA(9k z1k$0?Vz44s1M!rO{J{{u4hR{MRm%Moqouw1Yt+gRiC@Mz8h@E1kTPyE3$&0SO;+r= z5}<3!`^EykATY#bgikfDCi!iHkerbbJ1q!to?27ZPuXAXmTL&J5Rxp<1*%8;d-#RR z(-;Zcf?ZLkoRLvCRe$B#z2HzmYLIq#Q|(GV$tR4`rn6t8*C?k`%B}n&mJp1h2a22G zm@GIIHApRyAsn?>2z`fWk|rV@(fwM9z%fGlKv6XaY6dB`g2bQ@+Oq3z577D@|mrg*{z69W$ zO)v=%o4yd`{+M!S$$?oe9jg~sOs-5p!9Qn-$G|oMt4Y8Vos)BrtcP$1(=&(B0HjXN z&0mGir%J=gEuf9^t^-AM;IU+uB76ZQU8M#X2}OlVQVCaB1n}8tt^}%3W`VhtGhb#* zE{;Ac?sMd%$XS4saKkCY3&mrFoLxLafPI4GwMH!jQ3X(n)g=*v!?dG_c`hPcv=#5U zwFd_SwcGCK`P!X}qrd=#W9M~aX5cA);HeK=GY3xd2TrG-|4ioj8UFd1bpM6)MJ`>} zw^5f^8e81Im|T92Ax_P_yLokR!`-&HpAawS4JYK$-GNmP4goUmPTt+QR=)OJrguNz zyZ^)C%)!(A!P6V==a%+AKr?pT+O@i{QM+}0g0J0P(2Q@FeXH!lip>6#JpKkwrcb|+ zIUV6oN77R>nW@k5Q=d!sr}8@8!IDEIXcYAS>JOJF?eg3--!Lz8E5n)Ee!jMUy&^NX zpC8=6;Xbgq|3SI;X2*>VN*f?#q%|}2G(Yroy5{IdXO}xR%1<(-u;B-!{Mk(C@9>5T z_%R(Te0J%xD~C3mP0)sLwl0=rovycDSbAak)Jlb*4!V}i^dI8;4}F;YTlUr4`o`n^Q}_;Q>QbhLj0*v`s`%p>@#BQPR&jhvI;nu2Ht}@NdT?rk@sE0>{UR> zY&Hc<0|)#6a^ws%zr;QR_tt*D+PH@H`z^-01O;0ytV_W$2v63+x?~6u>!JhnPuE}V zmTTZ21y~o=^JzvxqdX0)%er90!Gj?}^IfA|0xrUsk4@5i)5Oh`@4}A53VX!Fg#R^4 zB;1C|!|@29KC#w-T&o72r-kLs1f?9x@9Q{YLrogr7GgEV=^ADc#!3%1u((cr$~UZ0 z6VWs>3zY=uyOs7yJ{%wcJCo{MaHgF9_6O}{{4#1$d!^DSsi2P} 
zO5d-PB>8}QiYq9?8OuQJmTihN_Q52Ri5bWU`c>E>Y_fhzlyNg`k#kW|^jYO>Q;gr%OWVayNb|1hVXHfR7Mp!S5OcKYpyzutP60}4u028xLsBYw-ZsY_=dY@?IiE+2UbSLDhBxKf%W>0>U|&T`RW6i>Z5%1 z(T`d)$6w-)zm$GuBJ&Ezzrv-DCDK=~rK^u_RDW*i_~N1E)_)|1hsKa}=h%k3YYlw3 z-hXUk{(p$pJ?QF#DS?*7WX9FX(_eeq*3MfxfZi3Ld)cl5f=j#Fmn$-!cHYwtuK=3e zqzGV)ZyjAansGEj>O~a6bvT#x%flIO3-4`Njl4Ve&K&REns$xcnSgMRvlYTYPTODG z%9dIeIRfk8y&Y?BvaMJS9PsD_d)n}n6KKsVcSC$ zRiL8E>dh@Glya}*6B*~OlRGX7=6H(*b7*x^>?R1>tzV?LMRJzNA+%IwG=a!OL0IP% z^3~D@`WpG(B=>p_;|l zkbQ+(=B<Ov00IHyWrw80pH5eHbsBPNDD z=aW}p*-9cae#o6gD;O_9IP-Z1!2-oz;;k?$h?ovVLL=b$Y^K<&#Wy}D3sjvW)gzH> z1MG=HG7pXkp3o#`4-*fOF+~V+szgVUvY^9zBD5A(1nS2C1xpQn?PYLMlEf5e4~Z;) zUH1){9wX5Y?R+T%;e0rTEg8R7b?B047Bs-l@d9KCWoIDhRGmxGCz>-05}i2<7%Ge_ zBaHk>qzTeI#7E+&N4avr>?hf2iQ+l z9e(WcQCwvWg0}nh!yEM@nfhaV{joF*RoOl1%8pFsULJq;y-?OjH*Q^@T7NNJzjvd2 zU)r%RyZb4?Wh{fdWpKUyu4RYtK^>*8o`O>#vrZF+O;h4#!7d`YEj&Qn(1YSfOIn4Y z)$+tsa-an1n^hupkOvnhFupItIMWp)J8WlhmJB<;q;d8Vz;jl11kJ+qhJz-Kc#py> z57Nt#-x}nR*=wW@!uUD8ErvqnJR~M4O`C86#!0UQH~F0Vo&bFbB_#-Ty{^0OYLqQ$ z2`V?)>KZZ(6u%MA!sJlbjjqR5liQZpnXIs$>H{lQmN9!rB9m+Sui3q3)ob8$k-!3qV!fIX8_0 zGtP8YumpRp^XHA18Zt^PqSsBs)tR<*vQ`D%#T)gnQ@!+I3u`CanTgpcNYJuvd^-iZ z=@qm6*g-*DPO!<&+)OFtj8Y5E0GM{>5{GHOKI^Z|8sOH(IA zW+G^_#9(MhCK`3;bPHCM^&XiKaY*;d1{hK{EZVSrL8Jz)&=jIyB4!utAhrz`Ix$}Q zH+?D{5<9Gkq4o5@wrL6S>rxJhAY5>w#xneODfT6_c3K=|lu#}B>gkOYJxO^b(2~uA zr6HQGcoy8)Je(;vwHd82rz}KHN^Q|%7-IFY$gTGf=`6OxOgUM<%T+C>xmNU~JXELf z4p3+Lr>hez0t1eHr12}mv6#HQDitY*BzoPO-!tkyp+#18nV;BeZ_29$qZ)G!R)j== zFL8!f5QxwRf-`r zC})Ds$Cja-0p`GA>|l@y*h@$Kj1WIdttuns&Ph_x77Em#&FK?40}O5E%wmpu z%JEkO3?2Z)rdTx8XQ4GpCcR_B; z>I<{C9lju%BNq@G+=8MzkBOA{8az0$aqF6b@0Pt&#vALeA78eu?9aOV%tRfBBZ=!r zvo7CT3rh=x46huAO`bikkh1 zwnIApOCwoF#r?AK<-z5djk2aiL)KHba&pzU>RmOiyuf?f7R?Xb<%^~)Irl4TZoYiu z<<*uo{W~4m;NS<1w;S(w_I_Z$ZGUVksoJ_oBu-<;>gBZ=zGDaUM0XF|IiBt4|Jb14 zGr)pyItM=1=^F=sFu3(w73*j3bZ!hDB*n=($N^PWt$0^1-tw&;=ly+rMc<_DGayTlM&#WA{5lT0WtOnQ4^UWjhr7K5x$F_`PH}BYe=hTP(4af1k$xut$jE#ZE 
zR->nB(FEem>dRQ_c}x9D^m~@p{61Y-_qKlX`FB72&Sy8g!-z9Fzxd4OpIILLp0P>T1TL~EbwNN!A*DuE z#Mzt`I6<`b=1K+Q@X%dnQig)Q5yM$>PBJqqSTn^O&Xoz~UZE5orYX(j$O?>C(+LFI z&3}Z=MvbD$*t@v+08V~zj;>G!_N0uSNefk?Qdxqq{J{tuwhGad8&0NSUiUq_P}pf5 z7F9t8rm##b_QE5VDJaU>wKf23EOxD>9|p1yVcCaxtCl>9m|BKZ4gMpxBv}?ki%vm* zN)%2iF4+;bHftU@fRkp0>9-_V)sdvgRHYc~1cE%yyvi_Bi&aQdAzr=r7Y8jN?d^4j@LM-?ZPeRT`59dx2TY^0}P1B`Z znDKsjoT_IBOMs9I@mdCkQhv>Tv6U%Of>pwxnT}Jo8r?tD|E~ErbPLs~YE2kn{jS+0 zRhZkgsT%a_IjINA@8%^EF(F=VP?M@9gnOY*wI&laR`jH*u_n{GU>(Lrvo^$b6=gy& zWqIHBp16zQ`>xiS!wdE3fzXNu)mQSOR%xmF_wCY~(VPXcV*L!UBZ?lW9@SUS_z5*O zsB1(k>01tbVjj(YDlJDj7?XvjO-3?6YZhRzdZd~NEz#_$B4gN`YD_hgk|)0W^3A^d zB=O7H6LCmiiZqg&t-LAolWXaxUo)iI7QEQWX(etZ+<~1VYk!Yj$*|N`tQn^zzw(=^ zp^`M`x3Ck4JFMy}{ztEN^!@*$t60hgCYe)|q)VXPoJ@5wO6u?u~2wrL2wflG?ZiP&jL&YytkN|-5P zU=pP=g?n~5k`c@~60tdG--3DrMI>00ibN-Ibr`N^A)OG0565B&SQ$b{NRssJ4J32t zlFAe@XhuEeO2vG*Hij-nN(fA%Kc+sPKoU_?6X*Pi?o}w@UWfwb5=zPwNrd`5yOt`>kg54m+5kAia zSkon>KUYCbFTTl%8)6lJcAbD7C3c)uhP=?-Aqs?h8mUEO2X~a>j*)YgKxhsK!JXnhzyYllf=YcO3o}^JmpZUPUT8zV4w>mdd^Ri zI(Oni&Lxh6R0?;F3a_GZz%X7coEzLON;J>nRt zisaQiItk-tbFUJCO?tgKr%+yaZfYtKO+rnI2yfVYiqQ2!i59>YQtFx^fqeIBd=}hm zX!I(U^27x8J}~<0bd{YO6~Ro!Hojup^|6Qc%H{YspSx%8x(~$}M|IXw1By25aI!xy z*-56L5a0O7=O8As1Bb6Qy2|EN5W|Dl$L>=Z_uBT|vp)Fy_OT_pDvy%LlHXVAdqI@&0XTW936*X~yW|jlQhKnzgvI zmWs!vMw1!lCAK2jm-Y{=^k!;$`I_Ff3BG0^Z5+&%mS<}_1rf=##hY3KZ@sTig$$L@@r~_GjImth+MfZsOfd zY7c4YKmi2?rOI%;nW^mHD?0$Y+?FYC;mccA55N1&JI|!u!z{mV#gy>}d4F*2@CVP_ zM)Yo06p5e8o=0|bRoUW^JPZ{zlWgfX6KlKfn91b-ho|`MM>jf-WxIR9dZ_Doq|?{+ zXZr`hr>GlR9DU&MuN)R6VE@_0_13jl`8wFlfHF+B3hGaMRmbAfd_}dTa!RYWE)MpPLLD2WKe(~vtuuP9L@zvegmd^XZz7J|{*JQg! 
zGF^N4u04-S4Hf$q4}o)mOF(clNbBOEY;8xTb_-v-W&OGJ!*_-@ps+Ox4ogE9Df0Z@ z)^G3n)~=QCM*TjVo;#r}B@XIp+q?bm^rzi}V73g`tdIV{U3tT_d=_okTEn8?cxqt$ z!daWgf?C!4BP;}#@@!205N6j*7aBw~hC<;c&_uT(o4X+GJB(_L_{n`>wqk zhQHdk;%%hMThrdww7YHfbC1yNVcyY<0?_@{lj)8f>CPSLww;-_1AH3{b<_&)9U`;t z*449lfN#%s_k6JJ_O|uG^`W(0KcX1O3!s3io=oK+Upctm#8(b4+OnRGjHj3P(46!P zESj^f>XmS&ek)(UHB-NfuitfNKVQH1u4`}FvN!G8o2_eKtK#eW)0XP{9>`Dxc@Hi# zff^G&GH<$~Jzd$JcHl#-%{W?lM=SGm@QA7DfMQd*ZPA>E8LNRz^$1@*vRI}r4s4^* z8>6fB+0K#227TrBA8gFTJ8n71T)H=y@ec6b0kG4mhaL>?{20HxVEF}> zKZ2R+LB4u$y_>Jz`(YJd{dBgv9VOIs{GfC2wspPfJqOiA=jrb1l`-BGTr@uL_%far z-qW)B-0I=?x2{*cw`aq%17qi_zxlZvpIhCP?Fb^1zXczFANM=-pge!y5BdgfpZ&ec zZ`XdS_PseOvl6Vik?o6)<(F5-@IrXYP*$D;Fznr%b=PIwt@K7i>*eb&XSP1gZ+-fs z5(pV@xJU8MADIkwWth1(UDpV)4FBNLcR%-?&#g6X1aS}p&L2d!)AfCDF);_**Xw>z z*L3UfDr83VL+vaIPJKH*CTj*}|iB0{^~V=C#lv3eljY@}UGB0Tn$zzcdMt)za^JPb>-iGQ$0EKuQ>2U(l?= zhX|$P7%IeA?lxS2mz+L#Y3@=2$LIer)Bsn7OhJVKXYJUTp`%a&%!u%mg8fhsOv7E< z&~t^e9msKl5Nzyw0Z5@R8J>o9e>$);+tK;%m3OY(Z|{D0_MKTw>&lVqV|a@;-*RH* z*vgA5$Ch8?t?e0WH*f8}Z*jg=wp4~M$L9ZmzxkH!>oliLr3G)D?X2S>Nn&O1CKt0M=eSM>n<)yoJ6$l zA_$O>g%@1n$n3q)HI9OB6BF@bsukBfx|yXG8!WXrU<*dmpgiouXvaIX<6)_#3_>bS zCZs}AlkQyUG2uLcBQ}T1n9AQ%@L4$H!nrv^0#715sh+bIL=c_xr~z(~qzX)a6iTWv znG+y9!UI}fZQi6r>%hN2M&*;QHZ0NN1EUPN#o*dhNcAhffRNV(a2eI62WllK98GHQ zjS#EBTShG5+C>j>IB?)-N?Blvqy&Zr3=+=~HOlZwrA9Itr%}3GhOwv&5(j0EqT9Af z2`0!Ti|+w-OEX$mRJYtFsa$rfO05@+%8M%@N|0N*Nqtt)9E^Op86k5p@|%dj;IncC zATwiACDoZEF$(r*mz^At(lvR>)LIzFz;%PN#v1C9`SUow)ZoWz@rRMCPO6E_0fuGC z{s=BsjWxs@7obK49-UP&a0sY?ln2M4?s2l+{OZNRX;fA_zv{^b8y z-^)FsC}*E|^aGk|ZUzK8eSm_H*mLmZOqZhD;{!juPOz3fkp{-CA&g)n0$vd)V+XB} zey%0C40bsooi)+nBr{jUjy;1-Dv2ri`v+CeC>^5H7(FPFKC|K_gJ!{#D-+;kdk?G? 
ze@pGe9U`#61nUuk%ykRJ8L8*Fl2R) zgk2|g`Lb}D%S3zFio`@XzC=BzO)Xa{jtN$F5Y{otrZJ;sY4OUn(|v$_xNw$jSR}38 zC(5fN`k=Ab$GQY-PQO8=d1%}Uj@W{BMP5j2u`F6ESuujaUs_?59Z@X~<`!3AS^VoI ztfh%K8bIP-RVwi>**b$D?Dar0@Ji+s{7}jlq-2>Kn7<+8@8$huRskYn2=w#*eijgo zHax5V3M*;#!%hG$>{~hX_K9032&GkN`LlcSRF-(XxQI`_;ZFs3dU-EsF)tbHEd@R1 z3#=O7b-v?VAHG+$omxa@8gQyy@pkR4+V$ZNM?UQQs5kx8i;oQ@;srq1vKyM;e&yCH zOn#KQRP&&kWZTNJffm6SMGG^?8vq&M@5l8vH($B&%KdWR&AuCbf<=NC)82Nv63}WQ zgR5^kK%X|i$bf+HALQ!~vQhy{4f;2hGk@8%`_4-nO~)3E-?In^B13_4B^NnC+6_y# z{K!YCJF)O>dTWp@%E*)xNEP3tC11hh-ZW52EOPR3t6a09IBil6UNz-tNF7E)f=0Vk zZCGOXGSwCNmKW9&LCsv=VwAQi%N>KWGUEgCmZE}Fjz+AX&sEA{U#$Alq zWNccpHn2j5?~Wb`X%~BMQ?*&%xCBUYQJzH{HF0BIP`@j!95yY<8KwS|y{J4I72?4b zJw@a1!1y~fdnO-Bpd5JvR0<>X1SuMGGGe6dn;pn%-3AfUMC8p-lVPWyT}ERC9okX-nzPUb*22v3*vQT z4(D4(mX0hBEhicG;n2Hd?~H9YhM=bm?g;o1V1+Ilna+6f?S)$lYsn85ZZGhSyVI^c z9}Zz_@!OUhi^jz%u*gvMt4psgFKoElq{4-}(DYv(|J3Du3^k0vs&#F6z2e&q-)i_! zzcF}Vqw*jwSu-w1Nb?%z=a-W!XO?Hznqhce(8gwQbg|^>{;$5iW=>c2ub;Z-*uJO- z$F{0QysK?tqjIZoSK9-JH{&3qKr3fA98ILXJ;LM9F>AsIL*ND1U*j=F3;$d;+WILBtm%pMPFxhZ{9cjMg4dm}1{vdTb#rzMfW$f-y6XFH= zFKYQD8ZAF9qVJInDQbQ~L?1FJmSf2W>#?|&4k{_I7|XZ`jIRLRLMi@R@ZXC6HvBJP zsiOVZ2zUWe8~6$AR8mYU0~A?~6(C_bR`hwIwn%xjOlz4IA#i>w+K{aPXTlw++yuI6 zlX$OoJUA1@HeU;FoZP>k7)(sNl9V@Xgi4P=vN8awCWz$ZZh5F|q0ouQH>tL|Y#%iFJ9>Fc2aokQ5 zK!&+MA{q^hjqX1<7mv@)QkxLHQ-)<1mR}z1mkE@9i9qQGfzp2|IUP&jx^f1uX#)2j zV1JHn3yhvRh3Pn{M&F1FfGHYFta7>7%0J@JUtt(Rp1{~MUFmFgt3x#~Ife^#!cUZJh~Dj z>!pm$DzPLBgXVC&!)6Q|7 z$g(~OPWjzD2dhJa)cb@ST}s)KZ}S|iRXq}%@+;;zs>Db>^g`oprraY zYwr&7`XxcKHA!9<3Szatb&d54ZPQo*wBH9}wbq+T%s{MvQeQCFI;6G~8A?9F{Rka^vAIbR%IX@-mKauk@a{eE3{xdoMg`EFN&RanD!0Z+ovrrgPY_^@J^mF8t z0ElB&Jrv7tX2j)6#e6v{tQ-qcI0CKPsbYN}ry~A_oWCVUM}_?z`Tm}q|3D6XLd2rX3FPhV`2!C{Wb0L9>Nt`q`Rzyvv1 zolAr-Zy92w5BDws(X8ia-~~%`fSL{ZG}+Uv@k;-Sy2iReldYH z7G%%(QS>metm8sR(MipJJXs7Fzc718r{aVPG!M1c2bK$9D ze=S*W^>XJ*XBw9ad8`n6IrNp$e3h;)!0a1W1%&JP_*$HB)R1T#l*wM^ zf84CA-p+VJvMuWX?+DP#0*i?y5SK~qdFz>_X9{x=ql+CZ`Li$$?I?d#?W_hj?JKw_ 
zjMkoca)>WkU@ee#O-7s2Qn=}n4D{0^j7!mBe-G;zt_zx>a^=Dct_xxs>+BE>C| z!`NfA&u?yuW^=yT@6f_c&Y#lyIsy;)W*?WBO_qFnu_>A=e|F2Zk6TWbh^A=BQ@~DR z5du4nc?DB6%qy6pVP3%$4f6`7XqcBw(X0&L2<5#jD43&RUcnp<^J*l+k3`_5gyGyKn5yo8U)ZC37?sQY_3MB+N$| z$Vt2j`LT<>M<2nXX5%r&9T;t}Gb1wZ#Os>C#M45W#Z)t{YO*l_pi;2eEOarHCNwUK z(u~IiQkwC?Wl*?oN-K5B60t)($jZcnnEF6Y#e_S=gw?sxiK|ggWIQpx(J}Ox;Z#qa znOc#=KG46%cT&VW!Yv?0;z5S%RS?LakGkm5IunKYOifsrp3!TG65(7-uXSb$lf!x~ zgc$c48nng;$Zo0Byrj%1gKCx@3O&?UJg~B$PBBE})5HZ^L5eYAlJnRLQg9uZoZ?W7 z?trx;Gg5@Q$c`=myT3l9;;#8%$3R22cDei(4 zmny}xiMpAal)kR$6BRHsEfe?=&l^ykO}VB@4(Z+#nk^eGBBda%G+8G`Losr;8Yk)v zQds!~R6q&=%AAx*>jf{)e7%w8XT(=9X_L@UxO(fU@YEaev|_EZ^M zXUYW^cS^ynN)iUn9?XF%X`{g{@CL2(MIsrI%a!uL)u*g*VPylZF=dDAOPS&NQ>AdB z>;@)a%{NNmuT2@nyW{1zkMet@nw8(r_v3vVv`A=DCglQ1b+CKA9P}nQe{xXDi!*WD zFv{wz11qNfJ@Grl?2(y2vm+|@kM`GL&}0u7G)1bE+8R`%aB^+QT}s=u-W2+|TY^x2 z-*1%O0a`Az{eB5$1*AF(W_=*iBz;4uuStRd8Qd~15QwiMPIZgibp&z84lWaCwyVNZ z(D7l%z&Sk`t288{^UT`0K#vlSN+3!Z(MF_b&Jq#t_bIRxL?7}N{C`Ba&G)-!;6dhkO~8KQ5RPD#?}f4RDtbQD6wky89did#}uM zd#?>CxR81;kSVvm-Rw@E#NIwR&u?HBCFqU*dm`uZx{@8X?*|U8coz>Y4}Rq+A2{@% z9??Aeg3F^X7C=3Lc!2hJgyhydq~O>_^g9K>F#W`c z`(=u~QII6?Bi^OBHFAC*POw?U75Htep)e&iB?*x!&0uw~a{mS8{B?3J!^v5NcRC%p zlCuc{XmlBE&Vaj=xTguA6!+MNag4(-F=iAHWz&B6i$u)Xh2%5jeTCe?%oz zPEB(#ZgK${P_rnDi_XU;pndmEN;6;_gd6Ua{>lsq(w&>XlvHj_%K4$k#B4SR2Wv_S zyDB$Wsp2KjzzgG>Cm#)sAnL|m3mcCjHo~_EDx{ISa8Z!jmzP zg+R;Olk9^)U=s#{<}{q6Iro>C*_d&lX{}{TrM$&Y;$(Ku;@)g!AXC}PSN3KqxAB$R zaI=iBIpgc+ef^7$O}sU2nVPM9&DKR1lep{UYavz#iM#uibt`99UtZt3QMqH$cHh^4 zyUAZ$Pj2{jFFNkymZA3a(20+p+h{(OwlvGU#_y`d7@P#IfdG zacuZ|*Us|(!HtTcMXPv?5ukF3>s>nf(ATwi^!wFKKhk+>Y~S~{&rXISElnf=UwpOqM`ng^{G4YdwruTW?Wp1d*eZk{xt6)FyTgc z^5JHB+z3y;D!L`!M?Sx;d~Q( zpvTMdojNBZD_VF*OWJ|Uo2pmNWa@YD^*ipmcTi}Yhqc*>yYBvnzE-HcWCQ*3O;Q!h zuaj&c5$Q)Uv;1w@x@JM^E8E*4w4omojHvmle*Oqt%KOaU0X$#m?1wHx{Nsm9YZ zo$Bs)_Ji=vI7rrF^+=|58{fJOGAu!~`kti?)1ke0?M!;>=tq+qZKp_NvAKvghS+U1 zF&()y5u2n%ReH$Jknrd5O<-3rTyF*%i7&JGk&rKDnO;N;)i8>c_=S>LTwbar*hscu 
zqz#yf&XuszOzLt3Kb>blH->f~JPRc%qhgw;rHWvglqc~l`e76DtKFnS3TdGY4Ds2L zg3W$y?)Ei1_81W2bZxQL*{5g*tsLmyCzU`F{4&BOwFx|Y?J<(i4%Lt{B40Q@8Ycru z`CLF6AFlr&JU->W(Dob_zkUi1;3AUQNNqAo6Pdi1r0C z)dfTZx0O|-?X_87YsS~b`?@xKJ&Q+iy?Lf?fCu}vexq*R(%1vyuI=R8ci!3aQRhbc zX{;Ey1$}$gU-x#|t+H%YBUJyONLdYSPy?7%PVhiIwJXDK@4dD6Uia>M?mby|ea78_ zi%hD`5Q7Vf%-q0#27dGZ4DtjkNXTC1MBE>dLlRIeOdZOt_A=J8jzn|JSCwjjEt zBh$Q(Z{GKz^P?*p%`ftn#zp7y6>5&$6F z5;KA=}g&MduwuOkPEY}s_({lsRnpVpgB3u0T$NBytJmkNFhWVlv`d3Q(N zDnwvQQ5hK)@}g5PE$kEkW=gtD9WpZqev~IMb0}odfzi`*qi3~(FK^3RDaxj^xtN%i z9U);&rPfZOq4FawDUS$|QS*05ObbeVhTjGa2jMJJ)HG#ed>iu{Wg)uMyHx84jl{LI z;$c_VN*oxbOtx8G2zF`Zg)*2eVq>CT&P^bM)J^J+veYRXRK!dnJL3U4n2MN7!TZNZ ztKtabVcymY2Si6PAj~I|BebLwkA!VHm*IX*|3&>5&1z$!OQpr!pCwIhjyC59urvdU z9$aci;z#mU^Yga>K+0u+lBBs^^Z1j&~&C2J)AOta)uezi9_#gguq%(GBQ{gOG^Cgpig^k^9l z<9bxWDB7cD(3F*iIW=%I4B#vnRh$KR#AaQJqL)-ZMpsT(^C`516&B5ldRX8#y)za;lKRQXG2n9 zj{?2HMx?pD1c6n}c;;djyno;5ZjIzB&4%_hl zn6kMOljoy!^3BP%!bw$(D0ya7R2gTo^TH7h$gWuyXm=Pjy}JxEG%lbE3%1e|L5n8YFN_;?hP~oAiee_41HVE@*i&ly+`ri?{xNO; zq_gld7<#riWwP8;CsK7eT`pXo_Vpuk6Ue*oIqE^*HT7@c?qrYa^NNJ$&~bQ1t_# z7{6!fVc!VkUKrPB8nYK;7%Jf+oW6J+5G%eqCu3?(+plBxQ8>y#{B^P0*;|e zn9p*(>YD}`{vfVIGC+&;p*yeqI;z2WLkTe?{%#u;&utI(RSq?`)2RsvHR z%Uhzrz&4$gvi?QTM)is&Z82Jw9=Ph1@$;}}x?q}xac9kYN)>e{Sfg7o&z6Cv7VMV- zIUsH)-G3n16kISD(6utk0U+@~9X83Y{32FDIh9EH7zqZuU8n_DG!n}ma@O)aQ;g<@ z+ToiX7>Y)(g-7nM#(_sdee{EpfXd2BjIiIT(^4j5HHzDrSfeFnVw9kuJwzlLXhLN* zwy{N!`*Xx||CSuuZ@9l8-}lMMk#nD%K}z^H@CB{P1QV#szoPVqm&n8zg~F#tpM2I;mv(h#1HCt#4z~;O z>|1tXy;wT>z}u1bZb>_~z|DeOY~<_qFOFq>-D}5r-?qi04=Za0U0V5~@!{|8{m$M$ zuROkZ`1|eMYsc0XK0LqC{tP21Z*7N^M%4*D7(DK}FGF9a>#nQoJ``D-`|rHSyN}?5 zt|$ZN9P5V#av^W;xV2;TmGz?=wR^zT1Oa%FT5G8j=$5z98K`l6fyVGx$$0}#u67Cx zXzWb5Bq2E$o==7&mpC#>9h<`~RSH$d$T+sieVMYd*Tmk&YZQEwoUg!PR{x(8hWG!Z zmqE_Qw6=c@sUYnhGI3*n?BRPwwoilch^j-xQ2 zzZ<_G$8bqZ#6ZLq&`cvpG|{RD^k_;W257I)bugBzhM+Z9quT$buOu&V(PyR(!-hDQ zgqb-y_yLz*-q530CP*m-Hl3s6+H3PXT8*_?e|?;khQnx$@URflR3Y(;WOgQ0nimWb 
z??(kmUmtr|nluh%pF(}S?Utal)&LxLi?x%HcO0T?DY>e(LG)S#66&KBnms5(gH!^9 z+;%|7XuOn*A@_2B03jpv1rTx{0nwnONIrGHfjYx>4h`nD9rP|wN23?c4DAez?5 z#^w%-x6e$?U4ohsRFT+;kBE}?!L&*l;e+x?pJ>#OtjTl9@HuYo66_hW!yht?2RjF0 zAt{lopnUV^uOj$0>2#chDzhhz~~fVP<@So5eD~ z{V8I(KO^T~k@H97{Bv^t1P)elw$1+yUUtDYUj=J>+{7e|OJGXF>LU#60Gmu)8{B;Q z_i1N2PRsCZ)P?=uUX`{tWSd(EgKEsSwbO4iOajqwYqm2;zirvRe){dsws+8Pi~Jkx zqHz0tM>)v}rfbi9&+)>8j-gD)q5rqIYY&R!zVExo9UR=j0e5i6Q#_8x@fPTT1VTar zBy5DES1c=*G2%`VdVsnIl8uoQcbXU_>KOgeF?6OQ+4YRk)Kil*)1XXSD~Ts|Ch5$r zIC;+-DJG56N7|+*gX=WSwEcX4d%Js_R}yhLlO6coZ-4LC{(j%z_bFC&Q51@3k*a&d z>K=MSL0hDT{bIv@dP8yRrOKUR0sWgW7JE+8SzUm}&~^j81xEXV3TuGUybYA*nAMmxWuVxQ{Ra&hEk)rM zbTovT`))bJqc7rxu8Xj>G*Z(=zg3ZnEel;z`#}+Z6$dG( z44j*ivoh?goZ1zs?Vj2dwpSw+{gyFC%pGFo4tj%{4F?%;mh3*!?z?TTy_d?l6zn7b zql5$Ad}cT}rqF;L0|SF2BcqClV{l+V_)i++uh7W;ntVj&R*u6F^8E{ik<&$;>p{?$ zIWT(86p5~?MRQfuWCDYrHOtb=&)47dip@RC9KA#<(=ByN#idaTesMxqR7X?D2UfX^ zl~nkYDyZNgl&L)`OuPIW#H>c65)$=pPUL~CpcB01PLq?=a`2fofS+EA(%dmawh1X?YX&d6POJ@g>D8EV6dYaqnnY| z;NF&qZf2Upv@*xi(Hiy;!Z?mK(6TB9w?vhF2n>pbELDCpof0voxh%C*pgKvuHra@2 z4krnX1$%IRBLF|)5wcEj{^VCl$%X;c=1(D*cTA&4PV&0NKF|dF`6n#aZ-ZPBD#fnN2gT~}KZA3s*B#$xQt zRKNbKLb%AIp`tNn1~8}*%N=s*V`zBGzmGJY0Y1P8=^Z=&wbr0zoAtJ6qCI_ zk+1BVv_<1zd*~DTEg8v#zeWMd$?`4u;=x4W1_I(?!njY}2=}Wpn_Z;ih_(=wajYYe zaZLC&N(SDUiFqZTRyK>|^U7Xv964nR1IPiuYWszpa^@`KuoAvTnF|xRy@Ui5G2`y~ zTci~Jj-2nqAr;g^q{$m)ljM`C%$!|EKkCWGD5zb7`WzWu=(Qt%!Ka-{@P&z3E=8>jmFTE@epr zRwWLlTHrH8<=%0ZU+bCcU36~(Dp!%g0VcPwX0f1_#BlDEG|neWO`T#>=VHlDg5rvn z(2hqgJtEmjAq%NsNe^4=fQ(4b(qXo_t~VdO{-{*9SHz!vZ!A>-przDw$+`~Y``cE< zD%7g)55uk;WBFJqFT-XZwh;s=&lr83?7_#b9X4thUHtU}1^KkptQ0Lhd!u^T0~975 z1S)8z2IJR=eFoQwK3peEaa^#bG3|&^w_{idbgfFE9W!bQ<=PXU#t1#|X{MGSf4LKE zE!uXhH}@icx!+8P5}Qt2)%9t?wCw#*13;xJ4^otOB&nM7`pazv@5X<5GwOGou0v0djlU6{7ng&vCQ?@1O7(8<22vjqL zb1I>^9ty-OUMmhenxK^ajuW=w!WC^{XM4!r9&)y`*sH!--)l#2J6e{&rZPM`e8m<` z=W?rv-^dNb#+~fq(7OJ`?8id($0FqoQuz+Cd)@CIP$cS!2MiAlJ3@du->K*{Awva(bvV)vV zIKE_}td5@z2)Kd>b_%wUvk?nX&L(ZDvEfy%^kvYZ!|;JqxT4R{TBMC0|2PMu;@wSBQlW 
z1Y42Xys}MI_%B2SX;#@{ROmT`BAVD_ycQpZ9t+XxoI!bOM~_u*WjL=6v9bfw=~YZr zyv%6V)%Mj0$o9imhNqJvjlI)J;mo>-ZyWtqMrs4r(W{(gH}%EYhf&MD1NQWo87>-n85ShfSGL_Y6-a=u4$Z!2wRlc58CvVJv*!_UOxHCGI@>eweO(+ZpCstOm@K_Ns|Coeuk;F(-J+7 zJ&<8dnSdWTw0oZ#$8q3u#9-3{9#{R&!x`xfR!2sK=fEsOC?EEyaU`G3jO32R;7IF5 z;*L(8w4Nth^y&It&$tJ17T(h)(_Y{&S577YRtTUp%-}`aMLT4q?I?{$X?7SUpmJfP zVKSM8PI&w_wO?`NZHq&D!i<1g2I_uD=SE-?-fxG2VGH^-j&xa*Sx;hzWPQty>z%`&j#Llt zk29`~&Q)z}LK~e*8+C43BR=6V?GEhE7^_kHq7Vj*d#8>>TrZuFR?haPzu{1oyTKT) z3-|xvr1K(_D2+UU+kU2Cg1J`e)q`gMW6NfgQdf=Dt1Y5Xt!B*QmneR7T>s*LSD4v_;w6dBay zj7%aw2-%C3@Qv8_;)cmtkCFVfk$N}3^E`Il%j@EJoIOU|%zl^JC!rRl(=nXzoZ68P zfY-IQ;$q%N&3#JU`MZ<~1F|{zs*`9Q4v?{q+({3@);>BpM2dMMSDj%%>)*s0L|NsS zxhJ@3)8#|YBt5}h;C=3Xz!9@(ta7B{FeS7G3t%VfRu+Ju& z-PZToYtXqb#&iUvVf&yYgw_up^s@6a_0ZrLzfTVhQm(0o2Kn?J8rSu-bwr*t1*M1f z$N@6oC`GsvyanQ-zm_0M2D@6$hnlxymrK|Z~I#&u!Fc<}le zej85%oPF8?P_h#78oFOux8xe3?2S_nXru^MmjS9D==AtB;1xNIjai_!0XQW;Y`5Pv zZl@vj)_`Xa;JJA$_T8cN#d=RvAJr4%-r`P`sG<7rH2*^NL+9RTKg!q3PLvX|qAfHMQ9wpWbHUx}MJZOnvd{eNcVI}h_ab`n5nuwnytIyPCgE_xXn+;N~sm%uY^fnvU zwT4w?G&%-zZe0GFR@EA7cxYd;AF!iCb=dD`M+c!MwV&y~0ry6=30u359Q<sD;9RN#V(?(jaW$Oaz3{Ah7F1){*UsfjV$O6I4v=_IV zos@>26NjGrg!m77P4|?aD982ikC^cDInzGV3dgnYH_cc`bAse)5j`zoPiv&GR4Uvc z(qHHMdF$@La>}#UpH(D6q^*0!t$W`cmmWJIK6XNSY(#u)B(!BTG=4tZcmYj>CQ`{& z1uoUFt4?X+o##8vo4@7zx^Lmo zTTk71O4@cn+;$*z=!kUata#|GbZA^WG#+X{ADX-vZhl^!$QsdA6LvLz_9dV-H00U* z|4_;;&t~~|neETIOjdQ{cP>QqY{r6E(ig;r-)2E9Wx5y<2m9MEXp10)qO`M|gnb(J z0t!qEb|z>fqkE_jCg=Ms5rilp`O9$WLJ4hNI@0{xlFUdQd{iZKRh%h zr_;u(j$BYSGpo^Rpt2v!Y5EAtX)Loca5g1gA%2w8XjxdSGIE(X7h(;Fm6H;qQ3zg2 zYon?dQz5#QqM~6uqoBV8$IqP`1mU43MrCRbbAm2xda7Z`K zN{+1*^%Sk-b8aZ!kqVl`f~LiSX41TEyHi&2=M_`wcgkxZ*FDuWT^}_a;nQ;>&T1GK zbv8v@g&!8}xaoSQ;5!BH`WK51Os7ShMIomT5kG6&C%QIIAGuaAcXG*HHs=-H4G~WX zxOnob=QfCWjq?X@7Kj`B5C;9+SKDXX=WO#w77KPvcR_Ld%NMU)y!Lq5T|e(#I4w5! 
zBGrfPhPko%gJJhpyet(~&6&l*`swZn9TBAFYkKedF6HpHhFzWSyYixqI4=2UGgn<7 za(RDT-a3;K+1)RCx@OX@JvyJaum`4K=lo(pbEMD0B|54?wO_d9 z_`)4${>(Z(-wG+iz@_+75_q2SSgJ zifyCsWuA*Vxx5yrduK_172RdR0+lmezYcE3C;Z*5=020@yW2f*zvtOif}ihiNrC&0 zDTmy=403&TayK`@{gKJkmtqlqh>gJKNe~edAS)iP!RaV%8N^8!r^bmGk>VtU66Al0 z9aZ>m+LJ9~;}BUDN-4lh<+PG-A32xl^^f5&(rrvg_;y>cnTgpgm7(XO7jGvE=^KMHwK8OrU#!~*et97$=Q49Q7 zY*6q}_)1|kh2E@ONySybfzecYgI=eMWgGcQGA&I^S_!|%Lefg)BWb1ORPuRw%SK%& zd800ryb&LL`wkv<9q!rE;slf7M?tD!GQGtKCev$zU@`@*7EE4Fr!bGjlDhgFts3N^QV-`~jms2PxBNbBAU2{2e!TDVa&V`{+W2fi?ui&m_4l&W(T}hUrYkTGn z&9^LA-m>4Yhg!PC`fkzN6XhuC-IjNQKWP1g`4z?ynu{S!lJ?-8JcQ<$WgE zsh$mSe3t#!lM>fgf?OHt8WhO+0nUwOq6q&H7cs*wID9O8&GqYILM`zv5Drld3AG-k zFmnDLwF;l7H&!7FXJFDJ2wO$UM?!^^|2FHot}}t?3@YATIEb4%awa~0)9EhM(T!O8 z5E7mQorVVaRk9z*b)_KKl1g)pu&);emJTp8P9T6;h0{zFgHTtf#tfNw?f}PF94-B_ zhvb2wwHK;YL4hHVCMIyMNHv515o)|tNoC<1n10-187Ovy#*9fHtVbw-99)yf82G0u z*rgF<+>A0myjuKH*g{d{EE2GA0u2Vg1-|pl>Ew{7Ic#lV`cP^`do2`f+8Y5rE0?LM z)H_+Z)8{4kCddK2m$iASGvdgX92KIYVlEv_x}{$~J3nxnkHwm1bz}19X>|#;)09fh*3izM7v_+h`1JI7fXuH6W12$ zIT|DJ?L3N7ssRTS#P};@^rB%&*#_lG^ z27>{Cu=BF_0lAJttul$^1QZUV2PI|r7N!n~F!};<4alVrC>k_M&^m<@N{nxm6UxXb zC&vp%u6upIFFNn=0_PyS~g6^GF>yVb*>DG zALsj!7orD=gdp^ZXG7E{#I?35d z&Mr7i&DhYme{f*%%<$=e@G7;Ti()?yA45w%b8b8~I0M7y0?!Q4{?$!c3x}UkwmPs6 z#Ej}T93LN1HcnC`6$_M8wNN3G%Kt+ue}$F5mx;wG3SM8L;Md4`k(_@fhao%4C~1aP z9?8-LgdQsCYn0{^IbR{?c?#P{&Wq$+B8M;~Oy`^;az`TO47g-a1v~teb7(ba02Oqo z1rmE5!dGx6AQKmSBZ=dpNj%TrYv%aopK<9Qa1D34!n<7V&p7*C&h-J;_H(Z0F6X_= zZTo=R!`#D*+~H45$-L=fj+_jR-*}fRzst4X<@)b(+dtrTt&ZKpf53hIF4yz1$%N#| z9N)=*VoKsopIG61Y~}cF{w`O0m#hDP>t>}?#-b_apSXRC+&;ZPD9ACFg_4|7QmL3! 
z8cr(ziMc9bDGXV1Buk-aDGXbR@0uGv;OawM{ZCU}(}x#Ri@%h-l6-*Yb7xFfQ)g3G zIP$NYG1c)|(>q`8z0$kF!MpMlf7rzD;ME}Za__v^!Z*!qjB@yy3$UMs?qzyg$vkgD zk!8e|y~2@yMX==azL~()ld~tUj?Rt(MqS|$5^d#7xnIhW_#BbXS>)aKOf7ubauQ_y zqU4K?@P|zNIv_ZihMy#QS#r6f7V_X|$93(@r8ChK_G;y<=`T*c0O+2*rm3$s_G;%c zoG(3j>B(q1d(GgobD|FB&Ey=}(Jbc8=1{nkd0o)J9?fB1C>CFLwQ#l&*Ydo&sGEg( zxMbUlPrmTvavpmglCj6|G_>F5Mz3FYFH@%7UrnivZlm?PdX&?EN=8;cn9r+R( zM?req@J5R_KSK7k)sg%`lE34sS8&>H`_F0}!n>zN=@TK7uj{GZ6 zCk>kT%`^M2J~8{m3P=8xJ(e9NzDf-uf3(NUncOR;$9TSI`WUg@FLUIl>2pU}@QUA5 z$=8KS*ROE+UD?k+Vd0A+=HwT9U+A5h2o-FcpS*c=C5b{;o=M~R)KGro3WwhnE4>sV pAAX~D8=o1m7lzg~iuT5661{Z*wS?a#i*;(l3rAKg@G~gy{{V4wLO1{b literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/deepseek_vl2.cpython-312.pyc b/model_executor/models/__pycache__/deepseek_vl2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28c9438d096b66a22ac92f90e523499e524e1741 GIT binary patch literal 27455 zcmd6Qd30OZdEdjn6CeSC1XpmCAW@LGXkRH&>qyjSw2TKH2d4NQNl*aEc>qTe2eg%m zY*Wd^BQ;G$)lNmFZpFz=J9O$ca!*dII6cjDOYw$sfV^ssnR?pPIXx#BlkUuMy8XWU z@BokkXYAyU_L02rzPqpAcE9iP-#Hvs4o_77Yms01JjeZrKGesY^DKW~%W-$PiyY5| zI9|hRqneNgZ(URy(}i>~eMlcOgbW&r(?^XlQ^*uEhs-fc$il+3QESM`^4VhckUbaX zh}j3V8ZK5Aab;mp*5A&O%l`(I~%lzhORjfKx&HR?AFIE$( zVSa10HdYs^V}4t-KGqOwV19eFG3F2Xncop@iZzFtV=bYUSZk;?))s1G;bqbG*oM%C z*v8PtSRfRLZ3=B-;m+vhSVyQMwk5PB77PVrTSHr8+d|t|oGaQH>k4%-e|fY!))VSs z{)*`K*pASS*v`<-*sjnn7Uqubj_nEUVSZ0^Z){&^AM;m6_s0%|4lutrdN6hSf}cp{>{#en?0D!n3#*Bqh@A|bjGYRdiai&4E_OO}8sW8} zGg|IA$JgED_jY7woxJz12@Q?n}LOrTa zxFlQNa%CdwSkVQV<*Rk1W_0p6t;|yMX&q& z1YsgB2v@dTI^E@ujq^g(KR6zn7)V4eM+N`YNMgi)VsdymGB$i{U=RU9WOyVI5B``+ z?GNZ=^T~uD4j`>;Ix{dafpoI@yzt5-)qrSA@7UOQVjv;#vgve$l163Ah3gYSZ!|Ix z4`^kJC=7_=z}T=L8%1G47)S&(viVP;L$8g2s#U)(4c(RX5K)LRdv4LUXXza4U z^B71asS^0SI6f%E(K{v3!cv|`pU}otjbZTtgIq;}REWywmn$eD_eqtE4UNmy&m>f1 z%8@cQk$6EFpO`R~knKv)#R(qMib3+`r{m<<#AG6Vcrp^@1(AmD?u>5pkR?kmPOpJ?(cp+A{s$tJRj~OoSolK0ssO+KB 
z;{)iH?8tr6?!jkGWGs>hzl!x97KLH77Mqn%kWSF^S^`Fbf!7h36O4js)RoIo^a2>N@Os|B8+jA2 zAJM#HR)NWon&0x7`K`R|Z6Hm0^11Ak>$)RQCf61wv5T$c=OfpIsIq{jo?88Q`XX3v zv;g9J*f)i3@n0UB*yCs5!B1P-zsWx~i47zA$A|oEJmRZ@0LU=G@uC8dy+5Dh+33OI z=)sYJu`wYU_iyYD6csVD$A5y&Ayq11(J3_-rMSAs|6+s(?0hzXY@eX6hF=|sP6}~Z zPs>Wyp<6#DL?vrRWX)BqKs$eZY#t85O3kx2-LWoww$%w&}O_Hg)>$$@At#uN_oU}$i$Wez?qQPa&2XYUdQ=S(k>*p5rpUe1~=e9rG_h8?* zYad;B+_~6!G}Urc@*R7kKPH|~Ygx5fXawk4o`Ul?-7KIQN^Tn+ew0d}rpc~T8a6{k{ZrVe7UU!oV8D_X^nvn542OMB8Ms>1oaA=yig)Q@L-P&D2R%9s|r zN&VNkX?;?U6<)4}U+30Yy?R=`Vpma#rN_SXS}ZXmYHOcuL83`*4R64VmDrQt;C@T1 z&eQ9ffU$onaDk{+|HSw>c#vU#B<=@R_Qw^1QsDiAlOnjbMD%)4cJt#`$KnIA31IhF zbRs+~L?=Z8L$XofU#99VjumoFsARRDNR2)IK!t2TEND%T3yj(fiv&dFvV6)gBT5{M zkPhdXBpY~PA~7Ocs7jIO1z9^bCL2#PmPximi1;UJMRpanPf2JRQ}BXp$Z>i^>yGiV z{`k?;7eOL3evv2{+1)>$I7vMtRzu*AisHD4ZVg-)#CX6Y>*GRnNNFu=2WyfDsSPKa z6nZBvYr?W6tgvw5aGY>FQI3B94Ur*^7ZEU3Sqy`N>M&sAbMarr=KbUb_tfmTWs=O* zPs}w-Ztsl#4RfZUDc!J5YS=cTe`2o9xV>q2>-+B3l=)n${oEp4XX}#9nYJ}ZwuV&m z&PChKOm*#j%RS4?N%A=EIc83N=&HDDxns$As_ve-b7t;Hx^b-RZq&CH#8M zQsi@Yo|_9up3Tb!U3o=TXQ(QhIlf}&ytT8Pvyu6xd$B*Q?6|GTxT@}!-6{K~c>n6X ztKYc(#1+VeJoR|*p1gB%cJTg{dspU9N)6i=;=lJAu-;43&i+Nu*`MmPDAvGvw`8rH z!}DhUt^RLz-QRg{=QnmgvA1RH?p(<3d%Ne(Km7cI&oAtj+V(vf{N9!CURkU?^~C<% zQ*ZU%*qzwiiz)Axnd4a(SJg11|8-l|$(2>49e&B-pF8osql1z~?nLHv4;>F2^YM3I zd+W7D?_OlE=Q6BVICp!dA~+vUZ|#+~_NIb|Q?4V+I<31b%V`~D3@fj~kHq0H{P;5P z-d(~52w3~Pj1<$_X&v?^H=4(fg;czTL4*I50A^60q>k540XM3<@|tGaFm1GONkeIP zQVNewVW*3oKY#jPv~0pt)21=!fb{YSj4$sb=XHFCgMCl*pxnA}u7@40%F40U?sO3!e<` ziU;6>RKRvc5Wy}oS(9u8;Tqu~7MmDg(wu+|V3))T;^(LsT?8c%*cHR{X%Yn%DQk|& zI^;k$NKPgbkcv%0LKEf1A*#bH4pVB)HSmIQSvzu7{3R-a5!idk_bQw?k$4Q`!C^p8 zHVuu75I^x#o>hP^7y1>-1TiyrgUi&_-#>ouxMcRtXlD-Jwq%MR+WWShPd!z$6?3|K zwna~C%HBHD3m{}~$~axOd+zSKvn%cHklY>bJ3BJon!6)+M$+D`l6UJu(*pl{Bi|lL z?>H;rx9hCrJ$pyDWvireRY_3Bek<=vxqj?Ol* zsNT8weD8cbRo}f3dR)HfIYJdTs@a=n&&_G(Iu~7SGrdnyr1_2+QJFgb{S)_2r0cdz zb=xylwRf-Gxt6ZlCRJ_ARQpyehVrtRUI3zU29i+^%2;$Y&-5-i-EY2j>$Ta=U;9$Z 
zT+bj-z_gZcX{OO8Tp}jn$c`Mw85E+?aQH2ac!kE4=)gk?+Bru_$r+>wA}2n%$z`>T zhK5vK@TVN!|KqU6YiLl|5H%r^DNkOm!{fWas`45z7l2%Xeh5(Eraolg^(561%)Eh^ zPleS)oDp#rNOR2J)bJL>TJdKi##As5nR)w7W5@=-9Vs11>9}ryjK){Q2Ib^zIkDg% zbxeH-7&Cx;2p#}rt`tBXr4(zPL~hF#fbMzjEe#2kb%Tj(vV%mrBSSg)o!Erj7*|^V zY+RL>~y}g-=+c@;ZVOf&LotcZ{0#B;r$@ zpWgZsS^g19jT3)~PPnROJu_cUnU(Ew0rE04$-vn0gG~`cTqXINDwP;v;yUrykvrcj z>Wv-$Xh2iAx_=YMft$cYlrfz z0g*V<=4ngPoHR_qpR_3NY1_1QOwVgTq*$kIi98`tiv1>nfaNt!A)SiekkdZxNZOMQ zUPIC^Vv(oIlD4FsvDnt6gWg4P&SY8ANhy^uHuDVm0W{Kbl JlTcQ+GPx?lg9>`{ zf^LUsKYUq$_#LQjJUU4NM_CWa(uk}d6e7`pOIhq+M`7ZdmjKvU@l>ei>5KS0lpJxN>>`S$=eU za7|d87)BMZs&56qdT5zb70Mtm#wk7({PZ=xHvK$`k4tcFa9JZ)U6-!vkg7TstAaE3 zOl4ELvO}uun6VanYd558cS*IoQs(MEx4ARzUFr6dQv1nFdEIO@<=PA~dPA9HZ<#qY zYs;Fs^4i%^$+a=#s-2yZTmjJk4Z&My!2xu2r|exbXHyNq4_zLHkN3>iNVVOmoyVn` z}F(>2{vO*aU=*@1g~^RFzFzm>@9xZ0i-PFt(+TL7oJ zE5dagk{Oh}WmI-fK+n*vTsb%}#z*M*uV^pFWnFCGnn>6+U|@ZdEd}iTEd-1IkeuI! z6Q@!BB5fgM`!vz^`P5EL?fqMf$RARbhvA_A8BcZE(=K`17d;ziEE#_w?LR2_4}#Ep zT3riTc3G<}FIzDfeAZiz8RKmOxNetkc5?pG6K7A#+{0FKplmHQcoT)o#_`K^e#yY0 zNcce3fd>%d@QH8Gr~WdeJ8TKCVGqrPvCE;LGkUW#eNK0l#3t@(+r%CHX-i!6J0_9*W|eeAtt zu`e4CpY5_3wlC~n;qX$LKot+-FaM}0YWZaZ71NypP7?)#LSHialLX!R5IANYv?op& zAqFo>VPs#XTzWmxrpQIKr|G64WVx=#AxM4E6tm-fMeLqCPln)$&T=Qm5^?bs8na3$ zK%>p@IDr!;6es~qNdHFN_z+f?6by{g%3Sm0@xLyqD~ZoZ1gRx_32>t87k>@;8LtRo zzYZ%~)iE=gV-#>9^`B6I#42JdR5slG!ksV7?VRsjtPIXrmps)ZHA;Jek|((6*@o@mZA^PN zOWw^hwk2oHY)Eo$NSQaVrP42(;=(K8s4ym5F)`zcCdMF5g!&N7Ws@SqgGter0;H=Z zWgThFr%s9i#8f#ZrsRaHF5eIZ@BS*qNe@wTSDn3wdcG#J9*{aP~XobC6R( z&hL|RoSXuLeTP2D`49MukH8^D+-TT1Z(7hToL?|Y8+T_pd_NjordPI_+uy7C$l^9w zm&!d^1K#L~r*^h`PB*(ta<^tp6lCTq>t>InnmSW;osy?(;ezDZouy15u&i|}Hu#iA zP}QqoERmJ;EJGLYnWIF4t$>{>q$QYI_|mWXG6=>YD?4~J;^5K5n*<|oUXy}yn{!eD zBX40+0W0JHtJB#88(_Fya1e>Z|W!qVM%WpbDmF$Sr3lQp-8;X`@L6_!vl{uD;0Fcm1g3Mo8%M3RtOuK9a zYz-@Z1y2-1bR*i$KoadiqVAH~c-uOhJf&OnXQ0*3L;r!(#RXA^QHJdh$wS_j5iUtg|y~3g?p>+(%`ct144nM~5%z>a9}s*2U`1nUjC!uA9B|NdMl2C+-(g z_7^^^Yl5J&Zb!!L%Xn&m>M~VLfIjs*0e$MNDYFl!8jglkL$_q_N!xcz_T7(~7w!8W 
zYb1Lw2xd#o>=wzq5&PcNlrsC--ewdpI|z%*-r}($!ADN0RJ^PR7qR8|6rT0TmP9_! zXfb|89(q93sWc_(ws3@gfDY@9iX+W-)pEn(re12mD4E^GJ zD2|auj6j0qYdZzddK6E?0ls6{@hJSVLBRID4z(vJMvBCa$1 zs$D$RIm44g*ogm;9I7Zz14RoihJe_5JzfB9LYpm#bxKR|EjUDl5QCjv|HJuuC%!J6!$v7U?B#oFGk1; zxFJZTZ`!8~1>qF0yJZ~KBn^diF-!tPW4~;su7)H0wN)Yl+8m4%--H4GO&vIKLun!} zq0P0b2>wEbmJ6NM6uOYq6w<2kVg*DC`AwphM4Y|4;HR}?r1Apk+1jTl4LVm!`ZnrZ zn+GN9N~*!=N~ysl@`qG~qbVrj)Not@?@ViN=`pY>2cnXr%Ar(2`oQFAIEAthDaaO= z=tbkiDmakd#Ut!kYfw;x(K%^?a+bw7jskzKP_QJ*D3GhN4y*w7@Sjm>MunJFq;5(+ z1xYaho?KQGglQLvvQ(rYH0QKyZ``<{Y}`6(Y)(6yR05!stLUqf{@~Q-exCiS&E5ZC)D1unPV$}{x!4hPnx$s zaqh^p1kx>gq?SF8b}hE_NiP3VeM`E&ORDc$*tJ-{@3wWRWy8bW4|bii=9)#Kz(ZqiyA7LDYWUi-Da?A#s2?j*d z6#w_g`4KskhjA5O{36ZIA!T;l3_z|P;Hj=(d{XG_k;yuK#(0bBuG@*b*Y8|US9MDC z`&_E$6xydmtGi6VmzS-bAEl-@mlsTAk2D5snW#@OgKj!dSX$9iW`23mb z_Egn|Ox-SkR|ohBBf(f>4#xJ&jvV!vgS74(wZc?kbJ&^z;>pPfA0H&rQGN1w`zAsZ zGO{#iV&q{C5b=6;4(|oInaAg9fdxVl@X?@ZTdyK0N%yaapj@2}P?TZSV-Vf3gJ+;k zw*L(ZVfNopk%An5OuoN_1K|XN1WcqNyTee)iC$kt;3`m}yc3;}oEZLgpb}npB*_sz^yC3zwZ`+@7 zSEb#XCHLk`Reh#mb7sT-OzX~!uPJNeJm4>l9_W`C9c3#n&Q*Eqs^sh>GO%Vtre$}= z-;FJgbgnL>D_%0nkzJK|2$dJ9urltGkf%6L4&e!rs8~k0h(917k^3U)Gm3AmZV7q-fJ^a zkeRFAlC_WztZ~Kd4$0EEY@^Q$8g6U%M~+%#ce!u#v)y+hcbr)RzEMo|=B$Z)=2fk* zP>_{tXq!7f-#+(wslGF7qaZuyZktO;6`QkE9F&5s&Y8sRBe$-nYPuHsQqEnHW%tKU zM191;5l^V233)wZg6xXOOd;k65u1NhI$-%vu(4Nb{)TkW_J)xKuZw6v=3q|f34t*O zI}P;{9J-tEHnXF7#(ux8B;jy44}%FD&fCFII|S2^o-xK{H_LD^?}U{XF#g(Sl$Obw zhj&q4z8s9bldt#%^0@^U@_CT29E^T|uM{eHgeh&M7NO;iQZ+(6;Drd+%~uH?q^ZpH zPS01PEHCdv8-i;ZiBG;}op6&-T{?z|3ygd%O7-z|sOQsD>iK%4tU;=ud+eb6(11L( zxt`QvJR0#`Z{aW^n|OcecJWQ?gf|NfxzTG`Ck0yvRNIi1Z^alk@@?y+q+XO7MP()F z`F6C^&ugdU@|@s zJ$>fFlttD54^BB%uVR=K1lFk-7bC+EMTEzMt2qlL@k3N5Ya&bx(?3{@^vF+t%lqK0 z&-ki4NYq@{7MS1yD+<+FmD=q>K~w&-WYWl}OEOYmL=mad=CuhRIrcCQ38fR@5e~QB5)IN|pIZB79MP?&70%a_imf>V)YV$f{#YFCF67z($8d2m8 zZpr3Wv_xmGKCx}cxa&SFt4{g$E|%>}nfEbBBtz4Mgj{6jCEJ)Wm}12y{x6h}=aPu9 zFyoJn6CJmEy@j^VsC z4qQ;#vw4t$=mqwy!zXd!e7lf>#Jp=Bl#%D>O1E~}Sj1Q5C2r8mFqKlhPtchrduY#~ 
zbdoQ#2c#TP&ubopmg<>4LDSD37#(O-{!wzhEQ_sR+}Ltde!t~R#a+dty;lGbp^j;Dq6!$`7*fC!>YgV9_=gm zRUcMhv+BPN9oK%@_$ssfWg==oLIbnCm9<@cQ$YsdyZZd3))P()FiVL%crRE(pWXv1$J05V^%|KbV{515ZkD1!>dp(x(vh2lV^I59}QnP``pof#$tmHQ|y0P z|8pjsVkfnk0|H?Rr0jTDsX;cY!yr40l$c<$5obnpu$Op6Oh7EaY_G*HQr)JZ==eaQ zyGwkTzAVqfOg<*S%9z<^@ME$=5zYXOlQD^0Mp{wfI4tvF$}ujApHN$9OUq^I&?-cL z+!!0UBEZfAf?IT=VECOskgzI(S~d+d1EQI_WF&`Vsv@q>;KYB=TE(mgfn#}vg(Tzx zRIEITWQfh%RFU1}DrzNf(s2c4#{Uw|C#y_4GHu(^ZTqFR{pq&jQrqzvW2UM(UA0xJ z+6rB(x((^N9;vP;U3W~XJBEXRC2t)JKx~<&E;9XSU}Bw4a2oVglkvBw{W~T9PWr0P z)HbDSw@9^H=&O3EQ0i09wuQPy&jB2&w{$!__~77o_DL;=C{^u}wJm zBzwc0{hjEe9glY{22VrKRAr^wA!@2=pX1Z*d!_chi@tpbE6Y?j+;`q{!t2O1wxt`l zOO4y9$U3k~mELa}?pyC!pW4gcJbml*tS9XYN+gmdq4Vzx--~@W_QX!b`@jNeE3GpY zWU{;7+<$BTY~TAf|A$R259=P(J#|&ywcWAJ_N8&GS=X7a+bz}YUUcnw^pfN{hTP3% zGks6&4QO3U+ry3r9iNrH1?hiaZ$x@`eWv2jW6vMg|6%=OXUg?F3d;lz44vVqLK;~*kgZL(%8 zZbSlQ2Pq`2M-kpBt${)pYeTUz$(kL-DB#tp=V*5klm}7ks!h9^Bv;d1%c84&#*itm z#I7p$-j2^|f1?I!`4BQ(nme~xx#89^%v43)lC$#mqFFcm$)&gb3|cS3hFLTPa;k3QF&@i)4$|6rUD34o}yELqJxZ5hhpkP#x-v= z0T1S>J+*u$tjkX~E8jG2E-D}Ty`?C?l9ZfQDOGGvnqjCzluUi1K(WCLhfI1P5(ITm z&$Ok8ay3CO*rMWI4fn>izg-*9-+O3|+V=-AbG}tMd z2FIgUBXL3epBNPi>Vgf;_*DwR8Hb)Fw84@P*KsIDgbvye>?RFtsJJ5kCKM=sfgD_d z#4!^aSqBRlk!WsGa8fo`4so?p8iua6z$a@2Sx;rkMrD(VEfi|W?_{WS0dp!AnekN^ zEXPy26fJ2i`7ri;I59D`aXp4|ZOXS$Rs2V2Bt)QEon_yDvR6SaR_{;OZISA>{Au0R zTc>CGZnu+=y!~6v4+9SZKY&qA>&A5JeyMf;qrt~Li>)WHS#KL>El=I`vqN+I!z&N2 zJRE;8{=~iG(eC#Q@9#bHv}rpK%$#L@-(&p>AiekYsTJUry4hoMC+DYr;Mw!ElEAY5 zUE5o>cb#uJe^9yW(M#{OE$%=2nK`pQu6|p#j&r$Zte>p3aIV^)a2m_Lrxo=$Z(pqF zdDN7$@BQnn4q-ow(*f1%`XiOxA5~h8Y}NfygZ;=R-5+f-z%Sx@EpC<4nHh;0toR&a zieQDlpeURmNXVX29IguTw}aq9QsZVED&(#z%%=>2yqqdLj&7~ zhA$Zk`>=)U1SGf$y`+~g8h|iYa{CjeaRj);ji_j z+I}KgY={AB#8EgovbJ-FY#+ppQqWAsrWB$gG2-G*B$c&^aaqxV%U$ckmgVvF< zw;5&1YQV*2z@x03XB6thGDNN~04hPne-9br578WK*)nCzg3c*x`_{#WUwH6^A2_<1 zW^$Y4Y0J2_XKMXW(9SfqW&Eu;iKuMHNknDY%+Zf^Tv^Xb1!v#3XzN_SImdRE3#P(> z`8I{VJh}vm4F+)EoJ=?FkQ#R^HtxD@UUcmSN_Ki??4PVOed=!e*oYcZj_#kuo6*+S 
z^@nxb{i?%$-S>5t!}YrF+wF&|b>FWxkl$}OjN$t#lI0~*@H2@N?V4phdc3}Dudr9x z#Suum_5rb#-%kwJmrA26N&{SZ1G-i>G`Ivn`^%fK&vMwcxa3mOE5+Eiu)ClrN?Kd6 zoZK+S$^7}ul~l1%w8j6kAEXvH5N8J~>u{>y@Yatwr z3)1lk1{xKEnI0 zT3o&Rf;8BRm#>lcRJO&3x|R1t0mLEDc!zHYTuP^KO(gsN!gFU zHm2sOJD90#fng1f2$y^<8DGms23>Q-ip5N)gk~HadpLV_%GUDKjiW(IxMI*zlnw`o zR?b?IIs8Wr08cW|n6)+;n-; znsh30Q)tt)HR&paONlN{=A}~8u0**y5*WX{#8eH#iHJUF#xBo|{d7fv#x5lfPP&p6 z!(appJ0EQ>*$y5OWfeN071TZUB+Gd-?CBiT&UY+t=jLm={4?8WMK1F@wmdiB{*0EE zC*5mWTAnN~HDaq;T8>dFao)(=8Go|rGwO69cb(eP!dsz_z5?q;CPgnFF05J4Sl~0t zq!xX4nXck8leW>kyuHwhq(@z?8m!hgUIM3rRTyyei!mfph#C@3Pg!~YmIM9~f2W^^ z+173ZK@h3K^(G>5;8V>Tak*RKYQVp7#1EGxrl1joB(Aa`5IkC(F=-Q53&4KLS)7u} z){3~ZRI%Wf^&`Rn4?9RuS}+L)Pe3O7dAlwVm&9_SNX=vl0|W3uz(@9$dJ@u!#AAq6 zl!#G8y1{{jNI*v{r!ZU>%ZZoq8K_ib1$iiy)5nUdHRVKrarG{*Q7Yp#I1nn(1uuo} zK-rqaus~D7HMu+|2W7H5hPPzhD1S}7N7?^AWv?J*V`h(uEQ(!Nf~*SXNN=<7+l zx6d3|+IHme5oy~A$-V`4ob_8W&W5?B_nj@N4LcuozQ18###Na;rByVQvnRD&b7!ji zh|~lG*nrX-JLXSG4LfJf!Vgs$XtbTXQ#MyKKk%R-)zv38!D>v~P&U(#Xh&75reppU zsb-sG-!^j!niq}1bVC=Jfp#toJgQu5I5=|_Tw+UL-jfdOl>&Poz5L#}C(UP5kolpe z*87ork@?2do-?V9{izEVpEO)z)#gj-T9{mH*pD)x(*cdUhK#Ex<8H~gn==(|{H{1m z)RGks*9=2ePukTixtde0+f#c^y*EHiY8kpjd0Xzkbnm74ved3qsSVGi&b{!Y{yaKh z<{CQE_1mQSZ3`6(7Z>aIvQD8KXN%-)nS1qp=a#1%x4dh4%d%9x0UG4#>djL1=J~$g zZJIwWRrf%lENjwNdY5&^K!xOLgZ5A$sHAQ$m3l)Vb;{Mo^vToqR>|I)YTNUt_PyD2 zTF$d0`$BGF6v1WALX444t8Cl|OECI@wB?AdRxD6+_>{*t6F0&n7V*lH)D|Q#rRZK2 zH;LLLgp>vP7Z{)T=rxZDIddm&{sOgIkn5DvfyHSjamqO=SuF+i<+d)A%lKO7E=j(QnNuW4+Vm~qVe~=tiL-mDsww4a zdWw5m_N3h{lDlQm**dRz-x>I@s`i^l@1MSRdcNjS`IEZ6i>lE=YCivRX~~InA<8 zM-qNe&43%#Eps~n_K1=Xq&6Lt8bCkS;_8*OGblNO8DCp!Z$AXZuC2?QmXz?cIlawQ zye7s;*%L&rtVdJl?%KfRAwHxRzk;MiH;Bj;B-mbkmQ{SW#_9olHtiJ}J`yuPQMzKw-t^!*B3;tO*uxrC+Y~j_wKpPr@eb5`t5sk z3C@fyWAmnL4H_PCVQZbkM*GYWhPt3_@UQppS*3BCQnZ@c^SqLBwvgMYY2`iBBCP1 zE96AUVevGqvSWl@n2O^{$erTPFv#qJpsB|7nBW|lH%S9Yf&lD77iagUY@0D)GiR1m zB$TmN-oBn{-tjnwUEzZYD-P~v9KdL+<5#Acuw`E8ZFyB zav9wYk9(=Y(eYv2;{tfc4xXsyRaex!f`F;^^*X(}kgrnT>9$Su9sL0|rcIP~c0w(8 
zVauX3&soc+-2L;bI;mGX3EmTvUWESg3L3PPN^T)ibfEleo_F%fJRn~-O&wI zf*{h%b{wSh;arR4a@c!;dSuF4xpXv|(s zT<04!ZqyWZe#F-lz9WV-~>ai!?`=ZqQVyb@IV*TgX#T+nIYhr5!EBvpK z6KlOc&@LW=x1iB1SH=X~?Hs;*U9p!^br-f$s#mEuHYT}+-oVH>Uu1pzGfH45=S6ZD z;-fPb*?5MXV=Biig!Y+CR%{_3Nm3Qf>9h37Ko`T7gd;^liwu&9!{pmcVG;8A$(JOD zmYPVbT+G73r3(-;(;$}Cn_i}b(nGjd|1<``3cu;u+B|o>Gi1l zr}WB>YK(@a`O0^z->QDM@vTNOGH!YtCnMvgY=y_*o;&*R#DfzL&ptSt-Nv+TZlu@-5rwYe$`-SA#R0 ztd2ZOmd3eO$qI zW=rQsHm9L}$z6SS-<^F~1N>-%$H%@*^aV`-sB23NyRsJg1ldr3H+d&H$AfgpQeF(V z+qdi>U$dS%Sye{~vIhDh6=a z9FQn7nMCu%!}sNjyT!=I_$_a^mEXE;FJF(}M$-2bHBqK!hpx zL)+#$-9x2ZWlcxdBGJ-pGeBR5{+B>-Y)sLPEp3|&8T-L4(E3~~K7vUF-|)Rn8$!1xonx+sOh3x(tCzha0R zsEyDDP$x(!g)Q1#0dxQdPKDIHhVTzm;tGH!)lqQ^MH!T-CnO~@^s$Y6Orgy|AUc0E zAu{0KN3pL_#5wZaAm<%&{s4|#q5ekpAf;%m7rwiqP&U5*0*jJ2p&2i0K#|lU9l?IFEf@Us>OL_3l)^G-Y zs`;>_KMbl*Q#ES=-KWv6Sa)d}Zg<_?eFvBKz`OFQrbDwmQ_zPg=7<(rn1+n=_W?OnDGWR$A1N e&}*7!FO$F~1u;%&8RoT`=4Bh=K4Nif#Q!fVwmWzL literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/dots1.cpython-312.pyc b/model_executor/models/__pycache__/dots1.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..23d9ab6a881fb97c2cd721a07a7fe785e38214cc GIT binary patch literal 22344 zcmb_^d2k%pnP2zZN6&$C2EYt4xH&j^fTwthw@AViDN{05@+^oRkRWFOsAoWt2w=-8 zRT;3Q2+XdFV67cO>)IF!l`_1Wl&Neb(pENJxi$xyL+^|_P?lR(Dy2&P7#Wo1b#2w= z_q{%5FdER_t!;vDUcdLfvtNJj_Z_eQ(&KS)xDu9ckN&Ik9QSj&(J#9eSvzmyxZ9k_ z4RRtcni71{G-%=}Zcdn!mO%@PTN2i!ZP3Qz)`We~j(l6fo^%X4SeiZIOu7bLEbd6S zlft0D;?9I8=^ga4xGUjH`Um~ViouFxe1raj-EN8H^;G2Af#AKhd0Q8Ej$kibQL&ZLp2S zD--R>j=>HVuS#rAMhBzG&cRN0U!CYmb`N$Zdj@;heNCb_**DmSc&!*nY)Nh%+?w1r zxQ*Qh6Wf#hgZ;@JgFBKt2X`iS4em*J2AX?>y#zxT{) zeDEno*#OGWZ=l@AD8r!K^cyJmGs;F#Mt%e3fw-5^Hi5SJ$+Y*h&JP}BC0bCTwS0;A zw)ob2R&9KTZd#*lQw^uaM&eR@Y&hOCK9;x{5y#W1t&!xo7*F(mLX=TnahyuWrJ+lS zIO4X^RAO|PV&=nRHwW-*8XH6CK1v0TjE{|sUdFBSa5^0yOOK9IUQKFpVq#oM$I|1{ z@Riu`cyeMCm6;Xyk)h!$akY@*K1+>iH?E6`L{h8GtO%Fm>Da_X?6NdIIia-D9coXF zOEK2b*u;=Dlt{!Au~){WgcwVWPQ{f-c?!_Qq_{ZAhTys|nwUI1CZ3&4pqt(cSB8*v 
zd@>am&yF8Miv9fA3j^a)GHOxk&&H+8aq-A_Vlp{)PHXSX=vaJ6QYxN3eX%6Te?C5e zc^XQ`MV;vU_$%v)(17pE_~p@b>YOw_98Y09NUD5xatISQHAItV%%_9KER9dZqb9{` zRCD%Zd`MJkFOClnUD6YeB`?KAadhl5C~L&{$k1dW9eX7{dihE^mKYxrN)u}N&?POdWMdzPmvz+>br-firgbV6}Or04&e^_zp zcjwMUZA!)A$@KX1tQ};tcD(1#ogGc3(3dkqH_>ijG(}G_siv`~mW-n@r6W0XEgp*} zC(<`#qeabYPdODE85+jgxv97rVS;K)DS|qT2`N4@dSh5!)oSR|Zr08r@=Y+#I5)_H zd77LY{S)1yDQ*_c_-~0@L~GnC@>5{7qAhL{?MQKm{D^m!8{tLgYaXys*K6FMW0t$Y z4>~V!Q9*IA#eVk8xdFv}WmFX7W9pi#Sl4;B3|(422A~Kp)1$@ur=|MA%#bj zQf%r#6q^)JPfBCMI`5~kIPstO(%J(A-{Eg_1;#aJpXT1cJFRm8o=aKqH}?1E~fE2j!RQ3?3y-BW0wKFm6fofgzcV9%j05&`pu&K z+AfZhd})tXloK7(X0+n`p7|cF8eIRJPwNlF=q1{9sJR?>lZ(0rKEVdZ9iQ^_UdFPE zO-{rnr17h7n-$BY(V>*ogG{Lp!IZ049Diku&9q3a$F(P$RAG;{%rl6$X&RA?XF_cQh zVkw%Mhl1zH{FRx z{6>5jucD;hrurC1@13|Q?LnT**1Tb9UJgeCViO4eiVc zft+X4iqN>&BnvIMirQRVTdp=V|H8scD^*>O-L|l6*7DF%_t?wTcjbgI(W^uaQI&3R zrE2qIkG&pruY2=;PN-d|ec*0ftqkQVJD2t@b+7Dve#QU7n%Pw4&2uJ?_pyx&H2q#; z*7AmX4V~hKX<_IWtyT6O+(6_uH_ctu)|gH}ztgmV;Q0&?wQTSNOf;4)L(jm&UV}ZRg zB#Bdg;~{7%No1#}o>#eCOXzuSL*S?FH)-ucX~{;h#cEA)Qv{#9s23kDNv4J)A+^-q_>t-jN9_mw2{wP z(j7WIsLIW<1?gp;>$FLyphY?rwLKXS zJU8X&)krU+CS3W0QJ>;nUyI@%n@s9eD%SKk5Pea-G>3{C@*Sj@p#0F}1q7amN^4e|8O1`o zt>R{Vg)nhtT$Bi?Do&DzROtef0G ziA$KNYl@eqlFb=`&`XMaOq-(@soYCc&R%%J1PY`J6fm?$5LQKiY%isr5ee%>=^C0AVh!rY$E_W`#|%u!%)l zbO0Duf!&u^s#*X5YY5mmv0sDO9hTkU#rg;CuGKBuek}Y*SoPQ3cHeR@9M6Wf%c1QV ze?M^ER`2Z5H&4x-`kBxJB(QaddiPN1Vf}2D-OVfRUN(Y#a%A8A>)C@BW&A}ht_b1P z=FVKGIUDMgL%q4??YV}gY(ux)(47lMbB(RJ`p80Xv2i7^6^OG1&HPMg%vW+P9kWLt z3XLBJ+Hwtz*@j-Zp*I)YlnwUD!M}~Qz-BqHS)GU%R;t=_b&c7&Zn>`eQFB|ab@LBSy>%+vdQ5IT z_PD7ch#7iAV8BK3=+a^ut2QMOA6tiSY3pY20E#TaxBOKklwJm^<;$U3k)Jl-(=hXI ziEAx0fF4(M3{}(|Gw0v6UMwBvoUyKw{Q`PyQZP)Fct2Ry>SZNYHzbmPMjjkq8YyBC~?q zOvDw-_0jk%5@AUxK>_h1iE)76r5LgNsGUK3wMG}Q1h4_mGE9=9azankc{kwk`(XP1nSLe+RPE+Xx^AH1%ei`epn%HqE-`pUDN=vO!v( zOWT&FGQqvGo<|LhclX@cv-nb`VLJr2S+kxqWZ$o7=b0 z^uXP)+Whp&k&Bt;=aHWCHD`Stvae&wwfs!RcW}jVP!&2%1JX(KI_j2A;YWIgg3}bx zxS4Pvk%*u&{}IF#o2pr2^B{eR^3EYB7B0?HA_Zy6pj;9z93)&ovT*LVx*qu(@>cvp 
zve2anJKZ`UJA6h2`LFRl>mh#e%RhMWtrzng#h2qhPW&kGIg6{Z1T{g=-?#!=Z0#qA zl*$sS0tqUliXs(~87d?TqzEe{2FMa|$B2p6x+ZNnpk{K$o!Xr%?$ToJxLp*WXsTtB zhzF90TlB_-tMx`tjELylAYCY*K4KI7kWiq)g38K=_o7nqG6}`d$_}XesgsuT6(|v* zO`|B3f$WZLS&EAkj81azrh|q|wz>Tt#~x z<8%g`$H0hNFiy7wo>w9Wc}fK#@F6ay@iZ(5gHzqpW@yF@K2R+9l;u>@?kQ}z5N-fu zQ>JI2mnzTEK!>(aUqMWQ1|f#B>Q$b5O}noF0Z7#w?+8V2fFYp$6Vw&bdjnz5Gf)5#ii22X47>kjmEzHRQ_kL(!9q!O&7{BMQ}*6* zO;=6a6bjs%RDetu1^x-e+fXVlO`{fwjzj767_^B5l%%gwUQ!ME8zGHFof0NXvHs9}u}+mtlk~fk@dgD<$fDWNu$TH%ENlhVZ;OV=^%p3V zY6oNd6-2K3B9SZC&`v^CaJ6!0rgHbJJ=eII?n6+@W_>NPuVwMF?CV*sc;MT%;@Gz0 z+m@>fXX|?8x}N2LT(_IF;JM~rD$J z)t(h$&q~!EAe!oih3B&2ei?u3vQ|Tli~M4Lwq>u}viJUFx#d_Ubew9hTMafY+5pr} zzu%ud^1O_{{^v8n7tjGuuD1U6*sZa}^bcm<_?8dRZXssD%j-`+0=sH*}} zZ_nJCSrp|!C%dn2#g8x7ae&%rWnxdST-Q76{WusUPT|*Jg(*-Na^&8{Xf|M)_z&oQZOViYs2#^Rc#P9160KQ*v~bz=q;%o z%++nlRWzUl5H%2y4;G0U2(JD&2j&KJS`{6$2hu6mbyA0Pa|V-UeGXNcg%u>QX+gZ3 zypvq6&2Bv^Z$0|3_82w2S{p8t{)StX?wmRzN-<$!nDC8$5uamt=Q%8@4J{XjMS{tA z<{7B#%1S-lRgx4-F7R6eqZG2rr(QOTd_l5%5j~r+&e&$Khh8*6747~)CQVm2?s2P&sM!hSyd;H^jXM1ArNUGIkheGEd z!_#i7kDfhO%rt0;J(gTVIc?WzXB?n1cCrHLv_qc}R+|&6gv`?B0cjjWFyWiA#ejYO z3*14@g&#s%`WE8Sw<-8NO0~o2O!6<=BpCOXUZ%|7r+{re^Ax*9!J7!8E|nR7hZ2$q z6f=0fD)90zEBrNuzpl86v&WznoR~~w6JWn7!Vm@SD;=T=P9T`7E#bVH2wVZhN9zBA z;8hMBv9k7d_*QsfCR5QfXPY(8KAWqoxgEU~U3l*9@7(#FrPd#IzukS`@}BUnuo61< z!M6EmrgC7;PA0M0A$se{XvW`l-f=EAFN!}Hduwc2{BiO}$xQHQMmUxg&dS2s4~G87 z<)2>82p3^kD=m?2-zB&2x?lZX_}y?Ocp@X5%nIjZ;apDeym?~o1e94G*DQ~I`m5jk z`rOwSZZ0(~J)80MX;wL~+0k30;LlqZhcdOBXWhB#;O*15PA|GM)!lQ>S=&5pc{Q^h zRfx8I;_u+Nfls&yLQSCdmkQKpX#R+oDAo~pbEFlEI69nGrE7w763w?Naqpqn2nE+E zph+(;Y2Tn63VwqBslP!$lD3x%H|Kh{Leu^4l~)9wNUM!|#Z|#CZqAc?XI8Ia=v5dI4i@ zLFRDr4notx_AClZWOUaTW%TlUQaQOD-X-uaG5DOVAkiboEE>}2jD1}WTy=~xj$&_> zvi#rnj$P-IqJx;NemCQU1X3i=7bI-(+j9FVUykoXH)}|Fs9d|i?ZLsP9n5#e#Qb1P z@3`*KTZs#;YB9w5awf%Z>_*?p8atVGAC7bSF4U}Q%3%t6 z=$Q+4HJTLZUsF)vWF_+9zoA?v3%U_2ppjxhu%5w@VwP#hf=xWO#p?)27WA)^1z~>b zksEqQ7z(b|gcdF?zPP+2Q?qB*wF(L1`r_A?)0x1&Sz)y~y41e1?equFWtz{gI2u>0 
z0}Hmh!X06$YC*^ZdzLQA!L6CTh+KVXRCV7QZw;5GJ%YPCj;U!AOzSZF9DHuiOmtSds0G z8Q!YrzYg(>3cY?3;#Z_&@p`snkKD0`YSJ>v#q=2)?H{W z3c>B6qYa>;&=?$rh4=>T>e72(CS&asC0HruNnAOiWy{A%^I>iLBSS9%qB zewoH4nA5*Z5KyyMXkV$`n)b&niD)_GZ;%e9M_XS$)1dM>c1x~15aW~`w zt+OX{p-48gMGkFQ4rf9K=gvNA@65LEmD~5;-~YkpO#20_ve~l>yRm5Rdhd9b>ptjN z@RGX0lWT6zHXoFm55DjD;6|qT1=-OEQ`HS3blh<)0$2Be$QN3;vf^uBakR6?{kCaf zAnGXvzcg1W)TQ_M9|Lv&h{{nAp<#X(5!GX@dkYZmr=Bg=r+H>oq6pR@P^O(lfo+@f zP8NmBnubo@>|QXyDJu@#3_R*%St9SXc08&I-rjd>U*3v1stks2Pv4r(+v(21A+P&Z z_hPkN*_L-wl8bZs7)o{HPF?xb-E}BcS46KJ#~_MPs>lzTMU!a$F0cDFk60KQwY}zr zo7)ENY@%aBH#aA_xxv^~xnWrtvC4rXQGl7t#eAwg8&IgZ8?+r~xg##o` zm0abtH{T*hs4}$QvLc=IsU_F9VC5;NxBr%H&Qgqd=Pc!PG@vl}QMy8}`kodkhj*p! zw1LRV*NIdEXX!O^+?fcnTSC2tgV41bf z+Tg2ce$Bz!vY;(Pejtq(>htULh#TWYoS+H2%FIH$o>%ylI?_@7ZiDe!t`ZC^xnR|y zHG~Uo8nd*)*p2*xjYhPKj?1Rv;1GEgW5p+#a`N*rd zm~u#Q>`M$KFNs43rrHW`q=ChH_a$)HC6PLSh)VW#;^ZU|0i8XBCy>9E{@@S9{yuZt zLSDv4VTxfWG9{64&#*GnWU)OmT33XcscVY!%835X7jPx@Tyb5e<2I`6nuiiRIM$*< zVQFaWnnW5q)w(Ve&$HA*>9ly13TC8H&yA>kQrs!nhBO4MIOAiY3WgP%mM@?U91g%4 zAZmh^r;3~{3>3Fft&P9o5KYvfqULw0@J3HVg~`NN0Fw{lD#plFWgf^CntmJun}|r&jwe<~*T=t1Ciu34V3^<*xlu4Q>IP z{h6biEk4DmN;sop8K*DWKG+sQFWQEt0plpb1x`8wLcCdjh9A7X2k;~P6hVrZiGJ9? zr!GFD-6>VVVH=#r0F1m_^bqM?hu;y|u_CfBvRIe#b*(tMSpNqYrc|oeoo|A0h9he! 
ztbdZ;QaHbblV_=6jUS*kjIC=>>9Me!&U$&#A3zTrHyRxhE}G~543FTMuk*ehjNsA$%j_#CJx#j zQ#lR)>lKzFj%kdyjAbO{E$lK>69AlH2f9Eue0nk^7i&?QJI$uxn6`okTLy}_n(79p zayXYn%u0Hng8xbZ?Yh!`q2NDL@aGi#gaX=5rN2NBbr#l^DpdS8%KGmVBq{h41d5fh zDCs!jDY6}bai|yL=c>F>S;RRsLF4yn&3yoR2-WZ&WiK4P=nw@~e4BHTo?D(rk=AUa zUyk%IeU6h!GSjeMbGxH1J3{P5S+|24tN%p| zvv~EFTlJ(7g^x~I4P-fFh%zoEJ=DEl|is1bfna)TMYAGSC4O|VmL!O0hcgUO?3%Wz|F*wm@i*M z-P0@S^s4FswjhStB1v{U|rjzqFa-*qaI~D?GVu*ExxQU6?k3lf;pO)@K|u&KVapyteB+hMO)giN$c}gm`lBO zp8VGg-E-k-7W|=ktf<$!FcJ&ie)~Yw3Pmxq_ri;Y@qj--DI@KgnnwB#BTaKt`ktD$ z7e5lI&X|7_h*&~~63AhCs!DTS!XDo{I(9uqn%yY@N?rYkN-tcO*f|fA;-G^ZA~ehZ zvMQ@U)U)mgdQ_0Uk7{wA17|eE!b$KHDbdr?hm;2!wW?5d01l^K8BJf2TB$#OLjjpM zq&W&g2o&>>C`NtiZlzE{E%5a4HN9$sODXONIu??G6Bf>6d)Rq+t@)HvqxDuJ(HNUz zp*|Y4>M#@+BT|{BTN@hDNq4VO4ZVyx%4F9s|`X+?V>7s~=YF zq$0r<7;=Nrd224xmW^zeBirXaANs=!@x_;xPd@bT|IlBPZ{T*G;ny~Cksb3Vaz~z# zs}IdTv*=#>%6$g}g{4+GfYTZgoZPs(@6Ns@_wrz-;n4iij|06+>E-7h2KFNJQK$o> z3H8h$%e6*Tb@Ke#538GU!7$wBv0LPtqPgZS$bvPspTaMwKHrAc@|(F}hB{+X;{;mc(c(jRRqm6A9RWsSv$Y676!*&nftnf-g~U z4neW&{f{Y;0z&Pn|Ah#&q>q|ARxFK=4j=gi*I{)nPUY$6{`Sx4Cx4#jt=&sCKMcJc z!bu`)_x-l_y5H?yqiEjG^=|vr9kPZEO(^0(Qq^@#FKVYd2iLf5`KY{Uciu_2E=^6! z{Lb=J7P@FvF}f1$lAT>^9!lG1Lr0D-)hwl!kKC`iKfKa+SdKm|w;oyJkdyZxw1yW? 
zE`51<`+X~O5!rrJ?!lp)<9QA_A8h|1_0!#-QxSO7g6Ul=H>hKS(rU{XK@~ z=LB8`9laDJ6ZFN9ccIlJB*8AdH^uoyeh6jzpyOARDbm!IGd`5{OeoUn!j=zt$;!N7 zuqKS>uh^mIC;PYbr|4z0szfCy{U@ZVy3z|2V_dR>V!xt*N!Lt%CS|1b9tF(u|F;xt zp@3#t@vDc?2$8AZ{E>oJPVN-`3!{oP-mv03ktI{tC7BCTXxG~dCTEUmg`Kjn zlTNC6bln+KZGG{mByD8$e{DAcrZvNF=~YgsD|{YA*QA-$C&)I?zeE+qdNighnTjxB zwRoaSv}+NXZ+gB=gZAds+*H;Vm3`5rXvWvS;^;5ZqDh7lrM#-F7qx7x1tf#;v?2OK z%>|{8t`DSjs4t|SB3@2sQ&>o58mB% zYfdZtxINgOi%<>ehSrJS5CYX_MQeP+v2o?pB6wQFj&Z@T4nw#)R+IE=q)7B?f#I_^ zhZ=>toSpPc(ZaynqG}(!^uaNlRo9MzPBngEu!36Gr3u?BaU9M<{xBT>5 z)5|YqDvm&#ZMbbpByOx5tg$fnBBxmE0|^^@30*Xl-*#y>gPvhEeW27J4U|5ywQit{ zO4F+pFV$iJVh@dtgeS0tisr@sOvPq4nyjx`_BAi=D$@BH+aaj&)jaJ9?nLAt^0w)$ zzVPH2xBL**NI3*WY9aVR;q2TWbSBDV*SO3Cd zsd3@`@dWWw<*nvA*QXxWCfkao9y854-cf+tJMfn7 zJH+RDwtnjN6hUspiy$}M6+v#gErHyWR0_E%ZIjhn`zR1%J`7gGQK^c{7c8?`<=TpHgmN-@2*VoTp-T1f&_suMJApo)-&{VWCE21z}o`cZqyyicdDR97?g zkWvFnB(^Y`VBZbv72!-P4dF`?DT$z=>J`wded~wJ2a=ltk{J{~D=k9)l14AlM>14! zKv0OCC|BAT0RtdVi>jaN(yWucg>RLq9}#PP0__NWst{jyimU3`Cy_EGJu?=P@la`c zLajLAi?158FP^dUUF!EYYN=S)lZr9cTZC&E-065i3HCWW?MsKuXZ?xw7c@|D_I*S3 zeA)IV)UO(9Nf*OD4#~D``ZA(aPfwzO2GmooxcXpJ4%@3KcP7)LSX-oQV!9{A!wRP2 zFHh39GnvgqaWV54zEy}%J4A@GiJGvgMHwG&q109i+9;rnRU!zf*v>MBq`F9+r{o(H z{1F8VXwqx&HmAN4hY*LIT;*X0qy{h&{BbE_NqIBR^Pl!}eDnY2T0i1KA91xm=Y)^A z>W{cqma_k^xqULX?{kxdH+{+>_{_!e`#<6ie#Gtm++^iVYaD{l1dcz+f5aXCh&%MT z$;_KRcOdx8!STKShuiYG$&M5b!Q<*yzG7B-^XA;m#~h-M2ly}X{2^XXVmFUP)AKxE zJAeN6S8jdfF-P&oY13hz?^rO=;f%)|#q%dkoGI|w^qh$g&sW~AyH&TwQGCseC=lkZR+c$6BT8Hew|Rt{@4LAVo@oC{d)SeWyh2TWP6snNwv9M1T@0fTSKkEiBM3 z`zt4)t16PnQu@7M`Gw_wqu9~y*^ww$#H*4FY04cJWCr) z9CwR*o(pqfKFY=T0X}9JF!1zlh#F(20aMI8V2)V^EG*0zEgL99oH1&R*#>M%m_242 zM4Fgmz{x^Ph;NkGTij%x{TS#5@BY<}Zs@#;OLYV$}oH?A;oziPa9&GQTZa z7xNByncp6*k2MT5GQT70i!}{2#hM42*}F5^67vuEnco#{jkOK5F@JexPdbfOXO^L#ZG`T$aX+>M7C(@&#};z2wfG~riDHqUi}(3@Ip8cDI57`>QKu6 zhSw|;-l-oxWC^c*%{1^LD{CFf>WbJ#zLw8V^=ah~Fv#_Q?0yF1|G*%70J-5AkS{UF zUO;Yq2IR{OauXmoKLhep3~~z~w>|?h$RM`?a{DtN|3?P71CTqP0U0uIk$ofIR$DLs 
z8`&CpK638eU21!>9CstfJ^DFD)o>k;APm=MED z#p$b&(G4#ge@+-1jKt$(0>Y~M5{bxYVt8zHUuf`BWHfwWbRvfGng=gN1}_CC#^VVg z5{dT>tXmC7)uZhAVghnq( z?aRks9TTGAV0?HoLPee(j!yKAhEGgH@m6-`#98+3CaHS=Sac#b`ka#M@!`=(NRS#% zM1%{Gux^~^^syI8z&&Tit`-NOkjm2&AvEA*h(1dD+L~}=C^QjG1g}PhFI-Fnqhq0P zL~OxPcqh=-!bPf%HMhY!b5Gp(PcdSw`}zBn>EK9Pv;n;4FwdQyWDrRE-r zYGXyJDIPtj_L0${F_hxzn@Eg(f_)~cPoF8dlvf(6lf?nhM*9+FdWB7jfCZX zmYniH8W$o%!`JW~)#{Kc8BA3to$ca<^eaC}m4jeji_S~6IQtRid+8;`U`iBP- z2(3CYiZWu6@GvIFIX0|ll)3uH67f^}&&a(bRnY4TtaX%rGe}Mu`YeWjEF?LUw~L|i z$WM8iiM-S*OpFG@$ZShs#99$NGxS9*X*bJI+T0|Jaa!Jd{8a#{FYg@E!>bUi1%BwgvCw!dPrP;U6B=a-$sm{seM6 zPpP$Vlrp^Ce^SevBbDTahcqJ5_h{PpR z0*F&GjE+l2-4w7%^-5iI)-hWw791R#7)`__ix5Hgj1DT;64l~3?8P{m z+e_54x40xfLb$!;3G-otiY=!M2`zonFp0@~mJ1k9N@mt+0>xo@F=C!djw2`f4jl-d zJ9X^9Nswr!cqBR`oCJVm2X+jy1o1eP4qPWt-$~`POG`R2=WTju-HHC-wDLwCdMfaqSeXkEod$w{d!9=RNhMn)w&8fQ!cpg!bjTR-K z^GG~k63$SFX&9u8)Jm-@PeYov!K=fGi$L#jkhjVlmpqK(mHSHW%79fkN9BEj9O`G` zMRHg{P8oM2BdUW-d6TOmx#g0yy2PzWhcUq(d0=?*-HkpAgT9&U{6)rw0(=1(#>e2i z%H^C~ZA;Mrd9-rPJHu}eXI7pTSDs$t_*Hh%x+3MAiG66{Ht*$U*WBs4-8EnLprPmf zxs*F+;at@-=Q1^&VohhZwk=!No^9;NHUx4uu6FGbXRa;J8Oti{Ij+oRX9Mmoe==Yf zh`11Tk+Yi|>L|&GErnzp9!&@@P*^!RYsjJDT#DCDP#8Jq@fW`c=QS>8Fql`*mn|3< z&MsKR)!T9$-tP@A(GQliU48tAwraEek*7Lm#xLq#-8kDjXPn(CR<-BKD9Fmy_-6Mn zw)QOgdc^7t3+Ke@Z8=JmvvYReV+VXPa;2iFUfM&wM9)$bPBDfXGK3AlX(r$@Gf{70 zBjGI>hat{{IKp*hVT&BEU^fL?fdh>OI9u2nu_GS`@T3jzPV5EkQjOC23Q?|*;KC`? 
zWUvr}5y?RqW~l+8n91A-UP_-4f+%^^Svti7Cl=xc1#U#8U<=ZP`Ae(OHFXIU_XATW z94bJPS7-C@aKCF%{h#9#t~^B8sNz{+W5S&e*Q0(Dm1?|}rXf+IhJ{VTgw1uR=HKIg z4c`=0dk;{+73Pz|S|pwH9k@0g8N@c)uQaf{9a!tXFqZI3E`_!qWII_lKm!gCiHU2H zizevBA*?mD$(3xu;2<`_MBmZCm-pfa1Yt~&7xc1=p*XfRf@H^TVKklyjSfZx>Kn-v z#>Pch2XDZ@@QGxR^NbUIRTc_?hTP-|-9f-s2CXKa0bo1<2MdFz=9c%SH{)3)dRC=9 zfs`fduF1GtM0ZQZy;^jyoM9Cf4K3!rK&s zq>o?&(hkopY;S7_mYb>2m~XaditD z3tdyzlEfJJDzwxTuM<*5DM*}V9HD<1`!~?nbV0E=ikfo9<3%H?|rQ&mg(n}CfI>F_G|*vhTlk}_qi4WhLnZS_5{ zuFh7ko9VxG_~zkE^*XV7-9xAEL!+_A{;0hALoT%Ew^rR;HQRQxbIv1HwjpHE(I(nfyt*$-5HtL( zvYTbIhMSJr%VLFJ%tt?TRNUA$y=_LA8JyiP^U1k=4;-r&P4c!wvLqPyO)?J&k;`$J zRM9X3$|tPC`s>?IAyuZR2enp43guC8%oHv9PuX-#84<6uh0xNQuocnB7d3Z`4UKpi zapXf)A4V2!5tVxBrZ=ht&r@q}Oqz64|Mw{!o4BNrwzLH#8K>4eX`&cH3RC8!c|;@5 z)HdLNiB(vFkT+o6h!#Fi4s~_FDw)HKxn>>$8$5x*Iu^$+V0curO^n7bPedY<5!$DZ zW0%E7r({Ts1uVj^Q4TlYNcJ&d_(G6nBK!u063PUcnn)1$ns7fYo9e>Z2(EMVdXUDv za6LFa77v1}qHt&h!V7ot7yp-VuozmnO7AV-P2a0Wv#zQeFHOJn`lnt!_{i?Q5tt6V z-udeOY~K6G?z*vVdfhCawl`$#tzetYwa%TM$LpGmeWPgKxNtgc-;%NK679Rv_C1vL z@0kEdeI`B~ee0F^&FPj63uncat)goi>Vv9%a{7}qlWA8Q-W;A+PcZy9h;Psyb$~C? zmiAYH{8aujI4t0<5Z5Ja44Z~5DQ<`ln_qJcn6VQvhK&)+2yMbj9_&15Ct^);*Z2V& zI@TuD$jov=T*7sYZzlXIQi7LxF@j*)i<~SXrf79%GyI>z^ILo&YSIdWof;?x*4v|nXJq3|WVPB!cILzi0O zA5pqvaDM(O_t5H^E)%WZ2iAscRqbrY?;UvO_}j-9JPZ5pdA{B8-Og`yK4?F=c>0{! 
zeokz7UaWc{WzAZh8LLmU`ep~_z4s!ER^Jb;{R&+O9uincMa(p+AnV`nz=Ph^NM$|N zP?FYGt?!UPdGvaE8A!eU=GC49jhHWAqaoE&2wUmrs%|DRLX3=lylapw;mDO?Y|yB~ zfsJUL6@HJLIdTYx$Ya3_G(@63xexyd!hzf(ml@_NA?eG8#spB-;mIcb9#o?LHS&lb zgoFMoueuSPj%J;evm5X1y1gsYxL#~rzmWKYsrRNbTaJobjy^UT-1e05v4!*0-g01h z-al9UPSe{>>5BC!bJks*a>|4tFAwL`AMbsVFEd0RLqn5|FZomV(d)Z#B$u+S9gIYy z!62pp8<^D2d*wm-7J-uUhxm&VQ}ySsLBe1(AK+*A=Q#SEH_N~G`hUjWb9S@2W_B>g z;b$)LPW0{Q$K;i%I?AdDwYd;L&l0Ufg{aNO$^XI*1;r6M1Cj%(Bc?K#DML<%?yNv} zmat7CTP+X}WUW8Wq_v`uBRX#wq9f-TERu_Z<`HlGZ;VHPI2{GJGm zQ7%aNum@w_qWbf{VFNL4gsME_CP`GPNu3MGLBqtOt;^(LW_eT=&<@Myk!S7bF=!H1 zYR&X#sD3E*aMJuc5F-4pA!&KnrYcbslvSpAU9wm_eBJ=2`V|{t%5whtQcQG%2FWID4C+8!OH@* zCIm>}_MeS_!h(WAC@N4>NTx#vjz2HZmrKT2bX?_Mv$f|vgbH6H=UgQsAkn&-z-k4gOj0T;Obx?S z&XfLy$Uy;M8&sWTP@T9MK@N`RSz@~Am8-iU`upV!HjD5Y>ICwI4VOQLU-%{*$#i8n za#bKYL}6zN)M`b%Oj=ncs|tsXviRi)G6)CUIvNpQ*o8j2WxZ*gt-e!#yMC_ccYK-p zHDdjm`3>`l%=$gz`aS9Tz3Ga+R3C`>3Qx+KGjd>hQVP$V+S|2rhPnPs`$n;SW4eA* zx?*#xFYB&I$-RkH`|~JD_~&pW?qI-C$}M3lV}xQ&=f}pPdh4U`70ThC!C|~hWldb5 zKyrRS5m7i;6Ty00(Vq4AvbEl9+o})UDfw2pe_JHER1?pj($J@Qbc;s^1kJ(Y@Y=_7bb z*U@kd?Psw?;KQc3X#E6DjK4ChrB)KqG9j!jVhP(3;(!=K4snM0p$fLfx?Xb+SRv4@ zWjo{Yw+vx76y9|3Z@6NaaL+Q~a!Hk_jqIz6I3hClMyZ2-x*=z{`n9qFPypD%xUhcL zNmay>lZh`)BkH8^9W=+myC%sA1&+~#!p_g5KdGir87ZiW?Bw;|Mq=HIkr1Tz{{q1o! z41jW|Aq7wxF;K8fR;Y0KUm`D5oPxYH2FWv0(I)FD*aD zYkt#++_n0Lt$5Q2nnXZNiL%;NfBqMtEAwxvFIkW`NzDuMhhLL<>F1ePtL8J(rTX%} z1+m@vP}NruyFm@h{}#k<%!jJJi>52Q;7x2%-@_Qqdf!|u4b+DzDCz0npjQvW6>2e` zS`QFs_LBmAEZfXanz}Aux&o;Htx=MVrmsv)xtStcaM?LcJDf>#7kKw0*z4OFe^yTE zkhxLpr4$oTssPV$!NJLu&yQ+CD*qLxH|Pg`k+pE4`o`37xBCN41q7ds=>^diL-0_I z%MwsPBoPKpTlU7(WQk@>1esDTwy(n^w2+z$XhkMK(#m33OKPQWEe?w_IB4J|hDl#d zsHH-TXfT;f`CvCqb^*TyHp`chS^!^YVk<)9K!`0C7I- z+M4OwBX;e%Z@mA>bk`}Ol{+8y?8x-&6MOb$oi#IwTgjWrC8J>j$kPYbE(n6EJF?!! 
zY;8lPcAHqcE$i!m_%~bMk*)R3yf}MqW>2AlIaZ_z?yop&Wen4z35#3(CNuISBTCPX=g{aZOh{_Qxhw^9;A)GgR0>JC&IOk z(bcVDMJw~PixutJl{*0KD*q6l3lY}?YyIrT2aTIS*&uI+PjvV|#b=zIqO-Hu6A+yN z7Sbg;yK**jhy6p&?6g02a#bza%B}aRGuw`e+m0@7J+@eJ9Ia7R4uud`IS8tiYcd_1 z#g5Iz`LS%w#vM>rF|I`Ae`sxEMhP;o@8ND4#a8%L%w*0F4VZHBd&a$>Qk4*DQnX9U!8|7Y5%Xz z!=5yLcJDJt71Z?t$fUevZpvA}hj8Y}pD9;CPC68A(gnJi@=^MY^$^+{JxR#3yS=z?~o*BnRrGEFB4~?g4B9Vm7{m`OSV0uWc_%#Wc|3}lHKT2{gUm^ zC|N&VE?GaWxa11-f<_?bn><;bbSEnmg3&~ro_DQZR*2jwPtr5el!sM)Ne3fnn$`FG zFUs)e-&CI}hnT9I;YX8z(IF-i8cC=}242ECEBx#NH&vAh@}?a@3JH zphAb;Y72diPgd%-wajh!#a5L{{R5eVjy&AC6>;kK+Vjkahzp zdM|^tGX@D52qOqWhA&KvO~hph(ln*{ZGtfmUYr=cB>WYF$5{!cxbkC){x{_Oh@31r zACN;7x9}HmBr_A5N=DFOOh-xL4@y2(?J!Q)pn(J&z8Os9w=a@~PMnQ~g?~@Ut=GU) zgd`>|EYWMD`r3u|>l%t(40UM@B0MGt=MWAhsni{7HK{EYiH$*xATl&GJUEO^TE4KL z!m$*7g0#Y)QwC*B?=>WfYnUWnJYrJ~91? znbXq)DZ?YD>&B7kBQv2{&uri9<+QUk<6J8`*QT9a$nuwteB`KKbaXuQHr=t^w$1g- zh2~byhaPylQ%4@wZ(R7~{ql7EsnpSjRo+`WZtj?!obOp^eNeS=xW zcVK5#d8__r{p^N0{++V7%jWsF?YH;L_kH!y-9rm~caPrVzghN;vitlu>}k*b)V}|w z&^7g$nr^YCJ6lny{BK&7)&4g$W$S#oDz3I?iL=+1rw-<7xuz9&j@>?{!b5VhS*+O1 zwrag%MemZ$+*JOB6JS4=S7uzzqO18q+vZGLpV-#-zN-(+>(rr#H4U>pZw=0^`JJ&p ztLd8IA6D1R+R>Dk-`*s81L^8DTF`A*x_addpRKL?ZGN_A_VjG;tmWo#rgoKByK0`F z@0kxRc+$1KGsb@ih=<<#jCZr>-MkRLxBq_C;&W#oc+bt4v*>EqZP#3A-ZS4fe>v^# zL0CoIt@4}YvlBn8=*W4v+Vvk+a@+g(w+`Gna{I{Kg@v9U`ZnDU{eb`8hVS*^&-|^6 z(AV%whgU3)%ouKmDy;E^{Vmz=Hl&-dT+eAD|4FLw6zTW~C)zC3jV+tH@h zjBmf_+n=rQ%=FBJ=C&>H_o@NY)XxK`seJkbcJNg#i!Hm;m3tPQdyu4lWyZTs^sb{u zTIj<>@ovj_`$cbm+IukN%6dC8-Zi3k&HS!R=Welc_r1$vXW#t|_l30gD54<+p%J#n@<%JeI$!5MU>zrCqfRx&tD>6 zA36KrKiejMg;}_A zPp*u4!QH6K*_hYPxvR89b%FVfV(t2zn?+TS$i&0El_WE%V%}=bzi}ZfwrtPI+18Q} zrH)0x1jG8?oDIU1W(@eYmD%>yAGzu6QJs(YD>*ZZvT!vGw~pUDo-1RoR<6GJj`Oy2 zE+Kk5wbV7<+4h@9b1nu_&Q;dV45IzdiJsOR%aVjQ?i;>o-^U&Tk)<@$Duejbo-)O^ zL^7p9DUG@hf6K_EGqUW%0%=Scj@HR)UDnsipQ7hrUslh-!491`BX&quOv^wqs)}|A zkN`O&Y>x#E2^Uhjn1rPqv{AWy$olNn;X}utKX~w@DqAs@%0>PJ`Q*hUnm`rH2S+<# z`dy+mxkNYu#(J^F7`Q1WfR#MZMdbB#9&qrC*P|H0OL3@K;8>aY0(3jse!Gl>AP_zn 
zAWAUdsF@-<0c%-Dh{BGfsVPiPq@eF&PoBr{WhDgp(X3(*Sk*X~@_G2dXdW8u>4lK8 zghk1?&?ZD~fW-2PfRxn&JUCRP9ebDg!)ZiHR^q=h%>t4dNG8fu(Evae0*Ng^IDw8D z8V%SASfs^$B)Jtvn06Y@F$|Wiax_k^BIB6ake^CP6e$A?ftjo9PY?|XmXX{h> zm=(kwbv9Cp0(Pgu6m^v;OwkIiuJt#f5cfOe+4%)j>`Ap{8lG&3UV~@36C!yjKHFU2 z00TB5l&a*(!;$*=)Aa$goanp>L)k`QB^ppzO%BZO;9RQQgRJ7P90Y8O^3WdzWE?w4 zmhTCLfcpvoZw4?V?;zK%PhX$)yq=WFH86m09GgA{`t76+F_#ypOJ%abi?}wU>0=3_ zvF3UUMX>Wfh6{#5T|NDVQD=v`N1HO{Ii70Vlqt!B<8g=MSkutaQUl&0ab-NuGU9q- zziNh9ihsii9cC4~QoWQ1@tR=-rsqr}xa7ehHPoFR0ea3!Bg=0i#FJ*ih~TT*N3szm!@q9T2npXC(DWM58wVke3qRI~f2f#E! zQlpY5X;!}oJdC$d9~1Dm8HbkHfRc>B60-hwKA@LL(14bh&XTLxwuw!+j1r^@GJp_f zI&QHD&=IM6<6LwZ^t+>>!H8tV_AE|EYlUt!s8ognH`s*<0yU##428o1r#yhyQgkg& z$w45dLK~ z91crn2EwYvj*F={jDFeVk(^YJV$vl(*-(tn3gdnsb&u1g6Xz?O%~@x2&TMS6BOjhV)7v9UYdxB--J%JHzR<4YYW*PpLgBRX4B{WI(SeA6+pqG!fFdu-mDt!kbN zi&bl~)xK<9BT*!C#}*pWbvtKFkE-fO`)RK3K~-Qre6LgNIzUj~6>}gF10Zqz9T|VG z=~! zVmulVU=55D_LB1zf+SL?P`##=V#s*|fAJ(@WqMMjAU|?7n^!=SMkNky^lB&C&_O;Y zNW-5okiEPquS5f{tVFZK!JBjX%uREL=bvBby=VTW^Bc~^-hOf20kQpHjzi4%dcPO{ z{??DlD=!sPok7$@d(4V&=|y-72|1=N!@p)703jEKUcMgcfFi4II7luNY#@N#gHlTw zrmRIOr-8^+shCo}KI*-X+LaJfSdV@YVhY5GLH#CuvI-E?ckIz8umGWvX@Jl{m4Hy& z3nDVS=@3|YilfL)%UP`hY0&AGp-U73Rx2x?!w?P{=!NTt4jJC%Rpv74t}P(>v&qZW zNJl{Qu4Z`xd6BT3Js=E?Ct+v8aS6JmFkC|l0FnjUE{OSsKLALPj;NF5`l=}$LVwU( zmXlQB1SyP7K}$IwC=j72Fv0kJ_$15zvC%6V!i=jWE0a2*xXRQ_gQUi4B8Aht3aP=8 zG18AlL9uoyvnrGyn;O-Zrg{4wz(MsD9#ZhNWqj*J-+Cq?W)xtj*w6_ZkXf-y zT(OIy{Mq`JOnpGC4536f$9l3b~%JLa!3s5QZ_(Z3V0W+P!gPFCv#I?H? 
zy}Ps34Vmf=vASdKN@mqoan;tvs%?*LW<}`jBm)5lGrqN=Z|%a$%!UKvh64|(4}vbV zmC}XU$j8TMgboZeK1I|Eaz~p{NToao)f>sW%G-d|8M9wjW{@UYfB+|K7-FI_Mq6s* zQZ18aib@9PTjM}UIFBZ|Ul2tXoq(jWIZAHNdQ>fX0k38ZN$P&~nIE4qN4`!X^Opk$O+^sRh8nQqyQCU#)*9WxU%k@pKE`o>+P2O|$EtQIBhc?!OIN}#(RcQdL#&ghFirMTtB~_PeM>ulvB*6(xXg$651Z}zWeeVW zHQ)4o!}l=;YcY zoi+au>rH+X0}pH4_v^~e%a-4IvjGfI2t%?LRnOG|tV;c%VBz7RVc4HJL48VJQH(DY zt)iml(~xG4(i}#b5~Nwzl&M=S)~yB-bY_cQ^sr>Z>iODqLpN}l`)Rn0T9x(F8hDo9 z<3a$f;4=B}l)NkDg%?m)^-$Cokb`iFTEHS(GAKS5#ioENTPXf=WitPqKAg@hU^0WN z*>baG&NFv7?ODgnStvFqSjob30DXd7JVSxx{2!Fj3S`7)lEvKfVTA$5))ggcc-4sn zuiQA4Y7hyuN5yta?<1&?s#Dl=dG+Zc1|5{BEwcLZ5YANM1f~tFIw(=&pB<{wj)+xH z|26DfCfv15csbdPVXE40u=4DTT_?X+=%;{^wC6QTfs%BkUP+!nEm4LxvU$Hf#paa3usMG@FVE&isBJB;Ru6mNxN_v+^uMOv$`Do;YtwGl(c9 zr9LsaY?vdodP_!}hVlM03XH#vG{hV%JV*4C>Qx}`@)RC1 z2)o7F?yRSd(1WJ*$kZ8|*ORWI$EFZ>)m&}U>%4G9;kcA3&5N5RWQ@*L#i~3)w1%67-e<0^?$@yz?{)QZO4nU>+XtK}%Ws~JC zct|x6)E2(D^g$>=l{^Kj69zsT^1Mm`P6C!FFQil#{({s9rG6-R^7*M0p_&WmH^a%h zCcSS0ihmh37CuD|jRqkIAI2mb3(K`c>6v|b1z>xH0=%P2mVIb4#aczK5;C!Q(^qYG zZA5e(ojy7^I^V_W5M2KmIdnsU#e)Chfyk>uCzWm*z*hp^6ULz>y*yFNO0^}_TFBq7ScEg- z5R+-3DqOQn`X_6zTMn$i*CL;4#y6_LjP}a*4Vtnh zG`2d_`FILbG3PR5f;E5QNpB3RqZj9#G{Z8;Nu@aHET_(RPWaS%>5Z>TIEy1}$e(BW z#cH5_hfwqM+>2_*kF=>Z$^QaE#Su8o^{Nx?xiTNC`bJ?xZOWEcaUEHs#^iqs)KzV2 z{rb5Z_Cdo1kf1MPdoX3sD}rKl7}(KpW73Y(Mn;HW==7M<+cynw)SHlM%C6>wdTLlD zX?nFS(W{n>R@YG~)zayH52<8AG-VQYAvLY^CC|vlk^sC4tMccjmK}>*pG=omE5$cx zN|K|vKcw(Iu}e*fy!HBNxzVpTt1RzY3P@b-EZudqD9>8;Di1k&yt17@F){l)TM;i4Y9Y zQ8Lwq4p6u(56PF;EPO<-4s!koj^rlgIQ57Jaa27|U_IeJ0oq}-AC|t4lcLLiq2#m{ zN*4KB1dd*8d1P2c5XUIc%>olJ8-Gd0lM9rag;|IYRAJY%k)>p1r_{*G1hXV8BV?h5 zzM4_igu7(Q`UIZs{}AbO3+?g8S8zlV%|4kEj@Py#RWRkLlYGi^J>wjIQIZTU%K z*ZdV&4x|_FqpJE$)e5m{1#waPMEtMnOIaT__bwd1&%bvf-Mo*$8Xx+)v$f3+E9)|q z0kJZWuIwy6Jmds_P3EgTXx)@)-7B{4ec!qFVMW)w6Pfh~#q|f%>mj*1@)Q5YjDL^l z-;?Tp03KdTN2aAmZ0VsQd?0ly)-1Yvpit$i$~fEKceX$Bbmq!9Th(JbS66>4b~A<@ zL_^E$_Qkmu=R@;n7aM!BE4O6V?D;<( z7EdL}qlV58DbbSAQAY=(962xNY+7`*KlDT1_Rg`lkEQ+H3kNeBkBj*4KMtLQA31!m 
z#nSk8W2SZU`>mTFu2?g_In%jA?A&p`Hq&=b>^t{h#q+8DAHs0vidFPU{-4z1z*KFf zq7C-dvfk#5ceUtU&AxLhY(Zt&R)k7_)A4s7b)Yhgj{bIy?@)tv5dEh;i=%ghBvQ&4KYPxid=w6d?uNU3xGw#izdvnIUQ*`fKwC-ed>i>5eLlCH0?&5QV z(w3mW$%0dbMJ>T>5s?LjU!*B^iSAvC)?G5mPJB0wLxUpnLW=(j+1-N9Iuui&Lovg@ z99Og^j8mp5bAcr#z?D8plez%vZ7}ot@_HLh*znS9AwplnqNfLe?X0fgfru>q7-(hu zJyWPl7^{OuWaYn<%%x!qqj7T>oI4+{`c$%Q%JPgh(=;x0+f38^&}*|Kt_M+hP&lPf z_WydDQ5}AfHY2#FwHfr~^t4k7T2dn))%L~+05Qi-?rA3+M{4r1s*kKAscoi?RvzR0 zjVsW0Av>`kQHZk3OQuSWb3nVUkK%uioZo^YSt8fQLpWdXpDBb}D$&97*V$bp zlL7rTT?h0kVU16sVql(ER;S7y*0oF@B3<^G)`zl=>s;S&d?r)ZE!K5I7rU-EQ@35L z+kS7uz02vkzSQAtT}!5Jl~}im6qjKGeyxcA6>FKg_$sktxnO9Kd&^UY!DjIK<@B1m zcU^BmuIai!>hMpVR*{~2=l{D^BZEy0Ch2)sFj>4;uS3jmN~s zW6%Y^oNhe*XU;RwdGY#lYq^TLRJpw76kzTTV*^JgU-jmKx^C-@e>D!Y z5r*s#R*PrWM~8XD>Qk>-%CjMyQ;Co9xmbbj!itrV&W2Z)R4vB_LWGQ}&rd zkH~fe>0A_U?Vwy1;4N}X}1;dl}N>5MdDj;WN<6kkwt9oZ+=9*$0eLy<^6SF65b zRuDs+os&?*Omdi-_Xx2;N*-t}qebOM&pOx$$@xnjbv6o4P>VX>JL)dTQ)A=hGs&~a zrc4E2tHVr%t~4zPB@bwC;5B-ZrVE-MD=0UjF~8(^8AgR>Sdetmb*%$HPaS5WXT`rs~TJ@tA5>hZ>&$ErKA`EKtGoh?Ys= z&y$b(2bTMOmHdov7sA&Es)S#`kG%B#8b#5D6(_&OMlT4rG*H_6lXQTrdBX%UG{B>g z&=n{u4o<}46EVp+GK}M`u+NhaLW8)%2A61!CVDrp#Z{7)@Up)yiR3ZA}2=9 zC>)iTrn$=0N#u367`sWv^U+IB64A1Ru}L4JDOx7~gb*0%qcH^5&@3w$&aY0__om8m z+@!kx*3O$d=d24&nVth;&w)RE{@)FJf8asYi!ho@NAZYRGhhAHhPw@E??yVF=TpL( z?pf2`zLe`HH4QTPoOM?J`srDI=G^qj*%!ZvV~KO&+n<8voB0>N7P{wI2;cn_4B6D$ z|Jq(Ly%JRS!}h>CN8Ub?Y2PZgZ++0deP;Eou3zu^w)wk`Z#jVf9G)f4=&-{FUKd2v zjki8|^OJM-jDMquV{fO^{w?Xo6`|T&^i3uFHdt4MiGs zvb=R4`4}}Ok4qXRY}C9<5nmyvR1iyqt3XaSb@h38NDyn~dbi}tcH#)4o!hWE8(5b= zg;<5qM-~2@nPM%PI=KSev3kD!?V~vdMUr@SuKo6ooQvMdN!8pzebfEtzx&d+UK017S)%vcMvJ+5u7AFIe(-Mn+knBVtkVubISxMJ zj*S)BhL$^PZ?DZ6;eTW+PsK4hCucX#8El7G{-d+qn)-w3=HWba7NjyakUC$F#6=s+niqgN!Sa)BDVdv*{Z+M}ZOrIqNP zLTNJHkcyj6!2}_u4D_X#ap$)TFxdh~dIroFxUlVu(54DIz#4Gk2%jrl4xjri6OKee zR~|BD7@iD!AQ`U2Zxy?p%R3puhM@AYKx zI@}p#^xd)K41-0{|UDm+g%H-H>SiaV3w^AO=-+p{<($18l z$jo2Yq(kFgIi**F-D z3tytjkqE0uuUnECUUXdS;uu{_9Yo7%w<#6$GQ0JR9V^Sj{Vid3T2>$~a8zX+&7z}u 
z_A2hA$yRN^D&|}*5Y0wx)MA3wO=iLR!GQ4L(;#zF{Lv4?Prwh{Om-b!>5o=|8+ zo6%Od88jZh#sxghtR7ZNOI$6VYp3CPlDYi zYEEe6{c<9P0^6|I^r+v7I_PbV%uo$gdI@c-K_oQNoU~N*TDPba&FFp@N6(97>l*VL zxwNb8od4Cswy1~JvzVXOlek_GcZW*OEAl1fiLtBDKmP$Tk;;qFnLwKvFb(DjUD&ki z1m#)c3gNP9uowo}b;rVf0G8Af!s%REN7n zkVt@ydxmIb?V)kx%&tXb2TIwkh*DhvFsY*k6+5Ynm=jh}3fglCwD<_5iYB$hLYE@Y z4;W8em$xCxb?~@jcQigG#8q`w`c_#lX@Ptrl);~p6N3{cKC7QSh3cy61=5A6NWFa} zNBCzAvI0-B;9%3Ej_ypyF0o@*rsI&6RL)hllq-$BuLa1kM` z8#1j2#MT4fs})EYq(e$a<`Z9i-{$gbX$>Zdc?KXi6H^7+Z$OUAcJ^leI=ApS|~q8p?h=W5P4H@)xN^vF{~qVi^l z%9}ReaA&r@Nw%{7u&QOwGGCsq+QPVm8^z|0B)Z40K(_n&xSZ=b#^cN>td3H}82sYu zt?8<5*~WE?4P8*4?O~veAcaZFPbaf7&OIUwoRHMu6S|-ZGunjKT|Y%pPr@)-xL2F z%4R!bZ}Y9mo0D^03+?H;Eh##2Qv6=ua;M>T!(uy3uB7XCq{<&T>J}X>bH^9kw_>Ns z-c~%siRPLW*`@$?ny?F74Wq3v4vSrs)4S+EY?odvmhW~pSk#*v<@@o9IMEi=l3IW1 zS1=K2)F`%1!N;Y!UHXLCEhurrQ4V(M40>wT>gE6oD{h`*gsF+~bpv>3R#5b$xj@%c zlafboAt1a(&y)!kA|Y+268?d5Tt*@;)E8J0(5)v~IAAwoDI^NxeiJS%c?eFOS?Zxf ze7TOoq~7AG+LdH{G`-f+7QcM4kg;o$UOn5YNn-+{C;8D+vCMB6^=yK|VxZC+kT6?_ zvq|~}PaPFy`W!=8lu&ZQ76qoqX`0K@Wj6KMOsDw|!w6t;1bf3EkZsIMSLF$J2>dlT z0XNRtlHKnwAUhdtu!WBCLcUH=J>&@F{1G`{Bd3oX#-n=^$t1Ga4f_Vna-YgX`vc04 ziKSiY;YwVu3qm_zHG9D=MrM*;#h>J>N!7IJpu19waLg)Q9P=b#Cn1&|RD~gyY8by- zFIIJfpXRQ+F*Q9k$G`7xU9@7cYQ-%vcigw#X>TCqdgQ2`xdOU=uIqhAceZim+^5CH z&6&m>V&jf`{h6I7#GNNHJI{(c&!!vCO&@>sbb!=xrdHJPzPsaL-PUZiS3VY;t%gom zYqoh67BFumF4$m7VE*<@OSjn4z2Lhym2Np9Ro5)mb>n7|#@mgV`fjnl8?TGi-5(ka z{>m9A_(`zEU9(cGS(&NXCf00A*?;2n+&Dgce75P`$X7@1jy!N~%Q$z7&fWKfv~yn; zVvPYv)IT)xt^i~Hy}p{3<#J?H;mf#NMR)65N7}u5(JF6aiq=cCqKx?(;9(LA{FXkd zk`~XXq*5Iusgm>uiiyI>4xPSy66`qjBu#wCA4#~=CNl91L)S&sq{Wgg>*wIP#j$IF zOKXYuaKXjDSdO9NyX;*yd4{}teVa_38iapNgZx`?R7n_HKWXSo9)%ia8l+(`qT?lU zu?pGdex7jEuHG(OI7@*}Q8R2q{m~4y+h@X=+U+9!@4xo~9GEY1)Gj)jvsE>zBaDGc z6xqV1v~M2~d~W}u)h{n_Hu-7+(iS1DQf7!hE>#y|_9Gk6OW%QCa?Aar zH6}xpQhC9>&rrx=Q$fXxdO5OD^KU3`5B89X-a}}J^ zFSF0dpkTwIt1au_G-t~AH;MFr&pitqXpd_A^=w6b7KZejSIuwy>W;fRGMxuR{BJ%0 z%+Lgzj;5w^kn}j()08>_9(=~z36pf$+7|lXx+&YRDre*B*Di5pI7aSJrYUX~E+3 
z0ySi#+>lWCt(Ngj#@!>jNqfN!?S;Mac493uEc!MYs`PzJXW$fGRGHLHZIG~E|LgRJ zI)*fT10D=e0}W6M87RVSkVc^$q0E4djk0ofEb zik@;XbRofhFWmhE%#%Cq^8VQGKNLx@d#tg`hn~D&W^Ve2NB#L3>t{TIxR!*Ck;$q; z#zdE_7(eMM3PgjlUA+zLg;B5|#&CH%<73DSS_~^Xt|fV71HLecGI}A1WA@ST6DwRf zEnVOyD3+vj$6PT;lA77ini=MsDw<`r<7^`)S!dR}3cHsI+$CYMDMvt4=+wZ95^0xI zBhymQ-ec8L*(4pvRv=p_Qth~G=t^$AyrnCza#;+}d!AXs_&JWjXeb_sk*$kk;evbI z-$8ByX=chN6+cJ5W8|>Ku#aN<=ye}Cv^EO&;gc*UShop(Od&);Fa#=elJ8w|Xok}C z!)YS&?InjAUHB3Cbj5A66hqGUF-_uxrb*o9geBBPQ~h6fJ07)kW?Rf(IfKF6JKu{dK+L@h^3T2A`_>^ZIY^_UXgy#0~ zym`gkfp-qSefXVIZ=a%T?^on3W^?O&E#1=Ay14N$aIv`+R(N;r<+JO0K6EtT&?bxx zL$;hV!w>e1ENRY_(VLaSRnYxn^VXb=UhSN>iCF;)i*;*qlorB9*{bLwP`|Bn$xUHf zP3AscI|@oK$W{iMHHkLgQW*<#8qCM|Wo%qglx*Y*=4s*2S*>WR@{c+E$T*g|HV~HF zgJaPsCMC4@<6WWigYqaLd<%OI*T*pz1tR(cB1Qz-MX~`XSuu+TFLsTN$^)=UeeI%a zfGD$2RI=y&0#BpKO1b)=JD?s{jCaX6rR(z5$mj-OS%Ihu$xrB(Bqj2_N?i)S5QmN; z7igDrUA||ix&(}!s*z6z5pP`<4Rk3yO`23vElVMXbmcLdK%}o!!6NAvCSmwIbgIGv zDc=ZxMsY2rab09@4h;uOPIR>lyUR(w-l>CuvhNB;u0;kX&@^50fuAne=^Va-OJ;E| zQzd2F{uEf2gk23v3}LDDX`sWSk&wWy2o-39%`SbCjfAdxN-}m~-teScACkIA>-3=I zPmve8F2(vDyQWGu`=|T8O1CnKcL<`OxQvx(@fUpL5F=j-EL$8K7+8o@OfRnx^0+8z zTBRs9f*226TJe-@wSnfRTC|eWM$QU2%DB_dU;bi77%;rQl2WarRJuK*)Jk3l1zP7N zw_Giamo5Y-R34J~vDhW=2ZS{g>0(K6sS$30#ckGv&n3A>3n!6$X_zw5<6U8J2*(Bm zVJ)C#2E3PL!@i(P-ggA!gBK&QkkCai>nU4{QdP#0I7`93G?(@J6c$)Qt!?8qHs`Ae1B9qJ1EVN75v4Ws^L>~sJbNYqeTtsfiAyaC7@5KfLi#DBn@Dh@h;pk4W(kDZ0^`w_YR z18&m?RDrWU=63%Tx9L%N!>f+R7AL=tpN&4|==X861Mb|~#~l6Ud|d0+#qH0F{^u9D z%9$f+?)k@-dfvY1-}acp@8hkleEp)Y=P`%hj}G%Des!+Af`_3D#&*aV$=|y29pBqN z=6__i|LRMheQC*z@W(41e9i1WvVTs6)jalf@}8MCQXPKG!Tb0n{8Q6-3GhZzWJX0{_!P_ Pf|eYJ{gB170r39-Vq4gK literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/ernie45.cpython-312.pyc b/model_executor/models/__pycache__/ernie45.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b821a569754e3fb0f117b70fe58e8a4c55a06adc GIT binary patch literal 1632 zcmb7E&2Jk;6rWx1u0IkdruixXBqvBgP8AzLsuVSah&F+WwyFf;U<9Mh?%0`R_Jf%< 
z#F3GDh(s!iL=Pw&dgy@*O8gYY>=C+YLK&-t~BHQaZtY({orTo@Am;D;y z#COPC;In3NnNyz>y}%``=mcJ9M|72u;+Hg9E56^T*QsA$u^kW)T3?Gq`B0fFX>v?7 z!Z6?wjsotiVfB(<>Zpwi#AaAIHN}Rs{E}ULdpkD|n=c*yyX^?)e?V<>n zWZg7c=v%mNX)XPpwx_gVX{d$Pjf6hAuUjX%guaA6brJe16=nN(tyC04Q1N|>T)Q>u z>aEoJU|;L_Tpv27`xFM~Cd)oa*}SQs0olA zmBVE>+~490C3?P%jz-bD+9mBblrtCaZXK!P{+qw6qoX$dYX8x-Qck8Zb^`|jc}dl{ zocxmDe-`*qQ%wkfDw|NCvm7Dkgwxg7$X`R=RwxT^n7oMQ$C+NacG}WM7ucE iesQvsHOH-IW=7Yaof^;#K;-J(3 literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/ernie45_moe.cpython-312.pyc b/model_executor/models/__pycache__/ernie45_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec94338cd4712e07ad8951805c6e7077f123e8f3 GIT binary patch literal 27862 zcmb__32cP@rjv4ZD}hYP@}`nXzQ5nR zH-q7Wv^gHZetrDiue)Er?*IG$zkB|j!(ruc#r3~A@dr~J_cIE}mpSiw_?njEZgEjA z%tbX(ZCn%9;I4~n6S}Z2p%3dd6t9mP62`ER`3-SR!W1?!zcy|Tn~`pen-i9>g~gfT z)`Tr=V}5hoo^XU6%x{T16Rxm}`K@txqAXmNC=Zt>D#8^k&K9psc*359H|$OL!af#f zkNXo<;VR~L#H$lE;Tq<5#%mLG;X3Ab#p@Fd;Rfb+#{-GRaAP7E4kntyO^N1kGm9^a zwVrf`2(m#2J!QPzU8?sF(lGRk^THhd1{ zAfpU`vhi~$PsLn}HVE3L&!&AhU-|G5%h8M+E$TU9dt-ZI1MeF1EqwC2Hq<)z#F?qF z7$2J&jdf2?#jgiW@e@(b;e=;Om{b6=RamYU^bUl==eZ6?O2lA`??=?y2(PhUKN z{>+D-dMacR%TCOureBao*m>#6&ZnNbFp*56cIQW~qb3!xYZySIcqJ5!OiV>%*F@h$ zGJ-dokOVKdTBv3I%_k#cC~BI&F4`C&D-%_eT06tX#wM7qLP*T?ixL(C90#*9(ToQdP2CWOq0 zwL~>z?geg46ScnP2%Dm|*SN5GfxD&&Tb|)U4$*arpPGmbMD|23Ovf&qe`;8?U7m|Q! 
zZ7!<)uJ(@Z-8@a2i{vL;j!V%~*p=2IC(#>O4n1-hScrY;6*c12y&9Oul zm`DZ+WMk8OU~2l6o{&W}AQr>b9Kk5XbQ9q{C?7LcQRR3)MOl>UU=kdO&&E#i{4~#( zQ<{-Ji?Jybr%%S>V|)+Qv6q}aa(0nZQh7@xG8!LACL@s~y^KKqy7AvH^rjLsy)zRt z-Ki1GVyW(Hv-=M0zp^jZ8=tt;J99mCd3vgsX4_tA@{C=JjpEzorC_o*#zycSOs%n= znd|&v6h7xK8P+{=YoA7H@+O>L-{2lqau(;JNih32%+>d6_vY-roTENtZ&+y*>`l3{ zs$5k|&Re_m#pRbW6`hZ4#yabQe#2b#$jMcA=InJuFA>#*B)V{>qWzJ>R1Lb{kLonQ7LYehKhXA z=w)BVHF$1uX>KylX2=9|Piy3{_#*PoYv*;KQ^tLPGOd;A<~5*D#(5FN_L4LxN%IW% zj%GNd-NBcCg-Yn`XYFa2F5z!iwN`tq7SY-Q_-4eN=mWIdTaH7pt>9;q8!)k}PWwfZv+~5k#GWR6#L2ulO285- zU8yqJCzqhsLCC>S8PTfb@XBQr?#N-6L--skWq;v5SyS-KKDiFaO?}0#v}Uqi_80D; zaLXZ-uTaZzm^iqI(zaBK94|j%-k3H{w#u=E`(%5;FZ;eU?BqbMOX{9N-<`72SDmfZdwDy!-h^;M&LU0a~Ci2oGz%EcJ zb&%tTVv?5Vl8dn(Tts_98_|HsnM&KfE2@b4f4_fcITK}7U+-5{=!|ol&CSvG1kchcFSDE zpf$=lQ>SF#SIJ$@8+WaKB@DH^0=%Z9~t{R9|vM+T;kN zAC&!tJO3n}PAfslg;~R5&vu zJ7tO~6Z)BpDB;wX?%YsdFh8)k%Kt+k*O=d8TkYg5-5g%43H!Rna3r+F1fkx zDTothA)b({Cn9Eh|L}y$M3?*$W#nlirXn9;?d3;^t^x&?Hd!p2B4G>UAaUg}<-lK; zuP3*~`H75(9!v3*#6*%nFe$O3G&(jF70WM=Bukq>b16@YB73o-o_J}VSP#)c0uxED zfL9OklxSd*1#s06lcf50^Ist{4|UvQ5M5$XY5C+|B&UK%4MchceJ*_pAuD6yd72ao zLr3bdOGIxgl5g?ExRS@A4XgW?3zBMz^?0*?T7gMeht{=EP`DR zD%)0#>y^C=mRwbkP}6^L8`0Ni_yI03DdxtYmJ-goe93ii!`~K$Nc4qzkg1;XW zj{Cl*-+DFc>lb_!wC8*^x8`roXMG{T7h3mqvbdJcY|9~`8da_M}Len6H0=dSPY~x;`aW94HbHUbZ zaK8}Te|I2z=otZj!DlF;A=loOZ9gWoAA5h`PY?g$VWIsC6y1V{zy0D{FRl${2c8!2 z*YY$)2RH5Rti4sRx2{%ZLqh`o>_b~R&f>rWQNyN=)z-}JvmcyT4-RLZz95>`2V_jxqYQBUpLovRIIYkP#+o?Ng$SJRlS=@M$Xa&@6xbzs@Q(vb1($~82j7ER8D z!4K^XfJ04f3xgZ>h6lcuTuno^rbnph$@%N0+T_apx4Le2v2cgr@5t3RXX_6N^#^k` zftp>xS@KKe+kM?}a&JdUAkOgCy z{bEUe4~FzMxUNS8LoZ>VMZk5;8iaj?J2eHxJxCu=3J+O+VyX3y!6iH|v9M#&5@S z)op9@_~;rsA9C6TYsOqt^nvDrd)7uUlVuJKfAF$!=xHJNv|w-8lw;w|PdAPCY&G~k z-tKyw>G;c2}PjS{2Ag+b(Jh)=hU{%RnB8Vq{^9iG|H1W3e!=mGf4A!pOHSYdwa6mFH-(-s0e0i~T76 zgZQW9m1HmUL>fQJB}2f6BBCCby~q-Kx<@ODr1m;UV03wTj1kg$NbO_Bm}yK)A6!1( z3IcixFE-|Mcpy`WVKQ|#S<;tA>OGTKW2=U zL%!o=@*NjGy-LwjEX-51X~Pl&V2ap%q8!2R_TLfQ;XeBjK1e|qhM6@$M 
z>35)V5q*h~Ymu4hSJ*?86jYB35#JhFHliPfJk^o6>A4WjakOX;Tm#%{$T#fDl>$*o7&!NRv>2_V4D? z=bZpYMM5S_mBdDuTxsVVCQv-P9b;R2mis!%ZPn?(SfM;+8cze}M`U>*2_VW#ibgQ) zyaMKsuiPoM=-FQOY^OS95gn^z$|FB3O2AL`$s}}<7a-KsCg(`&uW6DnCZPUPYiV9s zlZSoO=1GDkB^P2#+tO9DR2nkcWipo@aQI6Z`UExV;A`SJoZOymKf_7Qo zE%X{qzwClI!8TmrDaNJP&7aI!dT1UZO&${h49%H(rX|&#f#x{i!0T+jgMPO#t3l&1 zl8lKCsF0^()7JnOug7EjO%w`&0wDDi6f6G-A^zLsC^K59x`53^T|73$w^8~fa(c+2 z&JkTHJT=aw9&FO(zeTa-g;-@a)}c<)bKW2lw|Sa*L_KSdTVkA8EvY_*$4Z4$+odP`7Ww`OIolgAHIr6bNfSeD z&GS+6tr&sUtFBK94*gHaoiyfgXnhe5z2Cog?dtCzdcW+SAKM^Ex>4EtfkpwPG90{L zzIVNR|AHyk&`v0<7FwySt4VM*t&9t5 z$W*jI&FZ5pwnuKRrb&L1gzR%weYvul95iiJEdYcyMCSsX2k6!P#_-~>Opeg`#q;bX zbO==)^Z;3v%OI`GOpx|l775r{WUeF05XC6*3i3tDBE~!TB~YiXgH?2^Ob4r**Qa%A zFb;E*!XwBJVDp#?Py($=>^Qi^0(g57HJdlg8=-*VCYANRsALVMTkgG8ce8GJe!Z-F(YT;nczUzkb1QT+wEXOwU%vh2)#mrQ-s!rl z|B3y__Dt=m4|Xqw*2{+%O{95V7$I%+*A~CFe0{ZX_33q2ucVPCcu{j};^qWC&Rr`b z>)!SS+h(Qz*14PKR&48)U5l0l;}SI4o&|>lj9?lPc+12M|AK=Z;rPdwf41<$Uq3iH zG%Vqzkh>HveStd3z$Ad8J~}a);-5kcPvbp;>T{jFq~L~jyXHKBDpGHgho&&!mWg@IoUeMz!o1LM zdzbY$JGN{rXy-iDTMp)RqL-HYS9Ht!X+{#fom(yz@8*InTV>2!&iQJ$DwwyDH02)V z^>V)8%AnwF-}14rUqN$KEL=@!j#X}}mQ?0-EcQjM)8O5#@ZUOo^YE4depJ|BcPo7} zy=9`1nX|gys9mi6)Pj(N@2ERQ@n5-QuJPf&LX;Z5i#k-a>k?7}qJv$QDQW^nvonll z21YXj(^=HizQB3v@xXqn^s>E|YU-FdY6pU|F$8DFm+BO^nZ^vpMq!4@GkdpTNf=zR z{Jww|OR}!w>4;cdHZ8sm_wyhXWLm1zUIgo~bQJ+B(!?v;&w1mv6kS0_KX2NW0`s@R z)Kz5cr9RgJY zrP;TY=8n3Q6vwudva~g7CwQ!+JJU|SJ8eOE=*F~z0Yp22SU!lB*cj}rgU@~iA`0H0 zQkZQ3A*p}Lp))kxsdzU(YS4Q51LUTjNEI~U>f>rBoo1l$R%PF|mK47!r5<-Gi=c^8 zi&T?b!VY6wRr`C#Qv`Yy5cmI~EM?RHu6Tzf&Uy_sWY9uluYW*6V)vPC-Vlrp$!;&g zm~^{@BK|M=^D9&+eaVtGO41rKMHn&J%EwZa6Yx z%HNBDQ)4mfRJ<%G*lCE^r=P?5HFS9HBJ9MJHr+x^H$epdeNXN3^D8f|?OpdAT(I8v z)h}ONd37zd?mN6-zaI>dHtXC6&#woc&6peRS3z+jt6jXcS#O`diW_PA(k5IPfZrQ!E zgE@Cqrf&B=_ipCtyXWqE;A_qK>J(*H<p#pDihAsC*6lRp8#cMhF86R*D9_YMuH!9PT2`a`( z=~d9;%xkAS02rz?1x?rI7_`Cw{EkMRQK45;H)_S2E7KH}r>vocIu_TCR_ZY=*d;Bt zo$yL0U^eE3#W6nfoyG`P}{dww_bZ}@xo?nN4E8#(0cIhkq_F}Tb}{+UbwKlf74(4 zrt`Kl=WT$tXUo7el_e 
z&>p#JmoH~rtr>Hx1W@#OC^8gslwuL$S@;6pc#48glXIAyzYp^GArX>uidy|WcqHXb zSD%D4NPpuvA?t6*LjWs}08A_lypcDawXm?25P*$&?a=3JvFzwcfB_ea9n$&?Z3-BG zzX%La$jkvKKz+PYDU`QtSt!aX;{zK)(pN#19>ss(1 zGydQ3nlsGyYN*L~p#w6i#lzOt3{Kw6bVy~i(?g94=s`W6G)I=a=184FO5Nea8|H1B zN*=Q$R_Ec=#4q18f>z=Xl!`$9o3PncQxO*P+67Mg-F)rTMC-`_S`a9yK7#ylmHa^!x~_ut zv3g%1w)OFR*c_w$mEJ?XS;|muF-%y^?xA4=LW*7X5`t|SG3vH==a_$}4Yny#SCl;05AU~09Xu@75rk};m$t&1{_l~x}dMjTeWuP%m*yLd|oNw## z70TZ#x2RpzEW&DhQHM30gsixv_13T>(;L#Tt5-j_TIk3{`Pv>?$71gE=E)1T+iV37n?t+wqGo3Yl( z76K@J@_hdXAm;xiIXB251|SqDpbu>GqX|qlQnXwilhxL&5ma^2J_8jw>(IzVRJ0*P z`y}DNCR(r39!jR1kveCJ9x_c1Yx)TJNF>8lOJfr}Kn1h2tVm77XpbrED2w#zP(NWK zmPEqSm|);Yv=_fqp1wG~gPix_h&Bn)vF3^v!f}$OoGp>E z1X!8N`9i}~-mZjWXqsjCpG%_+{_y%Ta+3HTAn#>DbFq0!)MXtIfTfIl9-m`$1 z7Ri*5tO^zHW=mhTWk_gYTAJVko9{P|W^HZvY;CJU*{-2`T|@W#4*$M0+js6>-?{sN zy*VAp9dwHjsOB%S!bLX~US;ji#=PQNeLmCh? zHNn1$ba(;56OPFEiOz~(%rsM@xbwOA8X_4s6z$9Op!dVh>^m?gCI z`O!>}bJb>C?YThrO~+=SIU5)d0t0JbS`QpsazItPczUyEx0SPt;wI>!X zqV;2Es5bTG*CUW~Xoz zg+ihzZMhB^#EQ`?GpUG@&NhqYBbCk0Qe+vU%;#iL{7Wc|?Ir*NLIYH+mX zkc{Baf*CB^(lUKT&H>OuQx`K`ltzhIPG=@a#`tokx#q_Q&Er(RqVcmArD6Qw?bT;p z`vuqjT%eQw>f0bsd((E?CK<^yQ|7}$@G!LMZtE6jv{N*Te@`>7Eii59e@ZX8~2kXzEc^bm|SzVf_ITjg!5 z&0KsrHY+N^`9k%eNJ3}P#cN*g#Yi(=CEoxxyOFO6xrl2jxuU@4GV2R%xoNVjgaE;) zNiG?>cP-||S0q3%Z=SczTcO-AUe(}9_NbXXiFE5a7UNNWCUTgIo~hm%i~P)F*$(Zs z$`pLziNi%oB>+kJDX86-p>8pG2wwUWyFy`ugse(RL>2Pp;4Q4Xcv$bW;<|D-T3 zQjJo*0d;LrS1KrnKJM=g5#=IU_<}qty12Z}vdbS5A&6sH`8Z=o^AR%e?hZkp_ z(Ag%D84W%qJ?8)k~oS0#}xey(xf$Gov6$JSV=k$Ni^}; z(1yd1L{swR*;oww{Mw21Q=&3r*CAO>>qmj+c$%3d2`s%`K1vQ*LW(*d$xyk(*!(R; zc+uq1D{?KAHH$DOHk~D5w}_)%9PBurd`*+0C*ROKNq(6V_0$}NHs8TkMkKAHwtO!T z9sh`)ZeXcW3*3OB`GWI#P4WWz3DMfvnGF;bi=qdlo=uh`i6Ovkj)eH_RPFqULdm&p z)_@WaC{EYlbtk_G=LYvnttp625%tiRz&d3hThk}h^sU$IUibDd*zN~gey3%@kt4Qj zpAgu$9ymaHqNWaPl2{mA>LzobuP%Ny>*~4Z>RB83(UBh<$((q4qyJ*g?ca1&F6$TP znXi3yP;m9pHZ@qEtXzPK%y)1}50)<**dqk61!Fz1f64K)J?8}Xt|ja8xmEvWMLkUM zDmrqNwVRbecq=>B^moq*eNQ4rF6i7^HMDC>ddRXpRktqOys*-?QQ5H?eJ}A&;_j0h 
zJ%c%K^-8DU?M9vlFf<%ktK6vAOGWydU{B)@Eg5owmTaJ32=p&Ge(J7Uj;*}3c6!5o zM09($Lfqc-nyoIbIh1h+{<2|U>B-#5vqI(ZrL!xx)i2#OZ~B6(&4Le#ya0H`_O482 zGww}ov`1w1t9Q??2cKLz_rTW!|KNu2ghW==ob`7J{;t&*HvD_;)?^P{5Dr}UV0`1i z^MKN;UlywOgATHmRhv*duypE?nQQ5M``laS?)w8P72l7pR0;mh)#tL^$DwfpuiD~W zx&WP91GatDVN*=b-;`?%<$|5Lwod568oWzS{?g6)8*&XD%le=C>$bk2;R1U;zNq2+ z?H`-Dszy|}s_sq4ZO6*^hQDX6`A6M9=zia{vHNtcx_;$~P`&G?{s1*0wf5|W?;r(g z2G*XvduYAp$)!QOhNU#(Hc?%@P}{w9>V9)4Dzf~wjlNTb%6Wn-Evt1KmHTq`s;s?E zu-7d||H9t1^`wq#?%X=9Y_hOSl5d$sS>VzZ&4>u$?Wx_;dCki3vP96qkeb?^GvR%@tJ)GhF1h*nlH zODq$G%v{6nwLzhN|CWV<*1TaBvp(ITyii<8)>#>Ur(o%P=%Bd67F1?%)w7yhJ9)R_ z?r5g>gb;c{Xg>LnL&}!-WsRY3<@D-vYyEc(%zm$bQ0T@Mzo9LTl0NAFAo*weKV^Q# z4YL-HqOr(V0VsHO;jIssK-M9U50#irunJaF?YFo?r22}&t{sR${i@A z)E%B4b5LGremv0pIHTSj%Ge>d^klTahZeY^e&~D3u)|5U{Y82qN31-?I8)@TLe2{G zO(n}|jG3v2iG!p6&^T2i-NW=oUhosO;zgA$9pB zNczIDXsC*fBu8BVeS`}^CWS~$Z**6)KeT6%6E3AAWwy-G& zdXpA8UhbFTnx@+1RQ2AF;TQ6dK%8AW)VM=VFWiyKBZtOGx8#*Wsb1NK=l03|!d-su zAoqr0R6CA+$uI2#*r3ypbZTeeFdMv9mP*6BLI3va7w{LsY1 zSR=n<4fpyyz?135@zk(f-XvLGlw1W71x~Uh4Kk@@*u_H1_+z2YZE0QGLfzX!J#y$e z>M;2agfYgF>k}C@vRL zWFIJsS~IHE;JcYOyuyXd!*pB*6n0n`lr=y!<#8ncZ&3h)mJD#t^$t&OGwh+mZJC5P zFgA@tRss`K(pk58U13jXPeC!r|2dWQ@5v!0pacOK7e?*pD~VGh5tSsTrn!q3$f0`k z9!e#=$kZdEQG!sC=8hIe(M`vjKu-ZwEuG-XixlnL2Dy@=0lGpQIr~19^doZalJjG7 zejASHBs)jNR5K~rJ?6_`=iMenYay9Q*hT9E&QS%<#JRsrY0E67B=blk!MUj@+lwz6 zrQ>cz6OIa;A~4HrFiVeRsiYc|oT`!(t0{Jeqgl!+t>g?D7FTkL%m}seWpYSh%_qr; zlanB42F~M3%I7JGUYDquD6(Nql4&bW!AatPyK5r}aNpEv{9hCG6L7X~TPfCDZW}&D zEQ_L|rt`d$*WmnmoBlHQS^CSoX0n#;RI9r0X$2~}bN;UPecyW58QA%GY4?;B$f`&K zh;uB}G>^4pS$H0dxx2CedULHEZ%@88nQc8Hv>wT}4hpS( zoAwXgO%G~9>ouK}=|P}(&6w>wA@rSC4?IDd$U2;U^2XW4vswFI!M+zd2|D(qi*P|B zCD-7L5^v4&-YiZtsOjJE3_LWL>#hIMGq9xDbeAnXpY`k#JiG3>cjYQQ%U;Q9^j>9u z&f{G!yZPMmShjwjfWOLpcX!>ZJp4-==Wymc1F(#<)NFAm3246|m3%NGvTsQ2^x%Lp)#UwdnaEO*>8G>_OzP`gXamAe||AK~j39wZu2%x3tq%dC*E1+ZHbe zq;24Neo={phxRET7xCci0?(lEwQ?Lz8Ua=51)kv&cm`b&*Q`@Bqf03~h|EVoXj>0X z$XQw0Pk5(nxP* 
z1SLx=2h(-pKz+$hmCe4=49Dg=n%tyGxNLSNiZg9wQ5@uLP;s50H{eY{Q2jh?ry=|{h_<}V+X!jNp?Wi2-~!1G%OiB+z-!A zpMaVkt#FdBZ1E3_I~ zcMW9B`EyNv4+K0(efUi{>UIspdF*VtL59=Vx|IKD34q9L(Xh5)=LQ>B)N5)Ab~ew@ z*vgMC$!@w~r&i`w>K{?-no%S(n}#IMdfS?_dzP+ef_v{eAcA8y&`_2t5;pz@sJFc~ zsxo_R#JKO+Ku%4KjTeUs&;=`WSdB|YD_R~ucOfxp5IwL~-v9~Yc)*k;>0xp+MJN6G z01mv%pJFuEx>Jj{)6vP|Ny1NPxEVWIvt_M9Su3`Ar{7Aiy|i9-5++T`c2oXuL8#tl znhJ`Q^=UG~$hQee%B8lCkB0%uxCY0da0N*N^@IA^Q!kn*g5P-UYQ_8 zJspG}OTwt0ka$5S!lagRM^a0uBk2bNwsj<<+O9`0o^XnV|69~|glxf5l?AbL92;EU zJM_+>Z0Ff~ooAQN-ad?j=N=h!<<Ew*#Hye&UNfGIv%$c_>@{nQiY-qze{j+tz#ZF_efKi6XP`G zD-kz~-Wf?r1R#WCvk<-Str&qhMRXQF=`oIxI}+vXROhG2VKQnaeeR;q z;5x;9lN>gi1;}@noL!VkA0O`~-`|ms38--nOkQ$*k>Z|#!#d(PTXy(zN+xHPT6hK? zh^l|7*L$pBO&xxG{#yp@efZd6tv6=$)mwh9sXe1_V8ZN^8p9b4iL;r91lr8Abyn*# z^ljERZW(ZgsJf+n%S1kG5q3VbkT0k)>?z2+_Y`E_dx~V<)*_iV{6#Wv3Khw`DOe)& zrl?YxH^uo4*u?CsW!shw@S{*kFutr8EVU0!6y-D+x^|FzBSjK>Kjh%ua%v2{suJ{1 zA-?aGMg{e4=(K2xM55E9_?dwJi5DQcMyK(+CG1GiWY5T@QAz7auMc*AIJ`226V;QF zp%t+`(rbtO80M|`DUGQlCT!>dlkDIx(Md;4(zz$nCOqlfkQx?A)7(Uy{jNbz6nm8U z5uAdV3-P{ z-xP8(p$5qZ#cs9i(u*V~9944=! 
z&*dFX`gBer7PjZo%z2cDwh=}%n5Afj!zr8^&BuM5=8WcNoa3*!o{zZNk2vqoIQvIj zyWHP7B=WPqliD_G1ptCss~__`@G@$3Eirf2uWTwD^({Z|5|pH6L+9 zA92S&)#^0bPt9;XF>{)p|I9rha8G=yH6xOP^D%QOIjd*si|dy98^%Y*7ERgGB|5O; z5eM(1r!v`7 yN7`YXrfymLrscNfAxHj)Iz(@==tq}~C$*a1mrKv*#8F6H*Y2gK@Xr z9)otJabjv(vrT6*YdT+MBPaPLoo{#Z;WcCf{h}mt(wW3NlO3wr@sY`7 z_V=%M;1PnPw&U!#c@hits_NCNs#o=||6Tuw%Vp|BqPDk=rI_ROiH6>Wgs<1f?k#bD zqOrG;#jWu`BG?;baa+78(cIgdXz6WX_x5-w(c0U};*R*D#Nyt?EbffAC6@FqVR0AY zOM92GcvXCPBHSBhad&)0VrB127Wc$gCE9!2S=<}%NUZK%omkVmCehj3$fOZRb@9!KExlV3TYI-Aw)JjHZ13I9((B{TBzE-fNbKz0 z$?hBCyAoZ!U5VYjyAykQ_ayfA?q%t|_`byc-u*1@j~_@J>^;chjqyW?!@Y-DJP?02 z(cRmfIMRCr_rYjWyeDzA_b7`u$B!kB_a0B2=sl4**?Tf^s`pglbnj`F*AicvIMaJ3 z@m%k7iRXKtXKA7M3yI#|ULB|BV&T}w-qK0)q4&k;$!KfLbK!TCiBkHFE{c6Ty0~)Q zfH~Uss;M_(90ycoOK8)rG&k@JiuClO2Z4n=pw z&c=qvEF2fziIH54ku*nlVI)T?=5!(FQpKFz$l+}q{fnN8?m=C9@ke#;!{2_?YmXkl z3`nxJ97J0VJ*7V>R`M`PJ{#Mu_Ggr(b|dvjtSdI^$LCb@F0;HI#u;_*I|6|IL-F`mUNNBTxa@PueR8M`z}?+NQf%Sa!e8tjXQ z4Wg}UXlOXqmx@J2L)XwX(b7FgkH*7#(Z_oe#J@5jw3 zPgT`+rBbn>)Zp;Y?!Nwuv7zX`q0s~$Ztg!H>%SNo9Z9B8O(HVT7wNm)H;Cp9#s^c^ z#89H|+Syp-Tr3s2JeZ0M#QT!xBYmmVP~@ys59+p`iN_OrhKB|Q&*9FqyDu5rJDNyb zJ32z|OCqfb9YLRC@ko;TfE3ecYA{Y&j=>?+8y`M*F2>{5c6@ZO|Kfpt-KTM5IdSA< z&oG}r|7-UQ$43)G$K-mtG5USH=so`InewC)!&eq2p`z;JqkZW77(UVPklNC^Xl$Tw zG@gn`UkHupi?T-4968;6>d=v+d%LO}xd~aW>Z|`7# zN~|lT$%B1zB-Xzmc`z1_O7GMrKYOK*KbOS!PbBFZ9!m6`i=7-Ci-|R*8tE-k38UCR z!&=HZ#t-+$lF4CStfmb4783~bbXfG_J~5Iy5gUnP1}5;~MW>W>dL$}+fO_>a9U2-L zO(l1a4#uV4d-_Lt%=A-OPq~GqiBCpbeaNnOVg^bWt>=hWY;UqM^Sp#S_*mng9#> zFNTf$X58^xC`o^{6Mt)yG#VjX|IocUl^9t)GC0zn!g7hF+OLjo+OqlLrr7HE;Mvt9 z*HY()hgM$(_*u>1a3pp$){oE6OSj3@F*f@)L@viWMy`nto`KNF`M&r7e-Q6@34h5? 
zB6yj5&rx6WR^Mp2-jMZ9=_ecV-o+Eoe4H%vH9h}oTB-4feM#hQ60^K zsD2C-#JDkS#0)GeLAO$=i;d{8{xJ7xU1?UNb!kIVA2keto=|4!xHey3Azz=u+obCP zt%9-(bJ7$wr;VS*{AF`gN3Bayf7ta%zPdKjtRG`2MP<^)s9CAw94H~oN_h>@tQ>pA zPP63d*3j~^wB1FL0HCWv$=IcMY-m+z=zOd%np_p4AI#V=D~xeQiNPT}qm-jr!C#A@ zNN9CvZ8$W{SFW$D9IK9h1}){cBN(fdmV&$tIz!9XhN9s|Q=@oAM-RUPkI>>s613!< zRLgD(_E11`Gi;I;#$if$mIAg!%=|F5G)coxoe%K`5EJVG+TzzDauZ{Tvk|u3_@l@? zkH6$^BY2sctKzD>ufH(y!X!WOQuesuYJPct(czsue&f08&t*?s|Jb{ZP|g_2IYRgC zo}$%pdz;`|oZEh4-k^7wU*7YfnR9q6CS#s&nb`8c>B{s^ZkpUbxhWH$IGXJjoUPw@ zc1Aa|c;?EidFIN@s&BdP2J&r3a>q^zZ6^ii$s&7V<8@Yk&SbFJ=d7H|{h`C`G|zEn zlbJt6-P5H+(*Pg4lw1IQX$;u%_y!_a9u+~ag2;{Qv3yj3iwERO%{L%lozzNh2@xDO zrqqc(ZqjpUQyL2+ZB)SI1$A@CE*0y;7goN!M-#o@(4I5aJ~VSx&Dpb4i>H!#*UExx zo#0wG)0KB^WK-znBhr_0oF9xvV?%5yqE90%(ytwN+Jx9wn_@tXu_d^KZrq^bN`QcW0Bb(LNT#OCs^Xi!sp-nj#{#jITlUqLVN=%sKW7(HV_h9_$D4LJa_<9RWon8pZ}k z7?2G+L}z@szb_tLQ{?syr5z%Opj!R9LJp&xuIY8%O#EdC@)0 z1DDo3=Hg>XqBNwZlQbRWxZ(#Gi&eFZhZMj(c#M(eLwxc&-sq&e)@qO75;SIjG{>|I`@^}9?tmPrLB8R#^0~? z?6I5sk*Hp7IX2%6u<<#7`xplvDM$R#UW3e`sErz8#;6g0Cj6OWCSXMXjZsU?9JR(Q z@|`WJAE?T31G=dFRadVy>WJAc)T-}MBT-xIaXC&2s{wb)1^$SWaYyQ+RgbC1{g`ym zW7557yVUOLm@8(Ku{-LAt@0BC&S(vIFzC}&U9TIMQN8FZ(OnGYtK|P!_{_qD&?vB> zb6{n39|;ZceTmqWVg6#M>)4@Wk7&cDfmsZEL$rZk5Mcg0b=QB&Q+i-074|Dl zO={~>*yp4R`viBP(hj0eW7-4NH+>C=|F02gDUT>xM7SioQUN7~HY=r#C~bO69vzfc z_q|k$TDsypQ>{w=g+)rN^sA1e`HorH(@?wG_9(t&gkUtgQeh>A{&DEE_5id(yu0WO(|80X>!*pX{BFH?)p-y5<73atmFNu zP0D@LR=LmklJ;LSC|J*@>C0Cb8O^(%+>1&r(O;%91BN19*SN5w=hT?7_OAm9LInVq2tQcVsCOJz5u zvbPO9k$IviN|-Ny74iLH9s3;%O5kflJ2^BiJ~>O^is`KpVpb7@iW5*aHeq5&?{hOWl-q{N~3O1@3W?A+juDC)6Oqn3tkC zI(QinO>EYnyb-j6_@ff9l?{^EN<+9XEy>EEIjKPcqddWQ(SYy3H&V<SVV_sDMI!y+Bqt-0B*8=(6?&QBLHuWsGS>RI{i{Ry z#&4on@(thuKY5w^u$r?~O;`l0@1C{s{e~rlhQmU`;e11P#`v1^KHcvX8usQJ_Ottr z&4rGgLdVXcGmw1&xX)^P#(2-V66ng>?S;2P%YyERhF0_Q27ew=1?L|**!P6>u zT3Mt`@U#_|ZbNBT)k6bk+e8KZ+4c7V8|OTn(^GH+1xN6{GgNS{6r3v;M#6$K%u+f8 zXUCk)w8Z|9Gdb-aI=PybVs*!Ks<3*OuzFXnqw8){&T|waQBwsJ&{Z{O=9aD~EZHb5 
z*|@MqR*}iD4SX%bQZ)a2)%)8>y*(#-0#e1wt~#`EARq04AV z1)6if_<38#22|VpDq0Q7*D_$(tKxH-(UbFlyjpq~Yx#$S6Ee=4#4CdoiQuRTGn4&T z`|3pq%{u(e6D#glbkA}+#t?c5D95-C-K)qsozQ=Y`&HcyosN60&d6Qk{+;29jz5VC zK&Bcm55}%QoS^Sm!}xGOfMZu8j9ZtamW4tRxDsvK&&7sfS4a3AV;dd|v>0(10etV+ z&S=i$jt=#)|AE?)O$dJSw*<>_)mv^~n{J)lIN5n)~Z-1Y$sXtweN2`r^Yw=OO^R*h&Ty{J)`MpQm5~LD+bqL+Wy_``RZetJpf2G^*6%T!`ZH>&PjmKB@>p6F4LLmLypr^s-vO#zNfL^*(i86 z-c9{c`nze6u)eCyL2$Ob{;c=Qj_lrB-8Z`hPdL-{p2zpp6k=7@7M)c^fLiBT!MT=2 zRtwJ6fTsZSws`>4tzdt4&h*{BmapETw5#a$OtwxQ&s@n=Nf44mAfmm0G$Qe&lVX+3 z$(1>GqCL@fRnBCe2VW=IuZ432sc2r#XR#C>#|usClr_#oP&)9CxJo=xO*w+V7VwsA zX-g#`l_XXew@3A_a%l@7lkJi9Ej?{`WtWC4R)G?tr@`Xaz}9g`N`=2^M_LsTj5{?o zsniboE?;RUgb!2?jR!8)SF{3sRbj7`Lm__5W3QpS{htir0DxIQ~DABj$Rg=1*=PE*8|sAFW|Ma_8ac-@m}O6kd{<&I0CSI6r$<60lBQu;DppLSnZ zqV&7;o32;pJ3#zZBd3`+m1ZDRXQ;nL*$D!S%DtCQ?1Okaeunu!UpAu(yu&s{dfy^ zr{*mgj{JN*Mt_M=DZ4&ZrHnjVeJ(&67xwmAX>VV!b(746ZQaKZQ8(^p+PG8d7M}*I zPyGV&y4Jq(E%`>?&O_MFb+ny_^<&k^&{-y#hGgz+C`r+tuwIfu+rhI+4W1hv9!=iX z^ZyGfVVga)nP^{==#eSJ$JRY|CxeCrR5_v*Hg-2!pIn5e<$e9kA0N7V=_^~Z$8(|e z`I-%dnq5N8u0qZ3KRzVw?w@X3s$*<&hVU@dr}97%ZLba#D80uGlsp*Kn!%`*M}VaD z%MlN=;kd11!@;$KYP>#6SqMx?&iAN}qNcXeO6Hs#H=Z7nox`|b-rohOv-)Ac^x zIsCF~f7mQKOM77C(t^8-q_?5$ft07YyC~YC{O}0a43QDh+|M*j#_Fa1orih?I!lva zIpU<4pga2uAJ=LcSNTTfe|DT|;9UM4fC*L5sCkzqjk?Z0A=FW&3ii z8w#}>zw7=f|L;)%Y?N92jMd0Zp#e}DIL4+xg;6mKkB~x^cQ-K9i+czA&J7Rs#gFtQ zF9Hj;UQ&<`8(2RQk^=q*ldmA#CNU$x3KDfBX@OKF77j{?=`;-j!&rAyY!3x5P|!ue zuTn5c0d4jC+Z23>0@|thU#8#_2t)&+qq_ZKkkt*A^e{d(KO7zHPnB3JqL~zYhNAqJ z=@IKyQeKMjN&a`~HlW;gpr0KBiSvCOYODGGNF|&D(3(;(F#awjErK@XFmwZA0|SHo zgQOHy;^W8*pMMPn!;KR9On4lBn}UB!0YN4H4h4io^WUK0Hz@c`1fsp9v>@6pT_pZF zlw(9YqJ;fH48}iCmYqVXzv&brEf7xg9yjZl&IMX>sKePW_4fq6Y!LZK`vsZ))ju-#y=&%%S`K^)oNrt;+k4W)9!4@!i;V zeOq>Hx^t%WUd{T6y_v4er3Vgo!Lj6B$CCHzkKXNj$NNs#cVEcYADuWrnUk##>imVe zHA3B*qQ@)$Z(3GV|2H-j8-jB+Tz%&}XRoiy?ElcmH7zZA>T|v|GlxMuG*$ggm2cw0 z7vdk7I9DV3SXEtcHAAc7-lC0#MP0(8u6JEs59)oH1NZA1vz>4BPp$ac@b~LFCUwQ& z(y2?+oo|i`!S+d8v9WFH_|5i7OR>J;tGeu#spoQmRXN`(p}u|6@KMtq>H!dnU3>5L 
zzq9VUZt!bbtFOBz4U;Fci@`ezHWdO}gus^B#k0w~dvYhw+zUK6X}uq4n|eOCdT&0k zZ_--yG~B4VUX{K2Jr7u=9Javw}DPgAaC1FBa$|FBZ$ zcM6MkzU$igXUHSriLm;G@S_{EF zLU0d0!(``V-_({F-E1w&H0{-)P*YXr2s%N7U;g$v_I4{*=VzkEM80;_ge_yn5WDL# zE=FF34Lu^)b6elj6ZR03cnDIiNJO+ph%G!Cr?@i`feRtlLzq_Z6S%dg$)bnd}d0>gP-p!4#`)n6t2`m1}I8v$3e1 zYY0!T7wT8fIoQ3E^VZI}ShR`@t)GbsEnDYUP2daKtNz~0vi9p6O%2)8w_d#Y;v9$Q zbZlndx4Xa9eRpwg{XU^%|2!qm?bdONmKNKVKXe_|nd%=n7(G2_q8qdZ0^aWH-E$Up zYvugSx12Yfb2fHs=jwdfw(E!I9PHM~Ro74UU-xHE3EtK@RvS7Bg_Gb!@W*Z{BGJM$ zD*gCVFNFx4{|b_{lHZttlt%^(8mX@-X0Al!5hen8uoZ+3*dve!+Xn1R0&Il@*iMq( z3qJLIY9wl}oDR9K19D#%lebm@t#yjkW#I8h_pu&cL`|eb8X1uEu#=)&78R%owmxiN z46c`W=U0{nD6ucSfgpaXH zLv=%EO0LCFR*TAefE3Bt10m8teV!oJhAR=CW&?b(8<*{eR&&k@$c8PQ7M#Fc=)A)U$mB3H=k$*v+G^7Gf9 z&tAyag){b|yM|!|_Pl$sgc5)WmXXFV?B?&I4*t(5_;U*CDR_WDG?H*o)DI1bW@z36 zSTk_U|9}dxS;er*zoc7gEdSRO{AUDOK>N3pNC6DCxa37d0BGmz+|t#>wHpCwAG+!Q zWb1r$Cj0`b0kB#q2APj9t0%nU`p!8UrDz~6-4dYvdwO`T!)Wqm>uv>a2In|Lrh`K1Q&gHJ^!9%Ish7eanr}m!9)W6rYf;?**jTDQ3-oDY=9s0D(11ZVz_gh0oUU>S zNnb^Ve%6>>jsflg#yRCX2i{oCprR8{(Z#e{YND=K)rDrY)oKLVE|t^Wk4g8SCld7a z#t>(1mhK&upMX|NHMCm149Zp$^FXXEn~986k>D)96HQ|hN70suk3@K~^Z56&s8Mw8VgWI&}aFlu|)rC1Dd-ORCz4tUjtTV*z}L z3BoP_WmAd$iX64pl7b0Gt{MR z0P@lkCV+YKTXJhqUfFvg9f4rxZ@&0<|6%IK|7m8&ZD{w$Mv@Ts6AcwMKUv=U55^9B zlCYr+u0xVh+8hYH^YaKqTQ`w8WcMSQ=*x9VKWiC0VCa%)q_?sKEgGn1hXm2oDZu}j zYGb>|PbgN}N(dZ_RWw25otQ)+StOR}3=IaAy5I|K`l$_v%`v3}235cd=`t_x-`_$*C5_^fxhRqu~K)i z!RgRq{(qt}9{Q~uPqZl}DckmplyIA}7Lw>mS|k#Jq1pRnMHI4MVq`CaI10&o9aSZJ z@fb*659ePxZG78y+Xf-f>xU-}PmbrEONtGx#Xy+C+Q6KH3$LB$%wY)H&3=2v1?-nS zW4&#isr`22w;E@Uo*|oce+K+PH+8_uc3CraSqP3|rVAB>SkPjvm=4;*3fjZM zc10^zfn953$L8oUvV~y=b)p?ssj^)RBSiYo_YDoj;z&dR-b<75M)Kt&9VxhJb&S z2Ka-c0}$^n29}aej<4ungj)didSX(5pvbNeoNZHMg0pjG%iRrNB@s~nqD`7(1fmwG z%g|u3&+>Uh7&U?4@~e{iNhOgjGlWqGwNy)qh5(eV4D(g!PH2_$KuP!v2527QbW_8AAv z4Dy%&d5E4Mh*XhZ0w`kvlmVQAM!5}8#wIqGVL>OD)f@veWVg%kZ*aoS$MC5oz#=bU z)}@0IfbSb)clp$+s4kfMS-fD}{D zIjNSpos#fvJ8q%CO6I9N>_xyppd%#7ucWw+u!648>^H z@_$Le<59mF8VL&iBQ>lE8*r!eGXLj%3)7_lYWU9;d6RfJ6ihVQw 
zCWBRV60D+X7QiZnrkqbBM+>V|jL9owOA5yTxDYIQV3p3exA>L0w-&%L3%I)G$JoW_ z30Haxt*5KkE|;vA?zJJT>}{Ho#1*ks##Ix#B$!=DFCx1CMx8n zJ))H{PC)0E5zrFbM7z$(Vusf3Jdqt`;N4pe-l2?H0UAI)Ly4mN8h0YSx;zymtP9#RT1?l8I~9_p!6eeKY*bw@uve_e zGlD5o&W~4?1DB$2#I97M61xyoVx`}-#+Af)6>4>oEh#j=T9h(rchpE}3ek)Dd0W~}x=%^VVF+cEg$m~cs^wC}{@b8yr)mSDK6R8+?P|^L)M_Pu zVT}?i{VMN5TW!jYgPPPYfW2jHsk9PH5hP^Sg^fzA^m}m=fadzt7A57QwLlpczW&Lz zYg=~>?6_-+#jXcdy7rziXNN@VF_S%PT9s&>|Bj|f&I+s@XGzT?*#Y588Qbh9ROm7V zj6S-G*w_X*|5=uVn?bJeM+ii-)M-&4A1Ybb@L#0-FHyh>GZJc|X-VqhGa)Kc`pqV+`Of(gl}T!q*TtS82oZ!xhanSsoUxtbLavju^M zf+hyNT;KG@2@*d~U%gki8Lg@bPT7Q-aK?JSdHd}1?|8rSQoi|FDjayf-ghH$Jpl&m zqGeN~-w4jvIbicr%Ps1dJoI4==Vhij&C81T-x4miwilb*vd>RFKRr18v0PwFamCIb zxvk!6Fpw5?JfuhG4Xy?><2A=zGv^6p2j6usW8PSb&OK0Vu-S9&Wx%;=>TayMzAF34 z8RNa`4Jd7SKe(n?-~1$N9hbM@Y!aMJ_n?r~x=U!?^{#W*eNV@o(ZcHe!s`9`)d%l+ z4!s{*UkL3KLOU~izvpa%27~vo4!RDI*#Go{b!Z*qMgGCc=Kbrf3pCa6QKfD63?lzr zK0u|-wnWp1Y2p&G#*-w3A;i#NxPb&FSd)qOS8)G`YLIqPGJ@ZQttc5nQKU(-!oXsl<7o+Tpc= zdu`6TmUURPo`Yp{Bz6AE0&0v%dWKW|D#9Babh|*3b-)| z5n;nYB1CyER??PFgTt%jE3;FFdHvcDH1y@!iuqb!HoUln)w`ioH?0TB>HuE`b_xkGOOl$Yt}lP8 z(6B~mSTp0zH>@i(Y!w=|&aR!klyB(D9DHJ-&e^l9&;yUJ;0XzyP_ecyTU+q25%9lu zO|jM|VKI+Mp0}C2)jx*h$4`FWvR&YwwdTETf8q6AZ}_r*&da&#=IUV__Aga;EeyzB z@0sX%W5ca&H@8h&XO`!dZM$22FR-@|cvc8J`%c$8m-2z*-*=vvTg~}GAFkn`=1P`s z2d2IEs#j(#MPKOa-l?uPcTd&bbQgRp1>efNuN~~~=0%0(PNBJTraj-hD`$nO)2iv` z1pkJD{~5vm%xquYzbjK!bT{R!GE136q6_vPqS4v@a~R(auBmlQwMbG@LRR?&zpnCC ziMnT0UK3WIQb_Zwc~Z+$kW0{${7S5JeUxj2I2tS7(8!U!QY9Fxtn^BjyGkD5287cC zdUB@tl=f5*%(3?PE4JrJdjh91KT`COLs~!*Qdb{}B%s7E>$N{($f8}ek9%kXt>FI^uY#Q0 z%(a9CYXAyf2Of~wV>zx1J+Fd0B)CKO+BO&3_6u$M-*xYQK>E#7C$b&WTW9)b`01TO z8g257)IsGE{(X% zm^N0zdcZ%bFrU?(##nNPSP4(TUdJrz!0eP4z_Re=e7Ms7pd=fW-I%Bqb!`>GW%g_32xVl`UgMFCn=;KnK66ZVwoKYPkH zZigBd_>I6pjbv*FzcsSRX#q92Z(k&ndA5{X-3; z93a`-NmvO1XsGBicF!w_|L~vS&b8!-jW+|}@PxXAy>P{hI8?vE38COFQYD>tujFD{ zkxi)rWpp40inRT@BgFN=jTsi53|% zf=(FGm^`goUP5dntN*B^u0v2yG)c9qz>gOw)o?Zr8?k{wSS>|l6Bz>E8cvE#w7!Sb 
zM@-YTa8MT1>?W``iiR+-=Wmkb%T0OTCPlzDntAx5$MQ80VAJ6J0Pfhe>pZ|K_w0A0c^W*TzG5!2+_04)`qPIT| zwnM%BZ|t6lrIW|sh~2t)^I~Rc-o64*&{hK(u{TigkiU;Q=xGo>unmIzSesQ6Bo6x*%_He%WfDHEEuWy+$e$(-$lT7K} zbwE98nJo3{$^};E0~<5l58MGDED|q(=B#oNFlB)I4-wIdC$XF=0BEL0g0DxHvPdCR zloz7L=o-4xSiBqL#S4-eTQ$>9)jCv6n;~yj0jR<<9=BkjgJdlc$}rch>~2ye#9@lO z2_OKX$=gO!XAwO-jLa^_B2ie848j5|Cb|}QB)@GYkA!eG?jiG(C{x%@#IStGV{j}b zTHqZB`#jGRbb@YuGq&}6s#Q&to)As!0sc9R zpVmc>Oa{PlBD_S&m5-H6ueD1_TM*|JaLGZm5rWV9#_ak+V2u!1b9YHDuqGckn6E#S zaok_DM<>uy_4<5T1+;fc1dR6{orCu^PTs$AI}F*WRAS&^k=+yZbpveFq%#CG_n5ah1OP5t)!L}W5Od<>PSiq8F1kTP!l86j4WJ9cCZe{H(gZm`S?4iJSnx>#K8KO? zS?DpCO;%`?VhimM?5%lY>x1e*b|1NZxxI2GI=e5wVs|dMTd3ZXH|{CcHf3YC;y2^B zpPe;=Y+ki1*St%p?Yd{|l3qh^_*Xg)&%k@#uy!8)1N&_MD(4}{WE>g+)K3x*5_4<8 zgbb}vr?a#QO68=vTBDt-@RW_Ak)|w9>hJtT3P=%Jp0un$Y0~mTlr=)ZA)0q=^6_*_ z0hvW7pOq({1(UBDlP_1jOt3G{8<#^~HnriK+uz*&t-$Q5yBqSIhjNP#3I4-*sGj{yTHkl_4DCQ3y1Ncaj-Owl;<~4181*LBYGbt&)K>J&jIDH8(A&#F`x|GJ~ zEBl+~@CZ#RQ?4{IuD~fO&0rI;4YEcn8LXvk&>&JtC3wLlk7D4|f@}K2cQ9&Wew>X3 zzy#_Chz$^n!D#3_&Z6SChQ@5GLRX|HII4_$YQUstfQhH2Bds=L@)DooY5C*eDnd+{ zKU2t3K21eFLjeIh{#6P-i$Js;>3TkLqU+S5qY`gJUQpx@P+4QF30P|MdeL|m^jKy2 zi8GlXe;tteV1MGte(2xPdM1%Km<(0Gq|2K}#?3>+~Z`+;s?IBkf%cuP_F9_iscg=bKfr-QK!=miWHzG4< zX3yl8?9Kc4!9|8|8F;2I2+OIA??C3z18*Q>E_(bK)B8^Mr1=f^w0Q|IOiec1ETu{cYoHr(u(J}me*=H2i=df(H6HrKxC z!c(4&857-`rj2hprw8($4HKrKt!lDvqAA<>u5GbWMdQ@v8Pl7;RH~vmdwJS;bDTXw zRWxTur&2f5(`WLY^%>Ix3kD&(@A?t6V(Rqt={H}R>3w&}&ihrqTwwjo>6rtwTkf8| zdm!iU&Q~4DS&zVF!Q_(6_MEXn0_NJar!WI581E=%J5j@cktxj^nYoVXRgP5W5dzhJ^$BQlriY zwlHZ(m0c)78wb0;nRRN}PL=`Y;tamH61tYF1e9DjG=cz3u>tC(4 zuv&+vg^Nn{DY3`A&0IRI32j$}11Mc#7V&Q{=#yGnc?ZVu=j)TTtWPycjha3Y+lG}5%FR^MGp^OD>41?i$gd6MdNs(HBMee~WJ2_tbU zh2(BX7LP_cHdAyedFNvB(ejdOdD4?Dm7yMr9`!A9b)rj6VU?5@SgR>OHAqI*^pSBS z&j3&}HgZ71m?-B4TFjIF2me(HzDB`K3a(IpvIw*s=9eYXnT8ifl{{r{RCQk8B`oS7 zf}cR=RqG5Mil?OIS!`&=k5jp8T}w)(%|%6TwRKscb&Jrtg{6WO8VoUYjM_qYr-1)~ zop;*`yPp^EKkz)$T}2reJK3)Y5p`9YP}TO(pmQyGpx&AEt|cf0VR+RNp=!xPi@~)N 
z(h!>q4%(sk8d}d>1qTIZuoQtS`Nn4W3z({(Zl0}$D2Hjy!a6!v(*d*p5L25CqI*p( zw+`Gq@U=tK6#x4z>q=FSM4napv(yE?*EG7iMF?-1P2CIbWz{mzY4E((JIERhhl_oULfmjLH^5>)U9M<}=yzFEAN_$v z5LSKTx#=@AAHRDlzl6-K0%RoTmQY1`_r_f0I92*2>)uxMD9*&c57r!c9;brTg; zc6eyq0$0_L;>_#2#*+%Whlx?CCbD1-n|l_h8&g9GL72aXfb5ti^u+xuztWTrXj6VE z+u1nft4MUIROxg&Hc{XX1LaiYxl%b+ldY7a#46TABth9#DhGZCr*QYo{2c_RLdfmn=;$Imp#K-cGaZJYEC1 z&7e#e&ex|~r%x5y_XzEK@*a3`d{EO+s97e|ESuKlYgR~p9aV?~bv;j8rmOf7$~>s4 z%^ZAmT@z)(KKT?rK7DemU5r8brQm~=B+W`5{C643ly;PR&;p8rDm#+URB3udRW+Fj zZza_s7;S#S9$mZa{I8XwAH zWCEnzZqvBiCT~RQQBRP~cgqUz_HhTCzpMB$-eoTxp@?!DnR%YAlJbwHw}eNPUCfNl zL3H34a_A0Fnqq32va}WW0V)1wBllVlRF+Fvda$fo+-lxi!HZVfrd4QG>b}tWXg#2Q z7CoA(JTY#=2x=Xcmyh7d$|H#J(=!Gob|swuTX{UI{y!Yg)Bnuztoj9wXO-rwwmmxT zIGY$=vr1=V!}zi^3m#X#N>kpWDZjKr{qD4f9fn|4=A8t-!?qsDNkKcFDd7sR%y|U~ z&?PbUT^+>9?6Nn8!J%mEs-%csOH$WSxY8O;;3RB1U@9uXe*$)*3DUC^k{Mf)s^SkL znIE9oCn)$M3Yd}nZi-z*AUc3%kh;ncsdA}mVxp7175b)eoDm4+Fjfe*bZkpuDqXhJ zuzV6GPVAITUn0~xvvdcI7D_#TMXZ)15s9oIyhf}`4PS(uoSmRGm^c@Khy=no(J6UT z9v&DVofuaVsx+~Pbcf?KB!Xrb@7W3A}C8lm&A$QMMU zbHUwOX_q9XMX3Ce2pZ8%yWU8=uRnHvIF4fw$GU#LjZ;&_CfYm?OGHmGu=v*Yo7=y( zLsF*72Rbv|#pV_NwsX1vm&?lKw<3>VeJv%(7~!qH>jVMP^LTUoj6)-Szl<`CbVqJw>&f3 zKg;J@c2h>SI_C^N2sXp5@YZLTSLGGc-9Iwuw^iYQmdUCq=k$5l;I)OPz0*CjUEg24 zWAa&Mf4E$zU4Gx!NbX&V!M16~Oj=mRTnDq8zT58E?ZUDHLh!)6$>4ANdxNR29;aiq z?VPiq>JMwlr!h#LJJoMD-)^3Dy|epX_-H;9Ln@O zxp~n1n{IvAzw&z5jCZEzZWnscRz3MF`d_!WP#YF%!#`_dEvP_htdDD6ObRl07QNkZ zyJNQbowN5=9M3nMC^VfGnogG*+gz3Dp`%$_3xRea&_3Nab9^>&FVLOoz8?r_(uVE@ zjv&obcf)zznT=0(=bE<6^7lNu358P8BP@OOBD9^MM{gkVB#cQ3CIxRJSct~08)B}{a=P_um6 zo3B}^!B!Uuo<&ogdCw9VTYYMoCx4C;c-=4dbCd>AFEMncD-WH8h7KMS$F}@@qlQO^ zX~34L19oHc_08F?FK<&v>xTQfdrFu0EYeQP+dA_7^#&THI1E;I6n=)9N=kJ6pHPrP zz|KVfganN>o~D+6)JCtOHXfDQ_@UlvtNZ)f)wa41&+1GFs|>a}9G>j*!NYwI$X;?Y zPt20{AnwvlZO(gE(Fsz)?C~$V5pmW`c75TnmOADmXyF1)2$j9?CwQe+69R}|g?02Z zDz`#>#MUJ#w;I!WoJgvB&G9d;=DJ-{bLCIdH+i0dkJE=Coh@aTl)emUNRG8X^~-3L zNk2)SCxx1R7G+X*O?I2$h7)<;&#qVMBE8Zojb15`E)nmCiQj$vX#DN`fNnoS0i&|t 
zqZnvQj{hqP82?ijIltq~t=lWK?}Jg!9EY5D*1ePbsE#M-Ij8 zIELE>ck3))%27(o<;*yg+t}-Xn7|ozy`d*(I8hv+9mQeUQ7>E-<4A0&UOWevI&kF9 z^g7uQ*ydi3T3%buHeij`qE@e(OSLU5VUO0KL^W!4qpkIb*C3ACqYa4HMtyjK?u}7D z?&~V{!!}@!HX^tFvAF@{He54@gIcGJ-N*Rh{#cTZ;XX7pFuZ^vg<+RCXEbg8xfcHl z`^NxUAaxZd=#YF_Ikww&en9bXQD*fHVGE9*RkOAF-epbiSWxiuv_y&6CKe*|{;e4O zcL?r|oK;e;63rKJe$u&x$2k)3jJR3zWJG&|19Pt;+piK24wl-m6&wUZ^&7e`5sS|V zM@A%P8NkVhr6Z#(24T4ogeXKMJ=(oN{ z=`6KgbW3WsY7_atDC?wHL&H&4YOMXqoh~arN&O~12YvFkOiKFT_}WvK5ft1z1ow`d zb%z9PsDBIDuXtLT?Iaq90vi3(y#e(foZWGo9Ji5%vuI+vmy)t&`BZJ7KF9|}!&D*x zzu$w2!Bldr^U3|F@W_{`FQoj0zWAG_je@&9XKk07{&gh7&v*hW2D(Ve%FJ%U2p?x( zs9_ma+}{_lvW?_lNg4?LeSEdBUaX~lk}0?J0VI;RlqzYbN~wzQfZELekfws?kq23d z_MvjoLL+-$g1u5pe**;`TmR&t#VFDH%AO*p7P5Q+)*3+&DUc(qhy70u-ckcC((TN1 zvx6`Vp(KAew|=kS-j}oPlRg1)Ku%#K?BHm?-6CvQk~jrySk|qAw_BPk5lL6eDcwlg zTKpWwfPYBA4-lwpkJcUW;%O4j|8K~_w==!$0pvlv{8GOkd6}Rm$}TnE41X51`IlYr zYG$d#;)Pc;^94eX$}adNFjrEdfgb`RzgeS&2cHP4mR)6a9aAMGg_4_Yh9?r6m8d>; zfgS45cvW-_X-~I=$VHo1V$NdD+KE*SrZz&l;H%74(W`@%v9#UDt;Cv@ay{@e=4OgI z9ty|3IF%BX8Y)A)8h*&CnWmtr2Q&V0%6tI*pP?_$@);9tJ6aZY@qbCFA_eUfkdG>j zyqYxiN*aOuB0R%13`_5Udu}G8`+!Q$Q?PKF@$XSug}QOnE7Jx9ze`fsbt(7K-1UyY z;!@h60MHnN?)OlY%qu%D|6UOtv<_sW5YbCFvfC68}YnQ{Fl&`hmTP^rj z!-!Jjkt(>P5ZojLH|4DUV$;$>(-xs=3%pbPz)^dD)#hRYyhyde^g9QpRKtd1U_~LY z0dhw*0)xZ$Dx7GZ>B$~}e7$wWjO*^uz0hfRVtPLmo^jmmz85;4!;QCgvNyMQgHX3& z=2G6f1^SQPMFnqI@P?;*1@9*Le!1XX4y8u##*FzveUS94p%7l*0S{ThWw}+)%w7^! 
z?Gl36zx@x~q1>W%IrqApbsYmtDm)>mKrn^Dll6!^ZZ9BMpEi^{pQ@CaN!s`fd}&e< z!~)^*kN8m>+AqwM&TID2_t$O><=*Y3t)jftSjUq=d3pK&L|=Xly2g}O$yjuq(7cXZ ze&INR;L14**Rp<|Gq%{tbX1pf1jsNil(V)h94Ug2D*X!?&QnGzZNyl~FqMrJ&RUK8ZaIRq{=iX4Oc0j1f8(1WNQX?AM#$sI0*7mk(LSDb<2x=ltI&Ot@miq z)=7irVHUQUL4(6IjGK*(n@KZn&PwBkIzHMABgH?Q^s0JQXQQSg|2e=D4LyUO`qeCs zOHfBj^4yJMGF1G6fgKtpnTsadfwmIVkfs@xz;2s5k2BS9!nkt4)!4Qt&!{Ip{638o z0cl`hzz~9+La-D18aPA_x0qZ_I8MRXgvsU4RAqHEx0)4qCh96&Fh-|v_qbK4?AYqq zRNU8TtzslgUW%EIVg8?JJ?eQo<{7W2$tF1sR{fq!pxh;UL6`9moJ-Fw^xMJz7u55# z`BeHU5@i3cG)lCKVU(7xC@k$3mUict_6XL+8Rul;fr3jFeQibmQfc6V8!&J|JMgLn z3sN!+k!r2XociUDI)HE!1M?B14O$tJBo&OO$^%O0gy1M+IP{d4G%aS?ctkgZym8}e zi&2Yl3?L$^V@@`~#6qnNKm*`<15?jaL>eBkzD1Cu~88lqn6rX{f=;Rp<~viu?Zwskk&h zn8;Ep{Rj_KWU#a}34ab)`oxFbSY63sbOKwoovUxT@zV8|rZ!D?jIW5UK`@JmI>EvtWf-OSkC&iuN4LJP>V;QQ|C9F%pudH2ek zb>*Yei@HwJjoDm$BxpiWVN}Q z`sX?P%+)rS8fOf19DZgE^Yk;f)@fSB#t6UVLqtg!A=kW(VnHX49--01@1rH@3=+HR zLzI-}75j?)my)2#Kq_D$c>uGrvPPXK=fTm1Ho2Vg1_OJ_tI8UxP_Fu# z8^;2xhrFHXJ~A3l4Z`-SFMdp64D*AS&WzzFF{>pH_!T$JEK(U%QwOgUdC~?*%7Ya- zMSxd;aivPsSS9{Z)C{h$p18uW-KBNSu1cIE7;~uxNJ=@VtEs%Fii&=!1#l(RzH_j2 z`+7;i*8J4>lfF#Z^$UFe(-a&{m8|wY>ifZ`GgjhdMGeG}V%$2_przDB@+?wP`mX4S z6jCMym0^Bl|M|6$N8P4U8naV z9^?6$3HeJ~$TFFZo~l$ERuAo7WtUQmbYKO?!X{#ULwqAWOA7m8y&2XV{lR*FX&V~_W-1md6q^5=QmkZ9Nf^+Ge*xQNQ z2`HB}!AqKVV%aaR%G+DWJ6a=@#84L6Z>;PR!MWtl>9=3H{Sqmnm6i3QEIbwAlwMXA zUZz(3_L6V6f2$p38||4Y?2XMU3r!o~Ic?^W(6kv0$OpmpLU4l++%Ut>uAMob5AG}k zj|;)$MQ8Jrp7g{(M*Hzxa8)6&P6(`3vNZCyM#nZhY$cr>3rb^HXnrD&MdP=c1LIsflLrk)h`)S|E~% z_NDqnJ81ogEPmA@(X?^U<~DyHGlv{OFqM%W80nMh$YD4?H847S-(xl=?Z${|=u*K6 zIhWMFOUi~QQ(dYW%C5A9z=@;|wyBs>bVcci5wwji5~Z9}+GsjQPBEGOx!hVQMNbTY z$Fx>KZdh$nP|1PjW4GZ78u3vNwqXNp!z3*is z#h&3_b3$X>N=_(~y4R{vyOp)Zp|%U6B9hvE7UwC+sF|Jg%W-IqBkf{6w?-Z8Tj1W! 
z$tg~j0}VHjk@Kx=+KW5s3y_r zncf~4agwAkDVk+Eo+enyqbb_ z1fpq5N*f9}3*i$KkO~9CANeHRvR%c+x1y+|Si$&hsN)QIg%0M4u1SYu`s%(ReX-Ey;fyiSP(Gn!tJLY}Z&p1y)kPwvJWwO6Xogdz!t@ zE`bBqMUR>+o79RP^=*ogEyIBgc$MUs)3d-~5}HU3c#;=YMFV#7D0mV{QX(qtGAyQS zHIMY{sawq>Pn@8w=11tdBz>yQZW0`ermoIh5ti+^yYZb9zrS5R3Q`g}<$ax*L-4pH zJMUbY_jkfGM$5|kp4usUHhs^tYPJoVL8wXsyR~gO=i&#B8dhsDv?z06@*reN#Ya;D zEw{GZ+#;Rgwp9phEjpKG_L8Zbw0#NAb<=(X_nn@}%aiA)u1sGQmTkV<@QzRDI&sf= z66tUUI=Ln1TsdV&Y~CU>XsbA9Fy~lS46K|!2<{8ix&hJM%Zlxtg?1=&?Jd@Zroz~V zI;wM?(3B6F*x2KI!5sD@#xTzZ*L}ZX{iLB-vv_I`^ip#*Yl^<*(=|CCoGS*4HTBsQ z`IgHFarf@68?4T~I8Z+*{nzb+_$1&B)Cy z*&x(zr2F-Emwcy1O?lv3o`W%|Z)?G~Q}FG)+m-X}%=`8ie1`?!;k>UKX0UZ_*s3-k z)S)5SmN|odb3LA(`nXU_MElO8NMDxoEl0*q%E&F*D%5U64xFv}1mD`ZcFwn9ZjDAF zB3iMdk*DmWSg&0tr2ippZ=#D#wAI^wOd9n9QYr0r|A@%v44<=n&iXmq&!}|dP{^*& z146^5$?%4L99&8KL+taG32PHu)Qj03K)@aei^S9TK|%N1hBazSj?uh-#k z+>B$SoIPW8+tUml?$Q_2c}+LkD7kP2!l%y3v5?ldjoUTu9FT;V|xSZ$HHLT zpm9f~Jrh(N(^7e5B$E2W9GnZ`OEHJaD@wa>W$&%!`{RVU67fvD=LrkrKpTgGKXaY( zlM}F7Wg}oD(uWD?M4Reqjxnvn7I=hZ2pkT49>PJTL;W$0V(9~PnC%A4*d9mQ?x#KA z0D*riEd%CLp7twAd4_@hr0A7wY&73pbZQq9^aa^bUGdn^IULKi7u6*RQDgx9Wn2Kh;|)J{o#%ILnqhV zI_3Lj)0<7d-jZ+XoOw}b>YBt6R@8!;j+sWGW?Rm=P2$t)*x-Zd%)WtH5(DBejnXJd zXn}>H1rp=-tN1<=yY>h4IhRsE>PZSoNZzUWJrvNF5UbC|K&8gCCPw^m%B5HGJro?J z;1~jh?)wBj-L74`r0-V4BvcEBzoa$y7{c$-FZLz+97M$ackIVb&bM|lB^}n>DKy~! 
zv$^$$5i(k(zP(tt>Y>Td3<|0d98l@r0*NkBRCu6k62@jD*6gO4On~??S$b2NTq%% z!hF?O4N{g zIFzPb1?FQEQaxo0$Wp0FB$;w*URca-(WI5CZj zZ#CU)$}QW31EK@p3EVxC-_o7;AA$S&K)4X-5CR?3d_I7a$$Ss|i#}@6_uV!3{T&7W z4goGpJPkQNp`(pe#FptS1U3tS&9nb+d*2@1#*yYXgBJl1Bmfd10SE*E@GX)MUlR3x zS&}KqmMrUGTWg!91W2MpJv=0BiwF9wi<6b^d|7HYIgx8`8QNTx=+vf;dzH#+D@iT0 z+0ANm$ps_G2P`a|=w0vCUFH6OOgg?zrBeBQJr6uIl;qk!?yf3RWKT~|cTc0IyXWig z@%w61E4JMYu1*Jciou;~p3p;imYf!vl@?jRwGUc#cjP&GtZ_ z(j}8|v7OU4y3s&sOt2NA7|{Yk3k8SiZjfT6=}{`g3Q!sa9n^1si)`<5IbDO%zfiN{ z+Q`+BoDp%5qAp*qjA9U+*cno_g|7DS>y5z$M|IAKGjdgVr>mz=Os^1~O}R2kg0FXy zW0_!vQr(iXP!c3U{^_R@HPd@x)|e}&B*>9!7Aq+B7QYgLo2sUq5oc5wYMs^39-K9c z!R}lcC7C%-XU;+~D`%~m>JTmUi{*6rBJV=UYH-MKR!Sb3EeBplu8fi-@maE_J?SK9 z?j;Kn+1HBw{NwoL>Y>OH7GFkEvGgYf*UYN)#sD%^4~dEiZj8;amN1g*`GOp9IFG9` zIlqB9bB4?Z)){5*8jo5o8sKJLmBS1kd1lA>*|QfVi0tj-!(=tfB=C~(1@A~b=-soG z`NxPXzcLqSBS7`(u^J2ds9~~Th0QyB0e1vz5}8tA4FnkA1fB{Swd_0k2@F8!t2kiA zu6_l;jlpU_M?pDrK{-odIi}A7w;d%~3dEF{Fh2+Q{&67|WGse>p7jdheelq>=MTj< zYm)9to}K%4fQ~mEAzVIhn2;u%GSr%&=f`Y@Na0HV;a~p z6NSZDO``&1KZwZvkn&+A(G5|0yI@hjH8#dU6P&YG!-zi^T*p{*)Y<(uxFetl;3bW1XHFzGn-K(DK)ys`_M(yE3fD_2#U&pT~` zJFxDV@#&MvrWcYmFN(Go<%|n<_vH#US@sFvMN6PKb$0Ol1qMo9w6#=8hEwP$3`}h` znLSm!-#RJ)`GNkE!z%EuVYN4?<0)<_=6((|M+ z?-d9D62zEXKBqbnSWbA#*7D$mZEP7fvKBCV@uKNKK6R5TrLa`WhiMeI0)9khC`V0X zEJ9MA;^~7AeqQVQHFRCpLa_M&8HkNS^CN$cse}$8gAk*DwznASY}gaSPEr~n4T!9h zk;~Gkl*fKyk~9Sr4}IT7gQca5HopS3KwwW%HPNz8SvN4R#G^3M)Rt!Ltkyv_K^tS% znV(!3m9$MP(o1`IQ7xEXa)J9a#r_P9W;Fccl!cmC-Wa|h zqp={CjQOJBr6n(fA8hi0Ae2oZqE5;qgBs~f##8JH@t|!N)KpHWApgRgh1AMXp zYm1M!pWx;`(#)fEZ ztQ>}uFushsPna0amlq(qwx};wp@npcy0p`!`VCPspR`B)%jBwCCViP&p@pxYHABo>!Pj5w`^;oZL+$HA-WQ|1JU-F|A}&UAa}j|#2|X2 z6W1a6I;^%Jg46~Od>f zBovZSd;sFZO0}6LRGd|*SfmT(l$zxvlq*f@jZEaxSo0+xN<-0y(t0E4_Nw#3@JU#S z3=#%B?6Xb`zg3(zI(}>v4!IcbjVNHYW!YC`CQADQ`d@qv!SZ_+W$z&H7nj|$6yzz` zuRxJVCf9zCSM;7Ix|QvKO7?4T`;|P5>}L#TD)$w)3lnYo&y8GEoDvB^M!N(RAwY0= zG^pC#k`*VI*uvN-8ITCSPgED#sE>^1d$8E>gqbQAaHxb`g)5WsKY@DccST 
zKsIE8urFO65X%G8Z{C5}t$G(^eZ>i%0cqa5BUP!|qGrqA7{O$L77w^I4A!>jO%w6T z-t=bB3WV80uocpA#S-WK5YJY=n(!XSYjF*=RQ)`l&FTAS1c3Qq^~aV54)&aiSne_keM zV?KuxTP7#5^h*xyoJGB*IvImgQTe^$H@lRIp#8uN=KEdKW$9@RS_GxEvBEX_4XzPX zWP9Nw?KW%CHU-APn(qS?r&{l0^Fq%=8BdSR3!RWB&Rdbsd%_55Ynd!@27i|SpFP0kCAaCJ5G}YXvy)tN@e___f`DBi_o7bzLq7lWsuO? z_X&SQJCXm5szO(?Y=VD(8tg__2gvkpb)6G^%sj+i_(ju)AN^?nj)Y6<%YGF=E5!0VXQx`Jy zY^L8xzai^9OM2MsgP|;~)VgdfHmS0rjO5<>*wAr2Jjxs?N*2~HQ%z(@D!ffUmdI@3 zcPaKQs=z5JuACZi=&(*8ct-Zgu_O%%yEX(H6W5X84iBt8<4^?)5MtzFST1WGYkD z{h4q-IhAD?=-wsR&8~%+F}s~?U7M<2w`A1UvzseWVQ?!CK=5zuy|OoLYlBs|deI=- z8q|xTJERHqRUhXsOF zjZZs%6Fxss-!$a`tJO5w6FoS-^1B8LsqDLa)C5vQPC-9#r%BY2hJT){LoLCbCdMs&=j0TxjO7O(+2ta0mkk#?#sbTVUIvRu<&|q|!K7sI_ z6g+k@*c*|Ay`;;pX!v8u$Xb~cgBGWeIU{W|GbCJiSE9`ZQ`r}OiVRu(g`udlfbXUo z^!Y$lc^!PFH6wd0RczdgZMfm1l-hLu+#vKZfNT|=So4HKwxrxRjoF)M6dSWPSF*gG z6k5!)-Pdg*9BB_gXXP7jUwJ!auTO+Tdqd*&+xFJGHNi|xLt@~@zU%u^jeS76$Z`Cd zUIcEq^VQc@p~Y6ttJNiBpNM?pCMRoHF*bWbRdH84y5BEu1+f#1!pap2ew5rVO)@rHdZu?OO68U z8r@@NMu9v;YlIUcRhfguEpIbUY&{qo3la$t#Gt@dm_Fd}=u0@p!B$2^t&;Y~GJs2P z=v*`ia5loOf}iiU$UY%Jmc-b2R7Q)Tx@J=zjGaVif!vmRWf4aIstZv9ut+=-8>MDHN8g}!GoW21XLiPC0 zhg)v=uKS?h)xR$l+Mf=+AckJZ*ea)Zh#JD620k_i*s4yL)TGQU^?0@%6r?hFi*E%D}*W!XWP_uwN$h0xfME<)eXph70=) zARsJLv+lR7*@T)ot!csFsWj{&APCg-3jnd7re^?H5R_@bO-+^~U4vsVX#5 zkDI3S!nX+3KB}Lz$9c%kUqp{N;*PlGOe;n?rrV_bTp6yd>>7Lo$5`4T;DGP1i zf-&<-zYO16HCa8Tn&QQ)mE4oAxGO42Cs;V?#x=9skWkH*p!Y#zXNi>E5cj+8@rP%+bk)$FwE%$hMTT z*3jaML#s9MAlegF$g9(jYDwc`S`vOtOVAeeIU4;E55_|UIUd!LCP2BVJSge6QCQGE zEe=#eyfNN%QXPkJKaErA_&f zg7xM9pzh{^HmbZEB{xQ4v(lb_{38p}-G3k>D_Z{dJF?#Uzpw^VM_0U79%r>vy7%6W zuZa7|_;|AAv13|mC2D!(c%d1GwT%v~xeo|r^?vDWp!FH;`*Q<#ZhyggoVTLW=B!y|RS8VeOBs)>vY|5w?Oc81)+AOgs8MpIY)FJ#+3iABZpHW#0 zZC%EIF9SvClfwD&3&$=B`>5<^^w6JBu%0frQ_M@T?@(+F#nw@gc1tq5WGdgpP0q^Ob%lW{XtX%%AORlwkBlK9gB~4&_I7^D zz-&Gh^((M!kDnPlF>*3%WL2s;VHXNB9sE#X6WWutvN##r2uhq)TrTm(nU|cbg;?TJ zRMzfdP*H?N#zQxR^Dip~nYm0E3IwOT3YXaIH?5l+5C}kt{!6y3AR{B8_Q>7q& zsv!PmwhSf7Lr}KtloEMUju=^`Su?3;4#MImTOoIlG{CYhY1}CzZV-D__&i^UHeeI^ 
zgom`(WcEUuqj+NLZ#xgZO{3m{0A4|%+u4$K_K8lo-mJ}3)FtazC2d`B5mRNoYyv}M z^@bnp`Tm~dmi@^C2k-P9N_h{9eTQau0GyzGru{pe$-YCEE0f;CnZET;lyAZ1A=WRj z4#?ItB6=fWi`2KK>o`pyBfVPrOZ2 z#!R&*>F>G?Th8W{SNCV!zGR^5wi{RNQ~MWO-sx9wyV~bg%^wzf_GDa*5(9en>5pA~ znVJ>TZ{M!zoZCM?CicOhPT|c3^=jA0uHIZN=WEXSxQ#$3@ZW9dNH_F}4SjbV-lT73 z%F+JOtK{jX@~VAGKXnw`??5=QGF8_J8z|t_IcLtoU^Eb!0%B@&LYUb%3-HFyIUWd( zQx@=}Wgb)Y?3tVElC3*GcI?VkpfaE@!D-O`jBmx|y?0md6dm1D71I}ITRvN}F1dbB zYR%qTL$}xLCu7m)p)H?wb%?Hxlq)jlyzS~q9zFtIyt9_{>OzU9;e90&=$QSQ7+81p z>8Wi~uTO^-Jl>4Qo2-k7o`@QO5Tdg`VV&9o*jR7iL)(lkaWv)Wy6>%sho9o`bBmmYn*xE{g==)-XMBL^0YMTOZoexQlZ52v(EQlmC9{P2(#-` zb!#49=)kNlUua^kr=$>i#@i;DcN1y|L@dA*Yq{B(h-}#KtZ2{CrQUaaX$WS+ViiTSHiVQjLf1*baY&hr`2FGTd{MzeA7QE_$}lch4V7 zd3Il}$OJpnK{#LSP6d1ChQwh1e`vTll3Kr44DL)P8MC59q-vR3`!9 zY6)-1H>H{ff|_^uizND~ED+y-DGcaly|QmU$iaKLVPD?73^t#iqZI`SI>;E#Ul7P3 zK#?pk%0;zED-kEuLkcgzeFZqegvm2ZcCj+%E~8K;c0FOYKBk@^_3?hz$CwPBz_p31 z6N$)Nql6;kt^d$B<4ZR7%|)>+U9MaxuO)dzbuEiP5F5$38!~lmnfgx11zeRmjtnjJ zoUM_)qn{S?&GMepZ;@_y=@K0G~h zI@z{)-jE7F5p6~0;WrdWmN&CEgvd(vhE`@8`yYBk5=5usFoCr>qAjp_4Sj<30ASLp z$P?`;JtUiY)xus+d9{G=z@l=MxLcOTUeNa%O%SDM>K3nb46&OyMzWK#OMZX*c2Gs|4jE;O>DjS zm$-y*gt^LyojnFfFqkqiD}Pdp$r_p2>cbAZvMy3UCZ9P)D^VfV_aik;GzxE};XQ}u zLfQ!et{&vMSlxBmOhV4dE0YQSwmpF`z&e-&FZb}s-pDtrJe z2LMNf4|Q9}`pe;8c0v{r9P~+{55nRUf@|C`Y3qaax)IaaFoZCb`Dv%4vnzu_!%_9woDyLySIq+w|gF+^xc=Ozp#5j0)xG`0Zg|# zj6ZJ!Q=FP!d)wYXj4F>9=mC?ep(Wk0QEb?F^J}Sw-2hU9TkG$+ldHC*+O~?0tw~1{ z0R*j0eT6cjqbcXa<8m&}(Udex`=DJgbs@ml{4dW|pTf=#=&`3eBGbSB*lmbaP5oX)kqf?a>L9(iYR=fTk`xvbl>;MK8 z_lJ`|Ufr0lE)Gd6Qe%0?GmbFVSQ(CaVcUM3K zNKvVzu%zNl>o^Ufpse2f2{Jyhc3sKCzDWb;1SW%*T1Ywrgmf6d>f*-o~0o*)&} z(w~6q(~$?HSfbLWz+92nkn3);4dklMl<0h zD3ePRpQnH%g^~s$BTuYS{8>ufPQd}JN(I?TfR{B%$ZnG6w^6cOkwLDt9?a;QN?DUu z!E!8zk}iFYNH8V*3u+(J6{okxmIoFvsz>+*(u$mQJc>@1A2Ul!Lo3>jm7$9B!I@b& z)3_E6cOF=b3?v$moDTT=7R#|BbTWoylW1>B+uKEZJER`=$XsW-XQ$Y+bACwd8Mtj9 z$N*G2&@BeK=R#tjZ*DBz|E$>mY`XuT*ncp2=&;y-L<}66D*p@^Luam@$+&}w#&meC z7+#wWZx+Lw;chMrJkIU#Y?1bLiN3D1Z;j|%!`x(ciz{T1CGg#9tETi2bgf+VC*F^| 
zb34+&TI<_$>#1b#bIBtw+-ZAp%5cZgykPUCZ6OiMS;HFIJ?|6O?n{RE-?2TXXkspS zx~C0kPrK-8PkXvWPxk{(@3m$s{W(3(3zaT_wt20WE1>%8fTFKo;=_`%uJ+dVdAx)| zZOXW+)k8FvChbJ(zUtQq#pMHzw`eHhz-ZO&gFd(E?9hb^F#(?&GO!Y2XhhZt{5BZp zGNVlOHq+szHfBxaM;uzUG@mh4q=7;O^8j1Gp?TCmDRe~s-`HqDRj`(Eh7)}$XDe)D z>~4umo#>zIOZV;;dv_=8;go$((!59NtrVWe&?tmYP`0h31d;;u7h}U5yC}hV$re!A znhRv?1jn}LqvInn5EA(HmbMC&m5= z1=}h3B?VJ-gYk~om-#bFBiX-19R7Q{yGWNy6#PfJT!A24Q$VooCjQZG_T^$bl%ktQ zXpbx3q@9aIc@ohnK^!w`loYarG^P0|pbxu1!elmoCFkj1qYk6DB>b9g{1pP)BXL`$ z_jitpP{89uIr>XPByZ_;p`^k4xyj(RV#8SB!6!OLgZDv&HDF2_d=Gq_w;AI-Q`3o0 z{F(tC7B+q>8T<5QLcN(#S0=og{q+Da&~II;c6w3FoonVyPMF)kH_a@R>*+ivpKtcg zyMDO-bC$NKSF=5EtTpy0_T)I6ZUV#*r};x)&?z^@8;#regBb{TkMLt^KR zf$IY|_Fvymo_ZQ`rb=UF#uLctabB?0CmKad^P&+Kx#045?8ctVitd~q=LJh;$-|6D z&4t}^7MwM*AT<|p!WGVm9EZ~_{5+g;&2D4UY)y{CDcLu0t557hD{+zQ<2J+f5CcB5 za*>`)W83Ey3?R*OZR6FA3P@TI;thcAEBQQi^hOd{tcRts z9~+Skbw{D}b#_Q-r-ya0%3>0#F^CL&i(|~h7_MMwPX{+Ifcj)#v@XiuO+gO@y%aDq zg?+BcK$*M_kg=FRs)>vSAo7F_sb5kG`y38XY(E7rQLsS41q#?VS)kZoQ}DkiFw=~z zrl64mk~0hclNJDyXbOKzu|t$^f-c{t;1UH_DfkuzJ!d(k8|JW>XTgkeXj8y*Ys;{+m|{$ulvHx@eTL5-g{j8J+A8>hxmqj+}ba72444o zL-2)_)p+%^^A)&C%)R_>ba0jKdupZv~(6IsP#JK=0;F znKIkg;pxg=QP5M8>!*YI@--3`li%Myp^Qjd*yC!W9Y()E{?I7F8M z!#ZTBy5YT!8YqrBZRnds2i?cVGr;*)C%Kv>lU8cZ%Jc1jF>RXJ3tL0JeXg3FZdQXx z;M;R%o?ivrQ}@gUN?t{N>~JDKb~w@9u6)@psce^2whN!yYBxFmu4#Hu9^yg4*>ifF z7s@MhMx619t#IVZC}!s1;?6=bD`Jb3duf0lr{5*qxOn~I5=Zf+3&z)Uyfqo@S>hRIw)IkG&Evarwg{{zS^`33+0 literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/ernie45_vl_moe.cpython-312.pyc b/model_executor/models/__pycache__/ernie45_vl_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e9e8138c145dc70909946a6de3659fe94b756c5 GIT binary patch literal 27968 zcmd6QdvIIVncv0xO#plWBta4cpWssxAEMr}^?s1HBwL#>aZ#EVlt=+2eF51L7mS_M z+fW;4O|{L6>Uzg?cBiJsZq0PtS!Sl4c-uef-R*R61!aNU(z9%%ZO7eChmPbr-L}*I zzH=V{Xi!el{Lw4%-1B~X=iKjnuXFI{E|;Cd6El8i%KQq){geXgV^%zm_UJk84##t2 z9IxZ`FE@UvZXL6-I7`eHw~yJG-x_no 
zonubsx5ZrXvazzbd(6$k_LwK`9rMP^$I4mQ5vzz-j#VH)Ny()S{WUBPbWiiY_Fa>#ql+-b9`;I`#pmaJIrG15F3bErv(-K z`L~vHKg(H4O4D^oL9 zPL59?B06>DYBJIL2~`-@N!ByTs4$MK@SCR+v8f628J?QCIf}P_W(Kb3Ss|Jf#?c0U zY<6aH>I&j)PbHJlndH>$Ojs{Dj*U-Tjml|~{cLyn)AJ@?A2m!{ z3p309X@*9N<9Xe@h2tjmi`*nom?3j_iDhod|n36s8p6< z3n>!4F&71lCV+so#{{1Pe~HkpoJ31Yg(xteF zR{kaz){oxfupCW^>!@3@MgTIb;fTcU3Jn25$ESwJ#hdu3d>l8|hX$V6;Bk%&YR z1e#zl_}bq*+n0>b_03Jq^ zwsC;0nl_IDm$)b8xWwJljfM>ju0j~uCClWj0PymN?4q#lhR}m|f;tNT3%z&?G{y5Z zUsxc$O4$ok;1C>uBPbUwHV0%;^R4X+{&UySLXxaVzb?#HW-qf>l z{oNby+<5oZcV1nu6x$Dd#v$%=tI^c9#leyLNd=3()_O$H^Jo)J;n!;8zW9%?q%W8^ z;LnIZ6aLs&%A2D`-V!xM%~8vwz6LcZ@mA!p;m?kAN7TwYqc$bQ6}2nAvZ$GN^Sa3Z z`@TK+(jB}v>YT1cHQOG(e1~{<>G(-AU-7zetc>9cH}FN38}fjtaAYV{;tYP{OYf;)xpJ(91Dn=b6HT8n!gp99+Nz>?OV z<-P^1yF)(U5!9#cAt~VO2(XF#pj5f)&%aadYhTBhDtAgPXuF!V?Jk|OARp^vvPx}( z#~4euJdggqr{F&Hl;(#$DdP=Y0wg=ZuhdAX9P&H0{M@uh_2=Ix>(7T&-#q36Wwp6L zu4*+c|9)awZH`Lk;tlhd0kq*(>r0ss(t4P(BIH%mlC7$bjkZLO(TdihnwRP?2~&H@ znR5Le#~XiN|Gw!xWo4ng3bkaCb_n+DSZ|3wFsOm)Qfjxb+Le`!+O@e(Sr$AyK*h>t z%`y*ahG+f-?=PIQJW2bP+UpYiD_s*HZQ{*I+Jo3Ly-W4w-|DxR@}|mDo-2m;Z3R#x zWX+!Bpz1?Q+Dxi(Y&J^wsv)08^`VARE6$<7)(+S@3i?1~341_aIY$NhTm*`{BFAq2 zXx`RKsBuo1od)e?Borx}rd(&>oSe7x&a$*msASk8*%*!{CWDYP5%Pqv1t2yKg~t<7 z$pvC~COUf~l1Sc+MI{rTyg7&1xQxc56NIPdO}*pEA}j>@ekFGjeTYT09Yr`yvE}(#ZF4bTpO9NSN?FV=$(xY{8Q5qn z$_^tug=97!fo4kHI0!ldp+CtigF~P-E9|8hFL7ilTOtoug$#BhfzYXBWTUNQ^pxCt z>Sw+?a-&aE+%a+poWcomPLfj$AK`h#NtWcC+;MX@5>FmBALe zag4A9@2jz9AZ)c=TO)73iaHXbn19f6kAs}8j1jmGthF2cLu)Vp(SZ-W|MbX%%Jx?|ZPEC^>fb0I%9IZ;T5^pY6b|Iv6Vk4~Y0*l(Z3OT+yuzWSjSk&HLA{h|MQ5fs<5wP0k;@ z??1EhwYB5ffzu-Xx1L7huFdM&J2SUuR+8^7yt5!y4M?T_YR3Z z>ZTL1cNT6htni|*lZ9(r@N(zc4^#au5B=es)0cC4-W<6#f>+iV6rDllX%?N$Id4s_ zrZra`So+HHSJM^kkKNW9JIcErS8~1%E&s~EyNBL6BszCdUEXS~$Zw=8S|2;iJ}Tna z@^Ix%i{@N;^_`yEJuAD#@|KlX#qy!`-J!>|9x>(gfBOARnN-ueUZ=Sz(o=s+_Skp-zRfqR{ zx}q6nt5Ej4Hbyi{HkEWsNG_E~OoC5OOc1T5KD1Xb+6z)OaJ|5Rr8pwJN@;x^i*>=U zU`!dN3u{qoLurl1QX616nAnDG5Av3Xy#m_&eZ5M1d>w5pm=??n7Ub6GrF=!$nN2CP 
zDk@`rLvG~ND3*e}$eFUJc^9zPPiwSBK`v|2NNpIY4HURx^ppRC2Qmx;aB|opfHdR4 zPD7HD6zD4!#>k0~^S8-)0Z!O1W2GoXOu~^2*bgLKMAE$^>8?q-mn8>bugL86>O5V{q6eYg^ai7mU+>zcww`=>Q4A} zc=@@vzjpU)t1TaN-|JpC{>b@5XF71=qdiMlh@-bGi@L?#i{ro~Z@zl#)#aP3O{*6& z?!H?_iU)eY_c(QX3ZMJ#mGMk<$D(7i(tqdN?Q<)ROl9{i+oE|%mvdJwy4Vs7+ebeM zzGLA=KOtoK2?x>Wl~1q!bn&16{Ndq~qoWcR_6P$=5oi+^hTupB$Z{mpB!o{%$;eMl zBqc*4DG0Pmu?f!3#spdh0@Ww8pKPNr%d#iNiwgTFZ9kku0sncBBFXs*B|sug{O6XF z3%29}p_srKAXh-`Oa9H9 zi4S>F+7%6!3-**-1zNhTy=r=6|_RLs#TJKsIUy+qbHvSQOY(&C4 z1zcR|)}Jt^yyMAGReIqGYnXjy3{RMSxh0zY z>TR>%q>kYx_y67O7xXRc8@v*V>XJ|fEf8T2{rq_Bdh~=K z%nDLfem)bBh9#95CH_Y8GO?7b`H)q+vS!Lgnr)^amL*6grXYsijTnmlJ;HaWFst|r zQ!c!QG^U~w7+@H%NE$48zc9%H$$cdgb?}7E6Lh3Mj3VOcfaL{M(9aUyD@xg~D1I8<;HlBJp{i zKP6wG&}+LkclLxP?)ne~6aNh~&d(w4au$fY9+HeJQ`x6UxHfzTGQLBQgE%)r;nlYE zp0gi4l?jceFI*Hu7iGa!T~oHMN383ikbko_n62F<*6vz#Z!|Qm^ko_#`KsG=xZgZ@ z>tMRJ2ePe=@<6VsyHfZ8|R zRatkd=x$vZ7u_AJC$rs8i``FuxLfQ#ao>F+Z9S28pV(**zngj|wc+%>dG6M^W!>^n zwr;mrw|lKgtQ+_jCw?^Y!;uHhvyTk=4hVuEAltHXp3Zbnz5;+G>j>R0ZSx>j< z>0Z4cdivJdvU{Ep@!#{zmVt9MJT`O91K^CafnG7tn+x?rLY1xS7VElm_2FD?aM{1o znD*_?HMTrqZ*o|<(1;C;{F!`8tZH5n2; zA?9flJy2DPo^YhxyI&f;~ueb59`(* zF|cQCw-^|Nbni~=cC4VU9of)fF?4wSB{B3AI+zOtsosw6Y{zl2$F*D&!~lhj$YV!+`^C_H4FVfA!IkRO+O_91H3!rluT(C*3h^6N^}t!b zRmpj}(|u>|yU(PpXVUI7o6f3q&ES0}Kmc&;z3-k)$T@wN1W0MMb&T-Sue5~q&XcM9*dek=)TFL@WO)V+sAQu8o^ZN#^x7H=y z%x5Y6lJ1-T7=6}MmHGT?pOk4O_sVGVa(gbJ9%A4qWz}ragdHkX0CN<1TmJ&`GnGmoJBMhR*wr9G1TLgkB zO&x+dt);MO%BW&8GHe_buA-Fici@Eevd+T=<65wAWH1YRR>rC^zAsz}{mIpFs8@|I zPeosm46r;Beh&pDJxt9=tS|g7Id4&vVSMH$nGVf?9h5m38na{vCp4Rg#HOxAg*Pd} z0?v%;6qc#zKV(spF>oSKbu-4KO*Q~UTVUcCFNdTrTKt9;g)(hg!uQFcZA!Aw3R72z zok6ed3WrEK`ah%`G0IUfIW>d*tw`SnRXTBr`8`jnlXEGo&$is@Dp>kG3 zajlgKv0fZ2=PZJ`!oU<&Sive(JD@0_)EbLRtISMM`6{LH;u;dPu@tiga;QLwwcWB2 z{DGprdKz0}!i60h>MJK#-JGrJ5UV;C?YXLu>|HGvtGW^Pw`TpFqQ7%>&+2@}zkkuS z=?J799qFFK*`BAxo~Ix59N#3$Wv|!>Dyu&035uTJA6_SIbJ`Qkcm_cr2g^ifU~y!b zlv>U0+0dXE8q76xtsesY+dTA$(>IshXOwJNqv+~P_Z`jl9T)qK|G4kuW^i{lI4lN1 
zxUV129y~7|Jf8`UWP=yQ;KfYvxwNxB*SIU)uxs&|jn2Mo=YFws|N5a!=b5y#eY1aW zw*Tz?{{qU(EjY;NpbLGu3=aD=tcBo=#&oQA3~lz7)-ix4>BEmMwdNsSv+t# zbKol&$f{d(4l{Fuw)^h3+|a4)&@1wf(uC^wAI|POFYY^^1K;-aW%ObH2=!q<@(iV&jp?Q# zjLO@V_4J6Io{XoD;MD=%O?2)@X-{R=&H6EXE<1cl9KMtpejfcuHxG!O!OiBu^xli< z&?S*gXC5@A*F^zntZZTQi{O45y3z+m&}j1jTN&sQ`U86dnUx9+u8oVK;q-ygk6scF zTo6MSK`2xlo%#Q--2A* zAE*=bYFvRCZISX4Do2ejgj&DMsF;rC5qm%zS|fMqY&qilkJZbfpxO z=)bx$)QUCVGcpiKS)e@Al`_^WDrMvCYOhJDkupN)?tswU`JQ4Z0vK7vkw1ny0qiE_+Dg<2n=tZ~0HPZyf%?y!zXn!D`* zgwg)?r)V67DE$^tc_){A79S`+Fmy_QQD3%_5b^4%DbwpM-(b!l8Yj8m&eWg|#J z?!wdL93v+|4(5mxlH?pE=M6X~=k2|89wDQd1yaHlY13vf*@0u~1ym)5s#|-=f;6pLv9M4`x|7 zi4gV|XeAC&j^81Nctfdz6b6hBV22sxQxPokTJ&ZF{kR^Lk4iAxP|2nNpg_IBWbcOf z)Qs@=sa{f8FiAbzJ|zQmEVBCGbxL5}Bo0?{PtAaYmo-zQz>)kg%u5z@hAA*8C0tRU z64(aBQV?gl0@kY<^or>UWc}9{^d;;lM;`cjsXl2Mgx@FUPssUKTAW>?N1$oe}(e+Oj2+3H)o?aPGrV>CH;N4opiefP1n^;p_{ z407rcYK@KhmX*uto~J(CovDXjpmw9YX1Q%8obEXaWkJvJO!_&uBg8`^5ukoN5Sut(+wwKh9jeq0ueyy37A8X z1^gu;jz30W$O6haXJy(^m-DqOo`yCj8`vcVcCFQC0!MC*FgA0)*ajB+qmE1)n7n2% z^~0O~z}sba%W~C?x!U$E6Iavzh%>{>`4;@uY z{GF?}uRaLwe&B$Tsb%^2+o$iIUU?xsa6$|``%yKV#Sw5up8DmhX?I)N+J=hjo8(NH zK>y;%raPb}$y}RZRAJj(CD}HbnjLvJ;a^hf9&+}PvmZ{OMJw@Z0y)1+p!xwkGUL|0 zOEF$`oFwB_;@!wYyqjW3YGGkmuquY6HWs#%At@^Z7{QR##bTe;2TbjnAu0T*M>Qm+ z5O_NOo$lM+E0tn->z0k8>BC1trz>#MCnIi97T#%4(l1+Zx6SSqQ_kWud zawX8yZyOM=u|;Z29QWI%TgFn%9Ef4+Cwq`XT@Y#%-pHY~bBpY9H8h9(CDdEUp|zj4 zfqhS@v6LE1=uT=jf~EU_*g!#-6z?rlN)N_S2X;lTUX6j)HD$+)JI~yx?H8u!?MqRA z%NOg1CBJdeG>P-fuUlD5@`*xaYoZeqs4dT&7Fv)NoChoc1H#wa9-%#NS+>JyjiszY z-#4pU7W$$wyE}|rOV3+@@(|Zt(U)$qpk{4;z~tCQVOhn_D9eKO#65kULD0Sc8Xs** z1fNGCINviA-KC(vrK9^4u%R1~hz9~1Y>dWIVwkXwp3VnP~j32 zCUZT6bwI8hJ$kf|(+NCj&TKS8VW8OmnZKd}C-^qKfvh(ydc#?7zv%792{B(N>+2JJ zeTz8Nrg=BXuBI0oB{O4U^nRCE+eIh9z|G+0U^z5S>7m2-ox@p2`+Z0IMi5jyENV!U zoVO)W&2yM>i_N4Ox0JzxCKKGn%u^l?A7iw4i|B0mOmBk0lauq+^cM?_bx`=ce6Ds*<`kOiciSa9{{8F5Hh2`M}+@&Jd~zg z#c@fw$wDV=B0oNSHq-wZ{sh$;3^LkNa?49lYj=Jd2g|ln1&3&v255Q-UEsD2I$jwR 
z-NBWbjJqps?P3ENWyG&kS!7TsD;06@j@6TpFHcD1>Ea1OLh3_{jy|48SFwr;(b0QK zj_E?!C~-nf%J@ylq!uhQbX>cT8&Yx<*kQl~ANt`k@I~@91vq3!SQlD&gK~hv`eob* z+R!#or2)JU>&%B#pDLw0&bXP8X*OVlC! z7jkGT5PnL|e<0^SlJk%p62c091t;vtFFBbP`fE!1ujG869NJs+%}FMP*Oyt5((aC2u;;dGGuV<14v4{lwXbG^N0wYT zfO_lnW<`6tqCf5IhnoWz)hGs@S{%vwx>nDMzP*d5A6C_Y?>Tq(+-hYuyiW}8TOat* z;U6CUi>kAWCw|h_1+l^Mq16tsM4yrdFL&k{_GVny-C?M(s?X^l(eMPm^Np;#^S--t zqoOSr>RbPc=s5`lPzeJ|J@Ac{3k2WZcX!{)3u~t{b%$`N4_L65z&@p|0rpYhz~$&D zWRC*`w?T4-$e|S|+$5)yoW10bo=cu5HXUEWHQlG&rrcA-yY_ zI0hm%iU|xrd(=tf zEhjF{F^#$S%F?-ZDlNB;QB+|RWqfti9rfT+ppq@g7X=aaL~Y(EJD!1(K9nqHI=u?u zmKtfBmBYzd;n?`~#CYud2(E&_4GDDZ0q)vT-tr|8l8p`!$WfN-bIb#o%-7KtuBxLX zCGsTW_XI}lY4nUn<6>cB9+VAq454kO9dYN$ugc#+q*ju@qx0aP^tDZhos=ZB$)3WK zLrIMfO6hq`Y?eRqjaoFmF0W$KHiZ(LozS1>WpFCpueO?hBbHFZwkKsQ>We0xd*7(i z{^*PPNpav^8r2Ha-V~;n-X5oeYMK07=tJHT9PM-{pGWnT;{Q;(1lJiUSIZ>ZRbOdn zsLDL_UW4WiJt8E#?Lnz3H40ppX_QVuKmvhvs;xxH0{H`JScJbo)^K%!R^}&&5F~Pl z*O0j?hLW2p_;Ye-Ka{su(uWBjlXHli|3MBh4pK#ODoS@U@&%VtG15;m%a<1lLp<4gc^`Mhh1%Zje5W%T+9!tgWkW~B(9y-?bhiYI z_cwxlYv$~(r^H=PWr9x=8&JQA`zOxcI-7M4!5jyhyv_Avt4zfUL}WjKeji3Mqu=q~ z{*<&P`~pp{msbTN#6Ce4g%my8_UFUJWK!YOY$N!8i)M&eQbxkf@vnXxYPij4}#kt(hQL9no+#6T;7~A=Xn)c z?J8Hwz{rH}#qBbhF@dd7UXcnX!=%FVO*@ZUArcp9yuBz4A-adFXJIjfd# zrb9#PF3@f@cA17_vT;~a)vFX3Yq|8Tiic6-j!YR>idg|Gg09d*stFYYdf|T|hJDw{ zg@~+C$t-`j%B_eB^HxR|$HI_Qu_K-D*m?BDIlo6hAlwZgSaB&v*4rj}+i>>`&e5#B zkntV^Yp&f`Az73=Xp2U!?WX@0Q-22+JP|pypwHjIS!&EgZt=!~+v-Z*cT!4Ps$yu| z6Q=B;3x>SGJQkZ`jzaKMpZ64xT6UpecT>|Aa3JD1w`6z$c^NAtA3LDsyzMEqKMBn# zbIAh*X0=VBJY|~Jc87eGyp}`9y>XPcq|7gK2`f&@)7!|~=q>2p7)j~DFusDmf%RXo zF4%BHKIN{wPhh{D#Ue|w#RcXe-5ig996bhc!=FGo0LQqrki?3E!p3u0%czwJ- zKi-sOI+$O9st4?i$K?MyT}Cm1B9-Q>{Z@Md&jkgw7%GO%M;81%1xsQ_K{!2*s>EJohvwO+Dd~VghS>^;`l$!K z$8o9RN|#vOgE9|sS>?c5<%5bLs?y(#+tU2uB~vcgnho}g!Tu%JPhheWC0X+W&mqZE zwH4-u&g-^%xymNwt?1ZjYFT;i-LZGZ)(n3TS-U8<4riM7EuF=!a35T|cP-m{M(jQF zQIptvK3&L(r>5krdH3T_npZ!p1o_t*&~Vo>{sn8 zsdQi<*U-B;=m6MWYLX&I(5>2A^qI*V*m4E>+?&an{|zJ>|)iiHkPS7 
zx^(;roupo$%QT)}Hli_q-)eI0xd*=eXdApzP5s-hyRMZh5B$ArEq~negPsp95B8kS z)i$hL6Ki)PPcDG_6vRN!(uvI$oH!yIzNL|$R5s=O^*Eyiv&>vmI2Y>r%wevo{;Fy3##opiCeLVts^%DDSds)3*y!cHgjC+cWEzGL26m%a;x0C;s3=U#~K93e*j( zJ-2=!Q+I0VIM(A*3UQm-NFTOzVR*~0KG<~veQ2#-8u=7!G_=yXTK}MOZ_Zhhb=Hf{ z`eptvoXuMk2Ck)Ri#HVT)i5U*1=bxPtT#x`5FDv`PDXUtu5kH&9@EuL(n-Y^kqDK1 z1Un~x9Y3dXh5b0kFQ*OB5#lA*1Gmdur>zmcw8IH4corb+qK;V4j}z+a$7p-_Pug@W7EC=?}InUPPmGLv;CkK7bjYr>6)>hUD_QN4Tz z;<8b+1s+)_s_c~BRKIe1_4&2_brZW8v;Vl*b3$x6xy4b^NBtiq{&e_r=9iT)1flQ| z$r6e1vlIBy1oy-&eldpK#GL3Ie{n(vxorw3V3WALGQpIIk{K5_)7DDlwyeb>_6;}5 zz-S@?;SIK+1UuPDQE)wpnthJ){#8qz8aqF8%rqe(ffj5b7zyB-g`CY)I}U6JAZ^3z5@I z4&fgrXruMeN-@La93*FfoF9?HMDq+2bWpI59Fix(o=L`3hlRty4EWC_HsRnR9i8se zU7YTW?ytC(kGa6dT=h>m=f_;-$6OdOpPM<|uD|4-`Iu||m^<<@H~hKYsMCMS!THS2 z=}zlD=1zX5*X#73n+rEw&~iI z-H$oE9{YKnu6p^!x2Nt-J%*G{SN(X@@SH(6u&jUEcGvcpBmd(Wb5y5mUnzgL=AD|y s9QhyfhG%rT`sL8u;k)6-9Qhwl>Q5VV^(*>!ZSUAt_kYGwh(X~00uYoNR{#J2 literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/ernie_mtp.cpython-312.pyc b/model_executor/models/__pycache__/ernie_mtp.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9fe9738d9fd842f1657dcfe76f826adb7d2a8998 GIT binary patch literal 10929 zcmcIqTWlLwdOkyPc&A886h%ptMO}QMY|)P8__DSWJ5~~VZF$|;Nl|7~mgbBs%Dj1I zWLsPc^=^Z%gl1u7)7Dxf!7{tRMyUg=_MuoUP+*NdrRYO7k+x)}4(cM=hZKFFBL&&E z5AFY-xsX<=1PQhW(&2xubIzIn|Ic^6GymxEI0=M|!4>eoAq~As zd$IRSZk!kRT#6sg=Q7igr-fXaA0E4OA(G8=d?xk@){ZiY{fxv5$;%o3JS2P<1YQ!7 z=^W3U%IC(@S0K;%LMD?nQnu&DE}qW|St#(H%U?-L;sqg};zcnpL`_P|g`|+oWcbXv zvD18#Q-Uw#Q_0JE=F{2BJjbPT*lRn-k0+-xQsM@mzA_;tGWjIO3sBH@CI6jhZbzhdqX zNgY4LbCweM0hHg>au`x#CN_%U0x-=QkrTTC4v2vy&tiu*%=65Z+A(u!(S0W;H>b;hjlBr=<9zB1W zL|HU8GhYxt!C5~83CrblQjE4JtT0DY$BtUP`T=jSAm>AbY0assfO{6)F z&n3ikkyqS8R!ro;I1|t_G?WSnK6{zwC|*oym9dIq%H)6i~4h z2R6B!$)~PHS;ZpqnQ;NvP{8F=teTQJ5{XnMDT;}Nh$Rxqn{7pVJ1xl(zFPaBt~d|T?{1L|CvmjQ*<^cY&E8e@hb#W5?2lIbBeH*F*7|XHp!`f_@Sr?+ zaQT_av2V%8zO@#-!JpIhs_h(+Q1y?sBP{?T;R 
z-CA*Plil0Oe%ak`^z;0N*%WcoiW^qawR`dCvURod$yv(}TGnAqMDe6^g(*qIx59Ix z;wa?Bw3N>0L?sBedOe+=67_c^O5pxP#VZ#0R6>;a0_X$B-%Q~m>j5=@+VH=Fa0jgP zmg$@1dkn{*=5fr`U4$Hik`kFj|G6oYn7dl;3TL-F^XZdAEOb7M4T|{I0ONZ3Q=t1dErJ<;ELP7SVOgHPhux`Ai70rEyUNe z;@!1k-v#l5UC~+3e7gMDt2x=e{Xr;P2}R{lba8k!ba>WN?b}}IdtB~&eC6P|)xNP6 z`!-r*j~v=_-|fGBX70@V%*u{qE5YOU_OH30SvQ+HowJ^roeb=JJN8zr8th%1xhK`k zWZH5LUJuRl<;QPEDEEuT54xQj&k9=bZ%o(CP_JEd1k zH5O#38|b`q?A2p68>Z}}Z^zs2x7;-crktd!r-m(RZsPRb4$p-@_dtryo(>`f&o_eZ zx!1n|<9!q$aHcl^fC5ag@Ro6l1`%4~>{o>c;}*{TvL$YJ5C9jgTefyK2MU~Pi+uMM z`HmteGUwqPTfvoaH`fBt!wI8luhBAFjaW-pqB8gyHU*0~Xlw4d*BMbXA!|E#4F++c#xE zQtHt2z!5eBNn7PML7eCqbq702Zd+2OG)b9W>H&9YX#gV@-T6wmq0)fc#OrW^Dp_79 zZyHYeHPb7-7U*HdXaRFIo-6elt<94WU9SJaYW3Gsx}=W}dNNi>+Nqa9n}{ygfA#S~ z8w0RS!?+kawi_gBI$yNMGKl-dShQW}f~Kkq3Zn$`CyGBemDP7twWiXBs0Mad0nQxQ zHj(CTDj}?w<8MkiJRl&VRtrU?1xv7r+MK#K0cWT<$EZIXg~|#mBy(4I;V3>BC#YDp z>IjAv=XE>PTc+J zMs!PcpcE(dZDn8vM2bT*@gYP2k?z|)IzW1`86Z^$qLqQ8^1xBVGm+}H!OFJd^0woc z>i#&`eP`m;iApdk2csKq67H)8LY2TCIk3kN7NyGGr)2mGJhj1^!>#Ksho|MC6Quc5 z8xHW}^QT_3uXP=&270RjF40j6v;pOzKO!0J4aE>M+5pzH#HJ z#a&yzs7*CR>e4zi;bx_H7_ zkX0Lc26Ht7z_r{oMd5kKqpy=1fC|MZ&M2xpN0S3`OhaJTvFUk;0D<|4yK}`AsfHt! z@UR>nUQDls56_NOy=P{n+uxb{&Kn~OQl@sb?%Yz7AFcR(=u zgwb(~5RD4BSE^q(!o?q=*M-enEw!hl5?V-oZIv=!!C>~`&Za}A)s-sg(IZ3Do zacr5BJBDg5%yN@ZcMUs%Q-cav*S(k;BHMZ&I_zxFfW9C@=dF>N4JA9a!*b|U1G+9q zsoIJ&Ou@eqyqdKB*RUA>|3DYOW$^b1bsd}zYnwB3YH17a+!FM*0Bg&}T`;bWEg|(6 zv{#=$ZsmLkdpSSv;ahl1bMQQFLi5Fc7>$>=pah42RyYLsxHjGoJg)^xfOq8DA+ME! 
z=r#b+A;qp5+@%Y`Q_x5_g^__09zo5h=G5u@4BqD^)C@=ZuZx-ikr~cOa=|lbC^3?L zPy+YE8SdxZMiR_$Rg&LmN17qU&X2t@vs-~z}kjFsbCY|H|({IP}8rg zp#d#lgG%D-5N(2%!M;i`CI@4c;9)s<7-8J~_Tg%1;N5e}!FwI6d(HrF7DRaG1llaL zqw4nEK09}|;_e4>%}5N$?t%Mly=Cj7W3_GnhRqVf7H;s2Q2prK(TZ#6eb-R6f5+QH zZw*!YkI4N;D*Y$q{u8sKx6jR;`;EI7=nuCFnFNX{wg!H09<6uxvJwh8e46szO;kOIEj#kpd|857qBj}@eBLF0e zabuE@)GHH}$HIL)W!M}fP^KQ?Gzc(7t4b4!wr>*7bQkh4d=Cjh@EDr|o);0mm8^F) z=r?07nZYNS)lIp$i`R+U2)%OxE*r0FBHEb>GWiHS^Z;B&71cSnI>e(8p|fF(4XBsA6n+z8vBQ}AqY&$Fp-}9D2$sM}d;wK?cFEpdi#^MhU%P+hzL#C~URtqVqDrLLRGSwc zIsOa1&&l5A-rXqRc3nXI=t)Z*aR! z&koHTz0g`>o3X~AxZ2nR=9Rr1DCl(p(gDswr7(=w*qDsrB5twDFZQFjqJsMcwD4Y}BmSuU)-PufY`SHT9Rd23C0mI#H}wwVbO)j#ihs z4o%%x;MOKFRZuHAXo_B_p;AGM9QI+;$^|4fi!It|F{avLu}#PWIW>*@b1a>^o){PS zYl;~dsG>WYDa81ja6KrAu_7D88$*Rn&>ss@RO%@jorvOj^jh7cSdi3Ge8|~z^;>Zf zhbwujj=C9h--K0-y~}R4nvadZs!Z z#Yr2`I|;?5O_A2KixY4O1ALCc(kgUVA}%m^P9vySHg0^7_T7P4Y&U#qWA@)cOYtY5 zB)7}kRbQy;9bB}?-aS+dELi4dsvX^xjstSXfu&2oeDN1A z-Wy!)cy`|M(c@=j-(&O6g|myH2W{Kn7}z#c4TP(0p|VA88w3$ff$am39~fG)ET5J4 zJ&VO1-Sp6ScByN%hn;A6PSiwTUFs(@YWvLzma~$y=wh$ATstcGknhg>CFLlCo_0_ zFb2G5ESavu{nRV=lBwBl$H7e{Q0L}1;pPYFbKC_mezU%Rm`Wxjt1lApF3tO}YHe+G z@_hzcfhV#TFFAFseNHvQh%~`RRF#jQ$^(#!WRp^Af))S)mx6rupFu?tcVl(VD$<#TbW!880VQwcj$nz2_+6+AmTG4Q zz*2kROwit~($TYyf9c*od$#KBTHxOI_G>m$4Tcw9To_wyTL}zTg9GI=<)e$qCD-E2 z^2wFJ6F?@1TETvRI|={LRr8SW2;S8OZkOgtRd=L3wiI3JUOv0he;V+k4^AOqX}-X$ zOC_kx=NlQSz*l3a@KD-7Q&CWe`U|(!Q-usK+`#fb#|W{dfJ@Yz!TL6qVDv-y7l&Yt zbOx)1JxLiX{6z)}e`^tjeZ0MRz>4J;O>V8JMF zRfB6;ykqU%zceauJ6v;N5*VxoQpLeyk95>;e0G0P z)lIiX>SfZ%7qn3CqHvSM@AT?4fJzt8`>Nq)&DEi*pXq5>B{3zXGorfZduSVxzcz)R z915t&ijy8p#{nvc0wQpQ&EOY%>eofA+L&Sp9~N52=zW^{A;3voTzm!Z5cPG?k1g!S zU=;8T!V?0r44~9R%s^B#GYs=^A7OU=Mj-O2QnpL+YuTW2;1h#QxfgG_Kizz=*j2#6a3^R$Tx%nG-s=cX5i KJ|mc-qxoM2Zcdp1 literal 0 HcmV?d00001 diff --git a/model_executor/models/__pycache__/exaone.cpython-312.pyc b/model_executor/models/__pycache__/exaone.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..765dbd7f0eab50a2e66ec468596c99cfa997100b GIT binary patch literal 19637 zcmdsfYj9iFo!`B9KM8;!36KOyh@?nDeCa{cvMt%xTB0ReWMui3AoijV? zB?e67uG^4_Q&ZEXqFbkCvTZ}vZq02wv&wYZEWyXg$Bs11<2@~qNzXUAVUa75Rc zwA1bHe=hC?NP%)@x1V|?o_pS}d(Qv-{*Qw{b-NuLo}}r!i9b2baeqkx?Xgxo_Z~HH z+;vXiqMX1Bh9sXdL=8OojY(t56g4rwDQQkwq88>iC#_K{(k)4A$`-Y;IBU|Lazq`> zZ%aB;uBeOo?MZj4A=<$Fj-)5$je1kQsE>u6$;OmF>Q6OAn^@SDY)-XATbSRS45Wh5 zAoDjQTT^Y(wp4qxJr#k?M?gGQT(3l?q40%q~MzA~vwRVdUNO;JoiwIv>Siz}UJ`qNjcdaY!6~-&`5rV^_?P-i0SmrDsG* zOizo$nRN0>_{8OSCM||j89_{jr!%R!crI}+DTZH9vL8A;4Z@kCk_p30xTH*u_h+>~uUS#?tW=jn{uFjqj5Zg+x3jzQE=u`zcSK4m0?? zI_uffnX|_*GnLSpGZBm8J2s!oysUof(`q{nXUh14J5KO|0y+&&38&f zV@iK2eo>5xskz*hSfZx+%J<2}X5!QM&{q^E%b29vvWiO`#+)S1BrZ>@i&b?!mB76- z@O&F|PvoLJXrIB((Lcc{7(}CB#Ggqt31-nO@C%@nfar@ex_lGo8blfn{H4+ z$D3T#w!~fLqxR>yh)Zcu7yr|kAz||LnJLA2J|PHVnynedw`~Ztj1)&D9m7yuuh9p_ zlBIR6n9e2QS;e9bLa|`k&r9iPjcn66Z1^KmbFTpB`}}n-hqwAHTl3uauuZj%&2zcR zrscRipEu0%7#L#m=DaCy`KO#<_<`XAqqed0T&zOEI4)<`>dYHZQp;~besfL!YVAhB za`9n~lRP=MR#LF$jcCL61LOO&Uhw=~KBs*Oy^m;>M#P>gT*N;0DNZKtx9*Wy%w}wU zE;c7+E=0_VBaxn)!&%FO4qKvvp^{8EIRK{yN6k!e=uhFRK=u;Q%Ayd$q-%3?Dw z1Yb;7$cHF|@ktVw*k*e)ys&2^{1qZ@lr5adhO60TGEz95d3_{eSImgTqP4}aRuj2c zO!88*uG;=fjTB{3I|ecoPtJ=cBq<|F4J-kT#%9tH8#*K=XQVzvO8w*vki*8tMo9xy z?6KH%GM>%GVp*EGu=-s6?a9$xYHoBcF*lrxgA(P2FV8>x$l;3*i=)ZJxzV{Rx$~Ly z=p|6VQI)=lm&Ix9QArJEM@2@#N9L|b7Z@nv zf3T-E)5`__UStX&(_7AR1Lta4ZMo&_y3-gcHSSw~bbY8W{&KP^Ez)rcyDKYmT1+Nmu^YT}1hH9~^(Wb&q`mMm z=V?la!}-mdT-n7PdbrfwR%!`;<_Vd5?luR@X1vQ5F4R@FlF!B=&Q3lD7l^En$t|O0 zCk0)c$6t1nuYvP5m8p2y%Q+f8_rdo+kC@CpHb~ayG@8?&){5`m&*8cMNKg+391V?? 
zqrp@;8caA8%s3P*qE)ob7^s@cVPV5TVHfR{kVA9`PSIHjxkRhr#vu}9he!hs5trZ* z-4{Z7e{_%F-6h^#KYqp{_}(-{J?uF0;yCgv9(Bd&_-G20hn+5ruqgKQd@6PxNC8JC zd2!;aAqN49Snj--iz_bXCnSL5S_!1$mt%98Y=S@t4(1A^$W}>{;+iI)RXN5OC6zydr&8B;2qD^PQ>f;SxrQ&UVAl` z+TT&9UVk-L{gxOA;bKlLA9B&5x#M|YS^aC&&n1`^&}Z~dha&(nT#Sxe_T&MTVmcVf z?a||DzU#~lV(ZKSV)cG#HK0{{-j;U)`j`QiEbmuVF52^Hd2&Pu*|TfCb^5?&&7jwA zMeVj4Fs6TZ!M^Cu+umr-+b$4v*zsulM{v-dkPL!z(VZh;!Jb`5U#CBqLv!AecYok| zzp_s-#ulxn+^FV5oxk(SK;Aa3T=k9Edh5ecnP)dG=LNE}o2431(w8LL-|yFy(@M1b~xOC>=yA zFDRzj+4-3Xg0K>5lde$v#H>^)Dh|?kR+L_#F1o=KrNzwUST=VhDgM?x63^w*Kn!P* zD7{3E4s|1~(nAzMa81RGwAiJulk*%opiP`&L`$qRzy>IL4u|(wDcIlzinVFOQy3FU$UthQeA@7EJV>-x6cg*Zhk+gG4As|14(l2l z4r_=;t+rOhxon)YCKYRLj`dC=u%O}z+O>p}s?SvwUC=x%1a};MI&MuPHQR%=@EhRP zPz`P^wf7My4c=)S-)ua*WG!_KP&fq8RP^-7o}RT?*)zQ1yX84pupKOT4whOwimk(P z>+nWEZaoaF+ED5qp(0%+r>E%bmYv;ev$veX05DC#qJN+4-?#p<>>sQo3nDyiOtXiL(QKAyVfl0j*a7+!AH=l z8?C*4bopq(87_sw#n6x(8rm=w_aB$>KXiQQ_*;{umbUBZYw5Mzj~9QqD7TCet}X@I zi-7?-Ft9%M;n5pMg1rh7jLMhL5;y41B4Yi++A z+F$DGDRzy^UE{SN{%Zw)|CYzrN+907)x@>#V=WHKfx&g*!_1{Wwed)8gWfur)k(anJ;el~t<;8ZEJXKC`2wr+yAlgpEpUYCg1XeVByt$)jH zZKtL^TVAfYlk!b1Piff^I=y_FeZPG&I))M&gYsg*--B*9qwM$W3^Xcs4e?|Zx5m(8 z1Buz`S_Xjt0|PaXTkg(g=Lc z7R`&6MI3h*>{`j{oALG*4I;38dQ)yA zMzEd5{vCtE+;Z0+ynf``k+KK-^ymL) zma$1aHGeOenZBs+jj6xkp?yE&N3fgG$lr%&2jd5p=s_||FnJ~hGpzu7_Pm1uYaW2l z@P36x0d!K`>;d32PAxbHG^x6Q8_vJsAON|l;EL9QV%IB;xc@eoMny41I;#{a9InMM z9reD34rG3gl35~VJAurd)`610m7UXFbgl*VklXthxizA=Xu&_Q6>zoDN!OsRb)CR& zJK2a8WM2(@!tN&<8y!7``8Zs~vc~#@HR58Y1Lz+W%M_r-M zpCWrU42RKYZ`0e~T>fUMr3(js+01!6?{P-2qs$rY4s}2a$SEz7vy-a*0Y#GYr})eM z63&CDT6=}4;a#C=lT_vT`tp)`a;rR)FM>gbPv)vb3n|8`oCfgA z@G~%DULhd>Wd&Od9fz9EbNd>e2?X{h;s6DgW@25EWiTyd1B<&NO*bUo64||I3!*csDG1zXhZ5a&_BKN*P(fhFMeYkAo z+^vPy$7I(LD8+jAt@jlUKKs!No84y%wl2i9?OAInL>{^M&}QqgUvq{w$A5kqRjo{{ zo`5!Of_E&DF0DDj@I$B=9FT(pP}~(;_RB5%@3e+X-j+gec>Pjw^pre$3Psu)=;Z0H z0T>K&FkK&GrP~{bm43^<)yXw=mx5ZIoxR1*ak&$k!j59#fE+l08t(Xlt1WA7>o0El z4yoG07gnACAKye3*oco0txgm}BXVeDW1=`VDUVIwZknRX??8xBFQvl4SK)?Ylf-O7 
zfF^62P>uHJ2-Z{0F+6;{?Pu}g@_v6G56C3WEmpA)P%C4TJ z$<@PH5%1aGvFi}7wHpu)0w6blR8RkpBR`C+Pu@Jd*?nBLbuGD9E>nKnJGQls&EP16 zL!Qv;`GTjfVC!Qb2cTc&k%7$X@yLw(r4vBqjY&`_T_xuYIN*rq=aS+BIN|S5R&uuR zm;DhO!i+X=CAOK`y~%{hQ^axS=gqtuXYx85Rb~rV6@;--7NMT-(>FB zu^s$1*p5Or4!CdV+R$2)+_<-Frzi*5)}{Hk^UL|w7vFp3omUE;h-}~Yg%fEi_@hp5 z1Ap2fd=J)e+rS@zlXn3`R3n%f6N7l>Hyfe=5u)XSeqf=(%0pVvdhs(bm;oHKL3`@m zwXB^P*f{Ff0L`ggP;Moch^EOo)!?WW|ags$vt=e;1BMZQfoCSYI{N4K>3{8I9_-)g- z&D%_NAZ**t{?^A7^+bK&F(GdqEchK*9@Nvge8;@JO{cj`8Zn*dESu_mOytms&8|80 zK2&v&?_<#P)WcobFluU1(`#MnNS*S!E~lDP8pyS4A&|{FOB3~nv>esYl%d#s7*(u@avj7 z_;p&l%@+q)xK#ym~GK|#Dg{is*mG#8M&UGJD}wR-)EgQ)Niq-W*r#= zi`rnn$-tk!Bd9Xk5j6gLG?CvoVlQa>g2`+6HKZI=Oq-VQGW4^ zA|r8-;#nd?auEd8bL{F>6`(1$BguH`oDhF}p|55e>+o#k2*h6TWcG1*)O;_aPg$A; z0Jhuh!vHI#jtKn^bbM|%hg7M_f??#m^aE;qg`5mIbcjaUs?ZjCKB5?!b)WtGj7BP| zqoH0aj!R^;sVd@(ufqtE2sKj%&LpHPY(3-Yi;6#&5Xl-a+^L zX-P{o$%>EZFI9`gimjwG3r%DW<;V`xE~bTwmp{*-v-Eu`M_74A+C$#1FR{pkK%c>h ze@2bg`5-EICQ1UC&y!C+S&N)2vPVW*^>6Y7eF4H3|LZDu$L7{-%h51pkD)8w_@Z3`FM2IaOvM0x;mi~e5OPx>v}9cSRX zj~AW&x19Ze!$nu$Emz+MN8W$DI5=@@aN?7|aLL#-673hF?>)CAG~QP9)3M z*t%Kwrx;vUnzotCsutN$z_BK>>cY=X>!%iK)6ZA|_1Dr9{S2y|2Na-3YR#M8fM|Kq zoc8Sky>)xOMYF&yTG9<*4^#^xsMv_mEt>P7`OIgjktFP9DMBE0$AgwNKvt8YogBxx zSHZ_FS`8d5Ees?h(NZ);-CI6W4gJ)|TjxZ~Hq=N?P?KPMS64|sk2B$dekN5r!dk4% zU8O}^-VB;-nyR68h8n+vkuB+8q7Lcbl0$S{`hc8&MGo~)S|^8$JfwdOCt|NIEtOyU zca-$+$}g^SLz1|z)2`g{ym7vMd|kz~S?kYbe$gsD z3pJ3H?X+5Xq)jmA^!=mljYWIjasm5h=cA1UhUx=YTZX(P4I_BLqV8YpF|yjSXjq2l z@}>*Atg%`b6yR7=^zhtU%XtI#=qs2L%i8ZdK(gQ@#)yibZkE}B17@o$OFu%Y{6eK z>%wU_l1N_yWD;LnXs(Hz&SYcEI#$>gjf*gqXvMwlLJn(d!8}4T*hqu)E%Zb2lAV=M zy(5!_88)s&5!h8cbQwrBzhzf4^=vqRZP@HJyIQ!;;3FG zQp~I{Yf*79KeISjoR!hD^0w@GSiKXL*r_2^b%lzHTCLn*%4+6bD&D9~d5b9je}r?D z`?bN^?I_qfAxVU#!C0|RLL!GF@IeW%AKRd&L`x~Cw^wy>^xEM zo>+0fLFU%pwP)nu=!*GHxPRTe{;k_jzPK5FX~lha@4@2Uae43f=HA0MkKNjP^xEXl zjuxIfD?jx-4hdi9XK1H$Ra|@Zw)YUuk)n5x?A^1r@3t3kH2CKaJu7<;tT?mY)v+r4!qrn2d9HV`Ji~L|4(+&8 zk*HUipr!gWIVZ_+lJg8X-kTU_tpZqu41z&0&Kd;MyN0Myfb>tW!Uvh&EDu?rU2s53=!7B|cN!{ELN}y@ 
zHb~j*^@KgR=TJ8uGBz(MZP^99wp~j%)-NsiMVH`XGQ_4`a;T}hv@W`+EF^@@kPy0s zmR;)DrL@{IMi3w=p%4^3q8Il;>W`B#2t;5p6F!lF7?f;9$wnq$^y9#4S9~gXoybT} z#pko}H$R;$>>a~ybr+M;ymhSw~Q38%8N19i>`fU6`yJo zS0gh(0pjWuDyjJd0tk20Ub-hBv_mugw~%7eJ*j$9Ofr## ztV-wf*nJG*C)iWHW|*(JRG2pix+zN42okl}^m00%qv64Ni&nZ`gJg{LFcr)FKccMsV1))3*M zIoGe1L(l5q$h`RjtA@xh4qb8yDdM13Qk&InmOH(t$9+5y*94GyV3jHBDtT>6@7{!-*;8UEqW1h}} zb%aT~%E_)%2I`XZ&&lZ{hi#T0laFm8i;7-U&_%0D@v{ElZnJicTVk@fe}lC9p0$bfQ=5VDEs~EyUUeF07ldY> z$SqH#)VHtLH!k;$7yBNS`yMU!9h3Wxt(dO6uDNb|dp?0~x^I2>rhl{T=+d)ydI5PL zY@-&|+o9&(i`xFowFlh@?b~c0WK~-@x3B05$*xe*bp$GJpeDOp6N5<~(nQkg+QuFe z>0;#TFg#2EhWE~=9)n!HBB}|+4AL)9P{kR41YfO4jKwbOxsRNW>=1?-K}4B|SrvvY zEH7ZLWKU#0vgsKs*v4u^Vv;TqsW0~$K^JzH=#erwuhSpmIkjQ%(l1dG1{+Lt%VsT2 z71@i0Mo2q`X@_9}5a^c!5VJO~Tq$&q-*iLb5p*oM zfQh`2Y_zLjC4>CiJ{!?Z_SqhXXV+137tq+w2Oq8y3-vmw!=FSBP>w&xOQj+et3+y4 zv8cOtn_;5TK))n0kBjJP;zH$TE^7DS_8g5gXxi==v<-ITMPHxn>%#?%{15XRuWtID zg2-Ao^OlI$sz2H}q}0kX^CY9IGsb%MM-t?dvwRxAa8Nx1X$CG3UD@%}E^acNwN)U& z2DvEtw}w+5eCede7bg)=YUZg`_($pTvgqAcCAr+e+;P0wJ# zHpmtabVXB_91CHa@CQ~!nle(KruBNgXRuP}M9i5hfsjVF1pn>rBeTGPcn=wn> zgKN%$se^GTVUSEL5jx5z^jr5ny1JS z#TttVnQ2^2_e^K-^DNBhD?1WDH_fIUIuIz&a=4eCWx{90lDk5Z3-vz4Rhn-iV~VTh zsL!&C6iNeW@~f8P>K(Rr7D;mlYxNHr3v3&x5kEAEIGD zO~G#UsdxDXA5?<<6ip3?Vc4es0VUWEnN)tMM*V?+2i0E{kcn)osh0>3J)nM7MM{Dr z_Om?tW>o=NfM#~MUU7AzRoXOCku}Zd5?CQjMn=4u;${W2;%oEtlSWKEK|eWywS?cA z5wj9OYN>;An$@C=7BXBHrldXObd%EqN3l#YYNP((!B;5y56JlrIq#A~i}?npF6^W5 zLr1L=sK=%MgCHmYm;G;W%0`~&KO5utr}_WR^?uBSKIU5fl5>5`HGRxQSj^GC;70#~ z`^v}Mk&n5

H`+>V5yoExIVqaN8HweE zDyP_ejOKhach_CAxN?Mx&`emC^=qxh%AdtRn2pjMQeGgDoDQ- zk4q|UoW6cqC~4wLnwISD_YPe>lydn}E_bqcJ?~oo$mP5F6^P(c)%8oIz8e>=UwG=s za}+}M>GLmpOr_0FOa-L{a|O$#SSkosKW~M!u4iFv(Yk91%Z7?%VB?~-?{RVYjRV&Y z2*r(jaihKz-y>J&LKpAqQ8KsXBJ)DZ+-)jpdSc2iDVWP&F4kqfe&LNp>ki7iJlU{e z(Yg_W!phqB`)>6KzINW%uFqaqiTL|hZe8JB9VkdB4f3VIr~0BQJ5W?*LDE{5URJWV zf6=-xWvvjbwY;@9@Krm6s(!wzKWX3hG@F)6xu*Q`WO*>zx^2$8~I-JPIS?}_2d5KY*Y0*$dId1{R#-`w07K_+&Z{u zJ-VXQoqW|!p=uvrwJ&MkudjYMOeG9>_lQyoD;< zeOzAse%Y-up}dtZZ(XID6>TW5qF|0Bpc-TQ?sY6$2cEmU>SS=s53O6Jweuie=fO1T z4$V<%9`K|avXvXAIK0vYMbQQofW}ORMYQIHX3>l`x01 zl^9spDZ!=}pv6FXYb1ZlphcxNjTUuPZKf@BgY_QwwRnj_B>14pl9Vtgf2cWYVTh?J zM`)o&F~b&ZZvQ6QV>W9x`z5#n*}^KESBrmc4nsPTmKVuO*cDf0BSvZ)sI!5pM;Kc7 zU&{*TNm9u*<5;CQO7Do3=oCR}6dJVOuwL;(4SwEQhDq-5OnPTf_Y7{RGRYTjC01sg zN~-EH>K6tjtGyFFwNXv1dN_MLsNCo!w3lDvEsM8mIT`h2MUI=n8?C1($)EeZPAjoZ zO|E)YDN%jP(3%};WYuG&6ce`Mjgx!wShpHg^%yC@=-PUN8d3Eqxy|P2;yvfsiawD; zZ4LVUYKE$Zq=gwgR%JaeUKYve(%$SUrJxtji}mHLN_y>Au1kX!qWGvTFVEWy%GL4U z#VH4;4d~9u->*=O^;f~*KJHg>&;3(!Zo>J|e}-br9kZAn9*wg7gdb6VnmKW?OIaG| zmpuL`tRnnOfx!<=b6FihAEU#+YXm9`!y}gw?Xv$NuYEH)=1;f`RfwjAPa69Ghp zn1cn)F!x>Z=E07FRw9ft?jwqFl^kjUwhDnB%J?|b;^zJtg+<^%D`J?NfPRjoa&ni# ze}kNVNzOOOX(gwf99qS2{~V5(9UZNB(T z8H=AaQ*S~$#;j~D^ceF+bv(&eU~l5>O$*aVsZ260-*T!6gNxP{%mM8#!RCKp^DlXtQmzt7c|mXm zcvs-|w9vGTZ`vj_?dF?yf1HJRb#dXq+fzJg5rf%h-nx+K(e@ikowwfB3IHC+#D z2Il$~?UhS`mJd&Va9Rj#7r&*YNh5P1zmSEP|_@ceQ_ac)>j%#xjw&ufN}mnm23!&l(ud{MbFJZ~n0PgJz+whp+3A zleO`#Ho=7@Ul(uhx=$;C`d*}|XK89k_r&$yj&JN(vbW8jTC}fE*^59^UDJnUACw8T z>-pOCOZMuU(?U%zU(@^0zTtiZIn?zahdP!+HA<~*{LuP=Rq%K7{%$#T4`0*s(B6Ch z6k^w|N9JcR)+iL-#sGY}OVLLU~0Um#j1A_f9Z$G@`D!Jjf?f^|#FVos* zIiHnhk;^MIHMVNKBIPI&9L>C=`5rO6u@9v-qSPN+>)4uNM4k8HHKyHQwwhRkhxUpw z8-XwL;>?%niNRMeR;bD5u43<)c9dzfS9<8_c%VZ>>45?wA?whM6(Wt8+Ce zcXEy9_{xkEYXCEL$+HG$vI4PSfiw)VvGl7*Df%!rpP}7WnCK{GH;Oe8SC1IdrYjp> z3&N2z)AHc@b$Ud~HNk8)z=^IEsVPc~xT@~V{xNQ5Ha}x5TZ5pl(Ma|>GWqGdrj*S| z?@Y?$O}R@1cjE(h@St&!H6e}eo+h-~G;Ctm6tNqAQ48t-P2Mt#0XXER_L 
zqi&UP8Fws%8Z1Wdnw1R_ET7Z`nUUsvILlb|z(^@b*DKR!M0Uy~HZI zNKsI4hrtC<5veb*&O;r7`ya^BdP0u$sUrqT3XXZQF)?QuiAS734#3(f}K*^qRuOIp{#O+gS>)SSYodf+OQ{9J;)mbceJJyEFY z=Bv7&=H++_=JGK3tc@1~yXsD|lAx z3^Y#YI)d4-BN)mb@P4ZejRNEgp$4M{LePi1Ob%!G;2Hk6 zi&R#aIyKp!O-@}93&@p%M3<)ZA%_&nGiB{TOe~OXF`>Lo+A_@Om`bJz?b+!GsIZe3 zNNi-1?3G!FSi&fSi7^ZdSs+u+7QIj)K{V2}h7lr57&^^E9}DWmCV=IZ7BTARZ1`ef zEQBpX4isOYZjrDh>iSc}d?_DCU*D821@34RZ|VCtQpR z&btL4$!&WAmStfZ0bR_NW8zfs^VIX~&rMtTtD+6@;tk_FD1O+% z6ts+UER=IJ_>>st$SAn$R5)fNi~{WjA}D2&iw}@%aAs@&vBSGFEN@^Y?f>%HAAv9X zFGu1XPzM|xpPs2ZIx&9Pe`56TDYDm*0UgMoM^!HDxMLR`ik2N2FUL5V;d25xKP2Zb z$w`s(2{}vT{53iM6FL7g9MKMo{ZTB=ncX%lfE{`%M_2l7x9E^&O{An38Vz%gC>xQS zzoNYIRWo=+WwSzO-mj=g(&P(PF*9xZRNw;to>M0e?>ce9&kTh8WVu9UOEjKil*LG0 zyDwtf%I8<$1S_-|826AuS;zk&`B}n127gP=V{-l%a@58V%^|5eL{g0SPLqD)OIiw8*o0)J;The7#rl^t z%L76ZW_h1>nC#xAqCmgTGs`T7HAebPFR3=BMk>#Tp=&?Yo&;TyK3DtxJm zrc_mXs>quvt512cAZSk^ygZPq+sHgslEu%{az~&5JKuBmYv*s9m#Y==jHezd0;&WH zo*T~V&XlW|{XwqP$h#WZV-jtpTvaJoWy)0sEdrr>17E#?d3yQk-lggmp?V`K-`ipqmV#l+{$c2{c^m6+HF4 zr~dY-4?`b>n9z)@Uba7SZhKEXGt z0%R)MAJ=HO3E_*pI+H(R!)87xr%UZB$Q)w{K=2G+PQ5dVkRO{XWv~ST@o9{rD61yL>v!YVpq>7upZ!~~nv6>3(LVhcs-})qvApg)`5WKdgg5) z3IojBNOIP;TW#~jd`0_m6ANoj%P7dgT1e=gxRro_yQX`Yu?T!EaK2Z0weqIrX5{_x zTjTTn-#dKgaI(6OFWbcDZ~D7-N-4?C4xq!0;JB2OSrnNV(My$a!L+S_nYO)1zP7L(rfm*p+GbnDV9l;GZ7XD^ zZO&CFkDl72wZ4>2vYqv_;yw zh^ueGRSPyE5USzE&}IOP$4DZ_F08dFea2-Ziw>Jx&sPpU`VFdUx^wC5hR@S!W18))T`X-%3Y^~X}^jDi^~fGw<)BhTCcrs5#uaXKWXno2q z$euPSe(l$|4UM_3>^~vbDkTTR^ecYtmw3eW(3my0O$$|gFH0hajF%y5772Flt8eVMOP!Cxci5IGdgy-ChRaxRhcWpeV! znTI3h?K!Y(aIpWdXgPWGQ2(H4*?ssZAo!dN3^9bY#qu1c+Ypt7Muw+`&y0?b#xHXd^sIs*teosWaWb^; z=T3XjI59StVPB8VL9f_;<9WgNXiDGJEsR>oemkzZ~rX^BOjASTRs zS6XLLbIi1?6kTLs{;9Uc4fyU#%dvM&wE3trGX_oynGctzD*ZxbJ73u@;icTS@|6QZ zYcqUNaK~`m0k2|2?Q^nh> z1bZWIZ(MBIF0}OX__O!_*kpDLm?5}Ov~hyGOTsTXz_%aZT?e4bk?%3iVjthKkGJom zEDjiEK@cdsYd>XC3<-eSck2SS=><;%?`aS`?YyUbv8!L`I>dJ!df+*fv>!@(4y6J? 
zA+UiDY!Cwb_`troy-?)QC2He6ZArGR5BE_ZC_N`ebn(8fd&NoLHdg6JO|3%HM!spI zWT2g_XExe?C<_R-`Ukf9JGP~wO49P#@gzH&utG@H$7OQY3a$;jYr_Kf(6u>f-~3}p zQvwXKgy2w6Q^<25Oet>QiyM-zhUGj{z5@y_c9Xpl`SP~c=I{LV=M`#@2 z8wVaX9wL(Jv7!XzP3=y;b|>44rUtng7iv2Bnog!(L$Rt6%ikacCVibhHf2{9q*Qp8 z9-(zB-@5f)SlB+uZy$VUKT6DROcAC^fOiIn(++BK(mtBY`Jtl9KwF|lCO>J`<5onn z;s29gf{X3AoHXn4Y`yh3vn+36b5>ddgyZOyDZ`lBnZPLan9TE%F)G7T|6 z>%r+Z%vyMw{qD7_iQ$*(F$NQG)?W20n8n=c#Hm9}5KTyvBR>;fI z`>J}6K?}X|-eaEF{ZhTBdu7k3-gC{&Rp{FI751Mi))^ET&YD3F!uJkut%T=tC0`ew z&oDy%QS~v@C8J*Ki>!sce4SoZKQw6TSKSZM2YXibDDID6eP7b$XgmTLeGdqo!03D6 z)}s^XzUOH;0=g1k(fgpw@%iiET%`_038y0LfS^bF$~e=p5ZKIFMOqL&cqDW$ zrW`TTcJh405A8jI`y`Ag|KwRe!R9u_{KFGr|MV0*(t6&%%|EkVLAMQ$zC9Wng_el_ z%w^@dUe4f`*7*}iB$_*B>ZlZXnJ*VYXuY2)?+eIV((<^0?kF1{eHY$Ly^CL?u-_%; zdvJnP5>ml;DB#<0!~*C)o|90pxC-ogO53u(P0uUomc}SaEWx&FZ`17u}GE&z3u+Ln6F|*Dn^qN`OHkbR9F3kxzT7 zvPr^5P=0;vzARPONZ~b)Y3II$_qNR0AJHP(_rT_3>+&G)2_`+gNqaBcRAYQ9yK zPI)&ZOE)f?%zeG7-mMD*_g-7JWR>hRL({+uaMK)sn_?f010U~t zP;=xlrS5xn>ZgS!s0FWu06}OFrFB54g^q)r3{9L2^})(eABt3?_S=>7GmF;E&jtFB za&=~ANmvh7hV`J!(l7Lp8u=D!FN3(SC3VfC=Ys zofoRt@zv|pJY={8mjstk(}f&r3g?cdWw7C%bJ2d_k-b{52Y7oR<*r#ItD?Pp?Ovhw zAYXeh={f}c6bM;RY*n(VJ=wWy(Y|}hUU@T=?AZC2_Fa!4q;gdAj%vZt#5#f|Z?&VP&?)Sung+BK)zvE1wdi^4 zI&WKe?GGb=6#awfqU$guIq05L;cQl6t1g#SD%{W4_6xO#_}W8B*Wnkha6jML|IpFD zMujUj-P_Ms?72V8SM2|IKVN==FFu(n@!#GEm364clyoeBBb#3hJBY_CvQ7d0_#$7j z{l1$o*?s>aUp#137%|Ptn5Jj_DooqM*X|K&2l(27r0d`sOk1??l3xN6;eFQ5s`NeN z`nal22Na^bJL&yif(a>{8%*o>y$e6LXx~R!y#@G>AKG_5YG|ghuQpZRG}r$_yI<0W z7{o}Oj$2_x2-~jxci50x&3*)PeUe+r^ChHEFQe_+I3!fkWh6VJwS&ApXJ|$8^Rxyu z*c^M;rX8@MSIXDn?iuE%Y}s-pX)z5-P+|fTK~>%uD^cpA{c0P`m_@1EnnuztqnRXX zr>nU48z$PE+wRk_+m=eOOQ1BWv-BiOMqcWA=OyUM&B#lA=Ms4@RnNSP{M25HF|1AN z1&Z(G#xV44vVJw_yYlXUAyx)t_=ahW-a&?1gBd{Dyz>~ zM|_h+H)^(7@oB#>kb*goJ_VunppHi4$Eq}KMTWD5j8+GHwr|1g^IKobd6_XgKch!! 
zZ=%Tv=YsoAe1}q-v7L%f`!%k~*lsO`;zLY>QOzi(!dr%KsO4uFyMTP3 zHrA-)!vq~LwpXd2_RG0+NBpGWS0&6}#?IcC>42{TT71Qa_mFn)Ui`g2sN|si(yp31 zH!61Nzicl}LC4Ax zUWfv9SLEd=VE8r$G%$JZi*Q11{rDG^8Yy!S?(fk%NHZnl;{GFA;H`{6+)AT&d{}v6 z97`ElXN%$^x|l|N#`vu^7%7e3>O_$;E5Te-U($1q3ZP!?B?_GwbB$=ymlRRrBE12n zO6{=&aPIkHV&1BbnQ?vY2Gps!`_gh>BPMcJ=M0xk(mx;y%^ZAm`!YI?bsQOS zejGx~4C0vnRfk`KF@WL~aEC69MyI1FhntMWWNWn;OA{F$Iq#Pe$g;@a2WPqenjC!l zOiWaHCriwgG@2xvsvb!KXrOJS2`nYiBfVq{lZg8R^1VZjCXW1X=$7UZIJt?=zv>H} zotU2CIw|$(#Ay0I(dS~nh6RF* zU;wS6hc?(}L2HT(wFD0-KrOD@E37-juhVHwIaTP3wgro@ZZE%XZ*tvXh*6>AR07ki zY!aAeyNk$j%zxV=_7Wqn#; z0?g8Z4-NE0Vc5&1eioRR873C>4&x;kJ&?=WI-F68YS<_4}^nJ3e^G&rf2kNX*% z)N!Ia9Zo9c@-Gk;mIN8Z$gG3#$5oTW=~d0`-?QGa3eB7O=FO?%`UT=Za0@sPEV^b{ zbOlfYNG(-i>O%R)fn@c%htQ|7Km-ln^8TdT6^_%(n&3{?7~HYJoeP$Gmiq;K-@)X%gMYRwx$X!A zj>%)Mfk7n|H-}&{b-n*nx2ZM=L2dQcr={u0Pp<0uNM-{TForcgur)qz-XJvZ&1Jj7%0qcKWlhyc!YB#wB>|zNjW21N+xNJo^Ls6K zT2k&(!QITeo0F~mh886@~&3c&jPIn+Xql)#SRjC(WN1UMI=L8!PUYO z1LTN>CRfXRKxo;?x9nVW?pg{Cu<@jYFK$7jwy@Wy5pSs!YX{%5W6`;DsrdklMW^^t z(<9aoUG&bmcA1xhrr@Tet)5AA(@?{*G4j)Yb{_WXs9`ZXX2dk7$ha6vP>vFK6$T6G zs9|@^gTWl>v&_ETUqw`AKpK?Mb-s=O30+54Nd994%xru1r)Uj`tzSfiV)e-9rz*tO z&9G3t=|S}-e2pHbA#5>@aDRfxX%5V)~W5v|ohQX?Kbbp#}@2RdZ01>3$(nuu8YHxf4J43{Fuk zUWQY2xQ-F0{!6HRu2TDiC7erYy|5s0qc`S`(eyRrQhFn^%uxj}iCmz{n1Po=s4{st zyWAYMj(ekJvsPw_(w&hBZrOlgAYSAoE3237>A!$?1Ok`LOQCu;Kanpl##KR-;dV^^ zOSUuAo!5Sgk#8n(xR3WKfkvWBSO{S?p^HOO$czj4KJqXND8{wxW_!uHSwA=0)uVl2 z4eF2&`pXi__Dsj=z`fnWaU2L8mcUtJ`=$(fH)VQEqP}GPOSvjmm@oRK~l?qnWa_q;djA zT#mHDqctu&&N(JJ$q!qeDOPh%hO==>hX{sIW{6+~XMC%a#d0-s$56pw4MSXE%U!YI zd0XC%M#g3f0^B>g^A!cQSn$GB4`pUENEbR>8BG_iX-gw%NUf z7Q5aX_Ura!)s}lF|LCnhcuUxJlHYc6(RJ#Rif-5&Nyo=@@WmZiX{z@DzBrJAt&o2U z@87~a>v;dVR82h{_JSXwW;0*289*4)a`>q`-&b@!4@)Gau%3nR4<^1pA#6CpZ#c5( z8vLZ9^`$d+Ay8xBoVlAN3Dh2j({lt^4@aeGgj?B%2Ok4()J5Ck&=Z#lE@0 zpXB1)80Js_U}!oXcsds1Ke+N=T}fK^C7t_7%L4Xl9lX6`$=M`0H}K94;Fe<}9>A)r zagc8ueAsv_X@_t`x2~`ky?6NP;bi&xg_A#c>-%pBy@&bU!;AJKpLn`bH5*a{__~w# 
z?|kUrt)hTFCWEe4zPR;4@uA<#kpwGGZFz)sgTsEFLVv|0<+?bE33RgE=D%&0u(DG{**zX`Yl?0y53x z{sOmL5)L5YRmf7{m@)yV=pidk4Aau*LlT7os~CaMqJ?><93mUKG7znxLt1oYhjYEyb>x^#Y|k~k{6vG`L|e}?xMk@D`AeSFRQYcllCXqh8zvojAB}!$ogfRFrCPT3jk8MIf)#& zmV^avZXypZ07>BjloW1$!Vb3};ecyP6v8DrfxO^QD576HA$tRPA(ru^FyY2?XQBYE zD^Ud3o$$adN)-RH`8(z`SW}}Rz~Ci+_M&;V43rwY;zn9Ny4(z2e>R|$BR7LVfQtbD z)70>)IB9 zM!R+Z@svPTQG?9uZplaTAj&{iTQZ+xY&jS%4SUB+quyDJD1bdJXPi)gHfX0OH@U|Xa|FQ@0NW;pDkRB)^5qKce($|IHmla8C#&dl_(wCpwC6}hfAqh z!{ryyp4?&Zpkll-S~cs-b}`l?fqu<|c~w4CDxjxW&;yugye8_O z#irH^wN2%V(`?LJ244HDTI^oQU*rZcM&^akeVJj8>#Z@jQjU^%&=TDQ851l9{hc^ z0rcsSVGiKjv_sv2(_sE1rGG?@NX}o8^Pk9}FAo;UWjdq_i2%9eV7^qNWs3YMIX@=n z2|0g5&LeVIeuv=u5q&RkoR1?dlan$?EN7IJ@Goa%_iqHaxRcV>u)ZLoN;o}m+Y`!`X;5OdEnup`27^8dUh=5 zkNqQHRSVCDYfLQqkQVg*Fbp( z;M1UETTJEhZt06pxv@bg&D*PPmpE!@v8T@P? zU)?8EZ{e!}jhO*ITjQeTs|%8h1_nRd{J_!7;Ad&cUYl&_eW03sk$yrN!0acU*3|)K z0V<2K;Hn34cnYYf0u)ujimH0xsA5VFX_?ZlIP@F>51fJd=I?df>A2_STejT$%7d1j zf9)E4ZZ26J0XbuW1Rsg)*LCsfH2c zk<-Iy;c`I?M$kDI6TtH9Rx260tRpJNj~$Q&vIwr z@8Vv=-^I;x&kN%34rPG|JqsuRWGMK%>{i3yWw#RkF7rzGyUZ)$?=r83zsqhl{9Se{ z;qNkU2L3J!)9`oMt%SeJyb}H{^Jd`hvM>pMmw6@pUFMbWcbPW>f0u=2;P0}q4E$Xd z_6+>pC+#fOr5sZ(0J+VU;+t`xLs^QSWFz1y1xRlb_019dSFaA)_vAOxS^v>t^S%g% zEpU8Qgxo?P<8ty|72pXOkh$4U8^cy)J;X_Lopw$fH80``;4GdXG zVS<8sFn0xh=Np9IIWTx~1%PKP4Z-t8z&ibs$~t!%)CLP*46>8$DX=YuFj8|6cS&#) z%K5Qup-mm0YoteChfh62R@VNaInhk3;fifwBHjo2!;wh*{A5_gqx_{DWb%I?)ldK7 z@F-stOw#3(0US3yQXnOhMS6#_9J)7j$yV%E2(F37;qrPac{mWJMO%Sdoc?pUO)x@et zqgasJe)yCr>8PSN1e*YuTd_) zOV0P;1WTAaO3ZMWVwv#xTN(cGO0@D^d;ZXp~G3+v?x;F)Dx=}iVR*ola!dxU$KcccPk&hWJ{yN>Nj9J@yZUJ#;x}Iaw znhs{RfF2tAC3GIlT#M+4rkU$AKqxD#zkLS!EzoB1Zn#&(d$-KlQ?|03yB^r8QnhUh zSqop{*X;)C5O1%3RNI!cSEp>=n_UlVm8ss7$FVU|ohjr|RQ)tRC+G(#*~YEQHii^tDcv?#fSG4L4juRNR=)vK z7|venyWS^w0=y@%1ew>NtA_}7%;ioNujgIs$%<#<`UJCnLC(M+bC`-7pP2HB3+AkV zO;dBOm_NN}-9Uh0Wy$(pcppOqec<{5p}3JRZv5@(58wIV9Y!&T?v`k6CR#Ua0j&f< zW+~Y^m#?RF1q9h+oXFOEy;&$~;ENjcwEB#w^aMa99k8vZvV4C3qP2&p^d{?i7OlM` 
zZ-h|a&pZ8sGsrt7^!Wk4ec*v}fDZ6;4lLo!s=ix&g0G$TwNtSNuO0-8ZoZZTntIjF z`?deB_b;~n={ANOh3vqhd{boyDpXm31K)KuO4hDV_VzDY_Yq`O^Q~r~qK&U;lXJZG z&h0}B?mrCtQTrdX1IrAD@WH0c*ADP6U%{LcI{*r&leOIoM{!vEBWs0Vt;Hr!%HdnA z+9Xu%;;VKERr~p>{Ym=)P?KMPZj)c0EDt7Iw=G(?Q>|-~En63@bi^9j?GcU@dX5a5 zW}J=LanSr&Z(IgMw~Fsq5CgvWdzK$KzVCRNZSLOjXvbdAE1|RPGaJIQ5lj`Ktz{x! zA(w*h7a+rA)z<&M|4)wmcSn$8*9r5Z?R$`=1bo$RepSdTGe_j>ZU@p+im;R7Tufs_#X zMX14t5uqe^%D50}FzpYU?`C}jqUeG|D|_%H*7vFcBli|DBt*`a;oy-M@r=gQl?;1; zn5&q6r7!+h*BYzM&cU%ePJ+L-M0t{fTvsP)C4I~;2TO^;@gghEoma=+$CRlpoMN_Rs0g0z(8+9koqYX1J zLnxH?%vh*R1LVJrI0LT$0vnqmh?;GJ;HzP_Nypg^gS1V?wMqmKzRn=NFHy(ZpnK)B zu>Gz>QFWYchVp44>zU0u3VK$iN;I?l3{)8t{;&xrrOV?zYuDgx%Y;5Z8g4VLwet(R z?z%sczc)&biQwuj4=!xZC;+#s$5(sG3#D3%f>+v8f`&b%6+I<5+lq1QecdRsb!EMk zap1~Jk0rBhy53;aA6tz%o@hb%b_0sgA2BwV$9ArK28HmpyG0+#-w7z1)`at^N6xnE zXxP0nU7~HQ-yjrYU|2ms1WK!pMEhNj`nGi>I$#}1OHFne!O$mOY29@1nx3bhp+0wu zl@WiI)h1Xn`1d>ej~(7^pp^OLXU{Fa?7tj|cSx)L{}oKRlOrix(n?!&GISZm5D(}w zj>v?fBpSlu@HBnoIx&m8lclA)lgCsz#e#5*3{pZdZbb3pI*FD2?+{s8CkKQ11k(e6crjPP_kls7oD`A` z#b=j6GsE7VekvoV!KN zcgUgW61lljtUI}@N3k-C9Fz>xK9uxwc=zlK{eS0WotLCHiL zF-!*a5ev!UXLRCR=qz%d(b0)a zjnrSECM0c7F?aORSprq6prhwTCZk|(L^dcQp~|IDWD2KsPe&zE!rKfTJ zA9x|=ygeGZ80`F+DZq-k(>U%AEs-z3ns_Pm*h`7mku^V4m1fGZ)U;xOW}B)zB<7KbP;Cgu%GO)KV+ zs103JEX|A)LKi+aA=;+m3QN26H{EP`2pcl>arep;0JDdFBJj)<~Sf$Xmb{@@DQ&QRo?DNx|`;5Ke4%C zIrhHemSf%`H1FW?SGHqrV5zkHX7BskZf%>#QGX2OYCpdYSm^tc17Cbt`sUpJrTSJ0 zuqajGOI0+bs@hXU-c%WMKx@h0uq9OE%^5yHEmfIV;A(or7T|0gAE6bH8 zchQZ!>v?mQR9&Y~*URJ2S~FL0?a<$VrV8KtO}CnE*WPNGI|PUu!pZ95U0td2S{$B` z@&{AZ0m7dz{M3=_C``E^q46t!RWJqyypXe=ch*0I_Nw~% zVf_HVe&FLee*F=?^axr8@L{JvI4uPF_&}c!*vbdCCbt~{j8pQ&(8r17nek+NIvLpd zFz_~SuX_{#$#o#vTP2jPLx!oMW`-a{0BEU_N=+P5UY)AxgeESka-;fsH57)QdJ4P+ zb2exZ+e>fah>C`#h8CgW1mAEXRm$pGT`yF3^VQucUu&wo3L=XD(4A`M4&5rl>5s&t z0ls0tFfJJW)R8Agzgf0iWU_mu(->|Z=WYJm{U09v;Hc2BiEr3+ua0ln_Q19c+_^D% zeNrfGyQKpS0FJZ8G_4g^C`&qUX8GM*A_cD@^slwF^WArP5%BPo+t^jif1jr}Q&v3ey%5 zGrfHdg{e3R 
zWnM{Xmw6?nUFOx4cG<0_w99TKrCsLDP}*f-n$j-2m6UdwS5n$#-VCK(7A7g}GOwhx z%e<1(F7sw6?Xs{8rCk=*Lhx#dTZ#EFU(>zJShV~MrQN@4r<79Xr7kmq|LWB$`<`^8 z5B=j%%#GMo1zQIcZ0&2Q*XEnTbuXk~TL=Zx3Y9ymCrpaBM&T|f(7GdqP_iwAlC7d( zo33hGx=LC}!PW~!Sxwcpe9gEOD8&bb+)Dgbtr@d=&8MW8Yg8MF4nNd)Nx{}2#@Uu| zEfi&on4+u)+Y0sE6qxi}in3pa_&h^V7Qkiy-&2$&)zgqB#y1qc8wthU%?ev4N}x?* zAgnivk5a5k2VEtVfmRbzsrj$1&4ie$CQLXtoEOdsTmKnaEJRx&?9!z-5^$SBs+b@2 z8T6)b{@V&ByK$}Sl#;aHc!T0M5=)cFD%P|LH`dM#DKjr7lBL`XwnWC(&aEXgFA2~D zD$1%*8h8wbwT6l^L4z5T`CKZ>27*GSqFkm>1ich{ri!w3THY5BMUp1{K74nw#jL?W z{6ES463uzC*bn#T@X!1cS);kfKQ%mZq1(R?>*25lj19f!$%*mHei$;f%UaETs6aER zfnS2(hL}e_9Wcg@6QrnYF)Kf3$l02N0Fk(eUpC{4ggesopUa;^{~0S=P%6u7xVKP0 zEZ#3iqY=PMjfgHud=*kdB;Y6)1CIJa&#u1EzN5#F>^dph2=Pk>;1UZZSruVpP0h)E2$=wSm1X z=v=9J{kOC%{zWv}PhUdwI%O|;l5MW?(z)^Fk9;+PuMG=h!MC0FZNGPk_Z<*?hk4)O zl)VPgDr+fP*&|;i`(=1hUA(_b@Neb)Ta*61x9UD_5DtEkKlnxA;936Q*}0r=Id!oJ zWQtN&Em1-1c;7lHpN7DE*Fr5{-9xv&B{GX_=e_MnZD+^Q*YM7o+go^NN7CAnbap&) zmk91U-d!iSTX}cuV&@*A^AO*8=z;qXu&0D#0<{u~3F>OGd!NvKnD0LPp!hIWsS5IK z2aY(?9+e!HwZPwiRU~Xs2|jlfHSlcDRv=(IwqkMo>eRr`xSvlA976`y0+|;brK>53 zKQj(t3(7k_`kbIHj^U9z2gUMiwrc}u34)wqJI;hYx_ZgTMp8gO3&nGM-kvKR#f6f( zR8~3LsU~}lwM$`Y>_}NKvH4jkSgaHF zG}r?;#B$f7><@=?wY_4dD(}?tR(u5Zqr`_wJo=4vrK`%X(!y-FhjVDVG6@^@NNvh4 zN1BTLi`S1rq-|VCrz#Yx%-F%eI#3HpUR}V?cp3Ys%C|46E%ooG(mQ%F$&rEV-Sap@bXx! 
zqufAu1-z0+(6l#+OaqK~1@I!>Xrk`cCY-NuTrld>uZCOny;y}?X?^PX`9vlOMtpMX zO%inDrz`#4Sj8NmXsx~xa~jtoEcJ^DouPxla%8MQ>uZW{qFk4zQ3=(4jnbqUM`-U% zx^aXSo@pGhYR|ux#Wk*4KQd37*7fHXADxZ+7JbE6=@-geKv!zB#u>%8%3DO{5JtVj zlV%Q~y+!Wyw+K`jEE!{8*U|E-IABQem^l-{PanD)D>xn0qV zw{P0_PAy{W#m8R*Q5>x~b#JTj?89iiQ?kmt;Z2n2-VL=E0BFzLh(W5q&>OrW1JcH>qn?f?58EQlMAon(eWRi>% zLKEp!zD#>PCjErqke%}u9*s&Gdl`mlOpJPqvS5l!zXqS^AbRysSZZ1jDnRQ?b`Iyq!_bAds+y=cmB`nfjG_n)q5|nTj zj+jp@U{Kgl7zB{^`C$lxzf0*dn5ywJMTtKlMa;eM^4|(uRL&KBfV+fm)!Vz za_B8#XpG0-#ny##1t|_x^?CsW*gE5gvd4Z+q_o7uR*)iO&oS48sgFyoVVc z!<*s#l0YDkkc3`XNU|i$wk%_$8OcHjNf%_c)& zx7k{5x{b2^>}D=t&zRyyPSkC?Pi*0Jn|AyEe}CuBeavtNkYu}QyY~a-a7egux$r=Q3-Usb?xXyRt0S2 zWo^`6+JRnY<*qvY$PUObNZnqd0J*f=J4P=|bj+HnKMB=HNbh9l%bJfjJTITTesZcJ zxMA}f-q*di&Q5RG^LoDjP*31cAAg9Z(Db1*{38c3f6i++(WKm}+o}7AZU<9eOeb>w`G3AmN$H_`_Q=RZfe|m7-36bDw(m4OBbmo4LtiQZX5Exk#Hq@j7M@u%y5vfh8_iRrE<{7<<9Po3tUq9610Q|FY*o6|Yl z7D=z>QEHV}`y{oBZY<;$LT+lf*`vdROE{*xW3Z`dcoJ+UBwyL%G?hPV zT`$0BPN$YhLU>Jlanld=ZzR5+c(W_GW7kjfev;>Z;>qb9PrljVcVkG-qpw5S&D7V> z&4Z9io7O4YL{>6mOQ=l+VQp&jZ`eC)eVDbWe4=YAdDhw*HXfpEU|6%Z9Q7YR894q7 zkH0O?V0h$_xbIr!kx-i7eD&OG7hbt=vo+Yc*UXT7NyPL3T2KjlXX5Tc#~{uz+{^9kr81v!*6m%@L@XOq(^W z#fm2vv_DYXGF{vv<(xHbVg71=?WVU(n?t1t8AGL1PsYxgNXh$Ow7T=G$;lY7)x4v6 zVkl6vfybX?<4EjV2Ad$<)s3A^cbL*KB37>l&D46=t*|kQuF^Y!O%ivOHe(&Am75`F zmYlc9d7GR7IYDyXA!m-9KPP9NoOj9j89BGfd5@f*lk*F5?vV4B%>T?QZDqY5!jsrVc z?}ddHux3qOGZb=-LKA zwx$oWBJ%R)c?&OQU(C6YAhL>>dUdg{#8>Jo^SQibeHD>;G}8MQ-W0c6mT^*5?JMvW zG4<-&r6sBN8b%dEM|G%I7lhQSU5cKlDA-a@px{d2-eNIR$ZZyCQM&FJC>`R+NEdrMY-ay(&u|gNqeix6K&u2rLpYo?OYnKK06Tm(9*O}k`A$& zpbBbnr_c=Il(Un*%Mq8in$^4~A{s$&32X+%Kv@=_Q@YsPsqxhgC`)`;c&2n$C@~FX zriZEhqfBp2^!bHR+^;NO@JCM{F(2U^xje5~=+tuPq8xe&dt+4JSamDrKded%X zh#_Btw*hN#Y2VSM`NUm-uS`+veXX50GQ(!-|64g%S~=ZI-^P8coMo7MpX$nq_4L@% zmcs^ENbX#*o+{_mJ+G%XvciBdXf;)qcOR?i4SVGJyxi)gzGkdyEj_9JB@*^29DVu! z*{-zMtG=b!DL%b)CWw2zi}$@>^eIST9=GP? 
z76z|Rr)gDSIi$e;^sCwzwOY;dC9z;JeLv>B7RnuTQtHP27ia+Yb2vcz&{fyYX@`_( zd2zT#z!S-q?qaRz_yL8z3Yt7%nYOF@nw2df?ec*EW z-JZ#_ulZi_-8{=TZJ%!HxwZGFPyXad|C2rZuAZ5uo=~#CyBmw_zg}LcWzGKO`qd+^ zDA{}h>dJNFy_xT0B&yHTC=uogH$u*r(VXiBp&SMy@b8tzd=ubiwaA6@WRHauxoH!YdAWH z&6&a99_WE)Pp`XosF(Xgii>}`caYfLz`X$!4Ab78i|O|a%*FKf_s_-lp8x=kH1ip1 z3-=u=mk?XHm&l>Dma~)d6>?%IZi1Za>)W z^M4H`VCcR`1??i|lO^49A5oirkJ^;%_IS<=dORlqKNGUl_#HH0F7cV({&PUR!J8VK zGoKjj8SFg;Fr5D0L9CdhSQc6`x!+L29EOqkBl7(TIe$vde}pr*22FL|&!qr2DO$}EdIRg_;1vTCKd9DZ zRnFL|0=8D()(UZhWJd_mqGWL5E5Rc~k2T-ML0+DF%nSHG27_v-N} z&&|Vct=$(atMaE6zmo>de~rBs{Gs+PZJ=#0-?n$QZQqQ2Ki{@*vhy2_B$JoUWFe2M@uQp(r z#+p8R;*@{j0$50=!B;{Q-R{z@Q@1EA_Xr^!+(w8tB)*qg6fAU29Jsb&y6~dEX?vh) zKi{<9fACSh=~0R~JPH+cw2 z2ucD}4SP_~aR+eeoE z0N>#tyX<7yFDoZVASVrcZbQ(SznGlh0>Vi#Kww6XFJv$fnquBk40vn{-e#bxjjw78 zRBh&~2$Q60+pT=Qs%zTP#Xw=of&CG%*Yft-$rPY?OrGWK&HmP|K%FFB~M;?lG$J^<$-47 z$h*2>Y{O!fnZQF9ay3@_=%Hy#85^U6eA&UEosCODX`o;&U$B-?V{-C?Ma2Mgai#Fq z;>(F+j{-j2RTC)N#h2|;PY$z@$YC*ra*RF-2pFp)z>urQQ+P|!g#KmAbxWYEjW27P zI>MKA-0a}XI;Sn2X#3Tgu^ItoXA+1i4U@gRy=kf<(9*@Xbluv-x9sNayD^X=W4ehi zY6=v!@JWo1cRPzq86-fq5`Ezo~70mXQ@1??_r9P?(gIL&d0HtpWfN?iA`9b7g;mr0cL)3 z>?8qS74zvN3zxQL3HnuD%H_lm+*LUb3~D=;I4Awe$$V>0K_I7|dQmjp|jf_X)O zye2-ci7k9<_}n%01BcL8&R7nX24T?*<`rJOICc@1U#~ti_E6AK@?nkz;93QmJQt9e za;q;V&X-gMN;dH&oAC1w*lF#XXqh}Wb!5ig5p-4r3!5QR2#llB6q34W31J*nY4VB# zxnxB5V{Rel2pfU|O=;Eg%>oiqX@(#P#D0i>_d%yY<2axl#Lh(XW`1^WLhNkyHW z$OF+4LP}!Z^&XwRLkp;_?bo&w+}47IMcgb5d4ffy3o$GzxN}#?cN_9WhprvEt7q{G z-FlrqjV6a}%yux(!c`k@Hsb3CJUV?EzLF78NcC$83P}wI3HV6wR~(6rfI^}O zl6VP0A+ewgg~Wma6cY0aP)N)xKp`=&428skG87UE3Q$PQ8v%vH;usWC)@*{6iZq*AhnjJK-AtHy>IJS_r_la1jxWdX;jgxiWSUKNhhv!E14;5b1Q z?9;??^1;}rnj93ci6jXwSR7PML+a8$yeV$>v?2IF^X+_56xlo=h3mGg@}HBbZv7(S7tA}%$<1xzY|RN+FBM#?Nh z(ux2)rs|=5@tIqH@AUw^DDF3P?s&P(uWP;+>ea7nhYG~9)my?OVKhC`XadqgBiq*k za#4Z;x#(}4DD0QDy9sGX0`761J=Z%hbYi%dfQy|c24&13hA0GloL@Q!yeJA{(VW)# z%i<-uhMx8ycYpus^MLtr9yu{I*y7ypN{b3RG-n{XKBb#86YLP+eFdOs?kO}`AcP(v 
z-%qh0nKQ8_hd_lmn!$5P46G>B6^~o6$VGBxoSPgcr7;TG7(Md~hyYGDqv3T?ii?yY z4#3^VP8^SJ!L@oR`Ki03ZtbpK4&c^Hp@gRKn;10G7Ah)~5cVV~86x~nBcI$j`6!>< z9!TEACvUpBZYFuxNL#0ep`+n3FfY_N5KiHejPo;68TcGdjIj@b(IZ5~Rpu zNe@`^rY(7X!#;ohz8ScdJXC8=4w!O)VHz-%^QQ9I>J5SFZoaxZP`#h8-al>HAGD^u zXue{e&0oV?*W5|T4qB_`Q;Pzr4SZ@tAaxy|y3U`{GL`R#IJ>oJKDE@JQg$~%PuRhL zzQr-Y(Kxx~wOz05;_aJQi|n~0v0t(XTMR}vl_`xc&>Of6N8PY%3zh{ia^m_hl93lg zN<*Oy+sSLxP zkZV<0GNWkD8^EYamD?uz@Y6Sl;(O>%x)=|kNu_3^T;s>pPtt3S(og7a%Jr?jhY3-7 zm>`RdhV?K(*+aA@L+rbJhv2RSEIxHjVk3zK`ix#9r}M@SXkpTq0OeFlF~Jh|AkIz@ zWQ?Z|1QuJfSO;>b&9X%HA%TtiXSF_)5BM+06B2*ZXa;g2(5l)dnk7CHn?ow%wd|=` zF}C;qF0;jU%Xf&{nng?phRPG_PZ;_MA;qZ8K64mSj6D0okz(Z8@Nu=6#)uGv)Y?i& zEoyB|3PWm<+ZvA0p@?@|yhdm_sm8)2FLf?t7u_m+)wO*uzp>4>$2-n&F z_4dlq1KDEFk}n47hyOFpFl6} z?_CfEfm$zM>rVU~g?-1uFbGKCxkpwsp^!8S4?&bh7~P#S^`Gv=#vKLs zoUfm=Ks{a5-X|?}V1`009bqcR7MsRX{Ul1Hh?f}gbl!aTn;_Xg)>O|8Flu%Leal* zbtjx~eSa{=Q1%`A|il(43kl)1TH~E|Q%;fL&8*-s@DXCQE7KBa(rcI(XR;2=b zm;u|-p+H_8k3U#aP@3L7dF3RC{;-CCLxyv~!qnpp8AcfOO;IRDH%|7=8an1-IK%EQ zZkjbThuPj$R#Y?D7-;C^@n`LX#Z|JK8(9U};x(ijVT)H;MZ@G{vxW^+MV7y4?W~~* z!U4AI7kyWJ)0yoPX)otp&jZYXt9``uUP(R7X!x5CF2raOGLa0dk5gl({Pv0uG&;L6 zXsYnnbn>Q7dTf54Os_y$wCz&c=#J^c;$T&^-vm=5qus$==g5{X?Y^B|KH+}(^!3wx zcH77{v2@VxcwSlv5U}W>A{?95Ir2@=IQ_U9$!98~FSN<*Zh1X+a}e zS^yKwxe5c&{`__R*1fZahcT4?EK)y)CFMJ!5tMq~S+6WtwXyjPC^sMF=7g2o>~Dcp z<~^)jQb?XPY`zUTfF3Z$Svj~R&F54GavJ!Yh7tW+2D?BU=*Bt~u9izDZQASHeN~_Y zsETr`DXqG-Oo>m2?=Z4QUTWtxkV;h%a0Eo(%UbT46|BUn)+Evye6cI$R@p^Kx%DgN zj_e}BQtr4Fb1zqSysY%f8tC1Kty9}44M}@YHDa)s z6K8S5dX%(c8&rDtv9&=jv|+iPQah=~jAR=prH|SwD7?z)q54XFe10lcBdhUxwIJF{ zmUOX`%tR7);Rz@dq7qWX6TLS!%9A*+ghHj&T-FZMD&Bw!6{61})+gT;Z$Xv9;MN|~ zDM}?)b;RY30dli=vN1rh7JC=Eu&Wh3a1|x|Dzr;F`~8ZwOFBLLinU8xBZdmaSE_!F zQ8QVA-S^Z??#eHN%ScI6R@6IfE6#lIc3`$#k}$zZC8m^ZHqV?RoddVGIezmb|flK z2Z(Z9Cf6zYLUVYv^{;L$OkwXa8JO|%Sb%meT(Az1qFAk^^&d2d)NZDDGN)(fTFya- z(rj|1-{2q!GXb}SEdlRJ3r=C#t}l(R1p|&~rWgCr)~Vw$#&cKfaecPLu}jhg1-w5)_cnOU^hs z*T|9D%{5Z4jN@m9&cdz<@d10t<|^kQjwHt#&gcy$>Mf(8&}t#5D-gXw&_!^O?|;E@ 
znIp*vtkqI=0u?1>0#!Kc6QM;J!TkfunM(;ZhUh0}1**$jvKUvtG)g3Lz?`a_N+3%N z5|1N{av%^4euU`Z&8s6AoZ(J`Yy*OZ=fv@+hX#9kj}3vILIfjlh%aEHH<-uh2h4gU z*{uv@)bknjlXZN?nvq0UlME!6^2w!v-dL&LhAa9#VECh^Jpm+^myk^o9So;va z_Mw^LU9eXP(cM5%BVW`wc_6T6C%~P{-pk@nSvt=f?b7c3Obo*%Q+v%m3weyvAFYmg(YpOWVvWIWkz`AaJUH7fC{JOm}6%Pk0 z4)7HR) zed3E-rt0|OHh=p*2zK-BkMP9@FI#|>2)M6wP(>DP8guTb2h8 z%$l}`0)PTLFl%}w6apaDzPC;L(QqK5G3%g&05)$;e=+e&;^-m3FJ@Q1+WA`7D_ytZ zX05v++=(MUp^U|pB#m^=!bGS+1KB=^0tSlK@uzFPE{?)>7^F?Gj~4s$8~tl> zYTp5UkU;iYK6@?8TL6L(^OW+YQYj{2qPi!Nc~ixNJ5arm$De7V+GDx098{JiRR(It z)>*^0JK1Z3NvVOPB0i~znQATPQ_BOXwR~#rWHMOzEY8KJx&o;+d}`ok!(o z$8|Si#JwKG!eYC*Dq86APx2){#f>8>?VSF6UC;0t-0E@Tj^LZ&(sS&aKxA3rbMSTY zv2OwopHhpIN%IbGT-{O|9dhhH(>l8aWrI%3;OL0L*fE`$0~o$}OF48n0_hceIvJh~ zq?hyQ<-wE;`pdG9Zn?U9Y`5Q53Bh-(aX}N8U=)ZoKSpUsrBOw|h=2A*Zea|Tc_U?L z3W;;+h!9Q}#JO~8J0!2JhenfKNTKq6kV1NwLy3yCQmhz9JWMtDj$^tlnipb@>W&kX z7-}GaIk)DV_KNO#Lo6vH$B5L15FD7Qxw+5kc2Y02YSIux(wRq1^i7Sv!Q=+cqBQAL zFZV(mGau}j2(=E;QG{KfGicRL=3*ZFM$~8yIyE6UjF#G2OV{TNRi?NhZGj>f@}jn) zrLp+Ugnmo=G_s=nuSCmYMWBj(c<)RRJqC*Cdf|6HoWjiXh0N-aY}@%pmx1fX&zajz zP8a&hm14efW#mZPHjYdVL%VqS=V`)J ze2J}P36Wa-#OPuSr78G^%WSGVd6dip&_ zhkBT~zZ0j<^v}^+Lj!}g29ot@FZeF`7>u*u(Bv;{nt=;QX9x74>Ul@~h#rRUz(EjH z+lW9sZ%LWx3{9H-N;mVRn}gVum2~hW z9YK5U)#|ZoDEg&A94-Oka2c93;8)*A5$1R*BHoHo=jY`7f}A_#{3V<)$VmD~u~nVad9kV|15OY=;gV(Wu=-gAO8KC1r^aiA zaBIwODpq*;`>RHKSA+WVCSg@jS@UniiXdVP0x;c|;mee1k0?p2Cb)}Om9>w)?|7mj z!-iOC-4=U`Z%7QR1!SOIlGv^MOl>9iT40;bh_YxDBQC_MP%sgzBBqR3`i)0ih*evo zHGR&Aga|@TDnxO`UL~%3c=PIUAXMaB1vm(%wZt9AY~cPHRk)1Y|HO^EP0k!Se-39Z zj-6HOLihSPvyz)%7apuacLOW-BB+__N)Ul~!E{Xlc|) z42oiHic|Is6*f!GTjab=PJo;sIq#4o^+SQYy2_ss_W>1fmz-F%YmnY{*jZW;5LR8% z%^I#)-KxtVL;jJLT70Xp)V^O0rDUjf2dxKi6eVF?9oUoxQfho;+EVB@JUn3>_xc<5 z`imc)F+A+I6y7PS6tr5Qc^$Nu26L){DVe~l4`yP)stOjh2D6HT1x?J86UeINvudH# zTWsVF`6H&wPmgP%m}+$fO=fmRkxo#;>Bfpfigb2i9-N}`gQl#Y$rd!F(K^YLa4UIx zWzf|b%&Vhn>6kz#6X}e>J`8Jl`&#B%!`s(@Oz`6AE2o1Q_7?}P3@p0RX;YdmZc5`Y z7B2h!s?Yt1-WB~!1RXl|sZB(Q9R1um`X+KF3Yy8GMUhJ)AH+*FoRysD%5_^QH90&< 
zOdP~yCFMF@wm$E6M$LjAcYKyewT^u2cj)wuw=HQeJbUR`f+}CoQv?!Z0EvKkVLTvf zA%S^~n)0fJMCLVVGP6m5;jWnlABo+g)o++w^IG#O&97~EWy1o@Bj`8W9JJeD57wD62+p`1qCTK`6=n+CAtm#s_;wi?U zF`5Di6?{U)-2@6hqK#D9YiJ^sz0z`3t-bdvHfd3>vbPk0ibRz?7L--?SWr;eV_rdJ zk9h@^J?52F_E=C>*<(RLWsi9yRQ6aLQ`y^cEpH)<1qGEo=FKJzx*X=s4O6 z-pgm+0@B_qWZoju-YaI_k}&PPQWjShR*;Lum4|8Xu_2OF_D1!io~!+1{S%!pKXm;e zKNMbV>-dCqA5~FK;c!MX?I`}Gt2*Sn`%O%}e_fh{On5QlrGHrVi7xD}a_=8q=~%#}=iC+lCptDlI$0ef?LaWv8-X6;R4U zM2h>=!$TQOw4q?cjOcoJ`tOAnlV3qSytw<)!;6O=oG~NIXMP$638)< zlL*I^997XxIN&pe&{H(CpCso9IX|VHLlPB1hj7k9zVDDv)?!OWoODE2nzk>CvRORc zxyv*3^l5i>59DS~4E6MEf&A1UZB+jDQ&32|ZAoVqR!XNWF2BL$x442z6*ZN|{}h^P zU|=M#6+V}{Xt-5Gch$AhhOz68u9b$LNx<<^DioW+XI)lRs5cvq3v5X}vCsh(~H zoimjaF4H-S5rE)_$H8zV%_rH9SaRij1dV8cE%OQ2JwJ`>1}yxKU0 z+mV;YDa?-iC(vRVqiAh4M`>%gO?SDiVK&_rv)SxUa_htNzvLNq?;|h0effPuvnodX z9fkewNZU?rv@K!Uc5)BGwe3EwR+65bS}UVkME`_Whm}_VI=15q8*9?FQ|m=kd*z>S zFJcwH7WH*Ztrw(=r`C&Eg+~2-C%5`d`vFj)h1Lhh5l?rg!h>(rJ_UWyC1Wu6~umj4YZ-kC_nF zL5h>9q@;Ev>kX-qN?l3!p+m(sv!pY{R&1PaWDFH+snwpM~x6ytT#ae^zPj78)W zlT$(t@y`{lRpG7;(kD_uN0qz+L3I*OhoeC^NPFaPm8PS#Qx0Nkf_-KC*P_03JF9sl ziHs!vE6_aZ^cS|w8e0FK=8>T6_ODFy=u1h$k*^y|L%6OH?dRx=boZ~Z_P{tEyuzh+ zo>Mb`hc$nddqTGs(l4lmE;WP^k-*X?t@}b+auw}T*&B$`DllRsY3xIPBJ`yV%N?p; zE*Wi+PMLBndZoe%RXe+V>b=4_T(}ftvie@B@o-g6-mC3Z#E45J-B;Vv#Gc9b2O05* zwn^s@>?R=JB1u~d($}uUNZo1LSj;6zy5e(2pd}5RIDYQ*xj_Lp{hLT2>^$VH29LNf zNwyy$Vz_N3g4MF(!at{Pb2fGtpViy}bYj`HvOqy2U(hIVAOof<-c&WwJDEIVY6zIx zc~kq0X=A`d8n%DpAq%lHriTT6)xtFp2Q4&`{7D`JNl~1nOerxg1Exxn{oFgLoiWu1 zOl`cWZN{`AVA{f)w*1L~A0K-2(2QxXP#o!3#uTvPa__CUTcqbo*>z&}I1N-g4c80- zC#hgc+|z)mf`mgm06A@{3z*jPru8$X_FEogCoN1}K5Bo-2yn+=Z^O8Z8 zK&r?qB-HTjghvy^K}J zUWR0}xD&JkrRuq6xDdLM#Vm>WuGk0DhSe1#?@;HG1OTv}Bj--^yFJ_>EtuF$3fo8; zw&9kQW^Q!p%eypfPooefeW{R$8sGoY;cJH{d#53&O=3|^#9oyJrap|(n9J@YS3=7^ zA@hBc2AbUWA*eZnia$9ihcICw`XC9PPfng)_Xz1V8S~Zwa*_2q=`Z~;8n$U z54YW;IE{Gr2no`#Ion6G_pedveh3zRfV&_&vPi!7$hl39ERfQLpesR;lG#gnPm)6& zQpn55XTqOQ@+^7Ef5Jj!LTlu1FcFw6v_U$eMS2u6FvTnV)|m8*9#dd&97QUnUv|sV 
zFCRkHf1CD!#a!Npm`jejmBQa8#k0aTNd3RT1*t9l%rx7v^|8 zhL%=$tS9zZ?1<)=7G|nVkL%rWo;WorN42HR9q&nS8}Ofie_v|m-H8lh`CiI3 zxy^uPPC`D@J+;T=F}uyf_(&kP1-X+}5a=knr_!zHVXic?lETM8G<)Efqh<)VY?p!uy{<2RCmVIhce|oIk zT^J^qQOxbYAoUJl`7+W9zIo4qDkd5a5L72DNrCC?JU2)J^6uvbdQY7=>TIk6ysPt6 z@6f4p{Z43H4tV+poj{!J;7&RB^!E;teJ@s2mGhvWBiP~uYE`4NY;fo_=W#oG2Pxd_ zWVVjT?>to1T-kKE(kURuI)_d>&#@E(CypEenvyG2+&+e_)Z*NFrlFo;+HQWjcL3F& zKQZWWRyvOyt6R&WnQAJF4B^N!NZe2%n@;os!*(C+DmlqWmXlCqsY$1~6UR>s^!7VL zijAS<9Y=>wJj3!M*Va7^^);KQ8gVdSQLAd`^hwWvv#g`pxnp3+Rmn0sMKoUY7bc!g zxIIpS(iMh_rC60L%(K{iMaaU$s~}FR0#>RKFTk>-{T#czT7+VykVb8{rnXrZpjgyM z86bktR}dEiUr{*(++%r?vJR=os4aNQE8{Tf7ZY--EO~4xN?q35q zrCR&|7Gxz1C{n42d&4?BQLOdV3QzLnUHXHFfm9a`CBrR|6~X>F=YQYPn3&=hHJ5yF&~=r-=*~YKCgXr$t~n z_~M2d#IQR-(TYYfR=z`_!T6if{PM>Ay3Vcf#eF93y!O22*_g*P=d~`A0&$yr1X+jM z4#J3H6_zF^t&Rg8k9$yMCFqrw3~>=)>qoVq(N0>0!)x|G4XEwV@~YHC8VoJ3p+(49 zCcWe)`N{t1)u(PGM%6uD{>ei=LkMa87oXuHNd(RtQBgv~2@S3Y*^ zPmuEzIr!3RxJ>-OI0rdy3Ni*0JI&Jzn~MWd-_bMY=F;fQ4Np%m+I(UNYDnie4_8k~ zVmcn)CD5^$ntm*R`Ud7Q6ul8XNIlf0Ifo*i;fYghNMn62sRPy!PaHWn&wxw-FFXqy?4*={IhQx*2F$g*xpvaV zn_B|rcHZ1R_1uhk$A~TnU~u&W{tXY$nD>q7XwRPXLhGg0S$nl$Fl;)paca{WJO5zk zJmG{Z4qw>9SJjZ(yuY$@wBTyh*Q#y_XyA!vDqu8!I?;hV{+#_YDF^(f1JFFTIs=vx z-cm9PV`229dnRSK-?SS~pw5+8%O}>(B-X!r^0m{ioSshXT+qdv9i&OW!7W#jB4l8+AnKyh$zBfyk&5XuVJ^pT};%Z6?~wHMF|-NL7Vei>4E%uKEK}Y zY*^4}8!ON+{6i`ZZ5VVT%8A9SjcocbkA;qngNo#JM?8inw z1snaC&zXEinc$}OHCH1cuOL_!4>+j=1}tJwl&wNbi2}r&r~nU3REkZ2BBRD2Hvnvf zs3EvM0r4?_X?YXws~;vsKf(l?qVyvHKq*o`E~U7Q?3-y-)SV*qQt1v@6wg)mAU?C= z`6^jWiuaqLtV{5FrS}UunGrDZSXAipnxnkWB*i!+xf7+H>D(slZOkXM-nefo;IgBP zPEwcE(VK)7dt*`b#&X}iu_!^sL9ixTbHv4c|AF3|${taEa|{GM zv`Y_zwT>p%6b>Oyy2R`P#q;A5-BfEYiKr4>yIM=5i7ti1h|4Vv6J3fj$5I|>#8s%v z?o@F;DMwlUl`H0wy1J%fbfdI6?Vq&GDr6AljKJEd0HT(@3)&uO9S6N!D_f*t=_L`T zOAzep5vQ7li80M30y62CrIL0nb4v3k`{O#W!PTn+`4^0 z{J5kgSy>R^V+3=00wZ%HQUg-zF+7^n9v3)}!bWS(+%q8HE%zKf-9I?IedXP8b+~Mh zY@$7^q~~crM%x1H$AY%3(TBelf4R6X^K`iuIB+PR=!Qo}&kKj|MJ99CpmwTUDkjS8?dfY^3A@G!jp24|T0=nYlVK{^=gETin8)?NYL{evx 
zUv>BVg3r;LOvVz0HwkjD**2mLnlj;-E3rQY$j|OeyGLLJQJc{GtwC%O6Pj-fen>a4 ze4)Yk^rfc*Nlre=DIytbN9#tpv9;rMH#%S5d41=ssp%$hdoB53*IJAtG1%KCWR0f! zb8BY}bwPtUU~s}*@_6>d;H;tHjydIpM=w1ZFz4~+JY|VTZs=b&UN_E~8g8yZX-Q6$ zmSh~!38kg@b82P{wQ^~Wi6bZ-8}c$(*Zk)BuXtWO_2p9lbLxJjdnEA=Af60Y3;}Ba zZ!Mro2;2GWX#VJtv65?5V0;(UPgTsCwhCPCkvTw3*k(BoIkIAxY>EK_VX@nJYodGhoT*E%}Pd#tu)^&*W53o}4vp@;^*a z+13K|$!Z*lWql%L;#F2uKMC}pbxf4)3vKhzhkf?Svw=((pXn0ep*lv-j@OSpGx5Z% zwdGbZ>avxiE?d&bcI@@dDFIUvZz>vpe9|^+YMF)^cu#p9>1JRQg=<1@;{ zvbrW5QxDBrcL;b`nJ$b@X41$ubX_XwvB~&ZQ!^EG&|lI#Yg&g|#Bm9l^1hWW%N+;+ zTQ{}8+l@iMn9bxrDng}WmZS?h*#3HgN4hP}m7pp2TY6bo;MFy+wZ76ib@q4L{d*tr zJL_kh2g8dXYt91BGun|~K;jQM!Tf^php%OjKR1|{Pxow?s}`O)vPr7I7ENl3jso;a zziY+0{of|xD5HvltR_RHz-X4XfG8<+fh4FZPX*GRs=RE8E|}yJFfA?i;6nC$P z$w{B>h%59gfg`;Em0k$XagZRS4N1a7+;*YwhGp8KwtW%X;*4krf}yq;SzA;_M$s0U zSp`c=5G7FL_C5bKpfpP|dusSc*tTnLNJm_sX@%5=u+(M+Vv^QvQp_vtiXyMRipUuB z89X@PoZ{P^3yyTw-K8t!1rTaV%O}NK=K1R^G zcgGInvn@R%phM{yea_SjDlUII6_-I@+$E!1zl7dJEswau=v^wGZl-)nn~R7Gs;8ko zgJP=tEcyj?O5F-AR;gbqWd)j`+NME`dWlaW=!>lP#J@9KA{9jitQ&7#hoOF4!~SLoVl2x zQ)lMPj6>cN;*fJ5dJrc}LT-$TVrMEMO3R0-(6QOn!+0WKm~bba9e!jLhpJ_Tcom0e zMY;2I@_Gx^)5$AElUfi+DdtmvjXRT4HnKUGoBz%H@x9~wC$!`H$4h~I(#z-8Og8+^ z+1CrFI;ILIpLwN~&uI^2Z{o8zg*N991j|Vp*#+W%W!0qayG`FHCN9Je^=~A-o;1=G zOwWDsi7QWxSIneWk8BM(3Pv|vX`GCI&G?G(hil$wd%bO@cE`w;`BdAB>#nSOvHePW zAhnH8Z3{va%$flUsloJ&(Sk1@ysU-YprAGTt_0VHy?Pgf z1Y+$8uxV3p^@*`37GmJPgQKVSig!T|KhCf?Jko62!l#!l#8Z?(W6xPgARkN#p$QgmT9LER>%N;#8-Zzt3H=Vg>((~HDD+9N-_}A_kG0kTH z*?Ii2nT(q0j9rtRWOnyfn!j-u4))0zm(PCHHUhk2htREetl7b|tUy{3pH?)!mrpBY zp>jTiH2sci&E-zwq>a~2rxi~%2Gi1mX&HbtPEJ{jF#t>~MPo)uG$1JHHrQ;tuk0T0 z;L}Q2IW9hpppw%n-zjTAIlHdx8qc3jE1F6}AyNYTOp;SRi~%qraH>=6iO>Z7ChM=*>lO{W6)7Fk8+%YgJgNe~UwqM#FFy->5+|l8wbiXMVviOVfu?a>D zVS=5A5vE)sD#*#6o6@sn?Lsm_Wk#~g>sNc|W~p0Od%jj{!Z*%cy)brRqVDUTnF5y+ zE#4f_;+(J+M@jy^QDd?JsXJ&*4_fWCzDXpTZ>@Q$^;+x1+23xzneMk1%viUf0&^1O z$P{yAt&rnGrUMp(9BiEm(;dKXO*r4v5&cE%X}GPl#%x506f6ZDhGuoGCZmKILB1fWY; 
zf2Y_OMXd4HC`MtU9s~g)3tSKcuwDum#2Dj07wb!xIdkeV?l%mgj5$$A8O)``^XpUD z&8QJS)TFLZTV|icOA%WI)iAkqiI5>b757Szj@PQeUq(65e^Stw78y{KN%&jE_DOfl zBf7vINa!9eiz1&NCZF@1XQ&!VA%BHj+}CN6|0bN_^4$ad&pG#>*~@vHAxvP8yL!0Q^DG4XhF}ijOikw==fLS9Cn4f`PI+L3(`^*LW{`L!nHv%X zs$wEFkjAOz{+Q}r!UdttKilwcfrqyHAs&eKIaoJ$$FOa-_PJR2g)O)Cxi|$rD}?_b zxMYw8mwsJ~;b0vT*a)njZ2QT&O&p5hv;zYt2Tq?Ka4H0oO6RntE-e$r!Ht^A1lXNS zfPF5Z2T6NQ!bInA>ngT~OnP6j4HPpUqu>5ol>J%FZHx8t_AmH?nYJK&T`*!F@^1aY z1^7rlB(GtzW3pkg*Pqiold{flTE~_OY=r*?B?yZ}fPC_b5v1TdCsK>@%@U$SDE?1U`z1GbFTpBXvQ8+qY6-Ycrt*n8^jTD}s_j zy`lhEq`8Dex|mI`Roqb+*&aYph)t*E?#d~ zNGa93m^KOScpIcmLcaal!&-e#m^ukXAO)$DFt4Ca!n}ey3G>S8BrLcwrcbL+QtFdX zm=sFrzVK*_UXLS_0&5guS)~L4=wK9-2J-*o6KXZDph=Q{PpvYmLtna+qQqM;Dpr7WCgVQ^e%3PRmWV!r3r&xb zS4f)yjS#_?37rxfbVRb2O>aj{4x|(yLRBF-L*?ZzTTV1sQK2mf>Ukdk8q^u4fl@G+ zw`7m;P~Gk`Ly}B0lfMl+#n6VHZR(5QiBOQhKI%)@WqGyY)=?{-av(VyX596lz6p=xD(%FBJa3GNwt53;h&4aPMR6Rr>?OI``KTGN7RLUJ_wc5}zx& z&|bMGM)X87Bnzf-Twl77O6h`qDlvW4BsSIbasUK~Z7*G6^B_SfC8wAlMrAyM@sh@c zg`_h$7)9ppC2(HfMLmLa7(1Uans9)QrPOD^cpyoPxvavrlF53pz2!JPzedh)lf&e{ zn84Rd6r86bzD+(FF)o9G-yz=*$axtKj!<7D|1NUAM9y!J^8z`S$+=FB+&qa|JxFg! 
zJmlgoXgAZ|Rt1DBVrdK)wEOocY4AsA8e1Q2IWG=h86GhSbX36A2hvOV^ioJcC00>w~?S^lE6vxX+LH9P-m z^H_7hQN=r|Zg^hqyWSVxR`YyH=8IKVs?gq=e3ag{NsHpiRJ}fXv(|60DJp<8kbh0* ztYPcC!9vymD<;lDrfc4wceQM+EMO-y3O5eC{P^|9rxJtBn}f~k=G!;Faq9I`i}5kl zyR_g*tnbh+BxoE}cQvt&P1+F)>Jx{9Y|Fz~`xX>^*)r~!=!NvyZG$~v$cL#1$X(6m ztPA9H@HriUoNas#EGBMWjL|0S)ZQVZ@h*Qw+pM9TdY7Fil4@j`5@=$G%L zDB{bI0X@|7x^=_I61@cZ1EsX=qek9TC!B&Mu5vGq-XZHyjw>+QIm+vd^6?zN$ zG9#LV;P*mNAv#}=EVHHdsq$ufbC2oJ%Iu{jsrMR?CZUf%WUI4M2trYo+6(yFa zlvspv%2eWu5nrwn@9-8tBqHI4QM6a*D^b*-B8}uNVQsBeDa(nH5^tD9`*N(srM@!E zqpv95Vy#Nf(0jOe-|y1rQnXraTuRaZdX<{X5Z|a0k8$xj-3eaT4fFRBgy+~|<++oH z674Hj)JBBdh|BBhBO+PEMYGMFB+fps&1?3S->`_s315Y`0=5FB?VXynckg}nLh*K$ zoA++0-+|7NC)_S&w`)l-^`#c5N9?DvuTJ*xEg$$5>OZ<2GDnnZv_f?@J0^2z()FH!Jya{hpvKP2aef=ZNtcp`HZa5HCNll+F*p*QAzf) zxAFM1Z}aasG?Vef$hP^yia=otU)ZuDp*~1}NP_h+Q&BzC8%7c@KRS9A1mEP87vrzQ zkLZI1HGzUPJpK%MBgV^J@1ae0$JNrY(((MU@{ums22RclSnGIeT`(&@m|Yrlx`MgI zuvVO$^u9UHoCKzL5T?jNe>s2(B3}5z=bOpg6VnV4u4uz(2-4h z^H43|nA=hoG_lDk*m38SPNehM)g!wgjzP6n^VVviR%kV#zC?X^eb`9DT4Y`WpVu&x zw+80KvGXsk2o&w+i+0o4WR?aqi(tl_Frw14m>shk-d?k4F)&kV*yfLxg5!~y6UeCL zGirk=1PpX9G{@(INs>cD{(hRqltL$lim{4dvSZZ4Cl`(%@?%UhZKNd;%*cH4_?6=r z3JAY~SwcQIE1aB6yTd=5fAqK(V&Ry(&8)nfDT|0DFkGKW5R;xEF zxg-b@lLWVgI2~9dS-DqjW45b#V|fcP6t3EMvyrdbhTkq&B;bqP6N}_2oxU)PwL=jK z76}W=ED{zJSR~9Vut=C!V39Dd%pzeynMJ~a0*i!sBUmIXPG*s?pui$wUV%lzyb&xC z7ALSsm{(wtFt5NOVcrN935$zhk+8V(aKmaWPGpfph2o-A!la>DaTM#fbXA0WcdPN_ zSD$m@Nt8Gs#6K~uzyWdI69>cu%0F?=BQ7^_M%3byz~QiXlE4K?0vAN&dxY~s(xT)Q zcn#^`Xvn+}GNh(fkC4KKTC%_~vExoA)S{=%UiN9uvQKltUm*^NO6kDT$^$1Og&8?Z zgyTA7=J!?7x5o*8`74xoT=P8CO#nK8!rB;n*^(9anq| zA+Z@Edxi15KAw4_b4zBm6k{jxo#b(kg0?~?x722rS&1&Am?sc#P|g!YogZ6&VAe-! 
zwwhV_bl>q|f>{)`?6$YFG=?@<~?y!kr$zD7=vp3af;4mp1g2gnbM9K=|W?EB5W)lr#} zj1|dfM~pN1Z7P-qQjICezW6Xs^hJ6e?Sm>nOC)ffQ@xNy1?1XaP}6=!4x>MX_hXRu zE8%pQ{yV#sP!!P+s=`+2 zFWz$Vr2oKUfdd{Me_K8N%wr?lRoShRJAy4+!T)n@*WLy@RL>Z!I>um?aSfI-Si}1~ zZ%Vzqb3Ack@2sijc4FG7_6zOvAVM+Vx^muDeq+tct=C&8pAEL{Kvk7p+S}{eQB_t2 zfEuzkYA@};5aXo0+VmqkaOf4eonUZAuxgjKZO7<^iH=!QHHaV)Ox%;#Pfk??TUznw zZ$Hep9G>6U`D62&X7E{Sb`zhKn79dyi5nYY;wl-fYE0ZPLquTW9=d*L$`M@G5?r@s ze#4eGe6Rb6iQD}kOx*GI*@oSLhJ$>=!9c@dzTvRH;K^A-516)U4BR{51MB^nl@o4& zkPuTCfr^R!vxa)LK=}mlDC)wPzJQOoO*`dk-cda<6sXz2&N4hYYZc-UjYyrZ-*9RF7B&pd0eA|Fy9hv=YZPDVWm_wjzeQrBiZMRNK#ez( zEAz^=II+XdR1;Qp$DQ25+5tn#NRcMIF{O(XXer9nyJO+%$n{#IA^CbU7<=8OEJ5&N zw$t4t{wsEOjdGhrpW8%p4bR_Jl&;3Q)Okz1 zF1*EJ#-MH%^UHUbhcb(7ZuWM{eHDr_TjZ3Y&s&c9wO$O#cW(va+vTw6^X4-a?M5*y z-;sWk91?x3rTDEyUg{%(*sYN%H5vetgwCA4N=0wBFHPyKWcBS-c><=s+k)}2hKU~4(Mw%G%)qCr) zCY1K|%IQU)xGMR|6t&*hZ}~=g*h~%oB_9#%lkfNOOD55fjdBU153!%nFBxypE!HC6 zZ)AKAdnmdd^!I%uGfXt+llyg_RG*8&O3wYW{_W#y{+8lfdQ5mTN>}_EUcB!ws!#cw zRGZ89zQWL4{!~{O^||b=yJ7ntkt1+nZ&~b4X8Xtn#cJ>XYmm3$MwYZzvsFk@?!TyU zRICZgx$*#G7rA~P)oF3&@!8I6T=s76=R{ii1vx|~+j>+w&LKqGzy|!^rP{mjT1k8R zFt)osEhZh}A}_Z_alBBKxW z5HZ}mR9`Cmq#U7;z5cjo0GjMv`~SuUiu>SI zl6-$p&OeaD#-6=}|4UD!3bB2Ko~~AYjZrOS`L+K`6@8ZqU?fmR_+&KC^YnnRD1V21 zFH;iwk_%8eKVWg>Frq0Vl@gUyK=PG@-2`d5f20SD(RmF8U=+ScL%5itjZZ%RlwHD+geA=kdz8Z%DtCp z3p$JZKw-%$31n6ASygXm)m+xkr`ku?j90&vTKDSlDbLNrZ>`-IEUWUT6~B`P4Qh?O z7DU1JE^VM~FWSw+0+FGmaX6+rE)5zkUCFcHVg6Om^j@ zmd~ym+4YVk`<=`(zqQPtSr*JIo=6|N_yI;_o9@+y_cfZPZMxYLr~CsKKGJATYd7l< zMYp?j>(niRl%(BCA*6%bxJ`E@@x9cdV4-W`z_kt2g%|xz+XGGe`KJB;gOBn}k5bIx zfxzJl{NW4!&s^jWU%b+FyPy<~J#VDzoy3g!lwyC0dnU!>H+eo$sllSw#dKp)n=C14 ziz+Ed?f7RSDfod22s@*V0Y?Szfd0;mqjtWmI#9NcFWVQ)%_B)VM;YY(f{tQ{Oaf6j zklVoLHUNkhl5VafKvF@taP&B21!Wnz5(wr6GD~^L`(-(qXddDFilTYWfTNLjG%~Tk zdfr|ibmWl^{*zapgv?$jniYD|gTdrFIWo>*}+td-htm9?}U)DKo=|m&0 z){NBz>?Hg+2@ydcckuS6sfs{L7vIta)=0~4-oE>C{Cts%#OeY?O?**Pps1BEYMtu3 
zm4EBtt!)1j$NU4Q{Y9-aMQ3%Y%0O!kjeSgE__cjRbP)`JDP-eo3&jDVSFj$ZO*Bn%InA!{@FcXgJF1YQRHW`lW!S1*oT3}!p8J~Z}F&{6VXjs-Hq1)4kntMj?lmlNkpDgz~(_>xUQ zdvU;C4nf$7mdSHdM`r9DL1#sJw2)&l3_^pKicoT$Pt2BAVf!t<37xKs-YYXwd z*$@o=IU>!Q`Pql;u^h?4aBuFuOR|)Eca+589o2u zVB}-aPLAduvj{hr@JxUIscO%_GlNVPaIS%cPn~vq z`g=UjdXAohvC!)N-sfPIt@_wGstNc^)p&lcznAL~3~A0Kg{Pm36Ee&tNI6&%MHbG8 z={y%pm2n1YOe2LYtn!{-*qq~fU=WWZ?$?}^#kv86DcFm1b0EEH=F(U+K)=rSa&DL& z0T$S^LtG*iMrWP5ME1P@^zq}6SRj%X$IN*XIe;T>mjEC{5+Ga(IjL~wDp;oQ?kw%T zVAWm#6-uYP8RURrr{Rcm#1XZ6u6}hz!OWgCN`hvP7DK@~sE9SIDk7ATBa1`tf*F*HG@Z=2F%)^%A=n1-Mw{dIpTD2PO4_(15|Tv=tTb@mruo1 zt+p5;Pr*G^&GJ+q>+co4Q$j_Q zQV}ipR>Udc^$Deb8lmA_lEgnVw6KB5Cyt#d1u$ zk645RU)I~-5BRdpRO)tewvf|F&KYvHlCzDR9po^r8MYPJNWl(rHo=)QtY@pRr=6R^ zHv?;iV7zi6Myu7{HE6V^cQr-tYSMq9fv5Ion*D#NvHeW5@omk3qkTyGkuL1XM+PzY zp^>EtC45Yg@I#|U+w?O{^Sc_?M>>=SJXK@*iSUn!jw*qJ}(+Po(bB zYYY9Z&5IiHEIwsC6^}5bC(q(jmeYEz(O?DM|$S7eLS;kd>!4Ie^n!sFdJEPtW z?w%De+7V7`qi?*R#7(JBOvsRFyasX)SHnqqZ zEtJ|AiY|tq9e$3&oI`fHpbzDP>MNzqTZCOKnMFUTqap-6uH(w5PmHEGv6iEc$D#g;ObNI8d8rL^U_!9%n?Q z-+kYyW^so^ZK{ZaGVct-i54&C3Iur<&$Wou2N#>$hCz`|jPiAR5p1Tt9c!b^CJv1=qDa ztTzw4D%x^5Z=dV#x!~$9Ks)P^_a@vTtb){Z-L1#dmCsDpLer8apDLcDIYN#cUKJLV z{~;~1%F`-Tf*h!12If(yRDxiUgvwA+s60moUN~DmWI@&;1uB24%t)$0rN}Y^Qzxfp zf&G&vkwvaEN^KDfD4!t}+FxZIXMihIi7Jt14EMbXl_Bd}wVl!Yd$zp`?Lu0q?cF=- zYy1BChV~#ms)RF&_-h4C3`tu9))yJj9!$mbELE~jHKI~wTWT25u5D`6?5IhxtO+Wn zRb%I&{ZgqH(zr{l)}q?(4ox8$UQkv{3rtRX&>5ybYpmCHKVG0tE1*E)`L|S zYRPHIBDbH-Q3Gmus+QXb?N7G8<8Aj?A_kZd)cys(F&O zM5qFEw9S~H_j{m!He|rIm+%JE2so9&8-Mztia>kw?U2)6!BtMHWPM?5!IJs4XRy@{ z5G79PK%x@7L%;ojc6t z0v<`m=AEePd(`eiDr7>P`7=}6Az*Nk8Fk^3oOzD-5-15B-&!YiI&9@njlHxhoX8pf z-SW)I=IA(#`3czh@Kqi@l9(o(+SFCpNs21>eJ=;On&zj+85S>5O!d3DG0647N!}%u zI4vv|AR5Nm;4tgJZdz2* zL0&WtvH*MB9NeXYqIL*M_(KOpvQJdu=0yYJWBg12N~huO4fkTkc~VrnUE@3}st2Wv zlJ35)6fKRUO4J^SK{x1ia}q>e6UknYyejHb_j#Awp9RM<;9w3I$WT35UQd862Hjcg znXz~qW$`Y^mf{5O8=`U?tf3D~`dt9H0-_pc{!|tat!|HJ(m&~QgFY!^ic}6sGz5o+ zIELqN_jK6s5|p7124pHgdYKpcEQ9VWI4D|VYT8VG(VP;Q!MAXL&^!*nHIi{*NF}>+ 
zqA|zh87y5!j)~e-0~x?Al5UweD3XJsDX$0MU19IV<<6%M%5Z7eLO)_3(v_ zGG|=ncxkgQHZ3$odzM<_l^yZZH`Z-8!&f)Wgsn7MC0Og9DxT?=^(!?i1Bs@yFPbh0 zO&2~J{CwookwnvV!FoM>C0SG+?T-9a_?-=F$^Wqi?xcP}AFYWFBrN-0SoRB+{Z9v< zjVzBOEFIy?NlSV3%EuE+b+IX-s!g!8tvFY`U#R}*{z5C9x*;6AAy{stxGsO}Ub4rA zg{oG;(z;T<+Mnh+c|&MValJMYdTLG|(a+b!9gPWHQ{L zaB+BHIO>j7B}!WVWNMowleWsJdtv{qHfgK)s69$YI|SR_*zgkl+wP@dq2Z`tJ36a< zX*6Sxdj4XpblvEjB{yv4AJs(rqK$&BHg-47TVI$LR4XlC99kHP-i(p4m@z2 z)P~MFKOpF8X7x#(dA=o9F4*=7x_z^nJT#)&tR&0_HmeE8-jCa3bgVcO)pQc zT35TDy!YHHw4MEZ_jCUFC84!Pu=j+oy);|qh9krCzI26>=A|Dc%*Vr*z~DtCFU+-q zxi;3bQWM{MIAJ~l;R0KE^y=f@N4-xEKI>ZUN>pJz!dEx!wcEG&{QMtl-lD~%*-@~) zMdyMuI`%m5DDd>wv;O7&#O`B)?HDWwmv(0I}cqmAVV5bN9-wxQ(Pf_LT*!E z^6rrVhZoa)1w)Zg8~U;J%c-Rp1wFQYa)6U2$Kck^+K^%tN1C>Lw2~&}TXuj%3J7~D z-%6VDEu<+S45}#9cW7f9jaUoRfH>IJPu3=FC&l*T0s4YH**cK3RS4jz@NJ5Rv%iTa zV$ZR#vY@O~cvx<)Fpo6GnfBhm6LI9Ury4EbbEb7yKd>S7&NkvRF*iiWzClVtP6?S` zGHai8NberH5XD2pAoUe`=ZM%NzzMzrFu!{2tjVh{BHy5D9}72@x+Ng8d}ofEXY#S|jshb2_8LHszU7hMnZi^$VA0>l=5d zU${J5zZhvCDy>6SWP4ilBt7!*%wZheo7G2#kV+nd+l157EBSb-myrK~?^{nkcuu|Y zLGp9OPEqs-Jo@B6+T{461$L&00{HWuX|P8;L_)vdaVFzlITb8is0kL2BRtfou3hNs zW)ES?VMsW96$ z(fuEAy*fz3M6~^J$D@u|`=hQU>!Z_am8X^_o=q)JtxPP>tlnIHFJ5^{($`B=!)qD* zU=Y4O0~yOf@-;*dfRKX2n2KK@B$S;5C|xAMG6BNSq}sgW(~Z6u2<4!?<7sCtgTI()i3k@dxL_-QfxF zGUkjC8LR?T8J$?8s$;{y@&D4l z(hWY^YRy{1@ztr{g+2*=aZ@;bJ$}1y?Q~yUbtg{U`R?`XFQ|r$c1fye_C)mXi^_JP zvVFbsP+WB=P91tl?TRw%R9(ti7#mur>f@^VI92~zr7&v3=U?jxbMf2{BR>TH`)0Hx zHofZjUDYR5>!yq0i(gt#g|B?6wI>}VQTxK(S;Nb!Ju$_vF21Pi66(5E+Y@ys6IG|; zy7DAd6;rHHwSV4s_|dM_*2tBlX?Lu4%~Y2xZ%83~*G%=v^2W%Ojq;}1D;sT{f~qCl z1Ae9P%ksUkOY7w=3*9R%;cI`+!!{b4Uo@N)8cwb^oc@L4vEd`bXSY7T`{`YXSU|tk zj`==8Rgu)0V74T6MbbY2MsTm>e^Y7}eF%j)pn1*|G0nRZx-vY^c81H7j+*p;WmB@U z=|7rF)SAr$gwZzF8R?wAmyo7jYKvC)vMgR4nWWEYVyA&3TJfuUYkDP2!cyZ>n*ly) zL`|RczMk4ql%rl=)Wc&s$20fG+0+)X4@;G{ek9ABTqmTPN|GdB83@w!Z$#6kMoqRv>+!;PMF6;IAr!iY6%Q&l6_X@c4}N@E#Uffj zn>7+zOB6d6TNYY2_eyA;jMhtNgN!yxXcJLh`Plf#xapM8eQEUHn=$%INhplzM$V{+ 
zn)+l}Mf%@X@{Qv*Nmk=!3yXPEiIEN140<9xn`(@Jmke`sATqG2l~8HQVpK1o2Et;W zzp!|9;i^zny-7(lBLUJ{A}vw7VA!>3l4xe4xMZ_P0xcM{N}vsc#S&OTXsCw+4+dVB JVpKBi{{qL*3%mdT literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/compressed_tensors/__pycache__/utils.cpython-312.pyc b/model_executor/layers/quantization/compressed_tensors/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b57e9de15148d5872304744381842492c90dcaf8 GIT binary patch literal 7738 zcmcgxZEPDydY&bB`7Me1{)()X6kDPaQBM33TefwlIM$tWvC^~Tv`}F1*4(8;dqpz4 zOW6{rLd^xOd~p#6XdC5%LzRCrlD7WQ0`5QmDUd_|kd=L6HV)w8?nm>Fjtult6lkBB zSyGfNr)dgw0nX0s%)Il?%=5nU%*%hSsRKiaw%y@aR%P0)hv_^Jjqmlo9Ocq!px->5vF8hS^X4zoTr zmDaSzqt;@i`d!k6D(Q7vK;7=M_Xxp6Kn-Jr20u~T)Ou*P-J?MZVNQ;&(4-OV8Y}IZ zP{UhLEt^r-#Kvx^X!(>8`P|W`75X9c!?UXj#s8s)?|EL)Wq|9Y1#EC2d$wUK-9UShMNW@Vu_i4bP=jO^<2Uw0PD^8^gM?pc&?{qqbsU zlZWHZhHGlf(o$v`^;s*ao6*dI-IFw9O594$v!-Q@d4XfWXoO1C-pwf_%j9>y_Dx@; z;EQbfh6}#o2ejB<@bz!{h6=tRR@_w^E%>6FzQYCI;gVk(@Z`m3OCb_!$TuI!2an{% zBU_M5>;O|emQv;b!{kgVZD6qzSrb~%aYV-{`pOSoPlm>Q%md}fvao=Cyj+b~r>*KL zdl1!4y~C0_=gzt2iSZ`ecft6>oGZs`#GLpFc~6SFW=Pz1u?cM@$gCdMkqHUXCl2o4 zcim_a$&zPDQpu7x=Xr%(d=5Q*mgF=hIbTlrR4lh&^5?wE?wo(gd!spr{FRrmxWbK2 z3x171s+1Eyf@jPTq~2diW&Ph4M3Snhk_+5c$@N06>s9i$kn>JLe*wQG=_-l%CdO?c z`qh8ox#cpNF^28sxH1=?8&79bs;tOrGLg^>EoI5o;E~PvtTv~aGQc1&igGd~FFqHQ z=U;p6oNQ=sWs~qEvZCvJ+|VrcHj+aWM>Q`ZA1c0Q=ZCpk@cQTkLMYYE0@u#@hys!S|xq`Q6?eZN@;lOCIb6~x_(0M5DYkL$R z&mP{A+^vCs?K^a{uH+^)t(&2Jh0wm$h4tb4p^;+Wp^`-WO(lXhkks9m_jP{Jv47>& z2fZhYo%`44AG=+BwfRst6bbj`>jsMlju*mxB@b!XpC34}QGen=i(G6OE4KFMTL#v} z_38E3N<=t*qU3fppV)GXtqmn2);BzAC7lNz50LKT#nztnqmbGiz)+Y|s5>7V%!`9a zi@|+)ao;yzj|tGmq!7Q={&GMxo$z9gzaF?f zzFBwT*Ztp^bhm!q(k_4uxUnQgC+$PG^Jg*#emC+4P0rC8ihhWC93Y{(4Jr@GM(})I zJa52j65E4w34|g}#g-qsh;%|*pCX~lx9n1dr7Bp!9lOfQcTF%43e|POccmFQ7aTqv z2M?swKuX22(h+j*a_th_{c@e<=WRhH-OS;NzjBws2v;fskdkr;6JfCUv8HE$#Y8rR 
zQ!|~S2-4H(D{?jioF?P5axdS%Ug|2TH{3pbhn&-w!z-OiBxle)_z#5*{u$%C&yy275(2Ghky>QKzHa0ftV6330lY+tPKmshhyHDi5!5v zGHb>8u|03_R<@#)#mKC2gc>%>QDLAsP@R)i)6r)G0G*4=|n}D*?1h(K9SX<@>o^Lk-%9! zz=gF*&}FL{)tMa=HIh*bWv*)ZMymF&x&>Etgzrl}>awHP@G?_OC)^GPwLWhj<5voKOdWiL#tmm;he zKCd?_pG|fqYUhA;Xxars{s9t59RePgq0sL(aE*>$wgb@Yz+Zqr6^{A)L@{avtY_40cE?Nb}U zv67cajg&66Gtkvl}vqF|#902TO+SQXj>)l{sv_Xr|2Wjt=Np?=p9xmQ39?CoW0?w|GuX7#4TV zhH@TLvnY_JjI8!N6QrnWk{(0=@`B9T!!tt(Sh%AIKu^(z&d#zpUfwyMm}-tIb?lyf z8UbBm9WflWDFp`y^QpVccn_O(VDx=We&?)5D*TTg1Zg&XmAY0ld&$v|tukAUh+rJY z7Ec(1NFyQx1qV9)Yz!e`Os4{FgR(l7{3)$_7RfT%>L86BE1n0zzO^&!NAE^|6Z~nh zvHch4z``3lz^X%G4mm1r2gE=UIK=)Q849sIucN{z4iPrGa)Rno-5t1+S`?b>*(*o^ zSdua`GK2+jLCU#|UdXR7n{sbbA=J3IeoHN`zg!M3%~uAn2CNkEEIwchd3rbX<_P{y zth@S#C*K`iz3|s>e)#4`7Yn#>p#d;g5A+4X zH;??HyI9v)3~~tCHGFmXjdrKMqnl4>x^)0>d&>RGwSE5rX?AXJG(Li8imMzIK}1BX ziin8UpB53RL^zy?h_ocmATE-V;Avrhpxe~v_8{l2ieZR3Z+T?++i=da7i`hSF&Y7P zuNwm**}L6#tOu=5h-91G=R9QzhLrR0NYEtW)Rjnyh)9X3{2dm7S-?3OkdT!ph{iV0 zD3f*OT~lguKB*$4;c${hAJyCh*q2bt6ijPqab2;}2hD4BK$C>iWz}dMBKWK1f zq#XXi8d4p~?8agFJ5?`>LzIUp<}jb9a2M9(JF~zYll;D3|Nlbtn7U`KDRgzr5muts zUPZ=F$-UsJnjYmGE_yI(YI-6Db52Fyx-!oqNxhYhJ0>KblHdH6&b%u>jOn1epQm@d z^8LFy3SVRz?}${;BZx&R#*wLtRJ74Z#dX(S2NGfyj%~sInsb5^x#-vtxjtXozL(LxFi;+k-eUbVL9{*CReobzi2n zEPN<2gxTI~if3w^3}W@xECPt-qGKtXEYNX(=4>gCgU|?&L6g75&-@1@G!7CaHT4@* zdnBJtZG_G*pDB7n%S-Up2%nw!vGAGnFC!CMh_`t@ADLLcx)Rv*brgIZt4G%Q@B0Sx zBNK@GNeIKC0eEfXx#ictXp~pHU$peCj6Xo6#(TTve%-NRxO+2vpb$Q=Hh(W1{XEn;q! zNdCv|FV~6@VLZ}e*TvI1;x&{?no;Fa+@ME?LD6JO7XCUi4H~|&B}P<@QJQbq4n~y! 
zSm6Ji_}Dca*Fl4xLAD!?$rdlA(>nVv0rIl2Q(L4LLLN_Lu_bz=;B<_gr*w&L5l*yd zrV`%J=_ps1i3^(3X-w%}B$Uh7<-JHA*l7~DF+@{XeLu=yN3i8jYpdIDR1no9|iFe$7bDiQj=({{J-AG>(|v%_2L8^(){@}c}up`$Vs@()7| zB?)f~Q`>Sge*I#}OG~~d2BjUbyB_~mkmgi@{7^U}JaUJHa4}e0V(<2zwZ0F#c>dT) z_I@b&Ru2Ek^p=Enr2%qe scheme] + self.target_scheme_map = target_scheme_map + self.kv_cache_scheme = kv_cache_scheme + self.sparsity_scheme_map = sparsity_scheme_map + self.sparsity_ignore_list = sparsity_ignore_list + self.config = config + + if transform_config: + self.transform_config = TransformConfig.model_validate(transform_config) + else: + self.transform_config = None + + def get_linear_method(self) -> "CompressedTensorsLinearMethod": + return CompressedTensorsLinearMethod(self) + + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.float32, torch.float16, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + return 70 + + def get_name(self) -> QuantizationMethods: + return "compressed-tensors" + + def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): + self.target_scheme_map = hf_to_vllm_mapper.apply_dict(self.target_scheme_map) + self.ignore = hf_to_vllm_mapper.apply_list(self.ignore) + self.sparsity_scheme_map = hf_to_vllm_mapper.apply_dict( + self.sparsity_scheme_map + ) + self.sparsity_ignore_list = hf_to_vllm_mapper.apply_list( + self.sparsity_ignore_list + ) + if self.kv_cache_scheme is not None: + self.kv_cache_scheme = hf_to_vllm_mapper.apply_dict(self.kv_cache_scheme) + + def get_quant_method( + self, + layer: torch.nn.Module, + prefix: str, + ) -> Optional["QuantizeMethodBase"]: + from vllm.attention.layer import Attention # Avoid circular import + + if isinstance(layer, LinearBase): + # collect schemes + quant_scheme = self.get_scheme(layer=layer, layer_name=prefix) + input_tfms, output_tfms = get_linear_transform_schemes( + layer, prefix, self.transform_config, self.packed_modules_mapping + ) + + # choose quantization method + quant_method: LinearMethodBase 
= UnquantizedLinearMethod() + if quant_scheme is not None: + layer.scheme = quant_scheme + quant_method = CompressedTensorsLinearMethod(self) + + # choose transform method + if any((input_tfms, output_tfms)): + return CompressedTensorsLinearTransformMethod.from_schemes( + quant_method, quant_scheme, input_tfms, output_tfms + ) + + else: + return quant_method + + if isinstance(layer, Attention): + return CompressedTensorsKVCacheMethod(self) + if isinstance(layer, FusedMoE): + return CompressedTensorsMoEMethod.get_moe_method(self, layer) + return None + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "CompressedTensorsConfig": + ignore: list[str] = cast(list[str], config.get("ignore", [])) + quant_format = cast(str, config.get("format")) + target_scheme_map = cls._quantization_scheme_map_from_config(config=config) + sparsity_scheme_map, sparsity_ignore_list = cls._parse_sparsity_config( + config=config + ) + transform_config = config.get("transform_config") + + return cls( + target_scheme_map=target_scheme_map, + ignore=ignore, + quant_format=quant_format, + sparsity_scheme_map=sparsity_scheme_map, + sparsity_ignore_list=sparsity_ignore_list, + config=config, + transform_config=transform_config, + ) + + @classmethod + def _parse_sparsity_config( + cls, config: dict[str, Any] + ) -> tuple[dict[str, SparsityCompressionConfig], list[str]]: + """ + :param config: The `quantization_config` dictionary from config.json + :return: A tuple with two elements + 1. A dictionary mapping target layer names to their corresponding + sparsity_config + 2. 
A list of layer names to ignore for sparsity + """ + if not (sparsity_config := config.get(SPARSITY_CONFIG_NAME)): + return dict(), [] + + sparsity_config = SparsityCompressionConfig.model_validate(sparsity_config) + sparse_scheme_map: dict[str, SparsityCompressionConfig] = { + target: sparsity_config for target in sparsity_config.targets or list() + } + sparsity_ignore_list = sparsity_config.ignore or list() + return sparse_scheme_map, sparsity_ignore_list + + @classmethod + def _quantization_scheme_map_from_config( + cls, config: dict[str, Any] + ) -> QUANTIZATION_SCHEME_MAP_TYPE: + """ + :param config: The `quantization_config` dictionary from config.json + :return: A dictionary mapping target layer names to their corresponding + quantization_args for weights and input activations + """ + target_scheme_map: dict[str, Any] = dict() + quant_format = cast(str, config.get("format")) + + # The quant_config has multiple config_groups, each containing + # an input_activations key with details about how the activations are + # quantized, a weights key indicating how the weights are quantized, + # and a list of targets under the `targets` key, dictating which + # layers are impacted by the quantization details. The quantization + # details follow the structure defined by the QuantizationArgs + # pydantic model, which is used to verify the structure of the + # quant_config and also store the details for later use. 
+ + config_groups = config.get("config_groups", dict()) + for _, quant_config in config_groups.items(): + targets = quant_config.get("targets") + for target in targets: + target_scheme_map[target] = {} + target_scheme_map[target]["weights"] = QuantizationArgs.model_validate( + quant_config.get("weights") + ) + + target_scheme_map[target]["input_activations"] = None + target_scheme_map[target]["format"] = quant_config.get("format") + format = target_scheme_map[target].get("format") + # If no per-config format defined, use global format in config + act_quant_format = ( + is_activation_quantization_format(format) + if format is not None + else is_activation_quantization_format(quant_format) + ) + # TODO(czhu): w4a8fp8 is in packed-quantized format + # but needs input activation quantization + input_activations = quant_config.get("input_activations") + if act_quant_format or input_activations: + # The only case where we have activation quant supported + # but no input_activations provided in the config + # should be w8a16fp8 w8a16fp8 can also run for cases where + # there is an input_quant but it is ignored + if not input_activations: + assert ( + target_scheme_map[target]["weights"].type + == QuantizationType.FLOAT + ) + else: + target_scheme_map[target]["input_activations"] = ( + QuantizationArgs.model_validate( + quant_config.get("input_activations") + ) + ) + return target_scheme_map + + @classmethod + def get_config_filenames(cls) -> list[str]: + return [] + + def _check_scheme_supported( + self, min_capability: int, error: bool = True, match_exact: bool = False + ) -> bool: + capability_tuple = current_platform.get_device_capability() + + if capability_tuple is not None: + capability = capability_tuple.to_int() + if match_exact: + supported = capability == min_capability + if error and not supported: + raise RuntimeError( + "Quantization scheme is not supported for ", + "the current GPU. Required capability: ", + f"{min_capability}. 
Current capability: {capability}.", + ) + else: + supported = capability >= min_capability + if error and not supported: + raise RuntimeError( + "Quantization scheme is not supported for ", + f"the current GPU. Min capability: {min_capability}. ", + f"Current capability: {capability}.", + ) + return supported + else: + return False + + def _is_fp4a4_nvfp4( + self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs + ): + if weight_quant is None or input_quant is None: + return False + + is_tensor_group_quant = ( + weight_quant.strategy == QuantizationStrategy.TENSOR_GROUP.value + and input_quant.strategy == QuantizationStrategy.TENSOR_GROUP.value + ) + is_symmetric = weight_quant.symmetric and input_quant.symmetric + + is_group_size_16 = ( + weight_quant.group_size == 16 and input_quant.group_size == 16 + ) + is_float_type = ( + weight_quant.type == QuantizationType.FLOAT + and input_quant.type == QuantizationType.FLOAT + ) + is_4_bits = weight_quant.num_bits == 4 and input_quant.num_bits == 4 + + return ( + is_tensor_group_quant + and is_float_type + and is_4_bits + and is_group_size_16 + and is_symmetric + ) + + def _is_fp4a16_nvfp4( + self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs + ): + is_weight_only = weight_quant is not None and input_quant is None + is_tensor_group_quant = ( + weight_quant.strategy == QuantizationStrategy.TENSOR_GROUP.value + ) + is_symmetric = weight_quant.symmetric + + is_group_size_16 = weight_quant.group_size == 16 + is_float_type = weight_quant.type == QuantizationType.FLOAT + is_4_bits = weight_quant.num_bits == 4 + + return ( + is_weight_only + and is_tensor_group_quant + and is_float_type + and is_4_bits + and is_group_size_16 + and is_symmetric + ) + + def _is_static_tensor_w8a8( + self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs + ) -> bool: + is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 + weight_strategy = ( + weight_quant.strategy == 
def _is_dynamic_token_w8a8(
    self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs
) -> bool:
    """
    Check for 8-bit weights with dynamic per-token 8-bit activations.

    Conditions checked:
        - both weights and activations use 8 bits
        - weight strategy is CHANNEL or GROUP
        - activation strategy is TOKEN
        - weights are static (not dynamic) and symmetric
        - activations are dynamic

    :param weight_quant: weight quantization args, or None if weights are
        not quantized
    :param input_quant: activation quantization args, or None if
        activations are not quantized
    :return: truthy when every condition holds; False when either argument
        is None
    """
    # Callers may pass None for a side that is not quantized (the sibling
    # predicates _is_fp8_w8a8 / _is_fp8_w4a8 guard the same way). Without
    # this guard the attribute reads below raise AttributeError and hide
    # the caller's own "unsupported scheme" error.
    if weight_quant is None or input_quant is None:
        return False

    is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
    weight_strategy = (
        weight_quant.strategy == QuantizationStrategy.CHANNEL.value
        or weight_quant.strategy == QuantizationStrategy.GROUP.value
    )
    is_token = (
        weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value
    )
    is_dynamic = not weight_quant.dynamic and input_quant.dynamic

    # Both symmetric and asymmetric input quantization supported.
    # Only symmetric weight quantization supported.
    return is_8_bits and is_token and weight_quant.symmetric and is_dynamic
def _is_fp8_w8a8(
    self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs
) -> bool:
    """
    Check whether the pair describes a supported FP8 W8A8 scheme.

    Weights must be float, symmetric, static, and use a TENSOR, CHANNEL or
    BLOCK strategy; activations must be float as well. Dynamic activations
    are then always accepted; static activations must additionally be
    symmetric and per-tensor.

    :param weight_quant: weight quantization args (may be None)
    :param input_quant: activation quantization args (may be None)
    :return: False when either side is missing or a condition fails
    """
    # Both weights and activations have to be quantized.
    if weight_quant is None or input_quant is None:
        return False

    float_type = QuantizationType.FLOAT
    both_float = weight_quant.type == float_type and input_quant.type == float_type
    allowed_weight_strategies = [
        QuantizationStrategy.TENSOR,
        QuantizationStrategy.CHANNEL,
        QuantizationStrategy.BLOCK,
    ]
    weights_supported = (
        both_float
        and weight_quant.symmetric
        and not weight_quant.dynamic
        and weight_quant.strategy in allowed_weight_strategies
    )
    if not weights_supported:
        return False

    # With supported weights, dynamic activation quantization always works.
    if input_quant.dynamic:
        return True

    # Static activations: only symmetric, per-tensor is supported.
    return input_quant.symmetric and (
        input_quant.strategy == QuantizationStrategy.TENSOR
    )
def _is_fp8_w8a8_sm90(
    self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs
) -> bool:
    """
    Check for an FP8 W8A8 scheme on a device whose capability is exactly 90.

    The capability probe runs first with ``error=False`` so an unsupported
    device yields a falsy result instead of an exception; the scheme
    predicate is only evaluated when the probe succeeds.

    :param weight_quant: weight quantization args
    :param input_quant: activation quantization args
    :return: result of ``_is_fp8_w8a8`` when the device matches, otherwise
        the falsy probe result
    """
    device_matches = self._check_scheme_supported(
        90, error=False, match_exact=True
    )
    if not device_matches:
        return device_matches
    return self._is_fp8_w8a8(weight_quant, input_quant)
def _is_wNa16_group_channel(
    self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs
) -> bool:
    """
    Check for static, weight-only quantization with CHANNEL or GROUP strategy.

    No bit width is checked here; callers decide which widths they accept.

    :param weight_quant: weight quantization args (dereferenced
        unconditionally, so it must not be None)
    :param input_quant: activation quantization args; must be None for the
        scheme to count as weight-only
    :return: True when all three conditions hold
    """
    strategy = weight_quant.strategy
    per_channel_or_group = strategy in (
        QuantizationStrategy.CHANNEL.value,
        QuantizationStrategy.GROUP.value,
    )
    weights_static = not weight_quant.dynamic
    return per_channel_or_group and input_quant is None and weights_static
num_bits=weight_quant.num_bits, + strategy=weight_quant.strategy, + symmetric=weight_quant.symmetric, + group_size=weight_quant.group_size, + actorder=weight_quant.actorder, + ) + + act_quant_format = is_activation_quantization_format(format) + if act_quant_format: + if self._is_fp4a4_nvfp4(weight_quant, input_quant): + if cutlass_fp4_supported() or envs.VLLM_USE_NVFP4_CT_EMULATIONS: + return CompressedTensorsW4A4Fp4() + else: + logger.warning_once( + "Current platform does not support cutlass NVFP4." + " Running CompressedTensorsW4A16Fp4." + ) + return CompressedTensorsW4A16Fp4(has_input_global_scale=True) + + if self._is_fp8_w8a8(weight_quant, input_quant): + is_fp8_w8a8_supported = self._check_scheme_supported( + CompressedTensorsW8A8Fp8.get_min_capability(), error=False + ) + if is_fp8_w8a8_supported: + return CompressedTensorsW8A8Fp8( + weight_quant=weight_quant, + is_static_input_scheme=( + input_quant and not input_quant.dynamic + ), + ) + else: + # note: input_quant will be present for converted models; + # will be ignored during inference post loading + return CompressedTensorsW8A16Fp8( + strategy=weight_quant.strategy, + is_static_input_scheme=not input_quant.dynamic, + ) + + # note: input_quant can be None + if self._is_fp8_w8a16(weight_quant, input_quant): + is_static_input_scheme = input_quant and not input_quant.dynamic + return CompressedTensorsW8A16Fp8( + strategy=weight_quant.strategy, + is_static_input_scheme=is_static_input_scheme, + ) + + if self._is_static_tensor_w8a8(weight_quant, input_quant): + return CompressedTensorsW8A8Int8( + strategy=weight_quant.strategy, + is_static_input_scheme=True, + input_symmetric=input_quant.symmetric, + ) + + if self._is_dynamic_token_w8a8(weight_quant, input_quant): + return CompressedTensorsW8A8Int8( + strategy=weight_quant.strategy, + is_static_input_scheme=False, + input_symmetric=input_quant.symmetric, + ) + + if self._is_dynamic_token_w4a8_int(weight_quant, input_quant): + is_static_input_scheme = 
def get_scheme(
    self, layer: torch.nn.Module, layer_name: str | None = None
) -> Optional["CompressedTensorsScheme"]:
    """
    Pick the CompressedTensorsScheme used to run ``layer``.

    compressed-tensors allows non-uniform quantization through the targets
    of its config_groups: there can be N config_groups, each with its own
    quantization scheme and a list of targets. A target can be a full
    layer_name, a regex for a layer_name, or an nn.Module name. The layer
    is matched against those targets and the scheme of the matched target
    decides which CompressedTensorsScheme is used for inference.

    :param layer: module a scheme is selected for
    :param layer_name: name of the layer, used for target matching
    :return: the selected scheme, or None when the layer is ignored or when
        it has no weight quantization and is not handled by the
        Cutlass 2:4 path
    """
    # TODO (@kylesayrs): support ignore module names with ct matching utils
    if should_ignore_layer(
        layer_name, ignore=self.ignore, fused_mapping=self.packed_modules_mapping
    ):
        return None

    # These stay None for models that only carry sparsity (empty map).
    weight_quant = None
    input_quant = None
    layer_format = None
    if self.target_scheme_map:
        quant_target = find_matched_target(
            layer_name=layer_name,
            module=layer,
            targets=self.target_scheme_map.keys(),
            fused_mapping=self.packed_modules_mapping,
        )
        matched_scheme = self.target_scheme_map[quant_target]
        weight_quant = matched_scheme.get("weights")
        input_quant = matched_scheme.get("input_activations")
        layer_format = matched_scheme.get("format")

    # Sparsity lookup; fused layers are assumed to inherit the sparsity
    # scheme of their first component. A failed match (ValueError) simply
    # leaves the layer without a sparsity scheme.
    candidate_targets = self.sparsity_scheme_map.keys() - set(
        self.sparsity_ignore_list
    )
    sparsity_scheme: SparsityCompressionConfig | None = None
    with suppress(ValueError):
        sparsity_target = find_matched_target(
            layer_name=layer_name,
            module=layer,
            targets=candidate_targets,
            fused_mapping=self.packed_modules_mapping,
        )
        sparsity_scheme = self.sparsity_scheme_map[sparsity_target]

    use_cutlass_24 = self.supports_cutlass_24(
        weight_quant=weight_quant,
        input_quant=input_quant,
        sparsity_scheme=sparsity_scheme,
    )

    if not use_cutlass_24 and weight_quant is None:
        logger.warning_once(
            "Acceleration for non-quantized schemes is not supported by "
            "Compressed Tensors. Falling back to UnquantizedLinearMethod"
        )
        return None

    if use_cutlass_24:
        # Valid sparsity scheme that the Cutlass 2:4 kernel can run.
        # The compression config is only forwarded for non-dense formats.
        stored_dense = sparsity_scheme is None or sparsity_scheme.format == "dense"
        scheme = CompressedTensors24(
            quantized=weight_quant is not None or input_quant is not None,
            weight_quant=weight_quant,
            input_quant=input_quant,
            model_compression_config=None if stored_dense else self.config,
        )
    else:
        # Plain quantization: derive the scheme from the matched args.
        scheme = self._get_scheme_from_parts(  # type: ignore
            weight_quant=weight_quant,
            input_quant=input_quant,
            format=layer_format,
        )

    # Raise error if device does not support the scheme
    # (e.g. fp8 needs ada lovelace)
    self._check_scheme_supported(scheme.get_min_capability())
    logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name)
    return scheme
Kernel + False otherwise + """ + if sparsity_scheme is None: + return False + + is_valid_sparsity_structure: bool = ( + sparsity_scheme.sparsity_structure == SparsityStructure.TWO_FOUR.value + ) + + valid_compressors = { + CompressionFormat.dense.value, + CompressionFormat.sparse_24_bitmask.value, + } + + is_valid_sparsity = ( + is_valid_sparsity_structure and sparsity_scheme.format in valid_compressors + ) + + if not is_valid_sparsity: + return False + + # Unquantized cases are supported + if weight_quant is None and input_quant is None: + return True + + # Weight only quantization is not-supported + if weight_quant is not None and input_quant is None: + return False + + supported_weight_quant_strategies = [ + QuantizationStrategy.TENSOR.value, + QuantizationStrategy.CHANNEL.value, + ] + + assert weight_quant is not None + assert input_quant is not None + if weight_quant.strategy not in supported_weight_quant_strategies: + return False + + supported_input_quant_strategies = [ + QuantizationStrategy.TENSOR.value, + QuantizationStrategy.TOKEN.value, + ] + + if input_quant.strategy not in supported_input_quant_strategies: + return False + + return weight_quant.num_bits == input_quant.num_bits == 8 + + +class CompressedTensorsLinearMethod(LinearMethodBase): + def __init__(self, quantization_config: CompressedTensorsConfig): + self.quantization_config = quantization_config + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + layer.scheme.process_weights_after_loading(layer) + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + """ + Use the CompressedTensorsScheme associated with each layer to create + the necessary parameters for the layer. 
See LinearMethodBase for param + details + """ + weight_loader = extra_weight_attrs.get("weight_loader") + layer.scheme.create_weights( + layer=layer, + input_size=input_size, + input_size_per_partition=input_size_per_partition, + output_partition_sizes=output_partition_sizes, + output_size=output_size, + params_dtype=params_dtype, + weight_loader=weight_loader, + ) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ): + """ + Use the output of create_weights and the CompressedTensorsScheme + associated with the layer to apply the forward pass with the + layer input. See LinearMethodBase for param details + + """ + scheme = layer.scheme + if scheme is None: + raise ValueError("A scheme must be defined for each layer") + return scheme.apply_weights(layer, x, bias=bias) + + +class CompressedTensorsKVCacheMethod(BaseKVCacheMethod): + """ + Supports loading kv-cache scaling factors from compressed-tensors + checkpoints. + """ + + def __init__(self, quant_config: CompressedTensorsConfig): + self.validate_kv_cache_scheme(quant_config.kv_cache_scheme) + super().__init__(quant_config) + + @staticmethod + def validate_kv_cache_scheme(kv_cache_scheme: dict[str, Any] | None): + """ + Validator for the kv cache scheme. Useful for controlling the + kv cache quantization schemes, that are being supported in vLLM + :param kv_cache_scheme: the compressed-tensors kv cache scheme + """ + if kv_cache_scheme is None: + return + + type_ = kv_cache_scheme.get("type") + num_bits = kv_cache_scheme.get("num_bits") + + if type_ != "float" and num_bits != 8: + raise NotImplementedError( + "Currently supported kv cache quantization is " + "num_bits=8, type=float, however " + f"received num_bits={num_bits}, type={type_}" + ) + + strategy = kv_cache_scheme.get("strategy") + if strategy != "tensor": + raise NotImplementedError( + "Only support per-tensor scaling factor " + "for compressed-tensors KV cache. 
" + f"Expected strategy: tensor, found strategy: {strategy}" + ) + + is_symmetric = kv_cache_scheme.get("symmetric") + if not is_symmetric: + raise NotImplementedError( + "Only support symmetric scaling factor " + "for compressed-tensors KV cache. " + f"However found symmetric: {is_symmetric}" + ) diff --git a/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py new file mode 100644 index 0000000..dc21258 --- /dev/null +++ b/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -0,0 +1,3534 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import enum +from collections.abc import Callable +from enum import Enum + +import torch +from compressed_tensors import CompressionFormat +from compressed_tensors.quantization import ActivationOrdering, QuantizationStrategy + +import vllm.envs as envs +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm import _custom_ops as ops +from vllm._aiter_ops import rocm_aiter_ops +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.forward_context import ForwardContext, get_forward_context +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe import ( + FusedMoE, + FusedMoEActivationFormat, + FusedMoEConfig, + FusedMoEMethodBase, + FusedMoEPermuteExpertsUnpermute, + FusedMoeWeightScaleSupported, +) +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEQuantConfig, + fp8_w8a8_moe_quant_config, + int4_w4a16_moe_quant_config, + int8_w8a8_moe_quant_config, + int8_w8a16_moe_quant_config, + nvfp4_moe_quant_config, +) +from vllm.model_executor.layers.fused_moe.cpu_fused_moe import select_experts +from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( + is_valid_flashinfer_cutlass_fused_moe, +) +from 
class GPTQMarlinState(Enum):
    """
    Two-state marker with members REPACK and READY.

    NOTE(review): presumably tracks whether Marlin weights still need to be
    repacked; the code that reads it is outside this view, confirm there.
    """

    # Written out explicitly; these are the values two consecutive
    # enum.auto() calls generate for a plain Enum.
    REPACK = 1
    READY = 2
"CompressedTensorsW8A8Int8MoEMethod", + "CompressedTensorsWNA16MarlinMoEMethod", + "CompressedTensorsWNA16MoEMethod", + "CompressedTensorsW4A4MoeMethod", + "CompressedTensorsW4A8Int8MoEMethod", +] + + +class CompressedTensorsMoEMethod(FusedMoEMethodBase): + @staticmethod + def get_moe_method( + quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 + layer: torch.nn.Module, + ) -> "CompressedTensorsMoEMethod": + # TODO: @dsikka: refactor this to use schemes as other kernels + # are supported + check if the layer is being ignored. + # Check if a using "Linear" to select schemes + if "Linear" in quant_config.target_scheme_map: + matched_target = "Linear" + else: + # May have instead defined the linear layers in the fused model + + fused_layers = ["re:.*down_proj.*", "re:.*gate_proj.*", "re:.*up_proj.*"] + current_scheme = None + for fused_layer in fused_layers: + # Check if one of the fused layers are defined in quant_config + matched_target = find_matched_target( + layer_name=fused_layer, + module=layer, + targets=quant_config.target_scheme_map.keys(), + fused_mapping=quant_config.packed_modules_mapping, + ) + + # Only valid if down_proj, gate_proj, and up_proj + # are mapped to the same quant scheme in the quant_config + if current_scheme is None: + current_scheme = quant_config.target_scheme_map.get(matched_target) + else: + assert current_scheme == quant_config.target_scheme_map.get( + matched_target + ) + + weight_quant = quant_config.target_scheme_map[matched_target].get("weights") + input_quant = quant_config.target_scheme_map[matched_target].get( + "input_activations" + ) + + if quant_config._is_wNa16_group_channel(weight_quant, input_quant): + # group_size=None means channelwise + group_size = weight_quant.group_size or -1 + # Prefer to use the MarlinMoE kernel when it is supported. 
+ if ( + not check_moe_marlin_supports_layer(layer, group_size) + or current_platform.is_rocm() + ): + if ( + weight_quant.strategy == QuantizationStrategy.GROUP + and weight_quant.actorder + in (ActivationOrdering.GROUP, ActivationOrdering.DYNAMIC) + ): + raise ValueError( + "WNA16MoE is not supported with actorder=group/dynamic." + ) + logger.info_once("Using CompressedTensorsWNA16MoEMethod") + return CompressedTensorsWNA16MoEMethod(quant_config, layer.moe_config) + else: + logger.info_once("Using CompressedTensorsWNA16MarlinMoEMethod") + return CompressedTensorsWNA16MarlinMoEMethod( + quant_config, layer.moe_config + ) + elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant): + return CompressedTensorsW4A4MoeMethod(layer.moe_config) + elif ( + quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant) + or quant_config._is_fp8_w8a8_sm100(weight_quant, input_quant) + or quant_config._is_fp8_w8a8(weight_quant, input_quant) + ): + return CompressedTensorsW8A8Fp8MoEMethod(quant_config, layer.moe_config) + elif quant_config._is_dynamic_token_w8a8(weight_quant, input_quant) or quant_config._is_static_tensor_w8a8(weight_quant, input_quant): + if envs.VLLM_W8A8_MOE_USE_W4A8: + return CompressedTensorsW4A8MoEMethod(quant_config, layer.moe_config) + else: + return CompressedTensorsW8A8Int8MoEMethod(quant_config, layer.moe_config) + elif quant_config._is_dynamic_token_w4a8_int(weight_quant, input_quant): + return CompressedTensorsW4A8Int8MoEMethod(quant_config, layer.moe_config) + else: + raise RuntimeError( + f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}" + ) + + +class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): + def __init__(self, moe: FusedMoEConfig): + from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501 + detect_nvfp4_moe_support, + ) + + super().__init__(moe) + _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__) + self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported + 
def create_weights(
    self,
    layer: torch.nn.Module,
    num_experts: int,
    hidden_size: int,
    intermediate_size_per_partition: int,
    params_dtype: torch.dtype,
    **extra_weight_attrs,
):
    """
    Register the packed FP4 weights and their scales on ``layer``.

    Eight uninitialised, non-trainable parameters are created, in this
    order: packed weights (w13, w2), per-group weight scales (w13, w2),
    global weight scales (w13, w2) and global input scales (w13, w2).
    Each one receives ``extra_weight_attrs`` through ``set_weight_attrs``;
    from the first scale onwards the attrs also carry a ``quant_method``
    entry (GROUP for per-group scales, TENSOR for global scales).

    :param layer: module the parameters are registered on; it also gets
        ``num_experts`` and ``params_dtype`` attributes
    :param num_experts: size of the leading dimension of every parameter
    :param hidden_size: hidden dimension of the layer
    :param intermediate_size_per_partition: intermediate dimension handled
        by this partition
    :param params_dtype: stored on the layer; the parameters themselves use
        fixed dtypes (uint8, float8_e4m3fn, float32)
    :param extra_weight_attrs: attributes attached to every parameter
    """
    layer.num_experts = num_experts
    layer.params_dtype = params_dtype

    group_method = FusedMoeWeightScaleSupported.GROUP.value
    tensor_method = FusedMoeWeightScaleSupported.TENSOR.value
    w13_rows = 2 * intermediate_size_per_partition

    # (registered name, shape, dtype, quant_method to record or None)
    param_specs = (
        # Packed weights: two fp4 values share one uint8 along the input dim.
        (
            "w13_weight_packed",
            (num_experts, w13_rows, hidden_size // 2),
            torch.uint8,
            None,
        ),
        (
            "w2_weight_packed",
            (num_experts, hidden_size, intermediate_size_per_partition // 2),
            torch.uint8,
            None,
        ),
        # Weight scales: one scale per group of `group_size` input elements.
        (
            "w13_weight_scale",
            (num_experts, w13_rows, hidden_size // self.group_size),
            torch.float8_e4m3fn,
            group_method,
        ),
        (
            "w2_weight_scale",
            (
                num_experts,
                hidden_size,
                intermediate_size_per_partition // self.group_size,
            ),
            torch.float8_e4m3fn,
            group_method,
        ),
        # Weight global scales (w13 holds two values per expert).
        ("w13_weight_global_scale", (num_experts, 2), torch.float32, tensor_method),
        ("w2_weight_global_scale", (num_experts,), torch.float32, tensor_method),
        # Input global scales.
        ("w13_input_global_scale", (num_experts, 2), torch.float32, tensor_method),
        ("w2_input_global_scale", (num_experts,), torch.float32, tensor_method),
    )

    for param_name, shape, dtype, quant_method in param_specs:
        param = torch.nn.Parameter(
            torch.empty(*shape, dtype=dtype), requires_grad=False
        )
        layer.register_parameter(param_name, param)
        if quant_method is not None:
            # The attrs dict is shared and updated in place, so the packed
            # weights (registered first) are tagged without quant_method.
            extra_weight_attrs.update({"quant_method": quant_method})
        set_weight_attrs(param, extra_weight_attrs)
+ if self.allow_flashinfer: + w, s = reorder_w1w3_to_w3w1( + layer.w13_weight.data, layer.w13_weight_scale.data, dim=-2 + ) + layer.w13_weight = torch.nn.Parameter(w, requires_grad=False) + layer.w13_weight_scale = torch.nn.Parameter(s, requires_grad=False) + + if not torch.allclose( + layer.w13_weight_global_scale[:, 0], layer.w13_weight_global_scale[:, 1] + ): + logger.warning_once( + "w1_weight_global_scale must match w3_weight_global_scale. " + "Accuracy may be affected." + ) + + # Take inverse of global scale saved to disk + layer.w13_weight_scale_2 = torch.nn.Parameter( + 1 / layer.w13_weight_global_scale[:, 0], requires_grad=False + ) + + layer.w2_weight_scale_2 = torch.nn.Parameter( + 1 / layer.w2_weight_global_scale.data, requires_grad=False + ) + + if self.use_marlin: + prepare_moe_fp4_layer_for_marlin(layer) + return + + # swizzle weight scales + layer.w13_weight_scale = torch.nn.Parameter( + swizzle_blockscale(layer.w13_weight_scale), requires_grad=False + ) + + layer.w2_weight_scale = torch.nn.Parameter( + swizzle_blockscale(layer.w2_weight_scale), requires_grad=False + ) + + # w13 + w13_input_global_scale = layer.w13_input_global_scale.max(dim=1).values.to( + torch.float32 + ) + + layer.g1_alphas = torch.nn.Parameter( + ((1 / w13_input_global_scale) * layer.w13_weight_scale_2), + requires_grad=False, + ) + + layer.w13_input_scale_quant = torch.nn.Parameter( + (w13_input_global_scale), requires_grad=False + ) + + # w2 + layer.g2_alphas = torch.nn.Parameter( + ((1 / layer.w2_input_global_scale) * layer.w2_weight_scale_2).to( + torch.float32 + ), + requires_grad=False, + ) + + layer.w2_input_scale_quant = torch.nn.Parameter( + (layer.w2_input_global_scale), requires_grad=False + ) + + def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None: + if self.use_marlin: + return None + elif not self.allow_flashinfer: + return super().maybe_make_prepare_finalize() + + prepare_finalize = 
def select_gemm_impl(
    self,
    prepare_finalize: mk.FusedMoEPrepareAndFinalize,
    layer: torch.nn.Module,
) -> mk.FusedMoEPermuteExpertsUnpermute:
    """Return the appropriate GEMM experts implementation.

    The choice is delegated to ``select_nvfp4_gemm_impl`` using this
    method's MoE config, its quant config and the ``allow_flashinfer``
    flag. ``prepare_finalize`` and ``layer`` are not read here.

    :param prepare_finalize: unused by this implementation
    :param layer: unused by this implementation
    :return: the experts implementation that was selected
    :raises AssertionError: if ``self.moe_quant_config`` is None
    """
    # The docstring has to be the first statement of the function; placed
    # after this assert it was a no-op string and __doc__ stayed None.
    assert self.moe_quant_config is not None
    experts = select_nvfp4_gemm_impl(
        self.moe,
        self.moe_quant_config,
        allow_flashinfer=self.allow_flashinfer,
    )
    logger.debug_once("Using %s", experts.__class__.__name__)
    return experts
+ + topk_weights, topk_ids, _ = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, + ) + + if self.use_marlin: + return fused_marlin_moe( + x, + layer.w13_weight, + layer.w2_weight, + None, + None, + layer.w13_weight_scale, + layer.w2_weight_scale, + router_logits, + topk_weights, + topk_ids, + global_scale1=layer.w13_weight_scale_2, + global_scale2=layer.w2_weight_scale_2, + quant_type_id=scalar_types.float4_e2m1f.id, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=expert_map, + workspace=layer.workspace, + ) + + # FlashInfer fused experts path + elif self.allow_flashinfer: + from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 + flashinfer_cutlass_moe_fp4, + ) + + assert is_valid_flashinfer_cutlass_fused_moe( + x, layer.w13_weight, layer.w2_weight + ), "Flashinfer CUTLASS Fused MoE not applicable!" + + assert self.moe_quant_config is not None + + return flashinfer_cutlass_moe_fp4( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + quant_config=self.moe_quant_config, + inplace=False, # TODO(shuw): fix later, now output is high prec + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input, + ) + else: + from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4 + + assert expert_map is None, ( + "Expert Parallelism / expert_map " + "is currently not supported for " + "CompressedTensorsW4A4MoeMethod." 
+ ) + assert self.moe_quant_config is not None + + # Cutlass moe takes in activations in BF16/Half precision + # and fp4 quantized weights loaded from the checkpoint + return cutlass_moe_fp4( + a=x, + w1_fp4=layer.w13_weight, + w2_fp4=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + quant_config=self.moe_quant_config, + apply_router_weight_on_input=apply_router_weight_on_input, + # TODO(bnell): derive these from arguments + m=x.shape[0], + n=layer.w2_weight.shape[2] * 2, + k=x.shape[1], + e=layer.w13_weight.shape[0], + ).to(x.dtype) + + +class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): + def __init__( + self, + quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 + moe: FusedMoEConfig, + ): + super().__init__(moe) + self.quant_config = quant_config + self.weight_quant = self.quant_config.target_scheme_map["Linear"].get("weights") + self.input_quant = self.quant_config.target_scheme_map["Linear"].get( + "input_activations" + ) + + per_tensor = ( + self.weight_quant.strategy == QuantizationStrategy.TENSOR + and self.input_quant.strategy == QuantizationStrategy.TENSOR + ) + per_channel = ( + self.weight_quant.strategy == QuantizationStrategy.CHANNEL + and self.input_quant.strategy == QuantizationStrategy.TOKEN + ) + if not (per_tensor or per_channel): + assert self.weight_quant.strategy == QuantizationStrategy.BLOCK + self.weight_block_size = self.weight_quant.block_structure + assert self.weight_quant.dynamic is not None + else: + self.weight_block_size = None + self.block_quant = self.weight_block_size is not None + + self.static_input_scales = not self.input_quant.dynamic + if self.static_input_scales and per_channel: + raise ValueError( + "For FP8 Fused MoE layer, we require either per tensor or " + "channelwise, dynamic per token quantization." 
+ ) + + # For GPUs that lack FP8 hardware support, we can leverage the Marlin + # kernel for fast weight-only FP8 quantization + self.use_marlin = ( + not current_platform.has_device_capability(89) + or envs.VLLM_TEST_FORCE_FP8_MARLIN + and not self.block_quant + ) + # Disable marlin for rocm + if current_platform.is_rocm(): + self.use_marlin = False + + self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() + + # cutlass path + self.is_fp8_w8a8_sm100 = quant_config._is_fp8_w8a8_sm100( + self.weight_quant, self.input_quant + ) + self.use_cutlass = not self.block_quant and ( + quant_config._is_fp8_w8a8_sm90(self.weight_quant, self.input_quant) + or self.is_fp8_w8a8_sm100 + ) + self.disable_expert_map = False + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + layer.intermediate_size_per_partition = intermediate_size_per_partition + layer.hidden_size = hidden_size + layer.num_experts = num_experts + layer.orig_dtype = params_dtype + layer.weight_block_size = None + + params_dtype = torch.float8_e4m3fn + + if self.block_quant: + assert self.weight_block_size is not None + layer.weight_block_size = self.weight_block_size + tp_size = get_tensor_model_parallel_world_size() + block_n, block_k = ( + self.weight_block_size[0], + self.weight_block_size[1], + ) + # NOTE: To ensure proper alignment of the block-wise quantization + # scales, the output_size of the weights for both the gate and up + # layers must be divisible by block_n. + # Required by column parallel or enabling merged weights + if intermediate_size_per_partition % block_n != 0: + raise ValueError( + f"The output_size of gate's and up's weight = " + f"{intermediate_size_per_partition} is not divisible by " + f"weight quantization block_n = {block_n}." 
+ ) + if tp_size > 1 and intermediate_size_per_partition % block_k != 0: + # Required by row parallel + raise ValueError( + f"The input_size of down's weight = " + f"{intermediate_size_per_partition} is not divisible by " + f"weight quantization block_k = {block_k}." + ) + + # WEIGHTS + w13_weight = torch.nn.Parameter( + torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + w2_weight = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + # WEIGHT_SCALES + if self.weight_quant.strategy == QuantizationStrategy.TENSOR: + # Allocate 2 scales for w1 and w3 respectively. + # They are combined to a single scale after weight loading. + w13_weight_scale = torch.nn.Parameter( + torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False + ) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + w2_weight_scale = torch.nn.Parameter( + torch.ones(num_experts, dtype=torch.float32), requires_grad=False + ) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + # Add PER-TENSOR quantization for FusedMoE.weight_loader. 
+ extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value} + ) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + elif self.weight_quant.strategy == QuantizationStrategy.CHANNEL: + w13_weight_scale = torch.nn.Parameter( + torch.ones( + num_experts, + 2 * intermediate_size_per_partition, + 1, + dtype=torch.float32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + w2_weight_scale = torch.nn.Parameter( + torch.ones(num_experts, hidden_size, 1, dtype=torch.float32), + requires_grad=False, + ) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + # Add PER-CHANNEL quantization for FusedMoE.weight_loader. + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value} + ) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + elif self.weight_quant.strategy == QuantizationStrategy.BLOCK: + w13_weight_scale = torch.nn.Parameter( + torch.ones( + num_experts, + 2 * ((intermediate_size_per_partition + block_n - 1) // block_n), + (hidden_size + block_k - 1) // block_k, + dtype=torch.float32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + w2_weight_scale = torch.nn.Parameter( + torch.ones( + num_experts, + (hidden_size + block_n - 1) // block_n, + (intermediate_size_per_partition + block_k - 1) // block_k, + dtype=torch.float32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + # Add PER-CHANNEL quantization for FusedMoE.weight_loader. 
+ extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value} + ) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + # INPUT_SCALES + if self.static_input_scales: + w13_input_scale = torch.nn.Parameter( + torch.ones(num_experts, dtype=torch.float32), requires_grad=False + ) + layer.register_parameter("w13_input_scale", w13_input_scale) + set_weight_attrs(w13_input_scale, extra_weight_attrs) + + w2_input_scale = torch.nn.Parameter( + torch.ones(num_experts, dtype=torch.float32), requires_grad=False + ) + layer.register_parameter("w2_input_scale", w2_input_scale) + set_weight_attrs(w2_input_scale, extra_weight_attrs) + else: + layer.w13_input_scale = None + layer.w2_input_scale = None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + # Fp8 moe kernels require a single activation scale. + # We take the max of all the scales in case they differ. + if self.static_input_scales: + assert self.input_quant.strategy == QuantizationStrategy.TENSOR + if layer.w13_input_scale is None or layer.w2_input_scale is None: + raise ValueError( + "QuantConfig has static quantization, but found " + "activation scales are None." + ) + if not all_close_1d(layer.w13_input_scale) or not all_close_1d( + layer.w2_input_scale + ): + logger.warning_once( + "Found input_scales that are not equal for " + "fp8 MoE layer. Using the maximum across experts " + "for each layer." 
+ ) + layer.w13_input_scale = torch.nn.Parameter( + layer.w13_input_scale.max(), requires_grad=False + ) + layer.w2_input_scale = torch.nn.Parameter( + layer.w2_input_scale.max(), requires_grad=False + ) + + if current_platform.is_fp8_fnuz(): + # Normalize the weights and scales + w13_weight, w13_weight_scale, w13_input_scale = ( + normalize_e4m3fn_to_e4m3fnuz( + layer.w13_weight, layer.w13_weight_scale, layer.w13_input_scale + ) + ) + w2_weight, w2_weight_scale, w2_input_scale = normalize_e4m3fn_to_e4m3fnuz( + layer.w2_weight, layer.w2_weight_scale, layer.w2_input_scale + ) + # Reset the parameter + layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False) + layer.w13_weight_scale = torch.nn.Parameter( + w13_weight_scale, requires_grad=False + ) + if w13_input_scale is not None: + layer.w13_input_scale = torch.nn.Parameter( + w13_input_scale, requires_grad=False + ) + layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) + layer.w2_weight_scale = torch.nn.Parameter( + w2_weight_scale, requires_grad=False + ) + if w2_input_scale is not None: + layer.w2_input_scale = torch.nn.Parameter( + w2_input_scale, requires_grad=False + ) + + # For Per-TENSOR case, Fp8 moe kernel needs single weight scale + # for w13 per expert. Use max then dequant and requant each expert. 
+ if self.weight_quant.strategy == QuantizationStrategy.TENSOR: + assert layer.w13_weight_scale is not None + shard_size = layer.intermediate_size_per_partition + max_w13_scales = layer.w13_weight_scale.max(dim=1).values + for expert_id in range(layer.local_num_experts): + start = 0 + for shard_id in range(2): + dq_weight = per_tensor_dequantize( + layer.w13_weight[expert_id][start : start + shard_size, :], + layer.w13_weight_scale[expert_id][shard_id], + ) + layer.w13_weight[expert_id][start : start + shard_size, :], _ = ( + ops.scaled_fp8_quant(dq_weight, max_w13_scales[expert_id]) + ) + start += shard_size + layer.w13_weight_scale = torch.nn.Parameter( + max_w13_scales, requires_grad=False + ) + + # Property to determine if AITER is used + if self.rocm_aiter_moe_enabled: + # reshaping weights is required for aiter moe kernel. + shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( + layer.w13_weight.data, layer.w2_weight.data + ) + + layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) + layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False) + + elif self.use_marlin: + prepare_moe_fp8_layer_for_marlin(layer, False) + # Activations not quantized for marlin. 
+ del layer.w13_input_scale + del layer.w2_input_scale + + if self.use_cutlass: + assert self.weight_quant.strategy != QuantizationStrategy.BLOCK + device = layer.w13_weight.device + # ab_strides1 and c_strides2 are the same + self.ab_strides1_c_strides2 = torch.full( + (layer.local_num_experts,), + layer.hidden_size, + device=device, + dtype=torch.int64, + ) + self.ab_strides2 = torch.full( + (layer.local_num_experts,), + layer.intermediate_size_per_partition, + device=device, + dtype=torch.int64, + ) + self.c_strides1 = torch.full( + (layer.local_num_experts,), + 2 * layer.intermediate_size_per_partition, + device=device, + dtype=torch.int64, + ) + + if is_deep_gemm_e8m0_used() and self.block_quant: + assert layer.weight_block_size is not None + # Re-quantise the expert weights so their scales are UE8M0. + block_sz = tuple(layer.weight_block_size) + requant_weight_ue8m0_inplace( + layer.w13_weight.data, + layer.w13_weight_scale.data, + block_sz, + ) + requant_weight_ue8m0_inplace( + layer.w2_weight.data, + layer.w2_weight_scale.data, + block_sz, + ) + + # Ensure column-major TMA alignment expected by DeepGEMM. 
+ if expert_weight_is_col_major(layer.w13_weight_scale): + layer.w13_weight_scale = get_col_major_tma_aligned_tensor( + layer.w13_weight_scale + ) + if expert_weight_is_col_major(layer.w2_weight_scale): + layer.w2_weight_scale = get_col_major_tma_aligned_tensor( + layer.w2_weight_scale + ) + + def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None: + if self.use_marlin or self.rocm_aiter_moe_enabled: + return None + else: + return super().maybe_make_prepare_finalize() + + def select_gemm_impl( + self, + prepare_finalize: mk.FusedMoEPrepareAndFinalize, + layer: torch.nn.Module, + ) -> FusedMoEPermuteExpertsUnpermute: + # cutlass path + assert self.moe_quant_config is not None + if self.use_cutlass: + from vllm.model_executor.layers.fused_moe import ( + CutlassBatchedExpertsFp8, + CutlassExpertsFp8, + ) + + experts: FusedMoEPermuteExpertsUnpermute + + num_dispatchers = prepare_finalize.num_dispatchers() + + if ( + prepare_finalize.activation_format + == FusedMoEActivationFormat.BatchedExperts + ): + logger.debug("CutlassBatchedExpertsFp8(%s)", self.__class__.__name__) + experts = CutlassBatchedExpertsFp8( + self.moe.num_local_experts, + num_dispatchers, + self.moe.in_dtype, + ab_strides1=self.ab_strides1_c_strides2, + ab_strides2=self.ab_strides2, + c_strides1=self.c_strides1, + c_strides2=self.ab_strides1_c_strides2, + quant_config=self.moe_quant_config, + ) + else: + logger.debug("CutlassExpertsFp8(%s)", self.__class__.__name__) + experts = CutlassExpertsFp8( + self.moe.in_dtype, + ab_strides1=self.ab_strides1_c_strides2, + ab_strides2=self.ab_strides2, + c_strides1=self.c_strides1, + c_strides2=self.ab_strides1_c_strides2, + quant_config=self.moe_quant_config, + ) + + self.disable_expert_map = ( + num_dispatchers > 1 or not experts.supports_expert_map() + ) + + return experts + + # triton path + from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501 + BatchedTritonOrDeepGemmExperts, + ) + from 
vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( + TritonOrDeepGemmExperts, + ) + + assert not self.rocm_aiter_moe_enabled and not self.use_marlin + + if ( + prepare_finalize.activation_format + == FusedMoEActivationFormat.BatchedExperts + ): + max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank() + assert max_num_tokens_per_rank is not None + + logger.debug("BatchedTritonExperts(%s)", self.__class__.__name__) + return BatchedTritonOrDeepGemmExperts( + max_num_tokens=max_num_tokens_per_rank, + num_dispatchers=prepare_finalize.num_dispatchers(), + quant_config=self.moe_quant_config, + allow_deep_gemm=( + envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM + ), + ) + else: + logger.debug("TritonOrDeepGemmExperts(%s)", self.__class__.__name__) + return TritonOrDeepGemmExperts( + self.moe_quant_config, + allow_deep_gemm=( + envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM + ), + ) + + def get_fused_moe_quant_config( + self, layer: torch.nn.Module + ) -> FusedMoEQuantConfig | None: + if self.use_marlin: + return None + + per_act_token = self.input_quant.strategy == QuantizationStrategy.TOKEN + per_channel_quant = self.weight_quant.strategy == QuantizationStrategy.CHANNEL + + return fp8_w8a8_moe_quant_config( + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + per_act_token_quant=per_act_token, + per_out_ch_quant=per_channel_quant, + block_shape=layer.weight_block_size, + ) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | 
None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if enable_eplb: + assert expert_load_view is not None + assert logical_to_physical_map is not None + assert logical_replica_count is not None + assert isinstance(layer, FusedMoE) + + topk_weights, topk_ids, _ = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, + num_fused_shared_experts=layer.num_fused_shared_experts, + enable_eplb=enable_eplb, + expert_map=expert_map, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + ) + + per_act_token = self.input_quant.strategy == QuantizationStrategy.TOKEN + per_channel_quant = self.weight_quant.strategy == QuantizationStrategy.CHANNEL + + if self.use_marlin: + assert activation == "silu", f"{activation} not supported for Marlin MoE." 
+ return fused_marlin_moe( + x, + layer.w13_weight, + layer.w2_weight, + None, + None, + layer.w13_weight_scale, + layer.w2_weight_scale, + router_logits, + topk_weights, + topk_ids, + quant_type_id=scalar_types.float8_e4m3fn.id, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=expert_map, + workspace=layer.workspace, + ) + + elif self.rocm_aiter_moe_enabled: + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa E501 + rocm_aiter_fused_experts, + ) + + assert per_act_token == per_channel_quant + assert self.moe_quant_config is not None + return rocm_aiter_fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + expert_map=expert_map, + quant_config=self.moe_quant_config, + ) + + # cutlass path + elif self.use_cutlass: + assert self.moe_quant_config is not None + + # small-batch fallback on SM100 + if self.is_fp8_w8a8_sm100 and topk_ids.shape[0] <= 8: + from vllm.model_executor.layers.fused_moe import fused_experts + + assert per_act_token == per_channel_quant + return fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=None if self.disable_expert_map else expert_map, + quant_config=self.moe_quant_config, + ) + else: + from vllm.model_executor.layers.fused_moe.cutlass_moe import ( + cutlass_moe_fp8, + ) + + assert per_act_token == per_channel_quant + assert self.moe_quant_config is not None + return cutlass_moe_fp8( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights, + topk_ids, + quant_config=self.moe_quant_config, + activation=activation, + global_num_experts=global_num_experts, 
+ expert_map=None if self.disable_expert_map else expert_map, + ab_strides1=self.ab_strides1_c_strides2, + ab_strides2=self.ab_strides2, + c_strides1=self.c_strides1, + c_strides2=self.ab_strides1_c_strides2, + ) + + else: + from vllm.model_executor.layers.fused_moe import fused_experts + + assert per_act_token == per_channel_quant + assert self.moe_quant_config is not None + return fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=expert_map, + quant_config=self.moe_quant_config, + ) + + @property + def supports_eplb(self) -> bool: + return True + + +class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): + def __init__( + self, + quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 + moe: FusedMoEConfig, + ): + super().__init__(moe) + self.quant_config = quant_config + self.weight_quant = self.quant_config.target_scheme_map["Linear"].get("weights") + self.input_quant = self.quant_config.target_scheme_map["Linear"].get( + "input_activations" + ) + + per_channel = ( + self.weight_quant.strategy == QuantizationStrategy.CHANNEL + and self.input_quant.strategy == QuantizationStrategy.TOKEN + ) + if not per_channel: + raise ValueError( + "For INT8 Fused MoE layers, we require channelwise, " + "dynamic per token quantization. Found " + f"{self.weight_quant}, {self.input_quant}" + ) + + self.static_input_scales = not self.input_quant.dynamic + # if self.static_input_scales: + # raise ValueError( + # "For INT8 Fused MoE layers, we require channelwise, " + # "dynamic per token quantization. 
Found static input scales.") + + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + params_dtype = torch.int8 + + w13_remainder = hidden_size % 64 + w2_remainder = intermediate_size_per_partition % 64 + if w13_remainder != 0: + hidden_size_padded = hidden_size + (64 - w13_remainder) + else: + hidden_size_padded = hidden_size + if w2_remainder != 0: + intermediate_size_per_partition_padded = intermediate_size_per_partition + (64 - w2_remainder) + else: + intermediate_size_per_partition_padded = intermediate_size_per_partition + + # WEIGHTS + w13_weight = torch.nn.Parameter(torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size_padded, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + w13_bias = torch.nn.Parameter( + torch.zeros( + num_experts, + 2 * intermediate_size_per_partition, + dtype=torch.bfloat16, + ), + requires_grad=False, + ) + layer.register_parameter("w13_bias", w13_bias) + set_weight_attrs(w13_bias, extra_weight_attrs) + + w2_weight = torch.nn.Parameter(torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition_padded, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + w2_bias = torch.nn.Parameter( + torch.zeros( + num_experts, + hidden_size, + dtype=torch.bfloat16, + ), + requires_grad=False, + ) + layer.register_parameter("w2_bias", w2_bias) + set_weight_attrs(w2_bias, extra_weight_attrs) + + # WEIGHT_SCALES + assert self.weight_quant.strategy == QuantizationStrategy.CHANNEL + w13_weight_scale = torch.nn.Parameter( + torch.ones( + num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32 + ), + requires_grad=False, + ) + 
layer.register_parameter("w13_weight_scale", w13_weight_scale) + w2_weight_scale = torch.nn.Parameter( + torch.ones(num_experts, hidden_size, 1, dtype=torch.float32), + requires_grad=False, + ) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + # Add PER-CHANNEL quantization for FusedMoE.weight_loader. + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value} + ) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + # INPUT_SCALES + if self.static_input_scales: + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}) + w13_input_scale = torch.nn.Parameter(torch.ones( + num_experts, hidden_size, dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w13_input_scale", w13_input_scale) + set_weight_attrs(w13_input_scale, extra_weight_attrs) + + w2_input_scale = torch.nn.Parameter(torch.ones( + num_experts, intermediate_size_per_partition, dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w2_input_scale", w2_input_scale) + set_weight_attrs(w2_input_scale, extra_weight_attrs) + else: + layer.w13_input_scale = None + layer.w2_input_scale = None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + pass + + def get_fused_moe_quant_config( + self, layer: torch.nn.Module + ) -> FusedMoEQuantConfig | None: + return int8_w8a8_moe_quant_config( + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + per_act_token_quant=True, + ) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, 
+ scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if enable_eplb: + raise NotImplementedError( + "EPLB not supported for `CompressedTensorsW8A8Int8MoEMethod` yet." + ) + + use_ep = expert_map is not None + if use_ep: + start_eid = layer.ep_rank * layer.local_num_experts + end_eid = min((layer.ep_rank + 1) * layer.local_num_experts, global_num_experts) + topk_weight, topk_ids, _ = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) + + dtype = x.dtype + num_tokens, num_experts = router_logits.shape + + if use_ep: + hidden_size = x.shape[1] + ( + src_to_dst, + sorted_token_ids, + expert_sizes_gpu, + expert_sizes_cpu, + expand_tokens, + ) = ixfops.moe_compute_token_index_ep( + topk_ids=topk_ids, + num_experts=num_experts, + start_expert_id=start_eid, + end_expert_id=end_eid, + ) + if expert_sizes_cpu.sum() == 0: + return torch.zeros( + (num_tokens, hidden_size), + device=x.device, + dtype=x.dtype, + ) + else: + expand_tokens = num_tokens * top_k + ( + src_to_dst, + sorted_token_ids, + expert_sizes_gpu, + expert_sizes_cpu, + ) = ixfops.moe_compute_token_index( + topk_ids=topk_ids, + num_experts=num_experts, + ) + expert_sizes_cpu = expert_sizes_gpu.cpu() + + # expand + reorder + quant + i8_hidden_states, 
a_scale = ixfops.moe_expand_input_dynamic_scaled_int8( + hidden_states=x, + dst_to_src=sorted_token_ids, + dst_tokens=expand_tokens, + topk=top_k, + src_to_dst=src_to_dst, + topk_ids=None, + smooth_scales=layer.w13_input_scale, + ) + + if i8_hidden_states.shape[-1] != layer.w13_weight.shape[-1]: + padding = layer.w13_weight.shape[-1] - i8_hidden_states.shape[-1] + i8_hidden_states_align = torch.nn.functional.pad(i8_hidden_states, (0, padding), mode='constant', value=0) + else: + i8_hidden_states_align = i8_hidden_states + + # w8a8 group gemm 1 + pt_output_1 = ixfops.moe_w8a8_group_gemm( + input=i8_hidden_states_align, + weight=layer.w13_weight, + i_scales=a_scale, + w_scales=layer.w13_weight_scale, + output_dtype=dtype, + tokens_per_experts=expert_sizes_cpu, + dst_to_src=None, + bias=layer.w13_bias, + format="TN", + ) + + # act + quant + if activation == "swigluoai": + pt_output_2, a2_scale = ixfops.activation_swigluoai_dynamic_scaled_int8( + input=pt_output_1, + bias=None, + smooth_scales=layer.w2_input_scale, + dst_to_src=sorted_token_ids, + topk_ids=None, + ) + else: + pt_output_2, a2_scale = ixfops.activation_dynamic_scaled_int8( + input=pt_output_1, + bias=None, + smooth_scales=layer.w2_input_scale, + dst_to_src=sorted_token_ids, + topk_ids=None, + act_type="swiglu", + ) + + if pt_output_2.shape[-1] != layer.w2_weight.shape[-1]: + padding = layer.w2_weight.shape[-1] - pt_output_2.shape[-1] + pt_output_2_align = torch.nn.functional.pad(pt_output_2, (0, padding), mode='constant', value=0) + else: + pt_output_2_align = pt_output_2 + + # w8a8 group gemm 2 + reorder + if use_ep: + pt_output_3 = torch.empty( + (num_tokens * top_k, hidden_size), + device=x.device, + dtype=x.dtype, + ) + + ixfops.moe_w8a8_group_gemm( + input=pt_output_2_align, + weight=layer.w2_weight, + i_scales=a2_scale, + w_scales=layer.w2_weight_scale, + output_dtype=dtype, + tokens_per_experts=expert_sizes_cpu, + dst_to_src=sorted_token_ids, + bias=layer.w2_bias, + format="TN", + 
output=pt_output_3, + ) + + reduce_mask = src_to_dst == -1 + final_hidden_states = ixfops.moe_output_reduce_sum( + input=pt_output_3.view(num_tokens, top_k, -1), + topk_weight=topk_weight, + scaling_factor=routed_scaling_factor, + mask=reduce_mask, + ) + else: + pt_output_3 = ixfops.moe_w8a8_group_gemm( + input=pt_output_2_align, + weight=layer.w2_weight, + i_scales=a2_scale, + w_scales=layer.w2_weight_scale, + output_dtype=dtype, + tokens_per_experts=expert_sizes_cpu, + dst_to_src=sorted_token_ids, + bias=layer.w2_bias, + format="TN", + ) + + # mul + reduce_sum + final_hidden_states = ixfops.moe_output_reduce_sum( + input=pt_output_3.view(num_tokens, top_k, -1), + topk_weight=topk_weight, + scaling_factor=routed_scaling_factor + ) + + return final_hidden_states + + +class CompressedTensorsW4A8MoEMethod(CompressedTensorsMoEMethod): + + def __init__( + self, + quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 + moe: FusedMoEConfig, + ): + super().__init__(moe) + self.quant_config = quant_config + self.weight_quant = self.quant_config.target_scheme_map["Linear"].get( + "weights") + self.input_quant = self.quant_config.target_scheme_map["Linear"].get( + "input_activations") + self.pack_factor = 2 + self.group_size = -1 if self.weight_quant.group_size is None else self.weight_quant.group_size + self.weight_symmetric = self.weight_quant.symmetric + self.gemm_format = envs.VLLM_W4A8_FORMAT + self.format_mapping = {"NN":0,"NT":1,"TN":2} + self.version = envs.VLLM_W4A8_VERSION + assert self.gemm_format in ["TN","NN"] + + if not ((self.weight_quant.strategy == QuantizationStrategy.CHANNEL + or self.weight_quant.strategy == QuantizationStrategy.GROUP) + and self.input_quant.strategy == QuantizationStrategy.TOKEN): + raise ValueError( + "For INT4 pack2 Fused MoE layers, only per-channel or group scales" + "for weights and per-token scales for activations are supported. 
Found " + f"{self.weight_quant}, {self.input_quant}") + + self.static_input_scales = not self.input_quant.dynamic + + + def create_weights(self, layer: torch.nn.Module, num_experts: int, + hidden_size: int, intermediate_size_per_partition: int, + params_dtype: torch.dtype, **extra_weight_attrs): + + params_dtype = torch.int8 + w13_remainder = (hidden_size // self.pack_factor) % 64 + w2_remainder = (intermediate_size_per_partition // self.pack_factor) % 64 + if self.gemm_format == "TN": + if w13_remainder != 0: + w13_shape = (num_experts, 2 * intermediate_size_per_partition, (hidden_size // self.pack_factor) + 64 - w13_remainder) + else: + w13_shape = (num_experts, 2 * intermediate_size_per_partition, hidden_size // self.pack_factor) + + if w2_remainder != 0: + w2_shape = (num_experts, hidden_size, (intermediate_size_per_partition // self.pack_factor) + 64 - w2_remainder) + else: + w2_shape = (num_experts, hidden_size, intermediate_size_per_partition // self.pack_factor) + else: + w13_shape = (num_experts, hidden_size, 2 * intermediate_size_per_partition // self.pack_factor) + w2_shape = (num_experts, intermediate_size_per_partition, hidden_size // self.pack_factor) + + # WEIGHTS + # use process_weights_after_loading to get get right layout if gemm_format is NN + w13_weight = torch.nn.Parameter(torch.empty(w13_shape, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + if self.gemm_format == "NN": + setattr(w13_weight, "shard_dim", 1) + + w13_bias = torch.nn.Parameter( + torch.zeros( + num_experts, + 2 * intermediate_size_per_partition, + dtype=torch.bfloat16, + ), + requires_grad=False, + ) + layer.register_parameter("w13_bias", w13_bias) + set_weight_attrs(w13_bias, extra_weight_attrs) + + if w2_remainder != 0: + w2_weight = torch.nn.Parameter(torch.zeros(w2_shape, + dtype=params_dtype), + requires_grad=False) + else: + w2_weight = 
torch.nn.Parameter(torch.empty(w2_shape, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + if self.gemm_format == "NN": + setattr(w2_weight, "shard_dim", 0) + + w2_bias = torch.nn.Parameter( + torch.zeros( + num_experts, + hidden_size, + dtype=torch.bfloat16, + ), + requires_grad=False, + ) + layer.register_parameter("w2_bias", w2_bias) + set_weight_attrs(w2_bias, extra_weight_attrs) + + # WEIGHT_SCALES + # Allocate 2 scales for w1 and w3 respectively. + # They will be combined to a single scale after weight loading. + # The following scale or zero will use permute(0,2,1) to get right layout, init here to avoid rewrite data_loader + w13_weight_scale = torch.nn.Parameter(torch.empty(num_experts, + 1 if self.version == 2 else 1 if self.group_size == -1 else hidden_size // self.group_size, + 2 * intermediate_size_per_partition, + dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + setattr(w13_weight_scale, "shard_dim", 1) + + w2_weight_scale = torch.nn.Parameter(torch.empty(num_experts, + 1 if self.version == 2 else 1 if self.group_size == -1 else intermediate_size_per_partition // self.group_size, + hidden_size, + dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + setattr(w2_weight_scale, "shard_dim", 0) + # setattr(w2_weight_scale, "load_full_w2", True) + + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value if self.version == 2 or self.group_size == -1 else FusedMoeWeightScaleSupported.GROUP.value}) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + if self.version == 2: + # INT8 -> INT4 weight scales/zeros + if self.group_size != -1: + w13_i8_weight_scale = torch.nn.Parameter(torch.empty(num_experts, + hidden_size // self.group_size, + 2 * 
intermediate_size_per_partition, + dtype=torch.int32), + requires_grad=False) + layer.register_parameter("w13_i8_weight_scale", w13_i8_weight_scale) + setattr(w13_i8_weight_scale, "shard_dim", 1) + if not self.weight_symmetric: + w13_i8_weight_zero = torch.nn.Parameter(torch.empty(num_experts, + 1 if self.group_size == -1 else hidden_size // self.group_size, + 2 * intermediate_size_per_partition, + dtype=torch.int32), + requires_grad=False) + layer.register_parameter("w13_i8_weight_zero", w13_i8_weight_zero) + setattr(w13_i8_weight_zero, "shard_dim", 1) + + if self.group_size != -1: + w2_i8_weight_scale = torch.nn.Parameter(torch.empty(num_experts, + intermediate_size_per_partition // self.group_size, + hidden_size, + dtype=torch.int32), + requires_grad=False) + layer.register_parameter("w2_i8_weight_scale", w2_i8_weight_scale) + setattr(w2_i8_weight_scale, "shard_dim", 0) + if not self.weight_symmetric: + w2_i8_weight_zero = torch.nn.Parameter(torch.empty(num_experts, + 1 if self.group_size == -1 else intermediate_size_per_partition // self.group_size, + hidden_size, + dtype=torch.int32), + requires_grad=False) + layer.register_parameter("w2_i8_weight_zero", w2_i8_weight_zero) + setattr(w2_i8_weight_zero, "shard_dim", 0) + + # Add the quantization method used (per tensor/grouped/channel) + # to ensure the weight scales are loaded in properly + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value if self.group_size == -1 else FusedMoeWeightScaleSupported.GROUP.value}) + + if self.version == 2 and self.group_size != -1: + set_weight_attrs(w13_i8_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_i8_weight_scale, extra_weight_attrs) + else: + setattr(layer, "w13_i8_weight_scale", None) + setattr(layer, "w2_i8_weight_scale", None) + if self.version == 2 and not self.weight_symmetric: + set_weight_attrs(w13_i8_weight_zero, extra_weight_attrs) + set_weight_attrs(w2_i8_weight_zero, extra_weight_attrs) + else: + setattr(layer, 
"w13_i8_weight_zero", None) + setattr(layer, "w2_i8_weight_zero", None) + + # DO NOT SUPPORT INPUT_SCALES + if self.static_input_scales: + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}) + w13_input_scale = torch.nn.Parameter(torch.ones( + num_experts, hidden_size, dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w13_input_scale", w13_input_scale) + set_weight_attrs(w13_input_scale, extra_weight_attrs) + + w2_input_scale = torch.nn.Parameter(torch.ones( + num_experts, intermediate_size_per_partition, dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w2_input_scale", w2_input_scale) + set_weight_attrs(w2_input_scale, extra_weight_attrs) + else: + layer.w13_input_scale = None + layer.w2_input_scale = None + + self.gemm_format = self.format_mapping[self.gemm_format] + + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> FusedMoEQuantConfig | None: + return None + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable |None = None, + scoring_func: str = "softmax", + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + routed_scaling_factor: float = 1.0, + ) -> torch.Tensor: + attn_metadata = get_forward_context().attn_metadata + use_ep = expert_map is not None + # unsupported ep now + if attn_metadata: + deepseek_instance = None + for value in attn_metadata.values(): + if hasattr(value, 'num_prefills') and hasattr(value, 
'num_decodes'): + deepseek_instance = value + break + value_types = {type(value).__name__ for value in attn_metadata.values()} + is_same_class = len(value_types) == 1 + if is_same_class: + assert deepseek_instance + only_decode = (use_ep == False and all(t.num_decodes > 0 and t.num_prefills ==0 for t in list(attn_metadata.values()))) + else: + if deepseek_instance: + only_decode = (use_ep == False and deepseek_instance.num_decodes > 0 and deepseek_instance.num_prefills ==0) + else: + only_decode = False + else: + only_decode = False + if enable_eplb: + raise NotImplementedError("EPLB not supported for `CompressedTensorsW4A8MoEMethod` yet.") + + if use_ep: + start_eid = layer.ep_rank * layer.local_num_experts + end_eid = min((layer.ep_rank + 1) * layer.local_num_experts, global_num_experts) + topk_weight, topk_ids, _ = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias) + + dtype = x.dtype + num_tokens, num_experts = router_logits.shape + if use_ep: + hidden_size = x.shape[1] + ( + src_to_dst, + sorted_token_ids, + expert_sizes_gpu, + expert_sizes_cpu, + expand_tokens, + ) = ixfops.moe_compute_token_index_ep( + topk_ids=topk_ids, + num_experts=num_experts, + start_expert_id=start_eid, + end_expert_id=end_eid, + ) + if expert_sizes_cpu.sum() == 0: + return torch.zeros( + (num_tokens, hidden_size), + device=x.device, + dtype=x.dtype, + ) + else: + expand_tokens = num_tokens * top_k + ( + src_to_dst, + sorted_token_ids, + expert_sizes_gpu, + expert_sizes_cpu, + ) = ixfops.moe_compute_token_index( + topk_ids=topk_ids, + num_experts=num_experts, + ) + + if only_decode and self.gemm_format == 2: + i8_hidden_states, a_scale = ixfops.moe_expand_input_dynamic_scaled_int8( + 
hidden_states=x, + dst_to_src=sorted_token_ids, + dst_tokens=expand_tokens, + topk=top_k, + src_to_dst=src_to_dst, + topk_ids=None, + smooth_scales=layer.w13_input_scale, + output_format = 1, + ) + + if i8_hidden_states.shape[-1] != layer.w13_weight.shape[-1] * 2: + padding = layer.w13_weight.shape[-1] * 2 - i8_hidden_states.shape[-1] + i8_hidden_states_align = torch.nn.functional.pad(i8_hidden_states, (0, padding), mode='constant', value=0) + else: + i8_hidden_states_align = i8_hidden_states + + pt_output_1 = ixfops.moe_w4a8_group_gemv( + input=i8_hidden_states_align, + weight=layer.w13_weight, + i_scales=a_scale, + w_scales=layer.w13_weight_scale, + output_dtype=dtype, + tokens_per_experts=expert_sizes_gpu, + w_i8scales=layer.w13_i8_weight_scale, + w_i8zeros=layer.w13_i8_weight_zero, + dst_to_src=None, + bias=layer.w13_bias, + format=self.gemm_format, + group_size=self.group_size, + ) + + if activation == "swigluoai": + pt_output_2, a2_scale = ixfops.activation_swigluoai_dynamic_scaled_int8( + input=pt_output_1, + bias=None, + smooth_scales=layer.w2_input_scale, + dst_to_src=sorted_token_ids, + topk_ids=None, + output_format = 1, + ) + else: + pt_output_2, a2_scale = ixfops.activation_dynamic_scaled_int8( + input=pt_output_1, + bias=None, + smooth_scales=layer.w2_input_scale, + dst_to_src=sorted_token_ids, + topk_ids=None, + act_type="swiglu", + output_format = 1, + ) + if pt_output_2.shape[-1] != layer.w2_weight.shape[-1] * 2: + padding = layer.w2_weight.shape[-1] * 2 - pt_output_2.shape[-1] + pt_output_2_align = torch.nn.functional.pad(pt_output_2, (0, padding), mode='constant', value=0) + else: + pt_output_2_align = pt_output_2 + + pt_output_3 = ixfops.moe_w4a8_group_gemv( + input=pt_output_2_align, + weight=layer.w2_weight, + i_scales=a2_scale, + w_scales=layer.w2_weight_scale, + output_dtype=dtype, + tokens_per_experts=expert_sizes_gpu, + w_i8scales=layer.w2_i8_weight_scale, + w_i8zeros=layer.w2_i8_weight_zero, + dst_to_src=sorted_token_ids, + 
bias=layer.w2_bias, + format=self.gemm_format, + group_size=self.group_size, + ) + # mul + reduce_sum + final_hidden_states = ixfops.moe_output_reduce_sum( + input=pt_output_3.view(num_tokens, top_k, -1), + topk_weight=topk_weight, + scaling_factor=routed_scaling_factor + ) + else: + expert_sizes_cpu = expert_sizes_gpu.cpu() + # expand + reorder + quant + i8_hidden_states, a_scale = ixfops.moe_expand_input_dynamic_scaled_int8( + hidden_states=x, + dst_to_src=sorted_token_ids, + dst_tokens=expand_tokens, + topk=top_k, + src_to_dst=src_to_dst, + topk_ids=None, + smooth_scales=layer.w13_input_scale, + ) + + if i8_hidden_states.shape[-1] != layer.w13_weight.shape[-1] * 2: + padding = layer.w13_weight.shape[-1] * 2 - i8_hidden_states.shape[-1] + i8_hidden_states_align = torch.nn.functional.pad(i8_hidden_states, (0, padding), mode='constant', value=0) + else: + i8_hidden_states_align = i8_hidden_states + + # w4a8 group gemm 1 + pt_output_1 = ixfops.moe_w4a8_group_gemm( + input=i8_hidden_states_align, + weight=layer.w13_weight, + i_scales=a_scale, + w_scales=layer.w13_weight_scale, + output_dtype=dtype, + tokens_per_experts=expert_sizes_cpu, + w_i8scales=layer.w13_i8_weight_scale, + w_i8zeros=layer.w13_i8_weight_zero, + dst_to_src=None, + bias=layer.w13_bias, + format=self.gemm_format, + group_size=self.group_size, + version=self.version + ) + + # act + quant + if activation == "swigluoai": + pt_output_2, a2_scale = ixfops.activation_swigluoai_dynamic_scaled_int8( + input=pt_output_1, + bias=None, + smooth_scales=layer.w2_input_scale, + dst_to_src=sorted_token_ids, + topk_ids=None, + ) + else: + pt_output_2, a2_scale = ixfops.activation_dynamic_scaled_int8( + input=pt_output_1, + bias=None, + smooth_scales=layer.w2_input_scale, + dst_to_src=sorted_token_ids, + topk_ids=None, + act_type="swiglu", + ) + + if pt_output_2.shape[-1] != layer.w2_weight.shape[-1] * 2 and self.gemm_format == 2: + padding = layer.w2_weight.shape[-1] * 2 - pt_output_2.shape[-1] + pt_output_2_align 
= torch.nn.functional.pad(pt_output_2, (0, padding), mode='constant', value=0) + else: + pt_output_2_align = pt_output_2 + + # w4a8 group gemm 2 + reorder + if use_ep: + pt_output_3 = torch.empty( + (num_tokens * top_k, hidden_size), + device=x.device, + dtype=x.dtype, + ) + + ixfops.moe_w4a8_group_gemm( + input=pt_output_2_align, + weight=layer.w2_weight, + i_scales=a2_scale, + w_scales=layer.w2_weight_scale, + output_dtype=dtype, + tokens_per_experts=expert_sizes_cpu, + w_i8scales=layer.w2_i8_weight_scale, + w_i8zeros=layer.w2_i8_weight_zero, + dst_to_src=sorted_token_ids, + bias=layer.w2_bias, + format=self.gemm_format, + group_size=self.group_size, + version=self.version, + output=pt_output_3, + ) + + reduce_mask = src_to_dst == -1 + final_hidden_states = ixfops.moe_output_reduce_sum( + input=pt_output_3.view(num_tokens, top_k, -1), + topk_weight=topk_weight, + scaling_factor=routed_scaling_factor, + mask=reduce_mask, + ) + else: + pt_output_3 = ixfops.moe_w4a8_group_gemm( + input=pt_output_2_align, + weight=layer.w2_weight, + i_scales=a2_scale, + w_scales=layer.w2_weight_scale, + output_dtype=dtype, + tokens_per_experts=expert_sizes_cpu, + w_i8scales=layer.w2_i8_weight_scale, + w_i8zeros=layer.w2_i8_weight_zero, + dst_to_src=sorted_token_ids, + bias=layer.w2_bias, + format=self.gemm_format, + group_size=self.group_size, + version=self.version + ) + + # mul + reduce_sum + final_hidden_states = ixfops.moe_output_reduce_sum( + input=pt_output_3.view(num_tokens, top_k, -1), + topk_weight=topk_weight, + scaling_factor=routed_scaling_factor + ) + return final_hidden_states + + +class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): + def __init__( + self, + quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 + moe: FusedMoEConfig, + ): + super().__init__(moe) + self.quant_config = quant_config + # TODO: @dsikka: refactor this to use schemes as other kernels + # are supported + check if the layer is being ignored. 
+ config = self.quant_config.target_scheme_map["Linear"].get("weights") + self.num_bits = config.num_bits + self.packed_factor = 32 // config.num_bits + self.strategy = config.strategy + self.group_size = config.group_size + self.actorder = config.actorder + assert config.symmetric, "Only symmetric quantization is supported for MoE" + + if not ( + self.quant_config.quant_format == CompressionFormat.pack_quantized.value + and self.num_bits in WNA16_SUPPORTED_BITS + ): + raise ValueError( + "For Fused MoE layers, only ", + f"{CompressionFormat.pack_quantized.value} ", + "is supported for the following bits: ", + f"{WNA16_SUPPORTED_BITS}", + ) + self.quant_type = WNA16_SUPPORTED_TYPES_MAP[self.num_bits] + self.use_marlin = True + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + intermediate_size_full = extra_weight_attrs.pop("intermediate_size_full") + + # Will transpose the loaded weight along the + # intermediate and hidden dim sizes. 
Will + # shard for TP along the transposed dims + extra_weight_attrs.update( + {"is_transposed": True, "quant_method": self.strategy} + ) + w13_weight = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size // self.packed_factor, + 2 * intermediate_size_per_partition, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_packed", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + w2_weight = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size_per_partition // self.packed_factor, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight_packed", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + # In the case where we have actorder/g_idx, + # we do not partition the w2 scales + load_full_w2 = self.actorder and self.group_size != -1 + w2_scales_size = ( + intermediate_size_full if load_full_w2 else intermediate_size_per_partition + ) + + self.is_k_full = (not self.actorder) or ( + intermediate_size_per_partition == intermediate_size_full + ) + + if self.strategy == "channel": + num_groups_w2 = num_groups_w13 = 1 + self.group_size = -1 + else: + num_groups_w2 = w2_scales_size // self.group_size + num_groups_w13 = hidden_size // self.group_size + + w13_scale = torch.nn.Parameter( + torch.ones( + num_experts, + num_groups_w13, + 2 * intermediate_size_per_partition, + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_scale", w13_scale) + set_weight_attrs(w13_scale, extra_weight_attrs) + + w2_scale = torch.nn.Parameter( + torch.ones(num_experts, num_groups_w2, hidden_size, dtype=params_dtype), + requires_grad=False, + ) + layer.register_parameter("w2_weight_scale", w2_scale) + set_weight_attrs(w2_scale, extra_weight_attrs) + set_weight_attrs(w2_scale, {"load_full_w2": load_full_w2}) + + w2_weight_shape = torch.nn.Parameter( + torch.empty(num_experts, 2), requires_grad=False + ) + 
layer.register_parameter("w2_weight_shape", w2_weight_shape) + set_weight_attrs(w2_weight_shape, extra_weight_attrs) + w13_weight_shape = torch.nn.Parameter( + torch.empty(num_experts, 2), requires_grad=False + ) + + layer.register_parameter("w13_weight_shape", w13_weight_shape) + set_weight_attrs(w13_weight_shape, extra_weight_attrs) + + w13_g_idx = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_g_idx", w13_g_idx) + set_weight_attrs(w13_g_idx, extra_weight_attrs) + + w2_g_idx = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size_per_partition, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight_g_idx", w2_g_idx) + set_weight_attrs(w2_g_idx, extra_weight_attrs) + + w13_g_idx_sort_indices = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_g_idx_sort_indices", w13_g_idx_sort_indices) + set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs) + + w2_g_idx_sort_indices = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size_per_partition, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_g_idx_sort_indices", w2_g_idx_sort_indices) + set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs) + + layer.a13_scale = None + layer.a2_scale = None + layer.marlin_state = GPTQMarlinState.REPACK + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + num_experts = layer.w13_weight_g_idx.shape[0] + device = layer.w13_weight_g_idx.device + + # when running models with grouped act order, + # resort to g_idx values provided in checkpoint + if self.actorder == "group": + w13_g_idx_sort_indices = torch.empty_like(layer.w13_weight_g_idx) + w2_g_idx_sort_indices = torch.empty_like(layer.w2_weight_g_idx) + w13_sorted_g_idx = 
torch.empty_like(layer.w13_weight_g_idx) + w2_sorted_g_idx = torch.empty_like(layer.w2_weight_g_idx) + + for e in range(num_experts): + w13_g_idx_sort_indices[e] = torch.argsort(layer.w13_weight_g_idx[e]).to( + torch.int32 + ) + w2_g_idx_sort_indices[e] = torch.argsort(layer.w2_weight_g_idx[e]).to( + torch.int32 + ) + w13_sorted_g_idx[e] = layer.w13_weight_g_idx[e][ + w13_g_idx_sort_indices[e] + ] + w2_sorted_g_idx[e] = layer.w2_weight_g_idx[e][w2_g_idx_sort_indices[e]] + + replace_parameter(layer, "w13_weight_g_idx", w13_sorted_g_idx) + replace_parameter(layer, "w2_weight_g_idx", w2_sorted_g_idx) + replace_parameter(layer, "w13_g_idx_sort_indices", w13_g_idx_sort_indices) + replace_parameter(layer, "w2_g_idx_sort_indices", w2_g_idx_sort_indices) + + else: + layer.w13_weight_g_idx = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, device=device), + requires_grad=False, + ) + layer.w2_weight_g_idx = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, device=device), + requires_grad=False, + ) + layer.w13_g_idx_sort_indices = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, device=device), + requires_grad=False, + ) + layer.w2_g_idx_sort_indices = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, device=device), + requires_grad=False, + ) + + marlin_w13_qweight = ops.gptq_marlin_moe_repack( + layer.w13_weight_packed, + layer.w13_g_idx_sort_indices, + layer.w13_weight_packed.shape[1] * self.packed_factor, + layer.w13_weight_packed.shape[2], + self.num_bits, + ) + replace_parameter(layer, "w13_weight_packed", marlin_w13_qweight) + marlin_w2_qweight = ops.gptq_marlin_moe_repack( + layer.w2_weight_packed, + layer.w2_g_idx_sort_indices, + layer.w2_weight_packed.shape[1] * self.packed_factor, + layer.w2_weight_packed.shape[2], + self.num_bits, + ) + replace_parameter(layer, "w2_weight_packed", marlin_w2_qweight) + # Repack scales + marlin_w13_scales = marlin_moe_permute_scales( + 
s=layer.w13_weight_scale, + size_k=layer.w13_weight_packed.shape[2], + size_n=layer.w13_weight_scale.shape[2], + group_size=self.group_size, + ) + replace_parameter(layer, "w13_weight_scale", marlin_w13_scales) + marlin_w2_scales = marlin_moe_permute_scales( + s=layer.w2_weight_scale, + size_k=layer.w2_weight_scale.shape[1] + * (self.group_size if self.group_size != -1 else self.packed_factor), + size_n=layer.w2_weight_scale.shape[2], + group_size=self.group_size, + ) + replace_parameter(layer, "w2_weight_scale", marlin_w2_scales) + + layer.workspace = marlin_make_workspace_new(device, 4) + + def get_fused_moe_quant_config( + self, layer: torch.nn.Module + ) -> FusedMoEQuantConfig | None: + if self.num_bits != 4: + return None + return int4_w4a16_moe_quant_config( + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + w1_zp=None, + w2_zp=None, + block_shape=[0, self.group_size], + ) + + def select_gemm_impl( + self, + prepare_finalize: mk.FusedMoEPrepareAndFinalize, + layer: torch.nn.Module, + ) -> mk.FusedMoEPermuteExpertsUnpermute: + assert self.num_bits == 4, "only supporting w4" + layer.w13_weight = layer.w13_weight_packed + layer.w2_weight = layer.w2_weight_packed + assert all([w is not None for w in [layer.w13_weight, layer.w2_weight]]) + assert self.moe_quant_config is not None + if ( + prepare_finalize.activation_format + == mk.FusedMoEActivationFormat.BatchedExperts + ): + max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank() + assert max_num_tokens_per_rank is not None + return BatchedMarlinExperts( + max_num_tokens=max_num_tokens_per_rank, + num_dispatchers=prepare_finalize.num_dispatchers(), + quant_config=self.moe_quant_config, + w13_g_idx=layer.w13_weight_g_idx, + w2_g_idx=layer.w2_weight_g_idx, + w13_g_idx_sort_indices=layer.w13_g_idx_sort_indices, + w2_g_idx_sort_indices=layer.w2_g_idx_sort_indices, + is_k_full=self.is_k_full, + ) + else: + return MarlinExperts( + quant_config=self.moe_quant_config, + 
w13_g_idx=layer.w13_weight_g_idx, + w2_g_idx=layer.w2_weight_g_idx, + w13_g_idx_sort_indices=layer.w13_g_idx_sort_indices, + w2_g_idx_sort_indices=layer.w2_g_idx_sort_indices, + is_k_full=self.is_k_full, + ) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if enable_eplb: + raise NotImplementedError( + "EPLB not supported for `CompressedTensorsWNA16MarlinMoEMethod` yet." + ) + + assert activation == "silu", f"{activation} not supported for Marlin MoE." 
+ + topk_weights, topk_ids, _ = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, + ) + + return fused_marlin_moe( + x, + layer.w13_weight_packed, + layer.w2_weight_packed, + None, + None, + layer.w13_weight_scale, + layer.w2_weight_scale, + router_logits, + topk_weights, + topk_ids, + quant_type_id=self.quant_type.id, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=expert_map, + g_idx1=layer.w13_weight_g_idx, + g_idx2=layer.w2_weight_g_idx, + sort_indices1=layer.w13_g_idx_sort_indices, + sort_indices2=layer.w2_g_idx_sort_indices, + workspace=layer.workspace, + is_k_full=self.is_k_full, + ) + + +class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): + def __init__( + self, + quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 + moe: FusedMoEConfig, + ): + super().__init__(moe) + self.quant_config = quant_config + # TODO: @dsikka: refactor this to use schemes as other kernels + # are supported + check if the layer is being ignored. 
+ config = self.quant_config.target_scheme_map["Linear"].get("weights") + self.num_bits = config.num_bits + self.packed_factor = 32 // config.num_bits + self.strategy = config.strategy + # channelwise is not supported by this kernel + assert config.strategy == "group" + self.group_size = config.group_size + # grouped actorder isn't supported by this kernel + assert config.actorder != "group" + assert config.symmetric, "Only symmetric quantization is supported for MoE" + + if not ( + self.quant_config.quant_format == CompressionFormat.pack_quantized.value + and self.num_bits in WNA16_SUPPORTED_BITS + ): + raise ValueError( + "For Fused MoE layers, only ", + f"{CompressionFormat.pack_quantized.value} ", + "is supported for the following bits: ", + f"{WNA16_SUPPORTED_BITS}", + ) + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + # Will transpose the loaded weight along the + # intermediate and hidden dim sizes. 
Will + # shard for TP along the transposed dims + extra_weight_attrs.update( + {"is_transposed": True, "quant_method": self.strategy} + ) + w13_weight = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size // self.packed_factor, + 2 * intermediate_size_per_partition, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_packed", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + w2_weight = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size_per_partition // self.packed_factor, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight_packed", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + w2_scales_size = intermediate_size_per_partition + + if self.strategy == "channel": + num_groups_w2 = num_groups_w13 = 1 + self.group_size = -1 + else: + num_groups_w2 = w2_scales_size // self.group_size + num_groups_w13 = hidden_size // self.group_size + + w13_scale = torch.nn.Parameter( + torch.ones( + num_experts, + num_groups_w13, + 2 * intermediate_size_per_partition, + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_scale", w13_scale) + set_weight_attrs(w13_scale, extra_weight_attrs) + + w2_scale = torch.nn.Parameter( + torch.ones(num_experts, num_groups_w2, hidden_size, dtype=params_dtype), + requires_grad=False, + ) + layer.register_parameter("w2_weight_scale", w2_scale) + set_weight_attrs(w2_scale, extra_weight_attrs) + set_weight_attrs(w2_scale, {"load_full_w2": False}) + + w2_weight_shape = torch.nn.Parameter( + torch.empty(num_experts, 2), requires_grad=False + ) + layer.register_parameter("w2_weight_shape", w2_weight_shape) + set_weight_attrs(w2_weight_shape, extra_weight_attrs) + w13_weight_shape = torch.nn.Parameter( + torch.empty(num_experts, 2), requires_grad=False + ) + + layer.register_parameter("w13_weight_shape", w13_weight_shape) + set_weight_attrs(w13_weight_shape, 
extra_weight_attrs) + + w13_g_idx = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_g_idx", w13_g_idx) + set_weight_attrs(w13_g_idx, extra_weight_attrs) + + w2_g_idx = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size_per_partition, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight_g_idx", w2_g_idx) + set_weight_attrs(w2_g_idx, extra_weight_attrs) + + w13_g_idx_sort_indices = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_g_idx_sort_indices", w13_g_idx_sort_indices) + set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs) + + w2_g_idx_sort_indices = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size_per_partition, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_g_idx_sort_indices", w2_g_idx_sort_indices) + set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs) + + layer.a13_scale = None + layer.a2_scale = None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + # Reconfigure packed weights and scales to match moe_wna16 format + layer.w13_weight_packed = torch.nn.Parameter( + layer.w13_weight_packed.transpose(1, 2).contiguous().view(torch.uint8), + requires_grad=False, + ) + layer.w2_weight_packed = torch.nn.Parameter( + layer.w2_weight_packed.transpose(1, 2).contiguous().view(torch.uint8), + requires_grad=False, + ) + layer.w13_weight_scale = torch.nn.Parameter( + layer.w13_weight_scale.transpose(1, 2).contiguous(), requires_grad=False + ) + layer.w2_weight_scale = torch.nn.Parameter( + layer.w2_weight_scale.transpose(1, 2).contiguous(), requires_grad=False + ) + + def get_fused_moe_quant_config( + self, layer: torch.nn.Module + ) -> FusedMoEQuantConfig | None: + assert self.num_bits == 4 or self.num_bits 
== 8 + config_builder = ( + int4_w4a16_moe_quant_config + if self.num_bits == 4 + else int8_w8a16_moe_quant_config + ) + + return config_builder( + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + w1_zp=None, + w2_zp=None, + block_shape=[0, self.group_size], + ) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if enable_eplb: + raise NotImplementedError( + "EPLB not supported for `CompressedTensorsWNA16MoEMethod` yet." 
+ ) + + from vllm.model_executor.layers.fused_moe import fused_experts + + topk_weights, topk_ids, _ = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, + ) + + return fused_experts( + x, + layer.w13_weight_packed, + layer.w2_weight_packed, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=expert_map, + quant_config=self.moe_quant_config, + ) + + +class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod): + """ + CPU-only MoE method using dynamic 4-bit matmul kernels on Arm Platform + - Weights: int4 (stored as int8 values in [-8,7], packed to uint8 nibbles) + - Scales: Fp32 for Channelwise , bf16 for groupwise quantization + - Bias: Same data type as original weights + - Activations: FP32/Bf16 dynamic per-token (A8 Int), + quantized inside the kernel + """ + + def __init__( + self, + quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 + moe: FusedMoEConfig, + ): + super().__init__(moe) + self.has_bias = self.moe.has_bias + self.quant_config = quant_config + + # Validate scheme: weights=W4 (channel or group), + # activations=dynamic TOKEN (A8) + wq = self.quant_config.target_scheme_map["Linear"].get("weights") + aq = self.quant_config.target_scheme_map["Linear"].get("input_activations") + + # Must be dynamic per-token activations + if aq.strategy != QuantizationStrategy.TOKEN or not aq.dynamic: + raise ValueError( + "W4A8-int MoE needs dynamic per-token activation quantization." 
+ ) + + # Weight can be channel-wise (group_size=None) or group-wise + self.group_size = wq.group_size if (wq.group_size is not None) else -1 + if wq.num_bits != 4: + raise ValueError("This method only supports 4-bit weights (num_bits=4).") + + # CPU only + if not current_platform.is_cpu(): + raise ValueError("CompressedTensorsW4A8Int8MoEMethod is CPU-only.") + + # Arm: check _dyn ops availability + if current_platform.get_cpu_architecture() == CpuArchEnum.ARM: + try: + _ = torch.ops.aten._dyn_quant_matmul_4bit + _ = torch.ops.aten._dyn_quant_pack_4bit_weight + except AttributeError as err: + raise RuntimeError( + f"""PyTorch {torch.__version__} lacks _dyn_quant_* 4bit ops; + install a newer build.""" + ) from err + self.static_input_scales = False # always dynamic per token + + # ---- parameter creation ---- + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + # Shapes per local rank (TP/EP): + # w13: [E, 2*I_local, H] int8 (int4 values in [-8,7]) + # w2 : [E, H, I_local] int8 + # Scales: + # channel-wise: group_size=-1 -> per-output-row, single scale per row + # group-wise : group_size=g -> + # per-output-row, (in_features/g) scales + + E = num_experts + H = hidden_size + IN = intermediate_size_per_partition + g = self.group_size + + # Per-row scale columns + def _n_scale_cols(in_features: int) -> int: + return 1 if g == -1 else (in_features // g) + + # Register unpacked int4-as-int8 weights the loader will fill. 
+ w13 = torch.nn.Parameter( + torch.empty(E, 2 * IN, H, dtype=torch.int8), requires_grad=False + ) + set_weight_attrs(w13, extra_weight_attrs) + layer.register_parameter("w13_weight", w13) + + w2 = torch.nn.Parameter( + torch.empty(E, H, IN, dtype=torch.int8), requires_grad=False + ) + set_weight_attrs(w2, extra_weight_attrs) + layer.register_parameter("w2_weight", w2) + + # Register scales + # KleidiAI groupwise kernels accepts float32 scales + # KleidiAI groupwise kernels accepts bfloat16 scales + scale_dtype = torch.float32 if g == -1 else torch.bfloat16 + + w13_s = torch.nn.Parameter( + torch.ones(E, 2 * IN, _n_scale_cols(H), dtype=scale_dtype), + requires_grad=False, + ) + set_weight_attrs( + w13_s, + {"quant_method": "channel" if g == -1 else "group", **extra_weight_attrs}, + ) + layer.register_parameter("w13_weight_scale", w13_s) + + w2_s = torch.nn.Parameter( + torch.ones(E, H, _n_scale_cols(IN), dtype=scale_dtype), requires_grad=False + ) + set_weight_attrs( + w2_s, + {"quant_method": "channel" if g == -1 else "group", **extra_weight_attrs}, + ) + layer.register_parameter("w2_weight_scale", w2_s) + + if self.has_bias: + w13_bias = torch.nn.Parameter( + torch.zeros(E, 2 * IN, dtype=params_dtype), requires_grad=False + ) + layer.register_parameter("w13_bias", w13_bias) + set_weight_attrs(w13_bias, extra_weight_attrs) + + w2_bias = torch.nn.Parameter( + torch.zeros(num_experts, hidden_size, dtype=params_dtype), + requires_grad=False, + ) + layer.register_parameter("w2_bias", w2_bias) + set_weight_attrs(w2_bias, extra_weight_attrs) + + # Placeholders for packed weights (will be replaced after packing) + layer.register_parameter( + "w13_weight_packed", torch.nn.Parameter(torch.empty(0), requires_grad=False) + ) + set_weight_attrs(layer.w13_weight_packed, extra_weight_attrs) + + layer.register_parameter( + "w2_weight_packed", torch.nn.Parameter(torch.empty(0), requires_grad=False) + ) + set_weight_attrs(layer.w2_weight_packed, extra_weight_attrs) + + # dims for 
4 bit fused matmuls + layer.w13_in_features = H + layer.w13_out_features = 2 * IN + layer.w2_in_features = IN + layer.w2_out_features = H + layer.group_size = g + + # post-load packing to dyn-4bit KleidiAI kernel's format + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + E = layer.w13_weight.shape[0] + H = layer.w13_in_features + I2 = layer.w13_out_features + IN = layer.w2_in_features + g = layer.group_size + + def _pack_matrix( + int4_as_int8_2d: torch.Tensor, + scales_2d: torch.Tensor, + bias_1d: torch.Tensor | None, + in_features: int, + out_features: int, + ) -> torch.Tensor: + # int4 values are stored as int8 in [-8,7]. + # Shift to unsigned nibble and pack pairs along input-dim. + tmp = int4_as_int8_2d.add(8) # [out, in] + uint8_nibbles = ((tmp[:, 1::2] << 4) | tmp[:, ::2]).to( + torch.uint8 + ) # [out, in//2] + + # KleidiAI groupwise kernels accepts float32 scales + # KleidiAI groupwise kernels accepts bfloat16 scales + scale_dtype = torch.float32 if g == -1 else torch.bfloat16 + scales = scales_2d.to(scale_dtype) + bias = None if bias_1d is None else bias_1d.to(torch.float32) + return torch.ops.aten._dyn_quant_pack_4bit_weight( + uint8_nibbles, + scales, + bias, + g if g != -1 else in_features, + in_features, + out_features, + ) + + # Pack per expert + w13_packed_list = [] + w2_packed_list = [] + + has_w13_bias = hasattr(layer, "w13_bias") and layer.w13_bias is not None + has_w2_bias = hasattr(layer, "w2_bias") and layer.w2_bias is not None + + for e in range(E): + w13_packed_list.append( + _pack_matrix( + layer.w13_weight[e], # [2I, H] + layer.w13_weight_scale[e], # [2I, H/g or 1] + layer.w13_bias[e] if has_w13_bias else None, # [2I] + H, + I2, + ) + ) + w2_packed_list.append( + _pack_matrix( + # w2 shape is [H, IN]; we need [out, in] == [H, IN]. 
+ layer.w2_weight[e], # [H, IN] + layer.w2_weight_scale[e], # [H, IN/g or 1] + layer.w2_bias[e] if has_w2_bias else None, # [H] + IN, + layer.w2_out_features, # in_features=IN, out_features=H + ) + ) + + # each packed tensor has identical shape per expert; stack on dim 0 + w13_packed = torch.stack(w13_packed_list, dim=0) + w2_packed = torch.stack(w2_packed_list, dim=0) + + replace_parameter( + layer, + "w13_weight_packed", + torch.nn.Parameter(w13_packed, requires_grad=False), + ) + replace_parameter( + layer, + "w2_weight_packed", + torch.nn.Parameter(w2_packed, requires_grad=False), + ) + + # free raw tensors/scales/bias now that they're packed into the payload. + replace_parameter( + layer, "w13_weight", torch.nn.Parameter(torch.empty(0), requires_grad=False) + ) + replace_parameter( + layer, "w2_weight", torch.nn.Parameter(torch.empty(0), requires_grad=False) + ) + replace_parameter( + layer, + "w13_weight_scale", + torch.nn.Parameter(torch.empty(0), requires_grad=False), + ) + replace_parameter( + layer, + "w2_weight_scale", + torch.nn.Parameter(torch.empty(0), requires_grad=False), + ) + if has_w13_bias: + replace_parameter( + layer, + "w13_bias", + torch.nn.Parameter(torch.empty(0), requires_grad=False), + ) + if has_w2_bias: + replace_parameter( + layer, + "w2_bias", + torch.nn.Parameter(torch.empty(0), requires_grad=False), + ) + + def get_fused_moe_quant_config( + self, layer: torch.nn.Module + ) -> FusedMoEQuantConfig | None: + # CPU dynamic 4-bit MoE path does not use modular kernels or + # fused_experts; quant config is not needed. 
+ return None + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + ) -> torch.Tensor: + assert not enable_eplb, "EPLB not supported for W4A8-int MoE yet." + assert activation in ("silu", "swigluoai", "swiglu"), ( + "Only SiLU/SwiGLUGU/SwiGLUUG are supported." + ) + assert expert_map is None, """expert_map/EP not implemented + for CPU dyn-4bit MoE.""" + + def _act_kind(s: str) -> int: + # 0 = SwiGLU_Gu (SiLU(g)*u), 1 = SwiGLU_Ug (SiLU(u)*g), 2 = SiLU + if s == "swiglu": + return 0 + if s == "swigluoai": + return 1 + if s == "silu": + return 2 + raise ValueError(f"Unknown activation '{s}'") + + # Apply topk softmax on router output + topk_weights, topk_ids = select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + ) + + return torch.ops._C.dynamic_4bit_int_moe( + x, + topk_ids.to(torch.long), + topk_weights, + layer.w13_weight_packed, + layer.w2_weight_packed, + layer.w2_out_features, + layer.w2_in_features, + layer.w13_out_features, + layer.group_size, + 
apply_router_weight_on_input, + int(_act_kind(activation)), + ) + +class CompressedTensorsL1OptMoEMethod(CompressedTensorsMoEMethod): + + def __init__( + self, + moe: FusedMoEConfig, + ): + super().__init__(moe) + + def create_weights(self, layer: torch.nn.Module, num_experts: int, + hidden_size: int, intermediate_size_per_partition: int, + params_dtype: torch.dtype, **extra_weight_attrs): + + params_dtype = torch.int8 + + # WEIGHTS + w13_weight = torch.nn.Parameter(torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + w2_weight = torch.nn.Parameter(torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + # WEIGHT_SCALES + w13_weight_scale = torch.nn.Parameter(torch.ones( + num_experts, + 2 * intermediate_size_per_partition, + 1, + dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + setattr(w13_weight_scale, "shard_dim", 0) + w2_weight_scale = torch.nn.Parameter(torch.ones(num_experts, + hidden_size, + 1, + dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + setattr(w2_weight_scale, "shard_dim", 1) + # Add PER-CHANNEL quantization for FusedMoE.weight_loader. 
+ extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + layer.w13_input_scale = None + layer.w2_input_scale = None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + pass + + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> FusedMoEQuantConfig | None: + return None + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if enable_eplb: + raise NotImplementedError( + "EPLB not supported for `CompressedTensorsW8A8Int8MoEMethod` yet." 
+ ) + + use_ep = expert_map is not None + if use_ep: + start_eid = layer.ep_rank * layer.local_num_experts + end_eid = min((layer.ep_rank + 1) * layer.local_num_experts, global_num_experts) + topk_weight, topk_ids, _ = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) + + dtype = x.dtype + num_tokens, num_experts = router_logits.shape + + if use_ep: + hidden_size = x.shape[1] + ( + src_to_dst, + sorted_token_ids, + expert_sizes_gpu, + expert_sizes_cpu, + expand_tokens, + ) = ixfops.moe_compute_token_index_ep( + topk_ids=topk_ids, + num_experts=num_experts, + start_expert_id=start_eid, + end_expert_id=end_eid, + ) + if expert_sizes_cpu.sum() == 0: + return torch.zeros( + (num_tokens, hidden_size), + device=x.device, + dtype=x.dtype, + ) + else: + expand_tokens = num_tokens * top_k + ( + src_to_dst, + sorted_token_ids, + expert_sizes_gpu, + expert_sizes_cpu, + ) = ixfops.moe_compute_token_index( + topk_ids=topk_ids, + num_experts=num_experts, + ) + expert_sizes_cpu = expert_sizes_gpu.cpu() + + # expand + reorder + quant + i8_hidden_states, a_scale = ixfops.moe_expand_input_dynamic_scaled_int8( + hidden_states=x, + dst_to_src=sorted_token_ids, + dst_tokens=expand_tokens, + topk=top_k, + src_to_dst=src_to_dst, + topk_ids=None, + smooth_scales=layer.w13_input_scale, + ) + + # w8a8 group gemm 1 + pt_output_1 = ixfops.moe_w8a8_group_gemm( + input=i8_hidden_states, + weight=layer.w13_weight, + i_scales=a_scale, + w_scales=layer.w13_weight_scale, + output_dtype=dtype, + tokens_per_experts=expert_sizes_cpu, + dst_to_src=None, + format="TN", + ) + + # act + quant + pt_output_2, a2_scale = 
ixfops.activation_dynamic_scaled_int8( + input=pt_output_1, + bias=None, + smooth_scales=layer.w2_input_scale, + dst_to_src=sorted_token_ids, + topk_ids=None, + act_type="swiglu", + ) + + # w8a8 group gemm 2 + reorder + if use_ep: + pt_output_3 = torch.empty( + (num_tokens * top_k, hidden_size), + device=x.device, + dtype=x.dtype, + ) + + ixfops.moe_w8a8_group_gemm( + input=pt_output_2, + weight=layer.w2_weight, + i_scales=a2_scale, + w_scales=layer.w2_weight_scale, + output_dtype=dtype, + tokens_per_experts=expert_sizes_cpu, + dst_to_src=sorted_token_ids, + format="TN", + output=pt_output_3, + ) + + reduce_mask = src_to_dst == -1 + final_hidden_states = ixfops.moe_output_reduce_sum( + input=pt_output_3.view(num_tokens, top_k, -1), + topk_weight=topk_weight, + scaling_factor=routed_scaling_factor, + mask=reduce_mask, + ) + else: + pt_output_3 = ixfops.moe_w8a8_group_gemm( + input=pt_output_2, + weight=layer.w2_weight, + i_scales=a2_scale, + w_scales=layer.w2_weight_scale, + output_dtype=dtype, + tokens_per_experts=expert_sizes_cpu, + dst_to_src=sorted_token_ids, + format="TN", + ) + + # mul + reduce_sum + final_hidden_states = ixfops.moe_output_reduce_sum( + input=pt_output_3.view(num_tokens, top_k, -1), + topk_weight=topk_weight, + scaling_factor=routed_scaling_factor + ) + return final_hidden_states + +class CompressedTensorsL2OptMoEMethod(CompressedTensorsMoEMethod): + + def __init__( + self, + moe: FusedMoEConfig, + ): + super().__init__(moe) + self.pack_factor = 2 + self.group_size = -1 + self.version = 2 + self.gemm_format = envs.VLLM_W4A8_FORMAT + self.format_mapping = {"NN":0,"NT":1,"TN":2} + assert self.gemm_format in ["TN","NN"] + + + def create_weights(self, layer: torch.nn.Module, num_experts: int, + hidden_size: int, intermediate_size_per_partition: int, + params_dtype: torch.dtype, **extra_weight_attrs): + + params_dtype = torch.int8 + remainder = (intermediate_size_per_partition // self.pack_factor) % 64 + if self.gemm_format == "TN": + w13_shape = 
(num_experts, 2 * intermediate_size_per_partition, hidden_size // self.pack_factor) + if remainder != 0: + w2_shape = (num_experts, hidden_size, (intermediate_size_per_partition // self.pack_factor) + 64 - remainder) + else: + w2_shape = (num_experts, hidden_size, intermediate_size_per_partition // self.pack_factor) + else: + w13_shape = (num_experts, hidden_size, 2 * intermediate_size_per_partition // self.pack_factor) + w2_shape = (num_experts, intermediate_size_per_partition, hidden_size // self.pack_factor) + # WEIGHTS + # use process_weights_after_loading to get get right layout if gemm_format is NN + w13_weight = torch.nn.Parameter(torch.empty(w13_shape, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + if self.gemm_format == "NN": + setattr(w13_weight, "shard_dim", 1) + + if remainder != 0: + w2_weight = torch.nn.Parameter(torch.zeros(w2_shape, + dtype=params_dtype), + requires_grad=False) + else: + w2_weight = torch.nn.Parameter(torch.empty(w2_shape, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + if self.gemm_format == "NN": + setattr(w2_weight, "shard_dim", 0) + + # WEIGHT_SCALES + # Allocate 2 scales for w1 and w3 respectively. + # They will be combined to a single scale after weight loading. 
+ # The following scale or zero will use permute(0,2,1) to get right layout, init here to avoid rewrite data_loader + w13_weight_scale = torch.nn.Parameter(torch.empty(num_experts, + 1, + 2 * intermediate_size_per_partition, + dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + setattr(w13_weight_scale, "shard_dim", 1) + + w2_weight_scale = torch.nn.Parameter(torch.empty(num_experts, + 1, + hidden_size, + dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + setattr(w2_weight_scale, "shard_dim", 0) + + extra_weight_attrs.update({"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + setattr(layer, "w13_i8_weight_scale", None) + setattr(layer, "w2_i8_weight_scale", None) + setattr(layer, "w13_i8_weight_zero", None) + setattr(layer, "w2_i8_weight_zero", None) + + layer.w13_input_scale = None + layer.w2_input_scale = None + + self.gemm_format = self.format_mapping[self.gemm_format] + + def get_fused_moe_quant_config( + self, layer: torch.nn.Module) -> FusedMoEQuantConfig | None: + return None + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable |None = None, + scoring_func: str = "softmax", + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + routed_scaling_factor: float = 1.0, + ) -> 
torch.Tensor: + attn_metadata = get_forward_context().attn_metadata + use_ep = expert_map is not None + # unsupported ep now + if attn_metadata: + deepseek_instance = None + for value in attn_metadata.values(): + if hasattr(value, 'num_prefills') and hasattr(value, 'num_decodes'): + deepseek_instance = value + break + value_types = {type(value).__name__ for value in attn_metadata.values()} + is_same_class = len(value_types) == 1 + if is_same_class: + assert deepseek_instance + only_decode = (use_ep == False and all(t.num_decodes > 0 and t.num_prefills ==0 for t in list(attn_metadata.values()))) + else: + if deepseek_instance: + only_decode = (use_ep == False and deepseek_instance.num_decodes > 0 and deepseek_instance.num_prefills ==0) + else: + only_decode = False + else: + only_decode = False + if enable_eplb: + raise NotImplementedError("EPLB not supported for `CompressedTensorsW4A8MoEMethod` yet.") + if use_ep: + start_eid = layer.ep_rank * layer.local_num_experts + end_eid = min((layer.ep_rank + 1) * layer.local_num_experts, global_num_experts) + topk_weight, topk_ids, _ = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias) + + dtype = x.dtype + num_tokens, num_experts = router_logits.shape + if use_ep: + hidden_size = x.shape[1] + ( + src_to_dst, + sorted_token_ids, + expert_sizes_gpu, + expert_sizes_cpu, + expand_tokens, + ) = ixfops.moe_compute_token_index_ep( + topk_ids=topk_ids, + num_experts=num_experts, + start_expert_id=start_eid, + end_expert_id=end_eid, + ) + if expert_sizes_cpu.sum() == 0: + return torch.zeros( + (num_tokens, hidden_size), + device=x.device, + dtype=x.dtype, + ) + else: + expand_tokens = num_tokens * top_k + ( + src_to_dst, + sorted_token_ids, 
+ expert_sizes_gpu, + expert_sizes_cpu, + ) = ixfops.moe_compute_token_index( + topk_ids=topk_ids, + num_experts=num_experts, + ) + + if only_decode and self.gemm_format == 2: + i8_hidden_states, a_scale = ixfops.moe_expand_input_dynamic_scaled_int8( + hidden_states=x, + dst_to_src=sorted_token_ids, + dst_tokens=expand_tokens, + topk=top_k, + src_to_dst=src_to_dst, + topk_ids=None, + smooth_scales=layer.w13_input_scale, + output_format = 1, + ) + + pt_output_1 = ixfops.moe_w4a8_group_gemv( + input=i8_hidden_states, + weight=layer.w13_weight, + i_scales=a_scale, + w_scales=layer.w13_weight_scale, + output_dtype=dtype, + tokens_per_experts=expert_sizes_gpu, + w_i8scales=layer.w13_i8_weight_scale, + w_i8zeros=layer.w13_i8_weight_zero, + dst_to_src=None, + bias=layer.w13_bias, + format=self.gemm_format, + group_size=self.group_size, + ) + + pt_output_2, a2_scale = ixfops.activation_dynamic_scaled_int8( + input=pt_output_1, + bias=None, + smooth_scales=layer.w2_input_scale, + dst_to_src=sorted_token_ids, + topk_ids=None, + act_type="swiglu", + output_format = 1, + ) + + if pt_output_2.shape[-1] != layer.w2_weight.shape[-1] * 2: + padding = layer.w2_weight.shape[-1] * 2 - pt_output_2.shape[-1] + pt_output_2_align = torch.nn.functional.pad(pt_output_2, (0, padding), mode='constant', value=0) + else: + pt_output_2_align = pt_output_2 + + pt_output_3 = ixfops.moe_w4a8_group_gemv( + input=pt_output_2_align, + weight=layer.w2_weight, + i_scales=a2_scale, + w_scales=layer.w2_weight_scale, + output_dtype=dtype, + tokens_per_experts=expert_sizes_gpu, + w_i8scales=layer.w2_i8_weight_scale, + w_i8zeros=layer.w2_i8_weight_zero, + dst_to_src=sorted_token_ids, + bias=layer.w2_bias, + format=self.gemm_format, + group_size=self.group_size, + ) + # mul + reduce_sum + final_hidden_states = ixfops.moe_output_reduce_sum( + input=pt_output_3.view(num_tokens, top_k, -1), + topk_weight=topk_weight, + scaling_factor=routed_scaling_factor + ) + else: + expert_sizes_cpu = expert_sizes_gpu.cpu() 
+ # expand + reorder + quant + i8_hidden_states, a_scale = ixfops.moe_expand_input_dynamic_scaled_int8( + hidden_states=x, + dst_to_src=sorted_token_ids, + dst_tokens=expand_tokens, + topk=top_k, + src_to_dst=src_to_dst, + topk_ids=None, + smooth_scales=layer.w13_input_scale, + ) + + # w4a8 group gemm 1 + pt_output_1 = ixfops.moe_w4a8_group_gemm( + input=i8_hidden_states, + weight=layer.w13_weight, + i_scales=a_scale, + w_scales=layer.w13_weight_scale, + output_dtype=dtype, + tokens_per_experts=expert_sizes_cpu, + w_i8scales=layer.w13_i8_weight_scale, + w_i8zeros=layer.w13_i8_weight_zero, + dst_to_src=None, + bias=layer.w13_bias, + format=self.gemm_format, + group_size=self.group_size, + version=self.version + ) + + # act + quant + pt_output_2, a2_scale = ixfops.activation_dynamic_scaled_int8( + input=pt_output_1, + bias=None, + smooth_scales=layer.w2_input_scale, + dst_to_src=sorted_token_ids, + topk_ids=None, + act_type="swiglu", + ) + + if pt_output_2.shape[-1] != layer.w2_weight.shape[-1] * 2 and self.gemm_format == 2: + padding = layer.w2_weight.shape[-1] * 2 - pt_output_2.shape[-1] + pt_output_2_align = torch.nn.functional.pad(pt_output_2, (0, padding), mode='constant', value=0) + else: + pt_output_2_align = pt_output_2 + + # w4a8 group gemm 2 + reorder + if use_ep: + pt_output_3 = torch.empty( + (num_tokens * top_k, hidden_size), + device=x.device, + dtype=x.dtype, + ) + + ixfops.moe_w4a8_group_gemm( + input=pt_output_2_align, + weight=layer.w2_weight, + i_scales=a2_scale, + w_scales=layer.w2_weight_scale, + output_dtype=dtype, + tokens_per_experts=expert_sizes_cpu, + w_i8scales=layer.w2_i8_weight_scale, + w_i8zeros=layer.w2_i8_weight_zero, + dst_to_src=sorted_token_ids, + bias=layer.w2_bias, + format=self.gemm_format, + group_size=self.group_size, + version=self.version, + output=pt_output_3, + ) + + reduce_mask = src_to_dst == -1 + final_hidden_states = ixfops.moe_output_reduce_sum( + input=pt_output_3.view(num_tokens, top_k, -1), + 
topk_weight=topk_weight, + scaling_factor=routed_scaling_factor, + mask=reduce_mask, + ) + else: + pt_output_3 = ixfops.moe_w4a8_group_gemm( + input=pt_output_2_align, + weight=layer.w2_weight, + i_scales=a2_scale, + w_scales=layer.w2_weight_scale, + output_dtype=dtype, + tokens_per_experts=expert_sizes_cpu, + w_i8scales=layer.w2_i8_weight_scale, + w_i8zeros=layer.w2_i8_weight_zero, + dst_to_src=sorted_token_ids, + bias=layer.w2_bias, + format=self.gemm_format, + group_size=self.group_size, + version=self.version + ) + + # mul + reduce_sum + final_hidden_states = ixfops.moe_output_reduce_sum( + input=pt_output_3.view(num_tokens, top_k, -1), + topk_weight=topk_weight, + scaling_factor=routed_scaling_factor + ) + return final_hidden_states diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py new file mode 100644 index 0000000..ca28667 --- /dev/null +++ b/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from .compressed_tensors_scheme import CompressedTensorsScheme +from .compressed_tensors_w4a4_nvfp4 import CompressedTensorsW4A4Fp4 +from .compressed_tensors_w4a8_fp8 import CompressedTensorsW4A8Fp8 +from .compressed_tensors_w4a8_int import CompressedTensorsW4A8Int +from .compressed_tensors_w4a16_24 import ( + W4A16SPARSE24_SUPPORTED_BITS, + CompressedTensorsW4A16Sparse24, +) +from .compressed_tensors_w4a16_nvfp4 import CompressedTensorsW4A16Fp4 +from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8 +from .compressed_tensors_w8a8_int8 import CompressedTensorsW8A8Int8 +from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8 +from .compressed_tensors_wNa16 import WNA16_SUPPORTED_BITS, CompressedTensorsWNA16 + +# This avoids circular import error +from .compressed_tensors_24 import 
CompressedTensors24 # isort: skip + +__all__ = [ + "CompressedTensorsScheme", + "CompressedTensorsWNA16", + "CompressedTensorsW8A16Fp8", + "CompressedTensorsW4A16Sparse24", + "CompressedTensorsW8A8Int8", + "CompressedTensorsW8A8Fp8", + "WNA16_SUPPORTED_BITS", + "W4A16SPARSE24_SUPPORTED_BITS", + "CompressedTensors24", + "CompressedTensorsW4A16Fp4", + "CompressedTensorsW4A4Fp4", + "CompressedTensorsW4A8Int", + "CompressedTensorsW4A8Fp8", +] diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/__init__.cpython-312.pyc b/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2795d5ae5b65f82b694ba05312de453d3c5cc9a5 GIT binary patch literal 1227 zcmZ`&O>fgc5Z!gYKU>ID#J2{r)5^~1hRkg;}C^;vC@+ZpblS-rdor1jv)oXhr z6@(&+#Go&R(I||1rx!vA2-Rmo$4=Yn1kSve+X>vcv4dgUu=EC`*lG3-+ef|qU5EEx z9v;3p>hC}2&kp)My_HlEMV${^YDOaz=)!W7IqtRj>V@(3ja{FK59LJna8p@5J^$RfZuM!Jo4y?**{Sv;O@ zi$`W%CcWNFh`(_>hYv5$0>>oO_med)KMPz_XuiT9#>US)1x#tznRbEg>8<2aAe;KZ zE|YV)-qa7yMsFmgK?V46%?#&Bpb~b41K0JWEM0G>Ih{RjMz17jpd(y~&T;NZ$@yn; z14r^Zv||<6fgfzgUU)9z?W=L?k^8D8Z0Voa!6d#I4ed)Qm90htA-T8`r{j1O+R~ed zFtT5dy&Is0r{RMJc__6Z-U; m>|K*K{C-&}YT;FfnhUtz`$~>(vxK&OrmH3ET`t^&SN{N3h<1hm literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_24.cpython-312.pyc b/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_24.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d803c5628008b6ccddf0969a3550c51dbd3e5aa GIT binary patch literal 16077 zcmd6OX>1%=GrC5n=$!!jv}ltu?_4XLry)=azARV170gRUxC zBGL4C4KJ3OosFbmcV!{Kng$lJ3@aGTpNtj=vXkrrn;=2bMbhb>0vA!TULek|)=-9b zHh=PcudXK5l#l(BC-L<=zxTcCJ74{0uh&IENLl_c`SlTs`a8^M!Kx;nOT`- zv}4H}srnK#5n6Vfj;B)bYbg%W=CRDuWW*#_ozA8gcuo+K*~~OpT;yf!ri9429 z6vS*gmR%44g~m%9Kg+SFv#G^&=CwEvNOGx*&?C;vRhKXPXlD`huh($n*l8{!WO-pK 
zF~_BGFb6?v;k`?_I~*^@#B3}v7tdt4)QzOT$su7O&I=r% zB&Onm5EB*`7P7p^u@Q@0bt*1!?_9ii2~kLMBFD>(rHtwU z>*PH1jcT^+XTGT{qb6Aw=VINQpK}rD9>{s2H&dadkE@#Tu>KD$(dzFsf}N#jEO5Xr z(HbYkHh!nRRe(!(1zf84C|B0bUbz-ltp--DcF!4NzX~%4?1O-P9jtZbdJ|g<_4RCB z<#E~iJ<7k!KJ7iH5R4sKq9b9x9#(2ncD}9Ld5)DmH^`ZekrLUR%q%R5Y9^FMViAKo zu2B(4XJ(SK360R=vRmK}Nz(RzrSa6}=o}TpauRYT$U(W9GeZu}wh}}*!%CQUXer}2 zXZcL+w_?p%S!$8KN%IG2DrcQ148UlKsd?@0log9UEi)g`QpWFmwUO3R*WnxihC2Kl zH6PUKj9=cKvqCGkR+6Ln`JDOpW}!7+kX=E$b7EMlsC37)R)_&VmNU=q(^?z99%{vw zv+)6u(aPrAw3P9i@t4)N8|7MR#V&Sf>6|TR|4iKvn2|o$B})1CDLomf<&jMAh&_+Q zfww^<*U#`kjGh&J&pBeV00PLh80!uTZbC!NTdk3rXnL$(N9YG(Ibo1q!siEV9y2n;eI+h>v2EpP%JSw;>2 z_|ri#y)d|tT<90$!0*KVn~O(|j$A*&4W^RU1{apZxol?eom474s7#2v$pKHw@`I`P z62}XJ>UAV6cu<=?$Hqj31r7>?2MQIcVnf3N3rn&i7E5N5Vl1}YbXvWxIs=6kT5T^N zEuhHs9#!&DjW4ZQe(ozab+1}B9Sy~BC;o=E!aarXDJgua=xN_Jo0?pCN2uhWyv=K8 z@}BPBQYP=wVt^?G_Dg~Ng}@OhaAeER6#PSyf2iOek^Cc1&2-h#l9h6LHyy1d7v%{Q z-0kb`_G0_+>X}VX>r)%$tt$q?`9N1O5G({bq(H|;plh@Dc%gS(>K)(cJ-Z$_`=p`0 z(9kC}^yO>&RwsTlc<7Jk3w6Egb-n-gSmF4q((zYU&u%q$tj#=_-Dp0%dghNNOHQil z99=R~RoxJI>$YKb)TN)WI)!|#V! 
zuEqnGPG(|>_(J?zG6iRQxwFzDKi}>L&|g3*2g_}v0@eHw0<@A%mt6{}6_U%GT$f!G zk?w-HA|_D|i(#=Khm1BcIO;~d?1857w4k1OuS&#I**MGb37xRQaF|9?wY>?cdz7dz zghp}%w}^{D0!wQ|_kB8N&d~<320~89^maK5 zrLBiHnqWUqv39NWecDMGBwTzU?yQ3~i#p4xMD7iY0b8d=f>)gXwAEo9%RtP(^BTGS ziKDu3)Ux?jU|q(1pW7jKj-GEdc1BCF?wr0Z!1OD(@#I|E+U?oKn{$`7q2?>F%sVvy zXRvDR+StY~pQ&6vk33;l5P@|oVZKL88Nc&=Mp{b|X8*D{sO4XLsw&+G);wb>TY02c zy8^7d%HYY(@M$+?#gn;Nd2alH%H`iMXC!UA!d`P0Oy{ZhY4MQO{2S&CTD}qtO7Lyx zyyJFx_BciUoJU(DI7QU_h&G<_3n%1g<(m0pMk_6KhZfrQX#L`I^WE1_7b;)8hCArd z3s1?874z<~Vu763Sh38*vfVhLt%317|B8{;Qf1hVYGuanx5t*PdRVPtvf}yw1dngE zYJ05g&a+sV>eb{N+MH<+)FZ*k<*QR54hS4``pWdhv8gF0lNA}=*fGGMnS)7#U52^F zG1nH8DUrF66z8D%nK9RLpiqIqsoJeKC*dv@A(7R;;n9u^z|EB%{WH-uG@B7Wk=?9Z{d}~!w zplvPwd7w)=IsH2U<&5{MgY z$x87Pkdm!LuaI4a_%GX4B`IQ78lTa6=vupgRw_+vCz*+e0$KL*+}n#uF#pA7`8X@v zl-WjF`Dz@^^&?cf#(5reDqku)n*>0y8%b826L=(PJl&U!#t09tY`cB~v^askj3pPaq{`S7_0;f}uyPWkS0NJ6)}U&G z>W`v5_;#MP<+g7=j|fzF6K>3V)UzNJ=-%{4wyK*-CeW(3&E^_ki86bApjR=6SFM|l zree4Qe?!H{82)w^dj<+Uqf*c4ubVb{CQ+qpd}gDnJGZLp?uGA$w*oc!V9!RNw;1Ry z)-@LDx~01AXLd_9f~*e};eTV-vnpq`Yt>c?>dMf`Ez1d*X@z z__h^(Ap<3#+x31C?AWSpy!YnaH`l7x#vWez#hahLxmkO(xUct_)m-aa9e?JaS`VpB zwu1Hf(7;A;uo&!nL`%T~#m0Sw#y+XB50I*>2Bg9;#U|`r?9p$Yqc-kdT``YUdxJ}P+(Nm@7ujUVeb~7I z@()iwL0z?VJGq;VYzUwJfQ?(5MbUoNn3^f%( z5h)ZYgodQh(B|Rug~OMn!)mIszX_lVRzQs^Gt=pKDi zA1>5)N%dWY`u$S<{>|ZYh2dAF;a3a8SES)9>-AUifh+m?D_i>p)nV5zKjeOq{xrSW za6%cdrTUH=2CS+p1iGa__k-rg?#}~fV8@$UKN`6|QV91;;eOn`=-ucV{jl~I&7U@J z)*kuwA}H-m1KX6T$+vnDCKhHs>b~EtoP<4^hx?&<*tdECPo1`{KYny!(=%%9qB7wp z4sXH1NDgK#^x*L4jv=CnCy)?p0XUFi6vzNgNA5JQ)s+46$Dv%;f{hB`=Fn?*fKr6I zE9B^hv?hf^3#4(oLQFTW=IFOT?|#cHdbK9NEp>^WKlTkKP9(j%0*t7e`HZScCRNt_ z(4xs1pzW6%MJs94zl4>kP(F|5xm`hQ)Cwx;#+6K;L{pmh?z0tj%@XF_rj2U+LMd{U zT>dDx(2D;Xt`F<;30fTR`^7^QiPfI1etdc{7pZX5{D`>E+?fO;VcKccT9&q#% zVl3ykZ^=4vkC73J$e~G?AJ9_9@3)7j*>SGIUapu{%-{X=$cHAfZ zUMx8yu6PtgYnLWJXd(im3aCtc21Ro;?jI3QClX!3G|x< zKAQlaH^&-t2Ko6S93e$!BeEaLY!eZhe+_;?U?y_h6gD{z(JcItpe5d$j5H_`ZAZ-T 
zPG~NSvX#?>Lt``|L!JU?vVATt#6^+ke*`)H8b%3>5EaF}@?%U9ZRagaA!Xw6@|U4E z3R$@-Da42b45uNskOh%ib|ghEO-#vVaJ2<31}AY_qzcnz|Gtr!l6U#^nRrX)X6Rys*n(0eIox!GQOg z1wSL9FC!xPBb)tWh5mC=|G7f{q|`sT?w{Q1==tQ0kKZVC9FaPXYz2FY9X(3A*wt6) zIxck`NAb1}K)%#na#Nin+mx-d`i}MUK-;rws<1 z*nBh42J(}q>h_6SCkpOn$=zIV53aihAC3R&;x8{2h9{)q2}LN&Pff26PZy7$R2HS+ zX_h?AA95e1?xza-2Bm$2kA|gvMX+d0hLKO@Gl;IC@?>dVZt%g5+pi z_1s~%>YMJ}ynC}yACc-K5Bh%1iO2|TqV9>Pp|{Sl zjYd01COpZuV$b-1pP@d!E3DXarWIS+Tq?+H;Jl?PNnB8>^A-ba4!O22ca@c!D#>6u z^F!Nb=sAJ_J`~41+Cp{(%`>9T+A&2%?D2nD&c}X8TyFv)?uv0pV+v>nLCRf5$cK&% z>l9yc1V7K=gg$0A`3^@o3NRM%m}%?{$rT1QC&o@a5kfNQAfL}@TqbXHI7N9yt_T3taQN?HU&l&XIU6cWwbli&*-jrHYWjZd~u8FbCvH*4`@OT2FknN;dOix^soiK{( zco(o=fEaGTEe_3*xm|%m!r^2KntJ#&afa&|GhC`6nFW8-;ji8%6$*wO`jF{Uh$ zY=0Y2fxS~kG()L?Bn$VO1cmW=;|mL^C5`PZw^m|wJDPkAFogdF5iqtkVm0{L$*o}c z!|}D3AGL1;kF1VAK_1g8`CI?|$|uo}qwD^`EhNoDQuENG1CPfyn@`ydmtMPF^PCUoz_-4nos?w!ATz8GxL*%36o zH+y%sSXKMX@9?`;&0qius9bT=-2?-6%bWu~In zRp?jIcu%>kcv4D?>lstRL=%%cIQ{X&lK_lYo$p>VsHCKdV7ynt z{2uI(KT>(l-xFG(%)L{AWs+l6X%#D|rDeERV5~%t_3KOS1{xm@;|m%#^8JDowOY^lH^9>eHP?B=o~ zvzSJ$M&N%4qskWYz6#HYY@N9H>e#fRy8ILi%s-f%=2svE_Z06Yc!Gr32I zZr}5UDlf2zcc}+2=s!Yp{etEl&8zR~VtL-*mv{6*OiEfLe@ouqn|Jg={A8y^_ECFb z=#(^csxWjx8oKb`gRlL^w>N{YtvdgwKLm#KeDm3j>T`L|IkXyr@fU^Wf~QULv~6}C zE_9xdI!~;7PLP*LjD!!J@O{!|n*D_@ly4Z{@J-|$6NLB}s_G7z8jmp{Lm$E+{{!z{gs2h$ z`k9xdMO~h*L@Zd-H`JS7Tt8!5_0G_&wB?hPv%Ym}aSj>BsP*2W4SsSAqP@nC_Z>UTuLn zLmMl9r0lhZ6=ZLlrrQ8K=WW+*Qv%jWfq4~nb;Y^jS}~a@cvt1jxpKNDq{4JVt#idB z;^SQs;7yB4vR6Dg&lGjbTj}Yk@m^`K)l+nyy=Ig%ffj&g4#Ljykc}ivD~=lwL&^g7 zHNfx&Y5@NVI_UD|8T~0I6E7F%%j9lFz}6IlMFDIYL~vDIpD5Am;1p?>RD)l0F(nQd za{e^&STAN0D33GX&dnNhl6;*2vb15lfi`7*RjC~Xdi-31V-zZWOjzJthyQGHW`@Jh0T_Huy92;mx)i+Afb$k}V=kGPLm#~W zOKLz+UWhW?L!kbM;JVN9JK%BfdBhTMg3L0unQ^fV3gL!F4YW1MWB*E&KK-i!^?EnWxNn(x^ zP;UV=`^)hZ2K2V-PN^V`ZPLbouROAeB#gis=E1VvQnk509Ug9$z5v?MdoePqMN{8V zFf<|*EhzTZ=P)K}pMtNd^v-4BorR`eW;jLFE4N=NX5SsaH50I2c&uCjG>$g_+w1Te zn8^+5s+Ac4H3Q})xbpl#h!X1kM;bsOg|30W?Nbo_30?l0G=~rWO@?P{#eB1}1VvV1Y}6s3k-I_2fLa*yHT 
zA;cJW<~`LLAJZ(8!U8|lVbp&B?>+dgTZ-XfuwWKL zE#Q(SzDgGr>?+iZNHrsm!y7f@dC$1Q_EC1_??42P6gn~e#~7ip7@j$hci`}37^n?A zM9(BR-ix&;gM%)QbK$43039f@c{#bjBe&)M1x9xv(jGn-hG}93{}XKPCH*x$SIeq1 zm)%M&d7Q$3fbeE8!cCBCjFkfIo={mq_LLWrNj(2JVw7wLp9CPwN5A(ToWix{A%e9o z4O1;I74{vJ_8lzjJ1gxwn|Fju7MF8^-sQyWX22_itEDnuHL;`vXiWX^3~kF zdh6;}PRy1C8{w0y7hyhuCR~VGs%fwg9+kqQ zTY+G)CGsTHTnP0^p}q%~rO+T4+6$q>Qs^)|EeNCiy1D(C*Ire;c=GkdadJ=-j92?DNL94Ptew; zCt!QJc=uw-iWwNMy8d3%-KLVAWF1tXR$mIZmC!;P@R=L4Wa}cGvOkf9&q;~ECk(`| zCCL8EHqcsulPgDwkk4vE0?@vwA)g~DpE9*PUpr8)QY%N4xaEi{9IDWMQB0-;LULsb ze&)lw@kpg1l3Ou9u=BH@0U`nj11a($ETQ6rp3A3R(6}6@feJXN-w5$&jggOhTR_VD zO+^id522I~qmF*t-elg&YQ>Z_040f%N4{JC8zb=%wL5xFln08Kd7;MJ@wMpCNpFlCWveleTn{C>NQgIwZlrA jwke28KFZUYceTD}|9h(Une8W5x+&jy;42Cf1d0C#2Im2k literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_scheme.cpython-312.pyc b/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_scheme.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ae6949237fc54814405bc16f9d4b0aa04d1265d GIT binary patch literal 2420 zcmb7FJ!~9B6rTO{`RubXNeDjyO{^r+A$$iyQH({15`&@?SO|iMPLWo7vvYSF?;kU> zw(qJ4+)+hBG9{w3Qb4MRiWZ3`bkSBwsnT8SAS5c@%gy=6C)}HbPJ2Wb|3ZV0RG)TgXG67NAgTX`$ZIH96MApD>zQ%&+HyAIwz*&~5g`@5*jvd?FUMiRDCg9>Nh2OJ{srmLkdoX) z-d2jq?hXvL5JfG`LoMCYS_U<#<>}P!=$^4@-Y*nKeQN@w;8i)Vz}%vfcTN_Hd%8QX z?3olp&j#IB4_Ti$V!bk$K&WGTlVe|6wf}b}L{ij$F>w>O?FYW-=k}%Aww9e(iN%dD34j_AIP?i)F{^8tK?2U@Y53o7 zn<7k_iJvS8mvyOF*hnvZu(*1OHUq!iO!~mLsJRvdVKa<98aQ-=l2pX38MuAQcyliT zO;WPb5jicJ1t^DzoLtmM`q|5Humce}<1hwqA3!drc;le%oWx94s+`@G z*c||ORw^C}?w2LFU+p6S5I~V5ahdQ*8n_G>j^i6qk_s+qBziu_oirknH`US>n}h@F zMrLvvbTShF=9#?;eCl#86AejX)6i!#igVP-8OR40jtc%oz1LtblkqSaS!Su6WZFh% zwtbhc5EE8M-?8L!yC5#z{ydFT$a}dj3*jgnZp!H4U++OxlvZ+-7I;F5-|-1BRdFR} z4CaC+ytztQL<1<9UYrI{FBH2;68O|>j6qxiinMtY4XYJ3`dX#h3ri=Og} z(XJ0h8I)atqO9N28(9!B6gh)R26GfNIc7AVYf!;?3aJ$B%W=8C@M2>L`!egJ9m=cy 
zN)r4fEZgy6<9KpXm`t*kosx~bLfL6(xJ;PzG-pLkg3&KYasDV$yr6bg+9`_ROkNC% z9YhDg=sT7SgH9RHvKs^dsM`U!M%QByyPh9)Wx{2>2LPoaha2_ba;0vRCXf(Ultv;- zJyj(!At@W*PdO0!7zi$190mVo!eRn1+$elUIpGwi4=x@B#`Ue%=9SA?3|yO zoxjgamFgRyw4E2L%CLH%R5Jw<{u4Bt=$URJweldO`UsGJ%4xNx&4kd{{#6O IL8nao2R7k=g#Z8m literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a16_24.cpython-312.pyc b/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a16_24.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8198602a70cf817c05d49a753939c174e5380a7 GIT binary patch literal 6711 zcmb7JU2Gf2wVqw>lFR=e{aU1CI+iRkj-^;}94D23BgIYZ$g-rQDR0rmio232@h`JW z+agHBEnI*?@4dcyl8xlT4p5*n93T#Gf#g12-?sqwg^UG=Sg3%2ra*O{=*mHXzO-j% zxum4faXO%8&YU@OX6DS!Ip2)_&+T>~ND1z@@&9i_=#QjgCcZ{&uK}@wWF#{Qlw_g| zLufX^Mp-B=2}_cTa+Jp<_@p&zqcooolJ=;b($<6{>5Mv)uBeOFZ3%bM6ZIs$QE$>0 z^(FmLKjjOFK(Zy;07s8H>A zGo8#}MO77fT1lyCtWL$QC`l#4>cR69)0f3}&rV)^dt4ko2Gj@Q#M#NS7cY)o{C*?I zO#gIp?ChvG9^rKVIZ0KnUcC5jeQ3Sw%_~wWr6ewArIa=mlM+g!t@AuiXEV)OS|+5} zf+Fuy1=Dt^pg_W+wwO^=z2|$2_}Q_uM~_csB&;gK$F?{yO3W~mk-KV@?I_rE1t}=X z$Ov}7MP?O?Y*9FwgFhx`Lul42^Ede@uUKWP!p*X>?IvgLNYo|^iZCD8soJ3DTffhf ztuiyq$@ZIE)cH^KbHH4jic8`4fl?Vt*$tz1H5tL9cxGL)7e?^@QzQ65U$}`r|1nhw6SJmp4CWSnyxaqO|zRdna#5w!^zgDZy?-{(SLI0 zdHOZe=$B`SjX`T&Qy$HmYIc+|w=jSZ?u6F*_>JZp&K+a(jd2#yB8phYw|pR)>v%7v zW;2;I))YBxS|xlkoa-12Pc>`Md1|rTaKmunB%I~&j1r!UUsY1!YgZu4s`1oZ*fe;U z3P)_%4`bpM$aK3FPbk#x`T^sdz^R@ZgA+dW-o(VE$?36Car);IV^eVACv?aAQX;F2 zVVuT#=vm_D-kzS)9UqdDO^+ma08UjBv&N|8e9)m)vd>{ifBVglmdp%g;+Z3wgy$6P z$o1@Nub)_WO&LnWXNEG1+Ld%_=xQR797?8TB_S%;l~`6wO-aNk@-^KO4-z*C>kdR9q9qT>lTdkU>-M0IWbQKz0LFI>SZo zU)`0iUMRekeL*NZo6eTC;7HjyvJnVX0+H2$)vhXHjty)KjF$Sxsusq7p4qloT0B)` zaeJORP;lg{pO(AEh>A_8_twd!lef=*<2<;^eeS*QEe)IoJ*Do;j{}n>*JRa#B4?PA z^WZj20O?k&Xjz#$joTkrAsimjq*q6s?Qt_yk-2Y`;6|!%7@uF zL|SnUOLBzAL_~LAQ{r=1G%-V?oX*LTCK0WZBhT5)AMAac40Z{!8|d4f{#AZ`;3wsQpR5m@DG!_}O}w|( 
z^L~+E6M`Gvhwf$9`(H2jzrNmos@#9-Yh|r_nlyE7wDsK1-p$>~-Tmm!M_(yxZKFm0 z-yFvG;fSX)SI62dlt?Hc(88HLPs<-PdAf==#$^-$_%}AE5;28N9 z%|wEL6aKbYFgS!0UNc*ox^CjuRogodTFfB-YX-sxYPRR|O}?`nY)(OFxc|JxbpHjr zM&?0N;|rTNp@`W8WuAlaNZ{O)h;G?J^1S^S5&&`Y0+HCUN)m1Mq|Z1LHp)D}5^G*) zt}k!>%=VyW&4L4Dnn0Z!tZYqr=$xChQhPX`vtB2xhi}X&ZbkWA6S)Y+T_ z7!tcOSkti;hM8=U&jG?KPX#ckZIRdpu|i07q4K61#~ttL>{9mo8#=ys&*}*hF_3dqH8bs{2h#k^87fHCoBlQc>|~ zG8QqL9!N^p#rphcXu=nuK_&itl348QhQi_VSS{?fpy$GxXZ z!hvrCeH;GvmEdx);u}~U{e1lXc-i;HS5nz`y5fs87z<_J+Z*k{m4)Snr#3F&sX9e`Iy~bMd}d_K#F; z$lmtUgW9@QE-YWD_(CvJrTqoa+eY;E9Dd@qbvsJJzG?uue78<7oh~^KZL}SM)wO#n z!O$x6#6qYer)atK)>8qszg%vc{OV|_ZL)OcGLXf$zU>&Ot?c&Y-zmRe_;g{-Km6#( zQw!VaD7v7F@8FJ=pDp{2JoJ}+!w+HUuaTt_&NEjWtaSGntMxgGTs4I3o^>Hy7Q*-T zKkQ!<-lz!fTD|9CXiYe^v);ZYoM_bBS6kPFK|sdtLu--IvTJn1)&GQNdj?8E7fiCP zXT#;cbz$j3sq@AE>iaVCMP$u2^7R<3qRmmXL$_crfF`d2nwaZ4HxrlCT{mWO4k@fI zfzu&3=4Ln60@oa57kJ9G5{a0W~+S0 z8UZeLA?VR;{d9J-OpaKgcHt?w^Oj%v^X_?aAK#N2fDr< zdU@)b4ChXqkE7x?M`h492uX}pSTt@p)tVI&I-p?t?q~Yc{NtVTkXNRMG4^-gy zNk#y_p?i{0hVJX)u&i6DUrpA^g<6@??KQVcL<~|T=(U+wbW52`VzKTNUwV#H+$FsX zI@Jxx=ryyYb3M>k4)hgm6>saU+)}Q{Rs8Mi{)1)z!J@U&x^Lyu@};7E)9br+b?NGV z%-lY3xBpK6y%+8re(XI|k5 znDWc#N<*XN@aSXzSkVfX{gN{XUCGhE&8@k*H@$5|H$CPNW;_xgj}s|)wiQL)A&LM) zvJjy}-6e|f_>ri!Sm2!pzeDz^2Qu9mOGv5;7se}T853lM&k#17O6k^jsRleq7+eBE zGk%Q-5XXV13Dask0ArbO2uPrFq0x*!fT%zsj4Gyzc^h*r2a_Tre)PN#qhnG zu(8*du9F~Bg%S8(aE5K2RU0J*bfE9v)Tj2UomL&_;GwFMk}lK{B2ldBrd1E>3~zfW zInVCr+ba;1K3M)>MO+rE7E+H4ejd6XGU$`%{^u9}{fn;FXMF zz7r7B_ebRV1M2w`I`|#SGVEWi7RUsiVYd-v-?R($??(++a|}~$gkUiWm~Xn vVf)w;I`G7b!o8*5vGv|}%DwMA?!EYU|GOm=dTO0unf6lO$ajbkbXWfY0FVr^ literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a16_nvfp4.cpython-312.pyc b/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a16_nvfp4.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c2c5ed9c9658920f304ff6ef7e069bf8f3da2de GIT binary 
patch literal 4861 zcmcf_OKcm*b(h>Fm*kS7C`*O1$rSX1tbP85Hu&>=t@R`p8DP{f0C#q zy>uYWym|BH&D%HcH~gE|>t>*7+|Lug^)t*jIH@^F(KuOxy-pkbEiBpPfCo5DR0aRIOhTz^Vyi!8AW&q zGu5$aw@!?8?R}kOa@jX> za@x2Xmo>F<9zAj^qo~?DYGPs0Xkw4vQHgcnC-dAH(z$i;k0ry|1G;bN4)|8$ZUcn90?_8o%vtT;g`d;8USLFoVu zbml$bF5&^Mv}aM)r9?WLGo%GAGbd}3j`tL|wH%2s+R{!Cx^){$(U3KA$HVMx--i9?MoQm4VY*E^wmz*I(NR=I8AWaZevD)-Bd z9Y8YFK~y70((#675DU%JBL~1blV?#^cL4qr9n&7ybREfq1g($;Kp9dBTrDnV<++5G zF!C#B4;Snp^&oK3!5Bcqs)O+d2pTfwO}){cOwPbd2j?{ss5|q9blPj#0$!Jt1#fPw zQxQNprAvyD&#KgW&-7eL%gBmK;tpV4Z+Ily*3LiSiq{#Vp@M>Kvj*RH7HkE!X*LBr z%o-+FaKfw>ycouC)fpSe)oe+z4mHPi=M?M;y8?WTdR$A3j7m( z6K@FGeWAc6@s{^B#ZeG8Yx1&;(N$mm4hg&KbWKgPNY+zdZH^#W?G%v8S%>et4v|VI zxau?@6`H8ozC)@u_XGFAB#+hiH^+Ut9B8i8p>NTp`w*#J1$R?wIQMv+S92sc|L;2X z93ri|;Au*mXCUBtW}o3DfeZ7dZ#+YV5);KyPLU0nx-73!5-IAgIUc}d2nZXVl$GO4 zssds26mB<{x@u0=8kwnBLz;1#1`fvVu%FuXT#EX%%t8W!xpXg~7>hc=ctHDV#~*ZX zf__GM2(9r&%B!iYk*9nvkv67ifT#-z9jXKj2Ufj6(4(kbQ`6Kr4?)1VFir#%9AiKA zny4vBok(4nPn+>REE04i(vKY)jwuhp%eFq&v2+g-fKJWF+S9+jw6?VCpHB zfxly;yPd9pd(}}r!nk~8Ay^WE8$+LsYzx!CZ0J&r*>CO^Lz7#BQ(y33dO!DWch2HQ z-ah0m3jI}~v9lPuRGj+pwr~^n_P*3kkRSg3_NTW?{>vatxhGue3AYia*S#uMdm2nP zj(;|=EnNND(^2+}{M|G1G&J?J``G%sYwv#CRUE#&6`J|twJ&ddervn?Cza!8K&lhI zqIevn@&?M{sgiiAI6PgveCsQ5w(4QTQ8UfV?kONJJD7>@L(uh}jKC&q@eL>%Em@1c z4FmuR+Y$>~vO|IW8QYBf3gnR_n&G~dY-En+!e$LjVwtk98I3HFM0?)X>RD_BxbBmy zk*h#14q{oG+L656!o5w*Pd4Il^H}(*HudHA&{HE%8t8XsE3kF>S&UxX>IjqY-2 zq7<4aho(!R>A!YwAHBWG|3a*Yokej794Z*Dh~8iEWq-Kj510Mtz@@kR(XxNMnHy9!JgB8%A2wdo}>r<2_JA zweugqw(hU-0c>lP*2K2)7ocraOf^Cn0Nb#q7OedVyIJFL7t8;!KzTDO5z}DI=sg4U z)O9aImUPH-Ds`CoCtVwt$g8j(b`lhD-!5_;ahMd$gc0VNMnqmg?5oh}_`R4HEQ9h| z&((T97(Lv8_7J9!(~qD7gAp13XeH29>FQmdS(~Z!9WVEtDfOMH^asoR(Ncf30>M7B zmVxzh*LbOG+yu^-y3SWRyCLHH-Brfra+|jrW^aJ$W9%q?U0ZOR%Vv#T|oi|Jf(pLp#QV5YLnXQ0_3UjGYp51HpS8$bAo0fx#CLZEkaRsj*%xbQDP z1a|CoL{+%R-)6t=IP!S;(emRD9(_>d5dl6C8s513k*n%7rvh_wr0Oz3NQ(oQB&!~C zDl)yn9WR1AX*R&0t-izzho8GTc=sNrFbAGu=i`}2GgT+f1O^V(9%;XRv*Zg^aRX#r zcgN#_M+3Vam|2SV(~fvXgV!uRaP){g7q=9HQHuu5N+*&|N18|2( 
dGtgBtvW7Dazh^$sQm-Y}8x0~UYxQ)?R2&XXE9rz6_q^32Liq_7n zDD@4^y~?PNj?&Qjn5~|PT#h-Oq)}ShL~Glifzr`hN)IDj8uT*I#Jq_z-q3_Bl$A2W zx5UQ&G)k%Vz23{iv~p5Z${HwDJ%_{}c;gd&-lf#L`n+8?kMbibgW$_r zC{2BTP^Xr{7~LA4_(k2w1`@{5a`z-kuN+kN(6^7SpYzF;a_7%slp%}7_V7p5Ut_;f zDH?eV|MpoZ^^zaf*tc6JPil>8hWU)&XTHXLC|sJy%sUl2PF;PEcN%=w<0w(orylm3nY~ zY<_HR{;4D0WQu0Pe3BjZUSjAlM|-0Qjt5ihU5@gLKlY}gsVYTgHL{ROk1WuMaP%8< zoj4iKG_qi4kp=TcIb}i+Ajx_&?n&8(KiP72HtD?a2DUr05lBZ8i&P>goVNrJ$ojuVib-8+Zikv=;x`-xO2PEh~G@p$l zbeLkG841`%BUKzEc^L>tISC2522JGhSa?jv#$akyG?1i(s$|Bu zAMym1F5MtJ*9Ru`I#z1!$Z38_Y?-Y^^MGI;$eTyEtR0)yu2R>bH#C1wY}xxa?fz1i zU#eK_MT_sD#aHt5{J^T0)5Fb;J1yt+-_=xS*eJyYQ$t&d4g{$XJT{(Mk z_A=a#)Dec`&}3c~%_KTt1CsG*f~+04%+S8O;mOXYfeDB3=Q{Q|+7QDJ&BG<9t7LU- z0}&Io(xw$#l4(Ab=E)Qsr6{_QNr!|J-O&UhqZ31dxvI5`1au9@Ig;X6Q?zJVmM&ay z0Sr?#8_^-&m4v|A4F9%20ZoDCs<59`DW-Q#pH*dXveu%8T2-%EJ=BoyrBs6?9&wdY ztZM8P3S1g$+^X|r)fAoqzI+wlV^v_Njl8c??#YxIAYXf5d#?iOYlbY|5X^O1!@Y{U zI97wU?~u@?%vsY!on#Gebs$+~7LZD>l-G>=NM*=<;co|hDYTMCTBF*ICia#oJg0+XBzGqOt}5*4E4 z|A-EmbA6A1RaA57xYz`!7X^=qT#i!wBF7#?v$a*GfASc!7Syf<^olHsS~{NMS4C|) zn&6L$4whbsa^MJ}Lod68EIKWsnxPY-ejbiGf9N2Kjvm{C=oX1|l%&VvhvpLjg6&03 zKSJ44C_|#KhfpT1Q@6AX@XE2LfgV^GTFF4H<`+10+|IY!jOzEFI=)QXa!) 
z2KKdwyE{IISe}F*>N>XL!20}o|H(q{(|Loh)O9$g-88h90;kZozce_Bz5_ct%+bGP zZ@KNc1(@i}HxCwULnWL4?kT}Gven|by>x47N3U@-l}%Wyy99p+26pU5hbgBkH)F=8 zqQNT|yc=Eb^=}%E0kf_n6=wfk#}dYstvEv755~j#8+xf7+*z^ zn4ykbDORyVRkI9P@UrV{Bwpu2RZs(1~C9d``H=_X@Qp} ztNTmviuv{k16APM{UlY9<2zMy~1V_K$AQFNUn6lS(DE%A% z4%Io4!`^5I=q2Ykm^Cz74`oosH){QUzt@_fZY732D`~Xa@4ff-@TzL1HXnoy8rlV* zHaL+Pum~C8vWIW8qG7;b)9f_huSIdN$ptn{W$bmHG0*bgrpc(Rg0SUcdVU@viSaPc zMpwwN=-fl$=E?Bm>o5b04&3Aag~T3(Qotp_k*JP`S40(nB6E)YO^fEhzih2kPEQa9%t(89PS=1BhPSl2()M7Ybtf46Q z0`irhawkV|&$>xw&`^oLl}<(B8jz+*K8ltrx+*bLssfqvrGi5tu|3e*kz$h(2-_;y zN`~jr4FFy1!NJcQ*%xWQb==QjBJN!YcuO7K#g4;5$KhhfF`?sFq2qY5_wtaAo_|#Kp+AIVyH+21tNGq{qEX3 zYX#!OCs%jWsy-9y?*q^xY{-|)dx8(m!LLo2S`x9AxW zJR?QVLBVq{cm7eQFL&`jU40w#Z!f*I^t-;_54}HB=s)?uT5w&;P5;T(vvI!Idsyf_ z{5$&hOYbigdQU&NSg>9B%oH3kxYON3 zcPAdCH+>g!Q=3HR7F4Gn^lf%Omz(+{(e-)nk@wQY@w3AC*+TERJmD)@oj0@V**hx_ ztpg?hiK73s;6MG~V8K6K^v?+XnSY=9kC}g+DfnN^+xoVAgZZIn@|Uh|`d(0&mu#-0 ztw*r+6m5RN=Knwo>gj1htzA0?tkqj0EN{4q_JCjy+?^^8PYLj6pAv}a(t$3xiL`9za zAMrBWpw{%@Q-XinaP7$3Y`pra6|B>Vc3uabAlC4%fEKeSQPqGc+uL{)?@@--`cl{d zY|X@1?+aNYc*v$G{yM&)BVt~KbM4y*-D7>VcBQnxwqjNERr;bg zk}764XswgTUmK&8l#xH=SD^Iw4fiU$J2s?ns(rIsg)8W*Z6AFZk-j+9ISc~gBg)vU z@pZJ5ckOF*q?se=k!pDG?{c9jg=*1#{5MA|-B&pznj9}k0Eu*lC2v`Voob{O?SAR_aZP!0i*%xxT%UPKwv2m2DrB);&pim}h4 z3`OB3EEUzs6emq68kG1r`vPi3D9K(y89Lhma}5sVm7yy>j85|+q4?NRKuW1q&~*j9 zU5dm?-7BJ~kH<;mfr>;WI!z|P&E?F%v5Jb`N*K-(PNkSt1r;;>`{LlPes_Tt_rC0K z5`al^_-)J4UvfB0J^o_PaiQmUskOb>Iw-UbmimW^{YQoVqov+JvG;_~dt%3|b2jBB z%T1WAIS=vMmbN=*ZpGeEmz*uRbB|ixx0i1%=ZKQK?YH_j^&9w`rj4m$@3_!AE)V}8 zEc8B;Z#|VGz<=x-z5B|0eZ_<4goEb_oil>Llhfw15E->NByjB#h^`Xhmdc$1(Fui{ zGwU-zDqHkUQ%+yDRZz8P?iS458z(l+PnA4fznyq<;=cOsg@XUbc|RcTiQM@u$Kc&( z^Nw>Lyev4*txuLL_M0!SzkKJ#hnBuAN9*mttw7PyEjYT1jv>JT{Ys6l{a{55If-o#O@DxYVm#uysS>w)vJB6j!!j&f#rL?`-9 zGfxE4F$5XjU2sY=nnHK^Va^np`W*Mlw zgGsVTkq_zJRLYQp#Qig5H?S{No!XA_GdQNRmsM)5bK}sL7<{(X3ZZNo)6U?Jtj%bF zH(yvI)DG<83Uy!#`v$IFWj(4HFqoBDXYO1PntWw603szO>&^D{_8l|SWHon+)<}|J 
z=!o=R0fOPV2rDUM64oa=q}Wt2kqFkpQ0O8e8qpI>5WOeKk#(C?CnIT&PsT}PPmsls zjE`uR+T;fzR*y!XmMFU)aDws!PO$y~Cm4C+0Vh~_z+nxD(d74Kl!nVV7u@q7A4rKR z8By`r591_BSs9OR39{SJ(TGV+bLiS6#r)(@o=fU2OIGK(NOoCzi;JWFXtRlWl7zQO zl9c!0HAF{oAUPsun2-;|N%l3Mfx*K#Wbeyr9LK-XVR+x4G2&Cq|6iEV@SYQ>{#VSC7b-(HY1zMm#WUNB$Km{CZ`r$vil2M>2(4KRL zL(+Ba@F%p|*GHiln z5$4idf`i(WHf78SGi@`cEg5UVN?}Xdma!-76t<=v8E3+oaV1<7x25@vJK?6VJ?+VO z6W)w3;mi0F{!AbdpzV%yFw>N1Vi}H+t#b1dR@WvGlAIH4A}qP&rm1%fRqJ(%O|(Fd zR=H*Bc)itto>YOFv^I_zVI=-jMsmvbCnlYoXrJZc?pKJ!SWVAijd0+ zpp!tv?6eX$YyR`1BHzAr>GGIJ#Eh)UL~Fe;DP}>$hcjYUy@tK&w8#j_&E&VE1joeW zv@E?P3Y6e+Sdj?oY+hEh$hT(qlYw)C7xRO!%&=fdeF^o_Q$;o&!tO0GGLv8>Cc)jn zJvAA7kub|9+0p{6vQ=VFL+zk{Qdoj^$t>F>3;eh_8^cIe*uHPz98R6&V{aoq6p$UfPI8cANjd>~=hHr#$WFz%58 zpPCcFA2bT+J#LnQ(68wS^=kq>_HUy{^A7F$iaEBYj@zXWXb^&v9wAq2qMppa^m@97j!@I1z)aQPI;Uis24A6gfm$1{foeu7ajCdU^ph-x|M)YQh3Yff zZ?j}~1DCbH{fd!nb^27$;H=liN&OY`H|Dz3dY=M^=Md&xo82rEw~xMxLF=pkgYVTn zfHVu;in&v0iZR`LC@~^`Jd+}FjM8Esqbo?wQQsB2lTs(E?YMoh!$$_^!Z)(YOg^6@ zsw~BfO^yx4=EA+PYugdxF3pvP!y=5+9Va+F62hVx3v>sXW#sCW8)KSdQdEQqAtl|_ zoIe)RGx9JYIif{SU*Xz~v9T*xuMfW~T>sOt;cLR>b7Pu$TAqayMwOh8Qxq5fRih9uk>5C?_v{9tb$OhpiA zWDw@Mc5<4%1`C{2M)_MP?lBb~)7J4g^61QhdDR{%M`8=+zjT*dc0uJS$MzRvM;`5Z z<^;@JZs}U>KEBpHRD%DOA!K;z^4>qYc=loPD_7U!>%aZ!=RaMU_^NwoJY{UujFk^=GNp5@jG!w&SU_xxzBXSmce{Gw;1xbMA+iS@tFZkSBi z(d71QI+p-VIQg(X2#PXGk_#vt4Y3kJ>vPgC%B`jdP=~5&(aJLDiH@ zD+;p5NH>&P05(a;K%h&CdGS^%olibenRyRL<1z5fdkY@og&pVTU}q&WhAM@7pMj8qZInapG{%!45h~bV z)R@syuO*9$C!c=XB`O66NZht3j?!?h_83y?^;4^%`%~MK>It0p6>Q%-qPIcrHs`f2 zSF-<_oA(zyibqA8rJvL+IV5w6E%<-K{uMhP$a>Ux1I-Mp;rumcP}Ktl?Yp&-Tn#!M zH2TzQ(5ZUv3bv=b0nZ2PdVRP2ffUga3{&0m)cH6G1DAw`wM3 zCPUDlklj#d7RZW{lbU-RVw^fC$OD=F@vNTT(aa0CiJ*aFvV&s_0>KEOS@r9%W=>_t zbDBw%B;r6`Hx}qeHH)0dtFxK~w!8lb!C*-+S`l=_wDs74W~1ZwAJGCto=7QhwbfOo zmt1T0n%qp#a}hK*(uW1U3lOv>as-QhEa*mCr|&>MsAw$>8NrqaAKrQk7n1rd-KCG# za9(+myF=AUr{%QftDkuNnyv-(S`f1mEJ(w4Tmx zDfNuWlO*sWAXlwRncKIMP4vFa_#VoZ;LP4*HbYEzPqF9Bi#=zH_O5Ss9W8gpmroTt 
zj=%Kvlzs7v#k2?W#FqBYypO!=&8=(AJ*DQJayYv5$%9YIZL!ZrAB`3x`^%x$rR;-j zxw&gI%|mFU{x;nLyZRsJHRbn!#K^M2udcI2y`p6vesaNC4tEtH@eL0x@avsFg8uhM7e^ny``ML8SC&75TsP)~T(=E0ioi5FdX`U? z+K(3PtzWyM<=}zksZwz8*}1QRLq&e57#u2hop_cgb&VAHj&J;X%As&M94WW%E4Oy6 zweByq?gw<;A6kk&h{D>GeLc(Xt{psGf`8xX^-y$a`oZ+3%^dJ-x?KV1g0&K3Iu5Ki zwJd$`-~-CP3z*hI@lq%bY|Wn67F#Q_+snPqowH0=l){!X7S=r09?HEO|&0% z_mrb;%j|0tLWj=Kdg0P0&qU%?T^@e;8~L}>pHHv)k32uLY2qTz1s+oL_TyDL?B+9n zvHjFP9sTE_e;iu%f2hw2bfCjmc*fybv&TyI*yGNXy{q;UWxKnIcdWFn+RyBW7rO?E z$3|A|?|mKWD)YO`(f#FUyxi7T?l`#VvWDS)@(UMD!;|PBjSn+k{B4|sVhQs*gVIn> z)$-zcV}UiYP?nixzFGZhYtNHh5+2(OOAy3ybQ749=qncMAOy`YM>0-ebyk{=RF`wPrtDvL9aQ{x|!ee)VAQYUW!hQAwJC&G;~c(#hj1 z*AJU~fh(AdRO}i<#zBCw0B$j#PtO|DkDKcgCX>KMQcx(U2Ko`|JjgU)b9;B3{96M5 z4!fhn)~{Ag_kXtdvo&u|$=kEsyuz*wyzmYdtFw-qMz4>?!{kHY)|`R>m+P4{LDaN7sa`fX;D$&iD@z? zkwefzu3$5l&1%-mbR5l`h6q81kcO*FBrI^BHLLy(OXp1k9o-~-E$~@~zJTFHR5=ED z!_S$2KiKzVxM=Qt9DIE3^T-SHVX%{3d&^zj<=!K&-7S`(mx1Pr1#0k2!BEA9kez9c zJlyl({fYxIC)0%3sfr6Ro(Xn8R!e~c8`!yWjIsJF9A^nU?*A_astuFTP_fIb#re|P zbpP(+-TR*`ep0bu0}K;tsn{rp<5V0JbTVDtprE7TqA1Vo-dk}~(8Gk=&?Qy86!kHY z*oL2iS52(t#LCg%4}LlL`?Fu3g)6`1#B+56tBTcRIZ>n$r`h?b- zN!^9K5FYcBDFr5^Cq^{gIr+zmno(0}rMLQyPZ633Y3YHs8zGT6YBAQKlb>rw|u7gBFyu?wleEw63o|K$2Ldh)lJgl0djTkaY1 z=g+2e5`D6;jZQd=!T-w>oyY-M151%P=ZAc2xmii%SdfoYkJ)6iOE zOUfFz8huvTma@m~25gfZDJIUOoN=dt+htdZjk76t+@12oJw~5H_NIJsABAWnT1EdN zWv&7vm+YvBv)M{iycTBEiM7c?o4pliu?$qXHPGk+5?t>fff4PC7IQcr znxlC)+#?%U`2Pbsl4`2`z*GmegsJtpAI4PZ-P&viWSKU{}rc5oq){5; zs>7P)4lPh=O0z(3*+>!6tboxKBFzR^*|=bgR+-AkZ3^#G*$O7m8vPNtjm`0-r!uzL zRuF^oo6TTV6e5))ikyn%>br+_ z>W?wl4?J*?R5hdmSBUfwF(7W}`qisH{PDHHSGa3GxjHz+T|RqNcirIS32_kP4A!mV z;xwH98^ojyQ;gYlt0KxHCeOh0j1_Y35u<$fZjYME_GG2(A(h9YqIzg*;zVEX_zAH` zmWF$>)9P3z-E&)(Q#~o-1UPX@OiZX5+#?f@qx2ZdN;$Y}pi5L86{nh>s>i%~Wrrbd z@+jX63ZiyrL6JCs!0B7>q4M7n(|lIvRZxJ+m4eEJO#FqZjg>KPXwK zz-!dH#Zu!fA&bYm!JzPwrQSjlaC_gk4l5$vjzx6>rxR5rB)w1!)g_K@9!*m*`pRQM z@ELLKmV~S*1dCweXY?SPGMAFlT!PQ?!;&ni)47AYZu)uR6Tn0vq45q{^ZD@}!qaBl 
zhR$XtRKwng6}rmSW>{o-Aj@M_B3ES?D){V88ZIkht~#!}0pwE(ClKeZdnOI%4>5@s zM4YHv7Od@rAF-|VSKtWdkyvOHG$+(3xz$sZxodWf6>Ng-fqk*;17>_GISr#|zKYkF@oWC1k>pgy z$`mL`eC@UZzG_gd#7C4Q*1N3;j-B}}md_RTvm@suaV*=)E}Y7D2%N}C1K*i|T34B> zHdLTwr~+-anp3l>^%Z=lX#0Svq(A~hKh5#LwMgs+%1ltBk}W2@wH8<{xMg>a|0zL0 zvu|0&{V_dLqj|AUZLM$>f&vYzdp@SMnqN~tqh@N;9<^=Ds8^sRsxr2VRK91V+F5eH zR=X)VjWoaJ6MTXn(pK~!uvpeMh$7W{hA2{V1(O5BWHSgy&c_^jU}Qp;xw7plV?0=LebvOD?PtqWhPg9*!VD!wMAF%uh%;O^1HM|( zonyShjdGGOm21DQKvEECS0W=B9NEoZUB=E5)+drQH${AIP4>+-AvhatCSwe=oVfO zFtK~x2Jtd6rrX3+R-GoJYOfx|;;5uR-fgZ&GH^_s2PVE%x0f^aIvgf_?WF1;RVS&y z=^-2=)d5nOM>akQ@mtXww}gQ0E*)nkjn!mXl=WJ1WJFA;(ruC3)EeCjsawK)BTPyP zIDRZnOT;9O>4B}c#*rQ1 zKIm77w=px6ZM7Lgy$K9biDea5a-F*=U-z?wFA?6~fqT1yHtJD(G#@>^yuUxsv=tlL z^6jSz4X0+^B|CEa@Al92e=z(N8~M7cx8RR{nS5eFp0*N$_D6ry|7m~T9RV_bG=Jc9 z!FT#$+gH8;$QYciqPH_2J(KS|Tl5~xcb&-}JPSF6XMp4sPFEg?+^A{KC#@g0E}Sbg zcF%f?p^mRYCl+3w^%PsX^5L#L6Z&@Vv0`g%;ideZ6Knox(H|?>Ec?B%a%0OUz7KuH zI*|2XG1OA54{y}5`&@aZb)yM2bmzMVm+Q~ZvPHIjm2EAst@qW{&I<+jXD>W*2k%~- zyOt_3(9=T<3jav$cTijZeJD-TWwgq=e{ZZ!z4wKs~V# z=+J4Sox21{So85h_}0TfK71>G{w9F4m)00hd70Lwjuob_SjG1&HLozIzpig9y7v{s z2a4fXu{l!Qvv-5FH$ZOqK5HaQiE^w2bud#mBYisw2)sN{(&&D+N)n1~MWZS)ov2>C>fx`r9P)Y8#qUvIv=>zL)r`D?Ly+z-^ci{s!GGa-Y=*4-QjZx`A0-{9im zMQS*43p{Rh9|x`qqO0+&QGnV4P8&`ylaVo5S7#G4uPE@09m@!KKlI=M(oLt+x*fij zz!JJu2LFzakTG(=c!*TwigmmB8OPYN5o3(|F^_?ygvJ{@0)TP|T-Cei?_E8MgL!M~ z{hIqjpN5vLhrx~P?I?D36(jqfc!IW5Yr(pb4O%#in);H1Kn$u2eX#%DYb7VaT&S(R z#1e=&pmj2*6h-zxiKc9U`$ztXpjo$6I!g8%l+Cr~tGzolH+A=|xwlF-(gED-8%quY zBJomosF28Rvu_L|z|7B#xMx{F?O^E^rANLZF!)4Og$S;i4qGcymMml4p3M5M~ z^93Wr@Eg=5$d*6HVe%w1lpdcUk~u)3Ar<6|?BggVDLfWQMZRL_HQPQkoO|`!kfDUz zv{PqZHtpv$&cZxUPNb+%L6D=UrwpRnzd`PQpnXqinxg+@w?M^MDf$V5>ZuD+2flXJ z{`}UmbI%?78rzp=AC3p$4B-~ zktonj&>Dr*1~BwXDT1Oh5Fj>eApV(O1qu{sixwzp$slo1fdDBA-Q$t6 zL^+QiU68x8v%9l%yR+ZS?qA(*2ZAJi@Qc4;tGw~ZzQi`xztB}daS$;MX z<&y%I6Dc14l7O`!49EyrqD``pl`ub>#+6PWvn8HRy(?jrSJQm_hLB20@=`*PA||cv 
zyk4cs%)|v*x+GvBDX9|HnqEtbl6*}{%-v9H+}2AH))hO@{n#PJuSF+Mjb6w^BRZkxg3-Y`J>8Ob)+Ksi2A>^?jo)e(FUYK8FQV@V zzIDY0+L-IN&T?YIElbRwwMLpSSuu@MDkh?}Botmz!EEAuB9&QGd4<|Xykc_nvslPU zeK!Qz2Ffux!~>Vh>cY7c#I4vm(i}HI70=g+ihK{&Gl7W`;6} z%=4;%=Op#{<;Afx(SD17>vYZ@B64T+OWhuU>rtuK9NoDAreI`E? zui81U>V`g~e~D73iC>Ber$&i=4`#9&$HS*5RGweyI@0I{jXuXgd8Gle_faVjS~Y** zF7=MDnjdo~c7cgBE0)wnoQhXhgm59z^)mypfwD}5o&NF?(D%qWHpOK{A>Bh5Pzq^& zSPDJ~$1yHsgzE`8p=MVGj#lh&?-L+J0doKq%U0y`<7S|z<75|&L=>SKy$xL0Q8aI~ z>?_skhXv%(J4mam^}PSUq)Vc zJpVUX2e&aw?J%eubH8H!49WrkOkV&icVp?D#RPaFehv#_#H-^0**nd8b!vL%^(&Z| zkLFsU9s*zpyoctI({l-+_@#uX-cU3P9RK(YjZLLAd(DZMML`ubt3Fa=R0=$r)4($j zgo&9(OqlGeW}*0@d6ty@?#qBB{SbC!a5@fdlh=hRyf6z+jF;1bm`Kg7j2(R*pEC!Ub$ZW= zy81o|+zAxJBZctD!|D8$Yx(fVR`{o@Q;%Jp2b!x=ioU*rukYbv*@U`I?;^I#vpTg6 z-5n^@af>a!L8?B{BuvfHL}%uY1Q(9>vN~K=&gxZn#HeZ2)=??VZshetqqm`d*eW$ZnhZ#?GuDe_T(>^2sF}^v+M^L5nLZ{M2<+UnLcrJG|l@A zs#bzTvswkWdPnL3V?=l&E1GiF2j=?~P`had044B!Ac>Zo-GD%bz;~VKMXf@^e4V3B z;d`QXXw_$aL%nqr*;#hqcXb|65vuTAm35_GRgGhBI#Ta`tHw1^ffy7 z82xH{|4sFUJ+K+NgSN-jK0}ts)z|zZh}KnN=`?+Werq=D?dQxsJ)bjr)XHb;WJsAG zsEshFoCEGCs2!}bu8lZy4&xiBjSL#&Y9+q-e_u`K5!yO)&YHGa;y7I7Vq>94{!4ON zF)^ly8FxdW*>$WC6G_ccffJzd5a~xY9>AXT!>36`xDb?52BBAiZ={sPOeT$0Net;| z5;8*KiIw(&P_4DXPDn}!?nEI>0zgd6ArjC`%3>0S2;+Ikv||-T2$~%LFh?3E<|^>j zP=T-Oayq_1p$`*}j>ANTMM`GWtmd8ta8skabUHaQo6`JPnoB6e15qF(7pi>@)OP+y zlhe~v7qOj4u+gC-!cYU6>dl{Z>qjiLR?=U08zfUBRP-Lzcd_L<))S zAk?oW-MM<=Sb__x;agWu9_>w^?l}qKln=oc-bZ^)sOMz<G%t)mTfLr zicFHSxAfe3QudYF`$!omb@h_+IJhQ%?@mL@M*Dhu$rma4PVTg{7h58QmdKvX?Dv!% zs5MZ6Ut8Z^gWc~~wU(QZ-BaX31unE1xI4Pd4MR-r@>N}Tv=EFwaz=OBU)X7G+jw*R z&5uvyd&jrBCO&&7AD;YM^WQuE(y`rqt<)YYb@sy?LC@Mnn8oA2{npxB+nvM3&NGG1 zGmqS7cG`yx361%l?{9_2|2Xxh*Z%O@cJmc7(qbYb!ErZsl>CEt)lYBUyLmTVM$FlP zoq==t{>ie5X*kd9noNzJGBUY6(6v4EyZ+zwe=L2Hyp!B&J6&vx7TTh#uJy~S-V*1o zd{pPd;5K)bew6O(ciK)pTak9p>Z@fAx&1{~cfr-Y`PRc%AG^+#oygTk9lpjaYv$_- zL5Z7Tzl=m|lU_M~1;kUHnFU0s`d_0vBB-{1e5C=n)>8%vFWjq&yI(y`9s zvHrrb{!&Z(MtVIB>|*0^p>dcJrwWayN)63>Ue4zzBhK!in+32)Vuxu=u3C{sav4s+ 
ziw)0f4xWd5_o7T_7th0OTduU2025({(`gx#i;U)s%Yvf7GtZ5*sF@Ne6}JN$6JJL` z4R?`VR(JpcX_{FEB*UjjBe4hk5^dBUf$&AbAulU>=twTa^u>Z^1kjYnkljLMmbFZ5 zHkJ_-pNSBAS|-3Gtqp|l-yHcjF#NkFgR$#rvJ8|5&R8ay&%I5zm)DkWzqj^Y*+Lj# z`OSf{jglM+_iWDGv6t<%>Od!Y%T7wVP-}2wVSSD$q)dE;qWmf|EXRtAOZ zaebAjb2uJ9^vW@~{}lui1Jiy;-b@Ccelr==-%OtBJy=It<;?_Jh@i9IRM4WDkd?v8 zDAlX z8O4@CVV%MSBO3sIPIw`0S0sem6}~_bHy@X^|?> z8JfmrMr1M!92q~A={qgLX z*?F;)yMC@byO=KuS#+ta_2!TOeRJ2!vs}JdUY9v3mlXwVz(gk$u51jf4Vv0+2>F#& z+0-~($ZlK}xN=DXD(DBgkj@*7SFr+G{IC{IYD`rdZo`sW%@&J-*kT%L!z7b=w|Ih| zWyL&i%#n=GNvqki0L$e*k4o$1Wm2K#_+Z~)<+GIjFYzuuC;D+cY1lP z_s2BeAvk!azzU9jXyjc`bHl#}dKhTK%?iG2KHdvscklte19HEM;=5mKL_hRB)TZyw zL+HL+4$#cnzV(`u@492lbZz)jJqS-hV;iUQm)H!o-n<0I2W&CdGzn}&xrQARHL$0{ z_5FWlu&1lg6^bJbIH}NUKz;QC4%is|lWj(6&G4B90diwoK*DN}D-6_{QWa{=XfWk# ztf|r{f)p%;{(+QnNDLC>pGW0RqsqfP+VSuVUZ+QmSwWvJL!vlZVUS<;8})XFsMuj@ zZ*!o`{mFPB@|jklIWKw8Xk7#BdG#|%bQo>z=kxgoKqok6@aZ{X-lzVSmg5GswKDDd z`5guHE`z(312x#H86}`bjB+b~Ys%3Jju);p$I;j zM|jr}KG<5%joIw1$@A;H2rsAR#!#RWz|&hLLA*w0Nz|6GPl&~0fBw6)Tqvi@`SOIE zMKIfmo9i=g&0e1o(qjHBeni1rGILte;VpOH93cwhF`=JwfM9WpRUE{mH0fSMO05m%@bEV z@v3KHFOhyiKlxeh=zGf1_nyU-ql-IbjIc{_L6p#zqB<2=1H{x0tH$llm&ZS0~lc5!Fy5+;nkFpZ-24c7XmpZ85y+39Lv zdM|ZyZ+PONTuUz~>4oa>IW;*}OU^0DIrWXH+8c|?8;k0Ax;Fm4GXDO7%Qoi23MCJG zRCG+m^O;lPb4`&0n@hS2sn&(he<5Gwa@lhBYF>n>;nw(}8}wu5%g~90Px5W5Zm0ZRXd3rwBWFho zHsUz25Uvc7Ap~v^mOx;s^@5Gz5U)ugW+Z9umW1YM@L&VeSQ7L}4HNY?EKn?Ec>(2| zFsAY0vIqY6{R9r>=F%80nWp3>d{!)v8tt2|ilstxO|awvI=L2~9cm5N2%y^53Cc}q zXa`%@kAOwL1(Iz*_BL_#?c2;+m>__zSSvOFVoQ6eM~zWnu1&{VaRI->xB_gtAvEYQ z7Oj;7?79uN4bpu^i`)c}_Or)W3ED8wYnvIuxQ&2e({sqUOvTe07shB4zoDocHuQn9 z90va;dq|&_3TyDt)CT;etV^0*KReAxOTT4!OjnLn2f{0(S)}!X<`YXRd5E038+l${ zl@RPBc-|7quQV`X7zhZ@PP5G~UOIaLVc@a>Y;Z!(#eEe@T*%=0)o z3yI`~1kj=N>F&VSxJc-0B;6W4)D^tU_z~{Y54LohI_RZbiCt&%n~~A#kyAS(r>oAm z8a=jU+jWN2p(M_u>fk8OgWxvrp4>jU8=9(xjw_+#&ppT0KwmA8QUa-J;8>MC25Hmz zZ7uAw!{4wYs=r6|52*e@)qh0wCsluU&7V~K$pf#$>pSSCx+YXMQ5`+~r|`ao>2hz` z!85shHD^q5#vTM89^Z9NAK0nj^d6ghApd^z%T0wnrm{mS^vrhF*kOeoR@wf2k3Hz# 
zV(Y$^Y~)F3*LezhWBoNx^0_CuH!xLw^Zc{$&cMf87l`nP!j3%n^vQ*<*uSdVD0Y~VFZM~09`kkP0OaW!mKqzLqng|MPZXE{`+5?QJXN8%;=2V^hlulPoB}C zy5m)b?=^J4{ugy`b7>&D9|y^%kJQJ-DbK7^k{-%8FAT3UH^YDZ0?q?of1cX37K1H6 z$YTU(4c9FFBlS;=fo*V&h#C#BdoiQj%6Y~VtecR|Bxm^va;;0D_4ANP*7_4!nl#LnSkjPT){_xfQpVMVdGh>0VrJd8iqq0x8;i~w~ikQ zFaM|9HVh&?BxnQSeG@@ul+es`&&=M`6+npl>^&B;ntw>~50S!%;vd=b_kCgeV)G%t z<3GCZvc2h3*om!kf4}%(yv|a+q5GfQ`{W-o+vi?*JMVtF{poIGx)wR1L{2>Sp4gjO zZuI(vP>W0{k*R+heVX`FxO(}+o#~GNXv}@t59w=GiOfFt&Jv89R05Mv5=vmE%Fa{+ zGke4DywX4PIRg5{uzr9!gVF5^09%1@HG1+X|Hr}~3QxIe_~K62Qk7l8cou)q`}@H! z2cItN_RVkEe#7pClMgDj(OG45_L*%ryo3>VK=t%iJwto3)Wh-G*a>Cq#IwY1>@r5y zunE^5&pbKybYge>UCfxMjlHdmyRn$Jo(GooQHkNx!i5!lbe(D3t9V1Z3U1n$PS*7P_A1)|Wy zV$jz>l#1msZA7fg7@?c7eQTWmhq2o}a2hic)p5>Bh3 zcr7%cgeKIU{iHBgtxs*DU#>jAnt1MsOQbENb?P zq#w;H!Zks&>5op>8qgw6KE`}@{UIKGiYdo&g0I+Q*~ybJN$_q-Uqf<-sxwiWubMbk zryzeO@8hgKPgC~JI>Xqy9vuB23bK8RL8v>YZD;5g9o=_tZr{YqR<~gSc!S|&-9d^@ zDxP?-{H3ezB2_ol6Rdkk5u9d!-Ajr-3byF_z3cT3QuR~8*nTH2?yu6LwwZc{rV^>| zT)j5;OJhMDJD$$FbK7%u2d+6O&_&!9zxYt`#p_t0?xx%wcSGBu0}s^ny-8{vxsoUf zIsBd=O=hp=5Lq~~w$a>=-l-Qp3FJY6dU$XR9x_Lr<`F(6s ze;<3P-^n)PHr~e&c68dzPwi-3&Wq9{e&8b|@)$^Nh2#QBu9pOh34ZICP9gvtvS@F; z-Y8^c8sp&5T$_D8dGgGstBGG5^w&*tgATXrsC5^{Q4_*X3rdIC~NfXiZkhox{_R!GjNB(C*4tZ(i8P0y-{z{7xj?@ zMaXu!<`!wbHR`8`R|$#zfRLOrdyB4))@G@Y>k&pmq|Tla6-69Z@AbL5k+dW$7v=cml(s(FcveQHRGZ0T=I%{s92j(0QlP3uLMEc5 zL`10zSb_AdDrJcSPV`fp_;n6eZxbg+ zj@*dnD8N9$X3Nolt;v!lLd+<_+tcmmhepR#e|X5mxOZ{PL7^G9X)ye^y{O-`M1svA*=;dhk*}+dJJkX@B;|hbVikx33G03 zSf?>ttb-tK|NF34OJ;gA@yvcrM3b_%|LV-){>b!UxmSsg_hz!%R65m*h3ZXW2NvY3 za%@IRqh19^3$^#%4MC_kwiapx&5Sj@=2@#-ObJ&aVgxKgi-a>-ofV)9aZ8xpv#r5} zEt*ykUp)`ted4j5a0bhqr^JN{T7?k>2yZ)ESg zBgRyu5QyNZhk^F_mwz!*?l?Nn6xp`&uJAnbH5*s~zHl~3)@D#DR*lYh%xH#9V#O~K zSu#Y8J~{*an>aYrF-27|c@VV#)cv4MAsJ5zF)<^K$CbF2o$c9vZ%8 zyG>e&aL%E5E%KbhN}6+Qj!j_Wf1&|5>;q4<%Oo1ou)i2B(P;J7pAxk-zgyT}p+WH3 zl5+wlgeTfIoT>%|oM?%j1-;KXb54nwJ`8^A#A7+ zS_1pP6HVJ~4df_GLphhWmc4liobr$mY&@)@7rJ9Gi1Uti(yYE7&apmKf)0K 
z3Tj3YHBon(@mq=~byqdSLmy!a*X?G^4$&x#xg5X%PlPCQz#!tQf!(Ow@nyiQxS-J2 zeI_z1qqLAoL*~^T;%r6`r9{ZClQG?MMR*q{3^6IBprpsRx(SbO#=UU?mTy?#ud>@P z7HZZ#a2JWgJwmt=m$WGr;gn`pCD;p}C|eK6gnbqr!r%oAUId_GMHnU9Ixy=DPYlU$_Y=%>S7*DnHNNm>N^C!BJ9rM% z7k(o6(!8z62L2fuEC+(0jeI&%+<&sP|8xOmosaNb$w>V8RnbgUeH<=&oq*{{Nd z@LL7G`(gN%Jm0-)C;T0^`|cdMd1RUQ=6xLx_|7ulz3gvX+;@H7s)O!nt*}I6QyG5E zovUt#-!*TCO*%a#HdtVTOHDUkDzb;5tm1vw1{Ma2xMak2;e9T$+|svP-@N$t^|ycB zmhU+FV9&sB-p%hm`RDq-?)++JvHoJYB~WhdhW!UT3&RzjsB2mrxIR#9kCfVv7ut{C zuRWgUkLPQTFSi`B_|)e29(l0)=pTmuIP&|EV*ML|)@~$pWU88HB`Is|`&(T;l&VuEb*E6FW{sACRClU!`cr!?> z)r;%nInt_5NFqzzBu62Dy#~|z&J0S&;8TTV%Yrz8eHhoM5L|(3%`a6fNYJdb(K+bP z9^T&UpUL&7uvo=K;rm3H<*#uIT!}qcU=QBi`!yReOK$S2&Wy)Jb<+hEU!jcM<8KYm zw|l6lX{#O?BhXo3U}?ZDW-?0F+JA^y-!Xav$j}%76^Bh61q}a37XX{>+0|Zy!+wJ55X&RMuD)6dhi@pE*ixEpD$dNjT@!0U4#o=<7`Y7crc0qE|k>@fS(b6>*>8U zlxKD=)h>4pCg@K~X(feQON zX>&dF)?K^0aP``c7k*r^;Q?@=-Ftzxv*IvNmT2p!IE}80Xbdb)U!SgU2Fepn!4w`b}kE6@=<^Z!E1Sa?Ch}us9wwRcYvYSL~@PdU)d% z-uR}VbH-N)GwC91==z{BviTE0*!%>rX?=JL{?$(a2-j75|953%=1w)dmSfGj!6*-Q zZpFl?LySz1lev-ZjVb}*EN_@HI);0Ov1~RVojq!r_%SpEBqaRRK>!t+B+17tLGJyA z;QvmX_!rUkn4(DPzjhh`Ya^)@0>HmLgsc9OOAnl_@7q`HDU$5Z2M<3caK|{8{{yE{ BFl7J$ literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_wNa16.cpython-312.pyc b/model_executor/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_wNa16.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4dd992677a947223fd152f10ffe4adcf0fcbc88d GIT binary patch literal 7662 zcmb6;TWlLgl0CyWIeb$jCD9V~aztB}VnvQ+ZzRR<-Po3uNWS%Pvp1kPBZ(4U=^4hB zLamKO4v_b5&(Q)0+1MQ701K=F8(8nx`o1oo2i$#-u@;;i=l}=1_u)T!CF6kmxvTEs zh>}g^(Wa-XtE;Q4tGlbJOaI|?+7Xnr@gGtv2N3!*Y3RjN1Ge5~5L!b55|}i~FmZ+< zFq>xMEYyayA!CdiDbAQSWz2Cig-vNo#u~R$*qpXy>~VX>5qHqGCCz1=aVLeXX;;P_ zcV|3tPsSVfW_)p9#vk`*0`UOtx22mh!FZ5CEE3IP=mE$=N<1vsXPJ1j7@Qx{FuT8i zeRFqPg9XQ@Y&-&eCtvLY>|3v5>4hb 
zU5jkY`HVDTjA}iTNT(CiX>lAHzEep#^AmUVZ7O;*~HYM}v-0ZA~p&2@p z%jB^rNun?%W~CgKCX;hwMvSql|MJA8R8~yjONm7hpUGusQnP9(IhV^xBA?0gY1+vz zh&U^zfmKH)fdR*3F)t=$UP>m?qQvL2Jjlf3M0P=nnN{!Ugd~1=>C)wi1Wsf`S;T6~ znYlz3SiDqn@UTL~=nC&g9aCOtwD2uc7zO;CU~1eGdkt<%oO< z%*%^;QBs4isNu)sr;ffEW1ks8v(K#XNfI6F#Z?2g?$YxsqBtXG}w2RL-R6?=>y}avZA9uR}u9K6p)Iw#L0PRv`!y1mRwUu@`u$t+g}CPm?2)6(oLI zgv-De;Lwk&R!PPQS)5%|t=U3`pH9h=YM&*?j+atPqH2>CGjLFGDydo%NjXQZQqru; z2sP{;BDZl%)@&J5WS3ZkiiqDKDl*WjSr_OrLJKIgQZ(FR1vbkR4S=oPYM?R6U|}MKr$y*C_>-(BT&7-I;G!uVLXh^g3j>juVW!wfQ-$LiXtR4;kFG zi^^1Fe+?(L-aqfxDeJX`2<}!bj%{E&(x?lW*L7L z1>$G_2Sv3cHFL6b^D?zZQO!U}Q7nE?NMSKb_cr=(9L71UHdC|3 zPhOpvxboiAxwHJ#-%gyHl=XqNGMNg7`#38averwTWP>jgpw2(S!)?2tmbZ zfu=;xTa=dn{rP~L$q(dH`6F@y&x-Pq8-?R1h8K>D1L@TCKz>o4%Vh^XOs6vgnVcY| zdGUsrEWlM7NE6p04eZ{Gf#i-y;$_Vx4M@}{Ne#OA>*I-|Z}#Wm_VPS+ZTwPuV`JX0 z6U2!0{|TR4sOmwHw)^3GqbtS@Ygh?KSB(GYRGK@Xaw*Y6rRdn&Bp4E1jIo~rboFZZ5*()+{GYd@+Q81F@9%U~b_4KUD!0^zmM zUyW`XkTbR(E^~ub1F|`P>E1RY-{JM)vUjLzp@=P54^quolnXeU+_5zT3x(Pza(>p( zA`@fBpG^S#1qm4p$+RR9@+byVfm9!CAD;m?o=oHu)2VbyUR;XpqtI7L-vJg9F^HgA zGxGQ_G3%;{ZnNsh736#YA_HPp?Wybz9ISQZ6If1>c!iQkYPe3r=S9r#^s5|ziHyVx z#C@yI>tbqljzl#HVw|lor(QS!h#}efD}Q_j-PlF*Ef2oco5$E`P zE|rxf+yOHzbzPO?#$FOb?gL@6(r3g(g3C2guLS{?P4jO;Dx=zKE)ZZ0 z77(dU5(tVyEfiGkwI-o83JEz8V<{azOSqpRj19EHX9(gnL9iO)Z2}uHsa~RZMx+8V z%0rvAwtQ_imD6=bUgh-`)u6Z5S62#1KsnKwC_8;T4OC-B)pec!fOzplhR?$5pR}dj zZTEG4DZd-xz0S`U%LQC8%CU#vlK}edMQ%zzfFE6Sl*ylF}RTEjDgh>#AiKoUSq`>56 zP>MH)R38>+Qxc>yG%-n11Cu9}YOdu*LYNpxOm00MBp-5vFv%_9W8_2fN!qTP7p_A9 zEve1BLZz2NpU}&x7e1+_W*pjVyACXebJwX%>9m+uJ@q563si*O!>EBCs z1=vbfh|+GM?I7yuE%lB*=^iUtJO0vnOlgm;|De=%{Hdo`@x-boL$|AHMa>63bKi3- zO&v<8b?xTrO(hcjZ2aDMDSSu?wyb4Wv)fIME_=z^zTJXC{iXhMPlD%H98Vp=ile>k zXumI4Ub|3+pX0(N=eu+9_Qg_o+@y*@Z(co`A16JNXb8Vf+v)JxSIpH2YCF6cXkNRxdXdt1D#1=5XmUL_Sq^|m&DFpjiuI|=k&!a| zJR^#yv(AwY2-;Wd&wa=f-E2KpZoT%Ux72#AbpFQxu3UQR50Tf*W^;cPnOb~GdzAb- z4^}z{%bkPIEr5g7hdQ3y09rL~yHI3^sKuqUM%J0<1_B)!rS-}s(6shA+HCHtA~UdS 
zYk!%e4bW9P;phP96Z3Weg=00<+vicJx(6`yr{@LMshqq0JAX#Z^L}@#`?X*N(upAF^`ZON*|L)QI8{FxygB=Rj zrL-PWT4PFNKxymSc9=u(8vUh%zN?Zo_X(Fnd*RVn9!Ws2k#L@(_kQMg9y)b=kNJ%4*>4kWHx^$9a{eDX?vPFX) zcT6HY2MqM(I}`bQdQo3IIMBKcC#uB@@R5iZ#L`jYhBMGw>fUGRd)O&tFbNE_cXHs) zN4Gz!xO>a)-u0$O%%hBSrID5>Ok!n}P0fxirlqVG@9=j$}F^Ns!%~%L({(z~Cu@X0ut< ze3{Op8q?s~@EGYMI>s~PL-trTYkx4HtEOI=Zjq)5l6R%aSK^$d;}H1#J^EAMz=Lxo zWBYyo{mIY6PmD()Ez)^V>F81V2cJ9JP4=h0rm6{QxH|q|)j}XEY6{=&UVW!(BP}~z zcxe5g>L4u+`FrlmW#8c~GPru0L1r%`UQE9GZ~PTOwPnx|RqJUM*t-LFZrr|c=jQF3 zRTDwLJi+Fwg@RU^eFrA<}jdiQ;WDcA-{6Qy|mHHFYNo_OeZnY<0NtpHTPN->l zLkCH}^=lyzCO1YsO)+~Da{XFDZqNAs2K;LYIZPg^>ezQ>RLjKnXHqvHvV}i8B~ubC zNDJd=$aeC(vMR`_wA5ewixX|oU`{>Ihyjf_X;?IC(3^+fB`dYSt%8>XuUZOlorb1|RT}|@r;iQ}<1RvBq8(Z_jC;uN8|2eXIs-3f zXHWlVYLZ@)m+p^heM`IMqj&`dl6Qmj&+vi#Aj7<{BBtlhi2E-z`roMQ1e8RIaZ0BEJsd0iM;hBG*Uu=ZF7NTPLvJ|y+8z@ HNA&*zJJT&9 literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py new file mode 100644 index 0000000..571ce26 --- /dev/null +++ b/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py @@ -0,0 +1,392 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable +from typing import Any + +import torch +from compressed_tensors import CompressionFormat, ModelCompressor +from compressed_tensors.quantization import ( + QuantizationArgs, + QuantizationStrategy, + QuantizationType, +) +from compressed_tensors.utils import combine_shards + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.linear import ( + MergedColumnParallelLinear, + QKVParallelLinear, +) +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme, +) +from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 +from 
vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + convert_to_channelwise, + sparse_cutlass_supported, +) +from vllm.model_executor.parameter import ( + BasevLLMParameter, + ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter, +) + +__all__ = ["CompressedTensors24"] + +from vllm.platforms import current_platform + + +class CompressedTensors24(CompressedTensorsScheme): + def __init__( + self, + quantized: bool = False, + weight_quant: QuantizationArgs | None = None, + input_quant: QuantizationArgs | None = None, + model_compression_config: dict[str, Any] | None = None, + ): + self.quantized = quantized + self.weight_quant = weight_quant + self.input_quant = input_quant + model_compressor = ModelCompressor.from_compression_config( + model_compression_config + ) + self.do_sparse_decompress = ( + model_compressor is not None + and model_compressor.sparsity_config.format + == CompressionFormat.sparse_24_bitmask.value + ) + if self.do_sparse_decompress: + self.model_compressor = model_compressor + + if ( + quantized + and input_quant is not None + and self._get_quant_dtype() == current_platform.fp8_dtype() + ): + static = not input_quant.dynamic + g_shape = GroupShape.PER_TENSOR if static else GroupShape.PER_TOKEN + self.quant_fp8 = QuantFP8(static, g_shape) + + @classmethod + def get_min_capability(cls) -> int: + # Only cutlass 3.x kernels are implemented so far + return 90 + + def create_weights( + self, + layer: torch.nn.Module, + input_size: int, + output_partition_sizes: list[int], + input_size_per_partition: int, + params_dtype: torch.dtype, + weight_loader: Callable, + **kwargs, + ): + if not sparse_cutlass_supported(): + raise ValueError( + "Sparse CUTLASS not supported. 
vLLM must be built with " + "CUDA 12.2 or later to use this feature" + ) + + layer.logical_widths = output_partition_sizes + layer.input_size = input_size + layer.input_size_per_partition = input_size_per_partition + self.weights_dtype: torch.dtype = self._get_params_dtype(params_dtype) + + # parameter to store uncompressed weight + weight = ModelWeightParameter( + data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition, + dtype=self.weights_dtype, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + if self.do_sparse_decompress: + assert all( + partition_size % 8 == 0 for partition_size in output_partition_sizes + ), "All partitions must be divisible by 8 for " + "2:4 sparse compressed models" + + shape = BasevLLMParameter( + data=torch.empty(2, 1, dtype=torch.int64), + weight_loader=weight_loader, + ) + compressed_weight = ModelWeightParameter( + data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition // 2, + dtype=self.weights_dtype, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + + bitmask = ModelWeightParameter( + data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition // 8, + dtype=torch.uint8, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + + layer.register_parameter("shape", shape) + layer.register_parameter("compressed", compressed_weight) + layer.register_parameter("bitmask", bitmask) + + # Check if quantized, not just 2:4 Sparse + if self.quantized: + if ( + self.weight_quant + and self.weight_quant.strategy == QuantizationStrategy.CHANNEL.value + ): + weight_scale = ChannelQuantScaleParameter( + data=torch.empty( + (sum(output_partition_sizes), 1), dtype=torch.float32 + ), + output_dim=0, + weight_loader=weight_loader, + ) + else: + assert ( + self.weight_quant + and self.weight_quant.strategy == QuantizationStrategy.TENSOR.value + ) + weight_scale = PerTensorScaleParameter( + data=torch.empty(len(output_partition_sizes), 
dtype=torch.float32), + weight_loader=weight_loader, + ) + + layer.register_parameter("weight_scale", weight_scale) + + # input quant will be non-none + if self.input_quant and not self.input_quant.dynamic: + # register input quant scale + assert self.input_quant.strategy == QuantizationStrategy.TENSOR.value + input_scale = BasevLLMParameter( + data=torch.empty(1, dtype=torch.float32), + weight_loader=weight_loader, + ) + + layer.register_parameter("input_scale", input_scale) + + else: + # for sparse-only, pass in 1 for weight/input scales + weight_scale = torch.nn.Parameter( + data=torch.ones(1, dtype=torch.float32), requires_grad=False + ) + input_scale = torch.nn.Parameter( + data=torch.ones(1, dtype=torch.float32), requires_grad=False + ) + layer.register_parameter("input_scale", input_scale) + layer.register_parameter("weight_scale", weight_scale) + + layer.register_parameter("weight", weight) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + """ + Compress weights after loading. 
Store compressed weight and meta + tensor + + :post-condition: layer.w_compressed and layer.meta are + set to the compressed weight and meta tensor in the + format expected by the Cutlass kernels + :param layer: The layer with the weights to be processed + + """ + if self.do_sparse_decompress: + layer.weight.data = self._decompress_bitmask_compressed_weight( + compressed=layer.compressed, + bitmask=layer.bitmask, + layer=layer, + ) + + # compressed and bitmask tensors + # are no longer needed after decompression + del layer.compressed + del layer.bitmask + + # torch.compile workaround + if hasattr(layer, "input_scale"): + layer.input_scale = torch.nn.Parameter( + layer.input_scale.data, requires_grad=False + ) + + if self.weight_quant: + if self.weight_quant.strategy == QuantizationStrategy.TENSOR.value: + layer.weight_scale = torch.nn.Parameter( + convert_to_channelwise( + weight_scale=layer.weight_scale, + logical_widths=layer.logical_widths, + ), + requires_grad=False, + ) + else: + # torch.compile workaround + layer.weight_scale = torch.nn.Parameter( + layer.weight_scale.data, requires_grad=False + ) + + # Set all negative zero values to 0 prior to compression + if layer.weight.dtype.is_floating_point and layer.weight.dtype.itemsize >= 2: + layer.weight.data[layer.weight.data == -0.0] = 0.0 + + w_compressed, meta = ops.cutlass_sparse_compress(layer.weight.data) + layer.weight = torch.nn.Parameter(w_compressed, requires_grad=False) + layer.meta = torch.nn.Parameter(meta, requires_grad=False) + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + """ + Returns the output tensor for the layer with 2:4 + sparse compressed weights, given the input tensor + and bias + + :param layer: The layer with 2:4 sparse compressed + weights to be used for the computation + :param x: The input tensor to the layer + :param bias: The bias to be added to the output tensor + :return: The output tensor of 
the layer + """ + if self.quantized: + scale = getattr(layer, "input_scale", None) + + if self.weights_dtype == torch.int8: + ops_output = ops.scaled_int8_quant(x, scale=scale) + q_input = ops_output[0] + input_scale = ops_output[1] + else: + assert self.weights_dtype == torch.float8_e4m3fn + q_input, input_scale = self.quant_fp8(x, scale=scale) + + else: + # Not quantized, nothing to do with the input_scales, use as is + input_scale = layer.input_scale + q_input = x + + out = ops.cutlass_scaled_sparse_mm( + a=q_input, + bt_nzs=layer.weight, + bt_meta=layer.meta, + scale_a=input_scale, + scale_b=layer.weight_scale, + out_dtype=x.dtype, + bias=bias, + ) + + assert out.is_contiguous() + return out + + def _get_params_dtype(self, params_dtype: torch.dtype) -> torch.dtype: + if not self.quantized: + return params_dtype + return self._get_quant_dtype() + + def _get_quant_dtype(self) -> torch.dtype: + assert self.quantized + assert self.weight_quant is not None + assert self.input_quant is not None + + is_8_bits = self.weight_quant.num_bits == self.input_quant.num_bits == 8 + + if not is_8_bits: + raise ValueError("Cutlass only supports 8-bit quantization") + + if ( + self.weight_quant.type == QuantizationType.FLOAT + and self.input_quant.type == QuantizationType.FLOAT + ): + return torch.float8_e4m3fn + + if ( + self.weight_quant.type == QuantizationType.INT + and self.input_quant.type == QuantizationType.INT + ): + return torch.int8 + + raise ValueError("Quantization type not supported by Cutlass") + + def _decompress_bitmask_compressed_weight( + self, + compressed: torch.Tensor, + bitmask: torch.Tensor, + layer: torch.nn.Module, + ) -> torch.Tensor: + """ + Decompress a compressed 2:4 sparse weight tensor using the bitmask and + return the result. + + This function also supports sharded decompression. + + :param compressed: The 2:4 sparse weight tensor compressed using the + sparse-24-bitmask compressor. 
This is different from + `cutlass_sparse_compress` which uses a different scheme (2 bits for + every nonzero element that represent the coordinate within the block + of 4). The bitmask compression here uses a bitmask to indicate the + positions of non-zero elements. + :param bitmask: The 2:4 bitmask associated with the compressed weights, + representing the positions of non-zero elements in the compressed + tensor. + :param layer: The layer whose weights need to be processed after + loading. + :return: The decompressed 2:4 sparse weight tensor. + """ + + sparsity_compressor = self.model_compressor.sparsity_compressor + + def _process_split( + bitmask_compressed_weight: torch.Tensor, + shape, + bitmask: torch.Tensor, + ) -> torch.Tensor: + weight_data = dict( + compressed=bitmask_compressed_weight, + shape=shape, + bitmask=bitmask, + ) + return sparsity_compressor.decompress_weight(weight_data) + + split_weights: list[torch.Tensor] = [] + split_bitmask: list[torch.Tensor] = [] + split_shape: list[tuple[int, int]] = [] + + if isinstance(layer, (QKVParallelLinear, MergedColumnParallelLinear)): + split_weights = torch.split(compressed, layer.logical_widths) + split_bitmask = torch.split(bitmask, layer.logical_widths) + split_shape = [ + (out, layer.input_size_per_partition) for out in layer.logical_widths + ] + + if split_weights: + decompressed_shards = [ + _process_split(compressed_weight, shape, bitmask) + for compressed_weight, shape, bitmask in zip( + split_weights, split_shape, split_bitmask + ) + ] + decompressed = combine_shards(decompressed_shards) + else: + decompressed = sparsity_compressor.decompress_weight( + dict( + compressed=compressed, + shape=( + layer.logical_widths[0], + layer.input_size_per_partition, + ), + bitmask=bitmask, + ) + ) + return decompressed diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py b/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py 
new file mode 100644 index 0000000..a7f9076 --- /dev/null +++ b/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from abc import ABC, abstractmethod + +import torch + +__all__ = ["CompressedTensorsScheme"] + + +class CompressedTensorsScheme(ABC): + """ + Abstract class used to describe the weight creation and forward pass + of different quantization schemes supported by CompressedTensors. + """ + + @classmethod + @abstractmethod + def get_min_capability(cls) -> int: + """ + Get minimum device capability. + """ + raise NotImplementedError + + @abstractmethod + def create_weights(self, *args, **kwargs): + """ + Weight creation for the particular scheme. Inputs to this function + + """ + raise NotImplementedError + + @abstractmethod + def apply_weights( + self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None + ): + """ + Run the forward pass for the particular scheme. This is where + scheme-specific dequant/quant steps/kernels should be applied. + + :param layer: torch.nn.Module with the registered weights and + other parameters relevant to the particular scheme. + :param x: input to the layer + :param bias: bias parameter + + """ + raise NotImplementedError + + @abstractmethod + def process_weights_after_loading(self, layer: torch.nn.Module): + """ + Called after weight loading is complete for any cleanup that + needs to occur. 
+ """ + raise NotImplementedError diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py new file mode 100644 index 0000000..dd0f4b3 --- /dev/null +++ b/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py @@ -0,0 +1,176 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable + +import torch +from torch.nn import Parameter + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme, +) +from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( + GPTQ_MARLIN_24_MAX_PARALLEL, + GPTQ_MARLIN_24_MIN_THREAD_N, +) +from vllm.model_executor.parameter import ( + BasevLLMParameter, + ChannelQuantScaleParameter, + GroupQuantScaleParameter, + PackedvLLMParameter, +) +from vllm.scalar_type import scalar_types + +__all__ = ["CompressedTensorsW4A16Sparse24"] +W4A16SPARSE24_SUPPORTED_TYPES_MAP = { + 4: scalar_types.uint4b8, +} +W4A16SPARSE24_SUPPORTED_BITS = list(W4A16SPARSE24_SUPPORTED_TYPES_MAP.keys()) + + +class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme): + def __init__(self, strategy: str, num_bits: int, group_size: int | None = None): + self.strategy = strategy + self.group_size = group_size + self.tile_size = 16 + + if num_bits not in W4A16SPARSE24_SUPPORTED_TYPES_MAP: + raise ValueError( + f"Unsupported num_bits = {num_bits}. 
" + f"Supported num_bits = {W4A16SPARSE24_SUPPORTED_BITS}" + ) + + self.quant_type = W4A16SPARSE24_SUPPORTED_TYPES_MAP[num_bits] + + if self.strategy == "group" and self.group_size is None: + raise ValueError("group_size must be given when using strategy group") + + @classmethod + def get_min_capability(cls) -> int: + # ampere + up + return 80 + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + # required by torch.compile to be torch.nn.Parameter + layer.weight_packed = Parameter(layer.weight_packed.data, requires_grad=False) + layer.scale_packed = Parameter(layer.scale_packed.data, requires_grad=False) + layer.meta = Parameter(layer.meta.data, requires_grad=False) + + def create_weights( + self, + layer: torch.nn.Module, + input_size: int, + output_partition_sizes: list[int], + input_size_per_partition: int, + params_dtype: torch.dtype, + weight_loader: Callable, + **kwargs, + ): + assert params_dtype == torch.float16, ( + "float16 is required for marlin24 compressed models. 
Set dtype=torch.float16" # noqa: E501 + ) + + pack_factor = 32 // self.quant_type.size_bits + output_size_per_partition = sum(output_partition_sizes) + + qweight = PackedvLLMParameter( + data=torch.empty( + input_size_per_partition // self.tile_size // 2, + output_size_per_partition * self.tile_size // pack_factor, + dtype=torch.int32, + ), + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=pack_factor, + marlin_tile_size=self.tile_size, + weight_loader=weight_loader, + ) + + input_groups = ( + 1 + if self.group_size is None + else input_size_per_partition // self.group_size + ) + + weight_scale_args = { + "data": torch.empty( + input_groups, + output_size_per_partition, + dtype=params_dtype, + ), + "weight_loader": weight_loader, + } + + if self.group_size is not None: + scales = GroupQuantScaleParameter( + output_dim=1, input_dim=0, **weight_scale_args + ) + else: + scales = ChannelQuantScaleParameter(output_dim=1, **weight_scale_args) + + weight_shape = BasevLLMParameter( + data=torch.empty(2, dtype=torch.int64), weight_loader=weight_loader + ) + + meta = PackedvLLMParameter( + data=torch.empty( + input_size_per_partition // 8 // 2 // 2, + output_size_per_partition * 2, + dtype=torch.int16, + ), + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=1, + marlin_tile_size=2, + weight_loader=weight_loader, + ) + + layer.register_parameter("weight_packed", qweight) + layer.register_parameter("weight_shape", weight_shape) + layer.register_parameter("scale_packed", scales) + layer.register_parameter("meta", meta) + + max_workspace_size = ( + output_size_per_partition // GPTQ_MARLIN_24_MIN_THREAD_N + ) * GPTQ_MARLIN_24_MAX_PARALLEL + + workspace = Parameter( + torch.zeros(max_workspace_size, dtype=torch.int), requires_grad=False + ) + layer.workspace = workspace + + def apply_weights( + self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None + ) -> torch.Tensor: + qweight = layer.weight_packed + meta = layer.meta + scales = 
layer.scale_packed + workspace = layer.workspace + + x_2d = x.view(-1, x.shape[-1]) + + size_m = x_2d.shape[0] + size_k = x_2d.shape[1] + size_n = scales.shape[1] + + output_2d = ops.gptq_marlin_24_gemm( + x_2d, + qweight, + meta, + scales, + workspace, + self.quant_type, + size_m, + size_n, + size_k, + ) + + output = output_2d.view(x.shape[:-1] + (output_2d.shape[1],)) + + if bias is not None: + output.add_(bias) # In-place add + + return output diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py b/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py new file mode 100644 index 0000000..3afadc6 --- /dev/null +++ b/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py @@ -0,0 +1,124 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable + +import torch +from torch.nn.parameter import Parameter + +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme, +) +from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( + apply_fp4_marlin_linear, + prepare_fp4_layer_for_marlin, +) +from vllm.model_executor.parameter import ( + GroupQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter, +) + +__all__ = ["CompressedTensorsW4A16Fp4"] + + +class CompressedTensorsW4A16Fp4(CompressedTensorsScheme): + def __init__(self, has_input_global_scale: bool = False): + self.has_input_global_scale = has_input_global_scale + self.group_size = 16 + + @classmethod + def get_min_capability(cls) -> int: + # dont restrict as emulations + return 80 + + def create_weights( + self, + layer: torch.nn.Module, + output_partition_sizes: list[int], + input_size_per_partition: int, + params_dtype: torch.dtype, + weight_loader: Callable, + **kwargs, + ): + output_size_per_partition = 
sum(output_partition_sizes) + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + + # Weight + weight = ModelWeightParameter( + data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition // 2, + dtype=torch.uint8, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight_packed", weight) + + # Global Weight Scale + weight_global_scale = PerTensorScaleParameter( + data=torch.empty(len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader, + ) + layer.register_parameter("weight_global_scale", weight_global_scale) + + # Per Group Weight Scale + weight_scale = GroupQuantScaleParameter( + data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition // self.group_size, + dtype=torch.float8_e4m3fn, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + + layer.register_parameter("weight_scale", weight_scale) + + if self.has_input_global_scale: + input_global_scale = PerTensorScaleParameter( + data=torch.empty(len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader, + ) + layer.register_parameter("input_global_scale", input_global_scale) + + def process_weights_after_loading(self, layer) -> None: + # Process parameters for marlin repacking + + # Rename weight_packed to weight that marlin expects + layer.weight = Parameter(layer.weight_packed.data, requires_grad=False) + del layer.weight_packed + # Rename weight_global_scale to weight_scale_2 that marlin expects + # Note: ct stores the inverse of what is expected by the marlin kernel + layer.weight_scale_2 = Parameter( + 1 / layer.weight_global_scale.max().to(torch.float32), requires_grad=False + ) + del layer.weight_global_scale + + if self.has_input_global_scale: + layer.input_global_scale = torch.nn.Parameter( + layer.input_global_scale.data, requires_grad=False 
+ ) + + prepare_fp4_layer_for_marlin(layer) + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + return apply_fp4_marlin_linear( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + weight_scale_2=layer.weight_scale_2, + workspace=layer.workspace, + size_n=layer.output_size_per_partition, + size_k=layer.input_size_per_partition, + bias=bias, + ) diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py new file mode 100644 index 0000000..b603bdb --- /dev/null +++ b/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py @@ -0,0 +1,218 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable + +import torch +from torch.nn.parameter import Parameter + +import vllm.envs as envs +from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme, +) +from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import ( # noqa: E501 + run_nvfp4_emulations, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + cutlass_fp4_supported, + swizzle_blockscale, +) +from vllm.model_executor.parameter import ( + GroupQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter, +) +from vllm.utils.flashinfer import flashinfer_scaled_fp4_mm, has_flashinfer + +logger = init_logger(__name__) + +__all__ = ["CompressedTensorsW4A4Fp4"] + + +class CompressedTensorsW4A4Fp4(CompressedTensorsScheme): + def __init__(self): + self.backend = "none" + if envs.VLLM_NVFP4_GEMM_BACKEND is None: + if has_flashinfer(): + self.backend = 
"flashinfer-cutlass" + elif cutlass_fp4_supported(): + self.backend = "cutlass" + elif envs.VLLM_USE_FBGEMM: + self.backend = "fbgemm" + try: + import fbgemm_gpu # noqa: F401 + except ImportError as exc: + raise ImportError( + "Backend fbgemm requires fbgemm.f4f4bf16 operator, " + "Please install with: pip install fbgemm-gpu-genai" + ) from exc + elif envs.VLLM_NVFP4_GEMM_BACKEND.startswith("flashinfer-"): + self.backend = envs.VLLM_NVFP4_GEMM_BACKEND + assert has_flashinfer(), f"FlashInfer is required for {self.backend}" + elif envs.VLLM_NVFP4_GEMM_BACKEND == "cutlass": + self.backend = "cutlass" + assert cutlass_fp4_supported(), f"Cutlass is required for {self.backend}" + + if self.backend == "none": + raise ValueError( + "No valid NVFP4 GEMM backend found. " + "Please check your platform capability." + ) + + logger.info_once(f"Using {self.backend} for NVFP4 GEMM") + self.group_size = 16 + + @classmethod + def get_min_capability(cls) -> int: + if envs.VLLM_USE_NVFP4_CT_EMULATIONS: + return 80 + return 100 + + def create_weights( + self, + layer: torch.nn.Module, + output_partition_sizes: list[int], + input_size_per_partition: int, + params_dtype: torch.dtype, + weight_loader: Callable, + **kwargs, + ): + output_size_per_partition = sum(output_partition_sizes) + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + + # Weight + weight = ModelWeightParameter( + data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition // 2, + dtype=torch.uint8, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight_packed", weight) + + # Global Weight Scale + weight_global_scale = PerTensorScaleParameter( + data=torch.empty(len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader, + ) + layer.register_parameter("weight_global_scale", weight_global_scale) + + # Per Group Weight 
Scale + weight_scale = GroupQuantScaleParameter( + data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition // self.group_size, + dtype=torch.float8_e4m3fn, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + + layer.register_parameter("weight_scale", weight_scale) + + input_global_scale = PerTensorScaleParameter( + data=torch.empty(len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader, + ) + layer.register_parameter("input_global_scale", input_global_scale) + + def process_weights_after_loading(self, layer) -> None: + global_input_scale = layer.input_global_scale.max().to(torch.float32) + layer.input_global_scale = Parameter(global_input_scale, requires_grad=False) + + layer.weight_global_scale = Parameter( + layer.weight_global_scale.max().to(torch.float32), requires_grad=False + ) + + if self.backend == "flashinfer-trtllm": + # FlashInfer TRTLLM FP4 GEMM requires a different weight layout. + # FlashInfer provides nvfp4_quantize to quantize + shuffle the + # layout but we use our own quantization so we have to call + # shuffles ourselves. 
+ from flashinfer import shuffle_matrix_a, shuffle_matrix_sf_a + + weight = layer.weight_packed.data + weight_scale = layer.weight_scale.data + + epilogue_tile_m = 128 + weight = shuffle_matrix_a(weight.view(torch.uint8), epilogue_tile_m) + weight_scale = ( + shuffle_matrix_sf_a(weight_scale.view(torch.uint8), epilogue_tile_m) + .reshape(weight_scale.shape) + .view(torch.float8_e4m3fn) + ) + + layer.weight_scale = Parameter(weight_scale, requires_grad=False) + layer.weight_packed = Parameter(weight, requires_grad=False) + else: + swizzled_weight_scale = swizzle_blockscale(layer.weight_scale) + if self.backend == "fbgemm": + swizzled_weight_scale = swizzled_weight_scale.view(-1).view(torch.uint8) + layer.weight_scale = Parameter(swizzled_weight_scale, requires_grad=False) + layer.weight_packed = Parameter( + layer.weight_packed.data, requires_grad=False + ) + + layer.alpha = Parameter( + 1 / (layer.input_global_scale * layer.weight_global_scale), + requires_grad=False, + ) + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + if envs.VLLM_USE_NVFP4_CT_EMULATIONS: + out = run_nvfp4_emulations( + x=x, + input_global_scale=layer.input_global_scale, + weight=layer.weight_packed, + weight_scale_swizzled=layer.weight_scale, + weight_global_scale=layer.weight_global_scale, + ) + if bias is not None: + out = out + bias + return out + + output_dtype = x.dtype + output_shape = [x.shape[0], layer.weight_packed.shape[0]] + + # quantize BF16 or FP16 to (FP4 and interleaved block scale) + x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_global_scale) + + mm_args = ( + x_fp4, + layer.weight_packed, + x_blockscale, + layer.weight_scale, + layer.alpha, + output_dtype, + ) + if self.backend.startswith("flashinfer-"): + backend_name = self.backend[len("flashinfer-") :] + out = flashinfer_scaled_fp4_mm(*mm_args, backend=backend_name) + elif self.backend == "fbgemm": + out = torch.ops.fbgemm.f4f4bf16( 
+ x_fp4, + layer.weight_packed, + x_blockscale.view(-1).view(torch.uint8), + layer.weight_scale, + layer.alpha, + use_mx=False, + ).to(output_dtype) + else: + assert self.backend == "cutlass" + out = cutlass_scaled_fp4_mm(*mm_args) + + if bias is not None: + out = out + bias + return out.view(*output_shape) diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py b/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py new file mode 100644 index 0000000..a23961e --- /dev/null +++ b/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py @@ -0,0 +1,183 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable + +import torch +from compressed_tensors.quantization import ActivationOrdering + +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme, +) +from vllm.model_executor.layers.quantization.kernels.mixed_precision import ( + MPLinearLayerConfig, + choose_mp_linear_kernel, +) +from vllm.model_executor.layers.quantization.utils.marlin_utils import ( + marlin_repeat_scales_on_all_ranks, +) +from vllm.model_executor.parameter import ( + BasevLLMParameter, + ChannelQuantScaleParameter, + GroupQuantScaleParameter, + PackedvLLMParameter, +) +from vllm.scalar_type import scalar_types + +logger = init_logger(__name__) + +__all__ = ["CompressedTensorsW4A8Fp8"] +W4A8_SUPPORTED_TYPES_MAP = { + 4: scalar_types.int4, +} +W4A8_SUPPORTED_BITS = list(W4A8_SUPPORTED_TYPES_MAP.keys()) + + +class CompressedTensorsW4A8Fp8(CompressedTensorsScheme): + _kernel_backends_being_used: set[str] = set() + + def __init__( + self, + strategy: str, + num_bits: int, + group_size: int | None = None, + symmetric: bool | None = True, + actorder: ActivationOrdering | None = None, + ): + 
self.pack_factor = 32 // num_bits + self.strategy = strategy + self.symmetric = symmetric + self.group_size = -1 if group_size is None else group_size + self.has_g_idx = actorder == ActivationOrdering.GROUP + + if self.group_size != 128 or self.strategy != "group": + raise ValueError( + "W4A8 kernels require group quantization with group size 128" + ) + + if num_bits not in W4A8_SUPPORTED_TYPES_MAP: + raise ValueError( + f"Unsupported num_bits = {num_bits}. " + f"Supported num_bits = {W4A8_SUPPORTED_TYPES_MAP.keys()}" + ) + + self.quant_type = W4A8_SUPPORTED_TYPES_MAP[num_bits] + + @classmethod + def get_min_capability(cls) -> int: + # hopper + return 90 + + def create_weights( + self, + layer: torch.nn.Module, + output_size: int, + input_size: int, + output_partition_sizes: list[int], + input_size_per_partition: int, + params_dtype: torch.dtype, + weight_loader: Callable, + **kwargs, + ): + output_size_per_partition = sum(output_partition_sizes) + + mp_linear_kernel_config = MPLinearLayerConfig( + full_weight_shape=(input_size, output_size), + partition_weight_shape=( + input_size_per_partition, + output_size_per_partition, + ), + weight_type=self.quant_type, + act_type=torch.float8_e4m3fn, # always use fp8(e4m3) + group_size=self.group_size, + zero_points=not self.symmetric, + has_g_idx=self.has_g_idx, + out_type=params_dtype, + ) + + kernel_type = choose_mp_linear_kernel(mp_linear_kernel_config) + + if kernel_type.__name__ not in self._kernel_backends_being_used: + logger.info("Using %s for CompressedTensorsW4A8Fp8", kernel_type.__name__) + self._kernel_backends_being_used.add(kernel_type.__name__) + + # If group_size is -1, we are in channelwise case. 
+ group_size = self.group_size if self.group_size != -1 else input_size + row_parallel = input_size != input_size_per_partition + partition_scales = not marlin_repeat_scales_on_all_ranks( + self.has_g_idx, self.group_size, row_parallel + ) + + scales_and_zp_size = input_size // group_size + + if partition_scales: + assert input_size_per_partition % group_size == 0 + scales_and_zp_size = input_size_per_partition // group_size + + weight = PackedvLLMParameter( + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + packed_factor=self.pack_factor, + packed_dim=1, + data=torch.empty( + output_size_per_partition, + input_size_per_partition // self.pack_factor, + dtype=torch.int32, + ), + ) + + # TODO(czhu): allocate the packed fp8 scales memory here? + # the scales will be expanded by 8x via `cutlass_pack_scale_fp8` + weight_scale_args = { + "weight_loader": weight_loader, + "data": torch.empty( + output_size_per_partition, + scales_and_zp_size, + dtype=torch.float8_e4m3fn, + ), + } + + if not partition_scales: + weight_scale = ChannelQuantScaleParameter(output_dim=0, **weight_scale_args) + else: + weight_scale = GroupQuantScaleParameter( + output_dim=0, input_dim=1, **weight_scale_args + ) + + # A 2D array defining the original shape of the weights + # before packing + weight_shape = BasevLLMParameter( + data=torch.empty(2, dtype=torch.int64), weight_loader=weight_loader + ) + + # per-channel scales + weight_chan_scale = ChannelQuantScaleParameter( + data=torch.empty((output_size_per_partition, 1), dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader, + ) + + layer.register_parameter("weight_packed", weight) + layer.register_parameter("weight_scale", weight_scale) + layer.register_parameter("weight_shape", weight_shape) + layer.register_parameter("weight_chan_scale", weight_chan_scale) + + self.kernel = kernel_type( + mp_linear_kernel_config, + w_q_param_name="weight_packed", + w_s_param_name="weight_scale", + w_zp_param_name="weight_zero_point", 
+ w_gidx_param_name="weight_g_idx", + ) + + # Checkpoints are serialized in compressed-tensors format, which is + # different from the format the kernel may want. Handle repacking here. + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + self.kernel.process_weights_after_loading(layer) + + def apply_weights( + self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None + ) -> torch.Tensor: + return self.kernel.apply_weights(layer, x, bias) diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py b/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py new file mode 100644 index 0000000..aa0c52b --- /dev/null +++ b/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py @@ -0,0 +1,153 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable + +import torch + +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme, +) +from vllm.model_executor.layers.quantization.kernels.mixed_precision import ( + MPLinearLayerConfig, + choose_mp_linear_kernel, +) +from vllm.model_executor.parameter import ( + ChannelQuantScaleParameter, + GroupQuantScaleParameter, + ModelWeightParameter, +) +from vllm.scalar_type import scalar_types + +logger = init_logger(__name__) + +__all__ = ["CompressedTensorsW4A8Int"] +W4A8_SUPPORTED_TYPES_MAP = { + 4: scalar_types.int4, +} +W4A8_SUPPORTED_BITS = list(W4A8_SUPPORTED_TYPES_MAP.keys()) + + +class CompressedTensorsW4A8Int(CompressedTensorsScheme): + _kernel_backends_being_used: set[str] = set() + + def __init__( + self, + strategy: str, + num_bits: int, + group_size: int | None = None, + is_static_input_scheme: bool = False, + input_symmetric: bool = True, + ): + self.strategy = strategy + self.group_size = 
-1 if group_size is None else group_size + self.is_static_input_scheme = is_static_input_scheme + self.input_symmetric = input_symmetric + + if num_bits not in W4A8_SUPPORTED_TYPES_MAP: + raise ValueError( + f"Unsupported num_bits = {num_bits}." + f"Supported num_bits = {W4A8_SUPPORTED_TYPES_MAP.keys()}" + ) + self.quant_type = W4A8_SUPPORTED_TYPES_MAP[num_bits] + + @classmethod + def get_min_capability(cls) -> int: + return 1 + + def create_weights( + self, + layer: torch.nn.Module, + output_size: int, + input_size: int, + output_partition_sizes: list[int], + input_size_per_partition: int, + params_dtype: torch.dtype, + weight_loader: Callable, + **kwargs, + ): + output_size_per_partition = sum(output_partition_sizes) + row_parallel = input_size != input_size_per_partition + + # Compute effective group_size + if self.group_size == -1: + effective_group_size = ( + input_size_per_partition if row_parallel else input_size + ) + else: + effective_group_size = self.group_size + + # Ensure group_size divides input_size_per_partition + assert input_size_per_partition % effective_group_size == 0, ( + f"input_size_per_partition {input_size_per_partition}" + f" not divisible by group_size {effective_group_size}" + ) + + # Determine scale partitioning + is_channelwise = self.group_size == -1 + repeat_scales = is_channelwise and row_parallel + partition_scales = not repeat_scales + + mp_linear_kernel_config = MPLinearLayerConfig( + full_weight_shape=(input_size, output_size), + partition_weight_shape=( + input_size_per_partition, + output_size_per_partition, + ), + weight_type=self.quant_type, + act_type=params_dtype, + group_size=effective_group_size, + zero_points=False, + has_g_idx=False, + ) + + kernel_type = choose_mp_linear_kernel(mp_linear_kernel_config) + if kernel_type.__name__ not in self._kernel_backends_being_used: + logger.info("Using %s for CompressedTensorsW4A8Int", kernel_type.__name__) + self._kernel_backends_being_used.add(kernel_type.__name__) + + 
scales_and_zp_size = input_size_per_partition // effective_group_size + + weight = ModelWeightParameter( + data=torch.empty( + output_size_per_partition, input_size_per_partition, dtype=torch.int8 + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight", weight) + + weight_scale_args = { + "weight_loader": weight_loader, + "data": torch.empty( + output_size_per_partition, scales_and_zp_size, dtype=params_dtype + ), + } + + if partition_scales: + weight_scale = GroupQuantScaleParameter( + output_dim=0, input_dim=1, **weight_scale_args + ) + else: + weight_scale = ChannelQuantScaleParameter(output_dim=0, **weight_scale_args) + + layer.register_parameter("weight_packed", weight) + layer.register_parameter("weight_scale", weight_scale) + + self.kernel = kernel_type( + mp_linear_kernel_config, + w_q_param_name="weight_packed", + w_s_param_name="weight_scale", + w_zp_param_name=None, + w_gidx_param_name=None, + ) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + self.kernel.process_weights_after_loading(layer) + + def apply_weights( + self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None + ) -> torch.Tensor: + return self.kernel.apply_weights(layer, x, bias) diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py new file mode 100644 index 0000000..904a9f5 --- /dev/null +++ b/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py @@ -0,0 +1,138 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable + +import torch +from compressed_tensors.quantization import QuantizationStrategy + +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme, +) +from 
vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( + apply_fp8_marlin_linear, + prepare_fp8_layer_for_marlin, +) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + convert_to_channelwise, +) +from vllm.model_executor.parameter import ( + ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter, +) + +__all__ = ["CompressedTensorsW8A16Fp8"] + +SUPPORTED_STRATEGIES = [QuantizationStrategy.CHANNEL, QuantizationStrategy.TENSOR] + + +class CompressedTensorsW8A16Fp8(CompressedTensorsScheme): + def __init__(self, strategy: str, is_static_input_scheme: bool): + self.strategy = strategy + self.is_static_input_scheme = is_static_input_scheme + + @classmethod + def get_min_capability(cls) -> int: + # ampere and up + return 80 + + # W8A8-Fp8 kernels support only per-tensor and per-channel cases. + # So if we have a fused module (QKV, MLP) with per tensor scales, + # we expand each scale to its shard's channels. + def process_weights_after_loading(self, layer) -> None: + if self.strategy == QuantizationStrategy.TENSOR: + ws_channelwise = convert_to_channelwise( + layer.weight_scale, layer.logical_widths + ) + layer.weight_scale = torch.nn.Parameter(ws_channelwise, requires_grad=False) + else: + # required by torch.compile to be torch.nn.Parameter + layer.weight_scale = torch.nn.Parameter( + layer.weight_scale.data, requires_grad=False + ) + + # Weights must be transposed for marlin + layer.weight = torch.nn.Parameter(layer.weight.t(), requires_grad=False) + + if self.is_static_input_scheme: + # required by torch.compile to be torch.nn.Parameter + layer.input_scale = torch.nn.Parameter( + layer.input_scale.data, requires_grad=False + ) + prepare_fp8_layer_for_marlin(layer) + + def create_weights( + self, + layer: torch.nn.Module, + input_size: int, + output_partition_sizes: list[int], + input_size_per_partition: int, + params_dtype: torch.dtype, + weight_loader: Callable, + **kwargs, + ): + 
output_size_per_partition = sum(output_partition_sizes) + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + layer.orig_dtype = params_dtype + layer.weight_block_size = None + + # WEIGHT + weight = ModelWeightParameter( + data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight", weight) + + # WEIGHT SCALE + if self.strategy == QuantizationStrategy.CHANNEL: + weight_scale = ChannelQuantScaleParameter( + data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader, + ) + elif self.strategy == QuantizationStrategy.TENSOR: + weight_scale = PerTensorScaleParameter( + data=torch.empty(len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader, + ) + else: + raise ValueError( + f"Unsupported weight strategy={self.strategy}, " + f"supported strategies are {SUPPORTED_STRATEGIES}" + ) + + weight_scale[:] = torch.finfo(torch.float32).min + layer.register_parameter("weight_scale", weight_scale) + + # INPUT SCALE (to deal with converted checkpoints) + if self.is_static_input_scheme: + input_scale = PerTensorScaleParameter( + data=torch.empty(len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader, + ) + layer.register_parameter("input_scale", input_scale) + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + return apply_fp8_marlin_linear( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + workspace=layer.workspace, + size_n=layer.output_size_per_partition, + size_k=layer.input_size_per_partition, + bias=bias, + ) diff --git 
a/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py new file mode 100644 index 0000000..ee99572 --- /dev/null +++ b/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -0,0 +1,200 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable + +import torch +from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy +from torch.nn import Parameter + +from vllm._aiter_ops import rocm_aiter_ops +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme, +) +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + W8A8BlockFp8LinearOp, + create_fp8_input_scale, + create_fp8_scale_parameter, + create_fp8_weight_parameter, + maybe_post_process_fp8_weight_block, + process_fp8_weight_block_strategy, + process_fp8_weight_channel_strategy, + process_fp8_weight_tensor_strategy, + validate_fp8_block_shape, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + Fp8LinearOp, + cutlass_block_fp8_supported, + maybe_create_device_identity, +) +from vllm.model_executor.parameter import ( + BlockQuantScaleParameter, + ChannelQuantScaleParameter, + PerTensorScaleParameter, +) + +__all__ = ["CompressedTensorsW8A8Fp8"] + +strategy_to_parameter_type = { + QuantizationStrategy.BLOCK: BlockQuantScaleParameter, + QuantizationStrategy.CHANNEL: ChannelQuantScaleParameter, + QuantizationStrategy.TENSOR: PerTensorScaleParameter, +} + + +class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): + def __init__(self, weight_quant: QuantizationArgs, is_static_input_scheme: bool): + self.weight_quant = weight_quant + self.strategy = 
weight_quant.strategy + self.out_dtype = torch.get_default_dtype() + self.is_static_input_scheme = is_static_input_scheme + + self.weight_block_size = self.weight_quant.block_structure + if self.weight_block_size is not None: + self.act_q_group_shape = GroupShape(1, self.weight_block_size[0]) + else: + self.act_q_group_shape = ( + GroupShape.PER_TENSOR + if is_static_input_scheme + else GroupShape.PER_TOKEN + ) + + self.cutlass_block_fp8_supported = cutlass_block_fp8_supported() + self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enaled() + + if self.weight_block_size is not None: + assert not self.is_static_input_scheme + self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp( + weight_group_shape=GroupShape(*self.weight_block_size), + act_quant_group_shape=self.act_q_group_shape, + cutlass_block_fp8_supported=self.cutlass_block_fp8_supported, + use_aiter_and_is_supported=self.use_aiter_and_is_supported, + ) + else: + self.fp8_linear = Fp8LinearOp( + act_quant_static=self.is_static_input_scheme, + act_quant_group_shape=self.act_q_group_shape, + ) + + @classmethod + def get_min_capability(cls) -> int: + # lovelace and up + return 89 + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + weight_loader: Callable, + **kwargs, + ): + maybe_create_device_identity() + + output_size_per_partition = sum(output_partition_sizes) + layer.logical_widths = output_partition_sizes + layer.weight_block_size = None + layer.orig_dtype = params_dtype + + if self.strategy == QuantizationStrategy.BLOCK: + assert self.weight_block_size is not None + layer.weight_block_size = self.weight_block_size + # Validate block quantization shapes + validate_fp8_block_shape( + layer, + input_size, + output_size, + input_size_per_partition, + output_partition_sizes, + self.weight_block_size, + ) + + # WEIGHT + weight = create_fp8_weight_parameter( + 
output_size_per_partition, input_size_per_partition, weight_loader + ) + layer.register_parameter("weight", weight) + + # WEIGHT SCALE + weight_scale = create_fp8_scale_parameter( + strategy_to_parameter_type[self.strategy], + output_partition_sizes, + input_size_per_partition, + layer.weight_block_size, + weight_loader, + ) + layer.register_parameter("weight_scale", weight_scale) + + # INPUT SCALE + if self.is_static_input_scheme: + input_scale = create_fp8_input_scale(output_partition_sizes, weight_loader) + layer.register_parameter("input_scale", input_scale) + + def process_weights_after_loading(self, layer) -> None: + if self.strategy == QuantizationStrategy.TENSOR: + weight, weight_scale, input_scale = process_fp8_weight_tensor_strategy( + layer.weight, + layer.weight_scale, + layer.logical_widths, + getattr(layer, "input_scale", None), + ) + weight = weight.t() + + elif self.strategy == QuantizationStrategy.CHANNEL: + weight, weight_scale, input_scale = process_fp8_weight_channel_strategy( + layer.weight, layer.weight_scale, getattr(layer, "input_scale", None) + ) + weight = weight.t() + + elif self.strategy == QuantizationStrategy.BLOCK: + assert self.is_static_input_scheme is False + weight, weight_scale = process_fp8_weight_block_strategy( + layer.weight, layer.weight_scale + ) + input_scale = None + + else: + raise ValueError(f"Unknown quantization strategy {self.strategy}") + + # required by torch.compile to be torch.nn.Parameter + layer.weight = Parameter(weight.data, requires_grad=False) + layer.weight_scale = Parameter(weight_scale.data, requires_grad=False) + if input_scale is not None: + layer.input_scale = Parameter(input_scale.data, requires_grad=False) + + # INPUT SCALE + if self.is_static_input_scheme and hasattr(layer, "input_scale"): + layer.input_scale = Parameter(layer.input_scale.max(), requires_grad=False) + else: + layer.input_scale = None + + if self.strategy == QuantizationStrategy.BLOCK: + maybe_post_process_fp8_weight_block(layer) + 
+ def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + if self.weight_block_size is not None: + return self.w8a8_block_fp8_linear.apply( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=layer.input_scale, + bias=bias, + ) + + return self.fp8_linear.apply( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + out_dtype=self.out_dtype, + input_scale=layer.input_scale, + bias=bias, + ) diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py new file mode 100644 index 0000000..8400d0d --- /dev/null +++ b/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -0,0 +1,137 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable + +import torch +from compressed_tensors.quantization import QuantizationStrategy + +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme, +) +from vllm.model_executor.layers.quantization.kernels.scaled_mm import ( + ScaledMMLinearLayerConfig, + choose_scaled_mm_linear_kernel, +) +from vllm.model_executor.parameter import ( + BasevLLMParameter, + ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter, +) + +logger = init_logger(__name__) + + +class CompressedTensorsW8A8Int8(CompressedTensorsScheme): + _kernel_backends_being_used: set[str] = set() + + def __init__( + self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool + ): + import vllm.envs as env + if env.VLLM_MIX_QUANTIZATION_TYPE == "TENSOR": + self.strategy = QuantizationStrategy.TENSOR + elif env.VLLM_MIX_QUANTIZATION_TYPE == "CHANNEL": + self.strategy = 
QuantizationStrategy.CHANNEL + else: + self.strategy = strategy + self.is_static_input_scheme = is_static_input_scheme + self.input_symmetric = input_symmetric + + @classmethod + def get_min_capability(cls) -> int: + # turing and up + return 75 + + def create_weights( + self, + layer: torch.nn.Module, + output_partition_sizes: list[int], + input_size_per_partition: int, + params_dtype: torch.dtype, + weight_loader: Callable, + **kwargs, + ): + layer.logical_widths = output_partition_sizes + + scaled_mm_linear_kernel_config = ScaledMMLinearLayerConfig( + is_channelwise=(self.strategy == QuantizationStrategy.CHANNEL), + is_static_input_scheme=self.is_static_input_scheme, + input_symmetric=self.input_symmetric, + ) + + kernel_type = choose_scaled_mm_linear_kernel(scaled_mm_linear_kernel_config) + + if kernel_type.__name__ not in self._kernel_backends_being_used: + logger.info("Using %s for CompressedTensorsW8A8Int8", kernel_type.__name__) + self._kernel_backends_being_used.add(kernel_type.__name__) + + remainder = input_size_per_partition % 64 + if remainder != 0: + input_size_per_partition_padded = input_size_per_partition + (64 - remainder) + else: + input_size_per_partition_padded = input_size_per_partition + + # WEIGHT + weight = ModelWeightParameter(data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition_padded, + dtype=torch.int8), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + + layer.register_parameter("weight", weight) + + # WEIGHT SCALE + if self.strategy == QuantizationStrategy.CHANNEL: + weight_scale = ChannelQuantScaleParameter( + data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader, + ) + else: + assert self.strategy == QuantizationStrategy.TENSOR + weight_scale = PerTensorScaleParameter( + data=torch.empty(len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader, + ) + layer.register_parameter("weight_scale", weight_scale) + 
+ # INPUT SCALE + if self.is_static_input_scheme: + input_scale = BasevLLMParameter( + data=torch.empty(1, dtype=torch.float32), weight_loader=weight_loader + ) + layer.register_parameter("input_scale", input_scale) + + if not self.input_symmetric: + # Note: compressed-tensors stores the zp using the same dtype + # as the weights + # AZP loaded as int8 but used as int32 + input_zero_point = BasevLLMParameter( + data=torch.empty(1, dtype=torch.int8), weight_loader=weight_loader + ) + layer.register_parameter("input_zero_point", input_zero_point) + + self.kernel = kernel_type( + c=scaled_mm_linear_kernel_config, + w_q_param_name="weight", + w_s_param_name="weight_scale", + i_s_param_name="input_scale", + i_zp_param_name="input_zero_point", + azp_adj_param_name="azp_adj", + ) + + # Checkpoints are serialized in compressed-tensors format, which is + # different from the format the kernel may want. Handle repacking here. + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + self.kernel.process_weights_after_loading(layer) + + def apply_weights( + self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None + ) -> torch.Tensor: + return self.kernel.apply_weights(layer, x, bias) diff --git a/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py new file mode 100644 index 0000000..2267395 --- /dev/null +++ b/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -0,0 +1,219 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable + +import torch +from compressed_tensors.quantization import ActivationOrdering + +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme, +) +from 
vllm.model_executor.layers.quantization.kernels.mixed_precision import ( + MPLinearLayerConfig, + choose_mp_linear_kernel, +) +from vllm.model_executor.layers.quantization.utils.marlin_utils import ( + marlin_repeat_scales_on_all_ranks, +) +from vllm.model_executor.parameter import ( + BasevLLMParameter, + ChannelQuantScaleParameter, + GroupQuantScaleParameter, + PackedColumnParameter, + PackedvLLMParameter, + RowvLLMParameter, +) +from vllm.scalar_type import scalar_types + +logger = init_logger(__name__) + +__all__ = ["CompressedTensorsWNA16"] +WNA16_SUPPORTED_TYPES_MAP = {4: scalar_types.uint4b8, 8: scalar_types.uint8b128} +WNA16_ZP_SUPPORTED_TYPES_MAP = {4: scalar_types.uint4, 8: scalar_types.uint8} +WNA16_SUPPORTED_BITS = list(WNA16_SUPPORTED_TYPES_MAP.keys()) + + +class CompressedTensorsWNA16(CompressedTensorsScheme): + _kernel_backends_being_used: set[str] = set() + + def __init__( + self, + strategy: str, + num_bits: int, + group_size: int | None = None, + symmetric: bool | None = True, + actorder: ActivationOrdering | None = None, + ): + self.pack_factor = 32 // num_bits + self.strategy = strategy + self.symmetric = symmetric + self.group_size = -1 if group_size is None else group_size + self.has_g_idx = actorder == ActivationOrdering.GROUP + + if self.group_size == -1 and self.strategy != "channel": + raise ValueError( + "Marlin kernels require group quantization or " + "channelwise quantization, but found no group " + "size and strategy is not channelwise." + ) + + if num_bits not in WNA16_SUPPORTED_TYPES_MAP: + raise ValueError( + f"Unsupported num_bits = {num_bits}. 
" + f"Supported num_bits = {WNA16_SUPPORTED_TYPES_MAP.keys()}" + ) + + self.quant_type = ( + WNA16_ZP_SUPPORTED_TYPES_MAP[num_bits] + if not self.symmetric + else WNA16_SUPPORTED_TYPES_MAP[num_bits] + ) + + @classmethod + def get_min_capability(cls) -> int: + # ampere and up + return 80 + + def create_weights( + self, + layer: torch.nn.Module, + output_size: int, + input_size: int, + output_partition_sizes: list[int], + input_size_per_partition: int, + params_dtype: torch.dtype, + weight_loader: Callable, + **kwargs, + ): + output_size_per_partition = sum(output_partition_sizes) + + mp_linear_kernel_config = MPLinearLayerConfig( + full_weight_shape=(input_size, output_size), + partition_weight_shape=( + input_size_per_partition, + output_size_per_partition, + ), + weight_type=self.quant_type, + act_type=params_dtype, + group_size=self.group_size, + zero_points=not self.symmetric, + has_g_idx=self.has_g_idx, + ) + + kernel_type = choose_mp_linear_kernel(mp_linear_kernel_config) + + if kernel_type.__name__ not in self._kernel_backends_being_used: + logger.info("Using %s for CompressedTensorsWNA16", kernel_type.__name__) + self._kernel_backends_being_used.add(kernel_type.__name__) + + # If group_size is -1, we are in channelwise case. 
+ group_size = self.group_size if self.group_size != -1 else input_size + row_parallel = input_size != input_size_per_partition + partition_scales = not marlin_repeat_scales_on_all_ranks( + self.has_g_idx, self.group_size, row_parallel + ) + + scales_and_zp_size = input_size // group_size + + if partition_scales: + assert input_size_per_partition % group_size == 0 + scales_and_zp_size = input_size_per_partition // group_size + + weight = PackedvLLMParameter( + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + packed_factor=self.pack_factor, + packed_dim=1, + data=torch.empty( + output_size_per_partition, + input_size_per_partition // self.pack_factor, + dtype=torch.int32, + ), + ) + + weight_scale_args = { + "weight_loader": weight_loader, + "data": torch.empty( + output_size_per_partition, + scales_and_zp_size, + dtype=params_dtype, + ), + } + + zeros_args = { + "weight_loader": weight_loader, + "data": torch.zeros( + output_size_per_partition // self.pack_factor, + scales_and_zp_size, + dtype=torch.int32, + ), + } + + if not partition_scales: + weight_scale = ChannelQuantScaleParameter(output_dim=0, **weight_scale_args) + + if not self.symmetric: + qzeros = PackedColumnParameter( + output_dim=0, + packed_dim=0, + packed_factor=self.pack_factor, + **zeros_args, + ) + else: + weight_scale = GroupQuantScaleParameter( + output_dim=0, input_dim=1, **weight_scale_args + ) + if not self.symmetric: + qzeros = PackedvLLMParameter( + input_dim=1, + output_dim=0, + packed_dim=0, + packed_factor=self.pack_factor, + **zeros_args, + ) + + # A 2D array defining the original shape of the weights + # before packing + weight_shape = BasevLLMParameter( + data=torch.empty(2, dtype=torch.int64), weight_loader=weight_loader + ) + + layer.register_parameter("weight_packed", weight) + layer.register_parameter("weight_scale", weight_scale) + layer.register_parameter("weight_shape", weight_shape) + + if not self.symmetric: + layer.register_parameter("weight_zero_point", 
qzeros) + + # group index (for activation reordering) + if self.has_g_idx: + weight_g_idx = RowvLLMParameter( + data=torch.empty( + input_size_per_partition, + dtype=torch.int32, + ), + input_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight_g_idx", weight_g_idx) + + self.kernel = kernel_type( + mp_linear_kernel_config, + w_q_param_name="weight_packed", + w_s_param_name="weight_scale", + w_zp_param_name="weight_zero_point", + w_gidx_param_name="weight_g_idx", + ) + + # Checkpoints are serialized in compressed-tensors format, which is + # different from the format the kernel may want. Handle repacking here. + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + self.kernel.process_weights_after_loading(layer) + + def apply_weights( + self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None + ) -> torch.Tensor: + return self.kernel.apply_weights(layer, x, bias) diff --git a/model_executor/layers/quantization/compressed_tensors/transform/__init__.py b/model_executor/layers/quantization/compressed_tensors/transform/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/model_executor/layers/quantization/compressed_tensors/transform/__pycache__/__init__.cpython-312.pyc b/model_executor/layers/quantization/compressed_tensors/transform/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1564fb6320601d40c333504a814995163f6a3129 GIT binary patch literal 213 zcmZ8b%LxKO4BhbqB3OurHAfZ15^P}@HR|B}%w%QP7A(LTti=wzT7h`;WcKQVm%Jp8 z@E)Vlun@T3)wccBmVbPDvbhyaF;=U4K^>Vst>??YK0&gnWJ7!8vaY7{*>Q^C>>5(T zhZx}OoChDb=rq>2sW=ii(;#IyRVJ{#7`EcuM4t$A#;s-yIm(^MguIL7L2DZ^@R=m4PDe8*)0siqkeEBpHB~qkBQzB(O8B3J>lx$0K9LaB6PAuE$D(j^f&Pb-rhcYvA zY?Z8?AY0I=v6dIxwT#_G6m(H2$Szjk#7IA z=icFvltOzQYEX(lxk&TFFD7(Gdov^J`n)Ik{?)yE8zh8Q!+#Eg^1m}$~P>UB|b%ra>qX??UN zRy$c6vrbw`-Vn9L>L%+*nu*rO?2~qqHbxz>hRFsBY0weGnm$6TnX`VTRpvG6WG$R- 
zHlR+o{GIJDr_~f&!!@zB9K&iC3<$AS&doJ(TezksXfUn8F>B*IP}pr3LaNm>sdRr^j+2Z9lyg<^=o8iG!%`7 z-i>my@dy{^_>h?3{giA9g~Ri)`DjSw#{INhbCwUqh3NzzJIK!nvbB^QO~j`oGjd(2 z;AA2k5+jLtMVpE6MJ~nxQ6?e;V<9no5yo}DedyScqi2IBPahmRbS8LyM{wfYn{S>z zbN0}fY&#i=b0Pi|CtggjZ-fNg*EO1m&GDQdaO_zwE+lxNM7Hi|hz-R;JX@Lsw685? z&d$$8xiDU?8Z5=cI!{=QLz5JXCN(TIspV)^!|7Npr)Oz$bUMz!F`Th@X!;E~1B>90 zn2+esiUcQ3r1wkuOzT-A9D$iNah6%PI-&A+)>BTaDPThen9s^FgmG#hhgr)C6ntu% z@^{+8+F*n_U>-DCZ$zx?8;w;DR6EPm)D&xfLAkO^hU&G`4D0wvH|by-IA&VII$>T7 zS80Ev-1)WBRF1rKXv*0yd}8uJc$*K+3BIWp^ii5l`4YTu>RkN2`A}SpEO2Zk4^8<( zan=`NS%J`t#OLNkQk0k%aZ*&dTkweq9~TiXa=cH7u$(V3?Sl-^j?6^jp{Or3Hy6F? zixJKwv}6kzF{l)&If1bJxNIUr1;y!@Ae)uJl#HI|#Cbj*)~Oo|U!2C$X8lhf0trZg z!~+SK(#13>J%y55flktb<~aJ0(x9Y1sYz--(?C-Qya`HlNh+yL(yGLo06CUKo6G## zaUOSqE7U&CMaC&Z6P+5*&D~B#eqp?6N!E(_ccZCbj zgR}@lLsvOo2o!go-2OngbX$X>a$N(ZO%5pJgg}&B^U_H1%PYf$%?kpBS6#n~Ti#iXnc66ML`B59}ShG{E&mBk%UjT?KIx z@53wT+qi;0Wd(heE9m?B3i?Vb0D4$J1?H~GmT5i_D{hdmu=52xx{0p?m7UsQ1;PkK z*U?uttX5X@BW#OTD-@n?ot(?5~+Wkve`bOGVH(H9fRCB*({V`L!JL4CY8?XZK|v6IN&vn=!6Gk#ZUVju7WG~+ zSE73reM*h{b;_xf4n(#V8C|_dvQ6c7wFI;XEpXB4O+5un8J`}HC7;X$gOPYd3a*#$Qi<-$HDLT> zhTnP=)(vN3m@S=w+5~o-`m9)*Vv=aKbS~uDASkIYAR(+8bwCUU8eZDBsEJg zm|CWMQ>LUTX<(@#$YQBcbxc)lR&-_cz*QiJk~BykJ*)YtK56(l^#h7RqE&4MQc7Ko z87SAO<>hbEthU(D56$AMzbU9KS?#4U5CnfEVtGIU(5}*c)}_{$zpA<})57v*77LY4 zA!Yd%>A#}7Pt_zXKLKQ$toaH0nKo4`deuo)Um=_TLv%q=m!zz4#;t0f@;6zVw5s|7 zw22k&ckL^U+%5yW&1Ee%)^7$yoV}4hl!B(8YS3%Y;MCN2$S!jbMn&AY09p8qVymaky zJ$D7PeQ^Q7kSOwkvb4HNxTnm}wqaX?m$%Ec@%dO;e+c{_9uO-ixweEMl)lDNUe*hM zT0}v>ml7+)4S{G!ekW8d?0!YXxbX~*Kv$s+juh0R6SeKmZ5@=h4&L)-w;oH^V1?P5 z^Ylud-kfJZ@(koWuS=fSGoV1to`>xtl6fTWZO?gkNZuV8izmP1jf|yb>6&EeTVuM@ zbdG6~n5G=lEiv5%#28KadRM+(+$~Gu$Jg#&%XxQ8-rZU6 zK-N8YuTFCB&$*9C?jz|FPh4A;#y&oM_jJy)L-Op%dUj=9yH`V!YcS^;mR!R**I~(Z zIDItl+qRTiIk<9hWirz|m>z%H=+E06cdWOqd0Ru?)||I>=WU(qW_?p_`WQ^NraotB zl`O5xfmQ!Q%TT_pCq4Gi;(KbYO^Z3JSF(Du0MNESFt?LA=DdB9w=d`2BYF2^y@Ofz z-g~W*dno5VD!GpqXPNW#NS>aYXQ$-Znf2_>y7sKTC%N|KTqBZeBJkvFt^^9a)2kvX{*JoT~Qr%cu|HRk+^NAH_X8VzB 
z$I-0sSQ!?DD`5w|PXtQ-H6#GGpjr5C4znf5SguKypbV%SY8KPk6jWtAhAzOpOliat zTuN%n&_z{T0(H}o)KnWur9+{c9W=U%cpnT@1gdiL9G?h-^FWnI!O%1o9#{}Y;xjT6 zz8HzJJP0XMM2N)UBF4iUku-j7Sz5?OVg6l3-UxW9a6y%8c^EPaW3PBEUpkW?0HvZW zKyHO{uLdM$;GXUwGqTpy@^Q=Emd8#0^sy(_rX~Nwjy;d819@9h&Nd*~1|He=KGPxh zYYMKVuE_Mek&v*#D8OKloa9LWWiL9(59;8Ai=YkKmqJpbDwh<1z7L2g3N0%+rP#LG zNh?rMHBW&8nyms^KlqMlR0o6>Rluq`2jG~42z?huPthdSrS#CF#6Kz(1jY`y3eop9 z08#;_Vm11GjpEUnttq!sQz|DV>0eMP-&Lo8;quK<8t6}eK|Ky`Cj(YD!3gU&C@Kjf zQdvj52K-i>HgcdFu$ztJhe=&Hnuv4I9GJhTl#b)n2^^gwEr?!_^`Jv%#EX>%ufl!f z&}gCO6}gYhnZ1(Pn=x+#S(vl6OSbmqx3adb^uaZ&J)O)jThhrq zw6ZiymgbzLO|rBtA6nVI!e%Wy(^P>$n1W9-`!eR9Cq27Y+1#EJ(w-BKdrnHW?nT30 z?^4IIGvBu}*Ec5hjpZE;KMXB3EH|t;KWWJ{^<|uWlA}N8*dsaiWF3R|y9!#=e~4N~ zn*RDl>W8h1=a$YbpTGOPOvAQJ{k8|T9;ojfEYw6iX3!JHdjfT#c*2fHI3E8OD}w!-rIb)GA3l;7>q9gwJ6Xn?sVgfvbn>{ z>g$=t@`r(#BH(Q-c+hUXdi~=3$9IdDwg(}_+7BG3IsQu z$Kd-A&nI|UN1(ETr}y9%uqQ}Cv3qd}Ls>8%h{(csPc|uHFV1pTD!{H5`#rIUSD>sS zu69FyVdxdbRaKJ#%V)t4SNb}7f`M2iRLom#OZ3OqyH?5C^~CARv<_sQgBjc4)0Vd7 z@y9JY7fpG)>&}7O2bT9q_P}cU-}L>u?|!SaV=QYQd*W`-bnMT%hcb?#f)06iJqOVS zQ1t&3=&iZ7wY8ScmtOY-X6@^+a-JZ z3X`?>liHE2`#{EVpkPFe+X2Yi9C$yL>71`$^7Ut|{TZfTk&-^xH@}nL0Uy~EELu)r z!C8Xff(=C#D^4~9gKQ!U#b#pRDSmAp`+j(=D`lFEghg2!iHkgT@$k(!YR5G?a0Z7j z%9?mw)}K<$GR!=(j<~^y@xo)MBf9`gg=ha>EaQ+NcMKs=6ck?g($Y8oO@anzOT@gD(JFS74QvoVyAJoMli$(~^BDo@wfl9K8i2E;Avgd#P>d z&ob^_sUc7><1z~}?f9|=QtMikWZ^Zl?f{_+U!c2zkO7x58wjl%ahdxNr8`coSsk~o z+_-|bzMvx|K%lL&U{KNwYVBB__{3N+DtQxXXe^kOv;{ev3N=c)7P&lkF5SMgZYB9I z$2B^bs}ptjpBWvxx(%a19>%cL-5R#^&$uZ})P@KJV~Y&5onWfO6N77f~xQ5p;@?|aU*=Lh9 zfVa-5f?)8Qnh1!z_y=j1d^yUDar_5p>x#60EMlEa0sxz;YXE*=0^ZKZGuYIEZL0ishM&%+WNOM^BR7GgD5x-0+5Lj$iy+%@ zRq+8Rh+RAUX_y(0&qg*V&o+XrkIcjqJUmImJ5(Z$A6tmZ;W22Db;rivJa?7|A6b9; z+*zDC3l)Bg;?oIZdMu7O996tgY?=$vRN!_YZ*@FTZ0DheY$VEzOc#qiMR`@IFxf+v zr#xPjuS#)IJP&>T9)7}qKm-W89@RUpAJ0Q{;;FN5)sz`JlXXs9KmD|0cV^Gw#~nu` zrZsI|Jim1INk?Gi+T)G`;BZJEUot+mH7+#)(%$u}p-+eIIsR_T-)?ziJCt`c-Wk6= z{>ah32AXX6_VB~@-MRMtQv3b~_We(K_uo75xijz!_i$F7{%F 
zz%RA$Q;JhSE7vG;EjYs`=H~=i7ZJIbK=hHHR(OZF?B-zNgfTY0Qt=U7j-xN&Cty?U z-_d`=QR@-Ym@#X{r+;>j?=wx~;Tv>>1KAOc`bsyE093&TuH-~G0!A_Gdmmok;mtiF_=o@rkbdJ>cKAX( z=TqLzp?U*`_rv6*cm-P7X+Wo z?A9Kil2RkwyveeK=VFP=oG?E<9k~LYG9oy!%!EhndHA1A zP^pn?h&)s>*gId5CA^B4RP=;hgx)-k{u6#e6e744EvS1W*EK42jXv&zYtWXqEXMNY zhMc)YGPf)xADMfe=~4Ukb)D8x``fnu#SbB%;U6QS9#get{zGu$bM)Pg`&v1(` zk+!B5;Cc693Oi_B>7fkm`u&$?s3NoZ<(7kWdVi~I4JV@TzaRX7D+EIChIxE>c&zV1 zt$_DMOeEmX1(?J6@8LpE_$7Z?;IS3M<10vX5RYMw+*qZ@ychwhM4Pp~f2{cWG!&$M;*X$NQfv6n9{7O;10VVLw?a|bBVs%d{E7dRz)xZN zT0!Lb@8awpoYjf=uUq^DT!6KzB6kUoZ8R8UW=NUl2#<9QRziXq;)0fi;+7 z`vO_M(&#A7I)doGEQktxMQb6&Xdu$7(R0(3hT5L#9e$2*qHtK}r8@Fv+pWPHgSQUc zI8e|+e$8et;BpZ7uFe%OK6(lUoMqq{W!b*WuI&6IvfLth`U^%}3`UDNPcvzII-W6l zB)WM?TpnF%$+Qnh-oXq#_)M$OYu3%U=@`XOfxNRrA*zMUnyxlIzc{hfw=$e*7?iAg yC9qj^P*HF?sm46aM2o+@+gICG#e1W_zPcKadJmGqHJvft_Q7|a>!1Nq&;JKc-@Hcv literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/compressed_tensors/transform/__pycache__/module.cpython-312.pyc b/model_executor/layers/quantization/compressed_tensors/transform/__pycache__/module.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c21fa7e16c6c3478d92d6c4c2ed9c3557b0602a GIT binary patch literal 7745 zcmd5hTWlLwc6Ue)-xMiPeCiQOwq?-|Qi&gO96vU3oOtc{vFk?zdW#9g8Chh8kK7sA zmX<=iTQ64brij#_kc1Yn0+b2*Lm}wJRT>3B3gcy{YjXR-(tfkYy)`oIRFcUCp;sP6xJq@ z$(vH9yg6lN^nOdq0{v!@&0AB}ye(zR+f#OJ%pzLyj+CQ;=ghlOZVhHdPrf77p}|(s zoA;%B8f+6g^IfSf4YrH^e0Qo_gB@ZZA4~-`*eQnc;Z&F*Ch|7nT{j8u7A(0}jE%SI z+};*!KzGbKlv@a)qWx^O&}CEtDXxqU696$H2*riBB`Ct z<~zkqTFw>=s<+v7J~JWYg*c;HF9-#xNCDbCn=1~0*0#?Cbe zAJ2~oJfAIGfsw%Zi8K}XKM=B4CS*j&3$j2n#!X=DZZsYZGKBmTo>?F%h9@c0MR@L% zS+EGKU>!F_08Oy*rbAfU0Wxb4jyjs77VP6q8|Uz>;LHUZYiy`;z--SJ-YR%_o51pR zp+jJC#c}5>8E1IMO?S%6J8zPdj}P%Kp%b2)t^ui)o>T36h3 z0>9VFIKN8>@_wMvtLQNzCfp6f2Hvq-2BZ zq(tnr0Ph`>i?J~wHYo{w>{?cyh`o2Lm@iQvBk(%6NjeKPH+5Q&CyM-g8aZth8Tb%6 zHr65@1AYLc*l?jJ$Fh0gwLIX!5>tX4*Ji5D7Pe~DPor9;Od1ZSvSNBlpsGjj=J1fv 
z3}*|aNm=7;t|U<8csZ+yO0YO7<506lLy|Jq4V+*H7zOmvUjsk>8M8p-=HRW4GWiow zN_QGP3@I~Z!kG+&EP+--xCxORM!$j2Gw;7i2zALWV}v)|fmN{ln2|j?L`d2E3$xJ= z6m89Z%Z#;5au%cAs&kz!*no^x-z0zd9w9$~UC!8k0K1#9&p2kBWwvZX4xe!WMZYnp zRhLa=N7-4nmR)xmYIw#Cn1L3C0Rf)-PU8+{Jb=?-jI`=ps0ABP*;e+H-DUSUtkg~# ze_^@PI4MIu8neLN=6Z~|t=c#xL#9ZaJ@YkQ=k&p|1rhYwMn=A zE5(raq{&jLNP+X>48=2oKDMB0R9Rl0Dhcrpl?4=FE<49%3t5@tC|)_mo1^&9s{c$; z{%(tdk5gKt6t9AI;V_$(g}kKtKxfKagI~1;JX#N*mebt(!j$Ss^L&dxRR@SzCM%(y zk8)SjViphu*a32JOEBVI*sUOr>wD_PMez_6^&Z8mSDh`HP@TyBK!X>qt8Pxxj0RT# z)r@k57DTnA(bl*c(xPqCwQ3*B}-Ev{DtIIQOqZG<0M=cGLv$VCN;s9k_}U%8MXs^w8|7TC~3!O)Al3%@>%vx+pKFR(pVS zPsi-3x}Ai3tD(3OimwcQn4o}s-U9oTfXM6k+9F=4mY6f9~3Pn?Kz0kXWf~9po z{mC`eTB{#ai3BPGl-{?UQD#udv{P9#rWa6Ys=L;@Eg|QbmRikeo?{COp|UC0RHLnt zmYOxx>NKceSig;`Dl_Ord^Bl6gE!c_yyp=_D#fGwB2*Z9(Iy&f{wsq zDhs-4QlO&9AOkDHp7m61`wY9;40pjKUG4Autm zWZGGD-V9=3*lEByPrfZh?e1GkHKBGYLVu4L%cggO7V4SO&PRuLw-j7uOFQpCTLGo% znbKIF-E{qS+|uraKiuSPEIBkZ&lKKbyu!wpGZFbE1JO`rW$+zy14eJFhSLtVS_WiL zEklrNctvezGn>w4r@@!|_WbexYu^z2uxC{JL?ih5`hTE&{onA>wR4xIohioPx*HS+ zNVTB%29ty|&+*y3>XWjUv4UA>4H2dHvoPx(rjK zdtjkKle@YWx8B~lw6hu=QQ+qqsrkdR?||M0@8IaIql=@8YfII&Lvih>x{``3xq9%P z>u}BM|M$0ulA6avcfEfu!7cS3Vt_F50ft@RFD}lYUZ#{_i-%c(i!30HLsYXYY z=x8;1^j`Gny!AJi{^8xff46QX9zUEd5Lq}ie`?vLbPvxSf3V46Hf%Zg>z-fs-2L9K z`qy@SrxJc0w$`0o@7`VMPS(P`i^A=RrHSRMpOrr?SNiv^j^E|M^9lsN3Wn#$76Z3? 
zmU=!Z)`C3`ZRS8P?4mPRcauo}^GAivb?#dC#Vfv@weIlZw%fx?!^`_WJM`(H3ix?L zceeuB&aSVzy5|SyF5>w=N!GeTK(;enH-np|*-VNIs&UFwmJ+R|?wkPt(Pz_fYianh zY20+4{G2%h85UEPZ@3aIdQZC%sAQ&JdY;XmiaUM^ND@XgH^_Zo*WCV3oQtQ{yu)ku zVSTSC3UmlRJK^(Gz>G#&+wh;lR?)QQpyS!{BX|3LFhWf?q0Gol18*2AdZKJ@XWZqc zac}1ZYDON62Ntn}xkK!)ilqLzupCzfJ$Kx!)t-xMCYa(^~NxXTNyA zaN2hJr6=KNaR)sL6@-D1^^xRsAtO-ShiVraX+(+|N@c%$=G?^#s`c!}3s6&ZrB&x8 zbnUcw)=+d>gsHLb4!~&fh4hS?qcIqs9()1sZHOrU81SUOg3k?7_Yrp&Dz!D&P|Y7% zI68lH%{};~D=;VgMQYtOj9P8x*35eBhE`M&kRJ=MX3 z%HY9!od;{7!8PCD<1XTj{7hW4M<07hM|hE4e`(*kXMbBlh~JhFuJC0?SRN7TC~-a4TB;$!}!|I6m$gS zCO@@{g*3kWBiK(Hhck!f?U};s(Ow((PK)-wC6&!i3K1w+S#X-oxi+h`OEVdg3;L%` z63Q5TA0iit`6wyl1!|=gGya&tSWu4!@nKXF_o!x3C}@6$Y7w)NtXj1INYZ=;dIqpG zfud&-!kf|^6AbcHJ$q7;Dg7Qa;}+d{z&WaOJWC~+)1pd8GfOEUd!hHB$Aw#=HNC5i zP`xlegpwuw4SYbmd%NcLqEJ=5Tb9%JyxZ68+t<9?@B2fG7Bu|2ov(yPD*l&O2NeI_ zs{d8R|7z9$hT?x?*8afdUk?peLkT66xaUgLB7JBztC6GDyE7JE-&>{JihboTKLNT{tp-r0UlpvC7EtivRfi z=+@=$|9N7qd-x$s`VZGx;!bMP25C9G`gjx*aG0y&IMvDFUmB400ocuPkSh`!BQ}oX ziy7#)@Y#&4nzIF&x?sBM<+yYK@_kxD36fS?f=*?}A#*INEVy%-3Ds06sMgaOVT!t* zUc?VN`&vX!QM%{}{Gdcqd>e{3mZAV_8js=&sh&n^EF*{_$H4&-&?|v{Am7p75ygSx z2YF6nVB~i{ArDO!)(&n*XJGF9Vqz^Ytbm(giPzi%70bW_w{OlrC(K0^cW=egTel3c zJ05@o;$JMRMRqCOyX!3UfOp~xe0=@Zb@cw~Htd1KQmpPk$Vs*g)m;d=$-uVd^Pf7N zc%b#~=NJcj^kMG~wqx;9oj|pc(W<*ckFZsLmtolW^4U6p>Ni`o>K}(6VXJP9ussjF z-OUwQXrqS|tJ@H=6K`+bfsm5~!gUuyZW4^tJqUFWr{|Fup#Ru!XXCoFfp-$`rTd^# zy_q8XTb04|y_86gW%MV}E`1Egd!Nq=CYo76{YlU$>2HNkr}{A8nBX##l3dJl7zDUB zH0bMy^OA-G|9QzYJ2nYv;e?Sh=D>PFaz{(~4ro|UXD$*}6O(5~WNcC9}H1>;XU-RAeTn~7JT{eP#YwKI6FJbv)K zeopka!_1XxKcoxh5&8i%k>w>+gmp8+FpsQ+asMkx{Fb=?(_~>x4+(spI0+N^H`4zF qIrjxQ@Wf<-F2H-6`GQ>ff*gHpyll`6n;*eT$T1HMJI}M6%u14+KdIMLSpLHg|b9KR=AjJgq@_eQ#DKsFtM=p zHz0lr3sM z_*fOSe$^=XRL!;BT*+s%6E3;EwR+(zW`pB^*Y>b{i24FJMAHr(v$i{=Pw7^D=(ve+b{(>=^FeaO_6CC*lW2apqTnVaB8)Y{c zE9$y2g{hiaGp5|u6h-(1OGNm4ni<(WFx~;bG!VMdnzOCxHayzv<|3~btBK%-}L_5O4g}{N`Hq~th z&QINOCTj=7&Yq~dDDlSv{Q%*2NC>%#f!zNIhu_zyUz44;qd##-?tRR!03Y7s9|k+L ANB{r; literal 0 HcmV?d00001 diff 
--git a/model_executor/layers/quantization/compressed_tensors/transform/linear.py b/model_executor/layers/quantization/compressed_tensors/transform/linear.py new file mode 100644 index 0000000..bd1964e --- /dev/null +++ b/model_executor/layers/quantization/compressed_tensors/transform/linear.py @@ -0,0 +1,260 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable, Generator +from itertools import accumulate + +import torch +from compressed_tensors.transform import ( + TransformArgs, + TransformConfig, + TransformLocation, + TransformScheme, +) +from compressed_tensors.utils import is_match + +from vllm.model_executor.layers.linear import ( + WEIGHT_LOADER_V2_SUPPORTED, + LinearMethodBase, +) +from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 + CompressedTensorsScheme, +) +from vllm.model_executor.layers.quantization.compressed_tensors.transform.module import ( # noqa: E501 + HadamardTransform, +) +from vllm.model_executor.layers.quantization.compressed_tensors.transform.utils import ( # noqa: E501 + TransformTuple, +) + + +class CompressedTensorsLinearTransformMethod(LinearMethodBase): + """ + Wraps `CompressedTensorsLinearMethod` or `UnquantizedLinearMethod` and adds + input and output transforms to either side of the original apply method + """ + + @classmethod + def from_schemes( + cls, + quant_method: LinearMethodBase, + quant_scheme: CompressedTensorsScheme | None, + input_tfms: dict[int, TransformTuple], + output_tfms: dict[int, TransformTuple], + ) -> "CompressedTensorsLinearTransformMethod": + from vllm.model_executor.layers.quantization.compressed_tensors.transform.schemes.linear_qutlass_nvfp4 import ( # noqa: E501 + QutlassNvFP4LinearMethod, + is_qutlass_fp4_scheme, + ) + + assert input_tfms or output_tfms + + if is_qutlass_fp4_scheme(quant_scheme, input_tfms): + return QutlassNvFP4LinearMethod(quant_method, 
input_tfms, output_tfms) + + # hadacore or dense gemm is selected by Transform module + + return cls(quant_method, input_tfms, output_tfms) + + def __init__( + self, + quant_method: LinearMethodBase, + input_tfms: dict[int, TransformTuple], + output_tfms: dict[int, TransformTuple], + ): + self.quant_method = quant_method + self.input_tfms = input_tfms + self.output_tfms = output_tfms + + self.input_transform: HadamardTransform | None = None + self.output_transform: HadamardTransform | None = None + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + # get weight loader for transforms + weight_loader: Callable = extra_weight_attrs.get("weight_loader") # type: ignore[assignment] + + # HACK: UnquantizedLinearMethod does not support weight loader v2, but + # transforms (specifically SharedWeightParameter) requires + # weight loader v2. 
Until UnquantizedLinearMethod supports v2, we must + # hack around this by getting weight loader v1 so ULM can load correctly + quant_method_name = self.quant_method.__class__.__name__ + if quant_method_name not in WEIGHT_LOADER_V2_SUPPORTED: + weight_loader_v1 = layer.weight_loader + extra_weight_attrs["weight_loader"] = weight_loader_v1 + + self.quant_method.create_weights( + layer=layer, + input_size_per_partition=input_size_per_partition, + output_partition_sizes=output_partition_sizes, + input_size=input_size, + output_size=output_size, + params_dtype=params_dtype, + **extra_weight_attrs, + ) + + # validate schemes + num_partitions = len(output_partition_sizes) + self._validate_tfm_schemes(num_partitions) + + # create submodules for weight loading + if len(self.input_tfms) > 0: + scheme_name = list(self.input_tfms.values())[0].scheme_name + location = list(self.input_tfms.values())[0].args.location + transform_name = f"{scheme_name}_{location}" + + transform = HadamardTransform( + self.input_tfms, + layer, + weight_loader, + input_size_per_partition, + output_partition_sizes, + ) + layer.register_module(transform_name, transform) + self.input_transform = transform + + if len(self.output_tfms) > 0: + scheme_name = list(self.output_tfms.values())[0].scheme_name + location = list(self.output_tfms.values())[0].args.location + transform_name = f"{scheme_name}_{location}" + + transform = HadamardTransform( + self.output_tfms, + layer, + weight_loader, + input_size_per_partition, + output_partition_sizes, + ) + layer.register_module(transform_name, transform) + self.output_transform = transform + + # compute partition ranges for slicing activations + starts = [0] + list(accumulate(output_partition_sizes))[:-1] + self.partition_ranges = list(zip(starts, output_partition_sizes)) + + def process_weights_after_loading(self, layer): + self.quant_method.process_weights_after_loading(layer) + + for submodule in layer.children(): + if isinstance(submodule, 
HadamardTransform): + submodule.process_weights_after_loading() + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + if self.input_transform is not None: + x = self.input_transform(x) + + assert bias is None + x = self.quant_method.apply(layer, x, bias) + + # In most cases, input transforms are preferred over output transforms + # (@ksayers): confirm that this is done concurrently + if self.output_transform is not None: + for part_id, (start, length) in enumerate(self.partition_ranges): + x[:, start : start + length] = self.output_transform( + x[:, start : start + length].clone(), part_id=part_id + ) + + return x + + def _validate_tfm_schemes(self, num_partitions: int): + if len(self.input_tfms) > 0: + if 0 not in self.input_tfms: + raise ValueError("Must have same input") + + for part_index in range(num_partitions): + if self.input_tfms[part_index] != self.input_tfms[0]: + raise ValueError("Must have same input") + + if len(self.output_tfms) > 0: + scheme_name = list(self.output_tfms.values())[0].scheme_name + location = list(self.output_tfms.values())[0].args.location + + for tfm in self.output_tfms.values(): + if tfm.scheme_name != scheme_name: + raise ValueError("Must have same scheme name") + if tfm.args.location != location: + raise ValueError("Must have same location") + + return self.input_tfms, self.output_tfms + + +def get_linear_transform_schemes( + layer: torch.nn.Module, + layer_name: str, + transform_config: TransformConfig | None, + packed_modules_mapping: dict[str, list[str]], +) -> tuple[ + dict[int, TransformTuple], dict[int, TransformTuple] +]: # [input_transform, [output_transform, ...]] + # there can only be one transform input scheme per (fused) module + input_tfms = {} + output_tfms = {} + + partition_names = get_layer_partition_names(layer_name, packed_modules_mapping) + + for scheme_name, scheme, args in get_schemes_args(transform_config): + for part_index, part_name 
in enumerate(partition_names): + if ( + is_match(part_name, layer, args.targets, args.ignore) + and args.is_online() + ): + if args.location == TransformLocation.INPUT: + input_tfms[part_index] = TransformTuple(scheme_name, scheme, args) + + elif args.location == TransformLocation.OUTPUT: + output_tfms[part_index] = TransformTuple(scheme_name, scheme, args) + + else: + raise ValueError( + f"Cannot apply `{args.location}` transform to `{layer_name}`" + ) + + return (input_tfms, output_tfms) + + +def get_schemes_args( + transform_config: TransformConfig | None, +) -> Generator[tuple[str, TransformScheme, TransformArgs]]: + if transform_config is None: + return + + for scheme_name, scheme in transform_config.config_groups.items(): + for args in scheme.apply: + yield (scheme_name, scheme, args) + + +def get_layer_partition_names( + layer_name: str, packed_modules_mapping: dict[str, list[str]] +) -> list[str]: + """ + Get all partition names associated with this layer. + Names are returned in order of their partition indices. 
+ + ```python + mapping = {"gate_up_proj", "gate_proj", "up_proj"} + + assert get_layer_partition_names("mlp.gate_up_proj", mapping) == [ + "gate_proj", + "up_proj", + ] + assert get_layer_partition_names("mlp.down_proj", mapping) == ["down_proj"]""" + for fused_suffix, part_suffixes in packed_modules_mapping.items(): + if layer_name.endswith(fused_suffix): + return [ + layer_name.removesuffix(fused_suffix) + part_suffix + for part_suffix in part_suffixes + ] + + return [layer_name] diff --git a/model_executor/layers/quantization/compressed_tensors/transform/module.py b/model_executor/layers/quantization/compressed_tensors/transform/module.py new file mode 100644 index 0000000..f5589c8 --- /dev/null +++ b/model_executor/layers/quantization/compressed_tensors/transform/module.py @@ -0,0 +1,173 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import math +from collections.abc import Callable, Hashable + +import torch +from compressed_tensors.transform import ( + TransformArgs, + TransformLocation, + TransformScheme, +) +from torch import Tensor + +import vllm._custom_ops as ops +from vllm.distributed.parallel_state import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.linear import LinearBase +from vllm.model_executor.layers.quantization.compressed_tensors.transform.utils import ( # noqa: E501 + TransformTuple, +) +from vllm.model_executor.layers.utils import dispatch_unquantized_gemm +from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding +from vllm.model_executor.parameter import SharedWeightParameter + + +class HadamardTransform(torch.nn.Module): + """ + Class which handles weight loading, postprocessing, and application of + transforms. 
Meant to be used with `CompressedTensorsLinearTransformMethod` + and attention transforms method (not implemented yet) + """ + + transforms: dict[int, TransformTuple] # info parsed from transforms config + weight: SharedWeightParameter # container for shared tensors + + scales: dict[int, float] # hadamard scale, usually sqrt(matrix.size(0)) + + def __init__( + self, + transforms: dict[int, TransformTuple], + layer: torch.nn.Module, + weight_loader: Callable, + input_size_per_partition: int, + output_partition_sizes: list[int], + ): + super().__init__() + self.transforms = transforms + self.scales = {} + + if get_tensor_model_parallel_world_size() > 1: + raise NotImplementedError( + "Online transforms with tensor parallelism is not supported" + ) + + # Similar to row/col parallel params, but tensors are separate + # to allow for loading with shared memory + self.weight = SharedWeightParameter(weight_loader=weight_loader) + + # create shared partition data for each partition of the original weight + input_size = input_size_per_partition + for part_index, (_scheme_name, scheme, args) in self.transforms.items(): + output_size = output_partition_sizes[part_index] + weight_size = self._get_weight_size( + layer, scheme, args, input_size, output_size + ) + + data_key = self._get_data_key(scheme, weight_size) + self.weight.add_partition( + part_index, + data_key, + size=(weight_size, weight_size), + dtype=scheme.precision, + ) + + # validate that shared tensors and schemes are correct + self._validate_input_transforms() + + def process_weights_after_loading(self): + for part_id in self.weight.partitions: + data = self.weight.partitions[part_id].data + + # required by torch.compile + self.weight.process_weights_after_loading() + + # precompute scale as a runtime multiply, not division + # do not fold into weight in order to utilize FWHT + self.scales[part_id] = 1 / math.sqrt(data.size(0)) + + # FUTURE: avoid runtime transpose by processing weights + # prior to apply + + def 
forward(self, value: Tensor, part_id: int = 0) -> Tensor: + if part_id not in self.weight.partitions: + return value + + # use hadacore if possible + if self.transforms[part_id].scheme.type == "hadamard": + if self.transforms[part_id].scheme.head_dim is not None: + weight_size = self.transforms[part_id].scheme.head_dim + value = value.unflatten(-1, (-1, weight_size)) + value = ops.hadacore_transform(value) + value = value.flatten(-2, -1) + + return value + + # sylvester transforms are symmetric, inv => transpose => original + return ops.hadacore_transform(value) + + # fall back to dense + else: + weight = self.weight.partitions[part_id] + weight = ( + weight if self.transforms[part_id].args.inverse else weight.T + ) # linear := x(W.T) + scale = self.scales[part_id] + + if self.transforms[part_id].scheme.head_dim is not None: + value = value.unflatten(-1, (-1, weight.size(0))) + value = ( + dispatch_unquantized_gemm()( + self, value.to(weight.dtype), weight, None + ).to(value.dtype) + * scale + ) + value = value.flatten(-2, -1) + + return value + + return ( + dispatch_unquantized_gemm()( + self, value.to(weight.dtype), weight, None + ).to(value.dtype) + * scale + ) + + def _get_data_key(self, scheme: TransformScheme, weight_size: int) -> Hashable: + return (id(scheme), weight_size) + + def _get_weight_size( + self, + layer: torch.nn.Module, + scheme: TransformScheme, + args: TransformArgs, + input_size: int, + output_size: int, + ) -> int: + if scheme.head_dim is not None: + return scheme.head_dim + + if isinstance(layer, LinearBase): + if args.location == TransformLocation.INPUT: + return input_size + + elif args.location == TransformLocation.OUTPUT: + return output_size + + elif isinstance(layer, VocabParallelEmbedding): + if args.location == TransformLocation.INPUT: + return output_size + + elif args.location == TransformLocation.OUTPUT: + return input_size + + raise ValueError() + + def _validate_input_transforms(self): + assert len(self.transforms) > 0 + 
location = list(self.transforms.values())[0].args.location + + if location == TransformLocation.INPUT: + first_data = self.weight.partitions[0].data + for partition in self.weight.partitions.values(): + if partition.data.data_ptr() != first_data.data_ptr(): + raise ValueError("") diff --git a/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py b/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/model_executor/layers/quantization/compressed_tensors/transform/schemes/__pycache__/__init__.cpython-312.pyc b/model_executor/layers/quantization/compressed_tensors/transform/schemes/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7d6b88bb564184c53091c655c87f0998b9f38953 GIT binary patch literal 221 zcmZ8b%LxKO4BhbqB3OtA%~1uh1X~!!HR|B}%w)wCJFoz2uogS;Y6arWli8~eUh1*XT5!u&g)|I8!5KI8-LEJsY;-E2B?@IpbP0h8*S2WJ2CV@{r3Ndd#4; e4VJYYr~28O%h$@`g1Qn$33DUfZKV{oMg0IRk3XCM literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/compressed_tensors/transform/schemes/__pycache__/linear_qutlass_nvfp4.cpython-312.pyc b/model_executor/layers/quantization/compressed_tensors/transform/schemes/__pycache__/linear_qutlass_nvfp4.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..306d6c83749f918b92c41b3faba91725ac290ac4 GIT binary patch literal 2667 zcmbtW&2JM&6rWjp*XzwD8{#BTl+r)~VG*$bxuu|#f(TUoNScyVl`N#y;vFX&)*qQ& z(v zH*aR%d%ySmM{BEsU^wBM_LmWa{^U$!NcG894kovdg)Ho#6#ohi&I&0Z8%l*R=R;07 zE2hM(l#+lCTjHdUidYg6EIbcZtq74xlt>xjEwE(CM9D~2bTOQ25zt4-QWlXV5^+5g zoR?DPu-@{RGo!k|cAw8@3zWDnu_j2)%~N+gJw>vF?RjnCtK_-l#X=G`9eQo;Gdo92 zIzi2xo5|Dc2=S)!7Hew?CW-}zurAv*W{RF;x~`EaBn`KI)E9wl^xDO-WMHXoK%FU? 
zIj^Co*ttT{GrUaJWfCP`k>=7u@G75%f$~>w{Q%?^@~|%jQbJx+FN4?X^I1&a!^jhx zT6n#_!@^e&BLwS*Fe+io_v>^cUJZ5lPRp+_^=S!B`!V0X!tITtyP?s?{0Pc#pi4}# zT|4J`W-d)AzdQ>$B*&z>TjVKXT83q3nKDW9#e(75^F$YDlw06uxZF%{@83%aFIz|y z?82aD(n;bC&K6IedjIN4l5p%RiNYMjBA2-4IN3xtZxP2Jvm{;g@-*R?bA-AH-^up8 z>Di!{ZsvpG1qqSx8evTYH*>)*4|g+`xoZ&3cwvr0HX@&N@av5LSwPPe6pgJ8Jv?x_ zynpC{{J~QsL@(e+s9UQK}pE_KN zghgo!45BnAjlq4v$JT8i-$Kv{N?{A7ge8>07G9K7p(sGY3X$+mVVPF_-P?#22Hm!8 zBA)>e?JO635K23*W@Ia}D5m5kNT*bE9O+TYlP1kQ)fimR0{>f=AUS@-+cFnVH#4`P zj*!Qb#UC%T*%dxPH?Y^VyBsZ{@7f{vaY-nJO5u`Nk}{HwN|D?6d;E>ObDk~)R$3eO zch=g;J=?Jp+~&XUu48Oh;6wQd0oFvf zSRjtl!TtJ&(F91l5m4 z4I}NBQp4rdptms=XeWOpZwB)NZz`?v?a$r^%I+^fUI1XbQTyJtgFher>F7iKT>0?% zhuUyi9xiLckJ@6(la;pq4RzOs3iQB++P$If-ca{EX_Yml)`qnGPc)bBsxWAREX{y z21&w8k6)1Z>)cB~UM!+#!d@{}JB!gl{Yg|3`!}`DmD%OlmFvsbYa;M938}kQhL(qF z5zfg7)}3W%ZKBe)zs5Jz6r^Y?J2;;;9i z9YmA|G=TG`|HE*z!Nr5dM)_|7${Rk57>3Dx$HJDK_E^Zyd5~cFyc6^~?!UVz2p&fr zlZS$MouNa(a!vb qAz(J!J6Ffo;^nTxm3F-*f&~07DO(YkJ?rkn{pEKq{Dt7_pZ*WSkdmeV literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py b/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py new file mode 100644 index 0000000..f0bb47a --- /dev/null +++ b/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 + CompressedTensorsScheme, + CompressedTensorsW4A4Fp4, +) +from vllm.model_executor.layers.quantization.compressed_tensors.transform.linear import ( # noqa: E501 + CompressedTensorsLinearTransformMethod, + TransformTuple, +) + +__all__ = ["is_qutlass_fp4_scheme", "QutlassNvFP4LinearMethod"] + + +def is_qutlass_fp4_scheme( + quant_scheme: CompressedTensorsScheme | 
None, + input_tfms: dict[int, TransformTuple], +) -> bool: + return ( + isinstance(quant_scheme, (CompressedTensorsW4A4Fp4,)) + and len(input_tfms) == 1 + and input_tfms[0].scheme.head_dim == quant_scheme.group_size + ) + + +class QutlassNvFP4LinearMethod(CompressedTensorsLinearTransformMethod): + def create_weights( + self, + layer, + input_size_per_partition, + output_partition_sizes, + input_size, + output_size, + params_dtype, + **extra_weight_attrs, + ): + # initializes fp4 qparams + assert isinstance(layer.scheme, (CompressedTensorsW4A4Fp4,)) + ret = super().create_weights( + layer, + input_size_per_partition, + output_partition_sizes, + input_size, + output_size, + params_dtype, + **extra_weight_attrs, + ) + + assert self.input_transform is not None + assert len(self.input_transform.weight) == 1 + assert self.input_transform.weight[0].size(0) == layer.scheme.group_size + + return ret + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + raise NotImplementedError() diff --git a/model_executor/layers/quantization/compressed_tensors/transform/utils.py b/model_executor/layers/quantization/compressed_tensors/transform/utils.py new file mode 100644 index 0000000..2f353de --- /dev/null +++ b/model_executor/layers/quantization/compressed_tensors/transform/utils.py @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import NamedTuple + +from compressed_tensors.transform import TransformArgs, TransformScheme + +__all__ = ["TransformTuple"] + + +class TransformTuple(NamedTuple): + scheme_name: str + scheme: TransformScheme + args: TransformArgs diff --git a/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py new file mode 100644 index 0000000..25c7d33 --- /dev/null +++ 
b/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py @@ -0,0 +1,224 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import torch + +from vllm.triton_utils import tl, triton + + +def is_weak_contiguous(x: torch.Tensor): + strides = x.stride() + sizes = x.shape + is_not_transpose = strides[0] == 1 and (strides[1] >= max(1, sizes[0])) + is_transpose = strides[1] == 1 and (strides[0] >= max(1, sizes[1])) + return is_transpose or is_not_transpose + + +@triton.jit +def scaled_mm_kernel( + a_ptr, + b_ptr, + scale_a_ptr, + scale_b_ptr, + c_ptr, + bias_ptr, + M, + N, + K, + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + ACCUMULATOR_DTYPE: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + BLOCK_SIZE_SCALE_A: tl.constexpr, + BLOCK_SIZE_SCALE_B: tl.constexpr, +): + pid = tl.program_id(axis=0) + + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + + accumulator_dtype = ACCUMULATOR_DTYPE + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=accumulator_dtype) + + # NOTE: Some tensor inputs are so large, they will cause int32 overflow + # so it is necessary to use tl.int64 for all the offsets, else SEGV will + # eventually occur. + + # Offsets and masks. + offsets_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64) + masks_am = offsets_am < M + + offsets_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64) + masks_bn = offsets_bn < N + + offsets_k = tl.arange(0, BLOCK_SIZE_K).to(tl.int64) + offsets_a = stride_am * offsets_am[:, None] + stride_ak * offsets_k[None, :] + offsets_b = stride_bk * offsets_k[:, None] + stride_bn * offsets_bn[None, :] + + # NOTE: BLOCK_SIZE_SCALE_A could be 1 or BLOCK_SIZE_M, so need to create + # appropriate offsets and masks for each case. Same goes for + # BLOCK_SIZE_SCALE_B. 
+ offsets_scale_am = ( + tl.arange(0, BLOCK_SIZE_SCALE_A) + + (BLOCK_SIZE_SCALE_A > 1) * pid_m * BLOCK_SIZE_M + ) + masks_scale_am = offsets_scale_am < M + + offsets_scale_bn = ( + tl.arange(0, BLOCK_SIZE_SCALE_B) + + (BLOCK_SIZE_SCALE_B > 1) * pid_n * BLOCK_SIZE_N + ) + masks_scale_bn = offsets_scale_bn < N + + a_ptrs = a_ptr + offsets_a + b_ptrs = b_ptr + offsets_b + + scale_a_ptrs = scale_a_ptr + offsets_scale_am + scale_b_ptrs = scale_b_ptr + offsets_scale_bn + + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + masks_k = offsets_k < K + masks_a = masks_am[:, None] & masks_k[None, :] + a = tl.load(a_ptrs, mask=masks_a) + + masks_b = masks_k[:, None] & masks_bn[None, :] + b = tl.load(b_ptrs, mask=masks_b) + + # Accumulate results. + accumulator = tl.dot(a, b, accumulator, out_dtype=accumulator_dtype) + + offsets_k += BLOCK_SIZE_K + a_ptrs += BLOCK_SIZE_K * stride_ak + b_ptrs += BLOCK_SIZE_K * stride_bk + + # Apply scale at end. + masks_scale_a = masks_scale_am[:, None] & (tl.arange(0, 1) < 1)[:, None] + scale_a = tl.load(scale_a_ptrs[:, None], masks_scale_a) + # Need to broadcast to the appropriate size, if scale_a is already + # (BLOCK_SIZE_M, 1) then it will broadcast to its own shape. Same goes + # for scale_b below. + scale_a = scale_a.broadcast_to((BLOCK_SIZE_M, 1)) + accumulator = scale_a * accumulator.to(tl.float32) + + masks_scale_b = masks_scale_bn[:, None] & (tl.arange(0, 1) < 1)[None, :] + scale_b = tl.load(scale_b_ptrs[:, None], masks_scale_b) + scale_b = scale_b.broadcast_to((BLOCK_SIZE_N, 1)) + accumulator = scale_b.T * accumulator.to(tl.float32) + + # Convert to output format. + c = accumulator.to(c_ptr.type.element_ty) + + # Add bias, it's already in output format, so add it after conversion. 
+ if bias_ptr: + offsets_bias = offsets_bn + bias_ptrs = bias_ptr + offsets_bias + bias_mask = offsets_bias < N + bias = tl.load(bias_ptrs, bias_mask) + c += bias + + # Save output + offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64) + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64) + offs_cm = offs_cm.to(tl.int64) + offs_cn = offs_cn.to(tl.int64) + c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) + + tl.store(c_ptrs, c, mask=c_mask) + + +# input - [M, K] +# weight - [K, N] +def triton_scaled_mm( + input: torch.Tensor, + weight: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: type[torch.dtype], + bias: torch.Tensor | None = None, + block_size_m: int = 32, + block_size_n: int = 32, + block_size_k: int = 32, + use_heuristic=True, +) -> torch.Tensor: + M, K = input.shape + N = weight.shape[1] + + assert N > 0 and K > 0 and M > 0 + assert weight.shape[0] == K + assert input.dtype == weight.dtype + + scale_a = scale_a.reshape(-1, 1) if scale_a.dim() <= 1 else scale_a + scale_b = scale_b.reshape(-1, 1) if scale_b.dim() <= 1 else scale_b + + assert scale_a.dtype == scale_b.dtype and scale_a.is_floating_point() + assert scale_a.shape[1] == 1 and (scale_a.shape[0] == 1 or scale_a.shape[0] == M) + assert scale_b.shape[1] == 1 and (scale_b.shape[0] == 1 or scale_b.shape[0] == N) + assert out_dtype.is_floating_point + assert bias is None or bias.is_floating_point() + assert is_weak_contiguous(input) + assert is_weak_contiguous(weight) + + grid = lambda META: ( + triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]), + ) + + result = torch.empty((M, N), dtype=out_dtype, device=input.device) + + has_scalar = lambda x: x.shape[0] == 1 and x.shape[1] == 1 + + if use_heuristic: + is_small_N = N < 8192 + next_power_of_2_M = max(32, triton.next_power_of_2(M)) + if next_power_of_2_M <= 32: + tile_shape 
def should_ignore_layer(
    layer_name: str | None,
    ignore: Iterable[str] = tuple(),
    fused_mapping: Mapping[str, list[str]] = MappingProxyType({}),
) -> bool:
    """
    Decide whether a layer is listed in the checkpoint's ``ignore`` targets.

    Unfused layers (e.g. ``down_proj``, ``o_proj``) are matched against
    ``ignore`` directly. Fused layers (e.g. ``qkv_proj``, ``gate_up_proj``)
    are not fused in the safetensors checkpoint, so the fused name is
    expanded into its shard names and every shard is matched instead; all
    shards must agree.

    :param layer_name: dotted module path, e.g.
        ``model.layers.0.self_attn.qkv_proj``. ``None`` is never ignored.
    :param ignore: targets to match against (exact names, or regexes
        prefixed with ``re:``)
    :param fused_mapping: map from fused projection name to the projection
        names of its shards
    :return: True if the layer (or all of its shards) matches ``ignore``
    :raises ValueError: if some shards of a fused layer match ``ignore``
        and others do not
    """
    if layer_name is None:
        return False

    # layer_name = model.layers.0.self_attn.qkv_proj
    # proj_name = qkv_proj
    proj_name = layer_name.split(".")[-1]

    # Unfused layers already match the checkpoint naming. A fused layer
    # that is itself listed verbatim in ``ignore`` takes this path as well.
    if proj_name not in fused_mapping or layer_name in ignore:
        return check_equal_or_regex_match(layer_name=layer_name, targets=ignore)

    shard_proj_names = fused_mapping[proj_name]

    # Swap only the trailing component. ``str.replace`` would also rewrite
    # earlier path components that happen to contain ``proj_name``.
    parent_prefix = layer_name[: len(layer_name) - len(proj_name)]

    ignore_layer: bool | None = None
    for shard_proj_name in shard_proj_names:
        ignore_shard = check_equal_or_regex_match(
            layer_name=parent_prefix + shard_proj_name, targets=ignore
        )

        if ignore_layer is None:
            # First shard decides the verdict for the fused layer.
            ignore_layer = ignore_shard
        elif ignore_shard != ignore_layer:
            # Later shards must agree with the first one.
            raise ValueError(
                f"Found a different quantization schemes for "
                f"{shard_proj_names} in {layer_name}. vLLM "
                "requires all to use the same scheme."
            )

    # Only reachable with ``None`` when the fused mapping entry is empty.
    assert ignore_layer is not None
    return ignore_layer
def find_matched_target(
    layer_name: str | None,
    module: Module,
    targets: Iterable[str],
    fused_mapping: Mapping[str, list[str]] = MappingProxyType({}),
) -> str:
    """
    Look up which "target" of the compressed-tensors config a layer
    corresponds to.

    A compressed-tensors config is organised in config_groups, so each
    layer can be quantized with a different scheme. The targets of a
    config_group are layer names (or regexes for layer names) or names of
    torch Modules.

    The lookup tries, in this order, and stops at the first hit:

    1. the layer name against the targets,
    2. the module's class name against the targets (substring match),
    3. the fused layer expanded into its component names; *all*
       components must match and the target of the first component is
       returned.

    :param layer_name: layer name; ``None`` is treated as an empty name
    :param module: torch.nn.Module
    :param targets: list of targets to match the layer against
    :param fused_mapping: map from fused layer names to its components
    :return: the matching target
    :raises ValueError: if none of the three lookups finds a target
    """
    name = "" if layer_name is None else layer_name

    # Step 1: the dotted layer name itself.
    found = _find_first_match(name, targets)

    # Step 2: the module's class name, allowing substring matches.
    if not found:
        found = _find_first_match(module.__class__.__name__, targets, True)

    # Step 3: the unfused components of a fused layer.
    if not found:
        found = _match_fused_layer(name, targets, fused_mapping)

    if found is None:
        raise ValueError(
            f"Unable to find matching target for {name} in the "
            "compressed-tensors config."
        )

    return found
def _is_equal_or_regex_match(
    value: str, target: str, check_contains: bool = False
) -> bool:
    """
    Tell whether ``value`` matches ``target``.

    A target starting with ``re:`` is treated as a regex (the prefix is
    stripped) and matched from the start of ``value``. Otherwise, with
    ``check_contains`` set, the target only has to occur inside the value,
    compared case-insensitively. Without it, the two strings must be
    exactly equal.
    """
    # The regex form wins over every other comparison mode.
    if target.startswith("re:"):
        return re.match(target[3:], value) is not None

    if check_contains:
        return target.lower() in value.lower()

    return target == value
class DeepSpeedFPConfig(QuantizationConfig):
    """Config for DeepSpeed FP quantizer. It supports fp6 and fp8.

    Args:
        weight_bits: the target quantization bits, 6 or 8.
        group_size: group size for quantization, default to 512.
    """

    def __init__(
        self,
        weight_bits: int = 8,
        group_size: int = 512,
    ) -> None:
        super().__init__()
        self.weight_bits = weight_bits
        self.group_size = group_size
        self.valid_types = [torch.bfloat16, torch.float16]

        if self.weight_bits not in (6, 8):
            raise ValueError(
                "Currently, only 6-bit or 8-bit weight quantization are "
                f"supported for DeepSpeed FP quantizaiton, but got "
                f"{self.weight_bits} bits."
            )

    def __repr__(self) -> str:
        # Both fields belong inside the parentheses; the closing bracket
        # used to be emitted before ``group_size``.
        return (
            f"DeepSpeedFPConfig(weight_bits={self.weight_bits}, "
            f"group_size={self.group_size})"
        )

    @classmethod
    def get_name(cls) -> QuantizationMethods:
        """Return the name this quantization method is identified by."""
        return "deepspeedfp"

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "DeepSpeedFPConfig":
        """Build the config from the ``bits`` and ``group_size`` entries."""
        weight_bits = cls.get_from_keys(config, ["bits"])
        group_size = cls.get_from_keys(config, ["group_size"])
        return cls(weight_bits=weight_bits, group_size=group_size)

    def get_linear_method(self) -> "DeepSpeedFPLinearMethod":
        """Return a linear method bound to this config."""
        return DeepSpeedFPLinearMethod(self)

    @classmethod
    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
        """Return the activation dtypes this method accepts."""
        return [torch.half, torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        """Return the minimum device capability required."""
        # TODO: the value is a placeholder ("need to figure it out").
        return 60

    @staticmethod
    def get_config_filenames() -> list[str]:
        """Return the file names searched for the quantization config."""
        return [
            "quant_config.json",
            "quantize_config.json",
        ]

    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> Optional["DeepSpeedFPLinearMethod"]:
        """Return the linear method for ``LinearBase`` layers, else None."""
        if isinstance(layer, LinearBase):
            return DeepSpeedFPLinearMethod(self)
        return None
class DeepSpeedFPParameter(nn.Parameter):
    """
    DeepSpeedFP quantized parameter class that implements fp8/fp6
    quantization deepspeed. Weights are stored in quantized form on
    GPUs, and can be dequantized on-the-fly when needed by the model.

    The underlying storage is an ``int8`` tensor with one row per
    quantization group. The logical shape and dtype of the weight are kept
    on the attached ``FP_Quantize`` instance.
    """

    def __new__(
        cls,
        orig_shape: torch.Size,
        params_dtype: torch.dtype,
        quant_config: DeepSpeedFPConfig,
    ):
        """
        Allocate the quantized storage for a weight of ``orig_shape``.

        :param orig_shape: logical (dequantized) shape of the weight
        :param params_dtype: dtype recorded on the quantizer as the
            original dtype
        :param quant_config: supplies ``group_size`` and ``weight_bits``
        :raises ImportError: if deepspeed is missing or older than 0.14.2
        """
        try:
            import deepspeed

            if version.parse(deepspeed.__version__) < version.parse("0.14.2"):
                raise ImportError(
                    "deepspeed version is wrong. Please install deepspeed>=0.14.2."
                )
            from deepspeed.ops.fp_quantizer import FP_Quantize
        except ImportError as err:
            # The requirement is quoted so the command can be pasted into a
            # shell; unquoted, ``>=0.14.2`` is parsed as an output
            # redirection and the version constraint is lost.
            raise ImportError(
                "Please install deepspeed>=0.14.2 via "
                "`pip install 'deepspeed>=0.14.2'` to use "
                "deepspeedfp quantizer."
            ) from err
        # One row per group: the packed weight bits of the group plus 4
        # extra bytes (presumably per-group scale metadata -- confirm
        # against FP_Quantize).
        # NOTE(review): numel() is floor-divided by group_size, so a shape
        # that is not a multiple of group_size is silently truncated here.
        data = torch.empty(
            (
                orig_shape.numel() // quant_config.group_size,
                quant_config.group_size * quant_config.weight_bits // 8 + 4,
            ),
            dtype=torch.int8,
        )
        self = torch.Tensor._make_subclass(cls, data, data.requires_grad)
        self.orig_shape = orig_shape
        self.quant_config = quant_config
        self.fp_quantizer = FP_Quantize(group_size=quant_config.group_size)
        self.fp_quantizer.orig_shape = orig_shape
        self.fp_quantizer.orig_dtype = params_dtype
        return self

    def ds_quantize_(self, tensor: torch.Tensor):
        """
        Quantize ``tensor`` and copy the result into this parameter in place.

        ``tensor`` must live on a CUDA device and must not already be int8.
        """
        assert tensor.device.type == "cuda" and tensor.dtype != torch.int8
        return self.data.copy_(
            self.fp_quantizer.quantize(
                tensor.data,
                q_bits=self.quant_config.weight_bits,
            )
        )

    def ds_dequantize(self, fp_out=None) -> torch.Tensor:
        """
        Return a tensor containing the dequantized weights of this parameter.

        ``fp_out`` is forwarded unchanged to the quantizer.
        """
        assert self.data.device.type == "cuda" and self.data.dtype == torch.int8
        return self.fp_quantizer.dequantize(
            self.data, fp_out=fp_out, q_bits=self.quant_config.weight_bits
        )

    def ds_selective_dequantize(self, indices, fp_out=None) -> torch.Tensor:
        """
        Return a tensor where only the weights at `indices` are dequantized
        (to save HBM -> SRAM bandwidth).
        """
        assert self.data.device.type == "cuda" and self.data.dtype == torch.int8
        return self.fp_quantizer.selective_dequantize(
            self.data, indices, fp_out=fp_out, q_bits=self.quant_config.weight_bits
        )
isinstance(layer, LinearBase): + return UnquantizedLinearMethod() + elif isinstance(layer, FusedMoE): + return ExpertsInt8MoEMethod(self, layer.moe_config) + return None + + +class ExpertsInt8MoEMethod(FusedMoEMethodBase): + def __init__( + self, + quant_config: ExpertsInt8Config, + moe: FusedMoEConfig, + ): + super().__init__(moe) + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + int8_dtype = torch.int8 + + assert "weight_loader" in extra_weight_attrs + weight_loader = extra_weight_attrs["weight_loader"] + wrapped_weight_loader = ExpertsInt8MoEMethod.quantizing_weight_loader( + layer, weight_loader + ) + extra_weight_attrs["weight_loader"] = wrapped_weight_loader + + # Fused gate_up_proj (column parallel) + w13_weight = torch.nn.Parameter( + torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=int8_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + # down_proj (row parallel) + w2_weight = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=int8_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + w13_scale = torch.nn.Parameter( + torch.zeros( + num_experts, 2 * intermediate_size_per_partition, dtype=torch.float32 + ), + requires_grad=False, + ) + layer.register_parameter("w13_scale", w13_scale) + + w2_scale = torch.nn.Parameter( + torch.zeros(num_experts, hidden_size, dtype=torch.float32), + requires_grad=False, + ) + layer.register_parameter("w2_scale", w2_scale) + + def get_fused_moe_quant_config( + self, layer: torch.nn.Module + ) -> FusedMoEQuantConfig | None: + return int8_w8a16_moe_quant_config( + 
w1_scale=layer.w13_scale, w2_scale=layer.w2_scale, w1_zp=None, w2_zp=None + ) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if enable_eplb: + raise NotImplementedError( + "EPLB not supported for `ExpertsInt8MoEMethod` yet." + ) + + from vllm.model_executor.layers.fused_moe import fused_experts + + topk_weights, topk_ids, _ = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, + ) + + return fused_experts( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=expert_map, + quant_config=self.moe_quant_config, + ) + + @staticmethod + def quantizing_weight_loader(layer, weight_loader): + def quantize_and_call_weight_loader( + param: torch.nn.Parameter, + 
def quantize_in_place_and_get_scales(weight: torch.Tensor) -> torch.Tensor:
    """
    Quantize ``weight`` row-wise to the int8 value range, in place.

    Each row is divided by its own scale (row abs-max / 127), rounded and
    clamped to ``[-127, 127]``. The values are written back into ``weight``
    (views included); the dtype of ``weight`` is not changed.

    :param weight: 2-D floating point tensor, modified in place
    :return: per-row scales with shape ``(rows, 1)``. Rows that are all
        zero report a scale of 0 and stay all zero.
    """
    vmax = torch.iinfo(torch.int8).max
    scales = torch.max(torch.abs(weight), dim=1, keepdim=True)[0] / vmax

    # An all-zero row has scale 0; dividing by it would turn the row into
    # NaN (0 / 0). Divide such rows by 1 instead so they stay zero.
    divisors = torch.where(scales == 0, torch.ones_like(scales), scales)

    weight.div_(divisors)
    weight.round_()
    weight.clamp_(-vmax, vmax)

    return scales
vllm.model_executor.layers.linear import ( + LinearBase, + LinearMethodBase, + UnquantizedLinearMethod, +) +from vllm.model_executor.layers.quantization import QuantizationMethods +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, + QuantizeMethodBase, +) +from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( + apply_fp8_marlin_linear, + prepare_fp8_layer_for_marlin, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + GroupShape, + is_layer_skipped, +) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + Fp8LinearOp, + maybe_create_device_identity, + normalize_e4m3fn_to_e4m3fnuz, +) +from vllm.model_executor.parameter import ( + ChannelQuantScaleParameter, + ModelWeightParameter, +) +from vllm.platforms import current_platform + +logger = init_logger(__name__) + + +class FBGEMMFp8Config(QuantizationConfig): + """Config class for FBGEMM Fp8.""" + + def __init__(self, ignore_list: list[str], input_scale_ub: float): + super().__init__() + self.ignore_list = ignore_list if ignore_list else [] + self.input_scale_ub = input_scale_ub + + # For GPUs that lack FP8 hardware support, we can leverage the Marlin + # kernel for fast weight-only FP8 quantization + self.use_marlin = not current_platform.has_device_capability(89) + + @classmethod + def get_name(cls) -> QuantizationMethods: + return "fbgemm_fp8" + + @classmethod + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.bfloat16, torch.float16] + + @classmethod + def get_min_capability(cls) -> int: + return 80 + + @classmethod + def get_config_filenames(cls) -> list[str]: + return [] + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "FBGEMMFp8Config": + ignore_list = cls.get_from_keys(config, ["modules_to_not_convert"]) + input_scale_ub = cls.get_from_keys(config, ["activation_scale_ub"]) + return cls(ignore_list=ignore_list, input_scale_ub=input_scale_ub) + + def 
class FBGEMMFp8LinearMethod(LinearMethodBase):
    """Linear method for FBGEMM FP8 checkpoints.

    Weights are stored as ``float8_e4m3fn`` with one float32 scale per
    output channel. Depending on ``quant_config.use_marlin`` the forward
    pass goes either through the FP8 Marlin kernel or through
    ``Fp8LinearOp`` configured for dynamic, per-token activation
    quantization.
    """

    def __init__(self, quant_config: FBGEMMFp8Config):
        self.quant_config = quant_config
        # Dynamic (non-static) activation quantization, one group per token.
        self.fp8_linear = Fp8LinearOp(
            act_quant_static=False, act_quant_group_shape=GroupShape.PER_TOKEN
        )
        # Captured once at construction time, not per call.
        self.out_dtype = torch.get_default_dtype()

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: list[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        """Create ``weight``, ``weight_scale`` and ``input_scale_ub`` on
        ``layer`` and record the partition sizes on it.

        ``input_size`` and ``output_size`` are accepted for interface
        compatibility and discarded. Only ``weight_loader`` is read from
        ``extra_weight_attrs``.
        """
        maybe_create_device_identity()
        weight_loader = extra_weight_attrs.get("weight_loader")
        del input_size, output_size
        output_size_per_partition = sum(output_partition_sizes)

        layer.logical_widths = output_partition_sizes

        layer.input_size_per_partition = input_size_per_partition
        layer.output_size_per_partition = output_size_per_partition
        layer.orig_dtype = params_dtype

        # WEIGHT
        # Allocated as (output, input); transposed after loading.
        weight = ModelWeightParameter(
            data=torch.empty(
                output_size_per_partition,
                input_size_per_partition,
                dtype=torch.float8_e4m3fn,
            ),
            input_dim=1,
            output_dim=0,
            weight_loader=weight_loader,
        )
        layer.register_parameter("weight", weight)

        # WEIGHT SCALE
        # One float32 scale per output channel, shape (output, 1).
        weight_scale = ChannelQuantScaleParameter(
            data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32),
            output_dim=0,
            weight_loader=weight_loader,
        )
        # Pre-fill with the most negative float32 value (presumably a
        # sentinel for "scale not loaded yet" -- TODO confirm).
        weight_scale[:] = torch.finfo(torch.float32).min
        layer.register_parameter("weight_scale", weight_scale)

        # INPUT SCALE UPPER BOUND
        input_scale_ub = torch.nn.Parameter(
            torch.tensor((self.quant_config.input_scale_ub), dtype=torch.float32),
            requires_grad=False,
        )
        layer.input_scale_ub = input_scale_ub

    def process_weights_after_loading(self, layer: Module) -> None:
        """Re-wrap the loaded tensors as plain Parameters, normalise them
        for fnuz platforms, transpose the weight and, when Marlin is used,
        repack the layer for the Marlin kernel.
        """
        # required by torch.compile
        layer.weight_scale = Parameter(layer.weight_scale.data, requires_grad=False)
        layer.weight = Parameter(layer.weight.data, requires_grad=False)

        weight = layer.weight

        if current_platform.is_fp8_fnuz():
            # No input scale exists for this method, so None is passed in.
            weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
                weight=weight, weight_scale=layer.weight_scale, input_scale=None
            )
            if input_scale is not None:
                layer.input_scale = Parameter(input_scale, requires_grad=False)
            layer.weight_scale = Parameter(weight_scale, requires_grad=False)

        # Store the weight transposed relative to how it was allocated.
        layer.weight = Parameter(weight.t(), requires_grad=False)
        if self.quant_config.use_marlin:
            prepare_fp8_layer_for_marlin(layer)
            # Activations not quantized for marlin.
            del layer.input_scale_ub

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Run the quantized linear layer on ``x`` with optional ``bias``."""
        if self.quant_config.use_marlin:
            # Marlin path; reads ``layer.workspace``, which this class does
            # not create (presumably set by prepare_fp8_layer_for_marlin --
            # TODO confirm).
            return apply_fp8_marlin_linear(
                input=x,
                weight=layer.weight,
                weight_scale=layer.weight_scale,
                workspace=layer.workspace,
                size_n=layer.output_size_per_partition,
                size_k=layer.input_size_per_partition,
                bias=bias,
            )

        # FP8 path: no static input scale, only the configured upper bound.
        return self.fp8_linear.apply(
            input=x,
            weight=layer.weight,
            weight_scale=layer.weight_scale,
            out_dtype=self.out_dtype,
            input_scale=None,
            input_scale_ub=layer.input_scale_ub,
            bias=bias,
        )
import Parameter + +import vllm.envs as envs +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm import _custom_ops as ops +from vllm._aiter_ops import rocm_aiter_ops +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.logger import init_logger +from vllm.model_executor.layers.batch_invariant import ( + vllm_is_batch_invariant, +) +from vllm.model_executor.layers.fused_moe import ( + FusedMoE, + FusedMoEActivationFormat, + FusedMoEMethodBase, + FusedMoEPermuteExpertsUnpermute, + FusedMoEPrepareAndFinalize, + FusedMoeWeightScaleSupported, +) +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEQuantConfig, + RoutingMethodType, + fp8_w8a8_moe_quant_config, +) +from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe +from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod +from vllm.model_executor.layers.linear import ( + LinearBase, + LinearMethodBase, + UnquantizedLinearMethod, +) +from vllm.model_executor.layers.quantization import QuantizationMethods +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, + QuantizeMethodBase, +) +from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod +from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( + FlashinferMoeBackend, + apply_flashinfer_per_tensor_scale_fp8, + build_flashinfer_fp8_cutlass_moe_prepare_finalize, + flashinfer_cutlass_moe_fp8, + get_flashinfer_moe_backend, + register_moe_scaling_factors, + rotate_flashinfer_fp8_moe_weights, + select_cutlass_fp8_gemm_impl, + swap_w13_to_w31, +) +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + W8A8BlockFp8LinearOp, + create_fp8_input_scale, + create_fp8_scale_parameter, + create_fp8_weight_parameter, + deepgemm_post_process_fp8_weight_block, + maybe_post_process_fp8_weight_block, + process_fp8_weight_block_strategy, + process_fp8_weight_tensor_strategy, + 
class Fp8MoeBackend(Enum):
    """Kernel backends an FP8 MoE layer can run on.

    ``get_fp8_moe_backend`` selects one of these as the primary backend.
    """

    # Not returned by get_fp8_moe_backend.
    NONE = 0
    # FlashInfer with the TensorRT-LLM backend.
    FLASHINFER_TRTLLM = 1
    # FlashInfer with its other (CUTLASS) backend.
    FLASHINFER_CUTLASS = 2
    # DeepGEMM, used with block-quantized weights.
    DEEPGEMM = 3
    # CUTLASS block-scaled grouped GEMM, used with block-quantized weights.
    CUTLASS_BLOCK_SCALED_GROUPED_GEMM = 4
    # Marlin weight-only path for devices below capability 89, or forced
    # via VLLM_TEST_FORCE_FP8_MARLIN.
    MARLIN = 5
    # Default when nothing else is selected.
    TRITON = 6
class Fp8Config(QuantizationConfig):
    """Quantization config for FP8.

    Holds whether the checkpoint is already serialized in FP8, the
    activation scheme (one of ``ACTIVATION_SCHEMES``), the layers to leave
    unquantized, and the optional two-element weight block size used for
    block-wise quantization.
    """

    def __init__(
        self,
        is_checkpoint_fp8_serialized: bool = False,
        activation_scheme: str = "dynamic",
        ignored_layers: list[str] | None = None,
        weight_block_size: list[int] | None = None,
    ) -> None:
        super().__init__()

        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized

        if activation_scheme not in ACTIVATION_SCHEMES:
            raise ValueError(f"Unsupported activation scheme {activation_scheme}")
        self.activation_scheme = activation_scheme
        self.ignored_layers = ignored_layers if ignored_layers else []

        # Block-wise quantization comes with three extra requirements,
        # checked in this order.
        block_quant = weight_block_size is not None
        if block_quant and not is_checkpoint_fp8_serialized:
            raise ValueError(
                "The block-wise quantization only supports fp8-serialized "
                "checkpoint for now."
            )
        if block_quant and len(weight_block_size) != 2:
            raise ValueError(
                "The quantization block size of weight must have 2 "
                f"dimensions, but got {len(weight_block_size)} dimensions"
            )
        if block_quant and activation_scheme != "dynamic":
            raise ValueError(
                "The block-wise quantization only supports "
                "dynamic activation scheme for now, but got "
                f"{activation_scheme} activation scheme."
            )
        self.weight_block_size = weight_block_size

    @classmethod
    def get_name(cls) -> QuantizationMethods:
        """Return the name this quantization method is identified by."""
        return "fp8"

    @classmethod
    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
        """Return the activation dtypes this method accepts."""
        return [torch.bfloat16, torch.half]

    @classmethod
    def get_min_capability(cls) -> int:
        """Return the minimum device capability required."""
        return 80

    @classmethod
    def get_config_filenames(cls) -> list[str]:
        """Return the config file names to search for (none)."""
        return []

    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
        """Translate the ignored layer names with ``hf_to_vllm_mapper``."""
        if self.ignored_layers is None:
            return
        self.ignored_layers = hf_to_vllm_mapper.apply_list(self.ignored_layers)

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "Fp8Config":
        """Build the config from a quantization config dict.

        ``modules_to_not_convert`` is consulted only when
        ``ignored_layers`` is missing or empty.
        """
        serialized = "fp8" in cls.get_from_keys(config, ["quant_method"])
        scheme = cls.get_from_keys(config, ["activation_scheme"])
        skipped = cls.get_from_keys_or(config, ["ignored_layers"], None)
        block_size = cls.get_from_keys_or(config, ["weight_block_size"], None)
        if not skipped:
            skipped = cls.get_from_keys_or(config, ["modules_to_not_convert"], None)
        return cls(
            is_checkpoint_fp8_serialized=serialized,
            activation_scheme=scheme,
            ignored_layers=skipped,
            weight_block_size=block_size,
        )

    def get_xpu_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> Optional["QuantizeMethodBase"]:
        """Pick the quantization method for ``layer`` on XPU platforms."""
        from vllm.attention.layer import Attention
        from vllm.model_executor.layers.quantization.ipex_quant import (
            XPUFp8LinearMethod,
            XPUFp8MoEMethod,
        )

        # The XPU methods get a fresh copy of this config.
        xpu_config = Fp8Config(
            is_checkpoint_fp8_serialized=self.is_checkpoint_fp8_serialized,
            activation_scheme=self.activation_scheme,
            ignored_layers=self.ignored_layers,
            weight_block_size=self.weight_block_size,
        )

        if isinstance(layer, LinearBase):
            skipped = is_layer_skipped(
                prefix=prefix,
                ignored_layers=self.ignored_layers,
                fused_mapping=self.packed_modules_mapping,
            )
            if skipped:
                return UnquantizedLinearMethod()
            return XPUFp8LinearMethod(xpu_config)
        if isinstance(layer, FusedMoE):
            return XPUFp8MoEMethod(xpu_config, layer)
        if isinstance(layer, Attention):
            return Fp8KVCacheMethod(self)
        return None

    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> Optional["QuantizeMethodBase"]:
        """Pick the quantization method for ``layer``.

        Linear and MoE layers listed as skipped get their unquantized
        method; attention layers get the FP8 KV-cache method; any other
        layer type yields None. XPU platforms are delegated to
        ``get_xpu_quant_method``.
        """
        from vllm.attention.layer import Attention  # Avoid circular import

        if current_platform.is_xpu():
            return self.get_xpu_quant_method(layer, prefix)

        if isinstance(layer, LinearBase):
            skipped = is_layer_skipped(
                prefix=prefix,
                ignored_layers=self.ignored_layers,
                fused_mapping=self.packed_modules_mapping,
            )
            if skipped:
                return UnquantizedLinearMethod()
            return Fp8LinearMethod(self)

        if isinstance(layer, FusedMoE):
            skipped = is_layer_skipped(
                prefix=prefix,
                ignored_layers=self.ignored_layers,
                fused_mapping=self.packed_modules_mapping,
            )
            if skipped:
                return UnquantizedFusedMoEMethod(layer.moe_config)
            return Fp8MoEMethod(self, layer)

        if isinstance(layer, Attention):
            return Fp8KVCacheMethod(self)
        return None

    def get_cache_scale(self, name: str) -> str | None:
        """
        Check whether the param name matches the format for k/v cache scales
        in compressed-tensors. If this is the case, return its equivalent
        param name expected by vLLM

        :param name: param name
        :return: matching param name for KV cache scale in vLLM
        """
        # Projection output scales, tried in the order k, v, q.
        if name.endswith(".output_scale"):
            for proj in ("k", "v", "q"):
                if f".{proj}_proj" in name:
                    return name.replace(
                        f".{proj}_proj.output_scale", f".attn.{proj}_scale"
                    )
        if name.endswith("self_attn.prob_output_scale"):
            return name.replace(".prob_output_scale", ".attn.prob_scale")
        # If no matches, return None
        return None
+ Supports loading FP8 checkpoints with static weight scale and + dynamic/static activation scale. + + Also supports loading quantized FP16/BF16 model checkpoints with dynamic + activation scaling. The weight scaling factor will be initialized after + the model weights are loaded. + + Limitations: + 1. Only support per-tensor quantization due to torch._scaled_mm support. + 2. Only support float8_e4m3fn data type due to the limitation of + torch._scaled_mm (https://github.com/pytorch/pytorch/blob/2e48b39603411a41c5025efbe52f89560b827825/aten/src/ATen/native/cuda/Blas.cpp#L854-L856) + + Args: + quant_config: The quantization config. + """ + + def __init__(self, quant_config: Fp8Config): + self.quant_config = quant_config + self.cutlass_block_fp8_supported = cutlass_block_fp8_supported() + self.out_dtype = torch.get_default_dtype() + + # For GPUs that lack FP8 hardware support, we can leverage the Marlin + # kernel for fast weight-only FP8 quantization + self.use_marlin = ( + not current_platform.has_device_capability(89) + or envs.VLLM_TEST_FORCE_FP8_MARLIN + ) + # Disable marlin for rocm + if current_platform.is_rocm(): + self.use_marlin = False + if vllm_is_batch_invariant(): + self.use_marlin = False + + self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enaled() + self.use_deep_gemm = is_deep_gemm_supported() + + self.weight_block_size = self.quant_config.weight_block_size + self.block_quant = self.weight_block_size is not None + self.act_q_static = self.quant_config.activation_scheme == "static" + if self.weight_block_size: + self.act_q_group_shape = GroupShape(1, self.weight_block_size[0]) + else: + # Use per-token quantization for better perf if dynamic and cutlass + if not self.act_q_static and cutlass_fp8_supported(): + self.act_q_group_shape = GroupShape.PER_TOKEN + else: + self.act_q_group_shape = GroupShape.PER_TENSOR + + if self.block_quant: + assert not self.act_q_static + assert self.weight_block_size is not None + 
self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp( + weight_group_shape=GroupShape(*self.weight_block_size), + act_quant_group_shape=self.act_q_group_shape, + cutlass_block_fp8_supported=self.cutlass_block_fp8_supported, + use_aiter_and_is_supported=self.use_aiter_and_is_supported, + ) + else: + self.fp8_linear = Fp8LinearOp( + act_quant_static=self.act_q_static, + act_quant_group_shape=self.act_q_group_shape, + ) + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + maybe_create_device_identity() + + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + layer.orig_dtype = params_dtype + layer.weight_block_size = None + + if self.block_quant: + assert self.weight_block_size is not None + layer.weight_block_size = self.weight_block_size + validate_fp8_block_shape( + layer, + input_size, + output_size, + input_size_per_partition, + output_partition_sizes, + self.weight_block_size, + ) + + # WEIGHT + if self.quant_config.is_checkpoint_fp8_serialized: + weight = create_fp8_weight_parameter( + output_size_per_partition, input_size_per_partition, weight_loader + ) + else: + # For non-serialized checkpoints, use original dtype + weight = ModelWeightParameter( + data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype=params_dtype, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight", weight) + + # If checkpoint is serialized fp8, load them. + # Otherwise, wait until process_weights_after_loading. 
+ if self.quant_config.is_checkpoint_fp8_serialized: + # WEIGHT SCALE + if not self.block_quant: + scale = create_fp8_scale_parameter( + PerTensorScaleParameter, + output_partition_sizes, + input_size_per_partition, + None, + weight_loader, + ) + set_weight_attrs(scale, {"scale_type": "weight_scale"}) + layer.register_parameter("weight_scale", scale) + else: + assert not self.act_q_static + assert self.weight_block_size is not None + scale = create_fp8_scale_parameter( + BlockQuantScaleParameter, + output_partition_sizes, + input_size_per_partition, + self.weight_block_size, + weight_loader, + ) + set_weight_attrs(scale, {"scale_type": "weight_scale"}) + # The weight_scale_inv name is intentional for deepseekv3 + layer.register_parameter("weight_scale_inv", scale) + + # INPUT ACTIVATION SCALE + if self.act_q_static: + scale = create_fp8_input_scale(output_partition_sizes, weight_loader) + set_weight_attrs(scale, {"scale_type": "input_scale"}) + layer.register_parameter("input_scale", scale) + else: + layer.register_parameter("input_scale", None) + + def process_weights_after_loading(self, layer: Module) -> None: + size_k_first = True + input_scale = None + # TODO(rob): refactor block quant into separate class. + if self.block_quant: + assert not self.act_q_static + size_k_first = False + + weight, weight_scale = process_fp8_weight_block_strategy( + layer.weight, layer.weight_scale_inv + ) + # Delete the weight_scale_inv parameter to avoid confusion + # with the weight_scale parameter + del layer.weight_scale_inv + + # If checkpoint not serialized fp8, quantize the weights. 
+ elif not self.quant_config.is_checkpoint_fp8_serialized: + qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None) + weight = qweight.t() + + # If checkpoint is fp8 per-tensor, handle that there are N scales for N + # shards in a fused module + else: + weight = layer.weight + weight_scale = layer.weight_scale + + # If using w8a8, torch._scaled_mm needs per tensor, so + # requantize the logical shards as a single weight. + if not self.use_marlin: + weight, weight_scale, input_scale = process_fp8_weight_tensor_strategy( + weight, + weight_scale, + layer.logical_widths, + getattr(layer, "input_scale", None), + ) + if self.act_q_static: + assert input_scale is not None + input_scale = input_scale.max() + weight = weight.t() + + # Update layer with new values. + layer.weight = Parameter(weight.data, requires_grad=False) + layer.weight_scale = Parameter(weight_scale.data, requires_grad=False) + layer.input_scale = ( + Parameter(input_scale, requires_grad=False) + if input_scale is not None + else None + ) + + if self.use_marlin: + prepare_fp8_layer_for_marlin(layer, size_k_first) + # Activations not quantized for marlin. + del layer.input_scale + return + + if self.block_quant: + maybe_post_process_fp8_weight_block(layer) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + # if batch invariant mode is enabled, prefer DeepGEMM FP8 path + # we will use BF16 dequant when DeepGEMM is not supported. 
+ if vllm_is_batch_invariant(): + if self.block_quant: + assert self.weight_block_size is not None + return self.w8a8_block_fp8_linear.apply( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=layer.input_scale, + bias=bias, + ) + else: + # per-tensor/channel: dequant to BF16 and run GEMM + weight_fp8 = layer.weight.to(torch.bfloat16) + weight_scale = layer.weight_scale.to(torch.bfloat16) + if weight_scale.numel() == 1: + # Per-tensor: simple scalar multiplication + weight_bf16 = weight_fp8 * weight_scale + else: + # Multiple scales (fused modules like QKV) + # Try to infer correct broadcasting + # weight is [K, N], scale could be [num_logical_weights] + # Need to figure out how to broadcast - for now just try + # direct multiplication + if ( + weight_scale.dim() == 1 + and weight_scale.shape[0] == weight_fp8.shape[0] + ): + # Per-row scaling + weight_bf16 = weight_fp8 * weight_scale.unsqueeze(1) + else: + # Fallback + weight_bf16 = weight_fp8 * weight_scale + return torch.nn.functional.linear(x, weight_bf16.t(), bias) + + if self.use_marlin: + return apply_fp8_marlin_linear( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + workspace=layer.workspace, + size_n=layer.output_size_per_partition, + size_k=layer.input_size_per_partition, + bias=bias, + ) + + if self.block_quant: + assert self.weight_block_size is not None + + return self.w8a8_block_fp8_linear.apply( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=layer.input_scale, + bias=bias, + ) + + return self.fp8_linear.apply( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + out_dtype=self.out_dtype, + input_scale=layer.input_scale, + bias=bias, + ) + + +class Fp8MoEMethod(FusedMoEMethodBase): + """MoE method for FP8. + Supports loading FP8 checkpoints with static weight scale and + dynamic/static activation scale. 
+ + Also supports loading quantized FP16/BF16 model checkpoints with dynamic + activation scaling. The weight scaling factor will be initialized after + the model weights are loaded. + + Args: + quant_config: The quantization config. + """ + + def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module): + super().__init__(layer.moe_config) + self.layer = layer + self.quant_config = quant_config + self.weight_block_size = self.quant_config.weight_block_size + self.block_quant: bool = self.weight_block_size is not None + self.fp8_backend = get_fp8_moe_backend(self.block_quant) + + self.use_marlin = self.fp8_backend == Fp8MoeBackend.MARLIN + self.flashinfer_moe_backend: FlashinferMoeBackend | None = None + if self.fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM: + self.flashinfer_moe_backend = FlashinferMoeBackend.TENSORRT_LLM + elif self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS: + self.flashinfer_moe_backend = FlashinferMoeBackend.CUTLASS + if self.block_quant: + assert self.weight_block_size == [128, 128], ( + f"Only support weight_block_size == [128, 128], " + f"got {self.weight_block_size}" + ) + self.flashinfer_moe_fn = partial( + flashinfer_cutlass_moe_fp8, + moe=self.moe, + use_deepseek_fp8_block_scale=self.block_quant, + ) + + self.allow_deep_gemm = self.fp8_backend == Fp8MoeBackend.DEEPGEMM + self.allow_cutlass_block_scaled_grouped_gemm = ( + self.fp8_backend == Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM + ) + + def create_weights( + self, + layer: Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + layer.intermediate_size_per_partition = intermediate_size_per_partition + layer.hidden_size = hidden_size + layer.num_experts = num_experts + layer.orig_dtype = params_dtype + layer.weight_block_size = None + + if self.quant_config.is_checkpoint_fp8_serialized: + params_dtype = torch.float8_e4m3fn + if self.block_quant: + assert self.weight_block_size 
is not None + layer.weight_block_size = self.weight_block_size + tp_size = get_tensor_model_parallel_world_size() + block_n, block_k = ( + self.weight_block_size[0], + self.weight_block_size[1], + ) + # NOTE: To ensure proper alignment of the block-wise quantization + # scales, the output_size of the weights for both the gate and up + # layers must be divisible by block_n. + # Required by column parallel or enabling merged weights + if intermediate_size_per_partition % block_n != 0: + raise ValueError( + f"The output_size of gate's and up's weight = " + f"{intermediate_size_per_partition} is not divisible by " + f"weight quantization block_n = {block_n}." + ) + if tp_size > 1 and intermediate_size_per_partition % block_k != 0: + # Required by row parallel + raise ValueError( + f"The input_size of down's weight = " + f"{intermediate_size_per_partition} is not divisible by " + f"weight quantization block_k = {block_k}." + ) + + # WEIGHTS + w13_weight = torch.nn.Parameter( + torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + w2_weight = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + # WEIGHT_SCALES + if not self.block_quant: + # Allocate 2 scales for w1 and w3 respectively. + # They will be combined to a single scale after weight loading. 
+ w13_weight_scale = torch.nn.Parameter( + torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False + ) + w2_weight_scale = torch.nn.Parameter( + torch.ones(num_experts, dtype=torch.float32), requires_grad=False + ) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + else: + w13_weight_scale = torch.nn.Parameter( + torch.ones( + num_experts, + 2 * ((intermediate_size_per_partition + block_n - 1) // block_n), + (hidden_size + block_k - 1) // block_k, + dtype=torch.float32, + ), + requires_grad=False, + ) + w2_weight_scale = torch.nn.Parameter( + torch.ones( + num_experts, + (hidden_size + block_n - 1) // block_n, + (intermediate_size_per_partition + block_k - 1) // block_k, + dtype=torch.float32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_scale_inv", w13_weight_scale) + layer.register_parameter("w2_weight_scale_inv", w2_weight_scale) + assert self.quant_config.activation_scheme == "dynamic" + + # Add the quantization method used (per tensor/grouped/channel) + # to ensure the weight scales are loaded in properly + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value} + if self.block_quant + else {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value} + ) + # If loading fp8 checkpoint, pass the weight loaders. + # If loading an fp16 checkpoint, do not (we will quantize in + # process_weights_after_loading() + if self.quant_config.is_checkpoint_fp8_serialized: + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + # INPUT_SCALES + if self.quant_config.activation_scheme == "static": + if not self.quant_config.is_checkpoint_fp8_serialized: + raise ValueError( + "Found static activation scheme for checkpoint that " + "was not serialized fp8." 
+ ) + + w13_input_scale = torch.nn.Parameter( + torch.ones(num_experts, dtype=torch.float32), requires_grad=False + ) + layer.register_parameter("w13_input_scale", w13_input_scale) + set_weight_attrs(w13_input_scale, extra_weight_attrs) + + w2_input_scale = torch.nn.Parameter( + torch.ones(num_experts, dtype=torch.float32), requires_grad=False + ) + layer.register_parameter("w2_input_scale", w2_input_scale) + set_weight_attrs(w2_input_scale, extra_weight_attrs) + + else: + layer.w13_input_scale = None + layer.w2_input_scale = None + + self.rocm_aiter_moe_enabled = False + + def process_weights_after_loading(self, layer: Module) -> None: + # Lazy import to avoid importing triton too early. + + self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() + + # TODO (rob): refactor block quant into separate class. + if self.block_quant: + assert self.quant_config.activation_scheme == "dynamic" + if current_platform.is_fp8_fnuz(): + w13_weight, w13_weight_scale_inv, w13_input_scale = ( + normalize_e4m3fn_to_e4m3fnuz( + layer.w13_weight, + layer.w13_weight_scale_inv, + layer.w13_input_scale, + ) + ) + w2_weight, w2_weight_scale_inv, w2_input_scale = ( + normalize_e4m3fn_to_e4m3fnuz( + layer.w2_weight, layer.w2_weight_scale_inv, layer.w2_input_scale + ) + ) + elif self.flashinfer_moe_backend is not None: + # NOTE: weights have to be swapped since the activation is + # applied on different half for flashinfer vs vllm + w13_weight = swap_w13_to_w31(layer.w13_weight.data) + w13_weight_scale_inv = swap_w13_to_w31(layer.w13_weight_scale_inv.data) + w2_weight = layer.w2_weight.data + w2_weight_scale_inv = layer.w2_weight_scale_inv.data + else: + w13_weight = layer.w13_weight.data + w13_weight_scale_inv = layer.w13_weight_scale_inv.data + w2_weight = layer.w2_weight + w2_weight_scale_inv = layer.w2_weight_scale_inv + + # torch.compile() cannot use Parameter subclasses. 
+ layer.w13_weight = Parameter(w13_weight, requires_grad=False) + layer.w13_weight_scale_inv = Parameter( + w13_weight_scale_inv, requires_grad=False + ) + layer.w2_weight = Parameter(w2_weight, requires_grad=False) + layer.w2_weight_scale_inv = Parameter( + w2_weight_scale_inv, requires_grad=False + ) + if self.rocm_aiter_moe_enabled: + # reshaping weights is required for aiter moe kernel. + shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( + layer.w13_weight.data, layer.w2_weight.data + ) + + layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) + layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False) + + # DeepGemm scales need to be transposed and aligned. We try to do + # it ahead of time for performance reasons. + if self.allow_deep_gemm: + dg_w13_weight, dg_w13_weight_scale_inv = ( + deepgemm_post_process_fp8_weight_block( + wq=layer.w13_weight.data, + ws=layer.w13_weight_scale_inv.data, + quant_block_shape=tuple(layer.weight_block_size), + use_e8m0=is_deep_gemm_e8m0_used(), + ) + ) + dg_w2_weight, dg_w2_weight_scale_inv = ( + deepgemm_post_process_fp8_weight_block( + wq=layer.w2_weight.data, + ws=layer.w2_weight_scale_inv.data, + quant_block_shape=tuple(layer.weight_block_size), + use_e8m0=is_deep_gemm_e8m0_used(), + ) + ) + layer.w13_weight = Parameter(dg_w13_weight, requires_grad=False) + layer.w13_weight_scale_inv = Parameter( + dg_w13_weight_scale_inv, requires_grad=False + ) + layer.w2_weight = Parameter(dg_w2_weight, requires_grad=False) + layer.w2_weight_scale_inv = Parameter( + dg_w2_weight_scale_inv, requires_grad=False + ) + + # If checkpoint is fp16, quantize in place. 
+ elif not self.quant_config.is_checkpoint_fp8_serialized: + fp8_dtype = current_platform.fp8_dtype() + w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype) + w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype) + + # Re-initialize w13_scale because we directly quantize + # merged w13 weights and generate a single scaling factor. + layer.w13_weight_scale = torch.nn.Parameter( + torch.ones( + layer.local_num_experts, + dtype=torch.float32, + device=w13_weight.device, + ), + requires_grad=False, + ) + for expert in range(layer.local_num_experts): + w13_weight[expert, :, :], layer.w13_weight_scale[expert] = ( + ops.scaled_fp8_quant(layer.w13_weight.data[expert, :, :]) + ) + w2_weight[expert, :, :], layer.w2_weight_scale[expert] = ( + ops.scaled_fp8_quant(layer.w2_weight.data[expert, :, :]) + ) + layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False) + layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) + if self.rocm_aiter_moe_enabled: + # reshaping weights is required for aiter moe kernel. + shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( + layer.w13_weight, layer.w2_weight + ) + + layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) + layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False) + # If checkpoint is fp8, we need to handle that the + # MoE kernels require single activation scale and single weight + # scale for w13 per expert. + else: + # Fp8 moe kernels require a single activation scale. + # We take the max of all the scales in case they differ. + if self.quant_config.activation_scheme == "static": + if layer.w13_input_scale is None or layer.w2_input_scale is None: + raise ValueError( + "QuantConfig has static quantization, but found " + "activation scales are None." 
+ ) + if not all_close_1d(layer.w13_input_scale) or not all_close_1d( + layer.w2_input_scale + ): + logger.warning_once( + "Found input_scales that are not equal for " + "fp8 MoE layer. Using the maximum across experts " + "for each layer." + ) + layer.w13_input_scale = torch.nn.Parameter( + layer.w13_input_scale.max(), requires_grad=False + ) + layer.w2_input_scale = torch.nn.Parameter( + layer.w2_input_scale.max(), requires_grad=False + ) + if current_platform.is_fp8_fnuz(): + # Normalize the weights and scales + w13_weight, w13_weight_scale, w13_input_scale = ( + normalize_e4m3fn_to_e4m3fnuz( + layer.w13_weight, layer.w13_weight_scale, layer.w13_input_scale + ) + ) + w2_weight, w2_weight_scale, w2_input_scale = ( + normalize_e4m3fn_to_e4m3fnuz( + layer.w2_weight, layer.w2_weight_scale, layer.w2_input_scale + ) + ) + # Reset the parameter + layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False) + layer.w13_weight_scale = torch.nn.Parameter( + w13_weight_scale, requires_grad=False + ) + if w13_input_scale is not None: + layer.w13_input_scale = torch.nn.Parameter( + w13_input_scale, requires_grad=False + ) + layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) + layer.w2_weight_scale = torch.nn.Parameter( + w2_weight_scale, requires_grad=False + ) + if w2_input_scale is not None: + layer.w2_input_scale = torch.nn.Parameter( + w2_input_scale, requires_grad=False + ) + + # Fp8 moe kernel needs single weight scale for w13 per expert. + # We take the max then dequant and requant each expert. 
+ assert layer.w13_weight_scale is not None + shard_size = layer.intermediate_size_per_partition + max_w13_scales = layer.w13_weight_scale.max(dim=1).values + for expert_id in range(layer.local_num_experts): + start = 0 + for shard_id in range(2): + dq_weight = per_tensor_dequantize( + layer.w13_weight[expert_id][start : start + shard_size, :], + layer.w13_weight_scale[expert_id][shard_id], + ) + layer.w13_weight[expert_id][start : start + shard_size, :], _ = ( + ops.scaled_fp8_quant(dq_weight, max_w13_scales[expert_id]) + ) + start += shard_size + + if self.rocm_aiter_moe_enabled: + shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( + layer.w13_weight, layer.w2_weight + ) + + layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) + layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False) + + layer.w13_weight_scale = torch.nn.Parameter( + max_w13_scales, requires_grad=False + ) + + if self.flashinfer_moe_backend is not None: + # NOTE: weights have to be swapped since the activation is + # applied on different half for flashinfer vs vllm + assert not self.block_quant + register_moe_scaling_factors(layer) + w13_weight = swap_w13_to_w31(layer.w13_weight.data) + if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: + rotate_flashinfer_fp8_moe_weights(w13_weight, w2_weight) + layer.w13_weight.data = w13_weight.data + + if self.use_marlin: + prepare_moe_fp8_layer_for_marlin(layer, False) + # Activations not quantized for marlin. 
+ del layer.w13_input_scale + del layer.w2_input_scale + + def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None: + if ( + self.rocm_aiter_moe_enabled + or self.use_marlin + or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM + ): + return None + elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: + if self.block_quant: + assert self.weight_block_size == [128, 128], ( + f"Only support weight_block_size == [128, 128], " + f"got {self.weight_block_size}" + ) + # Wire block-scale flag through prepare/finalize when using CUTLASS + prepare_finalize = build_flashinfer_fp8_cutlass_moe_prepare_finalize( + self.moe, + use_deepseek_fp8_block_scale=self.block_quant, + ) + logger.debug_once("%s", prepare_finalize.__class__.__name__) + return prepare_finalize + else: + return super().maybe_make_prepare_finalize() + + def select_gemm_impl( + self, + prepare_finalize: FusedMoEPrepareAndFinalize, + layer: torch.nn.Module, + ) -> FusedMoEPermuteExpertsUnpermute: + from vllm.model_executor.layers.fused_moe import ( + BatchedDeepGemmExperts, + BatchedTritonExperts, + TritonOrDeepGemmExperts, + ) + + assert not self.use_marlin and not self.rocm_aiter_moe_enabled, ( + "Marlin and ROCm AITER are not supported with all2all yet." 
+ ) + + assert self.moe_quant_config is not None + + if ( + prepare_finalize.activation_format + == FusedMoEActivationFormat.BatchedExperts + ): + max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank() + assert max_num_tokens_per_rank is not None + + experts_impl = ( + BatchedDeepGemmExperts if self.allow_deep_gemm else BatchedTritonExperts + ) + logger.debug( + "%s(%s): max_tokens_per_rank=%s, block_size=%s, per_act_token=%s", + experts_impl.__name__, + self.__class__.__name__, + max_num_tokens_per_rank, + self.weight_block_size, + False, + ) + return experts_impl( + max_num_tokens=max_num_tokens_per_rank, + num_dispatchers=prepare_finalize.num_dispatchers(), + quant_config=self.moe_quant_config, + ) + + elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: + # Select GEMM experts with block-scale when weights are block-quantized + experts = select_cutlass_fp8_gemm_impl( + self.moe, + self.moe_quant_config, + use_deepseek_fp8_block_scale=self.block_quant, + ) + logger.debug_once("Using %s", experts.__class__.__name__) + return experts + else: + logger.debug( + "TritonOrDeepGemmExperts(%s): block_size=%s, per_act_token=%s", + self.__class__.__name__, + self.weight_block_size, + False, + ) + return TritonOrDeepGemmExperts( + quant_config=self.moe_quant_config, + allow_deep_gemm=self.allow_deep_gemm, + ) + + def get_fused_moe_quant_config( + self, layer: torch.nn.Module + ) -> FusedMoEQuantConfig | None: + if self.use_marlin: + return None + + return fp8_w8a8_moe_quant_config( + w1_scale=( + layer.w13_weight_scale_inv + if self.block_quant + else layer.w13_weight_scale + ), + w2_scale=( + layer.w2_weight_scale_inv if self.block_quant else layer.w2_weight_scale + ), + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + block_shape=self.weight_block_size, + ) + + @property + def supports_eplb(self) -> bool: + return True + + @property + def allow_inplace(self) -> bool: + return True + + def apply( + self, + layer: 
torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if enable_eplb: + assert expert_load_view is not None + assert logical_to_physical_map is not None + assert logical_replica_count is not None + assert isinstance(layer, FusedMoE) + + if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: + assert activation == "silu", ( + f"Expected 'silu' activation but got {activation}" + ) + + if self.block_quant: + import vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe # noqa: E501, F401 + + e_score_correction_bias = ( + e_score_correction_bias.to(x.dtype) + if e_score_correction_bias is not None + else None + ) + routing_method_type = layer.routing_method_type + return torch.ops.vllm.flashinfer_fused_moe_blockscale_fp8( + routing_logits=router_logits.to(torch.float32) + if routing_method_type == RoutingMethodType.DeepSeekV3 + else router_logits, + routing_bias=e_score_correction_bias, + x=x, + w13_weight=layer.w13_weight, + w13_weight_scale_inv=layer.w13_weight_scale_inv, + w2_weight=layer.w2_weight, + w2_weight_scale_inv=layer.w2_weight_scale_inv, + global_num_experts=global_num_experts, + top_k=top_k, + num_expert_group=num_expert_group, + topk_group=topk_group, + intermediate_size=layer.intermediate_size_per_partition, + 
expert_offset=layer.ep_rank * layer.local_num_experts, + local_num_experts=layer.local_num_experts, + block_shape=self.weight_block_size, + routing_method_type=routing_method_type, + routed_scaling=routed_scaling_factor, + ) + else: + assert not renormalize and custom_routing_function is not None + result = apply_flashinfer_per_tensor_scale_fp8( + layer=layer, + hidden_states=x, + router_logits=router_logits, + routing_bias=e_score_correction_bias, + global_num_experts=global_num_experts, + top_k=top_k, + num_expert_group=num_expert_group, + topk_group=topk_group, + apply_router_weight_on_input=apply_router_weight_on_input, + ) + + zero_expert_num = getattr(layer, "zero_expert_num", 0) + zero_expert_type = getattr(layer, "zero_expert_type", None) + + select_result = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, + enable_eplb=enable_eplb, + expert_map=expert_map, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + global_num_experts=global_num_experts, + zero_expert_num=zero_expert_num, + zero_expert_type=zero_expert_type, + num_fused_shared_experts=layer.num_fused_shared_experts, + ) + + topk_weights, topk_ids, zero_expert_result = select_result + + if self.rocm_aiter_moe_enabled: + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501 + rocm_aiter_fused_experts, + ) + + result = rocm_aiter_fused_experts( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + activation=activation, + 
apply_router_weight_on_input=apply_router_weight_on_input, + expert_map=expert_map, + quant_config=self.moe_quant_config, + ) + elif self.use_marlin: + assert activation == "silu", f"{activation} not supported for Marlin MoE." + result = fused_marlin_moe( + x, + layer.w13_weight, + layer.w2_weight, + None, + None, + layer.w13_weight_scale, + layer.w2_weight_scale, + router_logits, + topk_weights, + topk_ids, + quant_type_id=scalar_types.float8_e4m3fn.id, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=expert_map, + workspace=layer.workspace, + ) + elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: + assert activation == "silu", ( + f"Expected 'silu' activation but got {activation}" + ) + if not self.block_quant: + assert not renormalize and custom_routing_function is not None + assert scoring_func == "sigmoid", ( + f"Expected 'sigmoid' scoring func but got {scoring_func}" + ) + # Delegate to CUTLASS FlashInfer path; function already bound with + # use_deepseek_fp8_block_scale for block-quant when applicable + result = self.flashinfer_moe_fn( + x, + layer, + topk_weights, + topk_ids, + inplace=False, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input, + ) + else: + from vllm.model_executor.layers.fused_moe import fused_experts + + result = fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + global_num_experts=global_num_experts, + apply_router_weight_on_input=apply_router_weight_on_input, + expert_map=expert_map, + quant_config=self.moe_quant_config, + allow_deep_gemm=self.allow_deep_gemm, + allow_cutlass_block_scaled_grouped_gemm=( + self.allow_cutlass_block_scaled_grouped_gemm + ), + ) + if zero_expert_num != 0 and zero_expert_type is not None: + assert not 
isinstance(result, tuple), ( + "Shared + zero experts are mutually exclusive not yet supported" + ) + return result, zero_expert_result + else: + return result + + +class Fp8KVCacheMethod(BaseKVCacheMethod): + """ + Supports loading kv-cache scaling factors from FP8 checkpoints. + """ + + def __init__(self, quant_config: Fp8Config): + super().__init__(quant_config) diff --git a/model_executor/layers/quantization/fp_quant.py b/model_executor/layers/quantization/fp_quant.py new file mode 100644 index 0000000..15a253c --- /dev/null +++ b/model_executor/layers/quantization/fp_quant.py @@ -0,0 +1,420 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Supports FP-Quant compression, see https://arxiv.org/abs/2509.23202 + +from typing import Any + +import torch +from torch.nn.parameter import Parameter + +from vllm._custom_ops import ( + cutlass_scaled_fp4_mm, + fusedQuantizeMx, + fusedQuantizeNv, + matmul_mxf4_bf16_tn, +) +from vllm.model_executor.layers.linear import ( + LinearBase, + LinearMethodBase, + UnquantizedLinearMethod, +) +from vllm.model_executor.layers.quantization import QuantizationMethods +from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked +from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform +from vllm.utils.torch_utils import direct_register_custom_op + + +class FPQuantConfig(QuantizationConfig): + """Config class for FPQuant.""" + + def __init__( + self, + hadamard_group_size: int = 32, + forward_dtype: str = "mxfp4", + forward_method: str = "abs_max", + pseudoquantization: bool = False, + modules_to_not_convert: list[str] | None = None, + ) -> None: + super().__init__() + self.hadamard_group_size = hadamard_group_size + self.forward_dtype = forward_dtype + self.forward_method = forward_method + self.pseudoquantization = 
class FPQuantLinearMethod(LinearMethodBase):
    """Linear method for FPQuant.

    Args:
        quant_config: The FPQuant quantization config.
    """

    # Number of weight elements that share one scale, per forward dtype.
    _SCALE_GROUP_SIZES = {"mxfp4": 32, "nvfp4": 16}

    def __init__(self, quant_config: FPQuantConfig):
        self.quant_config = quant_config

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: list[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        """Register the FPQuant parameters on ``layer``.

        Registers, in order: ``qweight`` (uint8, two values packed per byte
        along the input dim), ``scales`` (uint8, one per scale group along the
        input dim), ``weight_global_scale`` and ``act_global_scale`` (float32
        scalars), and the ``forward``/``backward`` Hadamard matrices (square,
        ``hadamard_group_size`` wide, in ``params_dtype``).

        Raises:
            ValueError: if ``params_dtype`` is not bfloat16, if the partition
                input size is not a multiple of the Hadamard group size, or if
                ``forward_dtype`` is neither ``mxfp4`` nor ``nvfp4``.
        """
        del output_size  # Unused.
        del input_size  # Unused.

        if params_dtype != torch.bfloat16:
            raise ValueError("Only bfloat16 is currently supported by FPQuant")

        hadamard_size = self.quant_config.hadamard_group_size
        if input_size_per_partition % hadamard_size != 0:
            raise ValueError(
                f"The input size per partition ({input_size_per_partition}) "
                f"is not a multiple of the Hadamard group size "
                f"({hadamard_size}). This can be caused by too large "
                "tensor parallel size."
            )

        # Explicit raise instead of `assert`: asserts are stripped under -O,
        # which previously left this validation to an otherwise-unreachable
        # `else` branch.
        forward_dtype = self.quant_config.forward_dtype
        group_size = self._SCALE_GROUP_SIZES.get(forward_dtype)
        if group_size is None:
            raise ValueError(
                f"Unsupported forward_dtype: {forward_dtype}. "
                "Only mxfp4 and nvfp4 are supported for now"
            )

        out_features = sum(output_partition_sizes)

        qweight = Parameter(
            torch.empty(
                out_features,
                input_size_per_partition // 2,
                dtype=torch.uint8,
            ),
            requires_grad=False,
        )
        set_weight_attrs(
            qweight,
            {
                "input_dim": 1,
                "output_dim": 0,
                "packed_dim": 1,
                "pack_factor": 2,
            }
            | extra_weight_attrs,
        )
        layer.register_parameter("qweight", qweight)

        scales = Parameter(
            torch.empty(
                out_features,
                input_size_per_partition // group_size,
                dtype=torch.uint8,
            ),
            requires_grad=False,
        )
        set_weight_attrs(
            scales,
            {
                "input_dim": 1,
                "output_dim": 0,
                "packed_dim": 1,
                "pack_factor": group_size,
            }
            | extra_weight_attrs,
        )
        layer.register_parameter("scales", scales)

        # Scalar global scales for weights and activations.
        for name in ("weight_global_scale", "act_global_scale"):
            global_scale = Parameter(
                torch.empty(1, dtype=torch.float32),
                requires_grad=False,
            )
            set_weight_attrs(
                global_scale, {"ignore_warning": True} | extra_weight_attrs
            )
            layer.register_parameter(name, global_scale)

        # Square Hadamard matrices loaded from the checkpoint.
        for name in ("forward_hadamard_matrix", "backward_hadamard_matrix"):
            hadamard = Parameter(
                torch.empty(hadamard_size, hadamard_size, dtype=params_dtype),
                requires_grad=False,
            )
            set_weight_attrs(
                hadamard, {"ignore_warning": True} | extra_weight_attrs
            )
            layer.register_parameter(name, hadamard)

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Compute the quantized linear output for ``x`` via ``quantized_forward``."""
        return quantized_forward(
            x,
            layer.qweight,
            layer.scales,
            layer.weight_global_scale,
            layer.act_global_scale,
            bias,
            layer.forward_hadamard_matrix,
            self.quant_config.forward_method,
            self.quant_config.forward_dtype,
        )
2, dtype=torch.uint8, device=x_flat.device + ) + xh_e8m0 = torch.empty( + padded_rows, padded_cols, dtype=torch.float8_e8m0fnu, device=x_flat.device + ) + + return xh_e2m1, xh_e8m0 + + +direct_register_custom_op( + op_name="fused_quantize_mx", + op_func=fused_quantize_mx, + mutates_args=[], + fake_impl=fused_quantize_mx_fake, + dispatch_key=current_platform.dispatch_key, +) + + +def matmul_mxf4_bf16( + x: torch.Tensor, + w: torch.Tensor, + xs: torch.Tensor, + ws: torch.Tensor, + alpha: torch.Tensor, +) -> torch.Tensor: + return matmul_mxf4_bf16_tn( + x, + w, + to_blocked(xs, backend="triton").view(torch.float8_e8m0fnu), + to_blocked(ws, backend="triton").view(torch.float8_e8m0fnu), + alpha, + ) + + +def matmul_mxf4_bf16_fake(x, w, xs, ws, alpha): + return torch.empty(*x.shape[:-1], w.shape[0], dtype=torch.bfloat16, device=x.device) + + +direct_register_custom_op( + op_name="matmul_mxf4_bf16", + op_func=matmul_mxf4_bf16, + mutates_args=[], + fake_impl=matmul_mxf4_bf16_fake, + dispatch_key=current_platform.dispatch_key, +) + + +def fused_quantize_nv( + x_flat: torch.Tensor, hadamard_matrix: torch.Tensor, global_scale: torch.Tensor +) -> tuple[torch.Tensor, torch.Tensor]: + return fusedQuantizeNv(x_flat, hadamard_matrix, global_scale) + + +def fused_quantize_nv_fake(x_flat, hadamard_matrix, global_scale): + rows, cols = x_flat.size(0), x_flat.size(1) // 16 + padded_rows = ((rows + 128 - 1) // 128) * 128 + padded_cols = ((cols + 4 - 1) // 4) * 4 + + xh_e2m1 = torch.empty( + x_flat.size(0), x_flat.size(1) // 2, dtype=torch.uint8, device=x_flat.device + ) + xh_e8m0 = torch.empty( + padded_rows, padded_cols, dtype=torch.float8_e4m3fn, device=x_flat.device + ) + + return xh_e2m1, xh_e8m0 + + +direct_register_custom_op( + op_name="fused_quantize_nv", + op_func=fused_quantize_nv, + mutates_args=[], + fake_impl=fused_quantize_nv_fake, + dispatch_key=current_platform.dispatch_key, +) + + +def matmul_nvf4_bf16( + x: torch.Tensor, + w: torch.Tensor, + xs: torch.Tensor, + ws: 
torch.Tensor, + alpha: torch.Tensor, +) -> torch.Tensor: + return cutlass_scaled_fp4_mm( + x, + w, + to_blocked(xs, backend="triton") + .view(torch.float8_e4m3fn) + .view(-1, x.shape[1] // 8), # *2//16 + to_blocked(ws, backend="triton") + .view(torch.float8_e4m3fn) + .view(-1, x.shape[1] // 8), + alpha, + torch.bfloat16, + ) + + +def matmul_nvf4_bf16_fake(x, w, xs, ws, alpha): + return torch.empty(*x.shape[:-1], w.shape[0], dtype=torch.bfloat16, device=x.device) + + +direct_register_custom_op( + op_name="matmul_nvf4_bf16", + op_func=matmul_nvf4_bf16, + mutates_args=[], + fake_impl=matmul_nvf4_bf16_fake, + dispatch_key=current_platform.dispatch_key, +) + + +def quantized_forward( + x: torch.Tensor, + qweight: torch.Tensor, + weight_scales: torch.Tensor, + weight_global_scale: torch.Tensor, + act_global_scale: torch.Tensor, + bias: torch.Tensor | None, + forward_hadamard_matrix: torch.Tensor, + forward_method: str, + forward_dtype: str, +) -> torch.Tensor: + x_flat = x.contiguous().flatten(end_dim=-2) + + if forward_dtype == "mxfp4": + x_flat_q, x_flat_scales = torch.ops.vllm.fused_quantize_mx( + x_flat, forward_hadamard_matrix, forward_method + ) + y = torch.ops.vllm.matmul_mxf4_bf16( + x_flat_q, + qweight, + x_flat_scales, + weight_scales, + 1 / (weight_global_scale * act_global_scale), + ) + elif forward_dtype == "nvfp4": + x_flat_q, x_flat_scales = torch.ops.vllm.fused_quantize_nv( + x_flat, forward_hadamard_matrix, act_global_scale + ) + y = torch.ops.vllm.matmul_nvf4_bf16( + x_flat_q, + qweight, + x_flat_scales, + weight_scales, + 1 / (weight_global_scale * act_global_scale), + ) + else: + raise ValueError(f"Unsupported forward_dtype: {forward_dtype}") + + y = y.view(*x.shape[:-1], y.shape[-1]) + if bias is not None: + y += bias + + return y diff --git a/model_executor/layers/quantization/gguf.py b/model_executor/layers/quantization/gguf.py new file mode 100644 index 0000000..78625a4 --- /dev/null +++ b/model_executor/layers/quantization/gguf.py @@ -0,0 +1,651 
@@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable +from typing import Any, Optional + +import gguf +import torch +import torch.nn.functional as F +from gguf import GGMLQuantizationType as WeightType +from torch.nn.parameter import Parameter, UninitializedParameter + +from vllm import _custom_ops as ops +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEConfig, + FusedMoEQuantConfig, +) +from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase +from vllm.model_executor.layers.linear import ( + LinearBase, + LinearMethodBase, + UnquantizedLinearMethod, +) +from vllm.model_executor.layers.quantization import QuantizationMethods +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, + QuantizeMethodBase, +) +from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding +from vllm.model_executor.utils import set_weight_attrs +from vllm.utils.torch_utils import direct_register_custom_op + +logger = init_logger(__name__) + + +class GGUFConfig(QuantizationConfig): + """Config class for GGUF.""" + + def __init__(self, unquantized_modules: list[str] | None = None) -> None: + super().__init__() + self.unquantized_modules = unquantized_modules or [] + + def __repr__(self) -> str: + return "GGUFConfig()" + + def get_name(self) -> QuantizationMethods: + return "gguf" + + def get_supported_act_dtypes(self) -> list[torch.dtype]: + return [torch.half, torch.bfloat16, torch.float32] + + @classmethod + def get_min_capability(cls) -> int: + return 60 + + @classmethod + def get_config_filenames(cls) -> list[str]: + return [] # no extra configs. 
def is_layer_skipped_gguf(prefix: str, unquantized_modules: list[str]) -> bool:
    """Return True when any entry of ``unquantized_modules`` is a substring of ``prefix``."""
    for candidate in unquantized_modules:
        if candidate in prefix:
            return True
    return False
def _fused_mul_mat_gguf(
    x: torch.Tensor, qweight: torch.Tensor, qweight_type: int
) -> torch.Tensor:
    """Multiply ``x`` by the transpose of a GGUF weight.

    Kernel selection, in order: plain matmul for unquantized types, the MMVQ
    kernel for small batches, the MMQ kernel where one exists, and finally
    dequantize-then-matmul.

    Raises:
        NotImplementedError: if ``qweight_type`` is a valid GGML type that no
            path above supports.
    """
    num_tokens = x.shape[0]
    out_features = qweight.shape[0]

    # HACK: when doing chunked prefill we don't generate output tokens
    # so input to logits generator is empty which causes invalid parameter
    if num_tokens == 0:
        return torch.empty(num_tokens, out_features, dtype=x.dtype, device=x.device)

    # there is no need to call any kernel for fp16/bf16
    if qweight_type in UNQUANTIZED_TYPES:
        return x @ qweight.T

    # Largest batch size for which the MMVQ kernel is used; the threshold is
    # lower for weights with more than 5120 output rows.
    is_wide = out_features > 5120
    if qweight_type in IMATRIX_QUANT_TYPES:
        mmvq_limit = 8 if is_wide else 16
    else:
        mmvq_limit = 2 if is_wide else 6

    # enable MMVQ in contiguous batching with batch_size=1
    if num_tokens <= mmvq_limit and qweight_type in MMVQ_QUANT_TYPES:
        return ops.ggml_mul_mat_vec_a8(qweight, x, qweight_type, out_features)

    # Use MMQ Kernel if it's available (standard + k-quants)
    if qweight_type in MMQ_QUANT_TYPES:
        return ops.ggml_mul_mat_a8(qweight, x, qweight_type, out_features)

    # If there is no available MMQ kernel, fallback to dequantize
    if qweight_type in DEQUANT_TYPES:
        block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type]
        dense_shape = (out_features, qweight.shape[1] // type_size * block_size)
        dense_weight = ops.ggml_dequantize(
            qweight, qweight_type, *dense_shape, x.dtype
        )
        return x @ dense_weight.T

    # Raise an error if the quantization type is not supported.
    # Might be useful if llama.cpp adds a new quantization type.
    # Wrap to GGMLQuantizationType IntEnum to make sure it's a valid type.
    qweight_type = WeightType(qweight_type)
    raise NotImplementedError(f"Unsupported GGUF quantization type: {qweight_type}")
+ expert_ids, + num_tokens_post_padded, + qweight_type2, + w2.shape[1], + 1, + num_tokens * top_k, + ) + out = out.reshape(num_tokens, top_k, w2.shape[1]).mul_( + topk_weights.view(num_tokens, top_k, 1) + ) + ops.moe_sum(out, out_hidden_states) + elif qweight_type2 in MMVQ_QUANT_TYPES and qweight_type in MMVQ_QUANT_TYPES: + num_tokens, _ = x.shape + E, N, _ = w1.shape + top_k = topk_ids.shape[1] + + out = ops.ggml_moe_a8_vec(x, w1, topk_ids, top_k, qweight_type, N, num_tokens) + out = act(out) + + out = ops.ggml_moe_a8_vec( + out, w2, topk_ids, 1, qweight_type2, w2.shape[1], num_tokens * top_k + ) + out = out.reshape(num_tokens, top_k, w2.shape[1]).mul_( + topk_weights.view(num_tokens, top_k, 1) + ) + ops.moe_sum(out, out_hidden_states) + else: + logger.warning_once( + "There is no support for fast MoE kernel " + "for current quantization method. " + "Falling back to slow implementation. " + ) + for tok, (w, idx) in enumerate(zip(topk_weights, topk_ids)): + inp = x[tok].reshape((1,) + x.shape[1:]) + current_hidden_state = None + for ww, ii in zip(w, idx): + expert_up = w1[ii] + + out = fused_mul_mat_gguf(inp, expert_up, qweight_type) + out = act(out) + + expert_down = w2[ii] + current_state = fused_mul_mat_gguf( + out, expert_down, qweight_type2 + ).mul_(ww) + if current_hidden_state is None: + current_hidden_state = current_state + else: + current_hidden_state.add_(current_state) + out_hidden_states[tok] = current_hidden_state + return out_hidden_states + + +def _fused_moe_gguf_fake( + x: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + qweight_type: int, + qweight_type2: int, + activation: str, +) -> torch.Tensor: + return torch.empty_like(x) + + +try: + direct_register_custom_op( + op_name="_fused_moe_gguf", + op_func=_fused_moe_gguf, + fake_impl=_fused_moe_gguf_fake, + ) + fused_moe_gguf = _fused_moe_gguf + +except AttributeError as error: + raise error + + +def _apply_gguf_embedding( + x: 
torch.Tensor, + qweight: torch.Tensor, + qweight_type: int, + hidden_size: int, + dtype: torch.dtype | None = None, +) -> torch.Tensor: + if qweight_type in UNQUANTIZED_TYPES: + return torch.embedding(qweight, x) + elif qweight_type in DEQUANT_TYPES: + block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type] + x_flat = x.flatten() + assert hidden_size == qweight.shape[1] // type_size * block_size + quant = torch.index_select(qweight, dim=0, index=x_flat) + dequant = ops.ggml_dequantize( + quant, qweight_type, hidden_size, x_flat.shape[0], dtype + ) + return dequant.view(*x.shape, hidden_size) + else: + qweight_type = WeightType(qweight_type) + raise NotImplementedError(f"Unsupported GGUF quantization type: {qweight_type}") + + +def _apply_gguf_embedding_fake( + x: torch.Tensor, + qweight: torch.Tensor, + qweight_type: int, + hidden_size: int, + dtype: torch.dtype | None = None, +) -> torch.Tensor: + return torch.empty(x.shape[0], hidden_size, dtype=dtype, device=x.device) + + +try: + direct_register_custom_op( + op_name="_apply_gguf_embedding", + op_func=_apply_gguf_embedding, + fake_impl=_apply_gguf_embedding_fake, + ) + apply_gguf_embedding = _apply_gguf_embedding + +except AttributeError as error: + raise error + + +class GGUFLinearMethod(LinearMethodBase): + """Linear method for GGUF. + + Args: + quant_config: The GGUF quantization config. 
+ """ + + def __init__(self, quant_config: GGUFConfig): + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + self.params_dtype = params_dtype + output_size_per_partition = sum(output_partition_sizes) + + tensor_shape = (output_size_per_partition, input_size_per_partition) + qweight = GGUFUninitializedParameter(requires_grad=False) + set_weight_attrs( + qweight, + { + "input_dim": 1, + "output_dim": 0, + "tensor_shape": tensor_shape, + "is_gguf_weight": True, + "data_container": [], + "shard_id": [], + "shard_id_map": {}, + "params_dtype": params_dtype, + "input_size_per_partition" :input_size_per_partition, # restore shape for qkv and merge + "output_partition_sizes" :output_partition_sizes, + }, + ) + set_weight_attrs(qweight, extra_weight_attrs) + layer.register_parameter("qweight", qweight) + + qweight_type = Parameter( + torch.empty(len(output_partition_sizes), dtype=torch.uint8), + requires_grad=False, + ) + set_weight_attrs( + qweight_type, + { + "is_gguf_weight_type": True, + "weight_type": 0, + "shard_weight_type": {}, + "ignore_warning": True, + }, + ) + set_weight_attrs(qweight_type, extra_weight_attrs) + layer.register_parameter("qweight_type", qweight_type) + + def process_weights_after_loading(self, layer: torch.nn.Module): + qweight_type = layer.qweight_type.weight_type + if not (qweight_type in UNQUANTIZED_TYPES or qweight_type in DEQUANT_TYPES): + qweight_type = WeightType(qweight_type) + raise ValueError( + f"Unsupported GGUF quantization type {qweight_type} in layer {layer}." + ) + # For MergedColumnParallelLinear and QKVParallelLinear, we need to + # materialize the padded weight parameter for CUDA Graph compatibility. 
+ self._create_padded_weight_param(layer) + + def _create_padded_weight_param(self, layer: torch.nn.Module): + """Create padded weight parameter for GGUF MergedLinear layer.""" + qweight = layer.qweight + shard_id_map = qweight.shard_id_map + shard_id = qweight.shard_id + if len(data_container := qweight.data_container) > 1: + dtype = {data.dtype for data in data_container} + assert len(dtype) == 1, ValueError( + f"Data container has mixed dtypes: {dtype}" + ) + dtype = next(iter(dtype)) + # concat dim0 and pad dim1 + padded_side = max(x.size(1) for x in data_container) + concat_side = sum(x.size(0) for x in data_container) + # Pad the quantized weights to dense tensor, and create a map + # with the location of each shard in the padded tensor. + padded_data = torch.zeros( + (concat_side, padded_side), dtype=dtype, device=qweight.device + ) + # (dim0_start, dim0_end, dim1_size) + shard_offset_map = dict[str, tuple[int, int, int]]() + for idx in shard_id: + id_in_container = shard_id_map[idx] + start = sum(x.size(0) for x in data_container[:id_in_container]) + end = start + data_container[id_in_container].size(0) + size = data_container[id_in_container].size(1) + padded_data[start:end, :size] = data_container[id_in_container] + shard_offset_map[idx] = (start, end, size) + qweight.data_container.clear() + padded_param = Parameter(padded_data, requires_grad=False) + set_weight_attrs(padded_param, vars(qweight)) + set_weight_attrs(padded_param, {"shard_offset_map": shard_offset_map}) + layer.register_parameter("qweight", padded_param) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + shard_id = layer.qweight.shard_id + + if shard_id: + # dequantize shard weights respectively + shard_id = ["q", "k", "v"] if "q" in shard_id else shard_id + qweight = layer.qweight + result = [] + for idx in shard_id: + start, end, offset = layer.qweight.shard_offset_map[idx] + qweight_type = 
layer.qweight_type.shard_weight_type[idx] + result.append( + fused_mul_mat_gguf( + x, qweight[start:end, :offset].contiguous(), qweight_type + ) + ) + out = torch.cat(result, axis=1) + else: + qweight = layer.qweight + qweight_type = layer.qweight_type.weight_type + out = fused_mul_mat_gguf(x, qweight, qweight_type) + if bias is not None: + out.add_(bias) + return out + + +class GGUFMoEMethod(FusedMoEMethodBase): + """MoE method for GGUF. + + Args: + quant_config: The GGUF quantization config. + """ + + def __init__( + self, + quant_config: GGUFConfig, + moe: FusedMoEConfig, + ): + super().__init__(moe) + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + tensor_shape = (num_experts, 2 * intermediate_size_per_partition, hidden_size) + # gate up proj + w13_qweight = GGUFUninitializedParameter(requires_grad=False) + set_weight_attrs( + w13_qweight, + { + "input_dim": 1, + "output_dim": 0, + "tensor_shape": tensor_shape, + "is_gguf_weight": True, + "data_container": [], + }, + ) + set_weight_attrs(w13_qweight, extra_weight_attrs) + layer.register_parameter("w13_qweight", w13_qweight) + + w13_qweight_type = Parameter( + torch.empty(1, dtype=torch.uint8), requires_grad=False + ) + set_weight_attrs( + w13_qweight_type, + {"is_gguf_weight_type": True, "weight_type": 0, "ignore_warning": True}, + ) + set_weight_attrs(w13_qweight_type, extra_weight_attrs) + layer.register_parameter("w13_qweight_type", w13_qweight_type) + + tensor_shape = (num_experts, intermediate_size_per_partition, hidden_size) + # gate down proj + w2_qweight = GGUFUninitializedParameter(requires_grad=False) + set_weight_attrs( + w2_qweight, + { + "input_dim": 1, + "output_dim": 0, + "tensor_shape": tensor_shape, + "is_gguf_weight": True, + "data_container": [], + }, + ) + set_weight_attrs(w2_qweight, extra_weight_attrs) + 
layer.register_parameter("w2_qweight", w2_qweight) + + w2_qweight_type = Parameter( + torch.empty(1, dtype=torch.uint8), requires_grad=False + ) + set_weight_attrs( + w2_qweight_type, + {"is_gguf_weight_type": True, "weight_type": 0, "ignore_warning": True}, + ) + + set_weight_attrs(w2_qweight_type, extra_weight_attrs) + layer.register_parameter("w2_qweight_type", w2_qweight_type) + + def get_fused_moe_quant_config( + self, layer: torch.nn.Module + ) -> FusedMoEQuantConfig | None: + return None + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if enable_eplb: + raise NotImplementedError("EPLB not supported for `GGUFMoEMethod` yet.") + + assert activation == "silu", "Only SiLU activation is supported." + if apply_router_weight_on_input: + raise NotImplementedError( + "Apply router weight on input is not supported for" + "fused GGUF MoE method." 
class GGUFEmbeddingMethod(GGUFLinearMethod):
    """Embedding method for GGUF.

    Args:
        quant_config: The GGUF quantization config.
    """

    def embedding(self, layer: torch.nn.Module, x: torch.Tensor) -> torch.Tensor:
        """Look up the embedding rows selected by ``x``.

        Args:
            layer: Embedding layer exposing ``qweight`` (with a
                ``tensor_shape`` attribute) and ``qweight_type.weight_type``.
            x: Token indices to look up.

        Returns:
            Whatever ``apply_gguf_embedding`` produces for ``x``.
        """
        qweight = layer.qweight
        qweight_type = layer.qweight_type.weight_type
        # tensor_shape[1] is passed through as the embedding width.
        hidden_size = qweight.tensor_shape[1]

        # The lookup has to happen here: this is the only method that
        # receives the token indices.
        return apply_gguf_embedding(
            x, qweight, qweight_type, hidden_size, dtype=self.params_dtype
        )

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Run the parent class's post-load processing for ``layer``.

        NOTE(review): there are no token indices at load time, so no
        embedding lookup can be done in this hook. If the intent was to
        pre-dequantize the whole table into ``layer.weight``, that needs an
        explicit index range and a registered ``weight`` parameter — confirm
        with the author.
        """
        super().process_weights_after_loading(layer)
vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.layer import FusedMoE +from vllm.model_executor.layers.linear import LinearMethodBase +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, + QuantizeMethodBase, +) +from vllm.model_executor.layers.quantization.utils.gptq_utils import ( + get_linear_quant_method, +) +from vllm.model_executor.parameter import ( + ChannelQuantScaleParameter, + GroupQuantScaleParameter, + PackedColumnParameter, + PackedvLLMParameter, + RowvLLMParameter, +) +from vllm.transformers_utils.config import get_safetensors_params_metadata +from vllm.utils.collection_utils import is_list_of + +if TYPE_CHECKING: + from vllm.model_executor.layers.quantization import QuantizationMethods + from vllm.model_executor.models.utils import WeightsMapper +else: + QuantizationMethods = str + +logger = init_logger(__name__) + + +class GPTQConfig(QuantizationConfig): + """Config class for GPTQ. + + Reference: https://arxiv.org/abs/2210.17323 + """ + + def __init__( + self, + weight_bits: int, + group_size: int, + desc_act: bool, + lm_head_quantized: bool, + dynamic: dict[str, dict[str, int | bool]], + autoround_version: str = "", + modules_in_block_to_quantize: list[str] | None = None, + checkpoint_format: str = "", + ) -> None: + # GPTQModel use `dynamic` config property to allow per module + # quantization config so each module can be individually optimized. + # Format is dict[str, dict] where key is a regex string that can + # perform both positive ("+:" prefixed) or negative ("-:" prefixed) + # matching of a module. + # Default to positive match, override base quant config mode, if no + # prefix is used. Value is in dict format of field key and override + # value. + # Negative matching will skip quantization init for this module + # entirely: + # non-quantized inference. 
More details and quantization examples can be + # found at: https://github.com/ModelCloud/GPTQModel + # Example: + # # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9 + # # last 1/4 of the layers 16-21 has 8bit and group_size 64 + # dynamic = { + # #`.*\.` matches the layers_node prefix + # # positive match layer 10-15 + # r"+:.*\.(?:1[0-5])\..*": {"bits": 8,}, + # # positive match layer 16-21 + # r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,}, + # r"-:.*\.moe\..*": {}, # negative match (skip) all `moe` layers + # } + super().__init__() + self.dynamic = dynamic + + self.weight_bits = weight_bits + self.group_size = group_size + self.desc_act = desc_act + self.lm_head_quantized = lm_head_quantized + self.pack_factor = Fraction(32, self.weight_bits) + if self.weight_bits not in [2, 3, 4, 8]: + raise ValueError( + "Currently, only 2/3/4/8-bit weight quantization is " + f"supported for GPTQ, but got {self.weight_bits} bits." + ) + # Somehow gptq_gemm 4-bit is buggy, maybe fix it in the future. + # For now, show a warning, since gptq_marlin will be used by default. + if self.weight_bits == 4: + logger.warning_once( + "Currently, the 4-bit gptq_gemm kernel for GPTQ is buggy. " + "Please switch to gptq_marlin or gptq_bitblas." + ) + + self.modules_in_block_to_quantize = modules_in_block_to_quantize or [] + + # used to identify GPTQ model quantized by autoround + self.autoround_version = autoround_version + + # GPTQ v1 and v2 format deals with zero points differently. + # Currently GPTQModel stores v1 format checkpoints by default, + # but provides the option to set `format="gptq_v2"` in `QuantizeConfig`. 
+ self.checkpoint_format = checkpoint_format + + def __repr__(self) -> str: + return ( + f"GPTQConfig(weight_bits={self.weight_bits}, " + f"group_size={self.group_size}, " + f"desc_act={self.desc_act}), " + f"lm_head_quantized={self.lm_head_quantized}, " + f"dynamic={self.dynamic}, " + f"modules_in_block_to_quantize={self.modules_in_block_to_quantize}), " + f"checkpoint_format={self.checkpoint_format})" + ) + + @classmethod + def get_name(cls) -> QuantizationMethods: + return "gptq" + + @classmethod + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.bfloat16, torch.half] + + @classmethod + # Need to figure it out + def get_min_capability(cls) -> int: + return 60 + + @classmethod + def get_config_filenames(cls) -> list[str]: + return ["quantize_config.json"] + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "GPTQConfig": + dynamic = cls.get_from_keys_or(config, ["dynamic"], default={}) + dynamic = {} if dynamic is None else dynamic + + weight_bits = cls.get_from_keys(config, ["bits"]) + group_size = cls.get_from_keys(config, ["group_size"]) + desc_act = cls.get_from_keys(config, ["desc_act"]) + lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) + autoround_version = cls.get_from_keys_or( + config, ["autoround_version"], default="" + ) + modules_in_block_to_quantize = cls.get_from_keys_or( + config, ["modules_in_block_to_quantize"], default=None + ) + checkpoint_format = cls.get_from_keys_or( + config, ["checkpoint_format"], default="" + ) + return cls( + weight_bits, + group_size, + desc_act, + lm_head_quantized, + dynamic, + autoround_version, + modules_in_block_to_quantize, + checkpoint_format, + ) + + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Union["GPTQLinearMethod", "QuantizeMethodBase"] | None: + if isinstance(layer, FusedMoE): + # GPTQ MoE support: fall back to MoeWNA16 for broad compatibility + from .moe_wna16 import MoeWNA16Config + + # TODO: maybe update this 
for GPTQv2 format checkpoints + config = { + "quant_method": "gptq", + "bits": self.weight_bits, + "group_size": self.group_size, + "sym": True, # GPTQ typically uses symmetric quantization + "lm_head": False, + } + return MoeWNA16Config.from_config(config).get_quant_method(layer, prefix) + + return get_linear_quant_method(self, layer, prefix, GPTQLinearMethod) + + def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): + if self.modules_in_block_to_quantize is not None: + self.modules_in_block_to_quantize = hf_to_vllm_mapper.apply_list( + self.modules_in_block_to_quantize + ) + + def maybe_update_config(self, model_name: str, revision: str | None = None): + if self.modules_in_block_to_quantize: + if is_list_of(self.modules_in_block_to_quantize, list): + # original modules_in_block_to_quantize: list[list[str]] + # flatten original modules_in_block_to_quantize + self.modules_in_block_to_quantize = [ + item + for sublist in self.modules_in_block_to_quantize + for item in sublist + ] + return + + unquant_dtypes = [torch.float16, torch.bfloat16, torch.float32] + metadata = get_safetensors_params_metadata(model_name, revision=revision) + quant_layers: set[str] = { + param_name.rsplit(".", 1)[0] + for param_name, info in metadata.items() + if (dtype := info.get("dtype", None)) + and _SAFETENSORS_TO_TORCH_DTYPE[dtype] not in unquant_dtypes + } + self.modules_in_block_to_quantize = list(quant_layers) + + +class ExllamaState(Enum): + UNUSED = enum.auto() + UNINITIALIZED = enum.auto() + READY = enum.auto() + + +class GPTQLinearMethod(LinearMethodBase): + """Linear method for GPTQ. + + Args: + quant_config: The GPTQ quantization config. 
+ """ + + def __init__(self, quant_config: GPTQConfig): + self.quant_config = quant_config + + # GPTQ v1 and v2 format deals with zero points differently + self.use_v2_format = quant_config.checkpoint_format == "gptq_v2" + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + del output_size # Unused. + weight_loader = extra_weight_attrs.get("weight_loader") + if input_size_per_partition % self.quant_config.group_size != 0: + raise ValueError( + "The input size is not aligned with the quantized " + "weight shape. This can be caused by too large " + "tensor parallel size." + ) + output_size_per_partition = sum(output_partition_sizes) + if output_size_per_partition % self.quant_config.pack_factor.numerator != 0: + raise ValueError( + "The output size is not aligned with the quantized " + "weight shape. This can be caused by too large " + "tensor parallel size." 
+ ) + + if self.quant_config.group_size != -1: + group_size = self.quant_config.group_size + else: + group_size = input_size + exllama_state = ExllamaState.UNINITIALIZED + scale_and_zero_size = input_size // group_size + scale_and_zero_input_dim = None + if ( + input_size != input_size_per_partition + and self.quant_config.group_size != -1 + ): + # For act-order models, we cannot use Exllama for row parallel layer + if self.quant_config.desc_act: + exllama_state = ExllamaState.UNUSED + else: + # we need to partition qzeros and scales for exllama kernel + scale_and_zero_size = input_size_per_partition // group_size + scale_and_zero_input_dim = 0 + + qweight = PackedvLLMParameter( + data=torch.empty( + input_size_per_partition // self.quant_config.pack_factor, + output_size_per_partition, + dtype=torch.int32, + ), + input_dim=0, + output_dim=1, + packed_dim=0, + packed_factor=self.quant_config.pack_factor, + weight_loader=weight_loader, + ) + + g_idx = RowvLLMParameter( + data=torch.tensor( + [ + i // self.quant_config.group_size + for i in range(input_size_per_partition) + ], + dtype=torch.int32, + ), + input_dim=0, + weight_loader=weight_loader, + ) + qzeros_args = { + "data": torch.empty( + scale_and_zero_size, + output_size_per_partition // self.quant_config.pack_factor, + dtype=torch.int32, + ), + "weight_loader": weight_loader, + } + weight_scale_args = { + "data": torch.empty( + scale_and_zero_size, + output_size_per_partition, + dtype=params_dtype, + ), + "weight_loader": weight_loader, + } + if scale_and_zero_input_dim is None: + scales = ChannelQuantScaleParameter(output_dim=1, **weight_scale_args) + qzeros = PackedColumnParameter( + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + **qzeros_args, + ) + + else: + scales = GroupQuantScaleParameter( + output_dim=1, input_dim=0, **weight_scale_args + ) + qzeros = PackedvLLMParameter( + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + 
**qzeros_args, + ) + + layer.register_parameter("qweight", qweight) + layer.register_parameter("g_idx", g_idx) + layer.register_parameter("qzeros", qzeros) + layer.register_parameter("scales", scales) + + layer.exllama_state = exllama_state + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + # for torch.compile + layer.qzeros = Parameter(layer.qzeros.data, requires_grad=False) + layer.qweight = Parameter(layer.qweight.data, requires_grad=False) + layer.g_idx = Parameter(layer.g_idx.data, requires_grad=False) + layer.scales = Parameter(layer.scales.data, requires_grad=False) + + # exllama needs to shuffle the weight after the weight is loaded + # here we do the shuffle on first forward pass + if layer.exllama_state == ExllamaState.UNINITIALIZED: + if self.quant_config.desc_act: + layer.g_idx.data = torch.argsort(layer.g_idx).to(torch.int) + else: + layer.g_idx.data = torch.empty( + (0,), dtype=torch.int, device=layer.g_idx.device + ) + layer.exllama_state = ExllamaState.READY + ops.gptq_shuffle(layer.qweight, layer.g_idx, self.quant_config.weight_bits) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + out_shape = x.shape[:-1] + (layer.qweight.shape[-1],) + reshaped_x = x.reshape(-1, x.shape[-1]) + + # GPTQ v1 and v2 format checkpoints deals with zero points differently, + # and require different gemm kernels. 
+ output = ops.gptq_gemm( + reshaped_x, + layer.qweight, + layer.qzeros, + layer.scales, + layer.g_idx, + layer.exllama_state == ExllamaState.READY, + self.use_v2_format, + self.quant_config.weight_bits, + ) + if bias is not None: + output.add_(bias) + return output.reshape(out_shape) diff --git a/model_executor/layers/quantization/gptq_bitblas.py b/model_executor/layers/quantization/gptq_bitblas.py new file mode 100644 index 0000000..92f10bf --- /dev/null +++ b/model_executor/layers/quantization/gptq_bitblas.py @@ -0,0 +1,482 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any, Optional + +import torch +from packaging import version +from torch.nn.parameter import Parameter + +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import ( + LinearBase, + LinearMethodBase, + set_weight_attrs, +) +from vllm.model_executor.layers.quantization import ( + QuantizationConfig, + QuantizationMethods, +) +from vllm.model_executor.layers.quantization.kernels.mixed_precision import ( + BitBLASLinearKernel, + MPLinearLayerConfig, +) +from vllm.model_executor.layers.quantization.utils.bitblas_utils import ( + BITBLAS_SUPPORTED_NUM_BITS as GPTQ_BITBLAS_SUPPORTED_NUM_BITS, +) +from vllm.model_executor.layers.quantization.utils.bitblas_utils import ( + BITBLAS_SUPPORTED_SYM as GPTQ_BITBLAS_SUPPORTED_SYM, +) +from vllm.model_executor.layers.quantization.utils.bitblas_utils import ( + MINIMUM_BITBLAS_VERSION, + bitblas_repeat_scales_on_all_ranks, + check_bitblas_supported, + verify_bitblas_supported, +) +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.parameter import ( + ChannelQuantScaleParameter, + GroupQuantScaleParameter, + PackedColumnParameter, + PackedvLLMParameter, + RowvLLMParameter, +) +from vllm.platforms import current_platform +from vllm.scalar_type import scalar_types + +logger = init_logger(__name__) + + 
+class GPTQBitBLASConfig(QuantizationConfig): + """Config class for GPTQ BitBLAS""" + + # (num_bits, is_sym) -> quant_type + TYPE_MAP = { + (4, True): scalar_types.uint4b8, + (8, True): scalar_types.uint8b128, + } + + TORCH_DTYPE = torch.float16 + GPTQ_CKPT_STORAGE_DTYPE = ( + "int32" # GPTQ Default Checkpoints use int32 as storage dtype + ) + GPTQ_BITBLAS_STORAGE_DTYPE = "int8" # BitBLAS uses int8 as storage dtype + TORCH_BITBLAS_STORAGE_DTYPE = getattr(torch, GPTQ_BITBLAS_STORAGE_DTYPE) + # "original" or "rescale" or "quantized", + # the gptq_bitblas prefer "quantized" + ZEROS_MODE = "quantized" + + def __init__( + self, + weight_bits: int, + group_size: int, + desc_act: bool, + is_sym: bool, + quant_method: str | None, + lm_head_quantized: bool, + ) -> None: + try: + import bitblas + + if version.parse(bitblas.__version__) < version.parse( + MINIMUM_BITBLAS_VERSION + ): + raise ImportError( + "bitblas version is wrong. Please " + f"install bitblas>={MINIMUM_BITBLAS_VERSION}" + ) + except ImportError as e: + bitblas_import_exception = e + raise ValueError( + "Trying to use the bitblas backend, but could not import" + f"with the following error: {bitblas_import_exception}. " + "Please install bitblas through the following command: " + f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`" + ) from bitblas_import_exception + + if desc_act and group_size == -1: + # In this case, act_order == True is the same as act_order == False + # (since we have only one group per output channel) + desc_act = False + + super().__init__() + self.weight_bits = weight_bits + self.group_size = group_size + self.desc_act = desc_act + self.is_sym = is_sym + self.quant_method = quant_method + self.lm_head_quantized = lm_head_quantized + + # Verify + if self.weight_bits not in GPTQ_BITBLAS_SUPPORTED_NUM_BITS: + raise ValueError( + f"BitBLAS does not support weight_bits = {self.weight_bits}. " + f"Only weight_bits = {GPTQ_BITBLAS_SUPPORTED_NUM_BITS} " + "are supported." 
+ ) + + if self.is_sym not in GPTQ_BITBLAS_SUPPORTED_SYM: + raise ValueError( + f"BitBLAS does not support is_sym = {self.is_sym}. " + f"Only sym = {GPTQ_BITBLAS_SUPPORTED_SYM} are supported." + ) + + self.storage_dtype = self.GPTQ_BITBLAS_STORAGE_DTYPE + + storage_nbit = int( + "".join(c for c in self.GPTQ_CKPT_STORAGE_DTYPE if c.isdigit()) + ) + + # 4 Bits packed into 32 bit datatype. + self.pack_factor = storage_nbit // weight_bits + self.nbits = weight_bits + + # Zeros type for the quantized weights. + self.zeros_mode = self.ZEROS_MODE + + if (weight_bits, is_sym) not in self.TYPE_MAP: + raise ValueError( + f"Unsupported quantization config: bits={weight_bits}, sym={is_sym}" + ) + + self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)] + + def __repr__(self) -> str: + return ( + f"GPTQBitBLASConfig(weight_bits={self.weight_bits}, " + f"group_size={self.group_size}, " + f"desc_act={self.desc_act})" + f"is_sym={self.is_sym}, " + f"quant_method={self.quant_method})" + ) + + @classmethod + def get_name(cls) -> QuantizationMethods: + return "gptq_bitblas" + + @classmethod + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.half, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + return 80 + + @classmethod + def get_config_filenames(cls) -> list[str]: + return ["quantize_config.json"] + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "GPTQBitBLASConfig": + weight_bits = cls.get_from_keys(config, ["bits"]) + group_size = cls.get_from_keys(config, ["group_size"]) + desc_act = cls.get_from_keys(config, ["desc_act"]) + is_sym = cls.get_from_keys(config, ["sym"]) + quant_method = cls.get_from_keys(config, ["quant_method"]) + lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) + return cls( + weight_bits, group_size, desc_act, is_sym, quant_method, lm_head_quantized + ) + + @classmethod + def override_quantization_method( + cls, hf_quant_cfg, user_quant + ) -> QuantizationMethods | None: 
+ can_convert = cls.is_gptq_bitblas_compatible(hf_quant_cfg) + + is_valid_user_quant = ( + user_quant is None + or user_quant == "bitblas" + or user_quant == "gptq_bitblas" + ) + + if can_convert and is_valid_user_quant: + msg = ( + "The model is convertible to {} during runtime." + " Using {} kernel.".format(cls.get_name(), cls.get_name()) + ) + logger.info(msg) + return cls.get_name() + + if can_convert and user_quant == "gptq": + logger.info( + "Detected that the model can run with gptq_bitblas" + ", however you specified quantization=gptq explicitly," + " so forcing gptq. Use quantization=gptq_bitblas for" + " faster inference" + ) + return None + + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional["GPTQBitBLASLinearMethod"]: + if isinstance(layer, LinearBase) or ( + isinstance(layer, ParallelLMHead) and self.lm_head_quantized + ): + return GPTQBitBLASLinearMethod(self) + return None + + @property + def torch_storage_dtype(self) -> torch.dtype: + return self.TORCH_BITBLAS_STORAGE_DTYPE + + @classmethod + def is_gptq_bitblas_compatible(cls, quant_config: dict[str, Any]): + # Extract data from quant config. + num_bits = quant_config.get("bits") + group_size = quant_config.get("group_size") + sym = quant_config.get("sym") + desc_act = quant_config.get("desc_act") + + # temporarily disable on ROCm platform + if not current_platform.is_cuda(): + return False + + # If we cannot find the info needed in the config, cannot convert. + if num_bits is None or group_size is None or sym is None or desc_act is None: + return False + + if (num_bits, sym) not in cls.TYPE_MAP: + return False + + # If the capability of the device is too low, cannot convert. + major, minor = torch.cuda.get_device_capability() + device_capability = major * 10 + minor + if device_capability < cls.get_min_capability(): + return False + + # Otherwise, can convert if model satisfies bitblas constraints. 
+ return check_bitblas_supported( + quant_type=cls.TYPE_MAP[(num_bits, sym)], group_size=group_size + ) + + +class GPTQBitBLASLinearMethod(LinearMethodBase): + """Linear method for GPTQ BitBLAS. + + Args: + quant_config: The GPTQ BitBLAS quantization config. + """ + + kernel_type = BitBLASLinearKernel + _kernel_backends_being_used: set[str] = set() + + def __init__(self, quant_config: GPTQBitBLASConfig) -> None: + self.quant_config = quant_config + # Verify supported on platform. + verify_bitblas_supported( + quant_type=self.quant_config.quant_type, + group_size=self.quant_config.group_size, + ) + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ) -> None: + """Creates quantized weights for use in linear operations. + + The function initializes and returns a dictionary containing + quantized weights, scales, and zeros + for performing quantized matrix multiplication operations. + + Args: + input_size_per_partition: The size of the input partition. + output_partition_sizes: The size of the output partition. + input_size: The total size of the input (unused). + output_size: The total size of the output (unused). + params_dtype: + The data type of the parameters (expected to be torch.float16). + + Returns: + A dictionary containing the quantized weights ('qweight'), + scales ('scales'), and zeros ('zeros'). + + Raises: + ValueError: If `params_dtype` is not `torch.float16` or if the input + size per partition is not divisible by the group size + in `quant_config`. 
+ """ + if params_dtype != torch.float16: + raise ValueError( + f"Parameter data type must be torch.float16, but got {params_dtype}" + ) + + # Normalize group_size + if self.quant_config.group_size != -1: + group_size = self.quant_config.group_size + else: + group_size = input_size + + if input_size_per_partition % group_size != 0: + raise ValueError( + f"Input size per partition ({input_size_per_partition}) must " + f"be divisible by group size ({self.quant_config.group_size})." + ) + + kernel_type = self.kernel_type + # Validate output_size_per_partition + output_size_per_partition = sum(output_partition_sizes) + + is_row_parallel = input_size != input_size_per_partition + weight_loader = extra_weight_attrs.get("weight_loader") + + mp_linear_kernel_config = MPLinearLayerConfig( + full_weight_shape=(input_size, output_size), + partition_weight_shape=( + input_size_per_partition, + output_size_per_partition, + ), + weight_type=self.quant_config.quant_type, + act_type=params_dtype, + group_size=self.quant_config.group_size, + zero_points=False, + has_g_idx=self.quant_config.desc_act, + ) + + if kernel_type.__name__ not in self._kernel_backends_being_used: + logger.info("Using %s for GPTQBitBLASLinearMethod", kernel_type.__name__) + self._kernel_backends_being_used.add(kernel_type.__name__) + + # Normalize group_size + if self.quant_config.group_size != -1: + group_size = self.quant_config.group_size + else: + group_size = input_size + + # Determine sharding + if bitblas_repeat_scales_on_all_ranks( + self.quant_config.desc_act, self.quant_config.group_size, is_row_parallel + ): + # By setting scale_dim == None, weight_loader will + # repeat the scales on each GPU in TP>1 case. + scales_and_zp_input_dim = None + scales_and_zp_size = input_size // group_size + else: + # By setting scale_dim == 0, weight_loader will + # shard the scales in TP>1 case. 
+ scales_and_zp_input_dim = 0 + scales_and_zp_size = input_size_per_partition // group_size + + # Init buffers + # Quantized weights + qweight = PackedvLLMParameter( + data=torch.empty( + input_size_per_partition // self.quant_config.pack_factor, + output_size_per_partition, + dtype=torch.int32, + ), + input_dim=0, + output_dim=1, + packed_dim=0, + packed_factor=self.quant_config.pack_factor, + weight_loader=weight_loader, + ) + + # Activation order + # Ignore warning from fused linear layers such as QKVParallelLinear. + g_idx = RowvLLMParameter( + data=torch.empty( + input_size_per_partition, + dtype=torch.int32, + ), + input_dim=0, + weight_loader=weight_loader, + ) + + # Scales + scales = Parameter( + torch.empty( + scales_and_zp_size, + output_size_per_partition, + dtype=params_dtype, + ), + requires_grad=False, + ) + set_weight_attrs( + scales, + { + **extra_weight_attrs, + "input_dim": scales_and_zp_input_dim, + "output_dim": 1, + }, + ) + + # Quantized zero-points + qzeros_args = { + "data": torch.empty( + scales_and_zp_size, + output_size_per_partition // self.quant_config.pack_factor, + dtype=torch.int32, + ), + "weight_loader": weight_loader, + } + weight_scale_args = { + "data": torch.empty( + scales_and_zp_size, + output_size_per_partition, + dtype=params_dtype, + ), + "weight_loader": weight_loader, + } + + if scales_and_zp_input_dim is None: + scales = ChannelQuantScaleParameter(output_dim=1, **weight_scale_args) + qzeros = PackedColumnParameter( + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + **qzeros_args, + ) + + else: + scales = GroupQuantScaleParameter( + output_dim=1, input_dim=0, **weight_scale_args + ) + qzeros = PackedvLLMParameter( + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + **qzeros_args, + ) + + layer.register_parameter("qweight", qweight) + layer.register_parameter("g_idx", g_idx) + layer.register_parameter("scales", scales) + 
layer.register_parameter("qzeros", qzeros) + + self.kernel = kernel_type( + mp_linear_kernel_config, + w_q_param_name="qweight", + w_s_param_name="scales", + w_zp_param_name="qzeros", + w_gidx_param_name="g_idx", + bitblas_quant_config=self.quant_config, + ) + + # Initialize or retrieve the BitBLAS matrix multiplication operator. + self.kernel.configure_bitblas_matmul( + input_size_per_partition, + output_size_per_partition, + params_dtype=params_dtype, + bias=False, + ) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + self.kernel.process_weights_after_loading(layer) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + out = self.kernel.apply_gptq_bitblas_linear(layer, x) + if bias is not None: + out.add_(bias) + return out diff --git a/model_executor/layers/quantization/gptq_marlin.py b/model_executor/layers/quantization/gptq_marlin.py new file mode 100644 index 0000000..0699b12 --- /dev/null +++ b/model_executor/layers/quantization/gptq_marlin.py @@ -0,0 +1,1099 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable +from copy import deepcopy +from typing import Any, Optional + +import torch +from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE + +import vllm.model_executor.layers.fused_moe # noqa +from vllm import _custom_ops as ops +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEConfig, + FusedMoEQuantConfig, +) +from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe +from vllm.model_executor.layers.fused_moe.layer import ( + FusedMoE, + FusedMoEMethodBase, + FusedMoeWeightScaleSupported, + UnquantizedFusedMoEMethod, +) +from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs +from vllm.model_executor.layers.quantization import 
def unpack_k_batch_opt(packed_w: torch.Tensor, num_bits: int = 4, chunk_size: int = 2) -> torch.Tensor:
    """
    Memory-efficient unpacking for a 3D tensor.

    Converts a [B, K // pack_factor, N] tensor of packed words into a
    [B, K, N] int8 tensor, where ``pack_factor = 32 // num_bits``. Element
    ``p`` of each word (bits ``p * num_bits`` upward) lands at row
    ``k_packed_index * pack_factor + p`` of the output. Only ``chunk_size``
    sub-elements are expanded at a time, which bounds the size of the
    intermediate tensor.

    Args:
        packed_w: Packed tensor of shape [B, K // pack_factor, N].
        num_bits: Number of bits per packed element (e.g., 4 or 2).
        chunk_size: How many sub-elements to unpack per iteration (tradeoff
            between speed and memory). Must be at least 1.

    Returns:
        unpacked: torch.int8 tensor of shape [B, K, N].

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A non-positive step would skip the loop (negative) or make range()
        # fail (zero), leaving the torch.empty buffer below unfilled.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    B, k_packed, N = packed_w.shape
    pack_factor = 32 // num_bits
    K = k_packed * pack_factor
    mask = (1 << num_bits) - 1

    # Allocate the output once; every element is written by the loop below.
    unpacked = torch.empty((B, K, N), dtype=torch.int8, device=packed_w.device)
    # Same storage, with the sub-element index exposed as its own axis so a
    # whole chunk can be written in one assignment.
    lanes = unpacked.view(B, k_packed, pack_factor, N)

    for start in range(0, pack_factor, chunk_size):
        stop = min(start + chunk_size, pack_factor)
        # Bit offsets of the sub-elements in this chunk: [chunk, 1, 1, 1].
        shifts = (
            num_bits * torch.arange(start, stop, device=packed_w.device)
        ).view(-1, 1, 1, 1)
        # [chunk, B, k_packed, N]; the mask discards sign-extension bits
        # introduced by the arithmetic right shift.
        chunk = ((packed_w.unsqueeze(0) >> shifts) & mask).to(torch.int8)
        lanes[:, :, start:stop, :] = chunk.permute(1, 2, 0, 3)

        del chunk  # release memory early

    return unpacked
+ """ + B, k_packed, n = packed_w.shape + pack_factor = 32 // num_bits + k = k_packed * pack_factor + + mask = (1 << num_bits) - 1 + + # [pack_factor, 1, 1, 1] + shifts = (num_bits * torch.arange(pack_factor, device=packed_w.device)).view(-1, 1, 1, 1) + + # [1, B, k_packed, N] + packed_expanded = packed_w.unsqueeze(0) + + # Extract each group of num_bits using bitwise ops + unpacked_groups = ((packed_expanded >> shifts) & mask).to(torch.int8) + + # [pack_factor, B, k_packed, N] → [B, K, N] + unpacked = unpacked_groups.permute(1, 2, 0, 3).reshape(B, k, n) + + return unpacked + + +#[B,K,N] ->[B,K,N//8] +# less memmory +def pack_n_batch_opt(x: torch.Tensor, pack_num: int = 8, order_map=None, chunk_size: int = 2) -> torch.Tensor: + """ + Memory-efficient batch packing with correct bit order. + [B, K, N] int4 -> [B, K, N//pack_num] int32. + """ + B, K, N = x.shape + assert N % pack_num == 0, "N must be divisible by pack_num" + cols = N // pack_num + unit = 32 // pack_num + + if order_map is None: + order_map = list(range(pack_num)) + order_map = torch.tensor(order_map, device=x.device) + + shifts = unit * torch.arange(pack_num, device=x.device) # always 0..unit*(pack_num-1) + packed = torch.zeros((B, K, cols), dtype=torch.int32, device=x.device) + x_reshape = x.view(B, K, cols, pack_num) & 0xF + + # process in chunks for memory efficiency + for start in range(0, pack_num, chunk_size): + end = min(start + chunk_size, pack_num) + idx_chunk = order_map[start:end] + shift_chunk = shifts[start:end] + + vals = torch.gather(x_reshape, 3, idx_chunk.view(1,1,1,-1).expand(B,K,cols,-1)).to(torch.int32) + for j in range(vals.shape[-1]): + packed.add_(vals[..., j] << shift_chunk[j]) + + return packed + +## more memmory +def pack_n_batch(x: torch.Tensor, pack_num: int = 8, order_map=None) -> torch.Tensor: + """ + Efficient vectorized batch packing: [B, K, N] int4 -> [B, K, N//pack_num] int32. + + Args: + x: torch.int32 tensor of shape [B, K, N], each element 0-15 (int4). 
+ pack_num: Number of 4-bit elements per packed int32 (default=8). + order_map: Optional order of elements within each packed int32. + + Returns: + torch.int32 tensor of shape [B, K, N//pack_num]. + """ + + B, K, N = x.shape + assert N % pack_num == 0, "N must be divisible by pack_num" + cols = N // pack_num + + if order_map is None: + order_map = list(range(pack_num)) + order_map = torch.tensor(order_map, device=x.device) + + unit = 32 // pack_num # number of bits per element + + # reshape to [B, K, cols, pack_num] + pack_num_int = int(pack_num) + + x_reshape = x.view(B, K, cols, pack_num_int) + + # reorder according to order_map + x_reorder = torch.gather( + x_reshape, 3, order_map.view(1, 1, 1, -1).expand(B, K, cols, -1) + ) + + # mask low 4 bits + x_reorder = x_reorder & 0xF + + # bit shifts [pack_num] -> [1,1,1,pack_num] broadcastable + shifts = (unit * torch.arange(pack_num_int, device=x.device)).view(1, 1, 1, -1) + + # shift and sum along last dimension to combine bits + packed = (x_reorder << shifts).sum(dim=-1).to(torch.int32) + + return packed + + + +def get_moe_quant_method( + config: "GPTQMarlinConfig", + layer: torch.nn.Module, + prefix: str, + moe_method_cls: type, +): + cloned_config = deepcopy(config) + + if isinstance(layer, FusedMoE): + # False = skip module, None = no override, else = Positive match + if ( + get_dynamic_override( # noqa: E712 + cloned_config, # noqa: E712 + layer_name=prefix, + ) + == False + ): # noqa: E712 + return UnquantizedFusedMoEMethod(layer.moe_config) + + if prefix: + # Dynamic per module/layer rules may override base config + override_config(cloned_config, prefix=prefix) + + return moe_method_cls(cloned_config, layer.moe_config) + return None + + +class GPTQMarlinConfig(QuantizationConfig): + """Config class for GPTQ Marlin""" + + # (num_bits, is_sym) -> quant_type + TYPE_MAP = { + (4, True): scalar_types.uint4b8, + (8, True): scalar_types.uint8b128, + } + + def __init__( + self, + weight_bits: int, + group_size: int, + 
desc_act: bool, + is_sym: bool, + lm_head_quantized: bool, + dynamic: dict[str, dict[str, int | bool]], + full_config: dict[str, Any], + modules_in_block_to_quantize: list[str] | None = None, + ) -> None: + super().__init__() + if desc_act and group_size == -1: + # In this case, act_order == True is the same as act_order == False + # (since we have only one group per output channel) + desc_act = False + + # GPTQModel use `dynamic` config property to allow per module + # quantization config so each module can be individually optimized. + # Format is dict[str, dict] where key is a regex string that can + # perform both positive ("+:" prefixed) or negative ("-:" prefixed) + # matching of a module. + # Default to positive match, override base quant config mode, if no + # prefix is used. Value is in dict format of field key and override + # value. + # Negative matching will skip quantization init for this module + # entirely: + # non-quantized inference. More details and quantization examples can be + # found at: https://github.com/ModelCloud/GPTQModel + # Example: + # # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9 + # # last 1/4 of the layers 16-21 has 8bit and group_size 64 + # dynamic = { + # #`.*\.` matches the layers_node prefix + # # positive match layer 10-15 + # r"+:.*\.(?:1[0-5])\..*": {"bits": 8,}, + # # positive match layer 16-21 + # r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,}, + # r"-:.*\.moe\..*": {}, # negative match (skip) all `moe` layers + # } + self.dynamic = dynamic + + self.weight_bits = weight_bits + self.is_sym = is_sym + + self.pack_factor = 32 // weight_bits # packed into int32 + self.group_size = group_size + self.desc_act = desc_act + self.lm_head_quantized = lm_head_quantized + self.full_config = full_config + + if (weight_bits, is_sym) not in self.TYPE_MAP: + raise ValueError( + f"Unsupported quantization config: bits={weight_bits}, sym={is_sym}" + ) + + self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)] + + 
self.modules_in_block_to_quantize = modules_in_block_to_quantize or [] + # used to identify GPTQ model quantized by autoround + self.autoround_version = full_config.get("autoround_version", "") + + def __repr__(self) -> str: + return ( + f"GPTQMarlinConfig(quant_type={self.quant_type}, " + f"group_size={self.group_size}, " + f"desc_act={self.desc_act}, " + f"lm_head_quantized={self.lm_head_quantized}, " + f"dynamic={self.dynamic}, " + f"modules_in_block_to_quantize={self.modules_in_block_to_quantize})" + ) + + @classmethod + def get_name(cls) -> QuantizationMethods: + return "gptq_marlin" + + @classmethod + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.half, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + return 80 + + @classmethod + def get_config_filenames(cls) -> list[str]: + return ["quantize_config.json"] + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "GPTQMarlinConfig": + dynamic = cls.get_from_keys_or(config, ["dynamic"], default={}) + dynamic = {} if dynamic is None else dynamic + + weight_bits = cls.get_from_keys(config, ["bits"]) + group_size = cls.get_from_keys(config, ["group_size"]) + desc_act = cls.get_from_keys(config, ["desc_act"]) + is_sym = cls.get_from_keys(config, ["sym"]) + lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) + modules_in_block_to_quantize = cls.get_from_keys_or( + config, ["modules_in_block_to_quantize"], default=None + ) + return cls( + weight_bits, + group_size, + desc_act, + is_sym, + lm_head_quantized, + dynamic, + config, + modules_in_block_to_quantize, + ) + + @classmethod + def override_quantization_method( + cls, hf_quant_cfg, user_quant + ) -> QuantizationMethods | None: + can_convert = cls.is_gptq_marlin_compatible(hf_quant_cfg) + + is_valid_user_quant = ( + user_quant is None or user_quant == "marlin" or user_quant == "gptq_marlin" + ) + + if can_convert and is_valid_user_quant: + msg = ( + "The model is convertible to {} 
during runtime." + " Using {} kernel.".format(cls.get_name(), cls.get_name()) + ) + logger.info(msg) + return cls.get_name() + + if can_convert and user_quant == "gptq": + logger.info( + "Detected that the model can run with gptq_marlin" + ", however you specified quantization=gptq explicitly," + " so forcing gptq. Use quantization=gptq_marlin for" + " faster inference" + ) + return None + + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional["QuantizeMethodBase"]: + if isinstance(layer, FusedMoE): + from vllm.model_executor.layers.quantization.moe_wna16 import MoeWNA16Config + + if not check_moe_marlin_supports_layer(layer, self.group_size): + logger.warning_once( + f"Layer '{prefix}' is not supported by GPTQMoeMarlin. " + "Falling back to Moe WNA16 kernels." + ) + return MoeWNA16Config.from_config(self.full_config).get_quant_method( + layer, prefix + ) + return get_moe_quant_method(self, layer, prefix, GPTQMarlinMoEMethod) + return get_linear_quant_method(self, layer, prefix, GPTQMarlinLinearMethod) + + @classmethod + def is_gptq_marlin_compatible(cls, quant_config: dict[str, Any]): + quant_method = quant_config.get("quant_method", "").lower() + num_bits = quant_config.get("bits") + group_size = quant_config.get("group_size") + sym = quant_config.get("sym") + desc_act = quant_config.get("desc_act") + + if not current_platform.is_cuda(): + return False + + if quant_method != "gptq": + return False + + # Marlin conversion is only valid if required properties are found + if num_bits is None or group_size is None or sym is None or desc_act is None: + return False + + if (num_bits, sym) not in cls.TYPE_MAP: + return False + + return check_marlin_supported( + quant_type=cls.TYPE_MAP[(num_bits, sym)], group_size=group_size + ) + + def apply_vllm_mapper(self, hf_to_vllm_mapper): + if self.modules_in_block_to_quantize is not None: + self.modules_in_block_to_quantize = hf_to_vllm_mapper.apply_list( + self.modules_in_block_to_quantize + ) + + 
def maybe_update_config(self, model_name: str, revision: str | None = None): + if self.modules_in_block_to_quantize: + if is_list_of(self.modules_in_block_to_quantize, list): + # original modules_in_block_to_quantize: list[list[str]] + # flatten original modules_in_block_to_quantize + self.modules_in_block_to_quantize = [ + item + for sublist in self.modules_in_block_to_quantize + for item in sublist + ] + return + + unquant_dtypes = [torch.float16, torch.bfloat16, torch.float32] + metadata = get_safetensors_params_metadata(model_name, revision=revision) + quant_layers: set[str] = { + param_name.rsplit(".", 1)[0] + for param_name, info in metadata.items() + if (dtype := info.get("dtype", None)) + and _SAFETENSORS_TO_TORCH_DTYPE[dtype] not in unquant_dtypes + } + self.modules_in_block_to_quantize = list(quant_layers) + + +class GPTQMarlinLinearMethod(LinearMethodBase): + """Linear method for GPTQ Marlin. + + Args: + quant_config: The GPTQ Marlin quantization config. + """ + + _kernel_backends_being_used: set[str] = set() + + def __init__(self, quant_config: GPTQMarlinConfig) -> None: + self.quant_config = quant_config + + # Verify supported on platform. 
+ verify_marlin_supported( + quant_type=self.quant_config.quant_type, + group_size=self.quant_config.group_size, + ) + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ) -> None: + output_size_per_partition = sum(output_partition_sizes) + is_row_parallel = input_size != input_size_per_partition + weight_loader = extra_weight_attrs.get("weight_loader") + + mp_linear_kernel_config = MPLinearLayerConfig( + full_weight_shape=(input_size, output_size), + partition_weight_shape=( + input_size_per_partition, + output_size_per_partition, + ), + weight_type=self.quant_config.quant_type, + act_type=params_dtype, + group_size=self.quant_config.group_size, + zero_points=False, + has_g_idx=self.quant_config.desc_act, + ) + + kernel_type = choose_mp_linear_kernel(mp_linear_kernel_config) + + if kernel_type.__name__ not in self._kernel_backends_being_used: + logger.info("Using %s for GPTQMarlinLinearMethod", kernel_type.__name__) + self._kernel_backends_being_used.add(kernel_type.__name__) + + # Normalize group_size + if self.quant_config.group_size != -1: + group_size = self.quant_config.group_size + else: + group_size = input_size + + # Determine sharding + if marlin_repeat_scales_on_all_ranks( + self.quant_config.desc_act, self.quant_config.group_size, is_row_parallel + ): + # By setting scale_dim == None, weight_loader will + # repeat the scales on each GPU in TP>1 case. + scales_and_zp_input_dim = None + scales_and_zp_size = input_size // group_size + else: + # By setting scale_dim == 0, weight_loader will + # shard the scales in TP>1 case. 
+ scales_and_zp_input_dim = 0 + scales_and_zp_size = input_size_per_partition // group_size + + # Quantized weights + qweight = PackedvLLMParameter( + data=torch.empty( + input_size_per_partition // self.quant_config.pack_factor, + output_size_per_partition, + dtype=torch.int32, + ), + input_dim=0, + output_dim=1, + packed_dim=0, + packed_factor=self.quant_config.pack_factor, + weight_loader=weight_loader, + ) + + # Activation order + g_idx = RowvLLMParameter( + data=torch.empty( + input_size_per_partition, + dtype=torch.int32, + ), + input_dim=0, + weight_loader=weight_loader, + ) + + qzeros_args = { + "data": torch.empty( + scales_and_zp_size, + output_size_per_partition // self.quant_config.pack_factor, + dtype=torch.int32, + ), + "weight_loader": weight_loader, + } + weight_scale_args = { + "data": torch.empty( + scales_and_zp_size, + output_size_per_partition, + dtype=params_dtype, + ), + "weight_loader": weight_loader, + } + + if scales_and_zp_input_dim is None: + scales = ChannelQuantScaleParameter(output_dim=1, **weight_scale_args) + qzeros = PackedColumnParameter( + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + **qzeros_args, + ) + + else: + scales = GroupQuantScaleParameter( + output_dim=1, input_dim=0, **weight_scale_args + ) + qzeros = PackedvLLMParameter( + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + **qzeros_args, + ) + + layer.register_parameter("qweight", qweight) + layer.register_parameter("g_idx", g_idx) + layer.register_parameter("scales", scales) + layer.register_parameter("qzeros", qzeros) + + self.kernel = kernel_type( + mp_linear_kernel_config, + w_q_param_name="qweight", + w_s_param_name="scales", + w_zp_param_name="qzeros", + w_gidx_param_name="g_idx", + ) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + self.kernel.process_weights_after_loading(layer) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: 
torch.Tensor | None = None, + ) -> torch.Tensor: + return self.kernel.apply_weights(layer, x, bias) + + +class GPTQMarlinMoEMethod(FusedMoEMethodBase): + """MoE Marlin method with quantization.""" + + def __init__( + self, + quant_config: GPTQMarlinConfig, + moe: FusedMoEConfig, + ) -> None: + super().__init__(moe) + self.quant_config = quant_config + if self.quant_config.quant_type.size_bits == 4: + self.quant_type = scalar_types.uint4b8 + # elif self.quant_config.quant_type.size_bits == 8: + # self.quant_type = scalar_types.uint8b128 + else: + raise ValueError("GPTQMarlinMoEMethod only supports int4 and int8 now.") + self.use_marlin = True + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + intermediate_size_full = extra_weight_attrs.pop("intermediate_size_full") + + self.is_k_full = (not self.quant_config.desc_act) or ( + intermediate_size_per_partition == intermediate_size_full + ) + + if self.quant_config.group_size != -1: + scales_size13 = hidden_size // self.quant_config.group_size + w2_scales_size = ( + intermediate_size_full + if self.quant_config.desc_act + else intermediate_size_per_partition + ) + scales_size2 = w2_scales_size // self.quant_config.group_size + strategy = FusedMoeWeightScaleSupported.GROUP.value + else: + scales_size13 = 1 + scales_size2 = 1 + strategy = FusedMoeWeightScaleSupported.CHANNEL.value + + extra_weight_attrs.update({"quant_method": strategy, "is_transposed": True}) + # Fused gate_up_proj (column parallel) + w13_qweight = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size // self.quant_config.pack_factor, + 2 * intermediate_size_per_partition, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_qweight", w13_qweight) + set_weight_attrs(w13_qweight, extra_weight_attrs) + # down_proj (row parallel) + w2_qweight = torch.nn.Parameter( + 
torch.empty( + num_experts, + intermediate_size_per_partition // self.quant_config.pack_factor, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_qweight", w2_qweight) + set_weight_attrs(w2_qweight, extra_weight_attrs) + # up_proj scales + w13_scales = torch.nn.Parameter( + torch.empty( + num_experts, + scales_size13, + 2 * intermediate_size_per_partition, + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w13_scales", w13_scales) + set_weight_attrs(w13_scales, extra_weight_attrs) + # down_proj scales + w2_scales = torch.nn.Parameter( + torch.empty(num_experts, scales_size2, hidden_size, dtype=params_dtype), + requires_grad=False, + ) + layer.register_parameter("w2_scales", w2_scales) + set_weight_attrs(w2_scales, extra_weight_attrs) + # don't shard the w2 scales when running act order + set_weight_attrs(w2_scales, {"load_full_w2": self.quant_config.desc_act}) + # up_proj scales + w13_qzeros = torch.nn.Parameter( + torch.empty( + num_experts, + scales_size13, + 2 * intermediate_size_per_partition // self.quant_config.pack_factor, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_qzeros", w13_qzeros) + set_weight_attrs(w13_qzeros, extra_weight_attrs) + # down_proj scales + w2_qzeros = torch.nn.Parameter( + torch.empty( + num_experts, + scales_size2, + hidden_size // self.quant_config.pack_factor, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_qzeros", w2_qzeros) + set_weight_attrs(w2_qzeros, extra_weight_attrs) + # don't shard the w2 scales when running act order + set_weight_attrs(w2_qzeros, {"load_full_w2": self.quant_config.desc_act}) + w13_g_idx = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_g_idx", w13_g_idx) + set_weight_attrs(w13_g_idx, extra_weight_attrs) + w2_g_idx = torch.nn.Parameter( + torch.empty( + 
num_experts, + intermediate_size_per_partition, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_g_idx", w2_g_idx) + set_weight_attrs(w2_g_idx, extra_weight_attrs) + w13_g_idx_sort_indices = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_g_idx_sort_indices", w13_g_idx_sort_indices) + set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs) + w2_g_idx_sort_indices = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size_per_partition, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_g_idx_sort_indices", w2_g_idx_sort_indices) + set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs) + + device = layer.w13_qweight.device + # layer.workspace = marlin_make_workspace_new(device, 4) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + # Process act_order + # if self.quant_config.desc_act: + # Get sorting based on g_idx + # num_experts = layer.w13_g_idx.shape[0] + # w13_g_idx_sort_indices = torch.empty_like(layer.w13_g_idx) + # w2_g_idx_sort_indices = torch.empty_like(layer.w2_g_idx) + # w13_sorted_g_idx = torch.empty_like(layer.w13_g_idx) + # w2_sorted_g_idx = torch.empty_like(layer.w2_g_idx) + # for e in range(num_experts): + # w13_g_idx_sort_indices[e] = torch.argsort(layer.w13_g_idx[e]).to( + # torch.int32 + # ) + # w2_g_idx_sort_indices[e] = torch.argsort(layer.w2_g_idx[e]).to( + # torch.int32 + # ) + # w13_sorted_g_idx[e] = layer.w13_g_idx[e][w13_g_idx_sort_indices[e]] + # w2_sorted_g_idx[e] = layer.w2_g_idx[e][w2_g_idx_sort_indices[e]] + # replace_parameter(layer, "w13_g_idx", w13_sorted_g_idx) + # replace_parameter(layer, "w2_g_idx", w2_sorted_g_idx) + # replace_parameter(layer, "w13_g_idx_sort_indices", w13_g_idx_sort_indices) + # replace_parameter(layer, "w2_g_idx_sort_indices", w2_g_idx_sort_indices) + # else: + # # Reset g_idx related tensors 
+ # num_experts = layer.w13_g_idx.shape[0] + # device = layer.w13_g_idx.device + # layer.w13_g_idx = torch.nn.Parameter( + # torch.empty((num_experts, 0), dtype=torch.int32, device=device), + # requires_grad=False, + # ) + # layer.w2_g_idx = torch.nn.Parameter( + # torch.empty((num_experts, 0), dtype=torch.int32, device=device), + # requires_grad=False, + # ) + # layer.w13_g_idx_sort_indices = torch.nn.Parameter( + # torch.empty((num_experts, 0), dtype=torch.int32, device=device), + # requires_grad=False, + # ) + # layer.w2_g_idx_sort_indices = torch.nn.Parameter( + # torch.empty((num_experts, 0), dtype=torch.int32, device=device), + # requires_grad=False, + # ) + # # Repack weights + # marlin_w13_qweight = ops.gptq_marlin_moe_repack( + # layer.w13_qweight, + # layer.w13_g_idx_sort_indices, + # layer.w13_qweight.shape[1] * self.quant_config.pack_factor, + # layer.w13_qweight.shape[2], + # self.quant_config.quant_type.size_bits, + # ) + # replace_parameter(layer, "w13_qweight", marlin_w13_qweight) + # marlin_w2_qweight = ops.gptq_marlin_moe_repack( + # layer.w2_qweight, + # layer.w2_g_idx_sort_indices, + # layer.w2_qweight.shape[1] * self.quant_config.pack_factor, + # layer.w2_qweight.shape[2], + # self.quant_config.quant_type.size_bits, + # ) + # replace_parameter(layer, "w2_qweight", marlin_w2_qweight) + # # Repack scales + # marlin_w13_scales = marlin_moe_permute_scales( + # s=layer.w13_scales, + # size_k=layer.intermediate_size_per_partition, + # size_n=layer.w13_scales.shape[2], + # group_size=self.quant_config.group_size, + # ) + # replace_parameter(layer, "w13_scales", marlin_w13_scales) + # marlin_w2_scales = marlin_moe_permute_scales( + # s=layer.w2_scales, + # size_k=layer.w2_scales.shape[1] + # * ( + # self.quant_config.group_size + # if self.quant_config.group_size != -1 + # else self.quant_config.pack_factor + # ), + # size_n=layer.w2_scales.shape[2], + # group_size=self.quant_config.group_size, + # ) + # replace_parameter(layer, "w2_scales", 
marlin_w2_scales) + + # if hasattr(layer, "w13_bias") and layer.w13_bias is not None: + # layer.w13_bias.data = marlin_permute_bias(layer.w13_bias) + + # if hasattr(layer, "w2_bias") and layer.w2_bias is not None: + # layer.w2_bias.data = marlin_permute_bias(layer.w2_bias) + if self.quant_config.desc_act: + raise NotImplementedError( + "GPTQMarlinMoEMethod now not support desc_act. please fix it") + w13_qweight_unpacked = unpack_k_batch(layer.w13_qweight) + w13_qweight_repacked = pack_n_batch(w13_qweight_unpacked,self.quant_config.pack_factor,order_map=[0, 2, 4, 6, 1, 3, 5, 7]) + replace_parameter(layer, "w13_qweight", w13_qweight_repacked) + + # quant vllm/model_executor/layers/quantization/utils/quant_utils.py#quantize_weights + # if quant_type.has_bias(): + # w_q += quant_type.bias + # use quant_type.bias as zp,(ixformer support) + w13_zp = torch.full_like(layer.w13_scales, self.quant_type.bias, dtype=torch.int32) + w13_zp_pack = pack_n_batch(w13_zp, self.quant_config.pack_factor, order_map=[0, 2, 4, 6, 1, 3, 5, 7]).contiguous() + replace_parameter(layer, "w13_qzeros", w13_zp_pack) + + w2_qweight_unpacked = unpack_k_batch(layer.w2_qweight) + w2_qweight_repacked = pack_n_batch(w2_qweight_unpacked,self.quant_config.pack_factor,order_map=[0, 2, 4, 6, 1, 3, 5, 7]) + replace_parameter(layer, "w2_qweight", w2_qweight_repacked) + + w2_zp = torch.full_like(layer.w2_scales, self.quant_type.bias, dtype=torch.int32) + w2_zp_pack = pack_n_batch(w2_zp, self.quant_config.pack_factor, order_map=[0, 2, 4, 6, 1, 3, 5, 7]).contiguous() + replace_parameter(layer, "w2_qzeros", w2_zp_pack) + + def get_fused_moe_quant_config( + self, layer: torch.nn.Module + ) -> FusedMoEQuantConfig | None: + return None + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + 
expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if enable_eplb: + raise NotImplementedError( + "EPLB not supported for `GPTQMarlinMoEMethod` yet." + ) + + assert activation == "silu", "Only SiLU activation is supported." + use_ep = expert_map is not None + + if use_ep: + start_eid = layer.ep_rank * layer.local_num_experts + end_eid = min((layer.ep_rank + 1) * layer.local_num_experts, global_num_experts) + + if apply_router_weight_on_input: + raise NotImplementedError( + "GPTQMarlinMoEMethod Apply router weight on input is not supported for" + "fused Marlin MoE method.") + + if (hasattr(layer, "w13_bias") and layer.w13_bias is not None) or (hasattr(layer, "w2_bias") and layer.w2_bias is not None): + raise NotImplementedError( + "GPTQMarlinMoEMethod moe_w4a16_group_gemm not supported bias, please fix this") + + topk_weights, topk_ids, _ = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype) + + + num_tokens, num_experts = router_logits.shape + + if use_ep: + hidden_size = x.shape[1] + ( + src_to_dst, + sorted_token_ids, + expert_sizes_gpu, + expert_sizes_cpu, + expand_tokens, + ) = ixfops.moe_compute_token_index_ep( 
+ topk_ids=topk_ids, + num_experts=num_experts, + start_expert_id=start_eid, + end_expert_id=end_eid, + ) + if expert_sizes_cpu.sum() == 0: + return torch.zeros( + (num_tokens, hidden_size), + device=x.device, + dtype=x.dtype, + ) + else: + expand_tokens = num_tokens * top_k + ( + src_to_dst, + sorted_token_ids, + expert_sizes_gpu, + expert_sizes_cpu, + ) = ixfops.moe_compute_token_index( + topk_ids=topk_ids, + num_experts=num_experts, + ) + expert_sizes_cpu = expert_sizes_gpu.cpu() + + # expand + reorder + # TODO use kernel + expand_hidden_states = ixfops.moe_expand_input( + hidden_states=x, + dst_to_src=sorted_token_ids, + dst_tokens=expand_tokens, + topk=top_k, + src_to_dst=src_to_dst, + ) + + # w4a16 group gemm 1 + # pt_output_1: (expand_tokens, 2n) dtype + pt_output_1 = ixfops.moe_w4a16_group_gemm( + input=expand_hidden_states, + weight=layer.w13_qweight, + w_scales=layer.w13_scales, + quant_type="awq", + tokens_per_experts=expert_sizes_cpu, + w_zeros=layer.w13_qzeros, + group_size=self.quant_config.group_size, + dst_to_src=None, + format="NN", + tokens_per_experts_gpu=expert_sizes_gpu, + ) + + # act + pt_output_2 = ixfops.silu_and_mul(pt_output_1) + + # w4a16 group gemm 2 + reorder + # pt_output_3: (expand_tokens, k) dtype + if use_ep: + pt_output_3 = torch.empty( + (num_tokens * top_k, hidden_size), + device=x.device, + dtype=x.dtype, + ) + + ixfops.moe_w4a16_group_gemm( + input=pt_output_2, + weight=layer.w2_qweight, + w_scales=layer.w2_scales, + quant_type="awq", + tokens_per_experts=expert_sizes_cpu, + w_zeros=layer.w2_qzeros, + group_size=self.quant_config.group_size, + dst_to_src=sorted_token_ids, + format="NN", + output=pt_output_3, + tokens_per_experts_gpu=expert_sizes_gpu, + ) + + reduce_mask = src_to_dst == -1 + final_hidden_states = ixfops.moe_output_reduce_sum( + input=pt_output_3.view(num_tokens, top_k, -1), + topk_weight=topk_weights, + scaling_factor=routed_scaling_factor, + mask=reduce_mask, + ) + else: + pt_output_3 = 
ixfops.moe_w4a16_group_gemm( + input=pt_output_2, + weight=layer.w2_qweight, + w_scales=layer.w2_scales, + quant_type="awq", + tokens_per_experts=expert_sizes_cpu, + w_zeros=layer.w2_qzeros, + group_size=self.quant_config.group_size, + dst_to_src=sorted_token_ids, + format="NN", + tokens_per_experts_gpu=expert_sizes_gpu, + ) + + # mul + reduce_sum + # final_hidden_states: (num_tokens, k) + final_hidden_states = ixfops.moe_output_reduce_sum( + input=pt_output_3.view(num_tokens, top_k, -1), + topk_weight=topk_weights, + scaling_factor=routed_scaling_factor + ) + return final_hidden_states + + + + + + # return torch.ops.vllm.fused_marlin_moe( + # x, + # layer.w13_qweight, + # layer.w2_qweight, + # getattr(layer, "w13_bias", None), + # getattr(layer, "w2_bias", None), + # layer.w13_scales, + # layer.w2_scales, + # router_logits, + # topk_weights, + # topk_ids, + # quant_type_id=self.quant_type.id, + # apply_router_weight_on_input=apply_router_weight_on_input, + # global_num_experts=global_num_experts, + # expert_map=expert_map, + # g_idx1=layer.w13_g_idx, + # g_idx2=layer.w2_g_idx, + # sort_indices1=layer.w13_g_idx_sort_indices, + # sort_indices2=layer.w2_g_idx_sort_indices, + # workspace=layer.workspace, + # is_k_full=self.is_k_full) diff --git a/model_executor/layers/quantization/gptq_marlin_24.py b/model_executor/layers/quantization/gptq_marlin_24.py new file mode 100644 index 0000000..2fb614b --- /dev/null +++ b/model_executor/layers/quantization/gptq_marlin_24.py @@ -0,0 +1,320 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Any, Optional + +import torch +from torch.nn.parameter import Parameter + +from vllm import _custom_ops as ops +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.quantization import ( + QuantizationConfig, + QuantizationMethods, +) +from 
vllm.model_executor.parameter import ( + BasevLLMParameter, + ChannelQuantScaleParameter, + GroupQuantScaleParameter, + PackedvLLMParameter, +) +from vllm.scalar_type import scalar_types + +logger = init_logger(__name__) + +GPTQ_MARLIN_24_TILE = 16 +GPTQ_MARLIN_24_MIN_THREAD_N = 128 +GPTQ_MARLIN_24_MIN_THREAD_K = 128 +GPTQ_MARLIN_24_MAX_PARALLEL = 64 + +GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES = [scalar_types.uint4b8, scalar_types.uint8b128] +GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES = [-1, 128] + + +class GPTQMarlin24Config(QuantizationConfig): + """Config class for Marlin24.""" + + def __init__( + self, + weight_bits: int, + group_size: int, + ) -> None: + super().__init__() + quant_type = { + 4: scalar_types.uint4b8, + 8: scalar_types.uint8b128, + }.get(weight_bits) + + self.group_size = group_size + + # Verify + if quant_type is None or quant_type not in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES: + raise ValueError( + f"Marlin_24 does not support quant_type = {quant_type}. " + f"Only weight_bits = {GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES} " + "are supported." + ) + if self.group_size not in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES: + raise ValueError( + f"Marlin_24 does not support group_size = {self.group_size}. " + f"Only group_sizes = {GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES} " + "are supported." + ) + + self.quant_type = quant_type + + # 4 Bits packed into 32 bit datatype. + self.pack_factor = 32 // self.quant_type.size_bits + + # Tile size used by marlin kernels. + self.tile_size = 16 + + # Min out_features dim + self.min_n_threads = GPTQ_MARLIN_24_MIN_THREAD_N + + # Min in_features dim + self.min_k_threads = GPTQ_MARLIN_24_MIN_THREAD_K + + # Max parallel problems to solve at once (improves large + # batch performance) + self.max_parallel = GPTQ_MARLIN_24_MAX_PARALLEL + + # Permutation length used by the marlin kernels. 
+ self.perm_len = 1024 + + def __repr__(self) -> str: + return "Marlin24Config(quant_type={}, group_size={})".format( + self.quant_type, self.group_size + ) + + @classmethod + def get_name(cls) -> QuantizationMethods: + return "gptq_marlin_24" + + @classmethod + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.half] + + @classmethod + # Need to figure it out + def get_min_capability(cls) -> int: + return 80 + + @classmethod + def get_config_filenames(cls) -> list[str]: + return ["quantize_config.json"] + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "GPTQMarlin24Config": + weight_bits = cls.get_from_keys(config, ["bits"]) + group_size = cls.get_from_keys(config, ["group_size"]) + return cls(weight_bits, group_size) + + @classmethod + def override_quantization_method( + cls, hf_quant_cfg, user_quant + ) -> QuantizationMethods | None: + is_marlin_24_format = hf_quant_cfg.get("checkpoint_format") == "marlin_24" + + is_valid_user_quant = ( + user_quant is None or user_quant == "gptq" or user_quant == "gptq_marlin_24" + ) + + if is_marlin_24_format and is_valid_user_quant: + msg = "The model is serialized in {} format. Using {} kernel.".format( + cls.get_name(), cls.get_name() + ) + logger.info(msg) + return cls.get_name() + + return None + + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional["GPTQMarlin24LinearMethod"]: + if isinstance(layer, LinearBase): + return GPTQMarlin24LinearMethod(self) + return None + + +class GPTQMarlin24LinearMethod(LinearMethodBase): + """Linear method for Marlin24. + + Args: + quant_config: The Marlin24 quantization config. 
+ """ + + def __init__(self, quant_config: GPTQMarlin24Config): + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + del output_size # Unused. + weight_loader = extra_weight_attrs["weight_loader"] + if params_dtype != torch.float16: + raise ValueError( + f"The params dtype must be float16, but got {params_dtype}" + ) + + # Validate output_size_per_partition + output_size_per_partition = sum(output_partition_sizes) + if output_size_per_partition % self.quant_config.min_n_threads != 0: + raise ValueError( + f"Weight output_size_per_partition = " + f"{output_size_per_partition} is not divisible by " + f"min_n_threads = {self.quant_config.min_n_threads}." + ) + if output_size_per_partition % self.quant_config.pack_factor != 0: + raise ValueError( + f"Weight output_size_per_partition = " + f"{output_size_per_partition} is not divisible by " + f"pack_factor = {self.quant_config.pack_factor}." + ) + + # Validate input_size_per_partition + if input_size_per_partition % self.quant_config.min_k_threads != 0: + raise ValueError( + f"Weight input_size_per_partition = " + f"{input_size_per_partition} is not divisible by " + f"min_k_threads = {self.quant_config.min_k_threads}." + ) + if ( + self.quant_config.group_size != -1 + and input_size_per_partition % self.quant_config.group_size != 0 + ): + raise ValueError( + f"Weight input_size_per_partition = " + f"{input_size_per_partition} is not divisible by " + f"group_size = {self.quant_config.group_size}." 
+ ) + + # Check that we have at least 4 tiles horizontally in the shard + num_tiles_per_perm = self.quant_config.perm_len // ( + self.quant_config.tile_size**2 + ) + if output_size_per_partition % num_tiles_per_perm != 0: + raise ValueError("Each permutation group must reside on the same gpu") + + # Quantized 4Bit weights packed into Int32. + qweight = PackedvLLMParameter( + data=torch.empty( + input_size_per_partition // self.quant_config.tile_size // 2, + output_size_per_partition + * self.quant_config.tile_size + // self.quant_config.pack_factor, + device="cuda", + dtype=torch.int32, + ), + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + marlin_tile_size=self.quant_config.tile_size, + weight_loader=weight_loader, + ) + + # Meta + meta = PackedvLLMParameter( + data=torch.empty( + input_size_per_partition // 8 // 2 // 2, + output_size_per_partition * 2, + device="cuda", + dtype=torch.int16, + ), + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=1, + marlin_tile_size=2, + weight_loader=weight_loader, + ) + + # Determine if channelwise or not + input_groups = ( + 1 + if self.quant_config.group_size == -1 + else input_size_per_partition // self.quant_config.group_size + ) + + weight_scale_args = { + "data": torch.empty( + input_groups, + output_size_per_partition, + device="cuda", + dtype=params_dtype, + ), + "weight_loader": weight_loader, + } + if input_groups == 1: + scales = ChannelQuantScaleParameter(output_dim=1, **weight_scale_args) + else: + scales = GroupQuantScaleParameter( + output_dim=1, input_dim=0, **weight_scale_args + ) + + # Allocate workspace (Used for internal locking mechanism) + max_workspace_size = ( + output_size_per_partition // self.quant_config.min_n_threads + ) * self.quant_config.max_parallel + + workspace = BasevLLMParameter( + data=torch.zeros(max_workspace_size, device="cuda", dtype=torch.int), + weight_loader=weight_loader, + ) + + layer.register_parameter("B_24", qweight) + 
layer.register_parameter("B_meta", meta) + layer.register_parameter("s", scales) + layer.register_parameter("workspace", workspace) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + # required by torch.compile + layer.B_24 = Parameter(layer.B_24.data, requires_grad=False) + layer.s = Parameter(layer.s.data, requires_grad=False) + layer.B_meta = Parameter(layer.B_meta.data, requires_grad=False) + layer.workspace = Parameter(layer.workspace.data, requires_grad=False) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + qweight = layer.B_24 + meta = layer.B_meta + scales = layer.s + workspace = layer.workspace + + x_2d = x.view(-1, x.shape[-1]) + + size_m = x_2d.shape[0] + size_k = x_2d.shape[1] + size_n = scales.shape[1] + + output_2d = ops.gptq_marlin_24_gemm( + x_2d, + qweight, + meta, + scales, + workspace, + self.quant_config.quant_type, + size_m, + size_n, + size_k, + ) + + output = output_2d.view(x.shape[:-1] + (output_2d.shape[1],)) + + if bias is not None: + output.add_(bias) # In-place add + + return output diff --git a/model_executor/layers/quantization/hqq_marlin.py b/model_executor/layers/quantization/hqq_marlin.py new file mode 100644 index 0000000..5fb67c3 --- /dev/null +++ b/model_executor/layers/quantization/hqq_marlin.py @@ -0,0 +1,371 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Any, Optional + +import torch + +from vllm import _custom_ops as ops +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import ( + LinearBase, + LinearMethodBase, + UnquantizedLinearMethod, +) +from vllm.model_executor.layers.quantization import QuantizationMethods +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, + QuantizeMethodBase, +) +from vllm.model_executor.layers.quantization.utils.marlin_utils import ( + 
GPTQ_MARLIN_MAX_PARALLEL, + GPTQ_MARLIN_MIN_THREAD_N, + marlin_make_empty_g_idx, + marlin_permute_bias, + marlin_permute_scales, +) +from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( + MarlinWorkspace, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import gptq_pack +from vllm.model_executor.parameter import ( + BasevLLMParameter, + GroupQuantScaleParameter, + PackedvLLMParameter, +) +from vllm.scalar_type import scalar_types + +logger = init_logger(__name__) + + +class HQQMarlinConfig(QuantizationConfig): + """Config class for HQQ Marlin""" + + def __init__( + self, + weight_bits: int, + group_size: int, + skip_modules: list[str] | None = None, + ) -> None: + super().__init__() + assert group_size == 64, "The only supported HQQ group size is currently 64." + assert weight_bits == 4, ( + "The only supported HQQ quantization bitsize is currently 4." + ) + + self.weight_bits = weight_bits + self.group_size = group_size + self.pack_factor = 32 // weight_bits # packed into int32 in GPTQ format + self.quant_type = scalar_types.uint4 + self.skip_modules = skip_modules + + def __repr__(self) -> str: + return ( + f"HQQMarlinConfig(quant_type={self.quant_type}, " + f"group_size={self.group_size})" + ) + + @classmethod + def get_name(cls) -> QuantizationMethods: + return "hqq" + + @classmethod + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.half, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + return 80 + + @classmethod + def get_config_filenames(cls) -> list[str]: + return ["quantize_config.json"] + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "HQQMarlinConfig": + wq_params = config["quant_config"]["weight_quant_params"] + weight_bits = cls.get_from_keys(wq_params, ["nbits"]) + group_size = cls.get_from_keys(wq_params, ["group_size"]) + skip_modules = config["skip_modules"] + return cls(weight_bits, group_size, skip_modules) + + def is_layer_skipped(self, 
prefix: str) -> bool: + # Split the prefix into its dot-separated components + components = prefix.split(".") + + # Check if any of the skip modules exactly matches any component + return self.skip_modules is not None and any( + module_name in components for module_name in self.skip_modules + ) + + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional["QuantizeMethodBase"]: + if isinstance(layer, LinearBase): + if self.is_layer_skipped(prefix): + return UnquantizedLinearMethod() + return HQQMarlinMethod(self) + return None + + +# Empty HQQ parameter, will be ignored during loading +class HQQEmptyParameter(BasevLLMParameter): + def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs): + pass + + def load_row_parallel_weight(self, loaded_weight: torch.Tensor): + pass + + def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs): + pass + + +def error_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None: + raise ValueError("No loader provided for HQQ parameter!") + + +# HQQ packing creates issues with sharding - therefore, prior to loading, we +# repack to GPTQ. We also reshape the weights to their proper GPTQ shape. 
class HQQweightParameter(PackedvLLMParameter):
    """Packed weight parameter that re-packs HQQ 4-bit weights as GPTQ.

    Every loader unpacks the incoming HQQ tensor into one 4-bit value per
    element, reshapes and transposes it, packs it again with ``gptq_pack``
    and only then delegates to the corresponding parent-class loader.
    """

    def __init__(self, packed_factor: int, packed_dim: int, weight_bits: int, **kwargs):
        super().__init__(packed_factor, packed_dim, None, **kwargs)
        self.weight_bits = weight_bits
        # Extent along input_dim scaled by the packing factor, and the extent
        # along output_dim; both are used to reshape checkpoint tensors.
        self.input_shape = self.shape[self.input_dim] * self.packed_factor
        self.output_shape = self.shape[self.output_dim]

    # unpack function from https://github.com/mobiusml/hqq
    def unpack_4bit_u8(self, W_q: torch.Tensor) -> torch.Tensor:  # uint8/2 > uint8
        """Split each packed byte of ``W_q`` into its two 4-bit values.

        The high nibbles fill the first ``W_q.shape[0]`` rows of the result
        and the low nibbles fill the remaining rows.
        """
        assert self.weight_bits == 4, "Unsupported quant bitsize (must be 4)"

        packed_rows = W_q.shape[0]
        unpacked = torch.empty(
            [2 * packed_rows, W_q.shape[1]], dtype=torch.uint8, device=W_q.device
        )
        unpacked[:packed_rows] = (W_q & 0b11110000) >> 4
        unpacked[packed_rows:] = W_q & 0b00001111
        return unpacked

    def _hqq_to_gptq(
        self, loaded_weight: torch.Tensor, rows: int, cols: int
    ) -> torch.Tensor:
        """Unpack, reshape to ``(rows, cols)``, transpose and GPTQ-pack."""
        nibbles = self.unpack_4bit_u8(loaded_weight)
        transposed = nibbles.reshape(rows, cols).transpose(1, 0)
        dim0, dim1 = transposed.shape[0], transposed.shape[1]
        return gptq_pack(transposed, self.weight_bits, dim0, dim1)

    def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs):
        """Repack the shard (input extent fixed) and load it as a merged column."""
        repacked = self._hqq_to_gptq(loaded_weight, -1, self.input_shape)
        super().load_merged_column_weight(repacked, **kwargs)

    def load_row_parallel_weight(self, loaded_weight: torch.Tensor):
        """Repack the shard (output extent fixed) and load it row-parallel."""
        repacked = self._hqq_to_gptq(loaded_weight, self.output_shape, -1)
        super().load_row_parallel_weight(repacked)

    def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs):
        """Repack the shard (input extent fixed) and load it as a QKV weight."""
        repacked = self._hqq_to_gptq(loaded_weight, -1, self.input_shape)
        super().load_qkv_weight(repacked, **kwargs)


# Zero points and scales in HQQ must also be reshaped to correspond to W_q's
# GPTQ
shape (transposed - we transpose them too when processing weights). +class HQQZeroScaleParameter(GroupQuantScaleParameter): + def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs): + loaded_weight = loaded_weight.reshape(-1, self.shape[1]) + super().load_merged_column_weight(loaded_weight, **kwargs) + + def load_row_parallel_weight(self, loaded_weight: torch.Tensor): + loaded_weight = loaded_weight.reshape(self.shape[0], -1) + super().load_row_parallel_weight(loaded_weight) + + def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs): + loaded_weight = loaded_weight.reshape(-1, self.shape[1]) + super().load_qkv_weight(loaded_weight, **kwargs) + + +class HQQMarlinMethod(LinearMethodBase): + """Linear method for HQQ Marlin.""" + + def __init__( + self, + quant_config: HQQMarlinConfig, + ): + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ) -> None: + self.output_size_per_partition = sum(output_partition_sizes) + self.input_size_per_partition = input_size_per_partition + + weight_loader = extra_weight_attrs.get("weight_loader", error_loader) + + self.scales_and_zp_size = ( + input_size_per_partition // self.quant_config.group_size + ) + + qweight = HQQweightParameter( + data=torch.empty( + self.input_size_per_partition // self.quant_config.pack_factor, + self.output_size_per_partition, + dtype=torch.int32, + ), + input_dim=0, + output_dim=1, + packed_dim=0, + packed_factor=self.quant_config.pack_factor, + weight_bits=self.quant_config.weight_bits, + weight_loader=weight_loader, + ) + + zeros = HQQZeroScaleParameter( + data=torch.empty( + self.output_size_per_partition, + self.scales_and_zp_size, + dtype=params_dtype, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + + scales = HQQZeroScaleParameter( + 
data=torch.empty( + self.output_size_per_partition, + self.scales_and_zp_size, + dtype=params_dtype, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + + layer.register_parameter("W_q", qweight) + layer.register_parameter("zero", zeros) + layer.register_parameter("scale", scales) + + # Ignore extra parameters in the HQQ model. + # To be added as needed. + ignore_parameters = ( + "axis", + "channel_wise", + "compute_dtype", + "encoded_state_dict", + "group_size", + "nbits", + "offload_meta", + "optimize", + "packing", + "quant_scale", + "quant_zero", + "round_zero", + "shape", + "stores_quant_config", + "unpack_view_dtype", + "view_as_float", + ) + for name in ignore_parameters: + layer.register_parameter( + name, + HQQEmptyParameter(data=torch.empty(0), weight_loader=weight_loader), + ) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + dev = layer.W_q.device + + # Repack to Marlin + sort_indices = torch.empty(0, dtype=torch.int, device=dev) + marlin_w_q = ops.gptq_marlin_repack( + layer.W_q, + sort_indices, + self.input_size_per_partition, + self.output_size_per_partition, + self.quant_config.weight_bits, + ).to(dev) + marlin_s = marlin_permute_scales( + layer.scale.transpose(1, 0), + self.input_size_per_partition, + self.output_size_per_partition, + self.quant_config.group_size, + ).to(dev) + marlin_zp = marlin_permute_scales( + layer.zero.transpose(1, 0), + self.input_size_per_partition, + self.output_size_per_partition, + self.quant_config.group_size, + ).to(dev) + + layer.g_idx = marlin_make_empty_g_idx(dev) + layer.g_idx_sort_indices = marlin_make_empty_g_idx(dev) + + layer.marlin_qweight = marlin_w_q + layer.marlin_zeros = marlin_zp + layer.marlin_scales = marlin_s + + if hasattr(layer, "bias") and layer.bias is not None: + layer.bias.data = marlin_permute_bias(layer.bias) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + workspace = 
MarlinWorkspace( + self.output_size_per_partition, + GPTQ_MARLIN_MIN_THREAD_N, + GPTQ_MARLIN_MAX_PARALLEL, + ) + + scales = layer.marlin_scales + zeros = layer.marlin_zeros + orig_type = x.dtype + + if orig_type != torch.float16: + x = x.to(torch.float16) + scales = scales.to(torch.float16) + zeros = zeros.to(torch.float16) + + marlin_out = ops.gptq_marlin_gemm( + x, + None, + layer.marlin_qweight, + bias, + scales, + None, + zeros, + layer.g_idx, + layer.g_idx_sort_indices, + workspace.scratch, + scalar_types.uint4, + x.shape[0], + self.output_size_per_partition, + self.input_size_per_partition, + True, # is_k_full + False, # use atomic add + True, # use 32-bit reduce + True, # use float zp + ) + + if orig_type != torch.float16: + marlin_out = marlin_out.to(orig_type) + + return marlin_out diff --git a/model_executor/layers/quantization/inc.py b/model_executor/layers/quantization/inc.py new file mode 100644 index 0000000..4e73637 --- /dev/null +++ b/model_executor/layers/quantization/inc.py @@ -0,0 +1,65 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# Intel Gaudi supports quantization of various modules and functions, +# including, but not limited to `Linear`, `KVCache`, `Matmul` and `Softmax`. +# During model loading, +# INC will patch layers with quantization/dequantization operators. +# Meanwhile, INC will convert original weight to target datatype +# and loading to target device. +# static scaling should be provided through Quant_CONFIG: +# `QUANT_CONFIG` is an environment variable, +# that points to the measurement or quantization JSON config file. +# The measurement configuration file is used during the calibration procedure, +# to collect measurements for a given model. +# The quantization configuration is used during inference. 
+# For more information, please refer to: +# https://docs.habana.ai/en/v1.21.1/PyTorch/vLLM_Inference/vLLM_FP8_Inference.html + +from typing import Any, Optional + +import torch + +from vllm.model_executor.layers.fused_moe.layer import ( + FusedMoE, + UnquantizedFusedMoEMethod, +) +from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod +from vllm.model_executor.layers.quantization import QuantizationMethods +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, + QuantizeMethodBase, +) + + +class INCConfig(QuantizationConfig): + """Config class for FP8 using Intel Neural Compressor.""" + + @classmethod + def get_name(cls) -> QuantizationMethods: + return "inc" + + @classmethod + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.bfloat16] + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "INCConfig": + raise AssertionError + + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional["QuantizeMethodBase"]: + if isinstance(layer, LinearBase): + return UnquantizedLinearMethod() + elif isinstance(layer, FusedMoE): + return UnquantizedFusedMoEMethod(layer.moe_config) + return None + + @classmethod + def get_min_capability(cls) -> int: + raise AssertionError + + @staticmethod + def get_config_filenames() -> list[str]: + return [] diff --git a/model_executor/layers/quantization/input_quant_fp8.py b/model_executor/layers/quantization/input_quant_fp8.py new file mode 100644 index 0000000..7ded8ee --- /dev/null +++ b/model_executor/layers/quantization/input_quant_fp8.py @@ -0,0 +1,171 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch +import torch.nn.functional as F + +from vllm import _custom_ops as ops +from vllm.model_executor.custom_op import CustomOp +from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape +from vllm.platforms import 
current_platform + +# Using the default value (240.0) from pytorch will cause accuracy +# issue on dynamic quantization models. Here use 224.0 for fnuz on ROCm. +_FP8_DTYPE = current_platform.fp8_dtype() +_FP8_FINFO = torch.finfo(_FP8_DTYPE) +_FP8_MAX = 224.0 if current_platform.is_fp8_fnuz() else _FP8_FINFO.max +_FP8_MIN = -224.0 if current_platform.is_fp8_fnuz() else _FP8_FINFO.min +_FP8_MIN_SCALING_FACTOR = 1.0 / (_FP8_MAX * 512.0) + + +@CustomOp.register("quant_fp8") +class QuantFP8(CustomOp): + """ + Quantize input tensor to FP8 (per-tensor, per-token, or per-group). + This CustomOp supports both static and dynamic quantization. + """ + + def __init__( + self, + static: bool, + group_shape: GroupShape, + num_token_padding: int | None = None, + column_major_scales: bool = False, + use_ue8m0: bool | None = None, # for Torch compile + ): + """ + :param static: static or dynamic quantization + :param group_shape: quantization group shape (PER_TOKEN, PER_TENSOR, + or arbitrary block size) + :param num_token_padding: Pad the token dimension of output to this + size + :param column_major_scales: For group quantization, output scales in + column major format + """ + super().__init__() + self.static = static + self.group_shape = group_shape + self.num_token_padding = num_token_padding + self.column_major_scales = column_major_scales + self.use_ue8m0 = use_ue8m0 + + self.is_group_quant = group_shape.is_per_group() + if self.is_group_quant: + assert not static, "Group quantization only supports dynamic mode" + self.group_size = group_shape.col + else: + assert group_shape in {GroupShape.PER_TOKEN, GroupShape.PER_TENSOR} + assert not static or group_shape == GroupShape.PER_TENSOR, ( + "Only per-tensor scales supported for static quantization." 
+ ) + self.use_per_token_if_dynamic = group_shape == GroupShape.PER_TOKEN + + def forward_cuda( + self, + x: torch.Tensor, + scale: torch.Tensor | None = None, + scale_ub: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + if self.is_group_quant: + assert scale is None, "Group quantization is always dynamic" + from vllm.model_executor.layers.quantization.utils import fp8_utils + + return fp8_utils.per_token_group_quant_fp8( + x, + group_size=self.group_size, + column_major_scales=self.column_major_scales, + dtype=_FP8_DTYPE, + use_ue8m0=self.use_ue8m0, + ) + + assert (scale is not None) == self.static + assert scale_ub is None or ( + not self.static + and self.group_shape == GroupShape.PER_TOKEN + and scale_ub.numel() == 1 + ) + return ops.scaled_fp8_quant( + x, + scale, + num_token_padding=self.num_token_padding, + scale_ub=scale_ub, + use_per_token_if_dynamic=self.use_per_token_if_dynamic, + ) + + def forward_native( + self, + x: torch.Tensor, + scale: torch.Tensor | None = None, + scale_ub: torch.Tensor | None = None, + ): + if self.is_group_quant: + assert scale is None, "Group quantization is always dynamic" + return self._quantize_group_native(x) + + assert (scale is not None) == self.static + assert scale_ub is None or ( + not self.static + and self.group_shape == GroupShape.PER_TOKEN + and scale_ub.numel() == 1 + ) + + if scale is None: + if self.group_shape == GroupShape.PER_TOKEN: + x_max, _ = x.abs().max(dim=-1) + x_max = x_max.unsqueeze(-1).to(torch.float32) + if scale_ub is not None: + x_max = x_max.clamp(max=scale_ub) + else: + x_max = x.abs().max().unsqueeze(-1).to(torch.float32) + + scale = (x_max / _FP8_MAX).clamp(min=_FP8_MIN_SCALING_FACTOR) + + # Even for dynamic per-token scales, + # reciprocal performs slightly better than division + out = x.to(torch.float32) * scale.reciprocal() + out = out.clamp(_FP8_MIN, _FP8_MAX).to(_FP8_DTYPE) + + # This currently generates an extra Triton kernel in compilation. 
+ # Fortunately, we don't use padding if compiling. + # TODO(luka): benchmark torch._scaled_mm to hopefully remove padding + # in general. + if self.num_token_padding is not None: + padding = max(self.num_token_padding - out.size(0), 0) + out = F.pad(out, (0, 0, 0, padding), "constant", 0.0) + + return out, scale + + def _quantize_group_native( + self, x: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor]: + orig_shape = x.shape + hidden_dim = x.shape[-1] + num_groups = (hidden_dim + self.group_size - 1) // self.group_size + padded_dim = num_groups * self.group_size + + if padded_dim != hidden_dim: + padding = padded_dim - hidden_dim + x = F.pad(x, (0, padding), mode="constant", value=0.0) + + x_grouped = x.view(-1, num_groups, self.group_size) + absmax = x_grouped.abs().max(dim=-1, keepdim=True)[0].float() + scales_raw = absmax / _FP8_MAX + if self.use_ue8m0: + scales_raw = torch.exp2(torch.ceil(torch.log2(scales_raw))) + scales = (scales_raw).clamp(min=_FP8_MIN_SCALING_FACTOR) + + x_scaled = x_grouped / scales + x_quant = x_scaled.clamp(_FP8_MIN, _FP8_MAX).to(_FP8_DTYPE) + + x_quant = x_quant.view(-1, padded_dim) + if padded_dim != hidden_dim: + x_quant = x_quant[..., :hidden_dim] + x_quant = x_quant.view(orig_shape) + + scales = scales.squeeze(-1) + scales = scales.reshape(orig_shape[:-1] + (num_groups,)) + + if self.column_major_scales: + scales = scales.transpose(-2, -1).contiguous().transpose(-1, -2) + + return x_quant, scales diff --git a/model_executor/layers/quantization/ipex_quant.py b/model_executor/layers/quantization/ipex_quant.py new file mode 100644 index 0000000..5ca9167 --- /dev/null +++ b/model_executor/layers/quantization/ipex_quant.py @@ -0,0 +1,467 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable +from typing import Any, Optional + +import torch +from packaging import version +from torch.nn import Module +from torch.nn.parameter import 
Parameter + +from vllm._ipex_ops import ipex_ops as ops +from vllm.model_executor.layers.fused_moe import ( + FusedMoEMethodBase, + FusedMoeWeightScaleSupported, +) +from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig +from vllm.model_executor.layers.linear import ( + LinearBase, + LinearMethodBase, + UnquantizedLinearMethod, +) +from vllm.model_executor.layers.quantization import ( + QuantizationConfig, + QuantizationMethods, +) +from vllm.model_executor.layers.quantization.awq import AWQLinearMethod +from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8LinearMethod +from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod +from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped +from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform + +MIN_IPEX_VERSION = "2.6.0" + + +class IPEXConfig(QuantizationConfig): + """INT8 quantization config class using IPEX for the CPU/XPU backend, + including AWQ, GPTQ. + """ + + IPEX_QUANT_METHOD_MAP = { + "awq": 1, + "gptq": 0, + } + + def __init__( + self, + method: str, + weight_bits: int, + group_size: int, + modules_to_not_convert: list[str] | None = None, + desc_act: bool | None = None, + lm_head_quantized: bool | None = None, + is_sym: bool | None = None, + ) -> None: + super().__init__() + self.method = method + self.weight_bits = weight_bits + self.group_size = group_size + self.modules_to_not_convert = modules_to_not_convert or [] + self.desc_act = desc_act + self.lm_head_quantized = lm_head_quantized + self.is_sym = is_sym + self.pack_factor = 32 // self.weight_bits + + if self.weight_bits not in [4]: + raise ValueError( + f"IPEX quantization supports weight bits [4], " + f"but got {self.weight_bits}." + ) + + if self.method not in ["awq", "gptq"]: + raise ValueError( + f"IPEX quantization supports [awq, gptq], but got {self.method}." 
+ ) + + def __repr__(self) -> str: + return ( + f"IPEXConfig(method={self.method}," + f"weight_bits={self.weight_bits}, " + f"group_size={self.group_size})" + ) + + @classmethod + def get_name(cls) -> QuantizationMethods: + return "ipex" + + @classmethod + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.bfloat16, torch.float16] + + @classmethod + def get_min_capability(cls) -> int: + return -1 + + @staticmethod + def get_config_filenames() -> list[str]: + return [ + "quant_config.json", + "quantize_config.json", + ] + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "IPEXConfig": + method = cls.get_from_keys(config, ["quant_method"]).lower() + if method == "awq": + weight_bits = cls.get_from_keys(config, ["w_bit", "bits"]) + group_size = cls.get_from_keys(config, ["q_group_size", "group_size"]) + modules_to_not_convert = cls.get_from_keys_or( + config, ["modules_to_not_convert"], None + ) + is_sym = not cls.get_from_keys_or(config, ["zero_point"], default=False) + return cls( + method, + weight_bits, + group_size, + modules_to_not_convert, + False, + False, + is_sym, + ) + # otherwise for gptq + weight_bits = cls.get_from_keys(config, ["bits"]) + group_size = cls.get_from_keys(config, ["group_size"]) + lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) + desc_act = cls.get_from_keys_or(config, ["desc_act"], default=False) + is_sym = cls.get_from_keys_or(config, ["sym"], default=True) + return cls( + method, weight_bits, group_size, [], desc_act, lm_head_quantized, is_sym + ) + + @classmethod + def override_quantization_method( + cls, hf_quant_cfg, user_quant + ) -> QuantizationMethods | None: + if not current_platform.is_cpu() and not current_platform.is_xpu(): + return None + + quant_method = hf_quant_cfg.get("quant_method", "").lower() + + if quant_method in ["awq", "gptq"]: + return cls.get_name() + + return None + + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> 
Optional["LinearMethodBase"]: + if isinstance(layer, LinearBase): + if self.method == "awq": + if is_layer_skipped( + prefix, self.modules_to_not_convert, self.packed_modules_mapping + ): + return UnquantizedLinearMethod() + return IPEXAWQLinearMethod(self) + if self.method == "gptq": + return IPEXGPTQLinearMethod(self) + return None + + +class IPEXGPTQLinearMethod(GPTQLinearMethod): + """GPTQ linear method using IPEX for the CPU/XPU backend.""" + + def __init__(self, quant_config: IPEXConfig): + self.quant_config = quant_config # type: ignore + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + bias = layer.bias if not layer.skip_bias_add else None + + try: + import intel_extension_for_pytorch as ipex + + if version.parse(ipex.__version__) < version.parse(MIN_IPEX_VERSION): + raise ImportError( + "intel_extension_for_pytorch version is " + "wrong. Please install " + f"intel_extension_for_pytorch>={MIN_IPEX_VERSION}." + ) + except ImportError as err: + raise ImportError( + "Please install " + f"intel_extension_for_pytorch>={MIN_IPEX_VERSION} via " + f"`pip install intel_extension_for_pytorch>={MIN_IPEX_VERSION}`" + " to use IPEX-AWQ linear method." + ) from err + # Using the compute dtype (lowp_mode) as INT8 to leverage instructions + # with better performance. + lowp_mode = ipex.quantization.WoqLowpMode.INT8 + # The weight will be de-packed from INT4 to INT8. + weight_dtype = ipex.quantization.WoqWeightDtype.INT4 + # The float activation will be quantized (dynamic, per-token) to INT8. 
+ act_quant_mode = ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK + + assert isinstance(self.quant_config, IPEXConfig) + qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping( + weight_dtype=weight_dtype, + lowp_mode=lowp_mode, + act_quant_mode=act_quant_mode, + group_size=self.quant_config.group_size, + ) + layer.ipex_output_size = layer.qweight.shape[-1] + g_idx = layer.g_idx if self.quant_config.desc_act else None + layer.ipex_qlinear = ( + ipex.llm.quantization.woq_linear.IPEXWeightOnlyQuantizedLinear.from_weight( + layer.qweight, + layer.scales, + layer.qzeros, + layer.qweight.size(0), + layer.ipex_output_size, + qconfig=qconfig, + g_idx=g_idx, + bias=bias, + group_size=self.quant_config.group_size, + quant_method=IPEXConfig.IPEX_QUANT_METHOD_MAP["gptq"], + weight_qscheme="sym" if self.quant_config.is_sym else "asym", + ) + ) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + reshaped_x = x.reshape(-1, x.shape[-1]) + out = layer.ipex_qlinear(reshaped_x) + return out.reshape(x.shape[:-1] + (layer.ipex_output_size,)) + + +class IPEXAWQLinearMethod(AWQLinearMethod): + """AWQ linear method using IPEX for the CPU/XPU backend.""" + + def __init__(self, quant_config: IPEXConfig): + self.quant_config = quant_config # type: ignore + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + super().process_weights_after_loading(layer=layer) + + bias = layer.bias if not layer.skip_bias_add else None + + try: + import intel_extension_for_pytorch as ipex + + if version.parse(ipex.__version__) < version.parse(MIN_IPEX_VERSION): + raise ImportError( + "intel_extension_for_pytorch version is " + "wrong. Please install " + f"intel_extension_for_pytorch>={MIN_IPEX_VERSION}." 
+ ) + except ImportError as err: + raise ImportError( + "Please install " + f"intel_extension_for_pytorch>={MIN_IPEX_VERSION} via " + f"`pip install intel_extension_for_pytorch>={MIN_IPEX_VERSION}`" + " to use IPEX-AWQ linear method." + ) from err + + # Using the compute dtype (lowp_mode) as INT8 to leverage instructions + # with better performance. + lowp_mode = ipex.quantization.WoqLowpMode.INT8 + # The weight will be de-packed from INT4 to INT8. + weight_dtype = ipex.quantization.WoqWeightDtype.INT4 + # The float activation will be quantized (dynamic, per-token) to INT8. + act_quant_mode = ipex.quantization.WoqActQuantMode.PER_BATCH + + assert isinstance(self.quant_config, IPEXConfig) + qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping( + weight_dtype=weight_dtype, + lowp_mode=lowp_mode, + act_quant_mode=act_quant_mode, + group_size=self.quant_config.group_size, + ) + + layer.ipex_output_size = layer.qweight.size(1) * self.quant_config.pack_factor + layer.ipex_qlinear = ( + ipex.llm.quantization.woq_linear.IPEXWeightOnlyQuantizedLinear.from_weight( + layer.qweight, + layer.scales, + layer.qzeros, + layer.qweight.size(0), + layer.ipex_output_size, + qconfig=qconfig, + bias=bias, + group_size=self.quant_config.group_size, + quant_method=IPEXConfig.IPEX_QUANT_METHOD_MAP["awq"], # type: ignore + weight_qscheme="sym" if self.quant_config.is_sym else "asym", + ) + ) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + reshaped_x = x.reshape(-1, x.shape[-1]) + out = layer.ipex_qlinear(reshaped_x) + return out.reshape(x.shape[:-1] + (layer.ipex_output_size,)) + + +class XPUFp8LinearMethod(Fp8LinearMethod): + def __init__(self, quant_config: Fp8Config): + super().__init__(quant_config) + + def process_weights_after_loading(self, layer: Module) -> None: + # If checkpoint not serialized fp8, quantize the weights. 
+ if not self.quant_config.is_checkpoint_fp8_serialized: + qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None) + # Update the layer with the new values. + layer.weight = Parameter(qweight, requires_grad=False) + layer.weight_scale = Parameter(weight_scale, requires_grad=False) + layer.input_scale = None + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + weight = layer.weight.data + weight_scale = layer.weight_scale.data + output = torch.ops.torch_ipex.fp8_gemm_w8a16( + x, weight, True, weight_scale, bias + ) + return output + + +class XPUFp8MoEMethod(FusedMoEMethodBase): + def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module): + super().__init__(layer.moe_config) + self.quant_config = quant_config + + def create_weights( + self, + layer: Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + layer.intermediate_size_per_partition = intermediate_size_per_partition + layer.hidden_size = hidden_size + layer.num_experts = num_experts + layer.orig_dtype = params_dtype + layer.weight_block_size = None + # WEIGHTS + w13_weight = torch.nn.Parameter( + torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + w2_weight = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + # Allocate 2 scales for w1 and w3 respectively. + # They will be combined to a single scale after weight loading. 
+ w13_weight_scale = torch.nn.Parameter( + torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False + ) + w2_weight_scale = torch.nn.Parameter( + torch.ones(num_experts, dtype=torch.float32), requires_grad=False + ) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value} + ) + # INPUT_SCALES + layer.w13_input_scale = None + layer.w2_input_scale = None + + def process_weights_after_loading(self, layer: Module) -> None: + if not self.quant_config.is_checkpoint_fp8_serialized: + fp8_dtype = current_platform.fp8_dtype() + w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype) + w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype) + + # Re-initialize w13_scale because we directly quantize + # merged w13 weights and generate a single scaling factor. + layer.w13_weight_scale = torch.nn.Parameter( + torch.ones( + layer.local_num_experts, + dtype=torch.float32, + device=w13_weight.device, + ), + requires_grad=False, + ) + for expert in range(layer.local_num_experts): + w13_weight[expert, :, :], layer.w13_weight_scale[expert] = ( + ops.scaled_fp8_quant(layer.w13_weight.data[expert, :, :]) + ) + w2_weight[expert, :, :], layer.w2_weight_scale[expert] = ( + ops.scaled_fp8_quant(layer.w2_weight.data[expert, :, :]) + ) + layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False) + layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) + import intel_extension_for_pytorch as ipex + + ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts + layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE( + layer.w13_weight, + layer.w2_weight, + w1_scale_inv=layer.w13_weight_scale, + w2_scale_inv=layer.w2_weight_scale, + a1_scale_inv=layer.w13_input_scale, + a2_scale_inv=layer.w2_input_scale, + use_prepack=True, + experts_start_id=ep_rank_start, + ) + + def 
get_fused_moe_quant_config( + self, layer: torch.nn.Module + ) -> FusedMoEQuantConfig | None: + return None + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + ) -> torch.Tensor: + return layer.ipex_fusion( + x, + use_grouped_topk, + top_k, + router_logits, + renormalize, + topk_group, + num_expert_group, + custom_routing_function=custom_routing_function, + ) diff --git a/model_executor/layers/quantization/kernels/__init__.py b/model_executor/layers/quantization/kernels/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/model_executor/layers/quantization/kernels/__pycache__/__init__.cpython-312.pyc b/model_executor/layers/quantization/kernels/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..938551aafffc5982cafa083dc875b03fa01253d0 GIT binary patch literal 192 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJV#p{>k7U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?E|R?rWO_J z7nUaGm1I^WmSh6eWv3S9rREgt$H!;pWtPOp>lIYq;;;c~EX_%^D`ExO#|Xs5AjU^# LMn=XWW*`dyq_#7! 
literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py b/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py new file mode 100644 index 0000000..7aeb1f8 --- /dev/null +++ b/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py @@ -0,0 +1,94 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from abc import ABC, abstractmethod +from collections.abc import Callable +from dataclasses import dataclass + +import torch + +from vllm.model_executor.layers.quantization.utils import replace_parameter +from vllm.scalar_type import ScalarType + + +@dataclass +class MPLinearLayerConfig: + full_weight_shape: tuple[int, int] # [in, out] + partition_weight_shape: tuple[int, int] + weight_type: ScalarType + act_type: torch.dtype + group_size: int + zero_points: bool + has_g_idx: bool + out_type: torch.dtype | None = None + + +class MPLinearKernel(ABC): + @classmethod + @abstractmethod + def get_min_capability(cls) -> int: + raise NotImplementedError + + @classmethod + @abstractmethod + def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]: + raise NotImplementedError + + def __init__( + self, + c: MPLinearLayerConfig, + w_q_param_name: str, + w_s_param_name: str, + w_zp_param_name: str | None = None, + w_gidx_param_name: str | None = None, + ) -> None: + assert self.can_implement(c) + self.config = c + self.w_q_name = w_q_param_name + self.w_s_name = w_s_param_name + if c.zero_points: + assert w_zp_param_name is not None + if c.has_g_idx: + assert w_gidx_param_name is not None + self.w_zp_name = w_zp_param_name + self.w_gidx_name = w_gidx_param_name + + @abstractmethod + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + raise NotImplementedError + + @abstractmethod + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = 
None, + ) -> torch.Tensor: + raise NotImplementedError + + def _transform_param( + self, layer: torch.nn.Module, name: str | None, fn: Callable + ) -> None: + if name is not None and getattr(layer, name, None) is not None: + old_param = getattr(layer, name) + new_param = fn(old_param) + # replace the parameter with torch.nn.Parameter for TorchDynamo + # compatibility + replace_parameter( + layer, name, torch.nn.Parameter(new_param.data, requires_grad=False) + ) + + def _get_weight_params( + self, layer: torch.nn.Module + ) -> tuple[ + torch.Tensor, # w_q + torch.Tensor, # w_s + torch.Tensor | None, # w_zp, + torch.Tensor | None, # w_gidx + ]: + return ( + getattr(layer, self.w_q_name), + getattr(layer, self.w_s_name), + getattr(layer, self.w_zp_name or "", None), + getattr(layer, self.w_gidx_name or "", None), + ) diff --git a/model_executor/layers/quantization/kernels/mixed_precision/__init__.py b/model_executor/layers/quantization/kernels/mixed_precision/__init__.py new file mode 100644 index 0000000..171d16f --- /dev/null +++ b/model_executor/layers/quantization/kernels/mixed_precision/__init__.py @@ -0,0 +1,105 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import vllm.envs as envs +from vllm.model_executor.layers.quantization.kernels.mixed_precision.allspark import ( # noqa: E501 + AllSparkLinearKernel, +) +from vllm.model_executor.layers.quantization.kernels.mixed_precision.bitblas import ( # noqa: E501 + BitBLASLinearKernel, +) +from vllm.model_executor.layers.quantization.kernels.mixed_precision.conch import ( # noqa: E501 + ConchLinearKernel, +) +from vllm.model_executor.layers.quantization.kernels.mixed_precision.cutlass import ( # noqa: E501 + CutlassW4A8LinearKernel, +) +from vllm.model_executor.layers.quantization.kernels.mixed_precision.dynamic_4bit import ( # noqa: E501 + Dynamic4bitLinearKernel, +) +from vllm.model_executor.layers.quantization.kernels.mixed_precision.exllama import 
( # noqa: E501 + ExllamaLinearKernel, +) +from vllm.model_executor.layers.quantization.kernels.mixed_precision.machete import ( # noqa: E501 + MacheteLinearKernel, +) +from vllm.model_executor.layers.quantization.kernels.mixed_precision.marlin import ( # noqa: E501 + MarlinLinearKernel, +) +from vllm.model_executor.layers.quantization.kernels.mixed_precision.MPLinearKernel import ( # noqa: E501 + MPLinearKernel, + MPLinearLayerConfig, +) +from vllm.platforms import current_platform + +# in priority/performance order (when available) +_POSSIBLE_KERNELS: list[type[MPLinearKernel]] = [ + MarlinLinearKernel, + CutlassW4A8LinearKernel, + MacheteLinearKernel, + AllSparkLinearKernel, + Dynamic4bitLinearKernel, + BitBLASLinearKernel, + ConchLinearKernel, + ExllamaLinearKernel, +] + + +def choose_mp_linear_kernel( + config: MPLinearLayerConfig, compute_capability: int | None = None +) -> type[MPLinearKernel]: + """ + Choose an MPLinearKernel that can implement the given config for the given + compute capability. Attempts to choose the best kernel in terms of + performance. + + Args: + config (MPLinearLayerConfig): Description of the linear layer to be + implemented. + compute_capability (Optional[int], optional): The compute capability of + the target device, if None uses `current_platform` to get + the compute capability. Defaults to None. + + Raises: + ValueError: If no kernel can implement the given config. + + Returns: + type[MPLinearKernel]: Chosen kernel. 
+ """ + if compute_capability is None: + if current_platform is None: + raise ValueError("Cannot determine compute capability") + _cc = current_platform.get_device_capability() + if _cc is not None: + compute_capability = _cc[0] * 10 + _cc[1] + + failure_reasons = [] + for kernel in _POSSIBLE_KERNELS: + if kernel.__name__ in envs.VLLM_DISABLED_KERNELS: + failure_reasons.append( + f" {kernel.__name__} disabled by environment variable" + ) + continue + if ( + compute_capability is not None + and kernel.get_min_capability() > compute_capability + ): + failure_reasons.append( + f"{kernel.__name__} requires capability " + f"{kernel.get_min_capability()}, current compute " + f" capability is {compute_capability}" + ) + continue + + can_implement, failure_reason = kernel.can_implement(config) + if can_implement: + return kernel + else: + failure_reasons.append( + f" {kernel.__name__} cannot implement due to: {failure_reason}" + ) + + raise ValueError( + "Failed to find a kernel that can implement the " + "WNA16 linear layer. 
Reasons: \n" + "\n".join(failure_reasons) + ) diff --git a/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/MPLinearKernel.cpython-312.pyc b/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/MPLinearKernel.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d13d79ee21eb8726c719701d97d9cc83b976dbd GIT binary patch literal 4534 zcmb6dTWl29_0H_f?##aJwT;0x*x&$Lj5lcr5HOGwNXpAe3u)>8jK({6&6wE-cV>%i zZG@9j38$3|U$)gs@kc&D^uv5n^(*aHr2eeqRBT6ygp~H9e@jdhsb4+k?!&vb6P0WG z+;i@E+%xB%b07YDDC8&5Ea`Wq8X)9rB$%oBqcf<-byQpku&Q9xR>JQ*n| zaol6c8E?|damn%}eZZG3C8H))KIXOjnLsii5b71l+r;qQCWi7*tj_VUHdQjnc`ESy zRCwsA4ktrXQatbFml>o~CM(^R6}}S^EUYc|u82Fi28-k5CVlsFW7B z!#I@4p3UYQ9bK|DjVbU&uI%Jk>Y6JX+zDwcm$O(q zFdmRKfaJf0gp^qqA}99Gom1dHv}1BQ#-|V~P9}#NfCay0gvcgk&@ zt5RnQFmb90oR$oE3!Q*URAO`(sO1F>AN(nGO`~DM5DY0DH2k;aWDtA^ z(9lG0)x~Bwv9p<~;#^ZQg6nf9qf~-9J73632xi>9h4t|FjPoytIBcnvR*cAPxe>Sa z7xE<9h)x@g@vl>i_S;gj+Do{3!P>{b!%o;!yWDVVqPt#3 zodV0Igr?{sj@#lIX8t^J8mte62g2wB!HtdPoO2l{>kQ30)Husn4!2lO%Cc=#?q`?^ zZaWmKmNBzhO3&+Kre!))(*v#Z4{sLf0o64uz8Uh|OCq~V$^fquS4fSfYR3m~3Hbpx z*&g7tK>%@)?L|Bez((!@qEIASL#`+S z)d)_}0~A{1ttd?3KD(hYsVL3VEKTq`L8}1pKC)6ul)Z5x)a&Ugvc#sJKaN|Z!9oU{-JbNb3^8b z&Wq>i1CbpDH`!5ixH(Vz@+_C4wp}~HZB0+Zg+{Y-x?yI=r-wFt_zy~RSilExdC60ti%&_`$E7^_mchqCx6+$`3BIa_TqH_a%`)y-4-ydS7;$DyJwq zji5|4Y5*pywF$M3t@lNFt9mkoKZ*4@>plr&ivPqtqi@0f8DjKC!DJNfT1+>f9DJ}l z>W;%0)*HjU+!eD~SADmBuXQE7_{P0WtMCSjzKNo0&Vm~bin?l+-mIdVYAC9~wK{93 zb1YMpnBKJ^r8Wwj1>?2`0BXuhg3(f-=M&$3`M$G!WwrlUssF7nyLTHPFR|Kp*dy`uN1+iPd2I%V2B^>g3|d&wCR0f4htIdO$%?HpsLkd2R4b{@ec1b8py4)MK*^&^R32mIMs zj=lwFaYrS9RFL%UTIyIl zRzbZ=nE1nM5ybX_Tinm(*B&2zboBA5N2jZW-RUx&l3ufyT$>u|XFbvRsbOv~oGtsCV!Hvgvqe;)J4YBgc|(Q<31xynfO0eq7E z61ccC*>3=-cmzTCP9ehJR|I|sz9zwclM7#weXu5C;5$(f_E!i1h`AW3Ao;xOj3D^$ LUjCL~&tvdExVrk> literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/__init__.cpython-312.pyc 
b/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7450b5e90e2b44829ee5ba6602fcb5d590a3f1a9 GIT binary patch literal 3766 zcmb_eOKcm*8J^{m6yM^*lJ!W`(IbWE$fO;}u_73jWj$QMdZbqC08HJ@YIh{9yboq} zDOds(I6+aU51`J;0g?dCEk+JaZ$1{e_Ci(%u?qt=(35WrWVb+1{b!d;(URgG)B%_K z&;QSU^S|bw|F7|Q1i?`Kzf-P@Sm2 z&s+_S#N7$(;LzmW%FwahvELAkk_eLkjj*6zYbWu~J(G|ryBFfC%Q&mwDkFpm; zX`NW)2s^zXQdQAk;`;2w!n>^;JLApg#VtZX*HvW=j7dr*N{Ma>MOCy`4XW8G;oYU> z<+lGddmxHjc-@^nieCJ0kF^|a6jgAwvX!?!C(fj%n zE*ld~P%DCAOsyM+NwBD6FPq&{iLG_f!V>Tmt*8LE|l7m%tR+%Czq>07+(6o}sxu)Rw;l#sB z4yvfWujtmD5o|Q3*>U_cu<5w@O^etrFqq|*NY{vkWwNPAWCSa#c+t=aE}6u{9~>?$arN76Ma*mhVJC#rbrbg*zUlTd{|N_xbQId;L8!*aAlK(29cVX`!L&8=%z%c+8uS zIQLQg918VR+WMbFdn?hwC+XKJuV3FwPgP=5PZE9miP36ebT9GtE?l@W{VjRz zY4hVu_rr-#Co1W|%HUWn)4iV=sb)s@GPz1RUq{@9f!e@mrThN!&e^P-W#5-hThnXKlteH(Jlb^?ZP4UONw26{88oVVkNos!qM(clo@*d7K(P% zeO&llZREnA<{r&`YwV58e$R8^8{7fUM}u|52Z9ISQ7RVe+McM;Q`zOx) z<-q3Lz~)*5n`;C%cOnQ}*f&lK2d&=6Joe0K(cqSm)+sbZ{|C(= z=ULxMStDi7pH?&3ZFL3K>r!?aPwzV7w#bxqb*D$u+|HVY8QSb=&DETSoU%*7UOX+> z%QqE`0w`t8s-19cQw|{?C3{?Sc|={s4tJU%dxSaW}Z{`FFTrE%u|j^S(6pc=XQ& ThkhCe@8+t3?&kqm;AsC}h__Nd literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/allspark.cpython-312.pyc b/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/allspark.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a86a137a4e544735f554aa6907241a821f62cdaf GIT binary patch literal 5581 zcmb_gO>7&-6`tkqa=8>mOSG*&i?ZazbgUoSRU{{p9mRi%EvdHUBn=T{vF5Jim6kuA zUD}dBCTZb=lopVEl5&cCX&lHv9H2m)qCirhhXB2BAq8XyE?}T83Ijc`k&6Jmv~QMN zl9K5l0lFY(-n@D9=I_mW-^`zVJ`TaBI)1I3sYmE5fc(g4T=FQCsi>j3rP`+Ff$M3OgQnztD_?m!ll7_meRh&ztbm1`rBZLNZW9LVexGdsx zGLFkCX|T$p;+%|6B;u3GRJ1s?V(KXsHaXC+jARsHBowj9c8QT4lQzkA!?9Kpi8z64 zmt9cfmRZ>&Ip*C6P1>5Ee$pX1t0XzeC3~lLml{@O)6G?d5-Bl~1KPW9I3jf(Bzb>Q 
zTNe7+Q$wfc8GLihFoT+{CRvHQ;f&PJIl???98uK?x;tAI)P-d|h2v4W-qjzqmrVyG z%LpvSHvafL(4QhO5_?qDG%6nYPoh_kC3}-fvRfDNl&p8prUs4<&J4)Cs&ctEIj3Jq z#Cxx)YOFVwkYrVmXXR*0PvBnIWf^O|S5snKSLQ`s0l67-hqd0AG7EN-#Bx;8DBm)w zp5z?a0i6gjB`!q8qUh z^FINxjM5A}ncF|i%rQ_?-L518H8$Nk)wH#1ds_tOncCMg>Kvsy_UU%e!t^9tYk>Llp33Sqjn#1|Fgf)Y!ratx+6zt@^b zwYm3fk=aF6T!I8F!pkqc{Nc;_hH**_e>j|PIQPhZB+vVDe3!v@tzIzr=btJqjV_J? zb7^dGjAmyS&u)5M^;`kDJRA- z|1vgcQ^U$KcD-yvwd`ivw_Ow1q;0zHY3^ESd>KdDF%1!H`NVS0^eaYhxAt$!pIh?X9U7rR~#)D{U=mtz<{pMMF^N4 zTw;sIiKl{N9g?Vvgn^ZTq3Oz0Dv{ENxef)Df;SBfOV#seiyfgM`#4RLKgX5Ld+*@l%c#71a`~CiQXo z`Zf{DsTN@bjRu8K^;e9nZKX9UDbNIQ60QbiEQW+qWL*$p!k2K>va(&(|-VTRgYGJaM6&Ez6NxkzBCf2=?Eb{6hU)T@RjE z;PMS^xrQS~!;u9xA8gMB_Zh)`cWg%Rz@130d)RVDUT+!9vb!HNcdYKa)A(82r)}%a0}K8~{-$j6 zfsKR18~$Ti_E^?`>;c=B-P3cw;~OUX=3C#gzW|Aq-i;TAHvGd`b~x)F27#5t{lM2( zzhz&iJ#>6L_|aglrQ2xf&hxD~zQf=Nu1O^&6r7Txh@u4XlTb=0Ycp z(8=}CX;2FykDA**-v815T=PMr`QRNP*F9o%k7O^rnQcC}-u%{r9|ri+aPH`Yade_Q zxYgsi&HXJVq?gG~VLMzwG^ zAds5)2oMOGE~9Q!pP8f?D-yi}ePvu3cZSWdfO6;=-u6ih@XPVIQYkU?V3wn+=NVFP zb8n_trv^foiuXyz#5|l{#$C0rR@!5tpdZv!8-=BvrF#Wn6gpzP5_xMPFhC-%HE(gA z8E@MD%K$_Hiifv-X}h(C3<8N7E0-?Ed$?`5#~=R}KJyAt@luz(%_=Eiuoj|iyt{WbLd6zqUNP0%wvj0ivDni zKm$SP_Fy_hjkqDe&_@X3G&4-^?f4iZq@nDQuMdi#kw$>xNT^Mjj+r<_NfMxsS18AH z%&$@Eb(+wNNU>i#ged}vhED^JaHTFKVU~F4qegK6u3}i!=)LV}U+WW-Np;Thw9mN3 zY>{~b^k{#9vrdNHsyo6^TDQkoj+IioKg4O>Q$pIoZq#3Iejag5AIo+2alZJ z6l^#MfuNEqQ(i9r=o5gKvB24=%m8_}+@L z?%$tf_nVF*%#4QvgcZz?EC_@X1b9?PL4*?sF9`6Ipi+bYC2v#}H4UDxt|TO4htP}Z z6$Ddj2fUDi3nF-+L_#HYO~;r%w%~4>3{U~rn5h(>F$&|Z+e>{*K z8Z}-V`;Ib84S*_ig`KCFd;MPwem?lc@z0MJ;3j{X`C8vJ^TN9f+86$pXOA=d(BHT; zyEwb_{^I)uCzPQlKCm>jI8<=c5{sZ-T~t>t8Fd{6+MvK8&cD>U*!si^rQ(F>T*7{H z6#%lpdkv<}xOhCeX}M?X`%O>r{idh-{iY|SE2?Hf7&7#e+VqqUU{Qs6?95k!wpx13 z15Mv1D_9IuziV!Aaa*=-6=8>pmU0ZoK#tyLHR>!1c7|a#ImGz>jynH}&V6sQGq&$p Y#7r>%K*yiBM1~2k9Q!w-xw%gN1DGiC`Tzg` literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/bitblas.cpython-312.pyc 
b/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/bitblas.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..db087b4f1740a582511b4c0fc9d70ab739882f48 GIT binary patch literal 13151 zcmbt4X>1!;dNaJq8Qzpc>ZGm1q9sed9p6VRIgTaykeoDZuoIdyk}30$ha(>fnRo*) zmdYNscF{GB#Wo7MK(x&Q_4Yur-5>QX_D75M2glTc%+x`=+Z09qF_D8TiURHT&2UIb z9=i?tB))ma_ul*F&HL{6&vv_wfD|`;Cl(tdh=0S1nvB)L%AZ5w4#5&E87IQ{)Wyk! zF03Q5tdHvxhOj{{8{)=J!nOnzrV@0RPT0ftgd^-oIK$3_ zE9^?R!|sG9>;YIaYl(Xk-mo{}3;ReyM{owN;eArUB;2SYP7tj1Cc)Y`@B8{{YtwZy zM16*pu@EVluW)=imP$gAiX~$L6HiS|aXeH#XJ;cxAvPBgp!@k$axyjrt!|#1iASOw zGZWz>2~Ge|s}wkPa^l#?(Q)SVnTeC5C&NR`@X*nT^XG=fr8c!|{QQ|Sr_W6c9cNCQ zJAM8PGY(+mQq$&z4rW3Ndb zwI`k8g=$-C^dc9%#MF9bXJ()?&7?0zW;m(y@+`++U*F5I%w>E-Mz}u1r6uodawYqiGu+*fPI8Q%8Hpvi2!D#>lU!W#s`Zh`b&gl|E2?a{ zk~^xEm3umZ_%@snjtG-15!Rh2IDOc_897q`KsYl?9)KsS3tKoV)bw)A5Vmm?)Qni; z=;`L_Y^xru3EEp%HB^hN8QR-5+AYu?)M!WejT-INm@(|(ywKbGWG_ce>ZTEPA{3Im$6^8!`I=Kt1I5>feL}Ji zjU7EUGQ>=rA3HgALeeD#$v%2?V)Xn7GjjBm)8{87^JF{~5q4~^7Pjq{EEnV^C>df& z;W^2nvV!9iubdegACnv^StlYwVm2<3QOR+YxvX#?CJ9^!>*>{X=T&BIW_5%6Dl-Ml zV|9~HomSxjQJKR<;WX9~mXT|&?1Svj$U8*Flp&Z}w*Yjkq(q%BYnD(S|5CI^MY%-~FHW{VKvr#NBDwb?y;pS!e&8;r*;49;8^E~b)$ zSK{%+U?RnGafZ9bMQ4Q+KNv@5l^&FFtFrpROEN=C4<=&QfJe^oTr`FfVz9bj12fn8 z5G+2|s&OC&YA*y3=^n`bc!Q{Th_-Dx!?Gpt&|;rAixz*Gw&&i+zp?aY;mFx9^*XC< z1t1B@b^$(XmGzp^(MTNlvgxwScB2M_s3sFUm$JqTIgR_eE=^PG8lbAqHEn;gFJqimnfsXiO(#Rp;*Hu zVBY*d#yXAUx-JPwy>gmH>Rp#=JNb;$m9-`R3sy18tDMN#0;A@#~U~0ETG&y653pXK)(X&$^`U zN&QUJ_E+P}WK7eW*G&M`XO4I#O+S-PZOLX~i@F+XnFhmQeacYk+UgU&Ezm;7hO`i} zj`7G4K>c=2aY>Gk#F=O+0h~FVo@-Ram?9m6v2<`OmE^&+ zwB)K*1U`~XPp0^Uq>s+bN`~kx8{wS@Zc^m6WRp>3P^?KhAtf2*1$b1CCDR0#1S3H* zCT9~f*Ci9nU5Q0G$@WqtKFba9Jan0G@`wS2_2p_=ygqw|8RKb+3!qNjD+1+ zu@N5s9sh#AtI417hPqj(X_rUYuOI zRP^pEc@K!*110YX(R-rkJ(;tVyF)o+xgk($=oK4!OAUi!!(gdlpV+W3XDd?;`Q`^y zXW7?!=hVWfpBmq@ziTi021>rYqHk}}w|{=9Y;Vf%ePHh@d;JA(SAjlK=s8k^+}l-d z-uO=6-M*i4?H%EVS`Tp@=zA84K%nkiu!w=5h?t1$a 
z&b0r5z2{R`!yS5o{>ax=_6Ogw-L;kcTSWhsqJOaD-y!;U6#ctO{{5nVf6;%i?C)GL z>KhtY^hUSmE1K~57Cg64+!q7GnTR(3i z?4HN1@C^%XdrNIc#I_^%n~QBDMbBt~9)09&%$pXC1!s4`(#<2QkxV=%%<{=78mp_5 ztTmy6bcX;pfNt!wVJQC$Ig36TPp1()s2o-gsF6$}mSm!lnaG7$JSJS9>(M%67h#=+p8^sC^n3zmMk-(m2sc2(n)dYh46y1kaWst9ym`?ys&A*YvLt zSIH1PHX+Nvx$RZ%8LT?4P~ctVQ^~1;;F&lVNprziGA%^n@n9A3(81xkbLt=fM!8o7 z;n=4Q3SPJ#6-AXeFFYGc2TlIq4?rt-myw<+M|&q6HCm*xdaE& zc0+YbRy!gF)`0LZWBLp*Ghjd+odrv4#Xwm@|4KK1>TJA~ozLc9F12kH+qM>++y3^% z2cy3jedyW*40o5~3qt2|JO)v;v7#dzdLQ;4E^Ind(UT3w$Q8Z5$x|WpZcoJwK%e^v zSJxv~1Hid=e(K#^@boQhs~C0efr`=Sb5sbU-SOB#H0}Fv!#`~K<(6XO3puJx(YFrH zA6%qFst*KE+0&lC{7&|6HlL~x%jbh<&Fa) zb)YsZb2pPu!murSKJ{)bc;uybRP+WMW`Lz>!qpE}c8Ts1>8{1IBE3nY)*;dzi=;?z zguYuN^CK%(qss=UTWx=Qyn}GHd_j=bP`R`B0UZRqtfBu*BN^X3>O8I^eygL7yY#$=94GFTr8_w)GAn)MOF3Nr>sy0s$t)RI}!)j6rEtav8{EA}I+R+H_eo=xoE03S#+w%Pjlk;xo(rZ1ncTXXooizv)a>_6P0ZEj z)9PKl0_=qbv*#(HO~suxsqgp}xegoTAnTIWdUb5z4Um1TOCWH9>zbents6C34O$jB z4=P8Jf)N^4tGfEDaC)<*tQkBHHLLS!=Qm?gN0G!j=*=29nd{IQ%QyY&MriC3*tS*W z2;GX;iI8~=n6={Al-;04B|>7Ay`R`3n=xyYKqNvuYAZ-H=+m4h#|a)cfIkBn1U5*9 zXQ7lv0uA0!ltIgZcqDOwjU1YLUW@t8@Tn-+qE*J9W+Iai7lKGqga!FIa3G!n^D})2 z3d$sZgVQ7Zfp75F=VkqHt}6@@D5yxFC&g%W(NLpg26Zzc2;e$J7qI+}LogG67&A20 z_$X!^WRmqF#E;}H;ZentTujZLR2;#QNA6|hNK!1xf_0dd%xPr>9_fjH1qXV^QUb_K zMgDNCA~;8|`vuI>9eV_VasS)Ek1w77t5<*a>WbO0-zHl2V8PK<+;U)_rCG&H|~Aocl7a!o}haa`^^?hPI3`2;ov}0 zsac^`q2;)1TJfAnh6}MsI*O#ECRFf{NvwPyGKg3Z(`wJErz~(Z1TZ*mwOz_JAy}V8 zc0c24>ryoaVg#q3DPw*g)c{Dyuw*U+`d}l}d9*PUdX?QM0dQzxgc)AArXea0t&l|_ zo+~)uDrVO(yAGLT=aP{NagGsYQBjvnpoPF*;jRej)d7;KN$Y>w8ruGsr$PS{ES(;M z3?ds2BG6XyZx;QVbM~^kq2%rq-JL)a=#dGE&S1&eFFN~khO&h&S$afEPnm8k(@kX> zLLdR4oHFf!{7a|BYE$gw)v63?_42p;5zybNFbCWc(Ed&teE^1JtsVV7+iY3^kqnigGj>p$Oc}?|srRecW?e9Q z%@VF}L&lXcsHz*Ngtff@5lq)yv-GGOo_tO^?CUkzX?DDyMeK8zWL zkofOGCUr3A3rg`!ln35bC*&0Pcq8DKm7T*OwTVVp(V+ zZ{u%aRzr_}TLxBmMwOYQG0s+#kZM5|l>F<^HMd1e96ps3eGEg=z{!B_zw{B-&?A3q z$=@gX`<6yT|6ZW~y8J6QwdUw@Lo@g!8#a_0`o)I+wMD9_M0JT&*OKW0HBfHp$c>Z( 
zZMhfA{?`0Ne)x7GccScU0jKT*=f-l|#@n`XYsaD?e*n>OHoV{_RXfWqSrSHPCDS2hwIou&3;V*4@RRr!OkMu#nD_?@Li;Z(?( z)Pxd9=2U-J%c=AU9R_8={F3iIWsM1A7VbL%9UIkvP1cM7qFT)Yp_!UDO`X0TZe>ZC zwmkraVSsfhW3GpPfJd69Yk}4}V1`r;6X2898(COg54^*|8h&n6)obX{@#7G*(lE+c z)6dQ74DuRUAynWyD!-6nw0O!IT!H_Q;F+`jM4z<@HUHamOC4{ul(DT^-8zn*K^$p= zk<|Y(vKC|loa!T~uRpTJ^>mU+;d)Xa0{00R7A}NE1txhV#>OIyau)$i3ODivHX=kW zfPW5DZ{ecUFcnW-fH;=Q8^FyBcesWD*rW{Okt2MDPEBx58-&!%r8#$%T!W{p3?(?% z76jQ3fvQN74W=Vkpd_S%>PiE1fuO>Q*dTH|;GGDLHc4=4J2VqV&z%^WkQuM!P;TWgqeJ84M^6mF<&y*$Z4ib_qd-&HBR&-t`2%R3+o8!K zpUvl8zy#8EnNfaXIrxd?#oq_F6@xnpmbQ-q+lzsnImaU^SfaW`s=G*STH5x20$0_B z61`KTca~k*>xBuDY-(TE40*eZ(!MVRIWcKx}GchLQ98Wc5vcZh^F2p z)8e(l;7GwY3aYoe0~~+uraROEl^swy?0pTQMUZAIj%;M_f)BCSnL|Ue_8B0S+Ib(Z@6<};Y9xV{JGmBg|35z&cmYbFobbk?%TWW z99%fK*!^DryZr^{Aebig&@NJBiQ4*r+FCX0^Tv1RyYw>MU3Pcf_9+3}MYiDXLldXr z*6Z`H=l2%tVaGL>v}#BDMZolFmZ;Xz25Clu6=4u+c5VF=zlA}|x#)nhH%cHrjUga8 zs#AB-)Z#j`5D?O0VNcy$Inu*L*pQy@!%q)a^w7%$v6B>3!F=5e4?cn!9u371Z5TT} z!JLE}z(b=$V-qTeeIkyr25ap&#u|`A_@TGKDi}!h-g;yHjgqrRboLY+o51zodzysY z4!NGNwu8$BE&y!YQ*=C6uskO-8=!`1ewKJl{ z=%y>M=YRYbQEuOJW8`+%?es#ZXxLM3Zoe^lduP$m4r)4-b`%Y5k1Y0E_Idm5>*du2 z$GjtN!xr1T?ergudaTVFQ#m z^_6@3%li&}?xc;jM^t;o2v2ZD1zIa+EP;aGTCrluMl`qOg}coa3Tre0HiGZ&zKR`d z4x(e*y`5tFo-c8lN*7G~xpkMZ=aDaPXJlcdVuUhG>Tbl#3l%fgEY(_G5M7-s2wX{$ zgvkvdE~9602LywRo)x{?0E4ls&uA;OZ>tdSyvKcb{C?*@8h*XGa9~8-J^BSURtG?- z@*3G?93t=S`rx@=JomxjUmUIwSidi<$j{0M>4Eq4HC2r8gxMPd6*HDVP`BQ>v~a0n z#hQ(9dMgx`Xu{@Lu|w&L-ffUgk{!nO$}y6HcXax0U7Npl>&^K$D@LfpWZ<`h7+S@Q zH49;L-fEq1U9m##t7fBduS_lS5eWI^<1AV6qD}vJv zeC3 ztOcp8%2q1@{iqkE733>H^n<7$$!FTHNd4Irre2K_$yJad{AdRJJP{AzPT8e;HE+bL`650BVb641d7^3$)Q;+?i~+T? 
zj8}W0Zm*8Y*c2D3#||8j-Ge}qJ+gO-jWpO4AEmJ9G?M*uNUp)&hmN9Dq%r9T)jp$x z5My~J6zJ-Z3|tjW&?MUq|0g+_U&HB@o!GaP z3AZw!%*Y(byXLq^tp~}z*OGO^JX;uA_Pm5{uNii*rPQQb_RjH<`Xm?fTLC!q=;d`r zaT07Mh#sXjTid9ota1PrQIZL3Gae||YLFrMt=H7VHYJeF;P0_`u6qhCxRWXyPU7L2)yP1o|ok4^9CR(WrtO9~8 zCOQ&jDOPH9tM;D%0mvdsGt=ciYkIRHag8AzGP5<;$B-$ioFAh z+R#jAnPBVXw&nu8e3f&(UbF1N@tix~3}9Mp2{1+qO>W zcGqJP#f<4aCBG09!(OYR8>XbHSl%bZRSYRgC{YB(6s{?f@B#Q^g|?NZ4+Ke<1(PHN zX-ZP`t`Ivu65=i2b^B=S97cR#5X)KljcmjI0?!8e3Jz4$Qt+at_Dm?-xHIeDNhJpt2A6gm%J~khG;GgYy_de5 z&gdZ7bBuY~7|u4lK}8R27Jce#x_#-fFPQc1E^ut1Yct16T|1#+WbtHv~JS zkw}loZfP`zLfinPHVm;C0){9} z!gVjIF-cbRsop^~2G_fBq<5VjT3i?h0o+92GzK%$QrCf{#=fk-FWcDnjlb)D*JoG1 z{KeP%pZJIK%`J=X-g$Sqc~7o+&*SFq`D4F3m2YgCKV!SpI#bbY*`_M`(*I*BFjVQN ze|$r+WG5_;?4n=~1$zO!^r(#=>8Oq4bd~`CuR7`{{-duxXmGQ7v1&rQhO^w@5uy(D z9mNbA3vm`_V0DrRFeBMVK`#J<0{|JWl;VP~^qExnHJ$7bouTMW&somX{M-+q;76XC zWp^;=4rcKE@lW6X<2vLEeM%Fcvp@*H)8I1!@|36L_o!n9M9?w&KIZAl>wb6GY(&Lg?*lc`DP8ei+;8 zEl!eUG2|r&C`F)vdRLLb)(y+ON@!q`w*gp;XgQ17+n^St!BT9cJS@fIYO)V$1H{3eAa5lp&`GYHU&GR*O zxP_RJP{4AFcH$64%PWdd>L*kTxL*{(Bq~iVF!DuJ$uQs@c{wIq4n;RfD{PI>bFz3d z0i_Z|Z7il*4#Om5N0A&O(aRRA>y~ra)=K&)-8qhRBSy$cs-TGiTAJ8rOsMzTfND4X z0$>j1`T7FO@D2C-o+Id19mpB52?eK(weua>?fVM|`bYT7lV9!pn)~bC?BKcFo5Rm3 zu_ORkVX&4z!94oe7XzOUd~x*iqXk6AUzw|RU+6}=L;v!$@+~WMO}AziW^UbBxKZF? 
z3={bSv^p!e=*W%Wzq+7i#&fk@1u6iQoVV^)=R)TTAB>9op<4+xS+!6{LU9f+s@{aD zsCE%*IiV1TtC`RgM}`4dLHf?>UH{JNwR1duy057h>TSj~4|YOho6rPVvPJ@|({J&# z8?apVXkx(f*yD-)2L_7Oah)hiI=mk2*P9`V#S+;*|MfPp+=E4*I6@*Y3C$6P2B6?z z80Oz@#O(eSwS9;D-=hoPqR{u~D4Z4_Sms)CTuYv-UxT#exHg;8k>ffF9=@B+qF^Sm kig<>FyP|vVXMOjTEb7eQ$0+>5d6r>XGPOS-+S^C^KTnoVC;$Ke literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/cutlass.cpython-312.pyc b/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/cutlass.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a2b2b5c6530f38edf87f56b78e816d3e2a1bafac GIT binary patch literal 6374 zcmb^#S!^4}b(UOGJVj9w9m|%KReVGn%aRWzmK52J;`pd7$&%$H0Fz*`;x6T-mxpJU zwn$K^S{SI5HkI?yT1FDs1q#%Gjl%gT;-CeR{Imi3BU%nfENoOjQxyJVtrQLVlfK!5 zq#Vjkj{!ORX6C&&*SqHZ(eL*lC^G$H>DLVi{h2hZ;xK{LHvzbe1SC*03X`u*rsB4+ zog%PZrsIyVLx*YE8Ltc1>99j~#ob|d+!OZbZKv#w`@%jQu9N-o`fxqKF2OAa;(>4= z-Vkn}kPRJ0g6ARI~V zLnvx%f$=CUIIA$dLY?S~Z88a~KC#xSh6%zfu8VXy;6{S)KcN%cFbDH(eu8J6eto^L z`kGWxm*BnV2nXlTG!<@~K&)ST;!s+Zc}01(cc`!Wgd^nOtwJO+0RL41s-IDpky;*| zx98B?G(tH_LOI(nsh?65QY{Gv$yJ>elbFkbD&L#78#berLtL3EgIN3M5x8}yqVlQ~ z(Y#Ycz&J&hFsJ5F(jZWpi{qq(q;edF`4J}MntSZl~*L1`NZ@{xi{|##22hDYa7t98ujlo!-~qcXprHBTJJ# zsf>Cindmt$%kiFgQV?ZMoE9T#HHmvbrbVpu=yOV0ST*3C)n!%biA&R>z@@MlkrdJo z6lC7+R0cP}#93yYJLtAH*a%n(3)R=N=$^|zUzc}nT->_kVy|t#<6^&T>9`*H-SNWU zSblKqZp-)_{dWB-AS2C*MKz5RYwnT^5Di}?d%lc7$8Dnz!T^7OK>6$uSy9O4GMGrV z)(B_7#iaxn;ZyviBui>0`^HJP@%IR z!DFu23*#HD-pcoNIL{f|j6G))JXtsm#gVfc5DvnuzS>mpqW5j)oHcA+)d|+Hdir_@ zZ$I#^LH(}H;GL;w1EZ|%NVfgZ%cqBjCMK9bhMTLnZ<0)`6Ohs6kWR&UO#dID^rod#dgm(8D+UkiE2K*V*DfY(qvJ4+l zrSrNTD}<_RD5AndNi2w1N=z|{q*}GU$qcN|SUFh{Az;HIPBN*alu(t&V;{--bZ5#C zkH>Uod)de1cRhMKFh%c!S@UR^ljP`3gDGW2X7`>xMp7z^&2B%z@Ckt#W#Zt_nMsim zr1M~yCS{R7HPWe>N_GNk#6GG$u{=0w$r0* z022k%>Ldk%2Z7g`0T9IVQbeq>Cz=b6S|`#RU`ZoqG`~J&A4e`Cn$X;5c!itdBwhbj?k%$eA_6VY7OLN%AQL6*i0Fu_j| 
zPF^s1QIQAhDJq(eAX_cVSC|IUAK??66i>-w9O9Vlw)Je|8Z=@Dm2aS$MPCMXFaCFgQQBxc82?z1TFkvUOWw>tG&!u8ukHrPqtz4U0{8ysg)K*OfcW9-U*~?O>sQ zJl{XQ)OTvBVWQ-)6K;0k<{{kl#a?Z{(tgMLw2puB)1f~c|Lpju`3k zIBJwy@f2*ah9islEt<;La-=a5*F93oV~?S;oNIO5mO~Iy(xaGc43l-RcFip%z-n-U z6xTe41u@_hA2|y_5%jYz%>(8q0kK{pB4To4m~2wB!%IVB%wUNV6A_5GIIvutX@(*C zQ+p>hTHsY)qg2fkNhVZjDxFL#A5a=KT^+~xRm>+8^1|RQaObka>#1o9CnF++god$D zI6eyR7Y;0-07k!iP)@?zL^;sChT92Pta2JE2+o??7Wum!d+s*vUGnW+YTA3x*Kw`m z`nlWVpYQ&{H*znuu@LIdhx%`Ke6s!H?Vogh-1*yQ3Iiwe11FaU#tQ=z`GJY$ftQy< zugneoNG*M z93j<;h^mCeoEITAXmN>^qav#Zo)f56yjF|Pw}sAz`+yPdhl=&!=LuXEDtv%6Uw}%p zfwSo(EfQGcLDIFs5;h{89C?R|fq>PifIH(>cO<(v97DF2@zw*v2 zh0ykVX!~-gvk=;m5AC>lxX^Vl-*s>~^a2qfUEq51p`KgRN6rtO%b|U11xpK>UNLe8 z+?c-MHL^x5ts!dwT!vejKp#+}_(^De>rh;SG9RB5_ygIa|D$$Uj9!Cjbay+8J{!^% z0Srn5S5ZDaouIiuqqzj68VIE55cqNuAme9Ab%<0)Ene`b3?~!>% zpqfPwY1$on=ySM3r3U2o7hG+5SKFd^?ewqS_{AHyM((;^EV_J;?_Ju~_wkM|Tu1cH z;}?OgMo&t-5;6S&bRa}xEZtfDTsH`yI(qloYKDgXvhr2Na+IZ1uLI8+$Bc8PZl(^x z_39Z+^ch#q7AyOo3J-yT{_<0*x}T&?EblhsUIz=j+$NNxYk4lgNW4&4;4|Za9n|nh zId^#n;CN=-GhVgq(`r4!a_*cX=Y;ucnH_J=1;)`dh%>&N{RhFEFBYoo$$}s}Aj$y+ zOsKEcspLEshVB&~d;`VsicXEf*o{@REkD%n)3pz3qX3B+aqXh5-l53dProi zTNTK<7~WqYH%N4_T#Q76pHHRajOBONkE7S1y#c(G-$ONvNeBGFAtM&PJ)YlsaB0g>zIkZQy|Veqs|T(exW09Ha}Q);=IFWD zy>e_f+z&RJ?izJ?|5=Ay5WEOU{* z>uXyHgcjbMe{)e<4s+3e90$p!H25M;^Kl%cm}Fx_&mL)B zJ&6U0)-y>#vqMaTJAeZw4jCR)X?U4{%}6F?&90~z_mVN<_B2}}p*cr%#`q-ZaGn+u zN)qEK((s$fv4|+k9HeS7QDxl+zMx+iCW#NdB7R)?4OCxWL|@tLj#Cso1f0Q=jdCDaW?zPp6rdrQ3i(x+QKr$2jUX>d6I+{i=HX%YZb3R8`aqtvZu zKkECi@1uhs9$ZDFUD}1Vv43-Ka4oplf4}5_Hq77)k~dAMjwN1!ywEbvoNR_|G|q)4={-zLrJ*7FNV|J&y06cD4?=$w5k6vGi1#c4rN z6|tZvg5|u@KR>o0TH5x1RK`|-g{#Z1mTnUzxDCLfFWr+s~StO$X zC7}Qy9V&@mRF`x~fLHBQZ%<1$jaaCiEEYR>`qy z-PaOgR98}>t^nUv^XRqih%zBdLLAFMMI-G}Nppdy-W5+84KR@qQKCXnjEiH6s_4no zo(-1Q^*XN?G}nl+e2AWV>X(2(1{E9*(u67}E0pF2j6-V{=_{KPUd>fzuG}V#N>XVm zO!jPDN~185xGr6%uvelbruLYR(Wi8XzOZ`k8aKN#w{H3cGC`PIHw{#;b|Qq^V4k|y z`}F!sTezV@Rm%_sT2e_1jP+GpE0n|%OM$-TN=vm&!HOmJ>S36<9ad?r&Gj+(m^aCW 
ze*35F|8*aBInb&#w^TK*P~puLs#=D(ROt#eNhe)N*AAqwc!dsxTfvjkl4I2je1*SS z!m6B~G9fxhUqTnq4K%@ALN_QsH|(beQcdqh)g<44?jo;fyq1W^V_27^fz+mp(Ulr+ z8V*lfC{M=xV1w(=$D({6j__E%mQb*)@dIj1)DP_E#h|WSH_cANRuh^O};9 z`Oef~KQG0AJsQ(_`MOLLl9cO;B=h>X%!6kilQAC)@k%tF(D{%of(rrmPJ{BN?B|lB zF#rHQ)mIf#)|?l>m<3aLAtLILgj%v{{!CQUMOEcRJ}Tb;+1J54fbIL~(P6(A6KgfB ziY5f(3Bz?mR<4ZeCVm)A7cQPV_wM=8QzwOyi+#hR!sw-Qr!E*iV)no+h@gm~%R!T=X* zuDuqcu20voGM3QgQy9mv;Sz-MY6QX0V1R9e8;K^Apb;O#^E@UtXt;x7R8S&uRgM5; zrgm;{rd?~;nqZ{12mI5A=!Iun-m@#`+4Uz+XQ8pD;Nqr_PaU6Q^8C>pe6FK~#+HRG z2j=^p3_Tv2Ke4d6f1$B|f$M+4ZOL;zIj(1sJMj5WpWJ$U>nS%-sP|@_GtL+FO7Kfn4U z_Bi&`b*f0)-3>2%t=Tg(XR^aH!+GD)obTuz>YF-LzxiLwZsco!1;ZPj`&tXlZJ#9a z+xF$Q?OSN;DI#kBX$tZt$XQEE!z*O*HWV4){Z=D>?)UzVlOFVyXWPjh=Bs@aXfuA5=7Ge{aWu?!IS<#Y((VMC>Rw&6T(W|th!Zkel&xQeGYPA3F+N3FA1!tq% zzH?nQ5T&}VRi{(~WopKp+bgTDmT60xt-Z=OpUeiQX5P6+<)~%SrthfCw9yS;wTg}k zZ?z2XuF@4MZIf)jwpY%s^2VT$?qA^`)>Pv*N~p@6M1IHcOA;EH5$a1~XfKHcnz;7z z-;f3%XmwX0P`H9c$=_sHgXW{cumMzv;IR$+4dI#qF)}!LQIZTAfPox`!6uZbeuQ*u zW;fnOTASg`qbnqCT>}<)E9tacSL7RpWlRw@!#-w0OV1(9Ll-6|XMArJbi~ErRTBiu zaxn2F1|5wWj&qe~m0{CLO8V`FMU&MKCfiv;5jPP4@p1{W49?URUSy=vu#&fytQpQR zH3rj|5Juh5$Zi4bn5l$4TjMwulr^m^MiaykJm&-eMM;TXNgdn>i{3cw5KN;bO>8P~ z_0wmk&St1*E!*bUIsL)p{mI3agBf?BWoN#{pKI|y+`icIwpst>*u(7)J06Za?3f$R zwRF$-?Cx3VT4~<{(d*-+2T6>?mdJ9e8%U+tf^sEGOAf&l2 zGXR{;Te6pDF3&a0jVw0p&I}YhzUkCdYQeL!u(kbO;BMgF2X{Ys2BHy4|_G6Tz22+4#I8}94=CbK4 z{`WM_zs>DtC9r!q*O?PpUP~;+%9EW8> z*e@B5SV9*{q8V_2h85}%(`#hs%N}ASJ{99}HCgcx8^@<>*ggO`+Gmh`h>9F?o7S44 z3ZDAu{DG2ZfBcaMba{_t6s01nm z0TYNcEC3O`m_!(cE2xT^27klGW0Ju@J0?FIFxgLo)e{ii12Dv5s=;VF#=A(Ld3xU` z4AvoY65MZAM&+m$!+4PJ5EL{uGyOGokt)eZyY?H%Zll6he|~FMZfn=VmhJ^h>vOig zNK>qL?!YSq#S(+8Ub6ul9HH9S&1Rqs<-GjG$uD<&ZTang;H)ZhMgo0AK0!OmCUmvh0FdX<1}3aNAAD-AjHMb*=qn)K&Ur)MY*%%fF1e z61t*lCB&>V$5j3^YUX3F^l+>4*CEjbb8{yRGRS9Ihr$o~&?V%aKElz%q*Pee*{J^umOt+y5c literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/exllama.cpython-312.pyc 
b/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/exllama.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b31dc7d7ec08b498e61a4cb964791a9b0e4168a GIT binary patch literal 7644 zcmb_hZEO=sn(l6Y+wC}sZCFSGodgohM=~M65QZJdK!BM^vM>a=%kHYT<8B9s_6OB% zNGy9MJ3<;~ccp-Is~GLB$Vj)EAdrUnHSGPl-Cu`vf83Ld9JM$p(%tFAKi`DCJMEvl zx2oN-6B=|QZON*x?|Q51t>=B-@<03iUJ8P2|7CpW2Ndd3CFmDr0udZ;Tm@(+~aPNcgUWEcianUr|6P>3E#Lc!H%;u#Zc!d z(S3&!J(BN{&6sE0KSM{nFK}iwLhC+0no-nLf={It$Z}~RdV{~25t3>=D~bFqLC#1D zA5W?&9Q7ZLFnZH_f+F1-9v-tOw2furMQ`GpS@OJ{dKnHDXUSvwjRlSfC_ooEE8Z zMzT%0>zf&AC?%V{xn`0XcSRjbE6P->IBP~($W$uHWoE{cjVJD@&h z7aesi8$_oRnBq2%i*r*Qua?0qt~>VeW)CI$zE3~V4Rdt8IvyOMbvcn2q_?b%xkc|C z$9QPQ9%1$1(6lTI31QRTJCUSjP%?^-raoY8S+_&9K!Lt6qc2WE`XL2JYKzK>f}3-H zN%pFVbZiTE@qUK&Z!xPr?iL+YONjNS}$@QHYmj|ypFA}+_(nQWKE zs(M~+w+#eV@Ku|o)&foIkf3xrIvQngsv4Z|CF@=zYL=s?%$m9{;-I{~SG7*0vC2Rl z-n1>jsJrntMIm~Ic};&C?l0P*7R#|ESSflIq{u>Bk*<#HfW8*0ud?P~Qw?US_O14H zTF;xKrW$K)&D1R#?Gjz0+q%zh0B@}O{f7FTJ;q#q<1WFN&E`C{Z{0dOU}mp%E11~_ zHLpDZsAqrYz!J>(BMqYnA7g|$myHibRSrohBuXfroaCYzgruY@&v40<$|;$2I)zk8 zmPI1TK-0)p%N#xKxbHf^nIP2~oH`+`j(9TrZc*B>0KAszf*? 
z1x*B%yDf21A<0EkAbk`|PdZ^4vl2>i=@cNig4Y|Vu7XP?p;dK#3Kx@8g4%ZsL{{Us z$fZ=yW!X#E7;qTCxqaEb$Q!n#wowG`@m8GRvg~=3%A`33>xb(L4H3u#Gj zie+SZbCIvw086HOurOrvbT=%PSSQ^9TN}NuyQ;gOhc92bc=1OcjSZdUFI^cN9plIT z_TtcG-8+ew!{ZSm3~GXtL}7>&tVxI$_VS4l?Zn74K|68ro0f|e#t~|$G*GO+?CaEgor{B- zuWPB}3H`PID}VW|^BVm2pVxdBo(+EE8>zLPKQVXWx8VnE_uIblb*%=6@LWHf`{AOk z%nfMp^9`&uhF6<+6>mPs-_I9Q6^icZTMKS02M=n&gQey}6$jJQTXEpoDTm(_VVCBE zE}E}>aqx@b&xbYNTTfu#qZ?!8?&pfh3dI}>t%btnP`4K9UZSSQ8bW1qoAFOSgm^^Vt5Vinn=3(xWW4C#d=3F!d@>tL< z)Gm7NPqZ0 z;FH031w#GG^R6wsYqbuS*^{OU-U3~qVm6Vzdhy>W3hMCR&evo^M7`;Qp=y7qw;M5FM&60Cs z25e6XOzuQx63h;0Z_Qj2oVZ|$FNnM=H>C93&VD2I# z(vNK>g3?fCWn-rNAn}%9qJExH17?+##mIrbF=w;l)KeSCZuY0B+Fi>5OODtLj%?gA zo9-d33@fYd1qUCfD^4SfkP)Bm1%EB6r~+sMKoNkZ@kw|k&|Tn@395<^-hJIJ z3aX%cqbYc2n9QUyif)4k2%Viwt2cqfWnwW|LaopeVS^HJB8&{^Fz)J{=5aT|y3ih6 zVpk+Nrr>coj+>pGt&*^M6}p+d@K;jCn+nXM%s})%v()ooTTxhPKfDs`FR}flVE<}x zdpX#t1v^W@-V)mj@$*niacJ>KxwBvE>|YKYDGdD^+rHSobo1$@XMNwYBdg(U3s>)5 zEr+|baMyBpU*X(yHc)0;HMX^Qb@9s5;Flv0N6H7!Xa~=%uxD3;I|?I2zc*<#-fp7N z{{V?!)Bit>RyUx*&_vL~s=Xk3i1%|#nA*_YT7{{HIf}3L1}~VJ^3__HsX5w;CxMl! 
zLK*0rqh8+_bOgGhqY%L@?#3S%I);m|;6#tm08X925jKHn7ev6yiL%sKxs^gFe>H_r z@-V6LK15au@!UM@Z`o6?9QEL@&aDPR3rFr9DGCp!?oZu4vK)+**a$h`$PV2JiVkmK zx(^#X-tgaa&uu=NCU*)Qhe>t!ZT_Z_-3M8ONgBd9Nv@CX#@^CITu=-gw`0#px8nlc zK_2w<;Al#H-}F@^aR?#sA2H?9U5asKbtakAJsYom2tQQljePBXkyk$i&l^D6r9@ht zLA^K)k|{*8h!4FB1?09OET!%wUKCt-{7z{4&bq@e?0vX}j|^m35_Agp9>ftjRjgb; z`x#tz5>6ZUz3q*bCENGC??0YT`7J2tUuLOQwsHQ$xep6;#Yr_b&F6oaUv1l4IQy&7 z=i%1kmBq7PjC?+_96nqQ4`|_m!Z|qUa`=!IKC~ogVep2&9{y@r3!jFHmexAIt}wh7 zXq?Z_<;#J+T43)=p!;P5wPV)>-?{t2 zf%^wmTiRcG96OrtIiSz>{T0R*=GJYF?aknsH8sCvsqoP!SDp_3{I~g#M;5 z^qdW@Hiqx23m@P6`0q1cOn*LI?jF#(2cGVImML`)EO%ddX=A{vbAw}vebDdhHvh4T zm-082*>(*SdGAuF92w9e1EqI{0lL<9cNNaAu&r34efRg3cOKMs9(>*yUhv)X70*6) zeCc`UdD^woI9R%LwPK@M4y{v6OGDv9u;2dB-J#-^z)xS8Ch|^x9VoMmVWDgk6abw!e4;%iq`uI;Jp~$;@`m` zE?`uGf~_HM&oNUQ$W^NqX(+EBty*Qg{bNJ@C;*g7DmiFK`IMW z&Cw<*{sd&myYj%e^Ip-G_vJjZt`Fdy(8S$O>7198IBF&82KrhdZqAW&E6N0K9jNzW84VgkPM?c8=Gc zsmME8l?~TFyk$F#=cIU2l%_GSw`emK@uK%&0^J2nnmhWf zH&bjbUV3ot{5GmL(uAIMjzIdki(cR(2!SbGy+Mbh7FKK&Dl{yEt&|tx{ z*3nh&7|=QZ9+o>!Ywq0zTjBEEJw+EF!j4@9|5_ke3hXYq&z0KGEkhjGy&Bk73Urj* z7fO3BEJGaVc-|CRXuH=2=y|vEu3Egj*u8Y*N!uUkfAasyUp{?JJAG~4#x#2iPK-H? zC11-b+l=Kd+E>`N)j)Xu)L5XKC63e3RU z9hk{fO4e-(kk(FIhdm)Qj2k#d$du>`$&5)!@Qsj!s~d5Ii8VPzgdc`TgdYl(XAs?? 
zR@)-wwmz+`@5#w>|NC11`=!JkSrIW+jv61g^VYLIKD#P?y+Hr<{a`fxtUmgGY)K{n0DN;}wW2n86 z|Mcv1MAibK`RTdo`A_CPsW>1Hldw(mC+AL9TsY^ZpkAJni(^_tdj)rZbN2@3TjyF| z`XFa460a}PP~{f*pB{vv)VNOrIN!7RpPwG%pP!!ke|~y0YFt)~BA%oT`q5_9pIlh{ zpJR;#>W!pf-9L@w=n;lha-PO1Y(5#?ebP{ycTon~VMtQ2ZdGhFO|N??+W+rV=bssd fX8zkr(U<9ep(6i5oqFlK$k35u&z~v$C0p=+x1{D4 literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/machete.cpython-312.pyc b/model_executor/layers/quantization/kernels/mixed_precision/__pycache__/machete.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f1a185bd7e496a0bece6040c58356a0340ce029e GIT binary patch literal 7850 zcmb_hdu&r@mcRG<`fVq197BO5bxI1DM?wgN5J({qUWH_NbX&>W(VZLL8*sRO04R=R4o)oZmU;^S}809tx7g{(by!S5nlMSTU2`C_KLmg&9hq1X`jJ zbd;vC%t%a>fv-)nCD^E4Yhxux!WnfYTv1oT9d&DMcFB|QM!gAN)Th-Ql0VT9ZAdgm z8@0MqYDxs60Vul!x73_yjy5NP(I8DR)L}~ST%rW8*nG>TuO-@&Wg@<3Sc%Z8GtCnv z&PyXu_Hwa|tfUfLDlJEBYHRGQ7(2%$c=#$JCuh>>6j5YOKFg;?b?rn(B-!P?qQIRY zsZ5%a<2h0OZlZ|{pHw&{n-=8=qlVKwOr*8JFo8SIOBqq-;z=ciS?@OWwM_E+Xn;>( zKQD{tj~^dC!4p0KizaF)Es{hAmJj>n6P(0nQyGPW%{Ij{gov<+w8Sf8DUx_bRksV_M z<`R3$Ac@+c)h0Ti$0@o*x4`C{6g9@Ig8ng9u-8#~1c&Gyf5jMB_ZeSZS22n(i}Mm2 zZE(X)|Cn)t3+8D5Hb24r9qrnBVfB`y##|t4_GnX z(^=(gD%pKrk`mpClpso+I4Q<5N{V#DrHe%FHsp&};y`y!yIi?D5uXHsOA|2`m$BWD z_O5hRZH76xL_EpG_%uHnm*Pq`x6Wc$UEij|`;<}lFH%n%1J9vAsdQ{4#$d@vaPgG3 z?^bA?p_J;GjMq~c#HIkrbe688#3Ngl1m>2(321G3dfc&$0IDPm?XGXP3oI@i`ZPdW z+OPMW+X=1{KP z5WR3JDP_Y38jAu{!w3FyXppqPAf(%7(h+eY6DOh^j-?W5fWUA~j>D<3@UE@l4M$RG zz{d!|xo~WJFs6iwm?DBm;>j~%10!L+WJ;-g8ByQW}eBE}lqBVgk%quEPqMmXNd|2QPrbMe2ovY6_Pc*B2YtKW>ba zgZ*DMzFKPBUToYxf9i2#UpYAP)a$#l`||GDp3<7$;+o$1!Qz@7_d81c!^QsL&-h~h ziO<)Zs4(`HhDrnF^IsX59(Wu+@L=%K@WbK7@TsEr)YGO=xw(CI;-kX#!fdKS(Oo@X z1$WF(+$-EIECvrQG#{$inZOacVn-&F-EVN(Gri~WtNS1HJnDbgzxe7%(K}M*l)s*z zO#-JaJ9SREySc^S{)OfP$mw7mrxy)W)8?|zU-ESpeVubBi@x?aYV#fmg?rSz}qqAJVo9Bz7}qcpgBQ;EeHGs=G+FfQpy^Wz0zuytlX7p z$8FX;jrtu-*&sN`PRPLdB0Yxo25Ksy3jPr 
z!FP@9H0f*Kysd7eeSBT5$1IWdmvDr$Sf281jboO|QW4k45~m79HweGycniOzXQ({J z8`R%1)3okwo~CZnBLwXuxV-eF%A%jFST@| z19zK~QoI0$qic_pit&>CCKPlg$6+QJ1@u2JQgi%o=^NW`IDb8^sh?(jo}t)9XtD`D zOLl_drtRc&K(1dh^^Q`WQVePfxS@N$claVCq~M=KDG1DXzM9C1CE(N)fHsI9_#m%(0;6MsXbN z#`Aa~mVex_ z?QyVo!PmPG?0w?vnCrMPasT9JJzw~S%Z*J}-oO0*leWDN=tqu+j`F_2NB)QY^0DJD z>`eOzebrNOP^&xVhJM>IceuD}^Hs-_RlDx5EBEjIbj$Czln)*PveifEtMo4%;7F`k zRSIn^hBn^lxwGTujyruf`)>7@Htj2J+PAoAe`(WDansP^rbFdzz4tcW-H4wpcej)W z-uyKB`{)a&ZN*V~YVhZWDlV!eG<8J7gC**Z%Al$LB`Ex9$`7iKa0rP;#je^fy$$p6 zs^Zt$OvM0!HAX{ZgaXUf2`o5gTVV+4fefSpf+MO<$ojCl2fJuZQ1m#~@ZRb#PyVxs zfh@vwGEV$5Dfq&-w;WnA^Uk$*N}+Yd(7MIY2E2rFu*K>MstCLT5!m*pU4p>WBXG_2 zfeS&kTb2MttU^$x9%+@TyEDy#OJU{S7}mjR25<=Mc_s(q1p!(WXKTyIGgg>uI!rk3 zWk?G`V^|wQXzEdpbi*{H7qT<@e3efl-sQ4`5=k@xLTFN*@W#L^3Q=7bIGIbrr7;=rQ3E0ai5lufFqh69{WC#PE`1JWl0Sg# z`?2WBs@>(Lz>MRXW0wBNf89Ud@xsQeZkT!tkR`Z!_Vu}zPuf0iTWszwHE%CAZ=X8w zbj9k~)7Qr5-uh(t(q8tgS4!~`Aw9`Oe^Di&WMvW4A6$cj|!MFsPXFS(D|B%7p@V0m-c{91tK2U5LD7S=4Et`uin{S*fwsg;T6kB@d zCq8fK`zpNtwym_GuehP_zVbLcbd~*u{~NZ^9s2q;)*bo}pW7X(yh6GCC0AR~)ix{6 zz5U5gKK{x4@MG8hvdd@H3cDG&JOH~Z`8tYV9bdoEa;N=f`~CJWd`Bua%C|~)O*Q+@ zj>dU8R#j*?7=$dm_2-`fB%*UY4wdDpz(RY~cGPSrX{_H==_3JE*|f2WfE`p}wzh=4+uQ-n%vwP& zVe`(5EJnEkRd9ZfBkRuFfr~{nnRn(L&|?uvf-x~prQprm{yLENj^lwWOAvR#{a^yi z@3lJLyvL-_Ec>(ng`~d-IaHcgZKK=-c>~6){!!zP4Sc$h)dsUFYtE+XtJdK)3SN`m z!ZFPpz6=|$KN+jB;y8c&-jgcu9bbsdXwZpRqW=~x`J~vnbQ19Gx|75Xq@50)_V$0x^+jx@#2tvS_ zYgr7gn;ODK)a2mG1pp-<;L_Ts!Ie`(Uj&|;O-+SS;H}9Tu;PX|}f_F{X*Np3)Wgq+6o;HTAd@%jNY<#hC!-8vrjsy|dU?iZr zI1Q>fPW5mc{Lhe)L@4_>{P#*iy9_mNOyXr3{(m}~637UQBN#prd>mKpN(Nr+RCY9# zl2n_l5b`?qVR)=E$)xHS)_9V4u)*=Rm;}2-5M8ubU0(`@aRhx9EkoY&?;yKGmF-Ox zhPF4)^?psk=edn?1hfW#Tr9n2_bjycR4Dk)i}!~fto@9AxN%|6@#2o*ud&e>0Hw+a zdb|B7J^#kNzPo+*_TJt5oWgnq{(E>m@&|XjJ@T}%<;vvrkBW`^qt@)z(BjlGRc?%CnD>%R67k8|H>hV zy1GK_#(LM%zjnIxzjnIn|Jv!wC~-;dGSx?`X)meT>#F_$yX(jC(%5r4Cxm`Ia%{6= zqNzFl^H_@m)CP^v{GUv6zT(LDZO67G%U9wsP8pgrk}2`>%+QutvQ-Or zD`ktd)ooYuriGQZ*vO3mJBtP4EEb4^T_6Fre~_UAatAJIv{@|tpIW)d?*6m;eQ#z+ 
zN*;T+Ku6-6H}Ck~^}XMBy!Wq8r;UOqYWT~@#=HS9`8G!DV!nWn#7)uqSVN#8));7nZ!>F&HpQ9(O|cb$6(rvpeJR!) zXpXf6TF5sYZH=uA>u4&*1lmZ7ExIb!9%v_Vd$c3AIo2A{xM*1=ID zgYs8($T~ScTce#fWMW;{41x8mk9BL8U_CFaU3<+K*r211QmpqH#ny36AL!+_1D)fv zzy1k+4ESlu9t@`hF%b(Ul7inMHTUm7-FIvtc;?*T;K|cx4<8C1J$>@rVDQYbz~M7e zV78>P(Tr4R9(O`rflbWhCt|s_VAsGsD!8mtS@+w(Lj*q28E_g8#5+vWUL?IlC za)RVhk_G66Lb+B+N^^W7m`p_Cq9CohoZ|TLWxeLu;N?^(E(XQ%B#^SF;?RZBU^o#K z{5r{VAS7^C`uh3@LwqR4i5xHa)E*@K@KoGM0=v4U{sbG>mS zGH;$>dU|9=913CSPG;aOhHxJva&g!+%y0kVhfqvS?^Tk!ZG_N%epuK+V%W6G9y=JI z6u#8Ggid0JAsC?Drra+iqH@*_W*`;2$ngwRL{4L1Zz(M>T$GF92-k7l!`+?CR=?7n z(}WkRdL)mVW2<_kjsWSZ9m3JcJ$S;vEb~4HZqt&9<*r1++>`%-hsQ;If`Q*A8D%sx zl3_|F*z9rGAFCi!GF*uOLRnLB0T6{tbCOO>NCudRUdgPWgJkA8Lcnk5o1qThf)(uq z4&+skETgasaj!{qT%t!MLo6hWN+#h_WC-@0OPOeHEE$TkP{^f@GdRpAQb|Fws0jjc zGfu1W{CBKJj3s-Lkz|(`;)gl0Yb^E3_TJG~xSnX_Vo!2hyp)LdT!}_wJ+TDKMT6WJ z7f!)AdjQNhUg%MRY z1QXOkBjxqYTBf@vPLyg_%ydt8f9J%+tCOLU#eHLE-qJd;|FOe)qj9Q#_Eg^9I&o;h zM7bJ%;=9#+vw3dx{<=?kKJIzgbUa^kyyzUvI|p;8&g9O$mOKAO!5Nr10)-n^7VFpL z>(|YD3iTT%PC`o4y7_JS#!dIm<{P#b>v!ktcmEUj$;iheh5BO?Cm*|7i>@_!*P6M@ z^KJ8|3a*}_>y^CgmHYJlO$FEPiB})ny%Q%Fq3e`HkI7&d1a1wNp$T#jVhHqY1y-4% zGP;R65h0p96%fnNA1JBnJF62BHp!y}Wd!0ZYKhAG4qb)<05Sz=^#Ef;&8n2c7y(*C z)(8MXrvWZm{pdPa4fAn|HGH5H{Vq*%q4xm0n(&q}WB`y=fXbSy`mF(X88d6l=(J#x zHNl@*rK1If*ixm0S`s0cwIKLHOkdS{?b6ZWSB8c$yl~ctV9)^ZrM=5q0rssZl1304 zmOZTbj#XX9k%ncN@F|u>)=1;CxoS-X%5AB3n0y=VS4}7f;Kn_+gJ2;-^;SmK9GG<* ze>?Hln_l)~VnPOS8PpDk!j}|KSCKQEh>M{}TwvZNumf!THnX)0a8@CviSJ`On6t7_ zs9+~<>`n$%B4$155^jmNIvEyqrrksWRInY!beaR05CFUdMJR!Esv1P4tlFsU~clya0*40a&?7?XliMqT$IyZA+`oi40dz(J${jm3;Z&$u%SJ8PO?>z8e z(}U3GwFPG%h&&7B@XeY)Ru*mTd0YFuE@x{m*ftbx-FaL0oz&fL-u`C6wsYd(V|&fS zaeh0@Oc;LUM^%p(Z7^Lk6h{SUmI~-NJxg{_@tvZCB_Il|Jc0qUE<;H|Mzto=`_Yk_uJ%cRN?*7}@u(q|Zf zX00I$YXigCJ#GM7xVE2|xYZUGk1d5^qLw@$AgDz|`xy#sYu&&TH1=aoj1mQkX^14}G0w1r4tqGTy6 zD#!>z#j-~g{Q^BsSHx+C)>1>t!U|LkEy0@6U7`6puwnGt`R{_NkFeNLW5$WvHAd)93HU5 ztOI`Ck#vi^l1w5V9cL6D2*7tD&Kx{l536R1stZx|#*b;H+72P!8wyT~oUruH2co?<`l#+|10zp<36rue?3E+KmfFooDlc 
z(TLTI&YGt+lzT(T;Ve2j@{W$VQ+dZ)jjxq?$I3Z6?^p$S*ZZdW7Oh6N4eDEM@(wo= z`%8P1*T52d-}-&q3{~0aMDx|!>`I!J8-+M%a3z;L?^vg-TAN*K^RhItDeAzc&_S*8 zR<4kQjjB^2=esnN(Xbl$Z-Cyev3VzS$EokqqF1FhQm4i$?`3nKK}EHgR4~)j*~<{1 zCA-v`@7N}7Lo}&vK29Dg;X7DxMs=fC#{fggqVTEOb|1|n9=TE_VBGR=9SA5 z;ghT+qmgb@Eh3tV|Eg+ERy|vG*_?DdTSA?a=d3g;fXg1SGDD~bE_?2jExW8q+@NM= zOv4mw`56uNlFEl&YF_27atGL!l{UI7`D$!rlNzhM!QQmpv8&Xx77^7&@;q-I)v|n} z`11c%He;!p^I8xhsB$iQSO=6+Mn_XGSW~4;b@hzCLjMIdHP#PYf(V`P(9}5PcMd!O zP@vKmpH(S9)?8g8n3xnAV4d>oVsEEv>^$?viUg~fDp|lRQG&1!xH}i?A^0&39uPdT zp}wb^qGj#y3^>*X_%o2}*AbIIGN4;bz{y}HWZ1XtiiTnr+0dT!foHqlo#Yc?Q2rHp zA_PN2;Hm}}b%;g(=B{W0+}gq(h{!bV0bv2R=6_C5bD>|-^IPZ5KO2#?-)i*@H7e5c z9nh5@>#*k|khJ{KQlJ#fBiIDBaN?IetPW9;e@#@6xSm6OO0R{ptPUL6%IaomD5If* z@YW1y>tv=uwL&!}3vPEUzl*xNM#_p(rL$^YmX61tGZF(C$q}*ay4Id2?U9U>_j6U7 z%077R$ov_4%D_@9U_@fs`%-bqgFL9DqR&cW{y|kV|5UG{Hq)z+;+zjaAX&@vAU%zflGl+gMO_hf9_*r8%N{@*ET+wiIC9 zgn~ZKB?C_%tzIknbnX8OK}!hzJvL{@*&w1NR zF8B59RCbR3!qxUw%iag{XQoe0rG5K9bAIY9o#=aN)U^)KH*DaZfa*JcxU+wk%z?o?|!4$vMt}TtyPh?eOVXv5PEDsiiS-gCG9cKtp2 zBj<M%-jYkVDuimhgymbn}`9t%Eb8VY%9DlT;3zXVT8G4bTU-}jeao@LSpwX~C z*Lm>4{?Gb9?SHU0clzzz`cR?%V$OXL@*3ObR($cw$%hStldqP1Ek)nDyl-8>w|;W} z4~~?YTj13XEls}q0{Hz$m?sn*RVoWrIZ;yus~{_a$XnzAF+*3wBoVwCs;t%-pbC1h zl@*W$rAzZ1aEjJaXzv(QV1ey#9FVsILJ}OfS|jl!904Pv+u$r%iNo2FkvK^GX2}Le zCUMw!uvvN3Re1zfK7oOV0db;P;Bbx@NQ%%T+`B|(F5z0?&&9Pu3A!L0g8;bpk+)?w z^ssI7L+_THV@uAv<&mRpu5JGE{Zn5$_LhA0Gp|j*R`jjO`_>eE>&S`&Q*E8zIvct* za&zRy)`HicbNHWKdc+1IdH2EYz@?uuXh~j>vY>78&5&D0T&Sr$ERlKIHSzE{4q7_M zk1B>4Da&G&21Ti?@FF&WHZj9Q7!529?8M?0Gz!vGC<6+l(>t)?3H(#iCze;x!)C;tN*n}h(E@v(pX-R9fP z_iMlK?_AJRD_$x#ZOS)oy4RL(+FEK_fro^D(9j z-+b@QpTBi)|402F_7}IF$ZtLI`5SO-SUjK1pHJp4^ZD~ap-G%P{-}X@0h*OzjL+gk zlqdNJ2z@8dT~0;7MGfb%Ay)Dz>T<=M;%}DB;Mog_BHXF~@fwW74HEeP*#x2vE^Abl zL$X!0ZJvj=B>UCiFkF$5E1_j3nc*xKZ@9>4p$C0lB9MWqHU9>da0e@^LN0hn)~msE zlF*Q>#6c5{jB?l;K~6N_wHp2~4zA1)k}e*Xtb^qXFJ_R^I0znnxl%394a1=UXycVD zBzDOlaM2@1@B@k3fWX ze6jiX!^RWE#@F(VuN4~4Pab>3tbX6}o~6ia&NG`!buFb;YZr`E+m=O2*9M^80w)~q 
zh8w9__E!96{GqGsk-OZB-1g&Nx=(t6pu%Yo9fsb5+S5Sbw#ApHndDqOAMBl@A(Ui=v>-}9_CxLibyl!ZH#R-Vc^ z`~{$yHD*m&bJoJr8H1J;q2&-K7n`hLF<4ZZl{eU9x)l`A(vTM{kpV+=8j8coul5N! zS|VC>iL5J22s^Y;Q)yL=@g5j+*-BGWLA9}~>PpsLM;XROcy8uZJa zj2=`UMT%w}8Hbj&nX#2ygA%fW!~h*xND(c`jAw+P8A;~LGSb0C7y0s@=`up5z;|N> zBD!$-{SN*ecuBQEJRDKLksyvxK~4F9l1MuAFcIM7uVXbm+;5Z`l)H}A7Z``RSd6sd zck(x}&f6I5zyO;im?Y@|tuYYzzlAi(BttRjA)IT1swca=4P(L0EHsr8iSMBdU>H$+ z+wu)cx%+~qZ+l6JCSY&jb7&LH2M6qR1&!KP7u$OCZN2w>g|@wLX=YMCdFDpPtm$!m zQNlZMCOE|&oe;cBj~;9WC$=&Kq}vBsaT@!#tx`8&yf zSE;MF)Y)~n?{;5l&HATKldE%5zfeQDYOjw@jn4WDH68E`u6ggI?APX57?Iy2S%O69 z1cQ<-7zB$n73CoA2;v3IC<;gTvWKG~L4cck){ExAKsh<$#e?mUw!*?E)DB_Ef0S44diXSWo5Fs~}u2ChUdqGDVYv;E7 zfr8hfo-%nz0t^LXD~z^W>!t+?-uJlshaR+lZuoS4ZdYG^TmK(0QKX}`@6}EGeUUV|@70|WBT=C{Cg1lZAM)uBYSrWjh(|Qwu`wQTnMdH` zQG)Y$5Wu4wjvOt?+GmSayJgj(Vhu?(q@a3tR*=uCwrkaXey`<2Cb`pSZSV+qJffP+ zL2g10fk(L$%i~=g9yN8TIWmTh5RQi{jf!KXdngqT<53q}Qz$2*BBiF9p$=Z9mb(6= ze#)AZoz#)9-SRJCKki^ZOiIc>U<%m*y%-z7fbi`ZjBUpNd0DdTlDEh{v&KIn$6Q%`I3tcz6`c~&nP=z3RubsT^ul}U- int: + return 80 + + @classmethod + def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]: + if c.has_g_idx: + return False, "Act reordering currently not supported by AllSpark" + + if c.zero_points: + return False, "Zero points currently not supported by AllSpark" + + return check_allspark_supported_dtype_shape( + c.partition_weight_shape[0], # in_features + c.partition_weight_shape[1], # out_features + c.group_size, + c.weight_type, + c.act_type, + ) + + # note assumes that + # `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0} + # `weight_scale` is: {input_dim = 0, output_dim = 1} + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + device = getattr(layer, self.w_q_name).device + c = self.config + + # prepare the parameters required for the kernel + properties = torch.cuda.get_device_properties(device.index) + sm_count = properties.multi_processor_count + sm_version = properties.major * 10 + 
properties.minor + gemm_args = {} + gemm_args["sm_count"] = sm_count + gemm_args["sm_version"] = sm_version + + self.gemm_args = gemm_args + + # transform param weight, scale + old_weight_param = getattr(layer, self.w_q_name) + old_scale_param = getattr(layer, self.w_s_name) + + assert isinstance(old_weight_param, BasevLLMParameter) + permute_param_layout_(old_weight_param, input_dim=0, output_dim=1, packed_dim=0) + + assert isinstance(old_scale_param, BasevLLMParameter) + permute_param_layout_(old_scale_param, input_dim=0, output_dim=1) + + # unpack weight from K / 4 x N int32 to K x N uint8 + new_weight_param = torch.nn.Parameter( + old_weight_param.data, requires_grad=False + ) + new_weight_param.data = ( + new_weight_param.data.t().contiguous().view(dtype=torch.uint8) + ) + new_weight_param.data = new_weight_param.data.t().contiguous() + + new_scale_param = torch.nn.Parameter(old_scale_param.data, requires_grad=False) + + # reorder K x N weight as N32K16 format for Ampere W8A16 + new_weight_param.data, new_scale_param.data, _ = ops.allspark_repack_weight( + new_weight_param.data, new_scale_param.data, None, c.zero_points + ) + + replace_parameter(layer, self.w_q_name, new_weight_param.data) + replace_parameter(layer, self.w_s_name, new_scale_param.data) + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + c = self.config + gemm_args = self.gemm_args + w_q, w_s, _, _ = self._get_weight_params(layer) + + reshaped_x = x.reshape(-1, x.shape[-1]) + out_shape = x.shape[:-1] + (c.partition_weight_shape[1],) + + output = ops.allspark_w8a16_gemm( + a=reshaped_x, + b_qweight=w_q, + b_scales=w_s, + b_qzeros=None, + n=c.partition_weight_shape[1], + group_size=c.group_size, + sm_count=gemm_args["sm_count"], + sm_version=gemm_args["sm_version"], + CUBLAS_M_THRESHOLD=ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, + has_zp=c.zero_points, + n32k16_reorder=True, + ) + + if bias is not None: + 
output.add_(bias) # In-place add + + return output.reshape(out_shape) diff --git a/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py b/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py new file mode 100644 index 0000000..59c6a4f --- /dev/null +++ b/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py @@ -0,0 +1,323 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import torch +from packaging import version + +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.utils import replace_parameter +from vllm.model_executor.layers.quantization.utils.bitblas_utils import ( + BITBLAS_OPTIMIZE_FEATURES, + BITBLAS_SUPPORTED_GROUP_SIZES, + MINIMUM_BITBLAS_VERSION, + bitblas_make_empty_g_idx, + bitblas_sort_g_idx, + check_bitblas_supports_shape, + query_bitblas_supported_quant_types, + unpack_gptq_qweight, + unpack_gptq_qzeros, +) + +from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig + +logger = init_logger(__name__) + + +class BitBLASLinearKernel(MPLinearKernel): + OPT_FEATURES: list[int] = BITBLAS_OPTIMIZE_FEATURES + ENABLE_TUNING: bool = True + MATMUL_LAYOUT: str = "nt" + BITBLAS_DTYPES: dict[torch.dtype, str] = { + torch.float32: "float32", + torch.float16: "float16", + torch.bfloat16: "bfloat16", + torch.half: "float16", + torch.int8: "int8", + } + bitblas_matmul: object = None + + def __init__( + self, + c: MPLinearLayerConfig, + w_q_param_name: str, + w_s_param_name: str, + w_zp_param_name: str | None = None, + w_gidx_param_name: str | None = None, + bitblas_quant_config: QuantizationConfig | None = None, + ): + self.quant_config = bitblas_quant_config + super().__init__( + c, w_q_param_name, w_s_param_name, w_zp_param_name, w_gidx_param_name + ) + + def repack_bitblas_from_gptq( + self, + b_q_weight: torch.Tensor, + scales: 
torch.Tensor, + qzeros: torch.Tensor | None = None, + ): + from bitblas.quantization.utils import general_compress + + assert self.bitblas_matmul is not None, "bitblas_matmul is None" + + quant_config = self.quant_config + # qweight in gptq old quant linear stored with + # (outfeatures, infeatures), should be transposed. + qweight = b_q_weight.T.contiguous().view(quant_config.torch_storage_dtype) # type: ignore[union-attr] + intweight = unpack_gptq_qweight(qweight, quant_config.weight_bits).contiguous() # type: ignore[union-attr] + if self.bitblas_matmul.weight_transform is not None: # type: ignore[attr-defined] + qweight = self.bitblas_matmul.weight_transform( # type: ignore[attr-defined] + intweight.cpu() + ).cuda() + # scales in gptq old quant linear stored with + # (infeatures // group_size, outfeatures), should be transposed. + scales = scales.T.contiguous() + + if qzeros is None: + return qweight, scales, None + + # qzeros should be de-quantized to int zeros. + weight_bits = quant_config.weight_bits # type: ignore[union-attr] + intzeros = unpack_gptq_qzeros(qzeros, weight_bits).T.contiguous() + zeros: torch.Tensor | None = None + zeros_mode = self.bitblas_matmul.config.zeros_mode # type: ignore[attr-defined] + if zeros_mode == "original": + zeros = intzeros.to(torch.float16).contiguous() + elif zeros_mode == "rescale": + assert zeros is not None, "zeros should not be None" + zeros[:, :] = intzeros.to(torch.float16)[:, :] * scales[:, :] + elif zeros_mode == "quantized": + zeros = ( + torch.Tensor( + general_compress( + intzeros.T.contiguous().cpu().numpy(), + weight_bits, + ) + ) + .to(qweight.device) + .to( + quant_config.torch_storage_dtype # type: ignore[union-attr] + ) + .contiguous() + ) + else: + raise ValueError("Unsupported zeros type: {}".format(zeros_mode)) + + return qweight, scales, zeros + + @classmethod + def get_min_capability(cls) -> int: + return 70 + + @classmethod + def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]: + 
is_bitblas_installed = True + + try: + import bitblas + + if version.parse(bitblas.__version__) < version.parse( + MINIMUM_BITBLAS_VERSION + ): + raise ImportError( + "bitblas version is wrong. Please " + f"install bitblas>={MINIMUM_BITBLAS_VERSION}" + ) + except ImportError: + is_bitblas_installed = False + + if not is_bitblas_installed: + return ( + False, + "bitblas is not installed. Please install bitblas " + "by running `pip install bitblas>=" + f"{MINIMUM_BITBLAS_VERSION}`", + ) + + quant_types = query_bitblas_supported_quant_types(c.zero_points) + if c.weight_type not in quant_types: + return False, ( + f"Quant type ({c.weight_type}) not supported by" + f" BitBLAS, supported types are: {quant_types}" + ) + + if c.group_size not in BITBLAS_SUPPORTED_GROUP_SIZES: + return False, ( + f"Group size ({c.group_size}) not supported by " + "BitBLAS, supported group sizes are: " + f"{BITBLAS_SUPPORTED_GROUP_SIZES}" + ) + + return check_bitblas_supports_shape( + c.partition_weight_shape[1], # out_features + c.partition_weight_shape[0], # in_features + c.full_weight_shape[0], # in_features + c.group_size, + ) + + # note assumes that + # `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0} + # `weight_scale` is: {input_dim = 0, output_dim = 1} + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + device = getattr(layer, self.w_q_name).device + c = self.config + quant_config = self.quant_config + + # Default names since bitblas requires empty parameters for these, + # TODO: remove this requirement from bitblas (allow optional tensors) + if getattr(self, "w_gidx_name", None) is None: + self.w_gidx_name: str = "g_idx" + if getattr(self, "w_zp_name", None) is None: + self.w_zp_name: str = "qzeros" + + if c.has_g_idx: + g_idx, g_idx_sort_indices = bitblas_sort_g_idx( + getattr(layer, self.w_gidx_name) + ) + self._transform_param(layer, self.w_gidx_name, lambda _: g_idx) + layer.g_idx_sort_indices = g_idx_sort_indices + else: + 
setattr(layer, self.w_gidx_name, bitblas_make_empty_g_idx(device)) + layer.g_idx_sort_indices = bitblas_make_empty_g_idx(device) + + if c.zero_points: + raise NotImplementedError("Zero points not supported by BitBLAS") + else: + setattr(layer, self.w_zp_name, bitblas_make_empty_g_idx(device)) + + # Repack weights + bitblas_qweight, bitblas_scales, bitblas_qzeros = self.repack_bitblas_from_gptq( + layer.qweight, + layer.scales, + None if quant_config.is_sym else layer.qzeros, # type: ignore[union-attr] + ) + replace_parameter(layer, self.w_q_name, bitblas_qweight) + replace_parameter(layer, self.w_s_name, bitblas_scales) + if bitblas_qzeros is not None: + replace_parameter(layer, self.w_zp_name, bitblas_qzeros) + + def configure_bitblas_matmul( + self, + infeatures: int, + outfeatures: int, + params_dtype: torch.dtype, + bias: bool, + ) -> None: + enable_tuning = self.ENABLE_TUNING + layout = self.MATMUL_LAYOUT + bits = self.quant_config.weight_bits # type: ignore[union-attr] + self._configure_bitblas_matmul( + infeatures, + outfeatures, + params_dtype, + enable_tuning, + bias, + layout, + bits, + ) + + def _configure_bitblas_matmul( + self, + infeatures, + outfeatures, + params_dtype, + enable_tuning, + bias, + layout, + bits, + ): + from bitblas import MatmulConfig + + bitblas_dtype = self.BITBLAS_DTYPES[params_dtype] + quant_config = self.quant_config + with_scaling = False + with_zeros = False + group_size = quant_config.group_size # type: ignore[union-attr] + zeros_mode = quant_config.zeros_mode # type: ignore[union-attr] + if quant_config.quant_method == "gptq": # type: ignore[union-attr] + with_scaling = True + with_zeros = True + W_dtype = f"uint{bits}" + if quant_config.is_sym: # type: ignore[union-attr] + with_zeros = False + W_dtype = f"int{bits}" + else: + raise ValueError( + f"Unsupported quant_method {quant_config.quant_method}" # type: ignore[union-attr] + ) # type: ignore[union-attr] + + matmul_config = MatmulConfig( + M=self.OPT_FEATURES, + 
N=outfeatures, + K=infeatures, + A_dtype=bitblas_dtype, + W_dtype=W_dtype, + out_dtype=bitblas_dtype, + accum_dtype="int32" if bitblas_dtype == "int8" else bitblas_dtype, + storage_dtype=quant_config. # type: ignore[union-attr] + storage_dtype, # type: ignore[union-attr] + with_scaling=with_scaling, + with_zeros=with_zeros, + group_size=group_size, + with_bias=bias, + layout=layout, + zeros_mode=zeros_mode, + ) + self.bitblas_matmul = self._get_or_create_bitblas_operator( + matmul_config, enable_tuning + ) + + def _get_or_create_bitblas_operator(self, config, enable_tuning): + from bitblas import Matmul, auto_detect_nvidia_target + from bitblas.cache import get_database_path, global_operator_cache + + BITBLAS_DATABASE_PATH = get_database_path() + BITBLAS_TARGET = auto_detect_nvidia_target() + + if global_operator_cache.size() == 0: + global_operator_cache.load_from_database( + BITBLAS_DATABASE_PATH, BITBLAS_TARGET + ) + + bitblas_matmul = global_operator_cache.get(config) + if bitblas_matmul is None: + bitblas_matmul = Matmul(config, target=BITBLAS_TARGET, enable_tuning=False) + if enable_tuning: + bitblas_matmul.hardware_aware_finetune(topk=20) + global_operator_cache.add(config, bitblas_matmul) + global_operator_cache.save_into_database( + BITBLAS_DATABASE_PATH, BITBLAS_TARGET + ) + TUNING_MESSAGE = ( + f"BitBLAS Operator {config} tuned and saved to database." + ) + logger.info(TUNING_MESSAGE) + else: + _message = f"BitBLAS Operator {config} created without tuning. " + logger.info(_message) + else: + _message = f"BitBLAS Operator {config} retrieved from cache." 
+ logger.info(_message) + return bitblas_matmul + + def apply_gptq_bitblas_linear( + self, + layer: torch.nn.Module, + x: torch.Tensor, + ) -> torch.Tensor: + output_size_per_partition = self.config.partition_weight_shape[1] + out_shape = x.shape[:-1] + (output_size_per_partition,) + args = [x, layer.qweight, layer.scales] + if self.bitblas_matmul.config.with_zeros: # type: ignore[attr-defined] + args.append(layer.qzeros) + output = self.bitblas_matmul(*args) # type: ignore[operator] + return output.view(out_shape) + + def apply_weights(self, layer, x, bias=None): + NOT_IMPLEMENT_MESSAGE = ( + f"{self.__class__.__name__}.apply_weights is not implemented. " + "Please use BitBLASLinearKernel.apply_gptq_bitblas_linear instead" + ) + raise NotImplementedError(NOT_IMPLEMENT_MESSAGE) diff --git a/model_executor/layers/quantization/kernels/mixed_precision/conch.py b/model_executor/layers/quantization/kernels/mixed_precision/conch.py new file mode 100644 index 0000000..53b2e15 --- /dev/null +++ b/model_executor/layers/quantization/kernels/mixed_precision/conch.py @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from importlib.util import find_spec +from typing import Final + +import torch + +from vllm.model_executor.parameter import BasevLLMParameter, permute_param_layout_ +from vllm.scalar_type import scalar_types + +from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig + +_CONCH_SUPPORTED_WEIGHT_TYPES: Final = [ + scalar_types.uint4, + scalar_types.uint8, + scalar_types.uint4b8, + scalar_types.uint8b128, +] +_CONCH_SUPPORTED_GROUP_SIZES: Final = [-1, 128] + + +class ConchLinearKernel(MPLinearKernel): + @classmethod + def get_min_capability(cls) -> int: + return 80 + + @classmethod + def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]: + if c.weight_type not in _CONCH_SUPPORTED_WEIGHT_TYPES: + error_msg = ( + f"Weight type ({c.weight_type}) not supported by " + 
"ConchLinearKernel, supported types are: " + f"{_CONCH_SUPPORTED_WEIGHT_TYPES}" + ) + return False, error_msg + + if c.group_size not in _CONCH_SUPPORTED_GROUP_SIZES: + error_msg = ( + f"Group size ({c.group_size}) not supported by " + "ConchLinearKernel, supported group sizes are: " + f"{_CONCH_SUPPORTED_GROUP_SIZES}" + ) + return False, error_msg + + if find_spec("conch") is None: + error_msg = ( + "conch-triton-kernels is not installed, please " + "install it via `pip install conch-triton-kernels` " + "and try again!" + ) + return False, error_msg + + return True, None + + # note assumes that + # `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0} + # `weight_scale` is: {input_dim = 0, output_dim = 1} + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def transform_w_q(x): + assert isinstance(x, BasevLLMParameter) + permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0) + x.data = x.data.contiguous() + return x + + def transform_w_s(x): + assert isinstance(x, BasevLLMParameter) + permute_param_layout_(x, input_dim=0, output_dim=1) + x.data = x.data.contiguous() + return x + + self._transform_param(layer, self.w_q_name, transform_w_q) + self._transform_param(layer, self.w_s_name, transform_w_s) + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + from conch.ops.quantization.gemm import mixed_precision_gemm + + w_q, w_s, w_zp, _ = self._get_weight_params(layer) + + output = mixed_precision_gemm( + x=x, + w_q_packed=w_q.data, + w_s=w_s.data, + w_zp=w_zp.data if w_zp is not None else None, + weight_size_bits=self.config.weight_type.size_bits, + weight_bias=self.config.weight_type.bias, + group_size=self.config.group_size, + ) + + if bias is not None: + output.add_(bias) # In-place add + + return output diff --git a/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py 
b/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py new file mode 100644 index 0000000..8ef6457 --- /dev/null +++ b/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py @@ -0,0 +1,119 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import torch + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 +from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape +from vllm.model_executor.parameter import BasevLLMParameter, permute_param_layout_ +from vllm.platforms import current_platform +from vllm.scalar_type import scalar_types + +from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig + + +class CutlassW4A8LinearKernel(MPLinearKernel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # dynamic per-tok fp8 activation quantization + self.quant_fp8 = QuantFP8(static=False, group_shape=GroupShape.PER_TOKEN) + + @classmethod + def get_min_capability(cls) -> int: + return 90 + + @classmethod + def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]: + if not current_platform.is_cuda(): + return False, "CUTLASS only supported on CUDA" + + if not current_platform.is_device_capability(90): + return False, "CUTLASS W4A8 requires compute capability of 90 (Hopper)" + + if c.act_type != torch.float8_e4m3fn: + return False, "CUTLASS W4A8 only supports FP8 (e4m3) activations" + + if c.has_g_idx: + return False, "Act reordering not supported by CUTLASS W4A8" + + if c.zero_points: + return False, "Zero points not supported by CUTLASS W4A8" + + if c.weight_type != scalar_types.int4: + return ( + False, + f"Quant type ({c.weight_type}) not supported by " + "CUTLASS W4A8, only supported int4", + ) + + # TODO(czhu): support -1 (column-wise) + if c.group_size != 128: + return False, "Only group_size 128 is supported" + + in_features, out_features = 
c.partition_weight_shape + if in_features % 128 or out_features % 128: + return ( + False, + f"K and N must be divisible by 128, got {c.partition_weight_shape}", + ) + + if c.out_type != torch.bfloat16: + return ( + False, + f"Only bfloat16 output type currently supportedgot {c.out_type=}", + ) + + return True, None + + # note assumes that + # `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0} + # `weight_scale` is: {input_dim = 0, output_dim = 1} + def process_weights_after_loading(self, layer: torch.nn.Module): + # TODO(czhu): optimize speed/mem usage + def transform_w_q(x): + assert isinstance(x, BasevLLMParameter) + permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0) + x.data = ops.cutlass_encode_and_reorder_int4b(x.data.t().contiguous().t()) + return x + + def transform_w_s(x): + assert isinstance(x, BasevLLMParameter) + permute_param_layout_(x, input_dim=0, output_dim=1) + x.data = x.data.contiguous().to(torch.float8_e4m3fn) + x.data = ops.cutlass_pack_scale_fp8(x.data) + return x + + # Encode/reorder weights and pack scales + self._transform_param(layer, self.w_q_name, transform_w_q) + self._transform_param(layer, self.w_s_name, transform_w_s) + self._transform_param(layer, "weight_chan_scale", lambda x: x) + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + c = self.config + w_q, w_s, _, _ = self._get_weight_params(layer) + w_ch_s = layer.weight_chan_scale + + x_2d = x.reshape(-1, x.shape[-1]) + out_shape = x.shape[:-1] + (c.partition_weight_shape[1],) + + x_2d, act_scales = self.quant_fp8(x_2d) + output = ops.cutlass_w4a8_mm( + a=x_2d, + b_q=w_q, + b_group_scales=w_s, + b_group_size=c.group_size, + a_token_scales=act_scales, + b_channel_scales=w_ch_s, + ) + + if bias is not None: + output.add_(bias) # In-place add + + return output.reshape(out_shape) diff --git a/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py 
class Dynamic4bitLinearKernel(MPLinearKernel):
    """CPU int4 linear kernel built on the ``aten._dyn_quant_*_4bit`` ops."""

    SUPPORTED_QUANT_TYPES = [scalar_types.int4]

    @classmethod
    def get_min_capability(cls) -> int:
        """Return the minimum capability value declared by this kernel (1)."""
        return 1

    @classmethod
    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
        """Return ``(True, None)`` if ``c`` is supported, else ``(False, reason)``."""
        if not current_platform.is_cpu():
            return False, "Only CPU is supported"
        if c.weight_type not in cls.SUPPORTED_QUANT_TYPES:
            return False, f"Unsupported quant type {c.weight_type}"

        running_on_arm = current_platform.get_cpu_architecture() == CpuArchEnum.ARM
        if running_on_arm and c.act_type != torch.float32:
            return False, "Dynamic4bitLinearKernel on Arm requires Float32 activations"

        total_in_features = c.full_weight_shape[0]
        if total_in_features % c.group_size != 0:
            return (
                False,
                f"Group size ({c.group_size}) does not evenly divide"
                " the number of input features "
                f"({c.full_weight_shape[0]})",
            )

        # On Arm the matmul op must exist in this PyTorch build.
        if running_on_arm and not hasattr(torch.ops.aten, "_dyn_quant_matmul_4bit"):
            return (
                False,
                f"PyTorch {torch.__version__} does not support"
                " _dyn_quant_matmul_4bit. Install a newer version",
            )
        return True, None

    def process_weights_after_loading(self, layer: torch.nn.Module):
        """Repack the quantized weight (with scales and bias) for the aten op.

        The repacked tensor replaces the weight parameter and the scale
        attribute is set to ``None`` afterwards.
        """
        cfg = self.config
        in_features = cfg.partition_weight_shape[0]
        out_features = cfg.partition_weight_shape[1]

        # Shift values by +8, then fuse each odd/even column pair into one
        # byte (odd column in the high nibble).
        shifted = getattr(layer, self.w_q_name).add(8)
        high_nibbles = shifted[:, 1::2] << 4
        low_nibbles = shifted[:, ::2]
        uint8_packed = (high_nibbles | low_nibbles).to(torch.uint8)

        scales = getattr(layer, self.w_s_name)
        block_size = cfg.group_size

        # Handle scaling factors for partitioned weights
        if block_size != in_features:
            # KleidiAI kernel requires bfloat16 scales with groupwise scheme
            scales = scales.to(torch.bfloat16)
        else:
            # Float32 & Bfloat16 variants requires float32 scales,
            # laid out channel-wise
            scales = scales.to(torch.float32).view(-1, 1)
            if layer.bias is not None:
                # Float32 & Bfloat16 variants requires float32 bias
                layer.bias = layer.bias.to(torch.float32)

        # Repack weights as per kernel requirement
        repacked = torch.ops.aten._dyn_quant_pack_4bit_weight(
            uint8_packed,
            scales,
            layer.bias,
            block_size,
            in_features,
            out_features,
        )
        replace_parameter(
            layer, self.w_q_name, torch.nn.Parameter(repacked, requires_grad=False)
        )
        setattr(layer, self.w_s_name, None)

    def apply_weights(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Run the dynamic 4-bit matmul on ``x`` flattened to 2-D.

        ``bias`` is not added here; ``layer.bias`` was handed to the weight
        packing op in ``process_weights_after_loading``.
        """
        cfg = self.config
        in_features, out_features = (
            cfg.partition_weight_shape[0],
            cfg.partition_weight_shape[1],
        )
        result_shape = x.shape[:-1] + (out_features,)
        flat_x = x.reshape(-1, x.shape[-1])

        result = torch.ops.aten._dyn_quant_matmul_4bit(
            flat_x,
            getattr(layer, self.w_q_name),
            cfg.group_size,
            in_features,
            out_features,
        )
        return result.reshape(result_shape)
class ExllamaLinearKernel(MPLinearKernel):
    """Mixed-precision linear kernel using the Exllama GPTQ ops.

    Weights are shuffled with ``ops.gptq_shuffle`` at load time and the
    matmul is performed by ``ops.gptq_gemm``.
    """

    SUPPORTED_QUANT_TYPES = [scalar_types.uint4b8, scalar_types.uint8b128]
    # In theory supports `scalar_types.uint2b2, scalar_types.uint3b4` too but
    # currently untested so not added to the list

    @classmethod
    def get_min_capability(cls) -> int:
        """Return the minimum compute capability this kernel declares (60)."""
        return 60

    @classmethod
    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
        """Return ``(True, None)`` if ``c`` is supported, else ``(False, reason)``."""
        if c.has_g_idx and c.partition_weight_shape[0] != c.full_weight_shape[0]:
            return (
                False,
                "Act reordering currently not supported by Exllama, "
                "when the input features are partitioned across "
                "devices",
            )

        if c.partition_weight_shape[1] % (32 // c.weight_type.size_bits) != 0:
            return (
                False,
                "Output features must be a multiple of the pack "
                "factor (32 / num_bits) so that we can correctly "
                "pack the zero points",
            )

        if c.act_type != torch.float16:
            return False, "Exllama only supports float16 activations"

        if c.weight_type not in cls.SUPPORTED_QUANT_TYPES:
            return (
                False,
                f"Quant type ({c.weight_type}) not supported by "
                "Exllama, supported types are: "
                f"{cls.SUPPORTED_QUANT_TYPES}",
            )

        if c.full_weight_shape[0] % c.group_size != 0:
            return (
                False,
                f"Group size ({c.group_size}) does not evenly divide"
                " the number of input features "
                f"({c.full_weight_shape[0]})",
            )

        return True, None

    def process_weights_after_loading(self, layer: torch.nn.Module):
        """Create missing zero-point / g_idx tensors and repack weights.

        Raises:
            NotImplementedError: if there are no zero points and the weight
                type has no bias to synthesize them from.
        """
        c = self.config
        # Resolved up front: both the synthetic zero-point branch and the
        # empty-g_idx branch need it. Previously it was only assigned inside
        # the `not c.zero_points` branch, so a config with zero points but no
        # g_idx hit a NameError below.
        device = getattr(layer, self.w_q_name).device

        # For Exllama, we need to set a zero-point tensor if there is not one
        if not c.zero_points:
            self.w_zp_name = "qzeros"
            groups = c.partition_weight_shape[0] // c.group_size
            out_features = c.partition_weight_shape[1]

            if not c.weight_type.has_bias():
                raise NotImplementedError(
                    "A 0 zero-point is not supported by Exllama due to "
                    "a bug in the original GPTQ checkpoint format leading to "
                    "exllama kernel adding 1 to the zero points during "
                    "inference"
                )

            # if the type has a bias we have to create a zeros tensor that
            # contains the bias values repeated for each group (-1 due to
            # a bug in the original GPTQ checkpoint format leading to
            # exllama kernel adding 1 to the zero points during inference)
            # Documentation of the bug can be found here:
            #  https://garden.danieldk.eu/GPTQ-Checkpoint-Format
            zeros = torch.full(
                (groups, out_features),
                c.weight_type.bias - 1,
                dtype=torch.int32,
                device=device,
            )
            zeros = pack_quantized_values_into_int32(zeros, c.weight_type, packed_dim=1)
            setattr(
                layer, self.w_zp_name, torch.nn.Parameter(zeros, requires_grad=False)
            )

        if c.has_g_idx:

            def transform_w_g_idx(x):
                # Exllama wants the permutation array instead of the group
                # indices
                return torch.argsort(x).to(torch.int)

            self._transform_param(layer, self.w_gidx_name, transform_w_g_idx)
        else:
            self.w_gidx_name = "g_idx"
            empty_g_idx = torch.nn.Parameter(
                torch.empty((0,), dtype=torch.int, device=device), requires_grad=False
            )
            setattr(layer, self.w_gidx_name, empty_g_idx)

        def transform_w_q(x):
            assert isinstance(x, BasevLLMParameter)
            assert self.w_gidx_name is not None
            g_idx = getattr(layer, self.w_gidx_name)

            permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
            x_cont = x.data.contiguous()
            # The op's return value is not used; x_cont is what gets stored.
            ops.gptq_shuffle(x_cont, g_idx, c.weight_type.size_bits)
            return x_cont

        def transform_w_s(x):
            assert isinstance(x, BasevLLMParameter)
            permute_param_layout_(x, input_dim=0, output_dim=1)
            x.data = x.data.contiguous()
            return x.to(dtype=c.act_type)

        # Repack weights and scales for Exllama
        self._transform_param(layer, self.w_q_name, transform_w_q)
        self._transform_param(layer, self.w_s_name, transform_w_s)

    def apply_weights(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Run ``ops.gptq_gemm`` on ``x`` flattened to 2-D and restore its shape."""
        c = self.config

        x_2d = x.reshape(-1, x.shape[-1])
        out_shape = x.shape[:-1] + (c.partition_weight_shape[1],)

        w_q, w_s, w_zp, w_g_idx = self._get_weight_params(layer)

        # gptq_gemm supports GPTQv2 format by passing use_v2_format=True.
        # However, the MPLinearLayerConfig doesn't contain format info.
        # So hardcode GPTQv1 format here, to keep its behavior unchanged.
        use_v2_format = False

        assert w_zp is not None, "Zero points are required by Exllama"
        assert w_g_idx is not None, "Group index is required by Exllama"
        output = ops.gptq_gemm(
            x_2d, w_q, w_zp, w_s, w_g_idx, True, use_v2_format, c.weight_type.size_bits
        )

        if bias is not None:
            output.add_(bias)
        return output.reshape(out_shape)
class MacheteLinearKernel(MPLinearKernel):
    """Mixed-precision linear kernel that dispatches to the Machete ops."""

    @classmethod
    def get_min_capability(cls) -> int:
        """Return the minimum compute capability this kernel declares (90)."""
        return 90

    @classmethod
    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
        """Return ``(True, None)`` if ``c`` is supported, else ``(False, reason)``."""
        # Machete uses CUTLASS, so it can only be compatible with Nvidia
        if not current_platform.is_cuda():
            return False, "Machete only supported on CUDA"

        if not current_platform.is_device_capability(90):
            return False, "Machete requires compute capability of 90 (Hopper)"

        in_features, out_features = (
            c.partition_weight_shape[0],
            c.partition_weight_shape[1],
        )

        if c.has_g_idx and in_features != c.full_weight_shape[0]:
            return (
                False,
                "Act reordering currently not supported by Machete, "
                "when the input features are partitioned across "
                "devices",
            )

        supported_types = query_machete_supported_quant_types(c.zero_points)
        if c.weight_type not in supported_types:
            return (
                False,
                f"Quant type ({c.weight_type}) not supported by "
                "Machete, supported types are: "
                f"{supported_types}",
            )

        supported_group_sizes = query_machete_supported_group_sizes(c.act_type)
        if c.group_size not in supported_group_sizes:
            return (
                False,
                f"Group size ({c.group_size}) not supported by "
                "Machete, supported group sizes are: "
                f"{supported_group_sizes}",
            )

        return check_machete_supports_shape(in_features, out_features)

    # note assumes that
    #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
    #  `weight_scale`  is: {input_dim = 0, output_dim = 1}
    #  `weight_zp`     is: {input_dim = 0, output_dim = 1, packed_dim = 1}
    def process_weights_after_loading(self, layer: torch.nn.Module):
        """Prepack weights, scales and (optionally) zero points for Machete.

        With act reordering, also stores ``self.act_perm``, the callable used
        by ``apply_weights`` to permute activation columns.
        """
        c = self.config
        if c.has_g_idx:
            assert self.w_gidx_name is not None
            perm = torch.argsort(getattr(layer, self.w_gidx_name)).to(torch.int)

            # use `ops.permute_cols` if possible, plain indexing otherwise
            fast_path_ok = (
                c.act_type in (torch.float16, torch.bfloat16)
                and c.partition_weight_shape[0] % 8 == 0
            )
            if fast_path_ok:
                self.act_perm = partial(ops.permute_cols, perm=perm)
            else:
                self.act_perm = lambda x: x[:, perm]

        def transform_w_q(x):
            assert isinstance(x, BasevLLMParameter)
            permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
            if c.has_g_idx:
                # Reorder rows in unpacked form, then pack again.
                unpacked = unpack_quantized_values_into_int32(
                    x.data, c.weight_type, packed_dim=0
                )
                x.data = pack_quantized_values_into_int32(
                    unpacked[perm, :], c.weight_type, packed_dim=0
                )
            # Same logical shape, different memory order, for the prepack op.
            reordered = x.data.t().contiguous().t()
            x.data = ops.machete_prepack_B(
                reordered,
                a_type=c.act_type,
                b_type=c.weight_type,
                group_scales_type=c.act_type,
            )
            return x

        def transform_w_s(x):
            assert isinstance(x, BasevLLMParameter)
            permute_param_layout_(x, input_dim=0, output_dim=1)
            x.data = x.data.contiguous()
            return x

        def transform_w_zp(x):
            assert isinstance(x, BasevLLMParameter)
            permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=1)
            zp_unpacked = unpack_quantized_values_into_int32(
                x.data, c.weight_type, packed_dim=1
            )
            w_s = getattr(layer, self.w_s_name).data
            # pre-apply scales to zero-points
            x.data = (-1.0 * w_s * (zp_unpacked.to(w_s.dtype))).contiguous()
            return x

        # Repack weights and scales for Machete
        self._transform_param(layer, self.w_q_name, transform_w_q)
        self._transform_param(layer, self.w_s_name, transform_w_s)
        if c.zero_points:
            self._transform_param(layer, self.w_zp_name, transform_w_zp)

    def apply_weights(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Run ``ops.machete_mm`` on ``x`` flattened to 2-D and restore its shape."""
        c = self.config
        w_q, w_s, w_zp, _ = self._get_weight_params(layer)

        result_shape = x.shape[:-1] + (c.partition_weight_shape[1],)
        flat_x = x.reshape(-1, x.shape[-1])

        if c.has_g_idx:
            flat_x = self.act_perm(flat_x)

        if not c.zero_points:
            # No zero points in this config: never forward one to the op.
            w_zp = None
        else:
            assert w_zp is not None

        output = ops.machete_mm(
            a=flat_x,
            b_q=w_q,
            b_type=c.weight_type,
            b_group_zeros=w_zp,
            b_group_scales=w_s,
            b_group_size=c.group_size,
        )

        if bias is not None:
            output.add_(bias)  # In-place add

        return output.reshape(result_shape)
def unpack_rows(packed_w: torch.Tensor, num_bits: int) -> torch.Tensor:
    """Unpack ``num_bits``-wide fields from a row-packed 2-D tensor.

    Each element of ``packed_w`` holds ``32 // num_bits`` fields, lowest bits
    first. Field ``i`` of packed row ``r`` becomes output row
    ``r * (32 // num_bits) + i``.

    Args:
        packed_w: 2-D integer tensor of shape [K // pack_factor, N].
        num_bits: Number of bits per packed element (e.g., 4).

    Returns:
        torch.int8 tensor of shape [K, N].
    """
    fields_per_word = 32 // num_bits
    packed_rows, n_cols = packed_w.shape
    field_mask = (1 << num_bits) - 1

    # Bit offset of every field, shaped [fields_per_word, 1, 1] so that it
    # broadcasts against the [1, packed_rows, n_cols] view of the input.
    bit_offsets = torch.arange(fields_per_word, device=packed_w.device) * num_bits
    fields = (packed_w[None, :, :] >> bit_offsets[:, None, None]) & field_mask
    fields = fields.to(torch.int8)

    # [fields_per_word, packed_rows, n] -> [packed_rows, fields_per_word, n]
    # -> [K, n]
    return fields.transpose(0, 1).reshape(packed_rows * fields_per_word, n_cols)
def pack_cols(x: torch.Tensor, pack_num: int = 8, order_map=None) -> torch.Tensor:
    """Pack groups of ``pack_num`` small integers into one int32 per group.

    Every run of ``pack_num`` consecutive columns is folded into a single
    32-bit word. Slot ``i`` (bits ``i * unit`` to ``(i + 1) * unit - 1``, with
    ``unit = 32 // pack_num``) receives element ``order_map[i]`` of the run.

    Args:
        x: 2-D integer tensor of shape [rows, cols * pack_num]; only the low
            ``unit`` bits of each value are kept. It does not need to be
            contiguous.
        pack_num: Number of values stored in each int32 (8 -> 4-bit slots).
        order_map: Sequence of ``pack_num`` indices giving the source element
            for each slot; defaults to sequential order.

    Returns:
        Tensor of shape [rows, cols], dtype=int32.

    Raises:
        AssertionError: if the column count is not a multiple of ``pack_num``.
    """
    # Default sequential order if none provided
    if order_map is None:
        order_map = list(range(pack_num))
    order = torch.as_tensor(order_map, dtype=torch.long, device=x.device)

    # Number of bits per packed element (e.g., 32 / 8 = 4 bits)
    unit = 32 // pack_num
    rows, cols_pack = x.shape
    assert cols_pack % pack_num == 0, "Number of columns must be a multiple of pack_num"
    cols = cols_pack // pack_num

    # [rows, cols, pack_num]. `reshape` (not `view`) so non-contiguous inputs
    # such as transposed tensors are accepted; widening to int64 keeps the
    # mask and shifts below from overflowing narrow input dtypes.
    grouped = x.reshape(rows, cols, pack_num).to(torch.int64)

    # Reorder elements according to order_map
    reordered = torch.gather(grouped, 2, order.view(1, 1, -1).expand(rows, cols, -1))

    # Keep only the low `unit` bits. The mask follows the slot width, so that
    # pack_num values other than 8 pack correctly (it was fixed at 0xF).
    reordered = reordered & ((1 << unit) - 1)

    # Bit offset of each slot (e.g., [0, 4, 8, 12, 16, 20, 24, 28])
    shifts = (unit * torch.arange(pack_num, device=x.device)).view(1, 1, -1)

    # Slots never overlap, so summing is equivalent to a bitwise OR; the final
    # cast wraps the 32-bit pattern into a signed int32.
    return (reordered << shifts).sum(dim=-1).to(torch.int32)
class MarlinLinearKernel(MPLinearKernel):
    """4-bit mixed-precision linear kernel using ``ops.custom_gptq_marlin_gemm``.

    At load time the quantized weights are unpacked, optionally reordered,
    and re-packed along the output dimension with an interleaved nibble
    order; zero points are packed the same way.
    """

    # Source index for each 4-bit slot when packing 8 values into an int32.
    _PACK_ORDER = (0, 2, 4, 6, 1, 3, 5, 7)

    @classmethod
    def get_min_capability(cls) -> int:
        """Return the minimum compute capability this kernel declares (80)."""
        return 80

    @classmethod
    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
        """Return ``(True, None)`` if ``c`` is supported, else ``(False, reason)``."""
        # Marlin uses inline PTX, so it can only be compatible with Nvidia
        if not current_platform.is_cuda():
            return False, "Marlin only supported on CUDA"

        quant_types = query_marlin_supported_quant_types(c.zero_points)
        if c.weight_type not in quant_types:
            return (
                False,
                f"Quant type ({c.weight_type}) not supported by"
                f" Marlin, supported types are: {quant_types}",
            )

        if c.group_size not in MARLIN_SUPPORTED_GROUP_SIZES:
            return (
                False,
                f"Group size ({c.group_size}) not supported by "
                "Marlin, supported group sizes are: "
                f"{MARLIN_SUPPORTED_GROUP_SIZES}",
            )

        return check_marlin_supports_shape(
            c.partition_weight_shape[1],  # out_features
            c.partition_weight_shape[0],  # in_features
            c.full_weight_shape[0],  # in_features
            c.group_size,
        )

    # note assumes that
    #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
    #  `weight_scale`  is: {input_dim = 0, output_dim = 1}
    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Repack weights, scales and zero points into the custom GEMM layout.

        Raises:
            AssertionError: if the weight type is not 4 bits wide.
            NotImplementedError: for row-packed weights combined with act
                reordering, or for an unknown packed dimension.
        """
        c = self.config
        # The message used to reference a misspelled attribute
        # (`c.weight_typ`), so a failing check raised AttributeError instead.
        assert c.weight_type.size_bits == 4, (
            "MarlinLinearKernel now only support uint4, uint4b8, "
            f"now quant weight_type {c.weight_type}"
        )

        if c.has_g_idx:
            assert self.w_gidx_name is not None
            perm = torch.argsort(getattr(layer, self.w_gidx_name)).to(torch.int)

            self.act_perm = lambda x: x[:, perm]

        pack_order = list(self._PACK_ORDER)

        def transform_w_q(x):
            assert x.data.ndim == 2
            if x._packed_dim == 1:  # CompressedTensorsWNA16
                # [oc, ic // 8] -> [oc, ic]
                x_unpacked = unpack_quantized_values_into_int32(
                    x.data, c.weight_type, packed_dim=1
                )
                if c.has_g_idx:
                    x_unpacked = x_unpacked[:, perm]
                # [oc, ic] -> [ic, oc]
                x_unpacked = x_unpacked.t().contiguous()
            elif x._packed_dim == 0:  # GPTQMarlinLinearMethod
                if c.has_g_idx:
                    # Untested path. The old code first evaluated
                    # `x_unpacked[perm:]`, which fails with a TypeError
                    # before this error could be raised.
                    raise NotImplementedError(
                        "GPTQMarlinLinearMethod has_g_idx not test, "
                        "Please check whether the model's inference results "
                        "are correct, and annotate/modify the statement."
                    )
                # [ic // 8, oc] -> [ic, oc]
                x_unpacked = unpack_rows(x.data, c.weight_type.size_bits)
            else:
                raise NotImplementedError(
                    f"transform_w_q pack_dim {x._packed_dim} not implement"
                )

            # [ic, oc] -> [ic, oc // 8]
            x_packed = pack_cols(x_unpacked, order_map=pack_order)
            x.data = x_packed.contiguous()
            x._packed_dim = 1
            return x

        def transform_w_s(x):
            assert isinstance(x, BasevLLMParameter)
            permute_param_layout_(x, input_dim=0, output_dim=1)
            x.data = x.data.contiguous()
            return x.to(dtype=c.act_type)

        def transform_w_zp(x):
            grouped_k = (
                c.partition_weight_shape[0] // c.group_size
                if c.group_size != -1
                else 1
            )
            x_unpacked = unpack_cols(
                x.clone().t(),
                c.weight_type.size_bits,
                grouped_k,
                c.partition_weight_shape[1],
            )
            x_packed = pack_cols(x_unpacked, order_map=pack_order)
            x.data = x_packed.contiguous()
            return x

        if c.zero_points:
            self._transform_param(layer, self.w_zp_name, transform_w_zp)
        else:
            # weight_type = uint4b8: use c.weight_type.bias as the zero point.
            # NOTE(review): the tensor is shaped like the scale *before*
            # transform_w_s permutes it; this presumably assumes the scale is
            # already [groups, oc] here -- confirm for every quant method.
            w_zp = torch.full_like(
                getattr(layer, self.w_s_name), c.weight_type.bias, dtype=torch.int32
            )
            w_zp_pack = pack_cols(w_zp, order_map=pack_order).contiguous()
            weight_zero_point = torch.nn.Parameter(w_zp_pack, requires_grad=False)

            if hasattr(layer, self.w_zp_name):
                # GPTQMarlinLinearMethod
                replace_parameter(layer, self.w_zp_name, weight_zero_point)
            else:
                # CompressedTensorsWNA16
                layer.register_parameter("weight_zero_point", weight_zero_point)

        self._transform_param(layer, self.w_q_name, transform_w_q)
        self._transform_param(layer, self.w_s_name, transform_w_s)

    def apply_weights(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Run the custom GEMM on ``x`` flattened to 2-D and restore its shape.

        ``bias`` is forwarded to the op rather than added afterwards.
        """
        c = self.config
        w_q, w_s, w_zp, _ = self._get_weight_params(layer)

        pack_factor = 32 // c.weight_type.size_bits

        out_shape = x.shape[:-1] + (c.partition_weight_shape[1],)
        x_2d = x.reshape(-1, x.shape[-1])

        if c.has_g_idx:
            x_2d = self.act_perm(x_2d)

        out = ops.custom_gptq_marlin_gemm(
            input=x_2d,
            qweight=w_q,
            scales=w_s,
            qzeros=w_zp,
            pack_factor=pack_factor,
            group_size=c.group_size,
            bias=bias,
        )
        return out.reshape(out_shape)
None, # input_zp + torch.Tensor | None, # azp_adj + ]: + return ( + getattr(layer, self.w_q_name), + getattr(layer, self.w_s_name), + getattr(layer, self.i_s_name), + getattr(layer, self.i_zp_name), + getattr(layer, self.azp_adj_name), + ) diff --git a/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/model_executor/layers/quantization/kernels/scaled_mm/__init__.py new file mode 100644 index 0000000..dd59e5d --- /dev/null +++ b/model_executor/layers/quantization/kernels/scaled_mm/__init__.py @@ -0,0 +1,97 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os + +from vllm.model_executor.layers.quantization.kernels.scaled_mm.aiter import ( + AiterScaledMMLinearKernel, +) +from vllm.model_executor.layers.quantization.kernels.scaled_mm.cpu import ( + CPUScaledMMLinearKernel, +) +from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import ( + CutlassScaledMMLinearKernel, +) +from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import ( # noqa: E501 + ScaledMMLinearKernel, + ScaledMMLinearLayerConfig, +) +from vllm.model_executor.layers.quantization.kernels.scaled_mm.triton import ( + TritonScaledMMLinearKernel, +) +from vllm.model_executor.layers.quantization.kernels.scaled_mm.xla import ( + XLAScaledMMLinearKernel, +) +from vllm.platforms import PlatformEnum, current_platform + +# in priority/performance order (when available) +_POSSIBLE_KERNELS: dict[PlatformEnum, list[type[ScaledMMLinearKernel]]] = { + PlatformEnum.CPU: [CPUScaledMMLinearKernel], + PlatformEnum.CUDA: [CutlassScaledMMLinearKernel], + PlatformEnum.ROCM: [AiterScaledMMLinearKernel, TritonScaledMMLinearKernel], + PlatformEnum.TPU: [XLAScaledMMLinearKernel], +} + + +def choose_scaled_mm_linear_kernel( + config: ScaledMMLinearLayerConfig, compute_capability: int | None = None +) -> type[ScaledMMLinearKernel]: + """ + Choose an ScaledMMLinearKernel that can implement the 
def choose_scaled_mm_linear_kernel(
    config: ScaledMMLinearLayerConfig, compute_capability: int | None = None
) -> type[ScaledMMLinearKernel]:
    """
    Choose a ScaledMMLinearKernel that can implement the given config for the
    given compute capability. Kernels are tried in the order they are listed
    for the current platform, and the first one that qualifies is returned.

    Args:
        config (ScaledMMLinearLayerConfig): Description of the linear layer
            to be implemented.
        compute_capability (Optional[int], optional): The compute capability of
            the target device, if None uses `current_platform` to get the
            compute capability. Defaults to None.

    Raises:
        ValueError: If no kernel can implement the given config, including
            when no kernels are registered for the current platform.

    Returns:
        type[ScaledMMLinearKernel]: Chosen kernel.
    """

    if compute_capability is None:
        _cc = current_platform.get_device_capability()
        if _cc is not None:
            compute_capability = _cc[0] * 10 + _cc[1]

    # Parse the comma-separated disable list once instead of per kernel.
    disabled_kernels = set(os.environ.get("VLLM_DISABLED_KERNELS", "").split(","))

    # A platform without an entry now falls through to the ValueError below
    # instead of surfacing as a KeyError.
    candidates = _POSSIBLE_KERNELS.get(current_platform._enum, [])

    failure_reasons = []
    if not candidates:
        failure_reasons.append(
            f" no ScaledMM kernels registered for platform {current_platform._enum}"
        )

    for kernel in candidates:
        if kernel.__name__ in disabled_kernels:
            failure_reasons.append(
                f" {kernel.__name__} disabled by environment variable"
            )
            continue

        # If the current platform uses compute_capability,
        # make sure the kernel supports the compute capability.
        if compute_capability is not None:
            kernel_min_capability = kernel.get_min_capability()
            if (
                kernel_min_capability is not None
                and kernel_min_capability > compute_capability
            ):
                failure_reasons.append(
                    f" {kernel.__name__} requires capability "
                    f"{kernel_min_capability}, current compute capability "
                    f"is {compute_capability}"
                )
                continue

        can_implement, failure_reason = kernel.can_implement(config)
        if can_implement:
            return kernel

        failure_reasons.append(
            f" {kernel.__name__} cannot implement due to: {failure_reason}"
        )

    raise ValueError(
        "Failed to find a kernel that can implement the "
        "ScaledMM linear layer. Reasons: \n" + "\n".join(failure_reasons)
    )
Reasons: \n" + "\n".join(failure_reasons) + ) diff --git a/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/ScaledMMLinearKernel.cpython-312.pyc b/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/ScaledMMLinearKernel.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a76db3b138e8c85e8d3f33d53559437bd760128b GIT binary patch literal 3478 zcmb6cTWl0n^v-i<-+in|p~be=$11QY8mcJ}A;kEgAA*K#OeWKryL9U8>~d!+-KJ4O zYC@w);j8c^zYRb7rJsCa{Ap8-n~5eQ_~YL|F(iI@&Yjt97q%E~vge+2&$;KGbI)V$ z{*uk62nsLl30UQr*v2Hp8gfwy22PuXL#2*+E}%W%@NHAK-TJJrd*(E)(jXsGjVf5m zY;zLSYP#+kRjTVjO4qA3Yt}(LqwD8q4JYQvQ?;5C*@L`*+i%Qoh|v* zdZ}*L5BLTMnaqA*aUB3k02+)9MjTvj<>q6tf&8couaCC`Hy6>$JmRb)%) zvL|h!yb7`j_-XLNeJS_}mR!kM=|#Dmf#7Foc4m;<)B$Gpb&#CkJ7p`oby~TH%1L7N zJ(ZRmoH*J=dwz#??kjlSbyr2RdKQ)TO+B3@c$;0dKFw+ri|#xApI1@h81_Ou`k~~F;wwps7OM4U3YK8c1)+J$8m|PWt{>hsEezopE5xPFLPBKS zHsBKS1Y_B=z-PMw6h*cN@gV>n?jDX!%$kO)+bvh-N4r|rA;WcIW=H1aF6ig=y1|Sp z?>hm|UI%S=(s^5-uWuLVGk`X%nQdz(_JY>HK=5FE3CIdrAT!%T6-zA$H{(@L{D52% z7K8<&x8)W@pxSq~AOVFM`B;#FlG~I5RO`D#7F3|r7Oz40eHIh!Iamr*GiufvUH~@M z05C7c&}qJwQs0{*f1z$BM7j7+4Bg-0C;#2!_kOR3fjBv+P<#O z4|NSyi;V&DP5`(_!amYd_&q`+1-GOweOxVCfKYP;id(GPB56A)-ZTiFb`J=4x+V8)~M-g!iFyVFmW zY-f5)BOyEJMB5%2J4o`^U8iVEvZ9=hD@qL#&;SkJXY3`AWiKNrB6tPCs|a2L(2+iU zIyTKHccR|m?a*8w7kmemeF^}YYk_17S597g^SdM0k8EU$e`NaqmpXd==tkz1Sn~J1 z``7pG-^d&Y(`rvDBx*7hJyNhy77Hv56U!jT0tnLl85zAS8OA1%Y}aRdQ2?twQ2bf= zurN%^K!SHD?=}om8(1}FP6wjv2I?vPv}2e`fqI6zUX8K0fMDYYdJyoZ5`IS*zNR?9 zCGR4n(jsXpJx!%Q6a{7U#-aNJrcH^c1q8jJ3}mPR))HCUcQ{0upHAKyyDiGm9tmRuIN|wPy+f+ zGS}3y%bBH2=XAwi*Y?IskKtibP7d9Z)Rg)Ft|d+pj7w1qBxA~CSnE+jq}%^CsK@YQ z#Eu}q?1&=S8Qnmeh~D`}88&?E6CP$lC<%h_P$R4%~u>{74< zDzV!hoVGxnlN-1}lA8@5+M|0*&%G?G6k-=S2ogsBD&HD9kCG%4c~8 z<`tjn&-xwMr?jYnY(NcWgU;Hogw$|0?7%HbM2%*n4jfQoYHPMtjc4NwK&|<+o&t>- z@tX{xBpMhGLVkyG-di_j#G>#fOHK_pa2|xBn~cn7yHGc3MLnn&1^Nhyej6hB!!JdD4wPMpH6tcQ(V@eJ8OC;x@w5y0#&25@G#oST=R-KSpK|YdMyhs5WS1apXJ96YN7!V(unP5@YV5xR=M%8e*3% 
zvG-ynR${1^BzDIK09kQ#wL`vLW)tjkCSThQOb-Bge;up?^wbQv@lSG&FY$RV-s^M# zOeZ@iA|E-1u7Ua1(>RHjd{1jP$hS42`=8csx&<&SarurWds>=n^9V?RPdV>A1AhO@ z5_9ho+1quRN8Mt+lJ8!!oZ@tAX8hePLlyn3-RlB4u@^ zURQz5jMxm^8G@2*i0kA{f$Q54X#~*%;3$!XN#6r(PRQl#)(vvsB1QssHMae()9fyH zen|ozq9O7oh)@)23QT7v#dI8}XxeitqmL2e;ovX+>U77q-n?x|ZHj6g92t z5>fB!TK_EnN&YkK z6U`c(`Eq@4aDIPqxjeXRt*q`1u9d@UmB_$;~tos$RgGyC!L<@otZ z&vzbv`02wnE3BYeto=GY#FirxbsGvzaL6-DkF#bkfBYO-1{zDiWJm`2+V=U*GT&+Sq^$0Ag^%y^U1h$@>b+?7j2&BEur4lI zJxdOzvA-JhU13-Yv^^O6W8`z`@tF0!neyAS<@0k^?_9a#Cx^htu+?@j&nz{3W(0q!KaL~xFEXe%&Ijgy0l-D1B9$>vihKhGV+K+z zLtxejnp}#baL@>u;Zzu4P7N8nDWn7nhmDBYlxkv703Ajue-5dFCR}4{@>9_wlZZSd zgP{cDHOZNR?c_{3m$zZqf^!*DR&+p`4Y~Pi1Nwsr7NAeDxk3{ zqXn}8HE=F?%_mB6Fw1Hoz=Snk6V%{T7@=&S9q`$p$}OObs3EOse4Wp@;4|K_VBmu) zqXx6VX_RHu@VRKJML=rP_u}Vatxe0w6;_z1`-W32%a!HT$T=<*n?{pNDn5)7QLnv^ z+{j{$)->&fu;vu7l_9sXu#xMY4?=el!G#Bg9;N`kN-mDg7`9EwrEjx6j+yVt>-la+ z!KXB*d$O>5Pw(m7T92Wpd-6qRENAt+V;E+SnNu}G)+V(~!O7tsgIuTGbEcqJjy|P0 zI?$b_S8w;&)F5Qj?4fp)%on{@kS&|KC1;eplGY8~DNgNJ@l465sbjERn|R1s^gxWy z1AsgxGn5Gs$ZzDZ^whrr;39Gw8W~?mgI^XsiXSDPxAy_ zW<0tWOGY4#D~39e6RP93A`Fgu5*67AG|8Ss6HFpJlnDHEN?Gyv64uTXbgbFZC{^I7 zG%==U#w6XAtehkH{%8~>yO7W4u%oF`&H|Lxee8`xrd0RSBqqluOTG60E~Q(xqZr`b zWC0>PNyPnbWpS`YRkxKiuq35^ivUs@HmW&}Zk>`we|F@^pbUZr<>SNsa^Jv6|FL~9 zj7o~7E{l_o3=H-65t8I;_)$sKvP!{lh>AvVK_lDz_Q1;9%vn#gXG?a`G&Kk78EK(8 zgN%`(L>LpJ^TN6#jc9jbx%xG z1HrbIYBVgg{3E*VQ4Ga5-;eFA#CG0`{iqrU0V;-K@%y55SCpy&7REe^bv%CbJc_k{ zjX0s@E6UFT^sP;HeKEKn{YmJHu%8MH^d3k3f06f*(@^i*3c!a9_#b?0Z6@HNUx6N7 z=9=%53@W0l4ETY^fe~xi+n5LSYPKy;X!@x!$Ci~WNkK+VQNeH$tQWL3Ba1hHKu?h6 z3`sn)?5prBRUwuDIkkIb#jU!;OTcWCn|OQ{{gZE=30L^;S?%xqk8vwZdd#F3OzVoB zA=h1RJOK2_kHOO}A{UJ}LQb7z;i2CK6_kP{t`sVTOT5dgOaYYySFppcK>(@;r1u$z z$fRw2;cEVz8*oEz*cE=yUh}bU1AQq{YAQwD=zB|Kxqu-3Y4Lxly=}X^giP@JB`4(T!1VfyKNCo%aSd?5NK+$*!On2?8^IG zZD(}vB5*HrdbgQi{WT&h%P+O8wn9mCo5rDrTiWXPsD{WYrI^co2den$vfLIbt95l* zu1KlYxG@}X<93tV3g0!QzEJfM;k8rhB38V^bQ z#KbbDZhmjINOalnst@V=>0zk~YD#_j=t-!xTV+yy*7A^?SFmE*9t*D{FYXt%y5;oB 
zv{djoYHyIl#+1C~anx2lKAnZU<-G8c7p?;w*pp_|Y6XE@o2>Ls^m>ewXv15G(nw;% zounh_(_<&(Go*u+b?Ab+JeBvt%2Zxf)N%YnN-{}ECJA*}xYTc_FS;R~G?oJ((2BWUf0OU{KiriIVNbyXA)B6HEfc^1CY z#YZ93vGIO;veKTMJ#)SDx^ugDul?vu3uWK>KwLBJF6MBnA++|Hi-&)Gcv^TMww^D} z6febR+)L?-D1F>_SA1@^^OMBY#Pzh*+1Op?JLaC6*s%G^ zz~zCDUY~uTvSCM=Z=c(=<%)h;|LFAW(aNS|nP2}E6n&kQu05YeH&&kARb|6Mq#9zI ze=;8?5DLU1k2+9nQ>|j0WoL&!Iezu{wYRQkZoPOrcw7HWe6!5INoyZW9=3^AkC$aj zF*RBCBC-t6!Gb~Hs7zeJ@G(uaFoSA(ET@9$S`H=|5|dH`yrTfsEtFBYoZ+#ygYkNT z>n6B>WqF}NDwCFRUTB2IPW%eNHv0u%Ml%dqPB8dcn58e+gSEC0K>I@{NmRLKJzRJ#pwiGK7lrK9pz2Cs|dcg`fqQ&!`@N<8Y#bWtg?6bYclCG z091X2VYpq_cirs0(R=g2jRRGXx9hg^4f#}C&~u5egah1B=7HFDesX5={JS&nRyi^O z3q|AS_s;CChUt(;Kw-=nmqscrTdI@?>^dTz-!QY`zfFXsrf_nSL~~6uLGt>>q$0!# z=sb~(lTZH8)M82tovpw8Z2!>$Pjyl!WB5mcytd(G{?pK#p_;>_5UVXu-y{6q+7<7| yBY+@vt-T$(D$6j;d;~GgU!u-0(DpCT^IxK!U!a#Cg>;5F%9N#-zd;nDvi=8RtMK0d literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/cpu.cpython-312.pyc b/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/cpu.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d015b9c2843aeb881c2ad7033934e0a753b78a34 GIT binary patch literal 10421 zcmc&aYitzPdNcce%sy<^@7MYPcx|u^wu3KVLV^wSf=vh}P$#$3&CVEhy|0{^O>Ad3 z!y|MU$)gs7^xDrh=ftE1eLB-`m4EWt9n?{-8*t5q^7DO^@n>+{`6PBGqbag zVa$bGRrkm~a~|LM=A3W7_t}4TIBW!zu;I(#nHGZh58P3TG2htzB{Y@^hG57rF^QkL zFd5NJ>Pg(zL)$QEklKc@F=CoDNo`};9I;GVq_!z+jo2n_QrjH1N2p0kYFomNh;z~j zZ7X97yCSYhSHwN(CJ7yJj$rIx5Ddk-?&;-yOjgX2KF9aCGwvfrI~_=Hd@Mr8;v94h zZ1iLNtUzo$><_SX+>iVbmS+(RRtI9yk6Faie2fm<^hcv?_>&;V!i*6FXOU}bzO(yF9VReKOp**SsblpF$r`3~jP47=y*$XI z5k~c_31-Z!g|#wFQv9%D@;)AZ?6^wTEi+3rilM`jSq*$&icz)|SIhY~&($Wx@(C0TVAd}!SUz_RP%RT6p}O(!FF<>P0Ey8D!W@Uez5mPf z@{xFNJQzRB`_T=SKRlB-_LHIMV{C6Y_+f8+mcJQ`_I?}=M|vYMh7Hr~3>!%BG1MEz z!o&4`l<-IS;GCZi0-P+STrVdHA03JGf&ll#XT?guOh4F%nO&_y*+ark9kFY@4d zN5CJYgOPZcjeyFU+h2meR4w(Cfnm+T;pLwb-?>_{t^_pFoV)b&Nis=jQXo&2lrE_YVJ1f& 
z(NMQZf-dX}Ai(0POOm{o=O~hxB>={&h9F?H3`@HPL>ZP1a@9rJUmN(!Uk{L`HJ|s@TvRAv)29D)*+#FC_VC&YR*yaEY&Gco$FQKP`%RJ z38D2wajsjSy4Od)p^i#oUMAo8OH1;79a^s^-@M{45_{U7)VT#d>JiqAz7{kMv0&Ba-zRiK)2 z56oT#viL+lQ&B+!?~kB@F{Gh470%j&i}owKlDOi4(knh0qR4ZLWOPYI5seiEikSF< z6%2u$QkRkwcB$tGLgC+(IcW+VesR8JHFjH6q`~M54*tNL^yWzX%(Nz*PwohhCE%wptKa~srIJ9 zt;*MAhy?bqwJf(+!CHJ$1Y@~plKbF+w<$MUeq(eh-jW>**G+$sqeio=qJi=Aeo-H0qoOk$yAgzlnf@fm@HaUe2XImxfUtavP8?+xlVb=URjh_V4#I|D zfX@5%kT8SD!4g!~V)+*J5&w*6=JElCHAv6J^AScc7@dlVhG3K*5_Nn`G(xNYC@cs@ zMI(wOqKv5L5^%^VaJKP`nfoZgvU99R`2mTa3CW1`kmeI>0*LkVEG>KabRql~D{$Dt zuv7j-7@ks4LqUuXrr|tH7;=D4bO`n-#RCZ2M+j3mdMd%O3=R2)L>OKtM+q`g(XN~p zvU{Ri?m&zbqvJ6+0G{TDXdd~4JSQ4pqnv0({^$*s#)MFs_=wUTMLmFs=KO&;90JPG zhf+&X2iakSLjAF{ zYrCd-C9%fbjogW3Y6jBOZ|(JK19wl{IgxE2e$qa?-Ed^{&|_ov^n`GFBGYg+eeor) zPG(#BgqFU?M(MR@rPuB_3E1U{`4d_D;f($8c75yG{&mm&rj4cyy#B&7*FGF7WnEoQ zTwUAl)|{s<=c!#DTN>N;)GeQ1I-m6%5IhIgKD~SE&aKSBp|6cwo&y=rIN+|Rd~PEk z1T(Jiw?A9`Y@Ppi_rQ1UUF(CJ9S?gR^kmxK+H!k=bS)jXr&p)H+JC=mqbt)qxM!i?i^3m0; zcdu=Aoy>Gyf3DY6RPNY_>egj?iC&xC925=?2~|T`&zpkh&5Y-*tY<{szn%Ea zi7n4a#&c~)PgHo|WEE{dE2_0(5`_lGuyvfaZ%_wb{@*5&uMx`#8}*M7jrflLn1 z;;6if^^T`h?>4nh>chhIfu~g8j+>~cDn_B3hKKeC_H5s~Px{{7u4-Jlx;Aom?9N!G z>S+4xc0=3R;CjdX!yAV)4MXXRx$4H{&{Ama@}}X}qyI4W_hXBpO!ZIG=W^9eYgE=d zAb1Bd-ZzBmH`3?etvPoU#JgE{tKe?kc2_N1mMmF!o8WF+JN9V*mb)$EKK+B);BY>t z@>|X}91) zPPa%Uls7y|A3t9`h%UJIdcF`&?-`^Q-&sgi72g^v1k%N^B-~eM1>M@g?a!}jk3%3$ zhHwH_RTw@0`a3d&aFSG26bC^oEH67j-h@Nippe{^sHKt6Q(TD@4rnzT(U6Aw0V#lj z^8n!z8ER8)6uL#csxPLohaW*7{E*d zj7qFT?2RCYlYWJZL(wStB)EDWp{W;)f$f3q@2G6wTSDJk*s?VBW}2=6B2d## zqPC^XeaI>H-1o%g`-7baFas)Oylrtw%K5XU&(`>-mP7e|wx(04>0Iyl#?mX_a*#99 z_XrfU3{cYu|B}we47`kkgn}l3fqVg-pev^QuMnsSCVV8qYkY-m2^zu3n`6Izi7!)&XO~01c}=FfdrNlCxMa{ z;EjArk0g@LD+FBsNr4?2fZh8jxD&xqOF=im&lNaXch#21H&U!2-G9mo2&-U(EeN^- zl}JVrR(qn6jl}s`DM%FEv;ffRW5YR!G27w#;GmK3?gJ)*&;0z3k zWCTT|@mPZAT+qU7TvFr+25a!S!+WVO&3l}Ou!Oq{mBi1Thq4Dx2?tL-N@Na>3YLzv zK7D1eeUV?e2Civ+Q?_nUs2j{tjx@J8veI$8d$l`TKPc1>?&@{*wt1%n-T_L>;aoU7 
ze|GW6mw%EbrTdU3fojT9Edte&CbymY7N+N?SB|XsR}QR`_stvTr_R2#e%oTtS{emQ zb5*7_#H!%n{Tfd3T4H5NnfSc3p)FRL;)KOm zRY3ft`^TSC;7;H~f~rK}hb6%78sY;2mKRxIP~G^1&y1#FvBXjmHzGfD#X5-z3jsHz z5cr$4%#(kP!M#wy(qfQZ(0M41=RblK2#FB zG___M`-R5-&CyKbaluk2egD?Q%PZuz%d;>$Kf92ePi9?%f(tWEj&k9@miDcd-t_3g zrN6keIRV29=jP90&FL^>&1ueSPFb5*i56N)K+v>kqw~1}Xj3#keHh=OLeCxu`#COu zkEsVzA?(N^Om4``B;aBmyo^{Z454uSiRM|P#>&YZ5%*tdT5{`HOP_vsCKhk)*5&n|xKjF7~B z-~U)`M%y#^NHg;@3%BNP?HHjC8w79TkeK3!LQA=&^X_? zYsWpWgvevuSB41sC5UiOg$NH75#io|gvhDd5%fzC;hqW+9#RlJw?j|7@}oCS{!u_}`Ctz)IE$v>M|RQ=0}wXb60b{)sNzu3a#BvPokD*JaKNAh zhtuObdXgk}Z3OA~Pon)>qUu}X!Vfwfsr$i1kZ+UU65Zbteg8#Ver`IaCv98JM}Hu2 IL!$5h0Sjpn+W-In literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/cutlass.cpython-312.pyc b/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/cutlass.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0393d3b162ca22a78fa363700a0f3ecb566b1aaa GIT binary patch literal 6915 zcmcgRZEO?SwKL-xe~x22aXy@o5RyP}2nk;c>5>GNuhO!~!a_E?VfT#&&xDL)kG(Tv zNV4Oy)uWQrR|)egNmZ55{vZgX1+@}sRjE{}v{e0*S<~7(N}rHkpH}-XuY^DE*E@IY zi5+9;womU}S$EF8_ndprJ?GwgKKw_w+lfJmkpCup)qr9Dff{C^3c&MA0L)<=#^Dic z41I|R9wo*sID#z^GD?k6224h*QQMf!fT@T*>KJnXY~^ebXVf|7jJn2LI7VRK#yI7;+Z%ru~A;(CFraR#l$PT zq%ca12~Dt~$VaXUG7lr(P(qS;QDJ~c8IMWPuW^_+Na+6aAvVHuLql&1BF{=^cnOH~ zz~i|$#H@su9No)C}R}#T6imr z*myhd;K&&phK&;e7#}A&ssz)?S$S6&Bwt>_jpaQU$e$yD-gccFt8!qR>mQ=E!#XV$ z=5)NG-{2W`P?2if&N;7BW8P^pNb60<6H0`Y<;`*|NY^UyNu9h4^lTl`iwj?y1C~Z}m+c;(H^)rQZ0oG7%Gd zuS6oz-e`>DBMd*qhZ0Im>Wv^J%Do>aSWyvXSVaK5NkiRouWTFwCK~MpT*P-zd|I!D z*_o&yG9fn3UJ@dLGCkAtntk=Wj9(2b%jnL0iaqoAo&$jCcxX68AgE9fZ0x)TfH_Rr zkV?1|tAK1CCmEl{r-`6tI7pnD36?R{D)ApD1c{ehrGzL7;^o$u*m`{A#9KW<;sQ`e zR+vq<2{K$Mj)k@t+L9kC87xfEjYI7V}I+tQzaqLxGD8^%n77Np5OQJr{%YDEf zl5bftH5*1}rmTc(%Q~QG;&aq}>%GF>&8HK`#GjY!>k#a7)Mx`9XPW_P*fSiYTdPir@Q=4SKigRO#@Z;+P4k z?iqL?(QETMxh1O%`azSb_${G7QtUM$Q|?KRTBUl>O0VF84p*4l;0#{Cr1$^iqJKGiRSn1g=!>+(vDfTzN>+(;_ 
zZcO1NG|vh8Zrm7T(cUT@$-|t5bCyV^(s_58w=`M(3f?Nba8jU*w+g#9c~h$myIfr+ zr(-X3Tf*r#)nRpI9Zq9^f3;GrOk+WMSO-t?>!ZMyge0vK!^5Y*Pj{Q1Cv#b1ISC>h z6k}1GG9Bl2E5~0ELJ*BfDCW2jwCEO2i0ZaUo{s}y^ak5>8@SP|qDZ>^D)TWTLU5;B zQN+e7IweoAab60=WOoRe@6`WVEVWSQVy#h)=IVj&d=Yl#oBK*FmQYdB=;2hr=t*_NGej!Q_#POI;3Zl}d6G}{{ z0NTF~2n12L$cd;3y?T=6yf>Bo}C$ zzqC-lxObsxiO?Fl($s2m+pUWW7nimzjVv`TAN+0quliTczL!1wk#_c@r_KM8cIFy8 zes*q&Sn@3qiz8a&uCy)Zt)Fw>bkC1J^={7vTGQ6Q%ii*c!O`(`;OHd2?#G&1=J{;P zKCNY6rg8rRR%<+z@~qak%qJGPJJH+GO#Oir{iUmM@xYzKw-3X{zGxpw`!SV0 zdOfLaMraujyUiI`{>(E^&5ciHKgoK!zVLLddbj2LftprG6AJ1+%t!+7-+49}A<8@=t0&R;QXn~#w zhaVkzc;xZL&zavb+Wre#!wBfv*wOrfTLTLNz`@SDJF{K=T30`)=H1biuKrBd`|B3M zSH0%M8rtWs-MqFGcrc;`-_+{g%+?KRb%Vbj`TXMVF0RxKX6nwREgRDfE)3p(E2bk#%KfK(t)S+9#D(dUn2S1 zE%PIbC+-a09?I12OP$KqZJ7(-3@@I0K>p_BZ_oVdOgfyYJDvJ=&Q#alFFboewoPq@ zHe2qtWE%UH4{43Z(l%HMu1(JC&w96M-ff$2&Y|V)#-Tj=!bZBQ*69MD*`{u-se6h0 z6TLs5F2JXB@{+*HA@uZ{1stV(!r%&oj&m3otQB*?NnB6qTl0zjv1dOCJ=}OWZmUN4UzUUP#hKw=0Y|D=R?dF6)A( z>AtE~)uuY`>4-QVw2ZNMuIE^?Iw!L*30?xd$=wW^or1!tO6T`Hy0svR6l z%YEV=O0PgUkCjlOyiZvvp|pTRDAJU9Re zNKF}5UAI6|SoamOc!r@e(tOe`m`blLUZtRZmAwqQ$rXSKOC+q_R}-uGbdqyC5ezZlHz z*`M2UAou3+b(^K-BtAQAP;oFV?Oky-J*y2Y`d4bVr%rv@*m|>LkxLz4t*Oh_bZa%; zIWOG)np{H@`Zc!a8n)*A^|{)>y3gUSTEiT0>HV0uChO_YJRLdOldhQ$E$v?pKIvE) zy_g-nq>WyJ7t!ln!w84dcngB~aL}jQ8N(G~7~RPf+!=srh5@}tkk1G$R|sWt!TFkq zaiBF(k?l6S?J)p{zwSZbsX31Her`zIMt4*#rceF zQR19Cch>cZ#m*4s6AJ46+t1Hh9l}lpe#@P+vv)GumkXQ2;P!I}pP(joP*WnZgNf3T z9BEa#l^ZoxIF*S8ngY>DpQ9`l5Nb*LLElmwwa2pPXr#38YJRb2TNv&Ua@N6)(o(LX z7Iq4&3Oxl!P6 zFKbrrSYsq=Gv5}_^ezGErUK*|0#K#tM`wV)DbSN`%DFH%{+??@Uf2lTa4fRe9Ervj z+sEITym*{6sC%IicB1*fZ+u3n*YE;|P?IbYyB!6rK?SC~aj9+jk-KC?F4$d6@#hVm zc7^GAjVKL!JnY1S;N0|F-?XjJy5Leb>MV`zp4Iq&&HEu>F8X~1J*W?{Yeqk2JTmlAFD zSdQz@)#X^>gtpi#;4tnpONJCx4B5}`3m|?uXGC= z`q&Nfi+(K27%#@k1=r$sZ-0q4PIG=8n3r-^wsKyUmgTL%JcF2+X8@y^cX7oEL)zKO zxvsJUaGEB`4hl}pBQ^pg{a_4Od8!7S?FsTW#{v;99)#4MmQihA_wq({^!oW5@85`i zsvcXPHuh{OB>aLEW(C%&1uGkJUR&WYP65^y(hH+m+vq#wOjf83|36A!9GYm42*wJY5# 
zi3&2jQXW{F>P{sEA?XP8v`fF9-RM7@hzsx~6{~9_-I2daAU#Gv(ymGqsq3r`X~g|v zK96txqIot53NWTcuQSnV@z73xjr=B6$QxE?Dnj$X7 literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/xla.cpython-312.pyc b/model_executor/layers/quantization/kernels/scaled_mm/__pycache__/xla.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cea8b87bffa9507b792c12303d27f3aebb2bf837 GIT binary patch literal 4859 zcmbVPO>7&-6`uVgmlQ=xmSXuwk=BwNn6W}ewo)sxonPBcV@pwDIcXUKi#2yBt+GG# z?9vhmDlrP9kdT9vla*6spnz>SK;@Hj6naY0i)b03x^N2vNm2O5MoJF7v~QMNiBhGa z$w-`eGxOfe+c$5%_vX)yjbQ{u=Rec>VhDXpJAM&r#M(FzH<5}|Mn|b?%cL0SvpQ>V zDUP9IoX#6UO7Li24;aBz(4z%iG(xEm&;d25hmCM5Y&4`A7{sDqB2~PB)DRBeSe3oO->b2t1Q_LT0A_+%#37Y01QE)x`(ZxAt3n-TSPkHY z@qIPHx@P=9U8hE>jLL(2@CKiXsv^jB?7$P+WxT2buZBUo>1RtffOK;Rsf|BL@1LbB zSZ~KK%!q2_hLGBm=i^bg?bjFH*m{1oQw!xR14eEm~stEiUdxV##u+QGC&&HoP(vbl2xu~y5{72C1*^tCGgNO zOO@;~E2pcHX*tpr%B6$rCBQvfhAih-MpnlL_$XG-5@Jz*=F+-tQ!S)_sY%DkCbL@h z1xF#H*m+?xcjD!Nk5AyFu3brH^RT>`yr%0$(y&yl%Xku}bB;xlI`v&Uc{Qh)j#f|{ z4fsCxyxdOOp6|C_KN5ITQcQ+XAZEtx#P@-$f5qSf=sC zHI9pgD~Jn)*I*pEOnNBIQmR%c?b*KwM7{mT>$z&o{$;Y6JX2)G`Py(@Lzp}ZGl9CH z8VNES%!oVA@OdE~7>cv!3O&B(z*kEIU(IO*PMYLQQ!_^;%ap(k5|lByBcqx(6+=r) zO4`w`dB&DJBio)#-;OeN-T?O?yGkW?l&ZrwX##7bV@_?SI6E>F7YH?h8_;ao$>!X> z(Ao~@la@6TJXE&RW8iCUtk&~2aHGIjw{n~%5CRbK4wfE>B#|b%aAR6AWzF~4!r|R~ zWD6UGFM9wgaF1ALIrdyB_S~b`k=2&hpEN|u4c!kLx>sAyKZ!M!W75NzwAwQ0(VY)t zovSVHJc;cmd+6SR=h9oeh@PtFMSlb(>%FMR6w!{JRP(MW*70K++Kt!Q@TQ%-XS@OS z-d2k2cx2lUbXDdn_FipYKe{o5od-)?-7A3x}GZ?-D$>pR77v+NF<{tV0VKf$uk z`g(bm+9@!ix^vjdE%~yWt-3pR*Cv-wZQ(xbkB&d*Q=6?SsDZD7{;4r2kK*FcIe`C0 zAAjUgqNoJ&1@!daP~cjRnij!@%fke#5`-5;8ulUpjYV+JO(!9JIEGCrXT$u2=8Q=Y z3esZFilexkj!ieJTcaAp8F@leoiUrr+n&4|0Bs*X3v>rfv_Rl}Fz~64X)wt4}m<~J^_pe5p=lXA+{`~as58pm= z>qxn)ztq*g_`!pZ?tipAd}+C>f2He#b&d^%*PGD6j(J?}I9BR7wsPS3qEb3=VmkJ? 
ztz$m7px!oa87po5(~-Y499ZbTee%}Ha`)+n-KQTP>|5+z63VZSlwKcMIe2OMLM4Fq z^_1I=mfDWqd2^*LIUT8Lu^MTb9h@0_9BH0?d*lg__G^ zXG!c_jkK<|k{<=g5~>i-OuGCPO|jDy)cryeArIs!9^VAGzKHHJLwC_#j`YC@`30>` zKxI>|jU#L@EpQf9rt1~DcRR2;rg!NQ^s-$?tFfkO(Nk%QCKLocO(@D!O^R#*sEdfa z3=MgORwtp_s))Fug|wR%KF9l1m~^C4SMPPLUr`i_!|(Q1{$wrtUd#6 ziyL|DW$^7OeoB}M&_8%MAf1^CIvXqQlr{bT5f37Upt4_a_iF1+iB6-hN0C!ej_Mcn zM>bm+-684~#UeM#`fFW8aejzUtB=rRxt7afc{9t3hul3fO-O5(*?Se*ZjXmfGR<2{lTTTzG+=Lxe|MKS$x-X zLeIfCfR}hek&pc?q-@e3``p&G;ekH&N z`{oaS+51H=o!@r{|3Sa2e?3x4yg}#pRifx{{GXv#VVHRmYn`2(nVkJ}=F^Hm2LPY} zgHO$zssz2Rh=4<%(dS1>@XqpNDq$3k&9=|9uQyPZ>OoO!#2ZPH8pZSS?o7^1S95`_ z7s7&*v2;13TN9qAyD!kkWMb=MGEsd@Ch8xPiJYV9b~PCo_(?K}i2-Fmu8Mo%KkAA5 zZ*1Fp!VsF;d+t&_+O5a!il?ek{8f?yJ_>&}jRh5sVVE@$F-_m1$X`+a-_iLWSe9Xb b2q32SALzaHz=sSo%q({t`vG~BXRiMMCDV`& literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/kernels/scaled_mm/aiter.py b/model_executor/layers/quantization/kernels/scaled_mm/aiter.py new file mode 100644 index 0000000..038a92c --- /dev/null +++ b/model_executor/layers/quantization/kernels/scaled_mm/aiter.py @@ -0,0 +1,120 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import torch + +from vllm import _custom_ops as ops +from vllm._aiter_ops import rocm_aiter_ops +from vllm.platforms import current_platform + +from .cutlass import CutlassScaledMMLinearKernel +from .ScaledMMLinearKernel import ScaledMMLinearLayerConfig + + +class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel): + @classmethod + def get_min_capability(cls) -> int: + return 90 + + @classmethod + def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]: + if not current_platform.is_rocm(): + return ( + False, + "AiterScaledMMLinearKernel requires `aiter` which is not " + + "currently supported on non-ROCm platform.", + ) + + try: + import aiter # noqa: F401 # deliberately attempt to import aiter + except Exception: + return ( + False, + 
"AiterScaledMMLinearKernel requires `aiter` which is not " + + "installed on ROCm.", + ) + # Check if rocm_aiter_gemm_w8a8_scaled_mm is enabled + if not (rocm_aiter_ops.is_linear_enabled()): + return ( + False, + "AiterScaledMMLinearKernel is disabled. " + + "Enable by setting `VLLM_ROCM_USE_AITER=1` " + + "and `VLLM_ROCM_USE_AITER_LINEAR=1`. " + + "`VLLM_ROCM_USE_AITER_LINEAR` default is True.", + ) + + if not c.input_symmetric: + return ( + False, + "AiterScaledMMLinearKernel only supports symmetric " + "quantization.", + ) + return True, None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + super().process_weights_after_loading(layer) + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + """ + `AiterScaledMMLinearKernel` implements a fused version of + `output = torch.mm((scale_a * a), (scale_b * b)).to(out_dtype)` + where scale_a * a and scale_b * b are implemented using numpy-style + broadcasting. + Currently only support per-tensor-per-tensor GEMM + and per-token-per-channel GEMM through AITER + w8a8 scaled gemm. `AiterScaledMMLinearKernel` also does not support + ATIER block scaled GEMM and mix-precision GEMM. + """ + w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer) + + # ops.scaled_int8_quant supports both dynamic and static quant: + # * dynamic, i_s is None and x_s computed from x. + # * static, i_s is scalar and x_s is i_s. + symmetric = azp_adj is None + assert symmetric, ( + "AiterScaledMMLinearKernel only supports symmetric quantization." + ) + x_q, x_s, x_zp = ops.scaled_int8_quant(x, i_s, i_zp, symmetric=symmetric) + + assert x_zp is None, ( + "AiterScaledMMLinearKernel only supports symmetric quantization." 
+ ) + out_dtype = x.dtype + + assert w_q.shape[0] % 16 == 0 and w_q.shape[1] % 16 == 0 + assert out_dtype is torch.bfloat16 or out_dtype is torch.float16 + assert bias is None or bias.shape[0] == w_q.shape[1] and bias.dtype == out_dtype + + m = x_q.shape[0] # a + n = w_q.shape[1] # b + + per_tensor_scale_a = x_s.numel() == 1 + per_tensor_scale_b = w_s.numel() == 1 + per_token_scale_a = x_s.numel() == m + per_channel_scale_b = w_s.numel() == n + + # @TODO: + # Maybe broadcast the per-tensor-scale into per-channel-scale + # if one of the scale is a per-channel-scale. + # For now, it only supports: + # - per-tensor-per-tensor a8w8 scaled GEMM, and + # - per-token-per-channel a8w8 scaled GEMM + assert (per_tensor_scale_a and per_tensor_scale_b) or ( + per_token_scale_a and per_channel_scale_b + ), ( + "Currently only support per-tensor-per-tensor GEMM " + + " and per-token-per-channel GEMM through AITER" + " w8a8 scaled gemm. `AiterScaledMMLinearKernel` " + + "does not support AITER block scaled GEMM." 
+ ) + + # gemm_a8w8_CK(a, b, scale_a, scale_b, bias) expects + # a to be [M, K] + # b to be [N, K] + # CutlassScaledMMLinearKernel prepare weight `w_q` in [K, N] format + return rocm_aiter_ops.gemm_a8w8(x_q, w_q.t(), x_s, w_s, bias, out_dtype) diff --git a/model_executor/layers/quantization/kernels/scaled_mm/cpu.py b/model_executor/layers/quantization/kernels/scaled_mm/cpu.py new file mode 100644 index 0000000..feb1e0b --- /dev/null +++ b/model_executor/layers/quantization/kernels/scaled_mm/cpu.py @@ -0,0 +1,219 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import torch + +from vllm import _custom_ops as ops +from vllm import envs +from vllm.model_executor.layers.quantization.utils import replace_parameter +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + convert_to_channelwise, +) +from vllm.model_executor.layers.utils import check_cpu_sgl_kernel +from vllm.platforms import current_platform +from vllm.platforms.interface import CpuArchEnum + +from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfig + + +class CPUScaledMMLinearKernel(ScaledMMLinearKernel): + @classmethod + def get_min_capability(cls) -> int: + return 75 + + @classmethod + def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]: + if not current_platform.is_cpu(): + return False, "CPUScaledMM requires running on CPU." 
+ + return True, None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + weight = getattr(layer, self.w_q_name) + dtype = weight.dtype + N, K = weight.size() + if ( + current_platform.get_cpu_architecture() == CpuArchEnum.X86 + and envs.VLLM_CPU_SGL_KERNEL + and self.config.input_symmetric + and check_cpu_sgl_kernel(N, K, dtype) + ): + self.linear_method = self._apply_weights_sgl + self.process_weights_for_sgl(layer) + else: + self.linear_method = self._apply_weights_onednn + self.process_weights_for_onednn(layer) + + def process_weights_for_onednn(self, layer: torch.nn.Module) -> None: + # WEIGHT + # Transpose to [K, N] for convenience + weight = getattr(layer, self.w_q_name) + replace_parameter( + layer, + self.w_q_name, + torch.nn.Parameter(weight.t().data, requires_grad=False), + ) + + # WEIGHT SCALE + # oneDNN kernels support only per-tensor and per-channel. + # If we have a fused module (QKV, MLP) with per tensor scales (thus N + # scales being passed to the kernel), convert to the per-channel case. 
+ is_fused_module = len(layer.logical_widths) > 1 + weight_scale = getattr(layer, self.w_s_name) + if is_fused_module and not self.config.is_channelwise: + weight_scale = convert_to_channelwise(weight_scale, layer.logical_widths) + replace_parameter( + layer, + self.w_s_name, + torch.nn.Parameter(weight_scale.data, requires_grad=False), + ) + + # INPUT SCALE + if self.config.is_static_input_scheme: + input_scale = getattr(layer, self.i_s_name) + + if self.config.input_symmetric: + replace_parameter( + layer, + self.i_s_name, + torch.nn.Parameter(input_scale.max(), requires_grad=False), + ) + setattr(layer, self.i_zp_name, None) + else: + input_zero_point = getattr(layer, self.i_zp_name) + + # reconstruct the ranges + int8_traits = torch.iinfo(torch.int8) + azps = input_zero_point.to(dtype=torch.int32) + range_max = (input_scale * (int8_traits.max - azps)).max() + range_min = (input_scale * (int8_traits.min - azps)).min() + + scale = (range_max - range_min) / (int8_traits.max - int8_traits.min) + replace_parameter( + layer, self.i_s_name, torch.nn.Parameter(scale, requires_grad=False) + ) + + azp = ( + (int8_traits.min - range_min / scale).round().to(dtype=torch.int32) + ) + replace_parameter( + layer, self.i_zp_name, torch.nn.Parameter(azp, requires_grad=False) + ) + + else: + setattr(layer, self.i_s_name, None) + setattr(layer, self.i_zp_name, None) + + # Different from cutlass, oneDNN kernels only need the AZP adjustment + # term for dynamic quantization. And s_b should be folded into the + # term. 
Such as: + # s_a * s_b * [(A - zp_a)B] + bias = + # s_a * (s_b * AB) - s_a * s_b * zp_a * B + bias = + # s_a * GEMM_output - s_a * zp_a * adj + bias + if not (self.config.input_symmetric and self.config.is_static_input_scheme): + weight = getattr(layer, self.w_q_name) + weight_scale = getattr(layer, self.w_s_name) + azp_adj = weight.sum(dim=0, keepdim=True, dtype=torch.float32) + azp_adj = azp_adj * weight_scale.squeeze() + setattr( + layer, + self.azp_adj_name, + torch.nn.Parameter(azp_adj, requires_grad=False), + ) + else: + setattr(layer, self.azp_adj_name, None) + + weight = getattr(layer, self.w_q_name) + self.dnnl_handler = ops.create_onednn_scaled_mm( + weight, + getattr(layer, self.w_s_name), + torch.get_default_dtype(), + getattr(layer, self.i_s_name) is None, + not self.config.input_symmetric, + 32, + ) + # weight is prepacked and maintained by the dnnl_handler, + # release the original weight + setattr(layer, self.w_q_name, None) + del weight + + def process_weights_for_sgl(self, layer: torch.nn.Module) -> None: + # WEIGHT + weight = getattr(layer, self.w_q_name) + packed_weight = torch.ops._C.convert_weight_packed(weight) + replace_parameter( + layer, self.w_q_name, torch.nn.Parameter(packed_weight, requires_grad=False) + ) + + if layer.bias is not None: + bias = layer.bias + layer.register_parameter( + "bias_fp32", torch.nn.Parameter(bias.float().data, requires_grad=False) + ) + + # WEIGHT SCALE + # CPU SGL kernels only support per-channel. + # For per-tensor quant, convert to the per-channel case. 
+ weight_scale = getattr(layer, self.w_s_name) + if not self.config.is_channelwise: + weight_scale = convert_to_channelwise(weight_scale, layer.logical_widths) + replace_parameter( + layer, + self.w_s_name, + torch.nn.Parameter(weight_scale.data, requires_grad=False), + ) + + setattr(layer, self.i_s_name, None) + setattr(layer, self.i_zp_name, None) + setattr(layer, self.azp_adj_name, None) + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + return self.linear_method( + layer, + x, + bias, + ) + + def _apply_weights_onednn( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer) + + # ops.scaled_int8_quant supports both dynamic and static quant: + # * dynamic, i_s is None and x_s computed from x. + # * static, i_s is scalar and x_s is i_s. + x_q, x_s, x_zp = ops.onednn_scaled_int8_quant( + x, i_s, i_zp, self.config.input_symmetric + ) + + m = x.size(0) + n = self.dnnl_handler.n + out = torch.empty((m, n), dtype=x.dtype) + ops.onednn_scaled_mm(self.dnnl_handler, x_q, out, x_s, x_zp, azp_adj, bias) + + return out + + def _apply_weights_sgl( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + w_q, w_s, _, _, _ = self._get_weight_params(layer) + return torch.ops._C.int8_scaled_mm_with_quant( + x, + w_q, + w_s, + layer.bias_fp32 if bias is not None else None, + x.dtype, + True, + ) diff --git a/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py new file mode 100644 index 0000000..5879f5a --- /dev/null +++ b/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py @@ -0,0 +1,160 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import torch + +from vllm import 
_custom_ops as ops +from vllm.model_executor.layers.quantization.utils import replace_parameter +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + convert_to_channelwise, +) +from vllm.platforms import current_platform + +from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfig + + +class CutlassScaledMMLinearKernel(ScaledMMLinearKernel): + @classmethod + def get_min_capability(cls) -> int: + return 75 + + @classmethod + def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]: + if not current_platform.is_cuda(): + return False, "CutlassScaledMM requires running on CUDA." + + return True, None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + # WEIGHT + # Cutlass kernels need transposed weight. + weight = getattr(layer, self.w_q_name) + self.format = "TN" #默认weight都是按T排布 + m, k = weight.shape + if(m % 64 == 0 and k % 64 == 0): + self.format= "NN" + replace_parameter( + layer, self.w_q_name, + torch.nn.Parameter(weight.t().data.contiguous(), requires_grad=False))#原始排布是T[m,k] 处理完后是N[k, m] + else: + if k % 64 != 0: + pad_k = (k // 64 + 1) * 64 + weight_pad = torch.empty((m, pad_k), dtype=weight.dtype, device=weight.device) + _weight = weight_pad[:, :k] + _weight.copy_(weight) + weight = _weight + replace_parameter( + layer, self.w_q_name, + torch.nn.Parameter(weight.t(), requires_grad=False)) + + + + # WEIGHT SCALE + # Cutlass kernels support only per-tensor and per-channel. + # If we have a fused module (QKV, MLP) with per tensor scales (thus N + # scales being passed to the kernel), convert to the per-channel case. 
+ is_fused_module = len(layer.logical_widths) > 1 + weight_scale = getattr(layer, self.w_s_name) + if is_fused_module and not self.config.is_channelwise: + weight_scale = convert_to_channelwise(weight_scale, layer.logical_widths) + replace_parameter( + layer, + self.w_s_name, + torch.nn.Parameter(weight_scale.data, requires_grad=False), + ) + + # INPUT SCALE + if self.config.is_static_input_scheme: + input_scale = getattr(layer, self.i_s_name) + + if self.config.input_symmetric: + replace_parameter( + layer, + self.i_s_name, + torch.nn.Parameter(input_scale.max(), requires_grad=False), + ) + setattr(layer, self.i_zp_name, None) + else: + input_zero_point = getattr(layer, self.i_zp_name) + + # reconstruct the ranges + int8_traits = torch.iinfo(torch.int8) + azps = input_zero_point.to(dtype=torch.int32) + range_max = (input_scale * (int8_traits.max - azps)).max() + range_min = (input_scale * (int8_traits.min - azps)).min() + + scale = (range_max - range_min) / (int8_traits.max - int8_traits.min) + replace_parameter( + layer, self.i_s_name, torch.nn.Parameter(scale, requires_grad=False) + ) + + # AZP loaded as int8 but used as int32 + azp = (int8_traits.min - range_min / scale).to(dtype=torch.int32) + replace_parameter( + layer, self.i_zp_name, torch.nn.Parameter(azp, requires_grad=False) + ) + + else: + setattr(layer, self.i_s_name, None) + setattr(layer, self.i_zp_name, None) + + # azp_adj is the AZP adjustment term, used to account for weights. + # It does not depend on scales or azp, so it is the same for + # static and dynamic quantization. 
+ # For more details, see csrc/quantization/w8a8/cutlass/Epilogues.md + # https://github.com/vllm-project/vllm/blob/main/csrc/quantization/w8a8/cutlass/Epilogues.md + if not self.config.input_symmetric: + weight = getattr(layer, self.w_q_name) + azp_adj = weight.sum(dim=0, keepdim=True, dtype=torch.int32) + if self.config.is_static_input_scheme: + # cutlass_w8a8 requires azp to be folded into azp_adj + # in the per-tensor case + azp_adj = getattr(layer, self.i_zp_name) * azp_adj + setattr( + layer, + self.azp_adj_name, + torch.nn.Parameter(azp_adj, requires_grad=False), + ) + else: + setattr(layer, self.azp_adj_name, None) + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer) + + # ops.scaled_int8_quant supports both dynamic and static quant: + # * dynamic, i_s is None and x_s computed from x. + # * static, i_s is scalar and x_s is i_s. + symmetric = azp_adj is None + if isinstance(x, tuple): + x_q, x_s, out_dtype = x + x_zp = None + else: + out_dtype = x.dtype + x_q, x_s, x_zp = ops.scaled_int8_quant(x.contiguous(), + i_s, + i_zp, + symmetric=symmetric) + + if x_zp is not None: + # Currently, static is always per-tensor and dynamic is per-token + static = i_zp is not None + azp = None if static else x_zp + return ops.cutlass_scaled_mm_azp( + x_q, + w_q, + scale_a=x_s, + scale_b=w_s, + out_dtype=out_dtype, + azp_adj=azp_adj, + azp=azp, + bias=bias, + ) + return ops.cutlass_scaled_mm( + x_q, w_q, scale_a=x_s, scale_b=w_s, out_dtype=out_dtype, bias=bias, format=self.format + ) diff --git a/model_executor/layers/quantization/kernels/scaled_mm/triton.py b/model_executor/layers/quantization/kernels/scaled_mm/triton.py new file mode 100644 index 0000000..3f4ec7f --- /dev/null +++ b/model_executor/layers/quantization/kernels/scaled_mm/triton.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: 
class XLAScaledMMLinearKernel(ScaledMMLinearKernel):
    """Scaled int8 matmul kernel backed by the XLA `quantized_matmul_int8` op.

    `can_implement` accepts only TPU platforms with dynamic, symmetric
    activation quantization and channelwise weight scales.
    """

    @classmethod
    def get_min_capability(cls) -> int:
        """Always raise: compute capability is not defined for this kernel.

        Raises:
            NotImplementedError: unconditionally.
        """
        raise NotImplementedError(
            "TPU platform does not have a concept of compute capability, "
            "this method should not be called."
        )

    @classmethod
    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
        """Report whether this kernel supports the layer config `c`.

        Returns:
            `(True, None)` when supported, otherwise `(False, reason)`.
        """
        if not current_platform.is_tpu():
            return False, "ScaledMMXLA requires running on TPU."

        if c.is_static_input_scheme:
            return False, "ScaledMMXLA requires dynamic activation scales."

        if not c.input_symmetric:
            return False, "ScaledMMXLA requires symmetric activation scales."

        if not c.is_channelwise:
            return False, "ScaledMMXLA requires channelwise weight scales"

        return True, None

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Re-register weight and weight scale as frozen parameters.

        Also clears the activation scale / zero-point attributes on `layer`
        and installs a warnings filter for the `cond` call made in
        `apply_weights`.
        """
        # WEIGHT
        # [out, in] (different than cutlass_scaled_mm)
        weight = getattr(layer, self.w_q_name)
        replace_parameter(
            layer, self.w_q_name, torch.nn.Parameter(weight.data, requires_grad=False)
        )

        # WEIGHT SCALE
        # XLA kernels support only per-tensor and per-channel.
        # If we have a fused module (QKV, MLP) with per tensor scales (thus N
        # scales being passed to the kernel), convert to the per-channel case.
        # NOTE(review): can_implement() rejects non-channelwise configs, so
        # this conversion looks unreachable when the kernel is selected via
        # can_implement -- confirm before removing.
        is_fused_module = len(layer.logical_widths) > 1
        weight_scale = getattr(layer, self.w_s_name)
        if is_fused_module and not self.config.is_channelwise:
            weight_scale = convert_to_channelwise(weight_scale, layer.logical_widths)

        # [out_channel,] (different than cutlass_scaled_mm)
        weight_scale = weight_scale.squeeze(-1)
        replace_parameter(
            layer,
            self.w_s_name,
            torch.nn.Parameter(weight_scale.data, requires_grad=False),
        )

        # Only support symmetric dynamic activation quantization.
        setattr(layer, self.i_s_name, None)
        setattr(layer, self.i_zp_name, None)
        setattr(layer, self.azp_adj_name, None)

        # Filter warning for cond usage in apply_weights. It is okay
        # to specialize the graph since bias is not dynamic.
        warnings.filterwarnings(
            "ignore",
            message="Pred is a Python constant. When used with torch.cond, it specializes on one of the branches.",  # noqa: E501
        )

    def no_add_bias(self, x: torch.Tensor, bias: torch.Tensor | None):
        """`cond` branch used when there is no bias: return `x` unchanged."""
        return x

    def add_bias(self, x: torch.Tensor, bias: torch.Tensor | None):
        """`cond` branch used when a bias is present: return `x + bias`."""
        return x + bias

    def apply_weights(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Run the quantized matmul on `x` and optionally add `bias`.

        Activations are quantized by the op itself
        (`quantize_activation=True`).
        """
        w_q, w_s, _, _, _ = self._get_weight_params(layer)

        # Required to register custom ops.
        import torch_xla.experimental.custom_kernel  # noqa: F401

        out = torch.ops.xla.quantized_matmul_int8(
            x,
            w_q,
            w_s,
            quantize_activation=True,
        )

        # Explicitly capture control flow to make dynamo happy.
        # https://pytorch.org/docs/main/generated/exportdb/index.html#cond-branch-class-method # noqa: E501
        return cond(bias is None, self.no_add_bias, self.add_bias, [out, bias])
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
    """Resolve the loaded q/k/v/prob scales and store them on `layer`.

    Reads the temporary `q_scale`, `k_scale`, `v_scale` and `prob_scale`
    parameters, writes the resolved values into `layer._q_scale`,
    `layer._k_scale`, `layer._v_scale`, `layer._prob_scale` (plus the
    `_*_scale_float` mirrors), then deletes the temporary parameters.
    A negative loaded value is treated as "not provided".

    Raises:
        ValueError: if a resolved scale is not a single float value.
    """
    # If the kv-cache dtype is auto, we enforce the k/v_scale to be 1.0
    # regardless whether the kv-scale is available in the checkpoint.
    # No need to process kv scales after loading if we are going to
    # calculate them on the fly.
    if layer.kv_cache_dtype != "auto" and not layer.calculate_kv_scales:
        if layer.k_scale > 0.0 and layer.v_scale > 0.0:
            # We prefer to use separate k_scale and v_scale if present
            k_scale = layer.k_scale.to("cpu").tolist()
            v_scale = layer.v_scale.to("cpu").tolist()
            if current_platform.is_fp8_fnuz():
                k_scale *= 2
                v_scale *= 2
        elif layer.k_scale < 0.0 and layer.v_scale < 0.0:
            # If no scales were loaded (both scales are invalid negative
            # values), use the default value of 1.0
            k_scale = 1.0
            v_scale = 1.0
        else:
            # If we find a single kv_scale in the checkpoint, we remap
            # kv_scale to k_scale during weight loading, and duplicate
            # k_scale to v_scale here
            assert layer.k_scale > 0.0
            scale_to_duplicate = max(layer.k_scale, layer.v_scale)
            k_scale = scale_to_duplicate.to("cpu").tolist()
            v_scale = scale_to_duplicate.to("cpu").tolist()
            if current_platform.is_fp8_fnuz():
                k_scale *= 2
                v_scale *= 2

        if not isinstance(k_scale, float) or not isinstance(v_scale, float):
            raise ValueError(
                "Only support per-tensor scaling factor for fp8 KV cache"
            )

        if layer.q_scale < 0.0:
            logger.warning_once(
                "Checkpoint does not provide a q scaling factor. "
                "Setting it to k_scale. This only matters for "
                "FP8 Attention backends (flash-attn or flashinfer)."
            )
            layer._q_scale.copy_(k_scale)
            layer._q_scale_float = k_scale

        # These are used in the final Attention.forward()
        layer._k_scale.copy_(k_scale)
        layer._v_scale.copy_(v_scale)
        layer._k_scale_float = k_scale
        layer._v_scale_float = v_scale
        if k_scale == 1.0 and v_scale == 1.0 and "e5m2" not in layer.kv_cache_dtype:
            logger.warning_once(
                "Using KV cache scaling factor 1.0 for fp8_e4m3. "
                "If this is unintended, verify that k/v_scale "
                "scaling factors are properly set in the checkpoint."
            )

    if layer.q_scale > 0.0:
        q_scale = layer.q_scale
        if current_platform.is_fp8_fnuz():
            q_scale *= 2
        layer.calculate_kv_scales = False
    else:
        q_scale = 1.0
    if layer.prob_scale > 0.0:
        prob_scale = layer.prob_scale
        if current_platform.is_fp8_fnuz():
            prob_scale *= 2
    else:
        prob_scale = 1.0

    def is_singleton_float(x) -> bool:
        # A Python float, or a one-element floating-point tensor.
        return (
            isinstance(x, float)
            or isinstance(x, torch.Tensor)
            and x.numel() == 1
            and x.is_floating_point()
        )

    if not is_singleton_float(q_scale) or not is_singleton_float(prob_scale):
        raise ValueError(
            "Only support per-tensor scaling factor for fp8-quantized Q/prob"
        )

    # These are used in the final Attention.forward()
    # NOTE(review): when the checkpoint has no q scale, q_scale is 1.0 here,
    # so this copy overwrites the k_scale fallback written to _q_scale in
    # the KV branch above. Behaviour kept as-is; confirm which is intended.
    layer._q_scale.copy_(q_scale)
    layer._q_scale_float = (
        q_scale.item() if isinstance(q_scale, torch.Tensor) else q_scale
    )

    layer._prob_scale.copy_(prob_scale)
    if layer.kv_cache_dtype == "fp8" and (q_scale == 1.0 or prob_scale == 1.0):
        logger.warning_once(
            f"Using uncalibrated q_scale {q_scale} and/or prob_scale "
            f"{prob_scale} with fp8 attention. This may cause accuracy "
            "issues. Please make sure q/prob scaling factors are "
            "available in the fp8 checkpoint."
        )

    # The temporary loading parameters are no longer needed.
    del layer.k_scale
    del layer.v_scale
    del layer.q_scale
    del layer.prob_scale
get_flashinfer_moe_backend, + is_flashinfer_supporting_global_sf, + register_moe_scaling_factors, + rotate_flashinfer_fp8_moe_weights, + select_cutlass_fp8_gemm_impl, + swap_w13_to_w31, +) +from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( + apply_fp4_marlin_linear, + is_fp4_marlin_supported, + prepare_fp4_layer_for_marlin, + prepare_moe_fp4_layer_for_marlin, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + GroupShape, + cutlass_fp4_supported, + is_layer_skipped, + swizzle_blockscale, +) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + Fp8LinearOp, + requantize_with_max_scale, +) +from vllm.model_executor.parameter import ModelWeightParameter, PerTensorScaleParameter +from vllm.scalar_type import scalar_types +from vllm.utils.flashinfer import ( + flashinfer_scaled_fp4_mm, + has_flashinfer, + has_flashinfer_moe, +) + +if TYPE_CHECKING: + from vllm.model_executor.models.utils import WeightsMapper + +logger = init_logger(__name__) + +QUANT_ALGOS = ["FP8", "NVFP4"] +KV_CACHE_QUANT_ALGOS = ["FP8"] + + +class ModelOptFp8Config(QuantizationConfig): + """Config class for ModelOpt FP8.""" + + def __init__( + self, + is_checkpoint_fp8_serialized: bool = False, + kv_cache_quant_method: str | None = None, + exclude_modules: list[str] | None = None, + ) -> None: + super().__init__() + self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized + self.kv_cache_quant_method = kv_cache_quant_method + self.exclude_modules = exclude_modules or [] + if is_checkpoint_fp8_serialized: + logger.warning( + "Detected ModelOpt fp8 checkpoint. Please note that" + " the format is experimental and could change." 
@classmethod
def override_quantization_method(
    cls, hf_quant_cfg, user_quant
) -> QuantizationMethods | None:
    """Return "modelopt" when `hf_quant_cfg` describes a ModelOpt FP8 model.

    Two layouts are recognised, both requiring `quant_method == "modelopt"`
    (case-insensitive):

    * nested: `{"quant_method": ..., "quantization": {"quant_algo": ...}}`
    * flat:   `{"quant_method": ..., "quant_algo": ...}`

    Returns `None` when the config is missing, is not a ModelOpt config, or
    its `quant_algo` is not a string containing "FP8". `user_quant` is not
    consulted.
    """
    if hf_quant_cfg is None:
        return None

    # Use the community standard 'quant_method'. A missing or null value
    # simply means "not ours" rather than an error.
    quant_method = hf_quant_cfg.get("quant_method", "")
    if not isinstance(quant_method, str) or quant_method.lower() != "modelopt":
        return None

    if "quantization" in hf_quant_cfg:
        # ModelOpt-specific nested structure.
        quant_config = hf_quant_cfg["quantization"]
        if not isinstance(quant_config, dict):
            return None
        quant_algo = quant_config.get("quant_algo", "")
    else:
        # Compressed-tensors style config with a top-level quant_algo.
        quant_algo = hf_quant_cfg.get("quant_algo", "")

    # Guard the substring test in both layouts: a null quant_algo would
    # otherwise raise TypeError on the `in` check.
    if isinstance(quant_algo, str) and "FP8" in quant_algo:
        return "modelopt"

    return None
def is_layer_excluded(self, prefix: str) -> bool:
    """
    Tell whether the layer named `prefix` is excluded from quantization.

    The layer is excluded when `is_layer_skipped` reports it (exact / fused
    layer matching), or when any entry of `exclude_modules` other than
    `prefix` itself occurs as a substring of `prefix`, or of `prefix` with
    a leading "language_model." removed. Returns False when
    `exclude_modules` is None.
    """
    excluded = self.exclude_modules
    if excluded is None:
        return False

    # Exact matching, including fused-layer handling.
    if is_layer_skipped(prefix, excluded, self.packed_modules_mapping):
        return True

    # Names to search for substring patterns: the full prefix, plus the
    # prefix without the multimodal "language_model." wrapper if present.
    haystacks = [prefix]
    if prefix.startswith("language_model."):
        haystacks.append(prefix.removeprefix("language_model."))

    return any(
        pattern != prefix and any(pattern in name for name in haystacks)
        for pattern in excluded
    )
+ """ + + def __init__(self, quant_config: ModelOptFp8Config) -> None: + self.quant_config = quant_config + self.fp8_linear = Fp8LinearOp( + act_quant_static=True, act_quant_group_shape=GroupShape.PER_TENSOR + ) + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + del input_size, output_size + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + weight_dtype = ( + torch.float8_e4m3fn + if self.quant_config.is_checkpoint_fp8_serialized + else params_dtype + ) + weight = ModelWeightParameter( + data=torch.empty( + output_size_per_partition, input_size_per_partition, dtype=weight_dtype + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight", weight) + + if self.quant_config.is_checkpoint_fp8_serialized: + # WEIGHT SCALE + weight_scale = PerTensorScaleParameter( + data=torch.empty(len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader, + ) + weight_scale[:] = torch.finfo(torch.float32).min + layer.register_parameter("weight_scale", weight_scale) + # INPUT SCALE + scale = PerTensorScaleParameter( + data=torch.empty(len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader, + ) + + scale[:] = torch.finfo(torch.float32).min + layer.register_parameter("input_scale", scale) + + def process_weights_after_loading(self, layer: Module) -> None: + weight = layer.weight + max_w_scale = layer.weight_scale.max() + if not (layer.weight_scale == layer.weight_scale[0]).all(): + max_w_scale, weight = requantize_with_max_scale( + layer.weight, layer.weight_scale, layer.logical_widths + ) + 
def __init__(
    self,
    quant_config: ModelOptFp8Config,
    layer: torch.nn.Module,
) -> None:
    """Store the config and pick the FlashInfer MoE backend, if enabled.

    `flashinfer_moe_backend` stays None unless
    `envs.VLLM_USE_FLASHINFER_MOE_FP8` is set and `has_flashinfer_moe()`
    is true. A TENSORRT_LLM selection is replaced by CUTLASS when the MoE
    is not gated (`self.moe.is_act_and_mul` is false).
    """
    super().__init__(layer.moe_config)
    self.layer = layer
    self.quant_config = quant_config
    from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
        cutlass_fp8_supported,
    )

    self.cutlass_fp8_supported = cutlass_fp8_supported()
    self.flashinfer_moe_backend: FlashinferMoeBackend | None = None
    if envs.VLLM_USE_FLASHINFER_MOE_FP8 and has_flashinfer_moe():
        self.flashinfer_moe_backend = get_flashinfer_moe_backend()
        if (
            self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
            and not self.moe.is_act_and_mul
        ):
            # The two literals are concatenated, so the separating space
            # must live inside one of them.
            logger.info_once(
                "Non-gated MoE is not supported for min-latency mode, "
                "falling back to high-throughput mode"
            )
            self.flashinfer_moe_backend = FlashinferMoeBackend.CUTLASS

        logger.info_once(
            f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels"
        )
def select_gemm_impl(
    self,
    prepare_finalize: mk.FusedMoEPrepareAndFinalize,
    layer: torch.nn.Module,
) -> mk.FusedMoEPermuteExpertsUnpermute:
    """Build the experts implementation via `select_cutlass_fp8_gemm_impl`.

    Requires `self.moe_quant_config` to be set. `prepare_finalize` and
    `layer` are not used here.
    """
    quant_cfg = self.moe_quant_config
    assert quant_cfg is not None

    gemm_impl = select_cutlass_fp8_gemm_impl(self.moe, quant_cfg)
    impl_name = gemm_impl.__class__.__name__
    logger.debug_once("Using %s", impl_name)
    return gemm_impl
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
    """Normalise loaded FP8 MoE weights and scales on `layer`.

    Steps, in order:
    1. Re-wrap `w13_weight` / `w2_weight` as frozen Parameters.
    2. If `w13_weight_scale` has two scales per expert (gated MoE),
       dequantize each half with its own scale and requantize both with
       the per-expert maximum, leaving one scale per expert.
    3. Re-wrap `w2_weight_scale`; reduce `w13_input_scale` and
       `w2_input_scale` to their maximum.
    4. When a FlashInfer backend is selected, apply its weight layout
       helpers and register the MoE scaling factors.
    """

    layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False)
    layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)

    from vllm._custom_ops import scaled_fp8_quant
    from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
        per_tensor_dequantize,
    )

    # Handle scale parameters
    if hasattr(layer, "w13_weight_scale") and layer.w13_weight_scale is not None:
        # Fp8 moe kernel needs single weight scale for w13 per expert.
        # We take the max of the w1 and w3 scales
        # then dequant and requant each expert.
        if (
            layer.w13_weight_scale.dim() == 2
            and layer.w13_weight_scale.shape[1] == 2
        ):
            assert self.moe.is_act_and_mul, (
                "w13_weight_scale should have 2 elements per expert "
                "only for gated MoE"
            )
            # Get the maximum scale across w1 and w3 for each expert
            max_w13_scales = layer.w13_weight_scale.max(dim=1).values

            # Requantize each expert's weights using the combined scale
            # w13_weight (num_experts, 2 * intermediate_size, hidden_size)
            # where the first intermediate_size rows are w1, the next are w3
            intermediate_size = layer.w13_weight.shape[1] // 2
            for expert_id in range(layer.w13_weight.shape[0]):
                start = 0
                for shard_id in range(2):  # w1 and w3
                    # Dequantize using the original scale for this shard
                    dq_weight = per_tensor_dequantize(
                        layer.w13_weight[expert_id][
                            start : start + intermediate_size, :
                        ],
                        layer.w13_weight_scale[expert_id][shard_id],
                    )
                    # Requantize using the combined max scale; the result is
                    # assigned back into the same slice of w13_weight and the
                    # second value returned by scaled_fp8_quant is discarded.

                    (
                        layer.w13_weight[expert_id][
                            start : start + intermediate_size, :
                        ],
                        _,
                    ) = scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])

                    start += intermediate_size

            # Update the scale parameter to be per-expert
            layer.w13_weight_scale = Parameter(max_w13_scales, requires_grad=False)
        else:
            # Any other scale shape: keep the values, only re-wrap them as
            # a frozen Parameter.
            layer.w13_weight_scale = Parameter(
                layer.w13_weight_scale.data, requires_grad=False
            )

    if hasattr(layer, "w2_weight_scale") and layer.w2_weight_scale is not None:
        layer.w2_weight_scale = Parameter(
            layer.w2_weight_scale.data, requires_grad=False
        )
    # Input scales must be equal for each expert in fp8 MoE layers, so the
    # per-expert values are collapsed to their maximum.
    if hasattr(layer, "w13_input_scale") and layer.w13_input_scale is not None:
        layer.w13_input_scale = Parameter(
            layer.w13_input_scale.max(), requires_grad=False
        )
    if hasattr(layer, "w2_input_scale") and layer.w2_input_scale is not None:
        layer.w2_input_scale = Parameter(
            layer.w2_input_scale.max(), requires_grad=False
        )

    if self.flashinfer_moe_backend is not None:
        # NOTE(review): swap_w13_to_w31 presumably reorders the w1/w3 halves
        # into the layout FlashInfer expects -- confirm against the helper.
        if self.moe.is_act_and_mul:
            layer.w13_weight.data = swap_w13_to_w31(layer.w13_weight.data)
        if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
            rotate_flashinfer_fp8_moe_weights(layer.w13_weight, layer.w2_weight)
        register_moe_scaling_factors(layer)
torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if enable_eplb: + raise NotImplementedError( + "EPLB not supported for `ModelOptFp8MoEMethod` yet." + ) + + if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: + assert activation == "silu", ( + f"Expected 'silu' activation but got {activation}" + ) + assert not renormalize + return apply_flashinfer_per_tensor_scale_fp8( + layer=layer, + hidden_states=x, + router_logits=router_logits, + routing_bias=e_score_correction_bias, + global_num_experts=global_num_experts, + top_k=top_k, + num_expert_group=num_expert_group, + topk_group=topk_group, + apply_router_weight_on_input=apply_router_weight_on_input, + ) + + # Expert selection + topk_weights, topk_ids, _ = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, + ) + + if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: + assert activation in ("silu", "relu2_no_mul"), ( + "Expected activation to be in ('silu', 'relu2_no_mul')," + f"but got {activation}" + ) + return flashinfer_cutlass_moe_fp8( + x, + layer, + topk_weights, + topk_ids, + inplace=False, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input, + ) + else: + from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts + + assert self.moe_quant_config is not None + + return fused_experts( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + quant_config=self.moe_quant_config, + global_num_experts=global_num_experts, 
def __init__(
    self,
    is_checkpoint_nvfp4_serialized: bool,
    kv_cache_quant_algo: str | None,
    exclude_modules: list[str],
    group_size: int = 16,
) -> None:
    """Store the NVFP4 settings; warn when the checkpoint is NVFP4-serialized."""
    super().__init__()

    self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized
    self.group_size = group_size
    self.kv_cache_quant_algo = kv_cache_quant_algo
    self.exclude_modules = exclude_modules

    if not is_checkpoint_nvfp4_serialized:
        return

    logger.warning(
        "Detected ModelOpt NVFP4 checkpoint. Please note that the format "
        "is experimental and could change in future."
    )
"NVFP4" in quant_algo: + return "modelopt_fp4" + else: + # Check for compressed-tensors style config with specific + # quant_algo field + quant_algo = hf_quant_cfg.get("quant_algo", "") + if isinstance(quant_algo, str) and "FP4" in quant_algo.upper(): + return "modelopt_fp4" + + return None + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "ModelOptNvFp4Config": + # Handle both traditional ModelOpt format and compressed-tensors + # style format + if "quantization" in config: + # Traditional ModelOpt format: + # {"quantization": {"quant_algo": "..."}} + quant_config = cls.get_from_keys(config, ["quantization"]) + if not isinstance(quant_config, dict): + raise ValueError("Expected 'quantization' to be a dictionary in config") + + quant_method = quant_config.get("quant_algo", "") + if not quant_method: + raise ValueError("Missing 'quant_algo' in quantization config") + + # Handle kv_cache_quant_algo with proper type validation + kv_cache_quant_algo_raw = quant_config.get("kv_cache_quant_algo") + if kv_cache_quant_algo_raw is None: + # No KV cache quantization by default + kv_cache_quant_algo = None + elif isinstance(kv_cache_quant_algo_raw, str): + kv_cache_quant_algo = kv_cache_quant_algo_raw + else: + raise ValueError( + f"kv_cache_quant_algo must be a string, got " + f"{type(kv_cache_quant_algo_raw)}" + ) + + # Handle group_size with proper type validation + group_size_raw = quant_config.get("group_size") + if group_size_raw is None: + group_size = 16 # Default value + elif isinstance(group_size_raw, int): + group_size = group_size_raw + else: + try: + group_size = int(group_size_raw) + except (ValueError, TypeError): + raise ValueError( + f"group_size must be an integer, got {type(group_size_raw)}" + ) from None + + # "exclude_modules" is the key in the legacy hf_quant_config.json + exclude_modules = quant_config.get("exclude_modules", []) + if not isinstance(exclude_modules, list): + raise ValueError( + f"exclude_modules must be a list, got 
{type(exclude_modules)}" + ) + else: + # Compressed-tensors style format: + # {"quant_algo": "...", "quant_method": "modelopt"} + quant_method = config.get("quant_algo", "") + + # Handle kv_cache_quant_algo with proper type validation + kv_cache_quant_algo_raw = config.get("kv_cache_quant_algo") + if kv_cache_quant_algo_raw is None: + # No KV cache quantization by default + kv_cache_quant_algo = None + elif isinstance(kv_cache_quant_algo_raw, str): + kv_cache_quant_algo = kv_cache_quant_algo_raw + else: + raise ValueError( + f"kv_cache_quant_algo must be a string, got " + f"{type(kv_cache_quant_algo_raw)}" + ) + + # Handle group_size with proper type validation + group_size_raw = config.get("group_size") + if group_size_raw is None: + group_size = 16 # Default value + elif isinstance(group_size_raw, int): + group_size = group_size_raw + else: + try: + group_size = int(group_size_raw) + except (ValueError, TypeError): + raise ValueError( + f"group_size must be an integer, got {type(group_size_raw)}" + ) from None + + # "ignore" is the key in config.json + exclude_modules = config.get("ignore", []) + if not isinstance(exclude_modules, list): + raise ValueError( + f"exclude_modules must be a list, got {type(exclude_modules)}" + ) + + if quant_method not in QUANT_ALGOS: + raise ValueError( + f"ModelOpt currently only supports: {QUANT_ALGOS} " + "quantizations in vLLM. Please check the " + "`hf_quant_config.json` file for your model's " + "quant configuration." 
def is_layer_excluded(self, prefix: str) -> bool:
    """
    Report whether the layer named ``prefix`` is excluded from quantization.

    The exact / fused-layer lookup (``is_layer_skipped``) runs first. If it
    does not match, every entry of ``self.exclude_modules`` that contains
    ``*`` or ``.`` is treated as a wildcard pattern and must match the whole
    prefix.
    """
    if is_layer_skipped(prefix, self.exclude_modules, self.packed_modules_mapping):
        return True

    import regex as re

    def _to_regex(wildcard: str) -> str:
        # Literal dots are escaped; "*" stands for any run of characters.
        return wildcard.replace(".", r"\.").replace("*", r".*")

    # Entries without "*" or "." are left to the exact match above.
    return any(
        re.fullmatch(_to_regex(candidate), prefix)
        for candidate in self.exclude_modules
        if "*" in candidate or "." in candidate
    )
+ """ + + def __init__(self, quant_config: ModelOptNvFp4Config) -> None: + self.quant_config = quant_config + + self.backend = "none" + if envs.VLLM_NVFP4_GEMM_BACKEND is None: + if has_flashinfer(): + self.backend = "flashinfer-cutlass" + elif cutlass_fp4_supported(): + self.backend = "cutlass" + elif is_fp4_marlin_supported(): + self.backend = "marlin" + elif envs.VLLM_NVFP4_GEMM_BACKEND.startswith("flashinfer-"): + self.backend = envs.VLLM_NVFP4_GEMM_BACKEND + assert has_flashinfer(), f"FlashInfer is required for {self.backend}" + elif envs.VLLM_NVFP4_GEMM_BACKEND == "cutlass": + self.backend = "cutlass" + assert cutlass_fp4_supported(), f"Cutlass is required for {self.backend}" + + if self.backend == "none": + raise ValueError( + "No valid NVFP4 GEMM backend found. " + "Please check your platform capability." + ) + + logger.info_once(f"Using {self.backend} for NVFP4 GEMM") + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + del input_size, output_size + if not self.quant_config.is_checkpoint_nvfp4_serialized: + raise ValueError( + "NVFP4 quantization was selected, " + " dynamic quantization is not supported." 
+ ) + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + + if input_size_per_partition % 16 != 0: + raise ValueError( + "Unsupported model when in features size is not multiple of 16" + ) + # The nvfp4 weight is still represented as + weight_dtype = ( + torch.float8_e4m3fn + if self.quant_config.is_checkpoint_nvfp4_serialized + else params_dtype + ) + # Weight + weight = ModelWeightParameter( + data=torch.empty( + # 2 fp4 items are packed in the input dimension + layer.output_size_per_partition, + layer.input_size_per_partition // 2, + dtype=torch.uint8, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight", weight) + + # Input Weight Scale + input_scale = PerTensorScaleParameter( + data=torch.empty(len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader, + ) + layer.register_parameter("input_scale", input_scale) + + # Global Weight Scale + weight_scale_2 = PerTensorScaleParameter( + data=torch.empty(len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader, + ) + layer.register_parameter("weight_scale_2", weight_scale_2) + + # Per Block Weight Scale + weight_scale = ModelWeightParameter( + data=torch.empty( + output_size_per_partition, + input_size_per_partition // self.quant_config.group_size, + dtype=weight_dtype, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + + layer.register_parameter("weight_scale", weight_scale) + + def process_weights_after_loading(self, layer: Module) -> None: + # global scales: + input_scale_2 = layer.input_scale.max().to(torch.float32) + layer.input_scale = Parameter(input_scale_2, requires_grad=False) + + weight_scale_2 = layer.weight_scale_2.max().to(torch.float32) + 
def apply(
    self,
    layer: torch.nn.Module,
    x: torch.Tensor,
    bias: torch.Tensor | None = None,
) -> torch.Tensor:
    """Run the NVFP4 linear layer on ``x`` with the configured backend.

    The "marlin" backend is dispatched directly to
    ``apply_fp4_marlin_linear``. Otherwise the activation is quantized with
    ``scaled_fp4_quant`` and multiplied through the FlashInfer or Cutlass
    scaled-FP4 GEMM. ``bias`` is added afterwards when given.
    """
    if self.backend == "marlin":
        marlin_kwargs = dict(
            input=x,
            weight=layer.weight,
            weight_scale=layer.weight_scale,
            weight_scale_2=layer.weight_scale_2,
            workspace=layer.workspace,
            size_n=layer.output_size_per_partition,
            size_k=layer.input_size_per_partition,
            bias=bias,
        )
        return apply_fp4_marlin_linear(**marlin_kwargs)

    result_dtype = x.dtype
    result_shape = [x.shape[0], layer.weight.shape[0]]

    # quantize BF16 or FP16 to (FP4 and interleaved block scale)
    act_fp4, act_blockscale = scaled_fp4_quant(x, layer.input_scale_inv)

    # validate dtypes of quantized input, input block scale,
    # weight and weight_blockscale
    assert act_fp4.dtype == torch.uint8
    assert layer.weight.dtype == torch.uint8
    assert act_blockscale.dtype == torch.float8_e4m3fn
    assert layer.weight_scale.dtype == torch.float8_e4m3fn
    assert layer.alpha.dtype == torch.float32

    gemm_inputs = (
        act_fp4,
        layer.weight,
        act_blockscale,
        layer.weight_scale,
        layer.alpha,
        result_dtype,
    )
    flashinfer_prefix = "flashinfer-"
    if self.backend.startswith(flashinfer_prefix):
        # The text after the prefix names the FlashInfer sub-backend.
        result = flashinfer_scaled_fp4_mm(
            *gemm_inputs, backend=self.backend.removeprefix(flashinfer_prefix)
        )
    else:
        assert self.backend == "cutlass"
        result = cutlass_scaled_fp4_mm(*gemm_inputs)

    if bias is not None:
        result = result + bias
    return result.view(*result_shape)
def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None:
    """Pick the prepare/finalize object for this MoE method.

    Returns ``None`` for the Marlin path and for the FlashInfer TensorRT-LLM
    backend, a FlashInfer Cutlass prepare/finalize object for the FlashInfer
    Cutlass backend, and the base-class choice in every other case.
    """
    if self.use_marlin:
        return None

    if self.allow_flashinfer:
        if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
            return None
        if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
            # For now, fp4 moe only works with the flashinfer dispatcher.
            dispatcher = build_flashinfer_fp4_cutlass_moe_prepare_finalize(self.moe)
            logger.debug_once("%s", dispatcher.__class__.__name__)
            return dispatcher

    return super().maybe_make_prepare_finalize()
+ ) + + layer.num_experts = num_experts + layer.params_dtype = params_dtype + layer.quant_config = self.quant_config + weight_dtype = torch.uint8 + weight_scale_dtype = torch.float8_e4m3fn + weight_loader = extra_weight_attrs.get("weight_loader") + global_num_experts = extra_weight_attrs.get("global_num_experts") + # GEMM 1 + w13_weight = ModelWeightParameter( + data=torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + # 2 fp4 items are packed in the input dimension + hidden_size // 2, + dtype=weight_dtype, + ), + input_dim=1, + output_dim=2, + weight_loader=weight_loader, + ) + layer.register_parameter("w13_weight", w13_weight) + + # GEMM 2 + w2_weight = ModelWeightParameter( + data=torch.empty( + num_experts, + hidden_size, + # 2 fp4 items are packed in the input dimension + intermediate_size_per_partition // 2, + dtype=weight_dtype, + ), + input_dim=1, + output_dim=2, + weight_loader=weight_loader, + ) + layer.register_parameter("w2_weight", w2_weight) + + w13_weight_scale = ModelWeightParameter( + data=torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + # 2 fp4 items are packed in the input dimension + hidden_size // self.quant_config.group_size, + dtype=weight_scale_dtype, + ), + input_dim=1, + output_dim=2, + weight_loader=weight_loader, + ) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + + w2_weight_scale = ModelWeightParameter( + data=torch.empty( + num_experts, + hidden_size, + # 2 fp4 items are packed in the input dimension + intermediate_size_per_partition // self.quant_config.group_size, + dtype=weight_scale_dtype, + ), + input_dim=1, + output_dim=2, + weight_loader=weight_loader, + ) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value} + ) + + w13_weight_scale_2 = PerTensorScaleParameter( + data=torch.empty(num_experts, 2, dtype=torch.float32), + weight_loader=weight_loader, + ) + 
def prepare_static_weights_for_trtllm_fp4_moe(
    self,
    gemm1_weights,
    gemm2_weights,
    gemm1_scales_linear_fp4_bytes,
    gemm2_scales_linear_fp4_bytes,
    hidden_size,
    intermediate_size,
    num_experts,
):
    """Prepare quantized weights for kernel (done offline with weights).

    Each expert's packed FP4 weights and FP8 block scales are row-permuted
    with the cached FlashInfer permute indices. The scales are additionally
    passed through ``nvfp4_block_scale_interleave``.

    Args:
        gemm1_weights: Packed weights of the first GEMM, reshaped here to
            ``(num_experts, 2 * intermediate_size, hidden_size // 2)``.
        gemm2_weights: Packed weights of the second GEMM, reshaped here to
            ``(num_experts, hidden_size, intermediate_size // 2)``.
        gemm1_scales_linear_fp4_bytes: Block scales of the first GEMM.
        gemm2_scales_linear_fp4_bytes: Block scales of the second GEMM.
        hidden_size: Hidden dimension used for the reshapes.
        intermediate_size: Intermediate dimension used for the reshapes.
        num_experts: Number of experts held in the leading dimension.

    Returns:
        Tuple ``(gemm1_weights, gemm1_scales, gemm2_weights, gemm2_scales)``
        of stacked, shuffled tensors. The scales are viewed as
        ``torch.float8_e4m3fn``.
    """
    from flashinfer import nvfp4_block_scale_interleave
    from flashinfer.fused_moe.core import (
        _maybe_get_cached_w3_w1_permute_indices,
        get_w2_permute_indices_with_cache,
    )

    epilogue_tile_m = 128  # FIXME: this depends on the kernel internals
    # Elements covered by one scale factor; presumably equal to
    # quant_config.group_size (default 16) — TODO confirm.
    sf_vec_size = 16
    permute_cache = self._cache_permute_indices

    def _permute_rows(get_indices, expert_tensor, **index_kwargs):
        """Gather the rows of one expert tensor in kernel order (as uint8)."""
        as_bytes = expert_tensor.view(torch.uint8)
        indices = get_indices(
            permute_cache, as_bytes, epilogue_tile_m, **index_kwargs
        )
        return as_bytes[indices.to(as_bytes.device)].contiguous()

    # Convert quantized weights to proper formats
    gemm1_weights_fp4 = gemm1_weights.view(torch.float8_e4m3fn).reshape(
        num_experts, 2 * intermediate_size, hidden_size // 2
    )  # packed fp4
    gemm1_scales_linear_fp4 = gemm1_scales_linear_fp4_bytes.view(
        torch.float8_e4m3fn
    ).reshape(
        num_experts, 2 * intermediate_size, hidden_size // sf_vec_size
    )  # fp8 scaling factors

    gemm2_weights_fp4 = gemm2_weights.view(torch.float8_e4m3fn).reshape(
        num_experts, hidden_size, intermediate_size // 2
    )  # packed fp4
    gemm2_scales_linear_fp4 = gemm2_scales_linear_fp4_bytes.view(
        torch.float8_e4m3fn
    ).reshape(
        num_experts, hidden_size, intermediate_size // sf_vec_size
    )  # fp8 scaling factors

    gemm1_weights_fp4_shuffled = []
    gemm1_scales_fp4_shuffled = []
    gemm2_weights_fp4_shuffled = []
    gemm2_scales_fp4_shuffled = []
    for i in range(num_experts):
        # Calculate the permute indices for the following:
        # 1. Reorder rows of W1 and scales for fused gated activation
        # 2. Shuffle weights and scaling factors for transposed mma output
        # for both w3_w1 and w2 weights and scale factors
        gemm1_weights_fp4_shuffled.append(
            _permute_rows(
                _maybe_get_cached_w3_w1_permute_indices, gemm1_weights_fp4[i]
            )
        )
        gemm1_scales_fp4_shuffled.append(
            nvfp4_block_scale_interleave(
                _permute_rows(
                    _maybe_get_cached_w3_w1_permute_indices,
                    gemm1_scales_linear_fp4[i],
                    num_elts_per_sf=sf_vec_size,
                )
            )
        )
        gemm2_weights_fp4_shuffled.append(
            _permute_rows(get_w2_permute_indices_with_cache, gemm2_weights_fp4[i])
        )
        gemm2_scales_fp4_shuffled.append(
            nvfp4_block_scale_interleave(
                _permute_rows(
                    get_w2_permute_indices_with_cache,
                    gemm2_scales_linear_fp4[i],
                    num_elts_per_sf=sf_vec_size,
                )
            )
        )

    # Stack weights for all experts
    gemm1_weights_fp4_shuffled = torch.stack(gemm1_weights_fp4_shuffled)
    gemm1_scales_fp4_shuffled = (
        torch.stack(gemm1_scales_fp4_shuffled)
        .view(torch.float8_e4m3fn)
        .reshape(num_experts, 2 * intermediate_size, hidden_size // sf_vec_size)
    )

    gemm2_weights_fp4_shuffled = torch.stack(gemm2_weights_fp4_shuffled)
    gemm2_scales_fp4_shuffled = (
        torch.stack(gemm2_scales_fp4_shuffled)
        .view(torch.float8_e4m3fn)
        .reshape(num_experts, hidden_size, intermediate_size // sf_vec_size)
    )
    return (
        gemm1_weights_fp4_shuffled,
        gemm1_scales_fp4_shuffled,
        gemm2_weights_fp4_shuffled,
        gemm2_scales_fp4_shuffled,
    )
+ w13_input_scale = ( + layer.w13_input_scale.max().to(torch.float32).expand(layer.num_experts) + ) + else: + w13_input_scale = layer.w13_input_scale.max(dim=1).values.to(torch.float32) + layer.g1_alphas = Parameter( + (w13_input_scale * w13_weight_scale_2).to(torch.float32), + requires_grad=False, + ) + + # This is for quantization, so we need to invert it. + layer.w13_input_scale_quant = Parameter( + (1 / w13_input_scale).to(torch.float32), requires_grad=False + ) + + # GEMM 2 processing + if use_global_sf: + # For backends provide by Flashinfer, the input global scales are + # shared across all experts. + w2_input_scale = ( + layer.w2_input_scale.max().to(torch.float32).expand(layer.num_experts) + ) + else: + w2_input_scale = layer.w2_input_scale + layer.g2_alphas = Parameter( + (w2_input_scale * layer.w2_weight_scale_2).to(torch.float32), + requires_grad=False, + ) + + # This is for quantization, so we need to invert it. + layer.w2_input_scale_quant = Parameter( + (1 / w2_input_scale).to(torch.float32), requires_grad=False + ) + + # TensorRT-LLM specific processing + if ( + self.allow_flashinfer + and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM + ): + # Prepare static weights for TRT-LLM kernel + # alternate: prepare_static_weight_layouts_for_trtllm_moe + ( + gemm1_weights_fp4_shuffled, + gemm1_scales_fp4_shuffled, + gemm2_weights_fp4_shuffled, + gemm2_scales_fp4_shuffled, + ) = self.prepare_static_weights_for_trtllm_fp4_moe( + layer.w13_weight, + layer.w2_weight, + layer.w13_weight_scale, + layer.w2_weight_scale, + layer.w2_weight.size(-2), # hidden_size + layer.w13_weight.size(-2) // 2, # intermediate_size + layer.w13_weight.size(0), # num_experts + ) + logger.debug_once("Finished shuffling weights for TRT-LLM MOE") + + layer.gemm1_weights_fp4_shuffled = Parameter( + gemm1_weights_fp4_shuffled, requires_grad=False + ) + layer.gemm2_weights_fp4_shuffled = Parameter( + gemm2_weights_fp4_shuffled, requires_grad=False + ) + 
def get_fused_moe_quant_config(
    self, layer: torch.nn.Module
) -> FusedMoEQuantConfig | None:
    """Build the NVFP4 MoE quant config from the scales stored on ``layer``.

    Returns ``None`` when Marlin is used or when the FlashInfer backend is
    TensorRT-LLM. Otherwise returns the result of ``nvfp4_moe_quant_config``.
    """
    if self.use_marlin:
        return None
    if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
        return None

    scale_kwargs = dict(
        w1_scale=layer.w13_weight_scale,
        w2_scale=layer.w2_weight_scale,
        g1_alphas=layer.g1_alphas,
        g2_alphas=layer.g2_alphas,
        a1_gscale=layer.w13_input_scale_quant,
        a2_gscale=layer.w2_input_scale_quant,
    )
    return nvfp4_moe_quant_config(**scale_kwargs)
= None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if enable_eplb: + raise NotImplementedError( + "EPLB not supported for `ModelOptNvFp4FusedMoE` yet." + ) + assert activation == "silu", "Only SiLU activation is supported." + + if ( + self.allow_flashinfer + and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM + ): + import flashinfer + + from vllm.model_executor.models.llama4 import Llama4MoE + + a1_gscale = layer.w13_input_scale_quant + (hidden_states_fp4, hidden_states_scale_linear_fp4) = ( + flashinfer.fp4_quantize( + x, + a1_gscale, + is_sf_swizzled_layout=False, + ) + ) + use_llama4_routing = ( + custom_routing_function is Llama4MoE.custom_routing_function + ) + routing_method_type = layer.routing_method_type + if use_llama4_routing: + routing_method_type = RoutingMethodType.Llama4 + router_logits = ( + router_logits.to(torch.float32) + if routing_method_type == RoutingMethodType.DeepSeekV3 + else router_logits + ) + routing_bias = e_score_correction_bias + if routing_bias is not None: + routing_bias = routing_bias.to(torch.bfloat16) + out = flashinfer.fused_moe.trtllm_fp4_block_scale_moe( + routing_logits=router_logits, + routing_bias=routing_bias, + hidden_states=hidden_states_fp4, + hidden_states_scale=hidden_states_scale_linear_fp4.view( + torch.float8_e4m3fn + ).flatten(), + gemm1_weights=layer.gemm1_weights_fp4_shuffled.data, + gemm1_weights_scale=layer.gemm1_scales_fp4_shuffled.data.view( + torch.float8_e4m3fn + ), + gemm1_bias=None, + gemm1_alpha=None, + gemm1_beta=None, + gemm1_clamp_limit=None, + 
gemm2_weights=layer.gemm2_weights_fp4_shuffled.data, + gemm2_weights_scale=layer.gemm2_scales_fp4_shuffled.data.view( + torch.float8_e4m3fn + ), + gemm2_bias=None, + output1_scale_scalar=layer.g1_scale_c.data, + output1_scale_gate_scalar=layer.g1_alphas.data, + output2_scale_scalar=layer.g2_alphas.data, + num_experts=global_num_experts, + top_k=top_k, + n_group=num_expert_group, + topk_group=topk_group, + intermediate_size=layer.intermediate_size_per_partition, + local_expert_offset=layer.ep_rank * layer.local_num_experts, + local_num_experts=layer.local_num_experts, + routed_scaling_factor=None, + tile_tokens_dim=None, + routing_method_type=routing_method_type, + do_finalize=True, + )[0] + return out + + topk_weights, topk_ids, _ = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, + ) + + if self.use_marlin: + return fused_marlin_moe( + x, + layer.w13_weight, + layer.w2_weight, + None, + None, + layer.w13_weight_scale, + layer.w2_weight_scale, + router_logits, + topk_weights, + topk_ids, + global_scale1=layer.w13_weight_scale_2, + global_scale2=layer.w2_weight_scale_2, + quant_type_id=scalar_types.float4_e2m1f.id, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=expert_map, + workspace=layer.workspace, + ) + + elif ( + self.allow_flashinfer + and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS + ): + from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 + flashinfer_cutlass_moe_fp4, + ) + + assert self.moe_quant_config is not None + + return flashinfer_cutlass_moe_fp4( + 
hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + quant_config=self.moe_quant_config, + inplace=False, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input, + ) + else: + # If no modular kernel is provided, use cutlass_moe_fp4 for TP case + # only (no EP). + from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4 + + assert self.moe_quant_config is not None + return cutlass_moe_fp4( + a=x, + w1_fp4=layer.w13_weight, + w2_fp4=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + quant_config=self.moe_quant_config, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input, + # TODO: derive from arguments + m=x.shape[0], + n=layer.w2_weight.shape[2] * 2, + k=x.shape[1], + e=layer.w13_weight.shape[0], + ) diff --git a/model_executor/layers/quantization/moe_wna16.py b/model_executor/layers/quantization/moe_wna16.py new file mode 100644 index 0000000..2090c86 --- /dev/null +++ b/model_executor/layers/quantization/moe_wna16.py @@ -0,0 +1,541 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable +from typing import Any, Optional + +import torch + +from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEQuantConfig, + int4_w4a16_moe_quant_config, + int8_w8a16_moe_quant_config, +) +from vllm.model_executor.layers.fused_moe.layer import ( + FusedMoE, + FusedMoEConfig, + FusedMoEMethodBase, + FusedMoeWeightScaleSupported, +) +from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod +from vllm.model_executor.layers.quantization import QuantizationMethods +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, + 
class MoeWNA16Config(QuantizationConfig):
    """Quantization config for MoE WNA16 (W8A16 / W4A16) checkpoints.

    The config is derived from a GPTQ or AWQ checkpoint config. Linear layers
    are delegated to the matching GPTQ/AWQ linear method (the Marlin variant
    when the checkpoint is reported as compatible), while ``FusedMoE`` layers
    are handled by ``MoeWNA16Method``.
    """

    def __init__(
        self,
        linear_quant_method: str,
        weight_bits: int,
        group_size: int,
        has_zp: bool,
        lm_head_quantized: bool,
        modules_to_not_convert: list[str] | None,
        full_config: dict[str, Any],
    ) -> None:
        """Store the checkpoint settings and decide whether Marlin is usable.

        Raises:
            ValueError: if ``linear_quant_method`` is neither ``"gptq"`` nor
                ``"awq"``, or if AWQ is requested on a device whose
                capability is below ``AWQConfig.get_min_capability()``.
        """
        super().__init__()
        self.weight_bits = weight_bits
        self.group_size = group_size
        self.has_zp = has_zp
        # Number of quantized values stored in a single byte.
        self.bit8_pack_factor = 8 // self.weight_bits
        self.lm_head_quantized = lm_head_quantized
        self.linear_quant_method = linear_quant_method
        self.full_config = full_config
        self.use_marlin = False
        # Imported lazily to avoid a circular import.
        from vllm.model_executor.layers.quantization.awq import AWQConfig
        from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig
        from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinConfig

        method = self.linear_quant_method
        if method not in ("gptq", "awq"):
            raise ValueError("moe_wna16 only support gptq and awq.")

        if method == "gptq":
            self.use_marlin = GPTQMarlinConfig.is_gptq_marlin_compatible(full_config)
        else:
            capability = current_platform.get_device_capability()
            # -1 stands in for "capability unknown" so the check below fails.
            current = capability.to_int() if capability is not None else -1
            required = AWQConfig.get_min_capability()
            if current < required:
                raise ValueError(
                    "The quantization method moe_wna16 + awq is not supported "
                    "for the current GPU. "
                    f"Minimum capability: {required}. "
                    f"Current capability: {current}."
                )
            self.use_marlin = AWQMarlinConfig.is_awq_marlin_compatible(full_config)

        self.modules_to_not_convert = (
            [] if modules_to_not_convert is None else modules_to_not_convert
        )

    @classmethod
    def get_name(cls) -> QuantizationMethods:
        """Return the registry name of this quantization method."""
        return "moe_wna16"

    @classmethod
    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
        """Return the activation dtypes this method accepts."""
        return [torch.bfloat16, torch.half]

    @classmethod
    def get_min_capability(cls) -> int:
        """Return the minimum device capability (as an int) required."""
        return 70

    @classmethod
    def get_config_filenames(cls) -> list[str]:
        """Return the checkpoint file names that may hold the quant config."""
        return ["quantize_config.json"]

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "MoeWNA16Config":
        """Build the config from a GPTQ- or AWQ-style checkpoint config dict.

        Raises:
            ValueError: if ``quant_method`` is neither ``"gptq"`` nor ``"awq"``.
        """
        method = cls.get_from_keys(config, ["quant_method"])
        bits = cls.get_from_keys(config, ["bits"])
        group = cls.get_from_keys(config, ["group_size"])
        lm_head = cls.get_from_keys_or(config, ["lm_head"], default=False)

        if method == "gptq":
            # GPTQ stores symmetry; zero points exist only when not symmetric.
            zero_point = not cls.get_from_keys(config, ["sym"])
            skipped = []
        elif method == "awq":
            zero_point = cls.get_from_keys(config, ["zero_point"])
            skipped = cls.get_from_keys_or(config, ["modules_to_not_convert"], None)
        else:
            raise ValueError("moe_wna16 only support gptq and awq.")

        return cls(method, bits, group, zero_point, lm_head, skipped, config)

    @classmethod
    def override_quantization_method(
        cls, hf_quant_cfg, user_quant
    ) -> QuantizationMethods | None:
        """Return ``"moe_wna16"`` when the user asked for it and it applies."""
        convertible = cls.is_moe_wna16_compatible(hf_quant_cfg)
        if convertible and user_quant == "moe_wna16":
            return cls.get_name()
        return None

    @classmethod
    def is_moe_wna16_compatible(cls, quant_config: dict[str, Any]):
        """Tell whether a checkpoint quant config can be served by moe_wna16.

        GPTQ qualifies with 4 or 8 bits and a falsy ``desc_act``; AWQ qualifies
        with 4 bits on a device meeting ``AWQConfig.get_min_capability()``.
        """
        method = quant_config.get("quant_method", "").lower()
        bits = quant_config.get("bits")
        desc_act = quant_config.get("desc_act")

        capability = current_platform.get_device_capability()
        current = capability.to_int() if capability is not None else -1
        # Imported lazily to avoid a circular import.
        from vllm.model_executor.layers.quantization.awq import AWQConfig

        awq_required = AWQConfig.get_min_capability()

        gptq_ok = method == "gptq" and not desc_act and bits in [4, 8]
        awq_ok = method == "awq" and bits == 4 and current >= awq_required
        return gptq_ok or awq_ok

    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> Optional["QuantizeMethodBase"]:
        """Pick the quantize method for ``layer``.

        Skipped layers get ``UnquantizedLinearMethod``; linear layers are
        forwarded to the GPTQ/AWQ config built from ``full_config``; fused MoE
        layers get ``MoeWNA16Method``. Any other layer yields ``None``.
        """
        if is_layer_skipped_quant(prefix, self.modules_to_not_convert):
            return UnquantizedLinearMethod()

        if isinstance(layer, LinearBase):
            # Imported lazily to avoid a circular import.
            from vllm.model_executor.layers.quantization.awq import AWQConfig
            from vllm.model_executor.layers.quantization.awq_marlin import (
                AWQMarlinConfig,
            )
            from vllm.model_executor.layers.quantization.gptq import GPTQConfig
            from vllm.model_executor.layers.quantization.gptq_marlin import (
                GPTQMarlinConfig,
            )

            method = self.linear_quant_method
            if method == "gptq":
                linear_cfg_cls = GPTQMarlinConfig if self.use_marlin else GPTQConfig
            elif method == "awq":
                marlin_ok = self.use_marlin and check_marlin_supports_layer(
                    layer, self.group_size
                )
                linear_cfg_cls = AWQMarlinConfig if marlin_ok else AWQConfig
            else:
                raise ValueError("moe_wna16 only support gptq and awq.")
            linear_cfg = linear_cfg_cls.from_config(self.full_config)
            return linear_cfg.get_quant_method(layer, prefix)

        if isinstance(layer, FusedMoE):
            return MoeWNA16Method(self, layer.moe_config)
        return None


def is_layer_skipped_quant(prefix: str, modules_to_not_convert: list[str]):
    """Return True when any listed module name occurs as a substring of prefix."""
    for module_name in modules_to_not_convert:
        if module_name in prefix:
            return True
    return False
set_weight_attrs(w13_qweight, extra_weight_attrs) + + # down_proj (row parallel) + w2_qweight = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition // bit8_pack_factor, + dtype=torch.uint8, + ), + requires_grad=False, + ) + layer.register_parameter("w2_qweight", w2_qweight) + set_weight_attrs(w2_qweight, extra_weight_attrs) + + w13_scales = torch.nn.Parameter( + torch.zeros( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size // group_size, + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w13_scales", w13_scales) + set_weight_attrs(w13_scales, extra_weight_attrs) + + w2_scales = torch.nn.Parameter( + torch.zeros( + num_experts, + hidden_size, + intermediate_size_per_partition // group_size, + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w2_scales", w2_scales) + set_weight_attrs(w2_scales, extra_weight_attrs) + + if self.quant_config.has_zp: + w13_qzeros = torch.nn.Parameter( + torch.zeros( + num_experts, + 2 * intermediate_size_per_partition // bit8_pack_factor, + hidden_size // group_size, + dtype=torch.uint8, + ), + requires_grad=False, + ) + layer.register_parameter("w13_qzeros", w13_qzeros) + set_weight_attrs(w13_qzeros, extra_weight_attrs) + + w2_qzeros = torch.nn.Parameter( + torch.zeros( + num_experts, + hidden_size // bit8_pack_factor, + intermediate_size_per_partition // group_size, + dtype=torch.uint8, + ), + requires_grad=False, + ) + layer.register_parameter("w2_qzeros", w2_qzeros) + set_weight_attrs(w2_qzeros, extra_weight_attrs) + + if self.quant_config.linear_quant_method == "gptq": + # some param are unused, but we need to init them in order to + # load weights + invalid_param_keys = ["w13_g_idx", "w2_g_idx"] + if not self.quant_config.has_zp: + invalid_param_keys += ["w13_qzeros", "w2_qzeros"] + for key in invalid_param_keys: + param = torch.nn.Parameter( + torch.empty((0,), dtype=torch.int32), requires_grad=False + ) + 
def get_fused_moe_quant_config(
    self, layer: torch.nn.Module
) -> FusedMoEQuantConfig | None:
    """Build the fused-MoE quant config from the layer's scales and zero points.

    The int4 builder is used for 4-bit weights and the int8 builder for 8-bit
    weights; zero points are passed only when the quant config has them.
    """
    qcfg = self.quant_config
    bits, use_zp = qcfg.weight_bits, qcfg.has_zp
    assert bits in (4, 8)

    if bits == 4:
        build_config = int4_w4a16_moe_quant_config
    else:
        build_config = int8_w8a16_moe_quant_config

    w1_scale, w2_scale = layer.w13_scales, layer.w2_scales
    w1_zp = layer.w13_qzeros if use_zp else None
    w2_zp = layer.w2_qzeros if use_zp else None
    return build_config(
        w1_scale=w1_scale,
        w2_scale=w2_scale,
        w1_zp=w1_zp,
        w2_zp=w2_zp,
        block_shape=[0, layer.group_size],
    )
+ topk_weights, topk_ids, _ = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, + ) + + return fused_experts( + x, + layer.w13_qweight, + layer.w2_qweight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=expert_map, + quant_config=self.moe_quant_config, + ) + + @staticmethod + def get_weight_loader(layer, weight_loader): + def convert_awq_tensor(tensor, tensor_type): + # convert awq qweight/qzeros to a standard format (assume int4) + # qweight: (k, n // pack_factor_bit32) -> (n, k // pack_factor_bit8) + # qzeros: (k // group_size, n // pack_factor_bit32) -> + # (n // pack_factor_bit8, k // group_size) + # pack_factor_bit32 = 32 // weight_bits + # pack_factor_bit8 = 8 // weight_bits + + # 0. suppose origin shape (a, b), dtype int32 + # 1. convert to uint8, shape (a, b) -> (a, 4 * b) + size0 = tensor.size(0) + tensor = tensor.view(torch.uint8) + + # 2. unpack to uint4 (only when weight_bits == 4) + # shape (a, 4 * b) -> (a, 4 * b, 2) + shifter = torch.tensor([0, 4], dtype=torch.uint8, device=tensor.device) + tensor = (tensor[:, :, None] >> shifter) & 0xF + + # 3. change order, see + # https://github.com/casper-hansen/AutoAWQ/blob/v0.2.8/awq/utils/quant_utils.py + # shape -> (a, 4 * b * pack_factor_bit8) + reverse_awq_pack_order = [0, 4, 1, 5, 2, 6, 3, 7] + tensor = tensor.view(-1, 8)[:, reverse_awq_pack_order] + tensor = tensor.view(size0, -1) + + # 4. transpose, shape -> (4 * b * pack_factor_bit8, a) + tensor = tensor.T.contiguous() + + # 5. 
repack (only when weight_bits == 4) + # qweight shape -> (4 * b * pack_factor_bit8, a // pack_factor_bit8) + # qzeros shape -> (4 * b, a) + + if tensor_type == "qweight": + tensor = tensor[:, 1::2] * 16 + tensor[:, ::2] + elif tensor_type == "qzeros": + tensor = tensor[1::2, :] * 16 + tensor[::2, :] + return tensor + + def convert_gptq_int4_qzeros(tensor): + tensor = tensor.view(torch.uint8) + shifter = torch.tensor([0, 4], dtype=torch.uint8, device=tensor.device) + tensor = (tensor[:, :, None] >> shifter) & 0xF + tensor = tensor + 1 + tensor = tensor[:, :, 0] + tensor[:, :, 1] * 16 + return tensor + + def moe_wna16_weight_loader( + param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + weight_name: str, + shard_id: str, + expert_id: int, + return_success: bool = False, + ): + if "g_idx" in weight_name: + return False if return_success else None + if not layer.quant_config.has_zp and "qzeros" in weight_name: + return False if return_success else None + + device = get_tp_group().device + tp_rank = get_tensor_model_parallel_rank() + loaded_weight = loaded_weight.to(device) + shard_size = layer.intermediate_size_per_partition + + # convert gptq and awq weight to a standard format + if layer.quant_config.linear_quant_method == "awq": + assert layer.quant_config.weight_bits == 4 + if "weight" in weight_name: + loaded_weight = convert_awq_tensor(loaded_weight, "qweight") + elif "zeros" in weight_name: + loaded_weight = convert_awq_tensor(loaded_weight, "qzeros") + else: + loaded_weight = loaded_weight.T + elif layer.quant_config.linear_quant_method == "gptq": + assert layer.quant_config.weight_bits in [4, 8] + if "weight" in weight_name: + loaded_weight = loaded_weight.T.contiguous().view(torch.uint8) + elif "zeros" in weight_name: + # add 1 to gptq qzeros to align with awq + loaded_weight = loaded_weight.view(torch.uint8) + if layer.quant_config.weight_bits == 4: + loaded_weight = convert_gptq_int4_qzeros(loaded_weight).T + else: + loaded_weight = loaded_weight.T + 
1 + else: + loaded_weight = loaded_weight.T + + # repeat the qzeros/scales to fit new group size + if ( + layer.group_size_div_factor > 1 + and "qzeros" in weight_name + or "scales" in weight_name + ): + loaded_weight = loaded_weight.repeat_interleave( + layer.group_size_div_factor, 1 + ) + + if "w13_qzeros" in weight_name: + tensor = loaded_weight.view(layer.tp_size, -1, loaded_weight.size(1))[ + tp_rank + ] + if shard_id == "w1": + param.data[expert_id, : shard_size // 2] = tensor + else: + param.data[expert_id, shard_size // 2 :] = tensor + return True if return_success else None + elif "w2_qzeros" in weight_name: + param.data[expert_id] = loaded_weight.view( + loaded_weight.size(0), layer.tp_size, -1 + )[:, tp_rank] + return True if return_success else None + else: + # Delegate to the original loader, passing return_success + return weight_loader( + param, + loaded_weight, + weight_name, + shard_id, + expert_id, + return_success=return_success, + ) + + return moe_wna16_weight_loader diff --git a/model_executor/layers/quantization/mxfp4.py b/model_executor/layers/quantization/mxfp4.py new file mode 100644 index 0000000..b95d1a6 --- /dev/null +++ b/model_executor/layers/quantization/mxfp4.py @@ -0,0 +1,1162 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable +from enum import Enum +from typing import Optional + +import torch +from torch.nn.parameter import Parameter + +from vllm import envs +from vllm.config import get_current_vllm_config +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe import ( + FusedMoE, + FusedMoEConfig, + FusedMoEMethodBase, +) +from vllm.model_executor.layers.fused_moe import modular_kernel as mk +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEQuantConfig, + mxfp4_mxfp8_moe_quant_config, + mxfp4_w4a16_moe_quant_config, + ocp_mx_moe_quant_config, +) +from 
class Mxfp4Backend(Enum):
    """Kernel backends that can run an MXFP4 MoE layer."""

    NONE = 0

    # FlashInfer backends
    SM100_FI_MXFP4_MXFP8_TRTLLM = 1
    SM100_FI_MXFP4_MXFP8_CUTLASS = 2
    SM100_FI_MXFP4_BF16 = 3
    SM90_FI_MXFP4_BF16 = 4

    # Marlin backend
    MARLIN = 5

    # Triton backend
    TRITON = 6


def get_mxfp4_backend_with_lora() -> Mxfp4Backend:
    """
    Pick a backend known to support LoRA: Marlin on CUDA, otherwise none.
    """
    if current_platform.is_cuda():
        logger.info_once("[get_mxfp4_backend_with_lora] Using Marlin backend")
        return Mxfp4Backend.MARLIN
    return Mxfp4Backend.NONE


def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend:
    """Select the MXFP4 MoE backend for the current platform.

    LoRA requests are delegated to ``get_mxfp4_backend_with_lora``. On CUDA,
    FlashInfer backends are tried first (gated by device capability and the
    ``VLLM_USE_FLASHINFER_MOE_*`` settings), then Marlin or Triton. XPU uses
    Marlin, ROCm uses Triton when its kernels are available, and anything
    else yields ``Mxfp4Backend.NONE``.
    """
    if with_lora_support:
        return get_mxfp4_backend_with_lora()

    # Non-CUDA platforms first, so the CUDA logic below stays flat.
    if not current_platform.is_cuda():
        if current_platform.is_xpu():
            logger.info_once("Using ipex marlin backend on XPU")
            return Mxfp4Backend.MARLIN
        if current_platform.is_rocm() and has_triton_kernels():
            logger.info_once("Using Triton backend")
            return Mxfp4Backend.TRITON
        return Mxfp4Backend.NONE

    if (
        current_platform.is_device_capability(90)
        and has_flashinfer()
        and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
    ):
        logger.info_once("Using FlashInfer MXFP4 BF16 backend for SM90")
        return Mxfp4Backend.SM90_FI_MXFP4_BF16

    if (
        current_platform.is_device_capability(100)
        and has_flashinfer()
        and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS
    ):
        logger.info_once("Using FlashInfer MXFP4 MXFP8 CUTLASS backend for SM100")
        return Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS

    if (
        current_platform.is_device_capability(100)
        and has_flashinfer()
        and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
    ):
        return Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM

    if current_platform.is_device_capability(100) and has_flashinfer():
        logger.info_once(
            "Using FlashInfer MXFP4 BF16 backend for SM100, "
            "For faster performance on SM100, consider setting "
            "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1, though this may impact "
            "accuracy."
        )
        return Mxfp4Backend.SM100_FI_MXFP4_BF16

    if (
        current_platform.is_device_capability(100)
        or current_platform.is_device_capability(90)
    ) and not has_flashinfer():
        logger.warning_once(
            "MXFP4 MoE is enabled on Hopper/Blackwell but FlashInfer "
            "is not available. This may result in degraded performance. "
            "Please `pip install vllm[flashinfer]` for best results."
        )

    # No FlashInfer backend was chosen: Triton needs every condition below,
    # otherwise fall back to Marlin.
    triton_usable = (
        not envs.VLLM_MXFP4_USE_MARLIN
        and current_platform.get_device_capability()[0] >= 9
        and has_triton_kernels()
        and is_torch_equal_or_newer("2.8.0")
    )
    if triton_usable:
        logger.info_once("Using Triton backend")
        return Mxfp4Backend.TRITON
    logger.info_once("Using Marlin backend")
    return Mxfp4Backend.MARLIN
def __init__(self, moe: FusedMoEConfig):
    """Select the MXFP4 backend for this MoE layer and cache related settings.

    Args:
        moe: fused-MoE config; ``moe.is_lora_enabled`` selects the
            LoRA-capable backend path.

    Raises:
        AssertionError: if no MXFP4 backend is available
            (``Mxfp4Backend.NONE`` was selected).
    """
    super().__init__(moe)
    self.mxfp4_backend = get_mxfp4_backend(moe.is_lora_enabled)
    self.use_marlin = self.mxfp4_backend == Mxfp4Backend.MARLIN
    self.max_capture_size = (
        get_current_vllm_config().compilation_config.max_cudagraph_capture_size
    )

    # Adjacent string literals are concatenated verbatim, so each fragment
    # needs its own trailing space to keep the message readable.
    assert self.mxfp4_backend != Mxfp4Backend.NONE, (
        f"get_mxfp4_backend(with_lora_support={moe.is_lora_enabled}) found "
        "no compatible MXFP4 MoE backend (FlashInfer/Marlin/Triton). "
        "Please check your environment and try again."
    )
    # Permutation-index cache keyed by tensor shape, passed to the
    # FlashInfer permute-index helper during weight post-processing.
    self._cache_permute_indices: dict[torch.Size, torch.Tensor] = {}
+ # In gate_up_proj: + # n = 2 * intermediate_size_per_partition_after_pad + # k = hidden_size + # In down_proj + # n = hidden_size + # k = intermediate_size_per_partition_after_pad + intermediate_size_per_partition_after_pad = round_up( + intermediate_size_per_partition, 128 + ) + if current_platform.is_xpu(): + hidden_size = round_up(hidden_size, 128) + else: + hidden_size = round_up(hidden_size, 256) + + layer.params_dtype = params_dtype + layer.num_experts = num_experts + layer.hidden_size = hidden_size + layer.intermediate_size_per_partition = ( + intermediate_size_per_partition_after_pad + ) + elif ( + self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM + or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16 + ): + # pad the intermediate size to be a multiple of 2 * mxfp4_block + # for to hold non-uniform sharded tensor as well as swizzling + # other padding to increase performance + intermediate_size_per_partition_after_pad = round_up( + intermediate_size_per_partition, 256 + ) + hidden_size = round_up(hidden_size, 256) + elif ( + self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS + or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16 + ): + intermediate_size_per_partition_after_pad = round_up( + intermediate_size_per_partition, 128 + ) + hidden_size = round_up(hidden_size, 128) + elif current_platform.is_rocm(): + pad_align = get_padding_alignment() + intermediate_size_per_partition_after_pad = round_up( + intermediate_size_per_partition, pad_align + ) + hidden_size = round_up(hidden_size, pad_align) + else: + intermediate_size_per_partition_after_pad = round_up( + intermediate_size_per_partition, 64 + ) + + self.intermediate_size = intermediate_size_per_partition_after_pad + self.hidden_size = hidden_size + # Fused gate_up_proj (column parallel) + w13_weight = torch.nn.Parameter( + torch.zeros( + num_experts, + 2 * intermediate_size_per_partition_after_pad, + hidden_size // 2, + dtype=weight_dtype, + ), + 
requires_grad=False, + ) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + w13_weight_scale = torch.nn.Parameter( + torch.zeros( + num_experts, + 2 * intermediate_size_per_partition_after_pad, + hidden_size // mxfp4_block, + dtype=scale_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + + w13_bias = torch.nn.Parameter( + torch.zeros( + num_experts, + 2 * intermediate_size_per_partition_after_pad, + dtype=torch.bfloat16, + ), + requires_grad=False, + ) + layer.register_parameter("w13_bias", w13_bias) + set_weight_attrs(w13_bias, extra_weight_attrs) + + # down_proj (row parallel) + w2_weight = torch.nn.Parameter( + torch.zeros( + num_experts, + hidden_size, + intermediate_size_per_partition_after_pad // 2, + dtype=weight_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + w2_weight_scale = torch.nn.Parameter( + torch.zeros( + num_experts, + hidden_size, + intermediate_size_per_partition_after_pad // mxfp4_block, + dtype=scale_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + w2_bias = torch.nn.Parameter( + torch.zeros( + num_experts, + hidden_size, + dtype=torch.bfloat16, + ), + requires_grad=False, + ) + layer.register_parameter("w2_bias", w2_bias) + set_weight_attrs(w2_bias, extra_weight_attrs) + + def process_weights_after_loading(self, layer): + if self.mxfp4_backend == Mxfp4Backend.MARLIN: + prepare_moe_fp4_layer_for_marlin(layer) + elif ( + self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM + or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16 + ): + from flashinfer.fp4_quantization import nvfp4_block_scale_interleave + from flashinfer.fused_moe.core import 
get_w2_permute_indices_with_cache + + layer.gemm1_alpha = Parameter( + torch.tensor([1.702] * self.num_experts, dtype=torch.float32).cuda(), + requires_grad=False, + ) + layer.gemm1_beta = Parameter( + torch.tensor([1.0] * self.num_experts, dtype=torch.float32).cuda(), + requires_grad=False, + ) + layer.gemm1_clamp_limit = Parameter( + torch.tensor([7.0] * self.num_experts, dtype=torch.float32).cuda(), + requires_grad=False, + ) + sf_block_size = 32 # mxfp4 block size + + assert ( + layer.w13_weight.dim() == 3 + and layer.w13_weight.shape[0] == self.num_experts + and layer.w13_weight.shape[1] == self.intermediate_size * 2 + and layer.w13_weight.shape[2] == self.hidden_size // 2 + ) + assert ( + layer.w13_weight_scale.dim() == 3 + and layer.w13_weight_scale.shape[0] == self.num_experts + and layer.w13_weight_scale.shape[1] == self.intermediate_size * 2 + and layer.w13_weight_scale.shape[2] == self.hidden_size // sf_block_size + ) + assert ( + layer.w2_weight.dim() == 3 + and layer.w2_weight.shape[0] == self.num_experts + and layer.w2_weight.shape[1] == self.hidden_size + and layer.w2_weight.shape[2] == self.intermediate_size // 2 + ) + assert ( + layer.w2_weight_scale.dim() == 3 + and layer.w2_weight_scale.shape[1] == self.hidden_size + and layer.w2_weight_scale.shape[2] + == self.intermediate_size // sf_block_size + ) + assert ( + layer.w13_bias.dim() == 2 + and layer.w13_bias.shape[0] == self.num_experts + and layer.w13_bias.shape[1] == self.intermediate_size * 2 + ) + assert ( + layer.w2_bias.dim() == 2 + and layer.w2_bias.shape[0] == self.num_experts + and layer.w2_bias.shape[1] == self.hidden_size + ) + + w13_weight_scale = layer.w13_weight_scale.data + w2_weight_scale = layer.w2_weight_scale.data + w13_weight = layer.w13_weight.data + w2_weight = layer.w2_weight.data + w13_bias = layer.w13_bias.data.to(torch.float32) + w2_bias = layer.w2_bias.data.to(torch.float32) + + # Swap w1 and w3 as the definition of + # swiglu is different in the trtllm-gen + def 
swap_every_two_rows(x, axis=-1): + shape = x.shape + if axis < 0: + axis = len(shape) + axis + + # Create a new shape with pairs swapped along specified axis + new_shape = list(shape) + new_shape[axis] = shape[axis] // 2 + new_shape.insert(axis + 1, 2) + + # Reshape to expose pairs, swap them, and reshape back + x = x.reshape(*new_shape) + x = x.flip(axis + 1) + new_shape = list(shape) + return x.reshape(*new_shape) + + w13_weight_scale = swap_every_two_rows(w13_weight_scale, -2) + w13_weight = swap_every_two_rows(w13_weight, -2) + w13_bias = swap_every_two_rows(w13_bias, -1) + + # Do not interleave as the checkpoint is already interleaved + + # Shuffle weights and scaling factors for transposed mma output + gemm1_weights_mxfp4_shuffled = [] + gemm1_scales_mxfp4_shuffled = [] + gemm2_weights_mxfp4_shuffled = [] + gemm2_scales_mxfp4_shuffled = [] + gemm1_bias_shuffled = [] + gemm2_bias_shuffled = [] + epilogue_tile_m = 128 # FIXME: this depends on the kernel internals + for i in range(self.num_experts): + # w13 weight shuffling + permute_indices = get_w2_permute_indices_with_cache( + self._cache_permute_indices, + w13_weight[i].view(torch.uint8), + epilogue_tile_m, + ) + gemm1_weights_mxfp4_shuffled.append( + w13_weight[i] + .view(torch.uint8)[permute_indices.to(w13_weight.device)] + .contiguous() + ) + # w13 scale shuffling + permute_sf_indices = get_w2_permute_indices_with_cache( + self._cache_permute_indices, + w13_weight_scale[i].view(torch.uint8), + epilogue_tile_m, + num_elts_per_sf=16, + ) + gemm1_scales_mxfp4_shuffled.append( + nvfp4_block_scale_interleave( + w13_weight_scale[i] + .view(torch.uint8)[ + permute_sf_indices.to(w13_weight_scale.device) + ] + .contiguous() + ) + ) + # w13 bias shuffling + permute_bias_indices = get_w2_permute_indices_with_cache( + self._cache_permute_indices, + w13_bias[i].clone().reshape(-1, 1), + epilogue_tile_m, + ) + gemm1_bias_shuffled.append( + w13_bias[i] + .clone() + .reshape(-1, 
1)[permute_bias_indices.to(w13_bias.device)] + .contiguous() + ) + # w2 weight shuffling + permute_indices = get_w2_permute_indices_with_cache( + self._cache_permute_indices, + w2_weight[i].view(torch.uint8), + epilogue_tile_m, + ) + gemm2_weights_mxfp4_shuffled.append( + w2_weight[i] + .view(torch.uint8)[permute_indices.to(w2_weight.device)] + .contiguous() + ) + # w2 scale shuffling + permute_sf_indices = get_w2_permute_indices_with_cache( + self._cache_permute_indices, + w2_weight_scale[i].view(torch.uint8), + epilogue_tile_m, + num_elts_per_sf=16, + ) + gemm2_scales_mxfp4_shuffled.append( + nvfp4_block_scale_interleave( + w2_weight_scale[i] + .view(torch.uint8)[ + permute_sf_indices.to(w2_weight_scale.device) + ] + .contiguous() + ) + ) + # w2 bias shuffling + permute_indices = get_w2_permute_indices_with_cache( + self._cache_permute_indices, + w2_bias[i].clone().reshape(-1, 1), + epilogue_tile_m, + ) + gemm2_bias_shuffled.append( + w2_bias[i] + .clone() + .reshape(-1, 1)[permute_indices.to(w2_bias.device)] + .contiguous() + ) + + w13_weight = torch.stack(gemm1_weights_mxfp4_shuffled) + w13_weight_scale = ( + torch.stack(gemm1_scales_mxfp4_shuffled) + .reshape( + self.num_experts, + 2 * self.intermediate_size, + self.hidden_size // sf_block_size, + ) + .view(torch.float8_e4m3fn) + ) + + w2_weight = torch.stack(gemm2_weights_mxfp4_shuffled) + w2_weight_scale = ( + torch.stack(gemm2_scales_mxfp4_shuffled) + .reshape( + self.num_experts, + self.hidden_size, + self.intermediate_size // sf_block_size, + ) + .view(torch.float8_e4m3fn) + ) + + layer.w13_weight = Parameter(w13_weight, requires_grad=False) + layer.w13_weight_scale = Parameter(w13_weight_scale, requires_grad=False) + layer.w2_weight = Parameter(w2_weight, requires_grad=False) + layer.w2_weight_scale = Parameter(w2_weight_scale, requires_grad=False) + layer.w13_bias = Parameter( + torch.stack(gemm1_bias_shuffled).reshape(self.num_experts, -1), + requires_grad=False, + ) + layer.w2_bias = Parameter( + 
torch.stack(gemm2_bias_shuffled).reshape(self.num_experts, -1), + requires_grad=False, + ) + elif ( + self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS + or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16 + ): + layer.gemm1_alpha = Parameter( + torch.tensor([1.702] * self.num_experts, dtype=torch.float32).cuda(), + requires_grad=False, + ) + layer.gemm1_beta = Parameter( + torch.tensor([1.0] * self.num_experts, dtype=torch.float32).cuda(), + requires_grad=False, + ) + layer.gemm1_clamp_limit = Parameter( + torch.tensor([7.0] * self.num_experts, dtype=torch.float32).cuda(), + requires_grad=False, + ) + + sf_block_size = 32 # mxfp4 block size + + # Common shape assertions + assert ( + layer.w13_weight.dim() == 3 + and layer.w13_weight.shape[0] == self.num_experts + and layer.w13_weight.shape[1] == self.intermediate_size * 2 + and layer.w13_weight.shape[2] == self.hidden_size // 2 + ) + assert ( + layer.w13_weight_scale.dim() == 3 + and layer.w13_weight_scale.shape[0] == self.num_experts + and layer.w13_weight_scale.shape[1] == self.intermediate_size * 2 + and layer.w13_weight_scale.shape[2] == self.hidden_size // sf_block_size + ) + assert ( + layer.w2_weight.dim() == 3 + and layer.w2_weight.shape[0] == self.num_experts + and layer.w2_weight.shape[1] == self.hidden_size + and layer.w2_weight.shape[2] == self.intermediate_size // 2 + ) + assert ( + layer.w2_weight_scale.dim() == 3 + and layer.w2_weight_scale.shape[1] == self.hidden_size + and layer.w2_weight_scale.shape[2] + == self.intermediate_size // sf_block_size + ) + assert ( + layer.w13_bias.dim() == 2 + and layer.w13_bias.shape[0] == self.num_experts + and layer.w13_bias.shape[1] == self.intermediate_size * 2 + ) + assert ( + layer.w2_bias.dim() == 2 + and layer.w2_bias.shape[0] == self.num_experts + and layer.w2_bias.shape[1] == self.hidden_size + ) + + # De-interleave and swap for w13 weight, bias, and scales + w13_w = layer.w13_weight.data + gate_w, up_w = w13_w[:, ::2, :], w13_w[:, 
1::2, :] + deinterleaved_w13_w = torch.cat([gate_w, up_w], dim=1) + w1_w, w3_w = torch.chunk(deinterleaved_w13_w, 2, dim=1) + w13_weight_swapped = torch.cat([w3_w, w1_w], dim=1) + + w13_b = layer.w13_bias.data.to(torch.float32) + gate_b, up_b = w13_b[:, ::2], w13_b[:, 1::2] + deinterleaved_w13_b = torch.cat([gate_b, up_b], dim=1) + b1, b3 = torch.chunk(deinterleaved_w13_b, 2, dim=-1) + w13_bias_swapped = torch.cat([b3, b1], dim=-1).to(torch.bfloat16) + + w13_s = layer.w13_weight_scale.data + gate_s, up_s = w13_s[:, ::2, :], w13_s[:, 1::2, :] + deinterleaved_w13_s = torch.cat([gate_s, up_s], dim=1) + s1, s3 = torch.chunk(deinterleaved_w13_s, 2, dim=1) + w13_scale_swapped = torch.cat([s3, s1], dim=1) + + if self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS: + from flashinfer import block_scale_interleave + + orig_shape = w13_scale_swapped.shape + w13_scale_interleaved = block_scale_interleave( + w13_scale_swapped.view(torch.uint8) + ).reshape(orig_shape) + + w2_s = layer.w2_weight_scale.data + orig_shape = w2_s.shape + w2_scale_interleaved = block_scale_interleave( + w2_s.view(torch.uint8) + ).reshape(orig_shape) + + layer.w13_weight = Parameter(w13_weight_swapped, requires_grad=False) + layer.w13_weight_scale = Parameter( + w13_scale_interleaved, requires_grad=False + ) + layer.w13_bias = Parameter(w13_bias_swapped, requires_grad=False) + layer.w2_weight_scale = Parameter( + w2_scale_interleaved, requires_grad=False + ) + elif self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16: + + def _interleave_mxfp4_cutlass_sm90(w): + w_shape = w.shape + w_interleaved = w.reshape( + w_shape[0], w_shape[1], (w_shape[2] // 4), 4 + ) + w_interleaved = w_interleaved.permute(0, 2, 1, 3) + w_interleaved = w_interleaved.reshape( + w_shape[0], w_shape[2] // 4, w_shape[1] * 4 + ) + return w_interleaved + + w31_scales = w13_scale_swapped.to(torch.uint8).view(torch.uint8) + w31_scales_interleaved = _interleave_mxfp4_cutlass_sm90(w31_scales) + + w2_weight_scale = 
layer.w2_weight_scale.data + w2_scales = w2_weight_scale.to(torch.uint8).view(torch.uint8) + w2_scales_interleaved = _interleave_mxfp4_cutlass_sm90(w2_scales) + + layer.w13_weight = torch.nn.Parameter( + torch.cat([w3_w, w1_w], dim=1), requires_grad=False + ) + layer.w13_bias = torch.nn.Parameter( + w13_bias_swapped, requires_grad=False + ) + layer.w13_weight_scale = torch.nn.Parameter( + w31_scales_interleaved, requires_grad=False + ) + layer.w2_weight_scale = torch.nn.Parameter( + w2_scales_interleaved, requires_grad=False + ) + elif self.mxfp4_backend == Mxfp4Backend.TRITON: + from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig + + w13_bias = layer.w13_bias.to(torch.float32) + w2_bias = layer.w2_bias.to(torch.float32) + + layer.w13_bias = Parameter(w13_bias, requires_grad=False) + layer.w2_bias = Parameter(w2_bias, requires_grad=False) + + # Ideally we'd use FusedMoEModularKernel.prepare_finalize object + # (stored in self.fused_experts) to determine if the MoE has a + # batched activation format. As self.fused_experts is not + # initialized at this point, we resort to checking the MoE config + # directly. 
+ is_batched_moe = self.moe.use_pplx_kernels or self.moe.use_deepep_ll_kernels + if is_batched_moe: + num_warps = 4 if envs.VLLM_MOE_DP_CHUNK_SIZE <= 512 else 8 + else: + num_warps = 8 + + w13_weight, w13_flex, w13_scale = _swizzle_mxfp4( + layer.w13_weight, layer.w13_weight_scale, num_warps + ) + w2_weight, w2_flex, w2_scale = _swizzle_mxfp4( + layer.w2_weight, layer.w2_weight_scale, num_warps + ) + + self.w13_precision_config = PrecisionConfig( + weight_scale=w13_scale, flex_ctx=FlexCtx(rhs_data=w13_flex) + ) + self.w2_precision_config = PrecisionConfig( + weight_scale=w2_scale, flex_ctx=FlexCtx(rhs_data=w2_flex) + ) + + self.w13_weight = w13_weight + self.w2_weight = w2_weight + layer.w13_weight = Parameter(w13_weight.storage.data, requires_grad=False) + layer.w2_weight = Parameter(w2_weight.storage.data, requires_grad=False) + else: + raise ValueError(f"Unsupported backend: {self.mxfp4_backend}") + + def get_fused_moe_quant_config( + self, layer: torch.nn.Module + ) -> FusedMoEQuantConfig | None: + if self.mxfp4_backend == Mxfp4Backend.MARLIN: + return mxfp4_w4a16_moe_quant_config( + w1_bias=layer.w13_bias, + w2_bias=layer.w2_bias, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + ) + elif self.mxfp4_backend == Mxfp4Backend.TRITON: + w1_scale = self.w13_precision_config + w2_scale = self.w2_precision_config + return mxfp4_w4a16_moe_quant_config( + w1_bias=layer.w13_bias, + w2_bias=layer.w2_bias, + w1_scale=w1_scale, + w2_scale=w2_scale, + ) + elif self.mxfp4_backend in [ + Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM, + Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS, + ]: + return mxfp4_mxfp8_moe_quant_config( + w1_bias=layer.w13_bias, + w2_bias=layer.w2_bias, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + ) + elif self.mxfp4_backend in [Mxfp4Backend.SM100_FI_MXFP4_BF16]: + return mxfp4_w4a16_moe_quant_config( + w1_bias=layer.w13_bias, + w2_bias=layer.w2_bias, + w1_scale=layer.w13_weight_scale, + 
w2_scale=layer.w2_weight_scale, + ) + else: + w1_scale = layer.w13_weight_scale + w2_scale = layer.w2_weight_scale + return ocp_mx_moe_quant_config( + quant_dtype="mxfp4", + w1_bias=layer.w13_bias, + w2_bias=layer.w2_bias, + w1_scale=w1_scale, + w2_scale=w2_scale, + ) + + def select_gemm_impl( + self, + prepare_finalize: mk.FusedMoEPrepareAndFinalize, + layer: torch.nn.Module, + ) -> mk.FusedMoEPermuteExpertsUnpermute: + if ( + prepare_finalize.activation_format + == mk.FusedMoEActivationFormat.BatchedExperts + ): + if self.mxfp4_backend == Mxfp4Backend.MARLIN: + max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank() + assert max_num_tokens_per_rank is not None + assert self.moe_quant_config is not None + return BatchedMarlinExperts( + max_num_tokens=max_num_tokens_per_rank, + num_dispatchers=prepare_finalize.num_dispatchers(), + quant_config=self.moe_quant_config, + ) + else: + raise NotImplementedError( + f"Incompatible Mxfp4 backend ({self.mxfp4_backend}) for " + "EP batched experts format" + ) + else: + assert self.moe_quant_config is not None + if ( + self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM + or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16 + ): + # B200 code-path + kwargs = { + "gemm1_alpha": layer.gemm1_alpha, + "gemm1_beta": layer.gemm1_beta, + "gemm1_clamp_limit": layer.gemm1_clamp_limit, + # TODO(bnell): part of quant_config + "max_capture_size": self.max_capture_size, + } + return TrtLlmGenExperts(self.moe, self.moe_quant_config, **kwargs) + elif self.mxfp4_backend == Mxfp4Backend.MARLIN: + return MarlinExperts(self.moe_quant_config) + elif self.mxfp4_backend == Mxfp4Backend.TRITON: + return OAITritonExperts(self.moe_quant_config) + else: + raise NotImplementedError( + f"Incompatible Mxfp4 backend ({self.mxfp4_backend}) for EP" + ) + + @property + def allow_inplace(self) -> bool: + return True + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + 
renormalize: bool, + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if enable_eplb: + raise NotImplementedError("EPLB is not supported for mxfp4") + + if self.mxfp4_backend == Mxfp4Backend.MARLIN: + topk_weights, topk_ids, _ = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + ) + + return fused_marlin_moe( + x, + layer.w13_weight, + layer.w2_weight, + layer.w13_bias, + layer.w2_bias, + layer.w13_weight_scale, + layer.w2_weight_scale, + router_logits, + topk_weights, + topk_ids, + global_scale1=None, + global_scale2=None, + quant_type_id=scalar_types.float4_e2m1f.id, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + activation=activation, + expert_map=expert_map, + ) + + assert _can_support_mxfp4( + use_grouped_topk, + topk_group, + num_expert_group, + expert_map, + custom_routing_function, + e_score_correction_bias, + apply_router_weight_on_input, + scoring_func, + activation, + expert_load_view, + logical_to_physical_map, + logical_replica_count, + ), "MXFP4 
are not supported with this configuration." + + if ( + self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM + or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16 + ): + from flashinfer import trtllm_fp4_block_scale_moe + + if self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16: + assert x.dtype == torch.bfloat16 + x_quant = x + x_scale = None + elif self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM: + from flashinfer import mxfp8_quantize + + x_quant, x_scale = mxfp8_quantize(x, False) # to mxfp8 + x_scale = x_scale.view(torch.float8_e4m3fn).reshape(*x.shape[:-1], -1) + + trtllm_gen_output = trtllm_fp4_block_scale_moe( + router_logits.to(torch.bfloat16), + None, # routing_bias + x_quant, + x_scale, + layer.w13_weight, # uint8 (e2m1 x 2) + layer.w13_weight_scale, # uint8 (e4m3 x 2) + layer.w13_bias, # fp32 per expert per channel + layer.gemm1_alpha, # fp32 per expert + layer.gemm1_beta, # fp32 per expert + layer.gemm1_clamp_limit, # fp32 per expert + layer.w2_weight, # uint8 (e2m1 x 2) + layer.w2_weight_scale, # ue8m0 + layer.w2_bias, # fp32 per expert per channel + None, # output1_scale_scalar + None, # output1_scale_gate_scalar + None, # output2_scale_scalar + global_num_experts, + top_k, + None, # n_group + None, # topk_group + self.intermediate_size, # padded to multiple of 256 + layer.ep_rank * layer.local_num_experts, # local_expert_offset + self.num_experts, # local num experts + None, + None, + 1 if renormalize else 0, # routing_method_type, renormalize + True, # do finalize + tune_max_num_tokens=max(self.max_capture_size, 1), + )[0] + return trtllm_gen_output + elif ( + self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS + or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16 + ): + from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe + + topk_weights, topk_ids, _ = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + 
renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, + ) + + # Backend-specific preparation + if self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS: + from flashinfer import mxfp8_quantize + + x_quant, x_scale = mxfp8_quantize(x, True, 32) + + fake_input_scale = torch.ones(self.num_experts, device=x.device) + quant_scales = [ + layer.w13_weight_scale.contiguous().view(torch.int32), + fake_input_scale, + layer.w2_weight_scale.contiguous().view(torch.int32), + fake_input_scale, + ] + + fi_input = x_quant + extra_kwargs = dict( + use_mxfp8_act_scaling=True, + input_sf=x_scale, + fc1_expert_weights=layer.w13_weight.contiguous().view(torch.long), + fc2_expert_weights=layer.w2_weight.contiguous().view(torch.long), + ) + elif self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16: + assert x.dtype == torch.bfloat16 + + quant_scales = [ + layer.w13_weight_scale, + layer.w2_weight_scale, + ] + + fi_input = x + extra_kwargs = dict( + use_w4_group_scaling=True, + fc1_expert_weights=layer.w13_weight, + fc2_expert_weights=layer.w2_weight, + ) + + output = torch.empty_like(x, dtype=torch.bfloat16) + _ = flashinfer_cutlass_fused_moe( + input=fi_input, + token_selected_experts=topk_ids.to(torch.int).contiguous(), + token_final_scales=topk_weights, + output_dtype=torch.bfloat16, + output=output, + quant_scales=quant_scales, + fc1_expert_biases=layer.w13_bias, + fc2_expert_biases=layer.w2_bias, + swiglu_alpha=layer.gemm1_alpha, + swiglu_beta=layer.gemm1_beta, + swiglu_limit=layer.gemm1_clamp_limit, + tp_size=self.moe.tp_size, + tp_rank=self.moe.tp_rank, + ep_size=self.moe.ep_size, + ep_rank=self.moe.ep_rank, + tune_max_num_tokens=max(self.max_capture_size, 1), + **extra_kwargs, + ) + + return output + elif self.mxfp4_backend == Mxfp4Backend.TRITON: + from 
vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( # noqa: E501 + triton_kernel_moe_forward, + ) + + return triton_kernel_moe_forward( + hidden_states=x, + w1=self.w13_weight, + w2=self.w2_weight, + gating_output=router_logits, + topk=top_k, + renormalize=renormalize, + global_num_experts=global_num_experts, + expert_map=expert_map, + quant_config=self.moe_quant_config, + apply_router_weight_on_input=apply_router_weight_on_input, + ) + else: + raise ValueError(f"Unsupported backend: {self.mxfp4_backend}") + + +class IpexMxfp4MoEMethod(Mxfp4MoEMethod): + def __init__(self, moe_config: FusedMoEConfig): + super().__init__(moe_config) + self.moe_config = moe_config + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + super().create_weights( + layer, + num_experts, + hidden_size, + intermediate_size_per_partition, + params_dtype, + **extra_weight_attrs, + ) + self.original_hidden_size = hidden_size + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + import intel_extension_for_pytorch as ipex + + layer.w13_weight.data = layer.w13_weight.data.view(torch.int32) + layer.w2_weight.data = layer.w2_weight.data.view(torch.int32) + ep_rank_start = self.moe_config.ep_rank * self.moe_config.num_local_experts + layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE( + layer.w13_weight, + layer.w2_weight, + w1_scale_inv=layer.w13_weight_scale, + w2_scale_inv=layer.w2_weight_scale, + w13_bias=layer.w13_bias, + w2_bias=layer.w2_bias, + is_mxfp4=True, + experts_start_id=ep_rank_start, + ) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + 
custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + ) -> torch.Tensor: + assert activation == "swigluoai", ( + "Only swiglu_oai activation is supported for IPEX MXFP4 MoE" + ) + hidden_size_pad = round_up(self.original_hidden_size, 128) + x_pad = torch.nn.functional.pad(x, (0, hidden_size_pad - x.size(-1))) + hidden_states = layer.ipex_fusion( + x_pad, + use_grouped_topk, + top_k, + router_logits, + renormalize, + topk_group, + num_expert_group, + activation="swiglu_oai", + ) + hidden_states = hidden_states[..., : self.original_hidden_size].contiguous() + return hidden_states diff --git a/model_executor/layers/quantization/petit.py b/model_executor/layers/quantization/petit.py new file mode 100644 index 0000000..402cebc --- /dev/null +++ b/model_executor/layers/quantization/petit.py @@ -0,0 +1,320 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/modelopt.py + +from typing import Any, Optional + +import regex as re +import torch +from torch.nn.parameter import Parameter + +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import ( + LinearBase, + LinearMethodBase, + UnquantizedLinearMethod, +) +from vllm.model_executor.layers.quantization import QuantizationMethods +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, + QuantizeMethodBase, +) +from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod +from 
vllm.model_executor.layers.quantization.utils.petit_utils import ( + apply_petit_nvfp4_linear, + prepare_nvfp4_layer_for_petit, + verify_petit_nvfp4_supported, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped +from vllm.model_executor.parameter import ModelWeightParameter, PerTensorScaleParameter +from vllm.platforms import current_platform + +# Initialize logger for the module +logger = init_logger(__name__) + + +# Configuration class to support the NVFP4 quantized model +# generated by the ModelOpt quantization tool +class PetitNvFp4Config(QuantizationConfig): + """Config class for Petit FP4.""" + + def __init__( + self, + is_checkpoint_nvfp4_serialized: bool = False, + kv_cache_quant_algo: str | None = None, + group_size: int | None = None, + exclude_modules: list[str] | None = None, + ) -> None: + self._check_hardware_support() + self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized + if is_checkpoint_nvfp4_serialized: + logger.warning( + "Detected nvfp4 checkpoint. Please note that the " + "format is experimental and subject to change." + ) + self.group_size = group_size + self.kv_cache_quant_algo = kv_cache_quant_algo + self.exclude_modules = exclude_modules + + def _check_hardware_support(self) -> None: + """ + Verifies that the current hardware is supported by the Petit backend. + This backend is specifically designed for AMD GPUs and is not + supported on the CUDA platform. + """ + # This check ensures the code is NOT running on an NVIDIA GPU. + if current_platform.is_cuda(): + raise ValueError( + "The 'petit' quantization backend is designed for AMD GPUs " + "and is not supported on the CUDA platform. For NVIDIA GPUs, " + "please use a different quantization method such as FP8, AWQ, " + "or GPTQ." 
+ ) + + @classmethod + def get_name(cls) -> QuantizationMethods: + return "petit_nvfp4" + + @classmethod + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.bfloat16, torch.half] + + @classmethod + def get_min_capability(cls) -> int: + # Petit supports the gfx90a and gfx942 GPUs + return 90 + + @classmethod + def get_config_filenames(cls) -> list[str]: + return ["hf_quant_config.json"] + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "PetitNvFp4Config": + qc = cls.get_from_keys(config, ["quantization"]) + + quant_method_raw = qc.get("quant_algo") + if not isinstance(quant_method_raw, str) or not quant_method_raw: + raise ValueError("Missing or invalid 'quant_algo' in quantization config.") + quant_method = quant_method_raw.upper() + + group_size_raw = qc.get("group_size") + if not isinstance(group_size_raw, int): + raise ValueError( + "Missing or invalid 'group_size' (int) in hf_quant_config.json." + ) + group_size = group_size_raw + + verify_petit_nvfp4_supported(quant_method, group_size) + + kv_cache_quant_algo_raw = qc.get("kv_cache_quant_algo") or "auto" + if not isinstance(kv_cache_quant_algo_raw, str): + raise ValueError("'kv_cache_quant_algo' must be a string if provided.") + kv_cache_quant_algo = kv_cache_quant_algo_raw + + exclude_raw = qc.get("exclude_modules", []) + if exclude_raw is None: + exclude_modules: list[str] = [] + elif isinstance(exclude_raw, list) and all( + isinstance(x, str) for x in exclude_raw + ): + exclude_modules = exclude_raw + else: + raise ValueError("'exclude_modules' must be a list[str] (or omitted).") + + is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method + + return cls( + is_checkpoint_nvfp4_serialized=is_checkpoint_nvfp4_serialized, + kv_cache_quant_algo=kv_cache_quant_algo, + group_size=group_size, + exclude_modules=exclude_modules, + ) + + @classmethod + def override_quantization_method( + cls, hf_quant_cfg, user_quant + ) -> QuantizationMethods | None: + if not 
current_platform.is_rocm(): + return None + + qc = hf_quant_cfg.get("quantization", hf_quant_cfg) + algo = (qc.get("quant_algo") or qc.get("quant_method") or "").upper() + if algo in ("NVFP4", "MODELOPT_FP4", "MODELOPT"): + return cls.get_name() # "petit_nvfp4" + return None + + @classmethod + def is_petit_nvfp4_compatible(cls, quant_config: dict[str, Any]) -> bool: + qc = quant_config.get("quantization", quant_config) + algo = (qc.get("quant_algo") or qc.get("quant_method") or "").upper() + return algo == "NVFP4" + + def is_layer_excluded(self, prefix: str, exclude_modules: list[str]) -> bool: + for pattern in exclude_modules: + regex_str = pattern.replace(".", r"\.").replace("*", r".*") + if re.fullmatch(regex_str, prefix): + return True + return False + + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional["QuantizeMethodBase"]: + from vllm.attention.layer import Attention # Avoid circular import + + exclude = self.require_exclude_modules() + + if isinstance(layer, LinearBase): + if is_layer_skipped(prefix, exclude) or self.is_layer_excluded( + prefix, exclude + ): + return UnquantizedLinearMethod() + return PetitNvFp4LinearMethod(self) + elif isinstance(layer, Attention): + return PetitFp8KVCacheMethod(self) + return None + + def get_scaled_act_names(self) -> list[str]: + return [] + + def require_group_size(self) -> int: + if self.group_size is None: + logger.warning("group_size not set; defaulting to 16 for NVFP4.") + return 16 + return self.group_size + + def require_kv_cache_quant_algo(self) -> str: + return self.kv_cache_quant_algo or "auto" + + def require_exclude_modules(self) -> list[str]: + return list(self.exclude_modules or []) + + +class PetitFp8KVCacheMethod(BaseKVCacheMethod): + """ + Supports loading kv-cache scaling factors from FP8 checkpoints. 
+ """ + + def __init__(self, quant_config: PetitNvFp4Config): + super().__init__(quant_config) + + +class PetitNvFp4LinearMethod(LinearMethodBase): + """Linear method for NVFP4. + Supports loading NVFP4 checkpoints with the following structure: + + |Tensor Name | datatype | shape | + |----------------------------------------------------| + |input_scale | torch.float32 | scalar | + |weight | NVFP4(SE2M1) | [1, X, y/2] | + |weight_scale | FP8-E4M3 | [X, Y] | + |weight_scale_2 | torch.float32 | scalar | + + The weights are quantized per block of 16 elements. + Args: quant_config: The ModelOpt quantization config. + """ + + def __init__(self, quant_config: PetitNvFp4Config): + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + del input_size, output_size + if not self.quant_config.is_checkpoint_nvfp4_serialized: + raise ValueError( + "NVFP4 quantization was selected, " + " dynamic quantization is not supported." 
+ ) + + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + + layer.logical_widths = output_partition_sizes + + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + if input_size_per_partition % 16 != 0: + raise ValueError( + "Unsupported model when in features size is not multiple of 16" + ) + + weight_dtype = ( + torch.float8_e4m3fn + if self.quant_config.is_checkpoint_nvfp4_serialized + else params_dtype + ) + + weight = ModelWeightParameter( + data=torch.empty( + # 2 fp4 data is packed in one uint8 in the input dimension + output_size_per_partition, + input_size_per_partition // 2, + dtype=torch.uint8, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight", weight) + + input_scale = PerTensorScaleParameter( + data=torch.empty(len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader, + ) + + layer.register_parameter("input_scale", input_scale) + + weight_scale_2 = PerTensorScaleParameter( + data=torch.empty(len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader, + ) + layer.register_parameter("weight_scale_2", weight_scale_2) + + group_size = self.quant_config.require_group_size() + weight_scale = ModelWeightParameter( + data=torch.empty( + output_size_per_partition, + input_size_per_partition // group_size, + dtype=weight_dtype, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + + layer.register_parameter("weight_scale", weight_scale) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + input_scale_2 = layer.input_scale.max().to(torch.float32) + weight_scale_2 = layer.weight_scale_2.max().to(torch.float32) + layer.input_scale = Parameter(input_scale_2, requires_grad=False) + layer.weight_scale_2 = Parameter(weight_scale_2, requires_grad=False) + layer.alpha = Parameter( + 
layer.input_scale * layer.weight_scale_2, requires_grad=False + ) + + prepare_nvfp4_layer_for_petit(layer) + del layer.input_scale + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + return apply_petit_nvfp4_linear( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + weight_scale_2=layer.weight_scale_2, + size_n=layer.output_size_per_partition, + size_k=layer.input_size_per_partition, + bias=bias, + ) diff --git a/model_executor/layers/quantization/ptpc_fp8.py b/model_executor/layers/quantization/ptpc_fp8.py new file mode 100644 index 0000000..26ba8e5 --- /dev/null +++ b/model_executor/layers/quantization/ptpc_fp8.py @@ -0,0 +1,137 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Any, Optional + +import torch +from torch.nn.parameter import Parameter + +from vllm import _custom_ops as ops +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod +from vllm.model_executor.layers.quantization import QuantizationMethods +from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase +from vllm.model_executor.layers.quantization.fp8 import ( + Fp8Config, + Fp8KVCacheMethod, + Fp8LinearMethod, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + GroupShape, + is_layer_skipped, +) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp +from vllm.platforms import current_platform + +ACTIVATION_SCHEMES = ["static", "dynamic"] + +logger = init_logger(__name__) + + +class PTPCFp8Config(Fp8Config): + """Config class for Per-Token-Per-Channel Dynamic Quantization Fp8.""" + + def __init__( + self, + activation_scheme: str = "dynamic", + ignored_layers: list[str] | None = None, + ) -> None: + if not current_platform.is_rocm(): + raise ValueError("ptpc_fp8 quantization is 
supported only on ROCm.") + + if not current_platform.has_device_capability(94): + raise ValueError( + "ptpc_fp8 quantization is supported only on AMD Instinct MI300 GPUs and newer." # noqa: E501 + ) + if activation_scheme == "static": + raise ValueError("ptpc_fp8 as of now only support dynamic quantization.") + + super().__init__( + is_checkpoint_fp8_serialized=False, + activation_scheme=activation_scheme, + ignored_layers=ignored_layers, + ) + + @classmethod + def get_name(cls) -> QuantizationMethods: + return "ptpc_fp8" + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "PTPCFp8Config": + activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) + ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None) + return cls(activation_scheme=activation_scheme, ignored_layers=ignored_layers) + + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional["QuantizeMethodBase"]: + from vllm.attention.layer import Attention # Avoid circular import + + if isinstance(layer, LinearBase): + if is_layer_skipped(prefix, self.ignored_layers): + return UnquantizedLinearMethod() + return PTPCFp8LinearMethod(self) + elif isinstance(layer, Attention): + return Fp8KVCacheMethod(self) + return None + + +class PTPCFp8LinearMethod(Fp8LinearMethod): + """Linear method for Per-Token and Per-Channel FP8 Quantization. + Only supports loading quantized BF16 model checkpoints with dynamic + activation scaling. To load FP16 model checkpoints, user must specify + to convert the FP16 model weight loading into BF16. + The weight scaling factor will be initialized after + the model weights are loaded. + + Limitations: + 1. Only support float8_e4m3fnuz data type due to the limitation of + torch._scaled_mm (https://github.com/ROCm/pytorch/blob/8c0504d7f3fb0ee4c278c096a5c3caedb01129fa/aten/src/ATen/native/cuda/Blas.cpp#L1041) + + Args: + quant_config: The quantization config. 
+ """ + + def __init__(self, quant_config: PTPCFp8Config): + assert current_platform.is_rocm(), ( + "PTPCFp8LinearMethod is only supported on ROCm." + ) + super().__init__(quant_config=quant_config) + # Force weight quantization + self.quant_config.is_checkpoint_fp8_serialized = False + self.fp8_linear = Fp8LinearOp( + act_quant_static=False, act_quant_group_shape=GroupShape.PER_TOKEN + ) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False) + + assert layer.weight.data.dtype == torch.bfloat16, ( + f"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. {str(layer.weight.data.dtype)} is specified." # noqa: E501 + ) + # Quantize the weights. + qweight, weight_scale = ops.scaled_fp8_quant( + layer.weight, scale=None, use_per_token_if_dynamic=True + ) + + # Update the layer with the new values. + layer.weight = Parameter( + qweight.t(), requires_grad=False + ) # Pretranspose the weight + layer.weight_scale = Parameter(weight_scale, requires_grad=False) + layer.input_scale = None + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + return self.fp8_linear.apply( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=None, + input_scale_ub=None, + bias=bias, + ) diff --git a/model_executor/layers/quantization/quark/__init__.py b/model_executor/layers/quantization/quark/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/model_executor/layers/quantization/quark/__pycache__/__init__.cpython-312.pyc b/model_executor/layers/quantization/quark/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f20a6cc65932290a1225ce7628ba8c780d8a1554 GIT binary patch literal 190 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJV#p;*j7U&mb7U-5F 
z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?E|R?rWO_J z7nUaGm1I^WmSh6e0Xaq4`tk9Zd6^~g@p=W7w>WHo+Dda$?TT1|wlM;6F^KVznURsP Ih#ANN04)qM-2eap literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/quark/__pycache__/quark.cpython-312.pyc b/model_executor/layers/quantization/quark/__pycache__/quark.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3979479ef4cce8d48addafd7e9fbc7d813021af7 GIT binary patch literal 22202 zcmd6PX>c1?npih3;s!{72Y5Ec6Ql%6rbLkxbNGRL|1qhEW4N*M`d?#!(|F(}hhD^Qf8R^LT~Xusc#YT1oPza8;yww3_72;hKnN z)Dx*4t&P-;)q)sK+z@FTZHzRHHj#U4n2R)zHj}(9yeZN$+QKq|M#M4ZpV`OR z(N+y37=^YEVWxPEwriL}4DYzf@J^xPL#|@FPMSX<#4b+qq@?=n#Fgp5L@X2+_`-((#PuS5 z5Wv1@lc-lCRTLKQpPU#AjjLrs9x?zZ#~zNI+aCyClpjE$9h!@mVQ91}%+NwgPVPVD zKXG2ZISnl$f@CF6UhW_6KRDGd+0?ru6S006iIo?GDSvP>G8GU7$rZggIUVNxq49}H zQSgTYGh``jFAJgZi?Qg5z|<5ea0aJEQJ9GNr^12Q*rXVd$_e1Y*$3%sqp3;0SzFLB z%$tDE0yE0;%&5l1;6J{V*9clcC+NpC9;nJ|1%sd?IW}Gf_qw8|v`jDP#*A~!7|R=O z+D6OxHr{wsH)`a|psneWyRdtjM|fiz-UPKQf)(o7;Ldzg_n};M)DC41D6>EfCmB~e zZ-sn0Zv%*}8zEk5-&h9NsRKA}-a+~^@J@hG3H_B9&7q8E#&mo|LGM*$O!B;86t042 z)qDr<7HoW_Py?`Q#teMbP5r2cuP%Z~*QKUJ8ExZ%k<@<6Hnq^E?pwC0gEsYiJ*-2+ zw^+Bb!g0e48v#nghE;2VRpa1ETL37|GZ+a`V!c}F^)tQbISfl_Xcxz3I@ zpJu$B@)n{yb40RU!Cefyglts%<*WW6?nnQ)I5|BfIn;Z(f@LZYyzGw<0ufZ`I3fyC zGW=V+0Lh=Q_m~*`((9H!!Tb>FD!9lp3C#`7I5VqBunEQwyM|r~4cy5Tkzo>AxYMc6 zba1EkH^ba#N4y3}ADxCxD;fQMLf`yiJ=7N)Flof32@NUN`=Jv?mss>X`~nZHWcMaUr(l`t;6${>wXso^a?w&(sW1$BCY+;c%oUGRX^J zzi?d$PRAz29zs;2J!DBkaY7jJM!ZZ@-_(rQ0^P*jgrd+_;8O9Dlj5RC9+N{Nsv@~ZMn&w8b^__FN6{9EHxEX)zvQ0kxt%Kv5NwS#E|yFMPuT05Xg^URzVGd02YXF7svdu zNk3NgM~JjF$iZ)v8xtq-;rjMmCqf3|JRK2u6&6lV?Sh*eBi&!2=NMYarB7D&H_E zsfDbZ%u)TDiVvc0Y30WZZ2jcU#8YopN{o(!J%@3v>%Irg4_xWi19whH z)vej#k)K~l4ZoPI*|j|Uux8Jr-FuSmoy+V)_h7D?sr2P)nTCBVaxD*`7m}WEPsuO^ zh!wgn;`Rv^QUl!MgoeCJXb8`Ow#Y=-SuM-t_g$V6#Eej%Tx1_Hm(-oDlrzkw68oCh zehBdKQuZy-8V4;Q`w=9OOHSoJ|ND_n(a=P5%%zkowCgkzL+(JX1RJYT@=70K8$iW@ zuDd`&(tM;>Y9Vzbu zGTVlEFbpm$+2|iZ`ljhk+ne@#fM%EEZAt3L37==28c-@MF>FUJUZZT|H`r$&51NwD 
zH%3(v=vNgi z@+2;%5+rJ5c*1HKls?rF8De09kVj;=q>?SAEU&aFEoaLL##g468gJPysRS05 zL_1^EO8J}(3>#%PjRk$H^dfRn}gbMCa8I} zjtY)%nP$xe<1oQEOgD`Q?Z#e3O~QOv^LB#{n9!_w*7TM=1~BL~XNPv?o5oxAjqRd^ zqqQh=c!RxV=k-Y6OX8VTsfPr<2#amX6NCT)Fubi<3(q7hyp;&ACS4)(dlt+vlpKLw z=|{y3Yr^_{J@jOyJ!u$1>b1atbRRiB)X|kV4QtEFD|C(VI!7dfa2-{s_^C4>ywN;& zS(xFd^^LnAfDB3a01BH#c%u|xS|pNY6h%5h?>IsO(e6kEL#oyz3nR=$s-SFwQc9VM z6jBIe2fgsOhHA#2u*Vn{6psY+Y8R{^n^%8tsrwLCw-%mh@-(EBeu?Z(S))H?@3AcN zcD;^yd$SH0-#^t}W2p+QC5#9(&xuGN1}dmU&VV^q0F$Q1w^gzfj9=1D1!9s_HMT^B zSQJ+n#MT>B^)?NS3lp%M;+}X{Av-|XSr#>YL*dC_AROJ}Q|taIz=-xk^1J_mR)rj+ ztFHLGV`$#8+SIae{FhDJ=CzOYOl5t>-H~#4ELxUMXWRpinzrSjaPYH^q2H5H{G+YC zZ2TU+&yp{OyBGk~0-mbFC$=Mw2>k-6AskF61`K{PcVRllO-!m4l~+u447pa6y9U|_ zs4deIDvHRcP^gQ@_~I+FBcMu1-uY3;6ufhMuLbYClvUvt7^616*TQ$I5mDh1zXyp| zD+b_4DudSs6P%EAM3X}1qr5RufSYggu0DYJQABV0u98*ff6=%+{-bM|>aJ9E*W&i2 z>P+>(FB;`n!nLqPGgqq}xoGtNYAyZ(phvetqTsEz+WtA`y>s}nmML$T zKc8&xOEv6V_B`13SM|w*=TZYNXPoChAc~XXIw@GvhJ^`93tLOv3pGU~Dv}Q9aa1yf zK(oF8Pb6(%Vn#&6BjmziuS491RY$SFMk%=behWx2F|WRgRmuzK7!lf|lB2K)`@V=3 zO=QOY;Mlkr#(M{ynqR>gk%v+-0!b9rEuek$`X2o3+egWurGVbWt6waTiwhnk5JV1m z!QX@RtETa%=6B2s{pp&`bGEFdCTZbTE9!F^wtUZ;R_n6o7_H6z*q9fZ8>rBn+;uGN zJ)ZWQ0C~Bp_TAnETZY?~;jSI~WEiKu`O>dq2qONiN zwRD|t?)YkR+xz@Sy??1)?0;`!Y3!3rKf9D1KAY}4mu?0DxN((hO}6dKv<;=&hSF_& z(%jypwPCffIk~Af-MD@3WX=GHo^iINoNWt}E1-?Ls@9Ntf6>vscygsDZg<*t^m({Iy=^v47q%eD~y?7v{YHDUGxR zk`>qauk=iLZMI|hE37kj1c14#*BH9p*7WfEn9;g%eXtKre>;eYJ{9HlImXolQE=u9=yJ_B+PxmTh;8NssrTvn!{C zlE=`#<^Am+X+Ju>Xnb!^viCry_eiSu$j`m$-ZSaOvoffz?DqaUjwH7=<=pnUqay2R z%GPhQptPFI3t^gsi_m` zOrg2ejVc!1kp8g{>>o6e#VPOGPuwKFQZ@b~f%{l4=Eo#`2}s(Sgm7z5E?u0s4eStOmZR zG58~YqUE6`AMx-)gdsk-5G-TtJp=8>f?Y3ay%ycy5dlxJ(&^W5CgY|W;HiB!$@ zxg(!B-0!@yTD2+b-kYu3xol7FJoG?Jx{qe7x)ui)H!lT}?*6s1f?elsNLuS3xoRke zS+5ZXHZ>WbqaWEdA+<$Vh8RY$41y<65N5xR3Nt&yK0#Gf)n{Id>gjb>bfe#{r1G$? 
zF5qTqWnEQ>t;{T7W=U9jMG@f*R}pWuBH$({V_+4Bv6k*%g)6c%WlyM+hik<7%o-8~ z@P?X34oZ6B=aGzKQ~iEnd!!fb!G3-QwtXllm0h8@a9<;t!PO|sL6QaY!Ha>32_bw& z#ElCw5%OYh8LPv4u=1SNUM{z(KIz!RGmb|qW-vaXt(k+FF) z*7}sSe*WdOwQaFBW$pcnVQo9J&boV!yN-pjw6k;ZXv*0KN`k2p>T{bjO+Bfmo+WF# zX?L=2Hz0hrt1r_vnCco_9!Pf`02X_c1v;(mHYbf8Uz&llrJA>;`p#m12F6-_d+)8i z^FwHFa#ce?YtILJ-`l%1kZ#%ipgz@dB5CBZjt*!_5WX~4@B9A%t9kv^r|%OLPk95u z!6)qj)vRJgcok3FcLnuerfqM^zXtdP6&|U**F17i`~l*u z4v1{O_(zZxZ)1W8F8&aon4&Wg5TwOW@QnD!P_Pl-ZGdRlTt;{k+*0L1!Ysvo$pu-N zZm9qX^DIK=Dnw^_w@?^ z97=CL4tC1a_H!Vmn5sa^Q8&aZX=_Ka`*6~F7vUvTc)Nn zRnxgxpRVaky8FIjG?nJ%Km50^|JCbRYt5Qgvk$j}X2?8eg8kFfO$DgMKTJ37T8n`tv<#YpSeTI7CH}$QO;-~bjS}OUnr}n+B-Z%EGZilDzZBki~`Kf)ctM`q4tJc7$ z^sQO~dD~Ss>UzS87uEX4epTBZ_B-PMD_#6B*+H_$RTkwyP=g>l?!k*{0Xsxgi1&z+mWCWV)jXW1W z_)Opl_a}=6a5i%U4z!kY0PzeMTMRTn6C@Jh!t^+mC$#J23E8kBGkh~r$jRW8KXN@@ zMFt{kDEV>^aK@+=l691D)mm};_^sop3HLqLf)WfQ$L)cnh}Z_y-Ri{a?_wNzaKD>xj(7YciI)l%;O|c-rD!eCeU3XKB|bdw;g~!BD#QRPxNZ zhrQ?I9+RFISFA6slQq7Zwr*ZB6!i64+TmMjFYN2=iuD|AWmWn*l(w{jF1p41C8M*N zVeEF#ZT~GlNG`UvV1tp?@@UM|lg+4)3#isC7FyG1gEs*iQSi+|Jc z&8{b&#C4)Z20q+EM;1WF>nbD8lzx)XqA?C+VI8(;Y&uLZ2jh$UJKScZLZ}`DO(M#N0R&7IbEuX27MAAA5eps{0345w;Uv zO6yr`L5qLb*_t=>WlLBSHoz%;0&@S4*IH#+>P+nkyX^GQ%<8XE_YYtZIDf1o;tg1n z-~4a5ip(#nZ5lmR1-5DMHWu5a!75Fx(@JNuf29)a(JZk?YpDHMybOS3>oc__+M*W& zA}?EQN*SMfi-_Y`Xad6yz%v680o2q}!h7-xQqE@RO0hy~s_ma0Ik$ZNN7niC3$sfv zraE?Joc+IOkj>7XdN#i8An{uS^2Xm{6Ef{gc3x1Mi4HBt@opb?_4x4<5LI<$IwV3= z7MNm5z*bZs#%452b5s}gkyQ}U`dZ{S*Fgb#aM$)g0>>&qwZF=O`V7kcRosv=(t-~X zp0B^Kma^`k3TQ?owhJOb(d7Yw87JXK9ET)|^o^6bk5gVlP++MaMCP>$4|?h{uWvh! 
ztKAOJZ!lR`)%N#2|Ot_X_i`pBlt>w4hi}W4NPTI z#@(87w=SGcySQtmBtrmV$1uglc*rr^J&H`~;bX&Ov5f!?2LIsjHGQ1(0f!H!%9 z_UD~fDp<*a=#btp=x+6iOLPNC%4AllORA;LWWve@qRh^5v#nHv% zIffnBmD6e}cNN*L-OUT-DOXF<+VVwn+p5$3qtoxU+&O>OzM!Lq{tUMx#qCJ;9Zhq` zpo2|zu-boR)Y^tX!Sn9QbnZ)a?)zou^C@TR9qatqXReloqf6FLUArMd=8sRnoV@#4 z>Q)H8zkSj0PlnQ-t#e0L+q<69)yXt>3cBimu3BKdowdn^?J39hY_)GOo~j;9ItRbl z)c%zA$I{$!Xx{?un^4l}NLt%}|J7+0miG5B^agv>0`L73&q0gsud6% z{Gv4^!xaI;7O@OTwb_`BnI2Bgx!;lQJP1Z&=UD@e<5cnYa6^N>PM*K6#g7AUXF{vtV zprTDEhn5`jonkDD?--fHp2f=$N)6}HVxbEVeoZTL*I-m6+`-4aOf+se+!~xE&dm_X z4NA9KqPTorBa^Ws5pZopAiPrG3w*!C&zomd_i*5EVuGT9*oVnaF#$D$5fS@D#81VE zLcPAS;xF@3E1isdR|!)_dp26)@*Wb&t{4mGq)NC}(4|8k`U{-3>s%|PDx%8bYD4q;Eeo@mj{a0f|FS3Du|M7L!raKJi_6xuXFIyHU`q!_ zC%1K}F3s(JT&8n_e_97>tC_J^5X}~cV;fj?HO|j2hSRQHN$ajJ?BJSZZH?K+?s-2j zyDbgb2JiexPOq_T29DPIR>}f+FOudXjG|f4qH?e3qiwWN}bQhhbaQ# zVoI3^6S`9Rs6zS_4F|2#%HbUy5mCgyfuT|CMa1>$sepADONo?)GB6FJMY2zX9OB6n z&%kZ5MI?_|KSpdi2o(UE8k#c=5KGdx6o0TI-EbsntOdqdQI#p*k}BVl^zBQR56>B( z$YjeHTT{l?g>#v2j%IqS1Vv&vNP@~7jYFf>KcZY+?zsv+W_SX$ z77T+?6&R>ogdF?0BY1=aOrY?eIIkf>Z!>6Ncz})z#UK&~^s%eZb^;8RiK~#H=+k7&MBU6i) z`7V=Cp}1Sl6hF3M(EJ2EQfgmSYST=~+E>-uG*$;-(@e?QSLE7tc-+!Yx&w9Rg1=~( z6~2vSPC7R7fDCU00{@M|1Rq6CF8(Xn0-%Go!VlSIMLMVh_ccATN@rGAsOhhJ_~*Fu z=K&7rGZl>0d3*P*-SbT=V6Y4=Z%_I5g8%W}(A}XV*SG9QyU@{-wK)D^D+Ea3Hxapz27W#lOYcHu(^4PzZ zK`DvIk+oH{yos3j@IeeH&F^k9VC~(!HM?y`wyyO{i%sA0sH%3pcR@Sfe|I9MhkIC0 zPs9A_cT5W}fGaLnhD8|WH6L8)oxk`_<6>*deQdUQ>zWobxjwhvovp0B zcmD2qJV}w$;=OMh`F&(6Pgb<2OdV@_D9vpq0FSYH^MZRJxY)aJ@x8_zgZE1(z9PSJ zpIDR|)hPWhoLOvn@0E3f`qE(6dzOMZ27VrZjU0ZGFP&S%-yGMV?<`oYg;=VpJEw<2 zm?SJ>%Y1ac^X_*S_*7+At_+KfjM=_s!rY!({fq3zwpd!KF)635Nw(jocW-PBg{2x} zF>Q;v@3GtT9ogE3oECpOHh*yaz3U&m{@&~4{*lQx7rfIuckx!^njW6##@GhEd68f0 zUDhu3-=D}aaQmQP4S#ZbO!}$?{RhVPj2}4ObL1GfT|P~I9=y7SceyH;-o3Ct$H33x z{-xIYM}Lc1%I6UcgYc(bxHa%v;|QNuKK{V7f)+k>pn=2pI{4&+{*qFU)+XsC)l{4> zD-%q-Q7{W8q!hdf?jX-wpy#SQbR{HWc^i*`*m7!7g zthfqK71n_HLe@ZtT)dXBgTjw@s0Z%<4RQlCG=Ln~(uXGzIr@%?e2G+dq-aby9x|pu 
z$PkGRe%D{6WH%6@p)0(=11pbRL&R380hnG;fDaH*^(pEPj6py-e5obC(<5<&>*0Wk zruKJG5bxPIwWmURfb&JQ59VvM-L~GcW{l6JjL$9Y_|!N+W-ispdMh|$Nl+3r4akYeaYw_?baSEgBR@9m`579{C7` zS&%(Txz$Q48Z12~4X0EL3FAka59NMg-L+m_{9mfl(^=W9z8|!4u(m@?_!%1Eej0>D zkSF2T@nkRrN2Pg=SQ7*gBIP)5IAa_X1$d+aL%DH6z}FZMXQ&v5=N?e2r-CZ6Qv0~m zg1{AGB7Xmd)FDfyg5!l)AQaBas*+9Ctr08W8#QGjLD2sKt0z9f(`7?R%Rvky7<=sEU1G-xlkG*d-OYydD0$5`&!W{D|LTD*sJad zcm?(~z$b(a7kEgt@lVw^reG7re>}oeJaJ-uLM(X#x~YKHqaC&($b+sRa@?CJDS%hk zz~2R^7KP(7XzhZEDiGR$>`Nc_4M@i+$fmZAcML0rsmOFRh9|aoIFLOdpPI;T1h9(X z@&lqFjuBUhWTQvYWnxyOum1)sVFOZ~e4HY$uSChd>|CI9gJUmW2b0+m^N`9W7}`f6~}b$$B%ihtmxy z9K%CQ(wGq8vITQQ6#gaVGMFHZ6+gvVFb^@0JuX;lVaJu}H)yrM#Y289|_0`M9&lP?lkU?Y`@=b#{dj9}nt`URFK zCv!kJFS8}Dz@wH7AGLtby2xLR05Jkhs%ROQngnen>LXByd#*I^Mgu2)p6ChYu9BSQ zLwJ{X!G{J^S_9|Bm>+^ipcHS{Ni|?G2;rAD{K|PBS;(Q^CE5hQhXu-rpPE7jd z5seB7rrNYnDwT;~bTd>9#E7%tf`Bd6;{zK^GV*s$*1hsULb55Qsy$IdxH_E?s)D}_ zFCadmx-|;AQX3Wx5o}gOyxOhAu5acpPF@qPf_+Q`c_efdh&B!5y@QZ1o}387NnE7g5SblGjwW8xyY*0S#fMV#91LDD^rG~>p6dQ?pP#=LvPK1yD0KEc=H^L8{ z;2%LQN>N;v9CA3Y?iX<%7o+%-cF^w;<16!O4nE@HK-+&ZdmBCFN{cIFxp7 zp3`S7&O1jRS{h}y!1nC+fu)@}En7a6cMt##b#2O2w52NA7DDNYt#f5rOU0e=L(3-8 zN>jd%eYEe;@(ftg%a7v0Cp;~N)_5&rZ6P$$YhA}pq%yyspA7o_dA0&Ei6q{i+EmtC zV{rQ7XP8Kk6dFB{C=$wuC)uJya0f<@FEtvyF=y21I~M-nYX*LRdK)Tonkv10sdXv3 zynpHX&wLNIr=C0X6$52xFsOKpPbrH+nBFcD)B$H{W~Shsi}*HZOal^m8tj0aP_mFu zZP1WJ5#?y9m6Q-CoBGW&`UQ@tPyUq0q(HukAvHgxD(pY_;3WCphKTAH`38s7@st*Y zNH3xyE%iQQZQliuu@%}25Y*o-W@W@K2$-!VJV%#L3( z*8j@1{E8X)twzUcAfS`gd~Igf!|ZQ0TF64hlkC?T4XgRBLC z*7+;lo=29(q`B!v+2lK%tz=kvn= literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/quark/__pycache__/quark_moe.cpython-312.pyc b/model_executor/layers/quantization/quark/__pycache__/quark_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..613fbd2d8cfa66d8730d77b8fa246f6a3dd2429e GIT binary patch literal 26404 zcmd6QX>?oHnb^ZdVtF7y5?n!mk2^tel|+$}D9Tz$Yo%mKmK8QJ4Dmfmpa77504<3F 
z#!BirrBlyX>N1j=lZwt{VruNhbWeJs$)p{1nbz%e!4v6u$U97{WSq|UM^E91KFOIs z?f2ccV9}#Cd)j$6FYbQ7d*6Nc+pqr2>9kYuM9hCbTydVF{sI%~V^JaxMhz5oonk50 z5TT-maf1Ql#)vU$8aI)+DPkTsL!K#Oi586)kvwyxIBFfYk+>ydk2=O3BwiGuqt072_4r%JE8)XNy!ttH-NJ+#ack){fVb zxFh0?){WOied9iorX&7n{dhfzJ0ncAVZ0&QINlg-8gC+bC6VT6%XkZkyCSX8w(&L+ zcSqWzf$@NWGE$tA>v#|5jL-O{a}-NKvk?yBraiICWAN(;PRzy=)6wAcOdOIW{PaXL z7z!siUdeHUW8p+FGJWnG#|O-kb#yk)u}7yzq!Kl@Z#p&^J|~r_>7!iY{4~2a6z8Nm zC69ZF3!gimI5`oDa3^PHW~TWB#|BJNx!UH1*-$K@bW}4rGZ?%$7#a*lr@7#(SQ4C& z$Ecc~m!ZEA>dCUyY&4hT4)W=>Uq02C_$!T7x7cfet$Mzi$ z9z7G>d*s-@!@-jW$44Zml67+8JQu}&?9cPlvoj~phh{hcnub*jPDG~TT(FOo>SEJ; zG!zNXaY1ftbjxHcn3z^#vvX3#42+lHVsRLf<;Yy1CW0I162XgPRf3^Jg2%;oPt5W> z2eX}tgc1N`6!PdetXT;9xIDwfVek@IDgFYizr4y)1xdd&xMy&5X0R|@nTM7=A@3kL zv?+-PF^K#<_}DmV+`v-fM%KWYCXEZ!q=7ZQL64g`3u|J{tc5F@D#`6Yj!(HC&SQ%> z8(Yj$lUCOHhWS0E!nl2bx?~u4oTLIaz7{ei6TGGgV@`YA;`sNV8G?UoxK1Us!Z(dc z>UZFHNE+17#6Vq!T|EZx&=Q}8b(XBbAla-zSSNFwi%jx!Gs^I9Yl?TJpaTqMX{;wEMj z(|ixvK=B^sMTQb#s0RuDf*j&T>z=vHH^4Y^Rr#-~TZOI{3gftHzqvxKS)Fed3)boz z0q+5tdR}{CQK-G?m+fQjygbJ6^g$4Ws%xct=7#z2Q_=bH8cWpq*K*F>l_W zhDEDkff*F%!Y;>?=A=cP0bi9+H#CL$Ny8MuGWo64)AA$*Jv#HtFjdf_ORe!FJtj)k z+H8>u`8N%L;xVaYdNwgLn^4YyxMV&X4#nq=?~}jw$mLFkykv&#wK#JzoH)jQkPb6nJmNVR(_r;i2hfxn?CN3XtHpvs549D=` zOh68+?mH8F6;k=wbmCxiCIY8m3=ZrOo}cC=Iv9=z)gzks!lWfT9@NSaZO3BynC9!S zkUlXRON65wX&oryTOpHg#~^^kW%6Fm+w@N5U{t!K<@v~|coDQ>$iw3)A(a56aM?-TB2^ph^5+L2!l2(3HPJ^bHHX;X895ZCml3NLg#|JA9&}L2xvr8aJ;x zwxk9QiUUW5fukQyt`5AoR@Ra+8cVw$n9SuR8OrP|$=IoyO=5MoP~E+BX{CB_!I`cK zh*e!eRoBw=O4aZJecw?fI+_JX^WxW*kE}ZOtd+Mf`R>qnC&kg1gwdB)%Fm?eGihgy z=xi38awVsJrBBNIsrsRnvRx^97vBO?2+(-s;VBRdO7Kr-O(#BXS9k-vv35 z0GTA<$eNHBO0vL#HgR4u#$r;@QTbpo#S=Wg1L{jo<VP8(O~i482g-ppG*-|RF*MTSccP;O&JsX-gj4Aw_dYecU^O3ERcp8 z71fzyj9Dpr?XPSQdticA>)47+k&G|_ooi3G5_|AtIJ*kb76-^`(pUpkSR-f3Ntw8! 
z$s&T%%z)C00hw8+%CtVU$dpHmY7szY4z?Kntng=pcGX&u7MZlO_BYJq&Zp|v!P1jv z)(P!Oo~|9|V(B?J521Alw07I5NBze(Jyq{6m|>$fF)hNnH^?uARVm}jfI=-}JrF05 z0cEPjDj`<2L0h?QHCL{*s^QFRE$3mqkE~w@Bl)1FAAai}S(ABWX~QE+9r~qQ#RmDh zt0wP7@~%4A#y2eEm4pPU0urd19G(0itT4YF1Ef_Z%-Vue6drU0i&sKeKnxWr#E^z( z4L}$rsh~zo0d}W?*qS_m?Q(loa;G$msbs)U7UM&E4WC1Mgr?wmYVs--vzfG}7yXeg9+thb73J7SVCtqKUwxD6nf?ChegU-3cJ7MOI95{=!lt}_n zH+iNIPZvC_RjojXJJxl3RDtstZ@dIF-E3I;-pGwA`yO#Q2&qyI!vRlx|z}GS)kSa+StiR zcA$jV%YcQ$6LZ`KFmyt{3=f=P7`owr^5b}fhf&~xaTQQ5hT~i(qjV=%nZCfqRDzHI z)BWjJU~NVNl{^E#3eHz(DMDaLz|A9RND_P>RFX>6a%B}2;!#;tz8ve=j*pxOo;r4T zWK6P?=*ZZ~V<)5vSO$eZKZgu*FpeCuWIr8>%*y1RR4!8-z?b2$@@n$9QKgFWAy^&m zweSRxj-i>**>EJBxGb5u*lTgA_Vke>M}wzEPM!*m9y_sbBnWRWcy!N+BL@Kj;PVys zT`GnZMu`QFCynGm!c>+506ogZP}IRnl{q2}q*ImrKrxSKuHXRX4`ALw44NQ_BiA7l zn^$DAhj4a&LShy&-5P^;6kE;#tpp6-I!)P16c%sQ>Py!J7R;;GnzXkKf2;5N8%2MQ z;O`OrTLk}><%28!VJz^beGQ_oOYn7xzCOX%w;WjU4Pim;ecCP3^#WbLI3&Ti{m%>i&wsQ_=s$y%JfB&qmfp7x zi8OPMW_~y>_8$`Z4`GRO)!O)35moBFd10-r4k)_P!-o6iRbqLoP~Ms6`}dz;|iw#=`%cckbYzoHDzo;6pw=;{_+-5CqO_LghiPSJJHjo@fn zb+n}YO$+u*qZ}I!B{-55gjzw6870EV+edEBra&@4@fF9=VR3 zqag@M`AM1}rSaUzY3E>O;7zlbnFYKZ)Npe`!!dd|W8RWeu3C`I>-l2frL{9x?G13VC0VT20qmgR!enu>C?`KV zqb~_!fE4)q4Pdoy0Bcdwngdf7R%fm6Qqe8*u zy${<4bF?OHIdl9X>#hs!V_{Wj&pcS|8^CHy+Ox1Gi>B%e76+27J;A8CQ#u^Uln!kX z*5p-rJJOc>mUDKhJvaZVA!gcO2RO(MSkGC3r{BxEYpQ;IIX(Za-v>$4GeP@h?Sxi! 
zPh@|;?41C+K!*b5|1@XbU()-}=~4f^>wQy>4ZbNl>BxRlIlJ7EyUTM%{Qo%zdV@JQ zleF#}=-fG623;UvO${fJRHeEffGa2&`xF=Mwn4HG#61czr$S`MgXj1VE7>phZBg_u zlI`MVB}&LY$(=2eNmj|F)lf1NA-ya~BlIHFfgW?{T%fEs=cWTy{3`$yk5mHxG6a%+ znh&3oNeih|A;Zr`rY9~CN`&8yb)1ut=}=-&(J1lHVV>77+koJbfcbT$Q zLPeT8!E;E9Dcsgvvp%VvUn>cH#SZ|k1fgyj)py*mU|F?xrR&=8w?_K~nj6y9>Bd(4 ztzWCE6RSFes*VMEy@;yxUO#i~j9AeoRJ5(prEed8^KjbjO}k4|o-KlV3&txO*KHP% z`O=h^$+)P>_6L-;(!OAaF>NKHl@YAWV%^f*s&&U2Xmn~*Ed#68!B0Fj*AHAfAbMH_ zPit<+(05N^mlbW$WrcmgmfNLwDGdE!mo=&8ZL8J+kedP%;P49$|6=DoM>jyvwA{LM z^O9KKC)D??ftb~P&3p^rwOhx$I~p}u6{SY8L)?zmRz1NUja zNVf=dOS-gvwPBaouwQ7{FE$(!8V;r0hu2L;pz&c?U&_~&>e;hO?_HzoZUj@^&;2>Q zCk_0HlM$TE;?aA~zEA3#ADT_tQ=j%!q^bu5&%j!3{jIK>UF*f>(lQXh*EeUJWNyXs z+}uw(wuPou9Cs0%&ziQw|vtmA&xeoaP zxtho@yHxtCq%t~Rh21r8oHxyz^Ll5p?bR6c)2PO?zmTi54tc#>=8NWwpJ9cK1#e5+ zAJ5R2si3bMMe{dWpP@f&f{j2v)u|@A|f^|LCVcwS5q{0Nf zcB=90uX;l5r@r=g;R~>{)_F&^4QtCOe-+B<1ZrK$1O4fJdv;WUCLQpfbq->`Z3I&} z2xWzo{>+kc8yZ`xMEJZj(MN`XU~01(%l;;vNnqu3TYMGX$s5%cEa4>g3_(od8x6)HZ?Ydn!19AwEU+2YEJDes0>Ng zp}xm?(=&Wc8=o4wXW?TYD0&76HvXQeDSU|q5C6LfWMarO^_&vP`O)7eQ+so&C{d`% zJe;dnQ{B0^59eL;k5T*S;&E5O9{M6Dy}O_y~|1~8;j&Nx)? zz5-{}E5^%c_ePJ+9Rk)!HJW5Vo5IMtmOPD6Cv$c-0qQH%NoAE*s#*(GS7X2&cF#@8 zYO5UfHO?f?hZ35V5!x){lhZF68uT*2T4T!1*~P8jm2M` z<+wSH|0-sp9zk-&&(BUyg1wHy>Plwd%0oPA@Oa!MfzlkFoAA;ESi(VPAx7?8vV-D{ zXJwT}2~0VNq6k^pCfVfo5Dc?mCJFr~B&%E-GHmRt%IJ~-tcDbTSZppivZ>ASQWzzV zS;vITu={2}zs7-er813RXcFZRAhlrOt;}uF=ieXM=zqXy@fU%P19o0!b&+#QyF8++ zS#UL{T%F4nv3I}FyFb-?WUZ0`9aY6<&?HrrfhMW6Y~4zEm_>`o^b1V?YWcRGMAGgG z(cLV#n}JQ+ja+DHmFR8~+)ZH3RnfL?0zPuxPPKHrd*Gb|V)Hhkc^hfbBDh;bcR+9l zR@|HJ^aKCfG63vUO9|nXpqu8vyMylxicS4OQ@=Kb=x!I>?JMq%J1x+nc^kB7CM_5M z3{_Wcf}`zYM*zX-N_%QVPqW}@UgX5qA)$5X4kr$u6oyYeFd0kBJ~L6y9Set2)$L0? 
z0N{?(1{ej+rxq*3hHXN_wmTMa@Q5&Y1ZwXn5$J6T2d}=qR08AV$^(`=J{n2Y_pQ{Q zK)CDHV4i2*IU}|V2rUC?H?wR3Fk1%!%vJ(sBdoc*KIQ6IYw!BNdfO_t?-1H|q&-c` zxB`KlP%%(;)w1etff)d&uE`9RYd&g8`FmIV#}`K4I<^Ljle3)lODKWJT|XSgpK=A( z+B)C8^v)%*ZM)F6J?&{+#${{Y0mHPDVVXhSk`;HXIJ(p2)na)-C=aB{`%`p3gy|Z; zSTiit3@;o?`x-^x7Qweg^bH8UfjfIrzJV3ru;?2Rd?O2EYre+Cez9qX(6r;veLJt( z($zIrEo)56V(9zUs|VJY{&clZtZo-z#cPD>Ez2W!M(!RGhQ?A`$L<|}A+_}dq51@= zLM;9+3)S22K;?Z8ET)Fas|Vp7)zpdAokDfz(pT=3+`TLezOYh#0$yXq3B%P9wa>5CEZse`vj$YTcCk z&iqSU?mPRKZUmKYNm-5vC5>9sfZz(eKQ4Cd5xVx=bL~ktwTn&Lg{JLd(>|eT-`%|{ zP0y#?%vzxPgX-JWVqm8b*qQb;FONXq9Ye7G9c2Am0VrtMI3P3*h>d%M#yxi%R~kpI zx*oef04RKMt@j+QpR{ii+ed}=QSzBEf}0WD_<1ePzWe$+ugkh(*u(+Y#4QU)Zfq*t z!kr&+WDB2?w{WhgYS|0h8@C;7@83Cg-&y*0@LRz(x^wBpW!UaJ!00gLZh>Lp^Uv6* z5+9Lmy3894_GBTzJ=C$!~{V*w(`+r}(%isKro=%5zOZL>xg1RW}-!E?R(xM1TJ z``}H)5rcvq1)LUvbVaRLu}P@dL?RtRMMr)lAXEg>OtZ*r7MRV;QGwY<%qV&VU+?l! z!MA7KS?soFsA8L4whu5|0<{3lo#3l;VXzJWo7kD)1<3(!7^+)`gl3l^V(a?-krFh)n$Vj+M&c?u;2#pQm6QB`La-<3Z z8DMoFE=VT4IWWJj0y_n$hW6IrEJ7Lv(K1}B3(d?#E`vD?nUA7Qn2yOR2+5wM{v?NN z0u$tBB4;JH0vRIg;A>&-qErQxRT!xqaAcV|e>qO#2zZ5B0A4i_$ORC-ZoY=BXwB2FC~GHc3jOM@>-&!P95N7a z=mFcaW#ssgz4@LGgvWSA_jGxMxy&WHZ_|=fwfg}RcrMPr16}gp!{ECZ{5}SMfWh}M zxQ)RdVu0_4e;)z~ybxG0R*Mr?EW)jTR;ONn8s>sWHX3-hH2pRJJ$Zid?Q?mObyEZyIMSc-vAwR&bmNEDc zgMW^Jw&?sn#3Y&LU5Lf;J4L=q|B<;Z3pwQHyt;e6n-m^Hq{%^Wh5D?UqN~7ZCG8#o z;tCDzPzeWye_dkpPN8{cy4IVnZA<&Q(q&cY>ZWv61I8i0rX}4tKq5Y|rdO!xU9WMr z*ad6D0)6%Tq9Nng2)oaX&?VRsdOFkodP12u3FS=?xxV|_?zAtEu4zbnThd(z2`1hw z_%=gwz1;2rVCw`4C^hh6DsXy*KAkFWTZ8`Gr5fR= zus~_Ihx`wmU8~@3CB;Zlrro}@yDsgnUIXVI|DfO>6#Y8||4sw}xHK5t=fc5V&XUg? 
zDd%Cs&&!#OL9eb;peIeWIjmRF*az(_A%w(j*XY<7e+Ay(7}z7nrzaD@gr9>XJel2N zqx`>w2vivhRP!hn#74CI&~a;5oqC#F3}V za>Q`I%6ntqV)=~&*S?;yUSQ;D>`h z+Wq0~429{ti3j92bHHS`*n#pxh5hy5wO|I*_ic4I5`wMifdvwoR;sZr-O!S5+mvo> zPIq*tw+y7~AlZ=~fSs^8UCX5F8q&R6G1DWo^buq9v5KKI0-$H@G+5XuftRQ%aI$l-Iv#zS)r}CrJJXKF~Yma3}FcuBO5Vp=jj67|D0->rM6 zZrLm}Y|Ye@Jcer9lxZN*M)a?0BGG1qu!Tfh!F=IX=grO~k5Jo{X(M^s9z9wJy|-99{DE%|OG7&y$_PBU zWRI+pH>h#fQ}yoR+^k#i%PIxfO2I$L#g$JQ!Gk~R1nqg0ua=Es+n#F7a+pO&!PFrM zo>=<%;Jw6Fg1=Q2{8dAoz{}QXv08|EH)t!@t>Y?{R` zrbpIneq?FOBTHTSrCjv}`MMD2d26+Tw^j}D)~W@%vz_>|7B~y!d28Xpjpy!z`w;%W z_tqjlQaaqxqkXi%Gs-OcSn=Q^CCB2WS(|YdY^{Mu0ekCb_E*xpxQV}#1|iV;i0=v> zwg@S>DWvKO9=J|Gv)~3FAS6S{1N~@ys(G;gpR^x*Yvf62Tx>!+O;{UCsip%krov@X zRMs>>SM{pT9a4H6rK;S4-nBWyjI@zd=m0}I5)EpKwX14G;24|KTs;N2%2`7Hn!!pU zptaCBIn)-!>&s94@{lnpcr>3q9R)c{tfk4@)>abc)LGC4B*BD4#R=3@-^1lB{baDh z4t2r^I&eY>N)2?Ngc8?xG^IUWCGi+o-0GT>YbQ#__~!v&$%fK1A{&xDypEn}A3eS` z@F-6+Qd`x)J+)&Qe+U=uFa}5rO2)|<$sD}|p_|ZqTZ7!@Xdm1{a%uD2S;8e0UIHTD zFQDiOiNa3>CLCv!8%V$@4zCsnXMO*>nd1=-u5@9hXTa$!6l22i_$=2AR78 zG&tYoTyns~z|ZY9jyZY;R=EemZOp`U6nx8M|2O6&XdvP86u2HiX0;jZ(iKvnz!}LiVLAKJPa6)G3Viu8@zL7 zA{+;KUiaL#ht@(}3T-0bi%2GuJB>gM&)^Hj51v1Y!59Ykg~$rf7a=AUqZAHUcF=&z z*T@jQTlMgh6U87~#oJ|MTVfl&Cz(4(`4#LK9xnconQFPdGw&7196uhf{NKU)W%%N9 zm~~{+QRD==Lir8~e8W;PC%*#z8`$uh5NzOTcMQvkL>bSpN4nY-?qSWU&hR%}VtET;kyNFK)W$?pp4;yYt@W zF~wVLKls5lq{8yCN}cnn?N`k&PG?S362)^!PWT}$VLx@}_JuuwOACvo@mO5G83 zjRSwcdeOT{@NQaqUhr-ey+eX`=+3FT+g7}Xu)KP$hWTDas(r_u{VO$l7R+yzeBy(v zD|QOLooVz3tP{YNla)OQfz$WfiL0QZe<5b4cyooJbbF4!otcR=tCta!Jls~A4I1J;wMI)Q!etYO! 
zLu)kH%%W>zx~zA#Z;#k_ROmap(swM?dptuKUNE!)J0`Xb3vI)xR^aIj&lwtkM-v;i z2n}0O%vQKiqrX36DJt#%+++ngg@=wz9vu*S4z5(7sro+%w z03^`hWrgxq-UY@K#yQx`Eb$4CE2dwN2Q5HeYhZ~V9Ba|3qn41Gfub?VLXI!lS^)?A z6e+(?k)XK^iWym!gkbW&fIzu*NYR&*>q=%q6Bj_40A>%0f%`)n9F-#3TK^Nb`9bLG z3YB(L-e|k$Xh@e=-YC5`a3k~&hEjC>)geqbTpJ{&*}!%-Tzme;tJiid?zvaqlA>Eu z@GSE~vq-)9fu2TP12CS>&M(iy@(aC1 z9d146Et<004E&M3h<7Y2f$kllKWz5)3GG;**ImkbU89WAY4E_p( zpJI@}0QU$#i$N&_pnGV<_(lf~|65q914$e~$dQN>F*p%{Zgg_yy9WJlJURb81Yk>4 zN!h%jwE@J~X{UG9w_Wt@7JR#9(~uNB3dTV;`#KTw{noYLdNQXVk8DS@czD?(_U#q= z_O7`16*~1i!cXXvC-W0hMoBgAyL(C;IW3HwUU9!vILsq_hj19hcL?<8bq%)$Zw`vy zF2UP{4oQdJJXGN6v|8UQxO)o+sOtvjq&j=bTD=aUOy*Ym&33W2Q>g9C?F7A=R=4aI z-1`fgd%K{y*S_F_*0!1(^{I}bRqHNrYf9AwQk!AR#Dz-JKff5Z?q&KuE`QV$>L>5Pd^8Qc*-Qlj4m z_h-R_=K=cEXN@=Ki$L3K1RqA&i1^GG0T$h$CA7l%z@tB65D9YfKzWH8&;CAk7J6+q zfTds-#bg#5*9^F>_=#qrGh)sxbR{F-_;CVx6!PfdDLotn>!G6o*M(4bO!e(-(2oxO z$#ay@=#poGyiT2M_V<|}*X!qTkZbQJ2XY;m$y7rYHZ}I15+PC%%NWs%gGDdR4Muz6 zJ{V%*%3py>#OfDpT7M34sfye%+YRkc{zO`0JBZuo~KnTcJ!v zHZrF9J;+gvSRbO%A8Ew86^M}dIf8iL-V?;2OU`{2a=skbfFK8So7_^-BZ6I*u*tvm1R zy=w$3zSid#ETYvfSpBQkru%468W21I(bEHVhFa&f^d9-pxIp0QAMXK8|f~d zdiUjbUS4r;g)VMQ-<+29w>j>aU|kB9mbQ|&?Qh!UiGOU}&}9)NyaYeI29Z4DK1$cP zPggFje-cA3ZkU1xZE;xYeZzajasg-$jAI}I`2{3p`sfD6Zes9HF!(kEl0$L)CU*>5 zc>E%H2?Ih=X*-nvr;rzitp(du-k`T2u2|MRv_Utvta|}>=P)#tE$iAeF-TgKlIYpm z)ubzGpys;Eie_|H#f)x3L2A&O{E+2F-sn1Y*#MJ2#s4KF0tWIC64kT93p?Q^<>`xP zp@xF2xh7qCWp`esml1Hp`B$j>u5y`+U4-P{oWVw(eH*?PGQAi~`fr{~7~Q;a!Xo8?7abVf_G|HHYH= z4d(t^41SKmzr%o-^$^5FIIBO!y#Ii~T?hi+9DKBhx$j``T?~F7gYRMR2N--G0|EdU z`!**35CcTK#4zpY&5gioWi~+oiF=_J72)ap%16vcP)RX4DqLdOxsorS^dUsWs4s$_W^q$ zFkI65siH|gwWJ^U-1(0%V|bN2;U{a+!~Yp3_hMki0P&A3$^Xw7!&SP#`_zedC1p_>S`2U7M3IlvW`o=v-4HDJ$4-D{%PQ!a5#y$OD z?UHNT18Ja3a!q?MN4D(2Sjj$xj~ve${E+mo4jH|chRhB}fn5)zvvxg@$kb9rC3wlO=SBiqbfwFAH!q1cB){|3KF<79zE@oS?3~yZk-XIZL~Ru#R_!01 z;8Be)4L_o-qHUD*m4xZ(R{qTjl0YXy;{GS?c)Cs$&7I6fp~#IVI=)9);Oc-PYgOb< z&|*Lud^#w|KieTU_aF>tRUvyy*64L4*F;gFR6@$tfB7OVD;C`(6Fnf5e*%NA!4AoZ 
z`weVW91P;W#u1dSBS&l|8|wTe##%6F#%k6f`EK-Ko`!P}{-@&q2!f2sU@&}cr3@`U zqbmP`qJK$M|BPz-8Px&duZ1R~n=QhvPGb=X#m7-tR$F3Bw7u6buQ{Zbz!SDKxpv6#fV{9F6ST>Zb?|00b N4CQ!hI{p%<|1TBf+v)%S literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/quark/__pycache__/utils.cpython-312.pyc b/model_executor/layers/quantization/quark/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5deb13241e7ed7c241d10e1a7a51168d1446b4ec GIT binary patch literal 3631 zcma)9Uu+b|8K2odxA)J!KQ{J(gS}uJ`%Jh?VhCU?OT^%c6ljH^ksKjseLJ=fukS9i zd*HjCD~lk-SCDX|rd$M7oQFz>Xr(+<6}&}xYSfpz+7+j*BT?n0yq#;RhQ74l?B1OX za?_49-^|W@^UXIizwgifv8l<8plHI)`0tw#`j&mvi!}oun_mIBgH)tq4b9I<4Q?o>tMP#sVu;+k@(&KttGn?pp1qX*dDVhXD+;!%aH zgplecn0R)>gz6z2@er^5HN{zDOomhS-Vn!|tTjyyZd-aYNN;u_wMF%TL_gF3aalV% z)gY|$WrT3+Q?_lGGo5D)LX}C4n9?~Vm5L{(!`SrKx<97L`OL*kioh6uGLboNifTM& zykR$Qnhr{gG)=@f`wSKpfbvi`_rS?MLLcEf$f%tWp?u@K+s1JH4$_$73^v$mtK^Zv zq*>)bZ4Y`hkFLNSy<8sD2J-m(W@nEgWbNUquU|q-c#2cG8_qmBh3Zo38jYbBw5g*2 zHMD6r04a9acDAjt#@S~35E_H?=ds-j^XErnje7wJ4yXfAH{S;G3HL*U0cSaMxecKO zyzREQ9A+rU^LAU;j6RYFn|vE*Dzp(HGrT;z2V&r)`O z)Cojq_bbbbi~kyr8gr>=DxNxID0G?_hvw5qj~|;oN}^hPGMdU5Gs#5ss;13F=aMSX zCdfRArHv$wYD$JsJ^DdfNf`00V#I+DKIm*TZNxP_lFFE_p=pvJ^C>!AeGm^xcDHsi)#=U6%sOJ@;DIq|S=Re|s=Lcyp+5v~2k#F3 z$vwUu!|k z|E%QPy9rw&({0`8L_(P(rXxO`NK#@pO{I0BPR!XB8f#bx@Uv&M?5z#Z#1OLxHk3nG zwquLQg`#qGT&q08&;p<1bKF%#-_3D3dql_yr_gON#!aIbcexdKM3A+8KT8rquu`mJ z?_=&(n}8M^3!;h^q@3dvx_l5;It{VLjB?H#{*6%QUvTB5MLy?RkZ$eDL7!Qrj&F4s z4ZBvwoNyPZjvN9i{YtFseH{xZ(X>r2dO@xPa=CZVb)1vNKz|H%LA-{-&hw)tjy(N0 zl=~dr16rAq4Rbwva5R}tsInrf@u?|7Ny3n~V_DW?Gh~kF@>G(_SwW5`Zt;+d)!IRne*%F@0{M_+MwxnrQ@3~jj4fy0|3-|qfv_rS8R z%A=yywbj}_!da*a>G_|7;jg}(_a$$fmS@7C86{A-%n!DMrYXm z_3pPF0%h)M=MeZB%zOk!x#hl>5CnRB1I%HkU=!xD*WK*CM;JnM`PvKtJW&AWmQRr5 zx=iMkn4x843qmpgQvoajh-3>y462rt_>^p;Tn4X54UiW%X5?N<274J?l|-gK52$AT zF=8!yAuO?DT4Cue=zM}3VF^6u%VDW*-nCE^tcN1%g{2UTLKAU)N7RRT+6*kF!`9Y< zz0Q0NL?#<_EG%RX{dc?BA(C8#B|0B&Jr_#C1mgbZ-u;oXYP3hrN}KTwzjPD&Um2QWKmJGGdOS 
z`$2xgv2HD>r#j%?zh@SLgDnR_3QJt zqVpF=!GpVP`(fr|auAM1+gXcR@*1IGfxZcx*0WP)DyEcD49Kbzwd_i+fho$k7fNGl-nckEKvq3Ke#(NnMp-s1PUwm;0Bbbx+gnWBFstKPX&U^K@5Q+Q9*O0~;RX z4plsX+e7)Go5O`mpS}0#dv`CF!8-wi0~o+zy8X*Xe%@8_wLpGiAv26e!+z5jOKKX4 zu}jh;%4CeP15ySfQ)CI0PMNWT6=v9P0A&fODO!GoLV!l5FquqhR)Pe1ltF1b3|f)E zvJmTJ_mQ=qWjv;H2vWtgMusU%X<0s^e+8t "QuarkLinearMethod": + return QuarkLinearMethod(self) + + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.float16, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + return 70 + + def get_name(self) -> QuantizationMethods: + return "quark" + + def apply_vllm_mapper( # noqa: B027 + self, hf_to_vllm_mapper: "WeightsMapper" + ): + """ + Interface for models to update module names referenced in + quantization configs in order to reflect the vllm model structure + + :param hf_to_vllm_mapper: maps from hf model structure (the assumed + structure of the qconfig) to vllm model structure + """ + quant_config_with_hf_to_vllm_mapper = {} + + for k, v in self.quant_config.items(): + if isinstance(v, list): + quant_config_with_hf_to_vllm_mapper[k] = hf_to_vllm_mapper.apply_list(v) + elif isinstance(v, dict): + quant_config_with_hf_to_vllm_mapper[k] = hf_to_vllm_mapper.apply_dict(v) + else: + if isinstance(v, str): + mapped_v_list = hf_to_vllm_mapper.apply_list([v]) + if mapped_v_list: + quant_config_with_hf_to_vllm_mapper[k] = mapped_v_list[0] + else: + quant_config_with_hf_to_vllm_mapper[k] = v + + self.quant_config = quant_config_with_hf_to_vllm_mapper + + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional["QuantizeMethodBase"]: + from vllm.attention.layer import Attention # Avoid circular import + + # Check if the layer is skipped for quantization. 
+ exclude_layers = cast(list[str], self.quant_config.get("exclude")) + if should_ignore_layer( + prefix, ignore=exclude_layers, fused_mapping=self.packed_modules_mapping + ): + return UnquantizedLinearMethod() + if isinstance(layer, LinearBase): + scheme = self.get_scheme(layer=layer, layer_name=prefix) + layer.scheme = scheme + return QuarkLinearMethod(self) + if isinstance(layer, Attention): + return QuarkKVCacheMethod(self) + + if isinstance(layer, FusedMoE): + return QuarkMoEMethod.get_moe_method(self, module=layer, layer_name=prefix) + return None + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "QuarkConfig": + export_config = config.get("export") + if export_config is None: + raise ValueError( + "The export key should be included in " + "the configurations of Quark quantized model" + ) + kv_cache_group = cast(list[str], export_config.get("kv_cache_group")) + pack_method = cast(str, export_config.get("pack_method")) + + # In the export model of quark, the quantization configuration + # of kv_cache is stored in layer_quant_config. First, it is + # judged whether kv_cache_group exists, and then it is judged + # whether layer_quant_config has a quantization configuration + # that matches kv_cache. + if len(kv_cache_group) == 0: + kv_cache_config = None + else: + kv_cache_set = set(kv_cache_group) + layer_quant_config = cast(dict[str, Any], config.get("layer_quant_config")) + layer_quant_names = list(layer_quant_config.keys()) + layer_quant_set = set(layer_quant_names) + + if not ( + kv_cache_set.issubset(layer_quant_set) + or any( + fnmatch.fnmatchcase(layer_quant, pat) + for layer_quant in list(layer_quant_set) + for pat in list(kv_cache_set) + ) + ): + raise ValueError( + "The Quark quantized model has the " + "kv_cache_group parameter setting, " + "but no kv_cache quantization settings " + "were found in the quantization " + "configuration." 
+ ) + + q_configs = [ + quant_cfg + for name, quant_cfg in layer_quant_config.items() + if any(fnmatch.fnmatchcase(name, pattern) for pattern in kv_cache_group) + ] + + if not all( + deep_compare(q_config["output_tensors"], q_configs[0]["output_tensors"]) + for q_config in q_configs + ): + raise ValueError( + "The quantization method used for kv_cache should " + "be the same, but the quantization method for the " + "kv_cache layer in the config is different." + ) + kv_cache_config = q_configs[0].get("output_tensors") + if kv_cache_config is None: + raise ValueError("The kv_cache quantization configuration is empty.") + + # Since we have already set kv_cache quantization configurations, + # we will remove the quantization configuration for the + # output_tensors corresponding to the kv_cache layer. + for q_config in q_configs: + q_config["output_tensors"] = None + + # In case q_proj output is also quantized, remove the configuration + # to keep qkv consistency. + q_proj_q_config = cast(dict[str, Any], layer_quant_config.get("*q_proj")) + if q_proj_q_config is not None: + q_proj_q_config["output_tensors"] = None + + return cls( + quant_config=config, + kv_cache_group=kv_cache_group, + kv_cache_config=kv_cache_config, + pack_method=pack_method, + ) + + @classmethod + def get_config_filenames(cls) -> list[str]: + return [] + + def _check_scheme_supported(self, min_capability: int, error: bool = True) -> bool: + capability_tuple = current_platform.get_device_capability() + + if capability_tuple is not None: + capability = capability_tuple.to_int() + supported = capability >= min_capability + if error and not supported: + raise RuntimeError( + "Quantization scheme is not supported for ", + f"the current GPU. Min capability: {min_capability}. 
", + f"Current capability: {capability}.", + ) + return supported + else: + return False + + def _is_fp8_w8a8( + self, + weight_quant: dict[str, Any] | None, + input_quant: dict[str, Any] | None, + ) -> bool: + # Confirm weights and input quantized. + if weight_quant is None or input_quant is None: + return False + + # Confirm weight scheme is supported + is_fp8_dtype = ( + weight_quant.get("dtype") == "fp8_e4m3" + and input_quant.get("dtype") == "fp8_e4m3" + ) + is_static_weight = not weight_quant.get("is_dynamic") + is_per_tensor_or_channel_weight = weight_quant.get("qscheme") in [ + "per_tensor", + "per_channel", + ] + + if not (is_fp8_dtype and is_static_weight and is_per_tensor_or_channel_weight): + return False + + # Dynamic quantization is always supported if weights supported. + if input_quant.get("is_dynamic"): + return True + + # Confirm activation scheme is supported. + is_per_tensor_activation = input_quant.get("qscheme") == "per_tensor" + return is_per_tensor_activation + + def _is_static_tensor_w8a8( + self, + weight_quant: dict[str, Any] | None, + input_quant: dict[str, Any] | None, + ) -> bool: + # Confirm weights and input quantized. + if weight_quant is None or input_quant is None: + return False + + is_int8_dtype = ( + weight_quant.get("dtype") == "int8" and input_quant.get("dtype") == "int8" + ) + + is_tensor = ( + weight_quant.get("qscheme") in ["per_tensor", "per_channel"] + and input_quant.get("qscheme") == "per_tensor" + ) + + is_static = not weight_quant.get("is_dynamic") and not input_quant.get( + "is_dynamic" + ) + + is_weight_symmetric = weight_quant.get("symmetric") is True + + # Both symmetric and asymmetric input quantization supported. + # Only symmetric weight quantization supported. + return is_int8_dtype and is_tensor and is_weight_symmetric and is_static + + def _is_ocp_mx( + self, + weight_quant: dict[str, Any] | None, + input_quant: dict[str, Any] | None, + ) -> bool: + # Confirm weights and input quantized. 
+ if weight_quant is None or input_quant is None: + logger.debug( + "Quark model is not in OCP MX format: " + "weight_quant or input_quant not set" + ) + return False + + # Input and weight qscheme needs to be per group. + if ( + weight_quant.get("qscheme") != "per_group" + or input_quant.get("qscheme") != "per_group" + ): + logger.debug("Quark model is not in OCP MX format: not per_group") + return False + + # Input and weight group size needs to be 32. + if weight_quant.get("group_size") != 32 or input_quant.get("group_size") != 32: + logger.debug("Quark model is not in OCP MX format: not group_size=32") + return False + + # Activations and weight scales need to be in e8m0 format. + if ( + weight_quant.get("scale_format") != "e8m0" + or input_quant.get("scale_format") != "e8m0" + ): + logger.debug("Quark model is not in OCP MX format: not scale_format e8m0") + return False + + # Input and weight dtypes need to be any of fp4, + # fp6_e3m2 or fp6_e3m2, possibly mixed. + if weight_quant.get("dtype") not in { + "fp4", + "fp6_e3m2", + "fp6_e2m3", + } or input_quant.get("dtype") not in {"fp4", "fp6_e3m2", "fp6_e2m3"}: + logger.debug( + "Quark model is not in OCP MX format: dtype not fp4, fp6_e3m2, fp6_e2m3" + ) + return False + + return True + + def _find_matched_config( + self, layer_name: str, module: torch.nn.Module + ) -> dict[str, Any]: + proj_name = layer_name.split(".")[-1] + if proj_name in self.packed_modules_mapping: + shard_proj_names = self.packed_modules_mapping[proj_name] + + # Convert fused_name --> [shard_names] + shard_names = [ + layer_name.replace(proj_name, shard_proj_name) + for shard_proj_name in shard_proj_names + ] + shard_configs = [ + self._find_matched_config(shard_name, module) + for shard_name in shard_names + ] + if not all( + deep_compare(q_config, shard_configs[0]) for q_config in shard_configs + ): + raise ValueError( + f"Found a different quantization configuration for " + f"{shard_proj_names} in {layer_name}. 
vLLM " + "requires all to use the same scheme." + ) + return shard_configs[0] + else: + layer_quant_config = cast( + dict[str, Any], self.quant_config.get("layer_quant_config") + ) + + def _matches_pattern(layer_name, pattern): + if "*" not in pattern: + return layer_name in pattern + return fnmatch.fnmatch(layer_name, pattern) + + for name_pattern, config in layer_quant_config.items(): + if _matches_pattern(layer_name, name_pattern): + return config + + layer_type = cast(str, type(module)) + layer_type_quant_config = cast( + dict[str, Any], self.quant_config.get("layer_type_quant_config") + ) + if layer_type in layer_type_quant_config: + return layer_type_quant_config[layer_type] + + global_quant_config = cast( + dict[str, Any], self.quant_config.get("global_quant_config") + ) + return global_quant_config + + def _get_scheme_from_config(self, config: dict[str, Any]) -> "QuarkScheme": + if config.get("output_tensors") or config.get("bias"): + raise NotImplementedError( + "Currently, Quark models with output_tensors " + "and bias quantized are not supported" + ) + weight_config = cast(dict[str, Any], config.get("weight")) + input_config = cast(dict[str, Any], config.get("input_tensors")) + + if self._is_fp8_w8a8(weight_config, input_config): + is_fp8_w8a8_supported = self._check_scheme_supported( + QuarkW8A8Fp8.get_min_capability(), error=False + ) + if is_fp8_w8a8_supported: + return QuarkW8A8Fp8(weight_config, input_config) + elif self._is_static_tensor_w8a8(weight_config, input_config): + weight_qscheme = cast(str, weight_config.get("qscheme")) + return QuarkW8A8Int8( + qscheme=weight_qscheme, + is_static_input_scheme=True, + input_symmetric=input_config.get("symmetric"), + ) + elif self._is_ocp_mx(weight_config, input_config): + return QuarkOCP_MX(weight_config, input_config) + + raise NotImplementedError( + "No quark compatible scheme was found. 
" + f"Weight config: {weight_config}, " + f"Input config: {input_config}" + ) + + def get_scheme(self, layer: torch.nn.Module, layer_name: str) -> "QuarkScheme": + layer_quant_config = self._find_matched_config(layer_name, layer) + + # Find the quant_scheme + scheme = self._get_scheme_from_config(layer_quant_config) + # Raise error if device does not support the scheme + # (e.g. fp8 needs ada lovelace) + self._check_scheme_supported(scheme.get_min_capability()) + + return scheme + + def get_cache_scale(self, name: str) -> str | None: + """ + Check whether the param name matches the format for k/v cache scales + in quark. If this is the case, return its equivalent param name + expected by vLLM + + :param name: param name + :return: matching param name for KV cache scale in vLLM + """ + if name.endswith(".output_scale") and ".k_proj" in name: + return name.replace(".k_proj.output_scale", ".attn.k_scale") + if name.endswith(".output_scale") and ".v_proj" in name: + return name.replace(".v_proj.output_scale", ".attn.v_scale") + if name.endswith(".output_scale") and ".q_proj" in name: + return name.replace(".q_proj.output_scale", ".attn.q_scale") + if name.endswith("self_attn.prob_output_scale"): + return name.replace(".prob_output_scale", ".attn.prob_scale") + + # If no matches, return None + return None + + +class QuarkLinearMethod(LinearMethodBase): + def __init__(self, quantization_config: QuarkConfig): + self.quantization_config = quantization_config + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + layer.scheme.process_weights_after_loading(layer) + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + """ + Use the CompressedTensorsScheme associated with each layer to create + the necessary parameters for the layer. 
See LinearMethodBase for param + details + """ + weight_loader = extra_weight_attrs.get("weight_loader") + layer.scheme.create_weights( + layer=layer, + input_size=input_size, + input_size_per_partition=input_size_per_partition, + output_partition_sizes=output_partition_sizes, + output_size=output_size, + params_dtype=params_dtype, + weight_loader=weight_loader, + ) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ): + """ + Use the output of create_weights and the CompressedTensorsScheme + associated with the layer to apply the forward pass with the + layer input. See LinearMethodBase for param details + + """ + scheme = layer.scheme + if scheme is None: + raise ValueError("A scheme must be defined for each layer") + + return scheme.apply_weights(layer, x, bias=bias) + + +class QuarkKVCacheMethod(BaseKVCacheMethod): + """ + Supports loading kv-cache scaling factors from quark checkpoints. + """ + + def __init__(self, quant_config: QuarkConfig): + self.validate_kv_cache_config(quant_config.kv_cache_config) + super().__init__(quant_config) + + @staticmethod + def validate_kv_cache_config(kv_cache_config: dict[str, Any] | None): + """ + Validator for the kv cache configuration. Useful for controlling the + kv cache quantization schemes, that are being supported in vLLM + :param kv_cache_config: the quark kv cache scheme + """ + if kv_cache_config is None: + return + + dtype = kv_cache_config.get("dtype") + if dtype != "fp8_e4m3": + raise NotImplementedError( + "Currently supported kv cache quantization is " + f"dtype=fp8_e4m3, however received {dtype}" + ) + + qscheme = kv_cache_config.get("qscheme") + if qscheme != "per_tensor": + raise NotImplementedError( + "Only support per-tensor scaling factor " + "for quark KV cache. 
" + f"Expected qscheme: per_tensor, found qscheme: {qscheme}" + ) diff --git a/model_executor/layers/quantization/quark/quark_moe.py b/model_executor/layers/quantization/quark/quark_moe.py new file mode 100644 index 0000000..30772c3 --- /dev/null +++ b/model_executor/layers/quantization/quark/quark_moe.py @@ -0,0 +1,683 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable +from typing import Any + +import torch + +import vllm.envs as envs +from vllm import _custom_ops as ops +from vllm._aiter_ops import rocm_aiter_ops +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe import ( + FusedMoE, + FusedMoEConfig, + FusedMoEMethodBase, + FusedMoeWeightScaleSupported, +) +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEQuantConfig, + fp8_w8a8_moe_quant_config, + ocp_mx_moe_quant_config, +) +from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe +from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( + prepare_moe_fp8_layer_for_marlin, +) +from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import ( + OCP_MX_BLOCK_SIZE, + OCP_MX_Scheme, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + all_close_1d, + normalize_e4m3fn_to_e4m3fnuz, + per_tensor_dequantize, +) +from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform +from vllm.scalar_type import scalar_types + +logger = init_logger(__name__) + +__all__ = ["QuarkMoEMethod", "QuarkW8A8Fp8MoEMethod", "QuarkOCP_MX_MoEMethod"] + + +class QuarkMoEMethod(FusedMoEMethodBase): + def __init__(self, moe: FusedMoEConfig): + super().__init__(moe) + + @staticmethod + def get_moe_method( + quant_config: "QuarkConfig", # type: ignore # noqa E501 # noqa F821 + module: torch.nn.Module, + 
layer_name: str, + ) -> "QuarkMoEMethod": + layer_quant_config = quant_config._find_matched_config(layer_name, module) + + if layer_quant_config.get("output_tensors") or layer_quant_config.get("bias"): + raise NotImplementedError( + "Currently, Quark models with " + "output_tensors and bias " + "quantized are not supported" + ) + weight_config = layer_quant_config.get("weight") + input_config = layer_quant_config.get("input_tensors") + + if quant_config._is_fp8_w8a8(weight_config, input_config): + return QuarkW8A8Fp8MoEMethod(weight_config, input_config, module.moe_config) + elif quant_config._is_ocp_mx(weight_config, input_config): + return QuarkOCP_MX_MoEMethod(weight_config, input_config, module.moe_config) + else: + raise RuntimeError("Unsupported FusedMoe scheme") + + +class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): + def __init__( + self, + weight_config: dict[str, Any], + input_config: dict[str, Any], + moe: FusedMoEConfig, + ): + super().__init__(moe) + self.weight_quant = weight_config + self.input_quant = input_config + + self.weight_qscheme = self.weight_quant.get("qscheme") + self.input_qscheme = self.input_quant.get("qscheme") + per_tensor = ( + self.weight_qscheme == "per_tensor" and self.input_qscheme == "per_tensor" + ) + per_channel = ( + self.weight_qscheme == "per_channel" and self.input_qscheme == "per_channel" + ) + self.act_quant_group_shape = ( + GroupShape.PER_TOKEN if per_channel else GroupShape.PER_TENSOR + ) + if not (per_tensor or per_channel): + raise ValueError( + "For FP8 Fused MoE layers, only per-tensor and per-channel " + "scales for weights and activations are supported. Found " + f"{self.weight_qscheme}, {self.input_qscheme}" + ) # noqa E501 + + self.static_input_scales = not self.input_quant.get("is_dynamic") + if self.static_input_scales and per_channel: + raise ValueError( + "For FP8 Fused MoE layer, we require either per tensor or " + "channelwise, dynamic per token quantization." 
+ ) + + # For GPUs that lack FP8 hardware support, we can leverage the Marlin + # kernel for fast weight-only FP8 quantization + self.use_marlin = ( + not current_platform.has_device_capability(89) + or envs.VLLM_TEST_FORCE_FP8_MARLIN + ) + # Disable marlin for rocm + if current_platform.is_rocm(): + self.use_marlin = False + + self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + layer.intermediate_size_per_partition = intermediate_size_per_partition + layer.hidden_size = hidden_size + layer.num_experts = num_experts + layer.orig_dtype = params_dtype + layer.weight_block_size = None + params_dtype = torch.float8_e4m3fn + + # WEIGHTS + w13_weight = torch.nn.Parameter( + torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + w2_weight = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + # WEIGHT_SCALES + if self.weight_qscheme == "per_tensor": + # Allocate 2 scales for w1 and w3 respectively. + # They are combined to a single scale after weight loading. + w13_weight_scale = torch.nn.Parameter( + torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False + ) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + w2_weight_scale = torch.nn.Parameter( + torch.ones(num_experts, dtype=torch.float32), requires_grad=False + ) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + # Add PER-TENSOR quantization for FusedMoE.weight_loader. 
+ extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value} + ) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + elif self.weight_qscheme == "per_channel": + # quark's scale is 1 dim. + w13_weight_scale = torch.nn.Parameter( + torch.ones( + num_experts, + 2 * intermediate_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + w2_weight_scale = torch.nn.Parameter( + torch.ones(num_experts, hidden_size, dtype=torch.float32), + requires_grad=False, + ) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + # Add PER-CHANNEL quantization for FusedMoE.weight_loader. + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value} + ) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + # INPUT_SCALES + if self.static_input_scales: + w13_input_scale = torch.nn.Parameter( + torch.ones(num_experts, dtype=torch.float32), requires_grad=False + ) + layer.register_parameter("w13_input_scale", w13_input_scale) + set_weight_attrs(w13_input_scale, extra_weight_attrs) + + w2_input_scale = torch.nn.Parameter( + torch.ones(num_experts, dtype=torch.float32), requires_grad=False + ) + layer.register_parameter("w2_input_scale", w2_input_scale) + set_weight_attrs(w2_input_scale, extra_weight_attrs) + else: + layer.w13_input_scale = None + layer.w2_input_scale = None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + # Fp8 moe kernels require a single activation scale. + # We take the max of all the scales in case they differ. + if self.static_input_scales: + if layer.w13_input_scale is None or layer.w2_input_scale is None: + raise ValueError( + "QuantConfig has static quantization, but found " + "activation scales are None." 
+ ) + if not all_close_1d(layer.w13_input_scale) or not all_close_1d( + layer.w2_input_scale + ): + logger.warning_once( + "Found input_scales that are not equal for " + "fp8 MoE layer. Using the maximum across experts " + "for each layer. " + ) + layer.w13_input_scale = torch.nn.Parameter( + layer.w13_input_scale.max(), requires_grad=False + ) + layer.w2_input_scale = torch.nn.Parameter( + layer.w2_input_scale.max(), requires_grad=False + ) + + if current_platform.is_fp8_fnuz(): + # Normalize the weights and scales + w13_weight, w13_weight_scale, w13_input_scale = ( + normalize_e4m3fn_to_e4m3fnuz( + layer.w13_weight, layer.w13_weight_scale, layer.w13_input_scale + ) + ) + w2_weight, w2_weight_scale, w2_input_scale = normalize_e4m3fn_to_e4m3fnuz( + layer.w2_weight, layer.w2_weight_scale, layer.w2_input_scale + ) + # Reset the parameter + layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False) + layer.w13_weight_scale = torch.nn.Parameter( + w13_weight_scale, requires_grad=False + ) + if w13_input_scale is not None: + layer.w13_input_scale = torch.nn.Parameter( + w13_input_scale, requires_grad=False + ) + layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) + layer.w2_weight_scale = torch.nn.Parameter( + w2_weight_scale, requires_grad=False + ) + if w2_input_scale is not None: + layer.w2_input_scale = torch.nn.Parameter( + w2_input_scale, requires_grad=False + ) + + # For per-tensor case, Fp8 moe kernel needs single weight scale + # for w13 per expert. Use max then dequant and requant each expert. 
+ if self.weight_qscheme == "per_tensor": + assert layer.w13_weight_scale is not None + shard_size = layer.intermediate_size_per_partition + max_w13_scales = layer.w13_weight_scale.max(dim=1).values + for expert_id in range(layer.local_num_experts): + start = 0 + for shard_id in range(2): + dq_weight = per_tensor_dequantize( + layer.w13_weight[expert_id][start : start + shard_size, :], + layer.w13_weight_scale[expert_id][shard_id], + ) + layer.w13_weight[expert_id][start : start + shard_size, :], _ = ( + ops.scaled_fp8_quant(dq_weight, max_w13_scales[expert_id]) + ) + start += shard_size + + layer.w13_weight_scale = torch.nn.Parameter( + max_w13_scales, requires_grad=False + ) + # quark's scale is 1 dim. + elif self.weight_qscheme == "per_channel": + if self.act_quant_group_shape == GroupShape.PER_TOKEN: + w13_weight_scale = layer.w13_weight_scale.unsqueeze(-1) + layer.w13_weight_scale = torch.nn.Parameter( + w13_weight_scale, requires_grad=False + ) + w2_weight_scale = layer.w2_weight_scale.unsqueeze(-1) + layer.w2_weight_scale = torch.nn.Parameter( + w2_weight_scale, requires_grad=False + ) + # Property to determine if AITER is used + if self.rocm_aiter_moe_enabled: + # reshaping weights is required for aiter moe kernel. + shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( + layer.w13_weight.data, layer.w2_weight.data + ) + + layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) + layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False) + + elif self.use_marlin: + prepare_moe_fp8_layer_for_marlin(layer, False) + # Activations not quantized for marlin. 
+ del layer.w13_input_scale + del layer.w2_input_scale + + def get_fused_moe_quant_config( + self, layer: torch.nn.Module + ) -> FusedMoEQuantConfig | None: + return fp8_w8a8_moe_quant_config( + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + per_act_token_quant=self.input_qscheme == "per_channel", + per_out_ch_quant=self.weight_qscheme == "per_channel", + ) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if enable_eplb: + raise NotImplementedError( + "EPLB not supported for `QuarkW8A8Fp8MoEMethod` yet." 
+ ) + + topk_weights, topk_ids, _ = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, + ) + + if self.rocm_aiter_moe_enabled: + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + rocm_aiter_fused_experts, + ) + + return rocm_aiter_fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + quant_config=self.moe_quant_config, + expert_map=expert_map, + ) + elif self.use_marlin: + assert activation == "silu", f"{activation} not supported for Marlin MoE." 
+ return fused_marlin_moe( + x, + layer.w13_weight, + layer.w2_weight, + None, + None, + layer.w13_weight_scale, + layer.w2_weight_scale, + router_logits, + topk_weights, + topk_ids, + quant_type_id=scalar_types.float8_e4m3fn.id, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=expert_map, + ) + else: + from vllm.model_executor.layers.fused_moe import fused_experts + + return fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=expert_map, + quant_config=self.moe_quant_config, + ) + + +class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): + def __init__( + self, + weight_config: dict[str, Any], + input_config: dict[str, Any], + moe: FusedMoEConfig, + ): + super().__init__(moe) + self.weight_quant = weight_config + self.input_quant = input_config + + weight_qscheme = self.weight_quant.get("qscheme") + input_qscheme = self.input_quant.get("qscheme") + if not (weight_qscheme == "per_group" and input_qscheme == "per_group"): + raise ValueError( + "For MX(FP4) Fused MoE layers, only per-group scales " + "for weights and activations are supported. Found " + f"{weight_qscheme}, {input_qscheme}" + ) # noqa E501 + + self.static_input_scales = not self.input_quant.get("is_dynamic") + + self.weight_dtype = self.weight_quant["dtype"].replace("fp", "mxfp") + self.input_dtype = self.input_quant["dtype"].replace("fp", "mxfp") + self.fp4_dtype = getattr(torch, "float4_e2m1fn_x2", None) + + self.ocp_mx_scheme = OCP_MX_Scheme.from_quant_dtype( + self.input_dtype, self.weight_dtype + ) + + if self.static_input_scales: + raise NotImplementedError( + "QuarkOCP_MX_MoEMethod with static input scales is currently " + "not implemented. Please open an issue." 
+ ) + + self.use_rocm_aiter_moe = rocm_aiter_ops.is_fused_moe_enabled() + + self.emulate = not current_platform.supports_mx() or not ( + self.use_rocm_aiter_moe and self.ocp_mx_scheme == "w_mxfp4_a_mxfp4" + ) + if self.emulate: + logger.warning_once( + f"The current mode (supports_mx={current_platform.supports_mx()}, " + f"use_mxfp4_aiter_moe={self.use_rocm_aiter_moe}, " + f"ocp_mx_scheme={self.ocp_mx_scheme}) " + "does not support native MXFP4/MXFP6 " + "computation. Simulated weight dequantization and activation " + "QDQ (quantize and dequantize) will be used, with the linear " + "layers computed in high precision." + ) + else: + logger.warning_once( + "The current mode supports native MoE MXFP4 computation" + ) + + def get_packed_dim(self, dim: int, quant_dtype: str): + if quant_dtype == "mxfp4": + assert dim % 2 == 0 + return dim // 2 + else: + # FP6 packs 4 * 6 = 24 bits on 3 bytes. + assert (dim * 3) % 4 == 0 + return (dim * 3) // 4 + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + # Add the quantization method used (per tensor/grouped/channel) + # to ensure the weight scales are loaded in properly + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value} + ) + + params_dtype = torch.uint8 + + # WEIGHTS + w13_weight = torch.nn.Parameter( + torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + self.get_packed_dim(hidden_size, self.weight_dtype), + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight", w13_weight) + + set_weight_attrs(w13_weight, extra_weight_attrs) + + w2_weight = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + self.get_packed_dim(intermediate_size_per_partition, self.weight_dtype), + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight", w2_weight) + + 
set_weight_attrs(w2_weight, extra_weight_attrs) + + # WEIGHT_SCALES + w13_weight_scale = torch.nn.Parameter( + torch.ones( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size // OCP_MX_BLOCK_SIZE, + dtype=params_dtype, + ), + requires_grad=False, + ) + w2_weight_scale = torch.nn.Parameter( + torch.ones( + num_experts, + hidden_size, + intermediate_size_per_partition // OCP_MX_BLOCK_SIZE, + dtype=params_dtype, + ), + requires_grad=False, + ) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + + layer.register_parameter("w13_weight_scale", w13_weight_scale) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + + def process_weights_after_loading(self, layer): + if self.emulate: + return + + from aiter.utility.fp4_utils import e8m0_shuffle + + # Pre-shuffle weight scales + s0, s1, _ = layer.w13_weight_scale.shape + w13_weight_scale = layer.w13_weight_scale.view(s0 * s1, -1) + w13_weight_scale = e8m0_shuffle(w13_weight_scale) + layer.w13_weight_scale.data = w13_weight_scale.view(s0, s1, -1) + + s0, s1, _ = layer.w2_weight_scale.shape + w2_weight_scale = layer.w2_weight_scale.view(s0 * s1, -1) + w2_weight_scale = e8m0_shuffle(w2_weight_scale) + layer.w2_weight_scale.data = w2_weight_scale.view(s0, s1, -1) + + if self.fp4_dtype is not None: + layer.w13_weight = torch.nn.Parameter( + layer.w13_weight.view(self.fp4_dtype), + requires_grad=layer.w13_weight.requires_grad, + ) + layer.w2_weight = torch.nn.Parameter( + layer.w2_weight.view(self.fp4_dtype), + requires_grad=layer.w2_weight.requires_grad, + ) + + torch.cuda.empty_cache() + + def get_fused_moe_quant_config( + self, layer: torch.nn.Module + ) -> FusedMoEQuantConfig | None: + return ocp_mx_moe_quant_config( + quant_dtype=self.input_dtype, + weight_dtype=self.weight_dtype, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + a1_scale=None, + a2_scale=None, + block_shape=None, + ) + + @property + def 
allow_inplace(self) -> bool: + return True + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if enable_eplb: + raise NotImplementedError( + "EPLB not supported for `QuarkOCP_MX_MoEMethod` yet." + ) + + topk_weights, topk_ids, _ = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, + ) + + if not self.emulate: + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + rocm_aiter_fused_experts, + ) + + out = rocm_aiter_fused_experts( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + activation=activation, + quant_config=self.moe_quant_config, + ) + else: + from vllm.model_executor.layers.fused_moe import fused_experts + + out = fused_experts( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + 
global_num_experts=global_num_experts, + apply_router_weight_on_input=apply_router_weight_on_input, + expert_map=expert_map, + quant_config=self.moe_quant_config, + ) + return out diff --git a/model_executor/layers/quantization/quark/schemes/__init__.py b/model_executor/layers/quantization/quark/schemes/__init__.py new file mode 100644 index 0000000..7620d6e --- /dev/null +++ b/model_executor/layers/quantization/quark/schemes/__init__.py @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from .quark_ocp_mx import QuarkOCP_MX +from .quark_scheme import QuarkScheme +from .quark_w8a8_fp8 import QuarkW8A8Fp8 +from .quark_w8a8_int8 import QuarkW8A8Int8 + +__all__ = ["QuarkScheme", "QuarkW8A8Fp8", "QuarkW8A8Int8", "QuarkOCP_MX"] diff --git a/model_executor/layers/quantization/quark/schemes/__pycache__/__init__.cpython-312.pyc b/model_executor/layers/quantization/quark/schemes/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..75a62bec0ae21c218d8efa70f388e53c1d96f684 GIT binary patch literal 450 zcmYjNy-ve07_^(DDW#z+F90G)#bbk1At5BB4*ZFsD2o*u(^|C?<2cZ;G4lZIY&;8Z z5UC4HY)BOg6OKzOzTtHD{oGl8^Sma|#q1|KfB<|{Vp+96GVf{f3}O&N1~?2Mtay#p zxD{H4Tg>K8=ooIZI(I`Cf)R+F7Z^5*y6=8!Z{Wjn{UVR#4uR}o-=YF+d8%SvTRes bmjxmGZUDGF16wn&GXuNt&Hke+Jw5ydKGA$P literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/quark/schemes/__pycache__/quark_ocp_mx.cpython-312.pyc b/model_executor/layers/quantization/quark/schemes/__pycache__/quark_ocp_mx.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd5d669b8fa9b3ac8e8eb1a000e890462bc2f001 GIT binary patch literal 12598 zcmd5iTW}lKb-Mr-Z-4*+zC?-y_<|t5L{U$RrY-Abxe_ItvMXcK>_FHh32O1s?ouKd zs8%yhOC_F)Y?G2s#}ld_H9b-r`pfD^r*fxFG}9kKln2NTJy9mgbo2+C@}$j2+jH*j z0w4p~mfMeB5_j+O+;i`q_uYSUx$G1q!SsW8yo;iKjTJprX@!k|k)jqTj^b#6O3)FS z#$B#aRwDH{b-f{8E*bHq&QCc%=hMy#aF2)2j~#xsIF;fNV%D&dT{NRL@?C#oV< 
zq-+s932($p%2uH|;fweZ{)nH{Z9+|=Hd33Yi`0?2U8qkqL>dy?BHKvaAv7kMB29_y zk?o}J6q*w)krpVsIJXc;1S3J3GElsiZ@opUdy2I23|Gasj~Y17+ooIEMk5_I+?iMC zOoSpK0;^iskqAe^r0f%RCb}YBr0f^E6Fre00=Gk-FIr)CjnJFui}WRSMRvi+THZEs zK|j#alVe7WTpjdrr8d39)jy@Z;VJFg_;#)l`bKpKr6pOSc2J7`2RV^902O z-=?@$K6neL9#iDtG}A`$`}qT-hIUG{z*}Xe=tx&2{3i8I!xd`E_$GCQ4!1qV!C_jl z9*GJ<^n$=+`M4O3$?;S&Y*3h3G&asFmUL8<<52;6jfa!d!|*im$;%Qv9r0vbW`)$) z7%##E7sp?kj3#9^F*TarqxjUyrpB$K_Ej3S4@P;<968ILKF_{%>dcWpU`I|yjwvp! zX#}uJ@Gz$SgqWI4zluwa#G(R!HY!FFyv&P=?`$-7k>@U-I(53(2>5zplcLB2f@vWt zkEX=LV`MMf2mlc;s(=-<$jg&rGKR;aCj%7lcsE4I7ARR`ONyGno-IjduPIt=r778@ zcTbr0Qt1g}Je$T?OMQB2g!+IU4l`mkG$@w1#EPj{Ld1be;0@R!``Vc!r`gv>jW;OJ~@%@OUKhaa#S4S<({d@{RaJb|;1q_R)DPJ;>WP`MHRP;;tTnJ^ zZT*}w*gDpoo}9Bk>+JvB2(?F8YtK5{3lwhbtw~b` zo*CwmX~Ap`}Z z92e3euZ;{l6h<140`<1TXoUezj*Tk}pGeEoVjnb#ZP;$Q9OtiyyRaU_EP##lxv*XA zz-lLEA1^N9mmuWF1iwz$61{R3HOPIpmz{{ z(tksCjVgF3m;1(v>nG+8Uw>tm{)MA6UsbnS)$*uu*NXSf(Fcu(=dE+b`H_dUb#umi zZS$P*iIwu!E;!~Ld2jXHYxA{>FU`A`sSqi^gvM+zyqLh&qqk9i zLj(I(>NTJMnWz8GoMB*o1yKM(L=!4Nh^FDk{|$*yW7wcl3zAUSBo4qs+=JO}%t)(w zEA1e~ln6+|7?l?bi$5CFY@9LjOO4cA;o~A@KBzb8{fJ9oxHPqHhN?1`q*4- zPMj51o!yJae{$;GQ_J3W&t#q5D<|(pe?I>4IJ^Uoy#V^ePEMN8UYJoVsWb~}0W2OU zj7}zFiZd}OM`d1OK|4x{4cBJliL{_Nxww>$%CT|wB0n7iB#H@%9@B%3~z$E8R}Nv##wj^Sp3#9i{H-Kc*jJ&x=49G zv8}wOmN^S&2WSp}=6Qn`23%K5-{_D|cxHJUA_+KP%-)5Ao_uu__jDV}&T4H~!964R)fpaR~9 z%4sH0bYI13x}le$SWUgciP2CiLIm|ju|;mn7@$^Ew;7rv)KE$JaYoLht8AEqSdo-5 zeC;_!H7pmE%|s<7Lrr*e=%pt}fJ(|&+a+LBQsEUxa?C~04x*1U!&s*dS1!^hv=-qf zUy@OUkpvWLy=^Pod1)@{B9i1*y;K>?isxzP96Hq7PQ6r)1;DCs4%$APa(ih_ws@Y& zU2+L4)P^KghV)EW6;}Y$s7r@idU7t^sBwaJ=hkO_mzL4Y zCTRi%^_Ha5wv(Eor2rUMPdlIe6pnzX2#hGc(-JT^K^Os87T>)Lbv5a0Krb5aBmorE z9b1yrD?g_<{fSvdxlhuy*3#Z!CMt$4Y2w8#4N(1SimQ6h@T?~x)yOaDuqKY_rP7o0 z=n%o7`&wr!p8c{@m8!n%M8*EEz{Xn{HtfB+GF<5Ky~mXHFiqiK*gve;(BTh{>d zH`J`bne(rr8>vd~0NRNG2~==A79dhPK>R5w5SId)D=$n3k|{ZWS{Z@@Ntx$*17`(3 zD)E6-nokCzNf<0m^1U-}1w;r>fQKgqh?^XcQ-KRSz=Z__a6bbO5CB-o9zHP%;Q?@I z1L*Yzb#Nj-%8NK@T$a<)bA5edfZgPU-dHNp2M&0mk9f8OiUv~!!tru6E=avIYv;y! 
z9qoV~fe3IZKnS-WO{UW+Q4YXC#V_-L)91mB@56jw0G0riMZon2M&blNKv>mn0WH9y z`Qfl*E`Twd%j)v49(^^igPeAJh9{lH@q8FgSP<|Oz&GW()$^7COCg@*qhdgH^R?w+ z!FVz-4nB1NT#8s+f)nbUiB#N_winpgmu{vzaA8sofI*k!1xY(O)Xjt}o}Z~E=xQ2V zYy0~^ROHEdP4n>D;f0R@nW?y8xaa@b`GnhuIjndgy25h&XmnE0wnrQ%#d1jvn25)L zZ^RRjDHf3j;s75`k0Gc`0{s@SOPsQ89c^2rhh9uq}a zjzp^*Iwe4;MX{)y#gjmfYW$ZROdHF_gs3F3tYT$}9S$Y48e4>;h>FR0a*RzSV|>`C zm?T~pEuEQ!_7=Dgs_y{PNnDbh@s|mcUcEH~V}kk1b}gUCHtxwa4rUt% z@0`24Z>{ka?5TgWeP?d_q3rfUc??rEWt~lnT&}q<+uXNdG&I|B%(jA+f=)+M*3qryyeHQ+nC%+Obsf!i9ld+* z-tM)oVen~!ZXC2-gV~sMG%jAb@913bJczK`9(DGtd)ij~x&Fi1{=>QcliB{0_u5y_ zzP8ps0<%M8b{ovzaUd5ulnouab80Pga<%OwVj6hpsLDCo?mODnt2+t?+PindX!N-Y zl+opW;-s3obB+76jr-Rc53bf7giu|*y}0WoOV$~>?>wQSeb9wvixW~qZFm)EExl+ck6pbUA=#o4%>#sQ-FbZ z8Z*@3LzpRcRO$Ozes^M^SRP31evBwTMu9PN;`OA))*yGwjqi0;4` zXt}cC&(Cy$v`#_9ejHdEb@~-Cl^p8^-B>|?hiUNwEFLzBad?OqF~g(X%=_-mYtC$Y zIzB(c$(JCzM&%u~i=q3DVBS}|=$+rc82yXERcFiGAl8HP2WHRYUG=L?2ePgM5B+t} za$@n){PRnP@B2DdogJ&bj=b8sM9&YcI)gy8$RTa{w%(;4Aevpl0tB3bC1O>h4mGfJ z41xrk5}^w`i8JeK0#?69d#m1uP=!Litm#1f%w{CcI z|0gh8LIS)-J*@JoRLT%FKODoiuO}N62rfT_9Nb{Nz2bexMA4-!4#OBR>Xj|2Qf!!` zZySmZ8C~W&QxkeHr3{LA=o{fubR$ckz6LJE9ZWyPxe*`3KEEdHs|Il9D!7;=E^WMO zg5ILrSHd!51ctOSxXHJ)pa6l3#Iz-GHV&L#&V(*C=hR0})NL6CRox9E zkSgMkDF!318ot5#gM{1kj*I8|^{M14BV#;@qzuEkZo74wfgZi%>r*GQ1tX+&lBT~I zt$$4#s1&68vNy5?%i_6#TUIrE97KXk_i4aAi*jNw?19Mh$4F@6FlK4Y@U0RVFQ%aM z7#UVP0~v&Fr?HF~fHQ+Afa|ERTeVsZ$)sW{UZ9f5zhdH|a9POVq!4Bp_D}=8d!Yok z#k>enQ(m#hQb{>JHkq1~L=5kU`ydOORq1^hhx*I!$XM-;LNQ1Q_$3v$-mBi`{a%E4 zdm1j)c(51MaS)>&#Vb1@73Dyi%!JG3;Fk{l1>hom55(#i z+WkRyZs(!w&O`V1uGWRu>Rz2?-f@B@bYt-QgX_W0_v>%g=Yj*-;6UC}lk>D@J*_!U zDC-HWOo9>6x@Utjw7O@Hed25etIFfgRrTGk>U$XK&V>$ULkDxAp=@Yq)zhAD=*cw< zWE%$N&2xsim-3-QxzLN*(2IG0?PBZv%Zt&a>ZR9~`)~f?a&*PJau}?~9n~-mE+%#i z=0ZoZp(A;?p}4ddTzqwjUg}?pk`e!2-88@RFT24e@pfoK5$N6OyRWTLbh9ZJ+?@^Xo^!7U`&Qg{ryc~) zLb;{=CsXfEe7e@seux2w0q;R761I*sRjC7&w3Of~}G?`G`;AO_)qMp4H=j3ro#LMHd zTP(B@#Wx{Sm`MmY93TrQ)y1y_2<72tCn6EL7z2g7(m2jSIVb)pW`Bm+EM~a8V!j9- 
zE!-(JXnR2N-b~R1c_LyfUZlEXA`fv7UHWY{bT{|?CoCxaGi2cKR#P22R(A}owZE`x z4dz?+&N2_IP4J;bb;ml~o;J=m<~`v&h$>GT+?=kq58mN&Luax>XVyGt*J~RWF3w+k zVlh>_K{V9W=hgph&ADye*=^nVZJ{SMj%xd?xlm8p-0FuLOZCh99$0t73iZ3nSJ?5O z`}wTr`HCECovPg^`QjJ5=I$xPUcf$}U^wqSZKo zL6t43(xpx%n*wFpGwhM7bjb)iD~9n)Xy~7#jZh+5mtZ|`@DRU^8M2^iDD1)#3z*TUuIf_j1cBb;yIF0oe><`O)ne$Os&Mbl|vy2uv| z<#xcPt{_l}^ydH%zBX_|7zUL(1RHX`_N=dc>2>naz~0=zvFyOHyO(}`<>M=BzB98& z^Zt5Oku4uy^Y_jkTle`F4$L1|JhrrF*}vxNU3K<8^3^XKoIj{WEe>XV2lE~;7zhi_ zd1uj1AUw7!>)8c`1^2vr!(sMAB*W}-t0Yd&u|^!l?b=xu0@0K3MH8zySq%6Icufs8 z6Bm!kAj4!)MB*2b;1wgtC=q3r;vfMv^siH#IEe!WFv9?tVm?hkir>SENr3O2RDp}E zA^wnzIL9X?h~vG7E$D%&nBiv};xRq@4iqF4oaEb7!SFK8xbwk|rP%Vo()i8hf{`>F zIdRW;Z{(i!H97YYu(hpsDrTsIE>u16)ae@Qf-|D z8H`}tES;yRj__wTFXP?>3hDsL>Af*{eXwA`nw5g3gzLiMxvV=_ zzzG1;UUj4CdeajJ)YJpQvk5<^RK-$ot3bY`lzO8VVj`X_d_$mpUn!B;E7TzjE2mO| zBr?!XVzG)DY#6X%)pv@oO!1LVb$XM@UNRmGqNt$0M|)Yr0z&vPW+oKXB)Fbbkg+T9bExdIxoa@oRIvudylHmunZ4F{H-o6tajWott1vwDhYCN@_n zUsIQ_?#vX3ZMT`uu*Wf-(Iz}gOH^A@;iH1P4Hj$ h;~soreZP~?T>f~^kr$4=K>h0rj^i%l&zv;m{{!+?3}OHP literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/quark/schemes/__pycache__/quark_scheme.cpython-312.pyc b/model_executor/layers/quantization/quark/schemes/__pycache__/quark_scheme.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f06e5bcf6aee016572bf0057f35a94408b23448 GIT binary patch literal 2321 zcmb7FO^6&t6t4c6+1c4Y)+Cz5HB?p+2HY7%B)c!w!3f-M_+KT29%*4XKsMuG6_#D(Bi{;8Xf>_SZrYZX?Z9a|zSFUqu8poB-`zpJ z!>oHwey~~U*|pN2TBzBXbMeyUY@W2F5`?Odsc!7AXEWDQA~rs!T^2H$AIs-TAsp<( zW*0Hkw0zXGeXHp(mz8{*mD{%O?6~&|!{OST0xd)`k}DvW*z~RAh2g|<>*Wcd*!pF# zJ^7sPQ!n^l=}#flF8kBtSekSH?@q|1I&pfwSEs} zmT8j2Lcs;C9yZ0Po0C~tFqMjkS`*Ody)O2_JWRVtjZ|M-HLL%&IjG~m!W0g9#KSa% zpf@>Xn39CFc)(RJ_piwMcMa{|_8c3;c!~gPz3X?{{lNCio zhpCnA^xXTa8|PRf;H^f|171aq%^(OHVeGTOW7~|TDi)1^^q7#1$*i>?HX23lx#ZU{?DO8tB+)g8^Sx_DM64P*D&BJkF{ViwkH-Y|%HIiFrR2a&^ZX5O;$1pXV!M+wm6aN)P=9ZaU@kp2h^ib 
zKT9K%<|KPHm0y(mb)7N%wz|OUk(V5q6`3%~+nfULOy%?XW2smo8yieSECAAX<1_%q zGfa{s;LNX&fm;K#Nr+8uYXGpI-Fg(&KZ*S`fUe`JYvK~D!@;S5#wn1~9XN_c(>sPX z70`>|j)A4bf(2|7dN3~=)2c&t9i1cNx9NR%c}1foVxBF22v4Kbyu7{4%i^7c4+sY4tl z{8}Pn%A_2o?3p0E;`V-R5`ZI3-vOuN#;7f170-*H>YkTXJrCX`pbqS3Jr6h;6q+f| z^J5C1XUr>~|3eC0lFZ?e5_8(A{u9eehJ~3OMOiuLVOGA*A{h%|TJlve+RK~LRMPyF zm}v$y@_U$e&|}*vRR&d5UVK3y3cBxPd^lmT@ z-v;(vY5C!a)4MnB-q`i-dILwxwfYZBdrSHL{H337{&e$~kA6S@>AqGP&*s?RjBx<2 zethr2>iyLR7w%u!N5ebJZH7U$a^{d075arTP>5yNWdIeh#PjmR-qcF{0_C)=e2@eD sp{T((O*@HU8aS3^{Zm2K8;{WJA2TQKG;U8lDZgbcfBX912o~evKSDfLYXATM literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/quark/schemes/__pycache__/quark_w8a8_fp8.cpython-312.pyc b/model_executor/layers/quantization/quark/schemes/__pycache__/quark_w8a8_fp8.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..253bf975844e99134e7ed96784fa78593ca9c92a GIT binary patch literal 7144 zcmcgxTWlNGnV#YB&f!JW#ga&hly!+H+LCPfB3Fu2HI@^{wtP_%yL7tKF=r%E8V=br zqu4Si^a4M~Xo}dt0*vdD`N!FjI#ZHR)3}?*GR!MCA5)$_*ff8tm z%Fs!g#%YUW$uLPKV@+B~o{?-x8V&oS~9K4R!F<&=wzFPdW8}^w;?CG9x;Y?$$iUA zoc#i4;(&&nsMAp6y2v9wBdQ{TjOWr4j}~WA^I}GXtos#| zUCPbO^Eolj=$_NLv2$rzC|wcLbMtDgbl*i0y)Md17VT^f zDh5(Z2#K=F7af94bk14?`)%u^isYnAXc8Py>lU1%XQ91PTo*0C{@MKr zF3~4C$#`zjDR`h~*X}l;3o8a@-GUdSH@PSw_^n3pfy~|pGW~m$lbOQ&8uprX3W3|U zWHaokW)uLay7vY#5|_%#v*|gVP0P6@wUV1oRW>0R-VPzoZ`mz)*;QD4<5MATe0h4o zJXB^8!~*@O()LlMWuD@yk_8LotY&k8f?QQb-tyBkb>pyRtGuP*ILz*GSS+n7yG~L~ zVvx~ zAqz9A%2)T9^)PA!ouIVEY*C9J+v@C4T~jq=wpa+8sak9v6BylZmM&9q`?T)(kwS3h zwC+wToUknOnRH6`_FNMkh`o@DxCJnRP}{ku!>qnx46E8R0{V zT)2UIHaEse1QB(cnnkI3y$Ocq1aX#Mk|0)>bE59b!nq*HxK+0*qBM)Jbx<=I15a1Y zE&{mW?G}UYe;0?F1o95vTsnSYZ1K1_ETyjx=a$v^tUUa-BxQy(7%VyQ zrkGlS;fE!DSwzZk1$kk$pdDn;;;=DSQiT?|8)N(!2f7dCmUSlw%aK+&?pA9(ZVs8n z*rduQ5d4JNwo}ob4@Vz*ivuULffG;T>jS6OyQWsH8_rPaz#$y>l;Q_*+*$JMD|mV~ zhYtVt$kTA~#2M|xnU4;vpSZF*NSZHIUD=!cZ#F0 zYNM~>8t;a4KPVdNFGaeGkufbYR@&cR+&`i1pV+oBA@{2F9sjnI>WmgUMzoHR^^U`< 
z)?fIxV1krxN20ofIR266-gLgL3Aq+~~$3OuEyFY=( zK!N^Utpel%R(aQF9!K=5iiad$o)#;(78P&qN=+k30fUPa{#`<`6H$8FU0H&`}6< zpOl?TgQE_J4BR*3cGQp4HbRSrKs+VggD)dDjOz%QpMpok)Ele7cqE6iDN#{4W1$p|pN0Do zCuMmdEzjK=+=1v{+ZHRY6kwGA+?^B~D0;dyPuJRk=boWWf3)aNX#T{8|KMg<-1W*mhWt1hv4(UHf~UwQFTJ)eALN*!JAhaZIR+CSevUfLHaMFvXI-ePoAi;jY* z9-qcWSI@rJw#`!EzO^@hef5K@|D1ffv=N$GoqFfoX7}L3@n_;j_qo-nj|n6Df<)u;vbQZ!MFj%d-5C&4F|)}zPovYVmaznR15Fo%g}OB>zi z$sE8dXpR)3<4;c&CoX6c7YgGS3z19f!OI2qG7vAeP5*OG|7I-yu&+2ct__YC2Vc?# zUs@lWEL?hXBX$K#iF_LCB{5-XG>P|98#s9Q?vvw@n#N8Sr@*~e(c3Gh4_|`up{&;H zEF}31IK9L{G;dOPdx{_2#kV$O;qZvhm*2a$YgjjO5*-_XT( zJ+?locoZv6-~q2WVl5VId8dicM;yn?q9p)@nT=Hvx&y93)Vx1JrYt+wZwe=h7RrMaeyU!29SV>WU`(Fs@+gf~^(ie7|BcNu3}NN03+<@iGyy@J~j-o(2DI21QV zXK_HPu&*Dt7^S2D(<`B-xF7DWOBup;2p=tkJrCU`W^(GX?wtj2P{%3{YH1zJqYgdpP{!i@yK&?!9H6b??~0PCgO z7jMAbQ_(vrlf#I%6U6EW8+#B6hABd!Mqu2Bd?_UID!c)bw<6_MXa{rdtju94KO|JS z9i;jO3IiwCdtWLzyGzldtF{ei7yvxjzuDAwKXflt3dBo+fz7s1u`RB(#evX*K46r6 z?Irl{d||uE6?CuK%Po}4S9HcSXKbzg;jsGFi+{-cF0;}4dMOkxbz#;G`|g}8vs6oavAJJs?tggTdGoOXd#uoWY%_G&?A=`G z8(;69`2Ex$&j0TGM(btV)y9NtiM9Md|-R@r-(R{bTX zx1wOzlkkS~WIYpg9{PIeA>Zm*xXX6+ynpS%wa2ah8u?9RJ#xGl8P_7?t7jk3tJ5X6 zsmS(fZ136+o}B%d{chPyu`dwn(&?MJ^?I6DQWZ>x)u$-zSm9g9);~h9*G+Q)x7hVz z4r+AYgwiU<)Vcr)SH0M}(uOy^JOg-Mkud{@XF6`zZRGUT?Z7^eVt9C0AqAe_eu#7% zpUX+hjhewR?#Ep)f1@in!2POEFvjNflJ3>L+qq-!?x_10-1#&F;N`JYpyk~k-}!OT z->dn1OTp$+bNl^?dlTS}6+04IN21i$d0)OKm)ax6_PEv_FEzD-Spt1qlq29SQx2D# z7zkh@0TKa(rsFO6nFij+M2^$l90$)nOA=1A9DW;?fnDQ*1X7A`Cw zpzKX$i^UdP8~q;&qAkWO*m54X^_3BQ#5PHP>Th}X=AD~(QOh=50B5ATZ*ArSSJ^?b zPO7KB>>^2kfbjjrdy8cc$+A>?Y|BfM;79buzjS$R?j730yO42N!(gvR8VRZq66PX<$p{bmNNe*~8pXH|Iyeq~m*O~MBkkN= zx^u$V=qC}T2JGM}uR>5}Xqx`YNzr}(MTI}3*iWdg|EA7=LS6im#Y$Vgq#*c{+f04s drsxBodO8a3&Yw88?N=?d`|kKx6po1<{|W&U1GoSH literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/quark/schemes/__pycache__/quark_w8a8_int8.cpython-312.pyc b/model_executor/layers/quantization/quark/schemes/__pycache__/quark_w8a8_int8.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..30759d20465ef44970562c05bc55a75ac993bca2 GIT binary patch literal 5708 zcmbstS!^4}b$7WWm)s>OQD;m_qN2m1ZHZRmLu%x>IaD{gV%IhjFbNhb?n+u}d3bge zTMFF-g<}zS4d*1_C5KfRYviF$;aTNcxdKwo(wF=ttk|az#?1 z(i{VFcHYdJH*em&dB^KKJ>iHt63(d8fb9yCa7A54+o8Ag}<)R7#b2)xh8r30^U9_<0E>B?Z&Ev>>4Q%dt5r zA%$tZ{%Jv#UY(ejyeJ?cA!!oQk32IcBmo%^lbW$qd8~0VB}&Q*k~}-7m3vz*O6ZD| zR8y!LJ!bCAtc?SJ=I#=RVU3W8C@B(A>M9nFZKH%4r6rqWe*_@OA(A8T6)DLs+9an) zFF66STdf&~=(t5kL4cxDa?emAbBo@m!ip}5iytWwt@y-)6+o#ex+S0F#%qeK_{*ZoCKF->sGmPj54w@F}aD(nK zRN#f7r-q(OYD0RUq#XRT5Sy2hqRLN8a&nemP$f}!UQ-QK(VJzJS2aPCW4xS9FKD1* zScRr^UkO}HfchXg7NaYerl5`Wdv^e;H6lZ12!4M!LjhE}69kd50fh05vPj$|r@}Vm z0|?dOsvas(SXz~onFHM_UNQvbKT1SPq$6oL-LDB~R?_-!ESw!4nm;Q=6nQ$5UexAN z$;hjUl87X*BJDYl@cP(;C(NR3>>IxjE5HUSR+G#@edX$+xx{`!zG1Om{82GWZ< z!$T8pCco5NCDH+_KLC8GxO#&qazty#GW{l3=p9+6^GsKvV_=#7gbD7#Frhn;q%ELi zY+oF>7m6V~yuCGIkqlE)UyMWhF18T1n4+o}J%pN|*8Q+uJ|QRhn2;8xWkuE&mqOKS zeI26*W~y+nFoO=F&W~C!DZ3#Jy*ag@;ge1aNRzQdcvW7KRK2xy;sEez36;ln4p4=J z%8S}!TGBn&4LbpbOAsX#a{y_pFwkJ(zxyUwU(gFGLzeYID_KO)2_la7exQitJX{PI ztE5=M5vc_{g6SV9c;ueVvbHy!8eZQh8K-5LZ!#Gs&RPBCFHrQBA*=N&iAFW7n?{L) zUpZJrx;DLylAe>8*BX@L_(TtsLnxILh9oq6bVkvzEi+0j6R5RWqZ!KDcgCi*TD|dh zt5yEqvs-c#9ilU1zsGEp&gG^nL&iJHm{v=4Wn5LeJoFsV4Xw+X_1!tI;Je$RQvL$p zB}p4xyRBW^bZd`VST|XU$gn?X<=PUfy)u{l6`Xl3X|PzURceg*1*sjixcfhRg)O}D zS$C3K3EOlpxc(A&T~?*CJ6<|+RfR|))T4%GQmES5fQPahZr}@S zp%A_d*n5kDCg?8HfrxTKcb9w&v=P3Ny2ErD;QHY<5a<}Li~%?1N;9I&Fl7!J{ane* z7$L7-UxJn-l;YDV@YFbZHscitn)Nyps2Fw%OKDyZ<6(zR#&qv>{u*{@LV{01Jd9zr z3YM#3pUlJbiV6M_xr)AUi|&QsU51#Pzb=c~oQiPHaUh66_&jh(f!F1Egx!MfFoS-b zmXk9n-6n`4!f~{2hnOWcr`x4OT3gg@>OumeKrrV=(yXjP7;mnR1A64bGn_NvhPB5Z zL)h)?CzZDKn@;jQ}F7cBlhaAzWe0UOJcg>kv_?dix=%1s^jr5Z0u};|KuZ z%ZK`*(pxH%is;3QRE7F+h@jdXU6i}c!8_i=1%0e6WiR$9Cvavust}M8ywAZKiP8-{@$JX=C#0` zz)oF5w(;0j-SKSy=-;mXJ^JV9o{jQ*Av$YmEx_-{(LImT?_PGm4qV=CCX{1B>#gsc z$uo}u4p+B2vNH19a}U_QoxtgxrXy=F-FfNvUD@7|tw+y&D1DT;pU5{|DFlLr_T#Xd 
zpm%v<&rKYCip+9dpN7tUeM0$F)Hy+t|7`a%W_HuK45!yjvOx$U9Z2_^KSp${(RHW#~zSgz`HyS zh#vno*PY|K*I(Hj&vQTiyyIjg{;|8q@=d2dcm?o7uzIj!Lbm5@cKEqGH};R_&Vu(u zA<$I_gbRVbLU5qa(Fe!T;$EhI!y2)3476*%U@vb3@PB~@&VJ&hXl)+@8X}~@WjkFF z`;sDA?yns)p!P#(!iz$E)x0H&S{X zCa71TdV{FckWDeqJh8+0Zrigh$MVea0_%f|`he{yaP_7pz`_2ZG0#1@!!_I<%?A4O z+`vPITV+?+ZDugX3~u&(!VH-y5qU$Wr)5E{%5Je0H_jJ7RKWQXarhoA@%?P3JOX(z zmD$flE)xjn4j>RiY#t$ARF?4^9Qd1g zUgPR(E3a+)`f|R$cbYcI%~M;xp=@cZ;bt@ph`O8SAxB*6w@<-yMaXC za8EW>L%NUW;k_t@-v&}i6|m{)R7ycOsL5lb= z^~-O!Zqb89nz2V7);AXI@P%95*|V;_%@&;)Vu&N{>-1ZJT^EKL2}gaAqU`?l$Nx>h zXV+$R6q!q;-TlzlxO!vd#_H=UuNUpO!=!fkZCBB0fDF;qTXY#sH_;MYo4+$(WDO9K z|In_-XvQh0eR%Wq`$O*yy?_3_^F;!{542tUDf(^p;Q|=#7w^2d#^2$KHVlUc-VNRj zn(cE>|LNsFzWnjCxpNndj)yMqYH%e8_SHUY3I>ZK?ALv6j^2L+&ho z{}?a>3q%9DK5X<sR%zl z=xw#;8bOs2z~Df|h+>T3$B1AI9YsIF1BMn bool: + return ( + current_platform.is_rocm() + and envs.VLLM_ROCM_USE_AITER_FP4_ASM_GEMM + and envs.VLLM_ROCM_USE_AITER + ) + + +try: + from aiter.ops.shuffle import shuffle_weight + from aiter.ops.triton.gemm_afp4wfp4 import gemm_afp4wfp4 + from aiter.ops.triton.quant import dynamic_mxfp4_quant + + from vllm.utils.torch_utils import direct_register_custom_op + + if is_rocm_aiter_fp4_asm_gemm_enabled(): + from aiter import gemm_a4w4, per_1x32_f4_quant_hip + + def gemm_with_dynamic_quant( + x: torch.Tensor, + weight: torch.Tensor, + weight_scale: torch.Tensor, + rocm_use_aiter_fp4_asm_gemm: bool = False, + out_dtype: torch.dtype | None = torch.bfloat16, + x_scales: torch.Tensor | None = None, + ) -> torch.Tensor: + M = x.shape[0] + if rocm_use_aiter_fp4_asm_gemm: + if x_scales is None: + # use hip quant kernel for performance + x_q, x_s = per_1x32_f4_quant_hip(x, shuffle=True) + else: + x_q = x + x_s = x_scales + + # 32 alignment is enough for dim0 padding of output for + # gemm_a4w4 kernel + y = torch.empty( + (M + 31) // 32 * 32, weight.shape[0], device=x_q.device, dtype=out_dtype + ) + + gemm_a4w4( + x_q, weight, x_s, weight_scale.view(x_s.dtype), y, 
bpreshuffle=True + ) + return y[:M] + else: + if x_scales is None: + x_q, x_s = dynamic_mxfp4_quant(x) + else: + x_q = x + x_s = x_scales + y = torch.empty( + x_q.shape[0], weight.shape[0], device=x_q.device, dtype=out_dtype + ) + + gemm_afp4wfp4(x_q, weight, x_s, weight_scale.T, out_dtype, y) + return y + + def gemm_with_dynamic_quant_fake( + x: torch.Tensor, + weight: torch.Tensor, + weight_scale: torch.Tensor, + x_scales: torch.Tensor = None, + rocm_use_aiter_fp4_asm_gemm: bool = False, + out_dtype: torch.dtype | None = torch.bfloat16, + ) -> torch.Tensor: + return torch.empty( + (*x.shape[:-1], weight.shape[0]), dtype=out_dtype, device=x.device + ) + + direct_register_custom_op( + op_name="gemm_with_dynamic_quant", + op_func=gemm_with_dynamic_quant, + mutates_args=[], + fake_impl=gemm_with_dynamic_quant_fake, + dispatch_key=current_platform.dispatch_key, + ) +except (ImportError, AttributeError): + dynamic_mxfp4_quant = gemm_afp4wfp4 = None + + +class QuarkOCP_MX(QuarkScheme): + def __init__( + self, weight_quant_spec: dict[str, Any], input_quant_spec: dict[str, Any] + ): + self.out_dtype = torch.get_default_dtype() + self.qscheme = "per_group" + self.weight_quant_spec = weight_quant_spec + self.input_quant_spec = input_quant_spec + + self.weight_dtype = weight_quant_spec["dtype"].replace("fp", "mxfp") + self.input_dtype = input_quant_spec["dtype"].replace("fp", "mxfp") + + self.ocp_mx_scheme = OCP_MX_Scheme.from_quant_dtype( + self.input_dtype, self.weight_dtype + ) + + if self.weight_dtype == "mxfp4": + self.packed_factor: int | Fraction = 2 + self.dequant_func = dequant_mxfp4 + else: + self.packed_factor = Fraction(numerator=8, denominator=6) + self.dequant_func = partial( + dequant_mxfp6, quant_dtype=self.weight_dtype.replace("mx", "") + ) + + if self.input_dtype == "mxfp4": + self.quant_dequant_func = quant_dequant_mxfp4 + else: + self.quant_dequant_func = partial( + quant_dequant_mxfp6, quant_dtype=self.input_dtype.replace("mx", "") + ) + + 
self.static_input_scales = not input_quant_spec.get("is_dynamic") + + if self.static_input_scales: + raise NotImplementedError( + "QuarkOCP_MX with static input scales is currently not " + "implemented. Please open an issue." + ) + + # TODO: integrate (or test) mixed-precision kernel. + self.emulate = not current_platform.supports_mx() or ( + self.input_dtype != "mxfp4" or self.weight_dtype != "mxfp4" + ) + + self.rocm_use_aiter_fp4_asm_gemm = is_rocm_aiter_fp4_asm_gemm_enabled() + + if not self.emulate and (dynamic_mxfp4_quant is None or gemm_afp4wfp4 is None): + # Currently need these kernels if not emulating + raise NotImplementedError( + f"{self.__class__.__name__} requires AITER to be installed " + "for non-emulation mode! Please refer to " + "https://github.com/ROCm/aiter for installation details." + ) + + if not current_platform.supports_mx(): + logger.warning_once( + "The current platform does not support native MXFP4/MXFP6 " + "computation. Simulated weight dequantization and activation " + "QDQ (quantize and dequantize) will be used, with the linear " + "layers computed in high precision." + ) + + if current_platform.supports_mx() and ( + self.input_dtype != "mxfp4" or self.weight_dtype != "mxfp4" + ): + logger.warning_once( + "The current platform supports native MXFP4/MXFP6 " + f"computation, but kernels for input_dtype={self.input_dtype} " + f"and weight_dtype={self.weight_dtype} are not yet integrated " + "in vLLM. Simulated weight dequantization and activation " + "QDQ (quantize and dequantize) will be used, with the linear " + "layers computed in high precision." + ) + + def get_packed_dim(self, dim: int, quant_dtype: str): + if quant_dtype == "mxfp4": + assert dim % 2 == 0 + return dim // 2 + elif quant_dtype in {"mxfp6_e3m2", "mxfp6_e2m3"}: + # FP6 packs 4 * 6 = 24 bits on 3 bytes. 
+ assert (dim * 3) % 4 == 0 + return (dim * 3) // 4 + else: + raise NotImplementedError( + "Unsupported quant_dtype in QuarkOCP_MX.get_packed_dim, " + f"got quant_dtype={quant_dtype}. Something is wrong, please " + "open an issue." + ) + + @classmethod + def get_min_capability(cls) -> int: + return 70 + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False) + + if self.emulate: + layer.weight_scale = torch.nn.Parameter( + layer.weight_scale.data, requires_grad=False + ) + else: + if self.rocm_use_aiter_fp4_asm_gemm: + # shuffle weight scale + weight_scale_shuffle = layer.weight_scale.data + sm, sn = weight_scale_shuffle.shape + weight_scale_shuffle = weight_scale_shuffle.view( + sm // 32, 2, 16, sn // 8, 2, 4, 1 + ) + weight_scale_shuffle = weight_scale_shuffle.permute( + 0, 3, 5, 2, 4, 1, 6 + ).contiguous() + weight_scale_shuffle = weight_scale_shuffle.view(sm, sn) + layer.weight_scale = torch.nn.Parameter( + weight_scale_shuffle, requires_grad=False + ) + + # shuffle weight + weight_shuffle = layer.weight.data + weight_shuffle = shuffle_weight(weight_shuffle, layout=(16, 16)) + layer.weight = torch.nn.Parameter(weight_shuffle, requires_grad=False) + else: + layer.weight_scale = torch.nn.Parameter( + layer.weight_scale.data.T.contiguous(), requires_grad=False + ) + + def create_weights( + self, + layer: torch.nn.Module, + output_partition_sizes: list[int], + input_size_per_partition: int, + params_dtype: torch.dtype, + weight_loader: Callable, + **kwargs, + ): + output_size_per_partition = sum(output_partition_sizes) + layer.logical_widths = output_partition_sizes + + # WEIGHT + weight = PackedvLLMParameter( + data=torch.empty( + output_size_per_partition, + self.get_packed_dim(input_size_per_partition, self.weight_dtype), + dtype=torch.uint8, + ), + input_dim=1, + output_dim=0, + packed_dim=1, + packed_factor=self.packed_factor, + weight_loader=weight_loader, + ) + 
layer.register_parameter("weight", weight) + + # WEIGHT SCALE + weight_scale = GroupQuantScaleParameter( + data=torch.empty( + output_size_per_partition, + input_size_per_partition // OCP_MX_BLOCK_SIZE, + dtype=torch.uint8, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight_scale", weight_scale) + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + if self.emulate: + dq_w = self.dequant_func(layer.weight, layer.weight_scale, x.dtype) + qdq_x = self.quant_dequant_func(x) + return F.linear(qdq_x, dq_w, bias) + else: + return torch.ops.vllm.gemm_with_dynamic_quant( + x, + layer.weight, + layer.weight_scale, + self.rocm_use_aiter_fp4_asm_gemm, + self.out_dtype, + ) diff --git a/model_executor/layers/quantization/quark/schemes/quark_scheme.py b/model_executor/layers/quantization/quark/schemes/quark_scheme.py new file mode 100644 index 0000000..412a07a --- /dev/null +++ b/model_executor/layers/quantization/quark/schemes/quark_scheme.py @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from abc import ABC, abstractmethod + +import torch + +__all__ = ["QuarkScheme"] + + +class QuarkScheme(ABC): + """ + Abstract class used to describe the weight creation and forward pass + of different quantization schemes supported by Quark. + """ + + @classmethod + @abstractmethod + def get_min_capability(cls) -> int: + """ + Get minimum device capability. + """ + raise NotImplementedError + + @abstractmethod + def create_weights(self, *args, **kwargs): + """ + Weight creation for the particular scheme. Inputs to this function + + """ + raise NotImplementedError + + @abstractmethod + def apply_weights( + self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None + ): + """ + Run the forward pass for the particular scheme. 
This is where + scheme-specific dequant/quant steps/kernels should be applied. + + :param layer: torch.nn.Module with the registered weights and + other parameters relevant to the particular scheme. + :param x: input to the layer + :param bias: bias parameter + + """ + raise NotImplementedError + + @abstractmethod + def process_weights_after_loading(self, layer: torch.nn.Module): + """ + Called after weight loading is complete for any cleanup that + needs to occur. + """ + raise NotImplementedError diff --git a/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py b/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py new file mode 100644 index 0000000..1e5ee93 --- /dev/null +++ b/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py @@ -0,0 +1,179 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable +from typing import Any, cast + +import torch +from torch.nn import Parameter + +from vllm.model_executor.layers.quantization.quark.schemes import QuarkScheme +from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + Fp8LinearOp, + normalize_e4m3fn_to_e4m3fnuz, + requantize_with_max_scale, +) +from vllm.model_executor.parameter import ( + ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter, +) +from vllm.platforms import current_platform + +__all__ = ["QuarkW8A8Fp8"] + + +class QuarkW8A8Fp8(QuarkScheme): + def __init__( + self, weight_config: dict[str, Any], input_config: dict[str, Any] | None + ): + self.weight_qscheme = cast(str, weight_config.get("qscheme")) + self.is_static_input_scheme: bool = False + self.input_qscheme: str | None = None + if input_config is not None: + self.is_static_input_scheme = not cast(bool, input_config.get("is_dynamic")) + self.input_qscheme = cast(str, 
input_config.get("qscheme")) + + per_token = ( + not self.is_static_input_scheme and self.input_qscheme == "per_channel" + ) + self.act_quant_group_shape = ( + GroupShape.PER_TOKEN if per_token else GroupShape.PER_TENSOR + ) + self.fp8_linear = Fp8LinearOp( + act_quant_static=self.is_static_input_scheme, + act_quant_group_shape=self.act_quant_group_shape, + ) + self.out_dtype = torch.get_default_dtype() + + @classmethod + def get_min_capability(cls) -> int: + # lovelace and up + return 89 + + def process_weights_after_loading(self, layer) -> None: + # If per tensor, when we have a fused module (e.g. QKV) with per + # tensor scales (thus N scales being passed to the kernel), + # requantize so we can always run per tensor + if self.weight_qscheme == "per_tensor": + if current_platform.is_fp8_fnuz(): + input_scale = getattr(layer, "input_scale", None) + weight, max_w_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=input_scale, + ) + if input_scale is not None: + layer.input_scale = Parameter(input_scale, requires_grad=False) + else: + max_w_scale = layer.weight_scale + weight = layer.weight + + max_w_scale, weight = requantize_with_max_scale( + weight=weight, + weight_scale=max_w_scale, + logical_widths=layer.logical_widths, + ) + + layer.weight = Parameter(weight.t(), requires_grad=False) + layer.weight_scale = Parameter(max_w_scale, requires_grad=False) + + # If channelwise, scales are already lined up, so just transpose. 
+ elif self.weight_qscheme == "per_channel": + weight = layer.weight + + if current_platform.is_fp8_fnuz(): + input_scale = getattr(layer, "input_scale", None) + weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( + weight=weight, + weight_scale=layer.weight_scale, + input_scale=input_scale, + ) + if input_scale is not None: + layer.input_scale = Parameter(input_scale, requires_grad=False) + else: + weight_scale = layer.weight_scale.data + if self.act_quant_group_shape == GroupShape.PER_TOKEN: + weight_scale = weight_scale.view(-1, 1) + layer.weight = Parameter(weight.t(), requires_grad=False) + # required by torch.compile to be torch.nn.Parameter + layer.weight_scale = Parameter(weight_scale, requires_grad=False) + + else: + raise ValueError(f"Unknown quantization scheme {self.weight_qscheme}") + + # INPUT SCALE + if self.is_static_input_scheme: + layer.input_scale = Parameter(layer.input_scale.max(), requires_grad=False) + else: + layer.input_scale = None + + def create_weights( + self, + layer: torch.nn.Module, + output_partition_sizes: list[int], + input_size_per_partition: int, + params_dtype: torch.dtype, + weight_loader: Callable, + **kwargs, + ): + output_size_per_partition = sum(output_partition_sizes) + layer.logical_widths = output_partition_sizes + + # WEIGHT + weight = ModelWeightParameter( + data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight", weight) + + # WEIGHT SCALE + # TODO: update create_xxx_parameter functions to return + # the newly added parameters + if self.weight_qscheme == "per_channel": + weight_scale = ChannelQuantScaleParameter( + data=torch.empty((sum(output_partition_sizes)), dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader, + ) + else: + assert self.weight_qscheme == "per_tensor" + weight_scale = PerTensorScaleParameter( + 
data=torch.empty(len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader, + ) + + # min requirement for fp8 kernels + weight_scale[:] = torch.finfo(torch.float32).min + layer.register_parameter("weight_scale", weight_scale) + + # INPUT SCALE + if self.is_static_input_scheme: + input_scale = PerTensorScaleParameter( + data=torch.empty(len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader, + ) + input_scale[:] = torch.finfo(torch.float32).min + layer.register_parameter("input_scale", input_scale) + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + return self.fp8_linear.apply( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + out_dtype=self.out_dtype, + input_scale=layer.input_scale, + bias=bias, + ) diff --git a/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py b/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py new file mode 100644 index 0000000..42d2ed2 --- /dev/null +++ b/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py @@ -0,0 +1,139 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable + +import torch + +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.kernels.scaled_mm import ( + ScaledMMLinearLayerConfig, + choose_scaled_mm_linear_kernel, +) +from vllm.model_executor.layers.quantization.quark.schemes import QuarkScheme +from vllm.model_executor.parameter import ( + BasevLLMParameter, + ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter, +) + +logger = init_logger(__name__) + + +class QuarkW8A8Int8(QuarkScheme): + _kernel_backends_being_used: set[str] = set() + + def __init__( + self, + qscheme: str, + is_static_input_scheme: bool | None, + input_symmetric: bool | None, + ): + self.qscheme = 
qscheme + self.is_static_input_scheme = is_static_input_scheme + self.input_symmetric = input_symmetric + + @classmethod + def get_min_capability(cls) -> int: + # turing and up + return 75 + + def create_weights( + self, + layer: torch.nn.Module, + output_partition_sizes: list[int], + input_size_per_partition: int, + params_dtype: torch.dtype, + weight_loader: Callable, + **kwargs, + ): + layer.logical_widths = output_partition_sizes + + scaled_mm_linear_kernel_config = ScaledMMLinearLayerConfig( + is_channelwise=(self.qscheme == "per_channel"), + is_static_input_scheme=(self.is_static_input_scheme is True), + input_symmetric=(self.input_symmetric is True), + ) + + kernel_type = choose_scaled_mm_linear_kernel(scaled_mm_linear_kernel_config) + + if kernel_type.__name__ not in self._kernel_backends_being_used: + logger.info("Using %s for QuarkW8A8Int8", kernel_type.__name__) + self._kernel_backends_being_used.add(kernel_type.__name__) + + # WEIGHT + weight = ModelWeightParameter( + data=torch.empty( + sum(output_partition_sizes), input_size_per_partition, dtype=torch.int8 + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + + layer.register_parameter("weight", weight) + + # WEIGHT SCALE + if self.qscheme == "per_channel": + weight_scale = ChannelQuantScaleParameter( + data=torch.empty((sum(output_partition_sizes)), dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader, + ) + ChannelQuantZPParameter = ChannelQuantScaleParameter + weight_zero_point = ChannelQuantZPParameter( + data=torch.empty((sum(output_partition_sizes)), dtype=torch.int8), + output_dim=0, + weight_loader=weight_loader, + ) + else: + assert self.qscheme == "per_tensor" + weight_scale = PerTensorScaleParameter( + data=torch.empty(len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader, + ) + PerTensorZPParameter = PerTensorScaleParameter + weight_zero_point = PerTensorZPParameter( + data=torch.empty(len(output_partition_sizes), 
dtype=torch.int8), + weight_loader=weight_loader, + ) + layer.register_parameter("weight_scale", weight_scale) + layer.register_parameter("weight_zero_point", weight_zero_point) + + # INPUT SCALE + if self.is_static_input_scheme: + input_scale = BasevLLMParameter( + data=torch.empty(1, dtype=torch.float32), weight_loader=weight_loader + ) + layer.register_parameter("input_scale", input_scale) + + input_zero_point = BasevLLMParameter( + data=torch.empty(1, dtype=torch.int8), weight_loader=weight_loader + ) + layer.register_parameter("input_zero_point", input_zero_point) + + self.kernel = kernel_type( + c=scaled_mm_linear_kernel_config, + w_q_param_name="weight", + w_s_param_name="weight_scale", + i_s_param_name="input_scale", + i_zp_param_name="input_zero_point", + azp_adj_param_name="azp_adj", + ) + + # Checkpoints are serialized in quark format, which is + # different from the format the kernel may want. Handle repacking here. + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + layer.register_parameter("weight_zero_point", None) + delattr(layer, "weight_zero_point") + if self.input_symmetric: + layer.register_parameter("input_zero_point", None) + delattr(layer, "input_zero_point") + + self.kernel.process_weights_after_loading(layer) + + def apply_weights( + self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None + ) -> torch.Tensor: + return self.kernel.apply_weights(layer, x, bias) diff --git a/model_executor/layers/quantization/quark/utils.py b/model_executor/layers/quantization/quark/utils.py new file mode 100644 index 0000000..dc82f94 --- /dev/null +++ b/model_executor/layers/quantization/quark/utils.py @@ -0,0 +1,105 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Iterable, Mapping +from types import MappingProxyType +from typing import Any + +import regex as re + + +def deep_compare(dict1: Any, dict2: Any) -> bool: + if 
type(dict1) is not type(dict2): + return False + if isinstance(dict1, dict): + if dict1.keys() != dict2.keys(): + return False + return all(deep_compare(dict1[k], dict2[k]) for k in dict1) + elif isinstance(dict1, list): + return set(dict1) == set(dict2) + else: + return dict1 == dict2 + + +def should_ignore_layer( + layer_name: str | None, + ignore: Iterable[str], + fused_mapping: Mapping[str, list[str]] = MappingProxyType({}), +) -> bool: + if layer_name is None: + return False + + # layer_name = model.layers.0.self_attn.qkv_proj + # proj_name = qkv_proj + proj_name = layer_name.split(".")[-1] + + # Fused layers like gate_up_proj or qkv_proj will not be fused + # in the safetensors checkpoint. So, we convert the name + # from the fused version to unfused + check to make sure that + # each shard of the fused layer has the same scheme. + if proj_name in fused_mapping: + shard_proj_names = fused_mapping[proj_name] + + # Convert fused_name --> [shard_names] + shard_names = [ + layer_name.replace(proj_name, shard_proj_name) + for shard_proj_name in shard_proj_names + ] + + # Layer should be ignored if shards are ignored. + should_ignore_layer = None + for shard_name in shard_names: + should_ignore_shard = check_equal_or_regex_match( + layer_name=shard_name, targets=ignore + ) + + # If shard_idx=0, set layer ignore to match shard. + if should_ignore_layer is None: + should_ignore_layer = should_ignore_shard + + # If shard_idx=1+ confirm scheme matches prior shards. + elif should_ignore_shard != should_ignore_layer: + raise ValueError( + f"Found a different quantization schemes for " + f"{shard_proj_names} in {layer_name}. vLLM " + "requires all to use the same scheme." + ) + + # Unfused layers like down_proj and o_proj will match + # the safetensors checkpoint already. 
+ else: + should_ignore_layer = check_equal_or_regex_match( + layer_name=layer_name, targets=ignore + ) + + assert should_ignore_layer is not None + return should_ignore_layer + + +def check_equal_or_regex_match(layer_name: str, targets: Iterable[str]) -> bool: + """ + Checks whether a layer_name is exactly equal or a regex match for + if target starts with 're:' to any target in list. + """ + return any(_is_equal_or_regex_match(layer_name, target) for target in targets) + + +def _is_equal_or_regex_match( + value: str, target: str, check_contains: bool = False +) -> bool: + """ + Checks whether a value is exactly equal or a regex match for target + if target starts with 're:'. If check_contains is set to True, + additionally checks if the target string is contained within the value. + """ + + if target.startswith("re:"): + pattern = target[3:] + if re.match(pattern, value): + return True + elif check_contains: + if target.lower() in value.lower(): + return True + elif target == value: + return True + return False diff --git a/model_executor/layers/quantization/qutlass_utils.py b/model_executor/layers/quantization/qutlass_utils.py new file mode 100644 index 0000000..555bb50 --- /dev/null +++ b/model_executor/layers/quantization/qutlass_utils.py @@ -0,0 +1,185 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# Modified by Roberto L. Castro (Roberto.LopezCastro@ist.ac.at). +# +# Copied from https://github.com/pytorch/ao/tree/main/torchao/prototype/mx_formats +# +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Literal + +import torch +from torch.library import wrap_triton + +from vllm.triton_utils import tl, triton + + +@triton.jit +def triton_scale_swizzle( + scale_ptr: torch.Tensor, + scale_rows: int, + scale_cols: int, + output_ptr: torch.Tensor, + input_row_stride: int, + output_block_stride: int, + BLOCK_ROWS: tl.constexpr, + BLOCK_COLS: tl.constexpr, +): + """ + Rearranges tensor data from row-major to block-scaled swizzle format. + + Args: + scale_ptr: Pointer to the input scale tensor + scale_rows: Number of rows in the scale tensor + scale_cols: Number of columns in the scale tensor + output_ptr: Pointer to the output tensor + input_row_stride: Stride between rows in the input tensor + output_block_stride: Stride between blocks in the output tensor + BLOCK_ROWS: Number of rows in a tile (compile-time constant) + BLOCK_COLS: Number of columns in a tile (compile-time constant) + """ + pid_row = tl.program_id(0) + pid_col = tl.program_id(1) + + rows = tl.arange(0, BLOCK_ROWS)[:, None] + cols = tl.arange(0, BLOCK_COLS)[None, :] + + # Calculate starting row and column for this tile + start_row = pid_row * BLOCK_ROWS + start_col = pid_col * BLOCK_COLS + global_rows = start_row + rows + global_cols = start_col + cols + + mask = (global_rows < scale_rows) & (global_cols < scale_cols) + + input_scales = tl.load( + scale_ptr + global_rows * input_row_stride + global_cols, + mask=mask, + other=0.0, + ) + + r_div_32 = rows // 32 + r_mod_32 = rows % 32 + + # 2) Rearrange to (32, 4, 4) then to final (32, 16) coordinates + dest_indices = r_mod_32 * 16 + r_div_32 * 4 + cols + + # Flatten + dest_indices_flat = tl.reshape(dest_indices, (BLOCK_ROWS * BLOCK_COLS)) + scales_flat = tl.reshape(input_scales, (BLOCK_ROWS * BLOCK_COLS)) + + # Calculate block offset using provided output block stride + LOCAL_NUMEL = BLOCK_ROWS * BLOCK_COLS + block_offset = pid_col * LOCAL_NUMEL + (pid_row * output_block_stride) + + tl.store( + output_ptr + block_offset + 
def ceil_div(a, b):
    """Return ``a`` divided by ``b``, rounded up, using integer arithmetic."""
    return (a + b - 1) // b


def to_blocked(
    input_matrix: torch.Tensor, backend: Literal["torch", "triton"] = "triton"
) -> torch.Tensor:
    """
    Rearrange a 2-D scale matrix into the block-scaled swizzle layout.

    The matrix is split into tiles of 128 rows by 4 columns; inside each
    tile the element at (row, col) is moved to offset
    ``(row % 32) * 16 + (row // 32) * 4 + col``.

    See:
        https://docs.nvidia.com/cuda/cublas/index.html#d-block-scaling-factors-layout

    Args:
        input_matrix: Input tensor of shape (H, W)
        backend: "torch" (PyTorch path) or "triton" (Triton kernel)

    Returns:
        A flattened (1-D) tensor holding the rearranged data, i.e. the
        (32*ceil_div(H,128), 16*ceil_div(W,4)) layout laid out contiguously.
        Its length is 128*ceil_div(H,128) * 4*ceil_div(W,4).

    Raises:
        ValueError: if ``backend`` is neither "torch" nor "triton".
    """
    if backend == "triton":
        return triton_mx_block_rearrange(input_matrix).flatten()
    elif backend != "torch":
        raise ValueError(f'backend must be "torch" or "triton", got {backend!r}')

    rows, cols = input_matrix.shape
    n_row_blocks = ceil_div(rows, 128)
    n_col_blocks = ceil_div(cols, 4)

    # Calculate the padded shape
    padded_rows = n_row_blocks * 128
    padded_cols = n_col_blocks * 4

    padded = input_matrix
    if (rows, cols) != (padded_rows, padded_cols):
        # Zero-fill up to whole 128x4 tiles. The Triton backend does the same
        # through its masked load (other=0.0), so both backends accept
        # unaligned shapes instead of only the Triton one.
        padded = input_matrix.new_zeros((padded_rows, padded_cols))
        padded[:rows, :cols] = input_matrix

    # Rearrange the blocks
    blocks = padded.view(n_row_blocks, 128, n_col_blocks, 4).permute(0, 2, 1, 3)
    rearranged = blocks.reshape(-1, 4, 32, 4).transpose(1, 2).reshape(-1, 32, 16)

    return rearranged.flatten()
class RTNConfig(QuantizationConfig):
    """Config class for RTN (round-to-nearest) weight quantization.

    Stores the target weight precision and the quantization group size and
    derives the scalar type that is passed on to the Marlin kernels.
    """

    def __init__(
        self,
        weight_bits: int = int(NUM_BITS),
        group_size: int = int(GROUP_SIZE),
    ) -> None:
        """Create the config.

        Args:
            weight_bits: Target precision; only 4 or 8 is accepted. Defaults
                to the value of the RTN_NUM_BITS environment variable.
            group_size: Number of parameters per quantization group; -1 is
                treated by the rest of this module as "one group per row".
                Defaults to the value of the RTN_GROUP_SIZE environment
                variable.

        Raises:
            ValueError: if ``weight_bits`` is neither 4 nor 8.
        """
        # Initialise base-class state first. Other QuantizationConfig
        # subclasses in this code base (e.g. TorchAOConfig) call
        # super().__init__() with no arguments; this class previously
        # skipped it.
        # NOTE(review): confirm the base initializer takes no arguments.
        super().__init__()
        self.weight_bits = weight_bits
        self.group_size = group_size

        if self.weight_bits not in (4, 8):
            raise ValueError(
                "Currently, only 4-bit or 8-bit weight quantization is "
                f"supported for RTN, but got {self.weight_bits} bits."
            )

        self.quant_type = (
            scalar_types.uint8b128 if self.weight_bits == 8 else scalar_types.uint4b8
        )

    def __repr__(self) -> str:
        return (
            f"RTNConfig(weight_bits={self.weight_bits}, group_size={self.group_size})"
        )

    @classmethod
    def get_name(cls) -> QuantizationMethods:
        """Return the registry name of this quantization method."""
        return "rtn"

    @classmethod
    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
        """Return the activation dtypes this method accepts."""
        return [torch.bfloat16, torch.half]

    @classmethod
    def get_min_capability(cls) -> int:
        """Return the minimum required capability value (80)."""
        return 80

    @classmethod
    def get_config_filenames(cls) -> list[str]:
        """Return the extra config files to look for; RTN needs none."""
        return []

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "RTNConfig":
        """Build the config from a dict providing ``bits`` and ``group_size``."""
        weight_bits = cls.get_from_keys(config, ["bits"])
        group_size = cls.get_from_keys(config, ["group_size"])
        return cls(weight_bits, group_size)

    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> Optional["QuantizeMethodBase"]:
        """Pick the RTN method for ``layer``.

        Linear layers get ``RTNLinearMethod``, fused MoE layers get
        ``RTNMoEMethod``; every other layer type returns ``None``.
        """
        if isinstance(layer, LinearBase):
            return RTNLinearMethod(self)
        elif isinstance(layer, FusedMoE):
            return RTNMoEMethod(self, layer.moe_config)
        return None
class RTNParameter(Parameter):
    """A wrapper over Parameter that returns RTNTensor (a wrapper over Tensor)
    when its data is accessed. We need this wrapper for the data loading phase
    only, so we can intercept a weight copying function (torch.Tensor.copy_)
    and apply quantization on-the-fly.
    """

    def __new__(cls, data: torch.Tensor, **kwargs):
        # Extra keyword arguments (scale, quant_config) are accepted so that
        # construction with the __init__ signature works, but they are not
        # forwarded; the parameter is always created with requires_grad=False.
        return super().__new__(cls, data=data, requires_grad=False)

    def __init__(
        self, data: torch.Tensor, scale: torch.Tensor, quant_config: RTNConfig
    ) -> None:
        # `data` is consumed by __new__; only the companion scale tensor and
        # the quantization settings are stored on the instance here.
        self.scale = scale
        self.quant_config = quant_config

    @property
    def data(self):
        # Hand out an RTNTensor built from the underlying storage, the scale
        # and the config, so that a later copy_ on it goes through
        # RTNTensor.copy_ (which quantizes) rather than a plain tensor copy.
        return RTNTensor(super().data, self.scale, self.quant_config)
+ """ + + def __init__(self, quant_config: RTNConfig): + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + output_size_per_partition = sum(output_partition_sizes) + num_groups_per_col = ( + input_size_per_partition // self.quant_config.group_size + if self.quant_config.group_size != -1 + else 1 + ) + + scale = Parameter( + torch.empty( + output_size_per_partition, num_groups_per_col, dtype=params_dtype + ), + requires_grad=False, + ) + factor = 1 if self.quant_config.weight_bits == 8 else 2 + + weight = RTNParameter( + data=torch.empty( + output_size_per_partition // factor, + input_size_per_partition, + dtype=torch.uint8, + ), + scale=scale, + quant_config=self.quant_config, + ) + + layer.register_parameter("weight", weight) + set_weight_attrs( + weight, + { + **extra_weight_attrs, + "input_dim": 1, + "output_dim": 0, + }, + ) + + layer.register_parameter("scale", scale) + layer.output_size_per_partition = output_size_per_partition + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + """Repack weights and scales for Marlin kernels.""" + weight_bits = self.quant_config.weight_bits + + weight, scale = repack_weights(layer.weight, layer.scale, weight_bits) + + replace_parameter(layer, "weight", weight) + replace_parameter(layer, "scale", scale) + + init_workspace(layer.weight.device) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + return apply_rtn_marlin_linear( + input=x, + weight=layer.weight, + weight_scale=layer.scale, + workspace=workspace, + quant_type=self.quant_config.quant_type, + output_size_per_partition=layer.output_size_per_partition, + input_size_per_partition=layer.input_size_per_partition, + bias=bias, + ) + + +class 
RTNMoEMethod(FusedMoEMethodBase): + def __init__(self, quant_config: RTNConfig, moe: FusedMoEConfig): + super().__init__(moe) + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + factor = 1 if self.quant_config.weight_bits == 8 else 2 + + # Fused gate_up_proj (column parallel) + num_groups_per_col = ( + hidden_size // self.quant_config.group_size + if self.quant_config.group_size != -1 + else 1 + ) + w13_scale = Parameter( + torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + num_groups_per_col, + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w13_scale", w13_scale) + + w13_weight = RTNParameter( + data=torch.empty( + num_experts, + 2 * intermediate_size_per_partition // factor, + hidden_size, + dtype=torch.uint8, + ), + scale=w13_scale, + quant_config=self.quant_config, + ) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + # down_proj (row parallel) + num_groups_per_col = ( + intermediate_size_per_partition // self.quant_config.group_size + if self.quant_config.group_size != -1 + else 1 + ) + w2_scale = Parameter( + torch.zeros( + num_experts, hidden_size, num_groups_per_col, dtype=params_dtype + ), + requires_grad=False, + ) + layer.register_parameter("w2_scale", w2_scale) + + w2_weight = RTNParameter( + data=torch.empty( + num_experts, + hidden_size // factor, + intermediate_size_per_partition, + dtype=torch.uint8, + ), + scale=w2_scale, + quant_config=self.quant_config, + ) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + """Repack weights and scales for Marlin kernels.""" + weight_bits = self.quant_config.weight_bits + + w13_weight, w13_scale = 
repack_weights( + layer.w13_weight, layer.w13_scale, weight_bits + ) + replace_parameter(layer, "w13_weight", w13_weight) + replace_parameter(layer, "w13_scale", w13_scale) + + w2_weight, w2_scale = repack_weights( + layer.w2_weight, layer.w2_scale, weight_bits + ) + replace_parameter(layer, "w2_weight", w2_weight) + replace_parameter(layer, "w2_scale", w2_scale) + + init_workspace(layer.w13_weight.device) + + def get_fused_moe_quant_config( + self, layer: torch.nn.Module + ) -> FusedMoEQuantConfig | None: + return None + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if enable_eplb: + raise NotImplementedError("EPLB not supported for `RTNMoEMethod` yet.") + + topk_weights, topk_ids, _ = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, + ) + + return fused_marlin_moe( + x, + layer.w13_weight, + layer.w2_weight, + getattr(layer, "w13_bias", 
def rtn_quantize(
    tensor: torch.Tensor, num_bits: int, group_size: int
) -> tuple[torch.Tensor, torch.Tensor]:
    """Quantize a tensor using per-group static scaling factor.

    Args:
        tensor: The input tensor, either 2-D or 3-D (leading batch
            dimension).
        num_bits: Target precision for the result (supported values are
            8 or 4).
        group_size: Quantization granularity.
            If equal to -1, each row in the input tensor is treated
            as one group.

    Returns:
        A ``(quantized, scale)`` pair. ``quantized`` is a uint8 tensor; for
        4-bit precision two values are packed per byte, which halves the
        second-to-last dimension. ``scale`` holds one factor per group. A
        group whose values are all zero gets scale 0 and quantizes to the
        mid-range code, so it dequantizes back to exactly zero.
    """
    batch_present = len(tensor.shape) == 3
    if not batch_present:
        tensor = tensor.unsqueeze(0)

    q_range = 2**num_bits
    num_groups = (
        tensor.shape[1] * tensor.shape[2] // group_size
        if group_size != -1
        else tensor.shape[1]
    )

    # Calculate a scaling factor per input group.
    input_flat = tensor.reshape(tensor.shape[0], num_groups, -1)
    input_min = torch.min(input_flat, dim=2, keepdim=True)[0]
    input_max = torch.max(input_flat, dim=2, keepdim=True)[0]
    input_max_abs = torch.max(input_min.abs(), input_max.abs())
    scale = input_max_abs * 2.0 / (q_range - 1)

    # An all-zero group has scale 0; dividing by it would produce NaN and the
    # following cast to uint8 would be undefined. Divide by 1 for such groups
    # instead - the returned scale itself is left untouched.
    safe_scale = torch.where(scale == 0, torch.ones_like(scale), scale)

    # Scale each input group, round to the nearest integer, shift
    # the range and truncate.
    scaled_input = input_flat / safe_scale
    scaled_input = scaled_input.round()
    scaled_input += q_range // 2
    scaled_input = scaled_input.clamp(0, q_range - 1)

    scale = scale.reshape(tensor.shape[0], tensor.shape[1], -1).contiguous()
    inputs_q = scaled_input.reshape(tensor.shape).to(torch.uint8)
    inputs_q = inputs_q.contiguous()

    if num_bits == 4:
        # Pack two 4-bit values into each byte: odd-indexed elements go to
        # the high nibble, even-indexed elements to the low nibble.
        inputs_q = (inputs_q[:, :, 1::2] << 4) | (inputs_q[:, :, ::2] & 0xF)
        inputs_q = inputs_q.reshape(
            tensor.shape[0], tensor.shape[1] // 2, tensor.shape[2]
        )
        inputs_q = inputs_q.contiguous()

    if not batch_present:
        inputs_q = inputs_q.squeeze(0)
        scale = scale.squeeze(0)

    return inputs_q, scale
def _get_perms():
    """Build the index permutations consumed by ``pack_for_marlin``.

    Returns:
        A tuple ``(perm_tensor, scale_perm, scale_perm_single)``:

        * ``perm_tensor`` - 1-D torch tensor of 1024 indices
          (32 outer steps x 8 entries x 4 offsets of 256), used to reorder
          weight elements.
        * ``scale_perm`` - list of 64 indices, used for scales when the
          weights are quantized in several groups per column.
        * ``scale_perm_single`` - list of 32 indices, used for scales when
          there is a single group.
    """
    perm = []
    for i in range(32):
        # Eight base indices for this step: four rows (two adjacent pairs,
        # the second pair 8 rows further down) in each of two column blocks.
        perm1 = []
        col = i // 4
        for block in [0, 1]:
            for row in [
                2 * (i % 4),
                2 * (i % 4) + 1,
                2 * (i % 4 + 4),
                2 * (i % 4 + 4) + 1,
            ]:
                perm1.append(16 * row + col + 8 * block)
        # Repeat the eight indices at four offsets that are 256 apart.
        for j in range(4):
            perm.extend([p + 256 * j for p in perm1])

    perm_arr = np.array(perm)
    # Within every run of eight entries, place the even positions first and
    # the odd positions after them.
    interleave = np.array([0, 2, 4, 6, 1, 3, 5, 7])
    perm_arr = perm_arr.reshape((-1, 8))[:, interleave].ravel()
    perm_tensor = torch.from_numpy(perm_arr)
    # i, i + 8, ..., i + 56 for i = 0..7: an 8x8 index grid read in
    # transposed order.
    scale_perm = []
    for i in range(8):
        scale_perm.extend([i + 8 * j for j in range(8)])
    # For i = 0..3: 2*i plus each of the offsets 0, 1, 8, 9, 16, 17, 24, 25.
    scale_perm_single = []
    for i in range(4):
        scale_perm_single.extend([2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
    return perm_tensor, scale_perm, scale_perm_single
def repack_8bit_into_32bit(input):
    """Pack every four consecutive 8-bit values of the last axis into an int32.

    Element ``4*k + lane`` of the last axis ends up in bits
    ``8*lane .. 8*lane + 7`` of output element ``k`` (little-endian order).

    Args:
        input: 3-D integer tensor; only the low 8 bits of each element
            are used.

    Returns:
        int32 tensor on the same device, with the last axis four times
        shorter than the input's.
    """
    batch, rows, cols = input.shape[0], input.shape[1], input.shape[2]
    packed = torch.zeros(
        (batch, rows, cols // 4), dtype=torch.int32, device=input.device
    )
    for lane, shift in enumerate((0, 8, 16, 24)):
        # Mask before widening so only the low byte contributes.
        byte_lane = (input[:, :, lane::4] & 0xFF).to(torch.int32)
        packed |= byte_lane << shift

    return packed
def init_workspace(device):
    """Create the module-level Marlin workspace on first use.

    The workspace is built once, for the ``device`` of the first call, and
    kept in the global ``workspace``; later calls leave it untouched.
    """
    global workspace
    if workspace is not None:
        return
    workspace = marlin_make_workspace_new(device, 4)
class KVCacheQuantSchema(BaseModel):
    """Schema for the KV-cache scaling factors shipped next to a model.

    All validators run after field validation. The two context-dependent
    ones do nothing when no validation context is supplied.
    """

    dtype: str
    # Each key is a TP rank. Each value is a dictionary mapping a TP rank's
    # layer indices to their per-tensor KV cache scaling factor.
    # TODO: Consider pulling this and its validation methods out into its
    # own schema class (tricky as its members are variable)
    scaling_factor: dict[int, dict[int, float]]

    @model_validator(mode="after")
    def check_is_fp8(self) -> "KVCacheQuantSchema":
        """Require the scaling factors to target ``float8_e4m3fn``."""
        assert self.dtype == "float8_e4m3fn", (
            "Loaded scaling factors intended for KV cache dtype = "
            f"{self.dtype} rather than float8_e4m3fn!"
        )
        return self

    @model_validator(mode="after")
    def check_tp_ranks(self, info: ValidationInfo) -> "KVCacheQuantSchema":
        """Check the map covers every TP rank and every layer of each rank."""
        context = info.context
        if not context:
            return self

        tp_size = context["tp_size"]
        num_hidden_layers = context["num_hidden_layers"]

        # One entry per tensor-parallel rank ...
        assert len(self.scaling_factor) == tp_size, (
            f"Loaded dictionary has TP size {len(self.scaling_factor)} "
            f"but LLM engine is currently running with TP size {tp_size}."
        )
        # ... each holding one scale per hidden layer ...
        for tp_rank, layer_maps in self.scaling_factor.items():
            assert len(layer_maps) == num_hidden_layers, (
                f"KV cache scales map for TP rank {tp_rank} is malformed. "
                f"Expected {num_hidden_layers} layers, got "
                f"{len(layer_maps)}."
            )
        # ... and the rank keys must be exactly 0 .. tp_size - 1.
        for i in range(tp_size):
            assert i in self.scaling_factor, (
                f"KV cache scales map for TP rank {i} not found."
            )
        return self

    @model_validator(mode="after")
    def check_current_rank(self, info: ValidationInfo) -> "KVCacheQuantSchema":
        """Check the current rank has a scale for every layer index."""
        context = info.context
        if not context:
            return self

        tp_rank = context["tp_rank"]
        num_hidden_layers = context["num_hidden_layers"]
        layer_scales_map = self.scaling_factor[tp_rank]
        for i in range(num_hidden_layers):
            assert i in layer_scales_map, (
                f"Could not find KV cache scales for layer {i} in "
                f"TP rank {tp_rank}."
            )
        return self
def torchao_version_at_least(torchao_version: str) -> bool:
    """Report whether torchao is installed at ``torchao_version`` or newer.

    Args:
        torchao_version: Minimum required version, e.g. ``"0.15.0"``.

    Returns:
        True only when torchao can be found and its installed version
        compares greater than or equal to ``torchao_version``. False when
        torchao is missing, its distribution metadata cannot be read, or a
        version string cannot be parsed.
    """
    # `import importlib` alone does not guarantee that the `metadata`
    # submodule is loaded, so import it explicitly rather than relying on
    # some other module having imported it first.
    from importlib import metadata as importlib_metadata

    if not find_spec("torchao"):
        return False

    try:
        installed = version.parse(importlib_metadata.version("torchao"))
        return installed >= version.parse(torchao_version)
    except (ImportError, version.InvalidVersion):
        # PackageNotFoundError is an ImportError subclass, so a missing
        # distribution is covered here too.
        return False
class TorchAOConfig(QuantizationConfig):
    """Config class for torchao."""

    def __init__(
        self,
        torchao_config,
        skip_modules: list[str] | None = None,
        is_checkpoint_torchao_serialized: bool = False,
    ) -> None:
        """Create the config.

        Args:
            torchao_config: The torchao configuration object to apply.
            skip_modules: Module names (or name fragments, see
                ``should_skip``) that must stay unquantized.
            is_checkpoint_torchao_serialized: True when the checkpoint was
                already quantized and serialized by torchao.
        """
        super().__init__()
        self.torchao_config = torchao_config
        self.skip_modules = skip_modules or []
        self.is_checkpoint_torchao_serialized = is_checkpoint_torchao_serialized

    def __repr__(self) -> str:
        return (
            f"TorchAOConfig({self.torchao_config=}, {self.skip_modules=}, "
            f"{self.is_checkpoint_torchao_serialized=})"
        )

    def get_name(self) -> QuantizationMethods:
        """Return the registry name of this quantization method."""
        return "torchao"

    def get_supported_act_dtypes(self) -> list[torch.dtype]:
        """Return the activation dtypes this method accepts."""
        return [torch.float32, torch.float16, torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        """Return the minimum required capability value (75)."""
        return 75

    @staticmethod
    def get_config_filenames() -> list[str]:
        """torchao doesn't require additional config files, we use
        `config.json` from huggingface: `model_config.hf_config`
        """
        return []

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "TorchAOConfig":
        """Create the quant config from an hf model config.

        ``config`` must contain ``quant_type`` as a dict with the single key
        ``"default"``. Modules listed under ``modules_to_not_convert``, and
        modules mapped to ``None`` in ``_data.module_fqn_to_config`` of the
        default quant type, are skipped. ``config`` itself is not modified.

        Raises:
            ImportError: if torchao cannot be imported.
        """
        try:
            from torchao.core.config import config_from_dict
        except ImportError as err:
            raise ImportError(
                "Please install torchao>=0.10.0 via "
                '`pip install "torchao>=0.10.0"` to use torchao quantization.'
            ) from err

        quant_method = cls.get_from_keys_or(config, ["quant_method"], None)
        is_checkpoint_torchao_serialized = (
            quant_method is not None and "torchao" in quant_method
        )

        hf_config = cls.get_from_keys_or(config, ["quant_type"], None)
        assert hf_config is not None, "quant_type must be specified"
        assert len(hf_config) == 1 and "default" in hf_config, (
            "Expected only one key 'default' in quant_type dictionary"
        )
        quant_type = hf_config["default"]
        ao_config = config_from_dict(quant_type)

        # Adds skipped modules defined in "modules_to_not_convert".
        # Copy the list: entries are appended below and the caller's config
        # dict must not be modified as a side effect.
        skip_modules = list(config.get("modules_to_not_convert", []) or [])

        # Adds skipped modules defined in "module_fqn_to_config"
        _data = quant_type.get("_data", {})
        if not isinstance(_data, dict):
            _data = {}

        module_fqn = _data.get("module_fqn_to_config", {})
        if not isinstance(module_fqn, dict):
            module_fqn = {}

        # A module explicitly mapped to None is one that must stay unquantized.
        skip_modules.extend(
            layer for layer, layer_cfg in module_fqn.items() if layer_cfg is None
        )

        return cls(ao_config, skip_modules, is_checkpoint_torchao_serialized)

    @classmethod
    def from_config_file(cls, config_file: str) -> "TorchAOConfig":
        """Initialize class from a config file. Example:
        ```
        config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
        fn = "torchao_config.json"

        with open(fn, "w") as f:
            f.write(json.dumps(config_to_dict(config)))
        ```
        """
        with open(config_file, encoding="utf-8") as f:
            config_dict = json.load(f)

        hf_config = {"quant_type": {"default": config_dict}}
        return cls.from_config(hf_config)

    @classmethod
    def from_config_dict_json(cls, config_dict_json: str) -> "TorchAOConfig":
        """Initialize class from a config_dict json string, got from
        torchao_config_object = some AOBaseConfig object
        json.dumps(config_to_dict(torchao_config_object))
        """
        config_dict = json.loads(config_dict_json)
        hf_config = {"quant_type": {"default": config_dict}}
        return cls.from_config(hf_config)

    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> Optional["QuantizeMethodBase"]:
        """Select the linear method for ``layer``.

        Returns ``None`` for non-linear layers and an unquantized method for
        skipped modules. When the torchao config is a ``ModuleFqnToConfig``,
        the per-module config is resolved in this order: exact name, first
        fully matching ``re:`` pattern, then the ``"_default"`` entry. If
        nothing resolves, the layer stays unquantized.
        """
        if not isinstance(layer, LinearBase):
            return None

        from torchao.quantization import ModuleFqnToConfig

        if should_skip(prefix, self.skip_modules):
            return UnquantizedLinearMethod()

        module_fqn = prefix
        if isinstance(self.torchao_config, ModuleFqnToConfig):
            module_fqn_to_config = self.torchao_config.module_fqn_to_config
            c = None
            if module_fqn in module_fqn_to_config:
                assert not module_fqn.startswith("re:"), (
                    "module fqn should not start with "
                    "`re:`, which is used for specifying regex"
                )
                c = module_fqn_to_config[module_fqn]
            else:
                for maybe_module_fqn_pattern in module_fqn_to_config:
                    if not maybe_module_fqn_pattern.startswith("re:"):
                        continue
                    elif re.fullmatch(maybe_module_fqn_pattern[3:], module_fqn):
                        # we'll apply the config for first fully matched pattern
                        c = module_fqn_to_config[maybe_module_fqn_pattern]
                        break
                else:
                    # Loop finished without a match: fall back to the
                    # default if no module specific config is provided.
                    c = module_fqn_to_config.get("_default", None)

            if c is not None:
                current_torchao_config = TorchAOConfig(
                    c, self.skip_modules, self.is_checkpoint_torchao_serialized
                )
                return TorchAOLinearMethod(current_torchao_config)
            else:
                return UnquantizedLinearMethod()

        return TorchAOLinearMethod(self)

    def get_scaled_act_names(self) -> list[str]:
        """Return the names of scaled activations; torchao has none."""
        return []
+ """ + + def __init__(self, quant_config: TorchAOConfig): + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + weight = Parameter( + torch.empty( + sum(output_partition_sizes), + input_size_per_partition, + dtype=params_dtype, + ), + requires_grad=False, + ) + if self.quant_config.is_checkpoint_torchao_serialized: + weight = torchao_quantize_param_data( + weight, self.quant_config.torchao_config + ) + + set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + + layer.register_parameter("weight", weight) + set_weight_attrs(weight, extra_weight_attrs) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + return F.linear(x, layer.weight, bias) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + if self.quant_config.is_checkpoint_torchao_serialized: + if not hasattr(layer, "weight"): + return + + # record attributes attached to the weight, so we can + # recover later + recorded_weight_attr = _get_weight_attrs(layer.weight) + + layer.weight = Parameter( + convert_to_packed_tensor_based_on_current_hardware(layer.weight), + requires_grad=layer.weight.requires_grad, + ) + + _restore_weight_attrs(layer.weight, recorded_weight_attr) + return + + # online quantize the weight if the checkpoint is not already + # quantized by torchao + recorded_weight_attr = _get_weight_attrs(layer.weight) + + weight = torchao_quantize_param_data( + layer.weight, self.quant_config.torchao_config + ) + weight = torch.nn.Parameter( + convert_to_packed_tensor_based_on_current_hardware(weight), + weight.requires_grad, + ) + + _restore_weight_attrs(weight, recorded_weight_attr) + layer.register_parameter("weight", weight) diff --git a/model_executor/layers/quantization/tpu_int8.py 
b/model_executor/layers/quantization/tpu_int8.py new file mode 100644 index 0000000..64bfa8f --- /dev/null +++ b/model_executor/layers/quantization/tpu_int8.py @@ -0,0 +1,139 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Any, Optional + +import torch +from torch.nn import Module +from torch.nn.parameter import Parameter + +from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.quantization import ( + QuantizationConfig, + QuantizationMethods, +) +from vllm.model_executor.parameter import ModelWeightParameter + +ACTIVATION_SCHEMES = ["none", "dynamic"] + + +class Int8TpuConfig(QuantizationConfig): + """Int8 Quantization Config class for TPU Backend.""" + + def __init__( + self, + activation_scheme: str = "none", + ) -> None: + super().__init__() + if activation_scheme not in ACTIVATION_SCHEMES: + raise ValueError(f"Unsupported activation scheme {activation_scheme}") + self.activation_scheme = activation_scheme + + def get_name(self) -> QuantizationMethods: + return "tpu_int8" + + def get_supported_act_dtypes(self) -> list[torch.dtype]: + return [torch.float16, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + raise NotImplementedError("This function should not be called with TPU Backend") + + @staticmethod + def get_config_filenames() -> list[str]: + return [] + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "Int8TpuConfig": + activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) + return cls(activation_scheme=activation_scheme) + + def get_quant_method( + self, layer: Module, prefix: str + ) -> Optional["TPUInt8LinearMethod"]: + if isinstance(layer, LinearBase): + return TPUInt8LinearMethod(self) + return None + + +class TPUInt8LinearMethod(LinearMethodBase): + """Int8 Linear method for TPU Quant.""" + + def __init__(self, quant_config: Int8TpuConfig): + self.quant_config 
= quant_config + self.quantize_activation = False + if self.quant_config.activation_scheme == "dynamic": + self.quantize_activation = True + + def create_weights( + self, + layer: Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + weight_loader = extra_weight_attrs.get("weight_loader") + weight = ModelWeightParameter( + data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition, + dtype=params_dtype, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight", weight) + + def _quantize_weight( + self, weight: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor]: + weight_dtype = weight.dtype + weight = weight.cpu().to(torch.float32) + n_bit = 8 + eps = 1e-5 + max_int = 2 ** (n_bit - 1) - 1 + min_int = -(2 ** (n_bit - 1)) + max_val = weight.abs().amax(dim=-1, keepdim=True) + max_val = max_val.clamp(min=eps) + qscale = max_val / max_int + qweight = torch.clamp( + torch.round(weight * (1.0 / qscale)), min_int, max_int + ).to(torch.int8) + qscale = qscale.squeeze().to(weight_dtype) + return qweight, qscale + + def process_weights_after_loading(self, layer: Module) -> None: + layer.weight = Parameter(layer.weight.data, requires_grad=False) + device = layer.weight.device + qweight, qscale = self._quantize_weight(layer.weight) + qweight = qweight.to(device) + qscale = qscale.to(device) + layer.weight = Parameter(qweight, requires_grad=False) + layer.scale = Parameter(qscale, requires_grad=False) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + try: + import torch_xla.experimental.custom_kernel # noqa: F401 + except ImportError as err: + raise ImportError( + "Please install torch_xla by following the instructions at " + "https://docs.vllm.ai/en/latest/getting_started/tpu-installation.html " # noqa: E501 + "to 
run vLLM on TPU." + ) from err + weight = layer.weight + scale = layer.scale + out = torch.ops.xla.quantized_matmul_int8( + x, weight, scale, quantize_activation=self.quantize_activation + ) + if bias is not None: + out = out + bias + return out diff --git a/model_executor/layers/quantization/utils/__init__.py b/model_executor/layers/quantization/utils/__init__.py new file mode 100644 index 0000000..07c1802 --- /dev/null +++ b/model_executor/layers/quantization/utils/__init__.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from .layer_utils import replace_parameter, update_tensor_inplace + +__all__ = ["update_tensor_inplace", "replace_parameter"] diff --git a/model_executor/layers/quantization/utils/__pycache__/__init__.cpython-312.pyc b/model_executor/layers/quantization/utils/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f263f1db61bd875c077cbf81b438691933f5f100 GIT binary patch literal 319 zcmXv~y-ve05VoD5My)z9z{G@DIuws75@O&1cz{UAGWDrgoD<_Xs9|H~0eA+Ug{5;Q zBo?|MbwFa`62T4k`+fKKHl2Fb@sPqd(v0B;y literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/utils/__pycache__/allspark_utils.cpython-312.pyc b/model_executor/layers/quantization/utils/__pycache__/allspark_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32264475302d813029d31f7d85eb27723951b6fd GIT binary patch literal 2201 zcma)7O>7%g5PoZW{r{WTNlM#f>kz1$#HMZ9xKdJZ5;sM4Q#Vcy7^Kz4yGb@)dzZKC zQoAOq1Q)~sZnT1mD~L;xkhu2FW#d+=wW>rV#32WymC_RznCIU#r4mni^Jd=6d^_{r z%)Z}zJ~x6P*}fG2aw7Bx5xS$L0pVa5$Tx^bJSCw#SuGM(vg9ok;jNOjWXs!1be;y; zmbVM^j=u}hU$>~dlc)18m~P%Kcz6d)Crp=MK)ff@WN=0K;u_myJ zfLVpZny86#**GRwwAZNITUf3rEhoeKi6CxmYb;NY4HsAFum!psFRXT$R+tBvB4?%n zZI2yGm+P-Tb=XhoI(^r|Q{_ON-Z_lc(f91^bvth7-u}S4Y006wRU`4V#w`j5K<}tqLGSGEgXDTZx2%x{70@0v@Z;SCRrS^>ty>AG zzmEq2&C~Y)9BQ$dA_#TaoKJGJ5&>>o&j+wDYTmAckPw-rtE0 
ztnB{-V?PT3`*|Q&YO_~l%%nZ8P&s zZ6eOxC`;9scc;%X8~}AZ1U_E7`2X=gu5e|IA@#$Yu7zVA{Fiovu_O4H6i)3-FX*x$ z@GA4VVrC{FZV3JzhAZ>TYYUs0z11><4mxx=jI3y430j0_iUKYRk{Yjl*!#3hPNS2# z%Jdola5g2G)6QOG0BobfPrZyh#cWjIfZ?zY`x#Wcmh9noIvYOn%FXb?EwLc51y12M zL`l@D@X?rMZ6)Tyq}`0jmH_Q|)X1=FGAoufV>Fe?ko4v8RmR@3SuBTSk z*tHL`>6|g#)2FUy)2nIr8k@?zzr6CQVx^;?H5nJS4M!(i!`aO{=D-e!2$NpIE<#B6 zU=I-ZclF|DwTdDMM(CB8RpQOG$KQp7R#Fm*s7z}dh7_l7S7t9Ii?c#P5;qb`Roj-! ziCdCXN|aV5&>-PrHj=_|eGer^EAm!}E`a-_>p5=4f0Wy?EcE&t;y@ z<@V-skLT8PTg0>@=dpVW4acZHKJySa#xFHJk$VFT&++HMh-slhk!ED<>x|B%O)C{z zrcBxv_M6D&^P67e9eFY~zvr3nn2t5aChwI^L`{dEh0p1O=k@c8CT$6&!HXtdGf@k8vF%CEaW?>5e)JMqjmXWx5Rd=w&)FCC721bK%{ z5AqE_g1+(Q_>3OC@bD&>`I3-d&<**y+y`)&DCWmQuo!-FiSh1}rnZYZ*xEewo?&>+ z5GF5#5j|S5*V)DOg-j~Pu3cSC=dRw!EE%o})P8c~?71W!B#se6!o=H3nu2bFxbqI`d%$=}h1=MymMiuv09s@;Vw8EG_ZZeM93W? z5|Ig$QEW|NX4*7rVi0D-?6i5*ti$FoH*FcUOj}2-@Xn3eM9Ziho;-LQ6GXKB2{Y=1 z)&^}pw03A+(B?tw79FBg@_=J`qVHq#sBiyRPF~6Yw@fUD)m6ZQ-?fO9&{wYa;hdR%n^*^NYFh$Bo@Li?uLo4eaU5_%(@ju(y7( z{$ny)JIe$DTey^g0Hay2NmLF+BLI5FW+;^+iVzD2mGLN@4w$sOp|M~%NQY-*lIE0k zSWw_ap5t1Zk2JS#d0?J^dF#KRZ1DhZoxy4Ws(@{afrf%?5o*T@wSYRH+9EKrML^=W zh#4@gxsYf+qN9FLH&g*Lbs{Mcku4x1TQ;bAhBeETpe)SAv;tAO78;X;v0yBCITQ{l zvtx*xDG0OhZ2vn{cSxLBXi$(py-e=LO!qxZxAUgBX~85i5qI1)$)!7YH9L4roDoe^ zFcURRkRcK$lvgYn#7X4#g|cf@HxPS=Crm?R7X~i=g|cfjV@7siAbo%xp!ngf9;gB~ ziuDVEsFQZ~R}KDuVDOu7qh+W*Cx|7%ahhR2`q_qV%gtXV$E@LT4K%LDzU7B%Bh#QDb7AjYr#AhCvol?LyJX|$i~T! 
zh-i*;P;+qxYD=7j+8Q@OZTr+Q4ssS}KD5OdWk<$=)(y0F9X~JI4-}41Iivs=;1o%i zo3|*a=X#mUA;odaB$}8P%EPqgemkvFm?6vH>)KhUcr*QCeir9PP$nVot?6)@Zb1?)=f#$h`FXdqG zWZpCb*X^rC3E`krq{B;_QOf>Ts!ctI{_)tVvg5fZqMtrR^VCtFk zWq1K}+>11lBsM=V4G#nyx)GwK@CAARE7VM_vb%Q}w@?@wxo~0d?csB82xwnUU2M07q?=+f&bCgCY zIGP&#q#vst^F<*hBbqyFH}vD7hoDzR{Qx_Lg`)Y;4hUmcq_L^})`m90@G+!m1mOe= zp2WO!llskLCaI$>>i>MTgQj;~IxzMmpYH&cY+n3{ZP*vi6wjKfe0TO5M?aW=CD+Mo=CNo*)EGg=>Ge}iA9Lx;PWdPx z2H}xwp;{o%oy2`8X)9cM>%K*8IJM21+?Ef|FT8QH20V<@v+y0t1Kt>No+%hY=qo{= zyniJR)42LN*m-lDmECC0-y$C|%FaA-bJkSN650Y&1x)%%wY-bL@Qh^0JNM$MTLDqC zN|XxI@`Q}6=&Mf4HmFl*JclQxCjfC89{D&_uxbnOHm!_4E_-ruy}0kgepv6JUpm}L zcZurut2RH>o2~=PrR%O5)mD?@U?=o_(0tJuB{rkzK2L~AQ~*s@Lg?JJ1EI)EM)I;F zdqe_tQ{TwH2LSvdLDz1SIXF*|nK}t-UXtB3(WLt_Hj4-T$owgn-AeFbX)yx(mwS%1 z56t52n}dxTQD8X;CcA5v!d;vgvsiUcYTM1hESV(%`+#AAYv0lR2mjJtL;GKkb{P$= z7`hgcLzlx6e|eVYr$Z4zxk9C&C`1sgznMQ9jmRNUqCAzvnXyo0f>)%79Mw+&%s7N! z7>n@Wg6K>ng5%di$`!t;X?GOHPsEg~&2#=VYuU5ed!G2ox}2QtVr&Yr{5i3*9tv^# zntzA92UyLQ@e<%Qgm=#Mo_}*#_cX!)MHBw=@$bMj1f+`4Hmpuzh0a7qsXgP~_rvUq0^L1@U!rJkKS+jP5>Lgu9lI{l86#%PzxO%g?O|3eW zBBs*~DbAeVu+3WBY13XvZ2Z^$_Jp={}^o8h|uVvsrUm<=cU@V;`vp zyu#93Yt|~Yw*B#BqV~TgOdfrc00&nKyp` zHr{BjGdp1!JJ3VxVLG^>;6@tNH5yLikuE0bAnwk@m+Xi1kl%W zoPX0+pLEu%hhBeD_SNWf)&!fBr+YjweE(1IOc-aPH432vqE)&3)1^8isFUW1h@sesX z0%>{Q8cd>!yii1D*Ni*&s4P;Pd~V8w1|s{pIdfNoAcr%&1!QYD5kpt{ z7LfHoLMNX}S7$;Lwqt?@#Rr_m!OeQ?ie`#LH2VdRqiIP2$>OBxm@=!eaJ7HH=)^KE zJQMI{Hh4NXB?&sJFd>A*8}uR|{V6=#XfhYbro(fqZLw{`QJHX5Zrbv1ITjsDU7w8H zyL9)`%C)a;C!R#1M>D+u zJ(?AAG!PFcnh6AD7<#}`jN(9{&bfeNkRuWy%_i?L9EF)Z&@l}05P6Ta4;dB5;I@~!*YRdY$w;ZX}}6OKCdV4Lb_Q_XGrjmnN=xd9@B zMs9Mq=)gy?WSz;R+349_@9Y9ZhjC^O%+x%ajmmQE8+;{)Hi~-U^(AAhzPs!kMAze{ zUHs(8@xctv-TWD_X#SjgnC5=otYG$2FW?q)@uGh(v$8kH@72PpxHxCTp7WMCxJZ%w zp-Jz76&p!oh)pfX;T&XUMavxQWZay=8#iIj`}_4~FwZhvq(pWO-XY5a?lxc@Fp{}2 z6VWrWJr7vgk1uF;=m1n(dcM}CnXiSU>xN`;U|e23s&R7?jmjFQ$3dDIlZiT+#(|}t zkOEfCa@9aFgG9+%9@u`2H-&3QHMbO&re!_KG$wP+L}J0QDM<7gBaCzlKLf5e2`NUv%re#rHn(-YdUbzMfyd(331Vu>8jD$%Vm9Z`qyqZoikb 
zyKhx2RxAYiJS4oFYHTw)@!6V6( z9c5YmT`G&||43O(`;;Yz@`AGDP@z4_!e$eKarU=T79NxZ(xq_|Ueqx0nN|Yq*{v*G zMp@!)1|zK|elyv%E4fmbETl}CIZ$0tBv?V#b)zz2j#9tv8OlV1_>L)0NY{@cjFy>B zU?pKSj+L%zR{+u+peGm>3BiaMuxQq+M#_;+;-G7f%Cu?j9gRt2MR2kxRQnPjMSFB6 zjgFU=Bx3&+RG>62@2&7+_><%JPTxJf?mD<|KItu4uDg9{VPLbQ{La+vsifU=t8%e& z>1x96TRwTO^KR!#L8APXl}Mtz19IX!YEyFSgT)UFwJEhM3_?}~)JE9w9!_`w1&Up7cPjs3%Zo*eg3)mf!^3fsK@-3;$6xXMT*y#*Y+_ z-iZ-skm|n~C>T_t|3IOK5o*T@ZK>uve_{B$JGsgJu6KnCU2k{w^_}aZ2lesHg>>-3 zaBn~4JcXWfUBe@9L+-Qgr9;Bm!GYo4^CN>JLt0q|E1N(aFtVXTns@)$(4~IOd^s8o z>u%o2!E*Yq8;ymFlAcMT=(zRNk8zRe!q*`eO33(%lRttLjLvDlh4>9=C!2LlhWJz zhLvNVjcy~HI!5XP&+LbplBB0B#X|ekQ?w-B&P#CsL+&oWAZ10!Mh=uO58s|kA#N&< zSR6NxEfp;(x64#Vd4j9hc3}S>Fr?U*tZR5?w=*7nMrdKi6bEg3HiWXX;Tz1Bx9vcv zjC6K0&sYoN*NL%hcqOI3A3vqHPdWQG2mPr+p2&BLS}wqSf;sgp&&`xQuK#)CpEYh1 N0JrP%nRZ?7{u>%>sx1Hj literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/utils/__pycache__/flashinfer_fp4_moe.cpython-312.pyc b/model_executor/layers/quantization/utils/__pycache__/flashinfer_fp4_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b9b498ff80e5e0974bea67ab5e9bd3d0684d6e4 GIT binary patch literal 4159 zcmbVPT}&I<6}~g$vB&rah>e{eqMeP{fWwa$OhR_^qfH1bTah%Izm=SzHSJ4VB@cP*ABY)Y!I|* zujF&jJ#+54cjkQO$G;B*S`ZA~^^=V3N9cRHaE7nWY+mCKdWbZXL>i~@I+x_w%x8t9 zz|~}~teE0Clyx)eVN^c~lVJ``BTf1hm+a8IIFjbImakpOs26Er&4;;ntl*^`aa94_tB+Tdn~nNB7I5p8Z+1nf$n>Ye5|8* z{JNFVGuEO!gY`Tnrkpm2eCfvc~MyA0lX{-2t+rHw5S+7=yslbsxh%-!8P#mwmMU0a14Q&PO4h(J!S2p%Q%UEC6-Nye6m?$B z=<1Y??J&Ux(ctif!G(lk8OlOpVbC6&Dr9u6`H-C!cHY%?6KoYoF4g$b2nn#Dzs;Wl zd5Em7XS4hkyT#n)$Owll(GfhUGa&B;xyO;uN)FeUn{+$2>9X)@=uuANoL02B!_iVPN_M03d{7154_aCR?Ka? 
zaBkkT58arYyr^8iI;M=n#);1^jgMVXF8&afmxl?xI-zf3d%!`df~MLX)SDWf&!n)D zQuFE*1LSYfCTO95PRFfmKAz9y`z@7BW2^seVdSHeb0aveXQtx$MQg^$#piWB8_yaV z))jmggP#o&*VRS9bNmhkVy38C8PF>L*v)to;HXmp4g>i`;-xpEJ>#d z34S(-0HN|`G68Da} zo_kUQud{HYoulL~iPL=Z8ME{o?--sLTc5q_dHSypKRT*6VXz9X>KxpOG@~uZ* zk4GQeS`YO6Xng7X1AZg0Z{_Cl%}3$Ki3c~=1Kl+#jf&ukciFo-v`SWc9t(ezx++q< zEXCKQ!RJz7uSZo;@Vl!>aKb`>$V)XNeJc*`!}~l$fLNG!j@>e+#?4&-TVgasV3Iymh9B6dX{`iU(OCS)OGDteG5W+=cbF)CDvhE4FLOFm%>X+U{$ty|9H$jy41Z2vSzN?Sl;d zfd0$10&*^=jo1JBumo+tx21qdwqW6UkX_HgpD4O!fFvRM{b;Aj^1CgaWE zpM^IOOw|ZFb>@{2A`JR!Gba! zZ=7pnuuX1?`{sNM7G!@sFc;v6iAXM~`97!5X|BaYUM8abJt8_J?!LKH+A2EVBXez{ zOEN9(HI`9HMR)!31zv1;kDu!hJ@xBFZ~b!U*9iTBViVNAP#MR7mJSDd_8Mt3Jopqyi9#Y^?Zr>LY@#S{4l@yE<_+QI1m#cGFHvN%+0I3$gf(mB+IQ> z2+5jOOoXgj(*WxsYt34)#$fWRRujpZv(|E+PXo)-L)vaYSza{X2TT}5ivg|jS$>v; ztuqv>;FMbv(p5O(q!LTVE=vGh0cuDtMD{!ppuysg{}IE}BTtTEbBpGvYzb|KBF!92 zq&0IqmVhXFHEh!yK(~=}5{_EYEa_w_vaC6=NYcqq@8yB*l}fE5Vl1wCWATeoIhvRU z5{%Bnsc3-;o(K@G(H~bD(gQ`()7s{v@_YtJ6!2|DO8cO`3x3M)K(t1F&QBB_q@{K3 zY|%_8i~N&RyfKYE~+xGc+-8+z%L;;GEg%e*(a5 zk|j&kwb%jz(gf6$PE0S;W0A3WU>8ugO$!c*@jj>{7?#SEqvtUe`9lwD-+{%6zaZ$jiw=5Baa394f}Qb&EA{QJB=T4T1SPHhXXkAH9f zz5NgN+&i=(?9U5_RpIc%@rQjm;qZnql^14IVdh6~{nM4dyRsp?Su~q0R{dDqi&8v3 z9=RgL7B8h0jb-;3+4<@nN1A6RWreuvo|@t&68^*(wW5 zc*kpkG1Dw>a5bZZBU$IR<8l8V<9#dr%$;=@vwOx$k)75Aqi2>x$~*Fmb95Sg{tu&Z zmC>$|u&pF3Oj+MPR#P1Q>P%ubAdlE@c)LfV0P=X&Sp4i9T6_ z715m-ffYsQE{MWzid0G`FhXmD4nw3_Xf&}XY1U{e1zG}%4my7cSV(Xfk-A--{p%IGzZl!}4glqm^_IPFGk&Py>- zloAmIv>mMxa7r{Ti7}KTtf+Gf(}*mj4n$ZZS&CkkM2(AS?O$wBTED2*m3a*7ZPYQH zp(MgsqI1O_noQRyWjJO$DH)uKPQtv=bW&n)!`H~Kt;90TJ#KDWJ6mY#&NmIHO#^qM zYSXT@)0O;1wJ8kw)`5KMpxQdPeoSp0dC;S_PUQFih`)iIIlkwSyXoekpSru>=L*6= zUKmn^p`W=1*QY<}QHKt1xL*9+Y_dCwX3`LP>LLTXzkB+l)48$fjsBT@{{^-GLP7A| zd_C8;TMg`9KfP%-4OjsuNZQ{19N$&w>4Ug>gx6Y&9_aeZIh1#cV6gFg$_X7Ze5%H3EvFT{MqgPnQzu* zh=pm1n%kfnJSz|-G?@XyLA5*vlEsDDSYk19F&0%QsverRqWeS^p*p2GKq*=-!A*;D z67-%(rIDfq;ReXp;EplOy3mRw6F_~bOnSbo-vM&ujZ9h%fefpXR(8-TK^vD%tB(2- zXU+C1=$-eWJBD|;-_VNxty)yI+7nsK`z$@D8=Ki 
z30Yf~&sspACrhE4D#$`#bl{Y4a(J}9W~7%`>}Apv|pgh*~|KW<2jvV zrzNnD#Piou(#jZGiu1tOLr?`YHy%+y$Ceb9L}@6U3|*8$XpY91tthVS+pY)7hwZ!7 z4z%Z41A($>KO;xu(Y;{!t{gN>-cZFT2`x)BA<0SzjM-2!AzuqAnN%uCaWx>YLVE9T zlh$BdRsHJHQ?Nz!Wr)zwhDGaBsmK#8P=oSvp{ZEhti2Yt##?IYbAT{)|5$t0|q0!5}oBv9LHh(MTGOH0;;>l|33+K-U3E z7J3$keF2T)k*!AYh;9^DnvPWZN76J6Js5&nM9@-P* z$Ry^`+NKAv)|^Z!AmahE7OonsJXvN1rkU?RUo=Wp&4YGC)rn~iEYP{8X1`*HV0s>g zpl{$f-CvOP3^TYt(m>J4gl3ac5l}{!>6_TxS|S-2okA2yBJ>rE7{%ZvuR6vvm_;jw z?JSKTgiGH><`3wszG}5->DI21E&_%aV%f>${^!lavj3O-nLPov4)fcN7|Yg|=|P*I8)q&bJ>?+m952Js(bgFugvRA3CAJfA9pW8duxL z3;jn6!Qp&xuNvH293ZZf+$Qn4tVQB0S$^~!?3(7=s;99Fqj^DTja_EW9-W%>W(K|J z%ubr;0f7-v)G~34wknwMAX1PGiyXLYoP@#PUMZD=DF|W*>%r_}On<=)jMp^tD;-_& z2t{@rwy``gWJRd8w)diFkeTkScH3R-X|H+ozR0^Ii48Orq@&b>67 zbI#`YS*H2a_hhs)jU!#MG`I6hfJCd_K$sE+D6KC?1--sHHhdaigN2B~-muxV zLFB4z#X;s&%1&_yHgKqcForcnSozn@VqGV&EDAl{YF@h+1m`=3K?N0lrLSII*yuQ8A4Ita&TkymU<) zwsr#nVBFbKgC;61#uU`bfu(~x86aK2m=X!i{+dx1<{3lR8ny~>&1dAxk{tB>QX@l> zD@fINA@$oBERTrRTJ4owfwq4QKjq&cS|g8Mc)fteNT(`vGN4Npx)=~tg#93TR&>e_e`ig6Axx`=ibcqOlLpb@=Vuq%`=)VNg;1;BJChi9EJ^NJn@7VY74Tk>o z=ZIc6oXoH%S`alBQIb-MBrP**_#&t^%le6J@xbBEGA4Tg0Q8l<(+2bhaBZmTDdm<= z5|V&>y=toRpPRB=<&FRo=2u=k!CpW@(a==+eI!(Ri3l-`V;D5dQ0Y}9WW0)$v`gm8 z0NSQmm=riZeR%9hY_bgR3iJAdj7>~2M*_98M)Yk+SO?9@HY26rGRrhHelZi1#p)*+ zwEp3N0_LYMR$u!>L*Ie^e+xflA4I_G1+wqJgR}3?=H30OyFc$9Qr$!AZ++?>`%z$R z3T5?3L1@Sgp8_&LAz5~hy4cHyF>gE2e?&yBaf|T7Rj_k`cWV7y3exZ^`=yRsUe#KcV_3o-Z4oeCiRrjv^7jS>z|KKn`>m zem2)Ty8*E?^vD@3G=!<%&cS))o?T)M+NEVbZ z4C(TGvW~11$i5zl+dnoL%9PP>)dkN6E_`^b`y{~CbGk5Ritfa)Wd*nhrHh9bVJ2*< zusMnc9=6awhJ^kJMrSa(fe~6rVY^mhJi4KZF4J_RV(o2VN~ZX-7MMb_71;7Lz6?j{ zX=dp&F36}%voNkjFX9OEID%WZ)|lP)O)UKuM%zt@-L49~gNK=!$)(GzxS=m_>q^jN ze|wejR?V&97SP+!<=^0^d;k&9emeo9$Ty(+2J*h6s_*E-z=rQ-csO>pn2;PFX1NO3mg?>2L|d*mpS`ViGcJPJq8{O{z=5SrDYwmo<&cmdC|1Wr9B1) zkM0r90WM`d3I4g_Sn=TTMh)Rw!~7ii(zY=(p-twcl0DRaakL z`LY$!cnq4r&6Gq$!g$?0Pc_WS%8PQM!y04l_ECUZQ^kc##Jy{|a{vE+)r)ZY5mvFNy0{ehMM{&%|S#M@; 
zXK!WUhI7_+%lBOOT$>0MUAsAJ!Pk2G@U6o|Gs0v0e(3tXYZKw(95)UnPVbEa*ALt{ za{Wlr4Ee{U))s)lmi4!>zBbGW1nP!v4c+y_OJ~uJMGoTf6rBjU;M<%#CvPnk-I!}2 zjsBbBEl1IVIWK7o78?<2B7HlHK7{iovn-uc>Q0CSf(2X~$;G&bK3-U=4Yn0egR z^oBb#YS6j90-E^`h z?S8g1!ad=8o_n4mf$S$IH!&%Wktyyp_nFxayJ2s+)34gQiY)Weah5)|*z%TM)zZ6Z zfhNT+G614~hF!5h3X}B)ZoGZ{?K|gHPfyW?MFK7yd%%`ig}h~_YT3Ez z#Qaf?1lkMD!9rK4uwyqM#{Zsy&suf?dgJH-29I$0pe&GwQS{>|+At>&kN<{zT`t-& z=O7JDMJGZoIEkVgp$0q!4?brq+DYhCX p)`qzYTmxs#^&i+IkQRHb+}_;S$%kom>~xV}=C$)rF=f}_e*rbPr!N2i literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/utils/__pycache__/fp8_utils.cpython-312.pyc b/model_executor/layers/quantization/utils/__pycache__/fp8_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c73a6211a346d968f6b6233461b4ab654374a70 GIT binary patch literal 46575 zcmd_T33Oc7c_#R3-&gDl$-+t$34kQHimSj4q)0-dNQo9GxmZ9INRWkvuL`6H6%;7r zX#tiy1Y)HL+HzF1lNK1MTTmw%!HGMjPdsCFx=(jK79Escm<~NrP9M#go&im{-N~8m zp6|bJsR!_ZRAhI?bNUgyb=Uvy_x}HX|NGy6?eRD{dE@EBy8vVnO}%v7%-Ci) zS1er{V-@=0vC3uqtCsPvUdI0yDd7sV?WSMp60SrEH)AY?W2@qu;+x+Ky=#*CAEWJ8 zxn*lm?k&I4C9FjWx3ZS4i*JcL%gc?~#%18Fj&agl)GZpQ?G8@)7mF_2Gh5?rv2F3akLjT&FhaL4 z)1FcvZbXb6(GX8t?{d?`PCp9g%KhB;1s!ngVqpkMh z|33WRkN^Jo!FXHTCXGj__Px&70S%-MwAq=5A^nQ);0MtYhcKVJm))(GTj5IM78!4# z^~@gY0HnjQL(7bu!^`+Tg&8VgA4h0q{6h2989R#k`qZ*DSPp+N$BrT0kyt0n2E2CZ zcm5koeSjl9jj`|y{vXHxXI0FpfT2ih`w779jGdI@Jr_5t^F=?W-+nkJwCMAI@=Wa1 zGNsc=qbU)UuTUy!wtlg4xMN>JIgV>!e=#`L*b9L3EZaN&zPo9c+S1GI&tGUgT4qc# zSiL5;^EhLtF|JR1p;G@tm(?D75oJBO>^xX*CV#P6XN|pt{(SE5GkbcCu`eUd^ULl~ z5|EBJ9mdWMs!!T!kL zP!dirKiJbBiT0)9yzJxZ8|X_#5`$;Y#(6}rA0Lhmqz*o_4S<~;{NV7=iF47Rxad1e z_DDBB7>)HrlPN^4+V|YaqwOb7MD`widf$-<(nL-?_sla-KYQ}P{;)|b-lkwOR|K#w{&>)|J ze{J&I;BX>Fkyttg2iZgjXrgZ*9_7WF6dxT(Qr?kdZ=`P^H5lR1SH?E%s7vS{VwsbPMgN2ZL~+D7;!(KQeLIgD#uN)1bMDH*TixbrgB z$x_IMf_3ulG9+l8(1b&0<2bwgKD=LQGX zT}mYS*Yyv^;)zK7<#^99vR#Kxi1W#H7ikdpjYLy@h&P<-OC;A(FEU%}&=t{&u>lXh 
z4!Jc`wufPiad&Nj$(lExyY}MLi+60bdAH|k$3(}tVZms!l;nN>HxJAguNLsOfJXa)`DhP1s0Ov2-KYgat$lpp_HRWHQ1c#3KFuqTyuN%GU!F z-#|tq8LP>_VBz>Mj3iYn#6G^3Jl844Rtjx}F~)u7=iI(|S4eP$@^%OQu9_xH`QXaD zKZw6;<|*@MF00$Rz*#NU2S~w*#*3Itm*ahB&!t4$%TkF%x8xzY7PTjhd^X3Z2-^0f zwxd=`o70wenU31rQfpE-If_@jy+RD8C$J;l)Dx(I{9(Tf}-ef>j;Ma{1cuuG^h zn7xKkD;fyJS^C8o**4+)g85iG-__Ov~1yRaWNN7}*v zm$dz?VHwD#7+1-C+c4@((Li9I-sDd^^}7zfREs6wduF9WM_p-G%(zV6tXBWCF=mm- z!1J^SF8R`&GQ14jSuSnVZ5_D|Y5SC6)UBz9dkHW&({8|!r>r&6 zHf1{<^`t$hr+)c9Pd$x=dZq$OUTGH=8qeZ|X4ISZe%=;6!D(RWE?QxG=ci_5$L8Fp zw3Xg8O7n_lU+bs;4d18#6-L-0T9Sm<$@37?Z!lSiDO!34hpt3K^WZ={DcTUNe;9}i zt=nPWqgOrAJru?2IGjlJ4FMxH#rpb1E4*UBux#g|$q4K$4m={P2%IiVzT?%xv6v07pE>vuFGu91{($Y>Un#MU~ie}zGL5TZ`F=W**hz4teCkt!)I&S zva5E?9)yv#SKqBz`F7>?%Dg)?Q}+JKn=9uoe}DA5qgcW!o&UiZm{>hoJK6olx&;g8 zC|j_?n7lZ>_3d5Pcg+NZs2C6kw?U!6(M zb$wKg{Uzuew<5W}bTT?!`*zdyrp$)xYbOUY$!zHw!M8SRU;BwKly`e4t<$zl^~}iJ zP9*X>u_-y6|G(Ij_?;LD{BAOIx0~W6qbEY^J%1YUrzp){GHCM^m2oMLGSPMsBmuBF z5i}Jcbqq=3XfXHuv9)_QVJFyYfuFzxh622eW;i|rcZFeY2EG_B z;Lmj02&CjC{P2V{N|*^lN~jEc1ja*qtB;X4GR*{%kncvU-=&zfFxXCN@#h`>Np$?m zyu&wHO}}<;e$|Hml-#C$6r}i1@{QV{E*hHS0r?P``LUilv@Um4as#ykZYmh{dP2wQpmLp@T!)jxwgo z(?cVdoV0X(lIZz(G87H5$njVR?4^*zg$XetKQs)E!`YC$_J;a;Ln z48=-X-X*RY@{NGIpoaBm$tZQ1KL|YJ8sTspt;*7eMV^Zg(#F1&noc<$LntTNby?@7P!68@AJL z?cK(mxyIdOt-D(hp0C&~RP3JPa}|5X&0q80t=*QZW${)ar6(}&ZV}uqdH>pMO9ya1 zkN3Wbb4c(q#dm5pEtGTq;(6bi+rBlQc{oo=-jBcHdH)*0zb0Sbl&@SlU%5f3-0+#h z;_(8Xc6feluLmw3EL{jl4z|a4iWb%?8vKRO8PVL`7fte&uu{F?D8?$-@q zV5oBmN#rj^{(7U(K-@=Vq>oxbo$AK}T~8D?XhS_Hj5Y+_YC~$B`7mlv73Q0r=DXhP zRj6&g7~y(Uw3X07CDy28fa0lhTd67LDvO8tE0+Y^1IufcG|Ul18g&@Cv_p>))-U^e zCWWTequSH9-vDbkZ0_WVlgjANn+C@73EM^UrM~!O{s;vS9m7P@GrDVEi?P#WJLpqz**8G;P{IO3DmWiJB#LBSW~8v@)^DqM0$4=+IE& z%1EW|bYWrlkt+FTFc{A1ugdyrv-Vn;_lnA|ZJ*jc-JNN8XYGx(Gu=1V<%-(I?fJ4D z(T9n|y)y3p#MP2_mrpll>gQ{B z3AMY3_2dMhRa!Y;vRWuvJ!7A3oNLaNbW9w)S5h%w(kzrT=lztRdDeyLR_Yx;2>R3Q 
z&$_DeB~{b0sh7qN=6&nOFI~Mdab!XJINk6U~i2PyQw%W0s-;l1jZa9s6Hc<1d6g1sT{E}A@@X`E@Ev)(e#KY2=c@)Tx*gJy!iblk&+>mrIDaIij$geotG8zBuG2JL%xO~r??Qe$n2xfpV%;Tl zWXY$DD{%jaz-J#DiN8e1j6%m~y$tAliVR}U^E5K~*U5MhhGqGRun+-%6mmaBEeyqw-AcD*#e-Drt_M%yl^S+{aU#)-z+Si)3x58w@ zwoO>oHoGmiYNtGG^Df`j-Cx~(7wh*EQ%}&)oG)506s^zu%kuuxyuT{%ufFRqo;)cn zxH&)YoAPVkDenW1t=PN3**xCQf?RRK-y{AbhH(B7i)~@~j2U7^1r@o_g!)U%uu%`C zV2iH>i=2vB=nc1anj2}0z9Rn73X!9L<}#hl0O|&@PU-N#Oe*;QTu>j6GLYZ z;|Rn2nkj7nveFKCt2IM;nWK#J#MD~y#Y{0%fh=G(cZ`|`$~Cni)MClEWTQd~AU<}W zLDJ+TypMgDoP=90`81QWdPxwR?`cj9!`4o|1Ad>5!8phd!30x|FcR?0FW`r#MwHMX zY9eq6*&%eKcVKvgr|uIg;3FS4GU7lwj2a#Q@4QOuAKi9YOJ<10EcR003Xm?5)Zm5q zfK13rGzchH(F8h45$PaGmWA*WRKD|M(EL`AFrBF_8U{qe1<^@_l~gXVOs@#UPh_hA z)qE*7=42}YBbrB6Ex!)12s>yM_`d)|S_MF4gau#ty{9SJQz z)}$C`X|N>$=L>E*cYVc#kzajv;?;TIZo#+v5k2lMzS=p_iM>YK5bmRV zv`R2s=#O9_{t@4R=2b#9zuL_NF?2dK1{OGIIyt1*cmn6Kyccohrq=)i&xM8SQmdaM zC&SX5KqnqNDXowe{te{BxFXUbd7Yf!gb_ALsKs04!5AP^bF3l~t~(6J$Oiqsd<2hX zggTefj&b)x9428=*?dvGP*i^xle)MfUtB(4+$t2e=9`-HE9)?$1HlE8f(hJ*kV+#` zT-!6XC$m3W)Rc8MF+60I;9GUqUA&m#AukEU>R8mh5Vz@+hINq$Ce8q);vx~z8Hqra zX&7QR5z!r?6*(b?*dmeGU{55%(==k62v{M5g9)C|EoT1qa!jTtoN)#acLO;F{Ni=X2jvVR_PU4>JPG@`8mNNC{HHJ>_1o zF{hmag`Aj5EI63c$(2@-09IzdP|~#EVt#H801^|4>63!DW`ThOL(1v9T0K$yz{jF& z@mosofdpxuYF@CwuE~X5g-pohRMNW0)rz9tcU4)uYI@i;>B&_{Pp&0pBrmrD(nGHO zRQ>y|HE1uTOt5QUlB)odTuWe*R{6@q!EVEqb0s*mt#xYPbD1LFjicUgVZC_-bDiX2EfuA zH5Az8dJ9+q2OK4)o6f$aKx0^?Y1FKNN65>P4>EN+3^xS^SQ3v2RWNHk)*BW|Rdi$& zUB=AU=dv(}RPQI;TJS@URKU{Xn?>`I047g|pCl=T&xoA(mQe>vFpcn9TaU*#78vI2BWWkxL-6yyOR)+NksdH_5zw!E^QuLh$tL@Zd0F zuyTq{2m|*GLGC)503Jh4BymA!D83D_Bd8ez&Mf9AWhdF+m}NEHw!Khluhv?Qh?M1;~q&YiXM~cE?>k?`{;_jd$ElbHif??z#h4k4zky zw&dI^#}4Gn%CB9Sy7Ki`1at9N$M~tcF8|eC6T7C%h^hI_6*IQ?T{m6tdUCEU^RAtO zYv-+!tZQe^)iJg|4>*S>4o{l}cQxPyf|JipRm}Tq1b6@qDeQCbsfY5Rv*V2)7b!5#QY@MLWUz840 zL=&}xcC|30hHSNX9+su@66$z17LXHAd%9>4`^QM|&uAj7hXpb$#ayr^Q!%qI7g#^`)ZOycGtRm2_t$-Q-JSBM1aryQQ#EM0tEEkfrvh2n&4wN-vht}I_`k*QBLXaK8o>Y>bv=5sDxAoUx*@gbVre|g 
z27DV*fF3<@TVlmYpWdeSE^0_0HUw+~Y40xp!9)<)sgMubif;tl*C?bbe^H2|ew=Xo z88i~Hb~Bq%(^5M8e4NlZl_@zINC(oU3)H?rO)9+&nM((#{OMrKhz*UF27Hex|0BrK z(IQQLIs~dFlz2HyK250JHe96iobsz-w-u+0({{GOmq6C{fMx{fFvRq>V@=-WONs@j zCbaV5&~%AXqNTq;vPwovHK}!&Vd0Op|l% z!|^c6q~0$}2Mf~5U~|ed1S3H&l`&naj5G+-S#DZK-f9n>WS5CrqCGuv&`c04i-lB_ z1dBU(YIOaFep?uCt%V7RIB8qfF7#LxLs5A-Rn}lEjw&d1BY{YYvg*nh4qYo!)>Fw# zUO_Prxc{Kk9?}lx{%G=oXkk(8HzBkd(0Cf*BtVmy4Dwk)aeX+OA;jmMswYg^yE~q@gP>o-2`o;r@susgV@jS0eo2 z-O zX663|slP}25OE!VkhOB9b(!-*$@;NQFelw!hG_+It}0+tPVZ!^;0%rJ14ChQK=9R% z9RfvGQhx2=)WNsf-rjb7Tduf1lNO3Mf-<}6nsB|j_u8SULpl43j8m|$xo&%;7SY!gkaoiBVeug--cf&@zxbGL*gQZ>E_n19PhdZaN zlE6fiUEGgdT-tZJUkI~c5U~r>`bs!~LB)51sCAIlx`^4RC5qEUzd-33F`F*_;t&}A zQUW}qfI<^FZqT>U7!P z3(u%hMjGK6{GR|6|3etz6?~TL0vWf-$dPe}j32|Gtt&}8rMx}ze@-FuWaPpOEptlkwMN zd;%j$2QqqFqv|K^k#)=OkyJY-t;zqNfJMB&B}ZUd8r>&Dc8TEKEZ}N9hB<(*%yNOxP`R z8(xY^9AVQai52T?meS?Rw1Hi`G%@Ll%BQ#(i2^Mf`Tx7^? zVDQoxvohE=z_uxM_;rIuQbCK3NJ$`Ut)yn0Se@+vauhZ_o-h;btV&z->Y!gUja-Rk z)R}gU)oRM7N3=rhT2Ce;ZF)~7_5jDs$Mo}4+Z~8PFSVXtYeVj(Qes26G^Ht(n@975 ztCYH>(MX}_5Wj_z>T#oSAnH(-=~4H0}Xh|eE&+BZ4=Nlf52ei`(rWP@KNW#YYuFT+pP$c~2L*&&Dr&G!y zwH>bC8b-aTuoCb58pT%lO?%b3;!CO5c{LYgeK9BXLrj%qiWSg3f6NM`*32JGwI~qN zo?b~5RjGmg2(=Rb9&M>}f)rC{FBoTf;Z~)@O6#P3X@5HKp8H*SrWaZ%MY475`=ypk z?iDp(`IM@VZ`c$46peOJIfGfA;qfrDV?aKqkvOs7T%VEH7Kc-z$1yvSeP?cNn?lTuOK0xLsG#N3S6{;2#YC3qWB67V+YAc+nO zqL&8yVxd^yrM?&yoGxM)vSNo(Qj!>SsDuW4WiGzLSdtj8*sD6vbhLoA+y23~$apG|MEQQ_xXDdl(kZ5p>0{5P_K;!8X#H4TQpd?v^r!ldS(g7GN_MB!6f9>Z2Wj9zpQlk9So2S3b7 zRw&mw8&8JIrOQZ!Gx6_`aRi2FmDp#JFvtvf|1KH-l){`$$p$nS;S%#M^4&wmzaryl z3b#S%uxB_H6+L7r6t$HCjgfugLvnIr~@CDH~5PuoEnFcSDYN_5x8$;+B z3oIOzjy)N1!%#umig>~Y`Ts};qhnaHRoaG*{4PqulI(2Aa7;9GG8Q0zpOX9(Mw0TS zu3h{|@+`kevB+q{f|h(A1{iSbIeQ>$E5BR3CRf~y`}oZp9zGwE;EK!O`@ENV0V|d4Q!!~2Q;hGr{LQg;?!6RXf zn?AF1_VRh#D#5nuj;$sWeP{5-;GM>;ckLc_Bfvf3p0~FO_EzXw8c$B{o36~9m{~o$ zb?&fwAE0P*-}uYpJ}@;Mj1BpQW3o%IuNrI5+k<26ciq8xcdg*A&1}fI8^#WN;xBr0 z-?SxDblsha=8BtUaPZs=5-kuMd+K9bwd1$L4XT>1SgbUl`bYYn(|N! 
zEctNGsFs~^^el{XNnI@mn+Q;Upo;Z!x0a>mgk#^|JfpQ0iSUM@^ zbRGGUzwGNGU&@8mw;t#UTAVaei>(G%cXhf* z4QbNULXQVpoG!twEVVuL#CyRV17!L}!v*v_X3#*^8&9^viJX#tKljqV1C%wIlsYG_ z>H3FS&6ZvRZXMMYb+~}i#4PVw6@KC9+Q-0ac&LpZ4X;_rZL}rb0?sP6YwKh3XiT?) zuP*n;yE5Zoblqc8H6c~-p;YVBVNE*P7?*tMwdrQ1P1EZZk2`{vUcY$supEfzA4c1j zLSli+N$&b|1Ke!HF?Yk)1T*P!(mS8lA>B3OM!QDIr6slmA4-Sy2=qs%RNHUEK<^hS zjSl(zIZLCHQd62$kFAXkE2xnEi>+p(ZC`918{G)v)SGU5czm!`&X?Y(GfL2Rqnnnd zNn2>nEsdYHk}n;Hj&9bBR~>nP^rrNN^yc?$%8Ui)+!VHt{K=!v=NY}Hp;={p5ra4^ zX3|&5P*ZF$?RjSJ8Huo5M&zoQVli0k_?u{r z5FbxM^A(XvlC+gVOP<*$6T|FgG-MFX5G71@GMOm;`)EJ@78!pEBkX4f`27D0JI>xg z8A`bPk0`=XG6;1SE$kSd|A_1+R7#R5{I?YTkH~n90;J=8Tr|ZB%7wQS)HI5QeS9ND zBVG*u|B&%tV2FkzqD`V{aQH4+F5rlOoy|w0{RL~mw_9T!P$Tpd;`R)PE*V05GSaVg zTu>c*laWP^0sg;Iq5c~gL@DtG7@}ziXZ?&Yf(4*d>wxGd8i{-%Pss`{+NJU%g{!xZ zPl7aIY21=G$;JVKMe<}ns4__mP(39}_W)FlkRw)66h(VO(47y<$dSqWToD_^^v-ZfD z_CTRU@So-6dlSSLe8rP#!B_V?-AtI&Fbho; zPYS*#Z?y~F1E8259emg0yZXY!3s8a9ob#+6JDB%)Cs%)?X8I%q5rVau7ldHzZ0Vf& zkD_z74_4iJLfCLr2p$zY$E28z-v~_apDvqPmpLp1TV`!iL}C4th`C;kdC%*g+&X<) zC~6YCO;n;%0Qc3P(g@f#eL*M+3*IoqD}u$>nx>kjkIpn_tG0fR|Nh8#NA3ilLX&uc zlN;XbzIJx%>~u6!ohxnqbI-bQ!(B**K=JJ4!A$9l`S+qTwi~NvH_Tp~+bL}A5H@xQ zH60%%@3>DaiuikmnN7<_ycaB*OrYN5wtTQ+`jil?AGh6w)W+F~vy*2twwbj;XtPkd zIp^InZd%U&p1lYH8<(aop{Rkcr!zgjS2J^RX7!DPP}VkkVy^g)F3y#G@S;%GF4)`0 zT~HA=*_^2mf@>h0GHzYuoUksGao+U{m7Lx2`sLRy&)X^lTLt7)PG3KL+Xninj4nk% z=w-+C9Xa>v2}2&_AGCPRJE{an734)mu8-VygbVSuU*Dc{H%=JtsVOC&al<_?Roq+q zJI}uJ!i^VZF3vXR8ukd@J&fXBE(Ajbjr8r~Gv>@GVMU7&Xc0UuvyQorTl@d4=2nNW z^C%4h&rysOZ^>^)r;8`gf*8*PkWc3FOzK}8pSdio-a1z!tllAbc8(oXq*0!SfX(xN zZmyRkOliU89ki_3Bw1E0A(G|7B25i0#ADRkZ(?#jj(SrV>3UKW+H1w5X41QZgDo>1 zY~gsz3={f7VA|3qnD%eEpwR(TmGqt0p`i5cygw#|%y3qq((hwZ__T2)drHOs6)Z{l zjX}4Gf-m4o^rK5(^e|)`SfD@X<1eJtix36h&D^LhrQ)=D*BxTcf-EQG57kQ#=XG#I zNMp#kJ?dy?KX=uB)zM?>!*!XL0vyN0)a|2=RE<3EI z#$5DI!TJr*(4r}Qosz!r8*^(*f5p{fG{~?nL+$qlXy?8RpBJzp8@R7E$_l_e(KM>z zq+Lnk-G&Ci1@6cG8Y9MkFK$C9*cY+l2~DDV&9aO-c!?n}CZP>p;>ez4SQKeUm+p4} 
z1p|W^jMpewuk2z-l)nNX;CS~(haf`_^bK0!&tkjf!FL5OeXy66rJIyAF}6IrGqR*4 zVC*hmkkHA&P@E($!IY+G44za-Nv!j1-=+9KsE|XbI|-gNO5T$gOvZ^(tY%KQAce4^ zNgC#*;>ANleLWY@MvBZn^qvj&^f7_cMTHIZoC8om83e3LxX#VsY1>RRghWGR#MrrO z#MFFb*NCTCQnSpi7Va8Z(ag>fb`jf_TE1CYWthgCU8hAu^N97O*7M0htUpr5$&oG3 zNy^o<-52WR2m3<}kYPnH(!EPbH5*gRCYPpjq)$p1i}ywe5!Q_+kBZjNGmH zlOAvC=j3pA3mZVuL``TPD?q5X5BzpIK@Ia4dm>MPp# zI35V-iHpX;q-ch&E(o%oA0%l~=;fy?l@Jt-M-wan*Dnb+u3TEL@3dWe;Ou4`K$;K{ZdlK0M zpDjs_b$(J-K{A@x(o^ZXk|^j^--PcigB*jC^51hT=}J07sp!MCLgNO}LEyg&4r#RLr-PHyAY55pgX z=h}b2dA@DG(6;|h+rc|U9YWi|8|N}TGaJF!HGTcibkEeWY}>)xMIAr)aNc_0in!Wg zalPL8S|^n6JGR`jmqYQ4#fqCxZvWptDB_9_8CcA$r|$2VPW3=?y?-P+7{^*Bkly-ckh8y^}6?UpLXwsRxPwNeKCVZlmZGx86OY3EB%hz zm=17s%+fDrX7TiOjIl5cR9!7KF(@sqB)*FtbQZ?A7mx(i!vz#PK~gPJ;J51S^MIh2 zhg1VqqHvj#JX{yTXO?_QX&#jman!!*#oYlNZi& z4Gr(3*z@Eg=4pg8pnE zVV4PkfnIExDW=0|lcfyKv?mo7TvFznodgOcv(zBPAc=(lcao8-6#|14574pmya&#jxESQw9C>RXg2NDphdzb zk{|^`p}=P>q}{7Y%h0oBQ|;BHk`Xze+EOW{l*HO%IZ#0!2G$xvmuN6iswMEQwibvz zC@pb%L)*6I5R8`!MO0e$N@_^Xj-iz#BS8qR+9whpRZ91;Fo#BPLn^b(G6fWjGK?@O zAxR-gY9yIh1=A`iu}dF%IEm=E2O${0WmvRBa}x$S`BC<1MU{+!r?m?AziIw6)@PFP z3XMrTGW$+k>+X(4caOA4+LzdMu%&OmwC=_dmeSrBemR8OL0Sv?c5{FYH$PrZF;0s@wj z_C#eB*6`Zpbs2!vZvvO2rS)HlPp3V-2CNHsz(p-48jTK?cr$8)7lF*j4xDT+Fpraj zEQQA8b~F>gGZOw>ZAej%Qu}QN1hn5L5`qX$ON%qM1axz&VazjxfO(hC55j0L;L@ zlZ+3_KR{G);_C@O0~w8EFm5?csHAs|3b?+^#@Q#mSHauSKKynV;NUU>hVLRDmCc?c z!iA4I9*Okk!RZ&lRX9GyG=QQ*7%Iq%W2j*o5Ldx7$|il$G!{o?f(zxn0dh68sh z4hn$|a46l}idFgQ#>~a*$JC#d`HISXWz~I;jg(vdENHSgLAqFKGR1(aPRlAD>VeZvFrORY5oSMr7Rez+7=&>GGa z_~}}*t%pd`*dgg298FW@qjiPc5B@ip5WUyc^s3-?GL~()LaG#WR*^?*&AhM`BfBD{ z3dSd zRR`r}jM*x2u|R{{s{B8>u8`5t!*(_I07=uc$|=;qR<+|oIpQ%uDJz62wwN9MdgZcF z&6WwXvs%l2IBJ6`L&JHMBdI_}-RO-dU45c48wuHe+wjkv&9Lg^g^9wu&E zVHu+jG8s68gt)&g?WFR6O0df|=r+)+6XRNXa97fOegy-u$k2=&os(cC*HNEFDNkq- z0Oo*LGoF%tPrVa1>VHgpf7%kWYT`rN0JPM5AXkGka7FqCg5s_(US>(YV>F0iD6fQMumm=Jpbe*r$sgbHF09d+PH%772I^f9uYcqGn71V!>EREs2C!J86S@?!dNV>qAXUu(M^bw*`27n9xh0 zYj~#tm^WRKhcGN5f!I|;`=Q#tMUg-JrS@Sd`9q)lS6~R4IqTQ 
zxM|eu`Ayo)H@2M#v-(h$^f7Vq#aV{dt&_Oi?5)Ss>t8|{miTRWNn0o_KHt_N->^q- zL41KEj?bKyA^=U7EEirn6N<%?J$zp`l^Z>Z;c*hbJbuB!{aXrW!oajD@F5tY^LbMAbb#jvd9j$(Vz)}qcB*f=@`M`mhsiiX zspw5FJn}G*#M`%VC>Q_(nDqnyB!yZi-fLuIauWe?haqfEZ@0$zL_7*@KF1IkF43sj zW*8o2q8!NeOB`Jf;?n?;pCz+ij*wz|2wzLTb`n9rtRVJq>}An(IdT!I5h8<$m}r5% zgJE8@Twp&o=zXQfQ1QGbUM?MqAb?p!^JOT&5gnJ=qq2Cx6yg+$HA$Nc8)!=niEgQ; zcW99$A`fi#-UU5JMN^Nzg32VYX(ay?7RW(|IJ@`t^lRz=>DB4LZCfQyXYbiOyO+&aL>K^(F>gY~m!h&_j(%AtDPG5G{I*bX|a6GnUZ$M%)?S9AVVl5DW?@P&A1h*QO(c294%?R{O2Fjk{SegOcnmh;vUGk}UKx6dzC1K>0 zQA9Zed6;-_0kMQwE$*K|Gqa2&Ve91do(N#--*v=WrJI}lM&!M0sLV{(eRdX22+O>*GvQSSl6>S}A%hOh`B506XOQnZ z+lO0I5n@g7uaTc6b~bW3`OpgwVx#(S(9j@>v_Y+49M3Vcqh0nCva|{FuLH;oE%*(v zV8LhFW^gA(UEf`6*p(JKHd~Wl)BJwZ&8C^k*(c~G%WnB5%Nn}L;-#A`Zw_A@of^IE zs%I|`=bcaD@!awsl;3iG)ReWKz^*_SU+VAUzK8#=y*~5oOfb6*Yt>Wtic7C`Om&PM z%iEZs&t%Q7r}MtR)yor?r)$2N&f2TS(|HJLy*TmWbj=%)apNKf3hk{LI|vHf)0%gA z@W<}uY9wBe1HkOj1vdZs+n(VRatZn@|3j1P}Lfip{qI7Vx!pGq8Gf-hKZYJeW`%g<nZ;wi1$1? z1@?qd5Zf_Z!rE^m(T{%nC`=se>8uN27~)Pmj%@5P%e(SnQOp71A%lxkNhdBvyorS20?B>$>3vYp1^G*&Qkp}T z&-L}3V~BANB6r6J;=O$kB_L@AMdp{dxlGd+gh=88gTrUfN#S_^78x)FTXfdYAMYRJ zuSicEsiHe5=!Z~kWDcD!uv4`@mz}PVfA+oTS0YxV} z++jqQ=#q~uSaX&a0+>dUAxPu)5nUvf%F$FGmO8meAi$2MAe`a>v^!%xAaS|`*Nl z&0YgzoCOO<$(VN}pl3}xVA{TAQPvIMK{7PF@e!bTtHFuXgTq1|I09DJ4~;kjD-Ret z&ZH4Xzb$Dq6jnNuHrOE~h}rG{%M2Sl^Ru>XFqRZLp zm0y`LA)_=>Ib~uTi9|!Ibfp+CT1YwQj;k*MegKJpqxcy`3y9t zkK+;C6wk-VWl9eBGqM<}jjzFjpC;ov8Q&n|n`AJoh!XR^O@>Br2@4Xn))1aS6uoft z$N95(Oq*W#5WUoz2p^1Kb?LbvmgrV6Ng53n6RHxKS`O(Z*57Qg(NHY>J1E58(27Hk zQ;l&y3v!l&hEKdDlkw>ngr=jprp}xfs_Gs(_W+WD*NJPs>dL!Ivz2XvyKVODZTGIc zyFPOna^SkO0SFJ#wI zvp){ji#mOakI|f!X3H~5sB|%gw%D6y(Tw#Wisv}lT&2T1r`#5VE#Jgw!K;}o*tC$Q zzw~(QAm6HH#?u-pU5?)an2gU9HuE#&%O(UtVUdzlej2#aaIKOw6KHluwz#kw@E;(N zAMukrkOm{x%~|#te&SjMss5{{Cr;kA!9RxU@;y@kk6#FaJ^d9Htjr<;44{ARKhtdkdmibR+~lQ>vH_G)Bqgh7KM&$X@b2I!C2kdRH@njo@Dult*K#I!3n0Q0?jo4L%^=5XhNNGQ3FTGQ-@wRQ_$NrV_ ztB7y)Wbsz+$|vSFWLNC^;d4KF=?5>}37mj#{e0zWO#Faz!h)&qEWcB=@wQ`Q-dB<7{Vk~Vnh39=Og0E%l 
zzy#fTa{0%ON@TP=(9zJE#p~b0q1l5#T4I_RAH4@@P}ss*FFHf|^cZx7=Pc3&%`J58 zUZ+lW1rC;vN5WJm5)IHL9~L=jBAM8p_oO_L*ls0DdljY=iOz#GO!f6>gv}PsEoEPn zMmw8Xe}Hk%< z;2ovG{tVJ!PXn9(py4O(I(bift0=o-W3FP;_YQpj*msZJ@jOKv86>#DZUKqtXUz7; zj1AEX*?Nc&CL)*nVySbuCCM=qUaYimtvcGsfR5iTb_}aZZbncOP}bx z2T@omX-=Fo4T4z}r`L5wI)wrW>!`ACgPXR*^Y{*P!N{V^GS%HTK1ASM9B)QEu~WlOoR zybIAEdTXs#hbJjhXG&Y}e*!pNG|J8Z^ceSZFEr<1y&-Keb%L$#j;-P2=0o|SiW&E< zwKx@N!Kp}i+&bx=zJQ1Nsy7qQucRhlMmSZqRWO&29i43a%*}c0GFx(IgGRA#A*uN4DyXm!5Oc z0o-|d?0~sxkeZsbhy!E&m(&5QtJGeCZ=r*zLM(jAbg$Ft6uAhbT9$3*{0{*967|YX z)+;}AgRP5M{W*Gt7{$S*Ew0@I09BgTLyKD+d%)*xaWCx^)UJi~U2SiZ-Yy}{TKQ?A z&3b*nsL#jt0Dp^`-dJelutl#Mq{*V~3)W5Bka$U7)adkuOb9DIL8r9i)DwH9o^U?g z6Sce9{J`v}!|YhaW=9mgwRjFdP!QsR(hLAh^-~E7LeG{v2$Ya{#^y3${WbKPh#f5T zgWmcJ_OeP4s5xt{S6;av8Q3r!Emoc7b zbmC)s0KH@W_hwpyF=obrqdel%Ttp?_z5pz+2N%Ioce*Rw6)eY{5~gjG^uUqm7gm4~ zbmNj^rGt>l@X&732XP3fM2N<~jSk4H0(rb)m*L}-lxoZqEYQ|CjFMl$V(SW>BuVHy z+j+i2F&X}G6#=5-8HH-aM!{H31yDtjka>iTqLpJS(Lo1Wj5Q|uRI9YZJ+zh7$*k!| zo^<^i6yqB-S3X3^#yB9M^R8OKRhxCSX3ecIe{RO^f|nbscHtO(-d--)%cmROZob}( zbE_)cRx4v_)9yALfkpv(rEuOC5_};z=6yA|%|GuWMFw|Q)@2UN*Y6POcjQ*?{J3Oo zuA~KzLpEaW)#AK<4|bpO71v&#dU?JqER=<3!nZbO%fh*`!{e5(x$my1r6(BXSF96O ztef?HG@MKS z^I4!YBp@V-^*n$LLz@^0e3-_Xp`ZUPIF>A@m5`vpi(HD_3)q#UFI`e8o<*|Xr?El% zB8?6A)y)%|-`F}C{VD{K%f@%+B?02sI!GRQyoMr>;=cAbcHoL+^m>EfT6NoXIMe>l zp&N&0j(_I}n11z};4>Hae))IHZ`J&9)kg;5iNjg*Vb+DxtDcs#fBY}!KmM!qU+n5U z*x4C2VDnJVn!{fb9qi}^?{_Bo4wR4IMRoI`&lPyCeGsTuQb{H>~vZPXZ*uk2do^E>bzfHygYEE{xzmlBa zCZmoF8wE^|?J(Kak!?E}?PR<_#%VIT$%vEDN5*%^_(L-O7=~y%501Kn9S>sE@=Q6V zq$cy9Q=H!L%11!#}jJ7)0)ZCX?(|X z!*k!`HCXdDH_53jm|(vr+bytz@$T?mg-*!@8@X^OAh{gma&m!^$rDqK1sA!%;xE6p zc53Yc!NWVUR@dt)0uMy^7|$W^Eqxt=ooL)EM}Z!XGvYvr1G(4JtqQ^%NKzlYa_tEZ|LEU=?y^4qjF zausSuF36Qg@7B7=<>o3@EO^LYJRnV|&K$EKgnp_3YO#%(u z{ZiINtRlh6$x5n?!-9EJ{H)2~yw8!jkg#%XTkv|W0naZh!=5>+YnXo6dZD^?fd${g zqfeXJ^O_}ZS6{Ejg&)Dv_`pJu7pmQc6*6kGV1oS~W(M9(Ua-K9N?2VwDQy(Ck~7+0 zuY9d?!PsV~TBzpyLH3CA3UWbBsh9HmX$@QtOq|gz#X1iHK~A)jMUxZNkgHxv)Bu;9 
zi2U~(IVe^Q)#pPE`I?6Oikkbr9)qD+tus09`HL1TWI>Ugz6Bezg6w2>m8OdY*NO!P z^K){QD;HeM>ZV|NbF^M?tz7UhKQD*3J7N>BEU+xV%+;-bfW^g~G^H}f4Tgq=O&lbR zXvS}5&Mk^_D|2pBoKG<4cG>yU9psd2%RIiqXdP%NM_#yX5Jn{&ZLzdk%QPQNQc z((hXIF5LGlj`6zjc)>+3 zx3Xz?$R*8OvORAIV8KBe6MpZltXZ(&7x0R!r(;ul7Hs4~WnHg->9sG-Tgn7W*#igp zH8mIlcwwF%ZJRG{6p9-cOz^o^9b#EpUdtS#4=vP}8lE@I48Q;K&6nT*wVS{8fFt)p Wo5}E$K^akGpWXBsN52fi|9=5F=39;c literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/utils/__pycache__/gguf_utils.cpython-312.pyc b/model_executor/layers/quantization/utils/__pycache__/gguf_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae5c01396de72c822b94c96f01bf2a01fb9d1756 GIT binary patch literal 26672 zcmeHweQZ?Mw%>gFoH=u5JmU{+gKcaRgCPWLY%q`u*nrJPVq#+gF(kv_GdBLf#%F8- z&N!W0z1w<1RFCtbGNecur_swem+G1-)pew(Ax}?wBvsYf8S9QamnWfj(@O3iPbA!a zsDHFwYoBvwj)yZgBs7((oMg7o-g~Y6efIvXwb$PKJD#`d;2AVs@7uRqr~4CnlxL0n z@<=e~bT@Q@PS6kPx}?AQE(7=tgT_myE|Xp-lnbW6(07>yqhJ;+LXN-)tY8&5!6xv6 zU2xpux-3HOU+B7W1ShzR-~yKw^1!tU`QUOw0k}4y5L{kZFRX#o>_QRv9N;SkU#?Kn zkbb;8<@^bl8_4pZK4ulp^OAD^y#p%hXqfK&>-rv4YCP__G8z2x)e z$TZv4vC?~rhLLINR-j>Jn)(%JtTN5c6=*n_rf~%tn@qE31sYzad2t0AyG*lh1saD; zb6^FUT$!eM1)73BuB%cggI=k+rmv}3LejPxeKNnb_2rk^$2>!$edC_dzTu(HYnQ!C zCW!4es*+>j9sQ5~fx#i_*Y&F+mDdU5Xo{yFlgM~}EBhx*euJPN*re0>499f;Sg)O$ zU?>K9zkb}J(+NfuwNZ+hFauvsMi$@|On$vG5GQo15<$r`AHPl+Wc^%boKB~dcUkVb ze*SSv!62A#TT;1Npk$6!sd9uIzhN9oB^btGa+qKQ;{@xs_;b+L@6_ODZnH{x{meL& zc*3fxpX+fBe~#aZbP5-z>o3rtV*O02?3^ma+{bBEv0C}*mgnc^#vutmrZ0~S2!YxG zl^IH}^K<>m_|IyaS_vcCV9fuYj&&SoW+kk8RyIW1VcUw(3vpscB`E#?h zjXXjp)N`{||8XnypLX2S_aB#~jPl5uC{VTc3VA|%Ka2k3l)lN@e}(=6g+|qXg{$^o zuD`%{G>i@dxo4=?o3wZ? 
zU-klh#_+$>0)qTY?2rm#Uy+8=&NJH?&;_LPddVOLI(53@?1{p`;wd-=f)_@ zh?Epd|6-62@yL4Z=YP7ljb6IE?Q-Act)m_hI%MnBv0b|x2X=Y44fb8wcKO=q#o?iC zR|W?!ZM!rqcn96ytKRN0=)Y})o@-vww{1j)v$u_n_6_>B_4bbSxasQ)m#-!5*vp zxo45HP3;Qq3e^Vpgsp_DjCj7~HpWaFW8B6EuF|k)`URk8*eN#1hAKn-ai${Bbf3$e z+8f*(8o9nN!IcuOG|rVrYQN<+#7rAv+=d5s$5cDG28fdwGJdCVX|YKE_wa`R#MN2)`%La^IdeH4q#KZ;Z5%lFf1ZmKeV!VcS7$ zJLX#FPseSC1Bbt3_$e;Py?^+|@#*6ceWYc^8qePn^^*J@bEoHf<4nuF0>ZQ{GWPGy z2G$X1dT7z{uBl_eW1+VqwbO4$#JFv9;P4`!H+436HoP{{ezPp4i8E#C z+FyxtWeKhl+APj(i0Z%PYGS6E7+14e>p%u)f@iLGCHP9hSH}4bkb%uP)f#NQel%fQ zPi*Vsw#vX^C|+JsxSF;g%J7oD#6YD3+jozunPv(dacE2-H-O7{}y-bGhYI4@!) z#hZz1%lAgT^Z7?cBX9rj7M;TxyzqX(jpFIz>z6)g3Xg_czt1u9&;3`UnYBJN>T)ap zY}YvpZ?MyBxF+gc$lVe+x|myhgPZ35l27DrBDtI5xtpah+qCWDhEMj+>>q&=#S)OU`^Y+5X-`lu=5i=LTzd!D^p@wts`JQUk-h?E?jJichld#@?v3$K=GS@JB zjg;>uw%zZZ_};8@RXjB7avL7naL6199=X-<;oh5jqi_E1?b){{kHl^JVEjQ044g&Q zKD9QuHq;(03m1I2_NFwB>PYd{IWsA4h%q};waTCJ2mMedyCu*f)#=oYv(slI&gnNI zS4qLPKRC9pUZtYa4?K~)$kmy$xtjT=JI|AfeX+8Aq-g)-(M6lw@z>kJebJ^l^XySl zwv*U)zT1|nQY}=;mKSOXx5T*0n5ps+OddMX3eFNj>7_z+D$(ggX9%4TI&w68J`N!}s9R6n2I}qv*K!mX&YbL)RXmWKPKOI9 zu5gcD0_TQqG!@s6t7gXI*Nvt^wDbtKQ}X>u-K?QaM1j1@UsE;d8l&r>V$3}jVf6kmfsVi!1aWombdSjwz5f%pp$??yK2JXT}` z7qlZbLBp#%BCQHZ19(AC!7xxx=tprKqEA2Km5c|v47-><{Rnn4p4$dR0MZhw{d&Qu z#7kfeT+?q15(L*$xTtbND3$RjFn_}E6s6WscC1=zb4IBxD45EEgHe^YmKbXQHAl`k zjLB8%2f{_>qts7eP?}1pU&MP!}lT7!VJl zLnjwRgG<=&6e{Bv=egSF!)avDJCw{tC8N9fr4t8_yH6hNI(#xcutg6t;a3NU(}%cL zJc}Ol7+|`_BnKE3u%~wjen}Gs&6>p5k-!ST&wXKVxO>1S)?r87$@t9p0$-Ql z_W)*@-*a~?&cFPPC)R#4)^QTtdtSnKCit_2Kf7@5jl{Vga;_)N_X4*u=e7RK2l-#G6zeccS;g*_2y{thOZeZhTzVnTIc<8))fRYhD? 
zA2)x}I@20=Js%w*uI+@WmExV#yArNS;;Mv%cg^eq>=*SA*EYgze`q$YwIBep|2fBK zb3D`;&5p;XEBu1#w_gPAFWeM^t}da&{I!_*W!Hpw=fT$ zIvsoUG`bz934bQRy9w`JIDa8=zMq`$kMjd7PKPCN_N8yyW3RLWckGqZ_qqssCeXaX z{HZXUBCou9?=`}nmS#|dYOY9S^y&iF7&!ER<3q+^W5T+YSl8Za{;>6CYuvgaYWlQ& z&N%0qGe%#Xwax3vrrpH4XZ`|V4+NSQfyEizm9Ul&Yssy;4|m<%6}MJJMn7$einG3G z%Zz_+gludi*4^_Rgx$Yv-gX5!oSxy&lg+|7BIqo95!(HiS@@aH!YTv?-?|aMaX>IR zZQ?7?@X}X}dutMcIgD@kRoAdj{CU!^aTs`3f*T+viN6LK{K|Y&;5_vgb|ucYeKQj4 z=!~801oz$;;ZG&_bA&&)@VYzkdM|msH_l&N{(S)|SYWp#*d2u3F?Z~)CC;{fqrX@C zb=^HPVcT$=J44OEy)jeaspCQm8erHz-T!r4zR z9T-qPpz30fMDT%)X6RMvSydXbDU|8ItKfRbPgxIOtJ)~9Z6;Y7!6hN%fh`BKU{+RC zt*zgcN4bLMVJ%j2v-}LX;fhSRKfq047ft!h0G1}H`c34d_`v=|S?{y(v8k41RDGr; z4>59Ctm(4A#u5k{ekCSH;FR1lV$?jwtUSwO)I7GVJj-L$JQT}lmTh?qC)l%;+AgqJ zV$?j#OReTv9;4=2UTQVZ@)$MG@=~jLXiRS|m^e66y-5ufgj~dp6iu;$Gxcuv=K$t| zp2HZ#e$pmNJA5^C{*_=9A}BfuFa0mrP7$wycL}HHr2`m&U%H5&E)^&#=33g0Uc^!q zUx_%2*H|S$MWK}zszNN4hE}ktG76vyxFi{p%8zvCf;m|3(V*~(IvN9iCU8s^J;VLPxe0_$ zhjVHoI1%o=?fPBGY{`P7KH+$gI9{CZx>piAedg=(xZ}0Jk^A=CsY}62;X}6#zhh>Z z1$%A6zMI&0&u_llAGfy$S|4zz^AtmegZm{#c4QA|J}cr}eS+IdxV>LAe|hZlV{xuE zaA=XqeGirb(Ahf@OgUl7Bi3kBbR^EyCYVOTG=AawvgGrUICCh_w8-bb*B&;8U14MB z)u21UZy@}JNEaZf_Bg)_l-;b96yzH!B5X;5tt4zEWG%+oIzUQR+tjAurbWj3PVMA{ zkZaOA*%-8i^@J(@CG5%D;9-R`-}0~$CHyKU=Lj5pSgo`1Axqdb%@J;0#2E3+^qeEXffWXpcS?T?xEKfchTvV|`6e-EGFFAF`i20TCaLchvlm|mDI$`X-ffwiO; zX2B%n+)i0qO;`k&qyacVUys8~I+0%PErKbv9%{$oXH%4PnaxUXmgzyMLP3O8`kR&Bm`Y!vcOaieNo+DD zm!;=nK~`F9709_#FLHh})ZL;el2t8UqS4}@XhPKs^j5CY^=@Pc2RPePHn7h`kLL7XoB6Lb*RiYUp6dFUW8udzQB92c(v549@- z#LbR^;gRq1u+K|veZ+r?>DeTG=Okuu8J!?H@1XN8I&Yyf0!~uzN$M{o_1zSw`*0md zqjOCi=nlZ=H-PLD`1$6c=KuHc`ICcgWdnN6N6yH>NC*51ZDPhYKY&Bu4VLR*U%+)x%9h5-!g7-WC z=TC>%hK7j!xrBWyv2Ts`&sWU1$L;$Q_M^mp^j^h1RLsWh9lvf3GzCT`EAGS27#O;Q z#v`>6Ppoj$XNKQ1cbEl!H;4fF1!2Q7&c!&tAFOMji4?GWOZ@Qa&8wjEea91Wp%65AG5BJ*7GboWSSuo(pH@UqMlVE9 z&Q(NfX56rKw=!+V?2}UdlIkxl@MBv5B*2=K1i0Suz0Qdny-sDL7pH0E0X!*5c-vuG z(~$5CnyOsMX8&nat52`WH3DW|uF6$!SFW~Ix+1#QM9Gk!vKrEjP6>mqq{_vEbx(?` 
zaY4*@Qqoac+5!myQImH0o>6W&sjX2s6sX{_EIBgi`b{t?KGha-7Cw+C6?p^CDXAod zmDIT=72dZ}*-KI9g2bhiR7TZGjGBikA3;8m|(kh(ysl(0MtHG;1cYGX{-CQG|>%i7)QXQ{%Ko}-$u z`Sk&V-*!?rnv$yfwIt=#84#`0Lw-FdcA+)!0aa~on|Ko5RCtj>Kx*-W08vua;p>@t5)ll}>Lnxbr^JM~Cr!bhh&xh}MITyuU%n<*K|l>JK|<79h#I&s$xAlk zJ)UmZjQ1(t;hr9!7dFU^BfeU6>cFB-rpQs^ERxl@5A6xn;8AM_y&llx$3XY*;pbZe z4U`={-hW2g-TZYQv`6&e&gr&9{$`TDIl5U^&v)DBwX&d^5K{7zUaiIr+^=W65DhQPr^O3*QVGE^PhmatYJK-wJj%(>#$ z`oO^l_O-Vzd^mJ-DBAt6E9TngM&?e=RnHF0A0aOsy6gL<_MYeK3-=nn8jp3p`qe** zopXPC-V^H{;<mbWQ7vS6)ESiw91RqLL6P}7&7rhIT$s#sk8b5?F3!K~vu0l| zVe(g&^Ho${D=p3GUDlPB=4X>v8V0PkFdM)wD+N53?XjkI2{ji81DyX)^D(KtSvI}c zW#I!gaC$*kEnM15DBLgrOBIe>cwo_$moQMOvVsE_Q&@}?B_CC3NU%rR7o6ZV<6O81 zPS{g%RG}gBNohJX-N%m)OtXF*lTHvkN4|a#kMuS!ME1}Y{bqTCOPh#u%omBl@lGWD=U#Rc=IUlx>UZ}yakflF@ras@|LFZRa$&`z7|d10?J#K z%2%6LR?cSRTS$4!Q~4_Wwme^#X1;4E@A_1}+Pt!|G$Y?4^d>6R;9R@BIFM+g6dJzic_j(MZjFcJ-zl+G7GI(MNN7g zv#m!5&&N}RDOH`qNlBmkVxPO7z6=h-i*&@&=LXNtlm|%Co@Hc|*>I^gX}DAeza8+a zhhGEycEWEL{2G%60pbOS7a(4Mcmd)Ch!-GUX!KP;p_Tz|EsuBrs4~r12gENR^zY&4 z`y6DG>>&JEnOIO2Zc`{fXiqR@geeP;M{DEE)&x^enEKB;e&2PcE6yAMs|jk0F%miy zgfo>DgrlH2&TUU{y9l@I3&WSz&#iIpAgF7pd4)b)88(FgTb3sH=Lr8?q<^k*&J*Vw zQ}z|%oY3)58DZBa*iD4p6fK$Sn0Ln6y`ayP6ux05^irstuoVfmhOjl!&2yf4SDbxO z(?r$|Cq{$42(WWQb@=kYRrG-~+#GI)U%~X&2plum04GH=LTgo_O*bE>RKg^>2P_}r z;JD#P_z=mhfRbTZe?rh&N&X?u%E$nfHa3vzUGuf1`bFaW6=C;D$^X1m4_NEuS!lVY za6TO6By1TL;wJumKLvM)rOR#yinbQdJMu%2iPLjy8y;RVAt~tVfN;E=w#urjtfnB@>ta2 zvRXRal$A~=Xjy;J;ld0L&xBb8(NH0k@hm%rXwQfQ?<$7BYHe{^aFYR>rVs_%Vg)d0 z-PokEOwkak6rCFgd+7r-DeH(;($fH_sr1rXUA54OI3wdxm2ai=(2hugxYXcR#ien; zMgnK6lzyewu%@)KU=@?n5KH=6HU+rTF&wp@k?eWsp(C)R0XTzO+G6c;@u{@b<;tFxYWM%Ka@kX9*;Q6K%i9aem#Y0r?d3?N zrMA<_p4Lk3m77XSU9RPIRF`X|_Hw4uQkN@xS}V1eYuSE*mbP3GH#k|mR&2v`X)75j z+9`C3Xr<6O)2+RXP7#F^F)~|vVEIKA0SaBV^uSh)(ka4yw)A90LP`mcEj{H%E%{a? z_|{TZLCfkPt1?oGifrl0>WGwbAX|EhsHP|pvZYrj*H=->WJ_;N23@xFiex&7r(%7! 
z^old+vZYrd)45hj&zeD(ksh_;S*xro*z#~w74Gxo!9Gvw_P73Z8Iw79{dvk7pmqBc z?DMQs;z7JEZ4*xck4n6yVk<7IO96E5q7ws0G83XwEZqvC5-W&XvJDWGXG7?v#zASx zmWsRrrULV^P8>xC#pNa3VOhfUSu*GU7w}e-`y|3IJ$p5PjP#`Bd+?Ub8S=h~r_HG- zOgD|hTS!57i}F|5Et2f~`x=$gl!H`4P8xjOaG)Hd))A0eM?h*F0jYHaq}CCTT1P-? z9cdK*C1m2mW%CK8)t#_qgsoW!x>DaCe-3F?ePSSIHfO=UJz;Mo_Qv^=yIpbnDRoV4BT7^BF2*9txcd9SW@>Y;l5J52EPDu1|_)isI~M zMF>3zX9QY9C4^miDqhW>?I)5#-neyN$5hz?R5*$4aGH zIK>WzIqjjUP+h2M@=S1RxQbXyBh4UeR7LBeRgo(*$L1<#PQZc8s>b z*h7$q)drjf;57dz&UBtlKRVAJ&zY)c51A^M;HeJ4;_{sFq~&0+Kguk%#)*(+ zc|VQCmWG^{&Re^d>BF}&ic(*yW7d?EwM(qNk6DqgRcU3*9WLbow32FRwQZ4xjO;gz zret8ajtBZ4Za_t7L?10%oOJysm^7NpOhp+9aI@(yz<((T95!3&2$K+`V4;41Y={!6 z9L!R?CguE)vNdLstcEBBmFDCP5cqNw$;!`4=S|+-4Y!vXAvATlkakPKrSs`+XRyIi zFt4g6(zl>ouLC-SOFaHJ0F`z&x{zdq^Lb|flxLUeWdP**kmn!vl z-7p4tEg9msK!p#;4}6>WD!fVZ0}OomdK8o&L@FFm_)j;*sOUhY1}b8pn6MJwPXm0^ zf+j6^7ZhA=Hi#+LV2G*xxcj(tEy@P;^8;{s=$O~%?i;Beg6l+k20f#2X%rksz;nJT zGqHD(nvt#)bzh_h2~yrc=^%k&(3dpyU4V_LLGcsh{ww6sU-O~7vK;D*2QbR1x@HRU zcVG31e}eF%2=o66+C|!w+C-dN9y|BTSf}_P9}+&U{ABaY<^^UeTyglqKCtj_zf%e? 
zgxQ~9S_sqfH{#!pe>EOwIucA5VY*^xy|Lblai$M$uT&YA70(tgICji6#p(~ujo+=k zGXXEeaX8_4nK)j)*B|RT8+V*r^&Bz1)#X(pUFl*!1A9b{LV#?Q;o5LzXe{`4!toq& z;09Gi+))p=RZ_{IX&TQGmnB%(fWouH$L5{0Z4}Rc2e)Ar1&cy^ZjDC7o8vLp=FcjB zzwyq-1$GZy^pT@}KlQa8>Jxk8Jn1@5j+~DPUefA~^$(Di0g^vBX~4tAlMU~%aJ!{r zJ<1feNfEBTcr&`3*j8q(()W1_P*%2}XPr(@lar$t$ZhB9QFO6AC5j9tD`qUY1 zjuy=HeYW*(4&3qqcYJ(hCv_)C^$E%eXRU+YP)D!}j(RlTEQlCxT4Q`=%v$*^Qw6tQ zLaQ)vf_Q3oaQF4}g&>Hcln37_s_KUqKvgiFrN>XN1x#N8vf97)&et3vD^& z38NqGTb|Xo)tHhFdS%?9!|kV<+B)INIF($F_iES3x4@a`jd3`$CoR$x;KQj0KU3-B zw8)^V9A-KQ^n~`4T zE9p4!wn=(XBfXK1-o!*Ni=p=g&_hYowv_H|Q4>K#86I>4zk>(~5XFV`Q?|8PjZLGO6P z-(GI8}Ohw#O89AX){67k)B1}~ru2`<#^>B?X&k%4tDl_U`v?Klx*&QeR literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/utils/__pycache__/gptq_utils.cpython-312.pyc b/model_executor/layers/quantization/utils/__pycache__/gptq_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b1ededfe770aa97278871bf3f241b79d49ef5aad GIT binary patch literal 5958 zcmbt2TW}lKb$783ycPsNQg{iVz^4R?1Z>f=DajTsQKD_xq~yw05==BGh+UAN7LVLr z(j*HuOyU`njb=oBv_dAzl=~k4(n#YDjiC@qAm($7||VVho4K}#dguPXhcY` ziJl}Qw0z7&y=G)ag0E@KE_wt9{5TEMDf-028dMv%ihj{Y#?7-yi_rEl8x0tcXiTAd zaDUR06gfv|7du3c7skRNkrF-nC~45H!hO0IxX|IJWFbVU#_@PAm&!~5Xc9y*m&oQ82t0=42_==y0PLuvT*2A-g%1{T zBFx!7_{NosvC+3Lj=uZO#9I>~gKBv%l@a6kd|Vb)_lKEp&BilIYEcww)8nEtofQDB z>5T(`hTraALVpJ-^>;@okMfj2<LegDBXI{1tvk()=~x%WRu;a=siJ{F7z)BGV4 zOb65bA^H)c+x>Tm<{fax!88x1SMv^V%NCud1<>k0DG#$IkJC+Whm;O|_kMXC%!y!u zSMWByg7siLcm@7P^xkb3&;knCCRAoJrO2vf3TJ0?F&W$gaXNA$2EOKL5ExOdQ#s{Z zv9xC1DlIRh7yCZU$g{az7Av9@l3&**`E#)nr|#}13^H4LuCks5QB-vKP9IB zlvvPHqOyyZM`tk>Gm5k@5X@$zh2V+EP~?@!@F0)~ek7)*rj`9iq~ze<;mIrx66Z4z zoSapHQ&}ar$Z3a%Lk_ilN>pOPLMEP0C1Tk*5#y8~s@59OSnb*=IhB!>cqSpLbSk4@ zVmg@E1H4N%D`8@csx=o++=wONMY32m5vo8_#6-v{my%;}N!6rz-|=%-RLixvG%H@j z7?3U6>%JEXJ0!t zeB(7SBBdrHxdrgvnaG?Zr6cLAAWAWDUQEmahY=~hAYwVvESSiwl9J?zCP@tp=N44E z{wA@S*f4kof_3cfg)L)e#n-jM-L!w{?Og9#Pj2@Om->dw-c!qCTa2q>w0-jJ?|i## 
z^c2P)8bdpFZ=r9i>%^9QsAw80+J|%w#QWF&<@!HgFFW5|;dZQt){=LHEo*nh-?cJ! zbNmV4y3Kc#_>Oh%$F@&xMQeC7`Sa}0vL)-qomPLIOeSz?yhJTudXReKu*O7}7Nm;xDB ztsN9~VSghU=y;GBHTLJJhX12(zCob{!~cn1!{_azC*}>yOrA{`D0CI&*%@M;FEq#` zXq|%^9)t4qRTN?-77c@^#}*C8Plt@EvF>72Hl>JZaHZf+up)nyQl?b{7F9MKR}#}I zkHs_?TrD9|X)tKr8T7-s_&DhXpi_;xxT1(Sqw;kiCZrNdlV`w32`({Itvd^V{C((_ zQI$ts$9~-LX~%Zw$x`RZUw6J%G6hzQYyKzJ)|$o@h(YYwoww#!=L^Pd&p^pDu<=3J zb7J$-Bm2l_ZvS%*wK#5lef8^WQ`@folB<8?m9p!l&HhKWH!AklmA|OCyB|8bpSZiX z-A7CAqZ{q7q zLdLs&pq!tePfT~vDw=5|aO=oXVOOreC~I=J#B z2Iw@ORHp;G7+t)#4L4m3T4I+N0WEQP78E^#&|jJM!t#m<3IqCMQ=YnjbaT)ZeaV>T zmg&54iM#2~1AOB(kO+0NYfoJT-~j?K+Z=!ylxc6GPYMt2kI?6UzLt{ZxvOw13Vut> zM<`^RP^s|K|Av-mekgL;A^7d~LHF0xyNJ@P{W=8PB`Qy?!ejjnbq6ZEn-&JLfPbVv zqJBb6`~-z))ewG3Wi?@`ygVJpLae5+GNBm^E^E4{nqQw1Gva&>pIz*S_*Zj9T5xGd zJp8(pO~fVnY`B4&1`;yK)_(tQ==Y$k$k5yJ+0f`l?;2b1Z#jBPL!)2q5`Xo*7ALjX zb3s(Z1j#VuY+BUfdj{e=6e>ZDF*%q`1{cSIaV+Wyf+&Q8=k~2@>K3diUr@;gr{i;C zP?;8kG6e7-TrB|!KsFQp|2`ikMu zd#5z~_Qq`COJDOHB=J0tSDVw4Db1iKN2H&jdi2IKy2A%-i zt7OyX(0v7Z;6YCJ9sV|7xLS5V@Gm)fm)`+TW^%5LmyDfN4)wge>3!IJW|yUXmM^-` ztl6tHvbEisSe+=GTF*Y>U#)bXfzW7nLwNRCAUu2gEBxpF{+0K3I(v5OYPL&*y1}5U zrO0>IC$MPVCZ*yS0iKXX`U3 z3LJm#N1nrFO$yML-S4dMPps~uwY$i4Ltg=yaewu@2=F13;P`jm^DT^~45|~#W2nYs zX|3*#B_w%Ya3{NjWka6DaitNE!Iv^KjS^MIh9aGrX%shmvp=Qo)j3<@6sJC)XYz)h8uSwcnv^-?U#;o>^eR#g{o!2RfNhFfpVw$M{5FLQ5R$=<*WKS$Qq-^$V1!zR-O%|E zg-MhhgMYFR;xTzVObRVbDs~V~IyMa-KlYRzlPA)YRrN!HJu8WFER~5(f?K;0Q?d<2 z)kUfU64dHQ_25LF1v?{j-j2l{&L$ukstWN z>im*FTo~W%d&Hmm!X8`?lI#2i!(VEB%O$!n=B7N^8{d_sevW`g{zlax~7{iRg67CFMT zx2$Od5vDryCy?hL?oGxTG8FYMXquv7O(BRw*Xlt`KB&|S$?W6|+zl#h1#8NJkC3$> zbSgtKKuw`kMoIw!@u8|AlTo>GP3u%fD=~?DwJ4;$T1^>Duh$gwES`o{Vkhzd^iW?= z)W36x>UoU3k5R{C)b$tze~Vf_NBqB{_y5B{Q-)_2@OzJ$&fhTMCr%|0gbAVd!?M$^s0u+Yc9p)|vIIpPCD2OKtsCf(nOlmR&P}qErXvXkrXT iPppnx!>hx$Mpj3vER2DO%~3TH$OLiZxfvksHU9_9)_#)! 
literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/utils/__pycache__/int8_utils.cpython-312.pyc b/model_executor/layers/quantization/utils/__pycache__/int8_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..62bcf44ace3c7a370384c2dea799ee545de24e54 GIT binary patch literal 21359 zcmb_^32+-%dS*9npmC7keMkbSi;#GT)InJyZBrK|OXiF$*|I5XAP@~wAVGla21Sw6 z7(-J|i5O1}!FVPDV{ZwXIAuEW)=AzJAC5&VKKI|Np)J;omzPHVU4w;hoUU=P2s$F`zuIyywv| zBSl@OSc;{?RKNVD`)Lx^_3Ox6->-+aE^HVz_8V!L#?)_uIDObWYU#I-I78SvYU{U+ zGW`rq>8PM?xLT>K_zpU5(Q>HYX{D?b+r}D$rs0i63dP5o)`>R~}-zHl>Bd_aMzWwDYk}h;J%VDn6D8R%qwYjvdrtj|WYx z3tCahmS|J4rR&5O$85d+Sy=bmhW;w}ZkFL)N-K-+TX`?n3}3x0Tr*nRUrSTL+Tndg z8pX%%V$0vAl^p%`L1y@Xl2&}P6>n1onvD_nI#iWvs�pg0|u3izO?*K^Tme9%cxbA(a6@p2y4v%-MTZNK3K|b2yXrto(VQ~Bg}*?>-1)m_K2{ab>bV= ziEq@7AGENWRim;sNDZG-M!ooEw?NOg!rwOd+YWzB5Zs_qp35v*je|C}NmWu4jP5JN z5)>b-FYh`td&fHQJJ*SCUMIe|LRx@Ad&O$G$I*8k+F%GWSR_yhAzb<+dw!P81eC;>7Znn z<9vL;9}Y?kAznYdtVf{O%9${@kp0)Kk=dvpkx>~%^oT-CiqeUlcb;XKn- z(E`JVc%3>%_Ompynp^plxM_%5hYm`prM^!Kv}P%UVa?HTa||{Z`4LQlj@7G6e-mnn zTjJIjFy=gsMW6+%pciydEB0s2^ReMpzQ=K!V7zJ+z?}w~xJ^e1wqa%sIRw`!$Ea*X z!%t?R1oL;OGnCiV_u2o0hr_Y>42KZDsXaOogOBE8V^d?nk7&tAJYJJzxEu;jN=E*Y ze=I1~js-biEIJa5_=dRX#F%eDP zhQs0Du=ku59o)t_>>6hWN3zJXp@S_nMy}L64i(^pGGED06_eap9`kyN&p-RxVxwbi zW1+E@n4cR8##*jSbUwF#q%+tS4h^)8O~o!nBW;(%;nB9yC>sp>f>(lpiCC0t3;U;n z9N$KI6PotNLO?eW3x)YM(i7rs9h;Ks{bOU{Dc{8yWu2*4ub-bgpDD{UW-GR4%Q`cc zKNM~Xi#<2rfabhraLd)9skkGxvG~$LwTZi=G=Y9=4Fd3Rhh2JG%wnITJf{G zzpQ(#*EvmzV=ETQV$YhZmP@PBTjn;*n3kF5nSHa*C7(<0y>W2<;LLM5X8RJ;EHcdt z;Vg47YdH7_8ca#_74oqH>eYj@oUFY+CTlNNSaDjbL!gz}7sq*CSRq)t{3zQNSt0tk zK~rm-Gwp^XRKHd!sL_Zpecy-)nXbbrLSG;z~2umw)3ATe!WH2h3{XhW;k{O)P z=)|a`_Yd%rF%b5Tj*)!a7Rbz@crDoi(MT*bG!dNuiotNy58DVA41~tGDC`_w8`prO zIAk1(El_4C;)8CG!imS=$GU*0ASVh%6d%MH<;};6+$8Md{V^UHQIU;sD7H%unH$BT zgu8KjA#Ml!@i^)U>amS-xMxo$Po|AI`^Lo4<*MrIQ*%@AeqC9fOv!9_vO8UMqi(+L zy$uWIA21&=w;VZU-xAX$GF^8ovP@Tw=}jD2uGx5_X1*qK^urT3Pt05bhF~vG#gbov 
zGE3d5SLbS$N*ctHhRmhKDGO|;`o9uOQLtGDQ zS)o?F$r?7odN2xl+^bmKGz=v8arZOB{5Pm#4J6hAYlqeVvkT4PUF6-N;9j{dLWH~ot%Mw!pe^UACMn` zf&PP6ju(u|%Kt4L?B#|vw6;7gi5E;o+CkkF_=mCt$O{a4+t8hMQ&d17&~<_T+lLL* zx6c?T!3t|zKb>2>wqfD06M-1-Av>|WXxX58XKN*k`s}5ky!v^Mp9`v~)wKDw@3~l* zKo^@yc2()C=*imh_#><*1{Dx04@!Q;O7nX+skP9sAkY&oBhwMlge!s0dQBX%KEf5a zt>BZ)gdObekc^x^G8817!)xbIW8*r}!H+7LWkDpOct#{TBGDs~IpTwTJ;+Puk-W!t z#m6d(gAPgbFps-IksWwEd4?dDm~S$71c>&-A8&wV1Dkz0Ro$3rn2TqI7fWxB&YVm1 z;xc_9`NGvBiJrvxva2N3nf!YCbn>lCr|8@P$xp^G?o{A;PGhYokD^E+Rgq3@SgrY3)9<4n)8y>hvvG}SxDydTK4 zXU1>tUFa1XcQ2mKZS2gJbUrO@MNgML_eigII9E)RyCQXZt};!})qHN&JL(=&5dFEH za(hAy6s?irK@WuY5c;DCr7nCf%fr(I<4RT&RcNDrNxr0|?K`!D8hJ&M^WZ|w$Haf^; z0I7BtcM_<0964oXT1nZy!QCBPA4I=_=_BAIsE4kyCD%65wQbq%#Q%=cnPU%2E3Vhh z)uxZ-N;hXqwl3Iz$bK?>d-!f;Zr}0j_7nG?p1fkjIboqHYL+WDJT9}9*j6Z;)kZ#k z09Hcb!4FP8Ix4>Ek1kNkAphp9`waCF-RCus?#iqIcrlDO?)D$iL^$BFG5MuylLrV> zxib(w1b;lTRUDz}bm=?7>|4pV-sf&i&QIoC+q0JKPwCJ1As_SyT=~%wgoibztkzbf zCBUl2r?9ywb-{*H@M;OI%E+FfxUT}C*UXV|F~XGM{ID`7SifWiU5E`{4h4e1L2**b z0|=WQ#o@P74ICLP;)F2)0W=)3(%CD?E3@%rT%KD?u4d8Iyl^__YFl!3h^~$wc6`!# zyEErHn6(@v11T8~o%+h*lYxRd3*L( z+z!_~f=#JKd9wyZm;f<_VP!?dF4%xUF{relAQE@P%9MhH0&)$p#dek}>BTn)BxYHV zQH_y9umao5j}fhA?OG-11T)ENSK3HZQ^+dGcK;7bRUZMP8};)ek}(Q01osbr`fry0 z=I?}Gc7OI3P{8WG051R&$jr8ZLNLVnM|~kyGWm%>AQ?bi$WQ9$fRqEFl9E*mFZz%H zfDQn-(jzQZ-!pX`5Mcf&Mq0`7Nt{&L3u-c+vIyRbLsXipql zc9y63WEf->8#BYFUP}t1tv+%1zO6iU=AE|0;fDs(wc}Th&%BlzOiziGO`@Y|VUOr& zTP(Y?=jVnO@8^@>NLPxEO_{2n8@x~QRpfDm5A1<;3gDh5sCNs9@PSm^_|`nK@<0@f zr-FZRT@kcfhYM(eb1HR0W1Rz(ke~l?y{deTmA+2v3E;>Q6$&Ucz(H008XQ7Zzadtt z49zeo3~L{uAT-5|u>znFE5HYWktJ#xTTouuf@%V49yN?x=9)+OGP2qht5cE%dQes) zK_LE;c$uzOXk1eU>hcRUz5cpn&=Oo zQNYw>ACBy2{}+Hy&?!RT=TO1qM$jQ^+8UV{^-cP@FPa<~a_xOqxO63Y>ugKDSbiom|>8;ec?Xc`^#breAkKo@vl5#r!* zwM!$hM3$*ceF!k=Hb=Qxu%cHiz^vNwwM#!P$8 zzBO_5VM*C{52uZpvUz*PpDS-#04$?9apD1EM%@`m;ez> z7}ch$fmJ~5HnL>`y6`U|jM!E6X|O0&ss^i3rD_Qyc0~wL-bEB7?hqWX`7+zf=Ct)rAKmgP@&L0DOqkz5w%nC~t&&}}NLXJ~f$Z?7d7xXGP1;AQ= 
z0U065PoO=Qfd>J03uudspeXPyXgjO$Ts5+C9=guGiH;wfKCgpFQ``W?1khp8!QRU3 zY>f!D0~1h$1WqVxslM107la6JiDaB2;*)s_AV(Cfzyqo-cr4%{A{F6ll0G=bbM07C z2ReiW8-_rr;zluyQl3OlNoHSOq|!rn37l~r=U$%m6<#tAtWyb8YY(WnS;+PQ{PF)C z91yE^QFbQ*X-jiV9f(l|>&)5I@uW{Q)C2fxU}j>eOUbuH!$y#QFm&}RiQX?Edl}~J znLP{qJK{mP3i2hakrdCx_p6->96g{6P1i>82?&{(IJ3&5nh` zIp;2vvr1FP^D>rwwkg?^YESWVo$2G5OF3KHlC49ubu4y%(tW#oY4=HS_sN{?R8a~$ z2e9$i(q}TUpBtJL(d%3)@N+{gVTYU#x-Hq{VL?caOF~4z0FLqigOg=34@`a{izz9p zcOD+5r-5CuIs(O-Ujmz%TX`CCFq>aIs%(Yh3LdEo_6y&nRWg$n1JoCmuK^BxG^C@r z5#lxEvbafAvIeUI)=Fs|@d)~ljmqLzAM*}tu{=#+oHgMQGL)|Fe%@agJ=h8cX~Nny zTcMKqG?=8Sg_fcur^)>C^>Rm5C2Fv-;o2t~t0j{WO&Jffbum2dBoAoZdCgH3S2&2i zmDl<~VlxXCl-OWoP{LXBDDy|4i6n0oXwaNDD@4Ov6kqXO)PCbk5r|g-_Y7;sve?4L z%UTsy9k;U<(0I2ir3u!8yb7(vtpGN5JlbFcyWsfPb_=#7e2TsBGX5wOeoQ%3Tc52z z$9P{0a7Tk+bah=g|h>t1Ekal8yfXI z-|gv;cN(NEG8;J)36t}H^(w?%_`v2MXMwXjJS@b+N&n?&i1o0c%OMt94je36tcutx z$XQr0Vemu;J@QF|vVD5`qA}PbFZpAhXe2!4xkyeYTX7sO7CMJj!$~PWF*X+E$gT}T z5{XXcN8%M?oT3_x*u+>kcwR2)EIGhvhL*yqXR92#Ah+nrk0BqBtg3g-7kTnyVB?6Y zL-3O6Zrt<3em(|60!T5BC%e#Vm`t94KN5+?JOe>8T6}0A3}-RW<>L3aI9-8+&{rf> zsUkc~@;+_?T%t8`m%-;XuGy%G3c^icQn{=beFHO~azxH!Ww-?wFZXqHh>|3rk7!4@ z=MV&(0FwGBXk0qqVGdXLR#SW?HOn&sOagExX|i39;Dw&a&CjLv3g z(Y)!Nx#7OWF>6g)XYEP*lBG?wv@P2mfaM%c*JjRqymirXtL?{o?^gfm3t&8AwLwQp z%2J1CuFSX|6g0(t(Xt6OL^lX9Urwn5jV2+g*vj=)eH4P z6^YxRG8wB8Hjo4LszTn4!Uo8*5hX8NA@NP51PDPKY(;U8ut_M#a#ZcnkeJ|rt&&=Z!2I>4MK?$7vCl{g5eI1*ml6CwhLv2lqOX#G-OJl zT&Nh-AvcpBtuq)<9ie1=;@+5Ij}920s${MCWEKkJqvQuug#QMm>`>*@l0mmTF=}RE zhfqoS1brBR)&O!=KacyfLq*AiU9ZjD(2De*mP z^T?-_Yv~0OuHk7|CGowgnXYlWMA##=3wu8<0A~P5>AjZefBQ5zib(z{i#->K(a*(07iAS=9s8hWq#|XJ1303Ia6x_| z9D@Q^3+$eYhe0dCYKgdumL*^}$oFx$!nxH5mbat`V8NLjs0cFF9sp9wh>8L)OP&WX zFN?8y=#-3bA&|wrjd7L%Vq4U{FK=YjzE1|V_DOV)L?4pq!;-ECWYt4Fm%vOYUP|;y z$t=s&U~D9NMhZuU{?Veh7(1ZyMhXOhB5fcdG5JHs9^N;q4vZ86hj`zrK!p1qHtKiK zxsFa6o2MUxV@h(g0Uju-;EhNnc*=pXU^__~E?MOELk?yz#PM)=LGsw;XatR*1fv{D zUeM*};2`gVe#)MKh-4x$BXAaqupgN856EvbbcIA%2*CgknV}CPiUFi_0<6XTpgeJS z=|!Ub56nU 
za@FQcNUUnh8p@Yl?rbGm>40kBu1&uwy1j{$_sc3%Q<;NeRqLWXSJs_4wQR3S@6ObV zbuFU36#z;{Nw#dW=-9kq7wvlzN5Q^4C5W!g9}IjLx*1xe7rS!XUKCv~-t7^cN735s zsmXX0{6_MP)Ofl%=hzM>zYb?=`|mcS&!)GF?#9d;qPumm@{Zw8{CCW^H{E?sY(FKs zPl=9~Wy)>8Tbe$Su9|DhoDkhDi)NWn-1Q<*?kZ3|aDoY3`n*`yBs!b0MU_yvs{uL< z3H#F{VwqQTdI4i~mtSw1Yf7J5XwKGk{+Ro-=^stsbH4;b0v2xV-yOI2soc%Z&LxV7Eibq0~^uys%UB>=i5b=A8Rx^y|exu#|n5yMB4@ zGBmaHUE%s$b8mgXeK>t{I#<=Pc;-(^?;N>Pb^A@Rs^{+EpBnzk|5NjypAxISDq6le zgJ)i;=1h&~-XU6cfcd^Um^7_aQO;c}wUotrZSv~mlDS4S*S!DAjq~&8KQ%YOB$@3? z_PyVMhWfWfQ&r_hAKc2hr&QXBCi*U>#BWej7HuTiQu#3HprT$0Jyk* zOFU!N?^>-6$s(A=gEkP$Cbm z97CEy4@&xx;FP3?$)|afrw%*1_R3!VlX3qDLotM6Tn#uN#%-j^HZQq5M0ZEdy=TeY zCAz!roVmL>=RTG=xm;C~Zb+A<4`=8!pJ`cS7UJ2i{_OdIY=0md2xbRIv%yF<63bRh zB>Em!)m)#Po4hW}3HN2YyjfS$1-5p8;x1J;iPcRXSLdoba+ckTZ`^ePl3Z;9QSh;e zvea8*!Z}dGBVft zuJ)}MXvYBneyuf+856*;V9sCiPmijo^5Zn2%{n&!GmpCFUr^mgy7hn2U3sL_@{=+> z_j9o&SQGSE+GR{+26lDI(rlbxIXD5ZN+P^niim}OA*P|NiVEwm`Palng=XLfc4_DPI|#=6)wVSBand&HTmoYo{05KUf~31p5c{cWAI8u)!sv5`PLl*lLTeylEKk0i1ygUGV@Y z5(|xmLjeNDtp(SJl%;WG4|W7G{|MgcokR%;ZqMhB!8M>dVyceuyn^BJdEo6V$YA+H zbT}xMB}@>$^3oYU1{ZP%LlALMCX#npOupv(t*6M+8BFnNMtra?oo$Y9W1AJvP&bN2$Z}xz5q1a6I@XA#Z;S^8%QA{p; z8088jDP2}9)V2yLy9BwEJn{;~$e@;7<7dc32<9qV03ctJQP))$ypUO;KKRH3&iCk5 z8#$Gv@#ykaeqk%uk!a2fv~Wwj0bT?`mG7}7WUS571y{ibSpW0WO>4!qeEYgUbOtH! 
z`BsJU&!7e#HRS)Dpnkmj&iLK-A5HxD009ZSH1}U2;jC}ZCE*H!Y~-adV-;kdXU|Yj;Yq?TCCJ-=LXU?=gEZdl?JG7D z1`Qj+y$6n0FBx7wdbWp*AQ7Q??3E{xA4db~P19SRHHJv_*EoKAAv2609PSfq1vo^I7Kk`%LNV!Q{czIBE@PZtm-u-W=1ssQbin z+j7T!+x{ujl{Iw1uLk6;LY^?;$Cm6%?n|Bi*1>f9jTh!$xL4P_=>DYYcGaDpKLqRg z?kv-tZ_ib7CAm;zdV7v(%nW>Z>E@+{@tY%`GVNJIdzNW`Ccy)zi(sOG6gLx2`_pGd zX9J*vU^&7hnN)A4Ylg{LcA?#?ccEe7^rG!f`)$Xa>7O?ID4uh^oap@JNq-+b?Dx%Q*?YL1Dey?}Gts12Kz>$hdb=U*@Nu`2IcS{s_uq#ECG zK=Et64Op%m3*OJYm7q)QpqSDxcA^iAjU9>OGGo&9^~u*WcS=@b6&}S)WDj5tN@mob zK{e)nfguz87!L+;b<3CEo%riDt;+8%@v9G3Ps4BtN>>61}p%tE|Y7yl72W8lP}v@VL9@#;FU3sTy){SiM7n4(~M3fI=KAdYAQG3 z3-|+dKs z{Uycxl5+l%vi>bq`S+ClKT|vZj%s>ft(^fZyjHZNWzUw+9p!Y_ zGW=`@{IbT19{dl$cC54KLF zTUHp1vQtn{bFw*IF4{J%V9B6Xo0w~FUVZbC3q!l8ox6Uplykuy==B$6Rrv? za-hE9#>x4Ua^Qi{HnW%f*hj^U`gyn@vsE;1dt}5+EA_Q>C7O~6NyrKQ2Uh27eX@SV z2!2(AF=SCHw_@l31y^nKWlQ=xQD1jYzu|8UTY%nVyLSHS`7d;g(e_ABnMzmC2j>hX z`?db7{h#Y-V-v=g5FeIUi0>ltD?oumqqAJAy;{4XtE6`zmf*zJf8l}{)>)E zx@x7PK)Q{tTiK|PZh}yr6ysl~VR-RJQflXGKezVKba`R4Fz}$HY{iHk=z-0(VkTY- z1>X*s=?bBT4`gCAMv)=3$908foTllmD|@K23i56DlHfii*hzx>mEdzEcpxAA;vfd| zttKg#pjc*!E*GI?x@~mHve~wx$G6KZzc;|ICt!GID|tUKT4y%T@H0(Uzm{S}W9=74 zNW=mwNcNSl(Nx2>4;?ohpF3WsX-8oUFz_HheT*1@ae#RM)_yBy60|5mD+$_^iOi6o W9cS{+xt%Md)WRfv?7~#C+WtQZRv|?I literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/utils/__pycache__/layer_utils.cpython-312.pyc b/model_executor/layers/quantization/utils/__pycache__/layer_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0a1b15076ce5e8dcb5f9290852827176cfdd68ef GIT binary patch literal 2026 zcmbtVZ)h837=Q0Bm)td%wo9pYvoxt`T~fM)DH01Z7-d4c*{Pi{_a)wP@7lEK<@Vlt zvnjXHsj!s^?NC_4gl-_{sGxqb;s?Kp;FqS^NIm=@=oh~Q+E4rB^ImgJiM4`0QBmNjAH`5)pI)Jllw4729ZrS+|hQ9VDHNBV-1~p+{>C`}aBUXVF>+{CzDV*n$+Y zZ6z<(S|dx2`RXltZ+$X@p1@Lx0Qki$kM_!pt<5WI(Ib{^-bU{yD%jf4( za|I3OGk6|nEm|O{yjsM>OkK59o#qx)ngctF=JMvQCgbu+qv&*72IPS=)U7F#(~Z2E z#UuphGYFb0ycW?`uzh)CX{02q%4K;o*uUXBvk^N}g?F&O?nk0@>-^30E2H+$a!X*RFGfV0V_9ZX^QoXW%iv 
zhu0$7@uOgPCALDVZM}=5e@M#h$vg5V7fP{@->FD_TS~|B*wPqG&e8JGP33vm*OW*_ ziBy&Tni8)l@v1WPMYfhaT}hsVGq&E_ z#Z7Wsa{F@6QqOW^DN+|)K{VE2hEiV*KN?=|{-Ni)p4!OS3jBu7?&^LMZN@%#>8=xs_jTS^Jd=9 zyf^QAZ+~iQ3nLhY_<{aS5TRe#raoGBiRu?XtRM|(f`R7Q>NA9#Z{C;l&-(?W`89FD zH!o@d63~JqNPLGtJ}YXWcg6Y8-%cesm9!RuZtSaH^mOI=;a&YONVaONB&W@Miip3SI+O6Lj|@gg>-6$du<&YK{yvka5hS|rp| zEK|=rwil$tacMpy0H;2DK=|RSz6a9^x`QZo>O6FVahpa1S8QUzcELA`8YJMMn@0mz zYUb7SSA`ilK$&qb;DY04YD>}!@m)IUwJ;)?9vn)p3KSj**(@adM>^r;tc0ao{f8nP$X^SyP{x{{ zAyXDr1CSENFs?!xbz);@k>I@~;st8D7Pj>waIz-F)3x;Dv}zL#n|Uq{?(0h&B$(CP zK$shxpbPdGruH1FbOB$V(+x7as9NMAK8vBCa4$~cL;~mCoRZcZ z`(!;3`TD?~mHt7j=QTZ}I`B*uq5YXfHJ>L2ZY-dhxa{Bg69x@HQYmW(^vE8yA1GO% ztO+(E?S&P_gHm{MXg4Um#1v(imQm#(l42iEtxVsaemMMX>Z??#bEq5|@}k`(se3!N z@80CyLP&V!96BQJ2`6Mxe0{!~uC5h-U$*3C!st#G?r}ME(YK+QS6Iv}qci$m)x?aLPKG7tmOhdw%%maM(f9bS(o8 zK#GUAiug6P*O12H-{Kl1Mf*URWy1atCWuH7s$nFw{f36OdyYSteb7=uM=L>5NUhC5 zltOA<+ZZdSW~z*?vR5-Nu$`5_t_f-&(8}SYMN4d(# pN<1obJ?YqA@v}AlDqpt);k#$4ICA1<{XDF?cL4*-|D`xibOKDPh> literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/utils/__pycache__/marlin_utils.cpython-312.pyc b/model_executor/layers/quantization/utils/__pycache__/marlin_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d6619670405a2cc129bbfd61cc59866c1d41039 GIT binary patch literal 20344 zcmeHud30RYdEa}pZCPx0U?Qt7D)1Ri1z>vFqnbf3`hbV z$dIDMMCzJg?3Pd@ACa~iftFfBj#INICpF#X#A=hKZ#c3sexX}9l}`@;=s6>S=}Fw4 zw!iPbS;2uMTmBP$F&- ztHruxD@`Q^AGHXjXzOAloKClyup>Y2jBfJIO8)wzDrM<}cJ$gQbiB$9cL|;O z>k3_X@16KDddXi0G_(sn?cB7F&F7;V?RIV2E^BPi@NU%Y7WU-FA&uwWPgrA%(1Sj9 z3%#GKk4MnQp3l+;aPC9hy}~}saF4JbVXyEg@H>FtLHstO(|+I=PEmX&W;AIN4x!IJ zffuVrHNxRd-sH$8`A0X&AJ`;+Gm3j`llsRt$v+OMmy$jwz+ERdsi#WMLB@Z1DSEU< zI0Zi4_mAxTN$RKG|JQ%_Dq#rkzW+D;?whUIZwBWQ;c?*n=nU^Zcb`ak$jvKO&&YHr z9Gvt7r$UtU1$<$TKR7lfN=RA;eF4!c9rA`mq?{wuk|YMgo+-aKJQ|cH-5SMmX2k3F zN@r)LM8y_juqTW}=sxLdR4`9@MuDkyK{J7#B+VFihKq3X 
zbzztr>LX|)d_?n35~Dd?R*5dBL?6*b^s{J0JxGONQ>uq?W{Ra!YCOiD;j(Jja!V;ThA+86v#Y7%^nkqjlKuWR(_eC_-{fGm8a6Plhv`TR$YRg?s`9183b9$pAnp z9TUTzhZaaNgo7SmAgnaJI4w#uo=LCd_XRwm>8Yup6cz=~i_=~Z7o#4DCPL9qqk6Yu zWYPH^#Y{lY`L3NkO2Mc<=neOH#O}#mqXEUnlHDG0=VaHY+b9vhODKL@Ct4UdA`;4f{}UI_&d@I+8EU675qn5*d{& zXao@Y5rTOxVc|@bve9*~di!d1f3&(kR()7DRjd^?$VIIShvd$KtDTQUJ0FX69#3dF zb5+8G`b#_S8f)W@!YgKKZl8bYUg55|t#I*x+_ZaLt0}TvF+4DEMcr|mbFo8i>`vw- z44kuU$*}Mx*|g>7#)1!>HLK2ssIx(KG|Hw%v_OWVG3sc1>-p8@zG!owY~Oc9`%`OG z+~)ZD;A%;Gw50uew$;w#(az&?@rkJI#FfKwN6{6_I$niSESdFGEMrn|ddd^>&0^{$ zF+42=Ms!pw`4BBLe*FLd_*MIv`BfLuCHYnNiTtX~@E~z+M4RSLeT0kXFY;1}pcQlx z?sZPkNAw86D+rAdEkaX7gV6j9>!=2t%D-%i@EI;aTL;=&Sbb|IA1!R(&{K<<0=He| zl(Pe~TTd~s7OI6C+To3Ub|S~L_EWcy-8qynY736>3BAE-NpJ?UC1FQrvaxb$_=Ap<(b|)&%UvrcR`#LsLB8Q$ z!=rNj0S4D5w3>q2gkDcg^{lC_Olrb=<**J-4P%`a0N1&Ungdxw34zd)jP^W~)tqh) zD(4J$i-+=Ola+rN6x~QLASk@&p_wn+Wa7Ip`UHNVEePhh2R6>J^~U&0>+Ml_;OUtC znU^1%KXRo$ZgtF`RNsMcPCaRLJ%F5t-yy}S>z@{nN>We)scs;<4pdZ(q9l1HLt`OAg;83P&@PISZ}eehCGAGN zY5YP2;C&5TY1{IN+m<^6v67Q7pTf&-`>E9tca+JF2H6Ce+HlWau~Z(j*UP5*gr2iG z*)%AX!RheSbeM@o&y*;6ro2+vM_NOv@C7z0v1CeeXn_)D z3l=NO%ozkB2|R749#Xn*=rc13L6@W-j@zUUJrjI}OudKt&_+~KsTvfi&FZBn3lp^$ zRov!aFvHDoZqtxNR@-dj(@aRZHk~yRa?LKEDp>{JMPJBw-Y>e&&$wKZur|UMB+)B) z0#xl(yX$Z;5b_D4-R&ilge^I&PvmP0F__zg)g{v?8sK{ zuCb}`i|w-wNz^iHle?ZYWK&+uvJjh~S`B$?W#lmGxV1{fV^5rY(vuXiryyp}9y@)s z|A=Qu+J%|gwD{yLowNrPB#;6p^-!=CL5NnF2w$c9n!I!$@`NsUL8SXp_a$od0|bz= z5Nh=wH1VThP>9f3Ln*c0GcoeGfHtLX<9^xNF=ZH}>5gk2W5A zzyEIIDYXZ=>$u!`B5vV|i%CmY3FK(1v)%gTl^Nwn&P%*dE0?IRqqc! 
z&}uM87S2}ocMl+}%0A|JeM8()vgp0;yXO1agzTvQv8n!lCA7%?Wr#HQEA~R}nt6}* z_Y1qU@9=xH@9Fnw6+N?tN7D0ydTD+22T0_~dI5gql&rtZhu|BIb(@o!^nJ)$kvoF*~WaYo$A8>bYwgynS?a-(Ym#AQji5*tRX+&=;@ojTcv}7B@zV8{@^b56pU- z0V{1ZePq?!%wP_^S>=iBRZDB=KLZ}5S;i4>eC9Nptk))UZ373Og<)eqOjE&}fvo~l zo#_kh1k#sz3FiS3ZP=V@0fU`?Cy8gK9&K|4{Y7rh7%`v?&Kn{|#5j+L=n!iV>#?ea zAw?JR`KOt$fvu?OdT^k3w_dSc@Cky54JvI~>ag<^rAblndBb8pWnJT)n)1(hu+2d| zPtueR2C(6U4p%JRk+AO~+YBn&N$+L1DT^?UV?5~u0;Mcz_l9!&QHs;~S^e14nM^2W zRmy0JkO|$ARHsLk>~%@G5EM69&moUWa;MSc4t}BQ28*)w(ZXC5t0b>`tGInzlfG zm>=Wgw(akZtTe2Q$J+a&w*EU$Mr{MIh4jU+v8$WqmR`B47j4%}kT+VhD{ia%Sc`bg z*6^_&F*d9Pu7p{yH!uQK%$K@gE{#lx0>GU~Fh|&sF~kfavIcQ%#e+ooaUO{ckDy7J zJQ2+-h#lmioefxM+8*;=6a%j0o=F`pxm9ybdczXDP$5?ukv5k%Ah^=Vw6Px44X-PF zL3Eu89(84tl8f0FLuxhJ1xrNRaJGRmXGLg$Gn980f#OVVcHm1;DKJEDmR{GIWbq^x zHhTm{tkN^68pkh0`zz*VsHl>4C2?!X)khZ|U8;{+Yvj6)sI^1Zb*PhTP&bk}(@C7l zrgI9w2Ky3Rn9+kYg^yBF+Ppci5G=mpvmO3$LU zQXq*hPW#}x@r+4c0jlv4%FQ*5`k(PU(SN#saPa7$qKAWK&2PJJx>p9?Z-$hbz%oXH@u8Md$gJBwMlOT@r8)Ep67gdHj?S@LBf4NzVaC>@z)xu(6yLyg44iDJh8c z4Ew~8QaU;95BogG0aK9~J`$V`gr$FihLV>$(Xw@y1nNl}nhfQ#KxN+L@dU(65~-{I zfnSL9F}7pC3Q`GazP|!7)zLjJ13o5u@4xcwAmU;+9_J1Wh2In4bW>Oo}iW6ume(6`o;aev4-f zLiDQ1nJmvHZC2AW=J5%ar7xn69u?QS0nBqmygds&tJd16wf3H=;Hq`Oy4e55xi_DE z?b+pvADH&s5n&A4wybkGa%bcp#jHx?aFH%jKxZb|^HcKh{LD^Cv`4k#)uafuX+ogr z*|g=nFRdVBPg?El9Z5c+nk7Kf+6~Q;PZqu8v1(JIcaA6D9PKS4Jh_KeUzJWJ3nPvj zAo{|J2Fl8dI-sF1AySGM<{m18f)~i{$Po`v9qNG!RZxlJRkSO_`-gS{0C7}qd9(Jl z+8fic${nkfozcq9@0EVP^1aGfWgiq&-0GAI8>7}Hd25es?U8joD#M{^`XS?lO%?WL zn5r>uE^Biv@TnI(!oAQ5fr8f@PqIh42LKtS%WiQMo8mEUMDqeP3R!8`L^Df_R{EOe zD>fa<=F(Og&wXX5j>gw!^_DKh_}kd{+I-^++9O=|Gp|D6g`CD7>-87%kV!5 z*jQSatRxd)QsI3HKduyJa$wR0#vn;)tc-kZ(K_1|*KyPu5KpZ>AwjM~&%Dwns&))raU z^7kK`x$2(3|JcAaoZ&-OV0FvUzt8eRm*Ws`QTUKzAfwkak&JVXh|;#Fj*(tw+227w z^rZdUIDBOXv+jsC>G5pJPl%B3X&%mak4MvFgqv*KZ)+kt^_I>Y7ijzp{iJ=T5A}ni zAX@V|+CArC3nKVTn{m=@8=lmgQV)Oq_)|76kskK4Nu@2lj=C-)rr!p@{N2snv8LToUH$wC z*;yYq7A_9lHC8RnE}yFfFPIm1E?!)0i<#YbLNkQuuZOJu1F|O 
zs)W>9p7(h}(wnHfgI4ls01yksCD->}+dDsauevo>-4@kV%=gR2iuCM%J%lr^SosdQ zs6A%t_%1K+IwE%-iJFei_rpND(si|Wp?A?QTN_uc?NMuc%-SjII@Ptwo`bZh_J1(y zW@{rbrHK0yY4*rFkK{B$FB>0$&HpB4T3VY?T4AD+4G(hjn0GMfl6cJN!`4)jjG%y) zDFZtj!gOJUJIo4svpR7g^GU`5wjI>d*K@d({4$c zp&(r({WD4uC!$gwu2!sp=}Fu~pzRcdsp^uH4x)bc)dtlKl-?`BU63LV)jpTrLW6E1 z5Hh#OIn04Xwq4t{RDZ2?{v@=()#nzTi^IF)y*_qrY{`3V{6_uT+iq@KuD{tTn|8z< z&a0OfF30VTt78je4~#myVSWJTf%f9XzMmA-UE%MQxR>jf`)@U^7;YQxSa4o;#c<`t z-!+rIaIIMj=T9;6Q8c*tW1=^r6Y-pLt~8_nCD6>a*ix*N)&-kL+FwoXLyu@^ha^>G z1VKK6!A)fCH`7MYoFT9mEp?x6J~)tsE_9NY#Ew zd>K^nHBR2o4Nq!TKcnU%6KBJd=^|tJn--%i7w)DMLuq} zUaeZFT0H$m@zU_?b#jS2W^Vm<*Yf!C{;0Wk{tz6cCg)<;D|7cstFAwH?YX=4y|L0q z7A$dd;o_6OTbs=NW=*WLeZlgfy}eli|G-f6VCvC(u7Tclg}>>+<$r zm35AxA5SY7HVZ7s%j3Fy(m*hT(|jBGR10+clQ7P$93V-R4%SJ z=G;L^UK<3D6E{oC6#Nba|B8ZtO@R}E+bYp?NZ+C0WeQd(c#nebQjjr<{*aPn88MCl zAO0JHbNNuw5wR(b>43m)ASM}6F{k)$>S3O&^j><=gA0qPDp{=g#~9%SV$DAT0M@i~ zw(`X<-Z-{Ax-uHG?w>!BVZW-nH+Q_Y<6ZrCthcOpy9Z+}r(#t@3yu^YW*2`KrTww0 zLko`07}Dyvx^H3MQqdb{Zq&tUyX1#3Oa@%baTW}{YJ zdTPlQvo^0H@cV0R?2Qy#GHrbkHdz;Il%w7*Rn23ZM4^r&TKZPw?9FnX`;7R0KgtIaK&5HBe@&>(W0$6CfUSl z*|uoewq-tA*1CLhwQYa2ZU1d=wC$j5>W@3BRvq(S+b3@9m^B9 z>u&{akKP%5|3a+f>Ac2Sj(?j}G5yv*MDXjlVN;?*!o{sH`cR*ImU_PZQ^q9+zY*?k zlLelXm=R>jy>TVo;_uO~BaO(?qoBHmV?5n1VsHq~M z(G*nwtyagb?=Bb#{d&aazL8iE(X>T|Z+xJl`Zf#qj!F(;>)?^whgXanLd?iFxQMB!*pL z;G$0o2B@j)qF3^H>2BOpgM+71_sGtjIqK;@`}nEjhdupAju7#6<`aKi`~)Jm^G;yok7I zs?G0gV0S6`8Exj&JD;;6WX5AyG z`m?8c&csa26@=4vA9h~p(<;Jd#(;g7j{{CjJPfG&D3Wex-DX9{PTa}bk$yx0t%^zp zX;~uukn)J{;c)zq1pg-r{+xmha)ellyw<5h!kW;9;Iv=x9>F*(y1E+ivAMK z9HeUM#8%Of3l5ejn09n(*Uscyj%07}N!T)yBo{aHlp)^uz*(I4YMiS5QU{T91)J2u z34Ik8F^%$c-Z5v-a)%V;b3A9=jxj~@*GTKg{Ju2grUhl@UEpC$GUJ>h<3$_KL%@*& zfxK!~uqC+-@YPhr5HSK5u;|7o=?64L%n?h(5wQvScMb0pskP>u5of;pKM{LsN`xF~d=qbR_>3RrnTy5Qz%Lq}hQgwky=DZrNzTu)ueM(qE&|9}=_v1pqMX z7OtrLdhNB^CF#cQxBG7PElW2K#0q!KpNKn4uUoHKmkuwTmYpuSX}{cfAnH6YfBeJJ zuDGS-YR5tc?j%`i;+Dpoq@(0&WFZo_7hj!Nn0R0{lo{rS9ymFB;c7vBw4fe`T5*#N 
znt+8REG&$h3UK8nZeoC=a@A26b=2M9qYm6>ZNW|Bcu{q{yg6R6EnZUjpt+d5m5El) zR>UqwxnnpF)itSRM%qfm+tCJ?ajEqAxzzTVxU?XPOKqRZrTJ@oUM{tN3YXd#ms&P( zsqIs^)E==i9!;vJIY-2imq!aSJbLRP9+iHI6&Zl`Q4nXbA|q*{dFdVnKSSVleijC5 zrs^36O7o7iMg>2oAj2-?waDfaf>mxQSGk3@cS&DDl39L%CchwNA<+V6srd)SEJg8r z%#!Am#v;ZkEqOVGoUf_pH_0jHPvjJRQlD9}IK})aoT889ukm>~#ri3nVrHCTBu-KP zm{v1?0;iD1vqnrgoMMaE@^XqL!zt2-SO}Hu3pSEHQxv}vos*%GL+3d}E``&(!6cFY z4eFPY8J#5ZvVQ9$ zBKujX+9}mPvN0 zS7KjMOYCrj_8N*+{T`Qne#{Onn2%L4c|33sh#n8yrzy_tuZqqdA3VzL)JlEmLHYs( zt<+aJ?Gp!{fB}2@=#g`Wk9v+A9q2zdcoz0;Fz9F3mefx`_3G!=T70|0KE;!|sE^$g z?5Cigf&mInBTx)y+2>d6y0U5qjZhhFz}RI+c3qI&d1H6Gq*tlnYZSar!J8DkMZq6Y z@W&MV2?c*j0o%4BhaOX&;tg+AlsT{ZEuaOmm?j>|F4{3(v1x zIKJSoINL9{il1?f?7!z{-1h&*?fx6CN3fC+NJz!KWC`=d$G9{ZOs4fOHSg&+17B7acRms-s zsJ>?1O8J94S5isCn~l7a^@tcfCiIAty%Ch@jZ)~%vTh})jN5;R|43`#8(3qCTec@y zdUr32*YxIfJ@OMpF5VKaYapC0Yg_OgBw~zESVH((FajXpN?#sP?*t9-HoPabLCo49 zPPQRPZ9`DL_oMtR1-$vjY=T2{JH(f6B;q&VRCW_3wf#E}OTNkUN1-z!WeEIp8pI_GlPE>K-yFb#JPy?5~ z2N&=K4#7J*S)8tkyhIt#AL4K55*)=VVKu&^TW9Hn$-wWAJBn!*_s8p-)Oc?ni<7gu zKT)LR4L6RyJ$Mty0$e%w{pa5UvH%k{Dk^=u=4Opr)cpOn_u7(08oqbw+?&t8_B`&N z^S#UB%F*u+zBhQM`A2O(Y?Hf(qV134mMGu*vBk){6D1bDAPzrXt=$Q zN_d#`1y^kgwuF{okPeHtx2y3Z`&pbG9CZ@yrF?0;u;{e0X4;;*V?0yyfEx3oUgCxW*L3iWZ7TT6dqL$i3C1q7{WmO465mu_f%E-KyQnM-aD|cj}4>*lJ;4Jhhl|i3U z8T2WYL7!3?^eN@!3ab-^1gYo~q@qtySq{?WlvP1U6JkI=iF7rkrcy{3w`L&?IE^&m vETk!wL7Gw)Ld8mujb^fdAUn;ZgCJ)PDut9)M5xekz*t4)K`ByfTlxP0%Ihjv literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/utils/__pycache__/marlin_utils_fp4.cpython-312.pyc b/model_executor/layers/quantization/utils/__pycache__/marlin_utils_fp4.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..68cbc841c32cf312d6c12cd2230795f2da4794cb GIT binary patch literal 16876 zcmeHuYit`?c4igD_fvf8Em4+anYJaFk}TV@t@cam9>3eK>B#PIC{;cs$YkYHE&5u}p<39IQ2IR#~*K^!dfkKOiSk^D){ zsbY~7O|`o{$skzlg>>t_ALpKX?sv|;{40}5Pk;+5e?M?(FG2i0Mr7yA2VSH|f>UPZd4c2kLp8)QG=ML3>rhGQIi-} z1A`nVB(0fHPzIWq>Xyou 
z`jl~tbki2;<}K36yBdO^EL-GO)3uZp{%u?2OSSEE9p$K8V!}Z=f210%r(BRfP&xxC zJhlJTG|EAd6H2P)N6OKrH>g(&^_r1o)&Zq9m0!4L=sSYI$QIY~1f zl~8l};;?Vzd|$8c!tja9$A^6@sHEKFnvp~ z=d)&LCKO|7-&nvO6`W;>(Q$u}jtY&@>ycQH^2MUG&(B6efpMRoqI?WZ#l~qEPW5<< zVdyaHn+f{ai3k&dG$YpaGd}jt3?1F5hJlZl5~&Iz`2S)zc#8y^=OludQj`{x!W1cU z+C{>nIJQ)^vwXwYurEM9}5Hn?43u%3#|H2bPpSv>6r=4bg_PB zl4iSZ$MzlAKfRCc2?oY`X6~@pBjKK#!Cqp0x9AV3Kq(*Rsj}6$VLV#742VVIcNLQarTZBlW-RyPREr|y=8*z`(?xDkbcDu)7tQ$6@n+}gh zVqumB5#bKgldyko%Ei0gN5?^4_{ZpkO0{3IxjdbwPREADj@BHv=GjmC?XmTF~T$ub^S*=ym@L zEvN!v);l1WCV~+^yWdCmhx#VMf_^*#QfM+3iA5igf=QM#qHvDlK-~;(C7~gFvqTEb zFp+UO8udx5#O#6+jqo4c1vL^x)>yanE^lnk7}|J4+cQJQn%11rHt^bpoTAF~*2^lD zPM@nLoZB*vcHYtc$>pDqJ{(Ou_9V{a)MDy(-myIy&vYK(I}bc_`_YF z?Mgd_R$HM`el$4dPhy$QLB4Y^?Kmi={;W0mZt~JHnY@(jTC%A@7tq7>`CSQxJ;Uf>5kb#McZ+k-8snY(CDzQ%vrKJXGYh+>l%I%d+_1?55LfLLGr8x&ji;}&+<{ezKeHs zuUJ>ORX@M)ZQlD1?|26}u4bt(d3eRkxApT4dwIh^N;x1(Ry^`%VyvLPMF%FYvv7Qh z-#lf^r7>Txperabf*OxdUpW7r7L=j}V?iBdVg^(pNjnkL49&)v@OY`%!2@38^ZzFE zIeg0bCpgsvc^68@^(=CW=r~;^@m<8|oQhLJn>fZY2dM=? 
zSLJm=m0H9({_hS+c69RlSM}n{6G~jDF1^7Cy3VgkoYY=^- z=$kktq?=h(S4203noXI55|-D^4$4$>D)6nG8hjgP0N>7Oe@y;IGH&FI6^~NAd<4Mm zbDm{Hp^tD4R9`YAkYRzG7Sxgs$n-+GK>7u8T+omCZus(Sr5=+sT z>G*jYL*m$~A~R8;dUA%nA*+j%bST8&%7P$3+y}CtH{_l0n3w}tYY@GI=p8~22Ui>l za|ENwy()qwUh%qWz9 zAO54AutO3=u92|SEH*7PEiuWxKR@{J;4<^@t+cf-aWY%&Tr?~gmX0r7NL9O&Bgq4N z_3p&?o;$r+z2jcbT+dQ#THlydFK>V9{GW|~)cE=3zq$GsS6@~sT~QsR>7YPJxjD)K_XTp$~usU-V z3{?{rd(Mg>8)3HP>=<$o_QsqOLN8rVH)GxTg>`4H2I9F|qPqUxM{^%7-FRkc&DCMy zdZOBuYrs$=hMF+cOw?}AwP46i)a}T%LMSnk+fEp5_lD+%mON=gdrH}!7i*#gVZ2nZ z;K3k@oeyEdg3SY>lKW<)ND?K&1FUxi(E~>o14mgH4z9QYghPen4iB+Pq9m@YEKK4m zNUwmwDv1-WiUWlnQS5{Qk5%bl-=cm&aL988p{QmH$~~u+w>U@m3PO%WO-poBsBV=x zO3fA&-zmgC5OHC;{;lS|N#dAf zZeqrPCU^*yxgUPNa(ZQ{F+mok1&hb2=qk5RprG*$C}=FBU|~1kAw1@@|Mb6pD-vVe zZ=b*HrXpZi3P;d%1!k&n-dZIYanX7N#-x7tcq9ZTsAvfm&0LsPYC)3hRbcpW12_O1 z*7rpm+Z73eaZ9cbCv+4n$&%ppJbf2_`i#@&ksTqTLc&FaU8k+~7(2GNPtO3K57fA|yd9`$)x{ z#1iMw!vzuHg`f?`LI9fB=m;8}7%-OwVEjI`(LGWMWVEEHev*mAW_*aqML^M(e$Ua|h>de4_ff?xAjF`!mCV#EGogelIu|%$Qqva|`4c z9vaf--5GN?Z|=^R5Afy#Y4gFv+s~b~8Rssr{AKO+S(7zusmDL7?WI92qDwR3s9W5> zus>t(;O!mB@ak~N-jTMSgsxBLw2&!fWbB>1y)$F)-Lt#w-(+?9!%SN5~tVfJFrf|715DLcuhsOf z97@+5nKiC8ZF}(E{r56WJ$zHoit+Q|R8vp7>B6k)nYMntt~pcJ#n*LZ>h|z;dsZ6K zb%$roYjuN9hthQ?A=J|T;P(C7nHDeK;$7)ZoxhxF@upj@%&NZBHa`DWT|C#AXJhjd z&ooU*QR7c-+n=>K?%kcco3XU;mX_qHO z_iqGVwUqV*F;Bh^I(=Nn7JwH=a5}l?xPFp|8$f^8-6R+-r;_fTIAz=vH-?n}p;QUOz>;#Y2| z5(u6`3v8Jy0B8$afV-7|EN1}-ZQ_iS3gEA75vk@3Q$1x3LX@kPM?n%F2b4sKqOqq! zX~PNPo^>2<2geofgUS-Oa@MH=KrNOB1ASiITTp!LR6^;v71Hv&u?d`_@uAF3?JG-w zD5s~4PYUo0S}1}`SbG3}dG&?_?)ccMBq+1UC(NsSLRs!Ydxf#sL=bO#&FbN4Tjt~$ z#%-K!>Tp>mL^(}K36g2v%&k;)=_p=P-~Melnyt*EfMd;=kJ59?n+e zH{c%y@Z3IjSf<5C_5zLUZ_vMeQ~&m|{zbmw?8W|}?-=mA4dl_Eh}@pv@hF3fQZY(? 
zT)mkCall(HiMQf4mDim`P$xRBMy8AW1x{%D4RM0&4OYUnX(e2n zSAugD*Wly-p(O5rlp{q+cdKH^V?WEh4|EW0x4eCMW2|I>_4P8SoG`zQ9v&sq4Z9CQ z9)+MU76+q$uWZJxf@DdP`2mz0VaCMLOdLLf;#RL<+SG+$*%XByDY3ku#n#390u8i{ z0Z)5TV`1Qg1MrUO58S-a3>y^;*8>zqhea$ew19pAW@4HO_~GG{cnz^hn_)hH`jUb& zg`v$_3NwvqLF_^UFOpC*Vo;+IR1CgwkghC4SSE~KL2+RsSV#?)S~^S#RpWkE(8a>h z8!?&&t!oC$-oO%?Xui=(3XG_>hzg9Tw$zGh3j^l`A!#fm&5U3KceONw7=}dUL>UJ4 zNVG-^29^#-L9+-3qHNI?Esz1`CKkU1ol&DrY(J?3Xay4(bn|uj(?O<5+hCptja7 zIg`q?wPRMDb=A%qvWDvU6LW{NP0dN`{maRt^CNR3>kuL5M`lOXEzWz~9G4`M1Iq&| z?>;)1YT5UNWnW6WFJ;;H-0GUY@t%Z`M232SqK1j&zh_dwdZZ&v+FOHJ0< zuy}Rh>L<=j>t4QfZ`wI9JCe1E-7g(XTRWkU!M@f2s>yPD%6p98c`V&y-+ZPW_9eiY0DZN`AFK*ma%N-E!&eH0Q=7>MUuht<+-T7dFjUe&e>Dz@+_aV^sXFCx4iYG z@kmaCbJY{o&U<&}?!dd6x}nc@Ex)_e`e4`nT}l5>yH|GcbwjIU+Hws0cHFx?cROQg zFxP`Hx@!7D+T7CI9j3WBKoHqW-O@4L0$g?WjH1 zr~k{Q+H?E$f4!~t++qEA9!NtbL4bNGDVjGyy22YAcu+8XqT~)jEVS9X2KD0#@kXSy zrFrN^lyJ%V~$?h5n3+7+k(BXX42DZUtzlwjyt zLm9-Ul%O8x$ESN`e0VESVg0}5_^|eyj!$2*Muppl;;f)$WSufsR5pP!-0qbLp*cBWBo{mt41AYL$eR5V{`D$S~7-Q2~|$RU}p zrnbo`#qU%{F)W83=L01xRY9qw$dXY%AEmX+P@|0eTQFL}=0GFG6Rg3{y{IErj6=NTf5h-UR@V$U1vt;Mi-rQ!1mhGJ5$@JfC!s5jCpCx)I7(Xt4O zqbLA3V_ymSHsP%0*CP3v(z7poJk5Lp)%F0gP@UY12T)G{~CoxG{@i7K<>Aiv{a+H@!}0w4?Dhh(+7 z*}i!f^x(bUkgD!@a{1G%kFKt|{=Df=n*e3m)QOV-I_uk$Q)u&^S{>vE&ZI6}cLb?e66(pI&=(EnPeKl;Xvlbz5VSTr8nDGoBN? 
z=fvu}Y0uj!?K_{h!V_eh`hTnSKcr>{<_FU1`kabTS>S3xXNQ{s@%7902fOd@PU}0r zsv>ne*7emH{We~|EjhS+Bdzyl^!>cPf91*_Ui<7?T7T>-l|px%%vC`fFX3%VoqV~` zgqIGVi0^LT;vr-A@^){=eu%dpO52Cl;DQC$)C#ZqJP$o9BhO3+vlb`*6>mb|J@?M6 zzWUydxgGa*&Fz|>SPU)%)B3jMk*5QnPr<_A3v9TXaqd|$!PDu)ndkQYm1}AHvBc?h zgA=cKmYSEFo*Gkz6Dj2h$@~vIS8xSXHULuXUk8;HZ=x%`Z-IBwDC4R)QhJBND&?_M zdiAmeeo+==FygWbQnsYk0_u|cDWVO)T6tLxhz-SInR^9m{d!|nbMP}65o@U>OjClj z0EfM19^Ztuur~glpv(dFDquqS6ahR1vj+sI2Gj-mH{!kg_~81Y3=0Ad`!&Xgwcm7n zx{@`jfE=Oa!keSw^h>ao=9}|3VXX?Bv*W-9K33YZEolTy~cM9idzmq0RR zGO&dUBcP=F)z-f|_41g>RKQH7xXdWiK)gz;&>x^@W2HVz=J*W~WTFaCm03bn8x+iD z0}Y#u2%{D;gd8gZDrN~o_tC2as9Yt0%8iA7g5FQjLwv=Era=T$629uiPyt0T{~4o~ z&_k2sYj6`<6yf=U_%LHj$n*f)Nuu`xky^=Ut z@#qB`!0TOboip6_4&|<@AFBQ9eA8e51Vioi@y^Q)R#olzH~vM9E(JP;v;bJf_pfx0`S;Sg=7jodHAxP9t=&e}eSMNNg5Rts z_vLmHtsU9MzOU>%$PO@~BRI`fL3kaW5}lqqol`*=bPRj#;_ijrISs~uBkC7VFPu*H z^N!A(4wFQaCYacB28A6SeasqU z4qu`vXOp6fUYmE$eZS5<_k8DG{mN=JQQ#t)ABF4#6!otdC^<)wc>M1)MJ-b-#nKUK zT=~*b)wr5QS{2bmwc}bst0TIoeq0|lj2oiHabwgpZi<@6%_L0|u|%!oRzhndwy1sF zPE#t1({c^Bp?-AZ46FM-HQvbT!FO=_Nt!iqRCv3RTJ5vOP2x?P#G5yXw{R}j3P0N> z>GZ6gqF6iE#5UB=G1V7J8c<{+M)`4}h^rVh;Liv{eMR^yL zZzbiM>i1D;t9}}^?}qYiY%|Px`&HW4@(4S7+DFUgP%I<_BJrsyj`wM0_nFs@1^h4X z9SppB?C6;z#{x%>`Nz(jdR_KJgM1_u3q*q#xWMH&e<3j)oZtd6?y~GCCQWmEbVlF; z=R(1R?5c@OOavobLT*c(kIzKdz)XS*1ci7sG!Y20Y=Gz3nF$WM(=ai^^IS{_Oh$}y;}(KQ=G8-%FKahhA$l8 z`Xixp{nJ;4^YK{!rAQ>&AC0qIB*0zaCT4^<-yaEH<@iMZ#hGAC2+anC5agQ?LXkv& zadt=`FgZQkH+@xhhZ0D_6qA`w%uG+md4Xej20GCMzXWd9IqJT-HD?}>%mb@ivgZ96 z&3@hv@e}32LqG#RGBN)F1ng=$NsCk%=vpU~Cn}6(YTO0bmAsl&B~`J`uPv{uDGWhU zUd5`DDpqq%#cF3k4$T^1uW3ZO62jP|TEPAi7ls_QPNYhU$7*J^kY6KeL^ak06eW^4 zSXSNXM>vc}kb`=J)jY!SJ;JF0=GL;vLb4W?X5zf8IUkHn%7$~3k$6xT7?O?g7?%h{ zLKis5sw?uV1s8<7G7V`lLAERFUx~1mk`$(=6A5hDi;iyqO76z;lfdn4xe&=32*reW zKxu^E0y$gYm$(An9F=!C=T7DI)*p)Rihn+{Ji9czuJ6jXv@Y25M*H=)g|_!yA9UR4 zSU2`Uf6-Jvj~1L)p->j^~6KJ~mOkqDT|o`~@t_G6j59&AE_>jzxYg^QID~#Cza# zyO@$RY?2Ggw*Xg6(zB2|X%ui(h!YKAkoC2WRh764xJnCkR`<7<8DfD?Yuz37?PA?Y 
zlVB~ShHWLP@@1VN(n%|CTwvDFU}0z&vS|MLEP=+!{ds| z8su6YR3J*}sX9KLkQ=6^g^MMjJHA1rpyQdr324mnhwISzY-P*5Ghgg6k*{GPESX!R-XZI;&MTrKMvV( z#ED+mDsxn!g=%y!w=cD)`1Jl;hi)EP;ct#+nZ0u-@(r$K^O8ArB=u^h!JGD{pOG4N z&As^0HIO$sulFzXr@FGHwzO`g=dSCQZGY4D$(etC`yb!_O08~B_0cA&D4p4v8?tDB5aG?zx0G(w)b zqZG}vAgwlx^_5NQQ&sAV9mbsjEAUNsh7s0KP1eu?rR(*nYGttBU>%@-0P;*!{Q%?= zDQOg*DXo}j#C;^1?pSXZrDf7894e<;%iL=H7Q4$@v%AVNNWjCEIFPlr(yZ_taQ)#@ zO6B|RTlM4X%g{->vhu=Msh0XRlt*X#7NfJ(jIJ!tu2T5VXFvVBfB9~FhW8$S`HYv1 za|v%OE_lxeFLB-&Ku!1*5u_JzwAcUguy-OJ1?7XlQ17`Jfy7l~Q*S8YJ;#M&Q(got zecm?-2-qEuMXr|1gL0Dq80)%6MZsMgdgW3@reroqP*eR z{gH~OZ;B#^i~f<8-v=N5AbO}M5yeb4#Ac$PcnS&sN3?9^xr;L)=zah@L3B8|p&H%* zB)i0;nC0EzeWc;>h^VT~G{whfrj=fx?DloZI>l}RFe%8#VDy1F3`zDVN*>7>fUyRq zgS-$z@GmzLcvUY&4|av1An=NYIf_jaJrz$3Mc2fwC2P6pv~X2c1ALSju)u&y0{|1C zBt+~Tl*c1WK}FC8($9c!6+wiysTzJyfPyyAugNZ zd}t~_urmKD)?k2TI(McZzh#>wCjunq@^niv4CECK<38j9iI6E@J(=BTf9l-8WnxFwCd zpi&$6ex=rGbp;#cXwEebNR0z?C-aT&T;m?8anIbz`^}wcZKn6&-J{v&F@V;e>m7N! z^SZbo=ImXPy(@iT^-yNp=(>G0V;IfYM;|;j(b{^onlD<37H079XiRsQUJrdKCp2_tdka`bfnc)Rp-q`wn zBEA2ovy#zA^7TqgZ;sg^F*{cF=XM^Fb{@(yBlEhvg~?d9JaBK%xc4kIW`>S`LQ6v@ z=1yQso{ZiLmWu163!_57+Rba(-r>6^*L#oW9UbYWTb(yMvt5IdV{kRN?l_RK9LP8h z)HY38@tnEF#GfRJmbzWK8f+I^sWwMnf>#+2aNkK>C{N-B>e0W zvwdFo(CN#Y8d9w*Z?BC5LUZZnUV_DkFI9?a3*(UxJ3 zp4AC&8aqmzeGB?(70QYTD0fyom8t-ljQ}!R z*sGMT_QT!PA%!l1fuv19qY821o)T60iZ*~`Fc#K$$5aMncF`2ZL#D=w_LAB~Q}2yy zs48e$2Uv-wqtx|=3Dp!ep*jop^<;zCP!9&Nno_Mxs`p$P$enC}xZ*AV6JrJ7)&XT< zJlAR*sLtj)RUk$v0~j!98^G*VG!W3hCD@{<1(0Qhb=R^sqGmI!9XLpJ7S2L2gc{fe zGzMF z4L?mfIX01pAhSRo9xL@obQI+wNGg%J<0wPkrcESOQ)w@Or0V?xNh*hc)@0&DN2z_- z%immnIX1{I$42?3NF!*WBE?kiy{VyCC!8aHj}l8jJfQRg;40+nfqI;j(fxExHqnj=&r2Gd!J~) zoG_uH$(eS!E(I4qJWhoI(%_h+$Q2O(5>P&sY`Q$Kw{+KIygXPU)ez02IhaR~&YuR) z&%cah8D_%;yZ{lh>hhp$EtV@@3Nagkh|c@?*B~#nXTbKzjiVk~L&Dd?pTUaXz{<>s z`{8DmU)?Yh&l9$BI-YfQ4{E~ zBp#j-5D?8_)FgUSBp!N;Z!9W!n?H|<1ZI1xU^dP1Xy4`g(W^n$ni#--VrP)e0vAgF z=#7LDLgg|`rbGM%EEoY#rmrYy8>|4A1EeS_hkO*h7Ql{2p)}Z0>ggq*o;T;s4U0z? 
zp3S!d+PrZlJ+|my@IQb6z389!e-7B&;aMJ88cC0(FRfh4bPugNhBB6+jAQ7*W(g0O zrp1eY(L6tPzo~acw{r0#ACsipE)}7EnOe@ZuoL7eNszbwq?(v{$aZ}*S=S3 z-*gm>T=w#DglhKHK=+3Y?em(v*}m8>nY%uGZN>HTamn1DcekeEl6&|33-?`Z zsWa&pvaa1Z*B;5WXVsM(7?TD7&Oblzzi)9Y9!m{>YS{|5iHx&Xvi9EDyE1e8pk&<- zRb)B_B=^AYSI0fe)(2m^OaS*S?HTjdyxoO&kXgGoJ+^M&2GiH#{h;+mYxPcw2Jlk_Xav!)mul~2BY98f2c0)Mvz{F*LCMp-1u!@InMTyb{cM_CO85MzqwlWBz!>h%vQ# zG3$C}-v7YWnmUqpr+d=w)G5j3%enSRu6DPv#!bO^5Iv!sEe>-eMd(+}fioIv= z&Rd+tK-S)!v-e2$iWSN}Jg*_^56vi>D&NwPx_F~^{^SFTTiF`-ItNz|WjjYdwLAxA z4j2Pia9r1~E?oT>j4Fpe-nsHhs_TQDH+H6jKkHisi_76PI%|I(d*Hl&W#LNEuD5lS z%hJ3?{OPld^`hzTKDyjx)P-eU_3Z#99wn40k#E8eVa=R zE}oNk1!=A+4#C^SD3We{GQFyQwkK0d`W0NI(|n7ls%P}K}^X64z z@h0~dia|Ny72;mBiR0ZvG$?qtqe+}&y~Kp>EtP6JtZs zsxu!?XnZ>bHh7uwjVtj9nBmHAN9x^t zi$(#jgNO@vl&H#=RSA{V_g3Odl&FFCAVq#9??do)N0GAuXm*$LY3ucz+=SD2jAgTc zxnlH$w*Vjp(1AhMP`9)RO}QZ0Os5t2gnKTu2BpY-rJrE23eZxZl7j#h1bzSd9)va}@HT6dN)s>bkvoFUCOU!VVIhbP(OU&WBe3p5B zZ9rmfWm=-ieLuWrjahjY$@lJj8Bc}#L1%Q}z4EmGwU z(0dps?UwJRZ`FU#dI+>({9m~gY@l2_;9BbX_J!@KA$UeuzM3*ruBINq zQ8ltKlImCig?cEX86pRakAB_P!6UQD4ft5TFEBBafIk5U;3G_PRY1A6Hj_Nc1v`&> zPCiItHa@ZMQ=ZuORX=0sBd_ZFiX1`$yap=ZaiSxy z*@5yT)DR%}HxN*44GdPd1uN0LPBb-Q8i1$J@pyzh*(5+;*1Z&GXCfTAQRaVyWylUg zP9=|3$>b<_*f0X4gS?0NpF=7Ngv5*B7t}OOKQ>Xc_1~zD-%#EErP9%=|D@2fQS`3g zP`=NN&c(N~#*R7NmpYmr`qE&gn+m(A_I+P4z4SJKr??&kHP8>>zWCI_se%@0&=MTY z%e$6#74#T_N1ZObq%IgSWTLDM1v63>q^wBUD5z=I!mgB4GPM`%7}bEUKCzKPBc(I{ z@SS(xdF;T@8#FaI^o9DnG<}4|_JCV|MfvsRvr)V=c*503$ILxkl678%mG} wEPw>bk3_+*q4o`bp|;U4()jGvvScYJfn9qP;(^vg`lH1xkGFKt2gp?a4?9sNMgRZ+ literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test.cpython-312.pyc b/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e1d8ee92e43c6e56346d8f93be719f5ce27d870 GIT binary patch literal 7010 zcmbtZe{2)koqyw*@sAnXW9K&sA%=997z(*%8z2Ezg}`7L~G-n{pHzrUZ)`+lGQ$>ZS=gaq@$*x6=;euqD-;I3xY{}VC`NJ1eb 
zQIah|g={3HLo}pxf=RL=mO>IEu?c(95ppD*A!pJRa*;B7f=lutKIsm*lb(<#=?!@) zWJBLVlH(?loDXc~C?P?1L@CL2lM4AH?j{QPC0=fd+9dZ)Ce-XgQj6q~>3E+txq2#j zWmXd8CdntaO8zgAu}wP=igHrZO*Yi_RXs)RQZtO&K4lBGPWO*$u|!Op3PdMT5iOQZ zseuVqmIBeV63}E-Qv>0Y6u2y>B3F`OWn5jwGdmTe4DM_soCquDrZVy>1!Y0n@O|s_ zxijLo55IHr_$l$+@sr1lR%nxmrNoS^BquakRH2Qm8co&WX<12&nRG0rsX@jN#$-** zgd^i(G#t^=iqSfj(XNVDC&DQ$HZ6CLcETiF`Nr*R&SuN8Ut*Sc+#bL~iL83om9#nhK zkyz_t3I*9y3T`?*ke<-Qs2tWN6j==%3f#yBVrn3kl493lYV2}C4qToJB;k0pD~cSJ z#1xS+n8<__zHc*FNrS^R?2>#f7LkLjVF#s1hBbqQ!--rmSbTh{LBsJG4mF~Lwa68a zVpKU1HQbG38}7zY;43@{2d#t?2|0n4VN$$qZNI3(QLA|0133J5v|meR`ZKZ2E-kEp zg6x`{*f+R;e4pH(h+Xc_Oleorss3w;M6y4bmgIyePs))Am|uS)JS8h?e^pn)Shf2n zK-bm&s`8N!5$pKw%#`62#aJq)iQ;s}28Gyd)wIB*)DT2Dv?6rn6J=pXG5k>IUqg(G zTX7FByGP3Ikrm(Od}5updA#d1l$W61je2$#6Xh*KONny#NXgmxMf>K38}m0xy#s$g z`su|_E`E0H^Y(XVm>;>2%b#34~v`su%8Z;|$KCpuGxV z7lwm44#1pteLVo`D?|1^L-4(dL9Srnui`1M{x1Yj9ag;(2G}|P^Jb<>z6P6`NXiL7 zD1BADZrIJ9t7*z$u$aN%A?olWdS$d#5o57c2q*py0i?2WJQKZH2Z{ zN8dxfuf+6~_`Vh1KX-KYQ2y{kzPrR6DQqh4E%q%2myRy&E%h9!Fh@#!_a7s(W=Al< z&e@%F*9tB3e_qgv?=EWh-(NacYCBTlhI7L!oQG5=DqL^<%M`x0$lS}8+6F4z-rVrx zs-Tkp#s@=phVC9Lwe?lF;Lo=f&lR_qxjoC=P?;NAdi~SEPX;U8F=)Vhb0Y*p&%g&O z1m5}ePB@7Lq*WCY6qei+I`g2)%2`zBdY!HvH46${m4IGqA!~>7mnd7@RTHgcEJcj- zwW3-^vn(AH_tpw)nc8fShWUYnxX;SfQFoEaK+|d6iJ?3V=P?aLops#W#2^jYm?7R_ z(CtlX2t|FfohOa{2(6`2znGu(hA;JgN|e- z2t?sBvWHc2_1aPy!#)A`>+5?IyeUc$OR`EhH6|MY-q(2LBpn{gDzzk z7T=Adek|t%5%F?NQh!DPvxVk%ZNat-6ibs3@@`hp7-nhVilbWy5s2Xezt?WlQMp|8@kGk0R8sdZt;{EplSjKA%(+kY6m z-9Ou3;kuT&ZDnrT-|YEt@E+i8S8n(V!GC*lb~4}m!KOQ#3a{PSTu?upx;M4xzo#$W zC~rSf>KOj#J^wQJkAt5Jqq(;pd79=9{KnHWL#;S{bHhKO@~`FJ$dBjWD0KwOEjx-= zmi+e<<(8qcb7=2a8LN9?eCl)J1c>~iZHY+94ZTkmc}c> zNbar2d~@z?^Y%Oh3yjnt4jFxCglVaNiC9 zCa1HIt>7-tyikuknoSl4Z51Xdfdy$3r4kabD`TWgGY z+buT(by;V_Xh9h5$1qxr^x_y78-iZj(X0zdfLJ6-$IIJ{A<6Z#dOI$)4_Z2~E%QgT z9*HO@OYBc~7tM~#6+K53x2ugVs3QVg4-vX=nBv!E> z6?B?Hr2ZH##pa{g1{4CVf)=4we1ys?5KZ@3loLo!s2aGCfo}xBsoekdz*t%XUqpdZ zKtZ=M2njG=CL^aL!!A#1kShCdAsbed@RZ>?HW`sK*nn5?Rwz4g9i5OZ8kbX|F*Sj 
z=GbGOf9_1)KR-Ipuhg^o@Vrp_%%7RR_}ph}HOzF+(HKk9xLcRqo6GLa9}iTz2R|Mt zhV$VKI(^_-47|o4@L)bc|HICrZu} zk9pzt!P$fProTR0=)5bII(Aj~-9Nv&xTBaY^M{uC;W9s5;g98xKH`1Ld{>$8${(%p zTXRQOcp-PJ_IrKRK>at|(>5JEE^?+L*1mRn+Fi7{CCKsj`ld&+5 z1-+*6WI9yD62jJvX+^PeGNVl?m_Ug!reFpj09P==6+(2FrcpT#!NXFo*<`(wY zoYlN)IKj88)0>KqtAVeqtZA?bRu9!@G$jGje+EC5CcL%gMQ!a1=jYFV=Tz>^nKxI2 zP0K=WS?JBZ1rC?9Y3`kej`kJcFK&fz&9^<|drC}CiSGduYWu+zr)y?tZn`j7jKPb? zc6*uW%^jP0eU6$N$OoUgQOl-<1M>&+S3hEZ;kxfy+WKkVCw-qc59dz)tHbxGw||jd z9If;oC_4i)^vvko$wp)Gk-w)v7v2RQ@50iJivL7zjuiqaZYCcss^WlYi7b1G-KI;FXx~g^kZAxQq}?gHK5O zMe?Fo^#?COnzd&gm@XP9g?iCTpap^~r`0?|fGiLX5Lk^w@WRXhCP$T!vV7gB0qEiu zbuhc@{ve@&FX^oYeh&}@ue){67?btZiKl^Lyw*&uv1HKo44{x6};GctMYQFJ}nI{~eU^+F?9mcsj#EJ3pEY1lZ1brL9UW2&7VKax^ zL8=nL-4V}Af}CNS&L~%KQ4B|MT;!^v5o*S9;bDkW%ug0*t<(f>bNKpIaU*~g5dS`I z_dgJU;nCVwhwn=#JaD``aQw3a7619?Vfx6ot+1ysy|{O2 zX!*d~vkbVL-D0oW2^JetJr!!N_T@xKmX zg_sKp#x;I`CVxwiCtwAG!K8S6{swOv7iOIk*O7~YC=Iz6uwbrGU9nxd#Dc0}#{9cub5a{oKp z{vT+xj7I-pV=3D@g6I!Ejyg|0K{#IPL&Q(Bzu)4cTGn3eqI~(tI)Y^FO$xob z;|c#Ys%6FD{=WVleT|0vV_)YQ3n||GmNf^?Igzhr&4qIuxWu>hSv@b6g>CD&>Dno( blWNHid~op2!8HWg#Slq8JNp!Wkrn+PS}hcj literal 0 HcmV?d00001 diff --git a/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_24.cpython-312.pyc b/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_24.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f05872fd9f33a502dba6b870c3b17808e81c11ad GIT binary patch literal 19041 zcmeHvdu$s=nrAoPWb+|X5+zX&Q??$aY)h8>kS*Jp$ZuP=e0Cm_IJP;onzBVvq|~JR zpqmMu$zY@;7*mtrL|rl$dNx4lv3H~F1FK*Uh_jf(1~}l*Z8}R`j~6FkvcUO2C-y%x z|J;3FHJhYl%E`{%-r|5OTh-N7Up>3(tMB{OSNty)i-`gk)VwqDlYJERdwfwGNr9Kg zPr#d{Sn4#z(yS^-pQg!Eby`K9>eFg?s)L#_?P)Ddu^Ltz)Q#y+>q%G_G>jQf8zHQJ zi#ly$4d64Z5qvXi0^hx*8CQI+QwS^+F=!IeM@t?g0(@4ovrXYAjS?cm8`?B z4y;!iSbVaTeg|6xf7S5kEKf6RVr!s`YPPm~I$P)0V*X(r>jFCGRh6e9*7Z_!Bsdbi z>K>j94Mj)7p@@4j;%D8%Va^@(N1_q8FT}df`$Iz)$9&vqFQCbMA;qP%&g&!e` 
zWf2#E_mz&Nd5TxxQQgKI;8^(gGZjk>tLoq_fMu<4teU59=SzxfzPEI`wDfZ2@f52r zl&57iaUHJ;=vGt;udH?yJ|m}Mbs?J7$Mw91*TtY6;6p2Um_jkg1wOpRfF5EDB*utT zatyD}$HLd>%bac<4s20zCtI zB=vuS^XMgQG|UZMlypAM7rNk=bewxjcIBN^C8)XRl0N8}J8(qG`hljGhiQa*51GDO4U94<4- zbUqx7hR1^bVF*J_elF57hxca4vXZ;JZ33;yV) zE0euDcZ~M>yMiO8_yfs-KH=<=3_kSx46- z@RCSZo(qr{FY*v?&(_ZIt5Sy-IL6Rq&=>W4$pCQvFvt42k_DuJ^s*Tg<=hr3mpN`^iA(hb&IBkZ`HK1aVa0sFPa*^)vAn5kik-&yeyiV(#;~XE<=kZPiC)Z>ikx( zHg-KS6jHlIriG+_Yt$G!9;qp<JCZoOSXrO)&FoHhiz+WG!l4GVVB((Q-bEyWB4Z2P=QCoiyr<6|JK0louzmxP@0P5;BLzg*j`6_ zfh#sf^ZXk)m+IL$J^+6EE48v(-vs_2DBV5s<(@&FK*zDHGQ2phj^e^4t{i^@UPA@U z3Pthx>|BJu+e>Jy#hgkgt_elIw`}ArD_lvbjulDpT9R7z%+xY18rEE*xYAZTpDqDf zU03pjRv8`mGTLQdMiZ!gx-5CeV)cY;YhlMQT%tLqq;zA+x7C#NtW4XK9DsKKsKeL($6Q_}wL zn?+^57U@F|NC-NZC-r5<&%vt{$w85NK$_6YZ3`feT;W)U(xNl4QsRcVF>Z=8p{*?a zFRH`Ln}0|hS7@^K3~>{A8ewOK_U6YBSEYsQ;=mK)R^DKvV28)BO%B_LL=vyyZE-ta z5qI!bz9I%3g=f%@SMoOA2s9Y3;_bX6ux3SP;k9JWxSvi{#)7Sk0nZ^NRr8g6)&G%_ zoZq{)SNr1OH}rvZPq&tJ@YMm_`&PK(H!aAXHT-HNIDcRXwb6nk7-(BjA-wW6d@ZOY zjyqLKo#o~yPxGK8;p$eX;&q|kH_U%p@0OSJL+j(Kc_-vlzORdSmCFfZYvHxSg}tQQ z4B!i^IbILEN>4b>9==}DGl;gq469MwSP@$(#{#Fzw_`;*oln=Tl3txpcPi=ehR}6r zgO0R;Us2lsr8canRiAI8NonJX^qPEn?JDVY`E=JR>Gk>a1|>b-@J2(xQ)X_`yalEF zVOm+=@y6%|g%0MV1+)qE?YyKAZ{iyRo0QbzlV$iOzM=eFQOZ?#E=}=uP)ZpNjkPIr ze^>?jMB^aGKqWEHfh*(wp(rTRJ)78(F@GomGO{n|W+CJUyFjB$0j3DKnC3G$fc3U$oZh( zeg3Mudpn1c)N09FdSYvfC4+`CIR=QsicYy>&hmwGC@K-{43srBjOjyIYt%!m@FC0w z>LIe*hbRF)L{a1+%BBw+!1L5_I6EZssYyU0#{363F3d^V$VFdX>*DN?%%LnKX^Ac- zX+f3R(j#du`hvrf;rwtg?2C49m$W26QlYC@9|zhgCFv$Z=SM;;S51n5yxX@*x(mMO zML#DQCqt2mNxvV|L2V@J8yc02A^&Bse{4K@m1sJWLD7FaYoLCd9~^c-qDP5N$hq*v z3Az@0ek2<4LZf933O}l(AB~LrxQJiUlbqhIl4@)-{B^@$5BzOW5_&k~Tawm)Wt`hA zKf2`y^b-o+O#E)*%Q0J^qwJLk5aXj;(#th%Rs!9UE?>8fXmrqLS+$ik_yj}<72Uj^ z=Vv7|dCx0(5!}p6mC-U`t_54{kt^{={9_~DNR*o#icWHV);r9F$Gmc-zpge_~?M;<#~U`b^^BH+JW&YsLk-6BKvt`nC7{vw@kwJ-z!u zwQKgZnb)RGi$?2>`sw-)t3PthxjugBlXExE{q;+K``Tx({kHWVHhj53s2voIg9Yku ztS+IkGc)<=wOiLRW5TWhq5fFbdR#CZUuz$uN1Z(6Dp2F=PIqx 
zIg1rLXZZnSI?7<2>K7{7M01;{M;%$ekqq4CdHcLt*mC5nW?_&OkFvrMzhE9-HhKn2 zet1S_M(^pH7af&T`b9_0Y{yJT>QvgEzHqZ4b2;nig;-Z}`sCckTwSMF*O}q6bv@J8 zDb>{8q-tjG(lfHvuTA^rcIRAOqN{6Om33_q3|ppkm~+ONbRDqdKbFImfczVo}7KJXy1Fc`>yY+?k|V4_Ty8!C5t2JPfg4OL`!qx z;G(7S#%t5BrPihbLdWw$U4PcHH)lB@S`OR|WGyEX2bW;}+3)GsE}0#_?Ea-MWl#1_ zpIEe2|I+s#?5Y0bh3Pi{1F@|Gpv2O;^mMdVWoZStq{3N3zZP%@KuRoa->NC2<&l{( zxzpYEj4l5fRlv-Dx^AP!(gujg(+P|g63k^1EILTAmPxSbA%S$GmU6mg&&`}mpUk*! zo)cQOWvjO*1|L-JS%fv!myD&i!*Z$8C62*xXe)D?dQnpkpcbEb@zYmty*kg`K6_uY z3x*&$myoPAj4JEp}me)XD9ki!RJ0V5?E3G2+0nS7R3Ybx`OL0Ijq6LJ!EM`

-KFC;U$wXktF0r< zdkKrg;5?EFmO;_&rHn;9o|*<&3<>W~O*h7WgDZ@)soC~;c9Dp*=qFhrCkU3%>4c@5 zty@F2%0Vci6rqm0Sha?fG7obp$}A;Z=3b8yulw)TqrY}*ooZ(WIals+y=^vCN2xtk z?hI8qPFC9&j?Q3FlFQ1>rKpxw9L`f2Ns{s$p(pU67~uQZmn`!$lC@H1kIla%{2qzLDFDt9&0ZZe8;zO~jM4>u)OSLx>Lip7HumeZn dYy?h6Ab7Xzb>qglUAyhwZGHk98e&CJe*ge1!p#5x literal 0 HcmV?d00001 diff --git a/entrypoints/cli/__pycache__/collect_env.cpython-312.pyc b/entrypoints/cli/__pycache__/collect_env.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c6ece6f09f2734eae8d42e9bc091dd3cd037eb49 GIT binary patch literal 1690 zcmah}%}*pn6tC(py5|Fi2?~rs+hs_0R_VE5c1>0zL1Z_(l8x-ezVtR-1vGZ|wADRi zXQBxQZeEBN{1Y_zpLhX{0n3k&7!Tei4yQf&s(S_ovx!BhdSCNCe($~hpM^%k%uK;x@A2c0O(@lRQ5-Slip#bWt}&XnS+wrqkt`&j z1(^LAxQJfV2eel|gt16i(g) zWK)zgFDgvKNx>%pkWLh*_X{MMM;1ypJw5WX%4o*ZROAUI6Uktl^0#5Evc}hzyOl zfjB~6;N2@ch9(Gxok;K}f3F@C~#;g zTH27gvO|NI7YR)~=EnkLuC^0*n*yc5Tp}`ibK+Hr`V4AE948;p8@4`>3a(O6_c-emYSrp1Y5q#3wXm92_nJjw(2qxB%O~dsa$`Z&5W@ADst_+GtKpdc7lzSJ&nf(th)XAPC zsjnU0KC9MGp$r7fw^RR!r!OO|nrVBwt`^Q~x6W!yr?sV?4BUnB#-$2x9w!SS#^}b^ z;eR)08^c^+*oHjEH$|cXxV%w1%xBgjHV?7|_)kRP`{=il`BUlENpb!}na_b9XRm{X zYiTEjf#TWRSR;$T#K^Leu&IfevPPE1f~B$GhcsbgSb6#Br%zU$l?SUUj~=eC@!8D2 zo%%sy4SV5aYz8+UU%v5q%qrnZ;3sL8Hv4Z;_wy-upS=aa#O@>y;Ofa3_yxup}OcBxfy;hXm^5j?#+7}pQ$hhLsn7cLRd H865utS4*t; literal 0 HcmV?d00001 diff --git a/entrypoints/cli/__pycache__/main.cpython-312.pyc b/entrypoints/cli/__pycache__/main.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41270a8db5c6a609ae5bec7ade729c3d1281e202 GIT binary patch literal 3560 zcmbtX&2JmW6`v({mqYH7qCPB9woGiLNS3UMEE3mpkTkU|S&l1N7A&iEu0x9@9y^WK{e{v{fX05Wv;H`?#R0Q`e~cq6#aJikGh4}l66fy$@>ommXHG`JWf zG^n!%x5zOd01$`xEvD(U7*g5y!6HPw%B>9pfFaStt^JS;SP!fRM)>l3H&&4}F>^&i zISa2B@>x?)O7Cdas+N-sMaxC3yrh)!nkr=wwunGN$_m1gW)$+6xMHNN5kcx}3Tk$QaRe== zS+bsAAy%1vq%MKvmuBNR&$*dkRG@<@%?8k!eHz%O!F|fQG@x?^UuBF?HNa5&pbm|2 zHOPPp$aWo^qRD5wkB}t}%f5y8@$cl_ci20dJS_Vbc%<*--GA77g{!gwPzm_GWes{( 
z){w_#Ne{j_yk1q=wMo*U`2Sn0Y%dA0-_5_UPJ7X^V;<*IwY3l2s)zI*BT*%Lq{3xi z_N_cF`$LcOska(@igXRsPZ)hwp6;;ocW?2mlC z$Em_2(X)IM-3m12)lemr{jq1`Q#Gcxsd4MPC&^y)IG<)O`Mk%e?X5XFw%DeWWG0lo z>{<9!?NqxS9cv+h8mNXV>?F874?s0iiDa+%wjSr@rz%tgV);6;6f0tO+SB{A0zD+z ztG?9Zh|jFY`}9$_=R+`yl`yfI^R*sFCI$E2V8OiwmadGTeQ~r*XC>;RvwrUBD^az_%OUO{5pQ`KpC0sFr3^ww~=C6vpomaU7FNSVB0C^=P9k%clAQor9k@lnpCw?L+dF>#j) zx?(Nou_0-ul*?PvLe4CpjJB*Hb0T%YLDAU2B(}b z3081ZN>mV|w~HD^s>71V7MZJE)y#rIu*%EDT+{CuEn?akpy5&nZs8OuUCJ$pq2RR? zy-*$LNTW<%*GYwFot+a$TI35Tr)W;{NF^@jMW0PLQn5;rDTI%_+jI)jF)ZT;br#!amNRj&e@WTsIw- zOjL6C=99tURJ2rF!Ep+SKC7JgewWH#XE`y=l#70gIvp-(Pi>|$g(4QHD@*z_@9?y> z70VG7Rh7wH?v`=F%@>TWEjiFYmZB<_;@01ZQM*?XD!b;y5 z|2?0!j6%Ag6~;&sR**IJZgKq8b8F)$P4bg2tXr%3T$*M%eW3nnQvNipajLM6>6^m+ zp@lQs$*HyhcG=V}JG#pb?Xn}g?BL(oYhSSE8$1X{YcOHML=E=Yu&)LOY&cMd;x~o; zHtatn9J1k14UXDy^pI-UhQo)1)V?9`o&3i1vpC>7-oLiT11Q{k<^H+5=iaM6c>75x z@w@qtZ+>+16Xl~@pI&})@|CABc3-?JHUdoaGP4&9h9m!hAUyD;&{Y!#ZDFt`oUw&7 zJHpv7#Is+GoY^|Ld1Xf&dMb9@pT0X?kI(M~1JT$s2x2eQyOQ;`;aXeTZcBf5=W$zl zyKQQZ3&!}3sRjg`P-A;+wufM@!_InJ;%U6M7Vo#?{k8aMJAV4pv)l2P8)3jD_5jE6 z4H0xCYVAXI`_O0HAEzI;4{f)_5Z##E?f%|oq~3Y5);VN%4sCZ1Z^GTq6Pput zv8^U1Z87<oc)fm7+MImQ`C;O>i4SI?>y|XLnjH^%^UaMy8D(}xIr5ZexmNc(SL?wo7UrF z!#kl7w+2ZYCLda#!h|3q04DEGfG1r2>&*@D2Bt3-dM}!;O2HXxxxRNe~syDR2aaDo8>3L-oLT%S5u83V2`_@ F{|Dn&S@8e> literal 0 HcmV?d00001 diff --git a/entrypoints/cli/__pycache__/openai.cpython-312.pyc b/entrypoints/cli/__pycache__/openai.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..07583ccb9d795f6f92bfdc17d57ddb146ab65137 GIT binary patch literal 9687 zcmd5?X>1%vcCMc3p6R(bhd2~3Ssj}?C~1b2MN6_QTh>8Rv_y(Fxq?N<8IRaaa;Ujn z-9w49B$m4nu#~NhC?J63*ugSLkVvotcz=cY5yTM|L6H36M9gyB)&>mX{gEF5CZ>J_ z`jfm@-8~0Ak~asCEpT2{SG_vA>eYMSd(FT1`P>9jlK(KV@*E-mf)7>_&BEqkj*vS< zCNh^KV@A%6aqQVPW@EX1%nrFN$)|)dfkPg9(vfnGIa!%ciYeEaiJLMVka6}QP z6ZjlSZ^}33<4D@8km+`7@=8ADlbzSem|qs5U8>YuCE29}m5}0`u(d$d1TVX<^JCkf zHZ)V3ZI%Z?}hB`UjH+pHfH+K4L@9A>` 
zL;X0~aYq&47F6*_J#&!9YHxZcg*2LfluT_^KIgL5>}v{k6ewQ_mdRxXwc zbL3OAmizU`!BuQY)#J>PPTSCU^<=o@ZSdQS0=ZK?Q8kx2q|AM4z7R*)P!yRuZMt2Z zRdpWrNMjF)NnMzLy;M|em!v1By0lcbE1Sqhv^bqqw8*vGk(ZBNIihqW6PLTPv)WW9 z-8GX;rn+Er>1;NWNNZ}>crwwIfg_72qS;x!K}}4i=7Uf%&s`0hu% z+pLM+jxI<4kv~WSI5ZFYqv{T-XgQi5FN+T}l?CiQsI5M!PoWVLN9IU9L7L~FzK&pA z<5USm%VG2;WZF_cRt4nKo~j44D08yS63uy=h8xP#oNbQ#%wBG>!>qVlTN3rvub=1B zhvw|lxCvX*G|K0eWWDpeFegmoo^MGqk7{8WPkc+7=QV6%Nfze#kI5g{DpEbi&)Fwz zBc#(Yq&Hj`?j1Tg5IZ?M5Ifgh_>hv*T(CygX3iIi}tKzt^iNX9{3#f;HZJYtC@Lw*A> z7)+t@aV;^UU|+fyICj8KT>=8vgWs{n@7(ZjyLIixwS}|AhS06#jpRQx^nY^sr_&!z z|1|wkdNuOu%GHO@^)K{4YHVFT^7-3qjjt^XZa6(Z{=tuaPz<-+iQSIfaBsBjU2p3u zv~{hw^%UBA9=07`6`PCUUF+d}h48+62MgiIdbqm~?p}0%&AS!{@9w+XzUtn+#_!qi zgqP-5Jsqoj2fG)eJmMi9w4pq`{=o87D zI@QVR&he>CVq8&m0VE3~!a1IS^Mmu=1+zoDL)EAfPw9?KPRr&rT^OIrrLWK~XjSpW z@aT<`55<@u#m2$@#f)CG2RIn(MJOzgqR+q3_f>HFqEHNk@Az)}?#2rNX+5yJ5ZDcM zp1`eFKYX?54J|2mQnynD?+c54kG#R9T}-vTbx-?+x2NPHq2|YKBDk;j7d^p+!DpP{ z2QX>X2_F2Pp5Ov^b0?mbc`^bF%hH_vTc>acPNCdhdkSNNcTX+Lzwqv_zpuIvtgoh|V0xI@R00k|BaGuMu@b&ByER9d1(_uCpre1- z2(nvkXH0GxZj@~Mq)Inx<*{UYMPs^%t7^)uo7&n*s?9K;ni*)Oj63hjMsV*f^P`<=GiZR^3kh2Y-xU{@j7RSdN+&#jy( z*-5B-lh{J;MF-%iP{(rL!{Gi!M`g};RxTFW&#Z~PrR~Jy{|{gaG=D=lSLjP`uw;k) zi5i3z|B2_6k38_XPPNz{G;=_8N7g{|)t8`bU_OB^cMw=rw?Z8znbXQ3S++4%kJ=Uz zgJSi12vt4669eP$T|JyNJ;d4`090H}Yp)ch}6p z{1IrDG80S;X)c{clw$Zo5p;q;3;--hZ}pChPUMmj;2jAh69fc=lhd;XP0>tlatf|5!)NH_Ni%VpKpdq;!6bX*vK#|+&KyAS zcr<#Sql0kR)t*5phFtIG6gnHrWbt~b6wT@->oEN^l(Jl>FHcwydhvJBtm0#w~0 z3ct4@%&iM-5HHurNaWp&D>lHgm&p zJF6Wf@Py4f`PNS~vsnKP9b3)fEYVPVvLrjLedNISgZVEzXtHRv%sZd@?#|!j_hjb3 z>`>M<|MV5u1$rFG>p({7Ni3d0@*0v8Ksr6lr7{dJT1#DcKbIK4VhHX4vb})>?K89L zv=2-DKsrUk$&;`G&9yPeIV@E`C?ZJ}VG}dYmW9^s<0(1suewrEtU%rfi2efkdPn4* z{BXyig4nv~Ug|40x34$vD>Uz0^c7!zWzoCgX$F_8=xtv2?ksqB-fLb-6{PcP-oXuz z|JJb&j#XNkR?hs@8-MXeLF!-gp4|wy-FfHsJL};Ch46v(@S#HZP_aq6cl-+;%yv@` znC+&9#XhjeO?&S-9)`OX`ziyqebH2q&aQa}0P%h;`c@l`t%!PjCgs%d4gAic5XJ@+k+Q9XIjY5 zTU=*$*?-=_0c9$`M&BSzVzBy0P^hnO7#=9YXJ9iEAPCd`gI4)Af~eO+kR4ThoU-_t 
z>r+!l;Vgg%-1$8xWf}FPWSJ|qtU^(@qDe+|8_oM85d+Fb3}Zc%e-AaVQD+%&FxHn9 z3G5nLNyBinQNZQMCIMs~Ff2aeTqE89wiG5&bJ=W$Le8+VIhY!y7vSBW*dp&gaYiES zPUz0T-jR`${k{29BLH#%I%60G0Er18;;LjaNU&(Ah2RCV;Z#wxU|?YZfUnaMh99H% z?Q|F>(48qogE$DKZ^08dbZCU>2$E4G-vd%@x0!JVmkABLns2VYKxojdmXER9+dviw zNQ|rP?%_4@`J&ih;Mk&f$D?3lv9THKb?^k(VpscPBD&mQuU&qNB~bK+8P|?t=%UHv z0G(Gr?2PJ+h*X(*%P>x{yQf z2;#18++gEvDQ+#j30*q< z=wdP59mB|44gzd3-5UeA7f&Mcg(qh$CTGT>z_W0k7aTY(F%Gggm63H3gp0kX?ivDi zH5(sS=sU17iYkP?w;qg*HrZ zxu8_q=MuV$t90h2OaNGpxX2Ex$Gq_0 zUYGD)Yq)tBVlx42b#=g*I0V5{dI~s)~ht|IU3b|M~RKrdPw~zUYG|j5T&FUtIGZu%KA?ns~4b#ddZqkAD39 zB`?m}wRNb$&y%a+!7nBX;lauOC@oR}xE+wUN zsml-`#BdA~OvF@^maWZV3k%{yU<1Mk>2U>O4-C>U8%p1WjiA2+M0X6obpbv|*OmCNw)*Ss*97Djmo!gmC|a6^c4)IBG7sYL~2Fw6FgsX zGIYcP@^RNgNA1PnNPTp~Ai@6!ESiOv%3mnMBK$aDDGiMCNcb^OmSBqkjY^a7Wvv_n zdlq2YZU81rddi@QDtIb(It3ZMh6HgDO#o4=p%8-y`RBGCNi7EYH|&MZ=@tysgh(}7 z5|KE_wgaUwLqZ;ggqleJK1%E_`1jxId+0y3DjqUSeiGM#>ahxvumvLdOZc%s7ebPk z;3J+1Nt%R61(Co50usnfqyeQ;H$x+S6OCYwXasVT2TG;q{X$O_8i6N)wVw#@5%qEueyw%y1{Y4lMr(KjtjLcT?~7 z4+lRv@v!Z{swc9_N7$-5+o_5p;rCUFK@{o+qB|h63Vsg7<-wOb_J0n>2jQUc5XI1| z8b$a`5mo7}C@N9>?q~joL=^lIgNvf((pwlqPD>=!sD(9S?2#c%z2Rid|F5v`(-iTe zE@(LbNyeSu3l&CSg4sBX_ZzyQvLDZ59lrY(Inc%91~7h&-h@_wL_~cA1Uzbv+Y|{G z_=>o`BEGMP`0e;Ir=No^UwCcKk=_`?AW!wqo=T=XO(aL#vyPomI8im*WIJ* zZIMD-q(tE1^Ia?Um5VE$!i#4%@wwC?a-roHHVI^o7Q**-2_TYKlfrKJotVl6AIY;M|4?ZMPq}+1IX`>th3(2tMbwm{JA-72&mz?sddp>4Ca!XxR zuimSA|M%+8>2w0YX9z#m|0p5!PX;ujnmAWZ%wLRd8Mse_2R+`mhTeZ8e3$TyRzHTTVAq>d=C_t z!5q12yhTI>l`OkdB-rzbRV+7*9Y`R;$jp5B6WE1-;K1R?MR zaD`tb2)HKH*h?RW8KenKM{h+!>`S>s*HD{w4V`9V@-|2<8j=}bkG%B0SW^NHMbKIV zZtOWJC+waQ(odweB-Sf0u}-|r$769De3icDsy%}g9O&5aNpt4VD#|8`gqaCqR4`TD zs*VKC4g4-U#4cmk)vaZDVzwaP!^G8XOCEKpaL+Kzs>EFWHo$PY+x1l9DJ!1mC=l5| zTkV@Awu!wJ?7K3|L@&ECwp7RhEgPYNV-pXeM$<^O{}f71cSF(+dVz7vLDrB}$JQ;+ z&DjoIq~{>|lt5IH0v)Ir<{K4sYssc@-DE5beUVDQUe#1jrLl$*v?G*6X$VHULD`0| zO4-mEGnQ(klU>aF#Goj{+VswD?MOAbCuU3;F&JiPVb*7n)_Uh0j{Qv;9lzqtDItG~(r{>pE!{4I68g??i% 
z_2zRaxaX(E$NAl!i+`1}had%_38)oLP~6;dBmdYwYSn;W1Mor(fNWahD$Uv#kcI=~ z2RrkXPvcp&dm?T%>c%=BWEkI!>2=;?ehI$QD#q!zK51p_8n3jtuJQ9IE4A{Y(^Nmk z3OG^;^{ajZTq3tJ#dJJZhRz`Wkmw$k33h!0c05~N(hZ!e-aSd}2dTvp?(4368$x=b zFb4g_Q9afwSkC1V^S;BP*p&^ttQs;@-}e_5W@X)Sd~e+LJ(vOpZm#+c3ndvsCoJ3Y zm@XHJvv(G971y>5dCZn?-&uG89y4tX8}jvFdN<2cVFeowO={RJ6W#G3*J;#u)nyF% zEwUb9YS|d+2IG=p7-#~Lw_yw_SgMH$^9mKAN0_QdMeOL1ky(KXE;g2k+&b6=LWYUk zIPS88kTNnEU^jHMf=FR~w0Z$-H+j5C<&KkFhGVV*!v^{@_wr8 zVc|hxzx&&Df$M&!E}}DYT)h*W|K=f*&L_5|v;WSZ^Ov?SUEA%y4$v3F*J{I)JHwNo zj{GJ2r|jPF0&MO{L;IaW+tLsj0v_2El@x_3tSB_0C}2R}VC$5k{J>X@1}3H`nq5{D za*^qjf&%I3WmaQsjk3}igb|Gw;U3pf%a{x@x^89OUkslI?nIe6fv|`qV=(**1^fde z5!M-l;T2f8w_v!BJ{Mj;NcL{LcM$EaN6tiF+q_uIjO=7a>Ii^e_WXxUhddH90k-~H zS5)5b8?N=`clz>m9-xEx&`}iTkWavZ_1$9+6_(aPJ3kAHN62fn`sVXvnS;dP;L#My z_$H9G<)#vZQQ%f`1rSs4yR6b~pf6H`yD51)d0|_)5Zbwie2blBMUG}#fd-go#UgPZ0Q?Zn$C4CIe*HKnN>GG`;Zs*=c3!{pqP&h2l+m zE+7a_5(K-BchPu(7V%=Ys9Nj~?b<+HpeXid3KZP}{X<#`$eqeqE!KY)=nu74n_~a8 z=iHegDMs1tk6v4M?(^K&x#xW6T>eX4ori)TGvAj&0~GaJ%vix;CSH7(rl`*-ff8t$ ziWpxyLX*5LVk2LB#7@3U!Vz)M*v2M16RwDhr0p`Ba7Ww;PsEe(M!ckqk$s7}NL|7o z@sqqm4kYR$^(5_-8xp}tkfdF5W1=b2MAEF>oM?%(khEKFO|(VYNZKQ}Cpsb>iOxtT z$$RClM0cc{qJlXcKc<96(e=n~){F>EVpw#wLw3<7G=I!QMuirU zioa*nSI1D_3iV@upuP?2Pq`_f1O7UN_KzKr)2no->vu?ICPeFX7p_dtWfo$oL?W6L zLU!GwWRkopDl4L*H&oL6os@DLihQc5WzuGYkWCLHqPImpsx0y8r~=*ej+Lk^2~kZn z3IwAmwoqZX_FGr3T;=C(zJB5ABtJWGV{YmOKQ%jj<=Q2v^h!xdi$=EUv6h&Snop2%S z)Lk)_(NZ|khN~G_lW>Hp!fs=u5>^!s4owoGwk&2;7_W6=c6yHNGY$vU)`&fKqOc8% zDFwz%q-9Z)Qb`_ii&06=C}Q*^@X!@9DK5&%saRCzuBFA~#PkTZk7~F@91!P>Q{=Ri zk>X(MJr$BFw<1NkH|FPO!`!5}7=?+d*!KDzF*%CS>5-G+mtW_kMJ}1*5-CBHIZ5Tz zv>1~XB~b`_Jab}Hi7j&rt6WBvl1pc~_eMsRMLGT2IXJaNY3Y)Z%B0V~hg8M8X&H8q zOQmr+D)v#u7^b;lF}xH$;W4{j#et0OCpfe0qJm5S?CIgSo2pnICMwH>xVoIW!$F3N zuD~8eas7*ObV=o+8kb7O#IPq;eQnAT={}TxMu`+kA&@{cNFQ5-5gmeEbjJPFU5vEA z2#!TYaDL1@GAkqOA8G4?`{x!|px1O5J<$vF+|b6i>IixC0AYyAeOw!!Uxww9gM5!u z%lDK6?%;rtxD_~44z6%mcaTOpgBMMADsW4c8xAkeQhd2Jv~X7UG16)P%Gs}$3%1+PQ}JnDt1e4f;*q9Ssjuxd 
zNqW^C-b)Ku=jS6b@M%!-^!rFA>hDhv)pE1mFBaZv8} z6Qf!pJ(`x%BO08jsEypsoO$(?+h@d4Sy~uPuWHMwoyWhZKk)*FqGDw>fef-CVZ{ze9AHJd`Z#_@gNjm+cR*e; zswgiSTZE)IP8o)e&RY6rt*v&cR?T^6ry@XEr*_!>jqxwoVdWV1^}vlX)RuZ+dgYt) zb7=J$m8Ik5y8;DfIQ%|nc$6dv^kbW))N?k?WtC@b))JxAZI#Axel_9n5gyyE{#hGe zzHeFE92H__);Qz|Q{KN*=Hzd4@0f!3+pv>fK~!UkL?p59$e@-5E#^e=k`O{J-4hiA zOPw>xXhPJzlT&jSZcNY4PhXo+a3u;(Tz7&PN<=l?DW)Y*wIRDQf_2DW2owNAm}FEu zY|c6|e|g(^)c9*XwFh^weggWcGZ3v)drqpYzu0=B(0ZcetN(QRlj)y+BmeSSdv>aA zhj`2dr_?{hdyYAk72YihQkg>a9$Qb;U170(7OWPn(eM>Fvw?xlT$_c21TS`b= zfOKU^It%}#=FQ%m>&W`5 zW{q-*1np%j6@SG_Rld;cQ2vUeG^z=L6L%o%(_XVG;;2Y#Lbjrf==e8lDk1Bm9@jrI z?^&+SO=auiC?slvB~W1h|L|!wRQ4Jup;V~}6-ukr!2ct)JJenK+teL86r53dVa&B~ z1TFi1nsMFQj)U(AhDh1LGRWPRST_&MXMk`ubl-`If_~m%okDSmsb}Q zN#NxLG_{f`jfa| zesK|^Kc_*|!`BG$vRt)FD6kelO5)%UXzCN2$_lpowu-F%h+d@tNv}2D5M!dWA`0A) zz^T$wGAf7X!NuS-OnluThzpq|-7~kUYGPvQuB2flHq0b+$D)GBBd9KCG$C~-$t&Xf z84(%^CYV?^@Soox%rArcLKc;^!D{RsK1a4fJs&>UY8hs(BB%ZLI`y3Of9m_hmv26{ zd2yRP{i{G)8o*<$Yay4ywMNs3?F)UY-ZHh>G?&uhVDO(uNrhrmta$4X+HT zp{jGYY=5wR4QIg#?vA7_lNNBgg2(~VQox4bhE&6(OMyKd79>?uq=gKul|!!rHdsPa zIt+6NBvOExz#U*XWY8Dkl#(*|GS;qFrUM&}e8E|a#TlN9VVGVm8~uSfg+p%LZP*|D zVp0#1%`}}Lrm5pqp#y04F zYoqv>90k+L4V=OQIZa*j&jBN7xYfuF;n09G0OzNBZbjvcIHf2lMQ3m!XvkoGwGC~DbrZZNJ#+V=wHyAjJ-E_1JEpC6}@H=CN(Low;Y_al8*6& zEE?BJ14am4^Z?$e^1di=D{K0}3KSF|x*I)l#@RIh8>8GpC!BGHx+e~1D!(Wxs-`<) za!M6L0bT?JRy!}p)M zxE+66zI}Sj|7M2{2SS*ds^~-xK#WogQ$DX>55t=u*{I~M#t=*QsV#`>eWo-RgseQQE zK2d0&$TKawuJ(u5pSpTT)7dTm^*nohw;h0V%f8d$=-&5J4)6D_et7lIj(isWVR$3F z&5Z3Z9q4Gw-vLSK{lvRxqwB8HyS}!PuY2F^aC>0PrlE~@J|Eg{eB&NlV(W|S zP=OuVoX@jE+w7?l>)-Wu6uqYk-c#$7B}|SLykqN=J8c6SlbeHEZKHXn`8iDT@`sm! 
zxO-@q?I^MBK-=Nmucum$?*zGG@MIx)veY?JYU(XEjTV|li%qW=nqDt;bV5m~xueu^ ze5bXy*cvLdhBl9HwZ2@!{X1P~Ih_xl-f#4@IM-+P+89S~iESyeeFe5}n;k&e4=c(y z4Q{hTq@t_9c74J2{P(_{at98t`@M?Rz%PgD-{_?Nx%23>&3?Gy@*b$puQOW|+nW@C z{X<=CIXX)lW{qmJqQO-*3H<2M&@n+m1?E&a7Rsi}yTGqGTh6Yb>x6{E+zWBMdJcHM4_i~q5_qx)5^v~)(IvEo24HyM)#MfcY$+q{#+neuc0r61g&g7 zWNle{7MxPq`uHiUw(^D2@k-80WdllmHV{8!H#+!NfrHt)^^Onf4>Kq}Xb@cIfW+Z+DrQEu?gRe^fyZpc zyg?{G2ASCJjmzp3)2`q@62)E%*ozU9E2_@@u`N};jXe+uC@My1Mi@F=ft22U5H-N7o~VFL0M?EY!4B(H)E`*yJ6|zKf9nk?s=G1j!+WHP;0t@`CO&F(b(--@~>| z2Y|G7toN|WZJs}HN_%A*yxE4oeFqVeiaS*WTFj&YU521e!RW8yPu&I~2*@SS_-XEb zF3)u3yoX4~ zp9fnG0XU_h@%78wOn+(c2!8vX9|Cwj@6HELZ3VBbU$zGLiB{~LeA+t+7^dX=)(g8W z;Mt>W03=wdrKi|@w9tHX)4SC?zCJ^={aB%CEMGsiK3xro{M|+W$%6mn`lV9yV6pjp zq4_*CZ|pBNo-H(8JeWchOE_8=~6X*cw z)<070yIAPExZU^0ZbS2avv*EamRc_rFqLUj+o&^HQPvdj8BT#%u=^I0g{!v>t~|_%9dZ zaDF*OBNYmw->ay+|FLtT<>H|Ip9UQlM_e&eeLy)yp%_O)=f&Sc1R%tImsUY|N)-4b z?T5W)2%wBDA)a$oc~k8b(2*@GU#lhfYsvt@QY27T_NyKSJG3(FAW96B*&+k#gh6#; zsci71F(Jp=%TwjIEn*&nl_{-A0{G>t-?t6fU|Gtd8rFS=g=tu&L_g^6@@xAHyl~h- zVkijybvu~a3Oc0~)c49!j1XmooK-g;ozr@qNz}Au{h1XBeiM*1ucZ0jp%`PTD0+=s zXe=m8V5JN^W^$gXO9ZvXT& zyk!iGZ!<6LFbyAHLzMjHhi~q%?GMA7gHMi00ABQ&hO>c`|HvDVyTRJLGv!Z}|%Oj3&vd@ytYg2w}r%q@Yh;Zz9c!J1uND z)!l7Osi633`^{2qTA}rJxza*VKfia7Kh8f8sRNd)-dmlV}b$T?MabhY8Vuv z`HK>w^7T54lrc1_KLypB@(qm8TcDtWn7r2L3}P7aG^0#l;UGpteGqm=I=akx%)5EF7^aW5+0y{0xL&G*Gi556H%p%N~0Bai> zxkH$KOqg+|t$q^l=l)*6H^t*_$fNYWD)jIvRF*t_f(v$~rH(;h8CIQEW zn(2Wr&5)-DF*~K|N+(WtraI2Hg@{FqiV{SS=+pdX(@Mj{O8n$T3aRn*#`EsY_isJ9 z$*sWOvyvV-imYcr5L)0}^pe<*QfYaCZ>7D2%XZRJx)-wk@?2BeaJ7O#>HtUN&-z>E zYj=OH@1B=;$8bZu$$lW8ovd(Ny%W+SE!sIHElnAc2m5)lXm6UL21J#gxq;8zl=oNX zgHG>ph!Xechz!dMVi6xzWp^*hwoT3j|6g4fxbET=fR-k(E^!q=&L&mA^zZnQpY~s$ zPiI% zi;m+)QJlJ|A4d|6CGuXr3aSjl&B|Am4mg&H>v4g@5mx}zvI9Gne~fa|Ll*)YBe0WV z83CJ^wO?RkGr(cUCjeiOUkz(mt$n#OEG&;STDV8Qd1tV4{bJ?%h@kZS)fX8Z>!i4n z8;putq4wQpgQj)Sv_=GkA3h!<8I^T-@k?WE(Acadihm|*97LdvFz!X{~@dgl8 zggx}5$Ex=5^ZVP*)~DNBcXv8>)N8o2?QZG^vOTTc$;7NORo|TZ3(BjSuT+(9=dJ%h 
ztV1OHUgUcKBb`z@E)%-+8>tQT%0R#NlYZ@Y6hi?lIYq;;_lhPbtkwwJTx;8qWyC#UREg9~7hHO zP}3d$xa+eZ7J3|}$}FzACnaYaVZ81~9z#2$H#rZvAt#FK37&_L0#oYxuiIAaM^@B{ z7GsFU;>GRmQvKT260^Kc%Zhq&I}9vr!Gbuv7lj>2XSrU-N?SrqECj%edP&Kl)CQ9c zkwm|C3TWn}R~bve4SP;bG%{2DT(~j>i6knamb`7#&;_+3VxG*{wi5(l?8Kcg5THGiuAI+cLNqBr z8|mbJceXht>K)wLTyqe5A)nh6>T4mt_#?97E+dC9W>YAvWbj?*I_Pjb;v!rF ZMUteUMx@zq2EUjJ_G;& literal 0 HcmV?d00001 diff --git a/entrypoints/cli/benchmark/__pycache__/latency.cpython-312.pyc b/entrypoints/cli/benchmark/__pycache__/latency.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aa78eb819efeded15a62ab851f1531ec9c736a8d GIT binary patch literal 1264 zcmaJ=&1=*^6rai0Zns;lyG5aBiCVBLbaN@zf(S~%ix$D+As}fsGwrTTvgu4#+~T2! z9t07*_3BaSKj8o1MGvA52!eumtFB%=`CbyYi=YFUc^|(w^M3EWyquV*5Llt{sPkAO zo=i>DUSNOQGFcXf)o9vYG9nv`hd8ezsRVJ0{C^*r9S`m|wv zMP;R-sjAPI7la+p7aLL;k>80MRGqlWkJ(jU@*uw>--ijMpzje* zT*`>6G3x3}a}B1uCbyWuOLr$n!*ZFK%vv{CY1h~J{}Vkng0A+hMnz2z z6TR-IkaPbq-|h3OZEmlHxqZ!+LniimSJ-R!!Z5O1$aAf!N(F3tVR)gPLU>`eyX&@Z z%TBxza=YcHK^rZCZ}qrLWm8#+FC-Vr59cZkRQN&g{1({~j&z{*nH{^Q#@8UH_$KojFfrGh2K9{GS+umyFsbUp)yEZqc{V0g z+4EvQ;-04}o)>jlFGRWOd0Ra{EIcL8W8DDE1{##n=3%17gV2{U;!sPb%D^q_sycE- zZ1f^1>&>jmS?yF+rheK93h>wp+#(Y{;GzaeJSSN$85vOmL$+n@yaU=Ud2cLy9G}`* zem8z)XZfQ!d7x4A)YG}QXO>={Svnw~ykOt)80f@0mU#~>V%1QpKbd=5n|)oIMO^*a z6yoZLt9?Un{xUw=C<&Z6bppqtIUEDoEao61lC?mc%^)L_(}c-63~9FD-xv75=WJ$B w!fNO3DqqU0d|8|Z8}?D2g6%-3lnyLHC%=#jAIP;IR+ZLYoxb>!z?p^r2A$eMxBvhE literal 0 HcmV?d00001 diff --git a/entrypoints/cli/benchmark/__pycache__/main.cpython-312.pyc b/entrypoints/cli/benchmark/__pycache__/main.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..088ebc2169e9e1681ada7ceb8014a29b1606b14d GIT binary patch literal 2913 zcmai0Uu+vi8lUz4@!DDE(zL{GY^S~?Ewz-`5=0G%0L?#g;nG}0dM7Y&%f>rNHtgDF zc3lz6k@C=oA`q0Pq(ad0M5|OfPWN*6bUY&6i+#Y^HwP#ZcMrT-Bv1Enx^H&vI86FCm^ZgtDEffkO7^d*HG44m`JGN;Ju}K_!1jG)~knVgT2r6S$u}) 
z7>_pvOU#HaEtrz!&-ginxs9Hdy7o%^fEUex70d)Vlv6O;=xa+`R3_*{b4Zu2Bi)b1 zJ>KJI6kWz4ES`n!s!tDG7c$+OTr&7Q6E7w?8lJ+r?7F3q*O!Yc*}P?GIekjAF^EFb z3-fIzFwXvNVd09p{M)JND>LfSD_S_`ENR5X zBR}T zmHJ$cOCQ6hsGyNGn@R*bMUq?0a>x9_V#R(37hu{!Wwdcz+60!-UX#nAO>~c2yvNZ~ zx?vYIC%dk$7IRt0$mfzAVHs*OXyU)mryZ-1E*OPT2fmJ-(Kn0ZPd@SbI8K|!O1iM= ztmkv-Yo=+XAzoy&kT-IUoz9v@+O?l$McUGgT&l21`K+avI%P`=1Y!0~mb%+$Ush`J zKvf>tb-t2^4}gcVJ~kg4?qvmF?7&q>2#9+q>7#;;%~j8Sb|R4=OeK}CX{MoT4lc!} z9!}O&OBw~N%`$8o)n$eBvZFzWSi#Lh`vD`mKXnu)QB_=QtovJ&Y)e zZG>~WCEqyHqT1_^#^XmVhd6b$w3kJd<%m1p`ICUkzGakD7E2m~bmZbv6t5 znZ(Qa0xLfIRY>A49ot!AxC|;5ZEX!-rNK5DATq&k6b*uPDgfA_T%r^_rh})#IyMW0 z^=#75Cgv(=u@dV)mY!5-P}6nQOKd9SGz*h+0HNKC-A;MJQk-_HjFmMt+s3x4QpLuO zs^e9yXgW4wKBfvg2W79a9z|FUQ3)EHr8!i>1;fm*B_;9*un8-4s(80)e!s`twLeS! zU?ixsX^0VRfdq#W?(hn``$(i3Im0OpPIV~n2QdSn!u}sj+o&O-$Y3ozQVoyPm7e$J z-oL8XK?ogm;wpmHH#Gombv}r5+o)y>M%x z9*OVjw~brI?#p+%yN~{1s{Wax6Rk`rz@L!{U ziB^;gUn;Y8<#bI+RF%Y?_?Jrh>!FF-&~$ZZy52wFnS5vR&e|vD$L5{iSHfe5yl)s( zeWfHCr%~iG*NC8=SVfMy-7VQoWmRR-ho}Zs)ynHdlhKN*zERZ7ro^wRdOoYFWDwlu z<%wcDnqy=w3=e=#Wr(^POBz@NW$>b8v6~l!y_)otx?EgY@|x(R`^XR$NHWSMR(P&4 zZbXU9wrpZ)Wg9SENBUE_Ji4#M+FV^DeYCO##p-*QH*wT>t#8|z5u+~3T?H_CKu-gxuI}~B{tZ)rrm79g% z(O`|rZTAr%c93)QytW$Y8n^7dL-2mODjvKSm02RQ93|tRf&FY&liTQP<=hv_FDqSx z6=Bc|$}h;{AfXa;1|zpdhgic%9eZ9=P-iK(WZ3hXT7mysUY+)LM6w2B6V^>>-$Q_+ zW0=i)4PAX{X;z*7)$H`I=NISbSr_ZdQ?2l-9-IY_c27R=FU`F)6->kSZg?WDRG#pX z>ltzxPBZt}3otc!j^hqw#P$3Oh5m!)zCmN(pox7xP~*dY=fmF$m-pl6Yw@vae5@k$ fA4)@9Y(!&uf5892i#>=sAQTkDTXpr~$@eDP?IP$v-n{R5zxQ6=jEz+YoXB_@KGF#J zjKaY&`%LdTFx!L?MkC_ltwl7}TrJjJouaHqMr^t!B^sF{%-AB#nn@$}!=ypwshd0rTCp$I7COs8JC6N?-Sjga6gdhyC@2Scn{eV% zMqG_iS7(}QFx@q|#SC7$KRz5*@XTb^ior@-#!lbpj_?XAgZIgQdXEfwSG!iDB2OO{ z%Efw^tRB|8F}>L0_EH9VzGPzxL&O>%l*c}#cuJSAQ7iq1EvKt~vZ z9z>ZK0Y1Y_w*C9IlgFu(hUrWWli~Ty!_Jl2t7})d6NSr8+Ra<-#6b`aPjb;s+hLMt z4h+&!pm21KLe)&W>XD5L19S_2x2ie31V^z7T*FQBrF{JP-FM|PJ&=era7BUv8c_Wk zDH%p8s^AM)jpD*>$S{p9jsOMhG258@Pjz6gV+{5Oi>f|Zd{z-yXJbs3JumTN?s>A} 
zd2yR{BIK){_n_lPeM`ynSUUiYc1+xu1tY-+5#nvSg_ra8ZLIUJ+G`0wYSGP(%&$9!Oi{qj7O>w6=Nc!|3JBTYKjC zzDCW77t`<0&F-F?-6tTtWBKswtos(RYRJ`}Prt8C?bfEyum18l`qj~|_8qOo z%lK%cByiv430#WiUO6zaWUj0SjR^fjDpmICl literal 0 HcmV?d00001 diff --git a/entrypoints/cli/benchmark/__pycache__/sweep.cpython-312.pyc b/entrypoints/cli/benchmark/__pycache__/sweep.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9294b4d7f182cc3d2537abf429514685eed1d2b4 GIT binary patch literal 1234 zcmZ`&&1=*^6rai0esrsKSNx)C)C%20H{d~Akb)w35Eeo45Rf#@jPBYb8z)(H7Y{x3 zAc)XguO2P^2mBwr=s~Ijf}kMYR#z{cd~dRD*FyuDc^|)-yx;r1yqTUZ6L>=7dAq9- z@&%oLk2x}i%V2B~Mi>o=i@O%mNOQGFcXf)s9vYG9nv`hdI$_2JVJ0_s^!&VQ^=RGt zj!v_#$%@aI7lduk7xzp&&lUer+R@g9hMoG8C%d80KBm1ID6X;6Ms+Z-l zy+R*zp6pfIpTFJW_DYJ;6+0cXu^bv{|PDfzW(C2vbo4JH?x9 z`_Bz0ixMYkCkq+q!Lx-Y-AhZ0YnQkawpX2`m$f>vgT*^M&O|TiwBsywKt4y2LBX87 z*rcaE(my|Db?g5IH712Wh&ABot&^|C!<%>B7iWjy5ozF91Y<4`+Mg$x$gM2hDe4eVVU;8k%uzu~cIWy3xdGh7_?%Abx zXO{*9oY(9pZbO||hg5c85vz(;ZF7FNI`^(Rhq&6S!-%UPuKEMB`Iqsd^@6}zlgDu& z8sh;-8~FslR46J+0(nk>CZ?tV`MpDpRrtFF{?R#~Q)pqobGeF_^FA+&GY~@^)040a dbV}*aB6Q{(S^Pw<|FSBy_V)C}-voAb{4YG)Hwyp& literal 0 HcmV?d00001 diff --git a/entrypoints/cli/benchmark/__pycache__/throughput.cpython-312.pyc b/entrypoints/cli/benchmark/__pycache__/throughput.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dade87e59b522e37931ae08ff528e5e183c59691 GIT binary patch literal 1272 zcmaJ>&ubGw6rR~%`O#EsQY;iJ+gh*@n!O0MC^n_TKZX5O3k&CL7W_crgx$BRe|PkHIS zk`Vd|&iPOWMt?;Zhln78J!HZvdDxds$(KzTgD-oEubL`G5?V(@*+)dB>am=jH?&~i$i4hW_GjQYBW4I zpt>70C=*bs4{mq|xtK&RxS%K|S z%i=}L@>`_sfnBn!CvDpsL~@ozT8`LOV1p}7>V None: + """Add the CLI arguments to the parser.""" + raise NotImplementedError + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + """Run the benchmark. + + Args: + args: The arguments to the command. 
+ """ + raise NotImplementedError diff --git a/entrypoints/cli/benchmark/latency.py b/entrypoints/cli/benchmark/latency.py new file mode 100644 index 0000000..60f2b03 --- /dev/null +++ b/entrypoints/cli/benchmark/latency.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse + +from vllm.benchmarks.latency import add_cli_args, main +from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase + + +class BenchmarkLatencySubcommand(BenchmarkSubcommandBase): + """The `latency` subcommand for `vllm bench`.""" + + name = "latency" + help = "Benchmark the latency of a single batch of requests." + + @classmethod + def add_cli_args(cls, parser: argparse.ArgumentParser) -> None: + add_cli_args(parser) + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + main(args) diff --git a/entrypoints/cli/benchmark/main.py b/entrypoints/cli/benchmark/main.py new file mode 100644 index 0000000..2ff9857 --- /dev/null +++ b/entrypoints/cli/benchmark/main.py @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import typing + +from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase +from vllm.entrypoints.cli.types import CLISubcommand +from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG + +if typing.TYPE_CHECKING: + from vllm.utils.argparse_utils import FlexibleArgumentParser +else: + FlexibleArgumentParser = argparse.ArgumentParser + + +class BenchmarkSubcommand(CLISubcommand): + """The `bench` subcommand for the vLLM CLI.""" + + name = "bench" + help = "vLLM bench subcommand." 
+ + @staticmethod + def cmd(args: argparse.Namespace) -> None: + args.dispatch_function(args) + + def validate(self, args: argparse.Namespace) -> None: + pass + + def subparser_init( + self, subparsers: argparse._SubParsersAction + ) -> FlexibleArgumentParser: + bench_parser = subparsers.add_parser( + self.name, + description=self.help, + usage=f"vllm {self.name} [options]", + ) + bench_subparsers = bench_parser.add_subparsers(required=True, dest="bench_type") + + for cmd_cls in BenchmarkSubcommandBase.__subclasses__(): + cmd_subparser = bench_subparsers.add_parser( + cmd_cls.name, + help=cmd_cls.help, + description=cmd_cls.help, + usage=f"vllm {self.name} {cmd_cls.name} [options]", + ) + cmd_subparser.set_defaults(dispatch_function=cmd_cls.cmd) + cmd_cls.add_cli_args(cmd_subparser) + cmd_subparser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format( + subcmd=f"{self.name} {cmd_cls.name}" + ) + return bench_parser + + +def cmd_init() -> list[CLISubcommand]: + return [BenchmarkSubcommand()] diff --git a/entrypoints/cli/benchmark/serve.py b/entrypoints/cli/benchmark/serve.py new file mode 100644 index 0000000..6616305 --- /dev/null +++ b/entrypoints/cli/benchmark/serve.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse + +from vllm.benchmarks.serve import add_cli_args, main +from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase + + +class BenchmarkServingSubcommand(BenchmarkSubcommandBase): + """The `serve` subcommand for `vllm bench`.""" + + name = "serve" + help = "Benchmark the online serving throughput." 
+ + @classmethod + def add_cli_args(cls, parser: argparse.ArgumentParser) -> None: + add_cli_args(parser) + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + main(args) diff --git a/entrypoints/cli/benchmark/sweep.py b/entrypoints/cli/benchmark/sweep.py new file mode 100644 index 0000000..c385207 --- /dev/null +++ b/entrypoints/cli/benchmark/sweep.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse + +from vllm.benchmarks.sweep.cli import add_cli_args, main +from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase + + +class BenchmarkSweepSubcommand(BenchmarkSubcommandBase): + """The `sweep` subcommand for `vllm bench`.""" + + name = "sweep" + help = "Benchmark for a parameter sweep." + + @classmethod + def add_cli_args(cls, parser: argparse.ArgumentParser) -> None: + add_cli_args(parser) + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + main(args) diff --git a/entrypoints/cli/benchmark/throughput.py b/entrypoints/cli/benchmark/throughput.py new file mode 100644 index 0000000..2097f9e --- /dev/null +++ b/entrypoints/cli/benchmark/throughput.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse + +from vllm.benchmarks.throughput import add_cli_args, main +from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase + + +class BenchmarkThroughputSubcommand(BenchmarkSubcommandBase): + """The `throughput` subcommand for `vllm bench`.""" + + name = "throughput" + help = "Benchmark offline inference throughput." 
+ + @classmethod + def add_cli_args(cls, parser: argparse.ArgumentParser) -> None: + add_cli_args(parser) + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + main(args) diff --git a/entrypoints/cli/collect_env.py b/entrypoints/cli/collect_env.py new file mode 100644 index 0000000..ad943a6 --- /dev/null +++ b/entrypoints/cli/collect_env.py @@ -0,0 +1,38 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import typing + +from vllm.collect_env import main as collect_env_main +from vllm.entrypoints.cli.types import CLISubcommand + +if typing.TYPE_CHECKING: + from vllm.utils.argparse_utils import FlexibleArgumentParser +else: + FlexibleArgumentParser = argparse.ArgumentParser + + +class CollectEnvSubcommand(CLISubcommand): + """The `collect-env` subcommand for the vLLM CLI.""" + + name = "collect-env" + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + """Collect information about the environment.""" + collect_env_main() + + def subparser_init( + self, subparsers: argparse._SubParsersAction + ) -> FlexibleArgumentParser: + return subparsers.add_parser( + "collect-env", + help="Start collecting environment information.", + description="Start collecting environment information.", + usage="vllm collect-env", + ) + + +def cmd_init() -> list[CLISubcommand]: + return [CollectEnvSubcommand()] diff --git a/entrypoints/cli/main.py b/entrypoints/cli/main.py new file mode 100644 index 0000000..a3e73eb --- /dev/null +++ b/entrypoints/cli/main.py @@ -0,0 +1,79 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""The CLI entrypoints of vLLM + +Note that all future modules must be lazily loaded within main +to avoid certain eager import breakage.""" + +import importlib.metadata +import sys + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +def main(): + import 
vllm.entrypoints.cli.benchmark.main + import vllm.entrypoints.cli.collect_env + import vllm.entrypoints.cli.openai + import vllm.entrypoints.cli.run_batch + import vllm.entrypoints.cli.serve + from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG, cli_env_setup + from vllm.utils.argparse_utils import FlexibleArgumentParser + + CMD_MODULES = [ + vllm.entrypoints.cli.openai, + vllm.entrypoints.cli.serve, + vllm.entrypoints.cli.benchmark.main, + vllm.entrypoints.cli.collect_env, + vllm.entrypoints.cli.run_batch, + ] + + cli_env_setup() + + # For 'vllm bench *': use CPU instead of UnspecifiedPlatform by default + if len(sys.argv) > 1 and sys.argv[1] == "bench": + logger.debug( + "Bench command detected, must ensure current platform is not " + "UnspecifiedPlatform to avoid device type inference error" + ) + from vllm import platforms + + if platforms.current_platform.is_unspecified(): + from vllm.platforms.cpu import CpuPlatform + + platforms.current_platform = CpuPlatform() + logger.info( + "Unspecified platform detected, switching to CPU Platform instead." 
+ ) + + parser = FlexibleArgumentParser( + description="vLLM CLI", + epilog=VLLM_SUBCMD_PARSER_EPILOG.format(subcmd="[subcommand]"), + ) + parser.add_argument( + "-v", + "--version", + action="version", + version=importlib.metadata.version("vllm"), + ) + subparsers = parser.add_subparsers(required=False, dest="subparser") + cmds = {} + for cmd_module in CMD_MODULES: + new_cmds = cmd_module.cmd_init() + for cmd in new_cmds: + cmd.subparser_init(subparsers).set_defaults(dispatch_function=cmd.cmd) + cmds[cmd.name] = cmd + args = parser.parse_args() + if args.subparser in cmds: + cmds[args.subparser].validate(args) + + if hasattr(args, "dispatch_function"): + args.dispatch_function(args) + else: + parser.print_help() + + +if __name__ == "__main__": + main() diff --git a/entrypoints/cli/openai.py b/entrypoints/cli/openai.py new file mode 100644 index 0000000..fb49be3 --- /dev/null +++ b/entrypoints/cli/openai.py @@ -0,0 +1,256 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import os +import signal +import sys +from typing import TYPE_CHECKING + +from openai import OpenAI +from openai.types.chat import ChatCompletionMessageParam + +from vllm.entrypoints.cli.types import CLISubcommand + +if TYPE_CHECKING: + from vllm.utils.argparse_utils import FlexibleArgumentParser +else: + FlexibleArgumentParser = argparse.ArgumentParser + + +def _register_signal_handlers(): + def signal_handler(sig, frame): + sys.exit(0) + + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTSTP, signal_handler) + + +def _interactive_cli(args: argparse.Namespace) -> tuple[str, OpenAI]: + _register_signal_handlers() + + base_url = args.url + api_key = args.api_key or os.environ.get("OPENAI_API_KEY", "EMPTY") + openai_client = OpenAI(api_key=api_key, base_url=base_url) + + if args.model_name: + model_name = args.model_name + else: + available_models = openai_client.models.list() + model_name = 
available_models.data[0].id + + print(f"Using model: {model_name}") + + return model_name, openai_client + + +def _print_chat_stream(stream) -> str: + output = "" + for chunk in stream: + delta = chunk.choices[0].delta + if delta.content: + output += delta.content + print(delta.content, end="", flush=True) + print() + return output + + +def _print_completion_stream(stream) -> str: + output = "" + for chunk in stream: + text = chunk.choices[0].text + if text is not None: + output += text + print(text, end="", flush=True) + print() + return output + + +def chat(system_prompt: str | None, model_name: str, client: OpenAI) -> None: + conversation: list[ChatCompletionMessageParam] = [] + if system_prompt is not None: + conversation.append({"role": "system", "content": system_prompt}) + + print("Please enter a message for the chat model:") + while True: + try: + input_message = input("> ") + except EOFError: + break + conversation.append({"role": "user", "content": input_message}) + + stream = client.chat.completions.create( + model=model_name, messages=conversation, stream=True + ) + output = _print_chat_stream(stream) + conversation.append({"role": "assistant", "content": output}) + + +def _add_query_options(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: + parser.add_argument( + "--url", + type=str, + default="http://localhost:8000/v1", + help="url of the running OpenAI-Compatible RESTful API server", + ) + parser.add_argument( + "--model-name", + type=str, + default=None, + help=( + "The model name used in prompt completion, default to " + "the first model in list models API call." + ), + ) + parser.add_argument( + "--api-key", + type=str, + default=None, + help=( + "API key for OpenAI services. If provided, this api key " + "will overwrite the api key obtained through environment variables." 
+ ), + ) + return parser + + +class ChatCommand(CLISubcommand): + """The `chat` subcommand for the vLLM CLI.""" + + name = "chat" + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + model_name, client = _interactive_cli(args) + system_prompt = args.system_prompt + conversation: list[ChatCompletionMessageParam] = [] + + if system_prompt is not None: + conversation.append({"role": "system", "content": system_prompt}) + + if args.quick: + conversation.append({"role": "user", "content": args.quick}) + + stream = client.chat.completions.create( + model=model_name, messages=conversation, stream=True + ) + output = _print_chat_stream(stream) + conversation.append({"role": "assistant", "content": output}) + return + + print("Please enter a message for the chat model:") + while True: + try: + input_message = input("> ") + except EOFError: + break + conversation.append({"role": "user", "content": input_message}) + + stream = client.chat.completions.create( + model=model_name, messages=conversation, stream=True + ) + output = _print_chat_stream(stream) + conversation.append({"role": "assistant", "content": output}) + + @staticmethod + def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: + """Add CLI arguments for the chat command.""" + _add_query_options(parser) + parser.add_argument( + "--system-prompt", + type=str, + default=None, + help=( + "The system prompt to be added to the chat template, " + "used for models that support system prompts." 
+ ), + ) + parser.add_argument( + "-q", + "--quick", + type=str, + metavar="MESSAGE", + help=("Send a single prompt as MESSAGE and print the response, then exit."), + ) + return parser + + def subparser_init( + self, subparsers: argparse._SubParsersAction + ) -> FlexibleArgumentParser: + parser = subparsers.add_parser( + "chat", + help="Generate chat completions via the running API server.", + description="Generate chat completions via the running API server.", + usage="vllm chat [options]", + ) + return ChatCommand.add_cli_args(parser) + + +class CompleteCommand(CLISubcommand): + """The `complete` subcommand for the vLLM CLI.""" + + name = "complete" + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + model_name, client = _interactive_cli(args) + + kwargs = { + "model": model_name, + "stream": True, + } + if args.max_tokens: + kwargs["max_tokens"] = args.max_tokens + + if args.quick: + stream = client.completions.create(prompt=args.quick, **kwargs) + _print_completion_stream(stream) + return + + print("Please enter prompt to complete:") + while True: + try: + input_prompt = input("> ") + except EOFError: + break + stream = client.completions.create(prompt=input_prompt, **kwargs) + _print_completion_stream(stream) + + @staticmethod + def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: + """Add CLI arguments for the complete command.""" + _add_query_options(parser) + parser.add_argument( + "--max-tokens", + type=int, + help="Maximum number of tokens to generate per output sequence.", + ) + parser.add_argument( + "-q", + "--quick", + type=str, + metavar="PROMPT", + help="Send a single prompt and print the completion output, then exit.", + ) + return parser + + def subparser_init( + self, subparsers: argparse._SubParsersAction + ) -> FlexibleArgumentParser: + parser = subparsers.add_parser( + "complete", + help=( + "Generate text completions based on the given prompt " + "via the running API server." 
+ ), + description=( + "Generate text completions based on the given prompt " + "via the running API server." + ), + usage="vllm complete [options]", + ) + return CompleteCommand.add_cli_args(parser) + + +def cmd_init() -> list[CLISubcommand]: + return [ChatCommand(), CompleteCommand()] diff --git a/entrypoints/cli/run_batch.py b/entrypoints/cli/run_batch.py new file mode 100644 index 0000000..64d1bec --- /dev/null +++ b/entrypoints/cli/run_batch.py @@ -0,0 +1,68 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import asyncio +import importlib.metadata +import typing + +from vllm.entrypoints.cli.types import CLISubcommand +from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG +from vllm.logger import init_logger + +if typing.TYPE_CHECKING: + from vllm.utils.argparse_utils import FlexibleArgumentParser +else: + FlexibleArgumentParser = argparse.ArgumentParser + +logger = init_logger(__name__) + + +class RunBatchSubcommand(CLISubcommand): + """The `run-batch` subcommand for vLLM CLI.""" + + name = "run-batch" + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + from vllm.entrypoints.openai.run_batch import main as run_batch_main + + logger.info( + "vLLM batch processing API version %s", importlib.metadata.version("vllm") + ) + logger.info("args: %s", args) + + # Start the Prometheus metrics server. + # LLMEngine uses the Prometheus client + # to publish metrics at the /metrics endpoint. 
+ if args.enable_metrics: + from prometheus_client import start_http_server + + logger.info("Prometheus metrics enabled") + start_http_server(port=args.port, addr=args.url) + else: + logger.info("Prometheus metrics disabled") + + asyncio.run(run_batch_main(args)) + + def subparser_init( + self, subparsers: argparse._SubParsersAction + ) -> FlexibleArgumentParser: + from vllm.entrypoints.openai.run_batch import make_arg_parser + + run_batch_parser = subparsers.add_parser( + self.name, + help="Run batch prompts and write results to file.", + description=( + "Run batch prompts using vLLM's OpenAI-compatible API.\n" + "Supports local or HTTP input/output files." + ), + usage="vllm run-batch -i INPUT.jsonl -o OUTPUT.jsonl --model ", + ) + run_batch_parser = make_arg_parser(run_batch_parser) + run_batch_parser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(subcmd=self.name) + return run_batch_parser + + +def cmd_init() -> list[CLISubcommand]: + return [RunBatchSubcommand()] diff --git a/entrypoints/cli/serve.py b/entrypoints/cli/serve.py new file mode 100644 index 0000000..96608f3 --- /dev/null +++ b/entrypoints/cli/serve.py @@ -0,0 +1,249 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import signal + +import uvloop + +import vllm +import vllm.envs as envs +from vllm.entrypoints.cli.types import CLISubcommand +from vllm.entrypoints.openai.api_server import ( + run_server, + run_server_worker, + setup_server, +) +from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args +from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG +from vllm.logger import init_logger +from vllm.usage.usage_lib import UsageContext +from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.network_utils import get_tcp_uri +from vllm.utils.system_utils import decorate_logs, set_process_title +from vllm.v1.engine.core import EngineCoreProc +from 
vllm.v1.engine.utils import CoreEngineProcManager, launch_core_engines +from vllm.v1.executor import Executor +from vllm.v1.executor.multiproc_executor import MultiprocExecutor +from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus +from vllm.v1.utils import APIServerProcessManager, wait_for_completion_or_failure + +logger = init_logger(__name__) + +DESCRIPTION = """Launch a local OpenAI-compatible API server to serve LLM +completions via HTTP. Defaults to Qwen/Qwen3-0.6B if no model is specified. + +Search by using: `--help=` to explore options by section (e.g., +--help=ModelConfig, --help=Frontend) + Use `--help=all` to show all available flags at once. +""" + + +class ServeSubcommand(CLISubcommand): + """The `serve` subcommand for the vLLM CLI.""" + + name = "serve" + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + # If model is specified in CLI (as positional arg), it takes precedence + if hasattr(args, "model_tag") and args.model_tag is not None: + args.model = args.model_tag + + if args.headless or args.api_server_count < 1: + run_headless(args) + else: + if args.api_server_count > 1: + run_multi_api_server(args) + else: + # Single API server (this process). + uvloop.run(run_server(args)) + + def validate(self, args: argparse.Namespace) -> None: + validate_parsed_serve_args(args) + + def subparser_init( + self, subparsers: argparse._SubParsersAction + ) -> FlexibleArgumentParser: + serve_parser = subparsers.add_parser( + self.name, description=DESCRIPTION, usage="vllm serve [model_tag] [options]" + ) + + serve_parser = make_arg_parser(serve_parser) + serve_parser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(subcmd=self.name) + return serve_parser + + +def cmd_init() -> list[CLISubcommand]: + return [ServeSubcommand()] + + +def run_headless(args: argparse.Namespace): + if args.api_server_count > 1: + raise ValueError("api_server_count can't be set in headless mode") + + # Create the EngineConfig. 
+ engine_args = vllm.AsyncEngineArgs.from_cli_args(args) + usage_context = UsageContext.OPENAI_API_SERVER + vllm_config = engine_args.create_engine_config( + usage_context=usage_context, headless=True + ) + + if engine_args.data_parallel_hybrid_lb: + raise ValueError("data_parallel_hybrid_lb is not applicable in headless mode") + + parallel_config = vllm_config.parallel_config + local_engine_count = parallel_config.data_parallel_size_local + + if local_engine_count <= 0: + raise ValueError("data_parallel_size_local must be > 0 in headless mode") + + shutdown_requested = False + + # Catch SIGTERM and SIGINT to allow graceful shutdown. + def signal_handler(signum, frame): + nonlocal shutdown_requested + logger.debug("Received %d signal.", signum) + if not shutdown_requested: + shutdown_requested = True + raise SystemExit + + signal.signal(signal.SIGTERM, signal_handler) + signal.signal(signal.SIGINT, signal_handler) + + if parallel_config.node_rank_within_dp > 0: + from vllm.version import __version__ as VLLM_VERSION + + # Run headless workers (for multi-node PP/TP). + host = parallel_config.master_addr + head_node_address = f"{host}:{parallel_config.master_port}" + logger.info( + "Launching vLLM (v%s) headless multiproc executor, " + "with head node address %s for torch.distributed process group.", + VLLM_VERSION, + head_node_address, + ) + + executor = MultiprocExecutor(vllm_config, monitor_workers=False) + executor.start_worker_monitor(inline=True) + return + + host = parallel_config.data_parallel_master_ip + port = parallel_config.data_parallel_rpc_port + handshake_address = get_tcp_uri(host, port) + + logger.info( + "Launching %d data parallel engine(s) in headless mode, " + "with head node address %s.", + local_engine_count, + handshake_address, + ) + + # Create the engines. 
+ engine_manager = CoreEngineProcManager( + target_fn=EngineCoreProc.run_engine_core, + local_engine_count=local_engine_count, + start_index=vllm_config.parallel_config.data_parallel_rank, + local_start_index=0, + vllm_config=vllm_config, + local_client=False, + handshake_address=handshake_address, + executor_class=Executor.get_class(vllm_config), + log_stats=not engine_args.disable_log_stats, + ) + + try: + engine_manager.join_first() + finally: + logger.info("Shutting down.") + engine_manager.close() + + +def run_multi_api_server(args: argparse.Namespace): + assert not args.headless + num_api_servers: int = args.api_server_count + assert num_api_servers > 0 + + if num_api_servers > 1: + setup_multiprocess_prometheus() + + listen_address, sock = setup_server(args) + + engine_args = vllm.AsyncEngineArgs.from_cli_args(args) + engine_args._api_process_count = num_api_servers + engine_args._api_process_rank = -1 + + usage_context = UsageContext.OPENAI_API_SERVER + vllm_config = engine_args.create_engine_config(usage_context=usage_context) + + if num_api_servers > 1 and envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: + raise ValueError( + "VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used with api_server_count > 1" + ) + + executor_class = Executor.get_class(vllm_config) + log_stats = not engine_args.disable_log_stats + + parallel_config = vllm_config.parallel_config + dp_rank = parallel_config.data_parallel_rank + external_dp_lb = parallel_config.data_parallel_external_lb + hybrid_dp_lb = parallel_config.data_parallel_hybrid_lb + assert external_dp_lb or hybrid_dp_lb or dp_rank == 0 + + api_server_manager: APIServerProcessManager | None = None + + with launch_core_engines( + vllm_config, executor_class, log_stats, num_api_servers + ) as (local_engine_manager, coordinator, addresses): + # Construct common args for the APIServerProcessManager up-front. 
+ api_server_manager_kwargs = dict( + target_server_fn=run_api_server_worker_proc, + listen_address=listen_address, + sock=sock, + args=args, + num_servers=num_api_servers, + input_addresses=addresses.inputs, + output_addresses=addresses.outputs, + stats_update_address=coordinator.get_stats_publish_address() + if coordinator + else None, + ) + + # For dp ranks > 0 in external/hybrid DP LB modes, we must delay the + # start of the API servers until the local engine is started + # (after the launcher context manager exits), + # since we get the front-end stats update address from the coordinator + # via the handshake with the local engine. + if dp_rank == 0 or not (external_dp_lb or hybrid_dp_lb): + # Start API servers using the manager. + api_server_manager = APIServerProcessManager(**api_server_manager_kwargs) + + # Start API servers now if they weren't already started. + if api_server_manager is None: + api_server_manager_kwargs["stats_update_address"] = ( + addresses.frontend_stats_publish_address + ) + api_server_manager = APIServerProcessManager(**api_server_manager_kwargs) + + # Wait for API servers + wait_for_completion_or_failure( + api_server_manager=api_server_manager, + engine_manager=local_engine_manager, + coordinator=coordinator, + ) + + +def run_api_server_worker_proc( + listen_address, sock, args, client_config=None, **uvicorn_kwargs +) -> None: + """Entrypoint for individual API server worker processes.""" + client_config = client_config or {} + server_index = client_config.get("client_index", 0) + + # Set process title and add process-specific prefix to stdout and stderr. 
+ set_process_title("APIServer", str(server_index)) + decorate_logs() + + uvloop.run( + run_server_worker(listen_address, sock, args, client_config, **uvicorn_kwargs) + ) diff --git a/entrypoints/cli/types.py b/entrypoints/cli/types.py new file mode 100644 index 0000000..f22b844 --- /dev/null +++ b/entrypoints/cli/types.py @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import typing + +if typing.TYPE_CHECKING: + from vllm.utils.argparse_utils import FlexibleArgumentParser +else: + FlexibleArgumentParser = argparse.ArgumentParser + + +class CLISubcommand: + """Base class for CLI argument handlers.""" + + name: str + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + raise NotImplementedError("Subclasses should implement this method") + + def validate(self, args: argparse.Namespace) -> None: + # No validation by default + pass + + def subparser_init( + self, subparsers: argparse._SubParsersAction + ) -> FlexibleArgumentParser: + raise NotImplementedError("Subclasses should implement this method") diff --git a/entrypoints/constants.py b/entrypoints/constants.py new file mode 100644 index 0000000..b5bcccc --- /dev/null +++ b/entrypoints/constants.py @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Shared constants for vLLM entrypoints. 
+""" + +# HTTP header limits for h11 parser +# These constants help mitigate header abuse attacks +H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT = 4194304 # 4 MB +H11_MAX_HEADER_COUNT_DEFAULT = 256 diff --git a/entrypoints/context.py b/entrypoints/context.py new file mode 100644 index 0000000..7a41c66 --- /dev/null +++ b/entrypoints/context.py @@ -0,0 +1,572 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import asyncio +import contextlib +import json +import logging +from abc import ABC, abstractmethod +from contextlib import AsyncExitStack +from typing import TYPE_CHECKING, Union + +from openai.types.responses.tool import Mcp +from openai_harmony import Author, Message, Role, StreamState, TextContent + +from vllm import envs +from vllm.entrypoints.harmony_utils import ( + get_encoding, + get_streamable_parser_for_assistant, + render_for_completion, +) +from vllm.entrypoints.tool import Tool +from vllm.entrypoints.tool_server import ToolServer +from vllm.outputs import RequestOutput + +if TYPE_CHECKING: + from mcp.client import ClientSession + +logger = logging.getLogger(__name__) + +# This is currently needed as the tool type doesn't 1:1 match the +# tool namespace, which is what is used to look up the +# connection to the tool server +_TOOL_NAME_TO_TYPE_MAP = { + "browser": "web_search_preview", + "python": "code_interpreter", + "container": "container", +} + + +def _map_tool_name_to_tool_type(tool_name: str) -> str: + if tool_name not in _TOOL_NAME_TO_TYPE_MAP: + available_tools = ", ".join(_TOOL_NAME_TO_TYPE_MAP.keys()) + raise ValueError( + f"Built-in tool name '{tool_name}' not defined in mapping. 
" + f"Available tools: {available_tools}" + ) + return _TOOL_NAME_TO_TYPE_MAP[tool_name] + + +class TurnMetrics: + """Tracks token and toolcall details for a single conversation turn.""" + + def __init__( + self, + input_tokens=0, + output_tokens=0, + cached_input_tokens=0, + tool_output_tokens=0, + ): + self.input_tokens = input_tokens + self.output_tokens = output_tokens + self.cached_input_tokens = cached_input_tokens + self.tool_output_tokens = tool_output_tokens + + def reset(self): + """Reset counters for a new turn.""" + self.input_tokens = 0 + self.output_tokens = 0 + self.cached_input_tokens = 0 + self.tool_output_tokens = 0 + + def copy(self): + """Create a copy of this turn's token counts.""" + return TurnMetrics( + self.input_tokens, + self.output_tokens, + self.cached_input_tokens, + self.tool_output_tokens, + ) + + +class ConversationContext(ABC): + @abstractmethod + def append_output(self, output: RequestOutput) -> None: + pass + + @abstractmethod + def append_tool_output(self, output) -> None: + pass + + @abstractmethod + async def call_tool(self) -> list[Message]: + pass + + @abstractmethod + def need_builtin_tool_call(self) -> bool: + pass + + @abstractmethod + def render_for_completion(self) -> list[int]: + pass + + @abstractmethod + async def init_tool_sessions( + self, + tool_server: ToolServer | None, + exit_stack: AsyncExitStack, + request_id: str, + mcp_tools: dict[str, Mcp], + ) -> None: + pass + + @abstractmethod + async def cleanup_session(self) -> None: + raise NotImplementedError("Should not be called.") + + +def _create_json_parse_error_messages( + last_msg: Message, e: json.JSONDecodeError +) -> list[Message]: + """ + Creates an error message when json parse failed. + """ + error_msg = ( + f"Error parsing tool arguments as JSON: {str(e)}. " + "Please ensure the tool call arguments are valid JSON and try again." 
class SimpleContext(ConversationContext):
    """Conversation context for requests that never call tools.

    It only remembers the most recent ``RequestOutput`` and the token counts
    derived from it. Every tool-related hook either reports that no tool call
    is needed or raises ``NotImplementedError``.
    """

    def __init__(self):
        self.last_output = None
        self.num_prompt_tokens = 0
        self.num_output_tokens = 0
        self.num_cached_tokens = 0
        # TODO: num_reasoning_tokens is not implemented yet.
        self.num_reasoning_tokens = 0
        # not implemented yet for SimpleContext
        self.all_turn_metrics = []

    def append_output(self, output) -> None:
        """Record ``output`` as the latest result and update token counts.

        Raises:
            ValueError: if ``output`` is not a ``RequestOutput``.
        """
        # Validate before touching any state, so a rejected object is never
        # left behind in ``last_output``.
        if not isinstance(output, RequestOutput):
            raise ValueError("SimpleContext only supports RequestOutput.")
        self.last_output = output
        # Prompt and cached counts are overwritten on every call, while the
        # output count accumulates across calls.
        self.num_prompt_tokens = len(output.prompt_token_ids or [])
        self.num_cached_tokens = output.num_cached_tokens or 0
        self.num_output_tokens += len(output.outputs[0].token_ids or [])

    def append_tool_output(self, output) -> None:
        raise NotImplementedError("Should not be called.")

    def need_builtin_tool_call(self) -> bool:
        """Always False: this context never dispatches built-in tools."""
        return False

    async def call_tool(self) -> list[Message]:
        raise NotImplementedError("Should not be called.")

    def render_for_completion(self) -> list[int]:
        raise NotImplementedError("Should not be called.")

    async def init_tool_sessions(
        self,
        tool_server: ToolServer | None,
        exit_stack: AsyncExitStack,
        request_id: str,
        mcp_tools: dict[str, Mcp],
    ) -> None:
        """No-op: there are no tool sessions to open for this context."""
        pass

    async def cleanup_session(self) -> None:
        raise NotImplementedError("Should not be called.")
def _update_prefill_token_usage(self, output: RequestOutput) -> None:
    """Account for the prompt (prefill) side of one generation turn.

    Records this turn's prompt length, adds it to the running prompt total,
    and on every turn after the first derives the tool-output token count as

        this_turn_prompt - previous_turn_prompt - previous_turn_output

    clamped at zero. Cached-token counts are updated when the output
    reports them.

    Args:
        output: The RequestOutput containing prompt token information
    """
    turn = self.current_turn_metrics
    prompt_ids = output.prompt_token_ids
    if prompt_ids is None:
        prompt_len = 0
        logger.error("RequestOutput appended contains no prompt_token_ids.")
    else:
        prompt_len = len(prompt_ids)

    turn.input_tokens = prompt_len
    self.num_prompt_tokens += prompt_len

    if self.is_first_turn:
        # Nothing to diff against yet; tool tokens start on the next turn.
        self.is_first_turn = False
    else:
        last_turn = self.all_turn_metrics[-1]
        tool_tokens = (
            turn.input_tokens - last_turn.input_tokens - last_turn.output_tokens
        )
        if tool_tokens < 0:
            # Should not happen in normal operation; report it and clamp.
            logger.error(
                "Negative tool output tokens calculated: %d "
                "(current_input=%d, previous_input=%d, "
                "previous_output=%d). Setting to 0.",
                tool_tokens,
                turn.input_tokens,
                last_turn.input_tokens,
                last_turn.output_tokens,
            )
            tool_tokens = 0
        self.num_tool_output_tokens += tool_tokens
        turn.tool_output_tokens = tool_tokens

    cached = output.num_cached_tokens
    if cached is not None:
        self.num_cached_tokens += cached
        turn.cached_input_tokens = cached
def need_builtin_tool_call(self) -> bool:
    """Return True when the latest message is addressed to a built-in tool.

    A message targets a built-in tool when its recipient starts with
    ``"browser."``, ``"python"`` or ``"container."``. A conversation with no
    messages, or a last message without a recipient, yields False.
    """
    # Guard the empty conversation the same way call_tool() does, instead of
    # raising IndexError on self.messages[-1].
    if not self.messages:
        return False
    recipient = self.messages[-1].recipient
    return recipient is not None and recipient.startswith(
        ("browser.", "python", "container.")
    )
async def init_tool_sessions(
    self,
    tool_server: ToolServer | None,
    exit_stack: AsyncExitStack,
    request_id: str,
    mcp_tools: dict[str, Mcp],
) -> None:
    """Open a session for every available tool that does not have one yet.

    Each session is entered on ``exit_stack`` so it is closed when the stack
    unwinds. ``cleanup_session`` is registered on the stack once per call,
    after the sessions, rather than once per opened tool.

    Args:
        tool_server: Source of tool sessions; nothing happens when falsy.
        exit_stack: Stack that owns the opened sessions.
        request_id: Passed through to ``tool_server.new_session``.
        mcp_tools: Per-tool-type MCP config; its ``headers`` are forwarded
            when an entry exists for the tool's type.
    """
    if not tool_server:
        return

    opened_any = False
    try:
        for tool_name in self.available_tools:
            if tool_name in self._tool_sessions:
                continue
            tool_type = _map_tool_name_to_tool_type(tool_name)
            headers = mcp_tools[tool_type].headers if tool_type in mcp_tools else None
            self._tool_sessions[tool_name] = await exit_stack.enter_async_context(
                tool_server.new_session(tool_name, request_id, headers)
            )
            opened_any = True
    finally:
        # The exit stack unwinds LIFO, so registering the cleanup callback
        # last makes it run while every session opened above is still alive.
        # One registration is enough because cleanup_session iterates over
        # all called tools itself. It is still registered if a later session
        # fails to open.
        if opened_any:
            exit_stack.push_async_exit(self.cleanup_session)
async def cleanup_session(self, *args, **kwargs) -> None:
    """Ask every tool session that was used to clean itself up.

    Accepts and ignores arbitrary arguments so it can be registered as an
    ``__aexit__``-style callback.
    """

    async def _release(session):
        # Tool instances are skipped; only non-Tool sessions get the call.
        if isinstance(session, Tool):
            return
        logger.info("Cleaning up tool session for %s", session._client_info)
        # Best effort: a failing cleanup must not break shutdown.
        with contextlib.suppress(Exception):
            await session.call_tool("cleanup_session", {})

    pending = [_release(self._tool_sessions[name]) for name in self.called_tools]
    await asyncio.gather(*pending)
def append_output(self, output: RequestOutput) -> None:
    """Consume one streamed chunk of model output.

    Feeds the chunk's token ids to the parser, updates token accounting,
    closes the turn's metrics when the chunk is marked finished, and copies
    any messages the parser has newly completed into ``_messages``.
    """
    # append_output is called for each output chunk in the streaming case,
    # so the prompt tokens must only be counted once per message.
    if self.first_tok_of_message:
        self._update_prefill_token_usage(output)
    # If this chunk finishes the message, the next chunk starts a new one.
    self.first_tok_of_message = output.finished

    token_ids = output.outputs[0].token_ids
    for tok in token_ids:
        self.parser.process(tok)
    self._update_decode_token_usage(output)

    # For streaming, close out the turn when the message is complete.
    if output.finished:
        self.all_turn_metrics.append(self.current_turn_metrics.copy())
        self.current_turn_metrics.reset()

    # A chunk may carry no token ids. Only count reasoning and move last_tok
    # when something was actually processed; reading the loop variable after
    # an empty loop would raise UnboundLocalError.
    if len(token_ids) > 0:
        # Check if the current token is part of reasoning content
        self._update_num_reasoning_tokens()
        self.last_tok = token_ids[-1]

    num_copied = len(self._messages) - self.num_init_messages
    if num_copied < len(self.parser.messages):
        self._messages.extend(self.parser.messages[num_copied:])
def register_dynamic_lora_routes(router: APIRouter):
    """Attach the LoRA load and unload endpoints to ``router`` and return it.

    Both endpoints are also wrapped by the ``sagemaker_standards`` adapter
    handler decorators with the request shapes given below.
    """

    @sagemaker_standards.register_load_adapter_handler(
        request_shape={
            "lora_name": "body.name",
            "lora_path": "body.src",
        },
    )
    @router.post("/v1/load_lora_adapter", dependencies=[Depends(validate_json_request)])
    async def load_lora_adapter(request: LoadLoRAAdapterRequest, raw_request: Request):
        serving_models: OpenAIServingModels = models(raw_request)
        outcome = await serving_models.load_lora_adapter(request)
        if not isinstance(outcome, ErrorResponse):
            return Response(status_code=200, content=outcome)
        # Errors are returned as JSON using the code carried by the error.
        return JSONResponse(
            content=outcome.model_dump(), status_code=outcome.error.code
        )

    @sagemaker_standards.register_unload_adapter_handler(
        request_shape={
            "lora_name": "path_params.adapter_name",
        }
    )
    @router.post(
        "/v1/unload_lora_adapter", dependencies=[Depends(validate_json_request)]
    )
    async def unload_lora_adapter(
        request: UnloadLoRAAdapterRequest, raw_request: Request
    ):
        serving_models: OpenAIServingModels = models(raw_request)
        outcome = await serving_models.unload_lora_adapter(request)
        if not isinstance(outcome, ErrorResponse):
            return Response(status_code=200, content=outcome)
        return JSONResponse(
            content=outcome.model_dump(), status_code=outcome.error.code
        )

    return router
def has_custom_tools(tool_types: set[str]) -> bool:
    """
    Report whether ``tool_types`` contains any custom tool,
    i.e. any tool type other than the MCP built-in tools.
    """
    only_builtin = tool_types.issubset(MCP_BUILTIN_TOOLS)
    return not only_builtin
def create_tool_definition(tool: ChatCompletionToolsParam | Tool):
    """Build a Harmony ``ToolDescription`` from either supported tool shape.

    Chat-completions tools keep name/description/parameters under
    ``tool.function``; the other tool type exposes them directly.
    """
    spec = tool.function if isinstance(tool, ChatCompletionToolsParam) else tool
    return ToolDescription.new(
        name=spec.name,
        description=spec.description,
        parameters=spec.parameters,
    )
def parse_response_input(
    response_msg: ResponseInputOutputItem,
    prev_responses: list[ResponseOutputItem | ResponseReasoningItem],
) -> Message:
    """Convert one Responses API input item into a Harmony message.

    Handled item types:
      * ``message`` (or no ``type`` key): a role/content message. A
        ``system`` role is rewritten to ``developer`` with an
        ``"Instructions:\\n"`` prefix, and assistant messages are put on the
        ``final`` channel.
      * ``function_call_output``: a tool message authored by
        ``functions.<name>``, where the name comes from the matching call in
        ``prev_responses``.
      * ``reasoning``: an assistant message holding the single reasoning text.
      * ``function_call``: an assistant ``commentary`` message addressed to
        ``functions.<name>`` with JSON content type.

    Args:
        response_msg: The input item, as a dict or an object with
            ``model_dump()``.
        prev_responses: Earlier output items, searched newest-first for the
            function call matching a ``function_call_output``.

    Raises:
        ValueError: for an unknown item type, a ``function_call_output`` with
            no matching call, or a ``reasoning`` item that does not have
            exactly one content part.
    """
    if not isinstance(response_msg, dict):
        response_msg = response_msg.model_dump()
    if "type" not in response_msg or response_msg["type"] == "message":
        role = response_msg["role"]
        content = response_msg["content"]
        if role == "system":
            # User is trying to set a system message. Change it to:
            # <|start|>developer<|message|># Instructions
            # {instructions}<|end|>
            role = "developer"
            text_prefix = "Instructions:\n"
        else:
            text_prefix = ""
        if isinstance(content, str):
            msg = Message.from_role_and_content(role, text_prefix + content)
        else:
            contents = [TextContent(text=text_prefix + c["text"]) for c in content]
            msg = Message.from_role_and_contents(role, contents)
        if role == "assistant":
            msg = msg.with_channel("final")
    elif response_msg["type"] == "function_call_output":
        call_id = response_msg["call_id"]
        call_response: ResponseFunctionToolCall | None = None
        # Search newest-first so the most recent call with this id wins.
        for prev_response in reversed(prev_responses):
            if (
                isinstance(prev_response, ResponseFunctionToolCall)
                and prev_response.call_id == call_id
            ):
                call_response = prev_response
                break
        if call_response is None:
            raise ValueError(f"No call message found for {call_id}")
        msg = Message.from_author_and_content(
            Author.new(Role.TOOL, f"functions.{call_response.name}"),
            response_msg["output"],
        )
    elif response_msg["type"] == "reasoning":
        content = response_msg["content"]
        # This is request validation, so raise explicitly: an assert would be
        # stripped under ``python -O`` and let malformed input through.
        if len(content) != 1:
            raise ValueError(
                "Reasoning input must contain exactly one content part, "
                f"got {len(content)}"
            )
        msg = Message.from_role_and_content(Role.ASSISTANT, content[0]["text"])
    elif response_msg["type"] == "function_call":
        msg = Message.from_role_and_content(Role.ASSISTANT, response_msg["arguments"])
        msg = msg.with_channel("commentary")
        msg = msg.with_recipient(f"functions.{response_msg['name']}")
        msg = msg.with_content_type("json")
    else:
        raise ValueError(f"Unknown input type: {response_msg['type']}")
    return msg
func.get("name", "") + arguments = func.get("arguments", "") or "" + msg = Message.from_role_and_content(Role.ASSISTANT, arguments) + msg = msg.with_channel("commentary") + msg = msg.with_recipient(f"functions.{name}") + msg = msg.with_content_type("json") + msgs.append(msg) + return msgs + + # Tool role message (tool output) + if role == "tool": + name = chat_msg.get("name", "") + content = chat_msg.get("content", "") or "" + if isinstance(content, list): + # Handle array format for tool message content + # by concatenating all text parts. + content = "".join( + item.get("text", "") + for item in content + if isinstance(item, dict) and item.get("type") == "text" + ) + + msg = Message.from_author_and_content( + Author.new(Role.TOOL, f"functions.{name}"), content + ).with_channel("commentary") + return [msg] + + # Default: user/assistant/system messages with content + content = chat_msg.get("content", "") + if isinstance(content, str): + contents = [TextContent(text=content)] + else: + # TODO: Support refusal. 
def _reasoning_items_from(message: Message) -> list[ResponseOutputItem]:
    """Wrap each content part of ``message`` in its own reasoning item."""
    return [
        ResponseReasoningItem(
            id=f"rs_{random_uuid()}",
            summary=[],
            type="reasoning",
            content=[
                ResponseReasoningTextContent(text=content.text, type="reasoning_text")
            ],
            status=None,
        )
        for content in message.content
    ]


def parse_output_message(message: Message) -> list[ResponseOutputItem]:
    """
    Parse a Harmony message into a list of output response items.

    Non-assistant messages produce no items. Messages addressed to
    ``browser.*`` become a web-search call. Otherwise the channel decides:
    ``analysis`` yields reasoning items, ``commentary`` yields function calls
    (``functions.*``) or reasoning items (built-in tools), and ``final``
    yields one output message.

    Raises:
        ValueError: for a browser message without exactly one content part,
            an unknown browser action, an unknown commentary recipient, or
            an unknown channel.
    """
    if message.author.role != "assistant":
        # This is a message from a tool to the assistant (e.g., search result).
        # Don't include it in the final output for now. This aligns with
        # OpenAI's behavior on models like o4-mini.
        return []

    output_items: list[ResponseOutputItem] = []
    recipient = message.recipient
    if recipient is not None and recipient.startswith("browser."):
        if len(message.content) != 1:
            raise ValueError("Invalid number of contents in browser message")
        content = message.content[0]
        # We do not need to check the VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY
        # env variable since if it is not set, we are certain the json is valid
        # The use of Actions for web search will be removed entirely in
        # the future, so this is only necessary temporarily
        try:
            browser_call = json.loads(content.text)
        except json.JSONDecodeError:
            # If the content is not valid JSON, then it was
            # caught and retried by vLLM, which means we
            # need to make note of that so the user is aware
            json_retry_output_message = (
                f"Invalid JSON args, caught and retried: {content.text}"
            )
            browser_call = {
                "query": json_retry_output_message,
                "url": json_retry_output_message,
                "pattern": json_retry_output_message,
            }
        if not isinstance(browser_call, dict):
            # Valid JSON that is not an object carries no named arguments;
            # fall back to the same empty defaults used for missing keys.
            browser_call = {}
        # TODO: translate to url properly!
        if recipient == "browser.search":
            action = ActionSearch(
                query=f"cursor:{browser_call.get('query', '')}", type="search"
            )
        elif recipient == "browser.open":
            action = ActionOpenPage(
                url=f"cursor:{browser_call.get('url', '')}", type="open_page"
            )
        elif recipient == "browser.find":
            action = ActionFind(
                # Default like the sibling fields instead of raising KeyError
                # when the model omitted "pattern".
                pattern=browser_call.get("pattern", ""),
                url=f"cursor:{browser_call.get('url', '')}",
                type="find",
            )
        else:
            raise ValueError(f"Unknown browser action: {recipient}")
        output_items.append(
            ResponseFunctionWebSearch(
                id=f"ws_{random_uuid()}",
                action=action,
                status="completed",
                type="web_search_call",
            )
        )
    elif message.channel == "analysis":
        output_items.extend(_reasoning_items_from(message))
    elif message.channel == "commentary":
        if recipient is not None and recipient.startswith("functions."):
            function_name = recipient.split(".")[-1]
            for content in message.content:
                # call_id and id share one random suffix.
                random_id = random_uuid()
                output_items.append(
                    ResponseFunctionToolCall(
                        arguments=content.text,
                        call_id=f"call_{random_id}",
                        type="function_call",
                        name=function_name,
                        id=f"fc_{random_id}",
                    )
                )
        elif recipient is not None and recipient.startswith(
            ("python", "browser", "container")
        ):
            # Built-in tool calls on the commentary channel are reported as
            # reasoning items.
            output_items.extend(_reasoning_items_from(message))
        else:
            raise ValueError(f"Unknown recipient: {recipient}")
    elif message.channel == "final":
        contents = [
            ResponseOutputText(
                text=content.text,
                annotations=[],  # TODO
                type="output_text",
                logprobs=None,  # TODO
            )
            for content in message.content
        ]
        output_items.append(
            ResponseOutputMessage(
                id=f"msg_{random_uuid()}",
                content=contents,
                role=message.author.role,
                status="completed",
                type="message",
            )
        )
    else:
        raise ValueError(f"Unknown channel: {message.channel}")
    return output_items
parser.process(token_id) + return parser + + +def parse_chat_output( + token_ids: Sequence[int], +) -> tuple[str | None, str | None, bool]: + parser = parse_output_into_messages(token_ids) + output_msgs = parser.messages + is_tool_call = False # TODO: update this when tool call is supported + if len(output_msgs) == 0: + # The generation has stopped during reasoning. + reasoning = parser.current_content + final_content = None + elif len(output_msgs) == 1: + # The generation has stopped during final message. + reasoning = output_msgs[0].content[0].text + final_content = parser.current_content + else: + reasoning_msg = output_msgs[:-1] + final_msg = output_msgs[-1] + reasoning = "\n".join([msg.content[0].text for msg in reasoning_msg]) + final_content = final_msg.content[0].text + return reasoning, final_content, is_tool_call diff --git a/entrypoints/launcher.py b/entrypoints/launcher.py new file mode 100644 index 0000000..cabf95e --- /dev/null +++ b/entrypoints/launcher.py @@ -0,0 +1,175 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import asyncio +import signal +import socket +from http import HTTPStatus +from typing import Any + +import uvicorn +from fastapi import FastAPI, Request, Response + +from vllm import envs +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.constants import ( + H11_MAX_HEADER_COUNT_DEFAULT, + H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT, +) +from vllm.entrypoints.ssl import SSLCertRefresher +from vllm.logger import init_logger +from vllm.utils.network_utils import find_process_using_port +from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError + +logger = init_logger(__name__) + + +async def serve_http( + app: FastAPI, + sock: socket.socket | None, + enable_ssl_refresh: bool = False, + **uvicorn_kwargs: Any, +): + """ + Start a FastAPI app using Uvicorn, with support for custom Uvicorn config + options. 
Supports http header limits via h11_max_incomplete_event_size and + h11_max_header_count. + """ + logger.info("Available routes are:") + for route in app.routes: + methods = getattr(route, "methods", None) + path = getattr(route, "path", None) + + if methods is None or path is None: + continue + + logger.info("Route: %s, Methods: %s", path, ", ".join(methods)) + + # Extract header limit options if present + h11_max_incomplete_event_size = uvicorn_kwargs.pop( + "h11_max_incomplete_event_size", None + ) + h11_max_header_count = uvicorn_kwargs.pop("h11_max_header_count", None) + + # Set safe defaults if not provided + if h11_max_incomplete_event_size is None: + h11_max_incomplete_event_size = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT + if h11_max_header_count is None: + h11_max_header_count = H11_MAX_HEADER_COUNT_DEFAULT + + config = uvicorn.Config(app, **uvicorn_kwargs) + # Set header limits + config.h11_max_incomplete_event_size = h11_max_incomplete_event_size + config.h11_max_header_count = h11_max_header_count + config.load() + server = uvicorn.Server(config) + _add_shutdown_handlers(app, server) + + loop = asyncio.get_running_loop() + + watchdog_task = loop.create_task(watchdog_loop(server, app.state.engine_client)) + server_task = loop.create_task(server.serve(sockets=[sock] if sock else None)) + + ssl_cert_refresher = ( + None + if not enable_ssl_refresh + else SSLCertRefresher( + ssl_context=config.ssl, + key_path=config.ssl_keyfile, + cert_path=config.ssl_certfile, + ca_path=config.ssl_ca_certs, + ) + ) + + def signal_handler() -> None: + # prevents the uvicorn signal handler to exit early + server_task.cancel() + watchdog_task.cancel() + if ssl_cert_refresher: + ssl_cert_refresher.stop() + + async def dummy_shutdown() -> None: + pass + + loop.add_signal_handler(signal.SIGINT, signal_handler) + loop.add_signal_handler(signal.SIGTERM, signal_handler) + + try: + await server_task + return dummy_shutdown() + except asyncio.CancelledError: + port = 
uvicorn_kwargs["port"] + process = find_process_using_port(port) + if process is not None: + logger.warning( + "port %s is used by process %s launched with command:\n%s", + port, + process, + " ".join(process.cmdline()), + ) + logger.info("Shutting down FastAPI HTTP server.") + return server.shutdown() + finally: + watchdog_task.cancel() + + +async def watchdog_loop(server: uvicorn.Server, engine: EngineClient): + """ + # Watchdog task that runs in the background, checking + # for error state in the engine. Needed to trigger shutdown + # if an exception arises in a StreamingResponse() generator. + """ + VLLM_WATCHDOG_TIME_S = 5.0 + while True: + await asyncio.sleep(VLLM_WATCHDOG_TIME_S) + terminate_if_errored(server, engine) + + +def terminate_if_errored(server: uvicorn.Server, engine: EngineClient): + """ + See discussions here on shutting down a uvicorn server + https://github.com/encode/uvicorn/discussions/1103 + In this case we cannot await the server shutdown here + because handler must first return to close the connection + for this request. + """ + engine_errored = engine.errored and not engine.is_running + if not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine_errored: + server.should_exit = True + + +def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None: + """ + VLLM V1 AsyncLLM catches exceptions and returns + only two types: EngineGenerateError and EngineDeadError. + + EngineGenerateError is raised by the per request generate() + method. This error could be request specific (and therefore + recoverable - e.g. if there is an error in input processing). + + EngineDeadError is raised by the background output_handler + method. This error is global and therefore not recoverable. + + We register these @app.exception_handlers to return nice + responses to the end user if they occur and shut down if needed. + See https://fastapi.tiangolo.com/tutorial/handling-errors/ + for more details on how exception handlers work.
+ + If an exception is encountered in a StreamingResponse + generator, the exception is not raised, since we already sent + a 200 status. Rather, we send an error message as the next chunk. + Since the exception is not raised, this means that the server + will not automatically shut down. Instead, we use the watchdog + background task to check for an errored state. + """ + + @app.exception_handler(RuntimeError) + @app.exception_handler(EngineDeadError) + @app.exception_handler(EngineGenerateError) + async def runtime_exception_handler(request: Request, __): + terminate_if_errored( + server=server, + engine=request.app.state.engine_client, + ) + + return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR) diff --git a/entrypoints/llm.py b/entrypoints/llm.py new file mode 100644 index 0000000..b0786bd --- /dev/null +++ b/entrypoints/llm.py @@ -0,0 +1,1768 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import itertools +from collections.abc import Callable, Sequence +from typing import TYPE_CHECKING, Any, cast + +import cloudpickle +import torch.nn as nn +from pydantic import ValidationError +from tqdm.auto import tqdm +from typing_extensions import TypeVar, deprecated + +from vllm.beam_search import ( + BeamSearchInstance, + BeamSearchOutput, + BeamSearchSequence, + create_sort_beams_key_function, +) +from vllm.config import ( + CompilationConfig, + PoolerConfig, + StructuredOutputsConfig, + is_init_field, +) +from vllm.config.compilation import CompilationMode +from vllm.config.model import ( + ConvertOption, + HfOverrides, + ModelDType, + RunnerOption, + TokenizerMode, +) +from vllm.engine.arg_utils import EngineArgs +from vllm.entrypoints.chat_utils import ( + ChatCompletionMessageParam, + ChatTemplateContentFormatOption, + apply_hf_chat_template, + apply_mistral_chat_template, + parse_chat_messages, + resolve_chat_template_content_format, +) +from vllm.entrypoints.score_utils import ( +
ScoreContentPartParam, + ScoreMultiModalParam, + _cosine_similarity, + _validate_score_input_lens, + compress_token_type_ids, + get_score_prompt, +) +from vllm.entrypoints.utils import _validate_truncation_size, log_non_default_args +from vllm.inputs import ( + DataPrompt, + PromptType, + SingletonPrompt, + TextPrompt, + TokensPrompt, +) +from vllm.inputs.parse import get_prompt_components +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.model_executor.layers.quantization import QuantizationMethods +from vllm.outputs import ( + ClassificationRequestOutput, + EmbeddingRequestOutput, + PoolingRequestOutput, + RequestOutput, + ScoringRequestOutput, +) +from vllm.platforms import current_platform +from vllm.pooling_params import PoolingParams +from vllm.sampling_params import BeamSearchParams, RequestOutputKind, SamplingParams +from vllm.tasks import PoolingTask +from vllm.transformers_utils.tokenizer import ( + AnyTokenizer, + MistralTokenizer, + get_cached_tokenizer, +) +from vllm.usage.usage_lib import UsageContext +from vllm.utils.collection_utils import as_iter, is_list_of +from vllm.utils.counter import Counter +from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine.llm_engine import LLMEngine +from vllm.v1.sample.logits_processor import LogitsProcessor + +if TYPE_CHECKING: + from vllm.v1.metrics.reader import Metric + +logger = init_logger(__name__) + +_R = TypeVar("_R", default=Any) + + +class LLM: + """An LLM for generating texts from given prompts and sampling parameters. + + This class includes a tokenizer, a language model (possibly distributed + across multiple GPUs), and GPU memory space allocated for intermediate + states (aka KV cache). Given a batch of prompts and sampling parameters, + this class generates texts from the model, using an intelligent batching + mechanism and efficient memory management. + + Args: + model: The name or path of a HuggingFace Transformers model. 
+ tokenizer: The name or path of a HuggingFace Transformers tokenizer. + tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer + if available, and "slow" will always use the slow tokenizer. + skip_tokenizer_init: If true, skip initialization of tokenizer and + detokenizer. Expect valid prompt_token_ids and None for prompt + from the input. + trust_remote_code: Trust remote code (e.g., from HuggingFace) when + downloading the model and tokenizer. + allowed_local_media_path: Allowing API requests to read local images + or videos from directories specified by the server file system. + This is a security risk. Should only be enabled in trusted + environments. + allowed_media_domains: If set, only media URLs that belong to this + domain can be used for multi-modal inputs. + tensor_parallel_size: The number of GPUs to use for distributed + execution with tensor parallelism. + dtype: The data type for the model weights and activations. Currently, + we support `float32`, `float16`, and `bfloat16`. If `auto`, we use + the `dtype` attribute of the Transformers model's config. However, + if the `dtype` in the config is `float32`, we will use `float16` instead. + quantization: The method used to quantize the model weights. Currently, + we support "awq", "gptq", and "fp8" (experimental). + If None, we first check the `quantization_config` attribute in the + model config file. If that is None, we assume the model weights are + not quantized and use `dtype` to determine the data type of + the weights. + revision: The specific model version to use. It can be a branch name, + a tag name, or a commit id. + tokenizer_revision: The specific tokenizer version to use. It can be a + branch name, a tag name, or a commit id. + seed: The seed to initialize the random number generator for sampling. + gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to + reserve for the model weights, activations, and KV cache. 
Higher + values will increase the KV cache size and thus improve the model's + throughput. However, if the value is too high, it may cause out-of- + memory (OOM) errors. + kv_cache_memory_bytes: Size of KV Cache per GPU in bytes. By default, + this is set to None and vllm can automatically infer the kv cache + size based on gpu_memory_utilization. However, users may want to + manually specify the kv cache memory size. kv_cache_memory_bytes + allows more fine-grain control of how much memory gets used when + compared with using gpu_memory_utilization. Note that + kv_cache_memory_bytes (when not-None) ignores + gpu_memory_utilization + swap_space: The size (GiB) of CPU memory per GPU to use as swap space. + This can be used for temporarily storing the states of the requests + when their `best_of` sampling parameters are larger than 1. If all + requests will have `best_of=1`, you can safely set this to 0. + Noting that `best_of` is only supported in V0. Otherwise, too small + values may cause out-of-memory (OOM) errors. + cpu_offload_gb: The size (GiB) of CPU memory to use for offloading + the model weights. This virtually increases the GPU memory space + you can use to hold the model weights, at the cost of CPU-GPU data + transfer for every forward pass. + enforce_eager: Whether to enforce eager execution. If True, we will + disable CUDA graph and always execute the model in eager mode. + If False, we will use CUDA graph and eager execution in hybrid. + disable_custom_all_reduce: See + [ParallelConfig][vllm.config.ParallelConfig]. + hf_token: The token to use as HTTP bearer authorization for remote files + . If `True`, will use the token generated when running + `huggingface-cli login` (stored in `~/.huggingface`). + hf_overrides: If a dictionary, contains arguments to be forwarded to the + HuggingFace config. If a callable, it is called to update the + HuggingFace config. 
+ mm_processor_kwargs: Arguments to be forwarded to the model's processor + for multi-modal data, e.g., image processor. Overrides for the + multi-modal processor obtained from `AutoProcessor.from_pretrained`. + The available overrides depend on the model that is being run. + For example, for Phi-3-Vision: `{"num_crops": 4}`. + pooler_config: Initialize non-default pooling config for the pooling + model. e.g. `PoolerConfig(pooling_type="mean", normalize=False)`. + override_pooler_config: [DEPRECATED] Use `pooler_config` instead. This + argument is deprecated and will be removed in v0.12.0 or v1.0.0, + whichever is sooner. + compilation_config: Either an integer or a dictionary. If it is an + integer, it is used as the mode of compilation optimization. If it + is a dictionary, it can specify the full compilation configuration. + **kwargs: Arguments for [`EngineArgs`][vllm.EngineArgs]. + + Note: + This class is intended to be used for offline inference. For online + serving, use the [AsyncLLMEngine][vllm.AsyncLLMEngine] class instead. 
+ """ + + def __init__( + self, + model: str, + *, + runner: RunnerOption = "auto", + convert: ConvertOption = "auto", + tokenizer: str | None = None, + tokenizer_mode: TokenizerMode = "auto", + skip_tokenizer_init: bool = False, + trust_remote_code: bool = False, + allowed_local_media_path: str = "", + allowed_media_domains: list[str] | None = None, + tensor_parallel_size: int = 1, + dtype: ModelDType = "auto", + quantization: QuantizationMethods | None = None, + revision: str | None = None, + tokenizer_revision: str | None = None, + seed: int | None = None, + gpu_memory_utilization: float = 0.9, + swap_space: float = 4, + cpu_offload_gb: float = 0, + enforce_eager: bool = False, + disable_custom_all_reduce: bool = False, + hf_token: bool | str | None = None, + hf_overrides: HfOverrides | None = None, + mm_processor_kwargs: dict[str, Any] | None = None, + pooler_config: PoolerConfig | None = None, + override_pooler_config: PoolerConfig | None = None, + structured_outputs_config: dict[str, Any] + | StructuredOutputsConfig + | None = None, + kv_cache_memory_bytes: int | None = None, + compilation_config: int | dict[str, Any] | CompilationConfig | None = None, + logits_processors: list[str | type[LogitsProcessor]] | None = None, + **kwargs: Any, + ) -> None: + """LLM constructor.""" + + if "disable_log_stats" not in kwargs: + kwargs["disable_log_stats"] = True + + if "worker_cls" in kwargs: + worker_cls = kwargs["worker_cls"] + # if the worker_cls is not qualified string name, + # we serialize it using cloudpickle to avoid pickling issues + if isinstance(worker_cls, type): + kwargs["worker_cls"] = cloudpickle.dumps(worker_cls) + + if "kv_transfer_config" in kwargs and isinstance( + kwargs["kv_transfer_config"], dict + ): + from vllm.config.kv_transfer import KVTransferConfig + + raw_config_dict = kwargs["kv_transfer_config"] + try: + kwargs["kv_transfer_config"] = KVTransferConfig(**raw_config_dict) + except ValidationError as e: + logger.error( + "Failed to convert 
'kv_transfer_config' dict to " + "KVTransferConfig object. Dict: %s. Error: %s", + raw_config_dict, + e, + ) + # Consider re-raising a more specific vLLM error or ValueError + # to provide better context to the user. + raise ValueError(f"Invalid 'kv_transfer_config' provided: {e}") from e + + if hf_overrides is None: + hf_overrides = {} + + if compilation_config is not None: + if isinstance(compilation_config, int): + compilation_config_instance = CompilationConfig( + mode=CompilationMode(compilation_config) + ) + elif isinstance(compilation_config, dict): + compilation_config_instance = CompilationConfig( + **{ + k: v + for k, v in compilation_config.items() + if is_init_field(CompilationConfig, k) + } + ) + else: + compilation_config_instance = compilation_config + else: + compilation_config_instance = CompilationConfig() + + if structured_outputs_config is not None: + if isinstance(structured_outputs_config, dict): + structured_outputs_instance = StructuredOutputsConfig( + **{ + k: v + for k, v in structured_outputs_config.items() + if is_init_field(StructuredOutputsConfig, k) + } + ) + else: + structured_outputs_instance = structured_outputs_config + else: + structured_outputs_instance = StructuredOutputsConfig() + + # warn about single-process data parallel usage. + _dp_size = int(kwargs.get("data_parallel_size", 1)) + _distributed_executor_backend = kwargs.get("distributed_executor_backend") + if ( + _dp_size > 1 + and not _distributed_executor_backend == "external_launcher" + and not current_platform.is_tpu() + ): + raise ValueError( + f"LLM(data_parallel_size={_dp_size}) is not supported for single-" + "process usage and may hang. Please use " + "the explicit multi-process data-parallel example at " + "'examples/offline_inference/data_parallel.py'." 
+ ) + + engine_args = EngineArgs( + model=model, + runner=runner, + convert=convert, + tokenizer=tokenizer, + tokenizer_mode=tokenizer_mode, + skip_tokenizer_init=skip_tokenizer_init, + trust_remote_code=trust_remote_code, + allowed_local_media_path=allowed_local_media_path, + allowed_media_domains=allowed_media_domains, + tensor_parallel_size=tensor_parallel_size, + dtype=dtype, + quantization=quantization, + revision=revision, + tokenizer_revision=tokenizer_revision, + seed=seed, + gpu_memory_utilization=gpu_memory_utilization, + kv_cache_memory_bytes=kv_cache_memory_bytes, + swap_space=swap_space, + cpu_offload_gb=cpu_offload_gb, + enforce_eager=enforce_eager, + disable_custom_all_reduce=disable_custom_all_reduce, + hf_token=hf_token, + hf_overrides=hf_overrides, + mm_processor_kwargs=mm_processor_kwargs, + pooler_config=pooler_config, + override_pooler_config=override_pooler_config, + structured_outputs_config=structured_outputs_instance, + compilation_config=compilation_config_instance, + logits_processors=logits_processors, + **kwargs, + ) + + log_non_default_args(engine_args) + + # Create the Engine (autoselects V0 vs V1) + self.llm_engine = LLMEngine.from_engine_args( + engine_args=engine_args, usage_context=UsageContext.LLM_CLASS + ) + self.engine_class = type(self.llm_engine) + + self.request_counter = Counter() + self.default_sampling_params: dict[str, Any] | None = None + + supported_tasks = self.llm_engine.get_supported_tasks() + logger.info("Supported tasks: %s", supported_tasks) + self.supported_tasks = supported_tasks + + self.model_config = self.llm_engine.model_config + self.processor = self.llm_engine.processor + self.io_processor = self.llm_engine.io_processor + + def get_tokenizer(self) -> AnyTokenizer: + return self.llm_engine.get_tokenizer() + + @deprecated("`set_tokenizer` is deprecated and will be removed in v0.13.") + def set_tokenizer(self, tokenizer: AnyTokenizer) -> None: + # While CachedTokenizer is dynamic, have no choice but + # 
compare class name. Misjudgment will arise from + # user-defined tokenizer started with 'Cached' + if tokenizer.__class__.__name__.startswith("Cached"): + self.llm_engine.tokenizer = tokenizer + else: + self.llm_engine.tokenizer = get_cached_tokenizer(tokenizer) + + def reset_mm_cache(self) -> None: + self.processor.clear_mm_cache() + self.llm_engine.reset_mm_cache() + + def get_default_sampling_params(self) -> SamplingParams: + if self.default_sampling_params is None: + self.default_sampling_params = self.model_config.get_diff_sampling_param() + if self.default_sampling_params: + return SamplingParams.from_optional(**self.default_sampling_params) + return SamplingParams() + + def generate( + self, + prompts: PromptType | Sequence[PromptType], + sampling_params: SamplingParams | Sequence[SamplingParams] | None = None, + *, + use_tqdm: bool | Callable[..., tqdm] = True, + lora_request: list[LoRARequest] | LoRARequest | None = None, + priority: list[int] | None = None, + ) -> list[RequestOutput]: + """Generates the completions for the input prompts. + + This class automatically batches the given prompts, considering + the memory constraint. For the best performance, put all of your prompts + into a single list and pass it to this method. + + Args: + prompts: The prompts to the LLM. You may pass a sequence of prompts + for batch inference. See [PromptType][vllm.inputs.PromptType] + for more details about the format of each prompt. + sampling_params: The sampling parameters for text generation. If + None, we use the default sampling parameters. + When it is a single value, it is applied to every prompt. + When it is a list, the list must have the same length as the + prompts and it is paired one by one with the prompt. + use_tqdm: If `True`, shows a tqdm progress bar. + If a callable (e.g., `functools.partial(tqdm, leave=False)`), + it is used to create the progress bar. + If `False`, no progress bar is created. 
+ lora_request: LoRA request to use for generation, if any. + priority: The priority of the requests, if any. + Only applicable when priority scheduling policy is enabled. + + Returns: + A list of `RequestOutput` objects containing the + generated completions in the same order as the input prompts. + + Note: + Using `prompts` and `prompt_token_ids` as keyword parameters is + considered legacy and may be deprecated in the future. You should + instead pass them via the `inputs` parameter. + """ + model_config = self.model_config + runner_type = model_config.runner_type + if runner_type != "generate": + raise ValueError( + "LLM.generate() is only supported for generative models. " + "Try passing `--runner generate` to use the model as a " + "generative model." + ) + + if sampling_params is None: + # Use default sampling params. + sampling_params = self.get_default_sampling_params() + + # Add any modality specific loras to the corresponding prompts + lora_request = self._get_modality_specific_lora_reqs(prompts, lora_request) + + self._validate_and_add_requests( + prompts=prompts, + params=sampling_params, + use_tqdm=use_tqdm, + lora_request=lora_request, + priority=priority, + ) + + outputs = self._run_engine(use_tqdm=use_tqdm) + return self.engine_class.validate_outputs(outputs, RequestOutput) + + def _get_modality_specific_lora_reqs( + self, + prompts: PromptType | Sequence[PromptType], + lora_request: list[LoRARequest] | LoRARequest | None, + ): + # Grab the lora config off the vllm config on the engine, + # since this is the same for both v0 & v1. + lora_config = self.llm_engine.vllm_config.lora_config + + # If there's no lora config / default_mm_loras, or the model + # isn't multimodal, leave the lora as is. 
+ if ( + lora_config is None + or not self.model_config.is_multimodal_model + or (lora_config and lora_config.default_mm_loras is None) + ): + return lora_request + + if isinstance(prompts, str) or not isinstance(prompts, Sequence): + prompts = [prompts]  # a bare str is one prompt, not a sequence + + optional_loras = ( + [lora_request] * len(prompts) + if not isinstance(lora_request, Sequence) + else lora_request + ) + + return [ + self._resolve_single_prompt_mm_lora( + prompt, + opt_lora_req, + lora_config.default_mm_loras, + ) + for prompt, opt_lora_req in zip(prompts, optional_loras) + ] + + def _resolve_single_prompt_mm_lora( + self, + prompt: PromptType, + lora_request: LoRARequest | None, + default_mm_loras: dict[str, str] | None, + ): + if ( + not default_mm_loras + or not isinstance(prompt, dict) + or not (mm_data := prompt.get("multi_modal_data") or {}) + ): + return lora_request + + intersection = set( + mm_data.keys() # type: ignore + ).intersection(default_mm_loras.keys()) + if not intersection: + return lora_request + if len(intersection) > 1: + # TODO: Would be nice to be able to have multiple loras per prompt + logger.warning( + "Multiple modality specific loras were registered and would be" + " used by a single prompt consuming several modalities; " + " currently we only support one lora per request; as such," + " lora(s) registered with modalities: %s" + " will be skipped", + intersection, + ) + return lora_request + + # Build the LoRA request; the ID of the default mm lora is the + # index of the modality name sorted alphabetically + 1. + modality_name = intersection.pop() + modality_lora_path = default_mm_loras[modality_name] + modality_lora_id = sorted(default_mm_loras).index(modality_name) + 1 + + # If we have a collision, warn if there is a collision, + # but always send the explicitly provided request.
+ if lora_request: + if lora_request.lora_int_id != modality_lora_id: + logger.warning( + "A modality with a registered lora and a lora_request " + "with a different ID were provided; falling back to the " + "lora_request as we only apply one LoRARequest per prompt" + ) + return lora_request + + return LoRARequest( + modality_name, + modality_lora_id, + modality_lora_path, + ) + + def collective_rpc( + self, + method: str | Callable[..., _R], + timeout: float | None = None, + args: tuple = (), + kwargs: dict[str, Any] | None = None, + ) -> list[_R]: + """ + Execute an RPC call on all workers. + + Args: + method: Name of the worker method to execute, or a callable that + is serialized and sent to all workers to execute. + + If the method is a callable, it should accept an additional + `self` argument, in addition to the arguments passed in `args` + and `kwargs`. The `self` argument will be the worker object. + timeout: Maximum time in seconds to wait for execution. Raises a + [`TimeoutError`][] on timeout. `None` means wait indefinitely. + args: Positional arguments to pass to the worker method. + kwargs: Keyword arguments to pass to the worker method. + + Returns: + A list containing the results from each worker. + + Note: + It is recommended to use this API to only pass control messages, + and set up data-plane communication to pass data. + """ + + return self.llm_engine.collective_rpc(method, timeout, args, kwargs) + + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + """ + Run a function directly on the model inside each worker, + returning the result for each of them. + + !!! warning + To reduce the overhead of data transfer, avoid returning large + arrays or tensors from this method. If you must return them, + make sure you move them to CPU first to avoid taking up additional + VRAM! 
+ """ + return self.llm_engine.apply_model(func) + + def _get_beam_search_lora_requests( + self, + lora_request: list[LoRARequest] | LoRARequest | None, + prompts: list[TokensPrompt | TextPrompt], + ) -> list[LoRARequest | None]: + """Get the optional lora request corresponding to each prompt.""" + if lora_request is None or isinstance(lora_request, LoRARequest): + return [lora_request] * len(prompts)  # broadcast to every prompt + if not isinstance(lora_request, Sequence): + raise TypeError(f"Invalid lora_request type {type(lora_request)}") + if len(lora_request) != len(prompts): + raise ValueError( + "Lora request list should be the same length as the prompts" + ) + return list(lora_request)  # per-prompt list, already aligned + + def beam_search( + self, + prompts: list[TokensPrompt | TextPrompt], + params: BeamSearchParams, + lora_request: list[LoRARequest] | LoRARequest | None = None, + use_tqdm: bool = False, + concurrency_limit: int | None = None, + ) -> list[BeamSearchOutput]: + """ + Generate sequences using beam search. + + Args: + prompts: A list of prompts. Each prompt can be a string or a list + of token IDs. + params: The beam search parameters. + lora_request: LoRA request to use for generation, if any. + use_tqdm: Whether to use tqdm to display the progress bar. + concurrency_limit: The maximum number of concurrent requests. + If None, the number of concurrent requests is unlimited. + """ + # TODO: how does beam search work together with length penalty, + # frequency, penalty, and stopping criteria, etc.? + beam_width = params.beam_width + max_tokens = params.max_tokens + temperature = params.temperature + ignore_eos = params.ignore_eos + length_penalty = params.length_penalty + + lora_requests = self._get_beam_search_lora_requests(lora_request, prompts) + + tokenizer = self.get_tokenizer() + sort_beams_key = create_sort_beams_key_function( + tokenizer.eos_token_id, + length_penalty, + ) + + if use_tqdm and concurrency_limit is not None: + logger.warning( + "Progress bar is not supported when using concurrency_limit.
" + "Disabling progress bar." + ) + use_tqdm = False + + if concurrency_limit is None: + concurrency_limit = len(prompts) + + def create_tokens_prompt_from_beam(beam: BeamSearchSequence) -> TokensPrompt: + token_prompt_kwargs: TokensPrompt = {"prompt_token_ids": beam.tokens} + if beam.multi_modal_data is not None: + token_prompt_kwargs["multi_modal_data"] = beam.multi_modal_data + + if beam.mm_processor_kwargs is not None: + token_prompt_kwargs["mm_processor_kwargs"] = beam.mm_processor_kwargs + return TokensPrompt(**token_prompt_kwargs) + + # generate 2 * beam_width candidates at each step + # following the huggingface transformers implementation + # at https://github.com/huggingface/transformers/blob/e15687fffe5c9d20598a19aeab721ae0a7580f8a/src/transformers/generation/beam_search.py#L534 # noqa + beam_search_params = SamplingParams( + logprobs=2 * beam_width, max_tokens=1, temperature=temperature + ) + instances: list[BeamSearchInstance] = [] + + for lora_req, prompt in zip(lora_requests, prompts): + # Add multimodal processor kwargs & data + mm_kwargs = {} + if "multi_modal_data" in prompt: + mm_kwargs["multi_modal_data"] = prompt["multi_modal_data"] + if "mm_processor_kwargs" in prompt: + mm_kwargs["mm_processor_kwargs"] = prompt["mm_processor_kwargs"] + + if "prompt_token_ids" in prompt: + prompt = cast(TokensPrompt, prompt) # Needed for mypy + prompt_tokens = prompt["prompt_token_ids"] + else: + prompt_tokens = tokenizer.encode(prompt["prompt"]) + + instances.append( + BeamSearchInstance( + prompt_tokens, + lora_request=lora_req, + logprobs=None, + **mm_kwargs, + ), + ) + + for prompt_start in range(0, len(prompts), concurrency_limit): + instances_batch = instances[prompt_start : prompt_start + concurrency_limit] + + token_iter = range(max_tokens) + if use_tqdm: + token_iter = tqdm( + token_iter, desc="Beam search", unit="token", unit_scale=False + ) + logger.warning( + "The progress bar shows the upper bound on token steps and " + "may finish early due to 
stopping conditions. It does not " + "reflect instance-level progress." + ) + for _ in token_iter: + all_beams: list[BeamSearchSequence] = [  # flatten in one linear pass + beam for instance in instances_batch for beam in instance.beams + ] + pos = [0] + list( + itertools.accumulate( + len(instance.beams) for instance in instances_batch + ) + ) + instance_start_and_end: list[tuple[int, int]] = list( + zip(pos[:-1], pos[1:]) + ) + + if len(all_beams) == 0: + break + + # create corresponding batch entries for prompt & optional lora + prompts_batch, lora_req_batch = zip( + *[ + (create_tokens_prompt_from_beam(beam), beam.lora_request) + for beam in all_beams + ] + ) + + # only runs for one step + # we don't need to use tqdm here + output = self.generate( + prompts_batch, + sampling_params=beam_search_params, + use_tqdm=False, + lora_request=lora_req_batch, + ) + + for (start, end), instance in zip( + instance_start_and_end, instances_batch + ): + instance_new_beams = [] + for i in range(start, end): + current_beam = all_beams[i] + result = output[i] + + if result.outputs[0].logprobs is not None: + # if `result.outputs[0].logprobs` is None, it means + # the sequence is completed because of the + # max-model-len or abortion. we don't need to add + # it to the new beams.
+ logprobs = result.outputs[0].logprobs[0] + for token_id, logprob_obj in logprobs.items(): + new_beam = BeamSearchSequence( + tokens=current_beam.tokens + [token_id], + logprobs=current_beam.logprobs + [logprobs], + lora_request=current_beam.lora_request, + cum_logprob=current_beam.cum_logprob + + logprob_obj.logprob, + multi_modal_data=current_beam.multi_modal_data, + mm_processor_kwargs=current_beam.mm_processor_kwargs, + ) + + if ( + token_id == tokenizer.eos_token_id + and not ignore_eos + ): + instance.completed.append(new_beam) + else: + instance_new_beams.append(new_beam) + sorted_beams = sorted( + instance_new_beams, key=sort_beams_key, reverse=True + ) + instance.beams = sorted_beams[:beam_width] + + outputs = [] + for instance in instances: + instance.completed.extend(instance.beams) + sorted_completed = sorted( + instance.completed, key=sort_beams_key, reverse=True + ) + best_beams = sorted_completed[:beam_width] + + for beam in best_beams: + beam.text = tokenizer.decode(beam.tokens) + outputs.append(BeamSearchOutput(sequences=best_beams)) + + return outputs + + def preprocess_chat( + self, + messages: list[ChatCompletionMessageParam] + | list[list[ChatCompletionMessageParam]], + chat_template: str | None = None, + chat_template_content_format: ChatTemplateContentFormatOption = "auto", + add_generation_prompt: bool = True, + continue_final_message: bool = False, + tools: list[dict[str, Any]] | None = None, + chat_template_kwargs: dict[str, Any] | None = None, + mm_processor_kwargs: dict[str, Any] | None = None, + ) -> list[TokensPrompt]: + """ + Generate prompt for a chat conversation. The pre-processed + prompt can then be used as input for the other LLM methods. + + Refer to `chat` for a complete description of the arguments. + Returns: + A list of `TokensPrompts` objects containing the tokenized + prompt after chat template interpolation, and the + pre-processed multi-modal inputs. 
+ """ + list_of_messages: list[list[ChatCompletionMessageParam]] + + # Handle multi and single conversations + if is_list_of(messages, list): + # messages is list[list[...]] + list_of_messages = cast(list[list[ChatCompletionMessageParam]], messages) + else: + # messages is list[...] + list_of_messages = [cast(list[ChatCompletionMessageParam], messages)] + + tokenizer = self.get_tokenizer() + model_config = self.model_config + resolved_content_format = resolve_chat_template_content_format( + chat_template, + tools, + chat_template_content_format, + tokenizer, + model_config=model_config, + ) + + _chat_template_kwargs: dict[str, Any] = dict( + chat_template=chat_template, + add_generation_prompt=add_generation_prompt, + continue_final_message=continue_final_message, + tools=tools, + ) + _chat_template_kwargs.update(chat_template_kwargs or {}) + + prompts: list[TokensPrompt] = [] + + for msgs in list_of_messages: + # NOTE: _parse_chat_message_content_parts() currently doesn't + # handle mm_processor_kwargs, since there is no implementation in + # the chat message parsing for it. + conversation, mm_data, mm_uuids = parse_chat_messages( + msgs, + model_config, + tokenizer, + content_format=resolved_content_format, + ) + + if isinstance(tokenizer, MistralTokenizer): + prompt_token_ids = apply_mistral_chat_template( + tokenizer, + messages=msgs, + **_chat_template_kwargs, + ) + else: + prompt_str = apply_hf_chat_template( + tokenizer=tokenizer, + conversation=conversation, + model_config=model_config, + **_chat_template_kwargs, + ) + # Special tokens are already included in chat templates so + # should not be added by the tokenizer in this case. 
+ prompt_token_ids = tokenizer.encode( + prompt_str, add_special_tokens=False + ) + + prompt = TokensPrompt(prompt_token_ids=prompt_token_ids) + + if mm_data is not None: + prompt["multi_modal_data"] = mm_data + + if mm_uuids is not None: + prompt["multi_modal_uuids"] = mm_uuids + + if mm_processor_kwargs is not None: + prompt["mm_processor_kwargs"] = mm_processor_kwargs + + prompts.append(prompt) + + return prompts + + def chat( + self, + messages: list[ChatCompletionMessageParam] + | list[list[ChatCompletionMessageParam]], + sampling_params: SamplingParams | list[SamplingParams] | None = None, + use_tqdm: bool | Callable[..., tqdm] = True, + lora_request: LoRARequest | None = None, + chat_template: str | None = None, + chat_template_content_format: ChatTemplateContentFormatOption = "auto", + add_generation_prompt: bool = True, + continue_final_message: bool = False, + tools: list[dict[str, Any]] | None = None, + chat_template_kwargs: dict[str, Any] | None = None, + mm_processor_kwargs: dict[str, Any] | None = None, + ) -> list[RequestOutput]: + """ + Generate responses for a chat conversation. + + The chat conversation is converted into a text prompt using the + tokenizer and calls the [generate][vllm.LLM.generate] method to generate + the responses. + + Multi-modal inputs can be passed in the same way you would pass them + to the OpenAI API. + + Args: + messages: A list of conversations or a single conversation. + + - Each conversation is represented as a list of messages. + - Each message is a dictionary with 'role' and 'content' keys. + + sampling_params: The sampling parameters for text generation. + If None, we use the default sampling parameters. When it + is a single value, it is applied to every prompt. When it + is a list, the list must have the same length as the + prompts and it is paired one by one with the prompt. + use_tqdm: If `True`, shows a tqdm progress bar. 
+ If a callable (e.g., `functools.partial(tqdm, leave=False)`), + it is used to create the progress bar. + If `False`, no progress bar is created. + lora_request: LoRA request to use for generation, if any. + chat_template: The template to use for structuring the chat. + If not provided, the model's default chat template will be used. + chat_template_content_format: The format to render message content. + + - "string" will render the content as a string. + Example: `"Who are you?"` + - "openai" will render the content as a list of dictionaries, + similar to OpenAI schema. + Example: `[{"type": "text", "text": "Who are you?"}]` + + add_generation_prompt: If True, adds a generation template + to each message. + continue_final_message: If True, continues the final message in + the conversation instead of starting a new one. Cannot be + `True` if `add_generation_prompt` is also `True`. + chat_template_kwargs: Additional kwargs to pass to the chat + template. + mm_processor_kwargs: Multimodal processor kwarg overrides for this + chat request. Only used for offline requests. + + Returns: + A list of `RequestOutput` objects containing the generated + responses in the same order as the input messages. 
+ """ + + prompts = self.preprocess_chat( + messages=messages, + chat_template=chat_template, + chat_template_content_format=chat_template_content_format, + add_generation_prompt=add_generation_prompt, + continue_final_message=continue_final_message, + tools=tools, + chat_template_kwargs=chat_template_kwargs, + mm_processor_kwargs=mm_processor_kwargs, + ) + + return self.generate( + prompts, + sampling_params=sampling_params, + use_tqdm=use_tqdm, + lora_request=lora_request, + ) + + def encode( + self, + prompts: PromptType | Sequence[PromptType] | DataPrompt, + pooling_params: PoolingParams | Sequence[PoolingParams] | None = None, + *, + truncate_prompt_tokens: int | None = None, + use_tqdm: bool | Callable[..., tqdm] = True, + lora_request: list[LoRARequest] | LoRARequest | None = None, + pooling_task: PoolingTask | None = None, + tokenization_kwargs: dict[str, Any] | None = None, + ) -> list[PoolingRequestOutput]: + """Apply pooling to the hidden states corresponding to the input + prompts. + + This class automatically batches the given prompts, considering + the memory constraint. For the best performance, put all of your prompts + into a single list and pass it to this method. + + Args: + prompts: The prompts to the LLM. You may pass a sequence of prompts + for batch inference. See [PromptType][vllm.inputs.PromptType] + for more details about the format of each prompt. + pooling_params: The pooling parameters for pooling. If None, we + use the default pooling parameters. + use_tqdm: If `True`, shows a tqdm progress bar. + If a callable (e.g., `functools.partial(tqdm, leave=False)`), + it is used to create the progress bar. + If `False`, no progress bar is created. + lora_request: LoRA request to use for generation, if any. + pooling_task: Override the pooling task to use. 
+ tokenization_kwargs: overrides tokenization_kwargs set in + pooling_params + + Returns: + A list of `PoolingRequestOutput` objects containing the + pooled hidden states in the same order as the input prompts. + + Note: + Using `prompts` and `prompt_token_ids` as keyword parameters is + considered legacy and may be deprecated in the future. You should + instead pass them via the `inputs` parameter. + """ + + error_str = ( + "pooling_task required for `LLM.encode`\n" + "Please use one of the more specific methods or set the " + "pooling_task when using `LLM.encode`:\n" + " - For embeddings, use `LLM.embed(...)` " + 'or `pooling_task="embed"`.\n' + " - For classification logits, use `LLM.classify(...)` " + 'or `pooling_task="classify"`.\n' + " - For similarity scores, use `LLM.score(...)`.\n" + " - For rewards, use `LLM.reward(...)` " + 'or `pooling_task="token_classify"`\n' + " - For token classification, " + 'use `pooling_task="token_classify"`\n' + ' - For multi-vector retrieval, use `pooling_task="token_embed"`' + ) + + if pooling_task is None: + raise ValueError(error_str) + + model_config = self.model_config + runner_type = model_config.runner_type + if runner_type != "pooling": + raise ValueError( + "LLM.encode() is only supported for pooling models. " + "Try passing `--runner pooling` to use the model as a " + "pooling model." + ) + + io_processor_prompt = False + if isinstance(prompts, dict) and "data" in prompts: + io_processor_prompt = True + if self.io_processor is None: + raise ValueError( + "No IOProcessor plugin installed. Please refer " + "to the documentation and to the " + "'prithvi_geospatial_mae_io_processor' " + "offline inference example for more details." 
+ ) + + # Validate the request data is valid for the loaded plugin + validated_prompt = self.io_processor.parse_request(prompts) + + # obtain the actual model prompts from the pre-processor + prompts = self.io_processor.pre_process(prompt=validated_prompt) + + if io_processor_prompt: + assert self.io_processor is not None + if is_list_of(pooling_params, PoolingParams): + validated_pooling_params: list[PoolingParams] = [] + for param in as_iter(pooling_params): + validated_pooling_params.append( + self.io_processor.validate_or_generate_params(param) + ) + pooling_params = validated_pooling_params + else: + assert not isinstance(pooling_params, Sequence) + pooling_params = self.io_processor.validate_or_generate_params( + pooling_params + ) + else: + if pooling_params is None: + # Use default pooling params. + pooling_params = PoolingParams() + + if pooling_task not in self.supported_tasks: + raise ValueError(f"pooling_task must be one of {self.supported_tasks}.") + + for param in as_iter(pooling_params): + param.verify(pooling_task, model_config) + # for backwards compatibility + if truncate_prompt_tokens is not None: + param.truncate_prompt_tokens = truncate_prompt_tokens + + self._validate_and_add_requests( + prompts=prompts, + params=pooling_params, + use_tqdm=use_tqdm, + lora_request=lora_request, + ) + + outputs = self._run_engine(use_tqdm=use_tqdm) + + model_outputs = self.engine_class.validate_outputs( + outputs, PoolingRequestOutput + ) + + if io_processor_prompt: + # get the post-processed model outputs + assert self.io_processor is not None + processed_outputs = self.io_processor.post_process( + model_output=model_outputs + ) + + return [ + PoolingRequestOutput[Any]( + request_id="", + outputs=processed_outputs, + num_cached_tokens=getattr( + processed_outputs, "num_cached_tokens", 0 + ), + prompt_token_ids=[], + finished=True, + ) + ] + else: + return model_outputs + + def embed( + self, + prompts: PromptType | Sequence[PromptType], + *, + 
truncate_prompt_tokens: int | None = None, + use_tqdm: bool | Callable[..., tqdm] = True, + pooling_params: PoolingParams | Sequence[PoolingParams] | None = None, + lora_request: list[LoRARequest] | LoRARequest | None = None, + ) -> list[EmbeddingRequestOutput]: + """ + Generate an embedding vector for each prompt. + + This class automatically batches the given prompts, considering + the memory constraint. For the best performance, put all of your prompts + into a single list and pass it to this method. + + Args: + prompts: The prompts to the LLM. You may pass a sequence of prompts + for batch inference. See [PromptType][vllm.inputs.PromptType] + for more details about the format of each prompt. + pooling_params: The pooling parameters for pooling. If None, we + use the default pooling parameters. + use_tqdm: If `True`, shows a tqdm progress bar. + If a callable (e.g., `functools.partial(tqdm, leave=False)`), + it is used to create the progress bar. + If `False`, no progress bar is created. + lora_request: LoRA request to use for generation, if any. + + Returns: + A list of `EmbeddingRequestOutput` objects containing the + embedding vectors in the same order as the input prompts. + """ + if "embed" not in self.supported_tasks: + raise ValueError( + "Embedding API is not supported by this model. " + "Try converting the model using `--convert embed`." + ) + + items = self.encode( + prompts, + truncate_prompt_tokens=truncate_prompt_tokens, + use_tqdm=use_tqdm, + pooling_params=pooling_params, + lora_request=lora_request, + pooling_task="embed", + ) + + return [EmbeddingRequestOutput.from_base(item) for item in items] + + def classify( + self, + prompts: PromptType | Sequence[PromptType], + *, + use_tqdm: bool | Callable[..., tqdm] = True, + pooling_params: PoolingParams | Sequence[PoolingParams] | None = None, + lora_request: list[LoRARequest] | LoRARequest | None = None, + ) -> list[ClassificationRequestOutput]: + """ + Generate class logits for each prompt. 
+ + This class automatically batches the given prompts, considering + the memory constraint. For the best performance, put all of your prompts + into a single list and pass it to this method. + + Args: + prompts: The prompts to the LLM. You may pass a sequence of prompts + for batch inference. See [PromptType][vllm.inputs.PromptType] + for more details about the format of each prompt. + use_tqdm: If `True`, shows a tqdm progress bar. + If a callable (e.g., `functools.partial(tqdm, leave=False)`), + it is used to create the progress bar. + If `False`, no progress bar is created. + lora_request: LoRA request to use for generation, if any. + pooling_params: The pooling parameters for pooling. If None, we + use the default pooling parameters. + Returns: + A list of `ClassificationRequestOutput` objects containing the + embedding vectors in the same order as the input prompts. + """ + if "classify" not in self.supported_tasks: + raise ValueError( + "Classification API is not supported by this model. " + "Try converting the model using `--convert classify`." + ) + + items = self.encode( + prompts, + use_tqdm=use_tqdm, + pooling_params=pooling_params, + lora_request=lora_request, + pooling_task="classify", + ) + + return [ClassificationRequestOutput.from_base(item) for item in items] + + def reward( + self, + prompts: PromptType | Sequence[PromptType], + /, + *, + truncate_prompt_tokens: int | None = None, + use_tqdm: bool | Callable[..., tqdm] = True, + pooling_params: PoolingParams | Sequence[PoolingParams] | None = None, + lora_request: list[LoRARequest] | LoRARequest | None = None, + ) -> list[PoolingRequestOutput]: + """ + Generate rewards for each prompt. + + Args: + prompts: The prompts to the LLM. You may pass a sequence of prompts + for batch inference. See [PromptType][vllm.inputs.PromptType] + for more details about the format of each prompt. + use_tqdm: If `True`, shows a tqdm progress bar. 
+ If a callable (e.g., `functools.partial(tqdm, leave=False)`), + it is used to create the progress bar. + If `False`, no progress bar is created. + lora_request: LoRA request to use for generation, if any. + pooling_params: The pooling parameters for pooling. If None, we + use the default pooling parameters. + Returns: + A list of `PoolingRequestOutput` objects containing the + pooled hidden states in the same order as the input prompts. + """ + + return self.encode( + prompts, + use_tqdm=use_tqdm, + lora_request=lora_request, + pooling_params=pooling_params, + truncate_prompt_tokens=truncate_prompt_tokens, + pooling_task="token_classify", + ) + + def _embedding_score( + self, + tokenizer: AnyTokenizer, + text_1: list[str | TextPrompt | TokensPrompt], + text_2: list[str | TextPrompt | TokensPrompt], + truncate_prompt_tokens: int | None = None, + use_tqdm: bool | Callable[..., tqdm] = True, + pooling_params: PoolingParams | None = None, + lora_request: list[LoRARequest] | LoRARequest | None = None, + ) -> list[ScoringRequestOutput]: + encoded_output: list[PoolingRequestOutput] = self.encode( + text_1 + text_2, + truncate_prompt_tokens=truncate_prompt_tokens, + use_tqdm=use_tqdm, + lora_request=lora_request, + pooling_params=pooling_params, + pooling_task="embed", + ) + + encoded_output_1: list[PoolingRequestOutput] = encoded_output[0 : len(text_1)] + encoded_output_2: list[PoolingRequestOutput] = encoded_output[len(text_1) :] + + if len(encoded_output_1) == 1: + encoded_output_1 = encoded_output_1 * len(encoded_output_2) + + scores = _cosine_similarity( + tokenizer=tokenizer, embed_1=encoded_output_1, embed_2=encoded_output_2 + ) + + items = self.engine_class.validate_outputs(scores, PoolingRequestOutput) + return [ScoringRequestOutput.from_base(item) for item in items] + + def _cross_encoding_score( + self, + tokenizer: AnyTokenizer, + data_1: list[str] | list[ScoreContentPartParam], + data_2: list[str] | list[ScoreContentPartParam], + truncate_prompt_tokens: int 
| None = None, + use_tqdm: bool | Callable[..., tqdm] = True, + pooling_params: PoolingParams | None = None, + lora_request: list[LoRARequest] | LoRARequest | None = None, + ) -> list[ScoringRequestOutput]: + model_config = self.model_config + + if isinstance(tokenizer, MistralTokenizer): + raise ValueError("Score API is not supported for Mistral tokenizer") + + if len(data_1) == 1: + data_1 = data_1 * len(data_2) + + if pooling_params is None: + pooling_params = PoolingParams(task="score") + + pooling_params.verify("score", model_config) + pooling_params_list = list[PoolingParams]() + + tokenization_kwargs: dict[str, Any] = {} + + _validate_truncation_size( + model_config.max_model_len, truncate_prompt_tokens, tokenization_kwargs + ) + + prompts = list[PromptType]() + + input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)] + + for q, d in input_pairs: + _, engine_prompt = get_score_prompt( + model_config=model_config, + data_1=q, + data_2=d, + tokenizer=tokenizer, + tokenization_kwargs=tokenization_kwargs, + ) + + if token_type_ids := engine_prompt.pop("token_type_ids", None): + params = pooling_params.clone() + compressed = compress_token_type_ids(token_type_ids) + params.extra_kwargs = {"compressed_token_type_ids": compressed} + pooling_params_list.append(params) + else: + pooling_params_list.append(pooling_params) + + prompts.append(engine_prompt) + + self._validate_and_add_requests( + prompts=prompts, + params=pooling_params_list, + use_tqdm=use_tqdm, + lora_request=lora_request, + ) + + outputs = self._run_engine(use_tqdm=use_tqdm) + items = self.engine_class.validate_outputs(outputs, PoolingRequestOutput) + + return [ScoringRequestOutput.from_base(item) for item in items] + + def score( + self, + data_1: SingletonPrompt | Sequence[SingletonPrompt] | ScoreMultiModalParam, + data_2: SingletonPrompt | Sequence[SingletonPrompt] | ScoreMultiModalParam, + /, + *, + truncate_prompt_tokens: int | None = None, + use_tqdm: bool | Callable[..., tqdm] = True, + 
pooling_params: PoolingParams | None = None, + lora_request: list[LoRARequest] | LoRARequest | None = None, + ) -> list[ScoringRequestOutput]: + """Generate similarity scores for all pairs `` or + ``. + + The inputs can be `1 -> 1`, `1 -> N` or `N -> N`. + In the `1 - N` case the `data_1` input will be replicated `N` + times to pair with the `data_2` inputs. + The input pairs are used to build a list of prompts for the + cross encoder model. This class automatically batches the prompts, + considering the memory constraint. For the best performance, put all + of your inputs into a single list and pass it to this method. + + Supports both text and multi-modal data (images, etc.) when used with + appropriate multi-modal models. For multi-modal inputs, ensure the + prompt structure matches the model's expected input format. + + Args: + data_1: Can be a single prompt, a list of prompts or + `ScoreMultiModalParam`, which can contain either text or + multi-modal data. When a list, it must have the same length as + the `data_2` list. + data_2: The data to pair with the query to form the input to + the LLM. Can be text or multi-modal data. See [PromptType] + [vllm.inputs.PromptType] for more details about the format of + each prompt. + use_tqdm: If `True`, shows a tqdm progress bar. + If a callable (e.g., `functools.partial(tqdm, leave=False)`), + it is used to create the progress bar. + If `False`, no progress bar is created. + lora_request: LoRA request to use for generation, if any. + pooling_params: The pooling parameters for pooling. If None, we + use the default pooling parameters. + Returns: + A list of `ScoringRequestOutput` objects containing the + generated scores in the same order as the input prompts. + """ + model_config = self.model_config + runner_type = model_config.runner_type + if runner_type != "pooling": + raise ValueError( + "LLM.score() is only supported for pooling models. " + "Try passing `--runner pooling` to use the model as a " + "pooling model." 
+ ) + + supported_tasks = self.supported_tasks + if all(t not in supported_tasks for t in ("embed", "classify")): + raise ValueError( + "Score API is not supported by this model. " + "Try converting the model using " + "`--convert embed` or `--convert classify`." + ) + + if ( + model_config.is_cross_encoder + and getattr(model_config.hf_config, "num_labels", 0) != 1 + ): + raise ValueError("Score API is only enabled for num_labels == 1.") + + # the tokenizer for models such as + # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing + # lists of tokens to the `text` and `text_pair` kwargs + tokenizer = self.get_tokenizer() + + if not model_config.is_multimodal_model: + + def check_data_type( + data: SingletonPrompt + | Sequence[SingletonPrompt] + | ScoreMultiModalParam, + ): + if isinstance(data, dict) and "content" in data: + raise ValueError( + "ScoreMultiModalParam is not supported " + f"for {model_config.architecture}" + ) + + check_data_type(data_1) + check_data_type(data_2) + + def ensure_str(prompt: SingletonPrompt): + if isinstance(prompt, dict): + if "multi_modal_data" in prompt: + raise ValueError( + "Multi-modal prompt is not supported for scoring" + ) + elif "prompt_token_ids" in prompt: + prompt = tokenizer.decode( + cast(TokensPrompt, prompt)["prompt_token_ids"] + ) + elif "prompt" in prompt: + prompt = cast(TextPrompt, prompt)["prompt"] + assert type(prompt) is str + return prompt + + if isinstance(data_1, (str, dict)): + # Convert a single prompt to a list. + data_1 = [data_1] # type: ignore[list-item] + + data_1 = [ensure_str(t) for t in data_1] + + if isinstance(data_2, (str, dict)): + # Convert a single prompt to a list. 
+ data_2 = [data_2] # type: ignore[list-item] + + data_2 = [ensure_str(t) for t in data_2] + + if isinstance(data_1, dict) and "content" in data_1: + data_1 = data_1.get("content") # type: ignore[assignment] + elif isinstance(data_1, str): + data_1 = [data_1] + + if isinstance(data_2, dict) and "content" in data_2: + data_2 = data_2.get("content") # type: ignore[assignment] + elif isinstance(data_2, str): + data_2 = [data_2] + + _validate_score_input_lens(data_1, data_2) # type: ignore[arg-type] + + if model_config.is_cross_encoder: + return self._cross_encoding_score( + tokenizer, + data_1, # type: ignore[arg-type] + data_2, # type: ignore[arg-type] + truncate_prompt_tokens, + use_tqdm, + pooling_params, + lora_request, + ) + else: + return self._embedding_score( + tokenizer, + data_1, # type: ignore[arg-type] + data_2, # type: ignore[arg-type] + truncate_prompt_tokens, + use_tqdm, + pooling_params, + lora_request, + ) + + def start_profile(self) -> None: + self.llm_engine.start_profile() + + def stop_profile(self) -> None: + self.llm_engine.stop_profile() + + def reset_prefix_cache(self) -> None: + self.llm_engine.reset_prefix_cache() + + def sleep(self, level: int = 1): + """ + Put the engine to sleep. The engine should not process any requests. + The caller should guarantee that no requests are being processed + during the sleep period, before `wake_up` is called. + + Args: + level: The sleep level. Level 1 sleep will offload the model + weights and discard the kv cache. The content of kv cache + is forgotten. Level 1 sleep is good for sleeping and waking + up the engine to run the same model again. The model weights + are backed up in CPU memory. Please make sure there's enough + CPU memory to store the model weights. Level 2 sleep will + discard both the model weights and the kv cache. The content + of both the model weights and kv cache is forgotten. 
Level 2 + sleep is good for sleeping and waking up the engine to run a + different model or update the model, where previous model + weights are not needed. It reduces CPU memory pressure. + """ + self.reset_prefix_cache() + self.llm_engine.sleep(level=level) + + def wake_up(self, tags: list[str] | None = None): + """ + Wake up the engine from sleep mode. See the [sleep][vllm.LLM.sleep] + method for more details. + + Args: + tags: An optional list of tags to reallocate the engine memory + for specific memory allocations. Values must be in + `("weights", "kv_cache")`. If None, all memory is reallocated. + wake_up should be called with all tags (or None) before the + engine is used again. + """ + self.llm_engine.wake_up(tags) + + def get_metrics(self) -> list["Metric"]: + """Return a snapshot of aggregated metrics from Prometheus. + + Returns: + A `MetricSnapshot` instance capturing the current state + of all aggregated metrics from Prometheus. + + Note: + This method is only available with the V1 LLM engine. + """ + return self.llm_engine.get_metrics() + + def _validate_and_add_requests( + self, + prompts: PromptType | Sequence[PromptType] | DataPrompt, + params: SamplingParams + | Sequence[SamplingParams] + | PoolingParams + | Sequence[PoolingParams], + *, + use_tqdm: bool | Callable[..., tqdm] = True, + lora_request: Sequence[LoRARequest] | LoRARequest | None, + priority: list[int] | None = None, + ) -> None: + if isinstance(prompts, (str, dict)): + # Convert a single prompt to a list. + prompts = [prompts] # type: ignore[list-item] + + num_requests = len(prompts) + if isinstance(params, Sequence) and len(params) != num_requests: + raise ValueError("The lengths of prompts and params must be the same.") + if isinstance(lora_request, Sequence) and len(lora_request) != num_requests: + raise ValueError( + "The lengths of prompts and lora_request must be the same." 
+ ) + if priority is not None and len(priority) != num_requests: + raise ValueError( + "The lengths of prompts " + f"({num_requests}) and priority ({len(priority)}) " + "must be the same." + ) + + for sp in params if isinstance(params, Sequence) else (params,): + if isinstance(sp, SamplingParams): + # We only care about the final output + sp.output_kind = RequestOutputKind.FINAL_ONLY + + # Add requests to the engine. + it = prompts + if use_tqdm: + tqdm_func = use_tqdm if callable(use_tqdm) else tqdm + it = tqdm_func(it, desc="Adding requests") + + added_request_ids: list[str] = [] + + try: + for i, prompt in enumerate(it): + if isinstance(prompt, dict): + self._validate_mm_data_and_uuids( + prompt.get("multi_modal_data"), prompt.get("multi_modal_uuids") + ) + request_id = self._add_request( + prompt, + params[i] if isinstance(params, Sequence) else params, + lora_request=lora_request[i] + if isinstance(lora_request, Sequence) + else lora_request, + priority=priority[i] if priority else 0, + ) + added_request_ids.append(request_id) + except Exception as e: + if added_request_ids: + self.llm_engine.abort_request(added_request_ids) + raise e + + def _validate_mm_data_and_uuids( + self, + multi_modal_data: Any | None, # MultiModalDataDict + multi_modal_uuids: Any | None, # MultiModalUUIDDict + ): + """ + Validate that if any multi-modal data is skipped (i.e. None), + then its corresponding UUID must be set. 
+ """ + if multi_modal_data is None: + return + + for modality, data in multi_modal_data.items(): + if isinstance(data, list): + for i, d in enumerate(data): + if d is None: + if ( + multi_modal_uuids is None + or modality not in multi_modal_uuids + or multi_modal_uuids[ # noqa: E501 + modality + ] + is None + ): + raise ValueError( + f"Multi-modal data for {modality} is None " + f"but UUID is not provided" + ) + else: + if ( + len(multi_modal_uuids[modality]) <= i + or multi_modal_uuids[modality][i] is None + ): + raise ValueError( + f"Multi-modal data for {modality} is None " + f"but UUID is not provided" + ) + else: + if data is None and ( + multi_modal_uuids is None + or modality not in multi_modal_uuids + or multi_modal_uuids[modality] is None + ): + raise ValueError( + f"Multi-modal data for {modality} is None" + f" but UUID is not provided" + ) + + def _process_inputs( + self, + request_id: str, + engine_prompt: PromptType, + params: SamplingParams | PoolingParams, + *, + lora_request: LoRARequest | None, + priority: int, + ) -> tuple[EngineCoreRequest, dict[str, Any]]: + """Use the Processor to process inputs for LLMEngine.""" + tokenization_kwargs: dict[str, Any] = {} + _validate_truncation_size( + self.model_config.max_model_len, + params.truncate_prompt_tokens, + tokenization_kwargs, + ) + + engine_request = self.processor.process_inputs( + request_id, + engine_prompt, + params, + lora_request=lora_request, + tokenization_kwargs=tokenization_kwargs, + priority=priority, + ) + return engine_request, tokenization_kwargs + + def _add_request( + self, + prompt: PromptType, + params: SamplingParams | PoolingParams, + lora_request: LoRARequest | None = None, + priority: int = 0, + ) -> str: + prompt_text, _, _ = get_prompt_components(prompt) + request_id = str(next(self.request_counter)) + + engine_request, tokenization_kwargs = self._process_inputs( + request_id, + prompt, + params, + lora_request=lora_request, + priority=priority, + ) + + 
self.llm_engine.add_request( + request_id, + engine_request, + params, + lora_request=lora_request, + tokenization_kwargs=tokenization_kwargs, + priority=priority, + prompt_text=prompt_text, + ) + return request_id + + def _run_engine( + self, *, use_tqdm: bool | Callable[..., tqdm] = True + ) -> list[RequestOutput | PoolingRequestOutput]: + # Initialize tqdm. + if use_tqdm: + num_requests = self.llm_engine.get_num_unfinished_requests() + tqdm_func = use_tqdm if callable(use_tqdm) else tqdm + pbar = tqdm_func( + total=num_requests, + desc="Processed prompts", + dynamic_ncols=True, + postfix=(f"est. speed input: {0:.2f} toks/s, output: {0:.2f} toks/s"), + ) + + # Run the engine. + outputs: list[RequestOutput | PoolingRequestOutput] = [] + total_in_toks = 0 + total_out_toks = 0 + while self.llm_engine.has_unfinished_requests(): + step_outputs = self.llm_engine.step() + for output in step_outputs: + if output.finished: + outputs.append(output) + if use_tqdm: + if isinstance(output, RequestOutput): + # Calculate tokens only for RequestOutput + n = len(output.outputs) + assert output.prompt_token_ids is not None + total_in_toks += len(output.prompt_token_ids) * n + in_spd = total_in_toks / pbar.format_dict["elapsed"] + total_out_toks += sum( + len(stp.token_ids) for stp in output.outputs + ) + out_spd = total_out_toks / pbar.format_dict["elapsed"] + pbar.postfix = ( + f"est. speed input: {in_spd:.2f} toks/s, " + f"output: {out_spd:.2f} toks/s" + ) + pbar.update(n) + else: + pbar.update(1) + if pbar.n == num_requests: + pbar.refresh() + + if use_tqdm: + pbar.close() + # Sort the outputs by request ID. + # This is necessary because some requests may be finished earlier than + # its previous requests. 
+ return sorted(outputs, key=lambda x: int(x.request_id)) diff --git a/entrypoints/logger.py b/entrypoints/logger.py new file mode 100644 index 0000000..678a7b3 --- /dev/null +++ b/entrypoints/logger.py @@ -0,0 +1,84 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Sequence + +import torch + +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.pooling_params import PoolingParams +from vllm.sampling_params import BeamSearchParams, SamplingParams + +logger = init_logger(__name__) + + +class RequestLogger: + def __init__(self, *, max_log_len: int | None) -> None: + self.max_log_len = max_log_len + + def log_inputs( + self, + request_id: str, + prompt: str | None, + prompt_token_ids: list[int] | None, + prompt_embeds: torch.Tensor | None, + params: SamplingParams | PoolingParams | BeamSearchParams | None, + lora_request: LoRARequest | None, + ) -> None: + max_log_len = self.max_log_len + if max_log_len is not None: + if prompt is not None: + prompt = prompt[:max_log_len] + + if prompt_token_ids is not None: + prompt_token_ids = prompt_token_ids[:max_log_len] + + logger.debug( + "Request %s details: prompt: %r, " + "prompt_token_ids: %s, " + "prompt_embeds shape: %s.", + request_id, + prompt, + prompt_token_ids, + prompt_embeds.shape if prompt_embeds is not None else None, + ) + + logger.info( + "Received request %s: params: %s, lora_request: %s.", + request_id, + params, + lora_request, + ) + + def log_outputs( + self, + request_id: str, + outputs: str, + output_token_ids: Sequence[int] | None, + finish_reason: str | None = None, + is_streaming: bool = False, + delta: bool = False, + ) -> None: + max_log_len = self.max_log_len + if max_log_len is not None: + if outputs is not None: + outputs = outputs[:max_log_len] + + if output_token_ids is not None: + # Convert to list and apply truncation + output_token_ids = 
list(output_token_ids)[:max_log_len] + + stream_info = "" + if is_streaming: + stream_info = " (streaming delta)" if delta else " (streaming complete)" + + logger.info( + "Generated response %s%s: output: %r, " + "output_token_ids: %s, finish_reason: %s", + request_id, + stream_info, + outputs, + output_token_ids, + finish_reason, + ) diff --git a/entrypoints/openai/__init__.py b/entrypoints/openai/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/entrypoints/openai/__pycache__/__init__.cpython-312.pyc b/entrypoints/openai/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1325b804ade59c6bb5ce65a48f349c7d0908e20e GIT binary patch literal 168 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJVx$BqY7U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?c_`t p=4F<|$LkeT-r}&y%}*)KNwq6t1)9qU#Kj=SM`lJw#v*1Q3jllNDf$2a literal 0 HcmV?d00001 diff --git a/entrypoints/openai/__pycache__/api_server.cpython-312.pyc b/entrypoints/openai/__pycache__/api_server.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..85e6bde2a661442e76a9317a65b33b021ff20c79 GIT binary patch literal 90943 zcmeFa31C#$eJ_0P%$;4*Xc_GSMo55$MM7+1vB5$TAPfQ{yiMXvgzg0fW+ZZF1ZXTA zjGaWpPK>jG8>hyJo8Xu<%39K-tueTDlP)ui(ov?gkA8Jto3DM}Qv!XqabDl|`=7hb zj7FG16FaZHpgZTDef{s}oF5eyIypQc;fY{TBhURKeP~ai>bUwTFUOta?&U;IJY3~CgnN2n#d*uR*$ouSt7691C$QvcHMGXFB>=L&hl z%l*rlyCAe8+~#j%?!wT@@GAc*<}M1Y4zKaA3E$zrBYda-&hT3QTITN#tqb4fzl*tx zL+#=9{`KJwe@D2}-x==mcZE0jH?Xjh(8lm4|0d?{u?8oE2Y#lMBQ%R*bj zJ^mi%_Jp>DxBIs&okUta-`@`V@|3G-qKNvpXKM?+q|3hKfFNY)kNI2?` zhKKw^;e-Bz;Y0pI;luvJ;bH%9IOdOqAND^S9`TQakNA&-AMrmD{;>bU;iLYe;Yan0r=eG<@8DoVmTBkB6V|Kf&Dfp(n$i z@P8uwN&hF=_w3M9;S>H7%-s-rI{YdBrp#ofi$Y%u|GxkCnY%gk z_3)T~jJaDv-w1!x|4rsz9Qs!H+x~AecWdZ7;qUst%iK#se-J+BKNtR<|9k9vY3L8b z-}ir?xtE205dI_oA2Ijx&>x5Y#Q!qSNd+hPeaAGgVf;$3h%4^Er-_rkDz?4ED(S!tC& zq4vOL#J?ooEiDw=r6+a^;ud@-)$dQ@dn>-j)$dO!?SpR*e1EL^-jDBX`2HL9`!PM% 
zcKD`L-;e3O_rP~T^*y2c?tt%4RNv#eZ!dh)s_$oX-<|ONTh;d?y6?U4y*wrEeenG| z)%SC#>HYZrd-eNw@cjV3|C{>#d3-;J?;+QReX3P@J zCB9!;diJ50Ov*Qn3;K2`^7=r=Yhry4`!tLDNFU2T1m#Q*oQHnC;hdS zb`LLAp>}^|#L!&ujcab>SKK8H1eG+G3}4Olj%miVn8xorruiEquI9Q?nxJsDUz@$s z8T8EvoAH7u8vUcs`G1|ZpQPWWehQj%+Pqh7^X@!t{vYC=$2tH1H|@OItIZ7+QUr}W z{Qq;>RQvQ)`yc1@{+$#)EX*F);qeaPr{y+5x`+yAoG{|?T%9f&q^0>)pcB$$kDT-ZV9ic->|#bQ zY5lDI`rnwg*B*Y?dhIs|)ARJ&F16Q2ZqRFQPTOlo{*(6FZnf7Qxk0bJHEpkbIB&1r z3XJ?!Rt@G^(3FX**s9sDw&dsyT5^5bmOT0%YDq+G$*~)>p(5&OWYDhte6= zQ=*#xM{bb+FQ?7_qq+ItGVOQ!V?M05SOO(jnQjPQo#?7Z;LWL#-P4VRPJ$HilbJOVRyx3%mS-E&##3Z_kWIw#%pS?LN{x_mwN#hi3s%1T!>ZFw)`r2D zeL3%pzIi;iOZw3*^X;a6-l;R}*MQ@6Ig>hPzLM29CDZoJnVdd-F)LlERErr`CV8ZV zkj_1!$`YB}82%o~bAp%35N5)=mPwDbuySdal#<@uZQ5;;c1sh|Tg2Ia~&0Is$-yuJwK$?j36X|Vo?XjVx3iwyz5As2&O#P*@%T>J1N~Q84E&nP> z%Mn)5$vwqA#Xn{07xr-driW&7-1{NlJfgh$BPO=~-BU+rgwy(DgdbtGdD`@(SK#`E zBc>z#5#;T!p0+V8V0jhOzm^IjKS<&3EOrBmH$PUvX;(0KVvBth_9hf z+NJ4Y{W*)ZQ2gQJv^E!sKVrYlVp9AI7SaL?J&wPm_?M4c5MwdQZx#P)e-p>ay;!eb z(q<^53ce}+_1U;KMu@!xF&;vSC7FKo4eDXf)LLtKA>}fp`~YGu(^9hh8#69ws-V}LgYdgWqp*Ye{diw9gc?k2Kx3$GQ6FecJAEX8|{k@ zMSSM8qkUjt5Fe75Hn$H9r)^t;lx@goL(ul_9fL#oOxrr81JZyPN!vE`MWXOe3mXRI zaJrE4>N?yn9f$@82hx^%h9ntz?2tY*Bt@bqtU`6(*B1(keH5unmItXCMIGCA^llA` zkl!EblO2M6|OUPxXT?%OW~`XYORfxZI=D8KH3 zNK_sQO9N=lp!_1AE@v%?N`c@&Ai5V>hzAA-L+DQPy*sv~ZJXdB$&qx?)*;q++P8)J zhWgL4b#Q1P>a(P6?Y$ej+YcN_+qO!P2)dD4*e?YSN@+`P|KI^BE%c&ykffk55|QL+ zU_d&k^!0tAP`G1oV0Um2LfvfGx(4di)a097445bS6Dl&)yc z`qnxiU8Vc0Rf?z`ovzLkqI4;WGm`a6@4s%>%8z z6}jH(_?LHt(3!#A!G1PvjOaPuYV?w+IhSU7Y0)~R=-_^67-N!_V`-+B8m*!$yh{>A zbkO?YsD!>%{fl*fEpH_h7n^IrZdjbqVWb7aJaG;6}SGa=7lko`v&%F;YHe~))*~+eeRTG_^6QzFu`TLzJ=OyT;CUwwhoF?XnS9DZ@MfiocYso_hfmi z<29uqeH5r5wc4!XDU0f?EYv<_y^;}|c}GyKn(Ww_{hh9w8m4qFGAh}L8MmD+^sOr9 zFsF+;rSKpHP@Mn)3b%G_&-ip2Zl5(>*dGc8q=AC~3(=tiX?JL_PYm=Cs1Ni5N2NWX z!99V2!GVA%?d}^2MFRvl(&Yz~$sg#)s0E@Dpap<&+M{`g2Iam$cu*V)Ns)BPpx0z@)E0k6tEh=A4teD0NoA^}<269irxMvyBI zK(9y8%s?Prl@TY5?h8^61;n5nn|%;Cry1x(9zHM_9Ee7m2eBObg3Wygf(ne5V|74R 
zEN~GG%OVw4FWL>9-WN?51>~Ut>aswzFS0+9HUq=c38R?6SvNV^GdH_bFD6LEnQ5`&~%ek)!pEIB2sf2f1PWw}DcXyi=+}+#&9;`9bawkN8NheSOV*i1(R#`i+t>c=-4< z{;vKmUZ3T)0gUQ{oNv&g`rz!dW@e_iwT^fUjeQvTf6jE${85VqQ{yNPd~ zlbvnclQZyVqpH8i_YeujXxZRk+7by#(t)%9$P!6c-nV7T*1(o+8w0&N+jsT`x_fqZ z?YOUfOS(WDr1chvfRrPkStv20D*%{;g%P1<9=+b(5)B_{IS@RsC<@pkMHd|&YFo8( zf1A`23hrt-FdW@GIM70isU>4EwJ3|J#aK+u2Zqzm0G3WaTjR01P;fUXFG~P3?vJOMJUSI9mzVc{AVc63z=&a=UoE zeJ*quk+W%s-W~EskWxO1SK8GNsu-Y&mbv^e`IrH@(j|BW#K8dtQt#^P-=DV10R04g zD3VnHxjsQ2!KeH&@)KAdly!uriP!7BOa3SY*tPDEsqG)92;aj?Y5$~9ZK;@W6+N}; z_^Om^<|Wt6iTR7Z?t9Ufn%|z7-#)tanqV$$yXNFPwW+eXiL$xR$G$X@EL%3(dAXn_ zRWLJAF!NM&yr6NSpg2`9J5ezEg|(@Mm5GLxKQ35xrL1za^O~J=RF2!LCQX84=H-Ht zEAFzmd)7p;=cMat*JleStz2R8@$N|nSL~g%GxwYH;mL|*M(EpW@Gm24Q2WRDj$-~J z>kB#@+$*J)^{w11m8ImKZ-)OX&FtM;-eEOgu$tk2!69@MnlBWR|BL2y;SeF3fXbPr zy<#vzEL&i=Ot@!24CnwII0Rsr;Zn>&N&!wHEfA`d$vRY~MJ#^?FB+lfkUXHnXW?I13ly4?#@vAV4_} ztfb%=hE-fW={g32@87oL?yenyt@m!(*}Z+owvNEouAQ5KJ}jJ%AX^!6)+x>TPoP@pbT$Y&DhfT*Ui^ue(=G*vlp=8YX<* zc)#JW6kGp|!)wyXIg2NXJWmw7QHaks$nWR$dUKM!nTO%~%g845#IcUn&UyTWrF>_R z^};eUx!bIrrREEFy5PQO=R1pqiw-Nfi+OsNT00wU7pwTr+JcMKHgbDy@ZTU4<(CDw ziDG4N`wBW+f!lx%G_!M_5&o-ea(Pszw)a_bBI7d?3X5h@2${pe5ep!M<%ku~ z&J}fOX-0I&KEMaLBW6H09d2oHkY5HIJYt(Bj$Jfsu;+*!w5=r{mCaxpmk-9H<)uc6 znhU=9ptpI%Hex^PcuD0B&?@Uu(`N1%KVr=X$2FQ8M5u+qLiHAHK3wp7?ASG8KAsxBDO)PGbrkez3QYj*ujV~Zx>tM9gI?4=#>B?C3!WAK#>-C zg9D3H(BT~->cEQ#UM5)tF{~u^b_QjsKN=d&NR|oQ)Py;Sc5Lr(vIILAB@tm6EA{XW z%HAFjgx=t8=F8|tFrwbweZi0ib%~!H5?;Cx{wxp?B9%)VpUe>Rq%* zC3F^PgwCSON}(yu3{-<&w`*1Fk$(rBE`Oe0&*J5?r(J|3R8oi)m@ZR_Vidouh@KL+T}4y$x3l32m<|M)IUOG8OZ1V^-7acE(SN4wEcm7mpy;48tX!p;p+`?6#={z-K+%xaouA^Hg z%Dkzv#za};=?{$ElPqgHx@D@*q)=w5pYYVBJc|;ZMH5vGU~(qB3sT;73GcdzIg3+s zx)XD{CuaImGuI?$u9=uIf1-L0{!f;0<;_<)N4ay7~zK|MH0u-W&7!}x;F6-C^@k!6lz|IG^cLlbz@9gT`DgO|)d6BzH)l@*o zUy_Snh16$650SFs@=dF>I-2Hb}xPr zQ~T^qErNUtz+mF?NuEYHL!@_BCr!XY>I9A`7;Go%p)-%#k^(bUg4VDnA8{gDwIwxT z3OKcxAX)P0!}AdqnL^27o6jDb&s34%m9#d6 zG!`eJCZ?VR_W|vk{5NPD(Eero#40|Z72C9K`yB_M1zFoX-`CjVE0hl-T&|(l6L_Va 
zL0wgg<$n0UAnL&8RXs%^Xq7g}P_hz>2l68Aq|8VXE**xLgEZ1$VbE66xtbLj6x9Ga zVO8g-#GwhLul~X?afigmkycXvCxn&I45q3b<-oTT&r7|sF0VN?|Ap|m2gc`a zNP0FN-3l-7?B`<7#NzYUCuemW-8wGRO$fG+ZGLofN+?eVyO)zcj6gu-f~cZuV=9mrdHJ8=&19S^}Nuh}RF9^%5 zxmET5pYUIbO0n*M*wj_})SBaKQm(p$t1jhQl5j1F3rpUGW>n=gLt7qGTjDT+2%6%k zglQVHJmFd%7nZ*Zjj8F=Yz>TIfq(*{%Bjt}FkGi__y60Wwm(Dp90rYvV@ zNTE1oehJL?Z8TvWnh>GIHf44#PPi7wg~jhe6RLBYpzEfmG{mWj!@!ft|BhC4Oxv6l z3D=6au;N{4PDM_04k$(qQ`%zF0*EZzIBi3gCR|J7!qRu4A-Stw(u_r>G{s>A324aO z_>-?;cz!dLe(^>SH$rwDQx~>{`O9|HbQ{%n@;6pn4V5$eFe7KKFmgjT2Aa~(b~Suh zcA?U}_{+bRS0`MncA-|O=dJ&;-R*)r>>F7X#lU~ExCdU{E_SKPowi3w*78Hz}@QQ$pg(x1wQSP~|% z$s+L8IQ2+9NL_k2;xQ@Aq*bu^E*I6Fx|>9MMavUK%acWIDSKPo-gentdAeiVP2xUC zRU9kddfip?3yw#8HQ*8j;9HgJ@$E(Y<8@29(|oJ&yn87(MjqebmvY}KZ+Dr`+fDd9 z?}YJ$`Mk>vw>*JbrMINlZu6IEM^~uK!!~-V{>p zK9Y-p8z$ob7zxWHMNrsYhAYc5BI*oo&I(lQ%JM~=hk}Qx=bMnLqS*6t1y!eKjM@Ic z^$k~QaeHEMyP`#ld+!@}-5(e3e~a0899z`F#~dvx;(#qQW7Z%r^|Wu$r+rJH z7j|bbfauz>W7`hMx1g5;Y=m8^+)kO&EKEDu@(74S;R9(46LO~of}HYpiZTzc7cES5 zqe!)FssJwSP|*+eeehC*KslqwD(E;MTlZo)^Qc*u06FTSv2qH#!10p zDVeCLPt~+0YFfwUom-Qv*_iTd9CcjtaQ4y@zGojAw=KBrhJD$Ll&2}-X-azLpL6_) z`+M%x$}NeNTOh|SR^-@_Z8<$By2od=j5`)1#p&);FCT=^#ct;?>Nzu?T462S7yRkKr7OA=K}QdKJxRVy!+y?oDw3ML`H;x3-Fm>eZ< zBlm*xtoRf&Rkb!zwf17|_r)qRV#Zu8m`mz3$q%)h|F=Lb zUG)7Q2(@&m@hd^Cx-6)r+mRVC2wEx~FJwU>UAV47Azg$Y)wh7>vnj|Yv^TV3M!5!L z&nz_y3R!yOi4`_60{FIs-C zHEzG-?aMCDErSDdK5$^p00$hvL1hPLzQ7f9;$6_vvBrF1jRo$DoVnA?XVHKJdd>K| zD&mzblV=)R3d18eyn_vnPMJf^q|@X$UeRJ~89l-eTZgSaTWoE|;D8vUtqstS4to_< zj3hH73=K@lFbUZ`2zv{~q@QdwRNIT@^i0(>LXXQ7J+O#S;%+&8=n_JnK;2(r|x zl0Arqo%z%nBx$)z$XHqKR<5q;j4N5YELFQPQM)l&+np-!9^G)cY}Q28%v6;xQRPck zEqJ|b#;M&egvZO4fxDW~@Z6SXwxsG7CF&MkvsgSuFo`QHf;DtS^)-k7Y33?6^VcIp z6ds$=zL0;8Z=WxmZg;`igKdpA)T}QG;z?28$xNj$mmxMfftf2(gEM;|J@@7sFLxv1 zUsK{XVo9IuIc<|G|+Crx%o(YNRQg2T^MGv}Q5bMlz9 zarU*0SC2Vc4z{)^3a6|EARyz%@OPECt?3s4v&K+i03zy-HkEZwMOy@>j74C|r^eP& z8n}<~@-x|FHT5WnMvzH4mM)L%9Xu51I|wqE*^;Y@GkpR=Vn)Ui@@jNd+NOjCvJ52T 
zyAi}*|Bbqnw&Sx&^Lmlz#QTy(^+z{effc@|HQ{MZdX}b&mLA=B*;RSkmhvu1c$Xx- z%f^}F3RA|t?0UqFUoA~GXc24*rx#XX~uOEN%g49fr&?rt-h!(KU(o)`okbj~k6Enz~e8uJY zOepSbINCuiG>^L)<3gj#^p;ohGU>arrU=1EX0Uk_4y7R@+!AhSj)(?uk7~i55g7Tv zFjAc`M@%BjH$M+EL_jpvPm4*vB7au5DL^dS7c69w2@^%047{IQ4j8t01^9S2HSDof zKpSXYl;H`Te^87Cz{6lSwRaF$;vl%62(X8@x3`N7vjNB)lxkwT`sRX zb?@m7XS$Q+D@L8z+2O7@vG2tDPgk9plbXLSF@N35(fIsz$@yF2zOAqAj2G-6(NP)2 ztELSig~;U1r-$E)5YvCmyq5onaEIwQzuNk7^Bty7ELzL|A%BPI2d1@rtdQV@q0S|i z^Lh#G8Ej$oT%^=JKl4!f4$K3#1k`3sd7bFZX9k8jks&r>!V;Q?InR4eSR!cH`#+HF z&@FT_*^cNOtt$+%iD*5mZ`@$O^f3rfbTNs_M7XqcQJuEPM^WTsER5}c8BB!EXidvR z*n6%w?SvUGTSQ)QP>OhQDuY)wqx0@it!^S-9KArYKvnym!*s}`oza9KKSU#ZWlZ|U zOh3})niVVA*MX=|_f;3G-kL{bdOn14DFz{< zC5f6PV{PZ|PS$Knc{YJxDY#XBW%0P9H7~#7s=Hjg=u9wKyfWVP03;9vR@n4etdq4| zc{lMdx4@&ULCcT%I&J*f@(vq!p65F(!g-TGZfpBG_+PN___$zmbPDDR#VsA{%oo;K z;JzrB5gMz_0z)kiYXcYvibr6Wsu^kp#dHzd!$KO4zTI@1hgF48MA&E#K@NtBD{c&- zgc0F3Ae4}g%mIX=%}hEj)Cna`q-)6#s#@Mbq-(jMNY`@HB3<2lHVaBwqF$}TG$~ODXXznw^AetUNsn*r@E<(#jYm>#n-guDbLFm^ z#%F4B*G;L$wnSrFvT@b8dv)Bt`mzF~_9lzl;+^+r0I7Nu_!c(N-ilyVg;EawY$Z0b zVux{u4Lghla@zr=9Q*|v&vv*vICsDAl6uBhF7LUBn1X zVYYO7h|Jk<0}kb7w+yxG6bSTgD3$c}2xxJWDl5ZMnL&+*-V?NqH3MqoKnh>6L7yn_ zp;(y`|1r?PXUPBu1O(*0^kVCu`zh>Z>}f7Y|!%bn*) zL2{la4as>+do}!Z6-mboT}5Jn`@#$}LSqF?Hlo^mW-VAEks0A|LMI$Dd3e!eNJDY- zoB44Zx0oUKFcvh}*`p1H?$Xc$=QH7F`&%$t^c)dz?uW@Bw|nW_4D&vnh*x6K`-ZHL zC0eh=9I+TO2n`L(c2hYIhpW`ge*Z37(XA3mV>JeLpmO^J%GU67F z=*$Z2HOyMbi0y2VmIvxFPxrTo?z6>O4HyUr0|e+y4A`3w9kFJVsc&M?a-bcZBUUYk z%rc|8E;rx4%X}Lf44@U&wd|rTW52sWdg#QCu@@;{-|6x&r&dF^FyG zDd|zx{O8cZEQt*X-bwy=_3!b@6y|4iGk3EF95DaLq@|G~J^6O-7|L{mcZXC+pRCj5 zqz|EzGrb`vA{krGqj!Xa?Rq)rQV7187DvR^aoPqO^NcRi7M}NMb8was+tp3&DSjEvZ8BdORaO%4TxJ@ zr-v>`R4o__q?T_>EZ>$~PJ6OcUA7|lop%G)TRR|+IB!Py#_E{fj-+NejaluTgVO|Y z;*umoTcb2x4pScw(2LBQc0zbopl-3D!>^>+Ed97XY?V_?>UDOXGGubi42s&5p$L3~ zhU;JH)yKvpr#=cJ&$g*LP^NN5BQtQw&@CF{&bWI+SJ6!Qo~9SE^Ozz3Ol~Ga{tz{X z;ZrtONY2NQYFZ!~J3`_VhFbAbuq#PDWqRFDqe*+f7r=&TiQ={i&z+D0l&pjlpu{<9!5-A&rZbkLyE$%eo)ii! 
z&ip10HHJLkC+>M?97*?*xP8g%jxsW1SUK)kHQ}yGxf>GhhB2J{(vWno7!~yM2v(dE z#_e~KRG@R*?Tg!eue<6_KNw%rGw#|J7q(3b7$wt>ziKfeLDhob350XFRtYAadDLe!57zB z;C|I?hCAjVd`_DmT8xbYku1Ef3j}sz%v6y!zYn#@ee#K`4VxsKC_|tD5AZ(a$sPl{ zlNcL>yrJ{gDS~_$Jln33;dx!-ujTy?cwR8D(TETwVJ~~MTEad|$J!&pzIrX(_|-rx zLT4MknrmO9<}!Yby|}>EIxa-%bG0~n>5VsAdeo<-H8u-Ez@YXu${|?Jo)KY+IN@C_|iJVd4bKBbFT83~|DF)G*d^1Mu5{DeCRe_q{LyCOPD=Fy55N{Ki2N(Y~s%sly@|7=j-{7xx)F` z0=eg|=MZpVE{~53O^(hv<_oPYonG@ruLbUlbIb^Z#RP#)eJV@?HGhbq_`eEjWwRF(1JX0~S~uJ%ktub~ z(A1fuv$gznK;4|Fli!yI5kUN8&W8+wJp6UQujR{e(B`XRF_63laMw*N)ol`g@Sc!_ z+k0ZZ8^BNfeDQo#E|UY`>>b$Ak>L!s_8LU*578gKN||(4830sM?CXo-wushsC_^UV zi4m=IHla8yKT0K1>C@UD%a2hMdi{5Tpd@i(5cFEzJE+yxO@WcWIc@)CMKGWcr(S^2 zlRZ!OOq9<$U7f05ny6nI_bi*J^rk8oBq|qNvk63{3ZPPPa!84?@nPO=TLr8ktAI7k zDqt0}3V_?Xere|#{=zCA0THWEx>u;&!O(H;1IT$x-gY8ppUlMRGWskMRu(u zphH+9uPNmyl}xWT0x&;CGzGxebCoDLqP>n>0K@DS^6n13UD4hY*rM+ah3O>Q9r~-A zPF_`DM8}6gaVZuQBGF69b#nb4WYpkR4l!A5HxsQ zhXx=wJg}rL@=il;tezGcG$(4B&m21EO4f9yJe{{rZajNr+_5Mhp`inVYOcKN)(8gq zI9d#>5An~`MJ=Ec9>d?&58|bwd$^P*pW@QE03-8&l)Q#R>NYNBW+SLuvSN?z9E4>6 z4tvJA)7X3i<&{2J=$KnF>c{nYQQCb&2LyrgSep4*6%9(?kQf|n$vSiiH}c%zRFKX2SX6zR z=@i3HJ>;FY$iQUAiaO>z`RvOyZG~7q6x%+SC>G-eo(3el;IXwA)*o+autm{(!Swwljj zL)u7=?b)&;7UNi59oucy4{G%+nIS9*tCMT484U% zF2y3rp!bB`bVe;qU;0G-r1-4qep>|K4m4V^z_BXEH*6)7@lD#vmR5kofo!jxV%SPE z+{+%t7)#sslU1dN{8{AiQCjIXICPO*9XoAl8ay$TYUHYBog8_3BvsjzsBB7B&QF!h zKe~C+q)F;lBx+WS#a?cE4LfrlNO>N(m7O`7rg+)7V|iW-l5LDr&y_BC^NQzzTe6b9 zs;DHaolr^G`Hoq_d51vmqKb|N?tCNPQ7@dIBaqwI-iCk+&>c5$7aAN;5nou?(phW1 zSZiVGVlzTx6*=HfIiPJC)kI#Xr`y6zfqHt#O+Y>F*9!IMEGEn&dj1RQy_cZgf*Vk< z4C<-#aXLaa3+9z)!94kC`ur4L=~9)rW#C0Ug79`o+?|+58obkq7R3POGgRobRxs(Z zsWpkn&mhg$rhvJcQwx$sbEg4w?Ni9%cK~yAN#B_N<;vF}mb)+Ix$kzs+>&v}(whf! 
z_uUehlRt+}iP^A)nXZ}oFDCfEm|*Qtx2)ZM?;sviE|c~)G5*Fk9T0(dtdN$iA)I&M zYEJbC+}nW@gsP_(jD4QUq}MpD{VZ0vTs_Pp?;hhRB#th5Uy!I-aAxBe&O~ZYdD?G- z6@Fyg(VTaMYp14x6<&WktS~#qB+uIZjq}3anZgT4vBh1f8O@hmhL}Qc$Q09cs&uCe z*kUl&A5;O@P4mUxCY<_*YiMvb2H92Oeyc(6=HNhIdv`{qd@A>xE>LfF3JlUQer!U0 z0uw5Wf2M0@mG%(x{JRLgM)Pek=B7eYdpL_LB~&Ja%1M*SapyI&0PeYfE3G}%cKYsQ z$>J0{T0ZVtd|iGDsp$xsCHdg8K6Yd~Iz?@f?>NjU(h2plh?{8168SXhVndn6i742u zny1nc#_|_e^Pl1?g-=_m`Pln8CAIO zN0zixs_MNv_d+&;Gg*d0q8CRmdUrv_5ut$|pes+rp?*4Zsa{T_@wf|>U4p8b#^SV5 zfsWot3v}p67A_-3=rR0Vr9BI(DdtUIP@Qi-56)=8tu*SzsIClScKsBNfm-7%iG1ek zxPr+X)kP3s&@o%+!9h1>W+SIabce5fldVc`AcS(@&tK zt8B&(x2^L`{{vmduAHZ*^w<@uJ>?6$=0w$$v7#O#hy*Oh|m z33uIT>m~PGtd_c2ryd@!UGREIL#m`DQPMKzdU@`6?|pQqWXb)LHcO>*)WtUQoW6V9 zy*O@Ptlgkg+dhMjmC-pYICcy7aPAHsR_;s7+U7+a%qGF7;J6(D;7e#8J^2nW$dDG9 zvTs&3_BDuR47ZvG6B!mWk)b8_DW}4%duiJbP9GyEs5HmRu77LB#e1485z@i_Az8*s z!dYhsaSmSX)#S!C_?>l}r4nGO=n_l__z+twO-v2^fkC;dd;U9a|Z$xcv(Q# zhq{it;+~dq`(oqt=QW8USgPNdvfmk}tvFeyKd-%hl{Ug1TheahzsgzuEV1v71> zGTJqIzh%?nS$!;xX~AwwT!dxc7rKqPF(sE@Ha2{I+S=6th6K1(A`lSKYLy-1cP z=gtZiLa$}h&5C8?_T|Q`XiF5eC5u+2?5pCqYc_jUtiCS)K8hq&A6*!qHTo75!GOZI z;kY4YP~kJ8=}+e6d|Brh8iHt>9$9x7fB@nW)QcYNR~MV8I2)P=UBEmY*yzE1c!XEe ze5-O2AzP8wbOEj+l!pV*G4zF%i}RFDe-!1V&DfjBdLFwt*+#x^peC6FjZD$lYb}ky zHhd|N+{%^IrHY#p#Z9T=g^A*YXYL>CPZqBpwHSlZoG5BO8yQ>vQY=}tI%TH=DX_() zxJo~$-NrfAT#wK;;$!XR_A36og>SDA&Ra|1hS}PozWq{QD0`q#31y%N^Z)7trFAf! 
z%(sVQ_nFiA)qIrlWluh6E}9HG4Ge`ti)$PYkr%b=!A06{Wp93^Ea9Y1^P{%NNDZ}JKPpnbF#fSFdvT*f`R&@*s1rr!T!_uydRw_iB zf=Dw!#<^_VUe`utHSIjD^PO3ctXq+)Tc4;~pRDUl zRdkNJi1&PG?CyBuUE}WdxSgb3m0dWxQ^lbU!l5?dJ7w)Syv2=uPN5KP62?KO0r^0G z)}RoSV^DNLSJ**0miL?R>dF8dc@A8F;?SD!)VL7~ z&VW(@RW3-uRok|s1%|D?=s4@tO77+Ma7XMTwodM$4d_3|h~v?U4Aq$32en~7&n_QGXPR7<%J+@XWq}n889GM}yM(kKf`cW9PkWTPVI-xSu z3*Co#mHS`=RCDG5BDXLfcsghE3npV$jF{L4RnsFTV}mLL`QK*Cy0GVZ+2-DDJzi0w z!}`7KK!ep@wy!B|g*pD-K{4jt8jP^3xaTPzxYBk2G6i-mu$NiNe-;f%TlYhzuqTqX zMT21p*I43=%s--){wMUJ81f&JO9%0EH6%M3SOF6yjPpyosXCpkHtZyV49nIJQ*yQw z>dTZHqsfc1mvGt=9m2f_3IWD8NS&jq2@Hk&lu|LHSO$EY3YEajUFpyqrM5;zS-oGT z06G&w-hfxci7YgRM%(`X3Wd_^K>}6}!l6(Vg4IzzQQerT_9d!)$?65EvIV1^lMs(o zoUD4f>Qqy*q+!%L;VKzjd%1Mh<+{14x+RIaC8@fViMo{&m9#;$45tyBaT;*}S3B#u zre~T?uQ;PI{M)+AdekIlVtg*ATQP zs@icn*Xg!zmyboiRr~UN0EgAP6#b18(nv~toDp~yli2-VBy3} z@2LH6D`t#tm~fYmKJr4%nGN6G`GaL+TgL0#CyJ|1x}SERw*I(y-WA3Dg8B4@FS&kP zybw1~SI&g-L@~|}7_-N$`nV>vRgR2*wx#vtrymGP9ZY$?0 zpR}0^%io?fBf<4aFKT-|(uMv%#&^xk3%Ch!F z__$clBj93VSr_)Uw)xmzDhu4N&JhqAD}mWbUkFEdzz!0kgFp@ut%Luww+H_75X+*@ zDFgiHzv0?~D|S`z0Dx^iT44Jc<*Bw*cKp`WZ$_7pjws*REfngYCa8kT#sBmS`7e}E3>85|xY6Tc?Q_9*RUs;@RiLMaVenAJ$)VhRhyGjx}YK=bb+0PIj_S%-KsTRwTi8 ztinr%A!Ka^0ihIvuys#xK#FV#o}#udW--0Haa=g!i$iqIy^gV1sa~~}UUp`vH)e;J z2%=Xp5-ZM3OH4`d8f96K>Fq0G$Vxts5CuUojKmPmx2cQE0uj^|^l%1zQ4m%!1!RI9 zMhT}2b}C3y!Ffz|#5f*0g2t!X#u%9DPpRBLqk^o8h1`^_y+5QF^m>?(63O=!q*P(H z?BFNLy{Ym!iSjwga=!Zf{J*>!n6J3xTbi;miYF2F0Hsni%91*su#qw zh$q4?^p3lHXC6A&cWKc&Emn^ai{ZMsP{|`9tY__ zne0xvW+hy+=rWESJPJQ!jxXyTFWQVc$L;q3(~$r5$z_~lEB_Y6vIV%e1J*uDHDMv> zDW52xePY|`m5AxVbpZmJX6K4aCLIFZ?V}t)X|mkS>!<6?JA*@V7O) zH{;M7iu5+pvikip@=+``pfwNBT8A5Gt)mcV4X$U`o8kG2x4dID_sVj!I{S|rA>pL`=36v`ApsO-oy?A^{SG;ON(zP)zY<%k~Rqa@* zhff!G_uRLwqkSjN$q(#&aC=uTwQOLw1o3jeM5asHwTD?+6&>w!XhUwx4B6K07%VXp z%RFR3UMpOUlYF;N=qy*pmJC6oUd^qctXteY*r4SD5+(QC@)*$XyT647#+A)Gp+%VH z+;VLO0lCn{nbR2wWLO%Nvdc9y%moUVazeTK!Wvzp5NwT$ctT10#VPQ8WnGqU$|IR_Vf3J7D 
zJV+L?xGyK7X?XJ}o3eDi+j_P1#b{HBGEAa+rz^5osGpzEFy(-2eUDjVl0%Eq3Urx(V;s$x$WCa7y- zCHk1LZji)gLkOX&`3&h~x-hG+@~d)wObCfyJb0?eMl?V{NSb_dzJm1C%UxF_idH0x zR;KJLIQ(LpG){}YVf-~kdMlUW2? zWVxe<;CC3kg3PYUKX?-~c=WR0%;06;%&QG>p21srUH%(nns$c<_s~TzutEq>BeGVT z6;p*)+|@WSGAG-b*?5SE4$_+{;NE3(EUqhI#QB9aE z{e(J}->qx`pqfAir`Z3axBEm~BCK_}{P&1jOapr#q8;Ua;;NZ&mra^%7SH9vk`pcC zg|kohT`sFUx$5avr|$Xm9dDSqqJ~uA>`R5S-+rC$xeyQ$qg-^nXrf^4%kKC+JHVXa z_zNieJa2>0w|K5ce8~DEt~K0=@)q;&Io5Drx2&{%+q{N5hrjPT+UJlZf{r*RrYlC8EJ7-i_{0Kr%QW>m0Pm0NLePq-HMq7QVhz$7%R-X zUP3{#LKS(X!_eNrAZkGLLSE1pBIhvnhs}Pt??5oHUm89#yV=R|B14b}b1kBWxj=oP zmdMT&==jyuBjQAhg)h0r6_7junpcOsd-Pcw?(-HVbz=Ss&#cv>m>{@ zJvZ9;^|a~96#~kG0Fnz}u_(_le0b)22&%cB>z1MD?nNsheHcM_+P1eZvNsgmh5J?Z z_AObyB5egHMK)SiF}O#H;1VnvP^AxWC!P?ILc0}nt$$#vH=1K(We?zFh}BOw zS(~*eK3dRpJkL>XqO4+6c+&opZTG?){m&1+R{w0$QW4n)^nDPq^Q@RC6CAil1E7UL?qWioCKIKOHcWCV+;BPRq}IjO~-e z00MO5otJFU6@2fv zZII33W)&PiO~>T{8U&On=X1zEqewKG=^6qDdZ7XJh*9NcYik6$>#&sO18EC09!Wdd zPk`oh+QlYq07sGb?@c?E4+@A7v?eIR9tGbN;QqGY?qOw~$L35sb2FnYqS^a2^1|$$ zv+yhLd;Wc6vyyY5ou2&@H!e@9x>CD5Rl6=xyY6LkvbGb4E00=F@F$i-HT2XS$M1Mz z?WqSYx#l3=>*aNHHu&kTuWWgK%UC6DMoE^hPL;PO%G;CW9iz@G6}6+TiJFo9EJk=7{)4jVx!L_*#+?b+nW?Xu%$$x`v7Yj*6w$7gCC^7|$aWy-!7T!6Z6eji zN)a8RGsv?$5L6M^5gYDRuuW-uUNeH+SiZs@cC7u2FoxY5jUGrl4oSNrgZ=w)#}Fx2 z(>4{4r5*C%Agn}zwdC`tD2~8?8{siC@D)4nK3#C{fVMg$G03(zEkX>e9|G;lb)}>h zPYaYuxRSHKfX#5@k)PvgL`g zfxoGh(WFU7~s2%eL|6O_wVhU?4US zU)UAzy>Gno{?Sc8DXYyY&|UV_;p2x-Sxy}~BP896;`T*sq2WH5zoNeQYkCm_khTQ^ z;$VLuAkQYRi`02cKm9M{qJEcOq!;V(Ipk`j*IarL!jhfnX*o%*ae94}URNkBy=G8< z?Sj|i+|Nx8i+!@fWVxR|wUMrNyPrR^bFAUT2Nm}@T-`)|lXkbI_B0~kXH2+CKa;Zy zEX}8+Ne(|}_OPFGJK4|6!T9!_=LfG*h*CTi+K=+|N2hkz!Yd!OD+-`#aWS+X%U-gA z&b5hl(7I;HAqpx@>mZ7ESnxU1o(Fa)HV56JLC?ni+)2>c@L=>P%yj2r0|Y3)2NtP` z7IlkW1t-u|pc+-H+?mC)0LRkj#g;Jhw=~x5&QH{Xc9t?_5OKH?*P|TnrRq7Grsz2mrz?blF zF7-Ifv7HZC^0bn16aqttr?6 zrT|g-|Io6)T{_AwA*`FTICHdKxd;3T9G^8QlCTlnOsgO|hT67e^63iFGtAzP@W;*h zQJR6q@}Vw`EXzmkY5Y9MO+G*2m#H{5amUdsFQK9IK&5WYHrF4vLS1fQvNBujFq^gt 
zY2)>FKue2xPl(^UduV{|NFkkRcnB9;EAti;{J`JjWcbjORm6Eha3 zW-Lz3SUfiKqUX|;(a&TB z?&K#h6fN0 zwh%IVUC2um8I7XEg7m}KW235hNjSqY0FN~!Ly^_zUA}m!w^x!62Kyy1Xh-edb>J)& z2q07JQw&B_*$vy`7B=KafCZr-WHp3TLm@@}qAn#x76g3BuIFau0wvP^9g_UQx>(t{ zqgNN=o<6TuQ^1L+bo=O1cGJ)o<24ma)=cSM9Yqdfp2#JXl#yFEgBhR2tjffSjLp9R zUd&{X3_ZGBZQu~mEDB&UEXH*x?20(p_2w{G)8oes+^ge9Qg}0FA=X9`Grjw#%ohEe zrl>sZ?b`#3AukR_kmP20CM`b3SE#^-Df27P1AxtpF3IjOg(*Ovatw`RfSJ5wpum_B zTB{<`=pg3QhD)y5c_7Tc!7^pm>y-^xJhkx|WEN6g3jSb*_nN~}RW!Qk=LMXnE|YA( zG7Z@-*HPExP}y{is!Xo~nXtnO*ioDZb`&c#G=2zn;D=xbestIYw=$$Fjh%H;0(13H9y=HKEbw$WD6< z+JWrS^I4nAFXs1GsHAhIOgPgn(G%3>%$m=ziQlBFh(Kcd1jF}^F z5!IS^E*PQ>rxs7NQ*63y5PfRNn@YEMmr;i3FycRm`fKKgK38mU*I^u$(huA-2qQo6 zLrm!d_#FkS6ea+#V-PmSo0;(-w)@s~0AqFNS8VjM_&Zc=ldhUJ#5%ZZA9irW%KMSN z*|<%knPs1L)9&evn~l=8{=I`hYLT=>ltPf~vt#DJNPz1+`oQN(dkC7+3@}?uf`juYq={a7-f8;*b%%1=eIIZ) zOc$ZBV5WW`!|Exno+);Kqa~m_6-0}3U-V8fJgsUgNC?1Sr>YIB4WPNy|D=GV{tr<9 z2UuUoi&4%~1g+MixktH4p~5ov4HGUPg3Vn)#ZzmKuZ2yzt06vb?IqXRiMo}kx{Zmt zjjuVAb-klov8OrZZoK4fJToKZTbuB$J@=u6uRZ14nDA{(`LS05#{^E_s$7z&Tr$?mvR$31Tpcf7 zJz2^X6uo(^ge&l*Tn!0V!!J0SW$qQ1=ae(ynteJp=DRfS4m#T2ktm;krZrK%h>Y4R zPR&iYy{GRxv+?;rYR<~UoR#OQ$LDlRRL(fH=jjih9!^xYUMg;#D5*SI^K{Lr4dW#Z z6tw&4M=ll5d2_;5I9Y;1u1CD+{A2BFJLYh&@a?nlb74-q(1oj^7gXT=ss*>%nX|}1 z!gTcHRsKEB`I-6;3O%dDmai%ru|qUsQ`LVqrv9^&ctj+1pcCIN7G@G%a2E_adiUc{XQ3X<9R>Ax;W!Dzlc~bm z15ix7m+}mg@<>@uCdmDVcEjk7B=^dMRK6jvAc7!s(=?Ipu`;9TI@R_2G-t_v92mc% z3d$cB?NAVYY_DY~Gi_0(H_0HfR65k%3|kWZrYaMzJYs_VU4GHCE_ooW*x|f~DUO(9 z5Eu8G0+cU%up3DbNftp%G3Nm+{%Zb^eE5hoz8b&(e$*H$BhDyi8pV$;hDPlCqHJAk zF5e5tZ5K^7c?H^<6ape~tLR{O(*svMx1<)8fOh(Oj2hHMD<(ZXY2n1aDHY}7#YfodxEL2NTwF+6!$ zi|H$hEmPKuw+~DcNH03T6f;q=F(=uE!Wns?G@^q$6xKz-T|G>tmFAuD=TTCcFN|5q zGJUo8XEY;X*7tR8>*;!k2|dz+h&xg7MmX$UdokhVJ2BEemty>2RW@OzOYa>Bl4dn? 
z7t}JulV76}tTGmB@BrP|%t3X-+&`rN3zRXZ%1Jv|y8?&~y)9e+ zv?&U;u~TixU#6W)c|`ODO05(m606KupqT;Rq($mP4oA6vw3kf^f}>)hYQ}Q~&lH@# zKUuYCv>V4|I0~O~ecW|oIAQmG!ZlG+HfjYuj1E$$gTZ927*^uWp5 z1t;tikjEdsR5owIQ*(0b(_67^YDIkZk{^4P{-k^c&TlNQpP18fdgoXC&-=&h$vLY| zIVY-SK3Dim;hBTyVu`t%$E)t1tkf%Zsi=O^%N19Q7QT6{l`Ee83(n-IxKdXB@ot!L zJ1X9uY~t#?soIuAZ3}|R>;Iw3i-7X_>yZYG%129Z+1@Lyb30by$~_m^6vB)f?^W(j z7tZ(R@n#*L1-S^i(BD-ub@F2rgwhBqhd5{)n^%Z_rdyXT)-jca7F0b1c&h|=lF--NqZ6GnKs4xkY3+$W#N z)MzHmmiYg_xG#Zk>p1Ux4|upi@V>wcBzRvU#Y>`2ikB!+q%O;r6PJ-iS{5yl$^&#U zR7@*wO(kwkHAziv;)ZU#H61&R{A=1W+wNAelg4d#TRegeV1)Xa-gVmDZMTIaI*!wB zcmLnK3j`s{Zj$y%oEgl0&o|#S7x^l~gu7#SSOY47RUk~de3aA=pm{Ac(P#|e1|FG;wSauYUK$ie&*~%BUNQ#Ah`UQ~Wbt6UOjS6Hq)&2h30=1ZgQ+-b zEi`4#l|?yN^Ig|7UfOeqoY9@U!PGWiO7yF?>rF(yYKvNpriQsDh)kLq=0VYd6}1~o zwR6W;5nL{5AMAD5yW{Ux zgu3>K3gm}#jLt1wzT^{21c%33{3PKFn(%k$GRE=CMw#|;#SCVUKL8xUT|om0?)f|* zlHcN=<$3OkCGiATn@K7ukorqV4HlVU1EZlB8B|e@wA8_IAL)*wSXvH7#KUxm|6(88 zc)C%`o8dKl{L}47l>h^x4g;u0g4n2Y66-7i!=}f&Ff(mx?yZ96KiFl#<#6(_|iv8IySEE8nH>MEb@w8yO<-!Q()d?%$lT~ z1CR#;I=WC{5pq}ZEjZ#N-Arnx5r+(n93MZ6YxaJ`gf1>0n0lDhx2VG)Qz}6wRo?1H zhKEk#hJDOGE>3J@OrLtg)gIC5FzeF%Ep0D1(XlcS=8l=m@=kYyPiD4Ja^_F(4X0&) zelI|htm05+9qx+rBf-q(>7M_QTQp+=`^{|D)opV6yL=AX_@ZX)~gTAJrc~UUGdh3yq%J_GwAJ}>3=KR`)*$0>|`*n zcHY03*A%X4TQDtT1ZuY5NRw(hXZyqXwab7;9M>G-{QJZCmErsfJi?aizMa8k=B?Ve z+#)Tk0;19gup;1A9q0DkN=Ab|U}yB_I`Vey;J;hc;ej<#t9B0C?_#G5qq~7&GG>Mh zR9PaK*%2^RED?Oqbx%Yyx>y-(~7q-_75@y5R3&Vz3Xht{9k0y1TL{Tv*GPF2ahi8eQF z3yBeLOSD0))vt3=h-0_T!3ly!$zQ_s+BK9eDHjFpl46?3quqXNe5_8J^%^!!-q|P8 z;s(0RcWI&^&Ts3(PznhpN(A?Bzcc_X4clp8v=uDwz)yftY+WkA|1g_4Yv8FoHfE>| z-5uofN5fBv49V7M zWHVbd|E3>Wn@K9OMN!~VRm;6n*pX!Xu9Jtk1_YyBpABXh^92-_gutelq4OHZ9P|1u z@)iDNnh^Ld@nb1^yqFl;4HcaE+I@c%yK?*|R@~zRh#6$M_n)$ywi5oHzy=)`te%Cy z{o)CNQ})x2OSribSdl0Wuo`rL#(fE6a-;6JX!tqF4{5A$F%Kqapa-Z6e4?itba_DZ zc}aqNxzW;uzU2vhL7#~K!1|{^0;~=-@!=fjCqY#9seLwV_1CL@!!|u!Mw)4hnY7^~ z%CA99dzbvQaO4NXo0cCSq|uFPTHU_D`%r z)IMX_H~S@rzgrFU_o}YZ-KtBsCpGOvlMXeHOHTh@HO6Sa>eB5=O*fF3mUtxe7|<^C 
zD>1nZ=((saJ+7F8{ed6-(7w}SBj+#p52(374TTL#Ei#_dcBjj{!1)iU;iGgli~IDe zI$(Zo^hjJVeu5swj~-p`Kb)zJW!9Rp%#In$q)Gk|@On%t0ctL)a&TTn9E>6ZVFR$3 z0)I7r+{aut=%N5KB|OC$6bIY~SrcRZ;|DuD!xzUue>%>Dl86jd?m0U~_%d>XCju0n zhe=fgh}nc$h*28~`4NOG+Iv5OB&ST!;v@Ou$O%ZkjEF-@(Ks)~V?!boBaPsNph&J3 z?mKp3eKDpS*Y0K6l{ zkqgJy+->Qg`Xc`B{0)qF;dcGN!*D4FA);xYE|-9C$|>$s2Qv{NHE{|YxXMKk%BoOd z`XG#^9YR3gCg8kQx7xepscBD?uxKz(GabDn5#x6@4p;^pX@XX$mSWix}EKemg#LhKPHREEt2x zl{iuL@Y$26hKIlqqu)#tN8NLdf`BGABOXS};%7w6(l+=JE`R8EWz!qY6#X%ZPJA>Xn~3I^iD|Wg+wY7pEgVL6*4u=(Die5LxQ>bVF@~E#y*33$ul-BbLRLLE1`cZ zGiNsaYwm0AujB_a%ORKIa6Qw0rTyux*Y?b}FJv#)>hcauV%$iW<$!qMw4 zhbX!M#2bh!W6wOq6taPlNtCPf7T`aKwCCT(L>?!y)p?;HOty}kKI6Z*{SpLbj?!8r zLV-M+jn8DEVl0t zm)BU#~fK$AjlJ%_Psw0r!vXs&ax?e{A z^x!QU9#vch&$9J;-BQX<64Zh3s*y{ro!_^V(hic{^xy}+7S#E&b5||@8~g_0nKU?_ zH?#;}7n;m3@{RU+W0U!1p+)!>kAPSCCiCy}EyA}uHsM!4sc`V4x|_7V9^(#Yz~X55pB<`SOef zUsdCCHwCTQWT)ji_46r&HR&4TpdC%^z+@=B7; z4jN|<2=N|_|LJjvmaAiqEq`3o(}E5B*qU@*CjqFR0Mrg9uVLb$5ij~5rE>(^#@RD# zWgG^?KS77cT2vjn2!U^+OjD&AhBd0Hh;Ysj-z@@x&~?E)SU0+C=e+&!&AjS8A2&Gp@=@ro2kd&JscL-6W zUN;8vn&m>wTuv_Zil*k0UeTo7fA$e>+=6xdhq3oZ@nJezc4HtT#p@z! 
z7s-10mMIdj5pgNti+sBjnRKT3w^WpiNZQKfg7fCl5Vj+C%; zUERCx>qYnV4&jzQbg28_k?w;-2fGjLhen6)h+S_Famqjn!7uhwciB#!K7)l%3Jj8D z=MWe-&*E)U3$EB_z(Wd$kDUgQbU4PT55XX#o)!^1I5*FZFs98&y4KY=vjG6&onv3b z_AsPyafBZ`qeztd@VL^ci;$@u_xYhQ^2^i{lVq?#q`?tSlksWlVIx#wFi=%IPkvt@ zKZjgHCPoXa2jg@d0v-cgJ8~1Cd~*^JeJskNgm2J~jviKFm%c5mVa1%J`ERK-H zvuN=wyE4N$`E&m3{_j1?)G%H#u3EX|tgl{Ja#cLK z>urPlJJ!+{jx1Sge=O{MM=1E}BTGWrieL!|S(1?TrE^O{At{m^xH>S^`Q*NLh141E zl8~iEShi+`v)rMqYALH4a_b>?gXC_|ebk_+gEM8Xa%Ph`n!=fFA(LA&xxczCXsU`D ztfsvX4Ua;X&tCr9r*AX+j*&B0%R$i$Ex}S=W6-oQYDj^O$5$8^{y~0e>%rUL!#~JV zk|sGUTF!YZUaWn-cCn=U+s&b-E|8W2&@ouj9ZVgNO1fvgf8qU2FMdj?0|Cpx)HxVo zt0QD}OIG(%UeiJk1nZZqoh#IHWs8=wm_T~=1)z_~DtFLQcIA>##k#nD(NeECkocFcTYIo!(!dujSU$T~aHp4o9_$LyvhM`_rZ9C8*) z&cfLXZ#pZNu>~hpN=cRTIZH_!!_JcF&SwU$49vC$oh5Ud-gMT21{Y2%n;V_`WT5dt zAm!j4qk&XaL+@rZmCG()&EXt{A$x^nub4Lm?X^>KUt}*@vR7=*XL7FO1k$`gTWQpg zVXJ%P#BB~1HK>evL~A)`^3=+&cLn4$IkYCu9 z=(}VZgZ3uyZwfK^e^7ERnPX64=EOD7wY=kwmnT3C2kurx$)WoL`;aaH;OC?`1WCwo zfUJqR-EoOPKo#M!?nY|WRUIdyKMfmoMIithiS)=)_3w#PnVoec)vep`O+;Zbj3%HIftE_Z0=_Y59z#D@%Db$ZWT`^b=x67*p` zmw+yWT7R7Wd_b+IZX<_8z5MUr$JUpXJy+@DI zyP1rjzx5T-l~$~K`t@bU*G@kKyh0CR$jZtyW3;gBPA zsW?iI7Vut&BAMz3ErbOABl%2FRsE?l_bM+JN8GZ=l%f!F=$KD-0kZ>s1_j##on+(i z9tS@IbnuN+Wk7@xl^<{c$x8ZMj^XiRU|QASd?h&uY#__1B@B=Fp?cOAaVw#q&>bIv zib+Ph8#+rEn@E}xMzj*3cg~!JPC=%xml011mvj6vh&X&5C=&4nGX5isNSe zhON`Kl{Cd)I3sSKC}!$8!MH&E0#4wqOl`Sud%YH=41k0$3t2EUaKP2^YSDbgwEs(& zmXb?fE&4$GYg}=m-#6K(PRJoPA>;a2{Fd7Oh-u{X!z06LH)tcE#}FOJQnXUO200t* z95IbNj4>Y6uOQZ~9Y}qw5)A{mmi5kOxv~+U!HJ~HlL>tBm}R8wAJ65;j6OZ`Thgg) zvI3jR))XyCv65=~Dmq7`E0Opd)$>7e(GnDYpNuOoBIzWI6W>EREs; z@?oHuGD8QpWr_ymz`X{LM{ur2=^f-lj|6{z^MroEx9ttMc}dc6EZ$U#+RuyaB*W>P#P_F zEII~?s7d(#F>F<0IbvWZVPj24f{gvW4p$^?-CXg>Gw-)4>*vY%HjGFX$i3sI&Y_>y z?l^4Z|1(9SaboQLI^Gv9g{ZT3^@=}0+^-Y<_W}%=Y-8urJ)yK3DXnH|FT++ANc752 zRk)xyRL~^hztz2hcY#K)jm~}&NKF=}Xwg!%oLhb!n9)wDsdFi}>xye7w_w@@(r-6^ zKQO6=9P?XUM;5xCPMU3=@A}rBm-hV7+7{@dx93cmz!K(|@$gMv-(rt44Cw z1YLC@Yn^1R3)_-Hw%kPm89gTg<78Sk)nYDmQgt`geo^nm74=w_6F1Y 
zr~2P5X$fa}!?{)A%wllCc+J#FMJU*j60(;`_Og(@O|rKI?VCfw=76v{S_+L0vv}*M zaFaknB};p{hMn)t+!gO{k(rs^|$9mV^r1rNVYl1421< zQcfMlg3W|2Qo)u`LBCYcpCDC{H&oOr6}5&T1DoF<n8YE{;TDtAkjyA#x}p*hs>fYk7SUQwE_AG4OlsiB&}Pnn9UnvrN6K&A??#bS?&69ZQam)od=QIGj=t zPRR_V)JQ2cVb6weQC+yOIb7EmsvD5%2Ew%sq1t|_wja{aAy2d9X?`!&2E=*}Wm~|q z_27jnXZp;zctX|0q-dBBrlHkx_{}2UsAC5Q=4T4SJzC+y|U0nzDU;76S9*zi*i5tcR z;thhT{t^aspd$+(b&>{AWq|)4_n;vK-cvMj{}8cf=pOW0#GT@nlhR4i(Nxc}C4ctt zT=BL0m+)*NR0$PbEt)pJ3zhEMu56n#zj7p0w?nGivE-a=YIo^SZxxC=4H&h zK|b+cOep0V_kpIVq@jCgB020e;||FVQWW$TNCP&d{|bu@@O=Y)4~Tz_U*Zyl7({B2 z_$G`s7uDRgqj7fZ~Kg-#!v*>6Kb~mmIt0j#oM` ztaad;z9S4Y>$?Z#{X$>UC?oz;=}$}&CPDv*n?aFIuvbj-=XkM3rx=rRr9|<-A`dC~ zM5|S2)khrRIg`3efdNnxr%1HJX*D*W#02|9BHA%z(2Qa^vF@YR#WA(U6Bi#+LrugZ zzJ?be^^Qj)Cfp~L_iE|N2)=8m4VABq?VgEbPbH8j&^^r5$$iEX;cF+1o=a7FA`MBT zCBpUpI7SHg9f;uoc_T~XfMe!QX+GV65wY$axyTrL#C_y9hUnsV$wfC&%t_+diKMa% zwS0$W74k*0WiPEv1u^Ag>Ef8{a@6}cL_kiedk4LfF0P=EuhTS@Pzs+`Y|m9PaWcSJdUCtnyYEdJV~*B%X2 zYzh``4ivO6ltOGaX(w)DMtXh~>^itt6t#p3yQIRdU|~-{$XlWFzg^0259RNb^7jU; z_@4taYji75n^A4fagfcp9M;?^|t z*;;Cbx#sL8-?$I2h-xcH-BrLpfbNpNH9HBH=mZtpVjg`+7(-np!v?1Np(tjpBv=br zA+#IBeQF*!$GAkRC9W0;PD?chtrpsOnMiw8;!@N+V%c)zdMGEZ;`k|M#qeWrqX}ZD z`9dPP0wpKl*nng!r1QY#!6o9`cm!HW{l#j@&{eMNpMZ|Ms;{~SAs4)6^(09ol&N8f zB!~T_T4=fI(rvIRV_DJ1MkGO-#Png3(oGOT48^1&WRiab5(kePWG!sOQ8QX8=NW5W z;6*1ZqdL}Cc3-miHJb0Fwi}Ch&uh|z{m=58aSYNBO5a&8S%+Gnwc3OQ$OK82kz7ukw*`(?mm)iDf9g=TA55};&^N$$(Pu_*XgIyA4s|krA-5P zOlL%d)=%P?1~{II?e^hu@JjYU6`&}dJ>!RT&U=^=vC4~FasxE^xl+bnt!$h+4ti}Y ziMCNX>7bj+RW#=WDo1^EW5PtBL3@P%M1+56!qQOB{xN}^2=ALPwr|)F;oDy_$PX`6 zWygqd45Dfx-B==NG1Q5nGpEi%DwAotkWZL2g`YGgQUnqtYpE7$GCE`;*-DD|NbF3A z7|%kD8v!wR_XVt@#6T+gMFKR%bClXFH$GxH)Yp5s``|v(o^pzwQlrQ9B2M~od_?rK z7ZeF%!%4_^PYH&K*k~pU(Zxvo8~W{{;}Mh!@_`O+83evfA!d?D@tum;s1kI<`4c{f z6gxiRKOx~C&|rKDeK*B{lhovpHF4Cwa%xG~vMl7xW`HY{p%M!hErl;M&G|zW+og)_ zLGO+m#f#p~WovH8S}R#==MRVKx}>_UC2Kc}PKV(u#`&&L?QW@dcd)YWjgH02eaqIu zkhN8^wqEy#+73x=hnB2|-xUCR6XwYd6yQ@AE?-zq#*z80oR@Nf$s4Z-E19{^9=&?> 
zH;3j*=7wK9`TWU1MgyLON%qe51(WNi1*&!Oll^y~f2v+?!M-Q%zM~o-IDD(nEa==(+h)L#CQKw%JPS1`dG~H!1)k)|tCNJBsezfUWqaQ20Mv?=O7^~mdKe+0 zZ&B#`8Dl!VT+_mTlW!7!&)mYVdG>@@O<6DRMUTSBos9R7MgCyb#Uh{L=Bf@lbb zL>B;D)lqc(r{+Od=|mt(TrEBXh|y}{P|HXpPn)P7p8%9Fk9yR)==Nxd?p9rifElblWrm=n3N_}WJSK3EzhGzD~}sF<6Z@uMbBmT2FBwd^g2;;U|1GwjDhbzBJ5nAACPDW6H4k zt4Wi>nw?0Mtb<}(`lrsB&%qr#V|ZwHdfd8xbb+SmjH?iSzN~mbl^gcA8K{+GrWJc# zEB-3!X$&p zJ5~mOE8=31NvD9LzJxkY6dzLGzmA9L%7!8GM$hdgvF%gigdku+9C-vs|D)n9+FaSu zZ=U0eDS2=VZu@23P!l)F=bR{Z;xo2?M?S=vn{p_<3-e5UY<3esn0ZkI{j} zf1y-l{2d*M%jg}I_)FB$Um6IuYUa|4=Q2XYEmCnyFtv4RPuP+guoQ$V8mC<|lajUc zU1;vLIziTa-(*ZNzsJDn37seri(+PWr?|t;4WQA4oMn=;EaYsFoGl?|ujK3vJKZ3Z%4m-&h_^(IX+`#x)XcC( z<4DbXHuq{SfHP*1tWm9$Rf{SBFOt_T<+TU0+m|W!PN`@oqcv4ZIhEnuf>3U?lv}Mk zNN}lEa@U5d>qFK3Qgwg0xGYrME)}=mDn_B8NhKHD^1}5#mzHavPI|upO?{uWw}&^H zGQ+OCkQJ|S=B-$r&)Bcnf880~$0G`(pKODPR6mVUqMx!<%(zAAe-?!#pe_FcE#W`= zp2ZBfysLS4IsaNK-`ik*t<6a8_Qu|J#wSoB;O^1P3Dh zAeryoB>W&nAa|x|cPjS-H@`bs_(7gP?xHmEZ{jIFvWB^dC!conZl~o9j^Ax`zQJ3_ zZMDGvjbt9F-bhW`?KQsPrAp9akEiim3?6%|X4}&s9#-h+tLwym0M-%+QF>i7#Jo2GT}x+un&>R#Fu@yi8Y+%R!>8z`$PZAX2n37d zp!teUN>W{@u@U*i0*Y8jMiChv7?D&xwk*hbLM$f#WZfUV4|T;Ql*CI$DH&yC5MC*g zDvu7ms6l!^FvcPQXc3ztcYDq!@AnS&TRBMxV2flbF@Pl)smGKVf*|QBpIAj<8L?sL z1Nli2g6JY%B}jjH<?Nb0jD2JbkTFQceliY_agdBdWE_TZ_x7kAO}tt% zuUh@xnyy79)e|QlIeDBt2g6v?BGvayq&)}2*xe#k_e#K6DoM{7opILfg#jNBNPr}zlv zo3$>oyziW#XpfR{l8jL@9wXxvjEKB_Da-pb`M6j<8fub(+7UbRl-YOBP%L>PV{Fy+ z%p9TAs@``zcu7mgI!K zi$GhRRzevn>6~8nxkkL;#J1mQ0rzy8(-V1G#4`V(%UsW>1CnjEMnC@I5kKr&O7J14jls zq2gn3VCV=gfYA0ac<>ODuZU!`%HwzE?jba~r>`G+EV}v*iqBEX*wD`Vx;oS-;+vFt zm<)FDT_INz4cwoTtB_m`WNaiuQ$!VWBmN-;5OskO8yEtXp-knQ;$KncnPnh1VwaaL zJJP>Q0YKW0RT=4hc==Ua0-+)fVqjP3O%W?z&XwOyBM`Zvp+V+Gwr<5q0c9K8#bfYC zO*!942FHPZ%=5S^^2tBqTtDJcnCbWtmvxid@FUK~ekT2h!+%6@+~f-1<_3b?z)i0D zCYSd%S1obXKj!*=#O1IEX^8nYcR0iymbk+=xt5#U#<#g%iR-<|)!gK^-sIZe=5`0U z-8VV!+uW`@YX7ZeXL4#H*Syc^g*+a9|tJ~+gr1Z)t zB?C#soIbNvGI?$%;ipm*1@u6X^_sHkHnOrB`QmV@Cu)SPltnHoD{3QGf^x_^rX2iK 
z%AtTqc{6XHzyA&g>sA_PPya&RlzG+6^KG*^s~oJT)y2o=clO*6gVf8d(}9y~MxJ8I?m)|vC<*BbC{Ug(m%J1L-*0yfP+w+O62Njv#% zBELq;6IP&b14s>ptrXBm0d2Du3TO+IZdn+XN_SBJYQ*!+Gxt+KbD+3oL6C~KQb02W z+_l`MnKnw%G<#{@FBRz(-6Gd#3tpODD3waKQ%+kbC$&{`N!JB%8IwP2+2&bR+0B9S z9ptx};^(psd_jKRfqf_o6zaeTc^HsCxIau?Rx5=VGY+(L)y$DBa3@D;KSmA#44jwf=hcLS3L?H}j=fAuEN=w_V>CsNNN|nUc+5 z?KdRd;cNy27{3kaQ9~x5KRX!ZU@cVKCW~sAekYS+WZ`GDL91ozFSeFZi=frViqg<) zs82edk7*a>U@t(`3M{!o=@cacKX2D2@vf*x?c)9*pBps@eCz!EP^rbYQfxQ#06VWT z410on_AP^fFQdSmD7m6`JD(d)E(jEES||+^ZkLjGM2+xTF{RIR&SuQ?Nv6U(CU`{M zTKm)E+W$<_=fU!ww`k3qJsGIpwjc(ob_EJLS?~W`x zGrCb}aKKp=uKrLJy8q&vT{kOGtF1_ z-ln*eZB5ib{x}%qIT}c64D!%Ml*YHs8}W`f-$v5~7Da|8BDkW}T2H0@D|;%?&=V-{ zT{qPO^<9C|?saoL(7r#=a3E1|3lE35vo7wKK=?WTHHgHetgZ(t3Oy{_KmNe7^^gJk}@SG$O+UDg`q$Z-RUk?eKV zCQp4 zA9L3)yb`&5{#<>OgSBw}Hd*Vc0Ke!aW%#B)@scIb&>gJo2~_k>koTeWwPeSri5;Wr zvBkzJ7gIZo?bXN}-r7K2$9J1wZw=J-N!~rTnQu(J={&oYMNzeB5Ek@?lp}Iy?CUVZ=|71zr#vS!%u>ql&$v{TrxmyDj61)3@f%|{KQ!;SVPAApfNv; zXQKu#4?xGEtr0p7p>NR6x$9?hLb-K|xpniS!Q9PIYq$cIwg)diIO7VMig4`r@csNP zlf%>)&diS*VXtI+=CbDtrQ%KBDf{D!KyinZz3VnbK*46uEdyKtg3QjRK5_XIfzTe-W*a#%=bs(=5a4bdz`YJ&VZiyibH9!+owwZPU@@rsRt8+rPpPL< zAgL_Kmt%FupVv$l59;ZF|CwDP+!@J&e0xAiq5zYzdpPX5BcukuD%?3Y#)#Z2@mn zAiFsrw1iFW*$3tx59G^1sWaWPTLbBp0ih}k9kdVD7WYZEZe~&F;jEy~H zDh`^8*M`aqTweT?cCI<(-2o<4THb7BB1s7@K3JJpP_Sr%dvB2kp&Qupohd@LavaP! 
zB~FPP$7Sa;<_^s_KmTBqgY$aVLi4M8SIPOlyN<7?30nXDr}%t+`)v*e$scFuOI-T< z=6ni;5#654yXLzH>~UQ`v`W_dn>O;bw>cQmQNEf#%)fd#%8|V=yvpq8gFtw+yyU0T z9jF6nP32pnX?qNO@=RZpBYUo5mD$k;g;6745>79Q8p*NZNQs)rfj!NU5ydN=a8l>3 z+L-e}i=9uob{0+Y&OLDLacWe`_0~Vy_UbljR7%usRAtt4+aV<)uH zy39}X0I}sNxOOC%RQg2ETaMI_BY)A6A9NHx(e;*4c+I~gl&%P_sh+3yT;B8fiEE|7 e;DHw*Rrkv literal 0 HcmV?d00001 diff --git a/entrypoints/openai/__pycache__/cli_args.cpython-312.pyc b/entrypoints/openai/__pycache__/cli_args.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..759ac59c8a1ae673bf8974eb6ffac21d81f736ee GIT binary patch literal 10785 zcmcIKTW}o5akF=O?*r}t9A1Y97cYVUaR5nxr1%mE5gvXBu&heJJc>{`F2iM23`MHg{zyemRjQoWl~Psmai^1w^O~&CN&MkI$Ye?> z=O^8>w~Hf0`SFphy58xT-kzSGnV#;R``4-}KY>T(zn1*d1BCo1BG%(B7amP>gxn)C zi4&QV9V!>+*xM0zu(vbrgtt@WQ?9s+LmFOnr#x{FE4x&0$`|*svRn110`UMVJJevR zDqh9P9<@3ZiicR)tAOJ9(>}R4sw>rq1!|bE8N@}v$hhx z=QOc+w-QnUjJgJ>Yu|I2_3xF5t#`afu2Td#q6pKxTz{L7@0J^Y^JFDYvpnsP8=-v< zqjktlQ0|*|^ffR1nSW|FsR`3bRS^=Iv@RvnnxM}jl1fQwSx}Q{MUd!BE~TV(Ak0wK zeB;7}iC)crF{`9U&&7Z^E9uE=z(iB%4TTO0=X3%1URSd4P74y$XC*4>AS!l+Y~)OW zF@+B^X@53Hvl&g%2K~Q6kth2&!+TkIBd4Sj3Y1;bNkx^R;JJ|06)LGv^vRMgB~(e% z0C9u#)5)1WhY=dp=F^Gs^bANoN@p~qDwD-7BvsVsvkFk`Iz21tQ%WkUfxO^@?Zjphs)fY0CMkg|IPE}r%^jWB>naX6;7bOb% zn~>7dj6$I@kW43aQO(SlXu}y*xrJJWVKLR2^oh#(W*ZDD9?b%{M-&3vhr}H|B70 z-IInJW*mlN_|j5J(XvuPG2EcooMKOjsOdDBo=LbWdPPBSA3SP>Rez5xI+qS3^jqISFB(tudW+|xW0A}?SlIS^(Pdc~j?9xTq6e+E zxv*8*es>ev$dN^Eu5P=|D#@-z_t(ic9NQ9HC2#$m&Rgy9ZU0i>bH0VS_&O{}y=?Jd zRs$9v;K^HMj$DP4U@7qBz+3-ezHf3j$t~wqa+B-xO)j*K-^wZpU6BQIVF`LhxCZtL z%$BD2IVKHaa3>6IU}4X>G~3g{`O6n4(fq(>XI3Z{Q!`YGrPyB7jtUE17A3Pdrp5aV zkyD5X?~=Zt;ZJH{F?9*7l;MQcZ1|?Y#*9;%p@vJ#s!82&OIfgzGR0{(_&F_;He6~( zk~JfQ!i*`2jBI_Oun#?&qDwk3<7EYg5j1#BQKxAH7*L#Git|bvk)IEx7jd)(4~8*i z@VMCw9?bDg^K4lXNUAub|{<7#&j_AiXOX_JACAs>xY#gHF<3) zJFm}X(nB{?H8liQlFnx{NjL|FGB6}5Ih0V7B7CI{X6FsBC?+IT6~%?-owi`mYT6CG zYXR8G%VevX1Z!8GE!8)!oVg#UU!DG|#K(co`?W3k)}zJRWBK5*hy4R9zMn^;D`TZl 
z-ReRi)KLmIl|r?v@|`cOX&c>dzgcSQ_-6d=_=fZA;$8nLzbco)&6{DN5EkBP{~rH6 zx!7^+C*k8;ysxI=zR>+0|F`@bBSm4~>bd(ZeT9}o8|MJ5j+G*9o00B9q07ZytI3$a?Z8-hKb`xQaBjZZ_^MH17R1;cBM1QY3os!rcq`t`|NW z`|wb{g(-GOap%$f2wDL=f*+y#5a3p4pa%-tivVtutsv>? z`y|lr+W#=zv>EO#gnPGKfNi-+q;<2dr%>0k6et7sCBpKCJhX zrk1s-y9p3e$Ad~0-~k)2W_ZxV!FvnIRkz9e)s{7ihm{(| zRpt_^pb9@hRa-$-Q$bbx1XW!HRb5#EM?7NH+9QqFZS~OBfW6yo^>$lh#i$#~wTvjM8IQ(W6g@soCpjQ#* zp0j7zF4!9FWV0vOT)Qf`cGz4(a_1Ae>$bE6(s$W?_prXZZG1Pwdu+Uy;k#{o55s$H zypQ2~%D5xmUm;6hxwb#Pm+5S;O*K$KHDJ~%vAI|6q1uJ`Pz7bo{(hh0wdZrNT&Q^{28YAgJt|k{3yc@mGRiz4O?ux z5I@d%9JYC$VE8lTZ=RKpC@1Bk%1MYtj@{<%808ei8OM#_8Or=@8LX1FFk;!J?SSI5 zk;HWQ3oE3UH2B$!rYBGB!gDm6p}OJB!70io)6MPYU zXjR9N7skWOn4w9Cc76|`=<~q!X)@9C z(C|6pyo%qvfB=WE-D^q#-$4HV>h+I4r`K6Uk`<~M9x0m?uPgILkhwr*2hON^B4RUm zDW_+Q)+Z`O40K^0Mbiy5T6CJzbdh4T3GXLp4IhM;;9W^+-Ke4v#bnZ&BGQbi7y(UF zQ2-2RjUXZkh3XZU#40qS7F;DOQkGQ(_K9LdsiNm#H|Uw`N?H>o%`AyY*j<9PEv)S* zreFysvowRd1GbpaTppgKzoaZ$bJSRF!bt$08Boz|CJ7s`OSzR$bF#uJHSy-Gg1y`A zVmgynjIddQKBXB+Qat>OT2^h=h*>oUE8hsPIy3iRgm71z=>U;UM<6BLDsT0)5wQm( z!L-5}f;@y~w3a1>aiy{l@tc~E!4FTWn$cp-AWLolD@hj7l3d5}N6U?{ni7hbOq&DC zX%NF3?d3McFS(#lXrIoA5DqXu+FlwcaVDo{bGl}<%|eO+B?eX?jH*By0!$RFR}DBA z5v$o$G7K~aYm<$cMGp{DgJ7r;Hkp~UI?o~p!)?xT!VaWCnP{+>*No)7wIYFRoaxu#$ z6_mHb=c0g{|GZ#_z10GNyPSc0AVMY010fl&j`2-rMmk+6js!Uanr`A*2p{!wG$cnbgergfsvm zJ07`qdv52VI~oCfzs^cyKmx@v*Y;ToOGZep>?MRHNZd;vST(LWj4yUPas*N=9Xl!k zmFwjOt0zn*OmmR(@LtM8IG&%uQ29GH{dJ&5B z3y8bYtnD(E-Ed4oB8ph2;vCB-vpkcAH3@ofPMX>S(3(c1DMlpMU(pgRa0$*x>0Y++>{L(w}i)i;d zuNI>RRzi>cU_YWTVz3NIq$ScGO!NHB2`QYK-hCTL@ z;Zxj?oy7xVE1|XOg12|WUGNUwAF)wy!Ha?K@G)o)7Q8*{v)I0G*g~fZ-dHJ6yP8>7 z3k}1Cz;G$NYdu&9$2Mm2;X|e2gBD@3;NADwP1-v@?i}9iJYDEKUF;k$ww=j)qos!C zd$V_E^SkyJ8xE`l9#)4}TUM*`ZRhjB7q*Ul2=;Dt zZe0HEEAPJYLEQ(5V(hv6!0G&+u|jb2Lsv043Ej0EfboVKR;o-_bd(OmB+(HB2NB#s zfCir}Tq7ikQaYW192RUpO%&+~Xfm9TIHJ!Y(HH`Z_USk_@Yi6g(KCpnkHR)K{VJC1 zeN8VRio2Sn(dZPGt{}j4F?|`qRRrid(^n8Muk|%3L0(Q%4d3bUOH<f2bQVDpfWa&28=BE{1#UjX3m4CxJvVt4(*KiZ&Ycxs9G!Ze 
z&LQP>1S*0Qf;55*f-C~aLKBLKU`i2SE{%@f4wdj-VC65dZzwfHmrt+Mt>|~6MLt^MFT!m5La=^Z4XZax-cz4A9Zl{>#Ni4-gR7&| zJCNtw*ZCqpP~v+^?Ol1kepN5>T_wKtXC1r0zw~j(`0~ZoskN@VSBw03iI4svwI!i@ z`RvNnYS*2sYmq`#TaoYHa&WG?hx{{dRg3@4cv7HrNrCr^5zF`eC0Sr$5thsgV@;6_5`?X^8 zUX-WE?=A5!lv;)5b1T~F)SZPQFF-#(Ywh@c|HrK-md``R^736@kw1YF9t8pS#4N?4 zfPEm~;BcPrUYCpf0aQZA!`*xHeA^l*8(2@2y8H4xq}7UiAGY>B+;=d~_in%d4OiTTAP#oZ2Hn6&w}+0EVW{lt-RwMB=sXE?d3kN* zZoJ5!Eb(JcstNnq|FC0so^M*~F7mr^z^Gxrdvth9Gup6^^A zEAso0&&b1Vb(-Vpo6Pea>!8R<#LsaLh2z)}EZgH)ao}OAu-SUB(0UM;8;lzJYifg` zfC}1hxL#1IyLQXb?`p`m4Q~;AKQsAjd_QtR%O}`kZXmXI&=;ea`*^}jhLoe}C5Pcy zf-5O92k)Ja%xzyw$>0UQ&%cM(3?8^ZdHNSF*$uaH-uHdKlhQ8-w*56ZC|5yJ5)J5Y z;~tX3@7Gwq6<6ttql>ZQStPcvS>ZW9)x)o#WZqe=ve;NwX$a)aw_$eTldN*Y`zxSNJx4m(Z>v)}~|3V1Qj zSV=cB5DCMLxD1!=Ik?kFu!D{De_c_s=7qWCfeGMa;vvBEBE{&pCt*paA;j#i?DPRbLV-fKnbpIvnei6YsfQ8yv45I0n8H2+=0w5|~;A4`O_!VxJg`Vd#H&NyD z74t5Xvj_@kFeA`h8riZ@ZL{^pfEV8EFT8aLCk_{vsT;^0La;7K_`d~cXMZdh6eb{* zV_pGX9i6x!+)V1TY&A`1FolOxT>ko~FstiX?dZ@DCIAMRgAO-ZN_q(Y6`<)uCfgh? 
z47dSf<1+8MX7v>O{eX?S&rQFE8o-E${v`rtM%dYmE-;XxNzqURg6Ri`Qqpw=8Cgk< zZ$sN-B>FA@b}!}%Z1Rl-zOm%3yI&P9Ro9fN&TjdLtLZUux!iCUvrP~wg<4ACda#Do zaMQv_+(1%YLDE|CRwJ7*uo>A3Je3Aaq@(&Qhey8I$>SiYj&4@%E>!Jazfr6jTt0KZ zs&%ufuTa&uvAbBcfBDSAK+W=mnM>kfG};pu6D=QjALJ1Z>>jDOyoVfvWs_(a>>>ts zJ096_{V-^fn|E$s>I0>nci1L>uWW~zg++bucxWShhaTdNM+c40(Ce z#Wm{6X)jD`*(^Bv4iHpsg(!`qzZ3|4VhTiis`7clPioW{F$DWRL z`8(-vr9bd|wEHA1YzUuREo&p2EwMsN>|=fq+!k+e)6?~lr)y(ub71tNfzkWbjrpc4 z#p;*y-j_?Bz^14DBTxIfzS(>7qu!IOajIDTQr`O#1Uh?8ad}TWTh$5Z*M4vS3!nC*{xC#XCifc#VD~?EIvU-}7eQ&kmbH_` z063F-**{|Y?I2Bm8Wc4-V*4(oqcipC-~!wp#&ShFmq zGa}h}8Pes+#9$>I?q0)JR>2L8zKwW*;ijRb=w;E{2yiE}Jc!Zq8J)50yqP^=p&u?p zBl=k~kQkHpni*ta%w_EU3{`moQ28J}-l;2)l|d6=bblUkMfO=U^D^crV6DTyv?$sM zqnUloq{M6wo4#_&hsAEgNaQxZFx=+bK-wMx*w3X2PFDS(*H}+^nkSgJ2~~K!^t^5 z^#b^BFX5i!9+2lAkgf;h_%D2s)h`x(?aS`R?p@se)m!)8y!+;30>!N{$O?N)oc~kD ztb=RVA_&2ab_5W3w|G{2#nsK#=35VJ5quw>c!ck*NiN7;;nu&n*>kwibNCTKY^$B; O2J*t8#{}MN8U7#PlJl(q literal 0 HcmV?d00001 diff --git a/entrypoints/openai/__pycache__/orca_metrics.cpython-312.pyc b/entrypoints/openai/__pycache__/orca_metrics.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3094023994c6489b43145523bb7081393b9832d GIT binary patch literal 4734 zcmeHLO>7j|5$>Mp?)kM1_Q2XkW*^HMdkmhzHpZ;^3&B9ZfHzneIvV z8-qt<$&n(3C`b_H0MTX@OX1*%9E^mrjua_Uw6|PlFe|JMQ9#)oavRoMHmB5k-R&Q6 zL=HJcdD8UztFEf|s_Lui`BP0zn1hxTexiN4ndAOOCteG>gXIMn+~ZVkgj0Fdm*q!z z=>1t=&OhSk>6(xga^i@{#$r~=1x5n7;7E|?d>j^W=ppZL8VRcsj;H|~#WgsBYtz0a z*q}c(i2bkT_yO*Y|Bj%*T=89N=eX|y&9r}7AR+IqlAkcQp>vOe z*|26`AW@HYW#3=&mxQ1BZg&Wr0^Fwk8b2+*<22%RkZ)d8rHML^-)g_AX@+Xr`5$ZA z_zq2jYM|u*1e~%Ye8PPy{M2_l>FMmb@-#SSNrGKp30(0ID5WRfah3=>%QiJiAkdsP zQSXgb`;zF%Sj&BlMYpWddLy-_owf~WIz$deRvs%_TEi-G+?cVGbrs2~Y9Xay5hI7(mk===rk;~0 z7&QwzPpHEMsn}9XEl(q~AB_`|xB8Pw)ljTN#xOEjoKTD$ofmRgCrLHgb-Zh8>|AbO z@XYD&WX-Xmvtv^?#tR>57p@rlOf7q%QyK2vJ2Xs&P7Yg1tZL*ak;e{`#NJg6@}>d$ zd9pGZ883`6F^aCFlQ&f>IhHlXk~vw^*-Vv*oHR^DPEPi)9^zhBQx0Av7`HIv{;_OA 
zWD@|5em3v)?jz+Mw%sCTydMq2b-F7?nq97QDX6_|_{8Pmwmqor%*CPeZE@$?I@jMN zYCo%4!;U1I*eaN+Iz?tqK%9Czd=mwn4)q(M_m|E0mU{TFPieWl%rvj=W z)hLW*F`AUK1q`sqXfH5qS~<@UGnAaY>LMYN4?0}C){l-MRbxmin^TTt_f*&M6p|g$ z|IWv#N{(EhWH++LR}csQ{hZ@4t=!AYn>elAMD%59BkHM+;B^2C%g#LlP74IzB(^YN z8G7;9cV$xtzWr#l)f)8-qFG2c2(k)!nhn6)UH51DypbEun>1aT5qK1Rdr+}SSG*PP+kJw_?xPeVuaoU&Fv)WiPKSjn3 zJvo`p=8_P{<`m1jR?^60UDlkSapWXSixR7u@iLS2SN&3hG&5MO zUn_8S>ytp+Qp=8CUitZzR}#1T0C29Uec91hiS4>OIJ@O}L(Bc^zv+3}kbHb*q3}6= zy6-G-ZrCxqZMJ6a+jE7v%#%&~mpb}DLQU|_3;r?EJK=yIACO2Q0(dk3L% zS%WIWD?wMP42!?r4SAO8{YE1--a<+6c#@T>0c2B$d82uBX`eUZx!Yp`)L0j}xFFYn zYGLZp$Bk?jD}<&t07)nkz1)^IA<=yd4-6n$uo(O}C7gvUSY`M-j&qEQiykym-E}m& z4kboWmSQ##pwYLzzCAwb0`c3q<-DQe4MCbRyvj1by7cf?c7QzLoXN=z>>y5F@KON1 z2fN|=WJ-~hahxhxat5dJ*i`J8UdRD7d>=9eu~Ii=jnKN-S8Svq-oL>c#U{41;cFq2 z)~2=}V8a&Z?U=MiGHDrWOHgtHc$y7#B}O03@Q8~Jd^JP%b?uNMhJzQbo$P=&keq=X zVsJwFTa-0D95P#AWbdGL`#Q6vsFhQz+|4-9ora%Pga^RSRxYu3=3G_cTKkt;555xp zA4p|k`|BWAw{x-f!}EyWVgV%?Qk_x%3P4n8`#7(03^1g}K5R3hlX$h`D> z@r%|YOVOrhQB;Yd2RmoAUnL&5E=Yff{w})MeWJYM z7ErW+d?SJikiRl$^qodnjfOSVf#qfH|5uU_yzW=)29+eU@BdOccRuc8QOPO;7_SsH z7X7~ULVM-o+btvZz@7>}>wS34aIX*-ZCaU!Th@Mh=yQLuLoi z#Ov)qDg|F4sgzSo)wzyL|5HfFV~R=R%8pQYOE@K(Nn_I9%2p=35-tuUAhzHS1qN#l z;003*bbv9qNrfqgGbfR z^ZYBx#|z6rj*tF@+xd5H=U=%aFGI1rmls3ZXQZmMpRZqvy;t?qzJ1T!rTLx(alXIO zd8E>Ql&!uHqqi>J-FfSBMcjHnRuNm4MWC&kd?Vj9w{KpYR~CBbCn|f7Rd#=KnS;gZ g{uqCFwx`NLH`fRJ{PPDYyALgp%I@Q@=^|77Ux!&H_5c6? 
literal 0 HcmV?d00001 diff --git a/entrypoints/openai/__pycache__/protocol.cpython-312.pyc b/entrypoints/openai/__pycache__/protocol.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be009e63c2936b83ae5c1240d972270add4f04c6 GIT binary patch literal 108493 zcmeFa33yytbthh{w$f6PO8ZvY7fa<$-gjHxZP{(Bw}NiDtoLloR!Qbn$%|T|-E;$T zr=ew*SY}D9LlbN;!2}WvG@&6tnq)|-ED==~p2Q@bg#43#M#eoE;>=9ucg}rVl}ha< z6Xwf&-#lBl?mPFqyD#USd(OFkoRgDf!873awf=_hTP=S@FWQr?dhWI4SS*(<4_hRQ zWF4^h#jn+G#k+057PR~AR)X0F96_hw8BFu11=IcM!3=*!Fw>tI%<^XiU49qiSO>EG zIUG-}KNr7_fxMvG?+)hs^Euo(P!KHid)S{gP!ufom#{y5pfp(KFAJ9Y%YzmEieRO` zGFauW3Re58gEjt|px5sW*7|FMb^f|wy}v%#;BN>v`Wu5y{-$8Fzd6|AZwa>gTZ3)> zw%{WFqF}qfo#V|wyd8cY`!nHR>|etEEcloDm$Bb9usqo5U%~$DftA5k{?+Wy8CVls z>tDd|HRThoNjWO!+CszrgtGbo}+e-(bRjhGW?X=q3~NeGJ_U=oS<7S%z)}bejqK97DSR-EM-u zpP@Sd-D#?y56B;s$K?-D&dN{%|3zt+d{MH}yR=&#SABbcl}6#>{aBi`_c@3EdAU|@ zB23xKCHpll**?8w`;k_+DF-ib4h{f%&;h8_mA#{_+mp$`N4hzU9& zzbH>|4lZ#H9@TU37%(4~U)1~QNBDa`pzo3=bm&JhLR`|j^`8GR#(Dx+Ps*p|4EbYJ z@-H~x;PHQ)LwgZ=Bt_`2%O8rCjmWWf_{pj1Aqoi@qU`2gMbd1pufS;(}2Fm1bvC23ZNkqbc&&2 zK!;7xml=8n(6c7!Z!+{8pyy4{&oFcZ(5Fn$R~R}9=$HxmS%$tB(5Fq%-(u(mK%X%| zKgZDb0s5>7dX=Hi0s4Lu^cq7y0O$uz(9bh;9MFp<=oc9JJfOd3f?k)uAYbPe{vx;V z3wjHG2$&x>Vg5FM|019hCg_(KdI`{vn4mWp`cXhXW`cg1p&tkI*G(<_I}E)H=%fkv zD-68?=qF6LUuEbg0sWK-_jeikX+VF&1pPgRz69u${CoNs`5HrC2J|;gxHlR489-k# zLBGz>&jR{eCg?X9`Z++anxNlg=ruq;Z-Tzc&@TXb-30v>L%#^bKJG*V{ma|*MZUwpi*STMRpZc}@A^8%AOtO20 z!+*%(KhXRAA0of6n@apgT#6q8`bQ?{A2ali0sT)V=zn79j{yA>6ZA(6{Zl|ACg`8Y zKa&51bMU8}gQ%W^Tfm$)VMh4-89;w*f<_toXMoO{ptl$r1N03Obef@YK!0KyO*0I= z4d{P1L4Pb~N`KC4=$~=E{zA{!9bo>{6zeR1|CfONl?fVS=$nB4wF&wLL;nWQzcoSQ z4E-6Pe`kXJgrWZ*pnq?I-e%}O0Q&!%p#Pbn{|M-RF+u;Fp>u%VH9`M^p+5)oo(Xz~ zp>F~DUro@TGW4GS{ck4dUo!L;fc~=y`d19S59q&`>hVp6{&ztChY9-E4E>*g{;LW4 zH|MSO7I|xfMag0sI3M))oHC2FGJ@M8-~er1$LEzjbaUk4+fO*7D>_)u1pc! 
zrI(;o3*9k1*hjV3Gc+_nbrLVuLUs?H9uDsg%RxjO2*wLE_<`ZD8W7jhFkq$}RLuXv z$Uv?Zh$;x44#$hM_d{|ZG&G1(Qqu97dEg%TT-elT)u|Emg6iprUKy{~@E?_r9+m@2 z-wDwZ5*n=`B~o&8rH)EOsX2GEvg0@R*>x(=dEk&&K2=v1pcp0z!ARF=AEaK&@C5B3d7 z2-`8F1Os7&4NCoi!9m1G(P`;ZT_eF`{l@_{x(t6~3bV!YcOyPggJ;zN9r7o?0w z;<<+dsBg5Oic)eotPJ;shZR{8SRoOB1YAmBP#OyM4i6*a=v|5&8X7nw_YNIB*?aVS z?|X&=1A~E}+((nR_NY7w%e@nTT(-y-;(qyU4`UxL+a>FDo4%uSNOr$dPLtE+4Bo}s ziGCm-)@+AZr{yfkc|A@00{g-A=PX}PQ__C@+2`%88S%UWr{%#dyN&MPM1>EXMJb%< zi%Ot7?h=hjIS$1gXO+Nd1Ujfe`?O3`)M@ym_j{9ApFWs=85tk7jas!#jM>8~6USm1 zwSq&UKG>1)6S9rk4_ihpC)KT~7EZskG{b#GqHc)$+m` zhvg!2+N+m2oUg?>o_@?K*`CW7wRRZ|q}G$t94^rCP8MlC<5#keT2Gc3@0w35>BaOB zuwJ$LwOXitTOHkTG%yzZKD{;Y9kttX{M~DU^Kt85kQkTI7EhpV$E>5)i?%b?U$aX?ubBOxev$^@!kirnyUJi%4`Ha<`!LHpcrH&rff;51Gi@4j zhN&-}Wen1|ll{sgD41v$8X{itY*yZlm$)ZY?Ts|NlR6&2_n~jVL2cemvlNs~dOlh; zo_0I0Y_j1M$4p*htgvi+f2_Fd^3bKBaaYV$9C1~@Szh(Y#__$e#+KQ}7172OvyJPb zjqCobaYNKyJCQZHJ?5^soOda2Dr?5Q=yqlO)b^{ZZgfpou89<`nOG4mTod!uTt0B= zz|{61d)nsGElnHeGAuQ1a zO`eLnTizC?l>M3` z!xQR)^M1=u3#+Df&J?y^*f*DE@h!WsccOH%ezNaM%T&*m#nJq>X~)u^Iz5w3GtSzW z(|yCg%(} zqj2JO4FBYcIZvVFydQs|8ur9;+|TZdW#v4(J62kGVf*;<3D3o~(~e4jp4%K@$G-B#db^|}D z-^4T44+MforNG9K(xf!8Hi(c*_$)-L{sqg`ldo2Mv+8DEw0#%15(t|-Nj?_9jM{q) zr2UX`K5pxm;%-j0my#6)h^K*;ll#I61)-IM3qs|-Y%z)vCMZ8Abs9i}V1-t_8qIOo zqC5uBHJkEY{9r`&2j!$`bQS22IYFXtY44oa6CG6&42sz-dSoM7M_(G%G54#;4l8S_S~JGqA^^=t0%M(3WQ zBwr?n^YR%^e4rl`$14PUmYk&C{3(Kw^Ai3-{cx}lWw@T*87n9gUASys7xt82*fria z?I@=%+BIH3;kXDgqas#TMPE!bT--D5sETo2VvAJD~;b0MK)9BOG&2m8^dBPOKkHad5v$zi!uiX9l*oncQ!jg8n$~B&vhk zX&=?WXAvKF7=@eWBwo8)vyy9==L@5nxrX^m!snVwuENhz06Cw>pGJHXl#1FbeWSAW zM#D_yDpXX%`1y;AXB?|U!Rd?C1+PQM{DO<~XRY|>tsp}@f;rbSbV?q4m$$W>2o1O3TEwT^4J={9iN?n(VnWLTkX94G~9Kq+;=nhG@mg8OH`uw-iz8x@|{@xo(rD z?=;D#PQqNfAXXj3u4L|%My~2CzASZT3WixXvE>F)dBfC=uuLBtdOJe=tH=eHHn|#p zhXTli>6@N=gK*xaoc6?W@?vF^ z99TxEW8mS%$}VRfv(c2GV~fShMue^oW8xPzl@Tx91J?FojGbUWt1OiP>!6(Qnl~X-8U!F|XO;=_(b-o9MXfBx30mf$=m4wDv_XE;v}0eayc&~K*F@*Vy)%w#(IOOU>J~YOkkl=L+Q+0U*{?hF+9!UaUCv0V 
zecps{j}>FuhA~~ZPd*;#JAYVZj|K*M0>|UdqcW(=ZsqGpQ=zUDFG4*s|C9!J;J8-3 zc6m^0&0M%{xmv$Nu^82wRGCzV3OTRiFVq1CmFdcTb`PqrbIog>*B_f%{s;!?WapH9 z>hM&?m5o=sq7_S~9goB^v!4??0scOr+U$sjo_VVnv1h4EV$$PvjCM!rI}cgBr>pP0 zi{@3VHbo^R_0ZhJ3R0dz8u2U*2$DMk9g(=(q3+g<q$c4<<^as0I9!X-8`;w_sxV#r_#biRfAMS?VfkK}cfHnpZ_C zQHoW-8Lvp_D~Z*hd>?U%vZo#6bCB-&>z`3t9%e~p{2|3a&Y$5gR0t=jjM;h5?G;5L ztavq!PTwX+sf)Y}U%p+DC1%9S?amMxYGBGiM2yleK)l?@PvCUL?9-$7<&EEMl zxlsd*N`pRmj`K1R9*?N)QB|IxjnQ1_r1F9VXaGHV}3>sB@gd9N@V))B~!gf3R<0SdzsOwj*8) z{#&?zaCm4qq^@e)oN`XY7F_aMYv^rU4xX?erWEcJvcW(XVlMS*ru+$Ccn?YZLwVMjKeERMwqEfR*I0+EzTUp zio{ZMqp>8dNkv0mo5x)=NQk6N-$q&rIsbycP&yoLBaQ7zA*nN&T>YkQX{A%d#Ce&k zzb-|_jicp*)a{fw5?rVaIIm|k1koBv#rX>gCFfu9r?o*YuQ8^Um~pg;HXzK@#XE|S z)NR1L4~(%gVCaOQ*r3>GGlGgpPac_rY41%tYt@9uTAke^1)?kCP*d&8-sTax+1nrT;{3|X ztXo;~9zBnvPQW15aYj~r-ovu&?dSj#7^9#=|FlExBVNHg_M+Pl2kE2~J>MI|!6=SQ z;f2HtmM`B(S+loqXm~*4l#bFtDho#7^mOC{bozQk#NVsOA39=0sb(-VeERf|0^yxs zNft3opF}kacJp7t7q=-gc$0x}--&n@1Yd?_J}FZ6Qdp)gDaV6R;ekxUH=Y(ajpJ+y zE7admbbm{*{CTB|XX+(YI5BrJO57^*bgv%YiqqR7l0Hht3@P`J2$Y;~`UHm$XEO>e z9gkI3M>5KOQdU2{`*wrx>iJjeXBswMI{(`4*{(;TU5||Kx?NawdG)2$lRKv#n<@0& zF80QXTdtN~&6=~=mbb$xYQJZ9loZTa9Qg%vS_7O*^Uj-jt#duw!Kzxmq7Ro2b($7v)K*N z?1o6=iXUgMoNKg{H_SCz+_jMm?;Dwgw{uJ9Y_^;Rq@0J0I5M;5vMkx|U%XXo@l^fX zVn-(Qd`6b-eSOR8%Ojox_w2Tkf;%Opb52`+*}u%$fp$Mc%k+2LuC2>#-(8lut-$%+ zbp_jUoPUtxgx@Uq1ra+#s4ii(q2-pIdkSVe^JxcMJAX%PWAlG2JU)}4ilVtn;-mi9VSlpZ2mmit-}Pj-iC8JBR;{c&&Fw; zL3)d`KAXSLgeOk?Y<`biq{k=D{cQeXxkQJ#*CYQ@zP}XQ{{kutC^&KYm+dbzrNL)_ zsHbuhtcbt0`77i~J-%WURwY;Kuo5+m8rh53OX&@aiCQ`RdYR56u0vhctLc|Beuclm zR6mt!yp5)`su))7Z$jQ{=vzHSuZrDls*zfPQMytc1a|AMH)tz_zXdJnlp6JLo78l@ zSr2WM+Yr7efyQu3Ejo6E)M~Uf%I04R#la2IGSt^{xlPo9)G056O4N$yoc`tTJC50;l~9lB zL=CKvR$X7M)w~4NFzw{hzj9vaT2tt%d7!90ahp+28TwfQ~?KHKMCpf@+ZcJ_batv5pyr?;C+~(F})uRjZ|P?D@1I4 z@?akXp0IlL(q@(pi_+CyE+m2|>UcH@5D{k#D`qGTNyu}(TBOYVkLMCdmyHA)2)jU* z@KipHw(p4-(#~*T0EbT|Bo9QG&h*pH86{%zZ19M2U?pUMGxZFDf*lSIVP^u3TZ{5d zL>PA*8<5Y%ogoNT$?=TSO8=13A3h(?CdvRuzrEpp$RRmHI4qZ$If^}e+y&t#oO(el 
zU6GYPp;%lZGc2CU$2D;$K|+G>4@UoKTG6<^DCBhS>3D|J9|{~DkR=?2b9UUwQ#=cm zGuX!jMce@qTqR+y#tbd!(n@$PgB~=fJgv71VS;Nm96?iE^rn&{5C`mK$=yyr84S=e zOH}0r%T(ia*B5f8o{oCOG0^=l6D4CX;^7@2Sz~W_NKjO2kk7mO7)z6pgsZH0p~WUB zFvyB|`@`+tfY&5O>OI>(FyIBrL;`WBS6T9Tdx$>kmxyvhJs^P-7!Aiyc+p)rVvt@C zUV|u_M(TMXG=}UDF;AjwT2Jtyem{t^Fk?ZX0oAu!+-n;FXfX$UikV#wZI zJG5Ab6pDblir`EX27?9x*DJoh{{^ZPt?zh;M9I~f@U}qY4mA*z1A`$i2HyaBl$Ryk zAQZ69tMcO4dY82Nyx>Hc;(;`tg0cn%y-P$5DS`fwjBUOtc0+aweipawqMA3mWX z4V~$SUVs+^84+^TVfYbUjGSRtg_(=;J>)$OA>$BLBo4jMmV@3P$uW0Ow+2alre7Hv zByu`LjV*gA9dGCaQ_w^`vy^c=eIAA^3er1}|jw?+0H_qBUFmwWq8U=l%`3L=7}_{z=>bR<+CssU2iqUVq} zI~8Ktik04@{b4jtH#QIn^!b^6`}S*lgGZM5;#pjAy;P8qagsPkPY!v{o`8fLN{l@N zZ9x#F+DX+}HRKJTl28z%`K9mZP0^>F-j zxMK))M94mGycB|`D84Z@YU##H^l4Mf`&t0e^LXYqwb~RHI*O79Ctje*4v86>&T8W& z8cdU?>80s87*c+Q@B3)F(w1+6T-s98`E>Gg0YblH_Q7I9q$o8W1h3WY@tA$o zrfo=RbDnCu4Pj=-g&J&BHB#4fFmFA($?X?m!nSO}78{~d|N?ZN&*jxlb>o;+UK+qHeimWTKC^d8=_ z|KPse-8*{^ZaK7N|6!gVm=IF_2;YhGtaNdr+^g)Os0uN8?k0^Mn((2TL;|SFP72AP zX}x!d&qbiPR}P6e?PNsFrk?p+nv0-5Xi;|1m(H&3`+By-vv%z6-mxXESwO{P%0zB>lLhsg76S#bDQ<6+5TPc8%}3Q@7?; z-KLv|A{iuHH(RkNTCpgSy=Z)I#MK!qUpia9B3iy8=B}Cy&(^Gu)~vtzM6_lv7y=c! z<9oqwC@#P3zvPc(*NktExLRVKMYEp8QP1L-8x`ClMpkT}F55Al3s7}kB%?g$%6oCm#WfL^cdF-ekH7r5kY|rHZi!Xbz0`lDf415e zt@e!{xZ|#wbvH!a4Oeq+xmUzI^|PKuQO}}?XVsgH>tkgtvt>)7WlP?&+nNfZ?mEO? 
zP&=F77|n0IT6HUb*mpTe+4E z-PYOlk44u%mJq-!-~^Qr2ilrfqtoy#xEfi{TOYI%g()U}v>E-JRY5KH2hga zk5!h!KeA{ZWxw#C`W{8F91&cw+(F~7zqmf)s)-erzL{M-5xzWnY4nwz>Fl)LD*DO2M)V`Pj;B5?q`>+SRrWGO zh04w=eDU1HbCcPuQ&Ix$5fzX=<(bKAx?NBisX8!Sa4?d65Fz6((T7#42Qi5L%Y9e~ z9xhOEdi6eIt~OoYPOUb3w_-)!MpRs4aUZCiD&k|x+4=j4U(PKufg0!1Ia22N( zh90JqWKvR9T|-q>Q&nv|NR%!eQG1&Wy1yBFoKTqd9syAMgB_^HFerAhQ^I}?)x(=7 z-!+HmX$C)@`rQ1QEyd2Q?-bZI_~o5Nue#ii=vw1?k6Q@!CsfzeON_eSH%;qN$8GSp zJEyXv6&=?@Gw$U#J73GbwQm2L1(h-PL97;qx!{mPu?TatCsh|D@|L^cpCg3_k8lv^ zwr9-5z0GKaa2n#`=a@|!={Q5fm^QVYEvysN8crx1Ct$R6^E0Zq)|d^ab=Vx>v=^7Nog$>Wyz=`)@*op=F5C&Yo#3XM+v#4gpjoH1XL z#$>8;m~-^F<%O5Dlv>jQPoa^~>A_df1QnQz=%o!d*m(g3J8|stvBA{_za|mT176rR zFoM(_ZOE6U5n9aU!A=t7XUKc-ycB>8Odl8!gEUgWW5aeAA*5f7%{&tEk_Hy@60oO6 zV&EwXT}C5HEm6cWFu?2w!LGnAn{tZnHV*EBO!Q0XH;zqfY`{-?g zw$4dtfD!)=sK|?=VEa9)%xh+a&pec#*gDvPFf{I9{ZWz^ z;Etjm3?`JC=y9l{9v^AadYe8Ero{VengstVVgyZIKos+1mmZruK3&u_o(09pZ0K=T zy;6R4&#Q&6$~UuTYPZJ(lowKs=uwe6p?W#lhq`nWCXX`5NJExKs$YZ9Cvd-fq4`QBM~ zQ`Frw75=fieXh(>R5Mp;DQg*bj6ZrYhZnzJyw%Ewg0`H>J0%q#apCkYr}AIkDzKC` z{~T(OOW*XAU*30V-<-`_w)QQ%y{H(4DQcMYP;ys0|ID*wE(cimL-i>6w_ICWE#K)Z z-@4ZJ-B#DuRrc?$w354a@wRs7A1t!MW#z2awaGGPOjCUVQ3$C^CPRq=74k1E*z*>> zk^m3y9fdUd|Df0ZNe&S`%)wVkC`PfMF__{|D(DsR1{Q^gP=yY6nRpZXJQrS}@e3<0 zVX?pLAf#+UT0*{+utng*6M3tw_U0(Gw3l9_{scV@>jvSZnWb2exG;=A^AEV`EToV>gCOdJk- zEUvt}MOm47_w3ou9DI_UH{ppCcSQ4jQI~Jp;fvMOlf3?<;}R5G>LC(ilG$&G)iy$J zYa0arpy$#^su8=!PfU6)4nfZ)Nh_;)5i5I5Ij&SpI~K+0723IYMYM~Aq>V0N6RT^w zuzURcWM?$5hSb0I9YPkGBjvlI1-qvmhe*3-`*%595PQz&G68r~?ax+=|-F6W3Gqz$_~R^v5F>LrQr?c*yT5x#!f zQ5{>c2{oG@DQ%0o7hRR2?q$=CO^G@`kdZE_e4}*LoWqv0%lcM&dPy2SvEQ?#JJasE zEz4FxA~;sG9qQD}(zq%n`=*vpp14wdwIN#OyJxWj4dbgk=FX=mx+sdGG*U0pqNpaJ z;yH&iXZ>60S=Ai-gmfwEo+S&PmRgz?#oR^o=@NX|gfuEACH{1-)Uw;U-x_gLO!h?T zR^IeP>o!EIHooSGR&9suJ4SVBMJ(TQ*J=0CS95O5lFqy7K4;dQ%9=SRe$lh)nyxxt zu9zbz@Uu$qWs>iRHQkwYwTn{8x?5mLE1GPXan|3pIh`I-s%}7+b9(OP8Uc1a0KU6f zMnJkA0G~OkW;Nih4L&gfsNeVDPk%JZ?cM}@W;p{A+9R$WPKa4bQb>>Lr`AHMO{myt 
z@&hbZ3p~>Qg%(xPDYTF{NKYYMkO}F6JUwQXB*6JBWD6wM^=yq6gk*t%qe&4Im|#MR zAPI*ClNKzMiYC;wv-Ct>v8CX~r!RD_q<(?%HP{&>qtV}IYEmEtaHq%qmq!a4p z2C0Huvq7q)UzP%Ckg9ZlV?qwA)fk)bT`FmXT7NUk4%9JTI;0W|>5zK$+g3SU&qsp_ zYctgn%P>gk{zV*9lZw%9!e~}uW=RB=WkC6Trt-HkUK`{ONH$^-y`lD(pf*?oRL=!V zF(AdIoMH!mmF|Zm0=^P544MAravt7<#6qpVlf#xUX03k(zb)lAn}4OeO3vdxv+Vlv zFJLX9C#lbOs<^97rCOoF*38S_N(!MEW&8D2T8#d+S|38bR3o_<8|l)T>udF~v5s@5 zn#4tDt+d`0x}HNfXtCqlji%5I9JKEHhI6F)@TR7IONwvAtguTmzy)7m7ZWHz%6Ly!pUGEWlO*s2ZI6F`#7mtyh z3FBhF3BTKfzsppn116kAd*1=nmQ(80!)?+L)7J;(hx8H$ zOgKkPIEPI0N1q8tGT|IH;m9VOVC>L(PltYzn+1De%sv zzzd|nJD&pYXbQZM6nK4*!g?xRl&BbRNGPF_{1NX2LfyQ5`CFQM>51nbW48TZ-+5Th zfGR44hO)r0hgG(tFhI3KX~9Iz;y=7A8R;^Piz>PqUlT*!2SM!T)J1H4U{#g^h3R1N(&J>+c}6uj(^cZsJ#AEbXc z7|~#p=0qpaO!)HQbvT>nh>5k z1(D_cLC8=QD6s1tW>Fr!WVIAQE-Vuq51-J1eC%1SCCR6(C?J>(>X9e75p3zIR~~|~ z4~kqUqsI$Fr}|I#vY3pnsvWNioeqFqfsBRE$})6kNs2}f{=0w0gQJ;uQAUbYc`npE zi@|8|632s7!L=hE)v5>>AjawM2dMQjj|QaPvqOp$8VNvPhYl%(tj?f(7BWTLH+4}R zG&9L%)8U3DtpgKHVI2<)Xt2%BgOA0sGGGi57Y!$z5Vsi`kbO!yH?G`wV(9*d6H-$J zvxoyHa;~Qd6JLV$L{UMo1`;mn;ejT@1zs4*x2Vi=K6`=qP68?o0xmciSS02gY((ng zf#jNe@ggMDYh**M&!6J{2ohvsX?~0=h?j}#;pjE7GI7F7gDpfwraU6EtU+$njj(>k z-ytF}AHlt+x&)68l}Tb}uwUlge&DdksVcQ3q^>w2l6iyzpDZ$?wGG7CbTKYy^g&(; z+%_Bzp!qO{)H(vEa_DTxdwyt`_>pyJN62ap)Fp@oaZ1f0U9gQ=cKu<92eEA~mUp6a zEwyyqs$^Y<#EXukth^&WUF%SWl{wOK>r>4&u}f zou(!{OVy1E75#)2FXr`=9js459hKS|;#~9kk~a*SnIW)r!MjDIh>XRvsP~G&r{#|u zm^y+kNJbKesMKSyoCg^&R0cvRepPg-)fX?I+DRP50TLeA;w57wl%8N5V|4L6%1tN@ z^|3MtMTQb;T}{ZR^^?4rI=n;=!O-jHdH_?Kx+7)((3YLv76^lZaSo1aM_~B4kdDyA zATin?cE-IMv1vzghA5@b6`^RT#;A@p1r^TCi2%6fgF{|SxnMN=ypLj(%UD^E2#Hf7 z(=tw#ATcNkVydzL7nsH&jIWY~BX1pM*#WtZ<;&_w{aBO4Pdx)N9Ftpt~=Stf~X_L7|mypilq7=S-dlyzRu2etznX92$COV89&SrJYHyM zqPc0%@Sz*J9&!3HxRT};p7lfr;rT_cL39CFapG|ZDvWa14%mR)twjT_I(+k9!i5RE0^EGW0xVSl(|j=Yf^(xGMES21ykZ z2(d)V{YyuV!z3$~5Rwt%BW;6BQ0Ez~mnw2_=nUoxOuJ{6_?9m7Eg|v5GfRC-d`tLD z14oHCAEBWV8p1RcLiag{qXIN0i^(1lNuVxpfUX&e)x?L8^k66~2P9wIWy;b0k81f- z?3r1;?eWi6M-v8VWzmiMx?Xj;B_!^daLLU9_T 
z3+7QQbD}HKrj9Ks5=k@U#VIe&vsvy)lPbhIn1D`;xOdOt1KsL}OnU;PZL+-0nmTr~ z*b=HD{O2`tC^3_~#Ddnr(kL?MWt**wRr8NRZeEkU^fuE1*UW2tv(Ri%69M66kOHTN zLYndfN!^1C&=yBQ5Ba=A&dKLUZc<+Beagu3T5ogkylRTpxBvXUM5XiQb~OWQz3*Du z?p^i--ql=CEWCqgMid~GhUMn5F`x2Z&{X%QKp!#fMrsVAx|+3J6`~aTI6m{&$v5B; zQH(Z1tqC%OwgaF-yv^G7Ywc5-Z>-taHL#A7pq+>kC3${oENm1ob+D2Fi7r6;2kS64 z1C%9Syj;lpt6RU`K*-qR{gk)_<^l-P7gQ!RB?L{WZdye_yt}*5L1^<9B1(tISY2fX zWEa%3yt@afVAwKKk09EURVZ|`Vu~ujX>HxM)fIGG-62{>;8(pkMuAtiGiwbHdWOj${}f69mnMu$C>BC+FekF zi5HwY(<^9DXwUI?*k_x_E_GTqN6B037IkOPs`et*?m+*C@hlP#f#OTMkTRs5E+oRB zxq{Y#pbKbUP|aCnVzKuejW=kcouX2>$l4G3KA4t0(AF&~X~imLzVm zkOb>9BdS z+24-*D2%RSqCH5cR>Z`0uYVY#DlrSBK=RwfoYMNwqST8}O5}31VQ2)=LMjDk(-oJG zT{;#iZW-@-amVvJt_H@fzqB~`Td)G|jg^#=95v#5`qI-RS4~3YzG#IHXZo|vJEF}y z-pWZQvFUV8inmlOWhUJ=mA2?}7;T2DIMrTvfe&%fR;EDBtM(xKN$Q_T59`~QV^qUa z=-Z^|XR5wU>X;YM{z>YaPUM-!PSS&z)1b+dI%XOvsq$?g$biJ6}XYYr=`xBri`N9p?SoUc41?vl$ zqps2HkJ?_yaahK(#sCd3(qb6R(vDQ5^iy4!WZYpAXyQj|gxF{niH+hi4ZR;Br#=nC z_#Mj`&3Pt{uvGP(XT+)HHzCHHXTCL>lS=eJ%3u-K0%?Jk?4_>5&)}yPh#`w(Xh&$L zhHLze<&NfFul7tvPgMkH7b#bbPMsvpUM)M$Gyhi(CG%vhX^M~my zj2}b5I3#kDcXE>!X8cN-5DCFTK+l*vyh`{imXn(`pYc2D)-V?+L!pL?@U5EP_~kK` z0zPQ%w~XcE8}yarN6AYKAI%qLsKX?+#?Q%Jn$P&9-X`TDhCK#M^L^ARQQZ5qcjFf! 
zZiI9j5Y2}W%I87t-S|Za4EVscjXL4Nj1OGLXgXZyXa?K@e0Rve*L(;mL`aYJZu}y| zgOEoIh~`5`5kel*-i_adW-ZeC!&k+lnQ%++t>5@Y^C6^k)C#u@;qTU<#xFw35%Qz~ z(R>K0K*$m8-S|aFrD0zmIFb>j`5?@$(r3V~0?sil#P~%>H9}4p5Y2~>8ib@+pfT$V zXjmr)HJ|Z2m}d$namJ(vF?$hnin#|NwFps?KENwN>JSo68B&jsGbuwFM)QSsFSXK# zArjvQQPDp8SoWCftfjT7dqL6}Ry3P1c*zI?c#ZKi!8yZSG=gImFG#_b_&%~APHail zvWK-VnMH()6EbkK72hnRTp{9+Ny6jgdz_rZqWT@IyTkjfzv)mk+4`S zDLo1iSQghMQdS`qMP;0v=gE12oC$KcDvkmLr@8W=LK6rd8MF8~YodzUp*j}&Xx}50 zAJP}%5{q7GICvUz-*7|9CkTxzjzwvg(Ca&`Lax1}in~XNCDmRQMY@9maP2&#iOET< z$JPQ9WltO)p&@#Td<$0Kn-t@_s9e@Fq6%Im2vtx#6FYg>_63VaxrRtWL>2M&Ai+^+ z%3$%~k(LE_1{yy|-9xm~wg7h1EZ7$0TI!o;>()f;);`_eINn<9RVxdBjyW z75?1$-#ia})Y?SM1kn1OC;d38e0n~+hoRbEpxuPKt}1NwB^HpZlYnn+$-tgvpjusK@Te09UE z!qu_08z;JiUhR)Pt6~u9FKUezwO)PVR?*s6-O^de2iI+w*c+>8ic~C%c$UG9E#El1 zd{1=w9^=C$`iBviDJ$9#YwVoph9+&KV#TX%vC^hDD>vM(Tm^N`vhuf_Y3qvTAnr>} zWidKl?c!PQny7cpYn>7AnrZK@iQTt1Z27kT>;Bg>r#BqZ5}EO=y|a4#xAVS{H`VjG z-rwwf{m^(86p7I!w_J617B8P&yd%1J$M~K|$)ad>`|Z3cq3-*0ORYN#R+nEq_WZFq zJN$P%rI+`8WZ#?`>SwDvZ&i2BW#S!T$=+piF7jnt${OZ! 
z$d_v=ES<|EpPPL7Ec_Zi{H$zo6Uz>K*ulURuRf-i??C6 zX63D#m2+$8eXW+ab@aBL_zi;}8JycdZySvY-bC-4ttis1lGZ5{eT$VKmfFm$f4;ZV zYAN1j4bgt|2iB~EYtw#^?S!kWp=tCmpb0v_!~&qddq?0rSVK^$TtJso*TYND{?(0S z2sPGgj&7xwvO#OX{gsEA4GKvPHa5jdWtsx8IQTVmfQetap$D=xM|=xtxR$6oQBcB6 zkc=AXNIj=7fc|Mz#y#5eF~29<{UQ`!Bc6sCSL0RbODC?KxN&aAx0y9yAD%9FB$E9I z^n_DsUh4I_5k-A)y@FL>duH8(>Q*Tlg`PESOhYhBq8j@!s|`D~Z4T<2FhQOwZpOs+ z2-ZC*SNur&4UyC`Cy~eG$1v)FyxEaA+{UKHXGhA(@nx7&rtlWdBYlJ1F4$`_wXYI- zjlKyNy{6UGtuU>`6a-P+ibDNo)e#}fqRVKJwNAnF51fI*9kcs2C2~^-y1#V(hlWzH zI6Z-226KstZQQZcCqx8&N67kzc6M>Zq)D9mBRV@6S<5*XCQUeR;`i-ra1>=Y8ZzQi zg<(jHO!V|thoO@vs)49QzBiJn7$#ogxk9s8%i2go>OP-{^;z_W(05Rs+||7eFP*z` zF4D4Qx@K)8d+qpvh-;^&@}4o1S3C9e&HP)EUZv)qUOt)dYgzhCXjM)X>N6;wn|i z-em~bqM_&^wQ39r8VKegm^fHPcnt<)syyIIJ#DWkWt_1l#G?-)b+}}8^CqQFO`Wqq z^|2g(_mBzm6{{u49WGc+H31lx=Sr4g?E=;^?E+SDNt~7SPOsS&+La3`ybRN?dNHvk zZ=wp$Xcx2!dix1{F|zaR`e}-YV7_-SjML)eYgWIHw!TOFKyl6LoW<_R`f1UU3G1BQ zlG8Ms-5Skqy;?h+-F~BMCVTZ8MN9tq9@WURRbDGH&zq#rpt_V-$f2W)`%gZO12Ekr z1qn$(%En6udFaSi93VsV3fjtacx7Z&llbwrG?NGpotg3ux0z;=X0gj{ZuJQj$ss5IbscRFpWdeY$w=qFr2)T56VGRn@GmY{=mOQ^3F%XNW?ap& zf?`TQ@`-HbydPZ@UYj>BlXM-INEqd-nXSaVJ^w z$)8yI{2DCJP`AxrbI)!k0||8Vsuncqd8zbD=~Ty!;_0&0u4yj<>2cwSoycZUlN>uw9QX2Bq7}h5fEiS{3gEpar3QTlCx`qck<{4%? 
z<{DheWM-}}A!g3eeA^F7g<&OJ%eS`TOM(@h1y-YOQ9-h8fCYua7RfbcONgKC7|+kS z@);=`$E0eBQ%z!0j+P!QOc-_Hm>*Xt-&6~^Y{xxs)MmD5{yt)^I#8>*Cd_=s(c2hP zutRcF`@q7xk)AV!MT=Zh-RMh!S}W$f6pT927W49=)hFy{Chr1q`oms`VZq0SA)UAz=g9WNr7>k@i9ze#sha9nG9| zntTl;TtVaM7S((_WinH6WY0D-4GD`4eCOC&?~$kK$QVW4T5p|ZaDd{}{SaEIqhtvI zVlO&;Z0yJmzG7meCPn@=v?EAh!~?-D*eIfGYh|kS^7R@cE4QD6;4_J|L&yX+=Z{cQ z0$qjr8k;q zTET?DIsHUcG`ogaP-T-{Q8Jr$wIkZN`sR*XjazwPQG;&jLD9ypJ9)*EQWT`DbamCs z!AR3OGj&Rou4iIDf+KAkqj?*@Hv;~^#1m0>{VP4-4(zh#Vok%@e(|cC4w!*ky*ILI zABffc)<(ERjj$=NS+n<8@{9PxsgYamrMErRli{hISH`YOGoIzQJtdQl$w#l`OnaKH z*52|gy3^E-t1c;-n5T5IWMV&fWfQKsEUa@x@3>s>P?OK+Dq#LCy)Y`IxJXR&r!+hT2-BCVU} z?AEfaRxE@SU^EGf6`t~`(r95T@Uci%ER0v%dbK#}T{4kFX>@$GaHeF%tA#h8m|nB@ z_4nLbb5KpM<7N%g+ims5eA^-&U6kIA#PndrqvTWjC%RC`$vx4M=BsJZlEpVROqXnq zHMUM>!SDd?VXta~u77Er$W&QpLWWbX+_Y-*%6$Z$2k*)k+_&oUeF7n#g4;1l<7b5C z(fz6x2yO6)F7qBCkv&pJN!0AXCKfd20{S2r_Fm$GEXV{ALusBC&O%I(1<+eS9N$}s zDoS92w9mLYU|+7~%Po<0hh~-?R>`d8?|@<=Vw>xoPO~0d@6Y@@+5=OzP!nt=C)p|* zTLvLhf2l-LMb*qV>$QyPW^-cQO#GXzb+t;#mRxw+-GKOoC6`Z=j}L_S3~N7?gHKI( zj%FRkPY?JdWbV;nvNTS2c8gvPUa_Z@(7(B0iT4k(KNg(1UYn)_>rm_~+P^5{Ti>Ib>Ia7kQmN%;E z$To`2>S=q+=`hy|JBCPpXGs|%mA>Y4t*J;bF22J4VxSMXz?7(6V z(>|FFGVPP*%tI7gtlMD(#xrp|QoSNtOm<+3H%vRaaNecfsZQ2ZTToLl?JDf_v||x7 z%36V7T-3gE{4lKI_{k~`TI9e)54A>OgC>@UZ2c5ZIuT_y+4`xZyWU}VrY6?hO1HjG z7GBy7`p1Kt-dkhM?GZ=y6s~$l(v{Q=rX7{B;u5l?Gwmor#c}S)v`?Ea;;5bKq>iz5 z2VFS0d#dl+iW^(6ua7jXidL?^*%__eH0{_y#)J+`9G>jFge*7lXShIqa@(cz=vXDx zvEHcg-tf#+tc1;>@Wi%@=cgSju{y(CESb}><}7|IJ*^-O)zEOyk_JmVI5cgc&IU_6 zMX)JHr+9pP}=hVNIo>7}dcPrEN;`b~WNTd{;Z8<%_kFyhE!*Yhj?(6Z33@@uIbq>qsfx(p_Z zFc_x7FT1pq8{0VCk+TWCq!Hns2?-ZypVB(U+uDdKwk%zV&tsmwXvn4b7~Kygub+JE zN-n0m(hkv{S(b;ak61Y!Yt9<f0VI-ySL29(C_{9rs)A=l+GG&zibeNjYpw#qtZt!jvmbw@ZbQH0O3Zvts2{b9VY& zy`F#XWR~4?;(e}R$O;Qoo{~8yc`)_VlL@M}xpaQbfJLggO!j7xIVu-T5%7AqB>|x_5dp7v*C!zCwkBfG``t%u2{>tqD0sc= zPC!tNUx|F)b>LO3rnH9kVOiCmOx1VqAHm2o8@j?pQ80BSS=qpcdO0XybX95wu~zV_ z`sj_Z#69!4abAPyU)yQj$Jd#-=1Gf@uXvQ4d|S8%OJh7?3`>KFtG29U3=7vP<*4RF 
zw9rg7X16J3ap{%|rm>9FvKcQ2mu_Xi)QB*Ol@6m=IeLEhYAqPy@JLp<=w!QEFQy+r zeWq8{d)=)YyP^xYw0z}LdN8+y3%F2E^J|1nYoYYzFmd1OF5}W*Y%~isS7mAo zVJOS$uQuhrSpBxf1QVBbSz#2*Xh&fv%jvIWlPkhRmepTp%DFI(W%bvaVyaMMr|Z15 zoQdnas{D;?3W-f)p}b96uh8*o_**ATW08>;ulm-)Z?*hZ=Wpd!spq!_e;bDhLs$(k zghk&qahTQLE_cWo+)B;YTQsI!QlDs5aeedh!k2oXHW!=9yGX@eA}n-wDy~MlqIIPkh6Ap}K={S2$IQvXET_&9UCY){)&URC2G}Bj%)?4(7~yU|MN68>wPbSh!&BsFaPK{|L+}aX*qC zmG{8N(qp(_Zm%BtxR&z-ynQgY7UDw~OjXn8*^h63XMF&&hJ(&XUp%i$% zDew-Zz&nxx?{EsdKnlE`^VZhBc>X-?TVXg!SbL>C`woRS*@tKY%1j65HxR4g@7$x#>h!P@) zc?HZNASsb}jw%%*2an29=>C2(&jFU1w}qJBnoSWHrbskP!go(S&}@n@z@re?VdS;L z{Xqzs@I_jEWURxm2}MexP~p_o9Ce<%(9Xm;2)Ocejx6qx@QEXkr<&=wf;4DNcgQr4 zwL`ZyAwb;Qk{F;J&7{kJ&%J zPCn)V>FTY@B*936`u?NuKsy!9r7GJFzGIzKA*B6DN{4w(_2l~*>L#_?>1~97(`vh5 zgNn)mm!15Z)lUm>DHgHWk{;M;)o%eY-=R%b^PxgO?82mnl}vSIDkBisB*2*TTz6;+ zVB#hbo`hUf4?K5#8a|v;Jw^bAwv&%}a?E4oxg|)jm*6BRi!$48qLJ)1YifyP*F%(fo{H$51#b+x5q$0NbouV_y>I5#{L(6+%F8H4YVFU*P}OLB zJ2yUQMPz#in@uUa0B7wt? zR!^ZdB=L=JV_CT8&IOF(q)+ks%nwq-PwM)l#y8NcY{3w;OBrLX!O#B^-wDB!_65Eu zH&W1itnVo_2(we(MrY~G)o=&p1>#Lr0wKDxFZCC!`{@qXXfcoGguR+y>no~WW6sHX zEyVbhGH|0SMm9ZTx#33PgI;5k<}-drb2Ut8km@ame6<-kn(xBL487B^zN-%}(ZU5~ zjc+S8i17=JSxYLj*MQQYO40);9YQ5NfUZsg;T1ZZBu<5gbVEBvH-*ZpFC zl1{H1+L!sP)9W71S9N;x#sCe|5gI?Ed0HKzA9tbWxx<^ZFxKhKQ+0ZaG$?YqS@Rpe ztf80!&N`r^Be~VU)qJc2ngTvrpz46q;>6F%9h%Sh)pbC388FQUeLd0v-J`u5zX%~6 z(0v9(^C2W3A>G=$@r#fL>wq3K@HHR4D@2T`wL?h&YGg6)k!j`Ne{lPM9907z<5OnX>g|25v>jfv@d{j zRP!6Z37Vj!)hQWanhz;>5pqm>H-6D0PiSwc^g*HTX@n1Gn8q(+u0_l##ymnut8>u6 z)qDsctxh6&lOBXLj25KS>dYT=zr9u`3u3YQp-DSa`5b05Ro_#nCVmD0g=K9&OTOPC zhqVuxbfZ1!1JpK|oNFdLCIM-0C=QBGlaF;bna*Rvo%e@Hx|_URB%MH^ues)z)7PZE zu-FFknfgCP5bks~ z^8eI0Q>byi6uc4~KX9k6>2s|wx8ADjy5aa*-k0-UTN+)`h1*c?RMm=WNoQR(QCCgG zwFpWhvw8J6Yn{z&iRQIj-G3`@4TQFE&1fvYNL(^Ho8K1AZ@U`2mA{VlM3+UYm(5mh zj8<=)aK%dMA|<|v+XpvR@0+b(8?9eE;rKCZ<|KX|@ol`7zY&+`%+{`m)~=Yyjg>S; zN|t{0WXw}H)36Dmc&k(;5sR&~0L@#qXtr`iv~tDGaHMj@bY<5>=Iy4o&z*evi+OrV#6RQb9qTF4!o-iH3#Y4s;%WcK^ox{l(DeUJn*ON(Uhf4l-9Les 
zk!|y*dm*!@@j_u}SMVi6FxQ`2Z|EXQseFqP`F(P}O%78OufYe+9A+SWhoJv}B$e;N zm#84al3XK8e%Y+Nk5-`oqWndIgyk=<@`8);7E*axNPC59lK$@f49>!xN!v92hu2JX z>zb)%jTo~&Dl|&y8h1J!5VzLZP;%BGReqOXzef(ybR!K%G!;ris{yw0*`TJT61tFD zCPkoi4k6jjvVJ4pg*u0p)W@hZynrlmU-*^kVU||DlyN0vs(iY%J(Ar%zANHdbMwUO z`QI2|jpl9B1znMBNWZ_mM)Q$()NdXvJDs47k|aB=+@LmDN6tK{T@s@lX`NrI3utt| zLSy_Z6ccTB5|rQdne`vvQIWt#PpvAgi-XgJw$PRYZ#7hct6ue4chq!|t~N)5S=>sM5`z;L zW_3n3ZWo4h#R(c6Atjup=?c$)s#mr`R9(MC*@rzIPnf&`;@!<>k+|5z(EN#TU57** zH*(_IRU5bioo{1~=ZT}cUJVc;4w081p8w;udQ+?UgUGyge~0*XWcl0lkA*(EP6Kfq zY4JdWF^bYlrIRhw`Ss&z5Y6O~Q!`z_M%(aJHKkbv448W*4F~jB|D+Dky#26 z$?S?NPh zhz`RYipAWSOK^^&VEOmM?3t>+^7xg!Xm<0}`e=5?4abecU(SqXuesS3 z&E7oi*h%+>nk1x4t4MGdF)gC2O5dTNJA|xp?dL?-#gil@T~$NU&ver#z2)SR==!sJ zlBCvu6;kS`g&pJDaLp<%cddF`ne}8T@v6H23*^E{fILYOJW*C0`^%*(Ab`9A_t-nH ztcwZGN6oJ!N6k0;G8!~xNS2yH?)D_g zQs*WLG2`{FH$jLwI}rh|uxMo#wtk>A_5+hxNKTq2@II^u`a^34J@-i6>OpeSGNhzg zMmkON{jciY1U|0oybryz0+;~?z+kZN3$YLjcS;l`Qrt-`kStQNP1~YC%#eZv0@NK) zGGWT5P1&K`#0FhAf*hx%Uz!&xW?RMe%Tt!tZk+dPTF34TfjmNEIi~WG*VM`T6*R_? zqbz;@|99>-cLpFN*_JXSzI*RE=bn4k?|kPw-(pIYCX|eH(3S{>45{xZ#!FUSD zlLvCqMV2I@l8g2n4V$ugNLOT}D~0@R5v1NCI4iZRpSUCgP3OUpcsO2qwalctkCwx! zsv!+6Ih4mM9I=&<6Ai>G`5lRkLfW$GYRq~&D$Qx1V!n_;WSF*&4l)6-QhFh%+%>KQ}6+|H*sQwz8WBpCz`XF66>8z zkvOj3wDHY<_kIfS#VKY-67bOmr z*ZQ{8XK@fR0q-7T@H+tq6!85uHVI!;M{)%PMS^Zb{DHy8f@Di|;N$T9rk_0qw-cw~ zo*)6gO;XikQsPW-Vq|LU>BO5~lUl$r&KMld!SmoMtD@e1r!{$-6$#nJ9mNo|endXP zrQ#`Wj`^m7P7!dF$*qy=hfr=EUB9ayQ+?quTx<@5%_k>>{~u?-A^=R|?8PgtQ2lm9 zan@*t*@8+4hMk5%15P!=F~z_NV>>Uk7e!m@MtVN);WtR;uhZnDi-Mth3@on--Repuo2j3j{){71?%$ z*W4zc;Z?7>%de-lWWTwdmKIn~%My`97^_kFD%R6-*Lqr!sC2BSj4gMpXHki*FxFGZ z;KwT+u~m+;h<1>a0Wu|yG+ZzJi5l1HE;>R+>S{;o+JdP?Q%F+R#_JrZ>qMVLG=e0y z0jquE)h27TrzVrJ8XMngalGA-sI*pl(fE-xZSi(T8t644hYqX$o$)S5Y;&Rov7+^Z zcU$A#S9`vS5y2ls)q4xb@VCeNKoxX=Dp;F|+1LiFm*GmwpawP2T;8 zKoDA%cRwZ&thSAJhtbqIEMcxWvhF;Fn_Dy3Zq4Oh$wG+!$1Fo(F86m3HSJhX>@FT! 
z#%jk1o8`OAQ5fC0v3@_N)l~lpzy7xbuGLbOkLm7$-I}*DKFNywomr8-&eg1W6!+6z zYo4rwg%&;;>ay_3P?u3I16aY*GE@Wp#M4R#J_h|Wf&H4?z8jNjh8B{Wt*LusjvpKfqHWm za+4Ze`XUC%trl`1{~(q|AM_iDSn)dwMPqAmk?paOCg@1xR4taRv5}_OktXEGr$n}u zAuJ$aEFj@b-GhpJ&EHhQRM$xI$v1Z-Cr?blG>md|cbw`|aIic|4{dN-r`*kQuY|v2 zBcs@ZU>|}_j_F-ic`Gk6GI?{CBZ=wf_N`@@3E5e^`M)`~fh>d_C0iNsqX-9oC@srm z?FXT}r2IUOC|8SPC(iPqOF7arUOEePr6fG;IW)%1{o{7Y9QK>I|3o4>j^q|jS-7*E zS`X8AcqWqDGC4b(Vsx{tH0mgaw7L3W*J6wUGlbW-vcqv44QeJqfnAR=Ce^OT@7~{p z@KN|y+lz0xqi{aC3`SqO+z=Yd!|J~P?6)+8oL1WJyw*5)B|g`A5y{1AIvLsf~UvNz|E){GWT<|2+3v>>N0`~Q0qB^ zhj|O;tig?wH)N`cm~M#o@au&;Iltifo&0TdTrz|_1Y`ZPPo#=lqzfSr{SVw^*S zOD`O_QP)PyHs^1%xh$`eIq}vR7g0~3RSjYmCT9e*Klod^58zfKxP;90VBGhdR(KTK zV;G~XC{vZwS&9WZEbOYBB5-+QF_uW?IERHThf`=_H3yA3oy@^Qi7fY={xrg-n+g=i z+MCX!hnC8b?7a5%J-_e&M#=A#{9br|^Ip|#pFu<>Gzw<~Afb+pOo{pJzL0x32`5ej zXzazUg=4@TPA+8A$XJ5^#u2gspTP*pVWlm4y@6uz$ZDI?1AFId_L0jr=ihfD6uxj^ z(d!Eos}eGgOvFHeg$P^jj`&;Ji|Ko-j>cJ#*nu(5fQrP9gTbv@CQjMN@f_l zb)ynjOcG@Z7gty@f-MX1V{}=Ka~I+__rE=Q*?oSJzm{Xfsd={>H_mxin2>dt#dZ>> zMtdvb)T=5CWLW?6da4^?YSKw1!_?$b#BWl` zY>6BsZ=s(DSA%~1iC_-OG0aj~p@M9HWQyYftVI@@fzd9a1oOB#oG2AkRJ;oIICkyv zFqB$RZvKvvwGMMeE>vV9o8ou`>~cGGg?!g!mou+QM{H#xYM~jEUCuOBjx<${G$xar zX{sG*VvaPz8jsfk6;%Tj)n&>aO2A9|lexGk@YI=vepc+P=5z*YC@U@}zzZ5O@D3~1 zuq;omHA)uGnwX>XJWLdh zq$<~>!)s`&wY-W}KU0nC(&g)5o%Pkqny;LluiOALtefWi8*XuQ_fG>Eu`zPSKgT|Q zVk`Srw2-nSXiZ>|fWeIMXCi7??42BiEtrX^Vg0+vG7~-psfDp)rzYbg<9)^Mp}-_c zfmA5O4eYC(40Fx7^^73c$xduT3Up%bq3Zh0=kHBbQYx=4tF4oWEwFta4@`UDi79H#U+?pHpitCHyU21#s3H%QS`> z*qFv%;3#UG2-YJNq{gHT@=_EdMM>O862M4jF8H}Qg^(_6q%fK7xhZgJ4(3TXQDQ9Q z&OC+La@L29YSiYW!*Dn_Bgd2aXHdXQScv(_6DMH(e>f9S%w|*n2PFRFlaSC78}5A( z_ka@QE{(pE4Qx+^yZIl^L=}TYgr9T-%aUQH_ThZ;lh>peBCBfhI`Jx>D7(5{gHouO*rxBa93$=>vqBJ%*l+ z2d`Rw(!l)kB?9bwE2UzD`5;qRn((o|tr&GQ;sd!>3WCjN5o}gJTPbB1i4s)g}oN;>|pBx)a=$qLMMBc-qa0U-4V-sV^$06A@0&ih&e(4Auw1DQJM=^?- znI{u%V{ncGrYl^M;Yfk^bO&|{XE1WQ0f{rdHPFUmhqf{0=*1DQzCvF#$S^s13f|1* z1Veom2R5VQXDBx(GHg(lOvuOik$n#wR%avXFel8<7^TgWsBt_NPl|3`rovFHv4Z<5 
z-GkX4y)s-yLqmqjBt(ZJq#oGFvs=_NCwO*I44wg z;Seq1A52AP0Y4%Z@Y}jfOW)v>y-HK#kQwrpZGxQn8fA1ld8vN(NV>9J#C70kxUs z%Y0WR0Z>_~AiQAQI2FXuFOCZmjE7{;6pSka!XcO;2G`gKlbs7V#X)6ISV|be9;U+D z@d&3l5j7B{`JS*v&W8G;P9$kgj-1x;p=$hFYLeE7dq~czpUJ9&S1zwG3Z?_2m;;0Q zA;h~!#itQA-IzZUtv$yiM$1weQen@L`1-y7{{EMDrU&+@J|fnZMKV(^sN0@P`ezZ9 zQ(&%GQeZZ*lmGEk<~y!bm#0t{{c|j|oJV|~cjQ@Qij=pmB#vn~k%|E_(d60WRN@r< z&VY_Njk!LhZ^xi~v*zAMj$rqfJb}Ya%XnqdFnakSUd`ux%aj#r!X^{gLdrLYltbpc z%w?IVEP@UjsG80dUfF%InI*2NeGk(SOmJ%REFH^Mq$=IDXFjs`{O*fjVeVb%y93M8 zfmCcwx)f{G9j{i^rTccxSM8<>5oe;Hg0jj~bZxo}eDMB-!|%tU)st#Cn2sL$qrK_q zgLD4(!$bv5x%+10FG5AAZy_9;^T)t16-AU(#~rXIQOMYPR`Ujes?k}@yn%rdYfD!E zob&oUve4x&fwHh+`3h%R@+Zw<@u^&R5z_^C?_*C1LsAg9;AvvKK976n7G{@RJmWX@bMRj z!(eRLz+n~&)`uDPuA$vq7yOIMAYc7K6nt2xhdRNp`BPz8?CCqGwA14pHE3bM5bdvO;$`))9;B7n+rbKJfs?HfbBjz zl}MUBpYb1`jGxv25&6GaH*)&)xFK@4ZW7*4rv|Z+O^lq%gy7zFY~&a3i+(6_@eDP?s}~}b7w%uESGdH;GfXPLTi zV+`YU=bW{vvOJt+U@d7X<6iBD3@m0hQk8Xd?#5w7rSDP7T*q%>PUw=L3&_QWg#i%n%D36z&k`zF=J@upeQ}gdJseD3EmO44C^O( z>q0P3F^u3pvb`VYlWeFNRJgS3k?Sh4h?Cl>G~^ZBSRx=tAd1S#XT{rFC!5MCMBpS@ zaDqQ%zty0K=H@6`uW(5HLb&3&`&8wauyEzx1Gu6rr=jwS>EnoLl0I0TQ=~t}_E>gn zQj_xu3q^dJL*TuKB9Ou7)gjXQ>D)3h!THOXFpZSc-qxLbYI&Lc_4)YNhoy{ksL?(K}Hsd7HuAT7}c+hD__8@sVWGQk1ZF={X`G zI?w;zVi=-YgDHxyfZTgSMr6aS>X#=q;>A(2Luw@E-l-H4U9%-bWJQt9jP#}wT4q~` z9)$>^2GMUymOsS1HyP_%Hm4auFh?}-_vmM=H@V(y4K?uO^Kd8Iu(CKvu7Cn~u*-{} zRjjDS_^` zGx1%R0&LLiuD9@Njo?+&$_h*0=y# z)nMDZxI`|Ca;Gh9yIN)BRUqV#6Co3npu9qW$ZgGvaqHb|)wf?p% zej0+y@9N6He#unl4!T0B7NnKm!y=#BnYqtarT}=>{$%< z;Y?tBP^SoTa?-vdp*KnUaT9A`J~aE-KypsD*i- zLJOO+3}e3^{=Kr7Kan0dph#c)Ano1dZQZI4l!^SA?OGr`AM=N_b$6VO!Gmifq5)rngd+SWo3vS(&(I@CPpU%L>o z1%mlz;pSj@oE=q&)gXf1Hv2(Uo`T(#dlTbRBhDTimM+vkhwtSJIkR^iHD)4r-+qB% z1kbUaEmF_i-O@r+8$=Y6Ge@4EwvH|NLgDHiU2XNyr|e~m8zBcCw%Re80AglVtl0#? 
zOqn!+IS7VTuUZ2@4tMhh!LnU1n^k#R8p1UQzR1QQ_$(V|n`)fudUpTC`126zsbA>n zw}!xFX!>_i7)Xv*`UWW@*a|_th%Kncyb8A)$G3A>I2YuEl1VmZG3E$`^h{Y*bx`Dk zP<+$Ak51{ZS&@jFt#fEXS!~2?&y-GK-oS4KW_==_3GqoDlEaYbnuLJn#8hA28Tgk_ zDrF8q5WP%_7uhH>ja;qu%Zwnn#MbJP*18d;j5e&BR#wk^;8JAH-vT?Ru%m34VxIRa z{bzZtaKrq+%QlRhsrhoL6@&#L;b>AY(l`iXgPCDMyC96i%M4TOIWNU{tuRv+6r0+s zmhT4``-~(iIB|q%#VtA@bQt+C!ZP7<04;-q8}d{ADcQm^paPKs$XWIUFuDl&c|V;ay)U&Hs@l< z2s>&tz0T2GL#uei=i)OPvci9~!tBtHmGN+`=WMZ-3RAUes=)cYnxu)2!FpS_= zImP}Zl9?-Uu6gs^r~Y-+$7#R3H(l8aMXH6Njpz5J!d;gkiGxG#jgTvb`0|~=R85h0L{lBL4cAHT_T1YfgJoavPMzfm?vQ3H%_CWEfrf_2@bem5}$<&u&AT@cK4dANc%d8^7H(145sp7&NJ80=A z6cv>AnlUvo zfVXILJy#FI62IkFa(ag%`TQ+)psLQ^;dbV|tx*vVLA59hv#Ft69SWHlrZ$DJ+>x>h zDML9irtY{e9)j+8)MALoXoxjY?b6B+_Bw=tW9}Jdj-Z@Y^`S-icT27YtSW=60W5zD z?TpsK!`9+zq%m^z&vG2e8J@L;oGxePL!*`g)Vn7ge5)$Vu?mHb39IJ}*MV>;qpz@385eIq~!4P~@(_J#eG9O*a7lo9o_$fWW3|tfX>nsgH z3fU*WjTDBMv6EA>YF&(>bQ$%;SFekK`M-0I=dEg8kwjX*1u9;vQsH3j@`gMrsx4gx za3T`5U|Bm}S*j358pu4cmt8QzV06L?Q(GKbx3R-yiz6EjY;qh%C@{4RB`w1g1&~*n z(*#8vrW{R`sG|{_I)yW(vfnoLB*Xfb*=4dC+T^Oe_4o>7iI@vp*XorjuWo~({~Gi9 z*95M;fl$WN>z(CV4e|d&j4}OxXMUXeedTUOk<`p4YFu@ciD47$+{o+8lIg$0Sb{&} zxE{mjK&I#F$mde%HjAYob^U60|0{dvyLSmm{*iPAB;R(eNOAMqg-6}nCN8KUYZDi0 z6J3b7Fv*U5jaVT&KwK1OItpXyJe8nKAWnXlL1e6+zL55R>-5F%v)c$@%Er`sP}F?5 zcFtexrZ4Dv>He8bv$0EXN7aQrk(<7NhvUM^J^V)D?)kdR5L@fv=SN-0mqHE|AMibL zReV4ll;=?ga-5I#KH{brb5(NkYU@KqZ@KF0zs(+5R&r-e?l8*zqP67zJ?DL)CBNFW zXg_aZJG0L$iLpe}uQ3#*92L@Q#Rd4I2>ZuxHEh9O_q*)Z=VG$WdXjX8~- zJG1kuv?^R8vi(zfa&(0=(XJJ~YT=VI^hO~e-sviE-zSph0C~5u$*Ss7&bq7undDPT zZwZgcNVf5a3x)h=j&p)ilqUIp8RvNtzk|CK3h7IifS%_3*LidvP0Eym#=G_t$tgaZ+T1|iM6fR!5T`T_>#eXwZ>f?AxtW&&mgub%LUx`l0&!XOfH>;P zZDSM>414={LN#$FD#u{TlDpBoswZl4!tAbS%zwd(EO7XoViVh)lIEk`GuQYyN8sw2 z|Hd$aM)u6zC_6BJQ!};4N2y8L(kg}p;CD2ySLXbCULD-{&CT zA5;qhvnpFzG7{ZNn6lgh}t54Z>UzZ2c+q6ESF&DRqwLOw9R& zgB=Jlo z?w=js=bc^?qW9{XL0EZAl!BjVIbpspCV5&lEx2Fg`Xq1*>9@B0n+Ylqq_A zBsnrQrR(2AX*1<82$vW?7C&?9^sy&kGN{j!@tsPZq+wXgFwE;}Ri1Sia5-mL@4Dpu#d1GV 
zJnK0Ose;3}^T^XP8k(!y-CSY|55|tn=1NL};XaSltk0YAvkmk=MK!6aqO}=5+M;<) zk<|YT32tLs^hsM__jvoAFU9A=n_h}vPR^G$&yM~{X~&g#I=m_6-}lWJz~5dAZxZ8_ zzYuNl=gi0&WWnDOIUmj$3GBB5pIKu4IaYdrO>yF(HtIn+?N*jc4Py`SAWpks*Ij9y z8W!DyV%Wh9lk;sZd`c%g9){f>xZ|ov@en3vAK?13(s)*Oe4>bnkTH^|!kuuH6EWNg z7a6ImjnpEvEoq{Xn(qnkSV}FbDJWqL&h5?8h>J{w(Ve9krfiJ{kJcsXtuS$uiF6H# zMk`EIR8aPMgE`kD@f|twJ&yR^L|=)< zfAOJsZ(?n{FVP=gi*og&PGx4V#o(kz10utIY9%xOO5IAABxgVxpr1!20Vl zl_u7=G`MqyG-tES=qB8}fVpX-PBSBAeo3z;qxcX~xfjGwBW$`M?=u!nz7w;r5KW#_ z4hw;^d4Gmhl~yR{-7IG_X0}?7TW~j=iNaW#I(MMBgQ5xlj*ak-1eEO06zis$MKa?v zLA6Ng|BDI2Y_w8>G6L}{{{|!ep5Pw{G)8y`glTKrZ3-qKN^&MC_T$7@pkB#TQ39dY zsltxD*Du zH?st<<31BOHU=rCBs0N+r+*mNOudbST}7Gh&mSbqCO(1QFhpGcyuAH#*Iao|s<=lj zNpG(L`u}9#E&}8hxXvy^Zd*>TU8%geX~#KumiPfOuqhG*Bg)CCOnGtyCd()nqnw7q zGJ>q=6CVJtBM?(PS&suGHFoe0OJsUa)$5BuA;wz>0JdJy8>4IlN zUJxkpmN7_Tobx<^_uRjK?uiG^`2mCHLV!g`Wgee8k|L%UF)V@WhnSLcKETkq0AToB zF<|Mr62LON0Y;PihnUDY?-S0N8A%XRj&~@u>iQw30x=O+Fdh+8iQFm*#zgT>v|x-E z@5BnmRN)+Ghb?1VB^@wjP z7~g=hcNC0i#5-LDW10}tT`;EkT+u)Gd=4~NU+ZutWTQ7hNk9}qwGmUYn?!>8e<#RA zdiwuh*k2L+HNncrt`MG+qsbrg-S4893cm?z6U9WqZy!WNUrb-ecu`HX%dqnIdWfKw z{iu9T4j`=WK`3>MU-=uB=M93N5d1BH)Km&8g|h3+W=V$e8c?74Qz)?Xgg$u+B5{HO zrd#r{h%^2)Fh}w$C^6*T8bF+;iW?Tfl`|i{7H)s7ePim5`{&z-Q>85+Tq~-6q%~E9 z7Rqa`m$zRlZ(sD`>Gj&q#Q?6*n{3;-ScE$WaW=GHuOGZtKe$-J=a5#@e7$`j`sx=QRR^e_jrnUE8Z`*#YZTn(1pKG-G-s^Rn zuGMW?tmSi^R^N8LuK!wH|6)C#8?>s17Y4sDxY)?2CatRDdbH-E~c zYqfogt$c3NT8FN;+%PuqwjkAUDw+0T3o~DZY|dM z!j3QOSnT0buU6Y~y{6|{P0wN<(|WC;?|S`) zYxNrzH}H9*U8+rd-fX7cf~T8XwW_+s+Ze7jna(17!^6XUgR5cP%jDsPRLvk6$Mz83 z$OuvlO3q!5FNtlGDz3=4k?o^zAaD)lMurKwu!(oA1Tw4#c_)HiGFWAQEfY$H?OlwO z`O1x<{v!_W9}_&o$G;&sK=2O?|tkg-Y_k7AhO({EgHm@;{IBJ?IrxT(}RX{;=V=P}@L_(V{6R^QW~@xL>pD%3PPC3a2Lx@u8To2pxzt`Jj`9o>*8ybD={#cXWOvGQW6NHoAfT&jLkI!4p6 zF)pM-p9qsQSd*&hd3o!6;EtPKPoQ_nrvu!H#bGmNlVif~p zT0={!<@R*_wpaG1>+fFVhk-Ie%|EKe9ZO$%(~CPbrZ|#DF~Y2$Nz*MIMUZoEZF9Te z?abnL;hw59!tug=(Byd;;j**FeNo&L&sPWB#gLw}@bN=pg&DNjgCqLXUYJte_cR3U 
zGi8QMHfUQ~t}hy+@zk++rqm#3Rj?oQ9LYN(dD^mv3f=L9KHX$)28L{YCQ}Z@k-t)d z$+w3kIO1v4TLgPDP3D4O74Qh00~w;A3R}1)=l|m5B>2~yoSf$1NeZ`k!N28387GJT zzzxM%^WSqL1X1Gd1%L02Y7x!z58Mdhd~ba^M0jAwJsr0`@2MohU=Tm&p36cbYiDk67svs{Bq)>$=QsNi|5!?7a zqek&({d~sVtx7`@QAm{+*y6{C)M1)Tnr@IGlM9!E}t*h#{e`|&3d zqf>2@%I^Dd)(JMI&j)JoZe0V(8T*=>z>}Qj?xwD zY=O$;V=x6DKRpT4(DZzpJahW=qz+xBA;~IJGI|Dv-~|Z=PbH=vpNwaGXY_HgvjrXE z@e%lVq+nSlAbxQ(esTpyF%GOB@A1tS z#agWHg>9F%%|4v2>b&R!eVK~(UJLiWR@HmG>b7*%ZLb98s&=GGcSuafwQ$F4RW0+a z>*lJ~U$445U3K?2tLLf?q)O?~&tAvxzz((Yl=hA_=|snbiU5mg%9Z(Gp&AAf|8W#FQMTvF(? z#S_9M7#6gY*SLB+O6NH474e=phOWpsi)mJ|Y$h&k+zs2xcL|y~z7FBLpVn^FHc+Mz zn+-H%W$eNOAd5@N7R$6i$pVztLX9_lo|2{;q1u}PZ%G{+yZ&a8ucRKWA1=RH;x8%B z`htNn)s)P8R5hN0c$S}W(q_{q$JloI?-DQT+5S!ST0T zU=;D&2F1TN=qbXl2-g~QFsE2WQ51Pts096!D0;pfiDo2ljBIIB8PBd!+{ttVLu~8; zz2f}8lYU$Wq@KO zN4ttoQVbRT#G;%np$!Kcun!mfsu>Hx2rDmTVxMx_^&;>vHl47Q4hPRbplxJ4Nd-)6qLy`6w1@zQns8_KB;X)-a6VZuZmtNCt8<*9ur^`GtF5 z7srr`eot9LwkX&Zq;!OTNec#oU_W*DFH}_DC_R!b3Uni;8lHvXmp+!I^K>;anExoU zwtiMeehN*LPoPjdPLO2iY*U{YWLi#lLMY$LQMyxdfaerPm&02m za*z{OBeu*DTPCq`>MF5dc{^g^Ez-*0)a6lrAnhSlj*~xsqS9qpOHO8u9IG5TMhm7k z%?T>skapfU2TsK5(YRJAOe0&>FezAP(in-FI7|qd#Hx8cS}P_6tE|BdqB3__)9BR^ z{Vbr;YoYO@{dzYCj*O^}G3Fe>vjj5U&hze*1ngn`0zo|iy&Fzzxgz#xah(3lJ5Tc$Zkj(vqIEWFdlaF2~h2^40 z58HlU;+B@@r#$DtVb}kdW!O$|CxBB{LH`Z{ruz#tok7){On_e?s^rI%pQSHgmg(PR zUff*u)w#EYt_l3>NTib!pk$iZ@`|%44KHmj%`Gi0jeLxW1tGJgRU^L2FdoS&@`HtT zqk%s5Cc$B%fn7+EhXx$Br(W;qF&r>e!vRw#<~*d>#PQP2ie<3D60rgJVRYS8z3BIs z-Q&p?m2}}%2)iw5C5ViPT|kMfL!5(WsaV;$Q`3cIKhg2rMw@L#S3jvD!m`RR5H6aT>&o* z>M&Sd9#!q2ATpER5;^Q|b%g*%6H6{qH!Rq>_>2|LCu|DMJgL8{uIpJ>g5O|Wd48BT zQ&e>=GKPsqRc#c+v;1zsvB^XszslZ`8W{aY8l$icp^eB^C%DRjasP~hm6kte=yfw@ z)cCMm@GQTjAza)oj5P@|iT07S_yYkkN-QEqf~w7trPBn8$)%b8h|rj@qQOQ%Kb)OnBuXP%LnGG2F~BNP+d3guTyiH zX-ARF`r%oAOC7j4o3hXp78_AD_f3w(@0)yR*RJ@;Y3#v`5t=C(pVUW=iClrYJv}n@ zc;WuO4PVs>ewW2IY(|wwFAPH*(6kxF=ux9^a>29wmUwnlxTNTMZ1EgjFSes(b7y)d zrLsod!xQ5L>!9C`k0=j1nXps`3@NbQ^QWBMBSNU&iTBjr&ecKN8A0$3$CP7DEfI98 
zoljkYaVW#-4HH8klTtHsq?Yj|!RIf1I-wt*OeRDKGvj-DYARDDW<$*Jx_?qX1xB3F zMshLvP)nZ!)+pRykN^{ZIhIC?ah`!t2(eqCGIvB}6rRgL>!ED?L2i)e9zwesvBi#9 zxQRln$O#&;p_3YP$WR)28*ZYYMJ9d$IdYL!qy$l>2z1FRtgnllpoCUBLWNpIO3<5D z%1X|0p-H{*=+!FgOI4II#J_483N?fVHEY`VQ8I;I$Pe1J7MVh$|=Wa1=!UT12Td2hcSLbjP0wy5wTg9A7^DosFYjV70xjZm%(V-sg4!Ss;` zP#_~yN&R~)nrxoG#5}?fuC^-h5aYrGB7rx?JCgPKQG$;U942^@;DZG11e_`Qukdb= zpo5^3;4cv|YK#HgY8tu5d|vvx%=L2~aIwATy<^(5kTrjTDO|sgdY<*4Zv`LRnTwGo z?!i0eKG*S{;*XoN>RCThna355RLD87Y|boAp`5+M09MWcq_TMDcGk!_xNOe8WpWlp z94qG_QUMdYf5^FL*_>I=LS+_LMpn*6NL4_P3^^Asn=|WKDCa;Jec6QEx zj&-;A@@861I~OzTx!h%yI>3-tYr-N?I51f1JYtr8Jbdp=6Zq>KeVkOVVaYcqdJnq@1pneHis0*8P% zTr!(G(Mn0mTZ(0}n%;|{8{v*rcqEtoZY6j>y7>C#{N`k|HXS#;mr#T8?HBO zNjJa{1;i$v8H+IkVzzIqp8{numb(y?_5 z6}7Xm>-AgH^;=(lCSAW1FH}b^?)y2sPz-ZB*E_bQJGP}t@xLV%-o4O%`}Oua)9rUI zRCHlWd%a>XT``y%+MTZ0vlQ@kM35>Q(kfc6m-na3`xoju7i!xVDyv_pzf`{vtu_9) zbX;%QoNn2?(7Iuvx&38|77>|JY|gmi1}%Yp>U>yH>aE&nr8wR}Q8t2Vsff zg-=}iM7A+Pp{s}?bfp(&=8#z-Soepx`2~+tZ2WmoVa)nfbR&OL*1R_{h~DTpWwWqf zA#5ciu0ch8%0qtQN$<1XWxfcF4)Y7o`lhVY#fzR35V3u(_?&mI%~vzZrk9=bCBrt> ziIo=3T}pm^*yNf^yYmZJ&luKIQs74y4}Mq(D6jui16ldR{W%SRy@!KyM%2$ zSU`J*url)~BokEP?D|1G=!Xb6-}a9hg^*Sv2}=pHr6U0U!gH_Y0U>)%`v?TioO(Xb z7hJ~9!8xQ^b+ZD$?0LcC(SEVrubuV$8{adYm%RIjVQTP8$X+LwfOPp$+-JPWNjY{* zqFk7}=hdwJ2p*>Q7o4@$@nepjuPK8GU4%)n29rC6?j&FBqo?&9)0ot<>H5iEYCk!z zr6N0jYx|XlesOAM?+XVn9h`mWa`jxS=X!bXZ*BkEC61K~YkEEUZ=s0Tj68~GDX!WX z5{c?8XeRdX@PiLMuxH=l!@G9hx9`}#;UkBRd{9p?{;vUEGR{6p5b)qih@8pOAYRz? z>J0QT^9;0QL9NlZ40ucOAnR%T3XKFVGWN7>n7v}ihMP5o|$ z9Uu^n5@)D1n;Ja!DCP?3(%8%lvPO9Em@%{^%;x9sm9Oz^Eu{6q19IJL*X7uY z5M>-(*swL_Z@;|xO4nB)q`39nYoYQ=Hp^iWdiwO5clpmlNrR{#12;x?C?Cv&D?|rY3+b8DGb#wR7L@BhJ!q! 
zYMPCk^?Sf`*mKhdk;a9NHH&h+d+#@={!>`)ZU?uEWf1zT%7C!bpot51LU`GV~EYQabhXXb!&5QgXsGTw<1#;(53)=*BNLt4PZ0ny`lcXl_`l_xM#t?_J)W^Q`EEU!XeBGHR_mnRO(>>Jj-wC zv$!l_Gl% zMF^F%iw1AR>S4895G7=>aR`EbT4VIvI`#U3|8AoLyKmS>BmRx5H~HmJ={^JCS$<28 zTOQ9G>92()7~SlYEq6@9K$Dj3D?=oY(i@JLm~<7>uc4UX6Kv zW8+pVzVTF5Pdd_jWpg?LONCn({P*4%Sg(%n+SB2V%SUjAH|Jl!;J-^XHZvbZzSa-V z^2-z1vGG|UZm;lq(U>ClWb!f9X$hI-tCu$_BpQYT(bX zl`jHJS5DCYk?O~H+x*Zf>hV3+<8v&%43y9FPUv%G$W)lYTyWo^@gW(QqN;S2UvN!g z?*4Rw1x{im&zNU?sDFntKGc&g1v$K9dE-MH(q$CRw@!ZcUEDSUE9;Hc(GR~giegwl zZh!e(`Z6eg=WMWNv~c!0md%-KAWw{9oO#+VM+IG4i=f07Ibu08{AxbPi3D7(B*KL_ zfe7Wg?)qIVu}&mt1;Q+gES^SwVaL~C7Xn`si6tpN?K_z7m}fv0VE~!(cv)Mb;y-1KejbMbHuI_w>)v2eV^iw>fi=t>xa*fA=uWEG^@Fu7gMkJ^ zIuSFn5EI;)vd^cO%#Z<>&A81_Cd(Kyi4tmai)GhsbA}7>gK<*394j~(K(X5RbFh5Z zoFUfY5U)(y0Hu#b_ zL$LWU?*cbbe*pE+A0&7mKmq3AT?P8z;D?0a07jc5tu-W z2#-&HgkNTFv^{1Hy|1AIrq{0i)4MtTb|DL}>A)NM=`h|}-5X<0dpF~4E5ke}u7l@R zh({*u0GX|_A+iu=L5`eo&_ueh5F3q+<6iimVe=Ba&V~N~QmlrJb~E_aIA*ux9KBxO zlFzJ);kd`mt6Pk?nO$JuA<)HVQ@gzt`D|c2mDJ2UwdnI&%qH^fLYiVW#SCT>uO_nz z_rh$#6>9IRkt}4(j-scnAD-oRE3vYwoH1<3mB}Wqpl`Z!$X6#A=78qnV=-3@*HVle zH9jsEJj?IxvQ^3*CA63-YL~x{1<{WKx`*4x8AfoJ6MU3ULiI9e)F_Z#@GQUg9(Tk^ z{y2Ff`U%$NB*7n`==$Tl8zU&h`OtZ2=5C$y3?q1yt@9+Gq;(iHYBY#k@GQTdH@Bg| z(dXtSa2Z~MF8l*>4e~W03Naa^?!sh{G2+^vdo+SV!y5RU zyAn%B24M(w0m|5Ghsb1@DNl#nF2im>4^17KJO<`HYP70c45CY0qKyaVjzo6N9=OG( z_?@w?AB7qVXn5fz(0-?}%Ql#Du#Z*(rm<n-t~ zhIHI@m^qo65WaXYUUoHNm8Qb+HPeZ>FJ1w^C3c!hu`p-qHZX@OM{E=}=>mzEAvO{v|SremX^IW+^v@jPGszG_^51x6v|jB0r+XLYU*Slj+Db838n zE167OuqFu~Z;+lv z@JSe@N+J?0(y1hrhTFu{;vmd|>CgavdQ6|3pnz6#sErC?{7zf)@yRpeaefvIjQHfK z0lq$^j-6@8rfp<&l$jns+qMaAw5@1kSf~MH`B*b?U6O(67qM&j%eP#kBqtVNe zyqp=myYA)<_+2)9GF9OQ-R>?8de@8o;(6@EY+ zHIgQF9h{ed<%Vl#W5KTBsx{vjM1 z=8zc+{`EJiJ83ipH#tgHHyTQu~ajJtPY)oy84O{W*LhN1~0&l><7qqr!4`1$_{lJS6ps%{Y zEL9oQaAze)Q9tX4XZfvQ%%dPH)KBd?ixnY=3hBrkQT|9xCWYcOd%Ew5+Ghd3#=_J_oUt_EN5O?(0sEnFk7rn4>pC}F_%VS*kRjwxrT6o1mMU{GZGZvBdB z&6w;Z{WthhP0mYLNulxzm-Ra3Gj(#Q^hyGXV(L`{6ui`{32F#x2`FBvbLi`2r0QZy 
ztBH5b1T6#*Z_@NOf_8!qf=+-;lX4$w5}AfJuW6K~i%A+>Ng(5cwOU!NS;GX)xd{$U zJg85eL|u}SCNosHjxZ;rY73F35h%5w_m0spk>1V11<}`;hI??}(3;p@chlCl(ZV+LURO8iK~r&x*YhHC~Nc-o>;#n5-!;nV7{e z((PoryO}O5>2UT8F?oF#L(0q$o-!wOVv9_bgt6t>Zy=FIP9^odjEPB%OW;)BhbVnN zfpAO?GNtM()f@LPz}Z)0(i2AM?&XVOH0T7=1;%KOCF3VCrVBIYekM3eaGu}-!Iues zgWx5Cs{~&o_-%r35d0y*e#M%BlrQq>jXk!3Bfgo5veaR zEKBe=1T-$9{|Uig5&Q!I&5Gzgf+#_ZpoReF0+^x%?F4HH))Qh6b(hGPZZQd-Ago~lpx)1CJ}*{M|t-cft;wz!MPmEk$oym zIAP%ke?-^-vf~xPR>&V&>|C^Orua_iWS<#N?9hJ`xu83>WOcEoE&4nj4_Mot((h^E z?`w@e(B5}ldtX|6-~XdUzo(V{KpXtA*X!~AIH-9Z_PpV(_ITgW^1%|remr0#e8gkC zV?1yF3vI*qwT?HuevkL22Cx|Mc^>h6-@omRkmk9^^L=gL2im4Lydh=|@MD5E>NKSJ zfp+BkTIct*_U~(hZ+HV9FI10_0OdvC_SZ{dGar~MX*nOfQC4^H;m?2Q`47!j|Eq=< z8|KQo&WEzWW>3w`SXRR&d$-5$iOiH|@%czL;_+0^Jdo9J$rhJ-T4pA*8ZOzEV$WU= z6Y-j@@p^9O11{N+$FpX3C*N3;E%tkMsGN3WBVNzYEIxrtHWc)<&VC}R;gXF8JY}=% zU^T&0maXu4>aHLkT(YHJPsL?~;{t(_KzFKlPujm{(WiM@FPA_d(BsX9eE94!Cchp0 z%0N}BdVSizKDFsc+JA%zqchu#gkH~j)&dv4G&tMJmj?Nbc9sJdmOH}l;gT)(dit** z4KCS8z%z8Yp5+_DHYZj$>!YE-y4%xL+frJ^VzAXyu~0$oM@7$~j}IGR2l2Po%KeQ% z$;Hl3esCt14pc7%5V;7i&w<7i1j+pyQ(Nv&`|oEt>t^eXaz56Qmj#+kl3 z?SX8t))Pv#Zp&)8W;gSp9@`*XZ*JBCr5E>HxbM6ti`ul#wqEIZx&5WJSq%@_y`HeA zcy`^D+g{%A(j8e14={!rXil|mNc%UWHt$dS_p=J+GiIOkdqOiP2QJwUdj{};^5c?y z)Z_JRnfV|abqhP8aRz0?vr!GJjoGLVU5iiSk}VB-s@ZP1Fnq%-(&LgX^?BBF=;0!1 zIJm7eJFPU8eAnlxRB0YC(riNOBRwwJanA?P>X(s<*Ts)|wLn*@duQ6elTA}IbJplx zKZ=23HIb5<0EQ&xB^xWo1VA`0*&3gxPfgH1w#G(I7+lz@d!?^<<;M?s*s#2E0@bkSxcsOhTbJw#x&~ z83OeS!Dg5)3bfn^Ho?G_W4FgNyoa zHkJkY7n`)AwKtn1fwh<&Rk7j3%PSsB;_`@nd2O%(59fHck? 
zTGC>`^+Q^qA=NaH_77Y+m-cUe7B1NshUx4FjnV4GggeXojcCe_zEt1C>GnrbTGh-4=d?$%!82ZbduLYT zHLC}_p32!trmo~Tx5sL+rMBo`9!^b;BiFf!3Qj`(rlZ uPZr1G9<3M_tUX+4{ST-750e^r#Pep7`?Wu44T3Doc*934eVGam;QtTvMMVt& literal 0 HcmV?d00001 diff --git a/entrypoints/openai/__pycache__/run_batch.cpython-312.pyc b/entrypoints/openai/__pycache__/run_batch.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e8b9934325ea595434e0e5c6c57073bd2485d196 GIT binary patch literal 21409 zcmcJ1Yj7J^c4jxwcmpIzf+WBvNa_WO1oeJUl4XhyN!I(}$I#KQ;o7vi}qAhu? zQRc^f=XN&$Ql#ZfcALcQ)Aycx?xXKL_ndRj?f>TSI5}K#;XASazMJFzfgX&@t^*Ig zZsxdmIFS?iI5()>`9U7frno6#9yIe5XO0UA%b+D;9kjA%A#O|92ki{D#2pFepp(IT z+%@Q8wA_Pk+^um>qHM5?rP<=;iSog62HWG_M8#l5qH?g3Jv-u63E!ZP!OnPfqGqs$ z!LE31!awLw1O@|%y1}|c{a}5fVXz_5IM|qI8f;254>mJAcRZMA8Ej#&C*GQ98*F26 zS$s)i>EP1DvcYBSxjeo+v0`uqgT3)kqJ6NP!4>h9iH^Yz23N*A6XC%ygRA0QiB*HE z80?F$POKSR%X219YM0i%$7?+}xL&N5Rt=lPnr{n(8^qdgbAua2|KKJQx04eC-{!=+ z_e?tGX0cxC78|7QVL@yJ_7<@T&&|?SDG&gP*dX%5RxyZ_ZDI>jTJdX>o?-EJaS7tL zGYZ8VOMyYT?BX({J}WMl{Nf6VlLDX=LR`utSr?h_LZT+F-Ca#m#8a=>s zi{Px6c9!7m9Qn5;1awedwMgC##+zVorcBy7@`q36M(iIw8GC(WvDDFuMd)vuL!TSD z^dueh!vI>Kj|k(=b7%N7d}#BpD8v2`uR8We6Ou9>9g;#O)zNb*8cRlx#U<6=8;!>a z0*-Un(W3{CB%{d*1yA-PNjWxpeD?vs6-qKHCnG14$?=FH$tNWlkwWq{F#&4jJ<6HU zq5je1u~Df`+0E)8Pi^`@uxYBR$O&c!^!gGZ1>-g@1sP(H5W~UV30cPIjGqZA z6XWA!a#9J#$A+SDmdq$99YiAudcqcO(Sf+u6qAhEUJ&if_Nk>k=R7%?&L2wr_+Jh|3b`*cq2bQ@^t>dOA2d0k$p&$A)Q` z05>>vA}U9RlHhkVi1f(aeLA9o=24r*v63Yv5d#y0?|9hh?9-aj9ejQ?9!tcKMqMb4 z(n{Ewkg)O%DXNq8SR{$+qyqgcEqD;cO6Z&kC8(o;^;=N`)L-iqOJ^tEYRa}M%>Gh) zX~gP1`eg2)wwwl6jE!Q7j7pM7)f^7atqL3QQZf`ySvxyv62+cvM*ly55TMZ~c#Z6> z{maW}D)fZbpPk9ESEW&-u)2`J))yir%xzY#Apyf zbmQcu6tDr*P$&ttACkafhT;>V)XDHugY%2glQnX|vC*J}h3*Un78^~$6?hyCo;ra* zNy<_*5zzz@g}QHaELmh4p^#N|)8LEhi>GQv4%tN&BW|xH4#i`Hpr~eyo@=}uM2bR7 zW$^K?F3hv8@z{7L1f%0pvh(!B#?6~v-6(a%W5>G2&m>Qbjdq=k#}i%Pee#*{F;Gyt z#vo8cV_ovZXyh0PJmK*(YI!31szk*^w3S-!LILem(n%C`j>`+qoKTw;YV&q?-d>Zh 
zsGjk7XHBNE^|LO{;(y55E!Npe&f+S;sx4w|SOlvMRPtWmtcfpcoOLilRuC#HArzSL z)cx_=DNl3WTc7tF(6KBQ>%%gxtTE>aWj&#*%ceZ5&kfu!R%^RiPn-*%US*aRcy^774sTJERM1wUtXX0R_8ta#cI#GONv>hAy6?JN{aDv z?#grf9-#ZVAqtGk41j(Q_9A$POBMtsk-u&#iV#V{VOLSG7%2!VMjFQ%iVK!k^ZwM5 zlY95>3u^LWC&{wlhTua%Hb#ONEntvg66{1t8Iog6xW-&+0zEcbR3{rcY^=(ih}8zE z943UdMsS)ES`eJ$W;|_~RWD3?UOYE2o0#+(=mWrF~ z4xSyCFgHh&n8Mv_!>fiTmj-o#8>}iFJUbd0P9!J0&L+o_5cvrXzI;}SN5>UOoZNa= zmJ(6OxTD7>JA!Ay=p}~oigQl?8e--H8u{;V5=Ul$L6gWsDKv{Fgo0>BXb}a;3T4m& zWzcrU8nUTArYC^u9mi6n9K~YrDymy}KOjf6*}Dre+9IFkB7n3@gxD7;V4vYaru{Ny zAg&HZh8C%o$oR3Sd<_bZBE^SG`c}0^B20RZq#75hMc7FG*C42P5nwvk)SeeExbm&b z&I`ZqWIa8nvQ$x4S$cUSo1f-K0L{NdKIOp>H5D7es{Si#Nr;y>;wEoKP+D7uGI0z` z9j$du3s$#~{wC$r>uYzv<@!C>#n-ZS{~x%XtZ$)n4P2e4>Wif?Ks#r80OX1m$)ipwjx=bx)BO4m-j-JDjfdhy4 z^&Dk`17oBcsj5p8n<8YHQ0ScmgNzHR0!`smV8LNeNf&{uoIr4n`=z%!=WWS)TXNo2 zS?{VT@0yHz&0Y74Ka_GCc4jy1%xyTB-EeSf!{N*eFJ?Eqh#}+fJ+yK)b@|GgcUHc= zGGBp3I+*td^8U7q2WDzpA9`#xPE2~|te5lp-b%fh%Gd*JknKkwfQOKP3{~A3i6q9v zi8#UTNaVGNXk1H?x1eJ3GZgHgfOW(XK-V}%Mq~8rDT#t_CibXWz z!cr-}R$MAa1W=!M8N+-X_bn_0XEEvi$n=i+-&iad@@amgFldX}iVJQ@9NB4@QQOE}cpbmv_#MKp7r%9F+-i>14L$frtnM2*3wZu16VH7OZ;Ize z&8N*@<4%!HeSUi=534QZA#OW|8f)o%YG*i&8ex`}V#NQs`4k_r?0;+{jlH|8>#?1w z?gUvTlEa;wu!!2F(V;O?G_iI=!Gc*)ITeGw%odG}5w_~l%!EgE}hJ<(VQ+|x#AGt?Z1Z;0a}w7gt`j{nWGSMo+%V z_s)j5H+**!aLZO_J*#GIoNwhLWO$UXTJi{71QZ@+s@flc4F0nl4?NaLY2j2=`{OS_ zadpPC>Nm>Ypb2km=_}{|n%}X_`eQSon|w`g*nG2=-?80(({CoY-q&lh-0b3edHc;( zW`ftcDSkWOYqo#H34lKm+{pWpm+!S$KdPuCxXs&VHh;9%MKCbK=36|Eytm8@w(y9* zWwZ7M9Jf4tuea=0nT_CT8(}t3u3O9b-ge>E3M;{3YaeI3wT|!I=)SeyM(|b};y*T9 z`rNjU?F6eX&Cmb^2;;!R7M6$>r3{rWth8t`PDEyKjT03Gam+DTEU%jls~cvSDb4qB zuYeOynv2{9Gw(|T^0cWqPkHV_v%onx(L#U5?8Qv6JTd+)+Pq-n-3q+OyW(O?uCgs#*)~na=$6e%K zS&q?Uad5@)w9x8RlBx?@g|Y##wLn)nAd2Y_TeYmEtIVV|ZIV+Z0v<2gjJjZS+tSvN z!U|KQpXSoG>vjXLgsMef3+-w9NI?`V<`W&SLTivi$#Nr3bS{kfJx+8XrNY1~-i`I^ zd))U0qg`+CqLu!P8ezTCTB+9u`R%m%a zinz#MXcxFMR;=ZZ@cUc*-qcrN>seZ#t@O_3ufi8DFT4t0*rdgS%J1UWGlvrE?I$Q5 
zD&*KT2YW$T;=)Ttxygc}u!NI`CLL)@+WaawwY)J|X^e)EY6B|XMGr6?Y2iIuq;Ww3 zs?{K^%ql*IXKOs1Y~l=~$a^#n(jv(4NqL_}M;<`8 z$=eXb{t=uK7KELEVREu1V+jdu+^?GCV|W&b`mgRFG1WW(V=yd0#Y$5R!%l4jxjUG0 zIS=cb19*56n<$lFAAEuwAp}=MVXz=W|56b(Ev#`8%mJ?c9zp!vi26#||I8e7H6IhM92baHl_VU@Q-5>0|bT-$#d%Ah|otN_U zZSQWsy#4CX-TKuttt(MZ+sgdPweN4gwms9n1?6n*%y+JT|CMX6ycfxIY(p_E+fhne z6G~}pLMg%Lc+mQy9!>j#+V|^E9h|Ro)(-H|tdlD7DAU&e2sOcfwypoMQK83Fqvw#? zy!#O-;6K~E`|&Ihdc-KP;8UXcPY>HUZ{Rb|?Fi({YcAE_EnhO@t$D}&wma`Tp7+(~ zebsqi;Gx4(QTfo#)i%A8csp^Wf2yWE=WW01ZU1FWbG~N%jpH}ImU-sj^r54f=U*C@7Ma#wu->#7OrCDKh0L4sNX0{@Z#U_@3QmX3l9WLHywPx z$$rymCfHLxAXsiT`1>1eH`np~g8SxrGr^lS_uCyGt#$$aXe-}ub$+x>0DOxN0{>PE z-yd?`YBdwQbaQ{F?c*jF;EzLm|4RGE?E=AJ7wCVynIAA&KkgO?envnQJ~8otKM{CP z`^3sZyS2a0^GO-sUs3rKc{`j@#rX|;peCo6dhJFWfOU7xJw`!{$$S!W}7 ziw*I&8O_^*b)d#^+sO}j%5J-C1XtQn*6n~}m&J0sb^R{R@{?MAXPx~g{#Js!t*Fl( z&VtlC7BgVAOlRiU+KFK2bdJnO7Kd-)_uvKa$FFDwseGChBkQ ztb$lC3ziQ<)`Bg;&~k(lSy5O>HZ=02&5%2F%}Ud97Pdz~YSY(Wlw8on7ObNU_Ll<1 z;<^lJ)cj?V;z$+xblzpuazP2gZq!J0Tz49}9;5?vsf`Fan)%Bm3(+Oo(dD=2kH?O*QENMB}-PpA-W|}u}3w16?gHI`Y@g82VPViKDvVRI?!6=MB zqb{t!zWn{dqF2OD7F)?J(4vA3?2H+5=pFpV@!R@@)~ws6kXFm2uF9wn!;Ohmn({nH z$<+ieCIvw{G6XDAs0+dvi8;C*vqX~ywl0-v3iW)dnkY3V_Nyz@b6JL#@3-^1VC{x5 zMe>?Pwb(Fu5MxSy8NvP(Gc1t6qaiU3KOrT^$>Z3>@Gu09Q9`EInW}xh;MP%sy;NZ~ zfB}ekBzMRO1Zo9wz0yvIq^x1m{^KEwDv(7${vE-P9n1d23muX*1euK&rVx!nEAhsuuQ}7y#R3zzDCfdjnJzHr1 zU|h8_+ajDAscaf4Y9+ZnibsvA3{advR&|-|6Co?JB5H;W+8u)HB69(X=shVTj#)eu z(&I~IET?AhphD#DA?<(Rr;s0=CSv(Gd)0!r#h-TU&$|K{SIdmM@~u5@?#cMOr`=m- z8kT(Tx%2yeQQdlxpYc{-jOJ>VXKR*E)r4+%e&&4!*1?*#`|d!_-IR4VU3zWWUCh4v z#+GUK_L<5?+NJk4=De+0Z|jtI3515Mw|&ank@K$2de`3AIpy7&uWNp26RIHM2%fS} zDZo~KWnHeKBU{mtulDBy!Mv|EU)!FqZT{S4tt`Wqd1Vy|DXzC&nTB0=TK;y~-z=N< z9-66HcICA{JNbi?S0|@xw&lFrGTvFm{gQ)S&bdw15}JsU#(9?dp!70*1n z?;Utl%=&n?j&p2ztk9%*qt}UXe9O%bR13E}%>;LJ57e7KUd9j9T0dUC9`Gj(5IU@% zGztW_gn@IroF7;&-1b@tuH_MbJHWzv%fJ%b?Vug;s!jJRQ5{T(k0g}ibIg$>dP5lC z9*}qMT%UlV0mxX)U=lGw9MBbCWGGz4B|T|3A{XAtTbL?Lo;vGJ^DtiOCBXP;!rmQ> 
z@fLDJ>3r;BH#WgLubEVrrnp34iYx!q?yw-;i%+n*6?*|yi5*GL^k&3n*_;7O};aeX_& z(JCcB3p8*}sv@=>3oV_q-Po)-^^npRS+IdLEJ!X_$V#GFN3EQD>6I6+#-?1GGQy@` zGXv`z9+=0g$VCjyH6k$=>&WoL=nxch-P3`()VOFwvZj!(r!~a{7XoaU85kvZLy#(L zei<8VbByK5!iY`Mi)L4GoEF_thYQm<_nRg`%!XTMm7tUE6 z?IKq)+J(`0adxE5NuNR5m>cqe{};6ysWIpl?+Y2O7Oj0j?1l(loU6j9r^IjtgX#5x zp?6Zi)@4E5Dwb;WCY*}}wmEo@iBR_aGVO86R}jF3ML0Glo{|5AV*ZqZA0S9AKdk%9 zB{0XCUmQGU6*%~i2}N_93(|Ujjb~FBtMU8DraEKr?giHv9g<|S{mNwOlWF|Q)Y}X4 z26~~4cc=m}8a8Rz9(KqgU#B!Seg2#v#%9=t0d3E)rB=6Qkh`4DALPG88jj6y%C7*h zdGo20s|j7LeZS#a!&LQ#T=|A`yMAe_$lJX$j^^o>H9y?<@iQ6wVekXH^`V{fw_|($ z56^yg^5R&gd|AGxQCnzds+Z-f8?Wy9FRs7x+;~0HzVo5kR8t1Ow5nR{-us11whcwRn>zbF@iqM zzdG-($~5mnVSeX%H&QL`oKT$=s;A-Z=c)y}n%%~EH$GNqvb<5#yNSQV_ihj__cj51 zVw0XY!2{~S(zR$I&lF?qU@LvW6=|d}(?LwXPVOQ+_9_t!u@NW*j-0Q~Km z*5#V|vQ2$cO#>OB{(eI+*U+79=*~6l$u{iC*zxa2n0MA?S~q8%n`g>u@?|SBq0N~U z-7sElY05XXynEvEiOlk6rkb{A8lU}8m^JgB0a(h+<<42o>~KOvbaf+e?B&@Kr+W0& z0#2;tg8Eme$ed-*g$TA*T!qx38#ESNC%)-fc>S0(jZ%s(dw^QuI*ze|l)}nUNZ{n! zqO=2yg1l42xLz1Bljc!4S-`wu>f_!LhD^s%|5vaRh)p`GJG;{GY+9%W@X6Hc^|t9k zy#U1?e3V;g+Y6g)+(==|%b;8I(0pQbv?U8G*Fwe%(VQ;uZWv7pn;=+)Eo{Z2S&D1Y zE?UyIlbk$IG*ke`j`l3H5n_h>h81ZG7#~JxF3h`t*)%T-kBALE8H=^Xp~a5EO=Qvm z!wA*On`|?1S-n^dt~(543Bz?_ix9Y{dASa4U0T4mjx00a;vFUzw_&gmJMLAmBVdN^_jj_RV$GOR}WS5b$2)53_ z=8uGt)kePJ-Kg2f+G3mmiC%*jVk6n~3f|9TdAj_|s8G7XU#Ly4H)xD(FreZ+U2f2w z^rpQdn~Et0Bvw#w%U@4BzUfXoM!Jpo;+^$yMY`f#W4df)TQS~%lG_a^?G-DpR~bEM zlnf){$Ras=3OOs%-n7Sfr^TFU&q%LM z<=IS%yn#>tj0Qn zKVyD_#t|b2xZ_a+F5X|SM*IuKX9F_kd7*@&Q5fquIZ#dtr{L6h(R|@Ogr*DEUQ7oIIRC^Rwsrh(2fvwc!*utsn!3@Z3#l&Ly3pCz&Yr>c%Yld>xf`?NXfP8-Cz50EZ7wrx6Z)2}2S?755wbAG zA^E8da~?_DOhVq$eVm!hs5ayFIz5%0fP(8YI{_78C!nx>OsAMp_uHtu z{1Xc9P%G?1aXM)?d`5NRL|g<%9i_O!OulRrALa?lkn$gf_am8QnGr@4qp@cm4y(K~ zv#R2*)gc%@B1yr;PI*PgWBOTn9PnlgC{uz@T84MG2vf?C3}fgs!wWYuwESl6p$ z5=fcXIqU;;Jh)J*Hog(nPxps+rvOYKEY0A%@MR;&8;l z%5k{-iZXFEru#E5c-5<;7v2qd6^oi5v0G1LU9399G4vKYK0u62{x;RadK?oUW;wFk z7o1O-C7<>HLN$z2FpWvAeBym5+FT^2BY#A}U7}Jxub+2KvrT|`f 
zWO!8dvSK5pF^p(wOodi~(kOUXYlSHa4JKx|c_J{Jv5kX|@~1$3nH1!2LQB-POzfQ9 zt!g_|VEVuFBW67mR#fB zY~$X1s52LOE*p9-zkFqG`9OC0z~@y?=tqdT z*p>mZmyoMCOC#-s06Ak{`WeSNcJMREx;N)*&-&W0?){);%DweNKI?{G4dK>s9^b{+ zv+iZt_8kvhoTH{d`*WKBs&1lM&ROcB?VuDAPs zbpY=2o(6d0c$>*?=MCn(%d_6)GrqNXe{;U|2+V8MH4n{jok1%q0{7hwId@yu-S($@ zZj^smduRF7lEc&PBQUoWj(X)izPDa}^W{uK$6ZecmD!Q0+DK)Fa^B5%y_@wi8>!4- zrhNGmW$s(N%*?81KU{ui+RA@ZlX4~WXe}O(dI8Lx#z=;cXm!KIXdlr zzO>DScQNDbA}^=lk}EI%*~>q8`N}>xU#;mx(A@dZVyme9KW2xgZ1xos7dXJrLYQ$@ zyw&h#!|ylE)U~`Dz8uc9cW3IhEl= z{LRXygByjLt6BhmRKXwIDtuJgC*a|B4S#UEaJzPQ6(0V!z2=Z$&T#x8lQqK&1Y0@~ zpIOZx@(P(XE`qo72(u<0sae5t$YaYo?1-6Tx21jUZ{hdgAp*?`N5>fAT!NqE;Xuse z^H=y3)+8k6qRq8<)QLVjl!g@-djz!iGw))bY|(8fC5NJl`#dB*uq$mYx_1;L_M)@M z9G!q2I09AUKo%Vxg6tk5HxB$9x+5{i9GDr6`EJK^jtl9h3nA7Aj!nekIFhgJnrVAo zn)MQi?&rznvzMgn)5!sBZFrc3tjXIr8-!0?uoH$6*osL4hX{qwRKy^1577xBcHF3y zM71zkL-hmsVQt|J7ezJs0s2GrF+^5CSStpn`ek}WGFf)+Do1*$*HoAoV_Ds@A$%n6zuGKt=QT3 zTFGtE(fMeWV&RE6C+#;1X>4z-?%BluCEwF%{h=AqkGwr?=9@OYr`dYbE)eW?AsQ!& zdRl~=ek;MvJjJ*01h-jx*4u7|_?`~;&2}5Xt8Iv<1C7+cNJKx^uIV1d7eQiqP?ve3 zn5FDwbte@qRfkjD^FFSje_2ALy)63f1++3zlvDK!VCn-v4DDl5p8O7)s1k8)KY``} zgr2c@Tw)*VP{Pq;Lo(T@W!iAUhhT8{Oxw&h#dL?0Z%`^J*=hyquU9RiTh#58NS0Hz zS|3;R0jw~6$zdXi&t$Lwm^gg zYi;X|%+e(rb5-={qkwGC>XIkQ(Z{|aT9P=ZsRN;>p%&y*O)XgVG<;3=Whx5OI@G04 z%fSRsrdz1YZy~IAN!7j&5jhR0_{REcOu$u9gAyjNu*n7 z+nq(-g?I;J6KwKpoPty_wM-ZD^)JfEzad1GQq0&`l3%39RTP9N&b}2Lietjh$mAk{ ze@+R3kzhxN=O5TPzWf(l`8}@b@3^jeT>U++_8+*iUvTbw^jQ5rxM%Nio%guaUzi-c z=?f0Q0}4L(aX2P;kK6lq-1GOi?f1Ag;N0U{?{Qo2aqIrx;k)?Kl%we!KGm^@=PNS} zYaVa}eD-ydiC+cZit^X^bFMF}e!lKg4#DgOE8lRX3XVQ}!_}$>bepYo^IiE0->jML zOGBCVt=Xm9*wcNB~=?dK#{W}{4#zEBSQBg5qdgs zc-D+NBZm9zVcyOA^PVOu!k=FmX7~G+S}Mvv+hpc9vqZX6BJL~^_gQxvas*mt&A8tO z1q<#F70Oy?Z3NjltN(!m&}VyjJMWyWiJADFe7*{w4`9%JXVt8QLC9ovKCo%QzR=@ECtC|pBI!nTcWrff zdzt1mwQAbF`sen*rR2M3E}yxYeE;;d(?7Fsx^E<}p0=;KFO;9#No3v@E*|}^^QS`n LY$GRVC%^wMqtHkV literal 0 HcmV?d00001 diff --git a/entrypoints/openai/__pycache__/serving_chat.cpython-312.pyc 
b/entrypoints/openai/__pycache__/serving_chat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..196ce68ccef596b4f7d139c4e970e7a935a45145 GIT binary patch literal 47992 zcmdSC3wT@CbtZc7A_)Q{KoTTD5(Gd3Bmocr-*1W%Me0FOlC6jBn6|@96OcrT1eF8G zwqzi(liaD2stuJkG2JBBbkk1gTDf5+X`?ossgj$fPVUTC9FR7SQO@@ zzha?ys+gtIP3hQEKUIRKW}$S^FlAtI3KoouWmDzMU${`QXqqxHf6+qaV%1dDV)az@ zqIt@^STj|_!nF&vi*-|Vi*fdZs*!ty8Uw-YM^*Z_2mmpYkuZO|>nyPqi<0Om!@F zPIWGJO?54HPjxT$O!X}GPW7_<^b389{Zsvm15*R+y<}lL7bBTNqm$pE|_+g3`>QxCECs)dIaPfeX#JUw-qy;m=sSv)&+cJYy^M;0HQdX$Bk z7am*u$ka!ezh>d_#i^+&1*hbKM}kkhu8>B1>PaPcgbUPuiVM^Q`(Ib3L!Y{$a9jR? zUQW7|snXH#mC)>wU?|AXM3(qe345Q2NUv^1s_0DcW0!-W*&zIy!*ijT1$YWiQ(!nk z-V?!ac;-BP)*lL;p9=-Y7UqJXh+Cbqja`_DoDD8sT9}Ch$Cg471Rq}F7iS_5U5d;t zg;Es=d?v_;XXq`RPpbBOFfx5%E{xh-nT{+iElkhOEG&elXG4)x+2YK_U^Z}iF5oUq zRZ@aL@R{Jk(xo6jy(rZu)kML_@QX{K zE14vfd@vM9XFj`xehE??&~qi}{ye^P{(O*km!~SJ>&7xs5dJW~^mM8!Z*VwOpFcd^ z=c&fspHI##1_Ni&vtt*Q=4OKr&+s#gdrM4LXm5$9gW*f)wP4D!J4-oS%5iUj^4BvF z>gIHuJvm6<h78z#*k)L2`r=_Qsk-A4?AL{1TtZ%Y?pL zyc9W0VDJdR>O|<=QmQ)LL^;}tU}PpR6PZaBW%AKxVq>P1Jaj1-8l5;3E=Rn5vl+|4(20Zg{p!ajadMCNCHq9 z&ZJk0au5S@COE^-USMDlPL-ZPvlh@o=^b(OqoFIT%(<&UK4r|*9KA^)ZdIywapua? 
z!D%#YZe{^t*%=y6RYiD=8xTM?5YslCs(uDh(ci&oUcx0b^JW&Qs|shr(+lVyc$+e@ z(7?G!nEHDb6MTt>m*NM3slu}oYLi??`sMBmSnGZRt6h+z6%MPL0{d3ejn}zzia^z;N~g41E-(bG?wm2;K?|(EM(R?jN071=DQy97&>k!gHb|`s_%d~+^8D1_s6ivu;9PN_ z4JjO24s9I@vZUB|OfbcOE#;B}K#Oz8uQvS^3*&5XH&RNKr@vv2 zrkBES%0Mqc!oygEXJ#*^N`j0!n4V?iL8@SpVa8MmpI%a?`K5(ms#L}XNDhRSD%ssF zmR+yYvl3B3t2L+!{s16=A0_7?Ib-CElXHli!{i(x=O{T7a8dl_ z0=&tkaM41%$wWC9HQ=ov6O)@SmHpZ0yfN#SeMnW5_0Lzx{_HcVy`_}NLBwVo1o|2+ z#+y!#7v<(@ChYk|_0i&6s@K!)L44{P_KWK9mZ@X31aF4Sw^F=i>WZ8*@#GjBH}A;$ zWuJ_skh)wBLu#k&&p!DB56HDS8%OrV%A#cf^+gwkTN%)V6w$I+Sp;v=Z@yLbWuJk9 zXj%SpWda)GoWE7xY%a2I-Y@&I&p;uoPf>tiz$}M{3Ij#C@BhLwv{VN2%Zw}X>y*pN zKKZc#AR%8bKcnCc=c%Q6Mj%i;j4U4Xjf$M_$(AAeB7L$?9%~sgv56swvdG^9RP*}G z8-258-sqc6e!IfC^(WmL{ul!INpen*!%zUtZ%`1RxpyPW=@6&}B7Jgj9}8pnBBdrA zn$j@rbk+Yz7+BUG2%ej{yb!Sy^09~0^pRbnSM2A2F6~X>LHwl(FU~E_O<(K;UYQ96 zmKIZm=YdI=!XT@%I}yU+hu!v+CVUzDl~mC*uqi0GY5q9X{UkXLk#m5Yqi|AsRvW^0 z=jP6*%I+hDQkBz0Jnn6277M0~EJ99;C6qW$MP%?|s+w{M%$++oEjLmkvQpLQ9@rBe zPL-XPc8=J_$ki)IV{Y{vQQ6uH=j0E?QY#`?E(KF%(ozIs9lbHlf^?U|L3ziNDhNJ@ z-C}?zypk$IM_pRtLH#|=FFi{mlF~CDwo;T&swNX*7K;kH6GY)9V0QGb?$VXW1yqjW zxhqoYaB$&V4!E!`XE3v;-+#STB)oJPt71w!jnxfX%<1VcA!GY}zZX83Y>O;jYP&Rd z$rqX7(cQl1F82-eU+fFEEzCXLCRMiWnT3VLHf)ypE0>ngm~b05wxOB1HmuUL_h5TY z|D`MZ6DZ_r?S1rxUxp}ZSU3rGz|UXc?zuQ^>8A^YBC}ZAB39Ihb#Ad`RJ642=s0c7 zJ+44oykq37t=H9?MYUqRM|yTWL{GQadYGOrv9X(;ww(g5u{qh$DKvB@8^(o(aSF8} zw4o_!9~A6^Vq@oyifb6U%PAX*>0|v)5$76U(Zo7ivaVgIYZvQX@!=DQXX!vZ3yWt) zCf$!IC}R7z_b5HxqKD<*BDS<9Tl$5T{$$I;Ld(M&EvMlNyHlXas) z-RPaig}Ree8#Aj-T|!qU8f>ePWXqU<|AsMfbZl8Il^FN!6O1Dw3e>v5cGpd z{ivWH-7@xU82j$2l&DfE*VsZiT1AVUo;BzhPaFA~(Sz63Zx)&FVtlz&VVY!0>G+vv zWpZ*hLwO(yt-+tf-`xW`j+^8Z*c1VK11|Rdq6!%&e-bRh=aI)3lrO3NjYh-8R8iH7 z=17LRna@x!**pTuXog-OYUm7=lA&v|@#c%Ne%Z%YJs@s|W|3oZ+$R)T$~i-iWWPie z0o5(FOtZw)h(&GR^P4Z*6M)yKT5fp^bX(q9=b^9Ul`PX+s&sFG+M74sUrMYnQZ0u@ z3nH1-U#>CV5UI<2&y0nFi)2>A>}Z^=eby!mW#~DU!!`tjB##R@zg5Efm6@L%3~GCYgUexePX}G zm>6;@)^1a*GDbZV$i|a>FBHxZm7Iaa$a(n-;$M*c*=JM{xhTIyD*~lN!OLakFX5Kq 
zb+wctR)rp3kW*)$d_BUKvhT7lN_aj`o#hdURfmos<;Uds*(XcMXWwPte0F;-2hr+~ zuMEHz{bJRTXC)uo#XKkbv(IR?Tpu=*p3i=febG{RRHD^U^(~{^ABeAf0ZhhFBkB;7 zV`ra$rjKXeWS`vIduKz;oP(3ABQIp)r`KV&YJO4<%|4?Qs2|mlRue6o|4lh{_8F~_drW~DbmJ>(u={dgxJ>>UFwv?PRvrV*5G{KW6i>{8R{nN2 zC)vkWqZK#hx9k(x*Mb^rz%eO^8eS*f5PoM+9^(Iw^p@-9kH5F)&TxI)viim?=9*`f z3anQ#?XwiByzxl~7db5F89AEq(I%JuBD8B^zxk80FZ(>Fc5?jwSV<18b}IW-_63UN zK4K7_XZ?ibdiW>pu znsZx)8aiX(8|9-LBEKb<6V2e_4_DTWul@Hf<7=vC74FKbKQ5Mcc4Nd5$R4xUM<1TB zhrt6_nBnb_CA+jWYY9`hzc`cXOfYD_5Q$t054N=tyO-FPerb>A$GKozU}-koHgjpt z#|O^^ab`FhY$Mh~+bnkYkzghpB(p~@1f@bR1!w2Z&EcR0TXDbrEDlgv_#!GZ8)ng@ z%4AaKsk%S&@#NJCc7|qub}n+k&NlAgZ=3}`CA-&_c>;@zIDO)0!u(kb+|@@LiGk9H z)-Nr9VL}ItcG{rKfzNCA`6N!UkK*_!iZ2}n>=JyHLckU9Njz!l!>5Dxa4?cm6T=#3 zP2lISqi5;hHN!7AQ3qw%=hGLT#qK)HSSW67mZy@coCyTL3Sp<+#Iuli-Ki>KB+rE| z2dB@`33ukewwNjej>v2zoYG=cE-zwN9_Fv0p^Sl&DyRIy)b~gwoikr`N!`AZ5&D!4ta(hv_ABG$?UU7J?x@j4Gt` z(n&Mk&qprsL|=ffRUq-DQhFK`c{_*<3ZzW=7pU@|B8L_Ow~9SdB{+=)U;jdI1_#vP zRM91VZi%0ZTyYn$@BD92g6nX=QAwW#1@&MrE9%g@xe4G+1F5 z|A97}?qexEhAlXIQECWJv`$LH4ha}fCRND%!9c1wfQm@R2dN62iqi2SF{f}sdx=i` z7_%>x<|q+EFI7UH)3re^L^!63=fEtY;|~b}rPBhUj&QWg z=2`mmLO!2JX$Tm?>_9;-IdFOLQmT47b4a-h2qmy%uyTO1H3HgXcwo?*DoV>Lge8XG zRm9?n*5HY7;E7I1)uQnbd60pFrelUlR{ZG9!sQ@?wv_fP*lLnjfBG_jc`^M~Fh3WX zpXp4M$buN@_&6R4N>UiawVEO3+cc{9^b9<5X>$PyTAHXP8gyhNn8M+cH2-k)DxH^C zvyr)f%v0K62onZ!CBak`i<5>Npg0H-u$e?K%hLvp{{)qx0w|{pRF|i5E{y|aI+#vX z;!rE+UE)cGQ{@QW&8H0WPf{-Cy$n*MhyZFrJZ*FMrznn&PRzk6gro*?Vzt{n0}9hq zfv3$|N|#f9${?RYr_)RGq9~14f~kU;OW@Z9QYswrrwZVpEh)x41}jvS`oO6eg`;Fy z^v24dCnv_a%V3XYzPs!B5LKUMK2J1n$^baW@tQQF!0wc0>gVW_cK1x+qZFtn08Z(F z>oP(&DT6$m(%67eoj#_M-xWa#2L3G-OQcw;I8!BHvAN8-o|J$msxPIO#tg}dG8py_ z6Je8-Hj_`L&~#<^&(L>3S{PjSY!8Hq3nH()d7nJ(8-UHF`Mak-ue$E%l;)553UUem zUmA}4FT^?BF*CMihhXnmZ`rU9CTj<;8$?}gT-Qu2#p9no9A ztQU0kNnKMy*Cdu#CCi;cxieYbCX~0u^=&)q;|lG#f^kB-gt{)#SiRm&jL>dkgqB`E z4sL0!^_9Ap>ykBXLQUJcexqjJ^<&%C#-z1fu(l_yBZ75g!+IcTK5%_vyVjbl^$4|| z>yz&oYF6BvO+A~2-g`w{S?gNgrm<_w*u47erm=0y*s@l*X>120vCNz_Is~I*^_hgx 
zC)U)Xm;{QMjCW6NnHyIJHvJ=;=KZ4Ck~F&#W*4dhZ*_-I-La!oRGw1YRjJJ7JDjSl z9IZ80e>V2%Slr-5Gjg*vnUkjBglTxYzTsx^_VCzWocOa78}-Nj^u+r3O5l~HmzUnO zw%tClTpX`Iwp~%3tZ*bM9NXY4H;xF6Bci7x>De!M_KWWJr2CNIJ|wnxC)*zu+8^F2 zO zZg(e#P6$IM#Ezb1$7!MCG%M?f;5j0;b|zaVgw_es<`%6@No$*6ZOc}`-YT~ElP#k{ z%jixGSJ!@*E2%608COzS{;pPCRxXp|M>b5uIbTJ5E#(a6 z6?M|=6U@G~i?{ov+H9Cl#;YHS>mLRjXe%&!mag0Ln{^ZMBJs2{l9x>0{PUQ~;qQN@_zdhw2OQlahLa{AUnNhdhX_GQKLKGEq)I`<3C z{dWxUhffIyPH#BR2*!qG-O8y|1?&7ap`tBb(If6=^8Cg~o2t;@E0% zu09p-IwUk5-e@=?dfSuUqk{M7J*~!CzDx%P=;sPgqQWCK`;yILLi3miFd9!f4+zc! zcU8)ks+$i1h|qh@2ZZJW_{d<(EtuU&^FhIUkiB*b=I*XxeIhwc?~m4htMFPQzJ zx#gZlRau4cEvrl#?P!%~vZAwBM+Fnu;U2NUx1;6C8t!o#fMhvmcEqba@2W5i+crni z)+^Y0*Pl=J9TWPFZP+H0HIsm$GV47(XR@v=ZMN;-tT-Urok@GUU~d;~UeVzbZGN$- z1DQ79a6;2ib#nsaHmW$JNFF#V95~Bb(1dy+lGxlOHn@@v9YRCLP6;ACqPWWyjw-5t z#uW}#38uc~lW}JsYDK3bor1kHX&(~oL!!Bv(xWSOXCXSB_}*D!pSD{iP- zo&XXmt4kUiFlejpw~XF*A5wBnBlkX{u1A4I?PXtHNQ=$VKcj=jezf%Ski zlX{Dwx5VqmHudAIaz3HLw_dicTJKmt_3bCV@x=Oxxc}fr#n_wrv0vzgPYt;{Ic}88qrA zDj&5fUQ(P~cMn~}SL!IJGsrrX{EBQU6W%%-=;qj`o!`xq&E6(;7{B^}>DPH0Yin9X+ z@y1atUWC?uL`yhpAMtaE5yc~3bz;8?{tcxP;Tr`C`P~%%O|JrFz3JCJ zTBmxmujbJz)mugd{BM;}%(tpE_KS%2nleing{cs`T_OaS zWH6%wWGaW!Uv0Uc4xr7$eG#7rK!k}neSs5o0@p>r{4Z4xS);`%1hP?a>;1f26J z1%qStuwZD77q#-lqf9BnyH4PU^GJVp$KgN#UjeQ^H_km}21iN_ewc!t&J;vd0p-;a zgsE;2mrTjHSBe`Z7LJo6gx9Ls$(mbr^-N~B4vE;=%OqsY_W84ydGHJ(_5~7&wlAHt zhajmP4A`H>4)0ZFm)NjQ$@#$7`k2zhbF2CutDN= zQ6=OCY3P*jF_EK~I%L3i>ENxHeWxMJ`OacuW6ERTqnh-<`;ev}J@8#%&Z(m6bILOu z4ZGxX7ea8K-;bIePAQ&egT;m|rHG{zUVNfs8$JmKn|G1?Z^KEI>>8I;$>mEVeH(z} zJbDegddMx{4FbJODaCA<7`hUPEe&fqI3o+#LrKwDuU>WDKacK@^;k~!8eJj%@}a_<6QhD?!Hd=i$-b~3PKu4PUp|-MOCy={=xFAoHK4i( z1QRX_sADKMRtPymR(8ZBr=JH1?)?F<6mq@{#OXY=AfO=>94n4$!$wHO6+fd0LqwXM z{C)XH)!}bOC_4Mi*UG-^6Vh$U?0NURCHqD8WzIu&Q60|Rbn}_hZ8-)BWxt;8!w`hG zWBOmwF8!W%>0?DvEw4?tLW4FIOKlA3V(LhPTn?71M%kZzMvDUl6g#T9rO(v0BtpCq z_KReA=+TmhTMo^n(&JAok|Jfs{LW^-;evyz&SH8Mt zYYI43eV97pOex)t{58ylJMiP>j^O6i8E?ju_uQx`^EuQCW1#p*1F^HJVjNOcd5#D2G9y)2i0&WVm 
z@-Y7&;ez>0EXHESS!EgrSF8M<#_Vh3;FDpG%kObBZ2S-Z6|~CP*Ho!OS+-3}IECSp zlnTfL1k2x3wpuW$C6;okNX9zg)GEPu+oA88Ij9Rg2fht>d#`~-oWVe0GCBWs~*RnCx@mG-$;%fj=NN% z%igAdzaxjOM(>dC|A3QHEj>M-Dv_4$G%J+z>ujsT>zxTl4|Xf@8f>%~7f7krR&SLzCCgid^47KE z8|4Gn$0)wx`p}o=*D7uWHmzNM-gP_to&GyBe>N;OdB9yFX|NMJ4`bg~AGdbL4c+mg z?t3ScoO>|oIxe`5V`sN=0y}PZsi1GVJ{mVPVc*^0Ts`<|VO(GTE>=bl#NXoJY#7_t zhvP>+B6NLZ+wG4Vo3R02F4=iTk(=|oVFhRQztiaas{2*9Xzo}a{(kvi)qJ-G1vsjf zCz0L2xMEe8v@`Bl!=}Cajq>G*Ek}3KaYS$&5zP&&`w-Vzb@TZ0#LCCsEk$nY-uU6i zgwDrAZ`-=^wIgw}d)wCm<_%J@J$SER?-d)n)(;DfL!!kE0X=8c@^LT(nmXRGxJ5_X zmU~ci58mz-+y~#7gRq>Z@m>Mf;#qTi<-&T|s|#XN+r5&)dh3o#>#V(}(lu2rpV&FP zt6BG2xu)Kv^QhoFN_-0HucoS-Czg+`Du~t9xOzHK-LmECe#h?H@{NF;@OO%x!{B!K zo53?^^{>so?pi zX=o7)ElESCVCYO54hV(=w3n^U@Dtg_xV-8&$7u6g{Xw`4Q2bk!hYPqL6pS1xQvFq- z0xthUgi735V*2v`o}BNHL(JOQTs*2rfC}(<_upA%3BW8h63Gz0`REc^Rw}a=6=CJo z29%%-6!~yEq#slvUH)$|T|j+a5!GGArXi*eXfpm1P%Ndu+9dOpOIe}X^y8Z&L}RgE zR43O2vR=lttQXoBGe&fB9J$Vr?Ls+LGs?!}yBxTqa&kPRX@7{yNr%8YTsiFAtIxByRnlACqJQ1n&Jo z8VvgT_*3+nj0edvlMWq=C)8lAlnWb#>{_zx|luEA%{loD6><3 z%Rc3rG0M0h)`>9${9YW$>OJI-kr|~@eDBYeEBjtFM@x6* zktdFv2iroHMvIpf`~-5NUG<(Hn_>I*1mZPkqwCB+*PeO?qtg_jeJA@xnmGGvJWzl#ZO@`8*@k9 z`DBe^O;KC4iFzsKfvy?z@roNHJA78oBm3k(ig4mY?)klV8oB3*|3cw0E?}0|3rLA^ z^N;UIidUqa${U&|M~36_)ByFu2=4ves>zH)E5oU+s6oCJaOPTBx);1r@BByi=7Zkx z0e<}XYYKK%Gh(zM<;?vlqir(mU@h_^&eQkDks*{}82=3yW`{SQjAg9-0k!K0=~1_H z_vef{`@m4qH79tz*uoAnu%3rc-9f_J>kmUSo(ob``zdEBeX5h7Cx7{Jzpyktp~?>%Kh~3f6oHp5*V+TM3FNjHKD?2d!UZR zvI1`AzQ3%&!T;| z8e}nu%8&pIlV|4>XeP`7n6qXu zYZjx*?aj0`HuPb~J@$ZJe6b?~!`}QO%-i$BcK+BhJn%T8$v{K?+4Fd&R7FiWk2gtxjX<>KxcoguW83 zM2m@jE%?TIN43k7f3mjr*@stJ_qo&>u&KVnqF)8WV&kCCY2mP^*HpjD9njr=gkKc9uJ&}Hxm z2BT^C`ITklr{MRNJNbZe_ve&5kRO`)hWxlAr=5B0ki{nHP zqTWLfV^s(Krttw514T@}P+2AxzCEXWjtN|VMg~`GRyC`nI7ETVUvl^m`P-7le7n;3 zzF-C=aRXWH`MscAF1YdEvTc?rF+c#9S4Zdb2w&cl310b61LF4QpHX~C{_xoTkpBT= z-54m$r`Z#5e+qO#KAbtu$u*ZTKWej|AIV?a`On|q0-V6^|FD=Ye_z6|mGYKHh9&Id z^7nXGF8?Ol8*K%xlb;Xsq4zqBGt(-sG_mGoMd)v_+dmNX@I5*3;+g-#{q2_f3j!11 
z1tHA=-u$7{sOZELrK@A3z>{#%-!I=^23qNsX)LTbmZ%4A4dNfn;r-?v*Yt{)hp!u< z4I%rzTnz;fV*~3$JKHO9h?7(95eew zj6&@5(&rYjLs=Y>rJ%2k=Ip=@Wp`lnf0)fj_F?Y$<^Fik4(-~3u$ic4YB%)dO^{K@ zi+TAX+P$$@sz6)paHux_Tm+{x?;Hh0x5p0eou}6hBlf?Q>zjT4tIXaG8E?dzvCe_V zMtDCz*LDxzuukUB#E#^0W1~l)GkJt2$-gE~cDeXQm6}`D-S~^N#-#CC4iW}-xi=ss za3oeAX_VjRGqM?SNUj74J9o**Y-D3(B_g8rGEB(N*ltd-ia5mqJ$S~<2R%qGR{!z( zE9qxe4-&QX$|ZcLT4ppK|3}smdYbwh=X$wl1Aig%nq1$kw)(Fg9wQNkICVCVGb6k)Yp#Q>dL_9M06rQHy<;&JA-MCfuDbDH$Ojb zdnR|m>?B}zau3WFPJcwM8(Q|;vOoLepFns%n&hDeaPh&l14rA89%;(K=VrM#F5(=F z*AS&f7$j5_=#gg_+SB`GRqS~F@*Y4{9LH%uA7}$jE^TmQOwEN(?t#gJzjEgE%XR$M zf=PM4pp|UiWakT;Q;@?}a922{b31VIY9-0%l6q{~Y_5Oy!sXD#vswAa!#r`_nUv3K z3Z|p^frBZ#k-#f7f+UlPlBf@FXPQ5xD|@woRM!UWSCz%ZS4$=T6XOq^JoFUat(go+ zOys__5VrhJC=-&gk|9F3nn~oOG?Gwd5p>L;7WFhlZX|&gh?r0{AeZ?rB`bjZOK>)V z>67fOn-=^4qM~+C& zJLHg5lDlxvr&M{45E2B&SmSdfk>nslTK<@X$|w%g)SE@8W`1B4^Fg=CE zHm1Njo{5(A@c#Va`Q$Wsvfq$Rn(^g}`vo?)Zj7kY^XoMQi4?PbA`vQR5i9-eDP?4K zmpHw4m^AjDyTp%NJ#@cZ6iv@xrr+ovURZ*}R(Qmp$);BgF{vIDDCya8>Tc_gJqK40 zijJnQ4*mYndgr%?zA=>SI-2M@3Q?K{$5(Z)Le!||O?%JVo`df*8RbumDo53d*9!O- zkPSrDnVsWs$|!lH3Bm>gCh19GI-C{m;!6+&KwvOJe!i3(ipbZJ!@2@y%^@RrnGXVX zXXoYsq^cpDyB4& z*=HqRO*LwVbG33fBcCSscM#~SC@s!7k}ALd`&CZ{kghv6HJ6u zog^!$r}PKOxkAn>LH4%J(N*phLuHX%=8k| zY!35&ibPTZ43{#I#*`WM6ksKgPwW0?K_pd1*lX9w@PAEdNsvVnZ6whZ(o4;n=4G%& zR3zVnu}K*u4SmuJXR??nbr2{$rGkD5@1+rB5W?UL0r|t>!%8_9*_;}R5@%7(v)L{_0w|v23bN9!yF0)7chz>usFo>l1+){U|A@2!>}Ta z6aPhugB3_h2UIKyi} zY=OS-sy=^!#_#)734?sMfh25^3a6@Jq63{JgP62P6Ht_8!4%L70N^Ebbcfof zCPFYRN}O&YKSJYMB2OYVb}6&`O8LWr2&o)NUsOO%&>U^@zRr%5ihACHjKbUDCRO*1d!%%3B6>aGKvRR zr=-aqC;3#=rebFGK@v7sp*lq6VM;Ke_Y75&o=+*`xzKLS7~V%2F(R{xa!U(z%S|fF z;D6lVfiVgO7!st$N1&LNRwyFF4;-ng7iS<8PS#IJ?7MU~ z!19$;vMy&)VsuKQB=D+4bNd~s-_y5+iFbPUFeYQ$RWbpynfEe`q=uk+rV zS#SBa`y1}-I?-ldnO&WPUVFT#PP95!9$S50sPBXZxoD}Uq!v55WK)%_n7*@r{LT}b z`%gha+E)7Z{!_ObU{K&qV^4hlDd?$}Savvsqofw^>SJrq3r&M0qF!rTsa_otYW&wr z-_cbrpHEh~g(~-E>xoU>Nzv7obPWlvp`>eEaE&Kj69WG0o3>qEs8mAwx(rh8?)NxF 
z3G5?~k=8-MIC#5n!w6Am7^>aTa}L*P_1cio&>M$sRH|o#pleu#saF4%bL4h(!+BiL zH*7nhq1k|n)zz<-Exii#WYyC6cstCv4r@&92q6Ym-7#A8Ndf>YrcN3C%-s zy+dqhUM*Z}73@7ww%=}YlSv1)PFn$u#H7KQfSChFOVZIVIG|oupD_-A+BpaAaw=ze z+-Tpa(rYW;hGHl*;JXvf?rk!KKxPP5AKx&w?WmLk<=ZCPyBg(y{idGe^P5(lSl4t`r4K8q-9933}FnqiiM)K>m~8(wwd9uT|-He91|Q2OVX zII-b7DZuc6+5H|FQV?CON!LEXwGT-MeC)}(9-*#h{rpDV$nrRB5?CFpLvPi$tthsd zkBW_stwt!xyG54=+W8JE8q(|$J$^{Vx74oa?p1LO`^b<2a)SD|aDUD3R2ffbA`PnYQ5FLri`z7fd(w^~;YeQ6RyZ*Rij z3AXM#-Emv@hVAHz8YX^MBdh1u#^ctGO>@T%ZeFss?;MBLg6*Z~=c3|~iMZMIV~b0) zyVst)-6+8Ju6>_q_pY6Wra#IS?A_~+ZrF$Ks1x?_?Y5!Y2Zgr%ceFy=u|(U$Z#*5h zcz;~iy4B=a8+xm$Z&k77e^~UiZh3m+`ybl$JS_S;#Liym1++D;>Yy3mKBN#^I>gpt z(c3Tj`qxf~p6+|P0(ax82C5Iv;jPXgvG<5LI3)HQM!UM2R}cLGwaev8whRj`!*6uQ zTZT7U9$r<8F8^9&?cDl!ym?^LKCt8Cn)`SBm}bilZCe{&x%lOa%ZEj?^%ecg`sGov z(ZBYwSKF2+x2xT24T);6Xtr-xTa(o-P-9p-A^}3&dt9_Ny>jj4YjN9P-26y<_>m2` zwm}BY`vk{6vDuq!?iHGQMR(iRZLitH<~Ff=DA|2f=sqg8^(Na+32mofF4{SAmn(4A zzFY)Yb-GrHwyW!;qC|K9?J~hVv{EX%`;zVn!94+QQ_D&*nv-<(Z@T(dj*1pf($Xnd zI@kL*EyJRvdCi~nObVXKgy+;7ykI%?@-fUM_W&W4o@Dd5&^*4P6J5PY*HOWB^yQN6 zrq-`M@#+&R#ZW$}Z^8J(Y6EOMi}tSd(S*Ghp*0Qf7IS9TT6ePbK%(`4X!FHw199^J zT+!AOw~fZlqi~Vu`o{# z-(iLIT~66y6)ZkXhK~N*1~}*z)$@sonG*|Gg71 zb-i{hQQs@tnv=Hvgsp$8efZAR&Gys3U$Wia4ad^5Hk`2Zy>0i2c3;xoFWCEUTQ=>7 z(F>l|m182hqGe>WWn@*g?QHq# z>kY3y^y)F#qj2rVINAN{^$FXE=p9IU_Y2%XVbP|QM#sil}ZRa{5UYRR_EE7P zXZPKbLQ>1&>ODJF4v}pK1lxfvd&^n@fKYJtqikerX9mj&0d3fUq2wZ+ElFp;;Ozei z3T{Nf{ljq>{%HWBc-zsw{#3H(Sfc0HcGm&$C4{b{tH;rx)e~S|myz=+7cQ3dNj!9MVgw}0E&h2_NCcu%GFSyxU#gTA9@ z<>Xq&2Eeh#o~-c-HQx2&jhew-vmguk8E_X{NAENUt>ZwKTdkezy~(aIp=)e|1)x>+ z4HzQ9(h;}7mP~q-><$Tei!>gif_?OkBYE&q;ozg2_Q$rn`YvC+{K-H0SVwMd{9>F<$`xGqb zB%KEZ`hRlU-UDoBAB@|3#P0sp(lx+z$9pi` zUHK-1>DA+aDR~ZIBsO=A#5PSErbg%L2Yzg7rZv42=-1gHPnG=HzDncMwtt1J0y&3n zcO>G{S`)(`f`hfP%)e^C%FK+J?eFK<5onxzpZEs)FdsgtC75zg96tVr#CB08{_t47S zHcE7dw=?M-7Q7hvR%GiwMcI;eAF{Q$*PcrH4kdhKj*2E)!r}vn!2(x}KT+csUHwVd zS;2J{ii;ws4okCW@rxFpXlYDZyn@9G)a0&xxdeo}yXW@Qrt2tjE#hk2YvW1pXu>-x 
zl8G;W!s_2OUvpP2Z~sc^>c?O>N$frpH)G*Xfb0R<@n;t~bMxxQHq7p|DZ$)_C%#s? z))X5i4@~4VCp!-cod*G(f@usvpgzn^agYpk4avGTp{^}n*Y~!?3DolXN#K(LX86mW zsP?aSC%Xw6OQJ)+nD^&_txd6)RP+n&Cp z=U~Ef@U8}LFxth2M~{BQMk8ry6D(~KaZAIA6$oo;RdQfL7?{}TI41hLlm3H({~*y> z%^s{}^~3i#QtgC=Cy+O-LUn7ry6YWAi;9hHVv`T#C+aLynD+pUE*a~Cj3P3#thlB2 z_jIsW2! zeH~vv_S!Mg-kP+J2=oePqM;=*mez%R5+$ z{asrf2gP0@13Eo_@9BQcbH};n+3*}eNrV~-X`RwC2x#Q${i2wpoqcP#y4T@#QkHU?sY7pTYT0@+?)wR{Lu-V7k$e@7vBfwA3N zpou+0ge2U9@qH%+_es$4$WZeo9l*hYIYc*`+>XTS+BfRPm&Y;jSG!l6*VOS^-9Y^}ctH zS3skA%h?il4Phya8ymK)Ezr&uoBPCu&UnvZp@DWXw(?~i)(LELE4mXE-NL}Jt^VVP z14tNvp7^Gz2VmYbst~Ii#5ym-j8?JMDb_YYRl5mW<_6kJ^dueQf@2(p=hn|+l%0ow zc$LzI8dSY4jwRIEzOIf};g*yhKtPi#T`PUVR{MYmQ+S&_?H>g#Qpqu4!2XT6J&R zy5l`Z-zXD$U>NcE8%N{ykHBtRr4=nD`+hBgsYSB#NA}|EZqe-%n+It_Y;^!vRd)Y` zD};@r60X+!{yi%j9ksUkovit@iB0{nUp3N6J(-&N#f}z9KLCgs8a9jU?}vW`{?R8! zn_|_9@3xd(Zcu->|DX>q-*0wbZsz{d=y_&<`^!Q4`~y4gmf(KSsD93%{6VwgIeozo zT2$osoWzwb+@@bo?ob`vH>>}rVmyA*ph3EyG;5#fRsE!G-!mPmpL!|eryUxE{Ipj^ z{sHZCTGh{#`<_#)ezu>!{#>m=$j`O(_2+s8O8B{9tP9!wqf&vOe^hB>rK*3__s6uV ze^OI|f6{6Y@=v8&_;=~5(-Ffb@pt!FxqM3cFVI!T9Uu?T*e#A0$Fw)}ecX-0UM}SAw97IU%&IyU z(|ueAtLp5Tm5zvNCE0k$Sd$KSKHUw;D3v|}MxpeJYUG?_C6Ja>G3)Ely4?HA(E~2U z-6^KM-#M7RTGl*;6BuX`8|JI-{~j|U+?*%84DBe#8Fij-xYJLO&uY+%-O)0pp_#eF zlxb+vf!v;7)QEfg^5*eCopVh_X0-x6!m-;o?0d4sE2KqL5r-Te&Di95QiVRCTR7+j z3_9!C^9xH_{@i@?o)36^aT4Q}NA@hM#~!PWR?oZdPYFv*3fz!Tn2~)fkCx}tl#f+L zd~#~YHJPH7h)2hvdw$ToPeXmo94g(L2KTg}mGr|+6djTe65)n?dh(E2s)&}qSc-DG z<(Szg?q4hp)gWbWHca+G5;AW%Bq4S9(l5u$KGEZY**Dq8S4Q^9ex%C-oAMlBGRyjm zR9Q__Kfhm2nSJiLWg?0(JCOY<`}of@D+r_Vd-lm7?_l;__GPUs#cD%GfOp2_FTfT< zF|({nZc zvN;J^`~RFjHUD^ITFxtf3>k{DHM$IYPf^Q@r8zehW%P>EmgSg+_3j!0BUxaH#hSLn zq64n5D!FKV)QWpta4JVX*kLJw%%LVNw`+Zt?mMyO;vTf*x%~Cy9}tfIapBx?4y&N; z!_=B;`P2c_&K|Ypo9$HR$SkWa9E@+Cbv0_wr}G{yxn+{^WYm7EQXW~k^_Zt6fhwjs zUvdLF%(?xC-m^tbkgCpN;d>A02SfhwfSL7!AvagT_qn(`^dqD_C$}y8KXfLRgAKP&hPQA>cktbnx}NWMH6x-kDOeB0i3?1C->L08UwUDiPDtvb2C z0cB;s9Lhq6qm{7+=#3e3YLT(cA;CCg`w?ePEgE+9rXvSK&t-d4_60Jt2Cdz*3g_k% 
zt%FqsKcFLTFQd2Y5Bes-K4L%|2KY43ni|m22*w&e)T-V1|NGThb7N9X{g_7plK;&> zK@8SeV6UYfR#&VfvX-{j!u)wH5ZL6o8!ctCi|nAzInjT<32sRmK+_Q z=4eY4e@uekA>(6M>fz?2*_z2dz6E7`Tz<~8G&<{$lYEjqpXeO>JS~#7d z^&-EK30J}1ks31muE?h|VcPl6$o}jzlWTt3R*%|5vaw|!QL^|#0>yiNuq;p{1;MgT zUfl)kY(9oB%X<>y6<-1biMX8*mS$p3%pgyGn92u@xJ&IrLvG9y`t}22=h?0BuVS@t z{qXTImzGd}Yvgz2Hb$-UzbpH*&p@+0V_|Wm_E*Mk$%)+&ZOunb5^#HvN*8^gI zAzMS)huH2fXYQ(novM$)wvs0wj)}I)5Eb(VTBBa1^g@5c6ZN2t|2HUcANsLV&Nutq zHE&JXaM>s4dJ&RHuzHnursmI?FlWA98ETRj3iE1ns-m7K)jZ#tXABsjZ&)DJl-&$C z13c$GBk7R%ajDtZ0*TPt19M@Br-hiXnWQy>#z2}G6B`2IO!%9`b9oCnO!Bgxe6&Y| zoP3z#QY5G#{3r5bolXnDk!&mM)8`50J1Bu9BAzOgt(|5i&PjMwD!G_^I*P4eMIAd5soXGC8&qC%9DkG3^4f2a~p`d;493;{q3u>2S!s#Nn zv}kWI;J2ST9oN6j@Bwy5QjTo8Uy>^Pk5o14(v+fjmp%pYCc5xpO(*#TKBB2E>g;Q9ay1=z4+5@^{c7pF-JOs} z!$A#T43lw3rr{9xJ|c{ty?tiw%zD+=o`CoLFp>SAFqA2FwTl8-yAH#XX&0IkQ)D@z zpmqpo_x$A9nQnKTtl`18ph;-&A0fU3yXm$+_usAUrPOvi%1di^C>_Obw~wx#yD_$0 zAl>1T)Hew2qv=ktH7NKBREF=Oniff*%M_Cyp*l0|BuU0TEt8y)mVXi7Q{@+*nTA2% zP#8BRrf&#iMwTUMZiOqQa9>q=?N>{lkOYt+L>eBnrxZ`|tyJC#zK_c8hm$It3111# z&Ml=%$LQ)f5?N)_fG2#+f0>*=A?G|fdUBYWfR{30xFaLq&qExI<3}mV1#-a9=J-hp zW10))_lvqq5O1S+Y=Tles5O0Os2qzW(0gmK3k zpVeXFA3+Sa|HDev^V5i)(nQdV;Q!Z?*K1Dtn>oh_q@n)cNT*tt>@riD~lT{g5D9QCN;yTJq za7qV-6nVH(bx_MdD_}i4t(}oGu>7->VSyxxp3*byC9gz5`eY(;=(HvfQ;Z<{=#pj& zROckko0Lx0tpY%$sxsApAa{6{pOZ9ia(aX50x%izFi~{Bo+z~>Ed8fQceVZ3MaOEe z0K;uq4H*8(BP#9jU)S!a3$?vWidIo#hY+pBc2m2fg8%12z3=Dm&5~3m;{o!`Bc##wCc5nan!e1e_yiJnEZo%MQYuPY# zURR4%wq#Yyn^i5>N#+w*vI%C_^$AF9yt4o0{nw9+^$pigZnMj15|BAQBFV4Co#SFP zuEA=H8;`|1j%~oLZeusjkw~wIQ``oh=SHk%MyCLkAO>4jL7rTivIcv|es zMXW-$4TEjVHE`R#0m*G+)3SD@Z*^28G1K-$HIB}m%M;QyZbAiaWW!l3t`F-J8avkq zHX8RqHnpie*)%3J;byQ}(T+3ZR-C0akK>3}-CVVN5@)J5U((twSi6^xY+2oC6rxZ> zM@P~zEI7#gwyo;sk$a^$LC<`pv*)ViBM^IlSXHh474OS9U3Q4|B)(kV8n^V`9!U;8 znizT%B3?C2rnwPfE{Qq_VRKezvc4lx-?3fq*d{U3PF&5k-7}o*Ig#i&@ufl%IiUN% zAXXed^Vq#Q13TolbtRrem9?$;{Y%YxZ`-H_vURCQLo=)Hbbk-yV45{ATT=pDTp;7rTovVRUSn%q!KB#Nvj@x7l$)yZkNt@OQ7i4l4Z_BnRBf==^hr`!*_}k?qiT*iW?yopEvuzX=q)Z5KZo+sa-I&zin>W 
zf@p}#n{zPikPaLG>_LVUBSZG)YhY?#i_?A+SLs}DZkn5ycNm97M2vmm0)gmQZm-U+lV(?5ts{qs`E=Ssv%3nB}M zd^l0rxqd#`J0bK=#CuPPq+3A!&pghA$%!g8wXFx@&cTH9^q+_C41VVt$@%U-4Ka(i zE9!G6*SmVIrQ@y+(%g{QYw1{3-qUjRO`^LMM)ZHGQPo=^oYvCuK7?UlN?)s^`L|=| z93Ghd2N>J_plkKnuM9(0$N_;6--<3L>sB^1F;||spmZvg0^YOxC9e=K`?a!ft0u~_3Qe?D$7d?Hx~$& zYZ`uB;m2&ZwE{K~y`m*i(E?blLjSoFmG14zIwpPoLqLVCEw^x2c2KDjdVmcC_Yv4E>~?ObBF>J5KpSRC31@sb}KEl>klwX8jj z+y4Yp2QGnJEyp}hSF>uv)V@BRF!gUcaNB|4=vto;93u(G_?<@RiD3DN*LrS^ZyGuv zh5yxoR|f>PMcD8@vf+3H5&o{WVYz_Bv3kEf@{N&X*M!hDk?4Bpjnjhd;blF97MBYM z8NF$229l{G0Z(I%v~%{WvElbC`JJ~rYbWUzqj&J*WlK#U9|KVkA(&$MTJ#UO}+gwT8 zkYF3SSE(}5;hOTinl581-6!jMCIIDeGCsx$FG;(t!S>aBgq(;|Q{jd5Z zYtp_!71(odO)QX9d}0SKd+uN&SE(>~%8_JBk26=e_ihw3in zYl@D7KT>rmzozctzM<$+epAuIeM^A|f9yP1sQPY=65j9D6_MXjb5N=J?y!RV`!opu zo>Bw<_X>~b@bbfo`iTzZ58Em24?DESdNe;A^dD>2{K%z%|3~gZg#V~rL;jxe2E6=r zmGPk#<6k$A+wiifF`cb7Zq^*9x@`|u!ToXBXe}Op>mBXGL!!BvVs%Yg5&eF<6SL`) zVKkECO7TZrgu6I~q08XlNTckxM{ zab|8HPj*=n_*KLvqwOE}!f*U`6^wvGc!SbK2wthH)~(^Ntf6@+h%Wo2|_=H>BMj=y{ydjoUlR#ojv&nv?( z53d#8)~*b1)E(TY8vALb4tKelQTTgQjUCRQEq?zl2QM?3@#WF>gJlI0r-Cs!Y0!4v zXP223BY@wRIPlyHqM8?#Q4Q{O;pWley+6QZX0~J{$iRIE=Yq`%x=~IUEh3H_?g1!% zu`F6VpP38U+@hKL-ClS&TF7HIM73Y!zO0PtV5w4-cgAE=K+3Kluaq4m5~N>Lmopo5 zd1r$vpuUxvJDJ%~AkQW_1bp{U6gT6s_UIu!WR%OuJ|SWEtLnqeS7hVMKAaV4G4BiI zSs$s)e#yr@4;0F^i=!Kn z_5eiD4_CJqU#%Zw_6pJ$f%?LcD+|GlNg*)hrMv2(ygtZ(lV&TS3;tVhz^xiJrH0l_{^k%4tv_blgxhMGx&yoo(d|%&8wGop%&JHRB<|-(4e0A(=~d~ zePxW3nD6G=yaq(h5o$Vx3gG^+a&|8`Xe}ZR8{-BaXyX#wHtx|bYZuDeK|^aBL6KXU zRv&+>uI!JgL(?IJnpzQypwV7nYSyvpz79&R+`mfqz<+eUlR*diEPw*J4$t}V8S>I2V%yXB<1ul|aay zWCL4vVY@8sb~md^E0s#Elt-k#(52fxs`~x^_>u&tr6cRif39<8{Qu|j|L1%M17cs`iLWWu_sfxcBlpb@ ztf`UJz}u_7v)KGlUGu_3=XIA0-;w1T>Y>3qHtZK(YaCD;2ktm_7lLgueX6gIchm&e z8=BV}`m#_6)M5DV52oH;3-+tQ{uT48d=i5)xij6^Lz}w%|MWMfCf7nkYG`Ps!1%~RL*L$d5)u>mqXgK=(eGH32>8{a7t`lj?2|h7& z*LkvNBK0AJ0%E`OkFit(HU)?nrQUNP4jhHJ1A8LiMI4mL7>wW#-Frn1^CV!#2&wLD 
z`7Z6lWE@~E3vwP>gHAVO)DyDDW)+t9gAoq+yX~8$$qKi%@U;I;nJx*PTjei2h16#B8^Ew#(%FG;CP8UGH4#`E>A)S{CCxF6B%Jw3& zjgjo&$1)X}1q9PLF}r;I3eN2&Y0wKG`!88hcS-|E0^{5is)pE{n6W$)-V+2cclc=W zndkL!yg75=62wP`LSz1BPA4Np(ZB*THxIZ?_!!Jr3F0G&imtQ&2*Hb^`}0135=K%g zeiGeOQAA?;Es4PIP%1{)HQ2%f?sj_C9J1<=v7yS*xh2fbvpPx$OMr>IED$Un5_w4? zd|PHFp-c0vxm#6t>z0)RaNc+>U>OHacWl`?)Ui8s&E2cId$*h%S6(V^%{{8RM;}C? zY0p;MX~b>MN!_lVf+l<{YY$k~FswF0iV(Q>QuKq2O7<3d+>VK4>AnG*M)r{rEX4f6 zkRM#)7<%K?!T`BgkDI1Ek7EgAaSB;dX1^#Lyo;03OP97|!3t_c(~CV9rV6lh;yiWZ z)xwsQVv-#PCKtA`6zlgBABLaBL2|J+Wl0*u@r6NY;f;P^Mjc9QLiv;y*p>*tX^tKk zK`EHPl;Pky9HMyH1j(1s4G)FG=dS6Z$=Q>60>m zn-V9ArWHyRoJ{4L(-T;XF~*I{dkiG1e1*LE>2e~UF=I|FB*Fy#WRAw>nXmvyDoGUA zNBIMizC3$YevaV9&Juz?75O9v8|1Y)LV4ns%|TiHk;M8NWJAwp5p5k%;OScjNkfNo z+HqPtTiUw+;^|IXy3?-iCzZkVL$KELsn}gC2iF?|>%Qh+Onp9ezc=k0fI{74yJg1a zIBUgcuG_99Gq%VzyLK+{Xog7|uK%B1F$azRS@Li(n$f4#%q4q_$l?RCW zk{nu{K(h>h!HrMFz=%MDPygnn`uqmPCKN)I_~v!@n9=_jX|oE37& zC<~ah_;^yGok6UxQE2Lvah(B7s1hIlfc|?dpy0Wbz7&E+M9MKrPW=P(PR1G5&E2%p zA{++y&mum7XmL{SN=7L?Iqy)!*LcN${g}o?hmWmy!0y5>7 z^tGCY$tuy@rd2apBkFxxEt6iDO0|Pb*5SZlt)9sSQFCyq_wL8qA@=%2ONZ9Tq)esw znZ#~rzZPIJ2rL1O>#l`_t^7p`hhi+&((%mvu}SLRtbnec2)t6wp{3;AiyDVRlx*H5 z)6&?IRkb#yF}bl0uS}`dvso*rzENh9I@5>0r3u_0Bw&w}Iw5yumCuYesd_1(3EWmr zZP6`jLbyiZP$LmlE48N*n!s(Phi?!19(o=|A0AXseXvErxg4OhQM}hP=S73mw^>`K zNkpKkwScj@dsQ>D*CMLDnw3c#;5jrqlRzTSKR##1tkCAVb#p1Ix|&jNU=pfY+p;cB zc+&KJyX5B&K?F*#Z(Vk(b^V%|{d)0CVyCWGvoq-swe{)xF16-}=H!5KQ6+0GCf%a` zm{!4L<<8HT^>El??OT%M_ z<{Xkv8a6#msUEj0~!dOts_aiR1l$ypc2%p1xjTl%&4vjTTbMg<^!si<1pp09s}Uo%s^z^Ua~uKUq| z^cp_vwuHOhP?DUtT~vNX&BFz92H|8}<1<)>HJ-%E!$eo)(zxEvaVsTJ>+n}fRF+(r zC3snydnhS#w&7$XB{mVeq5PX@^JPG$MrPyOSZ2y+W-)_|hjaasLKBjVQ}4>o0bw;V z5zAD~PfkvC#kd#k!W1?+gQ{?woT<-6C6$F)z>p>rfFwtUIfof|LT{nD-K0r`LSv|m z@1+cRa7R54_u{0a*I{+LbC#7N_( z_a3CbQ!ob0!QJ|Z;$>(uQGRlo$l-1!NECMkA);E~WJ3Er8-t;F!fn*Qq0&k@+FmIK zgXt!?@5==DE1nl1f`%3GOU24i3vf*6JN1^|m=~OGE&2PoE6c ziog`oOQrWwyr1*w>l|m4sh9$^QDV0)o;}7F!3f+<4^ZN!|Cu^I}_FysuFimJxlZ 
zGpamCI{ZZ9J{+vOG8nQJVQ@VbcKn`-p$##xA-w+(&c~wev8db-^&6t+8*zBsC>e|} zju?#F4htLs=NUOKDuvWmcw&Z*v=+1BSAF>GQc>n+a literal 0 HcmV?d00001 diff --git a/entrypoints/openai/__pycache__/serving_classification.cpython-312.pyc b/entrypoints/openai/__pycache__/serving_classification.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3bca706346a7b18ae754d8d0fa329277cc732398 GIT binary patch literal 9202 zcma)CX>c6Jb?(`j+55n;I2Xs_Ai%{*f&^(&v_%L!#Y+(>IbJK?TJ8+M1^4pIfCQk0 z!m?-wjOCcD*dbkVDs&Z7p^8XFNu?6ypXkUXsZy!J22gft0wzqEsY?E>1gJzS`H}Z} zrgs4lj?=_;zkdDte%YAxlJS@aVe7Kq8x{` zEoDpdQJ&I#N{9-;<5ObP4yllGq@7VGEfZ6&v^(miv_0iXd!t@TJ5s*1KkBEnGZjcz zMXS=)(du+fv?g5}txX4`!E{};E?pn3Pd7vxsGKX+m~M(TrJJM8=}f-P zTqjbM-28yIxDm>&25yZUDRcW~`AF@5p-6-?+$Szvcodv@2vi$97 zS=E5_9?eW9GxB&UDQ7U>K7J{#U69kcR9utCvl$K8$FfQ~uAR$i$!rEhd{*t#*~v*+ zi8zfqtTUd4Ix>n_#YWS3Dz2)@iDV*+**2XkT zX^fUl*;v9bvIvs>)Wyw7Ez@q0l7s-<;klq@mV3J$I1B>w{+$FEW`T8UNHvc_QBzZ1YZXwT2 z(K~5#-FchlW1PHA!*SBU#-V(ax6Si;eyWOm>E!&MlcnMc9%;YQSqVs_@vaLJ!8tme{1O7SlPT0k2R%e#m zv|rtf&-zmS2KLL9cTM%Ml1e)7&buYo6{2*&Dn=@NMoDh1kI_>DjH;xI9@fiwvSAc% zZHURRas_%)os+148%7Sj?q-seboNTmuF_Q-8%aI|LeG)+~nT+WIuW9OZe^TBHCBjXtaZ6 z3hzD|roP6gN;<)J6Xn3XPdm(bQ{#-Pq>`V_jl(mv&8W3KOgislvMf#5z5zGOyzdR) zc`LojT_rR8H^^1)&-trdBrunEhpq6El1<2}8cu97cj06POon#CWiz{+Cjdp5K&B+P0$5vZIWzn+=9>!x}8SA#c zn1h?{>Dc9~ab;4al@z=%lvaogAwI2TBjN^5U0jl4lX6B@C4OS$4QHB|2uOuZ|LYu2n zqyrX5k|Q>w+UzZsRjhth!wwyTeV*$&nYj{AC8e!vY)w9_&F18}ZVDh^t>`M6n*zof zdxS|?YZ&QN&_XD9Obwq*!Bi|k(Ujqvl))@o+oc$;3hA&qGTccunSo(v60(BU#c;9- z8V;x(*EGdwh|#eE9@}g!&2{d=T2(MZ8ODWrLxmI<4hk)u(u)FqYh>owRmFo$k*btY zmr!JETt>VEhS>y63S6MX@Vqc~B=+*r-+JZf`3r`SN~(a5U_;SJT)!TMKxR(V1-T6K9$T&#YYSuL!B1e#7>4wo=M0i^z<8o zDyJrtb~-YqsM(8ChL@_R#ZmB*q`GPF03{4Arr;%J)FjFau^&0$;Kn=Un^r&YoiD-a_ZzV&@Bm&KH(Dj}%*uER4T(=5cfDpY;Ev zf8o@VK+~Q6mEOZEfib*r12nZoXLZl2CiG zdAQI#yc%p=X&QRhyBz$g9&EUM=GGZK*z&lsyVy8VXdL;Ba7~A~)llc%z4zPS>s}5$ z|77>*YUBRZ#`e3yQqPh1Czcz}tcmYFh4`T8W%ksXnRi*tJyEskaUnTSp76 
zqc`1pZR_3ULT$eeV-AfLLZf=)q1Eo8V)wy9_ra1#TK26GzNNlI_`3RaCkeLQnOf{# zsU7}1w;t)gIldC?ftru8@?aq}ct7zM>37o0p+m5s?OpfA7rPhLcOpMK_il8#{V=R| z|DIz1SfPJR?-`*T?k$A&E{z^v4xNCm243KxuK}16-vcX(;}634OCHkLQf$~=Xn?&r zuo~_whW8c1`=CPO2*@|qz}Tv5O0RRI^(gnN(k@t!r=`OjwDGI;Mp7O6>B)OnioMSj zdY@Yg9QusdT>DnNHAQbr!P~ObI={=6L#UR#`=N$c`FCppSO+eg3Jeg?u1MDAP_*h4GTVKAG1N` z7af@SMQ0Pz&r$l&SQ}(MZ01gQgbzb5q`R@;!yRFuKN6At$j%|{qOpg@zP6Ly{73ym zC)@dtn>e69ZWclE<8~40ZXW1O&j`E~@m^cI28ZT1KOrW`yba7YzX3v(p97j5+Mxk~ z)2{(x>@gx}c;V8J7Z@lwLVo8VA+LeVyf|;K0K%#ABZAd|_U+p}AXrQDj=W>4rXtO# zd8bx>L}(3+yX|Aa$=YJ+yh|%T6aes)p9QQHLZ-?9zQU7iNy6a!R7Zu+s0u`!cWY?6 zXvm9sciyS(DCf(gS>zvBD?abZ+osT#Z3%e~8!-Sg`_#@Y#gGLU%Wr7g%jDa6D{2oHcnHv|^g-awAuh+9iZ17vmR`sic;a)i8p< zRQw`1DJp1B!RZ;!q%=ZHA_5`-NuDu8nm0sjLNz>@>9lE~WPo9Y&oYW8E;GDZR;!>i z(Xb^Y!(k!`$q*^tGz4l_#c2e0k>(0eQVzfX6nwK90TTfN>WE#8Yl%x>@1a}7i?V_m zYq%x=Zb2D%bR5>QC>OAR2ZL(ZGC9Kz_BS1$QETD3!5S2lh)M8*s75ue z+{T7gqhZTD0j0@_%d!O1kY-?#6bHj9Ku0Dfm_;OJ&NKkvPD&;~17dWal=C<`0sB&t z*!OD?ZG%yivnUrRgE3QE=DN0V;tXvOn#UIV!Nc+ltkC-g7DTqGFj+R}bkeEY?9ml*Uf94YyUujYrR zZ=AjpT=BN)?K_L@2MX;6mb@*xxAk7_(vH#Pj(r6${I!;LkoGZdeJ81I)|>nFroMHN zuXZnp>uyrjqX%k!_{NPl0Id{*-GyMcUQ>5_=+=;4U9Z)P+-tCAL>Y!? 
zlg+{NPHj#Uh>bP}ZdL1^fr4^?yQ6Gl%1pE`WqVWME|%RYaGTi9!t>1y2O$RfJ*izX z>cP{N0pF{{#6 zgg!gE0(3&}5ZbFbL$rDdM(6=j@qB%k=(Y8Cg0~JX3$1$R*s?IDckEaec9evdIC0-& zUsFj$qGTuThC9K6t98vm^O9{)bkkx;u{3;lu;3bAb71ycoG7}N2F6MR()UyAn3$u$ zz7y~(2j&7=`w_5yZ-duY;-Wmd4JGjJO5#ri@YjM*R~99wY?o~4%mFX1!=k{yD|uv> z)1>Fl8b(X(;vt}4hcAK45x?%9gK<~=_0`rL$fOnqt^n`bC;(mXoH|tru~h@R79=}D_k=b^#-u!+O}QyA=dZbf{co1_9-!1Ge-BfZg*Vs}H_PLcRYs)BAs(fKO4N{yF}v!3~1) z;u+&Vli8;1&wLrV?A_rLvau(K1*4J-+>+bk+-OgfQ=aO^hQefwoD!xwM9BUct;U2bgW8K2Tun07r z-6m^UKD$q#!j`iO=hHm9Y{TH=3!Zlx;NuI{rGeph_~cpA?q8Z#+`nMA6+Aa^ zEl`!Hk6GK5GrtYSz<&!L0Tp~^P*Y~u;h~V6m{la?phH_hUpzI2;C!WuM8T+kOABXE zifR%r8fubtk^CWK=LWXdo$W{SuTVqHK!n$dlQg&gq@mEX|2=-W>Cl3I)l+vfb1%K( z*{j!fEO|Rh0uO#DZ*$SxQSf%$?Yl1(hn_DCJ-^~T3_hHywk2ol({%@_Y6jmA2yX9q z;GyVC72VADTE=YTq0Kcm%s=!rjrH*lyE&x$M2I6n<#muz@)#kARsH}Y)C?tvkrSim zFv8=c{4Pd#k(!7A7sf;hxWmdVjBsBSeB&#)PT>AD@8pD>O2uLk&RkL4@WbY{{2dg< z=&#|YegM(a>qKwtToUR^Rb)p*KXFzc*sX_md{!G2k8_V3x=JDv01&FXOAbo9NmI{# zq0l%`^3c4OG=}b6xHVVu(X5~J9^*;@N>`DdzEU+MYe;iji8cy8BDeqZI?5dgh>^$D zJtYxR=%c>1F;K=7L%y#<3w#UBSk_f2@WLgWZo-Z7b{ggmtngE13drmQ}_Ws3(vH5(irTi@i zohmCm1}~r>;z_EI$~B}AUsFvroQF(Y{WaxJfrl!r<{&EZ9LIg`BwWiQ((p^-`xjF4 zh=l%w3_c>qACcW(*j$|L3j)y^MqdO4&h{yR=yNyWj&qO5$RpDAZ)E%t8T@xw@aAtX iyV|bV*X^flocqq;I>B`PHDN#3w$w8GIf0ap=zjsGo#~F5Slu$a*^N5wY@JQ|n8_rAOUlOeU3tQ8-JSM7IAX@JJGUWd>hoz{$Ly!EZ%fy2WO;K-9 z2`WLSs43}9Pto94q!el8lrpWFQqdT$1iyMpLuge>o35LxBQ%}TP3a(BozkZbQw9>I zNg30oDHEZ!srs~e%1r3GlqGGQvZfoR8q&5YTiQNlPd83Arkkdk(vB%d+BxM+yQW;} z=BZ}FuS>bpo+(e-JLM&Qeae^iPx%RLNCnctsUV?^sg`u>RBJjk6(W99sx2L!3a2Ae zk#zf1I|-{#MbjNq9fUTgI@4WKU4*uzy3;*VJ%qNVV(H$g-gMtoAMrP&`qKkb1BAAv z2Gc`RLxi@chSMWcBZO{D?MshNjnb5YVmg?yx1m4rncAIp%2bXzH-Gsu!$$RD!?F4FLW;>H=Q9xYI6HqSD_W~zW%iwHPcqqs z`An7(ojcj2uwz%|lQWFydpPDoj%B9P<@g;brSM9grsFI-&sMy4s8D(#cX9qIlX)sT zeVI9(nVlCMMA_Yb*9G_n9TU;3k>^WGILpMoJmcm7vc*f!+2`` zGEkSY&?UBAeCOvAObUaIPcoSV!ycQ@%qB16NL*%e@$~eycn&L*746w8^NXoOJeiqE zEhd=wB33V|6phEzmzYFWs<>z*RhNC1WH!!_=yE-v978gb%wc_SJYei#6tHGM%47A| 
zVs2qE2UV~fg)Y3nOtUjr9-n5X(^;|p0<`k4D~2Oq!pbx- zHpb30O6f>g1LOl?2|K(S;oZbE!K!m4c9k$|PdGoXnsO#wkYgiBhxOT+wwlg3Ye_T% zw@XJs%|i({)6CdO8V^uzDBmNMFSUilc*`aG;O&PVkou&a@y^yK0?-~`r4NF@A;ko-^e~y}ODRTP2GU%P zCtv|3Q(3XO96LMD&M>=YwrGaA5od69#nS;_7ObD}PXE>N@<3swbmr zwhI!n-RSh7Lq-P2LsU&K=H^8;8NwTbPh~-(1QN{b^kOO(n5ldq0@<>(2@t7}4a|Zt z4YXy);4f+r)!D@bh864LFmphp#YH2@FFr#=sAzpixR6;PI#94ClC!gMIeSUyh>mg% z_Jn82JQA&!rH>+fxa1NvPy$F%QI&+R1bo|6Std1Gv(dY!qo~QwFT#u$b#a)@)7fl1 zo<+qdAU`)heIk}iFT@s-3*EVCkf*usYl|bJ`>u{Ku~hO>Y~gzD%6umFVk(u6!FQIu zzA&H6z`<6}!f$&;*BhY*6 zGwZWUC+-^?zI680vx4P>U}+L8_M)YWw{-nXt1{J>w3M;^0aTX~wPisd8rbQV;<860 z38#t#KvrBepM$>#mrU?6X*)~N@IirDT~?HkMSeJ1R-Zlx3hB$Rs{cUohVr-7YAUbf zl&^r4xPU1YGJo}5OIEN=kKnDW@M`+sq&a%7uIiVmSFDgn4SA7J$Tfu;(KRK57sBi1 ze5&t9=BV72V-5N7%D*&6RW-*1$i+EZwH7i3l9!&d%f9OS?Q$P~h5DKzugW>(m>hoT z$TR1Zsp?ynqH?);bmuMNT#Z9R+*slb(t1OQELW8s+=+Spo`+@#F>$uESeRF1+|9M!ac}w1!Z-D-_an@Iixsx&qb5k;1eRGX+ zeL#;=eT7Qsxq4X1$~;QM&sO*wIT)bxB@ttyZXLHv-@Cn&~v|D4U?$^y$G9_tc%3oltZiU+Nn?@hFk$Ig7_M1 z(MaWeoR7sG|6DaMnSxcW=iD&zYR4Y_{VKQ|;()pw%u zC4g4d)F5&rGB;9B|5iH=Kpspf9&76d>k6`%pH=c>igfP zPhO&~DW9cYqRAYA0fqTu-7`}{dBeZW`0pr-e*O93onLOhJZ~nb&qCjnXLOJY-c+2S z;AF@F z>6c{w+-ow0rEz{)lIKwu`+^)-eRD0as-+bT>-lZ?I^dIU%{&FY`$fq|QFF^ORei%e zus>?WB`UfIuCEKbSM`y!7ilr0`dZaaYHy@)mcCxO^B6C08om_ut#8? 
zibfI}2mj^V6&62lBHazZwnoB}L}O~6otB{j$U`G#%1!_$dlsE@;6#<=C7N>VG{BTs zm}$W2vtr!>o1ABpx$994iDyyaXCDJcG?bf?W3J^y8v#9&H^^j*UwsMioUGV@_5*@D zrBX~P&Q52pLa!Jyi|I;#=*sB;V6w=42go?FJg~UYxM7pbEP+Ha3!Rl=W^zn|Jqj@u z)fQ-ib=p8Ne9!@&-=`(CBSF7Pffla#n(~iTe?Y%MgW;$`U6)p^YVuPJ)^aTHsY0*+ zv|gg0n|N&U_-FO{Z&GM6V9U0d_*@oDG>D(vWbs=lHqK68WdOKEBTT%EoXxX0PCW{K zk{0Vu#Ogq5er7tAJ=9Y(r-M*d7I*4D{|7{>1gCE$zcRmevC!1NY3Uc5+V5G~Kgr^j z>T?zw9X%@=<1<&7nXA$aAsC&gCFnAf5cLTdQVDAYXpXE%Fy9dIQAVKUc8V$nmJ53V zn`kZ%GJt?Uff>sZzcig?D}~3uyOX2T4r!YH~pW($TU)AWqSQeAQ`TBI!T?!F+^S;*=;EYoE?Qn~3p;%ofoe zzqFVH15%l*jFRe~o=znVR2I~nirv?FX7zE67!VUn490}mk?c8W)Qp^<;{xDy|^$i=b+;=Ebil3 zRGmcwiaF?Wlr3TdsqOB?uPsXf7WZUqGdhSXiTWf6Cm3M(7_k8iFzl?Q8OH_%SUGD+ zTGW--2P0~x7XZOcu&tP1H_1OStV$KB8!zhO_J|0r<(Up3$xrWNwBfzm^2aRqOMY+ zC|9b?$38)dgAN0u1F+z5zdHE!(c7b6*}v(CEEzv^_urkn=RRGibG~nA z68grM&K8Yc-srt&>?x^Sx?#cXFS-YK_rUteP4|JNb6aLx(H!8-fuebE!#pT>0=INq zU=nE=`q zXnEqEz4t?Jry#di@U#lSZlSBU*ma8UI<>7+`x=(@B@1PD7i}FIwhkfOT?`-N!^Z?? z%j!fibc7Ebf#HfY-Fgga66q<1kMQ9m5Ggo4MQ4C>m2{Lf zxJ{|84Pd=;wiX;6&?AkmEq`m#Kg|1w*IzDDt-kFBnEQY9{CCdr zZAaee-V8jxY`SmruFT(y9lmEfA_PLkKrbKY75rU7Yq#L<5!(8olwjWjN*!#vb-Gj! 
zqjQ`t9=XULxd^otoNef~^$FntA=q9F_VK~Kk_nief`QhK(@h^!+Wk$uePnsE5E_AM z;>zpi1O3Ioem<~YaE3`Pd$SNiF`?_Q(0QQ2Jog^68S3vbu|wAa4n-M&Gk^ zeKfD2+74}BQ&671^^Su3Ak6ixM%Nq9-FmLr*vU6`uASR#9EGLPF-!N=C|L z|I*~ElRt$;)g@S+Z}@NdS0@VA_6_T)wUd8x`kSZkI{w1{r+ygopDKZ63&z{t$=f>% zmQLtDo%y4B%Gf~G=m2jU5G)SC;uBz0EX{(&gRUncG{fBKgNoY+q2joJ^{{}OseK35 zXYanu51&~wmlQ$W@cV(@k15&|y*0Z$y|AW-e9TVSq=pFNB`0;|Vbqj^uIPVx2EDn{JH8(%-?{jXj_*73Gnlr~kLoCUbko*RFm`O29ZTj< zvbbOQT;o)XUZJ0`scs)T1mq8#Z5)*LO<21`Qg6tHt_t&OFz}4`H^p22c92Cjyiy^a9Ht7fZ}P@(;)?~rk@UK zc#RTiogTw&H1P0EG~~;>i0jck)vn~*dY*JCH*7S}8+JADY`Ba_xAk0bDK|;@CJEmp z;hSwe7hTGGB>Wx;zvpTK{(G^Wr&^WoLcQ9R@A@#GcU#p+w=0qEq@kkk_7JyE_q0#> z?wIRom-0Ou4fJ~?pZ8p9qPIw7^Yye2VXZw}!s}w-Luc<@2 z-t}y^^8E;nbUPOOezzLw{)0fX2qv*#M~CdQzl>B>YeH3!Y>sC1K*uP_!X$10#Tv7y&49ykMshz@Bg98fzT_0JB-++|K?f z7m{~y7PcPZ@QA~n2kNbadNXwrWV9NN{z{lFA3Cw44(R zQ%ztHa^(G)5zYzq4?LVBskiqs^v5@7;AnZ6;~R9|2k{OtS1D-fGDY}85A#*y^L{AT z14pu;RoJtc6Sd2R`qeBE>fAh6qXfuT?k5}ca;^T+?k<`0X6ra#R>K*~{Y$_850J7( z-mmQA{5Q2{;AEx@6cgimKEbKIr!UMzkC#=G7jmUhqZBL z)|xvk2XkgH+Lk3B7mWuh;SFrp-qC?o8@*$Z4~phHxQ^QGPN?o!W$B;qgqC;iZaEj_ zbn&2EQf3rPOB1pW%t+n&E~xuSIi&i|A-X{xu1l`7{LXhnIo*|Vl-R>y%BAvMzuuLp zlin}UvMDdoAj83B5ZE^PkJt$;e(O$Llz>F)Q^!OyLBM|rLkh?cXjCIgs5LqJYs}3r zRLljU1F`K2=2VSf3HD>dBb-S9N)R#!Y-=_Ps8vJxVDXL<#Zj%OP6FO}4G>iX*X!h> zMHApC$?O$CTBZS&5)D}_UGlKEu?gP-2QYp`G9hZqFcgc+3Vh@iLH3)j<+A@B z98oQu<5#eMhCZX5TLutO_E`vuwz02bOuYpD#pek$n`LobN(Y=t<7SY@ie0>`P%>Zy z2V{R26W~b_%HSN06Av$F*AoRIpW)aIKT_EbqStb+L z*f$}iG_P^!3HHgZU~U>|blEr1)4a11IHU;oO2dp+j}+LT$TBjQX`HFj+J#)IV~eBD zR>oFT&sFNHpLF%TG(eGSsx7(eSv|}9V(Xo}5AYSY(BxkJ%*xArW5?Qac)(dN zczr9!Ry%o5ccIP&pdq@=z9sXPD^PUBcvozx{(k=n!PvY4*x`QOvma2h0X<(ATQU_K zu~J=Jp$oqs9sL*%D49lX%`eZbwhFD`V(SRsI#K{&rKPRdGQ=YU_P4ecp}n)%evofJ zSg^MV;m+m9wtT^_&)uF|HV8If!4}57{}>-S^egQB0{~^&Lx>d)juiHt+8jI$d9Kd! 
z_CA0U{4HO<`n9XeC%_u_b>nU0>W%fYeB?0iJiI&(u@m$ux?u0wa<>%S-MqWI;2tg5 zp(_ED@V&u(nOi%v{=&D;3;tNazrWzz53Yc$0~_uEp`&Z%tl;k`_=gM5VQ@(dX0mSG zaQAOT2UpGrJ%?5v6FSFkp93tkV`%-w@4o!)m-*W_6zM@EBe*GKQuuZ;jXuy zd-J&!z0lEH>=@%a##YQir2DPkc=I<_j0mfCtOfrtzwRjZo!;m>EsUNnc!mKPT{*M* z!tL`P0WQ<|erP}ljohU-LSvAfw{6SeD>@>)BT{e-2>#B3f281ivM~DOCb<3)p*2!$ zJ;t{l6T(A6bVwN7UmTp^2PcGZOz1uOP9NWUT!;-9V^8q0Cjd$eO(2pOYQ9|u=wYaR zrEbd}UVUNH9=&s6z3IE|Z@V|U5Aybd5FpqiYxJhQ1AwG(vHvLFe{{3w7;irYK|-jb z7~02&_AQ?mTG}N{d-?2#ooV>Y~H5j;~%wC&R(H?^lk^=etg9Q z2)>l0c=1#fR*;0*7@k^=PwqX!BC5ZT#9Q2 z(57Vwg8Ig3DED1s_s>5>JW{C(6AWyGw~hR{^6vPbX*O+#mnH;z^UB~#cvV$sXe%1q z3dXj7DrtfDQ`XPSk ze(G@x@Zbj^E#H9cYrYQ73F_ca4@#y1{)`+=&T;>5*ie`_6ZlPYa_SxLd_7k$ZOwD# zEd^{UOto%PR8(9u#Mb~?0PHZ;0#8USz6iCj!kJS3#IowMx861)HjG(dR>e(iZ~DxpVh za2DEJF40<(i@+C9f|-<1BOfRM>H($O_Y?qL0$JEIq1ttWn$;{5&)e6^4bHXKY{@Hw zoQ(hi%3Xkf9#p!mWs%*OnhYmRY9?n%(KpRd6=QiPwY_&Bq6aZi(RSTYB zqnweAaK>7Y2fWU_Fuv3!;#>Xy{~ModrgJdf1>M5nTc zW517HQVyvFsm~8E3?vW5!thFaz^XDYZO?uSBgh^tiWF{ESlpShe}xX}>g-R@L0CEJ zl=f7s7;prg33R@U4uSD6BJ~tHxWi?C42}qbu5u2nvLz;YiuG^MP1$x8ofvG25~pw@`5t&YF|1nc*9GRg1^1!ALad{ z@QoLU*&WVBphE2%SU+7HJkJlF-}Fr`JqGsBVsi)I+_7{PlxKIS*xdPUbLX-Kv}NfO zLBTOpund7K*qYxs@MX|IAR@Nmhza5L6@%ao7u@{?dq239Bx_G>IC_N8;EGO&46hi4 zaNljiR_IJ2JX{E!x!Y76JDT~ z@SBG#eo(MMhYt2i>T@vimj6wEsjNBcsYw6&iSM5M_E|oB7)lF-B~$tG>2fa1=Rmc5 ztMkpy<+EG1X2~82-66OmP;|jQSs0q!1Q)7-_W8rS|FF=~iKcUKdq;}BNBG_<7)m z^E}^q9xkQ^j)3B;2{he01?LXnY{RJ!eZ%*gy(^c$(j~Y;g13!mvcr7KFkCW~jOs$D z8_!CBcG@*i?3&o#Z1`Yt=={de`LAkMHB!CrIr|D1KC|sG`5TrEa3ltd z;q|xbS6;qn>D;n}?kHtlcM~>TmV+CXPzlV8?yk~{G-dMrk*&}+v>6=cjl&y8Z|SUp zYL0GCDky8S(A+8b`UO`##_kPc_xort_Z2O@8hvlXlMP>@q5My=zZP&V9SSlx9=O8VK~;Ud~l!`Ji!M~6pX(6=AdA93HE@{ z;1TR@p}`^e`?gJ5i@ro@b$S+&&mHoOh_*k*FqCgIT5#kCrHB *luYR6Q-Kd+?`& zKRsR#;WThHR+GVt26Cz$@4Rt`YFD{nOVqMoD_<5cff}gbOenzNx}W*xIZicKv8+|I z+JOgK;G?2WD9N=lRK;{?Dct*Tl}iHfS*)wYQN2csAI@LKk@-Gc@schqptVo8w>
QSWbdBBV_=>`xNgrQt7G!# zc7pnn=}Wd5Ws;gveD+{voNVwz38t&h!$cW)L=V^!8XQN?CVh~1_r{LCDD(Z|9{D}R z>xWffm&x1mCb?!bwZmRnQ4h-Mb_1uaHHuutVo6gOP#$XXXcJmea|0?iHA}0^r@R4_ zqG?cy@GD9lxL#L>N)ad(uqDFh0`weRuKP=%VBF|9Mh+QQ@bds(Qjk7Kdv6{PU1ah` z1zsPE(qv^`#*A9&R?s|ABO#VVkBRCf&EQ2yENT+W z4E(4Di`qc>)Tt9H11nWyl?XcsM4RC73d+ha^$Eq!^IzG*Oa>5FzS7MUseS zL>6~euYm**nmY!tXiV07K5_JP;M2R78 zOidz1eK`gIP&nwnvo_dYz`u~iEz%ApqC!gA1Zr82#u$GAa{dD95Fdd~OLT~EI~=O) z8$i7w&;mH&7yydSEm!jr9%i((6wKXlD#zsCvNjg2y}Y#-k5{$8F(*&k>a#_6?A5bN z<4e~7n^}9}O>@EM70iKxxm|Dui_UiVsI56So&C$|_icdPMAqt7ZxlTJ%g47Y&E@l9 zZE$Q}u(T8{QQi_=3*EEC;LuLd65%b8HPyS8?kzk9#C!KG>$mWLUW|9f1iMeL!xzOX zM2^6jH&-v5d2=-^tGA6*FtmE67#ik7!*{itfg{VNM-|k4&$1727Hd!{YTHhA3={?? ziU7n6oY@>WcQ1N=+h%BN`lx~Gn51u?UKw9`QE>OKPj9$KN@h5uRIUoZKYh1PBML-b zZ9?o+rAp1%N&b}=SEp}ZFZz3Ve=nS3g)>eDwQE0=4f*~Qi`Z70tPR^f%3@nSylHM- zePIK-&*Cpy!n_4eM~=N~*}o-wSAS!jTH+IHI=LHh<4ag&}+;Kid3H&u+69_GDE_dbvp;(7KPY!hf@<4oz zO>gBAZH*1Bu_96LDCBKAY`c`wwkxU@_1GiQ-5s$h$1Y~zzSf>!5)vDx6Nz{huX9z5 za-yD?z~J0}DhK-x27Vk1ElRkg%a?GXI`$M)TG}?@7LxtH==?u){uvxmb!mP+MT}x( zLlsr+up)hm4EE|^k&NM|t-vsZVjiV}BoNZzyV3Qi?M1E27=MA1fPrKn3G|@57X5v^ zzi<6B#i5h@(8-OVbAJQ=^RThFZ*~igPNB7<*gDR)jtl;FS$bONiuxuC$dAcYVQG25_TKqm7^JYS!jk$-#+>_N$y9E(5=`Sh?qK#B|{ zcDraQ|F8~TT!_cvQwjPCMeZj_2jkfY@}tv*5o6HtKmR-h>ULM3(A_U|_6mWvpV|WI z`YlImNeyq<3s^%XEmC^Q6<$;E&aRRHy?{eUOD3f1;WT8)jFbh#tVlIb&Ay*uMx`?} zM1Q2sDb>y`YfDLu1mxP}UCG`0wGs&j39omqC|547=y+XAp>2fMjoj_#btfO_2zN?* zU7-$=)JQ;Tmv`m)wR0elc_*9`CxLa8yM0Z)HnV=P&~t=uALpBomUJWr&P+Q?20|Js zM{~(U$a)Ml6VgI8xk^?-Hc<9J$wo*!OIt}tNI2~pC>aQ8qa|{^sx>oY3qcO zDSD9oB2KJXa76PAT>gYRd0^+w_Do-zk(LF{c~*^%7FG`E7S|WRb}d~a%3?C@EWjaf ztCcJz(T>0I)FWMo=>gMfZXR+aYfyBQqbfiD)C1}h{2Ucomtx?d*m?5fP}NLOoJ)%q zw;&QWgd$ur-IGO2ARw|7QL7chkIGa2!6~VpEGkMY;#9vp1-VR=CGtdcJ(RtCA&9JO zvFV}s#T@*KoOCS=^>oofIHYS^Jp>AB#;OpB<3R=kVJP>L7eh>P2z2hHERiUpk%WQT ziPyiRTT{|F?71_<1~B&^W)mbyB*+e~DZ@3sIH_Du`MU$df^x0g1}RGVjYO#ptr+Ph znMje<$s=V%a5~XXI9}}QAvIxe#CQFEqI88#i86T<>qf-f6jEe+e+WydJ0MB+LoED+ 
zDFy$jENcHHB~8;0>L}X$zbL~`sL>Cokq;=x-%+LyDEHq}-5*eWA1e&B;$y1fI4SxW z`UC332UP!0sFOdTPV?0152)4;sNe@w_$So(2h`XH)V>d>*ay@Tf2*@EKeMUx-PCMr Q;(A)YlKvTmZ&I)S57QMXI{*Lx literal 0 HcmV?d00001 diff --git a/entrypoints/openai/__pycache__/serving_embedding.cpython-312.pyc b/entrypoints/openai/__pycache__/serving_embedding.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..73399d2706ee6748f1868feef2b7e88e84f8fa7d GIT binary patch literal 24167 zcmdUXX>?oHdEk540b(PGjo1k81Q(DLB~hfPg_20gk|N2Hyuqa^0dspvWx%N?IFeXLI8PI{u8$xLJ?UOMT5C(;G_MxJs`+;gV$XJAViJ5FZ# zzWWw{G~}c|=EuAP-1pu6zPo()+wT3T#bTo1No#&Dl}S<5&(I@3S_!zjNK@2RDoG{j zG!@0SDotlpQB_7ARnzEKr!^UERGZO7bs2qBPr~T5A!>j)P1=|-MNK43o36>2qhdyqCflM$OBjhRq1lxd1KWtyYSnU-h^2{)%(Gi}i}0$bAUnT}`&fvxGz zOgP#_U|YI7(-Z9>usz+Ii9{orzGz>jKiZ!ehz^kO+VrN(V04hcb?MESEzvCmu1{~x z3`K_s>_~6R3`d7EBhit}_UQJ^Xmm8QBf5j8R20+0?0f_I6rboW6}6vAI-jGGE~e=X zwG_IWgt{Tr!|YZ<$1c+0hJV5&N5U$>I+ndSo!HMzGi;ojVFmqH;C<7gpdOpP zC}<~A)A6*RNyM{Um=^TMnXk<;1T-9Hva>VOSq4fm$Fo_6GjZ9vjPcv-T zAh^aeruZ#i@Zs$sBHInJ)c$sx5BA>!u<&MyS*i*s?o zGS0FytWvr0p=|s#^T71vOjuKqZoGSg!ZU>G!BFt3ZWpJXm@Cuhzw z)7fJzv=n=%eqDyI%p{p~7CZ4cGo55u=&s4sY4mT%R+E}eaj~@6Uz&>klD_ij95*`$ z%{ewRlZJ*!UWeq}kZ7G3heJwsEY8L=*cxYsVNWx$I8K~ciW4UeMwk+3=BCfa(kYl9 zTERrRao@>{vy5OJpH9r+0E$Bmans3Ee0rM6W(5a>aZGGhY+URVjrF8MhMqGr-_=B^finb(=$3Lm!Tq;1#(y?RY@b{umNlb*aUDb)K!u#nnn_gy)QA5`L(&52+(|3MSurGOV?2i8%t{}G?vut-k)~;px$<>(ZZQ@u+6K21|O zY6?O&K2y4lAr!Ts=8SS^PJMx90sZIHQ)Jtb!^O9Z5_42cDJQ39gOFODFBKWsy6WMO z8+ucGF2d%t-v2s%1hA4eJ2#8SE*2xJFD95XxL#ukvPuOD&S!>|y_!?t6&jf7_^C7# zBMXH|#%9@>1grr>4n8Puc82A$)gmpK_=OmQ9Un_G)3CTSStdOxXyWYYte`u49^dRH zXs=+5#S&@2#<5ryMaScaiFS7km_Pl%jREWquu?-@#v*3L8 z40Xp~xuoX}o~72jA$+agg|<8*ZD&mokOmI3FCp zt5)@x7B#De&W}x$J#Z~hXd1oUG+I(q7H`4a!J9h(G+I6|cuP8}&Q;Qjp1V*dC8!hJ z1wqF$+#EZ7B%#7E>B0H(Ed1|216$-FiUyofo}rwQGDp85!3(qzXQ@oY$mPI0M^~Gt z%Iu?K%FKrRRp#lGrJ^til-K05Fyo$r7XGpNs`_7OwbX(N%D{EC;mN7w`e1$3%8}Ju z#?{I3^0%?yfl{4vX!%_|e@<0Be9BeMDMR;3>5=2h?+bJmS28_SB@RNp<&-jXpR)aO 
zeEFTE?~^uIP9sCtRc{zXcw^sx6~!Jl43*zW zja)au)dS@i8R7=SA#^6k=?Wkcfgao=nD0i^M9Bb%;NKa1vaYEHypH#3qqijb#9*0&ynP zVe;%OlSn}zM@TIoR%^vvu^Cp%4=7Yb+Vic1#tWsUL$fSXM$X{FdLSIgsBGBF;^x4* z!4c|XnfS$13^a<1#gp+_AcwMoo)xKN!JM99<1yK1VF)b@2pWlD!Gd{6t5Wl@kyuC> zETQzkYf8bkkmcgj35La)C)lC2a+_d}h;xEL`pzu78{0F6&K`93fg@O@+QhbJ1&dg; z+-F+yF@hs@YA%&dLK~67ly@uB!}0VSLpnn+A_XUcI+4T_jLd}uL&$Z(nP3^9!kt6#0}Ty9yhMi$j2RlvBl*w|8N9O4^?ZmNG&^Mjg|#(j%>SIz#l zU~?hZ&jv zOq8#&;O*nReMN`+>Y*!#ijIH;6&#(sqqErD_FDHV-G%1ueDn6(&BtzL3%ic-yN(qH zHWvmS^P_ete*wOXcQ?EQ#=osfa#!I!7um3J(^woVz8J%?> z8#T3cMTe98+^dfEQiyVRN=B+@bD?{T?;b0*hhIDU%GpBu5Z^vjgr@m6^S;eDPJI9I zHy>Z|jg_=i&mQ_NrS5_9)a`Yb)vNaAk^#%;qZ-EOUzF@n%V#AQ#Qg#~uh#eN2lBS| zPbigf>m74l!R+VF{(J!XYhSV6U8wKn>wB*!ZZy3yxl+I71M`+&mb8%aSJ?|N3cpi3 zv6KFz9pi)4Teb9fkLInqJ?Qy~jUMmQ{KRfSI6y<1p9II$;CZKs9v{%W(`-bz8zbK7 zp%LyQ?f`KIZ4=wo@2VmbTh;IO(+Ce}A?3SUwFqxl1I!+P%Gd4ixKjhhrtbbJI9GA+ zRwz&ikRI6IrBGH47)mwmg>=g9FQ=k_<|VsRRVsBs#VHgb*&C((qnw-22vv6e>$J>@ z13j%e0*tl^sXL(SPIG4j7st*`CvZhcSV}}&WJ!m0LNJy+GcyP4buL?tie$G!{Q{vD zMPd|6WpQt02&=_aiCxG37C2e#FM08?C%`k`vXQW>f+NzH^(){!L#@@=3N;bF zCbE2fwPvW;5V&-3tszio=;j-`Z#RrBbA`S!zHe;t;72xJ(cM&R=_<60@hxM;#`Z$v zFyALWScc!U}R9JT;QPE%uCuu$W?{0!F{+9MC zb(&fLIo$$XCXO%F$V+A)^~6Dn`YLqdf(pn^oIK>osW^p9&Z#dMFBvYGfUJ@Rq)fa3 zDW$GPIxlEp5h_fcTq-Q0N+N{jYUG$Q!7&e{^aB0DVGYG8M0E~`zv~*AU|P^|3i*6V zKcx_Ia#=7{^iv9fye`yH_Q_C=%IU6a-;ll*7P0OKi*pKAvW-YBkXw=nmKu?1OL8a$ zF~J@yNj-oJfekpxT!6h)=BFXdvTo4}W^RU4D3{l@f-031^x}q_6trZ83z{Sfr?hiO z@Puoixam2N&j12pvFR+5e1g5Q*jOqF1P@60pf!RT5d@O(EKUHyB#|nRMleei#kf=k z+AiXg>3D_-o1yJMuw{i>sB1l*64a?Ae3M9q0W~0KFeS@j4JLt<8WO@cD$5Zhvvvu?Q_WPkMksyhTre_lI zbarQ?O6KRFn(Uu}17t*TU~qA~7;4I!10OYa=X>`4UGu&x*30V4kKJ+i6up5v!NH=Z zsn|JGZ0#$y^xiXS8(sIb22b5(O{tD*?JczI;9GX&?V&Zh^Q!rZdFdov}&hyaUAwxM_aMpb#>>JokfqY;OXQ&okdr} z)#)qKMW?su^4+V^Ijkj0XR(T?5?ZZ1NJx=&{5=4^rKv=bm-LskmvrT=hp;uUY8F(N zG}X2~2vtz2Qa%NticqB-3TsW3RDD~!PDn$QSL#$1B$-po_@s;(U?pjAJG9A?KvCwG z6{ic%6gC46EE5o>BrhWQ%K^#F#lw0wjIrm@L3)I}hz>4!K{J(|nHIFkxy)=fY+#?o zh)T+_O@S 
zuA;NyPD9{j?~C4R2UZ$-VG`loP)l<|(2^A*=)@%=Xh|c&M#8-a8WhvAP0$*_wqgAd zvPxW^WTldB!A$}>P@(AAXCQ!m9-Z~enJstVZ-qj;;SoE~rf?eKlAI5Ic^~2!X1@u} zuLyD9((x*{+OwPQ7`ruC*!wWQ_u-Z1NAiZoHJPR!=ex!iZ3+!O%r_4&))bpsU-P}< zD>M!9O+$;OPc=a!^bk0o?xt%2Y^Th=rPH@-+Ljw{9=ILe_xJVfEA^dub7#@wzox%! z>G=El#y{MdH#dD`_AE7iU=E9`4)#gqK^S+zyAf~*$*Dw@fof|E<~u#VYd^zye(9)MR;2zZ-{&e z*#^YIIx$<0E(_Yq#WFkRBV|i@QT8H7RxilSJ-ZlA?x0>Ns?Y2?2qe zBwfXwNo7OQz7`2<*snl?*jjXu4q&%{1D#0b^#OphxRpz+D0>Rv`QDAa^ra;D+gL^? zI52BzaMd~EOVoF?3pEvTdJ_;}(n47Pc14QEqipafc<_C_6Gd2_tq3l1+$O5K0^j4YI z5kf)D8+ye_(fhf+mtcTiSn^0)3DoNRnKFuO1GR}fs^i>eH^lT z&V%b1v67%_$LzX%G2gs-#k}Qzs#TpP-rTbIz~vkWep=dJi@XxaHx1nI-0ZsP zzxBxbkG+2=-}q?W><2{EvgKBKwfRKe+z6$5{fh_hvyq z-lEta+=o?F_(BD5qsT9B%vMnR3fe`JL2Z zYPk%Wny#$ZRIT2mJ_mWK)ypY(6!5g_y0Ryopg2Vrbg2e3$-^jJ1G$@8t`|rH@Ei)(7odG;_Ynylm^qIoGd-b;Gjno7fNpt;#K(ZWmXo4R0?9!!rIYI)9F7K@(JIb-)5yN3l21c@v`@Qb~k4AZnaU_B1Nuz?f&9je?}) zfmn6~GFFOba03*@GwfeL)E9ZX^{xG1m>s2!zzG8<>+~!c|M1A75%wY2zD}=(Mpj3U z~lSbQF-&wkF=zl(+S+1$SPq zDYkSJTDI`;vxgwMai}5)Br_$Qs`e<2wggQjtGc!yp)l-Q_J+K#`?jt7jw@7jbQT
H4`*>vX~A`Pzz}%>~bP-m|^fdAQiStJvNL#|Uu3m#uOj;!4oaoar;^o@X>DMy(2Nqqqvt@s z!D;{o&HgzW@TYoym%>`bWr`HgS3I=QQZ3YZCZ3g2h#E}XfWsn{st7R&g31DKg&Y^i zDN_kfX0)#2_*XDmf-9EFf)emxQwPH^;d!bC3=Q{UoZpGXR zx-3hiXm7qYnYZ_Rq6W{Jz46+)yuG)imvs$-tdy0A=U1Q<7Aa7{EQY}H0x|7677RD% zXW@VMzk?&PV8Ac{yzv3Xh(`A4J8EFis3{zz4G(ODu!%u#9K;PzS$a-b9EA`mC#xzi z$MSU5O9U22Axa=c6(S?42Bp8$PS~KOIwi(MH$S$QXnSEwC^RtHOygDg7}Q=T<7_2^ z9ueqXb_u?KFdKv!VP66-i13ilVE+|5$gvPMf_)jGYv>?r4vB^kCNkvzfRHj;EQ%)t zTbas7!^BJ$Nk54;7s>L*m?+hbiF}tVyjH$uKC*GbeR=w0IP*{lMs(0XxdxkFe)7d9 z^ZuUYvE}sglX>sxqHWD!E*Lyu1XVQqz|N^y-vko9LPMBu2m|44YyiU9XhH-e;jx0P zlecwZO0yTU$cc;Ijyq2O(utQJfAR64Aw;Wh%l=Z_>gk72@0so7&?rC`~f2;vc&!?R%G5D}9n zXG&_2pqKOJ^n|DAgF29g+3)~8Z^~(rdy!RZUl^?#P#vPQbHkHUl#?K7ISr)`Zb$%L zDBbvLN(b4{)Rrna&2syp^gjkABjO1;w) zsA1#h6>F+GBHCUj!4mY6C=2Kb32$$ZEg>nw0y+Zf8rZJXj`TMBC_0a!a}1q7 zK!=FIeuxnApCbK!9Uu^JT>!(hB>Of7&!N+b!FJGj&ZW6>qXpx*Y%3MEmbWjVrow=O z{Zq`3M2EPcJ%P}xSR9>Vai0_FzDOF$f~|_~F)FY~pQD&S+Wo|rM4`V6;q%QKH@YwO zcN#ahAS~?9L<=kPdJ8KZ1qM^DVcs=-v*kzKKj>a@O)S=Y3`DE%PDAiTH{SrpNGlD) zh5F&Vc^JhgALWLE4%XbT_3BdTIU)XFm2*}n{Et#f7hG43WEpv z!GkNlLj~s{AeAlNl9|Z5w()^&H|tjdI~I@J0rRZL_1;z6HjqT+?QJEU+5*z1V8PzT z+uN?4MzfRMeBbU>`xr>S?4Ys`d4w>r^BUmY1NlwIR@@Kf?GLUswN zd}R0EwT=N3ahblnV-1aClI!#lKkzbTtl09acN!u9zpJPBx9Q$B zv?AQ2f<#~ghVY)1@tBinb6Zsq2_VJlQheDk-MA zHe>~9nY#XPL zrc;~B@iLUFEN8PET7F-*$t8gxUN?mjy$#PZ_4oO9yZmj^s1C~+tCu6EUZ}~LrqFJB z!&C9S)!QlIiCoP*FnJ5+oH1F8Gx!-DC`kdvSv_q*=z{73Il`7RqorL!olub(Z%o!H zF*O@v>g5<%e-(Kyt6p&ZGuGRESAEifEmeBJP{ERfAc8!wiX@f;mV^}RmP)ygO%GM zw`n8IYPuX6f|4NU}a4*V{<;m zol$gsifPn3tDuNUNj)LkPC)`mSP0r3ILC>)mTYKd8qP;EN}Zq)0ZpBfIRghhGjLWX zlwmS6>_yNAfz541iN>;_sdMQ#zR59pC_5p33z{I2h65iAr`^-(i{*?**mt16z?1_~ zzq|Sf5M>^&5t#L`8<<4eTD<2O?ECLTMvt2LXBwDkeHc+ zBcA6W)BN!G1vnZ7sYA!aqu0m9MvKS8%VuVffN=D9_OmB+8rXRDYgj2tyyqJpnwAu; zkW4h#458w4Bm@V-wZ!gueyb!&7nPczDF`Lt>^GDHW?DyPpuejR8~b-yO97oCIREs2 z;DJuUy60_sMeUTVrkMAlqLp;i7qk^b)<8b_WLPVTnK>v!&`b8Eq8W1{4pzrW>6DmY z0F%g676RtmCqQHY-v%1x@>C>!nI-KOzb!QHmxc}7FbSP38lOqA;c$hdgGd*Mu;CPb 
z8?H!Er)P!ZS7hRgX3*?kV+;-zoHd?Jr#LXKhOtXw=t&GUJ%FkSax_a+DmG!LhS-|J zv1GK>j>SMG63-G7od1j&m5~#)aNR@VjG#LSifFj_K?mXteCvtP=-?)?*Un*yDsD#5 zvhiuSjbRbX2%+;sbcj%dNI!lLz0K&nfX?5b^B6c`Pg&@}{w)Lub<%8*dlQX&oM2K1 z*GGskSv*FDYaGqdi8KYRdf-4W!h#lrLSSkn+6R@(041laDT)z`c%3xw>y$O2Bp^)` zYRbmz+45AAjbW9!C}WG``@vRrt*5Whv-@_>?oX*Bw8h1{IxkOMYcJOOujvY*0X{TvyMAEJ z-(2wb^8Vg}e~|YN-q^e1-*$Ptq|!M1@3eI;JD2Ca`_$`C-Q2d)x9`^1_`U;t_<@zS zgG(lm19_VB;qAP8`x=}Y@98ae_Y}MO3tij!uI+`cF}`c;R(PfB@cR$)T}MAQYDYlI z(zQIod$xjIb@S#M2l=L5mk)xSy}$k17`m(0guI`iwia}>p2nqfFF*a_)63^p-CNhfecuhf9xQ}M`S9qARy1sK zy%u~Wc*D2aumk$It6z*B&VH6~ZHYc;qL}_X2MP^5Kz{@X`G7 zNBQuhAkJEIH(i4Q2lDRWwJrOXOxJea_H2T$X=u5&bH%eMzx4p`IRL<#Be?YRiX(EP zop)@5cM<#xD~{e9wY+1~2aaCI5@@;KIDbUOnSH(CUq88If{z{EamUlMJPEBHcI6%6 z<*pUSmeP!h^6UT!7EYwk@0qDU$8}TQHwYc%>|bl^d&vajDynykE}c?QC+Uaj#iMt;fu)3a0CdGWuz2XBV9T{b;J<7t)(1dN$JY<8 zfwWGXDQhjg%hPb8({#W1p>Z*T_=u7-?%MJ7u{hiAL zxBWekwtMKNA8gZ?Z19bJqqiDYBM;_1-53!mbdB;|qc@{`*MTJ)nTI2M=(^$0?|391dK5aR6=Zm-KoAOxY+Bv4=ho4c-jhXt&S6f3Q4t zpq;KzzGWNl+yO;@%SI@~;rZ+% zn+F7yfWoSuJXJ%~ud*({fzQ?+-$sASf80R5Ndxrf{Q$kyOdkqs-fB@J+>XI-bzq9O z)d;_>u_Nrz91g19_RxoVbZ|=bbvt@J{_6GkD%>M9+Jn zu>g49-+`X@cka=H=chXBVZG|74i~~bG?ewzzS_eY?O##Yg1=JJ2x|zeuRZM3{*_0C z;a(bH-=6K@`CAv3_qXmb3wZLa=*hS30GMw^m~UwX__iM5+XmzDE$Z8~4afV{x4USB zyR{hJuSIx^8sKmX`!m=>1XC;qMrm_!;RxJL5{rFpE}j;N0&;%`i?#qP()&as!aj*m zEjoU5PM}kd4$3FlZ=$n?4pEE@B1GsHH$sFEAT)p)y-0|#s3#HiDSv?wAsD`c5K6jO zWDG@u;cbLaC&A*z2j}3$8%z>RIvtC>PK&!Hj5W2J=*1KkD zE@|NnrFgqadW6hi?o_fMWTi~j#p!%t;Kt^BV2HPFEN~}J-H3Z!008&2=_dh7$e-yWK5cq`bj&v2g&lBHEJ?aNeA4> zR0DT0K@7=fW-N>qA?02sOVq~LleJ7OQ-cczZg{GbQfNp?brrWI)nhtx2NWr7QcO_~ zcRM-AJxWfnwQ>rsbys~U#Gar5$}>3uhunN{5P~XwL;0Ev8(Q6qcP@aHP2wyu zml1Tt1T`yIW#Uu3ui*&08`>=14?)3lVQF+wx|6`wiVxa(pj8L@?>-C;v6F_k_*^o8 zf&tQqHn%xF93;f`3l9jsz=k5G5`B;Yj`7CiQsK-4_@tB+o;-&0?i$io7DU$+3pf9X ztpUai<*p)L+Ktc{I@nq`u!akLW3Kgy#7`phHDW87u9gUD6n@P&R_}1-)}j=ALl!yk z&%lbzMH#K+-XgHMDb_a>ec__NyTVw2+#qNnk%?E}~aRw8f?mZ|= 
z>0p2<1N@(b|J^$HELpb%5)_Qv@Km!EdqYaIpoTn3r2@$b^fd2nXdLyh5;Zf?+~uIBqsO($|M~3!*$HBHmyX zNz7#Wk~4{HUwk&z%QBM;TtJdw`e31nw+d$aL}LM^a+F7)=`^VtuaB8b!Icak4T^+_ z-86=0ph<}=NhUTaw64iz=?;Qb|#eK{nsrSQJ?=xP}DX8IYT$8QEH{TD>^U#eEf!oU~fRwE|`Z(uFQ^ z?||h;KFS4K7_{B2x7zYpb!JSl3Oy;mQ<@_517%{#kSoV|-Ca&oYZx3%T1?bqT( zJV)5d+gh(ZexswXX&=97-->NKZ=FC*^Z~TcHP!>CW@{}vd__+SUL65HZ?NF);k`Y@ zaBm@ekPjccXVKeCi)v6-yL?w3x$@+-y(`X6u%XKvI`g*9&$Fn2e0FTd9zFdwMeosS z-lnwx6Jgsr)vtp8K7tGz@ELn5vqQCYXg* zy%kJekbqSoH;7x6t&cIS)l7F1J&boCqh3(X8;|PHkHj|09O-)OeWDQ81)1?1o>j0Ao7(EFP1ppd!PAJ z-9+aloJG@+57DrR@;0I+30&X^G-_ZH%y1`M6K`(1 z*0BtCTzsih4_B9P3lRRnsumrGBfG zMz~80?z*{x-$_t8Xo?h6UKX>1;zQXHA*J+`PZi1I%pQah#U;4msh+>uZ6)_fY1&XA zc%dd`HkD(`R|23>!Pm*GfQ7WoWf#TM;jk8sUQ@ z%3Vym`1W1Jw$9fcf93H)+c@7gUhL^F^c?1U4ucg_BOGzlG+Gyr0tIew%-gz(?)JRB z{f^Un$dE-sL}cyALN{XGGBenMu>==jKe4|==U3p6bLA2b z_P;R@ok{GX&j0`(tmp~mHBLAc(YvV_YWt+#0W8d#yQQQ>0I2@j){>rpCSXODHN2;% zR73nA)AKEzyfR<15U-W$n4n8G0^6zfu2L-l>nLxqL<)r}OxB+}N$4i3qx)lEaKdY~ zO(iY7p(0mciM#S7vOQry1HisL!5iT;26{#Q#i9)YgO~Sq@s{qA7JMR465x&P_w?YE zSQv~5nza$a48pt2xqPYz@5dB4BEg4U5#Gc8F1!SrB&h{@AsdOGO0Y-{6Mh$j>=*HJ zWfsu~i+hHs7WhZZrk{i>mVrVcqa)Nxz6glD$t}xS!iEdAVhJ%~ewf87SX}F5_~0ZD z+~El~oZyYi5tzf=3^awz3c)8uDc5^O$l99YQt2#T2e6|k#}MjC3R(Uy z9cISGB)CWn>FbDiZL-+!tr#hej2Ky-IWaLs8>SFPMoa;ngqInLvF%uQ2RfbTgwg3j z=V$038z%}&aCW{VMFhjBC>Pqvw!+$nAC+AK2k0)EzN@2X^FL67|3J0yRLjpO z^M{o0L#q8BsLlUK^?pe0|BxE^RAr=9a7!Jng8S+Ka%d3(=jUdM-cNr>ZTgVf^&!>r zsj4y}@*%YiqCQpWARz_L$9Br(y!_~jG5Cz`o-S>mw_bkY9)<6FPu4z2(~+eoe@@|> HwEX`9*HWN| literal 0 HcmV?d00001 diff --git a/entrypoints/openai/__pycache__/serving_engine.cpython-312.pyc b/entrypoints/openai/__pycache__/serving_engine.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..04a2fcb58801653548a383e1bcbb1ea0c00bec1b GIT binary patch literal 49057 zcmb@v349#ac_&!a2hivn=*E3DZZwF41OeU`2;K)s@zBvaS|GYWHi!o3>V`-{4Vj{3 z8?qG(j1!BBr3kWQn_4kzI-5-B#IqyOcC2K7``a!8cDHM=vvi`J#JjUQpuohE;+@(5 
z_v+|_Em`@^6p7cbUcIZ{``-7y>wWdVI2=|E*SP6x;XnRsj{AGM(XJvjvXU@!+{@fa zF2sfSaW0_z@&TUR8v+LQYYZ6KuPI={uVLIADF_rq3Il}^OTZE-3KT`G0V_{sjpMe6 zJz!^X)3_tz3^-ZbJno9P1MWz1pg2+zD2aFio`^T#WzYEW(m*N77L1og$^+#rt#G^| zQW>a>R0XOc)q(0rO`wLQTgGc6b%8n-FB-3p_yRtj6AWWJwHHhOhOB2F<2bPpHQUg7 zW1x}eCJNYHy>f@ReY`2s9B5Yi(GqD5w6eHkye-lmXpi^S z3T%pO4s4EW32cdM4Q!2U3v7#Q4{VR@2<(Uq1O_5I13M#wfx*bGz^=&d!0yPNz@Et7 zz~0Edz`n@-!2ZaAz=6oYz`@9&z@fK5`*&A#yQrk)_p- zPe+~(Jk8?1@pxn=FcWzu@C>_e82@7AQs7eL*}${xzH$6=>-?N4^yJ5=(0u z|8nFjfv-en1GDVDdHjXQR|8*V@s{zgMP3ZN82P=x??t{I_b$l*zC2)ns z+s1!C@{PbZc+S8H5#bNs;T2d5{GqTz*g0Lual$SZ9%12D7Cykj6BGu%DR_jJL+x8o zOxPq0vM0w`xP?8x%AWh#^9O~8S=uWst%Id~Q@F}GO>5b-DU9Yn6HCE#rtj0A|W2jenBeYiV39nM#QB$bzc~juqdg^OxYClrn#OQiM z>x4hLzWyD9+V)$l?G0#qqfn{$=sPTR6H+&4q<)vBZb9l+;XA^2g-W4Bc#G8zdV13+y zJ`U*P@lI%`a4y8tC}ULB7)Esv<9Gcc@89)n>f?~#EIS9H7biyc3KN1j9Getn%i!?% z`0%N5LALCVA&p{%hlkId3s0PuEysl~ObHVsf}fX5PMi^i;n2~^$?@G6gpnx}Myhq+ zi4#YU$A)85QB-amn7D`<$A_cQ2Zu$ukTnw?k&8}TJSPl{hlitbAw}^JWebl{?a`Ru zBp2-*jtYk-L&CUh9h{sP4WHf>9*N23Jz-%yB->c=(C|5Qkjm{tKLjy~XaPOhAN3o4 zO|$fCI|8J#=0a2hZc0v|$Af1k!z03;sfm$TcyhvTkV|$7!;#~{usCu?dq=r$1TPp9 zg3(Da7CeRL(coF(VsKO|#mn~H6Q{!y!r*vVn27lc<*LCm!?8giuyKLvJuF0{!>5I# z!{Ts6uBT@w1Uy5Xc&8YO?3olJ!?7dhD6{NA;!}8y;f$hk8Ag8mV(`ppa0JDJF}1v0 zt)xf7(U>?qp8N3JuoxAT#E8;|D7qb+5{0PTj9_y7DfO{dVQ@t0QE-%X3FDO29oF9q zG<*(qk`7ItK8?}$$&K0XJTx2;Lg^t^-n}$Ck6MLXmYWr_hp)@;#mRy#btTt7go-HkFNoJCkZI=+6JS$9urv)vWcX#BJ5DH;gWcIHr zoqAu_-1;-Ej^d0IwWXrnqBtq;pBSB#9V~iGP@mW}aHSP0muPppgwgOsm_d+BUB-GJ z-RT6il9K&Lj*62b0E$Ub>xGNfFy2FN33DtCPn^}ByR`>VEyaF(WKz^yaj17{6P^?5 zNM(*orJi{IuxO98W@PUPG#?!i!)#$_jh5!5YH!4nv8b?y)T{)(N6rZo1N)B);!}7t zhDm@GWGg`?U@|I*a&bC-cz6QqRP-BVJ0YH9!UXUMvSfjRnr2s?VnqNep{2^;Q^Vuo z5Fv_~2-Ltj5{v>;@X(6Ej+~1TR>GZ~y=hcQLt}0wM({Q{0dx?ROHT{2;5k+XD^gEx z3`c{)g>&QKk#H;s{0qb=21A1Kt7J#@HiCoTxUw(}av_!|o}H4NN;B#$p-J1J$zucB zU;+6Zo*IvZF-XJXyM|-K4Ao|&ojkdJ7ek)tae8eWnTnm8ipgFbco^SW3_Plo1zN^XmJ$9M4o^fe3Bw~o z@Qku0$fd#sN&x1}O0K~uu_V+H427e#&4q&JhehfTdS}PRrGcd=yALa9R!^ZJ1c-(v 
zBf+Vua0nY#1=~gshet5r!?BSwdRDn+WO965Ko3SSt-)d1ZG)%v?baF@J}W4xa%n^m zPpdb1jC>KC3TG7p=g|1kdqvfN1B=E*?xCaP=5C;|;ZSdXaE{Y=RWWl!(i54UH z5|^^Oo;#Q-tGc@ON_f#!FV1Lr@{e679%ql;m<35Wmta=L^P++95!)eDlp#CRF%xL( z!GINrQz$GxMZtLlvKbIFat36OFaZQFUZe+c3Ruscq1YuYm`5Z^6cXOUQ}ObaD_ite zxP(-Fr&q9t7^SoVQ2-S&0X8pvnGhxLSlNW&WiF;ai*qrRJ_l{B#xC$8$pQE{_Y{Y? z32_7B{BiDW2XnOpa2--5;WA!jJ+NZIS=TNC#74~@9FW3QfHm$3Vn3r*$% zHcJ0p-i^rNEA@UfVeh2gx1#q~ zpGw$U(!G~0>QX)dOkqAXIW-Z=4~q|@g-los@d4v$E?^SWRU9%fMy)Vpq_vDE7NJP6 z()!k)nCJXb`m2N3GW*?WBI4vCBkJVb~>BX|&nD2Ns9Irc{e4j_St$9-()RdYl&sBkMK z(t9qTz!efkhfz&%6r{nVcu{t-3WCTQRJL#N5AjxhcMed9FC$fal>)+eK(|w1^~9$s zAwa<+6fg)Wp%@Lk_&Nmy65=Zq#3@LlW7xqG1FYBsjSu$Fi9-m!>Rbb5mE3 z&vzwy_ehO<6197z^1bhS6Xk~`%TVHhlal4+iqT+p+%4b+`F(tH$1J6DlGwDIfwsGKCmQ zU@;9GETMwuxeR~+5f4Z!CL*+TZ~4s90+xB*s9*-qrW z!b@L>$%X3jA0pc9Jm!)iAla&{Ln0nztA-)M*;rBE!l=2-Ei6XU%&swgE7MZ2si{#7 z+Odi8Lw*B;2iX)0gPw~cOM13t%a-xs$f?lqw&~K$zISWMze4t?3&ADs&qjXK{O|lf z^#9eW#dUj>Dqh0_*{(<>LP4ruwvSJW!}_}xF=7g=hsYW}0ZP^j1R4YR3G$YPAbJke zaa8;!YVsH7LibBZRiO7Ricvqr3luP5rngl<{W`@+@Fo5U1q3~x4&dLWrxg4Gf6)+v zFLBFG&RMo#Dor)EUOG5eezosP-GZq#<@P*xgayB>sFk|+ELQBjbZD+=!L&EkwhFOT z^CqdJdBL=5xqxf!xU_#Zel;Yy8W&7{2GyzB`joHvp4sRv&`=^Zn+4n2bgtx*Vzm7c zAC`-8;HB$w*Udgjy($&QgM$xKbwZLVCUFn7#WEHHV)>&?p580k~25d!KXe4CE zI4Lh&Pvo$6{-@d(OwPit8rh8c-^QW zRD*bZs8;adzCkI49Cb)-R8yOT2DT}hLiI>(zAx2>)D|_hOi**{eKLg_@VxcD78;S- zrlz**Ei@t3e_!5aq;}kw+Je+op;n=Dj5eH{3vbqj(Q3z^U+AQF8?}TwFgo3zyM!4f zR-r@>Ms9W1=yamxHEOCl6Bx5Dr1oO2M%j~YHvihwf=~}?TC0}nQ{W6`R-;TmOZA|q zG;eEIxpkP!UOZdRo~;!&2pb79vu1W3`bWBfO)O8po@X6uaticxJ)Ukxk5023RQm=H zwQ5}qj?=AeME|yA<=rG~&Cbnwpp+;R*6A~B4Q)o*ZJ{k_SAqSl!giFb&V=dEHuhc` zgjqt{(YGB%T;{*f8T4~N(GRgV-GROivQ`K5x_1h@LW8Jl7yfqRc}7btr_zHx>Jzil zhm0rr`nEUUI#p+a+So(og}uT)_PasYPrsu^7FLD!;q4EwwYeW_v{X1Kw6n2IuhIj; zA&iIG7j=vd3L7y?yM@DZ_;x;qSjm` z>=Xj5mjt_nQ3H8r>QSvVHliDZM`>o1nzbkD$l61XW%TAT*3M|?apCdM6X*-I5@c{{ z#uz-Ak@JbruvQb=u%f@GFap6@aPEO8k8}Pp*^wn?og%V9* z3P#NroAe1#a$M9HzrVZmO4PT1m#Zs*6tgE>2GQ 
zMusPRkQ#=^FZyD___*)yF1;E%6K3T{K->68!sgR=Y2Q;(U$)f0`|mEj#_E%c(}W;bJ*rARk4X~Hms3z~3Bc9Fyobk(F7givDStn5&k%Mqshkqa1^37%7{8Tlbn z9uofz#zg!J1S1;pLBA&a(eKI)$O8V52hE8a<$9qY&ix*EHw`d&E)H=?P^UlR5SRqD zy12LzcT^j@;wIc_d>3XBGXUMbK2C#j-g%bq+cNh%}j8 zJ}BNq5%Gr<{3QhpFi8F4T8haePaeE$D`tx@P7I_N1US-z}9&uUjJnXK4*+^ zgJ_8Jmke>^@r+eJrmtu!rd%O?&GOh3<9x_?wjDsR32>{gcr7nhR=(AynQ^7FaUorb zBbKJI&X~T6(^*4i4QKEfxFjETi1S)bydU1HiHqs$rI`~;XG}4VR&S^v&VP#w6~1QF z%EV2V4Hy~IGp6(WdG3Po5$-(iw@kP06=J^F83CKdQm6VZ<47Xeb{8MO+?Wj5?YdG4x-3Y;qh>cb}~|AMLT_A)EUOMbV~4rPftKS6Y7>N zik=}nGVP5BvEeS*Rz!w-4pG>(w!3$F8;c_5Q}z#EWcVWOnp3AwQ|GW-Mo=x&F#5Er z)c2%rPOoP=;!VD;F6BMDXe7GmJ-XC^p#3O}zOzy){BCNSu5Y6O7&*fR3WK5ymMENt z8U?z~kl!j7C|j6p3PT&MAPY0=rEJ9(D8{1a!?81Rp{g|#_hEu%7b!5=Fe>vU7wx_< zA}9t9%nXC|*KZbohTP)cQ1G`@u0YMs3sKs>7*bJW{BB6@vZ|b}%Rw+X&zlI*{5dMQ z!~rdol+70W(&bAu_a&OQq&$_ghnFnQmuxTCmJJ5S#(PFnk#*VbC@M|4>gPAzc6AYM z8I*bllfC<;-u=nm6H@Pql)IHsjq_dsS6u$G^NKU&J%NPc5+szhB}>;yrRx&w_b!y~ zo81G^hP!Om@zFg8=dS#aGZ&REl~gYq_+nqmw>nY3W|`xA8<$bIam8rzIMG9I>2i_D zQTi{-Mr8gZT7jOwKHweb=icit+UYUAx6QrNX}sk$BQBRJzy_Ks6BCW-v;oTM7-tqY z4C4UXMEQ$+QHE+JpEN6D4IAQyONIm7Wt52<*lO1C4O`8|m`*`JeZzmt5He=0W+Pk8 z#%GKftJyTYN?XnPD}2_HF4J>mzUbEJicf`CllVAZSoRV$rC2Ec=lpIYw_)x{)QGNm|z2wygPt$zU&U7%=dE$q#U*zbZhCX&=S!Qg;bk!R|mL zLFkB1ZsUf?X(LwSN;5WIb)=2?Mta5^H|AS}GX+T37aM9*ubD!m=h@(AEJ(L#xzfKg zMM&4l^o(>XW?m=JGtzBwTfU8b#vZp{vZdLCkiOAh=HvD;ooUFZ$q_5n?$RhlEQpcd zl3kbi7_qACit{m@AH4P2dy| z65~)`$a0#CyW_=i)Ab_F*7P(#Q;Ih&i?!+b3bb~2adU>pA4DzXF~9z_bQkwh#fafF zH)42v6P{JXR_V_w#z?H4bHyvh2>x>}q^{0M!tHgd)^j}Xz5n^B;fmqKEhe-*Q}K1~ zF#!5ZN}^Tz9>r;&^TJG2avM=HKxKpV@o# zBhOr9^fJ%t``7jz`SY#(mb@$9tzo2$cf4Z)l)cyw$bLW@m3VX96Dq!5qTz{|j(A6i zi<_&snbJAKi;tK%%zZ(V>T`~sV>dgQ?ynY$cWL7huL!QvQeQlbF?mG0gFw!iu|_+! z5?I}}z}47=fU86LTCIoy$0-+j1pJ`bexeOT@$Q@M456Dzqh!#8x{`a&(3i^$T1smlO|B! 
z(f>{L|0e`!`nwdTA(jg#nZ<8ZwkUlSLdvMSFcvV;(khMmSIbs<<>2{nD0W8tfXZ4G zfk;r8jEes!B{-Gd1#N()j@Mdi$}Qf?nC8nlK4+ z0mQe^e#ix_l$&)u1t_x-mTuH-w)6aTWxy7D1#)% zf0A_J^IwV_!j>_l)0Nq5ZpNeIJb2uHF{wN*1W;u)$ zaRvtouxeYAHN8?z@2vd;Ti3PGMO*(;3CT3>Svh;z+~}(#i?+s8m2dW7s=gs%uSwN6 zN%opMRle6dUg?;Bc%iDBp0vC^{mOKrW6RA`3-tpmmzJ_nzZLg)`Zp#l)%VJ{n)*b2 z&qDR;gr(wx%Jnzx?@laKo}6`lP`fHsT6VSS$`h&bid03@y+Tu+HDRf`XXhOCsq(f| zO>L^OI@Pcy)x17c>%ZqHMBj}?rPkS^WiMA+K3nubaoOB-vbb?}V5zg`u9s`*Sn*g( zO74~*cGYyx#Z|6Kb*=gSqi;WopJdlgscUCybzgGzK56wnv{qRKflp;^vcfM__&;N#HWtWJ1a<}Z-d)4D|Sx*MZ6AGkS~XxuU9S}LzgmUl?y9ZTNT zF#AhZbVwB)$%?g7#oAO&SE{BvRkJGPU$b1qRralL=E|xSBl@*sEUGA(GvT$1%U-r$ zv0pv$b9du%E!w;1<5u;4zx(a(n=Ofs9m(b$iRK*(r2|s)j;mjo8@|eaBZP$6J&DqR zI~7&4`915!oTRm=RURCUvQ{C4%)A09~6v?Oaf zrJBwqU+=Hl29}Ln=Z55}ZPKc3@3y~Nb$ivpJO2Kpe~aYblIq&1%*eZE-VI1yM^dY| zC=-)vU6X9xD79|9Yvk8#y3@M>Pwo{LH8lQ{(N>wt-N`T8xaJM_AS0}4ocG_T zd*8cQJv4hLRna{E==@lsV$E0gFCR8=B~8mm4O~%4(oz8p+11KLOVd(ygI)lDpDJyc zKRZ92DD9oK{=K(uc2~;fdFkmFp1#`q!WZW|Z@apGFs*985-t5VP9-WgUMWZ?BwE%b zD%Z~yq$(Pc741?*`?a!#iq*5besBM>gDb7S=jL2Bsq%)Dr|w=A*SPB1rp1PJ_bMHw z&bt*{mB;UtS0~H;Qn^1--j8W)YD+fu-EQo=Q`rDRwxn;hZ#c;DBD_}`T7>A_FJG~hpGbxK`jxVde%A!Hk3 zmPFl*s*U(t*@Ga{Iut?%q8M{7($?n_;a%r7rU|AKhUqU_we5fe3vnl~WH; zCdJMu`YCY|TOG5WP&Q#vF>a;`PDY{HiJlpT%Erei66krTpGli8finEkBqdIai zT@b|TE9kjGSb3FmNqSMHJw}cLl!ImaF=8-n{h3xIN^hb?nb~+va&;}>`0Z>ry5$44V$Hg%|NGa>oN!3jAGOHCh})Z33fbXN}Pyf z8c$s2{roUIN+@P1G(bD2!s8ekoEcE8J22EwW**^tl8q2qmP~}$8K7>fJ~J5Hkna?% zO`s$Mjld2Cl2VQin~50PGn41l0}3!h7okXGrxb*cUI8;}>C~nqeSATop`go9PC59y zeY@4xX)-6L#?w`VHG&qs0L#OG0@O%�RocoIEFnL5)(U(8+N6Kn0y{$e3(i{4W?O zZ31aLvnJ3RJVnCu8>FfYH!Bya24`IIuY zf!QOezQNgDNqehgZ=Js=+1I38bqPzIV(VZWiCxH=L0at3;cvx`K+$VlaaGdHZt+TQX#9CTrxW2Q7et14ds}s2*jJ) z`*9RhHFw!y;L7ZG+j0|q2m&uH4WXth^bWNU@;{yZ_{cJ`*H`; zrDF-D=hL;Hf*_Q~t7$+<>jR*q(M-DhlFOgy*ty^uOjri*#}hQz49Y}AbUkIBvCUrn zz1&(&u5MKHjAVpDGNyzHw@9psuOdOyiH)Jm|Y|_HGOhPwrDjmFv2QVmw5rh`@*z0d-#;pO*Co)=&AySR**q2 zKXK)W`N4(K_F3z%>e_$9@kKRr?I}<7SC5bqxaOm)W28IQ$M^7pYbaqE`Z!vHS}uFn 
z^WQYJ82*TFD)^SM#qh`G_57QBi{U#Z>$8EH2H%LbSC;S^pFQ~bsGfIFiY}-)_)L}W z1%AdLn&SpdIEbmxk+@QV!t{jZ$zPbxr^V9wbK9FS#-MstuDCI#t3WkX6Z&Gz=!hKk;&h!_EIlS#zKcA-{}%pU#NU(n z>r+woX2oL*$OcB(F!2fzGU^dbp9bDOwgS{+cMMrYyNCQmIFJn`{m5CxY*2g?J&=pj zZV*&?jo1&6P{eQjl*7YtEyyHna;mo(n9G9JP>#p5b9dqfl!4TdZJ2C5nHf@&ooN&f z1QnB=={nfGO{)eEqXlS=R-3xs#7(Glx;%Ta=}UMIt+!vGgtp#j_gp1at-3b&{ey2G zT&UWR^lnJlHq7=TNR^a-?O@7Vk@R**-i~W)7rZ?QTMs?zVUOk|#@zFIm$g)le-peaW)ESsT@I z_N}dpwl%4e+68;<9kN_#l$?$8YZshtFlAV7;z~Du1QA44|3{F?RBffOy#Eu0F{&Tz z2n@!(dkJx*a{9vo;y{az6Zx!-`?WS867VqZ=J50CM^&tJntvrS8EbO2#YYWtQ zqNgx&Sv(P+4?Be5EkNUbh?INQM^?=FxTLeY05rvTmS@$cRsP4JWbhk@84`;Ib8!A__k$Zvyh*cXkEc_1?D{ylu; z-=iCc!U{o*4aZ_4TLj`?V=$C0L8bW!gvEUbvgkZ<2LWV#4nQ&^KXN0YuAe6eFo^-X z#4R$Gd$q<^#I-+bG>`HrVtk-5OWt#~VJ z%)F4(nIP)|v*pdd<^2|Ac$pzja_sTc3@2HtTDq&WOVNXlE zt7_&3IqLN4W9mV1;%j`NNsy7GE`o{mIXY*c=4O_hAeG1%8*>92(DdBUPAwj!;6A#h zET|GdIgK{w%uwE3<1|fVYXA<&!nw=mqAx#v<>^H8`bF0U7=9^}k#g^Z$dsl+5t+J~ zxF+f9lw6$)uI_}T`~In+t?4=Zt=vKj%7zSU00S8U=i_!R!Gw0A6O%&InRDeYk#G6k zw~ePMs=P}wz3ITjH@)%Z+Z+vX({buMu@E`eB?6{A+hry&rEQS%*P-hr+ zsrY<0O0Xy~#?8#4q!2Ba=5z`59jN36%Fy&mV zK3F#8s}Y2XZsCL+bvZ?G8#0FqGz1F!BHH0_!TQTE)}X{Q!?;Bo4OlA7LRa0SJ&hZ_ zjyVXHX!ndnz}^vS)ly*K*$nHkyz`M^y;G#Mp#6>*9wGDgqGyVZtG{XUPeyiFrhAWr zY#^U5S#$^S;lQG5TK4Jw?x9a=6bB1Lo@q)p%7@ibi zm(7&9ZyOb@t0T+(kN5!4fWK6H3x(wp#&c&q2j)=R2Z@IXUoZ_Kkuc&PP{0t0ksf@P z%G*fs32h}DQ7BqqMg3Qxx@p=^8Q-LGg%k@@>{SXDD7b*YZ_OZgH1+|G4MfuGfbhve zgklvXYG;p>F%fGitw^Vc6q8n>d>IYI0w!M&2as{PAqT6eYn$6X^=TzUrtQ+2OY-Fsv0Uu^%u_T<_F(%J(H)d!QM z2bT*uN7b^Ot7}ZwZjfp>-1IEeZk-)ka#ddKzTQO$vb-hXZn@(sNxB*&S3|<(Pm!c} zja0rS(R*~E{DFk~fe)Iyu8kzScSzkke$hN2xqVPJY)!SafA5Jmo`6~Rje#4(H;yM7 zH_o~4fCqB>kR#=7N_iVn)&5jNbFyKL)Ichb;)Z*i86svUhcg~E-BL|=vZi0E>8G6D zMk=CZPBr!{)i%!`|K6i-JbI(!=I}!84oJpp*R61cwIy>!%OzZKb<*7+xgnJQxw~_@ zjceJ--|OI-H{R?>H0_=pTm+d1o8Lj6D?7x0d~XlWEP4$^>-63E`pAujcSaXH{Xe(& z|I4x&kA53{95Z&=xVxVJ^159;+>iKOU8aBAir9NiJQCk)u0i}3NAX+y&Q-YiNhjU> zq>HC`Ptoog<4-nJ?k+d}w3tWyrzK`Q{b{+G;x$xn76nHrgl#YHVxpDpM>GS~Fi;x?w2VGQyO{oMEVk+F&+U5H~A^dJ}lY 
z3|`D+;DWaJ^ohKSJdi7`MRqadit~N>Ml*d{FRq(_YNMti*MesX7;?pL4stb5uO3tw zscbJOitIvR^KwzENXF6Ab;oi%sxZPz$FS2Kdw^>p1cAs&#V{lbr@Eqwv;7HbXS;x?rbopP)YZw;wXjTHzOKQuo!UxDi%{v*iPzmj>&g64@nei z^gpAV%FtC5txJ72f~l&#I+CngE!C}Fs9H075FFao(fOyPy589%3^g@LuBL=*RR(H0 zworaN;XZ!H*Oc`2-S+iCCgbp58HJA21I#q%n&)prrD{atM+sjalg`Y2QF68FRiwN- zKoC}xfFLX`f%}TJJNbpilRxOWw)WlQ?*+fT_6I#5FE*YeOi~Y=@zDpaiVq?7S*H_} z+4uueAmEP|@y?g|!8-o=nw{O;JjLEvkJx(^6n(GKPVpAgz(&J+etuwm!FwG>ig#Hl zeLYX{jird+a#H?VE)&H)g*$zwTh%5C>z$O+#h!K-4OSa(t*;y`Gyd4kBmU!JGfMrq z%uMlWBjU1a=Ws}++jo;_Q7%-yk;)cu^~?biSvrxkSWF?zTvW0O@4<02^RzBkK0xqk zN&|5)!$V7EX_TQ$<0~FRO*6m(SevG2OJh23na^}yD+6cX-i)038dhq~tUdE|Tudip zp$EhnzcKwBT)JFZ6PP}g%QzpX3Z4Xf!(Ihkj%=baf!!Fk5zb$voaHn|FTwyzxG(~W z11>^~7~OJ;&LiWLCmh(}{OAZ7@=z2bJ91WJgi9Y9m(7g()ub!bkT^`iDQc3{NzIC0 z+};xiQ?Bxa zt7*wynRGWv?xvr+TRxz8i{x%ubhjXt8Z56Nl~UNJvE=jrfO(;#zZ`g-!KF^GkZM2ewxN`EKoLwrXuH} zjA+zc5mAbE0xr9hY@sq_{+4?Ru3jB%Jc3;jF~-@#`suE0s_u7WmuZ$Zu~~wF42*p? zUD=evWHEJ6s-pf{`@1&i=^brw8s%{QgR|m=v9FD<6q>x&S#nY4u9$V*&#=;54uVioh`Ky7)}c|Ixvdpl~0S+q9t9fuKL%HN9k z+^4jYS6-a2pW2(`n*2k)Z4^&rU&mcQG~1 z*BO&&9{R*~0zzqa&eZqhC&f>ug+0OroK5YP9kBA;LyisEo+3^FxctSCF2j)`D-~3Q zLaE9V)MG#I@k3#ZnM>W$`+3Pz~0RIoPnEx*n zQ3xIgf9qzQcWkbttx2*q&F}r*;WrL1+Im4dq--TGZGBy_b&i$3^6gkghE^^fRd2KCII-bX7S+RcpVNd=^c zO;mbL@uevnaa>bzaDne};FKhzH(&+K)*f@fXB-ZNM2;RB&oMLjpJBih1(GAXF1O~Q z5ytkcX?Xah9?&L3OG8zq)+iPetk>{}t$20RicELpEE;Axkw32qkP3ZVkv6U>1!maSH?Pn za{gK7{Q^1jteO1vYmm+@kOMj{e^0fm4%?O_rRp(6zMf#n4^4_B6^>g?fxv)1ai-#L1wgL_vA;- zFK@oGIqB(?Je}7b`u^i@KQ4JTC+wTSGTK{UZ<(yyF4b*+w{W3uA53~+!sF^lmA59` zt$Fc3{1pIiZ=OH@y_q*=<|h;VdlHp<7o7VNmVNh=b2vC;jo;*7j1N*4fuSFn^_GcUC)C~h827*A{Jtpo&FfGg;tzh~cn4yVb9N1xUZ}KcM30w^R3sMzwW2It4W@?Dj2|n_G z9raC}vohp|U}_>39``*Nd~&nzyui*)G5bJ>Y4AxA`1LpkM<>Vvx40QdoDN0lH84{` z4by}1BccnRGJs1H1=s5tWsBMye6EE|A5pIuqSZpIqH<`;Dn3W?Llj)0-~|kfa_q)@ z>S7GyoUc;CGzDMAb9hIB1OJ%DjJ`?fvwC6Tis0x}X$t=N@#-Y(Q(j#miY_C|v@Z{S z&)P;-(-OOmqRiT~srAj4S;q&ivbpo~)%%$MKxwl8_>uAYbWT>Y9q>-)KP!=2_; z*Y+e%(}@r924?>9&thkjmh_^z32 
zS&vPk!UwssyEb8|QMQ5-Y?Z<2Ibj4-sH(NITn*^*+rP$0e5SpQE;WVp<8$u`&#wFY zdGq0WX#CTQ*o}ph2h(RD5*?$Hm!k|@fr5C!C656IOd(4#fn!j2v|O%grf_0IydY#g zi|1m!mLG@l^Ex|#0!>9tdFt_IlqOHspPDmXnAc$fkOotq`zca=?&OI z#kN5%RGJBif6O|jR7a=Cm}Rhrur!jbJ_z4Pu&96qg>JQhynt~WfDXo6_u`7>#gL5xeKkr+6NVWwCqIqNpu~C;PSwx z!3O@lEs)2W-rL%Q_$?nlSZlh~;H5asd+K4{(~9_yYk3OmDU_{>{GK2nORA;|bpBt| zt)Q3~W>~ps$In=(U~MkJwb_h$qK5WUwh)V%kxw2X)W*@NW#lcOWv$)Cjh7%am?@a( zz_O*bbFO$AkriU`cdNXJW)RDU~e|GWucbL41Q^v{<4<>1eBg?P7969^fXEayr?nu7;kRJ?{R+ zlU;x;jiTQ|OKd$ry4UE>1H`F{#uxUd98C;laWlUw*}P6_UYBUX{|$+XjW6uKQ_+}c z-jry<|ILYtEwlUYumgZNMfbviJ4AV|m+IEvELy1B3Cgp+J6Yc^)%V{BFVycyR_>TR zkRNW%T-AW;wCM6c6GUNt|k*7ROY1>))P$^t{;cWHuFehW~_Bt+x7e>@O|>J*aw zW=tnMW4ewoADNjZAk&25Xo0rJh-2hUGoSpP*{BVrYYg$}nnH$jT_Ho*NSeodgknA? zIU(K0=1f7X9B<67(5LhQJPx!O2(d>ZEPPG>>Oz)-9W2W07VVwjaWHe7sf)}B}mZmv;FP5(lwh)=-cS#vr~i(xN!WVplOLq!>Artnke zC%;xGwaK)90c#`KECy>+_CZtu{Jj%mi1(Id9;~ zdIMkkw9&WChoH+hZSZ&dy-XyguuSBYNnAq#*%XRfDCnTTPXW9B-2 zK*0_QdMIc^fbWUH&BcAMskj56-7IV8BGQd*Rhm->Xy#?>By){{F@h9NkFy%?-^|!< zc;Ar_-Kv#=z*lK=x`n2|kJ-7>imOH6z#;Q{5N4F6?4A$Y%}ef%C3nq|yLQRlv*h+K z(=qLPW~<$bqq^?Ot9=SN1wV}+czyG2OWw7K{@qL7u0+pvd;@87$($9UxZ1YYhh7=F z-gl$$ovn#YM;B@ym@8T;^QFqRGW9z*tcAb+LW(;?|*r{RqA{VYlKzR=xr{_N?^ODdmFuL+b+;=I-E2#4IwWm6^umD;Dt+_i*E(;S7Am*S9)PK~z4WR{Ilq0| z)NCK89Tr07chj4s~z9N z=}qLDAeXDk`=|&@uSR?YO$<#pE2o2)?Mv3-AhjY^H4X^XDkvp`!(q^muT9Y*Lq*7_ z9&&gS{Z=T(@6!rod|ne`xdIQlCV!numG?=ywKvl^EY`k~cahfMQ8cSCOkQRVw@s?q zh838ym!=$*vqLF3DlOt_x5Hwt*7_T+pr~rD73zF%)mINMJ8<_A0I$}1&k1p2qH_C! 
zb4S9mLp^U)-p9XUYB#*bw-)@dvEA^7xsU%Y-)^{O@55IhGodA#1ujh52>Nd27J8oDK{O*ydv+iO04foOXY8|YvKu0R1Q)gPsHQ0B!x)R2##U^T$?Ifjg` zPc47K1l0=I_zwKp75-uR0UdfiFAy?89e_0p2-Oc+5#IpTKbr@qj}7T82gKI8&`ta{ z0n>L8WJ%Za!*t{YCl#Fkf#8VYxLb#1A(jRaf+JX})LmJnB5l4+T_Q>xiXdFvtHZiD)-b`Tj& zl+i>PO|X~ppp9h{%7OVs;wRAtG=I5!pq{_M4^$WYpdPXJ-26ao!F$Cvir4Z8$$PHi zNha$PJ+e3JR(0qkS3|x6H!gXLWU(g%8}zqw0s)jU#%L?tdwZY?X@uAdqnUf9&mBJd zISeb=K^O4{sAT%tLGoKo4y))J(nPMD0}%-7j7f6Uo_69qX$=0LS@*ltL^y*Uj&)_0 zWTz*DShw$(FbbzzbUI!+2FB`waGBm7$y88pvK#Qm&NY)}TT_$#IpX5Kq2MnN6Mln_)t3#W>dA##H)wjz#} zi|}zkxPqdO_S!Q1U5_|WuXYhec@BRop8!7eEl_)!@jK&4Z)adbEDUcsfVWhvHsj1V z`EG;{Qh1z$>6p4%GUGPaj2qrdbsUCT3UG!2;znREt(KB_NnRO<73`RS$HyGBjTEez z0kRm=jAvqNEG?Qfq}g1Z!K;I~l*=awjT`WdC>Y&_`FP1&{MY&WBr=`}SNcpxs0{Dq ziaSCDuNsy7GoHBnzB1mpEyPXH*EM1Do^~k@SyZ9cBRq34)-sQIOvrK;o=9g(wi4P(Tv#J{Du4Tye3I5UY;=pw^ipa5tOXSy-+^~VTKMb@uiA;syxY?{Ca#ix&+ zKvhr;;@jr5Sxipg!#TQf6yx0o3A+y}p4o`Y^9>x`PaN_w^$5I@pl%J(ZYUz&q}MP^ zR~}`@ap1v(cxL+K!La(VpXnCnvyr%djcNe#t(@B-$Bj^Dg|Xi-LlW_Spw}kadV2kR z6$)^F1G-(Sm%26Vqpv(OO(_XIeGphPM|J2IlPUa)#-^B&QK}b6`aE}3@x^Fm&OU-s z_#0&YBK#GKXPYJz5=lnIRFrbT)H(W?yFyEaD48fy(MJJOZ~lb(+(x(LZJ7C0k)6Y# zu?%W~DLxg({7)(O7-f8#0wy-TN-^>eDgF(f`YSU8$c#c`3RHcU5_Q-Re?*lLWGMUX z|4cE)F)(dUxoWHy0@Kfwv_S>N#2+J{?1b$Csegl`Q{?LsI$h=C<@o44jLC+fe^KcQ zjR4ZWWTftjg=z(IiQ=I~am){{1mG$gBhk|$qbvhxEJ}-5lXTzrtBAhc*ljhh9IONP zeb*u9xtC~xx4654N|x?f``pC*89ZOppQ>t0HLg!JZA^jhE~`h`H7HvPzPhL`X=#=$ z&54$cw_CQ~Jho`r!FY8z!b{b5yfC!nD8E{?*tB-h(U)pEFyEJKIv_P2NVzNT8coek zxVKt1;}^7vr~PgrVxVE`oG%PPu29zp7OKvfs%nBaI{d#|h&$Acj75me4x#nRwq!}` zY#~G%b^aTsL~S3wUtMQivU}$CUVVJQzUsF9@QuRc>OpDs;JZ&rs}I9sx2T#dL+X}1 zwO3CswryG5ene_JvgkRwY~oAT;CZnZvWoJKYfTGf-3fR1FJXG;UnuKLmTi{GHovinY8-??A%-L6u7kP@RDaVhqWYUi7gp;;5jQ?ZtG5ADBx!4vY^@1cM6TVGTstJK9ioRll700DwvxH^R}U@P z{MSw^C(aVxhf*DDk{w&5jxDo?ucFGfJFe=h6N%cc8;)yVOt`isEZdfk7%;6LLqz3i z`*?XB-t3d;2CRU~I}Lj`^S4a=-nFJ%=1Pj!@q5?dAjQsp-2Bwb@7-wnY3a^--27}E zPvJ%iMOs7b_#Zo|sB;@e)JZz5=kT}kDguS340AJu860w66+a9ERcaB%^ee7=st3jN 
zxj!JJSjv;QoOEZPQdO!Gr%^5O!co}kuwBnEy$1fXnD`gCUHRk`?G->Lbfbt^lruyF zBo=`!E>6@d=QTZ<0u{~0E}e%|j&7R8LwE;88p;%HKfx{eh+s9VhcTfE{~6`_E(L5v z^;4qhu@!&8aHIMyH*KS%LDtchg05V|98ka2+V2x|Wh>rMM3%8D^21ucB5#308K}&oKj!WT++c9o3%YwY+tpUJ!1KSt^P<@!?yrU&OB9gAuMg zj)mON=4G;YYTa?SMzG>7bw4dH`al96#$PR55SjBGaMy@ue4mGM6z2>gZnYfjZ2?ru zr&QK+W_+w7zh*J66VKF_e23e$I`Lr(UCS6Rxo+0p0Y-Wy?A(Taav6##AJu!Xj%0eEtgFQ`06!z*NVZJG-J}~+g@-* zOm2zkUM^8fu!pK|U`UCW6Fd-48QK%YF*MV^aMbDt)JIPShM`5Lpc$EB|DFP}JY#Pt z7pQK+*_)Etob13Ub$rE1JFU(JCPjq^BZ-ev@C4;k=z7%^Hc`3E_FczL>4<`-$RL_| z*Ad`&*sbZUCNRI~o3bD?IhWT~9B!4W9FHt(Bt;5#}D zuhqb@Z^~UoyM1{ph#GA8j#|j}Ks(sE+NReJymDaviwiZ_ol_gOBsUzAHXLHvYg6ue zB7^Eyr9d({xQ&NFB@}K1l~A}Aesg8@HXEUF~xsjdSR zq-xsWZE6l*22OlQMDF+i|roOh2GPP_O&3{ zd~M14E%0l*z&!dc)pRFz99^t=054qIH9K_2-=#Qe`O!#X=dp!t$0h&q*&~bg);s0R z^IsrRcHiAF&$&DwM+Y&}u#nibiYz2{`e7l_h4`%^)6Oo#EeFnj7u<3hDPBx@Zmr@e z-qnKmk4yN$Qqzw;0~NT*e;|US|5=6*N~JwxWi1C0X|=zy5D2bXQbj39Jt&S zVFGodp!8*H!)V#WU*6;J27S|L;F`&RycB9LrGQRZre`IZhX z+M1R;P4GJH=}*}EKX5hAKe*uPOjtS<{ZxUXc*#;V(B42h*2-5AuyY~!&Al!WM$W&p z=~gFfCxnv`e4Ht-1Im26onmkV(g`~n236h5=%Wc3K!zj1RLw6 z{38Y}?e!uE=F!a(&b-g}VY=>rqLF)=9Rygsl|>$erGG$=<_K z@8S1H7kW=hmY&&y1QdZ?cP#d#rCPF7CoQd#1v$Ze0S(j;C+4XH*@A^!)b>wYq2|T) zB>IpAk2DF-;ctZ)Vx9OvJom~+U}&3IE*U_F)8m&(CJ7sOObt&H| zGqY_*F2Nx)I&#YnU@9M$6o=7U@kt6!A;9c0FsGLl{}Y02&rc_ilGkjR9pnwFen+g3 z*fPgm3huRV&F#sieyOQH*|bk;+PBbjAX$3=&dq;it6lQIMsdNjCShOm0aU!@f4J+IqOs`OAHtvu=$ps}ohSn!8+t+U|Dg%UKsK@)pAd-ZF1BFt zc%RfE=XnK9MIY*7hcrA@w@W%+ee_$8&laX^(7D&8JXJvJuBxP~QF1jVU9C9eKmW{) z^9!!+3CninP=!8ls6`)KLXL!cfM>{GH12Y*d_bQ;D`Oz3EWYZfiube8F}zNwxP{~3 zb;8JVk8l^$Usp9k;HD2eiG`T!Ecytl4zrA%O<$LU-FdAUHAsRE8zPGno4r5Q1MGVf**y7Poj2I z`KGm-`yY!o8Gp!cGRiJS9-@o%>E96uXMZ14nzf}+5w@kA!vtsUtm%D~L}L2NAwzru zg`5iqgVTF!-W)NVvC&o$J_l@4KGq3h-{Lx1M z@}jY}bf2^sTy(%{17D*8S(P?jg}A}`?8&%w3~l6I;{N<&87g|7Ly3;u{B+h!TZ>w6 zz${zyX^rU6E)F%C#>lWP=hEs36*3qxWUdf=$4pr%>WWca1QnjAk7N6^ih!DHW{P5s z+WlCI7EAxa^P37$=ws1kSlyGCwX_2uzjf$s15`fv7&gARBc4{(((pdazWf~81GSSK 
zHsz4|!vbg|9F|S=mFVd@c7PsoI7sEvAJ}G|BcKcF7Qcg5MLJzHqBhUEM7zye{H=T$ zfkw1FzL8X$X%YbnI2EK=88yfU7N3$+-@DS~uj}eqV#bUCvGj-(WAAx8MZIdR+c_8hwehV|3J zgJ_UW=V=O-?k(e!s4TjzTdyEYo6;`St&nb9L2JofE7!XHYSFzSuCyVAkLA?%NVPr5 z+CHhaFV(mTU&4?Y`;v{@q{eN(f%#z>#O$SIkm9#)|CpV2zPx>a=b@}(>=eD9$TU#Q z)`#$i;#iKoX)E3SmfqPzG3dG>XV5Jx$hU|HvujSAq~II^9Q~j#H8GEwvi%sHDU1k; zUxojb3KzkhjQaKN68b6>j)!NAg>0wcgzp|A1462K1RGqoL&8=WM;y=jZHoUprrP>j z>emW2PkX5B)MiSV6R`lZBK{Y8%ye77pguVDIfLkRbdtGfPfI=2bKrFu+9;*2tcjZL z{MSrtFU{c>&=v5RZb)Tc9hWLsEmU^RI^jS8tkG>--CLVTLogt94=fIzTx@y}bYQ~M zny|MbykoCQ+8ZQ$11wqR@uib~sj)xVxJhc<^zP|J_YkDLkcSj-<<&UuGY;2qI766l zwI(dB%V&AcS(BY9dC5t>p)!tQrAG@=*h*#2mhHtHvoI}T`G3ZXu&v?DC4jr zvu}5|a*mo`x*Jl(70Kdeskr%C!R_Mi8*Rzn-BR!FMDO9{0+XZSmvvp&!V7g9=4>SR z@h59HOSPL*)s2bj?kj~icEXDdoEFx>cZ89t)8LyGr=$JXCX(IzZ+Gv% zqFBa=nxT=r`q11=!rcrP7uuwL{NV#UD*NbOBj@!&f#;}La+fJ?3KIo>k%S-x+z@%`0i#6|F8aM4Mtr$qS%0FXKs zCG_E7XLMZ=>R}Wqqe@TVhAh)WGx1;Wd#X@ylqcuHw;}T~BcS*U`52P)ZqEE5P3vD2 z+ysp4P`+FjQhLX9-$2oN{}r=W8`fEl(U{^}BCBv-o;fSWMOw6^k%y@mL>P6c(z>e` zu8t*`g9WDb^GP;e!qy2uPkP!VPkRE&Vpm1dRWG^f=Pe8PG=as>I5zP+s3waircq=g zNk;j^6GLGCuVCA9MhRbRRK3zboWS@^c0@yykWuTIOe^zDaGyVc``z7qg+_q<$X?=p z>4FTVT>UCk($y}x+OJhDxcU;7KK6!?HSIxjUqa>~MZm#>N;u&!Jh5j87oH6%=emc* zDdZ8KqToCQe@+3h8H$a=MT*r>u!n*c1o*fgb_MZWx;IfULcv}Nk`z2f!M~^AKTvR( zf+r|ANdb+r7@)vUaK4{nofNZD>>vf*bURA1H5B_c#rj!5x92EEZsf(S6l|gZ*aaeW zdT^Lx0~G9}fE0b=48_QzQH)YBNonMBM{x`B62+dQ;4%g5v<0)SW&1HR>L5E?cKSj5 zQ;K04IA%txoO%Btr9~)>*^AHwiL}hcc?#+&Af``zo&u7DiHlSsNHHRV#IqDIRYx<$ z{v$nQVpWoyD%ud@KSYu(;p2qxrxzYkz84~rn*9ahlLcLI{RHhTHs7C@@Kf>9kYYp{?kX zn7)WUnQ1%Yhd$6Zcly$o&gchHYm{-VPNwfGWX65$cg`;i3C&awXV30AXD|Bq-+#~9 z%P~(-lsx!w532=ELAk|-(lckVJaP)g+`oZNe~CC?TWbGA`~=9vlK+KAi2wXlD2H>< zQ*p(L*UQekpJB9>9(eH@9<63JJq>osz4b)Dk_hqj3T2f}7}o9w7F=|>kB_2$u?Yt~ z2y4T~_8AGOec}!6gw}9&$apo=U!K43Rg51##$nwE6{m{1@=zr_Y#G}YU6!$s8=L#MiMR)6|x0REn~PDV5SeGA-&~K7w4_blS%-8_-T^@75B-$ zdMK(SLaJ1}sS}nqWKU)EDDm{DxJO3ofs~R6i6>EtT1Hn>r)bn8XS5b^Mk8vh_?cy7 ziHdmCE8n!qguG1Vcrq>rSr{P^lO&f%P=WlT18B8>QDwEV6T2o^6b&0D*U 
zH3?UphTP852<>`xS;jz9r)Ydk?qV83n`YOUqUW`)77^jgM2gO60v^BmhL`(xTYbA5 zMjM_0{XYo78i^~NFpr356w%OQyosiduz{}%`ed^8$0l2XNNEWY2EjnnM_9Up%IHw& zB~-@7|4Y%y7#e*{2?R@fD#zclrr&B(MC}k;M;dxW9@SWs5Xfc+6{n#bJcxp2bTkbn z4JBmP!jKXO9!4IX_;a#Js2Urj$46xof(+19ol%?{LJJwixuss^V-%e$-3%}~r^TMj z5(3#+sua0xcl7QRWpI*RcSh~HGdjZgBxIN?fv}YeVIJIf zQFfN+7sl^jxF56 z7o6qY_bx9KtnP88Q!?h5L&_kGI*$D!`6#+eY~EVkJX{^yRmHZiU6a-6*Q))4jgIS@ z++Po6D*fYDXk1B(wkpdgwq+$MxRp9QrVLK-ieyJ=?(R+HukSg`{t#Wc9J>{Z+-xKZ>5s~&$Pkg>4&#M55vvOGPU z&|`Trbn|o{_NsZ^%`TQKI8$=G9!)5T5Pw0kPw9kVDC{YBS?xJxP}ILM90n-bCXzj+ ze0jQb9c!3qSz@~iQaV;&<0?d07}cl*(O1j)h3WG3`?*SR#4<*k5h7wV7>55+EYu#< z+jVMhHTj8!H^+cs(xVd1j7Qg7a@{E-8L8AZyJAEjjq5sY5T#Y8LRz>!kbRx(Z=V1qUfw`%>ZFT z7&7ajq>?mrqLffEQan>kT1L8rZ%=Pi=OIMr^>B>7i$kab4KajaYazb7gwgQ*6vOvWD7;?G}-yu@41Z+ynW!s>f--0 zZM|58d2zajt)beTPTWZ4IJJ>8UEz0U3im*v2Dp~&rfVCl8Xr-iw|@55C9Guqgl1}_ zB-P|OTnbr061Ax2$8s%}zdqyxDC8{Lj@g&&<*9wJirAY#VwlHWy83(py~UX~lbGuL zl9YsZs5DwM@gypGx9$1w%jD+U{;ChFC_S`YwtHx~d(`S4{YUpWjspg}mV=wD;HE#s z7Owri>v3>bb<0Sx?eq45h7>(Ja7ZgPh~D7usJPO$e(^`sMa^JC(>cN(ph)vPwJO+h z)kOQvbEiK%V^8^AMCX|^&v|J^443s+F1NgM=j>g}c@vXPXc*@Ug{$_hn{M2hh1FPF zJz%|rEaYZ@0IfBcKPCbF+-b~Yam_H3u*BO)64oup0Wrhim)vT5TrwLEx zg|j(YbAk=)lQXnqI&ab}8_sFCKwPwpd$97<2s1|Me%gyEFBqhbKkbC8}(#GiVF|G5)cTdzGv)p(sax5}C?)Vbg@zRP8X?v>BG{SH4RGkOd?b(+^IDJv zUfd;TexF3%CJ{fA>@8|}lLi;_Bmq+6YY`@X&5H@?;?dIh*iKe$;yKEnhH3LWfeXZIzk4?Ci$TpUjn90vns{v#?>W7@wkrP#W$2{gYXZL^@LK}E1E{5^zj8mw3H0W()2$bK zkr)Epv2nbWy^pe1&))|=8e@M{}7TD4@#c!yw(js~j&%XhAn+_KvCz%#GVva3|vE$a>^vPKQw)GYA^s<0{!t%`#$c-O)E9c@MQ ztvjUVz!?UJ`-L`+M*Yi(7T|c9%=ZbjNr4WXBq*w%;0o}7BAkw_HNAq3yMrv-D%3GqwY>oMrq0@Lm4q^dojhe zKn8ugD!Y$ay~ipdQat%s9ILxhRdC0dnnxK6r1UoWHlfvTd)`v3JRDbZ>XlPlB#9aUGL>hj$_V!2{#22#}+_L5eH_FG*CDk8n+c4&vR(rjhR zgw;7owLhexsaj%LWFLv_6ERQ~{$-JVB+^erXH}$^#o!|`_(TjtStYLl9K&X8l@6H3rj`6&q4cMCvk9oVR3pO(Mc?;C}(sFibH3 literal 0 HcmV?d00001 diff --git a/entrypoints/openai/__pycache__/serving_models.cpython-312.pyc b/entrypoints/openai/__pycache__/serving_models.cpython-312.pyc new file mode 
100644 index 0000000000000000000000000000000000000000..ed11729ec0336becbd1ef5cc0aeb6bd453c2148e GIT binary patch literal 13786 zcmb_DTW}QDmAB{p)QmJ5X(UFY)(bIu@{oij1G0n=CV;>K+wmmc$)Ik*$fFshy9G!d zr3jD>u5~K-BZ0gs)b3VFE<3fc2$1V7TKBVm20)nN`AHmgcor3W6!zW zJu?zRaJKfA=JvU-b8p{1?|b!Yx7$G=B}{)7ztBO*zhOa7b`4l-V+grIIKnXrGR;I7 z24O?OFl~$&DQrxbrp*yEg-r>|v^8R-usLCywnyv~wj>&|wBkt+4NZGU} z;+ghFywl~8a@xn5sF?Oed=$1NDyRJsKZWgys_E)T^>iQ-pk+s*X1X>~%Mb(Mef*~P zV11Y(LEda2`w8c~NI2Jf2DSD*^+6(aTB{pc%bsWr(UBf#^XhGpdaj(ePQ9QZD0)m9 zxC*|3^TDr@w{m{oHEH9jE}A2abJgL%@33c>QA~$Y6K}xN$?=o1*@VQ!CnPA^xtJ82 zNW?@D3XT_#A3u6bib=Cra_vi=j3@cMi8!B>!dAt;PY_bVf#hULaZ_-d7iUsQkyok? zrDEKn)cDX47n_lIVVwWwEH6rm{cwuo6MJI`l!kVQ~{h zTStsqVXYB>D}*Nzh9eQf%Y--bCXU$!wG=WSG-nK(6!&mUNXPmdfhM72%wq0hW48I9%7i_^%*d3t?kSQj*QHtjTY^AzPx?f?75WuC>U{;G6 zu2%?R)`VFdW^}dn2sP*#0wNC3UyHt5 z7QH6NM2wt)Gyas}Oz=Wx-ppHYU7Q6VD}@aBvXhoeWRl^m7t10x&UTSR?3|r<082V9 znj%iFigWTVj`oCl_oRt)0b`Z{?14UR=;O5$=vBs-a~|HE?^(gSRqUJ>`um{062|DY zCaxSt`spaBRYw8t3aCX~v~RxF2es888r*=v#aB?gCe9DFflK5p6R9~y!qtlJ2vE<^ zfnzH1z!wABng8o!r(y!nvGJrR#gY>|E2Y>|sRYNO)L>&LQnL~(o#I(!U+9M4lci+0 zMGUjCB*(^7fgpA{94w?4p)SFTbc_Ild2xW&ce355w{@!oTU3E)f%k=2RKru$HcMs3 zldWSYx#$=`2UI62Zl21q=mZsHinoaBkPby*Gkn(xT@s-9w!ja|YgHh&f52QJ5){<* zmN`wnXC_1!s3z~)vUJ{o-nFHFTto1 zJ*JQ(HY6C8{|0j;Y#0;pqlYUMvp72g+6RO_)m2fU73zg8kSRW38!0}aQfZzZDRl~} zjuZt%s(5H!oQqDVN=hlyI;pBn6mY>xHK>9p%r=t}B%X^3yw<8%Vl%+KoZ`X`dbNOC zFCZ-{73;9UawgtT>@z}Yf)~Y`h89%AGr(42x(~M215CXU_>#a<@ZyK3GEHbm^P84SI{e58qg0Jiy?F#8V;SB!sZHj@Qo zLDZ#jNjsrLktQA+4(J9nNjkvV23dAskI#xqRVG(UvqsK~|= z30CDl7L5>8OsEgK9F!6v8;`|BP)6}dP*rp6WGtRQ{f0_s3~Wm~_0XIZI0i_x^_$2 z0!-p;lATRU&i1UUB3l{ARCe60?6@?tvT0kkHuUrGkHU-6U(LNc_rPcfmR&joLmFB# zY@f{bWdqIExlG?zZ*3Q3 zKVWngeI11O0t1WzwHwo0hBrVL{E)fAfbR4IsS+g9cNs8LD6xc%ilJ+(Vy048_au9hb6sCY z3}~nWg{^ohmMe~YP?_l;`?P-yz-7;oC7bGvXcnAC?(04;&QCw@Q2;DGn;bX;4YE9xVJ>351S- z)DkwU)`6oS^cAOOxI?^66h;sqpaAE20xLwcEl~z&c7k7o4T|PDT#FXXhf6VN$*ykz zKJg)Bs=ZK2>YFm59iN7Fta)tJj(cUKzUQ*zft$Fivc9@2gYOJxLyeixRynja>#xuF 
z+hl*+O0a!7*txW+>%Z-0Fu7eYB*(i@MgzQn&603$}0B zO1>SaIn@E&30^Hjj%j_*fUrJme2ttzhi##WY$KG`us5Yi>@6~9K(N0CqZqK3)!x*# zbBz5CZTYZjbD$Rl(hWl2Q4r4fc@)oZjMI!H>Ql{o$5MI>lAVOm3a7ZoQqm|ww8Ohb zVJ{A}f$tTS=4N=siLTSE2;mQopYJ>bUKa#5ll&Rght%Sb1FHGd+9I}guxJ&r2!2t0 zY<2=eDyp@XpSX42-wFjpB&r=IXA_CJu5hJlJE23Nu+fQA{KOj&o7oWa)SW*1hC&zi zHG%>YFkB|Qgjo}2NRNsQW}(3*IIKB4!O!5af`D$m;v60tiH`4k>E(ULj?=KZVjnwl zJUV*h<*^aPKOyj7-$tneit!_QtfQ$U_3TA4#o^!w++FlZ3xZA6R33Gp`U2pRsUO2W zn~cLb5Plpq1{DU*Nz{(G(ZVUH6H!V~rKDzRbZ8JNOFOMkY|eKONg9YkcUt|yOl6B) z*>cGa?kKpdzBbv{w%DKX^)CB*v&~yF%?IS>1D~`mH;-pRtvYpz8()Jt0~HU5p}gYZw){lD&ou(l zFLF&d73`I|UH>C~0$<@gJ6g&7IWuCm{Dl#qU-jxx8Xx0Gvc&-Xfh#eYlr#|ci?~z2bq1lEw_mU;M-OP>TlZ_gq;lZ zyzRE{8!+Fl8nW)|HQ%XYpl~N-f=+iDaqyikX5Ti;oo*a_r`L?F14eA!jVSI6T1LyQ zADNj^o9iQs6=Am(>eo4&c~$-Of*(;@Rb#ZQgtM8Z*75&~sy3#LpsK%=DuK0>i0!2f zyNQ(dQ%WmnP_Wutfci3|4WPb^Us`=7K%~D6zrFBt{*j{HG&Wy9&QGUKgOH>y@{^KQ zh@)JnZe<0n-+EzcDH5iQnlLSKNR$_+XmAVP#7uxpi?*qfje4{q)NY8a`AXoMZ!Kjp zm+1IQ#P?IkzPzY9C^sMcWdCyW@l5FWQ$&@El>0N@M%mkVt$)eesR@pyK>r8*cU}9k z{ti`Sj4k<&JS@*MGDwW$n#5QF`BB8mxEJTy;SONsVGpn}K)>)0R|6{#mjf#U1gs2| zz{ls`8IX!s-zeghgf ziKje(MwKvw8E&DF#0>f7T`kCi4$itD4@L+|$0wg<-uwfiIuDu39ya2w&Un~QJ?x@2 zv!(yjE&VILE!n`KY)$)0aC0WOT@G%4U^P{i<%kKCV;OPzGR~mv3|=#3*zGd>Ik)48 z+K$CnGo5?n&OJ*VgE_*yz|=xsQ@dJ>$8JVE=z^9JZ{=fpTy=ivxuJol5auc@^A8aw z5THgjz&W9!6rm+&Sml+vD%257#bm7SnhU2r(H#JdC@8gxo4L1aP-Q7k~V zj>OI)y1r6E_t`>!8IjFGwvosvaUFj)agAh}`(*fY_B}ydOM@>h1;(>ATPS^Qm4jQ0 z=+jk1R~b0BLMeF&l8Swbc|vi4 z{|3h*;GJu#A3U-Rm}x9MPsFfFEtMi?dT>~RTuT}pd4GPv2;ME;TPP6q-!bo&K5QB`RuP|B59MbO={9{K!kz;jbsCjWG{+kY}VTg z&-Werd|!q-ie_-q8C)=fyJ&_j1F&}Vb3jp%$`G}NTYg;M{vx7^;=e0$BQboE|&2fSj{V%(aXH~^mqnS6wfG& zao|@BNfJLj0~e`MVA;|e9UFuxI}4}ACyRpF`V9~LS`|9aL&C|TQzNX&+YJNk7+4S8 zU?3!=_+*?H)}2M^SF0R22GbL#Mp%h9xqkcv2j*0QQ*#7e@{NIwJSrTy4^OqnwU`3C zrDtnT7?0eO`L`9|v?+k{Gs93|e1$)lATo3VQ->q$N{H^-5X+&6sck zXkhgCd>5<)Z;Qg!G_a+##TPpqG?Jt3c#AkcJPOBp&>KDuc2<=f27B~x1-IVj*cJ%+ zwBYH2Fo$0X&S@pRUFe#pcOdmwV_WB&TiG*Uc&U4c3x-QSaHkILfueF$*VH9&uwbe{ 
z=iTepH}7G&6r5dPcm6Eiaix0HFJKsnT4w^@*c1+9hT$RsBlSWzWQt3p92z5{H(}L_ zJPGWP_l1N3EMlNhz3rM0B`MZ3F(C!VIf!`UE8UZ>7HS=sK9bfzczs13K^TD60C*nP;5V{{xujj~~u59(DOm$eU z4lhF1vp(K~XBXXas5=wtl|#MYYn6=pWP=Tt4nrUY?(zh??*_Xs+p;a4nU(>$W#EIs zPYx`%yppMZ<+5wlS1tXXs}?Ra%5@YU72t`)XTy_xaAX$e$3JgfzeVAz8{VMw4e z8){!Xu=xD-)KYLXTh~Fm_sDDyOa|Sfg;>*L@E0G$5efS$^9Zcvnit{G0&95upoav* z_leElfQv8tQCZHw)b*@BvHWbX4%Sdp`#B75_Mu2`?w?An7TNgk#jm)Uaa?@l+*gjMZ^|x4N)MLBVU_p3`9rNv>QNQU!FEd(h z`>=cv;M-mU)ZeZ^6t_dnsLyh{9#PzGZiM$z1EK%xRQ^(W~lktZ$vnNeLk+GdC;=I z%ldI6v%kglag!C{HY>CW2jR8DwHq!%E7oX~OHD+h0!F0ZN)WO=o@}UhpxY7JPg_o; zQVBIGOyk4!0yW;-pofOSHxNSepWZ$aFwiQLVK#>uTAKoz*TOl-6t{MNZ-P%GqEWc5 zf?`1gb|WvST-pMlfEj9c(E}}yE|Pl&i`kZ|CB6OkowerOtJO_8GdzK7Hq~EUkZU`0 zRxCP5^`@(P!J7+|o(2AuTTWPMOXyA!Wm9o8V%?yQHhYwp|6g&$pP{_6IF;^_^%@KH9KX{{4 z?m6-hVfE5E-6kr{_u__~#OxGiFGBXj$)FJ`*G3^LzPMCy^U`?V*ih>B3S2F~v5?>_ z4Zjj3xY1%9gX0pMSTWJ_4HF)0z{ebhDriBsL$G19nVvmUjicQ>(u6T}%`giCRTA};Qy+XSvwKY5J$AS32t=^< zBc2KzrMAvWarl1=?FKKs*Q>HIRjdVWrSy=*v6MUM_SRt$4O#VKm2H=LS#ghW1?uo) zihl!H&d4y#=QfC6d`6l+BMqOCP5(hU9~x|o;UR(S9%i3c5T^Mv68>j0^cm^+7klO9 zuP)m|=Pba12IoVAfq~<7$kvQd%+aEQHW-=K9Dz)00O+CXTSlhh9>F{}Y9s#5S^X&C kp4G}&vTkqA2+!5B@|+o-FhfN!hb=i9vG^X?0irMUzhOv6fdBvi literal 0 HcmV?d00001 diff --git a/entrypoints/openai/__pycache__/serving_pooling.cpython-312.pyc b/entrypoints/openai/__pycache__/serving_pooling.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8cdb1b3506198aa0a8d978924126f3c4e9911c2d GIT binary patch literal 13003 zcmch7dvF`cd1nvaZ-96ZPm&rPuza;l2!ZgZ8YIOR)a%U5wQBDI*ElghfvUe(=ySd)9} zU8hp{x(5xA6xT_mayJCKzyA9D_4j?#{pKKcf+@hXGR z+lWOhlSe5vW>O4EYf>7BHF<48m(msVDLsSpwRuCqm@*bjDHBOEd2`APd|lpBu%@hp zr_b98_LQB_hPL_%kI!Sp;zN-*TMG0-qcNb!*7@=+Xc%diNLuh-xx6qgBBeWym zUl>RY5Zakf6b4g+gm&eJ3VTv}3d5;kl5Wc<3nQr!Lc8;$g|XCFVQ*@0VP9%rVLUZn zm`F`9NQ1ZmZt^b34L_+V4SEi-p0^O|<)U}B4etJ{Ov3knaAGE*Q5@64)nfKJuE_Bj zvBWE;S?;TgTrtZ*p8iy>n8_=;Y(@|hjAA^Cvjp)SMzLiCf#bz=k-NByE;wuYcu2h5~_C{-`OBXLf8y3ZWlIKhO 
zSx#6e6$MUlzwpW#zLezz0W(R*if0?IQP6v)RLbXy7mr^RIiXd(!Nmii)dQ4Qf`M^( z95lz#$f0RxGu`OKvx9LWBW4uyc_DL=d!aa2!t;4$fh$hGFw60mVKBh(Y-7A!V!6Bk zY};9`$Z|XgWiE${1=E)^`5X&!N{jqrF`L2TN(;Fq4l=E|VoprwRS_`?ooZv4uPllS ziy|zPwK2ML8Q~4ZKD)TEP~t_7#Yt$ySK#=ITsngnI-L{Mg%%Qe#d5N6fn!gcySl(B zj+4b~iG`7?%LI8vHkT#t^Z|ySUQh*{zJMjI_;zqhWHeb-%hb@*NNl_h#C0%c z4q*cZ<7MDa^F0l#<+QAh)3JI^&ly<5oF)h*StDm;O-n|g%$%t~SvWImtH|yp+oR{?Ef%d(e zkM%*^&b70C&d&z80E_FBnsY8T_?9jeg!V(g4bSP=5Tql^=n9kSm_><*;@!5;jiopR z%MykK6~r{JYHvCVs+blVMxi)iRnr_+`LrNrvaoI(v7vO97?ENq5RFls0$t9u2aL#3>tSRftrhCj?otnd-vbK!U8d|cfhZJoUl?{+;l!V$wNKqG$uA+oy zCZXdKP?aCV!5$po;Ux@;9;VKLOG>AS8Kn6koH>kxBo6RYz;WmVEG+#$AmY9q;ot315e&{D^4%8{fq8iOLjNeF&<4Q&RH z*>S@tnF4aCU-ks$a6*m@)y>Ep+(de_rS3q{k!9VwDI#|a;y5Jldj-b{dFa%#?xAU5 z0~$h#v7sJ{mCw9Mm9M$TxCoy2Z^Qq_k8B9NgcvZ)GMaC^HE;p?lo|Ri`E7U$e+`!O zcQkKnzoplsingr%hNi5Y#WkA?PwOEx-(2Dr?{0&v=vWQZ0)yQ7lr^-!@1pN(DtggO zONdrV&6Ax2O{p<$nbxitL^~}}W<&?2?>0tSF`DtX+bF*kw`N>1K?!#&hf?!iO0{BM z%Q6*nF$Oi-TU<)bV^6c=p(Ad}Rojp&7}iqjW1STbt6emW}*%DAm)-rPMX6h^>-5tbz9XO=kDf z^VsfpJX==4_BpkNc2t36*;dxyGtv=0B~|+teXxtzH;{*x=Of$lv2!7hINGGJF_ULb zOKYt#yryiw$W-jbzuQuu5yxp+T9W^FFq0|DZN+6~et#=PDc~NYbSs7y4!2U2x@V%R zzs(|#(fn2nZrcnkKhxq+YWGoGVFVjEcC_mV(fn~*yA`u$7WZ(X#i7(a%UzR7?_8NL zgHNeA;Ll$C@-`WsqIH`xoG4r7U!dGp3_ZTsN>K{N{W7InaZ9)EHMM!?h+riCEjnxy zU!nC|wXt_<>Gnlbae+TE@&};TU2NhTu5B}W${Puq7t!Zh$d_pEt(f|bN?Xwdv;Rsf zhf=K)kpCN$+lsH9WgRfq-)wOxbs2##b5z{Lms#hpDdB_?hgsLJDdB<=Ly*6lKf-=$ zm%hJ?Soim}RIc!5*>&s{ciFvr{mTrGJ@9TlddhBS-Mf1|(yzYNFRW$Te3nY26_b8< zc_UB_Wsu0x(q%hs?*#hFTL^s_-aHjg#aq@xZ@#hzG)PbChW4mhvUs)GN>d9eA(pwwltW9f zFRFw=!vcHJKDwvUsqg-Vjzr3la=7fl&iP&RGQV`j@oK{7p)=Vpu8f=V<~PqVvS-4))U2$^RGY`-j>$nM8Q$4=nWH zHp_9*OQN_S92^3}GFvK)u%)apl3B9bloJTQkSpY}0x6~*?xF-o!R_JvWV5lEB|U~r zh7}8&D{w^tfE{6}w?#*x!R#UrV-oXMqs5XKZJza&_68UPs0$Bp6#xgNH=v27AIy}Z zTTrm*LVgjDw`i^iAVNORvB~I}JeLu;D4A_kERk`73>IMoA_148nIcOw`WN_|c9M{j|g>`x{1t+T%3%djjAlE9Q0AZzr|j?s-= zagj^UVXQJ;0K5t?ImL~OlLZ8(DmoZhTSJzBxu}$&7@z?F;+7)ki%sE!>}m-HC6i!SwI?bxOhd!jk<=?cR{;u$_u5ENU!#AoP{KcVGc0s~g; 
zBA>}}=}TM&U`IhQ!HSn)(XT>18^M^;DssmkfzsKnC;;ru;Uxt)0e&@8oZu_4ldmvv z@-{Y>BjD}J|Hi0EwSDGKO80{+SMTW7{d@kT^r@hOtZzD8 z%#ufiqoZt2fJH8HS%5?n&nY$l;<)S^X)<>LIw=-a4#iA?Q(Wl_U_Mlc=^U$Q0q0S4 zBABpZ$-%ONrGXRS@jhPhfVEt`z=39p=?t4$0CW|WL2XXeEX5owiu0Mhik)6GG}tOEs+hSeS&rN`DB&i=4wm0=A~1c`Xxc>Zbq!>@!O~&d zBxs{zfC~<6zGkin9u)3KIK_$qd00kb^KP6Y?KMJ8O7%+SmfoAF%JYYC2QIo;EL96f0~x12n@KAS*kWTLOJH#h zI`zf^+GrXtq{8Hy;JVs`;-tdMvG_)$z@NrFo0}~vE|B43UZlzb7h?uh&WaWEpfNB( zaWq|A?kmMU$^9?Usp7j&MjJfEY-si2_Bzu0+5u$m8K}j_rTBO)eq4$l|MApX{H3ZZ`lNrT*1uos z-(Ty0QR;tD4iCL+scX@|OUwq+47fi*nwWdVv1veo*zM$x2G;$D{&Y$9#(p?^ulGm6 zYR{pMwVG!bcin*cC#t;@w`M+U3$H%6-aED4wqM55I4Z@C%7bII!DG_kF*z|>OPrJv zC*{%c+UOZ+^i17|;>R|SF794&)-A{zuX>Vlc(CdngcL|TG$MsYWN%pZc0gG3_DJ3y zIhLr!rli=^gV@XOj@J&oEFF4T9-F9*ot4JU!bpZjW%%_cYyHzw|8(7l!lN6=8iu)8 zeeRFUx;D4$^}+80Z)`J!+z~w9BkOHP<^GXc{}HMGh}<{)9}B-%AS*sDg~#u|wiZ58 z*Q0@>8%T>~qK&z4IW{fG-(8I#`)|4*zxEURyI-$%zp~bTrs_IVx1r~m7nx;CU31Jl z`GK>KEbxHjAE^4r|1<>iSnn9w)M^f@3k|CcOKw0Qt?s>2_ug9fjMP1|)_vxd<4N~G zt^1(VeQ>S&@QMS@Vmf}d|fjt|{B{RFG=gcP2TLzDN#+SE(Z)JxUL)9+1J12dpd5l|NmsEY|j z`+rdWPI<-l#2LN~Ix{7~&pFl7)%Nh}>D8lk#0>V!SceWt{fDZ3hu@v9YZ>3M4XqaI zv9`@!w`lDLWmjj-)hoGrZ|CoysEwbL#!s%hPJxQ|ChrvA?>Qp5qASeG5xIBd_UZe~ z{qFmoYHVV~`2=fw6f$LRT=sV3o1^YwxH)R;hMOa3+KmG7s(ZPCRt&`Dh>NJ-T@qMaJ$AR>McCrojKS#B=Y0Q(4Z0?#ZBH1Fzj50>#rx6^8<#zo}AV| zqPqV8lg7?|bx^4(Kr5$-vm&k4|^P;A94iLDu%X z?$gTsgie?+Jr;c3rTxId0R4d#H}Qc>k7*ws5dXWdl^~lt9)m*aUJpY|{B;~)SSj1M z*#QzEj49z$;|4B$ow=^LuD!0iuD@=$ZhRXpYnKhn`eoxXv#eXz%o$ncEmOrrxRhsL zHNZ39(6ib%24EAT2TF%2-l#!k6UEhL5%Bd*zPZI0o9EXXO+=r;pBWftvM{9PgD;Ui z@W1hYA^0{74lZ#pRLN{eKA8hfuzf}001pQ-mb;?p;IpI*B<Mi{fMLMiTu|CtTW&f8ZH~Rd&q66hNA~Z3glqjF4v3x-?Ig3pDynL6^PD2! 
zlY3*+eNBDTy|$D;axZhC(B{KHqWiCpZ`Y8cd1_&y8+5Da-_ zZ26=d?XKE7K8y|BdFB1s;aiRs?aJ$q{DX3+<54gv2fF2fDY<7vjt_5|^`Z7ny(!>c z(ZQRyXSfzWBE^s39jeRsw(XW}_1wBECcCwLq2-bIcVKX!wYm!>_A166L_lUhk9B+;GI$pRif&#g6%{?ZSXdAe7XClujQ@lK zqC@`$Q!Pc>O^2`p!8){6e{7AFCh*D9%l}-8Lz+8;H3lm@Xw0MT2{}3X2&lO1fM*^w3!_rAqeVy$QE9`_`uqGr|aG4A0$qE;OSfQ3{-6c zvORL!_`u%(fv4-=AFbNDKePo_yB^vSs(ND=NFVGQvHEVLAlRzh;HS5iYm|qJ6%3l~ zcz}z|WXIDo3{YVB=1mKrBS3cm?_3lw5#WPA4pl%&@kJ1S0S6ckQMsBH3bPeW4*fbYZ%8xAhw0Kd(!X_oTj z134bM6CUpyh|}lsON*E&53e}H|2_`h!T|=o_%Gt18wXJw#BhMsilB+AxAd z!$?*-onUzUNsa)sN6!#f&-dXhT9vem9JDY&<8B}^L%_TAA*fjxCQNjp#=yn61|Qr$_CF4TXDskaf@ zjrs=b9zuFisIyLbs<$JH<6}SJPNM#ykBq0Z`oNR6uDTu*(0_Z!s(9-+>VyyH8cVPm zPDAq6fgcDGpPcEN!V71CFqDKA)vl$R`XlUB!R6LMYyL@)zJS` z2^Bjg>Z(Ma$}N#^JzL!n6~()WEozIQa#bCPX5*cVy7R#s8>R2_+N9r)sx#sJ|ER)b zt0kz`7JRDZCaj7*iqig6zAA_i&a{%u#^l48VMZ$s1_n(46Wb&m4y z0}tD?a2WzPJTuIO2{F$9fvoWRIU4^t>iipIe~iM9(cu3=!;jH(kI~pC8Y`m#c&ib7 z96-!8^BA3ajP^c8haaQ(V>JA8H1-(n|3qV8G#?`fK5`+8Z{>??=FV$|O~bs7v8*2X K7~z+b=bRo|Ha literal 0 HcmV?d00001 diff --git a/entrypoints/openai/__pycache__/serving_responses.cpython-312.pyc b/entrypoints/openai/__pycache__/serving_responses.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8774dd9b60c0ffa7cfae99d34a4d908f157c7185 GIT binary patch literal 61788 zcmce<34B}Ec`tY`R)PQskN^pQ-~#RhS8)?XYNe=sx24!I9G9jENFqg$$^~RuH0an) z`a&maOusIco1|7!&qS`(iMmYVrhPA#lyp2z-Yc$12Um#R8)e2bZC~G;A06pEFU|aB z{@=L^KQPXWI1D$+?*E_{^)V}+JAYu{xL3I- z7v-n9i2Tb(c=m3H7}&2dV#Kdu+7vfO%sizrPFv#Eh&5gmDT>=7ws>)*n1!3B?QuuM z!TjcFXWSWaGQVZIB<_m1nBO|>j+aJCnV+97iah-kJQH-A`S7zNMpPy(iCrw zG{;*aE%DY!YdjnY$J-)p@%Bi2yd%;P?~HWDyCPlj?nrlhU1VLnC(^^}EuQX;_eJ{R z{gHn5Zl4~Aua68ezhinsd?+%^{Lbl(@lBCU%wICSIld*bh5236TjSdz+u|dU5%%t$ z-X0%~jK+6FcEoo^cE)!_cCql%>D}=?kv;Lfk-hPKk$v&~k^S)lkpuCAk%RF=kwfvr zk;CyLkt6YkA{aCDP1*Fr@uQKW%iACHX3pNc%i@_VNz;?YQy`OBwc@#B%>%wI8m zB7QP*GCmoZj88?TSeS46RD3!z&HVo9czh-@%lv`q)A7$oKF|D>(*pV|!Tdq^laVCz 
zS52RZKOK3R`KzbT#-E8i6F(O@$KGqE=i<*so@M^p=~R3^GS71cF82A@b8qqTtc-lY z!0qOup)YaKx>)orqZ0c3IX+zf_w;fo%xBHf*yqp0!lrD=NaEbg#O~NkOc+nj3Rx$6 z-G=6#-HALm4ZTBg3@5~Wl_Jj~iB;>Rm_f4xbeRAf6oVsz1)W>FLr|IjC zj0@xOY;`UbK7J;N(Bw!osy44mCwhnWoh@qYt}`sPfg32hsiG zCt`)Q9E&}Z%xXy=j*TZ~XVCSm@>;b7Etpd0bw`bU0<9ZL)OdO7W-HY9f^xL_S-6(H zK~1jpkvyAMO+0OStV&ck<@Ae^J6@YiC!3s(t;X` z(TWNivepB$Cr%5qPi0FL&#~FlijUUm=q#I*@r*mxX0u{vG2I2)b-R7nbFCX!<(d5w;wWx*I+69B9v2d^v01igw4CK~s1S>f zPtt@&1JGk?Vr?kd7i7zY*i00aP{Ss)j>phA6#$E&JP{MZC0PfX{pwnB?i7SsL7k=9 zvc1$F$Jj^ih$Y7-(K?U3a90QVlw#GK2az~Bo*d6D5M}sP@a0ugj$)WJn2Ta$m|r;C z?_IbB;PBpsO9^tY!>yK_P+FQT%?pt$rOuIkpJ9NdQkz3+I5Nu#s3#Afj?IkhJsK09 z#%LhGvnt>qwy5bu*qCj~O}7b5b#*+)&Q2yzvH>5P0Bp{7AQ>ikQVtT7dJ=fe8$dSG zur#t|SONm(nVdR#?sSaST`>V=Y7j;it56;y>{Rj7Nxa)JZIWZt^1$;#7&|;dOOB4y zx>1%kmEz8=RC%ZY4dhh#PW0+{Jdvg4g#~m*h(+Zz2|0qQ9ho`DYMz{9&9DpOGf~X# zGiN5F6Ecn=yS2*8okJ#$`!dku7)K~E&|yAGe@6V{m4Bvh*rMi`F=~mKqSlx>YMirj zKxktY#b=9Iqs6hJs2w2={5rLml9)|N<3hL_f2H^CsrCOLy3j}J}c&-_Qf3J9=Aj* zzhsVhS^8iMwX(KX#mW&=jnXPs$DrmKg!))$ZOk7H#R5oKhpfv47M8C#+JW+0e}#PEsYA*P$oo;s>O@&>tOvSc?OF>v zloocQg`N27F4k^2)D&HZ&~B}Vay?CLT*q=+qCF_Lhn3rl8PFH&jaeuL<@Djp`%yyw z>d{~|^eMRq5I;ci#|_c-@UK_=gYXZsnm1q+hVptpZ%$wghf({6=;|~Z(GImQm9fg7 zNo)ocMK_^_A(j&}R2flIbTdMS)lkfvEqLFE7CX?st?+MR-=fVGxgPetdNXEQY;&ya z1RpC?=EkZSecT%zL5W-bNlV<061Qq2_G#Ox_Evu1mPJR=x7!xDXZXm-Q7*hQTd@in zt8hE9D~n@|EnA7B#2H}*;FIm&*nydEY%-e2y0Ih4IOEu9+K7d0X>5j$He=&w0FC86 z+D<=ey!?Wc0KC~MB{7gEz+JV5sIq%jDzVs@Ct^T1CZ^9sV?YQPBCEs!L@S$rBI}}; z7;Vb5-%p&%I%5pC8k=CqRn`(``%Jb(kPl7@e9t=BUIWa7;Z@nHRj=4l7@kGw0d^B% z3`;eVZ`3OGq#SrF(%!hKGS#2^ohr}yRo`2R2f4MnQ`}UA8lU?Wwjo@vRDUjw z>YI0^Tv6kxR&=N#YMS9wu6b9oQjMLeR(-kOs5#}*uU9>_^IWnK zjgJ=PS`@YE_s`S3;5tF=)fuy>Z4(MzD~`bARQJ8LCVSK{b#~-hkt)JhQAYNl#|l*!Zas zYMzLN&c;GBIJpTSoS0^xj^Rf}-IaQiCu1R;xrA6()_MYPdNz@mJFsh32;rp@FCA3C z6=rK6Fj85EvX8Ou z%{tDG3+%i)i&4qiP99f_E|yzMhnK7!J*J+blc zKwCUP2W?7jXOt1;#r+ia;uFH`nVINVay)S=!S>5=Y1Wj8O&>2@Jh{RA)ppv%>=|GQ zS=$(Pdyp5##u5a?Lj}+CpB(8<#!q*jo;=-|92YR^ozI*Z9NKVdFxEXi`BeAmbIFsl 
zGu=;5Psh7Kj}gwDo}B~{up1^W+9pDMn??5)$>Gj8T17`|Iy@vghBJ;4(J^w%J+R~+ylXU| z01H>&N0rpv4%TLZ>%`zXsk$}2=^#3^L28jtkDnAfQV9Sk<7VV^Q`JE382 zh$EbiaJM^u3FlSXSMgi;By-puj`p-ws=MbYU_io1)UbAnuY)6|Un*iu-!#t~5UXI8 z`Sk*OG9)$3aH=RjPWE56sCd*-?wmDTG}i}`bm-~v=}FMA01Oj=>LJ=N7;#N~QGg}3 zjFU0iM%?)rKSBJtz%k^J@%4}#i4AJzMDk>4E*JvHip8QR8foI=&y2~?5b>coUsvb| zb~0>ips$?4HX~>5%9c;guvTHQGB$~PGI-ONEy59Dj6qVXLcGk{i9QID<^&E1*wC}i zRh49os3}`M!KiP_0YilV0x>kR_U$7(un|A>nVmAsy})2^ku z-8byJS4@1V_mePjsp=K0dRMHRx9bjP@|LV{w9bn;w^#U0`Z%l7>-y9F zc>uz-roGUc;-|E!kFu0UMcjn?HKr~oVGo$P#@y6J9W-(Uk1&j~>5x|NB)X8kq#zKl z_i2miS89t}r);WI=a_V#I?bQHPN1%qL&B@5cCHI(3l_~&iO@Lkx0zWWpJz^=o)ti8 z4iVv1#qvUI)g^=rl<7Y(tnz8#q3WWI&c+h?xkFDfTwwDt;Y{pFhWugm3Nc_tvCutf zX<_Hk1XdFe*I5OLrFD-5ejLTz28d`|zi7>bwu_tWy-b`KZjk?||XETF)Zw&6u&%a_amABk=aMdjeIPx%X zR7~Kg7~rUab9gfLwj1`gOJ_6P`)_pbUzOXy2L|5PK$(r@HB(-*jCUxY3`F^}>L#L5 za`1(WnnMG7=a)_*%6~bupW^}6Ug99GaL<$C)!qAxkfC@M0Pc4TuNwcA+04xwQ^ptk zNew^H-Iw{4Q9TEPreHiuwP-Y(TxGdh=1nQn3x<@5rPSHSb-`Z+O4UoN0}@~4*8qtg zl%WI864t08#pltPbb;HvIc0v~C`$0=zDf1IEHY#gs-@Q1gOzVOINp z-U1mG{8Rh$i;yW=UFsJs&(l|E)CRp$j-rHli;Y^KQ6BQrX`?-FMGB4HluI#Hm-DN> zlvQhE5pwB`M1z_(_p5G<+PpF<^U8)Y8guDXpWs7yi~5%P1=Z1__Q~fkKj(|*?emVA zTBIiaSHVN6Q#N%&Ev!42KSzzhe9@uDT+WS4Ny;FUrr=I(E2s>wV_JWm;O0T6ns=}9 zb&Ck!ac}+?zN8;Mwfo+*Ir<&ieQ!Ot~S1FmaC=smCDtqdux`frC6)n z7mO+QeCbSCa=-euDYrTsqc);KCJg(y7kTu2sf|k>)KWRrR6Dpu{)w-0+~)wv=F8@3 z1szsX<$hBhHT}Fd<$b}F@=h3djzOs-xm2nzuQi@%asC--4(Qu_Ip)++HEr%URgSuk z<=$1FI=km9QikWT>SyYa?ol;1_lp_$+1#7zQ`=oQ*XL8-P7v@W9N&ix9`QxH>}99Lu1^*--My-%s3x!)JQjPQxvyXwPgRraO%K&oOYs)px& zpLZpRY{?$AmQ=-9uNtPGYrZm7IYrdZf=4Yu*qRDVohpdLtA1%yM93?6@I5*!Ozk)Q za>mxHg&_B=>d*b^LnH7)%4n9QJgJJ5KV?O#)44pVFXg*ze@lVR^TCu$4??7ZYH#yc z7w5lb;;?G=YbChU*r+47wo4 z^1?&Ob82AnS=Bc+r}}cg>L{x1ov%*LtFcqhslMEAivL0dX#3SCxTrJAT`qY`!BXaH z(DqNctEVWf&c2?inR-4~lj@`Ie9(@L9@1Nh8dXD26<^N1sXpv{6{)H( z`0{iw4Oo+Pl$KmVTHw~BF>TI2_of&>$tuLu{<$mu|t8T9p)~erkUkRmXfB#CZ z9@Y0!h|QncsozjTa=!~9h=LfRrC-Y1Ro3X?AWA24NWr7uQ|cT$%B5=Xr$1%{ZC@!U 
z4zFz9X``afx}#WiNKYh*f=54nA#6X&4RVX-^GnPfVs2>z*TXTK(F7cx;aY|Qee7^n}<73=e9yv_s|BAU^ZQzn^Y9o>zny*Xqty6td8pKsYI4;?*`j49% zIbrL(Gw;MeME`;(YE*lZ?f63l(RfWci58f0P7U81huNd$pD;W|xUTa#C&PL1dzL@T zY2Rl&|6=}^p>2)xPV~7>jmd;sQ_88fPW^pAEr#<>^df)$Z$Cxv^XETOTX+6{N85f& zt@WRyZGUJy%ZJMj&3TwKpiD4fgc5M-cFi@)p_4NpOU6KCnSit;gCke@cu@oDJFl6HI3025iKsh}Vi@s^jDhw#b zymZJ%e{gdq!OFDAq>QXZrlDrdvVewFp%388Oq6Rv!97qc$6@7g9{k-Q81vf(oS6+% z@D)CKWWbkj22^3`QSMFt5QrRKL&mJ}^z3Pf5b|SLyGqOeA%+n^5;)Cpq355M>8?ea zrlEfj9p5rHv<8u*khIaY2@J0B>BJUznm!>jroT^FofOW5 zcSzn)Axg<5sgsXMS}LNxS?8*_Z1E%G(`RBkNx&v+I|f#y>~%bRW`<^h{AQV&oS7Q$ z%{o;%WjWY$Xg0Z*geQqs2fNj8B92d4kxv2*E;5 zCS*oy1;GW-UA2TUVv?R9hIoF)Y?aC*h{?jRPo0^ZPEN|4+l25xQ=LX+5zfOW{EzGv zm1IpcFan{SSqF&_XzSj9r9oIm)~-x;;>>1i#)LC7V>x=Gnw(mPKFIpkh?iGj>3B3c z76s9iN$FB4*(P-y{hp_vbLS-^a&(uVW1G#o~0m8RP1Sdr^4V5&jg#32gb;w55|RQaL(V zmpU(H=$puvC_-oII}b2UEh^R;4&j0P;TxNVEPU2A{xn3!ndls2%xC#Y8Is%OhRdxj zQ^gU8K_v_134crt{4;X?gq$bgWQ*nLLz6OV(<-J#lz@myt`CL3pwva`=VQKL8VEl^ zV4@DrlVDce@7X3pRCgi3oNM`%kkvwP^xuKu`0?KnC|xPz>Y6g49x>E&rDZ8JoT(gM za7p&cw7r=)!v|hEkPdWRaV|P3PS~wsDs$UPj7R&^?#X$EJ$5LSP z!ak{{K2x(!tXY?-*&^0#S*qEV@o!t$d)wv7xN2{>YHx!p=j{`{DC6xXmc08vFmk2C zi$xc@-`oNw;3&VF&ul*?Za*e@gBPB-wD*qDFv^#G$Qd@5iS8jvcWKMrDz2*U?Gu?T zN5m~hmV*!BYj4^sq@Iz5{TWBK=%`+HbfGgC-@EvBOE|4C`WaE*lnJd9L+jF^O%iL8 z=&H@QT18jurNFYQPjc5~+^wRU-rRkKDg4W>-dqa*vb(o1MR3_QkV_FEbl1kR4>&_}S=wEX z{%mN=)Nc~&H%Z>+OQuZg4zYCyk~EYpy1*axH>7=S=>PJ-?b?P+?Rv3x{gr1kgZsq6 zeM`0bGlBi!l$O?Dei0+J;fAL{^3}Zh+`oSAR{hZ1t?wLOsz3Ic_5J#FSNv~pU8>&? 
ze@*+>XBT(hCT2;i=xI%R`k5%lCb4DH+m9`^>`r?zznU|?A<;LK_Kn=yd~9j+BNtrD z{$Z(ZU8ZhStQ%e2C;93!zP+Mv@3L?I?W&GU)u>oCDuudbW=tmZkQjQ1O=TL!J7!~b z4fwr*){K8x^barl_u|7y#$&TswOOid&eZmZwLMZp@0A|0VL+t#v zRCkKiod{eh<*K*b;cV4qi?$UnR~pQ?>qK|m#qdwv9UsJau4&8N8J=@{7Kgw3g^P7J zTn*`>hP10es%aIAf@xcTxJHceBlQd|>>y!~3xkakYi&^U4od!7$=@RRTbZX` z^4Bqcn6S|NwRuIY6TRT1Kd`6`-`V)B zjTbx9fpx5$MbkM)SP}SQ zUb}5QfK?{2UJR^XD{*DXhn$vU+`+jE z+%4vuzFUs+TMj=HAqt6(&@D(2*)8tkUvNc$nRg*|r&ea|uak1M5=?~|1$d9LyR|H<85n0b}I;0(6)Y-N!|!{v!9 zb#EPC@(jG^82HB(Gon6O>CkEFX&xW+@!c}c9ry*uqYqR*=IR2Bi27yXOW4-FRQ~uT z{x$y54%6>$uY&j5dj8RF%e6rx`G>0>9WY+6u)}}7j(@b*biF=6{!Jsc&z5pOGV`Cc zn|@^34F6vl`OlV^{>oHDe#dAVQvI)kt&eTt{<@Zj|F1(M0lfTph<|LO<;TN2e0WKh zZg(T4J(nvK~IPyRYenQb^8S@mitl%jmz`kNQUu{D9V3q2N`9pa?Js{6pkOs`@!Ux1eQZIl) zfgYh5p*jp)gq9VS!*Nh4P};AN&J?ybV*y&{i{@>p38d2FU|JKaXTmUVJ`1wi`7%a2 zF&+Ya0A~j1t`;*f)ac|yG9l|#oI-s}uos%BW*IA%)kkDeRe^}J!b5OW@yaB>kK7JazqAhe?3x}|=y8RbwFLk9|8`4D^KK?~5 z)#yNyPZGo&e{pF0X8tci+t-_}21lImUu)&JZ!}#Cd*ELsQbDVUh&6Y90tXUex~mea z4+Kxao)dqJYKkooOVAZ;#Ej3a3S-`y)5x6jrlhv|Ql`tfa5Gv#LtOB@K;(X%@RT`f z+Jv-NVWty&%6x*SnIASE5{Q#DH^L-~T1Yy+g$ckba@QoXpOd=+!B9Pug&@{1lNA;1 z>@d&Bf7#-RaU5?!(U?w#EdtD%aKcYfL$-O%`7e_W6zYyZ%=m2CPNpJtjOlhAVN^-M zgQ8Gk!Af8=pXiS4SehUqQF+qK6aH`TKG*YUCj6>aSOHxV#8ATF59c+A>(q$$nv2ah z?5(%Tnk4s8;Mg9Z9n)qE?>H`EgfjT6u&f2bO&Dd*fMpTo!A#KEB~;S3vRm-yXg=$t!;WB@Fig^pP>US_f;)P< zO4Ps@h5UIB;;A(SPs;S0hVymU8O$ibptqA?W$4XUBU}f0h_!m}| z3Eoc_N#@1Y?M3__c(?PWAGqLLE$PJj)s6gii|OhnJNz;|2v(6~sud$XF*47D2Hx;l zF8s0STm?S1f&vviU!9xMIpC@{Webs#`J$vIO|Hh>E3Y+gvyrU3hVY;b$^-Ch(vc=I z$&ySY@z%@rfeXdtw9JZpQ}Ov?pb;H}O5o47o|`!bZV$dke)J|gI6%!zCr<|{ifS@j z#EY7Me8@dgVptHe)E;sCNNOdi@##_W`gv%j>7iT6QngJfy9yN4Uxn{wf6o5tY6NPD zv^fskgIhUM{$QTjj8W#8MGf-~)Bw)?dIrP&YH~b8&rQP zC!b+FwF6735qTQb*xYZ_#0~<+s2P%Szq>{n3rq9Ws1<1zSjt~t)rZY$c?ErF($ff? 
zFHMy`&m^((T3f286RP%gw1{mSEKf=|Wzr~s4`^DNUgJd9~Wq8&cIeW5O zP625QP0|SHSm~6;*H+`P+q2e0jp+0I*`)PEZeAVN&^Bdene@2h^Ry{y9P(T}kTEk& ztRr?o*C+fC=-yly7RAT2k&8 zhc~y0o52B)9fW*m={V&3)Fp1um8AOgON7>}GG57@YD!40xIcAGyVP8{U;Uasb%}d& zC8<9B5+PfqARd<;s`mT5KPNjibwJIZ`z7=qI`AdXiZ6oxwzU6G zzJ~dJBv*#&Q}(R_SdX+1t0{87Q=iHCRUf3?9#Q?dUv)K-v>c8Om3k6}(DtT-O&}G} zuT8&XI&3_67f1z;au41ID3!bukdL8tC6H~9ty6`flL;KHXJQlJ;F3rJ@pIvo(LRf; zu9Yo@ih~l9HIF|9JrX1xhd%4{G?IP~jh)-3D7~sj9M#TGf(}6wX9O5zN;35eIJq+s zG)Gib6!7_-?DPX>DLc84wij8WlHbP202SG+$X2XMrGM-r+DMSKaqY{{rPK8!CfE2D^%=5NU% zJQ2eR!4({fFp!K3o^4gB7z*c9X~-n(Xh5r&3^u7^1pc*#n^9A;rC`-F%w3xX*$N7} zw~J-ozVkSc_QXl>Y-s=)H$H2J&M)-RnH8|CRsKz6gZF77J1ffvY1ROlU8{}?3sSyF z87LOCYC2`T)Xc(pBoKHK2vTR=Bn7~1r$A^yn4Cxm?R13kD5d0&yf8|+I%r5*$=Oe@ z|A`zX2=Z0(Es*nLa{hrF8rpC$@0k9_6v_~Pc4qoMy%Oy+cLWgFIqHWL_Dgcu>{v(j z^^n764D5^Il!ZRV9K`~K+yL{+ogu)=C?|in!c``D1vZlf7#)RN8pWIFo7t=rh{2n6 zF>R)(#$``{@R>(3%hAphpG;Bs1rp75uR&2zK?kgStr7}oqjTHOvG>W;G-v?%)Argz zcG3HuIME9N#oya7!{BVf$ z)`0oo9KKDgWa13}X~UKAKWSd_4K2HeK7{g;lJXzH5pfQG@}Wx(L3aFK@-%$1QpMGG zE?7VUTqps3j(Fq55+%iHPxXZpZ#=i`S)X>SPkYug#qRB5`Sy1Pm&*4;uN#L>SN9E9 zw^Y}fsp}Ezz?D9DWp}3kpxA#fU3X})M5+&G>U+id-YcLA9uWHur27t~>kluwq^6Ec z)2P@q3WobFSJ~qFuReR>p|5;FqE9E@|!iC`4ewdgeC7II7s{G$X!~7C~vi_ zbe8rEr5!_XrKVAu}Nl{++rC|PIOU8k*~fV$wXpC5)P7Of~t0+3J03VPyzmz`R;m( zy~PVfD}I$xs|rReLolMgIa5C%)(>RrheYCS|DgTP+n4HhExK+)V8hdPX*ScfL+sk| zuKh;WQE6~1HK#@Nwp=RuuKi8B=o%6Ri}mg9@9vJ>8g((sVlnbHV_6+4jfS_Qv7V7cbAkiWW!Vw(b^73OPOc=B&G zA?0-=rM&LqN6ReN-IVfrg&DEegGP$2u~SHxAMG$*Z)36REISO=>+AW^A;F^xnDu1$d#EF-G@8A!) 
zEdSnVB7fk?wk*bMG z=*elajD-#Hsl+N0DA7#lRB`73dj3@onn0!;y$Coq#T7hnDeo%wyMue;*A)#RGt(2Y z%v*u88%})=dxsH}8R#pfta{|n7aaQGGo>k`E=9?5LZj!*7vYqWPyJF_m9(qPO%)|I ziV*72s5)vqFl=5uo##<=il#id6sivf8I6}sYHIb%q35Rbj~c>r=f7-XUmUi~ZFra& zS(8m1U<5i(h14;SZU2ypvKB%-GLTHig>zkmp>fZD`T9Gbn9n4ScWxlfd)P6Xh`q_f zCJYI|kzq}6&tSYQYgJ0fnx8tCgc;0Y;XfdE*36K$tTP%T0|?57p0&uY!Yw2b+9;h7 zQfI;uw!<@-H498vBSY9GBO$vHBhad3SVz_fltBQM4Dw~Hc1YvGXph@#g3*-Hvj1wdK!l={YaQhw$Uzm$))HjNLb8J~p^;JXE!u+n1N0)eVTR|e z`)DE}ik=tw{N<-ktGqCcE5-;wl-H2Ui{)$15qWcp!&stUSC%L(PD6@WeZ&$fN?941 z$d0>pv8F$)TX;YH3jLs1zN1TKuu9I%$Q)t(4+QS$|QJk z?AFLpaxA&{a#PB9*{Uw!87M(sE>ejUYJM!|Z(=!{!RxafnhR)aH4bydWC;N9^yx{g zuDM}#HNfBkOk$}CyK=!S%gJ#V;-Xa#k!37>W(!5wN2_Z;IRt}+gK!}KuC2MSNjOa5 zv_^%8$f2mPU1rA6S{GoppA(Le!`5|}!ahTFtQ4E5QB$p)^uWHwpQ?4_%N~=_|z~n333-bq<{}nrsZYpgptXp%p%N^ zgA;lyKV{M7M_Q(c%uw(ktqu*1Y=kIFnZ0`mB2OK|V!U=O8wAn~2lkT}1LO8B$k}B1^BHhjR;QO$#2%LRu?h zmevz(6Y@^#$&_@6B^^>ltyEr3ZlDPVVOQrJ&g?2)Fn&w*Rv45rD0~3k&=-gc6~w*XE=BGLe z0`Yp;r2^_1>H)X<2L5}unmSsl-=C=>^jaBAxqB~{t6w=^3^dS@vcYbIW2#3jA@`fJ z8Gy4{@tUcMb4XCiEH<}m*a;*f+jlA(g1M;>9Ea<~||Uj8DO;7nxg zY70??@D!D8V)bP0<1^4(l#KHGHIezytWCQbM5v{*e^;)N8YNI=S(7k39g_*y z)Znb~_{3P&OxHq;Wu4m1E%c85lD~yTLf?(j%Q@en7TaL|Q@-OTTrE6F6)cgHA}0d} zrp%R#BeedKJL)*4n4%PRpt=gzAt4VALzQVLgc?fqIHl6A@p)M7ce?3jRJJP)ojcZP zIL?v{RkuHOL}RzrP>=%`uP`^s5h)gY(u zl`85p6)j>#%ffEp9aMBZq`b)T=tl8n1sba&^}WvW`Frk+gGMzLw5RNtkj5Y;tHm9;BQu5{g9&I$r` z83$s13AhIfj+=Jxt?FK>-2aW>*Q+nMKo$r7B$bEW_pZA#ncnrtviH#yqajd+M*6Fu z*iuunK=QZHI;t5FYet~!MMxT4QXcCUYx*UaUajg7t2$s^wst!{-BJt5-QYs;;_eS1 z*5?o(ekEjwCo+RH!ia`hc|JjJyoAC$qjxc6=; zSBfI5zVZ0ipSW;1?dyO;`0vfmUdCNrC#;YhJ67YFaWiJz`DIe{)i2 zyODjx!#S&9B(9c>#5ujJ-mS}ycCbknNvk=>cS{w% zMJuU^IQ_r;#pevjpP+UAwbDIi?ytH=40}3F@0x9U+Klg(`1Ujz-)%C(zsd$CAwvxL z{P*xMA}Y+H5iluyU=~hSS~O)OAtq31u}zdHC>#g>j>6cUqY0$@oI*W16~SjFHW?pA~7X-GE? 
zq#M^Sc?K6oC5K;@$6pThUpcxI8hXz`gaGe`yB4mZ{nFrfx4gM!seE{$7}Hftd$E{J zt6;h)$Y_15#N7$2V3VO9FpW(H8gv8}K0*zs+?2B5iINq*f-riG8or?s|7dVtcosOR zhRx(MU|<#OJQv1sjLZj;m+sj{n^07)-Z=+Tb33M{Igemx&dEut7<__1?*-A%onNo) zNB$7Hoc(9_;~Tf4emvdpNT zPFb*4{nB%%p_~msbqRv7hy_a}Lx}+92-qcQAU>R>WybdHr{T+*U^qbtJ4yBg7$B5i zm`wF8v{*I@z)&bc2xK;YALRxJQHmg19ryqwL?a7*kYJQbpZGQ<`;Z)3)nqdSD)Y2R zRUD6&se+!d6?+f8LU5t{wWYAvQrJ03Yn0>wuttly^&6nGLK>j8H|(|0R(bjGONVb) zHL|_7Pps|x02YhO?i$H@r<-#G()LC$+FyO{mFF(TGtKKo{PztmK-q<5ys&54(S%Jj z)S0R6TdwU}v=f*5)N=c__dFv~LszC@{c^+l3s%V=dfoM!>vn6O)ZY8uO>b^OE!8#u z)o2b>Lf^M>5CpqW|J_Q?QL%X9)%Yv%H;yhtS#y}|(q*-<2os5BzNd0Q= z9Q8SaB^Mbi(T58{g?_l6j5nPy=S_ex#;0*r5Q(nn!;mOc0LiR}vSqpJHRVesV`qi`iQtQyn2L}HQ|^KV6Z5+W7Oubv zTk-|>V#gs9n9O<H{EX@*QQ@ptHlKszPF_VUx5@!!fRaiQev6z7R2b}mFsmM_NhhE3Uigt&w-%u^ z5~uKU34*hFY|@rIEiXmTZcN@9Rx3TuZCMLX$^qT`<$r}h@H9RIAlSlf*qRyKb7OE1 z>Ez4TIrQBK_Pra0{?%tJEl_9gdv${K*W zxd%o9IzIqkr~YD8-XPM=+W~<~%jLFb+ykO};L7B(dmAK=h0C5fye*J(@oRlnM9MK8gor_ZA3hx`^O^Tjz@wr;IvvS>>e zd8+*mj1d%I=oA&D?5rJ*ymmO&Y=>j*b^zyYOWEg(d2WUWMp=|9j#@~&foa7!^WT5n zuo?CN(SiZApd_!}k~Qls(XE$Bw4>gpRFUp!B98`pevq(ST{^xtHn_VjHx~R(8q^s=jVX>e^9%f!`2zZ+M|1sZswiw}``6@TfWFT{GZfFpZf|K@z;8 zcekQ<-D|Y2UCjZl4VU^YYRRkm!JV@vsis4Zcf&5N-m2BCI^$e1qMR#=m=!(RvK-{l zg8cjc7f;sAgn$2@0+{H2wrCu3!RK&$bJ!<5Orb2pF!{bsS=o7m$nx^8is_So{P-rl z(#a$1m3O5*@}<%TWfCI@%_l(|hI*(O7vbX0Wp-S7nacfJ$~i<1eUp5I0cU{|XgY*T z^GKZ9IYr)s*>3zTs?v(fsoAN;rk-CC5PKuJaXQMBu|QeY8SfyeKO+k-6|1SoCJFbL z08Lhr>Bhz^yhJ5CSQ43#0fA|abV*w`vDOM`po$cP;6>iK%@5w5SBw0wB4T`>cIR3Q z4?7^O>!kw{1fE0hL%VtRcMraKa49gnu#`d< ztlnim?8+?p_lb^uFYUpO3`H@hl8FN#XXixA|6e{TqE3&;9vI>1Z`Fn`wJopTw_LkF?cblS-4EkGvcaG$zNO#>si!wx)pfI~ z4&=|8t!Q3rPr7IHopG^ece-b9x@8|(Qu33{rz@WaeN(I-PWw0hthW1tkqT~D?jBvP z-I4b1NZ0O=f;F#id~M^UwxuAbs!cLg_0rMzsz@pwT$9E%IDtMv=lnB?0yJnjJM{(^m50J_dGi#Z%7fRb-d>tfVqacE}Vp^0vVy8Wvo85 z?Z1q_J2jX>vI1}pnRrV{1++my0>A3}Yz%*W|cDIIbWj+}HjEqFHJ9S2NY zpCFez-MAGn zQ{=}5i5GmCK)Z1*HRn&DJ3K2QQd1erzp+Z(Gy;hu&$t;eY7eX3_u9!rq&; 
ztuUV8Y*~y+ZkV2{7v1$22bSGo$sNkL8%1~Hd+ugp-?V)6+PRMaqsf|=!8Xh+j>(3_ zw!Y)bj2sn5jxIZnY5F{xqRwyZVMfA+KOtyeopF$rviBVIAEg~t+RG=m?H(u{y|M4o zGk;mV+_?L8Wu0tE2R7U8qbninNx-UK+Si+Q_risdOGPHRXTRv#FLe$q9$xmgsX7vx zDZ5Xw|1|C4QRl~rix{96H|=urKk#q2a94e%?Iy$3Aiv#Uxmsl;e{BQ8uNipwubE&r zg1ctsM_agS7Tc)Dc+E9DT4KCzGb7}Bi5Vf+Jx20Zl%mM%EoGxa#_K~p@c*!l-_dFL zVY`X^-2*$<8-LVjhyO?2{LWs}kJeepzuvN|$oivA{LZb8A8oe6|5qj(!dEfGw2jDt z?MdEc+8I)gBUb}w)-g@wQ3sr14XO7=kr|eAWJv4T0?Cyf50+ff?*p}u z*caM!ZBu=D+!T%Kr+uG^ZCQ|jIe)cK5(aJj2L2)s)<0H3FM(a*PtkQ@r|=6@UkDu; z0R6NeQjxO!HkI=>$`t;b9MViIgn6`$t1wT7cmFp+=DL10cozhwl=x4`s95we*@mU+ z;S0wy)x%=-@Y{pw>SOO5yHS1Y-ABagV;SGEg+0LHfGnPT!?|ql(t{Rca<)LT>Q{yq zq)}z%9NoZ!{h#4KzUA`&_d>oBC}e$v;y#iAkm&3FWr8e~zsT<}@m~sT4|3mxqLk_P zOz?fX17TN7_|bODRhN=mOoWG@WCg%5Cxk{s?!I39^4s*{~r87s_!_Y}LPm5B}-i8qf+h72?K zjT(u}LcDascY&>H#JF4`Ib0NY7y(80;MzB|m7Y^*xX?{@`xr3@;N%ZYzr(-EgA|}0 z^HbI+bQ@FxQdBlPLUyMtXn_+gu-vC?A{m(USP^Q=G*GKp_yg8_9Lw6w%mi2u&6R&< z=G4sW*_`Pz9HNIq@`VN8!xt#RWM|}5pOmTRy{>|+^KAv` zeLR^IvZk}+lS!2?$&i(QOY8Xi~4EwL_*4 zza11?cZx+#3-)w*6Vxr!!Jb#fuRQlIFAnZquuJ}i1?TOOGKsJYGVO}9InLjn&>IaG zPc7B=-YDxul8XmLfA6AMstEq3{g$Wl!obC5P}e>E=?w>Ncn(Nj2q(d26)>SiClFG zZyDixwQ6LY@lWjV{ab#d%k+aT_^x*G3RnwHn%6c>sx*9afV=rn%9XbfvYE{!uQx90-hB3?m`FYUt zL9H4xs^$SDNN4xTEvIMrd6Q6$o#ejs7TN{1{X{|CdF~64BBMVnu)oIN*YNi_{O$S_ zi|rBZ%Aclu3XUF152{Wjc1;b1*F)T+7JQKckmy<`YL!YgQT98W_}@mSWbL?y1Im3T zuu~-D8~q)|{Thf9?m!o;NL&bW z+;VAo5NFK`9#}JT37;f*Q-dO9cmp0POnBZG-bgUd>8jOE-dEa}d|N)mJ>vv}05>29 z1Ow>ZuXW=v?W#=IZxme{wZu|22(zP*u<`fa;f($=vY?^ntK@3i7WSM!{9#amXCc-W zvS5+%bcmjgOOG$XI)r^_p&t%xGrqk4rTw@H!eDeh!oxtwtHq0CypgD>WPf0#Qw3=R z)>eVGPga^?_x@vS#W~v`kMMbtq`Th@z6bN`B<8bX=284F6Mu$5d$Dv!CxGE-4S*qh zuXc?#8?Txj@LnzDM?;pYWoGjG>M4AP!mr`zUT?W(q3~;VE5fgp*~wqck2YeT?t%Zh z+dR6NyI#(ZZZuu5Fp8~Im3V50s4vH`<}vzFFj6gRaKJ{>iF)0k7a(4#Lyjh;%wBU{jY z6~4K$hn34(WUIK7(NN2rqlH~P&hA3$VmFjSBafXAiLRWr9Xfn$Y}et>9NK|Jr){+P zS7Hma70-37v5(4jgbHq3O3~ErMYiWTc6UWsbcLmIXv(!-3SM$v`CQt&{g%Hz0OL-uOUu>U+jrsGtJssIwxafWf*B+R&66%+|1@{4BQL2jKrU+got{=Q?mX1*`dsCJ^B 
z)uXy%GM0AWm>y`z_`5`Z7nJyEKy5UjHf2DwO*m&jnTM{pg6Ub1U12P13^t;kf;lp% zdIKtwVWf$CRtK>M*^#sY*r$xLMEUt*{Ml7ooY;hssPKVMw*EciIf|1LC6o~Vnd~g$ z7nU;em?tW|tSooTO_k*|p481xk!~o{W9&T7pKnd+#^@n4;7M=&^r@rlC=I)gPGJ`` zy>yTtSoNQkYuv-kFL#;{v*M=iO)RU9)|3 z6faq`B^2vQdC^~)lsobJNsrpvoZN_>wxxPZR|y8t>+|YS8ii&s+-y0v541F`DSJ*j zC0dknknSaHNMbF}le5UEmaUQ=X7tO_bTLzos3YoRrIn~UhijL{+Epjl35uwT+67yV z#W_o-QFjy)6)?Y0sGg8<6=Ai6Xa)L!zN0u*e%ZHHTWQVbk3h0r&7Ufd`k~XP zprX*=tfbMo98`OAzJk?95Q{z0Dpp&?1A1B0xK&2~zjMS=cBPN)|NXP{pKH_^#M4~? zFEp__yR|;6MxPb4xu%=`a?M)fHj`Xq)|_ND^k_BI=GCAx%RrCsQ`6>t)w#tqib<5F z;L(R<2-VSx#U&iPp2geCA$27&vnF-wbslT4CRp?(m7anpi;QMSPmNeORY1IT~&CaGvY5SuJP3kYkakj%_x$2Vo$VDfhxWe zT*`5|>A~ZcgQ6*46!^g9W)-=f_p`FMYGt>eYzKo`{>!av&HcO{)u0DK`;=dUQeidk ze1O$Z2&DpRLn-2I$?W%V+aZ~T{A*|*w4mh+8Ygj<9wQr4S%*sGWpB7c6*|vy43p`S z6O#EWQj^Y>YN@p#my$&dc#~(kET3q%Qg-F(sCD6zOW3h}1E{F1Is4b}!9WO8F8&9! zP&iFdbR1VgVg7P8E|}AKBO7YA6AXf^K>E$LWj|uu6TXj02-G|`Q zoda-G!WvURTtolt1+tNVw1tn%upoUVd2E@=Ao`oc?%y7XD9q zWn+S??>J$W9MW#bShH!vl(-lHN1H&O1hjhcCm&VvNwpSL*aVvT;GwWWS@o=D8ghX( z=@fkTPHS2>U2F|7J^eSW_b-6Wsei%gA^aV+pDv`%pBKV^M!-MijELa~zG_B<%YI%n zaKV6E1Yb?Pk|Or0nJf3bdf>MY+^iY+Soklf|6j5fvi~Bz@Rh<|XwCJ)+$(cwU#Hv) z`+j?$)(iK}Re46yg2Rhq^ z7Zs*V|J9&VwwO$iIUEw6#9C%j$ns6jLkN{^83gY!_>co7aub<(hi#PqZ{a}a@(h5e zncRMPH>jkz8gfXKPN1WkP)!c4@vLcTc5+4t(fb#uR>Hys+B}7Na^4{4I7P6&VRqI? 
zKul;Lhv?Pe%AB}Y-bF8eLh;`x=Oc2mlQfb<0ybQwH{4HCP2OjAiOlLzOK; z?yY5NRE7$3AD}VY)w5YTqoiZF>9Z z%+|vq{5eP9wC?q|7x$*GQSw&Z^tIe-4qtBkZrht}Z*?p+4=x^-e2w5D-tcwZ ztQoykwLxkck=FG}og=rJhNUp>vRyBAbV==N>)irmFj@rt*P}{Bipo**A z!53yij&Lt!mGjvxl+RB2oVAsAI48>&+{WMX4@h-eC{=hfT~OP$m8>y0^-4`GEU+C_ z*4Ny1qM|#-l3>Lht^{T)YB(6Jen|9orM+QXvXKW``OFdKF`abm)+BlyYsihw#IM5X~JLPjyJ}0`O ziYpIa8WFvnX>V)csNSsH`v1RwZ+ERrd)se&LK#n+=xKwoa#rl1RJV;9iawG@uD)9~ zBAx56yGnmmG{|Pg)8%s3F6A3nWgQgj2O<65*q&+FDK_j}ZrF|4PPdEi6hk|uuKrBd z7O`uK)YL-|}I`Fzf;|CR~OYQ(v}Nf*Ir6bPJ|dsyE__W*VTTA*p$zRNamaD6hbT@gb=V z2260Pvep0^)Ij6=ehf48HF_{>U?$j|EF~bDV;#O*LUD={jX>mALAT_rQ z--r)@X&NfIm>OM-MpxuF+6Tie0A|fJrL87^*`i4)Yn@a*M2$lSC|_T9->-+&mq0x_ z;I_XCR&nsvxVT;Y>QWmk7*+$(t@&SFSNPSB@m$#mzjBbrkbTUE|1Yk4*v9{{>0sFK zXZ*ot%lC}%eSbZ|uJ-I7z|X&H<`MGm!aG8ESw+w%-2q0>|IZ-p=MnS`Am|%$xt0l+ zYned&)g#5b{=d?$2DYv1IzLLHC{p5&MT(;SMTrzeN}?=Tv8+Ga`mt=we~FzmZQ4Xt zM8}aT%kHCGG@6XXP0=Q8fy&!jD+9JAz=|l1u&Qf;x@a~yYcgcOhVog?_{vJCvTWE- zv5t;n*x3qf-#L%(k(M59IU}U=&b#m4bMHO(=iPVDx#!U0NBPa!(}Es#^Dng(Z`YH* zt6N3$IpU+(lr!?iXQ7%~hCZ2HZ);xf^o6>eqa23>-9{WJ(>dp$mo~L|@0fGwZO>o& zv3|Io#y~_9XJ8xYU-i?aYnP7AM;dqVrUTkR`C(JTp{EZJ$svt8J>wp8FpCFn8er`= zB?3^nqHchvk0VCRpOkvUl0FNfwj*g9Mhao#FvanU#ax&4wyhO9iG_2;OEz`;Zga`z z4bvx3B3g!S`bF$tXvE^`JwIzMgIpb9`h!oE`vH62ddu%T`wJmBF zmz@@|{EN7-B~GQcJZQc!T$rUDI0wI$gB$df4J>E&O?O?6y0UY0`n4?l^e{gc(#Jw0 zWe-|2@T*+gj@v9ksQ6^u;*G(|Y#W1I`zDQJRT{_cA*Rsd*o?NRhgPkQ)XbGo55TrE z*39(F&d)6KNmxmj2{HQ|&JufPW;_LtDDCn6EGbB2zv=hh^HMgW@XbV&ARErbwb4!!RE1-pyMPP}Uw`*w=3csbmJx2kZ zlI35d5!4j1hW^a0!)fFb%+_NtqkJt3PxV?snPtA0C52uqd>gIs)F)_#%=w2zm%gQt zIxF6z_ubtFnn)wkq_u%IKjK(^#*B9#{`YD8Z%)lY+vYyrG%eZMcKY2coYdoem}aDC z*#o3dTI;)6Qs}f+q?KpAi)gds+-5r;XyokExYUMuLx{BF-?y|&t$p@(;}O%ZrS4E# zYw01i#W%kPIsTv)e|M^Nx7{X7o$5@snKF2FK&6|$s&8=2S^tw>nPlN>rYcB^gqn-C z?vl5kR$F%olf&r}so1>X(%(X9MCZxmVKq+r8rh?k1S_xa<9(*12?~mTqW?s9E!UMn z@sfVJG{+)ZY-KHnLq+bHBWn3{oj^X|>P{^U9B}T4baQX6m^q>4@UV)duaRzu&U$FI zxYVnyf85g|()V%tri}C_)9F+!vNy61jzJ$&@6*@Fei;5pt*B`2x74rRU35ZlCZ(4`Xd{#@Z)U0qjHPau9m)7;eb;oF{ 
z+K0t=^vBbspX!I}j?Xl;Kjm3-T&vZQ)U5e{BW&qtwz-q(pVsIf+ZNZ)Xz`D4EB@wI zy6c?-z7(tU(%0thRQNuXMa!U%olvilqHqUn>R}FYqlR>x2?7BYNj?k zXN%-r5X3WhlS46WQtvxorz9D(wMLI;Yft*Sc+P__ql|)hc;3JnPp)w`w4p{2tXzk+T?Cm0__B?`uRZE$Cvo*Q^0yUnWokfa7p1 zI?EdU_b4XImv0ibM&LC9X>l|O;Kd|?-xByQfLT3JwNK!(RPfsaR3AP>9F=Ej zPO+r%3q*r>mayQIA!+84Rr?>HFys>rru-ls+hlE!TSPpMC@Rh0Q1m>C2W1n@gdi-+ zcN2AD>}*JDSY2}Aj58vzgU_tumSo{2m3cOJZwe5MMNTVHdK}=;As}*9|AGorkg_!x zhP7B6p&WlUVT3ZW_#BkqIiUkxG?91iJUqgg2||tJ%;{MZB!AI!VmHo?Fp`R$$h{y= zCGsGhizH0vaR~FYIIXrebPpky1eh&V&wi@?x^^0FtKhi6o?+#YOv9_SXP)Mdv6)(h zg+7e|o04QQZ?-OLETyVA`FoEz+DX&dMvabcgQeIF5q1`CTdCGf_a=gZr)*eKsqT|w z`V_4A9Qp(L-ZiwP(qP0akw-C)aqe*qTHY$lw@f9h4q{BJez5Y7qL)JbAo!3X1@8d$ zL@_)^K+6Wcz3J>U>*p(8!@f>^{9}p&7ZT$W=O;+!YvBk{vQ8_ysc@{q!h@fzAh6#9 zMY{buC1+tbOT}yHDSd4yQgKbN&EoPzF`MI3#ndb`PG|yY)i#)>!KD>PBoVP9Yp)~h z5CQsl7wHWzItW0iT1a?MNzK8;sM7h^m{;1=CuH>SDxEM+og0Hw4cdiC6!mj2J*5mK zn{r30NDpJ!uql>=MF||nAmPKyB3XD4X+KhIAmAflBS5>+qMbkqz$cOCi~mb3h1DcM zGyo*rB!yS~mb28JRn!udsH`Hazq^AlTg5y|YSl#dlrF=61`Xas@!jbtiqg)PPyqM| zg|>tyu7?I8PQqbMj>3N=o1ZbkF$a^eX&xfjg~Ybx`Cd()uh}M@fwI$C>YHq>COW(4 ze#&6kQvNkICZk4td214VIE+&z`ZZoi^t}zSf@(R?94qj#nErge*_{&8->C1t;p~YO z^~9V#>-OTf-7new@7V*g!?$RFS<{%qCkMkZM}yqj0n4Z4(l~kriZjZ=S{$-n99wa= z%D%vBBd?CceVvl8^JzO~MNX&8WHdflbUeOq@XqPJ5;}v~UMc>t__bQIa&0X>4 zL8*E0?ZJ18R)!vn4Lu%f{`A7JU)CL5tsRzw56j`6cz93>4_@n+!o%_K5h;8`4nol0 zBexxpJ9a~m=Bt(&Gb$exRSaBc$b8FtNxOcAvX*~-r>&q8RFce2J@@BY}c;zsx zaLYAe+1CI$b*Oy726q*#7|<>I2dQ+dP#}i~QP{Y@Tk>~bZIk?casPniA3$YEH^5A~ z0cPk1xH6I=ol-YS+MFe3rwsiGB?td5REwyYNJ9a_`^;(uu9tPK2D(r!-I_IqAmD5) zUnqirlMCOLrmJ6aSiM8>AU>rWEmVFB%J05gCn8<`w)>W!6bm%%iI%R#<8sq(aCD3Q zM6wn}`o2B;(rnz`ykc)&ca>flxjYhgg`pO)9Juc4l06mV$Sv;akUSkRPw)E`ZF1W~ zF;7@--udH-A5|^eW1;;Eb{&lmE&Hs>8&EqyAK3vz1~E?yQ-F)^fm8#aw&B55e<$^3 zaEDrj)E!U*YY}nDMqILiOVx_b!&ZF&|7)RFL-D$OQr*6Fe*j&WB>IIF?ps13B>Lq- zi6^73q3f>RPhQvT+ASNT-Ga{7zHL8l`ccbrO)S)})4qr0;C}95Epl5Y`U2_oSWVQwHn$FsfMMDHh*sb9 z$b!}iS}>)x(l2|DrZihn$;|4Z=ThAYNE-th2GA>TIOFyv$=>vyy#;2v#(y&T!^!yW z!_w}Iy+IVeXC3G+rJQS-vjQNF= 
zSnf)4Nb(+%y#d+Rx@?ww-C2@C6$-h1gn2a>YZ|!Lcda!RJh@VRD&{%0-l)#{4Sn+` zSVN~HLzM_ce_ZlDzV2t8qCiU)Z6fG$u^Lpb>^-QpC)%8vaI>aTkL*2=HI7T6bI&-b zzLVC9TJ_>pp%t(l`I5R5rc)@JQU57I|AFQgd>m0vh;8i&jW;q{A9g`E_k=w+JVWo3 zP81Y-@Pbx#&U9XJo4PH07%^XlWbbh;T^7DzETm~?iIZ991G2xD6}`5;6`}34PMTls zc)buSpf$IVYbRFN!3vyOcU`euw!|GFXg4iR-Eefv>Pukf4RA-h!Jvt|dL&oR)uaFD zI=t>|U4BS%?vZzO5)Z4re>VB@WW2gts_uSs*VVa|o};mzdLe^_7n{a9_MTn8^6^d=_lHoRSf%}Yn#IJUSi*3h%1?~w+b)`76ffT~-HnXG7+ zIako^h(_@qPILbjPa2O{X-W67R2KX=+J$BKSFGb5hHvMMx99w!VZ0T`?}1$z?L*+Y z+f6X+2YkO|pd6PES`B#oL39 z!3PB&T_Yo~2A>DBoVg11twuYLN=Gzwv2eNwPw1Ox)9+$o`V6h!7A!j@Ho{va{FcJU zWxgto)-%4U)-<&}NZ+3PiylY5waQJX?c|WjEZ!I-If)Y6a^Kj6Tq?LqhGqPG z_!KRGXUtUFXPl|dyOR4gqY?ft=d5Z@-?GDi$zS3ByZE>5#uoBd>FlpCsclZHr6BWR zb2%uPO|4Zrx3nS<^B;-;-{te$odR;Rb8V_rIuR>sW6xf99iuNqkBxc}J+do!e}gNT zmimIW)CsLO>C5yi3Ub%M2vS=hMfi<90N*iO$ruZBIefINNz7*9B6xC(s zC>;(G$7arn=>tRCI??|fjJzmq72J7VfQHLLQb?Xf~)NDKFjW-;Axu{~{i?9^> z;tAR5dGRP)V^;P(e~MJI4z(m<#W%BZyiDC><9)(>)NIzJ#mm@E)PyvkM4JMF6JUcz zq%r|z9<;)-2Ru}={{zHGl$<_$e&#vsiNHWjy85#Z|35gUOYs-Pp+e-Sd~?I+M%l+v z>K9+h$uq#Z!_R<Sbx{&_Pj=hn8xYx<-bnEWc0 z9W^kzE>{I{Rtl?I7|X%>qSkiXA=t}an*8PzjAdA>@RoM&S;h89Srb0#;duoM*B!+h zx%o~Td{SEJ>z@tt@8KG>^CN(QONr@QHgKvVyov#XT?b?h( zIdxbq!vd@vY>`{q<))o-eWP6K|Ij;TFb393>yt)8=(LV6X<`(XY8n>v$nsDj-xmp< ziiJln&nB&WYZC&)hNPWwj4O0H zlDTTbh?}IZ5>HA4$y{91PjHhQFd(y=rtw_k?)4qsq>&I*+g-IVbNQ(xhhwmtOJms7 z=!lheN(FmlOU2DxgSm3uQhL*vWA_X*(NC%pWm+94KrbT zb{P9O7Ov@78z#DKuo*4<3|BlW_9sn*6$mBu8)m}VF~8h&e$im`a5O>?ue>p7VzfZ0 zXpA-XNaejrGY43RMj@j(edI}68MUD>NjszX!fj9PV6<2$EngVAJeg$rI1^$ae;$%K zv{Ay(pUEvXI@gP_{k(82RvMIwLrEh-kbh};(nJ_lGHE8vf>w#yY9w>*Mj_qzQn?H3 zkltCbFnGByX#|dxPVXxnFcq3K(XEo4FdSS_ib6>+sak9=o_VQi!%BB24NR*HoKk54 z&!k1S3N6CYwCIj##cYANtyRLGdFO_eLU)fFjHS#`La0V45~B(oMwL3%N@gz_oT827 z78xtochn?}xFQiz=Eo+wuM>84-ZTYLErly$XkCJC3phceWDaha>Hef4U@YZMO4r`~ z$y~Y~eDjPSq;PqP8m^$G^*f9lPjZjZXjyh9 z1zfIHZO|oYvmgjKE=kYXJYz989U(|fgMrc1V7MBQ8V98E!K9f(l@YKewFk4kzUTEl>r&~muIFXo>pG^W(v)Fc4p?h2oW6&S;EsPq<$iO 
zJcu_a#Pke&HjSO1;1xx}eGY=yv57D{L=CG;@F>fB6^SbgTUiFYgJ={JpwCf0P|Z?U zMTE_QYnI#cnt)}>^pUQdFDc=W$|vCK`tDeP;B+%*0ewSsn4(&$Bg}(wfR5ju%wB zC~MX*sNND=mC!15>1>6#n?NT4+AL5!esvSpL!g&HAA!99iP9|;r_P;$NS@cIiP|ks zQ~cEGx}PZZ6Br;cNPt(F^zkST6F5lV5P`!4csa{U*Q0bxAE=3jZ27<+^Osm)%`6s= zZ!0p^->1Q3T59XmKes@t72(*L;9nC;)`W^R!SjJ&1#&y5%8+wg zxEI*qr@eyVl;KuRjv?n(zF_G27b4SgJ0};Z3yg*wyzzh)6%sKO-0}#9_S-pj3cCy3 zsui#@SQCbCV-q9?Cu~8~A{aW>gnes5{hH9eCN!=I!Cwo9)`adgq3(UNYyQ(K=IZD3 SKg!3JvTfnXTLN8qZvHh9RISLBc_IgO-WCz`+e_g{TGYHMBqwm{v@TJBILhgLbe!>(b^7 z8)a!+pSEPIQESE)wPoy4d&UuUWSmiF#uasC+);O?CEAklL_HaA)Jy9d(ybX^)JNmS zv_I1pZKH8hx;+z!1~MJdj!b8?GZTykGoffG(-rNa<>qvErYG7%<7~P&(--ZdaZ9>C zGY}o1acer98H^6nxGlXSvopFgGZY=7X?uD&6NyGLyP~^j+L0c~j7CQ@W6?2&s0cT} zjlT(VgjaO8iabn`&KF41#rfV;%efPmn6UdFFfkKWiPlNsQa150m*x0)KF5ptlkvH^ zRQ61m5!I8~OK{Ua#yvC733-UJN3v&9S#Bzw;AYx`QePM1XE>2PmB{g&($FHOV$Fj3I9OhT%xa{PyQyl-S^kl9r87Ff|E-i$0Vhg1+mCNQ~uwRYy_={&m zH_bi?V@g5Qcv{MJ#u7OJCL<=KGO2W&PvtL({@AnebSfFoLoSXtmdeh}=VNIuD~R4i zE;Gk-f)LB+&U4vV{?Z(jCk4@dhRe&<=Xj`t6YFab%JcKtL_D9$Wud194&TD#NI>|i zQrVo!rl1q)9WbK#MDDRk<(;WS_g8bdG(5JP^OgBL3`u&X?YXcVSa|1@Jaom2_LC_g z&&SjC9H?Q+#LsiFIIbG8CByM&iPgR7x2z!?!xL>2HB$HIV z&Lq{GI;r6_NiC;M>K62bB=wvwX-Mj3O+_-xB#kdvqk1F3f61<-i8Cb4oRKs6fNYYj z$1I$gV`*zEXG+>Q3uof2vnpCkqH5=CGJPrS;OtPxw55&#=sBnRje(2)-o_%}I@8Kfp)al6FrTp1kHu7vborwTfs72Eq%TEz>!y3=YeLZU{cPOzzo<(RxK+Hndj#{Z*H* z+7_<1@2J5p@qI+JM$-mVBchS=O3I7$dG2Cfh>eJHWK`74Q!kp7caTboF1dxoG0$Is zM-oIfo#W#%rMd-JT7rw6<>E<>7ZOJ7NxsZ*|33USF2nr4sC%*fQNjSv!nd*j*^9;GN2KUuwUiyT}MChNQ?#x;*SS={jdnVvT!IS8ShE zB~>pN3dU*jrd;p?7-W)s2UgQhRj;W3SgR$A<~)v_UIlZ(sPr{Wp4>;sA3@KH?4kum zW>IQ2?gdSOO{$R#6)cLDuxKrq&eQPdSyYf^F$#*Kb1(VV*JnK{PvpN@D1?_7fOA5}{ z$e*bf-9Rs-v}oK5?t-!4q)%=y=&u_ZyRKks_VhrdtI=ACo$FO%jeEgWumg2{!2om6 z*T`35*Nt!D7KBR~J)*XPEqsdVc+R7l30ddJ8S1WK3BN zYH2ml_w8J4EwmQA1?K^1deymyEHPJGx=FrOq0~e8o<#>dQ=5|0xaR{(yx>qqsN55( zZ$W>Hj^`Yw1;+hx^p@;p=UHGL0l3I&+84w$uiKh1jCl;>62eTkm^U#jQEd+{l>ljiflta)p} zx2b2pQcB;NFCS5626(=&;J?n+cb=!2%!6v4sIwhtii>9q 
zejB?W_=!qpQ{+k5-iwTiTu~R8StdoUsD94;fcYu&3Ih{m(||BR!f!A$Z;-G?R1NPE znLNK2NQhR2(@LtHfE)~@!)1O1ON|H407AGp#~)f4*~Z(s%xNx}1XWU&0SD5#L_94V z8m>3_8c+~$J$>>&kn@);C*CsO9J!KR{#w~OQR>?Lfpy|7^QW}O@4GyVXq2>OsvAoh zv8*yfPMG13;()ZYTag)hLnZ3wP)!&3eNga*n%?;RSjjNQr*f#6^T>Js0p~BQ=5g9Y zJ#CNDM2d`tTX~dBC`n=X=~zs3Nxh=*i7D?u)KkSbl@oPm;`y^6dDRQ4InmZo*Xh_r z9e^kJSGiQ)IH1bso6^+OSK3<64}|*8PW7aJU!1*2qm^8cn)!d zq=Eo$kPwv`^l{P1WdTnCUgJbF>Kziy5U9)-bzC+9stJDry5@VZe>LC_Q47EeFq#)K z!T}yvADsw(9AmpNn808U2Dl(LO(Wn7L@`wUidI<-l3t8}4+mfbCMg+HvP_T#!j^(K zQ==_Z1qm*{N~x}B!evE)ryyFl%@M2Y8IY)XuZYAE5S;PL5tMg1rke42*agNLMzVUN{`OY483 zqC*E^^UK*%C=P4jxyV^>aK(*%i+m;YL(wq zwAKuybM#ino*R#q3~hJ1!hbpZv*A0qnbs^PYHC%t>0_b_m`VoUJp<_w^j8){X^T4Do0+0s^FLuEE}$KGBVez5e= zW98vfx9yLAq*ht1H8bfy&eYuG5#~5kG*x}Uif^>+8(s5_!>BZB<3yG9OU+mMSH~+m z4wQEsxXm7fafAj}hkp~?S7zIblf}hqXvYe>I=LEOJyq%$DSl<0wO817D6HB$tM<0# z$!pfCx3?MygC1xJ+#_1(+e$3n3L7Z1fvVkIv3Hg2T@`z{Y!6r6p{l#9>h7+FhN@kW zYM`eQ7%B&bY8K)c+#q^~^CPtp2(*yU{(FO@edNYqsqJ9N&|1@xJ^PBb^?e6#>fUwz z@~O4S$IE*kFPi?Yedou7F$S;oErpg&Rogp@$CsXY>16E;1JnB1y*^_1{~pRGsxEKE zHB@#DT~FKyy*az)+Wi5$`yXp+DEh2+2nc*qYTpT+jO{L(tE1yZ^IeB$>6zENm(RSm z^EZzE;$*e0cP0Mn!IFDmX{zMfU1E1bSY_cc@vSclAF$muEpZOO{5icp7MAz?sPOYs ztNy=;{l>BTOShhY)~v1goi%gY@-w&0!K%Oax_b4|wSm25|K8%Gcl|r7p3c>_o9AJ5 zwV8@GSOfOXlA+^MAq7kHg-PophnTmV%#nW0Tdv6tNW4ATeq@lmV?C^g#IIU3N844u z@-avKx?lO#7!Q~+f6#c;tNzuP@2FG#9?L-dJ&P9d-*aj)?p0&l&p`e6+7EX@Lezn| zBo6yfx{oGgwFt&Q2&%@QuSH!R>(Hy917%i=>N=>~veA$#L;6Jn zw3c&}+IR=?Ye-T{_FxVrVZL+Lynvk|~WJE%!bAe%o!s*Q;h{#oSjm_m#|} z)v^7Rv6=GNOz~vNA1SlD)@}aflO&zgpMnkbEt%DyXu(G37F7%&bO@?l1Z?UoWq^5?Na6%Fkv0xZNY@E$4B}1C}RNJ;^BK?{AiuocGaKIy9wiY zl=)d~O&SZWDW>3!)$wpvRd52nBj?aywCy@q#}ADHK%9UEkqN;ZBQDr>T{kJ1bJ16D zG<{!n38o5F+HTs{cax49gj z=aRwM93M>ZIYAhr`ZwSq9-np#XhZuB&gjM=2!5iegEp{{)1AgFst~$e(Ol7iMUW|| zt`kLw$q>#15RRePPz4g^^CZFy{}6N#*7K8a6IJ;U_>GG8O{|@w7^yu~_Hk4KN?Q;Z zm*Jw~f}e(rg^6tff<9zSdC1QNeg73HA_|j0Uk$8%)(`EZbMV^HrNhgr4{gK|so|fBAo+3gX~LW0qua*Q4;NvJvN55>%c7Bxe13g#5sMd&2U&9 
z`$$q=;1fvhISjsw9m+UuE}r5A(cO644QR$MU}e+}__Nr;ljLUOaIUotp7N+>@#itf zVDLN!DF{SUW1dB09dhwmtbj@a{|wer=jQYLWrzuAr9%Mq^yi?cU{J1z_DB9=XwXv+wrstbU9FE0fHf=l?QIKkp`j~`heX9aA3!SHIod$)#DHfG8K)n+!MY-q&}ca{S{j@B z<>PCi<0aei;&TvGLwhQrgXPe{wa|l0);odm8&9nTrk70B$np1&mxjNwWc|RtV?DHI z$+`}zW9wkKb+FPpQf?io+5ygWAnXf-s~tNiqzspRVE}KwQ2=i~Yw=jkS%-ehsSntJ z+HOR0d$%IF2d9{uPrRFe=>jnOFk||Vs15;|`OrWDLx|>jWi%J+DSl;n@)q0i>Bgff z;+A3IhwA0;mTiYihQpuUvK{_Z zco7!kA5S_S8D-w`S|9Gvz12IZg~Z!~%)>pJw|7jcAo0#{`@{X@UDFgMe$8qg@u_}o zXCCqDe(g|W+>OnD-EVxvt^V~c-y=5l`$h)h@0*a)`!+4c-D-?`8L0ohkLGvi9~siT z->1QF&;X^PnU7yk4n4NODiaKnYSQP_geX$zTwSh1seFMZE9WkvMIQFp_Ny7BQZ$|W z&M5Q^J-VQV^IxI>u0=x%H28c&hbHH6jN2}-r5lwNpmAz1tCA$!1>*vv=k}|hZKf|P zPjlbMOhCoHodzMzG-=^*zOJDan7l`6rRacyeN(yW{&K!6U)OGvs?X`SiJl8O>n7~6 zsShUKzI_b5rofzG7MW~lOTGKX$~yqi)=yFtDxhmO-aE62f~sjxD(FeIrKe_3?I@`J zKu@Yo`a9@8%6!6`ZvJjifN#6_5Y%mU8lzWERzbJd6-?JPZ=x;+*Y=fg3VUm!y@OT_ zW5TmpnS%E7DNvpEA`OS(JY)y_SgNz>W<-zd&cVr0UD8&Bh2CxMCBc-?M8OpGU@1oD zl(0dvCp-&}Aek2?FxEtCP-^~RJ8#$xChtuA(rFHisrgtu8J`1>4%w<~!7Rnp3tmEy zjOX(_k4HzMB@@3GlPuY2YIf50TMk)8y?okA^(WDl5@H$jr-4Rbs-;c&7m$=DnM&lT zS|+k_K5-Uoya_l-=cu+Ns-auH73M01B{jNj4KU1`aq>%>|u)!M{x**;Hl&Rw?`~1drE;R2NLR7OrgJ3B* zQQG?`=oLm&jV1QBlCA4QHL*C>y=@imaM?S&`khMTa5-{#%{x_bOx5(n=)sEaiY-{S z1uM4kTek5#tY_)j*Ymg8p6jL?6aQiS_P|4To$XcUo|~G=-ecvx$0~cDDDQouwC5Wo z`!_+9x~w$WsI`?=<}r59AHSEARVE5@~!K}n_h zaup3-D-){|w+*A~W?RMFUN*N^%tN=#Lseg(;u|ab#;X4AsyBdtt({;jt@iAZ)P$OY z_{TSh(Qo~j7~L>JwC}bOm{{L%**A>MT07|9ea#}?& zP6opW4M#AnJAeR5YR#l#cP|LiFBH4XCPBq)Ip$Xv)ZYU(1P~WR2Z-^z-*!OU?m!-hO?Nfk`pvoC4Fs)i>Zbz4=JInSlum5^gGl zniwP+f)dRSr%1j-84ERI=wSw%wAv)-^vZlIcWTC97WL2R;T_X^!;C?Nh-TpjCWS*$ zcctLuSC0i{C&XYr7d!>>0pK|od>oGa1h7lMQK2B&CW7+Wps7KDKBqE&V z`228|%SV#Agbw7-2>Y1rV(MFp$KN=LF_VZfnMxQjqpKvhC{BeKkp4{zzbY?m3E%TA%pu-9dV zn}m8R!I5%s*m6=2YEgnGVZ+7GS^Q;cSLAH?2r?wryR;KWmE;KVHuf7{JK{@X1(ApQ0$fkj**3TVaImy=|SOJ?c z>!kWT*yeb)d1-cF6Oz*A>&&{&R?Yc3Vk*?Vx3ZSaxsq>}!ju+uR2bBmwK6_DK5K@X z;0W%E8NYth2=)2;c}0DMssgY-V4iiPwW&uvh?J(@Va-kr6tM;)(kWtyvi>z-{YQZH 
z*OAt_I*MxaRcHOpFj9TyWY!;4+ELbz$Bj)|<-3(y1%ooX%3WsthUW~F^}~G|>(|Af z2UbyMypn%BW&g-hlfhgzcpQ8mCXe+Cz$www9t$4`rZRJB4ju4?Ao#2V1>lQyA}Bn7 z|8o2+_(To|r#T^YCac)9!5ba?C{L%-;6oF2vZ>j0?m~o*pP~2D>D=i^CZ5V-P6T~erQRi31pcaZdeoyPHxGi6 z`u7UKc|m>-IxAHh52e+EI=$^Q$C{{;l15u_;n>_HH<`P^JA zDFgAP{%Qa_YMKwrlvF znmF7`$9`_UE{30V15&x5w8W1iHo|c;9QWe z!wmo#y2cqhoYL3jk!%an7Fe()$#VyeAKDqp5zsL|l}&ON`R@T`NLi#K8`;_;viV#d z>>kvfB80V~3bbIotX+y)s=b0G1ixd_&g17#VA7xmO+lb08fW3dAF4$24ovGMWmuqn zQSV6U%<5?<uNy(-Ap4y4(?;geqo^;^K2Rv}ASqfD1v_;C5mqgFB1b zYI^`&+wOMsuiAdovG>YMacaqTr!7)x+gEPecQd@!HeEb%*VR+?fICW8#WPs;3|2h5 z%AQ^Kv}(7tI1TU%7K%iEUqJ9WS4_a$?zio9(EwfuFldy}Q>!6J-|u0^l;% zx$j;parmo#IGPv)3Jz0Id(TAR)WYVgx`S0Poj5#Irx%=F>?U9dMiX^d;p+@`i@_Zc z0^LxzlntdrZ1*`E>ZkICGQc*$R`hDRp^VDng^v;%8_KXrxzaRI|7@ESJx#%Y)@ayR z@;1miHkZnu1xR%?N*!?aXyT00Oc8{S1LP-wjj3z)F+gJj!p1ZXzoZU6d_<>yP~-6e`r1u&>Q8eX%6U)^!2v_-2NsY=n3HUJGJa@b@(*p`ZJNq(!x3C2@xOttfN110SsAHx zkKgJZr<>++V36wb+$H>=2c}NoW?(coaWoqK-@%Rl_YhzqdK3#hvYN21!4UcXgmsYj zNS1%OO0{`=n{%}z*@xMD?%>P7p1Cr! zJaL=ts&)?4x#s3mCVS8uAFVjxjc|-VO0!t%uilG!58ZLy~Uz6t$V>C816M4arD&m7&8%H z->RnEy0d1+Bnw9H8dilyChJEwjO`>7`>Uf9@b{rB%4j>+U7a;8Mu3ddvy`~ztLbSH z+>E89ktR*V*}2kQb_~_bG|iHh)|!Pzt)!!CrF%tK8F=-n)nqv^TC>r7JMng{OqV?a ztK(%)q~@TRPSV=BEUZi|Uj&a)DDXyVbR6JFZnBoFU1ekUN~UZa+i0QHFREI!gX@kC zscS7HV4Ut?O^-3drjBby%rOBJD0z02?K^5_OtX!&1=BDOwxY0fq6CK^#?Xch^A2p9 zktZ4L6ho(k#&Bv|mQGzWE>D+04`|<0w(qIwr6NOoEb-SAme(IX7)HPAIPt>%S-FT`-fu0t?JMUPr;7c~r zv$3XVzEbB{*)>)( zV;-EExoQ@S!SZSS*ak64ufx3=-Xme|e+hBXmdJrUIf38h3&Zi#2_E$v%1OX(eF^^u z1{NiLNg4QAfE#=l!J+0xv2h$?XdJkzr@vo_mxT?;5LXL zjF-e7<#Uu5QK5(uPGWnA11M&out4nlJeq?1QLi*k9(9O6T$!R4fihyT{XWW)c0Zp_ zr3K1bB_@hm9X-#b9&07caOR{XsHo`U-;ZLv=2nG{lurUcH1)5Ei>}Z7EGsx(}dG8sp zD{pB+yULU{WVS6%lTJt{GwT#Ob!XZs+v!XL>F%c4PR9?!Nc##?x0}vvXZnW{lObP! 
z^ql+jBrHt#k4}3*_nmw0_r2$y^SkHzXHKV$KuB=E8}Di&2(CNiNLMv>{;(8-dRyOi6RtOnJtHC20*?DQ!yFlJ>Bj(&mIC=?pt5 zZAnxmU13+UI$WJ}huz7Va81$^_9VSwZ_*d`(K6OVZPFk1Q`(lOOV)?$DQ!H^_ig8!Zk;iNYW@>bAx5?GQ8^k*L?tQ2s)nJ|jA|T-r=p2*$TJ@mUYQmo8A!*0)X8{C z7*5266s9)}pNh)Ih2&HsDhtEulnm^Vw3v*_N2cU>It4|X#oEuNPo5OSkXd!_7v%J5 zAr+q$idkyS!z^hkostCAc|a7?;!&ZP^c*kN!5;N`E}f|}s>_UGJ)h1I$5In%y*V7$ zktrcHGYWpR9ucEwAny5tW~LtSaEIX z$6ZXU{uvP82RRc+nBhs7DA{U`a|Y2k>RU_*DV?Hdr&6BWY~-$W4^{*<+L1zWVj586b}iM zFhWXim1IniDwZTC(^G<|S|SlDDG?FW4-uO&!f}VJ zDklkviOSk*OKG#SMqu6GOhzJ-6{-e{;r!RfdgSC(&s2P>OOA>s1-a|o^p2-?p57t! zB;qG}re@?*=~T~|L?YP(2U47wO2<>O)RP8@kH&i>P2?l8cB1IUc2CWSEzrem%VYPx zTc6MdC@1ZK=*#ouel4+9y>3=4b@_(&yr(`N2<00)3RYsRzfX)-TcL_HcVxNSmWF)a z5&VYo9V1!pj%CX#G(c2yQ4UnQ7(J`YRB1P{hy#AF!EbdC_U#$Mz(LB8$)e)J0(!_W zx;lIf)c9|}-u)4K+3-6?BbhT~48H~YeGC*m)Gh6*UK4hk)$6`Z{)C<5tk{QcCd$P? zZuD|x7&Dz`K)LvdUnRg?CL!jR0EO1+q6r=Fi-*wt!Z&+gNNfKkfTSLg4C1aWNlxymgKIS*si(U@O&ROQHFj}9UU;aWb zwdIsf0T-pcB0vjxXez=rRySGYor02hA}Ru$TNAX5hp8fDtm}{C93wg4Y?F1zNG5|7 z%|H^`!IMxCfZJBVf+TR)n3`O&i13|A>6L2)k1XHgDb28o=r!9RPdKVzN@Jz5v~ z*}4vwOm?lqEy?ILPM4F)9_Q>Q$(%D|-%q|f1btV*`s`h&B`#CNgC@LYE^En$SLsGI#ABhsf^=o!kaRV z$=y1q`~`hw(Cf$87jYdO-*bSbqu;a4S#r+sB00+NZh_ZW zTGhKI8999x?2DusF0*Qr)9FNnkH=)GY}Hg>l;^<5efJXb%8+Mb$}*h!j;q(iVq#09H!H1QB(;3OkfeNUBZioS#lkss6~BXd(^} zU04gZv`xSTr8@+pDlZN#salGwE@=P+MOf^?2t`W!S3jwg7S9U@-@ zU$6tJ?fGb8TAJ0plPZYJ=HVkEk7W(L?a z5sy-+T5QGnwqvvfvs_yL+HOLts!cx$abCoes#?pNWhw0whub2ezEzosT2+L#+KPeT z396Nj8wXPYy;`n_hh9P}tphmyD{jK(T*YF6}YX| z{Y*ZvAs6UV0)304%Yoh5XIGkAa?QO;b8oJBP-z}qZhksf_jGnF?+@hsA;lkB2rc{j zvj;zQ)z7!zZhi8$Ye&IE>NZ@Sdvk8d-}#w=`07`7?)`h`-#VA;M}Fj76D<+kosw(iY`dUK%zO6Wkow?EhWoYMOo6m8wR zO1RdVOD?G4ZC&zo=L1`pyjvj!qi*O?8hY~HK;GK~an9SOc-!(DLiv_Zu4RYPvg20E z;p_dmfy2tc;rzBIa@&q7+m6!iMwP9j`Ht*aI~W zh}~E7nU!O1gp)2+Ve^VRja>Y!2`ywbGPGo)}~n_ zmR$W#rGDr2`sMoJ?4eIx8}gpUyno}}8sDY(54!UW&AEn7rJ?h_)mUvSP>lYY3<(T; zRj48LL(DgySCNKMrqBTM{j$&i1OE!f;|{zt_Wqe%`+(9uu;kkPIblIw?mB9;IV?2| z-*)WJd;B?1m*VMKbT1kfw=MqK-@NpfFD)Kk>fX2P8NTBf{-$8WTHi`G 
zcnfCyB*+{b;C`}^LwZvu6#8imhp~UC4n7}m!&(st!ClI@&9%E0Ic$VclXfmM31|BaDc+_&Q%d84rFz- zM3D+A+KN!b=j+th5q=4Y>fNBYH(gXe%DYXeUH!W62m74WDLlB2j536%+A37P3|t z4ew3YXkPa3%sN(dZ2+c1Y1p;gusi47owcr5963uJUg{mS^X7M}-l|%V zKkNhzg|~VZZuJ%dwxMZIX&THsJb8x?;y1=$A73?d&9(=GtFvVtpqXo1u53|!9a-C_ zu8j)~w_W{9mi|Hq=$bD4LfMp2dL_|NT^?;5A#pf_>0Kg4yG>XsY ziQ;Jg5y+F!*@iGlJ3TdPAT*rUiS%se`8 zr2W=?X2XqO<384KV>g5J9wX%5WQ{=IG#h}9mAsYEprQC%MT`$V&(hfdeg1Gc$i)Dj z&ys#cZze-;1R+*LUyx4Iok8R*VGG605b>@(+x_^}U+?ztLu>?_K2Pqt0t;tzJ!4AG z*lpLLCCecZoh2bZdQ4N%B;EsT5xiziCy;hTBCkwG6Izalr(6WL6%oN zo~Dv0`msDl=!25}3!*PC5P(iy+w;LqpL@KNhWNN?m8c zPSXxj7nnc(=4`=9lU1aBgekZvT}?J`Ex0LJLmHY3w9$f>*s8wpQEulLW3;Wfg9RfJ z&`WJo!AwaDu>qV^tf5saO+U{z7@t^iw-k)6>G<;6_Y!y#uu3tYx9*R z#oE1U#8jb{F+RgA94Qd|e&~2WzjUkU^kXpdQozHCVHHnt`Jv<2iC}=|4F+v8JaMr2 zQ~_xNrs3hkBp3zr1dFzRg5{0y2w{1IYXYv7H(wxO+YB4Bh?7v5THkT)?u)+YHD&+> z6_3Iyio)Yu#k#VG*A8a3^(cKp2+F4()`Z%ncUp64bi=!kk6znLsMp9mMJ$RS=R8?)d2+6h;tDOhI+iRQbYa&_1SOqLWIIUT!{=E>E)8HM z^p)3H@Wsa^9c_X46Fgb*lCN!!0d0USR|MVwb#Un*0JdAd47a8XSiTCrw3%UeBT5O>#M4|6MA!O!sAZzHz^;bOvwP9fl{O4)9=b7#I~wHUq7ai4GrT#8;r2)WM)WbRQNZVbG26Pv z+;*=koqvTjFF>Gv{Krwg-0fqiDjBJht=*5C?m7rR= za;5~he!yO=hbIxkoMDcw7)1%jl}524!$415@xcWQ%^W;4uG@a;!UIn(KKkUs9fPBI z48{k}XxiIZjfMxRv+x8tJ2b30z=F}#OfZ#Bbsai(WW4KPkqT;}EFpwj!`EP5w`zt{ zfsgj^E^j^+m8hh?29p+tFrpG{jKk}$B%TL`$|d6PcK}8zm#P68SDDi)b4KN883f9R z*@}t~Ys_w{5YzQV|0Wcbz7G)ys=ek?-@Ns=f+uG2qnc<1QspY!%A-rhy;9Vplb&~C{wy5gzL9{Ot4O1#f8U;VphR6+sxyT*M^(~aJ` zeI71k*N&wKY*E4pAHq~uBoa-f(r~~5FvBaYhDUJKL{!Yw6o`0Q>FSAbq!8?AP=eYr zj8G&cyrlj%n1#SH7l4`1d*9r(%rzA_*4R1kosV7Vn?Lnd^D2R4!QwF1UXd4u7d;CH z-}~Mwf#hnV)#%sS#uh_us=f3V9fBm{DFbi!%oMoHwV~l&iO(Y2JP`%=^HiKdCe@3t z#@(8Cvl}kBaykaM3QdL@C}x!&l)LF$!L%GtNE!(9KT<;T0#mT21|P{y(+BBtH#Qs; zF1-fY&>gO!IlK&McZki8Yeaw5)`l*kl~sRbVePLGD4z6*F7?Nk8FFuS{U{Lf#@@gzNjJ05Oa@=+#}oXk=A=;*S}f4 smtI)5HlH`$HyvV`XP7I_`-BpOFh^MX19k)BFA#`U4Uqf{lT?KM8{gswvH$=8 literal 0 HcmV?d00001 diff --git a/entrypoints/openai/__pycache__/serving_tokens.cpython-312.pyc 
b/entrypoints/openai/__pycache__/serving_tokens.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d25024040592d10e8c9bb7f8ce4b37b8e7f77df5 GIT binary patch literal 10031 zcmcIqeQ+Dcb>GAH$AJI=5CjNpJ4&GN!Q34! zi9Kdy$4TIcI+mu>N}i+>dM1C!BXy$w(T+OPOd7WnyXikTQ4P))X{t`unWi(HflaEm z)Be%7dv_qk(D7eg343pM-@e^@`*z=N-|?S39v6a?vwS~`5(xbpE*QzG6B{lHq3ei6 zER{n^?MWpmD4TMoyg6y6aGe=wOVUbcOU{rrF}zvmBprsSStDrNu+V{8Ek+v&90`zsMKQ zsG(h}1vR#N{h?w(gaK5~5uPvdU*UwMVnN_kuRcRgA64~jRp>0Coi&Hf6|)&m4Z<|? zOXBI`d9LuXkUq;DFDw+*MqQ9G!W<{2*|eBeoi(0>N%fpu;tDgzPjUQf*}_@1A(Knz zmr_eaU@BKU3)C3_lo5D}Z?4F4Ib2PXK?Q9BP zdh1TXNI4kv#WbJJV`28RkjiBRF{7=cme=crjsFDVdtl-m!e*T`u~gE`nri>dKeVzI z&ca$bD{JFytbN6f5bNOVtP_4N__^UnbM^&a2`x~p=gqpL!-?1^TgN$BFXz&G_&7I5 zg+JDU@hdJ^G?(DqfiaCBK$sIf2nCnB()z zfoF}v$(K(v3(JKJHq3M`D_&y6BJ>M-m;_<=f6IL~_P~&yTYkW zJSQ&mg%qg}Qm_DoN@ZC=^};fyIBcXTK}=`PtIb>?eI~~l)}i+SY3XcEQ2lx?zfjCw8OO@!F=-4P~B>fmZnzoVDVN!C4z*8GTD58x&wo5H4?&Ap2A5 zIuhZjrMH|C`eQf>B}y_$X2~kqZd14Pqme?AT|z07!7DkSWXMI51xl7$n}rXF&cpo!-qnb;9tT#pRn@nz`%FltP^LoH(7b{G}g2*P%PvwG5GD7tiWiBV*454 ze0FIGb^udbku+XEMi99r29}G-7Ba;=X3!NT3?}TVRajo)c-4_gk4W2DQ!of{JKOG8W9r2#R`pXJ1Xi_2pZ zB z{mwTmBe_}iJXy5WN}VbU1;DqRkGZ7 zB$W11{A^V^_Twk>k-Oyq?SK2kTPLcBiu67~l&7z1Ms({22uG@2*Q@I0(-%@k!KQ7B zn(<*)2mHPbzm1DvEGG~J2a$vpbw7j|j3iNpfqWZ0?jM1n{h8^y`MXvtS~W}N*TZ7X z5El*KZFGevOG)NiItLi8`4k=n8~={9i088-@u4r^tr_%Ihd1B|T?NzM&BP^&HQlbw zf^gMs^)#qO<6t%_;3-**cVC6W7^DZbHR>d6P--#wyYt;NVjb@i4Ndkb~Horbt{V`2Wvk7w1_u0NuU{r zF;6}UJ@IbekzV)iUJue7SQ_Zv>4bx7v3o~HD6YCdXV~_4Bp#fY9UDMg?5lKwAI_7K(~7gdh2QEt7g+?-KO0jWjFiWyVcjA z;NW$^94(R?<~eHAZar_iZ|RbVhMbp8FQ7LlkpIjUk3qpf?*v&^$!;9;J7;zZjiIa5 zmA@TBS59{zalg^C6VB*W8l+AbHCs<{%Ah6Mcw^(4F};Gl}`&R zzOx$mX{~6J~xKRH%AHJ`Gso9G7M}UQiuNe748~C`q)cbPA+o&SS}3j>pEXS_#M? 
z9!j+nnqyTL3!OFip$1a<^rbUg3Y<;~Fzk{DLW#RoT|`W(kj`_eFLfp@aK=;s!tmG# zc^?@95R3>c##`qNFznYrzviYj7^^yong|AREO2Q6NSvVB&!vU5DDphEB-JbE;5DVG zLG!#YXVMiTD>nu=;GqHR0yZL3e_AQ_ns$uk;U}?Z*H_cIWsa;ez;PbN=X4YX)6nh; zWShHP8Wh^8L7l-afCDrl%eJ_O@x@7BDXR5A5c_iMMrCm|>Y2 zz7t<(CM!*oC9mRcD!aR}2cLawwj3F}n_=jZwP?;Wj)EF(%N@KLrI3za?-LWg(z2N)9do~ai?k~|*8wxdDfA!j{m4-gK zq3`DGdc#D?^{64Pgqthj0XaNy^Zebh%G8`ZHMbs~FE^g7nyJuBRLQkzL(O~dF5ZtG zFFV2?xkJkEOzA|0j>&ZFK0UbUK#{(?M=J;Bt2WH?9nfT=m}f^*#cH(9JU4d?TsEIxDe3IX0+t?p1pCSMA6X z+eBv1IH;|AxZE}V&S@pqR*8+uvC%uPR`wl}_Z?f09skr!9ejZ*&B35dXQg#mZXK?) zPRgy5iZ7=4S`}ZD5=$s8eM*F>L=ti&QS~5ZY6IDsx-#8dZMMM#10Cf6);{D)Bvy$` z$dL)f*I14uZcg4=QGCpX)zsAfDKZ7ytH|VOug;iIbn5eJlEPzu4sg%c^a-M{q{d2M zKn@Jt&fICcwXh!8_s{gc|A94d?*DDI4!VC1>+t2g|0}!=hW19ou}SJWbu?hPao7Rm zKeWv-N5kkB6m>LY`Gsi+(<9BtTX^gwf!B;_zX@1B2vN}bLD+fBYW|=tatxV2 zn4~a0Wrfxckrn6dJ2iO z!9#p8#yq?961@hOB%_w=ZH3up=Et-}ouK zy+`sa`gb%zakUQWcfqKuUdgdrJ>MgFFWX;$_?n@6f?H}Bc54KwPWl2Fw=G8eYSmW& ze75Qnag=~0m`#&GZ#^X+)HiRH4T`6CuP;!nRmAsel7MHIH#am42wAt5)q2Uji0|S% zl2mW-!n)JjF!Jw}y8=#QmA>*b3tY-~L(kgUZIiYumof6HAEb2em>3ja4@>^1WbDFY z$RIQFix5}UlH`|Y{9A$IZF_A^0$`=srS3?49A<#s-j2ja#TvHWo#VE{n1K{X)owFV zK(eN4>n`py#6nO*L(TzltX5}V#9(Vjs!6pkVmPrQ-FDu>VH8NKL3ks4A=#1m64XDt zqY{d|B)MVNT!5YpV4WeUVK@FEAWjBk3nS=!D%G~a= zUYz;eH<8S&OTzQtW$&MQ8GYq+Zrqa6cZNJnk}&0 zMb%O3;aPxak8lp)ZCWT6h|Gjp^=ME{M}cZRgp2g(v{41)iU8p?RM=Hh zmR0Q<&SX_9!P8JCk!3-(F5`Hz>eYv-&BlNJwG_bFg0R5xIs{hH^9dVqX{z0b|M@gR z8w>+#{u@LWSphpazxJ*Di$>r#iKv0J&>AG-K*zESZB$4dRI# z_YK1cRGYR;x}id0OJ_u*Zp|FDdmwrDr{0raz@ixoTc=g&j6fSuq|w}h<%xi9c9 z0mDk-i?0c(r>{2O$3p+nxHWA0Q8@a;(H~8`Gx39|^~N~h{ocWnlYr*#vh zF_}JArelvmO+6Dz|FjYxSDFStHd8%An^trEo|3idAlJG6d+z=_k;?Gwz2Vu%z0bnC zS6U|ImWfR>)i+f-0<#3A4UE8T2!g$j109Mtq68Vm->d|qiofwe^YEQSW#oiBa$>!C zR%z|2v`)#bQ=48#&{c9&Lu8Ehd+zo%u@axW7oYrfQ`=SBqmFpl$0&iw^#j)q{J3K+ zTj?H?yT{hs$L|HkmH3`Yd|ZYJhg*I`zeD3mcmGTG(7Rbsa6{K8HWci96bx5_aXA>T z1oz6py-LS`!n7-i38lNg(mg46Pbyu#N=pY^mKsK%AbUfo8bHzZYA5oAOWx0gG??@o z{U<2uC#D$+{iWrg>943~Y=3P&X!;uqRQ!}WX!;oi#Lp20Oh2bOp9`3O-etmc+=1z# 
z$a8h(_Z<|_?>ntf|9+hn(}APULgAPGA;`4@@@p^@{|aQ8?Uw-z1o4$(<5v{^hz!u# zvUSQoL??Mbk@dpF5>Idjs##GJ8nqZ-qo>NK->WJ z#!L>Qx2>q-iibe2-Do=nv!nE$7f`~n5<5hKNsRU-cR&y?<+vJ5 z8C7fCNbIU<)Acn5C6%aw3z&W(@)G){c{u?bS*FGR2}L{RK1w;p{ABfW=C7Lmkg|peJ0^fVnz$Le zYbhrV$W6~w?WEFy8p2g4Azhes6Ou;Jk*bG~b;yOoby%#IRLq*u(B6-ojCGoN6lh=D zD+9W);t~wd*uBQy+4{{X~$lg$WM7;JjVGw_g;qk{n{xH^xo3!aK7{WCkqXx8c zMj)oI(xgtmn33J1D{$wR4D6lMDB)ff;SW0ow-zNuD#*o&{}ViZ56S*GgH&l)ww5#5M&Pj2U|&yNB{r; literal 0 HcmV?d00001 diff --git a/entrypoints/openai/__pycache__/serving_transcription.cpython-312.pyc b/entrypoints/openai/__pycache__/serving_transcription.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e574e6fd4e42fb9503eb6b92b7447f9a56ea785 GIT binary patch literal 5567 zcmds5>u(g-6~D9Z*_U6t_WFUnfO%|yT?`?{G>;NW%p*VqR$Gk}jm9(A_Jo<8<<2a@ zu2azjwM3;-zBEv&ANr}naZ>T2Rrw$Kg;guD9f3sEL~6fnYY;S_dd{6$d)K`3A*rH{ zy=Trn_ug~wJ?D4NIlF&KrQ!r0BlL6qLY9!fW25I$Zt5^&7CTMtidD9$+voQ;1t+MZXo{n_nJK1}ZI$U2N4LuUC^x%K z>TB8fMDZDiQN=toXXz?+2it~tUCl+^Zr*7qtxNT^$^3;zo*-;nroUxC`yybBo>LWC zK6vyDW#{$s>i+t=Pr+?{l?1g zf%zFAj}l4Hh!oHSDX0bbU+~vKEkr|Fn1;0ojcCz@C?Q&mMl}&WaT=XTUM4ewmbjFb zVp@`l;QzRmrU|WsCTWVMwdhQsAN1oq+JST?rgdHlNu7LT7wv*k-OW)rQtO6UJ+w#5 z&khP;h8wZz`-PDsN?9|gop)Lxk!K$0?BW|sL3e;r zr^?E*W1XdCTUKn3lXcB@JHUuCMcB!DE3*|nXk~b zT`UFMuw4a-8y=S_MaW6B}1PsRpy;Jt6Vy77-k8sJDaaqdfBl{7QkH5OSXq|xp@|TlZ%yk zHVR8DY<{}Oi_P8VVW7Pe$fJwoeujwY52LkMu0F7>-qT+nEY!ChtoLrJ5A6V^(Lu!i z2P7uOS71DG<4RT2Et$uXsP8;C!IXF9W+UQ3Mn#u3A9TY9@L8Dwava=ol`N8X{E*ok zgm2G*PM|4+QLTD>@u$MFpA!aImk51M`CmRXUJGMe1z43=6(iv}%h=`;!{^d#% zbiBWPO!!17g_gr(g4+qksv6GPsoZ!o5a}BB^1O={^Ri~AkNPU4W=NZ|twum)^Fgx( z0kSLSfq6c<7ek9-O<2RYvx;yCSQsYYc`vdU*+ce1fy2=q^w9lYbTRr_=wsokAdo8^ zfb{rF5ZFAlpIqo^vvyIPtCr6$#W+OVtfAWuT+#MZv;D9Xn?P~^h}*^I$ky~bRCRa| zbca@9UTJiZ z?12>`X5*K|e`UzfmWM+iT$&6FH#(5(1la?e-q@ZF$Lj+ljUc{Xc-4EK znBwny;_z>VZv<~l-4JUV57fj1H{Y&_$5z5Hw2|)#=k9fFX@uboYY&cG`|)-2=G3j? 
z!P>~{wShy8D0ak1HrEi5i<7}UjRbN@GMI0qkV}(z?ppBL)HSgtZhF`OB35V~TLotI z`F|)7bC2H{uU=o+&{iIL!j|HY^?@DMl|SEf`6lvP3|pZrkE?G&vIR(MM7Zc7bri%4 z8=rQ%)l%qfoc)|qXyf`)$iqW>24uTH)3>1L`O6em^S-dSq1Tjf*I^mtRZdvR zuT`d=$1&Qh0o~pDG@Z?_<=!tFyh?k<;Px+G-LQt6;F|sY_n;)nB^Li_%Jy)DuyrH; zAS}c1tlD1=S>}9Wve3fa?#Ny3H3_?{Z2oEAe*zpA!{DSPdOU51s5<=i1X_5yy xB1{T*$>Fc3Ync!>O_YEF6NB|^ALA)jEqApXS4oae=Sn`ELh9Pv50)>O>Zdwupcs!10 zhni^0)J|qat(_IUJG-Gvnu>mHX=;y2;)yNWTU%&RS+v_yEwkgP+WdnerR;dJwfnu- zXz(zc>~75y_4*y(``-Os@53ins|i8KXuh9*VGlxok26ZpRufO=DTJ=0=Mar(Dud$k zmx@y)uZpY4R~=WwSC!FZwQ+4$7uQi3PMy(b4RHfWYcj^HDQ?P|bTXMJ&B)*tt01MxsM z7!PJ!;w{k~d{KvYqixk~U{HWMlCdNn0`- zvt98nlD1~Lvpw-1lD1`fvwiVClD231vjgz~3K`|q*c9Ky1en$tJwow8#>{M?ox5Q9 zREV+AE@tqy`<6;x&G=R*>teRuf;GhjMqP&+gus@l}B+q7;7$wX=|Or zH7J``0jxu4d=5`*G&i1yrM8}!VsgWyrx^CtbZ$a&pPFKr)Vb67)66A)WyRcUpvLkv zli^_DOefiVc8bSK4LNfs+;Fie zxYQ~d|5wqPcQmw?(a<_ZOY0dOZNP_}HZpqJ#29FEGa#WYkh3!Saoa2!r)b-owz!co z(RR2H7F<8Bq8)E)ZdJRBTWKd_gX>`j2nXY2To{INkE;l*j&{GPiF-&rhVw99xFTkz zf%MeCG}4Vs6Vpr^s_>*0U}{)hipR+Ka6E3In*g7m^w|u(H8BCDEy3smTz)1<2N*XU zgc-Gv(pKn^q~P0%OUSH4kPG1)e8aU;`|6&@U33`wi@^BXCJ$Gqyy`|4X03SgXF57zEwwa=H$L4$AnUuB*%ru z^K6JCl|6jEhli6J!nA?h7%r4apJzh+>Z=(_=4ew00x6YmR_{T{qyU{|fT;LbhM#70 z2|jr(J)lX;x<-(lw12&QT?Ua#G=ra6^sgK9B>_%wDcHyWSM`YLV_RMTO@hZb! 
zWYav5)DX>#C#N%fC7Nu=_*@$`h$IKl7#MnN`n@wIy zB&X?gJ~5uoFkE_uNo3DTI++tlnl$D#MoGglnen=7hEMv+xt5FxxFB$^6A2D)JHDtm zy#Cc#AD^A-n@Ugh@JSZt(Q|2f+m7w$w=sRdE&8S|^XKxpzE?AuY#;D-_VQFdo#VMa zAO*Q(x^L~Ad#5h5{dlYVU#Bg-%Hj_K0Qc`8`to)3v;i4ySM-9RN%VD!?q<;+6FY~+ zKv%_#jLlDx&Sd(O5Y;Dnv(dF0yR4m(88`&7BV|uD6a} zIVw61iw;lO(JeT-f1_1fZ51`LYwM(uyj+Mh*DdjpB*44d@Ui@CB7}KY(DXkF!`r#GD=7)?eonEOqBFhFaIyH;sP#J*i44x633l^k(#Nv{fK%LF3 zb^jtjaQJ$@j{d>0@0XtNLVG!~S%_?2@(j)zAKKkN7R?cM$bocJ)bKU(3L9BHI5ah5CM3PK(JSLPKLTSg^H!yVuC;oD;%nr7(xKF^ zd{^*qK}9X^R%#||WL+tNzJa;aBl43?D=5rjRIW+mRYxFd=S@#OBCA-Z1kI2w5$V%EMC2$v0&uX1>FRd^Fd$XuR|4d ze4A2Q&?~!A8=+|e70m_H2s(2Huvxz0JVX@?E8hZ!t;;$mF#}r{KwvN-T=jf_bv14vbZq3x%$NBvCLIK<+(-+Ry;vsEaBJy##j%>OPg#!oOC7 zYN$J&M#urwdMtSDGJCAaU!YThIwP5ZF)9$TMnYO7=c~y~8ty(*t6`5pC+u;EBz1<# zu{&_e^n5Zi%@BoAQl;~fe&1!D;YLret+-Z$i5$BXC(VR`GYODV8OcJ}`m`*uN*+*l z^B1ceHFa(}cOGkIjukEo;^S0;lPti;Gaw|$lT-tCWPzC)iP3j2Bcg(8>^Pbbd|eIx#hR_BxSdrl$@i|7P$6?&eh>T>lfZ>WT$U`_?mE&bgw_xdBe4*U)jL`GU1IzHT z(_7IaqvxpwS?sej-`_j`f?#f6;LDxELg(;(^YBAgpcEWha_uZxc7ATPzZJg{pEo_Q zwk^E+RENBQ>&LDgTWDAu`SFSOPL#V33EhVtcn)7VFgrZUexXOU_NOM~@t575g1d7u zd`I=by?NHKY;nAG;>wA#rC+f0{~A%&U819De(>hr8+*(CodWzhc2=~=*!b9hf?adr zYwkOma{sW|!KlA!$4^(uh!FT=TYcH3*I|T0zvB59; zTQNE@7%m5Ug-Pkm1RJhqkB`Uc^BqVgwy_4P&sOxcD0}b06460 zneTzH^IhP(|3YiQHffiTS2Kbp8 zdWU$@rPQx{6EzNR#H-S{Qlq3+))V*^n#2;{x&W#rbz7%!r`oEk`t9M%Z)**sy!2#IxfX}GlyQaF@ra?t3U{OtCrDI)SEg*zz zVFj(w&QLHb%K0fU$rh}ytB#;IDB4tX7OcSSV=x+5!Lg14D_+V+mB z=;3!OFgzC5Nq|{vV9cJ|c4gm`Z_!)uot)$@X#s7cJw>YsenM8b2; zJVz{gGTs0GH|b$1Lot?t1k{V46ExV$A!#PLd~Rmj{%Mv4^FZcuD4&A}>@CLUft~s2r2x89EjL;SgR)Qs-sun^;{WJuy=N8&74U6pZzDU`)535-@emSP>s;=mrtFCy+lHhK9h18g0nR602Y=1Y#H zp$Ypf9AWw)8L?ebd53l8QNHiN4!;|Omodlmzk6yX^pcZ!j`nKZ}aI}rO;5>H#BP(&AyVkLv%+Zg5Tv+s!V|#?yp1a>&iXAO&I9BqG&6+;3S@46hu;kx0du$mDc^eK18xDz0t)f2)f_LmN z^#rM7wl9#n%_dk{XAjNAK|nr5y+D-=!HOzi+zJDUbdHOf z;}+D~@x*S3+8)~=HFvOLMNV(o5fL1bvZG6Ibcr6H=xGx@En>K*VnojF-yp5i1v*e` z2MF`lNCiP-klQWpxAy$mo=@EyMMq<)x%ZB#wCmWt$<@o)1nc15P;w8JEQ1h>Ztv{K 
zXYD&mj?kx}{x1;a?3<5>o~HSIH;>#nvT)(XSUJ!uz@Ml0+6b&rbOqxt+QbzBRt&-tr5}mVc~h0rGdh?uOO)@~KN{x~v?jufu*-@xhdTdq33w<IE6+ho>J>ozgAk7i0<^v6N z#ANtDtHx==@CK;(u%7~;4+okceb0z7-Lq1F^`65x>Qdip?j5zNKhjf>{>Y$(+K;SS zoOY>k+N*_bKJpol=+z&!H6Kx_KiWm%^ll9Ku}TZ+kM$VxV-p2ker#1kFCRPFk4DrV z58}Q)-fF<Q5ReNPp6V+kO(%;&cSJW&aG*uNsY%996?=)wlrk(MhZaiTv3~ zU}xl=EvA}ru4d~5V8*Sy*b)BGOA4ic!n5E(4!^MjBgBvFRZY`dEvx~Nr-l?(EtP=N zYLNDy{CiLcWHpno3bhFFK;Pt0Pa(Kfty9D&P?zc&c#9?w7O^^njd69<$V(Jw0pX0Q zr%Hlh&B``GT}^FMMg*&Bsb3CeURBWs>V#?pgu^qS3V@1aCrSh&>an6kI0}w>Z+L$^ zwqn+;4^f*jo>3jc?_|x?yE0m0;H_u)t!toEJW4K6MJKBTGp3WMFP1eUt795gMn#gd&C^i+E>h}))+iD76GxXoQw*NoWYeAFnDzl*- zFlwO7_0DkDNGu;0*K;s44<{I?KqY z@%ffKRqn9XZ6l~K5?I4@ZaP2B$^IPJh9xJxDNFJ>EGvVc%khjg+&Ct1lk21Bvz zrDv|PdA!kV3b)nZq-0*TM3Qq=2IK;GT3NeuFDGHLR~SdW!pB1ObHTQBQrS$^Xva%^f#(yuEX&v2)fU!VANhYiDLn%Yk6Y&@7U8 zxhYU?+9ot@yWjM}-EHNaF9$n)`UZj7Ve=mQRFYi4m>^%wkM7IyrvVd=X7x`c8~)b+H~t6h2j<2^PxQ8X z@%eJkZlP!Q66kiGJta?cxiP2=aq}-b_T39U=y>6V8K&F0pAsW|;>P~+#y!HuJ>rJm z@`fQ{!_ZT`rqefXer!R(O`y?2}YIuoT zq+sD6wseY-&0@zkF}$g?<)9Eg2&h_Jb9fZdA+a$m`n$!z1~Jeg`r1Wb^r=-Jb(I`p za3pEpQf?a&+D61k$DPx#aqR~{{ZzF>Z3tGhxwX{VS8D0MqbW6PS#oTFSEqp?v8509 z7VIqz>=T0fpsN8}$r5;A>8Ug!d-#DNR5`DL{r&Cz@W7W(UsfRpXe=6I$LA~B)2_Sp z-{$@%_rNkn6r62>d)t3h-yQxBx+VAC*%8n*=LhFI7BnSSv}}o%EYW|g7y-fWxaZ-- zzA@ZB+D!el(*RF}Kkd^T+O7HNX6n$8;irRYoZfa|H&lE$r~{x6hbSoeaJL#t?jhr7 zgZiFvSUu`ge`L`@^+!%E6n)g7hBW&epkCu{fu#a0p*;D|z}mpo0{d%0c73QPozG}N zx>J8+i>mp5H73$R15fQ*kk#So2&C&e(mG8XHqXP2Wq$xLYAi?nUPv(Du>OK}5^Eak zLV=<|W~0%csAOkQMFznB*##{;5GnXIGa&!NgHO$TT&wv#V^pLN;^2VU&bk1;K=1^i zNPwWd{GKruj9(vP!T5i*O4QXK;JL&!vXt_x{|%uxrkDBRi!dh1lLrsFOma#l(h1oD z3WmmWNpJ@Oj|bok*>p~Fg2!2ou0BN*f3KKnm9Y*V9?2vtlh|26vM6YAj!oifDlJ(! 
z23W>9%+6UJyanVOJdcAoL|)isPcp1;t#Aa{?dT;an`y02ox2A7sXLzZ3-|P{18ZP_# z1z-Oi-`(%r8(Q)`KYJ8LZ*{*lc4cgSFw1;M@MdvO{$EXNN0(1S4BL z*d<#>B|!R?-Ko?UldnofveF>tTmqvs>>ngjt8ZWJj(-J<26vozuKKW}66EDj?|xTr zXpZl3`junh1|k8eDT51e{v z)rT8j%kv;aC8pB5fAa_ zn9y1H`9g@C#Sotdk7qbwAtD-u&R#YtCo_}JV}B9(1qHlhz>%o?ED{tbmr;Y$N@H_coM<;O$v<$>pv z33z>3)26!7L!kpOoj(BoEb+ZwH?mQs53&wq`ozJOtVpdZ@`&;)SXYE?^AMKz0%iFg zBqVT=8JE3p0zfogb{($4WSZRaJB~DAS}hhQvs9A#mq8M`4~#rPjK{K za2-6%Ok_c+0=c(_PO@B>PBY**!;|CfT&YROU9cbEraCzz>EuChn2IYq2w$1Fybpyl z?e&PudaCytrZE2+z=6Vekh@_ny`V3-Hq7clG5{}-7q7jDiA)nvl|bm`i5n-%fx|-J zFcu!clf&L!GIT@y(Ah9|;hnbmiMP8;u_3`XRN6K6-<-#1he6;3D&Z_ew+i8{CGWP` zgJMHqK6x!&_C^J7bjjPfxLNS_Lb2UDZ(6cP#MaQljvK`}ix_OXW&&%r>jVYDlOs?v z_z4+^#juMr))T16xt&Aw}M=#-StGR6jv-8wc%yxOABA<)$6P1ams*9 z_E~=Jh+uDB@RsZyg0b_75m&rOO;Fmc%bw{M<6$8@TCtK^8#0+^U!LE;;F%v4tk6}*Vo)${ePSoI z&#Es_)gj^pbQo%1IDTjMPufeH4+`yvgx13qJ*hDuZ&Srck|yNeR56pJ1^Gi2D@lTi z(pG946a2?3c2eL#on57_V?xK6XtI^;y@ILlv6|}auQ*Ay2f8Y??-g2~A=MiMQw*v@ z8!KK?-God{^XmE2^G3lKdD2YE#?@YJgXjy~+<0SSg`_ucCEsPEb@uFAlUF7MWAhU& z09AS$wSk3ae-wW=UO|w&yPtgSwUJNh+2_8%rQ}MH;}aWV{{$bUXL0mB9Q|8};5T!5 zc+E`V-yU$i$+Ic>jN{WT8ONt{DITs9M{bd%1BYv1AM)kWV+_Eos?OM2(B{E!{(sPKU2-bT;@cw`iC;JVxd4kDMI=u(kAG&@3;I8whY^rlEHzfEY7Y9n^6v!Xn=Lzt zuX5k-%@gHF1j$EAWE06+=XZ|KEy+Rvl;>(pbE>N~ z&8IV5Z#s)T6%r)FVneg+RB9e2<;4EQhHxW$B9YAHz~DwU9LQ%F1AmI6Ask^s^B$5< z4fe*747+3kyodb~vY0M$cOa^$DT)FPLfJn<#?O!${{A}}e1xJ9FQLJ|N3KW6{|~6? 
z5$bt_2EI_4Db*LK7BwO2dFl~5_y~1ALT!)Gu1Bcj5!zDGsHv{cG%lbXZQDyjV?t!C zggkTUB{cS_#yq>f)Y$bv)BQU2OQUD*#U*3Q>$<1#J3T5eAN&o%-!Cg}WN^bX9M$!y mt#!e;WZU@0Aw1HquQ+@JerDC|+kt+z!@A$7e&0Yr{C@xei{SeJ literal 0 HcmV?d00001 diff --git a/entrypoints/openai/api_server.py b/entrypoints/openai/api_server.py new file mode 100644 index 0000000..3cf66fc --- /dev/null +++ b/entrypoints/openai/api_server.py @@ -0,0 +1,2096 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import asyncio +import hashlib +import importlib +import inspect +import json +import multiprocessing +import multiprocessing.forkserver as forkserver +import os +import secrets +import signal +import socket +import tempfile +import uuid +from argparse import Namespace +from collections.abc import AsyncGenerator, AsyncIterator, Awaitable, Callable +from contextlib import asynccontextmanager +from http import HTTPStatus +from typing import Annotated, Any, Literal + +import model_hosting_container_standards.sagemaker as sagemaker_standards +import prometheus_client +import pydantic +import regex as re +import uvloop +from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Query, Request +from fastapi.exceptions import RequestValidationError +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import JSONResponse, Response, StreamingResponse +from prometheus_client import make_asgi_app +from prometheus_fastapi_instrumentator import Instrumentator +from starlette.concurrency import iterate_in_threadpool +from starlette.datastructures import URL, Headers, MutableHeaders, State +from starlette.routing import Mount +from starlette.types import ASGIApp, Message, Receive, Scope, Send +from typing_extensions import assert_never + +import vllm.envs as envs +from vllm.config import VllmConfig +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.anthropic.protocol 
import ( + AnthropicError, + AnthropicErrorResponse, + AnthropicMessagesRequest, + AnthropicMessagesResponse, +) +from vllm.entrypoints.anthropic.serving_messages import AnthropicServingMessages +from vllm.entrypoints.launcher import serve_http +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args +from vllm.entrypoints.openai.orca_metrics import metrics_header +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + ChatCompletionResponse, + ClassificationRequest, + ClassificationResponse, + CompletionRequest, + CompletionResponse, + DetokenizeRequest, + DetokenizeResponse, + EmbeddingBytesResponse, + EmbeddingRequest, + EmbeddingResponse, + ErrorInfo, + ErrorResponse, + GenerateRequest, + GenerateResponse, + IOProcessorResponse, + PoolingBytesResponse, + PoolingRequest, + PoolingResponse, + RerankRequest, + RerankResponse, + ResponsesRequest, + ResponsesResponse, + ScoreRequest, + ScoreResponse, + StreamingResponsesResponse, + TokenizeRequest, + TokenizeResponse, + TranscriptionRequest, + TranscriptionResponse, + TranslationRequest, + TranslationResponse, +) +from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +from vllm.entrypoints.openai.serving_classification import ServingClassification +from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion +from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import ( + BaseModelPath, + OpenAIServingModels, +) +from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling +from vllm.entrypoints.openai.serving_responses import OpenAIServingResponses +from vllm.entrypoints.openai.serving_score import ServingScores +from vllm.entrypoints.openai.serving_tokenization import OpenAIServingTokenization +from vllm.entrypoints.openai.serving_tokens 
import ServingTokens +from vllm.entrypoints.openai.serving_transcription import ( + OpenAIServingTranscription, + OpenAIServingTranslation, +) +from vllm.entrypoints.openai.tool_parsers import ToolParserManager +from vllm.entrypoints.tool_server import DemoToolServer, MCPToolServer, ToolServer +from vllm.entrypoints.utils import ( + cli_env_setup, + load_aware_call, + log_non_default_args, + process_chat_template, + process_lora_modules, + with_cancellation, +) +from vllm.logger import init_logger +from vllm.reasoning import ReasoningParserManager +from vllm.tasks import POOLING_TASKS +from vllm.usage.usage_lib import UsageContext +from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.gc_utils import freeze_gc_heap +from vllm.utils.network_utils import is_valid_ipv6_address +from vllm.utils.system_utils import decorate_logs, set_ulimit +from vllm.v1.engine.exceptions import EngineDeadError +from vllm.v1.metrics.prometheus import get_prometheus_registry +from vllm.version import __version__ as VLLM_VERSION + +prometheus_multiproc_dir: tempfile.TemporaryDirectory + +# Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765) +logger = init_logger("vllm.entrypoints.openai.api_server") + +ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL = "endpoint-load-metrics-format" + +_running_tasks: set[asyncio.Task] = set() + + +@asynccontextmanager +async def lifespan(app: FastAPI): + try: + if app.state.log_stats: + engine_client: EngineClient = app.state.engine_client + + async def _force_log(): + while True: + await asyncio.sleep(envs.VLLM_LOG_STATS_INTERVAL) + await engine_client.do_log_stats() + + task = asyncio.create_task(_force_log()) + _running_tasks.add(task) + task.add_done_callback(_running_tasks.remove) + else: + task = None + + # Mark the startup heap as static so that it's ignored by GC. + # Reduces pause times of oldest generation collections. 
+ freeze_gc_heap() + try: + yield + finally: + if task is not None: + task.cancel() + finally: + # Ensure app state including engine ref is gc'd + del app.state + + +@asynccontextmanager +async def build_async_engine_client( + args: Namespace, + *, + usage_context: UsageContext = UsageContext.OPENAI_API_SERVER, + disable_frontend_multiprocessing: bool | None = None, + client_config: dict[str, Any] | None = None, +) -> AsyncIterator[EngineClient]: + if os.getenv("VLLM_WORKER_MULTIPROC_METHOD") == "forkserver": + # The executor is expected to be mp. + # Pre-import heavy modules in the forkserver process + logger.debug("Setup forkserver with pre-imports") + multiprocessing.set_start_method("forkserver") + multiprocessing.set_forkserver_preload(["vllm.v1.engine.async_llm"]) + forkserver.ensure_running() + logger.debug("Forkserver setup complete!") + + # Context manager to handle engine_client lifecycle + # Ensures everything is shutdown and cleaned up on error/exit + engine_args = AsyncEngineArgs.from_cli_args(args) + if client_config: + engine_args._api_process_count = client_config.get("client_count", 1) + engine_args._api_process_rank = client_config.get("client_index", 0) + + if disable_frontend_multiprocessing is None: + disable_frontend_multiprocessing = bool(args.disable_frontend_multiprocessing) + + async with build_async_engine_client_from_engine_args( + engine_args, + usage_context=usage_context, + disable_frontend_multiprocessing=disable_frontend_multiprocessing, + client_config=client_config, + ) as engine: + yield engine + + +@asynccontextmanager +async def build_async_engine_client_from_engine_args( + engine_args: AsyncEngineArgs, + *, + usage_context: UsageContext = UsageContext.OPENAI_API_SERVER, + disable_frontend_multiprocessing: bool = False, + client_config: dict[str, Any] | None = None, +) -> AsyncIterator[EngineClient]: + """ + Create EngineClient, either: + - in-process using the AsyncLLMEngine Directly + - multiprocess using AsyncLLMEngine RPC + 
+ Returns the Client or None if the creation failed. + """ + + # Create the EngineConfig (determines if we can use V1). + vllm_config = engine_args.create_engine_config(usage_context=usage_context) + + if disable_frontend_multiprocessing: + logger.warning("V1 is enabled, but got --disable-frontend-multiprocessing.") + + from vllm.v1.engine.async_llm import AsyncLLM + + async_llm: AsyncLLM | None = None + + # Don't mutate the input client_config + client_config = dict(client_config) if client_config else {} + client_count = client_config.pop("client_count", 1) + client_index = client_config.pop("client_index", 0) + + try: + async_llm = AsyncLLM.from_vllm_config( + vllm_config=vllm_config, + usage_context=usage_context, + enable_log_requests=engine_args.enable_log_requests, + aggregate_engine_logging=engine_args.aggregate_engine_logging, + disable_log_stats=engine_args.disable_log_stats, + client_addresses=client_config, + client_count=client_count, + client_index=client_index, + ) + + # Don't keep the dummy data in memory + assert async_llm is not None + await async_llm.reset_mm_cache() + + yield async_llm + finally: + if async_llm: + async_llm.shutdown() + + +async def validate_json_request(raw_request: Request): + content_type = raw_request.headers.get("content-type", "").lower() + media_type = content_type.split(";", maxsplit=1)[0] + if media_type != "application/json": + raise RequestValidationError( + errors=["Unsupported Media Type: Only 'application/json' is allowed"] + ) + + +router = APIRouter() + + +class PrometheusResponse(Response): + media_type = prometheus_client.CONTENT_TYPE_LATEST + + +def mount_metrics(app: FastAPI): + """Mount prometheus metrics to a FastAPI app.""" + + registry = get_prometheus_registry() + + # `response_class=PrometheusResponse` is needed to return an HTTP response + # with header "Content-Type: text/plain; version=0.0.4; charset=utf-8" + # instead of the default "application/json" which is incorrect. 
+ # See https://github.com/trallnag/prometheus-fastapi-instrumentator/issues/163#issue-1296092364 + Instrumentator( + excluded_handlers=[ + "/metrics", + "/health", + "/load", + "/ping", + "/version", + "/server_info", + ], + registry=registry, + ).add().instrument(app).expose(app, response_class=PrometheusResponse) + + # Add prometheus asgi middleware to route /metrics requests + metrics_route = Mount("/metrics", make_asgi_app(registry=registry)) + + # Workaround for 307 Redirect for /metrics + metrics_route.path_regex = re.compile("^/metrics(?P.*)$") + app.routes.append(metrics_route) + + +def base(request: Request) -> OpenAIServing: + # Reuse the existing instance + return tokenization(request) + + +def models(request: Request) -> OpenAIServingModels: + return request.app.state.openai_serving_models + + +def responses(request: Request) -> OpenAIServingResponses | None: + return request.app.state.openai_serving_responses + + +def messages(request: Request) -> AnthropicServingMessages: + return request.app.state.anthropic_serving_messages + + +def chat(request: Request) -> OpenAIServingChat | None: + return request.app.state.openai_serving_chat + + +def completion(request: Request) -> OpenAIServingCompletion | None: + return request.app.state.openai_serving_completion + + +def pooling(request: Request) -> OpenAIServingPooling | None: + return request.app.state.openai_serving_pooling + + +def embedding(request: Request) -> OpenAIServingEmbedding | None: + return request.app.state.openai_serving_embedding + + +def score(request: Request) -> ServingScores | None: + return request.app.state.openai_serving_scores + + +def classify(request: Request) -> ServingClassification | None: + return request.app.state.openai_serving_classification + + +def rerank(request: Request) -> ServingScores | None: + return request.app.state.openai_serving_scores + + +def tokenization(request: Request) -> OpenAIServingTokenization: + return request.app.state.openai_serving_tokenization + + 
+def transcription(request: Request) -> OpenAIServingTranscription: + return request.app.state.openai_serving_transcription + + +def translation(request: Request) -> OpenAIServingTranslation: + return request.app.state.openai_serving_translation + + +def engine_client(request: Request) -> EngineClient: + return request.app.state.engine_client + + +def generate_tokens(request: Request) -> ServingTokens | None: + return request.app.state.serving_tokens + + +@router.get("/health", response_class=Response) +async def health(raw_request: Request) -> Response: + """Health check.""" + try: + await engine_client(raw_request).check_health() + return Response(status_code=200) + except EngineDeadError: + return Response(status_code=503) + + +@router.get("/load") +async def get_server_load_metrics(request: Request): + # This endpoint returns the current server load metrics. + # It tracks requests utilizing the GPU from the following routes: + # - /v1/chat/completions + # - /v1/completions + # - /v1/audio/transcriptions + # - /v1/audio/translations + # - /v1/embeddings + # - /pooling + # - /classify + # - /score + # - /v1/score + # - /rerank + # - /v1/rerank + # - /v2/rerank + return JSONResponse(content={"server_load": request.app.state.server_load_metrics}) + + +@router.post( + "/tokenize", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse}, + }, +) +@with_cancellation +async def tokenize(request: TokenizeRequest, raw_request: Request): + handler = tokenization(raw_request) + + try: + generator = await handler.create_tokenize(request, raw_request) + except NotImplementedError as e: + raise HTTPException( + status_code=HTTPStatus.NOT_IMPLEMENTED.value, detail=str(e) + ) from e + except Exception as e: + raise HTTPException( + 
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) + ) from e + + if isinstance(generator, ErrorResponse): + return JSONResponse( + content=generator.model_dump(), status_code=generator.error.code + ) + elif isinstance(generator, TokenizeResponse): + return JSONResponse(content=generator.model_dump()) + + assert_never(generator) + + +@router.post( + "/detokenize", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, +) +@with_cancellation +async def detokenize(request: DetokenizeRequest, raw_request: Request): + handler = tokenization(raw_request) + + try: + generator = await handler.create_detokenize(request, raw_request) + except OverflowError as e: + raise RequestValidationError(errors=[str(e)]) from e + except Exception as e: + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) + ) from e + + if isinstance(generator, ErrorResponse): + return JSONResponse( + content=generator.model_dump(), status_code=generator.error.code + ) + elif isinstance(generator, DetokenizeResponse): + return JSONResponse(content=generator.model_dump()) + + assert_never(generator) + + +def maybe_register_tokenizer_info_endpoint(args): + """Conditionally register the tokenizer info endpoint if enabled.""" + if getattr(args, "enable_tokenizer_info_endpoint", False): + + @router.get("/tokenizer_info") + async def get_tokenizer_info(raw_request: Request): + """Get comprehensive tokenizer information.""" + result = await tokenization(raw_request).get_tokenizer_info() + return JSONResponse( + content=result.model_dump(), + status_code=result.error.code + if isinstance(result, ErrorResponse) + else 200, + ) + + +@router.get("/v1/models") +async def show_available_models(raw_request: Request): + handler = models(raw_request) + + models_ = await 
handler.show_available_models() + return JSONResponse(content=models_.model_dump()) + + +@router.get("/version") +async def show_version(): + ver = {"version": VLLM_VERSION} + return JSONResponse(content=ver) + + +async def _convert_stream_to_sse_events( + generator: AsyncGenerator[StreamingResponsesResponse, None], +) -> AsyncGenerator[str, None]: + """Convert the generator to a stream of events in SSE format""" + async for event in generator: + event_type = getattr(event, "type", "unknown") + # https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#event_stream_format + event_data = ( + f"event: {event_type}\ndata: {event.model_dump_json(indent=None)}\n\n" + ) + yield event_data + + +@router.post( + "/v1/responses", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.OK.value: {"content": {"text/event-stream": {}}}, + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, +) +@with_cancellation +async def create_responses(request: ResponsesRequest, raw_request: Request): + handler = responses(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Responses API" + ) + try: + generator = await handler.create_responses(request, raw_request) + except Exception as e: + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) + ) from e + + if isinstance(generator, ErrorResponse): + return JSONResponse( + content=generator.model_dump(), status_code=generator.error.code + ) + elif isinstance(generator, ResponsesResponse): + return JSONResponse(content=generator.model_dump()) + + return StreamingResponse( + content=_convert_stream_to_sse_events(generator), media_type="text/event-stream" + ) + + +@router.get("/v1/responses/{response_id}") +async def retrieve_responses( + response_id: 
str, + raw_request: Request, + starting_after: int | None = None, + stream: bool | None = False, +): + handler = responses(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Responses API" + ) + + try: + response = await handler.retrieve_responses( + response_id, + starting_after=starting_after, + stream=stream, + ) + except Exception as e: + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) + ) from e + + if isinstance(response, ErrorResponse): + return JSONResponse( + content=response.model_dump(), status_code=response.error.code + ) + elif isinstance(response, ResponsesResponse): + return JSONResponse(content=response.model_dump()) + return StreamingResponse( + content=_convert_stream_to_sse_events(response), media_type="text/event-stream" + ) + + +@router.post("/v1/responses/{response_id}/cancel") +async def cancel_responses(response_id: str, raw_request: Request): + handler = responses(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Responses API" + ) + + try: + response = await handler.cancel_responses(response_id) + except Exception as e: + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) + ) from e + + if isinstance(response, ErrorResponse): + return JSONResponse( + content=response.model_dump(), status_code=response.error.code + ) + return JSONResponse(content=response.model_dump()) + + +@router.post( + "/v1/messages", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.OK.value: {"content": {"text/event-stream": {}}}, + HTTPStatus.BAD_REQUEST.value: {"model": AnthropicErrorResponse}, + HTTPStatus.NOT_FOUND.value: {"model": AnthropicErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": AnthropicErrorResponse}, + }, +) +@with_cancellation +@load_aware_call +async def create_messages(request: 
AnthropicMessagesRequest, raw_request: Request): + def translate_error_response(response: ErrorResponse) -> JSONResponse: + anthropic_error = AnthropicErrorResponse( + error=AnthropicError( + type=response.error.type, + message=response.error.message, + ) + ) + return JSONResponse( + status_code=response.error.code, content=anthropic_error.model_dump() + ) + + handler = messages(raw_request) + if handler is None: + error = base(raw_request).create_error_response( + message="The model does not support Messages API" + ) + return translate_error_response(error) + + try: + generator = await handler.create_messages(request, raw_request) + except Exception as e: + logger.exception("Error in create_messages: %s", e) + return JSONResponse( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + content=AnthropicErrorResponse( + error=AnthropicError( + type="internal_error", + message=str(e), + ) + ).model_dump(), + ) + + if isinstance(generator, ErrorResponse): + return translate_error_response(generator) + + elif isinstance(generator, AnthropicMessagesResponse): + resp = generator.model_dump(exclude_none=True) + logger.debug("Anthropic Messages Response: %s", resp) + return JSONResponse(content=resp) + + return StreamingResponse(content=generator, media_type="text/event-stream") + + +@router.post( + "/v1/chat/completions", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.OK.value: {"content": {"text/event-stream": {}}}, + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, +) +@with_cancellation +@load_aware_call +async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request): + metrics_header_format = raw_request.headers.get( + ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL, "" + ) + handler = chat(raw_request) + if handler is None: + return base(raw_request).create_error_response( + 
message="The model does not support Chat Completions API" + ) + try: + generator = await handler.create_chat_completion(request, raw_request) + except Exception as e: + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) + ) from e + if isinstance(generator, ErrorResponse): + return JSONResponse( + content=generator.model_dump(), status_code=generator.error.code + ) + + elif isinstance(generator, ChatCompletionResponse): + return JSONResponse( + content=generator.model_dump(), + headers=metrics_header(metrics_header_format), + ) + + return StreamingResponse(content=generator, media_type="text/event-stream") + + +@router.post( + "/v1/completions", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.OK.value: {"content": {"text/event-stream": {}}}, + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, +) +@with_cancellation +@load_aware_call +async def create_completion(request: CompletionRequest, raw_request: Request): + metrics_header_format = raw_request.headers.get( + ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL, "" + ) + handler = completion(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Completions API" + ) + + try: + generator = await handler.create_completion(request, raw_request) + except OverflowError as e: + raise HTTPException( + status_code=HTTPStatus.BAD_REQUEST.value, detail=str(e) + ) from e + except Exception as e: + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) + ) from e + + if isinstance(generator, ErrorResponse): + return JSONResponse( + content=generator.model_dump(), status_code=generator.error.code + ) + elif isinstance(generator, CompletionResponse): + return JSONResponse( + content=generator.model_dump(), + 
@router.post(
    "/v1/embeddings",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def create_embedding(
    request: EmbeddingRequest,
    raw_request: Request,
):
    """Serve ``POST /v1/embeddings``.

    Returns a JSON body for ``ErrorResponse`` (with the error's status code)
    and ``EmbeddingResponse``; an ``EmbeddingBytesResponse`` is streamed with
    its own media type and its metadata in the ``metadata`` header.

    Raises:
        HTTPException: 500 when the handler itself raises.
    """
    serving_embedding = embedding(raw_request)
    if serving_embedding is None:
        return base(raw_request).create_error_response(
            message="The model does not support Embeddings API"
        )

    try:
        result = await serving_embedding.create_embedding(request, raw_request)
    except Exception as exc:
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(exc)
        ) from exc

    if isinstance(result, ErrorResponse):
        error_payload = result.model_dump()
        return JSONResponse(content=error_payload, status_code=result.error.code)

    if isinstance(result, EmbeddingResponse):
        return JSONResponse(content=result.model_dump())

    if isinstance(result, EmbeddingBytesResponse):
        return StreamingResponse(
            content=result.body,
            headers={"metadata": result.metadata},
            media_type=result.media_type,
        )

    # Every known result type has been handled above.
    assert_never(result)
@router.post("/classify", dependencies=[Depends(validate_json_request)])
@with_cancellation
@load_aware_call
async def create_classify(request: ClassificationRequest, raw_request: Request):
    """Serve ``POST /classify``.

    Returns a JSON body for both ``ErrorResponse`` (using the error's status
    code) and ``ClassificationResponse``.

    Raises:
        HTTPException: 500 when the handler itself raises.
    """
    serving_classify = classify(raw_request)
    if serving_classify is None:
        return base(raw_request).create_error_response(
            message="The model does not support Classification API"
        )

    try:
        result = await serving_classify.create_classify(request, raw_request)
    except Exception as exc:
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(exc)
        ) from exc

    if isinstance(result, ErrorResponse):
        error_payload = result.model_dump()
        return JSONResponse(content=error_payload, status_code=result.error.code)

    if isinstance(result, ClassificationResponse):
        return JSONResponse(content=result.model_dump())

    # Every known result type has been handled above.
    assert_never(result)
@router.post(
    "/v1/score",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def create_score_v1(request: ScoreRequest, raw_request: Request):
    """Legacy alias of ``POST /score`` kept at ``/v1/score``.

    Logs a relocation notice and delegates to ``create_score``; the response
    is whatever ``create_score`` returns.
    """
    # warning_once (as used by the /v1/rerank alias) keeps a busy client of the
    # legacy path from writing the same notice to the log on every request.
    logger.warning_once(
        "To indicate that Score API is not part of standard OpenAI API, we "
        "have moved it to `/score`. Please update your client accordingly."
    )

    return await create_score(request, raw_request)
@router.post(
    "/v1/audio/translations",
    responses={
        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.UNPROCESSABLE_ENTITY.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def create_translations(
    request: Annotated[TranslationRequest, Form()], raw_request: Request
):
    """Serve ``POST /v1/audio/translations`` (form-encoded upload).

    The uploaded file is read fully before the handler is invoked. The result
    is returned as JSON for ``ErrorResponse`` (with the error's status code)
    and ``TranslationResponse``; anything else is streamed as
    ``text/event-stream``.

    Raises:
        HTTPException: 500 when the handler itself raises.
    """
    serving_translation = translation(raw_request)
    if serving_translation is None:
        return base(raw_request).create_error_response(
            message="The model does not support Translations API"
        )

    uploaded_audio = await request.file.read()
    try:
        result = await serving_translation.create_translation(
            uploaded_audio, request, raw_request
        )
    except Exception as exc:
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(exc)
        ) from exc

    if isinstance(result, ErrorResponse):
        error_payload = result.model_dump()
        return JSONResponse(content=error_payload, status_code=result.error.code)

    if isinstance(result, TranslationResponse):
        return JSONResponse(content=result.model_dump())

    # Neither an error nor a complete response: treat it as an event stream.
    return StreamingResponse(content=result, media_type="text/event-stream")
# Development-only endpoints: registered on the router only when
# VLLM_SERVER_DEV_MODE is set.
if envs.VLLM_SERVER_DEV_MODE:
    logger.warning(
        "SECURITY WARNING: Development endpoints are enabled! "
        "This should NOT be used in production!"
    )

    PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig)

    @router.get("/server_info")
    async def show_server_info(
        raw_request: Request,
        config_format: Annotated[Literal["text", "json"], Query()] = "text",
    ):
        """Return the server's VllmConfig, as text (default) or as JSON."""
        vllm_config: VllmConfig = raw_request.app.state.vllm_config
        server_info = {
            "vllm_config": str(vllm_config)
            if config_format == "text"
            else PydanticVllmConfig.dump_python(vllm_config, mode="json", fallback=str)
            # fallback=str is needed to handle e.g. torch.dtype
        }
        return JSONResponse(content=server_info)

    @router.post("/reset_prefix_cache")
    async def reset_prefix_cache(raw_request: Request):
        """
        Reset the prefix cache. Note that we currently do not check if the
        prefix cache is successfully reset in the API server.
        """
        logger.info("Resetting prefix cache...")
        await engine_client(raw_request).reset_prefix_cache()
        return Response(status_code=200)

    @router.post("/reset_mm_cache")
    async def reset_mm_cache(raw_request: Request):
        """
        Reset the multi-modal cache. Note that we currently do not check if the
        multi-modal cache is successfully reset in the API server.
        """
        logger.info("Resetting multi-modal cache...")
        await engine_client(raw_request).reset_mm_cache()
        return Response(status_code=200)

    @router.post("/sleep")
    async def sleep(raw_request: Request):
        """Put the engine to sleep at the level given by the ``level`` query
        parameter (defaults to ``"1"``).

        Raises:
            HTTPException: 400 when ``level`` is not an integer.
        """
        # get POST params
        level_param = raw_request.query_params.get("level", "1")
        try:
            level = int(level_param)
        except ValueError as e:
            # A malformed query parameter is a client error, not a server fault.
            raise HTTPException(
                status_code=HTTPStatus.BAD_REQUEST.value,
                detail=f"Invalid 'level' query parameter: {level_param!r}",
            ) from e
        await engine_client(raw_request).sleep(level)
        # FIXME: in v0 with frontend multiprocessing, the sleep command
        # is sent but does not finish yet when we return a response.
        return Response(status_code=200)

    @router.post("/wake_up")
    async def wake_up(raw_request: Request):
        """Wake the engine up, optionally restricted to the ``tags`` query
        parameters."""
        tags = raw_request.query_params.getlist("tags")
        if not tags:
            # set to None to wake up all tags if no tags are provided
            tags = None
        logger.info("wake up the engine with tags: %s", tags)
        await engine_client(raw_request).wake_up(tags)
        # FIXME: in v0 with frontend multiprocessing, the wake-up command
        # is sent but does not finish yet when we return a response.
        return Response(status_code=200)

    @router.get("/is_sleeping")
    async def is_sleeping(raw_request: Request):
        """Report the engine client's ``is_sleeping()`` result as JSON."""
        logger.info("check whether the engine is sleeping")
        # Named differently from the endpoint function to avoid shadowing it.
        sleeping = await engine_client(raw_request).is_sleeping()
        return JSONResponse(content={"is_sleeping": sleeping})

    @router.post("/collective_rpc")
    async def collective_rpc(raw_request: Request):
        """Forward a ``collective_rpc`` call to the engine client.

        The JSON body must be an object with a ``method`` key and optional
        ``args``, ``kwargs`` and ``timeout`` keys.

        Returns an empty 200 response when the engine returns ``None``;
        otherwise ``{"results": [...]}`` where each result that is not
        ``None``, a dict or a list is converted with ``str``.

        Raises:
            HTTPException: 400 for undecodable JSON, a non-object body, or a
                missing ``method``.
        """
        try:
            body = await raw_request.json()
        except json.JSONDecodeError as e:
            raise HTTPException(
                status_code=HTTPStatus.BAD_REQUEST.value,
                detail=f"JSON decode error: {e}",
            ) from e
        if not isinstance(body, dict):
            # Valid JSON that is not an object (e.g. a list) has no .get().
            raise HTTPException(
                status_code=HTTPStatus.BAD_REQUEST.value,
                detail="Request body must be a JSON object",
            )
        method = body.get("method")
        if method is None:
            raise HTTPException(
                status_code=HTTPStatus.BAD_REQUEST.value,
                detail="Missing 'method' in request body",
            )
        # For security reason, only serialized string args/kwargs are passed.
        # User-defined `method` is responsible for deserialization if needed.
        args: list[str] = body.get("args", [])
        kwargs: dict[str, str] = body.get("kwargs", {})
        timeout: float | None = body.get("timeout")
        results = await engine_client(raw_request).collective_rpc(
            method=method, timeout=timeout, args=tuple(args), kwargs=kwargs
        )
        if results is None:
            return Response(status_code=200)
        response: list[Any] = [
            result
            if result is None or isinstance(result, (dict, list))
            else str(result)
            for result in results
        ]
        return JSONResponse(content={"results": response})
@router.post("/is_scaling_elastic_ep")
async def is_scaling_elastic_ep(raw_request: Request):
    """Return the current value of the module-level scaling flag as JSON."""
    scaling_status = {"is_scaling_elastic_ep": _scaling_elastic_ep}
    return JSONResponse(scaling_status)
@router.post(
    "/inference/v1/generate",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def generate(request: GenerateRequest, raw_request: Request):
    """Serve ``POST /inference/v1/generate``.

    Returns a JSON body for ``ErrorResponse`` (with the error's status code)
    and ``GenerateResponse``; anything else is streamed as
    ``text/event-stream``.

    Raises:
        HTTPException: 500 when the handler itself raises.
    """
    serving_tokens = generate_tokens(raw_request)
    if serving_tokens is None:
        return base(raw_request).create_error_response(
            message="The model does not support generate tokens API"
        )

    try:
        result = await serving_tokens.serve_tokens(request, raw_request)
    except Exception as exc:
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(exc)
        ) from exc

    if isinstance(result, ErrorResponse):
        error_payload = result.model_dump()
        return JSONResponse(content=error_payload, status_code=result.error.code)

    if isinstance(result, GenerateResponse):
        return JSONResponse(content=result.model_dump())

    # Neither an error nor a complete response: treat it as an event stream.
    return StreamingResponse(content=result, media_type="text/event-stream")
class AuthenticationMiddleware:
    """
    Pure ASGI middleware that authenticates each request by checking
    that the Authorization Bearer token exists and equals any of the
    configured API keys.

    Notes
    -----
    There are two cases in which authentication is skipped:
        1. The HTTP method is OPTIONS.
        2. The request path doesn't start with /v1 (e.g. /health).
    """

    def __init__(self, app: ASGIApp, tokens: list[str]) -> None:
        """Wrap ``app`` and remember the SHA-256 digest of every accepted key."""
        self.app = app
        self.api_tokens = [hashlib.sha256(t.encode("utf-8")).digest() for t in tokens]

    def verify_token(self, headers: Headers) -> bool:
        """Return True when ``headers`` carries a Bearer token matching a key.

        Returns False when the Authorization header is missing or empty, or
        when its scheme is not ``bearer`` (case-insensitive).
        """
        authorization_header_value = headers.get("Authorization")
        if not authorization_header_value:
            return False

        scheme, _, param = authorization_header_value.partition(" ")
        if scheme.lower() != "bearer":
            return False

        param_hash = hashlib.sha256(param.encode("utf-8")).digest()

        # Every stored digest is compared (no early exit) using
        # secrets.compare_digest; keep the loop free of short-circuiting.
        token_match = False
        for token_hash in self.api_tokens:
            token_match |= secrets.compare_digest(param_hash, token_hash)

        return token_match

    def __call__(self, scope: Scope, receive: Receive, send: Send) -> Awaitable[None]:
        """Forward the request to the wrapped app, or answer 401 for an
        unauthenticated request whose path starts with ``/v1``."""
        # WebSocket scopes have no "method" key, so it must be read with
        # .get(); indexing raised KeyError for every WebSocket connection.
        if (
            scope["type"] not in ("http", "websocket")
            or scope.get("method") == "OPTIONS"
        ):
            # scope["type"] can be "lifespan" or "startup" for example,
            # in which case we don't need to do anything
            return self.app(scope, receive, send)
        root_path = scope.get("root_path", "")
        url_path = URL(scope=scope).path.removeprefix(root_path)
        headers = Headers(scope=scope)
        if url_path.startswith("/v1") and not self.verify_token(headers):
            response = JSONResponse(content={"error": "Unauthorized"}, status_code=401)
            return response(scope, receive, send)
        return self.app(scope, receive, send)
+ request_headers = Headers(scope=scope) + + async def send_with_request_id(message: Message) -> None: + """ + Custom send function to mutate the response headers + and append X-Request-Id to it. + """ + if message["type"] == "http.response.start": + response_headers = MutableHeaders(raw=message["headers"]) + request_id = request_headers.get("X-Request-Id", uuid.uuid4().hex) + response_headers.append("X-Request-Id", request_id) + await send(message) + + return self.app(scope, receive, send_with_request_id) + + +# Global variable to track scaling state +_scaling_elastic_ep = False + + +class ScalingMiddleware: + """ + Middleware that checks if the model is currently scaling and + returns a 503 Service Unavailable response if it is. + + This middleware applies to all HTTP requests and prevents + processing when the model is in a scaling state. + """ + + def __init__(self, app: ASGIApp) -> None: + self.app = app + + def __call__(self, scope: Scope, receive: Receive, send: Send) -> Awaitable[None]: + if scope["type"] != "http": + return self.app(scope, receive, send) + + # Check global scaling state + global _scaling_elastic_ep + if _scaling_elastic_ep: + # Return 503 Service Unavailable response + response = JSONResponse( + content={ + "error": "The model is currently scaling. Please try again later." 
class SSEDecoder:
    """Incremental Server-Sent Events decoder for streaming responses.

    Feed raw byte chunks to ``decode_chunk``; text that is not yet terminated
    by a newline, and bytes of a UTF-8 character that is not yet complete, are
    kept until a later chunk completes them.
    """

    def __init__(self):
        import codecs

        # Decoded text that has not been terminated by "\n" yet.
        self.buffer = ""
        # Content fragments collected through add_content().
        self.content_buffer = []
        # Chunk boundaries are arbitrary byte offsets, so a multi-byte UTF-8
        # character may be split across two chunks; the incremental decoder
        # holds the partial bytes instead of failing on both halves.
        self._utf8_decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")

    def decode_chunk(self, chunk: bytes) -> list[dict]:
        """Decode a chunk of SSE data and return the events completed by it.

        Returns a list holding ``{"type": "done"}`` for a ``[DONE]`` data line
        and ``{"type": "data", "data": <parsed JSON>}`` for every other
        non-empty data line. Lines that are not data lines, and data lines
        whose payload is not valid JSON, are skipped. Invalid UTF-8 bytes are
        replaced with U+FFFD rather than discarding the chunk.
        """
        import json

        self.buffer += self._utf8_decoder.decode(chunk)

        # Everything before the last "\n" is a complete line; the remainder
        # stays buffered for the next chunk.
        *complete_lines, self.buffer = self.buffer.split("\n")

        events = []
        for line in complete_lines:
            line = line.rstrip("\r")  # Handle CRLF

            # The space after the colon is optional in the SSE format.
            if not line.startswith("data:"):
                continue

            data_str = line[len("data:"):].strip()
            if data_str == "[DONE]":
                events.append({"type": "done"})
            elif data_str:
                try:
                    event_data = json.loads(data_str)
                except json.JSONDecodeError:
                    # Skip malformed JSON
                    continue
                events.append({"type": "data", "data": event_data})

        return events

    def extract_content(self, event_data: dict) -> str:
        """Extract content from event data."""
        return _extract_content_from_chunk(event_data)

    def add_content(self, content: str) -> None:
        """Add content to the buffer; empty content is ignored."""
        if content:
            self.content_buffer.append(content)

    def get_complete_content(self) -> str:
        """Get the complete buffered content."""
        return "".join(self.content_buffer)
def build_app(args: Namespace) -> FastAPI:
    """Build the FastAPI application from parsed CLI arguments.

    Creates the app (without the OpenAPI/docs routes when
    ``args.disable_fastapi_docs`` is set), registers the routes, mounts
    metrics, installs the exception handlers and middlewares, and registers
    ``/abort_requests`` when ``args.tokens_only`` is set.

    Raises:
        ValueError: when an entry of ``args.middleware`` resolves to something
            that is neither a class nor a coroutine function.
    """
    if args.disable_fastapi_docs:
        app = FastAPI(
            openapi_url=None, docs_url=None, redoc_url=None, lifespan=lifespan
        )
    else:
        app = FastAPI(lifespan=lifespan)

    if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
        logger.warning(
            "LoRA dynamic loading & unloading is enabled in the API server. "
            "This should ONLY be used for local development!"
        )
        from vllm.entrypoints.dynamic_lora import register_dynamic_lora_routes

        register_dynamic_lora_routes(router)

    from vllm.entrypoints.sagemaker.routes import register_sagemaker_routes

    register_sagemaker_routes(router)

    app.include_router(router)
    app.root_path = args.root_path

    mount_metrics(app)

    app.add_middleware(
        CORSMiddleware,
        allow_origins=args.allowed_origins,
        allow_credentials=args.allow_credentials,
        allow_methods=args.allowed_methods,
        allow_headers=args.allowed_headers,
    )

    @app.exception_handler(HTTPException)
    async def http_exception_handler(_: Request, exc: HTTPException):
        """Render an HTTPException as an ErrorResponse JSON body."""
        err = ErrorResponse(
            error=ErrorInfo(
                message=exc.detail,
                type=HTTPStatus(exc.status_code).phrase,
                code=exc.status_code,
            )
        )
        return JSONResponse(err.model_dump(), status_code=exc.status_code)

    @app.exception_handler(RequestValidationError)
    async def validation_exception_handler(_: Request, exc: RequestValidationError):
        """Render a request validation failure as a 400 ErrorResponse."""
        exc_str = str(exc)
        errors_str = str(exc.errors())

        # Append the structured error list only when it adds information
        # beyond the exception's own string form.
        if exc.errors() and errors_str and errors_str != exc_str:
            message = f"{exc_str} {errors_str}"
        else:
            message = exc_str

        err = ErrorResponse(
            error=ErrorInfo(
                message=message,
                type=HTTPStatus.BAD_REQUEST.phrase,
                code=HTTPStatus.BAD_REQUEST,
            )
        )
        return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST)

    # Ensure --api-key option from CLI takes precedence over VLLM_API_KEY
    if tokens := [key for key in (args.api_key or [envs.VLLM_API_KEY]) if key]:
        app.add_middleware(AuthenticationMiddleware, tokens=tokens)

    if args.enable_request_id_headers:
        app.add_middleware(XRequestIdMiddleware)

    # Add scaling middleware to check for scaling state
    app.add_middleware(ScalingMiddleware)

    if envs.VLLM_DEBUG_LOG_API_SERVER_RESPONSE:
        logger.warning(
            "CAUTION: Enabling log response in the API Server. "
            "This can include sensitive information and should be "
            "avoided in production."
        )

        @app.middleware("http")
        async def log_response(request: Request, call_next):
            """Log the body of every response, then replay it to the client."""
            response = await call_next(request)
            # The whole body is drained into memory here and then replayed
            # through a fresh iterator.
            response_body = [section async for section in response.body_iterator]
            response.body_iterator = iterate_in_threadpool(iter(response_body))
            # Check if this is a streaming response by looking at content-type
            content_type = response.headers.get("content-type", "")
            is_streaming = content_type == "text/event-stream; charset=utf-8"

            # Log response body based on type
            if not response_body:
                logger.info("response_body={}")
            elif is_streaming:
                _log_streaming_response(response, response_body)
            else:
                _log_non_streaming_response(response_body)
            return response

    # User-supplied middlewares, given as dotted "module.attribute" paths.
    for middleware in args.middleware:
        module_path, object_name = middleware.rsplit(".", 1)
        imported = getattr(importlib.import_module(module_path), object_name)
        if inspect.isclass(imported):
            app.add_middleware(imported)  # type: ignore[arg-type]
        elif inspect.iscoroutinefunction(imported):
            app.middleware("http")(imported)
        else:
            raise ValueError(
                f"Invalid middleware {middleware}. Must be a function or a class."
            )

    app = sagemaker_standards.bootstrap(app)
    # Optional endpoints
    if args.tokens_only:
        # The event loop keeps only weak references to tasks; holding them
        # here prevents a pending abort from being garbage-collected before
        # it has finished.
        background_abort_tasks: set[asyncio.Task] = set()

        @app.post("/abort_requests")
        async def abort_requests(raw_request: Request):
            """
            Abort one or more requests. To be used in a
            Disaggregated Everything setup.
            """
            try:
                body = await raw_request.json()
            except json.JSONDecodeError as e:
                raise HTTPException(
                    status_code=HTTPStatus.BAD_REQUEST.value,
                    detail=f"JSON decode error: {e}",
                ) from e
            if not isinstance(body, dict):
                # Valid JSON that is not an object (e.g. a list) has no .get().
                raise HTTPException(
                    status_code=HTTPStatus.BAD_REQUEST.value,
                    detail="Request body must be a JSON object",
                )
            request_ids = body.get("request_ids")
            if request_ids is None:
                raise HTTPException(
                    status_code=HTTPStatus.BAD_REQUEST.value,
                    detail="Missing 'request_ids' in request body",
                )
            # Abort requests in background
            abort_task = asyncio.create_task(
                engine_client(raw_request).abort(request_ids)
            )
            background_abort_tasks.add(abort_task)
            abort_task.add_done_callback(background_abort_tasks.discard)
            return Response(status_code=200)

    return app
vllm_config.lora_config is not None + else {} + ) + lora_modules = process_lora_modules(args.lora_modules, default_mm_loras) + + state.openai_serving_models = OpenAIServingModels( + engine_client=engine_client, + base_model_paths=base_model_paths, + lora_modules=lora_modules, + ) + await state.openai_serving_models.init_static_loras() + state.openai_serving_responses = ( + OpenAIServingResponses( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + return_tokens_as_token_ids=args.return_tokens_as_token_ids, + enable_auto_tools=args.enable_auto_tool_choice, + tool_parser=args.tool_call_parser, + tool_server=tool_server, + reasoning_parser=args.structured_outputs_config.reasoning_parser, + enable_prompt_tokens_details=args.enable_prompt_tokens_details, + enable_force_include_usage=args.enable_force_include_usage, + enable_log_outputs=args.enable_log_outputs, + log_error_stack=args.log_error_stack, + ) + if "generate" in supported_tasks + else None + ) + state.openai_serving_chat = ( + OpenAIServingChat( + engine_client, + state.openai_serving_models, + args.response_role, + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + trust_request_chat_template=args.trust_request_chat_template, + return_tokens_as_token_ids=args.return_tokens_as_token_ids, + enable_auto_tools=args.enable_auto_tool_choice, + exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none, + tool_parser=args.tool_call_parser, + reasoning_parser=args.structured_outputs_config.reasoning_parser, + enable_prompt_tokens_details=args.enable_prompt_tokens_details, + enable_force_include_usage=args.enable_force_include_usage, + enable_log_outputs=args.enable_log_outputs, + log_error_stack=args.log_error_stack, + ) + if "generate" in supported_tasks + else None + ) + 
state.openai_serving_completion = ( + OpenAIServingCompletion( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + return_tokens_as_token_ids=args.return_tokens_as_token_ids, + enable_prompt_tokens_details=args.enable_prompt_tokens_details, + enable_force_include_usage=args.enable_force_include_usage, + log_error_stack=args.log_error_stack, + ) + if "generate" in supported_tasks + else None + ) + state.openai_serving_pooling = ( + ( + OpenAIServingPooling( + engine_client, + state.openai_serving_models, + supported_tasks=supported_tasks, + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + trust_request_chat_template=args.trust_request_chat_template, + log_error_stack=args.log_error_stack, + ) + ) + if any(task in POOLING_TASKS for task in supported_tasks) + else None + ) + state.openai_serving_embedding = ( + OpenAIServingEmbedding( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + trust_request_chat_template=args.trust_request_chat_template, + log_error_stack=args.log_error_stack, + ) + if "embed" in supported_tasks + else None + ) + state.openai_serving_classification = ( + ServingClassification( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + trust_request_chat_template=args.trust_request_chat_template, + log_error_stack=args.log_error_stack, + ) + if "classify" in supported_tasks + else None + ) + state.openai_serving_scores = ( + ServingScores( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + log_error_stack=args.log_error_stack, + ) + if ("embed" in supported_tasks or "score" in supported_tasks) + else None + ) + 
state.openai_serving_tokenization = OpenAIServingTokenization( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + trust_request_chat_template=args.trust_request_chat_template, + log_error_stack=args.log_error_stack, + ) + state.openai_serving_transcription = ( + OpenAIServingTranscription( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + log_error_stack=args.log_error_stack, + enable_force_include_usage=args.enable_force_include_usage, + ) + if "transcription" in supported_tasks + else None + ) + state.openai_serving_translation = ( + OpenAIServingTranslation( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + log_error_stack=args.log_error_stack, + enable_force_include_usage=args.enable_force_include_usage, + ) + if "transcription" in supported_tasks + else None + ) + state.anthropic_serving_messages = ( + AnthropicServingMessages( + engine_client, + state.openai_serving_models, + args.response_role, + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + return_tokens_as_token_ids=args.return_tokens_as_token_ids, + enable_auto_tools=args.enable_auto_tool_choice, + tool_parser=args.tool_call_parser, + reasoning_parser=args.structured_outputs_config.reasoning_parser, + enable_prompt_tokens_details=args.enable_prompt_tokens_details, + enable_force_include_usage=args.enable_force_include_usage, + ) + if "generate" in supported_tasks + else None + ) + state.serving_tokens = ( + ServingTokens( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + return_tokens_as_token_ids=args.return_tokens_as_token_ids, + log_error_stack=args.log_error_stack, + enable_prompt_tokens_details=args.enable_prompt_tokens_details, + enable_log_outputs=args.enable_log_outputs, + 
def create_server_socket(addr: tuple[str, int]) -> socket.socket:
    """Create a TCP stream socket bound to ``addr``.

    The address family is ``AF_INET6`` when ``is_valid_ipv6_address`` accepts
    the host part of ``addr`` and ``AF_INET`` otherwise. ``SO_REUSEADDR`` and
    ``SO_REUSEPORT`` are set before binding. ``listen()`` is not called here.

    Args:
        addr: ``(host, port)`` pair passed to ``socket.bind``.

    Returns:
        The bound socket.

    Raises:
        OSError: If an option cannot be set or the address cannot be bound.
            The socket is closed before the error propagates.
    """
    family = socket.AF_INET6 if is_valid_ipv6_address(addr[0]) else socket.AF_INET

    sock = socket.socket(family=family, type=socket.SOCK_STREAM)
    try:
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
        sock.bind(addr)
    except Exception:
        # Do not leak the descriptor when configuration or bind fails.
        sock.close()
        raise

    return sock


def create_server_unix_socket(path: str) -> socket.socket:
    """Create a Unix domain stream socket bound to ``path``.

    Args:
        path: Filesystem path passed to ``socket.bind``.

    Returns:
        The bound socket. ``listen()`` is not called here.

    Raises:
        OSError: If the path cannot be bound. The socket is closed before the
            error propagates.
    """
    sock = socket.socket(family=socket.AF_UNIX, type=socket.SOCK_STREAM)
    try:
        sock.bind(path)
    except Exception:
        # Do not leak the descriptor when bind fails.
        sock.close()
        raise
    return sock


def validate_api_server_args(args):
    """Check that the selected tool-call and reasoning parsers are registered.

    Args:
        args: Parsed CLI namespace. Reads ``enable_auto_tool_choice``,
            ``tool_call_parser`` and
            ``structured_outputs_config.reasoning_parser``.

    Raises:
        KeyError: If auto tool choice is enabled with an unregistered tool
            call parser, or a reasoning parser is set but not registered.
    """
    valid_tool_parses = ToolParserManager.list_registered()
    if args.enable_auto_tool_choice and args.tool_call_parser not in valid_tool_parses:
        raise KeyError(
            f"invalid tool call parser: {args.tool_call_parser} "
            f"(chose from {{ {','.join(valid_tool_parses)} }})"
        )

    valid_reasoning_parsers = ReasoningParserManager.list_registered()
    if (
        reasoning_parser := args.structured_outputs_config.reasoning_parser
    ) and reasoning_parser not in valid_reasoning_parsers:
        raise KeyError(
            f"invalid reasoning parser: {reasoning_parser} "
            f"(chose from {{ {','.join(valid_reasoning_parsers)} }})"
        )
engine is set up. + # This avoids race conditions with ray. + # see https://github.com/vllm-project/vllm/issues/8204 + if args.uds: + sock = create_server_unix_socket(args.uds) + else: + sock_addr = (args.host or "", args.port) + sock = create_server_socket(sock_addr) + + # workaround to avoid footguns where uvicorn drops requests with too + # many concurrent requests active + set_ulimit() + + def signal_handler(*_) -> None: + # Interrupt server on sigterm while initializing + raise KeyboardInterrupt("terminated") + + signal.signal(signal.SIGTERM, signal_handler) + + if args.uds: + listen_address = f"unix:{args.uds}" + else: + addr, port = sock_addr + is_ssl = args.ssl_keyfile and args.ssl_certfile + host_part = f"[{addr}]" if is_valid_ipv6_address(addr) else addr or "0.0.0.0" + listen_address = f"http{'s' if is_ssl else ''}://{host_part}:{port}" + return listen_address, sock + + +async def run_server(args, **uvicorn_kwargs) -> None: + """Run a single-worker API server.""" + + # Add process-specific prefix to stdout and stderr. 
async def run_server_worker(
    listen_address, sock, args, client_config=None, **uvicorn_kwargs
) -> None:
    """Run a single API server worker.

    Imports any configured parser plugins, builds the engine client and the
    app, initialises the app state, and serves HTTP on ``sock`` until the
    server shuts down.

    Args:
        listen_address: Address string; only used in the startup log message.
        sock: Pre-bound socket handed to ``serve_http``. It is closed when
            the shutdown task finishes, whether or not it raised.
        args: Parsed CLI namespace.
        client_config: Forwarded to ``build_async_engine_client``.
        **uvicorn_kwargs: Extra keyword arguments forwarded to ``serve_http``.
            A ``log_config`` entry is added when a logging config is loaded.
    """

    # NOTE(review): the `> 3` length check presumably filters out placeholder
    # or empty plugin values — confirm the intent.
    if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
        ToolParserManager.import_tool_parser(args.tool_parser_plugin)

    if args.reasoning_parser_plugin and len(args.reasoning_parser_plugin) > 3:
        ReasoningParserManager.import_reasoning_parser(args.reasoning_parser_plugin)

    # Load logging config for uvicorn if specified
    log_config = load_log_config(args.log_config_file)
    if log_config is not None:
        uvicorn_kwargs["log_config"] = log_config

    async with build_async_engine_client(
        args,
        client_config=client_config,
    ) as engine_client:
        maybe_register_tokenizer_info_endpoint(args)
        app = build_app(args)

        await init_app_state(engine_client, app.state, args)

        logger.info(
            "Starting vLLM API server %d on %s",
            engine_client.vllm_config.parallel_config._api_process_rank,
            listen_address,
        )
        # serve_http is awaited for its return value, which is itself awaited
        # below once the engine client context has been exited.
        shutdown_task = await serve_http(
            app,
            sock=sock,
            enable_ssl_refresh=args.enable_ssl_refresh,
            host=args.host,
            port=args.port,
            log_level=args.uvicorn_log_level,
            # NOTE: When the 'disable_uvicorn_access_log' value is True,
            # no access log will be output.
            access_log=not args.disable_uvicorn_access_log,
            timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
            ssl_keyfile=args.ssl_keyfile,
            ssl_certfile=args.ssl_certfile,
            ssl_ca_certs=args.ssl_ca_certs,
            ssl_cert_reqs=args.ssl_cert_reqs,
            h11_max_incomplete_event_size=args.h11_max_incomplete_event_size,
            h11_max_header_count=args.h11_max_header_count,
            **uvicorn_kwargs,
        )

    # NB: Await server shutdown only after the backend context is exited
    try:
        await shutdown_task
    finally:
        # Always release the listening socket, even if shutdown raised.
        sock.close()
+""" + +import argparse +import json +import ssl +from collections.abc import Sequence +from dataclasses import field +from typing import Literal + +from pydantic.dataclasses import dataclass + +import vllm.envs as envs +from vllm.config import config +from vllm.engine.arg_utils import AsyncEngineArgs, optional_type +from vllm.entrypoints.chat_utils import ( + ChatTemplateContentFormatOption, + validate_chat_template, +) +from vllm.entrypoints.constants import ( + H11_MAX_HEADER_COUNT_DEFAULT, + H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT, +) +from vllm.entrypoints.openai.serving_models import LoRAModulePath +from vllm.entrypoints.openai.tool_parsers import ToolParserManager +from vllm.logger import init_logger +from vllm.utils.argparse_utils import FlexibleArgumentParser + +logger = init_logger(__name__) + + +class LoRAParserAction(argparse.Action): + def __call__( + self, + parser: argparse.ArgumentParser, + namespace: argparse.Namespace, + values: str | Sequence[str] | None, + option_string: str | None = None, + ): + if values is None: + values = [] + if isinstance(values, str): + raise TypeError("Expected values to be a list") + + lora_list: list[LoRAModulePath] = [] + for item in values: + if item in [None, ""]: # Skip if item is None or empty string + continue + if "=" in item and "," not in item: # Old format: name=path + name, path = item.split("=") + lora_list.append(LoRAModulePath(name, path)) + else: # Assume JSON format + try: + lora_dict = json.loads(item) + lora = LoRAModulePath(**lora_dict) + lora_list.append(lora) + except json.JSONDecodeError: + parser.error(f"Invalid JSON format for --lora-modules: {item}") + except TypeError as e: + parser.error( + f"Invalid fields for --lora-modules: {item} - {str(e)}" + ) + setattr(namespace, self.dest, lora_list) + + +@config +@dataclass +class FrontendArgs: + """Arguments for the OpenAI-compatible frontend server.""" + + host: str | None = None + """Host name.""" + port: int = 8000 + """Port number.""" + uds: str | 
None = None + """Unix domain socket path. If set, host and port arguments are ignored.""" + uvicorn_log_level: Literal[ + "debug", "info", "warning", "error", "critical", "trace" + ] = "info" + """Log level for uvicorn.""" + disable_uvicorn_access_log: bool = False + """Disable uvicorn access log.""" + allow_credentials: bool = False + """Allow credentials.""" + allowed_origins: list[str] = field(default_factory=lambda: ["*"]) + """Allowed origins.""" + allowed_methods: list[str] = field(default_factory=lambda: ["*"]) + """Allowed methods.""" + allowed_headers: list[str] = field(default_factory=lambda: ["*"]) + """Allowed headers.""" + api_key: list[str] | None = None + """If provided, the server will require one of these keys to be presented in + the header.""" + lora_modules: list[LoRAModulePath] | None = None + """LoRA modules configurations in either 'name=path' format or JSON format + or JSON list format. Example (old format): `'name=path'` Example (new + format): `{\"name\": \"name\", \"path\": \"lora_path\", + \"base_model_name\": \"id\"}`""" + chat_template: str | None = None + """The file path to the chat template, or the template in single-line form + for the specified model.""" + chat_template_content_format: ChatTemplateContentFormatOption = "auto" + """The format to render message content within a chat template. + + * "string" will render the content as a string. Example: `"Hello World"` + * "openai" will render the content as a list of dictionaries, similar to + OpenAI schema. Example: `[{"type": "text", "text": "Hello world!"}]`""" + trust_request_chat_template: bool = False + """Whether to trust the chat template provided in the request. 
If False, + the server will always use the chat template specified by `--chat-template` + or the ones from tokenizer.""" + response_role: str = "assistant" + """The role name to return if `request.add_generation_prompt=true`.""" + ssl_keyfile: str | None = None + """The file path to the SSL key file.""" + ssl_certfile: str | None = None + """The file path to the SSL cert file.""" + ssl_ca_certs: str | None = None + """The CA certificates file.""" + enable_ssl_refresh: bool = False + """Refresh SSL Context when SSL certificate files change""" + ssl_cert_reqs: int = int(ssl.CERT_NONE) + """Whether client certificate is required (see stdlib ssl module's).""" + root_path: str | None = None + """FastAPI root_path when app is behind a path based routing proxy.""" + middleware: list[str] = field(default_factory=lambda: []) + """Additional ASGI middleware to apply to the app. We accept multiple + --middleware arguments. The value should be an import path. If a function + is provided, vLLM will add it to the server using + `@app.middleware('http')`. If a class is provided, vLLM will + add it to the server using `app.add_middleware()`.""" + return_tokens_as_token_ids: bool = False + """When `--max-logprobs` is specified, represents single tokens as + strings of the form 'token_id:{token_id}' so that tokens that are not + JSON-encodable can be identified.""" + disable_frontend_multiprocessing: bool = False + """If specified, will run the OpenAI frontend server in the same process as + the model serving engine.""" + enable_request_id_headers: bool = False + """If specified, API server will add X-Request-Id header to responses.""" + enable_auto_tool_choice: bool = False + """Enable auto tool choice for supported models. 
Use `--tool-call-parser` + to specify which parser to use.""" + exclude_tools_when_tool_choice_none: bool = False + """If specified, exclude tool definitions in prompts when + tool_choice='none'.""" + tool_call_parser: str | None = None + """Select the tool call parser depending on the model that you're using. + This is used to parse the model-generated tool call into OpenAI API format. + Required for `--enable-auto-tool-choice`. You can choose any option from + the built-in parsers or register a plugin via `--tool-parser-plugin`.""" + tool_parser_plugin: str = "" + """Special the tool parser plugin write to parse the model-generated tool + into OpenAI API format, the name register in this plugin can be used in + `--tool-call-parser`.""" + tool_server: str | None = None + """Comma-separated list of host:port pairs (IPv4, IPv6, or hostname). + Examples: 127.0.0.1:8000, [::1]:8000, localhost:1234. Or `demo` for demo + purpose.""" + log_config_file: str | None = envs.VLLM_LOGGING_CONFIG_PATH + """Path to logging config JSON file for both vllm and uvicorn""" + max_log_len: int | None = None + """Max number of prompt characters or prompt ID numbers being printed in + log. The default of None means unlimited.""" + disable_fastapi_docs: bool = False + """Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint.""" + enable_prompt_tokens_details: bool = False + """If set to True, enable prompt_tokens_details in usage.""" + enable_server_load_tracking: bool = False + """If set to True, enable tracking server_load_metrics in the app state.""" + enable_force_include_usage: bool = False + """If set to True, including usage on every request.""" + enable_tokenizer_info_endpoint: bool = False + """Enable the /get_tokenizer_info endpoint. May expose chat + templates and other tokenizer configuration.""" + enable_log_outputs: bool = False + """If True, log model outputs (generations). 
+ Requires --enable-log-requests.""" + h11_max_incomplete_event_size: int = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT + """Maximum size (bytes) of an incomplete HTTP event (header or body) for + h11 parser. Helps mitigate header abuse. Default: 4194304 (4 MB).""" + h11_max_header_count: int = H11_MAX_HEADER_COUNT_DEFAULT + """Maximum number of HTTP headers allowed in a request for h11 parser. + Helps mitigate header abuse. Default: 256.""" + log_error_stack: bool = envs.VLLM_SERVER_DEV_MODE + """If set to True, log the stack trace of error responses""" + tokens_only: bool = False + """ + If set to True, only enable the Tokens In<>Out endpoint. + This is intended for use in a Disaggregated Everything setup. + """ + + @staticmethod + def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: + from vllm.engine.arg_utils import get_kwargs + + frontend_kwargs = get_kwargs(FrontendArgs) + + # Special case: allowed_origins, allowed_methods, allowed_headers all + # need json.loads type + # Should also remove nargs + frontend_kwargs["allowed_origins"]["type"] = json.loads + frontend_kwargs["allowed_methods"]["type"] = json.loads + frontend_kwargs["allowed_headers"]["type"] = json.loads + del frontend_kwargs["allowed_origins"]["nargs"] + del frontend_kwargs["allowed_methods"]["nargs"] + del frontend_kwargs["allowed_headers"]["nargs"] + + # Special case: LoRA modules need custom parser action and + # optional_type(str) + frontend_kwargs["lora_modules"]["type"] = optional_type(str) + frontend_kwargs["lora_modules"]["action"] = LoRAParserAction + + # Special case: Middleware needs to append action + frontend_kwargs["middleware"]["action"] = "append" + frontend_kwargs["middleware"]["type"] = str + if "nargs" in frontend_kwargs["middleware"]: + del frontend_kwargs["middleware"]["nargs"] + frontend_kwargs["middleware"]["default"] = [] + + # Special case: Tool call parser shows built-in options. 
+ valid_tool_parsers = list(ToolParserManager.list_registered()) + parsers_str = ",".join(valid_tool_parsers) + frontend_kwargs["tool_call_parser"]["metavar"] = ( + f"{{{parsers_str}}} or name registered in --tool-parser-plugin" + ) + + frontend_group = parser.add_argument_group( + title="Frontend", + description=FrontendArgs.__doc__, + ) + + for key, value in frontend_kwargs.items(): + frontend_group.add_argument(f"--{key.replace('_', '-')}", **value) + + return parser + + +def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: + """Create the CLI argument parser used by the OpenAI API server. + + We rely on the helper methods of `FrontendArgs` and `AsyncEngineArgs` to + register all arguments instead of manually enumerating them here. This + avoids code duplication and keeps the argument definitions in one place. + """ + parser.add_argument( + "model_tag", + type=str, + nargs="?", + help="The model tag to serve (optional if specified in config)", + ) + parser.add_argument( + "--headless", + action="store_true", + default=False, + help="Run in headless mode. See multi-node data parallel " + "documentation for more details.", + ) + parser.add_argument( + "--api-server-count", + "-asc", + type=int, + default=1, + help="How many API server processes to run.", + ) + parser.add_argument( + "--config", + help="Read CLI options from a config file. 
def create_orca_header(
    metrics_format: str, named_metrics: list[tuple[str, float]]
) -> Mapping[str, str] | None:
    """
    Creates ORCA headers named 'endpoint-load-metrics' in the specified format
    and adds custom metrics to named_metrics.
    ORCA headers format description: https://docs.google.com/document/d/1C1ybMmDKJIVlrbOLbywhu9iRYo4rilR-cT50OTtOFTs/edit?tab=t.0
    ORCA proto https://github.com/cncf/xds/blob/main/xds/data/orca/v3/orca_load_report.proto

    Entries of ``named_metrics`` whose name is not a ``str`` or whose value is
    not a ``float`` are dropped.

    Parameters:
    - metrics_format (str): The format of the header ('TEXT', 'JSON'),
      matched case-insensitively.
    - named_metrics (List[Tuple[str, float]]): List of tuples with metric names
      and their corresponding double values.

    Returns:
    - Optional[Mapping[str,str]]: A dictionary with header key as
      'endpoint-load-metrics' and values as the ORCA header strings with
      format prefix and data in with named_metrics in. None when the format
      is not supported.
    """
    fmt = metrics_format.lower()
    if fmt not in ("text", "json"):
        # Log the caller-supplied value, not the builtin `format` function.
        logger.warning(
            "Warning: `%s` format is not supported in the ORCA response header",
            metrics_format,
        )
        return None

    # Apply the type filter once; both output formats use the same entries.
    valid_metrics = [
        (metric_name, value)
        for metric_name, value in named_metrics
        if isinstance(metric_name, str) and isinstance(value, float)
    ]

    header: dict[str, str] = {}
    if fmt == "text":
        # output example:
        # endpoint-load-metrics: TEXT named_metrics.kv_cache_utilization=0.4
        native_http_header = ", ".join(
            f"named_metrics.{metric_name}={value}"
            for metric_name, value in valid_metrics
        )
        header["endpoint-load-metrics"] = f"TEXT {native_http_header}"
    else:
        # output example:
        # endpoint-load-metrics: JSON {"named_metrics": {"custom-metric-util": 0.4}}
        orca_report = {"named_metrics": dict(valid_metrics)}
        header["endpoint-load-metrics"] = f"JSON {json.dumps(orca_report)}"

    logger.info("Created ORCA header %s", header)

    return header
+ """ + named_metrics: list[tuple[str, float]] = [] + # Map from prometheus metric names to ORCA named metrics. + prometheus_to_orca_metrics = { + "vllm:kv_cache_usage_perc": "kv_cache_usage_perc", + "vllm:num_requests_waiting": "num_requests_waiting", + } + metrics = get_metrics_snapshot() + for metric in metrics: + orca_name = prometheus_to_orca_metrics.get(metric.name) + # If this metric is mapped into ORCA, then add it to the report. + # Note: Only Gauge metrics are currently supported. + if orca_name is not None and isinstance(metric, Gauge): + named_metrics.append((str(orca_name), float(metric.value))) + return named_metrics + + +def metrics_header(metrics_format: str) -> Mapping[str, str] | None: + """ + Creates ORCA headers named 'endpoint-load-metrics' in the specified format. + Metrics are collected from Prometheus using `get_named_metrics_from_prometheus()`. + + ORCA headers format description: https://docs.google.com/document/d/1C1ybMmDKJIVlrbOLbywhu9iRYo4rilR-cT50OTtOFTs/edit?tab=t.0 + ORCA proto https://github.com/cncf/xds/blob/main/xds/data/orca/v3/orca_load_report.proto + + Parameters: + - metrics_format (str): The format of the header ('TEXT', 'JSON'). + + Returns: + - Optional[Mapping[str,str]]: A dictionary with header key as + 'endpoint-load-metrics' and values as the ORCA header strings with + format prefix and data in with named_metrics in. + """ + if not metrics_format: + return None + # Get named metrics from prometheus. 
+ named_metrics = get_named_metrics_from_prometheus() + return create_orca_header(metrics_format, named_metrics) diff --git a/entrypoints/openai/protocol.py b/entrypoints/openai/protocol.py new file mode 100644 index 0000000..65bd15b --- /dev/null +++ b/entrypoints/openai/protocol.py @@ -0,0 +1,3299 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Adapted from +# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py +import json +import time +from http import HTTPStatus +from typing import Annotated, Any, ClassVar, Generic, Literal, TypeAlias, TypeVar + +import regex as re +import torch +from fastapi import HTTPException, UploadFile +from openai.types.chat.chat_completion_audio import ( + ChatCompletionAudio as OpenAIChatCompletionAudio, +) +from openai.types.chat.chat_completion_message import Annotation as OpenAIAnnotation +from openai.types.responses import ( + ResponseCodeInterpreterCallCodeDeltaEvent, + ResponseCodeInterpreterCallCodeDoneEvent, + ResponseCodeInterpreterCallCompletedEvent, + ResponseCodeInterpreterCallInProgressEvent, + ResponseCodeInterpreterCallInterpretingEvent, + ResponseContentPartAddedEvent, + ResponseContentPartDoneEvent, + ResponseFunctionToolCall, + ResponseInputItemParam, + ResponseOutputItem, + ResponseOutputItemAddedEvent, + ResponseOutputItemDoneEvent, + ResponsePrompt, + ResponseReasoningItem, + ResponseReasoningTextDeltaEvent, + ResponseReasoningTextDoneEvent, + ResponseStatus, + ResponseWebSearchCallCompletedEvent, + ResponseWebSearchCallInProgressEvent, + ResponseWebSearchCallSearchingEvent, +) +from openai.types.responses import ( + ResponseCompletedEvent as OpenAIResponseCompletedEvent, +) +from openai.types.responses import ResponseCreatedEvent as OpenAIResponseCreatedEvent +from openai.types.responses import ( + ResponseInProgressEvent as OpenAIResponseInProgressEvent, +) +from 
class OpenAIBaseModel(BaseModel):
    # Base model for the request/response schemas in this module. Unknown
    # fields are accepted, and a warning is logged when a dict payload
    # carries keys that are neither a field name nor a field alias.

    # OpenAI API does allow extra fields
    model_config = ConfigDict(extra="allow")

    # Cache class field names (filled lazily, one set per concrete class)
    field_names: ClassVar[set[str] | None] = None

    @model_validator(mode="wrap")
    @classmethod
    def __log_extra_fields__(cls, data, handler):
        """Validate ``data`` with ``handler`` and warn about unknown keys.

        Args:
            data: Raw input; only inspected for extra keys when it is a dict.
            handler: Validation callable supplied by the wrap validator.

        Returns:
            Whatever ``handler(data)`` returns.
        """
        result = handler(data)
        if not isinstance(data, dict):
            return result

        # Read the cache from this class's own namespace only. Plain
        # attribute lookup would fall back to a parent's cached set, so a
        # subclass would have its own fields reported as ignored.
        field_names = vars(cls).get("field_names")
        if field_names is None:
            # Get all class field names and their potential aliases
            field_names = set()
            for field_name, field in cls.model_fields.items():
                field_names.add(field_name)
                if alias := getattr(field, "alias", None):
                    field_names.add(alias)
            cls.field_names = field_names

        # Compare against both field names and aliases
        extra_fields = data.keys() - field_names
        if extra_fields:
            logger.warning(
                "The following fields were present in the request but ignored: %s",
                extra_fields,
            )
        return result
+class JsonSchemaResponseFormat(OpenAIBaseModel): + name: str + description: str | None = None + # schema is the field in openai but that causes conflicts with pydantic so + # instead use json_schema with an alias + json_schema: dict[str, Any] | None = Field(default=None, alias="schema") + strict: bool | None = None + + +class LegacyStructuralTag(OpenAIBaseModel): + begin: str + # schema is the field, but that causes conflicts with pydantic so + # instead use structural_tag_schema with an alias + structural_tag_schema: dict[str, Any] | None = Field(default=None, alias="schema") + end: str + + +class LegacyStructuralTagResponseFormat(OpenAIBaseModel): + type: Literal["structural_tag"] + structures: list[LegacyStructuralTag] + triggers: list[str] + + +class StructuralTagResponseFormat(OpenAIBaseModel): + type: Literal["structural_tag"] + format: Any + + +AnyStructuralTagResponseFormat: TypeAlias = ( + LegacyStructuralTagResponseFormat | StructuralTagResponseFormat +) + + +class ResponseFormat(OpenAIBaseModel): + # type must be "json_schema", "json_object", or "text" + type: Literal["text", "json_object", "json_schema"] + json_schema: JsonSchemaResponseFormat | None = None + + +AnyResponseFormat: TypeAlias = ( + ResponseFormat | StructuralTagResponseFormat | LegacyStructuralTagResponseFormat +) + + +class StreamOptions(OpenAIBaseModel): + include_usage: bool | None = True + continuous_usage_stats: bool | None = False + + +class FunctionDefinition(OpenAIBaseModel): + name: str + description: str | None = None + parameters: dict[str, Any] | None = None + + +class ChatCompletionToolsParam(OpenAIBaseModel): + type: Literal["function"] = "function" + function: FunctionDefinition + + +class ChatCompletionNamedFunction(OpenAIBaseModel): + name: str + + +class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel): + function: ChatCompletionNamedFunction + type: Literal["function"] = "function" + + +# extra="forbid" is a workaround to have kwargs as a field, +# see 
https://github.com/pydantic/pydantic/issues/3125 +class LogitsProcessorConstructor(BaseModel): + qualname: str + args: list[Any] | None = None + kwargs: dict[str, Any] | None = None + + model_config = ConfigDict(extra="forbid") + + +LogitsProcessors = list[str | LogitsProcessorConstructor] + + +def get_logits_processors( + processors: LogitsProcessors | None, pattern: str | None +) -> list[Any] | None: + if processors and pattern: + logits_processors = [] + for processor in processors: + qualname = processor if isinstance(processor, str) else processor.qualname + if not re.match(pattern, qualname): + raise ValueError( + f"Logits processor '{qualname}' is not allowed by this " + "server. See --logits-processor-pattern engine argument " + "for more information." + ) + try: + logits_processor = resolve_obj_by_qualname(qualname) + except Exception as e: + raise ValueError( + f"Logits processor '{qualname}' could not be resolved: {e}" + ) from e + if isinstance(processor, LogitsProcessorConstructor): + logits_processor = logits_processor( + *processor.args or [], **processor.kwargs or {} + ) + logits_processors.append(logits_processor) + return logits_processors + elif processors: + raise ValueError( + "The `logits_processors` argument is not supported by this " + "server. See --logits-processor-pattern engine argument " + "for more information." 
+ ) + return None + + +ResponseInputOutputItem: TypeAlias = ( + ResponseInputItemParam | ResponseReasoningItem | ResponseFunctionToolCall +) + + +class ResponsesRequest(OpenAIBaseModel): + # Ordered by official OpenAI API documentation + # https://platform.openai.com/docs/api-reference/responses/create + background: bool | None = False + include: ( + list[ + Literal[ + "code_interpreter_call.outputs", + "computer_call_output.output.image_url", + "file_search_call.results", + "message.input_image.image_url", + "message.output_text.logprobs", + "reasoning.encrypted_content", + ], + ] + | None + ) = None + input: str | list[ResponseInputOutputItem] + instructions: str | None = None + max_output_tokens: int | None = None + max_tool_calls: int | None = None + metadata: Metadata | None = None + model: str | None = None + parallel_tool_calls: bool | None = True + previous_response_id: str | None = None + prompt: ResponsePrompt | None = None + reasoning: Reasoning | None = None + service_tier: Literal["auto", "default", "flex", "scale", "priority"] = "auto" + store: bool | None = True + stream: bool | None = False + temperature: float | None = None + text: ResponseTextConfig | None = None + tool_choice: ToolChoice = "auto" + tools: list[Tool] = Field(default_factory=list) + top_logprobs: int | None = 0 + top_p: float | None = None + truncation: Literal["auto", "disabled"] | None = "disabled" + user: str | None = None + + # --8<-- [start:responses-extra-params] + request_id: str = Field( + default_factory=lambda: f"resp_{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." 
+ ), + ) + mm_processor_kwargs: dict[str, Any] | None = Field( + default=None, + description=("Additional kwargs to pass to the HF processor."), + ) + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling." + ), + ) + cache_salt: str | None = Field( + default=None, + description=( + "If specified, the prefix cache will be salted with the provided " + "string to prevent an attacker to guess prompts in multi-user " + "environments. The salt should be random, protected from " + "access by 3rd parties, and long enough to be " + "unpredictable (e.g., 43 characters base64-encoded, corresponding " + "to 256 bit). Not supported by vLLM engine V0." + ), + ) + + enable_response_messages: bool = Field( + default=False, + description=( + "Dictates whether or not to return messages as part of the " + "response object. Currently only supported for" + "non-background and gpt-oss only. 
" + ), + ) + # similar to input_messages / output_messages in ResponsesResponse + # we take in previous_input_messages (ie in harmony format) + # this cannot be used in conjunction with previous_response_id + # TODO: consider supporting non harmony messages as well + previous_input_messages: list[OpenAIHarmonyMessage | dict] | None = None + # --8<-- [end:responses-extra-params] + + _DEFAULT_SAMPLING_PARAMS = { + "temperature": 1.0, + "top_p": 1.0, + } + + def to_sampling_params( + self, + default_max_tokens: int, + default_sampling_params: dict | None = None, + ) -> SamplingParams: + if self.max_output_tokens is None: + max_tokens = default_max_tokens + else: + max_tokens = min(self.max_output_tokens, default_max_tokens) + + default_sampling_params = default_sampling_params or {} + if (temperature := self.temperature) is None: + temperature = default_sampling_params.get( + "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"] + ) + if (top_p := self.top_p) is None: + top_p = default_sampling_params.get( + "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"] + ) + stop_token_ids = default_sampling_params.get("stop_token_ids") + + # Structured output + structured_outputs = None + if self.text is not None and self.text.format is not None: + response_format = self.text.format + if ( + response_format.type == "json_schema" + and response_format.schema_ is not None + ): + structured_outputs = StructuredOutputsParams( + json=response_format.schema_ + ) + elif response_format.type == "json_object": + raise NotImplementedError("json_object is not supported") + + # TODO: add more parameters + return SamplingParams.from_optional( + temperature=temperature, + top_p=top_p, + max_tokens=max_tokens, + logprobs=self.top_logprobs if self.is_include_output_logprobs() else None, + stop_token_ids=stop_token_ids, + output_kind=( + RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY + ), + structured_outputs=structured_outputs, + ) + + def 
is_include_output_logprobs(self) -> bool: + """Check if the request includes output logprobs.""" + if self.include is None: + return False + return ( + isinstance(self.include, list) + and "message.output_text.logprobs" in self.include + ) + + @model_validator(mode="before") + def validate_background(cls, data): + if not data.get("background"): + return data + if not data.get("store", True): + raise ValueError("background can only be used when `store` is true") + return data + + @model_validator(mode="before") + def validate_prompt(cls, data): + if data.get("prompt") is not None: + raise ValueError("prompt template is not supported") + return data + + @model_validator(mode="before") + def check_cache_salt_support(cls, data): + if data.get("cache_salt") is not None and ( + not isinstance(data["cache_salt"], str) or not data["cache_salt"] + ): + raise ValueError( + "Parameter 'cache_salt' must be a non-empty string if provided." + ) + return data + + @model_validator(mode="before") + def function_call_parsing(cls, data): + """Parse function_call dictionaries into ResponseFunctionToolCall objects. + This ensures Pydantic can properly resolve union types in the input field. + Function calls provided as dicts are converted to ResponseFunctionToolCall + objects before validation, while invalid structures are left for Pydantic + to reject with appropriate error messages. 
+ """ + + input_data = data.get("input") + + # Early return for None, strings, or bytes + # (strings are iterable but shouldn't be processed) + if input_data is None or isinstance(input_data, (str, bytes)): + return data + + # Convert iterators (like ValidatorIterator) to list + if not isinstance(input_data, list): + try: + input_data = list(input_data) + except TypeError: + # Not iterable, leave as-is for Pydantic to handle + return data + + processed_input = [] + for item in input_data: + if isinstance(item, dict) and item.get("type") == "function_call": + try: + processed_input.append(ResponseFunctionToolCall(**item)) + except ValidationError: + # Let Pydantic handle validation for malformed function calls + logger.debug( + "Failed to parse function_call to ResponseFunctionToolCall, " + "leaving for Pydantic validation" + ) + processed_input.append(item) + else: + processed_input.append(item) + + data["input"] = processed_input + return data + + +class ChatCompletionRequest(OpenAIBaseModel): + # Ordered by official OpenAI API documentation + # https://platform.openai.com/docs/api-reference/chat/create + messages: list[ChatCompletionMessageParam] + model: str | None = None + frequency_penalty: float | None = 0.0 + logit_bias: dict[str, float] | None = None + logprobs: bool | None = False + top_logprobs: int | None = 0 + max_tokens: int | None = Field( + default=None, + deprecated="max_tokens is deprecated in favor of " + "the max_completion_tokens field", + ) + max_completion_tokens: int | None = None + n: int | None = 1 + presence_penalty: float | None = 0.0 + response_format: AnyResponseFormat | None = None + seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) + stop: str | list[str] | None = [] + stream: bool | None = False + stream_options: StreamOptions | None = None + temperature: float | None = None + top_p: float | None = None + tools: list[ChatCompletionToolsParam] | None = None + tool_choice: ( + Literal["none"] + | Literal["auto"] + | 
Literal["required"] + | ChatCompletionNamedToolChoiceParam + | None + ) = "none" + reasoning_effort: Literal["low", "medium", "high"] | None = None + include_reasoning: bool = True + + # NOTE this will be ignored by vLLM -- the model determines the behavior + parallel_tool_calls: bool | None = False + user: str | None = None + + # --8<-- [start:chat-completion-sampling-params] + best_of: int | None = None + use_beam_search: bool = False + top_k: int | None = None + min_p: float | None = None + repetition_penalty: float | None = None + length_penalty: float = 1.0 + stop_token_ids: list[int] | None = [] + include_stop_str_in_output: bool = False + ignore_eos: bool = False + min_tokens: int = 0 + skip_special_tokens: bool = True + spaces_between_special_tokens: bool = True + truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None + prompt_logprobs: int | None = None + allowed_token_ids: list[int] | None = None + bad_words: list[str] = Field(default_factory=list) + # --8<-- [end:chat-completion-sampling-params] + + # --8<-- [start:chat-completion-extra-params] + echo: bool = Field( + default=False, + description=( + "If true, the new message will be prepended with the last message " + "if they belong to the same role." + ), + ) + add_generation_prompt: bool = Field( + default=True, + description=( + "If true, the generation prompt will be added to the chat template. " + "This is a parameter used by chat template in tokenizer config of the " + "model." + ), + ) + continue_final_message: bool = Field( + default=False, + description=( + "If this is set, the chat will be formatted so that the final " + "message in the chat is open-ended, without any EOS tokens. The " + "model will continue this message rather than starting a new one. " + 'This allows you to "prefill" part of the model\'s response for it. ' + "Cannot be used at the same time as `add_generation_prompt`." 
+ ), + ) + add_special_tokens: bool = Field( + default=False, + description=( + "If true, special tokens (e.g. BOS) will be added to the prompt " + "on top of what is added by the chat template. " + "For most models, the chat template takes care of adding the " + "special tokens so this should be set to false (as is the " + "default)." + ), + ) + documents: list[dict[str, str]] | None = Field( + default=None, + description=( + "A list of dicts representing documents that will be accessible to " + "the model if it is performing RAG (retrieval-augmented generation)." + " If the template does not support RAG, this argument will have no " + "effect. We recommend that each document should be a dict containing " + '"title" and "text" keys.' + ), + ) + chat_template: str | None = Field( + default=None, + description=( + "A Jinja template to use for this conversion. " + "As of transformers v4.44, default chat template is no longer " + "allowed, so you must provide a chat template if the tokenizer " + "does not define one." + ), + ) + chat_template_kwargs: dict[str, Any] | None = Field( + default=None, + description=( + "Additional keyword args to pass to the template renderer. " + "Will be accessible by the chat template." + ), + ) + mm_processor_kwargs: dict[str, Any] | None = Field( + default=None, + description=("Additional kwargs to pass to the HF processor."), + ) + structured_outputs: StructuredOutputsParams | None = Field( + default=None, + description="Additional kwargs for structured outputs", + ) + guided_json: str | dict | BaseModel | None = Field( + default=None, + description=( + "`guided_json` is deprecated. " + "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " + "Please pass `json` to `structured_outputs` instead." + ), + ) + guided_regex: str | None = Field( + default=None, + description=( + "`guided_regex` is deprecated. " + "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. 
" + "Please pass `regex` to `structured_outputs` instead." + ), + ) + guided_choice: list[str] | None = Field( + default=None, + description=( + "`guided_choice` is deprecated. " + "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " + "Please pass `choice` to `structured_outputs` instead." + ), + ) + guided_grammar: str | None = Field( + default=None, + description=( + "`guided_grammar` is deprecated. " + "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " + "Please pass `grammar` to `structured_outputs` instead." + ), + ) + structural_tag: str | None = Field( + default=None, + description=( + "`structural_tag` is deprecated. " + "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " + "Please pass `structural_tag` to `structured_outputs` instead." + ), + ) + guided_decoding_backend: str | None = Field( + default=None, + description=( + "`guided_decoding_backend` is deprecated. " + "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " + "Please remove it from your request." + ), + ) + guided_whitespace_pattern: str | None = Field( + default=None, + description=( + "`guided_whitespace_pattern` is deprecated. " + "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " + "Please pass `whitespace_pattern` to `structured_outputs` instead." + ), + ) + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling." + ), + ) + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." 
+ ), + ) + logits_processors: LogitsProcessors | None = Field( + default=None, + description=( + "A list of either qualified names of logits processors, or " + "constructor objects, to apply when sampling. A constructor is " + "a JSON object with a required 'qualname' field specifying the " + "qualified name of the processor class/factory, and optional " + "'args' and 'kwargs' fields containing positional and keyword " + "arguments. For example: {'qualname': " + "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': " + "{'param': 'value'}}." + ), + ) + return_tokens_as_token_ids: bool | None = Field( + default=None, + description=( + "If specified with 'logprobs', tokens are represented " + " as strings of the form 'token_id:{token_id}' so that tokens " + "that are not JSON-encodable can be identified." + ), + ) + return_token_ids: bool | None = Field( + default=None, + description=( + "If specified, the result will include token IDs alongside the " + "generated text. In streaming mode, prompt_token_ids is included " + "only in the first chunk, and token_ids contains the delta tokens " + "for each chunk. This is useful for debugging or when you " + "need to map generated text back to input tokens." + ), + ) + cache_salt: str | None = Field( + default=None, + description=( + "If specified, the prefix cache will be salted with the provided " + "string to prevent an attacker to guess prompts in multi-user " + "environments. The salt should be random, protected from " + "access by 3rd parties, and long enough to be " + "unpredictable (e.g., 43 characters base64-encoded, corresponding " + "to 256 bit). Not supported by vLLM engine V0." 
+ ), + ) + kv_transfer_params: dict[str, Any] | None = Field( + default=None, + description="KVTransfer parameters used for disaggregated serving.", + ) + + vllm_xargs: dict[str, str | int | float | list[str | int | float]] | None = Field( + default=None, + description=( + "Additional request parameters with (list of) string or " + "numeric values, used by custom extensions." + ), + ) + + # --8<-- [end:chat-completion-extra-params] + + # Default sampling parameters for chat completion requests + _DEFAULT_SAMPLING_PARAMS: dict = { + "repetition_penalty": 1.0, + "temperature": 1.0, + "top_p": 1.0, + "top_k": 0, + "min_p": 0.0, + } + + def to_beam_search_params( + self, max_tokens: int, default_sampling_params: dict + ) -> BeamSearchParams: + n = self.n if self.n is not None else 1 + if (temperature := self.temperature) is None: + temperature = default_sampling_params.get( + "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"] + ) + + return BeamSearchParams( + beam_width=n, + max_tokens=max_tokens, + ignore_eos=self.ignore_eos, + temperature=temperature, + length_penalty=self.length_penalty, + include_stop_str_in_output=self.include_stop_str_in_output, + ) + + def to_sampling_params( + self, + max_tokens: int, + logits_processor_pattern: str | None, + default_sampling_params: dict, + ) -> SamplingParams: + # Default parameters + if (repetition_penalty := self.repetition_penalty) is None: + repetition_penalty = default_sampling_params.get( + "repetition_penalty", + self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"], + ) + if (temperature := self.temperature) is None: + temperature = default_sampling_params.get( + "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"] + ) + if (top_p := self.top_p) is None: + top_p = default_sampling_params.get( + "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"] + ) + if (top_k := self.top_k) is None: + top_k = default_sampling_params.get( + "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"] + ) + if (min_p := self.min_p) is 
None: + min_p = default_sampling_params.get( + "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"] + ) + + prompt_logprobs = self.prompt_logprobs + if prompt_logprobs is None and self.echo: + prompt_logprobs = self.top_logprobs + + # Forward deprecated guided_* parameters to structured_outputs + if self.structured_outputs is None: + kwargs = dict[str, Any]( + json=self.guided_json, + regex=self.guided_regex, + choice=self.guided_choice, + grammar=self.guided_grammar, + whitespace_pattern=self.guided_whitespace_pattern, + structural_tag=self.structural_tag, + ) + kwargs = {k: v for k, v in kwargs.items() if v is not None} + if len(kwargs) > 0: + self.structured_outputs = StructuredOutputsParams(**kwargs) + + response_format = self.response_format + if response_format is not None: + # If structured outputs wasn't already enabled, + # we must enable it for these features to work + if self.structured_outputs is None: + self.structured_outputs = StructuredOutputsParams() + + # Set structured output params for response format + if response_format is not None: + if response_format.type == "json_object": + self.structured_outputs.json_object = True + elif response_format.type == "json_schema": + json_schema = response_format.json_schema + assert json_schema is not None + self.structured_outputs.json = json_schema.json_schema + elif response_format.type == "structural_tag": + structural_tag = response_format + assert structural_tag is not None and isinstance( + structural_tag, + ( + LegacyStructuralTagResponseFormat, + StructuralTagResponseFormat, + ), + ) + s_tag_obj = structural_tag.model_dump(by_alias=True) + self.structured_outputs.structural_tag = json.dumps(s_tag_obj) + + extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} + if self.kv_transfer_params: + # Pass in kv_transfer_params via extra_args + extra_args["kv_transfer_params"] = self.kv_transfer_params + return SamplingParams.from_optional( + n=self.n, + best_of=self.best_of, + 
presence_penalty=self.presence_penalty, + frequency_penalty=self.frequency_penalty, + repetition_penalty=repetition_penalty, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + seed=self.seed, + stop=self.stop, + stop_token_ids=self.stop_token_ids, + logprobs=self.top_logprobs if self.logprobs else None, + prompt_logprobs=prompt_logprobs, + ignore_eos=self.ignore_eos, + max_tokens=max_tokens, + min_tokens=self.min_tokens, + skip_special_tokens=self.skip_special_tokens, + spaces_between_special_tokens=self.spaces_between_special_tokens, + logits_processors=get_logits_processors( + self.logits_processors, logits_processor_pattern + ), + include_stop_str_in_output=self.include_stop_str_in_output, + truncate_prompt_tokens=self.truncate_prompt_tokens, + output_kind=RequestOutputKind.DELTA + if self.stream + else RequestOutputKind.FINAL_ONLY, + structured_outputs=self.structured_outputs, + logit_bias=self.logit_bias, + bad_words=self.bad_words, + allowed_token_ids=self.allowed_token_ids, + extra_args=extra_args or None, + ) + + @model_validator(mode="before") + @classmethod + def validate_stream_options(cls, data): + if data.get("stream_options") and not data.get("stream"): + raise ValueError("Stream options can only be defined when `stream=True`.") + + return data + + @model_validator(mode="before") + @classmethod + def check_logprobs(cls, data): + if (prompt_logprobs := data.get("prompt_logprobs")) is not None: + if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1): + raise ValueError( + "`prompt_logprobs` are not available when `stream=True`." 
+ ) + + if prompt_logprobs < 0 and prompt_logprobs != -1: + raise ValueError("`prompt_logprobs` must be a positive value or -1.") + if (top_logprobs := data.get("top_logprobs")) is not None: + if top_logprobs < 0 and top_logprobs != -1: + raise ValueError("`top_logprobs` must be a positive value or -1.") + + if (top_logprobs == -1 or top_logprobs > 0) and not data.get("logprobs"): + raise ValueError( + "when using `top_logprobs`, `logprobs` must be set to true." + ) + + return data + + @model_validator(mode="before") + @classmethod + def check_structured_outputs_count(cls, data): + if isinstance(data, ValueError): + raise data + + if data.get("structured_outputs", None) is None: + return data + + structured_outputs_kwargs = data["structured_outputs"] + count = sum( + structured_outputs_kwargs.get(k) is not None + for k in ("json", "regex", "choice") + ) + # you can only use one kind of constraints for structured outputs + if count > 1: + raise ValueError( + "You can only use one kind of constraints for structured " + "outputs ('json', 'regex' or 'choice')." + ) + # you can only either use structured outputs or tools, not both + if count > 1 and data.get("tool_choice", "none") not in ( + "none", + "auto", + "required", + ): + raise ValueError( + "You can only either use constraints for structured outputs " + "or tools, not both." 
+ ) + return data + + @model_validator(mode="before") + @classmethod + def check_tool_usage(cls, data): + # if "tool_choice" is not specified but tools are provided, + # default to "auto" tool_choice + if "tool_choice" not in data and data.get("tools"): + data["tool_choice"] = "auto" + + # if "tool_choice" is "none" -- no validation is needed for tools + if "tool_choice" in data and data["tool_choice"] == "none": + return data + + # if "tool_choice" is specified -- validation + if "tool_choice" in data and data["tool_choice"] is not None: + # ensure that if "tool choice" is specified, tools are present + if "tools" not in data or data["tools"] is None: + raise ValueError("When using `tool_choice`, `tools` must be set.") + + # make sure that tool choice is either a named tool + # OR that it's set to "auto" or "required" + if data["tool_choice"] not in ["auto", "required"] and not isinstance( + data["tool_choice"], dict + ): + raise ValueError( + f"Invalid value for `tool_choice`: {data['tool_choice']}! " + 'Only named tools, "none", "auto" or "required" ' + "are supported." + ) + + # if tool_choice is "required" but the "tools" list is empty, + # override the data to behave like "none" to align with + # OpenAI’s behavior. + if ( + data["tool_choice"] == "required" + and isinstance(data["tools"], list) + and len(data["tools"]) == 0 + ): + data["tool_choice"] = "none" + del data["tools"] + return data + + # ensure that if "tool_choice" is specified as an object, + # it matches a valid tool + correct_usage_message = ( + 'Correct usage: `{"type": "function",' + ' "function": {"name": "my_function"}}`' + ) + if isinstance(data["tool_choice"], dict): + valid_tool = False + function = data["tool_choice"].get("function") + if not isinstance(function, dict): + raise ValueError( + f"Invalid value for `function`: `{function}` in " + f"`tool_choice`! 
{correct_usage_message}" + ) + if "name" not in function: + raise ValueError( + f"Expected field `name` in `function` in " + f"`tool_choice`! {correct_usage_message}" + ) + function_name = function["name"] + if not isinstance(function_name, str) or len(function_name) == 0: + raise ValueError( + f"Invalid `name` in `function`: `{function_name}`" + f" in `tool_choice`! {correct_usage_message}" + ) + for tool in data["tools"]: + if tool["function"]["name"] == function_name: + valid_tool = True + break + if not valid_tool: + raise ValueError( + "The tool specified in `tool_choice` does not match any" + " of the specified `tools`" + ) + return data + + @model_validator(mode="before") + @classmethod + def check_generation_prompt(cls, data): + if data.get("continue_final_message") and data.get("add_generation_prompt"): + raise ValueError( + "Cannot set both `continue_final_message` and " + "`add_generation_prompt` to True." + ) + return data + + @model_validator(mode="before") + @classmethod + def check_cache_salt_support(cls, data): + if data.get("cache_salt") is not None and ( + not isinstance(data["cache_salt"], str) or not data["cache_salt"] + ): + raise ValueError( + "Parameter 'cache_salt' must be a non-empty string if provided." 
+ ) + return data + + +class CompletionRequest(OpenAIBaseModel): + # Ordered by official OpenAI API documentation + # https://platform.openai.com/docs/api-reference/completions/create + model: str | None = None + prompt: list[int] | list[list[int]] | str | list[str] | None = None + best_of: int | None = None + echo: bool | None = False + frequency_penalty: float | None = 0.0 + logit_bias: dict[str, float] | None = None + logprobs: int | None = None + max_tokens: int | None = 16 + n: int = 1 + presence_penalty: float | None = 0.0 + seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) + stop: str | list[str] | None = [] + stream: bool | None = False + stream_options: StreamOptions | None = None + suffix: str | None = None + temperature: float | None = None + top_p: float | None = None + user: str | None = None + + # --8<-- [start:completion-sampling-params] + use_beam_search: bool = False + top_k: int | None = None + min_p: float | None = None + repetition_penalty: float | None = None + length_penalty: float = 1.0 + stop_token_ids: list[int] | None = [] + include_stop_str_in_output: bool = False + ignore_eos: bool = False + min_tokens: int = 0 + skip_special_tokens: bool = True + spaces_between_special_tokens: bool = True + truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None + allowed_token_ids: list[int] | None = None + prompt_logprobs: int | None = None + # --8<-- [end:completion-sampling-params] + + # --8<-- [start:completion-extra-params] + prompt_embeds: bytes | list[bytes] | None = None + add_special_tokens: bool = Field( + default=True, + description=( + "If true (the default), special tokens (e.g. BOS) will be added to " + "the prompt." + ), + ) + response_format: AnyResponseFormat | None = Field( + default=None, + description=( + "Similar to chat completion, this parameter specifies the format " + "of output. 
Only {'type': 'json_object'}, {'type': 'json_schema'}" + ", {'type': 'structural_tag'}, or {'type': 'text' } is supported." + ), + ) + structured_outputs: StructuredOutputsParams | None = Field( + default=None, + description="Additional kwargs for structured outputs", + ) + guided_json: str | dict | BaseModel | None = Field( + default=None, + description=( + "`guided_json` is deprecated. " + "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " + "Please pass `json` to `structured_outputs` instead." + ), + ) + guided_regex: str | None = Field( + default=None, + description=( + "`guided_regex` is deprecated. " + "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " + "Please pass `regex` to `structured_outputs` instead." + ), + ) + guided_choice: list[str] | None = Field( + default=None, + description=( + "`guided_choice` is deprecated. " + "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " + "Please pass `choice` to `structured_outputs` instead." + ), + ) + guided_grammar: str | None = Field( + default=None, + description=( + "`guided_grammar` is deprecated. " + "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " + "Please pass `grammar` to `structured_outputs` instead." + ), + ) + structural_tag: str | None = Field( + default=None, + description=("If specified, the output will follow the structural tag schema."), + ) + guided_decoding_backend: str | None = Field( + default=None, + description=( + "`guided_decoding_backend` is deprecated. " + "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " + "Please remove it from your request." + ), + ) + guided_whitespace_pattern: str | None = Field( + default=None, + description=( + "`guided_whitespace_pattern` is deprecated. " + "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " + "Please pass `whitespace_pattern` to `structured_outputs` instead." 
+ ), + ) + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling." + ), + ) + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." + ), + ) + logits_processors: LogitsProcessors | None = Field( + default=None, + description=( + "A list of either qualified names of logits processors, or " + "constructor objects, to apply when sampling. A constructor is " + "a JSON object with a required 'qualname' field specifying the " + "qualified name of the processor class/factory, and optional " + "'args' and 'kwargs' fields containing positional and keyword " + "arguments. For example: {'qualname': " + "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': " + "{'param': 'value'}}." + ), + ) + + return_tokens_as_token_ids: bool | None = Field( + default=None, + description=( + "If specified with 'logprobs', tokens are represented " + " as strings of the form 'token_id:{token_id}' so that tokens " + "that are not JSON-encodable can be identified." + ), + ) + return_token_ids: bool | None = Field( + default=None, + description=( + "If specified, the result will include token IDs alongside the " + "generated text. In streaming mode, prompt_token_ids is included " + "only in the first chunk, and token_ids contains the delta tokens " + "for each chunk. This is useful for debugging or when you " + "need to map generated text back to input tokens." + ), + ) + + cache_salt: str | None = Field( + default=None, + description=( + "If specified, the prefix cache will be salted with the provided " + "string to prevent an attacker to guess prompts in multi-user " + "environments. 
The salt should be random, protected from " + "access by 3rd parties, and long enough to be " + "unpredictable (e.g., 43 characters base64-encoded, corresponding " + "to 256 bit). Not supported by vLLM engine V0." + ), + ) + + kv_transfer_params: dict[str, Any] | None = Field( + default=None, + description="KVTransfer parameters used for disaggregated serving.", + ) + + vllm_xargs: dict[str, str | int | float] | None = Field( + default=None, + description=( + "Additional request parameters with string or " + "numeric values, used by custom extensions." + ), + ) + + # --8<-- [end:completion-extra-params] + + # Default sampling parameters for completion requests + _DEFAULT_SAMPLING_PARAMS: dict = { + "repetition_penalty": 1.0, + "temperature": 1.0, + "top_p": 1.0, + "top_k": 0, + "min_p": 0.0, + } + + def to_beam_search_params( + self, + max_tokens: int, + default_sampling_params: dict | None = None, + ) -> BeamSearchParams: + if default_sampling_params is None: + default_sampling_params = {} + n = self.n if self.n is not None else 1 + + if (temperature := self.temperature) is None: + temperature = default_sampling_params.get("temperature", 1.0) + + return BeamSearchParams( + beam_width=n, + max_tokens=max_tokens, + ignore_eos=self.ignore_eos, + temperature=temperature, + length_penalty=self.length_penalty, + include_stop_str_in_output=self.include_stop_str_in_output, + ) + + def to_sampling_params( + self, + max_tokens: int, + logits_processor_pattern: str | None, + default_sampling_params: dict | None = None, + ) -> SamplingParams: + if default_sampling_params is None: + default_sampling_params = {} + + # Default parameters + if (repetition_penalty := self.repetition_penalty) is None: + repetition_penalty = default_sampling_params.get( + "repetition_penalty", + self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"], + ) + if (temperature := self.temperature) is None: + temperature = default_sampling_params.get( + "temperature", 
self._DEFAULT_SAMPLING_PARAMS["temperature"] + ) + if (top_p := self.top_p) is None: + top_p = default_sampling_params.get( + "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"] + ) + if (top_k := self.top_k) is None: + top_k = default_sampling_params.get( + "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"] + ) + if (min_p := self.min_p) is None: + min_p = default_sampling_params.get( + "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"] + ) + + prompt_logprobs = self.prompt_logprobs + if prompt_logprobs is None and self.echo: + prompt_logprobs = self.logprobs + + echo_without_generation = self.echo and self.max_tokens == 0 + + guided_json_object = None + if self.response_format is not None: + if self.response_format.type == "json_object": + guided_json_object = True + elif self.response_format.type == "json_schema": + json_schema = self.response_format.json_schema + assert json_schema is not None + self.guided_json = json_schema.json_schema + elif self.response_format.type == "structural_tag": + structural_tag = self.response_format + assert structural_tag is not None and isinstance( + structural_tag, StructuralTagResponseFormat + ) + s_tag_obj = structural_tag.model_dump(by_alias=True) + self.structural_tag = json.dumps(s_tag_obj) + + # Forward deprecated guided_* parameters to structured_outputs + if self.structured_outputs is None: + kwargs = dict[str, Any]( + json=self.guided_json, + json_object=guided_json_object, + regex=self.guided_regex, + choice=self.guided_choice, + grammar=self.guided_grammar, + whitespace_pattern=self.guided_whitespace_pattern, + ) + kwargs = {k: v for k, v in kwargs.items() if v is not None} + if len(kwargs) > 0: + self.structured_outputs = StructuredOutputsParams(**kwargs) + + extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} + if self.kv_transfer_params: + # Pass in kv_transfer_params via extra_args + extra_args["kv_transfer_params"] = self.kv_transfer_params + return SamplingParams.from_optional( + n=self.n, + 
best_of=self.best_of, + presence_penalty=self.presence_penalty, + frequency_penalty=self.frequency_penalty, + repetition_penalty=repetition_penalty, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + seed=self.seed, + stop=self.stop, + stop_token_ids=self.stop_token_ids, + logprobs=self.logprobs, + ignore_eos=self.ignore_eos, + max_tokens=max_tokens if not echo_without_generation else 1, + min_tokens=self.min_tokens, + prompt_logprobs=prompt_logprobs, + skip_special_tokens=self.skip_special_tokens, + spaces_between_special_tokens=self.spaces_between_special_tokens, + include_stop_str_in_output=self.include_stop_str_in_output, + logits_processors=get_logits_processors( + self.logits_processors, logits_processor_pattern + ), + truncate_prompt_tokens=self.truncate_prompt_tokens, + output_kind=RequestOutputKind.DELTA + if self.stream + else RequestOutputKind.FINAL_ONLY, + structured_outputs=self.structured_outputs, + logit_bias=self.logit_bias, + allowed_token_ids=self.allowed_token_ids, + extra_args=extra_args or None, + ) + + @model_validator(mode="before") + @classmethod + def check_structured_outputs_count(cls, data): + if data.get("structured_outputs", None) is None: + return data + + structured_outputs_kwargs = data["structured_outputs"] + count = sum( + structured_outputs_kwargs.get(k) is not None + for k in ("json", "regex", "choice") + ) + if count > 1: + raise ValueError( + "You can only use one kind of constraints for structured " + "outputs ('json', 'regex' or 'choice')." + ) + return data + + @model_validator(mode="before") + @classmethod + def check_logprobs(cls, data): + if (prompt_logprobs := data.get("prompt_logprobs")) is not None: + if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1): + raise ValueError( + "`prompt_logprobs` are not available when `stream=True`." 
+ ) + + if prompt_logprobs < 0 and prompt_logprobs != -1: + raise ValueError("`prompt_logprobs` must be a positive value or -1.") + if (logprobs := data.get("logprobs")) is not None and logprobs < 0: + raise ValueError("`logprobs` must be a positive value.") + + return data + + @model_validator(mode="before") + @classmethod + def validate_stream_options(cls, data): + if data.get("stream_options") and not data.get("stream"): + raise ValueError("Stream options can only be defined when `stream=True`.") + + return data + + @model_validator(mode="before") + @classmethod + def validate_prompt_and_prompt_embeds(cls, data): + prompt = data.get("prompt") + prompt_embeds = data.get("prompt_embeds") + + prompt_is_empty = prompt is None or (isinstance(prompt, str) and prompt == "") + embeds_is_empty = prompt_embeds is None or ( + isinstance(prompt_embeds, list) and len(prompt_embeds) == 0 + ) + + if prompt_is_empty and embeds_is_empty: + raise ValueError( + "Either prompt or prompt_embeds must be provided and non-empty." + ) + + return data + + @model_validator(mode="before") + @classmethod + def check_cache_salt_support(cls, data): + if data.get("cache_salt") is not None and ( + not isinstance(data["cache_salt"], str) or not data["cache_salt"] + ): + raise ValueError( + "Parameter 'cache_salt' must be a non-empty string if provided." + ) + return data + + +class EmbeddingCompletionRequest(OpenAIBaseModel): + # Ordered by official OpenAI API documentation + # https://platform.openai.com/docs/api-reference/embeddings + model: str | None = None + input: list[int] | list[list[int]] | str | list[str] + encoding_format: EncodingFormat = "float" + dimensions: int | None = None + user: str | None = None + truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None + + # --8<-- [start:embedding-extra-params] + add_special_tokens: bool = Field( + default=True, + description=( + "If true (the default), special tokens (e.g. BOS) will be added to " + "the prompt." 
+ ), + ) + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling." + ), + ) + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." + ), + ) + normalize: bool | None = Field( + default=None, + description="Whether to normalize the embeddings outputs. Default is True.", + ) + embed_dtype: EmbedDType = Field( + default="float32", + description=( + "What dtype to use for encoding. Default to using float32 for base64 " + "encoding to match the OpenAI python client behavior. " + "This parameter will affect base64 and binary_response." + ), + ) + endianness: Endianness = Field( + default="native", + description=( + "What endianness to use for encoding. Default to using native for " + "base64 encoding to match the OpenAI python client behavior." + "This parameter will affect base64 and binary_response." + ), + ) + # --8<-- [end:embedding-extra-params] + + def to_pooling_params(self): + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + dimensions=self.dimensions, + normalize=self.normalize, + ) + + +class EmbeddingChatRequest(OpenAIBaseModel): + model: str | None = None + messages: list[ChatCompletionMessageParam] + + encoding_format: EncodingFormat = "float" + dimensions: int | None = None + user: str | None = None + truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None + + # --8<-- [start:chat-embedding-extra-params] + add_generation_prompt: bool = Field( + default=False, + description=( + "If true, the generation prompt will be added to the chat template. 
" + "This is a parameter used by chat template in tokenizer config of the " + "model." + ), + ) + + add_special_tokens: bool = Field( + default=False, + description=( + "If true, special tokens (e.g. BOS) will be added to the prompt " + "on top of what is added by the chat template. " + "For most models, the chat template takes care of adding the " + "special tokens so this should be set to false (as is the " + "default)." + ), + ) + chat_template: str | None = Field( + default=None, + description=( + "A Jinja template to use for this conversion. " + "As of transformers v4.44, default chat template is no longer " + "allowed, so you must provide a chat template if the tokenizer " + "does not define one." + ), + ) + chat_template_kwargs: dict[str, Any] | None = Field( + default=None, + description=( + "Additional keyword args to pass to the template renderer. " + "Will be accessible by the chat template." + ), + ) + mm_processor_kwargs: dict[str, Any] | None = Field( + default=None, + description=("Additional kwargs to pass to the HF processor."), + ) + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling." + ), + ) + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." + ), + ) + normalize: bool | None = Field( + default=None, + description="Whether to normalize the embeddings outputs. Default is True.", + ) + embed_dtype: EmbedDType = Field( + default="float32", + description=( + "What dtype to use for encoding. Default to using float32 for base64 " + "encoding to match the OpenAI python client behavior. " + "This parameter will affect base64 and binary_response." 
+ ), + ) + endianness: Endianness = Field( + default="native", + description=( + "What endianness to use for encoding. Default to using native for " + "base64 encoding to match the OpenAI python client behavior." + "This parameter will affect base64 and binary_response." + ), + ) + # --8<-- [end:chat-embedding-extra-params] + + @model_validator(mode="before") + @classmethod + def check_generation_prompt(cls, data): + if data.get("continue_final_message") and data.get("add_generation_prompt"): + raise ValueError( + "Cannot set both `continue_final_message` and " + "`add_generation_prompt` to True." + ) + return data + + def to_pooling_params(self): + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + dimensions=self.dimensions, + normalize=self.normalize, + ) + + +EmbeddingRequest: TypeAlias = EmbeddingCompletionRequest | EmbeddingChatRequest + + +class PoolingCompletionRequest(EmbeddingCompletionRequest): + task: PoolingTask | None = None + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + use_activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. 
" + "If it is a classify or token_classify task, the default is True; " + "for other tasks, this value should be None.", + ) + + def to_pooling_params(self): + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + dimensions=self.dimensions, + normalize=self.normalize, + use_activation=get_use_activation(self), + ) + + +class PoolingChatRequest(EmbeddingChatRequest): + task: PoolingTask | None = None + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + use_activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. " + "If it is a classify or token_classify task, the default is True; " + "for other tasks, this value should be None.", + ) + + def to_pooling_params(self): + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + dimensions=self.dimensions, + normalize=self.normalize, + use_activation=get_use_activation(self), + ) + + +T = TypeVar("T") + + +class IOProcessorRequest(OpenAIBaseModel, Generic[T]): + model: str | None = None + + priority: int = Field(default=0) + """ + The priority of the request (lower means earlier handling; + default: 0). Any priority other than 0 will raise an error + if the served model does not use priority scheduling. + """ + data: T + + task: PoolingTask = "plugin" + encoding_format: EncodingFormat = "float" + embed_dtype: EmbedDType = Field( + default="float32", + description=( + "What dtype to use for encoding. Default to using float32 for base64 " + "encoding to match the OpenAI python client behavior. " + "This parameter will affect base64 and binary_response." + ), + ) + endianness: Endianness = Field( + default="native", + description=( + "What endianness to use for encoding. 
Default to using native for " + "base64 encoding to match the OpenAI python client behavior." + "This parameter will affect base64 and binary_response." + ), + ) + + def to_pooling_params(self): + return PoolingParams() + + +class IOProcessorResponse(OpenAIBaseModel, Generic[T]): + request_id: str | None = None + """ + The request_id associated with this response + """ + created_at: int = Field(default_factory=lambda: int(time.time())) + + data: T + """ + When using plugins IOProcessor plugins, the actual output is generated + by the plugin itself. Hence, we use a generic type for the response data + """ + + +PoolingRequest: TypeAlias = ( + PoolingCompletionRequest | PoolingChatRequest | IOProcessorRequest +) + + +class ScoreRequest(OpenAIBaseModel): + model: str | None = None + text_1: list[str] | str | ScoreMultiModalParam + text_2: list[str] | str | ScoreMultiModalParam + truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None + + # --8<-- [start:score-extra-params] + + mm_processor_kwargs: dict[str, Any] | None = Field( + default=None, + description=("Additional kwargs to pass to the HF processor."), + ) + + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling." + ), + ) + + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + + use_activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. 
" + "Default is True.", + ) + # --8<-- [end:score-extra-params] + + def to_pooling_params(self): + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + use_activation=get_use_activation(self), + ) + + +class RerankRequest(OpenAIBaseModel): + model: str | None = None + query: str | ScoreMultiModalParam + documents: list[str] | ScoreMultiModalParam + top_n: int = Field(default_factory=lambda: 0) + truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None + + # --8<-- [start:rerank-extra-params] + + mm_processor_kwargs: dict[str, Any] | None = Field( + default=None, + description=("Additional kwargs to pass to the HF processor."), + ) + + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling." + ), + ) + + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + + use_activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. 
" + "Default is True.", + ) + # --8<-- [end:rerank-extra-params] + + def to_pooling_params(self): + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + use_activation=get_use_activation(self), + ) + + +class RerankDocument(BaseModel): + text: str | None = None + multi_modal: ScoreContentPartParam | None = None + + +class RerankResult(BaseModel): + index: int + document: RerankDocument + relevance_score: float + + +class RerankUsage(BaseModel): + total_tokens: int + + +class RerankResponse(OpenAIBaseModel): + id: str + model: str + usage: RerankUsage + results: list[RerankResult] + + +class CompletionLogProbs(OpenAIBaseModel): + text_offset: list[int] = Field(default_factory=list) + token_logprobs: list[float | None] = Field(default_factory=list) + tokens: list[str] = Field(default_factory=list) + top_logprobs: list[dict[str, float] | None] = Field(default_factory=list) + + +class CompletionResponseChoice(OpenAIBaseModel): + index: int + text: str + logprobs: CompletionLogProbs | None = None + finish_reason: str | None = None + stop_reason: int | str | None = Field( + default=None, + description=( + "The stop string or token id that caused the completion " + "to stop, None if the completion finished for some other reason " + "including encountering the EOS token" + ), + ) + token_ids: list[int] | None = None # For response + prompt_logprobs: list[dict[int, Logprob] | None] | None = None + prompt_token_ids: list[int] | None = None # For prompt + + +class CompletionResponse(OpenAIBaseModel): + id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}") + object: Literal["text_completion"] = "text_completion" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + choices: list[CompletionResponseChoice] + service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None + system_fingerprint: str | None = None + usage: UsageInfo + + # vLLM-specific fields that are not in OpenAI spec + 
kv_transfer_params: dict[str, Any] | None = Field( + default=None, description="KVTransfer parameters." + ) + + +class CompletionResponseStreamChoice(OpenAIBaseModel): + index: int + text: str + logprobs: CompletionLogProbs | None = None + finish_reason: str | None = None + stop_reason: int | str | None = Field( + default=None, + description=( + "The stop string or token id that caused the completion " + "to stop, None if the completion finished for some other reason " + "including encountering the EOS token" + ), + ) + # not part of the OpenAI spec but for tracing the tokens + # prompt tokens is put into choice to align with CompletionResponseChoice + prompt_token_ids: list[int] | None = None + token_ids: list[int] | None = None + + +class CompletionStreamResponse(OpenAIBaseModel): + id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}") + object: str = "text_completion" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + choices: list[CompletionResponseStreamChoice] + usage: UsageInfo | None = Field(default=None) + + +class EmbeddingResponseData(OpenAIBaseModel): + index: int + object: str = "embedding" + embedding: list[float] | str + + +class EmbeddingResponse(OpenAIBaseModel): + id: str = Field(default_factory=lambda: f"embd-{random_uuid()}") + object: str = "list" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + data: list[EmbeddingResponseData] + usage: UsageInfo + + +class EmbeddingBytesResponse(OpenAIBaseModel): + body: list[bytes] + metadata: str + media_type: str = "application/octet-stream" + + +class PoolingResponseData(OpenAIBaseModel): + index: int + object: str = "pooling" + data: list[list[float]] | list[float] | str + + +class PoolingResponse(OpenAIBaseModel): + id: str = Field(default_factory=lambda: f"pool-{random_uuid()}") + object: str = "list" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + data: list[PoolingResponseData] + usage: UsageInfo + 
+ +class PoolingBytesResponse(OpenAIBaseModel): + body: list[bytes] + metadata: str + media_type: str = "application/octet-stream" + + +class ScoreResponseData(OpenAIBaseModel): + index: int + object: str = "score" + score: float + + +class ScoreResponse(OpenAIBaseModel): + id: str = Field(default_factory=lambda: f"embd-{random_uuid()}") + object: str = "list" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + data: list[ScoreResponseData] + usage: UsageInfo + + +class ClassificationCompletionRequest(OpenAIBaseModel): + model: str | None = None + input: list[str] | str + truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None + user: str | None = None + + # --8<-- [start:classification-extra-params] + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling." + ), + ) + add_special_tokens: bool = Field( + default=True, + description=( + "If true (the default), special tokens (e.g. BOS) will be added to " + "the prompt." + ), + ) + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." + ), + ) + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + + use_activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. 
" + "Default is True.", + ) + # --8<-- [end:classification-extra-params] + + def to_pooling_params(self): + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + use_activation=get_use_activation(self), + ) + + +class ClassificationChatRequest(OpenAIBaseModel): + model: str | None = None + messages: list[ChatCompletionMessageParam] + truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None + user: str | None = None + + # --8<-- [start:chat-classification-extra-params] + add_generation_prompt: bool = Field( + default=False, + description=( + "If true, the generation prompt will be added to the chat template. " + "This is a parameter used by chat template in tokenizer config of the " + "model." + ), + ) + + add_special_tokens: bool = Field( + default=False, + description=( + "If true, special tokens (e.g. BOS) will be added to the prompt " + "on top of what is added by the chat template. " + "For most models, the chat template takes care of adding the " + "special tokens so this should be set to false (as is the " + "default)." + ), + ) + + chat_template: str | None = Field( + default=None, + description=( + "A Jinja template to use for this conversion. " + "As of transformers v4.44, default chat template is no longer " + "allowed, so you must provide a chat template if the tokenizer " + "does not define one." + ), + ) + + chat_template_kwargs: dict[str, Any] | None = Field( + default=None, + description=( + "Additional keyword args to pass to the template renderer. " + "Will be accessible by the chat template." + ), + ) + + mm_processor_kwargs: dict[str, Any] | None = Field( + default=None, + description=("Additional kwargs to pass to the HF processor."), + ) + + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling." 
+ ), + ) + + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." + ), + ) + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + + use_activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. " + "Default is True.", + ) + # --8<-- [end:chat-classification-extra-params] + + def to_pooling_params(self): + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + use_activation=get_use_activation(self), + ) + + +ClassificationRequest: TypeAlias = ( + ClassificationCompletionRequest | ClassificationChatRequest +) + + +class ClassificationData(OpenAIBaseModel): + index: int + label: str | None + probs: list[float] + num_classes: int + + +class ClassificationResponse(OpenAIBaseModel): + id: str = Field(default_factory=lambda: f"classify-{random_uuid()}") + object: str = "list" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + data: list[ClassificationData] + usage: UsageInfo + + +class FunctionCall(OpenAIBaseModel): + name: str + arguments: str + + +class ToolCall(OpenAIBaseModel): + id: str = Field(default_factory=make_tool_call_id) + type: Literal["function"] = "function" + function: FunctionCall + + +class DeltaFunctionCall(BaseModel): + name: str | None = None + arguments: str | None = None + + +# a tool call delta where everything is optional +class DeltaToolCall(OpenAIBaseModel): + id: str | None = None + type: Literal["function"] | None = None + index: int + function: DeltaFunctionCall | None = None + + +class 
ExtractedToolCallInformation(BaseModel): + # indicate if tools were called + tools_called: bool + + # extracted tool calls + tool_calls: list[ToolCall] + + # content - per OpenAI spec, content AND tool calls can be returned rarely + # But some models will do this intentionally + content: str | None = None + + +class ChatMessage(OpenAIBaseModel): + role: str + content: str | None = None + refusal: str | None = None + annotations: OpenAIAnnotation | None = None + audio: OpenAIChatCompletionAudio | None = None + function_call: FunctionCall | None = None + tool_calls: list[ToolCall] = Field(default_factory=list) + + # vLLM-specific fields that are not in OpenAI spec + reasoning: str | None = None + reasoning_content: str | None = None + """Deprecated: use `reasoning` instead.""" + + @model_validator(mode="after") + def handle_deprecated_reasoning_content(self): + """Copy reasoning to reasoning_content for backward compatibility.""" + self.reasoning_content = self.reasoning + return self + + +class ChatCompletionLogProb(OpenAIBaseModel): + token: str + logprob: float = -9999.0 + bytes: list[int] | None = None + + +class ChatCompletionLogProbsContent(ChatCompletionLogProb): + # Workaround: redefine fields name cache so that it's not + # shared with the super class. 
+ field_names: ClassVar[set[str] | None] = None + top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list) + + +class ChatCompletionLogProbs(OpenAIBaseModel): + content: list[ChatCompletionLogProbsContent] | None = None + + +class ChatCompletionResponseChoice(OpenAIBaseModel): + index: int + message: ChatMessage + logprobs: ChatCompletionLogProbs | None = None + # per OpenAI spec this is the default + finish_reason: str | None = "stop" + # not part of the OpenAI spec but included in vLLM for legacy reasons + stop_reason: int | str | None = None + # not part of the OpenAI spec but is useful for tracing the tokens + # in agent scenarios + token_ids: list[int] | None = None + + +class ChatCompletionResponse(OpenAIBaseModel): + id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}") + object: Literal["chat.completion"] = "chat.completion" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + choices: list[ChatCompletionResponseChoice] + service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None + system_fingerprint: str | None = None + usage: UsageInfo + + # vLLM-specific fields that are not in OpenAI spec + prompt_logprobs: list[dict[int, Logprob] | None] | None = None + prompt_token_ids: list[int] | None = None + kv_transfer_params: dict[str, Any] | None = Field( + default=None, description="KVTransfer parameters." 
+ ) + + +class DeltaMessage(OpenAIBaseModel): + role: str | None = None + content: str | None = None + reasoning: str | None = None + reasoning_content: str | None = None + """Deprecated: use `reasoning` instead.""" + tool_calls: list[DeltaToolCall] = Field(default_factory=list) + + @model_validator(mode="after") + def handle_deprecated_reasoning_content(self): + """Copy reasoning to reasoning_content for backward compatibility.""" + self.reasoning_content = self.reasoning + return self + + +class ChatCompletionResponseStreamChoice(OpenAIBaseModel): + index: int + delta: DeltaMessage + logprobs: ChatCompletionLogProbs | None = None + finish_reason: str | None = None + stop_reason: int | str | None = None + # not part of the OpenAI spec but for tracing the tokens + token_ids: list[int] | None = None + + +class ChatCompletionStreamResponse(OpenAIBaseModel): + id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}") + object: Literal["chat.completion.chunk"] = "chat.completion.chunk" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + choices: list[ChatCompletionResponseStreamChoice] + usage: UsageInfo | None = Field(default=None) + # not part of the OpenAI spec but for tracing the tokens + prompt_token_ids: list[int] | None = None + + +class TranscriptionResponseStreamChoice(OpenAIBaseModel): + delta: DeltaMessage + finish_reason: str | None = None + stop_reason: int | str | None = None + + +class TranscriptionStreamResponse(OpenAIBaseModel): + id: str = Field(default_factory=lambda: f"trsc-{random_uuid()}") + object: Literal["transcription.chunk"] = "transcription.chunk" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + choices: list[TranscriptionResponseStreamChoice] + usage: UsageInfo | None = Field(default=None) + + +class InputTokensDetails(OpenAIBaseModel): + cached_tokens: int + input_tokens_per_turn: list[int] = Field(default_factory=list) + cached_tokens_per_turn: list[int] = 
class OutputTokensDetails(OpenAIBaseModel):
    # Breakdown of generated tokens; every field defaults to an "empty" value.
    reasoning_tokens: int = 0
    tool_output_tokens: int = 0
    output_tokens_per_turn: list[int] = Field(default_factory=list)
    tool_output_tokens_per_turn: list[int] = Field(default_factory=list)


class ResponseUsage(OpenAIBaseModel):
    # Token accounting attached to a ResponsesResponse.
    input_tokens: int
    input_tokens_details: InputTokensDetails
    output_tokens: int
    output_tokens_details: OutputTokensDetails
    total_tokens: int


def serialize_message(msg):
    """
    Serializes a single message.

    Dicts are returned unchanged, objects exposing ``to_dict()`` are converted
    through it, and anything else goes through ``model_dump_json()``.
    """
    if isinstance(msg, dict):
        return msg
    elif hasattr(msg, "to_dict"):
        return msg.to_dict()
    else:
        # fallback to pydantic dump
        # NOTE(review): model_dump_json() yields a JSON *string*, unlike the
        # two branches above -- confirm that consumers expect this shape.
        return msg.model_dump_json()


def serialize_messages(msgs):
    """
    Serializes multiple messages.

    Returns ``None`` when ``msgs`` is ``None`` or empty, otherwise a list with
    one serialized entry per message.
    """
    return [serialize_message(msg) for msg in msgs] if msgs else None


class ResponsesResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
    created_at: int = Field(default_factory=lambda: int(time.time()))
    # error: Optional[ResponseError] = None
    incomplete_details: IncompleteDetails | None = None
    instructions: str | None = None
    metadata: Metadata | None = None
    model: str
    object: Literal["response"] = "response"
    output: list[ResponseOutputItem]
    parallel_tool_calls: bool
    temperature: float
    tool_choice: ToolChoice
    tools: list[Tool]
    top_p: float
    background: bool
    max_output_tokens: int
    max_tool_calls: int | None = None
    previous_response_id: str | None = None
    prompt: ResponsePrompt | None = None
    reasoning: Reasoning | None = None
    service_tier: Literal["auto", "default", "flex", "scale", "priority"]
    status: ResponseStatus
    text: ResponseTextConfig | None = None
    top_logprobs: int | None = None
    truncation: Literal["auto", "disabled"]
    usage: ResponseUsage | None = None
    user: str | None = None

    # --8<-- [start:responses-extra-params]
    # These are populated when enable_response_messages is set to True
    # NOTE: custom serialization is needed
    # see serialize_input_messages and serialize_output_messages
    input_messages: list[ChatCompletionMessageParam] | None = None
    output_messages: list[ChatCompletionMessageParam] | None = None
    # --8<-- [end:responses-extra-params]

    # NOTE: openAI harmony doesn't serialize TextContent properly,
    # TODO: this fixes for TextContent, but need to verify for tools etc
    # https://github.com/openai/harmony/issues/78
    @field_serializer("output_messages", when_used="json")
    def serialize_output_messages(self, msgs, _info):
        """Serialize ``output_messages`` via ``serialize_messages`` (JSON only)."""
        return serialize_messages(msgs)

    # NOTE: openAI harmony doesn't serialize TextContent properly, this fixes it
    # https://github.com/openai/harmony/issues/78
    @field_serializer("input_messages", when_used="json")
    def serialize_input_messages(self, msgs, _info):
        """Serialize ``input_messages`` via ``serialize_messages`` (JSON only)."""
        return serialize_messages(msgs)

    @classmethod
    def from_request(
        cls,
        request: ResponsesRequest,
        sampling_params: SamplingParams,
        model_name: str,
        created_time: int,
        output: list[ResponseOutputItem],
        status: ResponseStatus,
        usage: ResponseUsage | None = None,
        input_messages: list[ChatCompletionMessageParam] | None = None,
        output_messages: list[ChatCompletionMessageParam] | None = None,
    ) -> "ResponsesResponse":
        """Build a response that echoes the request's settings.

        Request-level fields (instructions, tools, truncation, ...) are copied
        from ``request``; ``temperature``, ``top_p``, ``max_output_tokens`` and
        ``top_logprobs`` are taken from ``sampling_params``. When ``status`` is
        ``"incomplete"``, ``incomplete_details`` is set with reason
        ``"max_output_tokens"``; otherwise it stays ``None``.
        """
        incomplete_details: IncompleteDetails | None = None
        if status == "incomplete":
            incomplete_details = IncompleteDetails(reason="max_output_tokens")
        # TODO: implement the other reason for incomplete_details,
        # which is content_filter
        # incomplete_details = IncompleteDetails(reason='content_filter')
        return cls(
            id=request.request_id,
            created_at=created_time,
            incomplete_details=incomplete_details,
            instructions=request.instructions,
            metadata=request.metadata,
            model=model_name,
            output=output,
            input_messages=input_messages,
            output_messages=output_messages,
            parallel_tool_calls=request.parallel_tool_calls,
            temperature=sampling_params.temperature,
            tool_choice=request.tool_choice,
            tools=request.tools,
            top_p=sampling_params.top_p,
            background=request.background,
            max_output_tokens=sampling_params.max_tokens,
            max_tool_calls=request.max_tool_calls,
            previous_response_id=request.previous_response_id,
            prompt=request.prompt,
            reasoning=request.reasoning,
            service_tier=request.service_tier,
            status=status,
            text=request.text,
            top_logprobs=sampling_params.logprobs,
            truncation=request.truncation,
            user=request.user,
            usage=usage,
        )


# TODO: this code can be removed once
# https://github.com/openai/openai-python/issues/2634 has been resolved
class ResponseReasoningPartDoneEvent(OpenAIBaseModel):
    content_index: int
    """The index of the content part that is done."""

    item_id: str
    """The ID of the output item that the content part was added to."""

    output_index: int
    """The index of the output item that the content part was added to."""

    part: ResponseReasoningTextContent
    """The content part that is done."""

    sequence_number: int
    """The sequence number of this event."""

    type: Literal["response.reasoning_part.done"]
    """The type of the event. Always `response.reasoning_part.done`."""


# TODO: this code can be removed once
# https://github.com/openai/openai-python/issues/2634 has been resolved
class ResponseReasoningPartAddedEvent(OpenAIBaseModel):
    content_index: int
    """The index of the content part that was added."""

    item_id: str
    """The ID of the output item that the content part was added to."""

    output_index: int
    """The index of the output item that the content part was added to."""

    part: ResponseReasoningTextContent
    """The content part that was added."""

    sequence_number: int
    """The sequence number of this event."""

    type: Literal["response.reasoning_part.added"]
    """The type of the event. Always `response.reasoning_part.added`."""
# vLLM Streaming Events
# Note: we override the response type with the vLLM ResponsesResponse type
class ResponseCompletedEvent(OpenAIResponseCompletedEvent):
    response: ResponsesResponse  # type: ignore[override]


class ResponseCreatedEvent(OpenAIResponseCreatedEvent):
    response: ResponsesResponse  # type: ignore[override]


class ResponseInProgressEvent(OpenAIResponseInProgressEvent):
    response: ResponsesResponse  # type: ignore[override]


# Union of every event type that may be emitted on a streaming Responses call.
StreamingResponsesResponse: TypeAlias = (
    ResponseCreatedEvent
    | ResponseInProgressEvent
    | ResponseCompletedEvent
    | ResponseOutputItemAddedEvent
    | ResponseOutputItemDoneEvent
    | ResponseContentPartAddedEvent
    | ResponseContentPartDoneEvent
    | ResponseReasoningTextDeltaEvent
    | ResponseReasoningTextDoneEvent
    | ResponseReasoningPartAddedEvent
    | ResponseReasoningPartDoneEvent
    | ResponseCodeInterpreterCallInProgressEvent
    | ResponseCodeInterpreterCallCodeDeltaEvent
    | ResponseWebSearchCallInProgressEvent
    | ResponseWebSearchCallSearchingEvent
    | ResponseWebSearchCallCompletedEvent
    | ResponseCodeInterpreterCallCodeDoneEvent
    | ResponseCodeInterpreterCallInterpretingEvent
    | ResponseCodeInterpreterCallCompletedEvent
)

BatchRequestInputBody: TypeAlias = (
    ChatCompletionRequest | EmbeddingRequest | ScoreRequest | RerankRequest
)


class BatchRequestInput(OpenAIBaseModel):
    """
    The per-line object of the batch input file.

    The `body` is validated against the request model selected by `url`:
    `/v1/chat/completions`, `/v1/embeddings`, or any URL ending in `/score`
    or `/rerank`. Any other URL falls back to generic validation against
    `BatchRequestInputBody`.
    """

    # A developer-provided per-request id that will be used to match outputs to
    # inputs. Must be unique for each request in a batch.
    custom_id: str

    # The HTTP method to be used for the request. Currently only POST is
    # supported.
    method: str

    # The OpenAI API relative URL to be used for the request. It also selects
    # the model used to validate `body` (see `check_type_for_url`).
    url: str

    # The parameters of the request.
    body: BatchRequestInputBody

    @field_validator("body", mode="plain")
    @classmethod
    def check_type_for_url(cls, value: Any, info: ValidationInfo):
        """Validate `body` with the request model that matches `url`.

        `url` is declared before `body`, so it is normally available in
        `info.data`. When `url` itself is missing or failed validation it is
        absent from `info.data`; in that case fall back to the generic union
        instead of raising a raw `KeyError`, so the caller gets a regular
        validation error for the bad `url` field.
        """
        # Use url to disambiguate models
        url = info.data.get("url")
        if isinstance(url, str):
            if url == "/v1/chat/completions":
                return ChatCompletionRequest.model_validate(value)
            if url == "/v1/embeddings":
                return TypeAdapter(EmbeddingRequest).validate_python(value)
            if url.endswith("/score"):
                return ScoreRequest.model_validate(value)
            if url.endswith("/rerank"):
                return RerankRequest.model_validate(value)
        return TypeAdapter(BatchRequestInputBody).validate_python(value)


class BatchResponseData(OpenAIBaseModel):
    # HTTP status code of the response.
    status_code: int = 200

    # An unique identifier for the API request.
    request_id: str

    # The body of the response.
    body: (
        ChatCompletionResponse
        | EmbeddingResponse
        | ScoreResponse
        | RerankResponse
        | None
    ) = None


class BatchRequestOutput(OpenAIBaseModel):
    """
    The per-line object of the batch output and error files
    """

    id: str

    # A developer-provided per-request id that will be used to match outputs to
    # inputs.
    custom_id: str

    response: BatchResponseData | None

    # For requests that failed with a non-HTTP error, this will contain more
    # information on the cause of the failure.
    error: Any | None


class TokenizeCompletionRequest(OpenAIBaseModel):
    model: str | None = None
    prompt: str

    add_special_tokens: bool = Field(
        default=True,
        description=(
            "If true (the default), special tokens (e.g. BOS) will be added to "
            "the prompt."
        ),
    )
    return_token_strs: bool | None = Field(
        default=False,
        description=(
            "If true, also return the token strings corresponding to the token ids."
        ),
    )
+ ), + ) + + +class TokenizeChatRequest(OpenAIBaseModel): + model: str | None = None + messages: list[ChatCompletionMessageParam] + + add_generation_prompt: bool = Field( + default=True, + description=( + "If true, the generation prompt will be added to the chat template. " + "This is a parameter used by chat template in tokenizer config of the " + "model." + ), + ) + return_token_strs: bool | None = Field( + default=False, + description=( + "If true, also return the token strings corresponding to the token ids." + ), + ) + continue_final_message: bool = Field( + default=False, + description=( + "If this is set, the chat will be formatted so that the final " + "message in the chat is open-ended, without any EOS tokens. The " + "model will continue this message rather than starting a new one. " + 'This allows you to "prefill" part of the model\'s response for it. ' + "Cannot be used at the same time as `add_generation_prompt`." + ), + ) + add_special_tokens: bool = Field( + default=False, + description=( + "If true, special tokens (e.g. BOS) will be added to the prompt " + "on top of what is added by the chat template. " + "For most models, the chat template takes care of adding the " + "special tokens so this should be set to false (as is the " + "default)." + ), + ) + chat_template: str | None = Field( + default=None, + description=( + "A Jinja template to use for this conversion. " + "As of transformers v4.44, default chat template is no longer " + "allowed, so you must provide a chat template if the tokenizer " + "does not define one." + ), + ) + chat_template_kwargs: dict[str, Any] | None = Field( + default=None, + description=( + "Additional keyword args to pass to the template renderer. " + "Will be accessible by the chat template." 
+ ), + ) + mm_processor_kwargs: dict[str, Any] | None = Field( + default=None, + description=("Additional kwargs to pass to the HF processor."), + ) + tools: list[ChatCompletionToolsParam] | None = Field( + default=None, + description=("A list of tools the model may call."), + ) + + @model_validator(mode="before") + @classmethod + def check_generation_prompt(cls, data): + if data.get("continue_final_message") and data.get("add_generation_prompt"): + raise ValueError( + "Cannot set both `continue_final_message` and " + "`add_generation_prompt` to True." + ) + return data + + +TokenizeRequest: TypeAlias = TokenizeCompletionRequest | TokenizeChatRequest + + +class TokenizeResponse(OpenAIBaseModel): + count: int + max_model_len: int + tokens: list[int] + token_strs: list[str] | None = None + + +class DetokenizeRequest(OpenAIBaseModel): + model: str | None = None + tokens: list[int] + + +class DetokenizeResponse(OpenAIBaseModel): + prompt: str + + +class TokenizerInfoResponse(OpenAIBaseModel): + """ + Response containing tokenizer configuration + equivalent to tokenizer_config.json + """ + + model_config = ConfigDict(extra="allow") + tokenizer_class: str + + +class LoadLoRAAdapterRequest(BaseModel): + lora_name: str + lora_path: str + + +class UnloadLoRAAdapterRequest(BaseModel): + lora_name: str + lora_int_id: int | None = Field(default=None) + + +## Protocols for Audio +AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json", "vtt"] + + +class TranscriptionRequest(OpenAIBaseModel): + # Ordered by official OpenAI API documentation + # https://platform.openai.com/docs/api-reference/audio/createTranscription + + file: UploadFile + """ + The audio file object (not file name) to transcribe, in one of these + formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. + """ + + model: str | None = None + """ID of the model to use. + """ + + language: str | None = None + """The language of the input audio. 
+ + Supplying the input language in + [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format + will improve accuracy and latency. + """ + + prompt: str = Field(default="") + """An optional text to guide the model's style or continue a previous audio + segment. + + The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) + should match the audio language. + """ + + response_format: AudioResponseFormat = Field(default="json") + """ + The format of the output, in one of these options: `json`, `text`, `srt`, + `verbose_json`, or `vtt`. + """ + + ## TODO (varun) : Support if set to 0, certain thresholds are met !! + + timestamp_granularities: list[Literal["word", "segment"]] = Field( + alias="timestamp_granularities[]", default=[] + ) + """The timestamp granularities to populate for this transcription. + + `response_format` must be set `verbose_json` to use timestamp granularities. + Either or both of these options are supported: `word`, or `segment`. Note: + There is no additional latency for segment timestamps, but generating word + timestamps incurs additional latency. + """ + + stream: bool | None = False + """When set, it will enable output to be streamed in a similar fashion + as the Chat Completion endpoint. + """ + # --8<-- [start:transcription-extra-params] + # Flattened stream option to simplify form data. + stream_include_usage: bool | None = False + stream_continuous_usage_stats: bool | None = False + + vllm_xargs: dict[str, str | int | float] | None = Field( + default=None, + description=( + "Additional request parameters with string or " + "numeric values, used by custom extensions." + ), + ) + # --8<-- [end:transcription-extra-params] + + to_language: str | None = None + """The language of the output audio we transcribe to. + + Please note that this is not currently used by supported models at this + time, but it is a placeholder for future use, matching translation api. 
+ """ + + # --8<-- [start:transcription-sampling-params] + temperature: float = Field(default=0.0) + """The sampling temperature, between 0 and 1. + + Higher values like 0.8 will make the output more random, while lower values + like 0.2 will make it more focused / deterministic. If set to 0, the model + will use [log probability](https://en.wikipedia.org/wiki/Log_probability) + to automatically increase the temperature until certain thresholds are hit. + """ + + top_p: float | None = None + """Enables nucleus (top-p) sampling, where tokens are selected from the + smallest possible set whose cumulative probability exceeds `p`. + """ + + top_k: int | None = None + """Limits sampling to the `k` most probable tokens at each step.""" + + min_p: float | None = None + """Filters out tokens with a probability lower than `min_p`, ensuring a + minimum likelihood threshold during sampling. + """ + + seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) + """The seed to use for sampling.""" + + frequency_penalty: float | None = 0.0 + """The frequency penalty to use for sampling.""" + + repetition_penalty: float | None = None + """The repetition penalty to use for sampling.""" + + presence_penalty: float | None = 0.0 + """The presence penalty to use for sampling.""" + # --8<-- [end:transcription-sampling-params] + + # Default sampling parameters for transcription requests. 
+ _DEFAULT_SAMPLING_PARAMS: dict = { + "repetition_penalty": 1.0, + "temperature": 1.0, + "top_p": 1.0, + "top_k": 0, + "min_p": 0.0, + } + + def to_sampling_params( + self, default_max_tokens: int, default_sampling_params: dict | None = None + ) -> SamplingParams: + max_tokens = default_max_tokens + + if default_sampling_params is None: + default_sampling_params = {} + + # Default parameters + if (temperature := self.temperature) is None: + temperature = default_sampling_params.get( + "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"] + ) + if (top_p := self.top_p) is None: + top_p = default_sampling_params.get( + "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"] + ) + if (top_k := self.top_k) is None: + top_k = default_sampling_params.get( + "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"] + ) + if (min_p := self.min_p) is None: + min_p = default_sampling_params.get( + "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"] + ) + + if (repetition_penalty := self.repetition_penalty) is None: + repetition_penalty = default_sampling_params.get( + "repetition_penalty", + self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"], + ) + + return SamplingParams.from_optional( + temperature=temperature, + max_tokens=max_tokens, + seed=self.seed, + top_p=top_p, + top_k=top_k, + min_p=min_p, + frequency_penalty=self.frequency_penalty, + repetition_penalty=repetition_penalty, + presence_penalty=self.presence_penalty, + output_kind=RequestOutputKind.DELTA + if self.stream + else RequestOutputKind.FINAL_ONLY, + extra_args=self.vllm_xargs, + ) + + @model_validator(mode="before") + @classmethod + def validate_transcription_request(cls, data): + if isinstance(data.get("file"), str): + raise HTTPException( + status_code=HTTPStatus.UNPROCESSABLE_ENTITY, + detail="Expected 'file' to be a file-like object, not 'str'.", + ) + + stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"] + stream = data.get("stream", False) + if any(bool(data.get(so, False)) for so in stream_opts) 
and not stream: + raise ValueError("Stream options can only be defined when `stream=True`.") + + return data + + +# Transcription response objects +class TranscriptionUsageAudio(OpenAIBaseModel): + type: Literal["duration"] = "duration" + seconds: int + + +class TranscriptionResponse(OpenAIBaseModel): + text: str + """The transcribed text.""" + usage: TranscriptionUsageAudio + + +class TranscriptionWord(OpenAIBaseModel): + end: float + """End time of the word in seconds.""" + + start: float + """Start time of the word in seconds.""" + + word: str + """The text content of the word.""" + + +class TranscriptionSegment(OpenAIBaseModel): + id: int + """Unique identifier of the segment.""" + + avg_logprob: float + """Average logprob of the segment. + + If the value is lower than -1, consider the logprobs failed. + """ + + compression_ratio: float + """Compression ratio of the segment. + + If the value is greater than 2.4, consider the compression failed. + """ + + end: float + """End time of the segment in seconds.""" + + no_speech_prob: float + """Probability of no speech in the segment. + + If the value is higher than 1.0 and the `avg_logprob` is below -1, consider + this segment silent. 
+ """ + + seek: int + """Seek offset of the segment.""" + + start: float + """Start time of the segment in seconds.""" + + temperature: float + """Temperature parameter used for generating the segment.""" + + text: str + """Text content of the segment.""" + + tokens: list[int] + """Array of token IDs for the text content.""" + + +class TranscriptionResponseVerbose(OpenAIBaseModel): + duration: str + """The duration of the input audio.""" + + language: str + """The language of the input audio.""" + + text: str + """The transcribed text.""" + + segments: list[TranscriptionSegment] | None = None + """Segments of the transcribed text and their corresponding details.""" + + words: list[TranscriptionWord] | None = None + """Extracted words and their corresponding timestamps.""" + + +class TranslationResponseStreamChoice(OpenAIBaseModel): + delta: DeltaMessage + finish_reason: str | None = None + stop_reason: int | str | None = None + + +class TranslationStreamResponse(OpenAIBaseModel): + id: str = Field(default_factory=lambda: f"trsl-{random_uuid()}") + object: Literal["translation.chunk"] = "translation.chunk" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + choices: list[TranslationResponseStreamChoice] + usage: UsageInfo | None = Field(default=None) + + +class TranslationRequest(OpenAIBaseModel): + # Ordered by official OpenAI API documentation + # https://platform.openai.com/docs/api-reference/audio/createTranslation + + file: UploadFile + """ + The audio file object (not file name) to translate, in one of these + formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. + """ + + model: str | None = None + """ID of the model to use. + """ + + prompt: str = Field(default="") + """An optional text to guide the model's style or continue a previous audio + segment. + + The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) + should match the audio language. 
+ """ + + response_format: AudioResponseFormat = Field(default="json") + """ + The format of the output, in one of these options: `json`, `text`, `srt`, + `verbose_json`, or `vtt`. + """ + + # TODO support additional sampling parameters + # --8<-- [start:translation-sampling-params] + seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) + """The seed to use for sampling.""" + + temperature: float = Field(default=0.0) + """The sampling temperature, between 0 and 1. + + Higher values like 0.8 will make the output more random, while lower values + like 0.2 will make it more focused / deterministic. If set to 0, the model + will use [log probability](https://en.wikipedia.org/wiki/Log_probability) + to automatically increase the temperature until certain thresholds are hit. + """ + # --8<-- [end:translation-sampling-params] + + # --8<-- [start:translation-extra-params] + language: str | None = None + """The language of the input audio we translate from. + + Supplying the input language in + [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format + will improve accuracy. + """ + + to_language: str | None = None + """The language of the input audio we translate to. + + Please note that this is not supported by all models, refer to the specific + model documentation for more details. + For instance, Whisper only supports `to_language=en`. + """ + + stream: bool | None = False + """Custom field not present in the original OpenAI definition. When set, + it will enable output to be streamed in a similar fashion as the Chat + Completion endpoint. + """ + # Flattened stream option to simplify form data. + stream_include_usage: bool | None = False + stream_continuous_usage_stats: bool | None = False + # --8<-- [end:translation-extra-params] + + # Default sampling parameters for translation requests. 
+ _DEFAULT_SAMPLING_PARAMS: dict = { + "temperature": 0, + } + + def to_sampling_params( + self, default_max_tokens: int, default_sampling_params: dict | None = None + ) -> SamplingParams: + max_tokens = default_max_tokens + + if default_sampling_params is None: + default_sampling_params = {} + # Default parameters + if (temperature := self.temperature) is None: + temperature = default_sampling_params.get( + "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"] + ) + + return SamplingParams.from_optional( + temperature=temperature, + max_tokens=max_tokens, + seed=self.seed, + output_kind=RequestOutputKind.DELTA + if self.stream + else RequestOutputKind.FINAL_ONLY, + ) + + @model_validator(mode="before") + @classmethod + def validate_stream_options(cls, data): + stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"] + stream = data.get("stream", False) + if any(bool(data.get(so, False)) for so in stream_opts) and not stream: + raise ValueError("Stream options can only be defined when `stream=True`.") + + return data + + +# Translation response objects +class TranslationResponse(OpenAIBaseModel): + text: str + """The translated text.""" + + +class TranslationWord(OpenAIBaseModel): + end: float + """End time of the word in seconds.""" + + start: float + """Start time of the word in seconds.""" + + word: str + """The text content of the word.""" + + +class TranslationSegment(OpenAIBaseModel): + id: int + """Unique identifier of the segment.""" + + avg_logprob: float + """Average logprob of the segment. + + If the value is lower than -1, consider the logprobs failed. + """ + + compression_ratio: float + """Compression ratio of the segment. + + If the value is greater than 2.4, consider the compression failed. + """ + + end: float + """End time of the segment in seconds.""" + + no_speech_prob: float + """Probability of no speech in the segment. + + If the value is higher than 1.0 and the `avg_logprob` is below -1, consider + this segment silent. 
+ """ + + seek: int + """Seek offset of the segment.""" + + start: float + """Start time of the segment in seconds.""" + + temperature: float + """Temperature parameter used for generating the segment.""" + + text: str + """Text content of the segment.""" + + tokens: list[int] + """Array of token IDs for the text content.""" + + +class TranslationResponseVerbose(OpenAIBaseModel): + duration: str + """The duration of the input audio.""" + + language: str + """The language of the input audio.""" + + text: str + """The translated text.""" + + segments: list[TranslationSegment] | None = None + """Segments of the translated text and their corresponding details.""" + + words: list[TranslationWord] | None = None + """Extracted words and their corresponding timestamps.""" + + +####### Tokens IN <> Tokens OUT ####### +class GenerateRequest(BaseModel): + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." + ), + ) + token_ids: list[int] + """The token ids to generate text from.""" + + # features: MultiModalFeatureSpec + # TODO (NickLucche): implement once Renderer work is completed + features: str | None = None + """The processed MM inputs for the model.""" + + sampling_params: SamplingParams + """The sampling parameters for the model.""" + + model: str | None = None + + stream: bool | None = False + stream_options: StreamOptions | None = None + cache_salt: str | None = Field( + default=None, + description=( + "If specified, the prefix cache will be salted with the provided " + "string to prevent an attacker to guess prompts in multi-user " + "environments. The salt should be random, protected from " + "access by 3rd parties, and long enough to be " + "unpredictable (e.g., 43 characters base64-encoded, corresponding " + "to 256 bit)." 
+ ), + ) + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling." + ), + ) + kv_transfer_params: dict[str, Any] | None = Field( + default=None, + description="KVTransfer parameters used for disaggregated serving.", + ) + + +class GenerateResponseChoice(BaseModel): + index: int + logprobs: ChatCompletionLogProbs | None = None + # per OpenAI spec this is the default + finish_reason: str | None = "stop" + token_ids: list[int] | None = None + + +class GenerateResponse(BaseModel): + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." + ), + ) + choices: list[GenerateResponseChoice] + + prompt_logprobs: list[dict[int, Logprob] | None] | None = None + + kv_transfer_params: dict[str, Any] | None = Field( + default=None, + description="KVTransfer parameters used for disaggregated serving.", + ) diff --git a/entrypoints/openai/run_batch.py b/entrypoints/openai/run_batch.py new file mode 100644 index 0000000..4b9dba0 --- /dev/null +++ b/entrypoints/openai/run_batch.py @@ -0,0 +1,547 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import asyncio +import tempfile +from argparse import Namespace +from collections.abc import Awaitable, Callable +from http import HTTPStatus +from io import StringIO + +import aiohttp +import torch +from prometheus_client import start_http_server +from tqdm import tqdm + +from vllm.engine.arg_utils import AsyncEngineArgs, optional_type +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.protocol import ( + 
BatchRequestInput, + BatchRequestOutput, + BatchResponseData, + ChatCompletionResponse, + EmbeddingResponse, + ErrorResponse, + RerankResponse, + ScoreResponse, +) +from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding +from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels +from vllm.entrypoints.openai.serving_score import ServingScores +from vllm.logger import init_logger +from vllm.reasoning import ReasoningParserManager +from vllm.utils import random_uuid +from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.version import __version__ as VLLM_VERSION + +logger = init_logger(__name__) + + +def make_arg_parser(parser: FlexibleArgumentParser): + parser.add_argument( + "-i", + "--input-file", + required=True, + type=str, + help="The path or url to a single input file. Currently supports local file " + "paths, or the http protocol (http or https). If a URL is specified, " + "the file should be available via HTTP GET.", + ) + parser.add_argument( + "-o", + "--output-file", + required=True, + type=str, + help="The path or url to a single output file. Currently supports " + "local file paths, or web (http or https) urls. If a URL is specified," + " the file should be available via HTTP PUT.", + ) + parser.add_argument( + "--output-tmp-dir", + type=str, + default=None, + help="The directory to store the output file before uploading it " + "to the output URL.", + ) + parser.add_argument( + "--response-role", + type=optional_type(str), + default="assistant", + help="The role name to return if `request.add_generation_prompt=True`.", + ) + + parser = AsyncEngineArgs.add_cli_args(parser) + + parser.add_argument( + "--max-log-len", + type=int, + default=None, + help="Max number of prompt characters or prompt " + "ID numbers being printed in log." 
+ "\n\nDefault: Unlimited", + ) + + parser.add_argument( + "--enable-metrics", action="store_true", help="Enable Prometheus metrics" + ) + parser.add_argument( + "--url", + type=str, + default="0.0.0.0", + help="URL to the Prometheus metrics server " + "(only needed if enable-metrics is set).", + ) + parser.add_argument( + "--port", + type=int, + default=8000, + help="Port number for the Prometheus metrics server " + "(only needed if enable-metrics is set).", + ) + parser.add_argument( + "--enable-prompt-tokens-details", + action="store_true", + default=False, + help="If set to True, enable prompt_tokens_details in usage.", + ) + parser.add_argument( + "--enable-force-include-usage", + action="store_true", + default=False, + help="If set to True, include usage on every request " + "(even when stream_options is not specified)", + ) + + return parser + + +def parse_args(): + parser = FlexibleArgumentParser(description="vLLM OpenAI-Compatible batch runner.") + return make_arg_parser(parser).parse_args() + + +# explicitly use pure text format, with a newline at the end +# this makes it impossible to see the animation in the progress bar +# but will avoid messing up with ray or multiprocessing, which wraps +# each line of output with some prefix. 
async def upload_data(
    output_url: str, data_or_file: str | bytes, from_file: bool
) -> None:
    """Upload in-memory data, or the contents of a local file, via HTTP PUT.

    The upload is attempted up to five times with a fixed five second pause
    between attempts, and stops at the first attempt that gets a 200 response.

    Args:
        output_url: The URL to upload to.
        data_or_file: Either the payload itself or, when ``from_file`` is
            true, the path of the local file whose contents are uploaded.
        from_file: If True, ``data_or_file`` is a path to open and stream.

    Raises:
        Exception: If the final attempt fails. The underlying error (a
            non-200 status, a timeout, a connection or file error) is
            chained as ``__cause__``.
    """
    # Timeout is a common issue when uploading large files.
    # We retry max_retries times before giving up.
    max_retries = 5
    # Number of seconds to wait before retrying.
    delay = 5

    async def _put(session, payload, what: str) -> None:
        """PUT ``payload`` to ``output_url``; raise on any non-200 status."""
        async with session.put(output_url, data=payload) as response:
            if response.status != 200:
                # ``text()`` is a coroutine: it must be awaited, otherwise
                # the message would contain a coroutine repr, not the body.
                body = await response.text()
                raise Exception(
                    f"Failed to upload {what}.\n"
                    f"Status: {response.status}\n"
                    f"Response: {body}"
                )

    for attempt in range(1, max_retries + 1):
        try:
            # We increase the timeout to 1000 seconds to allow
            # for large files (default is 300).
            async with aiohttp.ClientSession(
                timeout=aiohttp.ClientTimeout(total=1000)
            ) as session:
                if from_file:
                    with open(data_or_file, "rb") as file:
                        await _put(session, file, "file")
                else:
                    await _put(session, data_or_file, "data")
        except Exception as e:
            if attempt < max_retries:
                logger.error(
                    "Failed to upload data (attempt %d). Error message: %s.\nRetrying in %d seconds...",  # noqa: E501
                    attempt,
                    e,
                    delay,
                )
                await asyncio.sleep(delay)
            else:
                raise Exception(
                    f"Failed to upload data (attempt {attempt}). Error message: {str(e)}."  # noqa: E501
                ) from e
        else:
            # Success: stop here. Without this the loop would run all
            # remaining attempts and upload the same payload again.
            return
def make_error_request_output(
    request: BatchRequestInput, error_msg: str
) -> BatchRequestOutput:
    """Build the batch output record for a request that could not be run.

    Args:
        request: The batch request being rejected; its ``custom_id`` is
            copied to the output.
        error_msg: The text stored in the output's ``error`` field.

    Returns:
        A ``BatchRequestOutput`` whose response carries
        ``HTTPStatus.BAD_REQUEST`` and freshly generated identifiers.
    """
    # Generate the identifiers in the same order as before: output id first,
    # then the request id embedded in the response.
    output_id = f"vllm-{random_uuid()}"
    rejected_response = BatchResponseData(
        status_code=HTTPStatus.BAD_REQUEST,
        request_id=f"vllm-batch-{random_uuid()}",
    )
    return BatchRequestOutput(
        id=output_id,
        custom_id=request.custom_id,
        response=rejected_response,
        error=error_msg,
    )
def _resolve_batch_handler(
    url: str,
    serving_chat: OpenAIServingChat | None,
    serving_embedding: OpenAIServingEmbedding | None,
    serving_scores: ServingScores | None,
) -> tuple[Callable | None, str | None]:
    """Map a batch request URL to its handler and the API's display name.

    Returns:
        ``(handler, api_name)``. ``api_name`` is None when the URL is not a
        supported endpoint. ``handler`` is None when the endpoint is known
        but the matching serving object was not created for this model.
    """
    if url == "/v1/chat/completions":
        handler = (
            serving_chat.create_chat_completion if serving_chat is not None else None
        )
        return handler, "Chat Completions"
    if url == "/v1/embeddings":
        handler = (
            serving_embedding.create_embedding
            if serving_embedding is not None
            else None
        )
        return handler, "Embeddings"
    # Score and rerank are matched by suffix, not by exact path.
    if url.endswith("/score"):
        handler = serving_scores.create_score if serving_scores is not None else None
        return handler, "Scores"
    if url.endswith("/rerank"):
        handler = serving_scores.do_rerank if serving_scores is not None else None
        return handler, "Rerank"
    return None, None


async def run_batch(
    engine_client: EngineClient,
    args: Namespace,
) -> None:
    """Run every request in the batch input file and write the results.

    Builds the serving objects the model's supported tasks allow, reads
    ``args.input_file`` as one JSON request per non-empty line, dispatches
    each request by URL, awaits all of them together, and writes the
    outputs to ``args.output_file``.

    Requests for an unsupported URL, or for an API the model does not
    support, are not executed; they produce an error output record instead.

    Args:
        engine_client: The engine client used by all serving objects.
        args: Parsed command-line arguments of the batch runner.
    """
    if args.served_model_name is not None:
        served_model_names = args.served_model_name
    else:
        served_model_names = [args.model]

    if args.enable_log_requests:
        request_logger = RequestLogger(max_log_len=args.max_log_len)
    else:
        request_logger = None

    base_model_paths = [
        BaseModelPath(name=name, model_path=args.model) for name in served_model_names
    ]

    model_config = engine_client.model_config
    supported_tasks = await engine_client.get_supported_tasks()
    logger.info("Supported tasks: %s", supported_tasks)

    # Create the openai serving objects.
    openai_serving_models = OpenAIServingModels(
        engine_client=engine_client,
        base_model_paths=base_model_paths,
        lora_modules=None,
    )

    openai_serving_chat = (
        OpenAIServingChat(
            engine_client,
            openai_serving_models,
            args.response_role,
            request_logger=request_logger,
            chat_template=None,
            chat_template_content_format="auto",
            reasoning_parser=args.structured_outputs_config.reasoning_parser,
            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
            enable_force_include_usage=args.enable_force_include_usage,
        )
        if "generate" in supported_tasks
        else None
    )

    openai_serving_embedding = (
        OpenAIServingEmbedding(
            engine_client,
            openai_serving_models,
            request_logger=request_logger,
            chat_template=None,
            chat_template_content_format="auto",
        )
        if "embed" in supported_tasks
        else None
    )

    enable_serving_reranking = (
        "classify" in supported_tasks
        and getattr(model_config.hf_config, "num_labels", 0) == 1
    )

    openai_serving_scores = (
        ServingScores(
            engine_client,
            openai_serving_models,
            request_logger=request_logger,
        )
        if ("embed" in supported_tasks or enable_serving_reranking)
        else None
    )

    tracker = BatchProgressTracker()
    logger.info("Reading batch from %s...", args.input_file)

    # Submit all requests in the file to the engine "concurrently".
    response_futures: list[Awaitable[BatchRequestOutput]] = []
    for request_json in (await read_file(args.input_file)).strip().split("\n"):
        # Skip empty lines.
        request_json = request_json.strip()
        if not request_json:
            continue

        request = BatchRequestInput.model_validate_json(request_json)

        # Determine the type of request and run it.
        handler_fn, api_name = _resolve_batch_handler(
            request.url,
            openai_serving_chat,
            openai_serving_embedding,
            openai_serving_scores,
        )

        if api_name is None:
            response_futures.append(
                make_async_error_request_output(
                    request,
                    error_msg=f"URL {request.url} was used. "
                    "Supported endpoints: /v1/chat/completions, /v1/embeddings,"
                    " /score, /rerank. "
                    "See vllm/entrypoints/openai/api_server.py for supported "
                    "score/rerank versions.",
                )
            )
        elif handler_fn is None:
            response_futures.append(
                make_async_error_request_output(
                    request,
                    error_msg=f"The model does not support {api_name} API",
                )
            )
        else:
            response_futures.append(run_request(handler_fn, request, tracker))
            # Only requests that are actually run count towards the
            # progress bar total.
            tracker.submitted()

    with tracker.pbar():
        responses = await asyncio.gather(*response_futures)

    await write_file(args.output_file, responses, args.output_tmp_dir)
vllm.entrypoints.harmony_utils import ( + get_developer_message, + get_stop_tokens_for_assistant_actions, + get_streamable_parser_for_assistant, + get_system_message, + parse_chat_output, + parse_input_to_harmony_message, + render_for_completion, +) +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.protocol import ( + ChatCompletionLogProb, + ChatCompletionLogProbs, + ChatCompletionLogProbsContent, + ChatCompletionNamedToolChoiceParam, + ChatCompletionRequest, + ChatCompletionResponse, + ChatCompletionResponseChoice, + ChatCompletionResponseStreamChoice, + ChatCompletionStreamResponse, + ChatMessage, + DeltaFunctionCall, + DeltaMessage, + DeltaToolCall, + ErrorResponse, + PromptTokenUsageInfo, + RequestResponseMetadata, + ToolCall, + UsageInfo, +) +from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs +from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.entrypoints.openai.tool_parsers import ToolParser +from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolCall +from vllm.entrypoints.utils import get_max_tokens, should_include_usage +from vllm.inputs.data import TokensPrompt as EngineTokensPrompt +from vllm.logger import init_logger +from vllm.logprobs import Logprob +from vllm.outputs import CompletionOutput, RequestOutput +from vllm.sampling_params import BeamSearchParams, SamplingParams +from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.transformers_utils.tokenizers import ( + maybe_serialize_tool_calls, + truncate_tool_call_ids, + validate_request_params, +) +from vllm.utils.collection_utils import as_list +from vllm.v1.sample.logits_processor import validate_logits_processors_parameters + +logger = init_logger(__name__) + + +class OpenAIServingChat(OpenAIServing): + def __init__( + self, + engine_client: EngineClient, + models: OpenAIServingModels, + response_role: str, + *, + request_logger: 
RequestLogger | None, + chat_template: str | None, + chat_template_content_format: ChatTemplateContentFormatOption, + trust_request_chat_template: bool = False, + return_tokens_as_token_ids: bool = False, + reasoning_parser: str = "", + enable_auto_tools: bool = False, + exclude_tools_when_tool_choice_none: bool = False, + tool_parser: str | None = None, + enable_prompt_tokens_details: bool = False, + enable_force_include_usage: bool = False, + enable_log_outputs: bool = False, + log_error_stack: bool = False, + ) -> None: + super().__init__( + engine_client=engine_client, + models=models, + request_logger=request_logger, + return_tokens_as_token_ids=return_tokens_as_token_ids, + log_error_stack=log_error_stack, + ) + + self.response_role = response_role + self.chat_template = chat_template + self.chat_template_content_format: Final = chat_template_content_format + self.trust_request_chat_template = trust_request_chat_template + self.enable_log_outputs = enable_log_outputs + + # set up logits processors + self.logits_processors = self.model_config.logits_processors + + # set up reasoning parser + self.reasoning_parser = self._get_reasoning_parser( + reasoning_parser_name=reasoning_parser + ) + # set up tool use + self.enable_auto_tools: bool = enable_auto_tools + self.tool_parser = self._get_tool_parser( + tool_parser_name=tool_parser, enable_auto_tools=enable_auto_tools + ) + self.exclude_tools_when_tool_choice_none = exclude_tools_when_tool_choice_none + + self.enable_prompt_tokens_details = enable_prompt_tokens_details + self.enable_force_include_usage = enable_force_include_usage + self.default_sampling_params = self.model_config.get_diff_sampling_param() + if self.default_sampling_params: + source = self.model_config.generation_config + source = "model" if source == "auto" else source + logger.info( + "Using default chat sampling params from %s: %s", + source, + self.default_sampling_params, + ) + if self.model_config.hf_config.model_type == "kimi_k2": + 
self.tool_call_id_type = "kimi_k2" + else: + self.tool_call_id_type = "random" + + self.use_harmony = self.model_config.hf_config.model_type == "gpt_oss" + if self.use_harmony: + if "stop_token_ids" not in self.default_sampling_params: + self.default_sampling_params["stop_token_ids"] = [] + self.default_sampling_params["stop_token_ids"].extend( + get_stop_tokens_for_assistant_actions() + ) + + # NOTE(woosuk): While OpenAI's chat completion API supports browsing + # for some models, currently vLLM doesn't support it. Please use the + # Responses API instead. + self.supports_browsing = False + self.browser_tool = None + # NOTE(woosuk): Chat completion API does not support code interpreter. + # Please use the Responses API instead. + self.supports_code_interpreter = False + self.python_tool = None + + async def create_chat_completion( + self, + request: ChatCompletionRequest, + raw_request: Request | None = None, + ) -> AsyncGenerator[str, None] | ChatCompletionResponse | ErrorResponse: + """ + Chat Completion API similar to OpenAI's API. + + See https://platform.openai.com/docs/api-reference/chat/create + for the API specification. This API mimics the OpenAI + Chat Completion API. + """ + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + logger.error("Error with model %s", error_check_ret) + return error_check_ret + + # If the engine is dead, raise the engine's DEAD_ERROR. + # This is required for the streaming case, where we return a + # success status before we actually start generating text :). 
+ if self.engine_client.errored: + raise self.engine_client.dead_error + + try: + lora_request = self._maybe_get_adapters( + request, supports_default_mm_loras=True + ) + + model_name = self.models.model_name(lora_request) + + tokenizer = await self.engine_client.get_tokenizer() + + tool_parser = self.tool_parser + + if isinstance(tokenizer, MistralTokenizer): + # because of issues with pydantic we need to potentially + # re-serialize the tool_calls field of the request + # for more info: see comment in `maybe_serialize_tool_calls` + maybe_serialize_tool_calls(request) + truncate_tool_call_ids(request) + validate_request_params(request) + + if ( + request.tool_choice == "auto" + and not (self.enable_auto_tools and tool_parser is not None) + and not isinstance(tokenizer, MistralTokenizer) + and not self.use_harmony + ): + # for hf tokenizers, "auto" tools requires + # --enable-auto-tool-choice and --tool-call-parser + return self.create_error_response( + '"auto" tool choice requires ' + "--enable-auto-tool-choice and --tool-call-parser to be set" + ) + + if request.tools is None or ( + request.tool_choice == "none" + and self.exclude_tools_when_tool_choice_none + ): + tool_dicts = None + else: + tool_dicts = [tool.model_dump() for tool in request.tools] + + if not self.use_harmony: + # Common case. 
+ error_check_ret = self._validate_chat_template( + request_chat_template=request.chat_template, + chat_template_kwargs=request.chat_template_kwargs, + trust_request_chat_template=self.trust_request_chat_template, + ) + if error_check_ret is not None: + return error_check_ret + ( + conversation, + request_prompts, + engine_prompts, + ) = await self._preprocess_chat( + request, + tokenizer, + request.messages, + chat_template=request.chat_template or self.chat_template, + chat_template_content_format=self.chat_template_content_format, + add_generation_prompt=request.add_generation_prompt, + continue_final_message=request.continue_final_message, + tool_dicts=tool_dicts, + documents=request.documents, + chat_template_kwargs=request.chat_template_kwargs, + tool_parser=tool_parser, + add_special_tokens=request.add_special_tokens, + ) + else: + # For GPT-OSS. + ( + conversation, + request_prompts, + engine_prompts, + ) = self._make_request_with_harmony(request) + except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(f"{e} {e.__cause__}") + + request_id = ( + f"chatcmpl-{self._base_request_id(raw_request, request.request_id)}" + ) + + request_metadata = RequestResponseMetadata(request_id=request_id) + if raw_request: + raw_request.state.request_metadata = request_metadata + + # Extract data_parallel_rank from header (router can inject it) + data_parallel_rank = self._get_data_parallel_rank(raw_request) + + # Schedule the request and get the result generator. 
+ generators: list[AsyncGenerator[RequestOutput, None]] = [] + try: + for i, engine_prompt in enumerate(engine_prompts): + prompt_text, _, _ = self._get_prompt_components(request_prompts[i]) + + if self.default_sampling_params is None: + self.default_sampling_params = {} + + max_tokens = get_max_tokens( + max_model_len=self.max_model_len, + request=request, + input_length=len(engine_prompt["prompt_token_ids"]), + default_sampling_params=self.default_sampling_params, + ) + + sampling_params: SamplingParams | BeamSearchParams + if request.use_beam_search: + sampling_params = request.to_beam_search_params( + max_tokens, self.default_sampling_params + ) + else: + sampling_params = request.to_sampling_params( + max_tokens, + self.model_config.logits_processor_pattern, + self.default_sampling_params, + ) + validate_logits_processors_parameters( + self.logits_processors, + sampling_params, + ) + + self._log_inputs( + request_id, + request_prompts[i], + params=sampling_params, + lora_request=lora_request, + ) + + trace_headers = ( + None + if raw_request is None + else await self._get_trace_headers(raw_request.headers) + ) + + if isinstance(sampling_params, BeamSearchParams): + generator = self.beam_search( + prompt=engine_prompt, + request_id=request_id, + params=sampling_params, + lora_request=lora_request, + ) + else: + engine_request, tokenization_kwargs = await self._process_inputs( + request_id, + engine_prompt, + sampling_params, + lora_request=lora_request, + trace_headers=trace_headers, + priority=request.priority, + ) + + generator = self.engine_client.generate( + engine_request, + sampling_params, + request_id, + lora_request=lora_request, + trace_headers=trace_headers, + priority=request.priority, + prompt_text=prompt_text, + tokenization_kwargs=tokenization_kwargs, + data_parallel_rank=data_parallel_rank, + ) + + generators.append(generator) + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + 
assert len(generators) == 1 + (result_generator,) = generators + + # Streaming response + if request.stream: + return self.chat_completion_stream_generator( + request, + result_generator, + request_id, + model_name, + conversation, + tokenizer, + request_metadata, + ) + + try: + return await self.chat_completion_full_generator( + request, + result_generator, + request_id, + model_name, + conversation, + tokenizer, + request_metadata, + ) + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + def get_chat_request_role(self, request: ChatCompletionRequest) -> str: + if request.add_generation_prompt: + return self.response_role + return request.messages[-1]["role"] + + @staticmethod + def _bracket_level(s: str, opening="{", closing="}") -> int: + """ + Calculate the current level of nested brackets in a given string. + """ + level = 0 + for char in s: + if char == opening: + level += 1 + elif char == closing: + level -= 1 + return level + + @staticmethod + def _filter_delta_text(delta_text: str, previous_text: str) -> tuple[str, bool]: + # remove last '},' of the tool definition stemming from the + # "name"/"parameters" outer object or closing ']' of the tool list + # count occurrences of opening and closing curly braces and + # once level 0 is reached stop outputting text + # if 0 is reached while parsing the delta_text we know the current + # tool will finish in this current iteration + bracket_level = OpenAIServingChat._bracket_level(previous_text) + updated_delta, passed_zero = "", False + for c in delta_text: + if c == "{": + bracket_level += 1 + passed_zero = bracket_level == 0 + elif c == "}": + bracket_level -= 1 + passed_zero = bracket_level == 0 + + if bracket_level != 0: + updated_delta += c + else: + # if a comma is reached at level 0 we can stop + if c == ",": + break + return updated_delta, passed_zero + + def extract_tool_call_required_streaming( + self, + previous_text: str, + 
current_text: str | None, + delta_text: str, + function_name_returned: bool, + tool_call_idx: int | None = None, + ) -> tuple[DeltaMessage | None, bool]: + if current_text is None or current_text == "": + # if the current text is empty, we cannot parse it + return None, function_name_returned + try: + obj = partial_json_parser.loads(current_text) + except partial_json_parser.core.exceptions.MalformedJSON: + logger.debug("not enough tokens to parse into JSON yet") + obj = None + + # check if the current text is a valid array + # containing a partial tool calling object + # if not repeat + if obj is None or not isinstance(obj, list) or not len(obj) > 0: + function_name_returned = False + delta_message = None + else: + _, finishes_previous_tool = OpenAIServingChat._filter_delta_text( + delta_text, previous_text + ) + # take the last tool call from the generated list + current_tool_call = obj[-1] + + # once parameters have been generated the name is complete as well + if not finishes_previous_tool and ( + "name" not in current_tool_call or "parameters" not in current_tool_call + ): + function_name_returned = False + delta_message = None + else: + if not function_name_returned: + # get partly generated arguments from the latest tool call + param_match = re.search( + r'.*"parameters":\s*(.*)', current_text, re.DOTALL + ) + arguments = param_match.group(1) if param_match else "" + arguments, _ = OpenAIServingChat._filter_delta_text( + arguments, previous_text + ) + + # if this iteration finishes a previous tool call but a + # new incomplete tool is already generated, take the + # previous from the list + if finishes_previous_tool and "parameters" not in current_tool_call: + current_tool_call = obj[-2] + + function_name_returned = True + tool_call_id = make_tool_call_id( + id_type=self.tool_call_id_type, + func_name=current_tool_call["name"], + idx=tool_call_idx, + ) + delta_message = DeltaMessage( + tool_calls=[ + DeltaToolCall( + id=tool_call_id, + 
function=DeltaFunctionCall( + name=current_tool_call["name"], arguments=arguments + ), + index=len(obj) - 1, + type="function", + ) + ] + ) + + else: + delta_text, _ = OpenAIServingChat._filter_delta_text( + delta_text, previous_text + ) + + if delta_text != "": + delta_message = DeltaMessage( + tool_calls=[ + DeltaToolCall( + function=DeltaFunctionCall( + # OpenAI API returns None + # instead of name every time + name=None, + arguments=delta_text, + ), + index=len(obj) - 1, + ) + ] + ) + else: + delta_message = None + + return delta_message, function_name_returned + + async def chat_completion_stream_generator( + self, + request: ChatCompletionRequest, + result_generator: AsyncIterator[RequestOutput], + request_id: str, + model_name: str, + conversation: list[ConversationMessage], + tokenizer: AnyTokenizer, + request_metadata: RequestResponseMetadata, + ) -> AsyncGenerator[str, None]: + created_time = int(time.time()) + chunk_object_type: Final = "chat.completion.chunk" + first_iteration = True + + # Send response for each token for each request.n (index) + num_choices = 1 if request.n is None else request.n + previous_num_tokens = [0] * num_choices + finish_reason_sent = [False] * num_choices + num_prompt_tokens = 0 + num_cached_tokens = None + if self.use_harmony: + harmony_parsers = [ + get_streamable_parser_for_assistant() for _ in range(num_choices) + ] + harmony_tools_streamed = [False] * num_choices + tools_streamed = [False] * num_choices + + if isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam): + tool_choice_function_name = request.tool_choice.function.name + else: + tool_choice_function_name = None + + # Determine whether tools are in use with "auto" tool choice + tool_choice_auto = ( + not tool_choice_function_name + and self._should_stream_with_auto_tool_parsing(request) + ) + + all_previous_token_ids: list[list[int]] | None + function_name_returned = [False] * num_choices + if self.tool_call_id_type == "kimi_k2": + 
history_tool_call_cnt = get_history_tool_calls_cnt(conversation) + else: + history_tool_call_cnt = 0 + + # Always track previous_texts for comprehensive output logging + previous_texts = [""] * num_choices + + # Only one of these will be used, thus previous_texts and + # all_previous_token_ids will not be used twice in the same iteration. + if tool_choice_auto or self.reasoning_parser: + # These are only required in "auto" tool choice case + all_previous_token_ids = [[]] * num_choices + # For reasoning parser and tool call all enabled + added_content_delta_arr = [False] * num_choices + reasoning_end_arr = [False] * num_choices + else: + all_previous_token_ids = None + + try: + if self.reasoning_parser: + reasoning_parser = self.reasoning_parser( + tokenizer, + chat_template_kwargs=request.chat_template_kwargs, # type: ignore + ) + except RuntimeError as e: + logger.exception("Error in reasoning parser creation.") + data = self.create_streaming_error_response(str(e)) + yield f"data: {data}\n\n" + yield "data: [DONE]\n\n" + return + # Prepare the tool parser if it's needed + try: + if tool_choice_auto and self.tool_parser: + tool_parsers: list[ToolParser | None] = [ + self.tool_parser(tokenizer) + ] * num_choices + else: + tool_parsers = [None] * num_choices + except Exception as e: + logger.exception("Error in tool parser creation.") + data = self.create_streaming_error_response(str(e)) + yield f"data: {data}\n\n" + yield "data: [DONE]\n\n" + return + + stream_options = request.stream_options + include_usage, include_continuous_usage = should_include_usage( + stream_options, self.enable_force_include_usage + ) + + try: + async for res in result_generator: + if res.prompt_token_ids is not None: + num_prompt_tokens = len(res.prompt_token_ids) + if res.encoder_prompt_token_ids is not None: + num_prompt_tokens += len(res.encoder_prompt_token_ids) + + # We need to do it here, because if there are exceptions in + # the result_generator, it needs to be sent as the FIRST + 
# response (by the try...catch). + if first_iteration: + num_cached_tokens = res.num_cached_tokens + # Send first response for each request.n (index) with + # the role + role = self.get_chat_request_role(request) + + # NOTE num_choices defaults to 1 so this usually executes + # once per request + for i in range(num_choices): + choice_data = ChatCompletionResponseStreamChoice( + index=i, + delta=DeltaMessage( + role=role, + content="", + ), + logprobs=None, + finish_reason=None, + ) + + # return prompt_token_ids at the first chunk ever + chunk = ChatCompletionStreamResponse( + id=request_id, + object=chunk_object_type, + created=created_time, + choices=[choice_data], + model=model_name, + prompt_token_ids=( + res.prompt_token_ids + if request.return_token_ids + else None + ), + ) + + # if continuous usage stats are requested, add it + if include_continuous_usage: + chunk.usage = UsageInfo( + prompt_tokens=num_prompt_tokens, + completion_tokens=0, + total_tokens=num_prompt_tokens, + ) + + data = chunk.model_dump_json(exclude_unset=True) + yield f"data: {data}\n\n" + + # Send response to echo the input portion of the + # last message + if request.echo: + last_msg_content: str | list[dict[str, str]] = "" + if ( + conversation + and "content" in conversation[-1] + and conversation[-1].get("role") == role + ): + last_msg_content = conversation[-1]["content"] or "" + + if last_msg_content: + for i in range(num_choices): + choice_data = ChatCompletionResponseStreamChoice( + index=i, + delta=DeltaMessage(content=last_msg_content), + logprobs=None, + finish_reason=None, + ) + chunk = ChatCompletionStreamResponse( + id=request_id, + object=chunk_object_type, + created=created_time, + choices=[choice_data], + model=model_name, + ) + if include_continuous_usage: + chunk.usage = UsageInfo( + prompt_tokens=num_prompt_tokens, + completion_tokens=0, + total_tokens=num_prompt_tokens, + ) + + data = chunk.model_dump_json(exclude_unset=True) + yield f"data: {data}\n\n" + 
first_iteration = False + + for output in res.outputs: + i = output.index + tool_parser = tool_parsers[i] + + if finish_reason_sent[i]: + continue + + if request.logprobs and request.top_logprobs is not None: + assert output.logprobs is not None, "Did not output logprobs" + logprobs = self._create_chat_logprobs( + token_ids=output.token_ids, + top_logprobs=output.logprobs, + tokenizer=tokenizer, + num_output_top_logprobs=request.top_logprobs, + return_as_token_id=request.return_tokens_as_token_ids, + ) + else: + logprobs = None + + if self.use_harmony: + harmony_parser = harmony_parsers[i] + prev_recipient = harmony_parser.current_recipient + delta_text = "" + for token_id in output.token_ids: + harmony_parser.process(token_id) + delta_text += harmony_parser.last_content_delta or "" + cur_channel = harmony_parser.current_channel + cur_recipient = harmony_parser.current_recipient + else: + delta_text = output.text + + if ( + not delta_text + and not output.token_ids + and not previous_num_tokens[i] + ): + # Chunked prefill case, don't return empty chunks + continue + + delta_message: DeltaMessage | None + + # just update previous_texts and previous_token_ids + if tool_choice_auto or self.reasoning_parser: + assert previous_texts is not None + assert all_previous_token_ids is not None + previous_text = previous_texts[i] + previous_token_ids = all_previous_token_ids[i] + current_text = previous_text + delta_text + # avoid the None + list error. 
+ if previous_token_ids: + current_token_ids = previous_token_ids + as_list( + output.token_ids + ) + else: + current_token_ids = as_list(output.token_ids) + + if self.use_harmony: + if cur_channel == "final": + delta_message = DeltaMessage(content=delta_text) + elif cur_channel == "analysis": + if request.include_reasoning: + delta_message = DeltaMessage(reasoning=delta_text) + else: + delta_message = None + elif ( + cur_channel == "commentary" + and cur_recipient + and cur_recipient.startswith("functions.") + ): + # Count completed tool calls to determine index + base_index = 0 + for msg in harmony_parser.messages: + if ( + msg.channel == "commentary" + and msg.recipient + and msg.recipient.startswith("functions.") + ): + base_index += 1 + + if prev_recipient != cur_recipient: + tool_name = cur_recipient.split("functions.", 1)[1] + delta_message = DeltaMessage( + tool_calls=[ + DeltaToolCall( + id=make_tool_call_id(), + type="function", + function=DeltaFunctionCall( + name=tool_name, + arguments="", + ), + index=base_index, + ) + ] + ) + elif delta_text: + delta_message = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=base_index, + function=DeltaFunctionCall( + arguments=delta_text + ), + ) + ] + ) + else: + delta_message = None + + if delta_message is not None: + harmony_tools_streamed[i] = True + else: + delta_message = None + # handle streaming deltas for tools with named tool_choice + elif tool_choice_function_name: + if ( + self.reasoning_parser + and not reasoning_end_arr[i] + and not reasoning_parser.is_reasoning_end( + previous_token_ids + ) + ): + assert reasoning_parser is not None + delta_message = ( + reasoning_parser.extract_reasoning_streaming( + previous_text, + current_text, + delta_text, + previous_token_ids, + current_token_ids, + output.token_ids, + ) + ) + # When encountering think end id in delta_token_ids + # or think end id in prompt_token_ids + # i.e {"enable_thinking": False}, + # set reasoning status to end. 
+ # Only keep 'content', remove 'reasoning'. + if reasoning_parser.is_reasoning_end( + as_list(output.token_ids) + ) or ( + res.prompt_token_ids + and reasoning_parser.is_reasoning_end( + res.prompt_token_ids + ) + ): + reasoning_end_arr[i] = True + if delta_message and delta_message.content: + # This need to be added to next `delta_text` + current_text = delta_message.content + delta_message.content = None + else: + current_text = "" + else: + # Just to add remaining `content` + if self.reasoning_parser: + delta_text = previous_text + delta_text + current_text = "" + + if function_name_returned[i]: + delta_tool_call = DeltaToolCall( + function=DeltaFunctionCall(arguments=delta_text), + index=i, + ) + else: + delta_tool_call = DeltaToolCall( + id=make_tool_call_id(), + type="function", + function=DeltaFunctionCall( + name=tool_choice_function_name, + arguments=delta_text, + ), + index=i, + ) + function_name_returned[i] = True + + delta_message = DeltaMessage( + tool_calls=[ + delta_tool_call, + ] + ) + tools_streamed[i] = True + + elif request.tool_choice == "required": + assert previous_texts is not None + previous_text = previous_texts[i] + current_text = previous_text + delta_text + fn_name_returned = function_name_returned[i] + output_token_ids = as_list(output.token_ids) + + if ( + self.reasoning_parser is not None + and not reasoning_end_arr[i] + and res.prompt_token_ids + and reasoning_parser.is_reasoning_end(res.prompt_token_ids) + ): + reasoning_end_arr[i] = True + + if self.reasoning_parser and not reasoning_end_arr[i]: + delta_message = ( + reasoning_parser.extract_reasoning_streaming( + previous_text, + current_text, + delta_text, + previous_token_ids, + current_token_ids, + output_token_ids, + ) + ) + if reasoning_parser.is_reasoning_end(output_token_ids): + reasoning_end_arr[i] = True + if delta_message and delta_message.content: + current_text = delta_message.content + delta_message.content = None + else: + # reasoning ended + current_text = "" + + 
else: + # either finished reasoning or no reasoning at all + content = current_text + + delta_message, function_name_returned[i] = ( + self.extract_tool_call_required_streaming( + previous_text=previous_text, + current_text=content, + delta_text=delta_text, + function_name_returned=fn_name_returned, + tool_call_idx=history_tool_call_cnt, + ) + ) + if ( + delta_message + and delta_message.tool_calls + and delta_message.tool_calls[0].id is not None + ): + history_tool_call_cnt += 1 + tools_streamed[i] = True + + # handle streaming deltas for tools with "auto" tool choice + # and reasoning parser + elif tool_choice_auto and self.reasoning_parser: + assert tool_parser is not None + assert reasoning_parser is not None + assert added_content_delta_arr is not None + assert reasoning_end_arr is not None + output_token_ids = as_list(output.token_ids) + if not reasoning_end_arr[i]: + delta_message = ( + reasoning_parser.extract_reasoning_streaming( + previous_text, + current_text, + delta_text, + previous_token_ids, + current_token_ids, + output_token_ids, + ) + ) + # When encountering think end id in prompt_token_ids + # i.e {"enable_thinking": False}, + # set reasoning status to end. + # Remove the text and token ids related + # to 'reasoning'. + if ( + res.prompt_token_ids + and reasoning_parser.is_reasoning_end( + res.prompt_token_ids + ) + ): + reasoning_end_arr[i] = True + current_token_ids = output_token_ids + if delta_message and delta_message.content: + current_text = delta_message.content + delta_message.content = None + else: + current_text = "" + # When encountering think end id in delta_token_ids, + # set reasoning status to end. + # Remove the text and token ids related + # to 'reasoning'. 
+ if reasoning_parser.is_reasoning_end(output_token_ids): + reasoning_end_arr[i] = True + current_token_ids = ( + reasoning_parser.extract_content_ids( + output_token_ids + ) + ) + if delta_message and delta_message.content: + current_text = delta_message.content + delta_message.content = None + else: + current_text = "" + + # handle tool calls only after reasoning is done, + else: + delta_token_ids = output_token_ids + # First time to tool call, + # add the remaining text and token ids + # to delta from previous + if not added_content_delta_arr[i]: + added_content_delta_arr[i] = True + previous_text = "" + previous_token_ids = [] + delta_text = current_text + delta_token_ids = current_token_ids + + delta_message = tool_parser.extract_tool_calls_streaming( + previous_text=previous_text, + current_text=current_text, + delta_text=delta_text, + previous_token_ids=previous_token_ids, + current_token_ids=current_token_ids, + delta_token_ids=delta_token_ids, + request=request, + ) + if delta_message and delta_message.tool_calls: + tools_streamed[i] = True + # when only tool calls + elif tool_choice_auto: + assert tool_parser is not None + delta_message = tool_parser.extract_tool_calls_streaming( + previous_text=previous_text, + current_text=current_text, + delta_text=delta_text, + previous_token_ids=previous_token_ids, + current_token_ids=current_token_ids, + delta_token_ids=output.token_ids, + request=request, + ) + if delta_message and delta_message.tool_calls: + tools_streamed[i] = True + + # when only reasoning + elif self.reasoning_parser: + delta_message = reasoning_parser.extract_reasoning_streaming( + previous_text, + current_text, + delta_text, + previous_token_ids, + current_token_ids, + output.token_ids, + ) + # handle streaming just a content delta + else: + delta_message = DeltaMessage(content=delta_text) + + # update the previous values for the next iteration + if ( + tool_choice_auto or self.reasoning_parser + ) and not self.use_harmony: + assert 
previous_texts is not None + assert all_previous_token_ids is not None + previous_texts[i] = current_text + all_previous_token_ids[i] = current_token_ids + else: + # Update for comprehensive logging even in simple case + assert previous_texts is not None + previous_texts[i] += delta_text + + # set the previous values for the next iteration + previous_num_tokens[i] += len(output.token_ids) + + # if the message delta is None (e.g. because it was a + # "control token" for tool calls or the parser otherwise + # wasn't ready to send a token, then + # get the next token without streaming a chunk + if delta_message is None: + if output.finish_reason is None: + continue + else: + delta_message = DeltaMessage() + + # Log streaming delta if output logging is enabled + if self.enable_log_outputs and self.request_logger: + delta_content = "" + if delta_message.content: + delta_content = delta_message.content + elif delta_message.tool_calls: + delta_content = "".join( + tc.function.arguments + for tc in delta_message.tool_calls + if tc.function and tc.function.arguments + ) + + if delta_content: + self.request_logger.log_outputs( + request_id=request_id, + outputs=delta_content, + output_token_ids=as_list(output.token_ids), + finish_reason=output.finish_reason, + is_streaming=True, + delta=True, + ) + + if output.finish_reason is None: + # Send token-by-token response for each request.n + choice_data = ChatCompletionResponseStreamChoice( + index=i, + delta=delta_message, + logprobs=logprobs, + finish_reason=None, + token_ids=( + as_list(output.token_ids) + if request.return_token_ids + else None + ), + ) + + # if the model is finished generating + else: + # check to make sure we haven't "forgotten" to stream + # any tokens that were generated but previously + # matched by partial json parsing + # only happens if we are NOT using structured outputs + auto_tools_called = False + if tool_parser: + auto_tools_called = len(tool_parser.prev_tool_call_arr) > 0 + index = ( + 
len(tool_parser.prev_tool_call_arr) - 1 + if auto_tools_called + else 0 + ) + else: + index = 0 + + if ( + self._should_check_for_unstreamed_tool_arg_tokens( + delta_message, output + ) + and tool_parser + ): + latest_delta_len = 0 + if ( + isinstance( + delta_message.tool_calls[0].function, + DeltaFunctionCall, + ) + ) and isinstance( + delta_message.tool_calls[0].function.arguments, str + ): + latest_delta_len = len( + delta_message.tool_calls[0].function.arguments + ) + + # get the expected call based on partial JSON + # parsing which "autocompletes" the JSON + expected_call = json.dumps( + tool_parser.prev_tool_call_arr[index].get( + "arguments", {} + ), + ensure_ascii=False, + ) + + # get what we've streamed so far for arguments + # for the current tool + actual_call = tool_parser.streamed_args_for_tool[index] + if latest_delta_len > 0: + actual_call = actual_call[:-latest_delta_len] + + # check to see if there's anything left to stream + remaining_call = expected_call.replace(actual_call, "", 1) + # set that as a delta message + delta_message = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=index, + function=DeltaFunctionCall( + arguments=remaining_call + ).model_dump(exclude_none=True), + ) + ] + ) + + # Send the finish response for each request.n only once + # In OpenAI's API, when a tool is called, the + # finish_reason is: + # "tool_calls" for "auto" or "required" tool calls, + # and "stop" for named tool calls. 
+ if ( + auto_tools_called + or (tools_streamed[i] and not tool_choice_function_name) + or (self.use_harmony and harmony_tools_streamed[i]) + ): + finish_reason_ = "tool_calls" + else: + finish_reason_ = ( + output.finish_reason if output.finish_reason else "stop" + ) + choice_data = ChatCompletionResponseStreamChoice( + index=i, + delta=delta_message, + logprobs=logprobs, + finish_reason=finish_reason_, + stop_reason=output.stop_reason, + token_ids=( + as_list(output.token_ids) + if request.return_token_ids + else None + ), + ) + + finish_reason_sent[i] = True + + chunk = ChatCompletionStreamResponse( + id=request_id, + object=chunk_object_type, + created=created_time, + choices=[choice_data], + model=model_name, + ) + + # handle usage stats if requested & if continuous + if include_continuous_usage: + completion_tokens = previous_num_tokens[i] + chunk.usage = UsageInfo( + prompt_tokens=num_prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=num_prompt_tokens + completion_tokens, + ) + + data = chunk.model_dump_json(exclude_unset=True) + yield f"data: {data}\n\n" + + # once the final token is handled, if stream_options.include_usage + # is sent, send the usage + if include_usage: + completion_tokens = sum(previous_num_tokens) + final_usage = UsageInfo( + prompt_tokens=num_prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=num_prompt_tokens + completion_tokens, + ) + if self.enable_prompt_tokens_details and num_cached_tokens: + final_usage.prompt_tokens_details = PromptTokenUsageInfo( + cached_tokens=num_cached_tokens + ) + + final_usage_chunk = ChatCompletionStreamResponse( + id=request_id, + object=chunk_object_type, + created=created_time, + choices=[], + model=model_name, + usage=final_usage, + ) + final_usage_data = final_usage_chunk.model_dump_json( + exclude_unset=True, exclude_none=True + ) + yield f"data: {final_usage_data}\n\n" + + # report to FastAPI middleware aggregate usage across all choices + num_completion_tokens 
= sum(previous_num_tokens) + request_metadata.final_usage_info = UsageInfo( + prompt_tokens=num_prompt_tokens, + completion_tokens=num_completion_tokens, + total_tokens=num_prompt_tokens + num_completion_tokens, + ) + + # Log complete streaming response if output logging is enabled + if self.enable_log_outputs and self.request_logger: + # Log the complete response for each choice + for i in range(num_choices): + full_text = ( + previous_texts[i] + if previous_texts and i < len(previous_texts) + else f"" + ) + self.request_logger.log_outputs( + request_id=request_id, + outputs=full_text, + output_token_ids=None, # Consider also logging all token IDs + finish_reason="streaming_complete", + is_streaming=True, + delta=False, + ) + + except Exception as e: + # TODO: Use a vllm-specific Validation Error + logger.exception("Error in chat completion stream generator.") + data = self.create_streaming_error_response(str(e)) + yield f"data: {data}\n\n" + # Send the final done message after all response.n are finished + yield "data: [DONE]\n\n" + + async def chat_completion_full_generator( + self, + request: ChatCompletionRequest, + result_generator: AsyncIterator[RequestOutput], + request_id: str, + model_name: str, + conversation: list[ConversationMessage], + tokenizer: AnyTokenizer, + request_metadata: RequestResponseMetadata, + ) -> ErrorResponse | ChatCompletionResponse: + created_time = int(time.time()) + final_res: RequestOutput | None = None + + try: + async for res in result_generator: + final_res = res + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + assert final_res is not None + + choices: list[ChatCompletionResponseChoice] = [] + if self.tool_call_id_type == "kimi_k2": + history_tool_call_cnt = get_history_tool_calls_cnt(conversation) + else: + history_tool_call_cnt = 0 + + role = 
self.get_chat_request_role(request) + for output in final_res.outputs: + token_ids = output.token_ids + out_logprobs = output.logprobs + tool_call_info = None + + if request.logprobs and request.top_logprobs is not None: + assert out_logprobs is not None, "Did not output logprobs" + logprobs = self._create_chat_logprobs( + token_ids=token_ids, + top_logprobs=out_logprobs, + num_output_top_logprobs=request.top_logprobs, + tokenizer=tokenizer, + return_as_token_id=request.return_tokens_as_token_ids, + ) + else: + logprobs = None + + if self.use_harmony: + reasoning, content, _ = parse_chat_output(token_ids) + if not request.include_reasoning: + reasoning = None + + if self.tool_parser is not None: + tool_parser = self.tool_parser(tokenizer) + # NOTE: We use token_ids for openai tool parser + tool_call_info = tool_parser.extract_tool_calls( + "", + request=request, + token_ids=token_ids, # type: ignore + ) + content = tool_call_info.content + message = ChatMessage( + role=role, + reasoning=reasoning, + content=content, + tool_calls=tool_call_info.tool_calls, + ) + else: + message = ChatMessage( + role=role, + reasoning=reasoning, + content=content, + ) + + choice_data = ChatCompletionResponseChoice( + index=output.index, + message=message, + logprobs=logprobs, + finish_reason=( + "tool_calls" + if (tool_call_info is not None and tool_call_info.tools_called) + else output.finish_reason + if output.finish_reason + else "stop" + ), + stop_reason=output.stop_reason, + token_ids=( + as_list(output.token_ids) if request.return_token_ids else None + ), + ) + choices.append(choice_data) + continue + + if self.reasoning_parser: + try: + reasoning_parser = self.reasoning_parser( + tokenizer, + chat_template_kwargs=request.chat_template_kwargs, # type: ignore + ) + except RuntimeError as e: + logger.exception("Error in reasoning parser creation.") + return self.create_error_response(str(e)) + # If the reasoning parser is enabled, + # tool calls are extracted exclusively from the 
content. + reasoning, content = reasoning_parser.extract_reasoning( + output.text, request=request + ) + if not request.include_reasoning: + reasoning = None + else: + reasoning = None + content = output.text + + auto_tools_called = False + # if auto tools are not enabled, and a named tool choice using + # outlines is not being used + tool_calls, content = self._parse_tool_calls_from_content( + request=request, + tokenizer=tokenizer, + content=content, + enable_auto_tools=self.enable_auto_tools, + tool_parser_cls=self.tool_parser, + ) + tool_call_class = ( + MistralToolCall if isinstance(tokenizer, MistralTokenizer) else ToolCall + ) + if (not self.enable_auto_tools or not self.tool_parser) and ( + not isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam) + and request.tool_choice != "required" + ): + message = ChatMessage(role=role, reasoning=reasoning, content=content) + + # if the request uses tools and specified a tool choice + elif ( + request.tool_choice + and type(request.tool_choice) is ChatCompletionNamedToolChoiceParam + ): + assert tool_calls is not None and len(tool_calls) > 0 + message = ChatMessage( + role=role, + reasoning=reasoning, + content="", + tool_calls=[tool_call_class(function=tc) for tc in tool_calls], + ) + + elif request.tool_choice and request.tool_choice == "required": + tool_call_class_items = [] + assert tool_calls is not None and len(tool_calls) > 0 + for tool_call in tool_calls: + tool_call_class_items.append( + tool_call_class( + id=make_tool_call_id( + id_type=self.tool_call_id_type, + func_name=tool_call.name, + idx=history_tool_call_cnt, + ), + function=tool_call, + ) + ) + history_tool_call_cnt += 1 + message = ChatMessage( + role=role, + content="", + tool_calls=tool_call_class_items, + reasoning=reasoning, + ) + + # if the request doesn't use tool choice + # OR specifies to not use a tool + elif not request.tool_choice or request.tool_choice == "none": + message = ChatMessage(role=role, reasoning=reasoning, 
content=content) + + # handle when there are tools and tool choice is auto + elif ( + request.tools + and (request.tool_choice == "auto" or request.tool_choice is None) + and self.enable_auto_tools + and self.tool_parser + ): + # In the OpenAI API the finish_reason is "tools_called" + # if the tool choice is auto and the model produced a tool + # call. The same is not true for named function calls + auto_tools_called = tool_calls is not None and len(tool_calls) > 0 + if tool_calls: + message = ChatMessage( + role=role, + reasoning=reasoning, + content=content, + tool_calls=[ + ToolCall( + function=tc, + type="function", + ) + for tc in tool_calls + ], + ) + + else: + # FOR NOW make it a chat message; we will have to detect + # the type to make it later. + ret_content = content + + # try to use content return from tool parser first, + # tool parser may do some modify for the content. + if content and len(content) > 0: + ret_content = content + message = ChatMessage( + role=role, + reasoning=reasoning, + content=ret_content, + ) + + # undetermined case that is still important to handle + else: + logger.error( + "Error in chat_completion_full_generator - cannot determine" + " if tools should be extracted. Returning a standard chat " + "completion." + ) + message = ChatMessage(role=role, reasoning=reasoning, content=content) + # In OpenAI's API, when a tool is called, the finish_reason is: + # "tool_calls" for "auto" or "required" tool calls, + # and "stop" for named tool calls. 
+ is_finish_reason_tool_calls = auto_tools_called or ( + request.tool_choice + and request.tool_choice == "required" + and output.finish_reason == "stop" + ) + + choice_data = ChatCompletionResponseChoice( + index=output.index, + message=message, + logprobs=logprobs, + finish_reason="tool_calls" + if is_finish_reason_tool_calls + else output.finish_reason + if output.finish_reason + else "stop", + stop_reason=output.stop_reason, + token_ids=( + as_list(output.token_ids) if request.return_token_ids else None + ), + ) + + choices.append(choice_data) + + if request.echo: + last_msg_content: str | list[dict[str, str]] = "" + if ( + conversation + and "content" in conversation[-1] + and conversation[-1].get("role") == role + ): + last_msg_content = conversation[-1]["content"] or "" + if isinstance(last_msg_content, list): + last_msg_content = "\n".join(msg["text"] for msg in last_msg_content) + + for choice in choices: + full_message = last_msg_content + (choice.message.content or "") + choice.message.content = full_message + + assert final_res.prompt_token_ids is not None + num_prompt_tokens = len(final_res.prompt_token_ids) + if final_res.encoder_prompt_token_ids is not None: + num_prompt_tokens += len(final_res.encoder_prompt_token_ids) + num_generated_tokens = sum( + len(output.token_ids) for output in final_res.outputs + ) + usage = UsageInfo( + prompt_tokens=num_prompt_tokens, + completion_tokens=num_generated_tokens, + total_tokens=num_prompt_tokens + num_generated_tokens, + ) + if self.enable_prompt_tokens_details and final_res.num_cached_tokens: + usage.prompt_tokens_details = PromptTokenUsageInfo( + cached_tokens=final_res.num_cached_tokens + ) + + request_metadata.final_usage_info = usage + + response = ChatCompletionResponse( + id=request_id, + created=created_time, + model=model_name, + choices=choices, + usage=usage, + prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs), + prompt_token_ids=( + final_res.prompt_token_ids if 
request.return_token_ids else None + ), + kv_transfer_params=final_res.kv_transfer_params, + ) + + # Log complete response if output logging is enabled + if self.enable_log_outputs and self.request_logger: + for choice in choices: + output_text = "" + if choice.message.content: + output_text = choice.message.content + elif choice.message.tool_calls: + # For tool calls, log the function name and arguments + tool_call_descriptions = [] + for tc in choice.message.tool_calls: + if hasattr(tc.function, "name") and hasattr( + tc.function, "arguments" + ): + tool_call_descriptions.append( + f"{tc.function.name}({tc.function.arguments})" + ) + tool_calls_str = ", ".join(tool_call_descriptions) + output_text = f"[tool_calls: {tool_calls_str}]" + + if output_text: + # Get the corresponding output token IDs + output_token_ids = None + if choice.index < len(final_res.outputs): + output_token_ids = final_res.outputs[choice.index].token_ids + + self.request_logger.log_outputs( + request_id=request_id, + outputs=output_text, + output_token_ids=output_token_ids, + finish_reason=choice.finish_reason, + is_streaming=False, + delta=False, + ) + + return response + + def _get_top_logprobs( + self, + logprobs: dict[int, Logprob], + top_logprobs: int | None, + tokenizer: AnyTokenizer, + should_return_as_token_id: bool, + ) -> list[ChatCompletionLogProb]: + return [ + ChatCompletionLogProb( + token=( + token := self._get_decoded_token( + p[1], + p[0], + tokenizer, + return_as_token_id=should_return_as_token_id, + ) + ), + logprob=max(p[1].logprob, -9999.0), + bytes=list(token.encode("utf-8", errors="replace")), + ) + for i, p in enumerate(logprobs.items()) + if (top_logprobs and i < top_logprobs or top_logprobs == -1) + ] + + def _create_chat_logprobs( + self, + token_ids: GenericSequence[int], + top_logprobs: GenericSequence[dict[int, Logprob] | None], + tokenizer: AnyTokenizer, + num_output_top_logprobs: int | None = None, + return_as_token_id: bool | None = None, + ) -> 
ChatCompletionLogProbs: + """Create OpenAI-style logprobs.""" + logprobs_content: list[ChatCompletionLogProbsContent] = [] + + should_return_as_token_id = ( + return_as_token_id + if return_as_token_id is not None + else self.return_tokens_as_token_ids + ) + for i, token_id in enumerate(token_ids): + step_top_logprobs = top_logprobs[i] + if step_top_logprobs is None or step_top_logprobs.get(token_id) is None: + if should_return_as_token_id: + token = f"token_id:{token_id}" + else: + token = tokenizer.decode(token_id) + + logprobs_content.append( + ChatCompletionLogProbsContent( + token=token, + bytes=list(token.encode("utf-8", errors="replace")), + ) + ) + else: + step_token = step_top_logprobs[token_id] + step_decoded = step_token.decoded_token + + logprobs_content.append( + ChatCompletionLogProbsContent( + token=self._get_decoded_token( + step_token, + token_id, + tokenizer, + should_return_as_token_id, + ), + logprob=max(step_token.logprob, -9999.0), + bytes=( + None + if step_decoded is None + else list(step_decoded.encode("utf-8", errors="replace")) + ), + top_logprobs=self._get_top_logprobs( + step_top_logprobs, + num_output_top_logprobs, + tokenizer, + should_return_as_token_id, + ), + ) + ) + + return ChatCompletionLogProbs(content=logprobs_content) + + def _should_stream_with_auto_tool_parsing(self, request: ChatCompletionRequest): + """ + Utility function to check if streamed tokens should go through the tool + call parser that was configured. + + We only want to do this IF user-provided tools are set, a tool parser + is configured, "auto" tool choice is enabled, and the request's tool + choice field indicates that "auto" tool choice should be used. 
+ """ + return ( + request.tools + and self.tool_parser + and self.enable_auto_tools + and request.tool_choice in ["auto", None] + ) + + def _should_check_for_unstreamed_tool_arg_tokens( + self, + delta_message: DeltaMessage | None, + output: CompletionOutput, + ) -> bool: + """ + Check to see if we should check for unstreamed tool arguments tokens. + This is only applicable when auto tool parsing is enabled, the delta + is a tool call with arguments. + """ + + return bool( + # if there is a delta message that includes tool calls which + # include a function that has arguments + output.finish_reason is not None + and self.enable_auto_tools + and self.tool_parser + and delta_message + and delta_message.tool_calls + and delta_message.tool_calls[0] + and delta_message.tool_calls[0].function + and delta_message.tool_calls[0].function.arguments is not None + ) + + def _make_request_with_harmony( + self, + request: ChatCompletionRequest, + ): + messages: list[OpenAIMessage] = [] + + # Add system message. + # NOTE: In Chat Completion API, browsing is enabled by default + # if the model supports it. TODO: Support browsing. + assert not self.supports_browsing + assert not self.supports_code_interpreter + sys_msg = get_system_message( + reasoning_effort=request.reasoning_effort, + browser_description=None, + python_description=None, + with_custom_tools=request.tools is not None, + ) + messages.append(sys_msg) + + # Add developer message. + dev_msg = get_developer_message(tools=request.tools) + messages.append(dev_msg) + + # Add user message. + for chat_msg in request.messages: + messages.extend(parse_input_to_harmony_message(chat_msg)) + + # Render prompt token ids. 
+ prompt_token_ids = render_for_completion(messages) + engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids) + + # Add cache_salt if provided in the request + if request.cache_salt is not None: + engine_prompt["cache_salt"] = request.cache_salt + + return messages, [prompt_token_ids], [engine_prompt] diff --git a/entrypoints/openai/serving_classification.py b/entrypoints/openai/serving_classification.py new file mode 100644 index 0000000..167ee15 --- /dev/null +++ b/entrypoints/openai/serving_classification.py @@ -0,0 +1,235 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from http import HTTPStatus +from typing import cast + +import jinja2 +import numpy as np +from fastapi import Request + +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + ClassificationChatRequest, + ClassificationCompletionRequest, + ClassificationData, + ClassificationRequest, + ClassificationResponse, + ErrorResponse, + UsageInfo, +) +from vllm.entrypoints.openai.serving_engine import ( + ClassificationServeContext, + OpenAIServing, + ServeContext, +) +from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.entrypoints.renderer import RenderConfig +from vllm.logger import init_logger +from vllm.outputs import ClassificationOutput, PoolingRequestOutput +from vllm.pooling_params import PoolingParams + +logger = init_logger(__name__) + + +class ClassificationMixin(OpenAIServing): + chat_template: str | None + chat_template_content_format: ChatTemplateContentFormatOption + trust_request_chat_template: bool + + async def _preprocess( + self, + ctx: ServeContext, + ) -> ErrorResponse | None: + """ + Process classification inputs: tokenize text, resolve adapters, + and prepare model-specific inputs. 
+ """ + ctx = cast(ClassificationServeContext, ctx) + try: + ctx.tokenizer = await self.engine_client.get_tokenizer() + + request_obj = ctx.request + + if isinstance(request_obj, ClassificationChatRequest): + chat_request = request_obj + messages = chat_request.messages + trust_request_chat_template = getattr( + self, + "trust_request_chat_template", + False, + ) + ret = self._validate_chat_template( + request_chat_template=chat_request.chat_template, + chat_template_kwargs=chat_request.chat_template_kwargs, + trust_request_chat_template=trust_request_chat_template, + ) + if ret: + return ret + + ( + _, + _, + engine_prompts, + ) = await self._preprocess_chat( + cast(ChatCompletionRequest, chat_request), + ctx.tokenizer, + messages, + chat_template=( + chat_request.chat_template + or getattr(self, "chat_template", None) + ), + chat_template_content_format=cast( + ChatTemplateContentFormatOption, + getattr(self, "chat_template_content_format", "auto"), + ), + add_generation_prompt=False, + continue_final_message=False, + add_special_tokens=chat_request.add_special_tokens, + ) + ctx.engine_prompts = engine_prompts + + elif isinstance(request_obj, ClassificationCompletionRequest): + completion_request = request_obj + input_data = completion_request.input + if input_data in (None, ""): + return self.create_error_response( + "Input or messages must be provided", + status_code=HTTPStatus.BAD_REQUEST, + ) + if isinstance(input_data, list) and not input_data: + ctx.engine_prompts = [] + return None + + renderer = self._get_renderer(ctx.tokenizer) + prompt_input = cast(str | list[str], input_data) + ctx.engine_prompts = await renderer.render_prompt( + prompt_or_prompts=prompt_input, + config=self._build_render_config(completion_request), + ) + else: + return self.create_error_response( + "Invalid classification request type", + status_code=HTTPStatus.BAD_REQUEST, + ) + + return None + + except (ValueError, TypeError, jinja2.TemplateError) as e: + logger.exception("Error in 
class ServingClassification(ClassificationMixin):
    """Serving entry point for classification requests.

    Keeps the chat-template settings supplied at construction time on the
    instance and hands every request to the inherited ``handle`` method.
    """

    request_id_prefix = "classify"

    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        chat_template: str | None = None,
        chat_template_content_format: ChatTemplateContentFormatOption = "auto",
        trust_request_chat_template: bool = False,
        log_error_stack: bool = False,
    ) -> None:
        """Initialise the parent class and record the chat-template options.

        Args:
            engine_client: Forwarded to the parent constructor.
            models: Forwarded to the parent constructor.
            request_logger: Forwarded to the parent constructor.
            chat_template: Stored as ``self.chat_template``.
            chat_template_content_format: Stored as
                ``self.chat_template_content_format``.
            trust_request_chat_template: Stored as
                ``self.trust_request_chat_template``.
            log_error_stack: Forwarded to the parent constructor.
        """
        parent_kwargs = {
            "engine_client": engine_client,
            "models": models,
            "request_logger": request_logger,
            "log_error_stack": log_error_stack,
        }
        super().__init__(**parent_kwargs)

        self.trust_request_chat_template = trust_request_chat_template
        self.chat_template_content_format = chat_template_content_format
        self.chat_template = chat_template

    async def create_classify(
        self,
        request: ClassificationRequest,
        raw_request: Request,
    ) -> ClassificationResponse | ErrorResponse:
        """Wrap the request in a serve context and run it through ``handle``.

        The request id is ``request_id_prefix`` joined by ``-`` to the value
        returned by ``self._base_request_id(raw_request)``.
        """
        served_model = self.models.model_name()
        base_id = self._base_request_id(raw_request)

        serve_ctx = ClassificationServeContext(
            request=request,
            raw_request=raw_request,
            model_name=served_model,
            request_id=f"{self.request_id_prefix}-{base_id}",
        )

        outcome = await super().handle(serve_ctx)  # type: ignore
        return outcome

    def _create_pooling_params(
        self,
        ctx: ClassificationServeContext,
    ) -> PoolingParams | ErrorResponse:
        """Build pooling params via the parent, then verify them for "classify".

        Returns:
            The parent's ``ErrorResponse`` unchanged if it produced one, a new
            error response if ``verify`` raises ``ValueError``, otherwise the
            verified pooling params.
        """
        params_or_error = super()._create_pooling_params(ctx)

        # Only genuine params are verified; a parent error passes straight through.
        if not isinstance(params_or_error, ErrorResponse):
            try:
                params_or_error.verify("classify", self.model_config)
            except ValueError as exc:
                return self.create_error_response(str(exc))

        return params_or_error
async def create_completion(
    self,
    request: CompletionRequest,
    raw_request: Request | None = None,
) -> AsyncGenerator[str, None] | CompletionResponse | ErrorResponse:
    """Completion API similar to OpenAI's API.

    See https://platform.openai.com/docs/api-reference/completions/create
    for the API specification. This API mimics the OpenAI Completion API.

    NOTE: Currently we do not support the following feature:
        - suffix (the language models we currently support do not support
        suffix)

    Args:
        request: The parsed completion request.
        raw_request: The underlying HTTP request, if any. When present, the
            request metadata is attached to ``raw_request.state`` and its
            headers are used to look up trace headers.

    Returns:
        - An ``ErrorResponse`` when the model check fails, an unsupported
          option combination is requested, prompt preprocessing fails, a
          ``ValueError`` is raised while scheduling or collecting results,
          or the task is cancelled while collecting results.
        - An async generator of SSE ``data:`` strings when the request is
          streamed (including the single-event fallback used when
          ``request.stream`` is set but real streaming is not possible).
        - A ``CompletionResponse`` otherwise.

    Raises:
        Exception: ``self.engine_client.dead_error`` if the engine client
            reports ``errored``.
        NotImplementedError: If an engine prompt yields neither token ids
            nor prompt embeddings.
    """
    error_check_ret = await self._check_model(request)
    if error_check_ret is not None:
        return error_check_ret

    # If the engine is dead, raise the engine's DEAD_ERROR.
    # This is required for the streaming case, where we return a
    # success status before we actually start generating text :).
    if self.engine_client.errored:
        raise self.engine_client.dead_error

    # Return error for unsupported features.
    if request.suffix is not None:
        return self.create_error_response("suffix is not currently supported")

    if request.echo and request.prompt_embeds is not None:
        return self.create_error_response("Echo is unsupported with prompt embeds.")

    if request.prompt_logprobs is not None and request.prompt_embeds is not None:
        return self.create_error_response(
            "prompt_logprobs is not compatible with prompt embeds."
        )

    request_id = f"cmpl-{self._base_request_id(raw_request, request.request_id)}"
    created_time = int(time.time())

    request_metadata = RequestResponseMetadata(request_id=request_id)
    if raw_request:
        raw_request.state.request_metadata = request_metadata

    try:
        lora_request = self._maybe_get_adapters(request)

        # The tokenizer stays None when tokenizer init is skipped; it is
        # still passed on to the renderer and the response builders below.
        if self.model_config.skip_tokenizer_init:
            tokenizer = None
        else:
            tokenizer = await self.engine_client.get_tokenizer()
        renderer = self._get_renderer(tokenizer)

        engine_prompts = await renderer.render_prompt_and_embeds(
            prompt_or_prompts=request.prompt,
            prompt_embeds=request.prompt_embeds,
            config=self._build_render_config(request),
        )
    except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
        # All four preprocessing failures are reported identically, so they
        # share a single handler.
        logger.exception("Error in preprocessing prompt inputs")
        return self.create_error_response(str(e))

    # Extract data_parallel_rank from header (router can inject it)
    data_parallel_rank = self._get_data_parallel_rank(raw_request)

    # Schedule the request and get the result generator.
    generators: list[AsyncGenerator[RequestOutput, None]] = []
    try:
        # Loop-invariant setup, done once instead of once per prompt.
        if self.default_sampling_params is None:
            self.default_sampling_params = {}

        trace_headers = (
            None
            if raw_request is None
            else await self._get_trace_headers(raw_request.headers)
        )

        for i, engine_prompt in enumerate(engine_prompts):
            prompt_text, prompt_token_ids, prompt_embeds = (
                self._get_prompt_components(engine_prompt)
            )

            # Input length comes from token ids when present, otherwise
            # from the prompt embeddings.
            input_length = None
            if prompt_token_ids is not None:
                input_length = len(prompt_token_ids)
            elif prompt_embeds is not None:
                input_length = len(prompt_embeds)
            else:
                raise NotImplementedError

            max_tokens = get_max_tokens(
                max_model_len=self.max_model_len,
                request=request,
                input_length=input_length,
                default_sampling_params=self.default_sampling_params,
            )

            sampling_params: SamplingParams | BeamSearchParams
            if request.use_beam_search:
                sampling_params = request.to_beam_search_params(
                    max_tokens, self.default_sampling_params
                )
            else:
                sampling_params = request.to_sampling_params(
                    max_tokens,
                    self.model_config.logits_processor_pattern,
                    self.default_sampling_params,
                )
                validate_logits_processors_parameters(
                    self.logits_processors,
                    sampling_params,
                )

            # Each prompt of the batch gets its own sub-request id.
            request_id_item = f"{request_id}-{i}"

            self._log_inputs(
                request_id_item,
                engine_prompt,
                params=sampling_params,
                lora_request=lora_request,
            )

            # Mypy inconsistently requires this second cast in different
            # environments. It shouldn't be necessary (redundant from above)
            # but pre-commit in CI fails without it.
            engine_prompt = cast(EmbedsPrompt | TokensPrompt, engine_prompt)
            if isinstance(sampling_params, BeamSearchParams):
                generator = self.beam_search(
                    prompt=engine_prompt,
                    request_id=request_id,
                    params=sampling_params,
                    lora_request=lora_request,
                )
            else:
                engine_request, tokenization_kwargs = await self._process_inputs(
                    request_id_item,
                    engine_prompt,
                    sampling_params,
                    lora_request=lora_request,
                    trace_headers=trace_headers,
                    priority=request.priority,
                )

                generator = self.engine_client.generate(
                    engine_request,
                    sampling_params,
                    request_id_item,
                    lora_request=lora_request,
                    trace_headers=trace_headers,
                    priority=request.priority,
                    prompt_text=prompt_text,
                    tokenization_kwargs=tokenization_kwargs,
                    data_parallel_rank=data_parallel_rank,
                )

            generators.append(generator)
    except ValueError as e:
        # TODO: Use a vllm-specific Validation Error
        return self.create_error_response(str(e))

    result_generator = merge_async_iterators(*generators)

    model_name = self.models.model_name(lora_request)
    num_prompts = len(engine_prompts)

    # Similar to the OpenAI API, when n != best_of, we do not stream the
    # results. Noting that best_of is only supported in V0. In addition,
    # we do not stream the results when use beam search.
    stream = (
        request.stream
        and (request.best_of is None or request.n == request.best_of)
        and not request.use_beam_search
    )

    # Streaming response
    if stream:
        return self.completion_stream_generator(
            request,
            engine_prompts,
            result_generator,
            request_id,
            created_time,
            model_name,
            num_prompts=num_prompts,
            tokenizer=tokenizer,
            request_metadata=request_metadata,
        )

    # Non-streaming response
    final_res_batch: list[RequestOutput | None] = [None] * num_prompts
    try:
        # Each merged result carries its prompt index; later results for the
        # same prompt overwrite earlier ones, so the last one is kept.
        async for i, res in result_generator:
            final_res_batch[i] = res

        for i, final_res in enumerate(final_res_batch):
            assert final_res is not None

            # The output should contain the input text
            # We did not pass it into vLLM engine to avoid being redundant
            # with the inputs token IDs
            if final_res.prompt is None:
                engine_prompt = engine_prompts[i]
                final_res.prompt = (
                    None
                    if is_embeds_prompt(engine_prompt)
                    else engine_prompt.get("prompt")
                )

        final_res_batch_checked = cast(list[RequestOutput], final_res_batch)

        response = self.request_output_to_completion_response(
            final_res_batch_checked,
            request,
            request_id,
            created_time,
            model_name,
            tokenizer,
            request_metadata,
        )
    except asyncio.CancelledError:
        return self.create_error_response("Client disconnected")
    except ValueError as e:
        # TODO: Use a vllm-specific Validation Error
        return self.create_error_response(str(e))

    # When user requests streaming but we don't stream, we still need to
    # return a streaming response with a single event.
    if request.stream:
        response_json = response.model_dump_json()

        async def fake_stream_generator() -> AsyncGenerator[str, None]:
            yield f"data: {response_json}\n\n"
            yield "data: [DONE]\n\n"

        return fake_stream_generator()

    return response
echo implementation. + prompt_token_ids_to_return: list[int] | None = None + + assert request.max_tokens is not None + if request.echo and not has_echoed[i]: + assert prompt_token_ids is not None + if request.return_token_ids: + prompt_text = "" + assert prompt_text is not None + if request.max_tokens == 0: + # only return the prompt + delta_text = prompt_text + delta_token_ids = prompt_token_ids + out_logprobs = prompt_logprobs + else: + # echo the prompt and first token + delta_text = prompt_text + output.text + delta_token_ids = [ + *prompt_token_ids, + *output.token_ids, + ] + out_logprobs = [ + *(prompt_logprobs or []), + *(output.logprobs or []), + ] + prompt_token_ids_to_return = prompt_token_ids + has_echoed[i] = True + else: + # return just the delta + delta_text = output.text + delta_token_ids = output.token_ids + out_logprobs = output.logprobs + + # has_echoed[i] is reused here to indicate whether + # we have already returned the prompt token IDs. + if not has_echoed[i] and request.return_token_ids: + prompt_token_ids_to_return = prompt_token_ids + has_echoed[i] = True + + if ( + not delta_text + and not delta_token_ids + and not previous_num_tokens[i] + ): + # Chunked prefill case, don't return empty chunks + continue + + if request.logprobs is not None: + assert out_logprobs is not None, "Did not output logprobs" + logprobs = self._create_completion_logprobs( + token_ids=delta_token_ids, + top_logprobs=out_logprobs, + num_output_top_logprobs=request.logprobs, + tokenizer=tokenizer, + initial_text_offset=previous_text_lens[i], + return_as_token_id=request.return_tokens_as_token_ids, + ) + else: + logprobs = None + + previous_text_lens[i] += len(output.text) + previous_num_tokens[i] += len(output.token_ids) + finish_reason = output.finish_reason + stop_reason = output.stop_reason + + chunk = CompletionStreamResponse( + id=request_id, + created=created_time, + model=model_name, + choices=[ + CompletionResponseStreamChoice( + index=i, + text=delta_text, + 
logprobs=logprobs, + finish_reason=finish_reason, + stop_reason=stop_reason, + prompt_token_ids=prompt_token_ids_to_return, + token_ids=( + as_list(output.token_ids) + if request.return_token_ids + else None + ), + ) + ], + ) + if include_continuous_usage: + prompt_tokens = num_prompt_tokens[prompt_idx] + completion_tokens = previous_num_tokens[i] + chunk.usage = UsageInfo( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ) + + response_json = chunk.model_dump_json(exclude_unset=False) + yield f"data: {response_json}\n\n" + + total_prompt_tokens = sum(num_prompt_tokens) + total_completion_tokens = sum(previous_num_tokens) + final_usage_info = UsageInfo( + prompt_tokens=total_prompt_tokens, + completion_tokens=total_completion_tokens, + total_tokens=total_prompt_tokens + total_completion_tokens, + ) + + if self.enable_prompt_tokens_details and num_cached_tokens: + final_usage_info.prompt_tokens_details = PromptTokenUsageInfo( + cached_tokens=num_cached_tokens + ) + + if include_usage: + final_usage_chunk = CompletionStreamResponse( + id=request_id, + created=created_time, + model=model_name, + choices=[], + usage=final_usage_info, + ) + final_usage_data = final_usage_chunk.model_dump_json( + exclude_unset=False, exclude_none=True + ) + yield f"data: {final_usage_data}\n\n" + + # report to FastAPI middleware aggregate usage across all choices + request_metadata.final_usage_info = final_usage_info + + except Exception as e: + # TODO: Use a vllm-specific Validation Error + data = self.create_streaming_error_response(str(e)) + yield f"data: {data}\n\n" + yield "data: [DONE]\n\n" + + def request_output_to_completion_response( + self, + final_res_batch: list[RequestOutput], + request: CompletionRequest, + request_id: str, + created_time: int, + model_name: str, + tokenizer: AnyTokenizer, + request_metadata: RequestResponseMetadata, + ) -> CompletionResponse: + choices: list[CompletionResponseChoice] 
= [] + num_prompt_tokens = 0 + num_generated_tokens = 0 + kv_transfer_params = None + last_final_res = None + for final_res in final_res_batch: + last_final_res = final_res + prompt_token_ids = final_res.prompt_token_ids + assert prompt_token_ids is not None + prompt_logprobs = clamp_prompt_logprobs(final_res.prompt_logprobs) + prompt_text = final_res.prompt + + token_ids: GenericSequence[int] + out_logprobs: GenericSequence[dict[int, Logprob] | None] | None + + for output in final_res.outputs: + assert request.max_tokens is not None + if request.echo: + if request.return_token_ids: + prompt_text = "" + assert prompt_text is not None + if request.max_tokens == 0: + token_ids = prompt_token_ids + out_logprobs = prompt_logprobs + output_text = prompt_text + else: + token_ids = [*prompt_token_ids, *output.token_ids] + + if request.logprobs is None: + out_logprobs = None + else: + assert prompt_logprobs is not None + assert output.logprobs is not None + out_logprobs = [ + *prompt_logprobs, + *output.logprobs, + ] + + output_text = prompt_text + output.text + else: + token_ids = output.token_ids + out_logprobs = output.logprobs + output_text = output.text + + if request.logprobs is not None: + assert out_logprobs is not None, "Did not output logprobs" + logprobs = self._create_completion_logprobs( + token_ids=token_ids, + top_logprobs=out_logprobs, + tokenizer=tokenizer, + num_output_top_logprobs=request.logprobs, + return_as_token_id=request.return_tokens_as_token_ids, + ) + else: + logprobs = None + + choice_data = CompletionResponseChoice( + index=len(choices), + text=output_text, + logprobs=logprobs, + finish_reason=output.finish_reason, + stop_reason=output.stop_reason, + prompt_logprobs=final_res.prompt_logprobs, + prompt_token_ids=( + prompt_token_ids if request.return_token_ids else None + ), + token_ids=( + as_list(output.token_ids) if request.return_token_ids else None + ), + ) + choices.append(choice_data) + + num_generated_tokens += len(output.token_ids) + + 
def _create_completion_logprobs(
    self,
    token_ids: GenericSequence[int],
    top_logprobs: GenericSequence[dict[int, Logprob] | None],
    num_output_top_logprobs: int,
    tokenizer: AnyTokenizer,
    initial_text_offset: int = 0,
    return_as_token_id: bool | None = None,
) -> CompletionLogProbs:
    """Assemble the Completion-API logprobs payload for a run of tokens.

    Args:
        token_ids: Token ids to report, in order.
        top_logprobs: Per-position mapping of token id to ``Logprob``, or
            ``None`` for a position without logprob data. Indexed in step
            with ``token_ids``.
        num_output_top_logprobs: Entries at enumeration index up to and
            including this value are kept from each position's mapping.
        tokenizer: Used to decode token ids to text.
        initial_text_offset: Text offset assigned to the first token.
        return_as_token_id: When not ``None``, overrides
            ``self.return_tokens_as_token_ids`` for this call.

    Returns:
        A ``CompletionLogProbs`` with parallel ``text_offset``,
        ``token_logprobs``, ``tokens`` and ``top_logprobs`` lists.
    """
    # An explicit per-call flag wins; ``None`` defers to the instance default.
    if return_as_token_id is None:
        as_token_id = self.return_tokens_as_token_ids
    else:
        as_token_id = return_as_token_id

    tokens: list[str] = []
    token_logprobs: list[float | None] = []
    top_logprob_maps: list[dict[str, float] | None] = []
    text_offsets: list[int] = []

    # Offset of the token about to be emitted; grows by each token's length.
    next_offset = initial_text_offset

    for position, token_id in enumerate(token_ids):
        candidates = top_logprobs[position]

        chosen_logprob: float | None
        top_map: dict[str, float] | None
        if candidates is None:
            # No logprob data for this position: report the token only.
            token = tokenizer.decode(token_id)
            if as_token_id:
                token = f"token_id:{token_id}"
            chosen_logprob = None
            top_map = None
        else:
            chosen = candidates[token_id]
            token = self._get_decoded_token(
                chosen,
                token_id,
                tokenizer,
                return_as_token_id=as_token_id,
            )
            # Convert float("-inf") to the JSON-serializable float that
            # OpenAI uses.
            chosen_logprob = max(chosen.logprob, -9999.0)

            # makes sure to add the top num_output_top_logprobs + 1
            # logprobs, as defined in the openai API
            # (cf. https://github.com/openai/openai-openapi/blob/
            # 893ba52242dbd5387a97b96444ee1c742cfce9bd/openapi.yaml#L7153)
            top_map = {}
            for rank, (cand_id, cand) in enumerate(candidates.items()):
                if rank > num_output_top_logprobs:
                    continue
                cand_token = self._get_decoded_token(
                    cand,
                    cand_id,
                    tokenizer,
                    return_as_token_id=as_token_id,
                )
                top_map[cand_token] = max(cand.logprob, -9999.0)

        tokens.append(token)
        token_logprobs.append(chosen_logprob)
        top_logprob_maps.append(top_map)

        text_offsets.append(next_offset)
        next_offset += len(token)

    return CompletionLogProbs(
        text_offset=text_offsets,
        token_logprobs=token_logprobs,
        tokens=tokens,
        top_logprobs=top_logprob_maps,
    )
@override
async def _preprocess(
    self,
    ctx: ServeContext,
) -> ErrorResponse | None:
    """Render the request's input into ``ctx.engine_prompts``.

    Chat requests go through ``_preprocess_chat``; every other request is
    rendered with the renderer built from the engine tokenizer. The LoRA
    request resolved from ``ctx.request`` is stored on ``ctx.lora_request``.

    Args:
        ctx: The serve context; treated as an ``EmbeddingServeContext``.

    Returns:
        ``None`` on success, or an error response when preprocessing raises
        ``ValueError``, ``TypeError`` or ``jinja2.TemplateError``.
    """
    # Imported at function scope so this block does not depend on a
    # file-level ``import jinja2``.
    import jinja2

    ctx = cast(EmbeddingServeContext, ctx)
    try:
        ctx.lora_request = self._maybe_get_adapters(ctx.request)

        tokenizer = await self.engine_client.get_tokenizer()
        renderer = self._get_renderer(tokenizer)

        if isinstance(ctx.request, EmbeddingChatRequest):
            (
                _,
                _,
                ctx.engine_prompts,
            ) = await self._preprocess_chat(
                ctx.request,
                tokenizer,
                ctx.request.messages,
                # A template on the request takes precedence over the
                # context-level one.
                chat_template=ctx.request.chat_template or ctx.chat_template,
                chat_template_content_format=ctx.chat_template_content_format,
                add_generation_prompt=ctx.request.add_generation_prompt,
                continue_final_message=False,
                add_special_tokens=ctx.request.add_special_tokens,
            )
        else:
            ctx.engine_prompts = await renderer.render_prompt(
                prompt_or_prompts=ctx.request.input,
                config=self._build_render_config(ctx.request),
            )
        return None
    except (ValueError, TypeError, jinja2.TemplateError) as e:
        # Template errors are handled like the other preprocessing failures,
        # matching the classification and completion handlers.
        logger.exception("Error in preprocessing prompt inputs")
        return self.create_error_response(str(e))
async def _process_chunked_request(
    self,
    ctx: EmbeddingServeContext,
    original_prompt: TextTokensPrompt,
    pooling_params,
    trace_headers,
    prompt_idx: int,
) -> list[AsyncGenerator[PoolingRequestOutput, None]]:
    """Fan one tokenized prompt out into one ``encode`` call per chunk.

    The prompt's token ids are split with ``chunk_list`` using the value
    returned by ``_get_max_position_embeddings`` as the chunk size. Each
    chunk is logged and submitted under the request id
    ``<ctx.request_id>-prompt-<prompt_idx>-chunk-<chunk index>``.

    Args:
        ctx: Serve context supplying the request id, LoRA request and request.
        original_prompt: Prompt whose ``prompt_token_ids`` are chunked.
        pooling_params: Passed unchanged to logging and to ``encode``.
        trace_headers: Passed unchanged to ``encode``.
        prompt_idx: Index of this prompt, embedded in the chunk request ids.

    Returns:
        The generators returned by ``encode``, in chunk order.
    """
    all_token_ids = original_prompt["prompt_token_ids"]
    chunk_size = self._get_max_position_embeddings()

    chunk_generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
    for chunk_no, chunk_ids in enumerate(chunk_list(all_token_ids, chunk_size)):
        chunk_id = f"{ctx.request_id}-prompt-{prompt_idx}-chunk-{chunk_no}"

        # The logged prompt carries the chunk's token ids with empty text.
        self._log_inputs(
            chunk_id,
            TextTokensPrompt(prompt="", prompt_token_ids=chunk_ids),
            params=pooling_params,
            lora_request=ctx.lora_request,
        )

        chunk_generators.append(
            self.engine_client.encode(
                EngineTokensPrompt(prompt_token_ids=chunk_ids),
                pooling_params,
                chunk_id,
                lora_request=ctx.lora_request,
                trace_headers=trace_headers,
                # Requests without a ``priority`` attribute default to 0.
                priority=getattr(ctx.request, "priority", 0),
            )
        )

    return chunk_generators
" + "However, you requested {token_num} tokens in the input for " + "embedding generation. Please reduce the length of the input." + ) + + chunked_processing_error_msg = ( + "This model's {length_type} is {max_length_value} tokens. " + "However, you requested {token_num} tokens in the input for " + "embedding generation. Please reduce the length of the input " + "or enable chunked processing." + ) + + # Check if input exceeds max length + if token_num > max_length_value: + raise ValueError( + validation_error_msg.format( + length_type=length_type, + max_length_value=max_length_value, + token_num=token_num, + ) + ) + + # Check for chunked processing + # when exceeding max_position_embeddings + if token_num > max_pos_embeddings: + if enable_chunked: + # Allow long inputs when chunked processing is enabled + logger.info( + "Input length %s exceeds max_position_embeddings " + "%s, will use chunked processing", + token_num, + max_pos_embeddings, + ) + else: + raise ValueError( + chunked_processing_error_msg.format( + length_type="maximum position embeddings length", + max_length_value=max_pos_embeddings, + token_num=token_num, + ) + ) + + return TextTokensPrompt(prompt=input_text, prompt_token_ids=input_ids) + + # For other request types, use the parent's implementation + return super()._validate_input(request, input_ids, input_text) + + def _is_text_tokens_prompt(self, prompt) -> bool: + """Check if a prompt is a TextTokensPrompt (has prompt_token_ids).""" + return ( + isinstance(prompt, dict) + and "prompt_token_ids" in prompt + and "prompt_embeds" not in prompt + ) + + async def _create_single_prompt_generator( + self, + ctx: EmbeddingServeContext, + engine_prompt: EngineTokensPrompt, + pooling_params: PoolingParams, + trace_headers: Mapping[str, str] | None, + prompt_index: int, + ) -> AsyncGenerator[RequestOutput | PoolingRequestOutput, None]: + """Create a generator for a single prompt using standard processing.""" + request_id_item = 
f"{ctx.request_id}-{prompt_index}" + + self._log_inputs( + request_id_item, + engine_prompt, + params=pooling_params, + lora_request=ctx.lora_request, + ) + + # Return the original generator without wrapping + return self.engine_client.encode( + engine_prompt, + pooling_params, + request_id_item, + lora_request=ctx.lora_request, + trace_headers=trace_headers, + priority=getattr(ctx.request, "priority", 0), + ) + + @override + async def _prepare_generators( + self, + ctx: ServeContext, + ) -> ErrorResponse | None: + """Override to support chunked processing.""" + ctx = cast(EmbeddingServeContext, ctx) + + # Check if we should use chunked processing + use_chunked = self._should_use_chunked_processing(ctx.request) + + # If no chunked processing needed, delegate to parent class + if not use_chunked: + return await super()._prepare_generators(ctx) + + # Custom logic for chunked processing + generators: list[ + AsyncGenerator[RequestOutput | PoolingRequestOutput, None] + ] = [] + + try: + trace_headers = ( + None + if ctx.raw_request is None + else await self._get_trace_headers(ctx.raw_request.headers) + ) + + pooling_params = self._create_pooling_params(ctx) + if isinstance(pooling_params, ErrorResponse): + return pooling_params + + # Verify and set the task for pooling params + try: + pooling_params.verify("embed", self.model_config) + except ValueError as e: + return self.create_error_response(str(e)) + + if ctx.engine_prompts is None: + return self.create_error_response("Engine prompts not available") + + max_pos_embeddings = self._get_max_position_embeddings() + + for i, engine_prompt in enumerate(ctx.engine_prompts): + # Check if this specific prompt needs chunked processing + if self._is_text_tokens_prompt(engine_prompt): + # Cast to TextTokensPrompt since we've verified + # prompt_token_ids + text_tokens_prompt = cast(TextTokensPrompt, engine_prompt) + if len(text_tokens_prompt["prompt_token_ids"]) > max_pos_embeddings: + # Use chunked processing for this prompt 
@override
async def _collect_batch(
    self,
    ctx: ServeContext,
) -> ErrorResponse | None:
    """Collect batch results, mean-pooling the chunks of long prompts.

    Requests that do not use chunked processing are delegated to the
    parent implementation. Otherwise each chunk result is folded into a
    running token-weighted sum as it arrives, so individual chunk
    embeddings are not kept around, while results of unchunked prompts
    are stored unchanged. On success ``ctx.final_res_batch`` holds one
    result per engine prompt, in prompt order.

    Returns:
        ``None`` on success, otherwise an ``ErrorResponse`` describing
        what went wrong (any exception is converted into one).
    """
    ctx = cast(EmbeddingServeContext, ctx)

    def locate(request_id: str, fallback_idx: int) -> tuple[bool, int, str]:
        """Return ``(is_chunk, prompt_idx, per_prompt_request_id)``."""
        # The base request ID can come from the client and may itself
        # contain "-chunk-" or "prompt". Strip it first and read the
        # index from the right, so only the suffix this class appended
        # ("-{i}" or "-prompt-{i}-chunk-{j}") is interpreted.
        suffix = request_id.removeprefix(ctx.request_id)
        is_chunk = "-chunk-" in suffix
        if is_chunk:
            per_prompt_id = request_id.rpartition("-chunk-")[0]
            index_text = per_prompt_id.rpartition("-prompt-")[2]
        else:
            per_prompt_id = request_id
            index_text = request_id.rpartition("-")[2]
        try:
            prompt_idx = int(index_text)
        except ValueError:
            # Fallback: use the position in the merged generator
            prompt_idx = fallback_idx
        return is_chunk, prompt_idx, per_prompt_id

    try:
        if ctx.engine_prompts is None:
            return self.create_error_response("Engine prompts not available")

        # Check if we used chunked processing
        use_chunked = self._should_use_chunked_processing(ctx.request)

        if not use_chunked:
            return await super()._collect_batch(ctx=ctx)

        if ctx.result_generator is None:
            return self.create_error_response("Result generator not available")

        # Online aggregation state, keyed by prompt index
        prompt_aggregators: dict[int, dict[str, Any]] = {}
        short_prompts_results: dict[int, PoolingRequestOutput] = {}

        async for result_idx, result in ctx.result_generator:
            is_chunk, prompt_idx, per_prompt_id = locate(
                result.request_id, result_idx
            )

            if not is_chunk:
                short_prompts_results[prompt_idx] = cast(
                    PoolingRequestOutput, result
                )
                continue

            # Initialize aggregator for this prompt if needed
            if prompt_idx not in prompt_aggregators:
                prompt_aggregators[prompt_idx] = {
                    "weighted_sum": None,
                    "total_weight": 0,
                    "chunk_count": 0,
                    "request_id": per_prompt_id,
                }

            aggregator = prompt_aggregators[prompt_idx]

            # MEAN pooling with online weighted averaging requires a
            # PoolingRequestOutput
            if not isinstance(result, PoolingRequestOutput):
                return self.create_error_response(
                    f"Expected PoolingRequestOutput for "
                    f"chunked embedding, got "
                    f"{type(result).__name__}"
                )

            # Handle both PoolingOutput and EmbeddingOutput types
            if hasattr(result.outputs, "data"):
                # PoolingOutput case
                embedding_data = result.outputs.data
            elif hasattr(result.outputs, "embedding"):
                # EmbeddingOutput case - converted to a tensor below
                embedding_data = result.outputs.embedding
            else:
                return self.create_error_response(
                    f"Unsupported output type: {type(result.outputs).__name__}"
                )

            if not isinstance(embedding_data, torch.Tensor):
                embedding_data = torch.tensor(embedding_data, dtype=torch.float32)

            if result.prompt_token_ids is None:
                return self.create_error_response(
                    "prompt_token_ids cannot be None for chunked processing"
                )
            # Each chunk contributes in proportion to its token count
            weight = len(result.prompt_token_ids)

            weighted_embedding = embedding_data.to(dtype=torch.float32) * weight

            if aggregator["weighted_sum"] is None:
                # First chunk
                aggregator["weighted_sum"] = weighted_embedding
            else:
                # Accumulate
                aggregator["weighted_sum"] += weighted_embedding

            aggregator["total_weight"] += weight
            aggregator["chunk_count"] += 1

        # Finalize aggregated results
        final_res_batch: list[PoolingRequestOutput | EmbeddingRequestOutput] = []
        num_prompts = len(ctx.engine_prompts)

        for prompt_idx in range(num_prompts):
            if prompt_idx in prompt_aggregators:
                # Finalize MEAN aggregation for this chunked prompt
                aggregator = prompt_aggregators[prompt_idx]

                weighted_sum = aggregator["weighted_sum"]
                total_weight = aggregator["total_weight"]

                if (
                    weighted_sum is not None
                    and isinstance(weighted_sum, torch.Tensor)
                    and isinstance(total_weight, (int, float))
                    and total_weight > 0
                ):
                    # Compute final mean embedding
                    final_embedding = weighted_sum / total_weight

                    # Wrap the aggregate in a PoolingRequestOutput
                    pooling_output_data = PoolingOutput(data=final_embedding)

                    # Report the original, unchunked token IDs
                    original_prompt = ctx.engine_prompts[prompt_idx]
                    if not self._is_text_tokens_prompt(original_prompt):
                        return self.create_error_response(
                            f"Chunked prompt {prompt_idx} is not a TextTokensPrompt"
                        )

                    original_token_ids = cast(TextTokensPrompt, original_prompt)[
                        "prompt_token_ids"
                    ]

                    pooling_request_output = PoolingRequestOutput(
                        request_id=aggregator["request_id"],
                        prompt_token_ids=original_token_ids,
                        outputs=pooling_output_data,
                        num_cached_tokens=0,
                        finished=True,
                    )

                    final_res_batch.append(pooling_request_output)
                else:
                    return self.create_error_response(
                        f"Failed to aggregate chunks for prompt {prompt_idx}"
                    )
            elif prompt_idx in short_prompts_results:
                final_res_batch.append(
                    cast(PoolingRequestOutput, short_prompts_results[prompt_idx])
                )
            else:
                return self.create_error_response(
                    f"Result not found for prompt {prompt_idx}"
                )

        ctx.final_res_batch = cast(
            list[RequestOutput | PoolingRequestOutput], final_res_batch
        )

        return None

    except Exception as e:
        return self.create_error_response(str(e))
async def create_embedding(
    self,
    request: EmbeddingRequest,
    raw_request: Request | None = None,
) -> EmbeddingResponse | ErrorResponse:
    """
    Serve an embedding request in the style of OpenAI's Embedding API.

    See https://platform.openai.com/docs/api-reference/embeddings/create
    for the API specification this endpoint mimics.
    """
    served_model_name = self.models.model_name()

    # Request IDs are "<prefix>-<base id>"
    base_request_id = self._base_request_id(raw_request, request.request_id)
    full_request_id = f"{self.request_id_prefix}-{base_request_id}"

    serve_ctx = EmbeddingServeContext(
        request=request,
        raw_request=raw_request,
        model_name=served_model_name,
        request_id=full_request_id,
        chat_template=self.chat_template,
        chat_template_content_format=self.chat_template_content_format,
    )

    return await super().handle(serve_ctx)  # type: ignore
request_chat_template=ctx.request.chat_template, + chat_template_kwargs=ctx.request.chat_template_kwargs, + trust_request_chat_template=self.trust_request_chat_template, + ) + if error_check_ret is not None: + return error_check_ret + return await super()._preprocess(ctx) diff --git a/entrypoints/openai/serving_engine.py b/entrypoints/openai/serving_engine.py new file mode 100644 index 0000000..c50b0c4 --- /dev/null +++ b/entrypoints/openai/serving_engine.py @@ -0,0 +1,1433 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import asyncio +import json +import sys +import time +import traceback +from collections.abc import AsyncGenerator, Callable, Iterable, Mapping, Sequence +from concurrent.futures import ThreadPoolExecutor +from http import HTTPStatus +from typing import Any, ClassVar, Generic, TypeAlias, TypeVar + +import torch +from fastapi import Request +from pydantic import BaseModel, ConfigDict, Field, TypeAdapter +from starlette.datastructures import Headers +from typing_extensions import TypeIs + +if sys.version_info >= (3, 12): + from typing import TypedDict +else: + from typing_extensions import TypedDict + +from openai.types.responses import ( + ToolChoiceFunction, +) + +import vllm.envs as envs +from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.chat_utils import ( + ChatCompletionMessageParam, + ChatTemplateContentFormatOption, + ConversationMessage, + apply_hf_chat_template, + apply_mistral_chat_template, + parse_chat_messages_futures, + resolve_chat_template_content_format, +) +from vllm.entrypoints.context import ConversationContext +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.protocol import ( + ChatCompletionNamedToolChoiceParam, + ChatCompletionRequest, + ChatCompletionResponse, + ClassificationChatRequest, + ClassificationCompletionRequest, + 
ClassificationRequest, + ClassificationResponse, + CompletionRequest, + CompletionResponse, + DetokenizeRequest, + EmbeddingChatRequest, + EmbeddingCompletionRequest, + EmbeddingRequest, + EmbeddingResponse, + ErrorInfo, + ErrorResponse, + FunctionCall, + FunctionDefinition, + GenerateRequest, + GenerateResponse, + IOProcessorRequest, + PoolingResponse, + RerankRequest, + ResponsesRequest, + ScoreRequest, + ScoreResponse, + TokenizeChatRequest, + TokenizeCompletionRequest, + TokenizeResponse, + TranscriptionRequest, + TranscriptionResponse, + TranslationRequest, +) +from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager +from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer, RenderConfig +from vllm.entrypoints.utils import _validate_truncation_size +from vllm.inputs.data import PromptType +from vllm.inputs.data import TokensPrompt as EngineTokensPrompt +from vllm.inputs.parse import ( + PromptComponents, + get_prompt_components, + is_explicit_encoder_decoder_prompt, +) +from vllm.logger import init_logger +from vllm.logprobs import Logprob, PromptLogprobs +from vllm.lora.request import LoRARequest +from vllm.multimodal import ( # noqa: F401 - Required to resolve Pydantic error in RequestProcessingMixin + MultiModalDataDict, + MultiModalUUIDDict, +) +from vllm.outputs import CompletionOutput, PoolingRequestOutput, RequestOutput +from vllm.pooling_params import PoolingParams +from vllm.reasoning import ReasoningParser, ReasoningParserManager +from vllm.sampling_params import BeamSearchParams, SamplingParams +from vllm.tracing import ( + contains_trace_headers, + extract_trace_headers, + log_tracing_disabled_warning, +) +from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.utils import random_uuid +from vllm.utils.async_utils import ( + AsyncMicrobatchTokenizer, + collect_from_async_generator, + make_async, + 
def is_text_tokens_prompt(prompt: RequestPrompt) -> TypeIs[TextTokensPrompt]:
    """Return True when ``prompt`` is a dict carrying token IDs but no embeds."""
    if not isinstance(prompt, dict):
        return False
    has_token_ids = "prompt_token_ids" in prompt
    has_embeds = "prompt_embeds" in prompt
    return has_token_ids and not has_embeds
# Per-request state object: combines the prompt fields of
# RequestProcessingMixin and the result fields of ResponseGenerationMixin
# with the request metadata declared below. Generic over the request type.
class ServeContext(
    RequestProcessingMixin,
    ResponseGenerationMixin,
    BaseModel,
    Generic[RequestT],
):
    # Shared across all requests
    request: RequestT
    raw_request: Request | None = None
    model_name: str
    request_id: str
    # Defaults to the Unix time (whole seconds) at which the context is built
    created_time: int = Field(default_factory=lambda: int(time.time()))
    lora_request: LoRARequest | None = None

    # Shared across most requests
    tokenizer: AnyTokenizer | None = None

    # `protected_namespaces` resolves Pydantic v2's warning
    # on conflict with protected namespace "model_"
    model_config = ConfigDict(
        protected_namespaces=(),
        arbitrary_types_allowed=True,
    )
"embd", "classify") + so you can easily tell “this ID came from Embedding vs Classification.” + """ + + def __init__( + self, + engine_client: EngineClient, + models: OpenAIServingModels, + *, + request_logger: RequestLogger | None, + return_tokens_as_token_ids: bool = False, + log_error_stack: bool = False, + ): + super().__init__() + + self.engine_client = engine_client + + self.models = models + + self.request_logger = request_logger + self.return_tokens_as_token_ids = return_tokens_as_token_ids + self._tokenizer_executor = ThreadPoolExecutor(max_workers=1) + self._apply_mistral_chat_template_async = make_async( + apply_mistral_chat_template, executor=self._tokenizer_executor + ) + + self._async_tokenizer_pool: dict[AnyTokenizer, AsyncMicrobatchTokenizer] = {} + self.log_error_stack = log_error_stack + + self.processor = self.models.processor + self.io_processor = self.models.io_processor + self.model_config = self.models.model_config + self.max_model_len = self.model_config.max_model_len + + def _get_tool_parser( + self, tool_parser_name: str | None = None, enable_auto_tools: bool = False + ) -> Callable[[AnyTokenizer], ToolParser] | None: + """Get the tool parser based on the name.""" + parser = None + if not enable_auto_tools or tool_parser_name is None: + return parser + logger.info( + '"auto" tool choice has been enabled please note that while' + " the parallel_tool_calls client option is preset for " + "compatibility reasons, it will be ignored." 
+ ) + + try: + if tool_parser_name == "pythonic" and self.model_config.model.startswith( + "meta-llama/Llama-3.2" + ): + logger.warning( + "Llama3.2 models may struggle to emit valid pythonic tool calls" + ) + parser = ToolParserManager.get_tool_parser(tool_parser_name) + except Exception as e: + raise TypeError( + "Error: --enable-auto-tool-choice requires " + f"tool_parser:'{tool_parser_name}' which has not " + "been registered" + ) from e + return parser + + def _get_reasoning_parser( + self, + reasoning_parser_name: str, + ) -> Callable[[AnyTokenizer], ReasoningParser] | None: + """Get the reasoning parser based on the name.""" + parser = None + if not reasoning_parser_name: + return None + try: + parser = ReasoningParserManager.get_reasoning_parser(reasoning_parser_name) + assert parser is not None + except Exception as e: + raise TypeError(f"{reasoning_parser_name=} has not been registered") from e + return parser + + async def reset_mm_cache(self) -> None: + self.processor.clear_mm_cache() + await self.engine_client.reset_mm_cache() + + async def beam_search( + self, + prompt: PromptType, + request_id: str, + params: BeamSearchParams, + lora_request: LoRARequest | None = None, + ) -> AsyncGenerator[RequestOutput, None]: + beam_width = params.beam_width + max_tokens = params.max_tokens + ignore_eos = params.ignore_eos + temperature = params.temperature + length_penalty = params.length_penalty + include_stop_str_in_output = params.include_stop_str_in_output + + processor = self.processor + tokenizer = processor.tokenizer + if tokenizer is None: + raise ValueError( + "You cannot use beam search when `skip_tokenizer_init` is True" + ) + + eos_token_id: int = tokenizer.eos_token_id # type: ignore + + if is_explicit_encoder_decoder_prompt(prompt): + raise NotImplementedError + + prompt_text: str | None + prompt_token_ids: list[int] + multi_modal_data: MultiModalDataDict | None + if isinstance(prompt, str): + prompt_text = prompt + prompt_token_ids = [] + 
multi_modal_data = None + else: + prompt_text = prompt.get("prompt") # type: ignore + prompt_token_ids = prompt.get("prompt_token_ids", []) # type: ignore + multi_modal_data = prompt.get("multi_modal_data") # type: ignore + + mm_processor_kwargs: dict[str, Any] | None = None + + # This is a workaround to fix multimodal beam search; this is a + # bandaid fix for 2 small problems: + # 1. Multi_modal_data on the processed_inputs currently resolves to + # `None`. + # 2. preprocessing above expands the multimodal placeholders. However, + # this happens again in generation, so the double expansion causes + # a mismatch. + # TODO - would be ideal to handle this more gracefully. + + tokenized_length = len(prompt_token_ids) + + sort_beams_key = create_sort_beams_key_function(eos_token_id, length_penalty) + + beam_search_params = SamplingParams( + logprobs=2 * beam_width, + max_tokens=1, + temperature=temperature, + ) + all_beams = [ + BeamSearchSequence( + tokens=prompt_token_ids, + cum_logprob=0, + logprobs=[], + multi_modal_data=multi_modal_data, + mm_processor_kwargs=mm_processor_kwargs, + lora_request=lora_request, + ) + ] + completed = [] + + for _ in range(max_tokens): + prompts_batch, lora_req_batch = zip( + *[ + ( + EngineTokensPrompt( + prompt_token_ids=beam.tokens, + multi_modal_data=beam.multi_modal_data, + mm_processor_kwargs=beam.mm_processor_kwargs, + ), + beam.lora_request, + ) + for beam in all_beams + ] + ) + + tasks = [] + request_id_batch = f"{request_id}-{random_uuid()}" + + for i, (individual_prompt, lora_req) in enumerate( + zip(prompts_batch, lora_req_batch) + ): + request_id_item = f"{request_id_batch}-beam-{i}" + task = asyncio.create_task( + collect_from_async_generator( + self.engine_client.generate( + individual_prompt, + beam_search_params, + request_id_item, + lora_request=lora_req, + ) + ) + ) + tasks.append(task) + + output = [x[0] for x in await asyncio.gather(*tasks)] + + new_beams = [] + for i, current_beam in enumerate(all_beams): + 
result = output[i] + + if result.outputs[0].logprobs is not None: + logprobs = result.outputs[0].logprobs[0] + for token_id, logprob_obj in logprobs.items(): + if token_id == eos_token_id and not ignore_eos: + completed.append( + BeamSearchSequence( + tokens=current_beam.tokens + [token_id] + if include_stop_str_in_output + else current_beam.tokens, + logprobs=current_beam.logprobs + [logprobs], + cum_logprob=current_beam.cum_logprob + + logprob_obj.logprob, + finish_reason="stop", + stop_reason=eos_token_id, + ) + ) + else: + new_beams.append( + BeamSearchSequence( + tokens=current_beam.tokens + [token_id], + logprobs=current_beam.logprobs + [logprobs], + lora_request=current_beam.lora_request, + cum_logprob=current_beam.cum_logprob + + logprob_obj.logprob, + multi_modal_data=current_beam.multi_modal_data, + mm_processor_kwargs=current_beam.mm_processor_kwargs, + ) + ) + + sorted_beams = sorted(new_beams, key=sort_beams_key, reverse=True) + all_beams = sorted_beams[:beam_width] + + completed.extend(all_beams) + sorted_completed = sorted(completed, key=sort_beams_key, reverse=True) + best_beams = sorted_completed[:beam_width] + + for beam in best_beams: + if beam.tokens[-1] == eos_token_id and not ignore_eos: + # Skip the eos token in the text. 
+ tokens = beam.tokens[tokenized_length:-1] + else: + tokens = beam.tokens[tokenized_length:] + beam.text = tokenizer.decode(tokens) + + yield RequestOutput( + request_id=request_id, + prompt=prompt_text, + outputs=[ + CompletionOutput( + text=beam.text, # type: ignore + cumulative_logprob=beam.cum_logprob, + token_ids=beam.tokens[tokenized_length:], + index=i, + logprobs=beam.logprobs, + finish_reason=beam.finish_reason + if beam.finish_reason is not None + else "length", + stop_reason=beam.stop_reason, + ) + for (i, beam) in enumerate(best_beams) + ], + finished=True, + prompt_token_ids=prompt_token_ids, + prompt_logprobs=None, + ) + + def _get_renderer(self, tokenizer: AnyTokenizer | None) -> BaseRenderer: + """ + Get a Renderer instance with the provided tokenizer. + Uses shared async tokenizer pool for efficiency. + """ + return CompletionRenderer( + model_config=self.model_config, + tokenizer=tokenizer, + async_tokenizer_pool=self._async_tokenizer_pool, + ) + + def _build_render_config( + self, + request: Any, + ) -> RenderConfig: + """ + Build and return a `RenderConfig` for an endpoint. + + Used by the renderer to control how prompts are prepared + (e.g., tokenization and length handling). Endpoints should + implement this with logic appropriate to their request type. + """ + raise NotImplementedError + + def _get_async_tokenizer(self, tokenizer) -> AsyncMicrobatchTokenizer: + """ + Return (and cache) an `AsyncMicrobatchTokenizer` bound to the + given tokenizer. + """ + async_tokenizer = self._async_tokenizer_pool.get(tokenizer) + if async_tokenizer is None: + async_tokenizer = AsyncMicrobatchTokenizer(tokenizer) + self._async_tokenizer_pool[tokenizer] = async_tokenizer + return async_tokenizer + + async def _preprocess( + self, + ctx: ServeContext, + ) -> ErrorResponse | None: + """ + Default preprocessing hook. Subclasses may override + to prepare `ctx` (classification, embedding, etc.). 
+ """ + return None + + def _build_response( + self, + ctx: ServeContext, + ) -> AnyResponse | ErrorResponse: + """ + Default response builder. Subclass may override this method + to return the appropriate response object. + """ + return self.create_error_response("unimplemented endpoint") + + async def handle( + self, + ctx: ServeContext, + ) -> AnyResponse | ErrorResponse: + generation: AsyncGenerator[AnyResponse | ErrorResponse, None] + generation = self._pipeline(ctx) + + async for response in generation: + return response + + return self.create_error_response("No response yielded from pipeline") + + async def _pipeline( + self, + ctx: ServeContext, + ) -> AsyncGenerator[AnyResponse | ErrorResponse, None]: + """Execute the request processing pipeline yielding responses.""" + if error := await self._check_model(ctx.request): + yield error + if error := self._validate_request(ctx): + yield error + + preprocess_ret = await self._preprocess(ctx) + if isinstance(preprocess_ret, ErrorResponse): + yield preprocess_ret + + generators_ret = await self._prepare_generators(ctx) + if isinstance(generators_ret, ErrorResponse): + yield generators_ret + + collect_ret = await self._collect_batch(ctx) + if isinstance(collect_ret, ErrorResponse): + yield collect_ret + + yield self._build_response(ctx) + + def _validate_request(self, ctx: ServeContext) -> ErrorResponse | None: + truncate_prompt_tokens = getattr(ctx.request, "truncate_prompt_tokens", None) + + if ( + truncate_prompt_tokens is not None + and truncate_prompt_tokens > self.max_model_len + ): + return self.create_error_response( + "truncate_prompt_tokens value is " + "greater than max_model_len." + " Please, select a smaller truncation size." 
+ ) + return None + + def _create_pooling_params( + self, + ctx: ServeContext, + ) -> PoolingParams | ErrorResponse: + if not hasattr(ctx.request, "to_pooling_params"): + return self.create_error_response( + "Request type does not support pooling parameters" + ) + + return ctx.request.to_pooling_params() + + async def _prepare_generators( + self, + ctx: ServeContext, + ) -> ErrorResponse | None: + """Schedule the request and get the result generator.""" + generators: list[ + AsyncGenerator[RequestOutput | PoolingRequestOutput, None] + ] = [] + + try: + trace_headers = ( + None + if ctx.raw_request is None + else await self._get_trace_headers(ctx.raw_request.headers) + ) + + pooling_params = self._create_pooling_params(ctx) + if isinstance(pooling_params, ErrorResponse): + return pooling_params + + if ctx.engine_prompts is None: + return self.create_error_response("Engine prompts not available") + + for i, engine_prompt in enumerate(ctx.engine_prompts): + request_id_item = f"{ctx.request_id}-{i}" + + self._log_inputs( + request_id_item, + engine_prompt, + params=pooling_params, + lora_request=ctx.lora_request, + ) + + generator = self.engine_client.encode( + engine_prompt, + pooling_params, + request_id_item, + lora_request=ctx.lora_request, + trace_headers=trace_headers, + priority=getattr(ctx.request, "priority", 0), + ) + + generators.append(generator) + + ctx.result_generator = merge_async_iterators(*generators) + + return None + + except Exception as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + async def _collect_batch( + self, + ctx: ServeContext, + ) -> ErrorResponse | None: + """Collect batch results from the result generator.""" + try: + if ctx.engine_prompts is None: + return self.create_error_response("Engine prompts not available") + + num_prompts = len(ctx.engine_prompts) + final_res_batch: list[RequestOutput | PoolingRequestOutput | None] + final_res_batch = [None] * num_prompts + + if 
ctx.result_generator is None: + return self.create_error_response("Result generator not available") + + async for i, res in ctx.result_generator: + final_res_batch[i] = res + + if None in final_res_batch: + return self.create_error_response( + "Failed to generate results for all prompts" + ) + + ctx.final_res_batch = [res for res in final_res_batch if res is not None] + + return None + + except Exception as e: + return self.create_error_response(str(e)) + + def create_error_response( + self, + message: str, + err_type: str = "BadRequestError", + status_code: HTTPStatus = HTTPStatus.BAD_REQUEST, + ) -> ErrorResponse: + if self.log_error_stack: + exc_type, _, _ = sys.exc_info() + if exc_type is not None: + traceback.print_exc() + else: + traceback.print_stack() + return ErrorResponse( + error=ErrorInfo(message=message, type=err_type, code=status_code.value) + ) + + def create_streaming_error_response( + self, + message: str, + err_type: str = "BadRequestError", + status_code: HTTPStatus = HTTPStatus.BAD_REQUEST, + ) -> str: + json_str = json.dumps( + self.create_error_response( + message=message, err_type=err_type, status_code=status_code + ).model_dump() + ) + return json_str + + async def _check_model( + self, + request: AnyRequest, + ) -> ErrorResponse | None: + error_response = None + + if self._is_model_supported(request.model): + return None + if request.model in self.models.lora_requests: + return None + if ( + envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING + and request.model + and (load_result := await self.models.resolve_lora(request.model)) + ): + if isinstance(load_result, LoRARequest): + return None + if ( + isinstance(load_result, ErrorResponse) + and load_result.error.code == HTTPStatus.BAD_REQUEST.value + ): + error_response = load_result + + return error_response or self.create_error_response( + message=f"The model `{request.model}` does not exist.", + err_type="NotFoundError", + status_code=HTTPStatus.NOT_FOUND, + ) + + def _get_active_default_mm_loras(self, 
request: AnyRequest) -> LoRARequest | None: + """Determine if there are any active default multimodal loras.""" + # TODO: Currently this is only enabled for chat completions + # to be better aligned with only being enabled for .generate + # when run offline. It would be nice to support additional + # tasks types in the future. + message_types = self._get_message_types(request) + default_mm_loras = set() + + for lora in self.models.lora_requests.values(): + # Best effort match for default multimodal lora adapters; + # There is probably a better way to do this, but currently + # this matches against the set of 'types' in any content lists + # up until '_', e.g., to match audio_url -> audio + if lora.lora_name in message_types: + default_mm_loras.add(lora) + + # Currently only support default modality specific loras if + # we have exactly one lora matched on the request. + if len(default_mm_loras) == 1: + return default_mm_loras.pop() + return None + + def _maybe_get_adapters( + self, + request: AnyRequest, + supports_default_mm_loras: bool = False, + ) -> LoRARequest | None: + if request.model in self.models.lora_requests: + return self.models.lora_requests[request.model] + + # Currently only support default modality specific loras + # if we have exactly one lora matched on the request. + if supports_default_mm_loras: + default_mm_lora = self._get_active_default_mm_loras(request) + if default_mm_lora is not None: + return default_mm_lora + + if self._is_model_supported(request.model): + return None + + # if _check_model has been called earlier, this will be unreachable + raise ValueError(f"The model `{request.model}` does not exist.") + + def _get_message_types(self, request: AnyRequest) -> set[str]: + """Retrieve the set of types from message content dicts up + until `_`; we use this to match potential multimodal data + with default per modality loras. 
+ """ + message_types: set[str] = set() + + if not hasattr(request, "messages"): + return message_types + + messages = request.messages + if messages is None or isinstance(messages, (str, bytes)): + return message_types + + for message in messages: + if ( + isinstance(message, dict) + and "content" in message + and isinstance(message["content"], list) + ): + for content_dict in message["content"]: + if "type" in content_dict: + message_types.add(content_dict["type"].split("_")[0]) + return message_types + + async def _normalize_prompt_text_to_input( + self, + request: AnyRequest, + prompt: str, + tokenizer: AnyTokenizer, + add_special_tokens: bool, + ) -> TextTokensPrompt: + async_tokenizer = self._get_async_tokenizer(tokenizer) + + if ( + self.model_config.encoder_config is not None + and self.model_config.encoder_config.get("do_lower_case", False) + ): + prompt = prompt.lower() + + truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens", None) + + if truncate_prompt_tokens is None: + encoded = await async_tokenizer( + prompt, add_special_tokens=add_special_tokens + ) + elif truncate_prompt_tokens < 0: + # Negative means we cap at the model's max length + encoded = await async_tokenizer( + prompt, + add_special_tokens=add_special_tokens, + truncation=True, + max_length=self.max_model_len, + ) + else: + encoded = await async_tokenizer( + prompt, + add_special_tokens=add_special_tokens, + truncation=True, + max_length=truncate_prompt_tokens, + ) + + input_ids = encoded.input_ids + input_text = prompt + + return self._validate_input(request, input_ids, input_text) + + async def _normalize_prompt_tokens_to_input( + self, + request: AnyRequest, + prompt_ids: list[int], + tokenizer: AnyTokenizer | None, + ) -> TextTokensPrompt: + truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens", None) + + if truncate_prompt_tokens is None: + input_ids = prompt_ids + elif truncate_prompt_tokens < 0: + input_ids = prompt_ids[-self.max_model_len :] + else: + 
def _validate_input(
    self,
    request: AnyRequest,
    input_ids: list[int],
    input_text: str,
) -> TextTokensPrompt:
    """Check the prompt length against the model context and wrap the prompt.

    Three request families are distinguished:

    * embedding / score / rerank / classification requests generate no
      tokens, so the prompt may use the whole context length;
    * tokenize / detokenize requests are not length-checked at all;
    * everything else must leave room for at least one generated token
      and, when a token budget is requested, for that whole budget.

    Returns:
        ``TextTokensPrompt`` built from ``input_text`` and ``input_ids``.

    Raises:
        ValueError: if the prompt (plus the requested token budget, where
            applicable) does not fit into ``self.max_model_len``.
    """
    num_tokens = len(input_ids)
    context_limit = self.max_model_len

    # These request types have no max_tokens field.
    non_generating = (
        EmbeddingChatRequest,
        EmbeddingCompletionRequest,
        ScoreRequest,
        RerankRequest,
        ClassificationCompletionRequest,
        ClassificationChatRequest,
    )
    # These have no max_tokens either and need no context length check.
    tokenizer_only = (
        TokenizeCompletionRequest,
        TokenizeChatRequest,
        DetokenizeRequest,
    )

    if isinstance(request, non_generating):
        # Input may fill the entire context: nothing is generated.
        if num_tokens > context_limit:
            request_type = type(request)
            if request_type is ScoreRequest:
                operation = "score"
            elif request_type in (
                ClassificationCompletionRequest,
                ClassificationChatRequest,
            ):
                operation = "classification"
            else:
                operation = "embedding generation"
            raise ValueError(
                f"This model's maximum context length is "
                f"{context_limit} tokens. However, you requested "
                f"{num_tokens} tokens in the input for {operation}. "
                f"Please reduce the length of the input."
            )
    elif not isinstance(request, tokenizer_only):
        if isinstance(request, ChatCompletionRequest):
            # The chat completion endpoint supports max_completion_tokens.
            # TODO(#9845): remove max_tokens when field dropped from OpenAI API
            max_tokens = request.max_completion_tokens or request.max_tokens
        else:
            max_tokens = getattr(request, "max_tokens", None)

        # Completion-like requests may use at most context length - 1
        # input tokens.
        if num_tokens >= context_limit:
            raise ValueError(
                f"This model's maximum context length is "
                f"{context_limit} tokens. However, your request has "
                f"{num_tokens} input tokens. Please reduce the length of "
                "the input messages."
            )

        if max_tokens is not None and num_tokens + max_tokens > context_limit:
            raise ValueError(
                "'max_tokens' or 'max_completion_tokens' is too large: "
                f"{max_tokens}. This model's maximum context length is "
                f"{context_limit} tokens and your request has "
                f"{num_tokens} input tokens ({max_tokens} > {context_limit}"
                f" - {num_tokens})."
            )

    return TextTokensPrompt(prompt=input_text, prompt_token_ids=input_ids)
+ """ + async for result in self._tokenize_prompt_inputs_async( + request, + tokenizer, + [prompt_input], + add_special_tokens=add_special_tokens, + ): + return result + raise ValueError("No results yielded from tokenization") + + async def _tokenize_prompt_inputs_async( + self, + request: AnyRequest, + tokenizer: AnyTokenizer, + prompt_inputs: Iterable[str | list[int]], + add_special_tokens: bool = True, + ) -> AsyncGenerator[TextTokensPrompt, None]: + """ + A simpler implementation that tokenizes multiple prompt inputs. + """ + for prompt in prompt_inputs: + if isinstance(prompt, str): + yield await self._normalize_prompt_text_to_input( + request, + prompt=prompt, + tokenizer=tokenizer, + add_special_tokens=add_special_tokens, + ) + else: + yield await self._normalize_prompt_tokens_to_input( + request, + prompt_ids=prompt, + tokenizer=tokenizer, + ) + + def _validate_chat_template( + self, + request_chat_template: str | None, + chat_template_kwargs: dict[str, Any] | None, + trust_request_chat_template: bool, + ) -> ErrorResponse | None: + if not trust_request_chat_template and ( + request_chat_template is not None + or ( + chat_template_kwargs + and chat_template_kwargs.get("chat_template") is not None + ) + ): + return self.create_error_response( + "Chat template is passed with request, but " + "--trust-request-chat-template is not set. " + "Refused request with untrusted chat template." 
async def _preprocess_chat(
    self,
    request: ChatLikeRequest | ResponsesRequest,
    tokenizer: AnyTokenizer,
    messages: list[ChatCompletionMessageParam],
    chat_template: str | None,
    chat_template_content_format: ChatTemplateContentFormatOption,
    add_generation_prompt: bool = True,
    continue_final_message: bool = False,
    tool_dicts: list[dict[str, Any]] | None = None,
    documents: list[dict[str, str]] | None = None,
    chat_template_kwargs: dict[str, Any] | None = None,
    tool_parser: Callable[[AnyTokenizer], ToolParser] | None = None,
    add_special_tokens: bool = False,
) -> tuple[
    list[ConversationMessage],
    Sequence[RequestPrompt],
    list[EngineTokensPrompt],
]:
    """Turn chat messages into a rendered prompt and an engine prompt.

    The messages are parsed, the chat template is applied (Mistral or HF
    path depending on the tokenizer type), the result is tokenized, and the
    multimodal data plus optional request fields are attached to the
    engine prompt.

    Args:
        request: The incoming request. Optional attributes
            (``tool_choice``, ``mm_processor_kwargs``, ``cache_salt``) are
            read only when present.
        tokenizer: Tokenizer to render/tokenize with. When it is ``None``
            the prompt text is the literal ``"placeholder"`` with token
            ids ``[1]``.
        messages: Chat messages to render.
        chat_template: Template passed to the template renderer.
        chat_template_content_format: Requested content format, resolved
            before the messages are parsed.
        add_generation_prompt: Forwarded to the chat template.
        continue_final_message: Forwarded to the chat template.
        tool_dicts: Forwarded to the chat template as ``tools``.
        documents: Forwarded to the chat template.
        chat_template_kwargs: Extra template kwargs; these override the
            values built from the arguments above.
        tool_parser: Factory for a tool parser that may adjust the request.
        add_special_tokens: Forwarded to tokenization of a string prompt.

    Returns:
        ``(conversation, [request_prompt], [engine_prompt])``.

    Raises:
        NotImplementedError: if tool parsing applies but the request is
            neither a ``ChatCompletionRequest`` nor a ``ResponsesRequest``.
    """
    model_config = self.model_config

    resolved_content_format = resolve_chat_template_content_format(
        chat_template,
        tool_dicts,
        chat_template_content_format,
        tokenizer,
        model_config=model_config,
    )
    conversation, mm_data_future, mm_uuids = parse_chat_messages_futures(
        messages,
        model_config,
        tokenizer,
        content_format=resolved_content_format,
    )

    _chat_template_kwargs: dict[str, Any] = dict(
        chat_template=chat_template,
        add_generation_prompt=add_generation_prompt,
        continue_final_message=continue_final_message,
        tools=tool_dicts,
        documents=documents,
    )
    # Request-supplied kwargs take precedence over the defaults above.
    _chat_template_kwargs.update(chat_template_kwargs or {})

    request_prompt: str | list[int]

    if tokenizer is None:
        request_prompt = "placeholder"
    elif isinstance(tokenizer, MistralTokenizer):
        request_prompt = await self._apply_mistral_chat_template_async(
            tokenizer,
            messages=messages,
            **_chat_template_kwargs,
        )
    else:
        request_prompt = apply_hf_chat_template(
            tokenizer=tokenizer,
            conversation=conversation,
            model_config=model_config,
            **_chat_template_kwargs,
        )

    mm_data = await mm_data_future

    # Tool parsing is done only if a tool_parser has been set and if
    # tool_choice is not "none" (if tool_choice is "none" but a tool_parser
    # is set, we want to prevent parsing a tool_call hallucinated by the
    # LLM).
    should_parse_tools = tool_parser is not None and (
        hasattr(request, "tool_choice") and request.tool_choice != "none"
    )

    if should_parse_tools:
        if not isinstance(request, ChatCompletionRequest | ResponsesRequest):
            msg = (
                "Tool usage is only supported for Chat Completions API "
                "or Responses API requests."
            )
            raise NotImplementedError(msg)
        request = tool_parser(tokenizer).adjust_request(request=request)  # type: ignore

    if tokenizer is None:
        # The message must be a single string; a parenthesised pair of
        # strings separated by a comma would be a tuple.
        assert isinstance(request_prompt, str), (
            "Prompt has to be a string when the tokenizer is not initialised"
        )
        prompt_inputs = TextTokensPrompt(
            prompt=request_prompt, prompt_token_ids=[1]
        )
    elif isinstance(request_prompt, str):
        prompt_inputs = await self._tokenize_prompt_input_async(
            request,
            tokenizer,
            request_prompt,
            add_special_tokens=add_special_tokens,
        )
    else:
        # For MistralTokenizer
        assert is_list_of(request_prompt, int), (
            "Prompt has to be either a string or a list of token ids"
        )
        prompt_inputs = TextTokensPrompt(
            prompt=tokenizer.decode(request_prompt),
            prompt_token_ids=request_prompt,
        )

    engine_prompt = EngineTokensPrompt(
        prompt_token_ids=prompt_inputs["prompt_token_ids"]
    )
    if mm_data is not None:
        engine_prompt["multi_modal_data"] = mm_data

    if mm_uuids is not None:
        engine_prompt["multi_modal_uuids"] = mm_uuids

    # Both fields are optional on the request, so read them defensively;
    # this matches how `cache_salt` was already guarded.
    mm_processor_kwargs = getattr(request, "mm_processor_kwargs", None)
    if mm_processor_kwargs is not None:
        engine_prompt["mm_processor_kwargs"] = mm_processor_kwargs

    cache_salt = getattr(request, "cache_salt", None)
    if cache_salt is not None:
        engine_prompt["cache_salt"] = cache_salt

    return conversation, [request_prompt], [engine_prompt]
Any]]: + """Use the Processor to process inputs for AsyncLLM.""" + tokenization_kwargs: dict[str, Any] = {} + _validate_truncation_size( + self.max_model_len, params.truncate_prompt_tokens, tokenization_kwargs + ) + + engine_request = self.processor.process_inputs( + request_id, + engine_prompt, + params, + lora_request=lora_request, + tokenization_kwargs=tokenization_kwargs, + trace_headers=trace_headers, + priority=priority, + ) + return engine_request, tokenization_kwargs + + async def _generate_with_builtin_tools( + self, + request_id: str, + request_prompt: RequestPrompt, + engine_prompt: EngineTokensPrompt, + sampling_params: SamplingParams, + context: ConversationContext, + lora_request: LoRARequest | None = None, + priority: int = 0, + **kwargs, + ): + prompt_text, _, _ = self._get_prompt_components(request_prompt) + orig_priority = priority + while True: + self._log_inputs( + request_id, + request_prompt, + params=sampling_params, + lora_request=lora_request, + ) + trace_headers = kwargs.get("trace_headers") + engine_request, tokenization_kwargs = await self._process_inputs( + request_id, + engine_prompt, + sampling_params, + lora_request=lora_request, + trace_headers=trace_headers, + priority=priority, + ) + + generator = self.engine_client.generate( + engine_request, + sampling_params, + request_id, + lora_request=lora_request, + priority=priority, + prompt_text=prompt_text, + tokenization_kwargs=tokenization_kwargs, + **kwargs, + ) + + async for res in generator: + context.append_output(res) + # NOTE(woosuk): The stop condition is handled by the engine. + yield context + + if not context.need_builtin_tool_call(): + # The model did not ask for a tool call, so we're done. + break + + # Call the tool and update the context with the result. + tool_output = await context.call_tool() + context.append_tool_output(tool_output) + + # TODO: uncomment this and enable tool output streaming + # yield context + + # Create inputs for the next turn. 
+ # Render the next prompt token ids. + prompt_token_ids = context.render_for_completion() + engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids) + request_prompt = prompt_token_ids + # Update the sampling params. + sampling_params.max_tokens = self.max_model_len - len(prompt_token_ids) + # OPTIMIZATION + priority = orig_priority - 1 + + def _get_prompt_components( + self, + prompt: RequestPrompt | PromptType, + ) -> PromptComponents: + if isinstance(prompt, list): + return PromptComponents(token_ids=prompt) + + return get_prompt_components(prompt) # type: ignore[arg-type] + + def _log_inputs( + self, + request_id: str, + inputs: RequestPrompt | PromptType, + params: SamplingParams | PoolingParams | BeamSearchParams | None, + lora_request: LoRARequest | None, + ) -> None: + if self.request_logger is None: + return + + prompt, prompt_token_ids, prompt_embeds = self._get_prompt_components(inputs) + + self.request_logger.log_inputs( + request_id, + prompt, + prompt_token_ids, + prompt_embeds, + params=params, + lora_request=lora_request, + ) + + async def _get_trace_headers( + self, + headers: Headers, + ) -> Mapping[str, str] | None: + is_tracing_enabled = await self.engine_client.is_tracing_enabled() + + if is_tracing_enabled: + return extract_trace_headers(headers) + + if contains_trace_headers(headers): + log_tracing_disabled_warning() + + return None + + @staticmethod + def _base_request_id( + raw_request: Request | None, default: str | None = None + ) -> str | None: + """Pulls the request id to use from a header, if provided""" + default = default or random_uuid() + if raw_request is None: + return default + + return raw_request.headers.get("X-Request-Id", default) + + @staticmethod + def _get_data_parallel_rank(raw_request: Request | None) -> int | None: + """Pulls the data parallel rank from a header, if provided""" + if raw_request is None: + return None + + rank_str = raw_request.headers.get("X-data-parallel-rank") + if rank_str is None: + 
return None + + try: + return int(rank_str) + except ValueError: + return None + + @staticmethod + def _parse_tool_calls_from_content( + request: ResponsesRequest | ChatCompletionRequest, + tokenizer: AnyTokenizer, + enable_auto_tools: bool, + tool_parser_cls: Callable[[AnyTokenizer], ToolParser] | None, + content: str | None = None, + ) -> tuple[list[FunctionCall] | None, str | None]: + function_calls = list[FunctionCall]() + if request.tool_choice and isinstance(request.tool_choice, ToolChoiceFunction): + assert content is not None + # Forced Function Call + function_calls.append( + FunctionCall(name=request.tool_choice.name, arguments=content) + ) + content = None # Clear content since tool is called. + elif request.tool_choice and isinstance( + request.tool_choice, ChatCompletionNamedToolChoiceParam + ): + assert content is not None + # Forced Function Call + function_calls.append( + FunctionCall(name=request.tool_choice.function.name, arguments=content) + ) + content = None # Clear content since tool is called. + elif request.tool_choice == "required": + assert content is not None + tool_calls = TypeAdapter(list[FunctionDefinition]).validate_json(content) + function_calls.extend( + [ + FunctionCall( + name=tool_call.name, + arguments=json.dumps(tool_call.parameters, ensure_ascii=False), + ) + for tool_call in tool_calls + ] + ) + content = None # Clear content since tool is called. + elif ( + tool_parser_cls + and enable_auto_tools + and (request.tool_choice == "auto" or request.tool_choice is None) + ): + # Automatic Tool Call Parsing + try: + tool_parser = tool_parser_cls(tokenizer) + except RuntimeError as e: + logger.exception("Error in tool parser creation.") + raise e + tool_call_info = tool_parser.extract_tool_calls( + content if content is not None else "", + request=request, # type: ignore + ) + if tool_call_info is not None and tool_call_info.tools_called: + # extract_tool_calls() returns a list of tool calls. 
def clamp_prompt_logprobs(
    prompt_logprobs: PromptLogprobs | None,
) -> PromptLogprobs | None:
    """Replace ``-inf`` prompt logprobs with ``-9999.0`` in place.

    ``None`` input and ``None`` per-position entries are left untouched.
    The same object that was passed in is returned.
    """
    if prompt_logprobs is not None:
        negative_infinity = float("-inf")
        # Flatten the per-position dicts into one stream of logprob entries.
        entries = (
            entry
            for position in prompt_logprobs
            if position is not None
            for entry in position.values()
        )
        for entry in entries:
            if entry.logprob == negative_infinity:
                entry.logprob = -9999.0
    return prompt_logprobs
class OpenAIServingModels:
    """Shared instance to hold data about the loaded base model(s) and adapters.

    Handles the routes:
    - /v1/models
    - /v1/load_lora_adapter
    - /v1/unload_lora_adapter
    """

    def __init__(
        self,
        engine_client: EngineClient,
        base_model_paths: list[BaseModelPath],
        *,
        lora_modules: list[LoRAModulePath] | None = None,
    ):
        """Store the engine client, base models and static LoRA modules.

        Static LoRA modules are only recorded here; they are loaded by
        `init_static_loras`.
        """
        super().__init__()

        self.engine_client = engine_client
        self.base_model_paths = base_model_paths

        self.static_lora_modules = lora_modules
        # Loaded adapters, keyed by LoRA name.
        self.lora_requests: dict[str, LoRARequest] = {}
        self.lora_id_counter = AtomicCounter(0)

        self.lora_resolvers: list[LoRAResolver] = [
            LoRAResolverRegistry.get_resolver(lora_resolver_name)
            for lora_resolver_name in LoRAResolverRegistry.get_supported_resolvers()
        ]
        # One lock per LoRA name, created on first use; it serializes
        # load / unload / resolve for that name.
        self.lora_resolver_lock: dict[str, Lock] = defaultdict(Lock)

        self.processor = self.engine_client.processor
        self.io_processor = self.engine_client.io_processor
        self.model_config = self.engine_client.model_config
        self.max_model_len = self.model_config.max_model_len

    async def init_static_loras(self):
        """Load all static LoRA modules.

        Raises:
            ValueError: if any module fails to load; the message is the
                error message returned by `load_lora_adapter`.
        """
        if self.static_lora_modules is None:
            return
        for lora in self.static_lora_modules:
            load_request = LoadLoRAAdapterRequest(
                lora_path=lora.path, lora_name=lora.name
            )
            load_result = await self.load_lora_adapter(
                request=load_request, base_model_name=lora.base_model_name
            )
            if isinstance(load_result, ErrorResponse):
                raise ValueError(load_result.error.message)

    def is_base_model(self, model_name) -> bool:
        """Return True if `model_name` is the name of a served base model."""
        return any(model.name == model_name for model in self.base_model_paths)

    def model_name(self, lora_request: LoRARequest | None = None) -> str:
        """Return the model name to report for a request.

        Args:
            lora_request: The LoRA request in use, if any.

        Returns:
            ``lora_request.lora_name`` when a LoRA request is given,
            otherwise the name of the first base model path.
        """
        if lora_request is not None:
            return lora_request.lora_name
        return self.base_model_paths[0].name

    async def show_available_models(self) -> ModelList:
        """Show available models. This includes the base model and all
        adapters."""
        model_cards = [
            ModelCard(
                id=base_model.name,
                max_model_len=self.max_model_len,
                root=base_model.model_path,
                permission=[ModelPermission()],
            )
            for base_model in self.base_model_paths
        ]
        lora_cards = [
            ModelCard(
                id=lora.lora_name,
                root=lora.local_path,
                # Fall back to the first base model when the adapter does
                # not record its own base model.
                parent=lora.base_model_name
                if lora.base_model_name
                else self.base_model_paths[0].name,
                permission=[ModelPermission()],
            )
            for lora in self.lora_requests.values()
        ]
        model_cards.extend(lora_cards)
        return ModelList(data=model_cards)

    async def load_lora_adapter(
        self, request: LoadLoRAAdapterRequest, base_model_name: str | None = None
    ) -> ErrorResponse | str:
        """Load a LoRA adapter into the engine and register it by name.

        Args:
            request: Carries ``lora_name`` and ``lora_path``.
            base_model_name: Recorded on the adapter only when it names a
                served base model.

        Returns:
            A success message, or an ``ErrorResponse`` when validation
            fails or the engine rejects the adapter (404 if the engine
            error mentions "No adapter found", 400 otherwise).
        """
        lora_name = request.lora_name

        # Ensure atomicity based on the lora name
        async with self.lora_resolver_lock[lora_name]:
            error_check_ret = await self._check_load_lora_adapter_request(request)
            if error_check_ret is not None:
                return error_check_ret

            lora_path = request.lora_path
            unique_id = self.lora_id_counter.inc(1)
            lora_request = LoRARequest(
                lora_name=lora_name, lora_int_id=unique_id, lora_path=lora_path
            )
            if base_model_name is not None and self.is_base_model(base_model_name):
                lora_request.base_model_name = base_model_name

            # Validate that the adapter can be loaded into the engine
            # This will also pre-load it for incoming requests
            try:
                await self.engine_client.add_lora(lora_request)
            except Exception as e:
                error_type = "BadRequestError"
                status_code = HTTPStatus.BAD_REQUEST
                if "No adapter found" in str(e):
                    error_type = "NotFoundError"
                    status_code = HTTPStatus.NOT_FOUND

                return create_error_response(
                    message=str(e), err_type=error_type, status_code=status_code
                )

            self.lora_requests[lora_name] = lora_request
            logger.info(
                "Loaded new LoRA adapter: name '%s', path '%s'", lora_name, lora_path
            )
            return f"Success: LoRA adapter '{lora_name}' added successfully."

    async def unload_lora_adapter(
        self, request: UnloadLoRAAdapterRequest
    ) -> ErrorResponse | str:
        """Remove a previously loaded LoRA adapter from the registry.

        Returns:
            A success message, or an ``ErrorResponse`` when the name is
            missing (400) or not currently loaded (404).
        """
        lora_name = request.lora_name

        # Ensure atomicity based on the lora name
        async with self.lora_resolver_lock[lora_name]:
            error_check_ret = await self._check_unload_lora_adapter_request(request)
            if error_check_ret is not None:
                return error_check_ret

            # Safe to delete now since we hold the lock
            del self.lora_requests[lora_name]
            logger.info("Removed LoRA adapter: name '%s'", lora_name)
            return f"Success: LoRA adapter '{lora_name}' removed successfully."

    async def _check_load_lora_adapter_request(
        self, request: LoadLoRAAdapterRequest
    ) -> ErrorResponse | None:
        """Validate a load request; return an error response or ``None``."""
        # Check if both 'lora_name' and 'lora_path' are provided
        if not request.lora_name or not request.lora_path:
            return create_error_response(
                message="Both 'lora_name' and 'lora_path' must be provided.",
                err_type="InvalidUserInput",
                status_code=HTTPStatus.BAD_REQUEST,
            )

        # Check if the lora adapter with the given name already exists
        if request.lora_name in self.lora_requests:
            return create_error_response(
                message=f"The lora adapter '{request.lora_name}' has already been "
                "loaded.",
                err_type="InvalidUserInput",
                status_code=HTTPStatus.BAD_REQUEST,
            )

        return None

    async def _check_unload_lora_adapter_request(
        self, request: UnloadLoRAAdapterRequest
    ) -> ErrorResponse | None:
        """Validate an unload request; return an error response or ``None``."""
        # Check if 'lora_name' is not provided return an error
        if not request.lora_name:
            return create_error_response(
                message="'lora_name' needs to be provided to unload a LoRA adapter.",
                err_type="InvalidUserInput",
                status_code=HTTPStatus.BAD_REQUEST,
            )

        # Check if the lora adapter with the given name exists
        if request.lora_name not in self.lora_requests:
            return create_error_response(
                message=f"The lora adapter '{request.lora_name}' cannot be found.",
                err_type="NotFoundError",
                status_code=HTTPStatus.NOT_FOUND,
            )

        return None

    async def resolve_lora(self, lora_name: str) -> LoRARequest | ErrorResponse:
        """Attempt to resolve a LoRA adapter using available resolvers.

        Args:
            lora_name: Name/identifier of the LoRA adapter

        Returns:
            LoRARequest if found and loaded successfully.
            ErrorResponse (404) if no resolver finds the adapter.
            ErrorResponse (400) if adapter(s) are found but none load.
        """
        async with self.lora_resolver_lock[lora_name]:
            # First check if this LoRA is already loaded
            if lora_name in self.lora_requests:
                return self.lora_requests[lora_name]

            base_model_name = self.model_config.model
            unique_id = self.lora_id_counter.inc(1)
            found_adapter = False

            # Try to resolve using available resolvers
            for resolver in self.lora_resolvers:
                lora_request = await resolver.resolve_lora(base_model_name, lora_name)

                if lora_request is not None:
                    found_adapter = True
                    lora_request.lora_int_id = unique_id

                    try:
                        await self.engine_client.add_lora(lora_request)
                        self.lora_requests[lora_name] = lora_request
                        logger.info(
                            "Resolved and loaded LoRA adapter '%s' using %s",
                            lora_name,
                            resolver.__class__.__name__,
                        )
                        return lora_request
                    # Catch Exception, not BaseException: task cancellation
                    # (asyncio.CancelledError), KeyboardInterrupt and
                    # SystemExit must propagate instead of being logged
                    # and retried with the next resolver.
                    except Exception as e:
                        logger.warning(
                            "Failed to load LoRA '%s' resolved by %s: %s. "
                            "Trying next resolver.",
                            lora_name,
                            resolver.__class__.__name__,
                            e,
                        )
                        continue

            if found_adapter:
                # An adapter was found, but all attempts to load it failed.
                return create_error_response(
                    message=(
                        f"LoRA adapter '{lora_name}' was found but could not be loaded."
                    ),
                    err_type="BadRequestError",
                    status_code=HTTPStatus.BAD_REQUEST,
                )
            else:
                # No adapter was found
                return create_error_response(
                    message=f"LoRA adapter {lora_name} does not exist",
                    err_type="NotFoundError",
                    status_code=HTTPStatus.NOT_FOUND,
                )
EncodingFormat, + Endianness, + encode_pooling_bytes, + encode_pooling_output, +) + +logger = init_logger(__name__) + + +class OpenAIServingPooling(OpenAIServing): + def __init__( + self, + engine_client: EngineClient, + models: OpenAIServingModels, + *, + supported_tasks: tuple[SupportedTask, ...], + request_logger: RequestLogger | None, + chat_template: str | None, + chat_template_content_format: ChatTemplateContentFormatOption, + trust_request_chat_template: bool = False, + log_error_stack: bool = False, + ) -> None: + super().__init__( + engine_client=engine_client, + models=models, + request_logger=request_logger, + log_error_stack=log_error_stack, + ) + + self.supported_tasks = supported_tasks + self.chat_template = chat_template + self.chat_template_content_format: Final = chat_template_content_format + self.trust_request_chat_template = trust_request_chat_template + + async def create_pooling( + self, + request: PoolingRequest, + raw_request: Request | None = None, + ) -> PoolingResponse | IOProcessorResponse | PoolingBytesResponse | ErrorResponse: + """ + See https://platform.openai.com/docs/api-reference/embeddings/create + for the API specification. This API mimics the OpenAI Embedding API. 
+ """ + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + return error_check_ret + + model_name = self.models.model_name() + + request_id = f"pool-{self._base_request_id(raw_request)}" + created_time = int(time.time()) + + is_io_processor_request = isinstance(request, IOProcessorRequest) + try: + lora_request = self._maybe_get_adapters(request) + + if self.model_config.skip_tokenizer_init: + tokenizer = None + else: + tokenizer = await self.engine_client.get_tokenizer() + renderer = self._get_renderer(tokenizer) + + if getattr(request, "dimensions", None) is not None: + return self.create_error_response( + "dimensions is currently not supported" + ) + + truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens", None) + truncate_prompt_tokens = _validate_truncation_size( + self.max_model_len, truncate_prompt_tokens + ) + + if is_io_processor_request: + if self.io_processor is None: + raise ValueError( + "No IOProcessor plugin installed. Please refer " + "to the documentation and to the " + "'prithvi_geospatial_mae_io_processor' " + "offline inference example for more details." 
+ ) + + validated_prompt = self.io_processor.parse_request(request) + + engine_prompts = await self.io_processor.pre_process_async( + prompt=validated_prompt, request_id=request_id + ) + if not isinstance(engine_prompts, Sequence) or isinstance( + engine_prompts, (str, bytes, bytearray) + ): + engine_prompts = [engine_prompts] + + elif isinstance(request, PoolingChatRequest): + error_check_ret = self._validate_chat_template( + request_chat_template=request.chat_template, + chat_template_kwargs=request.chat_template_kwargs, + trust_request_chat_template=self.trust_request_chat_template, + ) + if error_check_ret is not None: + return error_check_ret + ( + _, + _, + engine_prompts, + ) = await self._preprocess_chat( + request, + tokenizer, + request.messages, + chat_template=request.chat_template or self.chat_template, + chat_template_content_format=self.chat_template_content_format, + # In pooling requests, we are not generating tokens, + # so there is no need to append extra tokens to the input + add_generation_prompt=False, + continue_final_message=False, + add_special_tokens=request.add_special_tokens, + ) + elif isinstance(request, PoolingCompletionRequest): + engine_prompts = await renderer.render_prompt( + prompt_or_prompts=request.input, + config=self._build_render_config(request), + ) + else: + raise ValueError(f"Unsupported request of type {type(request)}") + except (ValueError, TypeError, jinja2.TemplateError) as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) + + # Schedule the request and get the result generator. 
+ generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] + try: + if is_io_processor_request: + assert self.io_processor is not None and isinstance( + request, IOProcessorRequest + ) + pooling_params = self.io_processor.validate_or_generate_params() + else: + pooling_params = request.to_pooling_params() + + pooling_task: PoolingTask + if request.task is None: + if "token_embed" in self.supported_tasks: + pooling_task = "token_embed" + elif "token_classify" in self.supported_tasks: + pooling_task = "token_classify" + elif "plugin" in self.supported_tasks: + pooling_task = "plugin" + else: + return self.create_error_response( + f"pooling_task must be one of {self.supported_tasks}." + ) + else: + pooling_task = request.task + + if pooling_task not in self.supported_tasks: + return self.create_error_response( + f"Task {pooling_task} is not supported, it" + f" must be one of {self.supported_tasks}." + ) + + try: + pooling_params.verify(pooling_task, self.model_config) + except ValueError as e: + return self.create_error_response(str(e)) + + for i, engine_prompt in enumerate(engine_prompts): + request_id_item = f"{request_id}-{i}" + + self._log_inputs( + request_id_item, + engine_prompt, + params=pooling_params, + lora_request=lora_request, + ) + + trace_headers = ( + None + if raw_request is None + else await self._get_trace_headers(raw_request.headers) + ) + + generator = self.engine_client.encode( + engine_prompt, + pooling_params, + request_id_item, + lora_request=lora_request, + trace_headers=trace_headers, + priority=request.priority, + ) + + generators.append(generator) + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + result_generator = merge_async_iterators(*generators) + + if is_io_processor_request: + assert self.io_processor is not None + output = await self.io_processor.post_process_async( + model_output=result_generator, + request_id=request_id, + ) + return 
def request_output_to_pooling_response(
    self,
    final_res_batch: list[PoolingRequestOutput],
    request_id: str,
    created_time: int,
    model_name: str,
    encoding_format: EncodingFormat,
    embed_dtype: EmbedDType,
    endianness: Endianness,
) -> PoolingResponse | PoolingBytesResponse:
    """Assemble the API response for a finished batch of pooling outputs.

    ``"float"`` and ``"base64"`` yield a ``PoolingResponse`` holding one
    ``PoolingResponseData`` per output plus token usage. ``"bytes"``
    yields a ``PoolingBytesResponse`` whose metadata is a JSON string.
    Any other value is rejected through ``assert_never``.
    """
    if encoding_format in ("float", "base64"):
        entries: list[PoolingResponseData] = [
            PoolingResponseData(
                index=position,
                data=encode_pooling_output(
                    pooled,
                    encoding_format=encoding_format,
                    embed_dtype=embed_dtype,
                    endianness=endianness,
                ),
            )
            for position, pooled in enumerate(final_res_batch)
        ]
        # Pooling generates no tokens, so total usage equals prompt usage.
        prompt_token_total = sum(
            len(pooled.prompt_token_ids) for pooled in final_res_batch
        )
        return PoolingResponse(
            id=request_id,
            created=created_time,
            model=model_name,
            data=entries,
            usage=UsageInfo(
                prompt_tokens=prompt_token_total,
                total_tokens=prompt_token_total,
            ),
        )

    if encoding_format == "bytes":
        payload, entries, usage = encode_pooling_bytes(
            pooling_outputs=final_res_batch,
            embed_dtype=embed_dtype,
            endianness=endianness,
        )
        envelope = json.dumps(
            {
                "id": request_id,
                "created": created_time,
                "model": model_name,
                "data": entries,
                "usage": usage,
            }
        )
        return PoolingBytesResponse(body=payload, metadata=envelope)

    assert_never(encoding_format)
ResponseOutputText, + ResponseReasoningItem, + ResponseReasoningTextDeltaEvent, + ResponseReasoningTextDoneEvent, + ResponseStatus, + ResponseTextDeltaEvent, + ResponseTextDoneEvent, + ResponseWebSearchCallCompletedEvent, + ResponseWebSearchCallInProgressEvent, + ResponseWebSearchCallSearchingEvent, + response_function_web_search, + response_text_delta_event, +) +from openai.types.responses.response_output_text import Logprob, LogprobTopLogprob +from openai.types.responses.response_reasoning_item import ( + Content as ResponseReasoningTextContent, +) +from openai_harmony import Message as OpenAIHarmonyMessage + +from vllm import envs +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.chat_utils import ( + ChatCompletionMessageParam, + ChatTemplateContentFormatOption, +) +from vllm.entrypoints.context import ( + ConversationContext, + HarmonyContext, + SimpleContext, + StreamingHarmonyContext, +) +from vllm.entrypoints.harmony_utils import ( + construct_harmony_previous_input_messages, + get_developer_message, + get_stop_tokens_for_assistant_actions, + get_system_message, + get_user_message, + has_custom_tools, + parse_output_message, + parse_remaining_state, + parse_response_input, + render_for_completion, +) +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.protocol import ( + DeltaMessage, + ErrorResponse, + InputTokensDetails, + OutputTokensDetails, + RequestResponseMetadata, + ResponseCompletedEvent, + ResponseCreatedEvent, + ResponseInProgressEvent, + ResponseReasoningPartAddedEvent, + ResponseReasoningPartDoneEvent, + ResponsesRequest, + ResponsesResponse, + ResponseUsage, + StreamingResponsesResponse, +) +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.entrypoints.responses_utils import ( + construct_chat_message_with_tool_call, + convert_tool_responses_to_completions_format, + extract_tool_types, +) +from 
vllm.entrypoints.tool_server import ToolServer +from vllm.inputs.data import TokensPrompt as EngineTokensPrompt +from vllm.logger import init_logger +from vllm.logprobs import Logprob as SampleLogprob +from vllm.logprobs import SampleLogprobs +from vllm.outputs import CompletionOutput +from vllm.sampling_params import SamplingParams, StructuredOutputsParams +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils import random_uuid + +logger = init_logger(__name__) + + +class OpenAIServingResponses(OpenAIServing): + def __init__( + self, + engine_client: EngineClient, + models: OpenAIServingModels, + *, + request_logger: RequestLogger | None, + chat_template: str | None, + chat_template_content_format: ChatTemplateContentFormatOption, + return_tokens_as_token_ids: bool = False, + reasoning_parser: str = "", + enable_auto_tools: bool = False, + tool_parser: str | None = None, + tool_server: ToolServer | None = None, + enable_prompt_tokens_details: bool = False, + enable_force_include_usage: bool = False, + enable_log_outputs: bool = False, + log_error_stack: bool = False, + ) -> None: + super().__init__( + engine_client=engine_client, + models=models, + request_logger=request_logger, + return_tokens_as_token_ids=return_tokens_as_token_ids, + log_error_stack=log_error_stack, + ) + + self.chat_template = chat_template + self.chat_template_content_format: Final = chat_template_content_format + self.enable_log_outputs = enable_log_outputs + + self.reasoning_parser = self._get_reasoning_parser( + reasoning_parser_name=reasoning_parser + ) + self.enable_prompt_tokens_details = enable_prompt_tokens_details + self.enable_force_include_usage = enable_force_include_usage + self.default_sampling_params = self.model_config.get_diff_sampling_param() + if self.default_sampling_params: + source = self.model_config.generation_config + source = "model" if source == "auto" else source + logger.info( + "Using default chat sampling params from %s: %s", + source, + 
self.default_sampling_params, + ) + + # If False (default), the "store" option is (silently) ignored and the + # response is not stored. If True, the response is stored in memory. + # NOTE(woosuk): This may not be intuitive for users, as the default + # behavior in OpenAI's Responses API is to store the response, but + # vLLM's default behavior is not. + self.enable_store = envs.VLLM_ENABLE_RESPONSES_API_STORE + if self.enable_store: + logger.warning_once( + "`VLLM_ENABLE_RESPONSES_API_STORE` is enabled. This may " + "cause a memory leak since we never remove responses from " + "the store." + ) + + self.use_harmony = self.model_config.hf_config.model_type == "gpt_oss" + if self.use_harmony: + logger.warning( + "For gpt-oss, we ignore --enable-auto-tool-choice " + "and always enable tool use." + ) + # OpenAI models have two EOS-like tokens: <|return|> and <|call|>. + # We need to add them to the stop token ids. + if "stop_token_ids" not in self.default_sampling_params: + self.default_sampling_params["stop_token_ids"] = [] + self.default_sampling_params["stop_token_ids"].extend( + get_stop_tokens_for_assistant_actions() + ) + self.enable_auto_tools = enable_auto_tools + # set up tool use + self.tool_parser = self._get_tool_parser( + tool_parser_name=tool_parser, enable_auto_tools=enable_auto_tools + ) + self.exclude_tools_when_tool_choice_none = False + # HACK(woosuk): This is a hack. We should use a better store. + # FIXME: If enable_store=True, this may cause a memory leak since we + # never remove responses from the store. + self.response_store: dict[str, ResponsesResponse] = {} + self.response_store_lock = asyncio.Lock() + + # HACK(woosuk): This is a hack. We should use a better store. + # FIXME: If enable_store=True, this may cause a memory leak since we + # never remove messages from the store. + self.msg_store: dict[str, list[ChatCompletionMessageParam]] = {} + + # HACK(wuhang): This is a hack. We should use a better store. 
def _validate_generator_input(
    self, engine_prompt: EngineTokensPrompt
) -> ErrorResponse | None:
    """Check an engine prompt before it is handed to the generator.

    Returns ``None`` when the prompt is acceptable. When the prompt's
    token count reaches or exceeds ``self.max_model_len``, returns the
    ``invalid_request_error`` response (HTTP 400) built by
    ``self.create_error_response``.
    """
    limit = self.max_model_len
    prompt_len = len(engine_prompt["prompt_token_ids"])

    # A prompt that already fills the whole window leaves no room to generate.
    if prompt_len < limit:
        return None

    return self.create_error_response(
        err_type="invalid_request_error",
        message=(
            f"The engine prompt length {prompt_len} "
            f"exceeds the max_model_len {limit}. "
            "Please reduce prompt."
        ),
        status_code=HTTPStatus.BAD_REQUEST,
    )
async def create_responses(
    self,
    request: ResponsesRequest,
    raw_request: Request | None = None,
) -> (
    AsyncGenerator[StreamingResponsesResponse, None]
    | ResponsesResponse
    | ErrorResponse
):
    """Handle one Responses API request.

    Args:
        request: The parsed request. ``request.store`` is reset to
            ``False`` in place when the store is not enabled on this
            server.
        raw_request: The underlying HTTP request, if any. It is used to
            attach request metadata and to read trace headers.

    Returns:
        An ``ErrorResponse`` when the model check, request validation,
        previous-response lookup, prompt preprocessing or generator
        setup fails. Otherwise: the queued ``ResponsesResponse`` (or a
        background stream generator) in background mode, a stream
        generator when ``request.stream`` is set, or the final
        ``ResponsesResponse``.

    Raises:
        The engine client's ``dead_error`` when the engine has errored.
    """
    error_check_ret = await self._check_model(request)
    if error_check_ret is not None:
        logger.error("Error with model %s", error_check_ret)
        return error_check_ret
    maybe_validation_error = self._validate_create_responses_input(request)
    if maybe_validation_error is not None:
        return maybe_validation_error

    # If the engine is dead, raise the engine's DEAD_ERROR.
    # This is required for the streaming case, where we return a
    # success status before we actually start generating text :).
    if self.engine_client.errored:
        raise self.engine_client.dead_error

    if request.store and not self.enable_store:
        # Disable the store option.
        # NOTE(woosuk): Although returning an error is possible, we opted
        # to implicitly disable store and process the request anyway, as
        # we assume most users do not intend to actually store the response
        # (i.e., their request's `store=True` just because it's the default
        # value).
        request.store = False

    # Handle the previous response ID.
    prev_response_id = request.previous_response_id
    if prev_response_id is not None:
        async with self.response_store_lock:
            prev_response = self.response_store.get(prev_response_id)
        if prev_response is None:
            return self._make_not_found_error(prev_response_id)
    else:
        prev_response = None

    try:
        lora_request = self._maybe_get_adapters(request)
        model_name = self.models.model_name(lora_request)
        tokenizer = await self.engine_client.get_tokenizer()

        if self.use_harmony:
            messages, request_prompts, engine_prompts = (
                self._make_request_with_harmony(request, prev_response)
            )
        else:
            messages, request_prompts, engine_prompts = await self._make_request(
                request, prev_response, tokenizer
            )

    except (
        ValueError,
        TypeError,
        RuntimeError,
        jinja2.TemplateError,
        NotImplementedError,
    ) as e:
        logger.exception("Error in preprocessing prompt inputs")
        # Only mention the chained cause when there is one; formatting a
        # missing cause would append the literal text "None" to the message.
        detail = str(e) if e.__cause__ is None else f"{e} {e.__cause__}"
        return self.create_error_response(detail)

    request_metadata = RequestResponseMetadata(request_id=request.request_id)
    if raw_request:
        raw_request.state.request_metadata = request_metadata

    # Schedule the request and get the result generator.
    generators: list[AsyncGenerator[ConversationContext, None]] = []

    # Built-in tools are only advertised for harmony models with a tool
    # server attached; in every other case the list stays empty.
    available_tools: list[str] = []
    if self.use_harmony and self.tool_server is not None:
        available_tools = [
            tool_name
            for tool_name in ("browser", "python", "container")
            if self.tool_server.has_tool(tool_name)
        ]

    try:
        for i, engine_prompt in enumerate(engine_prompts):
            maybe_error = self._validate_generator_input(engine_prompt)
            if maybe_error is not None:
                return maybe_error

            # Whatever the prompt does not occupy is left for generation.
            default_max_tokens = self.max_model_len - len(
                engine_prompt["prompt_token_ids"]
            )

            sampling_params = request.to_sampling_params(
                default_max_tokens, self.default_sampling_params
            )

            trace_headers = (
                None
                if raw_request is None
                else await self._get_trace_headers(raw_request.headers)
            )

            context: ConversationContext
            if self.use_harmony:
                if request.stream:
                    context = StreamingHarmonyContext(messages, available_tools)
                else:
                    context = HarmonyContext(messages, available_tools)
            else:
                context = SimpleContext()

            if self.reasoning_parser is not None:
                reasoning_parser = self.reasoning_parser(tokenizer)
                if sampling_params.structured_outputs is None:
                    sampling_params.structured_outputs = StructuredOutputsParams()
                struct_out = sampling_params.structured_outputs
                if struct_out.all_non_structural_tag_constraints_none():
                    sampling_params.structured_outputs.structural_tag = (
                        reasoning_parser.prepare_structured_tag(
                            sampling_params.structured_outputs.structural_tag,
                            self.tool_server,
                        )
                    )
            generator = self._generate_with_builtin_tools(
                request_id=request.request_id,
                request_prompt=request_prompts[i],
                engine_prompt=engine_prompt,
                sampling_params=sampling_params,
                context=context,
                lora_request=lora_request,
                priority=request.priority,
                trace_headers=trace_headers,
            )
            generators.append(generator)
    except ValueError as e:
        # TODO: Use a vllm-specific Validation Error
        return self.create_error_response(str(e))

    # Exactly one prompt is expected per request, so `sampling_params` and
    # `context` from the loop above belong to that single generator.
    assert len(generators) == 1
    (result_generator,) = generators

    # Store the input messages.
    if request.store:
        self.msg_store[request.request_id] = messages

    if request.background:
        created_time = int(time.time())
        response = ResponsesResponse.from_request(
            request,
            sampling_params,
            model_name=model_name,
            created_time=created_time,
            output=[],
            status="queued",
            usage=None,
        )
        async with self.response_store_lock:
            self.response_store[response.id] = response

        # Run the request in the background.
        if request.stream:
            task = asyncio.create_task(
                self._run_background_request_stream(
                    request,
                    sampling_params,
                    result_generator,
                    context,
                    model_name,
                    tokenizer,
                    request_metadata,
                    created_time,
                ),
                name=f"create_{request.request_id}",
            )
        else:
            task = asyncio.create_task(
                self._run_background_request(
                    request,
                    sampling_params,
                    result_generator,
                    context,
                    model_name,
                    tokenizer,
                    request_metadata,
                    created_time,
                ),
                name=f"create_{response.id}",
            )

        # For cleanup: drop the task from the registry once it finishes.
        response_id = response.id
        self.background_tasks[response_id] = task
        task.add_done_callback(
            lambda _: self.background_tasks.pop(response_id, None)
        )

        if request.stream:
            return self.responses_background_stream_generator(request.request_id)
        return response

    if request.stream:
        return self.responses_stream_generator(
            request,
            sampling_params,
            result_generator,
            context,
            model_name,
            tokenizer,
            request_metadata,
        )

    try:
        return await self.responses_full_generator(
            request,
            sampling_params,
            result_generator,
            context,
            model_name,
            tokenizer,
            request_metadata,
        )
    except Exception as e:
        return self.create_error_response(str(e))
async def _initialize_tool_sessions(
    self,
    request: ResponsesRequest,
    context: ConversationContext,
    exit_stack: AsyncExitStack,
):
    """Open tool sessions on ``context`` when the request declares tools.

    Does nothing when ``request.tools`` is empty or ``None``. Otherwise
    the tools of type ``"mcp"`` are indexed by their ``server_label`` and
    passed to ``context.init_tool_sessions`` together with the tool
    server, the exit stack and the request id.

    Args:
        request: The request whose ``tools`` and ``request_id`` are read.
        context: The conversation context that owns the tool sessions.
        exit_stack: Passed through to ``context.init_tool_sessions``.
    """
    # We should only initialize the tool session if the request needs tools.
    # A truthiness check also covers `tools is None`, which `_make_request`
    # treats as "no tools"; `len(None)` would raise TypeError here.
    if not request.tools:
        return
    mcp_tools = {
        tool.server_label: tool for tool in request.tools if tool.type == "mcp"
    }
    await context.init_tool_sessions(
        self.tool_server, exit_stack, request.request_id, mcp_tools
    )
self._initialize_tool_sessions(request, context, exit_stack) + async for _ in result_generator: + pass + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + # NOTE: Implementation of stauts is still WIP, but for now + # we guarantee that if the status is not "completed", it is accurate. + # "completed" is implemented as the "catch-all" for now. + status: ResponseStatus = "completed" + + input_messages = None + output_messages = None + if self.use_harmony: + assert isinstance(context, HarmonyContext) + output = self._make_response_output_items_with_harmony(context) + if request.enable_response_messages: + input_messages = context.messages[: context.num_init_messages] + output_messages = context.messages[context.num_init_messages :] + num_tool_output_tokens = context.num_tool_output_tokens + if len(output) > 0: + if context.finish_reason == "length": + status = "incomplete" + elif context.finish_reason == "abort": + status = "cancelled" + else: + status = "incomplete" + else: + assert isinstance(context, SimpleContext) + final_res = context.last_output + assert final_res is not None + assert len(final_res.outputs) == 1 + final_output = final_res.outputs[0] + + output = self._make_response_output_items(request, final_output, tokenizer) + + # TODO: context for non-gptoss models doesn't use messages + # so we can't get them out yet + if request.enable_response_messages: + raise NotImplementedError( + "enable_response_messages is currently only supported for gpt-oss" + ) + # Calculate usage. 
+ assert final_res.prompt_token_ids is not None + num_tool_output_tokens = 0 + + assert isinstance(context, (SimpleContext, HarmonyContext)) + num_prompt_tokens = context.num_prompt_tokens + num_generated_tokens = context.num_output_tokens + num_cached_tokens = context.num_cached_tokens + num_reasoning_tokens = context.num_reasoning_tokens + + usage = ResponseUsage( + input_tokens=num_prompt_tokens, + output_tokens=num_generated_tokens, + total_tokens=num_prompt_tokens + num_generated_tokens, + input_tokens_details=InputTokensDetails( + cached_tokens=num_cached_tokens, + input_tokens_per_turn=[ + turn.input_tokens for turn in context.all_turn_metrics + ], + cached_tokens_per_turn=[ + turn.cached_input_tokens for turn in context.all_turn_metrics + ], + ), + output_tokens_details=OutputTokensDetails( + reasoning_tokens=num_reasoning_tokens, + tool_output_tokens=num_tool_output_tokens, + output_tokens_per_turn=[ + turn.output_tokens for turn in context.all_turn_metrics + ], + tool_output_tokens_per_turn=[ + turn.tool_output_tokens for turn in context.all_turn_metrics + ], + ), + ) + response = ResponsesResponse.from_request( + request, + sampling_params, + input_messages=input_messages, + output_messages=output_messages, + model_name=model_name, + created_time=created_time, + output=output, + status=status, + usage=usage, + ) + + if request.store: + async with self.response_store_lock: + stored_response = self.response_store.get(response.id) + # If the response is already cancelled, don't update it. 
def _topk_logprobs(
    self,
    logprobs: dict[int, SampleLogprob],
    top_logprobs: int,
    tokenizer: AnyTokenizer,
) -> list[LogprobTopLogprob]:
    """Convert the first ``top_logprobs`` entries of ``logprobs``.

    Entries are taken in the dictionary's iteration order. Each one
    becomes a ``LogprobTopLogprob`` whose token text is the entry's
    ``decoded_token`` or, when that is ``None``, the tokenizer's decoding
    of the token id. Log-probabilities are clamped below at ``-9999.0``.
    """
    # A non-positive limit selects nothing at all.
    limit = max(top_logprobs, 0)

    entries = []
    # zip() stops as soon as the range is exhausted, bounding the walk.
    for _, (token_id, candidate) in zip(range(limit), logprobs.items()):
        token_text = candidate.decoded_token
        if token_text is None:
            token_text = tokenizer.decode([token_id])
        entries.append(
            LogprobTopLogprob(
                token=token_text,
                logprob=max(candidate.logprob, -9999.0),
                bytes=list(token_text.encode("utf-8", errors="replace")),
            )
        )
    return entries
def _make_response_output_items(
    self,
    request: ResponsesRequest,
    final_output: CompletionOutput,
    tokenizer: AnyTokenizer,
) -> list[ResponseOutputItem]:
    """Turn one completion output into Responses API output items.

    The text is split into reasoning and content by the configured
    reasoning parser (if any), and tool calls are extracted from the
    content. The result lists, in order, the reasoning item, the
    assistant message and one function-call item per tool call; each
    appears only when there is something to report.
    """
    # Separate reasoning from the visible answer.
    if self.reasoning_parser:
        try:
            parser = self.reasoning_parser(tokenizer)
        except RuntimeError as e:
            logger.exception("Error in reasoning parser creation.")
            raise e
        reasoning, content = parser.extract_reasoning(
            final_output.text, request=request
        )
    else:
        reasoning, content = None, final_output.text

    # Log complete response if output logging is enabled. The content is
    # preferred; the reasoning is logged only when there is no content.
    if self.enable_log_outputs and self.request_logger:
        log_text = content or (f"[reasoning: {reasoning}]" if reasoning else "")
        if log_text:
            self.request_logger.log_outputs(
                request_id=request.request_id,
                outputs=log_text,
                output_token_ids=final_output.token_ids,
                finish_reason=final_output.finish_reason,
                is_streaming=False,
                delta=False,
            )

    reasoning_item = (
        ResponseReasoningItem(
            id=f"rs_{random_uuid()}",
            summary=[],
            type="reasoning",
            content=[
                ResponseReasoningTextContent(text=reasoning, type="reasoning_text")
            ],
            status=None,  # NOTE: Only the last output item has status.
        )
        if reasoning
        else None
    )

    # Tool-call extraction may rewrite the remaining content.
    tool_calls, content = self._parse_tool_calls_from_content(
        request=request,
        tokenizer=tokenizer,
        content=content,
        enable_auto_tools=self.enable_auto_tools,
        tool_parser_cls=self.tool_parser,
    )

    message_item = None
    if content:
        if request.is_include_output_logprobs():
            text_logprobs = self._create_response_logprobs(
                token_ids=final_output.token_ids,
                logprobs=final_output.logprobs,
                tokenizer=tokenizer,
                top_logprobs=request.top_logprobs,
            )
        else:
            text_logprobs = None
        text_part = ResponseOutputText(
            text=content,
            annotations=[],  # TODO
            type="output_text",
            logprobs=text_logprobs,
        )
        message_item = ResponseOutputMessage(
            id=f"msg_{random_uuid()}",
            content=[text_part],
            role="assistant",
            status="completed",
            type="message",
        )

    items = [item for item in (reasoning_item, message_item) if item]
    if tool_calls:
        items.extend(
            [
                ResponseFunctionToolCall(
                    id=f"fc_{random_uuid()}",
                    call_id=f"call_{random_uuid()}",
                    type="function_call",
                    status="completed",
                    name=call.name,
                    arguments=call.arguments,
                )
                for call in tool_calls
            ]
        )
    return items
+ last_items = parse_remaining_state(context.parser) + if last_items: + output_items.extend(last_items) + return output_items + + def _construct_input_messages( + self, + request: ResponsesRequest, + prev_response: ResponsesResponse | None = None, + ) -> list[ChatCompletionMessageParam]: + messages: list[ChatCompletionMessageParam] = [] + if request.instructions: + messages.append( + { + "role": "system", + "content": request.instructions, + } + ) + + # Prepend the conversation history. + if prev_response is not None: + # Add the previous messages. + prev_msg = self.msg_store[prev_response.id] + messages.extend(prev_msg) + + # Add the previous output. + for output_item in prev_response.output: + # NOTE: We skip the reasoning output. + if isinstance(output_item, ResponseOutputMessage): + for content in output_item.content: + messages.append( + { + "role": "assistant", + "content": content.text, + } + ) + + # Append the new input. + # Responses API supports simple text inputs without chat format. 
+ if isinstance(request.input, str): + messages.append({"role": "user", "content": request.input}) + else: + for item in request.input: + messages.append(construct_chat_message_with_tool_call(item)) + return messages + + def _construct_harmony_system_input_message( + self, request: ResponsesRequest, with_custom_tools: bool, tool_types: set[str] + ) -> OpenAIHarmonyMessage: + reasoning_effort = request.reasoning.effort if request.reasoning else None + enable_browser = ( + "web_search_preview" in tool_types + and self.tool_server is not None + and self.tool_server.has_tool("browser") + ) + enable_code_interpreter = ( + "code_interpreter" in tool_types + and self.tool_server is not None + and self.tool_server.has_tool("python") + ) + enable_container = ( + "container" in tool_types + and self.tool_server is not None + and self.tool_server.has_tool("container") + ) + sys_msg = get_system_message( + reasoning_effort=reasoning_effort, + browser_description=( + self.tool_server.get_tool_description("browser") + if enable_browser and self.tool_server is not None + else None + ), + python_description=( + self.tool_server.get_tool_description("python") + if enable_code_interpreter and self.tool_server is not None + else None + ), + container_description=( + self.tool_server.get_tool_description("container") + if enable_container and self.tool_server is not None + else None + ), + instructions=request.instructions, + with_custom_tools=with_custom_tools, + ) + return sys_msg + + def _construct_input_messages_with_harmony( + self, + request: ResponsesRequest, + prev_response: ResponsesResponse | None, + ) -> list[OpenAIHarmonyMessage]: + messages: list[OpenAIHarmonyMessage] = [] + if prev_response is None: + # New conversation. 
+ tool_types = extract_tool_types(request.tools) + with_custom_tools = has_custom_tools(tool_types) + + sys_msg = self._construct_harmony_system_input_message( + request, with_custom_tools, tool_types + ) + messages.append(sys_msg) + if with_custom_tools: + dev_msg = get_developer_message( + instructions=request.instructions, tools=request.tools + ) + messages.append(dev_msg) + messages += construct_harmony_previous_input_messages(request) + + else: + # Continue the previous conversation. + # FIXME(woosuk): Currently, request params like reasoning and + # instructions are ignored. + prev_msgs = self.msg_store[prev_response.id] + # Remove the previous chain-of-thoughts if there is a new "final" + # message. Note that this also removes these messages from the + # msg_store. + if len(prev_msgs) > 0: + last_msg = prev_msgs[-1] + assert isinstance(last_msg, OpenAIHarmonyMessage) + if last_msg.channel == "final": + prev_final_msg_idx = -1 + for i in range(len(prev_msgs) - 2, -1, -1): + prev_msg_i = prev_msgs[i] + assert isinstance(prev_msg_i, OpenAIHarmonyMessage) + if prev_msg_i.channel == "final": + prev_final_msg_idx = i + break + recent_turn_msgs = prev_msgs[prev_final_msg_idx + 1 :] + del prev_msgs[prev_final_msg_idx + 1 :] + for msg in recent_turn_msgs: + assert isinstance(msg, OpenAIHarmonyMessage) + if msg.channel != "analysis": + prev_msgs.append(msg) + messages.extend(prev_msgs) + # Append the new input. + # Responses API supports simple text inputs without chat format. + if isinstance(request.input, str): + messages.append(get_user_message(request.input)) + else: + if prev_response is not None: + prev_outputs = copy(prev_response.output) + else: + prev_outputs = [] + for response_msg in request.input: + messages.append(parse_response_input(response_msg, prev_outputs)) + # User passes in a tool call request and its output. 
We need + # to add the tool call request to prev_outputs so that the + # parse_response_input can find the tool call request when + # parsing the tool call output. + if isinstance(response_msg, ResponseFunctionToolCall): + prev_outputs.append(response_msg) + return messages + + async def _run_background_request_stream( + self, + request: ResponsesRequest, + *args, + **kwargs, + ): + event_deque: deque[StreamingResponsesResponse] = deque() + new_event_signal = asyncio.Event() + self.event_store[request.request_id] = (event_deque, new_event_signal) + response = None + try: + generator = self.responses_stream_generator(request, *args, **kwargs) + async for event in generator: + event_deque.append(event) + new_event_signal.set() # Signal new event available + except Exception as e: + logger.exception("Background request failed for %s", request.request_id) + response = self.create_error_response(str(e)) + finally: + new_event_signal.set() + + if response is not None and isinstance(response, ErrorResponse): + # If the request has failed, update the status to "failed". + response_id = request.request_id + async with self.response_store_lock: + stored_response = self.response_store.get(response_id) + assert stored_response is not None + if stored_response.status not in ("completed", "cancelled"): + stored_response.status = "failed" + + async def _run_background_request( + self, + request: ResponsesRequest, + *args, + **kwargs, + ): + try: + response = await self.responses_full_generator(request, *args, **kwargs) + except Exception as e: + logger.exception("Background request failed for %s", request.request_id) + response = self.create_error_response(str(e)) + + if isinstance(response, ErrorResponse): + # If the request has failed, update the status to "failed". 
+ response_id = request.request_id + async with self.response_store_lock: + stored_response = self.response_store.get(response_id) + assert stored_response is not None + if stored_response.status not in ("completed", "cancelled"): + stored_response.status = "failed" + + async def responses_background_stream_generator( + self, + response_id: str, + starting_after: int | None = None, + ) -> AsyncGenerator[StreamingResponsesResponse, None]: + if response_id not in self.event_store: + raise ValueError(f"Unknown response_id: {response_id}") + + event_deque, new_event_signal = self.event_store[response_id] + start_index = 0 if starting_after is None else starting_after + 1 + current_index = start_index + + while True: + new_event_signal.clear() + + # Yield existing events from start_index + while current_index < len(event_deque): + event = event_deque[current_index] + yield event + if getattr(event, "type", "unknown") == "response.completed": + return + current_index += 1 + + await new_event_signal.wait() + + async def retrieve_responses( + self, + response_id: str, + starting_after: int | None, + stream: bool | None, + ) -> ( + ErrorResponse + | ResponsesResponse + | AsyncGenerator[StreamingResponsesResponse, None] + ): + async with self.response_store_lock: + response = self.response_store.get(response_id) + + if response is None: + return self._make_not_found_error(response_id) + + if stream: + return self.responses_background_stream_generator( + response_id, + starting_after, + ) + return response + + async def cancel_responses( + self, + response_id: str, + ) -> ErrorResponse | ResponsesResponse: + async with self.response_store_lock: + response = self.response_store.get(response_id) + if response is None: + return self._make_not_found_error(response_id) + + prev_status = response.status + if prev_status not in ("queued", "in_progress"): + return self.create_error_response( + err_type="invalid_request_error", + message="Cannot cancel a synchronous response.", + ) + 
+ # Update the status to "cancelled". + response.status = "cancelled" + + # Abort the request. + if task := self.background_tasks.get(response_id): + task.cancel() + try: + await task + except asyncio.CancelledError: + logger.exception("Background task for %s was cancelled", response_id) + return response + + def _make_not_found_error(self, response_id: str) -> ErrorResponse: + return self.create_error_response( + err_type="invalid_request_error", + message=f"Response with id '{response_id}' not found.", + status_code=HTTPStatus.NOT_FOUND, + ) + + def _make_store_not_supported_error(self) -> ErrorResponse: + return self.create_error_response( + err_type="invalid_request_error", + message=( + "`store=True` (default) is not supported. Please set " + "`store=False` in Responses API or set " + "`VLLM_ENABLE_RESPONSES_API_STORE=1` in the env var when " + "starting the vLLM server." + ), + status_code=HTTPStatus.BAD_REQUEST, + ) + + async def _process_simple_streaming_events( + self, + request: ResponsesRequest, + sampling_params: SamplingParams, + result_generator: AsyncIterator[ConversationContext | None], + context: ConversationContext, + model_name: str, + tokenizer: AnyTokenizer, + request_metadata: RequestResponseMetadata, + created_time: int, + _increment_sequence_number_and_return: Callable[ + [StreamingResponsesResponse], StreamingResponsesResponse + ], + ) -> AsyncGenerator[StreamingResponsesResponse, None]: + current_content_index = 0 + current_output_index = 0 + current_item_id = "" + reasoning_parser = None + if self.reasoning_parser: + reasoning_parser = self.reasoning_parser(tokenizer) + previous_text = "" + previous_token_ids: list[int] = [] + first_delta_sent = False + previous_delta_messages: list[DeltaMessage] = [] + async for ctx in result_generator: + assert isinstance(ctx, SimpleContext) + if ctx.last_output is None: + continue + if ctx.last_output.outputs: + output = ctx.last_output.outputs[0] + if reasoning_parser: + delta_message = 
reasoning_parser.extract_reasoning_streaming( + previous_text=previous_text, + current_text=previous_text + output.text, + delta_text=output.text, + previous_token_ids=previous_token_ids, + current_token_ids=previous_token_ids + output.token_ids, + delta_token_ids=output.token_ids, + ) + else: + delta_message = DeltaMessage( + content=output.text, + ) + previous_text += output.text + previous_token_ids += output.token_ids + if not delta_message: + continue + if not first_delta_sent: + current_item_id = str(uuid.uuid4()) + if delta_message.reasoning: + yield _increment_sequence_number_and_return( + ResponseOutputItemAddedEvent( + type="response.output_item.added", + sequence_number=-1, + output_index=current_output_index, + item=ResponseReasoningItem( + type="reasoning", + id=current_item_id, + summary=[], + status="in_progress", + ), + ) + ) + else: + yield _increment_sequence_number_and_return( + ResponseOutputItemAddedEvent( + type="response.output_item.added", + sequence_number=-1, + output_index=current_output_index, + item=ResponseOutputMessage( + id=current_item_id, + type="message", + role="assistant", + content=[], + status="in_progress", + ), + ) + ) + yield _increment_sequence_number_and_return( + ResponseContentPartAddedEvent( + type="response.content_part.added", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + content_index=current_content_index, + part=ResponseOutputText( + type="output_text", + text="", + annotations=[], + logprobs=[], + ), + ) + ) + current_content_index += 1 + first_delta_sent = True + # todo(kebe7jun) tool call support + + # check delta message and previous delta message are + # same as content or reasoning content + if ( + previous_delta_messages + and previous_delta_messages[-1].reasoning is not None + and delta_message.content is not None + ): + # from reasoning to normal content, send done + # event for reasoning + reason_content = "".join( + pm.reasoning + for pm in 
previous_delta_messages + if pm.reasoning is not None + ) + yield _increment_sequence_number_and_return( + ResponseReasoningTextDoneEvent( + type="response.reasoning_text.done", + item_id=current_item_id, + sequence_number=-1, + output_index=current_output_index, + content_index=current_content_index, + text=reason_content, + ) + ) + current_content_index = 0 + reasoning_item = ResponseReasoningItem( + type="reasoning", + content=[ + ResponseReasoningTextContent( + text=reason_content, + type="reasoning_text", + ), + ], + status="completed", + id=current_item_id, + summary=[], + ) + yield _increment_sequence_number_and_return( + ResponseOutputItemDoneEvent( + type="response.output_item.done", + sequence_number=-1, + output_index=current_output_index, + item=reasoning_item, + ) + ) + yield _increment_sequence_number_and_return( + ResponseOutputItemAddedEvent( + type="response.output_item.added", + sequence_number=-1, + output_index=current_output_index, + item=ResponseOutputMessage( + id=current_item_id, + type="message", + role="assistant", + content=[], + status="in_progress", + ), + ) + ) + current_output_index += 1 + current_item_id = str(uuid.uuid4()) + yield _increment_sequence_number_and_return( + ResponseContentPartAddedEvent( + type="response.content_part.added", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + content_index=current_content_index, + part=ResponseOutputText( + type="output_text", + text="", + annotations=[], + logprobs=[], + ), + ) + ) + current_content_index += 1 + # reset previous delta messages + previous_delta_messages = [] + + if delta_message.reasoning is not None: + yield _increment_sequence_number_and_return( + ResponseReasoningTextDeltaEvent( + type="response.reasoning_text.delta", + sequence_number=-1, + content_index=current_content_index, + output_index=current_output_index, + item_id=current_item_id, + delta=delta_message.reasoning, + ) + ) + elif delta_message.content is not None: + yield 
_increment_sequence_number_and_return( + ResponseTextDeltaEvent( + type="response.output_text.delta", + sequence_number=-1, + content_index=current_content_index, + output_index=current_output_index, + item_id=current_item_id, + delta=delta_message.content, + logprobs=( + self._create_stream_response_logprobs( + token_ids=output.token_ids, + logprobs=output.logprobs, + tokenizer=tokenizer, + top_logprobs=request.top_logprobs, + ) + if request.is_include_output_logprobs() + else [] + ), + ) + ) + current_content_index += 1 + + previous_delta_messages.append(delta_message) + if previous_delta_messages: + if previous_delta_messages[-1].reasoning is not None: + reason_content = "".join( + pm.reasoning + for pm in previous_delta_messages + if pm.reasoning is not None + ) + yield _increment_sequence_number_and_return( + ResponseReasoningTextDoneEvent( + type="response.reasoning_text.done", + item_id=current_item_id, + sequence_number=-1, + output_index=current_output_index, + content_index=current_content_index, + text=reason_content, + ) + ) + current_content_index += 1 + reasoning_item = ResponseReasoningItem( + type="reasoning", + content=[ + ResponseReasoningTextContent( + text=reason_content, + type="reasoning_text", + ), + ], + status="completed", + id=current_item_id, + summary=[], + ) + yield _increment_sequence_number_and_return( + ResponseOutputItemDoneEvent( + type="response.output_item.done", + sequence_number=-1, + output_index=current_output_index, + item=reasoning_item, + ) + ) + elif previous_delta_messages[-1].content is not None: + final_content = "".join( + pm.content + for pm in previous_delta_messages + if pm.content is not None + ) + yield _increment_sequence_number_and_return( + ResponseTextDoneEvent( + type="response.output_text.done", + sequence_number=-1, + output_index=current_output_index, + content_index=current_content_index, + text=final_content, + logprobs=[], + item_id=current_item_id, + ) + ) + current_content_index += 1 + part = 
ResponseOutputText( + text=final_content, + type="output_text", + annotations=[], + ) + yield _increment_sequence_number_and_return( + ResponseContentPartDoneEvent( + type="response.content_part.done", + sequence_number=-1, + item_id=current_item_id, + output_index=current_output_index, + content_index=current_content_index, + part=part, + ) + ) + current_content_index += 1 + item = ResponseOutputMessage( + type="message", + role="assistant", + content=[ + part, + ], + status="completed", + id=current_item_id, + summary=[], + ) + yield _increment_sequence_number_and_return( + ResponseOutputItemDoneEvent( + type="response.output_item.done", + sequence_number=-1, + output_index=current_output_index, + item=item, + ) + ) + + async def _process_harmony_streaming_events( + self, + request: ResponsesRequest, + sampling_params: SamplingParams, + result_generator: AsyncIterator[ConversationContext | None], + context: ConversationContext, + model_name: str, + tokenizer: AnyTokenizer, + request_metadata: RequestResponseMetadata, + created_time: int, + _increment_sequence_number_and_return: Callable[ + [StreamingResponsesResponse], StreamingResponsesResponse + ], + ) -> AsyncGenerator[StreamingResponsesResponse, None]: + current_content_index = -1 + current_output_index = 0 + current_item_id: str = "" + sent_output_item_added = False + is_first_function_call_delta = False + async for ctx in result_generator: + assert isinstance(ctx, StreamingHarmonyContext) + + if ctx.is_expecting_start(): + current_output_index += 1 + sent_output_item_added = False + is_first_function_call_delta = False + if len(ctx.parser.messages) > 0: + previous_item = ctx.parser.messages[-1] + if previous_item.recipient is not None: + # Deal with tool call + if previous_item.recipient.startswith("functions."): + function_name = previous_item.recipient[len("functions.") :] + yield _increment_sequence_number_and_return( + ResponseFunctionCallArgumentsDoneEvent( + 
type="response.function_call_arguments.done", + arguments=previous_item.content[0].text, + name=function_name, + item_id=current_item_id, + output_index=current_output_index, + sequence_number=-1, + ) + ) + function_call_item = ResponseFunctionToolCall( + type="function_call", + arguments=previous_item.content[0].text, + name=function_name, + item_id=current_item_id, + output_index=current_output_index, + sequence_number=-1, + call_id=f"fc_{random_uuid()}", + status="completed", + ) + yield _increment_sequence_number_and_return( + ResponseOutputItemDoneEvent( + type="response.output_item.done", + sequence_number=-1, + output_index=current_output_index, + item=function_call_item, + ) + ) + elif previous_item.channel == "analysis": + content = ResponseReasoningTextContent( + text=previous_item.content[0].text, + type="reasoning_text", + ) + reasoning_item = ResponseReasoningItem( + type="reasoning", + content=[content], + status="completed", + id=current_item_id, + summary=[], + ) + yield _increment_sequence_number_and_return( + ResponseReasoningTextDoneEvent( + type="response.reasoning_text.done", + item_id=current_item_id, + sequence_number=-1, + output_index=current_output_index, + content_index=current_content_index, + text=previous_item.content[0].text, + ) + ) + yield _increment_sequence_number_and_return( + ResponseReasoningPartDoneEvent( + type="response.reasoning_part.done", + sequence_number=-1, + item_id=current_item_id, + output_index=current_output_index, + content_index=current_content_index, + part=content, + ) + ) + yield _increment_sequence_number_and_return( + ResponseOutputItemDoneEvent( + type="response.output_item.done", + sequence_number=-1, + output_index=current_output_index, + item=reasoning_item, + ) + ) + elif previous_item.channel == "final": + text_content = ResponseOutputText( + type="output_text", + text=previous_item.content[0].text, + annotations=[], + ) + yield _increment_sequence_number_and_return( + ResponseTextDoneEvent( + 
type="response.output_text.done", + sequence_number=-1, + output_index=current_output_index, + content_index=current_content_index, + text=previous_item.content[0].text, + logprobs=[], + item_id=current_item_id, + ) + ) + yield _increment_sequence_number_and_return( + ResponseContentPartDoneEvent( + type="response.content_part.done", + sequence_number=-1, + item_id=current_item_id, + output_index=current_output_index, + content_index=current_content_index, + part=text_content, + ) + ) + yield _increment_sequence_number_and_return( + ResponseOutputItemDoneEvent( + type="response.output_item.done", + sequence_number=-1, + output_index=current_output_index, + item=ResponseOutputMessage( + id=current_item_id, + type="message", + role="assistant", + content=[text_content], + status="completed", + ), + ) + ) + + # stream the output of a harmony message + if ctx.parser.last_content_delta: + if ( + ctx.parser.current_channel == "final" + and ctx.parser.current_recipient is None + ): + if not sent_output_item_added: + sent_output_item_added = True + current_item_id = f"msg_{random_uuid()}" + yield _increment_sequence_number_and_return( + ResponseOutputItemAddedEvent( + type="response.output_item.added", + sequence_number=-1, + output_index=current_output_index, + item=ResponseOutputMessage( + id=current_item_id, + type="message", + role="assistant", + content=[], + status="in_progress", + ), + ) + ) + current_content_index += 1 + yield _increment_sequence_number_and_return( + ResponseContentPartAddedEvent( + type="response.content_part.added", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + content_index=current_content_index, + part=ResponseOutputText( + type="output_text", + text="", + annotations=[], + logprobs=[], + ), + ) + ) + yield _increment_sequence_number_and_return( + ResponseTextDeltaEvent( + type="response.output_text.delta", + sequence_number=-1, + content_index=current_content_index, + output_index=current_output_index, 
+ item_id=current_item_id, + delta=ctx.parser.last_content_delta, + # TODO, use logprobs from ctx.last_request_output + logprobs=[], + ) + ) + elif ( + ctx.parser.current_channel == "analysis" + and ctx.parser.current_recipient is None + ): + if not sent_output_item_added: + sent_output_item_added = True + current_item_id = f"msg_{random_uuid()}" + yield _increment_sequence_number_and_return( + ResponseOutputItemAddedEvent( + type="response.output_item.added", + sequence_number=-1, + output_index=current_output_index, + item=ResponseReasoningItem( + type="reasoning", + id=current_item_id, + summary=[], + status="in_progress", + ), + ) + ) + current_content_index += 1 + yield _increment_sequence_number_and_return( + ResponseReasoningPartAddedEvent( + type="response.reasoning_part.added", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + content_index=current_content_index, + part=ResponseReasoningTextContent( + text="", + type="reasoning_text", + ), + ) + ) + yield _increment_sequence_number_and_return( + ResponseReasoningTextDeltaEvent( + type="response.reasoning_text.delta", + item_id=current_item_id, + output_index=current_output_index, + content_index=current_content_index, + delta=ctx.parser.last_content_delta, + sequence_number=-1, + ) + ) + # built-in tools will be triggered on the analysis channel + # However, occasionally built-in tools will + # still be output to commentary. 
+ elif ( + ctx.parser.current_channel == "commentary" + or ctx.parser.current_channel == "analysis" + ) and ctx.parser.current_recipient == "python": + if not sent_output_item_added: + sent_output_item_added = True + current_item_id = f"tool_{random_uuid()}" + yield _increment_sequence_number_and_return( + ResponseOutputItemAddedEvent( + type="response.output_item.added", + sequence_number=-1, + output_index=current_output_index, + item=ResponseCodeInterpreterToolCallParam( + type="code_interpreter_call", + id=current_item_id, + code=None, + container_id="auto", + outputs=None, + status="in_progress", + ), + ) + ) + yield _increment_sequence_number_and_return( + ResponseCodeInterpreterCallInProgressEvent( + type="response.code_interpreter_call.in_progress", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + ) + ) + yield _increment_sequence_number_and_return( + ResponseCodeInterpreterCallCodeDeltaEvent( + type="response.code_interpreter_call_code.delta", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + delta=ctx.parser.last_content_delta, + ) + ) + + # stream tool call outputs + if ctx.is_assistant_action_turn() and len(ctx.parser.messages) > 0: + previous_item = ctx.parser.messages[-1] + if ( + self.tool_server is not None + and self.tool_server.has_tool("browser") + and previous_item.recipient is not None + and previous_item.recipient.startswith("browser.") + ): + function_name = previous_item.recipient[len("browser.") :] + action = None + parsed_args = json.loads(previous_item.content[0].text) + if function_name == "search": + action = response_function_web_search.ActionSearch( + type="search", + query=parsed_args["query"], + ) + elif function_name == "open": + action = response_function_web_search.ActionOpenPage( + type="open_page", + # TODO: translate to url + url=f"cursor:{parsed_args.get('cursor', '')}", + ) + elif function_name == "find": + action = 
response_function_web_search.ActionFind( + type="find", + pattern=parsed_args["pattern"], + # TODO: translate to url + url=f"cursor:{parsed_args.get('cursor', '')}", + ) + else: + raise ValueError(f"Unknown function name: {function_name}") + + current_item_id = f"tool_{random_uuid()}" + yield _increment_sequence_number_and_return( + ResponseOutputItemAddedEvent( + type="response.output_item.added", + sequence_number=-1, + output_index=current_output_index, + item=response_function_web_search.ResponseFunctionWebSearch( + # TODO: generate a unique id for web search call + type="web_search_call", + id=current_item_id, + action=action, + status="in_progress", + ), + ) + ) + yield _increment_sequence_number_and_return( + ResponseWebSearchCallInProgressEvent( + type="response.web_search_call.in_progress", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + ) + ) + yield _increment_sequence_number_and_return( + ResponseWebSearchCallSearchingEvent( + type="response.web_search_call.searching", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + ) + ) + + # enqueue + yield _increment_sequence_number_and_return( + ResponseWebSearchCallCompletedEvent( + type="response.web_search_call.completed", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + ) + ) + yield _increment_sequence_number_and_return( + ResponseOutputItemDoneEvent( + type="response.output_item.done", + sequence_number=-1, + output_index=current_output_index, + item=ResponseFunctionWebSearch( + type="web_search_call", + id=current_item_id, + action=action, + status="completed", + ), + ) + ) + + if ( + self.tool_server is not None + and self.tool_server.has_tool("python") + and previous_item.recipient is not None + and previous_item.recipient.startswith("python") + ): + yield _increment_sequence_number_and_return( + ResponseCodeInterpreterCallCodeDoneEvent( + type="response.code_interpreter_call_code.done", + 
sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + code=previous_item.content[0].text, + ) + ) + yield _increment_sequence_number_and_return( + ResponseCodeInterpreterCallInterpretingEvent( + type="response.code_interpreter_call.interpreting", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + ) + ) + yield _increment_sequence_number_and_return( + ResponseCodeInterpreterCallCompletedEvent( + type="response.code_interpreter_call.completed", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + ) + ) + yield _increment_sequence_number_and_return( + ResponseOutputItemDoneEvent( + type="response.output_item.done", + sequence_number=-1, + output_index=current_output_index, + item=ResponseCodeInterpreterToolCallParam( + type="code_interpreter_call", + id=current_item_id, + code=previous_item.content[0].text, + container_id="auto", + # TODO: add outputs here + outputs=[], + status="completed", + ), + ) + ) + # developer tools will be triggered on the commentary channel + # and recipient starts with "functions.TOOL_NAME" + if ( + ctx.parser.current_channel == "commentary" + and ctx.parser.current_recipient + and ctx.parser.current_recipient.startswith("functions.") + ): + if is_first_function_call_delta is False: + is_first_function_call_delta = True + fc_name = ctx.parser.current_recipient[len("functions.") :] + tool_call_item = ResponseFunctionToolCall( + name=fc_name, + type="function_call", + id=current_item_id, + call_id=f"call_{random_uuid()}", + arguments="", + status="in_progress", + ) + current_item_id = f"fc_{random_uuid()}" + yield _increment_sequence_number_and_return( + ResponseOutputItemAddedEvent( + type="response.output_item.added", + sequence_number=-1, + output_index=current_output_index, + item=tool_call_item, + ) + ) + else: + yield _increment_sequence_number_and_return( + ResponseFunctionCallArgumentsDeltaEvent( + item_id=current_item_id, + 
delta=ctx.parser.last_content_delta, + output_index=current_output_index, + sequence_number=-1, + type="response.function_call_arguments.delta", + ) + ) + + async def responses_stream_generator( + self, + request: ResponsesRequest, + sampling_params: SamplingParams, + result_generator: AsyncIterator[ConversationContext | None], + context: ConversationContext, + model_name: str, + tokenizer: AnyTokenizer, + request_metadata: RequestResponseMetadata, + created_time: int | None = None, + ) -> AsyncGenerator[StreamingResponsesResponse, None]: + # TODO: + # 1. Handle disconnect + + created_time = created_time or int(time.time()) + + sequence_number = 0 + + def _increment_sequence_number_and_return( + event: StreamingResponsesResponse, + ) -> StreamingResponsesResponse: + nonlocal sequence_number + # Set sequence_number if the event has this attribute + if hasattr(event, "sequence_number"): + event.sequence_number = sequence_number + sequence_number += 1 + return event + + async with AsyncExitStack() as exit_stack: + processer = None + if self.use_harmony: + # TODO: in streaming, we noticed this bug: + # https://github.com/vllm-project/vllm/issues/25697 + await self._initialize_tool_sessions(request, context, exit_stack) + processer = self._process_harmony_streaming_events + else: + processer = self._process_simple_streaming_events + # TODO Hanchen make sampling params to include the structural tag + + initial_response = ResponsesResponse.from_request( + request, + sampling_params, + model_name=model_name, + created_time=created_time, + output=[], + status="in_progress", + usage=None, + ).model_dump() + yield _increment_sequence_number_and_return( + ResponseCreatedEvent( + type="response.created", + sequence_number=-1, + response=initial_response, + ) + ) + yield _increment_sequence_number_and_return( + ResponseInProgressEvent( + type="response.in_progress", + sequence_number=-1, + response=initial_response, + ) + ) + + async for event_data in processer( + request, + 
sampling_params, + result_generator, + context, + model_name, + tokenizer, + request_metadata, + created_time, + _increment_sequence_number_and_return, + ): + yield event_data + + async def empty_async_generator(): + # A hack to trick Python to think this is a generator but + # in fact it immediately returns. + if False: + yield + + final_response = await self.responses_full_generator( + request, + sampling_params, + empty_async_generator(), + context, + model_name, + tokenizer, + request_metadata, + created_time=created_time, + ) + yield _increment_sequence_number_and_return( + ResponseCompletedEvent( + type="response.completed", + sequence_number=-1, + response=final_response, + ) + ) diff --git a/entrypoints/openai/serving_score.py b/entrypoints/openai/serving_score.py new file mode 100644 index 0000000..9cbfc97 --- /dev/null +++ b/entrypoints/openai/serving_score.py @@ -0,0 +1,503 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import asyncio +import time +from collections.abc import AsyncGenerator, Mapping +from typing import Any + +from fastapi import Request + +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.protocol import ( + ErrorResponse, + RerankDocument, + RerankRequest, + RerankResponse, + RerankResult, + RerankUsage, + ScoreRequest, + ScoreResponse, + ScoreResponseData, + UsageInfo, +) +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.entrypoints.score_utils import ( + ScoreContentPartParam, + ScoreMultiModalParam, + _cosine_similarity, + _validate_score_input_lens, + compress_token_type_ids, + get_score_prompt, +) +from vllm.entrypoints.utils import _validate_truncation_size +from vllm.inputs.data import TokensPrompt +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from 
vllm.outputs import PoolingRequestOutput, ScoringRequestOutput +from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.utils.async_utils import make_async, merge_async_iterators + +logger = init_logger(__name__) + + +class ServingScores(OpenAIServing): + def __init__( + self, + engine_client: EngineClient, + models: OpenAIServingModels, + *, + request_logger: RequestLogger | None, + log_error_stack: bool = False, + ) -> None: + super().__init__( + engine_client=engine_client, + models=models, + request_logger=request_logger, + log_error_stack=log_error_stack, + ) + + async def _embedding_score( + self, + tokenizer: AnyTokenizer, + texts_1: list[str], + texts_2: list[str], + request: RerankRequest | ScoreRequest, + request_id: str, + tokenization_kwargs: dict[str, Any] | None = None, + lora_request: LoRARequest | None | None = None, + trace_headers: Mapping[str, str] | None = None, + ) -> list[PoolingRequestOutput] | ErrorResponse: + input_texts = texts_1 + texts_2 + + engine_prompts: list[TokensPrompt] = [] + tokenize_async = make_async( + tokenizer.__call__, executor=self._tokenizer_executor + ) + + tokenization_kwargs = tokenization_kwargs or {} + tokenized_prompts = await asyncio.gather( + *(tokenize_async(t, **tokenization_kwargs) for t in input_texts) + ) + + for tok_result, input_text in zip(tokenized_prompts, input_texts): + text_token_prompt = self._validate_input( + request, tok_result["input_ids"], input_text + ) + + engine_prompts.append( + TokensPrompt(prompt_token_ids=text_token_prompt["prompt_token_ids"]) + ) + + # Schedule the request and get the result generator. 
+ generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] + pooling_params = request.to_pooling_params() + + try: + pooling_params.verify("embed", self.model_config) + except ValueError as e: + return self.create_error_response(str(e)) + + for i, engine_prompt in enumerate(engine_prompts): + request_id_item = f"{request_id}-{i}" + + self._log_inputs( + request_id_item, + input_texts[i], + params=pooling_params, + lora_request=lora_request, + ) + + generators.append( + self.engine_client.encode( + engine_prompt, + pooling_params, + request_id_item, + lora_request=lora_request, + trace_headers=trace_headers, + priority=request.priority, + ) + ) + + result_generator = merge_async_iterators(*generators) + + # Non-streaming response + final_res_batch: list[PoolingRequestOutput] = [] + + embeddings: list[PoolingRequestOutput | None] = [None] * len(engine_prompts) + + async for i, res in result_generator: + embeddings[i] = res + + emb_texts_1: list[PoolingRequestOutput] = [] + emb_texts_2: list[PoolingRequestOutput] = [] + + for i in range(0, len(texts_1)): + assert (emb := embeddings[i]) is not None + emb_texts_1.append(emb) + + for i in range(len(texts_1), len(embeddings)): + assert (emb := embeddings[i]) is not None + emb_texts_2.append(emb) + + if len(emb_texts_1) == 1: + emb_texts_1 = emb_texts_1 * len(emb_texts_2) + + final_res_batch = _cosine_similarity( + tokenizer=tokenizer, embed_1=emb_texts_1, embed_2=emb_texts_2 + ) + + return final_res_batch + + def _preprocess_score( + self, + request: RerankRequest | ScoreRequest, + tokenizer: AnyTokenizer, + tokenization_kwargs: dict[str, Any], + data_1: str | ScoreContentPartParam, + data_2: str | ScoreContentPartParam, + ) -> tuple[str, TokensPrompt]: + model_config = self.model_config + + full_prompt, engine_prompt = get_score_prompt( + model_config=model_config, + data_1=data_1, + data_2=data_2, + tokenizer=tokenizer, + tokenization_kwargs=tokenization_kwargs, + ) + self._validate_input(request, 
engine_prompt["prompt_token_ids"], full_prompt) + if request.mm_processor_kwargs is not None: + engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs + + return full_prompt, engine_prompt + + async def _cross_encoding_score( + self, + tokenizer: AnyTokenizer, + data_1: list[str] | list[ScoreContentPartParam], + data_2: list[str] | list[ScoreContentPartParam], + request: RerankRequest | ScoreRequest, + request_id: str, + tokenization_kwargs: dict[str, Any] | None = None, + lora_request: LoRARequest | None | None = None, + trace_headers: Mapping[str, str] | None = None, + ) -> list[PoolingRequestOutput] | ErrorResponse: + request_prompts: list[str] = [] + engine_prompts: list[TokensPrompt] = [] + + if len(data_1) == 1: + data_1 = data_1 * len(data_2) + + if isinstance(tokenizer, MistralTokenizer): + raise ValueError("MistralTokenizer not supported for cross-encoding") + + tokenization_kwargs = tokenization_kwargs or {} + + input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)] + + preprocess_async = make_async( + self._preprocess_score, executor=self._tokenizer_executor + ) + + preprocessed_prompts = await asyncio.gather( + *( + preprocess_async( + request=request, + tokenizer=tokenizer, + tokenization_kwargs=tokenization_kwargs, + data_1=t1, + data_2=t2, + ) + for t1, t2 in input_pairs + ) + ) + + for full_prompt, engine_prompt in preprocessed_prompts: + request_prompts.append(full_prompt) + engine_prompts.append(engine_prompt) + + # Schedule the request and get the result generator. 
+ generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] + + default_pooling_params = request.to_pooling_params() + + try: + default_pooling_params.verify("score", self.model_config) + except ValueError as e: + return self.create_error_response(str(e)) + + for i, engine_prompt in enumerate(engine_prompts): + request_id_item = f"{request_id}-{i}" + + self._log_inputs( + request_id_item, + request_prompts[i], + params=default_pooling_params, + lora_request=lora_request, + ) + + if token_type_ids := engine_prompt.pop("token_type_ids", None): + pooling_params = default_pooling_params.clone() + compressed = compress_token_type_ids(token_type_ids) + pooling_params.extra_kwargs = {"compressed_token_type_ids": compressed} + else: + pooling_params = default_pooling_params + + generator = self.engine_client.encode( + engine_prompt, + pooling_params, + request_id_item, + lora_request=lora_request, + trace_headers=trace_headers, + priority=request.priority, + ) + + generators.append(generator) + + result_generator = merge_async_iterators(*generators) + + # Non-streaming response + final_res_batch: list[PoolingRequestOutput | None] = [None] * len( + engine_prompts + ) + + async for i, res in result_generator: + final_res_batch[i] = res + + return [out for out in final_res_batch if out is not None] + + async def _run_scoring( + self, + data_1: list[str] | str | ScoreMultiModalParam, + data_2: list[str] | str | ScoreMultiModalParam, + request: ScoreRequest | RerankRequest, + request_id: str, + raw_request: Request | None = None, + ) -> list[PoolingRequestOutput] | ErrorResponse: + lora_request = self._maybe_get_adapters(request) + + tokenizer = await self.engine_client.get_tokenizer() + + truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens", None) + + tokenization_kwargs: dict[str, Any] = {} + _validate_truncation_size( + self.max_model_len, truncate_prompt_tokens, tokenization_kwargs + ) + + trace_headers = ( + None + if raw_request is None + else await 
self._get_trace_headers(raw_request.headers) + ) + + if not self.model_config.is_multimodal_model and ( + isinstance(data_1, dict) or isinstance(data_2, dict) + ): + raise ValueError( + f"MultiModalParam is not supported for {self.model_config.architecture}" # noqa: E501 + ) + + if isinstance(data_1, str): + data_1 = [data_1] + elif isinstance(data_1, dict): + data_1 = data_1.get("content") # type: ignore[assignment] + + if isinstance(data_2, str): + data_2 = [data_2] + elif isinstance(data_2, dict): + data_2 = data_2.get("content") # type: ignore[assignment] + + _validate_score_input_lens(data_1, data_2) # type: ignore[arg-type] + + if self.model_config.is_cross_encoder: + return await self._cross_encoding_score( + tokenizer=tokenizer, + data_1=data_1, # type: ignore[arg-type] + data_2=data_2, # type: ignore[arg-type] + request=request, + request_id=request_id, + tokenization_kwargs=tokenization_kwargs, + lora_request=lora_request, + trace_headers=trace_headers, + ) + + else: + return await self._embedding_score( + tokenizer=tokenizer, + texts_1=data_1, # type: ignore[arg-type] + texts_2=data_2, # type: ignore[arg-type] + request=request, + request_id=request_id, + tokenization_kwargs=tokenization_kwargs, + lora_request=lora_request, + trace_headers=trace_headers, + ) + + async def create_score( + self, + request: ScoreRequest, + raw_request: Request | None = None, + ) -> ScoreResponse | ErrorResponse: + """ + Score API similar to Sentence Transformers cross encoder + + See https://sbert.net/docs/package_reference/cross_encoder + """ + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + return error_check_ret + + request_id = f"score-{self._base_request_id(raw_request)}" + created_time = int(time.time()) + + try: + final_res_batch = await self._run_scoring( + request.text_1, + request.text_2, + request, + request_id, + raw_request, + ) + if isinstance(final_res_batch, ErrorResponse): + return final_res_batch + + return 
self.request_output_to_score_response( + final_res_batch, + request_id, + created_time, + self.models.model_name(), + ) + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + async def do_rerank( + self, request: RerankRequest, raw_request: Request | None = None + ) -> RerankResponse | ErrorResponse: + """ + Rerank API based on JinaAI's rerank API; implements the same + API interface. Designed for compatibility with off-the-shelf + tooling, since this is a common standard for reranking APIs + + See example client implementations at + https://github.com/infiniflow/ragflow/blob/main/rag/llm/rerank_model.py + numerous clients use this standard. + """ + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + return error_check_ret + + request_id = f"rerank-{self._base_request_id(raw_request)}" + documents = request.documents + top_n = ( + request.top_n + if request.top_n > 0 + else ( + len(documents) + if isinstance(documents, list) + else len(documents["content"]) + ) + ) + + try: + final_res_batch = await self._run_scoring( + request.query, + documents, + request, + request_id, + raw_request, + ) + if isinstance(final_res_batch, ErrorResponse): + return final_res_batch + + return self.request_output_to_rerank_response( + final_res_batch, + request_id, + self.models.model_name(), + documents, + top_n, + ) + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + def request_output_to_score_response( + self, + final_res_batch: list[PoolingRequestOutput], + request_id: str, + created_time: int, + model_name: str, + ) -> ScoreResponse: + items: list[ScoreResponseData] = [] + num_prompt_tokens = 0 + + for idx, final_res in 
def request_output_to_rerank_response(
    self,
    final_res_batch: list[PoolingRequestOutput],
    request_id: str,
    model_name: str,
    documents: list[str] | ScoreMultiModalParam,
    top_n: int,
) -> RerankResponse:
    """
    Convert the output of do_rerank to a RerankResponse.

    Args:
        final_res_batch: One pooling output per scored document, in the
            same order as the documents.
        request_id: Identifier echoed back as the response id.
        model_name: Model name echoed back in the response.
        documents: Either a plain list of document strings, or a
            multimodal param whose ``"content"`` entry holds the documents.
        top_n: Maximum number of results to return.

    Returns:
        A RerankResponse whose results are sorted by descending relevance
        score and truncated to ``top_n``; ``usage.total_tokens`` is the sum
        of the prompt token counts of all scored outputs.
    """
    results: list[RerankResult] = []
    num_prompt_tokens = 0
    for idx, final_res in enumerate(final_res_batch):
        classify_res = ScoringRequestOutput.from_base(final_res)

        if isinstance(documents, list):
            document = RerankDocument(text=documents[idx])
        else:
            document = RerankDocument(multi_modal=documents["content"][idx])

        results.append(
            RerankResult(
                index=idx,
                document=document,
                relevance_score=classify_res.outputs.score,
            )
        )
        num_prompt_tokens += len(final_res.prompt_token_ids)

    # sort by relevance, then return the top n if set
    results.sort(key=lambda x: x.relevance_score, reverse=True)
    # Compare against the number of results, not len(documents): for a
    # multimodal param `documents` is a dict, so its len() is the number of
    # keys rather than the number of documents and truncation was skipped.
    if top_n < len(results):
        results = results[:top_n]

    return RerankResponse(
        id=request_id,
        model=model_name,
        results=results,
        usage=RerankUsage(total_tokens=num_prompt_tokens),
    )
async def create_tokenize(
    self,
    request: TokenizeRequest,
    raw_request: Request,
) -> TokenizeResponse | ErrorResponse:
    """Tokenize a completion-style or chat-style request.

    Chat requests are rendered through the chat template (after the
    request-supplied template has been validated); other requests are
    rendered with the prompt renderer. The token ids of all resulting
    engine prompts are concatenated into a single response.

    Returns:
        A TokenizeResponse with the token ids, their count, the model's
        maximum length and, if ``request.return_token_strs`` is set, the
        token strings. An ErrorResponse is returned when the model check,
        chat-template validation or prompt preprocessing fails.
    """
    error_check_ret = await self._check_model(request)
    if error_check_ret is not None:
        return error_check_ret

    request_id = f"tokn-{self._base_request_id(raw_request)}"

    try:
        lora_request = self._maybe_get_adapters(request)

        tokenizer = await self.engine_client.get_tokenizer()
        renderer = self._get_renderer(tokenizer)

        if isinstance(request, TokenizeChatRequest):
            tool_dicts = (
                None
                if request.tools is None
                else [tool.model_dump() for tool in request.tools]
            )
            error_check_ret = self._validate_chat_template(
                request_chat_template=request.chat_template,
                chat_template_kwargs=request.chat_template_kwargs,
                trust_request_chat_template=self.trust_request_chat_template,
            )
            if error_check_ret is not None:
                return error_check_ret
            (
                _,
                _,
                engine_prompts,
            ) = await self._preprocess_chat(
                request,
                tokenizer,
                request.messages,
                tool_dicts=tool_dicts,
                chat_template=request.chat_template or self.chat_template,
                chat_template_content_format=self.chat_template_content_format,
                add_generation_prompt=request.add_generation_prompt,
                continue_final_message=request.continue_final_message,
                chat_template_kwargs=request.chat_template_kwargs,
                add_special_tokens=request.add_special_tokens,
            )
        else:
            engine_prompts = await renderer.render_prompt(
                prompt_or_prompts=request.prompt,
                config=self._build_render_config(request),
            )
    except (ValueError, TypeError, jinja2.TemplateError) as e:
        logger.exception("Error in preprocessing prompt inputs")
        # Only append the chained cause when there is one; otherwise the
        # client-facing message would end with a literal " None".
        message = str(e) if e.__cause__ is None else f"{e} {e.__cause__}"
        return self.create_error_response(message)

    input_ids: list[int] = []
    for engine_prompt in engine_prompts:
        self._log_inputs(
            request_id, engine_prompt, params=None, lora_request=lora_request
        )

        # Prompts without token ids contribute nothing to the output.
        if isinstance(engine_prompt, dict) and "prompt_token_ids" in engine_prompt:
            input_ids.extend(engine_prompt["prompt_token_ids"])

    token_strs = None
    if request.return_token_strs:
        token_strs = tokenizer.convert_ids_to_tokens(input_ids)

    return TokenizeResponse(
        tokens=input_ids,
        token_strs=token_strs,
        count=len(input_ids),
        max_model_len=self.max_model_len,
    )
@dataclass
class TokenizerInfo:
    """Builds a JSON-friendly description of a tokenizer's configuration."""

    # Tokenizer whose ``init_kwargs`` (if any) are exported.
    tokenizer: AnyTokenizer
    # Chat template to report alongside the tokenizer settings, if set.
    chat_template: str | None

    def to_dict(self) -> dict[str, Any]:
        """Return the tokenizer configuration."""
        return self._get_tokenizer_config()

    def _get_tokenizer_config(self) -> dict[str, Any]:
        """Get tokenizer configuration directly from the tokenizer object."""
        init_kwargs = getattr(self.tokenizer, "init_kwargs", None)
        # Work on a shallow copy so the tokenizer's own kwargs stay intact.
        raw_config = dict(init_kwargs or {})

        # Remove file path fields
        for path_field in ("vocab_file", "merges_file"):
            raw_config.pop(path_field, None)

        exported = self._make_json_serializable(raw_config)
        exported["tokenizer_class"] = type(self.tokenizer).__name__
        if self.chat_template:
            exported["chat_template"] = self.chat_template
        return exported

    def _make_json_serializable(self, obj):
        """Convert any non-JSON-serializable objects to serializable format."""
        # Objects exposing ``content`` are replaced by that attribute.
        if hasattr(obj, "content"):
            return obj.content
        if isinstance(obj, dict):
            converted = {}
            for key, value in obj.items():
                converted[key] = self._make_json_serializable(value)
            return converted
        if isinstance(obj, list):
            return list(map(self._make_json_serializable, obj))
        return obj
return_tokens_as_token_ids=return_tokens_as_token_ids, + log_error_stack=log_error_stack) + self.enable_prompt_tokens_details = enable_prompt_tokens_details + self.enable_log_outputs = enable_log_outputs + self.force_no_detokenize = force_no_detokenize + if force_no_detokenize: + logger.info("Tokens-only mode is enabled, skipping detokenization " + "step for incoming requests.") + + async def serve_tokens( + self, + request: GenerateRequest, + raw_request: Request | None = None + ) -> GenerateResponse | ErrorResponse: + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + logger.error("Error with model %s", error_check_ret) + return error_check_ret + + # If the engine is dead, raise the engine's DEAD_ERROR. + # This is required for the streaming case, where we return a + # success status before we actually start generating text :). + if self.engine_client.errored: + raise self.engine_client.dead_error + + lora_request = None + lora_request = self._maybe_get_adapters(request, + supports_default_mm_loras=True) + + model_name = self.models.model_name(lora_request) + + request_id = "generate-tokens-" \ + f"{self._base_request_id(raw_request, request.request_id)}" + + request_metadata = RequestResponseMetadata(request_id=request_id) + if raw_request: + raw_request.state.request_metadata = request_metadata + + # TODO(NickLucche): Change to EngineCoreRequest once Renderer work is + # completed + engine_prompt = EngineTokensPrompt(prompt_token_ids=request.token_ids) + if request.features is not None: + engine_prompt["multi_modal_data"] = None + + if hasattr(request, "cache_salt") and request.cache_salt is not None: + engine_prompt["cache_salt"] = request.cache_salt + + # Schedule the request and get the result generator. 
+ result_generator: AsyncGenerator[RequestOutput, None] | None = None + try: + sampling_params = request.sampling_params + if self.force_no_detokenize: + sampling_params.detokenize = False + + self._log_inputs(request_id, + request.token_ids, + params=sampling_params, + lora_request=lora_request) + + trace_headers = (None if raw_request is None else await + self._get_trace_headers(raw_request.headers)) + + result_generator = self.engine_client.generate( + engine_prompt, + sampling_params, + request_id, + lora_request=lora_request, + trace_headers=trace_headers, + priority=request.priority, + ) + + except ValueError as e: + return self.create_error_response(str(e)) + + # TODO(NickLucche): Implement streaming response + + try: + assert result_generator is not None + return await self.serve_tokens_full_generator( + request, result_generator, request_id, model_name, + request_metadata) + except ValueError as e: + return self.create_error_response(str(e)) + + async def serve_tokens_full_generator( + self, + request: GenerateRequest, + result_generator: AsyncGenerator[RequestOutput, None], + request_id: str, + model_name: str, + request_metadata: RequestResponseMetadata, + ) -> ErrorResponse | GenerateResponse: + + created_time = int(time.time()) + final_res: RequestOutput | None = None + sampling_params: SamplingParams = request.sampling_params + + try: + async for res in result_generator: + final_res = res + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + except ValueError as e: + return self.create_error_response(str(e)) + + assert final_res is not None + + choices: list[GenerateResponseChoice] = [] + num_generated_tokens = 0 + for output in final_res.outputs: + token_ids = output.token_ids + out_logprobs = output.logprobs + + # This is top_logprobs in completions API + if sampling_params.logprobs: + assert out_logprobs is not None, "Did not output logprobs" + logprobs = self._create_tokens_logprobs( + token_ids=token_ids, + 
top_logprobs=out_logprobs, + num_output_top_logprobs=sampling_params.logprobs, + ) + else: + logprobs = None + + choice_data = GenerateResponseChoice( + index=output.index, + logprobs=logprobs, + finish_reason=output.finish_reason + if output.finish_reason else "stop", + token_ids=as_list(output.token_ids)) + + choices.append(choice_data) + num_generated_tokens += len(output.token_ids) + + assert final_res.prompt_token_ids is not None + num_prompt_tokens = len(final_res.prompt_token_ids) + if final_res.encoder_prompt_token_ids is not None: + num_prompt_tokens += len(final_res.encoder_prompt_token_ids) + + usage = UsageInfo(prompt_tokens=num_prompt_tokens, + completion_tokens=num_generated_tokens, + total_tokens=num_prompt_tokens + + num_generated_tokens) + if self.enable_prompt_tokens_details and final_res.num_cached_tokens: + # This info is not available at the /coordinator level + usage.prompt_tokens_details = PromptTokenUsageInfo( + cached_tokens=final_res.num_cached_tokens) + + request_metadata.final_usage_info = usage + + response = GenerateResponse( + id=request_id, + created=created_time, + model=model_name, + choices=choices, + usage=usage, + prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs), + kv_transfer_params=final_res.kv_transfer_params, + ) + + # Log complete response if output logging is enabled + if self.enable_log_outputs and self.request_logger: + for choice in choices: + # Get the corresponding output token IDs + output_token_ids = None + if choice.index < len(final_res.outputs): + output_token_ids = final_res.outputs[ + choice.index].token_ids + + if output_token_ids: + # Log token_ids only. 
def _create_tokens_logprobs(
    self,
    token_ids: GenericSequence[int],
    top_logprobs: GenericSequence[dict[int, Logprob] | None],
    num_output_top_logprobs: int | None = None,
) -> ChatCompletionLogProbs:
    """Create OpenAI-style logprobs for a tokens-only response.

    Tokens are rendered as ``"token_id:<id>"`` strings. Log-probabilities
    are clamped from below at -9999.0.

    Args:
        token_ids: Sampled token ids, one per generation step.
        top_logprobs: Per-step mapping from token id to its Logprob, or
            None for a step without logprobs; indexed in step order.
        num_output_top_logprobs: Maximum number of alternatives to report
            per step; None or 0 reports none.

    Returns:
        A ChatCompletionLogProbs with one content entry per sampled token.
        Steps whose mapping is None or lacks the sampled token get an entry
        carrying only the token string.
    """
    logprobs_content: list[ChatCompletionLogProbsContent] = []
    # None, 0 and negative limits all mean "no alternatives".
    top_limit = max(num_output_top_logprobs or 0, 0)

    for position, token_id in enumerate(token_ids):
        token = f"token_id:{token_id}"
        step_top_logprobs = top_logprobs[position]
        if step_top_logprobs is None or step_top_logprobs.get(token_id) is None:
            logprobs_content.append(ChatCompletionLogProbsContent(token=token))
            continue

        step_token = step_top_logprobs[token_id]
        # Label each alternative with its own token id; previously every
        # entry reused the sampled token's label.
        alternatives = [
            ChatCompletionLogProb(
                token=f"token_id:{candidate_id}",
                logprob=max(candidate.logprob, -9999.0),
            )
            for candidate_id, candidate in list(step_top_logprobs.items())[
                :top_limit
            ]
        ]

        logprobs_content.append(
            ChatCompletionLogProbsContent(
                token=token,
                logprob=max(step_token.logprob, -9999.0),
                top_logprobs=alternatives,
            )
        )

    return ChatCompletionLogProbs(content=logprobs_content)
TranslationResponse, + TranslationResponseStreamChoice, + TranslationStreamResponse, +) +from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.entrypoints.openai.speech_to_text import OpenAISpeechToText +from vllm.logger import init_logger +from vllm.outputs import RequestOutput + +logger = init_logger(__name__) + + +class OpenAIServingTranscription(OpenAISpeechToText): + """Handles transcription requests.""" + + def __init__( + self, + engine_client: EngineClient, + models: OpenAIServingModels, + *, + request_logger: RequestLogger | None, + return_tokens_as_token_ids: bool = False, + log_error_stack: bool = False, + enable_force_include_usage: bool = False, + ): + super().__init__( + engine_client=engine_client, + models=models, + request_logger=request_logger, + return_tokens_as_token_ids=return_tokens_as_token_ids, + task_type="transcribe", + log_error_stack=log_error_stack, + enable_force_include_usage=enable_force_include_usage, + ) + + async def create_transcription( + self, audio_data: bytes, request: TranscriptionRequest, raw_request: Request + ) -> TranscriptionResponse | AsyncGenerator[str, None] | ErrorResponse: + """Transcription API similar to OpenAI's API. + + See https://platform.openai.com/docs/api-reference/audio/createTranscription + for the API specification. This API mimics the OpenAI transcription API. 
+ """ + return await self._create_speech_to_text( + audio_data=audio_data, + request=request, + raw_request=raw_request, + response_class=TranscriptionResponse, + stream_generator_method=self.transcription_stream_generator, + ) + + async def transcription_stream_generator( + self, + request: TranscriptionRequest, + result_generator: list[AsyncGenerator[RequestOutput, None]], + request_id: str, + request_metadata: RequestResponseMetadata, + audio_duration_s: float, + ) -> AsyncGenerator[str, None]: + generator = self._speech_to_text_stream_generator( + request=request, + list_result_generator=result_generator, + request_id=request_id, + request_metadata=request_metadata, + audio_duration_s=audio_duration_s, + chunk_object_type="transcription.chunk", + response_stream_choice_class=TranscriptionResponseStreamChoice, + stream_response_class=TranscriptionStreamResponse, + ) + async for chunk in generator: + yield chunk + + +class OpenAIServingTranslation(OpenAISpeechToText): + """Handles translation requests.""" + + def __init__( + self, + engine_client: EngineClient, + models: OpenAIServingModels, + *, + request_logger: RequestLogger | None, + return_tokens_as_token_ids: bool = False, + log_error_stack: bool = False, + enable_force_include_usage: bool = False, + ): + super().__init__( + engine_client=engine_client, + models=models, + request_logger=request_logger, + return_tokens_as_token_ids=return_tokens_as_token_ids, + task_type="translate", + log_error_stack=log_error_stack, + enable_force_include_usage=enable_force_include_usage, + ) + + async def create_translation( + self, audio_data: bytes, request: TranslationRequest, raw_request: Request + ) -> TranslationResponse | AsyncGenerator[str, None] | ErrorResponse: + """Translation API similar to OpenAI's API. + + See https://platform.openai.com/docs/api-reference/audio/createTranslation + for the API specification. This API mimics the OpenAI translation API. 
+ """ + return await self._create_speech_to_text( + audio_data=audio_data, + request=request, + raw_request=raw_request, + response_class=TranslationResponse, + stream_generator_method=self.translation_stream_generator, + ) + + async def translation_stream_generator( + self, + request: TranslationRequest, + result_generator: list[AsyncGenerator[RequestOutput, None]], + request_id: str, + request_metadata: RequestResponseMetadata, + audio_duration_s: float, + ) -> AsyncGenerator[str, None]: + generator = self._speech_to_text_stream_generator( + request=request, + list_result_generator=result_generator, + request_id=request_id, + request_metadata=request_metadata, + audio_duration_s=audio_duration_s, + chunk_object_type="translation.chunk", + response_stream_choice_class=TranslationResponseStreamChoice, + stream_response_class=TranslationStreamResponse, + ) + async for chunk in generator: + yield chunk diff --git a/entrypoints/openai/speech_to_text.py b/entrypoints/openai/speech_to_text.py new file mode 100644 index 0000000..b9b9b1a --- /dev/null +++ b/entrypoints/openai/speech_to_text.py @@ -0,0 +1,405 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import asyncio +import io +import math +import time +from collections.abc import AsyncGenerator, Callable +from functools import cached_property +from typing import Literal, TypeAlias, TypeVar, cast + +import numpy as np +from fastapi import Request + +import vllm.envs as envs +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.protocol import ( + DeltaMessage, + ErrorResponse, + RequestResponseMetadata, + TranscriptionResponse, + TranscriptionResponseStreamChoice, + TranscriptionStreamResponse, + TranslationResponse, + TranslationResponseStreamChoice, + TranslationStreamResponse, + UsageInfo, +) +from vllm.entrypoints.openai.serving_engine import OpenAIServing, SpeechToTextRequest 
class OpenAISpeechToText(OpenAIServing):
    """Base class for speech-to-text operations like transcription and
    translation.

    The class turns raw audio bytes into one or more model prompts
    (splitting long audio into chunks when the model's speech-to-text
    config allows it), submits them to the engine and assembles either a
    single response object or a stream of ``data: ...`` event strings.
    """

    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        return_tokens_as_token_ids: bool = False,
        task_type: Literal["transcribe", "translate"] = "transcribe",
        log_error_stack: bool = False,
        enable_force_include_usage: bool = False,
    ):
        """Initialise the serving layer and cache per-model ASR settings.

        Args:
            engine_client: Engine used to run generation.
            models: Served-model registry forwarded to the base class.
            request_logger: Optional request logger forwarded to the base
                class.
            return_tokens_as_token_ids: Forwarded to the base class.
            task_type: Either ``"transcribe"`` or ``"translate"``; also used
                as the request-id prefix and in log messages.
            log_error_stack: Forwarded to the base class.
            enable_force_include_usage: When true, streamed responses always
                end with a usage chunk regardless of the request options.
        """
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
            log_error_stack=log_error_stack,
        )

        self.default_sampling_params = self.model_config.get_diff_sampling_param()
        self.task_type = task_type

        self.asr_config = self.model_cls.get_speech_to_text_config(
            self.model_config, task_type
        )

        self.enable_force_include_usage = enable_force_include_usage

        self.max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB

        if self.default_sampling_params:
            logger.info(
                "Overwriting default completion sampling param with: %s",
                self.default_sampling_params,
            )

    @cached_property
    def model_cls(self) -> type[SupportsTranscription]:
        """Return the model class for ``self.model_config`` (cached)."""
        # Imported lazily, as in the original code, rather than at module
        # import time.
        from vllm.model_executor.model_loader import get_model_cls

        model_cls = get_model_cls(self.model_config)
        return cast(type[SupportsTranscription], model_cls)

    async def _preprocess_speech_to_text(
        self,
        request: SpeechToTextRequest,
        audio_data: bytes,
    ) -> tuple[list[PromptType], float]:
        """Decode the audio and build one generation prompt per chunk.

        Args:
            request: The incoming transcription/translation request.
            audio_data: Raw bytes of the uploaded audio file.

        Returns:
            A ``(prompts, duration)`` tuple where ``duration`` is the length
            of the decoded audio in seconds as reported by librosa.

        Raises:
            ValueError: If the audio exceeds ``self.max_audio_filesize_mb``.
        """
        # Validate request
        language = self.model_cls.validate_language(request.language)
        # Skip to_language validation to avoid extra logging for Whisper.
        to_language = (
            self.model_cls.validate_language(request.to_language)
            if request.to_language
            else None
        )

        if len(audio_data) / 1024**2 > self.max_audio_filesize_mb:
            raise ValueError("Maximum file size exceeded.")

        with io.BytesIO(audio_data) as bytes_:
            # NOTE resample to model SR here for efficiency. This is also a
            # pre-requisite for chunking, as it assumes Whisper SR.
            y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate)

        duration = librosa.get_duration(y=y, sr=sr)
        do_split_audio = (
            self.asr_config.allow_audio_chunking
            and duration > self.asr_config.max_audio_clip_s
        )
        chunks = [y] if not do_split_audio else self._split_audio(y, int(sr))
        prompts = []
        for chunk in chunks:
            # The model has control over the construction, as long as it
            # returns a valid PromptType.
            prompt = self.model_cls.get_generation_prompt(
                audio=chunk,
                stt_config=self.asr_config,
                model_config=self.model_config,
                language=language,
                task_type=self.task_type,
                request_prompt=request.prompt,
                to_language=to_language,
            )
            prompts.append(prompt)
        return prompts, duration

    async def _create_speech_to_text(
        self,
        audio_data: bytes,
        request: SpeechToTextRequest,
        raw_request: Request,
        response_class: type[T],
        stream_generator_method: Callable[..., AsyncGenerator[str, None]],
    ) -> T | AsyncGenerator[str, None] | ErrorResponse:
        """Base method for speech-to-text operations like transcription and
        translation.

        Args:
            audio_data: Raw bytes of the uploaded audio file.
            request: The transcription/translation request.
            raw_request: The underlying FastAPI request; when truthy the
                request metadata is attached to ``raw_request.state``.
            response_class: Class used to build the non-streaming response.
            stream_generator_method: Callable invoked for streaming requests
                with ``(request, generators, request_id, metadata,
                duration_s)``.

        Returns:
            An ``ErrorResponse`` on validation failure, the stream generator
            for streaming requests, or a ``response_class`` instance.

        Raises:
            Exception: ``self.engine_client.dead_error`` if the engine has
                errored.
        """
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        # If the engine is dead, raise the engine's DEAD_ERROR.
        # This is required for the streaming case, where we return a
        # success status before we actually start generating text :).
        if self.engine_client.errored:
            raise self.engine_client.dead_error

        if request.response_format not in ["text", "json"]:
            return self.create_error_response(
                "Currently only support response_format `text` or `json`"
            )

        request_id = f"{self.task_type}-{self._base_request_id(raw_request)}"

        request_metadata = RequestResponseMetadata(request_id=request_id)
        if raw_request:
            raw_request.state.request_metadata = request_metadata

        try:
            lora_request = self._maybe_get_adapters(request)

            prompts, duration_s = await self._preprocess_speech_to_text(
                request=request,
                audio_data=audio_data,
            )

        except ValueError as e:
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(str(e))

        list_result_generator: list[AsyncGenerator[RequestOutput, None]] | None = None
        try:
            # Unlike most decoder-only models, whisper generation length is not
            # constrained by the size of the input audio, which is mapped to a
            # fixed-size log-mel-spectrogram.
            default_max_tokens = self.model_config.max_model_len
            sampling_params = request.to_sampling_params(
                default_max_tokens, self.default_sampling_params
            )

            self._log_inputs(
                request_id,
                # It will not display special tokens like <|startoftranscript|>
                request.prompt,
                params=sampling_params,
                lora_request=lora_request,
            )

            list_result_generator = [
                self.engine_client.generate(
                    prompt,
                    sampling_params,
                    request_id,
                    lora_request=lora_request,
                )
                for prompt in prompts
            ]
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))

        if request.stream:
            return stream_generator_method(
                request, list_result_generator, request_id, request_metadata, duration_s
            )
        # Non-streaming response.
        try:
            assert list_result_generator is not None
            # Collect the pieces and join once instead of repeated `+=`.
            text_parts: list[str] = []
            for result_generator in list_result_generator:
                async for op in result_generator:
                    text_parts.append(op.outputs[0].text)
            text = "".join(text_parts)

            if self.task_type == "transcribe":
                # add usage in TranscriptionResponse.
                usage = {
                    "type": "duration",
                    # rounded up as per openAI specs
                    "seconds": int(math.ceil(duration_s)),
                }
                final_response = cast(T, response_class(text=text, usage=usage))
            else:
                # no usage in response for translation task
                final_response = cast(T, response_class(text=text))  # type: ignore[call-arg]

            return final_response
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))

    async def _speech_to_text_stream_generator(
        self,
        request: SpeechToTextRequest,
        list_result_generator: list[AsyncGenerator[RequestOutput, None]],
        request_id: str,
        request_metadata: RequestResponseMetadata,
        audio_duration_s: float,
        chunk_object_type: Literal["translation.chunk", "transcription.chunk"],
        response_stream_choice_class: type[TranscriptionResponseStreamChoice]
        | type[TranslationResponseStreamChoice],
        stream_response_class: type[TranscriptionStreamResponse]
        | type[TranslationStreamResponse],
    ) -> AsyncGenerator[str, None]:
        """Yield ``data: ...`` event strings for a streaming request.

        The generators in ``list_result_generator`` are consumed one after
        another. Every engine output becomes one JSON chunk; an optional
        usage chunk follows, and ``data: [DONE]`` is always emitted last.
        Any exception raised while streaming is logged and reported to the
        client as an error event instead of propagating.

        Args:
            request: The request being served.
            list_result_generator: One engine output generator per audio
                chunk.
            request_id: Value used as the ``id`` of every emitted chunk.
            request_metadata: Receives the final usage information.
            audio_duration_s: Audio duration in seconds, used to count audio
                tokens.
            chunk_object_type: Value of the ``object`` field of each chunk.
            response_stream_choice_class: Class used for per-chunk choices.
            stream_response_class: Class used for each emitted chunk.
        """
        created_time = int(time.time())
        model_name = request.model

        completion_tokens = 0
        num_prompt_tokens = 0

        include_usage = self.enable_force_include_usage or request.stream_include_usage
        include_continuous_usage = (
            request.stream_continuous_usage_stats
            if include_usage and request.stream_continuous_usage_stats
            else False
        )

        try:
            for result_generator in list_result_generator:
                async for res in result_generator:
                    # On first result.
                    if res.prompt_token_ids is not None:
                        num_prompt_tokens = len(res.prompt_token_ids)
                        if audio_tokens := self.model_cls.get_num_audio_tokens(
                            audio_duration_s, self.asr_config, self.model_config
                        ):
                            num_prompt_tokens += audio_tokens

                    # We need to do it here, because if there are exceptions in
                    # the result_generator, it needs to be sent as the FIRST
                    # response (by the try...catch).

                    # Just one output (n=1) supported.
                    assert len(res.outputs) == 1
                    output = res.outputs[0]

                    delta_message = DeltaMessage(content=output.text)
                    completion_tokens += len(output.token_ids)

                    if output.finish_reason is None:
                        # Still generating, send delta update.
                        choice_data = response_stream_choice_class(delta=delta_message)
                    else:
                        # Model is finished generating.
                        choice_data = response_stream_choice_class(
                            delta=delta_message,
                            finish_reason=output.finish_reason,
                            stop_reason=output.stop_reason,
                        )

                    chunk = stream_response_class(
                        id=request_id,
                        object=chunk_object_type,
                        created=created_time,
                        choices=[choice_data],
                        model=model_name,
                    )

                    # handle usage stats if requested & if continuous
                    if include_continuous_usage:
                        chunk.usage = UsageInfo(
                            prompt_tokens=num_prompt_tokens,
                            completion_tokens=completion_tokens,
                            total_tokens=num_prompt_tokens + completion_tokens,
                        )

                    data = chunk.model_dump_json(exclude_unset=True)
                    yield f"data: {data}\n\n"

            # Once the final token is handled, if stream_options.include_usage
            # is sent, send the usage.
            if include_usage:
                final_usage = UsageInfo(
                    prompt_tokens=num_prompt_tokens,
                    completion_tokens=completion_tokens,
                    total_tokens=num_prompt_tokens + completion_tokens,
                )

                final_usage_chunk = stream_response_class(
                    id=request_id,
                    object=chunk_object_type,
                    created=created_time,
                    choices=[],
                    model=model_name,
                    usage=final_usage,
                )
                final_usage_data = final_usage_chunk.model_dump_json(
                    exclude_unset=True, exclude_none=True
                )
                yield f"data: {final_usage_data}\n\n"

            # report to FastAPI middleware aggregate usage across all choices
            request_metadata.final_usage_info = UsageInfo(
                prompt_tokens=num_prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=num_prompt_tokens + completion_tokens,
            )

        except Exception as e:
            # The HTTP status has already been sent when streaming, so the
            # failure is reported in-band rather than re-raised.
            # TODO: Use a vllm-specific Validation Error
            logger.exception("Error in %s stream generator.", self.task_type)
            data = self.create_streaming_error_response(str(e))
            yield f"data: {data}\n\n"
        # Send the final done message after all response.n are finished
        yield "data: [DONE]\n\n"

    def _split_audio(
        self, audio_data: np.ndarray, sample_rate: int
    ) -> list[np.ndarray]:
        """Split audio along its last axis into chunks of bounded length.

        Each chunk is at most ``max_audio_clip_s`` seconds long. Cuts are
        placed at the quietest window inside the trailing
        ``overlap_chunk_second`` seconds of every chunk, as chosen by
        ``_find_split_point``.

        Args:
            audio_data: Samples; the last axis is the time axis.
            sample_rate: Samples per second of ``audio_data``.

        Returns:
            The chunks in order; concatenated along the last axis they
            reproduce ``audio_data``.

        Raises:
            ValueError: If the configured chunk length is not positive.
        """
        # Slice bounds must be integers even if the config holds floats.
        chunk_size = int(sample_rate * self.asr_config.max_audio_clip_s)
        overlap_size = int(sample_rate * self.asr_config.overlap_chunk_second)
        if chunk_size <= 0:
            # A non-positive chunk size could never advance the cursor.
            raise ValueError("Audio chunk size must be positive.")

        total_samples = audio_data.shape[-1]
        chunks = []
        i = 0
        while i < total_samples:
            if i + chunk_size >= total_samples:
                # handle last chunk
                chunks.append(audio_data[..., i:])
                break

            # Find the best split point in the overlap region. The start is
            # clamped so an overlap larger than the chunk cannot reach back
            # before the current cursor (or wrap to a negative index).
            search_start = max(i, i + chunk_size - overlap_size)
            search_end = min(i + chunk_size, total_samples)
            split_point = self._find_split_point(audio_data, search_start, search_end)
            if split_point <= i:
                # Guarantee forward progress: fall back to a hard cut at the
                # chunk boundary instead of emitting an empty chunk forever.
                split_point = search_end

            # Extract chunk up to the split point
            chunks.append(audio_data[..., i:split_point])
            i = split_point
        return chunks

    def _find_split_point(self, wav: np.ndarray, start_idx: int, end_idx: int) -> int:
        """Find the best point to split audio by
        looking for silence or low amplitude.

        The search region is scanned in consecutive, non-overlapping windows
        of ``min_energy_split_window_size`` samples and the start of the
        window with the lowest RMS energy is returned.

        Args:
            wav: Audio samples; the last axis is the time axis.
            start_idx: Start index of search region
            end_idx: End index of search region
        Returns:
            Index of best splitting point. When the region is too short to
            hold a single window, ``end_idx`` is returned (a hard cut at the
            end of the region).
        """
        # Slice the time (last) axis so 1-D and [1, T] inputs behave alike,
        # consistent with `_split_audio`.
        segment = wav[..., start_idx:end_idx]
        segment_len = segment.shape[-1]

        # Calculate RMS energy in small windows
        min_energy = math.inf
        # Default to the end of the region, never 0: returning 0 would make
        # the caller restart from the beginning of the audio.
        quietest_idx = end_idx
        min_energy_window = self.asr_config.min_energy_split_window_size
        assert min_energy_window is not None
        # `+ 1` so the last full window of the region is evaluated as well.
        for i in range(0, segment_len - min_energy_window + 1, min_energy_window):
            window = segment[..., i : i + min_energy_window]
            energy = (window**2).mean() ** 0.5
            if energy < min_energy:
                quietest_idx = i + start_idx
                min_energy = energy
        return quietest_idx
# Maps each public tool-parser name to a ``(module file name, class name)``
# pair. The module file name is resolved relative to the
# ``vllm.entrypoints.openai.tool_parsers`` package when registering.
_TOOL_PARSERS_TO_REGISTER = {
    "deepseek_v3": (  # name
        "deepseekv3_tool_parser",  # filename
        "DeepSeekV3ToolParser",  # class_name
    ),
    "deepseek_v31": (
        "deepseekv31_tool_parser",
        "DeepSeekV31ToolParser",
    ),
    "ernie45": (
        "ernie45_tool_parser",
        "Ernie45ToolParser",
    ),
    "glm45": (
        "glm4_moe_tool_parser",
        "Glm4MoeModelToolParser",
    ),
    "granite-20b-fc": (
        "granite_20b_fc_tool_parser",
        "Granite20bFCToolParser",
    ),
    "granite": (
        "granite_tool_parser",
        "GraniteToolParser",
    ),
    "hermes": (
        "hermes_tool_parser",
        "Hermes2ProToolParser",
    ),
    "hunyuan_a13b": (
        "hunyuan_a13b_tool_parser",
        "HunyuanA13BToolParser",
    ),
    "internlm": (
        "internlm2_tool_parser",
        "Internlm2ToolParser",
    ),
    "jamba": (
        "jamba_tool_parser",
        "JambaToolParser",
    ),
    "kimi_k2": (
        "kimi_k2_tool_parser",
        "KimiK2ToolParser",
    ),
    "llama3_json": (
        "llama_tool_parser",
        "Llama3JsonToolParser",
    ),
    "llama4_json": (
        "llama_tool_parser",
        "Llama3JsonToolParser",
    ),
    "llama4_pythonic": (
        "llama4_pythonic_tool_parser",
        "Llama4PythonicToolParser",
    ),
    "longcat": (
        "longcat_tool_parser",
        "LongcatFlashToolParser",
    ),
    "minimax_m2": (
        "minimax_m2_tool_parser",
        "MinimaxM2ToolParser",
    ),
    "minimax": (
        "minimax_tool_parser",
        "MinimaxToolParser",
    ),
    "mistral": (
        "mistral_tool_parser",
        "MistralToolParser",
    ),
    "olmo3": (
        "olmo3_tool_parser",
        "Olmo3PythonicToolParser",
    ),
    "openai": (
        "openai_tool_parser",
        "OpenAIToolParser",
    ),
    "phi4_mini_json": (
        "phi4mini_tool_parser",
        "Phi4MiniJsonToolParser",
    ),
    "pythonic": (
        "pythonic_tool_parser",
        "PythonicToolParser",
    ),
    "qwen3_coder": (
        "qwen3coder_tool_parser",
        "Qwen3CoderToolParser",
    ),
    "qwen3_xml": (
        "qwen3xml_tool_parser",
        "Qwen3XMLToolParser",
    ),
    "seed_oss": (
        "seed_oss_tool_parser",
        "SeedOssToolParser",
    ),
    "step3": (
        "step3_tool_parser",
        "Step3ToolParser",
    ),
    "xlam": (
        "xlam_tool_parser",
        "xLAMToolParser",
    ),
}


def register_lazy_tool_parsers():
    """Register every entry of ``_TOOL_PARSERS_TO_REGISTER`` lazily.

    For each parser name the dotted module path is built from the stored
    file name and handed, together with the class name, to
    ``ToolParserManager.register_lazy_module``.
    """
    for parser_name, parser_spec in _TOOL_PARSERS_TO_REGISTER.items():
        module_file, parser_class = parser_spec
        ToolParserManager.register_lazy_module(
            parser_name,
            f"vllm.entrypoints.openai.tool_parsers.{module_file}",
            parser_class,
        )


register_lazy_tool_parsers()
a/entrypoints/openai/tool_parsers/__pycache__/abstract_tool_parser.cpython-312.pyc b/entrypoints/openai/tool_parsers/__pycache__/abstract_tool_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2c5639b88c195b355d192f7f38c5379a919b21cf GIT binary patch literal 11487 zcmb7KTW}lKdEUj1xRC(CJ4H$>QX&P31a-4z$&npdl%tD9+aet|a8h`QT~eTMVRo02 zh(L+raVJ#UX{l*aQIlqZbTm_W zjXNLb@$5)DGS0X&aKJ{#vAvtx;yR5_~ZUeARfpB6cV)Wc-8|>u zvE8O`$iO%FN~z%|JdY zYUh*2(i&y6`LI?*m6iOWU%+*2Qe}G~)Ov(uT2xg*!%hn7`P^Ju60$i>n3RP% zRh9%PrN~JwJs%TZQ*swmk}NSqCaa`mRS>h1kdd|XIZ3q;Sc6kp1}w=+>LPHlnPfG_ z8g-v$Z9~#zV75Y3cjCWx8pRE6nk#S#)%6O76Y+6l# zli3hGl*vhQ8g+}e9J))D(^E<-=uwGELHPaKBU)y5WHvQBtYNgQ4PTx+bojXohvboT zYI0yTXzrvzuhANto!7mI1e1kC zzNNOuF{^QqXxWY8Dp!f^U2v^{i3K=+=?&_25NSN0uX_lMhdEx3N}?O6jfr+X9` z;*?F=YmY{Ir=G>b4eZklfo`~F>>a6fSfVh_y~jfC^SGlT}3%9O@`pcpI)zEM`G`t)dDS1bfZ8W|?F;vq!PJ_4@ zoHmM?KtI13{i>tDk8_&+7J#z33ggXd^Re&qmKwg|C^){I)o4a`6?n~S)h!rB&;oXw zSt;-dkJVNHB{MnAT#qnHAvoTWwm0RfAkXl}Rqi{l4dp+i1ny z#K;@2giQ~vQnVp|hRHUmnsH6kQb|)Y80v*76fGx+(#)Kyp-mRdeG$OGkSR@8Ozl7z zV6Os<=ac7isidsC=|R=oj5Y{vDm$G}Ogp6q>Wo8PuxyB6SQ@$sG)^Fa$G8co0T&6i z?lJ6%?$YLGWj!FtYEntfk~u?{rlgWuv{BiG_g3g#6_OwwMkAF~H4)}hq19KKs5Js` z)9NI0hS||wM3q8otkC9S78o^!_Eh&6q(z;K+ZDJboZeRV5_J}ixDN|r$U>8`e0Xza zZS!3dww8~r?;^QWzlXw*j*dUS^2b*yE!_)a4}>F?&TXrmd&-@A?jHKPqknt!Q>SCQ zzvS(O6a}}J0z(f%T}y*2{m-q04nG(kT^@dUv0){=yXxT%oa7f9R>Ql?;oU3Y5fkjd zO6cH&ZTptD9awDmZt%gt$x7GW%9g#Au7Qg1Qf14wPXnI8$YR5d;Pv22xW5|a!W|1u zhGYf}wP~zZ(iSriO?CROb)vYzX}0Q-_&a9(9X{$1hw$(;t}~4qY!ZxnTFyYT!WI!j z&{n`P0Xx)qLlwxeu?>d0CN#OR{iYyeKp72PrVG24)+uHups_#hf)5W8jqq*dNItmW@uPQ$SE7z?8U2 zIVnQYgp@W2gA6H@)9}P5 ziglXs|H+d!OAv3u;Sy3=n=LK=dYX0xwrwL+O%`t{e)%<=zoP@}O zRCt|WG#it(pXqtfgV$h=jl_IRlYby&yHpwTAv$6D7My1f&!75Syq&P${>faf9LA1E zy@qoB0_w_9Dqf)CMJm2X#W5;Kb*scEqz33R6m`$z9ABF_KloJMuK~VI$W1BeV^Z+u z1}T8253i4Ts0rPlNI+4|rR79I4<-^{pA*w&PeURh<&uen5&}kLD-~qhb(aJir#rDE zx)biEvV(e%svCFMC)66K=H=Qtt3Imvx!#?{@tbF>0eT8@ 
zJv)lWZyv2S(o+-H-B;|t*;@_KQ<#ekR-367;r#7OuClM^(-u4$FG}y2#NWQi=+m|K zmyq{bIQRyll(;j+#a%K&K?($oAdpJgCpn0h(CcTt0YG@HFwul?5`-K?i0F~r_IQ`% zsS6?-H|a+(X?fQ1COPQie4NxO`78_wDLn`pJv0l2ijtR=0XZzWCGQkM&3E1Q_+}G- zBWP$MIv7N#Zia|41LkNWN}qtaHoGyK{XTMwMb|WM)r@suVXRjQ*HP07YTVWg2&WO^ z+LX4TFJe;Yl3MC|+ULE|cD=bKXJ^GM5)w-OADO;5P9c_ zMluMfDaB#r$b?*0m`W*#Uc_W_om+KIOsO!9XXJV24GF0!<~>d2=CV?q^VLXX8fHxq z2J^weI;f~i8N(7NucFYKk%FQ4n3n7WwV9RWZ3g zwgKcFOs0Mr1%iYK7Yr{RSZ(Pq;}__!w6q)b;%K=gS_(!V20|OVca>Xq-ELiOiIsw} zkDJ<;dW%|d>h>$8j>F4MM@rr!On0thLfU-BgndkHK8?}73KqNKxZ?EH1>l>8-wum? zuDEKL9f-XHfxP`zY{FU7aWW+T^8nd_<|d>T1wvn#=XFk z!Y;VSxqeQw!|olhu~$4=!~&dYvud@wW6^-YnLc9#GFEz{Q8o3F-T>YbH8>Iokgd6xTcG*!8&=zin16Oaa- zmsR2|5_nvOWAZv0^8Jqs!DGVp7XVb>Ljf`HaQy?Tef!IO`|lns_dQqkZd(W}#;cC7 zFY+L=Wi=8lN20g)EJvPQIR2nZD7F-5mb>;XoPN;KyEOCu?v<8j9<*<(bZlQt7k3oD zQOuMMyk2TO2dAwgg3zTo0{jDe7mlw6d&vb z9<791E8#7baPy}=cViRWm&V9{S3>~$y-IQMcaKGm4RRk2`d(^sez>>mC7<&jd~Vdo zbdRxa_WH48hZ}KZQcjxehVdLnxd0&oky}}Qd|YcG=IgF-Kvql%C1e}oCOD<0(ErN*^+{5 zl>6Fg%;qjQ)k92xZ2`5gavZEy6r#OJPr*(04q{BsAG-=}s0CNSvspF_?_NJQS=UW* zlDhcQS6tk@1H!%Q9P>ZmIII&@#^{hu)0IF1K0jjYsF0!L8O)OtQ^aClg<#+$ZkvP3@bu3i2heVwBhH+sWwc- zC>49DI7J1$tYN_@bXQKGutYgY4H6YH3Z1e(6r&JR8fHvykS~KKS$a%{^)OOhxc17&ku9a3 z)60=FrQn%gwF(QPzi8=PnkbI1?tG?+5KI0|%#gAjAu*{S0_rvEJt6K7{xZp#Kdaap8vkLBx~_Ai{m6oI)!SM2b}qI4%)8a_qi9O&vK99~qvbo?cli1o zDOBa>SKhY5-k`jl-)_Ig&DegEH8Ff0=QXJI_Z@eL{c!PU8OPbUMl`Xj=3$Wnm0VBi znQ|>~3p_g^4PJ2-TsTDVh|x`1)a1=Nl~@-C>_QGp6eKyBLvj`wp*rXP4LXHh&$wTF z%C4A9YqMvK9cZr&wY-socGd7j*Uv;_VzD}EURBlu)J8tWA0{Qn?dW(eBiDdyROPob zn{KOjQJ92e$YQo;2#}dgVi)T4uQN9}qX=kQcc4eyMTTzeK8o)-o^+&lg$$d`T>v#N zm~`7YxaO(tc+I3=^T4D#vz^yG-)^b-!EPU%BW)WfXMriO}1c7u`uSWT#PH#12(pRI{}%_3dBYRI}jVRKsk zncAj2VFzPhNaI*Dt-cVe4f{{bs!pT$9Zq7qdw&?a8Cy70>FW7m&&{5N(;wsXWydY& zN@z!=t9QwB-Cb#JD>ZMwHFLN119fHK`ATQcb@#)r{-xle`&Vt7jcMEGvg^*0^0bfyY|-1P#b*nb#$q; zwQs)-b~YnTTiqG_xml#S0c3^H&~pD5Z1N7W$vbbgu7n0)lS^A){E6%S8$S-*yHe^o zyBrxS1;^kf2P36GZzUL74GQI;aNBcl^r!ApP*@3$Ra&|h&Q?7kU&mjK-s&#*3{^SQ 
zKWP16ro8XiZ>V1F!we;Fm!UYJQS8eOK>r#|^-8nul$YBq%$%W-8!jskWrd0BaBSlD zejXf@0kiNyA@cbbkp{u(J2RXrxqCRE5oF3XAq7Io2{RP57Mulkb|k09aMeQYf1)g3uicd;_p8wDdN-- z{+dD`KPhif@l`6mh5~kFoa*ybNGLX(_?gK8t>&racRa2wABSonP9k>~oaE&?x^7Hg zpI&VrD7O#XQvT-Bk1nka9x4wWx))d;99?cdabGOApIiu39d2LK0|eA>-QM%_;Il?B z?W86;tkwhs|Bk22~Qo51Hn6Jya1!*Ea_ zqr6PTm#APlx|38pMFoq5NPFs`L_*9W>0%^|aneF22A?g=!E@FF%ylq6lu{;WAmt`> z7fF$#&@)rR7pO*Tpr}+_rs7*voS|YH74$`na@Hy?qN0|uVt<4a7Y|Nc!rZU2EQ%sQ(AtF^cz#qlbK+wFc#^UdwbW?!J_l#;p>T_R&Lur^xn%fwm#e?PWV3me2Visn~zOJQ{Md|jD^1!pDormtV-kX6AEA2d0 z?m1odoqiMms5vrQq=zV72n9IoOEPvbik;Ykq~z5AjUW>BwJ@37LwvKzINIDq@i^xL5pr+>pHW(x`jCU?|lWls^LqIU6d)iB%`h z^J`v?Z~iyV_e(DJ5x4U*mxFiw*6ZONpK(+)ar`;{Bd-552iiaLaQyQhar-~w4*t8Z ib@30DeOs@3e(M?MdH*LImA_3p-g59wk2oqBfBY{Z(||hw literal 0 HcmV?d00001 diff --git a/entrypoints/openai/tool_parsers/__pycache__/deepseekv31_tool_parser.cpython-312.pyc b/entrypoints/openai/tool_parsers/__pycache__/deepseekv31_tool_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95af2039e946145ff42227737e791d502e267483 GIT binary patch literal 12457 zcmc&aTWlNIb;GClk~k76iIPZ)lBfqIin1P-ZP}J3OSacuTYgBkcZDrjnlq9q^Oc$5 z+Ga&_aCaOEg1ab`tQcxn82`~qk{Z1lrqnM0Gaz3 zff1TGme9;=G$^l)Yv*;4>f-u@VcsCt>Ep&k)qGXLG;d0n=gkSryd`0swlwPL<1UYBspJ2aRUBWp?LM;dAN`FbsO3L{MKVuYErf21qb zHr&<*EWbpBfJQLQlHbUX$tXDkh1x{q1{r2jsdzXViO0h+B2XpNPhF2NQ>jEcPBO7n z@)GJ#GeYe&8D}CVGs!5bor3OyO{_Xc(sX2z6spD291JElx=-9tyegyX5jMX_j`Ox~VL-5`^(EJ;PB^-15_QsMb6 z$owI&fW+oC1U9cFG^B1pdly^K5ZZTb^Lj!@8VEgUBn)KLf{`%3tDiR!RZwSwe>41B zNHb|^f>8*olCvo}3DQbbXR9!50Viqz-gXzerJ1ju#R7JrewrlHvm|-r%J87N62Z(U zSZhm|q6D!B{XgG<+CS0U#~4&A{@&BF*dKvdu^O4Zz&Mh^_IW;A!WXu4@CfDx)V=9BG{)#Z> zGME|}hQ*e=a$ZeoE7K;I2#XB&ZJ^syyIilPOPy+7&OuLQEpi=($s@r^9dhm08dJer zhJIm8r(FBB#>8bPH7&za;oa97vrZjL&XtX+NCAx5CAX{TJ2t5AQOk1fj+GgZ^Mq9% z@7tQ&Sim-O@|XV$zl<9?9Qp4b{}}_@|3Clq9i)$FNUxELu_RQFXPPT_Ad`gH-12hs z3*cx+?;0Q4W$2p~Tb0Rsj$JsiP*Q#2WF$e3hkB0%%7&xKG{WMAei3RhlOU4}y}L77 
zTiM)`F!u|9r3t>y_<>CKS05#NdG0#tOQZ-g?o;%VFOi`c-!;;gj?gqoK!cBgUaAiB zMN>(dp)ye>Mfr#f6-zFvRiM*D*{ibd>%TG_8WcfcG^EL)V02m?RLi=J#ab^Xxwt!18(3vzz38rvZJP^V{Lo`ECKwRncLKF+rP!#IY z6!~`9krtsS!Tv(u7$G#kI!FK^!oVXs49AiPAv7SY#G{Z7i)cYL95#ZptR4_8)z&Mu zFLZXO)oNcYMG93>I8Dt? z6f9RF@eFx_qEeJ#cpJ!lP0%fpjL@V^`@-tr)E64dniQl_gG*^BUn6Q!X%&6}!geW> zWMTXzX8x(7MU>4lT+dJ0n9h-ITZLOwu z__vE`BFMVBkez^3((uc_pl52DP{&U>^ zbA_%M)YO4iU(4HiIa_buHdM4bczYjb@5|eVpVfI0X8X%9Ghq10bD%FK*d$p^Wten^ z5vr(?ZE2)PCPO8oiZKIyk*zJ7Mt=oNxeun!5*lIGkJY@bp=uZfj?jSNqj{{AZQohU zj35=WNCjU~3?Q{mHMN#?i`cTB)lFl+7s9Y*18aB-%&ukQauuszQp_V2lCjEpLdPg( z4Qphp9_v3U8ElUYa!x(~HQ4v|=>cB?{+I0X2x?k+%`$W|wrna}xslbgCaQ`xm9L(_ z%1a#>qnHeSjOx;0jB?yBDdvJYziQw?&sWkos(lli#cpD^bZ=reH38GiN!ax^v^Qx{ zm6C*DQM3q+v|y4!n%X2Er~$2@NBT@KD<>wZVs3-RKLFUwWrLzJLk9yXT`JJogT|Cj zkOeB0@KGe4h69QW1*{Y@6)40e$_F2zY9W>+(1pbqNrUDfD8wxa37DWqO@jHvttgom zFD6Fmt|6F7xhi0$f~fyK^ch4zP8oCQ*2HAsszg)!QIi4gH__`twbDaem|8^V4K)fq zX@s6n{!t)hm{3<*oq7(?*`CU3>*bzj(V+hWpF7yL4YT3AwTrWMt;RX)*rsEk=xJZ2 zo*dcHX`P@hw4g35-s-w-3)U0lyAN~ShxzU)u6yd~*+TclJnrA}wB{`T>KV+PC=MUw zocnU;R*n{(Uf$WsIXgF9zG7S7X6InBbLdF|hV1h07z|C#pX;jX8$cs^hqfEAmhOD_ z)m1ki7~=wCPe%B|SGdDh3W2Nn<~MRxn~lT8w$5T};M4BVgHtHukN*1t(n26LOnh{b*WC6f|M+>z| zZ>7p!0+B0bqRLGQ&`?y^I&ALpyWNtqnV?tc2X{N2tRP^!$<k!JKGyyMBbC;$8t$;bx`6EbBm7>tCS=6~(1a zDdV_odL@*i-YP(uS;KEJm*tgEUpAMOz?NkYUeQ6sFmlp6qAD)&DXPgeI8 z_-B-9@Fo0v)#w=YH+x8ja=+JLtd`Yf0lDQGwq~hO#Zu0}8qmhu^^>t9Oz8IY3b+bu zSK!Pd9J<==`h8R?FWW#qE93LKjjQ{u#+BCT*@Y|=6?`QLm<9>B2eM$gfpXttT4e~> z*QK3DSj4tgR{M(e?Md%tTT)+=JG&@*;H6sf$up@bVabBY64i#4Jw{l;&mnu%kOAda zd&{{cbm83fgS}kBeoqaCJO#d8;80YxT27OEU)(K-Lfs;*(7tK^UFdG?=6jr5{a!U}A6oNV1Z($@^U`pTML)G9v)oc|V9``65Ad1-ai57xDdDaJp57siY%V)7QG5G$K!AeN$dtHh83?ORA?l0;iTBYJ}bEIV+D zOp+8t$B}Cb+z}|Yg8XD{KKe#1omM5KT1+cP+y}_(oj@g-oQZA;T>6 z9~5db$ygM^Nnw(XM$%+f+rFHgI0=y&`nrO51OhzbnteoUVF4mM5yp2tBE^KRkz^9$ zKvGP`cbjAo*?=@?jyZS*U;}4dJOkq=Q%Mqhg9dQgk+($WAoM~eUO+1*A~C6h2sL~L zf=y&Hm07$FSc4Bo?v7dmdg>x1)EImOZHy4xvcWm(c4_@$@W5Ac_^7M)-Hb8UeW#}p 
ztDDd|L|Q^*OYvNa40lErBQXe;i4+EuX|c9-R$(OKZzt{RM1|T&hg_W`<4eSdeiT3psRAOA5aAR_Lg)lCp{#Wg=pd3DaPE^FsSrsi zt@Apt1?Erz6*&zBdMIQP3I>9_mRY3Gb&oiw?s828Y zpery0?b7iWL%C7jNWmPjv|x-&sV)v2L1r<9V#*Xck14dZDRh~qP;6PyK~KR5Z&g6D z1S3I8Da0KS?@g#1deR_Rq?iJcNu&im5sNZZ8)`$wreIv8QkgVOA?uDp#*ttk#kiOy z2Ac>%eDDxPu!x~P8nKodM;#~N6L3pj-;<~U9iY@}=!1?K>J0jzV?lBU*Q+5%DQGAX zLvX0oC_pAvIzgl)_ilODI;B?aS*vdEQZNvl>L52^xny3*z>^)(j|nUS&m~&@!h%qR z#(*d*c+inov&?H*ifsx_sv}c=`6IfmNgBOVfhx%xFJ_U$TJXLMAO;{yP}CIALZ}hf zt4Om@jTWUAORiF-f<|H?`@butl263 z9c;&mneE@Vy=TkUcdleNOhcQd8mZX3dVa%nWXo#jtpUy&$PXUptj9NPb>Bbx-r0Q9 z{tescrlWP$vf&uc+lGtvt$ckqSKqy==X<8Oo~fso3iYq?^%uDM3q^3m+d4U0=L!XJ zyavl%%eL_kbY>TLx;qc>{v(|KND*Rsct3~tLx;|qysZt|I`;GJV_duHIB&n2x3v~K zM)-~~u4Akm#19}o0BIlQ+YWMV2cJ0kLv!4rxkB6Jyv=Ht@-!y9X-W`*Y?kr|16pd*@fCdH*=)A1^qMJ=Jo~sh`z1-@E)^ zgl`$I+PT(qeCr(7I#+1DlCw!m>^`)P zA1XR~@_iHQ&WUG^hTOLbj_%dAbw^*ZXN2z=<9bBGUgi#8F7#X}x|%=JX#))q*#P)J zf8Nf@W4-Te=#(i({JE?A`|*JypP9+IDBYrXagTr zXmjw1pC6my#%BHnKfepB;N-e<67heK_m6V^Q6=i?zm%W7%=xe6@wQ@X#}7)Y=@}~^ zhIQq8j~1Lqw;YYUqnC5^<{d*nilRv_a$^?@qnGkmuC9+>-E=nJKYQ zY;55hLtJBMmEi}bIQVOv<{Mw*8ejXQrO&mHu zE6BTsIoEK0WO~DOV$0J8oT_gEPPyDGV_aiDAXo&Zc?UQzFr?t!pF34->0G(Kax>pD zu;%AlMsw38WFQ*cv~SDhhPV!dhb%A*?;PNqA|I!@ebWW!iBFuI^Bkm`E#1%D0SG(u zU1MC=*k)kt>D)%(3?I161uho{_XCE#S2WKtZErh7oIKubU6sFiM`w07Y-45^ycu(~ z^Nj(nF|g|Ad#AbH>8IBUji+;Zpu>Ik9*cw%ZMi2@`mIoEjp*oA`YBCMo-7)El97X8P0 z|7p&Dy5N8Pfw?$xH17(mc5$x3;-SMR+qdo-edhAMZ+mFVcOKYq9R$*YM8U>&gw`FQ zqQjqe1QDx)NBQ6+7o2?BRS2HWRe$R66^DQe8_t1UWG!|MY<7$kI!1xM#lR>;(0bc{ z(H{Ju_lbX{x6poUr@9J}){IDNMWnTgq;>m%d3ami-h;%zpKl!4fkm{fHg7mbU^%sj zw=NHC83WerMH^eL*s);M4x~AY`1n&jKQYHm%)zA9H4xsz9k~%npbPdsSxDFIQ{v|8 z=J0MF4{>;CmD<3EVAbAs-ZR2^M$mJF(;WOE6*0qkW{P+V@W)$ocLoUV3Gwbh&ONx+ zS8$KtJ@-u3+aqhq4ciGtjSsAOHf+ZbJKG+4m7P2O)UtuUUUa#CVED|0HFSLN%~BJ{ zasAO9oem<6Cg7>qT-OTCF8?!EXR*DDZy)8_M>ji1i@gWLsOT@eJr9@GhzCmr@8KOY zL`XqieGMS5O?5fLwi7#{xunVIc^oN`)oDIB!38JQ@kzi4#vOm=^ghTw9tQpuU7f4m zb=QGSSIdKQ1y}Fm*)?yWe`4J=0WG|1ALrV)=G|~jY;_I)XzEE#VdPYPig~ME$CAIzNDpl`V(s 
z?%6M%*666;4J@%<9JWlfc^cEQ+7!H(Kgux z$v<@+w?gvILz6~G@`sK)At}_VySLI1H9%CQgRKf-M~WMkW%MS}63kQ+#AZ&7&N8b}PL;K#n~M-TVa| zyjv2jXX+*@R+nBvMM*H>!f=P_hguq4(k4W+heDwk@q^53`hEC(@h-Nl4H;@yf}dlM zZ2!Kd*HE|B(6nuU6okI6d%FtdOqiwXxf$i&(9Gx!uC2!AZ3D_cAIMgTSrb<8+BS<> z3+CxunSFR}+bR}qnA^WH_3-F+wOFjdn%h@89(uOzVi8Bv)QZ_U%+l~c&zar7cAzqa z!V4nCfkt?6mx{+p@u4^!id>6IG%$$o)hP68S7eP z(yOmfno2P#m_VX4x&laq6>kLSRfYJnLww^Q9D3CtqHSAxItCbmMXA`BQZ0o-)q+)o zm!5=B=v|M{Ce}drB#mCPlN2o>9a5evO5%XbO!59C(&P~)(MNpdVLBqo(Kz)T=!BR~ zPs3+hr_pFWH({F2pJR3Zg&q2})}+z?8iUVg=<|7<7CuHy_5!T+#3b6_l|ML Rc*CG^Jvj3@hEj2s{{sWkN$CIp literal 0 HcmV?d00001 diff --git a/entrypoints/openai/tool_parsers/__pycache__/deepseekv3_tool_parser.cpython-312.pyc b/entrypoints/openai/tool_parsers/__pycache__/deepseekv3_tool_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a2ac9593974e539c40ac3ba4b5322613154a28bb GIT binary patch literal 12551 zcmc&aTWlNIb;GClk~k76X(W*pB~cGb6lq(sY|FM*vSfSh*z!ZNcB5>oiSlfR?Su=Ewh%Sb=I1+&DxUoS$onk>qu75 zR!e&LIf*zt5Slc3mm zI&~TKXIQazf=aNlQ`yuUsvU>!qC=`WPcckvffB2w(liVvHF{3oVCmQ#OOZ;|nbdrm zPR0LlZ3ky=wF_yYHoxV<`;yH?*lh-GIYfFWv zHz4!Jzyb=J)sfh&p43r>dHro{UPtQRcFY<{17#wOl$kV9Rr6-j{I+q{LRLYY1^%t@ zZ=;Jl8*qAL6X2Bi{@0a@rDH7d|j znC0914Dg`iRm(s4KZ#kg8M=LA2TXr_02%b>BOPGtk4RhMD)4Z!OEXzH(h1LppEFnjRJPLaDZeU0)b(AI0aif)2pI*2GEq3#BnYtq zVI>`=OjJS(s^Pd2-DUNFXsNbdt$n7mORLuU>S;=>nuB9Go}fgh)`O-Ns2ieB#-|MJ zOqxa~ZB%Y=d9Gbru-xdB*eK(K4pz}N6H8>NlQf;CMbjHV?yI6jT#8Gp?r;~MWa>p83@PaY>JI1<@tjK(a2DVc^Vz?qBRQh<4;?vt{SXyN#IHs&H`#0HROF3BA_)S*(l;~dIBdR1Izy#m z@rZ<*EMQE8gp-?rllzSUnT&FMICGOmx-{oiB(wZv4=dt?^cV9A7-FB@!Zt0{?^N*? 
z;$G8+rDH3!|F#j*Vt5})TZ@4%lzO(!ShaK8gjuVhaod80&g$fT!ILO$+v+`y()MD{ zI7`9t3W-y@@Nc#1zfRR~U_ zS-OhdeedVq%f0{2_rAGC3PTt8p$moXi>N8E)f4*A_Q3XG&4ZdXRycHtKXj?ka~U;t z7Q1@iUwm)z{nUG@wQ1qtdH&$}Lf0f}>OiZn6&$_1qc`t3P;|NkXCLqE%R2|2)cFu* zePx)LFnsVS&=(UOvZSW7Y$nT!RdmTVHPaNErBidNH3WT;tuEPC4}mFn!B$#CBkcNd zx;J!m4XeVDI|l2gu59roS*M!=Ya|3#-Vf|gcZ zwF2FYEm_OfZsLrbm9FBfPwrPnwmvpyvyzJg5IE zHif;3-7tI=ds7#znmh%&-huWeBdJn~6m6;&VUQL~v9OtgdeMlqm}pfGRaAuzYSE$` z$kVxyq{q;iK*=`}bm*Yr5X zG)NjTibm2U8c~yIJ$Yk}%1E~wvwTqzt&~y~w9z5de;@knM?h``Gx^HJX5pU2(C8c% zO>jAyyM`_caE3_Bs8%~m3)PC~)S-s}h(Vb6)gK*31J#vQsvQSRuDhc8dcNa1G$;mp z<`%Z?z#O<>@8a!Us|nsdyy=P*z3r>?qhmV;y&Keq9@K`-S6#Pl!+JtO_ffw4sL(ym zcaJ|lSLnW!#{*m5)@9qjdiO7%ERKxx?xV|R?k%k*i*BFb?&RH_n}omE*1Orczu0-; zQ4$92^6i*RP0gPgs_Gj+H~J22H()K@`R-R%JwkAp4-P*X6pqgDM`sGbSMtrTE>~?f z4iwuui><+ryTkWR?^GFyx^2u@TL=2(;1R9(*^UEqdw-1?tLrxL1_Af;xIf=^VFSM? z4bZiA;1LJY6Lr7F^Z;yve!pc1pu}H{jhqNzKlPf?HxL?k8UD6T2Vb!oSqbrUmWi@J zL(!oc2&iNOtB8%sEs{AQcSM!|9DIyetM*o_oFx#YVscKslmQy5G818wmtPYX)lCJx zN;(LdfpI#gEy$e5lp=?B{+0T-1YmQR9D`4a6jgj>377y0xC63axsGz*Wm^>p*w>|#f9SAf zUFuzFw;&32OSD4!rv3M!2ciOu%P6~WSx0wqcFubJH4NsX{eV*iN61@58GkDFP*zP^ zrM=y-$G)mo`s=Wph`JT#tV|tgH}omA(o(WU8F2~ca9D|+!&zj(NWU!DL>#^dChhJY z=~Unrk(IOSN8(CrO2^?#I9V&{Bya%1-Wb@`8H!}B(tinhf^l2h40V#$!1tg?9fxb2 zhqYIbx5^Z`p}(O=RN-(*h7EK2V^CsMUC1I6P5M=aXAi0E_(PY1|Kbs)R!d3u!+NFt zVS|G0g6?6XQu{e)Wre+3cJ9@2buTzCE9kItIBlJ90jhgQC|E5yIp-9{9#ybcwU3K) z(BA_}AR_JhExxp?0*ahNS=*AEbJMMydvWafUdrgwWvPCtVX2X8;9O)AnuKeFTn);l zF2TGfo`>FJUVaCr?~;_Dl9~ z1*ppNd;yKx`~tpfI=5vH?bgaQm(k_(>FVcNw7y&m1Mds>}Z&5-R1A5e%=f^6hk=vQIZ8BI#|lHD>nZTWq*C`cdT`xia|B;E28!1-@r zwSUc?QI^ITv=dOFDE&T+-FzK%KK++v^4X%$psY^S3z8M~9j;~ZE6T`P3VbcuJ9Lg~ zsRT`8;I8qM@u1SpucMrQUyACU{}-z3C0$X`jw_=@#XYRh4Qx-iM@m*x3+Jl5-z)E2 z#P@H(DOVY$l8)fi^Ia!Ff?Eu^O{5>V#fWEY)(c9DmXZMlYaRAgu-sJFSLbA|M)MsE z`#YJOYnNj{s%)Nw$cpO1l0poOe?FZ}k!?YpBYzjTB~YXV`OVt= z%=LICqq@P6pBoC{T$}1-(;V{1Z>C{Hqv$7%+>yUR;(99AG=*X>XsC3`AM>ZEH)Xs4 z@koqisU$eYBzK#CE|F%+f;J)l)hr9)9Bmo&VHh!I%ChtQhsBy~Dn19nr6|SB#WGY* 
z-@cR^Jq3{)=9-Fk3<5sVn*C&aejXw|G1h-ACdY`bQdA1!LUN48f0JSn*?=@?j%j#T z-~bO@A`9cE(kTkOh9>ajQ8(yeXog`q)YW_N(^_$ z7GiM-okxcZG z6zfmM$$k`23u^)*lMvw)OF<|FGU1$k0q7u+9B}TFy{iyeDXsGwum$GO0982+2YYB_ z8j2>8x|&^}(Y238_BC?kXA-eFO7^{?OS))+cpt=Tf~Zd~`k<>Y1o1M7I7@p_-b}+B z@r-Cr$f+R#96{zWjl$G4I*)0zwdqFmLE&oA06j%BynX@663rwfrx1liytklk=#_+M zlcNk|HklEPWPFaL+fW-aIYsjVoz7+$8d-NVGLA$OCB?~Xao9u<@`I-}qD>0@F^IMF z2ZXIZkBm3OR-L| zNpol_Z;2#dHh8wjVO1rUw(P8zut5H6(f2%*6qQs!ClF|jv}jeN#cH%NtyuD#suc{9 z6V)AFDI>ecJ+Kn=JgLk@4)IB(FF=lS`@f*GTm!iQ(=vY#(tHcsabs5J_Z{yz^7Wl7 zxed#KO-qej>|MREVL7&CcMA3(Zx818pWy8$HXU`}KljeLeAB@V$Izy$b=9`v8pt~a ziuJ8ReK%jW!(GQ6N8UTRfnV5c>{va!(Re72AKF4uzi!^yy*kW04}xD_?lroB zj}<*VQ0Y0iY~6Bu?_RibVP!%HjPQYxg8QY%dfq+$^ZMpHSMCi8EdzYZK%stc?b0U( zUH#!-*0R1aCj@?O!E@@F_lgs^C5Sk&gGC6}*>t@1=rwYS{*% z1EDp-w?@{C!oHXJeJ>YU&wSL*x1JYTr}@_DLhH=3LtbL{k#+n?(cP2p8(nvgK5;cH zf2-i?UTs@<^%Z*tg`Q!)M>H5ePXoS(YF2WIkkTd}p{2c^~Y3>Of?y7Ilp3-04vu13Mt z%e#8>t^+@sLz7(Mhc6X|F6U=nSs!|3)7^ac+?{jzw#ystDd2jsu|;SM^NrzER)~!A z@YgsYG``F?zWh;3q4CPHQFJu|Djxq22ksx;HegNOckTD>E2j%YNFWAyVjw>_u|b^N z^0on|>YIR5glA=#Z|nyIi@-Erg!chM3ciEOr;9C}E7w-u%(q0=0({HR@)SR|1)6sZR!_q=W`)6?m3mjWE6&pRvjwk-WUzuUQeC+fWL;Hl# z1wM2^2wmYrSBmaV*q;4$c|7=u8FTj+hb{<1GyKp@Vdz!fJpkddo2$X%z9S$!EAu?G z9~?p1>+9|VK;I^>Kpf_Y!vZnF6C?STE*6MOu$KA(7>gJxM$QY7X+APth|KUr=$>`u zNOACZo(QgX@x=b(k)tTvw@wT_A$;#T?mP0Ghc<}AzyS~}*uSpux+`3C1@f*C;&|w| z5E|n{V~@KEp)T?3bcriJT?lW)BNZ(Oj=z7AwS%;JSfX`!P%$C>bi4W+GE{3-Ywu^9uKe5 z8~6cOwXa?94)WeX^jP5x4}VBgO!D5zBHjXQ^3~j)1gd+(f@eSP*}v9T@QmC(|3p#W zgKMb`$4OP2N7lR>j+YQu+aCDT4LtJLwt;`8NO*o=`ow}YbiDV?QWJ=C{qY@x0iur< zV64?;$Bt`b%HW{lzu%-eSRbbjJ!&QxIE!1Bh)? 
z-Lh%hjh)n8)-4+a9O;tP2_ZDfhep@&F~A7M9eLvR-OD{30R9z;&Q;$!acGlhxp%%m z^gf(g^A-9>*NIVR5r};}v2V?{L5yy74g6^QQB7g+bbjzmzU%Drh0;!3@vhl6+((~y zdUoxyVi;DmoBh_Z4d!iFJ_(qf(aq?VC!XED>+X|VNL>Ooq&F+Pv*U-%>Y&hfg6}&~ zfCzJp?>@cmJY5{UgcgaIiO$JmJpAFukq9@z*<_N`tj6hl1f4EhF5>pN&z?95i1u0u z;F#d>@D5Mjdv?R|m5)JnnV|Z&%xiF@|I~G&2Oh-!-Lcb7NdD0=)&$8vb)B$7^3Mmx z%#h@doFE`6)YYJG+gK|kKhxKuuiJpW#2FVP{~q$d_vcn$wAb+Spboyl8uS`XvUsAR zEgA)@Gn=5IQPB~N{y{dDkXvZ zl=KXp3CFI^$uuxYZ`o+{C|GPk@4doh@4dova5;=-*?5AHjduTYJ>(Z*;S8N-(=dTd zXLKEq2`k+Wq*oy*S}i>S5s$oR5Xr-6w3W4ge#=8wBpkaU0A zb7nX*ltO3Y9f3X(wgAqYw$9t8ZS(eN`@Cb?G4Gsq&by{v^EJ~o^Yk>WjJ2fQ^PXvs z(zd2+=e^TjrEN>s&HJW(6fqIj%hrEJsWVUeP2>z=>>m=w!FoP3m$3~?RMhz;0vDo` z?6}Cjx5#FaY}76{j9*O%9)P*=uYjwaju!bt!6DR|cNmfW}|IPP1g1V#u_Kp;+^bX_d@SjOjxfcG=8Y z84GJ;tgL;;#@IfzOgk7m;2iLC!q3GzS=WBx!qn(3T5qY8HH>@NPRNXx@c=!&N^Vfo z?u#T^E7whOnG`#4)L4b=6m*GAWht?;zzr{HkJ6{zJGfKOXVdx^3qT;b`Y=|296DqJ$f-kifRuZUm>ik& zX+1^_BWg5=aR5YXkZ?e2%{6N+qh_2!lLpNl)LKUUS$Unn(W_xWf;O#f)O?N5p+SsG z+pRx^y_$TcLTdrGM{653U}FZPwPx{s z(QhwP1CV}$m*}W_Iq3?PALUgK3RhE^YpKj^a6Zeh>7c6PhV(&+D@oUIuQF)F<#G)R zpx)w1P@f|%hMmDK1ECY0{__9eS$1fI%dTOL!k?$WC%xmXodTXOoF5vU!$;o!PUvIBdi| zeI*s^!;S{MLl+y-0b}+(1jd?-?L!Lht~Du`1;t3K5(9#l?eQy%Gc#bQ&9a4O(=!|@ zEZG^4C)03i;&C1gRxtR%@8Z3}{6gsz3J4I-i0OMYBtmR zemXth3s-?#TF9m{0^gfmU^9u7`o5@h^S!KUZ9xclt5jrcVTnVXz1;LIQykN-9lp{0 z5_m}8d_Zy0?6HS)xz;Gxsk`|FrbfYzn%tr@7=H zUVpJR{HQj(?Q1TXsJbIh&E|SfiJ0A5#5~LnBKg+arwbSap0UdaIVntDt5I=T|GZ{ zed_wT=TpxEp*Z-4IQT}P>rL$H0MR$+Hs&@n8<_`F#iQrNqvs2e3)mHwg6*5G4cDe; z!?Pt6dq>6I(L!(xyIP*m-Xa|q>9B;2E|Kn18WE9>lGu*Eg^DJ4h zIon*VL8rACbBbs#5JTO!Xzdky&N8Pty4ql>9tE{k}tM-*;*) zO?HZWeBA=O$G~q+cP_M<6qnEd^p-~5MCNouuXhr|MO$%HoKj8Bl5=qOoTKuj7*Yv^ ziQzJ~6D~p#K{v~Dt;P&mi=j11VB|CJL(=pEa*vF&rn3pq#a3oXN;#)z-9=0Jo;Y660Jg!#wUe=TPY&InXKjC)U!%#!j)ZbIVff8W+39 z3yl*cv+02EzkPvQkvsmM^pvcmF0f-GfdMJhA+<(-*%`ZYX2)&`_)5f5=K~Rr9yh>m zc4*>n`7N=yecRr~qBkgdgZa>FkG-#N?>i!eA`ki>=0Ny>@3+JR#8w#hJ03mlzZtI| zKTiI&--hM!o{28=zjabj%5HSaQrSg5F2EVcwC<511P(2v2-F_Me^Upduci10b^2%> 
zRa+*}5EqhqI0|CuI^P3V1%vvV5NKD``{rdgAxTplh}7q1Rr|2w$T{Y8ZJ>=Mz_*%B z&@{-3Gv{QeYw#|uQQRfQbd&6wM-Q$IevJ9r1iUUL#PHqnAN+hBT2CoJ+S<#(IES4~-41whJqCj)rp>gqMpxESenU(;drM7z&LX%&!NahSV^7Uj?;o4|O4ry06c35q7!~5`yJ{WG<|=jG6)Kt9bwvdFES!7H7|+t!u@dbE~JbX5v<0 zPo9128yGXZL;78tg&mCRK8ynCDFkB&fn#LP)BtL;SD)`1oC`NnopSEbx4D7eRZg3N z*Pu&Oon9OH9WzrOkL==t)_BkJJ-DF!RVb^;&zLlRd*rV|uZ{d}`Jwml9+3geIpFPc z?^@e;+zfo{n@CRgJ6AjgWzJJA38TpdZyLS_jTLtnosm5c6sCI{|p8z_G-+vb1QRR%E&ACEv zc;;}bJ&$lwLsaJhlAY2<7&TzW3`lDUlUnN;>#4H)&lp){e~rlHzUwoKI@2t5zedw$ zD_+HGsHTnRve%$%qMYOTZZ+DnN^c+_`WzHCOV_-gi$jU7P#V6evdYX zScSvftrA~wm@iM3HJ8E}SA-7rQ#{(AQs)Sz#LPbgowh>dsGFv0UKGA-Kcj9_5cxG& z;7$5FGsHv`d>`U4PYRKdDfWWwP;zJB&naPnmeNtZcjxddn_+J(a3jm3-^Xc(tQqF- zAl``$ry;|Y=0{>Wi$8}s`4AM}{12vSz-{mM{BmsS!PPa#orz*|pV-{@3vZt|Hub+e zp8B8dZ>BiBd>}4H7t3z&HWgnqcp})rFNaPiApfRhF3ZtFFp*g@;w(PKokD7hY=zhe z5^^S3;wWI0KrL!gm?~k;m$=cM-tEYJm!%@~MTm9UEIMuaV`0+>;j&1uLw zS~(29xMrk56U6mmfji9M9p%nJ0pS~aWpS3nZzCMW-W+-g9Nr%84J`0IknPtKkp9Zd zDiW#@<^*=)>!!q-s?3<#;4WjKEC;^~aj!rTty3erHxY3WizzJL!s2Z#-oZi%`5f9_ z%1$+3m||`y%qw!JdWY=QqE#h@2t#E%W*@QPFYi61Z1q@?$1h~%q&O}dVhA3y9Dk(8 zhUJEO$(qY7Di|2|XgT(L_fCL$c)0A!4f7MH`^WTO(sx{FkZTS}M{$p8 zqxX|fiF$t_qexiggDoceE3MdX<*F`foPpS?~fF`om-(t-b0c+okRv ztUDo}i}j&xU&9(xY&R=Ibv}EB7kp>o7PNO2 z!$V?tNb&~q`^WAK{_Vvre=#~LMn?-BW8#6aym$Qbci`T%d-8N}2lB-A`TEXceZN@W zU#LCu;Jrt+#~{4F?YZU2w~jsbj!XLvNP(VWU{DMUKBUCJ(CRBs0wFbNQ4I8of!^(w z(7oxq(_7AB*O1sXRP34%yCw=PlfSW=Pk2@*)I9F`M6vyt2!Hf($UoOL6}@eux2@>y z5xqSnGjZ1y=}wVWauvg3*RV*Rfcwx8Ql=gd0|Q_9L(;*v;=z9LV83*r{oe51VW}l@ z@ABQt+o66b)Ljf65kp6cp<`m`m=x(PMvjS*V`?e}awKg{J5EfGm|e}j9jm*+vpQL7 zRwfNTqJ!(BBHi%|Iw&0oub;V}#A7@w_Ma`p&WY{k^6lsI2QJ`yDb)v7UwyK#b=|q{ z%I|~3{>L<2pgTi%Udj6+Mc-l3cX*2__>QdFw`&i4dVT%Oa#-yLK#Cb-ZfsYyCTU!y&i&F~pR)$v&b z-Q^D>Nfy3u#ldW$W6Gm(Ii+Yi4pYNQfyRU1g5sMGNy*e@ZCda7ia=GGqI#^pCyo0{ zR;b}7`8rE>Y&nQ4^3;i~OVnk`Iz~O&*Hp49Ef@uDyV`b;hCs=wwq2y9WBuaY^Q9UU zra?QckKa8}a;vb1G_|dV@3xd`RoIKO)v0YCaW&qth|cD(>lK8uaf;~CMh-S^{x_xavk`~s6}d*CSCgO|rc&gPf`Q=~9zX4}93Nj4QfWS> 
ze;ZYgegOqR+pR47Q%F%@8+|a@F|6L15pD%QJYfDb6eTl7QBNI&qJKr&e@%Q}TO5?> tYXZe@u=vVng2G0q_Fs{q|8n}*-Yqx}eqh_NT{lr<)E(DXL}@7V{6A{n6+!?2 literal 0 HcmV?d00001 diff --git a/entrypoints/openai/tool_parsers/__pycache__/glm4_moe_tool_parser.cpython-312.pyc b/entrypoints/openai/tool_parsers/__pycache__/glm4_moe_tool_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d94baffa37b87ec031221cbd831dc28be599c9e GIT binary patch literal 9266 zcmbt4TWlNGm3R0i#g|A*qA1ED^`A8Q7mks3s@NFa8q=~(?0;sVDipzmj9vNRpT z?vix}Ba=b`^Rr+bhU7U4t%gJ_5+8NCfnPFJX8`g-9)pB}P#~D~XXPhyxhBtH(6Ke#R34UhySvWIiX$3B9 zgIcqywQ|n}*0x|IWX8$bp}%#BTxTM-3nb)_nkQ4~gXgo{`7Fz&s!Nf~f;w$YWvW={ z;D=|nEdcZ%m^(y3sGQ!^-RqE9eC`?LAa4WT>N2ygyRxh}eRCRm%@5^97FTIN!q2@SQo@ zlySa=qwza)v@0B8U-l^VYRNYUL8Zl-l#}@(oJ+D(fmh39PqnVp>hHJ033f@{?Z1kr zI&WnhLTY%b=f;|0geHaN1_@cFCqDfjNDB?)0=&Fv9A5UZ1?%`wP0Lq<3J)CLf$OQk z_5bawmol#mgf3wvNjuAd zB2Z^iJCP`^vx-qCj&Ni2^8CyUh-s~)>QVgCCayQ zPF4^6&@9N>XgbTu8yzh}!*e%i3yieTQC|{=mDBD3BK}LrzPw4cOtv?TqRD%=b<@;U z?A^Ph!!o=d%Z_445KDh?_&^!nGlFGjv1=I1z@y%QLhlK&_k@i7q$7B5VP#=sVDb;H zKeqn9HQ#Xo;r^{)*L%KqeMRfRUtRun;+KhSt)|n0m_3gT1o|Xbl11ypA2fe7|K1F^ z4jI>0G7zVy;OKhj=-P7mOB$x}z_wQFwwH+3W-nPtXx}d^?^}Lhf8V|?6h@yDN1x00 zKaX5{ioFBxCErcHmw7j{ezEY>dGV?9`QB-Hip7@JCEeTBZCE>ztmPY#ofqcj1<6P& zhJb-`!aU8yRrLb2D86N7^*hMH2x6+riq+KAwW!R>Nl+D%tWgPSO;#tUs%XZ<>fg{U zYFR@~f1RMJo*5l$1Ux9Lq->lie9L8075`OuRb4LX1yv{}75kt{2u2goR%Xf&Gnple zhMfKkc?B-Tq7g<^)mpVvQWc$5Sk9;j+C@{&l*G%oE3syk!u$jZi%MaEk-fVZK(p>z ztGyZ$WYL;yfZ4x7$R$|SqHWQhvnAWU&gp{+Xwi|ge4 zfKHclsw;4Q%f2@NN2g+{G9>5BS#yqY7~mGzFF8XJU*@h8R;j{*;Ev2UP z_mRa$VMu|p&YU6Fc)w9u`vp>ibAidm)Z)5`uwUufO=vGeYWDSjf~l6QTNxE7t7Gos z8`r!{E|6>Fy7pyqjR|?C(KH3UkJ?T$p{9^)dIa)x1laG80ccX9fwFx99FRLCr@6`8 zU~(F`cQ@hAX>-~nAa|#vDi5ld9J$&L0+_kg{)9fNj}awQw8V=6UPpHutdNc&Y(RG>m9RG=cU9P9lov?M1{ab04 z@-6)v&i*gA-2Tr9V{7`z=_zTU{HkP-seFZ&KCz{5O`mTGZ#cs$UD4@*Ls~jl(jzgC zD(#1!7IMejrTrPxrG3vcPJUxPY1aPM#6Z5_thH#41bX;SaiyQ43|jD^fCDcia1`K! 
z0c^pJ$SwLb5WKIUsE8ypVCK_cfAS#>Jq!>TOOeM#FmSE1bd+sxj+HFfk{di)=@|j~ zO)LZ^zzi=tCzWSXe$4nx0{k2o5BVf5JTgg_No=rCrdFq?u*azPj;eg#~e>dNV}h|tRjP$_Y^ zQRs(AMySeS3M;UyyriFtB`E9^Zay0qBrTkWd?U*sXq78)F!VWK%A*}7OQ#)KSyZ{c zE`k<9-}(OHk@Noz*8_!;%iOxv;wx#P{G_?Hq=ynrbB}iuWR81y{fy`yD;W`JB6iod z8EU0|i@EjZ+7f}{zIGdnl9%l1EVK`c?Zbuk!(#j42ZQx=$C!9ODUM+*MKqW|!Np1l9WvbpH%x;L~kv=+?!2A7Sy zT3&y_yI1t?EqD)z-UI8CdG9#z@%fjHA9wcMo?1SAx2d=%crUS%`03T<$>N@_dskPk z{xrQj`BB@z+UWY^gBSnhm4AL^v+bFWI>T$%@|~l1jaweyopZO(Z3K@#xb|o0A@H~9Dvak`g}y!H31jSn52MPKhm--~PhLgJ645le@DTPF+dJ3jgK>a*!1>YG>|Z`^A)HJp4f5^)lz?k)FnO8eZ`|J|83~>@rhybyWzpffc9TI7|10XI{1m~ zJRcR{K1o*9x5t(RVh=*BjFBS&WkU4g!SCgzMwMCxIVxQgz;wJ;#;bnR3_NYrw!aiIQYjQuNBAIcH`Wv|VVo96D6#pr7iW zsXz!J%9gN()hwC>vjWeV?&}neV9lZ&kyAZy1*{*FVtIkpZ%Wpmv+A=cysCk$MN7^C za~>sRSwk0}FwgRun&gnDI)WSyRnI_fscR5apAI-WtD`D4WqtL>yXu1-JXXJFJzo&4 z%Zg?PEB*gJGIQ(L4lZ?kR%N_znSY(0_2$oN6n?wrU#Gu9`3)n(`~i!>fn6zQ0*}kG zV{PAXGZvt?kANqR8XD4KLru?yB%IOiq|6QOHNJu7n!5`he|9HU^$5XkC%L*DSuHLD zyl`v1z;QCLn#)xJ`b2#9nTt|=%egP0vafCdct}F6?9GU1j;)5dtbd*_e~#EWa=z)->lv? 
zn=HfYNli;dhO3e&XZq`IHH-I})aU^C&ICNnyWusvjW5USBejw_)G$pwuy%?PM>b=@ z3qa#)!3Nq()@`y8yb%1l)f&s(s9LUkn4X7jA)TZL>k)#^ngm-lkil3+rm`(apFz4- zilWN4h&wP{J;ZE# z$NFRIU5jYyeA|i+_4w`aB~#JkzjOZf`PJ@1=eXE8p7$JGG8Y>>cQrpgUTkY$I& z(cy;ZvgqhqjXiYq0dU#0n z@RG6E;99=?*3l{(`p^+7_Vq98Hl5wYfk7<$Adrsb-Yr+tGFxcgCpPcPyTa?;g^>wy zWa6P~0vx=8vrlyP6`Wzw8D68C&XLDj;%=@(o_9@rQsy%(HV@}rBkOG8(3E&+>Y-~& z=CfCH?kzY+MCZu*$fom<%4ZjO-Zcfm_3r*c*AcPnNYUxtXgzgz^ye4WJcZCnF?2HD zb4qMGwc$Md;foNC@3wDP17P?&-5c({f_p@CkK`Q()?a(*7~66--?87eZ*-j6be@LD zMbSH0@Q#Y!(Fcs^J+k!uN8Vt;J0N-o3f>{nJG9jvycb!CteFe_N5uXkh5j>Q|CxOI z*}v$uN9{{z3f3;s+O>M7&^;!?pY6=U_=oY>fqLpJ?q{(-r#1 zMfkHG1tYmBC=ZRHkxx9qBDf)YM#McM#kTHy<16FE_TGE1uDrSx94Q6|3c&+n@IWCr zCI-ify+eiGF|l{7(0fYkJ@r_x1qQyy{*ZaOfjOqmm^GA9UFH6xOZL=xg(_R2f4Z_*5~tPM^mNLsn(K zRTdH`UnMPkDJsjB8`!i}zCFb`_+}RcfwvE!a@ga9>^#y&pSbDe3ay6nk!RU`X0^d=O6`+EuQLdUulec7+ zs}|DUvwC6We5pZhS}`zndgW-zCO7TG*SXrY(q3}NO(zc2C|6y?(tKAZn*Cq6u`O?z zJn4{+eh(KQJNAxvHkIP!upl3fU5-wf{KpZxv+ literal 0 HcmV?d00001 diff --git a/entrypoints/openai/tool_parsers/__pycache__/granite_20b_fc_tool_parser.cpython-312.pyc b/entrypoints/openai/tool_parsers/__pycache__/granite_20b_fc_tool_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0003070477bfabe29ae6b74cbd38a617dc9d8b8 GIT binary patch literal 9334 zcmd5iTWlLwc6ay?-w%ltNlBD6dQcK6OLFwKV##(a%dulAc4X&4BumhokxZHo?aVNi zg`trGDWYIgEWK!987Qzax_}kfz^dHXn5d(OH3++@-t_@b(xgvoA%evb?Cr!E4Iwg9++C`1ua z-me$os;D7m^c!O) zzbR(+n`0KgC1&+oW7YoZSdG6%?4ya=Vs^h>gtgJyn8WXgIsML<%kPTS`Rl}bU9>*d z;BOFNebgOm^fwYnf#?Rh=?)>S-ruZ1ClO_MA5liyen(lXC9e`*(;snR#!Cp=S^CFG zIv%0{Hk_V)b!L(dB`BJKqWV}gnz#Z$bu4(94seM?G!P0#qk%Bx)d;o6F9o^diP%Dv z=E8~iYZwj73DuKylnYKJ;~`vo9J&iev1*!T+2A}an8ea~KqfZ2Pb_jwFvQW6Ty-iw zmtbN+JX|n7fk-QNfi>wdd^X6iuvC>`3MJxfGDZj3g)C-yre;B&PrKeylELN5`Ob6@>&JQNhS!(ruz?G zoH|~n%+j}d5VaWBb48Zne&P5$83QSZlHoW<$0?c; z@!(-emL{)+xl7`BdNGIz%Jy-hY=A%nI2vk#K;j~9KpmJ1^+v++NKix~cj(oN#S1J= zkykF!ak8g}jt4JB>7HPcOY~rjJ)uj9aEOK$sRS$QDej=W@}9Ynh$zU=B;F!Rb7EQ0 
zaV54XimbvJJ_h*rXc>~fB5oiKa_Rd}!K1%`qt6p}ibn{J7r~=IX*`c2D!F$#haSbl zg%GEc>m#CikxNoeDP+trrb=$RioD91CCg!%1>z~2SgP4m@iyy&bEjsL3p68W0|8M` z1A^{i0?3Z{Dp*B6upG#Bu|~nrf(DCyI7$l^iA_MI07K8yi(aLmV(I7{ML5Y51^cs`3Y)WdvB%_sMj;a+*8eb9*=g5;a` z(2myhp(dkst<-I6+wRx-mQ^2_^JM#N^DXnTYFpd5)7Pp7MX8pcVZXU90}jT) ztK>dd!9}pN?fGPEaZsQs9j}hyj(a{{8EM#41x23VwY;t#$-AS}c&J<=&_{DB^b9Ih zRjgbsk1SJ(S5Km!>O+co6jHnc@|V(cjq*Au4c9Egyq?nHxw#g(CekWHw-L8gKb1(vDbhocxK+B@z+ebn zqS-%U#m!&|Uvic#izj-rf>zeEf+`-2(Si-`Gm>!I; z7ZR^RP;pm5^&Wu2`K8_y3wM2uu;b^+`Xn)I+<2jj+=@VRF2 z6(XpjVU}aC^uU6^T!Gu05vnA+Lr_!n#pJx8jDk|v(Q#NE19z&Rgu56n0Ic7Is^9|X zS)dRTyb_=!qYdjU2CD=?C0aOYDj8c~1r^56;KL*6PArD#1<|%r(|8A7GehE0?Kr_Q zCMW@!pefQ*sKMxEf{9#V1tKh{#dg6Y-8G3h$)a+_ff!Hr4oIstJ%NJp;@Nz7HiBl& zkYh8Y{4x082sK5@mMu?q$@`Q`;`?2P0R{Uxh{JoRP=$=voPo?3$TefeFt}su%Qv>J zF&l?>l?uDLfD|V4t`=G9e-c|(78HFt^Nyo_WiZ#!mucwBHH>B&MmOBqhN-OM4Dp`I{!O&?{#gONDtj7J-2nYbbn(=n+jUg zcW}MsWBQZOM(3}hTfHx(Pp%wWC2pR$K9jR|W$azscHcc!L5Zrp(6_oRXZ2>R-n`v; zW9Iry-qpVP&gQ@?o31mDl!{u%eTVa*T2Wi~SgES6DcDeB-)<8!*`zh~e{y_t^mKOc zm2Kmhhc47H`p9XlH$QX$l%C$LMb&lry0)E0Z@$sDHk@hff6&@-+jq;CcYD{mGw$I} zF91Q>>UT9&O^*Lo>j1wUMi!jNf1TbaK9PR`7w+E_lI-thssBWrH`5DnR;u_T^1*0tQ_92_(e(PLt)nR)@ z2R}X7Pi1=k;7(G8io4walqu3MW#A1JI7&PbY5hTkO{Cxr6skwORvsx@MaGng*Y9T) zaV6V^hmr8lge{}8bs|%kKw=x{>*rkI5AJg5hwT+35wL45Qc^~9Y_SeI`(|n!if`MlrleMG*}l{I-k-CxYGo4X1~_`=aVc@ye1fT@F{(im*0k2 z@A2kH=bq(4kt$h!DGP65{*J>}l=wyPJ+YsE@Y1RDr@zWrBJX{n~F zc`ILyWi3_1k@E0Jn+%n6O3T;CGlQ*F@+3IyAc^1pvvDt!+H3BXI2jp`Ys)#M2aB|L z!tRzV(s|;pNv=)VV2$X>S*OhMx`AG#Jbqu=TK1=1nclqZX1xM;syBV&Fh7j1WnbbFGEl)jRpX(CQyZb0sS%3dl?3GT^T;Jj5;aPLo$ ziYKN2PdjhGC2R`p`S#}>p63}Sz83aU!vigzrpuVB^68BC@wH6PGuB)Cgnn?qXvgId z&_31NA$id})XH*|a`I03^z$`;_RjOmNQFH7zc|(SMhKMrbbMOCmvqT{j^Ieoo-cwO zqdi}wX-@?d!4Gvk^Zb@>ass`Dt|HmDgf)!G7y-{v%2lBU#^siBPT6^v%$sMzeCoF|Y7$9T067U7n?`c!DG8cp^CuQB-U`vXDukSsZUokT?uaUZpwFk7cmC zEqb6+4A$^Vj(Hefa#=|#JU2InJxoIEbRI(3MtU(6O;U6qo`}=nx2nPYqZdWT6?)-| zO#t^;bj+~VnuwEe`ikVddML@`#kw8(_TWs#%tFTC>qAh4DG`tC|3cJHa(u~4!8jGA 
zN$_duVC)HJ7_V+=eD-p9VFA~NE+yla$skJxv4fn9a%5son!G%==+40t04aH?%?v*8 zk`JpFSI*)nG=rn9f}?Dm!xIJe=>sedNMhqyP#!yTMo`Y4e~rNrG6r8J43-53hsFdO z1R=|B08&I!P=yi@Hq`?aC0CghOw++AewCmp%o}D9kH@N6uskt!m=cr_CS~yT#$d4$ z9MYa>DgYc~1As&HW*KZRGuV_C^ioI?d+H312*a}kI4WhI7X~tn6?Z>^lcSJ$>m;B3 z6g2{1CFv;x!ESfl!0j`x$4giQT7qhrYxhE@>%0*OF z&H+2LK@QP!z?bf#;4Ilm{)~nb+agd=Kr~k*JqPu!ZwHurBC`X%=_H*z*JayD;tf$u!32WQvrx^HR7SvoS7j#cjVwOiM=EyFvu z<~9AcZD7+lP%xsV)?DN0R^#Yjs&D9iru#?wpThqT{+FSDJ@n6q?)fqk=ak{EN2LXts4=%Q3Ly z8QCy?`qroZ&C#=)uf37=ym@^lJ-gDGw>RhP9T|Jaj;m|Uy&nG5dv8AT(uGa$>si+u zu=ZL9ti85r)d>NK^r;=PXDtj-4{|DfDsON6g>Cix?H}FxQMP3;V;|hx+#-cB)@M6Vj)7egjlQ@so*g-L@7UJJblS8+<(m34P5oQefz87Lfwfe&d;C*X?&$f<(es;cyqP)rX14oHAf?@v zvvp={oog>_*$(8}hjQ(QGVO;py0h(5>FF<=?K@oueyRUhzdn91x!rYP#kAAhx;mI^ zo5-|HY;d_Fvl;knJD+X7u%gO$4Xsn_^k&!L71IM()6b1JjhmkFjnuYldWSslfb7Zp zdUL*sjBjG6XCmK!1ZdXV@rRCsw;~(VY9!lnd{?J#X)P!Hz>aK>|Z<9u%XTn7F;ujM?iXFRXxU2PEb>TslIU{@Y5bajgC8_BedY&l2E z9J^`yqONnt+w;r%kL%ZGHqX7j?R{g#oNwq_o7rj@!$-n=&y@ANzA}|R(7!&oKD0Ky zGJW6KzUe&pf5KZ}Gpx9CaCvgu*p|1smd|{%ODmDRGiU40*t&DJ;f!rKXPeB}CclCJ zm%GHyliSvpzi>7b)KK}|?j@xdBvb2LdxKlylct0Nhu}C>;n0#z5f5$zZhDq6q}3L?{qo>@dCfq9-Up12fn^ zmBQYqF@%F+f)dOe@#v9}=k=6&=~ zp;2oKHEQ)SV)ejdg!6)VRBg_?n+r;u`;LhDeVyl#8p;KZ2rAUB`_B4;8Uvv10Mv+} z7S*~6IuX>PM$hW(&FO+cEEHA(0f~TL9ySzJgkOlV$MRl7Owjy=F-- zwn};vV(^}oY?}uB_S%b0o#NYTZwSA^C%JHxm5d4Udp)Gr+1>>v!6jft5`VBIA~8fX zOboba#0mK8NI3K~B%)Oz8BKUgz;5`T9F>o|436gszULY*ty(nFL~{q*XQH_#7{u|? 
z(+Fe37+b^|m@dwWW*Nguo9->W`AcGm-}WVI4U5pnNa9K>Mxi|{3GxK<=TOAL#$q-U zlmtOM(jvn6C2Ic?xqpx9{~gtQiTeKw9r{|KB@|yHNFL(kag73sRcb=<2to4LhzJ-l c_8&TX`pvAa>AkAms;H81t@s}!oQre+FW`-uBLDyZ literal 0 HcmV?d00001 diff --git a/entrypoints/openai/tool_parsers/__pycache__/granite_tool_parser.cpython-312.pyc b/entrypoints/openai/tool_parsers/__pycache__/granite_tool_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ddef57a2592983450fa08e2b92c5c7bc930eaada GIT binary patch literal 8520 zcmd5hTWlLwc6T^@bNG-*i4^r3iKHY-7G=xU!-_1&YsrZn%ZVe|yJ52>7|uwh%-7Bg zV@V|wZxNshb{DG@Eh3{WqM!jH!v>-N0jdH;n<5F|K7fj*P%;Ar>+QN&h@H`Wg0TX*LmI0oELg#gdl*sEda$ zvLPXvjD;fMSS%D}{5rAW_=T`=JQ+{LSRtBBJdfw`f><}n#)R;RbRvRVkHc`0lA5Mj zo)6EnqFJiV!eUZq%haO4g(CvXC`~65b4e~9#?wXWo`v+v5TMD3;pf5}&vJgXXpSTk zd^*mC_*6K;iuH5R1QUuR+(43;=%uM}S2^f zI-X?M7$2ZdCImLYu#7Yl4@~nceJLtjkY=!pVaz>#K#&dsNIxX7&=LlImRAcbL=y{P zX}Z!CFiO>PJj>9RF0cu@zn@KnUy8B);k1zK$7}URE+nH77JB3wP}^Uow!E&iM3`e~ zyb+!iq`GJjDsoYkWq}KPu<`HilaT#6@g@?Wl+){~4EhyZ+zfH8d@10<5E&Gz%x7Rk zsC+dLT0(yE%(8hrBX1e7XC{_Sk_yC&_xTRDwnh^MbWD2+$l|>@+j2|d7O}?;lV<5G};ls7;e(L^% z5>N>5hkm{dvac_rUA_5rT~6;>YueF!ZZ-9+sDEj>`?aXpp1}&(San(|h;6r9; zt0NVtGQ*&>R7(YamducxK0`8ubQe_@M-bvHf=L<4kn^yE>uTlsoIwU8;2lb7!JCIHOip_zUz`Dh2*yLU1V~7Mc~PS~6Op zRjE~1P;e1I!6f>%CZd{05!FwJ5Xx$Vc4a)H6FL+)qha)z7d5-=RQjqVj50_ZBd;43 z9M?@1h*F?8RKFZiqpTLNRe5U8Xc<(?(+DWfh$^eOggi)aC|JCR3uJXQyY?wP)snGf zbP6s4)!?{KZV}ZBpq{h3S9CMw@)B_gEs`&wON8HghQlHei9a|2*OtQ99drlWewGo9 zmCMhII?y`sVhYR6Q;R7U>oA&$Mg$sO&RCQe_R;f6fnIi2G#u??Xgpndkavr4z54=0}k=@4jV&j1iyxSuu^Mj|h4?MH&IDMB? 
zIqP9^UHu-AW&ic**H3RZ4SsNZd-PQP&@(&K={qjeHG0=cxh;1Z0a~5jYe02P#U{^g ztH0RVyD^+=9lYJy^=|Lmy~P&)MqjRF_``ERlE=NL)3!DKm&Q<6Uvj|Ak`tMoB?JrY zwIIj-VrxgS*;jP6?CI2)e6`u~-(R#s@7H@h$ky})Qo~lLdYPb;HxBGj)Aw!pOT6W8 zOBxvX4SxXm`;WcH+9zG;Bg5FFUHvg>1^8pT2Inrrl$HFr!!u1(AxRIS#IfTr+~*B7|kQzB&-=UNBO5v92<@(qt`9Q5w!f@DWlHKJ=2V zdKq*CSPm6U1y+Jec~`Rfj4GpNkQ+Tj_dG&dah=sQGkO86IXOLA$1TI4AArAhN9FYu zt?>x_kC^pc{9NNb)K|39#TVfcN&KnNpH%f7&XCtD`fvupuJQA5U*c>+MUO7n?(2kV zqT|v1HN6!bTV2J9mO-jng8{$$M);XPgIDyfBH4^Vk@kzA3bItzT%(UEg_~7fltvtx zp|ZxrL-)$4=ZPA5SJqetdo*hjDjFP90Dor8b2m~j@ft>xF)`Y4Xw4XTVhM8KUQcyj z3WWKdwZOitHQ0>GIuD-1=rOj7lL)O6T<=515LmX{#~I5)pMtAeG6n@h)|Rnx?+aKT zNNFLUK-Ds1V>DGeAzN2NDO%WH9j8DKrU>t}g0((Vm#N2hGHVyS%Djbc1*(>eDPvb; zTWcW*2bD1o#wLJ`B&Ur1dPRB*U?1L}81;3tLf3VRf@_|*ZdF>dR>1MS_qnKARN=j5 z?8{(jM#zW?pMn~mRiq1Ygcl4vn2OB=rv`;8w(x_oE*!pp2jGR;6k35tkLG9HgZZWG z8!S!h!)`hsdB|z1SFmRt83&xnX!QkooSYcD@Px9TYh|EDc8==~W$nkxuHjnwCBIj8 zjJlSmt3<;6kX_t!Ctw%a8kzk7`QLlaf7s0e+2F%+UwW{_FrLTU?FM*P>>1O;=X2WX zJxmN_8o2()(A#j&OV9|j8zrBsPB^L%oq<;6G-aI`r*h9T_MiXs`=5)NGw}c7{^IvL zge_Jn;X4ApPK#1ssPrf;ARD?0E6~CNd@=BoAQ1SNTdPxU0fAnCIEUi<01ZznYr{H2 zSy#(^-oXPLk^t*AmvkP}X=~F4_Cy@T=n|IQ)dqp7{KKAT9SK zz@cIj$@Dygq_73YLm@}DzykvC)XcMI=p|P892;(uyToB-wcLn3v5I>IUfSH;81{xh z+eui!-CB&q(hM6)Boi#SSQ-c*vWt?B1%o7S11&qj`GJTbI9SO9onSA?ewL4soinV9 zVJvpk{A%tL6tYvKiZaqVimL{}8`(#rFN70JjHSWvVZ-rz{5DM7@{yT~(NqezM=qoj z7wIrhhp{i0j)99gCnK-URop)yO?I$4Ic!CVMu})1hw)@DZxmC8%?%D)d}3qOMuzw< zc0@xwjxgb;Pb80>J}r_nv(Ixlav|-9!+sKn%`(vr!KG>dRgN@?>PV7fMI-Q2_6)&w zn-0ftNS9@>c(`FaAFEB#dJpO-Ba#sJ;;^5ystX7UaGVuS!&;mhgUsI~JD^YEhGRJU5zg=#=O%GB zg|iOGM1#aa@F*5V&0H)z4_m540r1KYGp@##<-?)_N~8%jE=W0ij=f{>S|Dbf=J3X3 zm%mcw<=`{~3loB9R{BHu$ksT)Qoz6>ZBiM+iP4M`2m)ztBWwPts z@?EDGM=#~Wxg7XDPIPsA1S4I7Q*;^aIZ#9AddK>UdB@<2p=ffh8P<&3CQrc>$e98g z!h5e=ePzcqvFq}!M?dgvGJlhJKan4t%5_bBn8~|lS57`)%Bxpj-7y`lPD#F>%nzQ( zb)C3DgWJ@>5{242 z3$3F!TSxy)^QPfX4S&b}L-cQ>|2p(SCvLTS*5=;fH^%<@)m+=~s%h8Ry+Lih@L|)9ALowEZugwqah@-B^{qa$u6pCy zI~L^VC_0-u!Do)Pu*Z5+vWAKp5a>prro-m!U$frGg0E7}|dTVKxB 
zSFjD{Y=fJFJGS9nTjQJ58`O5|@RqtTdOA0HddD_hbhfNg5YK6J7aI28Y}j9<4;1L3 z96hu-m#2@cnngRk+tPEp#b4~|Ep(0Ly2gsWfnCp`yk~gVKUVY)!$x{Lf8V+9?cOc# zdT+k-=$={E+EyYBUiThpYHeJdD!I_Xk=4n6cMJf_I|j?kU(C}#S~dR%EZ^L=CKTGo za_wVVhYC+Tn}h%M=km>GS0{J9J@1}+`_xAE#?X%U?CNyU=P&ribH4Gd*}{>tIr#TI zU+|sJ`OX(z9ynoLjjLy1FFrpEbxNWd$$3U@I!CM0xnlmbse9Mo|DOA*d-Kfp+4DR8 zAFo-8%{?1uZZ?nY`bM@aH_Un8`Lz?peS@2aHitH**QRedz1z6APVx4M) z;-9-t)IqU5I(~M9%pD>C&J7b79w9M2W_W&-yh-*we~`R+Py?{via+&8YBD4mLm^N= z=@=UdiBu@`vvfFC?$L!pOfnJ*adx0V`t%}_@RV}c6_Y}%95!z`5;Bnl(?!z7+ybte z%io3~EPQGWfklDC_&hdfo|K+x4hJiw%!51Lg6!+p&>fXdqc7EKG(GG4zC=)zEd3fw zv8BC4;e9(ztFp-6ahM;B^5|CxLp@;3^p;(1==n>oZrTOD3sG zp%(A@@heA4W~pjH?#^}3mDZ9~s@hO}cd1SS>yfc>O`S8ed}YUV4u4pXDE7BW>K!zH z5~AtSCt^Mjekmd`f;$VtL~PF8Z&U=ZO%X^+k!D`DOq%hhumCnx%Adjl5&T6mEkt9y zYy!~V86kfx3#7QDkOTr`VPHE#=DuWBfd4FqtEIrTc=%z9NCtpx^57kSY~dqz?I$-5 ze-#pYzt?=3YHY5_MhdpBB#TQlN%Q5fI)|fKVu#cM=n}kSRB^n#=|JUcsC?!w;x!~& z3!j&(NEU(=BG-@0r*?w-Bj~~>i=Tz8L=psXSC0tlGvxgpa({*f{}UblN~I@MUm?iu u;OxtK6;!nvLUk8G_9cY~?`LT2w+6@RPx6Mg%i29{kR(9+eTi@>Vf|lcxa2$l literal 0 HcmV?d00001 diff --git a/entrypoints/openai/tool_parsers/__pycache__/hermes_tool_parser.cpython-312.pyc b/entrypoints/openai/tool_parsers/__pycache__/hermes_tool_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..551678934020711a36ddc9891afa15aec5a6ce5f GIT binary patch literal 15493 zcmc(GYit{7l34R8J|#XyiXtUaqDWB>(v~IJmSvA+$uEuHubEMHkD%Etn>0mgyUDT5 zqi*Md1>#LMSb1`RJwcGw1Ph$z#>pwc0r3S1;?3SA6XcL%i<&g2qsw};o?9%CAF{2@ zKJG_S)y;0w8hSLhheI0D_pI+#RbPEo-&g#vX0w5SFQfhkssDY6Abx@w>C@B_FaI?p z9uOo!Dl)_z#>$LhK{=;Xh-Iodl~AXi(?G1sXct=MT7)`vMz^4!(+g=$#;{q6UH+k$1zvS6LF3NYG?ZNWZg7t$>m$AWXtDWr9o_666RYr#F|RuD>p za#9^1E5zB&d6dLSg4BPDAPtoLV^yuR^R6Od{26A>MHIa5GWCrDl}%ER)=XtGx!aJi zE+lSI@q8|qi6;}8Ogu$KT6pL5%|w1Wx3HL@@~K?*61HRVymf}k zscb4A&*X00fUJ@?O=a(1$=#x|sUk)5ma{1a#>mJeNpb07Xsu;lK7_>I1L{%4oPs3g zl%#@E%`5K_^9oY=Ez_KuR8g9FC8_?FdQMAfs1{O7walx9`WC3ykvd9KE7#9!0nh3= 
z17KQ18sQ(>k|xSXnL1z;(k!Q1<&@Z?nQSYz5X8KNv_SjTd&C{ZT-#+LV&xsDD0+cn zMlR5~`qFuQURq{SSz8E3TKGo-zx+K2|FPl$k;hWu`?jh?{6k>al0sq(1(BB+m?#lx zY`f#j>!gB@YopM6^1^n&^0Zm1t;b|T@6G#6x7MMhREdb!*bh@@@0D_Czm%%S(%PX%iD`&kdTja1J=&yk znq!MG=ED*Msgo%8cNNW{(-8@_9s{Jio|96f@{zt_T~4X|uhp+dYE_SS^xH1gK)(Z0 zx*pFE-&H1+H;AP2>p+(!pnNkfOvmaiq*O_HL*}&m%J2Bq&4S&Kf?tK43 zj-)ate)fOiD;g*Bk~lt6G)=}DazhPO{n3FpB8|1BS1Th$|0I*76Zz!L_+o-QQiE?S zDqg*ElM=^nnBPKy$@{NU{>22tzyk6)e;%Ofg2|uEWkKi`lKC9%Ckr$#L@ol!9tELo zOORxoS)`JwLqRY+`|mGmzzNi!}9a!)cbFX};|h*c8dP!{jMwLdFyH z4JHmJlF)*8%7~7`glOy>=FQvd#Z;E86)I_pZ%M)lnaWVScINz*sWWHzwmKwD-JtI9 z)&^w9#UjJCM5aI;$EEOEIE;brd`G=q!@zJ9GrX%_B@ZrCXh}+F+gS%_Yzqrd+->Z* z-de1(UoXthQ#2p}7%!iQ=czmSh?-Y3RAyepu})Y%494r@@ni;=As%PcfPKH;|K6{E zE0$kaj4h@XhXGYLsQmDq!v2E?ZtbUHnbh^z;@$ksTsHP@CbJL&y3ltQbE#~eiRBil zY$7Fc^rCRUGqIb(*(VZBEQ>DQrE$jAsTaLYva0-SeZ-8=D8qZ%^&*gLVT10b`NGx=GQ zpEHG7Q+UUQCZcz?)?mfV-bTRKwn;N3pKG zE>}eAh_J=MrdzT(@D=EY`3Qpd2w-co>tT@dOtA20ny9vQJiNmB4zcjpb_kf$+WC;= zx{k12M;_B`*RgfWv8vs-;@|>ruz@$8DA~aDx_$boKgjtHu>J$f=IY?^vUwc3uSZFa6LyxtbBetBb8plk`egvkqezRJxfGnJcIH-UamP6fJRz{a1aw00gP=zE_XNVzN1&wG z+my<$4!DR-k&jUP+FbX;tgV05Rk7{8*HSgwzI*uZAAa~2=i9^j_N<*)_f4Sm+;i{b zhACWaYroe@AB2{?Nj&j$h5TY6pF{?fKGf@ly*~o`1onQ@onKPCtDsx+aywE1_JZP( zQZj%q6Z+Te!Za9SJHNWsw4^GjK#Nq@^wuEgt?ImNp`>N&q~1dbOh}X_v_?v0)sRwS znWWK!4jWPvOPVEhNdvuWYrO*sm$Z4=jv+PqHmTo|megVo4P&E9wBrM)7K9=dk93m0 zU)TH}8Kg28e?N@hvZO765ey87s??6JIi95zc!(uk1GbH&mXeN!|3-|HL}RX|AFE~k zSbLl3CGrkw*dRfNz($pAh%_!_$48o^@d5I+v`#AD0hxQ7xTAW9xUGno&z*>q~J}U!4_yLo?w!x zR76Rih8FZJ1aKvx&ff*4_7r5U6o&-W&Y#NqL8Rj42WrPSt{<;g{PT2f!B11nBB;<* zG-9TEupeY78f`$nWj>W9@nS`tX5fxQlgTAWhQMdwo2&${YVM=ks)NZZ5bLApJ_Hc{p3s~e6ef^J~t6PNr*njDc!r?FGFnWP9F43{1XsnYwJx+adMP zyzL~MzN#<085phxqO0$+f$?Xdf!|Mmn6CPVS8uWYiBFs`t#H?Ni?+w}e>8e$`<4S* zZn@wr+ajQ&Eicg-E{CtJdb!918<}`K${o7O9=ckITq}FtT5j2N?W+d*s@;*#!qJB( zw_DV1+ZLg=+CZ%vJ6O;D>O~ml_p28==<=7_1BAo(f>4`no0fLY;%6=Xa^U=i<$};z zXl>+i2`1&Xy&#m(kp|#?&ER?d&$_3>#J`VB`_w-OX(9Q8p^51p^?wY-ApOI3Ecv0& 
zfMIxOrd{QuD}S^0dJBGDWIZgYKfI-CJ9D~Rfdp@+G@I;RB$dS-}}I{ zknsg6KLN5xkn*?_cYGz4wEve>_5EL$)(vG%^(d+Jt*-$K&y*Bj72XIJyPnsaAE-6b zT_{JtdT$^-K^-WmP&-^w=H=r*EuSp)g_8g$$!(B&Uk>26`_I}H*>rZ=e<>bil~ zn`?2q(t631Ro8glBJe#tM>M+`C-p^;1xaOG<^tgkRKA4=Y@rUb<4gDLD1aELrmd5?q0R zm08onQobIOZBn~txAUY$s?91&)+H-x7384;=B!Gifw|)&trGl_tprb{C7XvJt-@Jo z-324IlC@5GSbGzGYmSjS$~#KLO35nluqvsXgk1e>%cJ=#e9hbE(YzJwt&i-|{L^5f z@BB!|Bd1jVs9l=v4aFmuR0{V&s`SB~?O<+A&uiE#N)dL+Ub4TYoxCIKy*9S1uM@0d z`y;p9qorgKPI43N$sdy1rV%wezLM#Vg24ywj#8V{Itx0_l4BQ&c}flmUV$rYB2O8z zE{M625_|&PUzSIJ;8S|FK%^ z=~b%23QX@gO6K2ftf?lh`Tw z?n0nqlUB{g`3jP#{WWy2%PQ9{xauj%`)t?qq_^a%HT^_c9FxC$LNgqER_3ncI1Qv0%knRw0G8+bl{Ug|A!cK^m)5-0 zk#*(qzCifWb5g1vmt3UbQNKiyrABNvvxOohRjEUo18^ebK=Ve~(b4RrsoRSUyG6Eb z5Eo0HQU{zs9ufypvVU1}(a(T$ z|CgR@eHC6wgsHtkCW^<|AK#8UjzD|K6SO)@TJ4kV5^|X}wN~3sR9Ywl#CQJS zH88KPC?U13$K-JHu_DoP#n$>G>V&_KDcW~jya|P#bKuM&Uq*|58UFcPAxj1#3gLRU z7&}R2DexK*?ycdX4_!O(qCDWox1>e+IvbzrArmbI|}uV-}U*3?~$m_(kCG} zAv{eMJ1(P>1@@QA`V;;vbz7Vo%srOK=cxs_*%t23{mD#@q3ZJ<@?S6H{n>_<2p921 zO(8!&e1LB)WK-xG5vQ1BVv#B;dzOkvPk_4#b5ovtB1?+P_LHgkdGK#ZAGb6gvNbdl!$bKA*%H|3;ZbAim$Up>6H{kXN=!4|!<@4uo{Gy|fOY;@Zm04${ikP;X*v)c0!cEY zUw-}&SjUmLkw8x!fl#n=MkwvREHef2N+07NM#{)di(=YAd;! 
zc$ccf;hPAR_b;T#VSGM_)|pMD1kNdu1+NlFM2qIH{dNyN(ezAwjTz`EcJ@Ftgh_c^ zp$0sjA~dT1MGL-{U>`6+U`<$Q=tPQ!#nccFWpY!PN|H*wOOd!hUa?d(-T*=gd;;wT z#rGX>X~FJ*A~%7TfHw7M%0!Mv_R znpw=G^1K!%N2eCS`ALXryfUKE{@_*UX{5S*hm>hgU%} z8f|gO{1lcOgS7uG#jFXbHG^*;nFj2WoARX_3 zJX7=R5*LM|w$-7;Sy@sH;6QNmri6wJl1O^nYZ{9WeqNY9K5jNV_<_$HdeyC01-8=J4*ZN)4w@qbd-%4>qH?k>Z2UgE-=q8^U+ZvxH zC6BVP4b%9hy?fQLVc%Ca?b}i()Ot4z*f+uj_p`zMkJa3POYDJ5mEh&FDNyZ+aXn*f z&)6Etjh|!3&sBQPmrdQ(-WbIjzy53f58R~>H7F~mBC)@E79k!P0n2d4X` zvhUo6<@~0rcXf8dwZClHFS^A5hG6k+WAKGd1 z2pc?7aiAxTWBNx<&;6?pN4d^@Z0Ej;b9C+EbCtq*;HS>c``7-_+gmM!wFh2kyZSlb zFzXv$CAsJcHhQAsJNb!%_04j=i>&Wr#dmqx0B@gMcZ}_ht*N;^r`SEGD&40)>0!Ii za@|+h?kkney`{+GL-q}gL{-#ad% zA$)M%aiaRIL-#g*Ye!{i|qc3mEfiF<*RJ)8u%Yo zyL;a^Nkpw6QA6c{!xhKjr*;=-A7JeRW&6k&_@(l-x7Nqs+H`mxoVkCd z9JsXMxD1?8b#-#CDC>%@=DFB33xBQ|&UK1)o%*D+;<~!5=It(+ir4@BeIFbGFTf7p z_lzGHS58*kL!5gb>)ux$o!M|7f9eYWHJu$mO}BStf^`kU1gk*Ut{B?|M5}a-EuXA* z_O0Apxn1sztp(Z6@#UG?WWcepW6x8!7hbaAY1{zaaE=)35NJKa?wP4Lj(_4{9cLll z>Wt}Rtd(6g=5d)v1N ze0TQ{?jFt+VO^2cAU80>4$M5cUU8jXRs$X$lP{>nGHv5$5N|(vCZDm zO7A$Vxf&UVxBG#fpY{xWIPf^QG5|tu+uVZ7)#GxFxLl*KT(2LP#1bgC^&@G5WmgO* z9$58kI7We-R>Zg42Mf~>o-UlYQ~>ig+tC-cYB};m%^khM9=!sCnp?L`gvGu*Dl%E6 zZBSyI^|on&;lr#Y%vqwWCAvy)SVmy@t{%=e%KApvu5e?gS@^@PeU9~=t6Dn2*|n?n z-Z@|~UzGFiWxadX1}okp_s%|(nMZyzgsX=m=Z)KmVOA8xa`c{9~+tY;C^cKfG+- zbbG$1|3F`L1UK8=4^sD2E9Odj6y;UyY5VANjnX*=vvh_y#~|w%tT=`@9qkWJ-#`7* zsO_*VYo50fj*e<1`iGq#b*^3fxQlg!o<)ZL(DRXJ?QHqN)eUgvfD%c1#@0OL!|06x zT{^nTefwGaeq@L6K`wlh4Ih2tsf5q`8B=w10fH_lY|p_x*9LAYK&{@f^4M&7e74;4 z=5m{ifa6cTxnX(hnLGH*-B$%M*fY-djBoahR|gIP)k0lA?dty^y+%GvSGo>u>opx7 z*ctwI*cmu?;7pWZD5_RxxS^x$(9w!z>PeinTmdN$xVCkFcw|ehw!)g-=p|F`-m~G@ zTMZAE9l>o2;p~1`e6$aF*4_DVmh%p?-rT0;6k}*RGcX2bX7SIC}VXIDjnszqq0&o{hdUT;E%4-&@t; zy}-kvw{gRVx?sb9rDe^rmh-SY+!yFv&9?P^pIIH{29L3W$130 za~5RhRW;W?$-3F-zzafQ z9j=ClDvsb5C@jN&wLPmKEFI8J?}jbHnY^sYTlURvnBM#h1h)o?f6d_4?GNq8`l*A& zpH7}KL-gMaQymcfdGMGKB6iP|7NYe7qsbAXGSmv`%Au)lh(7gNF^sA(961q!XtU#( 
z2crKu>Y~ELkBl>V$o$0V#c*Vj3aXw(6_9?mXD_5b4`BN9UInIu*vaQ%BQ>u2d_+i( zDKI@Qqz`t@dsKhfp@1-A!C!p{Kh=O=eZ=Ene-tuQJkFcq@oy9o8L@^&FA>4{pUz&}4562LOx;vWX*BMqF3*+*695Cr79pu4obD~XkTej!zLm=#oZT`C zSrg$6u1tS$c*`tgTM19kO7920tu`TR!C|aI(nc8CAF5fs_ZN1|3vRT+k|JJOu)*4r zxl9IrdIZJ^6HQ!C3Yel9SSzm)eu5%eV0e?T)6|Qikr&Mg9)*l(ZFb^MIHCb qo@4NHn-T&op$PtnIP+t@WBKihzUw{hw)R4c!u@dK=LE*WaQ_d4n{&hf literal 0 HcmV?d00001 diff --git a/entrypoints/openai/tool_parsers/__pycache__/hunyuan_a13b_tool_parser.cpython-312.pyc b/entrypoints/openai/tool_parsers/__pycache__/hunyuan_a13b_tool_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9225866724d280c79e7e3c16bf029b6c3f49365c GIT binary patch literal 15019 zcmdUWdrV_jn&-WKzc2<27-NGm=3xWoSudcdqM$0NM=Fo1>`YT6mWyjBQ^qFunxvqN zl9|a)Q|a0rvZI+$lZ@hMv}%`9tD$#$HAvgLs%-CQTiX4Di|O&rT_Vxy+Uad+q;*n6 zN#~#4?>m=!ZHOz6)o7);i1Yl;`<(Checw6$Z)IgB3ci@;N70EHf6 z@D9aL3>~8;&{xIiIn{)U#;`i3p3_WdNSKakCv=deiRtGI6NWkCgmKO^VVW~fnCC1L zmbtQtvN`L7mC$Nq<#VRn6Ha>?CZ6RnOH-)R3?-Ry*gIaL|;B zir6D{AJS6qCY&njDT*<@NipV#?L+l`YW*VZxBM0(m;JP8xDt6~Arha8Kv+E*U-aw6 z>aiQ)EJ{Czv6Y&?2fJ=%M3BGd%UP!lSK zj;NeZ(m1VS43K7G zj6iFGe>41BBIbw%OE6_h$f|^-l4Vj0(>A6Y%3HDQw2HAoxO_ic0bv_c8L>cm6@)7o zd&ENG8m1cJm83m6O~=$gTGa}5i=MDwq5QRC_0tRS#f5Nuw736EVO~XJQl7FY)&9Jo zr-6UmCXe5OE%ayfJ5&-=$#+MUqW%SJ`xGs2FPfU6QdCGO&(ILt-()aqpbuevk<* zL-OSE(xvZJs(tcy`>DZeFW+8%DFuPa+aw6}`2FZCS&9eXSBsTX3oHxGhR6V-aVBz0 zG)Q5F6HT0)#rZ2lEw?ZqVMRkIL|8ybET3m1uSzA!jELn2Yh%Qk0|_`SA*@m?0?{S{FDpOOG+&<(g-0$nT)2g>scYPXaS+lH8Dm-q* zuwn!(Q@?3QX~vbBWZt-}O=(&B?&~Q)<~v3wwXA~~P!OTe2BUE8Sv&)WKDmF(dLxyr zmNRE-E)JbDy2*5r&UuRx?KSv#2!qmuE0#t@V~5njcTTn4!NX{V%0SD1M2Sf zYt$|E52)8@zxMKyeK>g|8oxQx9+Y@8iGU2FosNZPxT$@yjMRav-w*$fhr#(X)kTU1 zi6lk61f#)KU8a{+cQh&b<~Rjd&T47ujyk2f2|0gGzeCg1-D(ZBNc|i2YxI5k@_ov$ z6;;6=QI8xW%qB%`G8Bn3q8c47N#bxDd4M;(!FEDOG!D;1;*neP?8wsT(kB?$-#nX` z0@*H9yzPU-u>>$%ZX~GG@FG;m`N8?}f8jCDQ>uFF$LFs6Rj}1aZdbYtPecVFb(`a3XBuD+k80AHbElWjLr#kjw6ja 
z9A6Z*Gi+jE9!JJT=S9sd2lAvgmIwo47|-6Cip-;g={KkdR#`+?^@LCCntnB--E;<5^+HSF$Jai%wqEg9 zFXviL@GU2@Ekm0&Z(gOTaPQb1?_7BM!e(Rdw!KfN_ip-*Zq^^&vLDSiQ#BXqd^=@x ztzF$LZ`w38{qD17X!py#2FlU&8AVsPKeoH_YVg0}@SJ_*Y8&(Fey(ziwW)sIW*qC+ z{=AO{SJbnTLNZm%K~ZwU_~S=#M&6-}$aRa)8)V^E;IQPBsh2e=4cDAfvsJ*$itCfe zL9JteSIOKF_?W_t84bv^>IYgm?+R7Kg-h_*l)|x=wMm7irnE^#=9Fbut}Us^hb4KR zB1zKJ;r`sBIp9|*y0~96z(tb^mrNl! z&<7sfn5^BOB#JR6QPClv%zNLZ{!+!5lJ#OEP6Q z+kwZZ7;Ev~D5^zSE1(2k4j6M_&j6+%>mwgvO2yY!v}h%#R6hV@Qt+mH>5NHcMEjby z2W3eQ;ar|+HZkW3dr(*fH9q1YJlrFREf)SejpKsm&`#V4S-#_~ckz$7+eh%bI z3s%JuMI*w9CL*Mgidp1ROO00xEivI_*c0VE@kG)Sj(b2(4ljxt6naIYqE(1mEFA(K zzi&VAtJvqDCiZ!7pgmQT5jDw0kngWxDjZ_4PE@fMz{dkW9|Pr|#m=&$=$yfHQdFv#4RBQ%M%j9gB(;i)e0U0c==CjMm-Go?PeCeCN}@aAiCH8E^5g zJhfTv|5a7nr@+eVg@%^*6ZaCCn;U%_uVfoWSD)Ic?H8I_*ZVeJ+f%C?m4M$Bl^`TH zHorf6Z+6pvGTSh+@+?T+_J*~@gXDJAz>clv9n0I6P4~#Q?UYm+gm}T#yyfaor}p$3 ztVB~$nYVznzH;G_yEXm#-?@*yZCq8aUVY^7rUxF_w;e&D)}6k}*ZPH8Z^p;h26yeQ zM=l>A+Odom%5H^Gl>uT| zC|liUl-g+Bsp=No-VD1j3^Ql9enzRvta&wLe8n}uqWt0Uv(?mp7&U|YpY>y9;QeiN z?b&Yi-v()L#WK_|qlpDBl!QeUEsCy+F%xJS$*`0|R8ff#%{HMZ2s#xCwUSi6Uqnur zDuy9c01)6F4-$S6R1MT{=#yC?>n+roDpEs5U*yW^u2pWv~<}o;umL&27@&RL2AodEvKOXi&pFBFbPJl|PlnMhq z+Mgc)4^dcC}W)-ESP=CrS zQu_?F93WiYNXGE)jjOL^$ROaZ+@ZB1E$+Iodr zRQYvF$4%b^JPFpL*eeL^6yQeU>mq&4PZMs+;wb!97PUw)_Mq;@Lyk(O5LO$o@VSZb1h%EYS`M z5x2mC`3DUsXv_&Oax5w>V!cuevS`UhM65$-dO4Vh#v(MzfawS_{pFH|F^#RvpmPJA zC_1y~+(ZYVLo|}H!NI)+Sc;t`M^&_t)`)Hr0c)3p5s3v*Ld^2jpeF;unUk8aox0m|9E3g#GWS}K3NFZjG=Rm}}RJYo<+ebIu&( z&B09auV26a`nLJhj-w?V{b}<$^EZi~B(ivaYVJ z@-Csge(gM8-kNEK$fK&pjA^^7f78;xYp~=DUf$qM53d{7O`FX_+lJxY>bhL@0AD?@ zKAEjPvts(#TrE_&H!bc*we@S`IaiQ(1#_-ryzAHoRDU*Gdv3+DQ|(xDWvg3POoF+3 zt>UdyLRGDRl{fK*rgUK2&?SJbRTJQA0-GJD_?puzX41?NzWRt@sas35xP6whQL%fPDxVC}G3@3IeF76qsfy+f6}VSymNf{gkR;&yiU- zi>$H8;=Ty>0gx`C73rHrvE$GuTQL?DsggyTgkM(>ve@5)1nDq&AS9Mwxbp1f@yHYy z!p^d6g7vGVGl1J)w2%|ZNT+k(7%2%jYy$F^I!bT6l85_$Kml$X91v<$T4TjdedGIs z_Xg9kY<>UArJeG+way2cZP-15qpO-<+S)+LwFSR?R9=-UZ|2LJL7_BOd~C6SLTRk{ 
ziky@`?Chb|t44L|kF+$n2bKjVhb+s)A#?;@3Mq#_nG*xghAytsY-M2?zomqVaQUxF zQ!%uJZ3`6pTcE(ra_E~?Onrrn;!ToLvHu-RW(q16sF(t#sToZ{4f}AP1QoVrO;R!F z74Qy}U~NiMxJCh_Rn(eNsK@9Z7~~mxVEk}@6f^XL0^VsE{S00g6-yyhNZF!zl~l@H z$C#0sADHV>I7qJDdKT1`o`KBt>iD+3 zL+I#4zg-wTgZ*sh?d_nC@%E9OTIbq8uD+YE?_Rg?^(V8nBM+OlYR3eJE9dCs9i5p; z-Z7YSjPj1r-L78Y#E8&00M{ArhR@ZyI%nQSU8Y~4*EB`M8@;^aaUG@?u8Nhj3@4-n&y;wZe3rnp4K(b?H_?{5`T%nr7~~jD z@u~cOYMH(}@~!2FJbkFgLr>^2p)J{CO0EHnJKr`(rCOx#4kDQQb(hKcK1U8X`^T^x zY%Mr|z~sa#rw%MDzPi;#f11;|Tjmg$r_MIfW(#wh@{meHl?cL7og%3~31Yv7hl_k?zdois%g*i?jq z0{Fvt!8%$1{gk!<(zA-fTNpWG2D|G1CVo&>1npKiby?4tpU@X+z9fj$5QqfI5CRGL zifq(W$+sQ*?E>q7n`d?aT2U;=avc;ki)Z4{$CS&xgCkh5gqJKYQ|`Ay-yKNlW#~q& zoItk(w1}4y?!Evakq(jbC>b!{fRwB3A0Wjaq4UqtL1gw@Bzz=D`6Ntbe}v?Sp3>Py zV-i7Bzd<_NxW@@cW?zMnXn?D{B+)J-zcTb*O)#PkG-sancJ0#nn(AK@G*|zzFhSpWH;A+p*^R6Q~*AVX-+8E8c zMpli2t0~=)c4xvrHUGP^%}1DIP2HL==j`O2o!OeM_4=)vVInrP^7hu8y^FVZWv*}Adx22_ zOcxVoYle4gU8@!uR1dHzJf1`cc@EYK*c3cWLz~yChV_7IrzR{)!#YbJu9EgT<*633 zGhk##Q<_qYMrI|7HsdX@nUo^^jst_l>uJ2A2iDjQJAuAyg5D?x82Z2eG32RSintP+ zgbO1NgUs<;sR?7yeg{e&QWmi?YF4>QlluWr6=bHNQkX^ws*bgDKEL$taX34n^!qRd~9WYuCDs?KCy@fjEvm(#&vMFU^ z%84y{2tpPjJeWbSko)xwb)?K-XGas|p^vf2qh}=TwCxZ9MCE?K$iDqd>2QXKUHDs1 zA5wB#DZ?y^lZQU0Lhcv)(f2sZ72lL&38uwEjOxeW$(iPkCihW#4CIQf2>WE7Ni4=s=XIR8G%vn0X7a1NQ&_ z;mV*rR_$A!h>2(g`g3(jb$gk;10hK+rO8!15m`Zld=t~rz9dPm_$+{kx=ZTi&f6>s z!b_H-H^YM>eOsiL{WWYWQoe=G32;O$Jjcg}`j|k}R7I2}H#-G0k|gpXPDs{fhM zxkr(&UhtuUd<`4rLhzhcdXX%a5y_gErsfmeeko`)gfYp)hDFIs2QI`!r>8l5ydz87 zk}nysI7t6aGB@dqa?5laaGiSt^jTTv2E$c7%G_DcaiOs}*Laj~Jeq4f$v2+dh-Diu zZdzPodE=&UAm;}MAMP&f|!@V+C0!@ZV*#|w?ebB!Z><4CsgG%P?%&FVAIyrU(f+Nuo* zZ3FAC!1IPR%dV?k@N{Ob!OZ16r+Ck)tY`G$r97_37don;`QD4^k*vFKz4ueS+I{p3 zcn=Lp@BUuiOx1bw)>5l!_u*RMBME5Em8eV+T?3n=&h-R}c0vK0qrPE{$u;!w4Lw`9zHD#p1xxG0 zE8G6_IsaAOe^ux`0vcc7Mb!8L-aU913FcNknp=&;+-j#>O{)r}L2qF1e2E zbb$$X2zGq1^*60OAIxqr>Dg@S80`2aFN~+X0mjqhT+`x?Z-K3X<7wfmTGCV7Re{Xn z#`&%8v#6d=tT%n|@)o@Kku~$puFAY1bm^pKi56RcaI4!JjOlFHO;o86UJ_{ 
z6YHA48h?L5toR|(6vC&$3vdA+5-p+7D+}S6l*6Lwi2Oc4R0AV}_s9$I3XQzxVo@NL zUUEe+gc>hAl!Bij=rMX5odh~)?CVq7@bb^m4y{*kHRjde?;5Qu-0IakOKbgU`@O-<@)q9M z`dAA&`GIj-YbA6f5IB`rV_?_V@>q+3{3TVd*1lWo%xl4i34)LwLk6nakvC$sVChcGbBH-m69<5!dhi#31XZK_D^W$`khA57d1cG4rAc0e7 zz90d~T8fk9caaNU`1wpC7K@Od_i@4S^(kp_bnq+>UOTV=L5Sy(naC~jG+XrGFGzw# zzaR-pzaR<1yYXZK>XOzP?+J>)9?cx&`5k$UC!TmxB@*UJKSP1ez&3*45=!smq@SR$ zsK|?f?^J#v8I5t$GZQl@AUzah5v#=}k^=R|IsAQMgylknKB)Xij_haAMChcd$VJ${ z#-aQG9kTBRFw})kCpe;ESUMad?7xR7Zc^?tIB+9P(~nIQZTW=qeL~gznyUOlW1v-E zP~d!u&gYdXaC8)Xmi~ll{)8I*g!2B%XkY#Ptg+z@-Jb5Oj<&B|_?$wYwD*4j66h$e literal 0 HcmV?d00001 diff --git a/entrypoints/openai/tool_parsers/__pycache__/internlm2_tool_parser.cpython-312.pyc b/entrypoints/openai/tool_parsers/__pycache__/internlm2_tool_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9252edbc69b27bf1c027fd60a9cc0e17f97c1ddd GIT binary patch literal 8009 zcmc&ZTWlLwc6Wwva!BzdiJ~cqk}TSkEXkG~KV-{}*RtZ5 zp$-#&1Sqo)L~@%-#(qTEEwDChbk+XU1q!UU&4-J&yP_$S+<^jku_=lL`oY99lBPdB zcZNesvK*)T(+l!G&OP_sbI&>V-gEB!i`8mIkP_M-#D3a^(CEA+S4zY-( z5-3DUIzc7r5KSpCO-KVUO+uU0g>*7Zo6si>A%k4kC5%Z^$dojP%t=egl4L?m(i*ZR zZ6RB-A=Dt_=o9v&Bjk|lhJ-We3c2LEG2u=&h8pF%DbbW{4mDGVMqDH3xkoA7g<5Fz z8e+}wBi6z>?rEw}?+q%*{GK$%gOp^P;NHn{sVE0^-O)rMeGMv(WaJ7L7SicNI2uVL z!Z9{zkQ$F&jtIxn$xMP1V(HX5GL9D{#|W1YBBR+vlMympV%8Bdswlvdnbcp8V4lO!YT*!hBw1I(63*Qrp$1PVGN*GaOzR3bT8U%q4#YOA5EDu9sKLVo_Db+Z@a3~RlYvXUAAU%#rGpLbtd^W>j$ruLlf+WL| zF_Y$FL_yH0<+;Qpb^)ej3WuYK2+xPZJek<Tn~VO120a&+?0Ym zNmWJIOCWy8raLzi;`;fz$jDb9BcgOR&u%L?x8G?lI(Ov_WsCE};qMR6|52%RXQ6fH z;^^bnp^6dp?##co%xo(+H05o03yj=ct{0&PPEL^ARfHWNr|PK%;96p9<~ji<>$hUk zs$owcFlCUU84Mb8?sAd^K)RkU{j%SyIQN4sg7^& zU(@TXk5X$5Xc{!1V@yc2NTO9_?o}n5vu(~V|B+~mle1()VvXv0 za}8nxe)k3TWU6X}SgxGvmL+tnuosLq3)@r-RWm|Rg@_IJEh-1`UKLs|S?0b~?Z0ok zS3RqzsQV2nG-qcSu$9^=4_uHlVMB@!Yyd!$)v7coG()BSl0`W?l(&GwuII4?XS2TqoD*>on1e=$^L1ov4Y@hoF`EF-Yt-$a?PoB(cVqU^~{y zw@DbT$+J;xQrD7e{)6P!{Qo4k=CyKrnN5o}wvotJv`rB=mk3@?@7X6%7ZTiRJ5GG0 
z4Jl3>qzx%f+^7v{E5aTMt=S_T8xWw$8dMvj&|Bb{TtnA2Z=q`x@Nihg20DG~=p0ph z>LuG$r}L5-o9mcsXAFAAr|pMCh`hj!5A)!i3*upLhYZm)gc#t9lM`G*-*`9T91=5$ z>{Kiz9{$Ew>*G?aczAkyDlPcAR609#*{^t=Q21p}-4E`2+J9=|?7061CxA~2zS!@H zJ0Te-tFdB`!sJvr4@@xJ^=Kl?a$(~72DOqdmSVZ}pFA;{#X z51kypaP-{CqvPlOeg0SyA{0J$DZvpAE=R6%{+bu(#~jZI{$z~ptHDO^l1Wp17IWbU zAC1L;nE;uC#s{E~=aT``y~i<5V?V?yvDB18JjKE^ynwk#Qh}uteiA45PY+LkBc36^ z=;dtcia)~pBV_t)Lhz?2VKiXY2UcTP(1)KRlLR{?O@d2d61iaFv`Gzh_YNYX7z>eE zCJ_^$5O6FbHLdF=A`bIJmE@X|G)K>zku($M&q*%Oh(IEPI1j2<0x3zCNJqed)JD^o zlguE0E<-GXm#kxv1PRDFmS|7XD{-`>W4TM&DNOECOk6QcJVFdkB$CX@H29ohHk$;N zl;v?O@uK-K5R$_;Ngv5TjK^Yf^GdQOyXJi%g0ZBNMIdPjk1!d~;uTd8KSwehR~eK{ zHC}@rCB*3=oRG^KlhYXYk*uF&T3B|G!MupWtyi{$q?=4c zrg(`V@`->U!2+;ktr?usRBbm*giT}u-nnEWj(I(5kgTAVHBINbrmO_1p=OCyUeqzX z(qAK4ZCh1*`Hh1orH|q_0X%H_s>x38+$^s7wh<)4;r|-kubZgiMoinn z+Xc(66<_-TQ}p%aM}9cA;%dKRUUuzXVs=;bh_RO}?FCEwLdRoE7wMvxUA;?8?-N(^ z-0`6zli;P>|bB`^?`pr z@TjM7_Xz-{bhe& z$-k@M-?bPh`VY)lf8+9(dk0FrM+&`19(ELZkIjrMySK0SwtwymmIFPdz)&GDRPG*F z>Dp85+Pe}QDhKzjYUrNM|LolHNza3=3q8foBdb}7+yL;$~&wuMU1mT^5QpfH>$8H&?x8Ux*Gg8{|Y5{)kqfa$- zr*p;zYU=1N`3DRB!NuXC|LBbM3qA65-~AJi@2%s%Z3)~xyyEuGotZsT_FMqI8k=B5 zYX@{Sc0yOM@8{mT-o>${*WX+YURZJmo&@%nJ+Ci$ca*$?1@GYEP|Dije=sYp1@{GDsbWSw{iWc+Lh#^&v10I4er&$Czy!+M_AI3HBg;%z+2zTf z`Fb@?BlotFtGD3lExGm-!yNj=m7XXs~H;E zTfRg(lV^o-mKa}w@h!EUT4qi|^TRVAoGIJS0{2!MsL}1ISOD)Ep4^+CI_U9X^r^dP z>>&EI=cFAP|90@u_>ksT`zWaY>HtORLmE;a-hFnr=GTK1*=hJ$qLb0=mivO zj|h|CG|a)PVooE{%^MJg^e6d0N0ev`85=@jbm~N;XNdieO2mDW&aVA2LRbu z8^t8ffhTH)Tzb6x?Sf-5i32c+sKL1-4yWSiQL>MBK4`ne6tE&7@Eu#)66dv4vzuAPZU)XJm>1NVI@2l1wK47Stq5)%9j$QK8PbLbKnjiVXj@&KtBfsn}Za?wM?gww&agf{P=k52} zA9feEpD6fFtk`$Jn8m>d!ZQu+aITt>qvfx5ERH-pwcL4j*-ma4uPuKHZW#B@a%)F< zOZSSuYhmQ4raMP}Vx8Bn8nkX(1!=A1-rKXk-dtr6P;P0vGq@?2kqp0i0osBC383ohhr9;#IVGK!|!Ax38e>bn0~OitQAw^E_$xR&=BSLV&74t{PFJ zr(%+;X4KlfFmZdVVv(B+@@-u>c6+#Dm76x??Of=(-CAjon|9>fR&mHxCo(t9YYQgd zQI=%9qT0MS5dMr00rWrCXg^erzXxJBM;5bC582~Lhq`ToeIs4Ocy 
z3Tu=|M7c43=_heZe_DxIlz?Xo`K;7W;-BhgrT!>QNMlKFTL2{rYD@fkwWXDSfUPMIpw3X4JAUoQD+~`mNb31c;}x#u*vBFd zP{J@wt{168h5-E(FGp&aS610y`!J~3VVW%DG|7n3+(nAi$@7L}Jfs-#Vf-#MiH-5& zkX1AkMLjhliunv}{T=dth6euw9r%hiQuJ2{vM)&X%t=G0N7NhCXQ=BlH1u1Od*))% V)OJ(9sz0uw%=10Z5Gm#P{ulK3Rl5KH literal 0 HcmV?d00001 diff --git a/entrypoints/openai/tool_parsers/__pycache__/jamba_tool_parser.cpython-312.pyc b/entrypoints/openai/tool_parsers/__pycache__/jamba_tool_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f381ae54832d6452a8f375e1127ad25743c953cf GIT binary patch literal 10133 zcmc&aTW}lKb$0W7%FVqkL-~5dG5XEo^$VcUHr&yw<7qGhOfu}^(BOUjw|X%Rtr!6847PB1~Ehu zg>kM+5-D9+M<{LjuwL#ngh|NtNn^?sHpzX4q&Z~?TjVmCw5DufTZ#%(DSOzSa)ccz zXV{r?gflTE4SaI;*tBz>s^;RABnnrun6hFb}w zLu@naf1gk=4!7yh%ZRbPi5QCYysxjew%;a#_Fv-4*&rdA7uYXn*>sGBGI=tY%-n=R zLn?ZmjR=`cG7^g>laV+RG)ax8u0@4Ynbb;>72=unMLdodq=soWDMU|a(=pt73cw{w zZklI#KDx|GcDc3)lgXX_nOg!EjR`EHHqEA&GF&Q(hfCBm6PZ;Iz{!fs=c63Ya!@E{gUnuS|S+Tn^f4q=vk66u;h|KZX==1$QW5uwcWgA zWK3@w!WPC1Z5H^q!oQ8RvbF;-Gq6N0*|m}~BgHs!CWMw;j1$J&@1R>m*s*|uF3EW& znz|aT5kj&E>b_#SD&9E5M;5dD=~Za`bK-3z;8ywjnl6w26#DaodQ1o;s7D;-QNpM~ z)^fqDmfx?g!r26y+Qw)zeFZ3!M-gp?WtcAlJ5fGH4>fHjxzD2Y8K6(Tg1Chtp{C6* zU~8Ve5>gwz59OA+9{=PUSrdAmXD{G0Zu;6bSlHJ$8n`G#8Mxc|K7|0ix7CNy+5Xs z#%qCS;puoTiIx^~!BwN=Nlk|_p4Kb~YI@S^RnjlB zf^?v+w{8JYB2sf*m$nA(pe|<9LOiAHJE)foJeyqN@Wqxakw`2FT#ZC{EYfKDjbDFp zNJy;=t;AObfeEnW;H~V~k@4$e>`*d(b!g?ba4nM_x{*w#fIk9vdnE%(f*;DPu<2-A zIZ+CO`Jn`srwS#NrqIf54oioeZpYGM(%BsqGa4MeV*t={=rGPZ%3VD;_g9R_?yQi= zVuwxu8<uZqL3ZoV+LNzMP5`^oJe z-iEaTRy?E)CIM*Y!AH)6fJQ|}G>kme>oFpF8IiuwcUs|MhjIG@Z1%H^WUUEotOmf=27!ugK*_vR zwfNy8ftHahAc?an=;q%eBnlK_mSZDPJ{FG$b=(w;;Eq8O)JX>6Ht57js4PkZog0u< z9oC*ziymucysE5c=_M|cqB)jd0rky>f)0+xQ{h9zbweVVmf~p!`y7S@5AFk*%tRTU zqi{bqASIH?rdD{#fWahvl1)pNnOiY-MRqofii;pwShXo=R<1M14dqTt4t1Y0NqEW4 zMQ=uSwb`Gka}#**QAl`f?b6jx76_7C^;>F$&2{dU55Ea>7SA>bO6Ls(_aDTttw*%= ztR+R;#I|Rs+}63qZA|Xyb>6C`vC?+;<3M*QFe(N{OMzoz;MhjK7?^u_K@6NLICjja 
zr|;eH{qVY@*nND{)%l4J^-Vr)rrO<~G(qXEeWwvMww2p@w*!OaKxpj-O!EYWB@0g68E&oAU8k;I!7+GmX4XqUf6IJ}EwRf}kwKab!I3WfnHeM_p zy(}KRTnxUp*>a^|+HM{x2fE80!R0O{{t%lsbt4yRy>pCBLZ+l zqBIK&*Q{W)3Sc_mvTTnjfY_J=gNBzHG-$2KSskJ_oQ-MT5Fm!nv@EC@C_PtD>p)Ob zwJV+1KhV6d1+=$_&Kqwa&M9buI&V~k1Eafs2#zXP@G!C-7eEDnoJ+u1$zKBBsa;<> z@Z09jo?WR+yGpqhZ*%TLT( z@|vPpwboJ3TAN80=FYqoFzu|Zq?YhHX(EnRZO<89rTueyK~WS4Zgj8Sz1L%OIam?n z1mjws17o=UN5I#=&D#=KeD3-f?PRP{d5ZfBp;rY+0Is{gJjIx5$~1w^pj{u}fK8%Z z-`<$XYFbmst955wz_ykA?LAU>LF=a;XxvF)(`(nq*dAz#@qt}c{>#Jzhsx7c`|pDD zE!{00#yd~tz(|SdV>+zX(h&YzwWoDB-HUs`C%9GM+iPc@F+t6sBD*&R32Y(m`WTCf zH>mk`wKEY=OSPPF?oB(D1_a96W%;p(6yTShb6Q;L>Z^{tgF9D~O8aq)X%Mi2mB06^ zc^wxW58Ud^`*P9qewA*|a*?ssQ?#H{cmg?&&5T!usrguyo>t*%=F+E%Un)3~Lx9SYQS#g7r>`q{-Zr7J@P%)}I`C1#6Phco}_F2uB_o$Sq zH(&QVRdpxG^Lvs5o@eC1kAN5d0$lv3--#FAeW0DsaISG5S{}%I^Nlsm3CC52$?er> z?rPIMv$W)!{+}Fa`aN-^=^2j5G8yaZh+m@v$UShcv>0+ibF5SdkO3{VvRc|JatyAS zZ(m7FsyBo{UqH7t_X}`1q0SBS3_df@DYd7TGp)6G_TirPAoT2t!8UmRk^Pikf*po9 z>|x5jUG5MLd9vwDcKMnb-GqRu93TTn4T@(L&Yh)ivx4l?o#ybNk`PRWUA#qjfT6&l zN@f{0f@3V;^O4|vvA1NW51_DrlxvnlgV@>2r0Fz!Q}Ow_8O2w`dK4hBgb5nBFl5|G zNF-gHk!O*JUWw?;P0pUZeDdP#$+L^}ARSMwWOzP)HHqW2^tI>>mezby>_V~vor*Jq zT3gPAokEO3w;a9g=5sx#;~d11YY|-y7$%6Z@f$2N0noX?0vqLG*YH?~tJ3({34+hU zO%{UN)@~mDa!t!X!`g5RA`k6X{9Oo3DvmB7nq~lHC0eVrRh{2ZZfXG{PAj-Eb}gH} zPDgnkAz5u;)op}mHqGMraj;YIma$IfumQ)tf)jl7Ijl7$Jr3sT7Zxu< zpn4@47dR~YB@ZxKz!5I&QAd=pD2Ibok|CCXaIjVLmU+oOA5G%NJC?ymf;)quNQS+d zUFNVo!eIlF1B(=Kr*Rj=0#}mJ7|Y=xDu;tY9Ck}2s}jD#zPehQ zH_9o%egrp*lUE@LHY@J>dE9UTCl_(DfRja>T*3*qA0=`r8C~Wj3hyThX9S|+oMhL$ zaJgFNvsGU{B?D$0-V!)_Qq9koLmmz}46pH6vV+TqgY8NfL^8vvuKI`1bjWm-oEl)H zI(Zd{SNgTdt982yFF$hfIB2Ox7Ix)Iput}~4N7oZezSmZKYkH_CX1 z;gM&c+&f_8r?EJXRhag^)(5=NCPP?Jt zsT~U$=&a~XhuU}amO#s05~45%+Dg6=(KoU_U3&2q@x@n)zS)P1qVK|%Z}u_W^$z*i zPZwsM^!BYy6?>0toD_Sf3WhCLS2;9-^M37_q2wACUBm0cTdq-vvAj*aMQyf?s>p7+ z=F83g0#)67=#eK>re7%0FNyR^>q|v?s$l<_hu-!ddg2e3d-_T}6JpOqxqE2);ETnB zquarWauDyPuk#n31NZwjy6*KAJCEONJy``&VM{jFuD$Kxn8yPE1|I|AKNPlal 
zVnv-jrNFot7~k-gj?9bj7dTrAToeNr%ia#4U8lR?fRIsOxQfa3BK-#i`{UN`>UoN- zj}%+S3g(ZSyWaZ}aNt`rKReL#-mz_O%iD8r&6RzZ0VfD$8BtpRx*9v7D>#UsO4jE$ z&tKXKUf%TfRDm}GBkNZ-k6+mGUVPj$UiQsxwhWY7hQ*fQ_3>iMl>(B1sRTS&3?&!)-VY zDCxn};$+=@8&~bschDFMKgfxIaU{PsVax4=^48?RBXNQ?FTW1S=Wn8lu8VZs8~6-C zR+%GAWbfVGyG5ASSBrC*|ylihs^lv~Yug0MjE+b?B8HH!qXi-=r z+b}ptDnB>K4-Zn;^Cpq44aK11IsB|8_5E)^Lkfb}^pR}x0Oj3+!>?gdKyHEGGo^Xi z661JfL!sIa3-F1PI`@U+?8xfNqq4OW1gCC)ZkI?atE#AVF WqNV)}e%m6qdy>QcId?jRw=&MeVc>-nvEol407gR5D$%R61R{WSllGnWjxk=4ta%*>o9c zt6#J%l~0$Gv|+Jg$vSN%>5@g;Qss1|hSE~3m96?zBlkFM*HR}bru1EkF|y^K>T;!y z8`^;BmzW7?B;zFegH<**%T5GJrRvcuGl|jVrIkfC5nYa5z^ZscsvKh%6Ei1PW3yN~ zIpx)eR#lP6d*B~;?HIfC$20)HsDr~gDxQ9se#p%Pe1e(&jd>K}u0@ft_w{P)m%AnUq-7k}{~bsW}Ve6j4)&W~F?gMM>q~OnJhuVrojC%NOPf2DbK$b z4&>8H3R=39bpEYif>C>v(%%VZodT7A7vS_NrGQh#uar^L81;d6Ry$A4YTx!z)Edb9 zx28s`^m1LZru(s$*WK2>-=qgjh$T|5^Z^*vv{LqevEK%zkKch3YJpau@^6JYphd0y zop3hhaVe<+oWO1Q2RIKZ>HPc0Fqakv^LZt8vn+93Ni&rSzCYI70KRJo@82m%1*#@a z{PKUnPokm}*cA?ERg|JBD(N}QC%~3RW?`*}N8&75I3kzX`DlzEPSzEzWn&D~xeMy{ z0M-_(l;P}!RZ2&dg@tmvzd8I?erWQD-r|Sfinq6oymnX}b0o?P|G^&~e!FAvEv6#? 
zwa-D#aPzB6Y%H-Gd@i>_MwABe;kJ>%wh`ZR&}4AXXCM@4AK8N&MhSJvOM8YB`rcy+ zwFi>!m*7oaoVvo&OUn$qNUMtxy|fxn(3e?yWhNeH8K|HW&?>)h(X-33c!FDB4xI8 zM-iu^G18|xCSRaQGQ?L`SWYrVBE(IINHw#o9EU<7LX6EUv7kP&gjBV{vDf56#169I zBdCaBlP8%k%`C36$2o49lML5DO)g8i zc{U-{s@*=vC7865v!F^HJXX{dR21?m%ETpCL0utGVND^25MN$WkwOs3asgx>U6Mx~ zFi841yEw<87bBJC#GM2 z9lUyg4KGG7hgWVSt}MsG*A^F-V0t9D8!O9E5Oa8Wg^kTb!-NWXdd0(6Va`Ra_Q;Zu z3qvb6ICOWCwY#0qkTQ#K4Udn29slko_1IYcUa4So-L1_So5W!EEj_+NUHEPg{Vn+R z?37UD)*S<7Du>D)Gu6^TibY=&zH6bnIUtv`w&A-$Y-z`Lz1Z1j+QE4)jqaV zd@%mr_)~+9CIFsiW-8qM*}^9aqJ@2`(}qcD$1@`p41G5E$zW<=D&27r@w9DQ?OAKn zmbK||m1jq*sqB5K)7dL_C|!BQj*)5)eOC8L-DmZm)IZc`4^9dPC)0r`gliF7J3dZ+ zl>GRIAN_EX$qt+o2F|5hU&AUtjMhVE-6LD>k*5azfcY!;$a<;hbl*wdPTu+9?H}Hs zPdh^?TWDREvJQP?pvrBcd*o|uiQug}6$Jdqmiq{R_W}5keZh|acpm|8-my@D&dBGG{`EIz=sAvV|G#m!yn00cZ=EgUTD_!C zu-6k(3CAW@x!7#Z|As~sh1yScIFBF!+hffdY~0p#yk-sd9Scyl`!`0t?FMhsz#Dk| zH7c%o-^lA1)Ll-#HE&?Fil@H@o4+;f`(8clh!)g6P`>T&jZ;T(>Ur=`=X9{O(r(I=3vqItm^TS=yFnxT`EDuWi5?dIpv|Fpkj`T z1$P*0%4Ez0qhyjZfC-Gh+&_o@WE{7OghrYhLMT^c7Kx1j8DJ@a3|JEhfyuyLWAlQh zQAB(b9yckNGEUE}GSN|c%OqCS+$sfaba}GQp^S6r)=AM?bGLQN>J#gHABOLRH(ED) z(sctcDe4<nWtohxjNX0JyR=qaJi zRPC_+@rw1G!2XMGw=kN$gDv^)5Bm=`j@@X4ysLRRahC&r2oEbm!T z)xj@}okSlbi|nN>uO?Pj6KF@s8R4udO6UhqfQXv-LEJ=5pkVip*Id(Z6$urN(SQr0tf%1tH0Ud6xmf^Sn_YEu~ne$#b z?mS`%n4n3Qy2mEFi^M7%Rm0-g9G5hM#BfTQ5SfLNQE3^_N_yO7NG5e#4t;6C1=lBT zz#ei62$)I+$uyaJnDHWVo5Xn^8S(YOFSq|&))jDd#S zI2N74B13#-F$ylv%nG>Y42L$2!*f$fk8LH>@$0kf3ONxsu}U&vmRI64*!V0qb|_>J zwnSutx!17D00-)`S75OX=QnQgWYZq& zvbWJ+BEkVM@x-)PY}v9Fp{!+NQ79XHY!8dx#trV_;b%In13YLgc+lpi@+w$t+k)BF zA)$3B+d3+=jy^h*Zatr}`JdQo?pSVHQr-ipLuWI#iEVGgy7}+C-RsB2?g7COT0ec? 
zyX_BW{r!T!e{(kNAAHy?_>W}$qk@0*(P-NL>gQF0|IGSX(NUjuGz*UA$1YlIXcv7Q zkDI&2=AMU3FqD@1XQd^zb>HfY)ipaNs=jB(L%Ca1t*>u*vVlP%F!-=9J9J4Hx|9yQ zo~nCez4WoOSM)WD4S_FPLw8R;E7iNIb|`&i6}Tk*hw|C)o>?e|_Zv!IUiH{kleN); zjZXQ_Wo)kz1TC9A4|(X`RrL*}1z-cT`%N5IfWLZWaFnM0Rc#4A>EM`E_pd87@RZ7N zRf;aJ#)-=(S=2)dkb`7~Q<}t#TtkjHx^irC?UIlQ0U@3SzSl z!qW7ohQfP>V!kyvKm}Gbe=`%(Ix=vC zimid#LTZ2*{9OePhPw!LNZqbqytQD|#oP?seH6i2KndQ2;O>R|ijS{2jxq`^hG3;( zs5-FMd1|3`k(Hu2W`**%Kx%8IHS<~-Z{`M|t$Jgkj0~n|djYMgP0rUdr3yBX4^pMl z&6^6x$KV|f+A#9t%eW7V_s9LkKK+^U{Q(Atf-(Q*ORiCC7T&@&ForebKG@6!*q$42 z?9x7vWrdJy%y#|q4lQ54P-MkZ>N!gSy>;^A%O9xg;3SnVXPzs6p1TS4HszK9 zt}H9w1#TC=-B5Z|M!#0USHwGu%OdXEhX?8{6|C3!@#_l5515PTrRs1r!&X*|y@a`I!y%FV%xWKD|^U zO8Mvb_`LMNztt}^?WF;F_wOxLWo>&Ql;b5(mG2pTkG`s22=%hQ>K+BG!v*<*OJY!P8hE*)#m5k)V9ph- zIe6U*#`FE~!p`gWz$?=ke#v-cJ;SyBHfWrU*nsy7EK7bzR!!;}cZ95H=P zyPZVC(Ua-gJ2s{|XT{LNBR_I1q^=&I>3vE~{$1=|pD9-)2l}WsE$V000M;U7$=kJc zZR{$UlgKNWRlqBposHjzAIJs3JEF|>bq)6u-p*HEeT$+X{Q-;#PYiece(WlcPtg_% z)w!A9w-4=3r9I3mBjPf3#COkOJ>s0F47>&xRlYXxp)_FVUu?y z@N|{@Rw<6=T?=?3v+KvyDsT(Oc2z-^sZ(lW8s4?$V%)@PHhfQdbqk|K`f@9Mt<~~6 zIBTe_qZl`_R<3;{;o)6*jRU?F(evWmyso{jMUL_=GIw;dT4Whe*A4B9tqaZjjKr(Z z06MXc-194XJ_4fnC=+;yM}EBPLA}z$!njhJe=~*$4NCn3TH(pO=7CQsoy2h!qpWuy z${H06MBDe(>O6JNuY_QaPv^(0q+m=%>>$`7*so#;6cc)AMa3jy5`|M+&J%9nB3I_vI=%` zPUu~$=e@;N62`CKV&v7f$STb=t8K^%x7NUWc{kHSbiM&ntt3?sDHo<<{{U#2+B>3s zXrXzUdzGhON(-FdtoeA~|3ON8dnsYcRZ431(e5^dGjPC3ZHG08NgY z4!-)w75PyV?jqKlsPC}qlJ`EG3RUyg%;Yo8oo*;13t>O;CR8 znphvey7?xqwgC3^z1q%n<@uI_bn<@g4e0e!UQ71V)dD^Bf8Ui0w9gN+>f1Zk;u0hl z>|er|{{>L{&k}DcaJ(+>7bzU&J_g)PR}X-0{dIwUsy4NttV!Vu@$3JL_b>c`f;s;N zx#lh*xg_r|294)%Yx{W<-G%b|y|~`||Ap&XVk?T;fdXC>ox=*>U^fFNsAR8D$G>2{ zzeMgYfGB1mc-8MDp3VOQ&+kWIP2rB~Ek$48j?4Aj{LkO>D@`b-N8yZydK1;z}UsWHq)LU!757Imy=-p{ZY@ z7niI)4)0AsyaWW_=0_EmqizB41`C)cS6ATjDISNuh!TfS z0`EA`APiZ7>nDux#rs$v9lsh~Ss}%U9)lzx6H5Bj%iDZ-Bq~LIa*$ivU!q}7)=pxm zD$bFz$*3InhjXZVnqRxi{EV;nQ{GZ-F21PBC^ zd--I;q^cpPIL&do6E#l-O-{zIkXZ1`5L2_XvJ6Uhc@bp{Vz|cU+sRQh^iq`R#2fig 
zUh~wM3PE%bAq#^GCClc45k#Ue<_`I89|C4`bY20eLmzoSb#{gVUHL0~5_~1;U>6uHXX=FSmdX+>uMw$_!D1)G+c*Dwh|&Tr{I(FA~it zRX$&cF^HBcGjJIWh6as_Wa8K*P&klxTQ0aA0#`t_BTCLRw+i1Gka$;!=Ym^$len_XC;)ydMlB& z(t?#vm-+6GW&H;Q|G}-YgQDG;wTHLtVF=y0Q*pZj;+v}{HQNqX*3l+7+91jULN?4) zSKqomV+(H6{w&=s(A}GT+1`^v@5xlp$y{kgMK z$k--DfA_ln3!4u(QR$S6do#8^(GyrVi8e>p77%QKjICqa?zuB^dn9A`KX$innlkRe zlw%NpT>gz$GR|YR9)r&gZkgAQt^WYTh*3oiTV)L*BrHu^ zmZqP@HwJ(DBf&E8g{6tarMC#y7VKk02#jo5M+#(kRF+}8;AmH5SXr%-Nho^Qo*g_T z44%pip4m8+ab8T>E~YL_$%!5Ptls2;F*F~@`VR~K!-Pqlf~^yXY_3RId{EcapKTlz z8uM{OjjyLH4PsMYwrNmk8Y~3y0|*TujlEglLBV(Mp(A@}N;ou?_FYU_>P6b0vUo*z z!>uzQ(8}gi+u)XMP;|7WI*x2Pj)&qt7{-A$ssNIh-`Zf`2&e79HalWy6*PHyM0-Aui)-YSNCn6 z|5~T1KKP4j_w7r6@diX!S2n`Lsc*}AI|Xm&29ph)5JD%?-jkne1n;S=_q^aepY~3! zn<2m|+YlBS!khYR*Q-L;tLcXE&l`n?v)P6zpHv*@AMk8d|L z0Y;b!pb-8on}0(C%KwGUFNS(?mUj@rbqKbOtgT0|^=$TJYy&E94nFi}2PcHViT`3d zw`)|vV_S}6s59MJ|A62hP`|nGUr0?}6#SP`HlNtg^yfKh+6L1oRJ$>%X5EFaCUy>?!xax#<%*%frnX7Kd45=a{$N@ zLB#c8p&rDSuJ2zzDY~2QU%CHI${pVHgIQW1E6~oaZI_2=Cm^n_%{sz@gGhZ$=o(8q zj(_eD9B1KOK;L6m{fCx&mQ?e}jO*1xV!=K=@dV&AmTb$Q&@%WqF!*RH6F8L(ToeKq z#qNF}uKkh*tWkU8GlRifzoRSl*FDpjJT*I%2?78;>MT-OdoJU64Z7adi!z5qr!VUa z2u@xcsB?joCEavkA*mf0iB+=%I(BY5M4+DuEtb9~VRY zp!W9}!O70y-)l8p6_!E!V&kMXm=ju7jXO zIja`hvWG;wKV=W1G6#=jgU5v6u}3ZG;P`sESl5xQ3k!9qX?6YUrC-`Q zy+7p)KLZ~4HtI5tKHx@W4HkRBa2Tk%dNglJ0eoUsZpS5Dv<*MfXOB(^N2j3E@(M7) zHv4*?Jgw8#4rNYmSx1S9X%%d(SzAc3g*Ld1tp`|8-_~ftZ=HRjI1qiCv5e)oy4Hp_y&200>W=S|dewdnKQd=* zr$NX6-0-!Ls%iS@hq)^Be2zTR>A>GHf=ZC;ss^Zb`JcF&#m1Iw~W6K5Ii$x-pgw9unPM@U0s5!YqLJ%I=bD` z`?sSHE7E-@Q+?y9mQ(BJa#Q!dchj743_bC*?J@~s2q@Yuep7)p@Ybv!hmnqJE@{@s zp0`I^j^o?7S^zh|L9#XpR`fsmvK_Ao9j~MzczH%>J-KB)DIPtKM4~V^^x3wg*<%5BZ6Ph`vaXCoQgQKAp359Ydc=DR0W7sLyjTp-S@#UofeDetL_ z<@A?ezYS3QoA_n$jsHX6D}zf1ss9wF;rX9!Cp+Qw#qhBM@X8LnG7K-NAD@5Mcgzp3 zzpot)!t1M&3Vc={YlGK+8>xin&ntEKtUlEUum5qdK31#ypDqnN;o#v2bTkpL;}_Rt zYY~ymkqB%rR~OkxM6yI8KY+7FxrW1_JMyV1$3sdokf?P@e;Iae9N9MGKAAg>4?HB3 za$B8NeVMEn+ zP3?{W-Z0CmT6an@Wu(k4PfeJ*tT|a?aBVy5b_|$+HjpeONh4M5+A)!&nesN@pS*W= 
zr;KF54fEe0y?11%oMbDgy2krW_q;n+lC@zsl_Xh3nQQLq1(WAjcFdD+ZU}9GYWQq@ z8NQ4l|Cd5MG;?`Y=9YncJClUikWHlg9TfHtdr5VJCzo=tM}sd23B3G86NevyN3Quroa{BJg- B;-UZm literal 0 HcmV?d00001 diff --git a/entrypoints/openai/tool_parsers/__pycache__/llama4_pythonic_tool_parser.cpython-312.pyc b/entrypoints/openai/tool_parsers/__pycache__/llama4_pythonic_tool_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c52ea0a39dad86a5fcea5e54f58189d274b0da12 GIT binary patch literal 13678 zcmc(GeQ+B`mR}FPh;M-SAVHB7K@y|{ep;eFsSleXWm^(Oc}3c5)6yakW+;IIL2l1b zBDr9q_*{3?tuCVOyeBH>DpR?wGF?^@?$xDMn@Vb(+ugdm+#dl%DuCv=RdU^468{IA zyj$CKmCbuS7yyK!U2m#V=^}f2UcY|*^}74_UJw4pW-}8AG2M5gO?wIXJA6@tC70M7 z;t07*c*1irGO9eeQ4ZcUG0lW_RLdbv8`DkbNA)bt#SEhrK-0yH6Q)s9j%J>)j9OS) zAG1!_Mr{-JQTv2r)G^^4bxu@{R!+D^T@&t6H{&tHJQLngFH2X%swS#Ot2v?}Lbc%g z5ZZ%bv_?Z-BE0c!!kdI0A8K>dT1GVk)gpNF)H>cOknvuHr#R+qf__ZL+uzoW?&KZ7 zbG$fLNdu1?c$~oF=PQLqcyAIKg~l;W4KVQBTm>PJ?}Geh-VOO4!OeSx0AIy;ojDFO zUkw~Bvs}>k2?`zxa)C4A_89d3c%1Y2_J8y@72mCt)mtbMKU71h0@+qaHn>RAOck*s1?^z%-+^Bb$1U<5Ekd5mr zY$l?Sg3`==&+LbQga|09UzrC#sGeUb;+H_b{Zka?KO*>XI{e5e`lE4w*sso$UzBEJ z0u%MBDEK=&VP-GIgwF7^G}(!TIwMymqadI&_XZ+5b9I$WITJGXyBFTv{bsmxuCH_S zV*7>bZ5MWLdp*|G)^R~>3$~R~FNneCF9gfveSHCKC;^4HU$`inBz3(ovyh1eLyiIq*E zP$U)>#ZX8@*?#{GS_Myvx<1$VtQqW5h4lcNhzsQN8)U<1yIGMk`j%=Q8=KkckPTFj zrfEE)ZUw+dFwMGwntGQc$T(6C1_9g%5DVFfFzGRg-zx)X9l6ys-Ym0jCe z;=C|3*Z$0=f0g+l%CkdsgFJP((~jnpqdBQ|L4PeAn-+$)%>>R0>w*gs&;>3*LXc9z ztkQJa1g}92YboJ6?RD>7Mh%vrluvi zf+}Z01e>P@6C_x34G?2<$BXA9$HnJWhx!wolrLYP^O_xGo`cf31(pO}%j?w5YM$fu z^I8+mtf7PrUdORDB?0QoZ;Iu%!3PFaHm{lH=Cui}XqT#0A-w)F^yz^~J>-XpiAbp2 zER@?23(I6)C!waZkkE~zW|aiq%r-y^uTkaC>!n7ODWR8qYI+=Jyd|^9Ej1 zJW^7-%FEkT-nY1BBH?miAwXysM5Ki)c93zUu^P?@%}hf!bTQ$?R5h3QyiqRQofvBBFVN3=_4spmXkJGaf=1 zaxe%^oo$EB7@eH(hy4)+X9<3`(EL;orzYc~(3LO#rSoV)Ig>&nKoKc%eCSJio+QNi z#BrIt2uP_yicScyQ0ESw7cL7k{@km7GQ##G-hcgy5cdQ8id>1tFBi@|I3OiSpz*G` zp08DK8h{xbm9QW5`ddWU@2z+miXl8h1ztAe8+#GuipXSKLVSdC!nUbxdo`|f+~PN1 
zf#?j(81B~>VU^5tQhpDC)tj49Slv2cED7p-|1o!$)ge0lKqG6qo|8GDc;A#wM=uKq9O?17I#2&pBXyrz}>=;2L zqU^N(iD^;tUlKq;Vo^v2VE(szFNq{0&# zId%5rxmQO*!zcUC4Gj#-))7VdSgT}%g4gAW>tPy)hRJ$fxHNs4;xoA<9<4rFR~J6T?o@4e^0^m2 zYFev3xiAO^a?)6r)mTjSr%q4W8Av$;X=hu?*_QG69|Y2EBdNBLtd>-^ZxT(VD{COW zx@?8EvJPmr#!r338Q5@lXX=|)=)S(iPjnyXmhSzuZjXQ5FCDyzh3omYX z>y}=7@6CH}u2kOp*2*_m`8DqmpgL-nTEE}?{_gv`*B$MDuKg61IkV^B4{QIdc2f)K zEuz_D--2rQWg1(c`lj9Q*Z!N@O}*A%^>JgzQs0)A)O4)$J@$2F8aqBka@I)d8@Fr~ zK3A4hRJyiYq#8PN0;E*~1v>Hh)_&rw`;6$IiB{L`p_@a=+L3kZtN1c_b1>sP`BCB@ zwb%8|!tswOCEe~mmYz2zj5oMhk}zI?!(rYuZsFwE~b1*i=^7n zBw-mxh+PtRJ%eNhp5!5!Rmy{ngjLF0(&K2$lmzuSqICt3$X5U)D{DJ{6_SaT0%YWk zYOCMko>l&~tq7diBs_9hNZ8b}3OJj$&)Y7Oc}Eer%tNvPvdCQoz{tkZIbWG@0=RBc zB^91mVXP^4s~Yn|`B5zB0F(+8xYd-}j)e0n?8daVJl8mG*Cm0^&l*P(&Z2tqE$7X1 zaHe3pW;js?emqOUe2trTC0x{-09v_TKpD`Lh^EkkgiF*X%sk<(J4l@4P4jM`*nnbX znqXt?)+gM&eGXa(x$f~i2re|4cks>+p(PmR-391dzP|}4%1+=|yFbjqnt4ycbA!9} zmkCF?(MWjIk%ZCkjORfttB^%d_V3H*jCU2d)D&?4)3e$(qnTk|$FDDqjj1-G!3}kh zyA{-tgo9Rr{O8pA3Qy6@S4K&cyNs4?-oOF=;@!ycz@_$gm?X?1>J500O9QXJIsjT# zRFXFp?_;wt(+v9C81zT*!2&@4PJsRg)c5hjYO3($-D(TAaiXzY68NIIc>@4w-VpE( z$=sW?4cNi6fI~siB>Mk`hbfLA#ZDLFUW$F6d-=COykLq@Av^(itPJ*j2gMg--y`L* zFYU#~;Am1*=1=fsn9~F=e)7M6^{ZdWTu|obWp0HT zad0O^rKWfg(_;|HMzFdRB(0}o(Ks)=LKESuLI`|0Q9kGBWiXe%gk@gF=oChXZ|P}_ z&R{f%5t_dAEJU(Nh(jl77>qI8M&&A{GXfui?u#LCU@{ad8^TlIX67m0Apkf=3vyag zU~!0{2id3^3(RFMGw4x-fWws1W~_DyBUnC!p2O%xh=N`PJ&z#aRg7N4=$jb5j?o(! 
z!A1_STInxf^H7-Qp_vQ;nH8#%F3{;Wu-ti!5NNYmlr79wWNj8@J&dFH05iTRZRzdsaiV)*N;`G>9bOsFzKv*Ik&kVGeMhb|p`zkIkXz<+r|0~6xfhxuS z3Hv<&djOtwX-8|y(Yh=GMqV9E?>m#)cP3-q33z+I8L+X(zhYi@_av=7S=~9#^PZhcyZC1L5@Jm5lFV|Uw0e;j%3IAl~}s#g;duI z$-Zx_b)8Q(3@=(Yn)k29)|&g1R(}Q@i;nh8b#12QaLU@WaC*u7iyXk8Tl2O(6X3h+ zm&U#ydH>4&E9>s=KNkVuA1MO(M-bp2LV#Za(z9l=!~b6GzecFP!~StiU=g8yWnj7Q zv9mQ(6Ugd`tvX8}4>)V<7tU-ncdU%2n-AS_Eb13$GH&0}zI5&0RPEkJ?!6gL-P5-2 zMcuk{*M_@lQA$_uO;zt*bN8%HKXM<es}Vf z^NWrr)w@?lA8J1M)88V}ro#`LABt;D{fmwbUwghO zt9oe3L*qv+f7JknOO0z$4`z+0Cgl!fJPm12Ys%BQJiYGeeCn%z&wkII^6kykHNH1= zZzx^2FNJ?EJoVM3eLX2(&+5rFU++WV$MGM<)5l*+!JqH7$G+a|^CWQi(?O0nYnS(> zn|mKM_hvc{K6EA9k0#rWv9bEtv16mUdF4Cn)kl)fBTu{y%LD1ap;X||!#$4zN6{z? ztejY{?oK+ppH|m?@!13i)A9LM9Aq+6$p;pdYAT&xx1PxxSm!`{g~w0zuG1dfPpkXA@bZ_P^$`D+r3&MwJ~ur6`am7T|J9?%xTfcf zL;E*&4&q=HMes-wdsFoH$rYgxKN*3mI1LrhX*4kvjR~@0;M@p!B`BKE6b%h_Z6F)r z<_4}Dq*+-DcW%m7X%Gc8>SZl#nH1f^imPE5Q)nWx_+scu}nba(vDcveG9=QSCl`L^k%3G~0(aN}Io z%;^Kkro;b0;0d~5uxB-pTJHZ037cA#0E%VsrS&^g`klzovZBobL(8fT`C5>#fzh*u zgL-?Wt|6<%XWR4a`P9_1sfYJ0<-Wz~eNU@*Wc8Q;p^&U#Nh9(4vL==^llqqB;d^JZ z7WQf-bQHuWRQu^V~DXqpGBOcvV|Ko1uj`c_jf_rYW)>HRbBgyY@9ub76Gyo`Sa&*E(z4 zbroxCu9Ki{Xq$rI`bGHj|IXJN)GGLj`4^4-;6(?guLO5wcmXGKSHFsQcZI!Jcd5q-j#-9T|~LnL$8a+kEZTg01~gV7WC7ylbXa0^>e zx!C-UcsFq;v0f3#I9#`9Z_Xx-z73OmvHxB7d%ku@U7)GhVD_wv1$mm}|w-ydK1?b*_6o#u~yd+z8Kxy3!`xwPMY z>E=u8-r!0j_$E`8XvfjaC<&@@o6tahnGFoBo?ERtQrH%FP2s)}?qzCJfn{$&^V=7gAqnZX!~6P z-oN5&HKI;Gs0%D`b}DJ`@|)+b0XBov3Cos*M1J3gtF}7e)iB>!V}b*JZrQ8*4|5WD z?N!)^Mef!?Deo69_<2F~wcyDCvyy%tS%#dfi$${Xm&8n#@oUR6H3t#iIc zs0%!-fBiK^|8K#=R`zoTb;Rc}A9cMXfR{3d&cw=Ra8#-s6n*R*9mu!O#QJqK0bgT6 z*}TuaP({IK3%+|DeDsjV3`IP7`Sbc`aO8b|^M-^zVYmuk^L(W(hTqth{09oqmWo70 z*|y|I1MU>ca}?TQN*EKSd|S$oZl0U!ag@-f3^T#e_WT$sx`UXk-0b96nCN;2hhCi> z5oU*X!`(;T7hm}F1XlH7JOp4}DfJn^pRGYJ?Y#Mx9jMCaAyuEF)&NzRU!)py)BsRD zY^0Iu$x*w3>P0H7jQv3O=IBRID-&ki5EJGxz>LAFq2sub$lPxFCVPHI<}R}2+>WzR zk=->5ykCS*Fsb0b1Ye9^6(o^{fS3LWL~~U|T!7WU3wT8k1(E(UGDdlJ} zP%tl}QnY%~VG 
z$Pf?UO>vnS!S5GB04JL%Bbw55@l}*RS`iapaMc&cuk-Kh32d;a)e+_Ws{7o?2EvG_b(S7)p>X{$eFg{;Tc z){L!karpP&eB$;mpG^n$rvm%e0tX-VtObs5R0Wf5FRoR+lytuI8@NuaRrM#G{l9^$ zf30dD=^Q|=(MPt1Cyu)1`W2dL+`Gy@YV1W1(w;v&yzE&PR>JqA_ZwCR9yK0%>hV4G zG^RZ*DNoB|&#otxJC~2Gx>C*i9-33lC*TVGsk2_WMPD9y>}<<+6W^}vUSf6LK63NO zcaPl|$W&Nck$lEm7ewbZm_VOo+lNarTS%>s%cw^JgV7~sfPo; zXL;cL)AvuWh<|!6S-&r7+>vp&q}}Z)cl%2Jn)|s0W5&BH?d?o?J69rW-n|RvjK#e; z_sG%+0H9P}Lvu2)f0;*Ayz2hb$z;O;xckpE^rRaOry9TywbpPP3I2U)e{ag)3#0CT zK54AaSQ?VehaOoDfymj#jJ9jPUPz!o~|47DKwbTryZ=TLg+HXzx?wf@oSud&F4GtBn zbFnRDsaqOPT6QOOyA>?sQSZuQvmp@{d5dby$kVomq8A^xaGoMvR zl*86=L}_Ug96|_|N&`pn#4i1&~n4wcv%fED5@K|#9v83<#g6)Z=VZ%|Eaa7;_&du*+ zDr+;Y>J4ve21I&WQ{L9Jw>{-;&+OO*xVy@}Wip~AYBZVA_h0|+aI$g#TK$1!{h_4e z@Qs&0H5036tAd!IdiT=)_4cD5&8)Y-0xnf3pf%_X;1IXFvKB}&g%7qW*G1ohj=(3$ zlkg!O`;mp%6}}W<4oukq<`0}26t!N~Q~bh@&S2UK$3$GjA6CE##N4E^34fQ+g`aQH zK4kId-t)gx=u&>C&;_1UX%ebaT&8Sx(5FkEJptvB3Um@!>>}< z!9^2DWs2az6kY)_<&cd>l`p%F)Bg-KRCDoTh_YIa<2DV1v-~Zw{G0@SPOAQnxPDHa z`+MU1rN+o^-JDUm0f4r+U}oNJ?L`rH4#51pfzl%S5vP literal 0 HcmV?d00001 diff --git a/entrypoints/openai/tool_parsers/__pycache__/llama_tool_parser.cpython-312.pyc b/entrypoints/openai/tool_parsers/__pycache__/llama_tool_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee712fc2f26e36c20e5ece4531a0546e3d7fd03a GIT binary patch literal 9198 zcmd5iTWlLwc6azDHKZt!5-Ew2MsHD1+mbEGw&a)W#I__qBs&Q=2|;s4GHJdtGmK?r zn5NqRRnQcXw=FDx6p?`iBEtrv_NOXPpw1!%Tx@}AM75-66~NndvB)12D@dAtwCB!n zNLi7cv_OGgoA-I{x%ZxX&bjBD(XXvm3xX8U{YmIZFG9b=4K?ZWm8ZXg$}L1Anus7D zE;SJ%s_|(ErBCbA%40g89!hP*5H zsbM_#2xYgvS`B&$(U$iSUBXn{)8>2YZV+DbOWc_75|U|>c`M1pf(+F4$0CvVb*Q+` zv&gb$ob#)7!_1b|DV+%?W{TwsQgta57#kjaCdle0V<2=aJUrR#KTI?hG| zc)CPBgUFc3nt3!$ z(AxJ&xOyF9p!JNAHZZ1X18sa?=QGnL=rhC50>2W*!jx12CYmhNtc98~lca5PMuevA zbSccY&ZAkvXPZP`yW|{;1fqeyGjNRJ!RNlTP>^s3g)fGJ^8r05pdTf}Q+++NR3JuE z{ZuqgGZBso#ds!0GqenoN=?}sM`*{X>0zknC-at<{ z6blC^wX??}PrJkc!s}O=7}ed)!~$0$Om`s3$Gb6>?%>sUD9FHw(t>Sv=eH?i?2|8p 
zinB5%xvO943a|{tF?^3aAeniUDTA8)z2HIzKg{G$4Ilu2LEJ(-?v>LA8Ug(&2(dui z%L@mAW{`mVh50lAt-Of9y#Tx)W>P}FGgs$`|gzbGh5{dT0X4xBwILH>)8&7})gcJc%_?+vNxmr1{($$l=!B%6k*N*KX z);%iux&VaxIb`3xhc->t4~(M8y;!wjYRZ{V@7{UchN(W=+=@$2wo7g|>`ex=paS}=;LEeoO=Kc2fi zmnAQ3lD3ade{lM#UQ5B?znDBvVHZduWi8^5C;3E@myB#)=NnjtPqMLKL6iX23O`ve z-v<7=1xz2t7oJuT|f#Nj$$GnnfgGP?kGWQQT8nLHlB? zW+Z6C=A9To73frzkQ+NtYH^*|x$CJ_%}GLzCz_n$^CBt`lDS z+;1)Tfy60qX5goe!kR#SFga?PjYp{frP>L~GfA-wmx#wWMtOVi^mDPu4T@1r6^=SH zd2Ryc;J1{Q-`CJgEDn>PeuLsby~UF;Iux6s;OPv-;JKX5KVulxWHOP6vph$E&bmr* z02W~sjQFLFCL??Z-e9?hg9i{a20XtsbpW!f# zOF`FhSP{zml=Zws9Fk1adE-pdVO=YkLAxcRa1u_k;Kia~FR|QRLL!+MoQ3fRxL_#c z)v*0=9qfL{yc$Wz-vCYD2aTzTxvrCNVis%*l|uOF1tKdnE|An~XYu{ASg6<*$Rq>D z1lZtJNk0R0OTZ(Nh=h13cs7)fbh729myI>a5J-S}rCBUsEPnDNJ)MjuILUl+HpnDo zlhL49a*~-*yS!$$6;J5ES*JV;%u86%gmL>}9Z4I%5|&802{k6jbL<`rs6ap$hqVO0 zJvp)Mc(6Rr4~k~Tr03?^c2(#<1UwE1xJzJ{yoYk-sI20a?WS$1OSJc_G_88XeJ3{^ zz1iBvWp?%OmR94m<&ef|%UJ+0V{aDi%^7>AXzyGeOWTL$^^Z!OyMmjV%(kq{bL*X( z?=0>8vv=oTe%yEHZ$|!VG!XOJd)t^k1G`>pL|+wpR1fV{hmD zncsNY7fg>_^;xH9v$FbD_-1&?ov!Ry&}Qw`8GD0hZ`gEtvbAlS4c*y>=B3MbYM!)q zd=kDBe%u5H5S#Yrv)mQc&OO=MhUM^s=HhYxj1J>eU0>H+a_6_Aw44LS*ty}L5@9nmPTL8 zG%^C*_Z;S{&#cX(X7t%!*QieSE7P$<(D+=Z$GN#{bWr>G00FsV#TH5^p5(yNo#iF6 zV2j|E1#D(Mpp3|-i~@*F53n;hsjL89=qk-mQ9CAr1!E8PguPAqRs1FJ6TFI=aNab9 z{z8Jc4%=iqlB)l>+TCZ-1v8f;`GVmpXzy#(ner=IFk*}7Svz0YV_5cwi(|MJYk2Ya zw$XwaTEtL_=HQGelNpTflo@^&cy<35=VmL4@7Ied6a$wmRU(vZgD=qF8c>#<<5kxV zb|RF}s2abmd78ftWr|E$1Y+w&Y4?X}lo z)rW#DjHP@>q7CZVQl&yE``3J%3J}IRdq)yVX-mEQ;oRxD{!C|7w7te=9!^4%)T?yK<-r&5woe&4FD8{Vt-7E7?b3Rcs7 zySmm4alcHRm2$uuKlz-qnpq7#t5A;lH>eG2Fki+X>XmZDG&tWzzak%)=Rl``i&~I5 z+5)YDt-t#YcH?!rT-`U&>&3Li=h}szRT_YFI-aKux#{O{mQ!6l5>NY7vAMV;cEB1l6=l)UoEqJ;FdJWwucyzFbK@}6=>3#;!A$6o!(r#6hE;-PnnM(O0JpF+G}kQ$MW`7j#^gOU?AH}NDcM0R_*ir8u{cjLv3PO@ z;;z_AUZ*S6smx4F4rI4NtL{(dH^=U{V1GMTe%R4^k}UN_J+@oQs1{7LUENc#MiM*A*wNg;rcUtfT-IOQcuFPC>yA zK_+QJw2Wu2Q}Od4*rhml)K&0U!Bth_bk8__J#)j8*FuQ|?hIZ{#;#ET@Q$%#4gM_^ 
zpH@~bPRx5q5c8)P>`Z#=B#XRR4hN_ej}g0IY%R{P5|p$!s;8Ztx+rNQOpL|P3wso2 zICLU8-~x)VerP~oT+#)>OP4GK&ybU>oJ_+SBJ$WGK(!{$)d!=un)>&!so=ds;FE>#6UFZsl36ASI3Kg@OE6C|f;)^`+g*|TXBx>0-U$w!=AC=B z$tfR0AovL}R;xPh$2^6P4i1lGRYL@RIYMBQqqr(0Cl%UxR0idX3x~Br-b4ODp}5SR zA9}*r|Ar_pFv{HCpXfXnrwFhjw*u?{ZgeHGR%OU0k!)HT{bb_K#0J^7B4h?8#DR&2 z+{VD<6G#2BWy8_8M)u`MRM(iP9bB&+{B!*+^DoSQ$NYWhZ$tmI|DT8cap+-(I5IV_ z&uK1d%RIMLefRFW8?KQpEvj%Y>OQvJwrx1u z9#>W8jHs*{AYIfio0cN!#=iB+zRi|_Rr0}W4}5Eb=hrU2l5TnR=ETC}Vq4Z(pK-Q| z&elyg_{l4w2i}J>;_*vs-pgtCE8q)PR6-E9qHf7`zi*kpHzYRni_ZRq)6nT|UgAVo zN4CBLJns7a5Fx0l&U(&1)k3)jZ3F7+T^7X7!;9L*v!c5_>ukt4+eK&lvSY*9ovSHs zFJ0LO9{Av))#3EO>4(SG2gVnyi*%-LpIEnV-QKr0c?sNkYuy%!?2d)ubZNuVh4s?b ztg8kh#%0ch@wB~h=|bAx3R_I2JBJ_WGDoMxqf=|Iyeb}jHQo6tP}}LwINC%<+w$IZ zM_0Caf2MgzY#v(eOgEoe7=Pqy-fZvs)N?`2)Sqb@5t~L<`OJ|? z5&kz#rRy&(>ay+oSLhXHt^M$#^@+RgWAZk+)-t@B+Hj9=Qe97|?rcX-rej3x7}@L| z$?iJ>MC@t(ee0e(;Z=GmoNheZNm$?4SRF8rw^B%S-VpS z%Vg~PMEky#eH-=zoA%0ES^^t?x^t3%nYG?`Bhfy6M2C_h8n00FKnr`1{7LI~}Xd zOC9OPBU@HuZCy@lZm!rbquRouzxv((0+1&ognLB5eQxErVG+$U4&$f6oEr-RH z!>d!7;R_=CZ@HLhxh%F^&bpg`?OH1rCg4;pUI2B;*Eb+G4XnEci#K-L`lzaH)7$-N z_1)@~iM0!tH@vSb+Ojq6%MUmjLgn5Tp*pico`vQ|V;_uV?dRaithTQVAPBh$$XM?(nRp_(f z<3kr*=<|VYX#7LV$!aKmjZRfUA-WDv8TfYQ2MYoF)Eu@>j>=k{bwX9nn()99WZ%7yzS0=CD)98XMZ^zO&5>YN@|Kz~8iC|6COU~^`f9^Q{B7S+BZf zZ@v7y!&k{TgwEwZTHtyDSAsGn*q;HIq?f-Jve=)Lu%Wiotmwg3MNeD~KXZz0Lc-rP z#5mcoU^$uOQVsr8*t6|ZVUPTMJ;{e6oNPQu)DIw(kBL1AHqOUkf68TH%R{*x*$^qg zO&oWVqvF!g^N`3!g<=Qc1H#?IUtMhfD9PIJ0Xu%ud}Y_Np@Xe0+4z!5g&j12K~6u)?S#>>oiBGZXg;WH~KC5Km2rAiqG(zeCl(LFHedeP5tG z|Bi;f)tCs)w+OPYaQ02P2AT#PZbJ5r1rg1^H9HqxO`Gf9Gi(|DKufq64}ODiDKGrr D>uH|i literal 0 HcmV?d00001 diff --git a/entrypoints/openai/tool_parsers/__pycache__/longcat_tool_parser.cpython-312.pyc b/entrypoints/openai/tool_parsers/__pycache__/longcat_tool_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f1b93a43f98d64564ba56c0c281a54cd26079ea GIT binary patch literal 2024 zcmcIl&u<%55Pti^&c3Y|^ 
z-9T&^sfS1;0#rDl!U>K==^xND9J%1Cm7`UWK)`{UTZ3phF>lxFHI;?Lfj#WZoA-S) z^JeGGeix6&5IlzXp`MK(^fOxo53IeXTL5t#X-MM?RN_GL2459Q0>^BEAyy-$2#0v| z8q&mFq(z8uAo$c$w9YBf-`L`!!nqT#6Ivzq^OvZ#Zdt}9mD+@YT|R5p*R3nW)Ek7B zeX9TN`9a-zSbqRn5HEuguW>}!;P=o5r}4XTN!&v_Tq$xHDS~@^%`z)x)j4md_GTEz z9d-I4_<#n0j!^K*OAlT+!vbYV< zUuuFQ1(q#2&_a38^9mkE0&5d(9S!tQZY4rJ&;Th0dMGvVW@Nx`Kdb-bmsRMo=P|!W zf^G0S5$K^TqHBDaub?vj)-wpb4>cQls;Q6>XidJ-ul=X|M=23}Fg}j=5oqgppoj8* zh&UE_uR{);m&bbpni9R(NURus>tP4l8<$nXSZz$KdAL=GTGcmQEk*CBRDO=T= zZV)$Gd}ICW+L}8aA}Ohm9e2EHX`~mD-CH+G%rcnVsSr633hSEfJ{4~D!+9G~jdFOX_@(V4WpVTrbFvBOE(=mZ`FvY>Ul4{nm4!Xe|}cq zn5)$(y9|vJ!-r5V_#g~qCt#{Pyo&Bg@%@pOH2L||9ck{z?3q3Bj`ZwdW|hg*FDGWd zY<$r;lrP_t$F5!2ztD{csTkWzc48=-Yb&Q)%IUUJY$?S<`Dj~SXvqt=UhD`c`zqH( zeAdG*bfPG;*iN5nrBAif=UVA=9(KMZ&)+%**vu;ooAI#o9U0Bcwv&b1$pQphjQx}> zd?EoT{zO|&wd7P=PPgRrjmaCayYdT%$-;dGVi{#RV;*z)c5)e*Szs>z8*>?$EWf+b z!lMUAz{#TrF)-L{IDV6xD<;Hm({sg1QHionHq|PDPL5#=owsc;UB>vGZPoC1DEq`- z_1$>yyO)V!V61R-nw6W)@RwNbIY{KM{!g?9~Ok4^u|m zh}ZU%jRALw^+#@?KXQJ5*9Yo;|2*Cw?Kx@nyT({xGwf??V~!kT2obS0$elg0(pv}wUSZCQr$N-5%X^`G8NHC#tVtpmJ_Mj1cY@bQ#T`(Qtjx~nat?o!csDknOjVq zM}Txjsvb)uGcyy*saZ@Nok=DoD@mG6q|-B35|WL?E-o%6NoLddjSM?8n@KQA(&^OY zMRs8Zi%Zr66&V!}=!F@>&(E;w1Pd|CTxu>8PcB}$LZVja;>E>liPYRmf}NE|B)ipx z$ix2yfqw^!kf3nlr?reGp}VYIqb_S0?Hkr!I ztdD$*4r0nyhb;@S3N_CFF_;I^HXQ0J5qF8cq<3hB=m<4Wjjmdphu%v33T{JJJ{QAwSDQ3(}&#tyZ; zRE#U3`DrCo^p)MAp>9$U$JB~*ctOS~Lm5E0@mWCVBP*`qt}>~%;kndxz%_JgW+Cy^ zifOn@j*qN3hL>hoNX#VILZ)@NOHCS?U~x-I)JjL0#=^1a(~+|6c@DH!fK!_E9d5Fa zUu<>dix;mZ=!Hclk)%}=NiQs?GxVhdy)=_fCm6_}GXPaYr}XS%DxG1MXETc|%`CHX zsVl`KAX%}Mi-?3LXKv7wy`&seg;WZlmXi#fTFlVN#o3um0_t8&(m0`XIy1v&x)LdD z52#j}o=cHt)o%G#BG=EJoun@;&&~oJy}X=E-lXRimKNEJTuq^kP@48dVv-@fyp&)i zV?0jiVqB_OViVWn>I%kZSXQbnMoIgS4Q)%#Vls3{NtCBGE(1bkm*;LsjzVG)%%RS1 zya3}YrWC>IW|vtuk;=#&!rc;2LsYUKNS|Y*I+*DMWHZp=E9p4!1*C}NnoG-NarWd+ zB^b$id1fvNurSe3SIfi6EjSLmD{dWz%LhAmpJyTSPL14oaR(O 
z<+STF$>oG(kt1^3#<^6S#E~(HD>X`vnc3Oph2QR9I+1}oaHcQ^)+m#Zv}{7EoCWp_rbMb3J9qKq*|U-(mpxg8(7o7WmCaJZJ69F`ibA5rpv`z#HU9w-wO?zY=SYs8w0z>(4z_} z(*BY8j#;!G|Ka(6die)0Kh$X>W`xAN*6;_@?@f!==!d61I(_Ff)3mSERQEm9>6}&jl+ISQZ>A!h*x*N1cdE8B!oU~!fiL94 z=ShoVYxINp_vSxHy_ed$C>)vOk4)xUr!Xt9XR!;GFmDOxEZtjULjO6w|6H#B#d|eg zY@of^z#}tdvj2_sO{-`f*mnJs_Xl2K@Oggl`QPibCi6pBYf7qM6PaZ;HCqs}Fs;aO zBVz*MpC+T`&a0d?fh3N?@6D^6buo|8GP*|U@2LN*1-@Uu((+6qm0*#k%+SlJIgrFk zh|B3QAP59ePW={{Q9Lcn%W%#N==MvxtBD(TH0&wJN@Is;b`)HxE`Fs@X$h*GV^*SN zH+orE2CEE!^EGO(y6)SVTPts`yu%6gZG3%OR+p{G)pz9TI&=0|&KP6I0Peta;gsMp ze&~i?;dB8j22uGVF?ITg0HRJIiD60d5phJF!b8FdiYhAeM?{sPaDwfrOdP4RD4d|c zqY5XaTv0f|yrOV|F;zGrOn*^0!PZsb1Xwh>(B%?u9;ffnr)&&h(R2~RO5=GX0JO|U z6o>$F6q!Vf@KqC2EJ`ww8rBaUixsjoIze<0HL)RdTG2sXmqq53ji3`nryZRRbUM+A zq0@y9iaoHkk_liZ&O*3k`>@>*f6QJfOR%nD`FFs9z2XO~gV?I68T|$^NU)br^w1c0 zi!CI-Neq#6zvw5}tVIljFrOAZO){Jt$^-UNiqQleHj7OJxiu*1d;TDT19BwAy?aKh zU~J)yErKz`8$+KM+sNLOj54E=VQ1By2HnMdhJO#Y!MOz@^Srt%A))XoE4HeEe1tyq zF{+@=Rs)18JQu-=5NZEo>d-3U09*H%I*JemPQ&OJeIs=NdJHJUutFE~FoZv+FSCma z^lQtDz^K4hLMAsRY5bC=?YB6(-~K=N0F4+hYe_N_x}JW0E^}2f0wpJ*h?Za!7KNBJ zP8eOlE4DZ=KB(~_#Cye87M+xWd4XtGSe`Y&st_ia}ACVtGTJ=v`^d)kk9C@Q>+#j0m6C+TTN6(ZMmp{A^8T# zZ-aal5f$@Sb)2rGoKD#c*9M>`T26PNv{p5j)LH?x*1(o|!p3DPuj5wHGDd){gC1C5 zKFBw(qBF`g33dR0ijnd>IujU)BaG7Sy1eW>)($^x}2uhVEtRbd1gkj_*FnkE|z=rWP)W*0#Mng!tvHd;*5r$C^af)iCC zA2vw`zNAMHA)=KkfhPqfot1PDfOth}dEpX74VROPGX&rgNRo+}l%&T1K@jd8(?#e5;oDgT<9d`r5Nw7?OZGDt&P|Oa zWbEPl~hzgOEL)Q0%lP`08!)9fbbxBkN;xswkIRaJKQzwlaktll7-Gs<*bxr#h$~QH`TI z>PH?kxIfmoMjNRg*SkiKP(SXd9;I|Yep&CdRvJ+xRgT_qx_Olo zgBA)OqarQ^y{+o!s8s`}WtTba81>2$MXgqppvMYEue6ueyyf63=GEh@)JaigQJ5Z4 zIFyN&)>f`GV<>6iyR3l&(*x>_R{;s8G@qFOe>NIn4cY}#q=3#RLPl&rhs{r7!G}kjcC=(nPG<7AG3ny&$)sY zt~{hsS>t(CF<$J5%28Q4>wI%DT?sKJr8LZ+6?h$OBejaKEeb@@KLo2~%m~X^R&AVx zLz3$+St$c&!L=x}UUtUH6>_1BQ3*3vvUWC@C10^kCB#`do3i4Zk+Yya|E*m;l?f;i zj6Q?xIr%t!MpZ`7BbR>YkLX2&h>jzeRpe z(wNyVg3rE+5j_&Ri8~s}3hK>eHW8mm&(6)=0a8WOY$j07EH8ixCJlV!3YJ<1CsNDy zVBv~1NEowM*_Sb5xWXp|NR1A4pz{ 
zpF(J%+B9fS$vT3v7uD~NA+%*6@erkdN(e2eH0s>9UVi)KjW1`< z{xf1*XEw>V9pAA2!qu?vq#C^YZs;A5``QMfwwJH%6>5k0+M(^CpVv;Tjo)i*dN*=A zk_+~4C1m|lzVVs0=OlZh*z6OUyZGj=EsM~7lJ7ow*O%`e%Qug&n?-L#@OJUut}RX8 z+bejF@!n&3@6ftMbb1A6gm*@=!ChyU=f`t@{zG?itOaZpz^nt3CH>ePXTuUZeNj=Tk_9&Q)NaT&tT=@oPi zL<|lzkbG)hl|hPi0Mga6R*<^NNF^x{c@;FcTESHS>1qWwLsh_()Ka0erlHmyH2cvlwMgw;6vZc%Rtt{cME7q@sfE6jnB9tNM zjQ&5VssR}B85pgm9MOedDwUV`%3Q5h0;6KMHU^lIs28}(MqpPM-Kw!<7gcaN&RDvO zK#FTpDgzm;6|CE+k|JL@g2_}W?Exh2V)7l{od+ewh;d~^QEwntCPufktmIJ48U;h3 zd~WD)YBALbGMH>qiNq#AHnS4$msmi-JY&WfiKNla0uov(GsUh#Z-~)7Ml;hccbp8!GN6>j29YSgI}b5?+^KM=;ftP~s7jOufE^#oj^ZKO%r#?Rs1$K8jgPj=_DA!pNkm%X4hHV5Xa6 zZm_qYcp5vc5+a!n)hPNJwP<>!wVb&3X}I5Dm8*mX?N?D&hhPcumQePuc+22Dz`xD@ zEW17YNT+pyZeI(!ePh^EB{q205O)T{Lf`}+I3Wa{I=S@=p&EndYzwv>E; z%>M^h!TxtBrK*{fJWyX%0O@t`dS`dV2n`BT~sk5n34$N~TaBUCdEJWF<~;aAmGsE@N1M zRh`wAToS`8YU(NwtCKpD3SE2b+UrA9_uP_pZbJCDaDZoM^I9WdQDCl6NlOf`qN_}nhX15$64kCa;nZ@)xkLmae)^{8WlPGLFdpa0YSTi2umkD2f4>(si2b#m38e~pwj5A^Lx^9>zZttsCE zW8!L5VZ+HeVJ2X&u>Z_eb2Xe3M&yEitJgS-Bja33FW^M0^8&C>7h#l;auNWNR?-M2 zDB>%`3n~^mFfKJ!DOq|wvD{v`23VU7Lr=<-EW3xP@|RN?OR2C?JiD1ur6)&Z&vt%Xy=L5{k+iv8#0 zQ?mejTHQk>xn(clDm`7Tg*t;pXeGqeB9?)(c#`x@rI6*%4b<_36L(+-KLHLkh^uvv zZK2%A>y&w3t>@|sr*$1pYaLe)r_~MEo|-?cjH&2zZb&cesbVUel)CcisqTXfdR#vTqdcUBwTlCj;=_Xi>GUuygS2QVg;SN~WvAS}3AMP=& z5Kf?(Yc8qriP(UgKrfshFQNM_Ko47hB5Tp1qtbBde9+f&G}5Px z6R2Byr%Hm+4}KW*SbTq`;MIxe&tOa({|avnOa72+P4 z(K2-+-Ur^kc&#L$wh~MM&r8M!wE!0X1aWPUJSq;nq9l)63yj_MOiU3(8p7I;)XPTv=%@!dMl$KPdK zeFd9xe`yQlYGhPu1g$>Dm2dUZp~@k;8qFk>R4$sitb~;@fK9oci^5tPxF}Ok&U9F| zDKqX>wBw>V?9j)B0scfuy|^l6;IOtbEe4nd?g=HJ?wTj%GYzDd*tfYu9fK&>#m>_9ZSNva^?2TWzcz*okfVgOr%P*@u=y7+pYMRghitXi*_lN zMyDMe)S<{1WE(Mr;;DS47SWu1Up9szq+n!Fa48(GcP+_AXC(40Zf^EAIveP`j7|uh zNpuiXv#1nhE70+v^ENtZbcitkr-=PqbVks58l4C_t?0anP7^v`N9QZ(kU9F-75?r5?_t7s(D{3G-bd%ZqVwO-c?TS+N}2nD5yXs-CJJGA^@5h@I9>)t zKw=$DQrQt#v zdG-UZb;uQYNsq6T@alhhISKl4+2k)@j+IPu3_Dw}@5_yt*u=sNJRM10DO@lnt;oRS zssVP@Lhb?-24$8bWUw|F5Xz8gs7irizAz$e$x7y;kjt*)+P#U+H_-VeI>;EoyDRy+ 
z9*l$3Eji%D%iK&7-t{OcHu5y-GCOmH+|s~9quPKzUqH4LHGSFwRD8O$(*5~tM#U0M zAi}4AWx}h14;m>)^QK;)hj@Bud!DDC6~UThZR4$NyVi&p>R21wwfY}aP>#lbv3ocB zg}?|O7`eO12fldU7Zl-2ovVj;^$4y(-Zi-0k#{`<)**k*IzA_H`8QAQI71%>vWyTL z=VRmf=)}j>JJB;>&lh}sysr=LJP7Vy-rb9Z>h|2h!tIBCp?#EZAI-bR9vHO#aSgHg z!v%paeCwVsPtVC(XA z;wANF2%(M4vt#jWjtRm3onZf7W2?~E!8dkfFAANf_|8-L#?iIsM0?}LMWMN$Z|>i* z_wNyd)Q+Pq+rv9LK5?{(f$+M1*FlScpzH@D==+N4h5Tp-rnl|Yd!T82a}(aNhap&9 zD>yoNN9R@x?>GTn6r7#BvvaGFcOKvC=*BXUy#uLYs7rKDY(#~|DBl>(!o-a8?c@2z ziG6tSQ@1_^E8}jx2XivD+cKJSkBZQuk*f6xt~TD)mUl%2S1<4C-Rc+mp5yzT+jX5K z7P~O-4(H)|%B3B5H*~9szF!~6p4qJ*$T=oPE%YdW#2My&A)H37aRfU`Wz2Zy(3^EIc|Mn!wg`lWB& z6dkpq)h`?UvVmP|Otjj+-M5|*>bv;*uDr9Gw|1|M?pr9kvtZT4*504ZMNi~gPwrSw zLX&%*o~@wJdzy!z2YTh+YX}JqQNDpp&I!K#MDEo2e8YvcD$&xgael|r0=COs=twSf z6t>Kpr(u(c4tjIs`=i@>;n*a9Y%TSU84u($E{w(KD6 z1+lsH-B)kFnjI55#`q4r4)Ze|+5T{Or31F_X-&-oO6!GtC@}v`p?61ak7NVcmAq$Y z-H4ax*>8>Pg?qN5+b`w96FFB19+3$>qkPY3ZtRQso)^}~c3m;Cy-R4H;M*ti?ayLD zSZr%YKdAU{-^lIR=odV_yr(x`--pk%ruc!Wo%$(ql)89#m*76iyN~XLJB08lK72~- z>;Y40v-gqC(9rnEK{bW%H$~t%m(cnY-};moXcGbhd|=>SWJrv33XxGhGWszKX3?l0 z7Pqzgk*%`XyRS0^8z1Q`O?CT}rM3?46~VbH9I3p;vxgJavSV+_>V@#&PIyoZl4Ibz zS0A|V4#7hc!9U3R2k(WB-gWMV#^DA|L*u^AK-WFeRiaV25{$Sma&tGh;|waLp#{9L z8-RBUSn*SQ_^G=MJK-~X!OjoPy>|}Znw{qRPv?VY%d%yIw_t;@Mu zArH>W`JMVMG19v=oI7zo7rC%ewXb(H_(Zsl6dmQGqaq#K!xlE zjq`2en|kO$C<3;1tQDbg|DNOFM?WtHyB>g1-VZmUz0uP3g=M{aVK;IbrW}{ifXi6v zty`af`3`p!_Ey1)@ON2C1QH8*WX$GfB=;kaq>&PiKeou!OE-_gwWG*1Ri(tAC6SV-j?wIg)h= zkwG4QO+#>Lz}qGhm{C49nvah0-Z8k2))d;jwwcT|^{t!spihwS8CJQ3Juo-VN|oISPE6x)j3 z9p33Zx7RUrds2kapWkWj5<8CF#&v=F87=*MOaHcsZ#j9tyKn2vKiJ>HJBHzoJ#01) z&9-)LdAB-qttU1tqNi2xbn%|9E&Za8}F>dlOZo6i_rg&9Kia-$+vYwApWW;1`RJ89uLzyveCPP) z#QkvZ*3lpK?}ksn5Vt4zj#GOBPi-gpf#Y`12dbm{qoF%PVpIEe2+(3jW6mAMvp@@;cHYyTeQn3n4Nvco;2r0K z$8&+qBF<8m?qkHW3- z_-o5ENy0kES=4Eh_t|V5Lqw_~QXi4D^q7VMkfcw}foNvII1#W67y`DEVmHyjt0nS_ z86pGjKvc379-_}C;9n5LL3}*{spJLZ3Bu~K4lINYUQbU~0H^xLH>iE>n8skL-h9DEr#Z7V4nhSyN`Hx`yOI!|CDCX 
zU=|yj_jTyEMza^+yGFu$rt0;vH(MSWAbtP3rr%JtSJ$v_03WtTP1}AYhK!WC^`Qwv zNzE05!L?W4xNpD!z(BB)1c3!}@0&=_Of`o#FTiQFkf@bv3T}?xern%FqE%F*e=~5q zdEZW=4s52H1ZyaB-G-hwHT}VfadJHSjwVZrSBuD6OM{QA zgou+MTj9%p=nU))e6U!O%Hjo6kjtCoTrwR~|CNWl zCvo4(B0KJoJCr;rXabdt!}4YxVgENoVcSsYGvL5c&}beSDUJ1)l;u}c@K;pbuc?|p w=#3ifA1H7hp!540EjSgF2C|3##pGIlDR1(At>RI|Ws}CW5&b=dK56U!1B#dpcK`qY literal 0 HcmV?d00001 diff --git a/entrypoints/openai/tool_parsers/__pycache__/minimax_tool_parser.cpython-312.pyc b/entrypoints/openai/tool_parsers/__pycache__/minimax_tool_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..142134e74ea2ea3a5854dcba4bfdda1cc29ff3be GIT binary patch literal 30942 zcmd6Q33OZ6dFFdqNNfZ^f+V{&=@tlqe zRfeixq2^^T37^c|0 z$ey*asAHjUIB+(UEn%TksEj3g_CFI50>hC|P>DJ`ab{8&4^Z)}ZB<1kB@6mwA^$@G zAsiAAst!rD42?`gLc(|`I1-42hJqt!&LGh`GBFYv8k;Hkaom^i8AIG4ALIsgK|Z8Eql1CWS*3b|7q*>!7d`VTT-chKy)e zSB7Q8}6%E{)fDCkA* z23A`m>jB0UESWCmIOQ#mu_)MtoL*M1)YnESALh}AlqSuTW~BInEvU=DX0#RI=HLc| zDUQBQTc{;e!TRG@eeJ4G`c_Iy&R_@1x61WZ$n6_~orv9_*0_<4LmtH$Q?LuEZ7fwz zL60{f&L88R;RoAKa{iud=`pO$@xU|r#hf)q)U}CKs6GbxXv40IA@+Cqm$-6OT<=;VLgaJQ_8MWu* z72j)e8>rEw#085GGAp5b)L*Rh9c9#(?oq~6P)6;2(2N??5d z|Ar0mt@*Ixd$uytruauY6kq-wv?*=y+(rEJmmL4)|HgAopDi1n5`@r1L_!546T#3k zSyP12!bmWjwS<+V@U(4!MH99N@0u@KO`!>D4A>{^9hqQls(tG69zZ^7=yYKC zDe4kBKtUkGVT}1~m%lP=3{Ra830cz+P#v(^P}YG=p};tjhtNVKl&w(bnj!@HT4*lo z%15IQa->cOWs8S_#7D+L*`j?XPVG5*H0#Lc6hde5jmz2eQwx#6*`f1+2ypB~wv=U~ zZ?Gl`Um&bT+0UajDtdbA%o%jtF)}e!SY6h^zKql#!pK?6Cjw(rq5Xm|DP)ZoFpH=(wkqGPf^lK#Wh?V3>d+{@NW%Hq;->?`1p46DXAR-d*cpMgPS!j$G(3i{KQt63 zkm&V3`{B^0$oTn9=SR+O#KfV^jn7PN>%Z@*ZJ|wLBd0f=zZf|;IkD-&*w{EeV??-k zeiGzHc+(_?7#NY3xU>wyoA5mdu1h;bitIXnQK&;h(=~Sh{VwHe5*!MPTDLL~C2p+MU>!-h4vbd?LB|lS{U; zuN=R0oEkaGzptkT^xT%m_!Z0zm$gaDWpXNVekzhJ5<-zFVPaUtaL7WCm|@CMctDEt z&v7%lh)e54O{GHAD>*PQiBXBY~$^ zHzGled+f4~u1N^Pvg37^?9bvc|K*90qQ;-)GM>g)9=!VCJpc0X*s-O0?<;4np2;{jCCU;{{zcPY`hV)b zt=Bc!V<)itE9>TllAew8k0o7OleVqG7E~bYfa5o?Nf7R*h!%3BnGkx(zm=SQa8{4l 
zeU}kT`#WfD&;J-PHukJ2uGseMT7bpz_xq#M)#V$19>j0 z`$)YpQqCFvWA%u?j6Pw{8K(bI1q^#>AAo#1KQ_Xsq2aN~FoC$!VEIEb+{2fV2sST$ zo^=7A`Vl3lx4Ps#03PcKz}V_SUIy&|t3gzGkDNSl+$#wmBIu<$ifP@<%yFRtAHZ(} z0S2P%eAX};hJ3;}HW>iv>!6bRLc^djCA!4~IwVY0`}NYsIDo=g^KbyPP!L4gn(YBX z1oJec&eM=IKU&-4E?GZ=`oq`ZJj>Ye8La z*;{i?&RLx_RfP$h{$Axq?mMnMdi|R^p4__1J)5~VH&;Ah)Bn)Q!_8Vixkmy+ci+4; zZrZ%k=Df!y)}K{3LpbEmAc7J#}%(TZD$lHoz(xA@mH<5F-~t-qky2-CdHOuI*kX zY?JB*Kf#vSN8Bf@7FJU2>MAr(V6zW@E6s2oom>dx}RVZ!TP88Mm%&W0vJ&zu)_P2YDXVo+8-TMW_$ z$zwpZ*2+2o9FN?rVxr9?6z|gL>~zaV8WT}X z;m^Q%mV39HbJk>BbxB*@a3R4(z^a2{9H6lHhijzNhk# zz=Wa2zrlRnjfBf1IqU>&vRZ$E7wi#r7m-jZ%7bW@+ag4Yl0-1Bd9;vURIlu&8N*p_ z#uzp1~1uD5xh8S-og+9xubJ)`A{ zFaM63l^SO(QRApPpQ8Ap79|Jv>1hLZdENk_;fytE#E6b#M7E$QYP-OpH50^YlCLCc z)v7B(L^OMXmI$%A?1`cV`U(X-t(TJuUdUx;qFDaARS_g!x2e7o#aGbVhYVa)KT7yx z&7-{%m69_h(UPG3scl#*&j5Nmq9rxly5GzmwF{q#+V8anB0i-XQB^33Io$OzRt2v) zn*29(L>*Gk@ijIA!Xg|Y`kOuZZ#|7Lh(hR!Cd>vd=$vs*7?}&gb;gk*9>s zqSmN$y*UB4TQ|LMe+M0fA&|kUAat}!?s~xsWH5ANk8`v9en~`r)^tYJ z4MH<#gBWy52n_|o!y_Yp9n;1MK@5nXo(b<1(Fh?@8G1t` zCG<`XGr@FFk<(K5B$+E13Q{hTyw+bS?5D~|#3qo2L1>1PEj|M|Hn6^g|BYdADiMgX zNpQ*tct!>YhbjLN%3l;XPZCYxL5d+YgFwh7YYa|}pATmZR9n_0gf4_24i0AZW01U> z_k)T%Pn{5mfz6tUs$*e)X#v~(B&9iJu@PhkvWTl(nJ-EA8UZgA3TMlNz|%vkQo_O> zsz8#;l2kzvK7dIXnLrT(y|Pv&jD%!@#{3y7QL?InZs-v&1W+_g zIEohPsz;)L%H*jpkaA2n-$?-7nd_@m@o!-@fs?7{Jl=HeHnDcwf-P0MA1nit8McVl zmgL5rqIKtzYg4AKXW#C;O!t;V;15ml{qv1~aA+ZPW6vAsl1{Ki&1qM==xTq<)tT{ZOndr7 zPhZNjCGFWEdUh=AOL_Lp?*A)q%bX$8;3Hw{%h5SQwzeU@@7pI{J(1Y^R`Y!er+)C{ z8&BTsdu!(j@Je<5rLNxBN3V@8l&89O#jP1{Z`#{0dixhT-tr!J*UB}v=FD78SFVPu z@x+e&Rb$_5Nyb$@_v9^C`~2q?#&2zUFjHQiE^iUbTb8Q4nZ}N#mYz&YZ(^(1vVEz& z>-F|)?V0ADM2FbC>*k}FhmNM(Mq_LJZKJ8W=6$`TqB3XXnl|S=TpdQ!BYJug&FS8w zV(-zE=UD7m#^H(UZaJDV&6^XSPK+m;_obVUh|Nb*%||gCzi>6HlaRpJ`)BtrdE4eI zzU5du{!CL>x~WfW0Yv-2X=sX{e)(dieQR>tp#|5UhHq^ClZ$UOB-;#&gYW(su_FTIbJt+rWa>`Tz0uV?21y_qZan z9rES0%_G`88RvtTtrGj~BCfo~&<>f*%K=WEMtW|8QF3Q-}XZSEoM4}XeM(`P4 zC`E`Lm5FmSr#|`8&}FKU*o%6|dM>GW;?eUY*^pFP1)^-ydm;LCfYfZ0Pr`zemDvr_j&lU7E;(q 
zY+Ke!e04~ZhpQZ#Vk63{tJp}U+wO#7+ByFLjfDwM0Fv3c%9@u3uMEb`sfzZPC4RK^;+^1u3eWM5W&4g>$E@1r+TAP&Vr!DKPkr(|@LZ(!J+GP=t=0krvJP$k}sD zXG@w|hM~QaXS|S_sOTQ*$&az5hN<>@&`_s@sgT+MNx2K9@p>IlJHJX=LOd>$mXMg? zcXg_~1Z>X?e@UmcWiLVPVTILLjjHsQB`UR;Cypd4*LWCg38d!2`aX$QmLJLLF$n^# zb6F(u?6V9Z3y&gddO&0CjzC#ENotFcz!+2TvZ>V&t$;S?!tJQ$114L6?$2faMfL94 zJy@Kj z0jn_m_{f9+qdMqRB$9|mAbbgqBr5&P{SsN>9@6fvrl?dq>#wQOK{&sbWTn;V@-DHw zD`7}?9}>F{rCf(&dzTz#w`q9qRM*da_M5(M4$n7#ViG9gF1Y^au~gHp#p>NF zM&x*((>v|&nz$+tq@3kPcql`i6?g7Z;zSr2zYfpccBsH#EKr7FzZ#Hv%>$s_12C;A zHHq>U0LH_OEZwxGAcAtb1m=lsUGwC@{A>8s^f0yNpQ}rHUx=*hKt|G*kjX)B_#Cr& zz_yT7{G=)bkRAin7rZj{2sUHgjk%7<@?&h$yZ1;P!GSD4fJI0+A!|4dz##zj$xMNl zd~7%A@>0K50?NuEX|S>}gF<0zq6E#HR@y>DIG()qsAI6ektvYZWzY6^mmej$ma*`N9Z(axvYaKGk37ml_UgCH`k_5df=5k6B8?KgrfXLV8B@TZOUO` zWeYDmKjW8L$;)@y5AN&Vu;zrYl>;%GUUFsr zHNDs$cYVA5)%t{Gp>JVxvittG8ux(xhzH~0WNpX%;DUa^0G=~hyK}L8=euUE5{!ic z?`!6IW1ml&>LiJ}bt3e%GV)a$kf1r+fX{&vA{_7xPta!V6XmC|w1;&=YC@0)S+BVr zA&cX*>VO#p%op=DklI%}r|^&3@mjhsN^V2#T3|+V&XwK^ zdT2FhzUiB3%m;nlwcls`5)uHD%4ydbh313kZvuX+zPn}*1pe|rQV+qty{UqThh%#t z6^R#uImyh*D_aQ3DeHo{f=#g9dpz{C0vj1G7i0#PQcL#$cyhOhcVs4CNrvAJ}>zQvsu30mqeL)XHut}zb%)_7^#gGRA zy)fZ83Eau*KF5$u=g3fQx={nft%2a!iio<7&AG1>XG)Rpg}RKd{rX;&Q0_cTNz0`5-z2&=QT$%9d+(xaU#7g8 zbo}*zi8%*X+7NF}H*FE|-??Sho@s5HH7q)P8D~Y#B4;Y*N^0kV>G~eAz9-R^*p{r{ zA=-A(a9gF4iL*C~#jOXETMs9jj<6ZXHF6-4?2RdVW3E}MMk2d0r|P4E+!{oYEntCF zyqby0F*+X!g+q`uFxFBwsFg(*!24ut3--u+flO1wT}&b^RHBCC*La~GZ@g$#g{g6Q zQB)2IR3(gXSJ~%k5{sHSAEzbaMuk`+MTO>{WYVe10yKwv3Z-yFKurVGkhyOps85gw z`>7I35(j^`oMb`9CaP~w@InB`gJ>huY(W@L0vbr%WqW3Iyxy>UlBi3dQAp&+HC0jY9GjfbdWCOw|sps|~@G6rbEx=`n{9AtVW&RIj+*(^GnlP&$C zb9>6RV<8~gb~8~*+T;;Uo@8CmEmN znXEe_n~_y(O%)JGa^8=(b-{O0dG_SZGQeYb8IPsUb7gr>jFD0$B>g2I5M(F0Kp50S z^%8HSydi8NId5Jlr+gl^REZT)p}^Dx9RXppzwRO@XaSUqDWH`e0bHG!8b1wb0ab>y z@I7dP!*ue5GBemy&}yd_f`^SgB%OeIOR5XIr$Cu0&|rb)5jRAdNy)0uK*}+|ErR5E zPDl|NU!zzm$GD$0mOiy?@i15=bwpBlvB{awM+QA|!hrD>MSvKm4 ziZuhNik&gbQfcMv^v_D0VtXJqHq|n6suz|cT5YNpIIHSjTbjS@Wt4s4 zd@B88hyZ+wHn!C^&ENy`H&Ig7^ 
zA{Qmyhq`y9<^7R2+VRy}45cBQ3qc#qbSi3xq=in#83!0;4S6h0f7Z^njA}fa)r|y? z`;7vLGo@|A2<)dQNTBG>y+c*5**oP!vi_ebR)(bEPq%7p8jXCHX}5eAl`@d4xm#TB zIYYXtQ>^No9~7(ZOS$@Erg!Y6v+ZA=j9YHm+o5Vn+v-GE*EP0+DqX7SNY!k_@tV@I z+xjBt1@)YxOv;?Hx8E+KP}`Ts=SFYYVQF6el@pgv%z+kfl=88Zl)Vk4TGxFFx}19g z^n9kHPYn;=GPT{gA5}sFMpr~=XmV{@in%h^vd)EqYHzXhS}`BOIdW)u73v1E#q8t- ztYc~Nz>P_gr-~t|8U}^KI3vf*hBYZjt&IPM7F9DUWLz0Io3b{stSw#EEtY|)N|kMm z6+<;JyY0)-q^b7K@2Dd21cR%vrGp%X?1T%@xvDW^@b5<;DrE#rQV!wUQT>boWgy0! zfmI`{7w=y7!kV&0Bp_u{0^`JeYMPPN+s)EIhBGlL&*EAkWigP{V8(>phNuZ*SThdo znlEsIJ6O!#F2HA+;585BKBKNO*qU`mi^4XPrF!IAYhVK`=z=ELXBH!^SThadll#2h zEb*5wp#MCxwXTg83DtM)6^`k@CgUCS3NoXQRQr#ZG{ijphPw6+0D&{Y<}}EZU_cnb zX$6@&VU|VG=}l~y_2Dlv+@Zq0$F?{MLWG)XW1@&oFT@ajhlFOxY6hA50Ro`szMgQ~T(3Z^Zx%Bj$@A=PXs;Xhy!<<_0tSRGdnJrnabbsA97kt_O z!g$7AJGbY`({bMm$XORJy=u8?%v9IS1+Kc|d%spM$ITftwchxiuWipX`Qm|BTVT%8 z+CIM};fhbr9n3UrnBOKg^d@F*7K__XFe~dG(c6>QvfxW}E_!$4oYgOCeex~k^Uc;zWohH?juW|j-I1us=>Q>$_*(@^h zuHdwibo`R!0v|>EDzjM3G*cEQkj0-;R&9*7X?{iESQQ=kmJXaVP^0d0g$}p}^pZOd zac#b2APoAJ4zjDQ9v28w6_|_?w1QgpPbl^fIV3oTp(Lo3kU)xLlIG&HFV%D2Z#qq?{YO*?NL9(zG`~`RwLVqtkJ&Sxw)y(`hNPz-`*PXpNn4vlYg5YF zoVIQft(y}1MeF8Fc|~mhQZ2+)TYpx&?Mm@1Kf5^t8n*9+9T2F_J$&WJmHnBD>ba9I zeo79{?$5YAasI{WOhb3Vo#;t?_Pbk>4gI&O`rj?#JoiBcTjfu?H;V3!^TR23Z|qRU z;ht-JKALg2%~#L2CEa~#cfaWFPq}v_ojaBt-n4`1m-Q(}XWG##I(k!%%}LYdLeT}| zY*&E}1IEj^$v?*E8TG0GItnOND1Gziukxolo_Ydb2o@0FpBVqhHopP%3egl$;DV0O zR6c9*a#59+Bj6zbRA_E^l~$O5k{DDFH%i!R&2x|XajHXbvvVw3nj%nBugRPuPGOj$ z7N%Oqg+TO%ZM$7*47Tn4u-ia>t&<-LI3@iK&SzLh)sr7edGO3C@Ej$<-I_N{nZ@;96A(BPx*kY37}|B2w%`)C1h zs|KQ{bj2pI0verE#g>?5*;?^+*WBRCP(`)jG}>}SL);=(bX>B;jIl7$Q~Tq-*)POw z%N5n>iVmctD|*F>-UMV;_s1-6w|2*-(~btw(Gces9ZiW~%y5^S){302sANM1M&Hcp zI&t(yJ8YC^&D6-_vtLL$TS=f*K6@%Q4KVt_ZCG5@k+sZ*Omlal=KJ;EtxxsrN%ri8 z5i?nGllAsy(X%;m5{AoZ&u-DPo2<8$>TaRBVif)`T!xXqIZ*LHCHKS1-FvI_KXUVM zR{=Q!3=GKS)&p`XKq00N3=6Sz{6$g(YcWwOD+ueCLLr&Dk!6rb7&|6cW_Nzk(Ki4; zJqf_TBr0je6h`PSJz%K>l(qV2pwx&%>2`#UBCrSZZi<@o2hO6#>*^sYEx;Hox~?9B 
z!|^jhW_Zr>IDjUP9zv9o>HnodhC{IXfelVx&86^#w1Mv-CXs}9GF7%$wj-iF_z@uE z|AoMOADx+%FW{3-yfd)Mn8D%#U| zkSYPx+M#po`%07_gURZb)|BVpJ-Zi(pDgSppb9G`SD)zWOB_kLcE?Q1R@bcX{4UUU z3>Y$SYhyzGjMeqk@Z9znzi`XiO3Cipw0n!_-jW!*aVX_JcF7hq&-Som{qALxEp2KR zP0jJg7EN7CpbhU-8E6SXm&-hJkBMci@rYQ~87t1zHOEY|RiddT)7c%f%-#pp=-j77 zlYe>RHdS(Yha^)pwY}r0eaBgsscFho`!eqOrMj-f;l;XL>AJ&W-Qi4&AF9^+Be=Ln zS6_MCTwGd~)0?XxR<+XA99H>ij#W^WzBYrht#I!~N~)xc1Y>EI$v8+?04+74?HXuF zq@_X7MUBq_Viu?iG>x3rz(X84mf;(Nd+Wg5{6XPjwTuK42}YX8$Z3`rJW-SIvi33L z{DD&8`G`tlQdtG|6Lb|m2RNxhL(HQ5;ax5AR_tA*VO~fqoh1t^8`(K$PE}RlhJ_8>Vegh9B`{C zJOdK{ZsS*50O9*DZTy!17vuNaR+H{`MNcXPygtSLFAGJfohGWi08i!PHINmpl;aY> zr`0IDP7b5ud3L(=lt3&xu1&?-2g)ovUZwzC21YALe4t#S)k+qLJt!S zm&ac+N3a@!Q7w53q$0_SuJrlcb5G3=-m2e5 zXsW@NZrCX{>|7YT`CzKyF`y9Osea(8xx=_f3OU}#{IKLGWQ>uJyK9nrho73zI1t;Sl%}85zDtOyBpK)cG2BF z-?!-A#H{Ih#j4)Ls?G1zZg|Jz&$RX|`L}1f?!$q>_NKpY>U_0*p*h~3YTAv9rt0c* z`r_uA+j_I7GRK*5-f}ybly^69l?|_MXXj9Nh`t>QM^e6nxF#nvaA4Llw?lMpNVeT4 zI{PzQb}M#D@h3!Q@3N~d?P?WWt?}@pt0SWrH!qfVzvK43Q`M4b+PKuRHPgNwU1_cV z`}(%4qlw_$XsUh(x>8+(u6Qfam8vpy1sJyyU6I5$Hf0f#vTl{_3E=>Sb^k5C^_Pn8 zrhvrp5uOAzOs%DxAv0Ia(}KVYsBrC*$G*=OUn+(re;alWKjDU?nPksvo=Ym+<~igx zGBuU{azENOjOsB<9TrW~CC4uC*hpEX7TK&RB8X8zu!Ffp4 zz?n*Fp=#=KAI=@<&{hqO-WBJ$l~^(Em@a7*GEPdEaKi@y-kJ%ybwpEj#d zPMZn8MfE=d=bC)lteA`&nT99=O@2^13qTAMfYP&!oR&_VZPSoM;A|F(NK`gSro$Qs z&i);GvEoO4%7HUot^Mu&k9>DmV%x(0S^ezOSM0Z(UGcv8idT1}oLz4|`Rf&G{Wq&U zJeUJ7ypk30Q{-!egUbgXUmAftIe&FsfgC1=EPj&&y#EtrcS&8u^{HeDENS4fHc9X! 
z4_&_b4;Q79dQRBGT2byem1cw%sSKv?mzaz)I>okn_?q(NJ92%EAo3FG`vAB3(e;@= z(dwHY5Ul{gRX9cst#rm+6|-gRWtp=2beT^q^TlzXdq8X+SQtx{Js8`Yf&TjR^A}-# zo3S;dZLOk>AdRgfQ(F&h*tXbFx~DdU;5OQPpNn8#iM`P`-A>B`hTU@ zRa#>EVe3#*leT$9n>XI_?T%MFzOgY;@%`%WR{v3LvT^5Iwq3bm zw?#gjqK*^1N8x;!HTtUo6Z8FrIGYNc|ulNrSUdda>HjtZ?Q~&XaFZ@%=`fc6_vm z<|BWLd0zdY;91NEH%cII%~S9lY*8CHVan%QlL}Y3$MjpG0?dZZI$#_NT)%ste4IuuWvyCTL zz!pEgftoh1pO2|`x|BRn?^NS!cPrshLdk2MU=c3(l3Sr`kS^vgC^FGO?*Q`WPrFEF z9+Z8$Zrw|9rkmyK%T+m+B;{lJU?#ny)4Bc{;fqv1&5y*?e1&{3lk*ScyhsjanJc(P z$4kLONdXDj7XJao6Oav46z>8xu~7&$wFJ1QiKMk0&KwFV^+TbOmJrNM9G zVkI($4m*(|6AJ1}A{acY^wEr7D&Xog4pgp}|Bta~!t^r|%$$af9{EI*FMf2<)T16b zT5>haTNYh?Nn0O_YZa|HF1cv!&D1xg>wCre-o*A){jS)Fw+9~l>cG5X(RE+a_Gt3q zN8z#pq~4AXELyt@Yieyzw;mK*58kw-T2G?Qm~D3aZEKZu^7)pnA-+?zb-!h6c)Nc; zDzz=Twj^yQZf=INObp#lu@UTLs`0_3sh%|=T?DjOwC-JYmZzOCOXU=YgA@V&^BIUSj1tMQ11I$VAgECzSCxRf^J` z37c3xz>+&eXUC#*W5Ixk{wYn{{G!dDv~9^yA2y24jfn=)dH)hl0h~xVo9FpkPCpJw zEz`v96YKZgcp_E*2oY-5`lKY`CDwV5*surJ7tggPTl?9q?Maj8-6GEBMRPbsu;|*E zwC&6p%FR`o+J* z?2eRUW1@_>Z0e>wmSyGgWVE_y+s@8%k&rs$^d#$h(2sIEE+r{{lwU5bNSF4Ar9Fwx zT2e(b^S{;MAs^D6kP05~5vYT}0tYe{%LSA)5{hGh8zIT!)zHDR8nmT z*Vr@4u$jdcT;H?4LQ92WfH%0NPC0&q5O54mUXX_T9xD5XWX5V$cgP@KfEi?=%x|rR zUNRpd8S`-gT=f4R)>rr&w4|7^7xK6VGTHaMY+R6~#FOi<*$A$k&`TiR@Q8)!>S0!Flcn`7DegBN!o6*&90;R!= z;?C)+>O0xaUAU6$hAmZgEYEhjYXLH9Jg?MVt<98o%s=ttvY%AHS^d6VS5vlPG}hVQ zHNeNNlqqw)+H8}Tve?WPl@|j$~_r( z&8%fvVc%afCUuDQv+pxOxF(&e3eB4S54p>D}=I_HC?$om6 zE{IdvuxSXPpw@?CpJXXu{B)#l(#(k#qzkzO6RyJ`Lfb&4Vd;ZiP6{gN=IDz!U>JaW zCU2#)U$zvK$qejqToXz66XLgM@Y^;-Naabrf9^(B!3cp9nY(7YeV=+|^{On4QezXk z-FuSlVnuYK?C^UX@7gQt#S2CN<7){Ns7B$@y30e3_iDz{wifm8TN_$=K0XDCryI`~f-NB8Tyx-yt943mJ#VRCvs~ zgs^7Tc#M8VOS01Y3(9FCC(l7LE|+$g@K59rUSuFapihu3k$){}IE0&3hX8kofoEse zWjsnEEPVzxD;kDpe zp;vC8+*mU|oa69H46o2D*ZB$FXkV_Z&Kb!AuA^*2u9$gEoTYWe%)IAyR-l5J|hu6Z?jgvQhHy(TA3o8`N zwKp5m+)$4KzXm6aOH5*P_y z3m6GrxmLGv^E{fuD-oe7*qrON8a?wq>b580qi%b0HSL%JMB$ZaPE0MFyy07T?7LqO zdk?NqT<$U6W86a3;B`ay6Vscfo31xYSTOm>qcn|Masy4qnoPMnr>A!dZtKB&*=&!6 
zXHQ+4p6iR3U&Sq2Euy(~#fX&LA->ya$#@!adU|i{X76Ql#hgJjdsd8y%mw)ays@0+ zVIImvo_sC}t^4tD8#T3TjYKZ zyAv6|bTm0OhBJkv{OAgt9u{Z=Ws7ikGvK*EI12Yk(U}5C|58gop4Np!t;&z5b-@@v zGBgz#84I&>qgn4AIi#Og>w@4VGKtnCkU`?-0^K{vCFvq&b|rJR|8A8qU@83^8+8`| zNj379Vg+K;vh5$Kywqzu6|k}b($CEa#KC17Sq$o*2!js8DTc5#$}aWi{v^m)1gtD{ zQ}-DFd4zmh$k|M_m9H<+G2a%=OO}-Ns=(-ZN);A7;h8^i}^4Fq6$# literal 0 HcmV?d00001 diff --git a/entrypoints/openai/tool_parsers/__pycache__/mistral_tool_parser.cpython-312.pyc b/entrypoints/openai/tool_parsers/__pycache__/mistral_tool_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..beaf1270952c73d512a7bb7285ffff341bb50320 GIT binary patch literal 12635 zcmc&aX>1$Wc{998ks>8h)I~|7Ta-k}w!A)MOSWvurzEd!c@M4~hUSc9$~=^rkq?Gy zjbej(ku6rcNMRdTG%`|PW!S*FMbIimfNZikoF+v@Q!Tj@E#gI71pAL(Ynw${wEey} zoFS#v#!31k59FJ7zwdqDd*Aim&ulge1tFsQdU$9*Mg0mhO3>#MPc1Y>-Jn>Cr6ZId zznTag)%Z0urnM1mROi=`v@W8L8vKT+(QhPqI%4vhNI$dR0$+WkBx?0rNu42LiB9+l9e-%laBh}Fwe+@}nBDGPE z-$T+Rk-BKTzdqXFZ=fj+#d$d9F2Ih#-^dv_V**o68tO2`T3@AD8|S{OCDoL_nJwk& zSv%Jin&wRT;D@=c?3JZC7cp^JDBX?xiKuY3?htTr9*v+8DYJ4~ICxtC4I$ zAruY=BAh64ydW9a@Kjh7ppAYg5{X~N#Gx=3VZBDF;^2j#crYHFfyUu@>=ceCh*H@w z7ZHPpW@8~NJs6BcBr7Qz;{+i%#Yr}jn~29Fq|!5TMdX7akzuweT6ATMw{VS`w16D#z+ZdI9n`D30>eEalgnK@w?M`c>+lMkSMu({lz^yBAt= zMppMKbvHkf-voIrr)Tw?8AxP!Rj1Zit{S~2$*#g6BV`>38W;sB{j?LJ8ug=5)zd~<(hKmKmy&_N`FhfWn^hRYg z28s%MX0A%6y^&z_JR2NLRBc#fk5bbDfCx5-u2J_+WV**w-Qz3W&+|1<0>rbk++=Vz zA_gX5lf?O}k|xZq!;b;l5&Rs01e|qxI)82;s|HcUm8eq+Sx@2sWN7@%*vP4)2l;9!B4F~hn5u#((X?@upT~V0v=eZ3 z*QlI{a+JSu`L)a6xcqi<@hdBpThsP!%cgB)RLQ_|;w&Ev6|V=^QMB45wb^QwU2uiw zflC$Eo>bS#(ib}@3RbHb=Nn;2I2OjR5RAlTqmnkl#k@3Mk6_}|3|StWA|Y@o7zwkm zeG;`BmswQVjU5pq*QiJKvNsa1CBB|~Xz%`8?ZSa?85g(Swl15x$pXi{C6Xx+hy|lu zARt)+foPnajbPdu2z+H0pdmH93C1GeNLo0ml06Uz#$s`ia0mfg3z!Z?IoPc%A%mnl z4||jMV#f_s!V^J`MKdIXLlAw9`k>UAYI%O8^yIYz*R^Tg$w&IKCmOB3@2QqDl;<#& zvyc?6_ubUyDEKVsKEV%Z@k9eDc_WbXW`}if5jY8X9>cf7EbFL|bTHW`yiz;jdFm4GliGZW`Ueg`xq%@I%6sK=)A`F$%b+ta4!9-3ZU>do|@O!c2! 
zKJ#L#|HYhEqc@QChoG$*%JQkF^N{$Rs+e=AqQS&_aV4wY1?nvzGfvG#>=U?Z zzO_$o-RsQ{BVOPb?n)5FtsryW5Xj$v$UzCR_I$UV>2Pd1$SB1<7SjEU0I*%Yz{MD! z4_Np420q6r4I|IQDkfgY|-dbFlqoO zF9#Op52)j@TF4=fCxNt7$7OX;hi8cdfDHNihA@rnZ9{;Tc=|U4WwkJ7sZu32D5+^i zNfo}VUPRVQ!gQ08Dtr}Oz=-8aso1WhHpip_1>gCMxedSf+{ju;h#6VLw3-J9JW5^R zd#yz5QgWFN3S ze<2)7SYMeqabh%ZaA?>dG>6|-zF45KTO3;@W^7*&;b_aaty1X4{zxv8q z;q2*io%_ztc{}zEoE17-lHJL(f;VwY0fI09nJX}%_-uq_K;kfwcnHoKgJ%&Bi_9Si zyubqT#d)UR=ygkaV5XpAf{Y^?Xn@}iC?HH&=FA3JCzW(zPrcpe0zn>R zU1*l)LG_Z0!>r_9S3m@$0QHg^_z?%+1n&5!NXH(N3_J*k?Wl`Z>M>_e;8@;8pZc6_<&g%#_Ipi1>_y{=m^HDp`b@axHT zlYDc|K-ug$J!Q7RAdso)xUPG>H0xo$dF%eP)PK6cdH+@7iJP1W|U zY#my09LPEJ|Fj z%_rxyX3C84sIsuohJRu6@@Z8X7TPAY3J;&tO;L0Dq;8lx*GExvhNOYjp-whugp{77 zOiBG_(mH7r?aGML4kcCiCQZs5tl@3d&Zeo?pVLvXif8mKz^6~@hp9I#A-$}Fr=L=Z_xT38ciibh5=Z9{-oW^BdM_biGmzGqNVB}(c|&FedL)LhA&1*llJ zVGzh}PDORaG7hapyTh6+DM;_6wb1e+SP{H2X)WG4tgSfLOo^)YF|BG11(;eWeg`oz zR4HDl``y&f)7o5wv&|@JR-m2Jzf8UU)63Kq?aS0(unF0tIWln&m4YWg5 z_nAifX+vO3l@W^#Tylc6g_VaiFN2Y*%;vQx{=Fj z$O&h7PMC?u1g?icH&+<4>Nu0(80!PKijU_&5f7n?f_h4XW4S)p6MUD|>LEpX_pXG~`43O%xQ4sV33>9k^84ip(&^bbpSymag zQ61$FNMb8tuS9-xtYne7ET|O%N_sR`M4kbh@+fXTMwJ{~06NqsB(pk~WSY$T@%VGt z7P;shTC#F61TjE-h1gBJ3TjRODksJh-9!~YbtvzWLyX|;va&?*tD@8ECI;jk^>-zG3oa+^chyzUCnfX}ux|A!9%>n@v zEuUAOKHn;egMJ8v1us(-Wp%tU_}btCXd}K=XK%KyX^~&ryQbB+KpD}1GGd{sS~8Vg zsmiX!;djUHj4vHXSMJMHj-)C_(v^p=TXVXd=F*(OZ5zxwD>Kg4htAe)eN(1>OR9cL zrhZqde%F#GU4Ll)@S~2-WmoGj8ar<4vrX-b^zDwD#%x2&!pU1#Zt6fcs_x8GZAn#a z$+mWXcl-DHztjI7o8EIggLrtZ?>*y@lvX8$K$59cRFu(W<9>eiz!e4g9=z(bIqF3Q1AXv zy}7JB=Yp=eO3GH5qhO?5JyqG9ad)KL9gD7a>+jU3-Mf~`Qtmz14-*76W@}qlo7)yH ze{1jk#)r*)H;rpX9b!ud*fLQ4gN5v8YkicfD&uTVIom&>47NeQW@BrH*`8vyXPCVy zX7AG3G;{31))X@~Z(D8Yc<0#dV~ca?mVt+k#+(g0{P!p20Mch`-ISyH6H0F`10%)ee=-H5t@3R(j6+N-q-aXvgzI*+ym(!_8TDn z5j|u$WYPZEsDaFn%`~PhCQO%iAEvcGp=gLD8(JUX_^bdfg)3knsCEYCEa1-)gK`Zq zQ{;wdR)GBgCqiwl7J=WEbWsY%L&X{gs+>eWx(2B))Q>cv9~DW!r1qX_--B+nsY5Jj zxJ2=GQPqHwhP!!x1FJ!oCl40q=2{N#DDo|AqWZ=fe|LRRttx9_mHA+&5&*}X33M;b 
zwBi|4D&hMKZ9q%(D6^Rool2+aO=NY_3_4TZfB-G0fCY<+1wGLg>`4pE$S7kKzBo^e z4ujtE1-hTchZ^wu^ZBcG-$2G^ei=Ji)k?4 zjCw#PIH}<0QMEhP2sxbs?B)CUa9~%m z47`G6(|bM-A?sFfQbWy^B^`nxX%}>4RdnnhAYYrb^VLN=106;i0$Zt!ftTli4?1=$ z>EM43BmMI;&cCDh{KLh56+ls5G}~tTusT_mELW^@*maNt$%K-m99CJr*(#G|3WPg)MsbIF)%6ZmKE!u(3KM`+nuH!8~1KRuqsQRNngf?@oXSB8#Q*y52 z8Mv%Xx{?(I8jAarEk()|ndyGUD4wMI|49+|pNJyvbrd0d9(90#%7KxWf)%cM2R6IS z*z_)B6CVj!@bKK_8MxFbCxoV6rmm`1I>4}Bp$&}VTQ|-@rKa#@>k8vMW5)VI%iYb(8M%8#D5vPGZ4Xe zXe>{_8v-jBm}sR3&^Zd$vK|at?g}xNp%of{iAuPxzKaWv8Mp!A)?GoLT&@{~@_8uL7Vd&J+q45dE$AqIZaX|>9;{o8k5WEDp zc&Zx#Y;aCwqG51eC}ki24_u@q9B6SPEUf}s@vTCl?NFE(L@}UdZ7d;&K`i42riI+LJceJz$@JqO-0IP-W*o1WLlD!7-#6)%%0JJd{;FfPS zNoDsmY^D znD_R|hxYWuDgGj6op2c{qB|NL5BcXV`~|Ghh2n5=Vo@CkVDOIxBd~P1 zhC#=K>|oHd-1*rlehA0M=LY-~M$;IDFv2a(qrJg{f<;Ld*{6(-3w{zyi8ll9Ao=}R zZ{W%Pk@RFSJem@G1S2%~y_K>H<1!Xp!RRVRC~o+#VwA*)_%`&Dks!Dzta#On7sBMa z$)h_h&&*WO34M5N?pkYRv}(T$&M?@1Uox#LawVI zh?VSW!$5xUd?P`wR|n7MZAyFc%1c0p?mG8LAtO6dJfC-w1k_EE==ptIDuwp{1hWZG zL1Fv6bERR?vf}Jpw)Vk|fz=^j04>;4CB5H1xMWXnKk{Jc;r21`Ah@b>Rtj7wb-Nzc z?fQoPhWRb?ce(F{zZ3rXbN{;gr@J3?rUoaj>vNiz##{w{6HiB`)|aaFEwb;%?!?ly z154*qwL{lOb0&k$v)a8c(>;>v9?80xg;uzW@|Dlqz>TtN&(c78=TVq)CwLxivYGmA zsrqdX9evBE&%phYt$xi)Ih^wYX?x?s$%po~Y-R15g(`E+kEI<=3n$Z#wtIcxsOTDa zPnX$0k=j48{NhWg{V%1vUIN$^m&fAPhtBS7`>ss;-cs;pgYXeRe) zElX!s8V+U}j;9)q!=+osvoqz{xunbN8co5^GxmT7fSH<`x-|pU=zGlgvYkDd&cRgY z;HqyhyKM*faeCT*-L~cS^b)%;oo+k0X4W@0<+R3*`ZcY&vF4^8TszfunX0~2Rp0&L z%=ROx?MKp8M;}b2s!py{9eu>K+|obtF!Lji+dCHr((SvKhEnYV^STvBOSY#Ezg<~} zE92-&Il3~AZ7IjL``cC=J60X;8`d|i%XK>xU{@St*-Fp6HNW_tht8fXvo*s!mtvl~ zKbd9*=4~H1nN?56V~;o6)|qMRPqp=DTYFcVw`H0~Qq3dj=EJMr{;U_*)Y85>aHe+3VPHS$dS<{x(x#vf~XVWr&5x0pw&G;rUJd(xO-_}#dUPUW*`9X`J(}@OSxVEfMGM#HNqCLQ0?3Crnx!mthzq> z+1h0t6p{W*sL0T3hK8!mvv8=BnQc$AS5Me`o!Kk@|t#eOgcbpxX!O_iILU@OjWM zJOH16sT*mBPg=isLZ@A!X-KbVX-w<1m^SuJ3}{#O(hz$cJo;PsE{wV%l8k`>8xI8n zJo>tLH2-*ftHYnh2>mMLzJR}kDJ?{j78Ej`$n&=`Ys8q1ZC ztd;V#EF8Qwkh76&DOJ<7(0r>dXD3+)PEE)*$M=*`b?ncE2-m39j 
zPn_JJ2(r3lnz_mb;SorPywj8zd@bCA*Gu`=!q6ID3-`?MaWM{n$+SdWMJkr%(}Z{ zB6iQoTeqs9OYNSG*w_rnoA>0&lX){wW`5^n*56sJCITU<`A+2Zy@dP&z9_+*Nv!uU zgv=6_uuPQn%TK1Cfp=9@HKOiUGe}cMH6z-7Elo2~UB4b^ny6vK*l*0xOe5xgGfiuw zmJw^eb;QM;!f*k)r;h5of=1#MSSjJi4fR#MAGgX??VKq@=%uAu7U^aNc*J zJsA2+RpbO=4Q~?G$d$dT&QQxK)dW;C=gCqlSPMsn+hv~on6+}+0S#+=Q`5hVwFA$g z{9HK=JTBmI0FRF?;;P}jhO6eP2UMlNz%rA1LLlD>`L(PI^4*+^^>BW+nDRO@9458| zIO-;tfcLj3xF^6!hFV`$+Ah1L1BTGnJ|4J#0wk`Z{|oWA_k+QgM%C&mODz0GvhJuY&JZNv`0_G4AFVhn05;;xV2d<$D5Z$q)?2LL*!-D4Bx6kx_O$ifKzQ z`1*J#D(AdI)=~avS6ao9vDUH3ShE=72RX6%=J>t?`>*ZeTBDJxtz#47&}gjndNexH z%Ed%}Vr(=L6NT2%F)kL0w2JTr!7=&?K#PuywgvM@En^eB1L~krB_sqbL~oLm#=v_N z`lfF!0xE{}&mj7vysu@Mel_T;W|+<}%9D*)70|JBwuE<8G680fnllWjrpW-qYTmT; z8#xoJ<;(*rR`;gn-At)|3uhTF+ady9Sv_ZFwX6X&(wNW9nm8NG2|Gw~4Cq)3=wcCQ zvW>ODza9P^oRc$CjV{WjoY|CI(#0C6wmVoCly^^)n@qo_mjv8W$$9L}3$*(q;hd&~ zCO{udJ_tzjg+V)g@&Nci=X_Yo*XDyE=c0m71pW4n@i6{j&W8g-`2}A{pb}!-jVR2E zW`P3g*ooK%E8z9V&lM}xN777x*wuHVP=U)$r#;M#mRSJ{>7PhGb;*1 zCdfNX4=f{EVSJ3^an+!UP!LuSjle|*cvN4>7z~D^AwdWR1(fac-QtVk$>WTkq#9R1 zPg@jO`+!TpRp66bWX)i`txp=fbEQuVwR8!PbUY`H^Rck9$^j$6H2n&kj0+#a0E#`3m z|20iKi~?|RgeF$`r!`Lu6>PjskQ`)%pi|P=VJMJ&$xTm^PrvYhGTBXgI0b#ZO zRt;(0l+K8nidwQI3@f`@9v%7*RfC3Mx01>|AL*6di#3chQ@XfLu!&VlKC2rfarGm! 
z(#yTXNW^L-cNi6GOUTXoDZPlwMnhabT(6{aPu4@b8L?TR0v;rB!*Gl8o_ii8%7`ib%D3iyr~O5x?iw?r3py}m6n82T<3>dejF_E{ zn2|N)_mtSC@Ujkt_YEejV(94JAhkrq)r|%};aeCYdX@KKTyeGpHe2H+PFx(f1l9zj zkeN*kxkT>#(+zS{eTm#qYB$o_cPqu9_WDgSaVgyr*dpZ{fb)g=5msFyy+o-k;G8^Z zLIi;!gIxhCY(fZhTMoe{O~ElI0}3C{31g!%fs;2wEIwI(a3BMegNQ|hLpxd;4hIgl zZlV4b*OSjwO+JZvO=7Mb2f(WrBO@FvOFfhOE^>q1O<(5SHyWlZa0YS!j1CXsZk^rL z0q=;gDaTqS+rCu6aRAt`x5M(+?yD1EZ?m8%fwq@pB@@2MjhFP{(U^!h7l(;1%9}PO zgs-SR>lYw81N#(iPFG;FnPPGaCQR+@l38$R%;bKbnPnJq$ESg9_&=&|F!FY$mP`== zfk+6f7)c!hdqjg)#ycuW)$)?WaQVB0WIQ;?p#i`jn(TbhcE-upiM%_~F={y&9fgE& zs3luG3Lhk({`?Qzr_)5W+xB$l!G*53OmjPb7=AExe`wXQ{U_z0uH#z&hQFTan{4Vs zKq>DlzL9Z31XSe<0bqg@ASguvWXL($L~Roo24AK+D%cJIIfYj{#Z%a5^SwhQSn(Ou`sIG=#B&WbZqF{%o)tSc4ajojCTYq@~Iu zX}H*RL8^P@?Adcc%5tIWV(&4TC)juD+_Cd7_XT^8b)WA!+ACT5WVNL2m2|Q}Ch2d4 z_!u-<(z4vu@j)JUT7C~il9r-fNe`wIs3k9Hu8y)3JWgUsa~Apn#t;lh4VaVym}@eG z4Z;Wyf=Py&M~x%Nc0rJkyDZZlMD{FV`V!~qGyzk3`Sz1C>nKKG_$MoM`>-? z36eV^PAKlJNbA)_6+p9Af9xd=|C*~cRavvZCps={s8wzoObx3oZ74C? z);!+XOZP6##g{!z)5q656?3mVcxs7|Ro(P4lG6rKS-oM^d!1>bFLG`;NeO(-5s+2_6!?lypPyrh^{dP$8_@oW=R^Zd zwK(tg-0n$~_pMl7#+S3V&!!y5-i!Y)HGYj2SpMaCJqZ6~!%ZBe&xw|nS+!IoEftB% zGb@&}zuD3c&uKsCf$=lp6pZ@&^`2uLhW8ITA^yd-=HrC?RYli<rV}md51Sc?CF>Z^U5|{8gC2u|NtP@e$2Svby%>^n zoH<4`hJq1RkczV0S&}_dL?MI?&|D=TECtX@*6Vc?`l^Prz*k{Gmz1)}3KkK$X#fD0 z1+WOf;)Tnqm9o4ot`czvp@GfPSyD!(R8YDQMAl?W+v6&h;5>Y*)S__f%jUh0D%WxL7DKr>20|l~1mSlmfRYVdEajS@CDGh}j zKMGXiE5t0AW5E#`w+^Fuxg~ra$O`5hBxkeV`Udl&@^@`{kk%&FDiy?SNm=-{*shz5paDmBp;eIU9?lvmxmL49tWz2N zDYw|FRj8cw}o}4c7q2({ zuDlO-DyiI)&FlL-PIRAa32d>_YBbg+a^_P>kHj1VCleZ-la8(S&t%c(j2|9n1mA=a zYTN%e5K+y&s%4~AG4no8lGNv6u_9G!m|%Y6-_S1~$(5st28#vzd#E=bx4 zx^^W4C=Ipfr6MIozZ1^+ILVGfGsH#NpfEl#5V?sa9bifU0Yt_Y+VYT1ql(X)JGq%K zx%>ZS{=5NI3YCB;!CXSHzG|;e+Uw^9uoc!^RSTvSS6jl;me%w#M(;CwxtyKYeqhDk zo~mkm)SYiCQ$g&h8*-Jhu2zhXZC9EqkAkAy_q@#WSNiIx+Is*^L8 z-_-6|j4s!9CoH}cII-=Gsgm+k-GQW~X8QD;2@R-i4@&QsrW$syHXKei9A0VYT6SDa zHgwJV-|PFZ|NZ_CF3-4U%kGu^NkyWeE9tlhhDVug`ZScRsGRGcI}4^)X~Xns#X6c? 
z_B6cEI&xLc4g5I#_Rzzj6<6z@3t$~}j*8J^EnGAZ6am92jzc?c2b$` zqf-A2+DS$J`K~99`c$buttHlyG=V(eEU%nCvsT-*(7#%{|DJtDJ99JT^3LsEE#H+a z-?il0m2y`+YiOO(tT?u>xr%4R)skJwl3mNLw#D%!*TIzAyXtODx?5M>dz0?Hi~Nea z{h2@TsC&`$?)k@|WYgiK|L~Ik)%V(dHJs>vHQ}g!>hjY@74XP3VNf|Iim$@&|R9uDyvrA^+|XA{P>Ey`I)!!f$hF6 z>D`s8sD99Mzh|{#cM|`;^2}SY>TOGU+ZK;4d)ps#Kac$^wtDE5B>Z__dE#wPcM$)9 zk54khQ9i$WwYGh!wmsFf@3Aw{crek>Nqgxdd)Zn^?ZP)!N;(pbj;Efg`J=1;{Yn4+ z$2*t&2ho=GFC1AZX-zm$(p6- z!OphYz&qgfrp;QT_cH;f{ev#cnKJTzQw_wwuzAin48QO@G2VN`45SYj=NXUY16Q{P zUVas*g!pgt#Takva>3(ocU3_AcMcQAWo>62>i=SAAcpk{%?%zc4IY88qz?w!(Qq)x zBmO~$;K*1c%JGMgp&KJK;po|yWPqbpI5!g~BsCn0%PTMVtkC5msbK}?!KNg#tLPG@ zV4@ScC-8p&Dao2SR||8|XfO!775d-ksnu8H12*(K^B7fQ27n*($(!V3m0oK|`$$Rk z%+=Z9d&6lJFGPSfZe0uS=|5nuGFtDm zlCrcG6Cf0ldYUv4k2h_kNfW88oA14UE^Vf-7E%H2x!;+#(pMWPtC_F8UzxVkR|j!! zPZ!aolbDL#4A_pO&_#uUwqw@D&K;Az!Q^b?!jgbmz-Crj1BYVF zT~$~e0sMRgjxwl~$bj#GDfGR0)^*mS@IBR?N;p(ePpPJu7xqU1OI&FIUkfe7!P^p` zWvV>uS5>I)?17Hzg*9g_kev8z&4;1piX2@@vFhuD-}!<(4Ye21gHpJxHj}S=LB2kd zZ92U=B+7RT^g`v=OPPvZZyT(o6Nsb1ua7Y@w!y#$uB@X^OP#1p( zqCWn+5FelPQyT+~hOH9eyh6F&5wPbvfWM6*>L8Ld*d&U$B@I?7JL}bxaQvX7Uo1%O zLLDI^bp+{>rU!aRQo*$ZHUBLX{B5K%*E4r1GROUYj5!$nclZ~oV9|ox68fT<+P8$+ z_`Udw-k-8N?@rvFNEp0pM%PUD4_yzu_q{8|nl-yK>Fj!}{<+~V4a?516?@m3vo2Bp zwPok43CpXn9hX+jbw4JD(#HU?`($!$3^#yb4o0cmLG;XWnId`%k&b+c@E>F2iQdY;^S8l&DbA8ELnfH2Q$+`_7 z8nnw+1=8#_iJFtkwo?hisZZCdp;4bcHxWmNV1bVMh1qw^q5G>6=dogqq`@KHG;hG6 zByR;5pg%Wn1%+-kh#KIhq0n-uEF|F8%eq=~j$v@?kwP*H-B+7pBIxja!B%j^4~{8y zZn=o7htba45){`&Tm|_BmV5*+cj`4Hw~(?bu|yF9N0jKZqM?xM6I{*eM;hgeA8E5G zosxn(YWkkZ;!`nduH{Wyyj;i&AQ_K%EnG|(=eb}=2uC7tf~ldoXjH`QnqIGiYOdyH1uSwg=RFhrQuYU2;YP^n1E)_3_BaS7lF0P`_Q{$ZdkKb zCu-W4Z5;_i2cYuVQ}<5Ip1pVW*143!dFvD?@oeW?ozwcY>OG5tKOOnW$a3}3Y2(cH z*~WW~ONMQr((dZtm|ZDr$y!lms%YCI|9@_Hr(yBZuU=kh=mps8G;L_L7G0WXwL1Cs zjs^|kO-hZM0mU~r;R9v~c!KlKo{6FuN6D3^nH^p=rH-pv2KG58?DSZ6ONeLBBH%o_ z0(ez)->Qx?ycgOx67E3^>6ZInKsEXU&nTT55BP>4xVD14{UjyoIm3@(MIMrco=$XcR)-Lbo zcmC5C_X_ml&q1z1k;_6|;BEh#FVXvd58hVtMOT%kFdr3uOUPYj5zyyfh$DMdghxcf 
z@JF-lGt%!m7>BFpP}aIdG`!|6xxiT!TLh z_j}u-Q=~v!^l*hKi>z|(&h-YIffeS+wZ#}W#Esdu6z;n$H?PHBVl7ZN_F-eT4`tm! zd|RmRvolO^zJMcpu}px`VO>+EJglERe}q|m0JjI2S8{ziINMO6$9~Fm#|Bin_mHa1 zP)mU-k1tXU8LA(sZramGb!VupK=mLMX2u?%douJ6)XKPN5;{0;8UXwqDDF9g3yH++ zpnh$M`G)M;ZUW+D*|~^7Z(hLtUIA{L!}Z^*a6$eWCki|Z4E(=GuHrl{z>#q4W{49w zfgeDQ|9~7Nc^q)A8=atHIN=a_ulXU21~K9=qO=A`Nz7%5>6DmuiD{xY>18Hl4;c9v z@{S>!dTew|#&bFzjP=MEA4Rg39{?w!qz#Xb$3)2hXVI`pacscM{|K|KnJ)p?AAAu% zfiz8cD8x%n_z^$^?qCcCSpbc)H`4%a*&qT($%H8dGG;j`e$E_pyOCo6@T`og>6Rr( zo-I^a1b+kS6;RP|2g?L`BO3uCkC7cWvTs4R+#)G=@vU=fCBAw0_rH1DJguD;Q)c_j z;iS1HWhq^?_>vaLdSaus((+?zh~LM?{V9*|Ik`-AklDqx%foF zapL!I9a%2!PB^-M4_Ehc@zI3iD01~LS*xDfE9NT~_+<62MRuvW9bI%g|MaI(=>z=r`KP}oe-?`{a*6x06O4c5M8y?RbmGUKz`MxKP zhIA|OZcpzb7RTL=+a2HOymd6CH~-PpA5P8muINkW`W{@le`TRY1)wTY+v;> zCq2yz;bqURX;aGVnweZOR|68*DzBf8 zbqEQ*-K)O#q^})%-FFzS&!o&%iQ4^3=KY9}^O~QsgAp)uae87}^O3;=!#G!yv{cT! zmMs31!#!)gXPpbLIBN3KaCP}vk?Yp!HHUYu^1nK^Pis&M+P>Y9vJ}tNLh0M5GlTZ8 zCVc0$T#>Yg6zu?aw#6~ikTh4!4JXVy5}F+{o^dO8Hlh#(3!O@Q98zHb zHraq!o5Gz#6vM+7n$5~33L$UUrqp~5Y(xd%OAhb|s?!9Zg$AxxWC2VrmtL*V#EGy! 
zhxwreXjg0ra3fkJW;f0kw6eh6;J5~kIC1HM@JOv3AHy-JCa*nNP)Dzdz+GnEn`;kj z+4(hV0k=a>QF&VB+CwjFS+Hp9dMMoH??Wy?WJK2d4=}=2Oh)1i1<7El(##y)AeThw zPM1mk8m4e?WKQfn{!bwa==jH&-aMp|HG2+&11=dLhGlYKKtM;Eyne~c7mtN8dKs55 z6a{A^I^wC6r@icqzjf}ZzBpxv^OCxxy>5PV#SW8cZ^FJeScT80elGjP+>G0h2LtyaiP!>x*DbqVMzye5bPQ}YCY^ecokOmJzHR# zhza;hGT4Er+gUQ=53ySCIwOA!*?pP!><_bA|6flbE`Q5!$cRj}6x# z;jXG|!{H48YH;yyLQ2w#U8G4^Wc&``~Vk045` z8HQQc5yt%Y#QbaG|1~N82jcuSdFdaC_jf7-qxv0z=wpoj*{OmESp6Rv-7~K)8@Juk zZRi-rI2YI;@YpbPs~PXy@CLzW`mC;)InB(`3$ST|>4mm+`b_V#Fghgzk_-FS2|Uu< zx)~-g*PABzoPT+tZBf0j|6y=ZOxAU*Q)2q8s+7^C$|}=pd4p|EX&3w-y}@^j literal 0 HcmV?d00001 diff --git a/entrypoints/openai/tool_parsers/__pycache__/openai_tool_parser.cpython-312.pyc b/entrypoints/openai/tool_parsers/__pycache__/openai_tool_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..038ca0821f678bee4cddcd38457fdbce74c82a3c GIT binary patch literal 3796 zcma(UTWk~A_0HHH&y4NFRj678TB%Jx+JZo;Dpe|o zI*QLd_uO;tJ+FK2$sbZFnZRd6KPcTv5%O1@gkP+VtSthvKy;#W1}Wgk8=T1(cncYwBNk7a3r#$sY1;CT3v?;8^5itpr5i+-X~V-vo!U0VP@B+s2?~7{k_>%}fnNyyU5hRYVLIJTYW} zqh-Y3#p4;w5+$qTs)jv2?oX2^tf_PMWonhCDJuqc1#nw^vNjLMHz5#|6gZs}c)f9+ zjBz@5BUy+@L{IAx8r7pTrpKs2;~l`Q^V4yF0u{$ZJ$@rrkaQ7vB>2hjOKk8ZH?Y(O zhC2yV4)z#J>kT)eg~U9WTlq9Xvr% z=05v+ziXEJ%cb%`S7YPUJvdo;_2`kyuhM>_bg{oY6rJ$vPbeFVvm!_&|Cfd{_S2WsCmLVm)uW*v>{}p(!Fldu z4SMY5D_05IFA*1m?tc?4A+|YgS#8^-hS-c4o*s^~lEu%6v(ij#qC4c;9Nk?ZuE*V; z5dR!&v*G&YxM{5(L9fn8kHm-d>-b0tu}~L|-1-OXy=PNC#B}-cK1gNm*1thl;nshm z#Lb0kTjCxIcM5W5CN?8HN`%#Tl=Kj{DU_OM53$X$m#|KbxSb(o0`JdrpDu)Fa0LB! 
zp4|G;H8L4FPp)yfr=d5)ACrof;)LHI^o#Xbl(tTHdAz3wDpoTuDm^Zb_9;G4W$T0 zUKwMy3CBBS+j3~$6GM5ZXHiQtsV6~Es+drE93;6@P~sFg$%~DZEX{x#m&DB+6?db$ zC;Qb0Cyo~{+LjBFxmK@v#CC`AX~LbT?(4+xa(fYjTAmobV{9+dqG+clF(sq!-gGX4&y#hOI$8YD5!HqP^5V#}k5=%9H41k(T|JPSDCw zKy`*Um`VN~PT45Apz|fovZ%qZI^<-A*N0(RG7LHE0^kWj!DX0)Uds1_T35=8n9ewu z1j#CAS$O}>`^(Kei`s|Hd#f#(YS+#mAG>#K@yfm9D_!}euKaDW+T6A< z@ZEvM(Vt)V$%U2dz*2VLvGJ$uJ2io1x>vdmeAsp1{?*6LD@TWy;IDtUCI}seR`>P& zLVh6s>Xl!=_RH5EUs^tVdU@Yl_l9c?Bzy3#RBhS0(vn?j$*#65)m;ZxvxlnLeZlre zma<14XO^?4esg6hJ6zp=cs&-`vvWNrbaXz6h^=k4Ceqfi@ZO#G7Edp?9;o*A-8oY` zM%wqTA1CR))sD>F!VkOeA6n|@t8U*_ZP~RRkEZ1siAwUb^}R&tUFkl&)P4A0Bnm2K zcCTdmmNI=+Wn?|V@7M_j zP7v*6vBDVSYcP}H-E_48Z`x$UUci4hw{385kQ(bEA)&bSjl!1_e`{ElpecFjdz#}% zrVqhoQ!Y}x?G-2$eTu1B70pmEZIM{A6zJow0;)o`yU--$I0m(oppl)<8OUFZSHqQES9m8jswDSQC-k z?nalS%+qF=`Y!Yp&uwRZ32iLeP`~~6#K~(Hi-BhZsId?|76r?^o!ErtA-%&UG=E8B zrfp5B6}MzKK{;1mV6y#AHD6}7Yr`Rd-+dor4*A8`FT>u^&lB})XuZh=-5_}I@NI2# z8Md2V!j}(fJ;N5v+vQV$-E#1s9Lk(P)%-?7?&psu*cT_jb~c0xeE|Um_5lgp$RkDEK@ dzWj;Qa{Ft`(#|ao=OjSDBtX){0n)_KI<)1$+HQ8o_BngE z$LwBgpUzc2q^hHW0993zFID(Ns_=nImCCmklzi$JyNOh5Yos9P2m0xVO^Ilw_PyQP zvk9Pfq@8*5X6DVCdB1t@?Vpp$IDuk`zbyW-jgXJA6BemSto;UvOT;85XORN7yv3FH z0?#2WSVBoGh(66(QXv97(TbMjg6#7oD^`jZ;yxX*5+$Xel#+#HDOE_7+6rx@bRq4J zMXmNyrjYSz+3F}|3t5ivgl6fM>s+8op_3<16Ek*!m~oo9E;PAa^IR_RDR#zkoF|Xd zSE{sZP@sh)<#}kjo@ewl(~D(lp0>}@aQ`B+vDe85P&bdrCT$E{BM`7`^TKQs7Kd;7zU|m^>9t z!4xOOMKZ~m(gmd;nGqT>qcmy?RGy5O@&&OFGh@Ju!dkn2GY!Ai%PYqV5MS; zwJ0HzX|oO1Cl|>aS4fSMT-xh=e!BSZDbV(WW0yCMQ2Se4g@OzfFHo)cCsW}j)XH>m z=;5IftWit0Ni9b$mR(vlsp$__9jeY2-D%aGroIq5r{k47!>(@*uz}`M;OH*ZEK4ux z`I%yQMpr{-KJG7j!2yA@)3mG(4bifG#-c-d)wPFE%8)T_7Yzy{K@T(>YHA4DJ*=T} z+64 zyWT`EN2W-fXsz`o2fZeS!6-0a6T{X1%<%oe8-uY(I5tmm;@GGEfpXl7O(M+TpiVC8 zNlvvwnI~(S4=T;eR2V(C3EVnkUYk*6jF#bQ@sf+C*ZCDjSuat|fpM?XaT$f4sp)LW z(ZHbo5-)bf#>McAa)Kv1)S6^zP|b^JnqldVqiGHvN>$H)`O{&yR2i-mD?={GpzhFI z_0f^X&OST&C%Dg*mmxs?;R%sYEWb+jp@Zk*G6|iD406}0YXBds;+sfBE;N~my ztP^&s^?oxn^Wi=8!zAZ0RQJx4RXKSvx-4(KyzRE!*O1A`!A0@5+`H2MAhz8r-`$68 
zUn4@2Z4HUUl0m0;a8Z0MwFWYXr}$vj?5bO-x?Yq8-oa>D66U1f^KtPHu=ZwD%xs>8 z(x3nixa&<+fcN7U1h-}RnU>XqxmeeQI_I{KG9zz{gjDDm7sR^g^lf1B8+R8r@aZN( z38n}L@2$vg!A2=khovHXvI@IHx_7>HOPH*zBJ}Z)i7-+O{yB+nInb^49g?K$a zPWp(8xbe$OHl#M%e!_AK(M|W%wJ{b_X52-k{8DdOB-Dx{FA_7c5dB&7)hj_e%gvHG z;YBjb84%44enFl^J$8jMV9$1rY|{KbTp0X|WSoRDa@pE5aSR7on&A<>yNa*Y4-bwy z$?TGd!I7TtMp_c>sk&SktvkVANq zNzYWLJf2Z6;!vF#(_ZA*bEl7-Jjr5sQfZ3W)ru!#(DNkA)=kF~EL!&3PK-Y{c8nTu z>%O+0L{Vu^fH=Sqx(t00!*mfgsg^2^7aN^3Xa%#JT!MAusEXB2oD!|*E*zxnMIj?H zrm5q#hX)TbEeu8qYN54^LA`dS&uU+p$IyFw0z_X2mGGezRBQ*1WSW-R2y;#?_bmb8 z>r?lE2s{mCWlWFP>c5w*R*B&}zr77N zBr_|$yRQED?TNQ0-raX=_an=_kGv-=_a1yh`J{i(+b7;Sajkx<|Iku;=W4q1lJcgq zl6~@quypK&TZexHBHJ^c?InW;*Lu@~sn2!*wUphx-b32At#<9X)it=Xt#7p_x6+fp zc5b=nz$d#Ne0%V%!8^T!OZlgkd!K3u1V+yOrTg(aJNLhPXnE)1m0W&3CiQJ!kH&U% z{xh0v&osJ0Xrq@TyBY-cSbvbu}3 zA|^8}eSff;?YhkU>i8Fu0O}H`{?!m*!oTk-Ao1S%UXtE+mk6=ceV29DzbpX;zI0B2 zIo|K*N4Jvq2aY6;WyuHoMd*BR;D|EXA>QEIp>reKF`5=`4D^mBgqslwI5!g#aBik$ zq_=jB?iX(E*ssnlC9OUn{%_v9KtAImlH5oqq`{?KBMk!Wd*0&IbbA|;b8TN^Q?;-qKy)$uE*8VT$vq`Uv>(JN0hlGsa;?Ypk_UFm7G zVJ}TG1C4g1G9=!4SzL~Ff6)P5|K5Csb6eQ~P#5xQ+p?(t%je|vGe&R^5%AGqD~6cx zBu1y`9NUky0{@O>2mcqr+50!@J%E{Tep4o#H}Dsz>J}}>_r0F_wHZOKoUbt3wLyWv z7XH1Xv-h2~@4LN`Z(F)4p+>y{Uhq#FIMX1_W(w+?@Ye@1hGF{0p2KP{R=)90A%$oP zper8=Q1m3503KS}L3A_(j^oy3!YLn;{trp#$0YM1+52}g@&zB^_|FMeDITf_;rc&{ aW#9P8t=Nw9k@d&~$7L@UJ}21v+x!=@u5e}m literal 0 HcmV?d00001 diff --git a/entrypoints/openai/tool_parsers/__pycache__/pythonic_tool_parser.cpython-312.pyc b/entrypoints/openai/tool_parsers/__pycache__/pythonic_tool_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..56fcc6dcb7757be0b1dda05bf7090c8d4c23a71e GIT binary patch literal 13216 zcmc&*eNbE3b-z#2)Atwp0Aaug5ZD3(Yh$l%u)V?H-L==o?&3{$Sv!iX_b|djkoz9C zA!6!ow~3u~$GA;n+-AnO&CIg1+mvK76Q|Req)FN)(`hRiT;x~M@lMlD^G9d23vG6H zI!({HPfrq9ZrndQy@T$%_uO;8&pr2d&%OM$!(k=h5{9?p_c{srCyZ#$mJh5AaD?0< z0ui_b8B)L85C?HxLN}%#(sM}DCk$i!5YNI~!Z>6Cnjv8xvkY1CH0zjc$i~8a!an90 
za*R2LoMWyb*O+_AJytPPG3FWajCqH=jK`SpjroWCENn_tj#Uj+aYRSNDlzabv8h>{dNFDeceJ3T(0V2QOyF3U6; zld%GobTm0UPRF8{U9oQ{$gHtI%d8ka6{V6$A!dsw<8mZ1e*U~jW9mSuetv$)njPY| zh)7UchIG(xo#KtWk`ymbh*){ABp;!4oDPKbia8QVM#scRM6pI9W8=bP0>k!5pyU!L5%f8T|@VrL?Lu5)5a9vM$|UQ8s$I>n?+ zrzXbZNm=R~pAeJLc&7|Kh)l39NO|6PEK)-4n3$q&sD^o4At3SKy-BhLGxclq4c)B( zD!R2#!2423hXrm(4WqfCZ!%&BMB@WSA2SZ`y|}qE+BU z+ptbBzG-+jpKHi2+DEr+DgvT{Nwf*PUyqIDLCQh zf}dOTh&CqF6@`$e5K?n`1v3+Px8Q~RzN_RiH{>5AVV~kT#RfkfE6U0A-&lj-kPwW4 zUj3t^a1 zDg4}-aGA8HFQ64mz{1&~MSdG(ImNx3 zU!WYBBJXemuqt_JazdoI2-%vBz=AYLVq%!0C@7XlB$kLuQY0c_*}>oyS_xlT2i|ne z<8$7j74s}GNVu3jxb!7%FpO7$(0BBMy% zww!EqtXQMP8Wm>_GV;@QZO6vWHZ&1D+>I2-O3k`ufuIw<2?|7#&anWfspE^)JD@rh=z5nnY0|;Tgxb!-(7b2V_U1uiA+lG?J?hg! z#cxq$XAE+cMo$?=16sKF6--QhnFYO8{tPdpv@nBif&<>8sg)L_k>e%cAgO+y#r)x7GY$QC4Jqjo|j=V;$|H~zE zS^pZj#Km+RYu_aTwYR7ogh1^r8_Eo8DaEf~FRA^BQW4X=Mh1yiTi7xEq7^q1bb0gw z8^?zIVwjGP1*5^33Xeo}BM4Ganix+?Vn-qS^MGAUa(qmL6*4fr_hs?CcsZDl2ghSf z59BuuKt!>Tc=CL4TY~*dmPMNEnC^P2jFW(sU?+o>*d1(^K(E?y>yaY37m9*n#Tbi7 ziYYdplo7Jvcr(?oK{Fx@>4M+dDezu`*+A`h7PQ?ACl@p*tjzpu5i(j>qmUu*w7L8$ zcZ=i5^`HT^fIrq>;#7UASFCXuccqR4iE#)>hdK&5KMU=WaHah#?h!en+wFX~^T0yySFCe8zZSbQ za(iUOz3rPdAFp9w-UzjDr$5qY9A~)1@d)LiGwSi03BHQ(U5SCwL0a zV25G?FbmQ}6~no4VTwM3NrvOlFPI8|^acR=5&-BCKs$hrVWQ5%2L*^>RmF1Ta!j0H z5Z)qc!LUi)X|`cy+p$DnTq@29Kw&1%ic!Vfijg%%pnHH-f@2NA1qUv_mp(qJWp}H` z*$!iQp949CS++)7dP}M{eJWJCt#UR@|)_cWc%cd@r=pdOFj3I;STUZEHkV z;mH|Epf+dHSJVQ{(eOclxI?Sn&TL)d0=;)&U9a;wL2?||1pxzv(> z$JOUo{k3zSzVrI+*B2^oe|F(Bi^7t>2dJ*cFiBwtnvC_<@*gg=6culq%d=vy&Dd+xbuTU3kN<3A_djIaHv!)#(g7Ih zH{1If$e-(a_xA_M4|+WPHP#w)Otgu%>OHsrSVXS=u zD;zYMZnU7wfSPwg0kTSV8M&ED=>cctxfzDw3n>Nqj5%e#!cCEs`3&gA8Ow}S#u}L$ zEp11hma=Fi&e&wl0zy)@az`~m&ya>eAO)mhmkWDn$}Sg>(I{fzO-|Fqw6=(S1Rjuv zy&M919BD|lA`%h`xZw@%N%?O$N-&2*#-=lua%eqPF~^K^#&Mp^xJodSQ{(6-v)ozO z>)BYkXDU)|z~^O9)CCx8%3ZI*^auiI=8A|4as~6ml8;tP%6$P86|E`HHQH2?B_9+_ zT3(K%+$HrCS`J56P`B8w%ba8YtCA&Uy~xdYQXcA00j*pwAP-=tn6B7^lt*CFXg+!UH_|;tK4X$d?lmd8%3qR$wA(KPL)j=ZQaBQQQ-d1C$;Ue?lQB1J$-3x 
zO!YAxs;OD-de}fxE?NoY?|8gs$tt+wB+i}3gA^V)nFIb(E#P|3t@U@1q^uIkO^3!? z{0jOD{UB8(IR#Ux{+NQ9W{B3o5G`)L%mJcp2SmF|i;v>YxXB3>S_?LC?%l)&QNdH> zd=0Qy0pEHD6z+96y|x3>{~G|LU;`!;L1rn?qj-#;KKxV=FCxiQjE(^`DFgPdVZo(f zuctiNqd0JMA9xD+IV$*RUHI(J|Ld2({H4N$6>di1wkcerVgc*lE3C0bjXSz)V zdtmfaQC?7mN?<$6H>!x^JEji~UWiXjAY*J~GI;?FN>My5OePqPL|j#pF^_1#J{lC! zW*M$ktbmiLEM1DrBNVq1#SEmz31$jo_>T5MDn$fD`_Mz#q9BZf9zpMU z^l*ouFMy|5#3b~JM&XnZ!ni&mCY4IH3!)H#PD>H6o-quk7^4$l*A(!SKaq$6X!l0& z{4*)5z%atLUd60|Vv2ZMVf%Io28>clQLEAy(L=ieJ&N9^zzh3T{5XJwljxm7@7K_K z8NEUDP{$}fwc~(JBT+$srZV_oRjF#YNT)AhzT@cOf@1Th*w|@^wOLa5IGSn{b5gv- z4oFIc7Ghll3%IDbaPUXOgbyv zs>wF*&)6HU9-FiNERS8Mm;9|y#ID}Dx#6$J-X6I-vh3~rwglLn{*mV>B z%vs5n;GLS^M-;oo`F?e17Ex?PXukJ>yCqv4%JIZel_QV_oHcbC zmY=P5tlGXXbWiurXYWb(+Ydi#*>g{yY3Z3g z3FSBKT4~&$Y21IW>7KOI*f;B14YU=SvdBYA?wP;e{NsASxIvc8*0&ijtBoIhOVRC`k(CQzz~i-jE;(E zvKX6)Cq%{AfATb(ndyGyLmbYG!-^Tq`e0a=rxZP0!l?Sp2)1<4ZHgXL8bye#9)wR| zNRJ-sQcA&79C^cYOiUyq5zz9uc}Tb;d`h)Iqy3Mf*N6$Qf0K{iBp>K_-khr?RSmP} zZjIg?&FP5cWnI>6yvMYiwm!$ck1Wk=Jj8PX_Zf~4JgVA~ z<1qk*LeRv5X5tUzEG%dxb73ENUmU(4N}|a}E}Dk}Zw%O}FcEE*5nY&$e6z z3wnsPa?X&k)c(SYarKr82Sw3??xjHpZ7?L^_~JmX89-;j4FQfPb1Cw4#v^J6h*db; zXt@cP>z+^+*0{-O0GK9=$1H(^;}0o!)GB-YDIB$g!g22nu4sNOUlLGJmNz-AHT~oU zcS9G`#{mqU1*0)L=x{T-6j!dlGHx@>r3x+hUETFMFx={AbTix&$D;scwfux=Euc-% zLY%x!4qB!wIl$@4)m<!9Z1=;$@b`G#t=Wo^5p!Cl=Y5;hEM5D;9y2m=4_ERA8k z3WAvR%iIU{Sui=ua4WJKs#UlPkHgPx5ipM7#VuSCgZ4m>H7S8Ilpijnn2QhepQsNzI~ zh%ja;Eba<;3U@Jo6_>v^`YlXB?{DEJ{T+C4<7TRuZTgCID|IuqYzk#vo*PrwrqbrX zs>M6o_ciaG!0o`YrE%5e$#{D2>Az?GQ}dFiciGju>S<25d}hh>YTEuP>`B$NbA5MS zy#3;Q?CsIJqsxJv>%87=eLt}CreT(w-H9eS=ZzPxy|C;LFEoJBEo0x24eb07nE*~$ ztm|fC^_5hZ-MRZafo}ztT;1OcEc7nBL-W7(z`boTka2aV&D}q9Hl!P0Tyh>wn~#3H z=7K^#{?JO?JrdrP|G*aP+iUz`m8b8ZK{4P&Y>-tNs}-PC$h`L9v|$kUyg@F4vU4thFewq*FRbFQdTlG=06r zK#FQX&=u`Na4S%-_mq1skkY{|lR@k8dwd~e)IxB*ZyK1cJep)^zGTth(+qkZYR6j@ zyO@+FsThe$v3MLV4Gb(5&m3^aY>=k_3LtCPnYV4@u|&0i8wRi(JOnAaxS*I}iCh|| zg2ZH6T|s(SpA@BlqFqigj7soUg%>8rCZw=|ehFy~?VQ8jlBkCrlU9@gIPf^2A_qKb 
zvGdkHLO{ZG3Jb>OpA~M!Z^l7a1nvfwZLO;)jonMmp0v3KpyaKiH;>*re)IU16Ir+C z%F(R1>ej)p9K32;ZP>GT{ySse99wGWziOG?cB}1X+kNv^SQ@^DpV_=wN7ZUYUAAKD zd!avReW!KtwI9E-+&T!5&0}5XdAl)3c;2Yq-5T(mv_Ys*88e~23>~;dz%*BS-9t3p z-cQax%r7kVJDgjFra0`{h|gd;3j8J~6pUU{#S=IRMk$I0aE|sD+Gk1#!U)zJ<&f z;G(^NCW_4{8og%tCvX&uT{FfMpE6#6Hyw|)#rP}RQZQ~qTTCfa*|ro$0}TGkSe6~<7N9Yky8W~Z>iB+nB#cuhJbm>t0j7ZZ5UVy^hy z0ao>X+#XWuZe6R7IwA(hWltAVP{FH)Ha8+b!N^|6sgsxQym2~2)Ax}7QHjz z{T$4Izf>)xuR>Je&M4eLh3i(h9rQCA6JrlsXcP$ovgs$rC)EALNa5fKu*(c~GiJO~ zcsdNWCWViUPbOu>47Xn*JaH7lHu_b}>d1EoOhnKvI)*euY$Qq*54@s`C%}Xl5dht( z_AWD6?;?os6f1@_Y**l95H^3c=0lF*NjTgxD%-Ur#lMNF7C|pSy%NeAZen?-?q#0? zB1P}t;3s_%#_I~n`YNxSSgi`q`@Z_?*KAk$t8&)nntd)~Ys}iKSM0%z9g-f{Te6Od z*}*Tr{?HqoKd}lSFHVb`K?zo8oqOgn#P|GaNrT!`L{->qNlzu&O$kuUJb*RbMi z&iI-i__jT)*gAi3(UWP~eb1U{It-V`kKA?Y{qg+i2kzEfCkbrJ?IL#fjh<^gZymhS zpEcROH1iuXvxCc~>bcW*&fY$|(6wCG2}f|Z@1e;zS2s^H)vXJ$`_((Mb+FlY&G)~3 z?C!Az>5opP>vpHjTe9Bf6>nR{+qTfR{#{qCS(|ru`o66J zz`*A6>YLJ`J@W#>=0)!xji>9Mg*l@p?V+sOcgu0pF&A5QH1&6yKLp+BVjE7yckW>tm z1K+Sy>1iYEMP(375%Wm8V+0TZT-z40LNV1w*p5e}eMQ(0Ex@D2CMR$yoeY{-bx&xe zQ%f&?Q--4xMBt$)a3B6gq9;!f2rzg7&D{S1-+2&K>!TtU@opxm%VCAzH;KBsWR&V z<9c(()jU7G?1IJgY})l~HUOv9gYS74B1_?e>Fo#8fkRgv4{i0UuG*}t>c$tYeIZ*> zll4@s`dhM4q`xKOZ&~rTW&CZ~E!*HITj^Z4nDMA;wph`8U-z}abi#Q?UmwvP zWC`Zug_rFe>dW>HFty6#P@QU1WwXO9scgnje%ZN+ouHKcPb!36z`$k%uhy`=uv5an z38+^^DwU$dD>jy2eb+=$##JjZyE=Liqj-{IhFN&E1TVzYW85wzvV#{Q3x$^_;r|%e z#zik<5)%Y`{;8TU(MqP652z254$(h@D2loCK6p7j$8l?B!rA_T*nUbvKP8p_L_9ww z&-^n9{6c5ubiW|reSqG-dvxF#2^V_b;+uVS$+Go|aoxyqmbvgcfzP@*uHyo8Th61i#$})$gf6_I<>AW>07M>UACx*%16U7sBMc literal 0 HcmV?d00001 diff --git a/entrypoints/openai/tool_parsers/__pycache__/qwen3coder_tool_parser.cpython-312.pyc b/entrypoints/openai/tool_parsers/__pycache__/qwen3coder_tool_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df12d0757efcf4c055e8834deb6501e2d1c8d8e4 GIT binary patch literal 23599 zcmdsfYj9iFdEmv9c#!}B5HA8GUL*mK1Ye>^$`WOX6lFa~$&zBnOkx@$APF=GG8dpM 
z(U-B?^|Vy$PM9=ZQ^|Hs&w6XBR%@nBcbVDjM(L)lclQr2K?gTC>WjYFr0->WqHcFm9MO zjvJ><+A8 zPh@7UUL{ErbpF!Ji>YjSK1ELoV->>Ma^k)cCg!(csZta!4=jx$rMjYAq^>BE%GXTe z>ZB^AxuQ&}UsI23lbYA4@v5XYrAbz$bSb?XkR)~URS>FDhAY~n9-xd#1C$%#-;^>C zSXIh&#gMFqvg%|_#kw^D7QkQzD09*RP%3CagDqT9CuOnESMm_mHy zN6JJl0xp3vtyq>+K?){Y_$Ael(n%=|r1Vls3n{{-n3Jr6lu;_fzL=zx4pP-(ilL@_ zVqAW{TaGHIe505<sqwzCiw0}m7 zrq7JAPQ(cF+$P55XU>@K5L3x&k$Qd;)@!O$%$J{2J#t)(0i?9DVje|_m|)&LV(zbm zvqr2XKX>7*7jpsU0WmH=f5Qa%2j#RFd)1g95aY=j5#Mhr@`uFSsUu=cekRR;ZBR~& zv8gA;nEVv!O+nqHqL#6ddE?hna zE>F)*=4WVsa*j@CugY1Vt>W_+i7fShe)KG<2AEJk0nl6~>CevO{h67`L_P(mXEJ_V z4u3A6pz|H6EH(%FD(6pUNvl$?0`t-9XU~rMFV0O)f(E@Zm&x4pr>AFU=)8cY+(uK* zABYBcO>S;BMf19e38E1vc*o=%O{cQ?39{%}SkZ|bBzfyBow_b$Vn-$tG|ij$z^0SD z1BRP|DoH55nwtQHBpAL z$vHBZJ)PUtXVP{m$(ybu(i!L^%rlJ4F+rztsXX)y`wokQeaxqLC7t4{Cc%^eSYA7P z?$WWdXL*<0J1hb#Ck557rf%?NxkSj=1u6|dC~?c6a;M8!2$2a>fyM}LcqWmVOP!$U z8JgEz2XeT~tFETNqLOj!n$ihe1i3_-2+TKKfYGL>%eb_H@x!arU_SBsiHXS!knzL> zk^))viKvQK=Tey~G%6t)rDzV7vETpdFE4lGr)N86)3fbBj?kO-8*_aF{V(>VIx^|Y z9kVy{*JiRE*E5-Epwv8lb9N@3&F4C1W>eWjT976J73Mm=jK;(y8WX}g3HkBan=~qc zd0!=CBQ6pvYT4ZDV3>XKDz&AnzE#EQT+8lFU1+y!YIl=kBYNQ3FC^nLqgE{Ox(pbmV*V_iujpCY1R}nRi#cf$w#Hf8e_V zd+Np7-t*k?aHglXOf_FUb?el9jVeYeHt!fHi@jtHZJ0w_R_~5dQQLE0rLxuRP^#)0 zD2%paZ{Dl9Q?r&Y^*zn@Jzb1kAl>1bWA9A8HT6#Rt?b&RQtv3+J6dcW!=li((Ofb{ zSYxDM>|7fz9XQ7xI9E9E?3T3w8)%gqcwnIP=C2xW897tmy8ZhN-)$%joM#8l|6HZi z8}7rRP`s8-<>%<^WO+M*d9pEh8WOPSQRJo4Daq&~kR%YzsVT|mlgpCIq{>5mi~2() z*re+D#*?XRibhG9@Xuw_u;)c zoNWb1d%+wp=;CxA!0nkYoD<}@`#*vJ=L-xIhzUPp@=EhZQqmj}0~l+PA4y4bNK&%l zjK;BSIAaU4;fzMLY&c^|Hk`4ZY&c_?Y&c^|GOyBTwo8UHX4EQgUXo{E}z>t0=xO{Q+pei$Ebyp~F!_&_a1HWy9qRRRLBDe_wkXtZ>Jl)8JqxcMP z^?rQ1xJHui<(f!-kP9{oaG@rA`nh_q0O#^8s$Z{`s`-i7^Ksq)7Q4m#?O<4_9)Ri} z=o$zS^Lm^nVS+WNjB;a>5;a2akopK+{}-U&Hz-E=sxnPogy%5zrT+cXCUAW>0BBN@&q`ncy%&8nWwuj zyXvI`odvTmsv$anK93ci#uBYC5?(9tYz_(6k3T^})PaV`qoqrRoda_c$2}k0PeFVd zHVX)IKY-v>YDY(zY8Q`iwz|brKXKGAj%?XH%Li8uZrWm77U!~d#jt5<{Z7@o_WS1V zn%{S9cAnq1xHxm&S6{yM^72&C99ezly@@*$rS@ZN`>}h^72C(QoDIv@-+blvD{H1r 
z=Mk>n_w`epyYbD&+l?i659{t((r(#&%U9k`ZrWPcO!u^G=LlCHSZQPHJ2vbcTXl^q z+Pmt_x^~F?`sg-R-|Y8Et%CaHyXl-NH;Gi8>sh_Pgq*j8?M9|)ZU(X$VPUV;ej17VOL zYWIOKiV(C(Pvs?XA^Y?89f9Pt$?b<*@^jEJ!0i;rF5_O%GTL2jYQ?_HgHe(+6N*qW zT3BPbUlI)VD_IM(-1TT`TTn2Hq!p;l^%&Z+F$zLmWg`ozD``j7Kw7H1PUZHSWrx`B zeP_ZcLeMUl2_!Z0o02?KV$N<)D)#&$1@MpbL++YQ0w1FCvcFOC>RnNWDcT1<%}4W> z>CN)Bq#XkwV*&-0SY$@KA^Grne*seO75!v7_BNp=7=qLir39$bLp#~_UFG)9)>V@GK85ca##153JI8$Y#zKbG?|lK{f9iVVI)1DDR(^1 ztFRB6E18)@UhrCe{#R+uPbz-4reG6Vbs0L8N@RI8MtJSzbT&cXj4EM%XxN2lt|T(K z)co|X)Ppigf8+(k@q!<0uNMfO7ov~p7lDe?c~~Xrn|9{%6s$0>0XJXbCa;1AJ&p^g zMoVk{`e*6XWFm_$OyVFVl=o6Pf6b5kqBu-&QgB_$Yw}VW+?(Jm^3MP%(SGnGUz-6S z@T$%t;R1Qb=NEppcEKP+mI0!_o=*7B6StKAg#?(2nKW3MiOfU_$S4wh;W2X~&~Q9z znkA?hoDG@JzZtU0{$ z%%-_*ztppv=61PkeAC>yqc&8Bww#{jCyLINf~950PSv9bFwSwI65O zkKgOtY(KN3Gy`xry4qR{A1qh`FkV;mZcm}($ztm<)_LsS(;Lo_9TipAyrWXrT4CfK zU%?kIx;u8NAid)S+=cpnwz{A5G?hH9tf#f;XPz(+% zo#E=63-;z6yP?{;)etB(#My@U>dVE3CzgybcCuohJUFR@g?w9Iu*E*6bTDLbjTLpn zifubAG3ix#m0h6?OK_(a;QsMJaKCzoA6e}%<@S|aZLF*9Zu)y3*o{E5&~dxxjj^wd z0cp8=VZiP%=L&MJ!2hbQs&nk90N@`VIIunA-RBCnKJ2Mg>Z!8sz^)1Wxio=5I6F1~ z{AF$oDE+_s3?oOW9~ylpkxe?zYo<)qmG1AmnSsoiI6X!&8Vo&JuL8k^Rx8;4G=QKLRI7dDpa{ z0^1W@3m$3#Ty#@d^PwN3W+c}M>^BtPVo}ZmOp<~++BCHQr{I+wBvaCENk+&>HZ{0z zGT^$=F}g~-oL&9XGjMB#(L-B?yyQ+~bc|l^g)|l;W1Ny4A9AbU&MA+U)-XnJ1!K<< zw!+@)Ay~LCLtv>&J1fS>7#MJ*yj!;8nJUI`7@B=O3yu+KACc7LZDL=*86%xlPdUU~ z`N_Z~kh@jlUh=L^is{7|xKIpZykhWYI3iwQ_}Tw~AMsv{+)?l*IN_V4Qxl2YWIBCE zNqiZ+Uh+6Zm4b@{ogGmVuSpZH=<|@Gqlm5wJS9nRtP%B}HWxJss^zya|7pzE<3SR^&R0w1ihdhG=?LBxLFu4hg*U);oXKl~UsRP& z-VhWNs;V5eSXMsdA-GapsCja!i5t@y$;V3gVZObx7TO1V2b-|KToQ*$U31Ca%Gz63 zhl=*jMLp+=uEtge3a;M8YR*|-a(1!KuC?}ibw%gNMH5%oymE7`>#c>AR|<7~iw3UF z34y1fN=1OV%K34X*SQtu2c~oIA8KymGPNjxL_w^7xjg zR&A?azT;eV7aDqso&$?#w#}ZBxrsG5tt2?-25QkG7)(rJG%~31} zt#%h0;!DQurf8{YkZl^|EcMIdtKPK*I2vlMSvz$r6KeN{E7yb2!9F9E}VFarNA?pc9R%dsrDQh-u9+ycKw|Epk>RTqg*cT%~adTCcIsv0BP2>N%RKj2<$!I}6v1iUH($KP7m1_5Ky z;3%M?I)UfDg{3uwec)h$PGqm9a=eD5`5HR$l6)4GKmh zGm;=-nNNU`_G1}F0sfB~oy<$r8 
ztL?%_Rr0ZkHNi^Sua=BLib=I}U=gTZ{35KIlBvon7#-pPd#DPm61_|%yVhGH)`fb* zuu>XE563+E-4v9hcBk-22LzG&B?>%lk|zv2UCrQy0vZs0Qvoq1Ka)~h0J$&4?)BcC=KMfc_>H19wz>+ahCQHfaXN~?<6Kg!MW$WO4!Bu+wFerID z-1$(#oey1$zJ~J!O1@s)7H5jSp~bN+vvp~(Xb!FLj34G=im5#;QPRGj*Ir~XkbGQw@W-V(3|So zk}1TRLM2m_HAOc~tv~U#Ea`chm$SPBi>qYsX6@bE;r3GaAR9hd3O~VypWve1rRWoE z^odgR1RFi^kw)eA{9L7heVK-8h`>hD7cF_)S#SGlvJ`)cjXzcNo?0B`tPLe=h_!-= zR0S7P6qSyQkLU0i+B zk3+HLss~kSoGLX;m6q!2m6N~txE)6I$%B5%;{KRY>ua`5){@D~n!E+ynN8E#hi3a@ zGTRz}`(^GRtogrfJ7J)HU{qsh4xP}bf6yI*_+L7Xx4`3vgIWmxN~6Kh&~{=#^;i80 z2>EKfPM4mUgX?Lq_Pj~Fhd@#WxM!G82qom+odAe;=HMb1+#Qghr7Ux~ikLCE+Z00D zB%10qkcW7Qol)eRx2Q@xwOvxX>J~Kt^e6lnRqihsE&XRuOD*QfPeuz`?rm9W0o3mH z%Yc>kafs;IIF+k`GHGlw1e`@v z^zoz?Mj+vT1Y{d*f$;iAtBgU6Rjwt{9asq_# zN-HAf$>-U$9s1cN=Zi5BV^T@1P{mD3<^e{vyHER4V}*GX$GK1qy{bTqOf^|aXbm=m zHA%&S8LVezr5pa0=aDgs7VUzWF(<2pzGq`h%@kh$d*}zyW!VHata*&_jy9XtXBtc;WPdwCpNQ!h0kkZN!K9?wfq@-(8dA!6&e&p3K z{cAeFD@k&(jG~4m%;I<#>X^D*q%w3R>IBGs8J`$S8j{A{`x}`n^T$LOs40y_%%f0K z9hOsKEI%a1_GwRf*C@_#(sZ|4^qpQ++?D9NtOuo;7BK?lXz8B;-msh_#&X9XA8de! 
zf8^Fr51Dg`UO)R7<nr#u(#Ly^bCBuKn+ss098Rd15Z2yB_j{_A9ip8 zPn;xJB<)w3egVTHDF2^>L-y&T%ry?iQRbR=3D-D44mkBlEvG7s4Xw)EZpAVP?WmkT zb^6mdvalZ(*JHuOxagVus0amED)5rHj^Jf@mZH80?Ju|&Jd9`RoLEnOCaoeg~O>!1k6E#ImCpfzVzu?N!E#T z0WSGaQZ2#ud>R%;^8BZB#rg|jCS0~GUWB$P*MG!JEHv#~tAf;&T5V>U%JQh1X%^bg zX_zLa8P}FpF-;5MY>1SgrdKh|kVamV+sErNw;9Ulktkc(d$s{}wgdKS^8AajoDXDDJ8*`ZX|IHTmqdy)?Mz&dJVcJ{ zhtUC;9w&?1%d%)6985crZyihr^!oo7j>552pka}(|D`yKSO#t4dZYZ9I3T8e9U4zM zA5Y_q>i<=GW7=u-_*CEoI{avfRk_w~DF!qRdH^h1C ztgS9|Fub;zP`L9pjtNHt`ju)B}^@&}4DU_9fI>z6=q<7M|su58#wl-)PC zl$8`Dn)Ux$@rD8p0mEwQrs7XjFDc+g4IUef!4pj^`UC`UyrqnH!mV65StmzC;%m^t zA-euB-X*>?R!&qTespUx+1H?rM3~aZR7*j$dAKM{33@F^sgeVih1SB3*?xM9D+sF^w*$( zSEkE{Vi$2h3K|czc*V;QTfz(_fqMAZOtC=!E|k*0M}UL#s^B62ysFI~jP{au6KULE z(dbeX&g4)mlj{xQnH;`ePmbN_dMwgm;K$$$2BR3@29QP{1C6F4{S*dgF~FO4bQ}XR z04+^!*3!R^IWJ@ICm7tt038ctrRXOy=7IojEN5@hdQ75uN29-k_F?c%44#GnFEzo7 zTlm(nu-3Sb6W$iXIidd)1Ki`%BN#9k;Oi7LYFGNVF!%}v-@xD`1koPh0P)W-xVM(W7sGRN894D0u8Wli zrx#M#m-0Jgq=47c@CpNbnF8Jf(2=haU|U9Vjk(-MgODJ7uNJTX3>=f);A?;_#VgHZ z%%qRyQkqFD|F&2zTwDMiA|KdGI?z- z`y#KsnkVfV@S_{$iYD^ej8twiF$?nsvjin#s#-X+D<^Y3(3cnFBjnE-6Qq#$(cV|3s<;K#ua-b9% zWJ81ZX4uf@J`9A(L2wsq?<&~`So^?wThV?JZsi26OL%^453U^Bur=Kc;X&^SHh!WQ z8+pHWBX)W_&|V7kuz?=V;*U_O) zpdH^^#{1HEv|hhqtjFW!0~_H3l^TP~M`qXZfl_@pTi?B5?%sxvFIZYOEG?^DtflP( zOA8l@EU7mwel8Rip5Z7y15$ph4bv^qUSn&ip_6UsT(g(D&aho)iVbJs9-z%xvc*_i zZ1u$r+rV~b@6zz5y?xtR4;7#-b7aFzu5XtQ=hqrp%af!|J8NrS^RTujw%aVzJ8{U16*tEV@eeFY0nKAz*p zS#NwbU+Or;cAP4DPZ!{was0eQkTnJi&3zljKIkZbwze1Gvm7<%MHN?L-E-7G$c6{+ zsfyNNIP8ae43~fBCTFSROn!k1S5=#)HqK=JM$b}SxX4+wb+e}K#i1P|Wwyb41h}r) z+S`+b*kG~w=!WqqG`a02w+oQySog?c&9*aCaz(Kh4d)EqG=a4YsTx&)>{ztrR)VMvimsU2rwS z*YH54aeE$EC~xFL0B(+znvbx}N4QW+Db&Y?`nIBnxM+JRI>bhY-lySCNh}E6Z|;Qm z4txzeDt*}VKxOngb|@oU@NgkRmU*pctS3Yo-mryn)(6?>;Jqm}dY%gtCJJmhLm#@D z;L8N1-~byO*lId>&$iig;sM?l*->fyjt8nLyugC*H^AEmu96K=iM62_%nB84p;dJ$ za+Hl6y*IECIlCQdA*>0@)X?(T*CX8i&fr^vYvEG&DYpC6`-h65X9~dQw$KW_VT*F1 zw$<@M|5Js~sim_!YD=AuOcdTu-6e1ThPQt^&~#s~ 
zY4I!@;R=qo1z!4S>&4HpoMn5@{-|~It*RBp$^n?_clzJzUmYoide&Yp^_^k+&XoGj zvwi1_q4Qkp!BXo{w)JSK^%UEBYGs%kyMQ7LFWRgh1KwGGnGK)$aX7l8r`pC8u#TSQ z$Ihz9x}=3E6@(WEGhxQKvhdn=`j)DJXT1%JXF014_|od&9F1>ezm{FORBAfJHXT|Y zDmtEAQgJrNa`5W|oZZb?yd_JBwS-oNHZ2j(V*5r8#jol zIPy;`;0?vsbZoGHy^9SVhBsdVvDGJ54;KRcOQ$(!pyX_0oo%a^Hl5udTGte7%9ZDr zk!_ng-Z}TyIY};rN0&x9XLwb);fz6P?7(}CcN*92rGYVaV5}HDw_L-ubiMP+Td&~f zdB)hjv0}@)Wz%+jboIhUeFu=v>hTS4d@X+O@J9E!?Y2X=M>!bpg$-W^*Vcb~6j1uZ zrN%zCv2T5VZ5)1}qq+{Po&Iz4Th&k$Y1_u8i_X|~bLU#aT6>}S$z>zF!&9p7fS3E% z)SLCaT&#C(a*Zy;4lSR7DrXhXDwfqH7m9;5b*b|t3xBSs2(nVkF}CFx=k=Go2U+jI z4e#Og;NvJe;#oGqyGJ0VVnb{Uc%g;V?_s??Fu1+AfDu|f%jO4Gs;%?ALw62w-q!Ud zkSA@Pf-8dL<}cN^vh}U2U*4$igzuV&R9*-jULPtvdA@M;>4N{lN5En+;O$T|UJph| zRcIPqpDY}|P&jt6;J<_{7z1qh&QN3n_D{!HSNjL92$&5Y`rEl!yc9de#*S^Z55Di) zY=0JnO&|!uCIVux%I^VT1F{F~Ap2Jres_Ur-d5Jux;l^Ax*_uB%s2Di7tmT@!)|%J5z8za@ zD>NTnHg4B9mf?-1{xPLsINEX$GtAFh>X`ySQzKd)sORhNUikDmmHe3g`eF0dIumi-I zg=%RnH6LZ0kCJz&z#>EP*l_i(b%R>mix8!mW9b}-nMm6^^KZ?sbrjBjZnNc?f;|k2 z(iB@7{>CWR(S=Ob2GVY?PC-=??;{|v`47@Ut}JC>I{N8N?2*5 zvF||@Na;6E-#)#PFSQ(HTaLb;+Vo!H{DGC=M=B+BOGgF6Ag$a@U|t{{b%Ob`c7}Bw z{lL|^)zXUNCZv6U^&Ys_3*(7+wmtRB$9FW!h#wdNz7p2^or~+mysCjt|3KT)bPbg}1tna|ef@a(4_SqY2&$tZECwWP}@FGN2Pkgr7Wk1ucb-DMg)+ zcX@YI@cd;C&!RR%#|ARp)Q^0v5dM9)K66y{_k#)uqiz}EWf zFV7`1WMef!o1vI|bxn9(=t)eXU!1;)fffUFz6kqYbTh89v|y!HO)KC4vm#_!&Kn2DO!jfXPbuZCE2FaBx_q=eP07b zJDt}Qnntc3XpBVKI?9owL*>Z%i$Y{uZ@RBRn4L?idX0e#gm=iZy{piDnr%Pxev)m! 
zK(e;=wM)ad8t-cWZ0C?w)3B!6q2RHuUcdNV{e6t?M7lLK+Ya}R2A)6%)|QA!f$}x2Tm%JRBuTIx!Yf0!kL*;FWDVsB zu7qy;cFZJc!Debn)Jho~%W78d{RbPS$=g6=hNJcLZ{TbbSH@w8;chsSfp@>rcMwlp zo+L}iYvE;L*oD)$?!1P4tB?+1+>f96i{l0F@@M|y!e{>C@DZQ<4B(;(MG*2Kt{6fG z#4|&v1h$yMYiRtyewc{sg1qoWV~@ZQ1b4=TJJmuiU5!04lKR4D4QV|1<^v=Lz_U60 zL^OO`Xksp(&gA0K*D?j7N5&A|=|-ZvLkZJ^H!pbIVS!r)=}#aDlt|^!Zv`BuP~6v2 z3gb^G{QFxf{I`_j?$xe|M<125E_d{!fMc|sf*LXWQek7oOuOV)g)-Go!;dhz~>mB8jp@n zOkcVb5vXhE;f3idk*V=mL>Q9?M0N`m&*neEh~DHP9L@5GF>Dwyg^llUBj&IPt|e@a z6of62!i(l`!8_&=Yq*g43*fiHZ$q5@qA_g4yCZC8adyNx;CF_dks`zwBeW=n`XWUa zja8@?@x_RDgum|s@;r4L3S`NZ0-m#37g)0%_jd&sj zkpivcDx@wCS5rvXhxZCfg?B&RE5kMDlQ&!&sftunOXPMTwhpnrhs#6jm9}3j2=hqe ze?*#kq^SuvX#Hq}zc$>Y`2+CRg`44*!+4b2g0T8U6FPwc!oqT!HV-Y)#=lnR9=J>_&NSGJZv^)~?Q7jLh)-ivTEps{b%+;&Y zLR8F2J3e)FCfdOgp_&dpG!^EjXQBvXImVQY$CeFkINDyDzX|6i7w0bL#?V+DKNsPY*qM7@R^m2T-Xes3Yq{`;49GL3gA;0F?XE{77qW?F-VoQQ3(HV~)XY)pH0Q3`=M-5{(E4e~CPr?+|TzAtFYiquPMt4Im<8 zp@PLsxmG@f3sHa!?6gc-HaIdBRsub0jd@8+wa{$Ja;*hc#D|;i|6XUORU%eiEb$Y7%^@)ke?#NVBxPEnd zd@3q-PhX8pg~sKvLLC5vV)turM5g+$O-{%_P!8?7dOc$u9cA!kG}gHNg6vWg1<)$- zB%HUowAGoiHb~Znq_t_IfIE1^FlTwmrns?|r! 
z)u#EV6ozprge`1GUYt|&QNAM0N86)V^HH2j^HG^@%|~%1nvYsms`)5ondYOE9@Q6Z zSKrhvir5M@44b)Ieb4@?DGl6pW6%<-d6qUt4)mZk0984=YW7qbHm?>3r zvanH>tt>z6_0YsjBvYdkh?Y#f9ua_#GZp#KvU3fF56+76CmX*Ju=`qBJ&2 z!t&9pLIhh+91llEwGL)VSg_nVC1oWG%5N|0cV#$&Do68T2iPq4R{s2aAG|>zyw`D+DoPUD>!d+01A~=fC-NE zI-d5vKV8=-|2DVLZ(}+Tq~C^gfcYEJ)qW&*r5lE?F&ZA`ax z&~H<^v6aQI^9}S{lWt(e*R3o0P`{j`Azja6YSS&OgeI+oR#tJ~0b1Jl_ok?=460R0 z)~dT!pU{MUWNZX9$6lS8x{@gnFkr&em~x4MfSYj=N|88EK81hcr4s>x(6B5eHC?2IR8k2qg&DKw36D zY*KR@sNEWv%cj^ir#UaDUZ18>q6QqZC9*z+w)DxeO`i(l1>%|iMSap6uY$Z@*JOL8 zzQv7didk#idY(qLR*lR43WpKSXZ`A1*m|WE+rB5-sD|k0|HP)%VnYg|2(~FE^cG3(wnirkFvH|g<>({wwuiu;(qm2w@r;g4NCphSLAkW-EJp*;hD|OQP_~3 zkMmq37uAq{6DQQq7Dfq-v*)r;earra3*&_$3-6h zqpwDIFv&BaiJXE!$3}#Up|J>}Ls4WNi;TY>5qVa)s5N7HD#E{Vou?mSZYD#>hd-1{ z`KMTspkC}+h9klyaX|A?Gc@o%0EUZohgdAR6X+O%wB*k+@V_O0t^i?+UAUa_8cIiZt90I zz~49AG#I$|`pn#Q!ylNiUX1!J`oHsAbY;gDy=VX~)Wf0g8ujod!7udV5c;8EOa1oi z;lZc|3)CHfvLLW}{1U?4>zpzUli6_y8Z(C3UmD+>9S#;`tSZ5qF^-3YAO<62AIB~N zaS8cV#s(&DhRtlogtd3elreVoWQ@@a`iR8{a0Ur5GU10uMmUS8*qJYJOlS}`Pry+}qPVl-vudufbK&SlC0A4T;n@$)e$Lry zYVYxln>0)t7^cqecm96orVBm<)z@yC5dT>PGX1wrqWOMf&wfM3hS`w?6~b|Z>2I08 zF`Jl{9|J2t!AlNUd4Bs%9Ojfwd33w2j7_#{yEwf~9yTZ?R4}gOXfkC;H z&G{lWr;!0E6B~A+899f8mW(-i{c1!GI*A~Grkrh3;WXazwx7JnzM749gDL}L)Rx=U zjrZh7PcFrzrv0@0PO#nQxaFWNcfzoatq4!5^`O*x@OG)xdW<&Q3EFVM_AQ%~Hr$E) z4Oer7Hw+8I)bDfpUH&rn)!(Ih&XaQiPR6F~@>tz*L6{c!H^!r{@)sfKd?hq?MU`U? 
z@BxvDBOog)RJdw3g~76nX(BQuv`~q=sKi>Z&Wsb+WDui-?p5?D&n;2X!ZYM_Q648_ z`;{a@JB1VFl(8MZHWs6WLl!cy^g%m=W5OYdC?kQ~cxXZ{3QeDxie`#0;sXa7g94Es8OPM@h&^KaepMJt zb1o7a4ZQN8nyYJAZ2DgNN9}91T?wmiodkd;*A zUpT5Z3I>ag8`3rPAHMX#ONroMB6u)Sb7)FYV`3{!Yo?xpXDzA4pU1?p5FJ zZ1Clzf4@#uJ(9O)`IO{6ct6?s%GI*!YFQdebvz+;JaPNTYRB>WHNoYkRNrZ-?{u=}%)C8~RBm2!^NZC< zcgK2r=e#-TZcSI#C*1YGTvSG*?E=$=$NFUvlqCmsO_9+NHAgrR!4JKC0E5 zDr;LUYfIzB6OcTC#VbiqA4}FIm9;I6OJ#dkhEjuHTOIsbHqGUvr#HW-Y#ilZuWL-z zos{ZMradj_AzwQ0L|xvdgu4-f8V|o@PL_2hoSnaTaL&MaYCh)-SPb{v6*r4-6esGQ zxLu!e53Ra~{`-cBB7P~7R_I6Ev0bM)?x&pXRGaCiwu(~$(@z5yyaVQA3*pfLQ>o09~SxrTQOcW~eg*{L{ktrEy|@ z$g7p9%e}}Ih4WL!EX|?Gs|RA{=7}h|m*5sOljxIvLy&160JbzVXAFZG!=a3!A!BF( znQIx3Mkd8fp%A$`0nROGkzv{-x}9+_xXXGaX3W%=OhH8U61T#Djtq0EM&)n@sg=%y zTm`=w+qDC{**4d|L-8WvZ;-I1TxnIhq>}x|OVwwNf=c_GW5dnWw90$_PyOBRJ~cNq z-;5P{bMVICqIvP+T3P42LmMWpribmYIR_IA<`HYfD4(sF?}E0YH3L42qxG}($#PhT zCpvJ;^RXg5yn+=3ncn|EDvg)PCe<%n&C6Dw5mG_p+a?=ZTS%`dF>*@L^$m7VSRo`-COy;=xvL+o=j~rAbct79#V6 z0mSM_Z@Qx8YFhmipJWSES7-g|n^1yq9r<9`nr%B$=*R{|C&GEPob2zmCFV%_vSSN)iGNuc70T3` zD|-wi2jX;k7Pdt-xgyFc{6sSd`Na9LvTH6GJ}FY)XQ4UPoh@xX!7Ol>i7?#qe609c zJke%bp36i4Zh7M7utA02SOvw{>v_8eHEqd3-Y>HA1tVl||AVjsBzhfcVUXzYkmwPW z%AU);>Ra|V?AjvHGm=E_y>nsqnRN)s|HUe-iqrD(dS zEYSXpxirN-PkOF{jMx`O5XH!R z;W>IEe)+EmSVBBtrep%KY`4t7LZpin81MFinlfn4n1$(yi0~3ZGB!v-#A%}3g{R@a z5NlM}jaMU6Dwn}4`UC|()A zs)aJGpURVHlE;h%f|>EF89UQ4ip2HEd;;SbuuC{*=p(Z$H2uP_kwXhZW>5%F3j~QW z^`o*{j>=}lyD%EM2%uBWtZk*NTraI`XhoclfqB^~Cei6VI>Jy&yUL zb5G7cpLSKuy|wNx-+02vxvLUZ@5Y-3M)$T#uGXc>q^mdG)=N}zOWN(-DCjRbWq|H5 z)pA5?IdZ#Yt>tvW-2_s+sx#&7le~Rv-d!p0Vaa>=c2Uwhoc48tV~~8i*L-_ZzN3=w zXyVv~r0=)!Un{s7_UMtyXk_rfuk24yX1EOM8Zs^-s^AX5?^_F_dQLMAfy!NaZPnW&`&PXjL^JzS zm0hcqUHLpjy3&`b>|L$wUA~ao{q*YYr?b&*N$>9a4PD@@(tH#2u=oAJbaPv(c|d9& zSUD#(A4)g2q#E~1jeFC9?t8(3+vdbG&nAQCAJ{B4wHv0wrs@YaXJf_unT=}3iLp++ z1bTm+ID`SoGm!KQuDd-q9XA{)cfI7UPc-aVu_WDxz+KJTHk@2b*W6Ij*|_nfk#m*l z%xkjv=*DSf(th!vNTL5vW$FK++o6>Eq~t!y=>JnH{oi<^;$$;-r`dMOWxCT{amsG` 
zsojEi;agZxdE`Hlag3BJ26TZTltGW&2Z+ZS-1?~KRot^}t{q4|(YI|bVoA0R(Y8Eqd*cS7B2G$X0wG%-P038;H+!zb+C;CMZ&D2gCkGbA6#MrYv>qZ;1DLNx z_%{@gADU8;1~0=t{uStk0RFB%dm;wH{3_D@p#z)9rAy0F&1J1c1%tZLf zNChP4ELgGQN+p@kySq(e#5 zKq-C$PON(;NT!1U`mw0QzlEHDU)5ZZC*`b_oV5$lRNa25Zhz7_0ML|j)l07W#imrl z0jc3Y(lwYDHYha=CS3>D%e>ewPIt;tCpqd8j+R?buEXbST6H!p6-v(TyUr%?Fo06o zH0^0*0t@Y%doy9JR?Hh@yB0w9;k<32alymrGqVRoqqjbB1G6juPb`8aiW@E(&vW39 zY5Qz>!dYJ9-y(&!=(5Q`=b2Qhig*{@G%#V+(wO6>u;t2L%tR9?GWo5Jcx&RF1Up$h z#7))Mz;ROy)yQo_7|I|&i!sxi>}T<6iTb(KTHLJcoyy@Z>ulM#&mdnx*bvR~4BDtv zGeNoVs8K9bx4l?`vI-x!thjES>Zs(*obJ4+b%H0dDC6v8BVd7539KYA!WOlxIOxoH zVP1V&o7FTyYsimnklllzqIjsXqFJIbhXCa-3e%H369h0*Sgce?N2jNDf+*UW9^ju* zk`qv2!Mqp=31hF4URP^@E^KL4Ov58tsK(U69hx5`5wA8UsISLu9foeYL#~;~r{FfK zT$wP|8Z<0i#ljTcBy@` zFdDi1?gtKAYQIc}_C5M0Ee9}=5xK&wm^U94Y zi;at~tyKi)o=n64=8ZQO`9voZO89!0_pe-&`k%a0FZG{E_|7CeXVN9! zg`?lDK_v^XF1(SbY+0&WeqMsCxpFY!9)#>@;n{E3QtZ^Cb*c9wXQHxm>H5k(sr%UN zkktKTqVmav`$<4wl99GZWi5%a?)A!=#hRtrAC@hD?T>tk`h%;L2SKAQzPRvK!rQg( ztxb8EoVO$C-Am2>`f}$FkF6B^(V0ZxiB;bdD2s1W5~KM%M(v0F%dh{@0jcFsqV7<_ zdk87~{9@I@;e@A+b@0Oj9~@W|*L>}BPo-VH1!2`yzg`zuoLr12>UPaL)0L=v*)IhS zCMpjm+y{}rrg7euuH+YkpNuUBKb``rk^3v-iXv4p!Bi4?=z;(Tqnf@0H`fSmXG$q< zxKfRE?2KxF2L>$~Py{m3rL$DDlQp%=ilxFPCIYemK$%JMdma|VvjFQ5#u|qpQu_w@ z5&~p;09tUx&H6E9kcSd1di0TKc&|f0hSdjtZ5BWuV9^BOa2jZ| zZk@%6oGAl@#f8G>@xsfRtuZ>OJi>pD7wCo`&=1!&oC-XCTK!gJZK?H35(NElJt5II zzph`0UfQ=yaGXGQTt+qO`0;C=wdt1yYFbB_6}KrSo4GaVh#*B0>^RU0ow#^mR+xxX zwQ!#Pu9IC0Q{jBVcB*q)^XxC znPMeKMa3323Brq%yeM*QT#TYu%#c1~RlON2DT84bBLrrouRf$w#G z)G4{U=Zg`Ta@DQ5>elOwD)s8*d(s`d z7S4X^uiq%-0()Rev_B}9obnH@`Uh`YCI890 z{=u}b>BB?6cPQOE@W-`3s9iYwr~WRCOI2IS%e2W=f9gH(`ChJ}4+d8i{DUC~sr;Wd z@0uS@SGGQ|nB4Xa&SbNv2M*==Wn^t()`j{=`VS$7U|K5Uwju`wbYSUd&&Qa3dUg<0 z>>%C{pc8=hbUSg14!`sm^=aT%ZW+01u1D0TBN9<_?$AFSq^stV>x*hyFFiCtdgQo4 zJ;pj~dPHhyoa3f!>bLTwyDG+Q@}%@M#)Al&+(5%IeFAQ~X`D=O2=cK({4|1^TxU3!-rABlr*w z@@ea%pIV)jDMQ#!9BJ6`IuTxlVdv`*h>1=?`FK0(EF?%q@N>h6L17~)-h#|TY;SILP5BcV%eH;#2unHf1J@Ly|L8vpEQCX 
zM092|ZdM2pZI-S>x2xOJGPMuFp`3NdZyjH(ArDre0uN@LaVLa|&dZHD>B`j-bJqo1 zU7Exre+9)&Brk$2L%+Y8ew0qQK>Sz{lk6&Tdq&ejFinkYO3l9tD{vU3PfbsCg4v%6 zhlDUVYdUc)X8Cin6qjubWU)fNafMdH!&D2aB*(l$GZfE~I=UpvGwN}$Y*V#bPF#qU zYWl8&tuOuBp>MPY55+uMn5+Y9_jN#3Up;aLB*O+^2EwSpY zAq{9!16tp3hzTx&r8$CCAGL-_S7~9(B39bW8KV$^t@F$)0t9)Su#cPpat@HQpPcJ( zGOkgaepQaMk(Fk~f;k;~HDkFXOwU{m7RfT&?@&@F4)bXQ6-;c^!Os*=D|Ru;aW2~{ z)8Xr*B>2eK#Lz|D4?wT-X~R)gNim9&M@;s+;Fr}>*W>_Zd8bgDP_jX)gaW1_IC;m8 z;AISxA^}Ks(XwAzdi)N8cRXH4r~Z_-GYuG9d>Mdv@!QyoOnmIEnRBKc5CQun&|yO< z{~pP|C+XO`vO9I)jC9~knhAs@XVU|danKIJ!p}nzkegw*{$t<22M)@r&)$Z z$=Qeu2Xr$$P7U`x6*o`ZIFa!6B>GO>*}djDH)qqrdK3L;?!36>c^;a(s@l2Yb%!hE z2&_2*Xr8-Dwme%LPP+P;y&chF&c$1%;!}yr-h{gs@^X^K1f{az(zB~&-ArDR^6g#q?S*v@j0~A} zZs|g*^OV$iD(OAFUK0QzSzZTCV{^~)Kq7E(et6y6kZ3-d@E-k~Ggj?S`vNInkL2rF zu2}Q!``m<}^}Ha;-vqm(>YBykL{%4MLeJ_J4<-D)l)Qgk@le7RRMgNifBMtP=Ji1E z1N+as&7Xq)tl<}jrJ6o8x4CV>{xcs9)t&tUTq-7y@F=JMl#91TbU1&SJlbglpgCV4M2B<6TF}S}j?t7k1yZ*2~1Ub9@SL zfjBb=RQ3CR2L)``e`a4pqqOCLoryLE5PBIwG1LH8>QLF+mZ0HK&@zUwnF+`0P=*e@ zEtDc$uN4~Iftj$-zhFU#Y%h+=tU42`6R!9!QITezOp4xo&2fd@y8N^yQ{njImNtsS~F#M;T9(ya~Y z!^m6X~5ijSI4nvnIKJ*Ii)5xk8Lag7aB{mNW zb`23_JEcDeY9_HG+fUT31|NDkw`bee{n+(s3Aa?Jh3D4DOm483m~ewc5!tulpqbgt z%O)x96c>b}ZoGeq_uP#KQ~3}kRW?W;rQ`iFQYU5d==ep*AR^&NcxP$FV-#?Xwrm7- z15bLH2DD+-(ZJAKjpVFZxRP|X0iHU&3EwWsv1`SX+JAm^|M_f!%SmUDC0IDH>S$it zm1;k{+I~12f8{O;FmWAE7!rw zxOxaXPub~6sXfqkYWtsETA`)Miu*oRQiJB!YGs#LXi#E16M(Qq;T1IDJGDs9rZcPw z9YqV;T)?2nqCON635sZ3u+|l5S;sk*i(uAvT3>R~X!WoZCD|DD4Z237S_*Lm=yjI2 z0C$|_PU9wFII3B2Mys?Ebk7-MEWk18_b6NtCGwR$&^Z@=VpUs!v$x79!y3`9&K2X* zcjVvb%8hNu+M$*sLbDon&{%`NC(d7Q#I)3G#Ps;%+!2G~17oBKu$cv`HpG|d)#`6CYgvEa)X@%%k;-3Y#2ZFS*S(&{f| zov>NZb~3FRT-M@_)t!_XUE)Axt+rxB1H&0Q7);iV)6rojHohRMks0i6#_S1ka(*4o zErTq$CWFcoG2@#JF;l|;A2W6E4MC6aBTCju4$%s-g1ML8upkxHbFp%tk!mDZF$xZf zPAOzhBQeuVh8CE)B8epgf{4sqk(o7SDz#i1Ze|y|$atlj68DfpXAf2FHUV4V$8a)5 zDvFfz5T&Q`aO{>8`(>DV%I(Y)v3KIv#;0)C8%YmU)=cKg{sbYh;7(*A8*SC>k@kZK z8$u3f21K@de$TfKrJbeVY36}8Y7tgcl5*~roV%BSagMG-)IIdx)At&8tu^i;s+v!? 
zF?+WbhFJJjC%rdCW-wW(X2l`fC0Vmj8`@mLMRlQ-d@f8nDj;hjm3y^9_xpIi8{I0uY z!^gR6HvC+1DQoph$)ey=c=^g&*OA=D(HNxZFxPIWYWH$@<@j3F(Ye#<<{oDAmu?Bt zZ&SLrpMJaOmKO38&ruDd#hY(^`z@5#)Up^`oLFv5)bE);eb3+eC>ured8Hb?Xlp<( z$V$-$D@Bu|regj$dO_#Kn3dvlNl$yaZ_oVDT@NvaEp1C}OaA5aiKhMYXCJecn#ZkW zuUyM1i1Vv4a@|J_G8bVEhoYf8?m?p+{wr`h(GI`_p92nKnQna$b6+1q3uhrYCgVO0 zj%4!+PDmb&F$By)J`aemL3mhT>6|>tawNz}*f%FVosoh?5K4h;lJ?e?2lYT0f_rVW zc3dZ{Mt$7!==V^?i@^K3hKgdvHq?v8MG1_037gSMC`7}B;HO?jS#)}5%OhThYIreP zrbZ&Ij?6V&pwJFvnzQ8zTeFl0#>WOKK^ZZeX;8)iw1ZRY8;yuYZDb3>i7^HHL$XZc zuFXx{AQ~`^e?w;&z0sqDbmuQe%1ozk^k^ANP8#SFqV?(sA$^1Tp8ZwFjLmobwS-yokkGBqzhovs276E@u=0v3)sM-9tB?(QRvr)KDZV)4t|u2YRB@v+b*{b{Npl7qMfm-4HdVn_8C+ zN=^Ib&!+qLkvV@)x{Oyukxfa&_X8(TXjAWU<8oi3X%A8onr~fow=SKP-23n1gs!`S z9fj_csydg#sjlNv7cl6FxzoT`B-m_UEo)yk|FP=_uG{8h&vBs4g(C|`=daD1k)Xtv zsC`y)Kg)!7Jvb|aRO-&R*T@7VBUvA++}*mQe0$qZY6 zR#r)Egp~R0I;{8Aaaf#{_+1!`ZbgRc@~2z@$rV@(CoxW{>M3hRj7E zonXj}afP6>cCS>sR~{7!qtb@M!F9!RPa_g%w5tcC>VcKRcTTTWKRD0mg94Vmm)Thk`t?gtbR-BV1$QwP5X%HWK{l)}|e0WQ-;|Iynzxcl}xr zg0Olv`V#uuUNhYPUqd z8#ZN;7h`cBMHmp`XneSwcn;!V47@fI#Ohye(#c^Qr22{2Q$_&3BKWf6Bl2lP4=Yt8 zdzeolT$Fs;+G8h3h}YZlWUVt{RmWW>o^s0rL5@u&jArd|Teujc*{!%q96@@*E?XXP z_^X(Oug-q2Husm&wk<2f{#C3HCfL$eR(9gD!b}F!WINq|rr!eLxvyjl zOTM%*)C7yyjbBMiOTUVi0;kBBoS=KuK5ErmFj->b|)%s6^#@kev=4?^>XTi9^H*H@{(YmZ*o6`&TRb)2%(ryKcEYC|YmrStz>K*qmzYmm2$*&#pBdoqy_H zOZ)d?AH|kC?mU-lIiK(}J#cbWfk!bgxM4c6`}kvsFmciwP4C$)n?7_6%$skzZnzfh zlDi!uS_*ARtM?@B-b?+MWV+q&Mie{M)#Q$!sLq2<%79f!ahzW>&@-g*oP zv>pS>#LqblXgSTDpKI%$8@lUkeo*-s%L2s?B?IU3_*%VU(?Lk*Q7HN_|I%s^nXJ=7 z53ag`Oc?+0-B&ocShE)6W{xUX!d);s0NmZ!0u(v9tZ@2QPKMEz1Etn%Z) z5$>6KE>YibuFxbE7~tkzctlhJ@e7-G;E*Xfa2(#YK3NU}@qkWSpKWqc$?QQqc;~Ip zHn}uD1o1i?;x@T7K`In$rWk)7-$0nT$_zDSfGwj!#j;coFAPHkMGRENA)l>eSU@1f zQKsxXeKSNk#V#XOf!Rf5N{&y3w}k&0INv{f>#*#y2~@IbC_?Xf zh9Wj}irI0Bv8El6pqA>#)NhjQq2IU{#Qb;cvP4E(ONgQsNd8TtQ_ymwU?cShcuhJl zXrMOWtQ3qO5YoAg)Fkhidd{Uj8l|hy*f7WsTc||ETmqu&ut~1rdSTESYdbEhN_ka( z$&7bpJ17Wk$ERmt6eXK3v-3n5!$C%4{S^(rd?n6tywUZz77^bGiZb!(Eprap@?*qj 
zELg@-QKYUDjUm$({|3HT?+(MNsbtmGG41M~&=?OR@7vtZ9PatAEzH~)#hn5R73@UM zx}$i$>D>dkzU1cV8>bh<#UpE;=DFkdT&457e>=8t?ptpy?p`YSXx~z7wQ+yC(!X%- z#rXZ*0Oy$3854PW}gQw#)}X(OBQ(%!k$p zHX*BD&V;HnH7iP za3}+HI{Dv0&orh|Egg54X$Mi&yn45-;RX-wUNlv>s{W3lr{}qxHt4nfIND0BKLT+e zYav9lnh-Z`(P?kTBin2F)mlpPLU*|1R?a_o639&3c03&n;9m)m;VLlN!$_`!hv|O^ zckD9DR5AYx0uP@lVFk0jjJqoG0L-Hfm1v2nMRtPi^vDW8I(90v?TmjVi7Sj^1lU^& z(ea1~=9U%*yF4sYh}(n7YEe$la%!1`Cn?!2%3gxw2%&Mt#^RGgR3}q2W=uGkfje~S zDIw#N%`_(LBSWD{QTPSrB_0`>WMV+1O~%@)AR}VNV{JPDzx=Zw|ABzs8PG@Ib={!I z-;aGOw&+RKA6TtFkS_D#uBn@+f9Lev&^_D%v1_p+S=M^bUAbUhxSVtcKe@CV{^Q9X zOy2%lvg@gJw-*HaP1_CI!pIspJY+n3<7~>)EP0xjno?~;Qrpm)XXt(#n{1@~~K_2PHKNoe1kw+p166Y2VvC8t!s8&`Q%)-R4o zmEAwYEnN+Z1CqDvhi0&&yDLD?^G)(0(t9`oU|$-*(a{tScVK1D?R{(fQ)~XSbo*^J z9T}(wCs)BQ7JOn(mUk|Bitvo7A`)yfG^2_@OyjxtkgA(7{wH)0yIXBpxRd zR=jKc(KY|EY;mZ)q$2IA1YdyD4SP7ZV!gkDnaG3q1nn8be-^_ntFT_@#E-Tkjnv&P z&To@KBa+0own?GcIfzr+q|k(p;y1QQp;?iNPi>PzQ~rntwn?F>Rm3)kO>O9PGk)#( zZP+GNVkwd^n_IpjH7WVUcPA^eE+uDihVi zl5;ri!r7xH$yZ1@4oU8z zbXiTRtW_#&T>^}PK|rn#JJeH~aMrAs$)6z;{t_+DGb<-4 zujf+J>FJ5pIOuu7`v5(&#S5ZEgLqUxao>5-4xnJ7uryEiQHfMY@1p(8K&oaf<^;+qzw)` z*_CBRI?$?@hT6rnpXK^MM4JslkIkRW*U^GskVVV+krWjI~UHe01adjr6H3a zzXCi|S_pucUns7eotc;zX5diZu;c{;IUsU5!9)%YfOD~zo}o)tmArgQKy2ZsFXn(& z7J~0UO-)B}B`%~R(Fnfrj9;0oq-dfhl@CWT3_zEd(Ko5O1bnL-mf7Uc%*9^#B6aZ$ zGCyWTxONDZV^Q3(7_?<96VtfISNPkgQr6s1=QH-N(Kk2QCAt~g1)v@FD!fdoLgY|8 z0q^VSogkTP#Z|(Zk!v5eg)Jrx&CXXWf~4GPW7a?beijun0Pn1nFA-aF)Gju!I$B|p zwO(GADi28Ifwl6MRC%XV-nmxZJ!ell$`e&*CCAxx86a=%y0aqXte2eiivegtvXK3Y zVT28HVNvP5%Ibv!Yn9M?w18kO_NIy&q@sq!-DD*_lIlDvbsoLFH`#dxDiI`Z?^*V( zwc|oAy7SB%{G>n8{lr@P5vlUX?Q^S@CrGc-ykRnzl!7R%u1VB%tyOhz6v7X4=3*RZ z+9wt5OIHU{)$LMs`&xD9+!=Ieu^D(K;cQ)Jef!0O-5~tFYfF^1ea=}>r<;8}VZ&%B zA6)m;{hXC%=R9>7hm9g6`0o#DzgUJ}ifx#?AG_?hv*gDIYlb?xf912o{fQwk)Molg z&_Hfy;CO}UUzZ!;GHqBMe#0!v_-zp{M0$RcjCa}Bf7qR;5D)+tRcG%2(__hz!}N(J z1fWkiR1+JjVI5plsImMi1QaX3NSQM9#eGV&5mbPttV$dMXJA7EZ5mKguS^|h2M^h| z-sn~WIvR-GLtn*vB;&ZH4)*Ce`Qx@*VAYod0ObaeNRrx?cji{>R#xGALilq`n=Ayo 
zfj4aapHaYHki(eDQxs~FK|-2B8Er=t4H_&5_J7!z^lJnKUO)kW0=Tcp zmvA&JPOp@%HiJbhUst#KeSGC%;Ksnh^vdyM+0om*x1UXvolH1SVrzM77p6ZElV$z) zaP%N{BevLm+mft2k#L__uWU}Z&2#sUbr`5!)zO?-8NMT`KN)6XDBXbh?v4{CUy(m7i9dqRQ zH2Q+oP^86?sd};IqkNh`hn1;`uvn`#T8foI>IL?&l$uxyLK$=H9c|4W4~@N=J!Z>3 zC`tSh&@MQo5HEtgqOUG8jZS!ajA`YwhjGPe{tdeL49Rimm))J7bDEfEcc=4_yehH> zafeT^1&^~*G{8(NB#^E~J}X54P`;kMgWgC6v-Lz%FNKkbgKSXEED*>JM7{@OKfPs5 zA$WL2tSnV4^jmn14W3u`B94Mx0C#l^N2M7lAap(QD)=edBY%Wq@D--whd+UwwEBR7 z*!97vZ9R=@R(`X1+-&3CNS^a*=9=`}hE zJMPiR>F~@1y*o!oU&A*=m6*cOQB3~msL+UP?Az9|kvy$wVS=3N zxB+pcrE4X=$$mFpH1U&D*kGwzHg9tDyYaHI!Qxvww!z_Nd1#Y< zHclFj7%T^u&u?(_yYk%a{yUx@4crlLAC?ZD-(-OsCya+I_S=mc9DeTLQ&;%GMf$Iy z$ff=bj((T>H`(vTMU&Ceu;}09@bj6q($apr>~{1=)tek%H~L;MSoSR<3;ixdmyfNK zEuXyg>zmBK5jCDNSeh2ZO^$vyzGl2^wCr9i+l1)dvitKFEEdawjbfw4w-8M<4dMoj z+h(Z_oj}Nj)sCK{T>LCYSB~8-TRHiIUuWJsjWC_~Y1<}6ZFngex{n{6lskXBapi@7 zUMwf;Pn>)1rvpl|W{Y2j_(@!Nev^JTdulE1@_f@&)1uNI6&P+_LlAvgJi?9Oo2(;{ zvPVo8jmr19og=u1I${Z%BL!hgr0`-vxZoZ0h&5b@Ff0CS__IfB5jzQd!w&7uslCZL z9pR!FP{BoaxET4Jb2!5_QiMB~T$$2yZ$zg074Ra17W!1*mB`e1EF$EI3u!r<@VEJg zaDK-C9Ee}}c^9z%?*L3@xnyvq8eUTI9`FO@XU`=ln6g-n`Ly(p;ActEq+u4dJWluO zG?nUuz#mek;aKqXr=B{?pPv~Ui-_VyFnQPc@yV-@Zpl0ukckKuEgYbzZW z?gN-(H8ST{|3g=LO1zITrt~mcOMcCB^)5olu3nPy=pG#OW+g(G*uATIcfI0{W2G++$vMYh(L2Y0I&@Aj zWX~@=ejBz)Nw{vy^NFVB1^=duRnClA^iDQ>(~0s4d2V^8Dx)oG5LOLAsV$FMvV08$ z&Uid#`gj4mzk08Jsd{6L4*(Q|O~0qzaSbT=#p*J=w>K|`M$Kotr1nN>-7d6FCzXEd z)Vy2TrITMxsnlY4Z`U@ntFspM$1BcM*5N{eqT7_Evr={}%lgI2V)rxui!W5S&d9_I zs8-y|ypDUBRW#1U%{lX?$;rdED?{k5=}}N3XdgZSl<+hP#%29L9k_`Oq<$nU-%T7x zIHFJBM(Af~eqFsD&~eTt^6oCx?-#1RrFInCcRUv6HJfPH(K7 z-=#O=>T>EMmQSwWEqePpoM55M-8WL%O>i;=^4DkOTPI#b66MR@?2}z$Ap%HPfrnAQ znNsyO3FD?@spI!a*z z{r;JE&nN=F7P@^n>Fi0Dl;8B-@TE!yq>_P^!L^c;a|LNvdCJAFy7+rIwdt!#*EFQP zwI3FLQ2e0K>;d&?E-J=7bJc!WNY^x`YP#;$bm1DhGJmQpuvQjW3Z~iyrMAIT+mW@l zBWX{q{N2=hUOrvd`eFQoc)Gsrd$k|c-V5}m16`@WUMa9Q6&REPgXy+@HR}nf?F4TB z#kCBDxR!yAC04)y)*txr@CSzzzCCo8UBid2f8e^0i{qdc-zU}YTdUc>azv^*NVm+@ 
zf!FiZE`(FHyQSLQYt?&J+E%NF*tt;pT5ifqSG(OdCeEFg_FhPOpL<~1ntZK#=zet# zm9Sr`-M@x&mzP(oPqGsFC2xPq3q|+9N?+1@NXx4wT&q5b8L4VWdD|o}?m4C^6;ttc z$z4d3|#C-&00Ra<~_WDOBZUpA3TFE$M)dzzIzy=>faq*dQGaqm$SCaBcsbRMMTY! zHJ)PbKQxMD@QEK)vP%nkxDvQJJ(#PR5OSj?1YskI=Nl0^*|9@AtlSpNh~7~S-& zOg92-?X6Wq*P2ccClOq(0fUZqs&6B9u+3VUsUXjt-5l zmP5KW`qF|TF|OLCC7O&W+Y;meL>9e|pF=DLJ?hLFNrjD|UI6dtftr6CLpeReBwTNx zPa^o)DuY`*n=?vWZn9x zZ4R5+s5>4vecZ7l^;EyL9x$+w`iCuf)YD_OO{t5hr`#Mm*jw$5l4CpSNw;-sUQkb8 ztQMl4bm!}`4#guhKYm*M8H+oZIDpY9+mzMx@aVryU$5xdvBbh_#%o3@`|(R)X2Qm5nuk<`5oDvBKd zYGc9BhYakVixS^L=-V!`)H`$IOv=+OdAgVD zS3Ui7KNLO&m8$Gpt?c_0w>wleE*(u)b|>83^cB1Ud}}I)lp9zpIWbr8IfUD#^A}SU zEo&7mdej=S{r`xnvgS}i^wrNCN~mA*_bZeT(Ob)Q$$MbsgycPzPd-VNIEZ%`y(W?= zoAAt<7vHbQBdb2c)y35aR?=$Zm0q&Dpt|;g=-L?M+!b?0vf#d$$+(1aTF({a5St_m z(nv8V5HBF}9zlvFl1R3*V4JKTVVNRDUN#n)m>3;}p%@dAA%=TG7M}YkCpqua68;V% ze(?^sVYFN9iyc3Q)Yrb@eZpWVUazRyu+R&wl9r7^_G;zq&6_s%I%GUyuvDy9R&QA7 z1*z~_$X=~nxp%|HUhP~>VDbD1XEz+o>*V}Ri^o1VxKYHs#awm6V&exj8!qN`Q$;20 zwUo10ESM#m|F6r)FUx|d(ZM?5KT@xM28XV=0k1bkhiJsE&?{qXKr#hT7Q+^VX%R9d z_~dVwES?m*BG;~lqD(85u?R3AxhDKmD#rKv#KdG5s8`|o)oGZUh+Si^hN7c0(Q&{Q z0=yaip%estLgQU99*$0<74pmw$tvIV!{&peee!HEHjv1>%>M04V$7V{(9kP@Omtyj zPN+aC!i5lK&(U{zM1uw_t1VT96T z&eA$4>xm8thY*4PocIkm8zzInuvx$v9Dl|+{+w(2bFSjgxwgOLO8*nL|F4ZUgYmC9 zIGg1B++&1;q)0UIGn;4rg*994+Xa7JFj-<~|BNH|!OM-$84VQ+)j#LxmsS7&0Ujo- AApigX literal 0 HcmV?d00001 diff --git a/entrypoints/openai/tool_parsers/__pycache__/seed_oss_tool_parser.cpython-312.pyc b/entrypoints/openai/tool_parsers/__pycache__/seed_oss_tool_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..109336eb724cfdda09bdfef51d960861689b569d GIT binary patch literal 23616 zcmd6PYjhjub>QGh;z0l;Kmr5_62yZ9KoS%m67`^{NTR4WWlL(}uuf=*14*DkP#J)> z#0)pyt#?Z|?uJg2HI>IPy_>D!L|ZdG?uPDmPt+zSRnPY9VhB2zOxSbi6aCoi*&nb) zH;?|t{)O3iYBCOS&C%4Y zaJ5i>{CX^Nd}jJaGM-7yq%I;rIwQEYXJ4L~Nv5BOF|lc(b|jw6#71XRlL&u2mP`s} zl65|wPRFjrg&LB=8lI^d-!|H;08=`6n1o~&hI5e4#GmtVR 
zQi;q&a^~7Kk~Bl-FV8$1PbKE!%%nJ0F{~{m9{f*;{5~vIoWkXSrBRJ*X!W?3*2HyJ z)eF>B6|H^IJg%p8@v5t8TK}RJmZ$1PYTQ6q$E#oxjd4>W;Gm6jRS@dpW-)eEPn!U? zhBgC~4xp+K>Z+Emfm{p8wZyGZVgpDk*98T*CEI1<(OF0QEJIX zH$eUR1?r}1yx|fRb_%tZ;&J*yI<3sjoNXu#c?UBvTN9J88pCt7Lp?jP1VcuiEOlvu z@KFu_NGT7#tEH%~t6rrtm`i>yX<6zIf$CY6L}4n5rKY5PAc4X2d!@LFx&{yvCCY=E z)D+_1@dF4M2SAv26hjgqV8X>8t%8({g;qn#AeU(%MRqLa(9qurlU$~Ql#H9!L#jqD ztAdneTgob@s-es#r5-h2DECRQ%5y5H#HAQO$~~3xVCG7D7uvK-xgQIsJPN7yHk=MA z_haF7OSP0|SwHKgT)^2b#g*s3HbZ_$NlUR8Oqp&eP8%eA|3;PRk#eW@NHOIJxE?ZV z^Xf;|e#rdHYszZ(DQ!wIR{c%b`mE-g)VDR6sFXW3AjOpD_ewhoR zw*Il(Xnwpl=$gmI=y^X7$x#NEaqbc_ z*a_gU-lxu=^Fm!XehzwN?&+lyv^O=A@g`D}$yqw?-N}5tGmJNpNqZ-gGwDR?nl}@> z7M;_DqLCwELB-DLNh~lIQF>2u$_Bu0D+&kZqGbar0V`u-cNPNf=gyrw@5OO+OXI-9 zp)?q8awbN{X>TZ)4oAa&L6@Gr5od(zi3xJTP6!PXOgtUWOr$eRJT?tufLJUO7wT>> z@#iMwV`d`8FoN~+g$w5VvECKU5nootV)AoMp@tFSrkhe z6;Ng7#3H>YUkTa-NN9p#Vq!8GOQ$C$kUCPZr6wlAIzgL`C$BPfuvvr}f=V1a^xH&| zIAA8u>;r%_p7dV$z3|&-dNR{DdTu0cbielTJe6T?-Iz(FGU=X~8}U>u(L?Y;XJIdESyD2rM7$6Vsu6q?~Uj`7L~)AJffzqz9jEd@v%W+kE(J z%~359iFiZ}L#K=sVHE!6&(cPkyLT8;lQNttmmz#51!@YO*9{VIBo?t|;-I zfe7evs*GGZRXWEJO!`T(?G)DA@uStW#!Z2Ipa#)gJJ)(Lo{BTTDZMf8Y$^e26dW7C zc2Oqxo*sz`I*d(>{n7<^!Wuz4I}5vChv9&rxgNjyri$4QMQQAimpKF>@ZxJFq-C5c zLC=LMka#sn%{Bh*dKG7NAb zamo`ZI)0PEDHKfC<8X8`6KNVIXgX@gnq-Z8NZnk%v0%^lz7g;;bkHzls)1!GXi}sBPPZP z>57z7@x1>W)=oeGl;iUgj=~4R_-x~wy!dqSt!?;h=7S{P2h2!>^9S+i<(ryCIG1NZ z`*Mw3%}cnShi@j;8%h4V)h_qmGKnkrretDP@>scz z5$bacMa>&oBW*-e0;7{+DJ{edm^W`?O^@iQnKeOA&Crt?g$uGK)~xhV?z@GxOvxIw z(kI#^K|miYw1JEZVP%baS6GV1mMt(xTGqnWuvIWqrP*c;Yz;D_mtp!dl0k<8jYiRu z^%+?cox+oU$B#A8mOEBSXSidNW3^HYcy`^GU_fdko02&UD7JAw6-wE9_4 zu%Y}Il*?qUluyarRLNthQk|zC*#oWI18stuRI6aTyR=h*Y(XWL371u8WF9QxWK6*C z)z48(6|1IoNTy;7kEo?()gqkKBOnV|wV#p+L>Vs{e55v26s}pgYD~~(Zh=%?Eio8D zOBl3~@L5zB(}M8^10pArNyO80otN?S?g*wkyoof?`DoBeQlL5|Qr^sU@pMmnlZkXD z8a4=3*JJ5eCc`kl3*BJGF?bpSj8H(+@cv~*%J!=N2OsCqueq(ILIYni7o zt)HApT}@mQ^rTB^=F^awMiM{~T9gPJAUlKVb!~PU4xuzCC0JqZR0VkeaHQiCH>Z>G 
z@j*ChbSMcLOfo$jt%T_Yws;o;5R!STV_}|mHZB@|?rvSwZ#Y|)o?Cuy-5K3*G%bxT zpILYGynS+Y^oJMTxp42(>jRfJ9nHMG@zq1G99oX#?OiJkKWP1at0ql13tv4No!{ZwU-s695;`T(rGst-c7e_aoZOiR<0_)D6m2-DTxq&l$Yj8Qq zwf3*M`ZpT=%cFNj)*E{v^BXmrSiR_>n!H;rRE_;BV=s;6noivdubVIP_WFg>;$c!c zQ^f;TTxjtGHj^1HP2XhsE<@%{cKzQ&?yG3Tlw}+!Dg7j|kXGToVpO+aj{se(m=+c| z#Iu8t=TwfAN=8$VSX8V=GSj1J^%9kl4;B_mad%|oJtk2zvlnEmQYq}CX!*_&>(Cmk zDY4~BOk3*x!$vl*O{r--?zFTH+5w3Q_<*(|TbB>G^vEs{RZ?%#{kuTOM_byr3xsSm zOxNxLVU${7EQYdU12{ox1?R~u?^4?=LG6maMuK3XfFGqP@+-ryV|B`L#p)#ZIS^K8 z3!w@vs}p-s2~O%CZG#!_eK_SIEL{r}=XwO4)v-vM8tG_7N~7&awYZ+NgPaSy&1k(e z0>%X+MzSKmsR}0IvRNRr4DhWoyjO}T&z1SF&%(e24e9_|qU}^Y7h@)`#~>+&@zm_JpaD}6XwxK2 z&ZMq^N%Q&)lM$+CQ|ZJtII!n_w=@jWh&Y6qt4de54Jwv`^F0GbHL&BtIWNfo(0|Ay z3O?!iOsV5hL4$qJT?MmhMl>Nm^>JGBl8TSl6c`Pvu0V(4v6P_2h@iicNX3|2VKvMT z13IMcDxB=`x#^G9gAz(_=o!TEj2EWn8G`4T@FV&~prXtdah3ElSEgWnnHVJIZhWi` zz_Nf<{mi8cV_p%-j%J@vfZ7g5LGKKRU5pn@``2gaG^`CSVLA@>_UPRFM0wL*nyGT2xs#tn+^m4;-$G;QpMwtb0#&T0Y)u8Lm$Q2FR^Rf~HES2| zY}&LoE;n=5ZrAZaCbpj=eHgRI3}0fZ4Xu&%4^TO#1qUqJ?t#b2c9sA8VVI zhO$PkssHWE>$Zcu!@ZQu9^_i~agKfWJ>KQ!>=f5_V6~lVJDT?#D|k+Eo>O_xnS$pD z&htdxbMY19rYE>)N#5xLyqh+6!4~9f!R7e6t&6vN3ic3Z4`mzH?R$1h zIoIvIN|}4zK2X#eY8y6u!R2T3zP`n?#X7Omd~H*qwwglu@h#y(OApu5lW*x=H1YP9T>BtrAC%`;)p$v@>4NF?ZJXic zs~i{%ZC#so&$4UH9?p&zx{q<)$L{W1?>g<(y{>HV6&8B{?isD^)uA}u*?1eK9Fi3!v@N5NCZ z98aZZnfOF3J()YCiLOSfHb4r*EXgR${zneh2eWZ)M^bd>J7OcqK#-3pJARE!tcNTM{*7 zVCAvt3Dk`R9ry-KUKfnyA(}Zg3HqkU$&ja|(YoaIK7(vZ`ibnyg?y@#+B+9{e4aIp zO-#THgA!cS+YJqOju$C-ygl+EK8lfNR0jV>V{=hMB9ZP=Xa=*s*K!-5J zQP`dB1xF|6=*%9^JN7IXcvmfKE&X}-z{1&0tGi$ga@OE7 zy>9IUwvcVh`j%#ZCt!h}vqJrKYZwds*}hy$bkX!)a6fNrS{lzbugt@JOuJ>_%;Mad z*;jN@4X0FF&6Ldrd}gD*jj#9Jq2HQ%b86L{yZD*)=u=x7b%SL~r?VN0l+Iujb#(JI zoM_33nc2*Z+010QK?~0!V&gppYe7_T$k_RtLWmtd#b;?=&8kPJuV^OK*TB^DIWQH? 
zYgi3Ulxwh9mSSKv1=ldtZ(dQqY}ZorTJR2$uP9ipqz0ALeGRL+rkdAYqrldy1yf)b zVAZp_O4in0th<%^EUVE{YbxsHs1_{ApP>I$Fv_Y+#JD!8p4U8I77J^}2;X4-5QZDp zF}?6${s;m=lZ2akJzOflc4g2bCR{D<>2G7Hh3rPSc3@(uYw zd5hzHR~lWwv-EVli@spWX~=3y4sT3=qQQ(o?ib(=2JS83?sZ<5 z0(VZlahcZ|u;&$SD3skhuPyy1W)qnKAPKx?32IB|0_egj>)#j6cNTuOtG zxgWvs6k*;#*fLKnQ7W`R|7MdJG{@rz1)J=EC5`z%V`U;+eivikK{OxnH6zFv+%RdB zQKgdwpQ3bFSDK-+BWo`3(Zl7VPz`vtG>eL=nKIi7CNF35X3d;wV8hvP@wXZo z3eJnuD$Dzny4A990TknwNVX>zIhgYtS{UP7J9Aw_*@;~1Vb0tPw;(xlGw%s6j1{#J zgN66B6+C-6&)$_(-g9DMY{Oc&IE?yqY|YxS;q2$z+E@BkS?H;&4tiQ!_r95m?0=xs zxGnF62T{M&P_^|1TQ_Ix&P9*hb^f*IFFo(tF7V;OqK>M8lMx_Gqw{lSf59B)%y8$@ z`EyUlqCv1X^A49Nml+>O)+8 zDC;PMhq>@@zWxYkxi!9`kpi!lmfZfa+<~!N_yX5(Vd3O`d(TEo+j8WaGb<{trI&9C z{>&d)s@kg3;$mrGvGi2$UM2aP5Bgz@zup?AY>gjKT7zZ7TvsqR!(`_?XV=Z=cC7aY zWW99&_uDjF`%r&*uxqq~`l}8thMoRVpY}%tpET()^yx6{=sMY;`RjTW zghCB^S0-j=!L1F*PcTa!A|z!50XY*BONbk$2#Btg;6a!cYGr6S%UY@;WlW;1CH-*s z6H#FV4jg<(g4Lw|jMX!L`VuuI!IV8&JqQioQ5YFOp|ZH+S5f?xj|iDrSF_b1Mgs!| zS1Fk_0><+Gl`uh4-czCTtfN_FtQ1`pYixNlbOYH0Nx zjRbWG9Fd{z3P*6-!m62v)M8a>8`OOGVXuCv>~#i!u5f@euR?x>yO{KsKgp=^C2xlq?j{SO#jC_#-KK^(wc%>saJ^5;o=5 zlL!>>b`G>93#DYW0t&1YOZ%bSb}6Mif$FKW1AP!g2dcZHQKR{N(UPcG3 zrkr{JUal+Qg;VAY$d4##DJJ)(!l+4aj*;HTqXjIxL3W`S&1$B)CEUuhaxIDCLEr|b zqH@Xx@oC$BfVGle=uogqBJ33Enmc~b;zM>K1$8!YKN};QhO@N4L-NC!wBs zGxVy$K48s+uAnt2uS6B-+0qLw?%mN{NLH>P&FnT#SOK#HhvIDtJdNgS!1w@8Q}b3( zdv>rpN0o|z!xkQ1JASNHY8R9b>$W~tz%9WdH<-7+e6I2wDp**H zR709o7I#~Pc}K&{ylvNUR-hkNcKDjFWot|0)PvuIjVO?HsV0oGwMuWGp@|B0q&Bzp?ICd3w|RTz7U*i$E-9b$4%PuP z0qe}HvURMTb+~~lCr}+AKN#C#DXu)Fod%HdeLmqd3V%l~C@bEXqf%Y+hNa=fYQS6J zY#)(d8WLV0FRNbz0T zQ{OR3GfbQB)JT37*Hm|8`YuTdZV8j}T+}gt4tQl{QYlFvh5S(^EyaM>tC)VW0`e*P z_+wN?LtDWq60{op-I1LlxA%INo{8t4+&Vy2kWFAu9DtHjVhiPAX-`DR7-T+DnSKGo zLn!~BVu$R~M`hPEunnbM^L4Uo8h{Tt4am>WmXAt|vre{(F|$=u=RR5nd|UQOoOiJ< zW;%00f&y$6_K37TVDm+E_HFZ(Rmts?wrI_+W?hl5f zo?U97{=ef_xu0w!a%kE<1fdGfr*fy=NA0>YifnikMb0-7iB9Gf^EKZLqk86};ioln-P24C|>}PEO$#wnf1Ra1ZMNOx%1c?3q?_=6c~Q@{-fA 
zo%L2a4a;2U+gLA1sBPqYl4VA=m2Jbb4)hJg>;ud`g4xgdr|`0I$B%YOSvq?dg#_3@i3i*ROcWacOhGnSNuJ%-Y7jp6P=&CSXSAeW_RuaUzp|tIkR+pR z#4v+S9OO53U5Y7BHYkl0MhWeadM+|c?Wrj3{^!F^bV9Gv6DcO^gtFfENzN#?{ZSkN zeDN#qT{VNUWQ8?FT?weX2K0)LPX`@DbLxNaro`KUH3hf4Vg!JGug80}a8s z3sxRw75>pw^Kc4ayP2*}w$hJY<6C&8^OQtgC2!nuDLwT$DW*JW2d%$TFG>0;>a*0M z_T`N1G22wSR7ZI?tRi<1X0`n@k zOD&1!P$>UEd@SX@q^|Vh4~ldDLG`K%ZsP)4>Xz!iXr5Prx6jjnU<{re@MKUfgl%I6HTtu_q2G{qUVbmsshU%Nah}0j zE^_AtugQEB;(|I+x`V>GgW3KrW~#mju~#rdO`vw%QqAodC9m--UZXgCuk04(rDusB zvN9NOrK*NYEO1hVkq2H(N}#e8(pf;Wor?PZZp z$V*H}IpP)HTNoR_0L2u>i@_5ZoX6k=41N!TuVXL)EaRL@<$cw>Onb zZT{$etHM*}q92b3eGVCp-b8%&-gi+pH@M)~T9*e3{$b8Pe0PTP|IYok0J)~@&E5f-V*?~gmajx@t-Zk>RPVGCPT08@{i!R^tlVAPP{npNvW9zMh zIoBXu={j6+W9e=wH1@4E_I;nd-_)_9UT^BpIr@3K^L_9E^6+rWZ0_L99qZ;WAZhEy zSE)S^!`tVkHB-~_NFgw=78t11I(R2pT}uOnrarEzZ_V1bi5JivYqpMTFK6p|*Ve)N zLyOvVo0sL-22`{Cs;aprxz)n4)>oa3y$oBoc4k#)1T41B}hnl-N5`*Y_0 zQf_F?8iJ^F6}{5R***mmQ?Pe)c5sm9?1whH_F#v@o7-~vU=QyaU5XUk5zbAP;{?}v zBJUn8f}di;;uy@Vt9=7jVr;$jc+PbkT+Cq%^=$=52j}R>JHq0%^$Iu#pXK_`t~<_w z2Xw&|;#{FTEbo;y*B7k8A16oBQ9MTn!Wsj&TRa z^7}5VnJ;YC)&Z>;TtJw2#mZkK%8P>%5wf@CS!P(0>d-L`_a7$P?UNliwJ9G|I76J1+`Av;!bk58=bab9XVv6f>RU5;cvoQg z^UD+9(fZP9AWGiWvfTW(20uM=jvF|a-*cX8I}hv7XjwQ7W5Jnd&0AXubp$wTAZu8& z_VS+g*FJyy^C%A-<+{*A|HqdI;e|Oz7zl4vRR?|%Ep>}VT!-Lm!?%aQFLf^8ba1hH z)84eie06xU_u%TmyVrBQ7mzZed}mLgbCl~G&3B%{j1b?^iO+!agb#@XLl00e*$jlh zPhl-^2pl3p4UiRVSUeB>qd83Wdk^R8DY*7=u6^*WmO|(l7dpmw_kt6Hr)5i{Yjkhf zsOHdp0B(;I+7EN>hk1WT!9U3P2RFh8`EYk3e4Gm(zsG?0MZ^c)Z{M?3Q{`zXY77DQ zmd4cFP^3)Y3gJTLP}=5sQxhTZz?wZEc{|*l;=)hx0kVJFHk|(Zt{{APq2N2f`3`IZ z_uaLx2TyFFXGKw?^EPa0s?gP=3W&%-wlt!WYC|))%l&z~KdUW-j&PwPcMq(E&cPlf zTmfjirQ`9}BhdZE@aw}XfkNMDuJ81{gL(g_bHHNk{$&Q<72y3{+40=|Q#t?X#dAfi zt=>&03Oy(J=FmHX*^almzuSFJ3#;7WUNXT}ZSJ_=Ie=fhImhii_tVbvuU9RrmIq)~ z-`M~9{_JSp-@o$3!r)nM@N8l532yL-y#I;KP=6sb#D#_mp;0ab+nx%BMcrjqYhZ?2L$A%;p24GHl#7n$ zTTX({4$kYVhyUm>?w`F}aPO)$A3O=UUf-gbZ*VSZ_y#xc_P-Xr9hH|=lXs6S0!MN$ zUHR%Q-sRyPUXd+jHS3Np-r@XK|5B#VI>5CK3du3$3^#C6|ub`RvnS|4ccG? 
z+=$wu@b*nG@{d3Qv;|0P6u4Y9cda#dfqPMN@4L-i_x(Nh1HF8-Pm~=tdXMq@j>Des zi-6~QupQne>GEyCo^L5q2H5=|qd2}Y|K)jdymfN+&g|Tp{lI3+PO4fw$@{vpPi3#= zeEZ<-89apwP5o<4{i_iuW{^v1~$@m0N;)}%e9>(vWwKn7E#xXw~Z)C9pe0l z@~)xPCs!HHbqrjh;e4uZ1BdBA=#7J~AIwhW{RdXtR*&WUM;6cVfjw{ASDw!I9?SKP z7|rV1B)IK?lbl_@~KlF7eG{@Esm-q-*T>6-Br|l>Ii1R{Nhe9StT9QM@V) z;V%z)r&=}tr$q%}*va6vzhIo0fR`|4lkkBz!8|eXh1pnA{M;LZ)tM#?@V=LPV@-Tz zh-jCn$qHKh0tTbUVlxIrsr+M%=`b*2@MjpHm4UqTDC)xh7m`9v=^JyCanLI!K!e{0 zx#XiopCZZuQ;U@_Xu|?MNcX>fk>VRdFI?aq9WR_OYWsA?<*pAYcoeUz)ViLm<^ctd z52`J?wxaivT4&_j0!0mpbob_9NxIM8qq**jV&0~qc5&pT)(1KSC{AhXbS*2IA_b3C z?dqj>3=c3`44qKvESn9DMIA<9r_^;6t4Oq(GPXZ3km#8DtX}5;Mq1Qi1fU>VMWP^M zx{3x8HBz47@}=A7izbpZQ_X?p>UxqZw2+n%D8ByHGGEs54q#)c&=XK4PF z9n<9988R{9Cg$JaBoPJtD8x|9OeW*x^UUdJ?8+pWc|i}ae1Znapyf%>kq`bcT^RS` zXZND$SzP+;UR3<-UKBotl9>Tq;zE)YCI$pz${_XxvbdmZiAygW*bNiWz=>kaV2^-a zz)xB9!WDBFywn426ni3mgpjddP8-Pq@KhSV84cfPnV8KalIf`YMNW}0k&HzH1=7l! zlsG+y5JmNnNZZ5AZy*T-M5WPcQPikZst0CDW%@aVe}6{>{*G$+1y%PetwE*!6$QZq m3_f(IA<$DQC_ea}!Lj&M-q7-beoKGSq;f17Kcw(Un*0B=L9h@2 literal 0 HcmV?d00001 diff --git a/entrypoints/openai/tool_parsers/__pycache__/step3_tool_parser.cpython-312.pyc b/entrypoints/openai/tool_parsers/__pycache__/step3_tool_parser.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66b16c684516e20a503a1bcc25b2a1d28bdf6d38 GIT binary patch literal 11565 zcmbt4TWlNGl{0+5hY~4?5=n`Ai+W3b$+jfNk^Gb;JC^b&8<(XyBbyQ*>I~`F+7aT9 z086JT!UPR0H(gjk3q)BP8v)x7?*fZ8_5}m`qY+hbJgY9WZGc7hXPMeTitfkmxpz2} z%+OBSy|ymzfu}P0$2QMu~v@ zO9n`&tD>ryI-n*|n>wnAX#-kZCZoE59@;cfL(CX3#!LZI%p5SsECEZ*8nDJ}0b7g; zP#9JlwZ|#~6}YU6R>m9w2QKTQ&RA8T3YQJh>R3&nh9p!3<6vC(NO@&}S`~4MppEYk zw27&>r!KVC-5~wu-=fNtpOg&K%nwpbJj_5@eLQ}{ua~OEu7;9hiP%DvNk$U!iwJ;A zN)_WwG#NUXiic6_SST8mEVyZs;keKhMzZ4COd=7*ot_idlWZuQWN4-7Onf%M#zJVg zWO)XWK>>kPm{9pbh~*d-YUW5hk_<)@SFYfy1(rCIm}laVMY)>|#py&Wm`X+Hu)IMz zuP9WWR0B2s9QeZ!h$jIRO$OAoDxjg&0WGZw=x8nQLKiSF8b-_L82uGZz)0&E)2u2@ z%#yU>9ZSGW8yO4G&@!vRZDwe*GB(=6P|#|J+`6bI2&RIe7;08k3scK2a(Pxm+uqUK 
zD=a8bNdpvZ$Iy-fw2U8(FSN`W=!$nV0ViDvunzch!e13r#Z+VL)x}axu_WWDDa>rA zT`=B7*TOitr*>9H*Fjrdnz&8|+|z{LEm6}+W?|nh!51H!e2`FsZvnHw2Q2f2e6a-0 zM14sR3}1?4IB4M(m~dn^6860@d9EiKnP(6rFc%LH)h9{L% z4JV%DiC}R&4G=*z8Yf)SD771eUp>Vl(35_xq~%fz3@aIeL42lyl3^jiMeu2?#6-Cq zO@!xzzz-GoC#HVY%Ts|@W|OB#-e&a#z&Q!)mF;b@5Cf!6AwlD@PxA1 zaeW+;%;27;-h~@1qW7Y!L}+`J4Ru2=*ACh5ZxY)E>n*)taIMsC8Ja~`ZCbNz-Dzr1 zYqksxVsl4Y^Gk#834juko-HUi9m^zBY`i>c<^c4YShGa(qae&kS*4i67<=*_MuTILu9S2dB0t#>{zSG zRt}`~qPgw$5S8Lk01`N1S}JvkLF zTu;zJ5NOv}^dCV11i#ofR}^prCMP8L_X_QZ)*N{QC^x5E#0nVv(rV?nEU6;Il6nu` za(vM2!K>jl+>YP{e+F@v0uG0{_idWC7fP31h#fD)d zz{yzjNF;s@IN^(jV$4WW$9KQibg9eVH0oDPP5l<_c5$Q#38D``Nro*2hEc%XE=~TfR?n^LQyaR;F5z=juRFisVWH290NK7?-dja+UsI#sX&zl zI}F&kIAk}8rv}39z3ac@-)K4c$egX4NmGxhdePw(oy|F?UvT=@8ne#sv~j1gIj!4v z)!jASF^QIQqNV1ER$XgKTb~#RkN?(0dOXuAx_x(#-Z}cwNcv=E|GVe%YUtUaTq|uG zlyAe}!y?J{!n`b)bVRs#@StLVKP-+dkl!YFH9GdC={m`&Na6}HPGrbSArKM}nR4P< zP8IOvdNT4b@GNe#Wqu>gI{HRaD?>uIL*Ka z4IVL&bSQvxNEMc(B)YnBQf50qkVFzGJ)C%h(yrhJ%NzbFdnAeFcVRB=|6?wsNx$t} z4`D7Ho~r>UfwXc-QYf)x=FPPJ8o_Ge*MLUEN#3CBVfk~wV??dMqyKs_CK--Z(a&JW zkjpv^P>KhIg;dux(M${gwzojGzXR; z+fuSj~h-5D_4C0#rfy9~ej%W!Kkp}1cKkZcMR%WNnLhW=3?0;rl)H0r0= zCR8yX&4exub^ujPMUBTvYOpILZ6wLWILQPsm&D9PFwF8^Aht@A< zbfUNUZs1N}&9Z(u>ph$?iA^1!bbs9aN$>k-Gh-`N+wRs?=Kl1Sdq8ygM0e}m(L1AS z&1-DdJ(w96>+0`z-04_7n62x~jEOaMD_1{wTWst4Wd7s%pT$;mE8I`aVy$P@xLNDp z_O{Cwz%S~4=^ftg>{(;(b*`LT-FIi=D>YF+vf&-x0Z_wF8cHGi*0uM#zYd9ziY@+6 zCO)3{+2o2buOoc^jqVYlVFc8+j|2Ru&|zT7{Svh^b3E)Q9D zr!TzsiaS>G2E?;JnfeBr;Eljn)3>0UNtv~jN_%)~+RNKLt18~Ad)wxUH}x(}mHVtX z7sxT(ZH5=YBB?CTcivpO*5}Gq8CyYF13UCQ{Cjpol(RMz45h0m@yXS2Qi{?J_vkjr zNzqQ(r42|CIDpqlwgc{AVh(AYl2m>iGH&LIR>$rTTBEFQ$x?o#EVrDRw`54x{#@E2 zudO60nAykQigqu+P)fTOt&}y;+P$=lSK-;q(Nb5yTckU!hmpn8{S0TzP+Hkv`PIyW z>&%WqUzw3}jU}XlDp<$Jxq}3@=&o)qJ6?f&!OZ0HDB4ZA*~^bY5<+W`C7n?(y8)&Q zKgw$yqYXb*Emgq0^(BL$ z%3Z}eqbQ$$G%e9XS4rY4ly!sD74PSyUjc(T%AFh|ZAx}2Fp9qZi1-UtvPWs zeY-+~`xOeVXZd!6fOq1`sTqj2=&|DzPyY6QA%PE8c=p9l$Sh*lNxup^Wb84hu%nPI 
z+QJYO2t%}}2f0c;ei8%tHQ=JfY34e65P&3AgvQV)z9T&SoD%$y6-A8`RiCDy^=c0#91V1ED~`Rz0XS~{A$g;6-i!|>@(-jp9_v1KX-0A z`0|NUXW%InJ%`Dzp)QJF#3b7upc7N$618iWLwF!UOW1gq-=9&i030`#eF;rP6-O4Z zS3ffpzpioM4@kPu0=#R}l67|g%*3LXR7sawKmiOEnaV8E0W5m>VK1OQ%^U{-GA*5o zEpUFP?3KQPdM~1E8f7ymdlhA`p$t7ENcL15+|e+2x(qG*A4W`+AiVOkZy*R=Vs@4T z-`R>epfDFv9|i#L5G0CGKvm1^M)&Ym&B%s* z1VEgwoTGEo(J4BbbB(1{!zg{nr%HMIFEfe zDfWW@yyff?ot_;}cg`~)cm~#6a)YOY!Bbh!>5Nfycve*(9NTeua;_f1)su4#39g}y z!)J1b&kKjoXAi%!<+}LT)4kKsyxr2iX34f3$c+DZa@*a$(f`tx`}l_Q_)dNA+Tyx# zqy9+d)b2uRe)gu|9QY7HuIsnlLmSSar#hm#cB9sxt?Il#y)ke+H*i`QIGr6ho9&qp zswUDW#fH9I!$G0pV76fh=ptI|?@hitx$>G|Y5vjV4%LvOIt8k8ZGWz7ROlMrqK=7n z=WXi;)}K38XL7AWLhDer@vvY&{GnCs>dR=h>>XmeAN{sGw!46Y-o~7#U-0y=zbtqT z{*F+2>mTgT9laf^!sDP~*H)78CB z3a$ZY5C;z-PW5fqc~|IM{Q;r=K(_ASgO=QpDdEV}X5EyCSkx!D`f{#=g6rTqyX89a zR86?+%YbL=rgkp8niYKk9d!We;ofU@8X7^7BtrLQC0)*ni%=5Uj)7YA8JRme45M8bCKj-obF8^Bh z169^_RCF~w(QBHjGiSdr5w51~&fd>DKkZx(JYu#wXI7}~hOWGF)^NK%b$Q6Mp!) z3^{}%%0A)nfs3@>pI( zA6koJ%j9wj)=;rKw^HJZhC9Y*TcUUygc6o{>L$7T9*imTl*8M=$Sum&5|u>Zc$~_) z6+A_g_jQUyfqP__LqsV_$>K9cvKS13dF=O#PT~FHQB7EfUQAn_OZx_VG}5uN0}LANR0gz zvI+QSr~zAg26$JNEj$ebHj)L48h^bZ8Nr5=pQ7c3qb)!@1X-{ULP7gyM8Xs{WFUbr9~3By!{3K(h}k}AcdEB?y1UFFxUwZ*1yw&?x` zFmjLNkcWw|*mGu|VD_z91oQrFN1y0zT4Ns!gJyNw@`TE2%NvNgwp?wOP>V0y<3jiG zhXdK#6Sr(>L#9PEQ#o_JV6I=K1#{A))oq1I^!C|Hk_8 z2iev$f~6^aCi7<^<$CY!TW@D6-|E>-2=oB3zbEG(6Z~TjC$jzvf~74zxpMT^lxNi~ zG!BXNO}Y9(p?+|Ey)I{O6zq-LPS18@&-%!t zvs;a?18J+Ozf^1BMr|gl8ouZzEY+BBjSw7q+}ydMf2!9Y;%k8TI%43!ZuO~&uy>2K z^=pIcJfN|gfI~(TzU2HK^vx0Wi-M9V7=$Y?6=i}!$r23yAQg%hdi22{od^em@(IGH zrg&i3w~-{;rZ1crJ9GS8aOV8PiK%H8sY(1v5Q{zm!QZk-7VvqJkuZElaWz4+V`!K* znSxIlB@Gf_`Mja$3;EEYdy+kcGVDvDTL&Lr_J>fCtc4Gp!VDaiARKad1%wbOWG~5| z&!8i~qO2ZuT!Nn8ze9YX(r673vU3e?Xq;kI&&?^Zy6@(B(b;u#QluJho))Xy)@%=` zN8XL9*KbbcHCnBIwfjo~e)2Yrwq~R4C{P%FUwZXR^b4qTm3bAVZ4|3qc{Tj*7;R}T zGjnTkW&f&rb$ZnRVY_j`-M>Ng3&w%pX#wPkS{amYYSdQmII8nn_yw+2RAr`Dnlo>_ zZ_DdZi-D+Y&l^$6M3`*paAsfn>aF@GX4G<$oKR_9J2iEAEh@lhDC%+1K)_;*xM(80 
zt*g^_Ci7-owGf_`)v-Gxc`L5kh`OfL<~!azg{yWnT?H;y5~k`EjbQXVao~DDvtMi4 zsjkgy;TPrtKs_oM2vgfvMkvXjZXkO5-RwA`JyM$ByC_gw5_3ruptXKa2tQXwpWCC!>;)ES?EEU*EM=BlbW2~8_!_Pw^ zJJR^uNwgSHltmvGbSK2RS!6j#W;|a0u9Za}VMvX*1;FDR`jQepmJFtn5%_|#_|YI1 zc*%s}aF_gYu#86Q3aAiqJ|$#Ap{rFg49n+jl)Vd8B)i-sWO+47lJLu>6w>R!>1rp(-z1p39( F{U5&`K>7dx literal 0 HcmV?d00001 diff --git a/entrypoints/openai/tool_parsers/__pycache__/utils.cpython-312.pyc b/entrypoints/openai/tool_parsers/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a56b69ab7a8af8ad4feb8db15fc8b20d864e3ee0 GIT binary patch literal 9136 zcmeHMTWlLwdY<77@03VM)Wz3~W!W-gU1G9&bKK2lE&dh(#`OnPxZ{PX(4|R1O0VO8?BpTc$2*0L>N^zBM);<&k;i{kt zsu&Z7sI|p}xHu%nZ9_JZo~4)+mxpA2FUIUccF^RQBkmk>#$7`$PPfP0anF#4-#cR7 zxNpeE@13!_c>Pd)+&|>Uvr7x8Zd@KMFzQgfKbD6YHqm^bHL7(OQ(qnD2dzmB;JJa* zf?D%7ygWEGqNhb|0yU`lz_nGAwdPSJE(GwPE~DaNY^(ZgrOytni^X059s>jlO&C z(svK~6ji}$E?Pya=kGv#oYwnrKKu=6&$OMq-+Cj5x(BoOOpBrRFG!UAA)D!ca^QS_ zpB71~+T)BRndx08u#jj0p{M$r2G8y>huY z&`P-0eu@g3D9+jp=%iH)>ft3VyjP~y(8LPuy)wsH$*@;usJ66Jq4x0L^66vH&8(YRzn>*4IqGgV_sa#RVk zvB|iWFm&qfKR>8MhQA5>0>*50k`{QI?D=q!66nn3m93*r2zaR#IVRM8mP+2|bw@#)Gc0F2zVH z<60~Q9*UOGC+Rb+55;FFEIKxBV8}&uaenx4#S$)9VhX-n3&%01OVN|oyhtpmYuztT zCJhbyVLgrDO)qNr>~KQmeAdYth+vvX8taU)-}stIrp2bq`=a%X^(do|%CIGpVM`{2$8pva(xlBn{A30N z7g})X7f{U!rR_p}VD9v4L-3~qi;ZtQe=U-0e`{=|VgKBdt96apL)nR6)$PfMMQ>du zJbyI%SoYuKb9&Of6>4P`vaEk+5)QsdzS90!X= zln9&;%wA)b!??pREH=N7VJMxj5dOw2fQ|ct{BKBR4QN;+RZXZ&XH*Zz@Ddws!di_r z(=e`-RdZmiG;4zbmj~}gM>j5AM+F5A{C5hRJG7AaRoy;IiC@l)%pc1>v)Hg`TX^mh z?~dGKOJa^)KmBh1_5Pb3M~cm@R@dU_!th4b-d|R2Mha!hkx9l(DH&6@)^Y{ga8+0j zQP3o@qMx(2w5Zb0_NNjto2Z5_Jd{h?Ry#tKRQbK~s9O50xE{zhtRSTk#*C_Nr)}?% zSWwu$v|P*Kf3bEgf2qp9DJ^fwQ>y(2gjjh%2-(9nd=%H6C!&`&$HH=i;@Yi7M@JDB z6A?s4OW)&BRgGy|MN!U=2V05^7Zb@T%s`k5U(yf*H^yWHlwJfwOSS$#h~6cLDuZw^ z$vaWV*#W-SRTQFhb73X4Ddm#JbixxrE##fb*daFE&4Gi;A%+s4)D1|Gj~I$mu&wI#<6{>Jr6Ic+HDbri5V+qT7 
zZoBLIvVEBJ2rfN`YEJmNL#S7BaxSt`ckkTk+n&04HEYYL^PyXw9R<&h#nac1H<-^Otg6G)WsnxoBsuPP||6Jdy(~~*&<5!FRriH%jNVac&=9Yg? z!M`WB|L4J_!%M;IUCXxRM_2sE-ZySEE*k~^!x{N=YtC(d)0Ommdhs+lTaQYH1{RvbEsUE@KF!w~!S3mc7(NkXxv|RNr zco)y+CYN5mKD`n+l(84Rf$U?ipD6koZhIT&`b(1F4qU09ug^~2bnPm+1+U+dnmd|E zO&Y`aPSZXb3yOgefs( z>!0yFPG>7c4IAN*(`?$}++^Bg$*`(h_=t7@X9sbb`S~0|?E#4a%?hTd znG(pGgxR=(yT$c5hQEwUe*qQ5X>-`?R|9)*1ww^DXeqS(>`LIohXWsN`!Ky?S9PsF4C^ZRTQR0$d%^v z0=Y_}a6y>1Rd-$LA3Ti{m`rC=b{L}=?P^Hm`=`@eRFpre(;1Kw6dFItvWL+5DlQ#| ze7;+!;q@Rf$5{Iv*co+Tt zbBw+U{jI@+)uhBlXgYfc4vC*swsJWY-DI63RwwCuHMa#7@x8kS$icM_nX|bADVK=EK-Ybo~j9`97*Sq3H2mj%4@zWc2lWfuj$- z8GL8wn>(-VF1p;Ar{^6R5st#^yK}_byKj4ab7#uXn%v(8&8a4Tev3smqH3e+Co1Tg zwZY|Abp;d31=Ol%sL&{BH&81T!KiwQD!(qwg~$eQSREp}*=_Ha@s_d1t=uSEf)u&=adA&GA!AUFCVWj$+LSXDXsc1v`qn-ebww>|)p2r4EuYfsxJ2(fJ{6-S4) zfKWMC$E-8$NQ(&8t#E4&p3HGxM1*7h_2VsaxQ@5*AH>_$W9VyQ+DYRd#tN4^W#RHL zva}*ZEcbtF?@+Z-eWqcMP!?{Zi6~EfcFy#6;*_Z4co8$Ah!Bb;1soct#-ovOIyz9E zv1)-mh|eThQ%WUMz*>eD*GC+`WvVa>m9 ze8IW%cbcyoFUqyczzrn=RWeHG*A+2h8$;6YMYu&IHmcA8iIejEg6*3XLPJ|I*nB&<^RrWDR+`U#eDubVn=SV(2S05&_R-No^I2GUqvuPfv&k{{gRcWZ zTldnlE3NnE<=|(|*5WM_0C-$eLdI~%}x z)eER_rUYJ8SfP&8)torJF=V3ODi!k0fmFju zx-&TIHMQ`rO(?)>6E zt9ehssIN?~e(@!IF{MX0m95@<(=LSW*aga}@>1pAiaq#_-1)v7{2%ShRx50|SI`xi zpQOD8rFMB)x8ea|x?+veRLxz;PJd6z3H%N|8zN;qjU(p)PZ6A)s9RGp%Vuo>&4iWGqx8WJiabk7~No7)08W6P#p z{5c*@ZBDIhr$Bud^Bo7fzr&?J0k6dm)2^m$qTuXUvM<#E1AXpm$-ep*r*8TV{9>Z$ zYP#9n`L$&0b-;5sE4P9jgp-SDIdU-@YV&xc#HPf9(47!RNDmSI;h-T{ITX zFWWP&eA5d>Z*cKU!P~LgxO4GRE}Hwp{C!X7cRjPxIFR=ZkQa9s{Oz~=-35Pl-qn4_ zs>f?rJx~Z9xE1U!1iLftFCD_R-D`F^==ry{U0He6ziqMOrt-i||AQsFP}gwfsrjdF z`F0n4yK{2E*HLWS1+u%LBwzw%$eX{*OC2ou58m=0EBKG)UB_-a8?zIiIF(iZy-P<| z{D<91pV+iGP!iBASY-w-NzLM4eFZ`(bY`?GR(U~zBug@yWDun^dn_wFm$_pjNhztrLu9l61LS6^XY zUrE5jFN15;l-dGf5DCt#oNw(d1bPeJ`$`f>x9#pVJKECzda-qLKS;bEZK-{a7$`Ob z^R1ymLnz;S9XEwFAU)A3+t0X|bLnOAj3ZB-Nn_6K7t zXpsEaJ@=CDk#c3n*)0~B2k?HLd+xdSo_o%{=N|rp(WoQfi!1*1%-J^x;@2o6e#%nf z$$y2!y97H@rcFr4gk46A5wz~x1g(!*K9ZM89XH9K;ddx=E=UTRi?KJC 
zV)Ik6pj?oT%-<9==NA(*3-i&qpnPQ>5&%;@7rhaSBo-Fpk*R1r9+{zoYN77<^=RVw z!rWp!hTtzCG;Bhs9*xBl(UVK_Q>gSffC~m(bT-Da(Q7fmh;x?!8LqUSSWYm}sYHwx zi(Z~U&4yUhzXJ=CS)`jlTXWT6VoIu`?g_1LCa%GS`kyl z)G=ivz>4v-4CSU3wDQ{uSYHICifI8-H?5-8P^PCfP^N{S4u1NWA!bA^XoHk8N~vPY zs+ftcD&aNLCct67O)QfW)fb7NS*ThbA30kdyP!>oSnu zVh!;f8Atp%FfvDqj6xFE2#$zI?P(HnC3XQ)0X~B8;SWQK+o0G^wWm&9&e8JkBTO2x zgjPUKE9J;*(4JQQr5sA}mP0i+iJFfqu{ktlPG-Ae^q1gY7J8~_=?ismOWwLr%rSK|v) zHzJGCL?Xt_vw{T=ZD#&jBoVzP77M0gH>j9RL=$N1E7(|knyG^Y5VVoVR6NSEkqC=; zfTG^{VzM(ax7fKjv)G;h!eWW`<)woI{WlKAI^#1}I~Q*zt}o1Yz7>zpb;2l^n~MuG z^9iY=}u{R$Gp|}YkI=fNI1u3r|ryi>mK^;o@3p&C- zfKu*)25R%xrhJqAQ?;TCs(-4nKY=a@!7vAt8jmb2B^H+wf{KYHmYDfH8x65L-e?-w zBCyR$vX>xcrR{_;6+e!cl{S)ChBsmfHYwv|%rr;B29(W88>tNWQ7M@yZz7qviC3Vm ziI~-wp~aNgDk%r7MzORU({d5Vb6YF4M8YJtnl+25a?B}2JS4%%ImNW>B8Zh)OBq5; zjS^RAf=DVWW~*MTE61D?M3gu)Do(|cckG-B+Kxh7^;c4(E>ly{kF|_nOmQIDp{J6- zd+r>_VO;8I*%at`^t(tXh&$5?5T(y+HER=lD#x@^q>gF&%8^tEYTyhK&nxf8f6((M26%_lf!r<3kg}rk(C3_v_?*-_Wlf>)9o&L9pW%=vG z+a%2RqykD-$vd55*$ZG%AS}?v^)=d{)0K)N-y)bQT8(2Jt--OXxDLfJybkMMTn!O- zP`mQ)bX4hknTahj3sZ3Xp%T|)6qYDdQKC?Z1u9lN|4>s4^9hii6r5D3#RZg{iN*n7 zAx@z~7uz8rynv+;J17;PjxH_1L5y0MrUE5IZDlMKx+sEGsnqh^ibhm_zeAEsAO!NV zK;9A*NGS^W@Z)zk)FWA!v>RdhO(vrAV&qasx7+cSylK@HOa08A@_ zs*>~<90;eb3+mF~1hZ5jmKd-SENMQWn8S@?tf0YlKq8CIBBgbqm>}w+0TgO!eIk|8 zex>;-jr0t-!+?a9ftY%S*sUR&?C&++ZA^!A-XY#Qv=PrXp1EE1iM}yk@5y-2roWl< z5AgnhtpCstryhj<+XC-Do2fsStG~q8U&_|MdV4JIYR|cPc~|fHNY-^A=NjT&Ls{2w z#$Yd~jK<)W+48RLySj|6b<^CIZ*XR)u1rH$#@tn~6ApLIKCo#YxIMmQbG$crcQ74a zAJ5uGZjTEl+s~=C)l&tTs?NP-wZEskt4sH9THAI!z7Lx3HLuy%nT?^W=Ty#fj`y6) zdR|Vc^RDJKO~%!=Y3=${Pq^C)YQiyCXd(>OoZi9f9a+6Arw{S^&|2@NzOx|H89PeE zrG_@mfqa84HJND$X3W7tJz=(|E@e#ajK=->?l(xnV$IdI@U<%Go-3TPHx)xdkf) zkag}acg+$ z;gkvKJTIyawk3gR#NVuRj1WuUmLVNNNaAUFmPr64hdRPEx0EJoqbu zz@t+qH5D{z5|VlpX<{^>_%&~lEL`7E%rtxjjVewpwk3%>eF=14#viB$_4f!`lV}i2 z#6#Ejh`&Bu9itS-c?0YucRA^9Ynpv2m zqM&K9(PFPEE~{6#cM1licUf%hh|z*hx-PQJ2++LZx+JM(ijF3t)C^0_FCg7b!F?=x zlj#8{rWb+wUYdU^8lR!dbWla}1{FqdEPj 
zKXV-Xu69+vdTGb%Nr&%SHbDzDIMSE+hG4$Iv*zO)I(95g?~Z?WJmVgFSi5OCo443^ z?7n=XGsUK2>F)P0|Jy6+^O=TlzR|no2<9E`bo1RwAd3pE`POdnp+i5swCOswD~E%_ zd#CT7PAAsR@GbptfY^5X(>Hi`&#qqWtlw4Z9W|@kf|aPY<;-s0>`n)NYHlmE63+hJ z4#Lv4<)G3{-;1oh$ve98Hn>N-U_h2C7?82*^IZdBZhTBAv{hS%nw-JT8|*L(AIl$% z{EceUFt%0OnRj^B;P3%6X{q{*kQuAKEGPl|3pNDH@n>F8rT>0J58;n`j;kR0ctCps zw5Y7@L|Fc}9VAMJl@JO>WEYuPSYjgySX#j#Sy)g`2bcInv{-`;C`CYIIRj%RD^yF+ zQjw|DL@YQrC7H2+Fr?nFy4{3L@h$=P>XHbCa_JLQKvDzBSW>Z4HM9~1A68e|qz>Kt z-vV17YlfQ#*aC1G*^1%jj?-|90hI;mq@L5vN~*bxvwW`vEJs0U?hhrYZGdWjCX}R- z0Sc_ka2G040oG_vN2`}eD6fLFewkztm6~EbgQ(PEDh-^0H6$cU1}F_l*k6Q0N`(>x z8?`9@Sv_sQra>b%4N9YgLyeA+ak_b!rKB+-Su$p&8*q6ZIHNeX$*PKDLe)M>R?Ssa z8X))3R3*~1N;3WmB~HxlqmIQMX%kYB(8BP-EzCCpAQpFlPxiq}o3O>WdWm3uP1ht% z2}z01I?E##Q=AF-3aJP9l4j1#{5@x$brO?lPIs+u;@wOmaC z-OupHRo}OWSZ2{}>Zy;eyI(KX-*5OxQLMg3-nWXS$y&PZ8Zr_~t4W(6S3X!lZdN)> zmeJFVLy*}szxzQop_vo=f^|W&?0p7C3_IIi?n6w`b#ygtqwRkw?FW*WucPapho`h= z4n!q_j7SGVt`@9fWj?lWI`$H$XD&Q{RcU9%s@8FJ%pg}c``lH1u0*Qm>Ypz?7-Z?F z2g`>Fb~1`(Tz%4lxvy57mkNrz#IkY>d}tB*P+WmzL&b_UNTpWJT3Q=F?MgN#UbKhC zeR^o(ni%q?`lh;~r~Nr-4{vbR8?a}X;peUCzPwTWhrIEB)iZf#_$%8{mE2sSc<>n% zW7vHuuC$9Hg!p?N7IB?QJ(72x)n_9@m$Y#p@0Fy3c5^nkzIxkawu(Ylk9F(mt#m{4apF8S~fIo7w+=>$fkTaJFUHGJYMW{VSl|r{q(z zXYZ2onS%D;ZxMOqE6)@GQN!##Q|y&HOzSgdxKX0eTuI@xpwaa2kuSC48xS2#dR|y- zdCL4Mjx*`yY!&;)E0*J3AY=YJ2lc4;+2?Bfq_1M!Ab*p7Bz@6Yl(_?EZRv!y?|E4| z$9?5_d3q16Rh6_dufbUwQ<`k9ICVGABD2F&9~}}~CJo@ClXg!(%zO*J$@NyqJ9K(~)?WF4l=C;$VBRi2UhLzk z04DF^`?BT^B^sl@s!igUIJyK#=;AjmmoUlhoU6ceN`!#=j$|-#TFlR$6;tJyZs&q< zYL%=@a3+&Zrj-^LY)V%rgWn2*DILcZru2@Lew}DoJyCk1MwZv&V>Gg1Q<4`Ee$y@+ z&{x#poP}OoSCo*@U0yL%xM^Kk(O39nG3esBq7~nuFP~dcdC<$VXGQOk-kv={5?hR! zW&mc;MVdig9E=AAuvVhx ze5K|W=3_yHpu|>q<~%}l4h3jR0UpgUgJeI0{5P0RR4Ny=*!K&+j@RL&L}mmmkkd2E zLe<`4<_0ue@xV;%@5lzFP=}n=>h_^O6l9P^h&h9TafB(n7YYy31LJK&W0r_PC1@5V zZ=oQ`#8Fe(tt2xC2^bu@=D-WXEJS#`E8|}aZD8^i(0%!vw~|*lNKD?sN$4mTL;*(6 zbOUmMOp4DGGxHT3Q)Z<>D%isGBRcR?#SxT(3nO?4T#vHg$Ho+weGrwSV+8Xu3ci5? 
zWVK?D?!zw-MSegx7N!pc11La@&*)KrUMB@z@r55fUohx#i}@22P^e0S91>>0ZLavv zflgY?0aT#`=TY!mm4n|K^9m|`6$KNhN^>T56TiU-x|fiTU@@(MHxca0z`TYaFQDLc z2!f&FJIGa3Fogmd1u+y%qu?3}knqEffHUy8LKkh;ma^{w6p0=W6`l(SP<$3)1uM*4 zr2M*qogc7cpinad_Hhxg=q$MB#ieLF$F6xrjI3b5?unvbg%0P6&KO4VsVaiEgBl}- zXDWC^0&BaVFZzlUN1($*6x&tpC9Twh=&MnBY+G0Ye@F{Zg4|$m2W9s~u~clL7tQzR zl}IxW7JsnnFPC15Fl{xlxy8gy>{ThDF;|oo39GFrDGg>>AZG*vY%7@1qElrlC-#k8 zWW+ZIm|Sq2EIN%=ygoN6plQy^yRJgBGfv`TYJ`Hy!8-FpYJgE#kjp6 zG~R1m8``jE-6vO1f70O22YPaWem>B@QJoDON*RGBZ#e7eOR2Z0wjHWHA8O49d%;a+ zt9>}%J_yfi9nHUO?z%U-L8oW4&Bu3*YPY8#*9M%sa=p7Lr3BZWaQ}L9O0j9}%y)L7 zxGjI;Bq9<#VA?Y6BfRz4md%+;l>C#%dDlqRcI-j)k!>swjy3i!-rlut=k3Ee`w8BD zV!OL9fB0DbKtIf;+w+-R<#58HT_9g2Q==8Y8*lRVkq523{apUQAnLU@@A9P#pX!LV z<0Nlxd1!z+H(DWYY2`b|b{wwvUcdW#-ZHQ;1U%`iS=GVvSe$u(XU^Zp`}@|TS$}`d zKg|1wv;L#2=e8UCxrQ#jp=-T%vjOIqN}ofsn|xTsAAEJYIh@kNB-{It)0FFd*$4W2 z`nB9+Zeii8gX({>Kp&+Pq$l+WX!%TPxJfC2ejtcNz<{6R<7cQkKn zUaP))WK{zV>Kv(+tR=X2)i;)T`>BVvNA_2s{W0=1xjLG&1b9mTUc`CJaK5z{ZH++Q z6Iw+*xZBq*^VC3&I>J*&9*|k;*y>BDxH)|zeP!MK*Ju9I+4UnC=ke9C{OB1G)mjs8 z39nz~Eh9Vjz>YnbZ|;VL&wE2#!NCXG&EV-=@D)DzO1`@fm?rcp`Gk;#JiA&ou%2A& zYXVkx)UTcZVR4CkO`>nj&RY(w-{LLDADrPWFXwv>AVND~0fF1;oU@B}c4eEo*DpV6 zI*Ruw5|BA-H*f7;_iS4GU~ZoR%r+f`q1fH4;}F4AMzbatn#!zi2+iI&iNuH(I`B1r zFmP`m(|+WE4gB3QX6iBUvM&?p&jp70z;HHj^g(wvFuEOT%~)Et9G>)G&e4$~^EP{0 zpS87ai=J;A2Htz(!42MfAx|BIZQ*l*g!TsuO2X~WQDL46uTS#Su?L5E>U`dJ=u^22 zHb3~kKMNv^A;71h1$+f^p&>prlx;rz!1kzlJdfP<4)DGMIp1O4cX;C}c<#XrJO!9- z9^a;XIjV=Jde#qRVTFN+F^G2p;hjKVzUS~(-^u*oFvz6?9l!19|8VvJy*8WeIJ2u& z215n8x~q9tt_^z9N?0sk=X%pd^asvI-Xpm8e%{xg891Buoy%Of$onorB7f*e?$CMu z(D{5|2xuQ_PLFQ-`eCvI?cX;P&X9z&v+y!WPzxl8YnS)^V;`KkcV^wNaU<(HopqgA zHEh>9->*$iX5Bs6+MYbswl8x6A$U9R-lKSF!82pzFTPb!7#iJMt)2gI@Wa9N8`;)l zsqrmWaINi!icQzy2d$egUj%%xDBmERprMYFPGE|zp^YbRre2`R;RcRrbQC52 z+9Yos+&IKrLAv(>y2d6z=AfPd(S{+A7mO$E%Xz}QC!BTmZP*?;N3odb;O!kbdmnG_ zTfYh}CcrUI0cM?}+eiVT&9!zYYdyHp%Ug#b-YMy`KBUQdV7EQ3$+SwO$<#=bX|X1= z5R^Y^%0&4vTpzV`c5>%ouaGabVj-;tE|mo^_Th&u!$Lawut`r5d~ 
z+t1{wE*O!+2_y10r|DdCKi}M+^$cuWe&jio7g;+;4S)>W@N7~@!afBE{!4wiEXhcD90b28JHgv2HJkUPseQ6scbJKeCL+2ygK}jTMx=&>7Cm*_DC=lm+ z59WHu_};NRg|>FsnHt@+cfopEte@{*)K<8^%8YftGFu8th`-$BKpvOm$6sInSeJ2~ z0S0b#|H=+r0P&Z*6Ef`HBiDw@?me#IjW;)@GLB=LhT}V>)7DQ7y?RB7WT0Gu2J}P0MIEWF9si>w@SX^dnY?{?RvLf#`n>j5b5`h$2zw8L5US8?J+J zQ#aBF(a(sHBM{|@ks63TsXeBr^~BGe7}ftYjp(58pPG)L*1xpl+^%?$X9uuL7xT*lPOYulbEp{B4*4lAoL9Ew9n3UY|I zwf-kchzmz$tWp`?ZfGhfQ3Ci9l4_hp2it-cCqY{H(--faE$DI9K-hih<981ij5u3G zGf%*f8 z;y>9s;BNwn1?ZQ-3x*5|#l^w)1-yRfjRiYP3IoqVf-jcEKffWu;npVI dict[str, int]: + # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab + # whereas all tokenizers have .get_vocab() + return self.model_tokenizer.get_vocab() + + def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest: + """ + Static method that used to adjust the request parameters. + """ + if not request.tools: + return request + json_schema_from_tool = get_json_schema_from_tools( + tool_choice=request.tool_choice, tools=request.tools + ) + # Set structured output params for tool calling + if json_schema_from_tool is not None: + if isinstance(request, ChatCompletionRequest): + request.structured_outputs = StructuredOutputsParams() + # tool_choice: "Forced Function" or "required" will override + # structured output json settings to make tool calling work correctly + request.structured_outputs.json = json_schema_from_tool + if isinstance(request, ResponsesRequest): + request.text = ResponseTextConfig() + request.text.format = ResponseFormatTextJSONSchemaConfig( + name="tool_calling_response", + schema=json_schema_from_tool, + type="json_schema", + description="Response format for tool calling", + strict=True, + ) + + return request + + def extract_tool_calls( + self, model_output: str, request: ChatCompletionRequest + ) -> ExtractedToolCallInformation: + """ + Static method that should be implemented for extracting tool calls from + a complete model-generated string. 
+ Used for non-streaming responses where we have the entire model response + available before sending to the client. + Static because it's stateless. + """ + raise NotImplementedError( + "AbstractToolParser.extract_tool_calls has not been implemented!" + ) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> DeltaMessage | None: + """ + Instance method that should be implemented for extracting tool calls + from an incomplete response; for use when handling tool calls and + streaming. Has to be an instance method because it requires state - + the current tokens/diffs, but also the information about what has + previously been parsed and extracted (see constructor) + """ + raise NotImplementedError( + "AbstractToolParser.extract_tool_calls_streaming has not been implemented!" + ) + + +class ToolParserManager: + """ + Central registry for ToolParser implementations. + + Supports two modes: + - Eager (immediate) registration via `register_module` + - Lazy registration via `register_lazy_module` + """ + + tool_parsers: dict[str, type[ToolParser]] = {} + lazy_parsers: dict[str, tuple[str, str]] = {} # name -> (module_path, class_name) + + @classmethod + def get_tool_parser(cls, name: str) -> type[ToolParser]: + """ + Retrieve a registered or lazily registered ToolParser class. + + If the parser is lazily registered, + it will be imported and cached on first access. + Raises KeyError if not found. 
+ """ + if name in cls.tool_parsers: + return cls.tool_parsers[name] + + if name in cls.lazy_parsers: + return cls._load_lazy_parser(name) + + raise KeyError(f"Tool parser '{name}' not found.") + + @classmethod + def _load_lazy_parser(cls, name: str) -> type[ToolParser]: + """Import and register a lazily loaded parser.""" + module_path, class_name = cls.lazy_parsers[name] + try: + mod = importlib.import_module(module_path) + parser_cls = getattr(mod, class_name) + if not issubclass(parser_cls, ToolParser): + raise TypeError( + f"{class_name} in {module_path} is not a ToolParser subclass." + ) + cls.tool_parsers[name] = parser_cls # cache + return parser_cls + except Exception as e: + logger.exception( + "Failed to import lazy tool parser '%s' from %s: %s", + name, + module_path, + e, + ) + raise + + @classmethod + def _register_module( + cls, + module: type[ToolParser], + module_name: str | list[str] | None = None, + force: bool = True, + ) -> None: + """Register a ToolParser class immediately.""" + if not issubclass(module, ToolParser): + raise TypeError( + f"module must be subclass of ToolParser, but got {type(module)}" + ) + + if module_name is None: + module_name = module.__name__ + + if isinstance(module_name, str): + module_names = [module_name] + elif is_list_of(module_name, str): + module_names = module_name + else: + raise TypeError("module_name must be str, list[str], or None.") + + for name in module_names: + if not force and name in cls.tool_parsers: + existed = cls.tool_parsers[name] + raise KeyError(f"{name} is already registered at {existed.__module__}") + cls.tool_parsers[name] = module + + @classmethod + def register_lazy_module(cls, name: str, module_path: str, class_name: str) -> None: + """ + Register a lazy module mapping. 
+ + Example: + ToolParserManager.register_lazy_module( + name="kimi_k2", + module_path="vllm.entrypoints.openai.tool_parsers.kimi_k2_parser", + class_name="KimiK2ToolParser", + ) + """ + cls.lazy_parsers[name] = (module_path, class_name) + + @classmethod + def register_module( + cls, + name: str | list[str] | None = None, + force: bool = True, + module: type[ToolParser] | None = None, + ) -> type[ToolParser] | Callable[[type[ToolParser]], type[ToolParser]]: + """ + Register module immediately or lazily (as a decorator). + + Usage: + @ToolParserManager.register_module("kimi_k2") + class KimiK2ToolParser(ToolParser): + ... + + Or: + ToolParserManager.register_module(module=SomeToolParser) + """ + if not isinstance(force, bool): + raise TypeError(f"force must be a boolean, but got {type(force)}") + + # Immediate registration + if module is not None: + cls._register_module(module=module, module_name=name, force=force) + return module + + # Decorator usage + def _decorator(obj: type[ToolParser]) -> type[ToolParser]: + module_path = obj.__module__ + class_name = obj.__name__ + + if isinstance(name, str): + names = [name] + elif name is not None and is_list_of(name, str): + names = name + else: + names = [class_name] + + for n in names: + # Lazy mapping only: do not import now + cls.lazy_parsers[n] = (module_path, class_name) + + return obj + + return _decorator + + @classmethod + def list_registered(cls) -> list[str]: + """Return names of all eagerly and lazily registered tool parsers.""" + return sorted(set(cls.tool_parsers.keys()) | set(cls.lazy_parsers.keys())) + + @classmethod + def import_tool_parser(cls, plugin_path: str) -> None: + """Import a user-defined parser file from arbitrary path.""" + + module_name = os.path.splitext(os.path.basename(plugin_path))[0] + try: + import_from_path(module_name, plugin_path) + except Exception: + logger.exception( + "Failed to load module '%s' from %s.", module_name, plugin_path + ) diff --git 
a/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py b/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py new file mode 100644 index 0000000..cbeb879 --- /dev/null +++ b/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py @@ -0,0 +1,390 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Sequence + +import regex as re + +from vllm.entrypoints.chat_utils import make_tool_call_id +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + DeltaFunctionCall, + DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, + ToolCall, +) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser, +) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer + +logger = init_logger(__name__) + + +class DeepSeekV31ToolParser(ToolParser): + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + self.current_tool_name_sent: bool = False + self.prev_tool_call_arr: list[dict] = [] + self.current_tool_id: int = -1 + self.streamed_args_for_tool: list[ + str + ] = [] # map what has been streamed for each tool so far to a list + + self.tool_calls_start_token: str = "<|tool▁calls▁begin|>" + self.tool_calls_end_token: str = "<|tool▁calls▁end|>" + + self.tool_call_start_token: str = "<|tool▁call▁begin|>" + self.tool_call_end_token: str = "<|tool▁call▁end|>" + + self.tool_call_regex = re.compile( + r"<|tool▁call▁begin|>(?P.*?)<|tool▁sep|>(?P.*?)<|tool▁call▁end|>" + ) + + self.stream_tool_call_portion_regex = re.compile( + r"(?P.*)<|tool▁sep|>(?P.*)" + ) + + self.stream_tool_call_name_regex = re.compile( + r"(?P.*)<|tool▁sep|>" + ) + + if not self.model_tokenizer: + raise ValueError( + "The model tokenizer must be passed to the ToolParser " + "constructor during construction." 
+ ) + self.tool_calls_start_token_id = self.vocab.get(self.tool_calls_start_token) + self.tool_calls_end_token_id = self.vocab.get(self.tool_calls_end_token) + + self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token) + self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token) + + if ( + self.tool_calls_start_token_id is None + or self.tool_calls_end_token_id is None + ): + raise RuntimeError( + "DeepSeek-V3.1 Tool parser could not locate tool call " + "start/end tokens in the tokenizer!" + ) + + def extract_tool_calls( + self, + model_output: str, + request: ChatCompletionRequest, + ) -> ExtractedToolCallInformation: + # sanity check; avoid unnecessary processing + if self.tool_calls_start_token not in model_output: + return ExtractedToolCallInformation( + tools_called=False, tool_calls=[], content=model_output + ) + + else: + try: + # there are two possible captures - between tags, or between a + # tag and end-of-string so the result of + # findall is an array of tuples where one is a function call and + # the other is None + function_call_tuples = self.tool_call_regex.findall(model_output) + + tool_calls = [] + for match in function_call_tuples: + function_name, function_args = match + tool_calls.append( + ToolCall( + type="function", + function=FunctionCall( + name=function_name, arguments=function_args + ), + ) + ) + + content = model_output[: model_output.find(self.tool_calls_start_token)] + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=tool_calls, + content=content if content else None, + ) + + except Exception: + logger.exception("Error in extracting tool call from response.") + return ExtractedToolCallInformation( + tools_called=False, tool_calls=[], content=model_output + ) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: 
ChatCompletionRequest, + ) -> DeltaMessage | None: + logger.debug("delta_text: %s", delta_text) + logger.debug("delta_token_ids: %s", delta_token_ids) + # check to see if we should be streaming a tool call - is there a + if self.tool_calls_start_token_id not in current_token_ids: + logger.debug("No tool call tokens found!") + return DeltaMessage(content=delta_text) + delta_text = delta_text.replace(self.tool_calls_start_token, "").replace( + self.tool_calls_end_token, "" + ) + try: + # figure out where we are in the parsing by counting tool call + # start & end tags + prev_tool_start_count = previous_token_ids.count( + self.tool_call_start_token_id + ) + prev_tool_end_count = previous_token_ids.count(self.tool_call_end_token_id) + cur_tool_start_count = current_token_ids.count( + self.tool_call_start_token_id + ) + cur_tool_end_count = current_token_ids.count(self.tool_call_end_token_id) + tool_call_portion = None + text_portion = None + + # case: if we're generating text, OR rounding out a tool call + if ( + cur_tool_start_count == cur_tool_end_count + and prev_tool_end_count == cur_tool_end_count + and self.tool_call_end_token not in delta_text + ): + logger.debug("Generating text content! 
skipping tool parsing.") + return DeltaMessage(content=delta_text) + + if self.tool_call_end_token in delta_text: + logger.debug("tool_call_end_token in delta_text") + full_text = current_text + delta_text + tool_call_portion = ( + full_text.split(self.tool_call_start_token)[-1] + .split(self.tool_call_end_token)[0] + .rstrip() + ) + delta_text = delta_text.split(self.tool_call_end_token)[0].rstrip() + text_portion = delta_text.split(self.tool_call_end_token)[-1].lstrip() + + # case -- we're starting a new tool call + if ( + cur_tool_start_count > cur_tool_end_count + and cur_tool_start_count > prev_tool_start_count + ): + if len(delta_token_ids) > 1: + tool_call_portion = current_text.split(self.tool_call_start_token)[ + -1 + ] + else: + tool_call_portion = None + delta = None + + text_portion = None + + # set cursors and state appropriately + self.current_tool_id += 1 + self.current_tool_name_sent = False + self.streamed_args_for_tool.append("") + logger.debug("Starting on a new tool %s", self.current_tool_id) + + # case -- we're updating an existing tool call + elif ( + cur_tool_start_count > cur_tool_end_count + and cur_tool_start_count == prev_tool_start_count + ): + # get the portion of the text that's the tool call + tool_call_portion = current_text.split(self.tool_call_start_token)[-1] + text_portion = None + + # case -- the current tool call is being closed. 
+ elif ( + cur_tool_start_count == cur_tool_end_count + and cur_tool_end_count >= prev_tool_end_count + ): + if self.prev_tool_call_arr is None or len(self.prev_tool_call_arr) == 0: + logger.debug("attempting to close tool call, but no tool call") + return None + diff = self.prev_tool_call_arr[self.current_tool_id].get("arguments") + if diff: + diff = ( + diff.encode("utf-8").decode("unicode_escape") + if diff is str + else diff + ) + if '"}' not in delta_text: + return None + end_loc = delta_text.rindex('"}') + diff = delta_text[:end_loc] + '"}' + logger.debug( + "Finishing tool and found diff that had not " + "been streamed yet: %s", + diff, + ) + self.streamed_args_for_tool[self.current_tool_id] += diff + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall(arguments=diff).model_dump( + exclude_none=True + ), + ) + ] + ) + + # case -- otherwise we're just generating text + else: + text = delta_text.replace(self.tool_call_start_token, "") + text = text.replace(self.tool_call_end_token, "") + delta = DeltaMessage(tool_calls=[], content=text) + return delta + + current_tool_call = dict() + if tool_call_portion: + current_tool_call_matches = self.stream_tool_call_portion_regex.match( + tool_call_portion + ) + if current_tool_call_matches: + tool_name, tool_args = current_tool_call_matches.groups() + current_tool_call["name"] = tool_name + current_tool_call["arguments"] = tool_args + else: + current_tool_call_name_matches = ( + self.stream_tool_call_name_regex.match(tool_call_portion) + ) + if current_tool_call_name_matches: + tool_name = current_tool_call_name_matches.groups() + current_tool_call["name"] = tool_name + current_tool_call["arguments"] = "" + else: + logger.debug("Not enough token") + return None + + # case - we haven't sent the tool name yet. If it's available, send + # it. otherwise, wait until it's available. 
+ if not self.current_tool_name_sent: + if current_tool_call is None: + return None + function_name: str | None = current_tool_call.get("name") + if function_name: + self.current_tool_name_sent = True + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + type="function", + id=make_tool_call_id(), + function=DeltaFunctionCall( + name=function_name + ).model_dump(exclude_none=True), + ) + ] + ) + else: + return None + + # case -- otherwise, send the tool call delta + + # if the tool call portion is None, send the delta as text + if tool_call_portion is None: + # if there's text but not tool calls, send that - + # otherwise None to skip chunk + delta = ( + DeltaMessage(content=delta_text) + if text_portion is not None + else None + ) + return delta + + # now, the nitty-gritty of tool calls + # now we have the portion to parse as tool call. + + logger.debug( + "Trying to parse current tool call with ID %s", self.current_tool_id + ) + + # if we're starting a new tool call, push an empty object in as + # a placeholder for the arguments + if len(self.prev_tool_call_arr) <= self.current_tool_id: + self.prev_tool_call_arr.append({}) + + # main logic for tool parsing here - compare prev. partially-parsed + # JSON to the current partially-parsed JSON + prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get( + "arguments" + ) + cur_arguments = current_tool_call.get("arguments") + + logger.debug("diffing old arguments: %s", prev_arguments) + logger.debug("against new ones: %s", cur_arguments) + + # case -- no arguments have been created yet. skip sending a delta. + if not cur_arguments and not prev_arguments: + logger.debug("Skipping text %s - no arguments", delta_text) + delta = None + + # case -- prev arguments are defined, but non are now. + # probably impossible, but not a fatal error - just keep going + elif not cur_arguments and prev_arguments: + logger.error( + "should be impossible to have arguments reset " + "mid-call. 
skipping streaming anything." + ) + delta = None + + # case -- we now have the first info about arguments available from + # autocompleting the JSON + elif cur_arguments and not prev_arguments: + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=cur_arguments + ).model_dump(exclude_none=True), + ) + ] + ) + self.streamed_args_for_tool[self.current_tool_id] = cur_arguments + + # last case -- we have an update to existing arguments. + elif cur_arguments and prev_arguments: + if ( + isinstance(delta_text, str) + and cur_arguments != prev_arguments + and len(cur_arguments) > len(prev_arguments) + and cur_arguments.startswith(prev_arguments) + ): + delta_arguments = cur_arguments[len(prev_arguments) :] + logger.debug("got diff %s", delta_text) + + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=delta_arguments + ).model_dump(exclude_none=True), + ) + ] + ) + self.streamed_args_for_tool[self.current_tool_id] = cur_arguments + else: + delta = None + + # handle saving the state for the current tool into + # the "prev" list for use in diffing for the next iteration + if self.current_tool_id == len(self.prev_tool_call_arr) - 1: + self.prev_tool_call_arr[self.current_tool_id] = current_tool_call + else: + self.prev_tool_call_arr.append(current_tool_call) + + return delta + + except Exception: + logger.exception("Error trying to handle streaming tool call.") + return None # do not stream a delta. skip this token ID. 
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py

from collections.abc import Sequence

import regex as re

from vllm.entrypoints.chat_utils import make_tool_call_id
from vllm.entrypoints.openai.protocol import (
    ChatCompletionRequest,
    DeltaFunctionCall,
    DeltaMessage,
    DeltaToolCall,
    ExtractedToolCallInformation,
    FunctionCall,
    ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
    ToolParser,
)
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import AnyTokenizer

logger = init_logger(__name__)


class DeepSeekV3ToolParser(ToolParser):
    """Extract tool calls from DeepSeek-V3 style model output.

    A tool call is expected in the form::

        <call-begin>TYPE<sep>NAME\\n```json\\nARGUMENTS\\n```<call-end>

    and the list of calls is wrapped in a "tool calls begin/end" pair.
    ``extract_tool_calls`` handles a complete response, while
    ``extract_tool_calls_streaming`` is called once per generated chunk and
    keeps its progress in the instance attributes set up in ``__init__``.
    """

    def __init__(self, tokenizer: AnyTokenizer):
        """Set up markers, patterns and streaming state.

        Raises:
            ValueError: if no tokenizer was supplied.
            RuntimeError: if the outer tool-calls begin/end tokens are not
                present in the tokenizer vocabulary.
        """
        super().__init__(tokenizer)

        # Streaming state, updated by ``extract_tool_calls_streaming``.
        self.current_tool_name_sent: bool = False
        self.prev_tool_call_arr: list[dict] = []
        self.current_tool_id: int = -1
        # Argument text already streamed to the client, one entry per tool.
        self.streamed_args_for_tool: list[str] = []

        # NOTE(review): the delimiters are written with the fullwidth
        # vertical bar (U+FF5C). With an ASCII "|" the patterns below would be
        # parsed as regex alternations instead of literal markers. Confirm
        # the exact spelling against the tokenizer vocabulary.
        self.tool_calls_start_token: str = "<｜tool▁calls▁begin｜>"
        self.tool_calls_end_token: str = "<｜tool▁calls▁end｜>"

        self.tool_call_start_token: str = "<｜tool▁call▁begin｜>"
        self.tool_call_end_token: str = "<｜tool▁call▁end｜>"

        # Complete call: type, function name and JSON arguments.
        self.tool_call_regex = re.compile(
            r"<｜tool▁call▁begin｜>(?P<type>.*)<｜tool▁sep｜>(?P<function_name>.*)\n```json\n(?P<function_arguments>.*)\n```<｜tool▁call▁end｜>"
        )

        # Partial call while streaming: arguments may still be incomplete,
        # but must not end in a newline or backtick (start of the fence).
        self.stream_tool_call_portion_regex = re.compile(
            r"(?P<type>.*)<｜tool▁sep｜>(?P<function_name>.*)\n```json\n(?P<function_arguments>.*[^\n`])"
        )

        # Partial call for which only the type and name line is available.
        self.stream_tool_call_name_regex = re.compile(
            r"(?P<type>.*)<｜tool▁sep｜>(?P<function_name>.*)\n"
        )

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ToolParser "
                "constructor during construction."
            )
        self.tool_calls_start_token_id = self.vocab.get(self.tool_calls_start_token)
        self.tool_calls_end_token_id = self.vocab.get(self.tool_calls_end_token)

        self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token)
        self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)

        if (
            self.tool_calls_start_token_id is None
            or self.tool_calls_end_token_id is None
        ):
            raise RuntimeError(
                "DeepSeek-V3 Tool parser could not locate tool call start/end "
                "tokens in the tokenizer!"
            )

    def _arguments_delta(self, arguments: str) -> DeltaMessage:
        """Build a delta carrying ``arguments`` for the current tool call."""
        return DeltaMessage(
            tool_calls=[
                DeltaToolCall(
                    index=self.current_tool_id,
                    function=DeltaFunctionCall(arguments=arguments).model_dump(
                        exclude_none=True
                    ),
                )
            ]
        )

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """Extract every complete tool call from a finished response.

        Returns the text preceding the tool-calls start marker as ``content``
        (``None`` when empty). If the marker is absent, or extraction raises,
        the whole output is returned as plain content with
        ``tools_called=False``.
        """
        # sanity check; avoid unnecessary processing
        if self.tool_calls_start_token not in model_output:
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

        try:
            # findall yields one (type, name, arguments) tuple per call.
            tool_calls = [
                ToolCall(
                    type=tool_type,
                    function=FunctionCall(
                        name=function_name, arguments=function_args
                    ),
                )
                for tool_type, function_name, function_args in (
                    self.tool_call_regex.findall(model_output)
                )
            ]

            content = model_output[: model_output.find(self.tool_calls_start_token)]
            return ExtractedToolCallInformation(
                tools_called=True,
                tool_calls=tool_calls,
                content=content if content else None,
            )

        except Exception:
            logger.exception("Error in extracting tool call from response.")
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """Turn one streamed chunk into a delta message.

        The position in the output is derived by counting per-call begin/end
        token ids in the previous and current token sequences. Depending on
        that position the chunk is emitted as plain content, as the tool name
        (sent once, in full), as an argument fragment, or skipped.

        Returns:
            The delta to stream, or ``None`` when nothing should be sent for
            this chunk (including when an exception occurred).
        """
        logger.debug("delta_text: %s", delta_text)
        logger.debug("delta_token_ids: %s", delta_token_ids)
        # Until the outer tool-calls start token has been generated,
        # everything is plain content.
        if self.tool_calls_start_token_id not in current_token_ids:
            logger.debug("No tool call tokens found!")
            return DeltaMessage(content=delta_text)
        delta_text = delta_text.replace(self.tool_calls_start_token, "").replace(
            self.tool_calls_end_token, ""
        )
        try:
            # figure out where we are in the parsing by counting tool call
            # start & end tags
            prev_tool_start_count = previous_token_ids.count(
                self.tool_call_start_token_id
            )
            prev_tool_end_count = previous_token_ids.count(self.tool_call_end_token_id)
            cur_tool_start_count = current_token_ids.count(
                self.tool_call_start_token_id
            )
            cur_tool_end_count = current_token_ids.count(self.tool_call_end_token_id)
            tool_call_portion = None
            text_portion = None

            # case: if we're generating text, OR rounding out a tool call
            if (
                cur_tool_start_count == cur_tool_end_count
                and prev_tool_end_count == cur_tool_end_count
                and self.tool_call_end_token not in delta_text
            ):
                logger.debug("Generating text content! skipping tool parsing.")
                return DeltaMessage(content=delta_text)

            if self.tool_call_end_token in delta_text:
                logger.debug("tool_call_end_token in delta_text")
                full_text = current_text + delta_text
                tool_call_portion = (
                    full_text.split(self.tool_call_start_token)[-1]
                    .split(self.tool_call_end_token)[0]
                    .rstrip()
                )
                # Take the trailing text BEFORE truncating delta_text;
                # otherwise the end token is already gone and the "text after
                # the call" is just the truncated delta again.
                text_portion = delta_text.rpartition(self.tool_call_end_token)[
                    2
                ].lstrip()
                delta_text = delta_text.partition(self.tool_call_end_token)[
                    0
                ].rstrip()

            # case -- we're starting a new tool call
            if (
                cur_tool_start_count > cur_tool_end_count
                and cur_tool_start_count > prev_tool_start_count
            ):
                if len(delta_token_ids) > 1:
                    tool_call_portion = current_text.split(self.tool_call_start_token)[
                        -1
                    ]
                else:
                    tool_call_portion = None

                text_portion = None

                # set cursors and state appropriately
                self.current_tool_id += 1
                self.current_tool_name_sent = False
                self.streamed_args_for_tool.append("")
                logger.debug("Starting on a new tool %s", self.current_tool_id)

            # case -- we're updating an existing tool call
            elif (
                cur_tool_start_count > cur_tool_end_count
                and cur_tool_start_count == prev_tool_start_count
            ):
                # get the portion of the text that's the tool call
                tool_call_portion = current_text.split(self.tool_call_start_token)[-1]
                text_portion = None

            # case -- the current tool call is being closed.
            elif (
                cur_tool_start_count == cur_tool_end_count
                and cur_tool_end_count >= prev_tool_end_count
            ):
                if not self.prev_tool_call_arr:
                    logger.debug("attempting to close tool call, but no tool call")
                    return None
                pending_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
                    "arguments"
                )
                if pending_arguments:
                    # The stored arguments only signal that something was
                    # being streamed; the text to flush comes from this delta,
                    # up to and including the last '"}'.
                    if '"}' not in delta_text:
                        return None
                    end_loc = delta_text.rindex('"}')
                    diff = delta_text[:end_loc] + '"}'
                    logger.debug(
                        "Finishing tool and found diff that had not "
                        "been streamed yet: %s",
                        diff,
                    )
                    self.streamed_args_for_tool[self.current_tool_id] += diff
                    return self._arguments_delta(diff)

            # case -- otherwise we're just generating text
            else:
                text = delta_text.replace(self.tool_call_start_token, "")
                text = text.replace(self.tool_call_end_token, "")
                return DeltaMessage(tool_calls=[], content=text)

            # Parse whatever part of the current call is available so far.
            current_tool_call: dict = {}
            if tool_call_portion:
                portion_match = self.stream_tool_call_portion_regex.match(
                    tool_call_portion
                )
                if portion_match:
                    _tool_type, tool_name, tool_args = portion_match.groups()
                    current_tool_call["name"] = tool_name
                    current_tool_call["arguments"] = tool_args
                else:
                    name_match = self.stream_tool_call_name_regex.match(
                        tool_call_portion
                    )
                    if name_match is None:
                        logger.debug("Not enough token")
                        return None
                    _tool_type, tool_name = name_match.groups()
                    current_tool_call["name"] = tool_name
                    current_tool_call["arguments"] = ""

            # case - we haven't sent the tool name yet. If it's available, send
            # it. otherwise, wait until it's available.
            if not self.current_tool_name_sent:
                function_name: str | None = current_tool_call.get("name")
                if not function_name:
                    return None
                self.current_tool_name_sent = True
                return DeltaMessage(
                    tool_calls=[
                        DeltaToolCall(
                            index=self.current_tool_id,
                            type="function",
                            id=make_tool_call_id(),
                            function=DeltaFunctionCall(
                                name=function_name
                            ).model_dump(exclude_none=True),
                        )
                    ]
                )

            # case -- otherwise, send the tool call delta

            # if the tool call portion is None, send the delta as text
            if tool_call_portion is None:
                # if there's text but not tool calls, send that -
                # otherwise None to skip chunk
                return (
                    DeltaMessage(content=delta_text)
                    if text_portion is not None
                    else None
                )

            logger.debug(
                "Trying to parse current tool call with ID %s", self.current_tool_id
            )

            # if we're starting a new tool call, push an empty object in as
            # a placeholder for the arguments
            if len(self.prev_tool_call_arr) <= self.current_tool_id:
                self.prev_tool_call_arr.append({})

            # main logic for tool parsing here - compare the previously
            # extracted argument text to the current one
            prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
                "arguments"
            )
            cur_arguments = current_tool_call.get("arguments")

            logger.debug("diffing old arguments: %s", prev_arguments)
            logger.debug("against new ones: %s", cur_arguments)

            delta = None

            # case -- no arguments have been created yet. skip sending a delta.
            if not cur_arguments and not prev_arguments:
                logger.debug("Skipping text %s - no arguments", delta_text)

            # case -- prev arguments are defined, but none are now.
            # probably impossible, but not a fatal error - just keep going
            elif not cur_arguments and prev_arguments:
                logger.error(
                    "should be impossible to have arguments reset "
                    "mid-call. skipping streaming anything."
                )

            # case -- first argument text is available: send all of it
            elif cur_arguments and not prev_arguments:
                delta = self._arguments_delta(cur_arguments)
                self.streamed_args_for_tool[self.current_tool_id] = cur_arguments

            # last case -- existing arguments grew: send only the new suffix
            elif (
                len(cur_arguments) > len(prev_arguments)
                and cur_arguments.startswith(prev_arguments)
            ):
                delta_arguments = cur_arguments[len(prev_arguments) :]
                logger.debug("got diff %s", delta_text)

                delta = self._arguments_delta(delta_arguments)
                self.streamed_args_for_tool[self.current_tool_id] = cur_arguments

            # handle saving the state for the current tool into
            # the "prev" list for use in diffing for the next iteration
            if self.current_tool_id == len(self.prev_tool_call_arr) - 1:
                self.prev_tool_call_arr[self.current_tool_id] = current_tool_call
            else:
                self.prev_tool_call_arr.append(current_tool_call)

            return delta

        except Exception:
            logger.exception("Error trying to handle streaming tool call.")
            return None  # do not stream a delta. skip this token ID.
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Sections below, in order:
#   entrypoints/openai/tool_parsers/ernie45_tool_parser.py
#   entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py
#   entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py

import ast
import json
from collections.abc import Sequence
from json import JSONDecoder
from typing import Any

import partial_json_parser
import regex as re
from partial_json_parser.core.options import Allow

from vllm.entrypoints.chat_utils import make_tool_call_id
from vllm.entrypoints.openai.protocol import (
    ChatCompletionRequest,
    ChatCompletionToolsParam,
    DeltaFunctionCall,
    DeltaMessage,
    DeltaToolCall,
    ExtractedToolCallInformation,
    FunctionCall,
    ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
    ToolParser,
)
from vllm.entrypoints.openai.tool_parsers.utils import (
    consume_space,
    find_common_prefix,
    is_complete_json,
    partial_json_loads,
)
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import AnyTokenizer

logger = init_logger(__name__)


# --- entrypoints/openai/tool_parsers/ernie45_tool_parser.py ---


class Ernie45ToolParser(ToolParser):
    r"""Tool-call parser for the Ernie 4.5 thinking model.

    Expected output shape::

        abc\n</think>\n\n\n<tool_call>\ndef\n</tool_call>\n

    Each ``<tool_call>`` element wraps one JSON object with ``name`` and
    ``arguments`` keys. While streaming, text is buffered from the first
    ``<tool_call>`` until its closing tag and the call is then emitted in a
    single delta.
    """

    def __init__(self, tokenizer: AnyTokenizer):
        """Set up markers, token ids and streaming state.

        Raises:
            ValueError: if no tokenizer was supplied.
        """
        super().__init__(tokenizer)
        self.current_tool_name_sent = False
        self.prev_tool_call_arr: list[dict] = []
        self.current_tool_id = -1
        self.streamed_args_for_tool: list[str] = []
        self.think_end_token = "</think>"
        self.response_start_token: str = "<response>"
        self.response_end_token: str = "</response>"
        self.tool_call_start_token = "<tool_call>"
        self.tool_call_end_token = "</tool_call>"
        self.tool_calls_start_token = self.tool_call_start_token
        self.newline_token: str = "<0x0A>"

        # One group only, so findall() returns the JSON payloads directly.
        self.tool_call_regex = re.compile(
            r"<tool_call>\s*(?P<json>\{.*?\})\s*</tool_call>", re.DOTALL
        )

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ToolParser "
                "constructor during construction."
            )

        self.think_end_token_id = self.vocab.get(self.think_end_token)
        self.response_start_token_id = self.vocab.get(self.response_start_token)
        self.response_end_token_id = self.vocab.get(self.response_end_token)
        self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token)
        self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
        self.newline_token_id = self.vocab.get(self.newline_token)
        # Tokens after which a directly following newline is dropped.
        self.parser_token_ids = [
            self.think_end_token_id,
            self.response_start_token_id,
            self.response_end_token_id,
        ]

        # Text received while streaming that has not been emitted yet.
        self._buffer = ""

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """Extract all complete tool calls from ``model_output``.

        The text before the first ``<tool_call>`` (trailing newlines removed)
        is returned as ``content``. If no tool call marker is present, or a
        payload fails to parse, the whole output is returned as content with
        ``tools_called=False``.
        """
        # sanity check; avoid unnecessary processing
        if self.tool_calls_start_token not in model_output:
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

        try:
            tool_calls = []
            for tool_call_json in self.tool_call_regex.findall(model_output):
                tool_call_dict = json.loads(tool_call_json)
                args_str = json.dumps(
                    tool_call_dict.get("arguments", {}), ensure_ascii=False
                )
                tool_calls.append(
                    ToolCall(
                        type="function",
                        function=FunctionCall(
                            name=tool_call_dict.get("name", ""),
                            arguments=args_str,
                        ),
                    )
                )

            content = model_output[
                : model_output.find(self.tool_calls_start_token)
            ].rstrip("\n")
            return ExtractedToolCallInformation(
                tools_called=True,
                tool_calls=tool_calls,
                content=content if content else None,
            )

        except Exception:
            logger.exception("Error in extracting tool call from response.")
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

    def _plain_content(
        self,
        cur_text: str,
        previous_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage:
        """Build the content delta for a chunk that holds no tool call."""
        # At least one toolcall has been completed
        if self.current_tool_id > 0:
            cur_text = ""
        if self.current_tool_id == -1 and all(
            token_id == self.newline_token_id for token_id in previous_token_ids
        ):
            cur_text = cur_text.strip("\n")

        content = cur_text
        if self.response_start_token_id in delta_token_ids:
            content = content.lstrip("\n")
            response_start_idx = content.find(self.response_start_token)
            # Only cut when the marker text is really there; a -1 index
            # would otherwise slice at an arbitrary position.
            if response_start_idx != -1:
                content = content[
                    response_start_idx + len(self.response_start_token) :
                ]
            # if have </response>, remove it
            response_end_idx = content.rfind(self.response_end_token)
            if response_end_idx != -1:
                content = content[:response_end_idx]
        elif self.response_end_token_id in delta_token_ids:
            response_end_idx = content.rfind(self.response_end_token)
            # Same guard: [: -1] would silently drop the last character.
            if response_end_idx != -1:
                content = content[:response_end_idx]
        # remove \n after </think> or <response> or </response>
        if (
            len(previous_token_ids) > 0
            and previous_token_ids[-1] in self.parser_token_ids
        ) and (
            len(delta_token_ids) > 0 and delta_token_ids[0] == self.newline_token_id
        ):
            content = content.lstrip("\n")

        return DeltaMessage(content=content if content else None)

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """Process one streamed chunk.

        Chunks without a ``<tool_call>`` in the buffer are emitted as content
        after the response/think markers are stripped. Once a start tag is
        buffered, text is held back until the closing tag arrives; the
        complete call is then parsed with ``extract_tool_calls`` and sent as
        one delta containing both name and arguments.
        """
        self._buffer += delta_text
        cur_text = self._buffer
        start_idx = cur_text.find(self.tool_call_start_token)
        if start_idx == -1:
            # handle when tool_call is not triggered
            self._buffer = ""
            return self._plain_content(cur_text, previous_token_ids, delta_token_ids)

        logger.debug("cur_text = %s", cur_text)
        end_idx = cur_text.find(self.tool_call_end_token)
        if end_idx == -1:
            # Call still open: keep it buffered, emit only the text before it.
            self._buffer = cur_text[start_idx:]
            content = cur_text[:start_idx].rstrip("\n")
            return DeltaMessage(content=content if content else None)

        if self.current_tool_id == -1:
            self.current_tool_id = 0
            self.prev_tool_call_arr = []
            self.streamed_args_for_tool = []
        while len(self.prev_tool_call_arr) <= self.current_tool_id:
            self.prev_tool_call_arr.append({})
        while len(self.streamed_args_for_tool) <= self.current_tool_id:
            self.streamed_args_for_tool.append("")

        call_end = end_idx + len(self.tool_call_end_token)
        extracted_tool_calls = self.extract_tool_calls(cur_text[:call_end], request)

        if len(extracted_tool_calls.tool_calls) == 0:
            logger.warning("Failed to extract any tool calls.")
            return None
        tool_call = extracted_tool_calls.tool_calls[0]
        self.prev_tool_call_arr[self.current_tool_id] = {
            "name": tool_call.function.name,
            "arguments": json.loads(tool_call.function.arguments),
        }
        self.streamed_args_for_tool[self.current_tool_id] = (
            tool_call.function.arguments
        )
        delta = DeltaMessage(
            content=extracted_tool_calls.content,
            tool_calls=[
                DeltaToolCall(
                    index=self.current_tool_id,
                    id=tool_call.id,
                    type=tool_call.type,
                    function=DeltaFunctionCall(
                        name=tool_call.function.name,
                        arguments=tool_call.function.arguments,
                    ),
                )
            ],
        )
        self.current_tool_id += 1
        # Keep whatever follows the closed call for the next chunk.
        self._buffer = cur_text[call_end:]
        return delta


# --- entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py ---


class Glm4MoeModelToolParser(ToolParser):
    """Tool-call parser for GLM-4 MoE style output.

    A call is written as ``<tool_call>NAME`` followed by a newline and a
    sequence of ``<arg_key>K</arg_key><arg_value>V</arg_value>`` pairs, closed
    by ``</tool_call>``. Values are decoded with ``json.loads`` and then
    ``ast.literal_eval`` unless the request's tool schema declares the
    argument as a string.
    """

    def __init__(self, tokenizer: AnyTokenizer):
        """Set up markers, patterns and streaming state.

        Raises:
            ValueError: if no tokenizer was supplied.
        """
        super().__init__(tokenizer)
        self.current_tool_name_sent = False
        self.prev_tool_call_arr: list[dict] = []
        self.current_tool_id = -1
        self.streamed_args_for_tool: list[str] = []
        self.tool_call_start_token = "<tool_call>"
        self.tool_call_end_token = "</tool_call>"

        self.tool_calls_start_token = self.tool_call_start_token

        self.func_call_regex = re.compile(r"<tool_call>.*?</tool_call>", re.DOTALL)
        # group 1: function name (rest of the first line); group 2: arguments
        self.func_detail_regex = re.compile(
            r"<tool_call>([^\n]*)\n(.*)</tool_call>", re.DOTALL
        )
        self.func_arg_regex = re.compile(
            r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>", re.DOTALL
        )
        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ToolParser "
                "constructor during construction."
            )

        self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token)
        self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
        # Text received while streaming that has not been emitted yet.
        self._buffer = ""

    @staticmethod
    def _is_string_type(
        tool_name: str,
        arg_name: str,
        tools: list[ChatCompletionToolsParam] | None,
    ) -> bool:
        """Return True if ``tools`` declares ``arg_name`` of ``tool_name`` as
        ``"string"``; False when it does not or the tool is unknown."""
        if tools is None:
            return False
        for tool in tools:
            if tool.function.name != tool_name:
                continue
            if tool.function.parameters is None:
                return False
            arg_type = (
                tool.function.parameters.get("properties", {})
                .get(arg_name, {})
                .get("type", None)
            )
            return arg_type == "string"
        logger.warning("No tool named '%s'.", tool_name)
        return False

    @staticmethod
    def _deserialize(value: str) -> Any:
        """Decode ``value`` as JSON, then as a Python literal; if both fail
        return the string unchanged."""
        try:
            return json.loads(value)
        except Exception:
            pass

        try:
            return ast.literal_eval(value)
        except Exception:
            pass
        return value

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """Extract all complete tool calls from ``model_output``.

        Returns the text before the first ``<tool_call>`` as ``content`` when
        at least one call was parsed; otherwise (no call, or a parsing error)
        the whole output is returned as content with ``tools_called=False``.
        """
        matched_tool_calls = self.func_call_regex.findall(model_output)
        logger.debug("model_output: %s", model_output)
        try:
            tool_calls = []
            for match in matched_tool_calls:
                # A call without a newline after the name does not match;
                # the resulting AttributeError is handled below.
                tc_detail = self.func_detail_regex.search(match)
                tc_name = tc_detail.group(1)
                tc_args = tc_detail.group(2)
                arg_dct = {}
                for key, value in self.func_arg_regex.findall(tc_args):
                    arg_key = key.strip()
                    arg_val = value.strip()
                    if not self._is_string_type(tc_name, arg_key, request.tools):
                        arg_val = self._deserialize(arg_val)
                    logger.debug("arg_key = %s, arg_val = %s", arg_key, arg_val)
                    arg_dct[arg_key] = arg_val
                tool_calls.append(
                    ToolCall(
                        type="function",
                        function=FunctionCall(
                            name=tc_name,
                            # keep non-ASCII text readable, as the sibling
                            # parsers do
                            arguments=json.dumps(arg_dct, ensure_ascii=False),
                        ),
                    )
                )
        except Exception:
            logger.exception("Failed to extract tool call spec")
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

        if not tool_calls:
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )
        content = model_output[: model_output.find(self.tool_calls_start_token)]
        return ExtractedToolCallInformation(
            tools_called=True, tool_calls=tool_calls, content=content
        )

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """Process one streamed chunk.

        Text is buffered from the first ``<tool_call>`` until its closing
        tag; the complete call is then parsed with ``extract_tool_calls`` and
        sent as one delta containing both name and arguments.
        """
        self._buffer += delta_text
        cur_text = self._buffer
        start_idx = cur_text.find(self.tool_call_start_token)
        if start_idx == -1:
            self._buffer = ""
            # Text after a later tool call (id > 0) is suppressed.
            if self.current_tool_id > 0:
                cur_text = ""
            return DeltaMessage(content=cur_text)

        logger.debug("cur_text = %s", cur_text)
        end_idx = cur_text.find(self.tool_call_end_token)
        if end_idx == -1:
            # Call still open: keep it buffered, emit only the text before it.
            self._buffer = cur_text[start_idx:]
            return DeltaMessage(content=cur_text[:start_idx])

        if self.current_tool_id == -1:
            self.current_tool_id = 0
            self.prev_tool_call_arr = []
            self.streamed_args_for_tool = []
        while len(self.prev_tool_call_arr) <= self.current_tool_id:
            self.prev_tool_call_arr.append({})
        while len(self.streamed_args_for_tool) <= self.current_tool_id:
            self.streamed_args_for_tool.append("")

        call_end = end_idx + len(self.tool_call_end_token)
        extracted_tool_calls = self.extract_tool_calls(cur_text[:call_end], request)

        if len(extracted_tool_calls.tool_calls) == 0:
            logger.warning("Failed to extract any tool calls.")
            return None
        tool_call = extracted_tool_calls.tool_calls[0]
        self.prev_tool_call_arr[self.current_tool_id] = {
            "name": tool_call.function.name,
            "arguments": json.loads(tool_call.function.arguments),
        }
        self.streamed_args_for_tool[self.current_tool_id] = (
            tool_call.function.arguments
        )
        delta = DeltaMessage(
            content=extracted_tool_calls.content,
            tool_calls=[
                DeltaToolCall(
                    index=self.current_tool_id,
                    id=tool_call.id,
                    type=tool_call.type,
                    function=DeltaFunctionCall(
                        name=tool_call.function.name,
                        arguments=tool_call.function.arguments,
                    ),
                )
            ],
        )
        self.current_tool_id += 1
        # Keep whatever follows the closed call for the next chunk.
        self._buffer = cur_text[call_end:]
        return delta


# --- entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py ---


class Granite20bFCToolParser(ToolParser):
    """
    Tool call parser for the granite-20b-functioncalling model intended
    for use with the examples/tool_chat_template_granite20b_fc.jinja
    template.

    Used when --enable-auto-tool-choice --tool-call-parser granite-20-fc
    are all set

    Every call is a JSON object with ``name`` and ``arguments`` keys that
    follows a ``<function_call>`` marker.
    """

    def __init__(self, tokenizer: AnyTokenizer):
        super().__init__(tokenizer)

        self.bot_token = "<function_call>"
        self.tool_start_token = self.bot_token
        self.tool_call_regex = re.compile(r"<function_call>\s*")

    def _arguments_delta(self, arguments: str) -> DeltaMessage:
        """Build a delta carrying ``arguments`` for the current tool call."""
        return DeltaMessage(
            tool_calls=[
                DeltaToolCall(
                    index=self.current_tool_id,
                    function=DeltaFunctionCall(arguments=arguments).model_dump(
                        exclude_none=True
                    ),
                )
            ]
        )

    def extract_tool_calls(
        self, model_output: str, request: ChatCompletionRequest
    ) -> ExtractedToolCallInformation:
        """Extract all tool calls from a finished response.

        The JSON object after each marker is decoded with
        ``JSONDecoder.raw_decode``, so trailing text after the object is
        ignored. Text before the first marker becomes ``content``. On any
        error the whole output is returned as content with
        ``tools_called=False``.
        """
        if self.tool_start_token not in model_output:
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

        dec = JSONDecoder()
        try:
            matches = list(self.tool_call_regex.finditer(model_output))
            logger.debug("Found %d tool call matches", len(matches))

            raw_function_calls = []

            for i, match in enumerate(matches):
                # position after the <function_call> tag
                start_of_json = match.end()
                # end_index == the start of the next function call
                # (if exists)
                next_function_call_start = (
                    matches[i + 1].start() if i + 1 < len(matches) else None
                )

                raw_function_calls.append(
                    dec.raw_decode(
                        model_output[start_of_json:next_function_call_start]
                    )[0]
                )

            logger.debug("Extracted %d tool calls", len(raw_function_calls))
            tool_calls = [
                ToolCall(
                    type="function",
                    function=FunctionCall(
                        name=function_call["name"],
                        # function call args are JSON but as a string
                        arguments=json.dumps(
                            function_call["arguments"], ensure_ascii=False
                        ),
                    ),
                )
                for function_call in raw_function_calls
            ]

            content = model_output[: model_output.find(self.bot_token)]
            return ExtractedToolCallInformation(
                tools_called=True,
                tool_calls=tool_calls,
                content=content if content else None,
            )

        except Exception as e:
            # logger.exception keeps the traceback, like the sibling parsers.
            logger.exception("Error in extracting tool call from response %s", e)
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """Process one streamed chunk.

        The whole ``current_text`` is re-parsed on every call with a partial
        JSON parser. The result is compared with the state kept on the
        instance to decide whether to send the tool name, a fragment of the
        arguments, or nothing.

        Returns:
            The delta to stream, or ``None`` to skip this chunk (including
            when an exception occurred).
        """
        # Still a proper prefix of the marker: wait for more text.
        if len(current_text) < len(self.bot_token) and self.bot_token.startswith(
            current_text
        ):
            return None

        if not current_text.startswith(self.bot_token):
            return DeltaMessage(content=delta_text)

        # bit mask flags for partial JSON parsing. If the name hasn't been
        # sent yet, don't allow sending
        # an incomplete string since OpenAI only ever (as far as I have
        # seen) allows sending the entire tool/ function name at once.
        flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR
        try:
            tool_call_arr = []
            is_complete = []
            try:
                start_idx = len(self.bot_token)
                start_idx = consume_space(start_idx, current_text)

                while start_idx < len(current_text):
                    (obj, end_idx) = partial_json_loads(current_text[start_idx:], flags)
                    is_complete.append(
                        is_complete_json(current_text[start_idx : start_idx + end_idx])
                    )
                    start_idx += end_idx
                    start_idx = consume_space(start_idx, current_text)
                    # skip over the marker that introduces the next call
                    start_idx += len(self.bot_token)
                    start_idx = consume_space(start_idx, current_text)
                    tool_call_arr.append(obj)
            except partial_json_parser.core.exceptions.MalformedJSON:
                logger.debug("not enough tokens to parse into JSON yet")
                return None

            # case -- nothing parsed for any tool yet: stream nothing
            if not tool_call_arr:
                return None

            # select as the current tool call the one we're on the state at
            current_tool_call: dict = tool_call_arr[self.current_tool_id]
            delta = None

            # case: we are starting a new tool in the array
            # -> length has moved past cursor
            if len(tool_call_arr) > self.current_tool_id + 1:
                # if we're moving on to a new call, first make sure we
                # haven't missed anything in the previous one that was
                # auto-generated due to JSON completions, but wasn't
                # streamed to the client yet.
                if self.current_tool_id >= 0:
                    cur_arguments = current_tool_call.get("arguments")
                    if cur_arguments:
                        cur_args_json = json.dumps(cur_arguments, ensure_ascii=False)
                        sent = len(self.streamed_args_for_tool[self.current_tool_id])
                        argument_diff = cur_args_json[sent:]

                        logger.debug("got arguments diff: %s", argument_diff)
                        delta = self._arguments_delta(argument_diff)
                        self.streamed_args_for_tool[self.current_tool_id] += (
                            argument_diff
                        )
                # re-set stuff pertaining to progress in the current tool
                self.current_tool_id = len(tool_call_arr) - 1
                self.current_tool_name_sent = False
                self.streamed_args_for_tool.append("")
                logger.debug("starting on new tool %d", self.current_tool_id)
                return delta

            # if the current tool name hasn't been sent, send if available
            # - otherwise send nothing
            if not self.current_tool_name_sent:
                function_name = current_tool_call.get("name")
                if function_name:
                    delta = DeltaMessage(
                        tool_calls=[
                            DeltaToolCall(
                                index=self.current_tool_id,
                                type="function",
                                id=make_tool_call_id(),
                                function=DeltaFunctionCall(
                                    name=function_name
                                ).model_dump(exclude_none=True),
                            )
                        ]
                    )
                    self.current_tool_name_sent = True

            # now we know we're on the same tool call and we're streaming
            # arguments
            else:
                cur_arguments = current_tool_call.get("arguments")

                if cur_arguments:
                    sent = len(self.streamed_args_for_tool[self.current_tool_id])
                    cur_args_json = json.dumps(cur_arguments, ensure_ascii=False)
                    prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
                        "arguments"
                    )

                    argument_diff = None
                    if is_complete[self.current_tool_id]:
                        # JSON is final: flush everything not yet sent.
                        argument_diff = cur_args_json[sent:]
                    elif prev_arguments:
                        # Only the prefix shared by two consecutive parses is
                        # sent, since the rest may be parser auto-completion.
                        prev_args_json = json.dumps(prev_arguments, ensure_ascii=False)
                        if cur_args_json != prev_args_json:
                            prefix = find_common_prefix(prev_args_json, cur_args_json)
                            argument_diff = prefix[sent:]

                    if argument_diff is not None:
                        delta = self._arguments_delta(argument_diff)
                        self.streamed_args_for_tool[self.current_tool_id] += (
                            argument_diff
                        )

            self.prev_tool_call_arr = tool_call_arr
            return delta

        except Exception as e:
            logger.exception("Error trying to handle streaming tool call: %s", e)
            logger.debug(
                "Skipping chunk as a result of tool streaming extraction error"
            )
            return None
class GraniteToolParser(ToolParser):
    """
    Tool call parser for the granite 3.0 models. Intended
    for use with the examples/tool_chat_template_granite.jinja
    template.

    Used when --enable-auto-tool-choice --tool-call-parser granite
    are all set
    """

    def __init__(self, tokenizer: AnyTokenizer):
        """Set up the two optional prefixes that may precede the JSON array."""
        super().__init__(tokenizer)
        # for granite 3.0, the token `<|tool_call|>`
        self.bot_token = "<|tool_call|>"
        # for granite 3.1, the string `<tool_call>`
        # (an empty string here would make the prefix handling a no-op and
        # 3.1-style outputs would never be recognised as tool calls)
        self.bot_string = "<tool_call>"

    def extract_tool_calls(
        self, model_output: str, request: ChatCompletionRequest
    ) -> ExtractedToolCallInformation:
        """Parse a complete (non-streamed) model response into tool calls.

        The response is stripped of surrounding whitespace and of an optional
        leading ``bot_token`` / ``bot_string``. If what remains does not start
        with ``[`` the whole response is returned unchanged as plain content.
        Otherwise the remainder is decoded as a JSON list whose items provide
        ``name`` and ``arguments``.

        Any decoding problem is logged and the response is returned as plain
        content instead of raising.
        """
        stripped = (
            model_output.strip()
            .removeprefix(self.bot_token)
            .removeprefix(self.bot_string)
            .lstrip()
        )
        if not stripped or stripped[0] != "[":
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )
        try:
            raw_function_calls = json.loads(stripped)
            if not isinstance(raw_function_calls, list):
                # only a JSON array is accepted; handled by the except below
                raise TypeError(f"Expected list, got {type(raw_function_calls)}")

            logger.debug("Extracted %d tool calls", len(raw_function_calls))
            tool_calls = [
                ToolCall(
                    type="function",
                    function=FunctionCall(
                        name=function_call["name"],
                        # function call args are JSON but as a string
                        arguments=json.dumps(
                            function_call["arguments"], ensure_ascii=False
                        ),
                    ),
                )
                for function_call in raw_function_calls
            ]

            return ExtractedToolCallInformation(
                tools_called=True,
                tool_calls=tool_calls,
                content=None,
            )

        except Exception as e:
            logger.error("Error in extracting tool call from response %s", e)
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """Produce the next streaming delta for a partially generated response.

        ``current_text`` is re-parsed on every call with a partial JSON parser
        and compared against the state kept on ``self`` (``current_tool_id``,
        ``current_tool_name_sent``, ``streamed_args_for_tool``,
        ``prev_tool_call_arr``).

        Returns a content delta when the text is not a tool-call array, a
        tool-call delta when a new name or new argument text is available,
        and ``None`` when there is nothing to send for this chunk (including
        when an unexpected error occurs, which is logged).
        """
        # skip leading whitespace and the optional 3.0 / 3.1 prefixes
        start_idx = consume_space(0, current_text)
        if current_text[start_idx:].startswith(self.bot_token):
            start_idx = consume_space(start_idx + len(self.bot_token), current_text)
        if current_text[start_idx:].startswith(self.bot_string):
            start_idx = consume_space(start_idx + len(self.bot_string), current_text)
        if (
            not current_text
            or start_idx >= len(current_text)
            or current_text[start_idx] != "["
        ):
            return DeltaMessage(content=delta_text)

        # bit mask flags for partial JSON parsing. If the name hasn't been
        # sent yet, don't allow sending
        # an incomplete string since OpenAI only ever (as far as I have
        # seen) allows sending the entire tool/ function name at once.
        flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR
        try:
            tool_call_arr = None
            is_complete = None
            try:
                tool_calls, end_idx = partial_json_loads(
                    current_text[start_idx:], flags
                )
                if isinstance(tool_calls, list):
                    tool_call_arr = tool_calls
                else:
                    return DeltaMessage(content=delta_text)

                # every element but possibly the last is fully generated
                is_complete = [True] * len(tool_calls)
                if not is_complete_json(current_text[start_idx : start_idx + end_idx]):
                    is_complete[-1] = False
            except partial_json_parser.core.exceptions.MalformedJSON:
                logger.debug("not enough tokens to parse into JSON yet")
                return None

            # case -- if no tokens have been streamed for the tool, e.g.
            # only the array brackets, stream nothing
            if not tool_call_arr:
                return None

            # select as the current tool call the one we're on the state at
            current_tool_call: dict = tool_call_arr[self.current_tool_id]

            delta = None
            # case: we are starting a new tool in the array
            # -> array has > 0 length AND length has moved past cursor
            if len(tool_call_arr) > self.current_tool_id + 1:
                # if we're moving on to a new call, first make sure we
                # haven't missed anything in the previous one that was
                # auto-generated due to JSON completions, but wasn't
                # streamed to the client yet.
                if self.current_tool_id >= 0:
                    cur_arguments = current_tool_call.get("arguments")
                    if cur_arguments:
                        cur_args_json = json.dumps(cur_arguments, ensure_ascii=False)
                        sent = len(self.streamed_args_for_tool[self.current_tool_id])
                        argument_diff = cur_args_json[sent:]

                        logger.debug("got arguments diff: %s", argument_diff)
                        delta = DeltaMessage(
                            tool_calls=[
                                DeltaToolCall(
                                    index=self.current_tool_id,
                                    function=DeltaFunctionCall(
                                        arguments=argument_diff
                                    ).model_dump(exclude_none=True),
                                )
                            ]
                        )
                        self.streamed_args_for_tool[self.current_tool_id] += (
                            argument_diff
                        )

                # re-set stuff pertaining to progress in the current tool
                self.current_tool_id = len(tool_call_arr) - 1
                self.current_tool_name_sent = False
                self.streamed_args_for_tool.append("")
                logger.debug("starting on new tool %d", self.current_tool_id)
                return delta

            # if the current tool name hasn't been sent, send if available
            # - otherwise send nothing
            elif not self.current_tool_name_sent:
                function_name = current_tool_call.get("name")
                if function_name:
                    delta = DeltaMessage(
                        tool_calls=[
                            DeltaToolCall(
                                index=self.current_tool_id,
                                type="function",
                                id=make_tool_call_id(),
                                function=DeltaFunctionCall(
                                    name=function_name
                                ).model_dump(exclude_none=True),
                            )
                        ]
                    )
                    self.current_tool_name_sent = True

            # now we know we're on the same tool call and we're streaming
            # arguments
            else:
                cur_arguments = current_tool_call.get("arguments")

                if cur_arguments:
                    sent = len(self.streamed_args_for_tool[self.current_tool_id])
                    cur_args_json = json.dumps(cur_arguments, ensure_ascii=False)
                    prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
                        "arguments"
                    )

                    argument_diff = None
                    if is_complete[self.current_tool_id]:
                        # call is finished: flush everything not yet sent
                        argument_diff = cur_args_json[sent:]
                    elif prev_arguments:
                        # call still growing: only send the part that is
                        # stable between the previous and current parse
                        prev_args_json = json.dumps(prev_arguments, ensure_ascii=False)
                        if cur_args_json != prev_args_json:
                            prefix = find_common_prefix(prev_args_json, cur_args_json)
                            argument_diff = prefix[sent:]

                    if argument_diff is not None:
                        delta = DeltaMessage(
                            tool_calls=[
                                DeltaToolCall(
                                    index=self.current_tool_id,
                                    function=DeltaFunctionCall(
                                        arguments=argument_diff
                                    ).model_dump(exclude_none=True),
                                )
                            ]
                        )
                        self.streamed_args_for_tool[self.current_tool_id] += (
                            argument_diff
                        )

            self.prev_tool_call_arr = tool_call_arr
            return delta

        except Exception as e:
            logger.error("Error trying to handle streaming tool call: %s", e)
            logger.debug(
                "Skipping chunk as a result of tool streaming extraction error"
            )
            return None
class Hermes2ProToolParser(ToolParser):
    """Tool call parser for Hermes-style ``<tool_call>{json}</tool_call>`` output.

    Each tool call is a JSON object with ``name`` and ``arguments`` wrapped in
    a ``<tool_call>`` / ``</tool_call>`` pair. Both complete responses and
    incrementally streamed responses are supported.
    """

    def __init__(self, tokenizer: AnyTokenizer):
        """Initialise streaming state and pre-tokenise the tool-call tags.

        Raises:
            ValueError: if no tokenizer is available on the parser.
        """
        super().__init__(tokenizer)

        if isinstance(self.model_tokenizer, MistralTokenizer):
            logger.error("Detected Mistral tokenizer when using a Hermes model")
            self.model_tokenizer = self.model_tokenizer.tokenizer

        self.current_tool_name_sent: bool = False
        self.prev_tool_call_arr: list[dict] = []
        self.current_tool_id: int = -1
        # what has been streamed so far for each tool, indexed by tool id
        self.streamed_args_for_tool: list[str] = []

        # The tags must be non-empty: with "" every `in` test is true and the
        # regexes below would match arbitrary text.
        self.tool_call_start_token: str = "<tool_call>"
        self.tool_call_end_token: str = "</tool_call>"

        # group 1: a closed call; group 2: an unterminated call running to
        # the end of the string
        self.tool_call_regex = re.compile(
            r"<tool_call>(.*?)</tool_call>|<tool_call>(.*)", re.DOTALL
        )
        self.scratch_pad_regex = re.compile(
            r"<scratch_pad>(.*?)</scratch_pad>", re.DOTALL
        )

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ToolParser "
                "constructor during construction."
            )
        self.tool_call_start_token_ids = self.model_tokenizer.encode(
            self.tool_call_start_token, add_special_tokens=False
        )
        self.tool_call_end_token_ids = self.model_tokenizer.encode(
            self.tool_call_end_token, add_special_tokens=False
        )

        # textual pieces the tags are split into by the tokenizer
        self.tool_call_start_token_array = [
            self.model_tokenizer.decode([token_id])
            for token_id in self.tool_call_start_token_ids
        ]

        self.tool_call_end_token_array = [
            self.model_tokenizer.decode([token_id])
            for token_id in self.tool_call_end_token_ids
        ]

        self.buffered_delta_text = ""

    # Very simple idea: when encountering tokens like <, tool, _call, >,
    # <, /, tool, _call, >, store them in a buffer.
    # When the last token is encountered, empty the buffer and return it.
    # If a token appears in an incorrect sequence while storing in the buffer,
    # return the preceding buffer along with the token.
    def tool_call_delta_buffer(self, delta_text: str):
        """Buffer tag fragments so a tag is only ever released as a whole.

        Returns ``""`` while a tag fragment is being held back, otherwise the
        buffered text (if any) followed by ``delta_text``.
        """
        is_tag_piece = (
            delta_text in self.tool_call_start_token_array
            or delta_text in self.tool_call_end_token_array
        )
        is_last_piece = (
            delta_text == self.tool_call_start_token_array[-1]
            or delta_text == self.tool_call_end_token_array[-1]
        )
        if is_tag_piece and not is_last_piece:
            # tag not complete yet: keep the fragment and emit nothing
            self.buffered_delta_text = self.buffered_delta_text + delta_text
            return ""

        # either the tag just completed or this is ordinary text: flush
        buffered_text = self.buffered_delta_text
        self.buffered_delta_text = ""
        return buffered_text + delta_text

    def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
        """Keep special tokens in the output when tools are in use."""
        request = super().adjust_request(request)
        if request.tools and request.tool_choice != "none":
            # do not skip special tokens because the tool_call tokens are
            # marked "special" in some models. Since they are skipped
            # prior to the call to the tool parser, it breaks tool calling.
            request.skip_special_tokens = False
        return request

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """Parse a complete model response into tool calls.

        Text before the first start tag is returned as ``content`` (``None``
        when empty). If no start tag is present, or any call fails to decode,
        the whole response is returned as plain content.
        """
        # sanity check; avoid unnecessary processing
        if self.tool_call_start_token not in model_output:
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

        try:
            # there are two possible captures - between tags, or between a
            # tag and end-of-string so the result of
            # findall is an array of tuples where one is a function call and
            # the other is empty
            function_call_tuples = self.tool_call_regex.findall(model_output)

            # load the JSON, and then use it to build the Function and
            # Tool Call
            raw_function_calls = [
                json.loads(match[0] if match[0] else match[1])
                for match in function_call_tuples
            ]
            tool_calls = [
                ToolCall(
                    type="function",
                    function=FunctionCall(
                        name=function_call["name"],
                        # function call args are JSON but as a string
                        arguments=json.dumps(
                            function_call["arguments"], ensure_ascii=False
                        ),
                    ),
                )
                for function_call in raw_function_calls
            ]

            content = model_output[: model_output.find(self.tool_call_start_token)]
            return ExtractedToolCallInformation(
                tools_called=True,
                tool_calls=tool_calls,
                content=content if content else None,
            )

        except Exception:
            logger.exception("Error in extracting tool call from response.")
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """Produce the next streaming delta for a partially generated response.

        Position within the output is determined by counting start / end tags
        in the previous and current text. Returns a content delta for plain
        text, a tool-call delta when a name or new argument text can be sent,
        and ``None`` when the chunk should be skipped (including on an
        unexpected error, which is logged).
        """
        # 1. All tokens are parsed based on _text, not token_ids.
        # 2. All incoming text data is processed by the tool_call_delta_buffer
        #    function for buffering before being used for parsing.

        delta_text = self.tool_call_delta_buffer(delta_text)
        # If the last characters of previous_text
        # match self.buffered_delta_text, remove only the matching part.
        if (
            len(previous_text) >= len(self.buffered_delta_text)
            and previous_text[-len(self.buffered_delta_text) :]
            == self.buffered_delta_text
        ):
            previous_text = previous_text[: -len(self.buffered_delta_text)]
            current_text = previous_text + delta_text

        logger.debug("delta_text: %s", delta_text)
        logger.debug("delta_token_ids: %s", delta_token_ids)
        # check to see if we should be streaming a tool call - is there a
        # <tool_call> tag in the text so far?
        if self.tool_call_start_token not in current_text:
            logger.debug("No tool call tokens found!")
            return DeltaMessage(content=delta_text)

        try:
            # figure out where we are in the parsing by counting tool call
            # start & end tags
            prev_tool_start_count = previous_text.count(self.tool_call_start_token)
            prev_tool_end_count = previous_text.count(self.tool_call_end_token)
            cur_tool_start_count = current_text.count(self.tool_call_start_token)
            cur_tool_end_count = current_text.count(self.tool_call_end_token)
            tool_call_portion = None
            text_portion = None

            # case: if we're generating text, OR rounding out a tool call
            if (
                cur_tool_start_count == cur_tool_end_count
                and prev_tool_end_count == cur_tool_end_count
                and self.tool_call_end_token not in delta_text
            ):
                logger.debug("Generating text content! skipping tool parsing.")
                return DeltaMessage(content=delta_text)

            if self.tool_call_end_token in delta_text:
                logger.debug("tool_call_end_token in delta_text")
                full_text = current_text + delta_text
                tool_call_portion = (
                    full_text.split(self.tool_call_start_token)[-1]
                    .split(self.tool_call_end_token)[0]
                    .rstrip()
                )
                delta_text = delta_text.split(self.tool_call_end_token)[0].rstrip()
                text_portion = delta_text.split(self.tool_call_end_token)[-1].lstrip()

            # case: if tool open & close tag counts don't match, we're doing
            # imaginary "else" block here
            # something with tools with this diff.
            # flags for partial JSON parsing. exported constants from
            # "Allow" are handled via BIT MASK
            flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR

            # case -- we're starting a new tool call
            if (
                cur_tool_start_count > cur_tool_end_count
                and cur_tool_start_count > prev_tool_start_count
            ):
                if len(delta_token_ids) > 1:
                    tool_call_portion = current_text.split(self.tool_call_start_token)[
                        -1
                    ]
                else:
                    tool_call_portion = None

                text_portion = None

                # set cursors and state appropriately
                self.current_tool_id += 1
                self.current_tool_name_sent = False
                self.streamed_args_for_tool.append("")
                logger.debug("Starting on a new tool %s", self.current_tool_id)

            # case -- we're updating an existing tool call
            elif (
                cur_tool_start_count > cur_tool_end_count
                and cur_tool_start_count == prev_tool_start_count
            ):
                # get the portion of the text that's the tool call
                tool_call_portion = current_text.split(self.tool_call_start_token)[-1]
                text_portion = None

            # case -- the current tool call is being closed.
            elif (
                cur_tool_start_count == cur_tool_end_count
                and cur_tool_end_count >= prev_tool_end_count
            ):
                if not self.prev_tool_call_arr:
                    logger.debug("attempting to close tool call, but no tool call")
                    return None
                # Only flush when the call being closed had arguments. The
                # text to flush comes from delta_text itself (a previous
                # `diff is str` transformation here was dead code: the
                # comparison against the type object was always false and
                # its result was overwritten).
                if self.prev_tool_call_arr[self.current_tool_id].get("arguments"):
                    if '"}' not in delta_text:
                        return None
                    end_loc = delta_text.rindex('"}')
                    diff = delta_text[:end_loc] + '"}'
                    logger.debug(
                        "Finishing tool and found diff that had not "
                        "been streamed yet: %s",
                        diff,
                    )
                    self.streamed_args_for_tool[self.current_tool_id] += diff
                    return DeltaMessage(
                        tool_calls=[
                            DeltaToolCall(
                                index=self.current_tool_id,
                                function=DeltaFunctionCall(arguments=diff).model_dump(
                                    exclude_none=True
                                ),
                            )
                        ]
                    )

            # case -- otherwise we're just generating text
            else:
                text = delta_text.replace(self.tool_call_start_token, "")
                text = text.replace(self.tool_call_end_token, "")
                return DeltaMessage(tool_calls=[], content=text)

            try:
                current_tool_call = (
                    partial_json_parser.loads(tool_call_portion or "{}", flags)
                    if tool_call_portion
                    else None
                )
                logger.debug("Parsed tool call %s", current_tool_call)
            except partial_json_parser.core.exceptions.MalformedJSON:
                logger.debug("not enough tokens to parse into JSON yet")
                return None
            except json.decoder.JSONDecodeError:
                logger.debug("unable to parse JSON")
                return None

            # case - we haven't sent the tool name yet. If it's available, send
            # it. otherwise, wait until it's available.
            if not self.current_tool_name_sent:
                if current_tool_call is None:
                    return None
                function_name: str | None = current_tool_call.get("name")
                if not function_name:
                    return None
                self.current_tool_name_sent = True
                return DeltaMessage(
                    tool_calls=[
                        DeltaToolCall(
                            index=self.current_tool_id,
                            type="function",
                            id=make_tool_call_id(),
                            function=DeltaFunctionCall(
                                name=function_name
                            ).model_dump(exclude_none=True),
                        )
                    ]
                )
            # case -- otherwise, send the tool call delta

            # if the tool call portion is None, send the delta as text
            if tool_call_portion is None:
                # if there's text but not tool calls, send that -
                # otherwise None to skip chunk
                return (
                    DeltaMessage(content=delta_text)
                    if text_portion is not None
                    else None
                )

            # now, the nitty-gritty of tool calls
            # now we have the portion to parse as tool call.

            logger.debug(
                "Trying to parse current tool call with ID %s", self.current_tool_id
            )

            # if we're starting a new tool call, push an empty object in as
            # a placeholder for the arguments
            if len(self.prev_tool_call_arr) <= self.current_tool_id:
                self.prev_tool_call_arr.append({})

            # main logic for tool parsing here - compare prev. partially-parsed
            # JSON to the current partially-parsed JSON
            prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
                "arguments"
            )
            cur_arguments = current_tool_call.get("arguments")

            logger.debug("diffing old arguments: %s", prev_arguments)
            logger.debug("against new ones: %s", cur_arguments)

            # case -- no arguments have been created yet. skip sending a delta.
            if not cur_arguments and not prev_arguments:
                logger.debug("Skipping text %s - no arguments", delta_text)
                delta = None

            # case -- prev arguments are defined, but none are now.
            # probably impossible, but not a fatal error - just keep going
            elif not cur_arguments and prev_arguments:
                logger.error(
                    "should be impossible to have arguments reset "
                    "mid-call. skipping streaming anything."
                )
                delta = None

            # case -- we now have the first info about arguments available from
            # autocompleting the JSON
            elif cur_arguments and not prev_arguments:
                # extract the content after {"name": ..., "arguments":
                # directly from tool_call_portion as cur_arguments_json,
                # since cur_arguments may differ from the original text
                # due to partial JSON parsing
                # for example, tool_call_portion =
                # {"name": "search", "arguments": {"search_request": {"
                # but cur_arguments =
                # {"search_request": {}}
                function_name = current_tool_call.get("name")
                match = re.search(
                    r'\{"name":\s*"'
                    + re.escape(function_name)
                    + r'"\s*,\s*"arguments":\s*(.*)',
                    tool_call_portion.strip(),
                    re.DOTALL,
                )
                if match:
                    cur_arguments_json = match.group(1)
                else:
                    cur_arguments_json = json.dumps(cur_arguments, ensure_ascii=False)

                logger.debug("finding %s in %s", delta_text, cur_arguments_json)

                # get the location where previous args differ from current.
                if delta_text not in cur_arguments_json:
                    return None
                args_delta_start_loc = cur_arguments_json.rindex(delta_text) + len(
                    delta_text
                )

                # use that to find the actual delta
                arguments_delta = cur_arguments_json[:args_delta_start_loc]
                logger.debug("First tokens in arguments received: %s", arguments_delta)

                delta = DeltaMessage(
                    tool_calls=[
                        DeltaToolCall(
                            index=self.current_tool_id,
                            function=DeltaFunctionCall(
                                arguments=arguments_delta
                            ).model_dump(exclude_none=True),
                        )
                    ]
                )
                self.streamed_args_for_tool[self.current_tool_id] += arguments_delta

            # last case -- we have an update to existing arguments.
            elif cur_arguments and prev_arguments:
                # judge whether the tool_call_portion is a complete JSON
                try:
                    json.loads(tool_call_portion)
                    is_complete_json = True
                except Exception:
                    is_complete_json = False

                # if the delta_text ends with a '}' and tool_call_portion is a
                # complete JSON, then the last '}' does not belong to the
                # arguments, so we should trim it off
                if (
                    isinstance(delta_text, str)
                    and len(delta_text.rstrip()) >= 1
                    and delta_text.rstrip()[-1] == "}"
                    and is_complete_json
                ):
                    delta_text = delta_text.rstrip()[:-1]

                logger.debug("got diff %s", delta_text)

                delta = DeltaMessage(
                    tool_calls=[
                        DeltaToolCall(
                            index=self.current_tool_id,
                            function=DeltaFunctionCall(arguments=delta_text).model_dump(
                                exclude_none=True
                            ),
                        )
                    ]
                )
                self.streamed_args_for_tool[self.current_tool_id] += delta_text

            # handle saving the state for the current tool into
            # the "prev" list for use in diffing for the next iteration
            if self.current_tool_id == len(self.prev_tool_call_arr) - 1:
                self.prev_tool_call_arr[self.current_tool_id] = current_tool_call
            else:
                self.prev_tool_call_arr.append(current_tool_call)

            return delta

        except Exception:
            logger.exception("Error trying to handle streaming tool call.")
            return None  # do not stream a delta. skip this token ID.
class HunyuanA13BToolParser(ToolParser):
    """Tool call parser for Hunyuan A13B output.

    Tool calls are expected as a JSON array of ``{"name", "arguments"}``
    objects wrapped in ``<tool_calls>`` / ``</tool_calls>``. Sections that
    lie inside a ``<think>`` region are ignored.
    """

    def __init__(self, tokenizer: AnyTokenizer):
        """Initialise streaming state and the regexes used for parsing."""
        super().__init__(tokenizer)

        # Initialize state for streaming mode
        self.prev_tool_calls: list[dict] = []
        self.current_tool_id = -1
        self.current_tool_name_sent = False
        self.streamed_args: list[str] = []  # Track arguments sent for each tool

        # For backward compatibility with tests
        self.current_tools_sent: list[bool] = []

        # For backward compatibility with serving code
        self.prev_tool_call_arr: list[dict] = []

        # Regex patterns for preprocessing.
        # The tags must be present: a bare `([\s\S]*?)` only ever matches the
        # empty string, so no tool-call section would be found.
        self.answer_tool_calls_pattern = re.compile(
            r"<tool_calls>([\s\S]*?)</tool_calls>", re.DOTALL
        )

        self.tool_name_reg = re.compile(r'"name"\s*:\s*"([^"]+)"')

        self.tool_empty_arg_reg = re.compile(
            r'"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:\s*\{\s*\}'
        )

        # TODO: not support nested json object in fc arguments.
        self.tool_non_empty_arg_reg = re.compile(
            r'"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:\s*(\{(?:[^{}]|(?:\{[^{}]*\}))*\})'
        )

        self.bot_string = "<tool_calls>"

        # Define streaming state type to be initialized later
        self.streaming_state: dict[str, Any] = {
            "current_tool_index": -1,
            "tool_ids": [],
            "sent_tools": [],
        }

    def preprocess_model_output(
        self, model_output: str
    ) -> tuple[str | None, str | None]:
        """Split a response into leading content and the tool-call JSON text.

        Returns ``(content, tool_calls_json)`` for the first tool-call section
        that is not inside a think region and whose body is valid JSON;
        otherwise ``(model_output, None)``.
        """
        # think regions do not depend on the tool-call match: find them once
        think_regions = [
            (m.start(), m.end())
            for m in re.finditer(
                r"<think>(.*?)</think>", model_output, flags=re.DOTALL
            )
        ]
        # find the location tool call
        for match in self.answer_tool_calls_pattern.finditer(model_output):
            start, end = match.span()
            # check whether tool_calls is inside of <think>
            in_think = any(
                start > t_start and end < t_end for t_start, t_end in think_regions
            )
            if in_think:
                continue
            content = model_output[:start]
            tool_calls_content = match.group(1).strip()
            try:
                json.loads(tool_calls_content)
            except Exception:
                # not valid JSON: try the next candidate section
                continue
            return content, tool_calls_content
        return model_output, None

    def extract_tool_calls(
        self, model_output: str, request: ChatCompletionRequest
    ) -> ExtractedToolCallInformation:
        """
        Extract tool calls from a complete model output.

        Items of the tool-call array that are not dicts, or that lack
        ``name`` or ``arguments``, are skipped. On any unexpected error the
        whole output is returned as plain content.
        """
        try:
            # Preprocess the model output
            content, potential_tool_calls = self.preprocess_model_output(model_output)

            if not potential_tool_calls:
                # some text should be filtered out for no function call
                # this text is in a13b's chat template.
                if content:
                    content = content.replace("助手:", "", 1)
                return ExtractedToolCallInformation(
                    tools_called=False, tool_calls=[], content=content
                )

            # Parse the potential tool calls as JSON
            tool_calls_data = json.loads(potential_tool_calls)

            # Ensure it's an array
            if not isinstance(tool_calls_data, list):
                logger.debug("Tool calls data is not an array")
                return ExtractedToolCallInformation(
                    tools_called=False,
                    tool_calls=[],
                    content=content or model_output,
                )

            tool_calls: list[ToolCall] = []

            for call in tool_calls_data:
                if (
                    not isinstance(call, dict)
                    or "name" not in call
                    or "arguments" not in call
                ):
                    continue

                tool_call = ToolCall(
                    id=f"call_{random_uuid()}",
                    type="function",
                    function=FunctionCall(
                        name=call["name"],
                        # dict arguments are serialised; anything else is
                        # passed through unchanged
                        arguments=(
                            json.dumps(call["arguments"])
                            if isinstance(call["arguments"], dict)
                            else call["arguments"]
                        ),
                    ),
                )
                tool_calls.append(tool_call)

            if not content or len(content.strip()) == 0:
                # clear the whitespace content.
                content = None

            return ExtractedToolCallInformation(
                tools_called=len(tool_calls) > 0,
                tool_calls=tool_calls,
                content=content,
            )

        except Exception:
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """
        Extract tool calls for streaming mode.

        Returns a content delta when the text (after optional whitespace and
        ``bot_string``) does not start with ``[``; otherwise a tool-name or
        tool-argument delta when one is available, else ``None``.
        """

        start_idx = consume_space(0, current_text)
        if current_text[start_idx:].startswith(self.bot_string):
            start_idx = consume_space(start_idx + len(self.bot_string), current_text)
        if (
            not current_text
            or start_idx >= len(current_text)
            or current_text[start_idx] != "["
        ):
            return DeltaMessage(content=delta_text)

        self._try_parse_json_tools(current_text[start_idx:])

        test_delta = self._handle_test_compatibility(current_text)
        if test_delta:
            return test_delta

        name_matches = list(self.tool_name_reg.finditer(current_text))
        tool_count = len(name_matches)
        if tool_count == 0:
            return None
        self._ensure_state_arrays(tool_count)
        current_idx = self.streaming_state["current_tool_index"]

        name_delta = self._handle_tool_name_streaming(
            current_idx, tool_count, name_matches
        )
        if name_delta:
            return name_delta

        args_delta = self._handle_tool_args_streaming(
            current_text, current_idx, tool_count
        )
        if args_delta:
            return args_delta

        return None

    def _try_parse_json_tools(self, current_text: str):
        """Record the parsed array in ``prev_tool_call_arr`` once it is valid JSON."""
        try:
            parsed_tools = json.loads(current_text)
            if isinstance(parsed_tools, list):
                self.prev_tool_call_arr = parsed_tools
        except json.JSONDecodeError:
            # text is still incomplete; keep the previous value
            pass

    def _handle_test_compatibility(self, current_text: str):
        """Emit the first tool name when ``current_tools_sent`` is ``[False]``.

        Returns the name delta, or ``None`` when this path does not apply or
        no name is present in the text yet.
        """
        if len(self.current_tools_sent) != 1 or self.current_tools_sent[0] is not False:
            return None

        name_match = self.tool_name_reg.search(current_text)
        if not name_match:
            return None

        function_name = name_match.group(1)
        tool_id = f"chatcmpl-tool-{random_uuid()}"
        delta = DeltaMessage(
            tool_calls=[
                DeltaToolCall(
                    index=0,
                    type="function",
                    id=tool_id,
                    function=DeltaFunctionCall(
                        name=function_name
                    ).model_dump(exclude_none=True),
                )
            ]
        )
        self.current_tools_sent = [True]
        self.current_tool_id = 0
        self.streaming_state["current_tool_index"] = 0
        if len(self.streaming_state["sent_tools"]) == 0:
            self.streaming_state["sent_tools"].append(
                {
                    "sent_name": True,
                    "sent_arguments_prefix": False,
                    "sent_arguments": "",
                }
            )
        else:
            self.streaming_state["sent_tools"][0]["sent_name"] = True
        self.current_tool_name_sent = True
        return delta

    def _ensure_state_arrays(self, tool_count: int):
        """Grow the per-tool state lists so they have ``tool_count`` entries."""
        while len(self.streaming_state["sent_tools"]) < tool_count:
            self.streaming_state["sent_tools"].append(
                {
                    "sent_name": False,
                    "sent_arguments_prefix": False,
                    "sent_arguments": "",
                }
            )
        while len(self.streaming_state["tool_ids"]) < tool_count:
            self.streaming_state["tool_ids"].append(None)

    def _handle_tool_name_streaming(
        self, current_idx: int, tool_count: int, name_matches
    ):
        """Advance to the next tool and emit its name if not sent yet.

        Returns the name delta, or ``None`` when there is no new tool whose
        name is still unsent.
        """
        if current_idx == -1 or current_idx < tool_count - 1:
            next_idx = current_idx + 1
            if (
                next_idx < tool_count
                and not self.streaming_state["sent_tools"][next_idx]["sent_name"]
            ):
                self.streaming_state["current_tool_index"] = next_idx
                self.current_tool_id = next_idx
                current_idx = next_idx
                tool_name = name_matches[current_idx].group(1)
                tool_id = f"call_{current_idx}_{random_uuid()}"
                self.streaming_state["tool_ids"][current_idx] = tool_id
                delta = DeltaMessage(
                    tool_calls=[
                        DeltaToolCall(
                            index=current_idx,
                            type="function",
                            id=tool_id,
                            function=DeltaFunctionCall(name=tool_name).model_dump(
                                exclude_none=True
                            ),
                        )
                    ]
                )
                self.streaming_state["sent_tools"][current_idx]["sent_name"] = True
                self.current_tool_name_sent = True
                while len(self.streamed_args) <= current_idx:
                    self.streamed_args.append("")
                return delta
        return None

    def _handle_tool_args_streaming(
        self, current_text: str, current_idx: int, tool_count: int
    ):
        """Emit the next piece of argument text for the current tool.

        Handles, in order: a tool with empty ``{}`` arguments, the opening
        ``{`` of non-empty arguments, and the not-yet-sent remainder of the
        arguments. Moves the cursor to the next tool once the current one is
        fully sent. Returns ``None`` when there is nothing new to send.
        """
        if current_idx < 0 or current_idx >= tool_count:
            return None

        tool_state = self.streaming_state["sent_tools"][current_idx]

        empty_args_match = self.tool_empty_arg_reg.search(current_text)
        if (
            empty_args_match
            and empty_args_match.start() > 0
            and not tool_state["sent_arguments_prefix"]
        ):
            tool_state["sent_arguments_prefix"] = True
            tool_state["sent_arguments"] = "{}"
            while len(self.streamed_args) <= current_idx:
                self.streamed_args.append("")
            self.streamed_args[current_idx] += "{}"
            delta = DeltaMessage(
                tool_calls=[
                    DeltaToolCall(
                        index=current_idx,
                        function=DeltaFunctionCall(
                            arguments="{}"
                        ).model_dump(exclude_none=True),
                    )
                ]
            )
            if current_idx < tool_count - 1:
                self.streaming_state["current_tool_index"] += 1
                self.current_tool_id = self.streaming_state["current_tool_index"]
            return delta

        args_matches = list(self.tool_non_empty_arg_reg.finditer(current_text))
        if current_idx >= len(args_matches):
            return None

        args_text = args_matches[current_idx].group(1)
        is_last_tool = current_idx == tool_count - 1
        if not is_last_tool:
            # cut the arguments at the boundary to the next tool object
            next_tool_pos = current_text.find(
                "},{", args_matches[current_idx].start()
            )
            if next_tool_pos != -1:
                args_end_pos = next_tool_pos + 1
                args_text = (
                    current_text[
                        args_matches[current_idx].start() : args_end_pos
                    ]
                    .split('"arguments":')[1]
                    .strip()
                )
        sent_args = tool_state["sent_arguments"]
        if not tool_state["sent_arguments_prefix"] and args_text.startswith("{"):
            tool_state["sent_arguments_prefix"] = True
            tool_state["sent_arguments"] = "{"
            while len(self.streamed_args) <= current_idx:
                self.streamed_args.append("")
            self.streamed_args[current_idx] += "{"
            return DeltaMessage(
                tool_calls=[
                    DeltaToolCall(
                        index=current_idx,
                        function=DeltaFunctionCall(arguments="{").model_dump(
                            exclude_none=True
                        ),
                    )
                ]
            )

        if args_text.startswith(sent_args):
            args_diff = args_text[len(sent_args) :]
            if args_diff:
                tool_state["sent_arguments"] = args_text
                while len(self.streamed_args) <= current_idx:
                    self.streamed_args.append("")
                self.streamed_args[current_idx] += args_diff
                return DeltaMessage(
                    tool_calls=[
                        DeltaToolCall(
                            index=current_idx,
                            function=DeltaFunctionCall(
                                arguments=args_diff
                            ).model_dump(exclude_none=True),
                        )
                    ]
                )

        # everything for this tool has been sent: move on to the next one
        if args_text.endswith("}") and args_text == sent_args:
            if current_idx < tool_count - 1:
                self.streaming_state["current_tool_index"] += 1
                self.current_tool_id = self.streaming_state["current_tool_index"]
        return None
use the special + # tokens to indicate the start and end of the tool calls + # information. + request.skip_special_tokens = False + return request + + def get_arguments(self, obj): + if "parameters" in obj: + return obj.get("parameters") + elif "arguments" in obj: + return obj.get("arguments") + return None + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> DeltaMessage | None: + if "<|action_start|>" not in current_text: + self.position = len(current_text) + return DeltaMessage(content=delta_text) + # if the tool call is sent, return an empty delta message + # to make sure the finish_reason will be sent correctly. + if self.current_tool_id > 0: + return DeltaMessage(content="") + + last_pos = self.position + if "<|action_start|><|plugin|>" not in current_text[last_pos:]: + return None + + new_delta = current_text[last_pos:] + text, action = new_delta.split("<|action_start|><|plugin|>") + + if len(text) > 0: + self.position = self.position + len(text) + return DeltaMessage(content=text) + + action = action.strip() + action = action.split("<|action_end|>".strip())[0] + + # bit mask flags for partial JSON parsing. If the name hasn't been + # sent yet, don't allow sending + # an incomplete string since OpenAI only ever (as far as I have + # seen) allows sending the entire tool/ function name at once. 
+ flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR + + try: + parsable_arr = action + + # tool calls are generated in an object in internlm2 + # it's not support parallel tool calls + try: + tool_call_arr: dict = partial_json_parser.loads(parsable_arr, flags) + except partial_json_parser.core.exceptions.MalformedJSON: + logger.debug("not enough tokens to parse into JSON yet") + return None + + # if the current tool name hasn't been sent, send if available + # - otherwise send nothing + if not self.current_tool_name_sent: + function_name = tool_call_arr.get("name") + if function_name: + self.current_tool_id = self.current_tool_id + 1 + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + type="function", + id=make_tool_call_id(), + function=DeltaFunctionCall( + name=function_name + ).model_dump(exclude_none=True), + ) + ] + ) + self.current_tool_name_sent = True + self.streamed_args_for_tool.append("") + else: + delta = None + # now we know we're on the same tool call and we're streaming + # arguments + else: + prev_arguments = self.get_arguments( + self.prev_tool_call_arr[self.current_tool_id] + ) + cur_arguments = self.get_arguments(tool_call_arr) + + # not arguments generated + if not cur_arguments and not prev_arguments: + delta = None + # will never happen + elif not cur_arguments and prev_arguments: + logger.error( + "INVARIANT - impossible to have arguments reset mid-arguments" + ) + delta = None + # first time to get parameters + elif cur_arguments and not prev_arguments: + cur_arguments_json = json.dumps(cur_arguments, ensure_ascii=False) + + arguments_delta = cur_arguments_json[ + : cur_arguments_json.index(delta_text) + len(delta_text) + ] + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=arguments_delta + ).model_dump(exclude_none=True), + ) + ] + ) + self.streamed_args_for_tool[self.current_tool_id] += 
arguments_delta + # both prev and cur parameters, send the increase parameters + elif cur_arguments and prev_arguments: + cur_args_json = json.dumps(cur_arguments, ensure_ascii=False) + prev_args_json = json.dumps(prev_arguments, ensure_ascii=False) + + argument_diff = extract_intermediate_diff( + cur_args_json, prev_args_json + ) + + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=argument_diff + ).model_dump(exclude_none=True), + ) + ] + ) + self.streamed_args_for_tool[self.current_tool_id] += argument_diff + + # check to see if the name is defined and has been sent. if so, + # stream the name - otherwise keep waiting + # finish by setting old and returning None as base case + tool_call_arr["arguments"] = self.get_arguments(tool_call_arr) + self.prev_tool_call_arr = [tool_call_arr] + return delta + except Exception: + logger.exception("Error trying to handle streaming tool call.") + logger.debug( + "Skipping chunk as a result of tool streaming extraction error" + ) + return None + + def extract_tool_calls( + self, + model_output: str, + request: ChatCompletionRequest, + ) -> ExtractedToolCallInformation: + text = model_output + tools = request.tools + if "<|action_start|><|plugin|>" in text: + text, action = text.split("<|action_start|><|plugin|>") + action = action.split("<|action_end|>".strip())[0] + action = action[action.find("{") :] + action_dict = json.loads(action) + name, parameters = ( + action_dict["name"], + json.dumps( + action_dict.get("parameters", action_dict.get("arguments", {})), + ensure_ascii=False, + ), + ) + + if not tools or name not in [t.function.name for t in tools]: + return ExtractedToolCallInformation( + tools_called=False, tool_calls=[], content=text + ) + + tool_calls = [ + ToolCall(function=FunctionCall(name=name, arguments=parameters)) + ] + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=tool_calls, + content=text if len(text) > 0 else
None, + ) + + return ExtractedToolCallInformation( + tools_called=False, tool_calls=[], content=text + ) diff --git a/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/entrypoints/openai/tool_parsers/jamba_tool_parser.py new file mode 100644 index 0000000..6f53dde --- /dev/null +++ b/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -0,0 +1,323 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json +from collections.abc import Sequence + +import partial_json_parser +import regex as re +from partial_json_parser.core.options import Allow + +from vllm.entrypoints.chat_utils import make_tool_call_id +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + DeltaFunctionCall, + DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, + ToolCall, +) +from vllm.entrypoints.openai.tool_parsers import ToolParser +from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.transformers_utils.tokenizers import MistralTokenizer + +logger = init_logger(__name__) + + +class JambaToolParser(ToolParser): + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + if isinstance(self.model_tokenizer, MistralTokenizer): + raise ValueError( + "Detected a MistralTokenizer tokenizer when using a Jamba model" + ) + + self.current_tool_name_sent: bool = False + self.prev_tool_call_arr: list[dict] = [] + self.current_tool_id: int = -1 + self.streamed_args_for_tool: list[ + str + ] = [] # map what has been streamed for each tool so far to a list + + self.tool_calls_start_token: str = "<tool_calls>" + self.tool_calls_end_token: str = "</tool_calls>" + + self.tool_calls_regex = re.compile( + rf"{self.tool_calls_start_token}(.*?){self.tool_calls_end_token}", re.DOTALL + ) + + if not self.model_tokenizer: + raise ValueError( + "The model tokenizer must
be passed to the ToolParser " + "constructor during construction." + ) + self.tool_calls_start_token_id = self.vocab.get(self.tool_calls_start_token) + self.tool_calls_end_token_id = self.vocab.get(self.tool_calls_end_token) + if ( + self.tool_calls_start_token_id is None + or self.tool_calls_end_token_id is None + ): + raise RuntimeError( + "Jamba Tool parser could not locate tool calls start/end " + "tokens in the tokenizer!" + ) + + def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest: + request = super().adjust_request(request) + if request.tools and request.tool_choice != "none": + # do not skip special tokens because jamba use the special + # tokens to indicate the start and end of the tool calls + # information. + request.skip_special_tokens = False + return request + + def extract_tool_calls( + self, model_output: str, request: ChatCompletionRequest + ) -> ExtractedToolCallInformation: + # sanity check; avoid unnecessary processing + if self.tool_calls_start_token not in model_output: + return ExtractedToolCallInformation( + tools_called=False, tool_calls=[], content=model_output + ) + + else: + try: + # use a regex to find the tool call between the tags + function_calls = self.tool_calls_regex.findall(model_output)[0] + + # load the JSON, and then use it to build the Function and + # Tool Call + raw_function_calls = json.loads(function_calls) + tool_calls = [ + ToolCall( + type="function", + function=FunctionCall( + name=function_call["name"], + # function call args are JSON but as a string + arguments=json.dumps( + function_call["arguments"], ensure_ascii=False + ), + ), + ) + for function_call in raw_function_calls + ] + + content = model_output[: model_output.find(self.tool_calls_start_token)] + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=tool_calls, + content=content if (len(content) > 0 and content != " ") else None, + ) + + except Exception: + logger.exception("Error in extracting tool call from 
response.") + return ExtractedToolCallInformation( + tools_called=False, tool_calls=[], content=model_output + ) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> DeltaMessage | None: + # if the tool call token is not in the tokens generated so far, append + # output to contents since it's not a tool + if self.tool_calls_start_token not in current_text: + return DeltaMessage(content=delta_text) + + # if the tool call token ID IS in the tokens generated so far, that + # means we're parsing as tool calls now + + # handle if we detected the start of tool calls token which means + # the start of tool calling + if ( + self.tool_calls_start_token_id in delta_token_ids + and len(delta_token_ids) == 1 + ): + # if it's the only token, return None, so we don't send a chat + # completion and don't send a control token + return None + + # bit mask flags for partial JSON parsing. If the name hasn't been + # sent yet, don't allow sending + # an incomplete string since OpenAI only ever (as far as I have + # seen) allows sending the entire tool/ function name at once. 
+ flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR + try: + # Extract the tool calls between the special tool call tokens + parsable_arr = current_text.split(self.tool_calls_start_token)[-1].split( + self.tool_calls_end_token + )[0] + + # tool calls are generated in an array, so do partial JSON + # parsing on the entire array + try: + tool_call_arr: list[dict] = partial_json_parser.loads( + parsable_arr, flags + ) + except partial_json_parser.core.exceptions.MalformedJSON: + logger.debug("not enough tokens to parse into JSON yet") + return None + + # select as the current tool call the one we're on the state at + + current_tool_call: dict = ( + tool_call_arr[self.current_tool_id] if len(tool_call_arr) > 0 else {} + ) + + # case -- if no tokens have been streamed for the tool, e.g. + # only the array brackets, stream nothing + if len(tool_call_arr) == 0: + return None + + # case: we are starting a new tool in the array + # -> array has > 0 length AND length has moved past cursor + elif ( + len(tool_call_arr) > 0 and len(tool_call_arr) > self.current_tool_id + 1 + ): + # if we're moving on to a new call, first make sure we + # haven't missed anything in the previous one that was + # auto-generated due to JSON completions, but wasn't + # streamed to the client yet. 
+ if self.current_tool_id >= 0: + diff: str | None = current_tool_call.get("arguments") + + if diff: + diff = json.dumps(diff, ensure_ascii=False).replace( + self.streamed_args_for_tool[self.current_tool_id], "" + ) + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=diff + ).model_dump(exclude_none=True), + ) + ] + ) + self.streamed_args_for_tool[self.current_tool_id] += diff + else: + delta = None + else: + delta = None + # re-set stuff pertaining to progress in the current tool + self.current_tool_id = len(tool_call_arr) - 1 + self.current_tool_name_sent = False + self.streamed_args_for_tool.append("") + logger.debug("starting on new tool %d", self.current_tool_id) + return delta + + # case: update an existing tool - this is handled below + + # if the current tool name hasn't been sent, send if available + # - otherwise send nothing + if not self.current_tool_name_sent: + function_name = current_tool_call.get("name") + if function_name: + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + type="function", + id=make_tool_call_id(), + function=DeltaFunctionCall( + name=function_name + ).model_dump(exclude_none=True), + ) + ] + ) + self.current_tool_name_sent = True + else: + delta = None + + # now we know we're on the same tool call and we're streaming + # arguments + else: + prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get( + "arguments" + ) + cur_arguments = current_tool_call.get("arguments") + + new_text = delta_text.replace("'", '"') + + if not cur_arguments and not prev_arguments: + delta = None + elif not cur_arguments and prev_arguments: + logger.error( + "INVARIANT - impossible to have arguments reset mid-arguments" + ) + delta = None + elif cur_arguments and not prev_arguments: + cur_arguments_json = json.dumps(cur_arguments, ensure_ascii=False) + logger.debug("finding %s in %s", new_text, cur_arguments_json) + + arguments_delta 
= cur_arguments_json[ + : cur_arguments_json.index(new_text) + len(new_text) + ] + logger.debug( + "First tokens in arguments received: %s", arguments_delta + ) + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=arguments_delta + ).model_dump(exclude_none=True), + ) + ] + ) + self.streamed_args_for_tool[self.current_tool_id] += arguments_delta + + elif cur_arguments and prev_arguments: + cur_args_json = json.dumps(cur_arguments, ensure_ascii=False) + prev_args_json = json.dumps(prev_arguments, ensure_ascii=False) + logger.debug( + "Searching for diff between \n%s\n%s", + cur_args_json, + prev_args_json, + ) + + argument_diff = extract_intermediate_diff( + cur_args_json, prev_args_json + ) + logger.debug("got arguments diff: %s", argument_diff) + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=argument_diff + ).model_dump(exclude_none=True), + ) + ] + ) + self.streamed_args_for_tool[self.current_tool_id] += argument_diff + else: + # try parsing it with regular JSON - if it works we're + # at the end, and we need to send the difference between + # tokens streamed so far and the valid JSON + delta = None + + # check to see if the name is defined and has been sent. 
if so, + # stream the name - otherwise keep waiting + # finish by setting old and returning None as base case + self.prev_tool_call_arr = tool_call_arr + return delta + + except Exception: + logger.exception("Error trying to handle streaming tool call.") + logger.debug( + "Skipping chunk as a result of tool streaming extraction error" + ) + return None diff --git a/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py b/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py new file mode 100644 index 0000000..2b84c60 --- /dev/null +++ b/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py @@ -0,0 +1,590 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# code modified from deepseekv3_tool_parser.py + +from collections.abc import Sequence + +import regex as re + +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + DeltaFunctionCall, + DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, + ToolCall, +) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser, +) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer + +logger = init_logger(__name__) + + +class KimiK2ToolParser(ToolParser): + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + self.current_tool_name_sent: bool = False + self.prev_tool_call_arr: list[dict] = [] + self.current_tool_id: int = -1 + self.streamed_args_for_tool: list[ + str + ] = [] # map what has been streamed for each tool so far to a list + + # Section-level state management to prevent token leakage + self.in_tool_section: bool = False + self.token_buffer: str = "" + # Buffer size: empirical worst-case for longest marker (~30 chars) * 2 + # + safety margin for unicode + partial overlap. Prevents unbounded growth. 
+ self.buffer_max_size: int = 1024 + self.section_char_count: int = 0 # Track characters processed in tool section + self.max_section_chars: int = 8192 # Force exit if section exceeds this + self._buffer_overflow_logged: bool = False # Log overflow once per session + + # Support both singular and plural variants + self.tool_calls_start_token: str = "<|tool_calls_section_begin|>" + self.tool_calls_end_token: str = "<|tool_calls_section_end|>" + self.tool_calls_start_token_variants: list[str] = [ + "<|tool_calls_section_begin|>", + "<|tool_call_section_begin|>", # singular variant + ] + self.tool_calls_end_token_variants: list[str] = [ + "<|tool_calls_section_end|>", + "<|tool_call_section_end|>", # singular variant + ] + + self.tool_call_start_token: str = "<|tool_call_begin|>" + self.tool_call_end_token: str = "<|tool_call_end|>" + + self.tool_call_regex = re.compile( + r"<\|tool_call_begin\|>\s*(?P<tool_call_id>[^<]+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P<function_arguments>(?:(?!<\|tool_call_begin\|>).)*?)\s*<\|tool_call_end\|>", + re.DOTALL, + ) + + self.stream_tool_call_portion_regex = re.compile( + r"(?P<tool_call_id>.+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P<function_arguments>.*)" + ) + + self.stream_tool_call_name_regex = re.compile(r"(?P<tool_call_id>.+:\d+)\s*") + + if not self.model_tokenizer: + raise ValueError( + "The model tokenizer must be passed to the ToolParser " + "constructor during construction."
+ ) + self.tool_calls_start_token_id = self.vocab.get(self.tool_calls_start_token) + self.tool_calls_end_token_id = self.vocab.get(self.tool_calls_end_token) + + # Get token IDs for all variants + self.tool_calls_start_token_ids: list[int] = [ + tid + for variant in self.tool_calls_start_token_variants + if (tid := self.vocab.get(variant)) is not None + ] + self.tool_calls_end_token_ids: list[int] = [ + tid + for variant in self.tool_calls_end_token_variants + if (tid := self.vocab.get(variant)) is not None + ] + + self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token) + self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token) + + if ( + self.tool_calls_start_token_id is None + or self.tool_calls_end_token_id is None + ): + raise RuntimeError( + "Kimi-K2 Tool parser could not locate tool call start/end " + "tokens in the tokenizer!" + ) + + def _check_and_strip_markers(self, text: str) -> tuple[str, bool, bool]: + """ + Check for section begin/end markers in text and strip them. + Returns: (cleaned_text, found_section_begin, found_section_end) + """ + found_begin = False + found_end = False + cleaned = text + + # Check for section begin markers (any variant) + for variant in self.tool_calls_start_token_variants: + if variant in cleaned: + cleaned = cleaned.replace(variant, "") + found_begin = True + + # Check for section end markers (any variant) + for variant in self.tool_calls_end_token_variants: + if variant in cleaned: + cleaned = cleaned.replace(variant, "") + found_end = True + + return cleaned, found_begin, found_end + + def _reset_section_state(self) -> None: + """Reset state when exiting tool section.""" + self.in_tool_section = False + self.token_buffer = "" + self.section_char_count = 0 + + def reset_streaming_state(self) -> None: + """ + Reset all streaming state. Call this between requests to prevent + state leakage when parser instance is reused. 
+ """ + # Reset section state + self._reset_section_state() + + # Reset parent class state + self.current_tool_name_sent = False + self.prev_tool_call_arr = [] + self.current_tool_id = -1 + self.streamed_args_for_tool = [] + + logger.debug("Streaming state reset") + + def extract_tool_calls( + self, + model_output: str, + request: ChatCompletionRequest, + ) -> ExtractedToolCallInformation: + # sanity check; avoid unnecessary processing + if self.tool_calls_start_token not in model_output: + return ExtractedToolCallInformation( + tools_called=False, tool_calls=[], content=model_output + ) + + else: + try: + # there are two possible captures - between tags, or between a + # tag and end-of-string so the result of + # findall is an array of tuples where one is a function call and + # the other is None + function_call_tuples = self.tool_call_regex.findall(model_output) + + logger.debug("function_call_tuples: %s", function_call_tuples) + + tool_calls = [] + for match in function_call_tuples: + function_id, function_args = match + # function_id: functions.get_weather:0 or get_weather:0 + function_name = function_id.split(":")[0].split(".")[-1] + tool_calls.append( + ToolCall( + id=function_id, + type="function", + function=FunctionCall( + name=function_name, arguments=function_args + ), + ) + ) + + content = model_output[: model_output.find(self.tool_calls_start_token)] + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=tool_calls, + content=content if content else None, + ) + + except Exception: + logger.exception("Error in extracting tool call from response.") + return ExtractedToolCallInformation( + tools_called=False, tool_calls=[], content=model_output + ) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> DeltaMessage | None: + 
logger.debug("delta_text: %s", delta_text) + logger.debug("delta_token_ids: %s", delta_token_ids) + + # Flag to defer section exit until after tool parsing completes + deferred_section_exit = False + + # Add delta to buffer for split marker detection + self.token_buffer += delta_text + + # Enforce buffer size limit to prevent memory issues + if len(self.token_buffer) > self.buffer_max_size: + if not self._buffer_overflow_logged: + logger.warning( + "Token buffer exceeded max size (%d bytes), flushing excess. " + "This may indicate very long markers or unusual tokenization.", + self.buffer_max_size, + ) + self._buffer_overflow_logged = True + # Keep only the most recent content that might contain partial markers + self.token_buffer = self.token_buffer[-self.buffer_max_size // 2 :] + + # Check buffer for section markers (handles split tokens) + buffered_text, found_section_begin, found_section_end = ( + self._check_and_strip_markers(self.token_buffer) + ) + + # Track section state transitions + if found_section_begin and not self.in_tool_section: + logger.debug("Entering tool section") + self.in_tool_section = True + self.token_buffer = buffered_text # Use cleaned buffer + self.section_char_count = 0 # Reset counter for new section + if found_section_end and self.in_tool_section: + logger.debug("Detected section end marker") + # CRITICAL: Don't exit early if tool_call_end is in this chunk. + # Tool parser must emit final arguments/close first to avoid dropping + # the final tool update and leaking tokens into reasoning channel. 
+ has_tool_end = self.tool_call_end_token_id in delta_token_ids + if has_tool_end: + # Defer exit until after tool parsing completes + deferred_section_exit = True + logger.debug("Deferring section exit: tool_call_end in same chunk") + self.token_buffer = buffered_text + else: + # No tool call ending, safe to exit immediately + logger.debug("Exiting tool section") + remaining = buffered_text + self._reset_section_state() + # Return remaining text as reasoning content if non-empty + if remaining.strip(): + return DeltaMessage(content=remaining) + # Return empty delta to maintain function contract + # (always returns DeltaMessage) + return DeltaMessage(content="") + else: + self.token_buffer = buffered_text + + # Check if any variant of section start token is in current_token_ids + has_section_token = any( + tid in current_token_ids for tid in self.tool_calls_start_token_ids + ) + + # Early return: if no section token detected yet, return as reasoning content + if not has_section_token and not self.in_tool_section: + logger.debug("No tool call tokens found!") + # Don't clear buffer - it needs to accumulate partial markers across deltas + # Buffer overflow is already protected by lines 215-224 + return DeltaMessage(content=delta_text) + + # Strip section markers from delta_text for subsequent processing + # NOTE: This preprocessing happens BEFORE the regex-based tool call + # parsing (from PR #24847) to ensure markers are removed cleanly + # before pattern matching. No double-stripping occurs because + # section markers and tool call markers are distinct. + delta_text, _, _ = self._check_and_strip_markers(delta_text) + + # Error recovery: If in tool section for too long, force exit + if self.in_tool_section: + self.section_char_count += len(delta_text) + if self.section_char_count > self.max_section_chars: + logger.warning( + "Tool section exceeded max length (%d chars), forcing exit. 
" + "This may indicate malformed model output.", + self.max_section_chars, + ) + self._reset_section_state() + # Deferred exit already handled by forced exit above + # Return remaining content as reasoning (or empty delta if no content) + return DeltaMessage(content=delta_text if delta_text.strip() else "") + + try: + # figure out where we are in the parsing by counting tool call + # start & end tags + prev_tool_start_count = previous_token_ids.count( + self.tool_call_start_token_id + ) + prev_tool_end_count = previous_token_ids.count(self.tool_call_end_token_id) + cur_tool_start_count = current_token_ids.count( + self.tool_call_start_token_id + ) + cur_tool_end_count = current_token_ids.count(self.tool_call_end_token_id) + tool_call_portion = None + text_portion = None + + # case: if we're generating text, OR rounding out a tool call + if ( + cur_tool_start_count == cur_tool_end_count + and prev_tool_end_count == cur_tool_end_count + and self.tool_call_end_token not in delta_text + ): + # CRITICAL FIX: Suppress content if in tool section but + # no tool calls started + if self.in_tool_section and cur_tool_start_count == 0: + logger.debug( + "In tool section but no tool calls started yet. " + "Suppressing: %s", + delta_text, + ) + # Return empty delta to maintain iterator contract + return DeltaMessage(content="") + logger.debug("Generating text content! 
skipping tool parsing.") + return DeltaMessage(content=delta_text) + + if self.tool_call_end_token in delta_text: + logger.debug("tool_call_end_token in delta_text") + full_text = current_text + delta_text + tool_call_portion = ( + full_text.split(self.tool_call_start_token)[-1] + .split(self.tool_call_end_token)[0] + .rstrip() + ) + delta_text = delta_text.split(self.tool_call_end_token)[0].rstrip() + text_portion = delta_text.split(self.tool_call_end_token)[-1].lstrip() + + # case -- we're starting a new tool call + if ( + cur_tool_start_count > cur_tool_end_count + and cur_tool_start_count > prev_tool_start_count + ): + if len(delta_token_ids) > 1: + tool_call_portion = current_text.split(self.tool_call_start_token)[ + -1 + ] + else: + tool_call_portion = None + delta = None + + text_portion = None + + # set cursors and state appropriately + self.current_tool_id += 1 + self.current_tool_name_sent = False + self.streamed_args_for_tool.append("") + logger.debug("Starting on a new tool %s", self.current_tool_id) + + # case -- we're updating an existing tool call + elif ( + cur_tool_start_count > cur_tool_end_count + and cur_tool_start_count == prev_tool_start_count + ): + # get the portion of the text that's the tool call + tool_call_portion = current_text.split(self.tool_call_start_token)[-1] + text_portion = None + + # case -- the current tool call is being closed. 
+ elif ( + cur_tool_start_count == cur_tool_end_count + and cur_tool_end_count >= prev_tool_end_count + ): + if self.prev_tool_call_arr is None or len(self.prev_tool_call_arr) == 0: + logger.debug("attempting to close tool call, but no tool call") + # Handle deferred section exit before returning + if deferred_section_exit and self.in_tool_section: + self._reset_section_state() + return None + diff = self.prev_tool_call_arr[self.current_tool_id].get("arguments") + if diff: + diff = ( + diff.encode("utf-8").decode("unicode_escape") + if diff is str + else diff + ) + if '"}' not in delta_text: + # Handle deferred section exit before returning + if deferred_section_exit and self.in_tool_section: + self._reset_section_state() + return None + end_loc = delta_text.rindex('"}') + diff = delta_text[:end_loc] + '"}' + logger.debug( + "Finishing tool and found diff that had not " + "been streamed yet: %s", + diff, + ) + self.streamed_args_for_tool[self.current_tool_id] += diff + # Handle deferred section exit before returning + if deferred_section_exit and self.in_tool_section: + logger.debug("Completing deferred section exit") + self._reset_section_state() + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall(arguments=diff).model_dump( + exclude_none=True + ), + ) + ] + ) + + # case -- otherwise we're just generating text + else: + # Check if we're in tool section - if so, suppress + if self.in_tool_section: + logger.debug("In tool section, suppressing text generation") + # Handle deferred section exit before returning + if deferred_section_exit: + self._reset_section_state() + return DeltaMessage(content="") + text = delta_text.replace(self.tool_call_start_token, "") + text = text.replace(self.tool_call_end_token, "") + delta = DeltaMessage(tool_calls=[], content=text) + # Handle deferred section exit before returning + if deferred_section_exit and self.in_tool_section: + self._reset_section_state() + return 
delta + + current_tool_call = dict() + if tool_call_portion: + current_tool_call_matches = self.stream_tool_call_portion_regex.match( + tool_call_portion + ) + if current_tool_call_matches: + tool_id, tool_args = current_tool_call_matches.groups() + tool_name = tool_id.split(":")[0].split(".")[-1] + current_tool_call["id"] = tool_id + current_tool_call["name"] = tool_name + current_tool_call["arguments"] = tool_args + else: + current_tool_call_name_matches = ( + self.stream_tool_call_name_regex.match(tool_call_portion) + ) + if current_tool_call_name_matches: + (tool_id_str,) = current_tool_call_name_matches.groups() + tool_name = tool_id_str.split(":")[0].split(".")[-1] + current_tool_call["id"] = tool_id_str + current_tool_call["name"] = tool_name + current_tool_call["arguments"] = "" + else: + logger.debug("Not enough token") + return None + + # case - we haven't sent the tool name yet. If it's available, send + # it. otherwise, wait until it's available. + if not self.current_tool_name_sent: + if current_tool_call is None: + return None + function_name: str | None = current_tool_call.get("name") + tool_id = current_tool_call.get("id") + if function_name: + self.current_tool_name_sent = True + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + type="function", + id=tool_id, + function=DeltaFunctionCall( + name=function_name + ).model_dump(exclude_none=True), + ) + ] + ) + else: + return None + + # case -- otherwise, send the tool call delta + + # if the tool call portion is None, send the delta as text + if tool_call_portion is None: + # if there's text but not tool calls, send that - + # otherwise None to skip chunk + delta = ( + DeltaMessage(content=delta_text) + if text_portion is not None + else None + ) + return delta + + # now, the nitty-gritty of tool calls + # now we have the portion to parse as tool call. 
+ + logger.debug( + "Trying to parse current tool call with ID %s", self.current_tool_id + ) + + # if we're starting a new tool call, push an empty object in as + # a placeholder for the arguments + if len(self.prev_tool_call_arr) <= self.current_tool_id: + self.prev_tool_call_arr.append({}) + + # main logic for tool parsing here - compare prev. partially-parsed + # JSON to the current partially-parsed JSON + prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get( + "arguments" + ) + cur_arguments = current_tool_call.get("arguments") + + logger.debug("diffing old arguments: %s", prev_arguments) + logger.debug("against new ones: %s", cur_arguments) + + # case -- no arguments have been created yet. skip sending a delta. + if not cur_arguments and not prev_arguments: + logger.debug("Skipping text %s - no arguments", delta_text) + delta = None + + # case -- prev arguments are defined, but non are now. + # probably impossible, but not a fatal error - just keep going + elif not cur_arguments and prev_arguments: + logger.error( + "should be impossible to have arguments reset " + "mid-call. skipping streaming anything." + ) + delta = None + + # case -- we now have the first info about arguments available from + # autocompleting the JSON + elif cur_arguments and not prev_arguments: + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=cur_arguments + ).model_dump(exclude_none=True), + ) + ] + ) + self.streamed_args_for_tool[self.current_tool_id] = cur_arguments + + # last case -- we have an update to existing arguments. 
+ elif cur_arguments and prev_arguments: + if ( + isinstance(delta_text, str) + and cur_arguments != prev_arguments + and len(cur_arguments) > len(prev_arguments) + and cur_arguments.startswith(prev_arguments) + ): + delta_arguments = cur_arguments[len(prev_arguments) :] + logger.debug("got diff %s", delta_text) + + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=delta_arguments + ).model_dump(exclude_none=True), + ) + ] + ) + self.streamed_args_for_tool[self.current_tool_id] = cur_arguments + else: + delta = None + + # handle saving the state for the current tool into + # the "prev" list for use in diffing for the next iteration + if self.current_tool_id == len(self.prev_tool_call_arr) - 1: + self.prev_tool_call_arr[self.current_tool_id] = current_tool_call + else: + self.prev_tool_call_arr.append(current_tool_call) + + # Handle deferred section exit after tool parsing completes + if deferred_section_exit and self.in_tool_section: + logger.debug("Completing deferred section exit") + self._reset_section_state() + + return delta + + except Exception: + logger.exception("Error trying to handle streaming tool call.") + return None # do not stream a delta. skip this token ID. 
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ast
import json
from collections.abc import Sequence
from typing import Any

import regex as re
from transformers import PreTrainedTokenizerBase

import vllm.envs as envs
from vllm.entrypoints.openai.protocol import (
    ChatCompletionRequest,
    DeltaFunctionCall,
    DeltaMessage,
    DeltaToolCall,
    ExtractedToolCallInformation,
    FunctionCall,
    ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
    ToolParser,
)
from vllm.logger import init_logger

logger = init_logger(__name__)


class _UnexpectedAstError(Exception):
    """Raised when the output parses as Python but is not a list of calls."""


class Llama4PythonicToolParser(ToolParser):
    """
    Tool-call parser for Llama4 models that emit calls in a pythonic style,
    i.e. a Python list of keyword-only function calls.

    Use --enable-auto-tool-choice --tool-call-parser llama4_pythonic
    """

    # TODO(mdepinet): Possible future improvements:
    #   1. Support text + tools separated by either <|python_tag|> or \n\n
    #   2. Support tools outside of a list (or separated by a semicolon).
    #      This depends on item 1 for consistent streaming.
    # Neither of these are necessary for e.g. ToolACE, but both would help make
    # Llama3.2 models more reliable.

    # Cheap structural pre-check run before handing the text to ast.parse.
    TOOL_CALL_REGEX = re.compile(
        r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]",
        re.DOTALL,
    )

    def __init__(self, tokenizer: PreTrainedTokenizerBase):
        super().__init__(tokenizer)

    # Rename for readability. This is NOT a tool id.
    @property
    def current_tool_index(self) -> int:
        """Alias of ``current_tool_id``: position of the tool being streamed."""
        return self.current_tool_id

    @current_tool_index.setter
    def current_tool_index(self, value: int) -> None:
        self.current_tool_id = value

    def extract_tool_calls(
        self, model_output: str, request: ChatCompletionRequest
    ) -> ExtractedToolCallInformation:
        """
        Extract the tool calls from a complete model response.

        Returns the output unchanged as ``content`` (``tools_called=False``)
        when it does not match ``TOOL_CALL_REGEX``, when the regex times out,
        or when the text cannot be turned into a list of calls.
        """
        # Llama 4 sometimes wraps the call list in <|python_start|> and
        # <|python_end|>; strip them before matching.
        start_tag = "<|python_start|>"
        if model_output.startswith(start_tag):
            model_output = model_output.removeprefix(start_tag).replace(
                "<|python_end|>", ""
            )

        def as_plain_text() -> ExtractedToolCallInformation:
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

        try:
            matched = self.TOOL_CALL_REGEX.match(
                model_output, timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS
            )
        except TimeoutError:
            logger.warning("Regex timeout occurred when matching tool call pattern.")
            logger.debug(
                "Regex timeout occurred when matching user input: %s", model_output
            )
            matched = None

        if matched is None:
            return as_plain_text()

        try:
            expression = getattr(ast.parse(model_output).body[0], "value", None)
            is_call_list = isinstance(expression, ast.List) and all(
                isinstance(elt, ast.Call) for elt in expression.elts
            )
            if not is_call_list:
                raise _UnexpectedAstError(
                    "Tool output must be a list of function calls"
                )
            return ExtractedToolCallInformation(
                tools_called=True,
                tool_calls=[
                    _handle_single_tool(elt)  # type: ignore
                    for elt in expression.elts
                ],
                content=None,
            )
        except Exception:
            logger.exception("Error in extracting tool call from response.")
            # Treat as regular text
            return as_plain_text()

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """
        Turn the text generated so far into incremental tool-call deltas.

        The whole ``current_text`` is re-parsed on every call: it is first
        auto-completed into valid Python by ``_make_valid_python`` and then
        diffed against what was already streamed for each call. Returns
        ``None`` when there is nothing to emit for this chunk, or when an
        error occurred (the error is logged and the chunk skipped).
        """
        start_tag = "<|python_start|>"
        end_tag = "<|python_end|>"

        # Anything that does not open like a call list is ordinary content.
        if not current_text.startswith(("[", start_tag)):
            return DeltaMessage(content=delta_text)

        try:
            # remove <|python_start|> and <|python_end|>
            current_text = current_text.removeprefix(start_tag)
            current_text = current_text.removesuffix(end_tag)

            completed = _make_valid_python(current_text)
            if completed is None:
                return None
            valid_text, added_text = completed

            expression = getattr(ast.parse(valid_text).body[0], "value", None)
            is_call_list = isinstance(expression, ast.List) and all(
                isinstance(elt, ast.Call) for elt in expression.elts
            )
            if not is_call_list:
                raise _UnexpectedAstError(
                    "Tool output must be a list of function calls"
                )
            tool_calls = [
                _handle_single_tool(elt)  # type: ignore
                for elt in expression.elts
            ]

            last_index = len(tool_calls) - 1
            tool_deltas = []
            for index, new_call in enumerate(tool_calls):
                if index < self.current_tool_index:
                    continue

                self.current_tool_index = index
                if len(self.streamed_args_for_tool) == index:
                    self.streamed_args_for_tool.append("")

                # Only the last call can still be open, and only if the
                # auto-completion had to close its parenthesis for it.
                still_open = index >= last_index and ")]" in added_text
                if still_open:
                    withheld_suffix = added_text[:-2]
                    if added_text[-2] == ")":
                        # Function call is incomplete. Withhold the closing
                        # bracket.
                        withheld_suffix += "}"
                else:
                    self.current_tool_index += 1
                    withheld_suffix = ""
                # Strings get single quotes in the model-produced string.
                # JSON requires double quotes.
                withheld_suffix = withheld_suffix.replace("'", '"')

                delta = _compute_tool_delta(
                    self.streamed_args_for_tool[index], new_call, index, withheld_suffix
                )
                if delta is None:
                    continue

                tool_deltas.append(delta)
                function = delta.function
                if function is not None and function.arguments is not None:
                    self.streamed_args_for_tool[index] += function.arguments

            # HACK: serving_chat.py inspects the internal state of tool parsers
            # when determining its final streaming delta, automatically
            # adding autocompleted JSON.
            # These two lines avoid that nonsense while ensuring finish_reason
            # is set to tool_calls when at least one tool is called.
            if tool_deltas and not self.prev_tool_call_arr:
                self.prev_tool_call_arr = [{"arguments": {}}]

            if tool_deltas:
                return DeltaMessage(tool_calls=tool_deltas)
            if not added_text and self.current_tool_id > 0:
                # Return an empty DeltaMessage once the tool calls are all done
                # so that finish_reason gets set.
                return DeltaMessage(content="")
            return None
        except Exception:
            logger.exception("Error trying to handle streaming tool call.")
            logger.debug(
                "Skipping chunk as a result of tool streaming extraction error"
            )
            return None
def _get_parameter_value(val: ast.expr) -> Any:
    """Convert a literal AST node from a tool-call argument to a Python value.

    Supports constants, signed numbers (``-1`` parses as ``UnaryOp`` rather
    than ``Constant``), dicts with literal keys and lists, recursively.

    Raises:
        _UnexpectedAstError: if the node is not one of the supported literals.
    """
    if isinstance(val, ast.Constant):
        return val.value
    if isinstance(val, ast.UnaryOp) and isinstance(val.op, (ast.USub, ast.UAdd)):
        operand = _get_parameter_value(val.operand)
        # Only real numbers may carry a sign; bool is an int subclass, so it
        # is excluded explicitly.
        if isinstance(operand, bool) or not isinstance(operand, (int, float)):
            raise _UnexpectedAstError("Tool call arguments must be literals")
        return -operand if isinstance(val.op, ast.USub) else operand
    if isinstance(val, ast.Dict):
        # A ``**spread`` entry shows up as a ``None`` key and is rejected here.
        if not all(isinstance(k, ast.Constant) for k in val.keys):
            raise _UnexpectedAstError("Dict tool call arguments must have literal keys")
        return {
            k.value: _get_parameter_value(v)  # type: ignore
            for k, v in zip(val.keys, val.values)
        }
    if isinstance(val, ast.List):
        return [_get_parameter_value(v) for v in val.elts]
    raise _UnexpectedAstError("Tool call arguments must be literals")


def _handle_single_tool(call: ast.Call) -> ToolCall:
    """Build a ``ToolCall`` from one parsed ``name(key=value, ...)`` call.

    Only keyword arguments are read; the arguments are serialized with
    ``json.dumps``.

    Raises:
        _UnexpectedAstError: if the callee is not a plain name, or an argument
            value is not a supported literal.
    """
    if not isinstance(call.func, ast.Name):
        raise _UnexpectedAstError("Invalid tool call name")
    function_name = call.func.id
    arguments = {
        keyword.arg: _get_parameter_value(keyword.value) for keyword in call.keywords
    }
    return ToolCall(
        type="function",
        function=FunctionCall(name=function_name, arguments=json.dumps(arguments)),
    )


def _make_valid_python(text: str) -> tuple[str, str] | None:
    """Auto-complete a partial pythonic tool-call list into parseable Python.

    Scans ``text`` while tracking open brackets and string quotes, then
    appends whatever closers are still missing.

    Returns:
        ``(completed_text, added_text)`` where ``added_text`` is the suffix of
        closers that was appended, or ``None`` when the text stops at a point
        that cannot be completed yet (after ``=`` or ``:``, in the middle of a
        name, or on a dangling backslash inside a string).

    Raises:
        _UnexpectedAstError: if a closing bracket does not match its opener.
    """
    quotes = {"'", '"'}
    openers = {"[", "(", "{"}
    closers = {
        "]": ("[", "Mismatched square brackets"),
        ")": ("(", "Mismatched parentheses"),
        "}": ("{", "Mismatched curly braces"),
    }

    bracket_stack: list[str] = []
    escaped = False  # previous char was an unescaped backslash inside a string
    for char in text:
        if bracket_stack and bracket_stack[-1] in quotes:
            # Inside a string literal nothing is structural: brackets and the
            # other quote character are plain text, and a backslash escapes
            # exactly the next character (so ``\\"`` does close the string).
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == bracket_stack[-1]:
                bracket_stack.pop()
            continue

        if char in openers or char in quotes:
            bracket_stack.append(char)
        elif char in closers:
            expected_opener, message = closers[char]
            if not bracket_stack or bracket_stack.pop() != expected_opener:
                raise _UnexpectedAstError(message)

    if escaped:
        # A closing quote appended now would itself be escaped, so the result
        # could never parse; wait for the next chunk instead.
        return None

    text = text.rstrip()
    if text.endswith(("=", ":")):
        # Since we have no type information for this property/parameter value,
        # we can't fill in a valid value.
        return None
    if bracket_stack and bracket_stack[-1] == "{":
        # Compares ':' and ',' counts in the text preceding the last '{'.
        trailing_dict_text = text[: text.rfind("{")]
        num_keys = trailing_dict_text.count(":")
        num_values = trailing_dict_text.count(",")
        if num_keys <= num_values:
            return None  # Incomplete property name within parameter value
    if bracket_stack and bracket_stack[-1] == "(":
        # Compares '=' and ',' counts in the text preceding the last '('.
        trailing_params_text = text[: text.rfind("(")]
        num_full_param_names = trailing_params_text.count("=")
        num_full_param_values = trailing_params_text.count(",")
        if num_full_param_names <= num_full_param_values:
            return None  # Incomplete parameter name
    if text.endswith(","):
        text = text[:-1]
    if (
        bracket_stack
        and bracket_stack[-1] == "["
        and not text.endswith("[")
        and not text.endswith(")")
    ):
        return None  # Incomplete function name

    # Close everything still open, innermost first. Quotes close with
    # themselves; brackets with their counterpart.
    closer_for = {"[": "]", "(": ")", "{": "}", "'": "'", '"': '"'}
    added_text = "".join(closer_for[char] for char in reversed(bracket_stack))

    return text + added_text, added_text


def _compute_tool_delta(
    previously_sent_args: str, new_call: ToolCall, index: int, withheld_suffix: str
) -> DeltaToolCall | None:
    """Compute the delta to stream for one tool call.

    Args:
        previously_sent_args: argument JSON already streamed for this call.
        new_call: the call as parsed from the auto-completed text.
        index: position of the call in the tool-call list.
        withheld_suffix: auto-completed tail of the argument JSON that must
            not be streamed yet; empty when the call is complete.

    Returns:
        The first delta (with the function name) when nothing was sent yet, an
        arguments-only delta when there is new argument text, else ``None``.

    Raises:
        ValueError: if the argument JSON does not end with ``withheld_suffix``.
    """
    new_call_args = new_call.function.arguments
    if withheld_suffix:
        # Explicit check rather than ``assert`` so it survives ``python -O``;
        # slicing by a suffix that is not there would corrupt the stream.
        if not new_call_args.endswith(withheld_suffix):
            raise ValueError(
                "tool call arguments do not end with the withheld suffix"
            )
        new_call_args = new_call_args[: -len(withheld_suffix)]
    if not previously_sent_args:
        return DeltaToolCall(
            id=new_call.id,
            type="function",
            index=index,
            function=DeltaFunctionCall(
                name=new_call.function.name,
                arguments=new_call_args,
            ),
        )

    arg_diff = new_call_args[len(previously_sent_args) :]
    if not arg_diff:
        return None
    return DeltaToolCall(
        id=None, index=index, function=DeltaFunctionCall(arguments=arg_diff)
    )
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json
from collections.abc import Sequence

import partial_json_parser
import regex as re
from partial_json_parser.core.options import Allow
from transformers import PreTrainedTokenizerBase

from vllm.entrypoints.chat_utils import make_tool_call_id
from vllm.entrypoints.openai.protocol import (
    ChatCompletionRequest,
    DeltaFunctionCall,
    DeltaMessage,
    DeltaToolCall,
    ExtractedToolCallInformation,
    FunctionCall,
    ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
    ToolParser,
)
from vllm.entrypoints.openai.tool_parsers.utils import (
    find_common_prefix,
    is_complete_json,
    partial_json_loads,
)
from vllm.logger import init_logger

logger = init_logger(__name__)


class Llama3JsonToolParser(ToolParser):
    """
    Tool call parser for Llama 3.x and 4 models intended for use with the
    examples/tool_chat_template_llama.jinja template.

    Used when --enable-auto-tool-choice --tool-call-parser llama3_json or
    llama4_json are set.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase):
        super().__init__(tokenizer)

        # initialize properties used for state when parsing tool calls in
        # streaming mode
        self.prev_tool_call_arr: list[dict] = []
        self.current_tool_id: int = -1
        self.current_tool_name_sent: bool = False
        self.streamed_args_for_tool: list[
            str
        ] = []  # map what has been streamed for each tool so far to a list
        self.bot_token = "<|python_tag|>"
        self.bot_token_id = tokenizer.encode(self.bot_token, add_special_tokens=False)[
            0
        ]
        # Matches one JSON object, or several separated by semicolons.
        # Braces may be nested one level deep inside each object.
        self.tool_call_regex = re.compile(
            r"{[^{}]*(?:{[^{}]*}[^{}]*)*}(?:\s*;\s*{[^{}]*(?:{[^{}]*}[^{}]*)*})*",
            re.DOTALL,
        )

    def extract_tool_calls(
        self, model_output: str, request: ChatCompletionRequest
    ) -> ExtractedToolCallInformation:
        """
        Extract the tool calls from a complete model response.
        Only extracts JSON content and ignores any surrounding plain text.
        Supports both single JSON and multiple JSONs separated by semicolons.

        Returns the output unchanged as ``content`` (``tools_called=False``)
        when no JSON object is found or the matched text cannot be decoded
        into objects with a ``name`` and ``arguments``/``parameters``.
        """
        # Quick check before running regex
        if not (self.bot_token in model_output or "{" in model_output):
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

        # Find JSON object(s) in the text using regex
        match = self.tool_call_regex.search(model_output)
        if not match:
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

        try:
            json_str = match.group(0)
            decoder = json.JSONDecoder()

            tool_calls: list[ToolCall] = []
            pos = 0
            while pos < len(json_str):
                # Skip the whitespace and ';' separators between objects.
                if json_str[pos].isspace() or json_str[pos] == ";":
                    pos += 1
                    continue
                # Decode object by object instead of splitting on ';', so a
                # semicolon inside a string value no longer breaks parsing.
                obj, pos = decoder.raw_decode(json_str, pos)
                tool_calls.append(
                    ToolCall(
                        type="function",
                        function=FunctionCall(
                            name=obj["name"],
                            # function call args are JSON but as a string
                            arguments=json.dumps(
                                obj["arguments"]
                                if "arguments" in obj
                                else obj["parameters"],
                                ensure_ascii=False,
                            ),
                        ),
                    )
                )

            return ExtractedToolCallInformation(
                tools_called=True, tool_calls=tool_calls, content=None
            )

        except Exception:
            logger.exception("Error in extracting tool call from response.")
            # return information to just treat the tool call as regular JSON
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """
        Turn the text generated so far into incremental tool-call deltas.

        ``current_text`` is re-parsed as partial JSON on every call and
        compared with the state kept from the previous call. Returns ``None``
        when there is nothing to stream for this chunk or when an error
        occurred (the error is logged and the chunk skipped).
        """
        if not (
            current_text.startswith(self.bot_token) or current_text.startswith("{")
        ):
            return DeltaMessage(content=delta_text)

        # bit mask flags for partial JSON parsing. If the name hasn't been
        # sent yet, don't allow sending
        # an incomplete string since OpenAI only ever (as far as I have
        # seen) allows sending the entire tool/ function name at once.
        flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR
        try:
            tool_call_arr = []
            is_complete = []
            try:
                # depending on the prompt format the Llama model may or may not
                # prefix the output with the <|python_tag|> token
                start_idx = (
                    len(self.bot_token)
                    if current_text.startswith(self.bot_token)
                    else 0
                )
                while start_idx < len(current_text):
                    (obj, end_idx) = partial_json_loads(current_text[start_idx:], flags)
                    is_complete.append(
                        is_complete_json(current_text[start_idx : start_idx + end_idx])
                    )
                    # consecutive objects are expected to be separated by "; "
                    start_idx += end_idx + len("; ")
                    # depending on the prompt Llama can use
                    # either arguments or parameters
                    if "parameters" in obj:
                        assert "arguments" not in obj, (
                            "model generated both parameters and arguments"
                        )
                        obj["arguments"] = obj["parameters"]
                    tool_call_arr.append(obj)
            except partial_json_parser.core.exceptions.MalformedJSON:
                logger.debug("not enough tokens to parse into JSON yet")
                return None

            # select as the current tool call the one we're on the state at
            current_tool_call: dict = (
                tool_call_arr[self.current_tool_id] if len(tool_call_arr) > 0 else {}
            )

            # case -- if no tokens have been streamed for the tool, e.g.
            #   only the array brackets, stream nothing
            if len(tool_call_arr) == 0:
                return None

            # case: we are starting a new tool in the array
            #   -> array has > 0 length AND length has moved past cursor
            elif (
                len(tool_call_arr) > 0 and len(tool_call_arr) > self.current_tool_id + 1
            ):
                # if we're moving on to a new call, first make sure we
                # haven't missed anything in the previous one that was
                # auto-generated due to JSON completions, but wasn't
                # streamed to the client yet.
                if self.current_tool_id >= 0:
                    cur_arguments = current_tool_call.get("arguments")
                    if cur_arguments:
                        cur_args_json = json.dumps(cur_arguments, ensure_ascii=False)
                        sent = len(self.streamed_args_for_tool[self.current_tool_id])
                        argument_diff = cur_args_json[sent:]

                        logger.debug("got arguments diff: %s", argument_diff)
                        delta = DeltaMessage(
                            tool_calls=[
                                DeltaToolCall(
                                    index=self.current_tool_id,
                                    function=DeltaFunctionCall(
                                        arguments=argument_diff
                                    ).model_dump(exclude_none=True),
                                )
                            ]
                        )
                        self.streamed_args_for_tool[self.current_tool_id] += (
                            argument_diff
                        )
                    else:
                        delta = None
                else:
                    delta = None
                # re-set stuff pertaining to progress in the current tool
                self.current_tool_id = len(tool_call_arr) - 1
                self.current_tool_name_sent = False
                self.streamed_args_for_tool.append("")
                logger.debug("starting on new tool %d", self.current_tool_id)
                return delta

            # if the current tool name hasn't been sent, send if available
            # - otherwise send nothing
            elif not self.current_tool_name_sent:
                function_name = current_tool_call.get("name")
                if function_name:
                    delta = DeltaMessage(
                        tool_calls=[
                            DeltaToolCall(
                                index=self.current_tool_id,
                                type="function",
                                id=make_tool_call_id(),
                                function=DeltaFunctionCall(
                                    name=function_name
                                ).model_dump(exclude_none=True),
                            )
                        ]
                    )
                    self.current_tool_name_sent = True
                else:
                    delta = None

            # now we know we're on the same tool call and we're streaming
            # arguments
            else:
                cur_arguments = current_tool_call.get("arguments")
                delta = None

                if cur_arguments:
                    sent = len(self.streamed_args_for_tool[self.current_tool_id])
                    cur_args_json = json.dumps(cur_arguments, ensure_ascii=False)
                    prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
                        "arguments"
                    )

                    argument_diff = None
                    if is_complete[self.current_tool_id]:
                        # the object is closed: everything not yet sent is final
                        argument_diff = cur_args_json[sent:]
                    elif prev_arguments:
                        # still open: only stream the part on which the previous
                        # and current serializations agree
                        prev_args_json = json.dumps(prev_arguments, ensure_ascii=False)
                        if cur_args_json != prev_args_json:
                            prefix = find_common_prefix(prev_args_json, cur_args_json)
                            argument_diff = prefix[sent:]

                    if argument_diff is not None:
                        delta = DeltaMessage(
                            tool_calls=[
                                DeltaToolCall(
                                    index=self.current_tool_id,
                                    function=DeltaFunctionCall(
                                        arguments=argument_diff
                                    ).model_dump(exclude_none=True),
                                )
                            ]
                        )
                        self.streamed_args_for_tool[self.current_tool_id] += (
                            argument_diff
                        )

            self.prev_tool_call_arr = tool_call_arr
            return delta

        except Exception:
            logger.exception("Error trying to handle streaming tool call.")
            logger.debug(
                "Skipping chunk as a result of tool streaming extraction error"
            )
            return None
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import regex as re

from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import Hermes2ProToolParser
from vllm.transformers_utils.tokenizer import AnyTokenizer


class LongcatFlashToolParser(Hermes2ProToolParser):
    """Hermes-style tool parser re-targeted at LongCat-Flash sentinel tags.

    After the parent constructor runs, the start/end sentinels, the extraction
    regex and the token-id / per-token string arrays are replaced with values
    derived from the LongCat sentinels.
    """

    def __init__(self, tokenizer: AnyTokenizer):
        super().__init__(tokenizer)

        # NOTE(review): in the reviewed copy both sentinels were empty strings
        # and the regex was r"(.*?)|(.*)", which matches the empty string at
        # position 0 of any text; it looks like the angle-bracket tags were
        # stripped. The values below are the LongCat-Flash tool-call tags as
        # best known -- confirm against the model's chat template.
        self.tool_call_start_token: str = "<longcat_tool_call>"
        self.tool_call_end_token: str = "</longcat_tool_call>"

        # First alternative: a closed tool call; second: an unterminated one
        # running to the end of the text.
        self.tool_call_regex = re.compile(
            r"<longcat_tool_call>(.*?)</longcat_tool_call>|<longcat_tool_call>(.*)",
            re.DOTALL,
        )

        self.tool_call_start_token_ids = self.model_tokenizer.encode(
            self.tool_call_start_token, add_special_tokens=False
        )
        self.tool_call_end_token_ids = self.model_tokenizer.encode(
            self.tool_call_end_token, add_special_tokens=False
        )

        # Decoded text of each individual token id of the sentinels.
        self.tool_call_start_token_array = [
            self.model_tokenizer.decode([token_id])
            for token_id in self.tool_call_start_token_ids
        ]

        self.tool_call_end_token_array = [
            self.model_tokenizer.decode([token_id])
            for token_id in self.tool_call_end_token_ids
        ]
self.json_started: bool = False + self.json_closed: bool = False + self.accumulated_params: dict = {} + self.streaming_request: ChatCompletionRequest | None = None + + # Enhanced streaming state - reset for each new message + self._reset_streaming_state() + + # Regex patterns for complete parsing + self.tool_call_complete_regex = re.compile( + r"(.*?)", re.DOTALL + ) + self.invoke_complete_regex = re.compile( + r"", re.DOTALL + ) + self.parameter_complete_regex = re.compile( + r"", re.DOTALL + ) + + if not self.model_tokenizer: + raise ValueError( + "The model tokenizer must be passed to the ToolParser " + "constructor during construction." + ) + + self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token) + self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token) + + if self.tool_call_start_token_id is None or self.tool_call_end_token_id is None: + raise RuntimeError( + "MiniMax M2 Tool parser could not locate tool call start/end " + "tokens in the tokenizer!" 
+ ) + + logger.debug( + "vLLM Successfully import tool parser %s !", self.__class__.__name__ + ) + + def _generate_tool_call_id(self) -> str: + """Generate a unique tool call ID.""" + return f"call_{uuid.uuid4().hex[:24]}" + + def _reset_streaming_state(self): + """Reset all streaming state.""" + self.current_tool_index = 0 + self.invoke_index = 0 + self.is_tool_call_started = False + self.header_sent = False + self.current_tool_id = None + self.current_function_name = None + self.current_param_name = None + self.current_param_value = "" + self.param_count = 0 + self.in_param = False + self.in_function = False + self.accumulated_text = "" + self.json_started = False + self.json_closed = False + # Store accumulated parameters for type conversion + self.accumulated_params = {} + self.streaming_request = None + # Clear previous tool call history to avoid state pollution + self.prev_tool_call_arr.clear() + + def _extract_name(self, name_str: str) -> str: + """Extract name from quoted string.""" + name_str = name_str.strip() + if ( + name_str.startswith('"') + and name_str.endswith('"') + or name_str.startswith("'") + and name_str.endswith("'") + ): + return name_str[1:-1] + return name_str + + def _convert_param_value(self, value: str, param_type: str) -> Any: + """Convert parameter value to the correct type.""" + if value.lower() == "null": + return None + + param_type = param_type.lower() + if param_type in ["string", "str", "text"]: + return value + elif param_type in ["integer", "int"]: + try: + return int(value) + except (ValueError, TypeError): + return value + elif param_type in ["number", "float"]: + try: + val = float(value) + return val if val != int(val) else int(val) + except (ValueError, TypeError): + return value + elif param_type in ["boolean", "bool"]: + return value.lower() in ["true", "1"] + elif param_type in ["object", "array"]: + try: + return json.loads(value) + except json.JSONDecodeError: + return value + else: + # Try JSON parse first, fallback 
to string + try: + return json.loads(value) + except json.JSONDecodeError: + return value + + def _parse_single_invoke( + self, invoke_str: str, tools: list | None + ) -> ToolCall | None: + """Parse a single block.""" + # Extract function name + name_match = re.search(r"^([^>]+)", invoke_str) + if not name_match: + return None + + function_name = self._extract_name(name_match.group(1)) + + # Get parameter configuration + param_config = {} + if tools: + for tool in tools: + if ( + hasattr(tool, "function") + and tool.function.name == function_name + and hasattr(tool.function, "parameters") + ): + params = tool.function.parameters + if isinstance(params, dict) and "properties" in params: + param_config = params["properties"] + break + + # Extract parameters + param_dict = {} + for match in self.parameter_complete_regex.findall(invoke_str): + param_match = re.search(r"^([^>]+)>(.*)", match, re.DOTALL) + if param_match: + param_name = self._extract_name(param_match.group(1)) + param_value = param_match.group(2).strip() + if param_value.startswith("\n"): + param_value = param_value[1:] + if param_value.endswith("\n"): + param_value = param_value[:-1] + + # Get parameter type + param_type = "string" + if ( + param_name in param_config + and isinstance(param_config[param_name], dict) + and "type" in param_config[param_name] + ): + param_type = param_config[param_name]["type"] + + # Convert value + param_dict[param_name] = self._convert_param_value( + param_value, param_type + ) + + return ToolCall( + type="function", + function=FunctionCall( + name=function_name, + arguments=json.dumps(param_dict, ensure_ascii=False), + ), + ) + + def extract_tool_calls( + self, + model_output: str, + request: ChatCompletionRequest, + ) -> ExtractedToolCallInformation: + """Extract tool calls from complete model output (non-streaming).""" + # Quick check + if self.tool_call_start_token not in model_output: + return ExtractedToolCallInformation( + tools_called=False, tool_calls=[], 
content=model_output + ) + + try: + tool_calls = [] + + # Find all complete tool_call blocks + for tool_call_match in self.tool_call_complete_regex.findall(model_output): + # Find all invokes within this tool_call + for invoke_match in self.invoke_complete_regex.findall(tool_call_match): + tool_call = self._parse_single_invoke( + invoke_match, request.tools if request else None + ) + if tool_call: + tool_calls.append(tool_call) + + if not tool_calls: + return ExtractedToolCallInformation( + tools_called=False, tool_calls=[], content=model_output + ) + + # Update prev_tool_call_arr + self.prev_tool_call_arr.clear() + for tool_call in tool_calls: + self.prev_tool_call_arr.append( + { + "name": tool_call.function.name, + "arguments": tool_call.function.arguments, + } + ) + + # Extract content before first tool call + first_tool_idx = model_output.find(self.tool_call_start_token) + content = model_output[:first_tool_idx] if first_tool_idx > 0 else None + + return ExtractedToolCallInformation( + tools_called=True, tool_calls=tool_calls, content=content + ) + + except Exception: + logger.exception("Error extracting tool calls") + return ExtractedToolCallInformation( + tools_called=False, tool_calls=[], content=model_output + ) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], # pylint: disable=unused-argument + current_token_ids: Sequence[int], # pylint: disable=unused-argument + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> DeltaMessage | None: + """Extract tool calls from streaming model output.""" + + # Store request for type conversion + if not previous_text or self.tool_call_start_token in delta_text: + self._reset_streaming_state() + self.streaming_request = request + + # If no delta text, return None unless it's an EOS token after tools + if not delta_text: + # Check if this is an EOS token after all tool calls are complete + if delta_token_ids 
and self.tool_call_end_token_id not in delta_token_ids: + # Count complete tool calls + complete_calls = len( + self.tool_call_complete_regex.findall(current_text) + ) + + # If we have completed tool calls and populated prev_tool_call_arr + if complete_calls > 0 and len(self.prev_tool_call_arr) > 0: + # Check if all tool calls are closed + open_calls = current_text.count( + self.tool_call_start_token + ) - current_text.count(self.tool_call_end_token) + if open_calls == 0: + # Return empty delta for finish_reason processing + return DeltaMessage(content="") + elif not self.is_tool_call_started and current_text: + # This is a regular content response that's now complete + return DeltaMessage(content="") + return None + + # Update accumulated text + self.accumulated_text = current_text + + # Check if we need to advance to next tool + if self.json_closed and not self.in_function: + # Check if this tool call has ended + invoke_ends = current_text.count(self.invoke_end_token) + if invoke_ends > self.current_tool_index: + # This tool has ended, advance to next + self.current_tool_index += 1 + self.header_sent = False + self.param_count = 0 + self.json_started = False + self.json_closed = False + self.in_function = False # Now we can safely set this to False + self.accumulated_params = {} + # Continue processing next tool + return None + + # Handle normal content before tool calls + if not self.is_tool_call_started: + # Check if tool call is starting + if ( + self.tool_call_start_token_id in delta_token_ids + or self.tool_call_start_token in delta_text + ): + self.is_tool_call_started = True + # Return any content before the tool call + if self.tool_call_start_token in delta_text: + content_before = delta_text[ + : delta_text.index(self.tool_call_start_token) + ] + if content_before: + return DeltaMessage(content=content_before) + return None + else: + # Check if we're between tool calls - skip whitespace + if ( + current_text.rstrip().endswith(self.tool_call_end_token) + 
and delta_text.strip() == "" + ): + # We just ended a tool call, skip whitespace + return None + # Normal content, no tool call + return DeltaMessage(content=delta_text) + + # Check if we're between tool calls (waiting for next one) + invoke_starts_count = current_text.count(self.invoke_start_prefix) + if self.current_tool_index >= invoke_starts_count: + # We're past all tool calls, shouldn't be here + return None + + # Find the current tool call portion + invoke_start_positions: list[int] = [] + idx = 0 + while True: + idx = current_text.find(self.invoke_start_prefix, idx) + if idx == -1: + break + invoke_start_positions.append(idx) + idx += len(self.invoke_start_prefix) + + if self.current_tool_index >= len(invoke_start_positions): + # No more tool calls to process yet + return None + + invoke_start_idx = invoke_start_positions[self.current_tool_index] + # Find where this tool call ends (or current position if not ended yet) + invoke_end_idx = current_text.find(self.invoke_end_token, invoke_start_idx) + if invoke_end_idx == -1: + tool_text = current_text[invoke_start_idx:] + else: + tool_text = current_text[ + invoke_start_idx : invoke_end_idx + len(self.invoke_end_token) + ] + + # Looking for function header + if not self.header_sent: + if self.invoke_start_prefix in tool_text: + func_start = tool_text.find(self.invoke_start_prefix) + len( + self.invoke_start_prefix + ) + # Find the end quote for the function name + func_end = tool_text.find(">", func_start) + + if func_end != -1: + # Found complete function name + function_name_raw = tool_text[func_start:func_end] + self.current_function_name = self._extract_name(function_name_raw) + self.current_tool_id = self._generate_tool_call_id() + self.header_sent = True + self.in_function = True + + # Add to prev_tool_call_arr immediately when we detect a tool call + # Each tool call should be recorded regardless of function name + # Ensure we don't add the same tool call index multiple times + if 
len(self.prev_tool_call_arr) <= self.current_tool_index: + self.prev_tool_call_arr.append( + { + "name": self.current_function_name, + "arguments": "{}", # Placeholder, will be updated later + } + ) + + # Send header with function info + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + id=self.current_tool_id, + function=DeltaFunctionCall( + name=self.current_function_name, arguments="" + ), + type="function", + ) + ] + ) + return None + + # We've sent header, now handle function body + if self.in_function: + # Send opening brace if not sent yet + if self.in_function and not self.json_started: + self.json_started = True + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + function=DeltaFunctionCall(arguments="{"), + ) + ] + ) + + # Make sure json_started is set if we're processing parameters + if not self.json_started: + self.json_started = True + + # Check for function end in accumulated text + if not self.json_closed and self.invoke_end_token in tool_text: + # Count total parameters in the tool text + total_param_count = tool_text.count(self.parameter_prefix) + + # Only close JSON if all parameters have been processed + if self.param_count >= total_param_count: + # Close JSON + self.json_closed = True + + # Extract complete tool call + # Find the invoke content + invoke_start = tool_text.find(self.invoke_start_prefix) + len( + self.invoke_start_prefix + ) + invoke_content_end = tool_text.find( + self.invoke_end_token, invoke_start + ) + if invoke_content_end != -1: + invoke_content = tool_text[invoke_start:invoke_content_end] + # Parse to get the complete arguments + try: + parsed_tool = self._parse_single_invoke( + invoke_content, + self.streaming_request.tools + if self.streaming_request + else None, + ) + if parsed_tool and self.current_tool_index < len( + self.prev_tool_call_arr + ): + # Update existing entry in prev_tool_call_arr + args = parsed_tool.function.arguments + 
self.prev_tool_call_arr[self.current_tool_index][ + "arguments" + ] = args + except Exception: + pass # Ignore parsing errors during streaming + + result = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + function=DeltaFunctionCall(arguments="}"), + ) + ] + ) + + # Reset state for next tool + self.json_closed = True + self.in_function = False + self.accumulated_params = {} + + logger.debug("[M2_STREAMING] Tool call completed") + + return result + else: + # Don't close JSON yet, continue processing parameters + return None + + # Look for parameters + # Find all parameter starts + param_starts = [] + idx = 0 + while True: + idx = tool_text.find(self.parameter_prefix, idx) + if idx == -1: + break + param_starts.append(idx) + idx += len(self.parameter_prefix) + + # Check if we should start a new parameter + if ( + not self.in_param + and self.param_count < len(param_starts) + and len(param_starts) > self.param_count + ): + # Process the next parameter + param_idx = param_starts[self.param_count] + param_start = param_idx + len(self.parameter_prefix) + remaining = tool_text[param_start:] + + if ">" in remaining: + # We have the complete parameter name + name_end = remaining.find(">") + param_name_raw = remaining[:name_end] + self.current_param_name = self._extract_name(param_name_raw) + + # Find the parameter value + value_start = param_start + name_end + 1 + value_text = tool_text[value_start:] + if value_text.startswith("\n"): + value_text = value_text[1:] + + # Find where this parameter ends + param_end_idx = value_text.find(self.parameter_end_token) + if param_end_idx == -1: + # No closing tag, look for next parameter or function end + next_param_idx = value_text.find(self.parameter_prefix) + func_end_idx = value_text.find(self.invoke_end_token) + + if next_param_idx != -1 and ( + func_end_idx == -1 or next_param_idx < func_end_idx + ): + param_end_idx = next_param_idx + elif func_end_idx != -1: + param_end_idx = func_end_idx + else: 
+ # Neither found, check if tool call is complete + if self.invoke_end_token in tool_text: + # Tool call and parameter is complete + param_end_idx = len(value_text) + else: + # Still streaming, wait for more content + return None + + if param_end_idx != -1: + # Complete parameter found + param_value = value_text[:param_end_idx] + if param_value.endswith("\n"): + param_value = param_value[:-1] + + # Store raw value for later processing + self.accumulated_params[self.current_param_name] = param_value + + # Get parameter configuration for type conversion + param_config = {} + if self.streaming_request and self.streaming_request.tools: + for tool in self.streaming_request.tools: + if ( + hasattr(tool, "function") + and tool.function.name == self.current_function_name + and hasattr(tool.function, "parameters") + ): + params = tool.function.parameters + if ( + isinstance(params, dict) + and "properties" in params + ): + param_config = params["properties"] + break + + # Get parameter type + param_type = "string" + if ( + self.current_param_name in param_config + and isinstance(param_config[self.current_param_name], dict) + and "type" in param_config[self.current_param_name] + ): + param_type = param_config[self.current_param_name]["type"] + + # Convert param value to appropriate type + converted_value = self._convert_param_value( + param_value, param_type + ) + + # Build JSON fragment based on the converted type + # Use json.dumps to properly serialize the value + serialized_value = json.dumps( + converted_value, ensure_ascii=False + ) + + if self.param_count == 0: + json_fragment = ( + f'"{self.current_param_name}": {serialized_value}' + ) + else: + json_fragment = ( + f', "{self.current_param_name}": {serialized_value}' + ) + + self.param_count += 1 + + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + function=DeltaFunctionCall(arguments=json_fragment), + ) + ] + ) + + return None diff --git 
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json
from collections.abc import Sequence
from typing import Any

import regex as re

from vllm.entrypoints.chat_utils import make_tool_call_id
from vllm.entrypoints.openai.protocol import (
    ChatCompletionRequest,
    DeltaFunctionCall,
    DeltaMessage,
    DeltaToolCall,
    ExtractedToolCallInformation,
    FunctionCall,
    ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
    ToolParser,
)
from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import AnyTokenizer

logger = init_logger(__name__)


class MinimaxToolParser(ToolParser):
    """Tool-call parser for MiniMax chat models.

    Tool calls are JSON objects carrying ``"name"`` and ``"arguments"`` keys,
    wrapped in a ``<tool_calls>`` ... ``</tool_calls>`` block.  Tool-call
    blocks that appear inside ``<think>`` ... ``</think>`` sections are
    stripped before parsing so that reasoning text never triggers a call.

    The parser keeps per-request streaming state on the instance
    (``streaming_state``, ``pending_buffer``, ``in_thinking_tag``).
    """

    # Reasoning-section delimiters.
    _THINK_START = "<think>"
    _THINK_END = "</think>"

    def __init__(self, tokenizer: AnyTokenizer):
        """
        Set up tag literals, regexes and streaming state.

        Args:
            tokenizer: Tokenizer of the served model

        Raises:
            ValueError: If the base class did not store a tokenizer
        """
        super().__init__(tokenizer)

        # Initialize streaming state for tracking tool call progress
        self.streaming_state: dict[str, Any] = {
            "current_tool_index": -1,  # Index of current tool being processed
            "tool_ids": [],  # List of tool call IDs
            "sent_tools": [],  # List of tools that have been sent
        }

        # Define tool call tokens and patterns.
        # NOTE: these literals must be the non-empty MiniMax tags.  With
        # empty strings every ``in`` / ``count`` / ``endswith`` check in this
        # class is vacuously true and no tool call is ever parsed.
        self.tool_call_start_token = "<tool_calls>"
        self.tool_call_end_token = "</tool_calls>"
        self.tool_call_regex = re.compile(
            r"<tool_calls>(.*?)</tool_calls>|<tool_calls>(.*)", re.DOTALL
        )
        self.thinking_tag_pattern = r"<think>(.*?)</think>"
        self.tool_name_pattern = re.compile(r'"name":\s*"([^"]+)"')
        self.tool_args_pattern = re.compile(r'"arguments":\s*')

        # Buffer for handling partial tool calls during streaming
        self.pending_buffer = ""
        self.in_thinking_tag = False

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ToolParser "
                "constructor during construction."
            )

        # Get token IDs for tool call start/end tokens
        self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token)
        self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)

        if self.tool_call_start_token_id is None or self.tool_call_end_token_id is None:
            logger.warning(
                "Minimax Tool parser could not locate tool call start/end "
                "tokens in the tokenizer. Falling back to string matching."
            )

    def preprocess_model_output(self, model_output: str) -> str:
        """
        Preprocess model output by removing tool calls from thinking tags.

        Args:
            model_output: Raw model output string

        Returns:
            Preprocessed model output with tool calls removed from thinking tags
        """

        def remove_tool_calls_from_think(match):
            think_content = match.group(1)
            cleaned_content = re.sub(
                r"<tool_calls>.*?</tool_calls>", "", think_content, flags=re.DOTALL
            )
            return f"{self._THINK_START}{cleaned_content}{self._THINK_END}"

        return re.sub(
            self.thinking_tag_pattern,
            remove_tool_calls_from_think,
            model_output,
            flags=re.DOTALL,
        )

    def _clean_duplicate_braces(self, args_text: str) -> str:
        """
        Clean duplicate closing braces from arguments text.

        Trailing ``}`` characters are dropped one at a time while the text
        ends with ``}}`` until the remainder parses as JSON.

        Args:
            args_text: Raw arguments text

        Returns:
            Cleaned arguments text with proper JSON formatting
        """
        args_text = args_text.strip()
        if not args_text:
            return args_text

        try:
            json.loads(args_text)
            return args_text
        except json.JSONDecodeError:
            pass

        while args_text.endswith("}}"):
            candidate = args_text[:-1]
            try:
                json.loads(candidate)
                return candidate
            except json.JSONDecodeError:
                args_text = candidate

        return args_text

    def _clean_delta_braces(self, delta_text: str) -> str:
        """
        Clean delta text by removing excessive closing braces.

        Only deltas made up solely of ``}`` and whitespace are touched; a run
        of several braces is collapsed to a single one.

        Args:
            delta_text: Delta text to clean

        Returns:
            Cleaned delta text
        """
        if not delta_text:
            return delta_text

        delta_stripped = delta_text.strip()

        if delta_stripped and all(c in "}\n\r\t " for c in delta_stripped):
            brace_count = delta_stripped.count("}")
            if brace_count > 1:
                return "}\n" if delta_text.endswith("\n") else "}"

        return delta_text

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """
        Extract tool calls from model output for non-streaming mode.

        Each line inside a tool-call block that starts with ``{`` and ends
        with ``}`` is parsed as JSON; objects lacking ``name`` or
        ``arguments`` are ignored.

        Args:
            model_output: Complete model output
            request: Chat completion request

        Returns:
            ExtractedToolCallInformation containing tool calls and content
        """
        processed_output = self.preprocess_model_output(model_output)

        if self.tool_call_start_token not in processed_output:
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

        try:
            function_call_tuples = self.tool_call_regex.findall(processed_output)

            raw_function_calls = []
            for match in function_call_tuples:
                # Group 0 is a closed block, group 1 an unterminated one.
                tool_call_content = match[0] if match[0] else match[1]
                if tool_call_content.strip():
                    lines = tool_call_content.strip().split("\n")
                    for line in lines:
                        line = line.strip()
                        if line and line.startswith("{") and line.endswith("}"):
                            try:
                                parsed_call = json.loads(line)
                                raw_function_calls.append(parsed_call)
                            except json.JSONDecodeError:
                                continue

            tool_calls = []
            for function_call in raw_function_calls:
                if "name" in function_call and "arguments" in function_call:
                    tool_calls.append(
                        ToolCall(
                            type="function",
                            function=FunctionCall(
                                name=function_call["name"],
                                arguments=json.dumps(
                                    function_call["arguments"], ensure_ascii=False
                                ),
                            ),
                        )
                    )

            processed_pos = processed_output.find(self.tool_call_start_token)
            if processed_pos != -1:
                processed_content = processed_output[:processed_pos].strip()

                if processed_content:
                    # Map the last non-empty pre-tool line back onto the raw
                    # output so the returned content keeps the original text.
                    lines = processed_content.split("\n")
                    for line in reversed(lines):
                        line = line.strip()
                        if line:
                            pos = model_output.find(line)
                            if pos != -1:
                                content = model_output[: pos + len(line)]
                                break
                    else:
                        content = ""
                else:
                    content = ""
            else:
                content = model_output

            return ExtractedToolCallInformation(
                tools_called=len(tool_calls) > 0,
                tool_calls=tool_calls,
                content=content.strip() if content.strip() else None,
            )

        except Exception:
            logger.exception(
                "An unexpected error occurred during tool call extraction."
            )
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

    def _update_thinking_state(self, text: str) -> None:
        """
        Update the thinking tag state based on text content.

        Args:
            text: Text to analyze for thinking tags
        """
        open_count = text.count(self._THINK_START)
        close_count = text.count(self._THINK_END)
        # A delta that just closed the section is still treated as thinking.
        self.in_thinking_tag = open_count > close_count or (
            open_count == close_count and text.endswith(self._THINK_END)
        )

    def _is_potential_tag_start(self, text: str) -> bool:
        """
        Check if text might be the start of a tool call tag.

        Args:
            text: Text to check

        Returns:
            True if a suffix of text is a proper prefix of a tool call tag
        """
        for tag in [self.tool_call_start_token, self.tool_call_end_token]:
            if any(
                tag.startswith(text[-i:])
                for i in range(1, min(len(text) + 1, len(tag)))
            ):
                return True
        return False

    def _should_buffer_content(self, delta_text: str) -> bool:
        """
        Determine if content should be buffered for later processing.

        Args:
            delta_text: Delta text to check

        Returns:
            True if content should be buffered
        """
        if self.in_thinking_tag:
            return False
        return bool(
            self.pending_buffer
            or self.tool_call_start_token in delta_text
            or self.tool_call_end_token in delta_text
            or delta_text.startswith("<")
        )

    def _split_content_for_buffering(self, delta_text: str) -> tuple[str, str]:
        """
        Split delta text into safe content and potential tag content.

        Args:
            delta_text: Delta text to split

        Returns:
            Tuple of (safe_content, potential_tag_content)
        """
        if self.in_thinking_tag:
            return delta_text, ""

        for tag in [self.tool_call_start_token, self.tool_call_end_token]:
            for i in range(1, len(tag)):
                tag_prefix = tag[:i]
                pos = delta_text.rfind(tag_prefix)
                if pos != -1 and tag.startswith(delta_text[pos:]):
                    return delta_text[:pos], delta_text[pos:]
        return delta_text, ""

    def _process_buffer(self, new_content: str) -> str:
        """
        Process buffered content and return output content.

        Complete tool call tags are removed from the buffer; a trailing
        fragment that could still grow into a tag stays buffered.

        Args:
            new_content: New content to add to buffer

        Returns:
            Processed output content
        """
        self.pending_buffer += new_content
        output_content = ""

        if self.in_thinking_tag:
            output_content = self.pending_buffer
            self.pending_buffer = ""
            return output_content

        while self.pending_buffer:
            start_pos = self.pending_buffer.find(self.tool_call_start_token)
            end_pos = self.pending_buffer.find(self.tool_call_end_token)

            if start_pos != -1 and (end_pos == -1 or start_pos < end_pos):
                tag_pos, tag_len = start_pos, len(self.tool_call_start_token)
            elif end_pos != -1:
                tag_pos, tag_len = end_pos, len(self.tool_call_end_token)
            else:
                if self._is_potential_tag_start(self.pending_buffer):
                    break
                output_content += self.pending_buffer
                self.pending_buffer = ""
                break

            output_content += self.pending_buffer[:tag_pos]
            self.pending_buffer = self.pending_buffer[tag_pos + tag_len :]

        return output_content

    def _reset_streaming_state(self) -> None:
        """Reset the streaming state to initial values."""
        self.streaming_state = {
            "current_tool_index": -1,
            "tool_ids": [],
            "sent_tools": [],
        }

    def _advance_to_next_tool(self) -> None:
        """Advance to the next tool in the streaming sequence."""
        self.streaming_state["current_tool_index"] = (
            int(self.streaming_state["current_tool_index"]) + 1
        )

    def _set_current_tool_index(self, index: int) -> None:
        """
        Set the current tool index.

        Args:
            index: Tool index to set
        """
        self.streaming_state["current_tool_index"] = index

    def _get_current_tool_index(self) -> int:
        """
        Get the current tool index.

        Returns:
            Current tool index
        """
        return int(self.streaming_state["current_tool_index"])

    def _get_next_unsent_tool_index(self, tool_count: int) -> int:
        """
        Get the index of the next unsent tool.

        Args:
            tool_count: Total number of tools

        Returns:
            Index of next unsent tool, or -1 if all tools sent
        """
        sent_tools = list(self.streaming_state["sent_tools"])
        for i in range(tool_count):
            if i < len(sent_tools):
                if not sent_tools[i]["sent_name"]:
                    return i
            else:
                return i
        return -1

    def _ensure_state_arrays(self, tool_count: int) -> None:
        """
        Ensure state arrays have sufficient capacity for tool_count tools.

        Args:
            tool_count: Number of tools to prepare for
        """
        sent_tools = list(self.streaming_state["sent_tools"])
        tool_ids = list(self.streaming_state["tool_ids"])

        while len(sent_tools) < tool_count:
            sent_tools.append(
                {
                    "sent_name": False,
                    "sent_arguments": "",
                    "id": make_tool_call_id(),
                }
            )

        while len(tool_ids) < tool_count:
            tool_ids.append(None)

        self.streaming_state["sent_tools"] = sent_tools
        self.streaming_state["tool_ids"] = tool_ids

    def _detect_tools_in_text(self, text: str) -> int:
        """
        Detect the number of tools in text by counting name patterns.

        Args:
            text: Text to analyze

        Returns:
            Number of tools detected
        """
        matches = self.tool_name_pattern.findall(text)
        return len(matches)

    def _find_tool_boundaries(self, text: str) -> list[tuple[int, int]]:
        """
        Find the boundaries of tool calls in text.

        A balanced ``{...}`` object is reported when it contains both
        ``"name"`` and ``"arguments"``.  An object still open at the end of
        text is reported once ``"name"`` has been seen.

        Args:
            text: Text to analyze

        Returns:
            List of (start, end) positions for tool calls
        """
        boundaries = []
        i = 0
        while i < len(text):
            if text[i] == "{":
                start = i
                depth = 0
                has_name = False

                while i < len(text):
                    if text[i] == "{":
                        depth += 1
                    elif text[i] == "}":
                        depth -= 1
                        if depth == 0:
                            end = i + 1
                            segment = text[start:end]
                            if '"name"' in segment and '"arguments"' in segment:
                                boundaries.append((start, end))
                            # i stays on the closing brace; the outer loop
                            # steps past it through its else branch.
                            break

                    if not has_name and '"name"' in text[start : i + 1]:
                        has_name = True

                    i += 1

                if depth > 0 and has_name:
                    boundaries.append((start, i))
            else:
                i += 1
        return boundaries

    def _extract_tool_args(self, tool_content: str, args_match: re.Match[str]) -> str:
        """
        Extract tool arguments from tool content.

        Args:
            tool_content: Tool call content
            args_match: Regex match for arguments pattern

        Returns:
            Extracted arguments as string
        """
        args_start_pos = args_match.end()
        remaining_content = tool_content[args_start_pos:]

        if remaining_content.strip().startswith("{"):
            depth = 0
            for i, char in enumerate(remaining_content):
                if char == "{":
                    depth += 1
                elif char == "}":
                    depth -= 1
                    if depth == 0:
                        return remaining_content[: i + 1]
        else:
            args_end = remaining_content.find("}")
            if args_end > 0:
                return remaining_content[:args_end].strip()

        # Unterminated object: drop trailing braces so the partial text never
        # looks complete to the caller.
        return remaining_content.rstrip("}").strip()

    def _get_current_tool_content(
        self, text: str, tool_index: int
    ) -> tuple[str | None, str | None]:
        """
        Get the content of a specific tool by index.

        Args:
            text: Text containing tool calls
            tool_index: Index of tool to extract

        Returns:
            Tuple of (tool_name, tool_arguments) or (None, None) if not found
        """
        boundaries = self._find_tool_boundaries(text)

        if tool_index >= len(boundaries):
            return None, None

        start, end = boundaries[tool_index]
        tool_content = text[start:end]

        name_match = self.tool_name_pattern.search(tool_content)
        name = name_match.group(1) if name_match else None

        args_match = self.tool_args_pattern.search(tool_content)
        if args_match:
            try:
                args_text = self._extract_tool_args(tool_content, args_match)
                return name, args_text
            except Exception:
                remaining_content = tool_content[args_match.end() :]
                args_text = remaining_content.rstrip("}").strip()
                return name, args_text

        return name, None

    def _handle_tool_name_streaming(
        self, tool_content: str, tool_count: int
    ) -> DeltaMessage | None:
        """
        Handle streaming of tool names.

        Args:
            tool_content: Content containing tool calls
            tool_count: Total number of tools

        Returns:
            DeltaMessage with tool name or None if no tool to stream
        """
        next_idx = self._get_next_unsent_tool_index(tool_count)

        if next_idx == -1:
            return None

        boundaries = self._find_tool_boundaries(tool_content)
        if next_idx >= len(boundaries):
            return None

        tool_name, _ = self._get_current_tool_content(tool_content, next_idx)
        if not tool_name:
            return None

        self._set_current_tool_index(next_idx)
        sent_tools = list(self.streaming_state["sent_tools"])
        tool_ids = list(self.streaming_state["tool_ids"])

        tool_id = sent_tools[next_idx]["id"]
        tool_ids[next_idx] = tool_id
        sent_tools[next_idx]["sent_name"] = True

        self.streaming_state["sent_tools"] = sent_tools
        self.streaming_state["tool_ids"] = tool_ids

        return DeltaMessage(
            tool_calls=[
                DeltaToolCall(
                    index=next_idx,
                    type="function",
                    id=tool_id,
                    function=DeltaFunctionCall(name=tool_name).model_dump(
                        exclude_none=True
                    ),
                )
            ]
        )

    def _commit_args_delta(
        self,
        tool_idx: int,
        sent_tools: list,
        full_args: str,
        args_delta: str,
    ) -> DeltaMessage:
        """Record full_args as sent for tool_idx and wrap args_delta."""
        sent_tools[tool_idx]["sent_arguments"] = full_args
        self.streaming_state["sent_tools"] = sent_tools

        if full_args.endswith("}"):
            self._advance_to_next_tool()

        return DeltaMessage(
            tool_calls=[
                DeltaToolCall(
                    index=tool_idx,
                    function=DeltaFunctionCall(arguments=args_delta).model_dump(
                        exclude_none=True
                    ),
                )
            ]
        )

    def _handle_tool_args_streaming(
        self, tool_content: str, tool_count: int
    ) -> DeltaMessage | None:
        """
        Handle streaming of tool arguments.

        Args:
            tool_content: Content containing tool calls
            tool_count: Total number of tools

        Returns:
            DeltaMessage with tool arguments or None if no arguments to stream
        """
        current_idx = self._get_current_tool_index()

        if current_idx < 0 or current_idx >= tool_count:
            return None

        tool_name, tool_args = self._get_current_tool_content(tool_content, current_idx)
        if not tool_name or tool_args is None:
            return None

        sent_tools = list(self.streaming_state["sent_tools"])

        if not sent_tools[current_idx]["sent_name"]:
            return None

        clean_args = self._clean_duplicate_braces(tool_args)
        sent_args = sent_tools[current_idx]["sent_arguments"]

        if clean_args != sent_args:
            if sent_args and clean_args.startswith(sent_args):
                args_delta = extract_intermediate_diff(clean_args, sent_args)
                if args_delta:
                    return self._commit_args_delta(
                        current_idx,
                        sent_tools,
                        clean_args,
                        self._clean_delta_braces(args_delta),
                    )
            elif not sent_args and clean_args:
                return self._commit_args_delta(
                    current_idx,
                    sent_tools,
                    clean_args,
                    self._clean_delta_braces(clean_args),
                )

        return None

    def _is_end_tool_calls(self, current_text: str) -> bool:
        """
        Check whether a tool call end token occurs outside thinking tags.

        Args:
            current_text: Current text to search

        Returns:
            True if at least one end token lies outside every thinking region
        """
        if self.tool_call_end_token not in current_text:
            return False

        end_token_positions = []
        search_start = 0
        while True:
            pos = current_text.find(self.tool_call_end_token, search_start)
            if pos == -1:
                break
            end_token_positions.append(pos)
            search_start = pos + 1

        think_regions = []
        for match in re.finditer(
            self.thinking_tag_pattern, current_text, flags=re.DOTALL
        ):
            think_regions.append((match.start(), match.end()))

        for pos in end_token_positions:
            in_think = any(
                pos >= t_start and pos < t_end for t_start, t_end in think_regions
            )
            if not in_think:
                return True

        return False

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """
        Extract tool calls from streaming model output.

        Args:
            previous_text: Text generated before this delta
            current_text: Text generated so far, including this delta
            delta_text: Newly generated text
            previous_token_ids: Token IDs before this delta
            current_token_ids: Token IDs so far, including this delta
            delta_token_ids: Token IDs of this delta
            request: Chat completion request

        Returns:
            DeltaMessage with content or tool call fragments, or None when
            nothing should be emitted for this delta
        """
        self._update_thinking_state(current_text)

        if self.in_thinking_tag:
            return DeltaMessage(content=delta_text)

        if self._should_buffer_content(delta_text):
            buffered_output = self._process_buffer(delta_text)
            return DeltaMessage(content=buffered_output) if buffered_output else None

        if self._is_end_tool_calls(current_text):
            return DeltaMessage(content=delta_text)

        safe_content, potential_tag = self._split_content_for_buffering(delta_text)
        if potential_tag:
            self.pending_buffer += potential_tag
            return DeltaMessage(content=safe_content) if safe_content else None

        processed_current_text = self.preprocess_model_output(current_text)

        if self.tool_call_start_token not in processed_current_text:
            if (
                self.tool_call_end_token in delta_text
                and self.tool_call_start_token in current_text
            ):
                return None
            if delta_text.strip() == "" and self.tool_call_start_token in current_text:
                return None
            if (
                self._get_current_tool_index() != -1
                and self.tool_call_end_token in current_text
            ):
                self._reset_streaming_state()
            return DeltaMessage(content=delta_text)

        if (
            self.tool_call_start_token_id is not None
            and self.tool_call_start_token_id in delta_token_ids
            and len(delta_token_ids) == 1
        ):
            return None

        original_tool_start = self._find_tool_start_outside_thinking(current_text)
        if original_tool_start is None:
            return None

        content_before_tools = self._extract_content_before_tools(
            current_text, delta_text, original_tool_start
        )
        if content_before_tools:
            return DeltaMessage(content=content_before_tools)

        try:
            tool_content = self._extract_tool_content(current_text, original_tool_start)
            current_tools_count = self._detect_tools_in_text(tool_content)

            if current_tools_count == 0:
                return None

            if self._get_current_tool_index() == -1:
                self._reset_streaming_state()

            self._ensure_state_arrays(current_tools_count)

            return self._handle_tool_name_streaming(
                tool_content, current_tools_count
            ) or self._handle_tool_args_streaming(tool_content, current_tools_count)

        except Exception:
            # One message string: extra positional arguments to
            # logger.exception are %-format arguments, not more text.
            logger.exception(
                "An unexpected error occurred during streaming tool call handling."
            )
            return None

    def _find_tool_start_outside_thinking(self, current_text: str) -> int | None:
        """
        Find the start position of tool calls outside of thinking tags.

        Args:
            current_text: Current text to search

        Returns:
            Position of tool call start or None if not found
        """
        # The regions depend only on current_text, so compute them once.
        think_regions = [
            (m.start(), m.end())
            for m in re.finditer(
                self.thinking_tag_pattern, current_text, flags=re.DOTALL
            )
        ]

        search_start = 0
        while True:
            pos = current_text.find(self.tool_call_start_token, search_start)
            if pos == -1:
                return None

            in_think = any(t_start <= pos < t_end for t_start, t_end in think_regions)
            if not in_think:
                return pos

            search_start = pos + 1

    def _extract_content_before_tools(
        self, current_text: str, delta_text: str, tool_start: int
    ) -> str | None:
        """
        Extract content that appears before tool calls.

        Args:
            current_text: Current text
            delta_text: Delta text
            tool_start: Start position of tools

        Returns:
            Content before tools or None
        """
        if tool_start > 0:
            delta_start_pos = len(current_text) - len(delta_text)
            if delta_start_pos < tool_start:
                content_part = delta_text
                if delta_start_pos + len(delta_text) > tool_start:
                    content_part = delta_text[: tool_start - delta_start_pos]
                return content_part if content_part else None
        return None

    def _extract_tool_content(self, current_text: str, tool_start: int) -> str:
        """
        Extract tool content from current text starting at tool_start.

        Args:
            current_text: Current text
            tool_start: Start position of tool calls

        Returns:
            Extracted tool content
        """
        tool_content_start = tool_start + len(self.tool_call_start_token)
        tool_content = current_text[tool_content_start:]

        end_pos = tool_content.find(self.tool_call_end_token)
        if end_pos != -1:
            tool_content = tool_content[:end_pos]

        return tool_content
class MistralToolCall(ToolCall):
    """Tool call whose ``id`` follows the Mistral tool call id format.

    The ``id`` field defaults to a freshly generated 9-character id built
    from ``ALPHANUMERIC`` (ASCII letters and digits).
    """

    id: str = Field(default_factory=lambda: MistralToolCall.generate_random_id())

    @staticmethod
    def generate_random_id() -> str:
        """Return a random id of 9 characters drawn from ``ALPHANUMERIC``."""
        # Mistral Tool Call Ids must be alphanumeric with a length of 9.
        # https://github.com/mistralai/mistral-common/blob/21ee9f6cee3441e9bb1e6ed2d10173f90bd9b94b/src/mistral_common/protocol/instruct/validator.py#L299
        return "".join(choices(ALPHANUMERIC, k=9))

    @staticmethod
    def is_valid_id(id: str) -> bool:
        """Return True when ``id`` is exactly 9 ASCII alphanumeric characters.

        Args:
            id: Candidate tool call id.

        Returns:
            True if ``id`` could have been produced by
            ``generate_random_id``, False otherwise.
        """
        # str.isalnum() on its own also accepts non-ASCII letters and digits
        # (e.g. "é" or "٣"), which are outside the ASCII alphabet the ids are
        # generated from, so restrict the check to ASCII first.
        return len(id) == 9 and id.isascii() and id.isalnum()
def extract_tool_calls(
    self,
    model_output: str,
    request: ChatCompletionRequest,
) -> ExtractedToolCallInformation:
    """
    Extract the tool calls from a complete model response.

    The text following the tool call token is parsed as JSON exactly as the
    model produced it; no quote normalization is performed here, so the
    payload must already be valid JSON.

    Args:
        model_output: Full text generated by the model.
        request: The originating chat completion request (not read here).

    Returns:
        The parsed tool calls together with any text that preceded the tool
        call token. When the token is absent the output is returned
        unchanged as content. When no tool call can be parsed, the output
        with the token removed is returned as content instead.
    """

    # case -- if a tool call token is not present, return a text response
    if self.bot_token not in model_output:
        return ExtractedToolCallInformation(
            tools_called=False, tool_calls=[], content=model_output
        )

    # first remove the BOT token
    tool_content = model_output.replace(self.bot_token, "").strip()

    try:
        # we first try to directly load the json as parsing very nested
        # jsons is difficult
        try:
            if self.fn_name_regex:
                # fn_name is encoded outside serialized json dump
                # only arguments are serialized
                function_call_arr = [
                    {"name": fn_name, "arguments": json.loads(args)}
                    for fn_name, args in self.fn_name_regex.findall(tool_content)
                ]
            else:
                function_call_arr = json.loads(tool_content)
        except json.JSONDecodeError:
            # use a regex to find the part corresponding to the tool call.
            # NOTE: This use case should not happen if the model is trained
            # correctly. It's an easy possible fix so it's included, but
            # can be brittle for very complex / highly nested tool calls
            raw_tool_calls = self.tool_call_regex.findall(tool_content)
            if not raw_tool_calls:
                # Say what went wrong instead of surfacing a bare IndexError.
                raise ValueError(
                    "No JSON tool call array found after the tool call token"
                ) from None
            function_call_arr = json.loads(raw_tool_calls[0])

        if not function_call_arr:
            # Reporting tools_called=True with zero calls would hand the
            # client neither a tool call nor the generated text; fall back
            # to the plain-text response below instead.
            raise ValueError("No tool call could be parsed from the response")

        # Tool Call
        tool_calls: list[MistralToolCall] = [
            MistralToolCall(
                type="function",
                function=FunctionCall(
                    name=raw_function_call["name"],
                    # function call args are JSON but as a string
                    arguments=json.dumps(
                        raw_function_call["arguments"], ensure_ascii=False
                    ),
                ),
            )
            for raw_function_call in function_call_arr
        ]

        # get any content before the tool call
        content = model_output.split(self.bot_token)[0]
        return ExtractedToolCallInformation(
            tools_called=True,
            tool_calls=tool_calls,
            content=content if len(content) > 0 else None,
        )

    except Exception:
        logger.exception("Error in extracting tool call from response.")
        # return information to just treat the tool call as regular JSON
        return ExtractedToolCallInformation(
            tools_called=False, tool_calls=[], content=tool_content
        )
+ flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR + try: + # replace BOT token with empty string, and convert single quotes + # to double to allow parsing as JSON since mistral uses single + # quotes instead of double for tool calls + parsable_arr = current_text.split(self.bot_token)[-1] + + # tool calls are generated in an array, so do partial JSON + # parsing on the entire array + try: + tool_call_arr: list[dict] = partial_json_parser.loads( + parsable_arr, flags + ) + except partial_json_parser.core.exceptions.MalformedJSON: + logger.debug("not enough tokens to parse into JSON yet") + return None + + # select as the current tool call the one we're on the state at + + current_tool_call: dict = ( + tool_call_arr[self.current_tool_id] if len(tool_call_arr) > 0 else {} + ) + + # case -- if no tokens have been streamed for the tool, e.g. + # only the array brackets, stream nothing + if len(tool_call_arr) == 0: + return None + + # case: we are starting a new tool in the array + # -> array has > 0 length AND length has moved past cursor + elif ( + len(tool_call_arr) > 0 and len(tool_call_arr) > self.current_tool_id + 1 + ): + # if we're moving on to a new call, first make sure we + # haven't missed anything in the previous one that was + # auto-generated due to JSON completions, but wasn't + # streamed to the client yet. 
+ if self.current_tool_id >= 0: + diff: str | None = current_tool_call.get("arguments") + + if diff: + diff = json.dumps(diff, ensure_ascii=False).replace( + self.streamed_args_for_tool[self.current_tool_id], "" + ) + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=diff + ).model_dump(exclude_none=True), + ) + ] + ) + self.streamed_args_for_tool[self.current_tool_id] += diff + else: + delta = None + else: + delta = None + # re-set stuff pertaining to progress in the current tool + self.current_tool_id = len(tool_call_arr) - 1 + self.current_tool_name_sent = False + self.streamed_args_for_tool.append("") + logger.debug("starting on new tool %d", self.current_tool_id) + return delta + + # case: update an existing tool - this is handled below + + # if the current tool name hasn't been sent, send if available + # - otherwise send nothing + if not self.current_tool_name_sent: + function_name = current_tool_call.get("name") + if function_name: + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + type="function", + id=MistralToolCall.generate_random_id(), + function=DeltaFunctionCall( + name=function_name + ).model_dump(exclude_none=True), + ) + ] + ) + self.current_tool_name_sent = True + else: + delta = None + + # now we know we're on the same tool call and we're streaming + # arguments + else: + prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get( + "arguments" + ) + cur_arguments = current_tool_call.get("arguments") + + new_text = delta_text.replace("'", '"') + if '"}' in new_text: + new_text = new_text[: new_text.rindex('"}')] + + if not cur_arguments and not prev_arguments: + delta = None + elif not cur_arguments and prev_arguments: + logger.error( + "INVARIANT - impossible to have arguments reset mid-arguments" + ) + delta = None + elif cur_arguments and not prev_arguments: + cur_arguments_json = json.dumps(cur_arguments, 
ensure_ascii=False)[ + :-2 + ] + logger.debug("finding %s in %s", new_text, cur_arguments_json) + + if new_text not in cur_arguments_json: + return None + arguments_delta = cur_arguments_json[ + : cur_arguments_json.rindex(new_text) + len(new_text) + ] + logger.debug( + "First tokens in arguments received: %s", arguments_delta + ) + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=arguments_delta + ).model_dump(exclude_none=True), + ) + ] + ) + self.streamed_args_for_tool[self.current_tool_id] += arguments_delta + + elif cur_arguments and prev_arguments: + cur_args_json = json.dumps(cur_arguments, ensure_ascii=False) + prev_args_json = json.dumps(prev_arguments, ensure_ascii=False) + logger.debug( + "Searching for diff between \n%s\n%s", + cur_args_json, + prev_args_json, + ) + + argument_diff = extract_intermediate_diff( + cur_args_json, prev_args_json + ) + logger.debug("got arguments diff: %s", argument_diff) + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=argument_diff + ).model_dump(exclude_none=True), + ) + ] + ) + self.streamed_args_for_tool[self.current_tool_id] += argument_diff + else: + # try parsing it with regular JSON - if it works we're + # at the end, and we need to send the difference between + # tokens streamed so far and the valid JSON + delta = None + + # check to see if the name is defined and has been sent. 
class Olmo3PythonicToolParser(ToolParser):
    """
    Tool call parser for Olmo 3 models that produce tool calls as
    newline-separated pythonic strings wrapped in
    <function_calls>...</function_calls> tags.

    Used when --enable-auto-tool-choice is set together with the
    --tool-call-parser name this class is registered under.
    NOTE(review): the registration is outside this block; the original
    docstring said "pythonic", which looks copied from
    pythonic_tool_parser.py - confirm the actual name.

    Code copied from pythonic_tool_parser.py and updated to handle
    - newline separated pythonic tool calls.
    - argument values being null/true/false instead of Pythonic literals.
    """

    # TODO(mdepinet): Possible future improvements:
    #   1. Support text + tools separated by either <|python_tag|> or \n\n
    #   2. Support tools outside of a list (or separated by a semicolon).
    #      This depends on item 1 for consistent streaming.
    # Neither of these are necessary for e.g. ToolACE, but both would help make
    # Llama3.2 models more reliable.

    TOOL_CALL_REGEX = re.compile(
        r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]",
        re.DOTALL,
    )

    # Tags the model wraps its tool calls in. With empty tag literals the
    # "remove xml tags" steps below match the empty string, which turns every
    # complete response into "[]" and never yields a tool call.
    TOOL_CALLS_OPEN_TAG = "<function_calls>"
    TOOL_CALLS_CLOSE_TAG = "</function_calls>"

    def __init__(self, tokenizer: PreTrainedTokenizerBase):
        super().__init__(tokenizer)

    # Rename for readability. This is NOT a tool id.
    @property
    def current_tool_index(self) -> int:
        return self.current_tool_id

    @current_tool_index.setter
    def current_tool_index(self, value: int) -> None:
        self.current_tool_id = value

    def extract_tool_calls(
        self, model_output: str, request: ChatCompletionRequest
    ) -> ExtractedToolCallInformation:
        """
        Extract the tool calls from a complete model response.

        Args:
            model_output: Full text generated by the model.
            request: The originating chat completion request (not read here).

        Returns:
            The parsed tool calls with no content, or the untouched model
            output as content when it does not look like (or fails to parse
            as) a sequence of pythonic calls.
        """
        original_model_output = model_output
        # Remove xml tags.
        match = re.search(
            re.escape(self.TOOL_CALLS_OPEN_TAG)
            + r"(.*?)"
            + re.escape(self.TOOL_CALLS_CLOSE_TAG),
            model_output,
            re.DOTALL,
        )
        if match:
            model_output = match.group(1).strip()
        # Make the newline separated function calls into a list.
        model_output = ", ".join(
            [line.strip() for line in model_output.splitlines() if line.strip()]
        )
        model_output = f"[{model_output}]"

        is_tool_call_pattern = False
        try:
            is_tool_call_pattern = (
                self.TOOL_CALL_REGEX.match(
                    model_output, timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS
                )
                is not None
            )
        except TimeoutError:
            logger.warning("Regex timeout occurred when matching tool call pattern.")
            logger.debug(
                "Regex timeout occurred when matching user input: %s", model_output
            )

        if not is_tool_call_pattern:
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=original_model_output
            )

        try:
            module = ast.parse(model_output)
            parsed = getattr(module.body[0], "value", None)
            if isinstance(parsed, ast.List) and all(
                isinstance(e, ast.Call) for e in parsed.elts
            ):
                return ExtractedToolCallInformation(
                    tools_called=True,
                    tool_calls=[
                        _handle_single_tool(e)  # type: ignore
                        for e in parsed.elts
                    ],
                    content=None,
                )
            else:
                raise _UnexpectedAstError(
                    "Tool output must be a list of function calls"
                )
        except Exception:
            logger.exception("Error in extracting tool call from response.")
            # Treat as regular text
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=original_model_output
            )

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """
        Compute the streaming delta for the text generated so far.

        Only ``current_text`` and ``delta_text`` are read. Progress is kept
        in ``current_tool_id``, ``streamed_args_for_tool`` and
        ``prev_tool_call_arr``, which this class does not initialize itself
        (presumably the ``ToolParser`` base class does - confirm).

        Returns:
            A content delta when the text does not start with "<", a delta
            carrying tool call fragments when new ones are available, an
            empty content delta once all calls are complete, or None when
            there is nothing to send for this chunk.
        """
        # All function calls start with the <function_calls> tag.
        # But since this is streaming, we may have seen only part of the tag.
        if not current_text.startswith("<"):
            return DeltaMessage(content=delta_text)

        if len(current_text) < len(
            self.TOOL_CALLS_OPEN_TAG
        ) and self.TOOL_CALLS_OPEN_TAG.startswith(current_text):
            # The opening tag is still being generated. Parsing it as Python
            # can only fail, so wait quietly instead of logging an exception
            # for every chunk of the tag.
            return None

        try:
            # Remove xml tags.
            if current_text.startswith(self.TOOL_CALLS_OPEN_TAG):
                current_text = current_text[len(self.TOOL_CALLS_OPEN_TAG) :]
            if current_text.endswith(self.TOOL_CALLS_CLOSE_TAG):
                current_text = current_text[: -len(self.TOOL_CALLS_CLOSE_TAG)]

            valid_and_added_text = _make_valid_python(current_text)
            if valid_and_added_text is None:
                return None
            valid_text, added_text = valid_and_added_text

            # Make the newline separated function calls into a list.
            valid_text = ", ".join(
                [line.strip() for line in valid_text.splitlines() if line.strip()]
            )
            valid_text = f"[{valid_text}]"
            module = ast.parse(valid_text)
            parsed = getattr(module.body[0], "value", None)
            if not isinstance(parsed, ast.List) or not all(
                isinstance(e, ast.Call) for e in parsed.elts
            ):
                raise _UnexpectedAstError(
                    "Tool output must be a sequence of newline-separated calls"
                )
            tool_calls = [
                _handle_single_tool(e)  # type: ignore
                for e in parsed.elts
            ]

            tool_deltas = []
            for index, new_call in enumerate(tool_calls):
                if index < self.current_tool_index:
                    continue

                self.current_tool_index = index
                if len(self.streamed_args_for_tool) == index:
                    self.streamed_args_for_tool.append("")

                new_call_complete = index < len(tool_calls) - 1 or ")" not in added_text
                if new_call_complete:
                    self.current_tool_index += 1

                withheld_suffix = added_text[:-1] if not new_call_complete else ""
                if not new_call_complete and added_text[-1] == ")":
                    # Function call is incomplete. Withhold the closing bracket.
                    withheld_suffix = withheld_suffix + "}"
                # Strings get single quotes in the model-produced string.
                # JSON requires double quotes.
                withheld_suffix = withheld_suffix.replace("'", '"')
                delta = _compute_tool_delta(
                    self.streamed_args_for_tool[index], new_call, index, withheld_suffix
                )

                if delta is not None:
                    tool_deltas.append(delta)
                    if (
                        delta.function is not None
                        and delta.function.arguments is not None
                    ):
                        self.streamed_args_for_tool[index] += delta.function.arguments

            # HACK: serving_chat.py inspects the internal state of tool parsers
            # when determining its final streaming delta, automatically
            # adding autocompleted JSON.
            # These two lines avoid that nonsense while ensuring finish_reason
            # is set to tool_calls when at least one tool is called.
            if tool_deltas and not self.prev_tool_call_arr:
                self.prev_tool_call_arr = [{"arguments": {}}]

            if tool_deltas:
                return DeltaMessage(tool_calls=tool_deltas)
            elif not added_text and self.current_tool_id > 0:
                # Return an empty DeltaMessage once the tool calls are all done
                # so that finish_reason gets set.
                return DeltaMessage(content="")
            else:
                return None
        except Exception:
            logger.exception("Error trying to handle streaming tool call.")
            logger.debug(
                "Skipping chunk as a result of tool streaming extraction error"
            )
            return None
+ elif isinstance(val, ast.Name) and val.id in ["null", "true", "false"]: + if val.id == "null": + return None + elif val.id == "true": + return True + elif val.id == "false": + return False + else: + raise _UnexpectedAstError("Tool call arguments must be literals") + + +def _handle_single_tool(call: ast.Call) -> ToolCall: + if not isinstance(call.func, ast.Name): + raise _UnexpectedAstError("Invalid tool call name") + function_name = call.func.id + arguments = {} + for keyword in call.keywords: + arguments[keyword.arg] = _get_parameter_value(keyword.value) + return ToolCall( + type="function", + function=FunctionCall( + name=function_name, arguments=json.dumps(arguments, ensure_ascii=False) + ), + ) + + +def _make_valid_python(text: str) -> tuple[str, str] | None: + bracket_stack = [] + for index, char in enumerate(text): + if char in {"[", "(", "{"}: + bracket_stack.append(char) + elif char == "]": + if not bracket_stack or bracket_stack.pop() != "[": + raise _UnexpectedAstError("Mismatched square brackets") + elif char == ")": + if not bracket_stack or bracket_stack.pop() != "(": + raise _UnexpectedAstError("Mismatched parentheses") + elif char == "}": + if not bracket_stack or bracket_stack.pop() != "{": + raise _UnexpectedAstError("Mismatched curly braces") + elif char in {"'", '"'}: + if bracket_stack and bracket_stack[-1] == char: + if index > 0 and text[index - 1] == "\\": + # Treat an escaped quote as a regular character + pass + else: + bracket_stack.pop() + elif bracket_stack and bracket_stack[-1] in {"'", '"'}: + # Double quote within a single quote string or vice versa. + pass + else: + bracket_stack.append(char) + + text = text.rstrip() + if text.endswith("=") or text.endswith(":"): + # Since we have no type information for this property/parameter value, + # we can't fill in a valid value. 
+ return None + if bracket_stack and bracket_stack[-1] == "{": + trailing_dict_text = text[: text.rfind("{")] + num_keys = trailing_dict_text.count(":") + num_values = trailing_dict_text.count(",") + if num_keys <= num_values: + return None # Incomplete property name within parameter value + if bracket_stack and bracket_stack[-1] == "(": + trailing_params_text = text[: text.rfind("(")] + num_full_param_names = trailing_params_text.count("=") + num_full_param_values = trailing_params_text.count(",") + if num_full_param_names <= num_full_param_values: + return None # Incomplete parameter name + if text.endswith(","): + text = text[:-1] + if ( + bracket_stack + and bracket_stack[-1] == "[" + and not text.endswith("[") + and not text.endswith(")") + ): + return None # Incomplete function name + + added_text = "" + for char in reversed(bracket_stack): + if char == "[": + added_text += "]" + elif char == "(": + added_text += ")" + elif char == "{": + added_text += "}" + elif char == "'": + added_text += "'" + elif char == '"': + added_text += '"' + + return text + added_text, added_text + + +def _compute_tool_delta( + previously_sent_args: str, new_call: ToolCall, index: int, withheld_suffix: str +) -> DeltaToolCall | None: + new_call_args = new_call.function.arguments + if withheld_suffix: + assert new_call_args.endswith(withheld_suffix) + new_call_args = new_call_args[: -len(withheld_suffix)] + if not previously_sent_args: + return DeltaToolCall( + id=new_call.id, + type="function", + index=index, + function=DeltaFunctionCall( + name=new_call.function.name, + arguments=new_call_args, + ), + ) + + arg_diff = new_call_args[len(previously_sent_args) :] + return ( + DeltaToolCall( + id=None, index=index, function=DeltaFunctionCall(arguments=arg_diff) + ) + if arg_diff + else None + ) diff --git a/entrypoints/openai/tool_parsers/openai_tool_parser.py b/entrypoints/openai/tool_parsers/openai_tool_parser.py new file mode 100644 index 0000000..d1b36a2 --- /dev/null +++ 
class OpenAIToolParser(ToolParser):
    """Tool parser that reads tool calls from token ids.

    The token ids are turned into messages by ``parse_output_into_messages``.
    Messages addressed to a ``functions.<name>`` recipient become tool calls,
    and the text of a message on the ``final`` channel becomes the content.
    """

    # Prefix of the recipient of a message that is a function tool call.
    _FUNCTION_RECIPIENT_PREFIX = "functions."

    def __init__(self, tokenizer: "AnyTokenizer"):
        super().__init__(tokenizer)

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
        token_ids: Sequence[int] | None = None,
    ) -> ExtractedToolCallInformation:
        """Extract tool calls and final content from generated token ids.

        Args:
            model_output: Generated text (not read; parsing is token based).
            request: The originating chat completion request (not read here).
            token_ids: Generated token ids to parse.

        Returns:
            The tool calls found, plus the text of the last non-tool message
            on the ``final`` channel (or None if there is none).

        Raises:
            NotImplementedError: If ``token_ids`` is None.
        """
        if token_ids is None:
            raise NotImplementedError(
                "OpenAIToolParser requires token IDs and does not support text-based extraction."  # noqa: E501
            )

        parser = parse_output_into_messages(token_ids)
        tool_calls = []
        final_content = None
        prefix = self._FUNCTION_RECIPIENT_PREFIX

        for msg in parser.messages:
            if len(msg.content) < 1:
                continue
            msg_text = msg.content[0].text
            if msg.recipient and msg.recipient.startswith(prefix):
                # If no content-type is given assume JSON, as that's the
                # most common case with gpt-oss models.
                if not msg.content_type or "json" in msg.content_type:
                    # load and dump the JSON text to check validity and
                    # remove any extra newlines or other odd formatting
                    try:
                        tool_args = json.dumps(json.loads(msg_text))
                    except json.JSONDecodeError:
                        logger.exception(
                            "Error decoding JSON tool call from response."
                        )
                        tool_args = msg_text
                else:
                    tool_args = msg_text
                tool_calls.append(
                    ToolCall(
                        type="function",
                        function=FunctionCall(
                            # Strip only the leading prefix. Splitting on
                            # every "functions." would truncate a tool name
                            # that itself contains that substring.
                            name=msg.recipient[len(prefix) :],
                            arguments=tool_args,
                        ),
                    )
                )
            elif msg.channel == "final":
                final_content = msg_text

        return ExtractedToolCallInformation(
            tools_called=len(tool_calls) > 0,
            tool_calls=tool_calls,
            content=final_content,
        )

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """Unsupported: always raises NotImplementedError."""
        raise NotImplementedError(
            "Not being used, manual parsing in serving_chat.py"  # noqa: E501
        )
def extract_tool_calls(
    self, model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation:
    """
    Extract the tool calls from a complete model response.

    Tool calls are expected as a JSON array directly after the literal
    ``functools`` marker, e.g. ``functools[{"name": ..., "arguments": ...}]``.
    Each entry may carry its arguments under ``arguments`` or ``parameters``.

    Args:
        model_output: Full text generated by the model.
        request: The originating chat completion request (not read here).

    Returns:
        The parsed tool calls with no content, or the untouched model output
        as content when no tool call array can be parsed.
    """
    logger.debug("Model output: %s", model_output)

    def text_response() -> ExtractedToolCallInformation:
        return ExtractedToolCallInformation(
            tools_called=False, tool_calls=[], content=model_output
        )

    # Decode the JSON array that starts at the "[" of each "functools["
    # marker. A non-greedy "functools\[(.*?)\]" capture ends at the first
    # "]", which cuts the payload short as soon as an argument contains an
    # array or a "]" inside a string.
    decoder = json.JSONDecoder()
    marker_found = False
    function_call_arr: list[dict[str, Any]] = []
    for marker in re.finditer(r"functools\[", model_output):
        marker_found = True
        try:
            function_call_arr, _ = decoder.raw_decode(
                model_output, marker.end() - 1
            )
        except json.JSONDecodeError as e:
            logger.error(
                "Failed to parse function calls from model output. Error: %s",
                str(e),
            )
            continue
        logger.debug(
            "Successfully extracted %d function calls", len(function_call_arr)
        )
        break

    if not marker_found:
        logger.debug("No function calls found")
        return text_response()

    if not function_call_arr:
        # Nothing usable was parsed. Reporting tools_called=True with an
        # empty list and no content would silently drop the model's output.
        return text_response()

    try:
        tool_calls: list[ToolCall] = [
            ToolCall(
                id=make_tool_call_id(),
                type="function",
                function=FunctionCall(
                    name=raw_function_call["name"],
                    # function call args are JSON but as a string
                    arguments=json.dumps(
                        raw_function_call["arguments"]
                        if "arguments" in raw_function_call
                        else raw_function_call["parameters"],
                        ensure_ascii=False,
                    ),
                ),
            )
            for raw_function_call in function_call_arr
        ]
    except Exception:
        logger.exception("Error in extracting tool call from response.")
        return text_response()

    return ExtractedToolCallInformation(
        tools_called=True, tool_calls=tool_calls, content=None
    )
def extract_tool_calls(
    self, model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation:
    """
    Extract the tool calls from a complete model response.

    The output is first checked against ``TOOL_CALL_REGEX`` (bounded by a
    timeout) and then parsed as a Python list of calls. Whenever either
    step fails, the output is returned unchanged as plain content.
    """
    try:
        regex_match = self.TOOL_CALL_REGEX.match(
            model_output, timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS
        )
        looks_like_tool_call = regex_match is not None
    except TimeoutError:
        looks_like_tool_call = False
        logger.warning("Regex timeout occurred when matching tool call pattern.")
        logger.debug(
            "Regex timeout occurred when matching user input: %s", model_output
        )

    if not looks_like_tool_call:
        return ExtractedToolCallInformation(
            tools_called=False, tool_calls=[], content=model_output
        )

    try:
        tree = ast.parse(model_output)
        expression = getattr(tree.body[0], "value", None)
        # Guard clause: anything other than a list made only of calls is
        # rejected and handled as regular text by the handler below.
        if not isinstance(expression, ast.List) or not all(
            isinstance(node, ast.Call) for node in expression.elts
        ):
            raise _UnexpectedAstError(
                "Tool output must be a list of function calls"
            )
        parsed_calls = [
            _handle_single_tool(node)  # type: ignore
            for node in expression.elts
        ]
        return ExtractedToolCallInformation(
            tools_called=True,
            tool_calls=parsed_calls,
            content=None,
        )
    except Exception:
        logger.exception("Error in extracting tool call from response.")
        # Treat as regular text
        return ExtractedToolCallInformation(
            tools_called=False, tool_calls=[], content=model_output
        )
_handle_single_tool(e) # type: ignore + for e in parsed.elts + ] + + tool_deltas = [] + for index, new_call in enumerate(tool_calls): + if index < self.current_tool_index: + continue + + self.current_tool_index = index + if len(self.streamed_args_for_tool) == index: + self.streamed_args_for_tool.append("") + + new_call_complete = ( + index < len(tool_calls) - 1 or ")]" not in added_text + ) + if new_call_complete: + self.current_tool_index += 1 + + withheld_suffix = added_text[:-2] if not new_call_complete else "" + if not new_call_complete and added_text[-2] == ")": + # Function call is incomplete. Withhold the closing bracket. + withheld_suffix = withheld_suffix + "}" + # Strings get single quotes in the model-produced string. + # JSON requires double quotes. + withheld_suffix = withheld_suffix.replace("'", '"') + delta = _compute_tool_delta( + self.streamed_args_for_tool[index], new_call, index, withheld_suffix + ) + + if delta is not None: + tool_deltas.append(delta) + if ( + delta.function is not None + and delta.function.arguments is not None + ): + self.streamed_args_for_tool[index] += delta.function.arguments + + # HACK: serving_chat.py inspects the internal state of tool parsers + # when determining its final streaming delta, automatically + # adding autocompleted JSON. + # These two lines avoid that nonsense while ensuring finish_reason + # is set to tool_calls when at least one tool is called. + if tool_deltas and not self.prev_tool_call_arr: + self.prev_tool_call_arr = [{"arguments": {}}] + + if tool_deltas: + return DeltaMessage(tool_calls=tool_deltas) + elif not added_text and self.current_tool_id > 0: + # Return an empty DeltaMessage once the tool calls are all done + # so that finish_reason gets set. 
+ return DeltaMessage(content="") + else: + return None + except Exception: + logger.exception("Error trying to handle streaming tool call.") + logger.debug( + "Skipping chunk as a result of tool streaming extraction error" + ) + return None + + +def _get_parameter_value(val: ast.expr) -> Any: + if isinstance(val, ast.Constant): + return val.value + elif isinstance(val, ast.Dict): + if not all(isinstance(k, ast.Constant) for k in val.keys): + raise _UnexpectedAstError("Dict tool call arguments must have literal keys") + return { + k.value: _get_parameter_value(v) # type: ignore + for k, v in zip(val.keys, val.values) + } + elif isinstance(val, ast.List): + return [_get_parameter_value(v) for v in val.elts] + else: + raise _UnexpectedAstError("Tool call arguments must be literals") + + +def _handle_single_tool(call: ast.Call) -> ToolCall: + if not isinstance(call.func, ast.Name): + raise _UnexpectedAstError("Invalid tool call name") + function_name = call.func.id + arguments = {} + for keyword in call.keywords: + arguments[keyword.arg] = _get_parameter_value(keyword.value) + return ToolCall( + type="function", + function=FunctionCall( + name=function_name, arguments=json.dumps(arguments, ensure_ascii=False) + ), + ) + + +def _make_valid_python(text: str) -> tuple[str, str] | None: + bracket_stack = [] + for index, char in enumerate(text): + if char in {"[", "(", "{"}: + bracket_stack.append(char) + elif char == "]": + if not bracket_stack or bracket_stack.pop() != "[": + raise _UnexpectedAstError("Mismatched square brackets") + elif char == ")": + if not bracket_stack or bracket_stack.pop() != "(": + raise _UnexpectedAstError("Mismatched parentheses") + elif char == "}": + if not bracket_stack or bracket_stack.pop() != "{": + raise _UnexpectedAstError("Mismatched curly braces") + elif char in {"'", '"'}: + if bracket_stack and bracket_stack[-1] == char: + if index > 0 and text[index - 1] == "\\": + # Treat an escaped quote as a regular character + pass + else: + 
bracket_stack.pop() + elif bracket_stack and bracket_stack[-1] in {"'", '"'}: + # Double quote within a single quote string or vice versa. + pass + else: + bracket_stack.append(char) + + text = text.rstrip() + if text.endswith("=") or text.endswith(":"): + # Since we have no type information for this property/parameter value, + # we can't fill in a valid value. + return None + if bracket_stack and bracket_stack[-1] == "{": + trailing_dict_text = text[: text.rfind("{")] + num_keys = trailing_dict_text.count(":") + num_values = trailing_dict_text.count(",") + if num_keys <= num_values: + return None # Incomplete property name within parameter value + if bracket_stack and bracket_stack[-1] == "(": + trailing_params_text = text[: text.rfind("(")] + num_full_param_names = trailing_params_text.count("=") + num_full_param_values = trailing_params_text.count(",") + if num_full_param_names <= num_full_param_values: + return None # Incomplete parameter name + if text.endswith(","): + text = text[:-1] + if ( + bracket_stack + and bracket_stack[-1] == "[" + and not text.endswith("[") + and not text.endswith(")") + ): + return None # Incomplete function name + + added_text = "" + for char in reversed(bracket_stack): + if char == "[": + added_text += "]" + elif char == "(": + added_text += ")" + elif char == "{": + added_text += "}" + elif char == "'": + added_text += "'" + elif char == '"': + added_text += '"' + + return text + added_text, added_text + + +def _compute_tool_delta( + previously_sent_args: str, new_call: ToolCall, index: int, withheld_suffix: str +) -> DeltaToolCall | None: + new_call_args = new_call.function.arguments + if withheld_suffix: + assert new_call_args.endswith(withheld_suffix) + new_call_args = new_call_args[: -len(withheld_suffix)] + if not previously_sent_args: + return DeltaToolCall( + id=new_call.id, + type="function", + index=index, + function=DeltaFunctionCall( + name=new_call.function.name, + arguments=new_call_args, + ), + ) + + arg_diff = 
new_call_args[len(previously_sent_args) :] + return ( + DeltaToolCall( + id=None, index=index, function=DeltaFunctionCall(arguments=arg_diff) + ) + if arg_diff + else None + ) diff --git a/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py b/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py new file mode 100644 index 0000000..26261c0 --- /dev/null +++ b/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py @@ -0,0 +1,781 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import ast +import json +import uuid +from collections.abc import Sequence +from typing import Any + +import regex as re + +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + ChatCompletionToolsParam, + DeltaFunctionCall, + DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, + ToolCall, +) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser, +) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer + +logger = init_logger(__name__) + + +class Qwen3CoderToolParser(ToolParser): + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + self.current_tool_name_sent: bool = False + self.prev_tool_call_arr: list[dict] = [] + # Override base class type - we use string IDs for tool calls + self.current_tool_id: str | None = None # type: ignore + self.streamed_args_for_tool: list[str] = [] + + # Sentinel tokens for streaming mode + self.tool_call_start_token: str = "" + self.tool_call_end_token: str = "" + self.tool_call_prefix: str = "(.*?)", re.DOTALL + ) + self.tool_call_regex = re.compile( + r"(.*?)|(.*?)$", re.DOTALL + ) + self.tool_call_function_regex = re.compile( + r"||(?=)|$)", + re.DOTALL, + ) + + if not self.model_tokenizer: + raise ValueError( + "The model tokenizer must be passed to the ToolParser " + "constructor during construction." 
+ ) + + self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token) + self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token) + + if self.tool_call_start_token_id is None or self.tool_call_end_token_id is None: + raise RuntimeError( + "Qwen3 XML Tool parser could not locate tool call start/end " + "tokens in the tokenizer!" + ) + + logger.info( + "vLLM Successfully import tool parser %s !", self.__class__.__name__ + ) + + def _generate_tool_call_id(self) -> str: + """Generate a unique tool call ID.""" + return f"call_{uuid.uuid4().hex[:24]}" + + def _reset_streaming_state(self): + """Reset all streaming state.""" + self.current_tool_index = 0 + self.is_tool_call_started = False + self.header_sent = False + self.current_tool_id = None + self.current_function_name = None + self.current_param_name = None + self.current_param_value = "" + self.param_count = 0 + self.in_param = False + self.in_function = False + self.accumulated_text = "" + self.json_started = False + self.json_closed = False + # Store accumulated parameters for type conversion + self.accumulated_params = {} + self.streaming_request = None + + def _get_arguments_config( + self, func_name: str, tools: list[ChatCompletionToolsParam] | None + ) -> dict: + """Extract argument configuration for a function.""" + if tools is None: + return {} + for config in tools: + if not hasattr(config, "type") or not ( + hasattr(config, "function") and hasattr(config.function, "name") + ): + continue + if config.type == "function" and config.function.name == func_name: + if not hasattr(config.function, "parameters"): + return {} + params = config.function.parameters + if isinstance(params, dict) and "properties" in params: + return params["properties"] + elif isinstance(params, dict): + return params + else: + return {} + logger.warning("Tool '%s' is not defined in the tools list.", func_name) + return {} + + def _convert_param_value( + self, param_value: str, param_name: str, param_config: 
dict, func_name: str + ) -> Any: + """Convert parameter value based on its type in the schema.""" + # Handle null value for any type + if param_value.lower() == "null": + return None + + if param_name not in param_config: + if param_config != {}: + logger.warning( + "Parsed parameter '%s' is not defined in the tool " + "parameters for tool '%s', directly returning the " + "string value.", + param_name, + func_name, + ) + return param_value + + if ( + isinstance(param_config[param_name], dict) + and "type" in param_config[param_name] + ): + param_type = str(param_config[param_name]["type"]).strip().lower() + else: + param_type = "string" + if param_type in ["string", "str", "text", "varchar", "char", "enum"]: + return param_value + elif ( + param_type.startswith("int") + or param_type.startswith("uint") + or param_type.startswith("long") + or param_type.startswith("short") + or param_type.startswith("unsigned") + ): + try: + return int(param_value) + except (ValueError, TypeError): + logger.warning( + "Parsed value '%s' of parameter '%s' is not an " + "integer in tool '%s', degenerating to string.", + param_value, + param_name, + func_name, + ) + return param_value + elif param_type.startswith("num") or param_type.startswith("float"): + try: + float_param_value = float(param_value) + return ( + float_param_value + if float_param_value - int(float_param_value) != 0 + else int(float_param_value) + ) + except (ValueError, TypeError): + logger.warning( + "Parsed value '%s' of parameter '%s' is not a float " + "in tool '%s', degenerating to string.", + param_value, + param_name, + func_name, + ) + return param_value + elif param_type in ["boolean", "bool", "binary"]: + param_value = param_value.lower() + if param_value not in ["true", "false"]: + logger.warning( + "Parsed value '%s' of parameter '%s' is not a boolean " + "(`true` or `false`) in tool '%s', degenerating to " + "false.", + param_value, + param_name, + func_name, + ) + return param_value == "true" + else: + 
if ( + param_type in ["object", "array", "arr"] + or param_type.startswith("dict") + or param_type.startswith("list") + ): + try: + param_value = json.loads(param_value) + return param_value + except (json.JSONDecodeError, TypeError, ValueError): + logger.warning( + "Parsed value '%s' of parameter '%s' cannot be " + "parsed with json.loads in tool '%s', will try " + "other methods to parse it.", + param_value, + param_name, + func_name, + ) + try: + param_value = ast.literal_eval(param_value) # safer + except (ValueError, SyntaxError, TypeError): + logger.warning( + "Parsed value '%s' of parameter '%s' cannot be " + "converted via Python `ast.literal_eval()` in tool " + "'%s', degenerating to string.", + param_value, + param_name, + func_name, + ) + return param_value + + def _parse_xml_function_call( + self, function_call_str: str, tools: list[ChatCompletionToolsParam] | None + ) -> ToolCall | None: + # Extract function name + end_index = function_call_str.index(">") + function_name = function_call_str[:end_index] + param_config = self._get_arguments_config(function_name, tools) + parameters = function_call_str[end_index + 1 :] + param_dict = {} + for match_text in self.tool_call_parameter_regex.findall(parameters): + idx = match_text.index(">") + param_name = match_text[:idx] + param_value = str(match_text[idx + 1 :]) + # Remove prefix and trailing \n + if param_value.startswith("\n"): + param_value = param_value[1:] + if param_value.endswith("\n"): + param_value = param_value[:-1] + + param_dict[param_name] = self._convert_param_value( + param_value, param_name, param_config, function_name + ) + return ToolCall( + type="function", + function=FunctionCall( + name=function_name, arguments=json.dumps(param_dict, ensure_ascii=False) + ), + ) + + def _get_function_calls(self, model_output: str) -> list[str]: + # Find all tool calls + matched_ranges = self.tool_call_regex.findall(model_output) + raw_tool_calls = [ + match[0] if match[0] else match[1] for match in 
matched_ranges + ] + + # Back-off strategy if no tool_call tags found + if len(raw_tool_calls) == 0: + raw_tool_calls = [model_output] + + raw_function_calls = [] + for tool_call in raw_tool_calls: + raw_function_calls.extend(self.tool_call_function_regex.findall(tool_call)) + + function_calls = [ + match[0] if match[0] else match[1] for match in raw_function_calls + ] + return function_calls + + def extract_tool_calls( + self, + model_output: str, + request: ChatCompletionRequest, + ) -> ExtractedToolCallInformation: + # Quick check to avoid unnecessary processing + if self.tool_call_prefix not in model_output: + return ExtractedToolCallInformation( + tools_called=False, tool_calls=[], content=model_output + ) + + try: + function_calls = self._get_function_calls(model_output) + if len(function_calls) == 0: + return ExtractedToolCallInformation( + tools_called=False, tool_calls=[], content=model_output + ) + + tool_calls = [ + self._parse_xml_function_call(function_call_str, request.tools) + for function_call_str in function_calls + ] + + # Populate prev_tool_call_arr for serving layer to set finish_reason + self.prev_tool_call_arr.clear() # Clear previous calls + for tool_call in tool_calls: + if tool_call: + self.prev_tool_call_arr.append( + { + "name": tool_call.function.name, + "arguments": tool_call.function.arguments, + } + ) + + # Extract content before tool calls + content_index = model_output.find(self.tool_call_start_token) + idx = model_output.find(self.tool_call_prefix) + content_index = content_index if content_index >= 0 else idx + content = model_output[:content_index] # .rstrip() + + return ExtractedToolCallInformation( + tools_called=(len(tool_calls) > 0), + tool_calls=tool_calls, + content=content if content else None, + ) + + except Exception: + logger.exception("Error in extracting tool call from response.") + return ExtractedToolCallInformation( + tools_called=False, tool_calls=[], content=model_output + ) + + def extract_tool_calls_streaming( 
+ self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> DeltaMessage | None: + # Store request for type conversion + if not previous_text: + self._reset_streaming_state() + self.streaming_request = request + + # If no delta text, return None unless it's an EOS token after tools + if not delta_text: + # Check if this is an EOS token after all tool calls are complete + # Check for tool calls in text even if is_tool_call_started + # is False (might have been reset after processing all tools) + if delta_token_ids and self.tool_call_end_token_id not in delta_token_ids: + # Count complete tool calls + complete_calls = len( + self.tool_call_complete_regex.findall(current_text) + ) + + # If we have completed tool calls and populated + # prev_tool_call_arr + if complete_calls > 0 and len(self.prev_tool_call_arr) > 0: + # Check if all tool calls are closed + open_calls = current_text.count( + self.tool_call_start_token + ) - current_text.count(self.tool_call_end_token) + if open_calls == 0: + # Return empty delta for finish_reason processing + return DeltaMessage(content="") + elif not self.is_tool_call_started and current_text: + # This is a regular content response that's now complete + return DeltaMessage(content="") + return None + + # Update accumulated text + self.accumulated_text = current_text + + # Check if we need to advance to next tool + if self.json_closed and not self.in_function: + # Check if this tool call has ended + tool_ends = current_text.count(self.tool_call_end_token) + if tool_ends > self.current_tool_index: + # This tool has ended, advance to next + self.current_tool_index += 1 + self.header_sent = False + self.param_count = 0 + self.json_started = False + self.json_closed = False + self.accumulated_params = {} + + # Check if there are more tool calls + tool_starts = 
current_text.count(self.tool_call_start_token) + if self.current_tool_index >= tool_starts: + # No more tool calls + self.is_tool_call_started = False + # Continue processing next tool + return None + + # Handle normal content before tool calls + if not self.is_tool_call_started: + # Check if tool call is starting + if ( + self.tool_call_start_token_id in delta_token_ids + or self.tool_call_start_token in delta_text + ): + self.is_tool_call_started = True + # Return any content before the tool call + if self.tool_call_start_token in delta_text: + content_before = delta_text[ + : delta_text.index(self.tool_call_start_token) + ] + if content_before: + return DeltaMessage(content=content_before) + return None + else: + # Check if we're between tool calls - skip whitespace + if ( + current_text.rstrip().endswith(self.tool_call_end_token) + and delta_text.strip() == "" + ): + # We just ended a tool call, skip whitespace + return None + # Normal content, no tool call + return DeltaMessage(content=delta_text) + + # Check if we're between tool calls (waiting for next one) + # Count tool calls we've seen vs processed + tool_starts_count = current_text.count(self.tool_call_start_token) + if self.current_tool_index >= tool_starts_count: + # We're past all tool calls, shouldn't be here + return None + + # We're in a tool call, find the current tool call portion + # Need to find the correct tool call based on current_tool_index + tool_start_positions: list[int] = [] + idx = 0 + while True: + idx = current_text.find(self.tool_call_start_token, idx) + if idx == -1: + break + tool_start_positions.append(idx) + idx += len(self.tool_call_start_token) + + if self.current_tool_index >= len(tool_start_positions): + # No more tool calls to process yet + return None + + tool_start_idx = tool_start_positions[self.current_tool_index] + # Find where this tool call ends (or current position if not ended yet) + tool_end_idx = current_text.find(self.tool_call_end_token, tool_start_idx) + if 
tool_end_idx == -1: + tool_text = current_text[tool_start_idx:] + else: + tool_text = current_text[ + tool_start_idx : tool_end_idx + len(self.tool_call_end_token) + ] + + # Looking for function header + if not self.header_sent: + if self.tool_call_prefix in tool_text: + func_start = tool_text.find(self.tool_call_prefix) + len( + self.tool_call_prefix + ) + func_end = tool_text.find(">", func_start) + + if func_end != -1: + # Found complete function name + self.current_function_name = tool_text[func_start:func_end] + self.current_tool_id = self._generate_tool_call_id() + self.header_sent = True + self.in_function = True + + # IMPORTANT: Add to prev_tool_call_arr immediately when + # we detect a tool call. This ensures + # finish_reason="tool_calls" even if parsing isn't complete + already_added = any( + tool.get("name") == self.current_function_name + for tool in self.prev_tool_call_arr + ) + if not already_added: + self.prev_tool_call_arr.append( + { + "name": self.current_function_name, + "arguments": "{}", # Placeholder, will be updated later + } + ) + + # Send header with function info + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + id=self.current_tool_id, + function=DeltaFunctionCall( + name=self.current_function_name, arguments="" + ), + type="function", + ) + ] + ) + return None + + # We've sent header, now handle function body + if self.in_function: + # Send opening brace if not sent yet + if not self.json_started and self.parameter_prefix not in delta_text: + self.json_started = True + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + function=DeltaFunctionCall(arguments="{"), + ) + ] + ) + + # Make sure json_started is set if we're processing parameters + if not self.json_started: + self.json_started = True + + # Check for function end in accumulated text + if not self.json_closed and self.function_end_token in tool_text: + # Close JSON + self.json_closed = True + + # Extract 
complete tool call to update + # prev_tool_call_arr with final arguments + # Find the function content + func_start = tool_text.find(self.tool_call_prefix) + len( + self.tool_call_prefix + ) + func_content_end = tool_text.find(self.function_end_token, func_start) + if func_content_end != -1: + func_content = tool_text[func_start:func_content_end] + # Parse to get the complete arguments + try: + parsed_tool = self._parse_xml_function_call( + func_content, + self.streaming_request.tools + if self.streaming_request + else None, + ) + if parsed_tool: + # Update existing entry in + # prev_tool_call_arr with complete args + for i, tool in enumerate(self.prev_tool_call_arr): + if tool.get("name") == parsed_tool.function.name: + args = parsed_tool.function.arguments + self.prev_tool_call_arr[i]["arguments"] = args + break + except Exception: + pass # Ignore parsing errors during streaming + + result = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + function=DeltaFunctionCall(arguments="}"), + ) + ] + ) + + # Reset state for next tool + self.in_function = False + self.json_closed = True + self.accumulated_params = {} + + return result + + # Look for parameters + # Find all parameter starts + param_starts = [] + idx = 0 + while True: + idx = tool_text.find(self.parameter_prefix, idx) + if idx == -1: + break + param_starts.append(idx) + idx += len(self.parameter_prefix) + + # Check if we should start a new parameter + if ( + not self.in_param + and self.param_count < len(param_starts) + and len(param_starts) > self.param_count + ): + # Process the next parameter + param_idx = param_starts[self.param_count] + param_start = param_idx + len(self.parameter_prefix) + remaining = tool_text[param_start:] + + if ">" in remaining: + # We have the complete parameter name + name_end = remaining.find(">") + self.current_param_name = remaining[:name_end] + + # Find the parameter value + value_start = param_start + name_end + 1 + value_text = 
tool_text[value_start:] + if value_text.startswith("\n"): + value_text = value_text[1:] + + # Find where this parameter ends + param_end_idx = value_text.find(self.parameter_end_token) + if param_end_idx == -1: + # No closing tag, look for next parameter or + # function end + next_param_idx = value_text.find(self.parameter_prefix) + func_end_idx = value_text.find(self.function_end_token) + + if next_param_idx != -1 and ( + func_end_idx == -1 or next_param_idx < func_end_idx + ): + param_end_idx = next_param_idx + elif func_end_idx != -1: + param_end_idx = func_end_idx + else: + # Neither found, check if tool call is complete + if self.tool_call_end_token in tool_text: + # Tool call is complete, so parameter + # must be complete too. Use all + # remaining text before function end + param_end_idx = len(value_text) + else: + # Still streaming, wait for more content + return None + + if param_end_idx != -1: + # Complete parameter found + param_value = value_text[:param_end_idx] + if param_value.endswith("\n"): + param_value = param_value[:-1] + + # Store raw value for later processing + self.accumulated_params[self.current_param_name] = param_value + + # Get parameter configuration for type conversion + param_config = self._get_arguments_config( + self.current_function_name or "", + self.streaming_request.tools + if self.streaming_request + else None, + ) + + # Convert param value to appropriate type + converted_value = self._convert_param_value( + param_value, + self.current_param_name, + param_config, + self.current_function_name or "", + ) + + # Build JSON fragment based on the converted type + # Use json.dumps to properly serialize the value + serialized_value = json.dumps( + converted_value, ensure_ascii=False + ) + + if self.param_count == 0: + json_fragment = ( + f'"{self.current_param_name}": {serialized_value}' + ) + else: + json_fragment = ( + f', "{self.current_param_name}": {serialized_value}' + ) + + self.param_count += 1 + + return DeltaMessage( + 
tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + function=DeltaFunctionCall(arguments=json_fragment), + ) + ] + ) + + # Continue parameter value - Not used in the current implementation + # since we process complete parameters above + if self.in_param: + if self.parameter_end_token in delta_text: + # End of parameter + end_idx = delta_text.find(self.parameter_end_token) + value_chunk = delta_text[:end_idx] + + # Skip past > if at start + if not self.current_param_value and ">" in value_chunk: + gt_idx = value_chunk.find(">") + value_chunk = value_chunk[gt_idx + 1 :] + + if not self.current_param_value and value_chunk.startswith("\n"): + value_chunk = value_chunk[1:] + + # Store complete value + full_value = self.current_param_value + value_chunk + self.accumulated_params[self.current_param_name] = full_value + + # Get parameter configuration for type conversion + param_config = self._get_arguments_config( + self.current_function_name or "", + self.streaming_request.tools + if self.streaming_request + else None, + ) + + # Convert the parameter value to the appropriate type + converted_value = self._convert_param_value( + full_value, + self.current_param_name or "", + param_config, + self.current_function_name or "", + ) + + # Serialize the converted value + serialized_value = json.dumps(converted_value, ensure_ascii=False) + + # Since we've been streaming the quoted version, + # we need to close it properly + # This is complex - for now just complete the value + self.in_param = False + self.current_param_value = "" + + # Just close the current parameter string + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + function=DeltaFunctionCall( + arguments='"' + ), # Close the string quote + ) + ] + ) + else: + # Continue accumulating value + value_chunk = delta_text + + # Handle first chunk after param name + if not self.current_param_value and ">" in value_chunk: + gt_idx = value_chunk.find(">") + value_chunk = 
value_chunk[gt_idx + 1 :] + + if not self.current_param_value and value_chunk.startswith("\n"): + value_chunk = value_chunk[1:] + + if value_chunk: + # Stream the escaped delta + prev_escaped = ( + json.dumps(self.current_param_value, ensure_ascii=False)[ + 1:-1 + ] + if self.current_param_value + else "" + ) + self.current_param_value += value_chunk + full_escaped = json.dumps( + self.current_param_value, ensure_ascii=False + )[1:-1] + delta_escaped = full_escaped[len(prev_escaped) :] + + if delta_escaped: + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + function=DeltaFunctionCall( + arguments=delta_escaped + ), + ) + ] + ) + + return None diff --git a/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py b/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py new file mode 100644 index 0000000..432c419 --- /dev/null +++ b/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py @@ -0,0 +1,1316 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import ast +import json +from collections.abc import Sequence +from typing import Any +from xml.parsers.expat import ParserCreate + +import regex as re + +from vllm.entrypoints.chat_utils import make_tool_call_id +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + ChatCompletionToolsParam, + DeltaFunctionCall, + DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, + ToolCall, +) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser, +) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer + +logger = init_logger(__name__) + + +class StreamingXMLToolCallParser: + """ + Simplified streaming XML tool call parser + Supports streaming input, parsing, and output + """ + + def __init__(self): + self.reset_streaming_state() + + # Tool configuration information + self.tools: list[ChatCompletionToolsParam] | 
None = None + self.tool_call_start_token: str = "" + self.tool_call_end_token: str = "" + self.function_start_token: str = " DeltaMessage: + """ + Parse single streaming XML chunk and return Delta response + This is the actual streaming interface that receives chunks + one by one and maintains internal state + + Args: + xml_chunk: Single XML chunk string + Returns: + DeltaMessage: Contains delta information generated by this chunk, + returns empty response if no complete elements + """ + # Record delta count before processing + initial_delta_count = len(self.deltas) + + self.streaming_buffer += xml_chunk + + found_elements = self._process_complete_xml_elements() + + if found_elements: + # If complete elements found, check if end events were missed + # some tags may not have been triggered + try: + new_deltas = self.deltas[initial_delta_count:] + # If this chunk contains + # but didn't generate '}', then complete it + if ( + self.current_call_id is not None + and self.function_end_token in xml_chunk + ): + # - Added '}' (non-empty parameter ending) + # - Added '{}' (empty parameter function) + has_function_close = any( + ( + td.tool_calls + and any( + ( + tc.function + and tc.id == self.current_call_id + and isinstance(tc.function.arguments, str) + and (tc.function.arguments in ("}", "{}")) + ) + for tc in td.tool_calls + ) + ) + for td in new_deltas + ) + if not has_function_close: + # Close potentially unclosed element + if self.current_param_name: + self._end_element("parameter") + if self.current_function_name: + self._end_element("function") + # If this chunk contains + # but didn't generate final empty delta, then complete it + if ( + self.current_call_id is not None + and self.tool_call_end_token in xml_chunk + ): + has_toolcall_close = any( + ( + td.tool_calls + and any( + ( + tc.type == "function" + and tc.function + and tc.function.arguments == "" + and tc.id == self.current_call_id + ) + for tc in td.tool_calls + ) + ) + for td in new_deltas + ) + if not 
has_toolcall_close: + # Close potentially unclosed element + if self.current_param_name: + self._end_element("parameter") + if self.current_function_name: + self._end_element("function") + self._end_element("tool_call") + except Exception as e: + logger.warning("Error with fallback parsing: %s", e) + # Merge newly generated deltas into single response + result_delta = self._merge_new_deltas_to_single_response( + initial_delta_count + ) + return result_delta + else: + # No complete elements, check if there's unoutput text content + if self.text_content_buffer and self.tool_call_index == 0: + # Has text content but no tool_call yet, output text content + text_delta = DeltaMessage(content=self.text_content_buffer) + self._emit_delta(text_delta) + # Clear buffer to avoid duplicate output + self.text_content_buffer = "" + return text_delta + + # If this chunk contains end tags but wasn't triggered by parser, + # manually complete end events + # Only execute when still on the same call as when entered, + # to prevent accidentally closing new calls + # in multi scenarios + if self.current_call_id is not None and ( + self.function_end_token in xml_chunk + or self.tool_call_end_token in xml_chunk + ): + # Close potentially unclosed element + if self.current_param_name: + self._end_element("parameter") + if self.function_end_token in xml_chunk and self.current_function_name: + self._end_element("function") + if self.tool_call_end_token in xml_chunk: + self._end_element("tool_call") + # Return the merged delta result generated by this fallback + result_delta = self._merge_new_deltas_to_single_response( + initial_delta_count + ) + return result_delta + + # No complete elements, return empty response + return DeltaMessage(content=None) + + def _escape_xml_special_chars(self, text: str) -> str: + """ + Escape XML special characters + Args: + text: Original text + Returns: + Escaped text + """ + xml_escapes = { + "&": "&", + "<": "<", + ">": ">", + '"': """, + "'": "'", + } + + 
for char, escape in xml_escapes.items(): + text = text.replace(char, escape) + + return text + + def _process_complete_xml_elements(self) -> bool: + """ + Process complete XML elements in buffer + + Returns: + bool: Whether complete elements were found and processed + """ + found_any = False + + while self.last_processed_pos < len(self.streaming_buffer): + # Find next complete xml element + element, end_pos = self._find_next_complete_element(self.last_processed_pos) + if element is None: + # No complete element found, wait for more data + break + + # Check if this element should be skipped + if self._should_skip_element(element): + self.last_processed_pos = end_pos + continue + + # Found complete XML element, process it + try: + preprocessed_element = self._preprocess_xml_chunk(element) + # Check if this is the first tool_call start + if ( + ( + preprocessed_element.strip().startswith("") + or preprocessed_element.strip().startswith("") + and self.tool_call_index > 0 + and self.current_call_id + ): + # Reset parser state but preserve generated deltas + if self.current_param_name: + self._end_element("parameter") + if self.current_function_open or self.current_function_name: + self._end_element("function") + # Output final tool_call tail delta + final_delta = DeltaMessage( + role=None, + content=None, + reasoning=None, + tool_calls=[ + DeltaToolCall( + index=self.tool_call_index - 1, + id=self.current_call_id, + type="function", + function=DeltaFunctionCall(name=None, arguments=""), + ) + ], + ) + self._emit_delta(final_delta) + # Reset XML parser and current call state + self._reset_xml_parser_after_tool_call() + # Parse preprocessed element + self.parser.Parse(preprocessed_element, False) + found_any = True + + except Exception as e: + logger.warning("Error when parsing XML elements: %s", e) + + # Update processed position + self.last_processed_pos = end_pos + + return found_any + + def _should_skip_element(self, element: str) -> bool: + """ + Determine whether an 
element should be skipped + + Args: + element: Element to evaluate + + Returns: + bool: True means should skip, False means should process + """ + + # If it's a tool_call XML tag, don't skip + if ( + element.startswith(self.tool_call_start_token) + or element.startswith(self.function_start_token) + or element.startswith(self.parameter_start_token) + ): + return False + + # If currently not parsing tool calls and not blank, + # collect this text instead of skipping + # Only process other XML elements after tool_call appears, + # otherwise treat as plain text + if self.current_call_id is None and element: + # Collect text content to buffer + self.text_content_buffer += element + return True # Still skip, but content has been collected + + # If currently parsing tool calls, + # this might be parameter value, don't skip + if self.current_call_id is not None: + return False + + # Skip blank content + return not element + + def _find_next_complete_element(self, start_pos: int) -> tuple[str | None, int]: + """ + Find next complete XML element from specified position + + Args: + start_pos: Position to start searching + + Returns: + (Complete element string, element end position), + returns (None, start_pos) if no complete element found + """ + buffer = self.streaming_buffer[start_pos:] + + if not buffer: + return None, start_pos + + if buffer.startswith("<"): + # Need to ensure no new < appears, + # find the nearest one between < and > + tag_end = buffer.find("<", 1) + tag_end2 = buffer.find(">", 1) + if tag_end != -1 and tag_end2 != -1: + # Next nearest is < + if tag_end < tag_end2: + return buffer[:tag_end], start_pos + tag_end + # Next nearest is >, means found XML element + else: + return buffer[: tag_end2 + 1], start_pos + tag_end2 + 1 + elif tag_end != -1: + return buffer[:tag_end], start_pos + tag_end + elif tag_end2 != -1: + return buffer[: tag_end2 + 1], start_pos + tag_end2 + 1 + else: + # If currently not parsing tool calls (entering a tool_call), + # check if 
starts with or + if buffer == ""[: len(buffer)]: + # Might be start of , wait for more data + return None, start_pos + elif ( + buffer.startswith(" DeltaMessage: + """ + Merge newly generated deltas from this processing + into a single DeltaMessage + + Args: + initial_count: Delta count before processing + + Returns: + Merged DeltaMessage containing all newly generated delta information + """ + if len(self.deltas) <= initial_count: + return DeltaMessage(content=None) + + # Get newly generated deltas + new_deltas = self.deltas[initial_count:] + + if len(new_deltas) == 1: + # Only one new delta, return directly + return new_deltas[0] + + # Merge multiple new deltas + merged_tool_calls: list[DeltaToolCall] = [] + merged_content: str = "" + + for delta in new_deltas: + if delta.content: + merged_content += delta.content + if delta.tool_calls: + # For tool_calls, we need to intelligently merge arguments + for tool_call in delta.tool_calls: + # Find if there's already a tool_call with the same call_id + existing_call = None + for existing in merged_tool_calls: + if existing.id == tool_call.id: + existing_call = existing + break + + if existing_call and existing_call.function: + # Merge to existing tool_call + if tool_call.function and tool_call.function.name: + existing_call.function.name = tool_call.function.name + if ( + tool_call.function + and tool_call.function.arguments is not None + ): + if existing_call.function.arguments is None: + existing_call.function.arguments = "" + + # For streaming JSON parameters, + # simply concatenate in order + new_args = tool_call.function.arguments + existing_call.function.arguments += new_args + if tool_call.type: + existing_call.type = tool_call.type + else: + # Add new tool_call + merged_tool_calls.append(tool_call) + + return DeltaMessage( + content=merged_content if merged_content else None, + tool_calls=merged_tool_calls, + ) + + def _preprocess_xml_chunk(self, chunk: str) -> str: + """ + Preprocess XML chunk, handle 
non-standard formats, + and escape special characters + + Args: + chunk: Original XML chunk + + Returns: + Processed XML chunk + """ + + # Check if this is a tool_call related element + is_tool_call = False + if chunk.startswith(self.tool_call_start_token) or chunk.startswith( + self.tool_call_end_token + ): + is_tool_call = True + if chunk.startswith(self.function_start_token) or chunk.startswith( + self.function_end_token + ): + is_tool_call = True + if chunk.startswith(self.parameter_start_token) or chunk.startswith( + self.parameter_end_token + ): + is_tool_call = True + # Handle format -> + processed = re.sub(r"]+)>", r'', chunk) + # Handle format -> + processed = re.sub(r"]+)>", r'', processed) + + original_chunk = chunk + # If in parameter value accumulation mode + if self._pre_inside_parameter: + # Parameter end: output accumulated raw text + # safely then return + if processed.startswith(""): + body_text = self._pre_param_buffer + # Trigger deferred parsing mode + # literal_eval+json output in end_element + self.defer_current_parameter = True + self.deferred_param_raw_value = body_text + # Clean up state + self._pre_inside_parameter = False + self._pre_param_buffer = "" + self._pre_current_param_name = None + safe_text = self._escape_xml_special_chars(body_text) + return f"{safe_text}" + else: + # If this is the first block of content after entering parameter + # evaluate if deferred parsing is needed; + # If not needed, exit accumulation mode + # and pass through directly + if self._pre_param_buffer == "": + # Get current parameter type + param_type = ( + self._get_param_type(self._pre_current_param_name) + if self._pre_current_param_name + else "string" + ) + # Only these types need deferred parsing to + # handle Python literals containing single quotes + is_object_type = param_type in ["object"] + is_complex_type = ( + param_type in ["array", "arr", "sequence"] + or param_type.startswith("dict") + or param_type.startswith("list") + ) + + # Only delay 
when contains container symbols + # and has single quotes and is complex type + has_container_hint = ( + ("[" in original_chunk) + or ("{" in original_chunk) + or ("(" in original_chunk) + ) + + # Determine if deferred parsing is needed + need_defer = False + if is_complex_type: + # Complex type, always need deferred parsing + need_defer = True + elif ( + is_object_type + and has_container_hint + and ("'" in original_chunk) + ): + # Object type with container symbols + # and single quotes, need deferred parsing + need_defer = True + + if not need_defer: + # No need for deferred parsing, + # exit parameter mode directly + self._pre_inside_parameter = False + return self._escape_xml_special_chars(original_chunk) + self._pre_param_buffer += original_chunk + return "" + + # Parameter start: enable accumulation + if processed.startswith("', processed) + if m: + self._pre_current_param_name = m.group(1) + self._pre_inside_parameter = True + self._pre_param_buffer = "" + return processed + + # If processed doesn't contain special_token, escape processed + # This is because XML parsing encounters special characters + # and reports errors, so escaping is needed + if not is_tool_call: + processed = self._escape_xml_special_chars(processed) + return processed + + def _emit_delta(self, delta: DeltaMessage): + """Emit Delta response (streaming output)""" + self.deltas.append(delta) + + def _auto_close_open_parameter_if_needed(self, incoming_tag: str | None = None): + """Before starting to process new elements, + if there are unclosed tags from before, + automatically complete their endings to the parser. + - If there are unclosed parameters, + it's equivalent to feeding `` + - When about to start a new function or tool_call, + if there are unclosed functions, complete ``. + - When about to start a new tool_call, + if there are unclosed tool_calls, complete ``. 
+ """ + # First close unclosed parameters + if self.current_param_name: + self._end_element("parameter") + + # If about to start new function or tool_call, + # and there are unclosed functions, close function first + if incoming_tag in ("function", "tool_call") and self.current_function_name: + self._end_element("function") + + # If about to start new tool_call, + # and there are unclosed tool_calls, close tool_call first + if incoming_tag == "tool_call" and self.current_call_id: + self._end_element("tool_call") + + def _start_element(self, name: str, attrs: dict[str, str]): + """Handle XML start element events""" + + if name == "root": + return + + if name == "tool_call": + # Before opening new tool_call, + # automatically complete previous unclosed tags + self._auto_close_open_parameter_if_needed("tool_call") + + self.parameters = {} + self.current_call_id = make_tool_call_id() + self.current_param_is_first = True + self.tool_call_index += 1 + elif name.startswith("function") or (name == "function"): + # If missing tool_call, manually complete + if not self.current_call_id: + self._start_element("tool_call", {}) + # Before opening new function, + # automatically complete previous unclosed tags (parameter/function) + self._auto_close_open_parameter_if_needed("function") + function_name = self._extract_function_name(name, attrs) + self.current_function_name = function_name + self.current_function_open = True + if function_name: + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.tool_call_index - 1, + id=self.current_call_id, + type="function", + function=DeltaFunctionCall( + name=function_name, arguments="" + ), + ) + ] + ) + self._emit_delta(delta) + elif name.startswith("parameter") or (name == "parameter"): + # If previous parameter hasn't ended normally, + # complete its end first, then start new parameter + self._auto_close_open_parameter_if_needed("parameter") + param_name = self._extract_parameter_name(name, attrs) + self.current_param_name 
= param_name + self.current_param_value = "" + self.current_param_value_converted = "" + self.start_quote_emitted = False # Reset start quote flag + + # Only output parameter name and colon, + # don't output quotes + # decide after parameter value type is determined + if param_name: + if not self.parameters: + # First parameter + # start JSON, only output parameter name and colon + json_start = f'{{"{param_name}": ' + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.tool_call_index - 1, + id=self.current_call_id, + type="function", + function=DeltaFunctionCall( + name=None, arguments=json_start + ), + ) + ] + ) + self._emit_delta(delta) + self.current_param_is_first = True + else: + # Subsequent parameters + # add comma and parameter name, no quotes + json_continue = f', "{param_name}": ' + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.tool_call_index - 1, + id=self.current_call_id, + type="function", + function=DeltaFunctionCall( + name=None, arguments=json_continue + ), + ) + ] + ) + self._emit_delta(delta) + self.current_param_is_first = False + + def _char_data(self, data: str): + """Handle XML character data events""" + if data and self.current_param_name: + # If preprocessing stage determines deferred parsing is needed, + # only cache character data, no streaming output + if self.defer_current_parameter: + original_data = data + if self.should_emit_end_newline: + original_data = "\n" + original_data + self.should_emit_end_newline = False + if original_data.endswith("\n"): + self.should_emit_end_newline = True + original_data = original_data[:-1] + self.current_param_value += original_data + return + + param_type = self._get_param_type(self.current_param_name) + + # Check if this is the first time receiving data for this parameter + # If this is the first packet of data and starts with \n, remove \n + if not self.current_param_value and data.startswith("\n"): + data = data[1:] + + # Output start quote for string type (if 
not already output) + if ( + param_type in ["string", "str", "text", "varchar", "char", "enum"] + and not self.start_quote_emitted + ): + quote_delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.tool_call_index - 1, + id=self.current_call_id, + type="function", + function=DeltaFunctionCall(name=None, arguments='"'), + ) + ] + ) + self._emit_delta(quote_delta) + self.start_quote_emitted = True + + if not data: + return + + original_data = data + # Delay output of trailing newline + if self.should_emit_end_newline: + original_data = "\n" + original_data + self.should_emit_end_newline = False + if original_data.endswith("\n"): + self.should_emit_end_newline = True + original_data = original_data[:-1] + self.current_param_value += original_data + + # convert parameter value by param_type + converted_value = self._convert_param_value( + self.current_param_value, param_type + ) + output_data = self._convert_for_json_streaming(converted_value, param_type) + + delta_data = output_data[len(self.current_param_value_converted) :] + self.current_param_value_converted = output_data + + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.tool_call_index - 1, + id=self.current_call_id, + type="function", + function=DeltaFunctionCall(name=None, arguments=delta_data), + ) + ] + ) + self._emit_delta(delta) + + def _end_element(self, name: str): + """Handle XML end element events""" + + if name == "root": + return + + # If function or tool_call ends and there are still unclosed parameters, + # complete parameter end first + if ( + name.startswith("function") or name == "function" or name == "tool_call" + ) and self.current_param_name: + self._auto_close_open_parameter_if_needed() + + if ( + name.startswith("parameter") or name == "parameter" + ) and self.current_param_name: + # End current parameter + param_name = self.current_param_name + param_value = self.current_param_value + + # If in deferred parsing mode, + # perform overall parsing on raw 
content + # accumulated in preprocessing stage and output once + if self.defer_current_parameter: + raw_text = ( + self.deferred_param_raw_value + if self.deferred_param_raw_value + else param_value + ) + parsed_value = None + output_arguments = None + try: + # If previously delayed trailing newline, + # add it back before parsing + if self.should_emit_end_newline: + raw_for_parse = raw_text + "\n" + else: + raw_for_parse = raw_text + parsed_value = ast.literal_eval(raw_for_parse) + output_arguments = json.dumps(parsed_value, ensure_ascii=False) + except Exception: + # Fallback: output as string as-is + output_arguments = json.dumps(raw_text, ensure_ascii=False) + parsed_value = raw_text + + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.tool_call_index - 1, + id=self.current_call_id, + type="function", + function=DeltaFunctionCall( + name=None, arguments=output_arguments + ), + ) + ] + ) + self._emit_delta(delta) + + # Clean up and store + self.should_emit_end_newline = False + self.parameters[param_name] = parsed_value + self.current_param_name = None + self.current_param_value = "" + self.current_param_value_converted = "" + self.start_quote_emitted = False + self.defer_current_parameter = False + self.deferred_param_raw_value = "" + return + + param_type = self._get_param_type(param_name) + + # convert complete parameter value by param_type + converted_value = self._convert_param_value(param_value, param_type) + + # Decide whether to add end quote based on parameter type + if param_type in ["string", "str", "text", "varchar", "char", "enum"]: + # For empty string parameters, need special handling + if not param_value and not self.start_quote_emitted: + # No start quote output, + # directly output complete empty string + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.tool_call_index - 1, + id=self.current_call_id, + type="function", + function=DeltaFunctionCall(name=None, arguments='""'), + ) + ] + ) + 
self._emit_delta(delta) + else: + # Non-empty parameter value, output end quote + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.tool_call_index - 1, + id=self.current_call_id, + type="function", + function=DeltaFunctionCall(name=None, arguments='"'), + ) + ] + ) + self._emit_delta(delta) + + self.should_emit_end_newline = False + # Store converted value + self.parameters[param_name] = converted_value + self.current_param_name = None + self.current_param_value = "" + self.current_param_value_converted = "" + self.start_quote_emitted = False + + elif name.startswith("function") or name == "function": + # if there are parameters, close JSON object + if self.parameters: + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.tool_call_index - 1, + id=self.current_call_id, + type="function", + function=DeltaFunctionCall(name=None, arguments="}"), + ) + ] + ) + self._emit_delta(delta) + # return empty object + else: + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.tool_call_index - 1, + id=self.current_call_id, + type="function", + function=DeltaFunctionCall(name=None, arguments="{}"), + ) + ] + ) + self._emit_delta(delta) + self.current_function_open = False + + elif name == "tool_call": + # Before ending tool_call, + # ensure function is closed to complete missing right brace + if self.current_function_open: + # If there are still unclosed parameters, close them first + if self.current_param_name: + self._end_element("parameter") + # Close function, ensure output '}' or '{}' + self._end_element("function") + # Final Delta + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.tool_call_index - 1, + id=self.current_call_id, + type="function", + function=DeltaFunctionCall(name=None, arguments=""), + ) + ] + ) + self._emit_delta(delta) + + # Check if there's text content to output (between tool_calls) + if self.text_content_buffer.strip(): + text_delta = DeltaMessage(content=self.text_content_buffer) + 
self._emit_delta(text_delta) + + self._reset_xml_parser_after_tool_call() + + def setup_parser(self): + """Set up XML parser event handlers""" + self.parser.buffer_text = True + self.parser.StartElementHandler = self._start_element + self.parser.EndElementHandler = self._end_element + self.parser.CharacterDataHandler = self._char_data + + def set_tools(self, tools: list[ChatCompletionToolsParam] | None): + """Set tool configuration information""" + self.tools = tools + + def _extract_function_name(self, name: str, attrs: dict[str, str]) -> str | None: + """Extract function name from various formats""" + if attrs and "name" in attrs: + return attrs["name"] + + if "=" in name: + parts = name.split("=", 1) + if len(parts) == 2 and parts[0] == "function": + return parts[1] + + return None + + def _extract_parameter_name(self, name: str, attrs: dict[str, str]) -> str | None: + """Extract parameter name from various formats""" + if attrs and "name" in attrs: + return attrs["name"] + + if "=" in name: + parts = name.split("=", 1) + if len(parts) == 2 and parts[0] == "parameter": + return parts[1] + + return None + + def _get_param_type(self, param_name: str) -> str: + """Get parameter type based on tool configuration, defaults to string + Args: + param_name: Parameter name + + Returns: + Parameter type + """ + if not self.tools or not self.current_function_name: + return "string" + + for tool in self.tools: + if not hasattr(tool, "type") or not ( + hasattr(tool, "function") and hasattr(tool.function, "name") + ): + continue + if ( + tool.type == "function" + and tool.function.name == self.current_function_name + ): + if not hasattr(tool.function, "parameters"): + return "string" + params = tool.function.parameters + if isinstance(params, dict) and "properties" in params: + properties = params["properties"] + if param_name in properties and isinstance( + properties[param_name], dict + ): + return self.repair_param_type( + str(properties[param_name].get("type", "string")) + 
) + elif isinstance(params, dict) and param_name in params: + param_config = params[param_name] + if isinstance(param_config, dict): + return self.repair_param_type( + str(param_config.get("type", "string")) + ) + break + return "string" + + def repair_param_type(self, param_type: str) -> str: + """Repair unknown parameter types by treating them as string + Args: + param_type: Parameter type + + Returns: + Repaired parameter type + """ + if ( + param_type in ["string", "str", "text", "varchar", "char", "enum"] + or param_type.startswith("int") + or param_type.startswith("uint") + or param_type.startswith("long") + or param_type.startswith("short") + or param_type.startswith("unsigned") + or param_type.startswith("num") + or param_type.startswith("float") + or param_type in ["boolean", "bool", "binary"] + or ( + param_type in ["object", "array", "arr", "sequence"] + or param_type.startswith("dict") + or param_type.startswith("list") + ) + ): + return param_type + else: + return "string" + + def _convert_param_value(self, param_value: str, param_type: str) -> Any: + """Convert value based on parameter type + Args: + param_value: Parameter value + param_type: Parameter type + + Returns: + Converted value + """ + if param_value.lower() == "null": + return None + + param_type = param_type.strip().lower() + if param_type in ["string", "str", "text", "varchar", "char", "enum"]: + return param_value + elif ( + param_type.startswith("int") + or param_type.startswith("uint") + or param_type.startswith("long") + or param_type.startswith("short") + or param_type.startswith("unsigned") + ): + try: + return int(param_value) + except (ValueError, TypeError): + logger.warning( + "Parsed value '%s' of parameter '%s' is not an integer " + "in tool '%s', degenerating to string.", + param_value, + ) + return param_value + elif param_type.startswith("num") or param_type.startswith("float"): + try: + float_param_value: float = float(param_value) + return ( + float_param_value + if 
float_param_value - int(float_param_value) != 0 + else int(float_param_value) + ) + except (ValueError, TypeError): + logger.warning( + "Parsed value '%s' of parameter '%s' is not a float " + "in tool '%s', degenerating to string.", + param_value, + ) + return param_value + elif param_type in ["boolean", "bool", "binary"]: + param_value = param_value.lower() + return param_value == "true" + else: + return param_value + + def _convert_for_json_streaming(self, converted_value: Any, param_type: str) -> str: + """Convert converted_value based on + whether it's empty and if type is string + Args: + converted_value: Converted value + param_type: Parameter type + + Returns: + Converted string for streaming output + """ + # Check if value is empty, but exclude numeric 0 + if converted_value is None or converted_value == "": + return "" + + if param_type in ["string", "str", "text", "varchar", "char", "enum"]: + # String type, remove double quotes + return json.dumps(converted_value, ensure_ascii=False)[1:-1] + else: + # Non-string type, return complete JSON string + if not isinstance(converted_value, str): + return json.dumps(converted_value, ensure_ascii=False) + else: + return converted_value + + def _reset_xml_parser_after_tool_call(self): + """ + Each tool_call is treated as a separate XML document, + so we need to reset the parser after each tool_call. 
+ """ + + # recreate XML parser + self.parser = ParserCreate() + self.setup_parser() + + # Reset current tool_call state + if self.current_call_id: + self.last_completed_call_id = self.current_call_id + self.current_call_id = None + self.current_function_name = None + self.current_function_open = False + self.parameters = {} + self.current_param_name = None + self.current_param_value = "" + self.current_param_value_converted = "" + self.current_param_is_first = False + self.should_emit_end_newline = False + self.start_quote_emitted = False + self.text_content_buffer = "" + + # Reset preprocessing and deferred parsing state + self._pre_inside_parameter = False + self._pre_param_buffer = "" + self._pre_current_param_name = None + self.defer_current_parameter = False + self.deferred_param_raw_value = "" + + +class Qwen3XMLToolParser(ToolParser): + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + self.parser = StreamingXMLToolCallParser() + + # Add missing attributes for compatibility with serving_chat.py + self.prev_tool_call_arr: list[dict] = [] + self.streamed_args_for_tool: list[str] = [] + + logger.info( + "vLLM Successfully import tool parser %s !", self.__class__.__name__ + ) + + def extract_tool_calls( + self, + model_output: str, + request: ChatCompletionRequest, + ) -> ExtractedToolCallInformation: + self.parser.reset_streaming_state() + # Reset tool call tracking arrays for new extraction + self.prev_tool_call_arr = [] + self.streamed_args_for_tool = [] + if request: + self.parser.set_tools(request.tools) + result = self.parser.parse_single_streaming_chunks(model_output) + if not result.tool_calls: + return ExtractedToolCallInformation( + tool_calls=[], + tools_called=False, + content=result.content, + ) + else: + tool_calls = [] + for tool_call in result.tool_calls: + if tool_call.function and tool_call.function.name: + tool_calls.append( + ToolCall( + id=tool_call.id, + type=tool_call.type, + function=FunctionCall( + 
name=tool_call.function.name, + arguments=tool_call.function.arguments, + ), + ) + ) + + # Update tool call tracking arrays for compatibility + tool_index = ( + tool_call.index + if tool_call.index is not None + else len(self.prev_tool_call_arr) - 1 + ) + + # Ensure we have enough entries in our tracking arrays + while len(self.prev_tool_call_arr) <= tool_index: + self.prev_tool_call_arr.append({"name": "", "arguments": ""}) + while len(self.streamed_args_for_tool) <= tool_index: + self.streamed_args_for_tool.append("") + + # Update tool call information + self.prev_tool_call_arr[tool_index]["name"] = ( + tool_call.function.name + ) + self.prev_tool_call_arr[tool_index]["arguments"] = ( + tool_call.function.arguments + ) + + # Update streamed arguments + if tool_call.function.arguments: + self.streamed_args_for_tool[tool_index] = ( + tool_call.function.arguments + ) + + return ExtractedToolCallInformation( + tool_calls=tool_calls, + tools_called=len(tool_calls) > 0, + content=result.content, + ) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> DeltaMessage | None: + if not previous_text: + self.parser.reset_streaming_state() + # Reset tool call tracking arrays for new streaming session + self.prev_tool_call_arr = [] + self.streamed_args_for_tool = [] + if request: + self.parser.set_tools(request.tools) + + # Model sometimes outputs separately causing delta_text to be empty. 
+ # If there were tool_calls before and all current tool_calls have ended, + # return an empty tool_call for outer streaming output + # to correctly output tool_call field + if not delta_text and delta_token_ids: + open_calls = current_text.count( + self.parser.tool_call_start_token + ) - current_text.count(self.parser.tool_call_end_token) + if ( + open_calls == 0 + and self.parser.tool_call_index > 0 + or not self.parser.tool_call_index + and current_text + ): + return DeltaMessage(content="") + return None + + # Parse the delta text and get the result + result = self.parser.parse_single_streaming_chunks(delta_text) + + # Update tool call tracking arrays based on incremental parsing results + if result and result.tool_calls: + for tool_call in result.tool_calls: + if tool_call.function: + tool_index = ( + tool_call.index + if tool_call.index is not None + else len(self.prev_tool_call_arr) - 1 + ) + + # Ensure we have enough entries in our tracking arrays + while len(self.prev_tool_call_arr) <= tool_index: + self.prev_tool_call_arr.append({"name": "", "arguments": ""}) + while len(self.streamed_args_for_tool) <= tool_index: + self.streamed_args_for_tool.append("") + + # Update tool name if provided + if tool_call.function.name: + self.prev_tool_call_arr[tool_index]["name"] = ( + tool_call.function.name + ) + + # Update arguments incrementally + if tool_call.function.arguments is not None: + # Concatenate the incremental arguments + # to the existing streamed arguments + self.prev_tool_call_arr[tool_index]["arguments"] += ( + tool_call.function.arguments + ) + self.streamed_args_for_tool[tool_index] += ( + tool_call.function.arguments + ) + return result diff --git a/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py b/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py new file mode 100644 index 0000000..8aed7f0 --- /dev/null +++ b/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py @@ -0,0 +1,744 @@ +# SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Adapted from qwen3coder xml parser, All rights reserved. +# ruff: noqa: E501 + +import ast +import json +import uuid +from collections.abc import Sequence +from typing import Any + +import regex as re + +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + ChatCompletionToolsParam, + DeltaFunctionCall, + DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, + ToolCall, +) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser, +) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer + +logger = init_logger(__name__) + + +class SeedOssToolParser(ToolParser): + TOOL_CALL_START = "" + TOOL_CALL_END = "" + + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + # --- streaming state --- + self._reset_streaming_state() + self.prev_tool_call_arr: list[dict] = [] + + self.tool_call_start_token: str = self.TOOL_CALL_START + self.tool_call_end_token: str = self.TOOL_CALL_END + # Sentinel tokens for streaming mode + self.tool_call_prefix: str = " or its closing tag." 
def _parse_xml_function_call(
    self, function_call_str: str, tools: list[ChatCompletionToolsParam] | None
) -> ToolCall | None:
    """Build a ``ToolCall`` from the raw text of a single function call.

    ``function_call_str`` must begin with the function name followed by
    ``>``. The remainder is scanned with ``self.tool_call_parameter_regex``;
    every match is split at its first ``>`` into a parameter name and a raw
    string value. One leading and one trailing newline are removed from each
    value before it is converted according to the ``type`` declared for that
    parameter in the matching entry of ``tools``.

    Args:
        function_call_str: Text of one function call, starting at the
            function name.
        tools: Tool definitions supplied with the request, or ``None``.

    Returns:
        A ``ToolCall`` whose ``arguments`` is the JSON encoding of the
        converted parameters. The annotation allows ``None`` but this
        implementation always returns a ``ToolCall``.

    Raises:
        ValueError: If ``function_call_str`` or a matched parameter does not
            contain ``>``.
    """

    def get_arguments_config(func_name: str) -> dict:
        """Return the parameter schema declared for ``func_name`` or ``{}``."""
        if tools is None:
            return {}
        for config in tools:
            if not hasattr(config, "type") or not (
                hasattr(config, "function") and hasattr(config.function, "name")
            ):
                continue
            if config.type == "function" and config.function.name == func_name:
                if not hasattr(config.function, "parameters"):
                    return {}
                params = config.function.parameters
                if isinstance(params, dict) and "properties" in params:
                    return params["properties"]
                if isinstance(params, dict):
                    return params
                return {}
        logger.warning("Tool '%s' is not defined in the tools list.", func_name)
        return {}

    def convert_param_value(
        param_value: str, param_name: str, param_config: dict, func_name: str
    ) -> Any:
        """Convert one raw string value to the type declared for it."""
        # A literal "null" maps to None regardless of the declared type.
        if param_value.lower() == "null":
            return None

        if param_name not in param_config:
            if param_config != {}:
                logger.warning(
                    "Parsed parameter '%s' is not defined in "
                    "the tool parameters for tool '%s', "
                    "directly returning the string value.",
                    param_name,
                    func_name,
                )
            return param_value

        declared = param_config[param_name]
        if isinstance(declared, dict) and "type" in declared:
            param_type = str(declared["type"]).strip().lower()
        else:
            param_type = "string"

        if param_type in ("string", "str", "text", "varchar", "char", "enum"):
            return param_value

        if param_type.startswith(("int", "uint", "long", "short", "unsigned")):
            try:
                return int(param_value)
            except (ValueError, TypeError):
                logger.warning(
                    "Parsed value '%s' of parameter '%s' is not an integer in tool "
                    "'%s', degenerating to string.",
                    param_value,
                    param_name,
                    func_name,
                )
                return param_value

        if param_type.startswith(("num", "float")):
            try:
                float_value = float(param_value)
                # int() raises ValueError for NaN and OverflowError for
                # +/-infinity. Neither is representable in JSON, so both
                # degrade to the raw string like any other bad number.
                int_value = int(float_value)
            except (ValueError, TypeError, OverflowError):
                logger.warning(
                    "Parsed value '%s' of parameter '%s' is not a float in tool "
                    "'%s', degenerating to string.",
                    param_value,
                    param_name,
                    func_name,
                )
                return param_value
            # Whole numbers are emitted as ints, everything else as floats.
            return int_value if float_value == int_value else float_value

        if param_type in ("boolean", "bool", "binary"):
            lowered = param_value.lower()
            if lowered not in ("true", "false"):
                logger.warning(
                    "Parsed value '%s' of parameter '%s' is not a boolean "
                    "(`true` or `false`) in tool '%s', degenerating to false.",
                    lowered,
                    param_name,
                    func_name,
                )
            return lowered == "true"

        # Any other declared type: objects/dicts try JSON first, then every
        # remaining type (arrays included) falls back to a Python literal.
        if param_type == "object" or param_type.startswith("dict"):
            try:
                return json.loads(param_value)
            except (ValueError, TypeError):
                logger.warning(
                    "Parsed value '%s' of parameter '%s' is not a valid JSON "
                    "object in tool '%s', will try other methods to parse it.",
                    param_value,
                    param_name,
                    func_name,
                )
        try:
            converted = ast.literal_eval(param_value)
            # literal_eval can yield values JSON cannot encode (sets, bytes,
            # complex, inf/nan). Probe here so that one such value does not
            # make the final json.dumps of the whole call fail.
            json.dumps(converted, allow_nan=False)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            logger.warning(
                "Parsed value '%s' of parameter '%s' cannot be converted via "
                "Python `ast.literal_eval()` in tool '%s', degenerating to string.",
                param_value,
                param_name,
                func_name,
            )
            return param_value
        return converted

    # The function name is everything up to the first ">".
    end_index = function_call_str.index(">")
    function_name = function_call_str[:end_index]
    param_config = get_arguments_config(function_name)
    parameters = function_call_str[end_index + 1 :]

    param_dict = {}
    for match in self.tool_call_parameter_regex.findall(parameters):
        # The regex has two alternatives; exactly one group is non-empty.
        match_text = match[0] if match[0] else match[1]
        idx = match_text.index(">")
        param_name = match_text[:idx]
        param_value = str(match_text[idx + 1 :])
        # Drop the single newline that wraps the value on either side.
        if param_value.startswith("\n"):
            param_value = param_value[1:]
        if param_value.endswith("\n"):
            param_value = param_value[:-1]

        param_dict[param_name] = convert_param_value(
            param_value, param_name, param_config, function_name
        )

    return ToolCall(
        type="function",
        function=FunctionCall(
            name=function_name, arguments=json.dumps(param_dict, ensure_ascii=False)
        ),
    )
self, + model_output: str, + request: ChatCompletionRequest, + ) -> ExtractedToolCallInformation: + # Quick check to avoid unnecessary processing + if self.tool_call_prefix not in model_output: + return ExtractedToolCallInformation( + tools_called=False, tool_calls=[], content=model_output + ) + + # Check if both think start and end tokens are present + if ( + self.think_start_token in model_output + and self.think_end_token in model_output + ): + # Find the position of think end token + think_end_index = model_output.find(self.think_end_token) + len( + self.think_end_token + ) + # Extract content after think end token + result_content = model_output[think_end_index:] + thinking_content = model_output[:think_end_index] + else: + thinking_content = "" + result_content = model_output + + try: + function_calls = self._get_function_calls(result_content) + if len(function_calls) == 0: + return ExtractedToolCallInformation( + tools_called=False, tool_calls=[], content=model_output + ) + + tool_calls = [ + self._parse_xml_function_call(function_call_str, request.tools) + for function_call_str in function_calls + ] + + # Populate prev_tool_call_arr for serving layer to set finish_reason + self.prev_tool_call_arr.clear() # Clear previous calls + for tool_call in tool_calls: + if tool_call: + self.prev_tool_call_arr.append( + { + "name": tool_call.function.name, + "arguments": tool_call.function.arguments, + } + ) + + # Extract content before tool calls + tool_call_start_index = result_content.find(self.tool_call_start_token) + tool_call_start_index = ( + tool_call_start_index + if tool_call_start_index >= 0 + else result_content.find(self.tool_call_prefix) + ) + content = thinking_content + result_content[:tool_call_start_index] + + return ExtractedToolCallInformation( + tools_called=(len(tool_calls) > 0), + tool_calls=tool_calls, + content=content if content else None, + ) + + except Exception: + logger.exception("Error in extracting tool call from response.") + return 
ExtractedToolCallInformation( + tools_called=False, tool_calls=[], content=model_output + ) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> DeltaMessage | None: + # If no delta text, return None unless + # it's an EOS token after tool calls + if not delta_text: + # Check if this is an EOS token after all tool calls are complete + # We check for tool calls in the text even if is_tool_call_started + # is False because it might have been reset after processing all tools + if delta_token_ids and self.tool_call_end_token_id not in delta_token_ids: + # Count complete tool calls + complete_calls = len( + self.tool_call_complete_regex.findall(current_text) + ) + + # If we have completed tool calls and populated prev_tool_call_arr + if complete_calls > 0 and len(self.prev_tool_call_arr) > 0: + # Check if all tool calls are closed + open_calls = current_text.count( + self.tool_call_start_token + ) - current_text.count(self.tool_call_end_token) + if open_calls == 0: + # Return empty delta message to allow finish_reason processing + return DeltaMessage(content="") + elif not self.is_tool_call_started and current_text: + # This is a regular content response that's now complete + return DeltaMessage(content="") + return None + + # Check if this is the first call (reset state if needed) + if not previous_text: + self._reset_streaming_state() + + # Update accumulated text + self.accumulated_text = current_text + + # Check if we need to advance to next tool + if self.json_closed and not self.in_function: + # Check if this tool call has ended + tool_ends = current_text.count(self.tool_call_end_token) + if tool_ends > self.current_tool_index: + # This tool has ended, advance to next + self.current_tool_index += 1 + self.header_sent = False + self.param_count = 0 + 
self.json_started = False + self.json_closed = False + + # Check if there are more tool calls + if self.current_tool_index >= current_text.count( + self.tool_call_start_token + ): + # No more tool calls + self.is_tool_call_started = False + # Continue processing next tool + return None + + # Check if end thinking + if not self.is_thinking_end and ( + self.think_end_token_id in delta_token_ids + or self.think_end_token in delta_text + ): + self.is_thinking_end = True + + # If thinking hasn't ended yet, don't process any tool calls + if not self.is_thinking_end: + return DeltaMessage(content=delta_text) + + # Handle normal content before tool calls + if not self.is_tool_call_started: + # Check if tool call is starting + if ( + self.tool_call_start_token_id in delta_token_ids + or self.tool_call_start_token in delta_text + ): + self.is_tool_call_started = True + # Return any content before the tool call + if self.tool_call_start_token in delta_text: + content_before = delta_text[ + : delta_text.index(self.tool_call_start_token) + ] + if content_before: + return DeltaMessage(content=content_before) + return None + else: + # Check if we're between tool calls - skip whitespace + if ( + current_text.rstrip().endswith(self.tool_call_end_token) + and delta_text.strip() == "" + ): + # We just ended a tool call, skip whitespace + return None + # Normal content, no tool call + return DeltaMessage(content=delta_text) + + # Check if we're between tool calls (waiting for next one) + # Count tool calls we've seen vs processed + tool_starts_count = current_text.count(self.tool_call_start_token) + if self.current_tool_index >= tool_starts_count: + # We're past all tool calls, shouldn't be here + return None + + # We're in a tool call, find the current tool call portion + # Need to find the correct tool call based on current_tool_index + # Only process tool calls after think_end_token + think_end_index = ( + current_text.find(self.think_end_token) + len(self.think_end_token) + if 
self.think_end_token in current_text + else 0 + ) + tool_starts: list[int] = [] + idx = think_end_index + while True: + idx = current_text.find(self.tool_call_start_token, idx) + if idx == -1: + break + tool_starts.append(idx) + idx += len(self.tool_call_start_token) + + if self.current_tool_index >= len(tool_starts): + # No more tool calls to process yet + return None + + tool_start_idx = tool_starts[self.current_tool_index] + # Find where this tool call ends (or current position if not ended yet) + tool_end_idx = current_text.find(self.tool_call_end_token, tool_start_idx) + if tool_end_idx == -1: + tool_text = current_text[tool_start_idx:] + else: + tool_text = current_text[ + tool_start_idx : tool_end_idx + len(self.tool_call_end_token) + ] + + # Looking for function header + if not self.header_sent: + if self.tool_call_prefix in tool_text: + func_start = tool_text.find(self.tool_call_prefix) + len( + self.tool_call_prefix + ) + func_end = tool_text.find(">", func_start) + + if func_end != -1: + # Found complete function name + self.current_function_name = tool_text[func_start:func_end] + self.current_tool_id = self._generate_tool_call_id() # type: ignore + self.header_sent = True + self.in_function = True + + # IMPORTANT: Add to prev_tool_call_arr immediately when we detect a tool call + # This ensures finish_reason="tool_calls" even if parsing isn't complete + already_added = any( + tool.get("name") == self.current_function_name + for tool in self.prev_tool_call_arr + ) + if not already_added: + self.prev_tool_call_arr.append( + { + "name": self.current_function_name, + "arguments": "{}", # Placeholder, will be updated later + } + ) + + # Send header with function info + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + id=self.current_tool_id, + function=DeltaFunctionCall( + name=self.current_function_name, arguments="" + ), + type="function", + ) + ] + ) + return None + + # We've sent header, now handle function body + 
if self.in_function: + # Send opening brace if not sent yet + if not self.json_started and self.parameter_prefix not in delta_text: + self.json_started = True + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + function=DeltaFunctionCall(arguments="{"), + ) + ] + ) + + # Make sure json_started is set if we're processing parameters + if not self.json_started: + self.json_started = True + + # Check for function end in accumulated text + if not self.json_closed and self.function_end_token in tool_text: + # Close JSON + self.json_closed = True + + # Extract the complete tool call to update prev_tool_call_arr with final arguments + # Find the function content + func_start = tool_text.find(self.tool_call_prefix) + len( + self.tool_call_prefix + ) + func_content_end = tool_text.find(self.function_end_token, func_start) + if func_content_end != -1: + func_content = tool_text[func_start:func_content_end] + # Parse to get the complete arguments + try: + parsed_tool = self._parse_xml_function_call( + func_content, request.tools if request else None + ) + if parsed_tool: + # Update existing entry in prev_tool_call_arr with complete arguments + for i, tool in enumerate(self.prev_tool_call_arr): + if tool.get("name") == parsed_tool.function.name: + self.prev_tool_call_arr[i]["arguments"] = ( + parsed_tool.function.arguments + ) + break + except Exception: + logger.warning( + "Failed to parse tool arguments during streaming.", + exc_info=True, + ) + + result = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + function=DeltaFunctionCall(arguments="}"), + ) + ] + ) + + # Reset state for next tool + self.in_function = False + self.json_closed = True + + return result + + # Look for parameters + # Count how many complete parameters we have processed + complete_params = tool_text.count(self.parameter_end_token) + + # Check if we should start a new parameter + if not self.in_param and self.param_count < 
complete_params: + # Find the unprocessed parameter + # Count parameter starts + param_starts = [] + idx = 0 + while True: + idx = tool_text.find(self.parameter_prefix, idx) + if idx == -1: + break + param_starts.append(idx) + idx += len(self.parameter_prefix) + + if len(param_starts) > self.param_count: + # Process the next parameter + param_idx = param_starts[self.param_count] + param_start = param_idx + len(self.parameter_prefix) + remaining = tool_text[param_start:] + + if ">" in remaining: + # We have the complete parameter name + name_end = remaining.find(">") + self.current_param_name = remaining[:name_end] + + # Find the parameter value + value_start = param_start + name_end + 1 + value_text = tool_text[value_start:] + if value_text.startswith("\n"): + value_text = value_text[1:] + + # Find where this parameter ends + param_end_idx = value_text.find(self.parameter_end_token) + if param_end_idx != -1: + # Complete parameter found + param_value = value_text[:param_end_idx] + if param_value.endswith("\n"): + param_value = param_value[:-1] + + # Build complete JSON fragment for this parameter + if self.param_count == 0: + json_fragment = ( + '"' + + self.current_param_name + + '": "' + + json.dumps(param_value)[1:-1] + + '"' + ) + else: + json_fragment = ( + ', "' + + self.current_param_name + + '": "' + + json.dumps(param_value)[1:-1] + + '"' + ) + + self.param_count += 1 + + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + function=DeltaFunctionCall( + arguments=json_fragment + ), + ) + ] + ) + + # Continue parameter value + if self.in_param: + if self.parameter_end_token in delta_text: + # End of parameter + end_idx = delta_text.find(self.parameter_end_token) + value_chunk = delta_text[:end_idx] + + # Skip past > if at start + if not self.current_param_value and ">" in value_chunk: + gt_idx = value_chunk.find(">") + value_chunk = value_chunk[gt_idx + 1 :] + + if not self.current_param_value and 
value_chunk.startswith("\n"): + value_chunk = value_chunk[1:] + + # Calculate incremental JSON + full_value = self.current_param_value + value_chunk + prev_escaped = ( + json.dumps(self.current_param_value)[1:-1] + if self.current_param_value + else "" + ) + full_escaped = json.dumps(full_value)[1:-1] + delta_escaped = full_escaped[len(prev_escaped) :] + + self.in_param = False + self.current_param_value = "" + + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + function=DeltaFunctionCall( + arguments=delta_escaped + '"' + ), + ) + ] + ) + else: + # Continue accumulating value + value_chunk = delta_text + + # Handle first chunk after param name + if not self.current_param_value and ">" in value_chunk: + gt_idx = value_chunk.find(">") + value_chunk = value_chunk[gt_idx + 1 :] + + if not self.current_param_value and value_chunk.startswith("\n"): + value_chunk = value_chunk[1:] + + if value_chunk: + # Stream the escaped delta + prev_escaped = ( + json.dumps(self.current_param_value)[1:-1] + if self.current_param_value + else "" + ) + self.current_param_value += value_chunk + full_escaped = json.dumps(self.current_param_value)[1:-1] + delta_escaped = full_escaped[len(prev_escaped) :] + + if delta_escaped: + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_index, + function=DeltaFunctionCall( + arguments=delta_escaped + ), + ) + ] + ) + + return None diff --git a/entrypoints/openai/tool_parsers/step3_tool_parser.py b/entrypoints/openai/tool_parsers/step3_tool_parser.py new file mode 100644 index 0000000..adcb9f4 --- /dev/null +++ b/entrypoints/openai/tool_parsers/step3_tool_parser.py @@ -0,0 +1,303 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import contextlib +import json +from collections.abc import Sequence +from typing import Any + +import regex as re + +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + 
def _cast_arguments(
    self,
    func_name: str,
    params: dict[str, Any],
    request: ChatCompletionRequest,
) -> dict[str, Any]:
    """Cast string argument values to the types declared in the tool schema.

    Finds the first tool in ``request.tools`` whose function name equals
    ``func_name`` and, for every string value in ``params``, converts it
    according to the ``type`` of the same-named entry under the schema's
    ``properties``. Non-string values, parameters without a recognised
    type, and values that fail to convert are left untouched.

    Args:
        func_name: Name of the function whose schema should be applied.
        params: Parsed arguments; updated in place.
        request: Request object providing ``tools``.

    Returns:
        The same ``params`` dict that was passed in.
    """
    import math  # local import: only needed to reject NaN/inf below

    tool = next(
        (t for t in request.tools or [] if t.function.name == func_name), None
    )
    if tool is None:
        return params

    schema = tool.function.parameters or {}
    properties = schema.get("properties") or {}
    if not isinstance(properties, dict):
        return params

    # Only existing keys are reassigned, so mutating while iterating is safe.
    for key, value in params.items():
        if not isinstance(value, str):
            continue
        prop = properties.get(key)
        # JSON Schema also allows boolean sub-schemas; only dicts carry "type".
        typ = prop.get("type") if isinstance(prop, dict) else None

        if typ == "string":
            params[key] = value.strip()
        elif typ == "integer":
            try:
                params[key] = int(value)
            except ValueError:
                pass  # keep the raw string
        elif typ == "number":
            try:
                number = float(value)
            except ValueError:
                continue  # keep the raw string
            # NaN and +/-inf would serialise as invalid JSON downstream.
            if math.isfinite(number):
                params[key] = number
        elif typ == "boolean":
            lowered = value.lower()
            if lowered in ("true", "false"):
                params[key] = lowered == "true"
        elif typ == "null":
            if value.lower() == "null":
                params[key] = None
    return params
+ self.position = len(current_text) + return DeltaMessage(content=unprocessed_text) + else: + content = unprocessed_text[:start_pos] + self.position += len(content) + return DeltaMessage(content=content) + + # STATE: Inside the main tool block. + offset = len(unprocessed_text) - len(unprocessed_text.lstrip()) + unprocessed_text = unprocessed_text.lstrip() + self.position += offset + + if unprocessed_text.startswith(self.TOOL_CALLS_END): + self.position += len(self.TOOL_CALLS_END) + self.tool_block_finished = True + self.current_tool_id = -1 + continue + + # Check if we are between tool calls. + tool_finished = self.current_tool_id != -1 and self.prev_tool_call_arr[ + self.current_tool_id + ].get("finished") + if self.current_tool_id == -1 or tool_finished: + if unprocessed_text.startswith(self.TOOL_CALL_BEGIN): + self.position += len(self.TOOL_CALL_BEGIN) + if self.current_tool_id == -1: + self.current_tool_id = 0 + else: + self.current_tool_id += 1 + self.current_tool_name_sent = False + while len(self.prev_tool_call_arr) <= self.current_tool_id: + self.prev_tool_call_arr.append({}) + self.prev_tool_call_arr[self.current_tool_id]["finished"] = False + continue + + if self.TOOL_CALL_BEGIN.startswith(unprocessed_text): + return None + + # STATE: Parsing an active tool call. + if self.current_tool_id != -1 and not self.prev_tool_call_arr[ + self.current_tool_id + ].get("finished", False): + end_tool_pos = unprocessed_text.find(self.TOOL_CALL_END) + if end_tool_pos == -1: + tool_body = unprocessed_text + else: + tool_body = unprocessed_text[:end_tool_pos] + + if end_tool_pos == -1 and self.TOOL_CALL_END.startswith(tool_body): + return None + + function_name, arguments = self._parse_steptml_invoke(tool_body) + if not function_name: + return None + + tool_call_arr = {"name": function_name, "parameters": arguments or {}} + + # Send the function name as soon as it's parsed. 
+ if not self.current_tool_name_sent: + self.current_tool_name_sent = True + self.prev_tool_call_arr[self.current_tool_id].update(tool_call_arr) + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + type="function", + id=f"chatcmpl-tool-{random_uuid()}", + function=DeltaFunctionCall(name=function_name), + ) + ] + ) + + # Update our internal state with the latest parsed arguments. + self.prev_tool_call_arr[self.current_tool_id].update( # noqa: E501 + tool_call_arr + ) + + # Only send arguments when the tool call is complete. + if end_tool_pos != -1: + self.position += end_tool_pos + len(self.TOOL_CALL_END) + self.prev_tool_call_arr[self.current_tool_id]["finished"] = True + + final_args = self._cast_arguments( + function_name, + tool_call_arr.get("parameters", {}), # type: ignore + request, + ) + if final_args: + final_args_json = json.dumps(final_args, ensure_ascii=False) + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=final_args_json + ), + ) + ] + ) + + # If tool is not finished, return None to wait for more tokens. 
def extract_tool_calls(
    self,
    model_output: str,
    request: ChatCompletionRequest,
) -> ExtractedToolCallInformation:
    """Extract all tool calls from a complete (non-streamed) model output.

    The output must contain one ``TOOL_CALLS_BEGIN`` ... ``TOOL_CALLS_END``
    block. Inside it, each ``TOOL_CALL_BEGIN`` ... ``TOOL_CALL_END`` segment
    is split at ``TOOL_SEP`` into a type and an invoke body; only segments
    whose type is ``function`` and whose body parses are kept.

    Args:
        model_output: Full text produced by the model.
        request: The originating request, used to cast argument types.

    Returns:
        Extraction result with ``tools_called=True`` and the text outside
        the tool block as ``content`` (``None`` when empty) if at least one
        call was parsed; otherwise ``tools_called=False`` with the whole
        ``model_output`` as ``content``.
    """

    def plain_content() -> ExtractedToolCallInformation:
        # Fallback used whenever no usable tool call is found.
        return ExtractedToolCallInformation(
            tools_called=False, tool_calls=[], content=model_output
        )

    leading, begin_marker, remainder = model_output.partition(self.TOOL_CALLS_BEGIN)
    if not begin_marker:
        return plain_content()

    tool_block, end_marker, trailing = remainder.partition(self.TOOL_CALLS_END)
    if not end_marker:
        return plain_content()

    parsed_calls: list[ToolCall] = []
    for segment in tool_block.split(self.TOOL_CALL_BEGIN):
        # Segments without a closing marker (including the empty lead-in
        # before the first call) are ignored.
        body, call_end, _ = segment.partition(self.TOOL_CALL_END)
        if not call_end:
            continue

        kind, separator, invoke_text = body.partition(self.TOOL_SEP)
        if not separator or kind.strip() != "function":
            continue

        function_name, params_dict = self._parse_steptml_invoke(invoke_text)
        if not function_name or params_dict is None:
            continue

        cast_params = self._cast_arguments(function_name, params_dict, request)
        parsed_calls.append(
            ToolCall(
                function=FunctionCall(
                    name=function_name,
                    arguments=json.dumps(cast_params, ensure_ascii=False),
                )
            )
        )

    if not parsed_calls:
        return plain_content()

    remaining_text = (leading + trailing).strip()
    return ExtractedToolCallInformation(
        tools_called=True,
        tool_calls=parsed_calls,
        content=remaining_text if remaining_text else None,
    )
def find_common_prefix(s1: str, s2: str) -> str:
    """
    Return the longest prefix shared by two strings ("" if there is none).
    Order of arguments is NOT important.

    This function is provided as a UTILITY for extracting information from JSON
    generated by partial_json_parser, to help in ensuring that the right tokens
    are returned in streaming, so that close-quotes, close-brackets and
    close-braces are not returned prematurely.

    e.g. find_common_prefix('{"fruit": "ap"}', '{"fruit": "apple"}') ->
    '{"fruit": "ap'
    """
    length = 0
    for left, right in zip(s1, s2):
        if left != right:
            break
        length += 1
    # One slice instead of growing a string one character at a time.
    return s1[:length]


def find_common_suffix(s1: str, s2: str) -> str:
    """
    Return the common suffix shared by two strings ("" if there is none).
    Order of arguments is NOT important.
    Stops when the strings differ OR when it hits an alphanumeric character,
    so the result only ever contains non-alphanumeric characters.

    e.g. find_common_suffix('{"fruit": "ap"}', '{"fruit": "apple"}') -> '"}'
    """
    length = 0
    for left, right in zip(reversed(s1), reversed(s2)):
        if left != right or left.isalnum():
            break
        length += 1
    # Slice from an explicit start index: s1[-0:] would be the whole string.
    return s1[len(s1) - length :]


def extract_intermediate_diff(curr: str, old: str) -> str:
    """
    Given two strings, extract the difference in the middle between two strings
    that are known to have a common prefix and/or suffix.

    This function is provided as a UTILITY for extracting information from JSON
    generated by partial_json_parser, to help in ensuring that the right tokens
    are returned in streaming, so that close-quotes, close-brackets and
    close-braces are not returned prematurely. The order of arguments IS
    important - the new version of the partially-parsed JSON must be the first
    argument, and the second argument must be from the previous generation.

    What it returns is the text that should be streamed to the client.

    e.g. extract_intermediate_diff('{"fruit": "apple"}', '{"fruit": "ap"}')
        -> 'ple'
    """
    suffix = find_common_suffix(curr, old)

    # The prefix is computed against `old` without the shared suffix so that
    # closing characters are not counted as already-streamed content.
    prefix = find_common_prefix(curr, old.removesuffix(suffix))

    # removeprefix is a no-op when the prefix is longer than what remains,
    # which matches a single str.replace on a string that starts with it.
    return curr.removesuffix(suffix).removeprefix(prefix)
def consume_space(i: int, s: str) -> int:
    """Skip whitespace in ``s`` starting at index ``i``.

    Returns the index of the first character from ``i`` onward for which
    ``str.isspace()`` is false, or ``i`` advanced up to ``len(s)`` when only
    whitespace remains. An ``i`` at or beyond ``len(s)`` is returned as is.
    """
    end = len(s)
    cursor = i
    while True:
        if cursor >= end or not s[cursor].isspace():
            return cursor
        cursor += 1
def get_json_schema_from_tools(
    tool_choice: str | ToolChoiceFunction | ChatCompletionNamedToolChoiceParam,
    tools: list[FunctionTool | ChatCompletionToolsParam] | None,
) -> str | dict | None:
    """Return the JSON schema implied by ``tool_choice`` for ``tools``.

    * ``"none"`` / ``None``, or ``tools`` being ``None``: no schema.
    * A ``ToolChoiceFunction`` (Responses API): the ``parameters`` of the
      ``FunctionTool`` with that name.
    * A ``ChatCompletionNamedToolChoiceParam`` (Chat Completions API): the
      ``function.parameters`` of the ``ChatCompletionToolsParam`` with that
      name.
    * ``"required"``: the combined schema built by
      ``_get_json_schema_from_tools``.
    * Anything else (e.g. ``"auto"``): no schema.

    Raises:
        ValueError: If a forced function name is not present in ``tools``.
    """
    if tool_choice in ("none", None) or tools is None:
        return None

    def lookup(by_name: dict, wanted: str):
        # Shared "forced function" lookup with the original error message.
        if wanted not in by_name:
            raise ValueError(f"Tool '{wanted}' has not been passed in `tools`.")
        return by_name[wanted]

    is_object_choice = not isinstance(tool_choice, str)

    # Forced function, Responses API flavour.
    if is_object_choice and isinstance(tool_choice, ToolChoiceFunction):
        response_tools = {
            candidate.name: candidate
            for candidate in tools
            if isinstance(candidate, FunctionTool)
        }
        return lookup(response_tools, tool_choice.name).parameters

    # Forced function, Chat Completions flavour.
    if is_object_choice and isinstance(
        tool_choice, ChatCompletionNamedToolChoiceParam
    ):
        chat_tools = {
            candidate.function.name: candidate
            for candidate in tools
            if isinstance(candidate, ChatCompletionToolsParam)
        }
        return lookup(chat_tools, tool_choice.function.name).function.parameters

    # "required" constrains the output to the tools; "auto" does not.
    return _get_json_schema_from_tools(tools) if tool_choice == "required" else None
class xLAMToolParser(ToolParser):
    """Tool-call parser for model output that carries a JSON array of calls.

    The array (``[{"name": ..., "arguments": {...}}, ...]``) may appear bare,
    inside a fenced code block, after a ``[TOOL_CALLS]`` marker, inside
    ``<tool_call>...</tool_call>`` tags, or after a closing ``</think>`` tag.
    """

    def __init__(self, tokenizer: AnyTokenizer):
        super().__init__(tokenizer)

        # Initialize state for streaming mode
        self.prev_tool_calls: list[dict] = []
        self.current_tool_id = -1
        self.current_tool_name_sent = False
        self.streamed_args: list[str] = []  # Track arguments sent for each tool

        # For backward compatibility with tests
        self.current_tools_sent: list[bool] = []

        # For backward compatibility with serving code
        self.prev_tool_call_arr = []

        # Regex patterns for preprocessing. Each pattern has exactly one
        # capture group holding the candidate JSON text. The tag literals
        # must be non-empty: an empty tag turns the pattern into
        # "match anything", which defeats the detection logic below.
        self.json_code_block_patterns = [
            r"```(?:json)?\s*([\s\S]*?)```",
            r"\[TOOL_CALLS\]([\s\S]*?)(?=\n|$)",
            r"<tool_call>([\s\S]*?)</tool_call>",
        ]
        self.thinking_tag_pattern = r"</think>([\s\S]*)"

        # Define streaming state type to be initialized later
        self.streaming_state: dict[str, Any] = {
            "current_tool_index": -1,
            "tool_ids": [],
            "sent_tools": [],
        }

    def preprocess_model_output(
        self, model_output: str
    ) -> tuple[Optional[str], Optional[str]]:
        """
        Preprocess the model output to extract content and potential tool calls.

        Returns:
            Tuple of (content, potential_tool_calls_json). The second item is
            ``None`` when no tool-call JSON was found; the first is ``None``
            when the whole output is treated as tool calls.
        """
        # Check for thinking tag
        thinking_match = re.search(self.thinking_tag_pattern, model_output)
        if thinking_match:
            # Content is everything up to and including the tag, i.e. up to
            # where the captured tail begins.
            content = model_output[: thinking_match.start(1)].strip()
            thinking_content = thinking_match.group(1).strip()

            # Try to parse the text after the tag as JSON
            try:
                json.loads(thinking_content)
                return content, thinking_content
            except json.JSONDecodeError:
                # If can't parse as JSON, look for JSON code blocks
                for json_pattern in self.json_code_block_patterns:
                    for json_str in re.findall(json_pattern, thinking_content):
                        try:
                            json.loads(json_str)
                            return content, json_str
                        except json.JSONDecodeError:
                            continue

        # Check for JSON code blocks in the entire output
        for json_pattern in self.json_code_block_patterns:
            for json_str in re.findall(json_pattern, model_output):
                try:
                    json.loads(json_str)
                except json.JSONDecodeError:
                    continue
                # Extract content by removing the JSON code block
                content = re.sub(json_pattern, "", model_output).strip()
                return content, json_str

        # If the entire output is a valid JSON array or looks like one,
        # treat it as tool calls
        if model_output.strip().startswith("["):
            try:
                json.loads(model_output)
                return None, model_output
            except json.JSONDecodeError:
                # Even if it's not valid JSON yet, it might be a tool call
                # in progress
                if (
                    "{" in model_output
                    and "name" in model_output
                    and "arguments" in model_output
                ):
                    return None, model_output

        # If no tool calls found, return the original output as content
        return model_output, None

    def extract_tool_calls(
        self, model_output: str, request: ChatCompletionRequest
    ) -> ExtractedToolCallInformation:
        """
        Extract tool calls from a complete model output.

        Entries that are not dicts with both ``name`` and ``arguments`` are
        skipped. Any exception is logged and the raw output is returned as
        plain content with ``tools_called=False``.
        """
        try:
            # Preprocess the model output
            content, potential_tool_calls = self.preprocess_model_output(model_output)

            if not potential_tool_calls:
                return ExtractedToolCallInformation(
                    tools_called=False, tool_calls=[], content=content
                )

            # Parse the potential tool calls as JSON
            tool_calls_data = json.loads(potential_tool_calls)

            # Ensure it's an array
            if not isinstance(tool_calls_data, list):
                logger.debug("Tool calls data is not an array")
                return ExtractedToolCallInformation(
                    tools_called=False,
                    tool_calls=[],
                    content=content or model_output,
                )

            tool_calls: list[ToolCall] = []

            for idx, call in enumerate(tool_calls_data):
                if (
                    not isinstance(call, dict)
                    or "name" not in call
                    or "arguments" not in call
                ):
                    logger.debug("Invalid tool call format at index %d", idx)
                    continue

                tool_call = ToolCall(
                    id=f"call_{idx}_{random_uuid()}",
                    type="function",
                    function=FunctionCall(
                        name=call["name"],
                        arguments=(
                            json.dumps(call["arguments"])
                            if isinstance(call["arguments"], dict)
                            else call["arguments"]
                        ),
                    ),
                )
                tool_calls.append(tool_call)

            return ExtractedToolCallInformation(
                tools_called=len(tool_calls) > 0,
                tool_calls=tool_calls,
                content=content,
            )

        except Exception as e:
            logger.exception("Error extracting tool calls: %s", e)
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        """
        Extract tool calls for streaming mode.

        Each call inspects the full ``current_text`` and emits at most one
        delta: a tool name, the opening ``{`` of its arguments, the next
        slice of its arguments, or plain content. Progress is tracked in
        ``self.streaming_state``.

        Returns:
            A ``DeltaMessage`` to forward, or ``None`` when there is nothing
            new to emit yet. On any exception the error is logged and
            ``delta_text`` is returned as plain content.
        """
        # First, check for a definitive start of a tool call block.
        # This prevents premature parsing of incomplete output.
        stripped_text = current_text.strip()
        _, preprocessed_tool_calls = self.preprocess_model_output(current_text)

        # For JSON code blocks, we need to detect them earlier, even if incomplete
        has_potential_json_block = (
            "```json" in current_text
            or "```\n[" in current_text
            or "[TOOL_CALLS]" in current_text
            or "<tool_call>" in current_text
        )

        is_tool_call_block = (
            stripped_text.startswith(("[", "<tool_call>", "[TOOL_CALLS]"))
            or
            # Check if we have thinking tags with JSON-like content following
            ("</think>[" in current_text)
            or
            # Check if the text contains a JSON array after preprocessing
            preprocessed_tool_calls is not None
            or
            # For JSON code blocks, detect early if we see enough structure
            (
                has_potential_json_block
                and '"name"' in current_text
                and '"arguments"' in current_text
            )
        )

        if not is_tool_call_block:
            return DeltaMessage(content=delta_text)

        try:
            # Initialize streaming state if not exists
            if not hasattr(self, "streaming_state"):
                self.streaming_state = {
                    "current_tool_index": -1,
                    "tool_ids": [],
                    "sent_tools": [],  # Track complete state of each tool
                }

            # Try parsing as JSON to check for complete tool calls
            try:
                # Use preprocessed tool calls if available
                tool_calls_text = (
                    preprocessed_tool_calls if preprocessed_tool_calls else current_text
                )
                parsed_tools = json.loads(tool_calls_text)
                if isinstance(parsed_tools, list):
                    # Update our tool array for next time
                    self.prev_tool_call_arr = parsed_tools
            except json.JSONDecodeError:
                # Not complete JSON yet, use regex for partial parsing
                pass

            # Check for test-specific state setup (current_tools_sent)
            # This handles the case where tests manually set current_tools_sent
            if (
                hasattr(self, "current_tools_sent")  # type: ignore
                and len(self.current_tools_sent) > 0
            ):
                # If current_tools_sent is set to [False], it means the test
                # wants us to send the name
                if (
                    len(self.current_tools_sent) == 1
                    and self.current_tools_sent[0] is False
                ):
                    # Extract the function name using regex
                    name_pattern = r'"name"\s*:\s*"([^"]+)"'
                    name_match = re.search(name_pattern, current_text)
                    if name_match:
                        function_name = name_match.group(1)

                        # The test expects us to send just the name first
                        tool_id = make_tool_call_id()
                        delta = DeltaMessage(
                            tool_calls=[
                                DeltaToolCall(
                                    index=0,
                                    type="function",
                                    id=tool_id,
                                    function=DeltaFunctionCall(
                                        name=function_name
                                    ).model_dump(exclude_none=True),  # type: ignore
                                )
                            ]
                        )
                        # Update state to reflect that we've sent the name
                        self.current_tools_sent = [True]
                        self.current_tool_id = 0
                        self.streaming_state["current_tool_index"] = 0
                        if len(self.streaming_state["sent_tools"]) == 0:
                            self.streaming_state["sent_tools"].append(
                                {
                                    "sent_name": True,
                                    "sent_arguments_prefix": False,
                                    "sent_arguments": "",
                                }
                            )
                        else:
                            self.streaming_state["sent_tools"][0]["sent_name"] = True
                        self.current_tool_name_sent = True
                        return delta

            # Use regex to identify tool calls in the output.
            # Use preprocessed tool calls text for better parsing, but also try
            # to extract from incomplete JSON blocks
            search_text = (
                preprocessed_tool_calls if preprocessed_tool_calls else current_text
            )

            # For JSON code blocks that aren't complete yet, try to extract
            # the JSON content
            if not preprocessed_tool_calls and has_potential_json_block:
                # Try to extract the JSON array from within the code block
                json_match = re.search(
                    r"```(?:json)?\s*([\s\S]*?)(?:```|$)", current_text
                )
                if json_match:
                    potential_json = json_match.group(1).strip()
                    # Use this as search text even if it's incomplete
                    if potential_json.startswith("[") and (
                        '"name"' in potential_json and '"arguments"' in potential_json
                    ):
                        search_text = potential_json

            # Try to find complete tool names first
            name_pattern = r'"name"\s*:\s*"([^"]+)"'
            name_matches = list(re.finditer(name_pattern, search_text))
            tool_count = len(name_matches)

            # Without a complete (closing-quoted) tool name there is nothing
            # to emit yet, whether a name is partially streamed or absent.
            if tool_count == 0:
                return None

            # Ensure our state arrays are large enough
            while len(self.streaming_state["sent_tools"]) < tool_count:
                self.streaming_state["sent_tools"].append(
                    {
                        "sent_name": False,
                        "sent_arguments_prefix": False,
                        "sent_arguments": "",
                    }
                )

            while len(self.streaming_state["tool_ids"]) < tool_count:
                self.streaming_state["tool_ids"].append(None)

            # Determine if we need to move to a new tool
            current_idx = self.streaming_state["current_tool_index"]

            # If we haven't processed any tool yet or current tool is complete,
            # move to next
            if current_idx == -1 or current_idx < tool_count - 1:
                next_idx = current_idx + 1

                # If tool at next_idx has not been sent yet
                if (
                    next_idx < tool_count
                    and not self.streaming_state["sent_tools"][next_idx]["sent_name"]
                ):
                    # Update indexes
                    self.streaming_state["current_tool_index"] = next_idx
                    self.current_tool_id = next_idx  # For backward compatibility
                    current_idx = next_idx

                    # Extract the tool name
                    tool_name = name_matches[current_idx].group(1)

                    # Generate ID and send tool name
                    tool_id = f"call_{current_idx}_{random_uuid()}"
                    self.streaming_state["tool_ids"][current_idx] = tool_id

                    delta = DeltaMessage(
                        tool_calls=[
                            DeltaToolCall(
                                index=current_idx,
                                type="function",
                                id=tool_id,
                                function=DeltaFunctionCall(name=tool_name).model_dump(
                                    exclude_none=True
                                ),  # type: ignore
                            )
                        ]
                    )
                    self.streaming_state["sent_tools"][current_idx]["sent_name"] = True
                    self.current_tool_name_sent = True  # For backward compatibility

                    # Keep track of streamed args for backward compatibility
                    while len(self.streamed_args) <= current_idx:
                        self.streamed_args.append("")

                    return delta

            # Process arguments for the current tool
            if current_idx >= 0 and current_idx < tool_count:
                tool_state = self.streaming_state["sent_tools"][current_idx]

                # Support both regular and empty argument objects.
                # First, check for the empty arguments case: "arguments": {}
                empty_args_pattern = (
                    r'"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:\s*\{\s*\}'
                )
                # Both this pattern and `name_pattern` start at the `"name"`
                # key, so an empty-arguments match belongs to the current tool
                # only if it starts where that tool's name match starts.
                # Taking the first match anywhere would emit "{}" for the
                # wrong tool when an earlier tool has empty arguments.
                current_name_start = name_matches[current_idx].start()
                current_tool_has_empty_args = any(
                    match.start() == current_name_start and match.start() > 0
                    for match in re.finditer(empty_args_pattern, search_text)
                )

                if (
                    current_tool_has_empty_args
                    and not tool_state["sent_arguments_prefix"]
                ):
                    # Send empty object
                    tool_state["sent_arguments_prefix"] = True
                    tool_state["sent_arguments"] = "{}"

                    # Update streamed_args for backward compatibility
                    while len(self.streamed_args) <= current_idx:
                        self.streamed_args.append("")
                    self.streamed_args[current_idx] += "{}"

                    delta = DeltaMessage(
                        tool_calls=[
                            DeltaToolCall(
                                index=current_idx,
                                function=DeltaFunctionCall(
                                    arguments="{}"
                                ).model_dump(exclude_none=True),  # type: ignore
                            )
                        ]
                    )

                    # Move to next tool if available
                    if current_idx < tool_count - 1:
                        self.streaming_state["current_tool_index"] += 1
                        self.current_tool_id = self.streaming_state[
                            "current_tool_index"
                        ]

                    return delta

                # Extract arguments for current tool using regex for non-empty
                # arguments (objects nested at most one level deep)
                args_pattern = r'"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:\s*(\{(?:[^{}]|(?:\{[^{}]*\}))*\})'
                args_matches = list(re.finditer(args_pattern, search_text))

                if current_idx < len(args_matches):
                    args_text = args_matches[current_idx].group(1)

                    # For multiple tools, extract only the arguments for the
                    # current tool
                    if tool_count > 1:
                        # Parse the entire JSON structure to properly extract
                        # arguments for each tool
                        try:
                            parsed_tools = json.loads(search_text)
                            if isinstance(parsed_tools, list) and current_idx < len(
                                parsed_tools
                            ):
                                current_tool = parsed_tools[current_idx]
                                if isinstance(current_tool.get("arguments"), dict):
                                    args_text = json.dumps(current_tool["arguments"])
                                else:
                                    args_text = str(current_tool.get("arguments", "{}"))
                        except (json.JSONDecodeError, KeyError, IndexError):
                            # Fallback to regex-based extraction
                            pass

                    # What has already been streamed for this tool
                    sent_args = tool_state["sent_arguments"]

                    # If we haven't sent the opening bracket yet
                    if not tool_state["sent_arguments_prefix"] and args_text.startswith(
                        "{"
                    ):
                        tool_state["sent_arguments_prefix"] = True
                        tool_state["sent_arguments"] = "{"

                        # Update streamed_args for backward compatibility
                        while len(self.streamed_args) <= current_idx:
                            self.streamed_args.append("")
                        self.streamed_args[current_idx] += "{"

                        delta = DeltaMessage(
                            tool_calls=[
                                DeltaToolCall(
                                    index=current_idx,
                                    function=DeltaFunctionCall(
                                        arguments="{"
                                    ).model_dump(exclude_none=True),  # type: ignore
                                )
                            ]
                        )
                        return delta

                    # If we need to send more arguments
                    if args_text.startswith(sent_args):
                        # Calculate what part of arguments we need to send
                        args_diff = args_text[len(sent_args) :]

                        if args_diff:
                            # Update our state
                            tool_state["sent_arguments"] = args_text

                            # Update streamed_args for backward compatibility
                            while len(self.streamed_args) <= current_idx:
                                self.streamed_args.append("")
                            self.streamed_args[current_idx] += args_diff

                            delta = DeltaMessage(
                                tool_calls=[
                                    DeltaToolCall(
                                        index=current_idx,
                                        function=DeltaFunctionCall(
                                            arguments=args_diff
                                        ).model_dump(exclude_none=True),  # type: ignore
                                    )
                                ]
                            )
                            return delta

                    # If the tool's arguments are complete, check if we need to
                    # move to the next tool
                    if args_text.endswith("}") and args_text == sent_args:
                        # This tool is complete, move to the next one in the
                        # next iteration
                        if current_idx < tool_count - 1:
                            self.streaming_state["current_tool_index"] += 1
                            self.current_tool_id = self.streaming_state[
                                "current_tool_index"
                            ]  # For compatibility

            # If we got here, we couldn't determine what to stream next
            return None

        except Exception as e:
            logger.exception("Error in streaming tool calls: %s", e)
            # If we encounter an error, just return the delta text as regular
            # content
            return DeltaMessage(content=delta_text)
def verify_truncate_prompt_tokens(self, model_config: ModelConfig) -> int | None:
    """Validate and normalize the configured ``truncate_prompt_tokens``.

    Returns:
        ``None`` when no truncation is configured, ``0`` when it is zero,
        ``model_config.max_model_len`` when it is negative, otherwise the
        configured value unchanged.

    Raises:
        ValueError: If the normalized value exceeds ``self.max_length``.
    """
    # NOTE: these two local names appear verbatim in the error message via
    # the f-string `=` specifier, so they must not be renamed.
    truncate_prompt_tokens = self.truncate_prompt_tokens
    if truncate_prompt_tokens is None or truncate_prompt_tokens == 0:
        return None if truncate_prompt_tokens is None else 0

    max_length = self.max_length
    if truncate_prompt_tokens < 0:
        # Any negative value means "truncate to the model's context length".
        truncate_prompt_tokens = model_config.max_model_len

    within_limit = max_length is None or not (
        truncate_prompt_tokens > max_length  # type: ignore[operator]
    )
    if within_limit:
        return truncate_prompt_tokens

    raise ValueError(
        f"{truncate_prompt_tokens=} cannot be greater than "
        f"{max_length=}. Please select a smaller truncation size."
    )
when applicable + - Manage prompt truncation and length validation + - Provide clean separation between API layer and engine core + """ + + def __init__( + self, + model_config: ModelConfig, + tokenizer: AnyTokenizer | None = None, + ): + super().__init__() + self.model_config = model_config + self.tokenizer = tokenizer + + @abstractmethod + async def render_prompt( + self, + *, + prompt_or_prompts: str | list[str] | list[int] | list[list[int]], + config: RenderConfig, + ) -> list[EngineTokensPrompt]: + """ + Convert text or token inputs into engine-ready TokensPrompt objects. + + This method accepts text or token inputs and produces a + list of [`TokensPrompt`][vllm.inputs.data.TokensPrompt] objects + for the engine. + + Args: + prompt_or_prompts: One of: + - `str`: Single text prompt. + - `list[str]`: Batch of text prompts. + - `list[int]`: Single pre-tokenized sequence. + - `list[list[int]]`: Batch of pre-tokenized sequences. + config: Render configuration controlling how prompts are prepared + (e.g., tokenization and length handling). + + Returns: + list[EngineTokensPrompt]: Engine-ready token prompts. + + Raises: + ValueError: If input formats are invalid or length limits exceeded. + """ + raise NotImplementedError + + @abstractmethod + async def render_prompt_and_embeds( + self, + *, + prompt_or_prompts: str | list[str] | list[int] | list[list[int]] | None = None, + prompt_embeds: bytes | list[bytes] | None = None, + config: RenderConfig, + ) -> list[EngineTokensPrompt | EngineEmbedsPrompt]: + """ + Convert text/token and/or base64-encoded embeddings inputs into + engine-ready prompt objects using a unified RenderConfig. + + At least one of `prompt_or_prompts` or `prompt_embeds` must be + provided and non-empty. If both are omitted or empty (e.g., empty + string and empty list), a `ValueError` is raised. + + Args: + prompt_or_prompts: Text or token inputs to include. 
def load_prompt_embeds(
    self,
    prompt_embeds: bytes | list[bytes],
    truncate_prompt_tokens: Annotated[int, Field(ge=0)] | None = None,
    cache_salt: str | None = None,
) -> list[EngineEmbedsPrompt]:
    """Decode base64 ``torch.save`` payloads into embeds prompts.

    Args:
        prompt_embeds: One payload or a list of payloads.
        truncate_prompt_tokens: If not ``None``, keep only the trailing rows
            of each tensor via ``tensor[-truncate_prompt_tokens:]``.
        cache_salt: Copied onto every prompt when not ``None``.

    Returns:
        One ``EngineEmbedsPrompt`` per payload, in input order.

    Raises:
        ValueError: If ``model_config.enable_prompt_embeds`` is falsy.
        AssertionError: If a payload is not a float32/bfloat16/float16
            tensor, or is not 2-D after the optional leading-dim squeeze.
    """
    if not self.model_config.enable_prompt_embeds:
        raise ValueError(
            "You must set `--enable-prompt-embeds` to input `prompt_embeds`."
        )

    accepted_dtypes = (torch.float32, torch.bfloat16, torch.float16)

    def _decode_one(payload: bytes) -> EngineEmbedsPrompt:
        raw = pybase64.b64decode(payload, validate=True)
        tensor = torch.load(
            io.BytesIO(raw),
            weights_only=True,
            map_location=torch.device("cpu"),
        )
        assert isinstance(tensor, torch.Tensor) and tensor.dtype in accepted_dtypes
        tensor = tensor.to_dense()
        if tensor.dim() > 2:
            # Only dim 0 is squeezed; anything still above 2-D fails below.
            tensor = tensor.squeeze(0)
        assert tensor.dim() == 2
        if truncate_prompt_tokens is not None:
            # NOTE(review): a value of 0 slices as `tensor[-0:]`, which keeps
            # every row; CompletionRenderer.render_prompt_and_embeds returns
            # before calling this when the value is 0.
            tensor = tensor[-truncate_prompt_tokens:]
        result = EngineEmbedsPrompt(prompt_embeds=tensor)
        if cache_salt is not None:
            result["cache_salt"] = cache_salt
        return result

    batch = prompt_embeds if isinstance(prompt_embeds, list) else [prompt_embeds]
    return [_decode_one(item) for item in batch]
async def render_prompt(
    self,
    *,
    prompt_or_prompts: str | list[str] | list[int] | list[list[int]],
    config: RenderConfig,
) -> list[EngineTokensPrompt]:
    """Render completion-style text/token input into tokens prompts.

    Each parsed prompt is turned into a coroutine via ``_create_prompt`` and
    all of them are awaited together with ``asyncio.gather``. A normalized
    truncation size of ``0`` short-circuits to an empty list.
    """
    keep_last = config.verify_truncate_prompt_tokens(self.model_config)
    if keep_last == 0:
        return []

    pending = [
        self._create_prompt(
            parsed,
            config=config,
            truncate_prompt_tokens=keep_last,
        )
        for parsed in parse_raw_prompts(prompt_or_prompts)
    ]
    return await asyncio.gather(*pending)
+ """ + truncate_prompt_tokens = config.verify_truncate_prompt_tokens(self.model_config) + if truncate_prompt_tokens == 0: + return [] + + rendered: list[EngineTokensPrompt | EngineEmbedsPrompt] = [] + + if prompt_embeds is not None: + rendered.extend( + self.load_prompt_embeds( + prompt_embeds, truncate_prompt_tokens, config.cache_salt + ) + ) + if prompt_or_prompts is None or prompt_or_prompts == "": + return rendered + + token_prompts = await self.render_prompt( + prompt_or_prompts=prompt_or_prompts, + config=config, + ) + rendered.extend(token_prompts) + + return rendered + + def _maybe_apply_truncation( + self, token_ids: list[int], truncate_prompt_tokens: int | None + ) -> list[int]: + """Apply truncation to token sequence.""" + if truncate_prompt_tokens is None: + return token_ids + if truncate_prompt_tokens >= len(token_ids): + return token_ids + + return token_ids[-truncate_prompt_tokens:] + + async def _create_prompt( + self, + prompt_input: EngineTextPrompt | EngineTokensPrompt, + config: RenderConfig, + truncate_prompt_tokens: int | None, + ) -> EngineTokensPrompt: + prompt, prompt_token_ids, _ = get_prompt_components(prompt_input) + + if prompt_token_ids is not None: + # NOTE: detokenization is needed when echo is enabled, + # where the input token IDs are decoded back to text. 
async def _create_prompt_from_token_ids(
    self,
    token_ids: list[int],
    max_length: int | None,
    truncate_prompt_tokens: int | None,
    cache_salt: str | None,
    needs_detokenization: bool | None = False,
) -> EngineTokensPrompt:
    """Build a tokens prompt from pre-tokenized input.

    The ids are truncated first; when ``needs_detokenization`` is truthy the
    truncated ids are also decoded with the async tokenizer and the text is
    attached as the prompt.
    """
    kept_ids = self._maybe_apply_truncation(token_ids, truncate_prompt_tokens)

    if needs_detokenization:
        decoded_text = await self._get_async_tokenizer().decode(kept_ids)
    else:
        decoded_text = None

    return self._create_tokens_prompt(
        token_ids=kept_ids,
        max_length=max_length,
        cache_salt=cache_salt,
        prompt=decoded_text,
    )
def _create_tokens_prompt(
    self,
    token_ids: list[int],
    max_length: int | None = None,
    cache_salt: str | None = None,
    prompt: str | None = None,
) -> EngineTokensPrompt:
    """Length-check ``token_ids`` and wrap them in an ``EngineTokensPrompt``.

    ``cache_salt`` and ``prompt`` are set on the result only when they are
    not ``None``.

    Raises:
        ValueError: If ``max_length`` is given and ``token_ids`` is longer.
    """
    exceeds_limit = max_length is not None and len(token_ids) > max_length
    if exceeds_limit:
        raise ValueError(
            f"This model's maximum context length is {max_length} tokens. "
            f"However, your request has {len(token_ids)} input tokens. "
            "Please reduce the length of the input messages."
        )

    tokens_prompt = EngineTokensPrompt(prompt_token_ids=token_ids)
    optional_fields = (("cache_salt", cache_salt), ("prompt", prompt))
    for field_name, field_value in optional_fields:
        if field_value is not None:
            tokens_prompt[field_name] = field_value
    return tokens_prompt
def convert_tool_responses_to_completions_format(tool: dict) -> dict:
    """Wrap a flat (Responses-style) tool schema in the ChatCompletions shape.

    Input such as
        {"type": "function", "name": "...", "description": "...", "parameters": {...}}
    becomes
        {"type": "function", "function": <tool>}

    The input dict is nested as-is (same object, not a copy), so every key
    it carries, including its own ``"type"``, appears under ``"function"``.
    """
    wrapped: dict = {"type": "function"}
    wrapped["function"] = tool
    return wrapped
zSt?mLiIEzu0n05=3up_tK+r0%isnau_(y=&t&IfeA2KvRb*mx>+5+v57L28!dGtqT zk33Q-;yw=G?%dq$?Ck8!H^bk$-A)8giv3RFb)f!8UaW_%5$o50xQ%2aQz;Z9Go7N+ zbd08|JSNS?xEjsJc=*nwY-xMUo_53>X=lutcEw!gcQz%Y-7$CC6Z4qwT*{l?65En) ziZ!L1W6fz_%xCiX)Yh~=<_FrQ*r#aOevOT_%u_+fUx`VhK?*xhPEL+Z7?Lro2U+YG z9y=Ay%o>V@Z6iunQDq&wiA3Z|T*(@VjEZehj_vtqwX?hKwye(kJsHjjqQdQ0xGsK#vh zbCXn{@ztVa+ls^T-R66OByc|YZYl_E$4x$_E&zLzX zNH6RkgUti+872O5C=gNQY$l-^0g@>RO_2jr87(j&O)KZ5mlX}NSrcF@m`Hr+WUs)d zC&NZM3unk4FeGhSF%Ddr9XxXQT)cRT?mtV^%o1XI0F|IK&Q# zjyVKY{YQWYt5^zMLW|UmE=$88d1FjH;CZ6O#0}fRkuPsXhMqt*Lrvg-(ya$b9hL(b zHDJsr7B^tbXO+hC@KmenfL>R!O58AaRRWnQ&`DuiTG0uNfO=azqZ$N)NIeMp zF-v*^>?O90n5LLLfkx`FMvxV|o*Nz!qml6ok%>vnK`Sw*)Lk} zI&P}@P;4x<)mWGkF=b#UvDq|O$4pwE#tc}elO7`Qxkn>dfL)tVb|LgX;Q1zbay$BU zC@m(eS2}rzf387O)N?!^n9fs*Gr;CnmnocQVJk4H@K3*9_TYElP^ZLr(uA;XK&gAf~H9xi1`Ee276$Tdkb3*`Ee7w<&wswL?i@yDj5#?^fTUsj&%%2kZ!=J6QVDcwA z`A_jG6&awe`9^H$HZ{VrODs@7bOP@uZ4sCKmjY1tg{DX|eSf=22dPL8e}A7zA9^+t z=6~Hc+zdhwI0{xiu=yf(=0STf!Y~gR4(Nw=qU>R_FB0S)?qLYsO%XcC5<1Mo7;J(L zAp^3Y-3-eMIbpQOB(#0@Ye&5vNr~(b3_v;zT5RF%lIc(dhXo?x=6nh+k_v+)B_) znVLSn##}|Koa61z^JLjA3M}$iBadr85vrBtl4GkP<5CIISQG z)jf?Jby7PHdNk58VB3@gve|^j5|$ZNX5k<~hMQ_10r{#3uA2~EjoV$M6%K7`g+m$m z7c3=0pbbJryA0S)s*G6?%y!$1R~&fKx~!HlGVx5x3}G`M-bC6-qQS&s`%yUTY)Uz% zjRTLg?D`-KkmnS&&Lc|r1M2uQI`wMXw_mJ`xZ$RN?!n<~xZE I1e2-wU*qTf&;S4c literal 0 HcmV?d00001 diff --git a/entrypoints/sagemaker/routes.py b/entrypoints/sagemaker/routes.py new file mode 100644 index 0000000..498b729 --- /dev/null +++ b/entrypoints/sagemaker/routes.py @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import json +from http import HTTPStatus + +import model_hosting_container_standards.sagemaker as sagemaker_standards +import pydantic +from fastapi import APIRouter, Depends, HTTPException, Request +from fastapi.responses import JSONResponse, Response + +from vllm.entrypoints.openai.api_server import ( + INVOCATION_VALIDATORS, + base, + health, + 
validate_json_request, +) +from vllm.entrypoints.openai.protocol import ErrorResponse + + +def register_sagemaker_routes(router: APIRouter): + @router.post("/ping", response_class=Response) + @router.get("/ping", response_class=Response) + @sagemaker_standards.register_ping_handler + async def ping(raw_request: Request) -> Response: + """Ping check. Endpoint required for SageMaker""" + return await health(raw_request) + + @router.post( + "/invocations", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.UNSUPPORTED_MEDIA_TYPE.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, + ) + @sagemaker_standards.register_invocation_handler + @sagemaker_standards.stateful_session_manager() + @sagemaker_standards.inject_adapter_id(adapter_path="model") + async def invocations(raw_request: Request): + """For SageMaker, routes requests based on the request type.""" + try: + body = await raw_request.json() + except json.JSONDecodeError as e: + raise HTTPException( + status_code=HTTPStatus.BAD_REQUEST.value, + detail=f"JSON decode error: {e}", + ) from e + + valid_endpoints = [ + (validator, endpoint) + for validator, (get_handler, endpoint) in INVOCATION_VALIDATORS + if get_handler(raw_request) is not None + ] + + for request_validator, endpoint in valid_endpoints: + try: + request = request_validator.validate_python(body) + except pydantic.ValidationError: + continue + + return await endpoint(request, raw_request) + + type_names = [ + t.__name__ if isinstance(t := validator._type, type) else str(t) + for validator, _ in valid_endpoints + ] + msg = f"Cannot find suitable handler for request. 
Expected one of: {type_names}" + res = base(raw_request).create_error_response(message=msg) + return JSONResponse(content=res.model_dump(), status_code=res.error.code) + + return router diff --git a/entrypoints/score_utils.py b/entrypoints/score_utils.py new file mode 100644 index 0000000..309a4c9 --- /dev/null +++ b/entrypoints/score_utils.py @@ -0,0 +1,242 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any, TypeAlias, cast + +from torch.nn import CosineSimilarity +from typing_extensions import Required, TypedDict + +from vllm.config import ModelConfig +from vllm.entrypoints.chat_utils import ( + BaseMultiModalItemTracker, + ChatCompletionContentPartImageEmbedsParam, + ChatCompletionContentPartImageParam, + ChatCompletionContentPartTextParam, + MultiModalItemTracker, + _ContentPart, + _parse_chat_message_content_part, +) +from vllm.inputs import TokensPrompt +from vllm.model_executor.models.interfaces import supports_score_template +from vllm.multimodal.inputs import MultiModalDataDict +from vllm.outputs import PoolingRequestOutput +from vllm.transformers_utils.tokenizer import ( + AnyTokenizer, + PreTrainedTokenizer, + PreTrainedTokenizerFast, +) + +ScoreContentPartParam: TypeAlias = ( + ChatCompletionContentPartImageParam | ChatCompletionContentPartImageEmbedsParam +) + + +class ScoreMultiModalParam(TypedDict, total=False): + """ + A specialized parameter type for scoring multimodal content + + The reasons why don't reuse `CustomChatCompletionMessageParam` directly: + 1. Score tasks don't need the 'role' field (user/assistant/system) that's required in chat completions + 2. Including chat-specific fields would confuse users about their purpose in scoring + 3. 
This is a more focused interface that only exposes what's needed for scoring + """ # noqa: E501 + + content: Required[list[ScoreContentPartParam]] + """The multimodal contents""" + + +def _cosine_similarity( + tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast, + embed_1: list[PoolingRequestOutput], + embed_2: list[PoolingRequestOutput], +) -> list[PoolingRequestOutput]: + scorer = CosineSimilarity(0) + scores: list[PoolingRequestOutput] = [] + + for emb_1, emb_2 in zip(embed_1, embed_2): + pair_score = scorer(emb_1.outputs.data, emb_2.outputs.data) + + padding = [] + if (pad_token_id := getattr(tokenizer, "pad_token_id", None)) is not None: + padding = [pad_token_id] + + tokens = emb_1.prompt_token_ids + padding + emb_2.prompt_token_ids + + scores.append( + PoolingRequestOutput( + request_id=f"{emb_1.request_id}_{emb_2.request_id}", + outputs=pair_score, + prompt_token_ids=tokens, + num_cached_tokens=emb_1.num_cached_tokens + emb_2.num_cached_tokens, + finished=True, + ) + ) + + return scores + + +def _validate_score_input_lens( + data_1: list[str] | list[ScoreContentPartParam], + data_2: list[str] | list[ScoreContentPartParam], +): + len_1 = len(data_1) + len_2 = len(data_2) + + if len_1 > 1 and len_1 != len_2: + raise ValueError("Input lengths must be either 1:1, 1:N or N:N") + if len_1 == 0: + raise ValueError("At least one text element must be given") + if len_2 == 0: + raise ValueError("At least one text_pair element must be given") + + +def parse_score_data( + data_1: str | ScoreContentPartParam, + data_2: str | ScoreContentPartParam, + model_config: ModelConfig, + tokenizer: AnyTokenizer, +) -> tuple[str, str, MultiModalDataDict | None]: + mm_tracker = MultiModalItemTracker(model_config, tokenizer) + + content_1 = _parse_score_content(data_1, mm_tracker) + + content_2 = _parse_score_content(data_2, mm_tracker) + + def ensure_str(content: _ContentPart | None) -> str: + if content is not None and isinstance(content, str): + return cast(str, content) + 
else: + raise ValueError(f"Only string content is supported, but got {content}.") + + prompt_1 = ensure_str(content_1) + prompt_2 = ensure_str(content_2) + + return prompt_1, prompt_2, mm_tracker.all_mm_data() + + +def _parse_score_content( + data: str | ScoreContentPartParam, + mm_tracker: BaseMultiModalItemTracker, +) -> _ContentPart | None: + if isinstance(data, str): + data = ChatCompletionContentPartTextParam(type="text", text=data) + + mm_parser = mm_tracker.create_parser() + + parse_res = _parse_chat_message_content_part( + data, + mm_parser, + wrap_dicts=False, + interleave_strings=False, + ) + + if parse_res: + return parse_res + + mm_placeholder_storage = mm_parser.mm_placeholder_storage() + + if ( + len(mm_placeholder_storage) != 1 + or len(next(iter(mm_placeholder_storage.values()))) != 1 + ): + raise ValueError("Only one multi-modal item is supported") + + return next(iter(mm_placeholder_storage.values()))[0] + + +def apply_score_template( + model_config: ModelConfig, + prompt_1: str, + prompt_2: str, +) -> str: + # NOTE(Simon): lazy import to avoid bring in all dependencies (e.g. gguf) + from vllm.model_executor.model_loader import get_model_cls + + model = get_model_cls(model_config) + if supports_score_template(model): + full_prompt = model.get_score_template(prompt_1, prompt_2) + if full_prompt is None: + raise ValueError("Get empty score template from model") + return full_prompt + + raise ValueError(f"Unsupported model architecture: {model_config.architecture}") + + +def post_process_tokens( + model_config: ModelConfig, + prompt: TokensPrompt, +) -> None: + """ + Perform architecture-specific manipulations on the input tokens. + + Note: + This is an in-place operation. + """ + # NOTE(Simon): lazy import to avoid bring in all dependencies (e.g. 
gguf) + from vllm.model_executor.model_loader import get_model_cls + + model = get_model_cls(model_config) + if supports_score_template(model): + model.post_process_tokens(prompt) + + +def get_score_prompt( + model_config: ModelConfig, + tokenizer: AnyTokenizer, + tokenization_kwargs: dict[str, Any], + data_1: str | ScoreContentPartParam, + data_2: str | ScoreContentPartParam, +) -> tuple[str, TokensPrompt]: + prompt_1, prompt_2, mm_data = parse_score_data( + data_1, + data_2, + model_config, + tokenizer, + ) + from vllm.model_executor.model_loader import get_model_cls + + model = get_model_cls(model_config) + if supports_score_template(model): + full_prompt = apply_score_template(model_config, prompt_1, prompt_2) + prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs) + elif model_config.use_pad_token: + # cross_encoder models defaults to using pad_token. + prompt_inputs = tokenizer( + text=prompt_1, text_pair=prompt_2, **tokenization_kwargs + ) + full_prompt = tokenizer.decode(prompt_inputs["input_ids"]) + else: + # `llm as reranker` models defaults to not using pad_token. + full_prompt = prompt_1 + prompt_2 + prompt_inputs = tokenizer(text=full_prompt, **tokenization_kwargs) + + engine_prompt = TokensPrompt(prompt_token_ids=prompt_inputs["input_ids"]) + + if (token_type_ids := prompt_inputs.get("token_type_ids")) is not None: + engine_prompt["token_type_ids"] = token_type_ids + + post_process_tokens(model_config, engine_prompt) + + if mm_data is not None: + engine_prompt["multi_modal_data"] = mm_data + return full_prompt, engine_prompt + + +def compress_token_type_ids(token_type_ids: list[int]) -> int: + """ + Return position of the first 1 or the length of the list + if not found. 
+ """ + first_one = len(token_type_ids) + err_msg = ( + "Token type ids are expected to be a sequence" + " of zeros followed by a sequence of ones" + ) + for i, type_id in enumerate(token_type_ids): + if type_id == 0 and first_one < i: + raise ValueError(err_msg) + elif type_id == 1 and first_one > i: + first_one = i + elif type_id > 1: + raise ValueError(err_msg) + + return first_one diff --git a/entrypoints/ssl.py b/entrypoints/ssl.py new file mode 100644 index 0000000..4d947bc --- /dev/null +++ b/entrypoints/ssl.py @@ -0,0 +1,78 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import asyncio +from collections.abc import Callable +from ssl import SSLContext + +from watchfiles import Change, awatch + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class SSLCertRefresher: + """A class that monitors SSL certificate files and + reloads them when they change. + """ + + def __init__( + self, + ssl_context: SSLContext, + key_path: str | None = None, + cert_path: str | None = None, + ca_path: str | None = None, + ) -> None: + self.ssl = ssl_context + self.key_path = key_path + self.cert_path = cert_path + self.ca_path = ca_path + + # Setup certification chain watcher + def update_ssl_cert_chain(change: Change, file_path: str) -> None: + logger.info("Reloading SSL certificate chain") + assert self.key_path and self.cert_path + self.ssl.load_cert_chain(self.cert_path, self.key_path) + + self.watch_ssl_cert_task = None + if self.key_path and self.cert_path: + self.watch_ssl_cert_task = asyncio.create_task( + self._watch_files( + [self.key_path, self.cert_path], update_ssl_cert_chain + ) + ) + + # Setup CA files watcher + def update_ssl_ca(change: Change, file_path: str) -> None: + logger.info("Reloading SSL CA certificates") + assert self.ca_path + self.ssl.load_verify_locations(self.ca_path) + + self.watch_ssl_ca_task = None + if self.ca_path: + self.watch_ssl_ca_task = 
asyncio.create_task( + self._watch_files([self.ca_path], update_ssl_ca) + ) + + async def _watch_files(self, paths, fun: Callable[[Change, str], None]) -> None: + """Watch multiple file paths asynchronously.""" + logger.info("SSLCertRefresher monitors files: %s", paths) + async for changes in awatch(*paths): + try: + for change, file_path in changes: + logger.info("File change detected: %s - %s", change.name, file_path) + fun(change, file_path) + except Exception as e: + logger.error( + "SSLCertRefresher failed taking action on file change. Error: %s", e + ) + + def stop(self) -> None: + """Stop watching files.""" + if self.watch_ssl_cert_task: + self.watch_ssl_cert_task.cancel() + self.watch_ssl_cert_task = None + if self.watch_ssl_ca_task: + self.watch_ssl_ca_task.cancel() + self.watch_ssl_ca_task = None diff --git a/entrypoints/tool.py b/entrypoints/tool.py new file mode 100644 index 0000000..c74ce1e --- /dev/null +++ b/entrypoints/tool.py @@ -0,0 +1,143 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any + +from openai_harmony import Author, Message, Role, TextContent + +from vllm.logger import init_logger + +if TYPE_CHECKING: + # Avoid circular import. + from vllm.entrypoints.context import ConversationContext + +logger = init_logger(__name__) + +MIN_GPT_OSS_VERSION = "0.0.7" + + +def validate_gpt_oss_install(): + """ + Check if the gpt-oss is installed and its version is at least 0.0.7. + If not, raise an ImportError. 
+ """ + from importlib.metadata import PackageNotFoundError, version + + from packaging.version import InvalidVersion, Version + + try: + pkg_version_str = version("gpt_oss") + pkg_version = Version(pkg_version_str) + except PackageNotFoundError: + raise ImportError("Package 'gpt_oss' is not installed.") from None + except InvalidVersion as e: + raise ImportError(f"Invalid version string for 'gpt_oss': {e}") from None + + if pkg_version < Version(MIN_GPT_OSS_VERSION): + raise ImportError( + f"gpt_oss >= {MIN_GPT_OSS_VERSION} is required, " + f"but {pkg_version} is installed." + ) from None + + +class Tool(ABC): + @abstractmethod + async def get_result(self, context: "ConversationContext") -> Any: + pass + + +class HarmonyBrowserTool(Tool): + def __init__(self): + self.enabled = True + exa_api_key = os.getenv("EXA_API_KEY") + if not exa_api_key: + self.enabled = False + logger.warning_once("EXA_API_KEY is not set, browsing is disabled") + return + + try: + validate_gpt_oss_install() + from gpt_oss.tools.simple_browser import SimpleBrowserTool + from gpt_oss.tools.simple_browser.backend import ExaBackend + except ImportError as e: + self.enabled = False + logger.warning_once( + "gpt_oss is not installed properly (%s), browsing is disabled", e + ) + return + + browser_backend = ExaBackend(source="web", api_key=exa_api_key) + self.browser_tool = SimpleBrowserTool(backend=browser_backend) + logger.info_once("Browser tool initialized") + + async def get_result(self, context: "ConversationContext") -> Any: + from vllm.entrypoints.context import HarmonyContext + + assert isinstance(context, HarmonyContext) + last_msg = context.messages[-1] + tool_output_msgs = [] + async for msg in self.browser_tool.process(last_msg): + tool_output_msgs.append(msg) + return tool_output_msgs + + @property + def tool_config(self) -> Any: + return self.browser_tool.tool_config + + +class HarmonyPythonTool(Tool): + def __init__(self): + self.enabled = True + + try: + validate_gpt_oss_install() 
+ from gpt_oss.tools.python_docker.docker_tool import PythonTool + except ImportError as e: + self.enabled = False + logger.warning_once( + "gpt_oss is not installed properly (%s), code interpreter is disabled", + e, + ) + return + + self.python_tool = PythonTool() + + async def validate(self): + if not self.enabled: + return + try: + message = Message( + author=Author(role=Role.ASSISTANT), + content=[TextContent(text="print('Hello, world!')")], + channel="analysis", + recipient="python", + content_type="code", + ) + msgs = [] + async for msg in self.python_tool.process(message): + msgs.append(msg) + assert msgs[0].content[0].text == "Hello, world!\n" + except Exception as e: + self.enabled = False + logger.warning_once( + "Code interpreter tool failed to initialize (%s), code " + "interpreter is disabled", + e, + ) + return + logger.info_once("Code interpreter tool initialized") + + async def get_result(self, context: "ConversationContext") -> Any: + from vllm.entrypoints.context import HarmonyContext + + assert isinstance(context, HarmonyContext) + last_msg = context.messages[-1] + tool_output_msgs = [] + async for msg in self.python_tool.process(last_msg): + tool_output_msgs.append(msg) + return tool_output_msgs + + @property + def tool_config(self) -> Any: + return self.python_tool.tool_config diff --git a/entrypoints/tool_server.py b/entrypoints/tool_server.py new file mode 100644 index 0000000..0d83031 --- /dev/null +++ b/entrypoints/tool_server.py @@ -0,0 +1,209 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod +from contextlib import AbstractAsyncContextManager, asynccontextmanager +from typing import TYPE_CHECKING, Any + +from openai_harmony import ToolDescription, ToolNamespaceConfig + +from vllm.entrypoints.tool import HarmonyBrowserTool, HarmonyPythonTool, Tool +from vllm.logger import init_logger + +logger = init_logger(__name__) + +if TYPE_CHECKING: + 
from mcp.types import ListToolsResult + + +async def list_server_and_tools(server_url: str): + from mcp import ClientSession + from mcp.client.sse import sse_client + + async with ( + sse_client(url=server_url) as streams, + ClientSession(*streams) as session, + ): + initialize_response = await session.initialize() + list_tools_response = await session.list_tools() + return initialize_response, list_tools_response + + +def trim_schema(schema: dict) -> dict: + # Turn JSON Schema from MCP generated into Harmony's variant. + if "title" in schema: + del schema["title"] + if "default" in schema and schema["default"] is None: + del schema["default"] + if "anyOf" in schema: + # Turn "anyOf": [{"type": "type-1"}, {"type": "type-2"}] + # into "type": ["type-1", "type-2"] + # if there's more than 1 types, also remove "null" type as Harmony will + # just ignore it + types = [ + type_dict["type"] + for type_dict in schema["anyOf"] + if type_dict["type"] != "null" + ] + schema["type"] = types + del schema["anyOf"] + if "properties" in schema: + schema["properties"] = { + k: trim_schema(v) for k, v in schema["properties"].items() + } + return schema + + +def post_process_tools_description( + list_tools_result: "ListToolsResult", +) -> "ListToolsResult": + # Adapt the MCP tool result for Harmony + for tool in list_tools_result.tools: + tool.inputSchema = trim_schema(tool.inputSchema) + + # Some tools schema don't need to be part of the prompt (e.g. simple text + # in text out for Python) + list_tools_result.tools = [ + tool + for tool in list_tools_result.tools + if getattr(tool.annotations, "include_in_prompt", True) + ] + + return list_tools_result + + +class ToolServer(ABC): + @abstractmethod + def has_tool(self, tool_name: str) -> bool: + """ + Return True if the tool is supported, False otherwise. + """ + pass + + @abstractmethod + def get_tool_description(self, tool_name: str) -> ToolNamespaceConfig | None: + """ + Return the tool description for the given tool name. 
+ If the tool is not supported, return None. + """ + pass + + @abstractmethod + def new_session( + self, tool_name: str, session_id: str, headers: dict[str, str] | None = None + ) -> AbstractAsyncContextManager[Any]: + """ + Create a session for the tool. + """ + ... + + +class MCPToolServer(ToolServer): + def __init__(self): + try: + import mcp # noqa: F401 + except ImportError: + raise ImportError( + "mcp is not installed. Please run `pip install mcp` to use " + "MCPToolServer." + ) from None + self.harmony_tool_descriptions = {} + + async def add_tool_server(self, server_url: str): + tool_urls = server_url.split(",") + self.harmony_tool_descriptions = {} + self.urls: dict[str, str] = {} + for url in tool_urls: + url = f"http://{url}/sse" + initialize_response, list_tools_response = await list_server_and_tools(url) + + list_tools_response = post_process_tools_description(list_tools_response) + + tool_from_mcp = ToolNamespaceConfig( + name=initialize_response.serverInfo.name, + description=initialize_response.instructions, + tools=[ + ToolDescription.new( + name=tool.name, + description=tool.description, + parameters=tool.inputSchema, + ) + for tool in list_tools_response.tools + ], + ) + self.harmony_tool_descriptions[tool_from_mcp.name] = tool_from_mcp + if tool_from_mcp.name not in self.urls: + self.urls[tool_from_mcp.name] = url + else: + logger.warning( + "Tool %s already exists. 
Ignoring duplicate tool server %s", + tool_from_mcp.name, + url, + ) + logger.info( + "MCPToolServer initialized with tools: %s", + list(self.harmony_tool_descriptions.keys()), + ) + + def has_tool(self, tool_name: str): + return tool_name in self.harmony_tool_descriptions + + def get_tool_description(self, tool_name: str): + return self.harmony_tool_descriptions.get(tool_name) + + @asynccontextmanager + async def new_session( + self, tool_name: str, session_id: str, headers: dict[str, str] | None = None + ): + from mcp import ClientSession + from mcp.client.sse import sse_client + + url = self.urls.get(tool_name) + request_headers = {"x-session-id": session_id} + if headers is not None: + request_headers.update(headers) + if not url: + raise KeyError(f"Tool '{tool_name}' is not supported") + async with ( + sse_client(url=url, headers=request_headers) as streams, + ClientSession(*streams) as session, + ): + await session.initialize() + yield session + + +class DemoToolServer(ToolServer): + def __init__(self): + self.tools: dict[str, Tool] = {} + + async def init_and_validate(self): + browser_tool = HarmonyBrowserTool() + python_tool = HarmonyPythonTool() + await python_tool.validate() + if browser_tool.enabled: + self.tools["browser"] = browser_tool + if python_tool.enabled: + self.tools["python"] = python_tool + logger.info( + "DemoToolServer initialized with tools: %s", list(self.tools.keys()) + ) + + def has_tool(self, tool_name: str) -> bool: + return tool_name in self.tools + + def get_tool_description(self, tool_name: str) -> ToolNamespaceConfig | None: + if tool_name not in self.tools: + return None + if tool_name == "browser": + return ToolNamespaceConfig.browser() + elif tool_name == "python": + return ToolNamespaceConfig.python() + else: + raise ValueError(f"Unknown tool {tool_name}") + + @asynccontextmanager + async def new_session( + self, tool_name: str, session_id: str, headers: dict[str, str] | None = None + ): + if tool_name not in self.tools: + 
raise KeyError(f"Tool '{tool_name}' is not supported") + yield self.tools[tool_name] diff --git a/entrypoints/utils.py b/entrypoints/utils.py new file mode 100644 index 0000000..088bb67 --- /dev/null +++ b/entrypoints/utils.py @@ -0,0 +1,319 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import asyncio +import dataclasses +import functools +import os +from argparse import Namespace +from pathlib import Path +from typing import Any + +from fastapi import Request +from fastapi.responses import JSONResponse, StreamingResponse +from starlette.background import BackgroundTask, BackgroundTasks + +from vllm.config import ModelConfig +from vllm.engine.arg_utils import EngineArgs +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.chat_utils import ( + load_chat_template, + resolve_hf_chat_template, + resolve_mistral_chat_template, +) +from vllm.entrypoints.openai.cli_args import make_arg_parser +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + CompletionRequest, + StreamOptions, +) +from vllm.entrypoints.openai.serving_models import LoRAModulePath +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.transformers_utils.tokenizers import MistralTokenizer +from vllm.utils.argparse_utils import FlexibleArgumentParser + +logger = init_logger(__name__) + +VLLM_SUBCMD_PARSER_EPILOG = ( + "For full list: vllm {subcmd} --help=all\n" + "For a section: vllm {subcmd} --help=ModelConfig (case-insensitive)\n" # noqa: E501 + "For a flag: vllm {subcmd} --help=max-model-len (_ or - accepted)\n" # noqa: E501 + "Documentation: https://docs.vllm.ai\n" +) + + +async def listen_for_disconnect(request: Request) -> None: + """Returns if a disconnect message is received""" + while True: + message = await request.receive() + if message["type"] == "http.disconnect": + # If load tracking is enabled *and* the counter exists, decrement + # it. 
Combines the previous nested checks into a single condition + # to satisfy the linter rule. + if getattr( + request.app.state, "enable_server_load_tracking", False + ) and hasattr(request.app.state, "server_load_metrics"): + request.app.state.server_load_metrics -= 1 + break + + +def with_cancellation(handler_func): + """Decorator that allows a route handler to be cancelled by client + disconnections. + + This does _not_ use request.is_disconnected, which does not work with + middleware. Instead this follows the pattern from + starlette.StreamingResponse, which simultaneously awaits on two tasks- one + to wait for an http disconnect message, and the other to do the work that we + want done. When the first task finishes, the other is cancelled. + + A core assumption of this method is that the body of the request has already + been read. This is a safe assumption to make for fastapi handlers that have + already parsed the body of the request into a pydantic model for us. + This decorator is unsafe to use elsewhere, as it will consume and throw away + all incoming messages for the request while it looks for a disconnect + message. + + In the case where a `StreamingResponse` is returned by the handler, this + wrapper will stop listening for disconnects and instead the response object + will start listening for disconnects. + """ + + # Functools.wraps is required for this wrapper to appear to fastapi as a + # normal route handler, with the correct request type hinting. 
+ @functools.wraps(handler_func) + async def wrapper(*args, **kwargs): + # The request is either the second positional arg or `raw_request` + request = args[1] if len(args) > 1 else kwargs["raw_request"] + + handler_task = asyncio.create_task(handler_func(*args, **kwargs)) + cancellation_task = asyncio.create_task(listen_for_disconnect(request)) + + done, pending = await asyncio.wait( + [handler_task, cancellation_task], return_when=asyncio.FIRST_COMPLETED + ) + for task in pending: + task.cancel() + + if handler_task in done: + return handler_task.result() + return None + + return wrapper + + +def decrement_server_load(request: Request): + request.app.state.server_load_metrics -= 1 + + +def load_aware_call(func): + @functools.wraps(func) + async def wrapper(*args, **kwargs): + raw_request = kwargs.get("raw_request", args[1] if len(args) > 1 else None) + + if raw_request is None: + raise ValueError( + "raw_request required when server load tracking is enabled" + ) + + if not getattr(raw_request.app.state, "enable_server_load_tracking", False): + return await func(*args, **kwargs) + + # ensure the counter exists + if not hasattr(raw_request.app.state, "server_load_metrics"): + raw_request.app.state.server_load_metrics = 0 + + raw_request.app.state.server_load_metrics += 1 + try: + response = await func(*args, **kwargs) + except Exception: + raw_request.app.state.server_load_metrics -= 1 + raise + + if isinstance(response, (JSONResponse, StreamingResponse)): + if response.background is None: + response.background = BackgroundTask(decrement_server_load, raw_request) + elif isinstance(response.background, BackgroundTasks): + response.background.add_task(decrement_server_load, raw_request) + elif isinstance(response.background, BackgroundTask): + # Convert the single BackgroundTask to BackgroundTasks + # and chain the decrement_server_load task to it + tasks = BackgroundTasks() + tasks.add_task( + response.background.func, + *response.background.args, + 
**response.background.kwargs, + ) + tasks.add_task(decrement_server_load, raw_request) + response.background = tasks + else: + raw_request.app.state.server_load_metrics -= 1 + + return response + + return wrapper + + +def cli_env_setup(): + # The safest multiprocessing method is `spawn`, as the default `fork` method + # is not compatible with some accelerators. The default method will be + # changing in future versions of Python, so we should use it explicitly when + # possible. + # + # We only set it here in the CLI entrypoint, because changing to `spawn` + # could break some existing code using vLLM as a library. `spawn` will cause + # unexpected behavior if the code is not protected by + # `if __name__ == "__main__":`. + # + # References: + # - https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods + # - https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing + # - https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors + # - https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html?highlight=multiprocessing#torch-multiprocessing-for-dataloaders + if "VLLM_WORKER_MULTIPROC_METHOD" not in os.environ: + logger.debug("Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'") + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + + +def _validate_truncation_size( + max_model_len: int, + truncate_prompt_tokens: int | None, + tokenization_kwargs: dict[str, Any] | None = None, +) -> int | None: + if truncate_prompt_tokens is not None: + if truncate_prompt_tokens <= -1: + truncate_prompt_tokens = max_model_len + + if truncate_prompt_tokens > max_model_len: + raise ValueError( + f"truncate_prompt_tokens value ({truncate_prompt_tokens}) " + f"is greater than max_model_len ({max_model_len})." + f" Please, select a smaller truncation size." 
+ ) + + if tokenization_kwargs is not None: + tokenization_kwargs["truncation"] = True + tokenization_kwargs["max_length"] = truncate_prompt_tokens + + else: + if tokenization_kwargs is not None: + tokenization_kwargs["truncation"] = False + + return truncate_prompt_tokens + + +def get_max_tokens( + max_model_len: int, + request: ChatCompletionRequest | CompletionRequest, + input_length: int, + default_sampling_params: dict, +) -> int: + max_tokens = getattr(request, "max_completion_tokens", None) or request.max_tokens + default_max_tokens = max_model_len - input_length + max_output_tokens = current_platform.get_max_output_tokens(input_length) + + return min( + val + for val in ( + default_max_tokens, + max_tokens, + max_output_tokens, + default_sampling_params.get("max_tokens"), + ) + if val is not None + ) + + +def log_non_default_args(args: Namespace | EngineArgs): + non_default_args = {} + + # Handle Namespace + if isinstance(args, Namespace): + parser = make_arg_parser(FlexibleArgumentParser()) + for arg, default in vars(parser.parse_args([])).items(): + if default != getattr(args, arg): + non_default_args[arg] = getattr(args, arg) + + # Handle EngineArgs instance + elif isinstance(args, EngineArgs): + default_args = EngineArgs(model=args.model) # Create default instance + for field in dataclasses.fields(args): + current_val = getattr(args, field.name) + default_val = getattr(default_args, field.name) + if current_val != default_val: + non_default_args[field.name] = current_val + if default_args.model != EngineArgs.model: + non_default_args["model"] = default_args.model + else: + raise TypeError( + "Unsupported argument type. Must be Namespace or EngineArgs instance." 
+ ) + + logger.info("non-default args: %s", non_default_args) + + +def should_include_usage( + stream_options: StreamOptions | None, enable_force_include_usage: bool +) -> tuple[bool, bool]: + if stream_options: + include_usage = stream_options.include_usage or enable_force_include_usage + include_continuous_usage = include_usage and bool( + stream_options.continuous_usage_stats + ) + else: + include_usage, include_continuous_usage = enable_force_include_usage, False + return include_usage, include_continuous_usage + + +def process_lora_modules( + args_lora_modules: list[LoRAModulePath], default_mm_loras: dict[str, str] | None +) -> list[LoRAModulePath]: + lora_modules = args_lora_modules + if default_mm_loras: + default_mm_lora_paths = [ + LoRAModulePath( + name=modality, + path=lora_path, + ) + for modality, lora_path in default_mm_loras.items() + ] + if args_lora_modules is None: + lora_modules = default_mm_lora_paths + else: + lora_modules += default_mm_lora_paths + return lora_modules + + +async def process_chat_template( + args_chat_template: Path | str | None, + engine_client: EngineClient, + model_config: ModelConfig, +) -> str | None: + resolved_chat_template = load_chat_template(args_chat_template) + if resolved_chat_template is not None: + # Get the tokenizer to check official template + tokenizer = await engine_client.get_tokenizer() + + if isinstance(tokenizer, MistralTokenizer): + # The warning is logged in resolve_mistral_chat_template. + resolved_chat_template = resolve_mistral_chat_template( + chat_template=resolved_chat_template + ) + else: + hf_chat_template = resolve_hf_chat_template( + tokenizer=tokenizer, + chat_template=None, + tools=None, + model_config=model_config, + ) + + if hf_chat_template != resolved_chat_template: + logger.warning( + "Using supplied chat template: %s\n" + "It is different from official chat template '%s'. 
" + "This discrepancy may lead to performance degradation.", + resolved_chat_template, + model_config.model, + ) + return resolved_chat_template diff --git a/env_override.py b/env_override.py new file mode 100644 index 0000000..14dae28 --- /dev/null +++ b/env_override.py @@ -0,0 +1,378 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os + +import torch + +from vllm.logger import init_logger +from vllm.utils.torch_utils import is_torch_equal + +logger = init_logger(__name__) + +# set some common config/environment variables that should be set +# for all processes created by vllm and all processes +# that interact with vllm workers. +# they are executed whenever `import vllm` is called. + +# see https://github.com/vllm-project/vllm/pull/15951 +# it avoids unintentional cuda initialization from torch.cuda.is_available() +os.environ["PYTORCH_NVML_BASED_CUDA_CHECK"] = "1" + +# see https://github.com/vllm-project/vllm/issues/10480 +os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" +# see https://github.com/vllm-project/vllm/issues/10619 +torch._inductor.config.compile_threads = 1 + +# =================================================== +# torch 2.9 Inductor PythonWrapperCodegen monkeypatch +# =================================================== +# This change monkeypatches memory_plan_reuse in pytorch 2.9.0 to work around +# a test failure for test_multi_graph_piecewise_compile_outputs_equal. +# For more context, see https://github.com/pytorch/pytorch/pull/165514. 
+ + +def memory_plan_reuse_patched(self): + import torch._inductor.ir as ir + from torch._inductor.codegen.wrapper import ( + EnterSubgraphLine, + ExitSubgraphLine, + MemoryPlanningLine, + MemoryPlanningState, + SubgraphPythonWrapperCodegen, + ) + from torch._inductor.virtualized import V + + def get_output_names(graph_outputs) -> list[str]: + import itertools + + names = [] + shape_counter = itertools.count(0) + none_counter = itertools.count(0) + for node in graph_outputs: + if isinstance(node, ir.NoneAsConstantBuffer): + names.append(f"{V.graph.name}_none{next(none_counter)}") + elif isinstance(node, ir.ShapeAsConstantBuffer): + names.append(f"{V.graph.name}_shape{next(shape_counter)}") + else: + names.append(node.get_name()) + return names + + if ( + isinstance(V.graph.wrapper_code, SubgraphPythonWrapperCodegen) + and V.graph.wrapper_code.partition_signatures is not None + ): + out_names = get_output_names( + V.graph.wrapper_code.partition_signatures.output_nodes + ) + else: + out_names = V.graph.get_output_names() + + while ( + self.lines + and isinstance(self.lines[-1], MemoryPlanningLine) + and self.lines[-1].node.name not in out_names # type: ignore[attr-defined] + ): + # these lines will be pointless + self.lines.pop() + + # codegen allocations in two passes + planning_states = [MemoryPlanningState()] + past_planning_states = [] + for i in range(len(self.lines)): + line = self.lines[i] + if isinstance(line, MemoryPlanningLine): + self.lines[i] = line.plan(planning_states[-1]) + elif isinstance(line, EnterSubgraphLine): + planning_states.append(MemoryPlanningState()) + elif isinstance(line, ExitSubgraphLine): + past_planning_states.append(planning_states.pop()) + past_planning_states.append(planning_states.pop()) + assert len(planning_states) == 0 + + +# =================================================== +# torch 2.9 Inductor get_graph_partition_signature monkeypatch +# =================================================== +# This change monkeypatches 
get_graph_partition_signature in pytorch 2.9.0 to +# fix inductor partition + attention-nvfp4 quant fusion, tested in +# `tests/compile/test_fusions_e2e.py::test_attn_quant`. +# For more context, see https://github.com/pytorch/pytorch/pull/165815. + + +def get_graph_partition_signature_patched( + self, partitions, skip_cudagraphs: list[bool] +): + """ + Gets signature for each graph partition, including input nodes, output nodes, and + whether deallocating an input within graph partition. + """ + from torch._inductor import dependencies + from torch._inductor.ir import GraphPartitionSignature, MutationOutput, NoneLayout + from torch._inductor.virtualized import V + from torch.utils._ordered_set import OrderedSet + + signatures = [] + + unmet_output_names = OrderedSet(V.graph.get_output_names()) + name_to_node = self.get_name_to_nodes() + + def is_none_layout(buf_name: str) -> bool: + """ + Checks if buf_name is NoneLayout. Buffers with NoneLayout is not allocated + so graph partition should not take it as inputs or outputs. + """ + buf = self.name_to_buf.get(buf_name, None) + + if buf is None: + return False + + if isinstance(buf.node.layout, NoneLayout): + if isinstance(buf.node, MutationOutput) and ( + real_name := self.mutation_real_name.get(buf_name, None) + ): + return is_none_layout(real_name) + + return True + + return False + + for partition, skip_cudagraph in zip( + reversed(partitions), reversed(skip_cudagraphs) + ): + output_names: OrderedSet[str] = OrderedSet() + + for node in partition: + output_names.update(node.outputs_by_name.keys()) + + returned_output_names = output_names.intersection(unmet_output_names) + + # all reads/writes are partition inputs except those generated + # within the partition and tensor constants + read_writes = dependencies.ReadWrites.merge_list( + [node.read_writes for node in partition] + ) + + # WeakDep is fake dependency on unused buffer. 
It should not appear + # in partition_input_names for inputs that are actually read or written. + partition_input_names = ( + OrderedSet( + [ + x.name + for x in read_writes.reads | read_writes.writes + if not is_none_layout(x.name) + ] + ) + - output_names + ) + + partition_input_names = OrderedSet( + self.mutation_real_name.get(name, name) for name in partition_input_names + ) + + buffer_names_to_free: OrderedSet[str] = OrderedSet() + for node in partition: + buffer_names_to_free.update(node.last_usage) + + # buffer_names_to_free may contain buffers allocated in previous + # graph partitions. These buffers should also be a partition + # input. + extra_input_names = [ + name + for name in (buffer_names_to_free - output_names) + if name in name_to_node + ] + partition_input_names.update(extra_input_names) + + input_nodes = { + name: name_to_node[name] + for name in partition_input_names + if name in name_to_node + } + input_deallocation = { + name: name in buffer_names_to_free + for name in partition_input_names + if name in name_to_node + } + + # if an input tensor is not freed in the partition function, it should + # also be returned as an output. This brings benefits to cudagraph + # since the returned output tensor is a cudagraph managed tensor with + # a static tensor address. 
+ extra_output_names = [ + name + for name in partition_input_names + if name in name_to_node and name not in buffer_names_to_free + ] + + returned_output_names.update(extra_output_names) + + returned_output_names = OrderedSet( + self.mutation_real_name.get(name, name) for name in returned_output_names + ) + + output_nodes = [ + name_to_node[name] + for name in returned_output_names + if not is_none_layout(name) + ] + + constant_names = [ + name for name in partition_input_names if name in V.graph.constants + ] + + symbol_inputs = self.get_graph_partition_symbol_inputs(partition, input_nodes) + + partition_signature = GraphPartitionSignature( + symbol_inputs, + input_nodes, + output_nodes, + input_deallocation, + skip_cudagraph, + constant_names, + ) + + signatures.append(partition_signature) + + unmet_output_names = partition_input_names.union( + unmet_output_names - returned_output_names + ) + + return signatures[::-1] + + +# ======================================== +# torch 2.9 Inductor Scheduler monkeypatch +# ======================================== +# This change monkeypatches a function in Inductor to work around the following +# bug: https://github.com/vllm-project/vllm/issues/26678 +# +# The bug occurs when `use_inductor_graph_partition` is turned on and there +# exists operators inside of `splitting_ops` that have an in-place mutation. In +# vllm, this specifically occurs on the operator +# vllm.unified_attention_with_output. In this case, inductor does not populate +# the inductor IR's `origin_node` field, causing an assertion error when trying +# to access the node's `origin_node` field. +# +# So, we will monkeypatch torch._inductor.scheduler.Scheduler.should_partition +# so that it does not access the inductor IR node's `origin_node` field and just +# returns True if a node is registered as having a custom partition function. +# This is ok for now since vllm's implementation of the custom partition +# functions just return True. 
+# ======================================== + + +def should_partition_patched(self, node, should_log: bool = False) -> bool: + # This is a patched version of + # torch._inductor.scheduler.Scheduler.should_partition that modifies + # the following piece of code so that we always return True: + # https://github.com/pytorch/pytorch/blob/ecb53078faf86ca1b33277df33b82985675bb011/torch/_inductor/scheduler.py#L4712-L4724 + """Return True if we should partition the inductor graph on this node""" + + import torch._inductor.ir as ir + from torch._inductor.scheduler import ( + BaseSchedulerNode, + FusedSchedulerNode, + ) + from torch._inductor.utils import ( + _unstable_customized_partition_wrapper, + is_cudagraph_unsafe_op, + maybe_log_cudagraph_partition, + ) + + # Allow users to manually specify if a node should be partitioned + # Can only do this for FallbackKernels + ir_node = node.node + if isinstance(ir_node, torch._inductor.ir.FallbackKernel) and ( + op := ir_node.op_overload + ): + op_overload_packet_name = op.name() + op_overload_name = ( + f"{op_overload_packet_name}.{op._overloadname}" + if isinstance(op, torch._ops.OpOverload) + else op_overload_packet_name + ) + if ( + op_overload_packet_name + in torch._inductor.config.custom_should_partition_ops + or op_overload_name in torch._inductor.config.custom_should_partition_ops + ): + assert isinstance(op, torch._ops.OpOverload) + return True + + # When not using cudagraphs, keep all kernels in the `call` function + # instead of graph partition functions, since graph partition only brings + # benefit to cudagraph + if ( + not torch._inductor.config.triton.cudagraphs + and _unstable_customized_partition_wrapper.wrapper is None + ): + return True + + # avoid duplicating logs when should_partition is called multiple times + # on the same node + def noop_log(msg: str, node: BaseSchedulerNode | None) -> None: + return + + log_partition_reason = maybe_log_cudagraph_partition if should_log else noop_log + + if 
isinstance(node, FusedSchedulerNode): + return any(self.should_partition(snode) for snode in node.snodes) + + assert node.node is not None + + if not node.is_gpu(): + log_partition_reason("non gpu ops", node=node) + + return True + + if isinstance(node.node, ir.DeviceCopy): + log_partition_reason("DeviceCopy ops", node=node) + return True + + if isinstance(node.node, ir.Conditional): + log_partition_reason("Conditional ops", node=node) + return True + + if getattr(node.node, "unbacked_bindings", None): + log_partition_reason("unbacked binding ops", node=node) + return True + + if is_cudagraph_unsafe_op(node.node): + log_partition_reason("CUDAGraph-unsafe custom ops", node=node) + return True + + return False + + +def _update_scheduler_patched(self) -> None: + # Copied from torch._inductor.graph.GrahLowering._update_scheduler. Patches + # this method so that we can patch Scheduler.should_partition with the + # function above + """ + (Re)initializes the scheduler member. When initializing the scheduler, no CUBIN + files should be generated (to avoid biasing any benchmarks and pessimizing + fusion decisions). + """ + import torch._inductor.config as config + from torch._inductor.scheduler import Scheduler + + Scheduler.should_partition = should_partition_patched + Scheduler.get_graph_partition_signature = get_graph_partition_signature_patched + + with config.patch("triton.store_cubin", False): + self.scheduler = Scheduler(self.operations) + + +if is_torch_equal("2.9.0"): + from torch._inductor.codegen.wrapper import PythonWrapperCodegen + from torch._inductor.graph import GraphLowering + from torch.utils._config_module import _Config, _ConfigEntry + + # `custom_should_partition_ops` is a new config after 2.9.0. So this would + # not overwrite any user configs. 
+ torch._inductor.config._config["custom_should_partition_ops"] = _ConfigEntry( + _Config(default=[]) + ) + + PythonWrapperCodegen.memory_plan_reuse = memory_plan_reuse_patched + GraphLowering._update_scheduler = _update_scheduler_patched diff --git a/envs.py b/envs.py new file mode 100644 index 0000000..db66d33 --- /dev/null +++ b/envs.py @@ -0,0 +1,1729 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import functools +import hashlib +import json +import os +import sys +import tempfile +from collections.abc import Callable +from typing import TYPE_CHECKING, Any, Literal + +if TYPE_CHECKING: + VLLM_HOST_IP: str = "" + VLLM_PORT: int | None = None + VLLM_RPC_BASE_PATH: str = tempfile.gettempdir() + VLLM_USE_MODELSCOPE: bool = False + VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60 + VLLM_NCCL_SO_PATH: str | None = None + LD_LIBRARY_PATH: str | None = None + VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE: int = 256 + VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False + VLLM_FLASH_ATTN_VERSION: int | None = None + LOCAL_RANK: int = 0 + CUDA_VISIBLE_DEVICES: str | None = None + VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60 + VLLM_API_KEY: str | None = None + VLLM_DEBUG_LOG_API_SERVER_RESPONSE: bool = False + S3_ACCESS_KEY_ID: str | None = None + S3_SECRET_ACCESS_KEY: str | None = None + S3_ENDPOINT_URL: str | None = None + VLLM_MODEL_REDIRECT_PATH: str | None = None + VLLM_CACHE_ROOT: str = os.path.expanduser("~/.cache/vllm") + VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm") + VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai" + VLLM_NO_USAGE_STATS: bool = False + VLLM_DISABLE_FLASHINFER_PREFILL: bool = False + VLLM_DO_NOT_TRACK: bool = False + VLLM_USAGE_SOURCE: str = "" + VLLM_CONFIGURE_LOGGING: int = 1 + VLLM_LOGGING_LEVEL: str = "INFO" + VLLM_LOGGING_PREFIX: str = "" + VLLM_LOGGING_STREAM: str = "ext://sys.stdout" + VLLM_LOGGING_CONFIG_PATH: str | None = None + VLLM_LOG_STATS_INTERVAL: float = 10.0 + 
VLLM_TRACE_FUNCTION: int = 0 + VLLM_ATTENTION_BACKEND: str | None = None + VLLM_USE_FLASHINFER_SAMPLER: bool | None = None + VLLM_PP_LAYER_PARTITION: str | None = None + VLLM_CPU_KVCACHE_SPACE: int | None = 0 + VLLM_CPU_OMP_THREADS_BIND: str = "" + VLLM_CPU_NUM_OF_RESERVED_CPU: int | None = None + VLLM_CPU_MOE_PREPACK: bool = True + VLLM_CPU_SGL_KERNEL: bool = False + VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache") + VLLM_XLA_CHECK_RECOMPILATION: bool = False + VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024 + VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING: bool = True + VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: Literal["auto", "nccl", "shm"] = "auto" + VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False + VLLM_USE_RAY_WRAPPED_PP_COMM: bool = True + VLLM_XLA_USE_SPMD: bool = False + VLLM_WORKER_MULTIPROC_METHOD: Literal["fork", "spawn"] = "fork" + VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets") + VLLM_ASSETS_CACHE_MODEL_CLEAN: bool = False + VLLM_IMAGE_FETCH_TIMEOUT: int = 5 + VLLM_VIDEO_FETCH_TIMEOUT: int = 30 + VLLM_AUDIO_FETCH_TIMEOUT: int = 10 + VLLM_MEDIA_URL_ALLOW_REDIRECTS: bool = True + VLLM_MEDIA_LOADING_THREAD_COUNT: int = 8 + VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25 + VLLM_VIDEO_LOADER_BACKEND: str = "opencv" + VLLM_MEDIA_CONNECTOR: str = "http" + VLLM_MM_INPUT_CACHE_GIB: int = 4 + VLLM_TARGET_DEVICE: str = "cuda" + VLLM_MAIN_CUDA_VERSION: str = "12.8" + MAX_JOBS: str | None = None + NVCC_THREADS: str | None = None + VLLM_USE_PRECOMPILED: bool = False + VLLM_DOCKER_BUILD_CONTEXT: bool = False + VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False + VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False + CMAKE_BUILD_TYPE: Literal["Debug", "Release", "RelWithDebInfo"] | None = None + VERBOSE: bool = False + VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False + VLLM_RPC_TIMEOUT: int = 10000 # ms + VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5 # seconds + VLLM_PLUGINS: list[str] | None = None + VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None + 
VLLM_TORCH_CUDA_PROFILE: bool = False + VLLM_TORCH_PROFILER_DIR: str | None = None + VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False + VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False + VLLM_USE_AOT_COMPILE: bool = False + VLLM_USE_BYTECODE_HOOK: bool = False + VLLM_FORCE_AOT_LOAD: bool = False + VLLM_TORCH_PROFILER_WITH_STACK: bool = True + VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False + VLLM_USE_TRITON_AWQ: bool = False + VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False + VLLM_SKIP_P2P_CHECK: bool = False + VLLM_DISABLED_KERNELS: list[str] = [] + VLLM_DISABLE_PYNCCL: bool = False + VLLM_ROCM_USE_AITER: bool = False + VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False + VLLM_ROCM_USE_AITER_LINEAR: bool = True + VLLM_ROCM_USE_AITER_MOE: bool = True + VLLM_ROCM_USE_AITER_RMSNORM: bool = True + VLLM_ROCM_USE_AITER_MLA: bool = True + VLLM_ROCM_USE_AITER_MHA: bool = True + VLLM_ROCM_USE_AITER_FP4_ASM_GEMM: bool = False + VLLM_ROCM_USE_AITER_TRITON_ROPE: bool = False + VLLM_ROCM_USE_AITER_FP8BMM: bool = True + VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION: bool = False + VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS: bool = True + VLLM_ROCM_USE_AITER_TRITON_GEMM: bool = True + VLLM_ROCM_USE_SKINNY_GEMM: bool = True + VLLM_ROCM_FP8_PADDING: bool = True + VLLM_ROCM_MOE_PADDING: bool = True + VLLM_ROCM_CUSTOM_PAGED_ATTN: bool = True + VLLM_ENABLE_V1_MULTIPROCESSING: bool = True + VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 + VLLM_DISABLE_COMPILE_CACHE: bool = False + Q_SCALE_CONSTANT: int = 200 + K_SCALE_CONSTANT: int = 200 + V_SCALE_CONSTANT: int = 100 + VLLM_SERVER_DEV_MODE: bool = False + VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128 + VLLM_MLA_DISABLE: bool = False + VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH: int = 32 + VLLM_RAY_PER_WORKER_GPUS: float = 1.0 + VLLM_RAY_BUNDLE_INDICES: str = "" + VLLM_CUDART_SO_PATH: str | None = None + VLLM_DP_RANK: int = 0 + VLLM_DP_RANK_LOCAL: int = -1 + VLLM_DP_SIZE: int = 1 + VLLM_USE_STANDALONE_COMPILE: bool = True + VLLM_DP_MASTER_IP: str = "" + 
VLLM_DP_MASTER_PORT: int = 0 + VLLM_MOE_DP_CHUNK_SIZE: int = 256 + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False + VLLM_RAY_DP_PACK_STRATEGY: Literal["strict", "fill", "span"] = "strict" + VLLM_MARLIN_USE_ATOMIC_ADD: bool = False + VLLM_MXFP4_USE_MARLIN: bool | None = None + VLLM_V1_USE_OUTLINES_CACHE: bool = False + VLLM_TPU_BUCKET_PADDING_GAP: int = 0 + VLLM_TPU_MOST_MODEL_LEN: int | None = None + VLLM_TPU_USING_PATHWAYS: bool = False + VLLM_USE_DEEP_GEMM: bool = True + VLLM_MOE_USE_DEEP_GEMM: bool = True + VLLM_USE_DEEP_GEMM_E8M0: bool = True + VLLM_DEEP_GEMM_WARMUP: Literal[ + "skip", + "full", + "relax", + ] = "relax" + VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True + VLLM_USE_FLASHINFER_MOE_FP16: bool = False + VLLM_USE_FLASHINFER_MOE_FP8: bool = False + VLLM_USE_FLASHINFER_MOE_FP4: bool = False + VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency"] = "latency" + VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE: int = 394 * 1024 * 1024 + VLLM_XGRAMMAR_CACHE_MB: int = 0 + VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256 + VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False + VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost" + VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5600 + VLLM_ALL2ALL_BACKEND: Literal[ + "naive", + "pplx", + "deepep_high_throughput", + "deepep_low_latency", + "allgather_reducescatter", + "flashinfer_all2allv", + ] = "allgather_reducescatter" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840 + VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1 + VLLM_SLEEP_WHEN_IDLE: bool = False + VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16 + VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300 + VLLM_KV_CACHE_LAYOUT: Literal["NHD", "HND"] | None = None + VLLM_COMPUTE_NANS_IN_LOGITS: bool = False + VLLM_USE_NVFP4_CT_EMULATIONS: bool = False + VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: Literal[ + "FP", "INT8", "INT6", "INT4", "NONE" + ] = "NONE" + VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True + VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: int | None = None + VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 480 
+ VLLM_USE_CUDNN_PREFILL: bool = False + VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL: bool = False + VLLM_ENABLE_CUDAGRAPH_GC: bool = False + VLLM_LOOPBACK_IP: str = "" + VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False + VLLM_ENABLE_RESPONSES_API_STORE: bool = False + VLLM_USE_TRTLLM_ATTENTION: str | None = None + VLLM_NVFP4_GEMM_BACKEND: str | None = None + VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION: bool = False + VLLM_HAS_FLASHINFER_CUBIN: bool = False + VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False + VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False + VLLM_ROCM_FP8_MFMA_PAGE_ATTN: bool = False + VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS: bool = False + VLLM_ALLREDUCE_USE_SYMM_MEM: bool = True + VLLM_TUNED_CONFIG_FOLDER: str | None = None + VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS: set[str] = set() + VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False + VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY: bool = False + VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False + VLLM_NVTX_SCOPES_FOR_PROFILING: bool = False + VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES: bool = True + VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME: str = "VLLM_OBJECT_STORAGE_SHM_BUFFER" + VLLM_DEEPEP_BUFFER_SIZE_MB: int = 1024 + VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE: bool = False + VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL: bool = False + VLLM_DBO_COMM_SMS: int = 20 + VLLM_PATTERN_MATCH_DEBUG: str | None = None + VLLM_DEBUG_DUMP_PATH: str | None = None + VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE: bool = True + VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING: bool = True + VLLM_USE_NCCL_SYMM_MEM: bool = False + VLLM_NCCL_INCLUDE_PATH: str | None = None + VLLM_USE_FBGEMM: bool = False + VLLM_GC_DEBUG: str = "" + VLLM_DISABLE_SHARED_EXPERTS_STREAM: bool = True + VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD: int = 256 + VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary" + VLLM_FLAT_LOGPROBS: bool = False + # optional envs we add. 
+ VLLM_W8A8_MOE_USE_W4A8: bool = False + VLLM_W4A8_FORMAT: str = "TN" + VLLM_W4A8_VERSION: int = 2 + VLLM_MIX_QUANTIZATION_TYPE: str = "" + VLLM_MLA_CUSTOMIZE: bool = True + VLLM_USE_INT8_MLA: bool = False + VLLM_USE_MIX_MHA: bool = False + + # support Iluvatar IxServer + VLLM_SUPPORT_IXSERVER: bool = False + + VLLM_ATTN_OPT_LEVEL: bool = False + VLLM_MOE_OPT_LEVEL: int = 0 + VLLM_LINEAR_OPT_LEVEL: int = 0 + VLLM_OPT_EXCLUDE_LAYERS: str = "" + VLLM_USE_LORA_FUSION: bool = False + +def get_default_cache_root(): + return os.getenv( + "XDG_CACHE_HOME", + os.path.join(os.path.expanduser("~"), ".cache"), + ) + + +def get_default_config_root(): + return os.getenv( + "XDG_CONFIG_HOME", + os.path.join(os.path.expanduser("~"), ".config"), + ) + + +def maybe_convert_int(value: str | None) -> int | None: + if value is None: + return None + return int(value) + + +def maybe_convert_bool(value: str | None) -> bool | None: + if value is None: + return None + return bool(int(value)) + + +def disable_compile_cache() -> bool: + return bool(int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))) + + +def use_aot_compile() -> bool: + from vllm.model_executor.layers.batch_invariant import ( + vllm_is_batch_invariant, + ) + from vllm.utils.torch_utils import is_torch_equal_or_newer + + default_value = ( + "1" + if is_torch_equal_or_newer("2.10.0.dev") and not disable_compile_cache() + else "0" + ) + + return ( + not vllm_is_batch_invariant() + and os.environ.get("VLLM_USE_AOT_COMPILE", default_value) == "1" + ) + + +def env_with_choices( + env_name: str, + default: str | None, + choices: list[str] | Callable[[], list[str]], + case_sensitive: bool = True, +) -> Callable[[], str | None]: + """ + Create a lambda that validates environment variable against allowed choices + + Args: + env_name: Name of the environment variable + default: Default value if not set (can be None) + choices: List of valid string options or callable that returns list + case_sensitive: Whether validation should be case 
sensitive + + Returns: + Lambda function for environment_variables dict + """ + + def _get_validated_env() -> str | None: + value = os.getenv(env_name) + if value is None: + return default + + # Resolve choices if it's a callable (for lazy loading) + actual_choices = choices() if callable(choices) else choices + + if not case_sensitive: + check_value = value.lower() + check_choices = [choice.lower() for choice in actual_choices] + else: + check_value = value + check_choices = actual_choices + + if check_value not in check_choices: + raise ValueError( + f"Invalid value '{value}' for {env_name}. " + f"Valid options: {actual_choices}." + ) + + return value + + return _get_validated_env + + +def env_list_with_choices( + env_name: str, + default: list[str], + choices: list[str] | Callable[[], list[str]], + case_sensitive: bool = True, +) -> Callable[[], list[str]]: + """ + Create a lambda that validates environment variable + containing comma-separated values against allowed choices + + Args: + env_name: Name of the environment variable + default: Default list of values if not set + choices: List of valid string options or callable that returns list + case_sensitive: Whether validation should be case sensitive + + Returns: + Lambda function for environment_variables + dict that returns list of strings + """ + + def _get_validated_env_list() -> list[str]: + value = os.getenv(env_name) + if value is None: + return default + + # Split comma-separated values and strip whitespace + values = [v.strip() for v in value.split(",") if v.strip()] + + if not values: + return default + + # Resolve choices if it's a callable (for lazy loading) + actual_choices = choices() if callable(choices) else choices + + # Validate each value + for val in values: + if not case_sensitive: + check_value = val.lower() + check_choices = [choice.lower() for choice in actual_choices] + else: + check_value = val + check_choices = actual_choices + + if check_value not in check_choices: + raise 
ValueError( + f"Invalid value '{val}' in {env_name}. " + f"Valid options: {actual_choices}." + ) + + return values + + return _get_validated_env_list + + +def env_set_with_choices( + env_name: str, + default: list[str], + choices: list[str] | Callable[[], list[str]], + case_sensitive: bool = True, +) -> Callable[[], set[str]]: + """ + Creates a lambda which that validates environment variable + containing comma-separated values against allowed choices which + returns choices as a set. + """ + + def _get_validated_env_set() -> set[str]: + return set(env_list_with_choices(env_name, default, choices, case_sensitive)()) + + return _get_validated_env_set + + +def get_vllm_port() -> int | None: + """Get the port from VLLM_PORT environment variable. + + Returns: + The port number as an integer if VLLM_PORT is set, None otherwise. + + Raises: + ValueError: If VLLM_PORT is a URI, suggest k8s service discovery issue. + """ + if "VLLM_PORT" not in os.environ: + return None + + port = os.getenv("VLLM_PORT", "0") + + try: + return int(port) + except ValueError as err: + from urllib.parse import urlparse + + parsed = urlparse(port) + if parsed.scheme: + raise ValueError( + f"VLLM_PORT '{port}' appears to be a URI. " + "This may be caused by a Kubernetes service discovery issue," + "check the warning in: https://docs.vllm.ai/en/stable/serving/env_vars.html" + ) from None + raise ValueError(f"VLLM_PORT '{port}' must be a valid integer") from err + + +# The start-* and end* here are used by the documentation generator +# to extract the used env vars. + +# --8<-- [start:env-vars-definition] + +environment_variables: dict[str, Callable[[], Any]] = { + # ================== Installation Time Env Vars ================== + # Target device of vLLM, supporting [cuda (by default), + # rocm, cpu] + "VLLM_TARGET_DEVICE": lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(), + # Main CUDA version of vLLM, supporting [12.6, 12.8, 12.9], + # 12.8 is the default. 
This follows PyTorch but can be overridden. + "VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() + or "12.8", + # Maximum number of compilation jobs to run in parallel. + # By default this is the number of CPUs + "MAX_JOBS": lambda: os.getenv("MAX_JOBS", None), + # Number of threads to use for nvcc + # By default this is 1. + # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU. + "NVCC_THREADS": lambda: os.getenv("NVCC_THREADS", None), + # If set, vllm will use precompiled binaries (*.so) + "VLLM_USE_PRECOMPILED": lambda: os.environ.get("VLLM_USE_PRECOMPILED", "") + .strip() + .lower() + in ("1", "true") + or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), + # Used to mark that setup.py is running in a Docker build context, + # in order to force the use of precompiled binaries. + "VLLM_DOCKER_BUILD_CONTEXT": lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "") + .strip() + .lower() + in ("1", "true"), + # Whether to force using nightly wheel in python build. + # This is used for testing the nightly wheel in python build. + "VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL": lambda: bool( + int(os.getenv("VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL", "0")) + ), + # CMake build type + # If not set, defaults to "Debug" or "RelWithDebInfo" + # Available options: "Debug", "Release", "RelWithDebInfo" + "CMAKE_BUILD_TYPE": env_with_choices( + "CMAKE_BUILD_TYPE", None, ["Debug", "Release", "RelWithDebInfo"] + ), + # If set, vllm will print verbose logs during installation + "VERBOSE": lambda: bool(int(os.getenv("VERBOSE", "0"))), + # Root directory for vLLM configuration files + # Defaults to `~/.config/vllm` unless `XDG_CONFIG_HOME` is set + # Note that this not only affects how vllm finds its configuration files + # during runtime, but also affects how vllm installs its configuration + # files during **installation**. 
+ "VLLM_CONFIG_ROOT": lambda: os.path.expanduser( + os.getenv( + "VLLM_CONFIG_ROOT", + os.path.join(get_default_config_root(), "vllm"), + ) + ), + # ================== Runtime Env Vars ================== + # Root directory for vLLM cache files + # Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set + "VLLM_CACHE_ROOT": lambda: os.path.expanduser( + os.getenv( + "VLLM_CACHE_ROOT", + os.path.join(get_default_cache_root(), "vllm"), + ) + ), + # used in distributed environment to determine the ip address + # of the current node, when the node has multiple network interfaces. + # If you are using multi-node inference, you should set this differently + # on each node. + "VLLM_HOST_IP": lambda: os.getenv("VLLM_HOST_IP", ""), + # used in distributed environment to manually set the communication port + # Note: if VLLM_PORT is set, and some code asks for multiple ports, the + # VLLM_PORT will be used as the first port, and the rest will be generated + # by incrementing the VLLM_PORT value. + "VLLM_PORT": get_vllm_port, + # path used for ipc when the frontend api server is running in + # multi-processing mode to communicate with the backend engine process. + "VLLM_RPC_BASE_PATH": lambda: os.getenv( + "VLLM_RPC_BASE_PATH", tempfile.gettempdir() + ), + # If true, will load models from ModelScope instead of Hugging Face Hub. + # note that the value is true or false, not numbers + "VLLM_USE_MODELSCOPE": lambda: os.environ.get( + "VLLM_USE_MODELSCOPE", "False" + ).lower() + == "true", + # Interval in seconds to log a warning message when the ring buffer is full + "VLLM_RINGBUFFER_WARNING_INTERVAL": lambda: int( + os.environ.get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60") + ), + # path to cudatoolkit home directory, under which should be bin, include, + # and lib directories. + "CUDA_HOME": lambda: os.environ.get("CUDA_HOME", None), + # Path to the NCCL library file. 
It is needed because nccl>=2.19 brought + # by PyTorch contains a bug: https://github.com/NVIDIA/nccl/issues/1234 + "VLLM_NCCL_SO_PATH": lambda: os.environ.get("VLLM_NCCL_SO_PATH", None), + # when `VLLM_NCCL_SO_PATH` is not set, vllm will try to find the nccl + # library file in the locations specified by `LD_LIBRARY_PATH` + "LD_LIBRARY_PATH": lambda: os.environ.get("LD_LIBRARY_PATH", None), + # flag to control the chunk size (in MB) for sleeping memory allocations under ROCm + "VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE": lambda: int( + os.environ.get("VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE", "256") + ), + # Use separate prefill and decode kernels for V1 attention instead of + # the unified triton kernel. + "VLLM_V1_USE_PREFILL_DECODE_ATTENTION": lambda: ( + os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False").lower() + in ("true", "1") + ), + # Force vllm to use a specific flash-attention version (2 or 3), only valid + # when using the flash-attention backend. + "VLLM_FLASH_ATTN_VERSION": lambda: maybe_convert_int( + os.environ.get("VLLM_FLASH_ATTN_VERSION", None) + ), + # Feature flag to enable/disable Inductor standalone compile. + # In torch <= 2.7 we ignore this flag; in torch >= 2.9 this is + # enabled by default. + "VLLM_USE_STANDALONE_COMPILE": lambda: os.environ.get( + "VLLM_USE_STANDALONE_COMPILE", "1" + ) + == "1", + # Debug pattern matching inside custom passes. + # Should be set to the fx.Node name (e.g. 'getitem_34' or 'scaled_mm_3'). + "VLLM_PATTERN_MATCH_DEBUG": lambda: os.environ.get( + "VLLM_PATTERN_MATCH_DEBUG", None + ), + # Dump fx graphs to the given directory. + # It will override CompilationConfig.debug_dump_path if set. + "VLLM_DEBUG_DUMP_PATH": lambda: os.environ.get("VLLM_DEBUG_DUMP_PATH", None), + # Feature flag to enable/disable AOT compilation. This will ensure + # compilation is done in warmup phase and the compilation will be + # reused in subsequent calls. 
+ "VLLM_USE_AOT_COMPILE": use_aot_compile, + # Feature flag to enable/disable bytecode in + # TorchCompileWithNoGuardsWrapper. + "VLLM_USE_BYTECODE_HOOK": lambda: bool( + int(os.environ.get("VLLM_USE_BYTECODE_HOOK", "1")) + ), + # Force vllm to always load AOT compiled models from disk. Failure + # to load will result in a hard error when this is enabled. + # Will be ignored when VLLM_USE_AOT_COMPILE is disabled. + "VLLM_FORCE_AOT_LOAD": lambda: os.environ.get("VLLM_FORCE_AOT_LOAD", "0") == "1", + # local rank of the process in the distributed setting, used to determine + # the GPU device id + "LOCAL_RANK": lambda: int(os.environ.get("LOCAL_RANK", "0")), + # used to control the visible devices in the distributed setting + "CUDA_VISIBLE_DEVICES": lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None), + # timeout for each iteration in the engine + "VLLM_ENGINE_ITERATION_TIMEOUT_S": lambda: int( + os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60") + ), + # API key for vLLM API server + "VLLM_API_KEY": lambda: os.environ.get("VLLM_API_KEY", None), + # Whether to log responses from API Server for debugging + "VLLM_DEBUG_LOG_API_SERVER_RESPONSE": lambda: os.environ.get( + "VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "False" + ).lower() + == "true", + # S3 access information, used for tensorizer to load model from S3 + "S3_ACCESS_KEY_ID": lambda: os.environ.get("S3_ACCESS_KEY_ID", None), + "S3_SECRET_ACCESS_KEY": lambda: os.environ.get("S3_SECRET_ACCESS_KEY", None), + "S3_ENDPOINT_URL": lambda: os.environ.get("S3_ENDPOINT_URL", None), + # Usage stats collection + "VLLM_USAGE_STATS_SERVER": lambda: os.environ.get( + "VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai" + ), + "VLLM_NO_USAGE_STATS": lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1", + "VLLM_DISABLE_FLASHINFER_PREFILL": lambda: os.environ.get( + "VLLM_DISABLE_FLASHINFER_PREFILL", "0" + ) + == "1", + "VLLM_DO_NOT_TRACK": lambda: ( + os.environ.get("VLLM_DO_NOT_TRACK", None) + or 
os.environ.get("DO_NOT_TRACK", None) + or "0" + ) + == "1", + "VLLM_USAGE_SOURCE": lambda: os.environ.get("VLLM_USAGE_SOURCE", "production"), + # Logging configuration + # If set to 0, vllm will not configure logging + # If set to 1, vllm will configure logging using the default configuration + # or the configuration file specified by VLLM_LOGGING_CONFIG_PATH + "VLLM_CONFIGURE_LOGGING": lambda: int(os.getenv("VLLM_CONFIGURE_LOGGING", "1")), + "VLLM_LOGGING_CONFIG_PATH": lambda: os.getenv("VLLM_LOGGING_CONFIG_PATH"), + # this is used for configuring the default logging level + "VLLM_LOGGING_LEVEL": lambda: os.getenv("VLLM_LOGGING_LEVEL", "INFO").upper(), + # this is used for configuring the default logging stream + "VLLM_LOGGING_STREAM": lambda: os.getenv("VLLM_LOGGING_STREAM", "ext://sys.stdout"), + # if set, VLLM_LOGGING_PREFIX will be prepended to all log messages + "VLLM_LOGGING_PREFIX": lambda: os.getenv("VLLM_LOGGING_PREFIX", ""), + # If set, vllm will log stats at this interval in seconds + # If not set, vllm will log stats every 10 seconds. 
+ "VLLM_LOG_STATS_INTERVAL": lambda: val + if (val := float(os.getenv("VLLM_LOG_STATS_INTERVAL", "10."))) > 0.0 + else 10.0, + # Trace function calls + # If set to 1, vllm will trace function calls + # Useful for debugging + "VLLM_TRACE_FUNCTION": lambda: int(os.getenv("VLLM_TRACE_FUNCTION", "0")), + # Backend for attention computation + # Example options: + # - "TORCH_SDPA": use torch.nn.MultiheadAttention + # - "FLASH_ATTN": use FlashAttention + # - "XFORMERS": use XFormers + # - "FLASHINFER": use flashinfer + # - "FLASHMLA": use FlashMLA + # - "FLASH_ATTN_MLA": use FlashAttention for MLA + # - "FLASHINFER_MLA": use FlashInfer for MLA + # - "CUTLASS_MLA": use CUTLASS for MLA + # All possible options loaded dynamically from AttentionBackendEnum + "VLLM_ATTENTION_BACKEND": env_with_choices( + "VLLM_ATTENTION_BACKEND", + None, + lambda: list( + __import__( + "vllm.attention.backends.registry", fromlist=["AttentionBackendEnum"] + ).AttentionBackendEnum.__members__.keys() + ), + ), + # If set, vllm will use flashinfer sampler + "VLLM_USE_FLASHINFER_SAMPLER": lambda: bool( + int(os.environ["VLLM_USE_FLASHINFER_SAMPLER"]) + ) + if "VLLM_USE_FLASHINFER_SAMPLER" in os.environ + else None, + # Pipeline stage partition strategy + "VLLM_PP_LAYER_PARTITION": lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None), + # (CPU backend only) CPU key-value cache space. + # default is None and will be set as 4 GB + "VLLM_CPU_KVCACHE_SPACE": lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")) + if "VLLM_CPU_KVCACHE_SPACE" in os.environ + else None, + # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31", + # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'. + "VLLM_CPU_OMP_THREADS_BIND": lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "auto"), + # (CPU backend only) CPU cores not used by OMP threads . + # Those CPU cores will not be used by OMP threads of a rank. 
+ "VLLM_CPU_NUM_OF_RESERVED_CPU": lambda: int( + os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0") + ) + if "VLLM_CPU_NUM_OF_RESERVED_CPU" in os.environ + else None, + # (CPU backend only) whether to use prepack for MoE layer. This will be + # passed to ipex.llm.modules.GatedMLPMOE. On unsupported CPUs, you might + # need to set this to "0" (False). + "VLLM_CPU_MOE_PREPACK": lambda: bool(int(os.getenv("VLLM_CPU_MOE_PREPACK", "1"))), + # (CPU backend only) whether to use SGL kernels, optimized for small batch. + "VLLM_CPU_SGL_KERNEL": lambda: bool(int(os.getenv("VLLM_CPU_SGL_KERNEL", "0"))), + # If the env var is set, Ray Compiled Graph uses the specified + # channel type to communicate between workers belonging to + # different pipeline-parallel stages. + # Available options: + # - "auto": use the default channel type + # - "nccl": use NCCL for communication + # - "shm": use shared memory and gRPC for communication + "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE": env_with_choices( + "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "auto", ["auto", "nccl", "shm"] + ), + # If the env var is set, it enables GPU communication overlap + # (experimental feature) in Ray's Compiled Graph. + "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM": lambda: bool( + int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "0")) + ), + # If the env var is set, it uses a Ray Communicator wrapping + # vLLM's pipeline parallelism communicator to interact with Ray's + # Compiled Graph. Otherwise, it uses Ray's NCCL communicator. + "VLLM_USE_RAY_WRAPPED_PP_COMM": lambda: bool( + int(os.getenv("VLLM_USE_RAY_WRAPPED_PP_COMM", "1")) + ), + # Use dedicated multiprocess context for workers. 
+ # Both spawn and fork work + "VLLM_WORKER_MULTIPROC_METHOD": env_with_choices( + "VLLM_WORKER_MULTIPROC_METHOD", "fork", ["spawn", "fork"] + ), + # Path to the cache for storing downloaded assets + "VLLM_ASSETS_CACHE": lambda: os.path.expanduser( + os.getenv( + "VLLM_ASSETS_CACHE", + os.path.join(get_default_cache_root(), "vllm", "assets"), + ) + ), + # If the env var is set, we will clean model file in + # this path $VLLM_ASSETS_CACHE/model_streamer/$model_name + "VLLM_ASSETS_CACHE_MODEL_CLEAN": lambda: bool( + int(os.getenv("VLLM_ASSETS_CACHE_MODEL_CLEAN", "0")) + ), + # Timeout for fetching images when serving multimodal models + # Default is 5 seconds + "VLLM_IMAGE_FETCH_TIMEOUT": lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")), + # Timeout for fetching videos when serving multimodal models + # Default is 30 seconds + "VLLM_VIDEO_FETCH_TIMEOUT": lambda: int( + os.getenv("VLLM_VIDEO_FETCH_TIMEOUT", "30") + ), + # Timeout for fetching audio when serving multimodal models + # Default is 10 seconds + "VLLM_AUDIO_FETCH_TIMEOUT": lambda: int( + os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10") + ), + # Whether to allow HTTP redirects when fetching from media URLs. + # Default to True + "VLLM_MEDIA_URL_ALLOW_REDIRECTS": lambda: bool( + int(os.getenv("VLLM_MEDIA_URL_ALLOW_REDIRECTS", "1")) + ), + # Max number of workers for the thread pool handling + # media bytes loading. Set to 1 to disable parallel processing. + # Default is 8 + "VLLM_MEDIA_LOADING_THREAD_COUNT": lambda: int( + os.getenv("VLLM_MEDIA_LOADING_THREAD_COUNT", "8") + ), + # Maximum filesize in MB for a single audio file when processing + # speech-to-text requests. Files larger than this will be rejected. + # Default is 25 MB + "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB": lambda: int( + os.getenv("VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", "25") + ), + # Backend for Video IO + # - "opencv": Default backend that uses OpenCV stream buffered backend. 
+ # + # Custom backend implementations can be registered + # via `@VIDEO_LOADER_REGISTRY.register("my_custom_video_loader")` and + # imported at runtime. + # If a non-existing backend is used, an AssertionError will be thrown. + "VLLM_VIDEO_LOADER_BACKEND": lambda: os.getenv( + "VLLM_VIDEO_LOADER_BACKEND", "opencv" + ), + # Media connector implementation. + # - "http": Default connector that supports fetching media via HTTP. + # + # Custom implementations can be registered + # via `@MEDIA_CONNECTOR_REGISTRY.register("my_custom_media_connector")` and + # imported at runtime. + # If a non-existing backend is used, an AssertionError will be thrown. + "VLLM_MEDIA_CONNECTOR": lambda: os.getenv("VLLM_MEDIA_CONNECTOR", "http"), + # [DEPRECATED] Cache size (in GiB per process) for multimodal input cache + # Default is 4 GiB per API process + 4 GiB per engine core process + "VLLM_MM_INPUT_CACHE_GIB": lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_GIB", "4")), + # Path to the XLA persistent cache directory. + # Only used for XLA devices such as TPUs. + "VLLM_XLA_CACHE_PATH": lambda: os.path.expanduser( + os.getenv( + "VLLM_XLA_CACHE_PATH", + os.path.join(get_default_cache_root(), "vllm", "xla_cache"), + ) + ), + # If set, assert on XLA recompilation after each execution step. + "VLLM_XLA_CHECK_RECOMPILATION": lambda: bool( + int(os.getenv("VLLM_XLA_CHECK_RECOMPILATION", "0")) + ), + # Enable SPMD mode for TPU backend. + "VLLM_XLA_USE_SPMD": lambda: bool(int(os.getenv("VLLM_XLA_USE_SPMD", "0"))), + "VLLM_FUSED_MOE_CHUNK_SIZE": lambda: int( + os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768") + ), + # Control whether to use fused MoE activation chunking. Current chunking + # logic is incompatible with torch.compile and causes IMA. See issue + # https://github.com/vllm-project/vllm/issues/19631. 
+ "VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING": lambda: bool( + int(os.getenv("VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING", "1")) + ), + # If set, the OpenAI API server will stay alive even after the underlying + # AsyncLLMEngine errors and stops serving requests + "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH": lambda: bool( + int(os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", "0")) + ), + # If the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN is set, it allows + # the user to specify a max sequence length greater than + # the max length derived from the model's config.json. + # To enable this, set VLLM_ALLOW_LONG_MAX_MODEL_LEN=1. + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": lambda: ( + os.environ.get("VLLM_ALLOW_LONG_MAX_MODEL_LEN", "0").strip().lower() + in ("1", "true") + ), + # If set, forces FP8 Marlin to be used for FP8 quantization regardless + # of the hardware support for FP8 compute. + "VLLM_TEST_FORCE_FP8_MARLIN": lambda: ( + os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower() + in ("1", "true") + ), + "VLLM_TEST_FORCE_LOAD_FORMAT": lambda: os.getenv( + "VLLM_TEST_FORCE_LOAD_FORMAT", "dummy" + ), + # Time in ms for the zmq client to wait for a response from the backend + # server for simple data operations + "VLLM_RPC_TIMEOUT": lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "1000000")), + # Timeout in seconds for keeping HTTP connections alive in API server + "VLLM_HTTP_TIMEOUT_KEEP_ALIVE": lambda: int( + os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5") + ), + # a list of plugin names to load, separated by commas. + # if this is not set, it means all plugins will be loaded + # if this is set to an empty string, no plugins will be loaded + "VLLM_PLUGINS": lambda: None + if "VLLM_PLUGINS" not in os.environ + else os.environ["VLLM_PLUGINS"].split(","), + # a local directory to look in for unrecognized LoRA adapters. + # only works if plugins are enabled and + # VLLM_ALLOW_RUNTIME_LORA_UPDATING is enabled. 
+ "VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv( + "VLLM_LORA_RESOLVER_CACHE_DIR", None + ), + # Enables torch CUDA profiling if set. + # On NVIDIA GPUs, this will start/stop cudaProfilerApi when triggered. + "VLLM_TORCH_CUDA_PROFILE": lambda: bool( + os.getenv("VLLM_TORCH_CUDA_PROFILE", "0") != "0" + ), + # Enables torch profiler if set. + # Both AsyncLLM's CPU traces as well as workers' + # traces (CPU & GPU) will be saved under this directory. + # Note that it must be an absolute path. + "VLLM_TORCH_PROFILER_DIR": lambda: ( + None + if (val := os.getenv("VLLM_TORCH_PROFILER_DIR")) is None + else ( + val + if val.startswith("gs://") and val[5:] and val[5] != "/" + else os.path.abspath(os.path.expanduser(val)) + ) + ), + # Enable torch profiler to record shapes if set + # VLLM_TORCH_PROFILER_RECORD_SHAPES=1. If not set, torch profiler will + # not record shapes. + "VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: bool( + os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES", "0") != "0" + ), + # Enable torch profiler to profile memory if set + # VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1. If not set, torch profiler + # will not profile memory. + "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": lambda: bool( + os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", "0") != "0" + ), + # Enable torch profiler to profile stack if set + # VLLM_TORCH_PROFILER_WITH_STACK=1. If not set, torch profiler WILL + # profile stack by default. + "VLLM_TORCH_PROFILER_WITH_STACK": lambda: bool( + os.getenv("VLLM_TORCH_PROFILER_WITH_STACK", "1") != "0" + ), + # Enable torch profiler to profile flops if set + # VLLM_TORCH_PROFILER_WITH_FLOPS=1. If not set, torch profiler will + # not profile flops. + "VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: bool( + os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0" + ), + # If set, vLLM will use Triton implementations of AWQ. 
+ "VLLM_USE_TRITON_AWQ": lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))), + # If set, allow loading or unloading lora adapters in runtime, + "VLLM_ALLOW_RUNTIME_LORA_UPDATING": lambda: ( + os.environ.get("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "0").strip().lower() + in ("1", "true") + ), + # We assume drivers can report p2p status correctly. + # If the program hangs when using custom allreduce, + # potantially caused by a bug in the driver (535 series), + # if might be helpful to set VLLM_SKIP_P2P_CHECK=0 + # so that vLLM can verify if p2p is actually working. + # See https://github.com/vllm-project/vllm/blob/a9b15c606fea67a072416ea0ea115261a2756058/vllm/distributed/device_communicators/custom_all_reduce_utils.py#L101-L108 for details. # noqa + "VLLM_SKIP_P2P_CHECK": lambda: os.getenv("VLLM_SKIP_P2P_CHECK", "1") == "1", + # List of quantization kernels that should be disabled, used for testing + # and performance comparisons. Currently only affects MPLinearKernel + # selection + # (kernels: MacheteLinearKernel, MarlinLinearKernel, ExllamaLinearKernel) + "VLLM_DISABLED_KERNELS": lambda: [] + if "VLLM_DISABLED_KERNELS" not in os.environ + else os.environ["VLLM_DISABLED_KERNELS"].split(","), + # Disable pynccl (using torch.distributed instead) + "VLLM_DISABLE_PYNCCL": lambda: ( + os.getenv("VLLM_DISABLE_PYNCCL", "False").lower() in ("true", "1") + ), + # Disable aiter ops unless specifically enabled. + # Acts as a parent switch to enable the rest of the other operations. + "VLLM_ROCM_USE_AITER": lambda: ( + os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in ("true", "1") + ), + # Whether to use aiter paged attention. + # By default is disabled. 
+ "VLLM_ROCM_USE_AITER_PAGED_ATTN": lambda: ( + os.getenv("VLLM_ROCM_USE_AITER_PAGED_ATTN", "False").lower() in ("true", "1") + ), + # use aiter linear op if aiter ops are enabled + # The following list of related ops + # - scaled_mm (per-tensor / rowwise) + "VLLM_ROCM_USE_AITER_LINEAR": lambda: ( + os.getenv("VLLM_ROCM_USE_AITER_LINEAR", "True").lower() in ("true", "1") + ), + # Whether to use aiter moe ops. + # By default is enabled. + "VLLM_ROCM_USE_AITER_MOE": lambda: ( + os.getenv("VLLM_ROCM_USE_AITER_MOE", "True").lower() in ("true", "1") + ), + # use aiter rms norm op if aiter ops are enabled. + "VLLM_ROCM_USE_AITER_RMSNORM": lambda: ( + os.getenv("VLLM_ROCM_USE_AITER_RMSNORM", "True").lower() in ("true", "1") + ), + # Whether to use aiter mla ops. + # By default is enabled. + "VLLM_ROCM_USE_AITER_MLA": lambda: ( + os.getenv("VLLM_ROCM_USE_AITER_MLA", "True").lower() in ("true", "1") + ), + # Whether to use aiter mha ops. + # By default is enabled. + "VLLM_ROCM_USE_AITER_MHA": lambda: ( + os.getenv("VLLM_ROCM_USE_AITER_MHA", "True").lower() in ("true", "1") + ), + # Whether to use aiter fp4 gemm asm. + # By default is disabled. + "VLLM_ROCM_USE_AITER_FP4_ASM_GEMM": lambda: ( + os.getenv("VLLM_ROCM_USE_AITER_FP4_ASM_GEMM", "False").lower() in ("true", "1") + ), + # Whether to use aiter rope. + # By default is disabled. + "VLLM_ROCM_USE_AITER_TRITON_ROPE": lambda: ( + os.getenv("VLLM_ROCM_USE_AITER_TRITON_ROPE", "False").lower() in ("true", "1") + ), + # Whether to use aiter triton fp8 bmm kernel + # By default is enabled. + "VLLM_ROCM_USE_AITER_FP8BMM": lambda: ( + os.getenv("VLLM_ROCM_USE_AITER_FP8BMM", "True").lower() in ("true", "1") + ), + # Use AITER triton unified attention for V1 attention + "VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION": lambda: ( + os.getenv("VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION", "False").lower() + in ("true", "1") + ), + # Whether to use aiter fusion shared experts ops. + # By default is enabled. 
+ "VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS": lambda: ( + os.getenv("VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS", "True").lower() + in ("true", "1") + ), + # Whether to use aiter triton kernels for gemm ops. + # By default is enabled. + "VLLM_ROCM_USE_AITER_TRITON_GEMM": lambda: ( + os.getenv("VLLM_ROCM_USE_AITER_TRITON_GEMM", "True").lower() in ("true", "1") + ), + # use rocm skinny gemms + "VLLM_ROCM_USE_SKINNY_GEMM": lambda: ( + os.getenv("VLLM_ROCM_USE_SKINNY_GEMM", "True").lower() in ("true", "1") + ), + # Pad the fp8 weights to 256 bytes for ROCm + "VLLM_ROCM_FP8_PADDING": lambda: bool(int(os.getenv("VLLM_ROCM_FP8_PADDING", "1"))), + # Pad the weights for the moe kernel + "VLLM_ROCM_MOE_PADDING": lambda: bool(int(os.getenv("VLLM_ROCM_MOE_PADDING", "1"))), + # custom paged attention kernel for MI3* cards + "VLLM_ROCM_CUSTOM_PAGED_ATTN": lambda: ( + os.getenv("VLLM_ROCM_CUSTOM_PAGED_ATTN", "True").lower() in ("true", "1") + ), + # Custom quick allreduce kernel for MI3* cards + # Choice of quantization level: FP, INT8, INT6, INT4 or NONE + # Recommended for large models to get allreduce + "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION": env_with_choices( + "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", + "NONE", + ["FP", "INT8", "INT6", "INT4", "NONE"], + ), + # Custom quick allreduce kernel for MI3* cards + # Due to the lack of the bfloat16 asm instruction, bfloat16 + # kernels are slower than fp16, + # If environment variable is set to 1, the input is converted to fp16 + "VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16": lambda: ( + os.getenv("VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16", "True").lower() + in ("true", "1") + ), + # Custom quick allreduce kernel for MI3* cards. + # Controls the maximum allowed number of data bytes(MB) for custom quick + # allreduce communication. + # Default: 2048 MB. + # Data exceeding this size will use either custom allreduce or RCCL + # communication. 
+ "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB": lambda: maybe_convert_int( + os.environ.get("VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", None) + ), + # Divisor for dynamic query scale factor calculation for FP8 KV Cache + "Q_SCALE_CONSTANT": lambda: int(os.getenv("Q_SCALE_CONSTANT", "200")), + # Divisor for dynamic key scale factor calculation for FP8 KV Cache + "K_SCALE_CONSTANT": lambda: int(os.getenv("K_SCALE_CONSTANT", "200")), + # Divisor for dynamic value scale factor calculation for FP8 KV Cache + "V_SCALE_CONSTANT": lambda: int(os.getenv("V_SCALE_CONSTANT", "100")), + # If set, enable multiprocessing in LLM for the V1 code path. + "VLLM_ENABLE_V1_MULTIPROCESSING": lambda: bool( + int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1")) + ), + "VLLM_LOG_BATCHSIZE_INTERVAL": lambda: float( + os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1") + ), + "VLLM_DISABLE_COMPILE_CACHE": disable_compile_cache, + # If set, vllm will run in development mode, which will enable + # some additional endpoints for developing and debugging, + # e.g. `/reset_prefix_cache` + "VLLM_SERVER_DEV_MODE": lambda: bool(int(os.getenv("VLLM_SERVER_DEV_MODE", "0"))), + # Controls the maximum number of requests to handle in a + # single asyncio task when processing per-token outputs in the + # V1 AsyncLLM interface. It is applicable when handling a high + # concurrency of streaming requests. + # Setting this too high can result in a higher variance of + # inter-message latencies. Setting it too low can negatively impact + # TTFT and overall throughput. + "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE": lambda: int( + os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128") + ), + # If set, vLLM will disable the MLA attention optimizations. 
+ "VLLM_MLA_DISABLE": lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))), + # If set, vLLM will pick up the provided Flash Attention MLA + # max number splits for cuda graph decode + "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": lambda: int( + os.getenv("VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH", "32") + ), + # Number of GPUs per worker in Ray, if it is set to be a fraction, + # it allows ray to schedule multiple actors on a single GPU, + # so that users can colocate other actors on the same GPUs as vLLM. + "VLLM_RAY_PER_WORKER_GPUS": lambda: float( + os.getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0") + ), + # Bundle indices for Ray, if it is set, it can control precisely + # which indices are used for the Ray bundle, for every worker. + # Format: comma-separated list of integers, e.g. "0,1,2,3" + "VLLM_RAY_BUNDLE_INDICES": lambda: os.getenv("VLLM_RAY_BUNDLE_INDICES", ""), + # In some system, find_loaded_library() may not work. So we allow users to + # specify the path through environment variable VLLM_CUDART_SO_PATH. + "VLLM_CUDART_SO_PATH": lambda: os.getenv("VLLM_CUDART_SO_PATH", None), + # Rank of the process in the data parallel setting + "VLLM_DP_RANK": lambda: int(os.getenv("VLLM_DP_RANK", "0")), + # Rank of the process in the data parallel setting. + # Defaults to VLLM_DP_RANK when not set. 
+ "VLLM_DP_RANK_LOCAL": lambda: int( + os.getenv("VLLM_DP_RANK_LOCAL", sys.modules[__name__].VLLM_DP_RANK) + ), + # World size of the data parallel setting + "VLLM_DP_SIZE": lambda: int(os.getenv("VLLM_DP_SIZE", "1")), + # IP address of the master node in the data parallel setting + "VLLM_DP_MASTER_IP": lambda: os.getenv("VLLM_DP_MASTER_IP", "127.0.0.1"), + # Port of the master node in the data parallel setting + "VLLM_DP_MASTER_PORT": lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")), + # In the context of executing MoE models with Data-Parallel, Expert-Parallel + # and Batched All-to-All dispatch/combine kernels, VLLM_MOE_DP_CHUNK_SIZE + # dictates the quantum of tokens that can be dispatched from a DP + # rank. All DP ranks process the activations in VLLM_MOE_DP_CHUNK_SIZE + # units. + "VLLM_MOE_DP_CHUNK_SIZE": lambda: int(os.getenv("VLLM_MOE_DP_CHUNK_SIZE", "256")), + # Randomize inputs during dummy runs when using Data Parallel + "VLLM_RANDOMIZE_DP_DUMMY_INPUTS": lambda: os.environ.get( + "VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0" + ) + == "1", + # Strategy to pack the data parallel ranks for Ray. + # Available options: + # - "fill": + # for DP master node, allocate exactly data-parallel-size-local DP ranks, + # for non-master nodes, allocate as many DP ranks as can fit; + # - "strict": + # allocate exactly data-parallel-size-local DP ranks to each picked node; + # - "span": + # Should be used only when a single DP rank requires multiple nodes. + # allocate one DP rank over as many nodes as required for set world_size; + # This environment variable is ignored if data-parallel-backend is not Ray. + "VLLM_RAY_DP_PACK_STRATEGY": lambda: os.getenv( + "VLLM_RAY_DP_PACK_STRATEGY", "strict" + ), + # Whether to use S3 path for model loading in CI via RunAI Streamer + "VLLM_CI_USE_S3": lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1", + # Use model_redirect to redirect the model name to a local folder. 
+ # `model_redirect` can be a json file mapping the model between + # repo_id and local folder: + # {"meta-llama/Llama-3.2-1B": "/tmp/Llama-3.2-1B"} + # or a space separated values table file: + # meta-llama/Llama-3.2-1B /tmp/Llama-3.2-1B + "VLLM_MODEL_REDIRECT_PATH": lambda: os.environ.get( + "VLLM_MODEL_REDIRECT_PATH", None + ), + # Whether to use atomicAdd reduce in gptq/awq marlin kernel. + "VLLM_MARLIN_USE_ATOMIC_ADD": lambda: os.environ.get( + "VLLM_MARLIN_USE_ATOMIC_ADD", "0" + ) + == "1", + # Whether to use marlin kernel in mxfp4 quantization method + "VLLM_MXFP4_USE_MARLIN": lambda: maybe_convert_bool( + os.environ.get("VLLM_MXFP4_USE_MARLIN", None) + ), + # Whether to turn on the outlines cache for V1 + # This cache is unbounded and on disk, so it's not safe to use in + # an environment with potentially malicious users. + "VLLM_V1_USE_OUTLINES_CACHE": lambda: os.environ.get( + "VLLM_V1_USE_OUTLINES_CACHE", "0" + ) + == "1", + # Gap between padding buckets for the forward pass. So we have + # 8, we will run forward pass with [16, 24, 32, ...]. + "VLLM_TPU_BUCKET_PADDING_GAP": lambda: int( + os.environ["VLLM_TPU_BUCKET_PADDING_GAP"] + ) + if "VLLM_TPU_BUCKET_PADDING_GAP" in os.environ + else 0, + "VLLM_TPU_MOST_MODEL_LEN": lambda: maybe_convert_int( + os.environ.get("VLLM_TPU_MOST_MODEL_LEN", None) + ), + # Whether using Pathways + "VLLM_TPU_USING_PATHWAYS": lambda: bool( + "proxy" in os.getenv("JAX_PLATFORMS", "").lower() + ), + # Allow use of DeepGemm kernels for fused moe ops. + "VLLM_USE_DEEP_GEMM": lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "1"))), + # Allow use of DeepGemm specifically for MoE fused ops (overrides only MoE). + "VLLM_MOE_USE_DEEP_GEMM": lambda: bool( + int(os.getenv("VLLM_MOE_USE_DEEP_GEMM", "1")) + ), + # Whether to use E8M0 scaling when DeepGEMM is used on Blackwell GPUs. + "VLLM_USE_DEEP_GEMM_E8M0": lambda: bool( + int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0", "1")) + ), + # DeepGemm JITs the kernels on-demand. 
The warmup attempts to make DeepGemm + # JIT all the required kernels before model execution so there is no + # JIT'ing in the hot-path. However, this warmup increases the engine + # startup time by a couple of minutes. + # Available options: + # - "skip" : Skip warmup. + # - "full" : Warmup deepgemm by running all possible gemm shapes the + # engine could encounter. + # - "relax" : Select gemm shapes to run based on some heuristics. The + # heuristic aims to have the same effect as running all possible gemm + # shapes, but provides no guarantees. + "VLLM_DEEP_GEMM_WARMUP": env_with_choices( + "VLLM_DEEP_GEMM_WARMUP", + "relax", + [ + "skip", + "full", + "relax", + ], + ), + # Whether to use fused grouped_topk used for MoE expert selection. + "VLLM_USE_FUSED_MOE_GROUPED_TOPK": lambda: bool( + int(os.getenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "1")) + ), + # Allow use of FlashInfer MoE kernels for fused moe ops. + "VLLM_USE_FLASHINFER_MOE_FP16": lambda: bool( + int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP16", "0")) + ), + # Allow use of FlashInfer MoE kernels for fused moe ops. + "VLLM_USE_FLASHINFER_MOE_FP8": lambda: bool( + int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0")) + ), + # Allow use of FlashInfer CUTLASS kernels for fused moe ops. + "VLLM_USE_FLASHINFER_MOE_FP4": lambda: bool( + int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0")) + ), + # If set to 1, use the FlashInfer + # MXFP8 (activation) x MXFP4 (weight) MoE backend. + "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8": lambda: bool( + int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0")) + ), + # If set to 1, use the FlashInfer CUTLASS backend for + # MXFP8 (activation) x MXFP4 (weight) MoE. + # This is separate from the TRTLLMGEN path controlled by + # VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8. + "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS": lambda: bool( + int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "0")) + ), + # If set to 1, use the FlashInfer + # BF16 (activation) x MXFP4 (weight) MoE backend. 
+ "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16": lambda: bool( + int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0")) + ), + # Control the cache sized used by the xgrammar compiler. The default + # of 512 MB should be enough for roughly 1000 JSON schemas. + # It can be changed with this variable if needed for some reason. + "VLLM_XGRAMMAR_CACHE_MB": lambda: int(os.getenv("VLLM_XGRAMMAR_CACHE_MB", "512")), + # Control the threshold for msgspec to use 'zero copy' for + # serialization/deserialization of tensors. Tensors below + # this limit will be encoded into the msgpack buffer, and + # tensors above will instead be sent via a separate message. + # While the sending side still actually copies the tensor + # in all cases, on the receiving side, tensors above this + # limit will actually be zero-copy decoded. + "VLLM_MSGPACK_ZERO_COPY_THRESHOLD": lambda: int( + os.getenv("VLLM_MSGPACK_ZERO_COPY_THRESHOLD", "256") + ), + # If set, allow insecure serialization using pickle. + # This is useful for environments where it is deemed safe to use the + # insecure method and it is needed for some reason. + "VLLM_ALLOW_INSECURE_SERIALIZATION": lambda: bool( + int(os.getenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "0")) + ), + # IP address used for NIXL handshake between remote agents. + "VLLM_NIXL_SIDE_CHANNEL_HOST": lambda: os.getenv( + "VLLM_NIXL_SIDE_CHANNEL_HOST", "localhost" + ), + # Port used for NIXL handshake between remote agents. 
+ "VLLM_NIXL_SIDE_CHANNEL_PORT": lambda: int( + os.getenv("VLLM_NIXL_SIDE_CHANNEL_PORT", "5600") + ), + # all2all backend for vllm's expert parallel communication + # Available options: + # - "naive": naive all2all implementation using broadcasts + # - "allgather_reducescatter": all2all implementation based on allgather and + # reducescatter + # - "pplx": use pplx kernels + # - "deepep_high_throughput", use deepep high-throughput kernels + # - "deepep_low_latency", use deepep low-latency kernels + # - "flashinfer_all2allv", use flashinfer alltoallv kernels for mnnvl + "VLLM_ALL2ALL_BACKEND": env_with_choices( + "VLLM_ALL2ALL_BACKEND", + "allgather_reducescatter", + [ + "naive", + "pplx", + "deepep_high_throughput", + "deepep_low_latency", + "allgather_reducescatter", + "flashinfer_all2allv", + ], + ), + # Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support. + # Both require compute capability 10.0 or above. + # Available options: + # - "throughput": [default] + # Uses CUTLASS kernels optimized for high-throughput batch inference. + # - "latency": + # Uses TensorRT-LLM kernels optimized for low-latency inference. + "VLLM_FLASHINFER_MOE_BACKEND": env_with_choices( + "VLLM_FLASHINFER_MOE_BACKEND", "latency", ["throughput", "latency"] + ), + # Control the workspace buffer size for the FlashInfer backend. + "VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE": lambda: int( + os.getenv("VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE", str(394 * 1024 * 1024)) + ), + # Control the maximum number of tokens per expert supported by the + # NVFP4 MoE CUTLASS Kernel. This value is used to create a buffer for + # the blockscale tensor of activations NVFP4 Quantization. + # This is used to prevent the kernel from running out of memory. + "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE": lambda: int( + os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840") + ), + # Specifies the thresholds of the communicated tensor sizes under which + # vllm should use flashinfer fused allreduce. 
The variable should be a + # JSON with the following format: + # { : } + # Unspecified world sizes will fall back to + # { 2: 64, 4: 1, : 0.5 } + "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB": lambda: json.loads( + os.getenv("VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB", "{}") + ), + # MoE routing strategy selector. + # See `RoutingSimulator.get_available_strategies()` # for available + # strategies. + # Cutstom routing strategies can be registered by + # RoutingSimulator.register_strategy() + # Note: custom strategies may not produce correct model outputs + "VLLM_MOE_ROUTING_SIMULATION_STRATEGY": lambda: os.environ.get( + "VLLM_MOE_ROUTING_SIMULATION_STRATEGY", "" + ).lower(), + # Regex timeout for use by the vLLM tool parsing plugins. + "VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS": lambda: int( + os.getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1") + ), + # Reduce CPU usage when vLLM is idle. Enabling this will incur small + # latency penalty when a request eventually comes. + "VLLM_SLEEP_WHEN_IDLE": lambda: bool(int(os.getenv("VLLM_SLEEP_WHEN_IDLE", "0"))), + # Control the max chunk bytes (in MB) for the rpc message queue. + # Object larger than this threshold will be broadcast to worker + # processes via zmq. + "VLLM_MQ_MAX_CHUNK_BYTES_MB": lambda: int( + os.getenv("VLLM_MQ_MAX_CHUNK_BYTES_MB", "16") + ), + # Timeout in seconds for execute_model RPC calls in multiprocessing + # executor (only applies when TP > 1). + "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": lambda: int( + os.getenv("VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS", "300") + ), + # KV Cache layout used throughout vllm. + # Some common values are: + # - NHD + # - HND + # Where N=num_blocks, H=num_heads and D=head_size. The default value will + # leave the layout choice to the backend. Mind that backends may only + # implement and support a subset of all possible layouts. 
+ "VLLM_KV_CACHE_LAYOUT": env_with_choices( + "VLLM_KV_CACHE_LAYOUT", None, ["NHD", "HND"] + ), + # Enable checking whether the generated logits contain NaNs, + # indicating corrupted output. Useful for debugging low level bugs + # or bad hardware but it may add compute overhead. + "VLLM_COMPUTE_NANS_IN_LOGITS": lambda: bool( + int(os.getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0")) + ), + # Controls whether or not emulations are used for NVFP4 + # generations on machines < 100 for compressed-tensors + # models + "VLLM_USE_NVFP4_CT_EMULATIONS": lambda: bool( + int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0")) + ), + # Time (in seconds) after which the KV cache on the producer side is + # automatically cleared if no READ notification is received from the + # consumer. This is only applicable when using NixlConnector in a + # disaggregated decode-prefill setup. + "VLLM_NIXL_ABORT_REQUEST_TIMEOUT": lambda: int( + os.getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "480") + ), + # Controls whether or not to use cudnn prefill + "VLLM_USE_CUDNN_PREFILL": lambda: bool( + int(os.getenv("VLLM_USE_CUDNN_PREFILL", "0")) + ), + # Controls whether to use TRT-LLM ragged DeepSeek prefill + "VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL": lambda: bool( + int(os.getenv("VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL", "0")) + ), + # If set to 1/True, use the TRTLLM attention backend in flashinfer. + # If set to 0/False, use the default attention backend in flashinfer. + # If not set, auto-detect the attention backend in flashinfer. + "VLLM_USE_TRTLLM_ATTENTION": lambda: ( + None + if "VLLM_USE_TRTLLM_ATTENTION" not in os.environ + else os.environ["VLLM_USE_TRTLLM_ATTENTION"].lower() in ("1", "true") + ), + # If set to 1, when we use fp8 kv, we do not quantize Q to fp8 + "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION": lambda: bool( + int(os.getenv("VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION", "0")) + ), + # If set, it means we pre-downloaded cubin files and flashinfer will + # read the cubin files directly. 
+ "VLLM_HAS_FLASHINFER_CUBIN": lambda: bool( + int(os.getenv("VLLM_HAS_FLASHINFER_CUBIN", "0")) + ), + # Supported options: + # - "flashinfer-cudnn": use flashinfer cudnn GEMM backend + # - "flashinfer-trtllm": use flashinfer trtllm GEMM backend + # - "flashinfer-cutlass": use flashinfer cutlass GEMM backend + # - : automatically pick an available backend + "VLLM_NVFP4_GEMM_BACKEND": env_with_choices( + "VLLM_NVFP4_GEMM_BACKEND", + None, + ["flashinfer-cudnn", "flashinfer-trtllm", "flashinfer-cutlass", "cutlass"], + ), + # Controls garbage collection during CUDA graph capture. + # If set to 0 (default), enables GC freezing to speed up capture time. + # If set to 1, allows GC to run during capture. + "VLLM_ENABLE_CUDAGRAPH_GC": lambda: bool( + int(os.getenv("VLLM_ENABLE_CUDAGRAPH_GC", "0")) + ), + # Used to force set up loopback IP + "VLLM_LOOPBACK_IP": lambda: os.getenv("VLLM_LOOPBACK_IP", ""), + # Used to set the process name prefix for vLLM processes. + # This is useful for debugging and monitoring purposes. + # The default value is "VLLM". + "VLLM_PROCESS_NAME_PREFIX": lambda: os.getenv("VLLM_PROCESS_NAME_PREFIX", "VLLM"), + # Allow chunked local attention with hybrid kv cache manager. + # Currently using the Hybrid KV cache manager with chunked local attention + # in the Llama4 models (the only models currently using chunked local attn) + # causes a latency regression. For this reason, we disable it by default. + # This flag is used to allow users to enable it if they want to (to save on + # kv-cache memory usage and enable longer contexts) + # TODO(lucas): Remove this flag once latency regression is resolved. + "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE": lambda: bool( + int(os.getenv("VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0")) + ), + # Enables support for the "store" option in the OpenAI Responses API. + # When set to 1, vLLM's OpenAI server will retain the input and output + # messages for those requests in memory. 
By default, this is disabled (0), + # and the "store" option is ignored. + # NOTE/WARNING: + # 1. Messages are kept in memory only (not persisted to disk) and will be + # lost when the vLLM server shuts down. + # 2. Enabling this option will cause a memory leak, as stored messages are + # never removed from memory until the server terminates. + "VLLM_ENABLE_RESPONSES_API_STORE": lambda: bool( + int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0")) + ), + # If set, use the fp8 mfma in rocm paged attention. + "VLLM_ROCM_FP8_MFMA_PAGE_ATTN": lambda: bool( + int(os.getenv("VLLM_ROCM_FP8_MFMA_PAGE_ATTN", "0")) + ), + # Whether to use pytorch symmetric memory for allreduce + "VLLM_ALLREDUCE_USE_SYMM_MEM": lambda: bool( + int(os.getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "1")) + ), + # Allows vllm to find tuned config under customized folder + "VLLM_TUNED_CONFIG_FOLDER": lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None), + # Valid values are container,code_interpreter,web_search_preview + # ex VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS=container,code_interpreter + # If the server_label of your mcp tool is not in this list it will + # be completely ignored. 
+ "VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS": env_set_with_choices( + "VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS", + default=[], + choices=["container", "code_interpreter", "web_search_preview"], + ), + # Allows harmony instructions to be injected on system messages + "VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS": lambda: bool( + int(os.getenv("VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS", "0")) + ), + # Enable automatic retry when tool call JSON parsing fails + # If enabled, returns an error message to the model to retry + # If disabled (default), raises an exception and fails the request + "VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY": lambda: bool( + int(os.getenv("VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY", "0")) + ), + # Add optional custom scopes for profiling, disable to avoid overheads + "VLLM_CUSTOM_SCOPES_FOR_PROFILING": lambda: bool( + int(os.getenv("VLLM_CUSTOM_SCOPES_FOR_PROFILING", "0")) + ), + # Add optional nvtx scopes for profiling, disable to avoid overheads + "VLLM_NVTX_SCOPES_FOR_PROFILING": lambda: bool( + int(os.getenv("VLLM_NVTX_SCOPES_FOR_PROFILING", "0")) + ), + # Represent block hashes in KV cache events as 64-bit integers instead of + # raw bytes. Defaults to True for backward compatibility. + "VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES": lambda: bool( + int(os.getenv("VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES", "1")) + ), + # Name of the shared memory buffer used for object storage. + # Only effective when mm_config.mm_processor_cache_type == "shm". + "VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME": lambda: os.getenv( + "VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME", "VLLM_OBJECT_STORAGE_SHM_BUFFER" + ), + # The size in MB of the buffers (NVL and RDMA) used by DeepEP + "VLLM_DEEPEP_BUFFER_SIZE_MB": lambda: int( + os.getenv("VLLM_DEEPEP_BUFFER_SIZE_MB", "1024") + ), + # Force DeepEP to use intranode kernel for inter-node communication in + # high throughput mode. This is useful archive higher prefill throuhgput + # on system supports multi-node nvlink (e.g GB200). 
+ "VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE": lambda: bool( + int(os.getenv("VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE", "0")) + ), + # Allow DeepEP to use MNNVL (multi-node nvlink) for internode_ll kernel, + # turn this for better latency on GB200 like system + "VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL": lambda: bool( + int(os.getenv("VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL", "0")) + ), + # The number of SMs to allocate for communication kernels when running DBO + # the rest of the SMs on the device will be allocated to compute + "VLLM_DBO_COMM_SMS": lambda: int(os.getenv("VLLM_DBO_COMM_SMS", "20")), + # Enable max_autotune & coordinate_descent_tuning in inductor_config + # to compile static shapes passed from compile_sizes in compilation_config + # If set to 1, enable max_autotune; By default, this is enabled (1) + "VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE": lambda: bool( + int(os.getenv("VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE", "1")) + ), + # If set to 1, enable coordinate_descent_tuning; + # By default, this is enabled (1) + "VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING": lambda: bool( + int(os.getenv("VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING", "1")) + ), + # Flag to enable NCCL symmetric memory allocation and registration + "VLLM_USE_NCCL_SYMM_MEM": lambda: bool( + int(os.getenv("VLLM_USE_NCCL_SYMM_MEM", "0")) + ), + # NCCL header path + "VLLM_NCCL_INCLUDE_PATH": lambda: os.environ.get("VLLM_NCCL_INCLUDE_PATH", None), + # Flag to enable FBGemm kernels on model execution + "VLLM_USE_FBGEMM": lambda: bool(int(os.getenv("VLLM_USE_FBGEMM", "0"))), + # GC debug config + # - VLLM_GC_DEBUG=0: disable GC debugger + # - VLLM_GC_DEBUG=1: enable GC debugger with gc.collect elpased times + # - VLLM_GC_DEBUG='{"top_objects":5}': enable GC debugger with + # top 5 collected objects + "VLLM_GC_DEBUG": lambda: os.getenv("VLLM_GC_DEBUG", ""), + # Disables parallel execution of shared_experts via separate cuda stream + "VLLM_DISABLE_SHARED_EXPERTS_STREAM": lambda: bool( + 
int(os.getenv("VLLM_DISABLE_SHARED_EXPERTS_STREAM", "1")) + ), + # Limits when we run shared_experts in a separate stream. + # We found out that for large batch sizes, the separate stream + # execution is not beneficial (most likely because of the input clone) + # TODO(alexm-redhat): Tune to be more dynamic based on GPU type + "VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD": lambda: int( + int(os.getenv("VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD", 256)) + ), + # Format for saving torch.compile cache artifacts + # - "binary": saves as binary file + # Safe for multiple vllm serve processes accessing the same torch compile cache. + # - "unpacked": saves as directory structure (for inspection/debugging) + # NOT multiprocess safe - race conditions may occur with multiple processes. + # Allows viewing and setting breakpoints in Inductor's code output files. + "VLLM_COMPILE_CACHE_SAVE_FORMAT": env_with_choices( + "VLLM_COMPILE_CACHE_SAVE_FORMAT", "binary", ["binary", "unpacked"] + ), + # Flag to enable FlatLogprobs whose GC overhead is significantly smaller than + # the original list[dict[int, Logprob]] approach. + # After enabled, PromptLogprobs and SampleLogprobs would populated as + # FlatLogprobs. + "VLLM_FLAT_LOGPROBS": lambda: bool(int(os.getenv("VLLM_FLAT_LOGPROBS", "0"))), + + # vLLM do not support W4A8 and W8A8, we add it. For MOE, we default use W8A8, If set to true, we use W4A8. + + "VLLM_W8A8_MOE_USE_W4A8": + lambda: os.environ.get("VLLM_W8A8_MOE_USE_W4A8", "0").lower() in + ("1", "true"), + + # If set to true, we use int8 mla attention for decode stage. + + "VLLM_USE_INT8_MLA": + lambda: os.environ.get("VLLM_USE_INT8_MLA", "0").lower() in + ("1", "true"), + + # If set to true, we use int8 MHA attention for decode stage. + + "VLLM_ATTN_OPT_LEVEL": + lambda: os.environ.get("VLLM_ATTN_OPT_LEVEL", "0").lower() in + ("1", "true"), + + # For W4A8 MOE, we default use TN gemm format, choices: [TN, NN]. 
def __getattr__(name: str):
    """
    Resolve a module attribute by evaluating its registered getter.

    `name` is looked up in `environment_variables`; the stored zero-argument
    callable is invoked and its result returned, so the value is computed at
    access time rather than at import time.

    NOTE: `enable_envs_cache()` rebinds this function to a
    `functools.cache`-wrapped version. From that point on, each name is
    evaluated at most once and later changes to the process environment are
    not observed.

    Raises:
        AttributeError: if `name` is not a key of `environment_variables`.
    """
    # Guard clause first: unknown names must surface as AttributeError so
    # that normal attribute-lookup semantics (hasattr, getattr default) work.
    if name not in environment_variables:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    getter = environment_variables[name]
    return getter()
def compute_hash() -> str:
    """
    Return an MD5 hex digest summarising the env vars that shape the graph.

    WARNING: Whenever a new key is added to this environment
    variables, ensure that it is included in the factors list if
    it affects the computation graph. For example, different values
    of VLLM_PP_LAYER_PARTITION will generate different computation
    graphs, so it is included in the factors list. The env vars that
    affect the choice of different kernels or attention backends should
    also be included in the factors list.

    The digest is built from, in order:
      1. the evaluated value of every name in `environment_variables_to_hash`
         (obtained by calling its getter in `environment_variables`), and
      2. the raw `os.getenv` value of each `RAY_EXPERIMENTAL_NOSET_*` variable
         listed below.

    MD5 is used with `usedforsecurity=False`; the digest is an identifier,
    not a security primitive.
    """

    # The values of envs may affects the computation graph.
    # TODO(DefTruth): hash all environment variables?
    # for key in environment_variables:
    #     factorize(key)
    environment_variables_to_hash = [
        "VLLM_PP_LAYER_PARTITION",
        "VLLM_MLA_DISABLE",
        "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH",
        "VLLM_USE_TRITON_AWQ",
        "VLLM_DP_RANK",
        "VLLM_DP_SIZE",
        "VLLM_USE_STANDALONE_COMPILE",
        "VLLM_FUSED_MOE_CHUNK_SIZE",
        "VLLM_FLASHINFER_MOE_BACKEND",
        "VLLM_V1_USE_PREFILL_DECODE_ATTENTION",
        "VLLM_ATTENTION_BACKEND",
        "VLLM_USE_FLASHINFER_SAMPLER",
        "VLLM_DISABLED_KERNELS",
        "VLLM_USE_DEEP_GEMM",
        "VLLM_MOE_USE_DEEP_GEMM",
        "VLLM_USE_DEEP_GEMM_E8M0",
        "VLLM_USE_FUSED_MOE_GROUPED_TOPK",
        "VLLM_USE_FLASHINFER_MOE_FP16",
        "VLLM_USE_FLASHINFER_MOE_FP8",
        "VLLM_USE_FLASHINFER_MOE_FP4",
        "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8",
        "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS",
        "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16",
        "VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE",
        "VLLM_USE_CUDNN_PREFILL",
        "VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL",
        "VLLM_USE_TRTLLM_ATTENTION",
        "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION",
        "VLLM_ROCM_USE_AITER",
        "VLLM_ROCM_USE_AITER_PAGED_ATTN",
        "VLLM_ROCM_USE_AITER_LINEAR",
        "VLLM_ROCM_USE_AITER_MOE",
        "VLLM_ROCM_USE_AITER_RMSNORM",
        "VLLM_ROCM_USE_AITER_MLA",
        "VLLM_ROCM_USE_AITER_MHA",
        "VLLM_ROCM_USE_AITER_FP4_ASM_GEMM",
        "VLLM_ROCM_USE_AITER_TRITON_ROPE",
        "VLLM_ROCM_USE_AITER_FP8BMM",
        "VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION",
        "VLLM_ROCM_USE_AITER_TRITON_GEMM",
        "VLLM_ROCM_USE_SKINNY_GEMM",
        "VLLM_ROCM_FP8_PADDING",
        "VLLM_ROCM_MOE_PADDING",
        "VLLM_ROCM_CUSTOM_PAGED_ATTN",
        "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION",
        "VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16",
        "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB",
        "VLLM_ROCM_FP8_MFMA_PAGE_ATTN",
        "VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE",
        "VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING",
        "VLLM_NVFP4_GEMM_BACKEND",
        "VLLM_USE_FBGEMM",
        "VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE",
        "VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL",
        # Switches whose definitions in `environment_variables` describe a
        # choice of quantization scheme, GEMM format or attention
        # implementation. Per the WARNING above they must be part of the
        # hash, otherwise two runs that differ only in these settings would
        # share one cache key. Appended at the end so the relative order of
        # the pre-existing factors is untouched.
        "VLLM_W8A8_MOE_USE_W4A8",
        "VLLM_USE_INT8_MLA",
        "VLLM_ATTN_OPT_LEVEL",
        "VLLM_W4A8_FORMAT",
        "VLLM_W4A8_VERSION",
        "VLLM_MIX_QUANTIZATION_TYPE",
        "VLLM_MLA_CUSTOMIZE",
        # NOTE(review): the next four have no describing comment where they
        # are defined; judging by their names they select optimized kernel
        # paths, so they are hashed conservatively. Over-hashing only costs
        # an extra cache entry. Confirm and prune if any of them turns out
        # not to influence kernel selection.
        "VLLM_MOE_OPT_LEVEL",
        "VLLM_LINEAR_OPT_LEVEL",
        "VLLM_OPT_EXCLUDE_LAYERS",
        "VLLM_USE_LORA_FUSION",
    ]
    for key in environment_variables_to_hash:
        # if this goes out of sync with environment_variables,
        # it's not a user error, it's a bug
        assert key in environment_variables, (
            "Please update environment_variables_to_hash in envs.py"
        )

    # Evaluate every getter now so the hash reflects the effective values
    # (including defaults), not just what happens to be set in os.environ.
    factors = [environment_variables[key]() for key in environment_variables_to_hash]

    ray_noset_env_vars = [
        # Refer to
        # https://github.com/ray-project/ray/blob/c584b1ea97b00793d1def71eaf81537d70efba42/python/ray/_private/accelerators/nvidia_gpu.py#L11
        # https://github.com/ray-project/ray/blob/c584b1ea97b00793d1def71eaf81537d70efba42/python/ray/_private/accelerators/amd_gpu.py#L11
        # https://github.com/ray-project/ray/blob/b97d21dab233c2bd8ed7db749a82a1e594222b5c/python/ray/_private/accelerators/amd_gpu.py#L10
        # https://github.com/ray-project/ray/blob/c584b1ea97b00793d1def71eaf81537d70efba42/python/ray/_private/accelerators/npu.py#L12
        # https://github.com/ray-project/ray/blob/c584b1ea97b00793d1def71eaf81537d70efba42/python/ray/_private/accelerators/hpu.py#L12
        # https://github.com/ray-project/ray/blob/c584b1ea97b00793d1def71eaf81537d70efba42/python/ray/_private/accelerators/neuron.py#L14
        # https://github.com/ray-project/ray/blob/c584b1ea97b00793d1def71eaf81537d70efba42/python/ray/_private/accelerators/tpu.py#L38
        # https://github.com/ray-project/ray/blob/c584b1ea97b00793d1def71eaf81537d70efba42/python/ray/_private/accelerators/intel_gpu.py#L10
        # https://github.com/ray-project/ray/blob/c584b1ea97b00793d1def71eaf81537d70efba42/python/ray/_private/accelerators/rbln.py#L10
        "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES",
        "RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES",
        "RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES",
        "RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES",
        "RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES",
        "RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES",
        "RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS",
        "RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR",
        "RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES",
    ]
    # These are read raw (None when unset) because they are not registered in
    # `environment_variables`.
    factors.extend([os.getenv(var) for var in ray_noset_env_vars])

    hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()

    return hash_str
def _compute_sp_num_tokens(
    num_tokens_across_dp_cpu: torch.Tensor, sequence_parallel_size: int
) -> list[int]:
    """
    Ceil-divide every entry by `sequence_parallel_size`, then repeat each
    quotient `sequence_parallel_size` times and return the result as a list.
    """
    per_sp_rank = (
        num_tokens_across_dp_cpu + sequence_parallel_size - 1
    ) // sequence_parallel_size
    return per_sp_rank.repeat_interleave(sequence_parallel_size).tolist()


def _compute_chunked_local_num_tokens(
    num_tokens_across_dp_cpu: torch.Tensor,
    sequence_parallel_size: int,
    max_num_tokens: int,
    chunk_idx: int,
) -> list[int]:
    """
    Return, for chunk number `chunk_idx`, how many tokens each entry of the
    sequence-parallel token list processes, capped at `max_num_tokens`.

    Entries that have nothing left for this chunk are reported as 1 rather
    than 0 so that every rank keeps running in lockstep.
    """
    # Take into account sharding if MoE activation is sequence parallel.
    totals = _compute_sp_num_tokens(num_tokens_across_dp_cpu, sequence_parallel_size)
    # Tokens already covered by the chunks preceding `chunk_idx`.
    already_done = max_num_tokens * chunk_idx
    # max(1, ...) maps any non-positive remainder to 1: ensure lockstep even
    # if done.
    return [max(1, min(max_num_tokens, total - already_done)) for total in totals]


@dataclass
class DPMetadata:
    """
    Token-count bookkeeping shared across data-parallel ranks for one forward
    pass.
    """

    # Maximum of `num_tokens_across_dp_cpu` (see `make`).
    max_tokens_across_dp_cpu: torch.Tensor
    # Token counts indexed by data-parallel rank.
    num_tokens_across_dp_cpu: torch.Tensor

    # NOTE: local_sizes should only be set by the chunked_sizes context manager
    local_sizes: list[int] | None = None

    @staticmethod
    def make(
        parallel_config: ParallelConfig,
        num_tokens: int,
        num_tokens_across_dp_cpu: torch.Tensor,
    ) -> "DPMetadata":
        """
        Build a `DPMetadata` from already-gathered per-rank token counts.

        Asserts that data parallelism is actually in use
        (`data_parallel_size > 1`) and that the entry for this rank equals
        `num_tokens`.
        """
        assert num_tokens_across_dp_cpu is not None
        assert parallel_config.data_parallel_size > 1
        dp_rank = parallel_config.data_parallel_rank
        batchsize = num_tokens

        # The caller supplies the per-rank counts; the slot belonging to this
        # rank has to agree with the local batch size.
        assert num_tokens_across_dp_cpu[dp_rank] == batchsize, (
            f"{num_tokens_across_dp_cpu[dp_rank]} {batchsize}"
        )
        return DPMetadata(
            torch.max(num_tokens_across_dp_cpu), num_tokens_across_dp_cpu
        )

    @contextmanager
    def chunked_sizes(
        self, sequence_parallel_size: int, max_chunk_size_per_rank: int, chunk_idx: int
    ):
        """
        Temporarily publish the per-rank token counts for one chunk of a
        chunked forward execution.

        The tokens of each rank are processed in chunks of at most
        `max_chunk_size_per_rank`. Inside the `with` body `self.local_sizes`
        (also the yielded value) holds the number of tokens each rank handles
        for chunk `chunk_idx`; ranks that are already finished still report a
        size of 1 so that all DP ranks advance in lockstep even when their
        token counts are uneven. On exit `self.local_sizes` is reset to
        `None`, so it is only valid inside the context.

        Args:
            sequence_parallel_size: When Attn is TP and MoE layers are EP,
                                    we use SP between the layers to avoid
                                    redundant ops. We need this value to
                                    compute the chunked sizes.
            max_chunk_size_per_rank: The max number of tokens each rank is
                                     allowed to process in this chunk.
            chunk_idx: The index of the chunk to compute sizes for.
        """
        sizes = _compute_chunked_local_num_tokens(
            self.num_tokens_across_dp_cpu,
            sequence_parallel_size,
            max_chunk_size_per_rank,
            chunk_idx,
        )
        self.local_sizes = sizes
        try:
            yield self.local_sizes
        finally:
            # Always clear, even if the body raised.
            self.local_sizes = None

    @contextmanager
    def sp_local_sizes(self, sequence_parallel_size: int):
        """
        Context manager for setting self.local_sizes. Same as
        self.chunked_sizes but without any chunking.
        """
        sizes = _compute_sp_num_tokens(
            self.num_tokens_across_dp_cpu, sequence_parallel_size
        )
        self.local_sizes = sizes
        try:
            yield self.local_sizes
        finally:
            # Always clear, even if the body raised.
            self.local_sizes = None

    def get_chunk_sizes_across_dp_rank(self) -> list[int] | None:
        """
        Return the sizes currently published by `chunked_sizes` /
        `sp_local_sizes`; asserts that one of those contexts is active.
        """
        sizes = self.local_sizes
        assert sizes is not None
        return sizes

    # Get the cumulative tokens across sequence parallel ranks.
    # In this case the input to the MoEs will be distributed w.r.t both
    # DP and TP rank.
    # When sp_size==1, this is just the cumulative num tokens across DP.
    def cu_tokens_across_sp(self, sp_size: int) -> torch.Tensor:
        """
        Return the running sum of the ceil-divided, `sp_size`-times-repeated
        per-rank token counts.
        """
        per_sp_rank = (self.num_tokens_across_dp_cpu - 1 + sp_size) // sp_size
        expanded = per_sp_rank.repeat_interleave(sp_size)
        return torch.cumsum(expanded, dim=0)
@contextmanager
def set_forward_context(
    attn_metadata: Any,
    vllm_config: VllmConfig,
    virtual_engine: int = 0,
    num_tokens: int | None = None,
    num_tokens_across_dp: torch.Tensor | None = None,
    cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
    batch_descriptor: BatchDescriptor | None = None,
    ubatch_slices: UBatchSlices | None = None,
):
    """Install a `ForwardContext` for the duration of one model forward pass.

    Steps performed before yielding:
      * optionally start a timer (when batch-size tracking is enabled and
        `attn_metadata` is given);
      * when data parallelism is active, build `DPMetadata`, gathering the
        per-rank token counts first if the caller did not pass them;
      * when a cudagraph mode is selected and `num_tokens` is known, default
        `batch_descriptor` to `BatchDescriptor(num_tokens=num_tokens)`;
      * create the context and make it current via `override_forward_context`.

    On exit (normal or exceptional) the previous context is restored and, if
    tracking is on, the elapsed time is recorded and periodically logged.
    """
    global forward_start_time, last_logging_time, batchsize_logging_interval

    parallel_config = vllm_config.parallel_config

    need_to_track_batchsize = track_batchsize and attn_metadata is not None
    if need_to_track_batchsize:
        forward_start_time = time.perf_counter()

    dp_metadata: DPMetadata | None = None
    has_batch_info = attn_metadata is not None or num_tokens is not None
    if parallel_config.data_parallel_size > 1 and has_batch_info:
        if num_tokens_across_dp is None:
            # Per-rank counts were not supplied, so gather them here. Both DP
            # padding and microbatching are disabled on this path.
            assert ubatch_slices is None
            assert num_tokens is not None
            _, num_tokens_across_dp = coordinate_batch_across_dp(
                num_tokens_unpadded=num_tokens,
                parallel_config=parallel_config,
                allow_microbatching=False,
                allow_dp_padding=False,
            )
            assert num_tokens_across_dp is not None
        dp_metadata = DPMetadata.make(
            parallel_config, num_tokens or 0, num_tokens_across_dp
        )

    # Convenience: with cudagraphs in use and a known token count, synthesize
    # a batch descriptor when none was passed. This is harmless: if it does
    # not match in the wrapper, dispatch simply falls through.
    if (
        cudagraph_runtime_mode != CUDAGraphMode.NONE
        and num_tokens is not None
        and not batch_descriptor
    ):
        batch_descriptor = BatchDescriptor(num_tokens=num_tokens)

    forward_context = create_forward_context(
        attn_metadata,
        vllm_config,
        virtual_engine,
        dp_metadata,
        cudagraph_runtime_mode,
        batch_descriptor,
        ubatch_slices,
    )

    try:
        with override_forward_context(forward_context):
            yield
    finally:
        # NOTE: no early `return` in this block -- returning from `finally`
        # would swallow an exception raised by the forward pass.
        if need_to_track_batchsize:
            batchsize = num_tokens
            # we use synchronous scheduling right now,
            # adding a sync point here should not affect
            # scheduling of the next batch
            # Imported here rather than at module import time.
            from vllm.platforms import current_platform

            synchronize = current_platform.synchronize
            if synchronize is not None:
                synchronize()
            now = time.perf_counter()
            # time measurement is in milliseconds
            elapsed_ms = (now - forward_start_time) * 1000
            batchsize_forward_time[batchsize].append(elapsed_ms)
            if now - last_logging_time > batchsize_logging_interval:
                last_logging_time = now
                # One (batchsize, sample count, median ms) row per batch size.
                # Batch sizes with a single sample are skipped: can be
                # cudagraph / profiling run.
                forward_stats = [
                    (
                        bs,
                        len(samples),
                        round(
                            torch.quantile(torch.tensor(samples), q=0.5).item(), 2
                        ),
                    )
                    for bs, samples in batchsize_forward_time.items()
                    if len(samples) > 1
                ]
                # Most frequently seen batch sizes first.
                forward_stats.sort(key=lambda row: row[1], reverse=True)
                if forward_stats:
                    logger.info(
                        (
                            "Batchsize forward time stats "
                            "(batchsize, count, median_time(ms)): %s"
                        ),
                        forward_stats,
                    )
TokenInputs, + TokensPrompt, + build_explicit_enc_dec_prompt, + embeds_inputs, + to_enc_dec_tuple_list, + token_inputs, + zip_enc_dec_prompts, +) + +__all__ = [ + "DataPrompt", + "TextPrompt", + "TokensPrompt", + "PromptType", + "SingletonPrompt", + "ExplicitEncoderDecoderPrompt", + "TokenInputs", + "EmbedsInputs", + "EmbedsPrompt", + "token_inputs", + "embeds_inputs", + "DecoderOnlyInputs", + "EncoderDecoderInputs", + "ProcessorInputs", + "SingletonInputs", + "build_explicit_enc_dec_prompt", + "to_enc_dec_tuple_list", + "zip_enc_dec_prompts", +] diff --git a/inputs/__pycache__/__init__.cpython-312.pyc b/inputs/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e0f2a45005cabae9c8b78224a1b155d84d08635 GIT binary patch literal 794 zcmZXSzmL-}6vv$({goz7n-&Qn7#WImWIWIzrt820NOZE0thTX=b$*GJIH+M`<`1}? zjlY6Ffd3#X6C2`&lbEm__f97~eBzh)eee4m=SLI;NG^;uAw)QW37$ZVW0>M8%uuxnY6+AMsD7g1qj*B&grhmX!$1R&8uA0vTDN>Y-v{AjjVjJ z%8lGsS!f(`@)t7YkH7&KVBCX z^r#F?P}$U@ZOwQ_sGWq+d`B|6BceNJM9rZH>&|#j7B>o63r-1Vm25+)`yWN~PfXmX zx<1u1Db)?4GSY}O^wpN3#!zFT(bMQ_%r#~jzQ#afq%qc*Y9ty9h1bhP>u2h2Itx>9 z+sTv~3sY}wO%vm7<}u!8DAU;3n(k%_ZP5-0*>FzCH*}>!^!F^4AX34^iL%{Od2*;v zpFg`er736IR5bEKk*6OyhqU{0DN$dCNkX2ArX9Y3BC9!lSw2v`^xsuIR`~5Xj&nVn UJMK05AAg?A$L{Ard0#320ZrH3cK`qY literal 0 HcmV?d00001 diff --git a/inputs/__pycache__/data.cpython-312.pyc b/inputs/__pycache__/data.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d7f301211b1275520569958d66eba2bd57bf423 GIT binary patch literal 8951 zcmb_BTTC2TcDK5!s~zsDEn3D}+8v`EM^@-fkfCL~Ep zhzbJsT}f9;jEX5KDy7^}cS?@RDNods@-N~@zGm{I?t;Ts9L1bS@@-uMPQ%y>EkJT=h(b<;K)o^~O< z;>>oK)6RHO7*8S26Pwoz^IA&wau|3$t+mn)+KDu@Hrk|#I2X7-JS}VC&)v}@Y-T&m zR7z%cv6+Xd@6%hz@BU{Woi3o$0d$Vi!?cS@tcG#d!?^1#;qC~m(N(fW!bbluKVc#XjkGAElQ%D0tSao zd$j@B%OIOONCn!*Ga((2=&C#)ImUt<> zXnC&DG}RMv%X2G%m`Tewu{cLBB@?P)NpaOM$0CwdF_tkuqCcHa=v3nf+HfLnS~B8} z8kPrtf2`^e(W<>YpEMJA88+D8>5t--I`Q{djZc^sz)2cUBvL)5V;7EY={RTAV;s| 
zpP&!TJ9;KHXFdvz$7g9uRi-n#qADf~D06INz>;}q%nGJ(|FIMbAf|x?th!Vx2BUFm z7#ThG*#lLdF|3+WwE6jjW?25X3glyknlvL7mNynlt0@|bS^gMOn@?gt5R3hEUQOB~ zo>)xF!~rZCrfyZoVrn{_F;z2>NgJ`4j-gqvg2s|G5U}pX_aHRS$u6|yf6fh>skymURq(BS1ATF9Up;^6tL%b zi)%Qm`j~mSBYrC|k@<|K4coxN`wa|-5|5}>w6Y+#vY3e@F)$jv4p_FT_`3;Ch`=$u z9;Jl3(HpTj1Wlw;Z^AzEq&H*J0*xgl6NYJtiL|M=A&}WjEB4TE^e{F>yV2Y6Rl(*k zHl0p$9y-R)p+Pg~*=}rJJ@quPDYb7&r?vx!R>fSPbyI5HlCEvnx35WS<6rx7^}U-? z@0N7+2eB1yK4onMdE93!A^`B&y$sEpw(=<}Xc$JZp>MUEMJeL80wbv@syRA5xle~& zVW_}`ga(>iNI>8~?I{!|X^HfVF<@2NdSx(!>1!e0L8__r81YCpGl-aL=uao71+ig{-#t& zOrCNQ;^xYXnc49HkqiL)IyOo88t5Jmv+9Mg{gy50-TbC=~$7$5^vO~)zo$marM!9el~b44qdFnCljFxNn`($Ocsgs*W7 znq~5D-rAM3U!C6cDzAyl*Si%6uXSewJ+DQ0*$%X@jE>O^q8qHWsAQ%UbC$BPyOReTX!71&tSSb$?+>u5;sDA$ z#bC9T!S~3x*Qo;K4RFH?QCw+OU7)y*VnD5APqBCPK{~m}hHN=HfN(SuY$V{yj( zqN2jq)R}a~FcWdy7VhYP`t4moA&zr^4>fHI_g%JCPSWqfzsG2zi0sYG8xWLL16H#w z57p~fNPh-p%$z`1~-|6!IYsuMLQpd0Lvtl zGH7yv8cJ&3KoQ#;KVUND_Id#29&CcgQDcA+v8whnIc8PuXOLAvKLNItVrEEH$fP)Av}D^5sy{D zid24ddlW8-Qmm5cpHk6CY-hHhK!NhclCT=(Sq#h1v53E{456~NW!R0qqu4M(mdo9^ zDIJ^Nz}Gkq4X-!-H7iGR{$`Nhmef{SeRh{GZWtYwV3DiKfs+?AT;=X$iO|ZKX36k9Wg2p}l9|Jko0?>n zxu7QJsWG5m13n@y{W|>YNi!laF+{{YkR@{k7|1>35?&Srm@;B~NWTd~-^15HSuc}9 zfK-K6XR_z7zp5NvzPjzH$$Hwh+9OZoCrdeR>vl!W@@-xX0DAE$n%D$@-pr71n%JjR z#ks|flQ-dQUS1iLI_hF%xfP84M z8lYI}V12@_o$Peg@6>3Fl9S9l_9#}ntiXKR5s3xBYLEm^Uk9?(I1XS_FM#GSBx9E1 zXY8s_w&hr^^4O+yY)k6ju52oO zr_%6Dq#%V)S}OmR!rO1bOT#J!D}(ejzxwz7UdKW6#9{G{KHBh3a>q z*>AgTQ}chr2 zNC7!doU@cwfrRX`&#Ks?XzbfwC6H&^n}jvk_S*Kgws(3oF+;J(l${WZ=-VR?;AUL?0>sSRGn$xlifvY+I<94F415-@MrWxGp@0>?cw+{&Dw}(E@?F)m z6D@cs;pd!p;q&Udq(o(FOqMy8OtMp$nlv(a;s8$B?b3TnVp@Uf94>nmh$TGc^sz{S z=Oac92eHb~7nKKzWD=*d8Xxbp;Y0&qJBT=k7W>>Zpo{`-I0e}MC%^)Xp0jozj1(z8 zO3ckcw0n>+XUm;sY?5}^f@>VKR({vW%05;)p9PpD0wo}InVDXc16 zK=$3bfqnrM5InpHsskbioY~N)0O`-soBtU)%Ou~*z4^Ov$1hL65MPY1d%u2vP3|{zyN2;uXumSk5acZ7;eEvc`xg-n=TX0RkzhDxiEU2m`p26n$--*oUhb(VMWZx~C?OnnO6 z@V`?!aW-DNvfD|WIl>8iWhYF$&C^j9Iy-f;1Lq}t9meJ=Hr?1@wq|ucE7o{ni$$3J 
zAvTzr^&eyN7uftIHb23J9b3`Qm|wsd4mH7)aNy40@(#g{;LoV{^%S6khKTX+pwGN0 z2*Q8KMCke*x&E)@`tM2LcVuLfjC?N!|Jwzr@HOVce1J47S*daBP+JymSwrp6FUX4E zT?rKk{1mPVUGQ?DK;WlvNAU3%_$f4b1O>6+Ctpv(hu1FV+WI#FxwfH~OS!h2%hfBP zO>#5uY7nISakn6^#=tKGd86@VAa`P97yJ2OdH7C{_-nSLz{&^fv44=j1O2XLc}Fe- zECiZ`Giw*tXEvr^PH&`h10w|jko-jw>dE$A%{5(J_CFcjBvQ0>FQyK;S3e zLBbt7?h2uHi~l>47ooo$tXt8Z`tu0S%S5i(^+2!Cit|7S`{(fkM%j6_Y#zig<=FZs zx%La$pU2J(oS_Y%2dt#i#->&dx~?fMYZL2e5l3Ok-KP^t;7 zc0O&(i_qW3{+jfQ-ds&r-VFdG*|BzH%~-#(_VDY0T;&2 zjpmUGF5=JnyE$gge>~?tvFpQu%YqDxG_+;I7hj(GR(yFr7aqm%M X@zv8$qbt=pzmjtwF1TTYmAL;0Y7w}* literal 0 HcmV?d00001 diff --git a/inputs/__pycache__/parse.cpython-312.pyc b/inputs/__pycache__/parse.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..019d24eee83fd983dc6b674940bc211908f4912e GIT binary patch literal 5339 zcmb^#TWlN0agV$^Qshy5Nz}`>sAojQkrLbSGq4=%kvKJErBV!(XiLz%lf#&=?42A- zkOLt|0SO5Zx+oACFi;lAhm8bxud(s_tCp}?L z(i`?BePLhHAND6Z!X3%Za3@C;E9GGiTOtq+$abYm=~lXWK%g}nA%OR^;I&?zBDX1= z(t$l_JL}YCUUt08hkGsLB_ccDC9LEsT1J%<)JgNgGUgC`xQ|M%KjH%8_>ZIlJ$ce9U#X5^u5q`C~lMxsGUF^lmm(fs6&cR zwy|}5%a+=B7mN?Hy}M!W9z|$KKT=z31Xk$|htxO^wi)*u~SXx%-jKZ4Q zs2kdtqN-_nDV15#RKsBo;QEGpF`l}fP_%TaHrOQ8raP9DTUrg5meMzrRBhXNL(G`_ zi&W88Xex%n)jzaoyz}bMEojkNV{DyhKrz3MYZ!HDLY=lwKH)yDabM>a_s*>Q38~M_ za#YaTnse(jWy_5*La0-7*2iRPee4}hbJeK_O7ooNu8(Vt9h*|4m8>@;xDEIW`GAXA zu9KMMC+&pX=32Djn%dm4yzk+G=Qe89Dgi_T^Q&t`hYysX!M9%m&0k*uPLHqa$xg3z zpN1}teXbw4PlHd}v*R~*Z8>+D`A&W3?|%B`yyiX)$$Aes)uxtQQ_DT4+F3sbS*XkM z4%`gKPW>%_bu1*#Kj0{`Q?$XgL2$^ZsszEJZ^mUM7}bJE(1Pg{BZ8ZOCwfeSk7(!y zk+`g`Et!;Has_y=!MvcVQE;iF!6>EC)nIzLMo?QQY&sCqw8_P2W>*b4(dXkSRf|G2 zF|4p8McoY^!+>E;C@I5nHJVsaW+_cmiZ)L1G7M`xrO^(=Sa4y(Aq&&f;FW}uR8iYt z@V}=gwPa>86VE)OMd@`#d*;^4sWYcJGVXXsk5)_JER|evEaL?OBZ*Xo!RAkv7&Qur?cnY@ICczD0j8cd31B` zucyx!{8PUXAM(F=B`1H@@$@4x@1N48sqz!2lskeCM~fY2bpKghI&1RDMdwhNEv-%L?w;+@ocC_%qs5{- z^pBV43tdpu4fG2eAX_gvX=_$wB!?L^gTb&3^Y6~wQAJ|r0K1-uz~QKJhi8q+=)F?bE5yDgSZX-Z0)hm2&8L{gA@A`!z8i6qnVN&;~y 
z5_xkanlMKw24TY;i9}PWv=-Ik>699YumvBGDvS|2h+w1+&H#hZfq>l?l8nxpnb*q| z)7bZv96a!$7gb9D{-2UV<{)ia>Z#}3N)CElwyu+#1d5hM0+ji^E675Pqgj54`#5Yh z4m=6=$@aY*&D1QnHIrG$G0z+lqI$#l4VM{-Beg3y1?y1^8T2>;mgSxWZ0({M%n!&` zP0uGy#1ZI!2`$%r~;mM-=tZqMBZu4j7?#&e21}Y+v`nH$z&f&6u zM#3;?gt(PivA0_bfN6IMwtTmXlt>F5V{X zN5EWmUy7Q;0R*?;r{c9iWbf+RnAsf4@>}B=+y_TDE^TVr>8;i66M1)kfgddmk8RB9 z(m;_PGovRiu*VSKwQL%~zX16*)233(Tc+h@wg{&22ClU4lP&d7b&I>r!TYNbBG((Q zItZ|so7wk-RT^)#CyY1VI8PXFRR5od^|w2iv27mgmEFU6*xZbZEKHt-Zpdm5v5R3> zD6~X6JVnC_g|g8Jpyrm5Si>}GMK;BSYc;l6E3sUL;B6d<1K1nj`g2!{?xVW>=;NOL zU-sVZ)rU_MdY;#9&u_d9pzI#b{j}gdq1#XFI3bt&KL~Gyx9tVjK*`tr;rN|#{qSVL z_iV}C`N7)O+V*6@J-%z>C5UQ#Evki{GTgCrBB8`sIX4l#7NZyjpz>JFKp>_(bj(^w zL=5kg+7aN*8D5hM0u^%Z$Aqr0mmgnJ3Oej97 zsRscTj1(&_voOP=#t@hznf}rAiXmhgLBPs2R_d@S;WW~iAECEW43Dq`{;R?w4f7hx zj()FC8TJ><()T?5Ayv=NIz)Ld8nBKz6kX?3Rk7ofC6Y6#|WZ>T(s^%Al2-=1NkaVnyV!;MAqDyb!7i z*sVwouDc}lRjklH795*X+e>-pKyFHRf~3c*0+4rmtz0O(wo9O?NKV*cyh5PSpL=l^ ITNY#f2eCwz)c^nh literal 0 HcmV?d00001 diff --git a/inputs/__pycache__/preprocess.cpython-312.pyc b/inputs/__pycache__/preprocess.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..860402055847d80023c04acc541c657dfaec52d8 GIT binary patch literal 23443 zcmdUXYiwNCo!`Cl=FE^Ye80)dhe!@Z4n<0$BubWTy)4TX?O3uK(}_nT-b-<$c`)}5 zC6S;KB{0xYnr!55cB6FDmf6h*dhKl#Y=H{fEnsYmVv9wAxum=^87N%_^3xxzyGI{vgn5nq{DO1J@4~BzyIU>&;5()YCnfx()G_0-;HzJf2RlI=h7qF z9X!WPX}8;b{%?Wv9#bW^MrYm%L*u9>b>_e^(6m=RJvGd-!^nch_2Okb*hra!fB zW?yPxW*{{HFKE_YKnF9{) z1x^gV!-=);JM^^irMgJ{FX$n{m%JC_3k!+#wTPqSd_KKYa?Qq-EYgEDe>V?2GAz#jH{aTj_dAS(~rF{^A)bjW(P2XcMo381&`L4Jn;iX5$QBS2F2XA}h(U zq!hm?Dag}cHN$#&DI3o!zo03O)Rbz`9ZZe|mz1)ZwEogc(2M&oOSiLnJix|k z^o%_y`m1V*^*WZ&<~Eq6S<#=c#Jve6Ch5J5N!lP{qBi!}f;PCOg}AIpF~#VSo<6G) zo)*^V58Hp~W>6up%XZlxn2ZNd8uI38jKa35nH4iqWR= zJm(T?kRx2hiDP0&s+H=b`Z))cHFK7wA$Ry4_e=xJjdZcjwuI5TN32KgM)Ze9)PQ!H z5N}4j5%CsQ3okbPfETF^y%d|FW~o(dk=l^jUeO!7_FAP5v#*_VL9y)}?@X82E_I_V z0X^pa=WE{Vha(hyNazrBwF2=U(ByjlA#c 
z&pfxnuW&J)|8d-k15Z?*{Z^cKG9(9X#f2xcofVfH&Q=>ajrx!S)T{lLAX!vCzsFBU zd?mNCxPYPgVn9%E%9uQe{PGY55ekMWs7Fw$!GPH?V+!$=h^ORIq~x4kfl^g0Hk$-z zj>QxY$_n(m`w!2IWm5}d3yFo%Y+Sx3Wk+u>9yxaO#t~^OnYcQ(u#}z8q{nV1lc_Nc z+s78nb%`!4$z!N#xoy{iMUAdU@m@KL;P3BppVn|yHTS%#ujR3?ZL?!2=X&gGEw=XN zT;HlGwhz!_OR@J5{q6qw(F>2eCN{e8^pUSU|LSG-cN-OPv$}jjacd>d_`Pnx!i^6O?oz7&119o*%LfzWqO-aDBO^gOBSD2BhV?R3=FY;lh2nk^p}Zurhu?|n7z>y!^6v;Bn0 zc(dn(HDyhLzitvV3g|y;5_xWEk4e8yM7HCnP3?296O$LFGifl6j4Ygb<&wZ=QAmiw zRcSW9s7P7@?QohtcHg3 zzTy8bR^aS!xB~JN=0rY@pcIszI_u>*_L}wbJg-pYMDn}mkZ6*&^!D4>OnXuBb(4mX zQ~xPVSre|Wn?P*5C2`Dv2T(Sk-VPwX-ejMC-DlNf#O^+=J!g4)*?>{*^S2HiojU5~5k1Ej1a+ z5SvXCy_VzW;>jd|<<-VGSqy^F!p$D49uLTh?fZqr8-j4DmAmR-CZ?5 zXUJY&0BAg$mE~tZG2n8<(n@t`jzkbB{krNWDzy6qozJxq>h3Yd5qtc#&;3PIs{91O zU2dxee6A1@)R0gJ9b69`d}09dmV>W7`l-{&cp)G;5Gn*Z)Idi*(7)y?3>{O4junPZ zt3#*rLoZbn8n+64+}U3Yh6}+?HP~4Q4ywVyeB}8L4}r~g`SZRuV12MFAJ|t`+)>;4 zsndlv{ao)rp=VOnJ}7CNgDm4N`pZeGkLh?kc`iCIK%A3!4}=H?_> zN@s;z@g?QB5Qi)v-7@Nk%N7>IMNtw*gaiZyY~OTNO5+)|$AL|>qvsaWv)M$(8qBRk zG6`uzfLN#J%~0pX#aUn*YZm*??2`QyDI=J(1r(<(iru*zoNmVDgp^$>n~&$ODm0$B zHi_{hgiJ*m(dR*1hpWoFna;S|mPSC5bPw!3|wi+?}cs_HB?2-npEMycV0gi$CKr1?Pn_Gm=Xc1QiZOI1J zrQ%Bhx?oTVEhc6$l_|?V3v*pky3M9(mQ6CvgFOvd!@tayIFn7>#7GrtgNA7l%Dj|> zTBs9&IUHI-`sWEz)q(bI7!ned-0`zOIcsW1E2iWq@;9KPDZUk6o9nDM?nwt|l{A#||Ah zcqBe?_3+j4i6cjk9GsL6Pfo_A+3^W!eDc`%fddmq4j(>rSc;D+@@!>stG8nW_6p1e z=1_G8!noClnR;ahJVjy5VN+t#mbtz$bQ0vmSQzGGC~~E2*w4$?7E{=lN(H&^n$#)m zc94sXNFW+QCCj{FZ56xC^Qwk@#A6tsC2>Y!4n!vEi#TO6O2`)wl$=Q^jRX>wf>90wSM#w9ChB_SWZ}s((cdClFwV4KqpQIko4xjh<6^Uso}-FCQAu`^FJ& zGI1Q@p z20K0Mx*yf=_-Xr8c8l2YqLWZ=g_mnr9CkX9wYy;ubU^%aO_#i2jUZ(CT&WJyM~u;Q zJgMQS67iN?BAq`uadsusC^e%T*}_VGwo_wSI)m{nE{MQ;gE&a(EqQLhqH#^Bq>ly$ zpN!fP%W;xrVmEFr57>|bq4SP>{}I(HR5#3UKCZ6myR$zycz^!gu8rEBoW~T93xNSO zFi;4LtAX){Z>|SU!j?2Lo;#PXhX6eE^FTu}SofWa_bxt&JP!76h8lC#+S+(U1EBy7q?mQ;d@eXdqc|Ev0#|l5kCqNxP;(v<-bmy`qY(BrRpv%c@g zP06d#rF27=bAnPzZdRn^(cVSe8c4lKg=Q!qd!zhi1PZMlYe@sFut8*!epY8^)Y+Nm 
zF;#Q}0pO>OYiunv45$qQ8x2Fbs$y4fq3ej+btHfErH!ta^S<`Y*3Lrfu-ZCYXg#L3 z9?RAIJka_eUFbTXb{%-w{o%gHUFVC9?ce*-yI=b4w{zzJp zGZgwx?pD1_I!vP{d)bea7RzjGVNlkz8W|K}P|hW?gxXM-kx%3L@+g|bEVF5)e#o^5 zN}kNs>(XqNt=M~-R?V_h2wt@e0$}qusb{%wkNsbs>2J`0LY8>UxrZ5tzxmZC_02yB z{%GzG<~Qoc^1-pqhL-Y`2({#WEuS%MAEL5yuwEgxrrgyB@hWDLk_vsr0_PM4}<H3~!o_KOCP)YPTVS~N4$7S;151b4Y7p{D!U2eHSY(IN!mmZ;hi z-Do+GdvO!2xo2GM887slQhQDnVGarRsNtSsTVJuMtdU}znbx9Ig+vhPJftd9azZ*y z9vv`klT`B($FrsRasz1|mV~v!TQZihd4HGx9?x^%YI5OR=Z~DXcnz%_C4T~EHZbtv zfJMHHI^opeh#n}p;O8<|LG4v1NGiX*AfH@5Y6DIBXku*A*mOouBs0(hl#@}jBwU_2 zr30<}Egc);P^;hn)98z<;%bDj@V>3r#og7yBlQ!ogx==T?Ag7}(}K4gQ=D z2HM8X2XCnjkz5tjhMxXUXeFOSPd-|D9DNa%)PDaq=jeesweu6OrG0deS$E>mn~z?? zlAq@fA*_YL)kWu;WLE`pwd6vY#970#2KqS(e;6OWYM4(8O!%FJE$siPSGhDp;PZ$fRvknC7sfl==7qjFXD=75L1hM}+wV47q8z zfh|jsXvwcTH^h@!Fz35e_d5t8K24r&&^oJ&pL(CzRx7z6CZ|eXJzv30a8{RCOxM@`r^bpBq@&m|78hYs5u(6prtGWD$94izESr^ zKKKSs8d?VmEfKXPveB|XSN(CQqu4h1aJq2loOZjb)aZP05c%;im9HPj2M076n`WPWXv~tRNC4bR zqOwej#%_js%iGCRP;9AI{BHAi-10EiN|&s(R4pW#Ymf4#oOGL496Uz~?uzpRmnoll z(t$Eb%w@E82WxEG9}^4FAFu3Q@vgv2^8~uF;vp$)x)j#2g6TM9%PobOv=C$B0uxRd z--MQAa!w5uT=IR?!*;lQhhn`5fECQ@#`d~Z;Gfg`-==^O4WWucss`htCe2t#zh&{FufLRUm;rP9#^}^^Al$_ zy3ghV_}7lG2)(#*RBaqBh6VPos}Sy0!@c>wqZ{F4>)~e~y;kf$Qf%!3yKRbs-8R+S zJNIdrYwFGSPHZ$Dgf84VSZLj^w(ehhd874MK5#5Ig`n6snmc#@gc=kqhYp z8No67V^l{XgF$tMTbRj!$m`6`EvzC#27^@o2-Pr>Yu>l|MYSoK3qaR@I(@Z&Zu;5^ zb3%Wi`;gjws2FYs{cX89;qW%+6l(Ip4p=KOz3I= zeb~~R=p^Q3tTfn+QO?E6kq?h85NhQBaVB+@mG!W)%_wVUtbp@5waEa|Yxv}pmG-gH zJ@%z76%{>Y^*~h9sds0MWqv{?;KEjDG`;FIDGtZs5buPz zegyc`qZ5sUr}S!!b0<3XG>)D$;ZV!>*#{?YUAY8>tnAXkl`p?V{-se(GKn%Jct@5m zquZMDX^yVs(yZs2_*L@iia^Pup|G69LLd)qI@-RP(#o@)Jst7L{gif(g5RNl6dt*Y z0;0j)Hc+N(do-P6rx~=-5I?i9z9v(Ka4~2vGeW3_^{y=Aw`ueKC0d1LzMrcbTJvnw zj*(5i_Wr=VlMlYQ9@tl`Z!W+BUq86!SzChh%0~TEJ~&mh%TWwB-CrUT5O{-|YZ$`z zY=HWtxxxhNSe1}T`dF1re=`bb{KU)DHy4%BTv_xouaj&_-Jmj0!fmSqYprVU!N=i4 zIwPtq_-T-h4 zYQQMngdKLun`t<8rfbm}nTvP)L=)%kIJ5P}%N27j?l{szBF>+0a&q}xnVb?Trft2L zXVxnNhal=ZDUEn#w$W?{%EAXeVMVx49^6#J18YO;;fasQ 
zOZ*kfOZ-zm*Vgre8;?g$s9h%>r8inH=4zg_cA6L;C% zwb6~vqxYua54ZZZ3V*#O0iJq|$G4r%<~m?hZ5?Xr>?w3yP&+Q5^LdEI;R$rPvnF>D z!py|8xoJ&^{y5ZEZ0NS)eHY2onWc2h(erT3c$-p~w;@*bPci zS^gs3L{a85i%AhD!*mxGPGFX2vIUs7M-+QQYm~t%J3-Ko6_}YhL^kLooPp*uN!&G6 zAPg}uXBhl%&~IjqmVZpKcMz1^4CA05QI1HJEr1f8m4dG=rn8BZb{R%PPNqgNk;_|V z!TCd~n}D>`XzXWQiPAU-!&lb0LSncs@M>0`7WiQ#acK(E$-L4q8m-Xsunn&6k@sVQ zRk~ULSZ&~{>+etAn|=UP9r$S|QVb8TO#+u5&aa10+MsG()01HHH4O z3;DL&S?AArpd$?on{U@HtHJSNZ~!j{U=HYoex(arUs?}DilNBECN*^ANmK8Z!_n9U zJ2-))tF{iv-PHED?ZCr#V&jr^`^yh2>|zGE~DQygjrt z-5U?Ooq14vhwzMPTZBb-ouccGmq}T6`kiSvz@g;lmipvt$ZO)P(O)1gOkak~^d(54 zcid?C3-*048!^cv>}QRbX@}N`e7{ED8u#d%d0b%j49BK-+%!8VVIr*2m&(yV0(8f( zVrfj|``keLs&=ZToyL)Gp0@9Sv+Rh@tnjTXIuh$ZtRbzPCTW-N%Vs@>T3Mz6UxuqM zoZ|66dn+T5lF1}X1|oWQ;8$oU*_L9h^^~p#lE3t9T$Yzi?|3u24)S_EBkbhYxEOB; zuLdhJ3)$6t6)wq4vo~s_Gh}G`W+5FojUJQde2R2sSs<5mI^rpJ4`yeghKoErkJBRF zv-;;UN%#kmFOo8g(-t|ADI5K;U@GcH@AU?!$giH6AB+xaqcHm?;7Exa!O87%0P71k z|2^E8t&C>jVZr$3fzhlnX?rzq78Tf>SI$mZ3ueCQ4A?z$2`3Kvl;#xc#9zXtV?kNG zsvn>UL&o^aj3wGsMCf{{*&g#KkEfRe(;wX;Eh56>=*%K5Nx~8+^y>h!luQtjrUl(Y z+*nBF(3)Xaw#;^{S*LL^+mPAWa*D9`q1AdmwR3+y&Ej)b6Px~#n^mx}0+}9dTF;oR z$FeSHiPI2D`T$j$R!1_byh;ISb)|aTd$PnZI%ANM@*mUdP72;bfa2p6f0F|91fk^B zYL97>3Tb;ND{In*CZW*Df?{Ld?KZ8qI&ql(&03Yq<2IrSUXl0E&P!Ulyx0Q{qWo{P z2P@JWJQs|v%{>kr{uqKn`;kXY8%@vUUMNDf>O7)$9w~;K^DRSaIAWS^x{ks@h=~BW zyA;Vsj;TFU`S8^J>U`*Qv1i}AHO$lKxY~34(Z!9P3-{@&t98bCXgzeOIDWV=KCO;V zLn&LGhR!xor-q~VYko-Aq2P4^xf4`kt8@m(fnAW6S=MGB&4k#I^{Ciq?ED2;&O)Txw^?id%m{Y>k7Ja1<*;p5#zCh6jAv6P8Z1OJ=nH@$?pa~5#cc5{#8Cpx7?(XB|S|+fATl~48 z?pZjYh9(}K*a$tB_dWL+0{(wSos8u0cjwQ}H%F#}9=ih@<}(O?K_X=QE|vQQn#V>k zkPr^DpLJ--%$}y<#CQ+WrYWvx_Y$|AR(tSTV5bes$4-}hE<#6kQ#;qDpZsX_SuWoQ z+PZ5_E%F(3ME(;B7}J`h*oPD_qa<-9!`4{Ikak!Y10VzA3o6Tq!8e+U;XXQ9fU%!AhJH!_W53RCn5WB2mXT2>i34ir zz>{G3yWOh;8^MT4IpJ~CLFPM`Zoabzr`t>w#Omwtf_|Dm_>qt|g1Pz(oCtS!{N(SW zylY1y9>4x=q}$}TQy@jh2bMt{Ux~4+?>29D7$CZqK~b}gvM0<7S-Q>9F% z4wWSj;-b{r;U6D=@3xk8!sn1fj&{12yJIzlB=%8tK>DK<{|J 
z^JnJ@V9Wt7ZrSd-uh&5kw0&cJ=V^&sqOSs72J#N0%~D_%j?Z+>249$BlUQm98-g1kAt|Mhp!rRspOM8 zTpSw7ozW=jqu{pr`kTzG&+hCZyty^iuAN>n7E6biOw!U=Y#TlP83lhq0Uf2v|CIt20d9+EU(lMB zlF3*M7Vin9vX2Wr$E19DfU;6BN-rK`x$>OPVp9E^$uk)KtIGCphF;-!< zU#%b5^0H?*&%j_KzT90~VXu4tYTugc zhr`<(qFZ%4p9OdC+UbY=?_JpD5ZyY%_qw|tUVhZ~lQ*|HM7PfKeeRyk_U={Z_m8b+ z)%N{cZX}>*{+itEz1D3nJx%l6$bnDXA$QYeee(n7gG@19&eqt*{?QKFvv{=wY* zy{>I9r3`VCQ=fQi-Op}@>$lwai#%Put9`4=>d^P!UK7>Mi7k2!#YJzZik=3%?$*^4 zG?dn@I=>sQ)XwpTXVlK4TUC_q-_-}b%KCj6d)@vmsu!%m-?m*#u~+#Hw|}#xf6I-( zG|HxJFU3xBuke@nPn;q5Aa2=EAGTP0a_Z5+51o&W{v`cjR-Jr_C2m$#vx&q+4YFz3 z^0r}u{4|UJVyt>fv6&z(qtQ9aO?1$hZyr>u2Dhu}`6NvYI%U*|*k(&Fb=bB>Eu&c@ zJ(kx<&+vDpDu>*idM6Rv40mt2DaEFfT1K-}JQc1a!S?yIPCSII1x8W)M04q9^|L zZ7;<-SbOd6=Z)ql%BF~=*)*o-HQQp(XsUg?nnj<}=DZzKp%DgRtoxK=TOVaCtIn^_ zIpx}+_&&?V8(a! z6z$t?GM!tOLM)GErqh@dv8Gbvj#p8QClkjiHSK)IWLHJwIm8q+PD4DW)V4FPt`yTC zh+CCR588>3=jxj#Q!I-_0@g>H!7=L4&jxj#Q2L&2R4+Xu{Q#bQd)3nEaN?oU5iUL}+ z_q6Y+X?7{!2~7+*DgP~Ak%py+2)3L&&;QcL@%?|rb^J9q{MTIH&$zz7;!gjJJNYYz zn|Ew;2!0vl_-B8{P5q2J@H1}mSFRB6_!U@<+<+XNB<7D JH+hYW{x9SS TypeIs[TokensPrompt]: + return ( + isinstance(prompt, dict) + and "prompt_token_ids" in prompt + and "prompt_embeds" not in prompt + ) + + +def is_embeds_prompt(prompt: SingletonPrompt) -> TypeIs[EmbedsPrompt]: + return ( + isinstance(prompt, dict) + and "prompt_token_ids" not in prompt + and "prompt_embeds" in prompt + ) + + +_T1_co = TypeVar( + "_T1_co", bound=SingletonPrompt, default=SingletonPrompt, covariant=True +) +_T2_co = TypeVar( + "_T2_co", bound=SingletonPrompt, default=SingletonPrompt, covariant=True +) + + +# TODO: Make fields ReadOnly once mypy supports it +class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]): + """ + Represents an encoder/decoder model input prompt, + comprising an explicit encoder prompt and a decoder prompt. 
+ + The encoder and decoder prompts, respectively, may be formatted + according to any of the + [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] schemas, + and are not required to have the same schema. + + Only the encoder prompt may have multi-modal data. mm_processor_kwargs + should be at the top-level, and should not be set in the encoder/decoder + prompts, since they are agnostic to the encoder/decoder. + + Note that an + [`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt] + may not be used as an input to a decoder-only model, + and that the `encoder_prompt` and `decoder_prompt` + fields of this data structure themselves must be + [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] instances. + """ + + encoder_prompt: _T1_co + + decoder_prompt: _T2_co | None + + mm_processor_kwargs: NotRequired[dict[str, Any]] + + +PromptType: TypeAlias = SingletonPrompt | ExplicitEncoderDecoderPrompt +""" +Set of possible schemas for an LLM input, including +both decoder-only and encoder/decoder input types: + +- A text prompt ([`str`][] or [`TextPrompt`][vllm.inputs.data.TextPrompt]) +- A tokenized prompt ([`TokensPrompt`][vllm.inputs.data.TokensPrompt]) +- An embeddings prompt ([`EmbedsPrompt`][vllm.inputs.data.EmbedsPrompt]) +- A single data structure containing both an encoder and a decoder prompt + ([`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]) +""" + + +class TokenInputs(TypedDict): + """Represents token-based inputs.""" + + type: Literal["token"] + """The type of inputs.""" + + prompt_token_ids: list[int] + """The token IDs of the prompt.""" + + cache_salt: NotRequired[str] + """ + Optional cache salt to be used for prefix caching. 
+ """ + + +def token_inputs( + prompt_token_ids: list[int], + cache_salt: str | None = None, +) -> TokenInputs: + """Construct [`TokenInputs`][vllm.inputs.data.TokenInputs] from optional + values.""" + inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids) + + if cache_salt is not None: + inputs["cache_salt"] = cache_salt + + return inputs + + +class EmbedsInputs(TypedDict): + """Represents embeddings-based inputs.""" + + type: Literal["embeds"] + """The type of inputs.""" + + prompt_embeds: torch.Tensor + """The embeddings of the prompt.""" + + cache_salt: NotRequired[str] + """ + Optional cache salt to be used for prefix caching. + """ + + +def embeds_inputs( + prompt_embeds: torch.Tensor, + cache_salt: str | None = None, +) -> EmbedsInputs: + """Construct [`EmbedsInputs`][vllm.inputs.data.EmbedsInputs] from optional + values.""" + inputs = EmbedsInputs(type="embeds", prompt_embeds=prompt_embeds) + + if cache_salt is not None: + inputs["cache_salt"] = cache_salt + + return inputs + + +DecoderOnlyInputs: TypeAlias = TokenInputs | EmbedsInputs | MultiModalInputs +""" +The inputs in [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] before they are +passed to the model executor. +This specifies the data required for decoder-only models. +""" + + +class EncoderDecoderInputs(TypedDict): + """ + The inputs in [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] before they + are passed to the model executor. + + This specifies the required data for encoder-decoder models. + """ + + encoder: TokenInputs | MultiModalInputs + """The inputs for the encoder portion.""" + + decoder: TokenInputs | MultiModalInputs + """The inputs for the decoder portion.""" + + +SingletonInputs: TypeAlias = TokenInputs | EmbedsInputs | MultiModalInputs +""" +A processed [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] which can be +passed to [`Sequence`][collections.abc.Sequence]. 
+""" + +ProcessorInputs: TypeAlias = DecoderOnlyInputs | EncoderDecoderInputs +""" +The outputs from [`vllm.inputs.preprocess.InputPreprocessor`][]. +""" + +_T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt) +_T2 = TypeVar("_T2", bound=SingletonPrompt, default=SingletonPrompt) + + +def build_explicit_enc_dec_prompt( + encoder_prompt: _T1, + decoder_prompt: _T2 | None, + mm_processor_kwargs: dict[str, Any] | None = None, +) -> ExplicitEncoderDecoderPrompt[_T1, _T2]: + if mm_processor_kwargs is None: + mm_processor_kwargs = {} + return ExplicitEncoderDecoderPrompt( + encoder_prompt=encoder_prompt, + decoder_prompt=decoder_prompt, + mm_processor_kwargs=mm_processor_kwargs, + ) + + +def zip_enc_dec_prompts( + enc_prompts: Iterable[_T1], + dec_prompts: Iterable[_T2 | None], + mm_processor_kwargs: Iterable[dict[str, Any]] | dict[str, Any] | None = None, +) -> list[ExplicitEncoderDecoderPrompt[_T1, _T2]]: + """ + Zip encoder and decoder prompts together into a list of + [`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt] + instances. + + `mm_processor_kwargs` may also be provided; if a dict is passed, the same + dictionary will be used for every encoder/decoder prompt. If an iterable is + provided, it will be zipped with the encoder/decoder prompts. 
+ """ + if mm_processor_kwargs is None: + mm_processor_kwargs = cast(dict[str, Any], {}) + if isinstance(mm_processor_kwargs, dict): + return [ + build_explicit_enc_dec_prompt( + encoder_prompt, + decoder_prompt, + cast(dict[str, Any], mm_processor_kwargs), + ) + for (encoder_prompt, decoder_prompt) in zip(enc_prompts, dec_prompts) + ] + return [ + build_explicit_enc_dec_prompt(encoder_prompt, decoder_prompt, mm_proc_kwargs) + for (encoder_prompt, decoder_prompt, mm_proc_kwargs) in zip( + enc_prompts, dec_prompts, mm_processor_kwargs + ) + ] + + +def to_enc_dec_tuple_list( + enc_dec_prompts: Iterable[ExplicitEncoderDecoderPrompt[_T1, _T2]], +) -> list[tuple[_T1, _T2 | None]]: + return [ + (enc_dec_prompt["encoder_prompt"], enc_dec_prompt["decoder_prompt"]) + for enc_dec_prompt in enc_dec_prompts + ] diff --git a/inputs/parse.py b/inputs/parse.py new file mode 100644 index 0000000..211551b --- /dev/null +++ b/inputs/parse.py @@ -0,0 +1,137 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Sequence +from typing import TYPE_CHECKING, Literal, NamedTuple, TypeAlias, TypedDict, cast + +from typing_extensions import TypeIs + +from vllm.utils.collection_utils import is_list_of + +from .data import ( + EmbedsPrompt, + ExplicitEncoderDecoderPrompt, + ProcessorInputs, + PromptType, + SingletonInputs, + SingletonPrompt, + TextPrompt, + TokensPrompt, +) + +if TYPE_CHECKING: + import torch + + +def parse_raw_prompts( + prompt: str | list[str] | list[int] | list[list[int]], +) -> Sequence[TextPrompt] | Sequence[TokensPrompt]: + if isinstance(prompt, str): + # case 1: a string + return [TextPrompt(prompt=prompt)] + + if isinstance(prompt, list): + if len(prompt) == 0: + raise ValueError("please provide at least one prompt") + + if is_list_of(prompt, str): + # case 2: array of strings + prompt = cast(list[str], prompt) + return [TextPrompt(prompt=elem) for elem in prompt] + if 
is_list_of(prompt, int): + # case 3: array of tokens + prompt = cast(list[int], prompt) + return [TokensPrompt(prompt_token_ids=prompt)] + if is_list_of(prompt, list): + prompt = cast(list[list[int]], prompt) + if len(prompt[0]) == 0: + raise ValueError("please provide at least one prompt") + + if is_list_of(prompt[0], int): + # case 4: array of token arrays + return [TokensPrompt(prompt_token_ids=elem) for elem in prompt] + + raise TypeError( + "prompt must be a string, array of strings, " + "array of tokens, or array of token arrays" + ) + + +class ParsedStrPrompt(TypedDict): + type: Literal["str"] + content: str + + +class ParsedTextPrompt(TypedDict): + type: Literal["text"] + content: TextPrompt + + +class ParsedTokensPrompt(TypedDict): + type: Literal["tokens"] + content: TokensPrompt + + +class ParsedEmbedsPrompt(TypedDict): + type: Literal["embeds"] + content: EmbedsPrompt + + +ParsedSingletonPrompt: TypeAlias = ( + ParsedStrPrompt | ParsedTextPrompt | ParsedTokensPrompt | ParsedEmbedsPrompt +) + + +def parse_singleton_prompt(prompt: SingletonPrompt) -> ParsedSingletonPrompt: + if isinstance(prompt, str): + return ParsedStrPrompt(type="str", content=prompt) + elif isinstance(prompt, dict): + # Type ignores are because mypy does not correctly infer the TypedDicts + # Pyright does succeed. 
+ if "prompt_embeds" in prompt: + return ParsedEmbedsPrompt(type="embeds", content=prompt) # type: ignore[typeddict-item] + elif "prompt_token_ids" in prompt: + return ParsedTokensPrompt(type="tokens", content=prompt) # type: ignore[typeddict-item] + elif "prompt" in prompt: + return ParsedTextPrompt(type="text", content=prompt) + raise TypeError( + "inputs must be a string, TextPrompt, TokensPrompt, or EmbedsPrompt" + ) + + +def is_explicit_encoder_decoder_prompt( + prompt: PromptType, +) -> TypeIs[ExplicitEncoderDecoderPrompt]: + return isinstance(prompt, dict) and "encoder_prompt" in prompt + + +def split_enc_dec_inputs( + inputs: ProcessorInputs, +) -> tuple[SingletonInputs | None, SingletonInputs]: + if "encoder" in inputs and "decoder" in inputs: + # NOTE: This passes pyright but not mypy + return ( + inputs["encoder"], # type: ignore[typeddict-item] + inputs["decoder"], # type: ignore[typeddict-item] + ) + + return None, inputs + + +class PromptComponents(NamedTuple): + text: str | None = None + token_ids: list[int] | None = None + embeds: "torch.Tensor | None" = None + + +def get_prompt_components(prompt: PromptType) -> PromptComponents: + if isinstance(prompt, str): + return PromptComponents(text=prompt) + + if encoder_prompt := prompt.get("encoder_prompt"): + return get_prompt_components(encoder_prompt) # type: ignore[arg-type] + + return PromptComponents( + text=prompt.get("prompt"), # type: ignore[arg-type] + token_ids=prompt.get("prompt_token_ids"), # type: ignore[arg-type] + embeds=prompt.get("prompt_embeds"), + ) diff --git a/inputs/preprocess.py b/inputs/preprocess.py new file mode 100644 index 0000000..839c138 --- /dev/null +++ b/inputs/preprocess.py @@ -0,0 +1,727 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Mapping +from typing import Any, cast + +from typing_extensions import assert_never + +from vllm.config import ModelConfig +from vllm.logger 
import init_logger +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.multimodal.cache import BaseMultiModalProcessorCache +from vllm.multimodal.inputs import ( + MultiModalDataDict, + MultiModalEncDecInputs, + MultiModalInputs, + MultiModalUUIDDict, +) +from vllm.multimodal.processing import BaseMultiModalProcessor +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils.jsontree import json_iter_leaves +from vllm.v1.metrics.stats import MultiModalCacheStats + +from .data import ( + DecoderOnlyInputs, + EmbedsInputs, + EmbedsPrompt, + EncoderDecoderInputs, + ExplicitEncoderDecoderPrompt, + ProcessorInputs, + PromptType, + SingletonInputs, + SingletonPrompt, + TextPrompt, + TokenInputs, + TokensPrompt, + embeds_inputs, + token_inputs, +) +from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt + +logger = init_logger(__name__) + + +class InputPreprocessor: + def __init__( + self, + model_config: ModelConfig, + tokenizer: AnyTokenizer | None, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, + mm_processor_cache: BaseMultiModalProcessorCache | None = None, + ) -> None: + super().__init__() + + self.model_config = model_config + self.tokenizer = tokenizer + self.mm_registry = mm_registry + self.mm_processor_cache = mm_processor_cache + + self.mm_cache_stats = MultiModalCacheStats() if mm_processor_cache else None + + def get_tokenizer(self) -> AnyTokenizer: + if self.tokenizer is None: + raise ValueError( + "You cannot pass text prompts when `skip_tokenizer_init` is True" + ) + + return self.tokenizer + + def get_bos_token_id(self) -> int | None: + if self.tokenizer is None: + logger.warning_once( + "Using None for BOS token id because tokenizer is not initialized" + ) + return None + + return self.tokenizer.bos_token_id + + def get_eos_token_id(self) -> int | None: + if self.tokenizer is None: + logger.warning_once( + "Using None for EOS token id because tokenizer is not initialized" + ) + 
return None + + return self.tokenizer.eos_token_id + + def get_decoder_start_token_id(self) -> int | None: + """ + Obtain the decoder start token id employed by an encoder/decoder + model. Returns None for non-encoder/decoder models or if the + model config is unavailable. + """ + + if not self.model_config.is_encoder_decoder: + logger.warning_once( + "Using None for decoder start token id because " + "this is not an encoder/decoder model." + ) + return None + + if self.model_config is None or self.model_config.hf_config is None: + logger.warning_once( + "Using None for decoder start token id because " + "model config is not available." + ) + return None + + dec_start_token_id = getattr( + self.model_config.hf_config, "decoder_start_token_id", None + ) + if dec_start_token_id is None: + logger.warning_once( + "Falling back on for decoder start token " + "id because decoder start token id is not " + "available." + ) + dec_start_token_id = self.get_bos_token_id() + + return dec_start_token_id + + def _get_default_enc_dec_decoder_prompt(self) -> list[int]: + """ + Specifically for encoder/decoder models: + generate a default decoder prompt for when + the user specifies only the encoder prompt. + + Encoder/decoder models utilize the decoder + prompt in different ways; as new models are + added, it is intended that this function + will be extended to produce differing + default decoder prompts, depending on the + model variety. + + Absent a special case, the default behavior + of this method is to mirror the behavior of + the HuggingFace (HF) GenerationMixin for a None + decoder prompt, which is to employ a logit processor + setting to force the first decoded token to be . + Here, this behavior is approximated by having the + "default" decoder prompt be . + + However, it is possible that in the future + other models may have different or more + complex logic for the default decoder prompt. + This motivates having a special helper method + for default decoder prompts. 
+ + Returns: + + * prompt_token_ids + """ + + bos_token_id = self.get_bos_token_id() + assert bos_token_id is not None + return [bos_token_id] + + def _prepare_decoder_input_ids_for_generation( + self, + decoder_input_ids: list[int] | None, + ) -> list[int]: + """ + Prepares `decoder_input_ids` for generation with encoder-decoder models. + + Based on: + https://github.com/huggingface/transformers/blob/4037a2b5b1278736e566aec12e169100275545ea/src/transformers/generation/utils.py + specifically, + `GenerationMixin._prepare_decoder_input_ids_for_generation()`. + + Arguments: + + * decoder_input_ids: input token ids to preprocess + + Returns: + + * Processed token list + """ + + decoder_start_token_id = self.get_decoder_start_token_id() + assert decoder_start_token_id is not None + + if decoder_input_ids is None: + # no decoder prompt input -> + # use decoder_start_token_id as decoder_input_ids + decoder_input_ids = self._get_default_enc_dec_decoder_prompt() + + if ( + len(decoder_input_ids) == 0 + or decoder_input_ids[0] != decoder_start_token_id + ): + decoder_input_ids = [decoder_start_token_id] + decoder_input_ids + + return decoder_input_ids + + def _get_tokenization_kw( + self, + overrides: dict[str, Any] | None = None, + ) -> dict[str, Any]: + kwargs = dict[str, Any]() + + if self.model_config.hf_config.model_type == "whisper": + # For Whisper, special tokens should be provided by the user based + # on the task and language of their request. Also needed to avoid + # appending an EOS token to the prompt which disrupts generation. + kwargs["add_special_tokens"] = False + + if overrides: + kwargs.update(overrides) + + return kwargs + + def _tokenize_prompt( + self, + prompt: str, + tokenization_kwargs: dict[str, Any] | None = None, + ) -> list[int]: + """ + Apply the model's tokenizer to a text prompt, returning the + corresponding token IDs. 
def _process_multimodal(
    self,
    prompt: str | list[int],
    mm_data: MultiModalDataDict,
    mm_processor_kwargs: Mapping[str, object] | None,
    tokenization_kwargs: dict[str, Any] | None = None,
    *,
    mm_uuids: MultiModalUUIDDict | None = None,
) -> MultiModalInputs:
    """
    Run the model's multi-modal processor on a multi-modal prompt and
    return its output after checking that every hash it produced is a
    string.

    Raises `ValueError` when any leaf of the returned `mm_hashes` is not
    a string.
    """
    processor = self._get_mm_processor()

    # The processor is always handed a mapping, never None.
    hf_kwargs = {} if mm_processor_kwargs is None else mm_processor_kwargs

    mm_input = processor.apply(
        prompt,
        mm_data,
        hf_processor_mm_kwargs=hf_kwargs,
        tokenization_kwargs=tokenization_kwargs,
        mm_uuids=mm_uuids,
    )

    # Validate that all mm items have a string as their hash
    mm_hashes = mm_input["mm_hashes"]
    for leaf in json_iter_leaves(mm_hashes):
        if not isinstance(leaf, str):
            raise ValueError(
                f"mm_hashes must contain only strings, got: {mm_hashes}. "
                "This is likely due to an incorrect custom implementation of "
                "MultiModalProcessor.apply method."
            )

    return mm_input
+ prompt_embeds = prompt_embeds.cpu() + + return embeds_inputs( + prompt_embeds=prompt_embeds, cache_salt=parsed_content.get("cache_salt") + ) + + def _truncate_inputs( + self, inputs: list[int], tokenization_kwargs: dict[str, Any] | None = None + ) -> list[int]: + if ( + not tokenization_kwargs + or "truncation" not in tokenization_kwargs + or self.tokenizer is None + ): + return inputs + + max_length = tokenization_kwargs["max_length"] + + if self.tokenizer.truncation_side == "left": + return inputs[-max_length:] + else: + return inputs[:max_length] + + def _process_tokens( + self, + parsed_content: TokensPrompt, + tokenization_kwargs: dict[str, Any] | None = None, + *, + mm_uuids: MultiModalUUIDDict | None = None, + ) -> TokenInputs | MultiModalInputs: + prompt_token_ids = self._truncate_inputs( + parsed_content["prompt_token_ids"], tokenization_kwargs + ) + + inputs: TokenInputs | MultiModalInputs + if multi_modal_data := parsed_content.get("multi_modal_data"): + inputs = self._process_multimodal( + prompt_token_ids, + multi_modal_data, + parsed_content.get("mm_processor_kwargs") or {}, + tokenization_kwargs=tokenization_kwargs, + mm_uuids=mm_uuids, + ) + else: + inputs = token_inputs(prompt_token_ids) + + if cache_salt := parsed_content.get("cache_salt"): + inputs["cache_salt"] = cache_salt + + return inputs + + def _process_text( + self, + parsed_content: TextPrompt, + tokenization_kwargs: dict[str, Any] | None = None, + *, + mm_uuids: MultiModalUUIDDict | None = None, + ) -> TokenInputs | MultiModalInputs: + prompt_text = parsed_content["prompt"] + + inputs: TokenInputs | MultiModalInputs + if multi_modal_data := parsed_content.get("multi_modal_data"): + inputs = self._process_multimodal( + prompt_text, + multi_modal_data, + parsed_content.get("mm_processor_kwargs") or {}, + tokenization_kwargs=tokenization_kwargs, + mm_uuids=mm_uuids, + ) + else: + prompt_token_ids = self._tokenize_prompt( + prompt_text, + tokenization_kwargs=tokenization_kwargs, + ) + 
def _prompt_to_llm_inputs(
    self,
    prompt: SingletonPrompt,
    tokenization_kwargs: dict[str, Any] | None = None,
    *,
    mm_uuids: MultiModalUUIDDict | None = None,
) -> SingletonInputs:
    """
    Extract the singleton inputs from a prompt.

    The prompt is classified by `parse_singleton_prompt` and dispatched
    to the matching `_process_*` helper. A bare string is wrapped in a
    `TextPrompt` and handled like a text prompt.

    Arguments:

    * prompt: single encoder or decoder input prompt
    * tokenization_kwargs: optional tokenizer kwargs, forwarded to the
      text and token-id paths (embedding prompts ignore them)
    * mm_uuids: optional multi-modal UUIDs, forwarded to the text and
      token-id paths

    Returns:

    * [`SingletonInputs`][vllm.inputs.data.SingletonInputs] instance
    """
    parsed = parse_singleton_prompt(prompt)

    if parsed["type"] == "embeds":
        return self._process_embeds(parsed["content"])
    if parsed["type"] == "tokens":
        # Forward the kwargs like the text paths do: `_process_tokens`
        # uses them for truncation and for the multi-modal processor, and
        # they were previously dropped here.
        return self._process_tokens(
            parsed["content"],
            tokenization_kwargs=tokenization_kwargs,
            mm_uuids=mm_uuids,
        )
    if parsed["type"] == "text":
        return self._process_text(
            parsed["content"],
            tokenization_kwargs=tokenization_kwargs,
            mm_uuids=mm_uuids,
        )
    if parsed["type"] == "str":
        return self._process_text(
            TextPrompt(prompt=parsed["content"]),
            tokenization_kwargs=tokenization_kwargs,
            mm_uuids=mm_uuids,
        )

    assert_never(parsed)
def _split_enc_dec_mm_inputs(
    self,
    inputs: SingletonInputs | MultiModalEncDecInputs,
    decoder_inputs_to_override: SingletonInputs | None = None,
) -> tuple[SingletonInputs, SingletonInputs]:
    """
    For encoder/decoder models only:
    split processed inputs into an (encoder, decoder) pair.

    * Token-only inputs: the encoder receives an empty token prompt and
      the decoder receives the override if given, else `inputs`.
    * Multi-modal inputs: the encoder receives
      `encoder_prompt_token_ids`; the decoder keeps the multi-modal
      fields of `inputs` and takes its token ids from the override if
      given, else from `inputs`.

    Raises `ValueError` for embedding inputs and `RuntimeError` when
    multi-modal inputs lack `encoder_prompt_token_ids`.
    """
    override = decoder_inputs_to_override

    if inputs["type"] == "embeds" or (override and override["type"] == "embeds"):
        raise ValueError(
            "Embedding inputs are not supported for encoder-decoder models"
        )

    # Needed for mypy
    inputs = cast(
        TokenInputs | MultiModalInputs | MultiModalEncDecInputs,
        inputs,
    )
    override = cast(
        TokenInputs | MultiModalInputs | None,
        override,
    )

    if inputs["type"] == "token":  # Text-only inputs
        return token_inputs(prompt_token_ids=[]), override or inputs

    if inputs["type"] != "multimodal":
        assert_never(inputs)  # type: ignore[arg-type]

    # Multimodal data inputs
    if "encoder_prompt_token_ids" not in inputs:
        raise RuntimeError(
            "You should register an encoder-decoder "
            "multi-modal processor for encoder-decoder "
            "models."
        )
    mm_inputs = cast(MultiModalEncDecInputs, inputs)

    encoder_inputs = token_inputs(mm_inputs["encoder_prompt_token_ids"])

    # Token ids come from the explicit decoder prompt when there is one;
    # the multi-modal fields always come from the processor output.
    decoder_source = override or mm_inputs
    decoder_inputs = MultiModalInputs(
        type="multimodal",
        prompt_token_ids=decoder_source["prompt_token_ids"],
        mm_kwargs=mm_inputs["mm_kwargs"],
        mm_hashes=mm_inputs["mm_hashes"],
        mm_placeholders=mm_inputs["mm_placeholders"],
    )

    cache_salt = mm_inputs.get("cache_salt")
    if cache_salt:
        decoder_inputs["cache_salt"] = cache_salt

    return encoder_inputs, decoder_inputs
def _build_decoder_only_llm_inputs(
    self,
    prompt_inputs: DecoderOnlyInputs,
) -> DecoderOnlyInputs:
    """
    Return `prompt_inputs` unchanged.

    The only work done here is a `cast` for inputs that carry
    `prompt_token_ids`, which affects static typing but not the
    returned object.
    """
    if "prompt_token_ids" not in prompt_inputs:
        return prompt_inputs

    # Needed for mypy
    token_like_inputs = cast(TokenInputs | MultiModalInputs, prompt_inputs)
    return token_like_inputs
def preprocess(
    self,
    prompt: PromptType,
    tokenization_kwargs: dict[str, Any] | None = None,
    *,
    mm_uuids: MultiModalUUIDDict | None = None,
) -> ProcessorInputs:
    """
    Preprocess the input prompt and update the multi-modal cache stats.

    After delegating to `_preprocess`, and only when a multi-modal
    processor cache and its stats object are both present, the cache's
    delta stats are folded into `mm_cache_stats` (one request, plus the
    delta's query total and hit count).
    """
    processed = self._preprocess(
        prompt,
        tokenization_kwargs,
        mm_uuids=mm_uuids,
    )

    cache = self.mm_processor_cache
    stats = self.mm_cache_stats
    if not cache or stats is None:
        return processed

    delta = cache.make_stats(delta=True)
    stats.requests += 1
    stats.queries += delta.total
    stats.hits += delta.hits

    return processed
@lru_cache
def _print_debug_once(logger: Logger, msg: str, *args: Hashable) -> None:
    """Emit a DEBUG record, deduplicated through `lru_cache`.

    The cache key is the full argument tuple (`logger`, `msg`, `*args`),
    so a repeated call with identical arguments hits the cache and logs
    nothing. All arguments must therefore be hashable.

    NOTE(review): the bare `@lru_cache` uses the functools default cache
    size, so an entry evicted from the cache would be logged again on
    its next use -- confirm this is acceptable for "once" semantics.
    """
    # Set the stacklevel to 3 to print the original caller's line info
    # (this function is reached through a `*_once` logger method, so the
    # reported location must skip both frames). Do not wrap this call in
    # another helper without adjusting the stacklevel.
    logger.debug(msg, *args, stacklevel=3)
stacklevel=3) + + +@lru_cache +def _print_warning_once(logger: Logger, msg: str, *args: Hashable) -> None: + # Set the stacklevel to 3 to print the original caller's line info + logger.warning(msg, *args, stacklevel=3) + + +LogScope = Literal["process", "global", "local"] + + +def _should_log_with_scope(scope: LogScope) -> bool: + """Decide whether to log based on scope""" + if scope == "global": + from vllm.distributed.parallel_state import is_global_first_rank + + return is_global_first_rank() + if scope == "local": + from vllm.distributed.parallel_state import is_local_first_rank + + return is_local_first_rank() + # default "process" scope: always log + return True + + +class _VllmLogger(Logger): + """ + Note: + This class is just to provide type information. + We actually patch the methods directly on the [`logging.Logger`][] + instance to avoid conflicting with other libraries such as + `intel_extension_for_pytorch.utils._logger`. + """ + + def debug_once( + self, msg: str, *args: Hashable, scope: LogScope = "process" + ) -> None: + """ + As [`debug`][logging.Logger.debug], but subsequent calls with + the same message are silently dropped. + """ + if not _should_log_with_scope(scope): + return + _print_debug_once(self, msg, *args) + + def info_once(self, msg: str, *args: Hashable, scope: LogScope = "process") -> None: + """ + As [`info`][logging.Logger.info], but subsequent calls with + the same message are silently dropped. + """ + if not _should_log_with_scope(scope): + return + _print_info_once(self, msg, *args) + + def warning_once( + self, msg: str, *args: Hashable, scope: LogScope = "process" + ) -> None: + """ + As [`warning`][logging.Logger.warning], but subsequent calls with + the same message are silently dropped. 
def _configure_vllm_root_logger() -> None:
    """Configure logging from the ``VLLM_*`` logging settings.

    * With ``VLLM_CONFIGURE_LOGGING`` enabled, ``DEFAULT_LOGGING_CONFIG``
      is used unless ``VLLM_LOGGING_CONFIG_PATH`` names a JSON file, in
      which case that file's content replaces the default entirely.
    * With it disabled and no path given, logging is left untouched.

    Formatter entries still pointing at the old
    ``vllm.logging.NewLineFormatter`` location are rewritten to
    ``vllm.logging_utils.NewLineFormatter`` before the config is applied.

    Raises:
        RuntimeError: If a config path is given while logging
            configuration is disabled, or if the path does not exist.
        ValueError: If the JSON document is not an object (dict).
    """
    logging_config = dict[str, Any]()

    if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH:
        raise RuntimeError(
            "VLLM_CONFIGURE_LOGGING evaluated to false, but "
            "VLLM_LOGGING_CONFIG_PATH was given. VLLM_LOGGING_CONFIG_PATH "
            "implies VLLM_CONFIGURE_LOGGING. Please enable "
            "VLLM_CONFIGURE_LOGGING or unset VLLM_LOGGING_CONFIG_PATH."
        )

    if VLLM_CONFIGURE_LOGGING:
        logging_config = DEFAULT_LOGGING_CONFIG

    if VLLM_LOGGING_CONFIG_PATH:
        # Exceptions do not interpolate logging-style "%s" arguments, so
        # the messages below are formatted explicitly.
        if not path.exists(VLLM_LOGGING_CONFIG_PATH):
            raise RuntimeError(
                "Could not load logging config. File does not exist: "
                f"{VLLM_LOGGING_CONFIG_PATH}"
            )
        with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file:
            custom_config = json.loads(file.read())

        if not isinstance(custom_config, dict):
            raise ValueError(
                "Invalid logging config. Expected dict, got "
                f"{type(custom_config).__name__}."
            )
        logging_config = custom_config

    for formatter in logging_config.get("formatters", {}).values():
        # This provides backwards compatibility after #10134.
        if formatter.get("class") == "vllm.logging.NewLineFormatter":
            formatter["class"] = "vllm.logging_utils.NewLineFormatter"

    # An empty config means "do not touch logging at all".
    if logging_config:
        dictConfig(logging_config)
def enable_trace_function_call(log_file_path: str, root_dir: str | None = None):
    """
    Install a trace function that logs calls and returns of code located
    under `root_dir` to `log_file_path`.
    Intended for debugging hangs or crashes.

    `log_file_path` is the file the trace records are appended to.
    `root_dir` limits tracing to files under that directory. If None,
    the parent directory of this module's directory is used.

    Tracing is installed with `sys.settrace`, so it only applies to the
    thread that calls this function; other threads are not affected.
    """
    logger.warning(
        "VLLM_TRACE_FUNCTION is enabled. It will record every"
        " function executed by Python. This will slow down the code. It "
        "is suggested to be used for debugging hang or crashes only."
    )
    logger.info("Trace frame log is saved to %s", log_file_path)

    # by default, this is the vllm root directory
    trace_root = (
        os.path.dirname(os.path.dirname(__file__)) if root_dir is None else root_dir
    )

    tracer = partial(_trace_calls, log_file_path, trace_root)
    sys.settrace(tracer)
zPU9-M-nur@?(iQ88RRBx$mv2UGejw0U`-UhFN8C-sC7ManJtu?Jv8%^CT>Ys2dtRRVdJ)heZa2smN=K-2Y8*g#vKXgfK%sfaaY1UP=XPIB)jCfi;W!z zyrNwy9b!c8mzDva$V*nyA@QP9a*B8gPz~8c7fAfCmAF9?n8cxyf6>KyLzp zspL>}7<$fVGOC8-so`NshOVbSG9rniaY^nQRnwy?43!jy`qe>Ip*rF4m?SIFR5BbU zHd#_fV*?IWrTC3;rZ_&! z{)YKPu%HM#%}f{OPqU&W!-AIe74$B1mBLAPiMBW42aHUF8Kwx`fR1?`hHK0x9@w9m z-hfK9zp`)1EBm&-XS( zBUWj0#kR204v_;7tSU-O4*Y@#3HaRR4)nH1B zt?BEV^xOr#H0ZrsoNsd97dO+EH+ZYXj40h--yTsVuk+^5&@2lc{hEkULngZ7y~K8y zW5pIM<9#UoF4Jxs1B)n%=OfLCO5>? zBN>wR>%1w#oJIX;98Iui(KrrydkJowVs;5rOou@5&j_9-_#=Wpo^pREC6$!CT^Sin zONXZTU4kf$MI*r4cZupGj1e3n{9qDtt}LZx!iB?PG@?QR#vs}rsw9r65=|=VU@{^R zR#9bQp%aKDmWn2cHL6Mpg_Ng3c~F+ZsdF(2wAEBt98IK&B_35&(2`VQRY%itiCCoM zXo6U}bgJ)YRa0_ggxCzE#HvFjHUlTI>L`hQWKbDYRavH?FP8!#77Cx@O%5g`z-x&q zDF8(GRS5qf2)krCB@=fa6qTWPYFzJztb_%S1fM5(jBxr{;ZQOSNCm1266p8xNVA$q zH>acN-Rht`EUCLEM%&sCoNtqw zk7~YS3l}s`x5jlFYW^HwO@Zoa_W}!%d%@-ElZ(L(s==k|lgs`d&G*5BQ<|qwyYE(;9brmG@@-+4DKRcE#hnI(>Qi%Fi{f#u$y|_;*&k z{u{1quE*ZmC2#GrxBikfZ~Cu0kLFy*7R$a0%$Lr07L0x&7i^xx%fWqGMax6?{)MuK zu48$lKeHkKmaK9;^w?LwFjz1av#UfpOv7}%3FstPv|May7mlvz5Su9YK?{N zONHi)MQdHR!}0raU$@ushpoQuGRM6wzV1rL{ieF^ZH@=6f$j$M$A-!i4EraHff8Fp zf1wetVm`>iDg8x8JjD@+c2$WKUNNwb4gU3)*Ay@tUtY1oy9)&QPc6Q$@-MVj1bM9R zzRI8s&M+4Tcc*>d;1*1vGhCTM$?dd z6bSjQiHMX|;q^$$_3RUtk`S3vSTe^$th^T>$~48wEkG0s0m5CTrX*#0@NQO;+d+1K zPTmRRB7$3S1RnEMOMKO>vyb@3Rjy`cOlxeL-JTr?+?nu5`wEENs9;o10>)tC%hG<1fMx4dOO z3m28YVtNhqYE*hK{euEk0T*j_QLOO5Ees*iVm<)rWlk20mbG{4y(5q=A$baV#uUUs zbx;^e$-v zkeO3Ssx!6qRSn(MbzzNO9|WV4QBX!L6V#MI9V28THsj|Eu}XSjah(B>;WJ&Q`@8#k zja=Ub=O%p7;VhdR4hjw}5U{dBV+35!@I9G2m`)A@qa=WUr#X<-`Rkrv^sI14SGm9n zSMtnmvD(%gHtWGve@NTWvFty1iCg7LuDUL}w6bFlDstTEubq{vF5lIj%RO4~2lJjs zuH$Pga&3M7tup{=1T?vl$6Un{SD^*l|H5@ZF0mebsnCi#+um7$7AtBx+u6kf7-*Y_+)hPRlxi zX6*##@Cm%)DqmwM^SAPbH7jMH6qVQC5tqt#tlRZ|xfdVQr$MXFg0?v6nH{?FQ9$w^ DlYpkF literal 0 HcmV?d00001 diff --git a/logging_utils/__pycache__/formatter.cpython-312.pyc b/logging_utils/__pycache__/formatter.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..231ede02288ef7be9b98d5d1bdb7188891c90bee GIT binary patch literal 3900 zcmb_f-ES1v6~D8ypEkw@+t|j-G7SWK1>Th%hd3XQVlb{N$D|NwaUVv*&h>ii+1cgZ z@!DQ34G&0NH$k-{RV>N_t`w<6R2Ax9(5jVMsV`kNlFf)pk(xg6X2nEBc?dmcW@i>V zj>1E)wD;b*=i{Dx&hMUk_8(164FsC*|3C{g67p~Sa9f}@*_?pMEus=t(8(l@K3&N8 zCVc|VMP1DJC;bBPkx`=hZxA&=$&9VlRpgBKL}#1~E(-DBWBizi3zq*~#hk{G8hM^_ zSe(AbPc{?q{Y$Wbl1V`&lRj0L6jgtTObM#*Mr6{Ni8sQNff88|CW9A9JYcm<(D`xA zpl7ozqnIXTg%hV#l}mayoz{%Blw)*CTae7GWYBpX7&*=rT}vBMW==P?K3tSClyfCb zc_QV1&Ng%nnz#8SbiWa95z`|s`HSQx(2GLRR}|et3&ix~XM!HHH&KPzUXUTO$5kXV zVUO%h*eMgjUFeeopl%OPtD>R!CJ$ijx307jW^&1#mg_SWmZoOk!rW`G56!+tle%^#nOiicvqmzn>zQN?4f33+={z~* zVkMDVWC-Cx*LD;o++#KYng1NRr`O4&aMMz#67DRwmeZ?qpN0Fr=svjCJy7W$xSy_c zpD6i13wN$JHm@~yKWOZJ6qP>d{CHw5I#7uYRHK7y(Kjp6H`k)WmFVzVbgU8`E1kLd z&W1>KMK?h`N%^p^HH3~0yiQm};Hg3#daBR#c&GfTc*B3q2Qcy=*aSC$YDULGv#G}zmNXnobycFk|JKevmp6Bz5fJ_6Ut z?|v7BIv|Zb#T{jz=P}#d{g*;;J^S!_y1K3fK)%zZ*GDxl`xRq72tQT7B8?NaDf~m=5CkwFkzjLH0n{<@r>Sn{>$XUkLJmO zc$v%#@yG<;C;OgRK}O9vW~4k#umj$dR`_jx105tUj|LI*5Co)J^L7!^1P~>#Ccun< zXHb2ZwitL&g^awPToZu0&!Yxx>TZY}Vl23y6yVmN2JnN;Z8Y239J=@%y{64XOYK&V zml~Ws1A8_VeQGET`>A2nh2C_u8v)$UnX8c*59c*SFcf|6iQuJZQ2q`VssR&++b&FJ znMn;UDH3KU3GV-GZjAGoB%z1dO3{`^c=X~fiPj$Xk zD3#T9CCAYkl+0+V3I|fS)RRagdbS&K4tUtS#_2I>`)>c|o#4o5k9};7OB_9ZOmd*g zCCvmA`!v4YzL5nblhY~R>UcYmXR<2QWx7C9b7qz$Av%=t{wrq5YrMLMVuS+eis3!uYp=@(-e01@=`*6LHPlFk=Kr*Kb z$G%k}IAuiQEj^o3bbd1N9Ctg2^FKm&ojgg9_O5croddTIRNHz=BkP^}OJfh4+sf_b ztG7E>FFoiwy54!9tdy^mBg=t0;k^f4@%5Ij^2ODG)l=omm6jvR5sz^t@Nmz*@1!mV z9yRY>e)WSw`P8p|fn2ow)781V=kH!vEmRKlS6cd4BA=t`O3O>2v~}ODp_QTXShe-Y zKX)Br9eDIGp4m_`)FPJ9Q1CT1*e;JZI|ZExfn)gSTOp{s0F;d3lV&x#ppfBqVkK%t zMix>uhlsVpm}PC9_DnOtfmmu-LC2F;s8)qT(U;RTC;}OU{F|cjFxzAEF`CLUmG$69 zSh>RO$k>6j#0*c{(PXyov8IMNV!Lh!ll8+&q2oJ(h;;+U|0+=BZTMb}^nB8`L16g( zp-mh%CIqsprB>H<#LD^A!u`Q&$581<51TtSeBz$LN3oYyhdwseVuO{~U^RApEjC<< z4gYEIZzukIq8j_j%IK3Y={{J_-+BM``_=u)<r#5jC||1XK3o#l8~6PB$*V{U7jjfE8r|uZHjcQvTAZc&=t)nYP 
z%j)XcyBF{GfBLh^;a97BkC#rbH}5SqJbltdI)Csb5gg-=|L*PIzFm#I^3mvq2uojc z^ynW3PWO_(^v2Fa#J`3G=&|A#UXF*bV#i29#oi#xkT2)#>V20iL#owll#nc|*%bT} zz!KW_uhmp5(^FK}WjQW5!N42(LnjJY*fQ);hT|=JdV@UlkACG31b!_1clYao)(rxk zjY69dj0x4scPoUbgxcN)A&a6e`(H@ptXt9BBA*gR+`h!&w;c0NvSHvyQ}|)%zzc$~ h874yWzewkQNc)rEX+daRo?q+et#tH$Mexhk@;~O*ycYlf literal 0 HcmV?d00001 diff --git a/logging_utils/__pycache__/log_time.cpython-312.pyc b/logging_utils/__pycache__/log_time.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c585bf95e48bc50b0a3aef66c4d527741098443 GIT binary patch literal 1376 zcmah|O>7%Q6rR~1|0PkH#CDv}kP(t)wo+^%0pX{rT2NXJX#|2(E}PBn*x7WwyUeWf zW7(1qm0}Bta;X}G5F9uJ<-(C8cP_Dl2rNAy!G)WVsRW#Ov)(kQ2zb(dGw;25Ki|%q zuO}ul2*gp|vo~afeiF*LNbzP!VDkW3XdPMDk{rB_qg7g$EX9)V;&pjn($wa&nKzlY zV_TFHg9LV+a>q6)alBg1c5B3;x+R;ztF@U>Z4&E-~b)q2dIm>xGGur zZmKP{q_>eKHMcVaWYw#25^PaI_o&$jY|o9V5w99q<2uwL)rM;dcR32X;p`gwocN9b zP1yn2q%JjrE#g5ccF`Rloi;HvaxMyNX5bsX0dQYBfWHwu_HX~4M;y^Z$FUz+NL{%q z3m(U@w~bp^@L4lsL$s)Xi|`G<1-LK&Ux~-bBnkBrEo7q>{tUm5+wy<6b{^M~nsD`P zrI+d~Sh$zi79^jLkB}vSC-HAGy6H9CfU+>7>vhj+ zI8@ieab176VK}Z)7n*9(%|=a2hl;^!JWSl#jaHQh1`EOjqrBk+VUmujg$bY0s=XH~ zVpPI}7*$9w*LjT#N@Cgw>2x}OzE%n9e#N)_B}i1G!O~u1`Q?}HEYpf(Z&v*MV9RqW zJC0MY#N)2RU^-lAx*$;Y_rsLF%M2fgG&L|s)~k+Z8V#(E7woTH@=$cUwZw=Tw#zz6ElMhz`fPp>dD*- zU+jFj`_1^R?`Gz@w}((2hBZt_&&l%~9;%`{t|?4B3N|YsD9*s|`g~2mue7bWxN~Pg7O$n2_Zh5AzvpG1%QUXLI tyDUId9HmWN9Sz+zCZ4Hy-&{~1$Qa{a6B1U2DTJqfKx4lpX7O|s_!kg4QUd@0 literal 0 HcmV?d00001 diff --git a/logging_utils/dump_input.py b/logging_utils/dump_input.py new file mode 100644 index 0000000..cb289d0 --- /dev/null +++ b/logging_utils/dump_input.py @@ -0,0 +1,83 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import contextlib +import enum +import json + +import torch + +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.metrics.stats import 
SchedulerStats +from vllm.version import __version__ as VLLM_VERSION + +logger = init_logger(__name__) + + +def prepare_object_to_dump(obj) -> str: + if isinstance(obj, str): + return f"'{obj}'" # Double quotes + elif isinstance(obj, dict): + dict_str = ", ".join( + {f"{str(k)}: {prepare_object_to_dump(v)}" for k, v in obj.items()} + ) + return f"{{{dict_str}}}" + elif isinstance(obj, list): + return f"[{', '.join([prepare_object_to_dump(v) for v in obj])}]" + elif isinstance(obj, set): + return f"[{', '.join([prepare_object_to_dump(v) for v in list(obj)])}]" + # return [prepare_object_to_dump(v) for v in list(obj)] + elif isinstance(obj, tuple): + return f"[{', '.join([prepare_object_to_dump(v) for v in obj])}]" + elif isinstance(obj, enum.Enum): + return repr(obj) + elif isinstance(obj, torch.Tensor): + # We only print the 'draft' of the tensor to not expose sensitive data + # and to get some metadata in case of CUDA runtime crashed + return f"Tensor(shape={obj.shape}, device={obj.device},dtype={obj.dtype})" + elif hasattr(obj, "anon_repr"): + return obj.anon_repr() + elif hasattr(obj, "__dict__"): + items = obj.__dict__.items() + dict_str = ", ".join( + [f"{str(k)}={prepare_object_to_dump(v)}" for k, v in items] + ) + return f"{type(obj).__name__}({dict_str})" + else: + # Hacky way to make sure we can serialize the object in JSON format + try: + return json.dumps(obj) + except (TypeError, OverflowError): + return repr(obj) + + +def dump_engine_exception( + config: VllmConfig, + scheduler_output: SchedulerOutput, + scheduler_stats: SchedulerStats | None, +): + # NOTE: ensure we can log extra info without risking raises + # unexpected errors during logging + with contextlib.suppress(Exception): + _dump_engine_exception(config, scheduler_output, scheduler_stats) + + +def _dump_engine_exception( + config: VllmConfig, + scheduler_output: SchedulerOutput, + scheduler_stats: SchedulerStats | None, +): + logger.error( + "Dumping input data for V1 LLM engine (v%s) with 
config: %s, ", + VLLM_VERSION, + config, + ) + try: + dump_obj = prepare_object_to_dump(scheduler_output) + logger.error("Dumping scheduler output for model execution: %s", dump_obj) + if scheduler_stats: + logger.error("Dumping scheduler stats: %s", scheduler_stats) + except Exception: + logger.exception("Error preparing object to dump") diff --git a/logging_utils/formatter.py b/logging_utils/formatter.py new file mode 100644 index 0000000..02ba308 --- /dev/null +++ b/logging_utils/formatter.py @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import logging +from pathlib import Path + +from vllm import envs + + +class NewLineFormatter(logging.Formatter): + """Adds logging prefix to newlines to align multi-line messages.""" + + def __init__(self, fmt, datefmt=None, style="%"): + super().__init__(fmt, datefmt, style) + + self.use_relpath = envs.VLLM_LOGGING_LEVEL == "DEBUG" + if self.use_relpath: + self.root_dir = Path(__file__).resolve().parent.parent.parent + + def format(self, record): + def shrink_path(relpath: Path) -> str: + """ + Shortens a file path for logging display: + - Removes leading 'vllm' folder if present. + - If path starts with 'v1', + keeps the first two and last two levels, + collapsing the middle as '...'. + - Otherwise, keeps the first and last two levels, + collapsing the middle as '...'. + - If the path is short, returns it as-is. + - Examples: + vllm/model_executor/layers/quantization/utils/fp8_utils.py -> + model_executor/.../quantization/utils/fp8_utils.py + vllm/model_executor/layers/quantization/awq.py -> + model_executor/layers/quantization/awq.py + vllm/v1/attention/backends/mla/common.py -> + v1/attention/backends/mla/common.py + + Args: + relpath (Path): The relative path to be shortened. + Returns: + str: The shortened path string for display. 
+ """ + parts = list(relpath.parts) + new_parts = [] + if parts and parts[0] == "vllm": + parts = parts[1:] + if parts and parts[0] == "v1": + new_parts += parts[:2] + parts = parts[2:] + elif parts: + new_parts += parts[:1] + parts = parts[1:] + if len(parts) > 2: + new_parts += ["..."] + parts[-2:] + else: + new_parts += parts + return "/".join(new_parts) + + if self.use_relpath: + abs_path = getattr(record, "pathname", None) + if abs_path: + try: + relpath = Path(abs_path).resolve().relative_to(self.root_dir) + except Exception: + relpath = Path(record.filename) + else: + relpath = Path(record.filename) + record.fileinfo = shrink_path(relpath) + else: + record.fileinfo = record.filename + + msg = super().format(record) + if record.message != "": + parts = msg.split(record.message) + msg = msg.replace("\n", "\r\n" + parts[0]) + return msg diff --git a/logging_utils/log_time.py b/logging_utils/log_time.py new file mode 100644 index 0000000..9e94f46 --- /dev/null +++ b/logging_utils/log_time.py @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Provides a timeslice logging decorator +""" + +import functools +import time + + +def logtime(logger, msg=None): + """ + Logs the execution time of the decorated function. + Always place it beneath other decorators. 
+ """ + + def _inner(func): + @functools.wraps(func) + def _wrapper(*args, **kwargs): + start = time.perf_counter() + result = func(*args, **kwargs) + elapsed = time.perf_counter() - start + + prefix = ( + f"Function '{func.__module__}.{func.__qualname__}'" + if msg is None + else msg + ) + logger.debug("%s: Elapsed time %.7f secs", prefix, elapsed) + return result + + return _wrapper + + return _inner diff --git a/logits_process.py b/logits_process.py new file mode 100644 index 0000000..7b6a652 --- /dev/null +++ b/logits_process.py @@ -0,0 +1,121 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable, Sequence +from typing import TypeAlias + +import torch + +from vllm.transformers_utils.tokenizer import AnyTokenizer + +LogitsProcessor: TypeAlias = ( + Callable[[list[int], torch.Tensor], torch.Tensor] + | Callable[[list[int], list[int], torch.Tensor], torch.Tensor] +) +"""LogitsProcessor is a function that takes a list +of previously generated tokens, the logits tensor +for the next token and, optionally, prompt tokens as a +first argument, and returns a modified tensor of logits +to sample from.""" + + +def get_bad_words_logits_processors( + bad_words: list[str], tokenizer: AnyTokenizer +) -> list[LogitsProcessor]: + bad_words_ids: list[list[int]] = list() + + for bad_word in bad_words: + # To prohibit words both at the beginning + # and in the middle of text + # (related to add_prefix_space tokenizer parameter) + for add_prefix_space in [False, True]: + prefix = " " if add_prefix_space else "" + prompt = prefix + bad_word.lstrip() + + prompt_token_ids = tokenizer.encode(text=prompt, add_special_tokens=False) + + # If no space at the beginning + # or if prefix space produces a new word token + if (not add_prefix_space) or ( + add_prefix_space + and prompt_token_ids[0] != bad_words_ids[-1][0] + and len(prompt_token_ids) == len(bad_words_ids[-1]) + ): + 
bad_words_ids.append(prompt_token_ids) + + return [NoBadWordsLogitsProcessor(bad_words_ids=bad_words_ids)] + + +class NoBadWordsLogitsProcessor: + _SMALLEST_LOGIT = float("-inf") + _NEUTRAL_LOGIT = 0.0 + + def __init__(self, bad_words_ids: list[list[int]]): + self.bad_words_ids = bad_words_ids + self.word_bias: torch.FloatTensor = None + + def __call__( + self, + past_tokens_ids: Sequence[int], + logits: torch.FloatTensor, + ) -> torch.Tensor: + if self.word_bias is None: + self._init_word_bias(logits=logits) + + last_token_bias = torch.zeros_like(logits) + + for bad_word_ids in self.bad_words_ids: + if len(bad_word_ids) == 1: # 1-token words already processed + continue + + if len(bad_word_ids) > len(past_tokens_ids) + 1: + continue + + prefix_length = len(bad_word_ids) - 1 + last_token_id = bad_word_ids[-1] + actual_prefix = past_tokens_ids[-prefix_length:] + expected_prefix = bad_word_ids[:prefix_length] + + assert len(actual_prefix) == len(expected_prefix) + + is_match = tuple(actual_prefix) == tuple(expected_prefix) + last_token_bias[last_token_id] += ( + self._SMALLEST_LOGIT if is_match else self._NEUTRAL_LOGIT + ) + + logits = logits + self.word_bias + last_token_bias + + return logits + + def _init_word_bias(self, logits: torch.FloatTensor) -> None: + # Code based on NoBadWordsLogitsProcessor and SequenceBiasLogitsProcessor # noqa: E501 + # from https://github.com/huggingface/transformers/blob/main/src/transformers/generation/logits_process.py + + vocab_size = logits.shape[-1] + + self._check_token_ids_bounds(vocab_size=vocab_size) + + self.word_bias = torch.zeros( + (vocab_size,), dtype=torch.float, device=logits.device + ) + + for bad_word_ids in self.bad_words_ids: + if len(bad_word_ids) == 1: + bad_word_id = bad_word_ids[-1] + self.word_bias[bad_word_id] = self._SMALLEST_LOGIT + + def _check_token_ids_bounds(self, vocab_size: int) -> None: + invalid_token_ids = [] + + for bad_word_ids in self.bad_words_ids: + for token_id in bad_word_ids: + if token_id 
< 0 or token_id >= vocab_size: + invalid_token_ids.append(token_id) + + if len(invalid_token_ids) > 0: + raise ValueError( + f"The model vocabulary size is {vocab_size}," + f" but the following tokens" + f" were specified as bad: {invalid_token_ids}." + f" All token id values should be integers satisfying:" + f" 0 <= token_id < {vocab_size}." + ) diff --git a/logprobs.py b/logprobs.py new file mode 100644 index 0000000..a34398d --- /dev/null +++ b/logprobs.py @@ -0,0 +1,208 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import itertools +from collections.abc import Iterable, Iterator, MutableSequence +from dataclasses import dataclass, field +from typing import overload + +import vllm.envs as envs + + +# We use dataclass for now because it is used for +# openai server output, and msgspec is not serializable. +# TODO(sang): Fix it. +@dataclass +class Logprob: + """Infos for supporting OpenAI compatible logprobs and token ranks. + + Attributes: + logprob: The logprob of chosen token + rank: The vocab rank of chosen token (>=1) + decoded_token: The decoded chosen token index + """ + + logprob: float + rank: int | None = None + decoded_token: str | None = None + + +LogprobsOnePosition = dict[int, Logprob] + + +@dataclass +class FlatLogprobs(MutableSequence[LogprobsOnePosition]): + """ + Flat logprobs of a request into multiple primitive type lists. + + Compared to list[dict[int, Logprob]], this data structure reduced GC + overhead significantly. As it flattened logprob information for + all positions and ranks in to multiple primitive type lists (i.e. + logprobs, token_ids, ranks per token_ids, decoded_tokens). + So regardless of the sequence length and top_logprobs setup, + FlatLogprobs would only introduce a constant amount of objects. + + As each position might contains different amount of ranks, + start_indices_per_position would be used to access the logprob ranges + for different positions. 
+ + NOTE: To reduce the migration overhead and improve backward compatibility, + we support the key Sequence APIs of list, so it could act as + list[LogprobsOnePosition] + """ + + # Start / end indices to indicate the range of logprobs for each position. + start_indices: list[int] = field(default_factory=list) + end_indices: list[int] = field(default_factory=list) + + # Flatten Logprob information for (each position, rank). + # For position , the logprobs are ranged + # from self.start_indices[i] to self.end_indices[i] (exclusive). + token_ids: list[int] = field(default_factory=list) + logprobs: list[float] = field(default_factory=list) + ranks: list[int | None] = field(default_factory=list) + decoded_tokens: list[str | None] = field(default_factory=list) + + def append(self, logprobs_one_position: LogprobsOnePosition | None) -> None: + """Appends the container with logprobs for the next position""" + self.start_indices.append(len(self.logprobs)) + if logprobs_one_position: + for token_id, logprob in logprobs_one_position.items(): + self.token_ids.append(token_id) + self.logprobs.append(logprob.logprob) + self.ranks.append(logprob.rank) + self.decoded_tokens.append(logprob.decoded_token) + self.end_indices.append(len(self.logprobs)) + + def append_fast( + self, + token_ids: list[int], + logprobs: list[float], + ranks: itertools.chain[int], + decoded_tokens: Iterable[str | None], + ) -> None: + """ + Appends logprobs for the next position without creating + the intermediate logprob dictionary. 
+ """ + self.start_indices.append(len(self.logprobs)) + for token_id, logprob, rank, decoded_token in zip( + token_ids, logprobs, ranks, decoded_tokens + ): + self.token_ids.append(token_id) + self.logprobs.append(logprob) + self.ranks.append(rank) + self.decoded_tokens.append(decoded_token) + self.end_indices.append(len(self.logprobs)) + + def extend(self, logprobs_multi_positions) -> None: + """Extends the container with logprobs for the next multiple positions""" + for logprobs_one_position in logprobs_multi_positions: + self.append(logprobs_one_position) + + def __len__(self) -> int: + """Gets number of positions stored in the container""" + return len(self.start_indices) + + @overload + def __getitem__(self, position: int) -> LogprobsOnePosition: ... + + @overload + def __getitem__(self, s: slice, /) -> "FlatLogprobs": ... + + def __getitem__(self, index: int | slice): + """Extracts logprobs of a given position or slice""" + if isinstance(index, int): + return { + self.token_ids[i]: Logprob( + logprob=self.logprobs[i], + rank=self.ranks[i], + decoded_token=self.decoded_tokens[i], + ) + for i in range(self.start_indices[index], self.end_indices[index]) + } + elif isinstance(index, slice): + min_index = self.start_indices[index][0] + max_index = self.end_indices[index][-1] + return FlatLogprobs( + # Shift updated start_indices and end_indices to + # be 0-indexed + start_indices=[i - min_index for i in self.start_indices[index]], + end_indices=[i - min_index for i in self.end_indices[index]], + token_ids=self.token_ids[min_index:max_index], + logprobs=self.logprobs[min_index:max_index], + ranks=self.ranks[min_index:max_index], + decoded_tokens=self.decoded_tokens[min_index:max_index], + ) + else: + raise TypeError(f"Invalid index type: {type(index)}") + + def __setitem__(self, item, value) -> None: + raise TypeError("Cannot set logprobs in FlatLogprobs") + + def __delitem__(self, item) -> None: + raise TypeError("Cannot delete logprobs from FlatLogprobs") + + def 
insert(self, item) -> None: + raise TypeError("Cannot insert logprobs to FlatLogprobs") + + def __iter__(self) -> Iterator[LogprobsOnePosition]: + """ + Iterates the container and yields LogprobsOnePosition for + each position. + """ + for i in range(0, len(self.start_indices)): + yield self.__getitem__(i) + + +# {token_id -> logprob} per each sequence group. None if the corresponding +# sequence group doesn't require prompt logprob. +PromptLogprobs = FlatLogprobs | list[LogprobsOnePosition | None] +# {token_id -> logprob} for each sequence group. +SampleLogprobs = FlatLogprobs | list[LogprobsOnePosition] + + +def create_prompt_logprobs() -> PromptLogprobs: + """Creates a container to store prompt logprobs for a request""" + logprobs = FlatLogprobs() if envs.VLLM_FLAT_LOGPROBS else [] + # NOTE: logprob of first prompt token is None. + logprobs.append(None) + return logprobs + + +def create_sample_logprobs() -> SampleLogprobs: + """Creates a container to store decode logprobs for a request""" + return FlatLogprobs() if envs.VLLM_FLAT_LOGPROBS else [] + + +def append_logprobs_for_next_position( + request_logprobs: PromptLogprobs | SampleLogprobs, + token_ids: list[int], + logprobs: list[float], + decoded_tokens: Iterable[str | None], + rank: int, + num_logprobs: int, +) -> None: + """Appends logprobs for the next position""" + if num_logprobs == -1: + num_logprobs = len(logprobs) + # We do not need a special case for the sampled token + # being in the topk, since inserting duplicated data + # into a dictionary twice is the same as doing it once. 
+ topk_ranks = range(1, num_logprobs + 1) + ranks = itertools.chain((rank,), topk_ranks) + + if isinstance(request_logprobs, FlatLogprobs): + request_logprobs.append_fast(token_ids, logprobs, ranks, decoded_tokens) + else: + request_logprobs.append( + { + token_id: Logprob( + logprob=logprob, + rank=rank, + decoded_token=token, + ) + for token_id, logprob, rank, token in zip( + token_ids, logprobs, ranks, decoded_tokens + ) + } + ) diff --git a/lora/__init__.py b/lora/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lora/__pycache__/__init__.cpython-312.pyc b/lora/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b5680e2e90c37783dc14c132ff67f7a65c3d618 GIT binary patch literal 154 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJVS?ZVM7U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?D`Ewj#0bR2AjU^#Mn=XWW*`dy+Os4e literal 0 HcmV?d00001 diff --git a/lora/__pycache__/lora_weights.cpython-312.pyc b/lora/__pycache__/lora_weights.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..74c10bb3f26dbdf0038da974051ef212edf0b368 GIT binary patch literal 7808 zcmcIpU2GfImA=E_@Mk#u6-oWtrhZ(TiA~wA?KHI$$F&s4v7@AG2U#l(1{7x`Q6Bze zW@KA5)i{NLQjP;z*li^Pi?zBhwcr#rurGb+gH!BVU&OemndugA(OqEkMp?>k)2BV> z&XA%c$}Sezfq3q{=l-92?m6eX`mNW?6ZjPSC+S~A3HdV?Y9D81RUG%^Z2yLwPRX zxn<9N*5V3NY@GiJ3vtG9otHnHl(Q)r^Ka+%bS|4zhM>T|Gx++2H)JI*t59l9Yhpf~ z6*F=sr%s8<%gMBo98u(yIi^WNR$=v@8QhReh8c+r+azX~brUy{+9g)DOLm!)IQTha z$Aqu3yP21q+wfi6@NESr!40}3GsjD|(SYQ+!412m4oSFSALeC`#LL{MP4eEb-?ye5 z7V2DI1J_HrK4|N2;QH%r0}b4O9E4qU(p?4Rko4>o@)8+!Nue8DeFZHI{aZjHVfdkR zBCylFO(YWq{tH|A6Z^LEq*fTGCJzGgXlt0ab;=g+G6Lsve|Pd+a!OX;lhb44x>h)b zoY)n!6dTQ{v1Cla2C-Bwlh0|g6w8gq^eefTlDiU9li5qLOj1|VS7j|RWOy<;X;P8J zY%(JocC0mcC8s9Eq>>*`8V)mWr6WdA&Wy;Cl+KQ6qAq8(oN73=R8j$k;ZS9LQq87Z z4I4w9Mh#xQ2V|ZUnIaME)kPbWKjX(3P#viUSfvgPSp(!RI~#sbRepL#ES&gU-bity9~P~ zE2DoyQw@a@v%bH+(ywRo{rPl$zn)abWPShD$(LUK&ZU>+ekDE9pP$mlbJ_mOijwI^ 
z@lzsN2b{=H87@&wXVbbU7TPy!L!wp{g!#00fLtflFzMP;vj5K2RtdM2?6=&Njy_2I zm1q|vo?6m<2;1y@;t7>bR62LvJ#pv6y$ip*xOB1Hd9*a}>A5GJJ(#S*m?Tv{H7MXI z(KFT|17}PoAl>{)%q6%SDxCOvXmOQkb~4R-7QjHIZ0mX=ZPXzcZ^>3Tf<`ctF32%` zT#nHjF)|g)$m$qqCgoXL8cSz&ya}M;BZ;qY8*v*wOnSCUjM@S$4f!!O4AgMttdW}t zyH>Ra`y2#vojmdQXOGVuUpV-Mr>oM^S~^>C3A3J?p4<9-ajsZ)^?vH9wBjiU6@N>~ zqk?T)zVhDR)0FPs@Z{xD zT^y&eND5_K`_Qmbdoc`|d5D0D=of6wtOjtKSa67`2zU!yGn;=O!V3IAC>YqpS%Xoj z0XN#vpo;Y7Ted(PO{Vmm8n+v4O3_R!4o1qf@ER@|4Yp|pJfmtZBc^iM(ezlMYm4j} zYEMEx?I@7zWFbTUleJdVH`~T+;FBFi*Jrl-*4a+GpVNyi+MD#^i%zwpp(n0>(POP!E5MOevIY_A0tZiKgFAF;t0c**9n2tOP=p!0t-4v(mbw(z*L0XKqHS^GKMyc^_8`(QOh4jrw; z`c{HruJ=i#t;#_P_SGM%Ix*)Wz;k1cCvZB#oKW>(NvM^)Sn`ps?z_kD9KY9B?o3qu z$O3W0In8xdabTE*_pP=-Zgro73qz&DSG6%*-@><_5=d6r8Uv=TRh`r>kGueh z^)6nxQ>fyoD5tKcEs&!Vuh<}Go*|xG3V!tuBYyl=1vttdgJIo>l%-GuPIlCA9GVxz z%(8O=6?@ZXVm;ah@bh!q08*9Rw6%qv0W~&ec5Xng_6_*i zC$&g`oQM5#0MIw0>qf*JtaBR?bBJ6S~L+dcg@R+a@A*j}XyzmI9fk;_ip31WxPuzpz zPZ;ocl#OJ@RV1In)y4P%aDyGah(y3=S^XJ|BN|bsh6ER~m--zgP&UUNC4lSWo18SWjQ@~>={|W>qR+Re;ph>rj)&sl{r%i&rGA-B} zz%pG7wurkaW?7e>O6%7>4IKtCtl<_#YSTnfL!s5Y&UMpZp>-_mYJQS+T=_$2q)h_( z3%HdKar~ar+;SAziF!-~qgh|AA46`n+DY^1 zbJ`C4CK1y#Ml2Sb!PP-fa3H#<77eg{dyF+xEmbX4CgK_Cy`o z^gW{|tdOqOCb3p4-dx#d_FB|6?I=3V*R-6y@|;|pD}0RV7J~eg{;{C}HGgyd8IpAk=dTcy- zS%$fAjzoR94Nn=UNEoNBehbK1!ce640o;{orpQp$V1CNXGoS( zw;8s?0mDue6Gy6H2lw$YIV)eytFIOk+gvZSjEUn4z%xaAHBoPW7e>)Ar1&qqZ_Dk2 zOGg$DKNu(<9{8uer6b?aSEV00gUn6%S*V{epQfCK9j4T2v}4)|yz!f!vf!0#A9BwD zG5FE6s_8V`RNp}^jgPUH!RAILG&Gsk8JJ_heyi7ndelmILMA3FiR2NpSrql4}kHBb}UDB&oTE7mjnHku5PeP5#O4Z z_wyyTiji%J|K@3sc!!v84B-s@fvt-8v^LoI=ib)u9TPr(q4gB;{l?jPiVOUv&wGjw zZO}S27}R9_H)I+C+d4FB?v!dKtd@cqdyndBQ`c0a4@4&P)BMdxG~0lt@7}Kds_m)` zV~ghQ&+Jd1?7+w!hj#c}*UTgq8LYi^5lkt9brco=aSmRf$7F+5 zf8p<4)PH{MXV)Ga{Miqe_q<}M<=L6D3(O}&3({TXj&kqdom^Si|H)8g=dRMg?75k9 z3&ERjSKTBOhafZ%e&&lFUgXB(t$CCmnMP!ZFN`9q!*?occRliy6w(had}Co`(8(RQ)a_Dn4+jsDC!C6lSKmc~;8Hz_Wc@MVdsswR zX?U&pD}zHShO1MNT*C?{0FA#OXgmS|)Seet{H>gi!YN20hC;-M 
zITu){h2BN`ofpcH-BmZT06Io)zdQefxgRVJlp}k~fq2z}d;pP~`CjDv0IJ`X=QRKUXKaN(q5$ic#SVdrGgP|;-ZQ1PT|$i?C;VfUnG$TL|oRKlL~!=;nn zAukJC!)24@L**=N3s+254pmN84OLB64^>ar4Ao5fhJ2H?L$#B2Lv<{TJzPK8Fx0@p z1>wfYrlF?E=AmZxTo`Vd^bh%2*b#1>Y#VB0VQ08~vSX-YGB6Zi&qd+R$*!R;7A_8V zPxcJ;=r}zWa)f%{(Mh8=vhM<|DhM`&tP@|&Od*AXjiE!KeNqpgIIKUBoaC^G8^Kb8IQg|DMjb8nMf#jcKUHO#zygHN3LESpStYVi%xo!@`T1O zzYvXxB~OfuUI_(r*1e%LyPz&8L~LL327cxm(WX!n0Vc8m9 zwGWwtcoQ^*@`C2MJdTs!Zdxw}_6x;K4 z8I6u#8;N2Hrbgw>$67;S`rGP(_Z{6$F3O_GTQtPDZ<#nQrc<`Fj*IDI+_1boIc}DF zTXzoY*MykT7@EBi*2Nomd4-b!qMGyQw_T-$S`4--BqTxztRy1833D1PiWsM89 zkBtkDK=!yyP<)OH>M%<3xL}mH35;8=%N)n3vKFLK#-S6_#q>%Em{0w=xrz%|PuH*n zeIwG$`o^cCzVYCIuQBL1iDn6Wh^AoZ(#&NxzyD1NrGuww@LOqR&{iKFM&soDjj(-q_{Eu#u>8h=-tjcFqIvq#L})b1 zW|*%*3~QoTAg`X$P&hn1{I*W&3!OZNq>j{5Qu3%rHJdt`vu#+t02= z)!{0#twRlds~0**VIBl3CHm9Gv^Nvo*6!K{=(q!$)j&`QU%ml(MG_Mfzt6He-JTy z@E0kD_cHg9(Y5rVVD!Fxe$!SSH~lMXlJnn z$XkHt!jL1M%lsFtW~Dn=-OdSu1UXJ_k26#pEJ9huA?E}w;v6RdzM$(f%V7XA=thsE zkcZ_k%e6>xB_Vgn%uaIYxisXFQ`0Dv1ii1C)P9$Q96@ui5P#`^W#6>~%hA>fwH#_a zjiOZEXZPdtwyzSnl~f0{)ElgN)vUI(bk^*z5%U?6KnEE>qmLeZbmn%jGuOv}XFx1a z&g@WlcswW?`H`tBqKP6SqL~jxXZWdeqXYmIN6Vcp`7a}IlRykE}Qvm>c9TM+Vdp1AUE}i_WXiCu` zqa#5dLzW@|t6B(8uwgDR;=_T0#;``LpXz}i6c|Ku6Ez$>8Otm zP(612VdUkHz*B(sAVsLL3efgRu#TO6#p-OU=N@H>5CD0ZOOE4H$BE? zSvn_t*MT(qg5zUjAwI-T$jmu;W;`6m424{(wPa_FCGBFDl&gp*rtBYSv+RKtfo||Pi z%96z;H(PJC-t4^5xn5kK%Hy2XTb$8pO>xG2dE_(4mQV>c@`Q@8k;iYY<6EPYhPj~b zRhl2=G2>SYEm21ND5r$pk!ybn`x&EeI5(Qv9O`~A{lb&~S+&pj~txEq) zcT=b1UMn^NU;GQhbsgV`j5rXTp_ihd9j;A}j$9g!0Fwvatq*jGMz#+#M>&80<{yOzHdKDbqjq&AEpLuD~fc zj@@qfQR@#|6FoKZ1%x18|hL<&E$ZA|jbn zWKj^9xeTDB#JxbNOX0oD-FLWNpIe-x4Y7P>b=SJ1D{kpx&_>LsdEee!s)hBo8Ug9> z(OPBKxTiRIWl2y}Zt1gBcVXe{ZTe~sgw!wAn8%oj#gkRB_DS(UX$Z@=l8 zcPx5h_G>ym7}U=@V`lzX%<{7S6!(fQny)l_!NX`?OU#l6M?sY?nlFkK@oh14%n~c2 zT}^)znZ1^ai5X+2ca2KzsLRGXW6lX+=ebVpt263Q-c10OS*^eYK z2Y)nH_}ML>^wid2wVyFtMEBbFmG}ul^mE*!TC3zsm-pI#_`TX;RVs_+%QHZ`lQ}j6 zrdKVo{PYYc>xG|;<^9nWrS=K8k|+H;UmVMe6;BZ1o#TQG<-YP9?M9mON2i-Z3+G)? 
zg|Oi!Dszg%;{*YO95+#$6M@HAv4p(GTx05*Q2H>O9tm4SzodKZ1?7!a?|!E_QAZr!)?+&Y|HU&13#`m*B6qFyg0Sbw+s0+%ifzN&hOmUHe66rZMpfLR zArj?z+61C$JQ|ve@Gc4)fhk2reRNv1rkRpFW(`~hHpZij3r+kZjP;28dr35mOo0^! zOc|$@DLT!MzQ7aa3C8ful(b~Rq5($`|0Km)rmuph3ZzX0AEpG$Cmi7i@q)(&=lJKz zQwN^^GChuvcZs}F@`B_|lXnE3Xp<(L9siO8Bfm;XOdQ7m_ESz30*=&3)>8x1(5~#1HLcv~# zxH4dW&We>r!O@)bcf_4dAGoTL747kgJ#qIQ_?s>5zhJ9<-Fxc&qsZtliMtygm^rUM z*}X4W*%1#sCR84JV9@uJ|805Y8+9wkZZ|I1t(WgfnYgk}B&=*o_MJ}jJuUP-z25h1 zys~Gru`kg$C^QZx-8Cx}@c@$bx*|C19)N5qOSbp?;N;sU*PedAZ}sGQ`;+p! z4X0m3>$>`GUlFs?PTJ-*x~xSQ^|JMQn8J6YNlFWnz^?uY-8 zyL6qcb@&_Z3rTNR+`IpNRo%)yp{g@cwOgp#y>u*D(R=&ak77TJ-5LDhAKtAv7Iz$^shDjxc$BMJ1@T1Db$=?IEs3rpLf4T**Kwiic+y+FQnWI-^5SYs z+}DdPc9+Oa+;DennK^g$qdW3*cSq{Hj%(;mUC?o!@|$OFocXhJ3rCZ7=j&$|&wlHC z%D{Q*F}zOK(t(w>w>oZh-1e+H`s0>*0{k-FI zo*+6U$0_*3ya%|0D`p37Z9_^8Y;#-}^Jrl_ti?shT^uV?us|KRq??L;MlV!+GTkKL zC_3qhf11WJN+>aN6Dppk!~kmpbNSr@Wd{1V`4ZNO5~U1CSxP3V(&?3u(q~{+fcNvI zNJS!q92YB9%83=lO84XKYhEK**I@PHpHeI9Y2N4PX@OFDq~>$vE>v>!|CYuH7@sev z`FNc7M(dT>m^a#>gq5~O3^DIz-MlyE#Yg~!!$_1)5Z5rr1szIlK_^Yfd|9jvxSbBT zUFBls1aZT1Tzc$)mj;a7VkO@eZBr6yw{}s51rblQLrEJfTGaguE?B&%hwoafgzx@d zp3)xBg-SwArs9Ow8dFD}*T)K`bg{zu@+b+gn49QQLh0XFdCU~_yj!B+&+|o7h3OWu z_LYKpEw9{Hbiu;;iqFw5b(F#URjH$iK)M}DXo7faIWDH=p0Akd{~WbDmG<-A&lnA7 z&S+GCk=U2h96ZL1FI%JiO1!e?Fy2P&Cqv8x8odG(9;nDg>$~1}WSZzHuAb`#)o@*} z`d+L>%&tUVv&&s?1b!#{)TzJrVar=}zxkg{GK0h1Vf z_o(lMk%+{a4~|4fvLyhThwOmw9Ek$JbCyqCsTkrYE9rgFX|VULxwxe&DF7>tPV(%^ zGPp$ppwmWzAz5S~VE{Ay2;r}gZa}Cp$->$B8E$|kF)0p}aA8mqc|Z^n@SPmz=}c-9vI*YDEow&sHW7oNT}5(bZb1|k(cyE=)VHQ+nS_*%Z}FM!+5 zSnAXcrD7Uc>PSvjVu20#7$;2O-Dl*I*;pA{pk@r{S!)fXwtAOI1cPK&3lb1pkO-bAxBEhzxFU5Vw z_zlBOViIM)05XpQ;_A}8H_+923Iu0dGs8TzLt?sX9Itz=G^k%RAk-Ws!i>2EgTyae zG-_~bbgy+Snze*1iv|TAUxI(yeDc? 
zN(-77O@Qrri$?gCMKk>TOgKLa9f5PA^%At8Bn_lFOMtc(AB|)Uun`3kc-%S!FN0|J z&>$}AwIxzYS7ma(G>e>v7{U^o3PwnTq;M+?q417G{l#tcAa>Mn23S&g77{BNs!j^t zF0U93DaEhQn@BA@iS&n;DS( zY2h<3j6+ySyN+a9pH{VJ041h_r9!RoDc`4szDqMvA0HYGfzu!KeY!i~dm3A9NPR0fa~s6rCa@dzLTkIYfOSu{d(M&dZ|5j0&ilV;RZ5FCW7;qj4KtdYe-&@|uhHw`?PJB51dN{p$a?5yIEhgA?lib_ga7FH zNun*0zl8VqVEWfIy%~EW_WiFe**0tUt(|_qdcF498-+`@5TWE zx$8NTbT=j39fG@K^~yc>K&pcC1s+s$&brMKU$UfO<=K@}DNfhfnrwY6?mvW>cw>LO zVGrW=p3$XD2G3dDmcdk7oZ?JH#h^-^b&LR&?>A>4<2ggIzRE04WfSjA*t>B~uUi ze@^WqzK>X#js6JInkDJ z$cXv$Fg%SN!2b!=W+l;=R3mC;`%o3zSjVAP7>SNdK{{j%kE0ELO0E1hc|@A=-yv@X z9*IInf+QFl2|{-gVCj$O)j@dvDt?Z_v~{FS#CUFl^!P|#17jJ8PPP}N215FT5iDjg zINMl7Qd|*>yBf`i$~h?IK1#_evk%JQtimM5;2YFFniR1pV?HE=M|5OFv7y$DiDn6G z@^oa1`OqSbN-V-c)-)OuRyY4;DtH-QglIzFm%l9IV#=@lh##5R=aHFx+3l@WOiYQ*lc}vaMqwFS%$Mry=t?j_RZ^WTqjVLbk-)EU4pYK-g6Q)`3mFqN&u*uu0-`Iq59N% z^_hk9AC$BtoxTSKV`<_2lIBE7hfvayGT{jcU3H1#JwoxG2YGm2IR3!O6*R6q^K*O0 zM+J`8pILn7^)D@cDPGyWQP7bpl?E5c(!PnIMR`^!<_U`qjfu-Zi$NuE}0|Qsn{7D{Hd{QSAop@;F3Z3Z+ z0viRL|EA|h{Xgvg(ZL@cj30h#qxT|O$0|%#wIr(gg{uCAbDOTJ<*9dr8?N4DU01xe zd#Ugj{z`dRCX`rCta2ChP`)Pd(&We?4G+XW#cOR58QyJ8?A$2 z+9sT>g0nT=KKRg}Z!5$kmDc{=!1Pz{=TOP${+<5 z=xk}_&50WmE7fab@u!~ID0z1A_(#Q+E0*QCxPLHSb$Gq_NZfwpK2&zBo3?`2`xpCP zKe%`>UcPI?*0)*KkSJ>v%32d;yM(e`@xJrxWlt=g+RU7wJsSmkH_NKuEPtas>1$A5Yvk2HW|+pim`w450p73elM&otUi@I;406Z6kz>zsh}J ztVP`Y1kD}K?|;*zD&U&cjiYnpC zkPjGcikh18a#qjBWeCw&6s7bO7t>L&kZbfOODd_4868XpdOohWpf_3X7kg4B3ZR$1 zhLt04%%<|#qtrPTv~%8y<%>7wQU&a>kUHyNL1$KPi&$K-(jympbZ_gDhsBp<^{JG_ zdDTvpvFCEGws9qT%a&qIO;u7StLP2umwcMzNGI9sEWNN zQtX9dE)+wdlA0IF)6!lTvrDSI!F;-15VS&7*BY|PPf+Bg! 
zL*8Ins4Q3>tiZcUyekJswv*+rLS5CNiV)t*%Df)B3xWEJp~zf=%Q%&k7j^oOuZoo; zwXGJ<)wteMCf8Yy=Neq|DVCob@$6&IRCj);HpogIGY6aSrY_j5_Ob=f^+7-7WW8?1 zb3;aN+VI?%k-i##$W)kEwdhfmvpg*@i)$nOpQRV zb*>+L9VsZ0XVOE&$L_UU1z|=wOGmDrZzLd^fINl5gqOcGJT)^p90|RMYeLG?rI9G; zsNisP`U>EqXiY2P$)uWumN9g4RMqQ2Csb9?G2lUxMpjxK7;=sb$+%S>!%zt5JVx@M%B?EFgXfL8nb@9;)6ztlXxW~Sct77W zKpxhnywff(qlHAXF_-37KROSz*lZAj*#gm5sYohzeV}CX-}^LZh~-&9H+ohx-Fz#>#!=fnlzOpPZTW0lZv-Vmp}R zcsCOQZ43Cs^o~ZM>dwy)AKN#gHbZF+-LL}yW1uL_&ZW0XRoPH#r%ELlmR;kv%BTgt zooh_PKdBb>jP!Gi_fI@A)}icXGBQ*Glu|CnFq0x=4t_d=Xr#Blrd>{2|2FEQ)V~OW zvSL2PP?3?jx^vS$^_Hc|+^1WXxQPHQc8sT}^D(B3SV-K4$w}qLq~FK?6jg|Bx{C?^ z0vPqdVVTtv5uL;AE?_7fV^`Qf#HQzehO80`;XhEQj!Lq!Zbrh>xMpgE903d$a1g`% zf25ZZFJT;t^|X34ZK zh4Lgr6Z0U-8Hq-D@M7efpBM$6wl(U^x=bo@APc}2=2QKlxB)d11X`YIo| zpjZf=vGL^Dcu<$64ypqcp~F#L(HXZh(t5@6CRHzKoWNF&%)x_xcnjCy$Hfb4a$x_0 zal=x3zq^-ytCE#<^jns!Xrf;)uwNvzB`aH`Xy{STFV3$R-*Vh?+;w!uEuC>k=Vnvu zTjRIJ<6XzzFA};=tv8*1e?(|HPdVyA7Tj)H+x6qe-g`{wIYm)b$?^u2>Da99O4{p^ z-pYj6FF>EjReE#ZjeW_I@Z`i3>$uxs7fm%`<;0u07OJ6GF% z+x?^RAC|BC*tk`JCBj;l?860?*X;MZ`{=Qp-B3h*wKM{C7|PeJi`ILV8VTV(iiyjT zFc5M=e_QSFvXrJZMbOH@ps1efMMD@Yz`sUj{sO#=O9MpR;@$A@oL6NrrPl>}^EFx) zgtXwws-t*mSHjf}rgFwj%B-d$vD8!&BJX$$K*ha~NOo#t*;W?|2(pl2sZ6y$;iwTDH3>(DkiP!;|3{-4T-nytyJ$v+#-7sDH0=7OW@ACG z`{NcrTM^ogr9*$G5mrW1$5ZvWZ4wI<*vf$CT=llO&wBsI)IdTQrHc}OxcG;0OEqhl zn8&&?9?ZH4LV$-YR}r|Ygo$h6X(TtpF- z`j>5Kt^KsDkS_PJy8woHBVj>Wt%>PK-v;lra9|xl?|D;H#hND^N{iCJK~v16h5N@0 ziWUd3D(w`SeT^C978R%KP(l+PC6xYEaJQq}lKy#YKh|UOd>;CwrIP|}2nFk$$IXT% zJ+DveKE?9VTJB1!B_03o(^P~O1rW1L_|olBLdu*W|Mzxo3$BnNgHqRgzH$dZxs4IC z%v-g~pCDu)#|86f#*n@t=LH_;0Db0dv3#wzYUv`yY{9&DRSX?kZJ#%10di{4AwdKF z<^MB7&?-^mJG8-;(T2huO0xgnB{^c&&p8fG&y_)W*_2fR?E@?X z+&w6s=-57YG3XT1%qpp|qqNgGf5&tM+tRr->n;2p(-r-W=^VM~UW?&YMsduNrktT0 zX&I(NjJa6eM5hu?|ALm++;fQ4m>a})rSmEwg=&Xhq5idM)I+@Q90wRiCp#QV)eUup z0)OG$zOyjrAdvykewy};_>MpE1qk!$(yL6dk--?EvWQXv2?%Lr(03KqHUh`8PBl6& zsNOBY8`N7J;9f8&rkf|&ZYFCKV|rO-!jYvik$tmcoc~)C6QM$Z0cYIo*tPwnQ%sP^ 
zN1lcz#6PF=uMuUj>m9fcTTu9`y2Ce4Ee$SRPrAyN9)mJ_v1{r0pV~jDJ1pPbba`~= z=88crjqxG*;}Rz5C=DV&w2(cGIa?sDeWHlW4Z-orUlRfq zki_2m5WMIw=i~V0C*g_tA<(@cC~AfHdldN{@)-6*R&uiKWBJtw>4L zWmp!I^Y2m;QpAyj7IgPPl85pyQe>38!|-tD_=TCNE7COw6K=7CMfh!sw?M6nA#quk zBBU}zD-UV5^xE%{Xd0S0(J?m8gCwG{8>S(_RT3H#{wBSY$Ua8(O4=H06sz1zC4@vQ zx=co8X0DQfpb4q=!r=7vsf=eEu^wf09vHfy^epj+n6Km;$&A00j=v<@(uE;W`r(o^ zDm)s*|7Um+I;LeBEc-`N8F8uxaOBlDSNoX@^iOCtKEMJ1nP%neuBBk2q)otoTU*lQ z-7K$upvMCJsIG1GXrg^kXdm3DJG^B=6jB!yZ58BISQi|Sjo2&Vwz|#Y%6L`xdT~$O z-jj6IEm{^#3oqiT$AU3osS+$z%a;X9Q_@qG@B{=;VD-A->4$L7!pyBC?R(b-eth7) z1MfFNOvi2txAeRLr`aYq)FHzqwUp7gWS0~DM3+20S&)wO# zUVeVzwCE@YW#z739@}Wzeb3eZ?;ij0`S;G_7D&R~gsU6BG3XBzqYM6yM9Wd3qyk?66$s(>IQ_mfwgX- z?!cXj#Nd+x{?|RZbozc>ukA z*S->e;>o!0;JWX^(n&Snd%bH$ja(VmQF(! zRNEt!g$rWtwL$zv*2)Cmk%SMD@smo9+EbLHwq)rf`c>bZsOuN%`V(~r(CW1-@4pzY zJFrfTh6p+570+H>K!2)gAi^eZlP<3OysC3qhkGY)7Q9iga^-gIt*H%f-%iitw{^F| z8{Xdgzfbym-rkghtL{n_bFE!Vr#IY9Nq;B%z3(i!>A2xYy35r0%no!5?(YARPbxx? zfR!ekwSu#DrSa#^meeF}x$H?z=^)9JXdwc9iZdd>D2pZQNe_*12+`L6>od zd@(&?;Iq&Vj#5p`HE_Iv^n_=*>zt$|EZy%mNU-+5AQ1z&(A#4?8iHp$#b%vGqH8$A z`jxWz8@6qJjZ}X_`~NW%i2biO<`*U1-UOt(ZhykvDY!dt=dHWs&aRu-6K9 zjN@H<%WpRwWgm08{Gt!+&XfVak{tY%)(-A3^7k6nbbAf_ITV^j$kEWU;e8eXMrz7P z03KDBX+Vl>uxMSyfe8k60M(3~G`RBciIoc93rE*w`9@I$rn{JxHR<@GLrXDwL52(GDTLyyzsul*>R0vd&C3fbytOb>a>ezxJv|=aT zh4ya6!qQkmh}GZ{z~&Xo$2$vtavWkcD10q>*gN1}kketW^8*Ng?fP^4h>n@7P|^>< zvR)KB7kYFI%VLa{zruTof#qSGkz8va%HYG+X7=iJLkt>KMql9t&oyqF1+8Sip~8-$ zX7FcdHCG6hiiD+2u(T!ZE?fzezyRG1JimB;c?2i09A@t-1$$+}-Xqw1eq%FhS$d&0 z!Y1v{5cqv2O<_pDHm8mWEj1=k7FtsI%xO=W){;n<4ttmSxn^dUF66^ig7lt&RD$l8 zGbxF*=K|;aCI(E%NKR;km-u6B0vQ5*0jc7&g>_u_% ztPo5|SegY(^M=KrtZiH|ueyIw^L7o83gQV@F+fB(SHe;+SnBUt8n^TQi2uZ<;$Opu z8Q6@wNI^Z};*x`&g5gyoZj5UJW379YLF2n71$2Rbq)U;%`f(m5`u z-qQ+N7|#wea4k8zCP&P8In94EW$?l@M=_?4s&ZB)QLoHt3La5+wMHy7473hOEk29_ zgA@+pyg8U3gN*}zE@++4!`(wr#L6fW7mvs7CX#E%@?hM|z<7U|_>$b1iF%DzsUtU^ z|CxQyixN4^TrB_h?EOEY?-;ixrQY-}U!1W*%~6%_p{!75K?7EHC)(0^ln^vJR4ypV 
zyft}0>Ifr+S(6LHY4TO%)enw|?zMkK+zP+#oJ6q_u-?Z0pyj;P3U4P&D~(+Qnc5Ds zf+Z=y(`1%s+L<$FeWNqD)iD*t%`no{SFoZ8bO2a3YCQ3n!r(Q`KqE=Y-0o|P!1Njp0T{L$ zzdSV!DRUrwNdN+iN!UVy`a0yp*(QYjWs+X!Pthf@fNW?473Fw|3qspn^x$lUf>N#} z;VM!_2ft-_W=h50nCx3J{y=;eF%P^i!pkJZ`Xr+GFOWw(0^UI$Lm_^KkZ6{b&kKkx zqgoi|os>!>kKnNAP$cPcLq)GDTgP0H*=3~%kVT^)87E^{3~o5kPT|Y2Kr+}UCR#ud zu{bL`>?ucZd4?3}C4snDnLB#&3rP*WAZwk8r8^{!5F(ali%4v`4-uz@_h_t46Rp6% zLi>plGUfNYYrrj&{GNYf*rFd$k_|0y4crTUa{Zopu3rE><)1L=6 z9NkH$_r9w%;pz}vxZ|?n8bAbybf-Jv=oTEf9kby$xC#2T@CFoPb)J*D`|g^Dc}9?` z&`g!+*+#+LxH7u#Ze8sW+yM|@wT+3|{X*^jMC~D=_Rt;A`}+0T)1-D@)|zy>Z(46y zp|ri}EKj=e-+S}?4e&WY#We`-1}J({)TtY%5^gGY%VsVr-paEwHRy7fzgRv^);Kx` zC zcY#~dRvdRWKeTYrm6mEk^rpKr>BawQ2;ARvyy0MS_ZGq1LeG$NOG0=w##4gER2MH< zQ*~^NYVJB}-nHHy*m&%Tjg}`raC#Y1Ns{%Iiq(k?`(E};GWONB4g2o<_L79X4%S6h zb|va}3-!ByZts6EsOK6k>KL?0FGaS*M+w_eoDl(f-XcsVlkCD-te$cz)VD!f40jZ9 z8tT7iP>@AvjTtgzj4EU&Q=owEWGGaFo{(gNfZxV*b9rYaL!5I>09+B=SOv=I+h!>p zl$&&yrhU%D>}kS;BV(nqeQRVYEni136fVJ22=i<3vbY3%Jr?&|8l6>0z?^M=p7z1x z$V?J?EbEQB-VMht*{-H%>B`dc@wyXnJ6)|HvzoiGx^2!sH7-55sp1HePZRRj9-gmtMiLK!8eg?TD7!=#p* zGS+FDC*3}x7IR!o#Zvj&w4^s?j;b`BGXH!LO3=bM(XiL~)#);kOcS3bYiI8ZP zXrIBzcBuOWWw)JwZZX3zCEzQ#aw~>$F;VLSGIpxdEE4x;U z*PZQTajrRykSFR7$hi1@!hHk4oL9fH;oi4dTD#J;(YkM=bU$=H6RtkN1(R+YuEY0% zTaq+T#kPTHJIYrY5_Nq-UEf_tU$V3%Ub-vp+yx(^(B2(3>~H7-r2OTIn5Q30dL83p&>_4ePds^6)EJkC>tG5W`q(29$h zmFvte#!HsX%Ep!O&ntUx9EDQFi<=c~YJXNnq5Qi#h<-exAv-qxku6+03Ng5+ipZ~( zqxanYl#8p{g;DVa5~V#tY0vFF>!tgbjLGJqB9>A$S+xjV;kp#21UV@il zY6P$z_FB}6v~cg>4qtzPNqae8+8WkNx(GfV7K8B;vcxRw+w_8ir6)`2e#4wQ(E429 zD_Ir<<{JQ#3$mivHw1Xt?PM$y(JWaEk`0>hj}eR_a3+&Hk`)d9JYwedZUZx#Y?m}^ zO0}*2eS$t`QHEl*#O$)-K6u@MrGkg~9Bvz*SUiz*HYc0`!5LV4KJE;xJ5Rs=MUX}X zg{0lxPNaX;icN5|ZROz+7ANZt$QDle2gq`31cmbn@}7n#i6Fl$@h<3)nQQvp+WtLi zC~;AL%k-+ZI5ZR-g}Nqi4(`_O2c_+@B9iVet5TWFbw-3FBgH^7NgPA`<~sh1+H37m zgczw>X%fMDEpZy5MGz6tsmNT+aFl!9II6!4N%He9yfZ@ft3`&XhJ0@5U-KA|dR~or ziksDqf*hx~>-3$4zqBd`)ISy1P%SWr4}iyJXq7^Lu2E(o&yG*Sqeu;Zpj8rT9t(B 
zWSBHomBXh*mX|8|FUxnr#4+BxIt0+#7~CSc9XTBfZz?RUc2YreP>80ePZ49WZeE_(&7I5 zj}4snNnM1X+E4PmgZNySQ|{i_4CHOZgUu2jQ^(kvH=|47lIyf=D`gPxSjiF@Gfzt2 znY#xCk<;1-F++wJ$e1BoP}gVBaHSlRsG9|YNy|tTY6a7&Tvbw?H^pEG>vgldDkFF4s=AohpK(vaFpzQTx-cZV6f!J>gs(JmbN=nHP31*paf~DXnikNvQ3y0F zoW6v!NpO;BA!i$~z>7Y%N?iNaS-~P&-!|SET7wk|2vpW`I>$b`j z-CbL4(q5b_a^Ljc@Gdvv0)O#x>v~b+g85hdM*y|EtsnFsfijbM;n>nsi)WU*@7gPY zE!aD5$6$518+U^wc{t2}+_iN4{l^^Qqy<;*8a!y^KS2|-))*~Dw#JU42cJjU&BUk3 z%M@23J_Q+SOtTyncGo=S9;*P*@_9@e6Vz$3yI_!BVP8PJXVhG67UclPk!^vf)sGJu zV0~$dWK|&_2*3j%+001)Y}z1fRAq*xKgXbyj+O$BLoweNV+1M$zxhy{!V{5zuR$Fb zYgU6%#_z^H#YvFs(t^uKBTgKWg5)?Tv!{*aT0g54nAWt;>!a#b2Q5}vx>qO-dpOrg z3STi3`U>U-$X?AGNW6>1S+*T=o#42Z7MWK-)BBrWv`ceCJb zUe(=mx5AQ6Nz(&6=dOp?XVX*u-QIXj*Lp?w&pkaT($%o?RkSXdlg{#l( zECv$ZMu(UB;k1!4h-+bUQ2v-Dy_e7@U$eATn*%sW)h)F*iQ71U+Rvue-^u{f{jI0%{vR*1KPiGyGMzn?iq0%)!V=E{<3wx+z8K@g7v&C?OG}k?yUwIW zs{e-uuMH*R{3j?^CD^J~O7GfQm=ZywU~gQxykQR{Yg-nM-m_POiEEc{NF$@_Xr;3; zkm1t~AB9v_FzpVEL}m$cQ7Qz}c=x-&C;oX_QwDyRhKVgcf=~RHatEwXQ6H7QsGMQ$ zQ9+ur6X?)81t5{aZ;fE7S+QLxD74$es$%bA*!Ph zW24rNKqlK#du)`^Q*N5;CDV9PFHu6K5^Vw56HTknR&Rs3S??{>J0ci}G3Hu2Dw&^M zcLd@V>FdYZ3w{r4HX4iHT=(c@p6&u`{TGmX`y8=vAWvm)9W!!6OzVE0=neYq5 z1%?arC}iPk8&dfcvT`18%0?kO83HVz5QIr3H|;mDX-0Br6~MR6n;|W6{gMW~+sK5URAHXUy|Oojn%!gw5d<1&*1hw-1>Y@Ttk6JR&Sa zI{_x;YK?>=Wa5nMM#tEcHpSJ1V!uP;9<@b_puN5Y?2hSZW9gG)cS+|XMA zk;h;QvdJ)mffb+x98XY`f$l@n4#vehjwjDSl~9w}*Xwnre)@Vne%4y<9J$l?e#4!8 z!oc}YDCPlq(kxNNtXU#X0l6&#TG_KC8IROLCh?_sGYm@7m)`Nc^H3fPN?No&7c^#P zvub(OK4gV~LmR#el))!o8Yqjd76@hWx(G76ArL9BO%#{U%I9qnUDn~Z9b zSy!1Gt#+qbR5x8I?T6=)8=LN7F_&M+wq4R*w6s57+rL(R=ZoW((8<2cZ>r)_1hKPZFcaV} zA=kxY;AV9qYc4rV_#u5+6Y@>WGTSyZ>SF$Nu!BL@hJvzu-X|Rhg!Y2o}Ws?AzBv}oAE zNg+T`@ey3B{ZUHE941T&X8bC92XBoT(Q=USKFYOoYUwm9A|XIv749+I!9%!%QWvIM zvn`s!bu|}gTPzZZX3ygppX@%G$!G?#K&C3E!}zE~RD!UKdbo}`R*(`JNzn{D9W&vm z7G6RbAQ#+mJ>-~I8K6ZGM414anAZX~LBuXt;}+~)oA_EKs6D#NzU~C;><80tPv3PO zxUbw`P1rjHd*_C|d$XwIMyXKbUu|A5>RB+eORwphlPkxG6X)#VQI;)4FR4}b6BX*&IzmI^#4S*8ScH3;8tXWZrN(k_G 
z8e+}#Cup2IP!mwXY{t^yU0a9BPD$8%1$*y?y$`B(kjP5|mf1zx3G}yh0v_Whq0PB2 zy-FE3m(u6B%oKE8m(NEr^mUe!VY*k&Fl+ph!9RAoni9J zxtRVH=n~B9zpB5kQ_M~H4d>=OPskPn#zye15qwvTj$7ZHOR*r3KE>(iCawlVh2vA( z3{!{(e2pd>zW*VOXXlQKO?$d7($LZhk#1s@W~?uE^QZZqy+^V@VaP9q&6#9z^J?D8 zY`p78+<6q(j;rv#t0ZO87nJ@(%77%FGOg5CDyq5f=kGVXXt3s2qSHOWVw-jL^}&ioYRfqeYH_;|`#gzqCbie8^xoK2Y!29txh zKopX=KopX=KopX=KonBAK)-R&QXyKVkQ%+*mjD7-kMVo|T zu&z;$cBC+58^kgmHcTHQKZ^8O)EY76Z1~%Lr;XFN)8@?t78$BZs;ZHE+YJLw?iC$w zh=X*7EpOb`S06Ss^Wo@@v>9vg{R|| z(;v9%!Mk&tlZCFNr#4xGL$V4Q=s+zV6d)Q)(N^@pfzZompz3V*#*L+HY4{KG>(r$; z;fapX>2MfYe$W_<1V%26@@NG5fE$2x71&4*CV*=gH#~!)tsV6)PpA6oK%_Y>1Dr89`GEAu#Q1z5{Vgu(`)TDn=7>m4bT5_LxO1uiB#9w@ zYy=7el8J8rBhsQN0Gf*KB~J=OqVEyurH)rn|0*f1G$SlFAQ9136kW!OkfNb}6D2+A zVJUAl<)t$~qTR^$z67&tDYT!wgXA3|?=Q(?3_Av!{u$PjBsHMD#dbgre_pigm%4P2 z|1Of?KNq13Ffb6M(`{Keo&6VF{)b${hg|vJaK*piyuaY=A9C*hoooJ(tN$f;R^ZP5 zk~{q&*Zv{b|B2qB(|^LjdqCcAiaFg$-7mQlA99C3$q zUpD{BXj?e)wKFg49-1w>z|srLk(I&aYq0G75cvaOguC1?J3clWbQPc2j~aE>2ORmI zoUrZxWxR z+Pk5S6UITfT_Lq R6qSc9Z!UT0LWphp{||o3tUCYz literal 0 HcmV?d00001 diff --git a/lora/__pycache__/peft_helper.cpython-312.pyc b/lora/__pycache__/peft_helper.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2515c13b80a81cd0dc31b1bace5412f3961f91be GIT binary patch literal 6076 zcmbt2TWlLwc6Wv|d{Y!9>Sf8&sETdTvP3)HSoZn_M~WYbBW>4?Hz9U07;`SAk%y1m z8CnrbZjD_Rkhj5F-hNm@y08ossDcD-{+b20*dX1H7X2t0s*pUJpfUOYivG}+iy-+Z zdhQ(#O)1)+_7Xhz-gD1A_i-NgocYI4$d91t!k=pA+YtI!+Hs0lZ*2Vn8uyWkR8B_; zyXF!c>+=a7YF>Axg@lk66Jpw(aHl;94@c=-x;O1h_*h%e{pmm=z}li7OotL7)^_VH z>2M;S4~8)$fFBL_1s3P_XFO>eo)_1qHTeT`TmpkrecESdHvFrE0?A& zSUy!Tm87m1h9#yntgDurRYr`nJeM^S6Gexn|8n;G=O(h5lr{ssfR@orS!wOp*_#-1LE@f17& zMOC0!IILF{KXf;a>%)FkWOSnHX;65ZC<3_7N%ht_`5JWoCb}S8qo@XU9cy7@!Nypq zX?Xylk=sI|O>Mi4674vuw&M=9RYfU5joua-0_((4 zA5weO4(w4o9d2qD(6ZC88D6#fw%C}_rS{-%z$`XmvNJ%E&JW)(7n=uYaMh+*nl$KO6*D9C1-K+hC%r;PB#^zW%Gtac}rS4 zr{grvnD$=mwJeTUemzSRSE^BD?R5DIq$Epf6hUF&M%#%#AZjp?g3n2Y({TRS~G=oZ95zVoo?SMpz+#IFc 
zQB=-Rc@Du$VaG6ZfA37bX(o3C#ax96cGXk~Cg|-;T1ql!F~O2T|Ct3TpE2^eT$Y$1 zX(ovjR^Uw7uth*;7RGK~zWlnB&45UwQBA>%l3&-#z`a(@oej>%p^|;qJ1x zdkYw#ZJI$P$u#3XpmCqdmXho!7b=6=phWfpv_dkq zR#RrLmR#s=mUi>VOLX3A0B#{F$+JH@`+2aJ9qEKYBH(nEwulostIkgtFC-&jdwLjX3pRL5tR^#WYBQIBaUS4c@ z6zKW9t*<ztM(elC5i=j&Ql}F(=*u>J00F=E$Uq2l{-N(P)5@6-8x4Kc!RnDL; z=I7A~x9~TICj#zIW5J1t`_p40K+DGh1_*Fwt*+k}A#|FJP+4yF3G^*+O2a3-2$p#t znGPQ9X7{^UL6L)3q3$C}LeWJYm<Vy;^6nL_KxcC&2aQVdxTP|K|DF^$WxKZ2Q%~t7&i*IY$JTOvYb^Odm{Bk9J`6=yv8EN~xXYYfa zm7dij@AWOYzw!X`nIDDP@4j~DwawPE4Zi;^x^c*i!wZFfs|O3T-(;TOu@7rq(*)-?p+o3A>7xEwj~jnNHP@TseFl5_oC zsB<#p`Yk6;`aQo5iIbfkE6^mWZLt#dVJub}X0Z}jP9s`^k$P!(FCND0KF}GXHN*|g zzP3JeV-ghQ5JWc(KlywP#2&kHdJp1--MPVt>zATq!7OWQL{UwAg&Ux-kh|@^YV&-a z^qK)@rs|$+L{tur#(&Ow?=>RFpKuGFn75GkJMTIK5;PZwJPeXPJN~xyx}Nh(5OkO{ zla-Vi2*(Xmf{aXu4*)dLZE%4pV3aTpVLPP!PV7Uo_2w**eKL}WlDQ!j1|;}OFasV3 zd=jj!WF-1PGC?x}iyI@<`4{@n*^W!1nLC|C6UIhHI%aXwRLn#Afs{r3R%ja^vWE`9 zS2;w+C^!cIl1u8o$utFh01T=_a3gk*)6&QWbv&c!+l+Vn5TSGL0f02nk2-obq9c{) zNHsdj^1-{u?i^csZ6i8biH?327;U70n{9o|S07wkxwd?!IHj(-;zR z)e>m#!(23(qRgm8kgF8XSA-pbNrcdJlmYtU8-rlue}bL}g6M_d&l~vZ`OVh$pH6Q^ zyO-PVU0QYBo308&kNXDK_OJIHxiz_XU}^uIcvU#^Xn6D^b$$5ct?8w!%R~3xT8&mZ z_Em+GkGuL--&pS&x%KLjP!&cVckEexX}u$M>*AtX6=IL~k9;(-zW;bx=v&5B;rOHY zcRqe&Jw8zuq}8D{;o-rmFi{hHV$bU2TKmIGTL_v@J4Nvo4mRhG)cA<_!s=uVLACb7 zr?lE~Ir!Rs+VZ3L(I>tEao?Bioi!0^@ScCLZ`FLbr{wZHRe09ti6 z6!$XMB95;etRbjAjy|K6eS7rcV(sKSy}uy}&H}IkjSsxoV7UgN{?8UI%ShbjLJn-6 zP!U_L^t;KJ?G(nSeKI*&0fx5Ia6)57YlLBF^q|Z59h}U=*Re6XOCM)U#-2Z>EFd6n z&}9cHU=rRgb<2Co{(^Ldq=1AvcH<%dFc6O03L-A}I~4dG^8E`sQAH=dKzsfj9rywb meSt1M_C=T8tor(Ix#1@Hz*oG^apN^aAzT_Cc)}3uhW-~UQMdU3 literal 0 HcmV?d00001 diff --git a/lora/__pycache__/request.cpython-312.pyc b/lora/__pycache__/request.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..851f0a1b732dac7a116a862fb4d832ce9164417f GIT binary patch literal 4310 zcmb^!O>7g{`Heju8#|7j#Q6ah^7tzlkz}=i*up}!QUG-qycMvlB2??sY zeUjgN?|a|--uJ$L|0$hL5*ViZbK_zUAz$O7GvWcTItjo6(TOgZr09R8q7)(Li5|I4 
z^eB^N<<6nCuP*EIr7}p0PAdMonyJ#!@=RuNAC;M z=>2R+A7H~}S`QN7VjtC*<7#(6A4v)$K+v+ysYa26 zYPwc+86Qsyf`aWb>dt5`Ool^Crsg=*nXzl8PNx{HIZUU91@x9?n)Pvd4j9>%X;@4g zcNm{Dtmz}va48(;tYlXz%+leMYg5a1DIBw}BOmS{3V`rUQ#DucF)YVrnog_So-=et zH8?uMI5o98%yyY-Q;y5El55x&vZ-=5$1FGCTjq8}u+d5-<~Se_tyx;low2#m08zs4 zc7lhSHrG^mnQG{?Qgd8s+v1f$V*;aXJxHsx?V<5nRZDtRowLYOxv1P z1-7EyP)%lenJLX-YQ@%>sS4IZF0&l*UappGt87fGx>0gVL1YNW!U&DRZ#5|*@FtlJ z`CA9nrE3vF_=t;?V(5|0DSu5qi!25FKaqf@6XBhM@a3o7PIe))iO@y>>QPtZC0vRu z65J{g;0JkX<8THyzvqVJL?ogsH;Jlr&TPRU{tnUOooM|;s*`-8@C={P$c23241N6w zod&Bm=B~^zdT@hH4pPl^xdHym&=GV79$C^{FurE#^txf1=un(h>^a{dG+M5?HO}Z9 zL;`s6IQ^cZ6vm%{704sWOF}%Cu9@KPCP!5^whK{^n3gmP9S?3| zm8x6M$2mUUOJ3E?8avCm%{gk@c(JpgQQk z-kfkN)rqQ69dk84&D^mYwG-cd^V$hEVH#5t)j9;2H8E$Jl?fDSg8SKLyju6ts#>)j z2rkQTRkg7_l$^%bZS94NoFBq8Pg<$W)6~x8)XqiiNow>j{jW7gkA6A%^w{L`vB~Dq zv&~)SR{GC3d(I2GQ6%Zf-Me`AVmlJ)d*g*d`u4SUzV?rllJ0HylHAC`nNQEO_8fS+ z=lJrT<4^XS`l|oF{L9|9LbADgw|;i3nK;=TJ-L!NdH2>TTu;^o3O-*1$_@Y)&=rwn z!?+CUg18kb=fkk+YmILQ&`Sns7Sc;&DE#_0{82p83)4LLYi8hSW^_3-+Dwe{oq+mZ zq~W{ZAjpGJ{*pX0CW%HiED2&x0PQMIJy*0B;Ay-b_lE|7aC>|+nrSVe-7*Q5zVu3 zJ@13*k<4=l58^b0)21p*uGN6`PPB&q4jgt4!Zc6*mdM^q-%Ud}r1KCC>Cx8k2!zA7 zB!mO{e8{$8{dKkc4siJfT809kc1uQ`@B{d)Z^85^!bjla#pW<8Ie0gvB1$+4zSYZ6 z9A`gFuYTE%(L3@ldWG}w!$9pRoI#$oO5r!6XNY*#0SLeGCYG%*CG6u^3UD_hzFm|e z>kE)pjpt?P^|bmG+$KYe~jL7DjE^dVmYE)fmv0UAhmayU`_HbF`R%wr=g)* ziQCZgF>4x{I_Qp2VjTIXmo%W4@~`){sj6rWRn<>0dC*4KmY$-j>?6qa{h?Ptu1DPa zNb5W4iyo5Qv3THN?u*@@?_SBg{v>nt%PYVA@HZd+zV^qD|M2nh@vF_m)t6HN8c!JT z2mtF0xVia)RDp)jdSKCg)p}dZXwFQ?4q_P+U({lu{OWv(Nd3T)@Vs2VqzbEGR!$Kv1`Q+0GOgH!YkIJ1>u zBc6dURjxz^zD{CDp-EAnt_@5IA{ zUyOY|_LcnSy{G5jyMKA{*n{GVe7d#&P*Wbf?<`(<&{&ZVZB2K0yeaQkoP2ofSMRRK zV-{D-VtEO>{ z<%8mO5-2_8vde2=9HGs(0poKKJyPN=-#O0kv#^07>tMZTM73&o zvv-tA!VQen1!DBUezTjPsQXkK>eplchrU={4O z>O$AtL^D}UHdEDzLjq+A42)R`cn=IV$Z9|E)ODu%1 z8QTi2x@!eNlxR52#ZOtsb3)Vg*Vh@x=C}QeYZuwOZ5D*}!GhwI9yI!21?4tjq^j7Y zs@h6bW4f)fgss6(XGuHJO%c*a!lCrxCR3e+k%}gz@xXUC8Mlst56)Ntqhd$2;d6@L 
zsbyQOka5bH92k@)CMY~_Z8*VTtj!vZ#{wE|FzR?PxM9_quplQn-a55Bn>JWj-@wMW zVX%khE;L&%YcemiLdW+4>NmjLaD`yQZ@V_#04pvN>wt@#x>&>Y>o7+yAxzo+mgoAG z{me?V!Tl!Pw7BE90~+{kUWZ*Tv(kErUOKq~r}5i14MLw=>ug2Xf*qF(xx=VX1NWFs z-)64qSxpvYTo@_UBxhlpdzJbCGa@{Ql%K$_KMCSZqClWJ%5SwT#Zay1;p5r>7*ryI zhJNhjk-MlRy(N~7gE0uf9=`=tx&({}k1$~)_Qf8TQ#^=GeNT82k0MXF$KD_gO=o>0 zl#!F}vuMrrYq6Ioz{H2R^VpqZTU+Dn!K&~g8pdaz16Kh=jNaRft~-pUJjeB za?5EILyNDou=u0)sn=h7=M*cuPOaSP0L6IaP1kLf(L3chYfG(8G&yJhy>P30Se{HK zb!q(5Jgx`aOa4LreU0=MNhbH7K!^7Gs;(I6C~XSNn`Sg&Vmh>42KuCFzT383j3%^F zrfK_i)8sfW(trSY1~fj4VhThbeuU4Uz@>!9gA%+3;yQVvCKKu26iLqY)Tu=7;lg51 z!*}^r@qV0{yA|$aUieo6I{O+7dxFFG;n-9il;M*cX#FdojL7nP$o@c=zfrbh{!cKV zOqOb>Xo#h4^$mCfm1bBHdm5^(U)QS>b^?eM#@U*k0%}b|Usm*KFju!T(4Oq1jEU$u zbKJ`p*}4Os=m4=7fHSSO8=63sz&D*J6GtlG=!W`WN1zheCLtLA{?|a%5}jAsYk;(_NZea`SQntcw$HS)MHd-J^;?|ocY+$k*X7LGkg9}_{0adwj2NoaFY*LgDviaA@gXhBkKj*8g65> z=2B3q7@CAnQr=&p)Rx6PsKU^XNa8P|SOgKXnMBoe49Fp%y2lU3(=!84qOnsTfT!lj zWheW}qsf_@l^d0ws!V_5@zl|WQ-_{r61jXYPmB{E=r{8>^0$8c=k$`m zT78HNFoY~65Oo)nFF=&SUk!&Dhv@kTZlB>0SqG!%3S7nZKXH!69P|8N6Rbs-unI9_uu}wczUOJdbfCHclqptNVlDOC{D=_VJDW_s9>7n z?jdh@P@@sGoT7Lc#VaVVM$2+4YV-@(MDZ2)1ty4}l247&AH%;G=dNA&AiQ<<&X*p{ zE$tLu-qp|bbTzSbZ}#4$`-l1j)LwcfapLj(kzNAcy(F1k=%r9glZkvkgW7UFQM~t3 zkHG5>D^Kwy*Txkq#&o1vwK~UillO^2-FID97ylGW_z6!CWrzQ#SirI7OCMzMHdK<7 z#PNSqG(G6y7TkNp)lObcga?n|MD&w5ZZvmL0pI7HDE)@K27R3)Az{G`z6%0ErzoGL ziE``{vhY{3{CD!wC&c&^?k<;iGhe=ze40F>2Gf>ta<>NnU_#z`L9{{Pus z0F*#G58WZop0npa|GE5^^Zn=WKPxJn1fCfG&FHUs3Hdb^jHgo1Y!EXcx5<}?NJK71 zg7`MYxVR~3ikpLGPW#RWc~)-;TH$Su+2Zz~J?;oP;?AHm?h3jX9UpVYD}og)Z;4gL zJwXr4TVqvmZ_vx~wpexC7xb~bJ+>`g6Re5X25aMW!Mb>Tus+@pY=}1o8{{9I>u=cd(n~ow1&HZ?Ko;U9s(Pf6yQA z3-)nDGSO-nw>#D!9|#Vx`ij_KJP-`9d}VA$d}nYcM@&TWNxRyJQk&EywMZRO{X}JOxAe61j8xMAm323)=6smI)6{p*(C^BPC7L2MM8M1wjt1~2FgF$a(pe)ezCQDFI)C*7u#Sy2Uza{v~QQpVh64Uv^Xg3hZ&o&G-ee$q3=Ut z7kt|%?Zv6sMCIvT5^(MeAsmZ}VQi&+syi!9CgrG-e5|dXQ%#8kq*Z6Aq|k6Q zA&JAu#8`A3zBx}O&mJn2T+u{S3B{7*;}Z3IRPW)iES(I`Nc82XGJ%w8^KdejiYLy5 
zX*d>>Vkd!FxLDh{RqeTnFcl?{v8WnJ$&z>~d9=W^?qqU2s>o+(G9t-xvdFLFltjmI zw4c+b|MO`|dv4Q6|M@Vb7(NUH(fDLcic7H6=-~qAN5CoX1K*6inV9$(H?wE^TrwJbDIAl_YSW$k-0Y;OeAed z6GMOJtZD1z(!O-wX%5=*@90*;k+d~p;>bC|gC3p({W%S5?3R-$8j(V5t`r7B=yRA1 zLE<;lPI#%_NH`Iq;KjlQq3VlJ2|Su+*;AUy`)%}TZ2Al)n>nd=-37>MrJ=-Hmol!4 zpgo!hontf^*T-`JDo?^sei@Q^@>wOR=~=Kpbotg@-C0+6uAwzo-Py~!v2n%v*RGY03O+b;kEnx~^9~5Bb_yM;6fXW3tvXMTUTA$e#916#_2E>&`jQI zIG4hx=y{-!@pl1|d6Ku2O5b{gkgX7weLtvwtA3?_y>DN(Z{OYF)xLxGeD^vseMeW@ zM^-Cdm>}MiTs=iC=1F1-RO^KF(^R^&7u|6Otf7Ar$GZywzIMW%d)*t zcCz|b(Q?^g@F~7QF-@YqsN{1zDC#CoW1y;qzFY;~THo$378g03O~rxR5d(3QNG<*9Fqt8+67Te=f;4Mr|NU(O4Ev5pik)%+`BoGPSh=|O9eU!+;n^k}LMkDrCMS~=K&*yYwMF+^RVTo5 zs>tX*Rh#b1R0sMJwsQ0gj^HIoR3}97DTxK};3~g_-_gOTF2*4gO^DL8Y5|w2$f_N8 zAQbT9?y?c;L2XdGLQK0fu^=fO18#Hhlj}f{!Eg2uyZeo6*RN&j`qpgy0B>)mZ=^HL zJML7kdY)e3-`{m$;nj6dZ`RYh!moJ-@10-ZA9^~L&u2aU8~lf5YV%G~-+cSpt!tUy zy=!&*ZnfX@&YyVbuDbc+jTaZ?Z@x5tG-q=!oV`A?cF9F*%4nu}*Q)QS1$)j@w^Xx4ztyljk!=WMJb{co@K9rw z+17L4*0a)m$Mm*8^W5?KeaAnaL#Aga+c1>z3}x&?A33)z@rze8Ekl`_J*&>W8Gf(k z_}#H&xDe*5H5dwGe4?$TrnT~>EdcQx+XDU_GPl7czfR^%q6t!SNjyiMjOS!-#teuZ zoa{R~R-lKJ!j4`J6N)_08ruZ8Na|1X<=kn}L6<{@qSfec&I0|LA!gtVuIC`;_1nU3 z7!Wi6Ge<|x11MnnEen$ns$-CLN#C^f<+y}kfjPD$jiXD98+l1?WR3V$3d;mGmXep==7^QQ$uKZHzL3J4y~D%0m8v zc;P^S>nsWJ%7p@5gqk^Wk<6H;$h7$)nd1EB(-8KWRTGs|TO=8ujK(C@0$?Ob;b6r! z3HVBa(*{%_J=(1VuXz=Z1tB^cm!@ZTK$+!2o8?vv;x{+=2Vf;*+zgJw#5)=uxGR|% zPp!Gza=yCXKJa*NPT!dRlyHuQ_kHzwGrWHb3z#<<392<)GX(CN z10_Ql%R%E311i8_gWd>JOsW}BrfQGM023hx2OxIHg;?P2U~vd$WwaYXUm*ufSfifp z`$HB-qy1;#AU8oWPab%x7l*z%x9;iAdb+5)Ugt zXw}sFiD;>Ai)m2w`ftNez68lU`E;Ab((q{|XL;^B(;Ms>_PreO_A1#iTsa{yt>iQcG(bCAC#_|9^UV^ES*1!*=$7U)Sdgah_%NDJMdz;|byF$&;rB z1UP~fV(k3aIJ(pwJ%cft<5Rsw1f+ZW)3gAe{0{nc=t!+Vj8WV)Z4eP+)CVVZqR2F! 
zwg8!ickql-<^BP(g?S{}&^9Jse_wRt3CZ^*ftGp_BwsP`}O zxvHM^s=jPh-^$bPRqf21NkiWwJ6g@ycV?I3Yp#KuyJOwmlXdqjD?gZhYxYj(_paTc zSwKMUeIL1MfNXl{%6qQ9t;pVU@5?ulhQOm%;;zb6^{l#jGkmXxLmUi$f_I-NZ%_ZC z*%M4qd&+G+<<196&;}q62&&45cm>U&ZN)RmvV9b6$QTI$o<+U;g4V8LD>1->t8N6_ZZy7J zMkRmjDf$FBXN9AKi?HH_jhzqof?H!oc~+8>v8$36y^Tk&N(n8XLHPsRflCD;JPIT! zMS==Ac?T?}Oz1J^aqV8BpfHGVq7X4E(Xl9?CBR!@;aF;X9HH9?p7VeuOR*ve)xcR) zfs-Ea31V_8fgqScI*k|+xiYd~in7Q>Su)R3_#aOJ2>(7yr(zk;89 z7ZUL0UBvEuW9s_U8*|s^7B8;Z+H#)yjJ-Z@C3f$+tto43S{h!n35ZdrZcMFv+OwYa zW&f&YAY&f@XRkMY_4=!e(IVN+Yd5Yf&8&L-8M{Akby*r7v<|Mf9>}&H_?cs^^=!u8 zl&>f5whZ6;+ebCTQTM>!_Q2lwz}}E|Slrf6iNn(HsERn?@L9VvI=p5dhH>48arvO` z_cA(_pL%N#`^iuJJBORi|JKAoy2-QSQZl6UbI6n;rJ_fprxhLVl=|3dZjPhRmLQ{o z_eC6ZgxoaWMYE|S6AqLXUnAwVM0QE;5b@k$p~SL0QtaaE_^bJrT> z&3D`?7gAl?p-$Ozmsb5+qQGu*bNXHAuL^c+`7a;?p@Yr0cy!Iy_@Mo{yO!1V{TUbh zwLtpFO?nP;_bjX3BN=zcCsuNldzpJ=Htni>|Hy^xz`>P6E8n<#=ss;WPB{D%#WmGr(<4HmW8N-p3Oc8ObRe0+h&~8Qp=%^_|0@cMnYj?X8Lra|y zDFWI}JH_3fBAlkU%TwIo*_9PVYoi!$QryC64JNpS(>hFW3#SOsDQ?{q13I;fb;mY4 zP&X-(f%V`LuN;kEIOqK*O{MJc`+JM zn8CbF7!1}fg6z-3FyY>5Km8{l#gIlG0^|e-4;%-8oOAw?^!=LD{(@A0X0mXm4Fbt; z+=Q$7fcXE5^nO4(KeJYGrq93(OOj8p_?r&G?frlZ{gQn7Lvz&!{D}|vvmfyLKXhzc z{PL=!b>0dai={Ki3pu_g$G7Bo$V1wk8RA{sy{KfG z2eWm7tZ&B#@o=Vmmxc2#nf})CEk_+}M*hL;j(s zI&XnDbY0z?w_(ms8e8)Y%sGj#6V73){dsJbcN44qjf>YWZd73DX~xdOc_4_%o8kSi zq9t#EH?XSm-v72A$#>o5@1DEYbx;0R|J|=;hfY4iZ%niP4{@vK8vpyGu!A z!B}yXmY%q^l0Kv~laBf;ROL?CKOOa-o5@T(?Q{lfN?l|lPnD#dX?}DDHkC=@^heLR ziv>V%Wjm8!y)(FbpXa{LIrn_`Ui_=q>n7mRwy&oW`w00NMhs^!1-AbY0=EcHcqUC2 znK_2RuqAC-w9Z)<( z2_!`!Aqzl9OvH`0p{W;6oql;nNG}0xXbP*H7hYc$BpIUNi-H_a@QEepSv+w;%BGir z!cs!Ml+;J2`)nz)y#!NpiwI1EfXx z#!}qI2O!?g2ZauynfA6xXy!w!bua=qADVaY&5+;8x0LfS)y9V*mE&8XEcV>Sw?VwC zifRPn-F*8EGS>qoI)pYlehc3L>AfqVXs6cjYF4}~h;+$yK3M(!t|U&rl#n@n;Ux~s za*N1B5|_jn`U!mmmGSPipBExe6rJaR>qTU&U5>TMDVj63(X*`*hG|v@~ zi8#m(2!-b0g)6C~pfw1K7X+S9WiG~HFw4-k7Oo^MC3a@j>%5PnMv_TK)!L%~qquJ0 zpgf$j=SaMqowGu$B=&^lY!E96JghAPv67%e9<$&qS1Sh=sA?-mX#i>&YVB#Y47sYR 
z2(`++hPLXfvP!{)hTUm~BtUJKkqfnhM!52Y66L-=g}#ZrE+9+RIHS!qN6xX(T9XM; zXvu4&<}45^k0nPKvO?}!W=}_bny;j=j71Z%(_#~NT4OwknvRe2%ZrOEw4CObFa;Pv zG+2dQOk9g+mKWoa@VcahjM#;QOzX;jR65J<8hvmhk;ic5lkem>q(V=V0Q~RecPYJ{6)P>=t75P#&GmOGAj6EIJ#o7?1 z*wTvTjK}GY5?^iIr6poU9tSN-cfq?(ihjby)@=`*kwVuPeme@$XYt!#ICT!corTDN z8ktri(`w|D5;=uw;m2&V$_^^*pvoRq*rP=&ad{s&dx{Ry)~~jXDXn8_>v5&^_*UzQ zb=%v%0;IN$DQ&cY7AV!+t%hPsD5i#{mC&>rno&YCTcLC7p2q`Y>cBZ=fHvJ*2t}Z} z=ci6~8#+spWhsRS%4iS#wnxBw4!85gaqin+zWCXf7QB$5^8@2yvvsC6~| ziX?z4OG#X5%Py%;k(0Ba7ME$Rz)|)C4JambjY^drE)|qYT*;<*j&8l6q_~s}^_FlS z=F=-NU~oo6(YVWqeejX6^PD&UzUHqmwa(_N-Mbg7w#-2!(gxmtlV8EQIZ4j|U7sI2 zy@hTFK6G~eTCzd@o52vHIYdET7Bf4h4Dxg}h^219R75SaFW)8NUWitA&z+LuLMCw` zE%2+6-5smsp1_Kz=&qB;zJM5okY=aCmMp7KNJF(m)$b1=aEr+0Wz4~XEmW6;d~}eC zU954CT2_yhEJ!s`%8*)Cbvw$MsNHv&dnkh7)+|(6m158j%Yt9esrFwYFmF+Z7=c8w z9X$@cF7!~?HNUAN*O>_fvETTNNQIR#Fn zJVhH=?c2RtCYIqDs3*M!-gWZCNm}+6*oH#5T@CM1!g~rm18UC^rRPYYx$V{)-*}_Y z5>Z*_ zY8{l^Q=3LLk;!Pypv5Xw1{yI_8n{Q*IP=-hx!$$iEAhvgJLg_FP*V+}PZ^U5+_`)b zl+K9!j6s91GpG1%!aF}dRVeNHPFVupWODA8U?-Nxj8=MKM_Kb!wQ7g4P8zwDFVJy7 zjtOUfn|!A%4>`}9z!__vuXwI9^0ZMK+NfLeKpucmJ?|>HW_^vo*6yCw>V#_md1Xm! 
z025Bm#wog3Oe{g7W=$?FM_rm{Ntl=8mnb&Yyt-PYW@w7nmAIXkSC#~=MYqE$CLSPU z3Op(%ofeY8%FK#jOd5u1oKGcXt^TTzx_C*Jz&^vKSG~^z_(4m7niH!X+@}Q5*-Y0rsLcD!{i@fm$Kh*BrzBak zN`fq+C9k>YgyC6%h#RU_KQCq%fpjvFhM|jMFC>b`XwH>HI>pno3E?6DM!HMlzNXcI zd7F@BQAC}qHC0gn+dX-CDVxg3nr@IoD4yaqH*J-!3*B3cEHcAKOOkLKB~AGeO&bQA*NeEYZ(7%v2)YH&mej^szr zyzfv(zxXit!eb+CGxQ)h4MlIiwiW0vm)s0J3{Drlq;2#`J?V?S8^{kmyA?gL)q4`C zKWZ7>+^3GsC}T5QEoaxy{%32i+B&YZju-slf=2XjBP}7NSGn>%7}pXbh>1 z-AZG(+BmH=PJd)&_UwP$-LG~}E8WwDJu!99jIw8@Ffgt|l0xT5F-(F-wuv`bzkafK zfUthm!zmu_&V<@CuJnwnJ!h4kvmbcQK5C2<8V5ImYV?Q_J(3?hS|rRdW$$?p0Fg)UTEyR!>Zh* z!cFEor-0gYd!aq5!t%9`=OYtEE7LH!ZM6as2MIP6>#W|U-+*oB4}VN7uHfUw!1|fr zKufORClXrwZyxVH(@p-l@8lqSe%S4r>9ziFZ~sh#_5B7G!tb|RW;(6!ciJJW*~CQV zauTgI!=c4bt+!RM$7MfS}@zc%61IeA133vUPlj|%JcJ-ew3}c zs(x6F{=Y;-wBDj_aTY$Xg@J}PG>4=CgiB>4Igv>U6oiU6lbSu5&SnICOK2zog9^-~ zdQ)bOMdp4`l-Q(si3zPJ^B z;jMFz5yQlka7+y!Q^Lonf*Dl&gB!h@Hg)8*GIDy`YH6)sKL;`bTFBS0`1pk)H0U=B^n3k`)z zF2__B@-8zQwdkwjSxRM$*}rSiXP<{E5~5@S>Rp8JYWwc_t*ZN7tb-dXOzx+R%UMEa zl2(UC3he!}J4C%az6(H$jt@k>)>Xy==D+2u5ts$+ZjD-ly?^UJ0vy? z9!#N=Rh<#V8M*x>JQ(vd<~? z^hw#~rmjKBg2}WRx)KmCbCSGcO+3Q-u;Nh*nrJyQno!zWjUCJoFyM4c&c5m<1aLd(KsaP3 zraIYCVV<{}Yy{A~rBn;hF0BP~RqFsPI(DM`w_9w$dPxfdzcuSs61B~)MrN1c@)S_;oiZfWYUKAFy6$Ta)kQlAgl|vQ_Gzix~`-AY7d0g`uR7?D) zx%J+o+-TlXB3S+@UZNe6kR?%@gVPU`(vYy)wOh;7i01d8nDjI7KujW>8i^^9*j8jD z?+kzBAiWdYc3a5(lb-RLz90EIigx0S=r-PlPhqDDJ>!t%3V`JkazAks*0;X;^`p06 zR@=vv_A#~nn9_di1JAKXjU9!?*t^o^fqQS{V_-{JjxoWxdB^>b-3t=icIa{IKz`4`N^V49Be&-^ z++bThWG6ntBESB)9l9u?3j52@39C)K3ZWe$UDlL8g`D5TC<10T)0E}?f*ydGiJRF$ zRaw7+$}ksFGsDSZW)@Y1EoU{1k2=abI$O`dxG6_d{?z;lRHq31N}Q-!QoLbM;OuE| zV~6*O<$Q-8-)UD+VrH9Nm6`H=peCVEpe50&dioSkAKKmrmFU5(=wZck7%T~DIqX%! 
zdpE#ZIJ910XlPO!`jv)$wP8$Y7~4Fu)$r`PgKGMA#o4YpV~R7j@fCIWm@<6qf%AC5 z*R<}Xy4`%)e-{Hl-}ZD9KwQRoyo5u-P1KM^Of|EqcB+Pd!D_p<{c*)NzIkrTcQo%j zs*j+0x4~BEUi90Jz}TN~>#iPw7BD)MP8*9W)m}1)U&R5LoL8IF{WlyHUJaIZVtsu^ z73b(f=lJdc*^B|9^4xLV@VV_0V`v7?1HbE*3025Cms4H77rTbDSyGyqXolN&Sm$}H zgvW~4$s=Fm%?ULys00Spz@!qG+zL!@`S$0X`{}r%eR!dV2NdzR=8l)nX&}tTW-e#3zW+9GNK%?gJ$ zViKbl(ZjV6Gw9*ILZw>gbmRkFfuOBLN3HKacrGZShcHLN*C$^iA6cySDS*iP4qbo# zW^cZEKxrJ@vK@r8gS`_UdBgUR$4#9@JAA>!ZwTFDzrhykFyh0Z+eZhCGy>;!KH{N=qXoono@AhujVTd~p#W(gA34ZTKx9PWd+TyVv-Z)q! z@VTF)pZEA}j1^lM`{6r~1fLB^f=|(5wIAI$v1z?KQzQ_)FTMBjKdsW>`%nT>itYyc zGn>}U6aU~W5(wUZc^f~)euq7@F01D62TvFvcQ(QV%V@AJF}g4aOK1-uKdab}kl@M_2d?_})o{)L6N zL0+LA)fsQ&>mbz$sXpE%)I%vh#N7~YSgDJ8w6O6sYL~r|Aiz1hQt{mMw|p=I_r(&t zZsDLMj=sW*{q}EY+-P1s?Hx_&olTTJp-Bq~3Gb7WLPk!naBv-6{VIpjftNYPdm1h^ zkI!;A`ep)c)5HZZ7$FMhyD51kMsYh1`8K6q*&UCq{q(rJ)&&O!%4s!xFG8+p7R9P3V#bX3oHQ~n^?BSzFq>B@YZXgd<9$Sbgm3; z8G^vF3=S|o*l^_YU_*MdQ-X%#>(Da5IXuP_MfB*Ejegld%ZM_hY7V$b zfv1}~`m^ik?=P@8UQ6gVGW4cN{1U_|QbA=?4d$XxXV)4~_diGiC+j)^6xS646xZbb z2{&|;{gJN?QCwJMdlk0#PFi8d3fvH)xJE#6O(hh!^7SLPPpXkoB{Hf;4k?jCA9xN? 
z6gTv4ax;1_lOH+^DDFr#h?`J?6Zyb1o97Y4?XQ41*r}U3)L^d??EUv(|N2Ra-zY+R z`61g4;H+uj*B>`S%XG+J3q9LpqcgID?U9M7j{Bs<_NEN}=_l5V_|4CTW-mH5d%pgiC5S8jArh}XG)Gw!zXF~as_lEaJXWFG z|HaxGcyPK<{hl{V{sD0J{0|Cm32s?=AHH z483>IL+RGx#|(y0qw6T+Pcf80uLfIu6VuRp06&Qb53z;KJ`OWH@jRo0i(#d4WXtwk z(e@ekz1QBe@zQ3Sf;tVt@Abk3p}nWr(FUfsr%2#)-}VGQx;>4}M#GxB^z0DcbyBFM zIRLT25uQF8JOQG^)SpJj^e3r0a-)x>WBOkPMASE$A0IeCikOJ$r+hljqiB%MA8v(O zQ>9eZ;|Ki$zoC*>G92hMYQ9SPvYbjw^!lH!lLp5%_@@O5(db;(OtR)Y1f79r21ms2 zLh~qh(u?51T_wZ(!bzC^{~(<|BkX^X?hi@NFD-V)vQ5DI1xuKr4@vYx^2|?M!JA*& ha&=sH{L-;vVGevm(0_8xeu7~_d2aL<1Os$h{|`fO4f+58 literal 0 HcmV?d00001 diff --git a/lora/layers/__init__.py b/lora/layers/__init__.py new file mode 100644 index 0000000..8a4f5ff --- /dev/null +++ b/lora/layers/__init__.py @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.lora.layers.base import BaseLayerWithLoRA +from vllm.lora.layers.column_parallel_linear import ( + ColumnParallelLinearWithLoRA, + ColumnParallelLinearWithShardedLoRA, + MergedColumnParallelLinearWithLoRA, + MergedColumnParallelLinearWithShardedLoRA, + MergedQKVParallelLinearWithLoRA, + MergedQKVParallelLinearWithShardedLoRA, + QKVParallelLinearWithLoRA, + QKVParallelLinearWithShardedLoRA, +) +from vllm.lora.layers.fused_moe import FusedMoEWithLoRA +from vllm.lora.layers.logits_processor import LogitsProcessorWithLoRA +from vllm.lora.layers.replicated_linear import ReplicatedLinearWithLoRA +from vllm.lora.layers.row_parallel_linear import ( + RowParallelLinearWithLoRA, + RowParallelLinearWithShardedLoRA, +) +from vllm.lora.layers.utils import LoRAMapping +from vllm.lora.layers.vocal_parallel_embedding import VocabParallelEmbeddingWithLoRA + +__all__ = [ + "BaseLayerWithLoRA", + "VocabParallelEmbeddingWithLoRA", + "LogitsProcessorWithLoRA", + "ColumnParallelLinearWithLoRA", + "ColumnParallelLinearWithShardedLoRA", + "MergedColumnParallelLinearWithLoRA", + 
"MergedColumnParallelLinearWithShardedLoRA", + "MergedQKVParallelLinearWithLoRA", + "MergedQKVParallelLinearWithShardedLoRA", + "QKVParallelLinearWithLoRA", + "QKVParallelLinearWithShardedLoRA", + "RowParallelLinearWithLoRA", + "RowParallelLinearWithShardedLoRA", + "ReplicatedLinearWithLoRA", + "LoRAMapping", + "FusedMoEWithLoRA", +] diff --git a/lora/layers/__pycache__/__init__.cpython-312.pyc b/lora/layers/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d22079dd40ffa0bfd4d595fc9649c4360e320335 GIT binary patch literal 1279 zcma)5O>fgM7`Bssq@QcM&!WM`K%gAfT#z;-g!oY5K!-*_L-Hj`8`DMDb>yTQJ#vOS zcW(R^{(+^PIB`QvQzcH=-WoP#orsi2uk}8D{rY)cf3H-EKz=0iDOk+`@Ix)8pK%r( zf0n@)-~%5LKp}!q#Rf5`iA}J>ewPnbQvu}FaUnxD?}^fqFwr_TC<_3Kk*{m z@y3|H4dVR{>pr$EvHFye5gqP&+#>{&PB6q?k{7pTWnb@m+{eBuA~tp~A7FnLRdejC zIp@o*ol;-D*qhPRohu0E3pHn#SidxNZnKoD_G}bke}}abwV+t;ut5+L4lfSUQ#fppr)Xzpst`S;icUaiwA_z7Gd0L5j8?l zt0$*PpCs<4a(z8iZg@6TF42noZzJMbs^B~2g-aRM4T+m60@22};nW#@bFq=)@x-NR zQ|hkbcACfzQ~f6biKP^O6bB^Inh86VJdlo&f93GhGi{Q{yRLkEuKNuf%U-~rM@~#b zCk(>P*pn|S-uy7yy1)H?3p*s}IpH|oXG2Gw;V3_xQ$HNn4TeGNx~*_5tOu0&BZ43D zHCYhKKiZb?%Y+b~aA&+n|7;?h&uK=@3EMf@`afO;7ATK&%P2_XRm1+|ImU3iQsVSV7 zoFP}OsRE$_Q{bHJq}@y{BM_0CCYG{7tRX60Dh`yrnD3`zzNU4=vidJB%0if%|MuJ| zYls^(U^ez8AgiP!l*R7MSB~AFI$qOF-EQiB+p&Yd_7?PJ zi|GONe3;rMGaZLIw(nNv?vb(>NA++QG?>1+bmXHC zFCU>*$F5h~onW!$Rj)XXTSaA62krK&b%0Q5cS6laJB--;?%SJ?Ro3no5Cmi=xlSUL zq_sY;@u?C}NkVIjb+6#+;-<{eVi!F}rMrlb%@(_|3oNRj*z52?Cymy}Z zH1PFcG0DV4w{8YHm!Q|p#$`Qd=?4dJl2GhKwQ&N*5Hbu!affy%He3E;_g%b+J>fce zsAgkx3)HPs%Yx+hjX3iggRTJw`Pk_A$uF-^w#@8c;P?2R|J?L}!E)?>G?CAI{7y5DaLZ# zR0bJdf>Z0CxclXw2CEM@gU91ycoQF;bKS+P5yJ81xert(gnPc&XV{M2k zwG7DgC5QS3JTJjIheIQaAz+oQk^pEtz8A_u_t<&<#QFL>TcTLUJbh5I-Y`AAPIX4x zj@h7=UhlxB?d$kP>RfpRz>m$ff-`R0p)U0TYMo-N#maJ+YB)aI3yR`8nL>^i%Z4}= zK#P&6h!J)7Z1&8iLq|cLkH0C`$4#| z{5sy-f2=okyjMIPrC|GEI{)*;j}tegn_u6)c0pj(MX0F z?iPxx+6^rl!j(E$rE!%Z6O*@RZ_VD_+uK#?XW`S6aY;TQL>f^Fk*Lb0hs7PM_6-|F 
zl@c?MQ_0(<2t<@X#K0i9UV@96{F5AT{S=6Jj3Q#Eqrc0!PzgF%bqs5nO#=y49{q~v zu|v4j;J4B9p09Y%1XvE~e?)`8 A!~g&Q literal 0 HcmV?d00001 diff --git a/lora/layers/__pycache__/base_linear.cpython-312.pyc b/lora/layers/__pycache__/base_linear.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da915123bc98cae6dc70833576be2d6ae70c6f8d GIT binary patch literal 7931 zcmcIJTW}lKb$1uLcyU1hgg{av4N@j82_gl`mP|SlB}z7zxMYKS6k{QiZnW_5)1$5(f z`q6XmVwa@42-PONL!SFO_uRAhJnlIc{}2fH5fp{{K}Kjt=wC?3Dtx0+e-9cmLYAI;>vPKjQZeMw)|pY&&2k}Y)Htq9pbG5~Fl z7@=3Ntl?B9CrdBn zb5og%Fy?WXCyh!8M|J{E5&T?l@v}Ximbc@i&7Y{Y5jakm5tDAazV+Y zQ<^L_X2Y-L-`T!^Y?>}&Eax<_pro{^JkEZ>06Q=dCRAQZ&B*vW8SN5bgJ6pnWldD_ zI3-H*+nKbSHowG#zDB41CeXMEhbyBbBcUWKF|uolokLTM#NH5+oDWI6Bu?ffUUo}v z*(14TJ%}8gYsxEmZ}3U4-i(h8IX;3C)r zFVtkb!Kw4?lv@e|_W`+8@-)F+mu9yd7|}ZNUvC5P{E%5fnmswsme3!8Wmt?csKk_zXs<4zY0gHr z8`EyPX*Xb1L%R`bvdU{C+YMM{({HkhTC}}@Hei)y%}5JabI$AU619c`Ye3b^QbO>Z zjsF>Rhna|b4PGr4WNdgvF_X(^qKHF)Yxpy{LQxad%&cs9w8CZsr*fAu+1hZM1`&4| zoGL3*<~pA!rj?YcilRzRrMLI`f4(rJWeY=vOyQ80!eC8@t`~A;D_8N5@RBwIUbGW>6LH?)wLDH=GWb~;=T#hqQNwS>5H?!e zZLo>M1|u3lI+GIf4S$PcfRCRbfSiy@HU~cn1rs$ee4`iToP4!_PtKlf<`C0ZhB(NK zTi%f~7cXgQVpPeeQ;K>rVXr?;$e#u9)xQ#tSwsGQ|HC%aesF==2%|vThiB){Rzth1 zp>Q?SQw{A}3N8k#?RzWjgL?ZQ@s;0O{NAqvoC0~knOmZe62QndO!Y*mnrfW--21p*>gW7-e3q#n{eNWc&PYEEyx^;Ykx zQ>HmZ9BgHF;D2FQM0eB^;EMhWOh`c8`{%f_xAQ?ubU|9mFXqepN6IbFRb%l=Y($TZ z%)MNVJz0q*^jKp4qL=c~GFc=0`n!4y2Bi1IM@Q_XyQ!CDeN_)Jv$2SGouF?!i^* zN6H;#J$n2hp}HZU$i6ku#&CgvO-msp}EC1w;L#UVQUNZE?>!2$64;OZx+mBcwb z49IwdfaeJK8i2UV)IGSIkl|4wpoWiW#KFC$w)wHFHWJF&Cz~nRCeH5xC3Oq{#IqgU z%ja)jxOL&5I|k;yQGMpf+;~Ok(S@EBk1iZojq1YC{~Ww03_avg=b(AIZquL)vz|1< za&}Udq)hIjDr$00g-gp$FJMoSHqetl0O$PpC%;WXTSHzTM~Wq(G<%i349*m{gM=JT z-=23%48c?R7twog{}Ko*tlZlqN@0nyXp=O@DcA){U%wf-8Uhtqc8 zv<1UuW4HjrLI0$NUP-KkO6;^99&OSvF}~k?o*|m~eRhflNYjzcl~$|7O0JJME10F< z>Tu56IC1U3iSzuBlnl44q%*S5E&>7#}5 zCRx549L#mJ(TM_~1+Eh6(?fmhLjOmu)$z*Um_9hRe&AGDc;mD2zdQFg=b*9n`WyFz 
zH)<}{=l@xwu0ribzQBjU`QS2N_Qt9$!4F60N0;~BYw4;+cUPkO^yt2ox7VYIxigRU z?|X%cw@deSEuYrCvE|}QYAt?jb@)f4cSb*9?uqHQ$JNcRX%_E(>!$6dd}(I ziHi5U?mb@>+E#YK-Qjzk*+9S&<>Pll?@L0@^drnwgPY8x)HLx9Rw1DQ^%j2u2S{S{ zX=`Rn?PM|2TYXLg*LgKO(XiH2;x$)^aq=3bufRu2juC!iwjX94{0G>q%d#p;e2XE5n~|G*@wHQZ((*LX=W zJC9g1egzO`pJ;Y*Qif*$MO_7O9aY28rJlu}O8Af-K2!-G(ZfeR9{psiy}9`2%ITFW>(MxvX(+tpTl6jO zU4HeCgL9|v4-8cXPU-_ED+3eyzyz4x!s!LA(s5AlIA}SF&mx~ml{4Sf&wRJ;Vh8a7Hy(Sy&FgGZ~;-nldPgVDJFHL7jN67lBLmi~>}(zYAb>CRy+t)+LV1BO%T z^z9vEDT&>YuEV|k7<6qbYsxS2CXbZIpljcWP3byV>toR6cRUCCfozj$ple4Ek{hhq zK`wuW{@ji^-ecmP3Bzrs>qfw=#e$jpc&;?Yndc4Hcf>2Rov-KM;kJ;+kVf@R!uxkG zeK4fZr$J4+yvb!0^A3Qi?Lo!R|2TTLY*YGVe`Hmd(bR zS(?es4**B~YXERzx{(lBIJz{pIJWZ5a?7D=q=U38k+>d-uL@pr92m4`_vs{!+4rEW zd#z{W6Yi(MPlIbC-&}j*TkCBvt_d&x2BRYm@}9p0QBYEaAk5DCSD2C8-yfH_!I$80WjR>Y4U(yBqPK_ z8{PuW!{wmOP&G2$d!X^tKw4H5QH(Qq4`9--H8XKctc3#dUZA?*9KO1NYOJ5{S~>P5 zf~wYwo__YB?qqX^8XM-%GRypJ;g;|#MA~(iJzU#In;hS{ zATKG4%F21YJ^m{MeTuZfLdz5yIU0DBp{FgtPo_DQ{BkK{mCC~ik)KZz=C2J2lH^fY z8SQjf%BUL7OcudWB#K%_QR$x0e!mkj+&Zw5?_INqUb8gpldFMVd{Rw1b%_y ozeM{1{+i_(wvGUNDIjL>m#FU-sBgnP!7`m|v7s*!X;8xd4YA)%;s5{u literal 0 HcmV?d00001 diff --git a/lora/layers/__pycache__/column_parallel_linear.cpython-312.pyc b/lora/layers/__pycache__/column_parallel_linear.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ef56580cbbfb15f873831c5f9a044fed9b1da7fa GIT binary patch literal 25344 zcmeHvdvIIVncu~eAP5iu2||2Hx)e!?(1Q{wDSFY8Xw#HsJsjIw$6@S1Anqjz2?VHn zK|L6zUrZe3A*nQq#eioCU)PA5t?ZIw)?o6WXUTu~`djMHki&b0og1BG-p z-p;iBedprhL0*vZ*3M?8Jwu**&bjA>^Y|XW^PT%=PN$uNizzyoFyAr z>@slN4Nl+$Lxk&7pN3&$pV2^JW5hIU?lZHnDPkG6_E}lj94Q;N_1Re15-A_H_t{z4 z8mSm|^f`u|ea_*^zDkx>7I6)```ifIgz|`IxT>#;h3%2*VQ-&zxTdcL@kYUM&D`fR za>qEqd5se)#H!mSwU2%4SZXCwUE;c2YAs83Bh^!oTE|kWkXkL)=JM9FR4-C%3i39v zR3B2;736JXskKP06YFv{tY@k9NNo`7bEzAy82sx$rf#108`ADCOJZCKhNGhJR4h6W z9z>e`WbB3RT#S#4Q8^|BhGT*l35*1#U?hS7{J>x^J|s#gVHLs`!-D8HrM*wZBBR66 zFK4rzM8!cVU4KfH2B`&|cb?}@yi}Z>x)mCgBrzHfj6{O*ftWP>F{Yfui*-*1Wl^p5 
zWble8eI*Q>H5IHXe4qaAP)s4K@2Yq{pe?Wo(>Qm|v$CXMKzfy4 zdL0sP8`U=x+_jPv0r z=w3L^4@j|LR>$h)x1*GJg(t!!QjnK}!yjm(mve9(2tTk4d4Sv}H_`Vsg4{AQB74cXvv(-_n&MqtS3E7`P;%AZoKyC*>h2 z96c`)5lUAPkscipf^m^LPqbPhHm~LsFOLMHg5Q=lTuz(&!$CQ1QF~=oJDje>GZ2%) zgGBVvm$LrRfdNsH)0JpsGs6)j69Q+dlr#Yq+gu zntxG-#Ei*eu|6_tp$rQ-xXS9JbG_nRpLA|joLiI5U5azpotk7vx6;vluj8Ju;5>1) z_o1!)x^>DrDNT3W*f+CpwsYptf^Fl~?$r9`x4(AtYx7m}FTMZDtyk`DQg%GCu>SBp z>z_D&=$JfxwRg$JxxClUPn}Pp{~M+@%zEBlcXQpG=VtxfZzWp~D6I$XHYu%L$<`;8 z)+g_ED6LN?Y`q^=R$m{U8cw;~*Dp_9UMe%YtXGd=2)6A?Zau7QJ)GQnT-ka&Wphn? zrk_rF+Z1oxeDk8oxXpU?G#Yd?CM(*NiuMN;yHZvBWZ6fR?emuwD)(JI_HfJgu}l&un#q-@Tltx>TxvOt4kYe?1jQdPc;lWRL>SmesutXEGgRTWk1X`Hk^ zbkxY^w~TSJbRG!q@!K8_3gaaKG=R%fRro-h63?e zV9(H5a0vcvZ!i*yMZsR43XaOb$jMV@)n+@}`O@-mwmOK`A_!w(M2ekLYy0w0c=yV> zr1coBv=u`9s1!XNBCer1l0oTz@o&L<%W#8>Yj3zI<2d&%&{;@r7-w<@F0QFp1*!2I zuO8*bxj;^mg`D!3G9*f2<*1G%z(9`AUjchNEmh!^zZ|?|l=y=!04@-Y2Lfp`&4xso z)84QQ{zLjh0Ax3yl3Mk5*J+F^JSGN^p9pe191q8$>576^5{X*3Olh+$Mh4W9<$(a> z=z)MtW5n}Ur4Br$1MtRL^;Vgt4YX^0d=XjY!|;A}mHW`y5d12E0z}A#%*!cF5n@5rjJxW6lrPMyGt(!D|%PH+f6=|!=qKhVwriM)* z%>q;n!;9Q4!|7WXH|Ypcq;7aJ%{b4q&Zix0%?H$~#{8@Pio&d~QbjaPSGkm{X0oVD z{Y71(HNv{|U33mz;`Fp1=?{=zxDru_oM0G3>V$FJpr;e5vU4_%X`Hl$-0D-ylRs`8 zqjjtMPB4w-a(vS$01Wu1QKeZEX4FS|Ny!>n$r3pV>!p0l>bAtosb}Q$cEz}P+%jks zjF959V=`f&snj1MjLs^(oUrP@E$rb$8GfsEylh!7S(|>#X^9jhCStO^cgv7A3E^SC zRXRlJPmtFHFI^TN2}nW6X`3i!3I*eSo4V|dQG%Um*#Q|FFKtx?(R4*b9Eb-liQ&Pa zc-jG}Y$z8iXF_TqEL={PiBSlu!sQ|{Y~}jxfBf~^e-=HGf%uS&1)<5EtZ2Dk(UPiL zpRDsMb^iJ1mAdxHXHzu|H(r@}Wv*!^FmF(5{FA*8>-m)1d&4o~m>pBx{`oUZ{ON}H zbFW_6ej?#M@ssBh?$eJ<#)itt6R5gzQ_Ag|^}gMBvvJ<8)O9NE&O0xqs@L7OR6jDA8!IPIF4;L>!;P=be08p3{*}bWgZB&% zYMz|z&A2(YZ_=45=ju18Jw|718{c-^bfCW4j>+SxbqzPhXU6A(%DOG{2NmDm$z%96 zv&ZKK6OB8*b0N{VOR4FY>@D29Oce+eZC)bZZ1awHUVw6{Ht6@PZ5+FkXMJ{2*&e0Sg($_Q!74h(iqx=Icgj?VdLw!q9d9? 
z$u+ZWsa00+)EgD)vyO^|NWp*+LXK(dyI|#)fK!!{Y3xkoAPuEk8kAK5gkt2>sXfS_ zVZa=JX(%R(kb&elkDZJ(us;%j&Z_xni$d_o)%OAu6_!=GgIC#QE*G`4gD0IUmy~7J zHkE-o<@4c@0A&b3sgt&1up~ku(oPi>45$?{IznAg)w&8*P+{dja~K;tLq(~4+Lh(m znxH0ar&NuTQ(H0-aO#`J^2#9J(KwE^tjg`OxvrqJ{1!YAidt^_uJ@1JI+EP>%>8Z8 zOvbOjHuV~Gviq*)9}AOaY<4j0tv9#My)=Jjp}up{m8x)EKQeVBSWMO{yf|kYl4#NYlSn#T8;CpxJcw2-DvQiXDxJGDIO6gjNM+ z)hwZ$5QiU0d!6lf+Gh~|u*NK z@rwL3&oVKzs~i+Rw|DyFf~)C4>w$#pz%MwH+y0@$J>9tA*zjOeSHjWt3(nwd|Il5X z@NQXfx6OAe?j0Fu;dVADejkaZQJ^*uP3?i~1R(4P1OSjkvTw=)0U;I&1k_(mqNK__ zt?EvddMSm(Mt~-xBN0)0g~Am~5LFQtiRe<=7>%Z_r`RM*H1ugR&9p>w2^=IAi%0?m zomm++Bm#>M+%iZ?RuvR%**1t^Yr&lesFS;f*I?&{3`^tR|sZ0>n+VGu(ilcBx6GOAuxi* zsm58$toU~1W@P^9_fOtBd3TSp{YYZN5vBI1Qhs#Nf_$0%oYj-U(`dKYANp!L6kb`G4+tN3kQyS1DXywKsxgffwI@^Vdsg_e&ap=*{tJ2Yy+^D*hwZ-HvGJ-B zD+c%JFpq;T`w_^j&9?>MX`XXE}XBRw+&20JVUqcVR< z_LY99t;if1OSH7i+Bv+1?7kg5UVHfH;Y!VjyxHQ52;86sv89dXI14O zVq%;sX26)Pv=_r&tmG4UU^M=h2)t#W1zw=$o2FK%`BR*p)TXBxO&Ifv4Fk3e*KFg) z9!>*oZf7~g6c@;GoxCK5cI?X+NB-aoWpW$@Idy>g0Rz%?q%>0hP&%h_UE%!3)1e&G zLX*@P(y14T>b+$+!5NsQ`f5FNPeUPln=v+eJbnN(HMAP1V)+wd_0KgTODpHjX2QM>JL{^24v* zGA8kc;SJ*(<{N;?u+p?Vi0LgC*@5)Jy0ZDB8}l-feh#ef92k~sBzF+{H_dUv#+aKl zPZ}o;*n?&Pd$4Ga>Ff|~U+V7ET#E&@t91j$zGg$8bPiB;Ys&5JD<;2r>^vi9B)$Pu z<%eNVz)NCKh9&~hn~z--B`GX~ql2J;pqvAv(Ga0@q^5vEiqWB9G$isM-h!B`h$@NW zKiD6RgyUCKI!G-P?RFvrY`0G%@Rm8Z+adTB?smPYn=%VD#TLEkn=9A{81f zvbWeDRdK$p(}pDv=d8MZV(LW7U6*ouQtn-Zt+-}fDPL34w_WjVCluowGvD|NXPL7y z!@+yxM#)bXY4`n3x2fA|SY>8SpJ`@I0^rgT+fMfcddmFSW>%U&r;BFPsSz0!HhT^4 z)!DSL*;M%i(|PiS4a3Sx;WOe8L>duUbbJivut@jczRwQdX0LIur+*D$;1?I&(N@u0@PSdFAb z&c2Z`tcE2YvVB6?GF2SklHAyFe`7~so}bzp7^`BKei3L#k_}@`G@j}q0Fy+d9>k@m z;T0YMFcoub=VOdWA^!gn)k1`n;irT}s+~E8q@vE`_~gIC8?wL~B$yuI<&LB@$dA-< z<_Zp%^x^GQd10QTy}E_tUIAt?VPd`+<|hF5N!7Y+n6O?oyy*e-4V;Y~65xsB1iYhI zF&sV)Y_JP&-t?Psq@ZCI7PdUo5x)26ZTh;#)RnYiRHQ&%dDOWMgsP*5pm zf7%dE+f+i-ypWZGJ6mg+amTIam)u778} zQrkZ1{LtU|p~E$8CW1TXO*ZXTnszTZ_WaUh+-g@Gt&`n~t99BoyA{gy`sTTgo5vGw z#1&U(s(D+=yJ4A}D^tn2 
z*C%X^s$gXaM+Nb6+9MA4ivqECIiOk#7U59DGT9RTA6U%n(1;fGGMC4s3?+_6fG_o6 zA~s>g5#3zofGlTgEr~wov0+nQv(ZPEwYqw^SN|?n8_br*E!rx3{b`iZV184XR;H&_ zsfTZ@XZr8g?paxJ&))p|q&U%m6N$0x!08O^HyzPfG|&$n=R8)6-=4NhBH1v}R$)q2 z9wU=}m&(Y9*2SH9+1Vc|+u}z7mIJ zI(gaz&4wYn4AYg~pchf{Hjx)5ua`XH6VeUx&XY&Vo)jT(n7kNy0(m3kU4U1J{>qd{ z-an!jC-JND=&!?aFjc=NvF{nB{+Wykv4`c(Pb_$5TDXqAkIJ_IlCAb-EO>%r5FAS( zTMmS!48UaVm5UV=dck0__)@<5*}bHL&$lQI?TT+_#)MQ_}3f=dIgMK?fL+*i1r z?W*?d^Dq#$3LvaAUAMv(U&CI5v}f%k!8`^AE02hw@Bj=yVUEuWq8uK?^#TF{SOpi( zUu0%YFsBkFQt)|pf(7JM9wp=y7%psV2P4C~d02n){lRc(2u7$o_VsUP)~ACwLxr8FNT351(T-BJdkD;67sxJ| zFvx`4s-hfgp24*JFdGZujsj#vbS7tYgkOo|n4}^dFew8T0tScSh=&0YlRbr;MUbCY zVP8NcS5(kf2$Sh5?mUp4@Cexn2c!H6HUuwe5OLe3Aq)oJPpB~=#jHblw3x~Pp=NUC zdd#y9UYw)KKf-*jZ4?H1!kk;%xqu!tQX(f2zGaTd<^`B>iIUn*DkxndXX!S|w8AAy zy0!QDXNjDp+aRjWT#1ri?S1}PB4_C~NQ7dJnb74%=1TUg)wvw*b;V=xU?c!FCNLxh z1vy<#57k=7bAjzX7%Ea0X24UPjEFz<2DTkKi2rmF1jRj=8o<*V||t<0BiM-}GC zVy*}(S8eV#l_WhmY}`+b`8y~M6uijb?xDkd{kf^^8KQ+#fShbg~%dFhaym3hBM$ojh^nbMh`zzsa1PujaO@9c219~ zz`PRbR3lczB_?O+I}Isj{#Z`$4Gk%HaojL2QDtBE3mC| z;Kn=1C%sMHx5@j*nb+i@`D${q}9`}sL#+jC08a}4Z*c}@z}f{sm!W7FK3cLVPP6vv)B z@#MZ!%Dz)6)sSb-gWDId=-b}{qX=iqpF5ne=ySSY(dV^K+8}LuYm?P}rP@DfW7T=5 zdH$~uq+_4h9^Tld<%z_AK zCkJV1F|CO(4yhlWu9~;4^!iw#-93l(vA7vE6uB*qu1?!or~!6JL2* zIrMVE_A=9O{nffk9qZWv>KR-9VG1#g?L38+DQhG~sX88sU~B1a0G&9Ayhd8ne~19n z-R!X3sdbX&&Yld1=Uwk7^vGAnnXN6Mhm*ff=eY;Bm~LHRXj zAJiRKm6Wn)p^_C)cnV63t`Z85n<+dVC_I(vjVeG;a{UBPVEjvi2&>qh?0#I^6R0at zRCs20G7yx9a4gCs_%>BegDuK=Ca@u(>P=*mqNz5k*rV)T%^ro3O+AB@zizdp7vjg- zYROv133?*KZOkM&h(URfN%t^97u?m)sUv!1)tj8q0`WjcAk&3u^>dbPBPYP24ZT~< zTtM54B-rWZ(Cu5jI#PvxR)Ea)1@-BON=VfcOmtHRAZ`W7LXkO?B&c_pLn@i6EOYv) ztiD{Ipc*}eVz^GW9}m?MhA|PQ7hk3{1})D~=pT^x4tf8OJSNtVcqF|`-aL7?$fLff zr$Hr@n6@>@Nluj+AhQ0SQQdM?=A+t%*`BwbyZKzQalg{If1ws;EN?m=Hnn~ZstYq| zURx>xUvPROG&5ENr7s!;Evqm_u!BjON_O4NJH0A=x)PRG}toe;Z0(1$(8`_D) z4;S0uE!+1n-B<6Z(P5o-1Kz;l^A*OitBgbv&{4|V1lN#D+M57b5r7zlqd<{lSk?k` z${U~FX(c3a{E`zT)JGX 
zD+aK%YB|+BcYOXEcTfJ*cKp|k#D7e)Cfy;gq*&Im=FiN3p+hzCp2ClmKA=i>lgD;& z)f%$bP0GonE7m0%U|qC4N-x%c%>NnXA@6H6+TTThiCs2`UEUgtorzs|lGueu=1UAK zw_BbyFtD5gS)qyr3*d4VC%pUZQ!a+H5@2fYNyoy`icb_ADokT=H6;#7E;2u~Kl?Zyvb z2yPK{L@S*Z`XceLQqYmWj%)Rx9QB}=0Wnn5GN<0I%)&7y#PEI@Ov3IF-ZGt5uMpB< zGZZxv7}&KzhAQ03nvSv`K&BAV^YEDID1)XuD2dLu#&$i%bhHCu>YPKb7I&cNo`UgC`P}h4cBSQDqW0jw z=uXr=p?D4#s<@{mKlN!Vybukp=5p+bIDhdOCYTbZF(ufKJcSInfze41lWHZBt1^pr zW^F$O@W&FOabV{rF@_AkoR}cXZhO* zFqUElOYxcWT04T8){fvKhr!~X9;G{Gu{Ku{TVf&wMr>i-B zFdVu@voT|=EVxF4aF!EMNY(BsRG{2blyv(gCqks!^W@uG0@M89M5OU%c|^?R_$uKg`-dD6eg)8BFE$Y zFW9DZ4?kS`Av_&MxrQlg;eaxBnZ`&;v`)WZ?kN2mYUAIM_b1eIf_P{x@P*9n(CVAI zu=6We+|THT{Fv&`sY18(`4(eEpNV-r4&VASYMMMU6p)D(Fur90->R(CHTFTOj_nRo zAMw8l+>CyUJ~#V+S|j3hvG0PVIripCW~E!^*Tm!LeVtY>Wr!OBAf)_&Wt8YErbz{glh)X`NZlDW0I9Vadc$phlziLY~c9~Pr zq#5Izyct(=aN&pc4s*I~;B1JsLBi59N1{Gyh;({}`h2L*jJ%qdU$2FLQn2ejxu;2u zuSBY{Y681T@j8`fA(ShA;=7HkU9??{bY;u4UcJ5^ji=QyJ(Jwz}$|)Y%P{?S& zNKY6q!Pi1`LE?L8d)a5Xpp5)3r4|Eg+Brivz4{h;d324GG$w$mH;j4_UGFZQ{op(AGe0%R+ zp0p=wp84?5(R-dB>|gNqe1HF);Pe^xxvgjCFT8VVp{C>R{>k!$w?|iFnGVkxW+HQ6 zMb=mDJbU-T2d9+Q?u4)VM;!@Y&wacXT?f6>7C zjvGErdwXQk9)G|2DWmy^T-{S;=FbI2T$Mv_UxnHfPATKlJBiQzh}H?o4|>Sl=K~c! 
zmNK;eF?+s(TX)j0DAt4|g?`U+Zk?h;^1QT$i&YknuF8T`9hC*Ck2e1%s4VnbaL%}m zeixmT`!_O6qqL2Ea=ASkWvFD@fe-jcWti6w)mQ9V(lk literal 0 HcmV?d00001 diff --git a/lora/layers/__pycache__/fused_moe.cpython-312.pyc b/lora/layers/__pycache__/fused_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a0fab75b853c45afa8cda1ffc1812babd6663a6 GIT binary patch literal 19112 zcmdsfYj7J^c4jx84G@n83BExRBqfRvB~YYDJ*<~yy&_p69b1-dW;_^>-IPG#HQk^n za++&RN;4)WPE4iBD{AwwW;a_ytyDFdeU2yKl!6ICNs z3D<~=_-%1_!ZYF_v_0-kup=y?nYb@eJyK0*N4zHCAMq2~8Lv&$jnomkDqf#x7->i} zjx-X#E8di79%-g26LpH>+}A12!+Y+U)i#U-rs3LI>U$w7@@J#dyzp8qa|x?}z#7aPPYKaTDjgXkj35Wh$w`st&ZSNOT^qqPYFvcN zOsDyXm=PeN0l1lrM}e!T5RWAz_cgIcl8dEZ1El7S76B zc^hx%%rkb18aFjUX3n%eY}~@xu3JVJ&d%F7hIdT(%d1pr&apwb6UwiGKNr;RhPQ{e zjkBB=%B$ko>(q$LL2+9+AH-DiZmx#+sN4`<8P0#*I^yMOdHcADtAji&NezCkeuMfN zHVALrAe^+FYx=y{aVOUdeE9fkF2L7JbQ`oDd2ius%lYT)AYbcs%iUt~k$SET+S|Yd z!ENN5xc2Lm9@h;1jxv7${3a;D#C4XH&@x9|rAJy{p+Z{~R+|jkgjJlQFoz>?oWzO^ zKA=esr&B^EIGz%Mq&tJ~RR{6w55~k`GL;F&66rXf;FB4i3r_QyaHvYL zLN4fGCK8DyW0^=qKz0-xeBTj1EgI+os2I-LOy%LWndQph=l9?g)j%$)AoSqiyps1O~%Bd9A+SUQl z;ze*iyGDJ)xaaLM)9{#S$~SG9v%KS7uBpFy0){tzT|eN;dAsD3X}`MO}PZo6E!Jy+K!*Y!bARi1I>m~NTr{u^fd3RDGssmjwe za~Ed}nY*6>ag&NsvnH4wAml0^ZJdu&Z*mm%6_~-Z=2?qmp1|A{kCs;WC5YGOWGNBM z<6;d$7iWSr`phg#W7g7e78IZp`aCS9(dwCjpjm6O21@#zRFDGdnzc$6$s}11LY{ZV zEtF(}&)zzu*hb^2vC9#q>V>nz$6k!Qa(d)MM4=-e|0_Hpt7461_=Kp~V`4lt1=G|i z!raJ+Q=rvg=BDGZjADgZ&kG@&&;msZ?cgZBWJ&;a1G`KI9eJ8tdx-SGUG zxnm2pd2jW*-*@ZMwY7RO??%0x+3l2wx^vyKYC6Leei`Vu5ZDaQsW zoO%kUC`}wgidWYLD$rHz5%H1`OI{XmiV9ueeEcscS?r1(W;+60#gT|!C8n5}l`5P}d0qkW;Be_zFsm7ZN-di)H{EgNT7T_^at?k^@LvrAjmL zQZ&tnJOW}2rQyQav*#k`UmZGeJaTM!=;Y~Bk(0w`kDqv1z>f>07$#O6tL4BzzP2-08p8jWxd;7n*Wpz! 
z)8d&sw^~iL?0Uwf&YxTLQ*GV3)?T@__ak-^rpVj}D9$ud1eM~s!@3!Ixon*Tn08gEl?+uvpohkVCA zPfb8nhR+s#xQZ$J~Qyb(lklzzbDZfU|f-W(Vl45%adrR~Nbr?v`d-UC+ z_DTjbx+MMCN|-gx*b3VyP!0De=^Pp}L4%|$;!E8w>j{RD%6pCDfoHE)*S zE0gHoqkn@gGGQB1+?vM8fL0L|2B6sGNQ@H|7opTumZ&(@cETo1S_wqc>G*U+NKJx1 zDk_Kw?7qa3>B-C;=pV%bt0+;iU7k|k0xwR+Gl~Vpw&Fo?3-}Op7#ADMh}d!v;McCH z2-rR)K=HgVbAG+V-gt!Et3*T!CI+MnHU4-Gl0we`yRB( z{ihc^i*LznZ+_?Q`)A~xN5DHHgY4|v`@kdjodWNFklAp)ckq5(?mY%xNoIHCkDki= z`=6LiFVNKso^0D8nLYf(Mm4oAX0nap+gIeq@PaL0-?HE);!hZXDnk}k;e8co;cMtb zz!6az2EP_8Hs8I5|CwXYpcAIC7B4vec#ZnhL`En+>k1miy{c3)k#R^Z7X=* zGlOVWXD&5op0mtZ$E_RBUo4-@-~R|-%6nFrztmTNnl<5?alTQV%d=LX$Sj_<0Y&ET ztQ{yab7z^1t|cU^k<{m`_zzGu_8B~Ys*4n?77e8MtP{#9Uq`6fD9)~9xT=gUdjKyw zH7PLbns1uWcT|em;KeZzZXnAc)L?&wb4afFhWTd6anE_Ts8Pr|+CA$rF2_hZb9R&i zvtGzyAc-gHOWbLcWQVr6xhf(*nv9V01R}YLN5UEc6@lQD7;P0pMwFfP%~ne+jGBwc z86Qw?LRAChLCUyo5Q_7v;<;#=t$`SW-hHjKTQa zEzVOs5SXtolks(+(;GP-8Jjv-rBn|IuR~Jh zIw&!eMzS(G`RXgMGCDVWB694PUL8`mfP^y`Z%)LL!b?c`rux(cU1?K857PHFgp-)m zlb%e*#-fp;u2C9M)2SOG@z}*=q_{%Eg%sK8^bwsE!FCl0P zYWMo;i$Qwzised-pHix!L)66jS~L_8eg#+)zJdO$fz3HYo_u z!K(TvQ$t;rIZK_Rko_BV5Y#2yIV^Yxox|uP&`F|`0!OLF6=cPhP${g$zgt@fld~q_ zO)P+@NwPvjHL0#7ajhugC!ntQ*16a4Avl64*q;?cL}43wV+=8eOl*fa{~2J2&TX)~ zTA&y9EDkOQ!%J_;!6OUqeA~{Yq}+CB!Ig)w89Dgef_u58XX&imvVXz(w`~LYrfs>V zopRI8e0x{Evm5@eI;_F!1=p&JYVF9i9FSWMEI9Km+jA|wa!c=h`cX^&g7YJP{i5^s znMeNK1$w!6cdqxe+{$2nY2VSBPAMKIZz4-w^ zPuaFzGTXNdac0WbmTfyIvxf={wXG-Dbx`g)_<+kDJ})0WpWXUumhH%QZ8r=KZpj7r z$iY3i;6XWf@WIP+@OkL;j=o&S9=T)B{pp7tk2+2)c=O$da@{Y;-7h@c|H0^^?qANb zTbEgXj_s7$&KwIY?aBKaZ=ShvW>L63xU}{AJ$HKUkKXBh0Ev%YdKB5ct7jmNVb8 z=)lji3Gl;(t`YDnZ3x=RfS(OE3X3=x;ApKO@Uua<5%8-uf-an0vd`Dd*GU%Gk}L;) z%&Y@i?I57U36zsiRX`E22C%c%2pkxJ4t0xp)-6>TLO?g?(%S^UcGEP-jhk@ek*H$X z-Y>w7hva1;hru%4aI4h|fX1*-FVYZh00=MAl38*Bv~>aetb@&B2c$!7Q1NiyyTy?r zkf^#068R(_03o&v2%Qu-$gYAJ(^UV*xC9X{qqA0*5p*G>G2$RN=R@8KOd)(7;}AUv z0y-~&1M0iKQvc~ey807DOpg#k_y#(Y=t$^X1t(Nrrov13Z6C&8L+6|5kp0w>GOpmA z=w{^I2p;Vgu44(GuXIsdtW&orrPk=%*MGLbh-t|F%4+_u3x0wkPY~v4-AE(SgQWZSjJx>!{Fm 
z6$(!H77kUV7W1te;+a#=Xs-MS>#GHaXtO=2%+N{GC0z#RAqcq1QG*fy8>dNuCk8>~ z=N7QkiDiTrG2K~k6fZ7+5?9C5+Iq*0Q7@x|TaAia1=$*Mw}a^eYr6Lf9tN;DHo zB^7(I*#d5lgd9Z8Yv(8GX$fqHmFyHE<4GZk6?_|=-$3UEIs^a{?29_>YwCCS{PQZ% zMFBXohjb zhC_%o5vyiXwWm-;IqFyaE=Th}1YV-S&4nv8)wnI!Fd#P!%(?$hdw41H{kQJC z_0W@h{(}7cg-7kL&3PU(tv~AAm3Oi44$luSzL{$olv@VDQeum9&i$AP!X8|Ee_3q5 z+|m!R?IgAZ_~|}WjLo^aWmor77-GAfa~5py-!NO$-8vMPN)=hR6i3OWn#8HTF@4hs z(&0>pjV@|s8IF44Xd1dzeGG974LS@qa%UjPjFl=g8!|Eqf&)k_z2doN);9^Lh|Wn_ zz50fHDI=O7P3RlsrEnue1IWoh_im6MKdXvo9E>fD@XAxP1{dk%d4{bqZnptOF>d$I z(HfK&&$=~VWNZxD_;a*I$G%_s*ci;IXFfJ4kAIP4W6;K*V{G*A^-CWcgE{rg$Hu3P z%|$pwi0C~YcEKnzn?jGfk%fCW46!?Ba?+gb2Yz6s*s|pe?Lg z`g{k94w=-wW;}@=3e_oK;3j}|k4?3&SIn?SEnpm3;3-zv5Kf6IM4iG=vL1ULD8)%Y z2-%${t3;)3&Gt4LMq~;jl*&@Qlt$BnaD*99pYiq^?9uIO!+Tv01%$9Xn)F4#{d~IR} z04FfCVX5HO_oZ7>-t7U4#r=z4zi~)*wco4#e&e0S2Lpe4;ExUfXbb{Kd&~p?!gXx> zUh{XGm)S-gw)Vl6VxDczu_2ia>7hL`EVdk0w+EnVO>3^YTdwZTgQ4w#8wXZ>jt){@ zD;UgdR9+DiACcK3df}UwSGQq#mHc7Bo0qpKf7r|~g}+}&{}r03wssup=6q|Ayg#ZB zEWVMe-!^yZqt?#b$G{l&aw^N?Qm+pG?42#D)$_HxJ~Xk^TBg+ z&xObAYcM0UP2Zm@&A(e+KV)C~#7DKAqMw+lhHkPZ!Sfo9zq-cE1eYCT>G4A#;Mx*` ze4r{HiP(>j*y1K6`H#ti=(PHHwcuBp6OAoi>Hc zGd(Z!BA*e1nM=GXZ;Ns!7==p(*eAhJur&^5Qo&uS1Re_6)g3h=3QGd8Bowa#Um&81 z)%Je{TEu<{2348n1Z$PzZKav!b)u{kDWL=tAmcS^xjQsx$uX@m)4Dh*Gh6e|+TXPe z%MZv*VDZ%tnND>^*of4PD(ZVw@$rdKp5tQ4i(>I2mtq->Ma8l5k;8E!bLQ`1w9!h# z$nw7gso=sH&Z-UPP06Y`qg(WYEmaXp!9o{uYCZr~rZ@Kku(E+Qz7664WHkaiv-Xmo zqfu9pP^K)@tcN-z^P0TQlD{l(Ras~$uS>Fj(;_k7G)azY^gE}ZT?Vt3qc0x^9r!xL z8q8XwMKo-|Q-%v^H@0vVw$qJbRc<9UjTg4HJ(BlZCJwB(zGYG`kb(7z%OKPxF={F` zu9%jATJ&WAbYby)aro*q;G61jHHL1d{BW?VepbW+2ek2nMS> zI5y$IIDT^;9P4;6TBN8vRBgh8as1{yRITH|2n?${I5*+JIDT^;oa=ZnT0E#cxHjRz zIDT^;Tn(VbZ2JThp+I$6xiae~p4G zLJl|XsxDe#&o*DK!DOcMqnd`_u35bBTg^+f+!)H$Y@a)pZ)jZnr+>1f)&U|$Ywy&b!6&vxzoV_z05 z{Lz9UZyffYqf3qV_hv)SY3Y{Xa)GNuc6ERW{(_hd>{^VypSqP=ips6Kz?bbmdH?hu z4*y{I;Zb?uB>1xINjO1rd;XuNW}aU-y>xtzf9&b||F0C-o__}Y@(fD(p{MW3D{zRU zUSLpMVro(t({2UVq{;m@yeU5!=fy}oCT7M!+v^XYD-~yT08xG{Aki5eO=+4Fj-=_C 
zLobty#$xciFv$mF<0|0AJIKMYXfim;2L(PIkB-6p+0ki;jEU;~T5^Rh442`ZF)@~e zPdYiq3kY4n4%<8y7ll6oT3y)x52O~s(TuJ&%d(Sf+>!9ztG=G9$JMPxO z8s1N_O^ZA3F^{}`S*DKw)HQcLuy1hFb>%s5Zi1yE;1!@MA7i8mR_i9TT7QMQLk|IP z$2~#eH^I4MR;8wH<5s-Ut3r8_WTs(VLy9MLA0YdvpWqbC+clSY_pSN2a^5c4+qGoL zdiUP9$=pvP7GL%fk-D~8J4}_EEE3HxK~e${}L)K=l4bA zw`JXcl;4{B(16(UhYpu6MQ71I3q=A3A8JMR|ey13=vl3{=8zr8zG7re)vQ9ZQz95A{71;P>St5 z`DqejRU;s<6;PrG1giZWQUsCxcT8GEZub|jCz_H;#YTSRgI6plrD07BR~>~v!#pTL z1pIy!CM~4!9?!IZvrQdW>KcAYAi zoYtP(`wA4i?pr>^SD~TG+J2i`+I`=ApZkGx>4@BQpg=*`L(9WgKIqK8bp9{5Ka9xF zzq*2fg{B_sj^&!Vf)(C?TN+#6cinOo?C4`Op96hhu-Q9*S)!c#&c$ zwx-}is+y{ATB!l*lM(u=$?7afR;q9CiQ{$J`Z`T&A{5kB7cAISnxX2qE&1iTodq=+ zROa-oRH=~}6JzbZZC;_^RR~gBLr>fT*8XL0b-@a6D92ZK({%%QMjxa39O!dWfp&tW zw#d!lLKOzND6jwB_Vf@_^0XH?2 z)>2$5EryG~kAvi7CI+Cufr++|M5<{6f0ZAb#4A(9qQ)DsamwIR zan1d?R zJ1qQP@WNM4iD$tnm}#0`u~D?|CzR{QRPB$c(BGlE?WZO)ZTgg<=z$+ohks1%|I}mx azYUz9+DHsgtIjDK-Jh-R{**$3)b?**)@Nb> literal 0 HcmV?d00001 diff --git a/lora/layers/__pycache__/logits_processor.cpython-312.pyc b/lora/layers/__pycache__/logits_processor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b3691a812cb04dba7fb2fc3286c6b5318b7c5da GIT binary patch literal 10762 zcmcIKTWlLwb~7Z09FoJQ-Y<)K*b?=iWXHA~+p+w}Yhy==6V(Bf-Of^+p+t$Vof$fo zLL*M$Z6##8w$K)G_9xn6fy!`-xL@^tbw5+IKsm7k5(5V{u-T6Q{h%YKyV!oT=iK3p zXqrq?w7$0P+;i`_cg{I;?s?qHfAM%+6qKaxgTz#bqJE2jQJ6|&=jYJaqy$Q!lhmC4 zq~~Y?TT<3JD~)}Yq%FnFF{Eux+Eb1>2Wi`qY|1(3Ou6P<1ZI-%6gS6_wms=ddFQ;O z?MV7k{y9IiS;3hMq=IunnzB$YQG)9#CAh`lEvrt?T+<32=DtE;CQNJo*CkPrqKUL9 zoXMo)i3R9$oy)v&ssdTGJqx13D`HyCNPH?Ih)F&hm7>WcG^A*HSsQFT{!T_p3cQ?H z6=7B1xy(XBkzbcGF;RxapsV#lCKjE4y|Ua3sd-Tl5@{sCAInLSm{#~~GOENgQtB%j z2!?^c>8LE8i>`>$TM1XP@oZ|p?SHT{DxhmKT zu%`<40qm`U{Q&!_-~hn>DmVympbBnsQo^7R1iqTYW}#VZsiKkkwTk{1Xq+nq z1(huHjug#iMJWWj2px|p@>u8{P{ojV3HDJa31hL4k_jzF(?U`VDT^Xm1ZF1_v8a;B zq(hlFcI)d#g=kiRbpbmR6$DYxyD#F#$R(rFO2kFxpOO~jqojo-&BG?(lei0x)>0E9 z-6rN@0v1h^J$N*v10mdy1`%XOo7jS~k`<4Jgs2h?VY302crg(}H1e#EBQHiJ;D%Q+ 
zd_0#-@+lAiZ0)0=L|PCpg(_euE@e`o3T}unjk3cTI%HCYTuWk1Ok4yJky*Pkh@b^( z@WSD6B=m!LXeN^u#~xcOoq_cTF+di4L3rc57!m+4Lhv-#?h7<`l^8XKNJ+En>}zce z-aMn(!L;V2bj)6*rf7xnZ|7G~{SCcIDdyn1Jx~25cnw2MX(~_WDIN@&Uh)=zDps7L z@>YOs=9mp2!{bnj$LKfKf>AJBdQk#Kt7_C~^yL}AC0-4CR;ZiwOxUe4at^d#V|hN2 zPAEJtq4|`2SfRm`a7hXI)Er8dXq$x85D2}56a*;j&}_1pjMtrv2%3}UW67v2^Sq4Y z;ztjKuKe-k2_==C$R@JmN>qZ6J$@-S_3V-5DRCm1n4ie5D2ti&#KmMXHId9n(Fw8x zWN70y<+FyviDXx#HduAFcFP!BK4NgU7qH|XK=p?!)FVIT^sYNpw*4O4S?U~Iv;E9h z>e{zvyT`VdS~>xAm0G*u;lAH?>~7n0(Bkg59lF~#g{{)ii8b5j?BEV8M#a#F83o%! z5hBpor1I2~F;ewhGC%%q=bMz&0#MjO#HjfyY({1HG})YTla@MRSjP23Qa996FI1~T zHCD@Hx@P`ioQxmi3iZI(bnU(M_lmxV>WdWEh|~k!PlX8=+4{QPgphzL>iYp+?QcZ= z?!==wuZHlj>I)ax@Y5rlkz>&$nC}{z2a)EUMl_Q-=(}pk9aMdT1$OZ1k;}@NxWdPx z*(WI8hZHy57{;{Ib%-8PeM1Fy=;;yd&4IInfRkT{g4>jyAbl7~-`7akk57IPaCd5y zWL)))7ufNq$UsI~;CB`E2?j>Lm4U}6zXG^Y7^|nY8QT--STZMw{6aQ|)^lF2g#S+v zK8}P>Hj;d!WoBSjt>#auzNrE`_4JtPmlrcRNJb#uNyJz91xXZzl?r!YH5V z**n^Zz3*P|jXI2;Qhlci>?y*i=1fH|@#wN-&5aK}Mp7`1)AKAHikVIxmSf?Mc#XqQ zLsAI=!uRzXG0P>(GWZk8Cs>xBhbO0)0fS(@Rq;)KM$?pHh6+pgzM7J!1zX;-WQGX4 z!$6CXf9-Zsam22ZZ2@+jTgfH zC*d2RF(n0X(TUWQ_fJ;Qj+VNn)Sp%Vy$S}Hq%4gXt=;FxY?bDiIA8q4_bUbX;Tye-M_>0kGPTU=ZxQztMsQckIJrqXCYpx53_<|&5 zB#lurQfyIURz)czYaYW)kOLPIb!#qSFGwar*3-PCBg)H4G`1`Xnk@z1QnNwEl+@bD znC^kHkYxh1go(gH8A{lBflIed0KC{i|x;I_!k12K!jY)QgoPexor z@m!K9!>I!5UE}0q7w6A${$Z%DP^G<)92VIwmF>Fmg39*S!uP|G?Tfd15T{pVd$*X+ z*nPVE>_p3>n*N;E0#9tRnCY?MRp_OOZ2ub6n`Dp`4+Z^Z$e&kX3o{k{Lk}?vhu2&>lji4>nbe@Jqt0?pbFjM|dsEAE; zdan7_eK(kU?4ANXrVoI^v#;Da{7=V!ef(4U*V6zLx?fS* zvqkp2%APNA!L4R6o6a}rN0i0s0z2mP{J$;5^%UI0Wfv9bFFPqupvVoV+(60SQu2=# zf&;~1SPh1^`_$lA!9P~7ugq8ik*}>b-LmVa0h-4+(o?{{$eJUW;upoJpn1(r$ImCC z@?+7rx#LcP@L|VUmOzNLWeN9WO@X63+Smh^ZF%ea4CtSA72-R9sCwE~cQ4tDy$rK# zSLhYo(RFsy`%QTpTq(i6tv?d9%4R+~@I?dqJ&Lj;6A35)8-!T#muD z1G&x9>Jz8*2mHVmI?q8g`p%i8 z$c0r7qXuql`zI?*6r1*|P5U3Y z?QO0#_aiUd7G2}kx$CpVmNB(uY&-Qy=4R$M$3B${ufI_^^XBL5?3Z1m#ja_!Yx+0t z;>q*s$@6!+-hk_+HOKX~2c3Ou?(3)5eWljE&Gbfk&0Xr~DRzvh9b?6gGit|~HSd@1 
zuA;kNb@$(M58fL-QXHOEho|oipDOi?7RHWm$3I!VxqPSZR=Uvh+}aF?&C{;)KoZzK z)!(-T)!^2<#i1kW(2+X`ZUf161?`>`fEjQMCC@dXFuU@^2PD9>iF4`x1;Fo zRlO)iZ~sFp9T_N7v}d4fwRi>}T5S+6a@5#larA^bdZIWwqmItthkteJU~%BEI&k=o zt$5_DdgSbT|e4?K^MBFZp?nff5;bmrqrG(01NF? zYTyVcl&@vY^W^a|#sYYYxr4i96MitGm^l>KHMBuB#qiMeEPS3?p!4+K(tmGHKy3Ic z`scK9S7+61c%7~rCI|e6;2pRf4_nDb)ogI5DQos+=v$Cwj2>Zo={I*P0Ic>l+EkTw z%77%}(Hi8QzOkPv*Pq7&hP%GOqHj#~jcv~sd}nSy|C#U1oflQ#nF4!8&o((ZlapeV zpAl$HS#oTI=i|8~n6G3)RvybXVSPQ@MAx=S!HEdT8Mow%qMwrXMs9 zlo@z}6=>}!JFo>ir{&ta>+hDG2ti8Ky2)*DWj8_`)xU2m_REg#S+#F}*@InP%H{pW zhpm3f?tN&XnTZ`Mw943mW%a%hvv0)g+iUjiHT#CmzG1U3V)jM8weN_Dcf`ayO`Dyk zX`prgp@+^0bMQe^Yng#3;I)Rz4s5Yh+t7Bi+B#lF1mN4{-Em{<$F!B1+OqCY@F=^Q znEnU8V3~m@Os>erfe@U->5_6HGY_3JG}HQ^t$Q=S zkuNg{z%s$%vV$~Ps$M45qhg(qyW)E>x(vE?$kb0f$BcVBy;h0+N;!X@1@dbT{RbeU zWQu-+C0-I^Ie1A#|49a{@c2(M^v(Fpx6CjOtt-DcG?R0(pENI-Y5Yb|)?+ps{%}Wf zBbQ`@yoW8kE!6`S_PB22PDq>3hvr^>7OJw9rs*A)qPgEv+!s{K7gYFn)X?uOR@(9n f^uA1gL0$NQ8vPwL@W>vw(k+EO6W>tSAWQ!bd!Q3% literal 0 HcmV?d00001 diff --git a/lora/layers/__pycache__/replicated_linear.cpython-312.pyc b/lora/layers/__pycache__/replicated_linear.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8098be2f44a79641e91fef9ec378c657a5e9262 GIT binary patch literal 3314 zcmbtW&2JM&6rWx1+Uu{_A;Asb3u)Rq5CA9rfStw-|X1Nq_ITBN`8Jb^XAR_ z_`NstQ!*Jt@R`y##&-%rzpz2;lbcNAGhh}FMp!UWp8pFaF)s>?7tK&f%1Z$sGUZY@ z9}alQjFgnT0=$gFX0#N`#{?vz_YjWEA*_(t?GV?WkIx9%=L7c{$ zgT`;L>ud0fpuB)lUc>?kVG)OlQVkUaEX^hI5|)WPo#@=oc`U9!Pa<9B*Vhu7jUZGM zH-XZClW=%W$}1#_BP2GRS?64ZByf}@ag6lfU^iHjD#mbpPP*L`&!=&sixtWB=eeeq z6dmgW0rV|r8!U$#L0#{$suNu z%S1J;VCs&eX%0I`RWJSX!Khm*kCu({h^y1H#2vX%IsED?=MIxm)0iAB&$v^zHTs2V zmPSpR>Z3teoKe~?IL!5R z8EmdOnJ%*JxIb2630SxY*`0<0UHesqs(8Dp|6-_%&UMbG|Mz^Ui*9JtYd=SPk_Y}Z^H7Z9`j@c*98Br$<-DyuvWMz)sq;t+FYiv!^G4>?J zd9q$En=@XVmuavH^1ToYx{;9qAYPcC(ykD;(_wd6;4AH8GLa&W&TQA%3>hf1Bslnyc;E%W_`C+snv#@!7^YwSXfBRl*S=gz?OIZH z(SBxT!m%r~(CSWcR+3sFxD>s=D7!k86QfS@ZpeZPrplF_l 
z0kp7YyXtIRZX2={1P~G?vZ5!sGthgOB4|XRg6Yuhz|$R!sEq6cvNpKA1>MqE%g|e< z&klEkVD3M%`c~w`9qUylY!F3JmtO^NMU5@UuPIc2H5W`Y#5vqqn>IG&A3yP|8)Fgr? ztMS8Uhr0UIDRu<*H=;lQKMG1`YROI)CZEEE{T(h$KIy`Fc;Kw(1APe&Y`L);_^xhD zKAjtrZ-Np!%m@Q!wx3-`mR=&7=Ebxo2!T&%+UFJB{8^FX zZ;1dVUYtL}g4YFQmoCp712VTJ=-bK>-tUgmF<4+}c374AAwdutF(f1(qKywx_E+@c YLv-|4wD(c?u#j2Wap+G3|AC%=0VaI{LjV8( literal 0 HcmV?d00001 diff --git a/lora/layers/__pycache__/row_parallel_linear.cpython-312.pyc b/lora/layers/__pycache__/row_parallel_linear.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3aa9b1b8e6bcdaca8b82617896f8ed1998754905 GIT binary patch literal 7678 zcmb7JTWlNGnVuns_nRn*k}TQs$aX5x5~=zUI~Ucm9Yxy2iqgc{rr@HTP@JJfc{pUx z3@uBg5EpP!3cFoP54)20VWR-Mh+GGV^Ata(PwiqKRF0mW`Op3Qm+#EK`+RN&lFa>WO6_5o|H4cy*{aO$CNS%a$cU`W#Ef4y zZHZY}+%q0}dJ)Jr=T*|7gB6qyOLdfWGj zq>*?^k;GRrN-}i=%G}eLx35%l7QIi+$|+6IBt^{-AueZ>8-g5HH9<_Jb-r4h&WMsM zWaA_*%fJ8?h$Q9`5_A`QJG1cpYRxnZ6el=tB1ed%XhK$wYsm~rf5pO#prZR~T$K#5 z>G+~V-c4z9IG?aZ?-i0cSzZ*>xi}FeQAorU7zH{=NP54ZWVF`x31dqP>aAvWe+Xm# z0!~X}Vywu-EFvpeMT=-nas?*Iirigqj1z5=ZNAo`p%>9&UY_Jc`&};P5FO_Af2N%i z+BrqnU)D})A>X1G>ym?El9nT|63TIKYq*$O@*qeq$X4<~AGayv~H%^dpB3zIK?pdlpU^t`QutZigF~Bg#y|(l9pO zVf^3yXCVK;t}|M_a>bHoehK=PXCLC7fy6R-W}Y+nMmleA%o%*15$cjHT2U=O*b-XR zG#oS=a`m1UnFp2`;sWuoL$|59EUd^W2q^`0Mj!zw*LzY*Hm3<{YDp5H8r4KgX{n5& zyZ38!PbR0;#B`@3sB$VHsbQIP zV~f_-)dh0u7?3~RVV*gCD~_VGXQTHE=c$UF={x?x=>5@B@6}@O)dKf-fie_~-ybja zT`2ZlC~#jmdvRgo!mJ zT>HGIsSvU?Ye%o3?{IzIQnO6U9Q1rxfSLCnE+l>CGZig2C*aa~wP3AHo~!q7Q!III z+16qdyu?jOJ>NnWb-yY7>`1;<&nD8SSPGlBE!#!wvP0=X?%R*G>}Vm28rYOnOWuB+ z$=maeqy;<;7e5Ubcrrs4ATHoDN&b*>x!qI~fm|Us)R*cxEH$p*OVvn*Uzp3N5|056 z4*?DYJz%H#l*+%iZ>rym@~3u(>QvWp*_8ML$Xe_m16$I6Z znn9cdL9?X06B$KI-NN1mA*Xu` zpHGX!PQ%EAQQB{eU5g}jKaD;$i1uiO9;kZBx`3lvpxX_m?niMP6_vVnIEEc|K(LAT zWNF~gEvZE((W(eeNd0dhcbG~)<88Y)zA|3&j6Cs-l#iWUnc1^5{V$f=x;C|s@*n2+ ztd@R%VFnyv|H#(FC({q6i^pCp9h)c~n|OL`l2(8tJbnN4=8vEDo-YK-o`BI`_=zXH zC4M45kRQMGBzoyt*M&##Kkb@YncC@kt}wL|=(zX(%KN3j$dkYbbPmRUd~LkcF<9&v 
z-0nD6u`OPuik8p`#TC|_4Jd#>2k-B z!W%n*(CSaO11GkfCk!{?RWmt~sKsa=>bKy;0U_ue*?8h6oCpN{IayK#Ii+g4*MIHElpu2_YHURGxgrDT%vQb0lO#>?3re#QCPPe~9Arv`@Qy&5X6}hY|k(ZG7(7-M& zLA)js#2dOrQFQwobdL##1srZP!L0->%4B4664j1iH~BKX@dRBB zCAcj08IV8SWhxejt#|w6`3eKSk2}A{pGv#U*0uTkM`It3RTy9&z53bor_;O0R{ZRK z6|l7`U@P{B?PR&DZ(UxKD^}olT>g7USB`GjHlz>a`|{SSs0@$K7e~go`^Sqtmy52; zyEc%k{Fr6z!7?x~my0`HJ?p8pRK*57-19EJ;y}*H^q;7>kaIKLgIk@&u1Lj$Suf-E z|G@{`u9dN$LK58LAWM&kE;j)yt*07-+x;UH{Q|DxKIV%7=99R2lc-;`nwJB{;{Y|; zMZah_mjLu<-lGjsSId6wqVujT=8&9{3vi%&-o*EbZUd2@;8PE@VMznd?4Lpq~JMMssLmkCLDw&js$|p%CZQcy!`PoGt1J;ys1AQB9 zNz1Pes(f5lGgW_updMVa!e5VI^+>(Z9C!{;3y_6(1?nCRGmg$TO1pb!4!q%FN|Hs8 z;uD$lEIwB76hy^kUd^SAIihPNnFXw%j}SaUOxgp*JbYXcp|zY#D@I!cU^pj=y7s%k z=@|nPv{$4JhX z1TSH3hTza&Au+J2Rrga9J}UrJLF)}z753inl zoP|LWXdi0{-9P4c^FUjjl^KqhZ**YQ-pu1uc!j8*GT;by9=%mlg7*T}3|$y^q~4m% zTU)fYesgPRYl2^vxn-;7sZGc16${p6f<#)(m)=}xfhy6Ij9YCs2hoa}khE%b`#Nue zN?=42MDa0Kd$ptp$HW@WW9Ll?FbiTnvsv06R<&x`t|2O?$?VOmW<3#U(6Re#()+)n zo#HZ&hRQ-y-u4jvE+pCp^`|L)(>_rYglgE7@(%i_;e>UW?ZRM_IAp;#Aa*mE1C(O) z(UkH|(ebhKA%^UbmR;X^e_YE}3yfxlvGd&D<3o%Ue2;79tN(*U%@Vl|BS3s-6EVQk z?V@xGp1N+(ftkuUHCXRJy*HCg0(3%)MKs+7=)nL1;gEq-ybx`{3vxAP8in>ccPk|= zkiQ0&Ok&;!XeFZ(c&!I$VZ6Q64WJ__XX4tqvm{1qvpFS|hzkn@JrdE~IGj31Qp!z& zs-?GMteDG+aZSQ$;Hxugx=_t!VKm(`ckA zXhV;p-hpci);SnjKfZRn6pX-|Q7L$?7(DlA;e>5|KJ}g~u;qclkNQ6B+j{#^*JnpRJ^FaIc;?E}fvcZS{$~2u)2sf1{aLVU zy>G3r>CO&_wI5f5G{i|o~ z-RqIHNV%hP{e`s`_8eThyI_Z)FM75Vxm=7~E=8^tBiG8__SGw^TB-YVvHSGaE4x-p z)LrlaFl*yWfwRTH*)Ia;%b^p1e*kTbt&D99Jn{5B3qAiR`84#s!ephL>FRCikh=TU z^K1Fd`C|8X3*PeZXleLTarjbc__gBjYcMeVYb$w=7rn>JUSRr*-v08D=gM8Z6(2M5 zI=jm_M%)Dt!idJ%@P)~p;L%mOBOA*0!Phx6gmVB)p?{$sZ3LL6cX7~n<6S&YA2dbw zW6Q&;9|H}v&P;-zH$hOkQV4aSpN zMkfWOQAMFUp?l#YprYc3I0*U;jM;`?4Wb61kD~1~P^eKXA{0%{=HO+9J`NdgJp;6f z@e&~2mJ&G_Bx-zeh#FrRC@j|fRIc{vPc;mz$M_~hKSmHVYq~9`r2zHQ^Fjb;Kv{YM z=<$q2p&!?sW5#2|C2|Y0=!8{VWyQ*}?5>+(eP1%6FPZS~n8E+CSXs;0PKLeAe#s2~ kj`{9?GH>kJr&;#mHh*!CL1u5x0zdBU!OLG`-q^hV2j!2uIRF3v literal 0 HcmV?d00001 diff --git 
a/lora/layers/__pycache__/utils.cpython-312.pyc b/lora/layers/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4267e6762195427d70859ddf538fa03d50d6b047 GIT binary patch literal 2875 zcmcImO>7fK6rR~#+iSup)9naTH+w?VUzN^t=O`>+#%^Lu+fFA7HyYhOu zcYmpO|JtqtOVYaB{a|ZwRVXryY183|%% zb&qJ=J3*KdKS%8*5U`+e5TwBE%^)!FX714KPlv?JQ$l{K2}p*}WP=6!iS`^e#3mce zZ%?%6uu-oA{4TuGG#hz_4Hg>Sj8GnZfxZ+rA^jmvgy)A(a*ukdKTTAFTr@L8%{o-Q zG)*W`eMfaI9j2I+0Z{dc?K#v-1>sA?JU8tJt<_$0^~`x<1d$81dGU;PVZQUU?d4q8 zp*}HG7DtuGaa2u2g2=S)>Ap`XDp0YAfQU;3hKvi~nJ=zPh&#qp-T2}f>N?%d# zd(_rh-1*w-iPf=U*IR3C<3)L#Be@B%td4NNZUN?Ffb0^=VgoPAS8yJmLJ491VoZf* zkPO_+=fN;D)2eP5)i7lo+b~%Ys!mpggj91LluBY0qc=wjGwWLu-}_7Q{i73w zaN$a+Gg_V0J21d} zpjOYg`=5_TS*2xffu{h0V?Bo@^|YCTSl`L+RMZI+6B6N11=Pdp(who8jlap6`Fo(6jFu?VgeBf?;>ekfiL@9Z=oSZ5pr#3_(8mm5MEU@$ywy-gx-Ot{Uz6#3h`i;?yHAXYh z613Dm-iSxph2Xoyp_%DG=3UCR1HrcWTO5QZ_#;nugBn#CVR+%&P=31l10Kj@)enM0 zlpTO&y!S4U4H0AfyNvMOhbaCN+O`o!LdzdQ1dnVWAQcgY8+_Q&jt}FNE#Gu~)wO{b Xe=GNG{&v2?`QP@nV5QOnguCW%w3wn# literal 0 HcmV?d00001 diff --git a/lora/layers/__pycache__/vocal_parallel_embedding.cpython-312.pyc b/lora/layers/__pycache__/vocal_parallel_embedding.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b82bf8ba88eb8479c4ae1d2666e34448b835b809 GIT binary patch literal 8478 zcmcIJU2GFcn$@=3Y1?hP{gZ@Afc)VE=g0XWJ0U}czz`M$vw<+ou!=_8GVab`$3K~F z2T0av*%PZvbUOkbR$=aCf^^aZG1BnJ=pL4Py3yT>YhcyB){ZXH?P|rF=fdh%_j31D zwc8G%O@wZ*WOsd4_0?BjeO2}SRQ;>h>qd~$j=xLZ+=I}6kb;@m3b0iGU=2w~V$x{V z$eE07*2WmHJ!4A%oN>&uw8oZpW*TN2C~Qx28P}|f!j7~%#2HDUlByqQp;`?(9p ztEHxoa*6om^Km7fPRr?&nai>yC9?!iNGvLfoXv|1>3DuFr(~WpKouGS@5NR5{rHls zTuSDz5NZiyaO3N#6t)fn{@;QmWHif2Xx1h%vR$%C_BqE2nqwr#Eq>PFLQmsn^k=?1L?E@OH#Mi0!)~Tl>&2f?sDD#rP z4wlWje$F8Y(9bLTq=4*CwOjB*W)zeI+h-*Q=iE~0mg8XseKsVArSL6OEjO;9>&$G^ z2PoR4@3&qVQ|7w+GAx6bChMR(VVM_|>~zAmjS;dX$baiFe15~Mp?tM*)mA{C!G<*l zW>A4CAhFuJfS?BcOXvYJ9kuJMy0`$taH5#ZCi9}Gcz_htJ5)J6M+>@36cg#Vs*0jY 
z25oP@`QPUU^O=Rgh2+9OKCXNs=LfDYj*TClACm{u$;*QaOZh9g?BLaOIy0EgDe*x% z6?O0`Y{;~@P}z~9Y;MWe!jd8Y<>j8&3npeV83bgt6+VBtiGI&{R~s<5=NZ>hYB{jt z__hDvI9})YEoer%D-*vi5*}3DLozWz4~x!ca*~{`loAetTS-pHscZ#+)<9M%vt?VJ zs>-ugDKYa?2;E_njyip|k5cKDBBSj{b$W9<(4hE##W#v`%1_Go zYF!;3$#U;V%bK1NR%)n z24#H`S%E5Vv{YS>{96VdLa7!Ls^xInZK=JrvI*6h-+r^bme1Mi;9AL|z1Gi!-k^7< z*{7C&Wit&PNT9mLyUyAAlv=F3wRFg?hnDUdA0`CSu%!-~rCR=1oMod)b+=hc^;X7W z73<8#@&7)XssRTH^_nxQ<*C6MY(jM~hs^d`E-_^DzX86T;4j13d4oO<(&u4CH^MHg zp0_0wWv3O=5Vh&7lwVqqb*Cg>O(tXod>izfHnkX5vopJx5#y31OEp8MiePJr0n8`! z$y`=PWDV~q(?W9Le`J_I6uxc_@(deq`UiC^`k-&OOz=o9nY``J9rtqO;4gl2i3YYE0$p z;DT-^>aF$`rDR4Sl2eG1(zkP-m$Pb40nSa^EGqHryzZtX4XOGl948Y<%*&E)l<9N=~UjyIeN*tCT@G`PEB1IiafDBDemO?h9LNu;Q6j(bXB?N#TN+@zX4}KPL3o6oF z9@)hMvFHQQ0#ZMOrN4>FJmQ7hldF@O=Uwc1_X+dVb9}SA_siV`_KCUpbp1Qt%bid1f3Ba~_f@azrTKDT;K<2y0m zsqsCS?|F&CqVQhXZu9#7vr2Na$NrZljc2U#1**iP;)O7YkhT zDD)bN0at}$IgC2HwT@BTF{*Wp1p*{du)!`3mt&j0~}Qt+<}h9Nqe zy81tbFLm`RE*R8TQ(>E91|qf;A~|Z%d14Y8QRMl0ibuTawYomhQlCR3Rb=#?#1-_^$wB*aag9pbQ=9ezu#KKhvPEf(I z3m?|)CLi_XKswO z=^|V*@!4EfyqwM@=HWI&L$$NR=h;XPWo4~P4- z@GuS!KkC5Y(MNf0Y#NVEYhxGj*u~<9AK|e}IDE+p73|&qAP@~TYr!5I?Ac&&uy^C4 zHt-%Ec<;$D9ynQUuSyz3sX;KeZF;RaV|u)GJh&qE-t z3_>eWiaQe2T3Jd%H(Iw<^|4g7rZB$qwPQOi$9BGEZ0k0wv5EAo)|}|b9jO3(0=C6P zkHmiEd|26;H(dEPvtz2=glf5YNvzrmE+Y9hvoVIKq6Y* z1>qz_nhT5hXi({a7M;CzMOI`Q3@e1LLN&W@VKJLbK)|KM7s$nPXnVA42C>YKbyrn~ zE5swz8{nPd8U(t!0|MO)wF7#Pj7eR-8Zz}w$x+sWiFj6AT!7G8CXA)y5X&}@xKWI& zU~y=Gc#I$hj7T`2n}@(u%*0iQMM)@bROqe>SK@FD9MYTS79lFGM4r{*;g*@!=N9uI z3EjV~KqC9f9kkJ^lBK4WiGFBMs+@tk~tr}4q*;`y242N$32{jkWr1?sb@dF|5OOZR5(&aSwrF7CnM9xZ$X zhmSlyj>AVE|NM!&7(Tn=dTrw$Iy%2>z292my?0!zKcS|EgpV|EDS~a}P;={A?ru(N z8o*5h8}Ucu#mV#d(0Lr2dD?Vg#k1KGS@D-VO`4|_ds^>(`kkk})DqEJdT~oH4Z%9F z(6J7m1M5H6dJf~B!;b^F=Li;#Jif4Hx3&6K&Vs#wc-7O5J>44}-+B5*n( z3Ei5|kA?n?87vGIM^0-a(|Bb1+fVTbRLqowAh6l(EApKXl=csqK^eqjU`v~eZw)>Q zYr|7`c&d2*BG}db3(OX>^}AO*FE}K$YyJ`JA9>~<1%6k~Zu(m^e<${Lu1`MoA1ZQ( 
z4BNu1xkV*mKH`KH=rx82j}D7zSryYsHUCm1LXHyoZykUSJm*wxPc8b$S0j%_^Z`lkf0{Y>h?rhRfxM{+c*w6qz0g?#j=W1w@eb7D1(9^3D zn#N_0LaLV|yG+8ksp+bmS zQOJjAICT#Q=`XiX+2&w>RO}irBgh|1e(V`oH zmYVo{V&edAjcs{o-7%(v9lh81#oo{NmJxs(k+1t7^luTc{9^_=17%wSJF(fcXDxX* zS!MyQ@JJvIoY1n1K<+BH9#Z6i+yC?ev}HF*l!M6W`@zPr-K5UGJCWdHakRZ;dfxB torch.Tensor | list[torch.Tensor | None]: + """Slice lora a if splitting for tensor parallelism.""" + ... + + def slice_lora_b( + self, lora_b: torch.Tensor | list[torch.Tensor | None] + ) -> torch.Tensor | list[torch.Tensor | None]: + """Slice lora b if splitting with tensor parallelism.""" + ... + + def create_lora_weights( + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: PretrainedConfig | None = None, + ) -> None: + """Initializes lora matrices.""" + ... + + def reset_lora(self, index: int): + """Resets the lora weights at index back to 0.""" + ... + + def set_lora( + self, + index: int, + lora_a: torch.Tensor, + lora_b: torch.Tensor, + embeddings_tensor: torch.Tensor | None, + ): + """Overwrites lora tensors at index.""" + ... 
+ + def set_mapping( + self, + punica_wrapper, + ): + self.punica_wrapper: PunicaWrapperBase = punica_wrapper + + @classmethod + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: list, + model_config: PretrainedConfig | None, + ) -> bool: + """Returns True if the layer can be replaced by this LoRA layer.""" + raise NotImplementedError diff --git a/lora/layers/base_linear.py b/lora/layers/base_linear.py new file mode 100644 index 0000000..3db4165 --- /dev/null +++ b/lora/layers/base_linear.py @@ -0,0 +1,164 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import torch +from transformers import PretrainedConfig + +from vllm.config.lora import LoRAConfig +from vllm.distributed.utils import divide +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + LinearBase, + ReplicatedLinear, + RowParallelLinear, +) +from vllm.platforms import current_platform + +from .base import BaseLayerWithLoRA +from .utils import _get_lora_device + + +class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): + def __init__(self, base_layer: LinearBase): + super().__init__() + self.base_layer = base_layer + self.input_size = self.base_layer.input_size + # Ensure tp_size and tp_rank consistency with the base_layer. + self.tp_size = self.base_layer.tp_size + self.tp_rank = self.base_layer.tp_rank + self.device = _get_lora_device(self.base_layer) + self.output_slices: tuple[int, ...] 
+ self.output_size: int + self.n_slices: int + + def create_lora_weights( + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: PretrainedConfig | None = None, + ) -> None: + self.lora_config = lora_config + # + if isinstance(self.base_layer, ReplicatedLinear): + lora_a_out_size = lora_config.max_lora_rank + lora_b_out_size = self.output_size + + elif isinstance(self.base_layer, ColumnParallelLinear): + lora_a_out_size = ( + lora_config.max_lora_rank + if not lora_config.fully_sharded_loras + else divide(lora_config.max_lora_rank, self.tp_size) + ) + lora_b_out_size = self.output_size + + elif isinstance(self.base_layer, RowParallelLinear): + lora_a_out_size = lora_config.max_lora_rank + lora_b_out_size = ( + self.output_size + if not lora_config.fully_sharded_loras + else divide(self.output_size, self.tp_size) + ) + else: + raise NotImplementedError + + self.lora_a_stacked = tuple( + torch.zeros( + max_loras, + 1, + lora_a_out_size, + self.input_size, + dtype=lora_config.lora_dtype, + device=self.device, + ) + for _ in range(self.n_slices) + ) + self.lora_b_stacked = tuple( + torch.zeros( + max_loras, + 1, + lora_b_out_size, + lora_config.max_lora_rank, + dtype=lora_config.lora_dtype, + device=self.device, + ) + for _ in range(self.n_slices) + ) + self.output_slices = (self.lora_b_stacked[0].shape[2],) + + def reset_lora(self, index: int): + for s_index in range(self.n_slices): + self.lora_a_stacked[s_index][index] = 0 + self.lora_b_stacked[s_index][index] = 0 + + def set_lora( + self, + index: int, + lora_a: torch.Tensor, + lora_b: torch.Tensor, + embeddings_tensor: torch.Tensor | None, + ): + # Except for QKVParallelLinearWithLoRA and + # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers + # store weights in a tuple of size 1. These two layers will + # override this function. 
+ assert ( + len(self.lora_a_stacked) == len(self.lora_b_stacked) == self.n_slices == 1 + ) + + self.reset_lora(index) + if self.tp_size > 1: + lora_a = self.slice_lora_a(lora_a) + lora_b = self.slice_lora_b(lora_b) + + self.lora_a_stacked[0][index, 0, : lora_a.shape[0], : lora_a.shape[1]].copy_( + lora_a, non_blocking=True + ) + self.lora_b_stacked[0][index, 0, : lora_b.shape[0], : lora_b.shape[1]].copy_( + lora_b, non_blocking=True + ) + + def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor: + output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + + # In Transformers modeling backend, x and output have extra batch dimension like + # (1, seq_len, hidden_dim), while punica expects (seq_len, hidden_dim), + # therefore we need to flatten the batch dimensions. + if x.ndim == 3 and output.ndim == 3: + output = output.flatten(0, 1) + x = x.flatten(0, 1) + + lora_output: torch.Tensor | None = self.punica_wrapper.add_lora_linear( + output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0, self.output_slices + ) + if not current_platform.can_update_inplace(): + output = lora_output + + return output + + @property + def weight(self) -> torch.Tensor: + # unquantizedLinear + if hasattr(self.base_layer, "weight"): + return self.base_layer.weight + # Compressed Tensor + elif hasattr(self.base_layer, "weight_packed"): + return self.base_layer.weight_packed + # GPTQ/AWQ + elif hasattr(self.base_layer, "qweight"): + return self.base_layer.qweight + # marlin + elif hasattr(self.base_layer, "B"): + return self.base_layer.B + # HQQ marlin + elif hasattr(self.base_layer, "W_q"): + return self.base_layer.W_q + else: + raise ValueError(f"Unsupported base layer: {self.base_layer}") + + @property + def bias(self) -> torch.Tensor | None: + if hasattr(self.base_layer, "bias"): + return self.base_layer.bias + else: + return None diff --git a/lora/layers/column_parallel_linear.py b/lora/layers/column_parallel_linear.py new file mode 100644 index 
0000000..637ded9 --- /dev/null +++ b/lora/layers/column_parallel_linear.py @@ -0,0 +1,578 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import torch +import torch.nn as nn +from transformers import PretrainedConfig + +from vllm.config.lora import LoRAConfig +from vllm.distributed import tensor_model_parallel_all_gather +from vllm.distributed.utils import divide +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + MergedColumnParallelLinear, + QKVParallelLinear, +) +from vllm.platforms import current_platform + +from .base_linear import BaseLinearLayerWithLoRA +from .utils import _fully_sharded_can_replace, _not_fully_sharded_can_replace + + +def _mcp_apply(x, bias, layer: "ColumnParallelLinearWithLoRA"): + """ + For `ColumnParallelLinearWithLoRA` or classes that inherit from + `ColumnParallelLinearWithLoRA`, they share the same `apply` logic. + """ + assert ( + layer.n_slices + == len(layer.lora_a_stacked) + == len(layer.lora_b_stacked) + == len(layer.output_slices) + ) + + output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias) + + x = x.view(-1, x.shape[-1]) + output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape + + # Since communication is needed, the buffer is directly initialized as a + # tensor rather than a tuple of tensor. 
+ buffers = torch.zeros( + (layer.n_slices, x.shape[0], layer.lora_a_stacked[0].shape[2]), + dtype=torch.float32, + device=x.device, + ) + + shrunk_buffers: torch.Tensor | None = layer.punica_wrapper.add_shrink( + buffers, x, layer.lora_a_stacked, 1.0 + ) + + if not current_platform.can_update_inplace(): + buffers = shrunk_buffers + + buffers = tensor_model_parallel_all_gather(buffers) + + lora_output: torch.Tensor | None = layer.punica_wrapper.add_expand( + output, + buffers, + layer.lora_b_stacked, + layer.output_slices, + offset_start=0, + add_input=True, + ) + + if not current_platform.can_update_inplace(): + output = lora_output + + output = output.view(*out_orig_shape) + # now have column partitioned and packed output + return output + + +class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): + """ + LoRA on top of ColumnParallelLinear layer. + LoRA B is sliced for tensor parallelism. + There are two types for the `base_layer`: + 1. ColumnParallelLinear, e.g.`dense_h_to_4h` in `FalconForCausalLM`. + 2. MergedColumnParallelLinear, e.g.`gate_up_proj` in `Phi3ForCausalLM`. + """ + + def __init__(self, base_layer: ColumnParallelLinear) -> None: + super().__init__(base_layer) + # The base_layer type is ColumnParallelLinear or + # MergedColumnParallelLinear, their weight sharding logic is + # inconsistent when TP is greater than 1. + self.is_merged_col_linear = type(base_layer) is MergedColumnParallelLinear + self.output_size = self.base_layer.output_size_per_partition + # There is only one LoRA layer + self.n_slices = 1 + + def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: + return lora_a + + def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: + # Applicable to cases where the base_layer is + # MergedColumnParallelLinear. 
+ if self.is_merged_col_linear: + shard_size = self.output_size // 2 + offset = lora_b.shape[0] // 2 + + left_weight = lora_b[ + self.tp_rank * shard_size : (self.tp_rank + 1) * shard_size, : + ] + right_weight = lora_b[ + offset + self.tp_rank * shard_size : offset + + (self.tp_rank + 1) * shard_size, + :, + ] + lora_b = torch.cat([left_weight, right_weight], dim=0) + # Applicable to cases where the base_layer is + # ColumnParallelLinear. + else: + shard_size = self.output_size + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size + lora_b = lora_b[start_idx:end_idx, :] + return lora_b + + def forward( + self, input_: torch.Tensor + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]: + """Forward of ColumnParallelLinear + + Args: + input_: Tensor whose last dimension is `input_size`. + + Returns: + - output + - bias + """ + bias = self.base_layer.bias if not self.base_layer.skip_bias_add else None + + # Matrix multiply. + output_parallel = self.apply(input_, bias) + if self.base_layer.gather_output and self.tp_size > 1: + # All-gather across the partitions. + output = tensor_model_parallel_all_gather(output_parallel) + else: + output = output_parallel + + if not self.base_layer.return_bias: + return output + + output_bias = self.base_layer.bias if self.base_layer.skip_bias_add else None + return output, output_bias + + @classmethod + @_not_fully_sharded_can_replace + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: list, + model_config: PretrainedConfig | None, + ) -> bool: + return type(source_layer) is ColumnParallelLinear or ( + type(source_layer) is MergedColumnParallelLinear + and len(packed_modules_list) == 1 + ) + + +class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): + """ColumnParallelLinear layer that is composed of 2 sublayers (slices) + packed together (e.g. gate_proj + up_proj -> gate_up_proj). 
+ + This means we have 2 LoRAs, each applied to one half of the layer. + + Both slices must have the same size. + """ + + def __init__( + self, base_layer: MergedColumnParallelLinear | QKVParallelLinear + ) -> None: + super().__init__(base_layer) + # There are two LoRA layers + # the output_sizes in MergedColumnParallelLinear is not sharded by tp + # we need to divide it by the tp_size to get correct slices size + output_sizes = self.base_layer.output_sizes + self.output_slices = tuple( + divide(output_size, self.tp_size) for output_size in output_sizes + ) + self.n_slices = len(self.output_slices) + self.output_ids = (self.tp_rank,) * self.n_slices + + def create_lora_weights( + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: PretrainedConfig | None = None, + ) -> None: + """ + The main reason for overriding this function is to enhance code + maintainability. + """ + self.lora_config = lora_config + + lora_a_output_size_per_partition = ( + lora_config.max_lora_rank + if not lora_config.fully_sharded_loras + else divide(lora_config.max_lora_rank, self.tp_size) + ) + + self.lora_a_stacked = tuple( + torch.zeros( + max_loras, + 1, + lora_a_output_size_per_partition, + self.input_size, + dtype=lora_config.lora_dtype, + device=self.device, + ) + for _ in range(self.n_slices) + ) + self.lora_b_stacked = tuple( + torch.zeros( + max_loras, + 1, + output_size, + lora_config.max_lora_rank, + dtype=lora_config.lora_dtype, + device=self.device, + ) + for output_size in self.output_slices + ) + + def slice_lora_a( + self, lora_a: list[torch.Tensor | None] + ) -> list[torch.Tensor | None]: + return lora_a + + def slice_lora_b( + self, lora_b: list[torch.Tensor | None] + ) -> list[torch.Tensor | None]: + sliced_lora_b = [None] * self.n_slices + for i, (shard_id, shard_size) in enumerate( + zip(self.output_ids, self.output_slices) + ): + if (lora_b_i := lora_b[i]) is not None: + sliced_lora_b[i] = lora_b_i[ + shard_size * shard_id : shard_size * (shard_id + 1), 
class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
    """
    LoRA wrapper for a qkv_proj layer that carries one fused LoRA.

    Some models (for example chatglm3 and baichuan-7b) ship a single LoRA
    for the whole qkv_proj instead of separate q/k/v adapters.

    Under tensor parallelism the rows of lora_b have to be re-partitioned
    for the local rank. The Q shard can have a different number of rows
    than the K and V shards, which always match each other.
    """

    def __init__(self, base_layer: QKVParallelLinear) -> None:
        super().__init__(base_layer)
        qkv = self.base_layer
        # Row counts of the full Q / KV blocks and of this rank's shards.
        self.q_proj_total_size = qkv.total_num_heads * qkv.head_size
        self.q_proj_shard_size = qkv.num_heads * qkv.head_size
        self.kv_proj_shard_size = qkv.num_kv_heads * qkv.head_size
        self.kv_proj_total_size = qkv.total_num_kv_heads * qkv.head_size
        # One fused LoRA only.
        self.n_slices = 1

    def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
        """Return this rank's Q, K and V rows of ``lora_b``, concatenated.

        The shard ids are (re)assigned on the instance on every call.
        """
        self.q_shard_id = self.tp_rank
        self.kv_shard_id = self.tp_rank // self.base_layer.num_kv_head_replicas

        q_rows = self.q_proj_shard_size
        kv_rows = self.kv_proj_shard_size
        # K rows start after the complete Q block, V rows after the
        # complete K block.
        k_base = self.q_proj_total_size
        v_base = k_base + self.kv_proj_total_size

        row_ranges = (
            (q_rows * self.q_shard_id, q_rows * (self.q_shard_id + 1)),
            (
                k_base + kv_rows * self.kv_shard_id,
                k_base + kv_rows * (self.kv_shard_id + 1),
            ),
            (
                v_base + kv_rows * self.kv_shard_id,
                v_base + kv_rows * (self.kv_shard_id + 1),
            ),
        )
        return torch.cat([lora_b[lo:hi, :] for lo, hi in row_ranges], dim=0)

    @classmethod
    @_not_fully_sharded_can_replace
    def can_replace_layer(
        cls,
        source_layer: nn.Module,
        lora_config: LoRAConfig,
        packed_modules_list: list,
        model_config: PretrainedConfig | None,
    ) -> bool:
        """Match exactly ``QKVParallelLinear`` with a single packed module."""
        if type(source_layer) is not QKVParallelLinear:
            return False
        return len(packed_modules_list) == 1
+ """ + + def __init__(self, base_layer: QKVParallelLinear) -> None: + super().__init__(base_layer) + # There are three LoRA layer. + self.n_slices = len(self.base_layer.output_sizes) + + self.q_proj_shard_size = self.base_layer.num_heads * self.base_layer.head_size + self.kv_proj_shard_size = ( + self.base_layer.num_kv_heads * self.base_layer.head_size + ) + self.q_shard_id = self.tp_rank + self.kv_shard_id = self.tp_rank // self.base_layer.num_kv_head_replicas + + self.output_slices = ( + self.q_proj_shard_size, + self.kv_proj_shard_size, + self.kv_proj_shard_size, + ) + self.output_ids = ( + self.q_shard_id, + self.kv_shard_id, + self.kv_shard_id, + ) + + def create_lora_weights( + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: PretrainedConfig | None = None, + ) -> None: + """ + The main reason for overloading this function is to handle inconsistent + weight dimensions in qkv lora. + """ + super().create_lora_weights(max_loras, lora_config, model_config) + + @classmethod + @_not_fully_sharded_can_replace + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: list, + model_config: PretrainedConfig | None, + ) -> bool: + return type(source_layer) is QKVParallelLinear and len(packed_modules_list) == 3 + + +# These following layers are based on the tensor parallelism strategy given in +# Y. Sheng et al., S-LoRA: Serving Thousands of Concurrent LoRA Adapters. 2023, +# https://arxiv.org/abs/2311.03285. + + +class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA): + """ + Differs from ColumnParallelLinearWithLoRA by slicing LoRA A also. + + Based on S-LoRA, slicing happens along the rank dim. + """ + + # For all LoRA layers where the `base_layer` is `ColumnParallelLinear`, + # their `lora_a` and `lora_b` have different sharding patterns. After + # completing the `lora_a` GEMM , a gather operation is performed. 
class MergedColumnParallelLinearWithShardedLoRA(MergedColumnParallelLinearWithLoRA):
    """
    Fully sharded flavour of MergedColumnParallelLinearWithLoRA.

    On top of the parent's behaviour, every LoRA A is sliced too. As in
    S-LoRA, the cut is made along the rank dimension.
    """

    def slice_lora_a(
        self, lora_a: list[torch.Tensor | None]
    ) -> list[torch.Tensor | None]:
        """Keep only this rank's rows of each of the two sub-LoRA A's."""
        # NOTE: two sub-LoRAs are expected and either of them may be None.
        # The shard size of the first stacked buffer is used for both.
        rank_shard = self.lora_a_stacked[0].shape[2]
        first_row = self.tp_rank * rank_shard
        last_row = first_row + rank_shard

        sliced: list[torch.Tensor | None] = []
        for slot in (0, 1):
            sub_lora = lora_a[slot]
            if sub_lora is None:
                sliced.append(None)
            else:
                sliced.append(sub_lora[first_row:last_row, :])
        return sliced

    def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
        """Delegate to the shared ``_mcp_apply`` implementation."""
        return _mcp_apply(x, bias, self)

    @classmethod
    @_fully_sharded_can_replace
    def can_replace_layer(
        cls,
        source_layer: nn.Module,
        lora_config: LoRAConfig,
        packed_modules_list: list,
        model_config: PretrainedConfig | None,
    ) -> bool:
        """Reuse the parent's check, forwarding ``decorate=False``."""
        # Everything is passed by keyword so the decorator can look the
        # arguments up by name.
        return super().can_replace_layer(
            source_layer=source_layer,
            lora_config=lora_config,
            packed_modules_list=packed_modules_list,
            model_config=model_config,
            decorate=False,
        )
class MergedQKVParallelLinearWithShardedLoRA(MergedQKVParallelLinearWithLoRA):
    """
    Fully sharded flavour of MergedQKVParallelLinearWithLoRA.

    On top of the parent's behaviour, every LoRA A is sliced too. As in
    S-LoRA, the cut is made along the rank dimension.
    """

    def slice_lora_a(
        self, lora_a: list[torch.Tensor | None]
    ) -> list[torch.Tensor | None]:
        """Keep only this rank's rows of each of the three sub-LoRA A's."""
        # NOTE: three sub-LoRAs are expected and any of them may be None.
        # Shard sizes are read for all three slots before any slicing.
        rank_shards = [self.lora_a_stacked[slot].shape[2] for slot in range(3)]

        sliced: list[torch.Tensor | None] = []
        for slot, rank_shard in enumerate(rank_shards):
            sub_lora = lora_a[slot]
            if sub_lora is None:
                sliced.append(None)
                continue
            first_row = self.tp_rank * rank_shard
            sliced.append(sub_lora[first_row : first_row + rank_shard, :])
        return sliced

    def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
        """Delegate to the shared ``_mcp_apply`` implementation."""
        return _mcp_apply(x, bias, self)

    @classmethod
    @_fully_sharded_can_replace
    def can_replace_layer(
        cls,
        source_layer: nn.Module,
        lora_config: LoRAConfig,
        packed_modules_list: list,
        model_config: PretrainedConfig | None,
    ) -> bool:
        """Reuse the parent's check, forwarding ``decorate=False``."""
        # Everything is passed by keyword so the decorator can look the
        # arguments up by name.
        return super().can_replace_layer(
            source_layer=source_layer,
            lora_config=lora_config,
            packed_modules_list=packed_modules_list,
            model_config=model_config,
            decorate=False,
        )
class FusedMoEWithLoRA(BaseLayerWithLoRA):
    """LoRA wrapper around a ``FusedMoE`` layer.

    Holds stacked LoRA A/B buffers for the w1, w2 and w3 expert projections
    (indexed by adapter slot and local expert) and hooks the modular
    fused-MoE callables (``forward``, ``activation``, ``moe_sum``) so that
    LoRA contributions are added through ``self.punica_wrapper``.

    NOTE(review): ``self.punica_wrapper`` is never assigned in this class;
    presumably it is provided by ``BaseLayerWithLoRA`` -- confirm.
    """

    def __init__(self, base_layer: FusedMoE) -> None:
        """Wrap ``base_layer`` and install the LoRA hooks immediately.

        Raises:
            AssertionError: if ``base_layer.use_ep`` is truthy (expert
                parallelism is not supported here).
        """
        super().__init__()
        self.base_layer = base_layer

        assert not self.base_layer.use_ep, (
            "EP support for Fused MoE LoRA is not implemented yet."
        )
        self.tp_size = get_tensor_model_parallel_world_size()
        self.tp_rank = get_tensor_model_parallel_rank()
        # LoRA buffers are allocated on the same device as the w2 weight.
        self.device = base_layer.w2_weight.device
        self._inject_lora_into_fused_moe()

    def _normalize_keys(self, config: dict[str, int | None]) -> dict[str, int | None]:
        """Return a copy of ``config`` with kernel-style upper-case keys.

        All-lowercase keys starting with ``"block_"`` become
        ``"BLOCK_SIZE_"`` plus the upper-cased last ``_``-separated segment;
        other all-lowercase keys are upper-cased; keys that are not
        all-lowercase are kept unchanged.
        """
        normalized_config = {}
        for key, value in config.items():
            if key.islower():
                if key.startswith("block_"):
                    normalized_key = "BLOCK_SIZE_" + key.split("_")[-1].upper()
                else:
                    normalized_key = key.upper()
            else:
                normalized_key = key
            normalized_config[normalized_key] = value
        return normalized_config

    def _get_lora_moe_configs(
        self,
        op_prefix: str,
        lora_a_stacked: torch.Tensor,
        lora_b_stacked: torch.Tensor,
        num_slices: int,
        M: int,
        layer: FusedMoE,
        top_k: int,
        config_dtype: str,
    ):
        """Return the ``(shrink_config, expand_config)`` kernel configs.

        When ``envs.VLLM_TUNED_CONFIG_FOLDER`` is set, both configs are
        looked up with ``get_lora_op_configs`` using the op types
        ``fused_moe_lora_{op_prefix}_shrink`` / ``..._expand``. Otherwise
        both come from ``try_get_optimal_moe_config`` for batch size ``M``.
        Either way the keys are passed through ``_normalize_keys``.
        """
        if envs.VLLM_TUNED_CONFIG_FOLDER:
            shrink_config = get_lora_op_configs(
                op_type=f"fused_moe_lora_{op_prefix}_shrink",
                max_loras=lora_a_stacked.shape[0],
                batch=M,
                hidden_size=lora_a_stacked.shape[-1],
                rank=lora_a_stacked.shape[-2],
                num_slices=num_slices,
                moe_intermediate_size=lora_b_stacked.shape[-2],
            )
            expand_config = get_lora_op_configs(
                op_type=f"fused_moe_lora_{op_prefix}_expand",
                max_loras=lora_a_stacked.shape[0],
                batch=M,
                hidden_size=lora_a_stacked.shape[-1],
                rank=lora_a_stacked.shape[-2],
                num_slices=num_slices,
                moe_intermediate_size=lora_b_stacked.shape[-2],
            )
        else:  # fall back to the default config
            get_config_func = functools.partial(
                try_get_optimal_moe_config,
                layer.w13_weight.size(),
                layer.w2_weight.size(),
                top_k,
                config_dtype,
                block_shape=layer.quant_method.moe_quant_config.block_shape,
            )
            # The same default lookup is used for both phases.
            shrink_config = get_config_func(M)
            expand_config = get_config_func(M)
        shrink_config = self._normalize_keys(shrink_config)
        expand_config = self._normalize_keys(expand_config)
        return shrink_config, expand_config

    def _inject_lora_into_fused_moe(self):
        """Build a modular fused-MoE callable and wrap it with LoRA hooks.

        Three wrappers share the closure dict ``moe_state_dict``:

        * the ``forward`` wrapper records the routing inputs,
        * the ``activation`` wrapper adds the w1/w3 LoRA before calling the
          original activation and records the token/expert alignment,
        * the ``moe_sum`` wrapper adds the w2 LoRA before calling the
          original ``moe_sum``.

        Finally ``base_layer.quant_method`` is replaced by a
        ``FusedMoEModularMethod`` built from the previous quant method and
        the wrapped callable.
        """
        # Scratch state handed from one wrapper to the next within a call.
        moe_state_dict = {}
        top_k = self.base_layer.top_k

        self.base_layer.ensure_moe_quant_config_init()
        quant_config = self.base_layer.quant_method.moe_quant_config

        # Triton path by default; the marlin path when the quant config
        # reports mxfp4 w4a16.
        m_fused_moe_fn = (
            modular_triton_fused_moe(
                quant_config, shared_experts=self.base_layer.shared_experts
            )
            if not quant_config.use_mxfp4_w4a16
            else modular_marlin_fused_moe(
                quant_config, shared_experts=self.base_layer.shared_experts
            )
        )

        def fwd_decorator(layer, func):
            def wrapper(*args, **kwargs):
                # These values are read from kwargs only; a caller passing
                # them positionally would raise KeyError here.
                moe_state_dict["hidden_states"] = kwargs["hidden_states"]
                moe_state_dict["topk_ids"] = kwargs["topk_ids"]
                moe_state_dict["topk_weights"] = kwargs["topk_weights"]
                moe_state_dict["expert_map"] = kwargs["expert_map"]
                moe_state_dict["apply_router_weight_on_input"] = kwargs[
                    "apply_router_weight_on_input"
                ]
                result = func(*args, **kwargs)
                return result

            return wrapper

        def act_decorator(layer, func):
            def wrapper(*args, **kwargs):
                # Exactly three positional arguments are expected; the
                # first one is ignored.
                _, output, input = args

                hidden_states = moe_state_dict["hidden_states"]
                topk_weights = moe_state_dict["topk_weights"]
                curr_topk_ids = moe_state_dict["topk_ids"]

                expert_map = moe_state_dict["expert_map"]

                config_dtype = _get_config_dtype_str(
                    dtype=hidden_states.dtype,
                    use_fp8_w8a8=False,
                    use_int8_w8a16=False,
                    use_int4_w4a16=False,
                )
                CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE
                num_tokens = hidden_states.size(0)
                # Batch size used for config lookup is capped at the chunk size.
                M = min(num_tokens, CHUNK_SIZE)

                shrink_config, expand_config = self._get_lora_moe_configs(
                    op_prefix="w13",
                    lora_a_stacked=self.w1_lora_a_stacked,
                    lora_b_stacked=self.w1_lora_b_stacked,
                    num_slices=2,
                    M=M,
                    layer=layer,
                    top_k=top_k,
                    config_dtype=config_dtype,
                )

                # get the block size of m from customized config or default config
                max_loras = self.w1_lora_a_stacked.shape[0]
                (
                    sorted_token_ids_lora,
                    expert_ids_lora,
                    num_tokens_post_padded_lora,
                ) = self.punica_wrapper.moe_lora_align_block_size(
                    curr_topk_ids,
                    num_tokens,
                    shrink_config["BLOCK_SIZE_M"],
                    self.base_layer.local_num_experts,
                    max_loras,
                    self.adapter_enabled,
                    expert_map,
                )

                # Stored un-reshaped so the moe_sum wrapper can reuse them.
                moe_state_dict["sorted_token_ids_lora"] = sorted_token_ids_lora
                moe_state_dict["expert_ids_lora"] = expert_ids_lora
                moe_state_dict["num_tokens_post_padded_lora"] = (
                    num_tokens_post_padded_lora
                )

                w13_lora_a_stacked = [self.w1_lora_a_stacked, self.w3_lora_a_stacked]
                w13_lora_b_stacked = [self.w1_lora_b_stacked, self.w3_lora_b_stacked]
                max_lora_rank = self.w1_lora_a_stacked.shape[-2]
                expert_ids_lora = expert_ids_lora.view(max_loras, -1)
                sorted_token_ids_lora = sorted_token_ids_lora.view(max_loras, -1)

                # The LoRA contribution is added before the original
                # activation runs.
                self.punica_wrapper.add_lora_fused_moe(
                    input.view(-1, top_k, input.shape[-1]),
                    hidden_states,
                    w13_lora_a_stacked,
                    w13_lora_b_stacked,
                    topk_weights,
                    sorted_token_ids_lora,
                    expert_ids_lora,
                    num_tokens_post_padded_lora,
                    max_lora_rank,
                    top_k,
                    shrink_config,  # config for the shrink phase
                    expand_config,  # config for the expand phase
                    self.adapter_enabled,
                )

                result = func(*args, **kwargs)

                # Kept for the moe_sum wrapper, which passes it on as the
                # input of the w2 LoRA.
                moe_state_dict["intermediate_cache2"] = output
                return result

            return wrapper

        def moe_sum_decorator(layer, func):
            def wrapper(*args, **kwargs):
                hidden_states = moe_state_dict["hidden_states"]
                topk_weights = moe_state_dict["topk_weights"]

                config_dtype = _get_config_dtype_str(
                    dtype=hidden_states.dtype,
                    use_fp8_w8a8=False,
                    use_int8_w8a16=False,
                    use_int4_w4a16=False,
                )
                CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE
                num_tokens = hidden_states.size(0)
                M = min(num_tokens, CHUNK_SIZE)

                shrink_config, expand_config = self._get_lora_moe_configs(
                    op_prefix="w2",
                    lora_a_stacked=self.w2_lora_a_stacked,
                    lora_b_stacked=self.w2_lora_b_stacked,
                    num_slices=1,
                    M=M,
                    layer=layer,
                    top_k=top_k,
                    config_dtype=config_dtype,
                )

                # Alignment data produced earlier by the activation wrapper.
                sorted_token_ids_lora = moe_state_dict["sorted_token_ids_lora"]
                expert_ids_lora = moe_state_dict["expert_ids_lora"]
                num_tokens_post_padded_lora = moe_state_dict[
                    "num_tokens_post_padded_lora"
                ]
                max_loras = self.w1_lora_a_stacked.shape[0]
                expert_ids_lora = expert_ids_lora.view(max_loras, -1)
                sorted_token_ids_lora = sorted_token_ids_lora.view(max_loras, -1)
                intermediate_cache2 = moe_state_dict["intermediate_cache2"]
                intermediate_cache3 = args[0]
                # The rank is read from the w1 buffer; every buffer is
                # allocated with the same max_lora_rank.
                max_lora_rank = self.w1_lora_a_stacked.shape[-2]
                self.punica_wrapper.add_lora_fused_moe(
                    intermediate_cache3,
                    intermediate_cache2,
                    [self.w2_lora_a_stacked],
                    [self.w2_lora_b_stacked],
                    topk_weights,
                    sorted_token_ids_lora,
                    expert_ids_lora,
                    num_tokens_post_padded_lora,
                    max_lora_rank,
                    top_k,
                    shrink_config,  # config for the shrink phase
                    expand_config,  # config for the expand phase
                    self.adapter_enabled,
                    # NOTE(review): extra positional flag only passed on the
                    # w2 path; its meaning is defined by add_lora_fused_moe
                    # -- confirm.
                    True,
                )

                result = func(*args, **kwargs)
                return result

            return wrapper

        fused_experts = m_fused_moe_fn.fused_experts

        # Monkey-patch the three hook points on the freshly built objects.
        m_fused_moe_fn.forward = fwd_decorator(self.base_layer, m_fused_moe_fn.forward)
        fused_experts.activation = act_decorator(
            self.base_layer, fused_experts.activation
        )
        fused_experts.moe_sum = moe_sum_decorator(
            self.base_layer, fused_experts.moe_sum
        )

        self.base_layer.quant_method = FusedMoEModularMethod(
            self.base_layer.quant_method, m_fused_moe_fn
        )

    def create_lora_weights(
        self,
        max_loras: int,
        lora_config: LoRAConfig,
        model_config: PretrainedConfig | None = None,
    ) -> None:
        """Initializes lora matrices.

        Allocates zero-filled A/B buffers for w1, w2 and w3 with leading
        dimensions ``(max_loras, local_num_experts)``, plus the
        ``adapter_enabled`` flag tensor of length ``max_loras + 1``.
        """

        self.adapter_enabled = torch.tensor(
            [0] * (max_loras + 1), dtype=torch.int, device=self.device
        )

        # w1: hidden_size -> rank -> intermediate_size_per_partition
        self.w1_lora_a_stacked = torch.zeros(
            (
                max_loras,
                self.base_layer.local_num_experts,
                lora_config.max_lora_rank,
                self.base_layer.hidden_size,
            ),
            dtype=lora_config.lora_dtype,
            device=self.device,
        )
        self.w1_lora_b_stacked = torch.zeros(
            (
                max_loras,
                self.base_layer.local_num_experts,
                self.base_layer.intermediate_size_per_partition,
                lora_config.max_lora_rank,
            ),
            dtype=lora_config.lora_dtype,
            device=self.device,
        )

        # w2: intermediate_size_per_partition -> rank -> hidden_size
        self.w2_lora_a_stacked = torch.zeros(
            (
                max_loras,
                self.base_layer.local_num_experts,
                lora_config.max_lora_rank,
                self.base_layer.intermediate_size_per_partition,
            ),
            dtype=lora_config.lora_dtype,
            device=self.device,
        )
        self.w2_lora_b_stacked = torch.zeros(
            (
                max_loras,
                self.base_layer.local_num_experts,
                self.base_layer.hidden_size,
                lora_config.max_lora_rank,
            ),
            dtype=lora_config.lora_dtype,
            device=self.device,
        )

        # w3: same shapes as w1
        self.w3_lora_a_stacked = torch.zeros(
            (
                max_loras,
                self.base_layer.local_num_experts,
                lora_config.max_lora_rank,
                self.base_layer.hidden_size,
            ),
            dtype=lora_config.lora_dtype,
            device=self.device,
        )
        self.w3_lora_b_stacked = torch.zeros(
            (
                max_loras,
                self.base_layer.local_num_experts,
                self.base_layer.intermediate_size_per_partition,
                lora_config.max_lora_rank,
            ),
            dtype=lora_config.lora_dtype,
            device=self.device,
        )

        # They will be used by 'LoRALayerWeights.create_dummy_lora_weights'
        # to create a dummy LoRA weights.
        # Flat lists of per-(adapter, expert) entries, three per expert in
        # w1, w2, w3 order.
        self.lora_a_stacked = []
        self.lora_b_stacked = []
        for lora_id in range(max_loras):
            for experts_id in range(self.base_layer.local_num_experts):
                # gate_proj,down_proj,up_proj
                self.lora_a_stacked.append(self.w1_lora_a_stacked[lora_id][experts_id])
                self.lora_a_stacked.append(self.w2_lora_a_stacked[lora_id][experts_id])
                self.lora_a_stacked.append(self.w3_lora_a_stacked[lora_id][experts_id])

                self.lora_b_stacked.append(self.w1_lora_b_stacked[lora_id][experts_id])
                self.lora_b_stacked.append(self.w2_lora_b_stacked[lora_id][experts_id])
                self.lora_b_stacked.append(self.w3_lora_b_stacked[lora_id][experts_id])

    def reset_lora(self, index: int):
        """Resets the lora weights at index back to 0.

        Also clears the ``adapter_enabled`` flag for that slot.
        """
        self.w1_lora_a_stacked[index] = 0
        self.w1_lora_b_stacked[index] = 0
        self.w3_lora_a_stacked[index] = 0
        self.w3_lora_b_stacked[index] = 0
        self.w2_lora_a_stacked[index] = 0
        self.w2_lora_b_stacked[index] = 0
        self.adapter_enabled[index] = 0

    def set_lora(
        self,
        index: int,
        lora_a: torch.Tensor,
        lora_b: torch.Tensor,
        embeddings_tensor: torch.Tensor | None,
        bias: torch.Tensor | None = None,
    ):
        """Overwrites lora tensors at index.

        ``lora_a`` and ``lora_b`` are indexed as flat sequences holding
        three entries per expert in w1, w2, w3 order (despite the
        ``torch.Tensor`` annotation). ``embeddings_tensor`` and ``bias`` are
        not used by this method.
        """
        self.reset_lora(index)
        # The slot is marked enabled even if every expert is skipped below.
        self.adapter_enabled[index] = 1
        for eid in range(len(lora_a) // 3):
            w1_lora_a = lora_a[eid * 3]
            w2_lora_a = lora_a[eid * 3 + 1]
            w3_lora_a = lora_a[eid * 3 + 2]
            w1_lora_b = lora_b[eid * 3]
            w2_lora_b = lora_b[eid * 3 + 1]
            w3_lora_b = lora_b[eid * 3 + 2]

            # Handle the case of adding LoRA to only a subset of experts
            if w1_lora_a is None or w2_lora_a is None or w3_lora_a is None:
                continue

            if self.tp_size > 1:
                # Only the tensors that touch the intermediate dimension are
                # sharded: rows of w1/w3 lora_b and columns of w2 lora_a.
                shard_size = self.base_layer.intermediate_size_per_partition
                start_idx = self.tp_rank * shard_size
                end_idx = (self.tp_rank + 1) * shard_size

                w1_lora_b = w1_lora_b[start_idx:end_idx, :]
                w3_lora_b = w3_lora_b[start_idx:end_idx, :]
                w2_lora_a = w2_lora_a[:, start_idx:end_idx]

            # Each tensor is copied into the top-left corner of its slot;
            # the remainder of the slot stays zero from reset_lora.
            self.w1_lora_a_stacked[
                index, eid, : w1_lora_a.shape[0], : w1_lora_a.shape[1]
            ].copy_(w1_lora_a, non_blocking=True)

            self.w3_lora_a_stacked[
                index, eid, : w3_lora_a.shape[0], : w3_lora_a.shape[1]
            ].copy_(w3_lora_a, non_blocking=True)

            self.w2_lora_b_stacked[
                index, eid, : w2_lora_b.shape[0], : w2_lora_b.shape[1]
            ].copy_(w2_lora_b, non_blocking=True)

            self.w1_lora_b_stacked[
                index, eid, : w1_lora_b.shape[0], : w1_lora_b.shape[1]
            ].copy_(w1_lora_b, non_blocking=True)
            self.w3_lora_b_stacked[
                index, eid, : w3_lora_b.shape[0], : w3_lora_b.shape[1]
            ].copy_(w3_lora_b, non_blocking=True)
            self.w2_lora_a_stacked[
                index, eid, : w2_lora_a.shape[0], : w2_lora_a.shape[1]
            ].copy_(w2_lora_a, non_blocking=True)

    @classmethod
    def can_replace_layer(
        cls,
        source_layer: nn.Module,
        lora_config: LoRAConfig,
        packed_modules_list: list,
        model_config: PretrainedConfig | None,
    ) -> bool:
        """Returns True if the layer can be replaced by this LoRA layer."""
        # isinstance (not an exact type check) so FusedMoE subclasses match.
        return isinstance(source_layer, FusedMoE)

    def forward(self, *args, **kwargs):
        """Forward all arguments to the wrapped layer's ``forward``."""
        return self.base_layer.forward(*args, **kwargs)

    def maybe_all_reduce_tensor_model_parallel(self, *args, **kwargs):
        """Forward all arguments to the wrapped layer's method of the same name."""
        return self.base_layer.maybe_all_reduce_tensor_model_parallel(*args, **kwargs)

    @property
    def _shared_experts(self):
        """Pass-through to ``base_layer._shared_experts``."""
        return self.base_layer._shared_experts

    @property
    def quant_method(self):
        """Pass-through to ``base_layer.quant_method``."""
        return self.base_layer.quant_method

    @property
    def is_internal_router(self) -> bool:
        """Pass-through to ``base_layer.is_internal_router``."""
        return self.base_layer.is_internal_router
class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
    """
    LoRA wrapper for LogitsProcessor, with extra logic to handle the
    application of the LoRA adapter and added LoRA vocabulary.

    Args:
        base_layer: LogitsProcessor layer
        hidden_size: hidden size of the model
        dtype: data type of the model
        device: device of the model
        sharded_to_full_mapping: index mapping from sharded vocab to full vocab
            received from base_layer.get_sharded_to_full_mapping(). If None,
            no reindexing will be done.
    """

    def __init__(
        self,
        base_layer: LogitsProcessor,
        hidden_size: int,
        dtype: torch.dtype,
        device: torch.device,
        sharded_to_full_mapping: list[int] | None,
    ) -> None:
        super().__init__()
        self.base_layer = base_layer
        self.hidden_size = hidden_size
        self.dtype = dtype
        self.device = device
        self.tp_size = get_tensor_model_parallel_world_size()
        self.tp_rank = get_tensor_model_parallel_rank()
        self.sharded_to_full_mapping = sharded_to_full_mapping

    # The properties below simply mirror attributes of the wrapped layer so
    # that this wrapper can stand in for it.

    @property
    def logits_as_input(self):
        return self.base_layer.logits_as_input

    @property
    def vocab_size(self):
        return self.base_layer.vocab_size

    @property
    def scale(self):
        return self.base_layer.scale

    @property
    def soft_cap(self):
        return self.base_layer.soft_cap

    @property
    def use_all_gather(self):
        return self.base_layer.use_all_gather

    @property
    def org_vocab_size(self):
        return self.base_layer.org_vocab_size

    @property
    def include_gpu_probs_tensor(self):
        return self.base_layer.include_gpu_probs_tensor

    @property
    def should_modify_greedy_probs_inplace(self):
        return self.base_layer.should_modify_greedy_probs_inplace

    def create_lora_weights(
        self,
        max_loras: int,
        lora_config: LoRAConfig,
        model_config: PretrainedConfig | None = None,
    ) -> None:
        """Allocate the stacked LoRA A/B buffers and extra-vocab embeddings.

        Raises:
            ValueError: if the base layer's vocab size exceeds 257024.
        """
        # TODO: Verify if this condition can be further relaxed
        # The previous check was the chained comparison
        # `32000 < vocab_size > 257024`, which is true exactly when
        # `vocab_size > 257024`. It is now written out explicitly, and the
        # message states the bound that is actually enforced (no lower
        # bound was ever applied).
        max_vocab_size = 257024
        if self.base_layer.vocab_size > max_vocab_size:
            raise ValueError(
                f"When using LoRA, vocab size must be <= {max_vocab_size}"
            )
        self.lora_a_stacked = torch.zeros(
            (
                max_loras,
                1,
                lora_config.max_lora_rank,
                self.hidden_size,
            ),
            dtype=lora_config.lora_dtype,
            device=self.device,
        )
        self.lora_b_stacked = torch.zeros(
            (
                max_loras,
                1,
                # Pad for kernel compatibility
                math.ceil(
                    self.base_layer.vocab_size / lora_config.lora_vocab_padding_size
                )
                * lora_config.lora_vocab_padding_size,
                lora_config.max_lora_rank,
            ),
            dtype=lora_config.lora_dtype,
            device=self.device,
        )
        # Unused extra-vocab rows stay at -inf.
        self.embeddings_tensors = torch.full(
            (max_loras, lora_config.lora_extra_vocab_size, self.hidden_size),
            fill_value=float("-inf"),
            dtype=self.dtype,
            device=self.device,
        )
        if self.sharded_to_full_mapping is not None:
            self.sharded_to_full_mapping_gpu = torch.tensor(
                self.sharded_to_full_mapping, device=self.device, dtype=torch.long
            )
        else:
            self.sharded_to_full_mapping_gpu = None

    def reset_lora(self, index: int):
        """Clear slot ``index``: zero A/B and set the embeddings to -inf."""
        self.lora_a_stacked[index] = 0
        self.lora_b_stacked[index] = 0
        self.embeddings_tensors[index] = float("-inf")

    def set_lora(
        self,
        index: int,
        lora_a: torch.Tensor,
        lora_b: torch.Tensor,
        embeddings_tensor: torch.Tensor | None,
    ):
        """Reset slot ``index`` and load the given tensors into it.

        Each tensor is copied into the top-left corner of its slot, so
        tensors smaller than the allocated buffers are accepted.
        """
        self.reset_lora(index)
        self.lora_a_stacked[index, 0, : lora_a.shape[0], : lora_a.shape[1]].copy_(
            lora_a, non_blocking=True
        )
        self.lora_b_stacked[index, 0, : lora_b.shape[0], : lora_b.shape[1]].copy_(
            lora_b, non_blocking=True
        )
        if embeddings_tensor is not None:
            self.embeddings_tensors[
                index,
                : embeddings_tensor.shape[0],
                : embeddings_tensor.shape[1],
            ] = embeddings_tensor

    def _get_logits(
        self,
        hidden_states: torch.Tensor,
        lm_head: VocabParallelEmbedding,
        embedding_bias: torch.Tensor | None = None,
    ) -> torch.Tensor | None:
        """Compute logits including the LoRA and extra-vocab contributions.

        Returns:
            The logits truncated to ``base_layer.vocab_size`` columns, or
            ``None`` when ``base_layer._gather_logits`` returns ``None``.
        """
        # Get the logits for the next tokens.
        logits = lm_head.quant_method.apply(lm_head, hidden_states)
        if embedding_bias is not None:
            logits += embedding_bias

        # Gather logits for TP
        logits = self.base_layer._gather_logits(logits)

        if logits is None:
            return None

        if self.sharded_to_full_mapping_gpu is not None:
            # Reindex full logits tensor to ensure 1:1 mapping between
            # index and token_id
            # Example for:
            #   org_vocab_size = 4
            #   added_vocab_size = 2
            #   pad_to_size = 8
            #   tp_size = 2

            # indices:  [0, 1, 2,  3, 4, 5, 6,  7]
            # token_id: [0, 1, 4, -1, 2, 3, 5, -1]

            # Therefore, the mapping is expected to be:
            # [0, 1, 4, 6, 2, 3, 5, 7] so that when we reindex,
            # we get:
            # indices:  [0, 1, 2, 3, 4, 5,  6,  7]
            # token_id: [0, 1, 2, 3, 4, 5, -1, -1]
            logits = logits[:, self.sharded_to_full_mapping_gpu]

        # One extra leading entry is allocated beyond the adapter slots; it
        # is filled with -inf below.
        lora_logits = torch.empty(
            self.embeddings_tensors.shape[0] + 1,
            self.embeddings_tensors.shape[1],
            hidden_states.shape[0],
            dtype=self.embeddings_tensors.dtype,
            device=self.embeddings_tensors.device,
        )
        torch.matmul(self.embeddings_tensors, hidden_states.T, out=lora_logits[:-1])

        neg_inf, pos_inf = current_platform.get_infinity_values(lora_logits.dtype)

        lora_logits[-1] = neg_inf
        lora_logits = lora_logits.mT
        indices_padded = self.punica_wrapper.sampler_indices_padded

        if current_platform.is_tpu() or current_platform.is_xpu():
            indices_padded = indices_padded[: logits.size(0)]

        # Flatten the first two dimensions, pick one row per index, and
        # replace NaN / infinities with the platform's values.
        lora_logits = (
            lora_logits.reshape(
                lora_logits.shape[0] * lora_logits.shape[1],
                lora_logits.shape[2],
            )
            .index_select(0, indices_padded)
            .nan_to_num_(nan=neg_inf, posinf=pos_inf, neginf=neg_inf)
        )

        # The extra-vocab logits occupy the columns right after the
        # original vocabulary.
        logits[
            :,
            self.base_layer.org_vocab_size : self.base_layer.org_vocab_size
            + lora_logits.shape[1],
        ] = lora_logits

        lora_output: torch.Tensor | None = self.punica_wrapper.add_lora_logits(
            logits, hidden_states, self.lora_a_stacked, self.lora_b_stacked, 1.0
        )

        # On platforms that cannot update in place, the returned tensor
        # carries the result.
        if not current_platform.can_update_inplace():
            logits = lora_output

        # Remove paddings in vocab (if any).
        logits = logits[:, : self.base_layer.vocab_size]
        return logits

    def forward(self, *args, **kwargs):
        """Run the base layer class's ``forward`` with this wrapper as ``self``.

        This way the overridden ``_get_logits`` above is the one used.
        """
        return type(self.base_layer).forward(self, *args, **kwargs)

    @classmethod
    def can_replace_layer(
        cls,
        source_layer: nn.Module,
        lora_config: LoRAConfig,
        packed_modules_list: list,
        model_config: PretrainedConfig | None,
    ) -> bool:
        """Always ``False``: this wrapper is never chosen by the generic scan."""
        # Special handling for the LogitsProcessor.
        return False
+ output = self.apply(input_, bias) + + output_bias = self.base_layer.bias if self.base_layer.skip_bias_add else None + + if not self.base_layer.return_bias: + return output + + return output, output_bias + + # ReplicatedLinear should always be replaced, regardless of the fully + # sharded LoRAs setting, because it is, by definition, copied per GPU. + @classmethod + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: list, + model_config: PretrainedConfig | None, + ) -> bool: + return type(source_layer) is ReplicatedLinear + + def slice_lora_a( + self, lora_a: torch.Tensor | list[torch.Tensor | None] + ) -> torch.Tensor | list[torch.Tensor | None]: + """Slice lora a if splitting for tensor parallelism.""" + return lora_a + + def slice_lora_b( + self, lora_b: torch.Tensor | list[torch.Tensor | None] + ) -> torch.Tensor | list[torch.Tensor | None]: + """Slice lora b if splitting with tensor parallelism.""" + return lora_b diff --git a/lora/layers/row_parallel_linear.py b/lora/layers/row_parallel_linear.py new file mode 100644 index 0000000..2ef1bd9 --- /dev/null +++ b/lora/layers/row_parallel_linear.py @@ -0,0 +1,181 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import torch +import torch.nn as nn +from transformers import PretrainedConfig + +from vllm.config.lora import LoRAConfig +from vllm.distributed import ( + split_tensor_along_last_dim, + tensor_model_parallel_all_reduce, +) +from vllm.model_executor.layers.linear import RowParallelLinear +from vllm.platforms import current_platform + +from .base_linear import BaseLinearLayerWithLoRA +from .utils import _fully_sharded_can_replace, _not_fully_sharded_can_replace + + +class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): + def __init__(self, base_layer: RowParallelLinear) -> None: + super().__init__(base_layer) + + # reset input_size + self.input_size = 
self.base_layer.input_size_per_partition + self.output_size = self.base_layer.output_size + # There is only one LoRA layer. + self.n_slices = 1 + + def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: + shard_size = self.input_size + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size + lora_a = lora_a[:, start_idx:end_idx] + return lora_a + + def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: + return lora_b + + def forward( + self, input_: torch.Tensor + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]: + """Forward of RowParallelLinear + + Args: + input_: tensor whose last dimension is `input_size`. If + `input_is_parallel` is set, then the last dimension + is `input_size // tp_size`. + + Returns: + - output + - bias + """ + # set up backprop all-reduce. + if self.base_layer.input_is_parallel: + input_parallel = input_ + else: + # TODO: simplify code below + splitted_input = split_tensor_along_last_dim( + input_, num_partitions=self.tp_size + ) + input_parallel = splitted_input[self.tp_rank].contiguous() + + # Matrix multiply. + output_parallel = self.apply(input_parallel) + if self.base_layer.reduce_results and self.tp_size > 1: + output_ = tensor_model_parallel_all_reduce(output_parallel) + else: + output_ = output_parallel + + if not self.base_layer.skip_bias_add: + output = ( + output_ + self.base_layer.bias + if self.base_layer.bias is not None + else output_ + ) + output_bias = None + else: + output = output_ + output_bias = self.base_layer.bias + + if not self.base_layer.return_bias: + return output + + return output, output_bias + + @classmethod + @_not_fully_sharded_can_replace + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: list, + model_config: PretrainedConfig | None, + ) -> bool: + return type(source_layer) is RowParallelLinear + + +# The following layer is based on the tensor parallelism strategy given in +# Y. 
class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
    """
    Differs from RowParallelLinearWithLoRA by slicing the
    LoRA B's also.

    Based on S-LoRA, slicing happens along the output dim.
    This yields a combined partial sum from the row parallel base
    layer and column partitioned output from the LoRA.
    """

    def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
        """Return this rank's shard of `lora_b`, taken along dim 0.

        The shard size is read from `self.lora_b_stacked[0].shape[2]` and the
        rows `[tp_rank * shard_size, (tp_rank + 1) * shard_size)` are kept.
        """
        shard_size = self.lora_b_stacked[0].shape[2]
        start_idx = self.tp_rank * shard_size
        end_idx = (self.tp_rank + 1) * shard_size
        lora_b = lora_b[start_idx:end_idx, :]
        return lora_b

    def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
        """Run the base layer and add the sharded LoRA contribution.

        Args:
            x: Input tensor; it is flattened to 2-D on its last dimension
                before the LoRA kernels are called.
            bias: Not used by this override; the base quant method is called
                without a bias.

        Returns:
            The base-layer output with the LoRA output added into the slice
            starting at this rank's offset, reshaped to the base output's
            original shape. As the comment below states, the result still has
            to be reduced by the caller.
        """
        output = self.base_layer.quant_method.apply(self.base_layer, x)

        x = x.view(-1, x.shape[-1])
        # The right-hand side is evaluated first, so `out_orig_shape` is the
        # shape of `output` before it is flattened.
        output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape
        # float32 scratch buffer that receives the LoRA-A (shrink) result.
        buffer = torch.zeros(
            (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]),
            dtype=torch.float32,
            device=x.device,
        )

        shrunk_buffer: torch.Tensor | None = self.punica_wrapper.add_shrink(
            buffer, x, self.lora_a_stacked, 1.0
        )
        # On platforms that cannot update `buffer` in place, use the returned
        # tensor instead.
        if not current_platform.can_update_inplace():
            buffer = shrunk_buffer
        if self.tp_size > 1:
            buffer = tensor_model_parallel_all_reduce(buffer)

        # following S-LoRA, allows the fusing of all_gather and all_reduce
        # by adding the column partitioned lora output to a slice of output
        # tensor, which is a partial sum due to row parallel. All that
        # remains is a standard all_reduce. User should be aware though that
        # the output is not the same as a normal row_parallel, it should be
        # reduced before being used
        # NOTE offset are based on the rank.
        shard_size = self.lora_b_stacked[0].shape[2]
        offset_start = self.tp_rank * shard_size
        lora_output: torch.Tensor | None = self.punica_wrapper.add_expand(
            output,
            buffer,
            self.lora_b_stacked,
            self.output_slices,
            offset_start=offset_start,
            add_input=True,
        )

        # Same in-place fallback as for the shrink step above.
        if not current_platform.can_update_inplace():
            output = lora_output

        output = output.view(*out_orig_shape)
        return output

    @classmethod
    @_fully_sharded_can_replace
    def can_replace_layer(
        cls,
        source_layer: nn.Module,
        lora_config: LoRAConfig,
        packed_modules_list: list,
        model_config: PretrainedConfig | None,
    ) -> bool:
        """Defer to the parent check; the decorator adds the fully-sharded condition.

        `decorate=False` is forwarded so the parent's
        `_not_fully_sharded_can_replace` wrapper skips its own (opposite)
        condition.
        """
        # specifying kwargs so they can be easily accessed in decorator
        return super().can_replace_layer(
            source_layer=source_layer,
            lora_config=lora_config,
            packed_modules_list=packed_modules_list,
            model_config=model_config,
            decorate=False,
        )
def _get_lora_device(base_layer: nn.Module) -> torch.device:
    """Return the device on which to place the LoRA tensors for `base_layer`.

    The device is read from the first weight-like attribute found on the
    layer, probed in a fixed order.

    Args:
        base_layer: The layer that is being wrapped with LoRA.

    Returns:
        The `.device` of the first matching weight attribute.

    Raises:
        ValueError: If the layer has none of the known weight attributes.
    """
    # code borrowed from https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/vllm/lora/layers.py#L34
    weight_attrs = (
        "weight",  # unquantizedLinear
        "weight_packed",  # Compressed Tensor
        "qweight",  # GPTQ/AWQ
        "W_q",  # HQQ marlin
    )
    for attr in weight_attrs:
        if hasattr(base_layer, attr):
            return getattr(base_layer, attr).device
    raise ValueError(f"Unsupported base layer: {base_layer}")


def _not_fully_sharded_can_replace(can_replace):
    """
    decorator which adds the condition of not using fully sharded loras
    intended to wrap can_replace_layer()

    The wrapped function must receive `lora_config` as a keyword argument.
    An optional `decorate` keyword (default True) is consumed here and never
    forwarded; passing `decorate=False` disables the extra condition.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(can_replace)
    def dec(*args, **kwargs):
        decorate = kwargs.pop("decorate", True)
        condition = not kwargs["lora_config"].fully_sharded_loras if decorate else True
        return can_replace(*args, **kwargs) and condition

    return dec


def _fully_sharded_can_replace(can_replace):
    """
    decorator which adds the condition of fully sharded loras
    intended to wrap can_replace_layer()

    The wrapped function must receive `lora_config` as a keyword argument.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(can_replace)
    def dec(*args, **kwargs):
        return (
            can_replace(*args, **kwargs) and kwargs["lora_config"].fully_sharded_loras
        )

    return dec
PretrainedConfig + +from vllm.config.lora import LoRAConfig +from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding +from vllm.platforms import current_platform + +from .base import BaseLayerWithLoRA + + +class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA): + def __init__(self, base_layer: VocabParallelEmbedding) -> None: + super().__init__() + self.base_layer = base_layer + self.embeddings_slice: tuple[int, int] | None + self.embeddings_weights: torch.Tensor | None + + def create_lora_weights( + self, + max_loras: int, + lora_config: LoRAConfig, + model_config: PretrainedConfig | None = None, + ) -> None: + if self.base_layer.num_added_embeddings_per_partition > 0: + # We can start adding lora weights + self.embeddings_weights = self.base_layer.weight.data[ + self.base_layer.num_org_embeddings_per_partition : self.base_layer.num_org_embeddings_per_partition # noqa: E501 + + self.base_layer.num_added_embeddings_per_partition + ] + self.embeddings_slice = ( + self.base_layer.shard_indices.added_vocab_start_index + - self.base_layer.org_vocab_size, + self.base_layer.shard_indices.added_vocab_end_index + - self.base_layer.org_vocab_size, + ) + self.base_layer.weight.data[ + self.base_layer.num_org_embeddings_per_partition : + ].fill_(0) + else: + self.embeddings_slice = None + self.embeddings_weights = None + + self.embeddings_tensors = torch.zeros( + ( + max_loras, + lora_config.lora_extra_vocab_size, + self.base_layer.embedding_dim, + ), + dtype=self.base_layer.weight.dtype, + device=self.base_layer.weight.device, + ) + self.lora_a_stacked = torch.zeros( + ( + max_loras, + self.base_layer.org_vocab_size + lora_config.lora_extra_vocab_size, + lora_config.max_lora_rank, + ), + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ) + self.lora_b_stacked = torch.zeros( + ( + max_loras, + 1, + self.base_layer.embedding_dim, + lora_config.max_lora_rank, + ), + dtype=lora_config.lora_dtype, + 
device=self.base_layer.weight.device, + ) + self.lora_a_stacked_2d = self.lora_a_stacked.view( + self.lora_a_stacked.shape[0] * self.lora_a_stacked.shape[1], + self.lora_a_stacked.shape[2], + ) + + def reset_lora(self, index: int): + self.lora_a_stacked[index] = 0 + self.lora_b_stacked[index] = 0 + self.embeddings_tensors[index] = 0 + + def set_lora( + self, + index: int, + lora_a: torch.Tensor, + lora_b: torch.Tensor, + embeddings_tensor: torch.Tensor | None, + ): + self.reset_lora(index) + # NOTE self.lora_a_stacked is row-major, and lora_a is col-major, + # so we need transpose here + self.lora_a_stacked[index, : lora_a.shape[1], : lora_a.shape[0]].copy_( + lora_a.T, non_blocking=True + ) + self.lora_b_stacked[index, 0, : lora_b.shape[0], : lora_b.shape[1]].copy_( + lora_b, non_blocking=True + ) + if embeddings_tensor is not None: + self.embeddings_tensors[ + index, + : embeddings_tensor.shape[0], + : embeddings_tensor.shape[1], + ].copy_(embeddings_tensor, non_blocking=True) + if self.embeddings_slice is not None: + # TODO(yard1): Optimize this copy, we don't need to copy + # everything, just the modified part + embeddings = self.embeddings_tensors.view( + self.embeddings_tensors.shape[0] * self.embeddings_tensors.shape[1], + self.embeddings_tensors.shape[2], + )[self.embeddings_slice[0] : self.embeddings_slice[1]] + assert self.embeddings_weights is not None + self.embeddings_weights[: embeddings.shape[0]].copy_(embeddings) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + added_tokens_mask = torch.where(x > self.base_layer.org_vocab_size - 1, 1, 0) + + # NB: Don't use torch.narrow here. 
class LoRALayerWeights:
    """LoRA weights for a layer composed of two low rank matrixes."""

    def __init__(
        self,
        module_name: str,
        rank: int,
        lora_alpha: int,
        lora_a: torch.Tensor,
        lora_b: torch.Tensor,
        embeddings_tensor: torch.Tensor | None = None,
        scaling: float | None = None,
    ) -> None:
        """Store the two LoRA matrices and derive the scaling factor.

        Args:
            module_name: Name of the module these weights belong to.
            rank: LoRA rank.
            lora_alpha: LoRA alpha; only used to derive `scaling`.
            lora_a: The A matrix (`from_config` passes None as a placeholder).
            lora_b: The B matrix (`from_config` passes None as a placeholder).
            embeddings_tensor: Optional extra-vocabulary embeddings.
            scaling: Explicit scaling factor. When None it is computed as
                `lora_alpha / rank`.
        """
        self.module_name = module_name
        self.rank = rank
        self.lora_alpha = lora_alpha
        self.lora_a = lora_a
        self.lora_b = lora_b
        self.embeddings_tensor = embeddings_tensor

        if scaling is None:
            self.scaling = self.lora_alpha / self.rank
        else:
            self.scaling = scaling

    def optimize(self) -> "LoRALayerWeights":
        """Optimize the LoRA by merging the scaling into lora_b.

        `lora_b` is multiplied in place and `scaling` is reset to 1, so
        calling this again is a no-op. Returns `self`.
        """
        if self.scaling == 1:
            return self
        self.lora_b *= self.scaling
        self.scaling = 1
        return self

    @property
    def input_dim(self) -> int:
        """Size of dim 1 of `lora_a`."""
        return self.lora_a.shape[1]

    @property
    def output_dim(self) -> int:
        """Size of dim 0 of `lora_b`."""
        return self.lora_b.shape[0]

    @property
    def is_packed(self) -> bool:
        """Always False for a plain (non-packed) LoRA."""
        return False

    @property
    def extra_vocab_size(self) -> int:
        """Number of rows in `embeddings_tensor`, or 0 when there is none."""
        return (
            self.embeddings_tensor.shape[0] if self.embeddings_tensor is not None else 0
        )

    @classmethod
    def from_config(
        cls,
        module_name: str,
        peft_helper: PEFTHelper,
        embeddings_tensor: torch.Tensor | None = None,
    ) -> "LoRALayerWeights":
        """Build a weights object from a PEFT config, without tensors yet.

        Rank, alpha and scaling come from `peft_helper`; the caller is
        expected to assign `lora_a` / `lora_b` afterwards.
        """
        # lora_a and lora_b are set to None for config-based construction
        return cls(
            module_name,
            peft_helper.r,
            peft_helper.lora_alpha,
            None,
            None,
            embeddings_tensor,
            peft_helper.vllm_lora_scaling_factor,
        )

    @classmethod
    def create_dummy_lora_weights(
        cls,
        module_name: str,
        input_dim: int,
        output_dim: int,
        rank: int,
        dtype: torch.dtype,
        device: torch.types.Device,
        embeddings_tensor_dim: int | None = None,
    ) -> "LoRALayerWeights":
        """Create all-zero LoRA weights of the requested shape.

        `lora_a` is `[rank, input_dim]` and `lora_b` is `[output_dim, rank]`.
        When `embeddings_tensor_dim` is truthy, a random
        `[10, embeddings_tensor_dim]` embeddings tensor is attached as well.
        Memory is pinned only for CPU tensors when pinning is available.
        """
        pin_memory = str(device) == "cpu" and is_pin_memory_available()
        lora_a = torch.zeros(
            [rank, input_dim], dtype=dtype, device=device, pin_memory=pin_memory
        )
        lora_b = torch.zeros(
            [output_dim, rank], dtype=dtype, device=device, pin_memory=pin_memory
        )

        embeddings_tensor = (
            torch.rand(
                10,
                embeddings_tensor_dim,
                dtype=dtype,
                device=device,
                pin_memory=pin_memory,
            )
            if embeddings_tensor_dim
            else None
        )
        return cls(
            module_name,
            rank=rank,
            lora_alpha=1,
            lora_a=lora_a,
            lora_b=lora_b,
            embeddings_tensor=embeddings_tensor,
        )


class PackedLoRALayerWeights(LoRALayerWeights):
    """LoRA used for packed layers (eg. qkv_proj)."""

    def __init__(
        self,
        module_name: str,
        rank: int,
        lora_alphas: list[int | None],
        lora_a: list[torch.Tensor | None],
        lora_b: list[torch.Tensor | None],
        scaling: list[float] | None = None,
    ) -> None:
        """Store per-sub-module LoRA matrices.

        Args:
            module_name: Name of the packed module.
            rank: LoRA rank shared by all sub-LoRAs.
            lora_alphas: One alpha per sub-module; None where the sub-module
                has no LoRA.
            lora_a: One A matrix per sub-module (None where absent).
            lora_b: One B matrix per sub-module (None where absent).
            scaling: One scaling factor per sub-module. When None, each one
                is derived as `alpha / rank`, or None for a None alpha.
        """
        super().__init__(
            module_name=module_name,
            rank=rank,
            lora_alpha=0,
            lora_a=lora_a,
            lora_b=lora_b,
            scaling=scaling,  # type: ignore
            embeddings_tensor=None,
        )
        self.lora_alphas = lora_alphas
        if scaling is None:
            # A None alpha marks a sub-module without a LoRA; it has nothing
            # to scale, so keep None instead of failing on `None / rank`.
            self.scaling = [  # type: ignore
                lora_alpha / self.rank if lora_alpha is not None else None
                for lora_alpha in self.lora_alphas
            ]

    @classmethod
    def pack(
        cls, loras: GenericSequence[Optional["LoRALayerWeights"]]
    ) -> "PackedLoRALayerWeights":
        """Pack a list of LoRAs into a single LoRA.

        If LoRA is None, it signifies that the submodule does not have a LoRA.

        Every non-None LoRA is optimized first (scaling merged into its
        `lora_b`), so the packed object uses a scaling of 1 for those
        entries. Rank and module name are taken from the first non-None LoRA.

        Raises:
            ValueError: If `loras` contains no non-None entry.
        """
        present = [lora for lora in loras if lora is not None]
        if not present:
            # Previously this surfaced as a bare StopIteration from next().
            raise ValueError(
                "Cannot pack LoRAs: at least one sub-LoRA must be non-None."
            )
        for lora in present:
            lora.optimize()
        first_lora = present[0]
        return cls(
            first_lora.module_name,
            first_lora.rank,
            [lora.lora_alpha if lora is not None else None for lora in loras],
            [lora.lora_a if lora is not None else None for lora in loras],
            [lora.lora_b if lora is not None else None for lora in loras],
            scaling=[
                1 if lora is not None else None  # type: ignore
                for lora in loras
            ],
        )

    def optimize(self) -> "PackedLoRALayerWeights":
        """Optimize the LoRA by merging the scaling into lora_b.

        Entries with no `lora_b`, no scaling, or a scaling of 1 are left
        untouched. Returns `self`.
        """
        for i, lora_b in enumerate(self.lora_b):
            scaling = self.scaling[i]  # type: ignore
            if lora_b is None or scaling is None or scaling == 1:
                continue
            self.lora_b[i] *= scaling  # type: ignore
            self.scaling[i] = 1  # type: ignore
        return self

    @property
    def input_dim(self) -> int:
        """Not defined for packed LoRAs."""
        raise NotImplementedError()

    @property
    def output_dim(self) -> int:
        """Not defined for packed LoRAs."""
        raise NotImplementedError()

    @property
    def is_packed(self) -> bool:
        """Always True for a packed LoRA."""
        return True
class AdapterLRUCache(LRUCache[int, T]):
    """LRU cache keyed by adapter id that calls a hook when an entry is removed."""

    def __init__(self, capacity: int, deactivate_fn: Callable[[int], object]):
        """
        Args:
            capacity: Forwarded unchanged to `LRUCache.__init__`.
            deactivate_fn: Called with the adapter id from `_on_remove`; its
                return value is ignored.
        """
        super().__init__(capacity)
        self.deactivate_fn = deactivate_fn

    def _on_remove(self, key: int, value: T | None):
        # Presumably invoked by LRUCache whenever an entry is dropped;
        # confirm against vllm.utils.cache.LRUCache.
        logger.debug("Removing adapter int id: %d", key)
        # Deactivate the adapter before delegating to the base-class removal.
        self.deactivate_fn(key)
        return super()._on_remove(key, value)


# Last id handed out by get_lora_id(); 0 means none has been issued yet.
_GLOBAL_LORA_ID = 0


def get_lora_id() -> int:
    """Return the next LoRA id from the module-level counter.

    The counter is incremented before it is returned, so the first id is 1
    and ids are always greater than 0.
    """
    global _GLOBAL_LORA_ID
    _GLOBAL_LORA_ID += 1
    return _GLOBAL_LORA_ID
+ + Will share the underlying tensors.""" + return self.__class__( + lora_model_id, + rank=self.rank, + loras=self.loras.copy(), + ) + + @property + def extra_vocab_size(self) -> int: + return ( + max(lora.extra_vocab_size for lora in self.loras.values()) + if self.loras + else 0 + ) + + def get_lora(self, module_name: str) -> LoRALayerWeights | None: + """Get LoRA for a given module by name""" + return self.loras.get(module_name, None) + + def check_lora_name(self, lora_name: str) -> bool: + return lora_name in self.loras + + # (yard1): TODO see if we can derive target_embedding_padding automatically + @classmethod + def from_lora_tensors( + cls, + lora_model_id: int, + tensors: dict[str, torch.Tensor], + peft_helper: PEFTHelper, + device: str = "cuda", + dtype: torch.dtype | None = None, + embeddings: dict[str, torch.Tensor] | None = None, + target_embedding_padding: int | None = None, + embedding_modules: dict[str, str] | None = None, + embedding_padding_modules: list[str] | None = None, + weights_mapper: WeightsMapper | None = None, + ) -> "LoRAModel": + """Create a LoRAModel from a dictionary of tensors.""" + pin_memory = str(device) == "cpu" and is_pin_memory_available() + loras: dict[str, LoRALayerWeights] = {} + for tensor_name, tensor in tensors.items(): + module_name, is_lora_a = parse_fine_tuned_lora_name( + tensor_name, weights_mapper + ) + if module_name not in loras: + lora_embeddings_tensor = None + if embeddings: + assert embedding_modules is not None + embeddings_module = next( + (k for k in embedding_modules if k in module_name), None + ) + if embeddings_module: + lora_embeddings_tensor = embeddings[ + embedding_modules[embeddings_module] + ].to(device=device, dtype=dtype) + if pin_memory: + lora_embeddings_tensor = lora_embeddings_tensor.pin_memory() + loras[module_name] = LoRALayerWeights.from_config( + module_name, peft_helper, lora_embeddings_tensor + ) + + if is_lora_a: + loras[module_name].lora_a = tensor.to(device=device, dtype=dtype) + if 
pin_memory: + loras[module_name].lora_a = loras[module_name].lora_a.pin_memory() + else: + loras[module_name].lora_b = tensor.to(device=device, dtype=dtype) + assert embedding_padding_modules is not None + if ( + any(name in module_name for name in embedding_padding_modules) + and target_embedding_padding is not None + ): + lora_b = loras[module_name].lora_b + assert target_embedding_padding >= lora_b.shape[0] + addition = target_embedding_padding - lora_b.shape[0] + loras[module_name].lora_b = torch.nn.functional.pad( + lora_b, (0, 0, 0, addition) + ) + if pin_memory: + loras[module_name].lora_b = loras[module_name].lora_b.pin_memory() + + for lora in loras.values(): + lora.optimize() + + return cls(lora_model_id, peft_helper.r, loras) + + @classmethod + def from_local_checkpoint( + cls, + lora_dir: str, + expected_lora_modules: list[str], + peft_helper: PEFTHelper, + *, + lora_model_id: int | None = None, + device: str = "cuda", + dtype: torch.dtype | None = None, + target_embedding_padding: int | None = None, + embedding_modules: dict[str, str] | None = None, + embedding_padding_modules: list[str] | None = None, + weights_mapper: WeightsMapper | None = None, + tensorizer_config_dict: dict | None = None, + ) -> "LoRAModel": + """Create a LoRAModel from a local checkpoint. + + Args: + lora_dir: The local path that has lora data. + expected_lora_modules: Name of modules that are expected to be + replaced by lora. + peft_helper: Loaded lora configuration information. + lora_model_id: LoRA model id. If not given, automatically set by + a global counter. + device: Device where the lora model is loaded. + dtype: dtype of the lora model weights. + + Returns: + Loaded LoRA Model. 
+ """ + lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors") + lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin") + lora_pt_file_path = os.path.join(lora_dir, "adapter_model.pt") + new_embeddings_tensor_path = os.path.join( + lora_dir, "new_embeddings.safetensors" + ) + new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin") + tensors: dict[str, torch.Tensor] = {} + unexpected_modules: list[list[str] | str] = [] + + def check_unexpected_modules(modules: dict): + for lora_module in modules.keys(): # noqa + module_name, _ = parse_fine_tuned_lora_name(lora_module, weights_mapper) + # Handle FSDP file format where experts.base_layer is the + # gate_up_proj and experts is the down_proj + if "base_layer" in lora_module: + continue + # Case for expert lora weights + if ".experts" in module_name: + if not any( + module_name.endswith(ele) for ele in expected_lora_modules + ): + unexpected_modules.append(module_name) + elif module_name.split(".")[-1] not in expected_lora_modules: + unexpected_modules.append(module_name) + + if unexpected_modules: + raise ValueError( + f"While loading {lora_dir}, expected" + f" target modules in {expected_lora_modules}" + f" but received {unexpected_modules}." + f" Please verify that the loaded LoRA module is correct" + ) + + if tensorizer_config_dict: + from tensorizer import TensorDeserializer + + tensorizer_config = TensorizerConfig(**tensorizer_config_dict) + lora_tensor_path = os.path.join( + tensorizer_config.tensorizer_dir, "adapter_model.tensors" + ) + tensorizer_args = tensorizer_config._construct_tensorizer_args() + tensors = TensorDeserializer( + lora_tensor_path, + dtype=tensorizer_config.dtype, + **tensorizer_args.deserialization_kwargs, + ) + check_unexpected_modules(tensors) + + elif os.path.isfile(lora_tensor_path): + # Find unexpected modules. + # Use safetensor key as a source of truth to find expected modules. 
+ # in peft if you have target_modules A, B, C and C does not exist + # in the model it won’t error and model will be trained with A, B + # loraified. C won’t exist in the safetensor but it will exist in + # the target_modules of the adapter_config.json. + unexpected_modules = [] + with safetensors.safe_open(lora_tensor_path, framework="pt") as f: # type: ignore + # Load tensors if there are only expected modules. + check_unexpected_modules(f) + for module in f.keys(): # noqa + tensors[module] = f.get_tensor(module) + elif os.path.isfile(lora_bin_file_path) or os.path.isfile(lora_pt_file_path): + # When a bin/pt file is provided, we rely on config to find + # unexpected modules. + unexpected_modules = [] + target_modules = peft_helper.target_modules + if not isinstance(target_modules, list): + target_modules = [target_modules] + for module in target_modules: + # Compatible with more modules, + # such as:layers.11.self_attn.k_proj + part_name = module.split(".")[-1] + if part_name not in expected_lora_modules: + unexpected_modules.append(module) + # loaded lora's target modules must be a subset of + # expected_lora_modules. It is not reliable. See + # https://github.com/vllm-project/vllm/pull/5909. But there's no + # other better mechanism. + if unexpected_modules and not is_regex_target_modules( + peft_helper.target_modules, expected_lora_modules + ): + raise ValueError( + f"While loading {lora_dir}, expected" + f" target modules in {expected_lora_modules}" + f" but received {unexpected_modules}." 
+ f" Please verify that the loaded LoRA module is correct" + ) + lora_file_path = ( + lora_bin_file_path + if os.path.isfile(lora_bin_file_path) + else lora_pt_file_path + ) + tensors = torch.load(lora_file_path, map_location=device, weights_only=True) + else: + raise ValueError(f"{lora_dir} doesn't contain tensors") + + embeddings = None + if os.path.isfile(new_embeddings_tensor_path): + embeddings = safetensors.torch.load_file(new_embeddings_tensor_path) + elif os.path.isfile(new_embeddings_bin_file_path): + embeddings = torch.load( + new_embeddings_bin_file_path, map_location=device, weights_only=True + ) + + return cls.from_lora_tensors( + lora_model_id=get_lora_id() if lora_model_id is None else lora_model_id, + tensors=tensors, + peft_helper=peft_helper, + device=device, + dtype=dtype, + embeddings=embeddings, + target_embedding_padding=target_embedding_padding, + embedding_modules=embedding_modules, + embedding_padding_modules=embedding_padding_modules, + weights_mapper=weights_mapper, + ) + + +class LoRAModelManager: + """A manager that manages multiple LoRA-fine-tuned models.""" + + def __init__( + self, + model: SupportsLoRA, + max_num_seqs: int, + max_num_batched_tokens: int, + vocab_size: int, + lora_config: LoRAConfig, + device: torch.device, + ): + """Create a LoRAModelManager and adapter for a given model. + + Args: + model: the model to be adapted. + max_num_seqs: the maximum number of sequences model can run in a + single batch. + max_num_batched_tokens: the maximum number of tokens model can run + in a single batch. + vocab_size: the vocab size of the model. + lora_config: the LoRA configuration. + """ + self.model: SupportsLoRA = model + self._registered_adapters: dict[int, LoRAModel] = {} + # Dict instead of a set for compatibility with LRUCache. 
+ self._active_adapters: dict[int, None] = {} + self.adapter_type = "LoRA" + self.lora_config = lora_config + self.device = device + self.max_num_seqs = max_num_seqs + assert self.capacity >= self.lora_slots + self.max_num_batched_tokens = math.ceil(max_num_batched_tokens / 8) * 8 + self.lora_index_to_id: list[int | None] = [None] * self.lora_slots + self.vocab_size = vocab_size + self.punica_wrapper = get_punica_wrapper( + max_num_batched_tokens, + max_batches=self.max_num_seqs, + device=self.device, + max_loras=self.lora_config.max_loras, + ) + + self.supported_lora_modules = get_supported_lora_modules(self.model) + assert self.supported_lora_modules, "No supported LoRA modules found in" + f" {self.model.__class__.__name__}." + + self.packed_modules_mapping = process_packed_modules_mapping(self.model) + # Used to indicate whether the model is a multimodal model + self.supports_mm: bool = ( + supports_multimodal(self.model) + # In case the model only supports LoRA for + # text modules (e.g. ChatGLM) + and hasattr(self.model, "get_mm_mapping") + ) + self.is_pooling_model = is_pooling_model(self.model) + self.packed_modules: dict[str, list[str]] = {} + self.modules: dict[str, BaseLayerWithLoRA] = {} + # Dict instead of a set for compatibility with LRUCache. 
+ self._last_mapping: LoRAMapping | None = None + self._create_lora_modules() + self.model.lora_manager = self + + def __len__(self) -> int: + return len(self._registered_adapters) + + @property + def capacity(self) -> int: + return self.lora_config.max_cpu_loras + + @property + def lora_slots(self) -> int: + return self.lora_config.max_loras + + @property + def adapter_slots(self) -> int: + return self.lora_slots + + def activate_adapter( + self, + lora_id: int, + ) -> bool: + """Move LoRA into a GPU buffer to be used in the forward pass.""" + if lora_id in self._active_adapters: + return False + first_free_slot = next( + ( + (i, lora_id) + for i, lora_id in enumerate(self.lora_index_to_id) + if lora_id is None + ), + None, + ) + if first_free_slot is None: + raise ValueError("No free lora slots") + index, _ = first_free_slot + self._active_adapters[lora_id] = None + lora_model = self._registered_adapters[lora_id] + logger.debug( + "Activating LoRA. int id: %d, slot index: %d", lora_model.id, index + ) + self.lora_index_to_id[index] = lora_model.id + for module_name, module in self.modules.items(): + module_lora = self._get_lora_layer_weights(lora_model, module_name) + if module_lora: + # Note (gnovack) - If MOE lora weights are not split into + # num_experts chunks, we split them here + if isinstance(module, FusedMoEWithLoRA) and torch.is_tensor( + module_lora.lora_a + ): + # Handle FSDP file format where experts.base_layer is the + # gate_up_proj and experts is the down_proj + gate_up_proj_lora = self._get_lora_layer_weights( + lora_model, module_name + ".base_layer" + ) + + assert gate_up_proj_lora is not None + assert module_lora is not None + + down_proj_lora = module_lora + num_experts = module_lora.lora_a.shape[0] // module_lora.rank + + gate_proj_a = gate_up_proj_lora.lora_a.chunk(num_experts, dim=0) + up_proj_a = gate_up_proj_lora.lora_a.chunk(num_experts, dim=0) + + gate_proj_b = gate_up_proj_lora.lora_b[::2, ...].chunk( + num_experts, dim=-1 + ) + 
up_proj_b = gate_up_proj_lora.lora_b[1::2, ...].chunk( + num_experts, dim=-1 + ) + + down_proj_a = down_proj_lora.lora_a.chunk(num_experts, dim=0) + down_proj_b = down_proj_lora.lora_b.chunk(num_experts, dim=-1) + + lora_a = [] + lora_b = [] + for i in range(num_experts): + lora_a.append(gate_proj_a[i]) + lora_a.append(down_proj_a[i]) + lora_a.append(up_proj_a[i]) + + lora_b.append(gate_proj_b[i]) + lora_b.append(down_proj_b[i]) + lora_b.append(up_proj_b[i]) + + module_lora.lora_a = lora_a + module_lora.lora_b = lora_b + + module.set_lora( + index, + module_lora.lora_a, + module_lora.lora_b, + module_lora.embeddings_tensor, + ) + else: + module.reset_lora(index) + return True + + def _deactivate_adapter(self, lora_id: int): + try: + index = self.lora_index_to_id.index(lora_id) + self.lora_index_to_id[index] = None + except ValueError: + pass + + def _add_adapter(self, lora: LoRAModel): + self._create_merged_loras_inplace(lora) + self._registered_adapters[lora.id] = lora + + def pin_adapter(self, lora_id: int) -> bool: + """Pin a LoRAModel in the manager cache.""" + raise NotImplementedError( + "Pinning is not supported in LoRAModelManager. " + "Use LRUCacheLoRAModelManager for pinning" + ) # type: ignore + + def _set_adapter_mapping(self, mapping: LoRAMapping) -> None: + # update lora states + self.punica_wrapper.update_metadata( + mapping, + self.lora_index_to_id, + self.lora_slots + 1, + self.vocab_size, + self.lora_config.lora_extra_vocab_size, + ) + + def remove_all_adapters(self): + """Remove all LoRAModels from the manager.""" + self._registered_adapters.clear() + self.lora_index_to_id = [None] * self.lora_slots + self._active_adapters.clear() + + def _create_lora_modules(self): + def _parent_module(module_name: str) -> str: + # module name is a dot separated name. 
+ # for example: + # - given an input 'x.y.z' return 'x.y' + # - given an input 'x' return '' + return module_name.rpartition(".")[0] + + for module_name, module in self.model.named_modules(remove_duplicate=False): + if isinstance(module, PPMissingLayer): + continue + + if not self._match_target_modules(module_name): + continue + # A temporary approach for multimodal models to support LoRA + # TODO: Remove this restriction + if self._filter_unsupported_mm_module(module_name): + logger.warning( + "Regarding multimodal models, vLLM currently only supports " + "adding LoRA to language model, %s will be ignored.", + module_name, + ) + continue + parts = module_name.split(".")[-1] + packed_moduled_lst = self.packed_modules_mapping.get(parts, []) + new_module = replace_submodule( + self.model, + module_name, + from_layer( + module, + self.lora_slots, + self.lora_config, + packed_moduled_lst, + self.model.config, + ), + ) + + # (yard1): TODO make this more robust + if "lm_head" in module_name: + logits_processor_module_name = "logits_processor" + parent_module = _parent_module(module_name) + if parent_module: + logits_processor_module_name = ( + f"{parent_module}.{logits_processor_module_name}" + ) + + logits_processor_module = self.model.get_submodule( + logits_processor_module_name + ) + + new_module = replace_submodule( + self.model, + logits_processor_module_name, + from_layer_logits_processor( + logits_processor_module, + module, + self.lora_slots, + self.lora_config, + self.model.config, + ), + ) + + # In some models, especially multimodal ones, layers with the same + # name may have different types, such as nn.Linear and + # ReplicatedLinear. The nn.Linear layers cannot be replaced with + # LoRA layers, leading to assertion error. 
The following check + # aims to prevent this error + if self.supports_mm and not isinstance(new_module, BaseLayerWithLoRA): + continue + self.register_module(module_name, new_module) + self._register_packed_modules(module_name) + # All lora layers share the same punica_wrapper based on reference. + new_module.set_mapping(self.punica_wrapper) + + def register_module(self, module_name: str, module: "BaseLayerWithLoRA"): + assert isinstance(module, BaseLayerWithLoRA), ( + f"Module {module_name} must be a BaseLayerWithLoRA instance," + ) + f" got {type(module)}" + self.modules[module_name] = module + + def create_dummy_lora( + self, + lora_id: int, + rank: int, + embedding_modules: dict[str, str] | None = None, + ) -> LoRAModel: + """Create zero-initialized LoRAModel for warmup.""" + model = LoRAModel(lora_id, rank, {}) + for module_name, module in self.model.named_modules(): + if ( + not self._match_target_modules(module_name) + or not isinstance(module, BaseLayerWithLoRA) + or self._filter_unsupported_mm_module(module_name) + ): + continue + parts = module_name.split(".") + if module_name not in self.packed_modules: + assert embedding_modules is not None + if parts[-1] in embedding_modules: + input_dim = ( + module.base_layer.org_vocab_size + + self.lora_config.lora_extra_vocab_size + if hasattr(module.base_layer, "org_vocab_size") + else module.base_layer.weight.shape[1] + ) + output_dim = ( + module.base_layer.embedding_dim + if hasattr(module.base_layer, "embedding_dim") + else module.base_layer.weight.shape[0] + ) + embeddings_tensor_dim = ( + module.base_layer.embedding_dim + if hasattr(module.base_layer, "embedding_dim") + else module.base_layer.weight.shape[1] + ) + lora = LoRALayerWeights.create_dummy_lora_weights( + module_name, + input_dim, + output_dim, + rank, + module.lora_a_stacked[0].dtype, + "cpu", + embeddings_tensor_dim=embeddings_tensor_dim, + ) + else: + lora = LoRALayerWeights.create_dummy_lora_weights( + module_name, + 
module.lora_a_stacked[0].shape[-1], + module.lora_b_stacked[0].shape[-2], + rank, + module.lora_a_stacked[0].dtype, + "cpu", + ) + else: + parts = module_name.split(".") + replacements = self.packed_modules_mapping[parts[-1]] + subloras: list[LoRALayerWeights | None] = [] + for i, r in enumerate(replacements): + lora = LoRALayerWeights.create_dummy_lora_weights( + module_name + "." + r, + module.lora_a_stacked[i].shape[-1], + module.lora_b_stacked[i].shape[-2], + rank, + module.lora_a_stacked[i].dtype, + "cpu", + ) + subloras.append(lora) + lora = PackedLoRALayerWeights.pack(subloras) + model.loras[module_name] = lora + return model + + def _match_target_modules(self, module_name: str): + return any( + re.match( + r".*\.{target_module}$".format(target_module=target_module), module_name + ) + or target_module == module_name + for target_module in self.supported_lora_modules + ) + + def _filter_unsupported_mm_module(self, module_name: str) -> bool: + """ + Regarding multimodal models, vLLM currently only supports adding LoRA to + language model. LoRA for other modules, such as the vision tower, will + be filtered out. + """ + if self.supports_mm: + module_mapping: MultiModelKeys = self.model.get_mm_mapping() + prefix_lst = module_mapping.connector + module_mapping.tower_model + return any([module_name.startswith(prefix) for prefix in prefix_lst]) + return False + + def _register_packed_modules(self, module_full_name: str) -> None: + parts = module_full_name.split(".") + module_name = parts[-1] + replacements = self.packed_modules_mapping.get(module_name, []) + # When replacements is less than or equal to 1, it indicates that this + # module is not a packed module. + if len(replacements) <= 1: + return + prefix = ".".join(parts[:-1]) + self.packed_modules[module_full_name] = [ + prefix + "." 
+ r if prefix else r for r in replacements + ] + + def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None: + for module_name, new_module_names in self.packed_modules.items(): + replacement_loras: list[LoRALayerWeights | None] = [] + replaced_module: set[str] = set() + has_replacement = False + for r in new_module_names: + lora = self._get_lora_layer_weights(lora_model, r) + replacement_loras.append(lora) + if lora: + has_replacement = True + replaced_module.add(r) + if not has_replacement: + continue + for i in range(len(replacement_loras)): + if replacement_loras[i]: + continue + replacement_loras[i] = None + # HACK Temporary solution for the pool model. + if self.is_pooling_model and not lora_model.check_lora_name(module_name): + replaced_module_name = module_name.replace("model.", "") + if lora_model.check_lora_name(module_name): + module_name = replaced_module_name + lora_model.loras[module_name] = PackedLoRALayerWeights.pack( + replacement_loras + ) + # Remove the modules that have been replaced. + for module in replaced_module: + lora_model.loras.pop(module, None) + + def _get_lora_layer_weights( + self, lora_model: LoRAModel, module_name: str + ) -> LoRALayerWeights | None: + org_module_name = module_name + if self.is_pooling_model and not lora_model.check_lora_name(module_name): + # If it's a pool model, and the layer name is not found, + # remove the prefix 'model.' and search again. + module_name = module_name.replace("model.", "") + if lora_model.check_lora_name(module_name): + org_module_name = module_name + logger.info_once( + "For the pool model, successfully loaded the LoRA weights " + "after removing the prefix 'model.'." 
+ ) + return lora_model.get_lora(org_module_name) + + def deactivate_adapter(self, adapter_id: int) -> bool: + if adapter_id not in self._active_adapters: + return False + self._deactivate_adapter(adapter_id) + self._active_adapters.pop(adapter_id, None) + return True + + def add_adapter(self, adapter: LoRAModel) -> bool: + logger.debug("Adding lora. Model id: %d, int id: %d", adapter.id, adapter.id) + if adapter.id in self._registered_adapters: + return False + if len(self._registered_adapters) >= self.capacity: + raise RuntimeError("No free adapter slots.") + self._add_adapter(adapter) + return True + + def set_adapter_mapping(self, mapping: LoRAMapping) -> None: + if self._last_mapping != mapping: + self._set_adapter_mapping(mapping) + self._last_mapping = mapping + + def remove_adapter(self, adapter_id: int) -> bool: + self.deactivate_adapter(adapter_id) + if adapter_id not in self._registered_adapters: + return False + self._registered_adapters.pop(adapter_id, None) + return True + + def list_adapters(self) -> dict[int, LoRAModel]: + return dict(self._registered_adapters) + + def get_adapter(self, adapter_id: int) -> LoRAModel | None: + return self._registered_adapters.get(adapter_id) + + +class LoRALRUCache(AdapterLRUCache[LoRAModel]): + def __init__(self, capacity: int, deactivate_lora_fn: Callable[[int], bool]): + super().__init__(capacity, deactivate_lora_fn) + + +class LRUCacheLoRAModelManager(LoRAModelManager): + """A model manager that manages multiple LoRAs with LRU cache.""" + + def __init__( + self, + model: nn.Module, + max_num_seqs: int, + max_num_batched_tokens: int, + vocab_size: int, + lora_config: LoRAConfig, + device: torch.device, + ): + super().__init__( + model, max_num_seqs, max_num_batched_tokens, vocab_size, lora_config, device + ) + self._registered_adapters: LoRALRUCache = LoRALRUCache( + self.capacity, self.deactivate_adapter + ) + self._active_adapters: LoRALRUCache = LoRALRUCache( + self.lora_slots, self._deactivate_adapter + ) + + 
def list_adapters(self) -> dict[int, LoRAModel]: + """List all registered LoRAModels.""" + return dict(self._registered_adapters.cache) + + def add_adapter(self, lora: LoRAModel) -> bool: + """Add a LoRAModel to the manager.""" + logger.debug("Adding lora. Model id: %d, int id: %d", lora.id, lora.id) + if lora.id not in self._registered_adapters: + self._add_adapter(lora) + was_added = True + else: + # We always touch to update the LRU cache order + self._registered_adapters.touch(lora.id) + was_added = False + return was_added + + def activate_adapter( + self, + lora_id: int, + ) -> bool: + if ( + lora_id not in self._active_adapters + and len(self._active_adapters) >= self.lora_slots + ): + self._active_adapters.remove_oldest() + result = super().activate_adapter(lora_id) + # We always touch to update the LRU cache order + self._active_adapters.touch(lora_id) + return result + + def remove_oldest_adapter(self) -> bool: + if len(self._registered_adapters) > 0: + self._registered_adapters.remove_oldest() + return True + return False + + def pin_adapter(self, lora_id: int) -> bool: + """Pin a LoRAModel in the manager cache.""" + self._pin_lora_in_cpu_cache(lora_id) + self._pin_lora_in_gpu_cache(lora_id) + return True + + def _pin_lora_in_cpu_cache(self, lora_id: int): + try: + self._registered_adapters.pin(lora_id) + except ValueError as err: + raise ValueError( + f"Pinning failed. LoRA {lora_id} is not registered." 
+ ) from err + + def _pin_lora_in_gpu_cache(self, lora_id: int): + if lora_id not in self._active_adapters: + # move lora to gpu if not already active + self.activate_adapter(lora_id) + + self._active_adapters.pin(lora_id) + + +def create_lora_manager( + model: nn.Module, + max_num_seqs: int, + max_num_batched_tokens: int, + vocab_size: int, + lora_config: LoRAConfig, + device: torch.device, + lora_manager_cls: type[LoRAModelManager] = LoRAModelManager, + **kwargs, +) -> LoRAModelManager: + """Create a LoRA adapter for a given model.""" + if not isinstance(model, SupportsLoRA): + raise ValueError(f"Model {type(model)} is not supported for LoRA.") + lora_manager = lora_manager_cls( + model=model, + max_num_seqs=max_num_seqs, + max_num_batched_tokens=max_num_batched_tokens, + vocab_size=vocab_size, + lora_config=lora_config, + device=device, + **kwargs, + ) + return lora_manager diff --git a/lora/ops/__init__.py b/lora/ops/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lora/ops/__pycache__/__init__.cpython-312.pyc b/lora/ops/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3e3a5470c06053066aa95facaab745e9250335b GIT binary patch literal 158 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJV+3J_%7U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?D`Ewj#t6j4AjU^#Mn=XWW*`dyihL!d literal 0 HcmV?d00001 diff --git a/lora/ops/ipex_ops/__init__.py b/lora/ops/ipex_ops/__init__.py new file mode 100644 index 0000000..f5a5e0e --- /dev/null +++ b/lora/ops/ipex_ops/__init__.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm.lora.ops.ipex_ops.lora_ops import bgmv_expand, bgmv_expand_slice, bgmv_shrink + +__all__ = ["bgmv_expand", "bgmv_expand_slice", "bgmv_shrink"] diff --git a/lora/ops/ipex_ops/__pycache__/__init__.cpython-312.pyc b/lora/ops/ipex_ops/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..ad2274aedaa88d470da65987e3abcc89c03d6fea GIT binary patch literal 315 zcmYjNJx{|h5Ix7K`lSwt9{_8H#$!POMn+apmQzZs)T*6Wj$0)oGe3Zxjo-q=nh_>8 zqz*_-oTXOk4NvdAyL%_y`)oD=Djh#7&H%oGI8?MJyK^EhP(Z;n7?v>;Ws%_-EtKTF zvMlbRgnx(eg!j{BX|^a^n^#4DawM?P>QV0d&C022eU}XSv#r)9)wRph+BT`Ovc*7x z6NuxK>*4?df_{jYYmN8@JCg8sE4;D7Dm(YNE2W>eo5khDdLe~YOJO^|sw)w0h>$2K xQ5Lz#iBK+<#84pN4Ks{y)60Smpo# literal 0 HcmV?d00001 diff --git a/lora/ops/ipex_ops/__pycache__/lora_ops.cpython-312.pyc b/lora/ops/ipex_ops/__pycache__/lora_ops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7b641d98e8b087b8fb07594af07c1b960c8ff939 GIT binary patch literal 2053 zcmc&#&x_+^82-NIN7MZ1@0oGjL5H1XF^8SvXXIg&b1v=YnmI~xgyooU4C^?WVj;Uq zYRZ7hw!p|pOd_Zm5RNhhc={^|>ZZKZsz#ikKJrJg%k7j%Pa^7S1N~VW;IfUN4q8 ziw4~5fNSO>4@iJx@-BoU63rL(a&tHbd*U3)*A)UG8{yOo<~A$9uVq0Xet=A;h_L98X?vME#%< zIKh>u#d|EeG8|mJ_R^iJtl>KCMsN`A`d;HM@H9}Ah986tq_z;GG6C2O4tNP`V<=<& z7a$yxM@1qPKDzn-&4=R3skrh`+&C3C#!B&C@k`^|#;af7cr4^3c>>px=__f^Uy0hX zZ7X|S|CiESgT_pe>u}-*h9Y*FY0QQ}%d=-1NVSy_pKc zaX*Un&;>hMezzO4C|0v9blzi~v(_oVrKu6y0RDX=j--)%Jmo_(E|92_+Ax#IN;@_4 znHkRswG3VI6%46lSUI)j`g$hXDlTC_y}}nEfUBzKTxFh$Cb(*>ERU6?v2tOo)W=E{ zeveCvB&X_`vuxki-QZ-i2PtZIih{y^pQCn37)GjUrZCz`aqBAk?S+g0JDiug}tLY0>Q literal 0 HcmV?d00001 diff --git a/lora/ops/ipex_ops/lora_ops.py b/lora/ops/ipex_ops/lora_ops.py new file mode 100644 index 0000000..0767f90 --- /dev/null +++ b/lora/ops/ipex_ops/lora_ops.py @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +from vllm.logger import init_logger + +logger = init_logger(__name__) + +try: + import intel_extension_for_pytorch as ipex +except ImportError as e: + raise e + + +def bgmv_shrink( + inputs: torch.Tensor, + lora_a_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + scaling: float = 1.0, +) -> None: + ipex.llm.functional.bgmv_shrink( + 
inputs, lora_a_weights, output_tensor, lora_indices_tensor, scaling + ) + + +def bgmv_expand( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + add_inputs: bool = True, +) -> None: + ipex.llm.functional.bgmv_expand( + inputs, lora_b_weights, output_tensor, lora_indices_tensor, add_inputs + ) + + +def bgmv_expand_slice( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + slice_offset: int, + slice_size: int, + add_inputs: bool = True, +) -> None: + ipex.llm.functional.bgmv_expand_slice( + inputs, + lora_b_weights, + output_tensor, + lora_indices_tensor, + slice_offset, + slice_size, + add_inputs, + ) diff --git a/lora/ops/torch_ops/__init__.py b/lora/ops/torch_ops/__init__.py new file mode 100644 index 0000000..89865af --- /dev/null +++ b/lora/ops/torch_ops/__init__.py @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm.lora.ops.torch_ops.lora_ops import ( + bgmv_expand, # noqa: F401 + bgmv_expand_slice, + bgmv_shrink, + sgmv_expand, + sgmv_expand_slice, + sgmv_shrink, +) + +__all__ = [ + "bgmv_expand", + "bgmv_expand_slice", + "bgmv_shrink", + "sgmv_expand", + "sgmv_expand_slice", + "sgmv_shrink", +] diff --git a/lora/ops/torch_ops/__pycache__/__init__.cpython-312.pyc b/lora/ops/torch_ops/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df03f6d0e3263eed171821bf5f443fed55abaac6 GIT binary patch literal 392 zcmYk2y-ve06oqZ)#}T1OFfg-5qVZUe5GxxG;KfRtD6wiMmg82<$jk$X@r8px1_ zGNO@;X$(Q_=FgDg0cF95Kbplho!4^Bs*Nhz(w>eC>jhs_`+47J-Y#w0kC#X0u>9Nl zTfWZ{d$ATm=AzR@-l;w}onAD|E8p@$2ksR*^oTrS5A2aRSj<=<1Y<8?>n!-bBStEs zxSEsP{#NF$@ b74+v0p8kK%IXZCO literal 0 HcmV?d00001 diff --git a/lora/ops/torch_ops/__pycache__/lora_ops.cpython-312.pyc b/lora/ops/torch_ops/__pycache__/lora_ops.cpython-312.pyc 
new file mode 100644 index 0000000000000000000000000000000000000000..c367ce996d9fe3a550ea251009f4dd858f80bee0 GIT binary patch literal 5264 zcmd^CO>7(25q?W@xhwujr1-0!N(wAUjDIMlZso?RUCWjY6siycg`(SmZ|p(xawjtzR@N(n?Pa&XaJe4`=vl2d2i^3MvHIB8)N z=#w~{dHZ(W%)B?>4FBeKI|vNP_WRUGJ0br-p*kWJ=0vg*vPA?UFcO(1ZA1tP7LhDj zstfDig0;oGt;IasV&2|jo@+7Bix!~`J`VUeMeDp@aDBqg+Q8P(f6dlbd*gl%!43Pd zE%)-Yu={n!rCqc`Tsl6n&2kWza7$Z%=oz+(3-0F-ybu@ufBw$+;NEmr?oL?ngUxNP zk$I5uVaOYMxJ3xO&j)9Cbz6AhDXZEiTGUnV==X`2(ml7tipr1DrS_dY_!kC z74f6Eq9$ZDE@hL32SrKDR2Ov8_fr`ml@yigBsZ5(lZzr~PbVHy1@Oj#x@b7m?0tAQ zvz%58M?w(drWHww6qQ`(;2ThZoJ<4RA}UjvZ;A>UtJ(G{XpwiQY;~aE_mBpNxP{9OMlqE6oKs20l3+V@O@nJ5J5u#2Rt&nXsY0%CN)yHZvV(J>kk1)ufT$2YfPYC0ForE=#Xqzj^Y{^9c2FD~65 z6Jt_pE|yzS7qgkz14&B9FtD*~PKi--LCcgVkTvomOv1g# zp$mHGLNPXRFnttyuW0lC)7HLW-}XFl7sqb>wd*U}?Gso`lC_wDP>X#SV!DMfv?5e5 z)y@FlFo$MohSHxidzH0$w&66`B@EuFQC%rm?hiu%eHy!jF+DW|>*u!W$~>$6I9rH{ z=Rn629@MEJfL@K=!}Erbw}Wp@>=o=7i#(^*SDv|6*n3|F7Wxm7ouzBOqX2Y#jLiqqQ6 zl~HbbmoZpDUCD{hFg>2(k;9Z#NTt7V&!x`w&t+5n=dZw*jJX-*4GUc2FtCOdkicM- z#Y9dtIOU^dQCt;e41!@7QyFDBZCKN(OtkHE^3jx|IR*$MiAfc54AVg|I1{lHgOyV0 zlxjGV*>pOaL6B1LKGl*>?gUBpRtcJ;^nmGWr0gNVGsQDwdf?J=;IbaLd~p7F;*LIX zrx>_=6nKAa`fJXye&yKNr#t(Oxc;4MhumN(GISh?>5*9BS}_tkiu`=tQ*w7czWeCz z_PyiY5xsY0{qB)_bnWKXp3cY8BWe5UlbeOf2#-*Bfh-CC>A^ z{BQFcGddUA9zEoGc6yHoF6sjp3p4t_*#79@z<8;{yUxC_6JKECgROg;_jX40z~D~) zc<60?=2R!IF9Ztqedm7HzOy)TT_3)&?ykRm$PdZ^h*G6UBx*&1Q<4cOm05UQ zxGj6Yil~*|6S)H|GLc$YlvA1eRip+Gc_pMqjFf#qPJ=VOAQfOWRmx*5DBD>xz@0gpgWZzyF`-HTR>Nn_3H}am`wjYWq}cw7BnWw76Gjn7CTQ z#A|lBtD4Q44gT0w2+m6l2w%zjmz$2tZ1Z=`td}o%+|t%WYJ2kU$q* z?4+WLboKqQ=MOz)Hx)g^<1M#Swu1!1WiMrY#Op72Qr1trffH&$`5X!NJa>ke$+|Hp z)Qmx)!5Av?)$Q7-=s;~$MfswKdAF_zg_1KXaHwJ~8F(@<`L&dsRRCJ&=s-k=e4=fU9?6va(zNQQK23084 z-9oAE6-srdP%?c&&U6Vm(<9{kq%&9!AQ!~53n3TAV52SN8w?&@yN`LN?j#B|!%%3j cjEZ$9sp!BRoRoFp7H-OVV2>A6F}>IS27w~){Qv*} literal 0 HcmV?d00001 diff --git a/lora/ops/torch_ops/lora_ops.py b/lora/ops/torch_ops/lora_ops.py new file mode 100644 index 0000000..4fc6248 --- /dev/null +++ 
b/lora/ops/torch_ops/lora_ops.py @@ -0,0 +1,128 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + + +def sgmv_expand( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, + seq_len_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batches: int, + max_seq_length: int, + token_nums: int, + add_inputs: bool = False, +): + exploded_indices = torch.repeat_interleave(lora_indices_tensor, seq_len_tensor) + + bgmv_expand(inputs, lora_b_weights, output_tensor, exploded_indices, add_inputs) + + +def bgmv_expand( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + add_inputs: bool = True, +): + selected_loras = lora_b_weights[lora_indices_tensor].to(dtype=output_tensor.dtype) + if len(selected_loras.shape) == 4: + selected_loras = selected_loras.squeeze(dim=1) + inputs = inputs.to(dtype=output_tensor.dtype) + outputs = torch.einsum("bi, boi -> bo", inputs, selected_loras) + + limit = output_tensor.shape[0] + if outputs.shape[0] == 1 and output_tensor.shape[0] != 1: + limit = 1 + + # LoRA adapter and model may add different amounts of padding to output + common_len = min(outputs.shape[1], output_tensor.shape[1]) + + if add_inputs: + output_tensor[:, :common_len] += outputs[:limit, :common_len] + else: + output_tensor[:, :common_len] = outputs[:limit, :common_len] + + +def sgmv_shrink( + inputs: torch.Tensor, + lora_a_weights: torch.Tensor, + output_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, + seq_len_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batches: int, + max_seq_length: int, + token_nums: int, + scaling: float, +): + exploded_indices = torch.repeat_interleave(lora_indices_tensor, seq_len_tensor) + + bgmv_shrink(inputs, lora_a_weights, output_tensor, exploded_indices, scaling) + + +def bgmv_shrink( + inputs: 
torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + scaling: float = 1.0, +): + selected_loras = lora_b_weights[lora_indices_tensor].to(dtype=output_tensor.dtype) + if len(selected_loras.shape) == 4: + selected_loras = selected_loras.squeeze(dim=1) + inputs = inputs.to(dtype=output_tensor.dtype) + outputs = torch.einsum("bi, boi -> bo", inputs, selected_loras) + + output_tensor[:, : outputs.shape[1]] = scaling * outputs[:] + + +def sgmv_expand_slice( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, + seq_len_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batches: int, + max_seq_length: int, + token_nums: int, + slice_offset: int, + slice_size: int, + add_inputs: bool = False, +): + exploded_indices = torch.repeat_interleave(lora_indices_tensor, seq_len_tensor) + + bgmv_expand_slice( + inputs, + lora_b_weights, + output_tensor, + exploded_indices, + slice_offset, + slice_size, + add_inputs, + ) + + +def bgmv_expand_slice( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + slice_offset: int, + slice_size: int, + add_inputs: bool = True, +): + selected_loras = lora_b_weights[lora_indices_tensor].to(dtype=output_tensor.dtype) + inputs = inputs.to(dtype=output_tensor.dtype) + if len(selected_loras.shape) == 4: + selected_loras = selected_loras.squeeze(dim=1) + outputs = torch.einsum("bi, boi -> bo", inputs, selected_loras) + + if add_inputs: + output_tensor[:, slice_offset : slice_offset + slice_size] += outputs[:] + else: + output_tensor[:, slice_offset : slice_offset + slice_size] = outputs[:] diff --git a/lora/ops/triton_ops/README_TUNING.md b/lora/ops/triton_ops/README_TUNING.md new file mode 100644 index 0000000..3ebe1fd --- /dev/null +++ b/lora/ops/triton_ops/README_TUNING.md @@ -0,0 +1,60 @@ +# Multi-LoRA Tuning + +**Note**: The LoRA 
configuration folder should be specified by exporting `VLLM_TUNED_CONFIG_FOLDER=/path/to/configs`. +Without this, the shrink/expand kernels will use default configurations. + +## Tuning Process + +Multi-LoRA shrink/expand Triton kernel tuning follows a methodology similar to +[Triton MoE tuning](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_moe.py). + +1. Define the search space. Here is an example of a search space: + + ```python + block_m_range = [16, 32, 64, 128, 256] + block_n_range = [32, 64, 128, 256] + block_k_range = [32, 64, 128, 256] + num_warps_range = [4, 8] + num_stage_range = [2, 3, 4, 5] + num_ctas_range = [1] + split_k_range = [4, 8, 16, 32, 64] + ``` + +2. Get all hidden_state sizes and num_slices that the target model uses for a specific TP size. + + For example, you can acquire the info by simply checking + [add_lora_linear](https://github.com/vllm-project/vllm/blob/main/vllm/lora/punica_wrapper/punica_gpu.py#L181): + + ```python + print(f"x_shape: {x.view(-1, x.shape[-1]).shape}") + print(f"num_slices: {len(output_slices)}") + for i in range(len(output_slices)): + print(f"a{i} shape: {lora_a_stacked[i].shape}") + print(f"b{i} shape: {lora_b_stacked[i].shape}") + print("y_shape", y.shape) + ``` + +3. Benchmark the shrink/expand kernel runtime with different kernel configurations generated from the pre-defined search space + by performing a grid search to find the optimal kernel configuration. + vLLM's [benchmark_lora.py](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_lora.py) + can be used to search for configurations for different shapes. 
+ +## Config Files + +### File Naming + +| Kernel Type | File Name Template | Example | +|---------------------------|--------------------------------------------|---------------------------------------------| +| shrink | `{gpu_name}_SHRINK.json` | `NVIDIA_H200_SHRINK.json` | +| expand | `{gpu_name}_EXPAND_{add_input}.json` | `NVIDIA_H200_EXPAND_TRUE.json` | +| fused_moe_lora_w13_shrink | `{gpu_name}_FUSED_MOE_LORA_W13_SHRINK.json` | `NVIDIA_H200_FUSED_MOE_LORA_W13_SHRINK.json` | +| fused_moe_lora_w13_expand | `{gpu_name}_FUSED_MOE_LORA_W13_EXPAND.json` | `NVIDIA_H200_FUSED_MOE_LORA_W13_EXPAND.json` | +| fused_moe_lora_w2_shrink | `{gpu_name}_FUSED_MOE_LORA_W2_SHRINK.json` | `NVIDIA_H200_FUSED_MOE_LORA_W2_SHRINK.json` | +| fused_moe_lora_w2_expand | `{gpu_name}_FUSED_MOE_LORA_W2_EXPAND.json` | `NVIDIA_H200_FUSED_MOE_LORA_W2_EXPAND.json` | + +The `gpu_name` can be automatically detected by calling `torch.cuda.get_device_name()`. + +### JSON Structure + +Optimal kernel configuration files are saved as JSON files with the structure `config_data[max_loras][num_slices][m][k][n][i]`, +where `i` is an optional dimension in the `fused_moe_lora` configuration, representing the intermediate size of the MoE layer. 
diff --git a/lora/ops/triton_ops/__init__.py b/lora/ops/triton_ops/__init__.py new file mode 100644 index 0000000..7e8b9a7 --- /dev/null +++ b/lora/ops/triton_ops/__init__.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +from vllm.lora.ops.triton_ops.fused_moe_lora_op import ( + fused_moe_lora, + fused_moe_lora_expand, + fused_moe_lora_shrink, +) +from vllm.lora.ops.triton_ops.lora_expand_op import lora_expand +from vllm.lora.ops.triton_ops.lora_kernel_metadata import LoRAKernelMeta +from vllm.lora.ops.triton_ops.lora_shrink_op import lora_shrink + +__all__ = [ + "lora_expand", + "lora_shrink", + "LoRAKernelMeta", + "fused_moe_lora", + "fused_moe_lora_shrink", + "fused_moe_lora_expand", +] diff --git a/lora/ops/triton_ops/__pycache__/__init__.cpython-312.pyc b/lora/ops/triton_ops/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6c6cd891fdc92f2d6d7faa3d7dc50461d17104cc GIT binary patch literal 620 zcmah^yH3L}6t(kgQ?-bJA7DX2YCIMs#6x1J#A9G!$YKS>v{CKEa$MAo%zOYl8{fho zSn9;Y22==%2|F!s0awb&z1F$so^#V|20#~eF0x|_fLk-Xm7c4b?&{wqNI?oIU@(Tv ziY;cxHZ*S*b(kBwRc}*|`LSR14z00zT!)|sQuhkR!PpD^J0lp{s&zUnNs3uPFfBx) zmcGuIjChi#Kc36J$nrr5Rl{t}S$b^k7e_mLMC61X5ShFa=6p|h`mCrK@D11oTmz1V z!?2-N&nRV)QH=^-Mp9(5$gyd^IV<>V+bmWRYUS6}PbRfgWkR+8W)G?mnCTR0B9qGQ ze-~yc%>|QC`4}gZVtfszdIT?BBpF9M<87_iBeFdjcGlMh9fD|fg7{eWiyRqO$oNG% zGx~^+FwXK!VjS_Ys%@|$9a6F>7PT4I-3i!wbo_>O J?u!LYdj)Xuxmf@J literal 0 HcmV?d00001 diff --git a/lora/ops/triton_ops/__pycache__/fused_moe_lora_op.cpython-312.pyc b/lora/ops/triton_ops/__pycache__/fused_moe_lora_op.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aa9f9e404c0f50b1144690df16e9461729e3b3c7 GIT binary patch literal 19306 zcmeHu3ve4pn%)eagC_xyAOVmB0lvZaOB5whFX};2FH5pyowZ)W5C)_`f*>^jtw#;k zSjQKOd(Mg6dkEUwi|MOPOs$n#?&gw}lTB8ey3{(!rNS{B6E{(+RCP`zo2pbnmNs?S 
zR3-WU9t;3NprpOMy1T0E6sPg^-`)Ro|NT$@fB!xFcP5jbf-9o>t?=HX6!qtrk+0Dl zc-&>6s9V%IiltaOLS4XLMTDMJTu{s^FDPluD>UFdJcLd?d?+%uvr=D3=j(V0BRDktv1WNfoPqM|HufphC*& z<^o+tQxt3XBl@oKBU+-crjTN~RW2?3vgR%56UUHhk#fwxBzoIYX{3TGehruiJuNzXjb5`qZ#R znP&_77Fxr4fmb8kO>j5E-2!(j+--2T!|j8+1MW_^yWsAIy9e&xP&M1f(v!n%|2t|~ zmKSPXNZJEi^l__M82tZ2+Cw1KZE(Mc)OWJOMN)VCN>cyJ>G}!GH=0lXg31) z4!C#1y$kM9xOc<72kyOa?}K|k++%PbfcqfavbK+~hl=FxBx}c3+Wc#|{xWhs{Eqs9 z>(^`XuecVEgA{9@sg=K8V_$rYJp$6K(^0Uxs@aKu3)QeS zP*VF;$?=ebw5cmD0o%7z_CX52)3UEwAW_qMiZVHP0qy>aw%=E#J^BYP+J3A|d+#;6 zi3&9~Q=E>bLQd#U%>_@$al!j4b&bA4T~)qHU7>v^e}+v@`V^uf9uYNhE*zhW`e@O~ zhPhBM?&m^N;aEJx`GX6w_}r|2ZvJQJmO>3QdC0CtEROjjUdabr*%(LOP@|t z?=`8YxIwNtZIn^rmsjz0oNQB&Hf=7H$mp)(BOEha2_>&AjE7gCUKR4s9^cE9PZ`jI z!1Nxt-q^o=gUZ2u(8HZ%;!x;)`Hw4M!wqcLC^jl~1exF-;0J+Xz0G2j}4>_RLs74n4f zQQX|Z)FsdHb7y-9qcfpNWHh&vK*Lq@44DLe=UA#F4`Mkh|h81KqS`NH_|-?kD3p$-BY32*`E1p~XizVn`f3#jo#yH3;+jy7mps|OPv4?pqN`QZHR>NJuh4hslc4Gh z8;A$|^KlMelb$}&P^{aik{-`5PWQ%V=X>YF^WE_PHx-I^UtQR~V`OG~s5cV6D39XJ zNMyDb8~4u5$8v*)S1Q&!xeyDn{@J;ZpRk^rhhd4j(Wy{0bakHFbG_CNEU^bYqmj8F zh+$7pz7U2*#axj5H|j5v)OyAE$6c$tmJAz~X2H_DVQKxi>wm>?A$;3aNBa~Us;8)C z7v>`&jzOXlo*`-i^Yfu7%c1T<=^Pil1j|Aw8k^%p%|bXDA0F~)IDB6u zPRg0kHBk>e^XDoVq+jf8jI%(!m3wNm5VDl))n40MZrX zY5jQez!%lc8`bSXb^B^%y1ILv>As;~*Y3g zxnZam4E3pC+R(VJYJ3b`p?n(A5?z?}zZr;xS$}+P1_U4~4xJEnF;H7J5O*pdL zg%d~?-}^B2#f$HBeu!cLe0&p7fN!FMAfP8YpTkARyYub4o>N!LH{|Pf^IpCipPuO9JNcgL_^e*O zm-iA{U%_zkU3}klSOs`*zHL9>N7_J7`F`v#N-i-_L_=Ier|vBdi|^Q zH!;drgQlTgjglVhE_!k~y;SkL$vAg|7FA~!b{YB#{ z6N~2Ee8Z$7o}bfcS03iUU5z|fiLp3ZTXKQ!_+&b`#@~REAAr^CfV_T@9>5BbI9Sw! 
zGLn63KVLeogG9F4yG&lXhn~Y@AHTOm;-yk1Jgl^{BLgx|qemFl>xalD;BvRNDDlElPQvs6-(*W%3MEG3Te zjjWQ`S7(v*JUQhn`gv>Ot+BU`-qp#vLW9>`)S|JdY7R;*Sp8i?VQ+tp@)=KZ@NnFf zssdNTF%E11^)saW?Ek{$qq$1JQ*e_Dym-}YAT}ea=U{`xK{4(XL@jJQ0#`#U_{b!$ zjD==>HV(ZcaB#V~DK0SU53`~wG8bS)BYJQ-YD`oG+3=g9ayA@=ee|4YoSy?v8FtO+ zWs`@Cdf3&>!j1_@;A3;J=?RHy;!ca|>mhC~CTb^vaeQz3xc>RM80<3OA99BwEYxEKks zdDl2FTYxipc(LF+UknwfMHQlXYOuf|80Ea!6I2TheOay0a1^llp;W))Po@Ofb<#~F5^0B7?JwC-&xE*WD8l_6co?cr?EA2m!+vv~rVOT=-HY9+)1T=Zk_R4CSZ^j4 z6D!`&Dw>l=N^)k)&8fwyrAsRgq1u-=cO=Ij*s5-|Ew`n5SH}g{VA?j6JeJX#mm1$r zqzo&UKXtS}WXumNwp$g;6)UGdv$Q2A9+)a_o?kq_bUI~Rxgt1v(x%?z;jD%-TbH&k zel>YG1K|8(6j(zu?XotdNF7gGe9$nLr&FV8OM7zSVNKIYSg7e;S5;*ys@Gi|LPbaN zNXF($y(ZXv$zu;HtCy~=>=J5v){N=O-O1w_V@+y&y>(cq8@|^i)E*X$4*Ec!7cN4!lK6T`M($t$lTeU>g#cp&J@d zrV`qsChMil?nliO<9uMRxpjW|{Ca)&nqH{idEd4E^4ZVq=WggT)!vn(-@Uxnyn01w z-Y!&czhMCJRJw2VEcdL~Kdk+rcEh_}@NQ4rMv})KnoC0b(Mi~3WYa;{wX0lhY~ zrWIO71m}og8o9UQ{*_PSpPqZ=lPkjbd12Ri!E_#8X;aO&=+sb3E0{bhi_B7Ws&;kwb7tTR$eR{TOOp_~(%hT)oH@*)lPlT- zU{k=@&W6}W5fuJQNen#HvA9Djo>VfL%5g`v{_? 
z)IR_xCGQ(>5quaJ28c%a!47e!A21i0Mb2fnL#*YK|CUB@kIQNhyqaZbX;+xSXwh3&AJtXXuhif?NZwFT#MQNiow z%|oK@hUtdkhT?_;KHX8i!zAkC$eCPxJ#yn|z7E0$rB>X&vnt-oJ2?fbiqgE6)w0aDHGJ)N=-;Ms%i+^W+lM>avX0|2%pLblOwfhg zbk2bLj7qp4uR`*BDo`&~L50diM;B=xqN%7dvU1@L>p5){(1v`Iwy8~CsTP&;R4po5 zCGsvC$s{F{a3pp|b+TxjARVA^xVPnYLLw?7p(t!SE(PFxKrO0H96GySdeiZ>=l(7x zzXM4OpYKV+l?&>RMgp@J*}$Ib?WN9F>=GO{%@mznq5On_;S#E(?rvgJ}?$ zh80?18Xp-nP7PTwrOhC|uv|YRhwo^fy?OAd$gu?R6-A8{?ibatz=a|{H}?+m`AtlS zB=FpVyMf6fCb(vD5Kf_rR!fMZNJ}R7TZlZE;G)UBhY7Bh+~33Gw=ubi$r2{G7IFpE zLy>d83yEmG(k~qn6_$0ABw;@s(oKe^xFc9ih5MBlS`0WKlzrU?JIGimRM|1Aa?#@(uGL^1J6?(fSdE$wKGTE12 zUwI{M>`ET`pDK+}lROAVJnGuikf8EpaNbNArar?|Wf%vzLK()LVRYnAPqLL6#*$&c zORC*q8evjb`qNBHX%1%HBQTy7Yno}wpborXu6~1QMvv=snrVYxY|1%gO-eJ3(31_u z3EGwNrkT2IzgMlv4p7y;4SSDZ?@8PHGWL!Qd!JzMOWOz5rUm=HOqD%T<^HqE=EcCO zFXL)>v>V&(gT031@qQYxWXLsO!{!&_IU{{BZ=_A~rqd5`fEm#S^DWi%vtc+if{kJj zjhlSs4V@WuTrzaU=Q*1V8E;FN`O0q?@@B9e7T3HnlUc%osQ@~z2_=`bzR2rPf{y1G z=4qy+Dr8~NHS$`%auSZzxvtb#OzuNl9N&1>=MJuyQiOD$I4q-Enmy(!00weMmJt5^W&}Zh#?Z)433!> ztX(bwkCArXyJ$cg$$MEXMlhuuMBtTI6<3K>ggCetwE<(iL1mD9Nbz1Pp5XiRl40it zpqP6fk~<0#jUz|b+$|uKH|Ku@CBLUEevMwtnE&+}a_%+`>sye3&H6*c9!!XxMu&u) z`lE>^YdrV&f%LR-29F;;>3G^WzXA(B3C@6Ve!!S-8Ws&pV<}bYb=c#lneIQ<{Lt`y z!@d3Y;`dJqdtXlXofVj~4;aIi_E@Vbw#UR2b1TpzpPraoQT1jxbcOpiP`DaQ_G5xt zLJTqY0V13h?f@jdM(#tz#2VvJxxYd~d$&$g%+vUPaZogP|t7ej9E}QFr z0FM&saI6u2WvdJZdy}n}4Y>RX#Zv(;i6!@sfrkTU?g1pffC$$?y4VP37<+PmhH)kL zKXkOpR{UXI|3)3gg))qO-RRvgwh6|zRa!81K-{8Uv#xSJVW|4X_mAB^meEy0=t{TV zv~8Uk&X}uiMiwKP3f#SudRgZf{?&Xiqb~_3p9z{X+L4fjRUU<9no}%ngzYQJHi6Htn#>5Tjxl;!K{nDFkiF z@x(2?v<>TQ~)FojPA%Xmw9o2YqK zX`XKne1`Z8F8O+{^1hk;w0u1cOZr>V&vM=y*?IDsvYs6&%k!+{wOiLh=@dV$oD_Mb zw9hTeh3rL%Rj109tI(d4rznWj%h!@dqbkdRlofH%fy~QEL|yVEy4>EmI51SuokX2H z#zZ}z3oC8?4DfH6M8g)nAiYg^K)bf=Bltz-J#sRFL?fJG>rkSJCh#oFaUs4^qB*XU zUyrgLc^&fM8RD@U<(h?Gz81bH$mwpPC7$FqE&l~t!z5%#-`9}Em z$qfNABX8o(BrIly;G63sm&BK*;&~quf@m$WKffg(2ELlfhwh}^!biCs5eesLb6#IR 
z@c-HWfdgR4|HEJncNY?$j{5`n5tXo?k~|KwM=xw?qTvVCjShRe~!tIF!>jl{7X!JjLCgWeu4>Z{UxuJ1Pi{0 zSrR)S-d?%L0`XwpfqJo4ND2y@tKY(bFp>+aNmxgVQhJ?loHQ4s3Ww z1n#g zf-9D>K^#M{HEq~@aEiC;6>ME_q$oLURk=@G`vs;RoH!W1u0haYv`g2R93xQ?FGykq zl4dBk+Lm5jt_SYUjtys@;KVLEp?}q1)Om|sxs~1PO#joK+!G@>X(Tp-!L`j18mknQ zfm4~HXbpsIAbtY@8%Uu^{tx0hPQt0ZZL*3t|-DC`Vy|qB(9pF_|2_N*@o*$YzN#XJ4q`PaB-9@A?*_RV;LE^2AV{@Q!KMFEP_f})oU6hz_zEV@ z;dhZ@h2I~D!WZ`Be_LXHKYXGE|Fk4(&XRvykjy52{3t4?!*TdT2LG1@I>|{felz@I zOo$6b;RF8JIW{D!F3!zGNaUNCe`5W}yT+lWaVT3+3IDVKAqyB5_wSMTDJE$wG4GGV zKrh1aDkY8K) zA^&~;F~V#Ir7$L|-*tW6^`$~vqj{p#DHPeN6AF5q#xOp7y8{0gkv%|56xfgED2H;C z!!!ja{9x#^<2lL^nsT^q@ymR6B1bt&&m-zI<7tJhm70_6-%Ks7~59XuCk$({xRC57p58X!mc>v?f#Qd4KfwXjX|C za3fp|@7LX~%W6rMq3mFMOgD{-M)-oluF2{M*-TYBNS`Z6)t0_xmwuV3lRY3wEStrT5sOs8VuPwipttDAEWwPBozIZ&_ zK(Zc+F=V|2HX>{yuo+IUvMmI*BDIaccFJ0n^%2-X+3eX)0=tmfO<)gIS)J`Aun%EB zfddE!2^>PWjlf~-@OA=6ki!lFcfuD(*BDr!XeFamlH9}GY^X^NrlUmuT8VCyC%`1-&GHOXNO_Aq1y(6=Le)XQ4*QJLG9_O8Gi@9_ppM8|B;}VQEq%+=x@rBw_f3#E@-~a4QO}O~UOs z8$JnlAiYzGj2bg#CXA4crSoVa zWcYMQdXFkewo0BwJIPkdlUPHt4tWlpB^by!W)wr^q1a?u5+H5z0J(S&%?Io}e>-G~kfN+q&A%xoq97edE zz!7A&gTS50Y!`u}$ZR)((xeb5&k1I7Q$pxhRru0uucr-p`2)_2A25-A%8t_^RJCTc zNWvAYK++*eFH7BkIioCe6Xwja%quWwkwtFBoDC&jiAZ{V5EWh?%oSc8%(-yjdOFyCkl4*vdfNu#y{SH2!c)^&~0@fP}v3m&I=T9sX&ngAA3nUtHYdLmYxA~Mp=3$ z%$a5BRbbA7N@_)9lXbKbb5*j6+A&uxFE%xpbKo-KMC3wLi>MBlnR-OhLWIbJueuje zBQ7vah?k~LA{S>8CwmIdIO)ToLMbUe-Xpewe;>W?JH{PbG6}>O6BJ^8q)s< DkKv2C literal 0 HcmV?d00001 diff --git a/lora/ops/triton_ops/__pycache__/kernel_utils.cpython-312.pyc b/lora/ops/triton_ops/__pycache__/kernel_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e9dacd422b3437f4fcd263fc5282f6d247f62e36 GIT binary patch literal 10301 zcmds7T}&KVcCPCGZo27i{u(!bg>4LN^JB0LHvU@-7_(sP9siEwY^zNbxJ`F8TUBk0 zb9t0>D_CQ;!~>J~WnUs3TZ)vYoO|o1 z%MApNGW)V6eEXhz?>YC}bI!f>om>7_pU*?Vld%0|Txh1K|GBI0_LJO99|-)S{F z`FJO*?6jIc^M+73ZBOD@{*wivQY`{^*sf9}V{PM~5%<^o<6zaKk@%uUX;WTgHZWq(tLJ 
z{M>^bTKFI*a*bIeqxl~26h6=Bry6G_fZV9)p`D^KL6clZAqPbv2MQWoC6Zeofl8ig z`t(s*`cMA>B$c946vQ%7@k1DvDMMw*O={)HBM8VZRGQ}6=EOiWvOcpIvsLiryN%a% z$W-hyk}^hykXHzz&^@2NlKe07nMMUn5{vU=^co7G3SHBOAS)Bd(tH!F*fDXNq67<2 z^{5=EFi=1HV_!S})MPDl3~T0Mt;nrgD^m$ps>E>t)`57;>;dc4oA`H#477)}v9`ak zu(mPF9^hu}&m2#+D9=>zb-PN~%I|!N_ZK|7$B#NYp5K@~&;19BDMXMVvK?v z5=1tGrnB^4{j-fifh$zR73+mz9tNI)_+oAf|DHhJyR?;&# z#%PBnbNV#Xj;G<6R9XUg4Azg=hcj`J%!6faVLhXBf|6VeR!=6<)Djs@Crz%{%3Os> zv__(ZI-$tK1hNP)q?Ag+iy@JiX2f(d$+1ivRtj9WjT1lJ9#pTAs3>3? z>1NEU_)s=b%%lo&x*=G7m3K2&g*3;+RVxw&wB3U@**ORYp1H#@iByb*7@3MocR^R! zQPYMT25cx!fY;Ta#X+J4d^dABCd85u&jKUH$9dSHI5;3Kaf~J6Vp0z}lahHU2DIqL z>R~mTK^;*d{=Oeeq%p)1QpxxHk9RAMhkW>8{mAAd=p-**gDc$(8g6ab+u0<>Vn#wB2lD6XlSmvSd2 za?y!y<{bQjj$<>91B`VG_&{{guq1Z;*_!it@o8;YGA1B%<&(u&*Ir%=^5pW;%N8J!T|_@@L6;RGix zFnkm+UK8*ORVc!R7YfCpT1mhNNGXbqj;5381RQdL;<$3P|1yTEd^#D$hrOuK6N&@Z zq7yJFK1rh-∋&Kw)PqgP^zdm^eTA`&}@0DCAL-`hX`Gu$R6C)+0DlPywA80Vyx z2kEX8$0xeDwnY3++vGIt9lq^;B9UxMq=Z;oYEo=dQFjzqRfrxQaBfTv?w@r4S9w+LhFrFP*8N>k z@Qs%W)UAiB7aJEEml{^wFT>sQ&euBq>(;Duy`u8*`22W|`xQSs@H>CyTDX4k;KISB z8>``#taHs9`m$rObD?voak*=?>WJ(;@)_1FP>~CM_GjM(%GQG6Tx zIKD8x#H|dztUWV7@LH#T&1VM;CGe!bE3ICuZCJdwaBn%ZTHBr-SPN8rDJM zJhfWaB?r3Zi(u6?b&G=wgG(_OR^6U;uZ3!s+;S+g+#`otvbHy_{H$x0HB0do_UrcN z?DMX#lD|Fj&T6Tvm>+oOqU!23wf6t*&z|pJnfj{b8}ViBkV&UycID{T!IiEj$#0Il z3=e#mg4v=CmoHn9i^#!!vTNV0ea%dMX5UQHRB`R*1Df)NUf3$$g4-dMuk>8h(<#M$ zWq7c+KRR;lqT=c4>x&MKT)uYo%4jtnsXchFk_7ADNP>kb1ScxaR3m?;1_d&;`~?%F z#3w+(gNgx~LEBR;5z18WGTvV>zGj#4fr9b1D8$keO|Xyc$eO7`whY5xM^eq(!zA07C9~JWyS+x`x<;9Os1CVVD{F-m#HuC%$gAs!_Te7f zr}g;sp3Hv4plU?xv^vrxqt@uzTOgxG@5wZC%$difi=A?hSi1fT4k(tP@J~8}f zjuI3h@qchv?oz{}hmd`XN67^SmV^!#c%#JdMl(93`r3#C7fxiia6};TKoahI2xmm5 z1%-v(&WL8z^2}~%*ovA^D@pyDOzSEu*f?0*W*CV%wS<vCJ&e3AorG_NeJVH;$-YW6s|6%_;@;%7L_7^c2XRY zZ$M`RASrfYqez=#pNz9nUh(8TqquaRC~o2a>^jAzJ5h0hKS&cjLcpeCzZf0vA5p9j zBZ^<;6#>slQLwY(S4FUgwdB%hXfJH&&>P$dF$&}HtfbnWuXd1XK#d4INJZv~(V|N& zM8K`ser>oQSeU@z!M4&=#UJFVfVe3sp>QhPFP;o41qK5Jjw}!Ezp~bB7lg 
zJYy9uFaS{z`Ch2$02<7N*-bW7+?s?@K3`2L_EBO8$7qx!A8r+hgIPqpMjX}O;}H&F z6#{H3ss?>j4SL}gG|#|4@gL#nnxWp@rYTqHM$Lg2uJD>(%C0bANI;=C7H%wkyizGQ zoLH^th7s$4D?g+SkghA3?fK=;*DC90eQTk*Iqybg{nCZyU&#B<$dzYiee0n;kEiCR zau1f9PVDaL@#aGowX=bNro&(KUS ztmi%6nF|}P@->g|vskulc6_cmS0{U#-dJgOWW!UEJ^Dp3+Xd2cDcRHV#%^)9Zdxh3 zXVXi$fvGz8Q1(Rd;Da#u^Y$!D2EV91EPD>)(e1GQqT*Th^Byev#hyd5=g=Fc)!nR0 z2~hsP%)slC3IM+3FSG{6$A5t}mwV0S*Z+!1CAj7)d*up)Q(n0mWLLw6Kd=^#yxUh& z>VRygiSmaZ@pF9c_N$sUxu)$^O}AXry;^*7W?;=3AU+63=KOl@(zbFz-g{~_e0tWe zCP(ghcdkBnV>QsQY+rFc@k0jjyiE=a&J3;xN*?EXX9hPwvAn@v?1|)!P@3)gqCPty zd#Yd5waK0~Y=pz=Vm*|<@(Vh5H0P9ynWbZLanth0E7I3LS(%bs&VFA<(Ph|mII zJc>7-{urSDpWs#bn1Xv<6;fM~JyXSxm|n9G`-KkxhW8-B448j z@Tx5UUIxg9l&1<{uT@_eswo&*Z1fNrCXnsddmstP!{|DI=3YqS`~aj~1VmTBcb!Vs zhN?c)OS%P67{Hefrp*b<_wm&3>4*V+Asx{p*Mc0p6TZ_~P{3H_en=4tEu4q8g^5NU z+BR%SPuv9B7IHWZP|8$0$?i#07?zLKiNMi3aojAbK@LN{)5U}z~$2&eF%lejpI zi!-=5i;ExOg4|d3;MzG{^x>i(3Z=LJkZH65#xz<7V#1Fx=Md&J(4gL@I7RqM16S#V zurPlyO0XdYgTiH8`~(*t;ex?M9Q1@3bJg4R1HdBn zO42F&>;8K0-$Z@`7ld$)xVKC8cVU1O%=W6U?)l-RvvOI7?CMy#`JDZip6BE8vGc#J zlaJk$T{qSDyo*4i%3QbXX~59vz=p@CBcNQZ>}kZ9pa}z`QUfvNc-hm8fl&*Q@h(7> z0u(YZ5#S^E+ZGeSm;y{x4gkYnw^nsf14i#!@zrl0zZmFdE3g6FyIlXpPv;v~LeG!C zIDcI}d0lS5J~Oo8D*eF$PrcmIEBpFp&ac`0fEsOKa$yY^r~a2{-Q{`Ynsa5#b4AMs zzIC;~2HH30%W}CtS-$`L#J8?~0(nLvZp9gmvZ>K%R0-mX^j7WuoLri#dmSI8iuGPx zQrzT!yFd~ks2dw600NG~ClnEu3&bRuQsV@2w%^UJ7pCZ_nTUuYH(&Yo#n+O{05I*EIY&O&{Dm ztTVO~vO_125b~%_9wTHYRZ{l2dcJzIi;%~49ZnFkTUYobAy4VOrwMrmhnsiayLpz7 z=V)EUUK%%OI;wp(L&rExpQlYqs&rmUmCmcF(w({{T`GNCU&aZQ?$%X1snTb3UCt7k M=<>!w>LdjI7pFaR)Bpeg literal 0 HcmV?d00001 diff --git a/lora/ops/triton_ops/__pycache__/lora_expand_op.cpython-312.pyc b/lora/ops/triton_ops/__pycache__/lora_expand_op.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e71edd3e3e8032f0cd364c1b3705a0c3d638a16 GIT binary patch literal 10691 zcmd@)TWlLwb~AhqIYUx>==~bKMO&sGdB}162rVgzY)O7d*))n%g3^p6%6xfdXj|@Z z%VYzr)eTlsv5n+nu~CWz;=n03{i+KT=x+P53oHtdW&$x2FIGTZp!s8BFE(8iXwSXF 
zA*ojswENdFb?4l3&pr2?d(Yf+?z#T8&1NOw2^)S8+UO>Te?t}RsVe}FDFZ>Q6C5#3 za3rS-lhY);^CO>j1oafBc+_^OR}t))Bm65msjRqs@;;Z)-RAY z6JAoW&&Hw>e^ZJCqrrJz089lJ3-CAN!6+BF$_r6G?A0lbz&tMn!Z9HjmP%klO!SQ{_mqM0hezx>Jm)YA7i{5-*P<7$p2P< zgZz-3@ER0dOjOK#^m<5$MHM}mhoXhP=*X^nFg*w&}_=-T08Pubf*xw=%Ijv2lEJ z>Qm;(cH6Oc&LZ6E%?Awob>2)cRo{Z}zr-dG>@9!Qkc0e1t>LVP_(kI&!2j$yKE&w% zh0z17n9q(~7&;#q*MJEPIIoySzB4irIImd!!^45miHomHO)8cl|KwC)>a~j_ie++a zbZ8_n@yfVDPx{A40+UmhM~6og$19T>92mRcAD(RlDYOSkgx2Tr1F(Mh_6pw-LG1VK z$p*Pn`%03HhtTR}Pm#8%jJB{LYXYD*M}DL$e!rlhD5GsFqec7Cu1B`WZMOhdl(1cH=Zse&{1C{c@)_H&uX$JB4w;obw_p!TcFLV{ z$JHw;7r9>0$%bh4A(FV|)*X^`$t`jv;wE=#oFt1TzU+}ZU3SP7vQu_}mX&gqTrbzj)pD&|BRAY@P&tsq65%yX2xwae4`m46 z42fSL`~Cu9zW8r=yk-G81H~}Qg|4GLD>g3#BY_a77{ak2r|2W0sA38V!6^7kWVqgA ziX{|{CnW3{DeNVYk2J>$C~{mOClvC$!b-8Ld^8Y>a-msX42Us7;<>=glImGQoT#{? ziAVrbiGetHOvFs_V3ulM5w=O>iDwmcAk@hrhJjz8)Z|db;rak4gg9Q&5dFIm0~*2- z76U+!a9H9yo)u^aJ1&XGHI#kr70xgSx z_8!GDD+L450O+rR5e%B*AubS!Dn|TPUr~ip6^13zCgG-H#*+jn8UvAl7`nwP4B`qA z$K0F<9?3MD5QM;lHb9t<7UmX~S6dS_zlsNS!+b>kn%1UJDw{w=(St>Z$3SdwVc-4{ z+(=Qk%6LbJC5%A}I-`ky4aS%vo(z)&xGD3Rd4t>-%vQ9eMjuwVcGMr)94l`uzp+Vv zVrxykm}6Y4uVt8~)JV?eUR}!A*v-)lb1XIT$l+W`E+;ozK5?|CUVgZv_Q+nbGQT{( zy0BT3sq$v+M^Z24%=XpRx02hG{gKPF-nG`1t~>TIm8m6lzwNHs7+gD(8q1m8 ztAF;u+zNDGS_ya4@}-{+d@%U_;N4UAk7te!W!r{9Br|jS#+!e>(fPp40^Ji6;q3C`sPz_D*4c-|{w+-I!&a|D))(@o(HQQ8o`skSqb>`bzT%arI}d1F4}$22YN%o4J zTmo#*Q5IaKb5!*syM|89tEHJTp75)q0Ce_o9%wOW8@M$Y9^@~-_F&K|SWG?uiztD?{K2-E0b`{Z>Y7%qd;Cx^< zo=}*Aq2VuDu?9H~Y!E&^Q85X;ln|n`D4xfyu`~t|4 z$W4j5bHrODP@MgKB$dBxL z?iFnJ3fN%Mmb51w$%>>i=}Nkj9vSj($k!|)ue0mn45%epDO;0Ohsl+!=8Vaj{VX6^ zDM zs(lzBXf?@Z2v>P)+r3Y_}8>W*2;r)B+FcJ)xpm)u9$?ce-7+I}F*S*ULM;0X-2>*9@!r=3>`E zfT}XwGn(4Ed$_%Bd{0Mzp${8T{XEMDXBRZ#QOsWy0O=~lSZRUB5*`b2tiXqZa1Kz> zK}kI%_*g%SGbLz?7R?5#6gvrB{d;AIxPB%)cYK5_6)cW z&Fi5z*bq{iXQ6e57m5tg@Gv`AD|KGjpui*LXLuIQZE(gy+1HXVc*@}-H z%oI;hd*t%7vx!Io)+uzIXT?MW#w{{HV(cTgPaSO+=hyqNHo@?X;F8Gpu_CgCT?@LO zL?Cew8w|&y^X!d~w6NcRC@P; 
zA|Wu01s==+nM5$U1o~e~@S;>O6jX!kG&a+cAmwp4+Z~EZJ!~v4!L$X#u%%qUr$`oC z2|E`P*d4n`i3^(iVSX+M`FX5wU#Y1S%Cp@wu~>Kq|5q1yBr=*JphMz=ZV>isimUs^ z4lQ@=SiT*jz8DOPygIVLf|pT~9s=^BFDfSW_~|u6mLeXImg2l(i3D%rbBL%IXM)n~ zf}}oMPUj!Ddri-MO=j=aRG>Gz*WV9Xm13CxTsJFgfq2R zU?9O=gE(7$NI?O_Q}FlrG8BJBs#$-Tcmr~gNm3^FJJ3LOzQAEwhg>>@kZ>&FbjVG= zOQHigSzF_# zC1dm6y_{isKkomOdHNTp|5HSB@V)jfa!RTH&e+&EBxW!B1H+?}{u$putgM8zdk>~mrkkc|kFs6aL-ET96# zT#Bna6Q(+O#ew%E$Vb7=06v#jVT8K&IO(-NzW)IF0S)gi`#eMqNsDl)r6tvjkLhd}3~bRM0l< zTsgUX^2a0Zjo%sn=$W6L{_ylaKJ!nf{^3-*el$amrVXQMdh~M_d)t(ARIbR&@=uK) z*xt8&l=#_qKm2aG`Sb(F=`?jZ?Klm&A$LpOL@=(E=a-*fz4n9CKc3ud$~N{rp!?E> zzBJvJqieS4&J5l8i6#q>TO2F&GQH~mfj#BV(T!VlYld#!?9b92cbyODBWc5tG<{@e zL)tsK<(%H`VzLYjxO4FBew0q^$@~I7Hmab1X4{S9LW|{}@liB7I59kwV z!-+I~B1boG(d`+!ee+V5?z(&YXQw_qb>Epi`uzQ1=IEIR^qI8bOqxFPq>8ZAY}oHI zAJ6@LgAlXX+zF9ds64NZT8jb<<~d- z8CzrO#m~&HTwU|BW7`AMoAGpPc?L3`fvo50+m?rQjgJf4;$uB*k5H+%-ge|Eg0g>K zepCKYVm-N*d|++?r$5U+osdW*BzUnPu95>y8>Qb!$HuwJOI{;<6ef z$Uom=vF1+_)oiY^F<0I4*iQoD_eKyke~u)K^zYA;MfeXQx3?oqum4xQahx&l z$RI+ann6S!XdSLd1VsNK;idnVTe3()|FT_0-9DV<+ect72pirS&^-%YTto>I*h1<+ za$y|cLK?pirWkOTg8#n&idW_m(OHFwBqVeo3c$Uws8|rYKqwN2|E_=?{EOaZ#T*E5 zvDrXCbvxF=e|3P1WdUW5gllM=4s^NfLoUGwH_E=!<+87^A19{eo!O)zmt3;gy#)&Y0TDS{16k&APuNE#m73DWu-qT_Rd`FG;%Z;1=P zCAxn@oP;e&SCQ8-q;F#~Pr&Q0`>yn1-B$>Ftk>G|O@xm6e%+gOd0mtwP5C1PTtZw~ zzLGzRt36uv7_R!XYA>$#5pd}-wmg>a$JGIJN5ZTz`KNI8X>>;b-Xi}Du0E?(2XXZ{ z0T&t#YYqA5aP>4%RkMC&?aEhYP!-9?6eOAfxGGh-MOJ3W$}Cx(uOe#ea@7rAR$tMP zG<@?|0zIm1dulc+-)p$jknwcpjR*si;Hh0dxpp#d##M@_ZF-Nn!{jZvY9+dQJ_x@b z&eOQc5OvL{d)|htcA}nrujfur-hrzXgu}BEUJmD-xauOR8`i^X;k+AHJ%qa|Ux~pg z!daQe(&cLill7}wgheXV* zQLQ97s;Pn_NR=BwNbJY82x6^d^Jj*}Tm}3Za#bDa%Fdjt?ROTV(F7y9Yu3GM-gV!a zZ_Cw_arL};{_AaXs!63vjrtU+FP??U)~F9@w=a2f7FLz|(w=+A9t;>Z zsj80Tnfr6@ch3De-#OQRuB{DG@aWvH;%$u-^)0THAHGyrAArI!_lx7;5Zb=EvR#e4Sq?D

$;to<06zreQzo87N8C$U*zF6G?uU z@LG_8zG|bYV04@!?1x9}(5CB9Y9cA{;?$ zWJ-Zv(74=&k0^Y`MdWOCWM@IovZNytEQ74F1Z_BVNu{rRf`|KUV8ws|UItx~A1YL% ztVpO%VF#5#S!i=RZ_&}WukTzf)k@gP4s&Q$Fz^6xv%#hI{Lm{ChO;kAW zla=PuE@<*J{N#I}vVolNa}DMq*IeLw^IY#jY?0eqGD6h~+fg#Xvt*{Y_L2o5mg1UA z970x#Ybx0Qx@V_2`^@ozy(4e$SY$g(4t(vTxIoE;kelNC1-3cQHkUk5o<4c65kz;+ z+P=he5EY&A##W9!|t6#Bg$FZmY4t$>i3V*Fsq!S`}6yo0eg0|?NHIwbwnTu z=`1~tx<*|R3^$EB*Bz}@r3??n#AFR$3RA5HVOUSLXDcJnN0I>z08}~CaK=mwfB}^r z0ENydi|LG+u$_UR88`#5FKG+R2pNYWCI!_Jk;4kWk{xTA@l-4c>faMkke|3_$ESxruy~5BOdgPtK#pWmSwkLAT z6A}*}1pQGTNkwIZY6^KOITpyiK!>KMO{hg{iw2M(OTP(DpC22`%g798?D7QINYZ2o zS!#v>4z?#_B89g>W0np9B26pCgQe@l-s+Md%R|*wItuUKfSTGxu!JNY_wNc8Ldpm7-(YSI{FhTo>igwGxd8p$@3nMvr!3YxUD zatvVY=-{a#Ya}CO5m}�6gLhnqKJYb_EYK8Wf!(967z_3j+9Psh_wn<1vdQj$ z4cNi@OhLgDLXv_kge1+awuW8QDEwKIPI|ETVD*5q3?!%}Sb8w`K?!D+ii(|OKwT~~ zp{yA;pv+o8wd1D2os+eUSCnlX57>B|{uMCzcQRDgEYaXtsKE-GY0&YGqb**SgbJ$w z713H%F~?D@s7qN()+#VnH^qZ0V%GAYGKh6$_0BvDM}as5Dpq4vV~|e3=Om;837l<| z6of2jllH;`^p03mXH2OJ<6=}%O;RLzQ6$O&IccgHd=kL1q(SVw6|+Yn0}QD8YsyN6 zV`=m`j4z0?!lBgAtW;EuHS_jyfMnb>az3?8JEg=-$1c)2czpxg-GU7GnV<4Bym9ue z?zb+_Z^<`o%)2)&x`We4?>4s0ANgSlPOXnG+Py`4Lou)s|C+j%n>XLsb!&XFd0)}r zcF)E7tut)NPucx>TPL(D*aLZcVE(Ij>>EohRMW;!0~G558Gn~+n?HIBg!N+uZg-yB zef!K3x9{C&blDr6e`evOJ8TeT?|vFoBzr@S@uM@MTKZhZlmFgRE&IESKkG8>-(pei z#B~CU2^@T~YS$QyR;q9qB!CLZ{LBB9WVkE?1abuk#9FoJvS7nz%{q6I+-pb&3CEgc zfketuIq9n&_;K_;)g_%@!2!2rP4Klx;A?iJLJV0m;Ir#rn{rf%AZr6cg?Cv89DNoD zNheBY)(J$}o8`tU7HajQ*&6HU^~V^^ufN8^eSH#0l_jLwsP`L3uSs26&)Te#{FE!} z8gH-b0>!LZAcCsO8nUhj$@2AoE)yj)3=A91_o}N-XV?`sEmcS*W77p+RK5(>$6oAs z8nZ2!J%brCY~p=WtbqM&faeqzTy*E>N8iddpNB(r_#( z3SnVX#wDnsbOLJR&Y`fzrm_@)M!$le{GX5ky*E>C{~HJ2^1U@W-&py1h*_iS49 z1WO*)u^BXdr~ku_&9m0!z~C+G?PLFHT@0L=84F}ww38Lj6*GZ+ThCtZ}k)JY#!&FD%1x-k*1awyp;Hgq0NKGLX`LfI8S2fxCrkR^j*lI3W6~)o>m!f##a?4 zoMeUj{{<*4wF=XCS%Ilt39ynX%2Tyo=S@?Fs!sZl-W7#wo!(%djkDEy9hx>=q>>$f z6!SVgE8}7SGl?GSIJKw5|6y6Pey<8vQ?jw9KXRP96r(Oy{L)hp6&goMt4m*C4D0l& 
zjKTb+#$bjqtfLA}S+W*^z6AR%*%V~0BncIJ_Zn>gFRte41q|!RHk9xRs(~ayn}7~i zpZb_V`qJwWBh>QM);a|CcX{W);$?;<+~y2_Uf2E4^AJ&-hzmdn=z5Eb5n184g}|$Y zx`U=D{&Ddl3D7v`0g@Ywi&x>1j>nXG3`91e*Hq;sF(QP~R+KOftO+b4tF~hiPlALT zQxjsiecA_D)fkP-(im3M4X-Nr5RZ(r)WuiXtwyA1G#(2h1!n?lSxjinTL&;{*$vq= zb=Ti={rKGR1?I1vf9YKG4}x*^K+TG`vEbd9_ioI2L$B@s`=Nh0{?p^{GKIZo^Lx)O z?mSmAq9cx_?Cyds@EcoTxh+^|+mUbEaqD1V=TLs<&|=%6*G_;-D6qNE_GrHC(c6Yw z%3|B@nG-8E*O%@BuBttAduG_8%{LqQ?rwCBwLiV+-J7%T{n+I#Hg@7)U`w$%P-yPU zH}@4=df^Y;Zu19!a_UE?7MuEu4edpL^Ys&RCyIdV$+;)*HQ3zN5@qA8n&F6^RbZdU zXq5@68f}I(ug05xM+6<5NqE*Lfv$R(1cP9_S&4r@)L{J{rZqjxU!aGjrUxakH#t4V z^3ZQs1)E^M>CkoGhqiOB!R-11nB8l7c)manui&fc;};sBPg#k0=(rlIhPlurG=G7w zw7`s8zrc*!YQ{ndfwh={ue1+mY!H7yfjt2C7S3jNo*ff;P!ch4&QHjqz$+;p6h~fx zdQJ2`!Jo(QwN~Ooq0o5! zFcd<3_%#^g`|y?pAE{9WaHugEX8^i@BKGsKQ5r(hTl6l3FvI2~ZPmPjk3F2r3 zvM6~g?H=$P0SJRbt{LIQe#yBb<^Nr4iH-oVX{f%44`tE{9X zF|0@vV&>~dM^PE+=i{l1vbKH-UKEkw!&=k@zAslo6cA-OCyS#Kaeg!<@q&0^;^IY& z!CaV7_-G`Fd>+!Ft~fi&YT)10UlVD zq!NiG5o?4;Cn>G~X^z)maM8ewh#*Rt3KM34}c3+lw&a8v;^06qDm+O z6Vnb+d1Al*Q%0xs@Q#yT(ny1@Vtfcy#i0W(1@^oM(ll{}p3^*1||;RXia1`_to9MVrMxecS8 z+cH0Z0^tR?lVTF1go+Fs2^U)6h>BOm=mZHG!plS`!)lRBP~CrhMpU%Dhwku_M};0o z6{(qHT8f^- zB@QBf_b3x*g=;6A>wyNRAJU)`G5DKH2HLx;+{6qxAbwcE(N^?z;``m;ZC>vi0Ud?W zJ4Bapuz|aTo2a1XX*{^Hj-3zEwY#X40Uc=-K^{`81UWyX_%^ZPW*A@SfHO zYaxSmH_kYSG8FWwY?#=V;jn5AhZE#-Ai(x;_@#+R91m$YEbYY|C0pzYhe_01F5m2d*CrR4X7HuzP(#an_uTV~Oh?hv zu*ftNz5F7>mvgs&k?|Luo^PEjIxw@lW_w;cw#YQDboEUipKXSo?TZjo?K)R2om(-v zONI?5>q2v(Ye&9o$7d82R*jU|L#j(hXaXLgO^pkEg$=v%8+Mf_EZrU}JasPr)Va?{ ziS+o?fG@OZkr|;)5Uo`;ZBSu^-|Ahcg|H7b4u$%3m6p3}FA=V5oR>ST8tX$SjU8h+I7rd zs9nd5h1!Y(wmKlFUC&^p#_`r$UBW=xdZsJXuH!udB_5AGBy`9sJOLqti@XgGp6k}j z0+{|VbMucY(V49A8eEi`GQI-WIV)FtEyEc{@AJKhULo9jr1xa+naq~*1ryC7#G{P| zj5s+!u7Lp4tq~l3Djf`R^;RXZM60IdTAylW@GL5`WknneFFM?4bG-OHQCNS?NxOhH?<-9Q3p?F?uNt!DS^B^h#)3OXxLHjmgv%m4%@6 zC<-#oIu7AmVm3z*sxyo0CV;ka`9jKmi2V*-jf!d1(^S0UgE-0M7YJ!1IjLMlT#`?z 
zM)7L2F04&rKM7C^wYu6nM(@&biKoU3pLc^syC-?K>}DdwKqeLR){nt$(R4wCrrowH`yTGVpz-}y_kqpE_cxmWUc-veZX>K{F95Y3!m3rZUSzp? zA6qAQywMk=F+zc>t+DbXKyVcqH%vzUSH5t{kWHC|ih0}=49rYZ%Z&Z?uNNG_yd!v{ zXVEcuhuNxG^dQ2peK-uaAZ&1xoSbPm7s0L4x1gu=JZ4)lLoNnQ^sDotEaA+w;|W)^ zlracy5IF{cib=fcMWmS8gr;5y6uY<_iY3x1NePpJgfS%IVT~lhA3DISxg({Id zswS;5x>_X+=SnYNt?ekmq+Qr@mV+r^nMEKHD(%H)TQK`3X1H5gWQGJ~h>*oAxLU|` z2%)2xk!TGG#YC_e#f&WWs|ewGN>iA98#9#6L{B4(^4uC+28rn$z z(snBiqTD;XW8OUbWZv0XGD5{l!={o6o^b4;uLU8N^7vOdgjx^M(8b;|e=u+7OGYe} zHHeZ43(yogaRk)9O+Y&!C^-n|#Id;u=!PM!dI&gdu+dKf<0u*7x#HeXGQkr@;OSnq zAQWKetqVOR3Ldu(-0u15u~h_1PAlyxLTebeXT{Tr$hE*81r_{>R4b{Oyp)yv0 zaaP?W4k0T-HiYaD#ws}wa#AjD$%T+MOoR^6gLG46)d8;H0~pt<74@c)1xr?(rwt)H zD8Q^3`%LCH#+IU^XXdK~2V9uzxz$nFc09lB_>$wq^r0f- zEij#VrgOo(#BBPQVP_f(Tx*_dUE~6Hn0C;YnbzNzp0UH7z0Uw+?+0A#{NsyU&$ssD Zk?k$VyclMvmHK(BV`z)<=bLHB{}<@MDkcB` literal 0 HcmV?d00001 diff --git a/benchmarks/sweep/__pycache__/serve_sla.cpython-312.pyc b/benchmarks/sweep/__pycache__/serve_sla.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b42164cc379271f75e1369ac2591616f6007d86 GIT binary patch literal 16061 zcmeHuYj7Obm0tHu&-?wr00V$P4@i(05-+~QH%*D8#D@gQBBeDU(ZeCSK@2z#xMv^| zqk)XIl^C!Y3yi%cyq2>_?WPtgY0Gpfm10#YDLuR!SLM%)02Rh9I;InDt^Fr85}1;# zRHgEr+tV{c2(sf!_E*v*Zl8Pm-h1xzeCM2d=Rdhzb`DQc_)g-_AK|$FN)i1DRf)A9 z7&-11Cvg&=*TdIAyJ+*0g6N|f&9jVUYPM))AEf8;V^JTK!CKl*dAqa=^4IceaK zQ)$RjCFMQixzDG*b@)wQOC3Ih98;gqAs^Cn@Eq#iCm$Mr`SZE-R@d`R@l*U}PVU;m zDNa_>C%N^n!;uTzJbxp8jk|8Vz+K~`M}A9H4n+;BIi8ggBRSP3#dGnIWIUUV@~RaP zIhRPupoM4Rxv{8GwH~LB&&L(j@?;_>EAgaiy&~sgapejXapI$_i(p=zb=y#8Wb=ICo8!CshB#@zXgu1$J)s@tmx@BtHrcs!N$n z$1cn1k+D=4!+)F*M&pi?|oFnnQ6Lc#Dy)x_m& zI#V<(4>pX!U6RUw(k z=8%|ERI_|NC#NOV63@nRpp_<6uL|RtL^>)c&4?&16wq*~rir+c%Vq`8BA(A4AIPO9 z1||{{eL1W(IoEf6a^HddSNF*S$;9P>iK*OJCOz;{GMO5vEsyL#mX_9lHi_9}ynkX! 
z^~Y$rA`-hC&&sh0ENi73Oxp36rL8{0J+N~w-}eJ^r@wP{=HxFdzOvQznsd%MKelLJ z6?c`yUAMDEao4hVsMvIP#d_oq=eXx+T4{+cZT?Bm8$Gx8zB#bmaj~5nSk9=ve&PQh3Qx zY~E9H>?sO+l&!?gR%4mm<=x^;rP z2nB80RF_T#tb&3`hrela!GJs%g=tH!Cfy3=f~8=TxVQK>X`t|IB(-$Ortmi+JjZiW z27$Y=U*M*UQDNu}o-K&;k4KGcn<+HiBncoDvQewH;rAjGU94J> zbeFy1vbUw|Z7;WWJg}L9?oaHTr)h5VJ7XU?Ou^=lom@-%M;#l5O!GV zQW|qt-RGFMU6?m(t|qlHvwXo&Fcz#imtGs@+xlj8=@-nH(JIbN+X}WB{zg~9GVamy z*53u2&V}c0>=C$vS>dO+##*sD40^rezIsWWl4`iImz%cd0($y*P^ao|tTwwOyj5G3 zP{G+hy?!8`;X2&S1&WiF~Wh_L8%G;VU0F zw^qVj^VW~rIjg7a3Ea1}E*!WWT(Lc}YCBZ29lCRT#dhrNNAFslui55o^U;MbFMjO< zYxEN%XMdEhm^o)tQ3!rwX^efAr9JY_(ePst>L+-kgg3wMjXfiz!e;RZl8@ z9mtp-nN$>PyeZ{rl&Ebp&>T%#gV1Xbb&%+68&x;VG^Dy@V0ycuU^s`(A{kjM^iUb$ zp)y9=Ptlsq@Hk_-IiGl`Vn!SKhaDmpzf9HG){Nx#xXL&)wF}Me9;W zN!-8OdSDjw8(Dn1+7r;Cp{0PIYNH8aWNlVRrK}0R zgAhL7^8O!*KZnFmJMrf#*rweDx5QsPf&LlJaNpo<4=Jz(E1`D5%~YJ4pyauYL}bC8 zVAgtrP&1LREVy;hBN&8G(gmic;F)1O3yuv%rl-Cd3c?BQHSf2)BZdUl>c!0*_igW- zH&+vrv))l&GQVmqcuv#|1if@!R2FJA(#rdu;YPQ>P5Y+31!0^t=yi_{$He(H|A_Gl zaR#Kmi1DBHPWub~aRS(NPr+C4>SFe$;YJu70+c&Pn}9uqfZj4CP=_JOqNhR-yQiI) z6>HuPPVTh3Ay)&8AeSp`DYy&vt1n`a^J)IZe=ax*&Vp;yFpAwc0>M6FnD$=NDC4xB z=%{^2A@uspdY;yL`il6O;h^}LS;S5o$i89vLXkRB-Sn&vgvPt^w?>-6pYdmb9?*`s zi_~DgVe|8I%zdJF?FalV9{3l8PQ(3;@f!aIPkPOtppG|;s-b_7ZC91QnkDfgiZj}- zv>m=8r{(Jt%F+C`hd1cu%w$^1_8(4WM&ilr(f(RSn!7d9xj+34_vs8*^dI|aVyWl* zdly<3BR_0g4RrrB@w+vWbKh)f<3DSHN-mBlq~#{!VqB3$*wg*LWy2Ky+_!Wl$G*Se zVrZt2y3N2$IRy&#mXb*47>Y0*O%W;82xV6_!a`O}V3W$KewHl;Q3!t z#h%KL37X9b_mea&|Ha@BLs~6>^&+z}v+aZ|^Q{*;i`bcQ@3t6uy1xPV(o+mqX8# zn>J|x++?4%0f4>XqGwCdx&>j`>V3^K=UK3=Si3&5a-og~CS$Ap*G-$|`HGQqxo>2b zY0V&5_8j}!?zgAjgFW%(lJm=`6N;>@xxyiT#mz;cf9?L4?iFzm6*Sv_Ee_sv&hzsJ zsFKjCv!~?jS?ajs`@ngqEDln^;71(t!7_GwW?x=$wB2?3ivH+|b9=dU)9l%@(=)$w z?$q4z`;^?b;_R;``|e>}qVGGHoigE5H+H!9MM>aU_SxXo3w$^0HBe8U8=2aNIv4 zsx*%EI#fx!8nDEwi)Ds5u7a!J%tLLGY*$+$AWWD;pa~(#jeq!JEV+s~RxVQTRSIGh zTte`V0T;?;s(pk43Bf}spqz6)+hNYt8>X9H;X4HX}wAVI-61xX4>!BSEP{#X%Q zpn@cVl^g;%>f})}XcAY(a>_MIt_fV_Iz>sDRA_Jf0lDf~*BRwcQRY1=7yAPu)f=8U 
zQTDaYoP-F4VKFby6&CrDJ2Gqhe@UPQLUSkTax~aHd-9&cJHKb{@WRgb9qkaL_Z)%w z^Rv+5!dV>+$x~CKo-Q@mH+3ZmzA<5(O00Wy{S%JFn#&QIcg;PeJ16E$1#?yNu4>jC%=>~s z={3#UTClW8JL55#-nBZ6muxs@V^41o?9{8)TfEA?qGkBYr9PR+H|Y{JbNmp@j`&Tkw6} zJ#-^fbq{=AKqJ2`aX z;>P$uzRL!40`8#o=7errRZR)MZb~q0fai&8 zBw4_qZa_HBzGJ^5tT;Otg~jBmbN4&;-({Ko5DN2o>uEXzizmgrQ_N+mojoY#Et*l) zAGI}DRexBM&)c3QXPB5774xpf3SY<@dd0l;;vZFQ4W07Vr{dQWsmYX>w~M10Ma0Bl zw)Ufw%5e%#P;e4K)TN|}j!-~e56#&uz%eqW3L|nNsbna1f&$X>88RtniF%TPrw~K~ zUu?)kDHBOf%`sgwX>d*m;mYihLDj7Z+L~Xa{%xmfp>NP>IL&;stQO>Yfc?LYP z0R+r+A(N&4XvPapCTVFh&);{E@qbc@_p12X%9-r5eJeuCUBNaxw6K4*ZAYnX$8uX= zv9-TwA6O9v@B3QHR(ILzD+e~=|6=>nbWuD~3LLrX-?Dh&w)IZ$&t1#@=Wz&A_BO*6 z=r_-tsrWgoAEy+*@PuaFv)2I0Mfa8!Yd475t1H%4as@VpZ(X=~;jY!OOe+t;vi0m; zyK8p*I|CmXd0Xq6(HJt%3_Y+?3s23zI6t)5yYx!2|LC17Mf>9`!V~wq4>UH68sKl! zno(|wlmn5nzw<$}bE~-|bj_S8dAsJD7q&fM9X>suod4?Lk=xC;ZFl^4p15=1?Ndej zQ!B#I{q`NduTDQ$gi+@v)ES;RGjFTFP zmj_-NgB>eE2zWm8^e0V}8%bWx8Tzxq@W~G2-*%WzZnNN+ht9a!S%W7&N=Ik)!v{xp zEORZE%}MZQt4=K=&Q9Yn)}30eYK$#JjjkPF6Y>+PG30M2%{&vG5yw9r5c&8ad*}o?H!uH#p{Skl*YgHrlkaj?A#kpc%{oDc`K> z5{frhM`Rpe;=lt|+*^YFJsqfH?KoUK>mOZH0Uo)ihZo_3hhNbQY9A<1uI60>I>WQ2 zqtUdtj$W`nc=}fGpk>~I4g5`SR&|YOU%`i?m2L17*1Tx-HWYmNSi!Y)07quW1nrg) zjy!a%_Y|y>xlx0GMdjZBLI19b zpq8PB;E#ZWdDD)8Xn;Qq+ce!|fHYeF%+*xFDiAiW147j{hT8#bl~&=9T?hCgG;FJ$ zer)7S`zrWnH=zYpXPW(~cbvDk&mLTLbmBl`(YfmAd&gOY!?q4SAHI;86tS5mlaffz zGX1aoT1R=3*4Ge9qGwrvKrS;X9-F)&!1sJQf(*TVtbsf>8Q?8 zC4)0gc_gDqS=F4&#E5%9d5sDjreK}|)}Mb))b}X(J_W1~r0&)Gpjt5i3^270AEK|c zN3?qc)L}UFaICGJeL0Av0gOqbbyyU|OLbJN`t;a&89+9x{3YsmhE~Ai2tH+gJfIS- z0R$_Sr|!C2%IVO2S4oC zK5zd}+&+KmF7B1!koxBbm&NDjPv57a9p#otd2=8Ak3`GeJIh@=h!D3u2-qW``O}pa zt|d}x?k%^7f8qI|XUXbMy_+mClO-8QGg@t%^#iNtLJt8;H}=9y{rBm zCI614Q_KFnaAXReRUuRoLi72>r%KH`SA<>n93h+%eR7W1YxyioGW4B8{U;2jcegho z{F%XY!eJXhYVFa@ytQ|c^euSTWsWX;4jZ21WTPxdJgpVk1j;~^WE?ij7THR3k2Ha$ znTBl*X=YZ!E?H#9My3L;;#pr6bWuL+lI@ZWIona;x<|5aoIWbda)3F}fX zKkUWrCa3DFUqQomr(0zG(f{!{ek(8l$Gm9hJU#nwGPEM!--KneD=+qgqxENCZHxP|DJr3kv{}gIgJDqoq 
zldxX*C}OT=!;ROFxt?j`QZ-C5rkfhd*YLbv{z~04HX3luBH^)rT~4gDRNwj(HXq*FA!H~ z?3g%;3aW7=nN{9kx!Cn98f;dS*+(Q;n4+!Fn#gy4ZVmK5#PdfaXvif8Ib-IUHm?Q; zOToe0!R6p1v$lH<&ud@$_LuI4Bdg&ZrSOiW6F+(Sji;Bxhv&}Rr?mc3xPM7m4)2;h z^J|a)BTn$z@4AEY7na?f6(b_|a99xDv>M)C3U6PtAfdtuHhc9lini!fvm)mvmHM?4 zG^M^;evZhW@mMdE%APr6f%|9Jsts#|09)U@dN#c4z8|cjFSt@z*XkrAyS`CfYtSY( zE@7^hsGrE@$dSn&eT7LD$@-Q}*XB?gtphu(cx*Q@Gm%SV(%HPTuMg2aO|Ly4|F@V4 zx~fVW?##KTheW#HDH6uTaaN6E!?;Kn63Iv<@Wuz+OvI;LmRYR8=IrQHzj!h}GS={s zP@PWlaIF+5PO8AmU5C$}yl}MZpxAZz>1U1}e*V!XpF4T9Yx)wIsDSuhQSQIee~>ax z95|pK_Ot9q_6`nGMmo9z*RgmGfh?ZfbyBRJ@atL#l!qi(Ql7MkQmL$X;6QgbHJ(b) z)!csZoF+Yxf<|hiB0?*oQlyLEXlw#nNt$(6(^amGCD1#~fRwY$rl#s}`o_%cOBhdh z3ALMtmxy^z%(8(=AUEY~95u4YWo@(wqmeI@rUqVWv_vlTu2Y_RMVufa4>)NvcL~~% zD&(dnWYtQSClZQ`OEe@0^KNa1^%k*<6!~+1GA5%b5MdcHN3ynh<3gm%A|l$P_Xcg% zuZ%RzFzd#;1<;Rs7ylXyP_>MVWfHhTjN4rDXdF^n`8P<8T2vD>J6XXcXGx0b;zspT z4ANsVg{wlbE4c45q2ga7%d08DhA%a zx#D11EyxOmC^YM0LJ2ylHO62t7E|r97#1DweS>z!Vqcq#CkY^9F@;HJPYg%NP`25! zL6?<(M?>=06c99NL$?G1-vaso1q<7}* ze9OY-n~3$6BVVb{;zPQ0h8R z;SjkqwtDor($VK0P=w`vKzY~*vZgR_ddS|Yx#+!sd%Cwkh_n8=&Sy8n-A+))o{fNC z*K;DYziGaZ!)e-$?DM$8-vg zit0F#$(`Gxr{kv^P1DBQHa(78*>nlnTz9S%jNiZyr>2ds;4Vt-#~hZSysP^{cdA>8 zb)V{fs{34i$8ol|AR1vY$|7kjFl^I(?D8#ym584>;1Xj-nG$l; zkuiDX>I8ltkn4{INy(v(D5R#S#&qTy?vCV?QG!A3*8>Nr`9GxKFDdxn?M=LR6NbMe#!W2zeE>m$0PZnQ#E&?=N7+VOLeDQh5b z&D$Hi<+$m1-}}Vt*Z-pM!@`eVS?zqJ)cMHmuP=8#F>|`?Yr55Wvvcu4$=5q`=AOmz zt(U*?^1{K@w%$@(?@C)gY}R7i<8SBRP8Od%R}7zDc0X6NK35U^rodfKb2-#{D}6Jq zzuLvs_Q6v7V7aAzwWX)j(o<6+>qK+gt*_twdP6({7`S!u=EZWb<<^ropM+PG)J><& z{waK{&hW>a09NkSvir!L#GMylGoR+aM3(chs^x6vyn%|vXbb!gsII2KXIV1r-syYv z!0~O|d)sU$Ovd*Hy(c*1&o~q4hol?LDoL%iV+b0JoBHo)Hd0UvH6ue;8*u}j(~eb) zZ2UE8O7aJipQeyW{_0fWZ~)WvK2q<~G?B2tT}i>Y;_6s+^_E<{OXrteyFL(hYx*{T zjS3Kms1DpMX4(}rp6F5KJrq^`oC0>1@J~S@3mhm6xa6#6eUNIbEno{>JcbuWp$p&e zs7y^jJ=IW;gs#FA2A`H3W51lqlETYwh^o!>OPBu1Tq2q6PoyR?_{kNEC?rvs85vK- z*sos{LJfs%E2uB*mr^8NwHwRi5YkRNNT6y;h*pTeSs{6+7{Wh6UrteQfr9#B%y%iub}zxPHh(`O 
zil#{U4;1_@1*8)&)k#BmJJSX<`)@aW+DgF|1giD0rhpt({tKc6`q|%NomGrH&###| z-uofv`;c?s|F5{{FS$*>AdnTepZAqfwGr=ozRrpXZ`9#!saS}zQf)S( z?6~c{<{)aj$TyXp;e`_=r&uu}at}8tj?Yb1OrTN1x4B{?3MHVx5#@xZtKuTc&3XJ4 z4^dvu?O*c|1^<~RR1EDax%baFDrPI+xwvI9yY%?t%cZW}6%O&Y53SLw5<1Ke);S|m z;|#jWnJA4jQBKa&R&f#KrZ#wp@=_aoMETK%N`R;!7wD)o5f$Qm!AdhxEf~MGFj24Y z`}x*7V?=6@0HBGm>kQfmPtwI(pqP2KY_(#r*#E38IRse_dOOAXSPG%+$n6VuE{$#9J4o9Utq zO|Ow6HO3UdLJ74(WxY_bu&AB$hbj(6I=Mik0;!EmU>>P>80qDF;fjxueri&Hk@Q1? zN)sbP)Q)CGwop65jKnY)91jdT`Ot!Ujl&D-tie&CgvuBn<^#pRmNgFV%6^{TGkbm2 z6DfHjYaAjN8^Jl7f8Q7`ySB`JW!1H%7&-6`tkplFL6)q-6ceTI+{4<5G+qH&&E1b!;h#6GaIkH)vEOD~h|aD3M%x zb}35)6&QtqN~iEFvLM#Au@M~$ox%*#d*rT%tV~u;9w%c82+;~ zo=|4uiG{PvSp^8(q>`FVSZl3Xum?MTg3fQ?Pzu3=z#$lgRk(Jb%8pu(Rd|_``B`C= z%rdg`k`E5VDXv+E?7GBF39_KL6_2@x&%0nJjAX0lQM`7~+%J1!y-)UCB2#|Z4|Rjw zQ2h#U3&3nZZju|JZG~lHkpm;ae7YCqf5CH&kZk}P-3a% zOf0*s T%Vk(u6nF2`1>4Fy1E+|SiX31zUyG+4ENxo%|2nVZ&!#V9RRNr4D4;zTP zVa-((+O~y`dma6&+~q*IXK7Oz*0Amfei8Kf`%0kJYTBUsJ{&U+Cow%hDs*9v7_^x=?JePgt zK+~$1hLD4xEF~lzJi>5eZ!w(jy+#g!^*&}f@B~#+)&tSbZ zdC%AE6ejEf`l!FBO_jl_=bhCxyDbG^$^>@(VZMqTyGw{}f*%s*QIGqI?2y?cgUIk1 zchmkdcnSJ|S(d>8Ce9ko?~ISXAx)0G{m$6i(&*Sb(i?A%jv35Qgf)VuRY(~HycoPa zsb&nP?ar){S&Rta|GAm0jRpK$YF5cu&V=q|xPaTlFUH3WhpHJa*pQ^aAppD(LL_@1 zJpgO#OpZ3gIN!9_*vx~Vb7-SbT_g{BBBh>Ku_spcHC}mh?alJR-g0M8sq^Spoky=7 z|8qxa=yY-D^n=FGbw@25bYyFpqL8|hP(@BWPq~7zDq7J7HD|s5#VNUSv8e1JP8$0 zVo3)qGz381EXC*$vctr^)ut+5r^r^M#El%(a^VSSEdL)`Oto2QL{rp*QTKYP-^ckwMUEX(ba}~-u9CB zV9|SU%PVerk30x=mV*7oVE^{OiQBqo+bRx97po^Om>np+4%KcyCjRh>1^fPb~=2vn2$}YNhI)XvDCh=AyL)NZlRO zdQ~^VeU{m?X8vIKz`zwLUj+urt%o+AD)k;K_8z-6yw&@{=jV&PCyT8wUFH7Sc%b4U?fn%GY3^Aaz2A0Z zch}w4j`fj^`BM1VV))rxOIzWSUk(++qs7j#t=5-oTO+u&ZFTfsux-70-MJp$;MNy6 z1K~{}{HR^k;OjDo{Kdl7H)U_3R__3F^k(*s^9vb`0@V_KLD5iCIx5J%4s85HTWL&>X>kwMHZ^F zzh;(69o=2!3Agin!tSn$cqb^v8G}PR%s0P{kfbLw=R^yZ#JDczn_?DHK}4QSE?Iz; z57@+X+9DXfWM&rMNXwFvp2;^wql-#HS7}sEY70@5TYmhEy+qW^{7uY3PH27!zc`cA zMOg(xM%6`SDXVIV7#GtDJd0#eS4Hy>qCTfsXUj(>R56*(rj#_iBH}uvmZCZ@s{Hk%Oc^E!t5@TujEr>1fnM*Qogpi$XA- 
z9*hW<_X|o!)-ELVIr=injn7iR0Yjk5`JAHZ8YVN~`;9c7)D#K64v*Kkp3{t`2~|H+ zJ*^^}nN0PiVZui>MX^BuqAEZEM)4YAP>hH9j=kVef8AffymkkwIzXs6iLd2qeC=l? z@1dgi&_;OM8!2}mDnTyWJ+jsP!s^6bUvRVe$c=-WM}{{;&u#lg9)vosN3TUUUfk?G zws~x9Q+#_iapU z`=2QX`fqrOfn#@DA{7U7AhN@3lApvS+wd5_H09 zIyJZ%RZW?XXo|9>El?Sp`L^_*0>ceN;opAjbvXFu_2WAPnu_4$o9~5MuC5eA12?#0=qD8(D8Quo(8lp% z)8UE>hXQFE*wBluM=Nd|dVtlA7kb||3cTmrW*7e&Qwj0BXQO{Z`(tE>KzFBs=Z{qy zg8bndeLDo2@4~ITC3W)>5f43z7YbvADYBIV@+d;h652RWjUW)ftKpTbD;a$EX_mlI z8lMtVi}p+U1xsr$()VEsL$Zdif(pwp%#M>Vp*zHVhxqT1;Mb(*J_&qHdcSij4Abx} I!P?~XA8^rZl>h($ literal 0 HcmV?d00001 diff --git a/benchmarks/sweep/__pycache__/sla_sweep.cpython-312.pyc b/benchmarks/sweep/__pycache__/sla_sweep.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f6050f530a44ec0500b6c54786007b1dfc3baf3 GIT binary patch literal 7045 zcmcH;ZERE5_1^d4kJz!3IA4$gFAX7a0!~|K=|`ZD039DqTSBWfqjDX;o5sN(>3uIi zJfoqONtxDZsWy#NT~$||v_xndl`4%%`?3DsA6w}JdSfcmHfj53FjH54?VS7k94B~c zrM6efx!3pHkMqvgJ?G@ljg1WiT2lFOVj@7u-|>gj+?7Fjgc5R@yhwDS(WR!Rq#H*`|IHS6?`M$19k9?%*Ps81M};_Un&y;7GzDVnRzz>eNz%b z)}l8JLodFf8BS?oAw7f`ht`Q!fola^NbL98JFED%c^Yo}H|!u|VabvYj*MD?*rZ|d zSlmo8b2_7kspZvUW-OkJ83z3DWX>|q6FQ6IE~mLFgYpavE)zyj3KPGuM=&E0>fipWT2_nFlZ)z_@BKUfm7f?_sh{HAn$hxu#sY_mVQ}8#GL_ zPo5Mzv{z5@IrgfSn9|JYgrUWqYc%))+<8jNnA41(OBhUh^++sfuva51)C!Gd%%iE< zBulZh$@IgVXZVU_DF#bUaa3yqDbcihM~2MQ?9gmtc9%`@uJdzypLqPtUN)3WOb*S? 
z1N+lMXOqd)&?HO8r&BS0#uzfrF*ZA7Bx6xAipI2d;-G?fFJ69; z+;x%Qg9T4_8GaEFjFrjs@4PVCZ^qTQRsT|5Se0vT^mX zYdu$Y7PO~Vg9jFro1Vr(la_DIyYlfuVBilsR;K?nT?{<;spt7y!M24)A=$C8-||GG z=~#+IqgF#Sn#$;NNgVs5(bvIDD;RYunTeTJFbZFi&X_S1tlEf1xq_TfCNr6&;Fu+Y zkFwdO=!v+=djP*e#8rYH9Cb?o9}Elwzz@kSfAgEq-cmy)Nm3tNnky0LmRs*&SC)yZ zRm_(>VnktuQ6kXgjXT&?R=}v_@~Q2&n%frROOGzj6q~zCDom7Iqa;1ao|HkKr+Zb!|74mA%t)T@A)2Mw}E z1GeT3*qS#WR@{c21NH@Y?6;~A1w5}PmMymWYPF?CY}K2q`n4N* zxu##O?{~e^RSa$~c(#9|`gKnl-uI;nhF;yVxiv~_G;J>uTdQf4OM>A1c%d^~2o4lH zwm8T5qp)(L{ydIz2p+~`*6B~Ltv|iC{`6M$r?;v_4LE>k z1Q22U-Rk^>W133Ma#E529BgX3GRe2so9z0p)#y9k&+>`cyxyfTV zRhPCJ$v%se?Q_VhFWJM0M(_*{gn|mm;#64SLf+p&6#6?p3c&Ki)zQ(DM^7C-dGz2q z>Eo+}tqlCVFP50Wh<`(yl60vig2329W^Tx`ysVNPV(1C0YIC%Pc{S{^JV3=b<{y@4 zDv@NfF>`v|bB?sg@K;_009jQv9?wyoUcndw+N0!xWa28(W)x?%*5@QAvc4een7tEc zFGSj8#>xNJHjp1wF=bUnb{hSQ@h=J{eXlB{R#?mNH{x z07AH_TmC7YNkut}XSi;_3OSxM_zAecqEnW;GH)QM7;)PHw-+3o9>qhlEe>BI)j5=g zfs%y$rm=akf9cStjotYdZnkf}oOnBtr$0+BJoC9s0uW4+CMURduLk=Ip8n6>Fkfiw zzGq+n_LKHu`H$*B)y-doU-kK&2F!YXn-y+DmQooR9W@yZHLG0AQHR^_lJ}~de}hWE zYtR%3$_^1HS9ey5Ff4oE)HFkT)kG$$8Pl1$q^?aeEv6+ChN)$yG>pw5f{*b8Gj?i| zbEY=MIX}yu)Z$eI0s4`<;(aHe&YcE|04grND3{-KIyyW`uXtn&|on%xP0V#X!j?5UkE*VV_P=` zMQcbHiL_zH(s2f=2u!oQr+_u0W?(7ErLe@u;Xon$bz$yFn9a7;r3@W}@iYKXzTl>% zzF+P6G}wQ$r3<3WP|wYtzGe4^`|?92nKth(%W^1CB64Hkb^|D45lLQcIkp-+R`47X z56)QFw;>X?e4-cxX(`l)EGc7Hp69_8oJgEu{3WC{ieM0dh>wxdjq%NKoI~(E_%n_H zxI}KKO(jWD!}*q9cYV-xhrkd+mu4|p@{5sMJx=p`mX+o4t3Au#T?rI>_TC{lDOTQ= zaA?1i;s#7?ksH;fRz_uXyBkw4rb6l^$_X2tr0xbMsX`KlshN67op3|q<>7cdM^j&C zmOhw32Ul(d{k}{R;P7&^UUA|IEW&n(ypkm3G-%%i`GT@$x-bu9UQlz&j6=7*LBO2q zn4>}8YFl%uITamvje#pO=tnlRwcXAsPM?!%1aKrSXZ!|Lpg@@;Yfc@UhccG!J(?DE3JshwkC_<`o`DN*0_6=XKte4<>$B*- zY-e9Y6V;UJQbUVza37HQs!;e#861NGs{!nYo5s0>IW5vO%g z#j;#RhC^u%YAwRHP=b~pYigAeU(9N-i!NAxviy}X&^rg^#mp?uZY;p?2{DVKB znp$s>0j%*M0LVM{JhtG!)gSqtf5rdXK+(T_;np7bN8gRUhx5s+-B(Xs-4Che2;F&e=OZ5vukJih7(7@~rM6Kp7p0@6M3m;1QZotf zz5M_f|Kjj!P%C&eVLwn|0nVZD*XcIuq0sFy7+}U;Pp@?f&qm#1+nUBZBS%j_E2Odd 
z55gh1s`?;6vMmgSf>BBJE?;}kD$l{@_0Itn`8rhI(DPXIj785H_(jgc&R>Ba*}ipx zj2Tgroj2=v{7bOd*b2b*_#Go3onAS${K8ebMBi_Gr!gP9-r4`r>59+aszDS2fk!5h z=F_kTe0vjg{B=B}_U&<3E)->!>dO_W%)W`sw*a_EZUkF?dj3b}iyb3ZPyK%4qlu4C zuXc>A21g2>k&5%CS!t&Ru4+48)QbQs5_}TDAOhi{QF-|}1RLrY!owlJ5W{!^K*dit ztKCrYi!blaPk*rW4uPK)Uk#(%JE%HJmv&>BW0WrU+!ehbDBRAU$7|vMEIBqA=g51z zyyvU_pMe=170qgf7Z56_QQ$pG8(0dot1wtm6V(-wB8!wr1k}P2)O=P9*I`x4ev*#k zNPN-yue3bFb~$oC|2{0isKodJ0KAZtLb;s!|4O`{k)FSi?thXU5V1+VyHW>jUo^@D zI{0wOSHg+X9)WVxfkMlJWdeN#2cuGJ1vgme8bloC<+u+jC8e-sSFtlv2t|s4p)ygabhoLA9;XZD;uw_V?h^cQ acSk!NrHgx(WLRD3+*J%k{zGs|i0XgyEyYa$ literal 0 HcmV?d00001 diff --git a/benchmarks/sweep/__pycache__/utils.cpython-312.pyc b/benchmarks/sweep/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22651d268276e52c0aad5194071992c034579a98 GIT binary patch literal 535 zcmZutJx>Bb5Z%2yKH`cA!H&k57(;PvVF4sY5~ZR56FVDnu!|hr9hbdBg;*G4=MTV= z5<}xpur#spLV?-V#*zdpD|Zo$F}!5w&CJWZ%|3E^cg==$_8eXa=AQdTbh&inElysUal?sQu9B!u+9%0nAj2h)3&u6Ba zf=n*tN#CunL+slwc1>6J4R%I-{j!tEWm_4FEwhTRFVef)c z7k7Nq^3raX$34R_eX~a^%MW8QVJ;W;vLalAHnQ!K=M%iob!D68^7Ol2><{9 literal 0 HcmV?d00001 diff --git a/benchmarks/sweep/cli.py b/benchmarks/sweep/cli.py new file mode 100644 index 0000000..108cd75 --- /dev/null +++ b/benchmarks/sweep/cli.py @@ -0,0 +1,38 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse + +from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG + +from .plot import SweepPlotArgs +from .plot import main as plot_main +from .serve import SweepServeArgs +from .serve import main as serve_main +from .serve_sla import SweepServeSLAArgs +from .serve_sla import main as serve_sla_main + +SUBCOMMANDS = ( + (SweepServeArgs, serve_main), + (SweepServeSLAArgs, serve_sla_main), + (SweepPlotArgs, plot_main), +) + + +def add_cli_args(parser: argparse.ArgumentParser): + subparsers = parser.add_subparsers(required=True, 
dest="sweep_type") + + for cmd, entrypoint in SUBCOMMANDS: + cmd_subparser = subparsers.add_parser( + cmd.parser_name, + description=cmd.parser_help, + usage=f"vllm bench sweep {cmd.parser_name} [options]", + ) + cmd_subparser.set_defaults(dispatch_function=entrypoint) + cmd.add_cli_args(cmd_subparser) + cmd_subparser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format( + subcmd=f"sweep {cmd.parser_name}" + ) + + +def main(args: argparse.Namespace): + args.dispatch_function(args) diff --git a/benchmarks/sweep/param_sweep.py b/benchmarks/sweep/param_sweep.py new file mode 100644 index 0000000..986561e --- /dev/null +++ b/benchmarks/sweep/param_sweep.py @@ -0,0 +1,91 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import json +import os +from typing import Any + + +class ParameterSweep(list["ParameterSweepItem"]): + @classmethod + def read_json(cls, filepath: os.PathLike): + with open(filepath, "rb") as f: + records = json.load(f) + + return cls.from_records(records) + + @classmethod + def from_records(cls, records: list[dict[str, object]]): + if not isinstance(records, list): + raise TypeError( + f"The parameter sweep should be a list of dictionaries, " + f"but found type: {type(records)}" + ) + + return cls(ParameterSweepItem.from_record(record) for record in records) + + +class ParameterSweepItem(dict[str, object]): + @classmethod + def from_record(cls, record: dict[str, object]): + if not isinstance(record, dict): + raise TypeError( + f"Each item in the parameter sweep should be a dictionary, " + f"but found type: {type(record)}" + ) + + return cls(record) + + def __or__(self, other: dict[str, Any]): + return type(self)(super().__or__(other)) + + # In JSON, we prefer "_" + def _iter_param_key_candidates(self, param_key: str): + # Inner config arguments are not converted by the CLI + if "." 
in param_key: + prefix, rest = param_key.split(".", 1) + for prefix_candidate in self._iter_param_key_candidates(prefix): + yield prefix_candidate + "." + rest + + return + + yield param_key + yield param_key.replace("-", "_") + yield param_key.replace("_", "-") + + # In CLI, we prefer "-" + def _iter_cmd_key_candidates(self, param_key: str): + for k in reversed(tuple(self._iter_param_key_candidates(param_key))): + yield "--" + k + + def _normalize_cmd_key(self, param_key: str): + return next(self._iter_cmd_key_candidates(param_key)) + + def has_param(self, param_key: str) -> bool: + return any(k in self for k in self._iter_param_key_candidates(param_key)) + + def apply_to_cmd(self, cmd: list[str]) -> list[str]: + cmd = list(cmd) + + for k, v in self.items(): + for k_candidate in self._iter_cmd_key_candidates(k): + try: + k_idx = cmd.index(k_candidate) + + if isinstance(v, bool): + cmd[k_idx] = self._normalize_cmd_key(k if v else "no-" + k) + else: + cmd[k_idx + 1] = str(v) + + break + except ValueError: + continue + else: + if isinstance(v, bool): + cmd.append(self._normalize_cmd_key(k if v else "no-" + k)) + else: + cmd.extend([self._normalize_cmd_key(k), str(v)]) + + return cmd + + def as_text(self, sep: str = ", ") -> str: + return sep.join(f"{k}={v}" for k, v in self.items()) diff --git a/benchmarks/sweep/plot.py b/benchmarks/sweep/plot.py new file mode 100644 index 0000000..9947d61 --- /dev/null +++ b/benchmarks/sweep/plot.py @@ -0,0 +1,580 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import json +from abc import ABC, abstractmethod +from concurrent.futures import ProcessPoolExecutor +from dataclasses import dataclass +from functools import partial +from pathlib import Path +from types import TracebackType +from typing import ClassVar + +from typing_extensions import Self, override + +from vllm.utils.collection_utils import full_groupby +from vllm.utils.import_utils import 
PlaceholderModule + +from .utils import sanitize_filename + +try: + import matplotlib.pyplot as plt + import pandas as pd + import seaborn as sns +except ImportError: + plt = PlaceholderModule("matplotlib").placeholder_attr("pyplot") + pd = PlaceholderModule("pandas") + seaborn = PlaceholderModule("seaborn") + + +@dataclass +class PlotFilterBase(ABC): + var: str + target: str + + @classmethod + def parse_str(cls, s: str): + for op_key in PLOT_FILTERS: + if op_key in s: + key, value = s.split(op_key) + return PLOT_FILTERS[op_key]( + key, + value.removeprefix(op_key).strip("'").strip('"'), + ) + else: + raise ValueError( + f"Invalid operator for plot filter '{s}'. " + f"Valid operators are: {sorted(PLOT_FILTERS)}", + ) + + @abstractmethod + def apply(self, df: "pd.DataFrame") -> "pd.DataFrame": + """Applies this filter to a DataFrame.""" + raise NotImplementedError + + +@dataclass +class PlotEqualTo(PlotFilterBase): + @override + def apply(self, df: "pd.DataFrame") -> "pd.DataFrame": + try: + target = float(self.target) + except ValueError: + target = self.target + + return df[df[self.var] == target] + + +@dataclass +class PlotLessThan(PlotFilterBase): + @override + def apply(self, df: "pd.DataFrame") -> "pd.DataFrame": + return df[df[self.var] < float(self.target)] + + +@dataclass +class PlotLessThanOrEqualTo(PlotFilterBase): + @override + def apply(self, df: "pd.DataFrame") -> "pd.DataFrame": + return df[df[self.var] <= float(self.target)] + + +@dataclass +class PlotGreaterThan(PlotFilterBase): + @override + def apply(self, df: "pd.DataFrame") -> "pd.DataFrame": + return df[df[self.var] > float(self.target)] + + +@dataclass +class PlotGreaterThanOrEqualTo(PlotFilterBase): + @override + def apply(self, df: "pd.DataFrame") -> "pd.DataFrame": + return df[df[self.var] >= float(self.target)] + + +# NOTE: The ordering is important! 
Match longer op_keys first +PLOT_FILTERS: dict[str, type[PlotFilterBase]] = { + "==": PlotEqualTo, + "<=": PlotLessThanOrEqualTo, + ">=": PlotGreaterThanOrEqualTo, + "<": PlotLessThan, + ">": PlotGreaterThan, +} + + +class PlotFilters(list[PlotFilterBase]): + @classmethod + def parse_str(cls, s: str): + if not s: + return cls() + + return cls(PlotFilterBase.parse_str(e) for e in s.split(",")) + + def apply(self, df: "pd.DataFrame") -> "pd.DataFrame": + for item in self: + df = item.apply(df) + + return df + + +@dataclass +class PlotBinner: + var: str + bin_size: float + + @classmethod + def parse_str(cls, s: str): + for op_key in PLOT_BINNERS: + if op_key in s: + key, value = s.split(op_key) + return PLOT_BINNERS[op_key](key, float(value.removeprefix(op_key))) + else: + raise ValueError( + f"Invalid operator for plot binner '{s}'. " + f"Valid operators are: {sorted(PLOT_BINNERS)}", + ) + + def apply(self, df: "pd.DataFrame") -> "pd.DataFrame": + """Applies this binner to a DataFrame.""" + df = df.copy() + df[self.var] = df[self.var] // self.bin_size * self.bin_size + return df + + +PLOT_BINNERS: dict[str, type[PlotBinner]] = { + "%": PlotBinner, +} + + +class PlotBinners(list[PlotBinner]): + @classmethod + def parse_str(cls, s: str): + if not s: + return cls() + + return cls(PlotBinner.parse_str(e) for e in s.split(",")) + + def apply(self, df: "pd.DataFrame") -> "pd.DataFrame": + for item in self: + df = item.apply(df) + + return df + + +def _json_load_bytes(path: Path) -> list[dict[str, object]]: + with path.open("rb") as f: + return json.load(f) + + +def _get_metric(run_data: dict[str, object], metric_key: str): + try: + return run_data[metric_key] + except KeyError as exc: + raise ValueError(f"Cannot find metric {metric_key!r} in {run_data=}") from exc + + +def _get_group(run_data: dict[str, object], group_keys: list[str]): + return tuple((k, str(_get_metric(run_data, k))) for k in group_keys) + + +def _get_fig_path(fig_dir: Path, group: tuple[tuple[str, str], 
...]): + parts = list[str]() + if group: + parts.extend(("FIGURE-", *(f"{k}={v}" for k, v in group))) + else: + parts.append("figure") + + return fig_dir / sanitize_filename("-".join(parts) + ".png") + + +class DummyExecutor: + map = map + + def __enter__(self) -> Self: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, + ) -> None: + return None + + +def _plot_fig( + fig_dir: Path, + fig_group_data: tuple[tuple[tuple[str, str], ...], list[dict[str, object]]], + row_by: list[str], + col_by: list[str], + curve_by: list[str], + *, + var_x: str, + var_y: str, + filter_by: PlotFilters, + bin_by: PlotBinners, + scale_x: str | None, + scale_y: str | None, + dry_run: bool, +): + fig_group, fig_data = fig_group_data + + row_groups = full_groupby( + fig_data, + key=lambda item: _get_group(item, row_by), + ) + num_rows = len(row_groups) + num_cols = max( + len(full_groupby(row_data, key=lambda item: _get_group(item, col_by))) + for _, row_data in row_groups + ) + + fig_path = _get_fig_path(fig_dir, fig_group) + + print("[BEGIN FIGURE]") + print(f"Group: {dict(fig_group)}") + print(f"Grid: {num_rows} rows x {num_cols} cols") + print(f"Output file: {fig_path}") + + if dry_run: + print("[END FIGURE]") + return + + df = pd.DataFrame.from_records(fig_data) + + if var_x not in df.columns: + raise ValueError( + f"Cannot find {var_x=!r} in parameter sweep results. " + f"Available variables: {df.columns.tolist()}" + ) + if var_y not in df.columns: + raise ValueError( + f"Cannot find {var_y=!r} in parameter sweep results. " + f"Available variables: {df.columns.tolist()}" + ) + for k in row_by: + if k not in df.columns: + raise ValueError( + f"Cannot find row_by={k!r} in parameter sweep results. " + f"Available variables: {df.columns.tolist()}" + ) + for k in col_by: + if k not in df.columns: + raise ValueError( + f"Cannot find col_by={k!r} in parameter sweep results. 
" + f"Available variables: {df.columns.tolist()}" + ) + for k in curve_by: + if k not in df.columns: + raise ValueError( + f"Cannot find curve_by={k!r} in parameter sweep results. " + f"Available variables: {df.columns.tolist()}" + ) + + df = filter_by.apply(df) + df = bin_by.apply(df) + + df["row_group"] = ( + pd.concat( + [k + "=" + df[k].astype(str) for k in row_by], + axis=1, + ).agg("\n".join, axis=1) + if row_by + else "(All)" + ) + + df["col_group"] = ( + pd.concat( + [k + "=" + df[k].astype(str) for k in col_by], + axis=1, + ).agg("\n".join, axis=1) + if col_by + else "(All)" + ) + + g = sns.FacetGrid(df, row="row_group", col="col_group") + + if row_by and col_by: + g.set_titles("{row_name}\n{col_name}") + elif row_by: + g.set_titles("{row_name}") + elif col_by: + g.set_titles("{col_name}") + else: + g.set_titles("") + + if scale_x: + g.set(xscale=scale_x) + if scale_y: + g.set(yscale=scale_y) + + if len(curve_by) <= 3: + hue, style, size, *_ = (*curve_by, None, None, None) + + g.map_dataframe( + sns.lineplot, + x=var_x, + y=var_y, + hue=hue, + style=style, + size=size, + markers=True, + ) + + g.add_legend(title=hue) + else: + df["curve_group"] = ( + pd.concat( + [k + "=" + df[k].astype(str) for k in curve_by], + axis=1, + ).agg("\n".join, axis=1) + if curve_by + else "(All)" + ) + + g.map_dataframe( + sns.lineplot, + x=var_x, + y=var_y, + hue="curve_group", + markers=True, + ) + + g.add_legend() + + g.savefig(fig_path) + plt.close(g.figure) + + print("[END FIGURE]") + + +def plot( + output_dir: Path, + fig_dir: Path, + fig_by: list[str], + row_by: list[str], + col_by: list[str], + curve_by: list[str], + *, + var_x: str, + var_y: str, + filter_by: PlotFilters, + bin_by: PlotBinners, + scale_x: str | None, + scale_y: str | None, + dry_run: bool, +): + all_data = [ + run_data + for path in output_dir.rglob("**/summary.json") + for run_data in _json_load_bytes(path) + ] + + if not all_data: + raise ValueError(f"Did not find any parameter sweep results under 
{output_dir}") + + fig_dir.mkdir(parents=True, exist_ok=True) + + fig_groups = full_groupby( + all_data, + key=lambda item: _get_group(item, fig_by), + ) + + with DummyExecutor() if len(fig_groups) <= 1 else ProcessPoolExecutor() as executor: + # Resolve the iterable to ensure that the workers are run + all( + executor.map( + partial( + _plot_fig, + fig_dir, + row_by=row_by, + col_by=col_by, + curve_by=curve_by, + var_x=var_x, + var_y=var_y, + filter_by=filter_by, + bin_by=bin_by, + scale_x=scale_x, + scale_y=scale_y, + dry_run=dry_run, + ), + fig_groups, + ) + ) + + +@dataclass +class SweepPlotArgs: + output_dir: Path + fig_dir: Path + fig_by: list[str] + row_by: list[str] + col_by: list[str] + curve_by: list[str] + var_x: str + var_y: str + filter_by: PlotFilters + bin_by: PlotBinners + scale_x: str | None + scale_y: str | None + dry_run: bool + + parser_name: ClassVar[str] = "plot" + parser_help: ClassVar[str] = "Plot performance curves from parameter sweep results." + + @classmethod + def from_cli_args(cls, args: argparse.Namespace): + output_dir = Path(args.OUTPUT_DIR) + if not output_dir.exists(): + raise ValueError(f"No parameter sweep results under {output_dir}") + + curve_by = [] if not args.curve_by else args.curve_by.split(",") + row_by = [] if not args.row_by else args.row_by.split(",") + col_by = [] if not args.col_by else args.col_by.split(",") + fig_by = [] if not args.fig_by else args.fig_by.split(",") + + return cls( + output_dir=output_dir, + fig_dir=output_dir / args.fig_dir, + fig_by=fig_by, + row_by=row_by, + col_by=col_by, + curve_by=curve_by, + var_x=args.var_x, + var_y=args.var_y, + filter_by=PlotFilters.parse_str(args.filter_by), + bin_by=PlotBinners.parse_str(args.bin_by), + scale_x=args.scale_x, + scale_y=args.scale_y, + dry_run=args.dry_run, + ) + + @classmethod + def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + parser.add_argument( + "OUTPUT_DIR", + type=str, + default="results", + help="The directory 
containing the results to plot, " + "i.e., the `--output-dir` argument to the parameter sweep script.", + ) + parser.add_argument( + "--fig-dir", + type=str, + default="", + help="The directory to save the figures, relative to `OUTPUT_DIR`. " + "By default, the same directory is used.", + ) + parser.add_argument( + "--fig-by", + type=str, + default="", + help="A comma-separated list of variables, such that a separate figure " + "is created for each combination of these variables.", + ) + parser.add_argument( + "--row-by", + type=str, + default="", + help="A comma-separated list of variables, such that a separate row " + "is created for each combination of these variables.", + ) + parser.add_argument( + "--col-by", + type=str, + default="", + help="A comma-separated list of variables, such that a separate column " + "is created for each combination of these variables.", + ) + parser.add_argument( + "--curve-by", + type=str, + default=None, + help="A comma-separated list of variables, such that a separate curve " + "is created for each combination of these variables.", + ) + parser.add_argument( + "--var-x", + type=str, + default="request_throughput", + help="The variable for the x-axis.", + ) + parser.add_argument( + "--var-y", + type=str, + default="p99_e2el_ms", + help="The variable for the y-axis", + ) + parser.add_argument( + "--filter-by", + type=str, + default="", + help="A comma-separated list of statements indicating values to filter by. " + "This is useful to remove outliers. " + "Example: `max_concurrency<1000,max_num_batched_tokens<=4096` means " + "plot only the points where `max_concurrency` is less than 1000 and " + "`max_num_batched_tokens` is no greater than 4096.", + ) + parser.add_argument( + "--bin-by", + type=str, + default="", + help="A comma-separated list of statements indicating values to bin by. " + "This is useful to avoid plotting points that are too close together. 
" + "Example: `request_throughput%%1` means " + "use a bin size of 1 for the `request_throughput` variable.", + ) + parser.add_argument( + "--scale-x", + type=str, + default=None, + help="The scale to use for the x-axis. " + "Currently only accepts string values such as 'log' and 'sqrt'. " + "See also: https://seaborn.pydata.org/generated/seaborn.objects.Plot.scale.html", + ) + parser.add_argument( + "--scale-y", + type=str, + default=None, + help="The scale to use for the y-axis. " + "Currently only accepts string values such as 'log' and 'sqrt'. " + "See also: https://seaborn.pydata.org/generated/seaborn.objects.Plot.scale.html", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="If set, prints the information about each figure to plot, " + "then exits without drawing them.", + ) + + return parser + + +def run_main(args: SweepPlotArgs): + return plot( + output_dir=args.output_dir, + fig_dir=args.fig_dir, + fig_by=args.fig_by, + row_by=args.row_by, + col_by=args.col_by, + curve_by=args.curve_by, + var_x=args.var_x, + var_y=args.var_y, + filter_by=args.filter_by, + bin_by=args.bin_by, + scale_x=args.scale_x, + scale_y=args.scale_y, + dry_run=args.dry_run, + ) + + +def main(args: argparse.Namespace): + run_main(SweepPlotArgs.from_cli_args(args)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=SweepPlotArgs.parser_help) + SweepPlotArgs.add_cli_args(parser) + + main(parser.parse_args()) diff --git a/benchmarks/sweep/serve.py b/benchmarks/sweep/serve.py new file mode 100644 index 0000000..45ac446 --- /dev/null +++ b/benchmarks/sweep/serve.py @@ -0,0 +1,416 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import contextlib +import json +import shlex +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import ClassVar + +from vllm.utils.import_utils import PlaceholderModule + +from 
.param_sweep import ParameterSweep, ParameterSweepItem +from .server import ServerProcess +from .utils import sanitize_filename + +try: + import pandas as pd +except ImportError: + pd = PlaceholderModule("pandas") + + +@contextlib.contextmanager +def run_server( + serve_cmd: list[str], + after_bench_cmd: list[str], + *, + show_stdout: bool, + serve_overrides: ParameterSweepItem, + dry_run: bool, +): + server_cmd = serve_overrides.apply_to_cmd(serve_cmd) + + print("[BEGIN SERVER]") + print(f"Server overrides: {serve_overrides}") + print(f"Server command: {server_cmd}") + + if dry_run: + yield None + print("[END SERVER]") + return + + with ServerProcess(server_cmd, after_bench_cmd, show_stdout=show_stdout) as server: + yield server + + print("[END SERVER]") + + +def _update_run_data( + run_data: dict[str, object], + serve_overrides: ParameterSweepItem, + bench_overrides: ParameterSweepItem, + run_number: int, +): + run_data["run_number"] = run_number + run_data.update(serve_overrides) + run_data.update(bench_overrides) + + return run_data + + +def run_benchmark( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_overrides: ParameterSweepItem, + bench_overrides: ParameterSweepItem, + run_number: int, + output_path: Path, + dry_run: bool, +): + benchmark_cmd = [ + *bench_overrides.apply_to_cmd(bench_cmd), + "--percentile-metrics", + "ttft,tpot,itl,e2el", + "--save-result", + "--result-dir", + str(output_path.parent), + "--result-filename", + output_path.name, + ] + + print("[BEGIN BENCHMARK]") + print(f"Benchmark overrides: {bench_overrides}") + print(f"Run Number: {run_number}") + print(f"Benchmark command: {benchmark_cmd}") + print(f"Output file: {output_path}") + + run_data: dict[str, object] + + if output_path.exists(): + print("Found existing results. 
Skipping.") + + with output_path.open("rb") as f: + run_data = json.load(f) + return _update_run_data( + run_data, + serve_overrides, + bench_overrides, + run_number, + ) + + if server is None: + if not dry_run: + raise ValueError(f"Cannot find results at {output_path}") + + print("[END BENCHMARK]") + return None + + output_path.parent.mkdir(parents=True, exist_ok=True) + + server.run_subcommand(benchmark_cmd) + server.after_bench() + + with output_path.open("rb") as f: + run_data = json.load(f) + + run_data = _update_run_data( + run_data, + serve_overrides, + bench_overrides, + run_number, + ) + + with output_path.open("w") as f: + json.dump(run_data, f, indent=4) + + print("[END BENCHMARK]") + + return run_data + + +def _get_comb_base_path( + output_dir: Path, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, +): + parts = list[str]() + if serve_comb: + parts.extend(("SERVE-", serve_comb.as_text(sep="-"))) + if bench_comb: + parts.extend(("BENCH-", bench_comb.as_text(sep="-"))) + + return output_dir / sanitize_filename("-".join(parts)) + + +def _get_comb_run_path(base_path: Path, run_number: int | None): + if run_number is None: + return base_path / "summary.json" + + return base_path / f"run={run_number}.json" + + +def _comb_needs_server( + serve_comb: ParameterSweepItem, + bench_combs: ParameterSweep, + output_dir: Path, +): + for bench_comb in bench_combs: + base_path = _get_comb_base_path(output_dir, serve_comb, bench_comb) + if not _get_comb_run_path(base_path, run_number=None).exists(): + return True + + return False + + +def run_comb( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + base_path: Path, + num_runs: int, + dry_run: bool, +): + comb_data = list[dict[str, object]]() + + for run_number in range(num_runs): + run_data = run_benchmark( + server, + bench_cmd, + serve_overrides=serve_comb, + bench_overrides=bench_comb, + run_number=run_number, + 
output_path=_get_comb_run_path(base_path, run_number), + dry_run=dry_run, + ) + + if run_data is not None: + comb_data.append(run_data) + + if dry_run: + return None + + with _get_comb_run_path(base_path, run_number=None).open("w") as f: + json.dump(comb_data, f, indent=4) + + return comb_data + + +def run_combs( + serve_cmd: list[str], + bench_cmd: list[str], + after_bench_cmd: list[str], + *, + show_stdout: bool, + serve_params: ParameterSweep, + bench_params: ParameterSweep, + output_dir: Path, + num_runs: int, + dry_run: bool, +): + all_data = list[dict[str, object]]() + for serve_comb in serve_params: + with ( + run_server( + serve_cmd, + after_bench_cmd, + show_stdout=show_stdout, + serve_overrides=serve_comb, + dry_run=dry_run, + ) + if _comb_needs_server(serve_comb, bench_params, output_dir) + else contextlib.nullcontext() + ) as server: + for bench_comb in bench_params: + base_path = _get_comb_base_path(output_dir, serve_comb, bench_comb) + + comb_data = run_comb( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb, + base_path=base_path, + num_runs=num_runs, + dry_run=dry_run, + ) + + if comb_data is not None: + all_data.extend(comb_data) + + if dry_run: + return None + + combined_df = pd.DataFrame.from_records(all_data) + combined_df.to_csv(output_dir / "summary.csv") + + return combined_df + + +@dataclass +class SweepServeArgs: + serve_cmd: list[str] + bench_cmd: list[str] + after_bench_cmd: list[str] + show_stdout: bool + serve_params: ParameterSweep + bench_params: ParameterSweep + output_dir: Path + num_runs: int + dry_run: bool + resume: str | None + + parser_name: ClassVar[str] = "serve" + parser_help: ClassVar[str] = "Run vLLM server benchmark under multiple settings." 
+ + @classmethod + def from_cli_args(cls, args: argparse.Namespace): + serve_cmd = shlex.split(args.serve_cmd) + bench_cmd = shlex.split(args.bench_cmd) + after_bench_cmd = ( + [] if args.after_bench_cmd is None else shlex.split(args.after_bench_cmd) + ) + + if args.serve_params: + serve_params = ParameterSweep.read_json(args.serve_params) + else: + # i.e.: run serve_cmd without any modification + serve_params = ParameterSweep.from_records([{}]) + + if args.bench_params: + bench_params = ParameterSweep.read_json(args.bench_params) + else: + # i.e.: run bench_cmd without any modification + bench_params = ParameterSweep.from_records([{}]) + + num_runs = args.num_runs + if num_runs < 1: + raise ValueError("`num_runs` should be at least 1.") + + return cls( + serve_cmd=serve_cmd, + bench_cmd=bench_cmd, + after_bench_cmd=after_bench_cmd, + show_stdout=args.show_stdout, + serve_params=serve_params, + bench_params=bench_params, + output_dir=Path(args.output_dir), + num_runs=num_runs, + dry_run=args.dry_run, + resume=args.resume, + ) + + @classmethod + def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + parser.add_argument( + "--serve-cmd", + type=str, + required=True, + help="The command used to run the server: `vllm serve ...`", + ) + parser.add_argument( + "--bench-cmd", + type=str, + required=True, + help="The command used to run the benchmark: `vllm bench serve ...`", + ) + parser.add_argument( + "--after-bench-cmd", + type=str, + default=None, + help="After a benchmark run is complete, invoke this command instead of " + "the default `ServerWrapper.clear_cache()`.", + ) + parser.add_argument( + "--show-stdout", + action="store_true", + help="If set, logs the standard output of subcommands. " + "Useful for debugging but can be quite spammy.", + ) + parser.add_argument( + "--serve-params", + type=str, + default=None, + help="Path to JSON file containing a list of parameter combinations " + "for the `vllm serve` command. 
" + "If both `serve_params` and `bench_params` are given, " + "this script will iterate over their Cartesian product.", + ) + parser.add_argument( + "--bench-params", + type=str, + default=None, + help="Path to JSON file containing a list of parameter combinations " + "for the `vllm bench serve` command. " + "If both `serve_params` and `bench_params` are given, " + "this script will iterate over their Cartesian product.", + ) + parser.add_argument( + "-o", + "--output-dir", + type=str, + default="results", + help="The directory to which results are written.", + ) + parser.add_argument( + "--num-runs", + type=int, + default=3, + help="Number of runs per parameter combination.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="If set, prints the commands to run, " + "then exits without executing them.", + ) + parser.add_argument( + "--resume", + type=str, + default=None, + help="Set this to the name of a directory under `output_dir` (which is a " + "timestamp) to resume a previous execution of this script, i.e., only run " + "parameter combinations for which there are still no output files.", + ) + + return parser + + +def run_main(args: SweepServeArgs): + timestamp = args.resume or datetime.now().strftime("%Y%m%d_%H%M%S") + output_dir = args.output_dir / timestamp + + if args.resume and not output_dir.exists(): + raise ValueError(f"Cannot resume from non-existent directory ({output_dir})") + + try: + return run_combs( + serve_cmd=args.serve_cmd, + bench_cmd=args.bench_cmd, + after_bench_cmd=args.after_bench_cmd, + show_stdout=args.show_stdout, + serve_params=args.serve_params, + bench_params=args.bench_params, + output_dir=output_dir, + num_runs=args.num_runs, + dry_run=args.dry_run, + ) + except BaseException as exc: + raise RuntimeError( + f"The script was terminated early. Use `--resume {timestamp}` " + f"to continue the script from its last checkpoint." 
+ ) from exc + + +def main(args: argparse.Namespace): + run_main(SweepServeArgs.from_cli_args(args)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=SweepServeArgs.parser_help) + SweepServeArgs.add_cli_args(parser) + + main(parser.parse_args()) diff --git a/benchmarks/sweep/serve_sla.py b/benchmarks/sweep/serve_sla.py new file mode 100644 index 0000000..0403d1d --- /dev/null +++ b/benchmarks/sweep/serve_sla.py @@ -0,0 +1,492 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import contextlib +import json +import math +from dataclasses import asdict, dataclass +from datetime import datetime +from pathlib import Path +from typing import ClassVar, Literal, get_args + +from typing_extensions import assert_never + +from vllm.utils.import_utils import PlaceholderModule + +from .param_sweep import ParameterSweep, ParameterSweepItem +from .serve import SweepServeArgs, run_benchmark, run_server +from .server import ServerProcess +from .sla_sweep import SLASweep, SLASweepItem +from .utils import sanitize_filename + +try: + import pandas as pd +except ImportError: + pd = PlaceholderModule("pandas") + + +def _get_sla_base_path( + output_dir: Path, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, +): + parts = list[str]() + if serve_comb: + parts.extend(("SERVE-", serve_comb.as_text(sep="-"))) + if bench_comb: + parts.extend(("BENCH-", bench_comb.as_text(sep="-"))) + + return output_dir / sanitize_filename("-".join(parts)) + + +def _get_sla_iter_path( + base_path: Path, + sla_comb: SLASweepItem, + sla_variable: str, + sla_value: int | None, +): + if sla_value is None: + prefix = sla_comb.as_text(sep="-") + return base_path / f"SLA--{prefix}.json" + + return base_path / f"{sla_variable}={sla_value}" + + +def _get_sla_run_path(iter_path: Path, run_number: int | None): + if run_number is None: + return iter_path / "summary.json" + + return iter_path / 
f"run={run_number}.json" + + +def _sla_needs_server( + serve_comb: ParameterSweepItem, + bench_combs: ParameterSweep, + sla_combs: SLASweep, + sla_variable: str, + output_dir: Path, +): + for bench_comb in bench_combs: + base_path = _get_sla_base_path(output_dir, serve_comb, bench_comb) + for sla_comb in sla_combs: + if not _get_sla_iter_path( + base_path, + sla_comb, + sla_variable, + sla_value=None, + ).exists(): + return True + + return False + + +def run_sla( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + iter_path: Path, + num_runs: int, + dry_run: bool, +): + iter_data = list[dict[str, object]]() + + for run_number in range(num_runs): + run_data = run_benchmark( + server, + bench_cmd, + serve_overrides=serve_comb, + bench_overrides=bench_comb, + run_number=run_number, + output_path=_get_sla_run_path(iter_path, run_number), + dry_run=dry_run, + ) + + if run_data is not None: + iter_data.append(run_data) + + if dry_run: + return None + + with _get_sla_run_path(iter_path, run_number=None).open("w") as f: + json.dump(iter_data, f, indent=4) + + return iter_data + + +SLAVariable = Literal["request_rate", "max_concurrency"] + + +def _estimate_sla_value(run_data: dict[str, object], sla_variable: SLAVariable): + request_throughput = float(run_data["request_throughput"]) # type: ignore + if sla_variable == "request_rate": + return request_throughput + if sla_variable == "max_concurrency": + mean_latency_ms = float(run_data["mean_e2el_ms"]) # type: ignore + return request_throughput * mean_latency_ms / 1000 + + assert_never(sla_variable) + + +def _estimate_sla_bounds( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + sla_comb: SLASweepItem, + base_path: Path, + num_runs: int, + dry_run: bool, + sla_variable: SLAVariable, + init_value: int, + max_value: int, +): + sla_data = list[dict[str, object]]() + + 
max_passing: int = 0 + min_failing: int = 0 + + val: int = init_value + assert val > 0 + + while True: + print(f"Testing {sla_variable}: {val} req/s") + + iter_data = run_sla( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb | {sla_variable: val}, + iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, val), + num_runs=num_runs, + dry_run=dry_run, + ) + + assert iter_data is not None + sla_data.extend(iter_data) + + iter_data_mean = { + k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data) # type: ignore + for k in sla_comb + } + + sla_results = [ + criterion.print_and_validate(iter_data_mean, k) + for k, criterion in sla_comb.items() + ] + + if all(sla_results): + print("SLA criteria are met.") + max_passing = val + val *= 2 + else: + print("SLA criteria are not met.") + min_failing = val + break + + if val >= max_value: + break + + return sla_data, (max_passing, min_failing) + + +def _find_sla_value( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + sla_comb: SLASweepItem, + base_path: Path, + num_runs: int, + dry_run: bool, + sla_variable: SLAVariable, + min_value: int, + max_value: int, +): + sla_data = list[dict[str, object]]() + + left: int = min_value + right: int = max_value + + while True: + val = (left + right) // 2 + print(f"Testing {sla_variable}: {val} req/s") + + iter_data = run_sla( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb | {sla_variable: val}, + iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, val), + num_runs=num_runs, + dry_run=dry_run, + ) + + assert iter_data is not None + sla_data.extend(iter_data) + + iter_data_mean = { + k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data) # type: ignore + for k in sla_comb + } + + sla_results = [ + criterion.print_and_validate(iter_data_mean, k) + for k, criterion in sla_comb.items() + ] + + if all(sla_results): + 
print("SLA criteria are met.") + left = val + else: + print("SLA criteria are not met.") + right = val + + if right - left <= 1: + break + + return sla_data, left + + +def search_sla( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + sla_comb: SLASweepItem, + sla_variable: SLAVariable, + sla_inf_value: int = 65536, # The value that represents infinite QPS + base_path: Path, + num_runs: int, + dry_run: bool, +): + print("[SLA START]") + print(f"SLA criteria: {sla_comb.as_text()}") + + sla_data_0 = run_sla( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb | {sla_variable: sla_inf_value}, + iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, sla_inf_value), + num_runs=num_runs, + dry_run=dry_run, + ) + if sla_data_0 is None: + assert dry_run + print("Omitting SLA search.") + print("[SLA END]") + return None + + sla_init_value = math.ceil( + sum(_estimate_sla_value(item, sla_variable) for item in sla_data_0) + / len(sla_data_0) + ) + print(f"Initial {sla_variable} to search: {sla_init_value} req/s.") + + sla_data_1, (sla_min, sla_max) = _estimate_sla_bounds( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb, + sla_comb=sla_comb, + base_path=base_path, + num_runs=num_runs, + dry_run=dry_run, + sla_variable=sla_variable, + init_value=sla_init_value, + max_value=sla_inf_value, + ) + print(f"Range of {sla_variable} to search: [{sla_min}, {sla_max}] req/s.") + + sla_data_2, sla_value = _find_sla_value( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb, + sla_comb=sla_comb, + base_path=base_path, + num_runs=num_runs, + dry_run=dry_run, + sla_variable=sla_variable, + min_value=sla_min, + max_value=sla_max, + ) + + sla_data = sla_data_0 + sla_data_1 + sla_data_2 + print(f"Maximum {sla_variable} for SLA: {sla_value} req/s.") + + with _get_sla_iter_path( + base_path, + sla_comb, + sla_variable, + sla_value=None, + 
).open("w") as f: + json.dump(sla_data, f, indent=4) + + print("[SLA END]") + + return sla_data + + +def run_slas( + serve_cmd: list[str], + bench_cmd: list[str], + after_bench_cmd: list[str], + *, + show_stdout: bool, + serve_params: ParameterSweep, + bench_params: ParameterSweep, + sla_params: SLASweep, + sla_variable: SLAVariable, + output_dir: Path, + num_runs: int, + dry_run: bool, +): + if any(bench_comb.has_param(sla_variable) for bench_comb in bench_params): + raise ValueError( + f"You should not override `{sla_variable}` in `bench_params` in SLA mode, " + "since it is supposed to be determined automatically." + ) + + all_data = list[dict[str, object]]() + for serve_comb in serve_params: + with ( + run_server( + serve_cmd, + after_bench_cmd, + show_stdout=show_stdout, + serve_overrides=serve_comb, + dry_run=dry_run, + ) + if _sla_needs_server( + serve_comb, + bench_params, + sla_params, + sla_variable, + output_dir, + ) + else contextlib.nullcontext() + ) as server: + for bench_comb in bench_params: + for sla_comb in sla_params: + base_path = _get_sla_base_path(output_dir, serve_comb, bench_comb) + + comb_data = search_sla( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb, + sla_comb=sla_comb, + sla_variable=sla_variable, + base_path=base_path, + num_runs=num_runs, + dry_run=dry_run, + ) + + if comb_data is not None: + all_data.extend(comb_data) + + if dry_run: + return None + + combined_df = pd.DataFrame.from_records(all_data) + combined_df.to_csv(output_dir / "summary.csv") + + return combined_df + + +@dataclass +class SweepServeSLAArgs(SweepServeArgs): + sla_params: SLASweep + sla_variable: SLAVariable + + parser_name: ClassVar[str] = "serve_sla" + parser_help: ClassVar[str] = "Tune a variable to meet SLAs under multiple settings." 
def run_main(args: SweepServeSLAArgs):
    """Resolve the output directory and run the SLA sweep.

    The run directory is `args.output_dir / <timestamp>`, where the timestamp
    is `args.resume` when given, or the current local time formatted as
    `%Y%m%d_%H%M%S` otherwise.

    Raises:
        ValueError: If `args.resume` is set but its directory does not exist.
        RuntimeError: If the sweep stops for any reason (including
            `KeyboardInterrupt`); the message tells the user which
            `--resume` value continues the run. The original exception is
            chained as the cause.
    """
    resume_from = args.resume
    if resume_from:
        timestamp = resume_from
    else:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    output_dir = args.output_dir / timestamp
    if resume_from and not output_dir.exists():
        raise ValueError(f"Cannot resume from non-existent directory ({output_dir})")

    try:
        # Built inside the `try` so that a failure while reading `args`
        # is reported with the same resume hint as a failure of the sweep.
        sweep_kwargs = dict(
            serve_cmd=args.serve_cmd,
            bench_cmd=args.bench_cmd,
            after_bench_cmd=args.after_bench_cmd,
            show_stdout=args.show_stdout,
            serve_params=args.serve_params,
            bench_params=args.bench_params,
            sla_params=args.sla_params,
            sla_variable=args.sla_variable,
            output_dir=output_dir,
            num_runs=args.num_runs,
            dry_run=args.dry_run,
        )
        return run_slas(**sweep_kwargs)
    except BaseException as exc:
        raise RuntimeError(
            f"The script was terminated early. Use `--resume {timestamp}` "
            f"to continue the script from its last checkpoint."
        ) from exc
Server and Engine processes + os.killpg(os.getpgid(server_process.pid), signal.SIGKILL) + + def run_subcommand(self, cmd: list[str]): + return subprocess.run( + cmd, + stdout=None if self.show_stdout else subprocess.DEVNULL, + check=True, + ) + + def after_bench(self) -> None: + if not self.after_bench_cmd: + self.reset_caches() + return + + self.run_subcommand(self.after_bench_cmd) + + def _get_vllm_server_address(self) -> str: + server_cmd = self.server_cmd + + for host_key in ("--host",): + if host_key in server_cmd: + host = server_cmd[server_cmd.index(host_key) + 1] + break + else: + host = "localhost" + + for port_key in ("-p", "--port"): + if port_key in server_cmd: + port = int(server_cmd[server_cmd.index(port_key) + 1]) + break + else: + port = 8000 # The default value in vllm serve + + return f"http://{host}:{port}" + + def reset_caches(self) -> None: + server_cmd = self.server_cmd + + # Use `.endswith()` to match `/bin/...` + if server_cmd[0].endswith("vllm"): + server_address = self._get_vllm_server_address() + print(f"Resetting caches at {server_address}") + + res = requests.post(f"{server_address}/reset_prefix_cache") + res.raise_for_status() + + res = requests.post(f"{server_address}/reset_mm_cache") + res.raise_for_status() + elif server_cmd[0].endswith("infinity_emb"): + if "--vector-disk-cache" in server_cmd: + raise NotImplementedError( + "Infinity server uses caching but does not expose a method " + "to reset the cache" + ) + else: + raise NotImplementedError( + f"No implementation of `reset_caches` for `{server_cmd[0]}` server. " + "Please specify a custom command via `--after-bench-cmd`." 
@dataclass
class SLACriterionBase(ABC):
    """A single SLA constraint that compares a measured metric to `target`."""

    # Threshold that the measured value is compared against.
    target: float

    @abstractmethod
    def validate(self, actual: float) -> bool:
        """Return `True` if this criterion is met; otherwise `False`."""
        raise NotImplementedError

    @abstractmethod
    def format_cond(self, lhs: str) -> str:
        """Render the condition as text with `lhs` on the left-hand side."""
        raise NotImplementedError

    def print_and_validate(
        self,
        metrics: dict[str, float],
        metrics_key: str,
    ) -> bool:
        """Check `metrics[metrics_key]` against this criterion and print it.

        Raises:
            KeyError: If `metrics_key` is not present in `metrics`.
        """
        metric = metrics[metrics_key]
        result = self.validate(metric)

        cond = self.format_cond(f"{metrics_key} = {metric:.2f}")
        print(f"Validating SLA: {cond} | " + ("PASSED" if result else "FAILED"))

        return result


@dataclass
class SLALessThan(SLACriterionBase):
    """Met when the measured value is strictly below `target`."""

    @override
    def validate(self, actual: float) -> bool:
        return actual < self.target

    @override
    def format_cond(self, lhs: str) -> str:
        return f"{lhs}<{self.target:.2f}"


@dataclass
class SLALessThanOrEqualTo(SLACriterionBase):
    """Met when the measured value is at most `target`."""

    @override
    def validate(self, actual: float) -> bool:
        return actual <= self.target

    @override
    def format_cond(self, lhs: str) -> str:
        return f"{lhs}<={self.target:.2f}"


@dataclass
class SLAGreaterThan(SLACriterionBase):
    """Met when the measured value is strictly above `target`."""

    @override
    def validate(self, actual: float) -> bool:
        return actual > self.target

    @override
    def format_cond(self, lhs: str) -> str:
        return f"{lhs}>{self.target:.2f}"


@dataclass
class SLAGreaterThanOrEqualTo(SLACriterionBase):
    """Met when the measured value is at least `target`."""

    @override
    def validate(self, actual: float) -> bool:
        return actual >= self.target

    @override
    def format_cond(self, lhs: str) -> str:
        return f"{lhs}>={self.target:.2f}"


# NOTE: The ordering is important! Match longer op_keys first
SLA_CRITERIA: dict[str, type[SLACriterionBase]] = {
    "<=": SLALessThanOrEqualTo,
    ">=": SLAGreaterThanOrEqualTo,
    "<": SLALessThan,
    ">": SLAGreaterThan,
}


class SLASweep(list["SLASweepItem"]):
    """A list of SLA constraint sets, one `SLASweepItem` per entry."""

    @classmethod
    def read_json(cls, filepath: os.PathLike):
        """Load a sweep from a JSON file containing a list of records."""
        with open(filepath, "rb") as f:
            records = json.load(f)

        return cls.from_records(records)

    @classmethod
    def from_records(cls, records: list[dict[str, str]]):
        """Build a sweep from a list of `{metric: "<op><target>"}` records.

        Raises:
            TypeError: If `records` is not a list.
        """
        if not isinstance(records, list):
            raise TypeError(
                f"The SLA sweep should be a list of dictionaries, "
                f"but found type: {type(records)}"
            )

        return cls(SLASweepItem.from_record(record) for record in records)


class SLASweepItem(dict[str, SLACriterionBase]):
    """One set of SLA constraints, keyed by metric name."""

    @classmethod
    def from_record(cls, record: dict[str, str]):
        """Parse a `{metric: "<op><target>"}` record, e.g. `{"p99": "<=500"}`.

        Surrounding whitespace in the constraint string is ignored.

        Raises:
            TypeError: If `record` is not a dict, or a constraint is not a
                string (e.g. a bare JSON number instead of `"<=500"`).
            ValueError: If the constraint does not start with a known
                operator, or its target is not a valid number.
        """
        if not isinstance(record, dict):
            raise TypeError(
                f"Each SLA constraint set should be a dictionary, "
                f"but found type: {type(record)}"
            )

        sla_criteria: dict[str, SLACriterionBase] = {}

        for metric_key, metric_value in record.items():
            if not isinstance(metric_value, str):
                raise TypeError(
                    f"SLA constraint for '{metric_key}' should be a string "
                    f"such as '<=500', but found type: {type(metric_value)}"
                )

            constraint = metric_value.strip()
            # SLA_CRITERIA lists two-character operators first, so "<=" is
            # never mistaken for "<".
            for op_key, criterion_cls in SLA_CRITERIA.items():
                if constraint.startswith(op_key):
                    raw_target = constraint.removeprefix(op_key)
                    try:
                        target = float(raw_target)
                    except ValueError as exc:
                        raise ValueError(
                            f"Invalid target value for "
                            f"SLA constraint '{metric_key}={metric_value}'. "
                            f"Expected a number after '{op_key}'."
                        ) from exc

                    sla_criteria[metric_key] = criterion_cls(target)
                    break
            else:
                raise ValueError(
                    f"Invalid operator for "
                    f"SLA constraint '{metric_key}={metric_value}'. "
                    f"Valid operators are: {sorted(SLA_CRITERIA)}",
                )

        return cls(sla_criteria)

    def as_text(self, sep: str = ", ") -> str:
        """Join every constraint's textual condition with `sep`."""
        return sep.join(v.format_cond(k) for k, v in self.items())
disable_detokenize: bool = False, +) -> tuple[float, list[RequestOutput] | None]: + from vllm import LLM, SamplingParams + + llm = LLM(**dataclasses.asdict(engine_args)) + assert all( + llm.llm_engine.model_config.max_model_len + >= (request.prompt_len + request.expected_output_len) + for request in requests + ), ( + "Please ensure that max_model_len is greater than the sum of" + " prompt_len and expected_output_len for all requests." + ) + # Add the requests to the engine. + prompts: list[TextPrompt | TokensPrompt] = [] + sampling_params: list[SamplingParams] = [] + for request in requests: + prompt = ( + TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"]) + if "prompt_token_ids" in request.prompt + else TextPrompt(prompt=request.prompt) + ) + if request.multi_modal_data: + assert isinstance(request.multi_modal_data, dict) + prompt["multi_modal_data"] = request.multi_modal_data + prompts.append(prompt) + + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + detokenize=not disable_detokenize, + ) + ) + lora_requests: list[LoRARequest] | None = None + if engine_args.enable_lora: + lora_requests = [request.lora_request for request in requests] + + use_beam_search = False + + outputs = None + if not use_beam_search: + start = time.perf_counter() + if do_profile: + llm.start_profile() + outputs = llm.generate( + prompts, sampling_params, lora_request=lora_requests, use_tqdm=True + ) + if do_profile: + llm.stop_profile() + end = time.perf_counter() + else: + assert lora_requests is None, "BeamSearch API does not support LoRA" + prompts = [request.prompt for request in requests] + # output_len should be the same for all requests. 
def run_vllm_chat(
    requests: list[SampleRequest],
    n: int,
    engine_args: EngineArgs,
    do_profile: bool,
    disable_detokenize: bool = False,
) -> tuple[float, list[RequestOutput]]:
    """
    Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
    multimodal models as it properly handles multimodal inputs and chat
    formatting. For non-multimodal models, use run_vllm() instead.

    Returns the wall-clock seconds spent in `llm.chat` (including the
    profiler start/stop calls when `do_profile` is set) together with the
    outputs it produced.
    """
    from vllm import LLM, SamplingParams

    llm = LLM(**dataclasses.asdict(engine_args))

    # Every request must fit in the model's context window.
    for request in requests:
        assert (
            llm.llm_engine.model_config.max_model_len
            >= (request.prompt_len + request.expected_output_len)
        ), (
            "Please ensure that max_model_len is greater than the sum of "
            "prompt_len and expected_output_len for all requests."
        )

    prompts = [request.prompt for request in requests]
    sampling_params: list[SamplingParams] = [
        SamplingParams(
            n=n,
            temperature=1.0,
            top_p=1.0,
            ignore_eos=True,
            max_tokens=request.expected_output_len,
            detokenize=not disable_detokenize,
        )
        for request in requests
    ]

    started_at = time.perf_counter()
    if do_profile:
        llm.start_profile()
    outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
    if do_profile:
        llm.stop_profile()
    elapsed = time.perf_counter() - started_at

    return elapsed, outputs
+ prompts: list[TextPrompt | TokensPrompt] = [] + sampling_params: list[SamplingParams] = [] + lora_requests: list[LoRARequest | None] = [] + for request in requests: + prompt = ( + TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"]) + if "prompt_token_ids" in request.prompt + else TextPrompt(prompt=request.prompt) + ) + + if request.multi_modal_data: + assert isinstance(request.multi_modal_data, dict) + prompt["multi_modal_data"] = request.multi_modal_data + + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + detokenize=not disable_detokenize, + ) + ) + prompts.append(prompt) + lora_requests.append(request.lora_request) + + generators = [] + start = time.perf_counter() + if do_profile: + await llm.start_profile() + for i, (prompt, sp, lr) in enumerate( + zip(prompts, sampling_params, lora_requests) + ): + generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}") + generators.append(generator) + all_gens = merge_async_iterators(*generators) + async for i, res in all_gens: + pass + if do_profile: + await llm.stop_profile() + end = time.perf_counter() + return end - start + + +def run_hf( + requests: list[SampleRequest], + model: str, + tokenizer: PreTrainedTokenizerBase, + n: int, + max_batch_size: int, + trust_remote_code: bool, + disable_detokenize: bool = False, +) -> float: + llm = AutoModelForCausalLM.from_pretrained( + model, dtype=torch.float16, trust_remote_code=trust_remote_code + ) + if llm.config.model_type == "llama": + # To enable padding in the HF backend. + tokenizer.pad_token = tokenizer.eos_token + llm = llm.cuda() + + pbar = tqdm(total=len(requests)) + start = time.perf_counter() + batch: list[str] = [] + max_prompt_len = 0 + max_output_len = 0 + for i in range(len(requests)): + prompt = requests[i].prompt + prompt_len = requests[i].prompt_len + output_len = requests[i].expected_output_len + # Add the prompt to the batch. 
def save_to_pytorch_benchmark_format(
    args: argparse.Namespace, results: dict[str, Any]
) -> None:
    """Write `results` next to `args.output_json` in PyTorch benchmark format.

    The throughput numbers are passed as metrics and the run totals as extra
    info. Nothing is written when the conversion yields no records.
    """
    metrics = {
        "requests_per_second": [results["requests_per_second"]],
        "tokens_per_second": [results["tokens_per_second"]],
    }
    extra_info = {}
    for key in ["elapsed_time", "num_requests", "total_num_tokens"]:
        extra_info[key] = results[key]

    pt_records = convert_to_pytorch_benchmark_format(
        args=args,
        metrics=metrics,
        extra_info=extra_info,
    )
    if not pt_records:
        return

    # Don't use json suffix here as we don't want CI to pick it up
    stem, _ = os.path.splitext(args.output_json)
    write_to_json(f"{stem}.pytorch.json", pt_records)
+ common_kwargs = { + "dataset_path": args.dataset_path, + "random_seed": args.seed, + } + sample_kwargs = { + "tokenizer": tokenizer, + "lora_path": args.lora_path, + "max_loras": args.max_loras, + "num_requests": args.num_prompts, + "input_len": args.input_len, + "output_len": args.output_len, + } + + if args.dataset_path is None or args.dataset_name == "random": + sample_kwargs["range_ratio"] = args.random_range_ratio + sample_kwargs["prefix_len"] = args.prefix_len + dataset_cls = RandomDataset + elif args.dataset_name == "sharegpt": + dataset_cls = ShareGPTDataset + if args.backend == "vllm-chat": + sample_kwargs["enable_multimodal_chat"] = True + elif args.dataset_name == "sonnet": + assert tokenizer.chat_template or tokenizer.default_chat_template, ( + "Tokenizer/model must have chat template for sonnet dataset." + ) + dataset_cls = SonnetDataset + sample_kwargs["prefix_len"] = args.prefix_len + sample_kwargs["return_prompt_formatted"] = True + elif args.dataset_name == "burstgpt": + dataset_cls = BurstGPTDataset + elif args.dataset_name == "hf": + if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: + dataset_cls = VisionArenaDataset + common_kwargs["dataset_subset"] = None + common_kwargs["dataset_split"] = "train" + sample_kwargs["enable_multimodal_chat"] = True + elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: + dataset_cls = InstructCoderDataset + common_kwargs["dataset_split"] = "train" + elif args.dataset_path in MultiModalConversationDataset.SUPPORTED_DATASET_PATHS: + dataset_cls = MultiModalConversationDataset + common_kwargs["dataset_subset"] = args.hf_subset + common_kwargs["dataset_split"] = args.hf_split + sample_kwargs["enable_multimodal_chat"] = True + elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: + dataset_cls = ConversationDataset + common_kwargs["dataset_subset"] = args.hf_subset + common_kwargs["dataset_split"] = args.hf_split + sample_kwargs["enable_multimodal_chat"] = True + 
def validate_args(args):
    """
    Validate command-line arguments.

    Besides checking, this normalizes `args` in place:

    - `--dataset` (deprecated) is copied into `dataset_path`.
    - `tokenizer` defaults to `model`.
    - `dataset_name` becomes "random" when no dataset path is given
      (except for the "prefix_repetition" dataset).

    Warnings are emitted for options that are set but will be ignored by the
    selected dataset. `--random-range-ratio` and `--prefix-len` only warn when
    they hold a non-default (non-zero) value, because their CLI defaults are
    `0.0` and `0` rather than `None`.

    Raises:
        ValueError: For an unsupported backend, a missing required option, or
            a combination of options that the selected backend or dataset
            cannot handle.
        AssertionError: If an "hf" dataset is used with the wrong backend.
    """

    # === Deprecation and Defaulting ===
    if args.dataset is not None:
        warnings.warn(
            "The '--dataset' argument will be deprecated in the next release. "
            "Please use '--dataset-name' and '--dataset-path' instead.",
            stacklevel=2,
        )
        args.dataset_path = args.dataset

    if not getattr(args, "tokenizer", None):
        args.tokenizer = args.model

    # === Backend Validation ===
    valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
    if args.backend not in valid_backends:
        raise ValueError(f"Unsupported backend: {args.backend}")

    # === Dataset Configuration ===
    if (
        not args.dataset
        and not args.dataset_path
        and args.dataset_name not in {"prefix_repetition"}
    ):
        print("When dataset path is not set, it will default to random dataset")
        args.dataset_name = "random"
        if args.input_len is None:
            raise ValueError("input_len must be provided for a random dataset")

    # === Dataset Name Specific Checks ===
    # --hf-subset and --hf-split: only used
    # when dataset_name is 'hf'
    if args.dataset_name != "hf" and (
        getattr(args, "hf_subset", None) is not None
        or getattr(args, "hf_split", None) is not None
    ):
        # Adjacent literals instead of a backslash continuation, which would
        # embed the source indentation in the message.
        warnings.warn(
            "--hf-subset and --hf-split will be ignored "
            "since --dataset-name is not 'hf'.",
            stacklevel=2,
        )
    elif args.dataset_name == "hf":
        if args.dataset_path in (
            VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
            | MultiModalConversationDataset.SUPPORTED_DATASET_PATHS
            | ConversationDataset.SUPPORTED_DATASET_PATHS
        ):
            assert args.backend == "vllm-chat", (
                f"{args.dataset_path} needs to use vllm-chat as the backend."
            )
        elif args.dataset_path in (
            InstructCoderDataset.SUPPORTED_DATASET_PATHS
            | AIMODataset.SUPPORTED_DATASET_PATHS
        ):
            assert args.backend == "vllm", (
                f"{args.dataset_path} needs to use vllm as the backend."
            )
        else:
            raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")

    # --random-range-ratio: only used when dataset_name is 'random'
    # The CLI default is 0.0, so only a non-zero value means the user set it.
    if args.dataset_name != "random" and args.random_range_ratio:
        warnings.warn(
            "--random-range-ratio will be ignored since "
            "--dataset-name is not 'random'.",
            stacklevel=2,
        )

    # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
    # set.
    # The CLI default is 0, so only a non-zero value means the user set it.
    if args.dataset_name not in {"random", "sonnet", None} and args.prefix_len:
        warnings.warn(
            "--prefix-len will be ignored since --dataset-name "
            "is not 'random', 'sonnet', or not set.",
            stacklevel=2,
        )

    # === LoRA Settings ===
    if getattr(args, "enable_lora", False) and args.backend != "vllm":
        raise ValueError("LoRA benchmarking is only supported for vLLM backend")
    if getattr(args, "enable_lora", False) and args.lora_path is None:
        raise ValueError("LoRA path must be provided when enable_lora is True")

    # === Backend-specific Validations ===
    if args.backend == "hf" and args.hf_max_batch_size is None:
        raise ValueError("HF max batch size is required for HF backend")
    if args.backend != "hf" and args.hf_max_batch_size is not None:
        raise ValueError("HF max batch size is only for HF backend.")

    if (
        args.backend in {"hf", "mii"}
        and getattr(args, "quantization", None) is not None
    ):
        raise ValueError("Quantization is only for vLLM backend.")

    if args.backend == "mii" and args.dtype != "auto":
        raise ValueError("dtype must be auto for MII backend.")
    if args.backend == "mii" and args.n != 1:
        raise ValueError("n must be 1 for MII backend.")
    if args.backend == "mii" and args.tokenizer != args.model:
        raise ValueError("Tokenizer must be the same as the model for MII backend.")

    if args.data_parallel_size > 1 and (
        args.distributed_executor_backend != "external_launcher" or args.async_engine
    ):
        # --data-parallel is not supported fully.
        # Old issue: https://github.com/vllm-project/vllm/issues/16222
        # Currently we only support data parallel with external launcher
        # mode (i.e., launch with torchrun).
        raise ValueError(
            "Data parallel is only supported with external launcher mode "
            "with synchronous engine in offline benchmark, "
            "please use benchmark serving instead"
        )
+ ) + parser.add_argument( + "--hf-max-batch-size", + type=int, + default=None, + help="Maximum batch size for HF backend.", + ) + parser.add_argument( + "--output-json", + type=str, + default=None, + help="Path to save the throughput results in JSON format.", + ) + parser.add_argument( + "--async-engine", + action="store_true", + default=False, + help="Use vLLM async engine rather than LLM class.", + ) + parser.add_argument( + "--disable-frontend-multiprocessing", + action="store_true", + default=False, + help="Disable decoupled async engine frontend.", + ) + parser.add_argument( + "--disable-detokenize", + action="store_true", + help=( + "Do not detokenize the response (i.e. do not include " + "detokenization time in the measurement)" + ), + ) + # LoRA + parser.add_argument( + "--lora-path", + type=str, + default=None, + help="Path to the lora adapters to use. This can be an absolute path, " + "a relative path, or a Hugging Face model identifier.", + ) + parser.add_argument( + "--prefix-len", + type=int, + default=0, + help="Number of fixed prefix tokens before the random " + "context in a request (default: 0).", + ) + # random dataset + parser.add_argument( + "--random-range-ratio", + type=float, + default=0.0, + help="Range ratio for sampling input/output length, " + "used only for RandomDataset. Must be in the range [0, 1) to define " + "a symmetric sampling range " + "[length * (1 - range_ratio), length * (1 + range_ratio)].", + ) + + # hf dtaset + parser.add_argument( + "--hf-subset", type=str, default=None, help="Subset of the HF dataset." + ) + parser.add_argument( + "--hf-split", type=str, default=None, help="Split of the HF dataset." + ) + parser.add_argument( + "--profile", + action="store_true", + default=False, + help="Use Torch Profiler. 
def main(args: argparse.Namespace):
    """Run the offline throughput benchmark selected by `args` and report it.

    Steps: validate and default the arguments, seed `random`, sample the
    requests, run the chosen backend, count prompt/output tokens, print the
    throughput, and optionally dump the results to `args.output_json`.

    Raises:
        NotImplementedError: If profiling is requested for the "hf" backend.
        ValueError: If `args.backend` has no runner here.
    """
    if args.tokenizer is None:
        args.tokenizer = args.model
    validate_args(args)
    if args.seed is None:
        args.seed = 0
    random.seed(args.seed)

    # Sample the requests.
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code
    )
    requests = get_requests(args, tokenizer)
    is_multi_modal = any(req.multi_modal_data is not None for req in requests)

    request_outputs: list[RequestOutput] | None = None
    backend = args.backend
    if backend == "vllm" and args.async_engine:
        # The async runner only reports the elapsed time, not the outputs.
        elapsed_time = uvloop.run(
            run_vllm_async(
                requests,
                args.n,
                AsyncEngineArgs.from_cli_args(args),
                disable_frontend_multiprocessing=args.disable_frontend_multiprocessing,
                disable_detokenize=args.disable_detokenize,
                do_profile=args.profile,
            )
        )
    elif backend == "vllm":
        elapsed_time, request_outputs = run_vllm(
            requests,
            args.n,
            EngineArgs.from_cli_args(args),
            disable_detokenize=args.disable_detokenize,
            do_profile=args.profile,
        )
    elif backend == "hf":
        assert args.tensor_parallel_size == 1
        if args.profile:
            raise NotImplementedError("Profiling not implemented yet for backend='hf'.")
        elapsed_time = run_hf(
            requests,
            args.model,
            tokenizer,
            args.n,
            args.hf_max_batch_size,
            args.trust_remote_code,
            args.disable_detokenize,
        )
    elif backend == "vllm-chat":
        elapsed_time, request_outputs = run_vllm_chat(
            requests,
            args.n,
            EngineArgs.from_cli_args(args),
            disable_detokenize=args.disable_detokenize,
            do_profile=args.profile,
        )
    else:
        raise ValueError(f"Unknown backend: {args.backend}")

    if request_outputs:
        # Note: with the vllm and vllm-chat backends,
        # we have request_outputs, which we use to count tokens.
        counted = [ro for ro in request_outputs if isinstance(ro, RequestOutput)]
        total_prompt_tokens = sum(
            len(ro.prompt_token_ids) if ro.prompt_token_ids else 0 for ro in counted
        )
        total_output_tokens = sum(
            len(o.token_ids) for ro in counted for o in ro.outputs if o
        )
        total_num_tokens = total_prompt_tokens + total_output_tokens
    else:
        # No outputs available: fall back to the lengths the requests declare.
        total_output_tokens = sum(r.expected_output_len for r in requests)
        total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests)
        total_prompt_tokens = total_num_tokens - total_output_tokens

        if is_multi_modal and args.backend != "vllm-chat":
            print(
                "\033[91mWARNING\033[0m: Multi-modal request with "
                f"{args.backend} backend detected. The "
                "following metrics are not accurate because image tokens are not"
                " counted. See vllm-project/vllm/issues/9778 for details."
            )
            # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
            # vllm-chat backend counts the image tokens now

    num_requests = len(requests)
    print(
        f"Throughput: {num_requests / elapsed_time:.2f} requests/s, "
        f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
        f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
    )
    print(f"Total num prompt tokens: {total_prompt_tokens}")
    print(f"Total num output tokens: {total_output_tokens}")

    # Output JSON results if specified
    if not args.output_json:
        return

    results = {
        "elapsed_time": elapsed_time,
        "num_requests": num_requests,
        "total_num_tokens": total_num_tokens,
        "requests_per_second": num_requests / elapsed_time,
        "tokens_per_second": total_num_tokens / elapsed_time,
    }
    with open(args.output_json, "w") as f:
        json.dump(results, f, indent=4)
    save_to_pytorch_benchmark_format(args, results)
# noqa
# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py

import datetime
import locale
import os
import subprocess
import sys

# NOTE: the PyTorch original had to stay Python 2 compatible; this copy uses
# f-strings and is therefore Python 3 only.
# This script outputs relevant system environment info
# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
from collections import namedtuple

# This script is what users run when their installation is broken, so every
# non-stdlib import is optional and degrades gracefully instead of crashing.
try:
    import regex as re
except ImportError:
    # All patterns in this file are plain `re` syntax, so the stdlib is enough.
    import re

try:
    from vllm.envs import environment_variables
except (ImportError, NameError, AttributeError, OSError):
    # Without vLLM only the prefix-based environment variables are reported.
    environment_variables = {}

try:
    import torch

    TORCH_AVAILABLE = True
except (ImportError, NameError, AttributeError, OSError):
    TORCH_AVAILABLE = False

# System Environment Information
SystemEnv = namedtuple(
    "SystemEnv",
    [
        "torch_version",
        "is_debug_build",
        "cuda_compiled_version",
        "gcc_version",
        "clang_version",
        "cmake_version",
        "os",
        "libc_version",
        "python_version",
        "python_platform",
        "is_cuda_available",
        "cuda_runtime_version",
        "cuda_module_loading",
        "nvidia_driver_version",
        "nvidia_gpu_models",
        "cudnn_version",
        "pip_version",  # 'pip' or 'pip3'
        "pip_packages",
        "conda_packages",
        "hip_compiled_version",
        "hip_runtime_version",
        "miopen_runtime_version",
        "caching_allocator_config",
        "is_xnnpack_available",
        "cpu_info",
        "rocm_version",  # vllm specific field
        "vllm_version",  # vllm specific field
        "vllm_build_flags",  # vllm specific field
        "gpu_topo",  # vllm specific field
        "env_vars",
    ],
)

# Substrings used to filter `conda list` output down to relevant packages.
DEFAULT_CONDA_PATTERNS = {
    "torch",
    "numpy",
    "cudatoolkit",
    "soumith",
    "mkl",
    "magma",
    "triton",
    "optree",
    "nccl",
    "transformers",
    "zmq",
    "nvidia",
    "pynvml",
    "flashinfer-python",
}

# Substrings used to filter `pip list` output down to relevant packages.
DEFAULT_PIP_PATTERNS = {
    "torch",
    "numpy",
    "mypy",
    "flake8",
    "triton",
    "optree",
    "onnx",
    "nccl",
    "transformers",
    "zmq",
    "nvidia",
    "pynvml",
    "flashinfer-python",
}


def run(command):
    """Run ``command`` and return ``(return-code, stdout, stderr)``.

    A ``str`` command is executed through the shell; a list is executed
    directly. A missing executable is reported as return code 127 rather
    than raised, so callers can treat it like any other failed command.
    """
    use_shell = isinstance(command, str)
    try:
        proc = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=use_shell
        )
        raw_output, raw_err = proc.communicate()
    except FileNotFoundError:
        cmd_str = command if isinstance(command, str) else command[0]
        return 127, "", f"Command not found: {cmd_str}"

    rc = proc.returncode
    if get_platform() == "win32":
        enc = "oem"
    else:
        enc = locale.getpreferredencoding()
    # Tool output is not guaranteed to be valid in the locale encoding; a
    # replacement character is better than aborting the whole report.
    output = raw_output.decode(enc, errors="replace")
    if command == "nvidia-smi topo -m":
        # don't remove the leading whitespace of `nvidia-smi topo -m`
        # because they are meaningful
        output = output.rstrip()
    else:
        output = output.strip()
    err = raw_err.decode(enc, errors="replace")
    return rc, output, err.strip()


def run_and_read_all(run_lambda, command):
    """Run command using run_lambda; reads and returns entire output if rc is 0."""
    rc, out, _ = run_lambda(command)
    if rc != 0:
        return None
    return out


def run_and_parse_first_match(run_lambda, command, regex):
    """Run command using run_lambda, returns the first regex match if it exists."""
    rc, out, _ = run_lambda(command)
    if rc != 0:
        return None
    match = re.search(regex, out)
    if match is None:
        return None
    return match.group(1)


def run_and_return_first_line(run_lambda, command):
    """Run command using run_lambda and returns first line if output is not empty."""
    rc, out, _ = run_lambda(command)
    if rc != 0:
        return None
    return out.split("\n")[0]


def get_conda_packages(run_lambda, patterns=None):
    """Return the `conda list` lines matching ``patterns``, or None on failure."""
    if patterns is None:
        patterns = DEFAULT_CONDA_PATTERNS
    conda = os.environ.get("CONDA_EXE", "conda")
    out = run_and_read_all(run_lambda, [conda, "list"])
    if out is None:
        return out

    return "\n".join(
        line
        for line in out.splitlines()
        if not line.startswith("#") and any(name in line for name in patterns)
    )


def get_gcc_version(run_lambda):
    """Return the text following ``gcc `` in `gcc --version`, or None."""
    return run_and_parse_first_match(run_lambda, "gcc --version", r"gcc (.*)")


def get_clang_version(run_lambda):
    """Return the version reported by `clang --version`, or None."""
    return run_and_parse_first_match(
        run_lambda, "clang --version", r"clang version (.*)"
    )


def get_cmake_version(run_lambda):
    """Return the text following ``cmake `` in `cmake --version`, or None."""
    return run_and_parse_first_match(run_lambda, "cmake --version", r"cmake (.*)")


def get_nvidia_driver_version(run_lambda):
    """Return the NVIDIA driver version (kextstat on macOS, nvidia-smi otherwise)."""
    if get_platform() == "darwin":
        cmd = "kextstat | grep -i cuda"
        return run_and_parse_first_match(
            run_lambda, cmd, r"com[.]nvidia[.]CUDA [(](.*?)[)]"
        )
    smi = get_nvidia_smi()
    return run_and_parse_first_match(run_lambda, smi, r"Driver Version: (.*?) ")


def get_gpu_info(run_lambda):
    """Return a description of the visible GPUs, or None if it cannot be collected.

    On macOS and on ROCm builds of torch the name comes from torch itself;
    everywhere else it is the output of `nvidia-smi -L` with UUIDs removed.
    """
    # getattr (not plain attribute access) so the darwin path cannot fail on a
    # torch build whose `torch.version` has no `hip` attribute.
    hip_version = getattr(torch.version, "hip", None) if TORCH_AVAILABLE else None
    if get_platform() == "darwin" or hip_version is not None:
        if TORCH_AVAILABLE and torch.cuda.is_available():
            if hip_version is not None:
                prop = torch.cuda.get_device_properties(0)
                if hasattr(prop, "gcnArchName"):
                    gcnArch = " ({})".format(prop.gcnArchName)
                else:
                    gcnArch = "NoGCNArchNameOnOldPyTorch"
            else:
                gcnArch = ""
            return torch.cuda.get_device_name(None) + gcnArch
        return None
    smi = get_nvidia_smi()
    uuid_regex = re.compile(r" \(UUID: .+?\)")
    rc, out, _ = run_lambda(smi + " -L")
    if rc != 0:
        return None
    # Anonymize GPUs by removing their UUID
    return re.sub(uuid_regex, "", out)


def get_running_cuda_version(run_lambda):
    """Return the CUDA toolkit version reported by `nvcc --version`, or None."""
    return run_and_parse_first_match(run_lambda, "nvcc --version", r"release .+ V(.*)")


def get_cudnn_version(run_lambda):
    """Return a list of libcudnn.so; it's hard to tell which one is being used."""
    if get_platform() == "win32":
        system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
        cuda_path = os.environ.get("CUDA_PATH", "%CUDA_PATH%")
        where_cmd = os.path.join(system_root, "System32", "where")
        cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path)
    elif get_platform() == "darwin":
        # CUDA libraries and drivers can be found in /usr/local/cuda/. See
        # https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install
        # https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac
        # Use CUDNN_LIBRARY when cudnn library is installed elsewhere.
        cudnn_cmd = "ls /usr/local/cuda/lib/libcudnn*"
    else:
        cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev'
    rc, out, _ = run_lambda(cudnn_cmd)
    # find will return 1 if there are permission errors or if not found
    if len(out) == 0 or (rc != 1 and rc != 0):
        cudnn_lib = os.environ.get("CUDNN_LIBRARY")
        if cudnn_lib is not None and os.path.isfile(cudnn_lib):
            return os.path.realpath(cudnn_lib)
        return None
    files_set = set()
    for fn in out.split("\n"):
        fn = os.path.realpath(fn)  # eliminate symbolic links
        if os.path.isfile(fn):
            files_set.add(fn)
    if not files_set:
        return None
    # Alphabetize the result because the order is non-deterministic otherwise
    files = sorted(files_set)
    if len(files) == 1:
        return files[0]
    result = "\n".join(files)
    return "Probably one of the following:\n{}".format(result)


def get_nvidia_smi():
    """Return the command used to invoke nvidia-smi (quoted full path on Windows)."""
    # Note: nvidia-smi is currently available only on Windows and Linux
    smi = "nvidia-smi"
    if get_platform() == "win32":
        system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
        program_files_root = os.environ.get("PROGRAMFILES", "C:\\Program Files")
        legacy_path = os.path.join(
            program_files_root, "NVIDIA Corporation", "NVSMI", smi
        )
        new_path = os.path.join(system_root, "System32", smi)
        smis = [new_path, legacy_path]
        for candidate_smi in smis:
            if os.path.exists(candidate_smi):
                smi = '"{}"'.format(candidate_smi)
                break
    return smi


def get_rocm_version(run_lambda):
    """Return the HIP version reported by `hipcc --version`, or None if unavailable."""
    return run_and_parse_first_match(
        run_lambda, "hipcc --version", r"HIP version: (\S+)"
    )


def get_vllm_version():
    """Return a printable vLLM version string, or None if vLLM cannot be imported."""
    try:
        from vllm import __version__, __version_tuple__
    except ImportError:
        # Reported as "not collected" downstream instead of aborting the report.
        return None

    if __version__ == "dev":
        return "N/A (dev)"
    version_str = __version_tuple__[-1]
    if isinstance(version_str, str) and version_str.startswith("g"):
        # it's a dev build
        if "." in version_str:
            # it's a dev build containing local changes
            git_sha = version_str.split(".")[0][1:]
            date = version_str.split(".")[-1][1:]
            return f"{__version__} (git sha: {git_sha}, date: {date})"
        else:
            # it's a dev build without local changes
            git_sha = version_str[1:]  # type: ignore
            return f"{__version__} (git sha: {git_sha})"
    return __version__


def summarize_vllm_build_flags():
    """Summarize build-related settings read from the process environment."""
    # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
    return "CUDA Archs: {}; ROCm: {}".format(
        os.environ.get("TORCH_CUDA_ARCH_LIST", "Not Set"),
        "Enabled" if os.environ.get("ROCM_HOME") else "Disabled",
    )


def get_gpu_topo(run_lambda):
    """Return the GPU topology on Linux (nvidia-smi first, then rocm-smi), else None."""
    output = None

    if get_platform() == "linux":
        output = run_and_read_all(run_lambda, "nvidia-smi topo -m")
        if output is None:
            output = run_and_read_all(run_lambda, "rocm-smi --showtopo")

    return output


# example outputs of CPU infos
# * linux
# Architecture: x86_64
# CPU op-mode(s): 32-bit, 64-bit
# Address sizes: 46 bits physical, 48 bits virtual
# Byte Order: Little Endian
# CPU(s): 128
# On-line CPU(s) list: 0-127
# Vendor ID: GenuineIntel
# Model name: Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
# CPU family: 6
# Model: 106
# Thread(s) per core: 2
# Core(s) per socket: 32
# Socket(s): 2
# Stepping: 6
# BogoMIPS: 5799.78
# Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr
# sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl
# xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq monitor ssse3 fma cx16
# pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand
# hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced
# fsgsbase tsc_adjust bmi1 avx2
smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap +# avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 +# xsaves wbnoinvd ida arat avx512vbmi pku ospke avx512_vbmi2 gfni vaes vpclmulqdq +# avx512_vnni avx512_bitalg tme avx512_vpopcntdq rdpid md_clear flush_l1d arch_capabilities +# Virtualization features: +# Hypervisor vendor: KVM +# Virtualization type: full +# Caches (sum of all): +# L1d: 3 MiB (64 instances) +# L1i: 2 MiB (64 instances) +# L2: 80 MiB (64 instances) +# L3: 108 MiB (2 instances) +# NUMA: +# NUMA node(s): 2 +# NUMA node0 CPU(s): 0-31,64-95 +# NUMA node1 CPU(s): 32-63,96-127 +# Vulnerabilities: +# Itlb multihit: Not affected +# L1tf: Not affected +# Mds: Not affected +# Meltdown: Not affected +# Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown +# Retbleed: Not affected +# Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp +# Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization +# Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence +# Srbds: Not affected +# Tsx async abort: Not affected +# * win32 +# Architecture=9 +# CurrentClockSpeed=2900 +# DeviceID=CPU0 +# Family=179 +# L2CacheSize=40960 +# L2CacheSpeed= +# Manufacturer=GenuineIntel +# MaxClockSpeed=2900 +# Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz +# ProcessorType=3 +# Revision=27142 +# +# Architecture=9 +# CurrentClockSpeed=2900 +# DeviceID=CPU1 +# Family=179 +# L2CacheSize=40960 +# L2CacheSpeed= +# Manufacturer=GenuineIntel +# MaxClockSpeed=2900 +# Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz +# ProcessorType=3 +# Revision=27142 + + +def get_cpu_info(run_lambda): + rc, out, err = 0, "", "" + if get_platform() == "linux": + rc, out, err = run_lambda("lscpu") + elif get_platform() == "win32": + rc, out, err = run_lambda( + "wmic cpu get 
Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \ + CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE" + ) + elif get_platform() == "darwin": + rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string") + cpu_info = "None" + if rc == 0: + cpu_info = out + else: + cpu_info = err + return cpu_info + + +def get_platform(): + if sys.platform.startswith("linux"): + return "linux" + elif sys.platform.startswith("win32"): + return "win32" + elif sys.platform.startswith("cygwin"): + return "cygwin" + elif sys.platform.startswith("darwin"): + return "darwin" + else: + return sys.platform + + +def get_mac_version(run_lambda): + return run_and_parse_first_match(run_lambda, "sw_vers -productVersion", r"(.*)") + + +def get_windows_version(run_lambda): + system_root = os.environ.get("SYSTEMROOT", "C:\\Windows") + wmic_cmd = os.path.join(system_root, "System32", "Wbem", "wmic") + findstr_cmd = os.path.join(system_root, "System32", "findstr") + return run_and_read_all( + run_lambda, "{} os get Caption | {} /v Caption".format(wmic_cmd, findstr_cmd) + ) + + +def get_lsb_version(run_lambda): + return run_and_parse_first_match( + run_lambda, "lsb_release -a", r"Description:\t(.*)" + ) + + +def check_release_file(run_lambda): + return run_and_parse_first_match( + run_lambda, "cat /etc/*-release", r'PRETTY_NAME="(.*)"' + ) + + +def get_os(run_lambda): + from platform import machine + + platform = get_platform() + + if platform == "win32" or platform == "cygwin": + return get_windows_version(run_lambda) + + if platform == "darwin": + version = get_mac_version(run_lambda) + if version is None: + return None + return "macOS {} ({})".format(version, machine()) + + if platform == "linux": + # Ubuntu/Debian based + desc = get_lsb_version(run_lambda) + if desc is not None: + return "{} ({})".format(desc, machine()) + + # Try reading /etc/*-release + desc = check_release_file(run_lambda) + if desc is not None: + return "{} ({})".format(desc, machine()) 
+ + return "{} ({})".format(platform, machine()) + + # Unknown platform + return platform + + +def get_python_platform(): + import platform + + return platform.platform() + + +def get_libc_version(): + import platform + + if get_platform() != "linux": + return "N/A" + return "-".join(platform.libc_ver()) + + +def is_uv_venv(): + if os.environ.get("UV"): + return True + pyvenv_cfg_path = os.path.join(sys.prefix, "pyvenv.cfg") + if os.path.exists(pyvenv_cfg_path): + with open(pyvenv_cfg_path, "r") as f: + return any(line.startswith("uv = ") for line in f) + return False + + +def get_pip_packages(run_lambda, patterns=None): + """Return `pip list` output. Note: will also find conda-installed pytorch and numpy packages.""" + if patterns is None: + patterns = DEFAULT_PIP_PATTERNS + + def run_with_pip(): + try: + import importlib.util + + pip_spec = importlib.util.find_spec("pip") + pip_available = pip_spec is not None + except ImportError: + pip_available = False + + if pip_available: + cmd = [sys.executable, "-mpip", "list", "--format=freeze"] + elif is_uv_venv(): + print("uv is set") + cmd = ["uv", "pip", "list", "--format=freeze"] + else: + raise RuntimeError( + "Could not collect pip list output (pip or uv module not available)" + ) + + out = run_and_read_all(run_lambda, cmd) + return "\n".join( + line for line in out.splitlines() if any(name in line for name in patterns) + ) + + pip_version = "pip3" if sys.version[0] == "3" else "pip" + out = run_with_pip() + return pip_version, out + + +def get_cachingallocator_config(): + ca_config = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "") + return ca_config + + +def get_cuda_module_loading_config(): + if TORCH_AVAILABLE and torch.cuda.is_available(): + torch.cuda.init() + config = os.environ.get("CUDA_MODULE_LOADING", "") + return config + else: + return "N/A" + + +def is_xnnpack_available(): + if TORCH_AVAILABLE: + import torch.backends.xnnpack + + return str(torch.backends.xnnpack.enabled) # type: ignore[attr-defined] + 
else: + return "N/A" + + +def get_env_vars(): + env_vars = "" + secret_terms = ("secret", "token", "api", "access", "password") + report_prefix = ( + "TORCH", + "NCCL", + "PYTORCH", + "CUDA", + "CUBLAS", + "CUDNN", + "OMP_", + "MKL_", + "NVIDIA", + ) + for k, v in os.environ.items(): + if any(term in k.lower() for term in secret_terms): + continue + if k in environment_variables: + env_vars = env_vars + "{}={}".format(k, v) + "\n" + if k.startswith(report_prefix): + env_vars = env_vars + "{}={}".format(k, v) + "\n" + + return env_vars + + +def get_env_info(): + run_lambda = run + pip_version, pip_list_output = get_pip_packages(run_lambda) + + if TORCH_AVAILABLE: + version_str = torch.__version__ + debug_mode_str = str(torch.version.debug) + cuda_available_str = str(torch.cuda.is_available()) + cuda_version_str = torch.version.cuda + if ( + not hasattr(torch.version, "hip") or torch.version.hip is None + ): # cuda version + hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A" + else: # HIP version + + def get_version_or_na(cfg, prefix): + _lst = [s.rsplit(None, 1)[-1] for s in cfg if prefix in s] + return _lst[0] if _lst else "N/A" + + cfg = torch._C._show_config().split("\n") + hip_runtime_version = get_version_or_na(cfg, "HIP Runtime") + miopen_runtime_version = get_version_or_na(cfg, "MIOpen") + cuda_version_str = "N/A" + hip_compiled_version = torch.version.hip + else: + version_str = debug_mode_str = cuda_available_str = cuda_version_str = "N/A" + hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A" + + sys_version = sys.version.replace("\n", " ") + + conda_packages = get_conda_packages(run_lambda) + + rocm_version = get_rocm_version(run_lambda) + vllm_version = get_vllm_version() + vllm_build_flags = summarize_vllm_build_flags() + gpu_topo = get_gpu_topo(run_lambda) + + return SystemEnv( + torch_version=version_str, + is_debug_build=debug_mode_str, + python_version="{} ({}-bit runtime)".format( + sys_version, 
sys.maxsize.bit_length() + 1 + ), + python_platform=get_python_platform(), + is_cuda_available=cuda_available_str, + cuda_compiled_version=cuda_version_str, + cuda_runtime_version=get_running_cuda_version(run_lambda), + cuda_module_loading=get_cuda_module_loading_config(), + nvidia_gpu_models=get_gpu_info(run_lambda), + nvidia_driver_version=get_nvidia_driver_version(run_lambda), + cudnn_version=get_cudnn_version(run_lambda), + hip_compiled_version=hip_compiled_version, + hip_runtime_version=hip_runtime_version, + miopen_runtime_version=miopen_runtime_version, + pip_version=pip_version, + pip_packages=pip_list_output, + conda_packages=conda_packages, + os=get_os(run_lambda), + libc_version=get_libc_version(), + gcc_version=get_gcc_version(run_lambda), + clang_version=get_clang_version(run_lambda), + cmake_version=get_cmake_version(run_lambda), + caching_allocator_config=get_cachingallocator_config(), + is_xnnpack_available=is_xnnpack_available(), + cpu_info=get_cpu_info(run_lambda), + rocm_version=rocm_version, + vllm_version=vllm_version, + vllm_build_flags=vllm_build_flags, + gpu_topo=gpu_topo, + env_vars=get_env_vars(), + ) + + +env_info_fmt = """ +============================== + System Info +============================== +OS : {os} +GCC version : {gcc_version} +Clang version : {clang_version} +CMake version : {cmake_version} +Libc version : {libc_version} + +============================== + PyTorch Info +============================== +PyTorch version : {torch_version} +Is debug build : {is_debug_build} +CUDA used to build PyTorch : {cuda_compiled_version} +ROCM used to build PyTorch : {hip_compiled_version} + +============================== + Python Environment +============================== +Python version : {python_version} +Python platform : {python_platform} + +============================== + CUDA / GPU Info +============================== +Is CUDA available : {is_cuda_available} +CUDA runtime version : {cuda_runtime_version} +CUDA_MODULE_LOADING set 
to : {cuda_module_loading} +GPU models and configuration : {nvidia_gpu_models} +Nvidia driver version : {nvidia_driver_version} +cuDNN version : {cudnn_version} +HIP runtime version : {hip_runtime_version} +MIOpen runtime version : {miopen_runtime_version} +Is XNNPACK available : {is_xnnpack_available} + +============================== + CPU Info +============================== +{cpu_info} + +============================== +Versions of relevant libraries +============================== +{pip_packages} +{conda_packages} +""".strip() + +# both the above code and the following code use `strip()` to +# remove leading/trailing whitespaces, so we need to add a newline +# in between to separate the two sections +env_info_fmt += "\n\n" + +env_info_fmt += """ +============================== + vLLM Info +============================== +ROCM Version : {rocm_version} +vLLM Version : {vllm_version} +vLLM Build Flags: + {vllm_build_flags} +GPU Topology: + {gpu_topo} + +============================== + Environment Variables +============================== +{env_vars} +""".strip() + + +def pretty_str(envinfo): + def replace_nones(dct, replacement="Could not collect"): + for key in dct.keys(): + if dct[key] is not None: + continue + dct[key] = replacement + return dct + + def replace_bools(dct, true="Yes", false="No"): + for key in dct.keys(): + if dct[key] is True: + dct[key] = true + elif dct[key] is False: + dct[key] = false + return dct + + def prepend(text, tag="[prepend]"): + lines = text.split("\n") + updated_lines = [tag + line for line in lines] + return "\n".join(updated_lines) + + def replace_if_empty(text, replacement="No relevant packages"): + if text is not None and len(text) == 0: + return replacement + return text + + def maybe_start_on_next_line(string): + # If `string` is multiline, prepend a \n to it. 
+ if string is not None and len(string.split("\n")) > 1: + return "\n{}\n".format(string) + return string + + mutable_dict = envinfo._asdict() + + # If nvidia_gpu_models is multiline, start on the next line + mutable_dict["nvidia_gpu_models"] = maybe_start_on_next_line( + envinfo.nvidia_gpu_models + ) + + # If the machine doesn't have CUDA, report some fields as 'No CUDA' + dynamic_cuda_fields = [ + "cuda_runtime_version", + "nvidia_gpu_models", + "nvidia_driver_version", + ] + all_cuda_fields = dynamic_cuda_fields + ["cudnn_version"] + all_dynamic_cuda_fields_missing = all( + mutable_dict[field] is None for field in dynamic_cuda_fields + ) + if ( + TORCH_AVAILABLE + and not torch.cuda.is_available() + and all_dynamic_cuda_fields_missing + ): + for field in all_cuda_fields: + mutable_dict[field] = "No CUDA" + if envinfo.cuda_compiled_version is None: + mutable_dict["cuda_compiled_version"] = "None" + + # Replace True with Yes, False with No + mutable_dict = replace_bools(mutable_dict) + + # Replace all None objects with 'Could not collect' + mutable_dict = replace_nones(mutable_dict) + + # If either of these are '', replace with 'No relevant packages' + mutable_dict["pip_packages"] = replace_if_empty(mutable_dict["pip_packages"]) + mutable_dict["conda_packages"] = replace_if_empty(mutable_dict["conda_packages"]) + + # Tag conda and pip packages with a prefix + # If they were previously None, they'll show up as ie '[conda] Could not collect' + if mutable_dict["pip_packages"]: + mutable_dict["pip_packages"] = prepend( + mutable_dict["pip_packages"], "[{}] ".format(envinfo.pip_version) + ) + if mutable_dict["conda_packages"]: + mutable_dict["conda_packages"] = prepend( + mutable_dict["conda_packages"], "[conda] " + ) + mutable_dict["cpu_info"] = envinfo.cpu_info + return env_info_fmt.format(**mutable_dict) + + +def get_pretty_env_info(): + return pretty_str(get_env_info()) + + +def main(): + print("Collecting environment information...") + output = 
get_pretty_env_info() + print(output) + + if ( + TORCH_AVAILABLE + and hasattr(torch, "utils") + and hasattr(torch.utils, "_crash_handler") + ): + minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR + if sys.platform == "linux" and os.path.exists(minidump_dir): + dumps = [ + os.path.join(minidump_dir, dump) for dump in os.listdir(minidump_dir) + ] + latest = max(dumps, key=os.path.getctime) + ctime = os.path.getctime(latest) + creation_time = datetime.datetime.fromtimestamp(ctime).strftime( + "%Y-%m-%d %H:%M:%S" + ) + msg = ( + "\n*** Detected a minidump at {} created on {}, ".format( + latest, creation_time + ) + + "if this is related to your bug please include it when you file a report ***" + ) + print(msg, file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/compilation/__init__.py b/compilation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/compilation/__pycache__/__init__.cpython-312.pyc b/compilation/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..253844c9606a69d4d20b412905a407ab48b626dd GIT binary patch literal 161 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJVIp~+<7U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?)l~^D;}~OVL}Sr>)46Vq3Oluivqq*s&ESvXc$hiz&q!N|a|vW@hL_ zS~*Yx1Cjbz+XY%n?G{#wEn>p~lAi+kD2nu_Kl?*M4oD0XtlRar@E;Q?&7wc;Id>kU zWXj1dasi&XbM85J?&I8Z&OL{}2?on4D2n$lVpp0d>Ua3!6xOb+{})41cc{yhL`k$l zsdR`|nGi$cJEM3)9(Z~bugZp4g7YdqwJcPoav@Ijhx`P`D&=Y*6d-kZc2KBknb82I#QEdt}sm-Bgg6EYM zwKddA>OrMVZ4b4p9ia~LUZDtTXQ)%{3U$#GL&=qL_XFCRSEvWx>!;5-(=9w7*ws*< z?3o@b;GJ4%C*WUq-U`p@ONF{qn|{No6`s>SE!3S_Xi%>CBYM5!@D-k=dX3ScVfh8Q zS`I`R*j`HBEma89b}_8$CKr825)*M{#;nui$(Ro9q9!Mma70$+xB;D&znXmYnye{_u!L`K zD2n=GA|8!R!doB~j~Swpn4FX~_yG4>DjYXnk!Q@>t7np<<3`wsMJ~y4J)w=ysAk~m z*tKYKgb)F9RV1Zp@C^~ZW<(R3`UHi(-)s6`yL@8olKASyakE@jlg5lV5#7DtsYjE0 z_n3YQAKSYJJCft!2}Ks8;j6N!CL|e_rqZ4y`Pi9@qh_^@7>_Bb6LIN6N&%`WoXy#| zl!_P$YdsneQ(~R`XD*MQK1IgVs6!WGI4+55N)h8YCGl;Xc*NRe3))*wt&hU{9ZIG` 
zv_yp%i3U}~$lfR;d2V<^tmM5xg?w_E#L67td^fy~c!&HmW&NDlZ~`{*8Z2ZYPUd2d zZT8J_0m`K*S(pgxvJg>VHw9Qlp+G-C7=&m_2WEYUFpN(nQi>$R6NWG$3n^We1SzJ; z5kr}=2uweji+EL@iP%SlObF+Lzx8vlOLwR_dX5rF1A;kvx|r6+1BHv7qe`3~%J4Z| zT%+{5bD+9Q)BQd54~(-eAJ7G^($uI#;M-oO@6uETF)B!7f14>yx8>U|S7~bd+wQ!K zN@o<#_+ez6gPWoHJ!7*%Dz2xJ$%JOWZjjZF&5{ilbU3G~EX+0+Q&{`ywDtS7cIeJz zC1WNjn*nS0$ct8u4V-hDv}W?^2(tDZzSeCLX4TRRa5B8QtVBuAI_{w${O_y7hMF8s z#*%|ZSeulM!FN*!4vt|1E%dD$h~P-3#8otjM+W+@Y{C1GYkX8$ zd-J`;_kP`!seC?9G5+#g#n3(D)A#PbmmWHr=8vs=0GFF^IqMj?f2W$J4e&}!R!bVq={`uLL)K9St)amXg9R z(Gv43W{&wM`Z{Ac>Vlef#THoYGDpX#Ip){&FX?d<7}Fa`N#TA*6QJGnN%FN=MDF)! zo%qUvCW}m&EK!hv>_wy(c1$;YSKq;>j^6^^q4i-?z|y2dY-YQ**@z;Vmw-th05DJG zS~{{V1DTeA&#q;vU;d&YUH$TcZ>_TK<}Vh1v2-e3Ik0jjQ#p|42DF{{rLv?dCP7PW zQY*M>M6Fan}OK324oR+p5fGwWZWH`r$Z7Wg^ z!7Ip$=Ww2ST+x~4I>{Kv`pZ!)<6%`6MYCK)Ymx%XBAUF2@~hY_Wuho0BBH4AIE)<# zUO<3*r{USx_5rw0kxt3bv^K;d7=oXU%{Qp884t_l%d6SZN3|{MEIhyIW7u6F^k=T;TC(dKt(~+Zg3&5)~pp%Rs7k0Qkc^wZ;V(%Q9T!!=_yixxuxzT@MG3ueO~) z_0Vi>8OrJ=$;a7|RXGkb#x0ZRg7lX%rxNq*_NbR^wC4tY9o&p()BeD|ywI0tl3FaO zm8RYOftlFFF>?P%1CMLwoK}i@7LUZD@w`o6_cEaE=Dfe8Kn4O1FJL@#UIPytDRcA# z)Ur@$3e2c)%w%KnWXb>`h=dh+(^8{w)xvMx5x_<7v@K@Xfm7xvNBFKYK!KaHrNAI) z=K^uvGv|?L95S3CW=z8)2#To(9@;Uiv)YSTKLx<7)MP!S7$UK@;5M4PjUci~w3&|6 z0ktexk_hV3^44efKT{eVg*J3pNx;PQqeC`d2)ffz>Oed}qjYp<+Ye;g4}8w9wjW)1 zmFVulOv~WC#OJBimeUL8bG02S^zCD5-qJCf77$e$S-}5<3ZCHYEE+C{#LU84T;Il> zl0eRzVSl6aEYM!}e2?Yxz?WQS3K;CN=qn{%pLm>m(sge^a)8#A8iyYoMDHB9t)F-w z*qY}$JI7iQ2;1o!d-8u!p4fYe=8hO!xa0V&jVtV}Rt2s7WgvG`VbDIikaiFO;p)U1 z?yS}gpueoZXVQ|%CsGE;tZ1>G2MUu38h4dk4d)-+pEBGJl3kWGQd7 zs^6o$o#XNrvfl=Pi@^B&-D|atcMjh^ocBQeu~q+;g?he<_cytXP)poeqgBlhplIMF&z4zJeQI5C`?2q9#{77T*q1yotgl*tq zwcH|=Nt|PW3l5^>m&#qN-ysItfQxk<35l0@2RrEnLxG+mDYlD&;$XdRb}@ou8qv_M z(ndd{pwA<=At9`&xQM}wLj|Q1j1Zp7EoF{zJq~Zd5CUy+Up5bzD=$P{JfFCuA|{K3 zOu=+{(DX(bFkap?of`ok zF)S6fVOi>0dTqJ)*5NeYK@MH%oxr^#dcxg@MYfcijcx%-LeV;V=e&}&&Txa;w9b}9 z)Y#741h`;VMdE*wr88@+HJ?-x6Jf<}C@eiwHLrboE=l8Ag6BL*y0)lx2D6mhC1F+m7&hmpW 
z|6tvW>#$aX0FOn%x@+i?m$c*Ct6pbv?<4EK0zlbeD*VD)UDKWOx6kK2P=DOqad+49 zt~?8M@J*lvt$P{1arWD{et63Z!6HVeyNpmTV#K!sBfRAW3JgErn&DfQURz;S`R+Bo z{#ImZ_bT7E#y8x0d#TH@p}HM~t~Z{mzlK5hGrUW$fb1{+!d z6zn(6_T_y$9P^D8%iXssTW&bp8 z=9V&zSM4nYA`NU&Je!B;HWdnzxI+c#sWP)^OSs93F<@RIJO!&j+=EL~FDC1UqU9t!1|5=2 zh+qc=04qR;5}~l$;8TFGj}11cL*-;dC*?>Ox^w!_g-AFqUW*x}*b?!Gyvyzp zh7Z#S3wkU*smKNdKns(5C#ERK8kQu`BN&m7#ep=MGE$l>KRGp14#LlASt7%a9Io+(8c-o5wW-aWg!W4fM_fe4d2gL_8y?b*9;?ZBOahA#urIitsMbx+kvQV2lhC2rrEimD65Uh*Q4D5KN~uZs3Xx|Wf)2gVrH08C zv(Co4Q3o-a95oxB!jcr0b$7sBEV43q-*_r0TsWn!6a3FWr=zO_n)tCD3R`M;$O(^k z9nCe2z)f(l9K-FPL50Js9Y@k!YmRHpa=jU@cSZVG{YZVtjr=|^xYm1QwHHGWIle2) z4`ld(d#66VaR0(X{?H?i&vMNfu6b$y%E-q@K05Lxx0@(DlbzJU$*I!9djrO_U9_TP zfX5YS5MAfqQ(#N9QKBLU>L7}&)zGG=HHQf2*5n`sIBwv&zFCm zZF?cp_QGfNt8I|ac*r%caTPcDMLu0WaPLxf@K|Q>*q7XK%d@i2J8)AJ$}Z6m=b!>X zUAL{YS5*k&8puhCfwv)2oG{F|6U@~b0dEGT!uph`CsJBOwnFHK0Zk;opSE04{H%5V zH@&G;Oqy-q)~XC)`)`0=?*st%waSicIRtMY*lOxyh(v(eQAe+c znEEnE5Y$k<2Jmm#JxjA26g)5+U?IQ<@ik=Gt_<6ihgH$Y=TwnV#@OM8h4l@)mfJHlHUZWJsKfO~PLU#Dpdx1Vj4tUkZr#YIfKR zU_D6|-3dQB@pda*f%hTe^qJ*0J@l#JVKM`lGX(pEe<>gZ}w8^Hxc^NeUGh$;ruu3K)zG(8r=D;ii_a_YDWHT^+_WWh>!sYYu#{{#c zn8wPsS}9+z6pI*Ur3j)4d4H0Z)gh%-7SlycB${P#$Uq%IW(^0}f`DYJ+@nSWMmvNJ zI}!8&FuB84a_p$~ckqg*P(KMU@1be>JC34z(p2~FDE=$z+;@zZX1=BXd|OV@UB9Qg zzM{G}BX|8bb$pdN{uMRwTk6Pfsh9u5Uw7-LtN!+R-y^;+&9`Rxz6{^D%I}=_=lJTI zuPnYY?|)QLdo#8eyQwTH*@_*RiXGXC;Y`Kwd;lVjY~xb>-KOQHyRFNu*{0D<(`dHo zc&6!i-ox}W^ZtdATinv~>8hSgpm&{OXa-pNj;4F}WqXfghz)|{)5rYVOF6Iq!?W+7 z{qX$z=hHR2ARBy|X3|vG*FMVImuu^~`_}SXcdsm8$+n%$w4Kbhoy)YHn?Jj7HREmG x@O98#OXC|9JT^|zeKf?`x({wpSlM`;9>({MeH#>3HY8{-PYe6Mg=XuF{y&rjt$zRj literal 0 HcmV?d00001 diff --git a/compilation/__pycache__/backends.cpython-312.pyc b/compilation/__pycache__/backends.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b720670fd43e25b99546ffa4d3ba14baa1c8812 GIT binary patch literal 28531 zcmch=d30OXnJ4(LZw~}Wg5XYqs|c=Is9lyti`q9kk~el@7~%sZQUF0efRf07jwcXo24? 
z6h&KOTL!i;zbD!j3l0Qh?E~$xj)9I?=RjwyYoLq8d86I2o`D|bFOF`FZ5!BT;EY_P zKC=BigEEE#JA{%*wNM%f4w(htTjqgYBX^7w%HHAx|9co+dIokz3?os+pYO7i6~9@s5ZM!X|q`KcAZP!-u5*;k1V)VaTK8l>1wDTa*9wF=c}*8!m>{0ytH7B$vI zibeuTiTUnGD*sFIzh_e31J4Te;X{#qkv<`C06`I#&=A=-WE2|TGVASrj@8hF8k)5} z4IG{@1Y7==3OO4z%J#$IXf%8=8jP$HIw3NX?0&>xV~$@xf$V42^}wH8vA^$_7#_QPCN4x|=S$IO zOo>8cT*ITo$xt+YNok^Ya9k84qsh=1iWrKEF(fG$BZ+wQY9tiDI1;)z5qe`h932hE zsE_5ti4Yn$csYcAhod31b2M@-BK|EcCF&~Gbv_~vhoi%j)aj$+ql476phatGL_C3E z5{JTr5t%9#l$ITE6|WZ)^iL^7=sy723bHO zN|gvUBuF$))ghSIN_!qdE|+Xf#0JhPMv~*==%BhUXu3Hw{%G~A|1LsraVc(uLN`5v z;Yu6FiEk#=-NJDrlyB1`o=e)aAi=2ZKCwTk!7?t%;2otI;kA^xw_wuNqu8D-(!xiI zHDB&6n6=UkT(VRPN85dxKlj#Bp69-#OpBHhWtMCCwOw=mX(MB$kxAH2a)M1u`%S}_ z)MU@AiMBs`wA|;ppyRA;*Y?SzV|(|W?ycQIef(ASB!I40{~Obm}+3TSixxs^0o zwlRUWz$_73nv>5S9#16WvB1Uf;FZXzkO+*%lYzwe*jQXlMufn%;pF8&-VUS<*ZtDz z(`Q15&z*Vx#OWiGB}(!63q9zTZEAV4IWl@RAy>03FP=XVI)9<>Y=7VBb7zleC5SY? zvh8v>5l$vW*}}G;>&Xh zm7vug_|>7FWNfTwYH_F>u4o6NRIZnRnPhwJh!+AM>qxW2?KBmW5kS8!-O-*BA6pq!4k3I?6l(B z9&rd|f<59CY?IjSNac9TqOIKlmjh?2;EdP=SA-XMi4XC?Rt;;Etpe&z*qklsAQ%geY1uCO-Q#mC=JkEd1K-q*X zSRs3qbsZ7UCH?u@|TTbP@!H6M=9*7#>WbZDV6v z&9AqKI0=Shkx=4tcr4PX)JoeZa=laA-ywF+z0L|m-(C-oPK1PTGW>c#IU3N|Arasp zulb@{|I3#nqk)8a^l1%b4eAVBiv(y328NR~V&TB^3=;?pVWY6Bur*OZSO{Mn#;HEh z9e6ns5aOd-l7aC=BruYQj|PA)L;@EhgJC^Hh({7g!U|9!5(YaN2qywpA`{&#obB*I z{ZztgqO*+t)?b5ji%aTfowi0(Tu2WS4EWM>r|66FvL|IS_Br;wQME@$Au{ zRW{R9$QCvfvX{H8NiO`G3#W&DE2V6?>=6zwZ`1 zsbm+yHb5^z1jHJI6dtubhm9f*;6>gsqh5&~GijKE$5 zio4(>@_H-oMZjdu=8ooP{VHW4hz|hkDZM#0Go?blj%ClTw0#%r+gaJYVd-z^8DS(% z_*?%3&MD5YVNK*(zY{UQAM~|=#OhNJ)9K1#I#0=E*YfAy+A5;D1j8SiZW-uJ|1`YE zz>S&_V=NRiVQAvE;WWEuG;pt?1OxYW$#v7K+_lezPZ*T&e`LC52pT8aeg8 ze5vSmfOaNrc41s(_+%hHv;oXvJ0~K_9wf*Jc!3=b{5Wta8owBh5*Qqd0FjR*2nbvY zV}DYXL4+!xfUuKjM8{9quBDP_mQ}Pq&=%>w)E&?Xf&dW&fUHD1ARQw*g576>7Lf*( zZ5p`-U_wU`LJUskC<>7fPT9>!h$QL{#m5rbBHV|l{6#2xl$Pie$Ygz?1qnz-n}_J1 z5S0*t^EcctZ*Y&DTv640*8_JO7HG!PDtTHnp2JI?!>eVzSzkrQ*CF{jGQPc%Z||oj 
zqrYU`%9T|;vB8If8=p?D*#E@C6$R7owkKvS_*YLXO3>dWXzP5F@7rzo&aS==(+6ft zU%ll6hnxI<2ZBGSxAe6+K4`X4NQb4b*Y?3y8^UFW0*8cQQFenSVsplJ^T$`Q|9ox@Er1I8yJ`7D66Y(!8O~X=h zh0#9M>4;pgw!6fmNsku##VL1kQ|6>jU5u1yxpME6d4w>@O^ioE-i^*q<^{n zj22XAak=*hA*P!ijIl9gdf%kc@&<0g__-b5rtIvXIs6MuhrXFk;LKnoDK?#sCSc+t zL}^dxn_OM#+KEovkg$rlcqu#za+c^=+W&=$AeEfL!Ta0rk*gD7SL1 z0u$E@>S9o2JgHE^3bM2zI*B$Uh%Sf-*u5ZK5Ub}V&=(tzCWprWpOiKd)InP&fM@~N zxg18LXfP6B4(Qn~hJmuMegg*+X}rFm>w!)^AAv>821<*^7%YxqwDX}L@k78hNG252 zhGu87tl$8g4VuJfv3$fsP+sl^lN z-Dj8U&rx*EFFZAWRPt{5kILVz`%&Fpb7pJ5w6%Yw;mD)nvRNVHZEe!GC5U73L2vM|+4Qf>mcwq_1~!TgKEXA8Dgu?@3nL2&!vvSq5z9hA zRWQ7-GgPJwDMLXZ`-Na}%9zwaDO#bkSI{Z-$+2Gm7)>ctA;wEeZv{{n_AdwOCV*#c zL{KI;4P2f5TRJnqq_Pq{GMbbP*MbgZ$!)_4sdoW9A$4$#*7iPcD1f zr_HPG;@M_3jcLW*dDoET-8YZT9GmTb`^*y)SJahpcS`O~bwM?h^c^sLaKQ4M#U@)A z2$PE<*TXRYCEz!}p%b!8L2no(H>fWYG?8#yc4jXsi_V-fVQLY$`6-0g4pCYfrcKXP zgNWr=I4<=Y)t3 zyr=dW7{3)PCg7E~{8Pk&J0;`S9 zG&`?_36Sc;gLDc3m1abFOCoS}I2^bPJ^@awmc-+hnqt zxD?_RjIP)QM>gRoM=}QOU!~9ia{eAUgi6a+r8y)WV5hX8Ama&|UD|+n7-=gKJwEB% zj9XJ}w-A%~5DuV!5$CR&-@j=7vHOlY)3#4)+n09jzu)`s_IC0ZdJ`y z&3|q2)IHyy)&8_Lv-gy=_f)#(v{Za%+M2DbeW&(zZKkqAs_ejiojo#taB=@VH@IU} z&gr5xE5}!+U4d1F`gywo!*?Dcg;Tq%IX0(Q)`(PQjFN=8l_`9eekS)8}FL z=pkSvJt%`l9}o5;mGLF=gCsGO*`w7hnlPTOQM-U~`vE6dQ?z; zHupw71g8tNEs#s4`G!mf;N7qe8@0IH8!JdzhdPZVU=@aRI!j;q`!~%!L7z>J;1Hbe zyR;bqpQIsg?hB>obCg|f23Q%n*|**3$}1(fsY-kb?%!loQoOc{(7s(?QgTtsQmBQ$ z{v4pT+_|!Gzh3(PX}?zG^|8<#cv8+yBffFZ7RuwrN>p;?PHE)Yj9Shb-cG5=W${^& zj$M(~)#MAGKN7WrNjY5G62YgPuR7feA}}7m5@_k(F%(Elp8j=<+<4X&E=~4mWS+vN z56R9@V*H|l;Z0T_4Ff?Hh;yd4n&Ad4_QGV(VaEQaf}e^ffTo3@J0cQeUA9EygW>2M zgKS|8xyj(>0zkax8l9hY@|p9DR;9ZBzhEYNo}N8dAqg5jhtq6TFa#1XgrC~YBqVVO z%+Ge3pN_&g^m_6W2g_tL$w*|sGQJAu0Af`!8KQ8J0JvO1C$}p4Qw4r3VS*tsvNu03 z%S8)WHV==2S|O&H>>LK$lZ3T{5r$wXLdk;^Wm7UoL}Cjmbj{Z&yprlv2zAg$A{eS6 zO@hEEBnOoqic^V%mj&IMP%^nHEC@`W0z|wfJJtC8#BNe>wu|H=#7!g`M>dVdugSJZ zG(47w2(lv@BT zec~YmzU~Ie-LL>=@YZZg@ZGQd=xd-GCD4uDvaGi*>ut|^YqQ?Ub(giZNa7o(&(0qu 
zf^gef3E_>NVA_5>-En*wt|y3Y^3CbygDb9Q?;lO~zmz@o!gT-5lQSph-3wp8+yBqc z{K=V2?@_7uXuABE#2;Vf>ofe`C4TR{7bO1R1AcF|xox_Cg>Rst3om)DcRQvJXZ@8cHQOcs_UU6;zAnS}Eb%>ecT4=f2YgS~Q=>E_;|)mO zK-PCO>#NRIHK4J@4QMQit^Klz3MGa0 zXZjC5%gH*MNJ6oo@u?vS5W;*h9*;7{nfMCD7>BOQu4AgY6KVj72JW3m94C=* z83vR@+LU)VMtLZ^lynnBa*p9I;RE*j7Wc7{x7gQe%$9xWjsxo)-fO(UvU}0F#^H50 zu}&}0NLJs+Mwi9Ec>Zqhoqjd+oZn4nu=AnZ3{fzQZ%R`?i39I=gf^)hr1 zV$GCp!UAcr<2)(0ok&JvvI_@477bfMvRy#|h6T9>A$Ub#f-i zVTX+;6bg?*ZAUqxu+Y9vQ7Lk!$a#~T8{~A7(@xGeHK!fxB0W^T&%87=zOP&&IVzlZ>#1@cB<#BlmP@^no7jo+Kq+Tg{ zLO21qSeAlqGGEz^0r8|PGyGKu7Ef~DG|ccR3+2I^&HLL|@+4Ab$eam=;1L^AM!`Zf z(v&e})KY&r<29G-?fG23xBn+Im(*#if|Q(2$tv}eTTnVhjoItERbux7qy<2aR$b)@Wb#(=U+rSN)rW{lDYY^2a z^a_+ftnu+AIJ~k8%Bh2q%kiiHp0^t)do(np>XnK!D4mfu8^jH?f{1UiPJH~oZ@9(H zKzD_v?L5a$E4JTB&194`WJ}2<_P0)H3=4`3{00LDvi(3b9J?rl4^GxH$fK~Wx(_h= zA#o5MCHf_T7IcpN@&=b~+<8}6YTVCY!v#`Hy%LFxL1T9K((ou#ou%Fwy2YzVFWXdD zx1kp(P3c7yJWta)QF{b!zPFGH^%$-zDt!th`wjxXq}7+H)x>qX!Og$%&gAzeS(8zc z37BrMlmS=SLSvMG>}*W7DjI^aom3mbpo3%^6b2!1h{oF2 zDCNe-RIZ}t7WnI#~pV%epZp$dqUcKVyXQEuz!a3Z@XPHeKPB*BrTr>_mZb8 zTi%dv*&~(jNqhIC%lBkEpS|ytI!~p&O$+^!H<m9^6bgl-P6hCvi<27UIeP_-Mh{iz1ADYF#?|A8^<2DZOybDl-driv>jUE`=qu* zbEC7#`Imn0Lb~nHjUy7@_m|!>8jhmb=9&HT%?pj+Z-2=99u-&2S1j}{7q?AYAC>xN zlebcHsf81Z;&S=+<`(+%wK<3p#TZ%w4O(n?R8gr=O4XE2RMAmjOdD=i z2pfzQVi&ZE43XLGsUzB;Kk)f%#yIz7;<#USe{07l1A;a{y{tw~M0iDEKd1kCB z?)O3S<&+PZ8lW+1jG3k^WV{5Cp_&%#avc$7Dp^D;vIl5l0b~{Fb>x?Xq9LGLYI-xi zv^@_KFnkfx&K{_;niZr3yh1Nc)L=41>yJ*Y$kCt^48(Dqg^YTY-O36NT^bLILPGo# z6d^nG)Z+X2R+L_Mk?$W-vm)$&52LCj<#6v`7MD#!|J z#ANdq>`Yx%Hbtk_8PtJO3p@!&(Tq*9SIqlY?17K$+ka6RT&{$w$=gNQ<~^iR(vWTJ z$}}F58V@lg+H7?zy(_XcZS<~uSl>KteqgU**uw@qh6b7b*85Q*O?JuvoYy#LWEk&W z!tZk&`IIpyBP&EvXt=Fd4Z2C6Oq8xZ1*LepTE-}}$bePv2UcwYRZ*}6l0zj4i(`+r z)L1BJ5#^b|a{2ulEK<{VfW2;(IDI}u#c_>O#%rJB)sLDKPW|Rmo~|&Wa9r%-C>p~M zEZKS?G79WS{0Ty27eU%kQVF4f7ymH@n*dBT#iZ|3fFY?aOop}nGX&}CN$OhqQ-n>{ zf6-b(nctyh^i8D1GWyuc`K#6~rZVR*$|`3Kk4;?no_mI$?fkO?KRvM0bsQ|ZvomKi 
ze5=H_F1)eA2R{r;{JCsd<(h?aK;sknfIJkH+Dt{lUz>UDvrimI`~=EN6!NQ^uTkKy5_udd_e9pq78mVEG{E zfd9`8mgk&~KeyV*@3%ZxZxczfF49nR%hoH`*eGC5xj`MK;DeA^2B~Ne)1h%5>rd=gJ8&&etHf7J-lJ=y|wkf0rs`uUSc0RDRA!_-lN@uXUYjRRa4X(E1q(( z6UPNol^bbRV)NZ!zx)(P_`>6%*Gj_Y0+`)MYkrN2e}q>0rbh!+;=4~Nhj?3?p2Qq_ zYL5D2$1O=rWA=>f(|ox% zN^__9kpsC9%~v>-=MYZZ;Z4u`7H!A9Z`BwTXwRWd$?!Sy+~>nmjFCB7$&>z2&9WWPoY^X?xEwy2#T21A za-+o>wgip8@j3O(Io9Xr-!lj5II!)QrD{Yt}Y8O|nG z0PzjwGU%}hk=C5-qyUADAl|1!2>ywrb0gP)*8%-)5Cwxug3vX{9)SG;BLn*aB2FA6 zf=I}x?0Ejfk;6w`K5_ntTm-H)L(-K@vK!xoi>sBqOUWTFAzLZ1sEJyXn@QxE*G5oL zutFhAas~4bB_fg0pjQ!e>PMOQFOgTACg(#c#i~#_vSC>KE`1j#CSs~=9m6>cBRyFY zrmMPXWGNa__3MRj3%W9F}~C^-tKJt=2a#^k$k4NKFTp>z~Oswyt@&;{EHK9U?Xq-t}n( z=RWX@z*gwjL3d7-$a%|1pRPaSqfRV)yR+VgtQScTPYqc2*c*z{d@a}=)TS+xcT2{* zOY-i@c+W}Rb1U8#AgIVx^hgywnTpd(6{kP7bH#fh%IrR3SoJk$e0`FyZ`pSk%75J@ zX>a?-UT*72L$fMJ?V%648;|!`|Gd}?H|X0e;F&~<9EVX6<$Qy}7)T;y zNRcT1Df!-p14NmbH^Yv494f@Jy-&OZU#+;K&$+%Ear7Z0a>JyOxu zH5*HUIWC($Ki@R_%3RT!g9SMu&wJt`@5c~MTKtSeV~v;XL~mZi%?i+N0F;l0lHp6T z3C;!zj>d(Ks&%3`6f4109?4EJoX z&5Au=$`Ixz%uPU)!`wAhF`ilISteT!E0$&XX2-C)W9%2!=m6T=J?T6{9Uy;TGC)?q z#P1;66f(^rzk1gNOCippC1)|OkMM@^+h5@Yt}t%aK>iH|aBft^4h=Ro;@=?Q-_Vqw zB@aO+OeOF6UJ!a5Fj=!$vrgaa`Pn1s%Dq1e-mAXfm$pObi|{nl^YB#MIF|L6-Z-&l zG+AnXZg)TQRnItnIeToz@plOyiu`8tPQ$zAZtgwPPQxFYx10Vk-h&9&AC1Maj>~28 z*pj#b2bSp(bHt)cT`h{#RWJ{L0f}%6oox|EA(R}k)-Q~;u~=BG)6s;~cJ}Rt&%t~} z5w{xSR8xA?Z3qG$N8>a20*2qP_Jvjzz}Ox>+CSBUls17l6wrfyUG1U{_zzf&E3a0ZF0*B1NWG$EwvigeB-D19X6Ley+TB$shb92UvnG99m7xroKK~ zTTkr2s)rgoa08R@(Y8Zb-kaf@B)$nyaJ!ROgteb%5rVg(jQ{r-Py>yIfXxpb8HG%B zMl_I^mjNm1(AxB9^eR-_4TZQbQ@GxPY`apVN^v#E1SHc_0#!MMUXAA=Zq?!nspM;P z@>AMP1jVLD%Rz$u#7aWJ-KFk)PcDC_e-qEJRW%B9bL9=gcp@$q3nBvuH_ zF9kufWV0%z$euhxkloy5f4=Yh`Ov8&uWmAI!&Oiek7{#GovhDa=>-z>(4zl0s2e73 zz^jYfBoakt?R>`GB-xuX_AbfZwJ2n|4@%t!SM1N?{I71vRPB+f_Dp-Sr9F^PLfrTb z$Xtn56#;yD5It*m+BIHO3mOTO*PzTRwkb*6l~RK7h^zFR8ayI+_y_x zPpo)PLIo_-{H)ac?6hazzGQDuaJ?FIX9K_^;f4!;>$If5#eFM4I!U+*kBb?m%rPSg zT4oAvp(!iE?Z~<5DI~g_H6+X=u1Q(7g*IiwxA`e0pe36hJ)iy8&+|na5gWgtZVPl~ 
z0C?zO{nqt_lrVA&S-R)-%kh+Q9l3IIw`LX6DoTJCOfqfbMFy4A2&Ty{8Q2rdARABa{uOIincpaJg6dP z%1u^L7?tgtip8hKw45sXl94FdG!jLHM>#~0ls!pMYRXeM<%rIf)zYCg>dxS~DNo8H zRwVanaU-OJv+03G84n#VeRf8_EPYX`h~-dGsHg6~XE#-ZPq3UlUp#_9;SOo}z>0lN z^XJ}zLAAot*Bs!Whi=`R@*?*ME#=5@&6j&?do1N;)_lcZGBVy&@#lJ40Ft7eCv$aZ zzErW=&K4~MJwL7ab8nGs-HNrAeG|s;GhAqk7XOr4e_o5vy+!CFPL+&SJtfCiaw#<* zP-f`yQLf~RS{Tsb5^XJzUL)Glp~b1xvG37h3a5rB&8JGWvE!zEDc{INEl2K+ zYYu!PLhf7hL2ym(&l`sJo$zep7IOTS~xCks7O_eB(%8PJ5{dL@%c*|@^+(b_c?C+7QTK?QSd0q35P)9dCj2Q4grL@$h#Q01!CW#Bm9_&CPauzsc3eSVK ze$I2)leN)9vA!wXS`4hMQiCZ|RVgD@dbPUJu_vaglfR?aSDpM_%|9}&`EqYHL9^EwqZ}yOjV;bb5`0Vwp3h%e}Rc>({Y2 zt<|D2Rf|5pY6f;bRX0_O)uTf_%+C*VWoSP1vrZlPGK_p3NVGa4O-J6*QsmxRy{Mra zH5jMrP`(c9nd)KSE&Ejcud`ZG^|_V7^tFvszR&Bj!0z<>iZ+Xyruv$u-HqObLETl?uIi#QY$p;ks*MQDL3 zmof!N_KJcPGtQUihl;&)CRL@)sj$ zBbegEL!5MlhaiGgKbT72=BuJ6w=$j05Y+7h!{L9ByLRL-;O zW>lf_q^>%z2xnfPA*>~)salt+vq)FWFg3`Qgm_o!gp2HaCSn)k(c!@j!U#IxNIPMj z@b|xO(8Lj61NjKK5jSJZX}tWQfvyUCyT}X<1_=}Ll>be#v3sj*5JWrBTa_QC$S7sU zflHCm$n`Pt;N+IPQ3qFy$tFX2ivL=|Rg8SXkM{hMi3HN6JMSD{cw_F=g5lQLJI6m` zY>D44Dci`&r;e0UNB&RNk^I{@Q##f?Wk~AG#Ti>cjW`uQhkZz&3ipg)V&T>of(y)A zr~sy*+3ldr0%uf;-McP;b_eZf1x@5aQJWszLZj9U8q7-d5v%C!!)BZoC>7JO`AHeR z0}Bk?3(Cv{q2)-Yl=vUW`R{O`MoAjLjE#v4L*ru;A;t{`qx1U|?I6Rx5L46oIpz6~ z0*f>y1y$c|9|gKq0ks-N%ui-Tvpcg5?Tb65hHbM&*+6$D&?^Oc@4g@fcF($* zfOWeR*nW4L6xexp^zKwTaAejE9#L^!#@i%8jsNV5cL%6i-aqNfDL zI2s9yBAr+YUwkio?5^p{xb;~!#e|)T4V@Kt+ygerj?0nj!tkXC>?;pbrhamckVBZH z>|ysLKRRM5WP!5PzE- zI(fxECFd9%xk!PD${oMB9eaUdj+65uMLQKm9R|YF6iRzXHlM{c;BuL+n}U(Ua#d8` zMPllT|Ad@BA!n4F7IN;9^HVr-i*l+#)gG`67Za11I~?uSV5*|pkznwT?LpZ^8u3i8 z7)l{7rn0PTAlZ2o^LT+Nl!75UIssE0h_Td7J4hGvvrQXp*kpCDD5Gp#jnI)%j&NcM zYsWpdBGc#*mDAtOZb>9Vd!qWqIvRYNFcxC#0|=mcwI9u`>|&}aadtm}%nw~xj6>;8 zQ)SZ3%rdDwU)i5*z0?Y!5=ExN%Ef=CggD6VM^gs%Csg;pAm?Y~FqrdQ@_|U@lyjp2 zpRcP>Ze8|l z{g~&vLZxRg`1j z}|O_4yBojqQ5RGpBeu4mHDRm?Q0feMg8<~D40}Mzhk>?dxyWxuadF~DKKQc 
zEwD&hx0x%8rcXe8v~AyXzake};kzE@Zi3w@`F2h_R_)H4t{K}sbY)5oNhOD-ZL3u^?`*lf<(2masylo8SmN0;H^%7d~hi1H{CBy_Gu$<>%~J-g(3_F-$=%;|^K^_lAJ zQuX#s^&zSH&{Fl$`_E0Ee$=z?-p>1-%RT2LdnYoxXOk=CoquINhyiib&%e2t{PE4ySpmid}&MHjvtfi(`P*1SeX(TcqZb9D2} z%$aoc_PhO=-hQdK|Gt2Uayb9m>7Hqte)D6C(NT{D>94r;^4!a}zB>2Sg`V_|6U${M zr=6>?y1zDajV$lyM^-#REXv{|hAggF=$gY)>Xu5n?;S~(bT5}Y_p!-PeAqyWT3!rg zssC2TTnBF4=vpq>Gktg!`ZgmAO)JH%i%`!hzGa)U&6~c*LtDLM3wDpYG{e_QJZ`q= zxm$IAHx*Kc*}{6Ta}_lhL>@yxog^7)hTksn+kX^d&(I=;joS4KiAZO@b*S}HyC zsI&vtgtKohB;Wn|kJNiE&|mcz5k7Ttjvc@7HRCdgX$Js=VOb7=B9Jb z46J4w3AP%3_+qBETdM8O)b5jN_ho92NVP{+JVya1mP>ao_Wk(iouls`hj+!h^Al`> zqHIOyCsYcv9wCLW-b~NYr5@6bsdN5C&(XyO-1q?_#ad-39`G&co}(XABoi<{cJhv{ z$GCQ`Ez1|r9$e^scmI#}-=&-DytFMH&d)xH7_l^-bYk44;?}?Wq#YIlpRM~)yeTGL8R?AdIn5yy72-Diqe)kCK%1|6qeuhc( zQ4%ndIOQHDB1sv#O_wpTUr$o>k)cSFKLk~j>>4=?A@3j`Eos@I&n+|C&f5Zk2wEda{NbJ*+<;|zv3!B;&wkVnhZwR5gCpc zB(DD>uIVGL1Cbwb`~HgSdSbL1jGuDk6mf>&N8A=bRcG1kE6a|$8`d?e-%yus-Mz-q z`@orXdOz9zy2$`tVRF}w+5qS?2ESzRKQWpORXDp0Rk$n0SiZ(Si}o05)oVo7OyqCx zUQFBxGXF!1b9y^nFf!k};JE#4+SM*uI@T>nwRY9ORn?|B|6}Wz!QfwLdAIFHZJEYh zQsb`29EGgmrzlElZaAiQEOXV^s75cYrZLpaCU94cp@!;L0-n?yG8>$qaOASl{eJ-> C*Gvfj literal 0 HcmV?d00001 diff --git a/compilation/__pycache__/base_static_graph.cpython-312.pyc b/compilation/__pycache__/base_static_graph.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b3b1329cf89f7441b56f2c93a3c1aba2eb6640c GIT binary patch literal 2518 zcmai0&u<(x6!z@zY?20A2&Dqa4Mk+DW*5YvB_O0Iq*WkIDneUnFB9+Bo3%2(@OYYR zPYCtKAJ86qq~gMrzkv&!kmdps2gGer+6yP%vu7qrQ`C|6jqS0YzxVlj&-?T2Y@5L2 z*S?d#H3|6>JH=Da8;94S@sM!B9iIeF-*K>A@hd^KUkz&gn(0^ldeG=Mf@Z(zkP5j$ zc>u0&j!BeRI=(yIL=zPRHP#>@?l_h_4a2gHx!FU zHzO{x_8s33R-VDd$@SOfY zWvGQ3GEY!FVwysH(SAZ>pXp(wf`sab?t>4?sF%lXJi+ilz)-AHB{*dXOi0YObQdY; znps&(rD2GKb+hSvz=!J@Lb4ec*D?@pfK)a1_Lm!8Cd@KPAcSI0 z6+^>g>(Xr@=m}RKBm=i+iR#AVlPJ)~W~6hOMWH{oyQPCfXez=qpgWsen`=8)=+3S6 zwbiw|>)UHPhMS%1w>LI+P9b7S&@{2)gH1gUk0=0A)Rp1GOF6UHxlCfFy%C)C5wtJF 
zct28{o<2pZ$z{62IjAZMf$hC~GKe(|M5q(9Ly;>>Nh~}$l-?<%&ua&{z~_mRodZtR zgV-17Pw+LRBGsv9wM6(sg{4BF_SFoUSuhFKH?#QP^_~vmUM%A!4S*Oibvu6zHT|F0q4V$v&!1F?NW zo?any3%|539zu^~HNcU0n6Slz?K$|pIC(!lp1f-&4fAt{wkA;a=i zh3Ojf2Wl#v;ob#eDu*~>#FFqm4cRWjVj-tvp}TYoW~+Uf*xhiM*{gIGDY+|Ab)`ox zmGHC=iB(QIE!)d@2FvsgQxfYg^+mV~#ZHEC3e~+7uB|{Z6hlZLxC0_2u5@bz2Bz5h zC|~f)<;z8G6xs6j}S&k+`dLhMx4S$L^= zG$upv<_47N9K2E4xbUUQwv!fNNulfot8Dc;jCI=RC}eP9xmnxATPyXkJ>$Axr_9g4 zG+mcRp6e=vpzsn?@b`qM2AbE>Nw7`7IEJFozTC?2TvP4XW{YrX9Qkm^?&{J#bW>f literal 0 HcmV?d00001 diff --git a/compilation/__pycache__/caching.cpython-312.pyc b/compilation/__pycache__/caching.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..00522eb0b9c93c089571344655e6b5631e59ff7a GIT binary patch literal 9492 zcmbtaeQXp*mam@fnd$lRjK?;{7))au@L(7Ru=$P?2p>rRhr}kASR`}9bdRUU^R?AI z7+Ygxc99sf?Ap<77U!hXv8UCY5uBF&=fvKv?#P}_7wP_)!Hzt1K%3KMmA(H=jE>z% zC*6D9(_>={ySgiDs=K~kRlR!k-tWEke|S7j0x7|LUp};kkl$dzOxE(t!YYQ4YeXa> zlOX-Zlj&#lvZddmKiPg3o|XicwDwyWqmQlM26b%0p5*%-Gve7 z`m6M^HQ`PA`h5&>;AmT-I_dBCGsHrqI>|4sx@j?1qgdV~-({fIC)MACbz$mXZ6U{q zXum{6Uh>~$%e4)nL-L4DX^qs_098>=bY0^5*NSdwozy5bMlDzi?TyglftDt*N?LF9 zE!TRXRv2blh*Z-`DC|qB?f1V=t{xvE=h^qk5EJnI7f#j_U^HtctVUw+Vh?48RVqmV zi{^Yckx0IlPDSMbt$sjKLy;_{Qc4Y-gZfZJZ-5alIVG#1M0#LAqEM{um8cv}$Rpvi z3F)}(}0s;;1n#ENIxgCkXt32$N?{{k{xzvyTsirkMHN9 z#V*-JtB73Tm$Wz}r|7suDm`7p&VWl>jr&--s-xLdM3vL2umH#66d)ZGhG;mGk*E-+ z>1;|A!a^jS%*Y8z6r$zc!r5T~cODepP=z5mkr1M3Iuxd&pvELYl~PKY>cr$!CaWrf zn${bhr4t7FR8q4vrFWo`l1->VeJD^4CzP}xCo>5t3A|QdC5p*VT(cmjREb8z5tuS9 zDA`OVO;zF4a9^56Vw)8qL#6VJI>8}XjR}#gqNbC=*>Ge~N{L$qm_Ut%VH1)N38w@l zd`=Q@QHgZe*vzwQPN#HoqO1%Cg}#_XC73If7E%&$2o{3uGkJ&Xc$R`3#rec^YBNk6 zPN_OQ+(LPGu%DQegzX8LFjeJ9HW8+9!Ii@*~!=8c%*isHw?4=?C~ zTURP-`3e(~W7P5$CnS!&mZi9DSp^iIRth;LM?#g6ISZ63Gv`<+RVd^*DA`SVB7qWb zmed;aO=K=XOI%lwW}WdoT)7}(Weua&ob}?`oHbtibYCpReb3e{opYEBlYpfsQeg&k z$EELt-@?0)9VZOj^BnnL10kc#pU$)jgP5yLIp)XAHHINqTR52R@7W>d2GbK@HK(k| zDMbyZB9dlR@e*jZ-r?jMDHSxBCHRuY$O@i}Abk5uhnmcEWaP{?HB3RDZ9AXcy?4*x 
zZmA<7pX~s(7fYu)@P>C7(yPl?M6Z^7Ox)`Nf&fJKntdHq`w=!Snp? z`T+Ac_5c%b(JClXFJzh}dR}uKLy0|^7PARSbL-+8N*blAvIN5#P=YlZsDr3{UaLxH zR5>Y+NMZ=ql+YYXES!--QtF(>!H!hO)1nlxX`CV@qO>08Ta6hS849MAz>wi&nJh{o zC{0Kd#ZBYQJ&$boqiop>1C$ZS#>fM2!_?MW)}Qe=__?+N#kK>p-j@sf%LVVtrL}E& zZkBH-HEqRbV`YRDUyp6c) z3NE3vZqr|X_{oQJ>$-~Ty7GsA&@;~x_l5#^Yz08@6PF?Meoi6zmvjEBAUz8vY}{7vl%8OQc*+D1C@lXu6_x%m+hDN%ML(7 zU~LoRiUXdO1e3H}u>*o(0O_zvZq$;rjv$z|^E=zIb(XwRP+EFgY#jBS!LQ&*y zmh}QqUAc#KX%EqcA{X~ROKBz5beMdw4|EDBulTAJMy#|R`=Zi=75(Ys_@(2n9^gO{ z;&?@tr7L{ip6Bz9C<{o0AGN8CW{3D%^KJ2|qTJ{yE|g!EJVYm`aKL@Y5fc|;`2dvF z7PDi#)qGog<}9K+XEVoif&PWn;?-N0fSS&4ka$)kw>;)an3@;cakl46a%3RHsRfjPiDTs{8G1>t4Zw14> z@~wr`lV4mLI;MO1iEB8jk~pxsrnXN`B>1k{>67FNQlO6g0mFRi9bD+b^ajwS?%OVVwb z#;L;@DPT3!CYwGRr(1BqD#MsjNz6u|fBKw6sVstF2iubwR*fI+4znu*kb&*~%2$8Q`;qrLJL_wGP}?-6TtD)6 zJwNH0t=;*cwsGpU!iJr*wVhwqubHcFFV?qDN9KY@ioqj=ZO01r#|pk>tglT+2%2yCX~9yQ>5LMJ$_emP0(Eph z4AQD~u?mJvp^+5h$m@;z==oro?Pal}?J$aN#i0N%L^+~{LK+uI0}Nv`>5OL8&t9`4 zZc#KYDFLQ?6?@wZ|5fv#7aK~3Gof@UF>IhGo1r%-+C0#!CTqP}a9H{@|Cs#yMYW(bqcX3lx2U8DD$eTH-wuj&Vo9-#p8=lnayXvwTyj zwqYt*tPRZ7b`)zn=4!iUYXKUS+`fr7#^0DYIexORrhU3^*1cnS)#vUV^Hx%~Yn~^~ z!MP3H#SPsBzUjX?hokH9YSI#zYu;6C-u1v;J#k|E#3#Y&jfIz9Ep9mokgUa-w-&0J ziv0S=Hd0kHac2C?)Yd}VzQTdt&pmym)|cj5cNJTAm3(!RsiJS=PloQa&Tf7M25zj% zADIjm-5UVVZQePzX-{#}o;!7gSKcb_JvFoG?fengUeVq1#6hY~Gmo9bw`w9co|`%{ z<83bR%}-odUGVgMt00*9*Y4d%{OrH^tw$Q|8YhOq2rSj)2x@h#bOQKmV5Q5>%dX3A z9WhxFB*|Q{16pz|MN9fSm#`*nh?)RJSutKT%IWX6gd^!3g}94~cv?>@A%;cUam5LC z*aE*wtmBs5)Y(_~92-Xe+Tx5d990R#l&?$&8g#$ zGOw<<4SbV#=3RMr)CN&0Kshj$h#r}YJAcX`@{k!s$yY!C!K##PH(2VbJ{pdIGX+5< z6}=9-YLqDg1OU@`=Rtk&z381BNu7gt7izCVM&tr|RP6bpdlG>`m4v5=++MK3DSm8Q zYPZL2D_L%nzlX^LGhvCa10-Vk0Q{{pu-{Q;)RMEDXB0R?w3Nt)k1SVx92sS?4J;@R zKW@B&U`=X>xS+)`ZWr}YIo3R#m4XA~wopY9MGO9!y>%+KmB!@A4@obql_3#!)H2Ep zJs&%<;SYfsPD48x`CCTlDJoM%6=*gkq@#LNq~az*#{p^jqsM01~mI}j@KtYT^a2boYv$GRgu&o%U5BF!k5+~28$M~8cFZ*F_`G`O{nnRE3pCx0D&GIKU-&2pmZiF8*0$16QI^vsNVUlfC+Gt6^u5NWexBJBm-isd*R8B?E1e+1TH}NE6Dz5jC{@S1(5NO-}8_^^jm)K 
zSH4x(+#k88`euC1dF%b!<{R(b?4PaOlXsMat#iVTqOjw3*PW)~&euN+%m{sXYd%%v z*OdS;G_1-YI`Nr2)BK(Z3hvi8&$jHFX*qC5y8HSs(lhlvdG`~}>)7>( zVAf2ks*9dIlU$y-X^>bzrRe9XeK7s~jU30OwjD>wPHg28-<7K>75I12gM z?7#p<_9{InqPQ}g)?JgOfZ85O>kiC_@LZ;rdt5~JVd!Oy%mXIbqA#>3;6@;eE)W1& zXUKFtLz+7j`c5{SD7V-F?4~235EXDO8zF-*4|-gHaARpvvz{GR0Xx5gts=Ja8JY%( zRflO5>rnU&bdK(;u_N3T8crnz*(LJG;;{1Hta4d*eY1wKb^^AtRnJ>mtu3WBo%1X_ zzjRc6V}<7eYqrb-D8f#H)qlHgp1`B9|MWNbEcb!8`NPns>XC)BZox|9%~%R9Ky%@> zDStMi7>jtyx50DM&+f;#V4|6L#pO2YFwa?Tg4M%>#ygI*mn9S8 zjIz(6z%HRs3Hh-a2Vql1bE5*ts!|ACUWhs4*Q1me;3)D~v!hFyri#WxU!w;E>+nmO zUeNt0yw1igUcW?_lo_OV3zezRb3k((e#&9UASzhp8#@M2+E+X06^dQ~;O^M#CC+uZ z=VH&~o>^`!DA1awIscZTf6I)2>)3I#ZT~E{zU1I4Wp( z4=`~lic%|#1g5UQc9W_2`QcUt$C!*9W#C}pz#wk7t&Fi)h#XqtcwLqyz)Xt=$cpS> z8^|+rb=`A%SE5J|HR|BVVq|5dNQ%hCQKK$Piz4%)NT+DMWi#>FN)bg`KA_h2wjus=lRdMCdE4Xz-Jm@R%I2iJtHTiV;S#c< zYBu<+sY;-DWK~Kklmb;f%oBr;CK{U*yEHDImLdKurEq1MBPN{}!JMl;MOYuHjgP(E*GWeL__Wfwpln1^Azz39>)!TmF zJzITf#{JsZky1_F*oiOJH5d7YJU_X)#MjR8jYYn3>h)QEW64!BIdad14yLDWvFX_7 zeCxc2G_^dgBJP@r{p0&fyc_-L$*nW|`Z<16k>51Kx82zYILK}L+UC6|e{XQ!YO(r_ zn`OoQJ$<84!gU4LYTPI&8fhifgt7URZda}n!!sq2Au_1S9Dy6vgUV3Fo2#cdeneW9 z;;v;CP+XLH#U_F(%vsPIyxOK#ZpdPfF7SN753R%+7pa+2Z;oFc1HO6+?qs#uf6?b% zA?7QUW4>I^$X;|9(QG3-g*NzHd=|egrxb_^LflC==S!5Q;cNuE7egiLej=od{O&0twK>wt#QxKX^sI_51`8?2`&`&vU8 zkBj=3_h2{;|4?ECVJS+g=6WNE|I2WM(ln(fkr24ygzmv?0J9%phERfDfJ|dSf$5eJ zf;kW#)K3j6B~?F!Wk5*d4;Y_FyXkcpjM`QCC&=bmhG8I-#(2LVRbLRt7sT_Btp5#h zKO}+QlGVQ^`yY~)d5&e8A9CG~c*6S&{!P=|tbJR7Y%AEe{imaL^1T^H)0pkCZIEHM z6xQ#2Oz>s?4C`WQZm*dq@VLv}?fulTfUomw-HiWs=N5$I0OeTff=s`wtxV3rZ+gjU2hRDU0K- zaZhnUE@%pK{iY6+iQ?w4xx>tUE&bLmvx)1sb!5qRvit2Fb{4mU9sSM@CyQIdmVQ@< zi^XkWcYjVt4vS}nJ^i^Ixh$R?_V(v>IrT+r|9@bwpV6tc796gm?XT;oV|m_i zeSbqoLw{pOqlq(fq4K_Vr319zH{@u?ay(%Q=DlH(GWbG=)Dunp%^fZ5{nk)RF#ip+ zeBT^u888LS9W4W93h{Pvs7-mhVkX zFt1~snLEe@eJ^vtLghOh>sekA@`^+2->^t;b!=d{CCDw+%3)Tl4f{nptuUZgWj*+HV)2rr=?V%l^ zo$PN{x49Hmmcr2PVDmPl@T~IQ@?eYZy*=~3*Glh&TCSSDwkBxq+#6icvCqLt|AJ?N 
zZ5WGhNIjzugDdCBUo}tu>d>y>n$XVRTKugG9SE++-v<0`M0ykcHqY~(c_bUH+Opu? ztqb1Ww&30EL36i1xZ`D8$HD*KwR$I7eF(F(GPo;ru!jro{;nx>=*>NESf!CRap<|P z=to_O_x)Rgyx%s+Gn$7-7`*QVyzf)z;Zc^iA9)9KGvgS`J&4>xy4>R|_b_sg=yKax z?os3()8(FExyO;)uFE~ia!(-lq%QXq%RPnM)4F+h`m(u>3$3W5}@!L$8?!B99b)X8^-!-yaZ@S)&vR|s!Zu{R8#<2wf~iuLo~ ze{qlxqe-Yc>f!1g=#2)#gFQVV9?61>2ZlDCiFQVNyUw!0&s^?Dwzq4T=R*V0z)-j| zie^w=X|R_MbwyF7p56#rALts6L z4)J1fl#k*m>OHCRFnkdVp6HEqb%sO1K=;tP!1MIgEunDWFD( z=I9XbqDVG?k3ob*v_?YVZtcMceZlYd|NY^W(f*;8L%l;QqMbaTYsIDEjhi=J+!$IJ z?mfS9=yLSJ;K0fk(A6uu2K$G40VwEY!~wgQEiZ)Res3GPEZPGB26llFpRSwQl-zar zVx$hi6)ss=HfH^nGg(xDAJ=St)pyzu$(}9VI#axT%=&HjJv_#>H)T_q1D*Y$KtOZ` z2-t=(^8%tfKw}%0@7Mr1T^Cr}L`x*f^YwW7CdW76ho=wnZt^eTHkYwtwP9B|f1V-ZV?_NRWBc$~WOhw1xVIqL;P&CJSFn`Ii3S zu;>6beKFMAb0MmLCL21@Nf@tpU}!iR8L6AETidjUh&qZKMsS6j^%T5vas1-sn#t!U zmrd8d)pE1t_Sx@ue6QoK=g^pC)>nG%=)}>fys6z&S<}(CUb^|xo%$a(zt?=%cYMr| z^cGxwHlAA@w^uWW5VME+fwLo#F8$bm){#aK>;?C)0T)426j{<`IEv5CwDjuUs$wM`|NJkkpsn3miW?Y8f z!7z5z*QS>OwU4_lw!VPVmYWG5JCN&QQ%gYybLHH;HPKZ64fHnrczN>4F<&Qo)8c+ACW+S zX1@jjEXdlpT(p#GJ z7A7ky)j?h~o0SZX)5}|f~1r3)V->wuiQnP5IX%0n*Q zwP3GKx(lv8H&y@o%GXxT)UJK!^t;c!{oI{p_bq0JleM}X&2GfE_+=C9OS8r1PdQ6$&bWWg9EV@&1$yi;NFm?uaQI+n6#NtQ z*^?lHOb`mMSznX$I2oyTUbV~$)~6Hy>;~F z(c97Qk9=?BZr+ixtdG2TS2v8k6t|a2qe}pvh8~H5U_|NfV~u_iBu&tw5f2Nm_Y2>Z z#b;TJKq8{%u?eKlq-HioZmA1|5mzabTg0JdTq?PxlJiD%NCvr;)y{XI9R4{3Viq5Y z4)X(PL#mQpMHtdYT=CQjX*D(^hd=>i1vH=$k!wU!84yl7a%eHaZHWYyVNEe%6$0@L z$8SNqWP*yVrTkv%O(zLwdKmWNiFVPbCw@OAcTqs7zR9YM0Z(Y(VBKWv%1E2xJXcnh z2Pmc>%8G;#U>%vWb7j>xO0SpRsJdR2C|x6zu1S<`5lXity?OJ~L}gn+5|w$zj?8&D zusoY5HecH@u_IAXFBH@#3R;DN*6EJhMKcAv;_h7xCm6%dzegP&5q8wcfFk6JsHGmD zTM)^Fa563<_%VVW4N}gTId0S}kWwZtF6SH)t9P}0U(rkNOt%oCL& z(MC3Hd__lRPhfC33YJuKf=H60GNnXIE(ivhSY}kj$eJY%xsE(}9>qjXBA{W<;u@M? 
zKm6L^MEwS#enXIl)pRdIJE8)1cL zqz{O;=N8Ce$r!)K^;{5yDm0wmyS(FK7u`6=V!@)059~Yp?8*J2vuki5+S@ZcI2;ijQON%zLxYi!nB5&7?2N8m(_~|djA&!hF(z#h zv%%v*LfWZB&PNo4B=Im3t+{L=sMFD$Uq^)^lL$a_dJC_)CR~%Jr}8HcO`m=1xtq`Z zz;V}mWX$%lOpWGGo}RQ%``#+QS^lSo@465CJg0cPMgb?A7NdYrYvsWpj@F zIVV?8a;;^eWs;v-J=rrIeCy)Pi$5s*VdZ<3ck@q-WzXgpv%Akv)f(IdM|kz})UxS~ z@!Yj>`&wxZK0w0~`9Y9t(BNiVLEFOjyPb@x&3Zyqt@`!P zu&&vkyI9fQ&JUnvq7_n&HT)o={164i8Fdk-u3RL?WD=e&82(43SyUvc6v`ROM~RN1 zNgl+`i}5TYy5&vN8#4Vfx(JXMQ8-3|MI+=IvqJKeWh8J!u7Sw0&!@FZi9z-x_u{B6 zW@GYv$Q5|p);e3Xp6~69h;9kZ@Sz9^8Xcg30!$ai#8f;96`EX{T#Da?4CrszOAt!+ zOSE(amAsdNeH7GC;HQ8{Yam7%?99(Qv@+OBwRk2%{Yx~LN_8DT#>j>Rk=(ol_rIYM z#}N?a<>pGOZaA(xZg{SH5+$pJlGTZl%|gj$5=<4ACQGX6KO&Vi$+D^&&DWb#xm@X% zdz`)0Gj@0`pR1}f7GajoPp;1=!qfwJE~>`>99nJ$;60=QLT2U#ur!r%-!;WdhIiS9 zlnK))G@PThsEW59SMJUxgf#{r8qYSG5=#Yi05wgkY$LD^+TO^KT05E*%fkAa)&9vp z;9{$)1coFmIdIb<+9DS^hoA;WQpeggLnIBYJ`}}FQmVk!yaH2=7hsCUCc7bgx+P{H zAbs*MeeyU33{HvDVPL8PYCCd9wmc-D>ez%o!26MZg@8zH4_C2c@~IooU4L%6V7fM5 zxguV^9eS;inyT;4NM`H-7HE3#n9FsiRyi z2643*#C63`N-Ks^S}~N`uym=cA#>3~y+gs*sEve%N%ZQltxFZHmzfLvN`wtP_881X ziOQxaoxY&>_3wa8mcUrdt0~qo7EeU;Y*e1n+P5HMVFjX)5gRlrCL`1nw93Z#m>EL?gh>d= zkSlRW*iz+6cv?RAixh+@_$mc86!as&1{A_?Itht6vRb1VB1B6`cu&$kkbq&NZa(1D z@gIMQ2P6LxfnMglXEJ-*`j+da>rXc`dH2KdAGr{OcyeEPX8f6}r^Y)b>utS(0tu4t~Gqdp(IAr-U7+QWlPc_hu%%x2Fm3No=1eSSJ*$OM1(a zh2_Sqh;Kbu5uXQ2=Nu11N@;8|K?36<;)pyVM9O1>0@OaRz^f!XiTmB)CggAUq}1Kcy-VK=$XATy>Blp`_yK3(4xb z$?jx@|HhH)M<%x={SA{Z8YvhGxrV2qU~p*)h7z5EVPPJiX3-}7glezS|M?D&ss%t? 
zRUja)Bit#a5$=@I2zR7X%ky>01jKd4RHYSDl~znuS}|2~%O8{e|2mq+Q}7qmMuq;* zv8_l|waD~8ekBru9(xS>zi#hv6g#FP3bI=IhE%V4Nvy!UyCrm_`#--u(cE)pn!IQYQ&M4*Chca zM1hL-R9$cNdWyCs0I-f-u!6FR{O|bGZ;ZO)x*yW zu`gWKXU$qHu`Q0zT_NI>6e4Wnk2bsb-=@jNlp+vbs25S+jM^J|bpFGD+FN*NjA%Y1 zzQl;u1Kv@UD>^00BAPTKUdBj%+?S*Fj2ekS$U>r^pan8#1EmRaed@z%q_0Sgs*yLX zsw#y^r?DmeW|n9Httpy1MbmlF-l-6v_VaR#AI9rC@io(;xHalibYH;f2pq^7nVv3Ce}M;;VdA3*lV_NAcZ z2dPte4>(RzbCFk34pF8auBP#I$7_y+e}mxPknkT7{6|RD?3i#|b5FPv-ZsJ8mhf&6 zyc?3fhGc$mGQT8Q>ZkvOrBJri90d_-ukm1K%bagsbxX}u4TRVat#e_@r!M^@deVv) z8x${DPh9Z=kQ=uzb($GX7)%6Gyx5HKrbF4k#B5Lek6#C2>9 zr8Mq{Qclj_a3gj-mU2-FQc+1)l7m!gO_@$tg1D+HL0nfTrL;mRr4>pktx!sBdQ8sg z-xB0dV4?OGB0`!Im+c@nifxivC;Unz6g~GCtdq7IJ7ZuL)Sz^OI{Rl03VL35{S(um zn2ku4ppD1|U~&MabQv*x*b0_4pPmTJm;u3|(+_1Ux|>)wW5!;_ca;6g^D<-UY8pjv zO~_X#kNQ`XRCm*e=Exc`OVVQN*MB`*)~w2Jj338WJD+T{61;ku| z`7+W3u;FLrFb<57$MY~>n-@dA^D z4-XYOe(42*3`JSw$vAED1uKh|FSTW4QAOj5e2(R+&IoapD?(hyz)(tKU?}C}3d`8> z6c?p51_r5AYk^LKgt%HL;<`d9r4>pktx!s7g;J{dF}b4e5`0i_jM`5ak+>qSZGEb~ zO=gPlE72(QSQ=AA$ce6dUq?W4WyK&)$O4l%uxY}iwC0!H${wo@NGw7?k<7}MEtC!8 zI9V``V-GouL}S71kgKnReIYG?trYvqR<+EWG47J7!vmD-5M8??mj}Aeo+M*SVipcS zz>-qhfi$YJ1%LPc5MjF3dK~6az725=4)&oh=rZZ7>qJ%QrSf5;>24%8(Uqo34TC0V zO60G=UXNOKG*9|6+fZge(u2@9ShbWbYJJ7Z+uE{>8e_{fiBs_ZKlv%am_wzo4fKH&H6k&5XlX z+Qpnsn&V^`jSTACgf%w$mF$rCuw8o?yS14OC!UB`X3|RAOTmm9h;~_+9AQ=rM3-b! 
zO*(^6NiRJ`!Ivl?eWhqW&kjwk-oUorGW|OfB{9q?<^+fPsWQykTSPvt&l8#u9f3d> z>Cgj#2w@#nQdy70h);*CR5Mo-goh6z0Gai%J!d>yu$N30-L?C%L^x`HzxDUp61Cff z+U<$jLqhGLyY|DgtvixM4RaRr3MY(U<M zY@%Vk(6Ihi^xc=;-gMdY;Oar!iG$p*>5_?u z^(W#bMgXosGhsR`WtR4{jD2XbiupRrX#r_T6LTU%;ofMpcK}C;VZNt=wjRMGAv1yp zn=G9}7sx_jw3n={F z$>kL$yvqddvV?d2hu-zc+=4`IqmbK}$ld#4?%t0Iif8jT%;awx%TDI{Cilg2{U7FT zo_cEfnb!ihO+v%wxP3Dl9nr>WY`jjtywJ$d*kxSW>Wy_(9qXvd#~N|3LBc!+BbSDG z%J+HXO-SNVf>9dIyJB1ZHe`1%;<2U7I8h0TYgC~-MVM)=_i+Wnm4-STT+Wo+j zXk2thdto-O6IL$K0MXoiNwgg#u;Txm?&ip7tt&)zzG>qB7`YKtlG`Qi(f;2MQuV3bpw!)OT%C=^5WRAnnJqwpr#4=L5OitUDskPI-*S4fML~qsKqn}g>o=Dhg z1zT;(Y_{z-O`gAT;rfMNaulc6>*iRtG{@<4pX{Q5dbS6DxIWqm$o~Kl$*9V|MC=nk zf+q7P#7Tb=L?BilI2_o0?AYo3`<~jnKXBlwGlx&K2afOlQsB(tj{Sk-d)T{TT_EkT z6Ycv0XAYe>t3G%JPkzF2UUVLYQns7bJ0QaDPoy^pdxN5b?~lM9M1MaH*itlvsViIO zBJ-U5cB)KnxD%E{F9crfjbai9!(wTa?0Pc27)j^G_-aNZ8s#a zyA+5HUJMNc28R1Zw+x+lCm0g#12CpZTV+HCvuPP+ThaM}09L@H1G8h#UAhKjw>il(MPjROivcs49uNl6?d+ zFr_A6H^p36Y%y-s6*G+%4S=&4EsClVDkB;`SlSEhALG6Q^L@r^izHdXeMp`VX2psS zX2)Cz?J+Y#N6doI8M7jE#cT-O^OTfBL_h9(wE82J8_PoIjTIuyi)ACskJ%9x#GDAx z8wd+yZiGd#9E8PDl|wPw#bCs`;jNOG=R2lXexF8*|5snY7Lmv%Dd%1SW3aPadIkl~&Fu-5jH`0T*~B9}?TzSSsqX6WT0mp*sB5705o%AwY{nIHp>MF;TG?@e(mh{mi{*UnQOa-0C>%-j zy(^Xz^Q4t;j=8>;57=B`@O-fX2?QZ92U*rWvg&|px!2zhd&2(nA-`lrIOu=zLTJDb z+r0xle$BowPwH(LJy7r=Bc((bo|6zUNF~v&CV&_DpW%7_Clt_*dpS)^c=?&WBU0e$KJq8mS_ z8?JEYW#ljv&3{DM)O}6G?0A&u4uzS8JZ6BGC)6Vrl9^K3?p{~tQ0Mtx(14f4EEu20 z63z?sj0Mk#mgBv9L?10z&zv}Y^vucKSk=z%Z$ES5w3w@P60IlwJjqu?-zhS6+&jXo z^PV|yj8#nZoO$MON5`@K`vR-iY`V0LryUpCh9V}F7VG5g?7?7wRy4NIM3|*$m{8s% znNSu>=f9Qz;QLrG<&tA;{h@OK|RXi;dX^kwLfkz|HxH1S^1%>an`>o;Xfew56t)vCHyA@ z|A`s@Dcbv##nm4)CF+g{bw`rknkkTG=6cV6_2i9rU2UEW&UorlHZG@z2>ym?YuwxV zOU{(H-;^w9j5qK2L4nY`Pbk<&kCZ@G>02HzSexR&2qwLIP*9GC(pj9h1om)~<;!4n zsCwCi8-@8>ueym@tVp=m3huSP$ zk+I{+-12ztvgyO|*6kk^l*bF|$jI>FiNhpOPWbi-zJ1A(nq)~svc%6K^*Fb4KPS7e z2(|(scUGU=BKWr8fvN4cnr@%_(MDK$EGz^zO^WC>e8*!+XunCpu zh?Xt>B$(l(C52XbX-%h!%BGVtpQKw6DxHB91182{k1W%A$f!eA+23UXr_Cq;;N;;7 
zOCis5B+KgYlbiIfB0j$|*|dUw8*$Exe(BH^N&S+Q&12S|$W}Siti%(a&-!s85f?}? z>hFvTxoXd8w~(v$Sz0c!i5VB3RT(nvR?ze+Om3jJakx&Uw2ano+}8$!4S_~mQD;N& zP>UZXc#s8IHAExLGwCkANHb?-*CL~_XbEF9Oxy5bNMm5PrRjPTnQvKneI*7ZQN32E zUW-vmG#wP04q|w2T4tmxfxD@x4ca6TP;Nkc*5~3=aqc)no(J z*t_kDnW_#4LpRQ#J>-Hm&jB4EPIN$vmjk$Pip46%k^Ov}lx>Fpz)4f_a+0Kg(8i+@`QOA!p(Gb2c znE3%#{`nA)Aif=q4)}WqSUtLXd2lIb$>tjQMG5i#^Dt$8Q7TT%hTNw&48bfhx02a5 zuZl8BZVM!vh6@28(i}dINQTl9LI$R^Bo+>%PZtv>>o6k2bA$ufrMU!26P2o1QC@Lx zX7RW1yaaS!0x;S)%#-#}%thZaN?($7@_4qEo@I}zjgnsYW8Bb8I9$*iQ8wd+lNK^%?t6tQ4h6P_h zWIhqrLV_zM-ZC*8AZkykrO)F*Lt$hk{}zmnfuV38>Lq4LE-Bz@Wl-8zDfzOHcECS} zf+JL0z`E358;}^Oov*(LVM*ovd-Pa@R(|+MDXB=5vW=5Gh}&1NUSlTA(V#ZoSguuPBp#G6&^4-6VX{UWGk2SDMuI3{s16waP2dCiL;bj@ zLmfbJWL4IJ=_xIY3Na#2JF3#i>iNKE-ko3nfuM>+HJFF9)jRLlW~vXw-Po2@l*n%s z@*CsJHo^jM{;n~oe2U^F;QyA#-OHs(h*#CijTq}1I>VFRHOAzJGJ(=;dQt~xT)K|6 zXqvXDvS7g_3eJMbX7*z9*tUpL?bu~>q{Z+ki|JK6EMllKQ00A`*DciIN=%Uq|2|}& zqgFDwN5T#KTIiRC84hr6+UI2nbTW@wyzmJ@kF=vuGL-*90O(QZ6Xf+(%)hw%^l5pOl$Tx!ixbrWYD|SX#h{xTtdSt9?ikbih4|LCWMxG{D(RmS-$Cr?dmn<|}N%}fl( z-7>N~flAG5t8Cu&O_^wKQoD$$LA&5i%F*=ddh$vFcT$d~cM9LFc)Q|0MN@9CqYXNE zI&8xs`p#4Do_qV;eTvT2FM0ddO-}}e$-`eMgllYLgCy0fpw=`~zzE8EiqXoWa|H1| zCI1S6UfBm{ryNhgcc?*!aTm(IPvIZSo^m^F8$Ze`O4;y>mB(B7%1h%frLrjnaey!7 zpqP^@-+bSN*e^@0wl>8|@~=46N-~w*g<{o9nc1H`DM}}KB<5G*+*IFm3Q78>j6{4C zlJrT?ql1AVez4Eas9i>kc6Ng;muOKaZ~gt@A>B&vkLXUl6_X3E-bWyNzh#_byyBQN8q`ja3p7dso${bXH$ zpjV~Kh=h^9B{}s_2i8!20`lZZ`A~yEkAvB*1;1w|>6f|%m8C2Sx%H�f}06yCmLJ z35PD4D#}GQWCS4*2k8W%#+!~>KxFCPlHVrk7=)jEM>gKXWO^#i%)@Oqku7Wzs8!&r zkjul1wtSE{lTu++GM$rHaB3)v_yaCCKBk$n8kHn!MMo`H_!AYYgo;)1)w}O(_|c}B zinHU#$M#+=Nfs1M+OKYpyZsWytZ%AHG5@CrI!HVA4eA#3EgiWIg31MBVL&14_CG0W zr`(P7GmntYjBvpmJ{0LkIT%2wr%GEh3ChaPjG#=D z+@c!N?C>B$n2|}(OfTD+?9_N7RpC*ZNM>#vLUG#jpiGdUiTDims&p=iDVweQb>#B@ z4uNRu!alEoh`dfn{VCeFK|+5%*tuim*u!_K@~$>VlpGVFJVN`>(8q7{w`*>nx?}pI zm+q|o!>>xq1go5h--uyml;&5YRcjN{BU>K5BJ8s?-|^YI%l1p(A|hUq=x;=vCj`J; zuZVd}kOvlroFXw#ZxMpGuMogVHF^rkkwwW41zG%pK#})5`_(wG51es4Jm%|g`l!f9gtKQmn6Q+zQzWcxrE8eTPPw_=r 
z0chkLPH0Ih513#(=wrK6at`-XcjaWyjJuJ3v9Go{>Gr;IV*JF6yK>g;AwOa89(UiB zB@3+K4H|uu$pExcCIx>*?aO2UcFGLE&NK#Kr^*2AR2hJs!~kqcxf|KZlV5T8m4=AQ z?n1Yzmx9FZokiB;%c4V0=}bw7i%Fr~m?jP}gF2EqR>=)E+b;oE8wwZji$~@H9u!Qs z_@xaLer%xdv;BJhCnl?Ikqgvu0<7jsky|gFS$GG7!j^M&yv841Osx#D;p(JU+(p&( zNFy3>UwuDHBnG>pJ|(7*I(W{doX|89d^h1SF^6%>n*yPA{cF1iM3=OiQaU$4 zWlQ`CeUGQ3K4OmafGj3ofN@!yj?)%`HI;|)5e_|u&(J#M;T9H;m-{6zr=&D2(S+sD zAVL~)cg3!m%H5g*aRDLlF3d#zS_J!Bi+EXEj2>Uibg!O<{+zx=j#L(*pFw?EK{cBd z!bfvbXml(LfGJbs><~+Kdm1vsI7$n>GH*b4AuIDTB7&9AGx8T)q%rlk=$piQa(yJ} z&$v*YI*+tl>g+MWN~^+q-n2>3UrLL?P726Pv!vb7%d1TRoYBD+i;yHI%%IW_-)18*eql3pT~wo1{faV<`5ap=tCO(NK)B8A2q3vC($A zlASOb`-}^qr6~cdct!>t{bP)IBLWc@8+$qwY>Qb`*5{^0Qx~w&0JWW7VaYoo&%Det zZb-B-Y{l=TVXH;KdIKIZdChpEeUCLfLy|7HnZ{-hzCzdwySTR8h5&ch*b!B$Fg0=u zhex)I?Nvu;x^8-Srg%NZXaCjJ$%2x}=6FGU++ELxBdz}l`Lh0h8f{7MeZ7RSxB2xd5x7M zOh=SGE*abzH+kr>z+J~!t|O42#zz41i|R3G!EO=CTc-El_9ZqS7B(NgTYhBhn4;gv zP1A2Qrs+4pv=-JUe6$NN;d@H(Jq1n3r{#DfRbimysKgB@IqK(AasaOr?Zcm>;1mU? zDL8`ww+K&2Zm*f9KxLf_`J{)TaumE?OPxldRl*Zyn`3jzV|O&Do>&l9&(R}}reo(K zT)o`SMk?hi(ljTCN}7`thv)|lKW=>=PP7r7Ta~Uvd9C^PTHi6h>w4RD`x(ti_RSn4 z{R!Vp5X<_ff_kEv0!@2Dm|Hqa`aZ=n`KGP(G6imG zSW?oRu&AUwVbO|KnQy|cv_w(aT_{$)l*RnB<#%`AM5yLSZ*{d_s3;3DRdw0(D@5X@?jWkd=+F1NPCx{&4O z&yvHkr*l{;J$@S_MD!q33hD&YB1Y7>LJ6J2;Epmq9o{vd_ZEB?6yK-?yFMVNH{`Nx z3vyXz$p@^Y;oAf7Qg&k8Mb#zq&5ULzHbf%aiSE?XF^sD^@^F(#S%8fgt6rK|XYmm% z%E`v&MB^5raZ93cx6rtIrg5*}t^Y4T~6ayQ_uDknp9mUk-p`p9b|GgWJD9k{(e zUa%+b-XkqttZnHWOP9XeX=!C7{v|}xNqkKVG-@4p7!3@uPZ_!+Fi?2bh}8mHrI755 z)yf{V$FgHtDyzjGpdoQ)swmiKsH!Gx$_hpJLo`Afe?>$;8zu_$999McyvP!2uzN;k zvW94!j!5GK!$JOCraQ{!En<6HgyNPsY$|l$xiC|F0({lps|D(qrEyyyf!nIPx<&B- zo;DIHvt>r>(nrDw!+@#DGMYd-#>hI)CN8GtBDaoEa~XO|efUjWq^bgAWbEa1v29#% zQ7JP)iCE`LHNTo_jbSfOg)fW-hy+|qTMM|w(3S+{b zqW1q4s;b|Ub%ru2I7JOfXDxCR)`B8B)`BG^)`CSP)`CT=R>`adekJ@&Wq0AD>h*cB z7Om;<{Ggb;{pkZI*C|SwzhNZ$qioGuKq)iuLif;$%h-tZ{so_Im4yMj3XIYb2Rf_ezX% z3!CDGhY!PYb*zT=r;)5GcMgX%!_=(2G4nhtyr{o7IA1B 
z#97iM)ZRd@t*gD2?OvfRURE+#-ehlw9Ul@JOAcpOQj9iP;aCKV|4YhH?|*WIl0n=iZ(>ou90~v(JiTL|9NtCp>t*Gc0*pVGNh3 zdg|6RS%ys4;fGnV3b@49a7d-v$uNAE;`IP%`eUEk@k z{j=r%8(Xh$oqAC+yaq96%lj>N%b$ki7*==7Un@`MHES|@gSinmu_ zXGkqvpvXo@pPP7Ys&>kkX1KiY*w_KN1Tj&-KCjA8&ns>)rBa#*TgD zE1ldnT@+uv_s+R^!RfgBw1lG>2NxuiO`A~p5XHI?r0p^$;+64oF+|d*woc(Um)NF`JM=+qsgxg2#p8W%SK~k1c zO+}Xk1Cq{?f!GIl04KRCEAQ#{?;vH~bTVr!)8l;0fCY7Er*=R;`}(*1^O? zvZyp!*_5nkn{#DVd&W+X`Q^H))j~l7lyy7rtQOXSh2I}{OB<`whFE11E70onAvR=@`r(6F^R?09jJGq=P;WD8j&NDhGXtxyn$4t$avD*!1Pwc{gM6f~|4)Rtcjm z%x*p$VXO4VThw&g<4beV>~NehrHb+$2TZD(ixMBh8haa~E2(h8!KRt}}MJUT!69Dx%Bze7#kM@^+OLQK_# zlRry8Ba|xHDYK&Zl}3b$?Lvv_r4V1ArIu@DIuF z`sU|GH$CK7stoD>05J0_sqh8>--8S((YSbogBR|9Yz%2ITgT0y_L-OLf|I#G`Auh7 zm)I6^!TDJ-w9a&fmgrMP^`1ocE>04`89E#$8_Uj-t5Un#Kez!b#U@ZWs-SvZXt2UU!fQ=#(XDYuy({4G)D0;2942!DwifPA`j>mm^yLZ^B_8% zynX>(kh#qQH5(yYKpJy9l|OZQ%6`lDZu#5gclQ18$a_cb`c91P|G2o~+Dj8JP3@UB zPi;zLe~VA!I5?#65(9gFs^nJvyDe|Gz@mA_dmVQRPpT~JqcX?v#thba9tIe4^t{TX zw**d^lirG?w&S+5g!sjy(s6@!YL96w zX?e-G7`JOu4*tkGOOqjI%t4cBiP?0Vu{~y&Ib#!IDrd}Jpvv|U%sBZ4+o3yN7p8Y= zQM(Au5QUdb({yIYM;0Zc44_EnXUhnL&fz;mLwnP7oyykV%%gWpVQeqms=kvatlk?h z*c*56W#gDOQYvHnBYZl2q|%P)F>~*jq1f7qZJ4NQ=B3Fij5Y%p9npjKk(f}!EuxDa z(xXkCk_n2;&7BW0p(O6$2WaRQII%ZKW34GI?TY}3rnw{pErW($b$R?@X)=YEGUP2+ z5^&H?0SP$dGkrSVQBqPqF7KGGqMmV_lR#xABDEU9-mgooWSZ+Sn5?xvbc!AvSeBd(T=xUOJIY2{K% zE0$7Psg&CA=&a@Q1bGyE3S7v16X;pXhd6zfDmiG9c}t2&IGYOYLf5L7f^hE?>McR!_!uWQDSgc5A~pnxXR}97wb2oE*96P`A!VTDDlApCHVJOC-Xa zj-;6*qS-|?zDEaC7|sO0z_+ zya7{RoRKG@C>^n7SD#;w=zcCPr=4C=TJzX-r6^jj#EpF3z#X>SFLW>nt;YMAS6_2x z4)*M?cred+Je2au=FpXzMQW2ZPZ=^qusT|Jb1-`VHs?oekf<6uo{ibaLjDNeLLaNA z3!o1?`@h5e zt~Ka>Ge;>MzOm|*V)`~IG3}SOkQvPY%oV{do*56Bp8>6tLh;y*va46`9 zjajl)?uVPE;c$rE?1p!hV5>iJv3F>QY`^0{sc16k2yEylpHL$^bfrGL>h13g`(ar8JlZhW z4Rew3#u0&SdK7*BvScM!YT$0Qve^BA%5L$CS(1ve$wY_aU^TZXOU#CcD`qmc1|MMt za{0fZeuTH2!QmlhoEFL|%|NOhn1nsnRHvO_745;^2>J90GB-dxD+`m8lbqJLCC^u~ zSD{?eHtMaR)Aw`^w7yDTAX1+HLkgNG=%*k>!5su*Hf*d8U_khHDdl3Gx#ZFdG&7q4 
zGMvurt^XH#%+e2o{N4z;&B6GRqZ9JmCl=9DWT^eT>{3VSQPIhK>EJkiS4dnf+j{Nd zL$Hk;=pE_`Xa@Q*paBefI?zF%hleg;2Y(nlGXGzEUplZsw5yn>kW{KWFa8}o0m8SI z9;68&^*D95Xjhx@ceo*XwJ%GC<0ZE^v`D5CxP+=kDgXuQo_2ZtRt1YEvC2klm`0#i zXZ2pdV8V>-{mvF|ohjbV3}Cu)U)eUkZF2R8u8O2PpLqjogGN{K1_rLwou6=339hQR zs}c9197&X|5Xx30%C-w-+h@vlqLAF#!kX{=R-$l~PzWo--*V38u9(SPIc80kS4*4x z-W+~w^ya8gzAcW!!>+Qpt9I6%|4RFKd&+FeTRms7z^*#Hc@-oJN)rXGLP2YyV3Sa= zDN%4tC^$wI1&)m$gDH5QbIeXP^2J@{A4>%!z07x*d%`_uQ%d=1MeD@#w`yR=zOrGu z{;icaSKdA*G#?Tw57X)8GSHY+H7GRUt`gi;Q&|cBM!~-^;cplG?f6I)+@MKMIdT*B zQo&xD^wdoqx>YXJZ;#jQfMxoftsk|nO0@0}T6e@-cBX9RoZYkyuAm$y$sJ{LoXz0` z14(A-j|=YO=yBO@T(YV*QMF2_T7`bOvGe-QWKr3*zKOnM&FZ-v@c1A<9WCs9b-b(@ zu8aJu;y7LH-;vBK#HQM7v58o+as>`*Np|h$9JU%((@L0nCckA>g1riE-ja5=u@AE@ zK%9D!e1MU4{|(_aY<1ZOPfK6^TQVf2sY@ApmQk_#CL%!Q0I$poxhi3%-BT6|Bey$v zgOEYSr4SrgK8ZJG*B4%sT%{(4B)Dn^iT{RXEZ_^Tvr5E%;Wbf7xG%gWWW$DCUwBQx z^z~gKC|skw5&--5Z}fFY)i20P3A&p%RgoP6E!q9+gu!AXvs9f8Z1aHx~#{{_C?B)h!xA|4@Rq*%`RCNf4Ej26KM z*5Kc!)$k=e50{w5bz=u--5ZkLV)%Ycc>RLcPfj3R6E2o+5WEfd9JU|`K%xEMVcto=!gycreSdD zVjQO7LS(oAsqbNm5K+snMZQH`o6)i*FOo}maD5$-YdWnU49J;+>IPUQzL7hGH1(O66FUkFG$$)-=JY&%!v!J2>v)fGtb4mD1^wZX1Nif zfr|yY`2lshWZQDqm49{JWZBf&yDnJ5%4f80=KnG8PF%e6;BwCdDrCsl9E59fTj)esmV{$-S;W^R|r@mc1bqIF3}wd zPEbY9;w}W7Df1o^xzfDP(QoRI8Ll!vHMiQT?n!Pn5lfvj>HKGA>HKF>oSv&onJN0; zy;(L{MMa;%3WKZZjOc+k1hB^8-t%q4(cUmmP#`S^)v#ONb)ijN6529W1pX@3d4hsd zcuBNN&v##9trnZ**Dv(;VEG6P@<9rRB5j?+aAro{Zph1JXP7*4q0N#7VzEwo3zHBY zG0zW!Wp4{aF8B9`kn9Tdhx!Nk%gpDq=z1X>?r)RWNapEO^s*e?s}fsFB15q{^B!8m zgKq?EKmq}Kx`Zwm`z#kM`E=z;@*!?!FH3B6=u)T)4dC10*$vKd+Ss;E*{d4sK{+cx zy%~{AM0wczL*dRSd5VZgYjY9HWKXrRM$2yy84F*%ilblRu6Y72#p^6DKS$&OPkRd_ ze+~ImoH|yL)P<-^zKGs+kfS!qhE*S>buQx2h5h}al|pG+W`k&;Nh-PyGtk)2^Y9qj zNM+rjfW%VFF&uN<#_yrzMS_BIijmNg{}Kf~6c7&5+-Ng*v3wkPqBYpt6_p&HkbMO) zHvmtx5D7`1gy0iJpf?%o{@*Bel7hdW;D1u^4;1XB0PBg&-RDw_%*aXf5f(RTJDNo5 zuo>1uZ!}O)hX9rjC2`XZ{w+L=|6Js0z+%c`GMWB+7H4Ys1?T=b*Ba+qf5sL5j63o( zZpY8K!+*oo{+v7dE3?I9zHjGD_4mzY#PB$t+V(eG%g?#f0(bi7+@X)$jj*Aga5oC> 
z#u@kWD~`XmxW~@jwUnjI4qNrl?5pAA%v*f*U&NR1zMH!Tyk$+@7S%opgCV&9*wS;FE^ZImklVne}Oo zYnRC!=c=cCQ(e>RZdqhW=6>7HK<6I^)Op_QDfg;r+_E8aaQ)h8zHF|6B z?ZR6}gq1tv%{zsLT~ZMhoV7r(RDp4iTT7D8!Yg}W3#DlV{PfWO)>WTYUjP z@lk&9wblu2x1&}x;WTrBFQXby{FLv>Xumqoo?{qj^Ei{``r zN~(ZGeQ>#xDrC_juDC2!%%UY64i)07Q$8WTAyvxqV1AyNG?}Uw3K~=8EUzLhuaf0e zao8FGGm%qgg`%buYf#G1IX(Ak*zK?-y}^059JRfOtIb7R*IG(Rjis2>R*FeYrI^%G zib)NnnAA>+NzJ5~)JlqFG?KD1+DKU$O{A=h7E+efK*T<6*uP+*$J(N@B9m?_AjY93}B8{m#YAd(9?K zylmBdjv~KKov7j5#lPfCwra?NzL^VrVypi92x06$%PBbK;Xd$WA1k$dP+~&({{f)6 B$1DH< literal 0 HcmV?d00001 diff --git a/compilation/__pycache__/compiler_interface.cpython-312.pyc b/compilation/__pycache__/compiler_interface.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d7ca086a6db62ebcf7f5799b4de9d9a04770f80 GIT binary patch literal 24254 zcmdUXd2k%pnP1O+V+NSP4FZFMAch11-V{#}1P_rSL9Gb+fREu2-5>^>1KTqo3G6_k zys`z*vPH(bA+)KK>8#?Ak+Q37Qi+nPRFv#`vz5&zGa^(oYE!jztz0fwh-wF%DU(thhS=5`=69z%JEr^0> zhzVo-*Dz*a&&Dw$`!$W3@N0~jBXTsA1K+on7)twftW4@L!&L!i^cRjMbs0 z2C)|3KEJj)z%SNemudmS4;cPgXQ)xB9cH5`$){)>CnDq0+!1FG*XX1ajB4+K6G=H3 z3G$Jb*!HcuOD`(D}9biQjb0QL#6vudEd{PRFQCabb(G*Q` zn7vZ0)3P)Xy*6$x86Uc(rL{^gZwpu?#>iK(f*C|=(A7yPfr=$oQE*yLj!UT&)t(Ao z&}+I7jHFm5_9lqBrE#o-*?t`iLD2h=noQ2bMD#P7jz(h9IdnOll429VtI_o2XTy;O zE)7t(Woav~C7QTIrGv?<2@DN-UaUQsPGa;@GZ)pYXwZODZxJ7ivf+#ZL^6IcnqaG( zw?^+HLBlGak)>dACe2!TF_MyWcomiR2U%0mZ&})eGVz&MIyxPbg3}S1nmZAUCq*fS zsiYpLLn9Uk>>}#8Dh01b5~wQ~Oie}rghnxiNnX=awoqa*0*gHHl5SEf%G*dwBX}s- zH3-z6984z%@jQ4D=zlSpkh=OY$Fxc)hyL<007#KbSZI?{L@WbQ@>&Nna!H9!$rl2dAUcThkHwl9b+hZD#MjCob=m24m5SgVVF=$z)>i z3TV$DA38Q;>b)d~brc$yo>hFsg~l;qu4}jw{RS%YbWn3Z9)#<{N>FfkR&gsRE~*mb zkLaCZm8JBIoETRLK4oB6B|d!<98B;0Nentr8RVDErYyyzD>TQ^1m-Z#7v6-NjI*zV z+*lh_TXAJzHBg^t(h^qpWF$2iOeK*Uqyg)Nr~_Y#v{tlb#n)QNsyHu6!LM8>((VBj zVwm7v_}W*fn|MbR53G~xYkHTcL6g$>WzY&VqNF2wj%W%Uma#TwsW+*p2&7=tOOz%e zK#T#;2~NsJSJ!D?p&BAJNI>M*o|Y+y<%;ax}R)U-4norsPTx|DQYE-rsXkn1o* zA)_3?kKBYLMUxi{%JukB+@+&1*FL;vk(7SlPNmfa5)gcKawA@hD3(idWO`Dmk*-DJ 
zKzFP}%v4J8QqgD(OOubP;*r5Zqj4z=%$}AM(`9LP+zJS#SLH&w4D`^)Lv0=FDyy?d zJugMCfM}~Nz=Ti>f{}!%tp}NAp=?bt!W#*yZ6azjEe!-uF=`ULP}bB7L5%eXNJT&B zZZIxI5-Gh+H0&43dwu~+^a`z1aTe?ev!lX3M`&<0ngSTy=@2UGQYRJMhG4U@Dv?wB zH4uC@dTBBpOeC+;`Y&5DshR2Nq@2b}u&;<5C0?k)&`^>T2YA&oZ4U{)un{#RE0P_Yd`Vfs}ib>6eop*UL+z=d}CWEL$k36BLtDDzPbz0p&AGqz<#dmZI7sb(m;{GQg#) zi?>~0*gDvWNHm72=Y*A-MXP{K*ILMW`%7ym&WSc^+Dw#;-UZ+|n{64=F+?PhwGWba zLX5=F+*1j0hQy{JF)|IQkXOx#8rDAqF_Hk1di^QFD{81dV~{MOkR=&8t@1huJ_<64 zVwmY7oC~^$Ge`|AlFNI>kr75P?^!v)+yLSvw^KsgQ4Uhli6ljYi`ceCNNsA9=9-7A z(ohWXAu4A_0{(i$AoyAs{Zy=ciWCiOPO(;Ttt3N-FDVIRR>aCC zDFFm1HUs*kl2mnoF6gLR9QL^;byX8NN+k8gE{UXujU>~jh_S>aXbIwRYF$X3o?Z-u zT%4m#^cY04W_rwo>~jrk=Z4-x5f@SfddSS&K4wDL~ezAqaVsad>O`rsdHyn;6&@(3Or0~{! zF`10Xq`t`xO4^Vp#))f+>m+g0vka(Wjzv+L3|C0Bt$oEoZ=6C?>(c10xfK^ZxrLhAo4XdrRtU~Yjo|byTwCf{eBqAi zj`#y-*1I|9*s|)SZ_gM|<42wv%i)JU|B3~_sLEHzDz?#+U8~qZPfnqsdBsI{ZmmKp zOI6f8^5Tw<5sl0^{`5-|X;*15i#j@K=%P(DLGOlM4qJmov>+wTTo|*>8bfxab|`i= zGMjn|CIu2Ajss5~VQv^^eQ|OY6r9*}LLwSOGT(SCl1lC8YS{^3T7snDSnvWfBQUWP z#Nq-J&d4Q*3epuxCK*yUMQ|odA_l90h`!2wFV3ccf1*YQxnxa35arQGZ7C>@fulg& zYG1)br>DXAgEJsWq>2+@px`x#7YwIa{hv`ccrz;JJXkG=?lU!BCB~uzsvnP{c?7*o zUMiKUF$(E%m95Z-C1+#;18IPSDOCm{QSLKGA_lD;1%3ZbBy4p&zDwq%{=wft& z$vmJ;G{s!fn2ybWUV_CW$a17fc{G}(vqhX?kF!MCD|Hs3-Az*fXwBeYn-S&*q6ifq2VAIFb(7lxSJd zC0bX^+7L@7FV9S?({XfGg-DViMBbTrd{z}W%PJ;4DZ=n1I0@qgYbrE1Rx1-2X#uQX zTS-~Is<`6EimQx;wU_ZwCS+7j8A&IEgOVn6(nL)pW3l8_YC3OtT!NM@Vpbt{Brm2T zSeZ;|gDe+`ZN;+L%9taomXw(oDFj!O^5qmqrIB%+IfIH2NdX}cR*5v<7A^@HfmjW{ zW(Bz)cVKYVOEVE!Ooa?g=8#gXm#@-qidwCUHZ4%sV>6Mo6y^;LOV?m9Nu{8+ z&h@Wt*ZR3%0n`*RGqlv}U)xIRIcp^eS2SMd`51B5b_6fW8&(VHE<6oHmPEB_bKPs( zvnJE41c_kYuZk(3Z6=S~v6GA;8)I)v|U$ zb0<^oNXRa4#jiX-$si@$C^<~YFeS$+IY~){AaH=*Q8JCc6iG5KK@e!L*jKk1EIl7v z4VC~Ub*q%HI$3U3?KpDl7xkF_R(}TxP0u{+*YH6Ial^d9AY=?v)bxt$4fT26m=UIE z&?~Nt@eLJa;9W2wbHHiN|yGLOiFbHFLbFp;XG;1Be`4fGK8 zLl&i(vssd}nd$+CD@u&8$)`|OangE8&8E^)T(L3+D9Wb^s70}nrje9WJYx;d6xLo+ zCdI?6Qs1$;1RiNhLV#dv73&3m0|oys{!&9ofEvxM*H8Y`yz^mwd%ixDs}J3;-*SDV 
zV0OJc^5V$N!^`HbPppDv*G~c)zW4O?v)cPZ%jV94tM2*;n?a?6S?waC2zzrf0l$P^ zD;NyyAZ66SjC$F`-oednP^l?(O7N&(G}W3@XWo$3#3f;hN>^NN;hI6-Y!K4Ad1s3B zo{DSAS@8moQzVyFTnrkPM8@EzNj8lr0Weh{TbZptbs}U_Oc-Fr491gETD|~FV))pp z^Fv3^91jnlJNwKj`W+g6>No^w^4KU2eN+@H9~OBObpz#|#gEd(#;{`GDk`O6&}O#@ zt{{~caieQkN|60Ul)s8xKg3^(z(f193Y$+E@}V=i(3yh0{f=eTWZZ1cntLDl1W#?= zy(Q<~lC^L7STH(vKJeAw_TKX5eZ4tfZ{Bwx=R1&fANbgWPY-=NvhE!#HnYXbNf?0Z z7pXIJt^O}072>QBqO1u{C$s95vp~3oKr1mv6P!$T?qm`hMF$*DPRR-9j~&h^H=H>x z=6q^`qsKkf!fGf>;97zUh&i8pa4p#-uj+iNg`3I4oKJOt$?t4Zo#f~Da2>ICy7P$~ zM*%pWn&I>#x6zsGs$OT9oEfDz+C*56t9{l;?iNm^ldpu*OVd zVQ40lQfY)6GC7ZbmYXtCu>OqFG79b(cw&ntNt2$;Ran?`;j~e$y}?-t_94~50oxJ$ zOR(6Ijtu#sJSUbmYf%YwRn&S85-(fS&z(OWK0i7%a%||#xelBKiD|e;B(coWUm?$A3{~YB`2tpi zW^ktaQiC&NSaV(LPKPel{4o_I0s(^9+Ph$W&)&p&6B$7`Z_-H+-Vof~_BRgnpgdOmpOTZDO2#&jMoppg?`M_$r^ zFyt_nz@VC*{W*0L1}EYJijCY6aQG;uxVT#;i@Rcltra{#n)Brm z%44TuVb}_Ee-gI>kXd1_cm3fedIh%%bAtRS@~(0o^q{WyeqG;!?Sa4L_JLam9=N?X zZQnkVcX#C69rxXxZ|#0$G6tMKbNUxU-`;X_{HDBOF$P)}?4+4DEqM7%l{>o1e6WwX zk{AR(npM#&GHJ|hUcXnBo~^$i5Hv}Q_O_6aT*KNpQer$RH}I8N;Am~I)%8bUj96WG zI--?Dg53CEuu+B!K!n`$Oda*WRttKNu(ou7QWfs?=HDlHoF5c^FF=I~a zj8pqAn?&QhEnyT*^ES~eT4L6?ZQjnLC_B*9n6Zi0j5TU_)AokSuIC*Y%XQ;v;Twju z#wvs9`euT61AQ_%#qW?Yh=ARt~Q#0=Hx-m~BqY3V8 zCh-H#XHa@Zo0T@IUX@hDux3I}g_)f^%`3LADTjy&A3b7ADC z_B}so-coSaqFA9dRH)y0=lN`4&m)JW#ko+k;uC!Sg%`5+_JtQ7z`%KR@#?FY#mv&N z?~lATa(DQ?ciV#bLs$2mbIYz>tX?9DcShgp$?rIt+i^7O?pSsYKX7;C-3Q-wAAHA< za}T}eK3MQ>fN67d>9=6o%-T0|3fW6Tw+TrZwIK{;)TSMa_GQP*&QgAsDG|Lj{77*; z)~QX#AQ}h((<-im8>v*PN=_DowO&p@V&CVjzlJCe8d!N{tiW}m>J`_UCar}x98fgO zF~qLT!)6UF&{pKfH=GNO1>iI+u=aU-#tzK3a*a=x0Z^Vrat{sz?nvQI2t;R*v3$I= zLK<9hU9Sd_{I2mfs#%13`dwk(uxOHv8N+$#8hi-Bw2f*NL>xnAc?ds>n^%FT7;sO- zfT_}D5vkaBc)Xb-L~Vo0#I9fGuJ?v=Yzbcl&P+@4+~M`nf}9lt2V+TiK~smJxRzp@ zR^EkWkm^Dse{o%Szjeo3$Fi;a7S4QR7knEEjon#a_ru2SJ43f#$ohKOA`Cg@7m!=t zM+wOr@|P(2G7`l%tX46ky4kp=;R1bYsUSaEbHWqXvY*w*N@3XJ}Qx!-~~nX(DG3e#jYwACOA!Lo!*qnGcD6 z$%jP0rS`paE zgE_6xrybBwc`#?gV}w_Fpjlh126I}~U{3R61amh2TEU#=VlXFyQv*|kjTINjasd5o 
zK(J?%;?)pU)3Q%KzLwn|gq%XgWhmCW`;qLSWUs2l+BI6sVwwIwRRm=TgC|c?5=T;@ z-4LqJwN;EDBd?mK)`1l#C#g8Gv5JVoGDS<*i+{)#X}QXtF+qXHdwyk1vKMSmNK+Yh z6=ipq&>|!ZtTd4o{?#ZOW`(qF*sE&Jt0n^oQ$*k@F43S#0Q2Ueu2W4`$e2ZAQC@&q z&|K064Jk{Yi{^uPD#@d3PsL1AkKfLsmSg#xb|R{~teuvUHdJe;mD)Kgglr>9jke0U zA#oH|1G{t$u>j%ZWuB;8-4cF=(&;N z_mG3H6slV>#iWE%L*cxttrB5T@{9Pw`SLd?kDVNIcn^g}_2p@C7v-wGe4EPs9+Jwq z3&yPGbW55_4c0`S_F@sH{sGGC`uQt0g@(2T^DD0R4-73u@7BDz^X`>54&3d}weNig zta;hq`o62F;I4b*6k6K8-}qYN_uF1;yCdd%_vL!`We+@kzxm8U%>%phRo9~H=E&XA z_v||g+YY~CeAT(=%=-It_N@;Zn{Qveb@g`UR_4yJe9zuo&)&D&?>9cR;MR(rxx4E< z`?f-8w^nT9yLQeDepR!tAB5JafS<5J5LqL9{M4Fz6^y73uU)qj`g9r*Qcjfg66b%E3f+d)#@W z4r;n?7MN;6r4?5-tprthIt?6bx?1YPi)Zg%AeZ{v9TKD&+SE1#-r6yY)|!4i^3G?EEShF z+Qrgo5}ny4dbGFl8`WW!tchx?i$w2=Yf7KD;s??CX0uk4_Pg#37S}V0^A*?B=tJW#OtJ)G@-X z1E5*t8Fs2_GxXF^rDoz9w^eZ_ctWYyU&vBC3Bx$|40lsmqvAes?Ic;Ym~&YvD)4$C zvcmt;1i5u?%UbnVbu$l?0vRBczD%tk>5EL!t8alY{;r7`Ctx*+!X)v!;kyR-(T!91 zTzQd)p_(nany=%AWzka27r$t$=BCvqAoVJ0s3JSP_}kUr9}~Wo76eFhI%CP0tFaax zzTuVT>Ny^_U%y7546mI3o7H&(yDu}aF`HS+ReQVQ(`}?`ygG%9k`o7!pRVy_m;%8m zPE>KiqKnN@6l5mL znn);S%B(o35;9UxDxRbqy#&u*EKJLdhZ&LF&P=k`HG`CEGCBpPAIme>Urql2z@qu7 zBsVD$sUHF0FJN=py!Y)ucJov3G~_lvmGy0U*uFd8ej?X?;-;(6wkhAXE!Va!-*zz9 zcJQX-VN++msXy1$e|I#$^+;~(k>#eLLQ{9XX)xC`m~A=$O}(=}AKbef+`D9c(6Q;R zIp4QG*SCMU<3ORUr_kDwZ{3<}-I{ORlWX0RZ#|l8J(_Pln`=E=Xba}s26JtLpO}no z6pR|$%1h?^hI4(x`Mz_xzH>{b3T>T*#(}JFV8t(VJ^{-IYV{2;rHZh4Pj$rnuPR~| z1F;`_fmTs`%2Aup_{ARqp>(whqpDu>W-Qez2v{v<;)V^#?A1yZe-|B7Ra6fh1v99D z?3@+Xi__KL1H~giaZ6fv5@jsaTthm1Lh-6Os`U#lGi$ZJ{Tlr;yz*RiST|r~P8fi^ zrfL%F4LhMWM`};eE-L>3;{zP$#!)a5_+StND&NIx`5qF%FU@&`y1 zKNaB?WX4{?6q)2KqO@Y=hGlLO{X;7E7L_v-1(E+LJtrs$AYndo)xV-+ax=$GgvC|1 zac=)(;&rvMgyjnA1eQN&ZvTGAYaRLKow??n`Q~G}=40=O+2-f|Y$DhE+)eAl&D((b z1^?!(f6v3Nefh4_xvtYUpDqNq_kzDY|%~KECHgbyaRv^FqSZ@2V<+kID zO6<(F?aa143C!KNE#Grsx#z&ru?O8-?jFncAI|k3UhaOf5bP^-Z_0P?&UNq3cOTAm zA6`0E=;+RO4CXop^Bwzh9sA$jl|L|&J23KI$2nL4w{Fk(AI$Y1yyJL3xN)Uc*mw{K zAKaVoIGyV_ee#LqJoX)qHD6arN@QL(>*SwM3lj`tJ5*>-N3Xo2%Qm 
zF#NEwWoi4ZD+^B*{0$4I*LEI-xGX}BBy5!~3CFx=EOxInz(etc+0IA^PO2edZu{ph zxymBHM@=S2IP3v6f`^BSt_?^eiH@QJR!Mv=b&_ckqKdos3_PNdriEL~h+Zi*VHWB~ z0o{nyx-vKT`Ryz#@cYzC7a##0{5|*my;(Om56o>_U-+v&N59;?eu*-1vAoG}cKL?R zjSkJElOQJ|6HDbkMvH!VR)xp*W>J*VKsb_2ha+mv0qus%w*!?x2FmN1+xxi9tC)Ml zH2*%D1F^fF$HM6?;1%OToBBbOx62y_ncKr?t47G=D*!k5-`Or<3Q}kLy3}}kzKMYC zF{j z^(D^2EEV^&e3vobC)V)U!3=yLzDdxiyGmV|pv*RT$^VSL{%cD94JGeTLO7;aAiIxW zW=)m9MQ_<)71{G2;a!SI(*>137k`f6Dv5+>T!-@PBMS2eAanntY5>Fs_I(f6i-7>c z>)*CelWp9Tv-cHt?4{d*oPF@aVr+y{sI9;4x#d}s?)!QcETFUu&#V=74ALQNIux_(ShKZ@%wn zuJ35R?|csb-5Vcx>hhlUoTvTH#&SSv(ZGQTFxeLZGYo^%L-tFB(vduMy*t4eeX@?$>L15NIwmT`Dy7B7c1Y zpw-q@LOWXs^c10$e7$4lV$J>Fo@Mu5gsS*9-I-Y~=>mY?K6mR}KCn3#*qje+%LTT* zGm{N$yB|1STzLB*9Ov4mZsMZ`|QwdTwcl?jG4@g{{hM0V&mij;5}};noHWXZD@JhFmKln0b=yT+*yr7mhC|d za#3g?_zdJUM8|VUizFr{X_cId#5SU8qSb)>&jA&(XZx@k-=xXdh}lGhWjggm*yB#d z<%P=eAYw%bwR#W+|KYM4K6t;paXu`)V4JG_nZr- zC^~H4{nkSZXMW;ss~#H`sEiH!FVrv+&PrrN!f*LMQ$muSoTVg3$(Jbk3rYy%6%QLTt!wOov2T!e@bKazp?i5#SanL}8!2+0&DeC620 z#l$m$gcHKFnN|q*h+`l3~(poVZ>h~eGtF{hr9@&Mu#(Zs8uD0ur^lt5z z4}4$vN!$Lnci(Rty6Grv-Iw2bGPm`l2JqP1A%GmH%lS3~>fPG);eLAD0Ab>jF+)XM zomGviYtPy@d^$^cG=byawG4M1Z!!I-#d5scR<^&5u$KN-Zz3thSp=P6^5~NLn1>x3 z8L^F#7Iei`)u-&J$b?6YwVfBzF70bI8)h+I$Q=0;*y+?3A3GL`??VPXqdh|WgdTOD zy{ZX87iF4Iun*^SU=I-s4oMYH%u0d?va^PQkzis5r{k#kLpTWr2Ue-O@aZr;91>Ji ze)-mQ92Wo@$OEpZS(`ie&Mn%~!@n}Nt5-W6w z3I+RU4-}p75t)FB$XgR{^ImJbU_AjXIsxBd@DKv?df|T4q3oftcf5~TuE)e5{;;P?{#?mECU)LKJr(w zp}t;XFV(jA)c5XMb>d=3*>59vph+6pp^($BNO2t{3>5ydn%FxGuUO10HiQ~R4n}2PnNg}s{pxn+0@YXDHpAJZPBZz zpAkPQ6`mNEn(L^f>=j_2Cf>OhMM2|(g1h_9Zfqv_vAg%7zv=d&TZi)g-kiTT@86&E z@4xRqm<4I|Z^-*YIe+NBe@oWAh08Hp0H7rLw{iXGU!p31OEJGc11P0o=py!oVdwHN zADy$21P&IFIg2hgiOS!jx+!TQ&en&7qJrtr7&|snP*m{xF~GH!JuC*8eklf+ekleR zzbo6c7~rEO%Wgdgm>$3Zv?yTqSRMw&%Wl7Fthe;vX~}mU&2=7qB;bM1G7VkXmPukI zjP=O4DP=v?v|D*oaUVuChLeFlN`hxet}1RqNe^JK2`tZJ4HzT^Ld$HgTG{Z2K^qs${HKa003%uOB1H9XVDsK)Yyo>5r*Zz!L2}juEIfMfz?(Rd^FS(7MYYxVm?UqT&J@~LDPIV(*&IL-lp$Qpu(m|>dpLG-Lz`OQ= 
zw>oq7o$uKPh?vyh6c>IgYv;$gF&ZDLsU+`Y5nF%L#&-Jp1SJ#_RdTRb1$kbjuax{I zHR>8}UJ~fghhwlGs+%lamR6m<;=Y8u(o;Egr8h<5N>4@FN>9~E>mx6|adJmZ32l^# z&B%XFNtBWokJeH+aL!b7sq+0h#V@`^z1a~4d1jv%Rupf0HxE|*gOzV!lKU9 zNTxjAqOW!?5MW=?E_!sZlUnJF2boc+CiYw^E2(F7^CiH}uf=3COe5Z^G_V3pju>E| z!|a|(AN<%f=E@+|obAcx8*A7c?WA{KrR3|Be2)@Fx(J2Y%v55${UARd?y&s7k%bm) zihTJiCWFDSY84E1KNeg+7HWPhI6e@1J`jR`DRg`wbo`az{;|;cfl&Wbq3HwRi4TOX z4}|@nTEA#89R5h4^wYGl+fetBfMn&2!DH|*iFbCs7F`u^yArH1SPL6Ncf0SV?)LrQ z>nkQaKeRYjE%<%Z^>u@xHyhmkNT8cXM@>&z4E@=LO^*b+`S=S?o5A^Uz+gD=$X;V; O&H8&j5%A0A`~LvqrPJ;J literal 0 HcmV?d00001 diff --git a/compilation/__pycache__/counter.cpython-312.pyc b/compilation/__pycache__/counter.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3fe05201e0b915177b24346260e0be185639ddd6 GIT binary patch literal 2076 zcmZuyO=ufO6rR}~{rE?cE%_%--Fj`=sns}fous5qLrX|XpbhRJEd|lCUGI+UwO6~! z>?(NZb8rZSLVL)uJr|0C4mFz;1ihp;86Em_>YLH7tk4YXoB7^* z^XAQaZ{GfrNOU6@7P(~>q6q!LK{z78qWL8(wvd4gY@vK>#(4}_vZSJ%mocJvIUa7? 
zpCxo28S-^xkVjIB`zR3RBNC#KHj1v|T;xwKo6liCsXMkuuX;tz))pz#gFRxB1JL{x z7F&oS?nhqw6nx9el+cJFodyY@s6cXnVgiu>#RZB4s7s(|fVu^W1*k`$cz}8Z>IzUo zpzZ)A1?mYD#7uBL;P|Htp!bA zp|+vw`+m6o7?&)T$|@_{Hf3td098F^E<%1>e~2sSWkVCEhH}FnPwhTWxUXya5>=@UhN-KTONQoA!ygsaTbm74W1d;ibPukrT?Xlh>pA+03J2&* zp*ff#b~+Afg-^V0_}82PvW2|1Y&ob*V7JFQdKAbnq8y&jNj}k?(yAXfC@t|76e)43 zRdBhQ^6&ezUa>S=GD}B1jV)5|$kp=6civt(NoOr{VYamDEjjk=Wy>ngcElCdBEd(& za74Epo30)BfBKIK$v7P2_5;~O&r+G&8#gy9pVU$l)%Zk{JN?2<`FlPLU-LPTj}V62 zuA}AleSsN_u=X}21Kq`27^8K(9KNqm3Fc7m9QxusLZ3stHslTBh1~(Hugl^4?%+!M z+%@oNovchF#PHYI4?+JPywI2Hf<)?&bl3)oyiV?+`?1iYYnY%_^qqVK=g9opm|{Dg zqPYsaTB15s&Wy4^3l5`-=_#hGtR-g@tpHCd#F?44s$!Uh0$0k(Y#8qBM@)|vT|Wld z)I5*nB5aVeqbpY+!>%tomf_`r}RX;paCE0D*HB+C+R>{H2*&4~#$0w>}q;kAQChD(^Rmo6A zu92}$W7D}RQMSQYu7leoIFfBdQFLHWibs-7Ib5|o7r~6N40!AJKv&arOLJXlh26og z4*G<+uiw>{ic9*Lj*ii4Yw$~Wod=>F50Z*mj28VkWFdEL)hTeFBGQ#+OYQr9S literal 0 HcmV?d00001 diff --git a/compilation/__pycache__/cuda_graph.cpython-312.pyc b/compilation/__pycache__/cuda_graph.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b160874388c72b7e8d026c1e97b99faede5f73e3 GIT binary patch literal 9053 zcmcgSZEPDycC-8zNr{q3Nwz+$rDRE_B~h_rEB=ga*|IH{k%P0GG(^)iwY!p*UVhx} z(y{~!RRdok`r;&9ttyZO}vKUdsDu+kJh_@N`O!2Ttl4lKgb8aK&i4eF+g!+$cM*T;X z=6Lt-!Ds-C#+NYmQffhH3z=>4o_QwH^c$o(5n*)CnS?ASreqP|nz0*_c3DgC*PviO zot}q}Gn>%(Spb53CY6=sgeGOu9G}UhHBo`;z1I`6BqTJE<8wk{MoDC6IX;oqa*C9m z;lwnO79uvi_RN*Dr_W>a8yP{=y>H2K>P#j*EzJOt236EJ12>z=$ee^2fl(a!tf=ye zl+`kd9-M(*(-~zhp$ISon7|DUkbP2G(l|LYGeaj-&*v0HOlw>g<_f)2(4~G(OkCp> zahlV_w3<;=s0)tc3R0pV!Q>~6YQDm0`mt9E8}9*dkBB7B2qbR1LPWdh5SZgoE7;;r z(Uq*L^xX7G)^93QK*6@)CIoQp72F!pYt=a_bw1HA*6s!jtHp^e^R|dfud{eKme!Pc zy?zQg#0eIQCiEI>7j#!9r)6`R9+1-T;Sz$Nh^i{8kviSOap^=#GpVb{WmR@5`N@<2h1rM##g54alq-wfd zN^3tO8&E-hH#(%HvO`%ZJE$cT*v-Kkxz~;yzV@0pBui66*?Da?lODPbk}8gNiU;Ti5QtBwQZ-5pEk zmd~v0y>)Tb(Xs9!k-?=)E0g!T@4oXORBY;5a||e5CfS{3;4n0AmH~l@gA|BP!AAGZ z9(U1_v5QRHO-shkxgu`8-eS_FEFK_rQm+@psoV_8ZRM1l)4el1CrBzD89iW$tg(#~ 
z5s$J9=ZLbQ;M$d5$aKdP92lhy%YB&b!z_#$T~RkepjV9y0Tmm8WgX#yBeWu{I>P0a z_N9v}j#Wo{xt&GEtT|W%8~bnJ2i4V3x$zccRem5hYKTxL;K@L)z^M>iqT7(+X^-H( z>5O{>k61Hp6KZZc;$FckIso=XPqN!%~gLS74U9@xaKw?ScKMI~_rqzYk`O|VKX zO|=TEWmphu7Fo+u<-uBthfSmfRuMr3%!3+Y6H}5bY4cI`bXxGTkg`hVx+I8fLems! zDyIR&^Kd$lrZy|GxwIfEGR3`Wj(3&4E+yEj90!7ncpP^%>ZQ~s0b?$sT$`3Nb8KcB zC`c+hXH1=y(v+fgE?L-Ns#9@d&@Z8~(-|2s)uWWy0hpx*Sdu1zW?i5sZ@^f#CZ$9c zj;m!Mg>7e zd$eighP;aVq-hp-$y$pMZ-C|;V0lFZ+eC>a*t9suf>~4MDP#Er59B2+LODVOoi)i& zY}FDx>2cMIQBdxRAC1U z0=5Omq6a5YMZhgK4q%jh8>C28b0EA@Dl3aAAdJFqi34n|kV%UyXgU>7rLlP?^P=qJ zED(lHQzDmu|K zWl$y+F;Egb)j@EMU?=!%m8Ba)q6Ra}G7_8;@${Iy7pMuJ z71QinvDh12?9%!3;}hq(*w|ZRG4{j>w(IQJ=#}$bhDcB79#hzLA6i#*p1|?a{hNha zZ?H}zmobbK4_ens=#1Low=qne@jR0!oYk1OL8;={5|Xz=3H#HOwU~1q(+l~ zv>awhGmQ*Mq7Sky3ASrs;-EjcQbx*?q=jqN5u#ZRShBHNYeLsv14C%LNZw;K^k`^G zqWiKfnXWgCXU!$=Sn_IUaA`7oEIRTI=w-Qr63#JT_L?UV`@}ijNw+0ZuM9wgGKg6e zGQDw=B$=DjI4vov#=#bG(_jtMSy9j(V*0wOH@>{*dV{2b9|#V`42Tz1+)52ncA(Z& za2|RSy3aFmW-1|X^0mR~03|TXq2Fyd3Paer6+{Ho&la~HwT}e;qf>XNqCCBA5enj| zC^2YYS*07vfTKyhb8ca`%hqU#VT2B>QPp!6&!Z3lSzV?D|?_>6rRd%@G?r{c{{@%WqiDpSToDt^7 za4Fas=f~u)93NHY@E!xQw%|Q&xi$;KR99{_3#LXuMQ9y-Q{^6IBM#j?n^4iuq5G`6 zD_)OYXAvG#l#CLwDaT-e$_dC+T#dQ)zD?RozfNwLM=z9@(TdAND!l0QejEW84df=lxKF* zcS%c9%zlKJy6G&rW4mML?GoW_?=}$fM?inkwrJ1WG*l8azY&9suv)_lW{yAG9O-}{$|FD6Q{tHs#Wwb;~JJ70`VeU|)`|NPMX%WGZ3 zpB!54I#TR7Qix4GYUiKZnQoCO_&c8ldkg;F$AQiVuGK(a!PEEK4JYXk7`mXpZ9d)U z-6E*Co2a!n{tPtDX6Z2Z6n4+PllFjpvhO4xi_W}Lv+Quv1GKj#w5w_$?}S5PS>S3H zw3tex4Uddv39GGn2JD^fL*~z!%LMJ-xB68M=e@~S zt7Q|?4w;Z(7aTWp+|M(bVs$XhNgk0 zBu0U@C6gyr-0$G^$!!Fnn6KG+J|DYHo*V+3_T8s)(f2L8a_Z$Vl?Ld0=H>dl&s@o( zPr$&twJ$Wxy`{V}g+H+-dj*A|bGXSZt17LLi7>KAye2IL1mQq`2rs3)+dbv?CmDT0} z8E@jyX-oRw%}aR`kp53o`fEU-W`2nj-khX!0vE}FH?H5HU%4D&`B0f=-Eo`d9H`hF*`}P`#9Lvk-xVfVh)Z8S!mA1IB9u`bm0o zItMY0ZO>1lpr9hx;TnKp;7Bd?GIa+$JgBQPFjR^hEk=$$3LIUp zC!uiV7PYq!Jn*o$7(7t$A9(6(dvLJSJz9i6-{`Yb6Xo!M=XTp^rf&Ic$=_e}_ZJ3^ z7yT#T3f<6Dsvj!W4;5ZHRjfa~?0o90zjN}|$)c~T90-;IJ;gvzDS#L2=U&p#a?gL) 
zUk;6wL%T|$o?@t{6xv@5?JtLR8^Ck7qoMJ+pVS^$^ACURB7OZEc3WrNzxD2a(C~5c z(}uqv`-j+Pv9F@zE6hFbUGIRIa@DfM0`_Fh=+y|@6x_V3D3&rq-pEs|D-@Na7(L|chGcUrVv19!>X=h7qqs6w- z&qr3oz9XU+#aU)PJhjf2y$mG+eGnnU-?P{z7yA zy3^J$xM7EG1?cW4HFb9aw*n>4o}y>Znuje1LU;1F@&!-ZZ@(Kfc--3p@TP*a-PQH;*2P4cmVnI2!A9eo^BA_!r^M*z1m8GL4Xb z+3SoA)qL6ShWamuov~wXYAGUh+g`zRH^&JX9-b!cFo1%7x}n^95JI<1x2vFyMzM&# zg+i6bG(uYqj?#~Hm<_;B6(GAwzP2?xJ?lrw$kDHT?aoQ&X=CfU69M34GrR7lpog^V zU9X{_m$VN&6pC%b8?^iSaktZZU)><^S#PvCJMMcn2zJp68hz!tSBfYv0 zwI?q_BwGaKR576VUKmYG@pOf{xAMLMFQ!zd>Mm_Q3qe2n@T%A5;Nb}F$yzj(0Zm19 zE7i|682F81{7^XvE7Nz;@}{^Wm}~l`uTYixa&syJlrsv|`$iZUFffd4up1e#k5HoU zz*Tuw6QY86UE<+?12U-;%#2R}1r=U#(>yP8V8Ow7BTZENdWa%#SIBtTr1q>breTT{ zjI-)>6vKSyR}ItAOe=GmB&$(s_&EyGP=j7e8&&c9Injzo&{x$fNUz4?zF`Kb2vMon zDk-|>IK2A6qxtK~Jm8@XSAPf@grOK_!$X+h6SDUSVV{uZC#3V&#Q$Gp*AsI12^s$t zIq-x`{)&t}A>B_%{}b}+6EgT8HKCP1T&>x&_R1Bu;;l-m?hhGoNW7`5 z`+R@hJu`?g*ty(YgMR&+-}~|XeShC)|F_c85)Mz)_Rk}`PjTE|(GT+}Ga?WF+RAa) zIFS?iC^w|P`5~VDwhUR=+d5=rZ`+U!Z%fo3a|}6nDr=28W5SRSa}BxJZ(Fn^<{omh zI3M*4d04&DSlLh+OS4D4vGSpE7I#Ef#43g=jIzF%f5^|`&S+(3L2`+_Sp6O^)<_Pi{v1y^RA1yLoCy6$?Lumoo{AP~ z-*kLzbTzaQwdo@k-rTer}-tcB-|;oJ7$5LYhiA-{r6 zg*h%xT#NOie}VU`Y@PLW`#&AgH`)!Ui>-Evxav*k&_?W44|cT~yShbe7hCY|b8#49 z2OHPdkLY=;^L1nC8ZHrA-?R^HmAFya0EvQ^*e2CuJgW=VYx~!fT=OQEgNPR{^X=>Y zhT0iy=QW`}6pe<Bc~l!D!;#IW`2}AsS0e5+k9cbW(~d30Xm@YFSbe(F;;AadtF#_HyvM0rD$K`m~z&~t*?FfO7xrb{JT;r0tycCx-unu96kEskE zo=b5jxOPh@h|H1zp1lE43MXXRjzBUI&_@z5X7c1f^Cel$L_CljkpkgpNKv{2Cr2Vm z0RP7oX?Q#uxHuxk1EIiJh;~3P6Br38tYYYS&36XQjwb^brGS`-uR$b)CXxwO*I2WB z&w(>onltzvN(LhF5lN0D6_#7rbvLWahM0&)FB`K|0(xuM5msY3AzuWH1royr&0~tJ za(=v6Br4|Cqv&FH78JY~iADoTlESlC za2;&KC%b0bKurb`aVdb!i~*vs&}U-_aXc#NE6-MGBy@o?7^p5AUw)Ft3_H!<0yG9T z)p#OFqtF3G3}DeP(u7jjm$BT0C~(|D$)v1R9C~r^*zx_r{$od9K6GS%@bLcce8kH& z07->FHt^NH^~u=S`mxAZ7Y++fZ`Y;qzHM92_etxck+bW^E~Ah5dOFGL_4%`Nzdko$ zrF-nM<~PpK6UR$#Li_F1Q5AwK+@k3S-b(dXv=^=8s;^(RrInO^= zbfzZuvi0d7k;3N#kwO?!MiS%L@8zpTNTA!;O?}A@(2`;Dv}AGvdX^gz=qn_zL{w`C z>W!HwhOi`Xb&af7t(;-SNH~c74T_PN 
z^6XyaHE2L4wARkc>+mY*ob|$j!&In)f&c=oA{LU*2gl=QL*et1_-%XXLK8)MIZhqq zddZF3M6ge+R>8O}Y}_(ymdXYKY!maN{|Ctl%V8tmY>1kr zo8bzV?dW1F({9a=Mc?wmt)|A=B%+AC#lK~F%L)pgPvV#Uh?abO*fzyaS*I++R?+&V zXVRA9exoi)wb_GvhgxHgN%tz2m$>iq7A}Rd9~=Fiz%QH{Td3Jp{3Z&CqKTy#k|~a> zV-gSpL)jtYCejh|DJ67H3T)g?_fbHM$Wk~7ybtUc%HLwZX{Zd841gSu;hs{q>y^)( zIin*ScA)nZ1$36BBi*ByiT;SLjmNtY0kUo1&T{E`6{TSsJ>4s76)8I08Mw3^*9M(@ zou=pvT;3iyHb(SMDEcdfIj^^G4|I3yhyf!ioYy9OUPY$!kDANcc#=ss%7%3T&FXQsT%^wSmA=%i*0MTP00ut;BHCkX30M-E^ zho#0XsUv8!4iPW_6r$TAVKkpB7Y@YxxG>?A1cKq!vemfdu0_{Nye7yxx9 zphG{%rI3QU20=EQh(;3^iKqeag~p@FfJ_8{0oz^X)nv|`EvViP+5kwMDFnhZm|YH- z&lHDK%z}uT;fRbS$SobC7_fA?+Ju_*9*kwyLHipFs0bxq*k`VXeBW3pa4Y&k>ZCiC znEj;vF{%1MQqRjZzz8kuc9Eg7G6Um8zGFS(NkB-aPUV754TY0n1At(tVtxs9DAW7s z)H}g;i_(}B7l}?|1G*53j)P#sfR|w*wlF$)m|$l@S75CHmT~DEz~h3{Zjg%!0Fg45 zh>L9dO(@RI&j28^Cv%I6&6ki@!3c8T$~YC!p3C!#@6k8dgxuha&3OKx5_A+VMiguS zmWS<`9Kuf=mjF&tLN?Cc;IWhYw-dM{JvU_>w%lBj0fRCNMK6XfD|F7sz|0uOfF00) z$hMpfl7KuK8J1}8P5c4sT8efCo*WW3$DoqKp)s&*dV}UBvbN|p8`qC^;2e0`5o63} za)g4wMS4AlWvfaj8-^T2GOz?nR+ixYZw6xSOa-AEjbV!9&hLuVJM_}N4vDq$6L zSc?EW!T=jUGT1bR*MKVxLX^QIn-$?TOhe!5W$Yam^>Sj|F+MECTj`6??L$Gu?sZoGYe=6Md0U z?9s@13C*w}N^zjwFn-cLu&xr|Pq7=c)ykL@j)bC-2>?bsuwng1eW__j*n);5z=aWPvvS-v`#`5nJH%jd+-0LrL+^=GUlU5Lswoy~STu>!hZvGgpFk|`mvvWlz08J7n#GZUi z&l$E|0I~m0zi3TaQq~i=AJ65xW@AD}ORtqI2tto5qmA=4ZE)@l>I8#=% zB2ZhuW6QUM8Mc@`rMQ%3+Bxm`*dW+Jj=Kgmr$JpzbmdQuPUjlPlF1K3CRJv62@r@+kQj<)k0hm-qLt){IL*#TE6t&k{xU(g=FlaMnq8F6j-S(P zL`Z73^U`HSvqq&j#F|7P=>1j&i}E7v0z(g)6{FE=O8nFX((Aas4`D zN=KY7^T3rWS9BSJ9}=}-PO=glk)n{ePV^M*x_d`75e`L_UEReh{T|v@b^zV{9fT1M zuDp6inLV`NUAV`uEGOmt|@?)tb&vYr3!6X84(nS)c#fj;lKs{k7MQTs>mEZuF}DHH+;3 zT&wEuUUJ!6D;{umf5noA>)o98cg=VYLIzv2M@ z&}#LSEpgV;vWF#H#k#4zS#SBQmBxG*^Q{VGtD0``vr4-8z#Zj|l=dHaXth*VEOC|< z6-#zhcvQlbug;>I6&>l$U3Xl_E-yj0tHk8r2{UOWxFjG-55>chW+NV<-L78^Bq3oN z%D8*^Agw%M_W)SOGJKOSJ za>3i07FyHZ)}MM?ZuHN4*QSNF_uWltTN4}F6BiP7X~o~eICh<62qG8dCi(pJGn$hQ zpp@~LD3h*}Q4)tNdw}waTtuVpWySE*Y{^7;karhvksI|qUHku}#iZA457%by+A`BP 
zuGj@;j`AD#TbXaqjF~eg3FII{!tHL&MuJj?xkey8)f~!5Xw&8`nqz!Sgfc^O2ZM%+ zCK%KlQalWjL-Pc4x(uXRp{~({AXE~fgr8;MM3C`!(B??-gytHNE{TzIl9FtpA_*1@z;0pIMoA>jI+H2icyXPyK zuN=^^Z)B>w)atJJ>h%k@4Y{n} zRAx85|HAbbGWDC)`pxszTQb!<)#{z|)w>sLd$R7Tj60yZ1M}|Yw5?eOVuMn`XgH2& zAqK=mO)((0M7bD0X^B}Ttvm;M2XsI&iV3&$2ySV6G6$~3sG^j;81NCfG3m&IQ86G& zIZTk5bmCAHp@~Lw^bSo#e@3g#SpJ=Inl0go*hfhazwCjq&dI1EoLGqNtf?FoF)84; zrGO5yeIq&uYe~x$>8qe$%&UvB*uwWrz%V*5*`V~&w&L1@?<1D7`YIH6*|wusO;{*ap=KFAxC@> zv3d+*yW7h(XE+odL8|;K7?S*}2sCem^qP<`E36}2)e7!wC#sOBMRVv8c?*>#f&MV* z`6>#!qoIU-uU!T*KW0I_k7ph`SyMDRsC0vWt}y%$L>1U zEtb_~J+I+f_3GE^l9dxW^|N1iCFL%#62Pl23V+M-mJ^&6rldbO`-RxTNWH+x7E-Pj z&&}5!<|j}i#RZM?K=ERi;@i$-?kWV=k;5p3%3RSRT1DG?7SW!}abKc^nTt4*xeE>g zd+U@F1^~`C%ZSrL-?{Naly5^0?NqY)BNU)%HTrMROuapoJEYlbN zqowrJ)`39{M_eAodT5R@SsIR9Vn1XBDLJx&#!9 zp)sAqdXqMh2hUlA>%~6WQ+&!Q@`U|F%QH%WZxk(5%EC=MQ`|5g0iOJT{}()PDvuS+ z!PT@w zY97j|Lr(6V07MqK5zH~8G;YC^-9u2E%w>;goj@yL zOE5=55uY8*5m9-Lb+mO^Rs77Wlz8YW+wFsS5Mp=8fp(uvgJj59#8cD(zCxaDpLQ6J3VlVH1`*+?mrAF<{*4^`1QMINAnWri@&sLm*R znA}7~R$n2MK*xw!H(jxqLdwM2;!I+mQ!~4VMINVk5&Rw400-x(PPzO(X zcEnp{@MXe$Svz<#cg<}7`-9g9Rd+|Wb^X+VnVqT!JeqGtx%R!Q-&5Uf*`_w6ZdE-E zKNZSn{HhR`t-31&?$-maF9gq#j?@M5?N)Pi7I9qA+=f387AF(12?D$12W#`<$9}a#r zIPZPo0a&$WH0W^;CQ=)MtcscZGFx-&KVT_k?xX zn#L*HJEe=lnvAdkNw@s>gf04|kVDT{Q-Tc=-aPyON1D)29`zKv3W_82V#vlkQj=C9 zv5IjyDefGf;(wR_eFu<>g^&vqmeGioJ?3%dx#^N;3*Kh<*Vh=LH~XZcxae(k#j@t{ zGiA?>*O_u&;ooUX5|zfDlp`--GsTXwHRU+R$N7{4gl)d0vp~pJ49ysqlRS`e3~=uX zVaqunhu2YZQb-9Re;)a=H{~pbZc+kWuIN#?YIȏ-E0DCjMRh>StwHJTI3<JvQOi)g(;)tnuCL)dlzFC5%XybK1^AiKLa0mDpv73YnT&k+&gVbD3>{ zodM?~AEHDn)UNUiRDB2iv_TC7`GjF7sdqbBBK!b#jaKs zmTclP8UB2Z&GPryTMWCfOOEZDZYGCtx*%n;7m5nneuDy2k zwb^jGvggLY-O8T10}xJ5m1KpAjIc@-RxJoESrU+t6eLr2lLgR^;S&3vAAm8+`u2VEUelxYrA^k_P~s7(bqT|{$TTmJ8te+@NM|{ zOM~j!Tr|8nqlkJ_;7lh&Dmbp0`swI02CG~@4vgmq52 zHE_?rZLtzE+=aHS3zgfl&21m9ySXmY+^aVCe%icc*75UKPR%!+R$n>wNz1Jhw>SRL zt2Z1Z?wt$Y4Q#yAq6YS-UpX~vOE;ZftZ96|^m=KgW|La8>DHFpz4vPNJaBR?eIQBd z+mc~`et5w}|-<_}8IAeuK_v-G&w)MAce;i(D 
z+deyReaju&%)nyxsv9jIjDMKAnYvfq`}4u$^GzrIOuVDqYPrq-(Hh9Dg`2`$&+nGq zR@A`0Pehu=iQF_cs5KihHM`WBUAL7x1NUkUec|OAwt)Neto-{2wOqr?{NFz);R3JX z)xZT#@(K}}KibxK$m;l$j>bcR@RKc#he{neb_lr~K_{a}UIQKQ1O6@Ut%_^hG)E-u z@<&;jw*xN5W)_pTnzxi(0cNGB5->%q4x>g+a7m0of2Ntd*Hn1@Mir}Fgprwkil#XM z-;hZw{%i$oOpXCC+vm6$i!8v79&B&a6)Tyv6FZf%V~jbqUA}CNR&k^p#gut@0ao!j zn)S^$7^IxKuNWJXSH9)Nbfd&DEPKq=HQRj_oiOQ4t~RTWt~F!%_vpHO+>8~hV5gau ze}lDhnmaUUExvN2-TCrnY;=Pe%fBauZ&80!zMdJIbbb5Tf#tTR1W+H2l#8i04LWVo z`j2YMX;N{MHW!!NY>uq>yv!Zj%mEgNlVnfICJJUNo4Lu7Bwd5-d3Gyk!x%fL(445i ztcx_dFPA;35_1fbZjt<{IJ0Nwwe-aABv}Hp=MDGhZZn>Ln{fCwqcp|_M&CWUH($$) znYAf@nhWLxDPM~XI+JwuvS;+D8Oy)L5(t=ceSRCvf$Bl)-)n=BVgQ(m!*=*MKtOdXAzvHUwfLa{tmX5u<_W=W~Tp3!eQlBiiD z{}%nY`@F=7vo^nL<`mxxbLCiBl54WZ+-;WnOtGFEVP_#PKQ^0yh|#=*3(wy-KRJ^t zQnoAX6r`;B8+(=G07oz6I?6?WdspPW86jsVu0_3^tB{N2l}9`mul=^CZgR!*aNTqB zS742Db9-aO39g0P2=kwd;G(_*Y|ewSc<1IN>vy<|e36{rGjo#r%qg`(6Ex|0&Yc4u zfBBPN3HZ(AF(PqE%O25{vK7!29utyGh{jpWLk@99%O0pBJm2TPLzI>0dmha1&-K_v z{vwx~PmW~7d}`5ReniD^kJlUntr{c-|Fxi{!j= z|BhhJ=c%01ug$6W+MH@Vr@e9z`mnrk7fQOgu9bv;;?t@;UAH|91sWU$wH3_LCswg zJ$CYIt4-9OAm>9PpH!PUgVHwT37&#*BgyC<{woAb#Zr`?fZ3n5n9Q>%1>;x-lK465 zdOEGR3M3C}z-dsenZHkP%m=33p*e;jc!%hofS8H$WkTz+gg|pMiv<`^k#D9=vy+@X zsM)AId3uueHVDI|s62_%3XxwqF}mhso~+P|7L=RV|6DGgwHFNL1?@$e`2%A6i7*4( zpY_#{_0rTp79wg-1LWqq3_Ro6p?Y>?JbP5no;&``-dEJUuPk_u|J1i~c6{E~25Tre zfEEsD0aHIPHWpMa$Id2T=LY|<(3liF$Gl}HHb1Wf0E6_<%oZ+mAp}ocxP@t+f{8OX zR6)7~v$SC1{KQ5BU={KrTn@0XxP2O2bsw&sGF=Ny29(o>$mmEhbSI==@1YXS6wD&f zY*PG!qWMoBIdU|3>csva;5?^|A<9L*P62Jc<{+=&xX8>(WTGx*A_?Ua6cC9gpGKfr z5{hP{0cy60vtKEcR;g|;$QxC6wGOq2$rFc^25RSTm-H}>|~IWGBo`1E-D$O zAa_9IoAi@ZNb-jiOe4@7`n2WWqaS5^;e{B1%c^dCDAQrq9D0jdTP$?>tQ3TI72K@F z5Hn$?6-qMm?%+k;OkO$}Rpe|>_Njl3kXfSzj7otcZ2K#n+Ybjb4!PnEv z+3siORi2?ClYmcik#U9uPd{1n=bK_C9n|Lt0}>du8hBYC0}^I|^zW#)QydGjE#`+c z7wsTzU0mZO`9Dz)|Axv`GQniR0115}k|MuHc@DUbK?W#)K&5{R0rR{(lb0MCu|0-@ z?mwc)H&M{(yZI6!i+SLtub3mHsmS`E?ZfVy-I?Z{YV*#8=G|Fe(@ZZRk{kZCZ^K-b z>f4a^Y=CYIzMs9&x6KXAyEmrYJF~tuGrbw#8r8RE&X)G=noHjG?YebA_3cV~cHQ^a 
zOu4d6Eoq@)(e2H+Tj$-aOO}$-@-M5n(!Ko8SLHC#{Ho(qFW&d~-W{ACd~f$$?;q~? zXvcj0)&T&ui-5cV2hC z@44=Qv8Ar+%a*rf%WEDwZM9`nhhU@WugUmYR9{QR*Q5G+GQO>a3aw=dp~q&FO#uQ@bzXtAa3!@iq+B;CB_ zy4^S5(x3JOmMj&e4WIiV3C(owP&;?r^Y2{r*Q9H^=8_Bk-rM}={!O=fGd=xkPyd2{ zU@_44VcE^HOkh9cly^e&frB%L9y__p&5x~I^rXR zJHAkUB3oXYDeqFtyB5mVXV+|)u`QIhKJt?tY%<;0eaklQ-JJIBT`aG<=DzCA`uC*w zoql8`i%(pwDFFeYdx$)%()r`|i8F@9uni=VEnzrh3EOYT9n> z{GxAD#`l8id*Sv#+IRd;?_J;VPqwJO<7vKNz4%cyhvAptH z>D5ww^E2i9)$;u?mpyni?OBy>9#B2|AC+;X6=apWSl%`Fz4Y$mUs^4&!gWyU(dnZZ zPe7#(SF7}YcN(ki*}dr5c&jzDX`i}j-#t(NqNnqor~7lSFXLT%*Sj{mrq7(|ZGU>r z!8=vznuBaNvb9~a{!DF`imiuodf?VcsHbnAR%-`l9NFr1>FU08c^|@Tbs$~cnJ(`{ zxMUGlR4w|et{uL5_=by#zO#T?|60|*Hsjx|`gf=IoV@2hg>|UgmZ{yV*6zL2ny!8A z6aU@X*V3=Ns@A?X<9OV{m97W;ln(F|?%^LhF~_A1TrJZ`XT6nErH@|(d~H_!y4>+N;VbNE4t?z2zwOv*{A-{L;$#JAb+4MI*m_G{JM_D<5!H>ovaF*nuI&tiKJ+x-=r;};wz z`jzrCpu#_Gt31A$`_t{E$Jbha;@o_^-u{yv0^*<8_F9qgKg!!rHgbRN?>OCL{qtQ_ zrz`A#;dh_5JO83-ujABKS9+Q$eTz3Q1-rcqK z;uS{Ewq__5Hq^5kyo!Y&@+J&WtqX7@dJ;M4~9y>)gp zeQ$%EQZ`#DzSVx(QhIMEb@r*Zu>T*`;agY zX};{4ghfO#Yl>pLg(+!~&ll%1ydWpQZ-zCH!a#UjNhV_Os1p%2C`@n{289_gC`{AO`Ve{Tt$|PBI5}ts4#h=$Y$zdj z>7LPyJk3`|3Ha{9JXrrg5Ihx`PZNxO*|08-z*{nKX82O~i{zPk23~}qwxwL70k~Jg z02bZA(~_AAk42=gL>{TEz5;)vBy+C>D+UGu&W^+I2x1<|=1AaWieywD#(?1u*&R42 z$>dQCrb^V^H}OFOwgiEZMB+Tt(wnm)&qkVmKmzTXpO&tr*Gl_O4eVu0 zbrjq+K3Y_hOL$f11CHX8Cd5M_uRNEhJCoBlmCDU8HKFY!mO7>h@S$Qg!a3nF;D|7$ z4}TBeM(_h0S%DJwxBOw=h=uix0%T+b+e_GLPTPv173A7e78n>o(Z=3=^d=k637~RH z{h`$qzWvZv;57%EL+VZ!4q5S`6Y}JuFG&S;_FF4C7Y2vIxIfK%Q3{<8!q_#a^9~SB z2yiH8-(%64S&!jg&TI&^6*=e+CK#irz~zD%RfE-nA%=8GDPWV+Ez{VmHug>p{?yl%t*y&8tjRWZ z=xIM~-jr?IcztF(+=RfBN( zV;1%JG7m{c^z|X$8BQ=J9@2NF2E1f2cl7h~SF{G=i?J~b2N~;`TY0UL$p*V~1-i%a zX(vTqpg!u!nBVv)knWd2o$5{KyHv(Xm?uS%KSec}jPJDbwW#G)TS6&ncDz9{I2ElK}|G5+Y0t%H&5>ih}#Je?LLw zmw!!L_+<@e@3>#vnyKwmYx~l+s{2A+Mrc=s_KdJk74|I%kczF$cwV^cdEpnrzF!D? 
zJ`;LAZv&?RA=x(7vu)n<0&MwLm!t*Qe0p4E=`DkgIlPxBc+|kT%HH)(dox0#Dl}$< z-KwxV>sia#lZOHa7dLP}umvsN;pal>^2&E?_k{g$)GOTq%MI6l{_hnMhy1ht?*3k$ zfFt)f5o3f6N&227XeK%M5AiW8{D^EsRN(VDWboiN>lX705_4WR&n{P>3MiuN{1{`Q zCv3T7IS(ssx#roei8({h^FVB}5C>@#w~G#))VS;@=#8vZmObCp2T6p@JXzYOEa!5n zU#1YNHXDTh@)J22ft83I(Lw+6YS87sFFTeiao+hFasOq9F6=i*6Au=O-FbVLeSJtL zD|GP{eO1j6Uj^1iKq>44fqwx~%9l;&UwoYeVz+ko0R+~ZQJuc~6WG+<*S*2`V4i+I z&{y+_oVX}O&W+F~2qA}(gBLdC=nMLYQDT_2^mJ$V`knb9BakuF4(@k$PwTEuF`X`u zrE}xa5QwaTZ=M*0%^4jb8D{2Zy66KJg;%^yTk+HsH9uYU(O21!a}(Ffk0&lZu@v~a zVSgez9*aMb`oxi;-DcKIMC#*%V>~<=56HM~$vGq^=~L*6HT?-a`nN5D!*D&8Cbm3J z%9y`m3hS*#i_AwBH;UKKu>XV=yF^E`NcE$`2@bNWmit2(8c;Hf2T9 z?I>TSRO0mH|4ad4Be((=tNORd(p`ZGGwTfFkEsp?)wBkGgb2gRkIU_L$3qUmgEHP; z34ws%eQWYJCTC8~JL})O`2OVe$seA|bnQ~RcFi~KUT~iNlQlo-`Y&DSmru>_JAKc2 z`e#CERv_Y6Z54lSmg2xs3$$X)|$|wGVvIWEb{=S<2O8!T6rTs4JkJi}t?X>@> zQ=oXSi_&-6`yH+yTbxMwvCH0H;rwx#6Y07F$-=&;F$icIY~P`k5D^S+1P-&h27#KF z|A2xZ0?ix7ca?Ce8MPid8VTmvj+{)OD-%xpGkPecE=Lo^K4@bt%nX8 z@wjHepKA=Cs0%?Vymf-wsE5s>dP+gO9kWYh?QeTHJ#vDb-NVF!*R`r9Rw zQw&|1%CBLQVOGlX0875&XIvTk@A?^6@)_6qmt5;-T+?51p3k_-&$!0VxP!moYChw( ze#Q+vw%Btn|H0e;<#VAF3nZ^9ABe6IqRQ2 z`+nql&YoT3@VasQA-$H?+W4M1|3eNhIJH=ag4weDf!)eC-uG6| z*3Ek0TvOAbde_Z~s&~^8WuSp8`c}* zT+hwOO>eq>y;`;5RtcCGLB*=NC2Nf51L)+1-Ps*` zvYWR(a8p9o<(?9!Yje5P^5B+6)$)A`lN(er}pO$07 literal 0 HcmV?d00001 diff --git a/compilation/__pycache__/fix_functionalization.cpython-312.pyc b/compilation/__pycache__/fix_functionalization.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..10cada3f40f2152dd3b37d8f3646b402eba2d322 GIT binary patch literal 12149 zcmcgSYitwgcHekBj_ufqlQ_YF;7p#7CC^O=feY*c2`mt@4<47By>xIq<79$g%#0z4 zoGsmp)MSf9QuQ{8N<|{A1PFRJTU}|{s?x4*|J{FBpZNAL@p|BP>IK5Yhi|~feTAaH0YB85h3&~UJp9*cu}2<$D9cU zLr24G%r8ZRcYLDg*A~~F`RU4PBj7ZHIMh$DsNcjA{Um1|R;idh908R$%W%QkHe4Ha zJ>3>CvE)_r-E>F4jWy?(>wbDJm;d5p%VGLh9zw7lb=BPOup!p zi;w3DHOw%Mtc`QM>`c}kFsXAbDjX}Df0gPl`v1Z>U}haa(c+g{&-ro`()I@{6~Ia* 
z|EFj7zo);2Ed=_zM=cFSisvvtayUyueOOeng!!sL4DB@hP7U!Q9fc;E<<pL(q~Aw2|&U)I)D) zIUn2P2QV}jtr)P1lX4%;%B_jgyj72N)-%jPARe#z-WtmBz`9^xwD{+KctM#+y|odYp<= zQbzL)ml^$fY1pHejQ8+nqpX*PEA*1_zOWskDMI*%xS1uf{usAF$*h+0;%Kn>P61(60$W4-!99fH*5Ms`-FWLf+l};+PIckovnY|94&%1xt#s~>;r#7! z`*5RyhhFMN=gvWIeijw;Ux8N5Iy~e3ELwrg?2YFSw;7}LQrxD~g8(@lZr9t5cRb%1 zaSa4H$0&Q&s6SCh?KEKMr8ogxpO)jC>t*DEKvSmhKe8liW-T{N;|^GPmk!BzW0-M= zZsVzQK6xcc?=oQOrHyEcy+TC0F+QvKQ?lA8cJzr0-#cTHW_AB9a%)m!4<;1H3s9u6Oly#3gpQTsh z&U>W>M}$pN=4;Dw6YCfQ8*Lod{zcXGC-fDKcbs6$bV?IQdRy;-)vS8ic!aI{&#*P%jPjB&!KLSEoT{TUk^ zIxO&q@8s2Az~eX~z($Xwb3pBJ0w@7(>wiEe(9Po}DIN1bC~nH428Orpx6x^AWS)!*jZA{DkQuuaPTC=c-(a28wOK-L6#)Yj(csAAhY z$c4F)sIY&mBbS@moN=f21|#5DiTj%~vvvboB34TOfquAxQpj9U_D$o#+1_MrYofMu zxwdn*`w4~0>3Q?V+%0~AzZG5xC+ZK%^#>F6$K?8B3D5DzwVf##(o6iN@!;=8d{7@A zC^=9{G?)bJV|}af2_KHeB*iLlBG1NrLB#?kpqN4d4Iz?6s$+$%cmgJsS_KF+E_>l| zhXB$GgCdN~v};9ODmizHlr$*4X`&V_t_v7(xV%5E=+YE^#caw=}#a;8Rg75bPU?#uSSd z7gQ*g8;A`mh3Yg0!id^Lg;E<-))(-ds#q>u&Y&WcK#PFgQN_d@Qd~j=_;{4zLIWJj z^5H>Y6Yg^A5}@+FVvR&0VhK{DWZAEZYaznLPMcq7P(+Rz{*dd z{E-;QgBc&JfDLoS=93sr0SP!zAr#`q%&DmipyY)#4JP1Ff>MZ&l8FQu{RG?bJe50v zu>pMgELai40#%p&6SiOwR$UQ|RXtZ%;ZTM65wPDC^u&rB6^Z<6Ilp>2zxI)%_&;9q31@=&7*TiZ}!ghCMtHz6}uA^2jq$a4+{?@J>J>N zbA?k>vaDkE^mJ&7OwzRp`gNIpeX-*q-8JQWVnL-9vppYoC2HE`n)VfU$0JYWJo)k2 zMBPrgZs&@pGwIoS(>`aPZ<}-8F8RcB$Fo> z+eLGsgtt}pwk{UQ-u9^@NxCUPcgS?dS9IsBIoZ1(dF{@e=bdOB;d^xyG_rZQ|ITjqZnH73pDj&7(NjXsY z)|-ww$NZ@k&ughdRJAKrj7rNB?pE2|x>$POyzJhWbbDrcr+eq!U%Bg3F0{R6ai?6f zd#V@O6YlM@d;9IuMe}0tEB7H-uXxK1?j}FS-we(L6W)EYcVEKWBYS%u6!j!)Ub{Ue zSMHf|Bwg-=t5$Z^&Y!;Bz1aAb>y2d5=InWWvbQhcJtKS1JSaMotl6<>mn(nt0`rc` z-s1`HN!feyLD5M(FYo>_x$;PknU7qhGq!2ljAPo7aMj7Kx`eAqb~UYO?v!1f3Db4BDva+j43M%S&kYKT20q}W`Zt$d2a@isAD7-LUnsv-u~3ny zIxJTmPE_^DRecZKeUEE)EVXval|5Oz0_5P7lWsa?qBfVMY=GeR$PNg4)@^d_K?8z2 zXUC=-DU-ufJY`O8Mn$E@f!~iWy?18GRlDN)@i!pHj-sh|Cy#y9lNuyZDV-W7kYWwzh&$mRHPnCn{Ryiq?mowh5=kc+2kX6WvKiQNmFnJ1S;RtvIS-_oWRm z-d1?sG3iJ;_T2CJav!J_n;j@>b0(}6vb7>%t(L9TE7saoP)VR9v2tW0f~sxBH9={7 
zS*k(lzk@<{P&!UX>EICrsPU+Ezu^tcTirwzDsq_l`Rjl{HJ%;7CUVTn1gVYrYv(nX z#!lFE?R`@64RW+kwJqrFkY`Zuh6WnU5zWx>1!*57As~#;g;_chP>a}C(1Q0E4z{Q5 zJXk((sfHn+flUa-6c|y=efUDc@hioG(+Oc#v2Y_`V6$NQoq}fJkKiF<+R?OnhX~zJ z9;0(#ADRtw2T#K>747yYn+kn=`qtTnv$xJIoLkOsPdZ!)2Q543gkziR*mm2w;^=(5 zL7M;}Pt+%zfv1qU?wb92Y8aXkE~{rQKmj{B!`W^S9sxga+t!8;@r-pp2nU+t=6l2% z_bz*-^!T_*cR9hCGc}N7|DHLk^+ut7Yy+ckRM>t3`;6u#8rxgY2lj9fCzSBnMFf5j zGD1P)(2hzpxC)WG%vOXIca0Wg?4G^FYJgmy+fD?LPpvor=l*UBZ23mKBBBm_n%`{sbhU(b%oNTqu ziu_v`E#fdSxJ({ZQ8dcd#@nYB_byxCOj;cYYprb6E-PutTDxrB3yoOCOr2V>ZdSR< zpUnapCi*vknUje!!VN0#sh{rQgC7lE_D?}%#!1L40pqE;97@Wt62xdS>L#OP3c)Mi zAi5E!@a!-VryvVu9^OdhPzE~!^KYbF%*F`S%LdQTw+o~s!?*8g`~mzHT*qN)b%H-~ z*eDU1;lK>2XK>=4s88^PRQ&!D zh4Z*f0tD5v7Y;#H<9>_A{Tc10kUTr08S^Ce=P(zjB9NAW0My62a>~GS#O_xxTlB*N zWWtR$dlTh5H!&LyziP7fXKWU#{OX zRXej|ddIB)L2>n$)$@W!*YidZ(3#z)7fUgYR(ZVW} z=2_z%2>6l|-oTGqc<35aBCqnP{;3_QQ>l z;n1V%WlV$B2cK}PyZf|HK!DTImV+9GfbT|dd`eZW2DgF6i-f_p&=jAh)v!L>7KusQ z8tJGIxx~ZQC|DQKEH7|=DL87dHhfr;r47Ma3F8p;0JlyZVgqAHlA_|CmX<*R2;O;ydefS(Bf-t4fp~b9j#qmU4g*BuGc#Td~8Qsd!YUG-M)_9-0K6K?Ok2m zt`==B1LCB94%ZTlT;{NbhLCGbZ~M+R?K>ANfayCzrNSQi2>wt7+f6fAR z3tvqb^g4L!fnaLxp@H<5Ih|%8gS1BdMp}$=em;QJLl)839NDYhOJjBTxWE<^)l=Hw zc7Co=hiI-EZ+zV*p=t8_LO3PI4a zl>9Pyk97SZU`%%hnK+P2ab?I1J0%j7w`kCLlJ8jn!cjBAvioW zb||Oz%Vo_z9f1D>2HgaYAJP$c)7IkOynFiBHTUd`?SD@$Z{IuBJ#&2e_&oW*UHfIt zLwAq-?&;r)c!xjR(M+gTF&G4@F2{|EOIsV5mC=M!RqTorTl<+Df))g%q*xR};!j!0 zwHS_7PXMJA|9Rb1?t$L1)|cJ-zrrl>20VTMm#iAOHcd&(TU(b4+foSJ?PNjwV)tU_ zeb18P;41dNm!QLr3Q7{r8rfMh-@W2&NN(CPGdevwUo;(`cq>_0G7(R@AWBdpyK3g& zT0HgH`=7qQV59RP>kvvn0|gal|?i=m>%6)dQzEHk46MlZX%{<{Pfyq0h_$j*j8 zh}cv3#eq!+3&>v;PzN{VX}nR2BbN~KGfXJL#)7!)U~tGbnC{7A7&hW(7y+wh z#Ub%99%m&W+O7r7eYl-cRfO&MYMH7|nU0yLx>@e#(A?0?@Lc#m z5w5S2$Wo(qrOul3s9p0J7kTmQeeR2)&xgJUeIAl`_R4j=s|W_9Dp1$n@A3<%EsqP` z*GDEtu75E3L5hMpgiD}hXmaR!Xfl+_!!1^{Y0GT;Jax;uV7=vBa4wQE-M+M?T`ucL z+3)~6OquGLd29NunG@3|<`2p4ZMRF7+>LTU)1p}}=vc)IrRV~x5=V5?!%`%!x9wc? 
z-{1K~*XLbd?E8G*uiNDx^(}Sw$!%}1;_eh}r%LDRZf#%KeyeGrDTScA*nPjm*wfPP z<8lY=2|7|V-jl78+P!8^P+zwv++uwW%1EIW`B6;V+bNPx9T!9md< zJ?GBM?vh-}lI%DhkmugL@5edke)r6o|8O{L1fC%Kmw|V;67n1Tp+7VSek}f$BjgG> zO#~tsf+S>c8$yhmG2m||XmlHC$c8u{q=igwGtDsujUkKMLgQ@E8nU@*ok-LcgwgihqCGHX$w+2f?W$v<2xw|}6;jRc(x+_Cf?y69= zyE;_kt_jt;YiT)Kur5^Zu7|i?;DhTz>)q>V+!1t!8r%(`Mt5VV$=yWL3W6I#&FG6M#`V~+Uyo(O;h}6?kGUUxfHs4At!$XL%^hO(1GE~=wx-AOwJKIV zK%0`@QZ~%nX4M0<8Oye%$MUrqFuHe%-D0`u@H5p=P~0t)c0)+HA)(jIhB`M$N1?>t z`yY=`_Bx}sBDLRtt>FvVp7mJ1S%|yD8g2p+?tzi^h}A-+K7t46 z|DxV&Hq6(*)e+3+4cJj@*KvxfldJ_x+llf!G(nO&=a zU8~jC&)v%q*vUFbs~3ASJ9&tvu7lL|IjM(fsuNNha#D}b)J8~c%1H&Tg?2YUYO~mq zX}52bakYGa%j04cV^4TgvGsdHqA(DT2E~4tL1Fwa04?>#V-b%(9`?lok+3%y7!w6Z zE;;Fq#Y8E5+#B-^iPA~0EJLd8MD)aSq7;mH1!&avbTAm&9|`*dgG%L~81wkzk|c&> zp64Llo(qZ|zjs9Rgd&3IGAb3?8uZ3xQRoqbr;eZMhbl^;mQFkB zJ=vucXbD;YSfL_Zp(9}-?u$j}8Y&f-vU+WOSvDrb3_l9te-RI^D3uH4d}6`D$D+2+XU@4-DTjoFcQveueeL1uqPc6Jn#V4<qv)2!+V!xolEBbm%flI6A2jh+?LT zv;7fi1N_ofL~6wxZCd^)WVS&9Ca#`VA5N2rd-?Jyslmjv*c z&><_C>GCGP-!jOBO_G7XAw913K5ChAP_eLNZa5ximb>9 z44w(131UW!2`peq^C;&sDMeJTfX>i$;n??WPI#!YcE(U!J3&P`0|KblfPXX)9(2Zr zL?;TaGvaq@+QzxrNyp{Xgp0O8Q+8&G(w~7bDugs2%EFwyg0RymL`0dk8;Lr-K{?`# ziV|u@nkI>IK!|&TPAEz%DjaP~v1VHEWwaDb5AGQR*JSZ1kSnkX2{H_^mCtwK-T^sz zw3-C#1i9SzxKprJddy|&A1k2C^`v%gY#cMSIiC;2h7_V0Bk|an&8d&on9djmb1~p(4xHqinaIjse=cQx0i9Ues(O@K|@(i77U_MU_{o> z3dW19n;T_aoKmW>9^DM>OjFEaR1O3qVZ}Hij{0=LfW|08P^O5&CR~9-D?vP%PM-vm zj}7u2Wl$^AYRszF*CG6QW5PJBUwHLQ2&Ibr>G^sHgYZHxCcS!si52Ma=ZI`OiVV$s zmwb=Wd;OXrTOvWP98lk)*MYf}=<#eQ+hgVWuY}=sO?<+(Smo+eXpPrXW3_rL|C;G7 zL7mK~M*#!&kG00bay%N1NHI}BH33u#$zc%k%~YD5Ao{3jT!j+Kttvh1A*R@AzK4PZ z30GI)5gY|Ikf+$yUuym#%(v+5Ma56TTbP6sQwp^%Js}lYO31N_L+wCIbFtDMtUyQ6 zhxsNIP0Glgs2o)Z2BD(Z(lb_)X=9VH=o7XIZ5AQwrRDd4)Mxio=buC03sL$wMP^ud4TK~0Sh)zmNLO1w#z`F!H! 
zXUB=(0J^~8HVLd~_A>$pdciVIUNE?=r-;j}RBKzJ;_qA?q3-K(=+&pI8yo`43)K(L zLgYJ!T;MAhVi}nSY0T707~jZTiQh2l7p?PbEF-%UT!Oq@Oi z)#LF6;qnA03XTbR*LgwRIc#98am`iAu&)f|#;r>{u@+32la{K<%4ye4OB>yD#T*TJqA&zs?m3Hh47%7epm=cKVTmD0 zee$Vdf%$}dn*B#ENZ#@NAl1Ug7Y%FtF4QJGKP8q}NnuH?Ly3G9wpz~-v zrkI`=1A{{`=}S;{*(r;%fow`Ik0?bG?67vWWSfP~K_YZ{IB$e98TrbVg5fcG8Nq+o)d3yL- z^L%O7JH^S;U32^{Dhtb23OAUp)c-*QR|=qLo|T&5@`=M6nG^dwiRns|3}^HXG^st% zY93aC)Xc`~B?*DP@-RwP$DKFE&@ezBC|jh*vmxvm>N*B;^I#6pk=h8)vk6vUwLRtf zXAM*p0%UeVTWXwBOzJkS*j-h9BX)7w6b9@CC$LHbl13n)AJMj>HbiwIb}5}if(JV2 z%;nr71+f<6jj61%LQN`mNdzm6PYl8JXKcrV@Jmj`uR#}b1PJg<+C<80t{lF6c)Byy zv_09h{nFw2vd-~?cOdmkm%p@7RDQ*N**<;nhsRz!_70osI-Kk}3~IKbV7zz9L`s?$ zips7yE;|+~>p>N3w5skZKgBOO%q0cm`_g5ktm2aQ@}uL&)KeGNYSu!RNHM7`dQZ;a6^s9Q!4nAwN8vD{z~~$_CNNI5np9UJEzo6;O4!!|$$zb?tI zOYzM~zImQ+oiQZ&O=&aaj+;KT5o^(<3OLQ~SZpcF`lMz3w0WlXn*S%~{&nPEMt-{E zrsWh}-dV%ItZfBsl1}5gP@cpio*vc`YJBMhAc{jfm3?9`=z$qmaPJ%i-SK=Pv}2?4*lAuCh?-Nms@pE%tNa(WF~${?qLbqf_W zd5}|~ZQQsFIW^t1w5XeiVzvx5WgzOW44)5b64*~bRe-nwha=MYgQKeZ9E<65MO|hc zb$#Yo%hI1crxc5BRZ;<0<^lo?f`Uz91ab=nT;_Z*LOquCw=-uMLe8`2nF(`V;Ke2~ zml;};J8w#G3C7QW_1@4g4MI;YOLlnDi;zG^CJkYX9;gP7dY&TB;;{gacJaW2V{F$a zi;;dhtU_RjljXfY@VMZ~hK;GF9m%F0?{M!tcm0L=rjtqDIev7~vQSl*s@jyS+BCEC zT7150_r#I$-b+2VORFcl=1QC9_$Df4%kF?yXp}DIuOM;{OlJDDm~6t750wF`gN?xl z?vNSq4rTM8vIGlT$^h0fLVh0T4yaALVw~p^V9rEFg^*${+(;MKFO5JNXhGaBbvNlo ziue<8bQWoSfWv|<4`FHsZ|UzDf`JiH$_KT~d3l6(hhRu$5F90C51l^{Q;h9>82-os zE&mLS$YCJJBSmW>u2gw*vb=d(yw;i8zCXEr|BdnkBE>8h`jpHzy`*e)?UU8==v4b?*RhRvQ$8dYhG=>+CJ4jQ+~}pU(`L$ z-9jLSCame&YX`5NxlwXri8E3VX2~&M-2;T-8dB3$9kA(VVY{YXJg0w%P=AJ8J@(FWolB zZ&PtbMoyug4gTo@Q~(d%1800{+Yi`b@@TajoPm}a9LPh?U<^jEL_3jc$P0iD_sx7ML96#BWO*=!2u`T!!VQ= zrYw|IrOH~8Wi6?)ZOO82U~Vky8b7!Syr^~})j|Z!3V%z*R*GM$*1#{Y;EW}mwRqDz}rVU3~`?}WQRif>Kwt@FGqAM}{zSDJ2k7=>azdGIOE zz=^(t{r5KINPmViPvGc5l6BBBfr&`o$Bg|z0JNlSq^UjC_*k;>v4xU~5A8hHdb_wH z%|Qq|Ur~$3zqA>DSxANRt`%c-4H}?K6deG_HW>_Xai+9`t`;K*Ov8G#TLY zM&{Mq`TGDga53u5H5Gjm#xA`KWGz&(`IFZogHDQpOFj{upqc1Xqm#4H(`SKBgfHle 
zwLdz#G3B5edj2eXyMySRp75w2Da0|Y1##dl7vf2HsIkQeP{))P>KOI&S2N!LYDetu zH!g!dt1jI`(6V}l(xZ$bAq26YrQyj1L*4hm$-2f~1&W-=+Pk4$1Vv4)YuIG!oT<8w zWL?M1*nHj2iQ{0AIY=!s)2HW3H_h>zRIH?}2Q-$0mDWKA_rOYjc$*AbS{@uLu{rO| zO!s(a#zO0P-kE_f5R-$IR@rjEro*9a2e|+o)Tm8Ix(o>t!l4gvB_+f}(q92_tzwRR z5N8#0P>akH55*igSO>n3Au?WI@r<`Um`Ubgu}(vNtTSCQ-Sb-cTxrW3ui4tagG=-` zK=L7&`_b^aT)>w?M82ral3}fzbtD5({aqGBe_sBJx}VkEB@j*X<<{1WX$Ir2jyLOO z>(WMy_w>G3_io)?j4yRO6vUCP;M`VXl)uG9B=DKkr^6^4G>ihFiczrDWf%qTd-yEk zAaac^s(&gPSp1CU(ZlOkO}9xj=Mxt&xmbtGXnV_u>I0OiZ3j~fubE;T)D%-dO)(W3 zRHrr=U}^Af#IFm+zG_2lI4~5Do#?GX9n@rWQ1hUNnzKc7Z}r9HSR@3lYO)jjlp{`W z){~7ERK0T{zk{}SB&K?^YL0Jev()7kofBvcwV-qgCbR64j3)?6@8S(j?;#Rbp9y%& z{xale{r14w1IyD#Fo3>*I{1JW9=1Ur!FYf?^Y>s|UIvx279Rboo&WBa&$mGD1$r;k zdjbEbjE?+NEf#;~E#i4hpU-NH-<)C5YXcSSiU;n#dA%|86`}4lY`;>iyCbR&hXra%7oYkOSwtZ9wDTUIQZhl)%Mrr=_G zvdeQ|;LOQ`^sTe{KADO=ULq8S5D!J6EBc*{touA78>$vMFT_?@OsElVeSQJTK-^=f zPu_ycpyuyb9I8WD#SM$|PTQV^if!P^&~8IN#YLV}Z5Tgvhi{)5e0y}B?@94Tll)Q5 z@(#ru1xq&KXqg^-19Bfv@%xhezU!Uu?S6On4gN_mSv$7OJpU8xJijl+_a*thl@(j3 zpMM*2cc%FLNq+zJ1MeMw_xMfzsq}j2Hr)u0kSR+|(o!?|*mUO)cfYp#rbV^WDBPgr zjSejnMbuwZ6j85JM;!=PK6xQq#!UwVF=GIVp$&mP!&W_(4XL-ft}(O8WEty#J8juh zs$+`aY(|x(Z6hcB9n?`cZ!{VlRUE2Es{oF&@i2I8sb4|CXE)-x_#k-OP7*v#}jxajIF)bYJmXmbqCZ35B2(ntIte5ld9jEtlxUAocfjCu+-eL6uiPu@N?yx zX9iO3dz0;ZZ(5$9#{|0f#NtwTndd&5gy>2?4Ux56q7o!)^v)V2v#T1@3`>OMs3L96 zHK+_9KG=u6@{mW4#3i3dJy=x%LD``4Lk~v@0WH1Ss}>D~jmHDRSnV1zfSF-jlUAq( zFu1TZRk$8Jb{F`fTm1T4eDf{7BV89n+ScMVg!GE z8!=U+nHuhq$+0B@!J<)5NOwUj#Wg0m#x%olJ%-71SBIvCJ|Y;$Vohpt+G62smwX=+ z2vjJ5J-Lo4Ciq$eJ{(a8hrCSRC#m#^QROYM7$a z-=sA&1TKl9Z558cOI)dI&QIb2T9@oBc>4?VyK`qHc;}RN|)Yl^PeF>hLQV|k7?IeW% zs*`U4?Lz-NP?iwyC=Puw=m9~QLzi$#Yrhg(K$mvv@~)I%GnqA1t05CfwMd>o@&*v_ z7>;&n?)3l=fT7|5Zo2RU>C{e;jaaq~$rdErv6K;(PGLOz!9794GKyIc{a$drR7@Zt z@O6h$0!NzqjZX&fC^pRNIe=e(D3vn(d{VW=;`dEmDuPm~1oacl)j5rN4tV)_V)!lQ zsxne1j*k@rzL<0Xqvn7NpN9%w#SBspx>AxJ#cAMMDtZR1*i9Qtb`VXgM-LzE;P;+6 zBh7$B&+36uc90HXUq_Ijz^U4W>KS?(Q<3b&3>Okq9g3w}{YYnzbQ6Aps1n%^=LdZ2 
zV=#PdB8H|p()b&~|C+STk+%0q+56<^`()4iq~U+auJ=jP$Bfm$d`y5WBDq^e3~m2S zHvgJDl_XF7njHQwYuTkgowu&LU|Qhe4pf)oTatXsJiqaRb%8I&_p=wQADIk>r&R1q_qVabCkStX5!2%XD8049BoNQTguUy zbaY-oXFINHvixfGRQ1)msk&74?qv1uRQ17R^}z+UYO?vNYsz)CZK^F*vo~3@H&t^e zS#xNCt<0@)I9Yu-Reds9eG(k>T+9XQc;_X{6&%D**4oY*LCzpW8Xr3UbKv;Jj#dv4*_$_#sB~S literal 0 HcmV?d00001 diff --git a/compilation/__pycache__/fusion_attn.cpython-312.pyc b/compilation/__pycache__/fusion_attn.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..253df49fff27e1cd339033e3f670cff92a39112d GIT binary patch literal 17705 zcmd^nYit}>mR?oApJek*krG)H^>B++%hp>OSr3c0W!dIPBhBo%F;=_PRV170m#eBJ zk)|B2p~Pg?6G_;e6}2x>8wZkL2bO^ZmXXI8NPYxJ0g$Tr3sS>B&%$~MQFSvrtu$+pH@vu*LVY_xt(pF`cRl1v%wxP7Wru2KPbOlN~lnqwfJ!8&j=NAMn>M)!Ko;hgL zB~NO)noQ|gMIXt_QPJ=nOlC64lNn_Y8OP9B#7)Vfo=*%Fb16NY&m}YIafMno9!cuD zqUK&s>ZuV$J(AQkl==^#Fh>nBM&L)8O!i$xL4vDyf&_Jy2WOzC;H z9Y!maJiJXQexf|~R5FX8CXl%z#6?+%J7h8LlpTsob}DY!rFcf`YCG}tkVkf3aK(MH z=YkOT%U;9*B`EupkP;qU`EdJTQT7ixR-j_YAqOtFEP|;=uOPb_E44p0>H2AS5wj3d7IGp(Xi|^A#X&?@2wCvEJjkN# z_!Gy3Yu2fM@-g59G^T^8SbV`XjdDac?nHn+N zj2Yi@s_QYV;lhDvhUe57`qgMG9GLn9e$}TCjd$AbHG{Dq4YVkt^TJ$b_qE+uci(O; zH9T`?`)tEAlb-qd=8JDky)iv7Ti-YHT&ccq*4L+QKI!N;rD(YS z&L)O(+TfC%AO_{d-(oAGw}rRGE5f8WNjHjk!FLYWaEypMg-RXk)bF7W z&UP<;*bLQu1c~tXwbwZQ8sQ~FkM};t{0 z4M%^A5+Z04AY0NT&WR*j)6G@gZ*XSkph{;Cx+FetUg2C9V*R(r8R-=`MWnXr8H1O~m@KDsN_oS#>H>ll>X?N?6hA{rJU?;%6 zM;n8niSO*eeIMbu+E^Cp?gV>nGD~I01wn5yb5&Xy5X}47JWW-6XBbf1%(m6vM~vm9 zvCQ7$BbfduHCh%X#A>g1uDb+ak!w(GM5~AJkO+8@INpnb$S#&wu~$FAKAZy|fJ7sn z0wMpQcm=rmP8~t|=gu>tx*HuTv}8D9Pa0y{aLVcIQvT@QGpyv4vjuhU_|x_Xr7GF{ zX(f@%=L?B~qGpRa@YbSrtNUvhO`~Q0ccS;%#@)9zUH8n0Gg~fgneO@7&Rd)Qp2-Yv zH8zW!RKkWEh+n{^m4kMmskG&cl^f1{LBr(*mB{c`Ml`&#GL$T4bjHjKXEHYi%>!bM zqN1JAqAE~hz{H@3B_N(^i8E<^1bT`J`4&@$_TgltWw$FW{+_%- z8}Pe`&I_M4w_Z7P`OuZYKNy@Gm~ZNs-uiCKjTP5d+~~UARqokQ>e*55*;ne>cdu#x zT%`T8rnW1)F7KK?dU+38*Eg0LcfTLKJ@EdfyN$c=)$c|NaK~^%Ikc`6S~sKL7`r}p zyW#r8z0lUVP~F8tQ-{i-?oz0G#&tim_RAKbrI(R5wl2%KOsYaJm>H*UM&thonMa8z 
zb~HX}(Kds_A_zDZl~+qmtX3OQ@2{4dsam`r1FkYWjk>wi>Plw|8Dk~HLq$&Hi+Z7` z^NTrQNxe)6HT>YO!)Xn=G$bODcGMpuTEaAe6ud~-3vZH7m8dQoEn_oaX-|H%Pe$AXmD@-tiFB0jbD-(RxWrce=>{fPzXRq!v0H?lrR;_bm$er71KoI7IjA*&aZ{ zFEwobl1liBX`LzjS%L0h3X8`Py$x)sHE;L@*=L$K>~2h$J-kC6h?XME9APqZC@^2Z z%;Aa$CWW(NJP1+KZ&cN+=L$QydFUuD(m3HD(ECX!y0UENDbcr);eeH^q`$ z5n2>Kh#zC~h~i|i%AxIUL)q~y_0DgpcYRB}`_BUj%Ud+`w zCD_RkvJd4pk_1ZWvRD>O1pog5R+WThzw9?PpJ6x1pDtz=>~tjP1kMSUZs|s&mL_EO zg4NQ}w?><*rDm$8FWD@Xg__#-WmU+UCj?2aA~lMn-2?^=@pVHyWr(MZV1;vnjptMn zS5md|CD9|zc+o)gD8@KJ3=iv^^4b~CWPIxbyR6{PIk#@!s=|a@X6Af*tE=oJ)&`g~ zhh+h!)}B?X#xe6@hj}*VJZMGia#@)0fC*QaIv)e~uxs#^*KogHRMauU$!zAQlbNDo zc=+z9s{zCa)m}12V_ed_C!Rw>dbB96k21)7%rlo6(5B5|AmM}uo9ZSH zK4=zJt)GiDT{>{tGu?2l?P}Xh!_|(N<2O!RKXF_BDEDEmw02;&{aILYSJh3v41uh< zz1+06)UC{oYCPE83U+vC$JcDNBEb&hxFuuC zV~2MrC1qk+7{&)M)L4{Xtp>gmqAUyVJKw7a80VZ-3yE&gyhUZ6Q5#C1aDmEPHB{DV zf(MnkiIQZ`cbWZCS^|Z5xn1pVO|U8{W9n&&Xfa>>AB1QfS5RK2(m_OqYp9sXs7EOO z4=Adl`&!CwqKF7ZJx38ynffLo2tJRZ1NA<8~JhjtuQhybo544q@Qb=Gxk?oV|Q@!7p@fUliP(b(1ebVtO=1tzJS=&0W)} znVzddGwRK)AMCxg_s+o22R|Mx^&OsF_u}2Amu5pRF>3jOxHdb9Xqek>Jp3y&*Ac7x zPPRA0QOFuFvsCo#nq@OxkLihqZ|DpZf2a~wjq#S73zj6dsc%vAHbL{GbFdC*8cpFf zIuNcql0I2u8~v73ATx;eeNZo7vU}0%3fDnB4mW?@6z~Tac0|SF(U6+N z7ON*IN>Lj0gh5l(QOA*H~km$)UWd6a6T$L!}PJ%Xf4 zQGqHBAo@Ep6L?`J=ny))zw`&(JDIHjzpxc>3&Dbya&!lLP4x@y1EM#-3%01AV9?(N^;PzejK_nWASHXhxqhgQk(KYaLH+$Tt<_KJbM*t89QajK=HK@I zShHo*GVuo@=6Bi)5;5iAxC3e`{wP#cX*r}@s@r$jUj{QwFy%TEs`;zKGRK2lFV~x; z!(OQHmQrq~fLBy_OAxpFgs58ry`2|YRfxCqN2{qN&b2H&LWPeE+DXWkipuPP>KlMX zu8H*RhAF5i5}D0l>GmVI+`d(}58CNe%XE8hjc)IK4Bb9xr)fCDw8AI*RR%e$+bjns%g2EejKVu4Czz3!;f$=K`qX zR*t|BIhS|_t4b3$6}VI~rW$T`PVCIoD=7GahyofF9oaJVXt^%6j+!tbu151BF9jyl zsTl$kvcnhk52+`i2^*5@R=ngnh0^hDj~1e0)>pBI ztPOvOuG(!xxN9FYeuHlGt6Li&>^H5Oi!@&eUk=Z;_0F~S&aH~htz7>g=wY^tI-zIX z4gYojjk@b~Wob)E+ESKwm84yFdhaAZUia^#cO~drtLB;@qhH=W+u1+c)IS^QXL5i& zRsC<1s<12`>@HY4s`|B9Dgt&?G!?L;71==y6ZQ%x>LccCtG|!1SGa2I6;|WNUL#3X z%Yv=1hg-L-9TrV_b<4V;uQY*J_L0D-4I72k&-=dHvXA)m9;+X|%rX0j7pu3iD%$TE 
zyJ+~nRV|MG3I7DX2@*6$>ppHXQU3@9>II5o6rHAsIFR}e5k(u8k*-Rr@VD4Z{+OCb z6cKUT@rY>faR~lTsrR2yL_CO_Rp??jN!t>eNztz{C)$Y3ra7TX+%`T?kSzH#lN%$QHi^=_@2F=gfiYo|}!_ zqH{<7dG6y}Y2!@9-V#S2X|YJO+Gi*v|@V8^y^o*UxL+=f7(|ku}F5{Uz|tk z3nHg4NCV-vnFv4YDJmces4<9br)(=lB&Vokr&0egMZZT8eVbPIQ}hl+0~E23_&-I; z*7CCEaX7I-z5fk370zYB(dFN0Sxb<%j3r3d*h(l@HI+~?R~>_L)iEen9fNZFMIqewkd=m8SC(3C9=8k@ zo^ohqDYSC>=!}alh}+E{b$!@XlJITHzw7OFGkubddPVgWUrvBSHM z-ZrFj@bpT`QhrF1i!>~@fjfY(S)!n_sDu1b6HSqf$e1Jmq@r7f!RJN$fsBjXg^@%wJCl)26^}}B#jcv{;7!3z!HpV&AV!1%f2d})h@$g4rOWYU; z)tMcLyhoN9x4fot>pdbl1o(%Tqycw__bvBvc!&@T7Id=HvH+u)^+D7Uh~RoU8a7*0 ze@)beTkyZExT!j!?OK{Olq6fX-BBC$$QsK%;U1;SX<2}fXyZ6gv&J^FgWcyVbEa@5 z{*j${lv8RQ=|y9 z2*?S%PUMcAOdB2lqXQNj>#GnM2SEhOUlPK{i^w%s%hov4kj^L|D&_|UP9YtvzBA&_ zC}W%pB=EuwPhn(wKdCEcbqS0K&?@6p7z#Ys2$(5Wf{jI6YYe_Vtkm$CZPg&M>|+M> zXkE!_>~n+>qUVm8$}>$aFQ1@>B$crzlJt7iqb4y4<#*)V87A_GGE;NeC+Rg_0bd z+smPirO?Kk@(0;l*?XZqpVxPn>(`g+*Wa&?zCZjgl#kLMra#Jjn3)~;(f!!(%uBoH znpaG3FL%aDow0J~)>7wI0PAVI6b7({u5x5cDYE7Ez(+5C`11Y8GYc)k>TM6&g;4$E z8+U`9)35)Rpfn$9B>&w7hofQF11EjP3JLyj+1F9>bxd!c*?QyY>rdbJJ;mjV7in8P z+;?$s3HK#&n{nR@NIcTD+TyKfR0&mwXz<<&HNh%r@ijYl0rtd@MqCz5KE$XI&jov* zBTwoIMKPIREv_w#)|lr#e6Mc-jN!&xpUjvM<`+MaNinCZw^@~}oGcFGc?!cg+5?Ry zYCcbaTQPhm@s)1;VZ`ATKOw{+nGVkJt9q`zdiB-vswYdUp1j?{T=VYvI_71NK?>c66be@l_AxcyJ5hN3Mr%acejaXWyor|p8Lb-}U1y?J{4fq) zyH}Uot3fXABjU7jZRF|*X&>dT&84o*ctVNN+xl1Rmpb<@bpAFpv1~2ApTP358v5Ph2_knU& zNns>4#@TeNR$h-4^>jw#MG2PXp7&&9=H9n_9_C|xjbA1$3_S0P%^0b4B~{+=@QaPT z=jB>IQ|?LEhnL0KN>h@LjIgpEIO3uhm|eU30sh&6 f7*;X-!3hZNr$>+rCn*P3?hzKQ}l5b4M8{*2htXJ*1BPuCkZlE ze6>!<^wKkgOi<>c$gxpA&y?UCgTyBc7|u{oRf{KF5)2_0H%jy%8&Pzhz)C@cxnR-| z$Lsy(r8YehZ;OZH$4s7M6XWfj9hcgbHZ?rd7N68dW6`$PRW;HUibdjKHHeD04NnBd z^{}e7#wQJ1SmWPg6@Un7fM^F`nkK7~>stG4`&=v|H9eHN77yP&vUDU>lWFNni(PBL zLRdLWn=V9zANYmz12Io@3JV_zq8Amvlc(THJp4*fYRix*`bFVlmV^O@LQ58YNLB*0 zA;O(#g)706ufBoqAM%G%Fo@DD^HM{?K>NmwNf$ z(ff^=6TNA%mw6DvigwpF>4&;;{W^d^gZGjjRTJ`tkPvnR{pb$W5yuxYz47b*LQKPHu_5AGf3yoQ6-)d#;+vjptQc;sDA!Tl^&L%yqAk6i60f^0ZO3;FRP^)QYKV<#{ 
zeGl?}g1kpQ5buBlj<;w+7z7Olrd&O-aaHz4W4ad)?45FW!%^76<#|C-JS^J-(`Mx= zwkgN<15*PsJ_{3CsWJK7BIt8$x)-!dOMAgI3vVv6MR~a;^;)Vg-O!sAd*OF@JALj6 zTK*XD^49X9wKn*ru-z}gUR7Yv|KOFt)x6QFtO`ED&?-A9>*30p{#Y=g7g!I&qaBB7 zns6f)9P=~j*ha6+p6LDmnUtXkz;0exc2K2kqbdyJ3DbSiWDN^S@_bf$=D#hV90z16H{b!@3e-*yi{wW~x2}v>|f-O`&Z%L|!5v)klE$6g*ld zOpt%wRr8K_95W(!iudbQ>V27d-)B5?EPN{yAOiBc1y75 zRq#%g7J?pQ9#z?N1s15Do8#Z~qr#oFf&k{Cd@wwA>|3~yJehSgPxq`!2y?;8ruRkmH&ve1d>Hd7W9$~@w=r%_}3#?rQ%bg>1ORC!*`s_4RXiSb< zMj`2e(}BlB0x7h|Py!M{!Li=3=vbfmNLbbr=~-x#e+uS*SC|)mVYR|Jt{%ohuptc* zowwpBT0ql;eoV4t3~Tb1F^2VS%h2{F=or@UEd$aw`E~J(+t0F_l#WG^k9BF#AL{ZO zVnQ6lbhl;57GAUo_B#+hH3{)uI!{5{y{K(Fx%xAB&iux5nfk;5hH5bOAr+_qG+Q*Pp$9+}&evgvLCL3HAhh4Y^iT5jH2JE8*gbQNk7~a4 z$kx1Cx94Wrjj~^s&wTswo`cE8+JBz2EXkYD$YuX{l*da$Y+#duB!QK9szxm31qXhccCil5eJ1y7ExAvU|neowj#> z;q-$^?g{_Onj3|*DVuAQDeJ$G=nb3wS$wd$$7()TYbDIcIY7jbre zopX@V9{P>e0!M$Vt4E{{DD5F74}_Pku=&VFdj#<#`*EZ@?J)mApfHDu-5ZD%I|c@u z%^kx|E$yAWtl9jUuB;aWYh(C4NxFbCNLUkm)+ZVkRf)1@prI>QCZIQmj0^3(**xJl z^R1H;5=$T!RVM?Ods!E}`2jo{)++2Q%4hUzQ?pZ7x)y>Dr2}d4Kw3KRSaQ81&Rv-I zU-xI--mG*mEgs~l#aGY3qlw`P#Z*<{e;u^e;82KRN@rMe3@cQC&?rLYb~ zeM^0rL#J{C_Nf-WeB$|2?WeWr!Hb{nU&EhsZVPoUd_OsKd*s$grlB=QV80B2u-rfK zar=YzH6(N8z`HP-WVbKhx}4eH#<7Z!Pvi#!8xZYD&+XbNSOjwwbhS`dau9J`i|6lNT)K!j zuAJLNt5)rvbahk4-juW8uJ!oc(WOyNe_?gb3|$+U9m!a0)~xU{*CNoy1>vUchAl^c QOm^SyyVds?1C@5uwF7Kx83+&s$cGJg(R}P;^H~G^SYUr7$cC~Lr`@7mVEe~}DpDBT zMbEi2LyEG}UWs?^J@?$ldERsGeza6(YT%> zh?y*Q*Oc#5xrweMI zNM0s-;u_JDcSUbTu3zt<{drMOU6XPHG^TgbL0zau2?1EwHR-O0MsgWGw;Sg6(4M;? 
zfA$^@O#`%-?rA`u(EDga@2By+I0%Ta!?s>t(g$F#p?P6+@b|dP`BA}*ol#9wy=YQs zcW5Qcp;w%uYN<1n0VFhK&BI5`Ul|qMj+fL+bedXriJdFy)a|QSHg%LdKoy4;%O$2V z!$kak!&aOU(+UcGtD>4pi76JHqpTX5$UXz(sWTPZDHUI4YPs;5YTLg@E&{x78YfMh z*fFf(VT0l`fYkQ_A=%W-^Nx+vWcksH<4&*NEbdrvn#>MgSyi+Jy-F74p+Y>aUTSD1?u!YPZjRFuO&d>7gU08)4L`Fsnr*C_T zy|O62N`5Fb`>%tLt3xkSM|KKS)+&rq%aM_K*;lvjTrTM=;A z5%}3i+amd0`@mZJlWWqGP(OsWG~mrf;c33PLX!{)emJlGDpcPf0%%k7s8~{hV96uylfj)oLiFZGSv*#gl?8kI6o?~Avh=(CA5)HH&JHv za==4hO}W5f1o$0{uwg(Rg`bVp|B^p0lC1lK(yM zD?8Hd#p^KGN;ZsHZ+w9t&IKjv%# znQ*To&!F4!w2n)qVgp}QxuDtw?)1Ne?IqIz=zX!{88$8E;}#lyQJF!W+Jn!4<% zyi}dfS1D9t*t<> z=mr?7Gb*vez;!o*yHf@mG0lxarNE9h-jT>HFIG4Jz3M=V8R0D7@LqyZ8x$O^sgraM z-rW1{-i@x|rO2l}{Yz&arUq9=-m~wgMn1_5y&HYlwP)qbhX*$IKfS*H>3`MNhM&LR zb?RYecr$ZkJ#*ytnT^a-??$&GWY7Kp^7C_d&TV8SZ$v*ybYDNWd~W5?``KIB+x;7v zuRTai)VfG|pw>+~`!-Yi)>Hd7Qp0PpVb6rax}iBri_1m2(7D$C8LGc#!waCbKWkXl zY{PnCs$cX_pv@3Bpl{%1wqdmagn*t0n6vFoXwcfG?^5->hK_uNzN<141{$ui`J>>! zEm<`k$}DinmnlRL7pdp!(fgyjYbtP{h4K{KR;X)@+Ygm})&@_{Xc>(N`^{aND@;S6 z)&`k?qgd0o}cESCmb1_A3F_I%W;#C6i#eYh%mNHzeK8&wp&q$vtRM0N;439x;~&^#~4vIs{D zbS2~ko5DW&Om-Hl)-ai0i@ZX1*%Yd4~Ek6oE? 
z1X5puYDXj(*FzBBg=ZU=7$SOQM$kn+lZ6P;Pwyl>d@Y=dVO+S|Ww_fqUeuVwXfX=y zHi#43Aw6t&(?RyTU3k?JH#{ByasTPX;d2l+mi<@?f<8ZO_M#rO0_iNuibw8cFA~H1 zfv?mWD+N~7j4IE&v0ynacnv*MP_T`^`gfqZ3I1byV!9wM_AiiK!^DQ40sKZ>G7k|) zYoB$&vZEpPDttWc3M=R<0L0av!%-V;JVfIrz*MU`v>W#t*=i9dp}vCF56DZ>JIPOu zpZw_5jp@za1M9s9Rtp=w<4e+yGoY@P^;QiJ1BV8L`CmgH z_L06eHn~3he2pOd7svl2eQ&bP+qIYw9$KOA7j6~aw{BT=a1}$fh!D=)(9msUyyN2@ z@YZBycfYD%gW(?mQ~z?nz1&FFb-|Hf3NLwaiR8hXTuTI3ECl2Ngq5M{Rc@nG=*f8= z*!s#9B=T7=A%YwPvZs8-l;;Z6lIKemSua_K9C=QKmkh`A92%n#Yhlz04^H49cxX9i zI0d*=Ra18O6Ix3)7#tJ77&PEZPyPR)`ianz4>k-LhWxT?IMU$333%3M1|$LP9NHGF zupKIr<X}I!x7orwlp8qENv@p9Jr2wi~%L=Y73Ylf4duK@7X- z<${sCg^!U{nOb;nd(W5fVn(y4%o3Qq{X*8mcpK-w2^GAF{4Uvj{f*@}Hk13;llxbn zevmwTf9!7`jGg}F`3GmG|EsruCB4>rc(rf6_wdq7Uxvtw!UbV-?DS^xa4iB6=~D8` zD2J_`oqh!HwXxHWnq!ab=U|f$+NUN$KWm>l82Eq8P;`+Jz_1$z_N zdI76(sN9rRGEGQKFcHhD7d7sQ+_?X`2k8z&BXuK?#h^^XI1%OTIjbS=o3e^sxyY|# zctr8voADW%3!kfj+lQ~K*$O0vc9zdkc$=Y?yTkq}hW{#*^5Q9L5??Z zLV0`~9d7fLjqSxQ559)6HG&noFV4yx4%CgE@?K0QAc)v)303V_V(&7UYC0Y_C(Y7R8GWDP(L+_+U)F2RdDS^a;UI4Pm zKr)-rOu#md$fPNuB(3NsGZshgSni+5)6UpQI}>#})4`f*f$YdLnu+tn|KLy_W%{Q* z=k5a_xRm8g+O&6O@1Faq2$$`cQqmA=D7}ggo)aP-EO1@{)9G%olG8H4)ktYmT>s zT4;*XkMB}CU3rfWRcM78e^#L?@9{kqTA@PiU!u+zlv0(qW}O{hqR!z;T?(aH=Tqy{ zc|<8yd5^zTp%qHA&Tg)8om#a^73!8u@Gn7*txw(`)^STG73H@)d8zTQEA3X^ zT3^BSabE28m#Fz0l^PXFtLD%)9d(#uomVN=#nHF)Qr^&Z66=Q8I&QlfyMx5mLu^A; z>>!EtKx|`G>`oHvg;-xz>@E`91hLIkvAd^qftF9P>3~ku??3RO=!~3CrTNHcI?kmp zCfQ(s78@d&bTT}aNsOkW$wVX;o#t4GwjQ56eu?8_$q0sg7r1nIG{f^;A|1XIi^apE z$;4Rn0%klNjb-*H*q1Xg2$@d3d^(uq<3O5EW+I97p%X&^qf8&=ro;{51SFGY!mnb6 z@C81ZnGB~cMkYDYGjV8gXm~0SiAP7zaEVlsKRp!}8>(ZI6I?=#Je`iDt8>~Wf|tf7 z2T9A&1=p!$I>JvKjGyONHk!chx|Oe(j78F8&=t{g^6dWLneg!wr$u*aG!o<3FjOIR zZd93Le9Mzatz3e@0>x1wnx#TImge+hI#zep5HfHKtG`Mqi#=q7u%Rk!VwtN{$jlnS zx3DJgt*jY*8*Ab0oMU`Lb#oGp%muYup?24lEnpX2FGtd&7digLOe&p>AD?=e^qT`#8ONEtjp`fI>HV;LS z8a3j_iAr6aU#^i+VzV^VuP9h5OY`+vYTT)$R^E`yqJ*ZPPvdSSRC%w?vqT32R^9~3 
zqCK8uxtP47MTSjJ!Ai7Aky60Ha+ji`oRY;T1$iHqcETchEyAffHNO7?-sgzOTpA?Rr-|zp=g9GXKY#sclb+B;qY|e1c+*I26Tyf{fM>|L6#){rP z!P{5#Zu`i)?QS|}_>S#?ZzH_yzuLNUE?xBX3BJCZ;Z6IU!Qyj4;kh8vE6^4-sxCkb zfpfvLf@^Alnxc4I)gG{6v~ME%5TDP?^4O%TtwgEzgg_QO_>|8~f;g6n67 zdSJHm-9SA-+k7wN;{)KWZqW}h>$fP)Fk&jcAH6k=VL!-+V#)$2M_8Fa)w(Pl1>X57 z{ayOBG!y}v?|@8eJA@MqA}Xdm!BPrhw^S2&GgQgdnoD2H&Svvx3a-H7X~7jJm;-D3 zi5y1yiJH4wT4+fK9oyF%28%Qd zV>`A_H){_9B)zCT1db)g9rlBX_a_9`$-A%)P8Q53$q2omPMlk>wi3^8CYd8#= zDicH69!As>lT!@mlgSv5e8C&gL+Q!;(HlSy8H(SH9tsT65Q_pb#5E}zCFI5L!$cw& zQG7`v*ozc;L{_puT&2o7BXgWyG^|kgF1snC=QAD6I9BvPm60kN*T((UNHcxtou$z~ zLHtv6*{EYW^M?G{8|SW{6Po%SQV=HJO`08r7FCBL3*E{uU`)RaQ>8#M3uOU>&FVNk ztLF@?0Uib(BZN)x(9@vzSu;zIIdjw)&04NHLJVu>jGRdeI8;F}3v1KP!`j!0cdQdH z^})GLs+qHDr)bAYUnK3s#;mLhz@qK{MNi$(Q+tkjoenupQ-OL>VT2mo9u!T{R5-yU zUl%PGxdL7$rfViw4 zLaL(78X%2Tz4h9Y3@G)GM$*6D@L%4UmR8p2 zWdw1Uzs9lafIGzs1KY;+%c-lnQmI-FR=s9>Sf9FvHOd))mvq20h6@Ie5c=S!$VgEN z_ICkBQeQ-op?QrzjvgAtmKbRxVVO*(qPSB^iOj1Rj-P@}1=t`QqsbJkxdcFI*h#0b zMzS2F)qrYmx^11+ASq@JvZVeLJb>7qrlMz);Mufza@o_9vy|MP1?#*uzb((+n7BT% z?C#4kC1?G$@!9cwU4Hn+vFpc{oq?Rb)U@G-^}2O&@TTY8_NDgare|`FlDj#-bMdLg z%LUiYf_bN;_5K8!;a@=y!8!jm@J6x6ibEzQ6aP0rRHIiISe=ro@|K5F8^MDx(!%iX z(6?j}b(zUBEHU_9Hl{1$gHe_htDhK#PGT-Ss!iO_Up8fpSuhjA^cjKyBVY4DL@eM} z5f1(oSM8_RZEOdbuAlI%bZ7xghNp|E^SO;`y6x>P2X7A<7H!^uoqG{KxBQO)@h z0F!oWyU#YER)5%uo-VRDD;;?!FeB4VJv^rFvh<-5@h{sig;&#KN)pV-H>C zdTY)IvZApyzioabXSv_rxwz{`-@JM9hi}Yv=j*-~xILURm0V5vo`Oq-ciNMSAhddN zzx32M6Op%8!2LTlV%0^O%lFe#SyAuCO^bYf6)L-`{)@gJ_H^a zV0U}bwMB4kDcKuK_PUbYrHnTW`|;W1C3`bQ9GgA1;xfCfWy)-^5`GEj4h2CEJrDKp zFQB&{y#wHhde}lq`shWBBl|B{trP*euocY!Uaz2J(IKmhQ4aUpOEiB5@)Gp^f+Q#q zp7ZFTnn?}7%KaUHVJGG9Dz!iLnajuYJg93dGw=rd>+zLMNSTqcAZ4X|EoB>0cB;9p z>_7?@mN##hZ!KeKK+;y{=XFSJXP8}8pm>FXaM@YUbUtWkTG%tcr_2Bit+*R;w_i44 z$V@pKuAQ4bmmd}!{;~z5tW;CmLUuk|wqeLlSzWmdF%J;5eREi7d%ECyTCi+eabSjC z7h_c$0J^FH4AnG%QI!TT^q2-7IUs{1;BW*-;ZdEUMppgU8}%}C?3@^$bCQq?^%Qq@%k$^}qxOZp)H6{zD=fG23pMCd7P zv~iEEm^Z=P16Bjk_!<|za525w;7GuSV&;<1QnpH<uR`g1B#@Mf(P2nO4a*{bsa)o$70uV 
zU2o3#pw(Y&-7K_jUT)n|FgM-z^xU`Cml~Q&^_}R;_`zmuw9Z;`bZ!@D0=GBkAd|OB zGN`jP4K01~Y{5t)@6muM2+qU*G_D6T!hkHDr+C$OCrj zl3^Di%<9GNhnNb|t{FxN6ugIp2Eil4==K#|U4jd+SKZto>b=Frw_Dz7xq09x8*h7x z+x83F_J6o*MXy6ZppyZCv@^BWAOevg=0*YxBxyh>)Et1@Fq$^6IyA&I52FQf3N&MH$Y2cX-lw5a`mON+v( zrbS!7YRAe3EHgRnwEm%V@9*6U8=BFG#jrO?JsfKzD&g;e_&=d;wM_|CiXGJvnhh?1%HUtc zuS$WZfEQJBh-TO=Mp>|#D?=s=c%p_C0be_0k~;n;s4#^Vhu=dTH~1EwoqskTS*%}f z*pxF9^_+jb;OZ@ydnI;OyU6FT_e4$OJSa{T{Al>77W`@4I}-;ZpeM?;x}2O=rCecV zFB^z@WwLOOK~b@!1Eg(&6Nrk{OIp;5T6F*L@cNpRl*>X}L{q0U)u*y4tR3CeYV=>A zlGIM{K>pjQ_U-rWUfgaswyn~W?N-6uii&b>h^Wdr7|Zs8xm{Yz)!%Q1s>oW#-c^s= zIveUgsKU(ZZ^?TbSd~|x1N;nn-v+N{{rMqE-8Qc`LXw936_(xw9vPu0(vWwDGx2XvL@!;qPR4R8N^f7yZ| zYt0T84!H$O--->>{N^eZ3UpOP7^+l+A?=DD*&t2gNBo9ScvPntzgEV+gnETjDLb4* z8MQ3+pzgHerIR8P2j^2-<+NjERceImMNG)@M`+gq?ONCSR{27c4YuWW?f#?j5OSdk zb`TZpBq~_B{G?nh3!S-yrSa=lodVHXGn=kcR@->JLRH>M%@e)ASte+)sx3hDo=Q%I`ZL{26!Zg8 zEOC&_tI(DAcvFQ|sC9C-DCw2Ava^P-Q%|F&&~88N&m`bPXfnyedAOfUT6CJ+V)CcJ z2p{MC(=Epnu_?bYwf;)(^HTvAj{_`is4EtU?_yj7cyPf`zQiY8FTwE=En|~IVd(;w zkzDNCE;8gT8YKg!Xsf<^D6b?a1UpQ!H6+!OJ^Xq`4&|1W(rFb={RTRa`aAG`r&>8S z7QNkqw|m*!TWjZ#4~bk&KOTY2O~HJ&5E!`|xZ7IjIlBx2SJ(Zfj>Uk`)DLGtjt8!$ zYj4cHaj$cGv2%~mxo5d^-$$-}WWQ6g_kv)zSPKvaTVQk1)h>V@boCU>U}5gL-wd_9 zJ@D2*v2#f19J&+uVBr0M;oMDCMa3vz1dfU4^F~*Ah#;qgQs<=iqArpy|QNB)0~~C~%ExBQ!x4dq7W6 zmShjXXFnHvlUPn$;%oLdZ_s#Ar+S&zMd(qbYk*&Mb&{SyHRIqOX|9;bEJce`3 zI~K=oAN;Tp$~8M@M{@gfmw+!kE%}{udkc0yn6}A#ZZBF~j?W)oJh<%X%N@MmzI|13 zxCaR}_??1%16DDYsjftVnDUB*@1Q7E>vTh4y^~m6Pl~KmPFZOFub>1>=js90u$d@# zAlD441PN`xoGDz0a8hlh+@5P!X0PPA#k0$<%?0yjsmD4SI+Pur9a(BgRHSuL)wXlkvy zKEQYKTms%O1bG}L9#4AtJa_>Mk9Q$>!YyR+{{Vw%nXT}Z#8A&*c3F>PfEK;#y?1ui zm3N!On}io@b^$+va?>rc9bKL43NO~0^W7hBUk0=*)C@4Z*KKex(Gpe`en9_uaqYV$iWtyeEZD2MJzmly9L3siyPQeon=aPdo7d}x@@ z(ih-vEl%iHAf4yY!&wDsHi&d(Lb31UMjVLPPNZ@2QrJmOgpB6`HSscFf46O+;V1fJBtfk(Dz4(6-$VsDwk_w6%x zhTr?z=Y%eAqPqJ&Gc__D>Mjjvgd61OG0zIF&N3zwBW3mRqP#h?VA>$5gkk#CN`S7d 
z1f$T>sZ|N4IW5d7TGA3L6w+lc!}zL>Ngsjr)hA&Z{J>FoRA-Q>m0tj{$~Dz5w+PqSx#1kaUWI|Z3jErky`O>7n!C3#1N;Ue9W-jDKc<`+%2f^&tzw-l zC@TNs1f?wzE?&iTtrzm-hql zgwjcp8p@#_)&Q4yiNJUZ=(SQo*$z~Ffg~3;WB+lTRLh0PvfB@Gp^Du_`t?n;gC;N z$!S!S#K93C{f{D~1hZj6j z*i%4NpiQ0E4$mH*I{+GW8v;YR6LF!#Vx^x=la*+MR|E_us2K1Shnv#<|Gs6dVSwg=aWdRV%l$f!DTK zDo3&ktkg;T=!aThcst?6;{dH%y_h)tm=Tb0SFhmey;)aqM7CR<*liv{v;m|*uHHWu-jCoL`gbw(1=2jFeFR)J*ZXH5c_TprS7PA{^et z;ZivM!f9f;u?XDy<59Ya^<#KI7ml!OSjoZvIVQN*v~n7w4cLmb5vYw{tM@EX?RG^;vHD&*p+Wj%r@f&L3V`|UG)X>M&u}3-+t$Rd) z_qm&*kI)}en;%%5*P3UW@$9;2*(g{x7A@U^rF+I$f(^$jv#;P4w4$S5aP$`)gMwpl z#)^B6k=YTvyjQdb1bd)p2NAk$#!~WaSeTlhTDUTQrRdu)`1TilBZ6MnWlW> zjppmkH#S_~P;A~SH191o9~7DomJCh#t{Z{tfgAnT`-?5l3oXwVTMi2?hf4-ub%`TF z^O0in38DE!*<=XlW-PhEIdi_R;NB$Ix>hIyt%Lh&h7ARMLs8!<=v(jU+urKA+4XK< zDe!LpQh#CSrF)%6OKqJuUb+4X{+m*4dsb+Bw%B$+Xge@7lA90=Ee}B0M`xqgVzaTL zvqy0D6rBTtb702y(AZ9Q&W%2#;Pr5r9;Rt)KKziv_rsvBo3_qD2wo3Q(coi}6xc`m X=8_L7cs)FzYohz}uFolW5yAA|6J-SS literal 0 HcmV?d00001 diff --git a/compilation/__pycache__/monitor.cpython-312.pyc b/compilation/__pycache__/monitor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5dafcdf1fe0f42e58689d248259aac49bba43a54 GIT binary patch literal 2468 zcmaJ@&2Jk;6rWwM?X|s*)ATEGLbAtB4jdsM-{vv_h89YU7+F|i z$5mWXDWD!&fmCiS;s6H@se(lR1Xpn?=Hf6K{6CPMVfZk&blEL`AmpCTH@e9D#6Iht%_2fOFm~GP{&|P{;%i(U&N(Wk* zd#HFomvA@eW>o5p9phG{{W63WPa;J0Wu9m{ zrz10YH@3`I4!o$R^E&@sY>f{k9HKpuPUUb!?XJ48%>IU%>A*2`=?ES1uFgu|deu|o>x{hZo`8j|R#Yf?1` zVC&Sm#;L3IQ&-o8BhAc_MrNX(nb^pjUb)mv_BE1&_2l5Ug~sqyeR%4pgfISGqJEKc0H6HNb85sJe_rDNmp$3A0pe6M{kZ9rG!t4}+$%XQR#E|w2+L;xU((H}gE09i!=86k z$1Y*Ndk<8Cjz?HI`y!xn2^yuU@VlV&SSV$gu%h(`u&t~Ai=+`y8G@f@!L)*!d(ra} zyZ|&BVsBmSZAxdF(qKa>)TKhRr>~VnslFW~q|!SQlKR&($2X<^hBR82M!%U|oqT%e zne=9JaQMNERvbwu059T2G5{WD{r*x_*UCh5p{r{ytrCWIY-@8S)=Qw#87$=Nw|yKY z=;IxMm!XUw44sR5)~8$zDg4ZxkEU;4dPfc%$Q=jJc_=PXtjijeH5+oHxh`>BVrUf0 ztN~5PvkQhNLw-omeq{J4RS9clW=+-&FF4=?M12$-`_czho0(lE#34^maJMl0qqjBF 
z&|zEk6&EG&0HD+G^9EsBLI0#7N}FBz7SCnh-j2m`=@yEm(ybV^28N6d zLc~gf)tUzFSm9k;jq`o=S@eWeoj?2+lD$}|0#H&##$X2 zVN(!jVa{>PAY6nUjhtW;LsA%R@dCU8RTEEU8fzu8-00KvHiD%kXSk8o$~JJ@?Mc z?%H0HI+3zM<2!fGJ@-4`xj*L|{#se-L-3PaZ;1a0BJ>$~v43u>u_8lb3JFMHB$QxA z7>2YRl4HaPf1Q#u;Tmz#K9}T9ct$+Xb_*VO)mu642>HGwFCoV89@DYPk4pFiym*Gk6USs7S{0%!>YSvEjbZgm zq7M4XM2(B4C0gX5WX2gkIaY{(B zI09p_C%$EMA9;&cukVXG1m_jkE$hQbx!}S+Trp7wTHN`T2m7NQq3nulq*CzWEq4EQ z7zI?Uem7-(AX_zF5%L>>6UxctlaiPaWnLE*nV83Opx?*B?{rMmSop`wtZ3QKCP9ML zW4z9)IH8=!8vGSfvcSuvM~m@E+^$Jt1hYyK$R zO==P;iRcX&PS_{?H z%D&vz*~;2dAv!fx(p74j)f!AWO9h2eG`_oNyse%fEuo@&9SRC#5cbw623I5{J_267 zAnLKr(6A8#!c_^1yG-3c>U$hv-Vcoh|fIC zBA?|gvX|Dk2b@>GdCaSKLAU85ULETIq*@1APv%wCqIH;DShOV)4>N^s%+ek<{YM2_ z^151CvPodSk(teTHe>@%YH%AIxtyBNI2n%cg6FJx*VC|eI~SoK3qglNw-^m|Dg~6s z^bI@aJ5%=U>^0YL0w`F!Sl3!?u2XWe8g6>zMI1$UK8Ykq*s^jIf+Yx%m`O*Pi5Cgj z%^{R=oE=AK(y3cv4nk?C?y|qf?U=;wfWL8X(TmkA6`jQW>*a1)DAl*p&Qg(!z>rvD z(p7}U^*x|@7SPnJM=LOC*TyJqTNtDV?6DvZ;p*DoEi14Tm~$-%sdeCsZEIzKcb6^4 zptM79OjH6Bjv^+$2ft{&-*LYB^>)Q~tnahAQ!hPy9RQy@*N+yQWR=8st^YFV27Q|w z;Yp+T?)93|?zG27sW~#twYNd@o)U7!UMuZaXDPpe#y@{YuJ^7-^RPWft@wHazLDVC zfzH5bFprz?xNcU&9ZfO~Hdvzy`GaV_ma`Xe@f z@sQmr{1x13hmCZgM@QbgJ8rI|~) z#{O($f39&b+c@}1&CpU$@1KsncWhyF;kDb(W~)0hu8c5!Y%Vn)S!&o#n4U{z8`?~& zGrm%N_w%5hS@3-l7%YI&7lS<;fLb14P%Zmlh(YC5zx2QEzf;@4@WS7`gDc>B*zT}s z4Yx>klHpOYo>JvVK3)O=5--tsYZ@9;NVo2LxYHeaJ}$FQ6ScPJJ5%DjJI$mWBvKe& zttlUsZJ=rW_aaT>Ovu5LjJLJBSr!(+B^ zc-UD#%l0LCl}`}KKB(^C!VnVmlWK!?q?sk0{H)e6%B49lqNpI6RAfPv$BNWg^%XOX zGYb+0Ev6It*a1LC$_zz-3t)d}CUV4JKnRk=7GfSL*nq9FRoYpjV70|&4n=f8nPrus zd2XP-(=w{)X1NC{f0~)|kTFmlSu=(T7HqcbG8&)2R&tiN@@U9u_(V;V;h>Zw*ly1L+%xyOJSW}MOOaxuD! 
zqKx3U;?>Fg^hfrYy_bfNT^qIM8U~tA9F62rz!0K~aK7?1Jli?(OC#b&%i#P zggonYXS{d>$dO<1OuuHP7zSPYfeWIF-#afbAqHy5>f<04av6?rhrwvzX-=aq)`+{o zv6oHHmLD3!GQOBp4^Q-!@@a|dK72@0;Fi-4hx22920FE7czpd&^z~(QXX~l#;gjzj zy*+SS%k<;|`?7(3^UQpAF3=AB|Dh0m+StSZJeMSDx=NB4!wGX4&IvIYa+`~>4|>&p z^2tq?78JhC&_g4%y$VK{%TpzB`^3Kg6aFO-*2oPEK;fFJsm+{z(|6TfX{t#`Mc`XKV*-j6(shfhNLZq=zf-cxj4o(NTw zdtZj_#&L#^;}TG;lt|mpaZovutS@C8Cnyn)Q%SaK_;olCISA-TZC@qjb;A*b2<9nL zZB#qqUF{_g;w{59swfh5s7jVOwfDD(&Q86kl2Sf>kepNd1w5{xe>uW#@A9h*^3>(r zO<8x-vZL8O#7yJa*i39zo{|5J$oq;DdG?r}mQOJ??%KKT`MR5jmJu`;oD0$S{j~W} zaE0_NKjL*i$1DYRP7lu=oH>&X?q7D&o}NPs!yg`e|4g>$8MEhpdF2B)`DDT*2qEOF ziYQPZrWb@3=0_vc9>Z&vTcQ^!5vI)*B;X30A)`wCZFr*>xs)zSR?NGV2#~;!{@({j zYX6j4^(hcmcasODbI(!=9HRq z!qxKBPXoGJUr~+_E^Z_4ATjE z=ADpgsk&v$dWtv!+n}itM{v*Ex~c&W(OI8-CnsxaS|ti-Bq0UF301Y=*s_su(kT1I zy0HLdhm^7tM(g0^)S0e--tu*TMVW&gIC4m2l(5GZ)Ux9)3{MIJf(=U0<|*+PW0ozYyKOSkt)@*|-#GS%|b; zIZ=po6~wMJO7Vqge6eQV%BJn}W0%H0%Dz`u45A&apS*o#=as`tZ9NNZJ%v5Jg>8ol z;+EyejxP@uBKz({2Cp5vd9ctiSP%zS>eiLOE0ikJ5|a@t$Yyv-f5GT6gxx~7qBfgz z+bE-eCz^tNYQsQ!ALDt9(bTF!fBrImT50<>W^f)oKVq^%HP48Jw?2jzy|y?b0b=MAt7DOQt}EIdBK+nUNQZ2fU*m z8^U50;-O?*HFA~&#AO4PE|EzUD9;7xr5P*XN{mL)LBLX?*!YfLpwmILN_Ipsk5>>B8w?(kC!POJa232t_GDHTtw>6Vjz zpq2QlPPqy4;%$$KPO$u|5Kzc)F5?Iq#6=mcIkidOq7zIRdRomm+r0fvmU*xO>he6V z5k1Da1nsTuoYjS3>>s^xQCI-I4$4*v3m}g0_#Ev9=5wu$1vA%=XujuhCdbkPI zZ@nirT&%xPU)XqJQSARo3@?ifk3_V)^`02M7`YHx5}OvprX?}4ASOyYw>wx6HUo7g zj^J~zogX+mup~qmgy^Es`9KJrKYsT3ve^ASfAhrMo|BJwE)Xn=fj}_fly$lEq$BjhKLy>+x;*TdBMcjc=H5w8Lm9AK)ov=k zL6rSdddU}G$r`MO@^D~!biU_*bPTgp}e%cjpc9hsiu@iCiOE|iKqeX59?#0(SZt&N;en(6$@rc{@ J&>eO?`5RDg&c6Tv literal 0 HcmV?d00001 diff --git a/compilation/__pycache__/pass_manager.cpython-312.pyc b/compilation/__pycache__/pass_manager.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65a13f83eeea3573ac191d37bd8ae853f67e4295 GIT binary patch literal 6980 zcmbt3TWlNGm3R2kNF+tdk}b)y&C$y-ZHe+ze#WukBLlVP9tgj`ZP}W 
zrF|T>{hD77qywxS(1Lm>9n$O4b*vxMc)dPd&)OlaK@X?HtX-!?^k_N?ZC9TyIJ@al}WI@OY0$NcH>FZPIP9GORW0E$J4vGNNtNH>EeRc2sNC+tO{U-Ke$e zo70_@6ty;cP~57^_{Ha(F}=t5dxtDCe=y(`_t+8eZPy(itH_ojRG z?dk1otXX?Q-;v(Iu-7Zer*LOj(mUmrdnCQfN6ry><2@p8QhJ{H-MPL=F4_8b>BXiYaD_lqxw>yba@GhD`u?Sk0-XsO3jT6$;(B z2UXA{Gn=P6g5YYvD&Cgp-;pdBi(Snd<|$2)a>c?`$uMv!Zg@T?7c*v_x?}A_8B@J2 znQA`w!=jWk&lU}6x|5r(UA{gHTf9EgIAof+6|gNoyojLJ@>kES7;n9&_8U7E^-30KfOoWD{yqp7-@t6<8w56PYk9}a&@C!zWY z2!&ah9rrPa71aacvB7#HEHlWwwy7~CCDKtF=@9Qm+m z!v7<3hfDf~?aq-a*G~Q5{P5`;r=TTX9U2)qb8T3>JT!9Z2jb~7CvTif`e^_Nw%gvm zbm?*}09p^z$$%Y@=%`_bZrx$udUn|$L>fqH&Oj9Ka(BNyJz(mEfr47-HvvvD`zMMA z4WTQ zG1)VV?3qQjbLyCuhu1WYr(Dn-K!I@us{g%9UbSy6w+~wFgQdve{qv>psaH)~N?T8S zntai8=`XGAGy6-eJD&_%tvkQ*lMR?h)n#fR5sgKbN`V6c11PeSdB*jj8Pk>M>DBnv+S| zCv$1P>;p9$P=c~w35_>0s1;S#CkO5YJo!uWa!{#& z8Md2INrL{6QGHx?-b(6$^D=oF0cpcm1W{zKD4+-yU`78ZEMZfM;YiPUQy7zSvZfe< zqqtom?Gq$fMk&lNRpqXM(VQ&MVlIbMxWzI&MzK)HQ_~QNMUY}o>MPQl$;*n6kuqaS zN*F;6qUJJMQHG5{r)0*C?M)3b;$1WX=yTjd&D_$ku>&+g%F{JWJEm4?(PXNja{4@# z73wLPWhciwaL5)^;|K%alX4Ft?0_3EOv-4FaMOt-_3 z9b|N6kP5G$4rdhIgKfaMr~TvM z*Jr^7xcks9ADQ+~b6%fYW=|%#S{6n2O+`U7_^OnCIA}(^J;q~RtNN9J3fR%rA7C+t z1MmjpO;rf5_5Y&-aDp4Xx-7>jFFRe8{zQa;yu?(~|>pvE*Ok8y_majC>Xs%JIJE z@xEuA72oscc;Bn|rpJ+gjg+MjW6Qo7QYgMir4vUzJxjQSZGzzsVb29bHVm z7EMeNH=6Jdp9Nza1!pRA3!+4H-j!sly_UU0H1fiFdKWXD_x2_s$6EKp>Ad$T*5RYg#;l*Ps1wV)qF(W$Vb9)L53 z9xw$XN$d?CK8VS^6)3qCl<(gHBB?G?@N7H)6iJWUCj?Qw&eyd}Dx#NV@apHae6!6y}(%Adt?N1K;IXW~S zYc9tURxDAD?Rp;DH6L$z9R5XkPT2SC?5uFKlsIO^k39;%tU?`q9y?lrI`~OwR(PwF zIAO(4JPOY@x0ai`t>*4>^SJdd62-|Qic zC%JNT$cheqYoJ>H`0T!Oz5f5`4W3KY(L*q>!nC1bVD6w3SXI*+)0|l`NnS{9_%4ut z=D>th%n7V^%XppFs(!2D%#le#cft%r*9|Jf9)>AF@`s{yIu1(+qu`wE@fi}jg3tUA zF-^7B;xL$)KE3!M7 zp@-w)Vvz-hk-WmmO0ti4l8P2m2Z3<{8bP6 z!|mX|u7{_lNB-S#S3QXF8va1_F(H5DpZYHrC_y3w!c9e}@h%Vo3Xp&lBtwvv$GH%h zvIJEnbcLX|f=kL~l?(*A%QP*EYQZ5+3Fp1&S5}Z!SbhvsUpH=?KMkEM25S{e4h-Z? 
zz3>fVYlKFH?M96$;ku$!GGJG60t0J3PZgjtn|A^_JR`8>ad#Q%S2C`)>&GNxOqA7(33(^fo!~^95I-^*H1M$jFN^Z%CFo4G)wshHVxNXtqX^Yq zvKS$qy=7syCG0K>2Q1+L=zvf6TJf{Xo##l?!yONUp^cHcqr7qWFGM(%hLkQIzu;QKbDqE=&DTJg6N!hw(8zjH4)Q z^ek3P_%C1!FW_UswsLL+Q(P3Qeeg37<{tTPUnIzXwS^1zg9{(pw&>ds?3=mytFeC@ z`-(slvq;Y1;*KrB3*3{2C4%3@W4>V5%$6krpCvvV?02?784IewmawHC^0nFGr zdBQL@Q?6}iWIG0)4Lbo9MXhHyvJo%yZcy~j?N&BeG*!(=8Iy*o=pyT|B%@Mg}?l`1KDymI)5fa?V?V;^!mX#VZ6(-9$;n4`?( zH?L*u?UuD2k~)_FyD5{06g_TV@e|PXU=E7I@#rVm(UphTh#>5ZS(F+_he(7(zn5fZ zei6NbQ25FYyRe5J!R~gf&sxk(#xef zyK+o}F?>i1wKbab$L10tD!(@6CQF*=#csXWpCl zX6DVz`_2Bnvok@UXyWhGrEWt0g@|7w9`W87BbAl7QRMfQMvPJ<8 zU(jpNB;VJx%Ee;oCe;-{hoO-xRb4f!Ox0IqXi95xsajcA`KSc4U#jXW>M9Oafl=0~ ztE-fiJTZ6^_K@9E06ZX+6asGURUqXdEI0+r;}~yrdpx0u~uBCZ#gkfFlEt+0ZHdui#st{){DR;I|&&!qikph zOS1gSjiTmJ!m4gMDOoP+detnNYE?I6nMts?BQ95~8tVj*;XeLEcA+N!xiVu`)@Rn$ z^=Y%nR;f9Cd*k)fvo~L-Gn%?QvtBdTs`|_=IG!0-Dn2Rj(-QqiHK@Go9&&!Y#!&Zo z)dPSr4Arm5PEY?+vA@xG=-%6#%2V-BV=x2I+P0WseLnlXGwy<3EW5ves&&SSK*Qx2 zr|heg5RE8d8dXI2MQ8_&4gy_?(wNdg<11aDRAM2a#6Ka0q>`Z0N=QjUPiIZabvo&5 z{x&>SaU!d%xW48jal@b=s1VM<)@HqnkyTxxx19kK&RCOQARA7%QLDJOiL7dg8cx=~ zRGBX!uT+`5N_EO$27d33#259>v$S`wXu5c3yHPNX2c$04i5#3lJp`C%UWC*`WicQH z0C0F_1sq|(d`%{G5wKuHy#p}cQ2?hR+&l&&h#d8&g3+Q9x&v43Q91$&^m~1Q>?Udg zFLlyRu5zzWZ?(RwfK|COLyCejCw-oJ6fo3jUg{CRP_20}QSEpskN8llRYXt1wBM)V z?!Z;L1*B{vQF+{_w_4vl!RWnvY%}^Y`4Al5$1yYKkFLjHodF+fwO>Sh2M6@AR=*F` zW9UWh9}`I}n(HWDg-cIly|T1Mv!3c?LAA1hQhIvF%{CvfUoo?lqN;1P>{?M*G-_l` zm|%~EXR9k&13gG7%eB+U0$(s(QH}MYSy}^M3ac7a$yW8O=fzz9d?H}}fM!2ZHO)7G z>UDCB5ZDF3(?lZy^_AsC3+EF6R0ex)yc_D(Aq9^RgkN zqWZDJ$kL@6@`}m~(=~Xf6L77#4_(>NG>;lKsOfa>6U{l`4u^AviFDOiqY6W%W*MAc zr1~wxIq?3~tMABH-(6U^vbZ2$oqvD+s?!ZCDRg;bbroYBL<;KkfiL4xN5);taBNHk zrzDoD>oup>tjf#O4~Gh8F}Neo#T^lEtF`kzyF@8mdH8-KuAao9*H_9c>qki;aPf#L z-x++~;pTl39-a|~!@ozG6QqA=Q@o#O3{7l`+p&Sh*t9)%-WofP*x|$eh}my|#6I$(miVGE3NZXeWaV8P0I%>&dA{r|jbwt>YK% zmkT42N{fF$n$?d+$&H1lhzR(ys0x{4YJZBA_`#usr(Y1Nm zmL@D|;^D>8 zaO!*d@cN!BzN-_6*>b_895~<>`(saLfz`||sJ8k7o|3f;{ 
zjFZXp!dB8vW}GPGbsJ3IcXfE67$bLo4OjEOuFjZH_Q!Nm4k9#+^-Aj^LZ||5uCy-v zo?PeoLA%RULg>Ko_i+fbhH?<6{S^qgC9u?w%-(ifozMMoxE?MCQM)y(@mJVicu!{0 zR(;B;)(9V}hgwPn5(jbHrxQ{>(1LxajJf4=1xBMcdahtD)h-s7(ak?s+H*p0kaM&K zKyRD;qaOMq{MgHw;5>re0C$^s&xWDDjhPu~=>IYO2$!=hX?)1lkDR!;#DiTf>?GZ6 z)JmaPK zQG$1uA7;F0yeFRUrwKZXoX8E|5#d4N3gSdCV$}>(c?>Q=Tq(iJPi`MqT++{BgfY(J zN+Z={rzW1HCK?k{o0qqxBMm8SOVgG#y)Df&dS*X6W%tZlJ+ogP-Rha!lIC^>hCg4j zGbgRg$-i8%GOt+!uWxqlBnKXfcJ_<~|H(5uZAPVg?evtDo_h4Al|H>0*-7<&diS?? z8=29~`TL7}$#F{>x22aX>7_>>ZcAr(GNX1TZ)Ng!X3omY*_j0^v(QNQK1lo_0S)}y z=}9X+X{YC`^jv^8hO_qYtTjB_7&>eZ&00gVjgjNc6d8G2*d?)%E?lnJPhfOv^hs*; z(NTNq(vzu6eEbP3ePTO(awnD9-1zkFZ|;6>Zl#WH#g6jGvzY5*_zGiajtq6w39a0A zk_-4WdWXL|8R{^@wAm3g)iC*slHtRFy@C~27`!ZY9xKd>-RB-!Aj8y`S7>7fh3XU1 z480oZ+DUbzG|dRW7&o)c4uoQ4Y`hsqC_#=)G?NHPWH|dU{KwPHPQ<#%@srIILTPev zY_}Vr?{py&Ios^YM#dlZd{3ZpCBVgV{Z3LTn#B_2V+N!_sZv$bXo)|U^Tp*7-;NVC zYwHkV81BP~@lOW%mD}tBqH%nyt79w#1%@4mYEk?uccC-#Q>k>DJA7sg{0H9ml<85RU#H%5e> zXXNNJ(*F(V`-TiWBPX7b3(v^dGjimg@!rpV_B1|nFAA3s>a>M^OX&Y06c$FB1ghOI V2_0-A`hz43!uWUa{1n{N{V&S=F3$h} literal 0 HcmV?d00001 diff --git a/compilation/__pycache__/post_cleanup.cpython-312.pyc b/compilation/__pycache__/post_cleanup.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b2d89f7fc646b7c0fbe175af29a58d023cb34bb GIT binary patch literal 1210 zcmZuw&1)1f6i?>MvADG&){l#6E5d^Af(lg-DJT>XiimjFOUO>LyHjT-W0EOdso}I!|jvR^sK4#yo=cC*V1#A}th<9)lN^pnE>usdRFkOoBdlkHB2l4@Oe+_m zTUE?WLyXL2BKvW~l4MAx2u+Ht(1Mdp@@aTg5~^lGaIkbm z#K5Hr&zXUm8A+Lm22h+!$dE}x`UO)gGeVGDNoZr7^(!dsKS&^F1>;9SKH5)O6RBO%BvJt_|xZg6``mwf$_m1OPK0Ny=Y%JH~AB+&FuP*(tzp;ygrSH~O=d-ESntth5bE(TN*;cjowW?(g_C`kS)2cAn vK#p59d*I6OYJJ^n>J%6(fwWadF2;B_K={~KwDL1(;4^RM&+a1lSFwKpQ6WM^ literal 0 HcmV?d00001 diff --git a/compilation/__pycache__/qk_norm_rope_fusion.cpython-312.pyc b/compilation/__pycache__/qk_norm_rope_fusion.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..854206ff7a4ccc33ac05187929923df22229b5e2 GIT binary 
patch literal 10897 zcmc&aTWlLwc6az3l0%9T^?sOR%T`1^W5`83N&rfk7oPR2I!BtN&!yFfJaaU8P7=jA18exC1#^)nyyM~#g;o)8`bwu&l@V0jlYA!B9Ta#5eZyLCTw-Vc8eCt zCR*X&2LE=+E;=NK=!Ab(bj5t4`?5Xk6g`(on3Y^2C%GlhWP2@ny*$o9Z%YSMViwVR z*>uwMyjkkasaN2 zn}FAWgM^Yj$OPNFj;1}ZR1jW?x z(wPxACHX>yP(NIUsFYE&a0h2X`zeg)r-%5dA^tpnh*vU+xY|EGb$*C{d0Lr;_VY%5 zc8DL|9WpQ`ptCF~6OoM659lus?;he)*`zQbMMPzY#~(qAC(jxPQ?&>%n29iT{;oLF zDI6rJ2r1a9{^^M!ep8R7+9{`Mra0mY;@UyuwI>(8W>I5IA5klv}fQA2z*tq4jyB}5~@>_dDUmMx_(8jF-A zH7ln~wDbD09;3ITa`;KO7RHIv;}@o<=++4`+%o-8YmNwu#xnG?tJaL;8>$>P-pVZr z4xsits#{)v9X~?J+?A2@ap^)oT(T{T{=+V%#u~HMT;?)0u6CK46=tnDD_g5I;Jyg4 zL`0d0r(%+C=7~g(lPqb*1O}Hkem%D8OoffUIRa8yH%Mfz|9rr>`-IPJ`Sk4jGEeF zl(1L5dEa%7^=IE?(5wEw&D^TY6-M?L?K31~84KCvE@;qP^k5axiZY698O5{agR5V2 zb|J0`liRR^V;b0LR??hsttoCN;x;0v0fY+56d|c=RyvH_jc}Zy#?gyW0fwBKO_35Y zO0{MXN)G2o5f$9f3XA;vsZlkV8O_8q!=Pe8D-U1H?tA3nseRIDB7SZ(GowzVQ=_Pz zN2BRvCZ33(P#CS{Rxkz>XdB7QXsjSm%`MD!-k~!_4A^cMU0Hs%obn6KV7ME02lf`%P7XHlyc zoH4HboV7ZF2#h;vUVA{TB$I;ezt9(%gWA;w75!Ikt1_PmO!=eZIZ~}PBi^$&h#@O5 zx;^I@w?9up_OWk}i@w1qBCJF=l3*GzIdr~?{*r4baV;gTnf`T@Jr2LSOdKw^ z+>h(O+Obf&Am2Z5z7c{?y;OwC62f*-C=L zWd~_)&7Ytkmf8duTjpkGM zd%*2K4v)EkhZxaunVSPn`tiP^=XJPcjs5zIz0yB{+&BBJJ6pn|Ij z!2d=LZqQpG<}OeMEk~Gvx!=`!&B6*7nEJ2<=9i8)$Q%pMI_HyPL0DJ#ZqAi+ zO`4WRm3sa87-DD64eNB=Wu1;6U1xph|7@L}yR6fbvzqJVa$MbY)}4>JPA^b4!Z_8< zK@sfEdBHuY+?SuiZqE7U8gjl|L(Unqf;P2|ee)lXZZet`q6#5avs0(AYT0Nu6spjS z5eePG>4hQWosfkZ?|R)eW?EyWGzQF`Y2n-q4Mk}FY3ephqS*qz1h|Q+=9)5_{JL*l z+i6Pg1f9`;01Zulsdc5G-vMfPQ!-R?Mi2AN4&JE-WNAFEKn$>|jVqlqKu?duVi}8gxm_=RJx^&y={UaYf^}$oC{E3g`EAh3~=jM;+AInc)*$%3?Be=w@IF=n{ zV(IXe8k-i|7TT5qOFOP~m#x?miRYObmVNs0C=PBfy}wj&RbAeSo25za{0EdMqWHP7jFH3f7!GXBVv~0_Pkc zZtzBKNiaek4;@;Yw{(PR{Z=Z-ZHAbjj){B>dWSre1vQTunGvRID^szMh0X`n0oFfv3deC z(?trgcNZ2AA5yXjRdegV5MMj5b2px3g|#b8?WqaM&5r&dj9r89I*?P|gbbMbwx6_y z*8O~`xvkXAuZ%5^!B4S!xX?XZEp+s(^ep$3n%ftjTzIn7u^s=m4VK!wSGZ-a)YG@} z%JM5W9$)Jjz1`t$^#MzElTQ9Bv)qwC3EbJdtr+Mn1bVNYx&HFCm*3s_H$#6p^efvx 
za(~aQ?tFUn`4?6Ly=#FNSGgA{yM$btom#<~8)7epbQO3cWQLp5ZhnVvRLwF08pp_x6>8%Ll*rK&||6 zhwCNAq=OZc5LRj^(Q1_xtyXE#>VD$#6xm>b4VJh-iEAry{x_eQe`byAS?79+++cwl zT=zFEdKNtE{_REo-hzMchTZD*m099#{=U&waY{qXu}~8Rgi?{DBnX;Yz{i&?Jj4hZ zCkWFJ)vrLDf*_`&&}@yTRLvesq(OVx&ZW}{`99budN3NNs1SLICMAeRi88thGH$-+ zIS!8C#M6{AJvcdnFg!CEC7B$<4Ba~2;p)Q@J!Q<0v6R1s>@xY*60o!7hspN7ZyVa} z?k}5K%69n0E7w+bVhOgbv+Tl>oA?4{50*I6)LQmp$wxZ3l@Sv(oZGk22&M1#HbS+J zIGV~9*53AR`}=!7Jp94o)%_SU4ZF$;BJtd}{%xDPud z=VTqz9Nwv1_2jAuFIO>}=&X4?@`xg(cGGOHb;IA`vZ;!=uJ!vsk%kfxUaNBlox5x1iGt^4y>mEF{q-N z+OMu7RE5^r?U>~}1K~JRyG@#rZDTHkKuFLuUdfRQhIQ+Nf^bXAFuOT8wCedHsl4#(%-kK`5BXz$$ zV_JKiU_XuEK-kHJ0VOKpXCHj_6pxHPNZD9_6W~*6mDk_j#r^O?tYkBpw5-A_v8c{u zBz~7d<5#ms>)}@LhWK+?6$tU>w*%lNkwU*+HX{nv-b zfBw7ouKw^3?;Z|W&KwI_j-3fPe*8@Y-fyVAWj@+eEw4}qxN2GO)O2-(JEpM}*GADe z{G=dJ`E)_E({)iDq_O9yDuZtmoJPT+f6}0F5D!hkhYVm~L?s!+Ei!uInl&M%G+Ro# z2nsL`u9W6DA4y~-MMiO^IV(G&>o4TL8f!6zJ-lKznSryRpJFH>mu|Gk*mDNq9)P4O z-(6^Sx{W|Jqr?+wI&S(ReIU|Yf)ItLMnw=5oLt3Rf(0&E@*n@G1<=9Xa1d_*Oq<~+nG)CZixV4^rUzQM zhWyXgJY7rEzxMDoh%Z&q*8R=DII}cZ>>Mg|4y|>LthJ1;`R~u$ziR8vAOEa<*A?4( zU`H`9PzVfs92oj+@ZqI~_4ZI{*T8k*no#N+yq><6E`>&lp#z1`ff9dTX=i_F=)i5S zv!n9|AL;5{?!UVC`k`xwKIz>3Z6g_a4mewCQ(iU8pSW;K!mghcc9sYI~>F9$Xb^Ffp0C9PXY-fS(1dmXs&F-eU zU$a9XH!~rlWFFh31JOXDI`A?SXg~~px28!`!&Ms?Tj=ONn^v<8zC-U1SWaMvrUgZ~8)F{%h}Iz8K0&Mu!V_S|3Sx&PgkwVr)>ct`A9 zXMJyS^W197gYT-veJ2Y0PJF_iqF#&Zyfn} z&l776M^@P*bP*w+JPtkN3C!Y{O=5-?iOga~t#3LdPK|>R4?6rSS{5;w)sFzBuckNaszY{T%OLHjYaZ#M6orxG z5nU^e$XmYT(RGJISIGJsE5_#O^Qz`GR!9{(J$r-aWFf1@6AJwbLI2dC2}d>eM90AE zA!Eozze|)st+0B{B0Qlr-D$^bPEm?QATS}HMI^kU`o-AnqG{|wSRHtahvmP3E_jO+ zlnZ4m!!X~ogxU2K;l3b4t7PbN()>9&`8hfKTe9;D@<*SO-p|SLeT zUyvt%<7&S0lQq}2OO7uC?Te2rJhFIj;b1Yarx4gv4D2ri_OBBE4~~~CjQd-H`EAwq R2y>KK?b`i4!2-Qf{|QJIfDr%y literal 0 HcmV?d00001 diff --git a/compilation/__pycache__/sequence_parallelism.cpython-312.pyc b/compilation/__pycache__/sequence_parallelism.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..05430f16ad6d872b8cf7063fbb3b10d26697fb3d GIT binary patch literal 19511 zcmd^HYiu0Xb)J0>mrE|iHz`pX$)u!}NJ`Yhk| za8Zy2*%lLm{NEO|v3q;m5wY8ZxHIUo@h9%MC+J~ud(0jegCdJNVr6k}(97b^Sb5wR z^u;TJ73|&>tBm`DeinDfs^TkxD_GnUtB%(MYgk;2)yC_Bb%>YA-dKISA=tp;<*~+i zQ?M!C9BhuS46bBpzSydGOR$B-D`KnTYl3UyQc$u9c0pO8tbHA$p)0sft~@6M@3IRA z1=)X2kgHy|8&6tU+6tsqE3L0PjMO%kT7%Tug4Fe>wQit?=P3+dH*SI(S-Cott1l=Q zV5tpAZ7fLL#8R7(+N?-c%bN#nftA0c`g#I3z4~Z4omSMuk#IWFuc$}Enub*Gld)L* zKr+!A?F-m-U!Rf=rBk6kHJM52Qd&u9Ni`Ht%1SJh3ajB*3=xDORgp6h1?5{=iIVxw zB-NN4(xO9@uRNNFrbDr0U!S7lw&v8ssolrZ;dC@|f)zeK7)NqdB%`WIA{|P_!s*_m z8vi9xogOHO@X44G>J6V#SbqWzz1|qf!x>GH@0aCcM~?TPi0(JiKa&Y3(hncqtydZe zMh0rWC;52lp;$B?O`r?O1S5uBuOebRlEBc?Nk$;Oo>FcmT*MFt#w%hp{pKG)@D+%B z+Il)*hz84N1Y?3JN~1!rO}3pAO_qQypnMQqmpIXLjw`eQKAoB^Euo zB{i7tPbRjU#wfQ$lJQg&bO$}QXv%XLB@t09u1B?aXKGOQoKeH6l%fu;C%*SaRV^J# zX3`-ex$~Y_G7^qy`#Oz;7IZ>ef#5f1g`fFXoZm4zIKEEfDY4!~Wi)D0!o}rK427673k_|!H5oh2j2lp1Ye#Ta$oYiEHKWe2 z`R3}H@#meZUxmN&tBtMSszjvXYQx@{hI>byFZr01>bB=~x2mKwY9eA$1r-rB8AWMk z6l$Ddo2gxn3u-GK25c&kWXi5mFVp}6t{RNdlo#mpp-syTD6jjsQ$-aBFys7NF4<=M z+p^*|KD7#aN{zFgejgF^6d}J#>Ba$~^rCCP;%NTCKv_O+#)hFwhlK$KOw2U?kJJ*A zcc8L37u|i|{<_hF>)yOQ)4j|cO+Aq?j%*J zS91Fp(jsi0=pI?=4QFC$bv+tUNm1$LnM4#imerw>Y!q?DZ&+BfhM~^oI8)prK$)QJ zM?l=EeR*(f@Z}fBUYzx}Py5?v{X3@pJLY^9=es6qzqjhORhL`e?s%(Xw)^;W_wk%V zsMvbM-bC=TE>ly?A43u$ zvAiKym?1Bv9bnwuy>5oZoacN&56rp1%z0U_UG|WP9u=Oq1U*crG#DFSn74&`Stt~VVI2?(Y19^(xU-zwBp5?& zw;HTv+#WzyjYNvDmsm08nHC$z8z$N&>#vAgudeBs?3-D0_o(wdaTQl4mZ?V(iz@Hn zrqGO)G7fz}yx$=eU&2Tr)nqf1wI8g(CB;lRU70GK`3 zhBn-iH~EYYqXLas1m;xLT`a#){%4O&%5R*SI`vNN-#7nl^OdT{MqO7|)L&eAVdZ#u z{P<+;8!M+)UViZH!*3nFvf|*V`&w1q`O~9c$cn2O`*cq#jzxi{6~PfgQB1nAY-bZs zw&mr!6fb4_t)KTg*@SeQ^Nj$7-#YJY)`iJE;>gdbQu0FcZ%il!E=>5Dwq&oAD#1K8 z=By3yGvMk`NfD{X5fqV&DyhMb=_yhMW6puXX@N&{4O=V45j%2N6Q4h2IUszd`(eZZ z>}>CaATrzs$zuwg^i9B(?+7<|94+V`Lr*Z-TciWPX%6y-?!Kk0sY+i|gJU7T_A|;I zLCe}U1W*jH1o-l;Z<_4Bbb6+K-{`?>HO=FPvo-6p 
zzV%#F6v9Z0WF!lWWUggN=31U)W3B;9%c5fAf)+N|G2$F{77!J)lo?1ODzlOScb>8; zfi5*j0oi&^Q_@P9z@}o=%T)!wjffJ5mt&~=BNDct?31XM&1+8F)$3+kx~5yYF15|H z>>lm8W)XMa<-2F<9~eD2*VJN=f5S}EHjuxjW&Fi#O-I((!H7NO=Gfo>>ecP3xL#}W z6-dT`b)5XUwJa)r{c&=J3>$DysZaqYNp;jm>pB2|ki)-e5HRqq6 zX#L)n*S5^8+x}O_e)RO4PrtL_hQoxnQ$F=x>ia$l2s5kqQ^0WIT8feSbrhzJE(Ra4r zpp*|qr)y2lSMTy>rER|!@Xz?eYN1B0%d~gDJ+RYqBr3}><@XHjs0Tp;&JJ3Bs%$Pk zg4iPc#j*^ozvUP#5YtQLiP@z=%+5;yG5eMSF%mex9Eg#zO6`Nko$+|6OBYo|i^^EQ=vA;yAqHa4DAB(Dv{vkf zVXA0oPqR=y|3(@v`uzRoW3Og=+v91ep36-&HvDRW#x&i*oXyCMctgQ zYR@cV_KFOl zxjqSm_B-jho8du1uIw5h%)aO{@Sx>p%s;m9ps9sRx|j;_VBVEks--k7+t@X1H?c=S znuSMzPCt>}0G)bz)EHV+Pf-x3U=xA}iKKZEG-5dQSwwCDx-6gLhy#GCum+P_`zJ^#wR`EhK?O#2=t; zMnrc>%K-Z{6GwzGcWD$`OmyV3q#ebK%VdQqbtA4}7t$PT zjuTs)+zr~v_yT3U2L<6K$7?-qdhfwK-0X`uRSFAJ>wty03@-zr#i6|)m0tyT^eigS z5(tQARf2_rTW4B!jrPnntbrQunQh%Q-MZ_tZKkz5D|KH!kd^jNH{3URD8B;Pm)-m5 zPd+om0?wpjiW)*(eU<`-6vGs2q2MHf zVqDA+V}K$7us{c#G4k}!kX4sW3R+= z9=a2S%9?NNB=r1U=3)cjpRVYdJy@5Vxv9yr3GoTdEp*CM|=r7YJvgJ`;^! z^{t)QFyq@q|C?ugov@ZKKR)*OjIZUYukz(b#~uY-zftZs&8AG$W9SUv%7oSZlt_Vs zUm>jC&tdicJgnZ&VfFqGMK3&xjdvNa8vh(tQ!NqvtjkpBzdcx8+O5W4hU0bhM{y>m z&}~vFd4qkY&cw9!MkP(agk7defYZ^W3fvmZyMPIQ5P9-0N3IL#m5NF==@oE7<2}CN ze^i&m=ba^&z>A4t;q(dAt#u*5L|E_%WtB_08?T6~uC}ya>9}vE<$l8L&0H~)5SbS# zW>VDi>uv+Le`3zZ5-{WAI3Ej8yTvzifOz@rEWOgm%S)*@V+C$g2X{&{W!(Ksb-(~=Q<~Y&a9Nfc&jeP@3DTu%I z_9P?^_5Tu$YNUzp)a6NzQr`ui=?wkxLvJ5>>&TU=!=tV_a)yp)vi`QL*v1veO}L({ z922WU5UWoE*XLhee0*Q&a!A~0i%|ck>q~VTzSLn*R~(!BWy{ZH&@z#+!g00T01Y~sw zR=B#0DN5ZI;{HNX4-6t%Ac_2HL;G)U5}0`5CUE;ED$x!hfZi+ww}vmax_-8L{dD#E ziJ?n(&s2Ah9++!bIolAJZU{`?HJO=d*g1A+^r7=zb2W|Qo3b^nSzqhTIM?FApQ9PZ z194d%h_}H5@pgGICYHpvF4K&Sc!oU%Tr!|6H(bOe?$9k;%qoUtzevx&M8O|ZkoWJB zu`~EpglZ4mDZ>iT?cbnojTS07W=sx?LVI~(}vIDqx2~T(dJ}rTl3-BpSF?p7UoqbO3svum- z5i0q~@ZytzDZ`-!u=EA~XahZB?l0!NU_LN{7S@Xd{sxPCn!9dO((*X;U8?G<2$pj> zla9P&m$UI!fBZ8Ow(*B2cf7H8YVVJgx8rZc*_-`qRgL4VS!wH~wyb}5R@}V=X7!<$ z472_KjQTV%tL!XbaVg<&eWJc`l5$UJ}GzHIR(UpCES?$9P& 
zYF?Hv_8d|eMlCCeQ7td)h;yv0Bt|90V>z_VRRHpqS}$O2P_$m~jEF#@Zfm`uo<(m0 z-o-9dHUwHIsDwUspZR8tHry9rQPN@>t^G|HYT{6~aJcU_)s_k1Zdy6~8&yw>TmqrK zo8013>#Ve6TG}yJ)tp_`KG{F-u&=G0_O*;2I8WQ{8&@r{jA$?1=uwhI|AICdi@eLR zh`Gy4?vGi*UG6Q(rqV3+mR?lc%+fJ$sf|O!9&7ngHeAM;ZocX0gn1%wy80Xiv^wKx z=N`lY6VEC**!}XOj&pai9-+Gu`CoUO8x^iXQo^4uYUBn&nqLu5D zaKw`ws1*E^Xhz$o*;>khg%Ww<>ECq4Q=EM9^#7sP?b?&8>M#~j_~&e-nu-=trec_< zC&e3jgASa_u`8}#ySY>Fb>yKd=%#ZyvTF#QIGmCZangpi@s=sxfqI^!FqB2bH$dB* z7hTx=i*q(sot118upc1Pt2Zn@JcAQ4bb>70j268#_fGUjHHmgfO8nfKR{9hT9j{@B z>$FZ*?2)t-m9a4_+8b4OG84o993ESKg5<3_d9b)AJ`KovyPV_69l5abVmW|mjOJ}0#ekm;Rz0b$_ z*@k@i_M1B*vEyK754IFG+>QYm{ox}!$oAmptLyX{q*SCnC4u3_q_i4NXml*l9C|k= z8*Q1DI`$RN2RG)h5q&(9Mh=$uq=I8^idI~%ctFKvL35*L`HEQSr~0EY1q3xY9Zl2G zAxTbxb%|tJlB22;Nyi4I!Kf0GB`}Qj9Ea+pXgsARPqRIMT05&OnwDbWL5=t$(bE!5XOc70@!t@TB3+)3+ zB#cv_tS)2PNXDxtNyE8EWDcj=RCt0PDB+jx%(goeTzZd&tC8ClwxBJ7V73rjc zS;_k=w@YUfiN+F^N*q$9x+5_KGbWQ_Jv~DmW0fS5XHXGIH^i?anc_W%W?}hyl4)ZU zD4UGQ=m)sS#K|BBQ&)0SX2P3LpiY=Y{pcl49S!GUFe9`KHrJD+65DuJIWH0lqzVPuDB@_ExQtOndl^^E zZs(N{+JFLN!fFa7lSqTA2lCZ+oQY}*NfW9s>>7^QM41{msgOFSE+xPx2#+Fl7!j@m z_}qs~N@s}Z2}^#II3^<|Uks$DDa|Ou+B2y`MTviWHataYx|l%7peh@Tfon#E`8F6` z3`!T*^+1wTzCmZO1453|ZX}&%_xf;Xw;vS6EGd#~BAmfX-YwltlP20%v1o!iPfiWBfeF^I+ypaJIqigvyF?iis!eR6++D}vRgh=V!-$u)&&Ign_dMew$ zZ|%-1d{Hht9~7SXPrMzn7wqFAcUKi3OJRuA>cV>-H@8a-)ZP3HCCF#9=qjL7gZYTo zsh?7tO}~RSSAP0!e(k@VIGQogVz!+Bhbyu)UkneIaVkW&b?Qy#vFgGT_UvE{K2K0a zbD16P3)uU9)&KL+AO7b0zP^|K=Z6!&{<};20{()%7bH!(FOf{9aFh#nA(K-76iIB8 zx-Ob#hQ065%lU^K*g0+P4UJG%!q#kiIondBSIE$w=m`g(=F9q*DU(FOVFc!TdSf5g zMkEbMOQ2%Q_r3+QSZm_zs_W)KF&q+;u*hlIddWSrYS*aqir6$4=y+rG)avZU-Ir~b zTW7l;o$h{gw)=_c?kBQ)p3JU)YScL^Hcg97SH!!nx9^*)@50Ka!%K&!=S88hb@bqM zU)Nk!!+5Q6;DPx>vCyoj{P)WGvHtVviz63Cu2i;N^=-SIY*>(01a2qeb>H&Y-m7fO ztwkqutwLGFtk^s)Hjm#uvEzI9zIN|>Vi#KlVJie#z8>zqD)fy~oaxH1A%bJ{!T|mk zU!_i4sr5_bHP=@Km8iBsTYj&qx&9e}n_U(%w&!gsj(E|gkRfPp*?tPFXInQ<@UzlR zy_&4L@fnE_SNM7bGoWeM@erMnUImPRb8P%G8~D};><7()bd+p1JCSdkhEvDUF?Jxg 
z_#j+GR13i-0h!T9L5A9vlDwj9M@jr1MuqTOtQJ<(%~oxguG%nLwe#Joopb)0S^xTJ z|N2?~fp`4}uGKVLt;UP+-J_m4f9?2#S-3=vI1y=jbIykq_*waIIzStIxdh%xufnX=sQ#+SLgS(R@{`eAzeV%hqk4JTcp` zf4XD;74g1nqL1r^@wWFwiLob8wfJPueW+F?%c&C4*!DHu0T`$|PvR3K>bEIz1p?id zj^b!F&=!5O2H#V8UUwZNK~=v?SriI-5fmQw`8p+1;H3eC5rH%0SN3%-F^8={?)sd4 zwQJk>(1L(J`X&ia$lYhFb#+f{pR9eYJ0~D|sdb+Ia_z`5>smMMT9>msU2DfrzS{pv z|AzviRQRq1O5|=6YBSKTk}2f-u-32-5_DQ~tkclu=5{AW7Tl+7B z%^wKIriEkg3tb-whkjO8d;YUCWvkD+KXkinHNUQEwt0UgAXw;LYx8FBx_?2y|H2{L z4&1dqupr<+m#_&H8?wGtv%dAyzV$P{jc3ai+?$Z0{qTZ-|AnA!FYY=XT@diUFl-Ck PY-_Wtcia#t!iMyJ3!g%n literal 0 HcmV?d00001 diff --git a/compilation/__pycache__/torch25_custom_graph_pass.cpython-312.pyc b/compilation/__pycache__/torch25_custom_graph_pass.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94adc400bcae33ed7d4fe58be1e3e155873cc9d7 GIT binary patch literal 2170 zcma)7&2QX96d!+WHrq{Fl~5Im!YDy%S88{qh_)cLksw7?I7EdM4tp`Pp4qGud+d(w zPzMeL1tC-kiZ~=a^<3q`|G@=LNEV!W;0H-j5_rCFbxve)Hyi{O0#&ewv!9 z5g38}(ErLIxa$D`TFz;x--w~xW(&1h1B;aeHfmkgmxqht^Kn&Ve%91n+Zz~q}?yw|b%LXAe3gd3iPpB7wjw%rc%o7PcW=y*( z3TcXtZr)sIUZbD(grr&G%e7K_1C&keZ8@c0B)gFc6Ih~YPf*<8%T%Z?_@M~`pr^nKZ2?SuN_J*mihcrgF^M-j5ixK9dI@4XfVLN$0S_`*yvoW zAdKX5Iz7E{v`ud+oHYctsDNFjt5TTGMOZ`aF zs9P4IPm9KIDrNp?y7ClxX(uA+sw((OFbbvre$Yz8xE1^Hm6WM9kzTo@h=S9_ZJr)G6yY>3X{AmsiUc$;S zkuS&rNa6vnj*R1_KN^+(p$3Zr%FDO#EHmlf%S4bY`&!Ksh2p#BlG#?TC0 ztwcx!;MRs8pFkVrD|%KCWW0Apoq~j)8VcJq!Stz`0$JF|kgW?Z000+X0W)P)7Y(&g>Vu@&2__wX93P3*u9JegOv%_<&o#Q3U8sp(S;Uf_rAVK3P(Q- z3Lhnf?^${3FXK#e96Llu({(6#c!(C0FiAcEu}%IoC!A{kEUC=&P0Ok8iYL9ty(jXq z{B?HWm)V6r0p*YBK7J0Yp%qX8?QN(2$lJa3{rWzE>0lXV`*p{;a4_vSWvM!3W1_HF z$5Y6VvY1Mm$I#kk;uI1JSn(!}zB{j5mHK=bn-JUzK^I Mw!8W_fsd~EKViEy-~a#s literal 0 HcmV?d00001 diff --git a/compilation/__pycache__/vllm_inductor_pass.cpython-312.pyc b/compilation/__pycache__/vllm_inductor_pass.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..97e5b9d55f5611eac854c4d8ccd11c951294f185 GIT binary patch 
literal 9703 zcmb61(Vd2g=0cX^ZIA#y}Y5tk;HrcR1FB3hJXU6gDqPQut4FL#FIQhRx3D2m%9 z%pY=1CQd~a2t*+jM8O3_!v;#B1!SOq)B-^Z6exB@Im?V?R6*+Apv zUpB%;`2n86J}x3eZ38x1W+L_h2OZ}eaKe|5xT5X>H|-N5o~U=g8}$wNXx$bmiIxtO z(y~2L7A+qrhq6Onf4mAG2ob$N3`<)X+W; z^m&!akGcGuEwt4KttHBqLTfE;Err&yHLZ0MjKBPAoY?PY45u8_gP}-J)1cxwhJ`nR zDwL}u!Q_NA6j!4`T^d${i4iFjmlZ#2l%Egkx}wI;2KCU0qMnB-MrFS3ysGH>#CbIw zgAuTb>y1bxdMq9r3J*ig9gc-{DH0zZR@AQ%UTE{2jLBmmJxc7CVOV!CPMZe zdGdp{sk5_t=D6zz=JZTO%M#zV><^@Qaen(FKA>)~rmVw053QJ1{tmK@*q7M>z&~Og zv5)tl*kulJ?gVB~97C)uT;&Ixic7X#B?aPj%XY;xS_i{6q|t^A4N#RGO34r}J7Jwt zWQKt<*$sL51nc)0rASxS%_0RRRhI~S98TUk=L(j0s199NI-BQd$VcWKkXi(uoa=V|p0qC9G)Xyne%tJyJqdhQe12 zTZqzmD9^GKBE%xn@yaxi`wgP!nVgUmGrD#h9Zblmh4p+LKhGIyTQekI{DiT@MT$s& zX(z|#`7o=nKkP~c==xD=b@BF5&-!s#ivhR7ia;M`f|;dOdG$+IEYdz&zBYC;RNxs{eG4*DrH8$mokh|zJ|L%)}hQ=9b~wXobt_v2bf=p zST#Y`ClZR`HXF@~S5_{ELy9%TW>yWSBvD}tasYugoOD$b9UxeFO^FPtTcKS;+7Lw* zISZu@m77wsb|cjXtgan6>F*(%BF}83<&?{lF8e#P{+BZTm$H5- ziuwti2heowakXr}%swr*MOK9cP|{jmG=+`epmAX6WJ5fAJ4 zq+k7!*}eB*cJIlDdr!^|WvkmV)ot18u7}lK|EW*&AGnumx882N*_f?;F;n~EH2+H< zl`o+JIl>uH|5B#73WfLAu|+Yam?4&Klo`E1pfal$SYWgWF&87>A%2<+F*0}6V~M0m zCdIr?{CvOK3TH6{s^~Uxa0T?O%tQ7o<_6I8=vrW)6_d3fgZ-A~GP`J=V&npMqRO9P zQ^464zy6t7?7A_Q>9S_!)t!N|xqibosDRd13?Vp#I+!zrK?Ova;i9sN0%*9Em<(!3 zMqTXZDD3JBu%=-T1tSq@D5fC@qBvzzpnV+le?pOyflUMm@=~!@3LJ{WVZL@GkRQJr z7SWDC_Wcxj;wim$YUb1jXO^LGZsy$F@%ipYp7!5)DrWoUc0BTktL^(Ad0v28*3*#j zG<@c1q!i@08P1`x7+4DNh-L`5gQnKwnFTN&LL&w{bj1)(prHcHy$d>2x;=8z+j&m% ztUYaV>z@Ut3+8F4(ctV~I?Jy$%{0x`EIGI3Si#x!sIDzX;QKFo?|0wpzJKK2k!M(4 z;fSl54#@dPY0b1r`zB@uO%Dz$p|zxeR7?3_8VaBsAU9JnE|}y}+!QM_F^Dqq1 zPC_e>^Qbb~G;gYUYI5@wwnUf7f@%mu~7@ zcxSO{(Vq5nf4U>vb2ig+_FIm1x;6_Tqzj6C)4E*M%*Bvz@fhflg4UiOA2I#FUOW<& zQIb&9A&~ZCX!uGo&2R(cU^+_J>t`r#G?W`ADXN`NOjd8g0cF)b=+jW<0QmJy+4{~* zedkhr*EIi`W6Neh5V;hPe}}?a6XUN1L|;|W#SIWrjAoKx9<0jcq??dQMlYBQqiBb2 zNR&F!Jl>E%4Cc9lAykA7dM$y81FU+e|;h5M*U=Y?zWP2nM84G}}~ak4sT)t*TX#>5~ncZe@+MJOLv# z+!pSpc5Ak_BU9V4RND!f&-b{xey;C!|IPk|3)zmt8Tj9Dc&YlxgQ`sR%hT@14&OD; 
zj3-?cSmf{f?)e_LKXaThC0hlx1=KQFRYkMMuuGC04}pz9IgZCB4m>Kg3P&LSs(x(q z0EOw{P*ed2K~@i7I})?HAG2=E&^)5-M(K@$r`BK_wfWJyFgY9LDDI^xkiAEqvYg<^ zxro1G>h$cExt4kUW?QCg#}ePZ%pc70E}?5KxkBKR+d^#RPg$o>1ESAXp2JeES`b|G z!d=H52eKr{Nx`+Kt>EVyA0zBqv2ns##ykP;8-iaUA*`5Ieh%4ly^g%60gki1m`y43 z7Ga_X0%M9;gw$uT$-*JnN46;*^i06paDcbrm7R)Dc2V!cjo!zE$L}>LuWt5ygW%*7 z>P7U58|g;WM}oQtJ|(ynBZ_FbE8v(=|5*gj8XaiM3DKYdNF2tCVJ+{Gpx1$OO+Uk& zr{(>S33Gkyq8rdtJZ}?gQw_KX{GgyX6pkqE;`m57G$IbGN>GnXh{3R|iIiQ?_X)V@ ziWkoHookJZT?+ceX#BDQnX0UMLg-ZKBY0jgk4;Ea6p@D4gibbPV)Dg9=Mv{GD=P4( z{AO!Bad>Cz&o1tKtIfZ&U4&I>`)b>p{v$@Ebnd)#?v3NGojKRrC!O!TaN+oCVBLT| zfwLJl2sDhTu}!8Lm4tYy)KG)(##GEWlbNe)j4`kW+2p?0ltd&LQp7dqZVdzi?IL9U zBc@*&0}EdjF-9_G1{Sn+b}njSYC85q0pd(dann&|Epi1t%{^yKfuuo;^Ke3;Kk1@& zPXHnp0Tl(WifZ#~L_UO7z>HqVRnG%heU}j2+(ikhVpKz;| zmdwjSgpG2O9J&K3Zn^|~6BfcH4vQKl`4m4|aCYG|`6Sz|5qrs`po8W$lkAuw#T;>< zD(5XOC4Ayq-95#Q7SY6`sDd^m%k=3Zv&>tdS>A*jo3u^ZQ#RR+LR2p}>cyP#OQ3Jn z3fZHh)}aaJ@i@C(~XrpbbK(w%aTHd?K#-%%v34M{Fp>)OG*mJI`-nsSY9-`rY&(j%9S zqD{Ra$z>@g;HUf+H_X7gXuE7kedL-q#14}X`xfjv=`B7g+AtEHIH@i;kT z>a{{B4>Xn%3l2sUxt+EZ)zBSz4P8*4E?f%}Riaceqj4!&>Uji|K|}_f5L2$`rfM{G zH(0MjS3u8#cM4jvu*?P2oTmLI%5kt0!n!67DjJ=ez$m(?#j)SC1HhVy#N(GVF%rI{ zi05fw80=$NDJ%%ONK}Ga7>-cT<06JI6LDRM>0$8hC&XCra(I~9zX3!{Fg777sv1`{ z5xxAN3eXi6L>S{^dSXm(7r_h5h*btKE&}LwI30?vN$+F4bvj3e%%q!j1hyiz%&6@ ziwGg{BJ^n9*X3X&EO(gH*Mani#EDcW4vs-2eI1dk-y%GOYzF2d-0B*#1M(SC<53_g zh~osLk?=4KlHw{b628j>f@6AIqOl)%hJnqKU?4{maa9*L_sAl^UZi>54%QPvAb-u0 z#pn5D+gz8yI1&%%PlRW{1|T~g@k^~OgK14xfb9x?e-I8HfOng)u0;LGDi`1{k8O=& zHJ<1Zjfx_NG8k9VBP&A|3rI<>)!&okVCSUWG;qbFKv}|HW7L@OX+_#Vd4>?ZB!^X_ z2G1D4HwrDFO=xf!iB{A>AR~i^lg12z3KcVc*j2&|&oX&s~1S^{ij!yQ5gLZOdih6kQQ;0+L7ML>KZ2UAWm_|Z57AuFOvG!7Ri zOCS(5Izcl^*Ec2-1^`d3Z-1#7R6_l%Uc`*T;>243i%lOfm?49Y0qv=A^Gr}gD^jIQ zsv;K|EG*~%87AR4gS(_mz!}X`nL}k5rIkYVHkIv#?5G+2^sXb~A#MJMyHWW{$bO{~t!Oje$&K;9*5zxw+*Z z`xYt}f*;j0QUu;>*`hekpFBzwn`J_T-ZWG25^&)39%GVyU5b*7;>=ReDS3 zqI2PI($xnaZ2PRV?{V`B3$OgPdGGa8v%Rz9U(#U6>`Tj))i=&uKa;I&&s4UjJ6?Lw 
z_o(t@&PK%M+wb0dcVYaW-u>v^rKTUwzOvl1^X`E=2bSx1E->@jLQA?f02=_{t!>$z zhac`dykh6;>z>+3>yddbC%^*ProEY_y^EntQ%}0_;HSHno3>?}c4eA&E$+@V?N2v$ zf5S0bd!Fn({FKh*gxb2Chcxb(|H=HRbbV*K?!~8GTW!}0=h$BTT`8$)ygoc9&%d%% z-H|IJRoilvWP5A2<#49u@TZZbmeaFm9#_`Ry*byv(44O8SnU06<-Turkd}Qrk&C|!7@NmB=X+2>t{muSf8&v+oE@18vPE^_Z z{bd^hKk8YSroM(5x{Ng5W3Uj*x93@(I#DVQeTMzKNy27~2RYP?458DhzK7X{K+mh# zh8b?6A%S9`#|D9(Qc_nB55&8d__}3&?~|^Bi{`{z&P2!7KCPCugsGtPtChTvC72m+>+HzC6( zAnSEsW)x0l6@K-pW2Tu=#21qaJ z{jGEQP=DiZWfx@l+Oryzp8?nWmjW2e1$P}@ZNQ~p8xdBI!8Ckw(Z)b8&?uey3vA*| zIfiArsrmybX@~`oHhFUaV|vMkB8M%!@-5&byjCy@9y`1<_Kai8T=gS|xLmmfp4yxy zvR{A!e<3acf@xzyP>M#3(L;xXu~`#5){H1pxXgem4f?Nh<>Aeez9dP{1IXMW#Qg=?_64c`11bNUc)lR}{)b%n ziZp*gTD~GXfA6fE{mGKEamogYfOUVzvP>u7n(2hlG3y58Go8;|0p=_-n|em@`)rW; bF~jUnZ|!(Su<%T0ZA@Fb4z32i^cwyT93Kmq literal 0 HcmV?d00001 diff --git a/compilation/__pycache__/wrapper.cpython-312.pyc b/compilation/__pycache__/wrapper.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c1cdb1bf25a33fec679ddce1c1d06a7798d53f96 GIT binary patch literal 12449 zcmeHNYiu0Xb)MOseQ}rE2Pu*w^%zPP$rZV~dQhZ9%T)L>ZHXckiIx(!$z-@Q8}pCckbM|bME_`?|k?EOKoip1y_>!%Xs?%iux@+7#CBDZ2bfx zHzUZ$oS1Sf<$M4RAxnR=^K z!RbZ|HAV?FFH?g1ElVkX(-Ix@{0<)`g0xo0&&Z0zN0pSQ#4-Y=L^EkcyrQJ|G(RUw zkm4N92;$Vzyco1<-r-DYKAz;2cqV-olC|357e)rhBz`_d9-HPwg^Ok-NlYu;LNb}+ zK-_G64#c?Q>A1orGjkv{TB?1iq7>zY*cNu|4GQXKSq@EPLYaA1$*}U}INW0*J2pN! 
zG&s&(m>QfKW#_WIB(P9OToRK@6F;JUWYrj1OwMWyFU`rC{qiE-Wh;neS@zRY5hXPr znUBwh6i649(3Nc8iQ||1#7Hte6PaIv#-}4#-$=CFz{n!BYhIMX^Gn(;E}hBDbG#sM zbIHsMpX6kPS456fMDjy14cG)51XrmI#<>L%>aor2f4G@bljnz@jPwF<&N16Ut10NWxee6^RLZ#X~8Ya8^kp3+}=6l|N;9 z*mK*i6}qB*)!I_G?E`pBV5e`pRxF=G4vxXLYsKmU`XPK1WSBmr)I+!HgLZ z>WdzC4wnqMltwdVM85da%-JYd|JHJYrm5VYaE_JRma81aFL>o}W}9-KXSB_^lCyq} znP$$FCF}nG@5?W!FBhpR)~`~F^gmb^=^(@Jg$XmPTV=Lnmf6{i#8#LpRu&Z{o}QCg zJ}t2Gyb_Ixa=4}jY#%!ni_6A${UqgZn7tq>G8>D}#YBk}mePDG!$x`dh)u>*ad@>& zewgi?5Jf>0@CA{9l3!dD(?_^tVCbQIkyS)l;mT>8xFDOxhfl_prC`|nbWBW(5}!;i zu{H1`L@9t&OL_rlq4IL;x^=#S81= zM7^{lVfLKfA8aP8u+=TNi8LNY88RvG=)+PTV_glAW_=%H*S8)oaUYueP)p;}V+ftj>gY~u ztlk~keoTK%12wyj?KB^mIh-6{7CAC~v|W5OnoVT^CyD~M{VhRD5@L@aqqSzF_*^^< zcpSE#X4TDYfNPo@u(<*8aoY-``zmJEpcBE3}t|MB-8qa|dLNnP+?E{#Y(l=yN9G&GEXN+1 z<=m#7Iy^!@SWBq)#mbcTohN?3S^Ixj+8^XL7}=MGPMq$ie|~c4&~tw3m)%1L;OSTX zq4v{#)_?A^!ox#&7(Sx5z7D|+N~F-)o3;q_v{kTx`^5-W2yFrbp8N^Dhuc)p4@C)k`-2or$m+m z?T}1z92O8yFJvx@LN};+8B|_UB-N={n3zccbpZ~fD1_M&-E7IROg1TiWk5qQ16By! 
zS+G<591r$@pH(0gn@og(T(A>hO|ZcgouF6x0e8&-V{FC2?u8J%d^$&Q@Tgx3z9E<4 zYp>8b%wa;i9Q~w@&x$2yx$0KXnua`h(RVlUl`^qp!8(*zhq<$KO%^g2x`dyVwOK_gba=cq(?`N@b04wzTg9P{_?w@Pjh%%LXZZAIT5 zGSbR-Y4V9iJnJ06O){-H3pIE(46tKkd30|DlGsa4lOlO6IN*{@%u!Lx*Wld> zEzqx7Ua>Rqyh2}qkS4K3DrlK_mXIVti$-73Ximn}X_$QH$t0he5%_`S{>LcVFvk(< ziVL4iW7WSFAxMfm;OFJ8?EIX>!x)@fB8_H2XUaegAy!3zT_Gb3;LJ6fO(x-k z`IzP)Fftz193|MTIWqIeD#^j!nzPgbK|%*eb5{_5TA)IKm&ibk!Fiy0&S!zYONpbB zl#w(K>0?89O&nCU4(l~5FqB%8F*l6fD#?=2`PJ&H7$S`k(+jfJaB+P6Ece2NQ4Shc znr@?-BgV_TqDT@hHi>XxsD7jfO0(-ErRD$*F%6bTbClYrIl%jYL_rG})j)WgBqN|T znk5Z=b!v1NiM!_}C(mfLFpkjA3O6h9DN$>b#JRYP(_m&v0nZ0GxmYH1S*v~Uv7!Xy zMbIvx=9h4JN}y>e2^X()6avkflILXX3k0?195-g}a_5uI@vndg`2wuJt5mUh_bT&B z?MIEFwZ_P*yV%;k%B(w^i{6%kH>7$)1@95ndt{X?*{hd_nux0oXLC6Yy@@} z0*BPVp+evpHSo;s%WHwttBzuzVb$@Gd&jGD-<8+h`)@hk?!VLdOtG=8(Ac9k_7ob& z)yDDFu?@HP+ouZky=r~$9e3|+qwQfTT@ zo4WF$!FBIYv47y(wi~st)fRkDslKP)_w{Wwj=gFv`s(vdW2(=Hj;OwoSFLw_jW_CE ztGg+$`}Tjduj7ZI?}v&W-}UipciZ>ZV3YUW_fZ~S!QFnx-F~a* zC#MQWFRDi`-s!q{*WOHxxENi>f6gOc+uU2R8sVmRS>&T%dxs3C&%hx0bUN4~A-zv-9&A3EV$+W{H5l z(xw2YC{$`$iC{}`s&&nj;xCIV;?oB`c$B8AKY?8Y2uviQR|)uWIcO-a%&e);I*nqo z2qAT0{j5d@VeNbg+9Hi&a2x|%`I?*KN>VsZn!x0T2z8RY-1WExRM7`lL5}?A5L~5T zuD+@in!;*R`0d(t?*J^5O$+Uxq&FI#`6sb3FsTkq=ASv2KlkN);QYrni~j<>Wo7E? 
zHYujIZnK8+H59z#|<9j0V@5g^c4p*}QVFr7&Jtn?qz6G6W;02Pp)#ei5A#Cf`TL8T%+g8^AK zr!i(4Osz&-0NO@3YJN#fWfnwu%D|Q%5{X)a$-WR9mzhb3QAKOv0GJSd66mE0BMN3v z8bSF7FnDMYYEHvbd=gWp`;^x}lKh_#z%oSJf$Dn$7L<=weQd#ZRP`M#_>QZ-<9F-# z=?3SXmG<`mUC|sd7cG--I(6HwHl51*PHogT73vS(sXqwRjlWShHl3=kv*7Djef_t4 z3Qtd|Pfx8qeR17)sn`^*E@Q{e!j2wwN6-5^dW)?cH+$YV`TEIQ4f)pNZ#Sx~$3L-B zjeY8lUML~Z{=UDXsu;)==v4h3pEw|Men;=8pFr-;yuag98NIEap9l;;ZT-dGz=*^8 zi>GZPjN@I0ZN%qz098Z22R2ndOehslXCA`W^_9(K-Qy=7!BXH>a;c)M zzUdu?eQg&^FXT?Z3g9dx(+!as7o-X1lCiQnStz{#(dC}Ub%s$s%yE#Tm;WYwNrniv zh`sgK|Ki$T6ubvi?}36hta`(FXISqnXC?`-5H8at>&!N%z%`uo{ryCLmdgzL>M&!Z;knkyj6@y z!lW?D-yOKL&*ykv=89g14~EDlO5cBhQ#Q!7g}9gy>$}(3bwln*&*17 z#LsAbjz-zC@VjqVmTA>t+*6a%H+y91eamFj!`Q$xtc_W@veN& zxr|?|-2sE5<(@Y#%zO#&C2mGGURl;@Gc z2HcMv5xCZwMUYQ8X&ur>vjR^j3EcOKgKDqn5#)cPJQ5kecQ3|)9iDVsLsZ(l*B>nVc!%j1=awoN6ZLxpaQhA3`BbtsC0e?cN7<6@o22n zG_3fHi1|sQ<52g`68R#FJn;EssIbae8HzRyQt5~e`whJdzY*-O3GODA2I!9^8SE1r zTGFAGeU?L(M1woA5kO9GuzxFRZioQE1pX2{*yMTGan&ptSz`ch#XyWWuQ_Mo02WeM z62Wo^_q3+c^xy<>COgLo+0;BozOai|fTxf(n;_0F&1x>8v`s5&Oa^vmHQS;T2O47# zg+LzoSaXeDiHd|4k`P70VKQU1$l)UiNp+2xn4B0z&WKc7Cf-=kqf;ASgACHwF_^)u zE^MLRIu|yLM5PqI@|x8VNUJ3YQ?0TT)#zF23_QrFO$i%P3dmlc^gN_4haP9Xjip7n znIE9E_h43o-FDaC>?ySNsI5I~t-Y$dx9ACc^H*Ift{>}p|!j19Q z##cvyMR~*Xx~I_Gtu}YBK5srAQkxGIntRpe-k(T6UHtLlPrv!&Z>}{D|7=KY9)oP& z=9{PQc-wEC$R9s_r|Wdlx8q|s9VBc9K#ziVuj<`f@b;hMQ9ci+t1lGk@0 zefz~?^UgOMuRGqfe%Axow512o>yB1ze!lJ5UwPLXz5J4C5cCVYBfpeUARs zu}v4_KTpFpYOU{n9aX>gro0x|p9iXCC*==g@(c7QeyB)(=h6R$&Ccewdlc>8`9VYT zCLsKspUMYeK;PZnHpWu#4g|&;tiSZ_f%tn3wz1ug_jWok&br3Bt?zZw5I%rf8Tb^< z(T8%jDv`eu259dHkt41^}SOa(u3iITV@Ns>6 z4n|%nZUCc{!laQ(!UVZZ;}{6Bt!v#0cWmVFgG&4L!lFb!BTI<{cjN(Xze_zQ6ECZ;I!a(^TuhzlX; z+-O#o!FH403umOch!R zLKQZ@oCvikKj_Y#h$$iDvgJ@b9U>bBnaJVZ6GtO`#O$T87X%S3XtH$J1zRu?v=Kw6 zo1{O3=}KHM2??~Ig_s4+Y1FE63rD&LIi6tm#+Yvp5e|3ZdpQXKIoq+>Of@tY>buqY z?zQ^xsd5tpYZC>}KGn1DeNWf#Z^gEy{6R*7=%qns$U(p3UEmaY7~vF6%gm)Y;2R7(?Gs??d#$_0C1BC*jH_ zy{7y5xT~hyM8xnC29xlU(FngxeQdGYocDa~wyt~UXoqc>z847CzC>?!Quc-qn651= 
z+>1=tCS$Qh-i&QgaNBhD+Maqdwn@S5cIOt}Hs@)`_p!xq>my%~7;$q+dBJ802S5@b z(5(DSlvG};L7)XkaNuw@sWhBug7fFHMA}1$+a`T_bHia{w@g26pt*4K3HHnuWWsCd zn^C2$G_qMlKuE<#RaoEckz81hmoPv?Q$D1mGg737z_HSgN$aPEoG0~x8Ib-8UVu5E zWIQ4OHio9RoD^O2YpUi$YR`w%fe)#c-%_4mQ+xiMYW_EBQl%z8r1t;D)$r<9)?9n8 r+V9ydwCi3C1YjsFzK<;-`V#QK5ZxrvPrgBO7P{{qoG+mJNay`8=+qE2 literal 0 HcmV?d00001 diff --git a/compilation/activation_quant_fusion.py b/compilation/activation_quant_fusion.py new file mode 100644 index 0000000..b5fd67c --- /dev/null +++ b/compilation/activation_quant_fusion.py @@ -0,0 +1,209 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from abc import ABC, abstractmethod + +import torch +from torch._higher_order_ops.auto_functionalize import auto_functionalized +from torch._inductor.pattern_matcher import ( + PatternMatcherPass, + fwd_only, + register_replacement, +) +from torch._ops import OpOverload + +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + QuantKey, + kFp8StaticTensorSym, + kNvfp4Quant, +) +from vllm.platforms import current_platform + +from .fusion import QUANT_OPS, empty_bf16, empty_fp32, empty_i32 +from .inductor_pass import enable_fake_mode +from .matcher_utils import MatcherQuantFP8, MatcherSiluAndMul +from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass + +logger = init_logger(__name__) + +FP8_DTYPE = current_platform.fp8_dtype() +FP4_DTYPE = torch.uint8 + +SILU_MUL_OP = torch.ops._C.silu_and_mul.default + +FUSED_OPS: dict[QuantKey, OpOverload] = { + kFp8StaticTensorSym: torch.ops._C.silu_and_mul_quant.default, # noqa: E501 +} +silu_and_mul_nvfp4_quant_supported = current_platform.is_cuda() and hasattr( + torch.ops._C, "silu_and_mul_nvfp4_quant" +) +if silu_and_mul_nvfp4_quant_supported: + FUSED_OPS[kNvfp4Quant] = torch.ops._C.silu_and_mul_nvfp4_quant.default # noqa: E501 + + +class 
ActivationQuantPattern(ABC): + """ + The base class for Activation+Quant fusions. + Should not be used directly. + """ + + def __init__( + self, + quant_key: QuantKey, + ): + self.quant_key = quant_key + self.quant_dtype = quant_key.dtype + + assert self.quant_key in QUANT_OPS, ( + f"unsupported quantization scheme {self.quant_key}" + ) + self.QUANT_OP = QUANT_OPS[self.quant_key] + + assert self.quant_key in FUSED_OPS, ( + f"unsupported fusion scheme {self.quant_key}" + ) + self.FUSED_OP = FUSED_OPS[self.quant_key] + + self.silu_and_mul_matcher = MatcherSiluAndMul() + + def empty_quant(self, *args, **kwargs): + kwargs = {"dtype": self.quant_dtype, "device": "cuda", **kwargs} + return torch.empty(*args, **kwargs) + + @abstractmethod + def register(self, pm_pass: PatternMatcherPass): + raise NotImplementedError + + +class SiluMulFp8StaticQuantPattern(ActivationQuantPattern): + """ + Fusion for SiluMul+Fp8StaticQuant Pattern + """ + + def __init__(self): + super().__init__(kFp8StaticTensorSym) + self.quant_matcher = MatcherQuantFP8(kFp8StaticTensorSym) + + def register(self, pm_pass: PatternMatcherPass): + def pattern( + input: torch.Tensor, + scale: torch.Tensor, + ): + result_silu_mul = self.silu_and_mul_matcher(input) + result_quant = self.quant_matcher(result_silu_mul, scale) + return result_quant[0] + + def replacement( + input: torch.Tensor, + scale: torch.Tensor, + ): + d = input.shape[-1] // 2 + output_shape = input.shape[:-1] + (d,) + result = torch.empty( + output_shape, device=input.device, dtype=self.quant_dtype + ) + at = auto_functionalized( + self.FUSED_OP, result=result, input=input, scale=scale + ) + return at[1] + + inputs = [ + *self.silu_and_mul_matcher.inputs(), # input + self.quant_matcher.inputs()[1], # scale + ] + pattern(*inputs) + + register_replacement(pattern, replacement, inputs, fwd_only, pm_pass) + + +class SiluMulNvfp4QuantPattern(ActivationQuantPattern): + """ + Fusion for SiluMul+Nvfp4Quant Pattern + """ + + def __init__(self): + 
super().__init__(kNvfp4Quant) + + def register(self, pm_pass: PatternMatcherPass): + def pattern( + result: torch.Tensor, + output_scale: torch.Tensor, + input: torch.Tensor, + scale: torch.Tensor, + ): + result_silu_mul = self.silu_and_mul_matcher(input) + at = auto_functionalized( + self.QUANT_OP, + output=result, + input=result_silu_mul, + output_scale=output_scale, + input_scale=scale, + ) + return at[1], at[2] + + def replacement( + result: torch.Tensor, + output_scale: torch.Tensor, + input: torch.Tensor, + scale: torch.Tensor, + ): + at = auto_functionalized( + self.FUSED_OP, + result=result, + result_block_scale=output_scale, + input=input, + input_global_scale=scale, + ) + return at[1], at[2] + + inputs = [ + self.empty_quant(5, 32), # result + empty_i32(128, 4), # output_scale + empty_bf16(5, 64), # input + empty_fp32(1, 1), # scale + ] + + register_replacement(pattern, replacement, inputs, fwd_only, pm_pass) + + +class ActivationQuantFusionPass(VllmPatternMatcherPass): + """ + This pass fuses a pre-defined set of custom ops into fused ops. + It uses the torch pattern matcher to find the patterns and replace them. + + Because patterns can only be registered once, the pass is a singleton. 
+ This will be addressed in a future version of PyTorch: + https://github.com/pytorch/pytorch/pull/139321#issuecomment-2452354980 + """ + + @enable_fake_mode + def __init__(self, config: VllmConfig): + super().__init__(config) + + self.patterns: PatternMatcherPass = PatternMatcherPass( + pass_name="activation_quant_fusion_pass" + ) + + pattern_silu_mul_fp8 = SiluMulFp8StaticQuantPattern() + pattern_silu_mul_fp8.register(self.patterns) + + if silu_and_mul_nvfp4_quant_supported: + pattern_silu_mul_nvfp4 = SiluMulNvfp4QuantPattern() + pattern_silu_mul_nvfp4.register(self.patterns) + + self.dump_patterns(config, self.patterns) + + @VllmInductorPass.time_and_log + def __call__(self, graph: torch.fx.Graph): + self.matched_count = self.patterns.apply(graph) + logger.debug("Replaced %s patterns", self.matched_count) + + def uuid(self): + return VllmInductorPass.hash_source( + self, + ActivationQuantPattern, + SiluMulFp8StaticQuantPattern, + SiluMulNvfp4QuantPattern, + ) diff --git a/compilation/backends.py b/compilation/backends.py new file mode 100644 index 0000000..f408ffc --- /dev/null +++ b/compilation/backends.py @@ -0,0 +1,759 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import ast +import dataclasses +import hashlib +import operator +import os +import pprint +import time +from collections.abc import Callable, Sequence +from contextlib import contextmanager +from typing import Any + +import torch +import torch.fx as fx +from torch._dispatch.python import enable_python_dispatcher + +import vllm.envs as envs +from vllm.compilation.inductor_pass import pass_context +from vllm.compilation.partition_rules import ( + inductor_partition_rule_context, + should_split, +) +from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils.import_utils import resolve_obj_by_qualname +from 
vllm.utils.torch_utils import is_torch_equal_or_newer + +from .caching import VllmSerializableFunction +from .compiler_interface import ( + CompilerInterface, + EagerAdaptor, + InductorAdaptor, + InductorStandaloneAdaptor, + is_compile_cache_enabled, +) +from .counter import compilation_counter +from .inductor_pass import InductorPass +# from .pass_manager import PostGradPassManager + +logger = init_logger(__name__) + + +def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface: + if compilation_config.backend == "inductor": + # Use standalone compile only if requested, version is new enough, + # and the symbol actually exists in this PyTorch build. + if ( + envs.VLLM_USE_STANDALONE_COMPILE + and is_torch_equal_or_newer("2.8.0.dev") + and hasattr(torch._inductor, "standalone_compile") + ): + logger.debug("Using InductorStandaloneAdaptor") + return InductorStandaloneAdaptor( + compilation_config.compile_cache_save_format + ) + else: + logger.debug("Using InductorAdaptor") + return InductorAdaptor() + else: + assert compilation_config.backend == "eager", ( + "Custom backends not supported with CompilationMode.VLLM_COMPILE" + ) + + logger.debug("Using EagerAdaptor") + return EagerAdaptor() + + +class CompilerManager: + """ + A manager to manage the compilation process, including + caching the compiled graph, loading the compiled graph, + and compiling the graph. + + The cache is a dict mapping + `(runtime_shape, graph_index, backend_name)` + to `any_data` returned from the compiler. + + When serializing the cache, we save it to a Python file + for readability. We don't use json here because json doesn't + support int as key. 
+ """ + + def __init__(self, compilation_config: CompilationConfig): + self.cache: dict[tuple[int | None, int, str], Any] = dict() + self.is_cache_updated = False + self.compilation_config = compilation_config + self.compiler = make_compiler(compilation_config) + + def compute_hash(self, vllm_config: VllmConfig) -> str: + return self.compiler.compute_hash(vllm_config) + + @contextmanager + def compile_context(self, runtime_shape: int | None = None): + """Provide compilation context for the duration of compilation to set + any torch global properties we want to scope to a single Inductor + compilation (e.g. partition rules, pass context).""" + with pass_context(runtime_shape): + if self.compilation_config.use_inductor_graph_partition: + with inductor_partition_rule_context( + self.compilation_config.splitting_ops + ): + yield + else: + yield + + def initialize_cache( + self, cache_dir: str, disable_cache: bool = False, prefix: str = "" + ): + """ + Initialize the cache directory for the compiler. + + The organization of the cache directory is as follows: + cache_dir=/path/to/hash_str/rank_i_j/prefix/ + inside cache_dir, there will be: + - vllm_compile_cache.py + - computation_graph.py + - transformed_code.py + + for multiple prefixes, they can share the same + base cache dir of /path/to/hash_str/rank_i_j/ , + to store some common compilation artifacts. + """ + + self.disable_cache = disable_cache + self.cache_dir = cache_dir + self.cache_file_path = os.path.join(cache_dir, "vllm_compile_cache.py") + + if not disable_cache and os.path.exists(self.cache_file_path): + # load the cache from the file + with open(self.cache_file_path) as f: + # we use ast.literal_eval to parse the data + # because it is a safe way to parse Python literals. + # do not use eval(), it is unsafe. 
+ self.cache = ast.literal_eval(f.read()) + + self.compiler.initialize_cache( + cache_dir=cache_dir, disable_cache=disable_cache, prefix=prefix + ) + + def save_to_file(self): + if self.disable_cache or not self.is_cache_updated: + return + printer = pprint.PrettyPrinter(indent=4) + data = printer.pformat(self.cache) + with open(self.cache_file_path, "w") as f: + f.write(data) + + def load( + self, + graph: fx.GraphModule, + example_inputs: list[Any], + graph_index: int, + runtime_shape: int | None = None, + ) -> Callable | None: + if (runtime_shape, graph_index, self.compiler.name) not in self.cache: + return None + handle = self.cache[(runtime_shape, graph_index, self.compiler.name)] + compiled_graph = self.compiler.load( + handle, graph, example_inputs, graph_index, runtime_shape + ) + if runtime_shape is None: + logger.debug( + "Directly load the %s-th graph for dynamic shape from %s via handle %s", + graph_index, + self.compiler.name, + handle, + ) + else: + logger.debug( + "Directly load the %s-th graph for shape %s from %s via handle %s", + graph_index, + str(runtime_shape), + self.compiler.name, + handle, + ) + return compiled_graph + + def compile( + self, + graph: fx.GraphModule, + example_inputs, + additional_inductor_config, + compilation_config: CompilationConfig, + graph_index: int = 0, + num_graphs: int = 1, + runtime_shape: int | None = None, + ) -> Any: + if graph_index == 0: + # before compiling the first graph, record the start time + global compilation_start_time + compilation_start_time = time.time() + + compilation_counter.num_backend_compilations += 1 + + compiled_graph = None + + # try to load from the cache + compiled_graph = self.load(graph, example_inputs, graph_index, runtime_shape) + if compiled_graph is not None: + if graph_index == num_graphs - 1: + # after loading the last graph for this shape, record the time. + # there can be multiple graphs due to piecewise compilation. 
+ now = time.time() + elapsed = now - compilation_start_time + compilation_config.compilation_time += elapsed + if runtime_shape is None: + logger.info( + "Directly load the compiled graph(s) for dynamic shape " + "from the cache, took %.3f s", + elapsed, + ) + else: + logger.info( + "Directly load the compiled graph(s) for shape %s " + "from the cache, took %.3f s", + str(runtime_shape), + elapsed, + ) + return compiled_graph + + # no compiler cached the graph, or the cache is disabled, + # we need to compile it + if isinstance(self.compiler, InductorAdaptor): + # Let compile_fx generate a key for us + maybe_key = None + else: + maybe_key = f"artifact_shape_{runtime_shape}_subgraph_{graph_index}" + + with self.compile_context(runtime_shape): + compiled_graph, handle = self.compiler.compile( + graph, + example_inputs, + additional_inductor_config, + runtime_shape, + maybe_key, + ) + + assert compiled_graph is not None, "Failed to compile the graph" + + # store the artifact in the cache + if is_compile_cache_enabled(additional_inductor_config) and handle is not None: + self.cache[(runtime_shape, graph_index, self.compiler.name)] = handle + compilation_counter.num_cache_entries_updated += 1 + self.is_cache_updated = True + if graph_index == 0: + # adds some info logging for the first graph + if runtime_shape is None: + logger.info_once( + "Cache the graph for dynamic shape for later use", scope="local" + ) + else: + logger.info_once( + "Cache the graph of shape %s for later use", + str(runtime_shape), + scope="local", + ) + if runtime_shape is None: + logger.debug( + "Store the %s-th graph for dynamic shape from %s via handle %s", + graph_index, + self.compiler.name, + handle, + ) + else: + logger.debug( + "Store the %s-th graph for shape %s from %s via handle %s", + graph_index, + str(runtime_shape), + self.compiler.name, + handle, + ) + + # after compiling the last graph, record the end time + if graph_index == num_graphs - 1: + now = time.time() + elapsed = now - 
compilation_start_time + compilation_config.compilation_time += elapsed + if runtime_shape is None: + logger.info_once( + "Compiling a graph for dynamic shape takes %.2f s", + elapsed, + scope="local", + ) + else: + logger.info_once( + "Compiling a graph for shape %s takes %.2f s", + runtime_shape, + elapsed, + scope="local", + ) + + return compiled_graph + + +@dataclasses.dataclass +class SplitItem: + submod_name: str + graph_id: int + is_splitting_graph: bool + graph: fx.GraphModule + + +def split_graph( + graph: fx.GraphModule, splitting_ops: list[str] +) -> tuple[fx.GraphModule, list[SplitItem]]: + # split graph by ops + subgraph_id = 0 + node_to_subgraph_id: dict[fx.Node, int] = {} + split_op_graphs: list[int] = [] + for node in graph.graph.nodes: + if node.op in ("output", "placeholder"): + continue + + # Check if this is a getitem operation on a node from an earlier subgraph. + # If so, assign it to the same subgraph as its input to avoid passing entire + # tuple as input to submodules, which is against standalone_compile and + # AoTAutograd input requirement. + if node.op == "call_function" and node.target == operator.getitem: + # Assign this getitem to the same subgraph as its input + input_node = node.args[0] + if input_node.op != "placeholder": + assert input_node in node_to_subgraph_id + node_to_subgraph_id[node] = node_to_subgraph_id[input_node] + continue + + if should_split(node, splitting_ops): + subgraph_id += 1 + node_to_subgraph_id[node] = subgraph_id + split_op_graphs.append(subgraph_id) + subgraph_id += 1 + else: + node_to_subgraph_id[node] = subgraph_id + + # `keep_original_order` is important! 
+ # otherwise pytorch might reorder the nodes and + # the semantics of the graph will change when we + # have mutations in the graph + split_gm = torch.fx.passes.split_module.split_module( + graph, None, lambda node: node_to_subgraph_id[node], keep_original_order=True + ) + + outputs = [] + + names = [name for (name, module) in split_gm.named_modules()] + + for name in names: + if "." in name or name == "": + # recursive child module or the root module + continue + + module = getattr(split_gm, name) + + graph_id = int(name.replace("submod_", "")) + outputs.append(SplitItem(name, graph_id, (graph_id in split_op_graphs), module)) + + # sort by integer graph_id, rather than string name + outputs.sort(key=lambda x: x.graph_id) + + return split_gm, outputs + + +compilation_start_time = 0.0 + + +class PiecewiseCompileInterpreter(torch.fx.Interpreter): + """Code adapted from `torch.fx.passes.shape_prop.ShapeProp`. + It runs the given graph with fake inputs, and compile some + submodules specified by `compile_submod_names` with the given + compilation configs. + + NOTE: the order in `compile_submod_names` matters, because + it will be used to determine the order of the compiled piecewise + graphs. The first graph will handle logging, and the last graph + has some special cudagraph output handling. + """ + + def __init__( + self, + module: torch.fx.GraphModule, + compile_submod_names: list[str], + vllm_config: VllmConfig, + vllm_backend: "VllmBackend", + ): + super().__init__(module) + from torch._guards import detect_fake_mode + + self.fake_mode = detect_fake_mode() + self.compile_submod_names = compile_submod_names + self.compilation_config = vllm_config.compilation_config + self.vllm_config = vllm_config + self.vllm_backend = vllm_backend + # When True, it annoyingly dumps the torch.fx.Graph on errors. 
+ self.extra_traceback = False + + def run(self, *args): + fake_args = [ + self.fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t + for t in args + ] + with self.fake_mode, enable_python_dispatcher(): + return super().run(*fake_args) + + def call_module( + self, + target: torch.fx.node.Target, + args: tuple[torch.fx.node.Argument, ...], + kwargs: dict[str, Any], + ) -> Any: + assert isinstance(target, str) + output = super().call_module(target, args, kwargs) + + if target in self.compile_submod_names: + index = self.compile_submod_names.index(target) + submod = self.fetch_attr(target) + sym_shape_indices = [ + i for i, x in enumerate(args) if isinstance(x, torch.SymInt) + ] + global compilation_start_time + + compiled_graph_for_dynamic_shape = ( + self.vllm_backend.compiler_manager.compile( + submod, + args, + self.compilation_config.inductor_compile_config, + self.compilation_config, + graph_index=index, + num_graphs=len(self.compile_submod_names), + runtime_shape=None, + ) + ) + # Lazy import here to avoid circular import + from .piecewise_backend import PiecewiseBackend + + piecewise_backend = PiecewiseBackend( + submod, + self.vllm_config, + index, + len(self.compile_submod_names), + sym_shape_indices, + compiled_graph_for_dynamic_shape, + self.vllm_backend, + ) + + if ( + self.compilation_config.cudagraph_mode.has_piecewise_cudagraphs() + and not self.compilation_config.use_inductor_graph_partition + ): + # We're using Dynamo-based piecewise splitting, so we wrap + # the whole subgraph with a static graph wrapper. + from .cuda_graph import CUDAGraphOptions + + # resolve the static graph wrapper class (e.g. CUDAGraphWrapper + # class) as platform dependent. 
+ static_graph_wrapper_class = resolve_obj_by_qualname( + current_platform.get_static_graph_wrapper_cls() + ) + + # Always assign PIECEWISE runtime mode to the + # CUDAGraphWrapper for piecewise_backend, to distinguish + # it from the FULL cudagraph runtime mode, no matter it + # is wrapped on a full or piecewise fx graph. + self.module.__dict__[target] = static_graph_wrapper_class( + runnable=submod.forward, + vllm_config=self.vllm_config, + runtime_mode=CUDAGraphMode.PIECEWISE, + cudagraph_options=CUDAGraphOptions( + debug_log_enable=piecewise_backend.is_first_graph, + gc_disable=not piecewise_backend.is_first_graph, + weak_ref_output=piecewise_backend.is_last_graph, + ), + ) + else: + self.module.__dict__[target] = piecewise_backend + + compilation_counter.num_piecewise_capturable_graphs_seen += 1 + + return output + + +# the tag for the part of model being compiled, +# e.g. backbone/eagle_head +model_tag: str = "backbone" + + +@contextmanager +def set_model_tag(tag: str): + """Context manager to set the model tag.""" + global model_tag + assert tag != model_tag, ( + f"Model tag {tag} is the same as the current tag {model_tag}." + ) + old_tag = model_tag + model_tag = tag + try: + yield + finally: + model_tag = old_tag + + +class VllmBackend: + """The compilation backend for `torch.compile` with vLLM. + It is used for compilation mode of `CompilationMode.VLLM_COMPILE`, + where we customize the compilation. + + The major work of this backend is to split the graph into + piecewise graphs, and pass them to the piecewise backend. + + This backend also adds the PostGradPassManager to Inductor config, + which handles the post-grad passes. 
+ """ + + vllm_config: VllmConfig + compilation_config: CompilationConfig + _called: bool = False + # the graph we compiled + graph: fx.GraphModule + # the stitching graph module for all the piecewise graphs + split_gm: fx.GraphModule + piecewise_graphs: list[SplitItem] + returned_callable: Callable + # Inductor passes to run on the graph pre-defunctionalization + post_grad_passes: Sequence[Callable] + sym_tensor_indices: list[int] + input_buffers: list[torch.Tensor] + compiler_manager: CompilerManager + + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + ): + # if the model is initialized with a non-empty prefix, + # then usually it's enough to use that prefix, + # e.g. language_model, vision_model, etc. + # when multiple parts are initialized as independent + # models, we need to use the model_tag to distinguish + # them, e.g. backbone (default), eagle_head, etc. + self.prefix = prefix or model_tag + + # Passes to run on the graph post-grad. + # self.post_grad_pass_manager = PostGradPassManager() + + self.sym_tensor_indices = [] + self.input_buffers = [] + + self.vllm_config = vllm_config + self.compilation_config = vllm_config.compilation_config + + self.compiler_manager: CompilerManager = CompilerManager( + self.compilation_config + ) + + # `torch.compile` is JIT compiled, so we don't need to + # do anything here + + def configure_post_pass(self): + config = self.compilation_config + self.post_grad_pass_manager.configure(self.vllm_config) + + # Post-grad custom passes are run using the post_grad_custom_post_pass + # hook. If a pass for that hook exists, add it to the pass manager. 
+ inductor_config = config.inductor_compile_config + PASS_KEY = "post_grad_custom_post_pass" + if PASS_KEY in inductor_config: + if isinstance(inductor_config[PASS_KEY], PostGradPassManager): + # PassManager already added to config, make sure it's correct + assert ( + inductor_config[PASS_KEY].uuid() + == self.post_grad_pass_manager.uuid() + ) + else: + # Config should automatically wrap all inductor passes + assert isinstance(inductor_config[PASS_KEY], InductorPass) + self.post_grad_pass_manager.add(inductor_config[PASS_KEY]) + inductor_config[PASS_KEY] = self.post_grad_pass_manager + + def __call__( + self, graph: fx.GraphModule, example_inputs,**kwargs + ) -> VllmSerializableFunction: + from .caching import _compute_code_hash, compilation_config_hash_factors + + vllm_config = self.vllm_config + if not self.compilation_config.cache_dir: + # no provided cache dir, generate one based on the known factors + # that affects the compilation. if none of the factors change, + # the cache dir will be the same so that we can reuse the compiled + # graph. + + factors = compilation_config_hash_factors(vllm_config) + # 2. factors come from the code files that are traced by Dynamo ( + # it mainly summarizes how the model is used in forward pass) + code_hash = _compute_code_hash(self.compilation_config.traced_files) + self.compilation_config.traced_files.clear() + factors.append(code_hash) + + # 3. 
compiler hash + compiler_hash = self.compiler_manager.compute_hash(vllm_config) + factors.append(compiler_hash) + + # combine all factors to generate the cache dir + hash_key = hashlib.md5( + str(factors).encode(), usedforsecurity=False + ).hexdigest()[:10] + + cache_dir = os.path.join( + envs.VLLM_CACHE_ROOT, + "torch_compile_cache", + hash_key, + ) + self.compilation_config.cache_dir = cache_dir + + cache_dir = self.compilation_config.cache_dir + os.makedirs(cache_dir, exist_ok=True) + self.compilation_config.cache_dir = cache_dir + rank = vllm_config.parallel_config.rank + dp_rank = vllm_config.parallel_config.data_parallel_rank + local_cache_dir = os.path.join(cache_dir, f"rank_{rank}_{dp_rank}", self.prefix) + os.makedirs(local_cache_dir, exist_ok=True) + self.compilation_config.local_cache_dir = local_cache_dir + + disable_cache = not is_compile_cache_enabled( + self.compilation_config.inductor_compile_config + ) + + if disable_cache: + logger.info_once("vLLM's torch.compile cache is disabled.", scope="local") + else: + logger.info_once( + "Using cache directory: %s for vLLM's torch.compile", + local_cache_dir, + scope="local", + ) + + self.compiler_manager.initialize_cache( + local_cache_dir, disable_cache, self.prefix + ) + + # when dynamo calls the backend, it means the bytecode + # transform and analysis are done + compilation_counter.num_graphs_seen += 1 + from .monitor import torch_compile_start_time + + dynamo_time = time.time() - torch_compile_start_time + logger.info_once( + "Dynamo bytecode transform time: %.2f s", dynamo_time, scope="local" + ) + self.compilation_config.compilation_time += dynamo_time + + # we control the compilation process, each instance can only be + # called once + assert not self._called, "VllmBackend can only be called once" + + self.graph = graph + # self.configure_post_pass() + + if self.compilation_config.use_inductor_graph_partition: + # Let Inductor decide partitioning; avoid FX-level pre-splitting. 
+ fx_split_ops: list[str] = [] + else: + fx_split_ops = self.compilation_config.splitting_ops or [] + + self.split_gm, self.piecewise_graphs = split_graph(graph, fx_split_ops) + + from torch._dynamo.utils import lazy_format_graph_code + + # depyf will hook lazy_format_graph_code and dump the graph + # for debugging, no need to print the graph here + lazy_format_graph_code("before split", self.graph) + lazy_format_graph_code("after split", self.split_gm) + + compilation_counter.num_piecewise_graphs_seen += len(self.piecewise_graphs) + submod_names_to_compile = [ + item.submod_name + for item in self.piecewise_graphs + if not item.is_splitting_graph + ] + + # propagate the split graph to the piecewise backend, + # compile submodules with symbolic shapes + PiecewiseCompileInterpreter( + self.split_gm, submod_names_to_compile, self.vllm_config, self + ).run(*example_inputs) + + graph_path = os.path.join(local_cache_dir, "computation_graph.py") + if not os.path.exists(graph_path): + # code adapted from + # https://github.com/thuml/depyf/blob/dab831108a752d1facc00acdd6d4243891845c37/depyf/explain/patched_lazy_format_graph_code.py#L30 + # use `print_readable` because it can include submodules + src = ( + "from __future__ import annotations\nimport torch\n" + + self.split_gm.print_readable(print_output=False) + ) + src = src.replace("<lambda>", "GraphModule") + with open(graph_path, "w") as f: + f.write(src) + + logger.debug_once( + "Computation graph saved to %s", graph_path, scope="local" + ) + + self._called = True + + if ( + self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE + or not self.compilation_config.cudagraph_copy_inputs + ): + return VllmSerializableFunction( + graph, example_inputs, self.prefix, self.split_gm + ) + + # if we need to copy input buffers for cudagraph + from torch._guards import detect_fake_mode + + fake_mode = detect_fake_mode() + fake_args = [ + fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t + for t in example_inputs + ] + + 
# index of tensors that have symbolic shapes (batch size) + # for weights and static buffers, they will have concrete shapes. + # symbolic shape only happens for input tensors. + from torch.fx.experimental.symbolic_shapes import is_symbolic + + self.sym_tensor_indices = [ + i + for i, x in enumerate(fake_args) + if isinstance(x, torch._subclasses.fake_tensor.FakeTensor) + and any(is_symbolic(d) for d in x.size()) + ] + + # compiler managed cudagraph input buffers + # we assume the first run with symbolic shapes + # has the maximum size among all the tensors + self.input_buffers = [ + example_inputs[x].clone() for x in self.sym_tensor_indices + ] + + # this is the callable we return to Dynamo to run + def copy_and_call(*args): + list_args = list(args) + for i, index in enumerate(self.sym_tensor_indices): + runtime_tensor = list_args[index] + runtime_shape = runtime_tensor.shape[0] + static_tensor = self.input_buffers[i][:runtime_shape] + + # copy the tensor to the static buffer + static_tensor.copy_(runtime_tensor) + + # replace the tensor in the list_args to the static buffer + list_args[index] = static_tensor + return self.split_gm(*list_args) + + return VllmSerializableFunction( + graph, example_inputs, self.prefix, copy_and_call + ) diff --git a/compilation/base_static_graph.py b/compilation/base_static_graph.py new file mode 100644 index 0000000..12f1ff5 --- /dev/null +++ b/compilation/base_static_graph.py @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable +from typing import Any, Protocol + +from vllm.config import CUDAGraphMode, VllmConfig + + +class AbstractStaticGraphWrapper(Protocol): + """ + StaticGraphWrapper interface that allows platforms to wrap a callable + to be captured as a static graph. 
+ """ + + def __init__( + self, + runnable: Callable[..., Any], + vllm_config: VllmConfig, + runtime_mode: CUDAGraphMode, + **kwargs: Any, + ) -> None: + """ + Initializes the StaticGraphWrapper class with graph capturing and + execution-related configurations. + + Args: + runnable (Callable): The callable to be wrapped and captured. + vllm_config (VllmConfig): Global configuration for vLLM. + runtime_mode (CUDAGraphMode): The style of the static + graph runtime. See CUDAGraphMode in vllm/config.py. + Note that only the subset enum `NONE`, `PIECEWISE` and `FULL` + are used as concrete runtime mode for cudagraph dispatching. + Keyword Args: + kwargs: Additional keyword arguments for platform-specific + configurations. + """ + raise NotImplementedError + + def __call__(self, *args: Any, **kwargs: Any) -> Any: + """ + Executes the wrapped callable. + + If the current runtime mode in the ForwardContext matches the runtime + mode of this instance, it replays the CUDAGraph or captures it using + the callable if it hasn't been captured yet. Otherwise, it calls the + original callable directly. + + Args: + *args: Variable length input arguments to be passed into the + callable. + **kwargs: Keyword arguments to be passed into the callable. + + Returns: + Any: Output of the executed callable. 
+ """ + raise NotImplementedError diff --git a/compilation/caching.py b/compilation/caching.py new file mode 100644 index 0000000..16e34c2 --- /dev/null +++ b/compilation/caching.py @@ -0,0 +1,178 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +import inspect +import os +import pickle +from unittest.mock import patch + +import torch +from torch.utils import _pytree as pytree + +import vllm.envs as envs +from vllm.config import VllmConfig, get_current_vllm_config +from vllm.logger import init_logger + +try: + from torch._dynamo.aot_compile import SerializableCallable +except ImportError: + SerializableCallable = object + +assert isinstance(SerializableCallable, type) + +logger = init_logger(__name__) + + +class VllmSerializableFunction(SerializableCallable): + """ + A wrapper around a compiled function by vllm. It will forward the tensor + inputs to the compiled function and return the result. + It also implements a serialization interface to support PyTorch's precompile + with custom backend, so that we can save and load the compiled function on + disk. There's no need to wrap around the compiled function if we don't want + to serialize them in particular cases. + Right now serialization for the custom backend is done via + serializing the Dynamo fx graph plus example inputs. 
+ """ + + def __init__(self, graph_module, example_inputs, prefix, optimized_call): + assert isinstance(graph_module, torch.fx.GraphModule) + self.graph_module = graph_module + self.example_inputs = example_inputs + self.prefix = prefix + self.optimized_call = optimized_call + self.shape_env = None + sym_input = next( + (i for i in self.example_inputs if isinstance(i, torch.SymInt)), None + ) + if sym_input is not None: + self.shape_env = sym_input.node.shape_env + + def __call__(self, *args, **kwargs): + return self.optimized_call(*args, **kwargs) + + @classmethod + def serialize_compile_artifacts( + cls, compiled_fn: "VllmSerializableFunction" + ) -> bytes: + import sympy + from torch._subclasses import FakeTensorMode + from torch.fx._graph_pickler import GraphPickler, Options + + state = compiled_fn.__dict__.copy() + state.pop("optimized_call") + state.pop("shape_env") + for node in state["graph_module"].graph.nodes: + node.meta.pop("source_fn_stack", None) + node.meta.pop("nn_module_stack", None) + + graph_reducer_override = GraphPickler.reducer_override + + def _graph_reducer_override(self, obj): + if ( + inspect.isclass(obj) + and issubclass(obj, sympy.Function) + and hasattr(obj, "_torch_unpickler") + ): + return obj._torch_unpickler, (obj._torch_handler_name,) + if isinstance(obj, FakeTensorMode): + return type(None), () + return graph_reducer_override(self, obj) + + # Mask off tensor inputs since they are large and not needed. 
+ state["example_inputs"] = pytree.tree_map_only( + torch.Tensor, lambda _: None, state["example_inputs"] + ) + with patch.object(GraphPickler, "reducer_override", _graph_reducer_override): + state["graph_module"] = GraphPickler.dumps( + state["graph_module"], Options(ops_filter=None) + ) + state["example_inputs"] = GraphPickler.dumps(state["example_inputs"]) + return pickle.dumps(state) + + @classmethod + def deserialize_compile_artifacts(cls, data: bytes) -> "VllmSerializableFunction": + from torch._guards import TracingContext, tracing + from torch._subclasses import FakeTensorMode + from torch.fx._graph_pickler import GraphPickler + from torch.fx.experimental.symbolic_shapes import ShapeEnv + + from vllm.compilation.backends import VllmBackend + + state = pickle.loads(data) + fake_mode = FakeTensorMode(shape_env=ShapeEnv()) + state["graph_module"] = GraphPickler.loads(state["graph_module"], fake_mode) + state["example_inputs"] = GraphPickler.loads(state["example_inputs"], fake_mode) + vllm_backend = VllmBackend(get_current_vllm_config(), state["prefix"]) + + def optimized_call(*example_inputs): + """ + On the first run of the optimized call, we rerun the compiler + backend which should result in a cache hit. After the backend + call returns, we just do a one-time replacement of the optimized + call with the compiled function, so that subsequent calls are on + the AOT compiled path. + """ + compile_inputs = [ + inp or example_inputs[i] for i, inp in enumerate(fn.example_inputs) + ] + with tracing(TracingContext(fake_mode)): + fn.optimized_call = vllm_backend( + state["graph_module"], compile_inputs + ).optimized_call + return fn.optimized_call(*example_inputs) + + fn = cls(**state, optimized_call=optimized_call) + return fn + + @property + def co_name(self): + """ + Used for depyf debugging. + """ + return "VllmSerializableFunction" + + +def compilation_config_hash_factors(vllm_config: VllmConfig) -> list[str]: + factors = [] + # 0. 
factors come from the env, for example, the values of + # VLLM_PP_LAYER_PARTITION will affect the computation graph. + env_hash = envs.compute_hash() + factors.append(env_hash) + + # 1. factors come from the vllm_config (it mainly summarizes how the + # model is created) + config_hash = vllm_config.compute_hash() + factors.append(config_hash) + return factors + + +def _compute_code_hash_with_content(file_contents: dict[str, str]) -> str: + items = list(sorted(file_contents.items(), key=lambda x: x[0])) + hash_content = [] + for filepath, content in items: + hash_content.append(filepath) + if filepath == "<string>": + # This means the function was dynamically generated, with + # e.g. exec(). We can't actually check these. + continue + hash_content.append(content) + return hashlib.md5( + "\n".join(hash_content).encode(), usedforsecurity=False + ).hexdigest() + + +def _compute_code_hash(files: set[str]) -> str: + logger.debug( + "Traced files (to be considered for compilation cache):\n%s", "\n".join(files) + ) + file_contents = {} + for filepath in files: + # Skip files that don't exist (e.g., <string>, <frozen modules>, etc.) 
+ if not os.path.isfile(filepath): + file_contents[filepath] = "" + else: + with open(filepath) as f: + file_contents[filepath] = f.read() + return _compute_code_hash_with_content(file_contents) diff --git a/compilation/collective_fusion.py b/compilation/collective_fusion.py new file mode 100644 index 0000000..69d4606 --- /dev/null +++ b/compilation/collective_fusion.py @@ -0,0 +1,1234 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from importlib.util import find_spec + +import torch +import torch._inductor.pattern_matcher as pm +import torch.fx as fx +from torch._higher_order_ops.auto_functionalize import auto_functionalized +from torch._inductor.pattern_matcher import PatternMatcherPass +from torch.distributed._symmetric_memory import enable_symm_mem_for_group + +from vllm.config import VllmConfig +from vllm.distributed import get_tp_group, tensor_model_parallel_all_reduce +from vllm.distributed.parallel_state import ( + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + kFp8StaticTensorSym, +) +from vllm.platforms import current_platform +from vllm.utils.torch_utils import direct_register_custom_op + +from .inductor_pass import enable_fake_mode +from .matcher_utils import MatcherFusedAddRMSNorm, MatcherQuantFP8, MatcherRMSNorm +from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass + +FP8_DTYPE = current_platform.fp8_dtype() + +if find_spec("flashinfer"): + try: + import flashinfer.comm as flashinfer_comm + + flashinfer_comm = ( + flashinfer_comm + if hasattr(flashinfer_comm, "trtllm_allreduce_fusion") + else None + ) + except ImportError: + flashinfer_comm = None +else: + flashinfer_comm = None + +logger = init_logger(__name__) + +if hasattr(torch.ops._C, "scaled_fp4_quant"): + STATIC_FP4_QUANT_OP = torch.ops._C.scaled_fp4_quant.default + + 
+class BasePattern: + def __init__(self, dtype: torch.dtype, device: str): + self.dtype = dtype + self.device = device + self.tp = get_tp_group() + self.tp_size = get_tensor_model_parallel_world_size() + + +class GEMMReduceScatterPattern(BasePattern): + def get_inputs(self): + mul = torch.empty([16, 4], device=self.device, dtype=self.dtype) + mm_weight = torch.empty([4, 4], device=self.device, dtype=self.dtype) + return [mul, mm_weight] + + def register(self, pm_pass: PatternMatcherPass): + def pattern(mul: torch.Tensor, mm_weight: torch.Tensor): + mm = torch.ops.aten.mm.default(mul, mm_weight) + reduce_scatter = torch.ops.vllm.reduce_scatter.default( + mm, + dim=0, + world_size=self.tp_size, + group_name=self.tp.unique_name, + ) + return reduce_scatter + + def replacement(mul: torch.Tensor, mm_weight: torch.Tensor): + gemm_rs = torch.ops.symm_mem.fused_matmul_reduce_scatter( + mul, + mm_weight, + "avg", + scatter_dim=0, + group_name=self.tp.device_group.group_name, + ) + + return gemm_rs + + pm.register_replacement( + pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass + ) + + +class AllGatherGEMMPattern(BasePattern): + def get_inputs(self): + x = torch.empty([4, 4], device=self.device, dtype=self.dtype) + weight = torch.empty([4, 4], device=self.device, dtype=self.dtype) + + return [x, weight] + + def register(self, pm_pass: PatternMatcherPass): + def pattern( + x: torch.Tensor, + weight: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + all_gather = torch.ops.vllm.all_gather.default( + x, + dim=0, + world_size=self.tp_size, + group_name=self.tp.unique_name, + ) + + return torch.ops.aten.mm.default(all_gather, weight) + + def replacement( + x: torch.Tensor, weight: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor]: + ag_output, mm_outputs = torch.ops.symm_mem.fused_all_gather_matmul( + x, + [weight], + gather_dim=0, + group_name=self.tp.device_group.group_name, + ) + return mm_outputs + + pm.register_replacement( + pattern, replacement, 
self.get_inputs(), pm.fwd_only, pm_pass + ) + + +class ScaledMMReduceScatterPattern(BasePattern): + def get_inputs(self): + input = torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE) + mm_weight = ( + torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE) + .contiguous() + .transpose(0, 1) + ) + scale_a = torch.empty([16, 1], device=self.device, dtype=torch.float32) + scale_b = torch.empty([1, 16], device=self.device, dtype=torch.float32) + return [input, mm_weight, scale_a, scale_b] + + def register(self, pm_pass: PatternMatcherPass): + def pattern( + input: torch.Tensor, + mat2: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + ) -> torch.Tensor: + scaled_mm = torch.ops.aten._scaled_mm.default( + input, + mat2=mat2, + scale_a=scale_a, + scale_b=scale_b, + bias=None, + scale_result=None, + out_dtype=self.dtype, + ) + reduce_scatter = torch.ops.vllm.reduce_scatter.default( + scaled_mm, + dim=0, + world_size=self.tp_size, + group_name=self.tp.unique_name, + ) + return reduce_scatter + + def replacement( + input: torch.Tensor, + mat2: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + ) -> torch.Tensor: + # Calculate output shape: input @ mat2 with scatter_dim reduced + output_shape = [*input.shape[:-1], mat2.shape[1]] + scatter_dim = 0 + gemm_rs = torch.ops.vllm.patched_fused_scaled_matmul_reduce_scatter( + input, + mat2, + scale_a, + scale_b, + "avg", + scatter_dim, # orig_scatter_dim + scatter_dim, # scatter_dim_after_maybe_reshape + self.tp.device_group.group_name, + output_shape, + None, # bias + None, # result_scale + self.dtype, # out_dtype + False, # use_fast_accum + ) + + return gemm_rs + + pm.register_replacement( + pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass + ) + + +class AllGatherScaledMMPattern(BasePattern): + def get_inputs(self): + x = torch.empty([8, 16], device=self.device, dtype=FP8_DTYPE) + weight = ( + torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE) + .contiguous() + 
.transpose(0, 1) + ) + + s1 = x.shape[0] * self.tp_size + + scale_a = torch.empty([s1, 1], device=self.device, dtype=torch.float32) + scale_b = torch.empty([1, 16], device=self.device, dtype=torch.float32) + + return [x, weight, scale_a, scale_b] + + def register(self, pm_pass: PatternMatcherPass): + def pattern( + x: torch.Tensor, + weight: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + ) -> torch.Tensor: + all_gather = torch.ops.vllm.all_gather.default( + x, dim=0, world_size=self.tp_size, group_name=self.tp.unique_name + ) + + return torch.ops.aten._scaled_mm.default( + all_gather, + mat2=weight, + scale_a=scale_a, + scale_b=scale_b, + bias=None, + scale_result=None, + out_dtype=self.dtype, + ) + + def replacement( + x: torch.Tensor, + weight: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + ) -> torch.Tensor: + ag_output, mm_outputs = torch.ops.symm_mem.fused_all_gather_scaled_matmul( # noqa + x, + [weight], + scale_a, + [scale_b], + gather_dim=0, + biases=[None], + result_scales=[None], + out_dtypes=[self.dtype], + use_fast_accum=[False], + group_name=self.tp.device_group.group_name, + ) + return mm_outputs + + pm.register_replacement( + pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass + ) + + +class CutlassScaledMMReduceScatterPattern(BasePattern): + def get_inputs(self): + input = torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE) + mm_weight = ( + torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE) + .contiguous() + .transpose(0, 1) + ) + scale_a = torch.empty([16, 1], device=self.device, dtype=torch.float32) + scale_b = torch.empty([1, 16], device=self.device, dtype=torch.float32) + + cutlass_mm_output = torch.empty([16, 16], device=self.device, dtype=self.dtype) + return [input, mm_weight, scale_a, scale_b, cutlass_mm_output] + + def register(self, pm_pass: PatternMatcherPass): + def pattern( + input: torch.Tensor, + weight: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + 
cutlass_mm_output: torch.Tensor, + ) -> torch.Tensor: + cutlass_scaled_mm = torch.ops.higher_order.auto_functionalized( + torch.ops._C.cutlass_scaled_mm.default, + out=cutlass_mm_output, + a=input, + b=weight, + a_scales=scale_a, + b_scales=scale_b, + bias=None, + ) + + reduce_scatter = torch.ops.vllm.reduce_scatter.default( + cutlass_scaled_mm[1], + dim=0, + world_size=self.tp_size, + group_name=self.tp.unique_name, + ) + return reduce_scatter + + def replacement( + input: torch.Tensor, + mat2: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + cutlass_mm_output: torch.Tensor, + ) -> torch.Tensor: + # Calculate output shape: input @ mat2 with scatter_dim reduced + output_shape = [*input.shape[:-1], mat2.shape[1]] + scatter_dim = 0 + gemm_rs = torch.ops.vllm.patched_fused_scaled_matmul_reduce_scatter( + input, + mat2, + scale_a, + scale_b, + "avg", + scatter_dim, # orig_scatter_dim + scatter_dim, # scatter_dim_after_maybe_reshape + self.tp.device_group.group_name, + output_shape, + None, # bias + None, # result_scale + self.dtype, # out_dtype + False, # use_fast_accum + ) + + return gemm_rs + + pm.register_replacement( + pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass + ) + + +class AllGatherCutlassScaledMMPattern(BasePattern): + def get_inputs(self): + x = torch.empty([8, 16], device=self.device, dtype=FP8_DTYPE) + weight = ( + torch.empty([16, 16], device=self.device, dtype=FP8_DTYPE) + .contiguous() + .transpose(0, 1) + ) + + s1 = x.shape[0] * self.tp_size + + scale_a = torch.empty([s1, 1], device=self.device, dtype=torch.float32) + scale_b = torch.empty([1, 16], device=self.device, dtype=torch.float32) + + s2 = weight.shape[1] + output = torch.empty([s1, s2], device=self.device, dtype=self.dtype) + + return [x, weight, scale_a, scale_b, output] + + def register(self, pm_pass: PatternMatcherPass): + def pattern( + x: torch.Tensor, + weight: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + output: torch.Tensor, + ) -> 
torch.Tensor: + all_gather = torch.ops.vllm.all_gather.default( + x, dim=0, world_size=self.tp_size, group_name=self.tp.unique_name + ) + + cutlass_scaled_mm = torch.ops.higher_order.auto_functionalized( + torch.ops._C.cutlass_scaled_mm.default, + out=output, + a=all_gather, + b=weight, + a_scales=scale_a, + b_scales=scale_b, + bias=None, + ) + return cutlass_scaled_mm[1] + + def replacement( + x: torch.Tensor, + weight: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + output: torch.Tensor, + ) -> torch.Tensor: + ag_output, mm_outputs = torch.ops.symm_mem.fused_all_gather_scaled_matmul( # noqa + x, + [weight], + scale_a, + [scale_b], + gather_dim=0, + biases=[None], + result_scales=[None], + out_dtypes=[self.dtype], + use_fast_accum=[False], + group_name=self.tp.device_group.group_name, + ) + return mm_outputs + + pm.register_replacement( + pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass + ) + + +class AsyncTPPass(VllmPatternMatcherPass): + @enable_fake_mode + def __init__(self, config: VllmConfig): + super().__init__(config) + + # Enable symmetric memory for the TP process group + enable_symm_mem_for_group(get_tp_group().device_group.group_name) + self.patterns: PatternMatcherPass = PatternMatcherPass( + pass_name="async_tp_pass" + ) + GEMMReduceScatterPattern(self.model_dtype, self.device).register(self.patterns) + + AllGatherGEMMPattern(self.model_dtype, self.device).register(self.patterns) + + # These fusions are enabled only for bfloat16 models because + # `scaled_mm` or `cutlass_scaled_mm` with per-token (row-wise) scaling + # only supports bfloat16 as the output dtype. 
def call_trtllm_fused_allreduce_norm(
    allreduce_in: torch.Tensor,
    residual: torch.Tensor,
    rms_gamma: torch.Tensor,
    rms_eps: float,
    world_rank: int,
    world_size: int,
    launch_with_pdl: bool,
    trigger_completion_at_end: bool,
    fp32_acc: bool,
    max_token_num: int,
    pattern_code: int,
    norm_out: torch.Tensor | None = None,
    quant_out: torch.Tensor | None = None,
    scale_out: torch.Tensor | None = None,
    scale_factor: torch.Tensor | None = None,
) -> None:
    """Run all-reduce + RMSNorm (optionally + quantization), writing in place.

    If the number of tokens is at most ``max_token_num`` the work is handed
    to ``flashinfer_comm.trtllm_allreduce_fusion``. Otherwise it falls back
    to ``tensor_model_parallel_all_reduce`` followed by the separate
    ``torch.ops._C`` norm / quant kernels.

    Nothing is returned; results are written into the tensor arguments.

    Args:
        allreduce_in: Tensor to reduce. It is unpacked as
            ``(num_tokens, hidden_size)``, so it must be 2-D here. It is
            also used as an output buffer (see the branches below).
        residual: Residual tensor; consumed by the fused-add variants.
        rms_gamma: RMSNorm weight.
        rms_eps: RMSNorm epsilon.
        world_rank: Forwarded to the FlashInfer kernel.
        world_size: Forwarded to the FlashInfer kernel and used to look up
            the one-shot size limit.
        launch_with_pdl: Forwarded to the FlashInfer kernel.
        trigger_completion_at_end: Forwarded to the FlashInfer kernel.
        fp32_acc: Forwarded to the FlashInfer kernel.
        max_token_num: Largest token count routed to the FlashInfer kernel.
        pattern_code: Forwarded to the FlashInfer kernel (callers in this
            file pass ``flashinfer_comm.AllReduceFusionPattern`` members).
        norm_out: Separate buffer for the normalized output. ``None``
            selects the "fused add" handling in which ``residual`` takes
            part and the normalized result replaces ``allreduce_in``.
        quant_out: Buffer for the quantized output, if quantizing.
        scale_out: Buffer for output scales. In the fallback path its
            presence together with ``scale_factor`` selects
            ``scaled_fp4_quant``; its absence selects the static FP8 ops.
        scale_factor: Quantization scale; ``None`` means no quantization
            in the fallback path.
    """
    num_tokens, hidden_size = allreduce_in.shape
    element_size = allreduce_in.element_size()
    # Size of the input in bytes, compared against the one-shot limit below.
    current_tensor_size = num_tokens * hidden_size * element_size

    if num_tokens <= max_token_num:
        device_capability = current_platform.get_device_capability().to_int()
        # Get one shot input size limit for the current world size
        # for the current device capability
        max_one_shot_size_mb = _FI_ALLREDUCE_ONE_SHOT_MAX_SIZES_MB.get(
            device_capability, {}
        ).get(world_size, None)
        # Use one shot if no max size for one shot is specified
        use_oneshot = (
            max_one_shot_size_mb is None
            or current_tensor_size <= max_one_shot_size_mb * MiB
        )

        assert _FI_WORKSPACE_TENSOR is not None, (
            "Flashinfer must be enabled when using flashinfer"
        )
        if norm_out is None:
            # Fused-add variant: the normalized result overwrites the input
            # and the residual is updated in its own buffer.
            norm_out = allreduce_in
            residual_out = residual
        else:
            # return residual_out as allreduce_out with zeroed residual_in
            # as flashinfer does not support rms_norm
            # and allreduce_out together
            residual_out = allreduce_in
        # For the sizes that are smaller than the max size,
        # we only use flashinfer one shot allreduce
        flashinfer_comm.trtllm_allreduce_fusion(
            allreduce_in=allreduce_in,
            token_num=allreduce_in.shape[0],
            residual_in=residual,
            residual_out=residual_out,
            norm_out=norm_out,
            rms_gamma=rms_gamma,
            rms_eps=rms_eps,
            world_rank=world_rank,
            world_size=world_size,
            hidden_dim=allreduce_in.shape[-1],
            workspace_ptrs=_FI_WORKSPACE_TENSOR,
            launch_with_pdl=launch_with_pdl,
            use_oneshot=use_oneshot,
            trigger_completion_at_end=trigger_completion_at_end,
            fp32_acc=fp32_acc,
            pattern_code=pattern_code,
            allreduce_out=None,
            quant_out=quant_out,
            scale_out=scale_out,
            # in vllm we only support swizzled layout
            layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
            scale_factor=scale_factor,
        )
    else:
        # Fallback: too many tokens for the fused kernel, so do the
        # all-reduce and the norm / quant steps as separate ops.
        allreduce_out = tensor_model_parallel_all_reduce(allreduce_in)
        if scale_factor is not None and scale_out is None:
            # Do fused rms norm static fp8 quant fused op
            if norm_out is None:
                torch.ops._C.fused_add_rms_norm_static_fp8_quant(
                    quant_out,
                    allreduce_out,
                    residual,
                    rms_gamma,
                    scale_factor,
                    rms_eps,
                )
            else:
                torch.ops._C.rms_norm_static_fp8_quant(
                    quant_out, allreduce_out, rms_gamma, scale_factor, rms_eps
                )
        else:
            if norm_out is None:
                torch.ops._C.fused_add_rms_norm(
                    allreduce_out, residual, rms_gamma, rms_eps
                )
                # From here on the reduced buffer is treated as the
                # normalized output.
                norm_out = allreduce_out
            else:
                torch.ops._C.rms_norm(norm_out, allreduce_out, rms_gamma, rms_eps)
            if scale_factor is not None and scale_out is not None:
                torch.ops._C.scaled_fp4_quant(
                    quant_out, norm_out, scale_out, scale_factor
                )
        # NOTE(review): norm_out is rebound above in the non-FP8 fused-add
        # branch, so this condition also holds for fused-add + FP4 quant,
        # which the comment below does not list - confirm that is intended.
        if scale_factor is None or norm_out is not None:
            # we need to return allreduce output
            # in cases of non quant fused AR + RMS norm
            # and fused AR + RMS norm + quant without fused add
            allreduce_in.copy_(allreduce_out)
class FlashInferFusedAllReduceParams:
    """Bundle of settings shared by the FlashInfer fused allreduce calls.

    ``rank``, ``world_size``, ``use_fp32_lamport`` and ``max_token_num`` come
    from the caller; the three launch flags are fixed to ``True``.
    """

    # Attributes exported under their own name by
    # ``get_trtllm_fused_allreduce_kwargs`` (``rank`` is exported separately
    # as ``world_rank``). The order here is the order of the returned keys.
    _SAME_NAME_KWARGS = (
        "world_size",
        "launch_with_pdl",
        "trigger_completion_at_end",
        "fp32_acc",
        "max_token_num",
    )

    def __init__(
        self,
        rank: int,
        world_size: int,
        use_fp32_lamport: bool = False,
        max_token_num: int = 1024,
    ):
        # Caller-provided values.
        self.rank, self.world_size = rank, world_size
        self.use_fp32_lamport = use_fp32_lamport
        # Fixed launch options.
        self.trigger_completion_at_end = True
        self.launch_with_pdl = True
        self.fp32_acc = True
        self.max_token_num = max_token_num

    def get_trtllm_fused_allreduce_kwargs(self):
        """Return the keyword arguments to splat into the fused allreduce op.

        ``use_fp32_lamport`` is intentionally not part of the result.
        """
        kwargs = {"world_rank": self.rank}
        for attr_name in self._SAME_NAME_KWARGS:
            kwargs[attr_name] = getattr(self, attr_name)
        return kwargs
class AllReduceFusedAddRMSNormPattern(BasePattern):
    """
    This pattern replaces the allreduce + rms norm (with residual)
    with fused flashinfer implementation.
    Applies to o_proj + rmsnorm after attn and mlp + rmsnorm before attn.

    Two replacements are registered: one returning both the normalized
    output and the updated residual, and one returning only the normalized
    output.
    """

    def __init__(
        self,
        epsilon: float,
        dtype: torch.dtype,
        device: str,
        allreduce_params: FlashInferFusedAllReduceParams,
    ):
        super().__init__(dtype, device)
        self.epsilon = epsilon
        self.allreduce_params = allreduce_params
        # Produces the fused-add RMSNorm subgraph to match, and its example
        # inputs.
        self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon)

    def get_inputs(self):
        """Return example inputs, ordered as ``pattern``'s parameters."""
        input, residual, weight = self.rmsnorm_matcher.inputs()

        # input goes through allreduce first, always 16-bit
        return [residual, input.to(self.dtype), weight]

    def register(self, pm_pass: PatternMatcherPass):
        """Register the search/replace pairs of this pattern on ``pm_pass``."""

        def pattern(residual: torch.Tensor, input: torch.Tensor, weight: torch.Tensor):
            allreduce_output = tensor_model_parallel_all_reduce(input)
            rms, residual = self.rmsnorm_matcher(allreduce_output, weight, residual)
            return rms, residual

        def replacement(
            residual: torch.Tensor, input: torch.Tensor, weight: torch.Tensor
        ):
            # norm_out=None selects the fused-add handling of the custom op,
            # where the normalized result is written over `allreduce_in`.
            allreduce = auto_functionalized(
                flashinfer_trtllm_fused_allreduce_norm,
                allreduce_in=input,
                residual=residual,
                norm_out=None,
                quant_out=None,
                scale_out=None,
                rms_gamma=weight,
                rms_eps=self.epsilon,
                pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm,
                **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
            )
            # Result indices follow the op's registered mutates_args order:
            # [1]=allreduce_in, [2]=residual.
            # allreduce_in, residual
            return allreduce[1], allreduce[2]

        pm.register_replacement(
            pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
        )

        # Same pattern, but only return the output and not residual
        # (helpful for end of graph where residual is not used again)
        first_return_only = lambda fn: lambda a, b, c: fn(a, b, c)[0]

        pm.register_replacement(
            first_return_only(pattern),
            first_return_only(replacement),
            self.get_inputs(),
            pm.fwd_only,
            pm_pass,
        )
class AllReduceFusedAddRMSNormStaticQuantFP8Pattern(BasePattern):
    """
    This pattern replaces the allreduce + rms norm (with residual)
    + static fp8 quant with fused flashinfer implementation.
    Applies to o_proj + rmsnorm after attn + quant and
    mlp + rmsnorm + quant before attn.
    """

    def __init__(
        self,
        epsilon: float,
        dtype: torch.dtype,
        device: str,
        allreduce_params: FlashInferFusedAllReduceParams,
    ):
        super().__init__(dtype, device)
        self.epsilon = epsilon
        self.allreduce_params = allreduce_params
        # dtype of the quantized buffer allocated in `replacement`.
        self.quant_dtype = torch.float8_e4m3fn

        # Matchers produce the subgraphs to search for and their example
        # inputs.
        self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon)
        self.quant_matcher = MatcherQuantFP8(kFp8StaticTensorSym)

    def register(self, pm_pass: PatternMatcherPass):
        """Register this pattern and its fused replacement on ``pm_pass``."""

        def get_inputs():
            # Example inputs, ordered as the parameters of `pattern`.
            input, residual, weight = self.rmsnorm_matcher.inputs()
            _, scale = self.quant_matcher.inputs()

            # input goes through allreduce first, always 16-bit
            return [residual, input.to(self.dtype), weight, scale]

        def pattern(
            residual: torch.Tensor,
            input: torch.Tensor,
            weight: torch.Tensor,
            scale: torch.Tensor,
        ):
            allreduce_output = tensor_model_parallel_all_reduce(input)
            rms, res = self.rmsnorm_matcher(allreduce_output, weight, residual)
            quant, _ = self.quant_matcher(rms, scale)

            return quant, res

        def replacement(
            residual: torch.Tensor,
            input: torch.Tensor,
            weight: torch.Tensor,
            scale: torch.Tensor,
        ):
            result_quant = torch.empty_like(input, dtype=self.quant_dtype)
            # norm_out=None selects the fused-add handling of the custom op;
            # scale_out=None together with scale_factor selects the FP8 path
            # in its fallback branch.
            allreduce = auto_functionalized(
                flashinfer_trtllm_fused_allreduce_norm,
                allreduce_in=input,
                residual=residual,
                norm_out=None,
                quant_out=result_quant,
                scale_out=None,
                rms_gamma=weight,
                rms_eps=self.epsilon,
                # We don't use norm_out afterwards
                pattern_code=(
                    flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant
                ),
                scale_factor=scale,
                **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
            )
            # Result indices follow the op's registered mutates_args order:
            # [2]=residual, [4]=quant_out.
            # quant_out, rms_norm_residual
            return allreduce[4], allreduce[2]

        pm.register_replacement(
            pattern, replacement, get_inputs(), pm.fwd_only, pm_pass
        )
AllReduceFusedRMSNormStaticQuantNVFP4Pattern(BasePattern): + """ + This pattern replaces the allreduce + rms norm (without residual) + + static nvfp4 quant with fused flashinfer implementation. + Applies to allreduce + rmsnorm + quant before attn + in the first Transformer block. + """ + + def __init__( + self, + epsilon: float, + dtype: torch.dtype, + device: str, + allreduce_params: FlashInferFusedAllReduceParams, + ): + super().__init__(dtype, device) + self.epsilon = epsilon + self.allreduce_params = allreduce_params + self.rmsnorm_matcher = MatcherRMSNorm(epsilon) + + def register(self, pm_pass: PatternMatcherPass): + def get_inputs(): + input = torch.empty([1, 16, 16], device=self.device, dtype=self.dtype) + quant_result = torch.empty((16, 8), device=self.device, dtype=torch.uint8) + input_global_scale = torch.empty( + [1, 1], device=self.device, dtype=torch.float32 + ) + weight = torch.empty([16], device=self.device, dtype=self.dtype) + output_scale = torch.empty([128, 4], device=self.device, dtype=torch.int32) + + return [input, quant_result, weight, input_global_scale, output_scale] + + def pattern( + input: torch.Tensor, + quant_result: torch.Tensor, + weight: torch.Tensor, + input_global_scale: torch.Tensor, + output_scale: torch.Tensor, + ): + all_reduce = tensor_model_parallel_all_reduce(input) + rms = self.rmsnorm_matcher(all_reduce, weight) + quant_out_tuple = auto_functionalized( + STATIC_FP4_QUANT_OP, + output=quant_result, + input=rms, + output_scale=output_scale, + input_scale=input_global_scale, + ) + + # quant_out, allreduce_output, output_scale + return quant_out_tuple[1], all_reduce, quant_out_tuple[2] + + def replacement( + input: torch.Tensor, + quant_result: torch.Tensor, + weight: torch.Tensor, + input_global_scale: torch.Tensor, + output_scale: torch.Tensor, + ): + residual = torch.zeros_like(input) + result_rms = torch.empty_like(input) + allreduce = auto_functionalized( + flashinfer_trtllm_fused_allreduce_norm, + allreduce_in=input, + 
class AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern(BasePattern):
    """
    This pattern replaces the allreduce + rms norm (with residual)
    + static nvfp4 quant with fused flashinfer implementation.
    Applies to o_proj + rmsnorm after attn + quant and
    mlp + rmsnorm + quant before attn.
    """

    def __init__(
        self,
        epsilon: float,
        dtype: torch.dtype,
        device: str,
        allreduce_params: FlashInferFusedAllReduceParams,
    ):
        super().__init__(dtype, device)
        self.epsilon = epsilon
        self.allreduce_params = allreduce_params
        # Produces the fused-add RMSNorm subgraph to search for.
        self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon)

    def register(self, pm_pass: PatternMatcherPass):
        """Register this pattern and its fused replacement on ``pm_pass``."""

        def get_inputs():
            # Example tensors used only to trace `pattern` / `replacement`;
            # the list order must match their parameter order.
            input = torch.empty([16, 16], device=self.device, dtype=self.dtype)

            residual = torch.empty([16, 16], device=self.device, dtype=self.dtype)
            # NOTE(review): weight is 2-D here, while the sibling pattern
            # without residual (AllReduceFusedRMSNormStaticQuantNVFP4Pattern)
            # uses a 1-D [16] weight - confirm the shape is intended.
            weight = torch.empty([16, 16], device=self.device, dtype=self.dtype)
            quant_result = torch.empty((16, 8), device=self.device, dtype=torch.uint8)
            input_global_scale = torch.empty(
                [1, 1], device=self.device, dtype=torch.float32
            )
            output_scale = torch.empty([128, 4], device=self.device, dtype=torch.int32)

            return [
                quant_result,
                residual,
                input,
                output_scale,
                weight,
                input_global_scale,
            ]

        def pattern(
            quant_result: torch.Tensor,
            residual: torch.Tensor,
            input: torch.Tensor,
            output_scale: torch.Tensor,
            weight: torch.Tensor,
            input_global_scale: torch.Tensor,
        ):
            allreduce_output = tensor_model_parallel_all_reduce(input)
            rms, residual = self.rmsnorm_matcher(allreduce_output, weight, residual)
            quant_out_tuple = auto_functionalized(
                STATIC_FP4_QUANT_OP,
                output=quant_result,
                input=rms,
                output_scale=output_scale,
                input_scale=input_global_scale,
            )

            # quant_out, residual (second output of the fused-add matcher),
            # output_scale
            return quant_out_tuple[1], residual, quant_out_tuple[2]

        def replacement(
            quant_result: torch.Tensor,
            residual: torch.Tensor,
            input: torch.Tensor,
            output_scale: torch.Tensor,
            weight: torch.Tensor,
            input_global_scale: torch.Tensor,
        ):
            # norm_out=None selects the fused-add handling of the custom op;
            # passing both scale_out and scale_factor selects the FP4 quant
            # path in its fallback branch.
            allreduce = auto_functionalized(
                flashinfer_trtllm_fused_allreduce_norm,
                allreduce_in=input,
                residual=residual,
                norm_out=None,
                quant_out=quant_result,
                scale_out=output_scale,
                rms_gamma=weight,
                rms_eps=self.epsilon,
                # We don't use norm_out afterwards
                pattern_code=(
                    flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant
                ),
                scale_factor=input_global_scale,
                **self.allreduce_params.get_trtllm_fused_allreduce_kwargs(),
            )
            # Result indices follow the op's registered mutates_args order:
            # [2]=residual, [4]=quant_out, [5]=scale_out.
            # quant_out, rms_norm_residual, output_scale
            return allreduce[4], allreduce[2], allreduce[5]

        pm.register_replacement(
            pattern, replacement, get_inputs(), pm.fwd_only, pm_pass
        )
config.compilation_config.pass_config.flashinfer_max_size( + self.tp_size + ) + if max_size is None: + # Flashinfer doesn't support current world size + logger.warning( + "Flashinfer allreduce fusion is not supported for world size %s", + self.tp_size, + ) + return + element_size = 4 if use_fp32_lamport else 2 + self.max_token_num = max_size // (self.hidden_dim * element_size) + # take the min to save workspace size and we'll never use more + # than max_num_batched_tokens anyways + self.max_token_num = min( + self.max_token_num, config.scheduler_config.max_num_batched_tokens + ) + logger.debug_once( + f"Flashinfer max size: {max_size // (1024 * 1024)} MB," + "Maximal number of tokens used by " + f"Flashinfer Allreduce Fusion: {self.max_token_num}", + scope="global", + ) + + self.ipc_handles, workspace_tensor = ( + flashinfer_comm.trtllm_create_ipc_workspace_for_all_reduce_fusion( + tp_rank=rank, + tp_size=self.tp_size, + max_token_num=self.max_token_num, + hidden_dim=self.hidden_dim, + group=self.group, + use_fp32_lamport=use_fp32_lamport, + ) + ) + + global _FI_WORKSPACE_TENSOR + _FI_WORKSPACE_TENSOR = workspace_tensor + self.allreduce_params = FlashInferFusedAllReduceParams( + rank=rank, + world_size=self.tp_size, + use_fp32_lamport=use_fp32_lamport, + max_token_num=self.max_token_num, + ) + + self.register_patterns() + self.dump_patterns(config, self.patterns) + + @enable_fake_mode + def register_patterns(self): + for epsilon in [1e-5, 1e-6]: + AllReduceFusedRMSNormStaticQuantFP8Pattern( + epsilon, + self.model_dtype, + self.device, + self.allreduce_params, + ).register(self.patterns) + AllReduceFusedAddRMSNormStaticQuantFP8Pattern( + epsilon, + self.model_dtype, + self.device, + self.allreduce_params, + ).register(self.patterns) + if current_platform.has_device_capability(100): + AllReduceFusedRMSNormStaticQuantNVFP4Pattern( + epsilon, + self.model_dtype, + self.device, + self.allreduce_params, + ).register(self.patterns) + 
AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern( + epsilon, + self.model_dtype, + self.device, + self.allreduce_params, + ).register(self.patterns) + AllReduceRMSNormPattern( + epsilon, + self.model_dtype, + self.device, + self.allreduce_params, + ).register(self.patterns) + AllReduceFusedAddRMSNormPattern( + epsilon, + self.model_dtype, + self.device, + self.allreduce_params, + ).register(self.patterns) + + # WARNING: This is a hack to clear the pattern matcher cache + # and allow multiple values of epsilon. + torch._inductor.pattern_matcher._seen_patterns.clear() + + self.disabled = False + + @VllmInductorPass.time_and_log + def __call__(self, graph: fx.Graph): + if self.disabled: + logger.debug("AllReduceFusionPass disabled") + return + + self.matched_count = self.patterns.apply(graph) + logger.debug("Replaced %s patterns", self.matched_count) + + def __del__(self): + if getattr(self, "disabled", True): + return + if flashinfer_comm is not None: + flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce( + self.ipc_handles, self.group + ) diff --git a/compilation/compiler_interface.py b/compilation/compiler_interface.py new file mode 100644 index 0000000..aee5790 --- /dev/null +++ b/compilation/compiler_interface.py @@ -0,0 +1,639 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import contextlib +import copy +import hashlib +import os +from collections.abc import Callable +from contextlib import ExitStack +from typing import Any, Literal +from unittest.mock import patch + +import torch +import torch._inductor.compile_fx +import torch.fx as fx + +import vllm.envs as envs +from vllm.compilation.counter import compilation_counter +from vllm.config import VllmConfig +from vllm.utils.torch_utils import is_torch_equal_or_newer + + +class CompilerInterface: + """ + The interface for a compiler that can be used by vLLM. + """ + + # The name of the compiler, e.g. inductor. 
+ # This is a class-level attribute. + name: str + + def initialize_cache( + self, cache_dir: str, disable_cache: bool = False, prefix: str = "" + ): + """ + when the vLLM process uses `cache_dir` as the cache directory, + the compiler should initialize itself with the cache directory, + e.g. by re-directing its own cache directory to a sub-directory. + + prefix can be used in combination with cache_dir to figure out the base + cache directory, e.g. there're multiple parts of model being compiled, + but we want to share the same cache directory for all of them. + + e.g. + cache_dir = "/path/to/dir/backbone", prefix = "backbone" + cache_dir = "/path/to/dir/eagle_head", prefix = "eagle_head" + """ + pass + + def compute_hash(self, vllm_config: VllmConfig) -> str: + """ + Gather all the relevant information from the vLLM config, + to compute a hash so that we can cache the compiled model. + + See [`VllmConfig.compute_hash`][vllm.config.VllmConfig.compute_hash] + to check what information + is already considered by default. This function should only + consider the information that is specific to the compiler. + """ + return "" + + def compile( + self, + graph: fx.GraphModule, + example_inputs: list[Any], + compiler_config: dict[str, Any], + runtime_shape: int | None = None, + key: str | None = None, + ) -> tuple[Callable | None, Any | None]: + """ + Compile the graph with the given example inputs and compiler config, + with a runtime shape. If the `runtime_shape` is None, it means + the `example_inputs` have a dynamic shape. Otherwise, the + `runtime_shape` specifies the shape of the inputs. Right now we only + support one variable shape for all inputs, which is the batchsize + (number of tokens) during inference. + + Dynamo will make sure `graph(*example_inputs)` is valid. + + The function should return a compiled callable function, as well as + a handle that can be used to directly load the compiled function. 
class AlwaysHitShapeEnv:
    """
    Dummy shape environment whose guard checks always succeed.

    Background:

    A regular `torch.compile` run performs one Dynamo bytecode compilation
    and one Inductor compilation. The Inductor step runs inside the Dynamo
    compilation context, which is where it gets its dynamic shape
    information from.

    vLLM runs the Dynamo bytecode compilation only once and then invokes
    Inductor several times: for a general shape and for a number of
    specific shapes. The specific-shape compilations take place outside of
    the Dynamo context, so there is no shape environment to hand to
    Inductor and its code cache lookup would fail.

    Passing this stand-in instead makes the Inductor code cache lookup
    always hit, so the graph can be compiled for whichever shapes are
    needed.

    The set of stubbed methods was found by trial-and-error until the
    lookup worked.
    """

    def __init__(self) -> None:
        # Always empty: this environment never records any guard.
        self.guards: list[Any] = list()

    def evaluate_guards_expression(self, *args, **kwargs):
        """Ignore all arguments and report the guards as satisfied."""
        return True

    def get_pruned_guards(self, *args, **kwargs):
        """Ignore all arguments and return a new empty guard list."""
        return list()

    def produce_guards_expression(self, *args, **kwargs):
        """Ignore all arguments and return an empty guard expression."""
        return str()
+ """ + + name = "inductor_standalone" + + def __init__(self, save_format: Literal["binary", "unpacked"]): + self.save_format = save_format + + def compute_hash(self, vllm_config: VllmConfig) -> str: + factors = get_inductor_factors() + hash_str = hashlib.md5( + str(factors).encode(), usedforsecurity=False + ).hexdigest()[:10] + return hash_str + + def initialize_cache( + self, cache_dir: str, disable_cache: bool = False, prefix: str = "" + ): + self.cache_dir = cache_dir + + def compile( + self, + graph: fx.GraphModule, + example_inputs: list[Any], + compiler_config: dict[str, Any], + runtime_shape: int | None = None, + key: str | None = None, + ) -> tuple[Callable | None, Any | None]: + compilation_counter.num_inductor_compiles += 1 + current_config = {} + if compiler_config is not None: + current_config.update(compiler_config) + set_inductor_config(current_config, runtime_shape) + # set_functorch_config() + + if isinstance(runtime_shape, int): + dynamic_shapes = "from_example_inputs" + else: + dynamic_shapes = "from_tracing_context" + + from torch._inductor import standalone_compile + + compiled_graph = standalone_compile( + graph, + example_inputs, + dynamic_shapes=dynamic_shapes, + options={"config_patches": current_config}, + ) + + # Save the compiled artifact to disk in the specified path + assert key is not None + path = os.path.join(self.cache_dir, key) + + if is_compile_cache_enabled(compiler_config): + compiled_graph.save(path=path, format=self.save_format) + compilation_counter.num_compiled_artifacts_saved += 1 + return compiled_graph, (key, path) + + def load( + self, + handle: Any, + graph: fx.GraphModule, + example_inputs: list[Any], + graph_index: int, + runtime_shape: int | None = None, + ) -> Callable: + assert isinstance(handle, tuple) + assert isinstance(handle[0], str) + assert isinstance(handle[1], str) + path = handle[1] + inductor_compiled_graph = torch._inductor.CompiledArtifact.load( + path=path, format=self.save_format + ) + from 
def initialize_cache(
    self, cache_dir: str, disable_cache: bool = False, prefix: str = ""
):
    """Record the cache locations and redirect Inductor/Triton caches.

    Args:
        cache_dir: Cache directory for the part of the model being
            compiled, e.g. ``"/path/to/dir/backbone"``.
        disable_cache: When true, only the attributes are recorded; no
            directory is created and no environment variable is set.
        prefix: Trailing component of ``cache_dir`` (e.g. ``"backbone"``).
            It is stripped to obtain the base directory shared by all
            compiled parts.
    """
    self.cache_dir = cache_dir
    self.prefix = prefix
    # Strip `prefix` only when `cache_dir` really ends with it. Slicing by
    # len(prefix) would cut arbitrary characters off a directory that does
    # not carry the suffix. An empty prefix leaves `cache_dir` unchanged.
    self.base_cache_dir = cache_dir.removesuffix(prefix)
    if disable_cache:
        return
    # redirect the cache directory to a subdirectory
    # set flags so that Inductor and Triton store their cache
    # in the cache_dir, then users only need to copy the cache_dir
    # to another machine to reuse the cache.
    inductor_cache = os.path.join(self.base_cache_dir, "inductor_cache")
    os.makedirs(inductor_cache, exist_ok=True)
    os.environ["TORCHINDUCTOR_CACHE_DIR"] = inductor_cache
    triton_cache = os.path.join(self.base_cache_dir, "triton_cache")
    os.makedirs(triton_cache, exist_ok=True)
    os.environ["TRITON_CACHE_DIR"] = triton_cache
+ + hash_str, file_path = None, None + from torch._inductor.codecache import FxGraphCache, compiled_fx_graph_hash + + if torch.__version__.startswith("2.5"): + original_load = FxGraphCache.load + original_load_name = "torch._inductor.codecache.FxGraphCache.load" + + def hijack_load(*args, **kwargs): + inductor_compiled_graph = original_load(*args, **kwargs) + nonlocal file_path + compiled_fn = inductor_compiled_graph.current_callable + file_path = compiled_fn.__code__.co_filename # noqa + if ( + not file_path.startswith(self.base_cache_dir) + and compiled_fn.__closure__ is not None + ): + # hooked in the align_inputs_from_check_idxs function + # in torch/_inductor/utils.py + for cell in compiled_fn.__closure__: + if not callable(cell.cell_contents): + continue + if cell.cell_contents.__code__.co_filename.startswith( + self.base_cache_dir + ): + # this is the real file path compiled from Inductor + file_path = cell.cell_contents.__code__.co_filename + break + return inductor_compiled_graph + + hijacked_compile_fx_inner = torch._inductor.compile_fx.compile_fx_inner # noqa + elif torch.__version__ >= "2.6": + # function renamed in 2.6 + original_load_name = None + + def hijacked_compile_fx_inner(*args, **kwargs): + output = torch._inductor.compile_fx.compile_fx_inner(*args, **kwargs) + nonlocal hash_str + inductor_compiled_graph = output + if inductor_compiled_graph is not None: + nonlocal file_path + compiled_fn = inductor_compiled_graph.current_callable + file_path = compiled_fn.__code__.co_filename # noqa + if ( + not file_path.startswith(self.base_cache_dir) + and compiled_fn.__closure__ is not None + ): + # hooked in the align_inputs_from_check_idxs function + # in torch/_inductor/utils.py + for cell in compiled_fn.__closure__: + if not callable(cell.cell_contents): + continue + code = cell.cell_contents.__code__ + if code.co_filename.startswith(self.base_cache_dir): + # this is the real file path + # compiled from Inductor + file_path = code.co_filename + break 
+ hash_str = inductor_compiled_graph._fx_graph_cache_key + return output + + def hijack_compiled_fx_graph_hash(*args, **kwargs): + out = compiled_fx_graph_hash(*args, **kwargs) + nonlocal hash_str + hash_str = out[0] + return out + + def _check_can_cache(*args, **kwargs): + # no error means it can be cached. + # Inductor refuses to cache the graph outside of Dynamo + # tracing context, and also disables caching for graphs + # with high-order ops. + # For vLLM, in either case, we want to cache the graph. + # see https://github.com/pytorch/pytorch/blob/9f5ebf3fc609105a74eab4ccc24932d6353ff566/torch/_inductor/codecache.py#L1221 # noqa + return + + def _get_shape_env() -> AlwaysHitShapeEnv: + return AlwaysHitShapeEnv() + + with ExitStack() as stack: + # hijack to get the compiled graph itself + if original_load_name is not None: + stack.enter_context(patch(original_load_name, hijack_load)) + + # for hijacking the hash of the compiled graph + stack.enter_context( + patch( + "torch._inductor.codecache.compiled_fx_graph_hash", + hijack_compiled_fx_graph_hash, + ) + ) + + # for providing a dummy shape environment + stack.enter_context( + patch( + "torch._inductor.codecache.FxGraphCache._get_shape_env", + _get_shape_env, + ) + ) + + from torch._functorch._aot_autograd.autograd_cache import AOTAutogradCache + + # torch 2.8+ on main uses _get_shape_env in AOTAutogradCache + if hasattr(AOTAutogradCache, "_get_shape_env"): + stack.enter_context( + patch( + "torch._functorch._aot_autograd.autograd_cache.AOTAutogradCache._get_shape_env", + _get_shape_env, + ) + ) + + # for forcing the graph to be cached + stack.enter_context( + patch( + "torch._inductor.codecache.FxGraphCache._check_can_cache", + _check_can_cache, + ) + ) + + # Dynamo metrics context, see method for more details. + stack.enter_context(self.metrics_context()) + + # Disable remote caching. When these are on, on remote cache-hit, + # the monkey-patched functions never actually get called. 
+ # vLLM today assumes and requires the monkey-patched functions to + # get hit. + # TODO(zou3519): we're going to replace this all with + # standalone_compile sometime. + if is_torch_equal_or_newer("2.6"): + stack.enter_context( + torch._inductor.config.patch(fx_graph_remote_cache=False) + ) + # InductorAdaptor (unfortunately) requires AOTAutogradCache + # to be turned off to run. It will fail to acquire the hash_str + # and error if not. + # StandaloneInductorAdaptor (PyTorch 2.8+) fixes this problem. + stack.enter_context( + torch._functorch.config.patch(enable_autograd_cache=False) + ) + stack.enter_context( + torch._functorch.config.patch(enable_remote_autograd_cache=False) + ) + + compiled_graph = compile_fx( + graph, + example_inputs, + inner_compile=hijacked_compile_fx_inner, + config_patches=current_config, + ) + + # Turn off the checks if we disable the compilation cache. + if is_compile_cache_enabled(compiler_config): + if hash_str is None: + raise RuntimeError( + "vLLM failed to compile the model. The most " + "likely reason for this is that a previous compilation " + "failed, leading to a corrupted compilation artifact. " + "We recommend trying to " + "remove ~/.cache/vllm/torch_compile_cache and try again " + "to see the real issue. 
" + ) + assert file_path is not None, ( + "failed to get the file path of the compiled graph" + ) + return compiled_graph, (hash_str, file_path) + + def load( + self, + handle: Any, + graph: fx.GraphModule, + example_inputs: list[Any], + graph_index: int, + runtime_shape: int | None = None, + ) -> Callable: + assert isinstance(handle, tuple) + assert isinstance(handle[0], str) + assert isinstance(handle[1], str) + hash_str = handle[0] + + from torch._functorch._aot_autograd.autograd_cache import AOTAutogradCache + from torch._inductor.codecache import FxGraphCache + + with ExitStack() as exit_stack: + exit_stack.enter_context( + patch( + "torch._inductor.codecache.FxGraphCache._get_shape_env", + lambda *args, **kwargs: AlwaysHitShapeEnv(), + ) + ) + # torch 2.8+ on main uses _get_shape_env in AOTAutogradCache + if hasattr(AOTAutogradCache, "_get_shape_env"): + exit_stack.enter_context( + patch( + "torch._functorch._aot_autograd.autograd_cache.AOTAutogradCache._get_shape_env", + lambda *args, **kwargs: AlwaysHitShapeEnv(), + ) + ) + + # Dynamo metrics context, see method for more details. + exit_stack.enter_context(self.metrics_context()) + + if torch.__version__.startswith("2.5"): + inductor_compiled_graph = FxGraphCache._lookup_graph( + hash_str, example_inputs, True, False + ) + assert inductor_compiled_graph is not None, ( + "Inductor cache lookup failed. Please remove" + f"the cache directory and try again." # noqa + ) + elif torch.__version__ >= "2.6": + from torch._inductor.output_code import CompiledFxGraphConstantsWithGm + + constants = CompiledFxGraphConstantsWithGm(graph) + inductor_compiled_graph, _ = FxGraphCache._lookup_graph( + hash_str, example_inputs, True, None, constants + ) + assert inductor_compiled_graph is not None, ( + "Inductor cache lookup failed. Please remove" + f"the cache directory and try again." 
# noqa + ) + + # Inductor calling convention (function signature): + # f(list) -> tuple + # Dynamo calling convention (function signature): + # f(*args) -> Any + + # need to know if the graph returns a tuple + from torch._inductor.compile_fx import graph_returns_tuple + + returns_tuple = graph_returns_tuple(graph) + + # this is the callable we return to Dynamo to run + def compiled_graph(*args): + # convert args to list + list_args = list(args) + graph_output = inductor_compiled_graph(list_args) + # unpack the tuple if needed + if returns_tuple: + return graph_output + else: + return graph_output[0] + + return compiled_graph + + def metrics_context(self) -> contextlib.AbstractContextManager: + """ + This method returns the Dynamo metrics context (if it exists, + otherwise a null context). It is used by various compile components. + Present in torch>=2.6, it's used inside FxGraphCache in + torch==2.6 (but not after). It might also be used in various other + torch.compile internal functions. + + Because it is re-entrant, we always set it (even if entering via Dynamo + and the context was already entered). We might want to revisit if it + should be set at a different mode of compilation. + + This is likely a bug in PyTorch: public APIs should not rely on + manually setting up internal contexts. But we also rely on non-public + APIs which might not provide these guarantees. 
+ """ + if is_torch_equal_or_newer("2.6"): + import torch._dynamo.utils + + return torch._dynamo.utils.get_metrics_context() + else: + return contextlib.nullcontext() + + +def set_inductor_config(config, runtime_shape): + if isinstance(runtime_shape, int): + # for a specific batchsize, tuning triton kernel parameters + # can be beneficial + config["max_autotune"] = envs.VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE + config["coordinate_descent_tuning"] = ( + envs.VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING + ) + + +def set_functorch_config(): + torch._functorch.config.bundled_autograd_cache = False + + +class EagerAdaptor(CompilerInterface): + name = "eager" + + def compile( + self, + graph: fx.GraphModule, + example_inputs: list[Any], + compiler_config: dict[str, Any], + runtime_shape: int | None = None, + key: str | None = None, + ) -> tuple[Callable | None, Any | None]: + compilation_counter.num_eager_compiles += 1 + # we don't need to compile the graph, just return the graph itself. + # It does not support caching, return None for the handle. 
@dataclasses.dataclass
class CompilationCounter:
    """Plain integer counters describing compilation activity."""

    num_models_seen: int = 0
    num_graphs_seen: int = 0
    # including the splitting ops
    num_piecewise_graphs_seen: int = 0
    # not including the splitting ops
    num_piecewise_capturable_graphs_seen: int = 0
    num_backend_compilations: int = 0
    # Number of gpu_model_runner attempts to trigger CUDAGraphs capture
    num_gpu_runner_capture_triggers: int = 0
    # Number of CUDAGraphs captured
    num_cudagraph_captured: int = 0
    # InductorAdaptor.compile calls
    num_inductor_compiles: int = 0
    # EagerAdaptor.compile calls
    num_eager_compiles: int = 0
    # The number of time vLLM's compiler cache entry was updated
    num_cache_entries_updated: int = 0
    # The number of standalone_compile compiled artifacts saved
    num_compiled_artifacts_saved: int = 0
    # Number of times a model was loaded with CompilationMode.STOCK_TORCH_COMPILE
    stock_torch_compile_count: int = 0

    def clone(self) -> "CompilationCounter":
        """Return an independent copy of the current counter values."""
        return copy.deepcopy(self)

    @contextmanager
    def expect(self, **kwargs):
        """
        Context manager asserting that counters change by the given amounts.

        Each keyword is a counter name mapped to the expected difference
        between its value after and before the `with` body. A mismatch raises
        AssertionError; an unknown counter name raises AttributeError.
        """
        # Read the baseline before running the body: a misspelled counter
        # name then fails immediately instead of after the (possibly
        # expensive) body has already run.
        before = {name: getattr(self, name) for name in kwargs}
        yield
        for name, expected_diff in kwargs.items():
            after = getattr(self, name)
            assert after - before[name] == expected_diff, (
                f"{name} not as expected, before it is {before[name]}"
                f", after it is {after}, "
                f"expected diff is {expected_diff}"
            )
class CUDAGraphWrapper:
    """Wraps a runnable to add CUDA graph capturing and replaying ability. And
    provide attribute access to the underlying `runnable` via `__getattr__`.

    The workflow of this wrapper in the cudagraph dispatching is as follows:
    1. At initialization, a runtime mode is assigned to the wrapper (FULL or
    PIECEWISE).
    2. At runtime, the wrapper receives a runtime_mode and a
    batch_descriptor(key) from the forward context and blindly trust them
    for cudagraph dispatching.
    3. If runtime_mode is NONE or runtime_mode does not match the mode of the
    wrapper, just call the runnable directly.
    4. Otherwise, i.e., the runtime_mode matches the mode of the wrapper,
    the wrapper will perform cudagraph capture(if key does not exist, create
    a new entry and cache it) or replay (if key exists in the cache).

    Note: CUDAGraphWrapper does not store persistent buffers or copy any
    runtime inputs into that buffers for replay. We assume implementing them
    is done outside of the wrapper. That is because we do not make any
    assumption on the dynamic shape (batch size) of the runtime inputs, as a
    trade-off for staying orthogonal to compilation logic. Nevertheless,
    tracing and checking the input addresses to be consistent during replay is
    guaranteed when VLLM_LOGGING_LEVEL == "DEBUG".
    """

    def __init__(
        self,
        runnable: Callable,
        vllm_config: VllmConfig,
        runtime_mode: CUDAGraphMode,
        cudagraph_options: CUDAGraphOptions | None = None,
    ):
        self.runnable = runnable
        self.vllm_config = vllm_config
        self.runtime_mode = runtime_mode
        self.compilation_config = vllm_config.compilation_config

        self.first_run_finished = False
        self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG"

        # assert runtime_mode is not NONE(no cudagraph), otherwise, we don't
        # need to initialize a CUDAGraphWrapper.
        assert self.runtime_mode != CUDAGraphMode.NONE
        # TODO: in the future, if we want to use multiple
        # streams, it might not be safe to share a global pool.
        # only investigate this when we use multiple streams
        self.graph_pool = current_platform.get_global_graph_pool()

        if cudagraph_options is None:
            cudagraph_options = CUDAGraphOptions()
        self.cudagraph_options = cudagraph_options
        # the entries for different batch descriptors that we need to capture
        # cudagraphs for.
        self.concrete_cudagraph_entries: dict[BatchDescriptor, CUDAGraphEntry] = {}

    def __getattr__(self, key: str):
        # __getattr__ only runs when normal attribute lookup fails. If
        # `runnable` itself is missing (e.g. an instance created through
        # __new__ without __init__), reading self.runnable below would
        # re-enter this method until RecursionError, so fail fast instead.
        if key == "runnable":
            raise AttributeError(key)
        # allow accessing the attributes of the runnable.
        if hasattr(self.runnable, key):
            return getattr(self.runnable, key)
        raise AttributeError(
            f"Attribute {key} not exists in the runnable of "
            f"cudagraph wrapper: {self.runnable}"
        )

    def unwrap(self) -> Callable:
        # in case we need to access the original runnable.
        return self.runnable

    def weak_ref_tensors_with_intermediate(self, output):
        """Apply `weak_ref_tensors` to `output`; for `IntermediateTensors`
        apply it to every value and return a new `IntermediateTensors`."""
        if isinstance(output, IntermediateTensors):
            intermediate_states = IntermediateTensors(
                tensors={
                    key: weak_ref_tensors(value)
                    for key, value in output.tensors.items()
                }
            )
            return intermediate_states
        return weak_ref_tensors(output)

    def __call__(self, *args, **kwargs):
        forward_context = get_forward_context()
        batch_descriptor = forward_context.batch_descriptor
        cudagraph_runtime_mode = forward_context.cudagraph_runtime_mode

        if (
            cudagraph_runtime_mode == CUDAGraphMode.NONE
            or cudagraph_runtime_mode != self.runtime_mode
        ):
            # CUDAGraphMode.NONE could mean the profile run, a warmup run, or
            # running without cudagraphs.
            # We do not trigger capture/replay if the runtime mode is not
            # matches. This enables properly dispatching to the correct
            # CUDAGraphWrapper when nesting multiple instances with different
            # runtime modes.
            return self.runnable(*args, **kwargs)

        if batch_descriptor not in self.concrete_cudagraph_entries:
            # create a new entry for this batch descriptor
            self.concrete_cudagraph_entries[batch_descriptor] = CUDAGraphEntry(
                batch_descriptor=batch_descriptor
            )

        entry = self.concrete_cudagraph_entries[batch_descriptor]

        if entry.cudagraph is None:
            if self.cudagraph_options.debug_log_enable:
                # Since we capture cudagraph for many different shapes and
                # capturing is fast, we don't need to log it for every
                # shape. E.g. we only log it for the first subgraph in
                # piecewise mode.
                logger.debug(
                    "Capturing a cudagraph on (%s,%s)",
                    self.runtime_mode.name,
                    entry.batch_descriptor,
                )
            # validate that cudagraph capturing is legal at this point.
            validate_cudagraph_capturing_enabled()

            input_addresses = [
                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
            ]
            entry.input_addresses = input_addresses
            cudagraph = torch.cuda.CUDAGraph()

            with ExitStack() as stack:
                if self.cudagraph_options.gc_disable:
                    # during every model forward for piecewise cudagraph
                    # mode, we will capture many pieces of cudagraphs
                    # (roughly one per layer). running gc again and again
                    # across layers will make the cudagraph capture very slow.
                    # therefore, we only run gc for the first graph,
                    # and disable gc for the rest of the graphs.
                    stack.enter_context(patch("gc.collect", lambda: None))
                    stack.enter_context(patch("torch.cuda.empty_cache", lambda: None))

                if self.graph_pool is not None:
                    set_graph_pool_id(self.graph_pool)
                else:
                    set_graph_pool_id(current_platform.graph_pool_handle())
                # mind-exploding: carefully manage the reference and memory.
                with torch.cuda.graph(cudagraph, pool=self.graph_pool):
                    # `output` is managed by pytorch's cudagraph pool
                    output = self.runnable(*args, **kwargs)
                    if self.cudagraph_options.weak_ref_output:
                        # by converting it to weak ref,
                        # the original `output` will immediately be released
                        # to save memory. It is only safe to do this for
                        # the last graph in piecewise cudagraph mode, because
                        # the output of the last graph will not be used by
                        # any other cuda graph.
                        output = self.weak_ref_tensors_with_intermediate(output)

            # here we always use weak ref for the output
            # to save memory
            entry.output = self.weak_ref_tensors_with_intermediate(output)
            entry.cudagraph = cudagraph

            compilation_counter.num_cudagraph_captured += 1

            # important: we need to return the output, rather than
            # the weak ref of the output, so that pytorch can correctly
            # manage the memory during cuda graph capture
            return output

        if self.is_debugging_mode:
            # check if the input addresses are the same
            new_input_addresses = [
                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
            ]
            assert new_input_addresses == entry.input_addresses, (
                f"Input addresses for cudagraphs are different "
                f"during replay. Expected {entry.input_addresses}, "
                f"got {new_input_addresses}"
            )

        entry.cudagraph.replay()
        return entry.output
start_monitoring_torch_compile + +logger = init_logger(__name__) + +IGNORE_COMPILE_KEY = "_ignore_compile_vllm" + +_T = TypeVar("_T", bound=type[nn.Module]) + + +def ignore_torch_compile(cls: _T) -> _T: + """ + A decorator to ignore support_torch_compile decorator + on the class. This is useful when a parent class has + a support_torch_compile decorator, but we don't want to + compile the class `cls` that inherits the parent class. + This only ignores compiling the forward of the class the + decorator is applied to. + + If the parent has ignore_torch_compile but the child has + support_torch_compile, the child will still be compiled. + + If the class has one or more submodules + that have support_torch_compile decorator applied, compile will + not be ignored for those submodules. + """ + setattr(cls, IGNORE_COMPILE_KEY, True) + return cls + + +def _should_ignore_torch_compile(cls) -> bool: + """ + Check if the class should be ignored for torch.compile. + """ + return getattr(cls, IGNORE_COMPILE_KEY, False) + + +@overload +def support_torch_compile( + *, + enable_if: Callable[[VllmConfig], bool] | None = None, +) -> Callable[[_T], _T]: ... + + +@overload +def support_torch_compile( + *, + dynamic_arg_dims: dict[str, int | list[int]] | None, +) -> Callable[[_T], _T]: ... + + +@overload +def support_torch_compile( + *, + mark_unbacked_dims: dict[str, int | list[int]] | None, +) -> Callable[[_T], _T]: ... + + +@overload +def support_torch_compile( + *, + dynamic_arg_dims: dict[str, int | list[int]] | None, + mark_unbacked_dims: dict[str, int | list[int]] | None, +) -> Callable[[_T], _T]: ... + + +@overload +def support_torch_compile(cls: _T) -> _T: ... 
+ + +def support_torch_compile( + cls: _T | None = None, + *, + dynamic_arg_dims: dict[str, int | list[int]] | None = None, + mark_unbacked_dims: dict[str, int | list[int]] | None = None, + enable_if: Callable[[VllmConfig], bool] | None = None, +) -> Callable[[_T], _T] | _T: + """ + A decorator to add support for compiling the forward method of a class. + + Usage 1: use directly as a decorator without arguments: + + ```python + @support_torch_compile + class MyModel(nn.Module): + def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): ... + ``` + + Usage 2: use as a decorator with arguments: + + ```python + @support_torch_compile(dynamic_arg_dims={"x": 0, "y": 0}) + class MyModel(nn.Module): + def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): ... + ``` + + `dynamic_arg_dims` is a dictionary that maps argument names to the dynamic + dimensions of the argument. The dynamic dimensions can be either a single + integer or a list of integers. + + if `dynamic_arg_dims` is `None`, it is inferred from the type annotation + of the `forward` method, based on the following default rules: + + - if the argument is annotated as `torch.Tensor` or + `Optional[torch.Tensor]`, the first dimension will be + marked as dynamic. + - if the argument is annotated as `IntermediateTensors`, the first + dimension of all the tensors in the intermediate tensors + will be marked as dynamic. + + During runtime, when we actually mark dimensions of tensors, + it depends on the value of arguments: + + - if it is a single integer (can be negative), the corresponding dimension + of the argument will be marked as dynamic. + - if it is `None`, ignored. + - if it is `IntermediateTensors`, all the tensors in the intermediate + tensors will be marked as dynamic. + - otherwise, it will raise an error. + + NOTE: if an argument is `None`, it should always be passed as `None` during + the lifetime of the model, otherwise, it cannot be captured as a single + computation graph. 
+ + `enable_if` is a function that takes a `VllmConfig` object as input and + returns a boolean value indicating whether to compile the model or not. + This is useful if you want to compile the model only when certain + conditions are met. + + `mark_unbacked_dims` is a dictionary that maps argument names with a dynamic + dim to be decorated with `mark_unbacked`. This is useful if we would like to + enforce that dynamo does not specialize on 0/1 values in the case of dummy input + such as for vision model compilation + """ + + def cls_decorator_helper(cls: _T) -> _T: + # helper to pass `dynamic_arg_dims` to `_support_torch_compile` + # to avoid too much indentation for `_support_torch_compile` + if not hasattr(cls, "forward"): + raise TypeError("decorated class should have a forward method.") + sig = inspect.signature(cls.forward) + inferred_dynamic_arg_dims = dynamic_arg_dims + if inferred_dynamic_arg_dims is None: + inferred_dynamic_arg_dims = {} + for k, v in sig.parameters.items(): + if v.annotation in [ + torch.Tensor, + torch.Tensor | None, + IntermediateTensors, + IntermediateTensors | None, + ]: + inferred_dynamic_arg_dims[k] = 0 + + logger.debug( + ("Inferred dynamic dimensions for forward method of %s: %s"), + cls, + list(inferred_dynamic_arg_dims.keys()), + ) + + if len(inferred_dynamic_arg_dims) == 0: + raise ValueError( + "No dynamic dimensions found in the forward method of " + f"{cls}. Please provide dynamic_arg_dims explicitly." 
def _support_torch_compile(
    cls: _T,
    dynamic_arg_dims: dict[str, int | list[int]],
    mark_unbacked_dims: dict[str, int | list[int]] | None = None,
    enable_if: Callable[[VllmConfig], bool] | None = None,
) -> _T:
    """
    A decorator to add support for compiling the forward method of a class.

    Mutates `cls` in place: appends `TorchCompileWithNoGuardsWrapper` to its
    bases, replaces `__init__` and `__call__` with the wrappers defined
    below, resets the ignore-compile marker to False, and returns `cls`.
    A class that already has the wrapper among its direct bases is returned
    unchanged.
    """
    if TorchCompileWithNoGuardsWrapper in cls.__bases__:
        # support decorating multiple times
        return cls

    # take care of method resolution order
    # make sure super().__init__ is called on the base class
    # other than TorchCompileWithNoGuardsWrapper
    cls.__bases__ = cls.__bases__ + (TorchCompileWithNoGuardsWrapper,)

    old_init = cls.__init__

    setattr(cls, IGNORE_COMPILE_KEY, False)

    def __init__(
        self, *, vllm_config: VllmConfig | None = None, prefix: str = "", **kwargs
    ):
        if vllm_config is None:
            vllm_config = get_current_vllm_config()

        # NOTE: to support multimodal models (such as encoder),
        # we may not have vllm_config so we may need to patch
        # it
        sig = inspect.signature(old_init)
        if "vllm_config" in sig.parameters:
            kwargs["vllm_config"] = vllm_config
        if "prefix" in sig.parameters:
            kwargs["prefix"] = prefix
        old_init(self, **kwargs)

        self.vllm_config = vllm_config
        enable_compile = enable_if is None or enable_if(vllm_config)
        # for CompilationMode.STOCK_TORCH_COMPILE , the upper level model runner
        # will handle the compilation, so we don't need to do anything here.
        self.do_not_compile = (
            vllm_config.compilation_config.mode
            in [CompilationMode.NONE, CompilationMode.STOCK_TORCH_COMPILE]
            or not supports_dynamo()
            or _should_ignore_torch_compile(self.__class__)
            or not enable_compile
        )
        if self.do_not_compile:
            return

        compilation_counter.num_models_seen += 1
        self.compiled = False
        TorchCompileWithNoGuardsWrapper.__init__(self)

    cls.__init__ = __init__

    def _mark_dynamic_inputs(mod, *args, **kwargs):
        # Bind the call arguments to forward()'s signature so that the
        # configured argument names can be looked up regardless of whether
        # they were passed positionally or by keyword.
        sig = inspect.signature(mod.__class__.forward)
        bound_args = sig.bind(mod, *args, **kwargs)
        bound_args.apply_defaults()
        for k, dims in dynamic_arg_dims.items():
            arg = bound_args.arguments.get(k)
            if arg is not None:
                dims = [dims] if isinstance(dims, int) else dims
                if isinstance(arg, torch.Tensor):
                    # In case dims is specified with negative indexing
                    dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
                    torch._dynamo.mark_dynamic(arg, dims)
                elif isinstance(arg, IntermediateTensors):
                    for tensor in arg.tensors.values():
                        # In case dims is specified with negative indexing.
                        # Resolve against *this* tensor's ndim into a fresh
                        # list: rebinding `dims` here would freeze negative
                        # indices to the first tensor's rank for all the
                        # following tensors.
                        tensor_dims = [
                            tensor.ndim + dim if dim < 0 else dim for dim in dims
                        ]
                        torch._dynamo.mark_dynamic(tensor, tensor_dims)
                else:
                    raise ValueError(
                        "Unsupported dynamic dimensions"
                        f" {dims} for argument {k} with type {type(arg)}."
                    )
        if mark_unbacked_dims:
            for k, dims in mark_unbacked_dims.items():
                arg = bound_args.arguments.get(k)
                if arg is not None:
                    dims = [dims] if isinstance(dims, int) else dims
                    if isinstance(arg, torch.Tensor):
                        # In case dims is specified with negative indexing
                        dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
                        torch._dynamo.decorators.mark_unbacked(arg, dims)

    def __call__(self, *args, **kwargs):
        # torch.compiler.is_compiling() means we are inside the compilation
        # e.g. TPU has the compilation logic in model runner, so we don't
        # need to compile the model inside.
        if self.do_not_compile or torch.compiler.is_compiling():
            return self.forward(*args, **kwargs)

        # if aot_compiled_fn is set, just call it.
        if getattr(self, "aot_compiled_fn", None) is not None:
            return self.aot_compiled_fn(self, *args, **kwargs)

        cache_dir = None
        aot_compilation_path = None
        if envs.VLLM_USE_AOT_COMPILE:
            # When using torch.compile in AOT mode, we store the cache artifacts
            # under VLLM_CACHE_ROOT/torch_aot_compile/{hash}/rank_i_j. The {hash}
            # contains all of the factors except for the source files being
            # traced through, because we don't actually know which source files
            # to check at this point (before dynamo runs).
            # On loading we will actually look at the source files being traced
            # through. If any source file have changed (compared with the
            # serialized backend artifacts), then we need to generate a new AOT
            # compile artifact from scratch.
            from .caching import compilation_config_hash_factors

            factors: list[str] = compilation_config_hash_factors(self.vllm_config)

            factors.append(_model_hash_key(self.forward))
            hash_key = hashlib.sha256(str(factors).encode()).hexdigest()

            cache_dir = os.path.join(
                envs.VLLM_CACHE_ROOT,
                "torch_aot_compile",
                hash_key,
            )

            rank = self.vllm_config.parallel_config.rank
            dp_rank = self.vllm_config.parallel_config.data_parallel_rank
            cache_dir = os.path.join(cache_dir, f"rank_{rank}_{dp_rank}")
            aot_compilation_path = os.path.join(cache_dir, "model")
            try:
                with (
                    set_current_vllm_config(self.vllm_config),
                    open(aot_compilation_path, "rb") as f,
                ):
                    start_monitoring_torch_compile(self.vllm_config)
                    loaded_fn = torch.compiler.load_compiled_function(f)
                _verify_source_unchanged(loaded_fn.source_info(), self.vllm_config)
                loaded_fn.disable_guard_check()
                self.aot_compiled_fn = loaded_fn
            except Exception as e:
                # Best effort: a missing artifact is silent, an unreadable or
                # stale one is logged, and in both cases we fall through to a
                # fresh compilation unless loading is mandatory.
                if os.path.exists(aot_compilation_path):
                    logger.warning(
                        "Cannot load aot compilation from path %s, error: %s",
                        aot_compilation_path,
                        str(e),
                    )
                if envs.VLLM_FORCE_AOT_LOAD:
                    raise
            if getattr(self, "aot_compiled_fn", None) is not None:
                logger.info(
                    "Directly load AOT compilation from path %s", aot_compilation_path
                )
                return self.aot_compiled_fn(self, *args, **kwargs)

        if self.compiled:
            assert not envs.VLLM_USE_AOT_COMPILE
            return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)

        # This is the path for the first compilation.

        # the first compilation needs to have dynamic shapes marked
        _mark_dynamic_inputs(self, *args, **kwargs)

        # here, it is the starting point of the `torch.compile` process
        start_monitoring_torch_compile(self.vllm_config)
        original_code_object = self.original_code_object()
        logger.debug("Start compiling function %s", original_code_object)

        # we do not want to delete the original code object entries since
        # we depend on them now to look up cached compiled functions.
        # torch._dynamo.eval_frame.remove_from_cache(original_code_object)

        # collect all relevant files traced by Dynamo,
        # so that the compilation cache can trigger re-compilation
        # properly when any of these files change.

        # 1. the file containing the top-level forward function
        self.vllm_config.compilation_config.traced_files.add(
            original_code_object.co_filename
        )

        # 2. every time Dynamo sees a function call, it will inline
        # the function by calling InliningInstructionTranslator.inline_call_
        # we hijack this function to know all the functions called
        # during Dynamo tracing, and their corresponding files
        inline_call = InliningInstructionTranslator.inline_call_

        def patched_inline_call(self_):
            code = self_.f_code
            self.vllm_config.compilation_config.traced_files.add(code.co_filename)
            return inline_call(self_)

        # Disable the C++ compilation of symbolic shape guards. C++-fication
        # of symbolic shape guards can improve guard overhead. But, since
        # vllm skip guards anyways, setting this flag to False can improve
        # compile time.
        dynamo_config_patches = {}
        try:
            _ = torch._dynamo.config.enable_cpp_symbolic_shape_guards
            dynamo_config_patches["enable_cpp_symbolic_shape_guards"] = False
        except AttributeError:
            # Note: this config is not available in torch 2.6, we can skip
            # if the config doesn't exist
            logger.debug("enable_cpp_symbolic_shape_guards config not available")

        with (
            patch.object(
                InliningInstructionTranslator, "inline_call_", patched_inline_call
            ),
            torch._dynamo.config.patch(**dynamo_config_patches),
            maybe_use_cudagraph_partition_wrapper(self.vllm_config),
            _torch27_patch_tensor_subclasses(),
        ):
            if envs.VLLM_USE_AOT_COMPILE:
                self.aot_compiled_fn = self.aot_compile(*args, **kwargs)
                output = self.aot_compiled_fn(self, *args, **kwargs)
                assert aot_compilation_path is not None
                assert cache_dir is not None
                try:
                    os.makedirs(cache_dir, exist_ok=True)
                    self.aot_compiled_fn.save_compiled_function(aot_compilation_path)
                except Exception as e:
                    # saving the artifact is best effort; the compiled
                    # function is still usable for this process.
                    logger.warning(
                        "Cannot save aot compilation to path %s, error: %s",
                        aot_compilation_path,
                        str(e),
                    )
            else:
                output = TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)

        self.compiled = True
        return output

    cls.__call__ = __call__
    return cls
+ """ + from vllm.config import CUDAGraphMode + + compilation_config = vllm_config.compilation_config + if ( + compilation_config.cudagraph_mode.has_piecewise_cudagraphs() + and compilation_config.use_inductor_graph_partition + ): + from torch._inductor.utils import CUDAGraphWrapperMetadata + + from vllm.compilation.cuda_graph import CUDAGraphOptions + from vllm.platforms import current_platform + + static_graph_wrapper_class = resolve_obj_by_qualname( + current_platform.get_static_graph_wrapper_cls() + ) + + def customized_cudagraph_wrapper(f, metadata: CUDAGraphWrapperMetadata): + partition_id = metadata.partition_index + num_partitions = metadata.num_partitions + return static_graph_wrapper_class( + runnable=f, + vllm_config=vllm_config, + runtime_mode=CUDAGraphMode.PIECEWISE, + cudagraph_options=CUDAGraphOptions( + debug_log_enable=partition_id == 0, + gc_disable=partition_id != 0, + weak_ref_output=partition_id == num_partitions - 1, + ), + ) + + torch._inductor.utils.set_customized_partition_wrappers( + customized_cudagraph_wrapper + ) + + yield + + if ( + compilation_config.cudagraph_mode.has_piecewise_cudagraphs() + and compilation_config.use_inductor_graph_partition + ): + torch._inductor.utils.set_customized_partition_wrappers(None) + + +@contextlib.contextmanager +def _torch27_patch_tensor_subclasses(): + """ + Add support for using tensor subclasses (ie `BasevLLMParameter`, ect) when + using torch 2.7.0. This enables using weight_loader_v2 and the use of + `BasevLLMParameters` without having to replace them with regular tensors + before `torch.compile`-time. 
+ """ + from vllm.model_executor.parameter import ( + BasevLLMParameter, + ModelWeightParameter, + RowvLLMParameter, + _ColumnvLLMParameter, + ) + + def return_false(*args, **kwargs): + return False + + if version.parse("2.7") <= version.parse(torch.__version__) < version.parse("2.8"): + yield + return + + with ( + torch._dynamo.config.patch( + "traceable_tensor_subclasses", + [ + BasevLLMParameter, + ModelWeightParameter, + _ColumnvLLMParameter, + RowvLLMParameter, + ], + ), + patch( + "torch._dynamo.variables.torch.can_dispatch_torch_function", return_false + ), + ): + yield diff --git a/compilation/fix_functionalization.py b/compilation/fix_functionalization.py new file mode 100644 index 0000000..126ad35 --- /dev/null +++ b/compilation/fix_functionalization.py @@ -0,0 +1,253 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import operator +from collections.abc import Iterable + +import torch +from torch._higher_order_ops.auto_functionalize import auto_functionalized + +from vllm.logger import init_logger +from vllm.platforms import current_platform + +from .fx_utils import is_func +from .vllm_inductor_pass import VllmInductorPass + +logger = init_logger(__name__) + + +class FixFunctionalizationPass(VllmInductorPass): + """ + This pass defunctionalizes certain nodes to avoid redundant tensor copies. + After this pass, DCE (dead-code elimination) should never be run, + as de-functionalized nodes may appear as dead code. + + To add new nodes to defunctionalize, add to the if-elif chain in __call__. + """ + + @VllmInductorPass.time_and_log + def __call__(self, graph: torch.fx.Graph): + # XPU does not support auto-functionalization yet. + # Will enable this when switch to vllm-xpu-kernels. + if current_platform.is_xpu(): + logger.debug( + "XPU platform does not support fix functionalizationpass currently." 
+ ) + return + + self.nodes_to_remove: list[torch.fx.Node] = [] + count = 0 + for node in graph.nodes: + if not is_func(node, auto_functionalized): + continue # Avoid deep if-elif nesting + + kwargs = node.kwargs + at_target = node.args[0] + + if at_target == torch.ops._C.rotary_embedding.default: + query = kwargs["query"] + key = kwargs["key"] + getitem_nodes = self.getitem_users(node) + + if ( + is_func(query, operator.getitem) + and is_func(key, operator.getitem) + and query.args[0] == key.args[0] + and is_func(query.args[0], torch.ops.aten.split_with_sizes.default) + and all( + is_func(user, torch.ops.aten.slice_scatter.default) + for getitem_node in getitem_nodes.values() + for user in getitem_node.users + ) + ): + # Pattern where query and key are slices of an mm_node. + # While functionalized, results at [1] and [2] are scattered + # back into mm_node. So after de-functionalization, we can + # just use mm_node directly. + + mm_node = query.args[0].args[0] + for user in getitem_nodes.values(): + for user_of_getitem in user.users: + if is_func( + user_of_getitem, torch.ops.aten.slice_scatter.default + ): + user_of_getitem.replace_all_uses_with(mm_node) + self._remove(user_of_getitem) + self._remove(user) + + self.insert_defunctionalized(graph, node) + self._remove(node) + + else: + # Directly replace the auto_functionalize(rotary_embedding) + # with the inplace rotary_embedding. In theory, we shouldn't + # do this blindly, but in practice in vLLM it's ok. The best + # solution is to use auto_functionalization_v2 and then use + # inductor's builtin defunctionalization (reinplacing) pass. + mutated_args = {1: "query", 2: "key"} + self.defunctionalize(graph, node, mutated_args) + + # rms_norm replacements avoid the most copies for LLaMa. 
+ elif at_target == torch.ops._C.fused_add_rms_norm.default: + mutated_args = {1: "input", 2: "residual"} + self.defunctionalize(graph, node, mutated_args) + elif at_target == torch.ops._C.fused_add_rms_norm_static_fp8_quant.default: # noqa: E501 + mutated_args = {1: "result", 2: "residual"} + self.defunctionalize(graph, node, mutated_args) + elif at_target == torch.ops._C.rms_norm_dynamic_per_token_quant.default: # noqa: E501 + mutated_args = {1: "result", 2: "scale", 3: "residual"} + self.defunctionalize(graph, node, mutated_args) + elif at_target in [ + torch.ops._C.rms_norm.default, + torch.ops._C.rms_norm_static_fp8_quant.default, + ]: + mutated_args = {1: "result"} + self.defunctionalize(graph, node, mutated_args) + # For some reason we need to specify the args for both + # silu_and_mul and silu_and_mul_quant. The kwargs + # pathway gets the wrong answer. + elif at_target == torch.ops._C.silu_and_mul.default: + mutated_args = {1: "result"} + self.defunctionalize( + graph, node, mutated_args, args=("result", "input") + ) + elif at_target == torch.ops._C.silu_and_mul_quant.default: + mutated_args = {1: "result"} + self.defunctionalize( + graph, node, mutated_args, args=("result", "input", "scale") + ) + elif ( + hasattr(torch.ops._C, "silu_and_mul_nvfp4_quant") + and at_target == torch.ops._C.silu_and_mul_nvfp4_quant.default + ): + mutated_args = {1: "result", 2: "result_block_scale"} + self.defunctionalize( + graph, + node, + mutated_args, + args=( + "result", + "result_block_scale", + "input", + "input_global_scale", + ), + ) + # Defunctionalize fused_qk_norm_rope to remove higher-order wrapper. 
+ elif at_target == torch.ops._C.fused_qk_norm_rope.default: + mutated_args = {1: "qkv"} + args = ( + "qkv", + "num_heads_q", + "num_heads_k", + "num_heads_v", + "head_dim", + "eps", + "q_weight", + "k_weight", + "cos_sin_cache", + "is_neox", + "position_ids", + ) + self.defunctionalize(graph, node, mutated_args=mutated_args, args=args) + else: + continue # skip the count + + count += 1 + + self.dump_graph(graph, "before_cleanup") + + # Remove the nodes all at once + count_removed = len(self.nodes_to_remove) + for node in self.nodes_to_remove: + graph.erase_node(node) + + logger.debug( + "De-functionalized %s nodes, removed %s nodes", count, count_removed + ) + self.nodes_to_remove.clear() + + def _remove(self, node_or_nodes: torch.fx.Node | Iterable[torch.fx.Node]): + """ + Stage a node (or nodes) for removal at the end of the pass. + """ + if isinstance(node_or_nodes, torch.fx.Node): + self.nodes_to_remove.append(node_or_nodes) + else: + self.nodes_to_remove.extend(node_or_nodes) + + def defunctionalize( + self, + graph: torch.fx.Graph, + node: torch.fx.Node, + mutated_args: dict[int, torch.fx.Node | str], + args: tuple[torch.fx.Node | str, ...] | None = None, + ): + """ + De-functionalize a node by replacing it with a call to the original. + It also replaces the getitem users with the mutated arguments. + See replace_users_with_mutated_args and insert_defunctionalized. + """ + self.replace_users_with_mutated_args(node, mutated_args) + self.insert_defunctionalized(graph, node, args=args) + self._remove(node) + + def replace_users_with_mutated_args( + self, node: torch.fx.Node, mutated_args: dict[int, torch.fx.Node | str] + ): + """ + Replace all getitem users of the auto-functionalized node with the + mutated arguments. + :param node: The auto-functionalized node + :param mutated_args: The mutated arguments, indexed by getitem index. + If the value of an arg is a string, `node.kwargs[arg]` is used. 
+ """ + for idx, user in self.getitem_users(node).items(): + arg = mutated_args[idx] + arg = node.kwargs[arg] if isinstance(arg, str) else arg + user.replace_all_uses_with(arg) + self._remove(user) + + def getitem_users(self, node: torch.fx.Node) -> dict[int, torch.fx.Node]: + """ + Returns the operator.getitem users of the auto-functionalized node, + indexed by the index they are getting. + """ + users = {} + for user in node.users: + if is_func(user, operator.getitem): + idx = user.args[1] + users[idx] = user + return users + + def insert_defunctionalized( + self, + graph: torch.fx.Graph, + node: torch.fx.Node, + args: tuple[torch.fx.Node | str, ...] | None = None, + ): + """ + Insert a new defunctionalized node into the graph before node. + If one of the kwargs is 'out', provide args directly, + as node.kwargs cannot be used. + See https://github.com/pytorch/pytorch/blob/a00faf440888ffb724bad413f329a49e2b6388e7/torch/_inductor/lowering.py#L351 + + :param graph: Graph to insert the defunctionalized node into + :param node: The auto-functionalized node to defunctionalize + :param args: If we cannot use kwargs, specify args directly. + If an arg is a string, `node.kwargs[arg]` is used. 
+ """ # noqa: E501 + assert is_func(node, auto_functionalized), ( + f"node must be auto-functionalized, is {node} instead" + ) + + # Create a new call to the original function + with graph.inserting_before(node): + function = node.args[0] + if args is None: + graph.call_function(function, kwargs=node.kwargs) + else: + # Args passed as strings refer to items in node.kwargs + args = tuple( + node.kwargs[arg] if isinstance(arg, str) else arg for arg in args + ) + graph.call_function(function, args=args) diff --git a/compilation/fusion.py b/compilation/fusion.py new file mode 100644 index 0000000..1d6e297 --- /dev/null +++ b/compilation/fusion.py @@ -0,0 +1,374 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any, NamedTuple + +import torch +import torch._inductor.pattern_matcher as pm +from torch import fx +from torch._higher_order_ops.auto_functionalize import auto_functionalized +from torch._inductor.pattern_matcher import PatternMatcherPass +from torch._ops import OpOverload + +from vllm.config import VllmConfig, get_current_vllm_config +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + GroupShape, + QuantKey, + ScaleDesc, + kFp8DynamicTensorSym, + kFp8DynamicTokenSym, + kFp8StaticTensorSym, + kNvfp4Quant, + kStaticTensorScale, +) +from vllm.platforms import current_platform + +from .inductor_pass import enable_fake_mode +from .matcher_utils import MatcherFusedAddRMSNorm, MatcherQuantFP8, MatcherRMSNorm +from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass + +logger = init_logger(__name__) +FP8_DTYPE = current_platform.fp8_dtype() +FP4_DTYPE = torch.uint8 + + +def empty_bf16(*args, **kwargs): + return torch.empty(*args, **kwargs, dtype=torch.bfloat16, device="cuda") + + +def empty_fp32(*args, **kwargs): + return torch.empty(*args, **kwargs, dtype=torch.float32, device="cuda") + + +def empty_i32(*args, 
**kwargs): + return torch.empty(*args, **kwargs, dtype=torch.int32, device="cuda") + + +def empty_i64(*args, **kwargs): + return torch.empty(*args, **kwargs, dtype=torch.int64, device="cuda") + + +RMS_OP = torch.ops._C.rms_norm.default +RMS_ADD_OP = torch.ops._C.fused_add_rms_norm.default + +QUANT_OPS: dict[QuantKey, OpOverload] = { + kFp8StaticTensorSym: torch.ops._C.static_scaled_fp8_quant.default, # noqa: E501 + kFp8DynamicTensorSym: torch.ops._C.dynamic_scaled_fp8_quant.default, # noqa: E501 + kFp8DynamicTokenSym: torch.ops._C.dynamic_per_token_scaled_fp8_quant.default, # noqa: E501 +} +if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"): + QUANT_OPS[kNvfp4Quant] = torch.ops._C.scaled_fp4_quant.default + + +class FusedRMSQuantKey(NamedTuple): + """ + Named tuple for identifying the type of RMSNorm + quant fusion. + quant: type of quantization + fused_add: does the op also perform the residual add + """ + + quant: QuantKey + fused_add: bool + + def __str__(self): + return ( + f"FusedQuantKey({self.quant}, with" + f"{'' if self.fused_add else 'out'} residual)" + ) + + +FUSED_OPS: dict[FusedRMSQuantKey, OpOverload] = { + FusedRMSQuantKey( + kFp8StaticTensorSym, False + ): torch.ops._C.rms_norm_static_fp8_quant.default, # noqa: E501 + FusedRMSQuantKey( + kFp8StaticTensorSym, True + ): torch.ops._C.fused_add_rms_norm_static_fp8_quant.default, # noqa: E501 + FusedRMSQuantKey( + kFp8DynamicTokenSym, False + ): torch.ops._C.rms_norm_dynamic_per_token_quant.default, # noqa: E501 + FusedRMSQuantKey( + kFp8DynamicTokenSym, True + ): torch.ops._C.rms_norm_dynamic_per_token_quant.default, # noqa: E501 +} + + +class RMSNormQuantPattern: + def __init__(self, epsilon: float, key: FusedRMSQuantKey): + self.epsilon = epsilon + self.quant_dtype = key.quant.dtype + config = get_current_vllm_config() + self.model_dtype = config.model_config.dtype if config.model_config else None + + assert key in FUSED_OPS, f"unsupported fused rmsnorm+quant op for {key}" + 
self.FUSED_OP = FUSED_OPS[key] + + self.rmsnorm_matcher = ( + MatcherRMSNorm(epsilon) + if not key.fused_add + else MatcherFusedAddRMSNorm(epsilon) + ) + self.quant_matcher = MatcherQuantFP8(key.quant) + + +class RMSNormStaticQuantPattern(RMSNormQuantPattern): + def __init__(self, epsilon: float, quant_dtype: torch.dtype, symmetric=True): + fused_key = FusedRMSQuantKey( + fused_add=False, + quant=QuantKey( + dtype=quant_dtype, scale=kStaticTensorScale, symmetric=symmetric + ), + ) + super().__init__(epsilon, fused_key) + + def register(self, pm_pass: PatternMatcherPass): + # Cannot use methods, as the self argument affects tracing + def pattern(input: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor): + result_rms = self.rmsnorm_matcher(input, weight) + return self.quant_matcher(result_rms, scale)[0] + + def replacement(input: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor): + # In case we're matching native rms-norm, conversions might be + # optimized out. We convert here just to be safe. 
+ input = input.to(dtype=self.model_dtype) + + result = torch.empty( + input.shape, device=input.device, dtype=self.quant_dtype + ) + at = auto_functionalized( + self.FUSED_OP, + result=result, + input=input, + weight=weight, + scale=scale, + epsilon=self.epsilon, + ) + + # result + return at[1] + + inputs = [ + # input, weight + *self.rmsnorm_matcher.inputs(), + self.quant_matcher.inputs()[1], # scale + ] + pattern(*inputs) + + pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, pm_pass) + + +class FusedAddRMSNormStaticQuantPattern(RMSNormQuantPattern): + def __init__(self, epsilon: float, quant_dtype: torch.dtype, symmetric=True): + key = FusedRMSQuantKey( + fused_add=True, + quant=QuantKey( + dtype=quant_dtype, scale=kStaticTensorScale, symmetric=symmetric + ), + ) + super().__init__(epsilon, key) + + def register(self, pm_pass: PatternMatcherPass): + def pattern( + input: torch.Tensor, + weight: torch.Tensor, + residual: torch.Tensor, + scale: torch.Tensor, + ): + result_rms, residual = self.rmsnorm_matcher(input, weight, residual) + result, _ = self.quant_matcher(result_rms, scale) + + return result, residual + + def replacement( + input: torch.Tensor, + weight: torch.Tensor, + residual: torch.Tensor, + scale: torch.Tensor, + ): + # In case we're matching native rms-norm, conversions might be + # optimized out. We convert here just to be safe. 
+ input = input.to(dtype=self.model_dtype) + + result = torch.empty_like(input, dtype=self.quant_dtype) + at = auto_functionalized( + self.FUSED_OP, + result=result, + input=input, + residual=residual, + weight=weight, + scale=scale, + epsilon=self.epsilon, + ) + + # result, residual + return at[1], at[2] + + inputs = [ + # input, weight, residual + *self.rmsnorm_matcher.inputs(), + self.quant_matcher.inputs()[1], # scale + ] + + pm.register_replacement( + pattern, + replacement, + inputs, + pm.fwd_only, + pm_pass, + ) + + +class RMSNormDynamicQuantPattern(RMSNormQuantPattern): + def __init__( + self, + epsilon: float, + quant_dtype: torch.dtype, + group_shape: GroupShape = GroupShape.PER_TOKEN, + symmetric=True, + ): + scale = ScaleDesc(torch.float32, False, group_shape) + key = FusedRMSQuantKey( + fused_add=False, + quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric), + ) + super().__init__(epsilon, key) + + def register(self, pm_pass: PatternMatcherPass): + def pattern(input: torch.Tensor, weight: torch.Tensor): + result_rms = self.rmsnorm_matcher(input, weight) + # result, scale + return self.quant_matcher(result_rms) + + def replacement(input: torch.Tensor, weight: torch.Tensor): + # In case we're matching native rms-norm, conversions might be + # optimized out. We convert here just to be safe. 
+ input = input.to(dtype=self.model_dtype) + + result = torch.empty_like(input, dtype=self.quant_dtype) + scale = self.quant_matcher.make_scale(input) + at = auto_functionalized( + self.FUSED_OP, + result=result, + input=input, + weight=weight, + scale=scale, + epsilon=self.epsilon, + scale_ub=None, + residual=None, + ) + + # result, scale + return at[1], at[2] + + pm.register_replacement( + pattern, + replacement, + self.rmsnorm_matcher.inputs(), + pm.fwd_only, + pm_pass, + ) + + +class FusedAddRMSNormDynamicQuantPattern(RMSNormQuantPattern): + def __init__( + self, + epsilon: float, + quant_dtype: torch.dtype, + group_shape: GroupShape = GroupShape.PER_TOKEN, + symmetric=True, + ): + scale = ScaleDesc(torch.float32, False, group_shape) + key = FusedRMSQuantKey( + fused_add=True, + quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric), + ) + super().__init__(epsilon, key) + + def register(self, pm_pass: PatternMatcherPass): + def pattern(input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor): + result_rms, residual = self.rmsnorm_matcher(input, weight, residual) + result, scale = self.quant_matcher(result_rms) + + return result, residual, scale + + def replacement( + input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor + ): + # In case we're matching native rms-norm, conversions might be + # optimized out. We convert here just to be safe. 
+ input = input.to(dtype=self.model_dtype) + + result = torch.empty_like(input, dtype=self.quant_dtype) + scale = self.quant_matcher.make_scale(input) + at = auto_functionalized( + self.FUSED_OP, + result=result, + input=input, + weight=weight, + scale=scale, + epsilon=self.epsilon, + scale_ub=None, + residual=residual, + ) + + # result, residual, scale + return at[1], at[3], at[2] + + pm.register_replacement( + pattern, + replacement, + self.rmsnorm_matcher.inputs(), + pm.fwd_only, + pm_pass, + ) + + +class RMSNormQuantFusionPass(VllmPatternMatcherPass): + """ + This pass fuses rms_norm & quant custom ops into a fused rms_norm_quant op. + It also supports fused_add_rms_norm. + """ + + @enable_fake_mode + def __init__(self, config: VllmConfig): + super().__init__(config) + + self.patterns: PatternMatcherPass = PatternMatcherPass( + pass_name="rmsnorm_quant_fusion_pass" + ) + + # Make sure fused add patterns are before simple rms norm, + # as the latter is a subset of the former in torch ops + for epsilon in [1e-5, 1e-6]: + # Fuse fused_add_rms_norm + static fp8 quant + FusedAddRMSNormStaticQuantPattern(epsilon, FP8_DTYPE).register( + self.patterns + ) + + # Fuse rms_norm + static fp8 quant + RMSNormStaticQuantPattern(epsilon, FP8_DTYPE).register(self.patterns) + + # Fuse fused_add_rms_norm + dynamic per-token fp8 quant + FusedAddRMSNormDynamicQuantPattern(epsilon, FP8_DTYPE).register( + self.patterns + ) + + # Fuse rms_norm + dynamic per-token fp8 quant + RMSNormDynamicQuantPattern(epsilon, FP8_DTYPE).register(self.patterns) + + self.dump_patterns(config, self.patterns) + + @VllmInductorPass.time_and_log + def __call__(self, graph: fx.Graph): + self.matched_count = self.patterns.apply(graph) + logger.debug("Replaced %s patterns", self.matched_count) + + def uuid(self) -> Any: + return self.hash_source( + self, + RMSNormQuantPattern, + RMSNormStaticQuantPattern, + RMSNormDynamicQuantPattern, + FusedAddRMSNormStaticQuantPattern, + FusedAddRMSNormDynamicQuantPattern, 
+ ) diff --git a/compilation/fusion_attn.py b/compilation/fusion_attn.py new file mode 100644 index 0000000..4f44fae --- /dev/null +++ b/compilation/fusion_attn.py @@ -0,0 +1,359 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from abc import ABC, abstractmethod +from collections.abc import Callable + +import torch +import torch._inductor.pattern_matcher as pm +from torch import fx +from torch._higher_order_ops.auto_functionalize import auto_functionalized +from torch._inductor.pattern_matcher import PatternMatcherPass + +from vllm.attention import Attention +from vllm.config import VllmConfig, get_layers_from_vllm_config +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + QuantKey, + kNvfp4Quant, + kStaticTensorScale, +) +from vllm.platforms import current_platform +from vllm.utils.math_utils import round_up + +from .fusion import QUANT_OPS, empty_bf16, empty_fp32, empty_i32 +from .fx_utils import is_func +from .inductor_pass import enable_fake_mode +from .matcher_utils import MatcherQuantFP8 +from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass + +logger = init_logger(__name__) + +FP8_DTYPE = current_platform.fp8_dtype() +FP4_DTYPE = torch.uint8 + +ATTN_OP = torch.ops.vllm.unified_attention_with_output.default +RESHAPE_OP = torch.ops.aten.reshape.default + + +class AttentionQuantPattern(ABC): + """ + The base class for Attn+Quant fusions. + Should not be used directly. 
+ """ + + def __init__( + self, + layer: Attention, + quant_key: QuantKey, + dtype: torch.dtype, + ): + self.layer = layer + self.layer_name = layer.layer_name + self.num_heads = layer.num_heads + self.head_size = layer.head_size + self.quant_key = quant_key + self.quant_dtype = quant_key.dtype + self.dtype = dtype + + assert self.quant_key in QUANT_OPS, ( + f"unsupported quantization scheme {self.quant_key}" + ) + self.QUANT_OP = QUANT_OPS[self.quant_key] + + def empty(self, *args, **kwargs): + kwargs = {"dtype": self.dtype, "device": "cuda", **kwargs} + return torch.empty(*args, **kwargs) + + def empty_quant(self, *args, **kwargs): + kwargs = {"dtype": self.quant_dtype, "device": "cuda", **kwargs} + return torch.empty(*args, **kwargs) + + @staticmethod + def wrap_trace_fn(trace_fn, *process_fx_fns: Callable[[fx.GraphModule], None]): + def wrapped(*args, **kwargs): + gm = trace_fn(*args, **kwargs) + for process_fx in process_fx_fns: + process_fx(gm) + + return gm + + return wrapped + + @staticmethod + def fx_view_to_reshape(gm: torch.fx.GraphModule): + from torch._inductor.fx_passes.post_grad import view_to_reshape + + view_to_reshape(gm) + + @staticmethod + def remove_noop_permutes(gm: torch.fx.GraphModule): + for node in gm.graph.nodes: + if not is_func(node, torch.ops.aten.permute.default): + continue + + dims = node.args[1] + if any(dim != i for i, dim in enumerate(dims)): + continue + + # this is now an identity op, remove + node.replace_all_uses_with(node.args[0]) + gm.graph.erase_node(node) + + def register_if_supported(self, pm_pass: PatternMatcherPass): + if self.layer.impl.fused_output_quant_supported(self.quant_key): + self._register(pm_pass) + + @abstractmethod + def _register(self, pm_pass: PatternMatcherPass): + raise NotImplementedError + + +class AttentionFp8StaticQuantPattern(AttentionQuantPattern): + """ + Fusion for Attention+Fp8StaticQuant. + + Only triggers when the attention implementation returns True in + `fused_output_quant_supported()`. 
If the pattern is found, the + Fp8StaticQuant op will be removed from the graph, and its scale + will be passed into Attention op as the `output_scale` argument. + """ + + def __init__( + self, + layer: Attention, + dtype: torch.dtype, + symmetric: bool = True, + ): + quant_key = QuantKey( + dtype=FP8_DTYPE, scale=kStaticTensorScale, symmetric=symmetric + ) + super().__init__(layer, quant_key, dtype) + self.quant_matcher = MatcherQuantFP8(quant_key) + + def _register(self, pm_pass: PatternMatcherPass): + def pattern( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + output_attn: torch.Tensor, + scale: torch.Tensor, + ): + at1 = auto_functionalized( + ATTN_OP, + query=q, + key=k, + value=v, + output=output_attn, + layer_name=self.layer_name, + output_scale=None, + output_block_scale=None, + ) + attn_out_view = RESHAPE_OP( + at1[1], [q.shape[0], self.num_heads * self.head_size] + ) + + return self.quant_matcher(attn_out_view, scale)[0] + + def replacement( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + output_attn: torch.Tensor, + scale: torch.Tensor, + ): + # attn output in quant_dtype + output_attn = torch.ops.aten.full.default( + [q.shape[0], self.num_heads, self.head_size], + 0.0, + dtype=self.quant_dtype, + device=q.device, + ) + at1 = auto_functionalized( + ATTN_OP, + query=q, + key=k, + value=v, + output=output_attn, + layer_name=self.layer_name, + output_scale=scale, + output_block_scale=None, + ) + return RESHAPE_OP(at1[1], [-1, self.num_heads * self.head_size]) + + inputs = [ + self.empty(5, self.num_heads, self.head_size), # q + self.empty(5, self.num_heads, self.head_size), # k + self.empty(5, self.num_heads, self.head_size), # v + self.empty(5, self.num_heads, self.head_size), # attn_output + empty_fp32(1, 1), # scale + ] + + pm.register_replacement( + pattern, + replacement, + inputs, + AttentionQuantPattern.wrap_trace_fn( + pm.fwd_only, + AttentionQuantPattern.fx_view_to_reshape, + AttentionQuantPattern.remove_noop_permutes, + ), + 
class AttentionNvfp4QuantPattern(AttentionQuantPattern):
    """
    Fusion for Attention+Nvfp4Quant.

    Only triggers when the attention implementation returns True in
    `fused_output_quant_supported()`. If the pattern is found, the
    Nvfp4Quant op will be removed from the graph; its `input_scale` is
    passed into the Attention op as the `output_scale` argument and its
    block-scale buffer (viewed as FP8) as `output_block_scale`.
    """

    def __init__(self, layer: Attention, dtype: torch.dtype):
        # The quant scheme is fixed to kNvfp4Quant for this pattern.
        super().__init__(layer, kNvfp4Quant, dtype)

    def _register(self, pm_pass: PatternMatcherPass):
        """Register the unfused pattern and its fused replacement on `pm_pass`."""

        def pattern(
            q: torch.Tensor,
            k: torch.Tensor,
            v: torch.Tensor,
            output_attn: torch.Tensor,
            output_quant: torch.Tensor,
            output_scale: torch.Tensor,
            input_scale: torch.Tensor,
        ):
            # Unfused form: attention without any output scale ...
            at1 = auto_functionalized(
                ATTN_OP,
                query=q,
                key=k,
                value=v,
                output=output_attn,
                layer_name=self.layer_name,
                output_scale=None,
                output_block_scale=None,
            )
            # ... flattened to [tokens, num_heads * head_size] ...
            attn_out_view = RESHAPE_OP(
                at1[1], [q.shape[0], self.num_heads * self.head_size]
            )
            # ... followed by a standalone quant op on the flattened output.
            at2 = auto_functionalized(
                self.QUANT_OP,
                output=output_quant,
                input=attn_out_view,
                output_scale=output_scale,
                input_scale=input_scale,
            )
            # The quant op's scale output is reinterpreted as FP8_DTYPE.
            output_scale_view = torch.ops.aten.view.dtype(at2[2], FP8_DTYPE)
            return at2[1], output_scale_view

        def replacement(
            q: torch.Tensor,
            k: torch.Tensor,
            v: torch.Tensor,
            output_attn: torch.Tensor,
            output_quant: torch.Tensor,
            output_scale: torch.Tensor,
            input_scale: torch.Tensor,
        ):
            # attention output in quant_dtype
            # NOTE(review): the last dim is head_size // 2, presumably because
            # two FP4 values are packed per element -- confirm.
            output_attn = torch.ops.aten.full.default(
                [q.shape[0], self.num_heads, self.head_size // 2],
                0.0,
                dtype=self.quant_dtype,
                device=q.device,
            )
            # attention output block scale
            output_scale_view = torch.ops.aten.view.dtype(output_scale, FP8_DTYPE)
            # Fused form: the quant op's input_scale becomes the attention
            # output_scale and the FP8 view becomes output_block_scale.
            at2 = auto_functionalized(
                ATTN_OP,
                query=q,
                key=k,
                value=v,
                output=output_attn,
                layer_name=self.layer_name,
                output_scale=input_scale,
                output_block_scale=output_scale_view,
            )
            output = RESHAPE_OP(at2[1], [-1, self.num_heads * self.head_size // 2])
            return output, at2[2]

        # Example inputs used only for tracing the pattern and replacement.
        inputs = [
            empty_bf16(5, self.num_heads, self.head_size),  # q
            empty_bf16(5, self.num_heads, self.head_size),  # k
            empty_bf16(5, self.num_heads, self.head_size),  # v
            empty_bf16(5, self.num_heads, self.head_size),  # output_attn
            self.empty_quant(5, self.num_heads * self.head_size // 2),  # output_quant
            empty_i32(
                128, round_up(self.num_heads * self.head_size // 16, 4)
            ),  # output_scale
            empty_fp32(1, 1),  # input_scale
        ]

        # The traced graphs are post-processed by fx_view_to_reshape and
        # remove_noop_permutes before being registered.
        pm.register_replacement(
            pattern,
            replacement,
            inputs,
            AttentionQuantPattern.wrap_trace_fn(
                pm.fwd_only,
                AttentionQuantPattern.fx_view_to_reshape,
                AttentionQuantPattern.remove_noop_permutes,
            ),
            pm_pass,
        )
+ """ + + @enable_fake_mode + def __init__(self, config: VllmConfig): + super().__init__(config) + + self.patterns = PatternMatcherPass(pass_name="attn_fusion_pass") + + attn_layers = get_layers_from_vllm_config(config, Attention) + for layer_name, layer in attn_layers.items(): + pattern_fp8 = AttentionFp8StaticQuantPattern( + layer, config.model_config.dtype + ) + pattern_fp8.register_if_supported(self.patterns) + + if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"): + pattern_nvfp4 = AttentionNvfp4QuantPattern( + layer, config.model_config.dtype + ) + pattern_nvfp4.register_if_supported(self.patterns) + + if len(attn_layers) == 0: + logger.warning( + "Attention + quant fusion is enabled, but no attention layers " + "were found in CompilationConfig.static_forward_context " + "so no fusion patterns were registered." + ) + + self.dump_patterns(config, self.patterns) + + @VllmInductorPass.time_and_log + def __call__(self, graph: torch.fx.graph.Graph) -> None: + self.matched_count = self.patterns.apply(graph) + logger.debug("Fused quant onto %s attention nodes", self.matched_count) + + def uuid(self): + return VllmInductorPass.hash_source( + self, + AttentionQuantPattern, + AttentionFp8StaticQuantPattern, + AttentionNvfp4QuantPattern, + ) diff --git a/compilation/fx_utils.py b/compilation/fx_utils.py new file mode 100644 index 0000000..f249795 --- /dev/null +++ b/compilation/fx_utils.py @@ -0,0 +1,91 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import operator +from collections.abc import Iterable, Iterator + +from torch import fx +from torch._higher_order_ops.auto_functionalize import auto_functionalized +from torch._ops import OpOverload, OpOverloadPacket + + +def is_func(node: fx.Node, target) -> bool: + return node.op == "call_function" and node.target == target + + +def is_auto_func(node: fx.Node, op: OpOverload) -> bool: + return is_func(node, auto_functionalized) and 
node.args[0] == op + + +# Returns the first specified node with the given op (if it exists) +def find_specified_fn_maybe(nodes: Iterable[fx.Node], op: OpOverload) -> fx.Node | None: + for node in nodes: + if node.target == op: + return node + return None + + +# Returns the first specified node with the given op +def find_specified_fn(nodes: Iterable[fx.Node], op: OpOverload) -> fx.Node: + node = find_specified_fn_maybe(nodes, op) + assert node is not None, f"Could not find {op} in nodes {nodes}" + return node + + +# Returns the first auto_functionalized node with the given op (if it exists) +def find_auto_fn_maybe(nodes: Iterable[fx.Node], op: OpOverload) -> fx.Node | None: + for node in nodes: + if is_func(node, auto_functionalized) and node.args[0] == op: # noqa + return node + return None + + +# Returns the first auto_functionalized node with the given op +def find_auto_fn(nodes: Iterable[fx.Node], op: OpOverload) -> fx.Node: + node = find_auto_fn_maybe(nodes, op) + assert node is not None, f"Could not find {op} in nodes {nodes}" + return node + + +# Returns the getitem node that extracts the idx-th element from node +# (if it exists) +def find_getitem_maybe(node: fx.Node, idx: int) -> fx.Node | None: + for user in node.users: + if is_func(user, operator.getitem) and user.args[1] == idx: + return user + return None + + +# Returns the getitem node that extracts the idx-th element from node +def find_getitem(node: fx.Node, idx: int) -> fx.Node: + ret = find_getitem_maybe(node, idx) + assert ret is not None, f"Could not find getitem {idx} in node {node}" + return ret + + +# An auto-functionalization-aware utility for finding nodes with a specific op +# Also handles op overload packets and finds all overloads +def find_op_nodes( + op: OpOverload | OpOverloadPacket, graph: fx.Graph +) -> Iterator[fx.Node]: + if isinstance(op, OpOverloadPacket): + for overload in op.overloads(): + overload_op = getattr(op, overload) + yield from find_op_nodes(overload_op, graph) + 
return + + assert isinstance(op, OpOverload) + if not op._schema.is_mutable: + yield from graph.find_nodes(op="call_function", target=op) + + for n in graph.find_nodes(op="call_function", target=auto_functionalized): + if n.args[0] == op: + yield n + + +# Asserts that the node only has one user and returns it +# Even if a node has only 1 user, it might share storage with another node, +# which might need to be taken into account. +def get_only_user(node: fx.Node) -> fx.Node: + assert len(node.users) == 1 + return next(iter(node.users)) diff --git a/compilation/inductor_pass.py b/compilation/inductor_pass.py new file mode 100644 index 0000000..9af635a --- /dev/null +++ b/compilation/inductor_pass.py @@ -0,0 +1,133 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import functools +import hashlib +import inspect +import json +import types +from collections.abc import Callable +from contextlib import contextmanager +from typing import Any + +import torch +from torch import fx +from torch._subclasses.fake_tensor import FakeTensorMode, unset_fake_temporarily + +from vllm.utils.torch_utils import is_torch_equal_or_newer + +if is_torch_equal_or_newer("2.6"): + from torch._inductor.custom_graph_pass import CustomGraphPass +else: + # CustomGraphPass is not present in 2.5 or lower, import our version + from .torch25_custom_graph_pass import ( + Torch25CustomGraphPass as CustomGraphPass, + ) + +_pass_context = None + + +class PassContext: + def __init__(self, runtime_shape: int | None): + self.runtime_shape = runtime_shape + + +def get_pass_context() -> PassContext: + """Get the current pass context.""" + assert _pass_context is not None + return _pass_context + + +@contextmanager +def pass_context(runtime_shape: int | None): + """A context manager that stores the current pass context, + usually it is a list of sizes to specialize. 
+ """ + global _pass_context + prev_context = _pass_context + _pass_context = PassContext(runtime_shape) + try: + yield + finally: + _pass_context = prev_context + + +class InductorPass(CustomGraphPass): + """ + A custom graph pass that uses a hash of its source as the UUID. + This is defined as a convenience and should work in most cases. + """ + + def uuid(self) -> Any: + """ + Provide a unique identifier for the pass, used in Inductor code cache. + This should depend on the pass implementation, so that changes to the + pass result in recompilation. + By default, the object source is hashed. + """ + return InductorPass.hash_source(self) + + @staticmethod + def hash_source(*srcs: str | Any): + """ + Utility method to hash the sources of functions or objects. + :param srcs: strings or objects to add to the hash. + Objects and functions have their source inspected. + :return: + """ + hasher = hashlib.sha256() + for src in srcs: + if isinstance(src, str): + src_str = src + elif isinstance(src, (types.FunctionType, type)): + src_str = inspect.getsource(src) + else: + # object instance + src_str = inspect.getsource(src.__class__) + hasher.update(src_str.encode("utf-8")) + return hasher.hexdigest() + + @staticmethod + def hash_dict(dict_: dict[Any, Any]): + """ + Utility method to hash a dictionary, can alternatively be used for uuid. + :return: A sha256 hash of the json rep of the dictionary. + """ + encoded = json.dumps(dict_, sort_keys=True).encode("utf-8") + return hashlib.sha256(encoded).hexdigest() + + def is_applicable(self, shape: int | None): + return True + + +class CallableInductorPass(InductorPass): + """ + This class is a wrapper for a callable that automatically provides an + implementation of the UUID. 
+ """ + + def __init__(self, callable: Callable[[fx.Graph], None], uuid: Any | None = None): + self.callable = callable + self._uuid = self.hash_source(callable) if uuid is None else uuid + + def __call__(self, graph: torch.fx.Graph): + self.callable(graph) + + def uuid(self) -> Any: + return self._uuid + + +def enable_fake_mode(fn: Callable[..., Any]) -> Callable[..., Any]: + """ + Applies a FakeTensorMode context. This is useful when you don't want to + create or run things with real tensors. + """ + + @functools.wraps(fn) + def fn_new(*args, **kwargs) -> Any: + with torch._guards.tracing(None), unset_fake_temporarily(), FakeTensorMode(): + result = fn(*args, **kwargs) + + return result + + return fn_new diff --git a/compilation/matcher_utils.py b/compilation/matcher_utils.py new file mode 100644 index 0000000..38eb4e5 --- /dev/null +++ b/compilation/matcher_utils.py @@ -0,0 +1,317 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod + +import torch +from torch._higher_order_ops import auto_functionalized +from torch._ops import OpOverload + +from vllm.config import get_current_vllm_config +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + QuantKey, + _normalize_quant_group_shape, + kFp8DynamicTensorSym, + kFp8DynamicTokenSym, + kFp8StaticTensorSym, + kNvfp4Quant, +) +from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding +from vllm.platforms import current_platform + +RMS_OP = torch.ops._C.rms_norm.default +RMS_ADD_OP = torch.ops._C.fused_add_rms_norm.default +ROTARY_OP = torch.ops._C.rotary_embedding.default +FLASHINFER_ROTARY_OP = torch.ops.vllm.flashinfer_rotary_embedding.default + +QUANT_OPS: dict[QuantKey, OpOverload] = { + 
class MatcherCustomOp(ABC):
    """
    Base class for matcher helpers that can emit either the custom-op form
    or the native form of an operation.

    The model dtype and device are read from the current vLLM config at
    construction time and are used by the `empty*` tensor factories.
    """

    def __init__(self, enabled: bool):
        config = get_current_vllm_config()

        model_config = config.model_config
        self.model_dtype = model_config.dtype if model_config else None

        device_config = config.device_config
        self.device = device_config.device if device_config else None

        self.enabled = enabled
        # Bind the implementation once so __call__ is a plain dispatch.
        if enabled:
            self.forward = self.forward_custom
        else:
            self.forward = self.forward_native

    @abstractmethod
    def forward_custom(self, *args, **kws):
        """Implementation used when the matcher is enabled."""

    @abstractmethod
    def forward_native(self, *args, **kws):
        """Implementation used when the matcher is disabled."""

    def __call__(self, *args, **kws):
        return self.forward(*args, **kws)

    def empty(self, *args, **kws):
        """Uninitialized tensor with the model dtype on the configured device."""
        return torch.empty(*args, dtype=self.model_dtype, device=self.device, **kws)

    def empty_int64(self, *args, **kws):
        """Uninitialized int64 tensor on the configured device."""
        return torch.empty(*args, dtype=torch.int64, device=self.device, **kws)

    def empty_f32(self, *args, **kws):
        """Uninitialized float32 tensor on the configured device."""
        return torch.empty(*args, dtype=torch.float32, device=self.device, **kws)

    def inputs(self) -> list[torch.Tensor]:
        """Utility for inputs to the pattern"""
        raise NotImplementedError
self.kv_size = self.num_kv_heads * self.head_size + self.rotary_dim = head_size + if use_flashinfer: + self.rotary_op = FLASHINFER_ROTARY_OP + else: + self.rotary_op = ROTARY_OP + + def inputs(self) -> list[torch.Tensor]: + positions = self.empty_int64(5) + query = self.empty(5, self.q_size) + key = self.empty(5, self.kv_size) + cos_sin_cache = self.empty(4096, self.rotary_dim) + return [positions, query, key, cos_sin_cache] + + def forward_custom( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor | None, + cos_sin_cache: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + result = auto_functionalized( + self.rotary_op, + positions=positions, + query=query, + key=key, + head_size=self.head_size, + cos_sin_cache=cos_sin_cache, + is_neox=self.is_neox, + ) + query_out = result[1] + key_out = result[2] if len(result) > 2 else None + return query_out, key_out + + def forward_native( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor | None, + cos_sin_cache: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + return RotaryEmbedding.forward_static( + positions, + query, + key, + self.head_size, + self.rotary_dim, + cos_sin_cache, + self.is_neox, + ) + + +class MatcherRMSNorm(MatcherCustomOp): + def __init__(self, epsilon: float, enabled: bool | None = None): + if enabled is None: + enabled = RMSNorm.enabled() + + super().__init__(enabled) + self.epsilon = epsilon + + def inputs(self): + input = self.empty(5, 16) if self.enabled else self.empty_f32(5, 16) + weight = self.empty(16) + return [input, weight] + + def forward_custom( + self, + input: torch.Tensor, + weight: torch.Tensor, + ) -> torch.Tensor: + result = torch.empty_like(input) + # TODO: support non-contiguous input for RMSNorm and remove this + input_contiguous = input.contiguous() + _, result = auto_functionalized( + RMS_OP, + result=result, + input=input_contiguous, + weight=weight, + epsilon=self.epsilon, + ) + + return result + 
class MatcherQuantFP8(MatcherCustomOp):
    """
    Matcher helper for FP8 quantization.

    The custom form emits the `auto_functionalized` quant op registered for
    `quant_key` in QUANT_OPS; the native form delegates to a QuantFP8 layer
    built from the same scale description.
    """

    def __init__(self, quant_key: QuantKey, enabled: bool | None = None):
        """
        :param quant_key: quantization scheme; must be present in QUANT_OPS,
            use the platform FP8 dtype and have no second-level scale.
        :param enabled: whether to emit the custom op; defaults to
            `QuantFP8.enabled()` when None.
        """
        if enabled is None:
            enabled = QuantFP8.enabled()

        super().__init__(enabled)
        self.quant_key = quant_key
        assert quant_key in QUANT_OPS, f"unsupported quantization scheme {quant_key}"
        self.QUANT_OP = QUANT_OPS[quant_key]

        assert quant_key.dtype == current_platform.fp8_dtype(), (
            "Only the platform FP8 dtype is supported by MatcherQuantFP8, "
            f"got {quant_key.dtype}"
        )
        assert quant_key.scale2 is None
        self.quant_fp8 = QuantFP8(quant_key.scale.static, quant_key.scale.group_shape)

    def forward_custom(
        self,
        input: torch.Tensor,
        scale: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Emit the functionalized quant op.

        For a static scheme `scale` is required and returned unchanged; for a
        dynamic scheme `scale` must be None and a scale buffer is allocated
        via `make_scale` and filled by the op.
        """
        result = torch.empty(
            input.shape, device=input.device, dtype=self.quant_key.dtype
        )

        if self.quant_key.scale.static:
            assert scale is not None
            _, result = auto_functionalized(
                self.QUANT_OP, result=result, input=input, scale=scale
            )
            return result, scale
        else:
            assert scale is None
            scale = self.make_scale(input)
            _, result, scale = auto_functionalized(
                self.QUANT_OP, result=result, input=input, scale=scale, scale_ub=None
            )
            return result, scale

    def forward_native(
        self,
        input: torch.Tensor,
        scale: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Quantize through the wrapped QuantFP8 layer."""
        return self.quant_fp8(input, scale)

    def make_scale(self, input: torch.Tensor):
        """
        Allocate an uninitialized float32 scale buffer for `input`.

        The buffer has one entry per quantization group: each of the first
        two input dimensions is divided by the normalized group shape.
        """
        normalized_group_shape = _normalize_quant_group_shape(
            input, self.quant_key.scale.group_shape
        )
        scale_shape = (
            input.shape[0] // normalized_group_shape[0],
            input.shape[1] // normalized_group_shape[1],
        )

        return torch.empty(scale_shape, device=input.device, dtype=torch.float32)

    def inputs(self) -> list[torch.Tensor]:
        """Example inputs for tracing: the input, plus a scale if static."""
        input = self.empty(5, 16)
        if self.quant_key.scale.static:
            return [input, self.empty_f32(1, 1)]

        return [input]
def start_monitoring_torch_compile(vllm_config: VllmConfig):
    """
    Record the compile start time and, when a debug dump path is configured
    under VLLM_COMPILE mode, start a depyf debug session that stays open
    until `end_monitoring_torch_compile` is called.
    """
    global torch_compile_start_time, context_manager
    torch_compile_start_time = time.time()

    compilation_config: CompilationConfig = vllm_config.compilation_config
    path = vllm_config.compile_debug_dump_path()
    if not (compilation_config.mode == CompilationMode.VLLM_COMPILE and path):
        return

    # Imported lazily: only needed when dumping is requested.
    import depyf

    path.mkdir(parents=True, exist_ok=True)
    logger.debug("Dumping depyf output to %s", path)
    context_manager = depyf.prepare_debug(path.as_posix())
    context_manager.__enter__()


def end_monitoring_torch_compile(vllm_config: VllmConfig):
    """
    Log the total compilation time and close the depyf debug session, if one
    was opened. Does nothing outside VLLM_COMPILE mode.
    """
    global context_manager

    compilation_config: CompilationConfig = vllm_config.compilation_config
    if compilation_config.mode != CompilationMode.VLLM_COMPILE:
        return

    logger.info_once(
        "torch.compile takes %.2f s in total",
        compilation_config.compilation_time,
        scope="local",
    )
    if context_manager is None:
        return
    context_manager.__exit__(None, None, None)
    context_manager = None
class NoOpEliminationPass(VllmInductorPass):
    """
    This is an inductor pass that removes redundant reshape/slice operations.
    It is required for RMSNorm-quant fusion to work properly.
    That's because apply_fp8_linear adds a reshape, which is redundant
    in the 2D-case. Additionally, torch internal no-op elimination pass does
    not handle certain slice variants.

    Cases handled:
      1. A chain of reshapes is equivalent to the last reshape called on the
      base tensor (input of the first reshape).
      2. A reshape that produces the shape of the input is redundant
      3. A slice that produces the shape of the input is redundant

    Example graph 1:
    mul_1: "f16[s0, 4096]" = ...
    view_1: "f16[s0, 128, 32]" = torch.reshape(mul_1, [-1, 128, 32])
    view_2: "f16[s0, 4096]" = torch.reshape(view_1, [-1, 4096])
    view_3: "f16[s0, 128, 32]" = torch.reshape(view_2, [-1, 128, 32])

    Can be replaced with:
    mul_1: "f16[s0, 4096]" = ...
    view_3: "f16[s0, 128, 32]" = ...

    Example graph 2:
    getitem_1: "f16[s0, 4096]" = ...
    view_1: "f16[s0, 4096]" = torch.reshape(getitem_1, [-1, 4096])
    at = auto_functionalized(static_scaled_fp8_quant, input = view_1, ...)
    out: "f8e4m3fn[s0, 4096]" = at[1]

    Can be replaced with:
    getitem_1: "f16[s0, 4096]" = ...
    at = auto_functionalized(static_scaled_fp8_quant, input = getitem_1, ...)
    out: "f8e4m3fn[s0, 4096]" = at[1]

    Example graph 3:
    arg0: "s0" = SymInt(s0)
    scaled_mm: "f16[s0, 4096]" = ...
    slice_1: "f16[s0, 4096]" = torch.slice(scaled_mm, -1, 0, arg0)
    at = auto_functionalized(fused_add_rms_norm, input = slice_1, ...)
    out: "f16[s0, 4096]" = torch.slice_scatter(scaled_mm, at[1], 0, 0, arg0)

    Can be replaced with:
    arg0: "s0" = SymInt(s0)
    scaled_mm: "f16[s0, 4096]" = ...
    at = auto_functionalized(fused_add_rms_norm, input = scaled_mm, ...)
    out: "f16[s0, 4096]" = at[1]
    """

    @VllmInductorPass.time_and_log
    def __call__(self, graph: torch.fx.Graph):
        """Remove no-op reshape, slice and slice_scatter nodes from `graph`."""
        count = 0
        # Remove no-op reshapes/views:
        for node in graph.nodes:
            if is_func(node, torch.ops.aten.reshape.default):
                # Case 1: rewrite reshape chains to reshapes on the base tensor
                src = node.args[0]
                # If the input is a reshape, rebind to that node
                if is_func(src, torch.ops.aten.reshape.default):
                    # The new input is guaranteed not to be a reshape,
                    # because we process nodes in order
                    node.update_arg(0, src.args[0])
                    if len(src.users) == 0:
                        graph.erase_node(src)
                        count += 1

            # remove reshape/slice if it produces the original shape
            if is_func(node, torch.ops.aten.reshape.default) or is_func(
                node, torch.ops.aten.slice.Tensor
            ):
                src = node.args[0]
                input_shape = src.meta["val"].shape
                output_shape = node.meta["val"].shape
                if self.all_dims_equivalent(input_shape, output_shape):
                    node.replace_all_uses_with(src)
                    graph.erase_node(node)
                    count += 1
            elif is_func(node, torch.ops.aten.slice_scatter.default):
                # Only the base and the scattered view are needed. Taking
                # just these two also works when the trailing
                # dim/start/end arguments are left at their defaults and
                # are therefore absent from node.args.
                base, view = node.args[:2]
                base_shape = base.meta["val"].shape
                view_shape = view.meta["val"].shape

                if self.all_dims_equivalent(base_shape, view_shape):
                    node.replace_all_uses_with(view)
                    graph.erase_node(node)
                    count += 1

        logger.debug("Removed %s no-op reshapes and slices", count)

    # ---------------------- Shape comparison helpers ----------------------
    def dims_equivalent(self, dim: int | SymInt, i_dim: int | SymInt) -> bool:
        """
        This function checks if two dimensions are equivalent.
        :param dim: The dimension arg to reshape/slice
        :param i_dim: The corresponding dimension in the input tensor
        :return: Are the dimensions equivalent?

        There are two cases in which the dimensions are equivalent:
        1. The dimensions are equal (both integers)
        2. The dimensions both correspond to the same SymInt
        """
        # Case 1
        if isinstance(i_dim, int) and isinstance(dim, int):
            return dim == i_dim
        # Case 2
        if isinstance(i_dim, SymInt) and isinstance(dim, SymInt):
            return dim == i_dim
        return False

    def all_dims_equivalent(
        self, dims: Iterable[int | SymInt], i_dims: Iterable[int | SymInt]
    ) -> bool:
        """
        Check that two shapes have the same rank and pairwise-equivalent dims.
        :param dims: The output shape
        :param i_dims: The input shape
        :return: True if every dimension is equivalent (see dims_equivalent)
        """
        dims_ = list(dims)
        i_dims_ = list(i_dims)
        if len(dims_) != len(i_dims_):
            # Different ranks can't be equivalent
            return False
        # Compare the materialized lists: the arguments may be one-shot
        # iterators that list() above has already consumed.
        return all(self.dims_equivalent(s, i_s) for s, i_s in zip(dims_, i_dims_))
@contextlib.contextmanager
def inductor_partition_rule_context(splitting_ops: list[str]):
    """Context manager to temporarily register Inductor partition rules.

    Registers custom partition rules for specified operators, forcing the
    Inductor scheduler to partition the graph at these operators. The rules
    are automatically restored to their previous state on exit.

    Args:
        splitting_ops: List of operator names to partition on. When empty
            (or None), nothing is registered and the Inductor config is
            left untouched.
    """
    if not splitting_ops:
        logger.debug("No partition ops provided; skipping rule registration.")
        yield
        return

    # Save current state before registering
    saved_splitting_ops: list[str] = list(
        torch._inductor.config.custom_should_partition_ops
    )
    # Install a snapshot rather than the caller's list object, so that later
    # mutation of `splitting_ops` by the caller cannot change the global
    # Inductor config while the context is active.
    torch._inductor.config.custom_should_partition_ops = list(splitting_ops)

    logger.debug(
        "Registered inductor partition rules for %d operators", len(splitting_ops)
    )

    try:
        yield
    finally:
        # Restore previous state, also when the body raised
        torch._inductor.config.custom_should_partition_ops = saved_splitting_ops
        logger.debug("Restored previous partition rules state.")
class PostGradPassManager(CustomGraphPass):
    """
    Manager that owns and runs the post-grad passes.

    It builds the pass list from the pass config (`configure`), accepts extra
    passes (`add`), runs them in order, and exposes a uuid for the Inductor
    code cache.

    Execution order in `__call__`:
    1. every pass in `self.passes` that is applicable to the runtime shape
    2. post_cleanup
    3. fix_functionalization
    This way, all passes before the last step operate on a functionalized
    graph.
    """

    def __init__(self):
        self.passes: list[InductorPass] = []

    @with_pattern_match_debug
    def __call__(self, graph: fx.Graph):
        # Restart the dump index for this run.
        VllmInductorPass.dump_prefix = 0

        runtime_shape = get_pass_context().runtime_shape
        for current_pass in self.passes:
            if not current_pass.is_applicable(runtime_shape):
                logger.debug(
                    "Skipping %s with shape %s", current_pass, runtime_shape
                )
                continue
            current_pass(graph)
            VllmInductorPass.dump_prefix += 1

        # post-cleanup must see a functional graph, so it runs
        # before fix_functionalization
        self.post_cleanup(graph)
        VllmInductorPass.dump_prefix += 1

        # fix_functionalization is always the final step
        self.fix_functionalization(graph)
        VllmInductorPass.dump_prefix = None  # Cleanup index

    def configure(self, config: VllmConfig):
        """Instantiate the passes enabled in the compilation pass config."""
        self.pass_config = config.compilation_config.pass_config
        enabled = self.pass_config

        # The current vllm config must be set so CustomOp instances
        # can be traced while the passes are constructed
        with set_current_vllm_config(config, check_compile=False):
            if enabled.enable_noop:
                self.passes.append(NoOpEliminationPass(config))

            if enabled.enable_sequence_parallelism:
                self.passes.append(SequenceParallelismPass(config))
                # async TP is only considered together with sequence parallelism
                if enabled.enable_async_tp:
                    self.passes.append(AsyncTPPass(config))

            if enabled.enable_fi_allreduce_fusion:
                self.passes.append(AllReduceFusionPass(config))

            if enabled.enable_fusion:
                self.passes.append(RMSNormQuantFusionPass(config))
                self.passes.append(ActivationQuantFusionPass(config))

            if enabled.enable_attn_fusion:
                self.passes.append(AttnFusionPass(config))

            if enabled.enable_qk_norm_rope_fusion:
                self.passes.append(QKNormRoPEFusionPass(config))

            # needs a functional graph
            self.post_cleanup = PostCleanupPass(config)
            self.fix_functionalization = FixFunctionalizationPass(config)

    def add(self, pass_: InductorPass):
        """Append an extra pass; it runs after the already registered ones."""
        assert isinstance(pass_, InductorPass)
        self.passes.append(pass_)

    def uuid(self):
        """
        The PostGradPassManager is set as a custom pass in the Inductor and
        affects compilation caching. Its uuid is a hash over the pass config
        uuid, the uuids of all registered passes and the uuid of
        fix_functionalization. See InductorPass for more info.
        """
        config_uuid = self.pass_config.uuid()
        pass_uuids = [registered.uuid() for registered in self.passes]
        pass_uuids.append(self.fix_functionalization.uuid())

        return InductorPass.hash_dict(
            {"pass_config": config_uuid, "passes": pass_uuids}
        )
+ """ + self.graph = graph + self.vllm_config = vllm_config + self.compilation_config = vllm_config.compilation_config + self.piecewise_compile_index = piecewise_compile_index + self.total_piecewise_compiles = total_piecewise_compiles + self.vllm_backend = vllm_backend + + self.is_first_graph = piecewise_compile_index == 0 + self.is_last_graph = piecewise_compile_index == total_piecewise_compiles - 1 + + self.is_full_graph = total_piecewise_compiles == 1 + + self.compile_sizes: set[int] = set(self.compilation_config.compile_sizes) + + self.first_run_finished = False + + self.compiled_graph_for_general_shape = compiled_graph_for_general_shape # noqa + + self.sym_shape_indices = sym_shape_indices + + self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG" + + # the entries for different shapes that we need to compile + self.concrete_size_entries: dict[int, ConcreteSizeEntry] = {} + + # to_be_compiled_sizes tracks the remaining sizes to compile, + # and updates during the compilation process, so we need to copy it + self.to_be_compiled_sizes: set[int] = self.compile_sizes.copy() + + # We only keep compilation management inside this class directly. 
+ for shape in self.compile_sizes: + self.concrete_size_entries[shape] = ConcreteSizeEntry( + runtime_shape=shape, + runnable=self.compiled_graph_for_general_shape, + ) + + def check_for_ending_compilation(self): + if self.is_last_graph and not self.to_be_compiled_sizes: + # no specific sizes to compile + # save the hash of the inductor graph for the next run + self.vllm_backend.compiler_manager.save_to_file() + end_monitoring_torch_compile(self.vllm_config) + + def __call__(self, *args) -> Any: + if not self.first_run_finished: + self.first_run_finished = True + self.check_for_ending_compilation() + return self.compiled_graph_for_general_shape(*args) + + runtime_shape = args[self.sym_shape_indices[0]] + + if runtime_shape not in self.concrete_size_entries: + # we don't need to do anything for this shape + return self.compiled_graph_for_general_shape(*args) + + entry = self.concrete_size_entries[runtime_shape] + + if not entry.compiled: + entry.compiled = True + self.to_be_compiled_sizes.remove(runtime_shape) + # args are real arguments + entry.runnable = self.vllm_backend.compiler_manager.compile( + self.graph, + args, + self.compilation_config.inductor_compile_config, + self.compilation_config, + graph_index=self.piecewise_compile_index, + num_graphs=self.total_piecewise_compiles, + runtime_shape=runtime_shape, + ) + + # finished compilations for all required shapes + if self.is_last_graph and not self.to_be_compiled_sizes: + self.check_for_ending_compilation() + + return entry.runnable(*args) diff --git a/compilation/post_cleanup.py b/compilation/post_cleanup.py new file mode 100644 index 0000000..5511751 --- /dev/null +++ b/compilation/post_cleanup.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from torch import fx + +from vllm.compilation.vllm_inductor_pass import VllmInductorPass + + +class PostCleanupPass(VllmInductorPass): + """ + This pass performs cleanup after custom 
passes. + It topologically sorts the graph and removes unused nodes. + This is needed because the pattern matcher does not guarantee producing + a topologically sorted graph, and there may be unused nodes left around. + """ + + @VllmInductorPass.time_and_log + def __call__(self, graph: fx.Graph) -> None: + from torch._inductor.pattern_matcher import stable_topological_sort + + stable_topological_sort(graph) + graph.eliminate_dead_code() diff --git a/compilation/qk_norm_rope_fusion.py b/compilation/qk_norm_rope_fusion.py new file mode 100644 index 0000000..e3c399e --- /dev/null +++ b/compilation/qk_norm_rope_fusion.py @@ -0,0 +1,238 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable + +import torch +import torch._inductor.pattern_matcher as pm +from torch import fx +from torch._higher_order_ops.auto_functionalize import auto_functionalized +from torch._inductor.pattern_matcher import PatternMatcherPass + +from vllm.attention import Attention +from vllm.config import VllmConfig, get_layers_from_vllm_config +from vllm.logger import init_logger +from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding + +from .fusion import empty_bf16, empty_fp32, empty_i64 +from .inductor_pass import enable_fake_mode +from .matcher_utils import MatcherRMSNorm, MatcherRotaryEmbedding +from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass + +logger = init_logger(__name__) + +FUSED_QK_ROPE_OP = torch.ops._C.fused_qk_norm_rope.default + + +class QkNormRopePattern: + """ + Match the unfused sequence in attention blocks and replace with the fused op. 
+ + Unfused (conceptually): + q, k, v = split(qkv, [qsz, kvsz, kvsz], -1) + qh = reshape(q, [-1, num_heads, head_dim]) + kh = reshape(k, [-1, num_kv_heads, head_dim]) + qn = rms_norm(qh, q_weight, eps) + kn = rms_norm(kh, k_weight, eps) + qf = reshape(qn, [-1, num_heads * head_dim]) + kf = reshape(kn, [-1, num_kv_heads * head_dim]) + qf, kf = rotary_embedding(positions, qf, kf, head_dim, cos_sin_cache, is_neox) + return qf, kf, v + + Fused replacement: + fused_qk_norm_rope(qkv, num_heads, num_kv_heads, num_kv_heads, head_dim, + eps, q_weight, k_weight, cos_sin_cache, is_neox, + positions.view(-1)) + return split(qkv, [qsz, kvsz, kvsz], -1) + """ + + def __init__( + self, + head_dim: int, + num_heads: int, + num_kv_heads: int, + eps: float, + is_neox: bool, + rope_flashinfer: bool = False, + ) -> None: + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = head_dim + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.eps = eps + self.rmsnorm_matcher = MatcherRMSNorm(eps) + self.is_neox = is_neox + self.rope_flashinfer = rope_flashinfer + self.rope_matcher = MatcherRotaryEmbedding( + is_neox=is_neox, + head_size=self.head_dim, + num_heads=self.num_heads, + num_kv_heads=self.num_kv_heads, + use_flashinfer=self.rope_flashinfer, + ) + + def get_inputs(self): + # Sample inputs to help pattern tracing + T = 5 + qkv = empty_bf16(T, self.q_size + 2 * self.kv_size) + positions = empty_i64(T) + q_weight = empty_bf16(1, self.head_dim) + k_weight = empty_bf16(1, self.head_dim) + if self.rope_flashinfer: + cos_sin_cache = empty_fp32(4096, self.head_dim) + else: + cos_sin_cache = empty_bf16(4096, self.head_dim) + return [ + qkv, + positions, + q_weight, + k_weight, + cos_sin_cache, + ] + + @staticmethod + def wrap_trace_fn(trace_fn, *process_fx_fns: Callable[[fx.GraphModule], None]): + def wrapped(*args, **kwargs): + gm = trace_fn(*args, **kwargs) + for process_fx in process_fx_fns: + process_fx(gm) + 
+ return gm + + return wrapped + + @staticmethod + def fx_view_to_reshape(gm: torch.fx.GraphModule): + from torch._inductor.fx_passes.post_grad import view_to_reshape + + view_to_reshape(gm) + + def register(self, pm_pass: PatternMatcherPass): + def pattern( + qkv: torch.Tensor, + positions: torch.Tensor, + q_weight: torch.Tensor, + k_weight: torch.Tensor, + cos_sin_cache: torch.Tensor, + ): + # split qkv -> q,k,v + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + + # Q path: view -> RMS -> view back to q.shape + q_by_head = q.view( + *q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim + ) + q_normed_by_head = self.rmsnorm_matcher(q_by_head, q_weight) + q_flat = q_normed_by_head.view(q.shape) + + # K path: view -> RMS -> view back to k.shape + k_by_head = k.view( + *k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim + ) + k_normed_by_head = self.rmsnorm_matcher(k_by_head, k_weight) + k_flat = k_normed_by_head.view(k.shape) + + # RoPE: apply to flattened q/k + q_rope, k_rope = self.rope_matcher(positions, q_flat, k_flat, cos_sin_cache) + return q_rope, k_rope, v + + def replacement( + qkv: torch.Tensor, + positions: torch.Tensor, + q_weight: torch.Tensor, + k_weight: torch.Tensor, + cos_sin_cache: torch.Tensor, + ): + # Run fused qk_norm_rope op + result = auto_functionalized( + FUSED_QK_ROPE_OP, + qkv=qkv, + num_heads_q=self.num_heads, + num_heads_k=self.num_kv_heads, + num_heads_v=self.num_kv_heads, + head_dim=self.head_dim, + eps=self.eps, + q_weight=q_weight, + k_weight=k_weight, + cos_sin_cache=cos_sin_cache, + is_neox=self.is_neox, + position_ids=positions.view(-1), + ) + result_qkv = result[1] + + # Split back to q,k,v and return + return result_qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + + # NOTE: use fx_view_to_reshape to unify view/reshape to simplify + # pattern and increase matching opportunities + pm.register_replacement( + pattern, + replacement, + self.get_inputs(), + 
QkNormRopePattern.wrap_trace_fn( + pm.fwd_only, + QkNormRopePattern.fx_view_to_reshape, + ), + pm_pass, + ) + + +class QKNormRoPEFusionPass(VllmPatternMatcherPass): + """Fuse Q/K RMSNorm + RoPE into fused_qk_norm_rope when the custom op exists.""" + + @enable_fake_mode + def __init__(self, config: VllmConfig): + super().__init__(config) + self.patterns: PatternMatcherPass = PatternMatcherPass( + pass_name="qk_norm_rope_fusion_pass" + ) + + dtype = config.model_config.dtype + if dtype not in (torch.bfloat16, torch.float16): + logger.warning_once( + "QK Norm+RoPE fusion not enabled: unsupported dtype %s", dtype + ) + return + + # use one attn layer to get meta (such as head_dim) for QkNormRopePattern + attn_layers: dict[str, Attention] = get_layers_from_vllm_config( + config, Attention + ) + if len(attn_layers) == 0: + logger.warning_once( + "QK Norm+RoPE fusion enabled, but no Attention layers were discovered." + ) + return + layer = next(iter(attn_layers.values())) + + for epsilon in [1e-5, 1e-6]: + for neox in [True, False]: + if RotaryEmbedding.enabled(): + for rope_flashinfer in [False, True]: + QkNormRopePattern( + head_dim=layer.head_size, + num_heads=layer.num_heads, + num_kv_heads=layer.num_kv_heads, + eps=epsilon, + is_neox=neox, + rope_flashinfer=rope_flashinfer, + ).register(self.patterns) + else: + QkNormRopePattern( + head_dim=layer.head_size, + num_heads=layer.num_heads, + num_kv_heads=layer.num_kv_heads, + eps=epsilon, + is_neox=neox, + ).register(self.patterns) + + self.dump_patterns(config, self.patterns) + + @VllmInductorPass.time_and_log + def __call__(self, graph: fx.Graph) -> None: + self.matched_count = self.patterns.apply(graph) + logger.debug("Fused QK Norm+RoPE on %s sites", self.matched_count) + + def uuid(self): + return VllmInductorPass.hash_source(self, QkNormRopePattern) diff --git a/compilation/sequence_parallelism.py b/compilation/sequence_parallelism.py new file mode 100644 index 0000000..bb4dcf1 --- /dev/null +++ 
b/compilation/sequence_parallelism.py @@ -0,0 +1,363 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import functools + +import torch +import torch._inductor.pattern_matcher as pm +import torch.fx as fx +from torch._inductor.pattern_matcher import PatternMatcherPass + +from vllm.config import VllmConfig +from vllm.distributed import get_tp_group, tensor_model_parallel_all_reduce +from vllm.distributed.parallel_state import get_tensor_model_parallel_world_size +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + kFp8StaticTensorSym, +) +from vllm.platforms import current_platform + +from .inductor_pass import enable_fake_mode +from .matcher_utils import MatcherFusedAddRMSNorm, MatcherQuantFP8, MatcherRMSNorm +from .noop_elimination import NoOpEliminationPass +from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass + +logger = init_logger(__name__) + + +def get_first_out_wrapper(fn): + @functools.wraps(fn) + def wrapper(*args): + return fn(*args)[0] + + return wrapper + + +class _SequenceParallelPatternHelper: + """Helper for sequence parallelism patterns.""" + + def __init__( + self, + epsilon: float, + dtype: torch.dtype, + device: str, + ): + self.epsilon = epsilon + self.dtype = dtype + self.device = device + self.tp_group = get_tp_group() + self.tp_size = get_tensor_model_parallel_world_size() + + def _all_reduce(self, x: torch.Tensor) -> torch.Tensor: + return tensor_model_parallel_all_reduce(x) + + def _reduce_scatter(self, x: torch.Tensor) -> torch.Tensor: + return torch.ops.vllm.reduce_scatter.default( + x, dim=0, world_size=self.tp_size, group_name=self.tp_group.unique_name + ) + + def _all_gather(self, x: torch.Tensor) -> torch.Tensor: + return torch.ops.vllm.all_gather.default( + x, dim=0, world_size=self.tp_size, group_name=self.tp_group.unique_name + ) + + +class 
FirstAllReduceRMSNormPattern(_SequenceParallelPatternHelper): + def __init__(self, epsilon: float, dtype: torch.dtype, device: str): + super().__init__(epsilon, dtype, device) + self.rmsnorm_matcher = MatcherRMSNorm(epsilon) + + def get_inputs(self): + input = torch.empty([1, 8, 4], device=self.device, dtype=self.dtype) + arg3_1 = torch.empty([4], device=self.device, dtype=self.dtype) + + return [input, arg3_1] + + def register(self, pm_pass: PatternMatcherPass): + def pattern( + input: torch.Tensor, + arg3_1: torch.Tensor, + ): + all_reduce = self._all_reduce(input) + rmsnorm = self.rmsnorm_matcher(all_reduce, arg3_1) + + return rmsnorm, all_reduce + + def replacement( + input: torch.Tensor, + arg3_1: torch.Tensor, + ): + reduce_scatter = self._reduce_scatter(input) + + rmsnorm = self.rmsnorm_matcher(reduce_scatter, arg3_1) + all_gather = self._all_gather(rmsnorm) + return all_gather, reduce_scatter + + pm.register_replacement( + pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass + ) + + +class MiddleAllReduceRMSNormPattern(_SequenceParallelPatternHelper): + def __init__(self, epsilon: float, dtype: torch.dtype, device: str): + super().__init__(epsilon, dtype, device) + self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon) + + def get_inputs(self): + mm_1 = torch.empty([4, 4], device=self.device, dtype=self.dtype) + + residual = torch.empty([4, 4], device=self.device, dtype=self.dtype) + rms_norm_weights = torch.empty([4, 4], device=self.device, dtype=self.dtype) + + return [ + residual, + mm_1, + rms_norm_weights, + ] + + def register(self, pm_pass: PatternMatcherPass): + def pattern( + residual: torch.Tensor, + mm_1: torch.Tensor, + rms_norm_weights: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + all_reduce = self._all_reduce(mm_1) + rmsnorm = self.rmsnorm_matcher(all_reduce, rms_norm_weights, residual) + return rmsnorm[0], rmsnorm[1] + + def replacement( + residual: torch.Tensor, + mm_1: torch.Tensor, + rms_norm_weights: torch.Tensor, + ) 
-> tuple[torch.Tensor, torch.Tensor]: + # pattern matcher replaces from top-to-bottom, + # so residual is still the full size here; the slice below becomes + # a noop once the seqpar pattern with the previous rmsnorm is replaced + reduce_scatter = self._reduce_scatter(mm_1) + residual = residual[0 : reduce_scatter.size(0), ...] + rmsnorm = self.rmsnorm_matcher(reduce_scatter, rms_norm_weights, residual) + all_gather = self._all_gather(rmsnorm[0]) + # shape of residual changes but that's fine, + # next node is already slicing it, now becomes a noop + return all_gather, rmsnorm[1] + + pm.register_replacement( + pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass + ) + pm.register_replacement( + get_first_out_wrapper(pattern), + get_first_out_wrapper(replacement), + self.get_inputs(), + pm.fwd_only, + pm_pass, + ) + + +FP8_DTYPE = current_platform.fp8_dtype() + + +class FirstAllReduceRMSNormStaticFP8Pattern(_SequenceParallelPatternHelper): + def __init__( + self, + epsilon: float, + dtype: torch.dtype, + device: str, + ): + super().__init__(epsilon, dtype, device) + self.rmsnorm_matcher = MatcherRMSNorm(epsilon) + self.quant_matcher = MatcherQuantFP8(kFp8StaticTensorSym) + + def get_inputs(self): + input = torch.zeros([1, 8, 4], device=self.device, dtype=self.dtype) + weight = torch.empty([4], device=self.device, dtype=self.dtype) + scale = torch.tensor(1.0, device=self.device, dtype=torch.float32) + return [input, weight, scale] + + def register(self, pm_pass: PatternMatcherPass): + def pattern( + input: torch.Tensor, + weight: torch.Tensor, + scale: torch.Tensor, + ): + all_reduce = self._all_reduce(input) + rms = self.rmsnorm_matcher(all_reduce, weight) + quant, _ = self.quant_matcher(rms, scale) + return quant, all_reduce + + def replacement( + input: torch.Tensor, + weight: torch.Tensor, + scale: torch.Tensor, + ): + reduce_scatter = self._reduce_scatter(input) + rms = self.rmsnorm_matcher(reduce_scatter, weight) + quant, _ = self.quant_matcher(rms, scale) + all_gather =
self._all_gather(quant) + + return all_gather, reduce_scatter + + pm.register_replacement( + pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass + ) + + +class MiddleAllReduceRMSNormStaticFP8Pattern(_SequenceParallelPatternHelper): + def __init__(self, epsilon: float, dtype: torch.dtype, device: str): + super().__init__(epsilon, dtype, device) + self.rmsnorm_matcher = MatcherFusedAddRMSNorm(epsilon) + self.quant_matcher = MatcherQuantFP8(kFp8StaticTensorSym) + + def get_inputs(self): + mm_1 = torch.empty([4, 4], device=self.device, dtype=self.dtype) + residual = torch.empty([4, 4], device=self.device, dtype=self.dtype) + rms_norm_weights = torch.empty([4, 4], device=self.device, dtype=self.dtype) + scale = torch.empty([1, 1], device=self.device, dtype=torch.float32) + + return [residual, mm_1, rms_norm_weights, scale] + + def register(self, pm_pass: PatternMatcherPass): + def pattern( + residual: torch.Tensor, + mm_1: torch.Tensor, + rms_norm_weights: torch.Tensor, + scale: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + all_reduce = self._all_reduce(mm_1) + rms, residual_out = self.rmsnorm_matcher( + all_reduce, rms_norm_weights, residual + ) + quant, _ = self.quant_matcher(rms, scale) + return quant, residual_out + + def replacement( + residual: torch.Tensor, + mm_1: torch.Tensor, + rms_norm_weights: torch.Tensor, + scale: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + # pattern matcher replaces from top-to-bottom, + # so residual is still the full size here. + # add a temporary slice which will become a noop + # once the seqpar pattern with the previous rmsnorm is replaced + reduce_scatter = self._reduce_scatter(mm_1) + residual = residual[0 : reduce_scatter.size(0), ...] 
+ rms, residual_out = self.rmsnorm_matcher( + reduce_scatter, rms_norm_weights, residual + ) + quant, _ = self.quant_matcher(rms, scale) + all_gather = self._all_gather(quant) + # shape of residual changes but that's fine, + # next node is already slicing it, now becomes a noop + return all_gather, residual_out + + pm.register_replacement( + pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass + ) + + pm.register_replacement( + get_first_out_wrapper(pattern), + get_first_out_wrapper(replacement), + self.get_inputs(), + pm.fwd_only, + pm_pass, + ) + + +class SequenceParallelismPass(VllmPatternMatcherPass): + """ + This pass enables sequence parallelism for models. + It identifies patterns where an AllReduce operation is followed by + an RMSNorm (or RMSNorm and then Quantization) operation. + These patterns are replaced with a ReduceScatter operation, followed by + a local RMSNorm/Quantization, and then an AllGather operation. + + The general transformation is: + Input -> AllReduce -> RMSNorm -> Output + becomes + Input -> ReduceScatter -> RMSNorm -> AllGather -> Output + + While this pass itself does not directly yield performance improvements, + it lays the groundwork for subsequent fusion passes, such as + GEMM + ReduceScatter and AllGather + GEMM fusions. These fusions can + significantly reduce communication overhead and improve overall model + performance. + + + This pass splits up the residual tensor across TP ranks and hence divides its size. + Because the pattern matcher starts at the end of the graph, the replacement + contains a slice that temporarily conforms the input residual to the correct size. + After all patterns have been matched, we use a NoOpEliminationPass to clean up + what have now become no-op slices. + + Note that an older version of the pass did not need this as it operated only on + custom rms_norm and fused_rms_norm_add custom ops which did not complain about + mismatched shapes during replacement. 
So this approach has the same assumption that + correctness is only maintained if all rms_norm operations are split across ranks. + + Correctness-wise, this approach is strictly better than before - before, + the graph was incorrect semantically and shape-wise during the pass. + With this approach there's only semantic incorrectness during the pass. + Both approaches restore a correct graph once all patterns are matched. + """ + + @enable_fake_mode + def __init__(self, config: VllmConfig): + super().__init__(config) + + # Used to cleanup redundant views created temporarily + # to circumvent residual shape change issues + self.noop_cleanup = NoOpEliminationPass(config) + self.noop_cleanup.pass_name = f"{self.pass_name}.{self.noop_cleanup.pass_name}" + + self.patterns: PatternMatcherPass = PatternMatcherPass( + pass_name="sequence_parallelism_pass" + ) + + for epsilon in [1e-5, 1e-6]: + # RMSNorm + Static FP8 quantization patterns + FirstAllReduceRMSNormStaticFP8Pattern( + epsilon, self.model_dtype, self.device + ).register(self.patterns) + MiddleAllReduceRMSNormStaticFP8Pattern( + epsilon, self.model_dtype, self.device + ).register(self.patterns) + + # Normal RMSNorm patterns + FirstAllReduceRMSNormPattern( + epsilon, self.model_dtype, self.device + ).register(self.patterns) + + MiddleAllReduceRMSNormPattern( + epsilon, self.model_dtype, self.device + ).register(self.patterns) + + self.dump_patterns(config, self.patterns) + + def is_applicable(self, shape: int | None) -> bool: + # When sequence parallelism is enabled, the residual tensor from RMSNorm + # needs to be split along the sequence dimension. However, this dimension + # is symbolic during piecewise compilation, and splitting symbolic shapes + # is not supported. + # + # This pass is therefore only applied when the sequence dimension is + # concrete: + # 1. In full-graph compilation mode (no Dynamo splitting ops are used).
+ # For this case we always pad num_tokens to be a multiple of + # tensor_parallel_size, so there's no need to check shape % tp_size == 0. + # 2. For specific shape provided during compilation (e.g., from + # `compile_sizes`), which must be divisible by the tensor-parallel + # size. + if ( + not self.compilation_config.splitting_ops + or self.compilation_config.use_inductor_graph_partition + ): + return True + tp_size = get_tensor_model_parallel_world_size() + return shape is not None and shape % tp_size == 0 + + @VllmInductorPass.time_and_log + def __call__(self, graph: fx.Graph): + self.matched_count = self.patterns.apply(graph) + logger.debug("Replaced %s patterns", self.matched_count) + # Clean up reshape nodes + self.noop_cleanup(graph) diff --git a/compilation/torch25_custom_graph_pass.py b/compilation/torch25_custom_graph_pass.py new file mode 100644 index 0000000..1031856 --- /dev/null +++ b/compilation/torch25_custom_graph_pass.py @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod +from typing import Any + +import torch + + +class Torch25CustomGraphPass(ABC): # noqa (redefinition) + """ + This class replaces CustomGraphPass from torch==2.6 when using torch<2.6. + It conforms to the 2.6 interface but also supports pickling, as that's what + the inductor code cache uses to determine the cache key before 2.6. + (in 2.6 and above, uuid() is used.) + + Subclasses can just "pretend" that uuid is used. + """ + + @abstractmethod + def __call__(self, graph: torch.fx.graph.Graph) -> None: + """ + Implementation of the custom pass. + """ + + @abstractmethod + def uuid(self) -> Any | None: + """ + Return an ID to uniquely identify your custom pass implementation. + Return None to skip inductor code caching entirely. + """ + + def __getstate__(self): + """ + Pickling is used instead of uuid() in torch<2.6. 
Just return uuid() + to enable subclasses to only have to implement uuid. + """ + return self.uuid() + + def __setstate__(self, state): + raise ValueError( + "Cannot unpickle CustomGraphPass because pickling" + " is used for cache key uuid. Use torch>=2.6 with" + " native uuid support for custom passes." + ) diff --git a/compilation/vllm_inductor_pass.py b/compilation/vllm_inductor_pass.py new file mode 100644 index 0000000..08721e3 --- /dev/null +++ b/compilation/vllm_inductor_pass.py @@ -0,0 +1,173 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import functools +import operator +import time +from dataclasses import dataclass +from typing import ClassVar + +import regex as re +import torch +from torch._dynamo.utils import lazy_format_graph_code +from torch._inductor.pattern_matcher import PatternMatcherPass, PatternPrettyPrinter + +from vllm.config import VllmConfig +from vllm.logger import init_logger + +from .inductor_pass import InductorPass + +logger = init_logger(__name__) + + +@dataclass +class InductorCompilationConfig: + splitting_ops: list[str] | None = None + use_inductor_graph_partition: bool = False + + +class VllmInductorPass(InductorPass): + """ + An inductor pass with access to vLLM PassConfig. + It provides timing, logging, and dumping utilities. + """ + + dump_prefix: ClassVar[int | None] = None + """Keep track of pass index for debug dump ordering.""" + + def __init__(self, config: VllmConfig): + # Get only the necessary CompilationConfig for the inductor pass, since + # full `CompilationConfig` contains pointer to model which is unsafe. 
+ self.compilation_config = InductorCompilationConfig( + splitting_ops=config.compilation_config.splitting_ops, + use_inductor_graph_partition=config.compilation_config.use_inductor_graph_partition, + ) + self.pass_config = config.compilation_config.pass_config + self.model_dtype = config.model_config.dtype if config.model_config else None + self.device = config.device_config.device if config.device_config else None + self.pass_name = self.__class__.__name__ + + @staticmethod + def time_and_log(call_fn): + @functools.wraps(call_fn) + def wrapped(self: VllmInductorPass, graph: torch.fx.Graph): + self.begin() + self.dump_graph(graph, "before") + call_fn(self, graph) + self.dump_graph(graph, "after") + self.end_and_log() + + return wrapped + + def dump_graph(self, graph: torch.fx.Graph, stage: str): + i = VllmInductorPass.dump_prefix + i_str = "" if i is None else f".{i}" + lazy_format_graph_code( + f"post_grad{i_str}.{self.pass_name}.{stage}", graph.owning_module + ) + + def begin(self): + self._start_time = time.perf_counter_ns() + + def end_and_log(self): + self._end_time = time.perf_counter_ns() + duration_ms = float(self._end_time - self._start_time) / 1.0e6 + logger.debug("%s completed in %.1f ms", self.pass_name, duration_ms) + + +class VllmPatternMatcherPass(VllmInductorPass): + """ + A VllmInductorPass that uses the Inductor pattern matcher. + Its main use is providing the dump_patterns utility that dumps the + Inductor pattern matcher patterns into a file, which greatly aids debugging. + + TODO(luka) move more utilities to this pass. 
+ """ + + matched_count: int = 0 + """The number of matched patterns in the pass.""" + + _OP_OVERLOAD_PATTERN: ClassVar[re.Pattern] = re.compile( + r"<OpOverload\(op='([^']*)', overload='([^']*)'\)>" + ) + + def _replace_op_overloads(self, string: str) -> str: + """Replace <OpOverload(..., ...)> with nicer formulations""" + return self._OP_OVERLOAD_PATTERN.sub( + lambda m: f"torch.ops.{m.group(1)}.{m.group(2)}", + string, + ) + + def dump_patterns(self, config: VllmConfig, pm_pass: PatternMatcherPass): + """ + If debug dumping is enabled, dump the Inductor pattern-matcher patterns + into the debug_dump_path folder next to the dumped fx graphs. + + This method does its best to print something that looks like Python code + for easier debugging and potentially navigation. If any errors appear in + the output, please add to this method. + + TODO(luka): use pattern object to manually produce pattern graph + """ + debug_dump_path = config.compile_debug_dump_path() + if not debug_dump_path: + return + + debug_dump_path.mkdir(parents=True, exist_ok=True) + + from vllm.utils.system_utils import unique_filepath + + file_path = unique_filepath( + lambda i: debug_dump_path / f"patterns.{self.pass_name}.{i}.py" + ) + + with file_path.open("w") as f: + print( + f"# This file was produced by VllmPatternMatcherPass."
+ f"dump_patterns for {self.pass_name}.\n" + f"# It does its best to produce valid-Python-looking code but" + f" please add to dump_patterns if there are any errors.\n\n" + f"from torch._higher_order_ops.auto_functionalize import " + f"auto_functionalized as auto_functionalized\n" + f"from torch._inductor.pattern_matcher import *\n" + f"vllm = torch.ops.vllm", + file=f, + ) + + for node, patterns in pm_pass.patterns.items(): + # fix the operator.getitem repr + if node[1] == operator.getitem: + node_repr = f"({repr(node[0])}, operator.getitem)" + else: + node_repr = repr(node) + + node_repr = self._replace_op_overloads(node_repr) + + print(f"\n\n# Patterns for op: {node_repr}", file=f) + for i, pattern in enumerate(patterns): + # reserve auto_functionalized ahead of time + pp = PatternPrettyPrinter() + pp.namespace.create_name("auto_functionalized", None) + + # Assemble pattern + out_node = pp.pretty_print(pattern.pattern) + pattern_repr = "\n".join( + [f"def pattern_{i}():"] + + [ + f"{pp.memoized_objs_names[key]} = " + f"{pp.memoized_objs_pp[key]}" + for key in pp.memoized_objs_names + ] + + [f"return {out_node}"] + ).replace("\n", "\n ") + + pattern_repr = self._replace_op_overloads(pattern_repr) + print(f"{pattern_repr}\n", file=f) + + +class PrinterInductorPass(VllmInductorPass): + def __init__(self, name: str, config: VllmConfig): + super().__init__(config) + self.name = name + + def __call__(self, graph: torch.fx.Graph): + self.dump_graph(graph, self.name) diff --git a/compilation/wrapper.py b/compilation/wrapper.py new file mode 100644 index 0000000..493e57f --- /dev/null +++ b/compilation/wrapper.py @@ -0,0 +1,238 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os +import sys +from abc import abstractmethod +from contextlib import contextmanager +from types import CodeType + +import torch +import torch._C._dynamo.guards + +import vllm.envs as envs +from vllm.config import 
CompilationMode, CUDAGraphMode, get_current_vllm_config +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +def _noop_add_global_state_guard(self, *args, **kwargs): + """No-op to skip the GLOBAL_STATE guard entirely""" + pass + + +def _noop_add_torch_function_mode_stack_guard(self, *args, **kwargs): + """No-op to skip the TORCH_FUNCTION_MODE_STACK guard entirely""" + pass + + +@contextmanager +def _compilation_context(): + """Context manager for compilation settings and patches. + + This manager: + 1. Sets higher dynamo cache limits for compilation. (Needed for + qwen2_5_vl see test_qwen2_5_vl_evs_functionality). + Generally a recompilation can happen whenever we use a new + backend instance in torch.compile. + 2. Patches out add_global_state_guard to skip GLOBAL_STATE guards + 3. Patches out add_torch_function_mode_stack_guard to skip + TORCH_FUNCTION_MODE_STACK guards. + 4. Restores everything when compilation completes + """ + # Save original values + original_global_state_guard = ( + torch._C._dynamo.guards.GuardManager.add_global_state_guard + ) + original_torch_function_mode_stack_guard = ( + torch._C._dynamo.guards.GuardManager.add_torch_function_mode_stack_guard + ) + original_cache_size = torch._dynamo.config.cache_size_limit + original_accumulated_cache = torch._dynamo.config.accumulated_cache_size_limit + + try: + # Set higher cache limits for compilation + torch._dynamo.config.cache_size_limit = 2048 + torch._dynamo.config.accumulated_cache_size_limit = 8192 + + # Patch guard manager + torch._C._dynamo.guards.GuardManager.add_global_state_guard = ( + _noop_add_global_state_guard + ) + torch._C._dynamo.guards.GuardManager.add_torch_function_mode_stack_guard = ( + _noop_add_torch_function_mode_stack_guard + ) + yield + finally: + # Restore original values + torch._C._dynamo.guards.GuardManager.add_global_state_guard = ( + original_global_state_guard + ) + torch._C._dynamo.guards.GuardManager.add_torch_function_mode_stack_guard = ( 
+ original_torch_function_mode_stack_guard + ) + torch._dynamo.config.cache_size_limit = original_cache_size + torch._dynamo.config.accumulated_cache_size_limit = original_accumulated_cache + + +class TorchCompileWithNoGuardsWrapper: + """ + A wrapper class for torch.compile, it ensures that all guards are dropped + when CompilationMode is not CompilationMode.STOCK_TORCH_COMPILE. + When guards are dropped, the first time __call__ is invoked, a single + compilation is triggered. Dynamo should never be traced again after that + since we drop all guards. + """ + + def __init__(self): + self.compiled = False + + vllm_config = get_current_vllm_config() + self.vllm_config = vllm_config + mode = vllm_config.compilation_config.mode + if mode is None: + raise RuntimeError("Compilation mode cannot be NO_COMPILATION") + + backend = vllm_config.compilation_config.init_backend(vllm_config) + options = {} + + if isinstance(backend, str) and backend == "inductor": + options = vllm_config.compilation_config.inductor_compile_config + + if mode != CompilationMode.STOCK_TORCH_COMPILE: + # Drop all the guards. + options["guard_filter_fn"] = lambda x: [False for _ in x] + + if envs.VLLM_USE_AOT_COMPILE: + if hasattr(torch._dynamo.config, "enable_aot_compile"): + torch._dynamo.config.enable_aot_compile = True + else: + msg = "torch._dynamo.config.enable_aot_compile is not " + msg += "available. AOT compile is disabled and please " + msg += "upgrade PyTorch version to use AOT compile." 
+ logger.warning(msg) + + self._compiled_callable = torch.compile( + self.forward, + fullgraph=True, + dynamic=False, + backend=backend, + options=options, + ) + + if envs.VLLM_USE_BYTECODE_HOOK and mode != CompilationMode.STOCK_TORCH_COMPILE: + torch._dynamo.convert_frame.register_bytecode_hook(self.bytecode_hook) + self._compiled_bytecode = None + + def aot_compile(self, *args, **kwargs): + if not hasattr(self._compiled_callable, "aot_compile"): + raise RuntimeError( + "aot_compile is not supported by the current configuration. " + + "Please make sure torch.compile is enabled with the latest " + + f"version of PyTorch (current using torch: {torch.__version__})" + ) + return self._compiled_callable.aot_compile((args, kwargs)) + + def __call__(self, *args, **kwargs): + if envs.VLLM_USE_BYTECODE_HOOK: + if ( + self.vllm_config.compilation_config.mode + == CompilationMode.STOCK_TORCH_COMPILE + ): + return self._compiled_callable(*args, **kwargs) + + if not self._compiled_bytecode: + # Make sure a compilation is triggered by clearing dynamo + # cache. + torch._dynamo.eval_frame.remove_from_cache(self.original_code_object()) + return self._compiled_callable(*args, **kwargs) + else: + with self._dispatch_to_compiled_code(): + return self.forward(*args, **kwargs) + else: + with _compilation_context(): + return self._compiled_callable(*args, **kwargs) + + @abstractmethod + def forward(self, *args, **kwargs): ... 
+ + def original_code_object(self) -> CodeType: + """Return the original code object of the forward method.""" + return self.__class__.forward.__code__ + + def bytecode_hook(self, old_code: CodeType, new_code: CodeType): + """Hook to save the compiled bytecode for direct execution.""" + if old_code is not self.original_code_object(): + return + # code borrowed from https://github.com/thuml/depyf/blob/f4ad79fadee27ea113b4c75202db1eb1a11c0dbc/depyf/explain/enable_debugging.py#L25 + frame = sys._getframe() + while frame and frame.f_back: + frame = frame.f_back + code_name = frame.f_code.co_name + file_name = frame.f_code.co_filename.split(os.path.sep)[-1] + if code_name == "_compile" and file_name == "convert_frame.py": + break + frame = frame.f_locals["frame"] + assert frame.f_code == old_code + + if frame.f_locals["self"] is not self: + return + + self._compiled_bytecode = new_code + + path = self.vllm_config.compile_debug_dump_path() + if path: + decompiled_file = path / "transformed_code.py" + if not decompiled_file.exists(): + try: + # usually the decompilation will succeed for most models, + # as we guarantee a full-graph compilation in Dynamo. + # but there's no 100% guarantee, since decompliation is + # not a reversible process. + import depyf + + src = depyf.decompile(new_code) + + with open(decompiled_file, "w") as f: + f.write(src) + + logger.debug("Dynamo transformed code saved to %s", decompiled_file) + except Exception: + pass + + if ( + self.vllm_config.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and "update" in new_code.co_names + ): + import depyf + + src = depyf.decompile(new_code) + msg = ( + "Assigning / modifying buffers of nn.Module during forward pass is not " + "allowed when using cudagraph inside the compiler because it will " + "cause silent errors. Please use eager mode or fix the code. 
The " + "following code contains clues about which buffer is being modified " + f"(please search for the usage of the function `update`):\n{src}" + ) + raise RuntimeError(msg) + + @contextmanager + def _dispatch_to_compiled_code(self): + # noqa: E501 + """ + Context manager to dispatch to internally compiled code for torch<2.8. + Why does this work? Because Dynamo guarantees that the compiled + bytecode has exactly the same arguments, cell variables, and free + variables as the original code. Therefore we can directly switch + the code object in the function and call it. + + See https://dev-discuss.pytorch.org/t/what-is-the-relationship-requirement-among-original-bytecode-transformed-bytecode-and-bytecode-returned-by-hooks-in-dynamo/1693/7 for more details. + """ # noqa: E501 line too long + original = self.original_code_object() + assert self._compiled_bytecode is not None + self.__class__.forward.__code__ = self._compiled_bytecode + try: + yield + finally: + self.__class__.forward.__code__ = original diff --git a/config/__init__.py b/config/__init__.py new file mode 100644 index 0000000..dd76a72 --- /dev/null +++ b/config/__init__.py @@ -0,0 +1,102 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm.config.cache import CacheConfig +from vllm.config.compilation import ( + CompilationConfig, + CompilationMode, + CUDAGraphMode, + PassConfig, +) +from vllm.config.device import DeviceConfig +from vllm.config.ec_transfer import ECTransferConfig +from vllm.config.kv_events import KVEventsConfig +from vllm.config.kv_transfer import KVTransferConfig +from vllm.config.load import LoadConfig +from vllm.config.lora import LoRAConfig +from vllm.config.model import ( + ModelConfig, + iter_architecture_defaults, + try_match_architecture_defaults, +) +from vllm.config.multimodal import MultiModalConfig +from vllm.config.observability import ObservabilityConfig +from vllm.config.parallel import EPLBConfig, 
ParallelConfig +from vllm.config.pooler import PoolerConfig +from vllm.config.scheduler import SchedulerConfig +from vllm.config.speculative import SpeculativeConfig +from vllm.config.speech_to_text import SpeechToTextConfig +from vllm.config.structured_outputs import StructuredOutputsConfig +from vllm.config.utils import ( + ConfigType, + SupportsMetricsInfo, + config, + get_attr_docs, + is_init_field, + update_config, +) +from vllm.config.vllm import ( + VllmConfig, + get_cached_compilation_config, + get_current_vllm_config, + get_layers_from_vllm_config, + set_current_vllm_config, +) + +# __all__ should only contain classes and functions. +# Types and globals should be imported from their respective modules. +__all__ = [ + # From vllm.config.cache + "CacheConfig", + # From vllm.config.compilation + "CompilationConfig", + "CompilationMode", + "CUDAGraphMode", + "PassConfig", + # From vllm.config.device + "DeviceConfig", + # From vllm.config.ec_transfer + "ECTransferConfig", + # From vllm.config.kv_events + "KVEventsConfig", + # From vllm.config.kv_transfer + "KVTransferConfig", + # From vllm.config.load + "LoadConfig", + # From vllm.config.lora + "LoRAConfig", + # From vllm.config.model + "ModelConfig", + "iter_architecture_defaults", + "try_match_architecture_defaults", + # From vllm.config.multimodal + "MultiModalConfig", + # From vllm.config.observability + "ObservabilityConfig", + # From vllm.config.parallel + "EPLBConfig", + "ParallelConfig", + # From vllm.config.pooler + "PoolerConfig", + # From vllm.config.scheduler + "SchedulerConfig", + # From vllm.config.speculative + "SpeculativeConfig", + # From vllm.config.speech_to_text + "SpeechToTextConfig", + # From vllm.config.structured_outputs + "StructuredOutputsConfig", + # From vllm.config.utils + "ConfigType", + "SupportsMetricsInfo", + "config", + "get_attr_docs", + "is_init_field", + "update_config", + # From vllm.config.vllm + "VllmConfig", + "get_cached_compilation_config", + "get_current_vllm_config", 
+ "set_current_vllm_config", + "get_layers_from_vllm_config", +] diff --git a/config/__pycache__/__init__.cpython-312.pyc b/config/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a8c1df33346b835cade0a6a72c78ee96b48858fa GIT binary patch literal 1981 zcmaKtOK;mo5P+A|(|TEwWy^2Jc2ZY<$xfW4X@UZYViZN<#;9E%_CnAkWf9pGiRDs= zjy?AW^xj+lLjOYl!2&`LJ@wYsFbwq6os}$;PEZ{_%+71MvoovTip3m~Ka%{>{WXu! zAF#Oj=_BECTSMq4WFuQ6h-z5VK-P&)6F5PWI7w4DMbkK~^a+xoS)5gJl1$JX&M7%X z^0a^pN=}m^E#Z=qGh~vMaaqY(QlV8`Rq_OxqBUHj(|DTB;2EXMkvg5lvr5jBIXaK$ zm0TbTbP+GoCA_58MY2p+@QRX401v@E+aA`}84xNFU)x^Z*}d=mOcbpEdlrKi!!5 z6H3?6M80KO9jE1W+wMgpDJm_G`YtiK>ve}pF&R-EdA1{pt?ysI{F<45M=f*5W)O@L zXrl1i>ABWTLQRwpTc^zI25pCpeM)c64tq|Q-$s?+oPFe#d+V9@I7Y6%dpTMqL>_h~ z!=;#axx);TSsf`@JY^o6IES<||}S;n?#?VGRW^P`3gdRC*kUG{kHQd1Hn7{)K?QtHsZ&zqe3P2cr*xXI*T?R*HIKKtVRQ>RJX z^QPbD9k1Jjl+9r;HjUwY8@qmAWM9bZkgv~I?1juF{~%FtA@Ns2)3mEB(l)R4l%`)I ziK_zAwg>27fc6wO25942Pip$#Na8AoG--PFJ9<7qTLZK|K#%2efL>ke8BM=NfO1+o RB8h)$PxAT?8TO^z>puwQPXGV_ literal 0 HcmV?d00001 diff --git a/config/__pycache__/cache.cpython-312.pyc b/config/__pycache__/cache.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9fde9a49e0ac988e8f86c1e553ac03c8e89c38f7 GIT binary patch literal 6069 zcma(#TWl2Bl~vVM-R*Y2{I>C{3^u00Zi4~C389tRJ@8>~^J{^;@hCyHn}pIU>>!>=UuN>~5u-=W+s(lx~3|Nz!wl_hnRiNziER zgMrcqy(sJLgIShvk)#&3&50jv~!L_$( zubu|BuL;}R#9IK~eT~!ZYogj$TN_pS$pA>y-xv?F@j!#ikRcfH2ZIeR!%bXVVG;(b$DEphJMjF@yB+}5_{swlGj5V+aYS>Y7Fe4aaABf7haqt69IRqL_ z88Ku0!J)dm3UQYMTH-~VWv!q(1x+WuYqhdHRl{8=kT<WwYK9Wx_2i1Sirkq@j+(zsC^Ze@#T|$_W-hBPn>i!D>~)cxHg6HNKuN}YUq!w! zONVjm-WQSTg61r!nw8CaaSSr~3_7@>X6L;QU9ZTlwSNr zP55J!%-4+9g90yT6m^o(9M{)`WqDmN;cwYLCSj#;A*oXDJ#&ch}3XSKb@Bgu? 
z9&MXe?yYiL>gYvQ71T$5>&EKiHNL)PF;U0uwRLYZ_ymqv>v*Iimo3FvQ#T9q$3T| z8#I3#oDC#Hm+?0%=FIOEiM1lb1UM>)3!++9jTa_&r?H+%=WoX59hVk$FeM`AGcRx6 zSj2>oGc<3*NkBB%$eAMcJTcJUL1@$n0X^k3OI|R+<8rbC2Ov(9cHT!_0J*H?RzQEy znI{nH$g5duIRkV8rA-!c@U5Ou8C{Drxfl;;Gk8FP$P&z z?Va6A3F+pUJFtkg6A1M9*%e-l>lwf-`f;^*L2!m4VRx~r<51of}&&?~?o6`{5x&vipCjX+sn`cmD=M(@?c#u1O>KBtgpL{%hxjcP&XZp%c z;%a&N%HLhvp1s$#8Ti4?t*-AYpCn4tS3XNzErqZCcO`Mv!8rLh-KT?|#7r3MQnbOa z^ya9Gs$vTLCUiiduZ+}2BkIvEx3PM$p)gol`;3O_Fn(ifo1<94uKw&@2Yn4{Xc?=1 z8S^?itOJpV$V<0tHHQ3_D^p?=b%-74X;*H%EBna_>XE!#fFuS%3Y>C2pLc3;sz4~X znyuw@l9t~lT5fs4w8#VoKU-S?#^8*yrh_MH`igAOd_i6|!5tugXmSCD5MeQ)Gx?Z* zrQvEL5ILypAf*@dA0r;*+e{v+`hadZ%#j^bhAjKn+Esttkmg*yI@0V=uOxX0Oq?SC z*hNQs9=DE^TSvCXc3Ka89&P{l`bXERLZEH`9|BS=UX60e{!(}!{RYr`i9+r1qxw$$ z!qwvSv#b3RbiT@0_O5dku5(@3^Mbm^tOvj-UXeW4#Wi6~WUHh#sfkZ32M?GZiH5Ki zcm+1_6LCGbCK=*d0M{;Ef=C$OOTUJ+7F-2+>mK+QJkSQ4XnWj7f?jN-Wmr)AjL=%J zo_cw11^%VNbE|>$bY?Xv!%I@eYp>;))ft9y^Dg9z**#0c6l+1H3z@@cl)+e_%B;pq zXEW^k&-1I@@8t8ch4EHynwPUbyo=wj&##W$$UBZX4@uE|CFPit3egFWo#I(&9Dvse zb*_`AP&4`uTfYK}z+1XVE>oJPR6(-PvPN@Yv%mnu4gD@6lY4b^t(r@+<2AjRy|!At z=%`@%Y7SFP63fL}5mC0dPe(Sa?!GubmFtt6Q8#a zlm<^e3X})WRNBv$qGxwwgWG}9!I`o=^I2>L{cY}}xl;eRPXqrNy#y@cr znoS_sM^qI8X%QY3suxw&?-ezxHWPw`kk?g}j=}jTW-G7dB0idKVilcGRV|kTbHith z15~JBnnEgx{gmLF0+t9~glY5AI}j&n8WAFdJc~|YHvmzP9)%t~hF}_TT{qbK23LVm zT!pIQ4S2`U<4AAb|9lIG0EuqZ`*Y<6A4`WU3+_tqPLV_AJVc9e#M=(a5L#&gjLGxNqxnMZ8!Q zg3<;4S)A(|+Scy9vD2G|g;GUK?}{_KL!%qlH}_SBwz)M&8 zr-#!ev4894_KCX+tU=^8{^`ir#@y!Y*61hKD&km;Ff~~c`?gMOkKR2~5hr)WH+N^w zmBfM4(D90R4uZ7wHvcTl4eq};@{r#dm<9$Q-E>t5!M;gu{)&sGINdJCD19l`Ug zR>Z@IN&qGWZ2PueXNM7h!LtarfBc^NXl!TX!rrMa?251ddgS0M@lQunDA_v|G3ATj z&iRKwzTJ5j7*C)wXW&qWCQ4%Wmb5M2jaI}XsDo=fINfVs0|`p~>4%-={*yIfQ`K<0 z)VrlW=b-uhp#Vq^f(5%a-@DhdBOTk7l2sur^*@YMIcOfO{vMm>0WR2vV7e;8u-e%n zUFM(mjaMb?z*!^lKZLOBTNK(+YM5Sz#%pc-zDOKq*8Qw1#XcRHM48us$5Mwb_ zLv4|B9TxMw&X=ZrUVbKM-lLuld@nLlM$7}$Mo>WDB5)8C5iq}6#vaxRegF@j^#j*A z`saYdD%bgY0M!7`^UuQ^pLoIz{+3IZx%7W=XUg1}C*0xx;6|Tt2cB^0C*0H*f#h$* 
z<4?PKH|D-*>D>HwrDbR%_%zy4>P(lTQyVS2u}-v$_Gf``T-byjOI+7(_h4!0Sh;(;#C5p^$P$&Ni$Wcw5ShXfA8o2{eAtf%ggN?o-qIGA>m_=`}g#sJteAV z=l4wBLxQz}ItD#Klo?&)q57d4AL)7@rHY!Q7wMDOXD?lp5SaYEI%IiXtQ=B_BA zx$X}w%G-25OX)^R4^mF&rcwNQO2617dRXcjq^|v;S&dh-{80U!9@Nv-Azl5R=?*NJ zd<}n2xh8!kxpaJfF&Y;ocx~gsa5#7^Ec(oHNiZgaX5zASHYA1xMDmw{@f$u~E}we) z(wV^c%V)-4JvaH1Y#EC#A!9fgi(Lsya_RXH$_j>2f@5lFK^zl;3kZ;1teC*fU^pZM zsn+sf93VeMr7>))bBBE>(`YZ$%7bsaM7C?P7 zHv%H12+T`?sCbKwhy6-892sX-&ZvXTe$>h4cDnF-j}tjs=+kDwG;I;g(|j2g`m_}; zKV1UXI$gR{;wzP1(E zGoX(O-bw=jVSWZ4e)8hv8QFb#>f-pTfvJnHkG~ukzj)!&x$|e_s)@HJ$1Yq9Ouarf zekO2na{P?!ymJ2h1uelR+zx8wesO9r9$6S%2rUf6gVJ>|KJeb+(PJa;9u)_}p=*N+ zOYs}?(ZQP--a%zb@uLrpe_=^#L?O)a!V?>W^KI@6v&mYqUBWrbS0=XMZ+YCS*F<+Qbr(pEXaH;@65_o8T5o zzRgdU;kOjOcEKiKVnxR+FO=ck$^2%)4u3h)IPhDchB@JPss3{KD^-65{8g&o1%I{b zuY}(Xzh~AAIKofYELHnFa{2fh6Jsw)!G#-a>HI6K%2+%YjmMN_LJMh82wtZIY$bR@ zQSZ1hq{<@pvxRa&3`TF=2!+LdT07p@;BBL{s7#F<~AW0}9DlMS(YcxuW5<;5x|Cz ziQ(B8wZJR&P>e$mXR!NdUx|r^yqWUnM4dp|o&9iDxa{De%-}?7aAIrl?EAOZr@lY^ z;q-mehwr3aeM#G;&nDoo1^nM*6xdV@pqbJEIJy0_M?&w3g3`_U{B9aCFHp`y-aTz_UgF4=Ua_EAE%t`!2;0e zQ!NO;+Hh#U%Kle|hF6vX+wYc0@IAIDc8yFmVgfp7Q8TdcokL0)ZKsZaOU9I z)WNe`2VZ{w_D7eMweY<_+BJ~0z5ba1XZLu}6tnS|fah7`@$?0uEPZV;uno(4%oCX3 z)mt|$@!7?2*ywUB(VpMkB1tBxzXZ|QjAvY3DOXq8)${3vO#i7A{#>V$wo^~(=Dz3X z=4DRlgh?XUx~t{&)U2Gvt6|7kqHA~ai=}&w`cP%`VJPJqO1p-Ww&C3k=aG!f55bOm zTwKRc#lcXV(z8dS)3wf6WP2zUV(5KFWD8KX#updDVqyQ-T1~X(ccl<_O`(pM8_o*% z$knjk`TgDxdp{P_uKuL0e|IPTn=P^-w17bzX^eb@n59xhlOTKpjEE`hR$=^`)UBM= z)s}L#rCseGzmwU2G=)Fc(WLF@?*8eMBp^R+qBwq)rx>otiPfh|OV8!zEdpj{Ere+_ zutNJwq9rXIC#UEjxEXg8P$?w_R(5T)B1wpKj1ecoHrhw!nfl&TeJ|~^@l^l#pT3hh z^JePIo5{MjlCHOswzqZ<3awPOa*60>kOjYmkJc5CKg3Avnn5rRvA7fnBx>^4qZ<4U z%8NPSDDZ#j&Qj7=&zeH`w+|6)b@*zfR`R!z<0Yq^ocw{4D+x{oCeDmsoFE?L{M&Lh z3mKc72pIT`ExE(UF7wys=feupV{n2&ywgEY zmnM>}USc4KC$U&<+jhp2WGkyy&Sf3Wm5FS9^SZFMoa8;(#@3CpWJ7O~uifEI*4Ios zCFE|G@Yd$_z&3}U2c8}J*>;+&M;`Dy9DXR#Wh63LU)Y$UkQc~*_C#dN&_^&@2qTG3eAcGlt0|c_d$%3trb@92Vb!8bET1(KDGyc&ER;qJfPJd`HaG1Ns#ON6%;%QvBv4S;pv3>H{4RpV 
z2QSVzsF}rBd_LmUT56_k7{8KH7q(mzaTfMFEJ2=Vsu6f5PK93i{Vf#BD=KQv8CutAif|sssXd1 z01`5{noq`7FbdSGV<680?-VK zeVZ+2!F1aU%nN_vbKA0PS!qKYr7n21B{ZjPjhy&plSVO83E!-UW8o^#vSrzvKn^Yb zGByi1N=xG3vl*f#^g3f|G0m7QbeDuUk;Q0eMp=Q>dE(fwEyYEJqo0Lb3_S9zDCtG$ zgNcPg^QLqD*F!hOsF#h6mrC;MwN1`PQSNL`oL(B1b96oq?tfunULrpJ+LF3Juxzl} zIy;-#-(FywAQoDD991JezKOIn#JP)p-81>2%{;tFJ!x4JUmsz8_mVzTr-}JJYWHN!u%b0s{RN z1&yLq{m~cT-$4*W;Uw>>HQQU7QSD7<(&l z`P}rGz=hKi(T)%x?jvEHDv4x^7Jh($nK%}o3N)lyB$X>zLQM`ElROky3Lz^oOwv_) ztHIm{1o37FfMF)M5WE%&hvG|eNqjy)3&P9_Vi|LJa4CSiiJ_dye$KV_J>-iKeqZ5! zZoBfi?a*Uachc3CadoF$-CHhS)?KrDF6;7VTtwezokK}ybH+K8at@`P!x`uCl=FDn zdGevOV=-6Sw>fi}U738Mo3w}E1%p1P;K`-_i;|T(bXnG6h5Wxv7DG!oQlbwR;c*iOn4FH8f8KJP>C^;%czO~=@=v$j$H&Lzql9@ z60V%-rhvbPq%lJIE8G(&SJ|9#?N7P(e>{7C>i5EF*P*2C(C)dRd0}(&HO$yv^lfaW zIG-lf7@XRi4)IZ<`UpB!FpqJt<@W5X)lR3n|i#;DYD{$q)%e6{H7A zFi{}_(A9;@lfdVh80*5yGBQOT^{K7Z+q*5Taoqs^|WxV#!@B|48# zIPqIv$l71OI3DqXR>dAwCg4^G^dvU6KG0`0c1+s0LTAUWD}b>&4*?asXK;9R;>XUuZ3j0zWlB1F z85H?M;Y=s@;7Vx}B@z+Iu0S9d#ayt39Sa1MQDj5?7A1O{95$$oNV3j`=Yw%&fP)kz zP$5nzbmF0zhzQ-7AoC0|63UaJ_Ld3DW`=3BD-scg?cl5g9<+d!5-m~C!fxwtl! 
z=G(LU{%m)Dl5gEWuKp}Pk{upN@*Nv*+@JcvSJV8+wz<-JaC;v|*|jvMwrx)9tET%i z+Z_Eqn0Ywy59fB6f4iA0sopl*to8>}4?7+%KI}~$KJz7q_Z^EC^QEW6I{c)v)~X2C zSjT42C;ic2&CVLX7rQE$RdzC$X|?I02J2cT+VxP4fi0PKh)%GgD#HrRez{=#Huq_n z2J@yXz=+xj`yiFZpq79grCd&hVWnJRRZgC2u}XBRdCGMr*R6%>t*B7rT+<%02IW=i zX=}wAJyn$+T89#>qLcbdrRyoGl@zQU^*QZm5bIH@dym*g#5zT%!cHTdN3W+zY}U)J zQA2B|TNsmGr>APA*5tLLUX622x3T^-sPB!_UiRLk$G5ZhX62o=dtXkwJH&lxPm2*N zm_KdRXQWfK>aAlp)pchyRdxXnGi$Plt<6vOmU zmZxUVJco+p8DV)^_RKR}B+oIH$Dz$UYnPL?pBD}nNk1x_5ssi=$AzQ92&EHG&|X0= zjv@3UVn^|NN;nQt(F?)}xGxGP;f^6~4aVmbVo$677vLWkUPSDKFoqR(8dd?kFpkhO z!URIkE}4C2<*HqRpx`CYgGGc*2L0{#o{JM@5BgjZiJ+I_u~9|pG~k_@Ul<6BH^p$S zDxc{6xHKOgwG&%R&w%$US4qoy^_{P%Od5%h{JUD?r$q1MWfR0}i`N6fVq^g_o%oGs zmQavBzpxpYu85%MrDqnSr_3+lT5#rFF)BPWyP7UPvm(I@%rC^Ancqm7UqoyHd?*zD zAcdZzl)TjW1t|;`rDjulq^+J{OxrM3He=UV3 zt!HcG%^OSJTcRSX7ebJAkewluvnrBWed?K|lvac?K?8;X)TkG7du@MdJ6SRE3wmP< z;!J2ZG{cHTWYjw!4KESzMwWp3Qe_r_;O}Bkz^RhnH$@0}6tRuE3z%ks_(e92)bv}# zOGB=MvJ^4QfUcLIRw+hfD#&7TV678O!i)r}qC;WfsddE~Rxma|Y?|(aa*)}WDt7{Z z3T`#8%dvjFkGF1MfFXWnqrem-k$DP;B2z>W9H1-n3$QaIp*eL_>AZdW8MIV*S`JFQ`GuUgRpv76tPKYL++8zP7i}sO%h|sqp znZTG_Ls^s#N+GmS9Z|}m7X$`79)zGj79su5H8O-_(}XrD{no||y#4k0d05xQwZ7^G zbCa@_5e3XhNp=;WWEqk*9lhqUPb99i;QtP3Qz4>+t31U-l$yHNEc0UYW=w9)HS!FM zkHP*oaCz(sB*qtCzc4mcz?sUn6X9Uwnh-pdXdE|&RH20YT3iKM6eDrh3imOe@s4kL z#~E`AsWf_}su6-@47~IS43e)wa3sOjcrm&_bb=r^s3Qz)MTAET+O&YOO5{?NM3C(U zu9wU6F!ouwTFY;sY1x?z9_4Z^1)C(83aiso0N7-g9-+V%xkBIS49zmjLb=`u%*Q2i zoe`6Z%VKg(PK1G2rqFc_lmysDf?x*fv+%qWl$&#Vpcqq-zAhdj(Ut|KdZVBc-5q5zqg%C*YorU?g5N5P6+DHlnQJddfLARI6!NCv zXJ~+3;hMePH^*L|#1WQJ@0&M(xIr0!7}7OnBwf$Zh~Z5kVhu_~jK)YIrr3GF)B{TM zsG{s5RX!0WvkZB~ys(o%{H$Ivs2fESr`Q7!Y+F=h#|kZ@nw?NPq5r!X5}=$@)PJlp zMF$A_5D`#l0Lr1MASb9wrI7~ZlB@P(L}6UQ@fvEka&&}IDQe|<`xJtT8M!b{1{K8y z#mF^L5I}{X(TesFG!!VE8zBIGP&DX(m_x@Yia#U=S&Y*jo4|mlm%Oadal*GTAH=FKB%};u@=9#ba!dX)^5Bn-(B9absFz~ z?e1UOvUTh4nL1yp&bMXj%?asA)%9%Id^sT_sk)IZ+p(OGL#et$TejhxkiJx1--<}`!JMeWsk*~kwj+;RwIBT2onQNCW~2Li;Y{nnRO`WX>mk_2 
zW?W?2n|8gJw7s}n6-zLe(V#f92Mhwc2;(2lPlaG7@lok*Q;555HVkMD({61q{}8$9 zDQ2dlrJ3N~>EyVt0#n_#+~#wPb&4u!mo1vWbD1ae=$Pq!uV7j>FEd5L`&~S@Y?iEe zH!1dSafi|ZSb^%WU2m1HYSq%vSp?)XeQMPX_1(6{tF+i_VMh zVgW?D`AVe?z<4|3xs(pvSm!gIbEJktG$5$rvH-Y|{ zhDEmGz+NOKm%=6k)PSVy5$cnafP=FwOgP96Y_a{}D^cuqKBZW94mlanYU8OKIuxUj z@Gx1GsIM_X(+ccWjCpV-O%fN#X0!;CpvBc;0%M;b=L+|@t2eu^E8EeX^>${v`?Ag6 z$Cdr}g>8$ur{W8j-EChj-EQRC+HsEJQTzUn=U@@gekjv^EY*JO&)Q+d(Xu+R)`x|= z7SGht3cv43yZjIO(yo)){sYO%j*W@UO5e7X+jrs%95QRm?jKowY3)MFx$kFH&08)0 zbk$&{>UgT^c-C3D-u=L{ZQ-g<>~Q94`)bJ(Xe!-nM}GYqNn0aZ^@+? zv5cCP8U8qY|$FBO1T^aB36#iVtAJuxZwFe&5JSg4fOd~zn zp8g+|eNy%ZgPEZVsi6y*q1RJGucwEm(mih^yRU3pOf_$rb}W{<>TM31{wH?M)3I?Z zG(L!SjvlkxNmjTfO+*TL_i~zC$AW^sT z!huEtfR;(=+sKi4VTNr;vV|;=V|gPGmyoozTM>SBlbuhuCIIX1hm0^j>IIJZBb2jy zeD>+%vu6#7;inAKp1Izp(V|ljAY*|ltCIEpl*9k1x*0Y#?s^!Qtlh#6ukvDpyDPB4 znTbthjkc>*>J9lhj<6ORyvJC6dl~0_bI8Dn#qeN(5gwIU@T^opx=;{CuP|Yw+Syaw ze6N@tuAomPBp^lU$)dS*Gdk%RFce`|nz`a?!usve19!(aMs{%hsqpM+TtN$9wdTnf1K?=CZYzj6l#$25!S> zXO8xG!LwX~!~D)h-4yqBDMFaZ&TU(~Lrc6|s?Ee)rxs@XF53u?+%C)MSy{YT&&u+8 zrkfT$wP$|{?t`1_GHTO&bKq18AE^mxiO(u{=K2Z(@G6vReKK)5Ypy~Il|}_u@s?q| z^$V4I##%NBc=at4n};IMB~GxAJ#%#Gtl%MjQtCpl zh6^I_D%4{n>vL*tD4wnWeQgzun#2->p{P$!A-P;P7<$6jJ_{~Iy&CJH)C zD+UGsxmkrlP;6?bP9nG1wHE9QO+VARvM{YNP^Jie9C<{EvV~6IG@*+L@3^L-Rn(YE z-fl{vpl77&PFRNgN8}d8gf|dego0#tF{%(aq^c1DH;2w9tVD-+6WuvR_|UDe8+fR1 zK{A65Gc5+=cVv_Q&;Ki)L{*;zTT^K%pv&YEWkddB{sXve;wDpCU7HWUP&X=uV{$FD zk~23VLFrv+1tNOHRATXIiXs|EHX6nOA*Ntu7t)+Msn_QBUdH7iPb=xZi;B zQ-%GnpdUEd)F+z;PLWsom*jj#LtCkFQaDZ)$3dy?#5>(g>8k4J=NI}7s)&ZVUmbh= z_RTLSQ!qfD9z~_vO@8!-L`sNAN>)%o+XD#LT%b=WG6Nx z5^Kp;rbLzgXY%7Jp-_a5kAY8xFo!63h;4p{@_&__-y?^KVE>4G1X85^|} z#{&lnJ(Pt*i5k(VcGTxkWUqGCV4MlWMl4hOn0?vu>Y2A?)4Q_irfg;0yc?8E9-WGp$av?WTkCjMqtc6MGB?#_JB(2cvgJla6-Sl7BFH zXL8%jSM@%zlz_~wfW`U;bh_E)g?#G!uYCB*8vk2mk82w41?~niHNB~t-up+=HAAcR ztf&6o#k&_X9$(7iyT3o}Ij~xm#R>g$IL~swYSV?QLnf2`lWDlF-X|W;*|BkW%i+sb zA6y;Jx~kVZGmS@5_;Ve}4j)_NGnM@*NW(6tDi1#P)Mq?BDNj$v<4<|~f95%GXJU11 
z_3L0})`K6rx7_<5)wOMye(M{_$}1b<{jvLplKwZ6ombM8SJuAqvxbh1v5%Lx8iwzV zZH-QT_Vx7WYmjtR)u6GaOLt#dAOBnDSI=Z?>hHaE_pNntV|?TF z^|^G7Z}lt~f%R)2maU$nnrGAQ{Ta7E<@VoSO1Y1%PW-HX-^TE#&0FKA&~gJ+YXN*ipK>W!r47I<;friK*c++fELbxeWf+;OgnD zt7@(N&f@wjIID1f=s{q!Z-NO#x;CT@0cXbgPQs}$|9J_Pu zBUh$=I8{Hq=@@?K`Qzq4Y(|<%ll=>hAGJd^;%>=SpL{6%arh6z55JbIJdcxT)C4o* zt7vGNQ;z1OqbKKVgSqPX6AQJ`&h`2;zR^wJ=!c_!Qo1_v{&{rogA;d7tPgEES{^m^ zWtvW>noh4y{LS+id}pg7zI2e97<-ZIwxoLOnmpkpj{stucI?;?dogV*4nuHnFldZ4 zL)OB_5Z4sDvxN~Gx#j5$*teiE5O}&*@V1#9t~48DqQ-DQ?5>h`ipxo-hK7qZ&B@mLcc-&p579z!FL`(#pl6go+g~ zuLw$&Wu}PIf?isAh-Nm-dTC?Q^~DIZoiL8pDxlw-RXWO!KILR4H6@v>6l(;|?;Fxv z^=x*2$%N3K-DabZ#D|lb$(JGhWz?t`r&C!V1eB9K8nLHY_~mwo{DgBspWF=Pjf83+ zMkA+CuPBeMZNuSiLWxvS;e(_9q&gP-iKeGCjnQMU1S%&bbwKTL3;|9}XA}=!#|@1N zX~Bq$Ei4x#a9qJeomNb?eH99AVc_|FNF=u@_ZJ3)CE(_HzgD_mp~Ivl=|3Ws^b$E8 zRn-}VkFyS1EfgCo$Rcd zT2L{iZEpN$g!=z46aZ-#B;gETCmqd9+-0t6%E1S?2Sf+w{i%9FAxM>M8BJFoPdbnP z+&1>phOR$2^FO|{)o^09g6Igq{zuNHjB_yM9Aut>lyhL)YH0xMw>a(W1l$M9cb3)-~OORv4#)D9@%t_ByA%}*9g@B zNS|yQPC16rlD)9Vf3k69qhCOW)wF&*jl=z*5!trzlkV=5HEs-;fa( zhhQ3n(|*A)Vw{zycn+}eSHZ%Ya}>JfxJ}EWi;()3&4QUhKs!WGxSxSRF*O;68NL!gS>$K_nf@Dv*!aapV{~gLx+= zUkBU@Y-gG)=~s3z#LQUeo|CI1vXZ4i*-yEfJW6Fk_SAR*8Df zN$EGy%f!VuqL4w%Uyl+~u9#mlwGM<2c{TQX#rI&O9rtz{nwLCNO+^^*mHs!B>T^nT z>4eM^OvQadVmybbDuFzMY*wgxDM${nNw`!Je3Z0K-Zr&AfxRp%EecCitEMC4M>^22 z9@mgf@8zoj8;sMrM&sEyCi=#A=>qpmyiH*6FVH}CkH)^vq^*_RYuJ)AbZ^ykubB)6B-_-S^d8%6ItJxHO)azr4P&P4 z*y%s9KyA=)h7^M}wMoRK;rv1mB;cYip+q3QAT8CHpEn6d4g|hODtutlo%-DP>8!Tq}ShKq~9o_7v#z82> z9+*?^Ll5GaqgPT#uWY)nB%N21?kj5E{*yR`3YARF(n4=7MOv##^nZj?rpO~Hen>c}Aa68iV__7Is<@UZsuKBGN48d!TbVDw~S_zh;qQB+(8 zK`)Fn#K;0}rOInT&;!cil6Kc<9*3}37OyZ*cOU?x(6Ne`H>X0)K%Pk`=}@16k(fCI zu?J)Wf^#7-7#a0KIJ`8!$b`YTJ^{&f!*PudAQ>XgW2h2Qqqu(JR$?HvaWW(rg~%4B znHb(Z>ZF_MNV;2ihaI&?kr%U5EJpD$eqgshMU_+Ed79TSi)kINiY>L;TlEsl5?CK+K195a>mFx z04Mjd;`-v%-8`7+%~`+W3ZJ=GS{J~3tQE_WDY_NWUJjKF>{Vt5_9{OFj8$d_#Z6ZM zLLNhuKg`53&KFY77q*;ZS!Xp9Dk&l?|E9zLm}2RUp)KdI7OP;={ZFd7eO((urt4U$ 
z>sY#dly0VIeNCUo8>-T+LQEN*>b+guF}|-a_###lIc2`>N>gU zI+=4XhP9^qiKP^#HkDlWaHeY{)itul|G2Uv3(;OxXR506)ACHu%c-81H>+OG$!4x; zgB*!j=os0ykgdIgwQIYZt59ys$h)grM~zWT%U%*9T{;2uS@;0W5XZqD0MsG~Q9~F2 zZ9PQ3WM>tU}&{sOI%!zfC<*yO;b>p;?WAn7`g zLm=&kO=97slEAI&DRf08I=0u~6+5-14`>leW93zb3G|^ZKB!NlFr?nNPF)nU8#IJ` zz)w0pEQaebWwb!pgLdfjgcfF4?Mp-(nKi29TP9#H%Vh91jEkmQGcGM8Aqcn#P_*Df zaI=ttXo{c&5~f6-sy-Ug)%6e?Gd>6_x-V40tn)MkIef(`OpAcAiF2 zP0e7errlAG68wtzhp-8iT2P^i--vMu<4|Ip4h+l!#W3ZG$V@3=nol=GVyq&-GUpA9 z2UUqi2+$Qnus{Msl!vU9F)Tg_Q_`a7vm2maf$I!>Phg63KstlwV+fPm@=7PkGodcy zMAELU`wQes9D3fZn}cO0Xt#b7xwUJEaCI=Qe%o{nC2d1VTmVPXe-#unC`U{TT~kD` zHRB>%$E`fbT2k}E6N?QXvN{K{0z^Khd*F|z+(){uh(+KQP4Af`ASOf9vdYcE@a)?*mhB;xCqB%gWo%L_ZS&LpGe`)eIn^R0igltL=xF1QZC=7 z>*9UkN1;zb|M-Q>@rx<^`7b7I7kBS`8g(|m9?V*v@omw0)ON99WLkXewR8y-FR*W3 zz>Z2Mrix*E!?qC;1BLa^9sH1ZcOzj!XX-iO&^7Fneb8-JC= z#t;jaKqY5--v%H142)2EQ?b9aX}bp!7_(3&SQ>G12=k*Z{w_xT7Eg>?ae|3!d;G-2M#e3uFGS*Fqe4Z*&_YTrqj$6PMK7XKh}a7IClj^mSmY!c_3n=)>WYjFMHCn! zN&h=`OkzYc0@7>ps&+{|#>J^@P%(PKA|t9uXzgQmVo;LV##s|DE+htoc`-&ee~a&d zMD+H6XuB0;0g4TcC=43rNxp=K=q?DiV|3|+)-5TCM#(nTGl(eg2{m?yCcKHRiwoZF z{jqNJq#yqk+Lr~;?NU+57T7hdAj|#>%E&oCwm?N%*)0<6^rYNEGsW(T3gbwZI0NG~ z$*=Su=k}#iEJE2azkst_WK0Dm9@+2!Y#*e8Dk1%#)BU)QhDlk%4EM{WNKfWDYVJrV zMj{saVt{N{X-N2EbWB5C-wK7`*6XEjy#?4vC-(0~#zm5a2!4}9NjbmbT8*pUAaQhm z+HqgV_|K;NXVbker5rDPc5u^io@{x}-#MRb7}RQ7#B#?4-W-IF- zyKB~hcMq?>f&&H2ej{7?3Q&J}MH26JuC8IdYQ1`GcJ-yKt2Wt6H(P#Qc^YOW^=%tB zGF_u7{MC(qVc}{fOphA|w@bMyd^dpW_umhGa`?gc@1OW^`r%jB>^l3N^_<9hnzEj@ ztfw{Wsr^rmlB()224Hx?@Q++icO>9i72L(lzF&o#g<(_(p^NN+iUJA=x;TQ=(P4HG z)C}P#?V+h+`09@^#NXk*!-FOuFzfOL2Uz*FlxI<>AR_I4G-V8igX+KOc zNU3*+3mBpj3*L144*k-l?pnzPZ%t+R$|jX>uMw@7I0?H}Kqvup5>(bcDyLkc zqewvMJ8#0Lm)A1;{&9}Jlwy@dq$~gx_Jd~D0<(m5PxhAtjhCECS zh{I$sXd=ZRRFk+9Uma*%Vy19XG|67?io!}sL>eSs-3=I(QDJwlvs3-bjc!cVPg9`o z{35*6j|hA+{zc`b_{xf(Hjk`LJaRX!Pi*w1-F{NZy_On!ZPWeQ6DuqN*FEds$W-?I zq=;t+T0)ONkI->{FZ&WFiQ6Hr(E3P=X|IXIO7iyX7ktN`d*1iH 
z*@huueTnB^dgs$LxxYgb@)kL#$oV63nC_P8YD-9MD+j$e9W5JGn>e$l8ReR*Ch-*R%WbO?ta{R^r1P`dz@6m-PvxY zBJO;anwQ`KLv=4>#GcKX2_|B;G=M+BePEg~Ux)qVJJ_+e@llAo=J!i@yc4Su&{=c? zm4Ue_f>MezLP}lNS&f4E`W}bTSbNl(zzB`1hq}x_M(C09fbGy>6Vh~{t^6{-Y+=j` zF(pMAwPoe#RXr|7cv^f9&%J+wAl(c=8uk1ZJN-y{UB&*)DqE#s6bAUTsn~4>vLz8( zU>i-{Lb53&TfkSq0v%@`;!}Ywh11OV*8jL9eucI+41mm8p4{z4An+|J79gtDgl^ z{TGvcmr~A4+473jaS}cGQ}}cG|FhN7WWVzY!fJ3px2}2p%*LTVtLZf`xj+gUC$R~DCDocEJZ8W0DQ0PbQ>?l(SF}cMZcc) z4I0xRIg8{NLX~BDB|=X6HF91jhxm2rJLEXXStVzi96C0j$jW}5e6Nx7J~?FSBE3h> zZ;|F1~2~>`F@L>-zJAdD~gDi&H3M^*YA--TP=q_VUYFjP}uj$xk=6j zIb;jL^n?=8Zqg^@{7>Xel0zpbWe5Abv|@KD{V@gqQ*!>4oPSPEj2yzG(*I4)Uy$?v zkVA?g=@;ZMX%s$L&Pi3|G?GJ*z2M3_c3TE>{skgqRuoE?vG2#%!7iD&hL)8xtB2M+ zcSh4Z4$1Neb#k3OD-)}O>k}!vH_dl%o1ND4CR|{FVBEpJK5_Re2&GHeYwA|ct;W)P z9n8g6)ITJ}BY+-g3$f)fdAm+zLP`J>xryY&?ljL^IL=(6~A%iUw^@sztG&G*v%?tt3+ zIPZS#>UgUCRTH?B_Wdj8*E-idcl&UmJSDUt-M~Smi1nEd%kJW)R+J%h? z9ArrIgWKkPsOPMS+uxPso7ZR3{QhmT#d-)PLI)4CayL3YeEYs9)iRLghpFCC!1a!v zB;T+;%!aoF`76*>8n8I)B860Py@N^KyV05E{mMvnac!LsUfF7W0k`BgqOn}meRZF2>NwUujzRQJJ|t)@}*1H(O<<-334^JgnP+4AZqR!c7? 
z2%VC1OE)YZI@A1XxOV?1h`9dQnXR7J@onq% zx()Y-&1wGiEZ_ALOyHr1Z)~-UWB5DPX6|BG#lOW?G-w z;plgJsKMH_4nZ}3wvTYGx|NrHYW1vL+34A__GPV=G+Z_J-P;^~9=N}xpB>!hUqw#; zHjmdYy10^>WX=BjFK<~7vm88<9-`0Q)jT}(CH?3*j%@RI#lV!*ti5x;e#<(fXRs(4 zEcqD@85xo_2h!GqEPV+oh;4KDc^LbWe)I~4wt0H3oH1F)P1~FVmB+&#*`1`5QLUX>AG%QUB$pRzMQl~MvoHF4GLUjAdw)rY+G0oNQO2e zk?>uCiyke^pIbVqY~?a0{h>QQr4@Q_W$%PR)lb*^buBmv4cVbE<%2%#enS{)GsAs+ zHygK;R8a}+D2ZSB*oZ_7u0(v5a_EcQ)#zu5{pwLNC1(xgtYLL4vAFy7MnH+Hr8ss` zqMB1N_*No`-&Lzrn!--6$R+dF=3u_5;72C%Dm@pCpP}h!q#Qi+GK|F#Qv!hn5FVGB zDo4vsNk@PL@W;jeF&vmInoK)3&gA?zT*c>H+vi;8zvK3Q&ei`rZY0f({2$!G&$-^8 z)9VF9ea>}!&h>rH9Y^5j+~`j&mH(FSdR$ev^2$%kJZo>I%UV}To|KfE4y+0H!gs?@ zIC!6U{icSsiF@bno_oT>$Us&9l&#$eH4#+S#>}__ofHC`qNQNvovFR;06IY|-2;ZQX${EX{jm(cy=B zNh+4%07lUP9)_XmlO1e8QebG&CBXU*4A^6VzD%_TyF`ixDEhEB1u8ILPdk^CR3-2* zLOS>3eBR$Z=a8Qd4W$u0ru2K`Z!r6YFM5yMope5e$u=U02qseaD4K#LDx$#WF*9aK zie$x=c(@i!S&?C1G80x(Nm?l-74FB)w3SgZ;aoPe){rtJAQ~r$XTZ8LOov27Gw@H6 zbq28oK{ zQKp%&PXElYtH#Qlp?ksb>(`C?l4cr2^9;uZwG_-W-PBwc7P20S=Hr2+`JNLU`!!^D@EepgQTQs2DpicOOiBG6-nwnNy&2P zl%ZxUp9!*av|;EpLSelWjXnzjz*UD~uSPKmWw7U9-=*Phqf+@LrfXF9IZ_42^JX!S zBf%>Gf@Ii+7i6N15Qji!)bp9GcW*O1OdKeOqVq10ZM1@J3SOVT+P&LQvP(R!5k4D2 zFB?_?iYK359uVQa*vpaC)RR)EpD#DX4e@o9k7=KQ`!w!8Z(Y8&`18eum+@OQYSRW~ zSi?5Gfx~RVhKn^qDB-|(HCROna<;EdZP#ZMc$$X|__^4yb<^i6h8-eX)pXBcE;bDp z_|?89NNd$9)jc;9(;chsd(ow=Fs)uIr2C@3Vot*#6e6o>ZY^@=+r~RSHJcb1wr4=N z5UOc*ZSjP6`}$#Yk+YERdd%11q9}H%Pde`_g+bsdb1XOsga(e>BDIz&A%?vIL32bP zRSa7*am@gfZ0y1Ts5^kp8pi^}v1T^`KbS02NI81ANzEpJ3Dj*ri2vcC&==r|n1hrh zVAX|OWro*;@=b7{Ok+8SS>(ka1_usgYU>W6L8?aAh_OOlFCPmemzvce(Y>P}$<0}L)7T32o?r7z7cX2d(IQwc*5Oc&U2K@9Rg5yD7?nez-VCzQ zl~R=(U4Le7pb86vEDWyP^FTIH`xrWgw-OJ=r?$%g-3E&Xw0f98eO%nQ3S=l@Rk z#w+{dORe#x-SOXSB@c2FTe%MCp~vb?f{n1=c$#mc4YbNdo_L6G{S-p1Kt$;0(A#ec z0@@H(dvAaN=?$cEYvBX74v}yhUcc&ybgx_6?+RXKK(99)7`<$7(W_oIc6hXdG1#yC zU%SISyN8{;?`!&SU#lZM?eE7y|A%8h;tj--1khIdYE9(jdW*v%IowhYzsD`*<;Ac8 zGu|&DjIF+*E6DSmqiI=)(YZD9fWp<0<=ny>t1ZvNFb7v2tQ9r`+UYz!T8*#T3gv$DeL 
z8QMUNnssou?86E=g5kP8E#w6@#A8jdF&KhjUL%EGv@Saio9rph$zJoJe_3>%F^2_d z!!>MZBeqW26lcewdFr)5;^)YV>?qgc$}UF&V}@Y}hE%ogxSq=2NUFL%^;oSv#yiJV z{{mza9b|Ivmw#8@Iq^jXf152moGI?llv^|9t+~DIk%RGx{qfl^#%CY=;JLOaKK%=! z6O$&!+DIB5`)(MGOzj-MbLRG$Kf8OmGY7LLKQ4b*z9s#01V0#^-kG~|_4d{KPyhAE zCr9>1U+QxAXV0}}&)t&tMsQn3*^&LsbSpExbEB0xaWHjkYwizMzmw54zH|2W*}LBU z$x`cNX?N-Z-)GN3xcT8Asj9YSQB@7ns%kmJH~BoPs_*!k+2tful{mVphM+V4(hTGX z3jHe+m@eb(ixoNH0k9u)!aW-SFQUNQ)Bpeg literal 0 HcmV?d00001 diff --git a/config/__pycache__/ec_transfer.cpython-312.pyc b/config/__pycache__/ec_transfer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..51dcc55798e53304725c477bfe232fe1ba260d41 GIT binary patch literal 4526 zcmd59T}&Lud3JB_AMSv|2J^cPk>OnX&bW4PY|A#m#Ib@Qft5Q{Y$lIkS{JiOd;55>_Nrk~PVoi-4TdGwPfb#j=*a~In`WT-7L zqURUbX|RdlQ&@q+euXUs5j~ynJP?P$Izq4k%DGU#5Nd!1Txd`T;|TDEJl-g7@+5{m zXfuv^(1;6-3UM3*-X?e5EVN)htYhvvF0|VC7H7>O+=de$w?2ZC&?dCw1kfLFtPgx` zE*~o#Y|u2}Lff%F!zu^va6*UDtsI206CYCAaTh>4?r@%Wx;|i8@;_vBHm!@vx$+ zG*iq-RIQ?Q2ymX3BEAg=plceNozI}M5TqIw^O8Al1qe0^M3-G3QpJ%Uel$+EKZNc+ znnUj~riWi}vj|XLGbxANmY8}pEI@n|vZy+MG4B}E;57&fUd?!%KrMlnWz(?9WTbI+7AuOX&wi zU@JV2Z!78?HcY6>oPo8B6?A=Mg(#Z{)CP@Mh>-tWNSoPwIAoSt6Pv~1dmj5Oqim+SeW)#UgUi&S!{cX$0rHKH%Vp2jKY%Ba?Z`uOGG z(?r+(bN9~u+*liVu=J$=+CZCCqXY;U+3trMC} z^;uo&B148uxbmH*FnddQ0N2>JNqK;N!PO;xNM*FtCGLzL4DBtu+}*Ih?>ly3cXlbF zB1&?Tr32GY67saHLJfdupauFYUpkoLr#vu)LApX7l*ZrCuw-D~$YWW}6zd%PEhtBH z;niSTTMTR-<9DR*h)IS?430*4=q#>KfZN1R`%|Qif~h;L!1&m8dRatfB&~qQ3CR)C z42Xve8Gyl{e&xv#O4=;Ky)a8I(DPXpET#Y*ZDWqgHc;Pj!?D-1A^AyD?2T;Tv<=gM0!*)LZVRA6p1q?@6+ z^sN_{>E%y7J{Hjw7BhIgLmH4qg$FDtZ|9#JIdp_^{(~;`vwaW0dO6c1E$RcDIGVy4V zu&*KN1S%MUyhJ-Y`n{?T5HI@A!eYPZUufh2I@*?)CGY7qg&c3PgnlkrKDaECKD)A5 zzMB}necTsj{c3e8%>nu$L?0Q?JTXg0p7ZLSbXnNR=!Om|4~_EU8}Z{$;=>#9;l~4y zZ&l+LE1`?_ag!{eWRuB$DJ6fEg%BF28Lak4_ zXg5f^G)0FXeDj!?MT?uw$Pmr7Fj7a>=cMWQA*AkykI@Tc~nLN_K>j z@#V>t@haC(0Ve^__d@x~^2~~I_nj(tm;#dk?CuBP)XJ&5LY3>Qu`B>XsJG7+Smk;t 
z*-;>C>n)Ejf4|E0Y;u>k5}lN%%5`pXum1hS>CcoePrP2?dRNW0vGv5-YahK|C5mVV4Q z_iuZ?s7UZsms5=#^uxw#r7uW$3({wMs$i;`K|+*|J_ZSVsUhc@u)TbhE^Srpt2%Vv ziQx-m5)kwbH!{${-N!K70mQ`rf}(##y?;Z=4V3%}9sLTuxYgXg^3F!{KsmCxGYhw@ rbgc@N*1?VNP&oj0VMCitWuvqUeZ$!RTaW(&%@jqt literal 0 HcmV?d00001 diff --git a/config/__pycache__/kv_events.cpython-312.pyc b/config/__pycache__/kv_events.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c1a5e4195730de9930272bd793b8dab7effa53df GIT binary patch literal 1417 zcmY*Z&u<$=6rS0gwb#GmBn>HT5x5DYairQp5CUx!sY+XlTDb?*9#%-B&CWPkI{Txu z>$J9tWQl@|5F(sXZg525fcOizb14{!Xe1;g)B`t17DD30%P6Gp~z%Oo~-eEa@z+45XgWBEeMsZG0oF0J+XC$Ji{^)8?l_1xAG9s zJT)}%HXz1hKyn9Kn(QFmz+|_~ZoY2*sl0V4^0(ZWa+^uhxb9L$q){MtY&*=3B1v*i z;J4jvC(Dv-dPc_IOJg5UU_k;b?T)P1(urmvYS0|f*X3yeXtFMXzBLYDddo6tf#l9e zB4hGt3^A=DElFO;II_yLL{KlEL=`h{$*RyP8CA}B#i}NJ>X=U_yqfW8YbMD$lkwTd zz?$u%`YADW^X|1>>c`P$@^-zYbnkFGb_2iG4tVY6-5OOPwXn0r+-S%3w;R{RJoW7@ zM)AE}?AXo@#Ys%mQc@DywmU4YYXbQl#)RJU?u$Zpo$_8ecEU@|=E|kj)zvj&QlEr@ z>&K$XX~^s@o;cj;3jm1nR;S&j97iz?BloM1ArSh`15ZqO_CtKXLpu~lZjTBej5r8g z$I0|fR93xLT^iqzun!)A_Y?O?=+8@kHYNW*;TnNHB-$f@kKJ9+1$Di}%hJP366zYS zD7&C+C{RRMJ5nXo{#kFvUf2xX@M3K9Z5m&E*jan)&3kLK$=t1G*o}7rzq!kp*Gy-t zDaA}>Y=m7=!8i<}7`wh3W88b;WTzS@PT!EvixwpG!C|TT`Nk(3PfO^UqH!Fejko zjJ}re5je^LhCnG|q1e2SyUdYC&`*X!2yyp(y1@ zIg#KNYLNjBR$%VS;vyU1VFTtM1)2>5_R@zw^kKkOK%NT3;4;j^(1*Uc^MC<++IJ*H zJC4&nZfE=MzTdn1{_f~MwzY*3G>!YQ`r8;n|DZs#1>DKTPhoNoDM(>7Bse1@FaWa} zTkr`!hT?o0SMUq|LO=)tD8wpED*O$_j;9#QpHs1>qa1>-)UARqXw{=HdI3{%1^mYek5$?qCPdXmL z8E6x_a2)t|Ii3JLovs`!?B25TxQp!)x^WM1?Q&t@$+1fJ9ZuMzB$XZ*dvTw#8}|Tq z&mGS5T*AZlZowu!Y~L1aKgh#=lmuuSsIaMit9>kcok+To!{nUp!|D-dEECC8i#nex z5?)aalc-Z=6D#~!mX{?tk9pG-OOIQv;Ae2Ij%Bk*tPs|xRUM0}VsUa=k*sza2IU+q zAu4!QmGLw+5B~DYQ7gJhGSn)zBZd?;cOvOGzM;mnHIyU>K@DpnWvqsVkB+2|z|XMd z2OVlj-8c@6G~!A{VnR&D6r7XFnknWax~+<}2WZ|dMSL3$Kw*emtLG_F0m4_qVo5Ud z7p-orOH&lWqyPpGwW2I(;*D7=K(JXRy6k$J9trv3kH+%G2hiO^)978s^yo8g?g7bb 
zCY7+;9Mg=O8Hn@kt?6xr`Gjre@Jwhs7eYx|O~YoBi-l6zbmD88NTqx_?6F=U#aUIsu(P~m@KzN#pY>SV`v%H~==bSm!pi1+epyUIF{TWC}y};GI=Ux&k(Vy6SD> zIV+?+hKn?+%LZ0JtAXV*QO!!qZ?#ZEY3h{aD=4p6J~*%yz`9&iuocSV+lo4k4HIfP zXJ9R7wYWaAf>cZdZi7ZFM96Yjv_D4T6iwY}1;(=1=rt0~N?I9TAf!l06bKI(@*)fd^(#*X zDQlYuH-=$~s;-)%SnYo%Y@eb}Qt4&rYG^eQ`^o9Mr|XfuPm=tvx_;gJi{4)*8p-th z`47j}fOhKcsd{ApGWTiggVxVp|03Hsb$Mm*@{`WQYUe=*yY${VVqQJ?8$7%k=W686UG~ z0t`K#;%t*|dWEE6L2?ATt@iI$4Y*FdLIQq(%Bst_I{1typ2LZ$#skpR&{{11IL5EU z_+_phJN)O^)kj^wPy8nFMbDoOG)Ax1hp*OySN}&AAuq!YpRtJ*9?Qa&XTO)9F-_Dt zaPvJ~9e9pKnhYweu(7itCOtzOYK%_ShbQa7$^Sb;oNIm{Y>yGw5KtfN4~XX)qRCJI zLlAfmBIk%+)d!3h{U-o;Ui8mwl>k24=9oF}={12IZ?dEpD3%W{%aqTqESB#EhA$@f z?b*Cqol3Kdem>Dh##4{a>Vapxx+C9BSjp&v4k{0g@}ZU3(Bs(fN^JPi{ztbOvDfOs z*X-k_Sds*`@f2?bMNyXuSQM?WD7w#D(P|aNn`KFJDJ`O?6lGC#lGQG%F;SFsz39Bq z3?SL{wgZ%sPUztCP%BENlh}4ncTucQ)lIUSGIi6ghj!GC7N@APXl1nilCUXL z#iB-16dtB<5F!~K+Gd5&rvVwE-AUS=f$o9hP1}~Fjbdp>H)Om6_54TZ+W_*PTpQS5 zyS%V}k^AUSgWC^f(jQz8pyb}#8}r7(^}E#u$FFj!wZy(U*R>!wxP6pm4`tzN;|muW z+yF%!1w{Y8+NJsIf^zqV4Q?+*9tULK03at8j@=a+T>m=D`cvyclRfWsTspTLUwZxH_Zr;TDmSwBQl`!& z7UiY$AI~(n3{{l`RWBZ@bA5}~8(eCYJG;)c`+FAU4Fto+3sHY`okxKvWxd8Mcl{I5 zal_}KK-{5bnI(==vzyc`^LuFMP8*H%~I#yusuFT zL3*P(85KO`1odfh1#t9IH}cTIWymlaLBz!Vf+BxK$-kl03QB#A4t-f?gxG*}COW!T^<6aF{*4#S*b){#AcVc7`ng5PpB!rsY$0eZ$? 
AjsO4v literal 0 HcmV?d00001 diff --git a/config/__pycache__/load.cpython-312.pyc b/config/__pycache__/load.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..80b62b1805827267d22a2fc6fa0279468bfd9eda GIT binary patch literal 3396 zcmai0O>7j&6|Vjp&#!0fu`v+94K}+murolw&H~wJMPQfMa#q0swzE;I>8cqw^v_V; z1B}BeRzxJw9%4>-l^mQK5h%(bhdu5QDHnSO39GdvMo|>yHY|}M?J4hd_Y5}KD6N_L zd4Kil)%RZ2p9_U7fzOsdvGi?({0%4FFO>{l{s9K}2_;mpiH5ys3y!FXj-*Kf&LvxR z6isnbS}K~$cAt~h($QG4Gfq~^M&p#7bMjhVAWY;1(Cf1cPQTV4@zQqDDQP8vxH6Np ze)yL}!t!k3p%D9_4YDjti{w2*Gq(xNJ``j6!^F*Qua#TEMDCwRot+TF!rX@+oY7Ce zf9CY`x!LzZ>0P%4ROPJ2Y#J78*y+~|+oDF`@d+W!z`!tV!}np5GCj9u)nSygTr1FR zuU=;;m7DPlde-BP5ujXYj=8?atp&zUM;d10RMhnX`4Md8K4C-?DAB~9KtMH#iki$6 zmRjf|g!NJB6sTkfm9;cesGLxhgqqSaAlpa#;yDB;%W{v>y%6PDjujG}OwUsPCQI2@ znlyKtY|<^V5>s}->P$&27C>zbd3v(!;gD9NrY?!Cs4L71ti-)pp zEVRNMzENXAblyGy0;K%_R$%oO0HH`(&1l+zUNcO<6G%s^L~@Ml>?VlnF{r{6W!Ei} zZGj46U4sA1G&GQNLR~mdCdApV3xKY=Ki(F*20w|d>Sooc*r6`ig$T4DN>Z8(>y@y}io za8cor*oEJf%xgk0?aF;D-V!0&lJOXp&#K9H@!cQKLghH2UYutxyUw_3s4lyqMx{fw zeAS?oQ8n-YR4|KUe5j`mKxuLYJR?vo`1z{knsyUaEH}bo3>9CsEg$%`o+d~ewHh-6 zKN2%Nr_l^zDA&2sn6G4eq91VYxhYDLOYX7MPKS3qZxeWd_;DzS*SVQiurvujj=7SfSDrIEz^c5T#(%XJ(UIL2E(@ z`!G;^XQYib(u7!Qc8HH(7!-V${v49Z16Z$Y8 z9}d%zy)d#Vn>I5(0#W}MG>fFOi|kSF^nF?0dH>|SlfU&>rhaql>FCk5(WC9rW1q7p z<(a4D3v1;I?ee8N>Gk5sonoARO0;{}l8kl5+?*UZj?!o!4DOR#!j)bU0U`OiPzRU` zvtUNCy&G_$iIB6l&x;`Cqn+-)wn63s-!ve_qhyW?wa~Z@U&ovmJ=UDcxP7%V#UloE^sZPbKKLFmgptt)gy@CJ+l8;KD zkw^$&6Lu`m@)6*2q%NE~hlQ+r7t}l(c8arZVN-8Fx~tv#kDTPDIHB(M0T`0a+%OeC z2twI%Yu<#&52N{A&}@m~j_w14>91AKh4IaSOr9bCCB@O;acBTHX_Eie@-J5RRKLho z|8)AZV*A+pkB@wDZ05`S;IF6ePOq0{*Gr@SQlvtECre7BtLZ&ayeGDYXhv|Bl z)pbBm)5dXL*RM4VJK^-{I`vFlkC4Fk<8JVM7#4Nia9uC1K0feE$Qr~3Lm$f2G6Oz@ z^BgwA*kBys&V;JQcSpm^o@et>WF5o?pGEu#Hj~)kg%v5qJx-v+@|dH1p(3FVaZ zi|ypZ_a6sO-kw8pu=>X2+UR6koUehDyrzW%DJ0?!`Ds4KMxd!N?Qk2x^Y~vD_iheOCSoyO$D49gXO)+tl#t z@EZ>0e7`}-i zJ_cQw3R(?7F~>yXm~mmc(V_<23}$qO(Q&}7ozN=1XE5_Q-mT#PJ|Qac^;T&H7Ss!T 
zq=dPM9w!3F+ci`pO8oF)H=*XjOQ^E>9TbD=;Qte=#IG!TRmZS`l0Y~X&=?(C5d+Ve z7}f9avmk=S&;Kbj9Z3)bcuxt%zmom`AP1k31Aiyu&&b$wsq{=f^%r?&V{mwJdVORk qoagh*(9)%LX3t`(lOkezUC91hGzDR_L$HPTiaBi3FJzoYQT`8&tcYj; literal 0 HcmV?d00001 diff --git a/config/__pycache__/lora.cpython-312.pyc b/config/__pycache__/lora.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72ec99833dac23449650a27cda8758a2d17de90a GIT binary patch literal 4632 zcmb6cTWs6b^-`iF>Sf86Y&kEPapI;*8rjavS(Bv=^61;`k}T~w6f7-X*>p)ty`-GT za9d;O0C&UC7#Ls|?NApO&=v`>e(rPF$1q?&)@;Bk(X1)DedJG_3k=wPb`B{qPUZj| zA)R~fdELW%&OPVyFUe$#z@ziO)CRf<`35JQC)`Fhvm7ByL?tSx6A8zF&KUtIU<9S0 z!Araml0uxv4@+U-2la>{NP-cSqDD-LxjbHv8wn}l;vqe0bV*$<9@bMvT1vZkMDI3w zq#hR+^o)^}U}+j)Nw68!dyPJ+&*jDRoY61!bHwCnQ0jw!e}K>)y5lBzijTCDhSc~D z#C8FeaIxKhC0%R}O@n*_{({Mj#@1 z62UwI0&ML`0Cx#&Od-H_0+sCoa0OP$DQrTsnZoL_U9VD0R&>2`jjAQ!`w!p7Hqp!y zp+w?TA_aa7=_Li#fW*@f4XeS^AQpv1Xhh{z{#K|H2uYw(;DJLVy};Q{z&Sc1`DfAAE$S+meYb>m}W!Xa`>EhQtc1X`i2&e$-d@kxqI@y|j-; zG08xu8+FvEl%xH02i;K)Al>(^r|nw?IcoMDUfKz7yG!l8)z?WwHA8m;FZT}b+ux)1 z(>&02fc!u`TG;8tp08Xw>qT}p=Z!U{*jmLDAs0mC7Fi#x>?954I%Ew~WTu#}vn#NHz@bwVg1dc8!@OJT09k1V|A6n}2c>dy>4#&2iIP zCEAj6WV)SP9aS7VLFdRrwK_*eTW7PYX-H1JCzZ%k3ujt9=ix#QhpLyQgKR zI^1cjNDn-ArbC$HIy%$69ooIIbAVLtR-jW#rgQiB9jqhwW?(LGnG}M`pCN!TvHiS! 
z_R=`i_s7MTr>IG1C=(UYq}RkU7II8K$QsByyCT|CFe|wQ`!QdgnpTZb;3>AK!4ERl zEa^2=fwZ@$s907?&;cw_*DT6ukU*2E#~qqo7GB?>{P6jA6(p5a-9&% ztYW~MfNQXjHB5~O>YgHS8g~RjF**#^hpX`F6?1q-*NtH>G{abyiq*Q4@WM`IG&!5Q zSNDrs8PC8n)`tM*Nvoga#f3;yNWItfZr2TaBUfnT3hTl_pR)AU;#=#&BmR6NH{8e# zuL~nSW$Eq3x7URO{(K`h+Q^Nr3rDt4jy7^f*M(z$&+J?}wRq}7Yh~<%xs8D%je#R; z1IPYA*E7#=WUe+cSJyJHEC|h1ZXvY^{t+kS#t(wR;}z(>Vha7+E2e)0WQjn@M2!cY z`ONmEbdJcr^zNZo2vl<_2?j8Ch|8aid{CXna z>LTgvLcEnET|FDgfktv*W&bC|kBfgzj(pj(yV_Wd>}thaTkItIobc|vo~xy;-4J^mo|Mhs19({ARc?l` z9_W)>H9;Pqt--I8dx6|OjJDq$J6(Z^cGR;(i0${Y?X^2P_pe!pYt=aX+{;lR>=qIy zndTjoDXy`Q?3 zSx+ARGC%ZN`stP5$)Cw<`5z*`6(n6nH+%)r5oFm^3@Xb`Oh$KVI^qdgexs)7ZB9g% z)k;a0-NNc5UQn*1r^?x_K3Eo0s9T$-0oS3%1ug7XN#%bL%?8lgQ4`DYfaHU3bOA8Cq1t9))byT%W-0>O~b63E^mP&vAsU3?tWS^+MUy>oD6 z{@EL$rTAieS!<*Ztbrc?^cN$?o9Rr84f=LaKL6==1O{(ka3=}_ ztO$b>hK~a12Z8{@Y~VZF1=AVEQ(H;BWllJ3YPg&ovQz9nIc&Wi6VY z58p{-R?o=-G;A>57LM^;G>#5y2+!NnF2etu=W=kay!)V~5R Vaom{}K?s2ioJJsg%j4Yj`oA3XZZH4< literal 0 HcmV?d00001 diff --git a/config/__pycache__/model.cpython-312.pyc b/config/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c3ae8de216b2a34fc05492b43b92b0e3cc929e58 GIT binary patch literal 74258 zcmcG%33OXmdL{@EAPEv60TST8f&0Ew3$;@swTs#)yNoIug7_XON&rYdfLc(XORhL6 zs>+F}R1!;7u9&X4r{pP3!eo*zRl1X?CY_FZ`pg+Tk`CZEN*`5PeL8i{^c>l8^;Bhg zdglA@Tkt?V+OA6SBtG1C?|=9I-uvHwx%b}}6y%%m1g+ngF@3>o`j7OXK1GV-=_lDH z)0?JqCeFl}gC?K+YxbGhca|>;zga;`$m+A2Da;bg4(0fALb<+N_H7N?LV3PC=FSf0 zhwMIk$l-H@3Va2jLSJFX>2tC$bI|2;A%0Gs;#GyJebvmJ7pw`@`f5XUzPgat=VgBR!TL~xuYtMk!NyRNuZg)G!RAnl zuO-y#YYnye+L&KKuszh_>tOD}U}vby*Tvk5Hp>4ixq3yoyp<&-Ji{}dN2<`Ok4DIsmV&6r<-Jw0cJzk|6{)6%%>!H zA~fzBXYSJA$c)|02HXnE#jH@-o+paMd%{OK>$Xm!Ch+2Y81Zi?{Nl(`J5D{)*)y{R@C> zWRR0^H8B^*HFGV~S<@!2^}A-SZQ9JWzh&|9NU4dR;=e>NN_x{=$1Z$L;~&@gT{GSU zc~OD%Wk7ZjB;ShKmf^3M>%P~c*7;tq>dG=5GI4#cnYjM%X8|0K+P>bda9OOs2L->_jH-|fnkgF{A z5#t=Vqx`k09PSw4uA@bdzh&0a*^izuEEVfN#7pPI?J|a(a&RZm&M{W5P3nl=3v3$S 
z#*L$7U(Pe-nYibpe})ERPa*`k&Jx?t>X{QPEYf~duGWsIWGu&IO?AZ2BK55 zLa5y#IpjPJ;wuo9@~JVD#A?WWamqgv3QY6z zt8ncE7eTIkc#`)=g+Mq$S@VKFI(wN9&&1$Tax9*S_yI+2wZdtuVB1cRZTOwXQR;5sivdS2Eg8LKn`Wbq=BHGs4*j;!I?FfV2Ep$j}G+1P-XnYu3J z2>jK6z_sTITQQVrqEAYY&au+bXnG|N`*jUWwguqm^ z-72SJjRa?}O1Yt#2%v%(4D*rbY^Xg)3B4EyPD!~_!P!8xf2(A>pt`b|LkSripy4kB zuF_~hLrD(B4ZOb)kz5M;^PC!9oZ-WuwDM33Ne&&VN2GiVNTBJgFnJLJI1htV?p(=* zvBC!gf7&NI6bM}i2s^1E_Tbf?EzlC0ZW&=WeNwv!dnw$)V5MAT;4x3Bz<=i4`1sfv z|HR1IV`ty9=%TY0b*bS8PL4k}c4p%21hAwmAY7P1xiRDjH6woD*6dXt)2dL8oF?(7 zJ(Y^<>32c3ziHx4M1%XXK$)9;7B0(Y<+J%5&O$}UI$-tX@-}>DYu|Z%KE895?_8f9 z^L{R^Qt-8Dz6E?CpUo(V94=3b>*QU4%2&Saz9POD-wrL@&3pKgOD$SiHHVUZftG$L zU#8_+DEoqLD(CF?oGP{CtKchyRVhCYqj!S z<7?x~&=R%Ecb%^tG<6Q=)!;iBe7*MF#l9P~?{41Et`?zb_M1*U`}L&3@ahy2sxP%vO@_ug%D+w zD;U002|10BGZ}GqDIsSOGNH#QCFX7={2anF>%2z^dG48zy-LXQ2+1t_J{E!=oyy|& zgJ%8$cL1}AuC-X-Q7>xv#*J{fp#8@H{Q@ZTO3?GqhSCng$8FK~p;{yU;pspO8J#9zc{{SsHj z9ldu<-Iww&a>w~$?1hH0t2*(TRo$PezrL3c$Ip!;hO)EgeuF!ihVz|ATx#7@NIw8~ zG2Ex&zQ9j%XI|6xy*`ke2vz1Dz?{{k!Lu+*WkTmW#aS>C^YAD4$T_4sjopgLcaeL8 zdmdji+!xdok@h9j;05?z*7;`hHWre@`4AG+g?y2}gpd$=`b!82ZwSfZ^I52kdl8|t zy3i4Rj(dr46}kBl_A=+b---UPa#Q%e z#!d6rmG6uAjv-H_w~^uue7;ORa9@HuuDCD5y}(+G+B&Ffu~!s`5FoyyxWjPYkn=%G zR&EyGuj;<%@cmT<_GP$#MG1QiD-m*5YAPV)S9P%>`2ID_$V<#G%72Zeki*SmFS$r= z8KY6YD}cNSpM!8;h5PGpXN(E%8hpR;jPG^$zOL0WrmN+zYrbFB`TmCH8`t^1q4_T8 ze3vxeS9HF&G~ci2e1B8(y`l5{rsn&q&iA)8->>R?e_QkY6`k+z!1p!X_;^!;`c)m& zZO!-BbiQ{q->>O>zXjh#U5ekSTp>pIBq0P-8U^nMrLuj{_w<|-I& z{yKVU`Fd{qEo{SS%OmfdV%75Ryu5exVoU0cUgW)OBhwRep1#V5d+9aU-Pf~qzm&}a zX-S7su_%_yKEpg78Bak{*kUqL8vjg>y=5AtEDf>`ECpl|nv43`o-!g8fmLz^`!?2r{%|0~Q{}0J z{ltd=)5As$O}q%&Cv*H%0Q?yLRA3Ujwd;}tfwSmkVTJ?SpbN233T*AdQqi-F zh)Ojc_!k?Q6h%r|BSD^@lUpEPF5ygQE-002QMG-c%q@_LHkKeFxo2ke1q#kj&!B^8 zH<)IvlxhiP`WJ@KN_7Z`TnzbTNWvPhTEJ29N5ClZhb~CP08n{EQKjW5 zHPw&M!+$|$nMfrwU^@xmPG3P>BpelhS%eaVJ@7zrr)G#*B^6&m|57u~hY9&GD}bv8 zuFSj;o>V9cAg#${!yic?p5C>ZOkc}>6Fm6^bKHDM4?}Qx;-*UqcU}IL-zuGAHf|@J_~WF{#ifHd@oClxsbI9 
z-fIwMQ*=;GBW~8>S)*GtU#sd1*lnusC1M?Jc#KlIM4Yt^kGh{w)4fD2whfO8bJo;o ziZ-Z$O(uclofgg-gp~N}+030Cgrs=Eg3@)XaF=>iSNb>06t^s9Pno%#*KC`#W`7!5 zb*bf0_efkWXSCs$3^EH9X@n%nJx_c0aZdzI(;ku&2EhwSh^ ze-R547B_Fe8|JTinS{eT6Y&N(j_175S&&5dlGlYUEjn20^8$~sK-4>f{}Jy@crrLo zQD(x7ijo)4h&Kq9taoNgiwWq!)D))k2m=FCd~O~B3lJpUX(2Fou_s>xJtfT2l#G&* zlta!8bMwpdeDJy#0pM|?xF~8csH85{ zWFX9;O|VL7h1evA9u43i=oA}-6@G4JYD%CrMj+%O>g7@dLWWRa4ip>Kyhz5OHasfYoi#|C8J&107VFu?XyV8oIAmBq zY8&Z5)<3&pdRW!G97wbeuUGAOVlg!~eXsqU_TTG#V#;c4dR*gOF8cO!PqN{DO11n6 z6?X!~&jCRSw3G56P>T95^VcJiIlsM5$_dQPK|UkA065`1y~6aGq1PpP1?lxNy(0AT z(`$iVH|X^Wy?&EkSLii~7i}N>tSe=TMam<=->)FXlsY5#d`D;oI~oqp(~WPH12$2eQL~EnB4=nR3F;0|MEZX+9Ed&!J5T z6Phq0IKs9f3e8MO%emN5IE74($Q?^02%BQ|3E!c-9Qq!Km2cKp+o)%Lh!=Y1Nx7-k zyO^7@Io~L}S-2E^Gj=PsZfn@|J6YBymbI=0{qJQz#V>{zQgx;Xx%!<6S@E%M5`t>tlP#qHK_1{duQ zZDqIZ>$awc&eAttx%tXllgq8&4kl~+#oGS0+QBvFP|~?abnaPm?n~JA{p^!|ldIxm zQ*mD5-#WJZC&$iDY^I`;H}h}hf9Pp^nw3@2^d!erQS)};ox)^!k67NbR^InCFT14l ziNjP~ld5g_q$sbb_=(5lF8in?zo_hEi>aXMNdc1khX~Cpf3>&z&@R(o?;0sSQe^pe zE;HUzjy$xb-1#|3Uoj!+Cl*2%`hMy|-@j>!XBkn^>a;Uzdhxc&^d$_&1w8CVFXw!+8O((*?27IhiKZ|7MGDq{sn*hQz;xJV;yxo1L`*(Y9 z9$h@JRPxYSal83l&$_eavAgE>)VtieyEE0)@x7z(9K9R)y_2_2JZ|c|dvN8zea~9c zo~08Q37-0dyX&E+_V&U(ZrwBZunexdm)6UMA9>1?o>tM*dbeO@e%-S><*EK8$K!DR1CWW#ikucY6P( zvQt4gcjfJ^%Y%2VYwqsF(UjAjaMrzh^lmh1q^Rjmx8Vn?G>%0fCCj z)7auPRdq3JnbcYvIThlMTKnlDcDG+MeXaXV(@hgm7#p7(+c@)06VVy+6IJ@yWV)oy zm1-Dj;f-p%gG<_+mk#H$jB`n2MS89Sq`@v9zEt)Nj~X8()$lc{>iJU53{q0(XT;IU z3`m20`;8*w6cAIZBOna+?kr~MhQi_#F*UwNMmY^~*OgPtElR=)_8{L%)t&y0R;zBc zWrWI%d<^2~^3gCQTBoLie7veV{j0_iie^j;1;#K7g$Q$o4-f>ojG-vwVm=*zjoi=*y&%SV>p&B?N7s~F2Ihe?({FPq41vl`^p@#;E8)M5;e?= zJB4DzJCcT1U2#v`sg5M{`!UrY&<9m_`d4iOq}`(WMo+3P#5u0I)4!a9GpXwz;%WE< z=(A}k)upyQAZZ)Qp6K(c3vrCwl-8IH4`2=2^j?9kEZu4pzfR4gZ(Y->Pe& zBwlhUpnj!)ZvtLz%Y{_<%$+6@K?zdFH~{vg=%BCsJ8fV9-3UQm=g`M6^tb9#u>|GN_^g1uu7;<7SqfLu->M7H zbE-T2tHW)LHw#FE_?(B$-A(ARhG<+(LB^1Udc=vTe(B$M{S~vIpA+lUKKm*tDZ^Y! 
zST1|xWs%*wTo`sOG;ESf19E8rp<93HE9tydSG+;ZlM2W+(0id#x7IYq8_}Ol-_drd z=>3vQud4CWzxt8LcFau+&G9Dm$cPFPZ^oB41MBF&uc;8}U#>actoACy2BS7-t+8Yl zT2b@GbX?W->KAo+d_(n3|8gyA-7!<^=_G@+4Csfd=&!43as4y2`WnQ0*Zdnq6(3R| z!$ct(!0BNC@f#|9`uB!2ou6)2*HVt&QehF_cvd%q2q1pXv2(qJtI?6=f->0kJ6in$0g=#hJ^-&gkBz=;!TN`QS!b*Fy; zp{Ew!Nr$K|<9I6GBmTG!(x|<^tA?k4g|EbITpOE-Xg)TI^==wkb%DyvQQen}Mq|5L zDoC?B7HrXG6&ic~EaEYNUzHQjjoadR@%*?w?zq?aeFe`KI^rFlj=68NzR@;mnK4af zy=YJ4aJ=n?`SrYbYrO57S+5sZO$!|hgxTLybEa_}?@-3@gzAer|BtFW{VNo4gtG<| zoBuf#3gN%6y3@ZA7ySP){jIvxQI7WLLQlP*`UCn;RCoF}4UKT(SZsLG;n*{nf?rzb zjCWr8(+xoQyhNOh4G-51nlnrGT&RkTn51_@;_Fdv523Fp0nOnapKgU3hC9zHUDaO{kf zKLt}QNa&_`A@*miOd}H$W8)J?PLBKcj~qBQHa@B^ix#oVdjYb+bY#*RgH@9EY9Qhb z&mv@YK1_AxdGEz&bS|=^w-+`g7w0dKh>D4{x=}<@Ymwz!7tm_Gq@mTz&BE44=pAK0 zP)&iX5!_jlsFCy%a?*$wveJvQSE)Ujpbg^BnP`ttg)WNiR2f*^V0Ov&CEZNPOsxeK zn+iA&A>FhW4Ic8UreMM{Qh*3CAHuuI%RxwmgR#D|q@Bcc;wS}*8JVA(gZK;0iE=3B zWLg?Mkdis|`clvW+Tbx2mONVF{F4{w!T) z^|!MiFaU#;roKovTv==qG+)nxTi(2zuUp`@D*jfuvzfc{ro~9ma8px2AcYTFo%?oNMqHdR(j9Bl&l0a~eYt=?H?s711SNWUQ^=lD4o z#$l*J+6`9k)fwRJ1>Q^gkWkJ8DtoW=_4E(+^pP~;N`FsZm-p($8H`)fPQa*vqTSLLd~VGOsrE2+PtC)`gmgl zzj)rTF8klnpinZ5Xljh4O-F6Q3?fx5XFjomgagd!ja;0a4|3GY8SRGt&8Qvv8Tdh1 zqs~(3hFD&Y_aM!pJG_$F8|#~33jpf}ZWg$pn~J;!J=r6fcS+{BWZn}y{Q07)ebeK8 zVRqg-83=plU}Q^aYmL`H_yHr*Ia%u^OkbH*>4{aG;Ulv_!f5^I-r;SI#BQEfXJ?FC zooeQJs`lp_9s}v&kvTwUjh%(brO@M5&;jZp-t!yt!7{=JILy+*D!P=-wGiXMDmHfU zGg>JW>Q)piyQ6d6)W2SecOJb21so>S!Ky(GgcdUZzRJ%`UyMf1_ei;l6Q;*7eI!NG zw3ewryD{M=y}nMbZ_w*?deQbypuLLl2ECT(MVl<9IB`%aQwWNKC(j&!x%auz5&xkx zBc~2SA;sJ;nXgOc7baqVq%;kGjaD#~2H6;8cR$ra)^^1L#{qYlW)jd*0NP#c(s`?O1Mt#g-ocA(mBusL#jWD0x}y!@0r?R z--~)mRz7?sBGrC6;UwiiIT$+EXykxEdaP0jDQiEY04=MkT&4Ewq}Pk|x=gPbdeNRs z;ORwbBe35^7n5SO@MZENdRDkk0Zp>$^W1z8y5}@Dbqd*N1=&_pG4h!ne&)AOM{$T)kt#sz7+c57b0*a7 zr^zV!x>SxbGF^V1COB&{HWH=$2~2+{W!q-^0eP_}l^;ek2xlp;!VzZk4-!z;1M*QR zzDOa(qsrQfu;-P2cd?4d3mh}$mF)VDA{MO7Y;rhsbr_AozJzm>u=Ua`6v>M~WL1N#0~DZWGLE&nWNexZa4{2IOL=|x)`DEBMc_R93D 
z6eZ)Ugq0OUgCC>-tjJ|Oc;P%H5~M^*XN4JPypsV$hQUH9;l3^&GeLkMgIl3tVe7By zzDtb;o|;ZM^vr3t6G&Jn{2IL?lyPCEse(XUQmIreX4;~H8cr&du}RTb;3SU-;R}lDetVBVds^XZ^!nxIQsLiG-2as_ zC?t7&+V(*wOV3sG>*XQJhSU>^Cen*7EeUd^Q8YgknTBpXKQ(hrvXL8`uBb%EJ*eXW zBn6|2o$%ATpUC8Vj=wNJ4JsF?WqLZPpred>9@=2;~1*Kvo ze3k4MKt!r059Z6lTU26`8TEF-FhJ=n*|I@^$P4YQnzf|6hK=$_4`QBIlnOrAkD?t{ z;MWuIv5FHkz7DWCrza!mJIFp#Xfv694#k!(lf(7K@Xg_by*gE1u{iqrk%#WGq6?YO3ZG9=Ubo6MIfk@h8QmhHWWCJbLr!ZL6%^_0*EpkiTeM zx7B`q+w=z=?{%z|Y+1Cvp8v@1d}GheJqdd~>7)7X_>j+d%I-|s8$^3U!rqZ; z>G)pgozUXwx}))7MfY9q55n(-*D7`+a^Yip$r6_=Z5K=1?+vcodmlDzU5O^Q9Tm47 zU28Z-QR*Js-AkiMPqXN0UbnZTpz(JS`en)5Ua__p&hFlurn<(( zlaFkUq^)1H^{+JjpyLla?sI<;{$Y6Cb^=wes7aO&isgfeg2Bb-@k&8$XVu4aq~pVbs|+&{nKnqK_Lp_a3$^atM>Xt$C*UOj%3FfvE$5o$Hc?p>O{@S zwc=9=$EnAjT69y=Q71a;Rvq2TL*Lu+&JO0&AUYbT2Psv8s1+SDD05VIrm8zXaoS4q z7xSQ@SJ_A%@u9tffwqW_mgVa!J?oBdCzh-Z7NrH9M$OqF zmUk=`q}*k1?!L8q(Uz)hN!IQaYxh1Vct5&UJCU$erK%c}RYPLc(8`MsM%Sv2By8nR ztoFRhcXvE7;pdM#|114Gsnn3&n`&%dJeX?kTs-u|YAzZxLvOdB4EdE0EgpVU-~7Fb zcPj3M?jIF<4khXjCD1mt4R6ofnYp+7eudb1AW?f@@#K%44PxuI58chFn%1R*soJI` zq?J|C|B=O1Qu9&1sT2V%x_}-Q09CS*iVusL#J24jQB!q0ZXN!xq>07(u(s!J^bfAT zcYUpPcT%V4T=PpWPEN4eJ8;-&mg@=ayUPqNG< z!;i|U-n@S6`ci0l_^$8nap)kI?09OimK8rSS)q9eowj-m#A8SsKgS=H)GZxfHZQj= zS0;+P77r*mce`=DuK#}E1INBcw14jttNK=I9$5d{{v-QZ)p6>^@>I1qSv??D52PFw zw}+Bdonlqz-HAW=(tBS*@7J)dtU%KezE+9$s@t(ud%HL=_R!n6QYm_OB^)*25G0*m z(dkXJ4U5j<$EDsx{n@qBiG*u{O%p8I^;L|H{*i}uy(@WQ-HwE#S{H4*=-i&FZ(5Fg z@7g=p5`FvDT1UkCkp~CF`a?^F4@ zaMJuKD`h>&Lc@PzL7RW#FhK*5OV$pHwZs2j)wC_uvNP4TH`RAC)wqQm_-q)2`;&?SZz0g? 
ziPzLHkgVS()^A&{AI5a#E&Q;4_>Sjx(e07@r+@h3l6k5BmNij7Op`{$dv;!&ZdVi&jt4=uUKvJ>b@AjwNiBDSJ)A-k#{$_u!=1b2e4q zvN-yIqZX4G6{Gf{qgc)vSXji`rQnbm4U<63xjJA14)A0borg;xAh8_mqTY83rKcv|f}dU4S8o`O|bx&q6QytVy_y zq`)L%n?|-@%%&rQ$lk+(y#gzPZ1Ev$@k-Uo32MAwPx<=gqQ~0wRM4kEen@Oh+KSyU zVG(zfF7a=jT?f75E`M|9t)1W8&4z(z(ZTqWQZ_TO$zUgIZSk#*ZDDA)X`th6PwUx2 zI~+tBJzq8J&1x=zYTNik^Hm?Ur6c;3G&VG(`=+yK+VNPgwwLgRU_~yo)MV2T*>=dA zK`-{iwq)#t(viYov_^Sw2n?e^Ih*w&V+T-I3jdj2Y?Ea{#P`f>dl8|UDxF!SbrB() zYC`O1tsQGM?8|2XA5mBCMxr-NkD;dB^mfmko_me!&K^b?jjt7-OgK)G0(VogvP-P& zx;MG*9AIG+YsKdhj&qpr7VWaWtkMfeOo#2q;Y-@o#MpkC<2 zRv6-dSOK&#NfbQ~^cAq#(&G`1i&1H9mJE0BtcN62GASch&*~8kaQeN$1bmSVx%1_Y44tU;VGFnKY9M}nganSdz5_K$3vtlC91qv-O! zS(X;5kdoQI(HugmVT=)d8>x2heoVZvc2*m8KS@Q~4D|15{QVtjfbC_D$yu{#d1SNS zj=ufMomW;o$^KEXe{{8O^k)412GMqW(UL+S_K;m-dDmKb&q~p1`M}Nj`&puG2O<`g zEauQ~v_33jyS^jhwj=M?i`$N`m7Pe~Pb_Z7E9Gqy9W{$1H(y!GxoxGr=FXctZ)4^4 z63Ow!TVGr}NYrQX(hIBh+U21{=RUD<9};CUDVIA@)+)MMmn_&(x!jBS*ya_N-mY9N zYD`hsfan@P7z{Ef4C}pXX}9R`E*?lZix=~sog@e&*pm1KI7p^wG(8wi4wwu~pSC=b zp>v!`B{7McW1|Ny^5Brq$i}A(;fXygOJa%dt4HZg22c;y+5bkE=Fqd;gzZ2Ygb?e8 z*@be{|8&r$cGPA>j~NlFE*KTEkg<}Lzz{cLEqWB%9L|CS85iV+=@IcF8bx^+GY)v+fiH&1xjfWC8P^M*xszK2?n6M2#+ZQx$Szk230rCWI`7vsLmGsDx@5h!gUXZ67SY*~u(do}Mwy_oN z+|OEb?o8NrK3ia|Q7c)2jS-xoXcNiq^m%!SBkzR`=sY92L27Cn9;x*jRosEsug?U8od-TU)R%K8Z=Jv z(9)0VPa?3yEk*-4PA6$)ms?U)h|nT(tc~>xFcgG4)26|s<1IM9UyI)7lN>g%0oBm2rmG-&t9dIl;~)m z_G|_ajTLnC=#CVQWq0&kmfn*&LDJIQNxt^IxE-DF?v8kwjxn$kdy570-{*m=ga081UgF!U5Zf|(lxaEHH zdgDIYcsaaDN2loMOu0%{oA=Wy-1Uhm3rx@)Q$s7;<)$jSlNJ49ML#%MM`=nZxt&LP zD6XcYt5(!Q_y_7QN9AGQwP zpIL7O@1rT@Xi7SIMMp34mg=5I`~t)$2CU6gyER!eEY=J!j{f@ilLC{Y_1W3dii*-W zf4U#9jM>L3fFJcA0|5{nQazV`5QR&l}T^*jVoYe9Ib9~)XLFjC~Zob zPEp;!K-IlHZ5YTAwPes#P-xHkv{Vpo`Hs%WicQ2sv$5y8sogprD?b;$49Pr{NVK6( z<7$WZABuXsCzK5{v?EAMP%`&`)}}oa&W2aPso@2(h7oWXhe@!Li?+T{lwc!}g7iEL z!A6T+Lq>lo!bz$(@MAK0oG3NQ*IgYBa&sl{wbH6{TTvUJOcVP7Ghwj7{N%A?|gfaN!ChME33 zGSP>UaF&rVsftC|gth^KntH=SJ*=e(MN_1jWoRSOITMkUHZYBu%Kh?U$-kwR5|WZSujEb9CS!NL|atZITx 
z^qZVU6?}1WAZ-8zI^;t5zY+AT4RSdf?5ZeSC#R1!WuO=p!2oKUkc_lsuzj` zg(gREuHU|N=MqGF-S2g;+qb9O)&J205d!U{t|iL`#j-)-EjzcQ+*L_;m+0=g`{Mo4 zb@#}JRZYxmNOUs^&AR)@hgHpwJ%cNwYo6i7G0EvkwRFOKdC3e~f9cqAQL3zA)z$E% z#8lFNBRpJ{&(d`?_}OGc{3JT<0m_?RfB~T0Pv+>cL{se@AiYLLBrl%Ss>V^ZP ziatbBfwHxZ%N&56NI&+XrlFPA8)IArB3RO1&Cpfyo+1@6+M9vP1(iT->w`0sGf|Mj zfQ?`UCtYS)MG#jb;Iu>9pq-f^%zLmxwsSiJPys=X@Zu2?+rsI)R!Iw+P7uAEpa-M?tZTIwjf-LYzK0wej2 zV>gdMkOFbpBUjZYxmK)#4wI`2JZO2Qpt58f$)U{@pCDwj-7rQsjq?D;IYt-AR_fLb zk8Yd~u&yTdlV>wxc*BS%hLx75eg-YnDIf!k@PioH3_sL4V6(wqyWFwr z?pi5N_MZ{^&!jx{%R>;F3>1HqW2$aQxoel(SKXZ}wq)NCvG0f)+%E^SN#McdbA300p8G#UAeaOvbgfJXHHk`4d z%m^5@o6QK=?Zeo6ZCsIt8Px9ZjSl&bAr;WUj3w?`u*~=!V81~KSpNqte}wg zY*t{%LYiCx!;TF^`_HI=HP0Y{zVq);=N|yBI4kd^u1INKzi4IVpsEs}K>#y*%?_ng{fcr9O9E$dFWx)WvH%3!n8 zU@J7a%H$EI5y!|)o6fyx4gCfH+}C)~M_9Rh#owOk|7u|x(>O=m3Y%z+ zg91DP4bomPF1a&gk{D#909s7Pm1f6uTnC0*0De`&kc+K=qSzw1Zcm>k)XsX zS9HUU#t`Pu36cK_Sz<@ysk(h6(YotCx8^ypn3uA9pp*6(S_*lI&b<#jYvl(Qk36Zy zkz=IXQ(~%VdHckj6L&jSF054#C#!dh)w}OctyPaLo=7>Xm-APh9nhqC`^7si-raM5 zV6A3PvSzRsNpTGEl~+oOir zl(1ZcB}zIURyHMCM<4he98WZ#UaLHl@SJ(%EJv~A5u@Qfszt;Gs%U!p-|)tWfm42r zSkO!aU{xW~zpu8ZWqrC7Xx-oNV3jk7uZ}2i6t&R|NTbb?p?$G7PH0?r=(a_M(E6DG zQb&tWv`Jo?-8sz{OXgRd7{^5$DuhqW9oj5@$^C4&N*dBAzC?`d4G(9Hy6wOn32?^4Bvg5eX^ub`%u4HAifS-LR)c zJp^a8MvcqC#uh9PTZYYlK3Xncj~lo%(`+@-NnZ;6WQFvZh;zfT-hfnDA3NM>3>RnX ztWd2hO&-mg9P&28T7=j$ST%-FubwcD9XF%q`n*V*Z7?>FOF`U>ccDHn&dIrg&X8-t zZZ_c@4*R7BwFT3^#_6f;4@Lw25j8QHwU7h5)}nFAt~-8T;DI_!yqsPSaH_n@`qkD5 zp6gcp+U-&9JHeJr{i$qF@11asR-OWY>M;);Le)+AC{unY9Zn_+Zh> z4E$yVaM_3lM~dJ)?jfAydt!q|MJ9U#I^CB=qcqt%sUBC@7CYPS)vUl6cZF@z*=nX- ze5Y3anUv1!q0Xa-)AJ!%%6i-WA*;P7y#(u_l(cqug_G3!xWyJqAwVl;bK72wrC(V| zCC}+|Bd{L!jW7%16JryHPmZ3IPlF|nqcBJ>Vsyw#1L^gg*p5=Jz%vs*sf-vs`Vy!_ zN446vnW1t=r`eXm-%~!-c;Sp#*=9nYTDyboRHahQmNz3g;UX0Dd0;|3FtO%6N1}!rYDxlUM z6#il1nrE*{t~{)2fR6Q^clNC8e$cd51;t?~W#WZ6s$TfYv9TwKl6Y*)9w%>0T8xKI9 z@)V`%Ua~%{s83c5ixtE7o7O7!F6E@!dXjC&#I|E8PYr2{_K9_UNVmN>;c7;CJni7# 
zfm>8j^RDe)d9r;>Y#&?m9u&(DLjU(}o><-gxV9}-JFqge(gHs6@NqNVHOI}+a<8j| zmU~4d6=Hblc&cqTskJtL=&5+piIP0&F?DUfKd{=l`_8ThLyO0mek4-{cDALO`j)I8 zxaw0yCCQ=&v8Z7=xLP!z=<}AR%A1y6LH%0FACWR>yI9_yDC@Y}@Fdq#(t~_S(>WVz zpoL6HSLs@CE0c<@F&$=$v{K-9jV6c2#G$do;6X6R51H#z_1hBO?f0PD`AR6J3E62BI33XhiQZF$(E@Zg@WJd_|mD z0jhA2##T{Vr^7 z?wJ7Fyb;MW(v^=c~;zcc75kLVtYX2 znYg0$IE9l8W|FRU(bX>VKk`itvN|s0gS(zHCz`e<>@qhb10jeNfJw>pv=jy03?byx z2^P$sC0AhtGo^-yv&@@`ycSxF$rU5yF_N?BiD(dm*-$^!whv&&&>=2{n*oPuTHCS< zHPJ%3R5*&nh|teT6Hg#ibw_eux8NXHTWlp?o$*PtL_IA66Flvti4p+K6M==^h3hyP z+S}IGGqkm*zg<3uczZVj<%J*S(TdHtmL70XJ&MAcM&a}k`HkL*lcOiSXV~!qGK-#> znP+6E;9VpgO>*b~gqP5{Pr?3IC)fQV@Z)INMtLgFRxSvk}hK{R|y9)prxWH z2TZ7Phr&%qSK6j^T~`-)h(d(rtA%*N3-$8U`%nuhNWtvU9+ zZ%sIkV-?Hu!k)&}O6yq>4Y#DDM|AWg9Rs3cU}evPE$faWkBUm4ooAuw zoDyRP2m|HYNGR56G;eMK{6*^K9yAXQ2_cD6hgjONR@${_PuWXv7kzy%#L_fnZq-Sh zEhdLNGfH+gLS!aV2EQ1I%6;&E!jZv$G+ZU2MX#)#!b>zM2D3tz1?_#iOoX-z0-)yX{t-0e;6$iCB<%k>IkvELY zWg7@63RN+&vdn4Q2qTq6HVG(yCO|%~fSZ7T>R+Lv@@m`74C>k5NxSD;x#62S45vlN zibsBShlKD=!o1(aOO;tJkq7xo#pw)H4ru07A~h&v`=``7)}zPA&xW5Ql>7#21H~h# z1G|@o-rRF*Poi<(x_#uQR%jD_WHUAFzCZC`=UP211ZqEYlF4F8E2#{X$m{wJ(X}Jx z>Hf%)#cYdUP*zp<_MSU?l2v13)!0WM6%U#jg^V~*J;p?q^z0Emds5{CAH&`Z6BL86 zn;_bDnjHJhY?_gr>L^p@*9@EoM@q)|UjuGp-Z5W-$Hqq{4cM9M1{i!J_bAOhN+ej6 z5FejCz()fW47zs8oCc3~uAZ-g(i>2l?1~tt zTBh``T4HwJm@DnCx)i?sDA1}OH7Qp8)4xXiKtPtJAyijdWDM&WjZ??vrDtrEy;6lv z|1t@Zjag|#kTUI(H$0Ik_>*PPhDWU(IE$8e4y6z&L_g)TR9aP(FGu_$#u20&EDUO= z^q*D+h0=GAOGW%OQf9=Aq#TGQ#O8278K)TskC}P`_tL*Up7nNnmkCXR1YwS3GLiY^f)+=zd zN({>rOBX(aEQqSr_FlfC)HJ4&3hXvZoCiZ!b+Uu0&?j0ISaA=X9_c-CdL%Z!84zm= zMvp+2^IIYwY#Kh+zyIM9dWK5 zc(MF7+;RfdSk>5potz^}%{Yt*$s<=*qmkIbbBxxKN|oy^W%5je5Z<8GuM~GwG5pc| z<(N#OTtWv{%=y!MScW}J2TyX3&qj}^NW*jN;1(2_bto+zjJ>R_CDZ`8CM1*=GhYr{nb6MmPlf0fq_wL;58-v(1ZndfK-XvHk{f1yOO92^$H+eG`e z#Ln?m`#7B!Fnnt`;ck<)8bNLz1v3h?KD@9?XzpV6xAI8^Tkj(XClw@YwCH<#lXVBg zx&sMYb;=G|-%ioK^Zv`L_I*s7axgJ;A~84)zSBvw=aHu^;cmZM4CDHuZrGEP4yCLY z>~7DDA{vb>^aR~=>&#lD>P_& z^PzR;ZXrVX2$fvb&&)TqZA-SE6&iomc$ 
zj09p9nyR-XwjUL%j#6@UiH1F*d(VUVRrdjwXsURd9N%)#>>+}yDe2iKdiEvUBM*ux zr7_|l78HtrZ!~QgF(>Su&+g7N&4fN++UMCBfF4rzWsuZcVtWtKnWD7L$kzpfX+YNn z%d-eGN1(vN2eCR}(v-23jIT(J4}O1-kP9RnSM6|Ude~@(BNa2+JmWUvOjgvh;eSl% zU5>;Q+ni)shgjB;=pq@p6WDp+pS&T)rTi&&yZ#yKV3+Sw{H#ha14|uftV+M5pjzkx|CMny zPTcQbbM8*qc0XGUM35=>U1-72gOloh6qZLM7dD{ejU5bI0*Q%|H^>=V^DIW zLk0(QU!mxnlzSOR1Ntu`26GVc77*yl@bEOOaQ*D)UD}3)r|113JB0w$?Rm+)!NX5U zWrNceUrmp_Lu%f=cTM-g${v{GZ0&P2cs zfqza2_%@Qym<;Y7Uw01Ul!8=2QL?}*7I>2donis*p3Lvd|HzbInE#{{I^0Qji|B4i zx(7w~;Qb-Y$_dBb4KeOsT`kxO@e%|r>?SFhi=A{0iLRlPdxUHR_M1O8WfhH>wHwQl zo*~gQbbmDA+4qEL1A|GYC}yGsv`3qqUjs^S#@!aalxCCxMQHGBc%WFKP1;OV#sIm6 z3gfs&TiCcch8%GJsK%f(j*IPZI$v1f3c28?Lv19UrNpGdRrYtJ-8bcUuiWTLy z(oJG$oOCsBstI5Xw#l%h_)J?2E+?&^_v&VO8f>sXN9*N&j+8y=TBt5o8v`sb-n<*( z%OzSlH$1d~#)5Afms~GjhJ0ua*Slg%f&1D*Y~#qh!x`ReFz3W@>uX>@vZ>ILN4GXG zGM7Ty3xo$WBm60{l@Eh_mp=c3UW}2(5-ZN6t|@aIix|=EWT)uv)xV-x|CU~VM6b8- z!nDK4_5X!D{t~ZDI~J_|G)@%?L`-T+)L0ka5weV|AC(C67`WR+a{K6OYAr*mK{yFjy`To8yD?6lbATS*7#h) z<^2#=+ilxZRh>9*NvztF@a%bPFJ2mw=b^ifb^8{L_)a=HL`TQni>r>U%-ma^sfI=O zaKbUnN=JvBrQ9AeK>^oJNh2S&BX%5HHwHRio!)vycW=Vcn*j)7^?`NGK9;bb)3Ylg~`Kyh#iLz9_T|4}sqY>4cFy-gv5iy>gTE4QjDTx(3rcO`ED_d{oo1 zDKA`Sj6$3*Iv17lK?a6pE(#9Ez93wSIl>SlM{)XUcQ_D< zH6Ng3&BLuxaOS}DV@Kem*_J97PZsc4r93$)|K%$(hehS%$gEJ@ed}koWlDY}x+3B9 z*V*|wV%dyCV`~zFf|04|rAo3BhfNVdu+>H?z)cd9mldywa1xQE?C`9Ac206Bbs|AF zOIdQ9na5f8AznUD_Bwn%TvlnVPr;JMEFZ8 z>^iM>q)x5ykz`Z1e*xc2J#$*;p5;#cj21j%a@{a}>@d|HF~erStkO?R_b?cFJ7m3-_?vTj(!e-P?8krC%>B+L56vi_AWv21U`wU-!GkVGZj zTSWJkgkuZz&~ypqtlE1Xx+_zvZb^se>PWe&wQoA0=4r0gljb|=i&400-47Uh!mE*IHQ@^rqwsx_5}>J5p8NxA)%Jo2=R?R_%l|t76y377L_S z*`~^^pA?$FyQw?4R!mB5`epfJtF1JDG5=|?sibaklxe^v8@Gy$Tkl)Jrl>jqu8*@y z#A$c3EmtnU$h-t+b~K32hNN?==-isHZI$=W`gv2^HiXgZn{FG(`p0BsE27Fj^fp1z zcLD)#G=Ihz$nAMVM=a8ziEA}GfYM-Vy4*|s ziPE;aCzD;LMEoy3wdOpXu$}&t_8n*Ko7QDVd+PfEq4Z>Izd(ne9WkgjJ_>;i6KJAW zt9uju z2j4fZd5%I`(^-D|WU_jni2v*|`)9c~8?aBM@II@R$ks`v4)mB8&x&fRmWpTCM8Xz< zgpHtavTzqFbQ9=u81N%Ijj_*2Kcol9M`znePSWe9#!#SWj>eoXi!$UB#EK{wb2@nU 
z3Xrm}2viDr$8rZQ$qJ2+sDi&pM@y6$0YzAT8Qmq2o=GPALd9Ptql45X`&pNKY|X9) zr@b7fp_Vl+56QCyPR@M4ey#NQqWvKbe7^kd(7L@1ld7lg_REmcJL(@fD{-65?Lfj- z_bI4+1Bk1;=o}3SWHvxkqbWeP5soY%h5_Mf7=y9KBHS+!gWVYvIra<0U?h2D;1`II zOGI#li1N={51DK(PmAyo*Pzw7JgC&rwg=CGC4Bf7$U!!v`#d?=bvcxMW{Dj75@AX* zAfw@1D3@9Prilt62K*8dkb_(`3|q)%%0ukRI<=gd4LD4MrewoEc~s%x9vTD?=Rue- z0~-h7K8;gbXeKnvc!`ebU}%U~jqEEA^7I+mS@`l*K0N3TL)bt;!kPMyPLY{|$s@CI9heZ6EI7o`z;3;onoBOIp8?{^ z1X7h`jhA_r>!(3JM^jBFN_NBa(;QRY4ooBIT|Q$rgjXHApa=fOD>q+BI@?5N+uft9 zIMq1AXMEMUBh%;Ds&iY04<)n@5sw1z=zV--!e1`I?I37sQc|?bWwp;it#h&_+(Q(Wm4Px7{d-=iMlVd8<>W zZs)p5ii1UZ^HfFUtazNUAK1d0VLCx*PKIz#kK%RAWM)&AYQZ<;=A?VNYzQ(Z)ns z%`RN`r>CIw<%(M5Tk3rF_jIpfqV%-5{nSds?SZ#<-GNzHw^-G^Ue*o2pR+0RtM!d$ zEc$Gg)TJvkbp8zro6$lv!Lb%-c!c22IYQt2+0t7Bo+5%N%e;s2wBh}y{Oy&*G|{?XNXhpI)=r=Q8) zS^2y;^xWN-Z}%_fe`hb8a;<*FSquu1G2Lik^3#^fxR$?2$1#3{+-rgj_4}vou>XN- zMzf_-u`#i8bUA>EnZL6Y&YuhUgeG=|nlh*FA73M>aEh*k)A#`YY?Zr5l_NWQ6&veH zKiqb9rT(@#QPcfFdAGRjtXx6i6oCHm6=TW#v(;wBpfM`w>bQ31&u0n$~a|m`ZUl~|NVbvT*JJ|%6HU`3^?VD!VlXL;(DOsV#a+RFg z3#2@Vb>(tMd1Or$o}HbG2*ey`WU;)0yTz<1p}n~3m{^W}@0E96Ni=SO z`#zW4Ga>GoNbHy(ldv*uZeUupk@Wwuz2e6W{Y!(lOKyRqzkTIpyqKy><7z=8c=#1< z;Ey`C-#xps<-IS;VQ;+j^_O77{Py^r@nrROv3mRBiHGh|vGnx)f(QKj2e9m}m7ZR6 zpH4VVL$thMN7kh6$Z+axYXUb(xGU)rq#m)VXT`DZ-px$VT18K5(la1>239VvdiMOd ztmR={18$CKtV|6aeSb^h^aK_=q|{jX)Rg6gxpWrHrEMl}Q?hPAtQ%O_y9RC4@&gac zD%UEXPYfMTw{Q*Z@X*#gTnU;QOoF>_c*(T$(wL?(cw5%IMD53RV_+bSDM;( z`2CXiBbfbAz`9Ztu(}ilgkS(2;HADP*`mP?E43M>+US9?m#wZ_t38+4e&qe;#3|f% zG=ZxFZ`A z$vhXq2;xUHA`c=RSV-aevYOiy$(lZ~rY|utio1qNjxFY=>?H|%&EpF1vL{(TB-RhD z1aaRQ4xK#;WObAx48o?xqmP_8N=acUV4l(*S{LZ=>0!J!ljz*gTTE)FuG8%`M$|l$ z)!fwD?vSF!o4$%uQP!Oo*lim$Khspr)(PD~;b#If5q`r_;m>Jz$?otT(M40ubkljH zEq-Iw8LJSj5jrq**sr1HY&L&JrIy{Tn+8^-uCe-!yNYK(R;k-)xn}byT}M_g;#NR= zJG;|n=UOptSlh)8RcIES%?Vrcvx6lEv02A-p}RKg7!E2c!okndmV+3>!8jn4=7tAd z2;ANHz}JBP0AB-ME{>z84FR47bGR#`EV6gTd(-@kQr1*vXb|7XngIt3yk0}zu_{^w z@>v7K($erxzW17)%0^@b4y=a_Q*;*QOps>d@_B}QVTk^fQ#ZX#T#rK$7=CHVsMnan 
z{#(&C-(XFYuYnT2Jqu1Zjw5C?083(agD%dsAtQO5yaQ5*9b>~Rb}qx%sS8&{Ysp>Z zaUMTT^3e>j$TS1zv5>u$_xX!6lNSm7uU>?m6uU!RuBtqPdH?^~z67|9>^u`kHx2>> zKpeae@V+66mn2H0MDfr?Q8N-{&9O5ShomG*6e%~Ti$i8OIc(CAGgO@HfHmV9XyaOA zV(*%}*{Y?>sYFgCmDwYeZh~3Du;k$+lgcJln}S4D9H(|S`~9yQ4SR$v*Q;ZK3~B>dx=ZOXu^AGrjKtaYl)WR?Q|q>mw6vYRp znpk4w%McnWjMwfy4CJdD(A!M0y+IRGe3px+&D0I$*@QGZI}+1aqxaUp!0BoN1G|Y6 z%w+skKWW)q^p8(o^6eTqon^$zD!ZtbxG}M|BPAvC&rX40c~err+=mxQ0E~`(n7gLN z<*kD-i9g>An5KObbD%u)8KO=A zf)U6Y!H(X5g5u=NU4o>$4$*)Sbd~WaKQnz}_?moyq60+HhHpe|Wc!bVwU{Xae|lm; z6?$_L84im)Euu@hTXD)_Mhab(5#cZw%QVFs0-HW*qDUpdweQ^eq*+AL&{7LAbtF^Y zY;96SYv7BEUtI3L_a?kD4fR)SSz4h1g9j)=sG(fD*=Vn9OD)~{2ZTE;;0)eQ!6F+Q zpJE!r@eL(9-SL~3VLbk_w;#&8Y1U-{%9_`q^9wUIURAA3CR&UVyvMZQllPifWbsG) zN74nR_zhmD3&Y4b_7bwnH5)Mq2B(yO<4He~5a$y;tx?%K&?vd+gQ};xyhRN+Z@>io zxbRD8bYQ|v-Br+GZcN{hL6-g*0nlk7PH)sPa%*H_as>L(^1|&GeAI5PG@nEMlvvU5 zDZSn!GQog}@}!BpTa+hFr2j+9;}#MmGC{GrJ_dnUUmx-0EcI^@|F1>zw2-m!ejFvu z*^7pEGM6&RYIvKJ-3BwgPdTG3apC|tOqg{DxR65sq>hl)%56WW>j=KMc6II2A71_L z)sM2icU|hb05h*0%kA%W-0ujCe63S*cwuFm5NNzarzQj3cmhE;@QquKZML40S^=C! 
z)PObsrj-W=uwLU(R9-1sOFpqAn_<;Mh2|NlG-3sIl#8aapFFKbf($b3`-M424|Csd z^}yXV)Z`sHn&R`7$-bp5HQ1qQXY2azU`id86&#QE10IG{z=b`E9IHY!gA&wbA2u}Q z$Uvq`?NC=@A*m{hAUDO0$|#97)8HfK9L{#Mbl860 zmLdi5ih!n70V|v}AeI0V4QZuSNWk6?$*81Lf)<=eJ}$Q*b9xbH4slejELb9{5iRg~ zrw~Tk%j}2KK$6U*^KBnbDB4jLFSR>ze;V3(081cPyIeDiX&47kFR9~nFv)RcLe&YE)>Qj^wBp+?g zOdY!Ip0xesK6d^@ei~KoU6bh&EDK^1+!E zND7kp;Hq)W6D~abls9^^K_4lqhOzOt9=!GYT)41P%V>uO zaA9w(AS<2z2z?~h@f+azS%!0*FNsa;b4DqJq-n|Pr0g_EElJR8q(i8f#tqwRMN{qW zu}S@r6|oqe=Z(!Vrc5dqN-GmLcFY=47bb}$ngv_zQjJrkiEuVCji{81(2)3?X@n*_ zR#e|-6g3^+H!h^2lkH%Z#Ub313oMM>h6vJ)n=`}HQ~ z@&J*NVN6o&9$;c%#wK<(jsulVv5$}{;=D8T6{q%vPa9K%1?SwRPQkVU63| ze?kKN_!VjYh36<{{i3>pMXl$J8@sASdb`OW9jSt^I|47J{hIZXyK{EISRY2x3 z?HG&fVXE~H6jo)d4wKs9SlZ(5m7X*7)D`?otn+;?(LC8q1da%AyAsMkV>(Efq&vQA z@6rm7a_#)3qb3qfZcuPw_S&_7`k93wnDmFtpvC&dPAk03LZits4tvPN1yT&nqfuq1 zUz*d&b!y*ak%#(>ANURg&oA~Zzxo$e?}yi9rffep-%N{i6gde?=oM`I2_EDer#ew3 zlh;xfIO|DXMAed9p=^&KErx!UL#Di@R;EJN*M^vxC&2QFma=d7!SG?_!8W!&12z_G zGB6O`d60UHPeZlRbJmMrRL9Lmt#*z$fgYyTOy#DMA?w|<<^S(uM(tF9i78MANK_R^ zshwl(3>7&AQDVm%x6>AUk7VB?n)d9T=6^x$8!b|2Hkm_2@aYqGhDS$6uh2Zcvl=f_ zHByX!f9Csdrma+`^c;)~BrdPqN8owh+J)_O7B;pNm;KUF46Ulbsa{sW{uoiCjxT_y zG<hhYCE&8=*Tb+ArLl_FuU+ zLdxZ!@q^|A^RSo=5+VK2dF0-ho}NsaXLf%4FVM%&OKzQ%Bz}e3s14cDcGC4*#{#{? 
zLl+x*9u7gxCTu?iya-3ef4>c#IYL_voCy(E_yuhSPSELg#p+_xgfx>I57WQ;^{JZj zFS*I&oyvK{5{}Yjxw9q|lFCa#hr zQDpsKG?V(<4XErh3e-^g(!a9nJEXkzRVuX%v4ew+BS*F3jbf+ArcsMcT3Vmq(&-P_ z=@;d1Csag;XN#ik{3%w}4w4nGbYq!W9ch>W6;b}BMJ|8Np+9~Ef8M?pLekpAfjymK z2#{JEKPPV}homogd>B4I_oS55L0|(`B&h}@OJEpz6Q4*W6PJ;yTp4J9FnPf^i^L3r zNst-u00Xs=MUQIywZu{c6A<%XgZi@nCfIi%aI<%0tve!ek0cW++4(H-3Ij0&EY>LO zPm^HODDv`?F%3a}Gky;k9pW#tx0CpK7iGK*l5e~iN9=~ZOFBu#0k{2L#j=_FGT^h- z@;1Um2v8O=M-fStrJbPagR>|@OJ2Is(#+Jj6Q#a8eUn7A$i%$2vGK~K#<=kazqk5L z)`Y`jRHm{HZy8IrY`&VRNFYL`Hjhu8Is~s!OG+9#Uiu%dRZM(LG*X0YNJmeNA2)d`k?`2XN<_i zmyxM`iTZs)l_UbNZ1lsSEck##WEuLS8Dryb+`NqQ@TiKrpOHu|WlKsHr&6gAiHKjL zY3uip5B8+Fn(RtN5jy1lPX34a58a#HFG}4nLdBZYi%Gr&k_`nC%!A)ux737QS!?~n zj_-D?H}(O7U)IPbpu8p$AIrF(5r^N0?Me`EA=|WN@oW?|ZWeV(__K8Fe2FPo(|ZIG z4#KSZ-lrzprYTP{<%#)qp_3aXXaJ=bZsAZ7Muhmc@O=K1ngmWwcsppOoxgNbBhvv% z7gwcUCQ^yBL5UTUJ}o-j6SfSra~R!@oi*q|ltH5WT__P&Q-XgXHTPiW1~3uhD({Vwt5LKz0wet0sryr#*{xD`YdE_dIN|pE zO?#DOuM$mFyH{Pct#2Ul&u{DWlQkA9r6s9_-UjPfR>mFE)A5Y8behGB?XY0Pl|%^s z_Mh>KS|L&~d6O>WVcA`SFPg*F4j9>p zTMHoODH4$R04%$Imy%tQm0AV7iWTe{`o1D3u9!dNXeKF3?)4-tvpc6hq25oQh!F`3 zy-+0IB1{e<6$Z`g*4iZaj_!xeVMqT*b?c6yPfPYGm>CI)7}#XWYm)MsuvmC)z`6oe zKd*_|E2x#SYa>OCk^JgNdG%&_msH*btasTMOt_#mQdPHE)hku?vWFz<`^;|2%UH~O z=G2(&?^u>BqPrnydFDMNfK?rh1%@%(i1`up3OyxM7SJeQ{uA3X z;Z=H+iHREEyEDqOe}!klQ*zj+Cdn1yb#h*$@5sLV1$->F|A}U>@6$VkPDyeC0xZ#w zP~2LIONR=1FR_PwW8{pJbD5kg$xkTXrr7s(OeM6CnU zmtlwW4L@ZscWOFc3pdHRMb2$Th+r>#nH-LsMRJzN z`3gC|PEJ2L_sF5SNcbu_zd??RoOj5fldrH$&V6!zlbruZ&hL}+$K-ga4Q=p6Eo}F# z*lQy=H-TnCp`LvE$?1d>CEKzD|8Q3}HvZ=nrGrBMntcC@924ErTqm2G{|EAIlk+kK z{3H2jg%JLXoU8OKgB%MvHgX8qTG&gW0rEAI?{~?^CWBS-{T?|F$@wqj5LAlrm*o64 zIpBE$TWIDcWE^D82vU29+6G8v1ANKRYt-e0Lh6X6m3MWgvO%@c!$uinZkTj*PC33Fu-VCP9nzOZbee{mqpm12j=4`{cF$`_6;z82=n zDMF7HF-po8dKc@KXQYe*_8!NZAYf2cw{T+dmja`}$q93{^m5r&Mcu;S^64;F7t?Y4 zAeoyesSr8O@(fE(%V%#@H6Z(r0RMqy#UfeC!(2m5XXLA64oz+$aw%Wtmg>S>0VP-R zNm+I1{6<;J!U@q{66RVW+@2p)H>_4}RPSFH6x|hJZhwS(38>P@v?ow;|74h}RLW5$ 
za)p7>Fb8~U9bb<+>}e6Xyg+MEzj7crBNfz#xn_!52rF6(#}`MJdzY?;xe|KRr`_^a zES!dthWma;n5&=^yhx#{PUNz|l^t8jVD*)s*7*>%D_9N$(=gXab;(0r8uyA^QNSM@ zTA2-VO%bj)a=2$3qKB1Mt&VMY52E~N;)B$lr?p%4d)Vg)_O76j_fUcNp)AF$Yxr5$ z;Q5ofSTbXHA8A|s@y88Sva|@U!J&cXANOQ z(>OE@P06`MxT?s}e%1*CJ;93LTTJ2)C@S#__DuC=UQ`47cX{b8}{uvB{FA#@dvNqNV^+?fb<9QX3&z!UtoeP;B zbV-m~;_(n4$CSJbho1w`s4vZ#>A{OtJ@|stNV52f- zVKIQ+wj-fT_S~lNRt9HQ=3;hs>(G=nhA<1X#Io4EQ&Wfg5A$O#cI(y@HJ})U%`p$V z&(@T+ujw&qtc^*fM`JncVJ@_^5Ih=nYn^9E=sP+d@rI83x-xtccwfQ_GZ) z_sJcz#^70gJyf&7w?udc4J^;93pNON&zkEgUAFZa0}w$phSnH|TbMR8cmmZMeA!bS z&$}rA3j=&HTORKU`L{K=#M(K&HPpMU!3E?AgY5}Ael^79iV6BE~HWT zLu&9LC0NIo1x7)X=gX8J_+q9yzGJOCrorVS`68bE!ZzJLeMzh3hqT)UBi|TvX7l-5 zIR$~9`*&kJ{75pxy=^34lant{gWyL{R#D7IKJ*0lwwZi2MOd5S@57HsQ1LR750O0E zX7Y8puwKL?!LLLjpGo5^iJ8flLD4PbtF!SrYIOLiOj%-HqyP<$>)hpN47P% zZr8l7<$JZ;NAzaCYCF3S&LIPDi#a@e(H21Icw8v~+qRK>zWuyyD-Y(Ma8(|YZ$_&_ zBEcSvypvFcgFj}{@|{8bHd?$hX3+9oK^OVE@DhET9NrUjKh@x(v^jjS=3+i0fG>+n z?1=Utzb}OMaf!`pE&N&SD$xAs`mkl2U1Rw=3|6wTK-Z7TDPs&?la2RMX1K(>#e9CG zq%o$)b<1p9^e^`;-dQTy=5dcKbbLW*ie4{xh!=3d!c(>PiLp=1J6SEsgA#knW9%7y zlgw>1dpfAI^ZA%rdmmig*1#Jp%iycT>V4Z9Tw}RKd}XM0Rlj;})%0Q4wg$Jct`@#L z2gr*(%z19QOK_wx!-=ggFd`7T>24p-t_|Uf~NXs|SH^W7Jjixy{ zs5#i0K06Z%i6C5*x8RGp)$+5q;z9B9(=djWT_sYJnn_n^hBE0=gW}yu%OBMS+!S^+ zQKzG{IZ^5qz>B-qNg~r*3dH*xL~37jA*#od|)u3w&|WR33rZY)+wb*pPAyGWbv@9HgZ7!2zO3E{T>-1I&x5bWI*p zH7kU-aK=IebI+BL8Fu`RKd}s)J`V0h$}>BX83GDgA4xHzm66L?%7yw3GTw`fgRWQLm6WrP z^4am*BZ6F;q=I+IC9@=j3coHFhnJI}>O@_`=g#-{pLl6_=*)0$Uw`-cfuVDwcvbz; z=qbnlHp!IAkSIhXk$-)l>Ar~=|62F1?n^wc(E{gYpXMH)z`D^2zkzIaoN}Z26A*1l zK{^Kf@ud;6%_b}(J_o9&pQsKDGvg5eUAeM#uI!(fH2W`SxV87RPVTD)Hy~wX^5Rbr zS0+C&UK+b40+=eTg+w*#WUzpnWcohQ{pbfMX;2#&=9r$j91QVk-!_0TD`PTh?`6&D zaIc$_5f`=h2oI5(Xvc3(|1IVaOy&GDCXm!K@B#q>jJD{+xqs&3Ro$aSP@ zQES*yi>SeW^AaR!pcf%G(6GQrZ%kx5>W)8zP6iR*WS9kvFAa^&xF}g0bzX8z)U0C& z1#8Ty$*A}r=E^5Vz1gs6cmgxzhKQ*kVxrBFEl8O+(mBCAY;b`1)<%xuV+9wwDi?054YdA%*6fH!i^j=*-@Bnx&W*JXwm1(d(<_ 
zqlU}l*RR8Vf_!a>TG)NdFtM1T2JkwsPd7(7f8o5s+R3vd=&g{2f%@)*!WI?2j#qs2-3k#ExOB#>4NtRVY+k+j z$NZmWem`@4-)T&sfKF!Zol-mM2ds8P0o_%*#-FNo`H2>p)8ru?Fh)=s`A3G{-vdS* z8Rq=Ds)z_P#Ck|e&f1Cbe4W-nFA+Muv*o+F&dT@o!3g$+z+Lh*L%WKE zI2P$>E)*tqB1|nD1>X6KPfokuEl&u!5|dHgy(D8yq(!CvoRsv>Ny)UcWK6|G6)$7; znMj6Kozs0=HBv)J{lY7ATsrmpJKA|3+O&K+C6;*Yp=oP)$2h9Lg0_CG0^JY@{^_`K zn8HE!hBZ^;j5A(d3}tiTZq7*E_&4?UJe=lU6?AM(^9IB>!KSv#wVN}=r*AN9^$U&} zqGGdOIxZd6T+yG^&l~3r(4gf9+0c$=k_k{~Mq+-xG;x`6$^_~xFzLcLVk6YmzlW5C z-yz@MldlR6(5WWC%N+(4egrCYBT<8_(l1-}WxB3N6VUk4g5h)B{e44ygXhkiJx8nf z(3!J6$A^1|UVfoZXr%PmNIC`|boCxkF1hrSx^2`dmrItwh#H~FNf5G8lal#N)JQ&h zWM)Dd5K0xIR;I$MNNuDu!e|+WrfYou`uE83Fpa@%j0UVb4h^8Q+k^W;yAjDeV}#Rr=L=SY)ZiZ)=g@zYv=V43Dc)rnJ`?qQQc)4EMQ&FbBQ?{ zCd$=8Dd^_7`?`Bh3<4el6k2*of@EoQ(&k`pP00G}IP-=n9;PYqNmhcT4gh$Dkdf9nNqjPxH)68XryKri3}9(4a4=k~n>SD8&Y7+fFKow8jhjL7 zpzoIflt}LD+`M^)xNhv14#N|MsLd#?;ai*>2j9S;s?T#6{DwhxDiR71KAZ2YVLG(X z(1o@SB(X+rC_5c*R9icH{N?GJ0>sa50m&Kca<)Q}*$zcI3Xgoy`kiE=YGm7(PGX35 ziB5V*5o}py+XV9){}cH!;VR)Z>P573NLof|29PCii&L)xHx=BNL-B=OHslq_UxX!7TjMcD5A)1It>1q+3fJbhYoXufp z>%yr>ZkL!_vzgl^<#vU0kAR-OWy}?g-bilw@>@$8TiMl{+3iwxdpLVPB$Vi(SasxK z!NbgtUKBlN7BfEOGQK?c_F%+xHezx{Ozwy&2l8aR0kk%20eBied3sEX^80yI{5LBa&grfaGk3>N9b?!d zLO>U%e!-r!MF6t!oc`OYocX+Q>J&^GsPhDbW^D;E8~|bH-qU?uh0=h@c{S!7GkAda z>r=|9itVHm(^0oMz?R0FJ9E;ckZhqV?9e~M`Q?=&?@y$ z_VZKMrlxOC#YI79UFUCr1T{mlK$xf4Yz1@Uy@;~=aaPaHm=G7D4TC(#tRMs7J#s3r z?6FZE%^N;9bXMNo?A+}94aH=eor~n7&A&jCo;-Q|U-JDW9B3COc0m{EDOE%kA7Qf^ z&2NCPATbnWlS4aSWfoGzig1jF84@_ZP16qa9hI$3N#IT>d#z@}1+H-90(?NPQjczCsKwN`}n{{-7zwLvAD+f2s)sgIiCCjI_ z+`!>bTc}#hYYy95L{rPBT=w#8@ZbiwXUphZo?JJUK`ZP1N}Ac$E!Ce|o$s7lIweBj zZ*J|?4eJRyx?^XO9kXj}`R||q;I)<4)-&tWXnR%**WTQ)o&qtKMSF(*j*GM~M8YLp zyS8CHxdo*!E?Yj0(wr0jsdJEk)=Hg&b{h&O*#Sp_?&<}6%GS9G#p6GPU&_ki33SG z%yE>I)gH5^EavF=?9`AW_DzYImDMd}2BRYhlRtSG^%uTJgX1iGSkKu(WXD+heG1<( zpx9UP@q5#1S5zz#4{)mRj8p3b_FlZWDR@>JOedG?; z4=$YC!q~X5ZY-3^!06H^zoFkSS42D|o1RL^QyKPDGl;;AqISvB4)nReTcO#|lvvam 
zcEW^9*CW_j$d~M`qN$Yz#K~{)-?TiiY?vElk?M%qC7KH!Ssd?lE_H5rn>M}O68kt_Gz&ikFB)ys+@52KDyg9tXfZNF&RFLN43sj}CoQj_HKlane3 zIhx!VX+dPm?nAiAT^?fE7zFg2H!7%8rZN4zdCr9MYc-x`By~K1swY&j<}Gs;U`V;< zEOSOy5ZzqHly=UHO)CvB_QRrNiFe0&gZR zxbo*TV}^NKV#ub|(!ptc&X%-H+F({dooePZV`$5G=)8T-{&wG(8Dv^@_$o!m1;cFL zoP8{VC<612Imeg zoIx;;Wz89Ikb>!eu`DHAw=jTmmc+|4=42`4%w)yOno>VUVlj`GZC*EJNyJ8YynpFt z)T2+jiDpjSPJyte=9YMh@1jF*=tW1LNhw28FPeAGIp207p7UBh;$@=!UQcR2SG+90 ztQ~U`sc;M+{cy8!?9zSB>C`;-Y={8XLbFMje{M$pjGi{A# z)D6#^3wamJxk0{E3HI1lrhTinaJG(sKp{Kq#W7{#5`h&e1WrISL1KC0I-r^%wS0Nz ziZ7}MKq9naa!8VSw*a<3Y5@$Ef`ft^isc{wBFQ{QGZYV!0cb0uH7erJ^u$emn7bh{ z=0pN6C4rlOQ~~BYD}xNOm6VuK@ec}rLQ@!X6Ud?FULVCG`CEB!*|@ zm@jH&#+Cix86dv!E(#wtvqwZ>2K`;vToI z$EI&nvV?}E>LRg&H7go;s04kLoS({?5D3?mIH?mTIn>yuy|?LLPoS8$#>@z49Yg*;Y#u+R> zRV`+V&x2>($gn-3pEpKdQ(<4Dmw9Vu8*8Z%FB8ldxXUwF&2h6nlv0f!5I`y1mj zQ=EM`uz-Z|>-1S*h-!LD0YHm0B*0wqGsv?UloGmtG9ZzRVMX1e=IrzfeYF5biw{x2 zDUSlig2I%lvCk)0gIWSit1&w6OpTLT^{}6{!+3&Z;~z8@zr6;COuM z7O-Ol?`Jl4=U$%|=d^zGYqocUY08Ny2FH{j6Uhg-e}nHP3-QyoG3t^zG$}nu_?SY9 zJ~KmEYw&LnvDPDN0`pFyT zNlJ&DDE3_bzl8@IX9yUU1uNf!lx^NqU6#kNa5`3{apf$W`k}pWQUA!97pR5oZAI9* zXVDn3I3-Il=+?8ZW&RC zz+VzPzItM_^|*w;isN|bDGe4#?#4y)mc=ash~KjR2lh&oEC;ggI!}2L_`5?>T(?=& zC>1q^uB_?9MTZs#w_wtzIC(j88Gn6 zg9k+45pYZjjzM!NVkr%j3wVuvZ)DJ@xQFOt19}LTpT*qZTh1YVvsSN6{z8G6q!y!uv&5b#Xf!XO%fzAvPZgpV{Rq&Pk|}iM6;5^WJ`=Wlp;H_)0qRLcHnpVCZ{i7#yi-y zYZB~(XfX>*GWp>jW@-53Qpn0s#q>@IIbL!~$tfeJoSX`BD#@uLrKta=cNs~ zmXQfLAEB0_*HQGM=S7z*RZnqpSRA>8@<&&aX`oQKndMNKs3*V5c%!E&qdgR@nk6F7 z@Z)#JN5NeXd@?MP4=Qr#4YN!KPH3ap2026^$qL~PX7y-(y7yUf{-{B|3Wq78mqk?J zv3$hUA@LudFn$>p8(`+WiE?nTXv$}Y6wDY?go?V89}^=M17cEWred_vYb7lF)?Ob2 z?;WQPJTU`YF<}~(LeVXSR*Gf3JURWw$fRGmj*cm>+QhPwwX;YbMONKox_Tr6Bitf~ z=r=Mw_BP$#A!nYPa}+?rf5I2(_B`FbM9yE4!vw(QD1c<0m>?39fsu)`eH4X}o8G6; zi*(C4SB!qkh`~%zka#?d;>zer@~(odl0Tw1K47oGiJA_fu`v+3grDFIlu6@1k4XVj zj#i6#Mr;3}#_~gr{jsLvV@(xvyZ%Dc^;a6}$C?+znin5y>K zbA6Aw`o~<$WA4%O``9z;#(=BAgvH@8s<{p$Bhgg{C z8I&^5EM!FN*%9vK6O>1{K)iFi3wl-h<^4-jfjeUD0kNh-JTxHI3`ixXCEID_Vs(k` 
zBa*dy!5l$?dEHXh(FKxl2fc1sbTmfoy(rV-%TJ74n{FXB+9q|t6M2%y)0@UkP81X4@6?0l8XIng?&h!Iq#{Ih0p$%uphW6l( zbwZxDD5fD7vcfvjQqY-V9D91T#i}g|x?&n!)?8~d4|~3Q_q!z@wMqxhY}2!+6^;{H z??p|AZEq({Kf$hHSiN#Qaa^^w)+kQqHu?N4%@5z)CU4ALsfE;2R!ooUmbEa(G*fU!(=Ym0}n#K9pkn4e`)2%C2PA9d89i2wiq literal 0 HcmV?d00001 diff --git a/config/__pycache__/multimodal.cpython-312.pyc b/config/__pycache__/multimodal.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffe5516f7e578bb4984345caf185ca9807830115 GIT binary patch literal 8619 zcmcIpU2GfIm7W>S@Lw8IBxU_r*67EUXxWr4+p(QEacub~u`SiLy>6Cm!5wifrLl&j zJTtT{K`zw71(fZgP=du;Mf0Gd#jc9@!H;?L!(vfjU&v5^)Iqx|W3wm%>;oNT5n!M8 zoI68`vMFy`UI6ut$$=0<}veE zjKV0a#w790X>68DaxB(enk&mE`K*u>vhJii>q&ZOJFj`OzN9bfPx`ZgWFQ+%25GyX zHD#NV&9v^;#B3-TVj05g9^mEC!r7K&3(Mp-p4KRx-jd z*iU?l|4HCeSR+19+|?Q#8HABdB=V`t?yZfDev7fqFjj0ZHuf#XhG1;C!Pr*fR$AI& zG08npi_lsdiIC`=hha#JmaSMO+lhyCDA6-OOG1tuV!*ZaWjWOL)H%z>_k4*P9gnk? zXH1@%QFD`V-fFt_*ViwmM&G+QdiDMB%a-e0Zr1W#Q%$1Fn&rDSJ44QCs%!wIe>9)V zsFN4ev}p;KRH7-Cn88=-o~)^gZ02=K%;puMZ8yMNKIoCtnrs;G;Z9Q?zh$lHInyLL zQ_bhj%js#7Q!eHT*)%S&`tVL=>j+dHGlWU93X|l()Yzm8#LO$4;(EeYx11D+n|Krf zg)-}od#uoT*&r7R+3f6hW^f*3?i992dHYMsqKTm&=>509als^;>2O`k4QDd)Um*49}=DhfP_ZB!U)h`_(m3dK*E=luES;npQ% z`R-b{XW`=Fu{FL&-&-ZL+#rkzRe{|Jjyf7B%L^7>rE#Om z8qoOaXgpM{90)voTUAJYhh9%R3T~{@dq}^FV#ldc!5q0s)AEA_rZPHc)3Aj5s$x!A z?kS>9PMLAP9z$87EcHEDv|&Nj*^f2!RlNg?{a8S>Wb{rfsEE6;)>AEz%(wsrYGw3! 
zSMSR4)pKiIiG?dmt|jFof0QIsXwd&A&VbR-yjQe&MK9%;i z@Egf}5Sd}o2mi(}6t>*|Oc5i8#v*7&k{M{Uw%m5NLL`oJmghe88N=c+m_mcgn95o%O)pr%dbiyn zO>`xEw$LFPNs|aqxa#ShbQ1wiKyQXa)x#O@!sy^*6h7%E0r;kefRDzM9pi#@9FA?W z$JRK|7z7+V$!1C;+C7Sg#xbAbwP&DF%ElB9;z))_7-kGQD`)~-5`pjLI(m^}kScwY z#GpUq(1(+)WDk6|INzRR8)>i28gW|rWCz_BJ@M&2JLz|3(B&Y3~~M?CigbwmXu}@XEeNd=*nv-^p%}N|zFOg|COS zBPV+mN>_DXz@@tZ7YVrZC=#sAL2F9SBc?V-FO2tOIHmUypX^im;Gpj!G2qk>-~G<_ zetRZp-4Cq?lmlb{G+ouCy%XmPXa<00P#Jv0B;&K5c-#t&;UPMPho}v7=7!*H>UqtO z%qb#GkSY0|3Ya0Ek+#oUi2@!Y0Vk)Zkc& z$;lId5VF}6$xaZ(0K`NTRZgk-)bxEg?2U0N0-ZpgCWevMoz^cA4A}q#9L$+W$MfBa z?I2If=_!($oX|1QS$i7vnE>g&T)^IL=mpnI*<-TugH(Rv4oRCSL!E;qPUwe_ss*S> zE)BR&Pnk0aaR}vGZIewimzuCKoD~CDLv)SE_vjuC6M>Xrg^g(yhSGi6kZKZG)5ob) z&cZd3K&dzx%rc(77DCYMf&l(47Z5-k}S;hTmO8u77m6ieN&eWo z5pG$U{CMxTZvUclsML96RffW5P-$mky_F6o6k7~Zl(oaUaPQQ*egc)pa3z6TFOy{- z!mW3n{c+F~pqV~s1vSgzP&4&?CR!zg0^`Z z$?DDUId`GXW$WD&;+A1TAd_Ut_Dl)FnVgdvUBwb{!IJQfkpK?%MJ?fYxslMxqzW8n zbrGi1$Dn{i-qbCjFayyJJb4`3(Nrjffs(2jO3$|LcUkA$$ZPCnmocj_h4DTV3(S89 z8Q&FlGZ0#Q`;#jxg<|6T+TPKPNZaByw4eRIiL8nHD-7rJZv<~_x!_%dZ#NU_U%B{C zSATW&*JCAdr05-?N(YC@KtxN|#}3>L)x8N7h%C$(;lC4og0aIfXiUnpfAg%`nN`ZmZ6Q2i;Yy?B=!6T*Mkw1vt>*Cpxc=jt7+w?B` zwVR1_RX8^M#+J(!X|6D?rsj&52}KtJw)5625S^1M@OMzzO@TjdZLHWv3d^wyJIOv| zw_W>A&9Pw$=P_$;JkM4^&YfDIuTpKhw!%GeRl~xaTF8Hu?jbkNVF-|a2b=U`M8@ZP z=WdKcM(0iGd!TM|kASC3IdWg3DH~L>tSCg0%sikQ__DQhs5YGBjDk*}Pc|hL{tQXY zrL_VMQF9a!*)YeDGzbNfnyHNhYB`eu9AV?8G$5aXo#K!tb$Mng;jfWi*YkKf!OEs& zW6GWkPW!tB0@ey0kRO8O!csMjPH8XFsEwzqvnQkd5Q7S7Kmw5D$v6XQlh+2(MMxPy zWUv!h8cbviQ#7U^Sg|ESKu$SXlcrRVQVzl;pbw)?IuB45Kn~#((lJ=Ekx65ga!vu6 zKpd$p0ug0{84@*u@kXBr!|~#^1*tkE1Tji=$E?shv?dXzPijuzRlS6=$vQ{%2rld;kR;(R;zi z*2STcI8+o*R$NR|cs)2+3J$IZ4}Ts!ycv#FJp4ZYqUUQr6YYHb)<jKS+y9DA8G=QT-IRbhvLf{s_I`n+v@ zW>>CBdRwM9=q5lhPDYsp|4Vn&rMsL|FL1ih-dwwc5^suNv>T|ph4i>kxG zW4;Q8!zfysQBFCFn(J*S{v9-500#rpFd+LN^{f7p*uO3gmc+r;wsIXR38}dZ(ZSh? 
zb3xqUB52gqMc#qRZjyh%8UR*RxbVz#br+u6$%I#TD*umPk^eNAy`Kleh5KqD3qhL3 z%o)h=S7T=u&-24ncIXZERkp@PPN!is#!Pzn5Q(Oo>$o8+P=b?&uWfaQ4gg$198}=P zh^kNIu*GfPqIdd3dzL^N-MN46xL3~|d_^m}(*76lH!$G{+6gf&t?S`1_`%)tjx)GmjBE7F((&gVJ9mBpY8F3{&ucm+RdvK^I#smW z&P?cUB88CA^0G;>sTCL-8^x3d{!nEDRhkV(!?(n-v5R)%;@0&s%wOQyX*Euw=1@3e|Ni@G|gn&L2TDMb4Pn zFs^_F{FHg&W`sAMA35<%SvzvI$VZoyvj(iq*7G4_~E%JRUH_QAe&Jlt+hK?8c)@8BGAFFTy;jKy=bLL$Z1{b@I zKD|)tI#uS+BIzv_X!{0nqT^+L0J$bMac8k5v&{EZI6-K85o87rKfU!?`dNBy;0m%8 zR`_28%lwrJ=Muax{ERzRj18<#tqCVLU~(=b#5SUB%MsY%h1JMvdbRCW z^jO6O-J8Cq#q`pdVsm%NCw(PA%a)6Ahp{+S;oCG*CRodLbvDLl`1Fz!9`DeFEBkynFMQQ(#(J^6Vd1q!789;Vg_hK8~PjvJJYV=mL+MB5Tgi8!Phy%kkGhiQ> ziGry@VhArN`b{ioQh>Th4}Navk!JDp0T$anPhAAhJbOPhmE=7GQ44-ZeMet_4!pS; zF(@l8mSw;8GOYMVrtjaG(`DxLpP2W`%zJ-i4*n<8|J)V&H~!*g=e{D}22$e!-*7Ir z7jA-VFDQo#fH|?fU~OFBqk}7_)>;m&v4_9r1a_#{Hn_#W8>tdtlk5<9$xbTXp@O$V z1vT)=om4J|3f>MC)ZiqKbQC*=N-c*L#Es_Il2&T&TL^6O(c+%L5TC2J8QZgQU_;wgLw=yy#A_@V?+_0~Oo<0?h(?SpWb4 literal 0 HcmV?d00001 diff --git a/config/__pycache__/observability.cpython-312.pyc b/config/__pycache__/observability.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fdbe34d69f82c6714ee63558d13e792a1d32cdc1 GIT binary patch literal 5399 zcmd5=O>7&-6`o!0QWQyvlt@{MV_O^lRG}@AjwDO2^V2xC6B~8qHjc6fG&?)<-prf#^Sw9xM^8_fz++0kHclr9`4@gPpJ1c2{s-t>CpysulVtHNnnF>` ziURfnW}ql#rJ|gbi@|J=k4a`%F_aDQzHD|E!`W~#l8x~1pcyUpWP5nO%j_-2vN7Hd znek#@wof2b*1Mm99kczkkM@Y<4Wfsy5jzri<)WZnrkyJRrsg>C(X~jKW7rmSgE`yE8w;?B%$dC4MVnoq%nN9ynQKVOUtNul^$!Kq zCM8)xCt2|d4Anqo#OP6RUFY5ADsdd_Jpbc-J4f?b&8lh3z zgPLot)Ag-&LL1iUZLQwDVKfFi%CKYjs+5h3g!buyJ0LGUv|o=j_eCmym%OA$+x#sl z6P!2roxD9Ac$1vBYZLFLlfmwrzCniG4(klSI@|Qvr}5@Wdf!#jl5lWIO!a$-GxH8* zi`u+l8t&3DZp_M*|1LAl1uH9gn<@52GG$CDQkNMyN6{=@ahaAgtcCQ9H|!Mbi)z8p zb!w>%5*66hhQoM4M%^;Aat)Poy#}Dg|Me_1*U4pao)2$s-WQex7peGjUTXH*@6LS9 zqtB9*SQ$N5pt%c*kyqRTReUj9CjrsRrmoCW1rK4RUrz-XAOc}gXuLs)C%UK>4Xb?` z^`hoL97zr}^A4U!`R}V4w^+)QjMA8^u?6amT`Eui@bHCcnlX*}Oliq2*j8rIG>aKO 
zZZeznnl3GQi4H2PjC8P4x&z@b2;uC9=6zBRl4yJ-lBq>9)yVW}V(3w9+e$25i=}V5 zE1B6^W_CF?R}Iar12yrIwriGDG$eIYYUw50uv~92XPYL?xvEZG%`gE}e2PuxLsa7C zdj~pOnNJtW!X<(20;Cf=8)}hU5}aO`4{S9LCX^~uZxtvQk14LLG^A5F@SvAo)EIc9 zQKZ~u(Z-CvSEi1eo?*Qp9g9PATVOk}KM0L)HHMhsZEqgd2fxO0DlS(^chdjX^y9en z%g}H$#FHzr1GU(JyOY(}^b-<@hgYMCkKC2PiQ3>qH9GNx2t9{ZW62xR%FtwO2&g9O za-ciRo`n_uFI7!1(OlW3Td6t*a&uKP^vCZ~bYrpat>}&PM_~P@E4r=K=;ZfTv<&NV zpZ)+k*U198BDgKcI^W2{Fx7focXdDGoPRD-aydkHtDX2 zmqqaVfc80TA66RATSwoVIW_af5e0(5qJXW6rdaf%!t=Od04f6jLnPRMbMWOu&1j3A zT27gP^$MD+81UyPhLtnR$YNL=8uMBX>HtSE4F~4)El!};@_9f6hm+;(VyW!V2AMY{dtIm~NE2S5s2}nI za9}S;tsInMUUz|B(v1Z`GDu28;3HmF!$e*Pxl|x_&|$$rd^C&t>Y*x5SCr29h1T;a zUISid5*k3P9b}hst?N-@`}G%p`{F0g&FPOXuMCgZhR2tOCm+yk2fG8iZd&+F{fV6ai+y$%Pq)5p!Gg0y)!RW6*bS zwdRph&4QGw9a7)MGNbZxr}KVq_Bq^n8#I0v?W;z1{5#tB(Z1iD29&-Q^>KOQIVGUw z=R6J_e$Jh`0G9+(6t0Lx@k&4-BDpL!gWBnJThIXMF7Pq=y?CbqFCPe87t##F4WKyh zGaS`+p&Hi~Aq8vmCiR}>-$t=yGgqaIK}Evz^1PP2Q2EuF61C>2NwNHbNOKg+hNx{a zr4;3^@#`kxuD5dEs;DD<@F&l*A6=IGVdn;h;*vnH%1`t8KeZl2*o_ZtqS!KZgyR z!_c&i{8e=JpNYK>6F+#gO{t6Gz{KN;ndDS>;N@ zkt+5{fG9LL0G~-0wA_*c7eK09gers!TW<UkWFFp?eci2ZHPuZtA1Tk8GX9`H&d7KME(us zaU`}98TleIa#vb8c^}VuYHfFk1Rkw4h>fhW(U^6eKs+t8D$jcN5rqB|eG;iA`+m0vYXXdBJ zUke!4?M%UDH#Vc#V1?#I_|G2zs|8zUFW~SrHb259jLk7&&}CelYh)EOH*}GmWRO&g9CL@lFte^=bjLJuLoK~^(c9Mv?>kV z*mG0*-Tr0iz^e4>yIc&+V)6{-VM%kuQ997nkZ?&lsH|Li<~pgi#TY<2dnzh_a*0bIHd zo5OVpKG&k0@xZb?${EA*z}<=a$$LlY1iBCQd@lWM{}b%C_{QoIe69^~zVx!3;e1{4 zz-n@+`pj4@nZBE>B@fjDF!Z>)r`o%_*1hMZUhCffM24Z(ItS|#e6IDwIyXih%DdM< ztx25S!+XAE&=#pA+XpYNJ72b-(zT)7;9e1Qmr4L|3=x=NN$&$!pP%wzCf)cUgN2Yi zR9ezCC}eZoLAVom$>vzP^@|yGSOm!>%!dBY34J^<`T>+Ky9Nl*gWGZE<8Jd~{KOle zU(bbJ@;&|)HVb12vCf;&)B}PbtcQpY`;zSYitMYAeP5G3Uy(y!lKo$kp)bka$Ai17 l!_&3F!&Q=mfD%KiLij)8f*>U81Y1y#7{MmACgC^V^M5G1gMt76 literal 0 HcmV?d00001 diff --git a/config/__pycache__/parallel.cpython-312.pyc b/config/__pycache__/parallel.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d4c4981d4432e0bd8584ae2eca06adfaab3dfee0 GIT 
binary patch literal 21952 zcmcJ1eQ+DcmFEB$kRSmPAPIhmq{xv-i3C3&eu|>3FN&l-DT?~C9V8nF#EeKt_@D=% zB*IdnIN4n}n=Q+^tFv6$RJfbVmAqD6>?>c{x!tN&_O9w&wN+Obk$1q((igenq%KwW z2W>gES9?`=_j^5<0SJPYH&+=^jsAH3y8HE;?)P55p1-T8DCh8m`9BGL6yvyGQbK=9 zvXQ3?CXRc96FJco=6u?($!9{^95zQRK1+o6@e!-fYSPL|d?hHegi9kfpDj}6D`WY5 zxIAL_*;(8gc0?+C6_H9`B}b8XIUNU*=e?zsWJr=d%;#X!S{o^Ml$4{S}KCW8E zqDyM&R45_Ifw0G{T91XKu&6pCF;NQpuLr^*F_4JK9+O&zNFW#v#Nz;S%*e5z6ptU5 zV~Y!_?Ytx|2BotL0OK!6;Vbwp3C5yVLUV}OL(x#eACAq^MhvNR(Kq5Hr z4@IvB;`5?5k!1bH=cQo}eoS>?!ZF3vj3h=eE7Rzz&F_x}B9h;) zmizr+vBfaO9e)2yi-B;qq}1;hV?n=PZb56R1&ymtzdsO-#xTDaR2)UED3WPKk=rO} zCrExa77M?{Jw=@Rr|0%0A`5#KLJPf#fD8`my}39ty6@VEv?m;z-LtTin2$yG5F_r< z*mTcAmOS2tC3!pHq`-s!@zV(Yko&ZRvyPdb)HmHZeXnMPZ_4m}nQhzeoVho+!f#(U zTdlTFE4iH=4_Q<5z&1=A4I<(SBh<)$o&R&`baE82YysRC(7wpN9& zlGUorrky?~OS`h^DxXWL0(^BgUE`~kYLIptX*WyP8tGb=t~1hgEM2dq!TR-5!@~w+ zncOBd0Nb`KG?rzl3F)S6y3E%sZ5NyA^)p7Y-6(6x8()i&+nSf#YUB!exq^|qL#i?G z?l5w9=E3gF=GOSy*f`pZbURD8XVbfUyBOVeXVV?D0bnGZ27CvD_ZaC;mhQ@?%X}WG z%jiqDk=rfx7`Z)0ZZGE0%Vu_u)JN2zxB=Ijh4cFQ*<9~2(gQ5rmreKk23dL_o3{Ig zSb8v<9`X&dF$`zZrM|r^y*HaS`$kxLL`&1$iZu`S8FRajm5v&bHGR+Wa)!B9HLn)eR#-NNDr~{!+GVU;*p1AM)_e@e$*&GB91?t z(0eRSN@G{d;<4}W`bO+KDvcx8irnLQxf9vE6W`%|lZYQf`ANi&Gw2$mPUS(LFmRmC z%ROo2PUYpEGIG!4<(`(N#A)f6cozIRDb-vl5ohvB&tyx_fp4j=CF1$K(rNL+cR1f! 
z@dfcBdN^bB@KPSkIm}%#N<5zjbAGAJ^CGs{>>`@ojCwY2Mu~Yz5V3>Hq1nZRBnlGS zhOw%9e?!8NT8AZDqW##fFUV{+Qr!tD8jr~Zxz!|+6y!B*&V%d`%k;aRm#iUww}t!f*wK^)bt=P!h3Ard*Wd$|8f zAP6yN3FRn&Bt=4-Tt{=UO-G>%Pmr)6S4eY9o)Xm>4TP>sD!;H0zKOw0(gFm9`Ow_F zKQRwEYHogEF#%$0`C$kZkm#gna7nF)6fze8AIMB$8(S+^ys*CkCPg+r{S90}Ys$bxAOh!BJPH)8TNNsfc$ zp*W*oa50{UMW`p)eyZhJ3W~F_-v~sYM#Lb31|cf@6ERE~ja`b*$HHQWgRG=B5x{~h zT?qk80fgG7w-gNq!+uC=esRGcUxI8Y$D*MmlXcWO2%&o4{EKnPAB}^?kccQ>)?{42 z6a}FmGHW8Pe~!uI&3UgGm_d_vbuy}a0K%_Yu8rCs496kriP?r&l2hH;LgG5K8jG@g zdzN_vq9}(B5_9W0#LjA2l!!qukXizXO_WF&=ScxzjfF++b>7baaqXiTORhDdrF!6CCV`DTk?nasDBXa|u@RC|ax&r6bFaGeSngNf zkK_4RZ5yyw=b-i22I#U4&~6;02Xqk}pq#xp9+WsX()0q&AFaX# z&s9=63m&oJ+6buBmPgadFJBvNJAY`&AO8i;}^>DwCY~1+5(`&44v_fZchewED z%_S@c4howAg-+Gvk^3?DvT*+Fc*ICYV>i6XK7F%g+OV;1 z83Vj1gQTDsR&;^1cW=^1KY}d1v>3t`A`}yMsHu$uC;DXxKPOArU}YgO4+8XSRG*6_ zgo`r=lYGy_%t6)UW%_$kD5|j2NQqwPRC0>V8S4;op?3?~a70XZh<5I5nsuFewHT$U zMe%w)6*4W2@;m~y{CQ%gNm-7`@-XENQ9uH>e3$}y&(#XphM08+?1w%15ETqkFh&6> z{4n6~ahRFnv?2?#7ooaMH&*3|j+!)+?A$1qdbfmx&5(=#0KpyZiQSpD?@;VJmM^Av z9Z+^1cx*qgR^7aAHdXD(>={h&Ii~D6mhv87=S(L|Et!^`>6Rg-Whk|Mc-?BQ8hL8r zYbw_{zM^v7!8u%Md#7UWd@z*mKB{ydeQZApRGI3Y2N%=5r&gH;kd&h%`bkC{BJ*Nnn>xHKlGr^ew zXJc?pkL^usHErpdex;^=wPtXwPC)l-_W@yjH4s)+1JK=&uJJyu@uD!bccx?j!cJB=@dtu( zEFOn6M(@PMnVT~+Hz#I#giwOgq`_rvw78WlAl3Dam z<9NtTTt4%XthK56WLp8>76Up+EK8d49S*bUa=-S?SHHRJPVd;K;Lo`)W!txzU_>lN zu=|K?MKHM;dkD%4pQ}`};L&&B+ooG4wsXOTRrsKcyJ@;wq(n3&D_}x3`wdx%uz1XY zS1~q&v@b|PghXES_& zC1D#Et*ZHx)ninXymMY*HkOztq}ln#nj{3GqMjo~=b*3lV5bQM=Z%WA4~3y25tJCZ z0;LAlXT76R+$&s+34!Y|?60)12((g#L%~EGc=9?OTMz;m@C^;SN#@Ok(&dJRQL%h7&q~+#^q9LlAxLYN+`LE!F z?smyHO1}5Xt&&%8#zFBLrW@Q%%U8G?CXa2J2^>(PToYi?X(@pP>>Atrk>@pzYc$?~ z5dx<{`AVy(In^rZ`v$^J2Dqc~y^VC9hl;u5`d5Usqz zc_|?;(QglRjfsCUW@W=w4v6~u|R&dcQFwP$K`KOp%wZej3k)VWUScO-YUz!D97SI|rRT z=F!jXB@NY|aV6NaOS$Uy_1&DSA?0jK+1e0hD(mi6{;%~~&fN}>iiRgOt^ay(`K2H4 zRcbtI?#4H!U!P93J-_OHA=4_{JF(_&&eYZ4EB)W=6@dMxI2{TtkClw|nSb7AJ-W;K z^WpZRg7t4(%oGZgQ(QKn+1qNX6HV8#HOXFZby0;KWjN_YS+UK_Oe!@XoD8CfA}%rr 
zY-pBg4=*1_Kjd)=CMh^Z0qHa-I!W=Iw9A;PP*B)LHlR4P1lZE@1l>qX+9c&}3Hfbe zfalQ|*47fPqAKkW6bD2@tE+O|Qc_X*8CO=(18J+OHtp(GT-~2JDeYoukK*z?7*kxm zpH)$2HOuT!TpbT~0CK(7URJ(&nf?C}&_K`&koTvt(fHA2)umaL{PaSHqS#sFLK4*& zCa4@xa%aK2Q3U4uL>5{XA^baQUi-$Y-&p3;!mxrr=Wxn4`~~BZQEO}Fy4Hz#H7#vn z|6-${)G^6T1gXYs>EvWXjT3=P{qSvIrs^`@~F0{mzrACNrVp58g4;Lkad zvW+l4P;L6Mt5(hj;{HXL&7-0Cyd(yR+x3SwEhb?68j&}+1W;>FF}vU`)0`=x0~B9u zOb^X(6Fxk*&3UjzmYu7Hu%f|Jc*~qH#FMKAhZY^biOX5tZkexE<*?{6U4YRA zh})K17H}m$ty+Q$i|`Y$iixEKm`^oR2{O#F1=Y$nD^HnfW&5vMqM15WD-QYx2UL6B zeyo<~tnV^)T(z7Tdr|JD7!Qw?2o_CJVdOU{_%UT~3(Ub1g7XDQ^lzM(IFEALNVQ$G zYGWxDlBD4xG0qM_yqh3?Oq_86Xzp;IRMoxF{Ce~KZ=_nh?@8}pd-vLhyH=}a?v{LB z&XrZBZQB&vwzREXv9&Kh|DNrA$GeV4CsMZdRoj`h_Ra@&KW%!e>Fw<+?ck>!8E0q8 z)~Ruo(=ca8qzF!BiZjy?`hmB_B%0XL()-11mB%JTf`Uh%ykaKg+on9sMU1>(f?>@$ zhT6ce1{>zOQ%pt%!y2-2G1D&?md`OP*t8~xPmk$r3f8nv%OPaTKcL{xDEJWqPr3XT z_yGsv@F~U^aqp07fl$VHO8yA}{u>IiJa}?ien9ms8uw8H3^Goyromy`=qx99KA}x0 zJDb?FRc{ITZ;9jPftA+X#{096?ZR41+mEijdF|n@pLW01z0$Hb-LhY4*?;#eq&CKp z-%Pdad2jLkm*0K)!{ODcbKuC5XK-Z3Sxf8mpWIH_nm2E9#uD~_u&}+s&C#}Gx@s(B z=v($FzE11>1?gz3T3^7GUr?*alJ_OrD#FEIqE-=Z{Svi`EU?751Arotyd-dpUsWzy?h zC8x21M_(HZm>mTGNa`?HKTLu5!T9C0i_kM8$F7G&NeBqoDqwvf!xDT2A)Mp<0R`|t zyaJUDT4|V>Z8URq+^Ekg1YjEq!tPFX#e}>V#7QGNZJ~@{EP`Wnb`oq_pV)1a+=R3*K@A;^I7vrHm~24TR(DtV3iDi4!wqH6<&WMiAXuksUW}94rO{ zTi10mo#9*qqhsTa;Gl+lM_~X9#^BQ!Opu`s#EO9W@FpP7#|7q85RK;ySe`0asTSZ! 
zD#`Ff_hwHqSx7Y25xF*D>>^j~dY#;_Xnq@WMh~+H6xjv7)rY;ZNIr6l ztj&I7r+Y@bhE0;STf}k@cUoTO@yrb`6?7qg}%ZfwgN` zE;ZUy$l5KEzeT%-jrJMs8WvO5ZgZ|ZJqG-=auhst)&zbt=ntr!1t3U@rA($D%Im|f zB2b2V1N8yND#B(M;NQJDH;J7dM=kliC;}yxkyR`11dh(lIK*<+$N%|-Blv2fTd$eu z(PLM=dMx)V+G#7--x=T(a%FmKi?QI{COZJAYRvL8=;i>XJ{@cBS9bwrGfKNd!4r2S z$gsj5u_D)3zafPHZ;^Ue;h0hQyq(QJuMOT2$z`NSzk=qQX68skGf`xou3_}?*P|vV z>(A-!h~)fH1Q$7a6vu)RSHAm6-kM1^pMnSdHnglfNZLsYo(=PYo@&q4BYG_N3&|l|ta=8AG6(l(a8`}Z6^w_9w06&Q zX9WEa^wx8~+S_)|BzK7AqC31RvYVYi)!sHk8#tST(_=aZ-L@po>3LVr>#^Lg4ky>- zQR-r@T#v~XvF^4x`geKQ$UI!|7}%;YiWdtCk<>;3TN=l3pGu5P1MZaNw%j*}J zG728C<(7HFT2_P)@6;8i66OzUy<94yJl7szuNE|dXPf24D;J{m^5a0r!ssXBpdz3%P~dRhPwu_c zKb&JSzh+H$@)kt}?*dZ4iVWsDJlO}}&Mu6TK1GTj)IX?Ibm>pi@i649UxlVfF(mVF ztA!(&ATedtn|zmbAC?&X3-B-8==(^H z+GOn51-^wZI}8F){lnoUul!AF)m;R-Guf2$G0pn(u=9L@yjA5G>ZztawS_Gplhb2I zrzZW+_xsOWn!0#$22P%n7cSt+Y;tgdT@j!`J>Q@8G}8QOD2uLKZW^l>9*bqm?SWFkNBHx^~ingLi%wX?UOot_MP4wv-*8 zxfF*Jt=Z#cw^EZ68r9L}hWj=Kje$29AotUabkbKatO=+Je~U^uV;U=hBqFDY9Cq?9 z>gL}%NO~ultCDAeG0C5tsjpc%lR}fP<_2$CPkn-8lR) zoWL`0RLw8-6wN6!7@ejV85`v(3YaB{bOX)eG)u7;5YV*<5QZ6_m@lk{GR?8LTAJOg zH0KhLDv-V=FHt~VSMo~;R0reJi5dTelfKDZie4DCS}&uNf9mY`*py7Vh+3r=oEf`t zaq_(Xb`dR)zhhlsa3}rIO)B1 z_Pb~AC(^BZ75q8&{^?l;&-|OuJ+|+D(4Ou(u5=y$u>Wz_6nrw$&OXK2ml`?_S7e-O zb$cJq{`BfwS3jszyhm2L#^8ta#@W};k|V+)1%K{CpIXe_)%W;SR|mkW>ff-xZqK+0 zF6}0NrO&N7ALxyr#|do8975Wh8V%lDZS^?y_b~UODnxE&;)lqao65=r?>Sf z+xk}Br`{P#51vv6Pa#UK%ynJ5i+8GP93I^hrv6O-aJv75(tje=ck;t3cy!lJkwn(aiwRR$~z4pKNVPK_U=VAiR-~O)s zgTTLYW*S>wFa5+Nq+Okgt8>-WwdUTl;_iQE;{CJlo_*{d|D<_jrFryT8KV~b-+$_V z%b)5w2wEK)GiAn3J+gl|Ktwv5MBRoA7fx|OQ#2aAuZhVWX^Op-7A;O7=?3Y&!2 zOaH#AF;i2YuIYGO(~)Uud;R#Y+$TP+_pH`;f7Ec|Qrjm$q!+-zF-h0We{fU@#vXM-!PH5H?Nox6ee-H3^0ry7#{aD1Ml#US^cJ;H9dOi zKeRI6{7ZSh`4(>fNfVmbY|%DOTEB~>Z7YkmcIJQ66d#1m$FNvKi$a~kCxOLEdvu); zeroloN49hJ6t9PC(}o)ebLe18v}%L65XarVrDO2BU^`fO2%;Shm#`D6JoEJ^g>ZnA z7kQH(PAR#gp>DFSV2#b@f6a{5^(Z>Uysxx3{hdk0Irgdp8pHjc?uiIA~ zhd<*?71O3QSN(lH-7u;&jDE2H(c+5h%(~T5%mK}y?0K}Z(a$$`Y0A*LOGR9Y^XJpn 
zmL*r-&$ogJyPgCUi$Ej{3%^eN_{nG1G3@_(9l0)H=>Emq!3ifuMNfQN!m!MJ;q^V^ zq<~!>EMl3L>)`)f0$*r5`9H(2!mx8{Em5e`3;aZKK6|bj7B@X72N!SNNY{j6Bfo0c z{&KZ8_!<|tAlTSt#(y4G(_`$)ViAhqx?qmBd|zzkuhCs#Qr4x(E)AE0(qDNHZ#z&6u%^X^<3WZ!Ztyj6b;Tp}D zW+1Op{mT?wL;&xg1jSiT9wD~TJILX`F3F)Qz%S3mHys1#7m^QH+y>zj{@QTm@j{ew zw5{6PA)v6mKkLKWt2la7j?p!D3rSQ*l-48vscqGL>=Rcbo!ruy?O%(luJOF`N1dzg z)4*1xpScFrNCW++uXjN4axkC<@%s_F=~VJqD#(}mF24t)2(M~;Cfj#^x>P#j&W zj-F3QLKsq7u6!TH=H46TKg0EBHr{FRLvlIjos65>+l0`~O6kMTTo&u6M zWRfIg+6d)E1gZ^hDSd5fNxngOFH=Cr$nu|3@EQgGf`ac;@CE|aLB1)WU_`$R?k;aLRPX#R4LC{3pErnWm8^JKIux2mjXZYuy>_(F<%b1c*B1|oSJ3*?}hDa@ujntAB5b?|1-O4AU&^rH&e; zS~qjn%jPGZ-aBXS`Ig(2+TG|4DE_3kA30xN7L{7`q(8&&18Pvn%2{`0Z1z-Tr((k$ z*LQ4h9a3zgA518=BUo%ep^_5oAP7{y9i2qC=+>{b|H-yi?Q>s@an6{Y2>7rU<16-I zFS>Qx(58JI3>hZtei|hESQx9n|4D7#os)OL8+BS~b-?zI_IB{c-=^M~T(`sTAL~eDOispMPbApCZcI(5eS>1A1rYrK=#v zhA)%3xz_dvWAE%(#RX-J`G?lcB`9!lEo~3_->F{Rj)K~I=o|`4QBcj5)ZXt{#mt$z ztnLRxpK zpY^Y6bsW}m!iS%19mH@+m}+BUmoGrJx9-grX){elB`9jceIcU=F-(zGYabPvthm{% zXMjISs@Kgnt7my?ox{(gcJ}ijT1JWhwc>R*i!Be&78Who5ZIh<)nF(XQL-zAt{p;fKoGJ~bnztzfixd3xmc zXbt<%2;c|5gii_0$)tD4Sj#w>Z4rM#WwwPS`nX;wsBJW~=&Ei#&r23uCi6*gne-6( z6$;q0TTYNw8fM%n%YMiu&Q@%-j9K@Xne#s~IF`b?=LyWFSzcHxUOGtLmzQFf5kO? 
z#I=9Kwf)v?Hkl#8nR-6r_I<=1`q<+7i0}G{ANe)!c~a%Rb1GBYaA!K>tiE&N<1+WX z7gx$!?vy-f+>zQjsx;{C|U3mskJ* literal 0 HcmV?d00001 diff --git a/config/__pycache__/pooler.cpython-312.pyc b/config/__pycache__/pooler.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6cf5bc74d8f31fdd209c37cdbc5ad3dacdcf266 GIT binary patch literal 3113 zcmbVOO>7&-6`uX!a``98)Su-aX&pIk8CYcExOL?O35=#SlDKLTCr&~XSS)vjv4=3il8ee>mR+eKmt{$; z6<2AfWi?!@Zla-;wQ!wqlMTJBOGrjUC#iiYvQ|zLji@rZim-ME;pDy?@%N(%9nR$p z)=9Q1(^?8+9RXk zq>LU8PC+Pph@i5BQCWTuj#XB$TvkZ}%Uel=h=!GyK{*8nEhmW{bJdueBI%f$h`E_D zT`lHj$8?i1H%BI73wq3*9OI^9?$nswbj&?Arkjbm`7v+Vm^)2oNO}Rz6#LALWD|E# zc@|HS1f+TjQa#s}3&(=&TfXlS`ol1(TbE(x)OVS|>%^##y0zi>)bOi@-{Q>{H=3aB zcs0ZE;$_3f#AU^`AUhJ8yxk;0%JXT%a-A&_q_NW=9&>z;1q;NpDlReYdds^>u-T-( zO&9~4L9StKnxs(yu1mZi$^0q@R*<%A?rc~b_6HL!M$ED2G`xNQLB+9HkYSuOO>Wgp z2M2jdc#C>qi~Bc;2P|fffEi;%;N;*ekUhj>*)}=^@HR4IZW|S(wSpYvYlNFeW1v%D z0pO@I;#Qg1Vf=Ho#2d|0(`l}7i^7(b&DN#WAKbh|O0H8WHQT)Id!-H6ZImK3mqh9a zEjHUh&NQ1o<7NnMbL+(T`zaog-2hh%N*Q&~qs8YxpBXHEe@Fe3e)0fVDA1w=9659X z1mU570gF9UL${?T2VWo6r`UZsxY!YnZRs#A>rgn~9*4DzrO)KUI$uwHTMNZCe_P&` zucLxueF^zW8>8|5m3P)&U%PtQc)w0OvO%a}86LT1R2|}C!(oPnF~J7+0iEy^rS5#Z z9U#`C1Y{OB9QZLnsqMDJ7J#b&bQO@}Q)ajh19>&x3EEb*N^BnC-u4@S^$>7Ijatom zF&(SEMg0v26Ns#Cv3lgJ!p5?*DRLod*YW^= zVo@Pg9~?i`q~&4A2`4lT;=gz(#tJ`%BAA5Qz+jaTyG0$|E+m4aNEE<6P#XAqfdU5( zG~(INHG))~Y+|QI7%wPv4&V|bN3ST*#V!++nE)&Rgw6|jHcAgMg?MY>NO6pd%Am|H z0_mXPDYR(pBp*(n*n4UBrQfi-mwvr{f9b-#r3-_li@zs-oW6E{`o_KK8-wW&cJv36 z^E;DN^v}Y4pqr*=HHc{jY14!@*Mh!f23gbmsAah$Nzyd2Z<{6+Xs1g+f=Sb~JkO5+ z#6To2-axHDiPMZ&Dh@Q+tkBO1G9h@0s_R>vigFGc3Oy!Pc|oRyTost5$HlYgSiePK zkYQ4&F1Uh--7Enm^-J_O4JCdg{dsA*bFI7FQ+J;msLMlHN$A5YT70JS`VQ~nkAFT; z&kSW5n3`a|(N(&5HytvCrjCji`s(Q(zl%Qs)fa}c2HXjB=Go3qx-a%-b}tXqvk^0o zPM_&q-FdJ3?#DkHsK%jD<183WfctFdKJv8uoz7bKwcf?uHwWsuC?*{}MKg^c`Zv4N zUA}u_pguE{Rq&oh=b!JZb3L_p{gc$)*X|wv?m#VtqQ7V5#HFD=l{ntBA0wEe42ir1 
zlK=ER_&X3DZT=WO74`+>U|`geHl^DVo#%0HT#q|qq!a4N`;c=nB(yDEKXRF1zQ=%CtK2%oK2Mwg{=%YnvIAqFN*N^EEW_`K0{iN%Nkr78Yn z*C&)d+?f_GhPd0h^5kOuf9-^Kp%NtF&a=3v^yu>}$lkK32R+C1J)1zWmmpbTl| z&u#jwAQuD)5vuu>b+~|{!xsfxG@+>J=`Tmk=M}mNJOMTKHjtqrNz#Fiq{*+)+}G&2 zuhH4B(DGkWGu;mcsl|>4nUM7dQu-hAhmv%Bhy*=Qkem@jAF3gDQ_e}n-nqvJrqJcT E0qpQ4PXGV_ literal 0 HcmV?d00001 diff --git a/config/__pycache__/scheduler.cpython-312.pyc b/config/__pycache__/scheduler.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28bf46cdd9ecd9b86a2c07df4956e9ce041e81d9 GIT binary patch literal 8976 zcmbU`TWlLwb~EJgA(0d*k+Ni2kI`GCBubR!+Lj-QVq1>aw(^jiWZB1L#JQ5j9zHTN zR4mphgl!6Vu|Vr>KcXO8Kn03IfsNLmM!(X(6#XG31yrVXS3$RE1N4JgxhUGNo^yvo zQ3|z9OGx*1&OP_sbMJZG`3Ik`o`Ijj{Y+-Z8Rnm{WBr`v#{R!U;}b?=BvxVKSlbjf zX^Y!fY}*xkl8bY+%_)wgGww{f;;y7S?oQUl>yq{HdJ1zWo}@SKP5RYQ1o8nEh?^c4zP&`E2bxL#cMEnG8*DEc_)_5z+5U1*dwLQwoWLvzAWo!&N zK~6npO^)O3q)zglgigEzT0Yw9Bq8FrG1nQX;bTVfKem~;kIjm)>Uh^G8xDMnqqAYw za9MO;t_v|Tq+vi^>!h8>1){>N`#6Rx~|?aIv?v)2v#bZQk)6;aawZn$pAI#ET% z;1Z&yhi!&qMkb16_!qGfRzyXXL_Mtn%{pj^37iOM+#FFBje3b>RFV*NA_0Okkxnhj zOVIMju)Lt8mzD_5Y*C4pR#u3RUbrhPtO_4wMI|LBkWt5@W4ETliW}0tb>Sqi4)-U zRdDwqxR1g;2XN^yyn({K$HM&-?la+1Lp(tH{wn+isiBIWMhXv<@f+h!BnbR9mHWYX zi1tI}esjDTuAn7{6BVo$7!8)C(_)U|-lgD^*2;Blr7N9026~c0+sYJA#oNd!;G?~= zLObQ7qiS~NldcNw4$?`w2#*}(T=b0i%FJ#$v%Avoq5YmJN{1!fOF8K+?TB22tU4Z% zK#Ece2}*rZC*X_%wP7*>^hTwqwJVq(gq6l>%#V`j-QP3Ut1e+>oH2edBOawI^^x&K zPKv=QvDMD-gyFB~&K0V7xz@@6Z>nMzbVyF8qA~-TVRT--HZy(a*1T|g`aNOx&TZkf z>3OgVSB3dEZeE+6`*sbuFhWO>=B~Xprv?$^QL`yfOF>`FkZ;j`fi}7%vM6R1-LNeY z<3v)tFQl?bVL{YUtpz=OkEAriYmIB<1I=i)dKppGWl<5pge=O6q8TRA*N5W?4ytX33XW$mRA>4SrYE8 z2qx{Mm=eL*8UayTO(lf#EHs>cO;?GCbiuxYJvW>x(X(nwZ36m!}WtqMR-3_T(b!>P2+ zXR-@$ZFn&yfqo}5pxX+jh8lY8~Qo zXfjkhKXC4x2B8C8h7!{7%JDq^-xtU9WM(WQXGV2VT_XDE{p{KE=kA>)V~V^mmRZ%8 z)2T7!Ys@rmV-<#?nN_0^W?0uA6cv;u*F9(Hj%gpG-$VHnmexLoVvX7L_=}#7f~OEHDSi~hbXf8UG1?Y)-%e4zh@_wrt_J@0LQS_gnXTjU~ z(#6%)131vO-dAiJDYT6|N$mJ1o(Efg!R0$*h1Q9uSBsM~g~^%i;Pt%s`g8xlhF%;z 
zUl=^U?VrlKr}lx5!wTtu0jiKcfyO7y67vzOR}{9E66!dE{vNQF1G)>Y^ytrWVG{eq zW?lKacv44x7T!lTRN$~({1TS0hr^=3ccv*xvQyZyF2hWgI3BW|1m*G$2;78e&u zLf0r7>Oxkho{nEq#msWFzJmIunqHA50-P<2+OoM=HYIBXaqS0+&=_(c`eoLX~ONE;)`1tLkHD8MvI1hyxqM1^0L zK}snebVesym2~c-EYM(xDY*LBSRjj_wzWy3O@R2As1+uTSs_|Mm;v+T&uT;hsv1dT zF`9;a!G(eXjWg^?>5O^_ZiL}9{g_d=Ozuna5`jd)uD*g0SNVtxH_j43I+i-CuOi%Q zo(&NYoNGU-kk!z?0eo!|iZ!Ox&UEq*UC)DUpIrF)gB2fenEDE zw~E1c3c+`_gYP|b@A_LF`qkHPHSR+=ee})fybT99yDVf7%OD%bFA}Ip)(_kGzuyK0 zxH@*thAztnj_N>j;KsALnYnv%hA-zq5KF;5q=!d|m5$n_r;SxvORLvpQ9fb86i zxAEbn-C*m(TVMN6Jj(v;5AyB~DxX6Vz>Jwn;NL){K>{{4BC!t`0fx<=WP@JC+DLZj zRouRA<=adxt08fZ%k#kTGk0m$Zc#OBpjDx8EJnp{0Y+uz13D`Izi{EGP1&+{l()bE zJ*y=-?_GrxdJkrE*4RNU+3yVjG^k@lOn(np$&~}UYwh{}ASXAhQG){*4O!F=>e~D~ zF7rrt({g}qwGM7G%d9fty4m>|8m#1ge9a7LN6R5CPXk`KuxT1eXxU6At?GO^Z05^h zGr&}{eV@m`6U@>D-i%49g-Ywo(cIJ(GuEz50zELJaEbp=)|Vkhsg)*N>W7e}6Bu9m z{m@@u|0egBujKv^95)yy70fhWP6;fc$B;alH~`TRx?at+fhH*j!iow*8ZY?ApH6T2&pe+z`^@(xS2%ZTd-C=pZYL1?m9@UDaiZj8 z8k>uO(L!L95{(uDQQC+V0@6E2A6~2!#IJkC2;&Ce6RtO6qs8&fby<)~CE924FMD$SJ zgvPGgnbgdi=9tgf+0R*ejUc&0@2TNL-gJ~L$80Az_yhNi51$$u6L0|y44jugQt(Ie z?ucpns|_f&DZiL7q4NWi1ugTp%BXQ@slN?H*hcxJE=$8%yveQlJJ9~Yoc#wNYv=^l zn7x7Fhul~0-tY5-#-8%@SFp};x#t7+9zfM0Fonp?k)PIn5dNYotChWe1`omySSam( z?ywB@PmaL_3Kb`HuoUHB;0LXQ8Ep*;u)5>_yZQqbT@7_mQQdm&WbS$n__JD)v$h0k z^JLPsUcaa_sbuRGMR+ZB(Uo5V8nxu_sgjEOkaQakXjm;c%d2pf)n`@*bZcOOyz1$h z9hF*b*I9#ax>EI-d0?d)vp=&*UVzk~01)5dGlSr;VK!HJ%R2nx#Si7W!x;W)Nk}Xp zF@iKAMcq*4SqV`7He62m2~B3?70gK#IVqR(8gLtOG7C=y1kx2qsCjseS_1b8=|gP~ zN?!(&;yuV`R&L$84QA^uBuP9vIN-q|0}l;vDn!&UlSshR5HyY<2fiMng9DTb^U+)^ z%qQU?1C8SnAVA__?h&Q{w~*FUF^R8!uw7d?np2KhX$br*PwpoOk+cdobYA4B*Z{LE zHhlGfjZg!hO_BQ<*en?71+pxzz{deIn8;l{8jVsu8`i>d^>-zs$#?xz67d zCBWuJ!VnLo@R8=IL@*D-T~~w$*j@Mp62;NlVM*>%MY;;nFw2GOQ!U!ce0=jf%JC(G zjfU&Y*aV{%qy8Ptgs!P-T2*^sz>HvA<%b&@wV07j9{??*kMYSNqyH`p4C3*N!a(Jk-U-B4>WG`1BQ`&Co>Zc}7)vN$|b7@o;Tu9q10b#`ob z?BZs+#Bi4*CA*_BvTt_<8%m5B6ky0PQ1A?FhKk{-LU?M+GxhxYRBW%ap*>uszYyxr 
z2M0D5N)8)R;*b&t;f0quO6l3}7Q#Q;^85(no!OtxVw^O~?pYxZjZV4?f$ojl7^2iU zfJOtQaS#9nTZ+NxRxtYf1P`;DC-?34mZlP8Z)}243w<$-jxKHaUoB&_?zXlUThA9- z&*xjFVDA^qtv`DkxNV+hfnA)>!F-e5aKRJayjmQ&Q5d>?ZEh2VI;Y2s;j$<8*O0Y-4&X3!Q0J@in|3*Qdhf`Ac@AQ<%mUQPG{2W_uV z`8EeVmmo;#gdnJxB&+D5)pwvU{DMHQI!$slfVU&413~p~(@zBR_}qss-@JtM&S7j! z!(#KQ4PckvPHq7{@u`qiF=kB6VG9G}s=zu5-Ky^$Z_?*;GcF%TOnjzLFC&r@zA=Iw zP-8fPkDlsjET*xzi3R#f!$TiklLX!rB^7WP{ z0t}t%!x7X5^(`!3!{Sve@X44KC@}5Mz(@WO^PQ7%^zKcb%X2*&uW!!%#qDkG+-_tv z&$X@(ZJhpebekL9>l|9U`KWE18`|Y2zUdjvkI&}k=68DTE8ENB@!0^-9v&NnN3wf?% zUEAg^lx%j#EDN_X9LaP11`r7&ZUk`o{>`_Z#&){Sr5d}EvILjt#1(1w9w+@rM3 zMUa_W>~~E_RN5K308E@-53MhL+Fl6u)2-SZJUoYtPUN}XjrnaZhGgA9wX-+Rg&qNo z9!r4j-J6Y@AMBipt-bMRu8euQR3CC&*qF=@T`cro+Gn6s8VNXB*Ax2;RNwV6&gMr# ze&F1*l^w^-t|NfNIyZYkZylY_+_d^)?$_Acw=>R0EUvR94*Df;wkk~8KAJq4Dade-+6EAzhMRSom%HK+`KA`L-G0-vn3sJM`zx?p%JpZkfX zqD~r4c*Tb-Uxf&iF_nhe1PnvXXMT5PRTAMvHDT&L)wGZ`7Okd-A(qi-(i6yi9o~X* zbyKxaR1^=C=nGV>qLQj!=}SoWxfP!NkY!4K=@HR6eZG zFC^*$;8B;TE@4Z?f~xXi^)}oWO+$HEy$>juylE;FU}#x(-_5Z87fjy^rt6o?<*%5_ z|H=%%U?RU{qW^+}y)T#x-`E2$xL3ZZ3q5*oyRLK1x#tVzn@<;flWX<6O(*m4$)IU; h%@0RvbMLbC|7n}$*iebV63)}+!NUC=hkbg4{{tCb_S^sf literal 0 HcmV?d00001 diff --git a/config/__pycache__/speculative.cpython-312.pyc b/config/__pycache__/speculative.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ff9e4099d4d1e18ec4f4c5192764422070ad1e2 GIT binary patch literal 22183 zcmb_^dvF^^n&03>f&@sA1W1AuAK;td0~FsBDN+xUvfh?0+3Ta%2Sb=42?-B+07@oI zRndFCTPfL!rMlg^+{4og z29A55<2l|C<`$K^VbOqhW7rrmEt(?cMRUZmXo-|8mKdnq6fTWe7p;-9#j;5GVmZq* zhiwu2qMbck!WEIq#Y*;E5_UwY7OU8EX}CI4vslBPtzl=xwdi8cW#QUL-C|v&ezD%b z2_~@tZIp-Ik%q+vR&EG4Mh*mx1}@UX(&oh$mTwEUMm&oi0~d7)=EVm5dyJe=$J>#% zeUj@QU-5}ad0cEkzDsET1pTFJu|u%%l@0XNi6;kNC3G&C*n1b=tN9w<$-DSkp=+sx z<#r1-0&qXi@pbQVeElazhK5sHQtyjB>jtm;S5$c3YnCf6{oaKI|8pl7o;!2u{PVKu zSae-3JsV00VjwJAmxY8sATGzfCfRZ#B!qdn^5WId>MMb8hz}$}v8e2b#CRd>zoz8G zM6W?ELs1|Y4#eYlF<%tIOSo5F2moVP2tOB#E`^qnQxa64Y@ui<;Sa}_mj%&ll*`Ws 
zlIv$<0bUSg*GeD`3WAAH&>xB}38D}U3aDIlK@<{VAQTmN1>wgA*>;XdQC{Urg)ZSO zA-I-^iOHU8;c&#aJJS~qtP5h?_vTt4ng}IXhrCwV&I`h7ToA7MBZ*bn8VN;W?8&|y zj*R&uF@e3631T#)JXXGWU5JkQqrzJWR#Y91MVEttgntQxu)?xTgvz-Zh%T@B*TSG& z!($Nw%V9w-VYgA)5?vMp5xFEH@N4k^x)EOW$5+vbZ~)zqZM+y*LMzl$xg@@}v=n+P zsC;N8Ez7+;ox;3&pA$G*D~raLv3?d!ym8TtxnoRXbqOWB=`euHILy0J-pHFjwWxDu z(JGXoyhJHSt#ZMp)hg9;?Lvi?Yu$yb?9*}$SEW#Q zz6uSWvw*@%E!S0$>)3^{s(`}U0*uwUx;2Y+f(2vX+*PZvU-bpGT>C<~3!rLqxQ*Zj z`SGmNavOyMLKCfJv|F#`HY2wsm+RJYTMJ+tv|LXCY-29>z+xNwnoYU)=EZjQ-jaK7 z1*hpmi=Nzj+hP}cZ_mAV2(~37-}$b2v0LcjyM$i8L+D*H^WE>7HJ;+-d*0<1`}kg@ z{ebeSz2^Jy)Q|rTp=`;@4;0iH0GwNB;RiKXAHz*>Lj`by1#lfg%Tft{u%Oh3**nA! ze>$S_O|*0n^o|z53=1QyeIq|skUNSz3n0hQvoSnVFDLLkzHanR%GDQhi!O9cP*_T` zN4c+wj8S-&VxmV~nE>+eLNLaMqRYPXa)l5@X;AQEQ469-oEUtqloyr)Sp2eaIU(0Y z*CN`&Ms`?zZ^&eR~W__p{aoQGt=GSH)Oll{k0o>e{Lw4erSbMdiAJK~j3@Pl$pbI}_Mhu)iogImk8x zxh|Kf^(>cPQ^5E9xx88>uL$&4Zt75r!W#YRM2GP%sc&K~F{D z%{3vOh%W;huIv%HRS%l_=3fc~!JC&q{D7{%JfhQhD4OdzM(na&g1u`^j0ST%J!`;% zlq7+te~sjQZkfAb*f3lvgkWGCx4~V>?d-V>cH<3KdpS;B0|qWZ_}F!&M19KN6Y4I( zZ5Y%#S8Vxm^+{ddyzx_$%FnMOxQERxPa@{QhlzWXuREYs$5dYv1(XF6o)G@xo=`LxUc+?6 z1F{8=GX8N-I21?ul2#MYfu$uOn20m5ASUZtLRq}nWLH;wl}iGvt3s3)`vD@C($IxNmt|9g zAD2xS0!TGc;x=;GitrX6TE>EcBx#OAB$P{YUrn}BH9w%^H2*PY{?FV{BC(C=-RAh3 z-d#vd(JW~A8tb5OZ+qCq4xER~X47=RYoi5<;kD5bu@>VZyGe{A#V_h#6$L&NgfzsI zO1WwtTI5A&k>@U5I7jLlu$5_0as`yy6{y*4lZeZXBGpiABJ9;KfV*BGrBOw~WJ%d^ zE_5z-A|^f;fQ}eGdrqt)XglSqfZ(&$M4Pg07|T3LLot z6_IRbitdXunSZpL31_;b`7JBk? 
zfy4aa^(;&vXpmJoV7-$*9>|bKADzNAA|G zvQ+=VgT#*e?9Edddu`g@B-xuXEj_83bE)a`Qq>F3OvZ{zY$hmE@O6$$wuwx~)NXvK zj;U?i)Ne(2J~jPd8fB+)0vJzhIKzQkG*}Fw!TdFSgLFM zNvXNI=9#Gk#8nmJn%%a|sy+THT$8D;sXe%8;MfRdI()nPxizzG^X-QdTF>z8MedQwxLYN=x%JOj?rz~=x;+emzq8;Rh=pR3R_3EZ6leEiQU8X=8pSR@n?K7 zHT{xQby20Ksv%`{ix)6ac5UY{r6}9Ey=wpMJkOL8X27#c-Ey$GPVB?b4Ob^I%X<>q zhIB;}+*K^t3?U@wi4dy54r3%lfL&q)GrQ(7(9>obSnRv==Jq2Dby>%MXx-GPF9{oC#~gZ`#!K%);lY2-347Zl4a_gVI!&2-ewzc>-78` zY}k7FjJ{GvpN4H8>Zm8b*#k+;yy)6I01UK!6; zF1B6cqSJO^yV0?IF!t_4!Tv4j(2JI|KI!ArZ~f{#R(e;#sB~a!ngwiWr`MT=UC%FewC@yk*W#O2xH*F4} zzRqmDZFy&rujC!%gt=ibz(5T_eJaH9Rr>IMY#?_CQYXrFdVQr&r7V93v}+$|l1+*( zK+_7+!|f{q#Vere_JKaA3thht^oTAr#9duV#&n?@_JN+zg>D4&lx_*F>S^7QrhQr< z`*G2Qk!jusdaei*uYhg=v<~k@N$Wm%NE9x*0PWca`e+d-UIE>fIIiYj!MZHI{@C!Z zXh$tvNyQ*|=i9+CjkgWa7~g@`$%;~R>COEs&ldsVRb5ZOSH^duWeo(_kMdprAF%`1 zF6`C&;6AC5pgrjB*xk83t-BC!^nd-1SbK6s;qFM!4YFd|#M+yvv zmJQ207xd}bv~G~>t?47t!}@R=7Pbxs_bW{T17FFsn*#+h&!|2;WS+4N%SXl{`R1Ey zKlAs^sf5P95&<<|AFDcp=ehSvg2rVoXnY;A$c>VX623Qys{D8oGW#9lckE`A-6&Dv z(SL2vkIp25YAaWG^(lWx%Q{r5m~NC7P@8i!!7y72HzfDzq&hcL=)!gj_NP$3HT4Ot zderCq9l2NXZ|W0r^>!wm*{ZG*&@ryoE}$c#=H>6et+T7|)1p44p2VdCnWs&!H~vh$~^fu^$EE;bQwp{1Dht7)!fFyZP@#T8n=!(BV?kjj8 zMFHUQ3mE-V7G5<55#Y<@85-eiW z$za4+ztc$g*5_bXCfldmlR0N%Sj zUSAJ9;YFG7%;o9LwQ(KpKhIj6TvX(AVuX5>vGFBq_8^!|wrDsG&lWcjcx#r~LPfG2 zh$P^{t-2PCB4)_K{K9e>^QglyF3OeKBl9-P)$q>66Y%jyVhI608hEJO@a)B|!|@xA z1p{F}!f!$W_}3FFa;;jTu$yZlDqMtKYQH zEiVpH#crx-Cl57ZpfC=XMVTctsG85Lykxy&8(Zy$NR}9DLUUBJYX8*BFQh9@u3)%%nYVeI*pc z1P!cX%EnR8gO5yO27hia2(Nkop;;aTM+nTsn|vc@-DMcqJOchG+^H)8L`%e?f`ZoP zITu=9Nx&nFB@mCHMfx6|;0ny%pa?$D_}c1fOiW-h)>ZHevF-46@QyIKr}vX%?p=-8b*+tu5sYOP+$SENj!jT ziq;zNupq7hS`W02;WO33NZ>YoPK$S2IQIP6h1{!=IZwqU^hI{j7o%v3z={yyb0Hzj z%AKs&|CrNBkG4qo5Jh+(ioK;wmL?QX*0krzAm%#v;TudL=`Ec%2e5pop}K z%NDkpm}^6Jo{uF?MX)j>;PnEp^re+GjOf8=9LqF<*hT+ZA{36pq0Pb(#C1Z`^%YP) zHcLg4apZE$m?&ZjuymRKo?I>B+e8t`BO;jsM3Mz$3mQfIktq;Q$o8-jAL`}sVutSl!BzGVLXZ^CEakX6W?kJ_EpeTTWNe>%I<2p`Pwef~dQ{fny)Uf!NK{m5SP-uYYSGgZeQ%>S1& 
z|4DN{9Pr$n|IV4mHphF@x299J=B>_j`>fPHn`%FmaX8<5`_|icg>6Si##x_s_DjzG zlyme^V`r-CrJcr$Dc40UKkb~BoYQINg5+HIY0I|r8*ni7`EJiYbag*&Zc8_hOU>iy z<{7DZ=C=8}w7c31L@wQQt#1p?>VXW-0k_Fxw3hTuK|M-lJL0aLh2285&IwEl5KPDc^~#$xwTe zXrdu9VTh`@p8jdBjX)Rd*>zZJiT>iqh_>o)0T~xxyqXNUc2?$-ILoD zofylsy-%|DJ+u#GujD|M&0Kd6)oRalb+h|p%$MY?B!=)GOStN>FZ#y6ie)SYr^vv2imh1+|i%bj&#d})H0E7IlA3)G+Ro!R<5%*-7zC|%%nR`Z+D!|mQilG zMsm9Q!glwCtc`N*oNpvOxF8KKqz8Y0d+_(O6_i`a^$(@{j!S*V(|xaQ_r02RP;M1> zpf%k%A~lYr8>hA#r?S#CF3()=9Z8uBH7)uYdS@wwB)NxQ?E5 z``C8-Shk+t-5jQC&k?ETNV?~x?Vgvi4V2r+VaR9&#W zwvlWzy|*x2(^C6%y8YyK`^ju8<$4&@A*t<9y6wbv+lg!&a-X$x?W5_oh3&S5YzN+H z3if3CIG3A3eNye?l5_m;Ty6|aUCWj$-8Ln)O{LrBq_(+q+q~2^zteU?sy&fu_uiiW z{+Uc|^KXOcQ(zj>wF6S^K)QB9s+~yJ9+PU1J$QAy_FSeN0N(==9e~ur%iFatXWB>h zh4`CSAGzAFLQfcKo^ZxG`tTjSx97imCR2YPT|Xk#kKBK8yME%4yX8lfA5>9mJ8mViIdr8W;dY+hacqV=;o@oV*o$EMAaE}@Xvqn>c_o>N3%hQ78S+#38KX<)| zzhBh1Wx$!-Ly~(a?LH*A4`muUQ{9s(_tZ0s$xUmzq4|rRkxY{(-83pSji#Gsr6#zh zd;VObpZ~IejVT^iQs5T<)~1Uvl-QUE`8# zJT-X^-rS=GS^*8sY4@n)9?hCK40!?2L)YnNR<60TsP6r@AIxKbn`>^DVQ^i|DQB-X zwyFB*#}h~Hnt$dVeN4R~y!S)f?jwx+NvUb_Gq2S2+=Dl_n@&E%lJsU;#_#j#i5I1b z7gOUeWm=Bh|GoQX!9$vl6Ax)^1P@`K2IS{DGx3X&@khfG_Y>*Km!!#;9u8mJabJ># zFWx(Lx9x7=gRZTyk6Kd07bW*45II^$?0x82IoH{f?iiIiM(@w3$6k=eUfAilaQo~d zpmMj|-H?V)Jaj*wgMY~2YrE5~ImtE0J`Qz*JxogO$)eBowA74GYvQVFA65-LtKq2P zDamz;f%Qr5z5@7J$vw+n4@&NXY4@Dup37HHSM^9$JrAqKb4ZR$uHy`t4dSKr_$6ta z`lPkihi^{I^EY44at$vV;v^vceY4{YgLRjPODZdgxlex$S+h{Y{REkG#cg?uFRz8c zGJB?3fhbHjhhYbxriw1;{}h12t^|o0?9vOSWqx`c%&H}Uh2QD4z?-h31+gKadChpT zQtCj$PG^xSJ3#D@oFRb zY(d{~j~|E{U^CA5AAV%Ldcbchq<5Y3+RrEXW9;~l=ahPUh;%^;?+zhgONruV$7nR= z(Fah7#88m0g#w;85Qhp)SIJOLA`NOMk!EKP`i6bOZ}^fk^Ga|(Q0i-Hq{(*0b=y<) zcHQ`LCdIOl_1R>Frs>VidWM-&m+UOkO<0_JQ3!ic&4Wp3uXJu`cW^wZySYgA+2meY zc15tyCTlcWU<7#W;DBfNb#J*y-(0pqZGlF>G!{iCBZHmzE+v0R$sbX2gp#9_97FQ4 zfr<77>ZN!S*~xw#u{?LwY>!%&BAs#o$tL&6(ecpH^{A#9qGI&|u~cUzSSkXZ5Ij|t zwslLk?v%|3K2B21IjQd)306It?0zl-DX^+vs_MsPgj(gS774NUs}Vi)sHQ1XGt8vY z3CTH;s+qih1I)Yn#NL)hCEMuzz_x9Y)tO**W+dlKs^;)#=cvw^y>*5p+tB?sif*EI 
zMp&Im$vK&-nfk1P>O8-@PF3~o`Lt_Na!o#T9R6$=$zB3elc)ByX5y-5na-^ksOW?l z9Yoo!kvSDDZg-|j&z2P=ZmV9zof!8eEt7(r-|=n0i_g3Q)p>IR`P4o#f-@S5t@|6s zePH1QDu%ttBItGaK=IHQB)w6I3TXW+(vD~DwZ1!*rsF5OeVY)By$|~BN$i> z1QG3@JIJ6lRWMj&@5*;MXEV;9t=N}r8eTG{!X`*Z3zYAGGXjc*co$v4KvZivjuWVP zBXumAKdO*y+_$aSWIwtRB^w|X4(tQ0_f=K;vVC}mpQ?Ji!SNor(d0k1G@Uf3||xLgp&$o&4J zypbJh(Bv*u%U!&}HGKIf%PX+i6kj}eh#vj|NzA|^pLANFV_7IOFlkPwe93stAUg5~ ztRSm_w-w1{z~C)O-qB|CD`Yrh$BXEA4u1vC3FXL=GMFCK5aywCer%Zp3YnFLhrR|| z^*KZK*ZVs8^;6(Qcm=XbZoS~N4~|_Yuu8FV;4@H+>xtF5;Tdp_1Y53Oucg`HWY5=x zOdeO8N27{C9vlA}@m8$aVa9UU`N5ohUvJuZEjhEgUSW|yf^|EMc_RKBO8ykd?sfZJ zB9cY!F09fLavU+f?Q2!%2z>ddHzj(Lat7P#HWT& zaw|M{_|TZV{TwqQdjk1&qu9sU%E3-U{*nBCe?2B%6-1qVG6D|YvA;$kuy5oY>aUs+8e8zhu?fs_Y{bv55`*6DZ6{-7`JXq66y6L#obo`ma z+E$*jHp8Y^)s?b#DtnWKopXdg$xP12UA6&DI;{Z--^Q%Ir5hWBD0Z6p>?*Vn=tE=g zA)U3MppHo&5_+n+P)`Mv&WaJc^z-$0G(hUCqfa0)&}E?h>8z-Cdi3*o3$qZo3wlzb z4+%+JJ3ja=!!pO0{)yp+0kAqNl~-bhUqGo&REPLnJ`U(6w`AbU-nDKR=C#;IwJ-6Z z0;Mrbu=JSS>j*tOG!C59BI*#x9RJOD02Lont zc*_!ok2fHj=}eeh3W+!rT#euu7k(!o7LT(ShFxosRv1zG^fHp~8O~s~lD_zQH)63M zYu(b7)bw^9y)|@=`@Z3Q*p9x_Yv$IuKQ~=Bh&Rv-m>Kzr#cxxx#4==~@1X3^!UL3F zB~XkKM4`vCsrU`c4lm;n`&+A`zS~9AWd9IUA(kCaVba2 z5c5h5O3uMl%}~~2tRBwO_9Z+grwu7<>tlFQ&)hn5cWN8%Lb5;gZddg_a@9Z0%@XKd z?#gFycvix$4}q>`D0T4p)X+(Ya;FSu49poZB-Ib4>*u8Uxpe(0ss2=|?hN(nT%o~8 zv7Re}UT%ZH_oH1RC+J}pV@ps^MFMj)B>(Rv3g2484;6^SKJfDmA^I&S{76AO9Ls4g zarzaFwJ;#SX4b%GD2j@L-aM-XKRyso2&)4WN2KCVfAc^=q}EYP0|I`T!*ewnyAERs z{aA{Q$ObzGsL1_pLY~4xw#KT)2Pk-lAuK4<^e6_SSvccC{G0*|DXIks0W~btvzNJSo}Xt3Y*^8xr& z#`EBGTId*yMg;Nws3;Z_Uq%8VTcXJWdr8&`zfFYF#=6Pn!WL33Qz^$($~v`|R1(aI zc|QFXlKn^}CeMEDz{qn^I%Go~EV}eqJH&lmmEs0|<%+4?%uopR+r-rGQVd54PIMOp zMmFr}XRi{CIGeO0q>K0^4BZ3T=9`Phnw1S+uUQ#I7BkcbYT1lHE(=g$15CNHgT6J* zuv}fn$7voZn~BvAS$$RX`;(1(Xh8^-QuZ%s$j<{ChFpo)xLJ~Mw~%w%J$}dh*;sn! 
zC28iR^vtW$%&Rvo2m?qrj!TW>HyxSk{;d;Rhwh!aw{~wOHF4^**3{JLpSGnOXH(X* z>~nc1WY7>roS)w>m-}hg!0WSyiu8jgG*98ApT*l!!LupkzMh2K~csJUoDcD`8fTWktkT#1#+ z$4{~ChC+~)xGfgJ#R8Sso=$SG5+cO(d8XO0&qPX#$XpSc??JXKL7z%67lil%k!h#j zR|y4`c_3C(6$;G~KcwU)C1k-9$s8b(&Lfg~EZQjf2@=_g@J%qd#Jc#`l=py=zoCSd z02^|}4%kJnlu$q7Jy<1w$UQ0HEF+KGx;D?<9=qFlXJ*ITl`+qK(cW|4{8{zG_L-Eq z@$Pdw<{6j=EHA)3(6qU5bM&_B*7S}U8=k>Z{kW%p^F+$tyk(Z`ojd0Ktg*y0{G^ie zjea)v;Ml{#6Df1cmT8Osarut<1l2eKf5@TP&9k?A(=|S+#&_Q()r{|$4`chZoHBr( zw(fiU{Wl+a4uPi8yRJKE>JU|&e0-pF^W@Fo?fF|*cFe5=?}j&p(Vl}tn2yoX^`xBh z`tI|ez4@^BSjyaRcXZ430gxOcB=d&HUSG=W*#fOT1PxfmpV+yszWbf%(b`U zcQ4&Z?wDr@y7P-p?>5O?kbNDFHJ9p0~9XPaOKAJI~2VDPP z%G|z%k1#+joHror;CRa1bI-YB9-}T$sEBVkW$xH|dB=Q^hQNn09z00Z7Iw@-l+&8E zR$JV6gHJi!vfW%sP1bnW(va$&$Z~X_IrAmmpPJN?FE8(jLMCD*;0B^D&r|vneyh9$~BfQRvC9xMx2qQ3l+=VUzXx22acyFyB!j9V{bz3Fr`tQfleny7(oB8b)Balcnm5miZKXUwz5kt@M*tT zDa1umlo27$@WC2GwnGRvakj{1S3&5fPc_Og^NCztSj56IMe8P zk((jrELFldMF&b1Cn~}h^Ff}B56NHm7c4~=wNau3Nh?9F|Bdk8qJ&~6p^SxNmjmIr z_@623T}u9d@*PaKI7b^Fi?bE~D;3dL7qKtddRW;h=ER?)lqAP^3JGLQgW;)_GuZx~ ztNb}PwZl#Q12-daGoN#PpL6a1lN-?PS_&GQKg{kUu^WZ<2=XT6<|J~g4xV|xE zc4ca6H_v?GXvkRIAM||WO?90|;73)-=E*0RjT|tlF^9*Dn;~(hSAH9fVxRWJM)8g?4^?1!%hS zMYcz?nh8a7DJp7?v4_{yBd%pu(R{ZqA5z0X4JJMW@weUOLX?!3T<=V%sid&n|qCo&C4|E9>z88m|Kla|z4=deOb~Yj^XP?&eQq+TEHaTXTTX;G9b+Ewh4C>e`fo z@-=8p-72MD)|p&ZRw!jEr&Pn-k&DJsW!7~m&B{{Q#TX3~TfP{I(&`RG3qf2`X;pD; zCyqQ&O6m(x`2p#RzTvz#{%tIbe?CTn`5K+XD7x|Dwcf#JPhUAAy;%@My;*|Z=uOG` z)BK2B@yJ~a(A77miH-G0XY zMBwHj04E-3hWU-ybnzG3Pf`f|odNzA_7?j;gT(`+B9+rok8ci+Z>kDMp-iNRL)y#h9}W>+x!$nBWi=)X1yAv6#dO9O2Pbq(<){HTH;iohW_*_bGLmZPbvsbXDTRdh{NY?HuN3>HdBS1bzvaRKXP zn8PL0C~K8$h>A+xHfeONLX>J%Av9X41F2EMG-}puT{Ez?(W_S{%OE%p<2kid=77q|cMNUOU{>NaWqM-6c5r=SC6)wYR45?xP=>DfLJU|t6hqHt86)!9uOmDCmh4XE4zM)ql{@%gtn_T4{^L}`*Fxx!$`FMBu zO@0Ao1?A5mJ}!CR;@dap-ITs%O%LCjT6#h}@;VlBJE>W21l9qa2792m9+>Vm4e?(t%AV+Hh%cCIkzJP0+( zEI%(iAaazD%EYX~O+YmeWaV*n2}A4{6;PU41cRL*ilk)?tYkn8D-dgkth2^!0q7D$ zBG?ZWOBli#AJ0-TRImxuoQOEHeQcqYFW$Pc?_9TEo*%@w2|UcHgaL->p{PNB5&U 
zse${ceU<|>iA zWwk2YE+9lsZF0wDD)@q)Ge;rxOR5 z{)!B~gj2$~@nK%c!@QuA&nn`S^ zi>#gb&jY9ns^q^AB(J4eIhW;o0n6js)k`?Dit#+dTsBEZS(4_0+vSuc>!5LY-OeK8 z>^`r+Km{EebU>>zrC&g#DOSy4y8;u2CQq2JY5bO=*YP`qn1qTere+WpN`%E7VL_4= z$S_+oDRjWG_Y15ozx0p{!^-$?FU-o4_oVPDi|m%1jhUB~lmAcq7Ra!^h2bvxHjH97 zxZOmmojALlIQ#g@6S4iqT=R{&*2MM3jSsgcKHN!W+R2$_a^_Fd?MsWzON-BvAMIu? zwlmk7nQN`g`;F-EZZh3Yo@pk}JiNJ`JiC+bZ>Pta>9NLSp_wlHqjzYhcjAj5wI^qr zle3M9D;>lYxXeyw?DO&8jyKNFwlY^5X9`aX9f9k4pW7FNzU~eZy1F}Y1bh>(Zzo>= zV)}QNzr5V|!D93L;#a*x9iHo%I)oT?5GS`2lTSwfIPv?5r{}h(<~+QKBY5fl`w21% zH=ob;(WopNN)^j8jmt9B$~wHJWSW%akL!x=okV0=HA}MWxRaiI!RE8>dyJ-JSuqUL zZRHl=r$Go6mrPT4!+}MD6E9?ltyr`XmiT0Z4eaqlSfwRXFnoy|9i3Pp-E5Z)jDz(# z3_nMEVHCQ~?T+W}UTt)ZwZvS97ee3bMA7MyhS>j0wIybD#JQc3@rF3KRcMLhV6o5# z-0r!2Lp<|vx+Ug2JP#Z?Vf5NCqbRk+(;m*qSVKIuwa^mB*v9Ey3E+XPet>txiybi? zI=NNaM=K9sX3P|4HFr2&R1jg-7<-hI_vN=40m%PSGP6YBCN|1 zXbi5|$P8OCpS!sgz3jUB`{XUyW8KU800xLYj@yqSF7-Dw{7*FcH9Ge-%D`gx6xgz` lC-6gj1D%AR^YQz~zBk`3x zL&;+3g_;_u(hVA=&DPXzQp>1;(6HMWXn%Nt0-K)&`iEMnwK}VGT_iu+f9%pfcDH|e z&K(X(sah<00p2_Jb?&|Ao_o%B4*$8f)z@Y0ev;m z1F^PX8%N?6xgpplAC%i#Vbj5O3z2=2OKyMUerPdH5$upWa)(qacSLxp?lvFngnGN= zl?7;XM(k2Ov>lQf>%+Kl0QB9`eR8;ocb;oO?8F!{&xZ;PDYmIZDlX~P za8%XpRMt{79tOcJC;UeC7VVJTAsI4G{G3<=^5GI7T@oZYoS>oxOwwjVO_1Yiipr`W zMx%mpU_prsVufqMj5rShAxFeiR1@aKXi8QYDKRdUNQmJXV=jyZN`jOKs~T0}(*YM7 zIU0e+$kyxMN{Qyiwg+~gG*x= z!6Z%0D-w{HS`;RyWi2$f0R8Huu%N&(rero~GPx*;z}|2GxqnF#lW^rdCMN@M5(TbR z4$Z2GcuzR0gn)z}NX#g39F5ANrs75nTBZI;W7F5hUmp{UBbLU)CRAh&BWdi*Tz;~| z1!HA%5Qr)=A}Ldl*O+k>JhqldMAg0|1(`_1Vv5#h5`{X7b0E3McBF=>lGIL?mIq2o zd_FM;bfgq=p0N<9^cqwICSYoSO??0oK^s9lR1_BBw*!4zEZLV-lD!(RK-PM1rd~Wh zF!!R|7geVEl8f3*BHlM2jmG+nE%c?}&8e^v-EK5ceEtWJU=3tTq;PbIN--b^tVjoYGv`wkcXvj)ve(OmL$$>{gBgaW@}~aF!{t1wkRZQx#1<` zmL+4EMcLn#GM2xzeB8j3TUHpsWvufYJ*eS!S(2dbCq%Ml$SkuIX1h^B5C4U&vWtv0 z!{L!{@fm*BvvVxgZgRJ5TAkUR;;2V!Fl!mxqcWv&0`vEoJ(UvX+jq>rrtLR-Gj^@X z{5G{d4P@+l?crjtk5x;46$LwCa@tX$RBNLU-ER%{n8fbdi?M1waXV zON)g{`-LV)sVN;0VkuQKv?lDnT5k%lWwW6)GvN17L?C(%Dyb1l6O`inpl#UbdI!}l 
zYt@~h5S~61q8-@JV}^20JK0D@RZ7-^!Yd$t2YcZL87`u`z+Qw5{jR%_j%3sfVn{;8 zWs#APXbaM;0rr+gbwm9)4*DO+mdJBE@w8;Eo9=_#bZ|l!H`}7JG|LE zlCK|mcHq$6zSX`J2T;qtyXifacOLuB)wt!Z`{cq$7nTEAE_*cFl<#~k?;I;yEpB^( zSZwy+{Hl}eJENlhx!*W6!0YzJ)T|uVbQ^M0Ww#J6>ns?BBe7n*3KFl1^|sFV%TDAxnwh9qG3sxeUE{0SCUa#NG4QO0bInMni>!$X3(}UiycWtVJf_v zh^&QY(8!cm0MrFbu$_r?O3|nYk9ZP_xXR2MRReF}-b8qZl3@q|2b2W1WS0CE-2wOk zY3dFTJ^b+VvMu2CJ(9C-IUO1vNEQLLxD-bS;P@lE6Nh#s`JNAQq=?UN+Lj!;HYY zvcLul;xy910iaK#(ZqrQM9%b{?gs(839vz=s?Y-jsoY(HeK7SVs%97h7)BKO&-KEM z3+ke($uWNk?4Yt8WNYDu*%5nSoBlx%C;&NlWG!K!5(6W?#h`4}8~~*VID4i55W%#` z37P^p2s{%@dk3LpYWbuvd0C9A(3!w}A|N-|lx}86e^2aJsM%DP>ZMH-&*TWJ8UO?Q-drrQ{ERJ?T9yrB&Dn@kTGk0``0iJpTouftD03K>H_ zUjNdSZBN}NqaTf~OlP$X&(WokEq8sE&$|!i`GZW#_Li@HAaRFOWUB&kRBn;0CdY$b3o zR(bTIFqN7imIl-3QyoAPRXbTa4lIH6#N#(5WVgacv}?RF=o2} zZX$yXxM{G{i~bnZLO2us9{M5-5OEMb^(C_H7S3~zEiMS zYK|4HJZfd$X8*t3EQ8B;;QFe$xz|grz{$NJV+G*s1qv&-W8Q;V?ghmp12A$L8fTU= zoMwKK+4}Nl`NPy80EK;Kqf~Y03P9Z|Sjg?Qs=1?KvclaE2>C+giyHXt6yLL4~bsF4-j2++w zSiD<~s`VUI{6jOkE1B!TxjQnpj3wikMU1yAU4Z(>0Ujji<0B;yVofi&P_179^o}N| z?C*m=i?I>#)+1niD)0cjW>uMvqmR$Dj97x+QweFYWH4qlEvXLn^+_Ul_KF${faBT+ z{%>zo2}6uR?qe%bG`H#YD-6DyXa?d427qUHKHzWCt)dFRED5$t)}3f<7O0ri?GQ15 z9q0})c}awB2yoRzib8~i$F0yKKwWnZVX$aej!*%r6#YTn4hjtppx0cA$4j65$jv0x zZQ!1#U=hGH7^~5AD_A+*83v6R2e#-A!=;8;jq#!LV*nR*iz4}*458y#H55-svTjYJ zG@TQ5Zc4YpT#CSvjzXq7Awz~^GW2IHY(0bsDTK_YZ^D2%9N7Uj0q}VpsrP-d_|f9$ zEq6OtJ0CSa^$)K52cMkX@V~Or`Ue}{OHaL**S(j&et+FNx^(qB0M0Y_tzUZXc{bg> z4AQ^2Q8$ox4m@*tmWBZBH?(G3?zXSCKdtZnh+pQGPh(qK_VnlK-Sleu(b=zFe(-X> zbOC(|4TJ?oO{x-&eoP+{^&}Z7JkL= zLG~l=Rl=crKyHBcy~Sm?*jUUxxcD?heP1k6U=MNdDHkz&+p~p2=Xb z->O>zrzmO;11952g0NKb<}OUjaT#RxVj3)XNQo;Y=Pqy|ng|1cx)>;r_z_H0QPchU zKgq9`NZxyX^`#Xtcj&9G2VIYc9|k_-R$uxJlgdA7?BueJPCy_K=z)jEoL8Mk>Hr2B z%UeNy3zH?;-LX9VK6HHsKNVFxthVj;Zg|gs-oJL_^OxXb!+rj#`{KI$VxGTf+#L)n zC-WYQXr1m5nLyN@V8mXwMgVw2oS0vNXGkBI=@;2q#B zoq!Jt{}qr2oX%rZvKqa3KshDBv10Hg-C9i>^;Ws-$1tBcD@$bUEZh8;-$oBZtL_Sg zVhJe~1&3bughKD9#AwOAzly9zoy}R- 
z23F&n{Aq(@Y$V3=^qRyxc#jZOR(Av|F}!DmUlDQn9fGI}-Y9(UPNp&dI}Gi)9ClN$Y3?!& zUMq(z%LpKF4T7J)V@EvOk*%t?yc+D(Ue#V+54;m+)&468$d=*FH8FS!n8V8O#gBPX z6=oz6PJwX8eeTcEZ+48Oz35?pkuSvsCv|QzUHiIuU+C@7b~s$z*W z3yYS~9tYFhDy-Rau8o$n%eC9?=4>rEQjaoEMmGDthvrwlKLMt>K&-WO zTl+fi^K0+?jVJqq+?%;{E?OYmOD)?i2eUUeT2ADfPv)-ugZ5~4PD_FUP5w2*3 zS$W@au*~}pZ`HSN)$cFbto!ShM+#1I;#A(*mc90KXLq5Ubo4#%fVj#^|GN8hzWo&D zrAO!Q)g2+kEfC8*4~M2NWA+kcx)=T}ivl#mKk@?NRG8xP(cR@3Fa+9AHBbz$8PQLC z8^-0~9qBa~17iQ7lzNOZI(3j(tP={=-`T@BE2pEp2&z|FhbL{JtaWwMUm+|K@5~ z@vjYTx&lj%g1wgWt~d$=itLpl76qY+v#$&m2o!7MtjN7x#9HA@4d=@?6$lhL=VRZ4 zTK4(1l@0*Jv=h{T> zNKVc5-20;_&0UMq>Iqpoh k=Q*^VDL5P8ySqT3$VrdSKCwO?co1jRubZD^D>DuM3twQpw*UYD literal 0 HcmV?d00001 diff --git a/config/__pycache__/vllm.cpython-312.pyc b/config/__pycache__/vllm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d8644d9405f35256a2c869e6c55978573993b60 GIT binary patch literal 48524 zcmch=32+-(dM20vK@ucD0wlrvO5OwyP$WfCq(n))bW)T`qNFPJ0YOYi0tJxl1SpkY z%9h&YZB?{qTIH*%x7*Vi+q0{7*M>EA?^-kCh?TeP+1{OrT_j0uva(gSdZ&8riiwz| zO69S8cOz!M|K$Zz36-j+Vi08J%Xj|o`rrTl_y7Ove?@(=V0&bxoq~%nag4C+__x5?NMhe zZ!XV9d1OWNW3D+Db33B$n0wC6+}Y8BSm9hDbLT`oG4Gt0xpSjMvEsSnSjk*T%s1zY zmCltie_OO{t_=A(qvf%Rxr$ijTxG0ku8M`_MXO^qb2VDL+F0FO9dqYL>tp;J&)lwP zL#%PGG1fHK#J=6p=2**I3v(AlTVri=ZEzO~o@jf_Kj&v|Z?q%UIoBEMn(K;n&vi4u zqG%x2GuOl1#nD}{-nm}pE{XQV`sezY+ZWv(8<-nl?$YRBY-nzXxyzz^V#9O8%v~NG ziS3=+%iI;weX-HGQRc3U?vEXqJHXsk(SxzExiRLhj*iDB<|dfCCOR3Lnww(o+UTL! 
z;km=HBXdV=T)0>)K^xUYkH(J89YZ)L)JKoUrst-an~$D|&CJc%xW$}s$=osgPvd{a z&V`%8Cx2j5N5b5x@Dcdb;J-p>c)wBif4@n0*}21<(EJuBwEV!Xrul(3i1hc|Yl1Cw zI*eyBm*c``gw}nC0l(SsS;Mc5{K9#8oX{_{Bd%W$yW|i$-pV%eItNG!g~Inc_40+T zw>Ts8ywDwf?UG#xyycjC9Wi}Ej~;LC4dlI><-IFB7k;CXVCW5hDSVc>7;H%Wif3oV8&hei10is9vGXg&<5QwSx( ziAapxc~NmCI3Jq7LLZJ(p~MxxLn)Yjgs*@uM?`m8`MF8%pl%jpgvUkf=B> zhZDh&cscIRQt~I37B59EPe$ewO7@{hI4UTf*pd*A2Cs#p5frmTMd!g0nvaI!aeR2D zLYJfA6BLPZjQj0MRd_LUF&YjgM8Frkw6Zv#h%7Avs%TVkMHVB8V07uSS_^L^9u!v= zLy_Rp#f9LC7(tFMF|;Tw#eyp<5#eJR5ac&ORZ^h?oJA8$vE@iKMERDvmBmZ zp;}%up)Nm@5Lf0CD`HqUxsq63F+jB|?lUXP%S&P+ek2sXqU6l8zC^{nr^GP&BC;4Z zfi8Y+CA64`B-5G@F$<2LOKa#^_>_mlixDIiZv+$6y+J^|bUiHiZDJd`W}XJ9{&Q-M z=l=($B`D=Y(?w2Ti~{jzYuE_p^=Ays#6YIccH! zX`!xgK08a)P`%2#%rXh3)dTI ziVR;q++g??8@`R7W9x(XmVzfc+O z5X!?HK$TVa?i2z-HQrs}Zov`G5%R-cgle_Alq1x<J>7#{)_Bq09!9*)KG{#m(&&n!>#(vlkFG!*@Vvf$yNu3K;D|8+?X@ zcKGZ;oME*!;O~e32#`HDw-=*upV0Y!mp%%GT|##nM58zC{$3^jTr?U}r)}~dP3oAC zzkVe$e}%`KOJJhKc}x@j8X|U!VNBq#Kq0`aDG^cKes23^N~dsJP*uvF%%C6SXQ)9PI01siBh zE@r!#W#d-u3)w90w#TZ0^bmtvpnP`x#EuLpEn_dB{GC62YUx84^i`2{p2Bu$7*)PK zx0gU;5&H{;#=RX9`g#d87O}rjXbL|UG#0VHP-wiL3mS{qUnn%ip9>m`*k33#J_~4G zw;unPfflh}4I1kl6Xkm8$lGcZROsR6Tc4-;`<3Twm6?a`ddUp2i2Z6ceMRh1qr+gn zUjmIq>@N@+Yuw8%)S0m9u2(A+y~*n-%(rOEBP`x%`sl8g)Z8NWtHJP!{m`t#V7^}h zjYaG)6q;5O3f=V*Xe?rXq0qGJFqm)g8;MT+3w>!3<4xO5N1{uQqw^TVqlq5fZ(*12 zGT(`Q-Mz3|cbV^1hu{FYt??@shRhJ%rLW>b&ilDWykR}meCzRUW-oeI9WQIGo7vVl zYt`6ly&~qZSyDE36Mub1%a}OQrll5JFv#M?%K(~MB#d?ffy`ftpp_PR(8h^yJgs%!q_$w&mBkxqKkQpB zhA%Cl!7Q@`O9yR|2#W>~8A?#MGE_B@CJMGRu3!*-7bg}{P;tyGEry9PkrxN2BX~7@ zBYp^9o|Sl5K$51XZ1+*Sow$KT1_%Hf(??zlg6Ha~UxWM{SHtbXtKhO>Rs<&j-AYp$A2 zcZKY(m)!OD95UZ4@x3y?U*h+#^9LWh$DSI2IUDY_UpP(unyY?WIlCo(x6F@8{Mb4_ z@z_0SQ4YW6;9c1^j|7X!B0FGA;jV*Ytqoo3!xj7K{X!0Ir*H|Cwp2YPpjN7A&^ddW!GhI5bgeKW{>y zyFR1rCOw|{Hd%E--uwA_Eo@w(MUP{~uyHqBH(Y*q@{U2%oCzoRm1SN(^5VgsoF~c% z!&oK8Rzwjb9o%9#B$&ihlfLPZe z;?iP_GPxEKBXrJYQcMbOk%-NqWpQaf9FGSTXHYF0lvFU32qql?oW&_NUa>FSKXL~#IwxKd(Kkm5{)W6PH!(Xf&aN`gLwh^W|?;)-J# 
z2azCgmLiKvF2ab@%^V=xL?wrOz;w-1K|_NndGG=s7S$AKzA0IYOV>r}b#aV7ow3l> zFfvCbcrJ!wVX+AQeh0%PN-k)cAeHSTXed~5qMATHi7Ro+g-9P2%d})Zt7HwQf|T5q zhIw8o#ZOR5d<~CV+^6}R+bg@8)?H0c%If4Yzf|U5FYCN>_@lClcQ3zl`Q7L{(R)Mh zAO540-#@wDHM(B5|IXp3#eF~Q`_tinKKv(pA9QaPSABO_t`11mf%WQLsjA)wz8_xs z)962s{>fr$VCvEAR+g>kxb1HWOW(eHcmBQp^}_nCT&|$>QzutZ_3rg|uD`qb&g!PO zeA8RDS%vMUPj9QoAE6?ndN{O#l4wSO=7+qvKKeA{#1 zxn9xx5H!x%6L;&qHy^uqZPjz`W)+F1V|_wG42k}4*nYi><35ceZfE>%ZL%Vc@w>f_ zIF4vm%t2gpJGYv(%B|Y;>2}kBJ*&%zY&dk(gFS8b?Nh7S3l>P!h=Uy=cE$!mH8L?y z?XRK+Vt0bZ4f_e%Ksgq$0(pK$PKc4anuDkFcXNNBofv%GhSE#)yv?@(UoJ}Xnf|(K zHTN1$R!yj^W1+0PjL_z<<^oL``+Ek7vHRA`;b=8~311RD=QzXN$nv|AHHU%KXt%~r zWV9n=Z6g|Bo3!uaj2ZhbLTy0^B_x3EVssegrCFSQj+3mI-I-4;j4g)1C->3@dvPhj z$JIk@?6P-a&BLe82KW;YAc(ER`1#PXcBnqWC(8o-goawv{S{jv*N^1_Ko6u%JJ#!1ciCrkp*tAbNCku;#>*2`d zD~aGrA`*>@zd|{bQuFACYDRUG9FR$o#e`B!!LbMlPUuy|;bSPe5}p#pB~huMaHF+? zIMxfQFk=@k(&9wu5JIJ&ic~9tM#pRIqm`a)O2ea>B0WQXaS!IvE$;8~xxC#^UFEW?S#mX} zT)RGWpZjTb=l$u2o$J+y?o4mi_sI2AQvK9={o$0m=BcMX<>}n4?UrkIOSQY#YX|S< zZvu?AlL#`W=>V{Iads2I*H_O{nW$mEFOFN!tIm(N-I7d-2 z$X42|bxH27`(Kv34@lhyu!q|$sZY7zSabm;X;+{LXJRjCHZv!+r^4`NP7P^}ew*vJQdl4e_x2IU+T-_#5>2ZF-Qe z_*?X#j5KOU^)SMB&_9LOBZ(_Pqp6aWX??Oi2;z)foJd5rzj9ZmT+L4^YQ9(S?Sgxe z^@?40j&8R0-#NPBZu+Ra`rVuF+`Km{w+u=xgX`r(cifu|9e3Ot&bp60Wp@+zyEZ&S zDd*6OyD?)xSD{ru&xRB1dfP4A4V(3%c*~QZeIEPi3-Bsu_8d~_!D;UWV+Ut?q8$4z zm)Sdl!2riH%Pm+PNLw5TW{7FA=S!kRZenZ*SyypLA&v_U>ly})ec-BH$fi`QSxJl` zkW!XJjTPy!U_~T2gd8FFyGAW>SkMD#oD76O!S#>oJagjOr;Tot2Y;FtX3!qjyFliE z3Nf~Cv`JH0_)C{A(K1eZxj42qcq1D4=lQhg(JxJGjwfEX$}J^y!$_wWzzogLFNu_2 z^oF^An@Ntpv9!X+R&bnvVt5F%m<2O#3B-)Ri{I{lF*FZOG&|5hqJWaK$Rq}pl9L9| zV9_oD`z?tP4CR9=LboeL$mik>aP@=BVKE3vnyX47ak%G~SArL#OY>K;y^6+|n2=ID zeL}CIn$x-7S-(A5q=jP_3Gq%!nXJ;ih+%GcGw79EDsc%isRfMXu9RBVL}i^5XF)06 z9)UPi;x`b$nDvS;0S-JkM274Vg)3#?W{0DYcHH6bMqcO^6?~;K{VNF3T2p>0N2Qn; zwBk}U3`EOVRQIUhe@bAvshN-gwOL&J4tW*g^YTTbCF3OJ!$yWi6V(Qnm#~Ru;;)Fh zLvShCo;EUH8ncVga6KMC!)~s%Lq)Q3%e2%oopR%^A>}%|=`IF?k9M7nlB+T0>Uh#R 
zxZb+w&Lq(D)5cc0v0rNJmm3dBjUZp}SC?{~d{SJq>7B(YT2hErw4ji9$3?Q6m)yMU z9+2Dvn}x-4VWU*oC>IV(g~QLXYy~4*SzKNz=oWXwy(4R`uFd`wv&Z1x{c z&#SiQL6=m!|IWo(*AvSH`%_G{Mu3mrRSa$JJwefCQPbeEn#I2GyTrg0oHS>$a)kn+khGPE_lCF@WbW;-bsh_JyuITm zp0v(mfeyfT{1W?hkA;J}%X~AUvVApcVZ!v&T{?ypJS#!Bt31h^-lNBRXrxZ{dv{ z9{3b|dQGq)p;gAJhhH_XQZug%InBKY!2+Em?)Zt-Z0=dXVv$3TcI)hi5g*ds=35`} zckN3#BbTUzf~Q3KU_OdDzKH~^2ujDv7TCQ8O)&WSgybFrGA z3%7Pu2cPT=vjk_DzxraBUR$Att(8SMNwiu6Yt33EeofCM162Q&9&WycI>G$|+7IIw zU&~NNy;;WMqYQal&HXx9~f<%X}|<+jQ$Lp-Hcy(9C)_ zoszZMQMBmsGws&}+luW?n^=}9eBr;=%QoLPb5|Ylebzm2yUf~u!J?nzRTvs?)6duU9%U@2CtRB!*pivrOF;oY7r?*8*?d zE@Hbii;&=nw3Poucbo6qW606Mf8m}PM|X*x)}>qZQ2evTMXT91))3sDwDw>4JslqN zEwr(*NbtR4T|N`Ob`w6(xBtv6OLx7!ou~Er=(FDhq*f{Z8$EaPjhGhwx^Ul&p}VeD zVP~@I)#JZ949OhqSqcbe#0QUYg8$Xa0M~=4|5vOlVs}FFL#q(uk-TC`^?)f}g0d{u zi)gj4+S{yKcHxUBd2bT-*iN23B3b zZiBy@wXIZ(-e0ilul?zV+PWnR&ez^w=oxFLby0zTKKuTPvz_*sN_vesX#l z7E}Xhs!W{d^pqBTa{DbnZQ;N0ugx0huG@=<+pd3^@7rD0{x98K7JoLxi-U!mFG zue!^8$3LrOM?PoETCH;b)+|qViA0CKV%&xQT~B4c^)a;|nSQ!U53?>Icu9|!)-r3l zpZUHmCLZfwFOR9W&@T$-O&WKv*cM{hXUv4Xt8R0LxSAnnvke&6ecxNon+1@wE6`hx zQT|pCS*+nV3x(0uLagLPtA&>P{{4FUFe*sa+thsPr9wf$=0A`eJQo%tmu?thFd^t@ zGIL&hEBVDC+fV8QB<;nAVP=bmfCl7MXJBW`9$QwjppL2(K|mr5LHlSh8iGA9go~vp zFnM?^eq(W-*KEa*c#I~+djsMtq!9>F6sRbunOn}r2Rr5nRwY?Fl);OnpIEn1WM*X!sM#1N1zCO(P(8Ma47lvFBH2o5)#aZ zplYJ2ac`G0XW)={`-PVH%WKWIvsT>JcqHp()p|6^j4JyBvrOhnyoovEw|`6$Bp+wf z{Q13Es4HXEs8ZlKJY)h*m^makCM=~3#PcM9mgw)}W1&QBB?=Q#OZ@dE@v31h3gagl ziA7XJ^Dh1x^oKA{&|D-+))one!)=;w_OckBhdMHZZGq)a8g@*pL6`w3+*D+i}ju#5BKB18xo^9p1KE9I04}(O$Ftj_qBWjr;XF z2X}+}@3OAj#2~WNwFUF%Ap-{+fJ@6{me3y9rP$(32ul+!zNHG9=k2=;&Hgu+#r?_7 zbOEX~394XV9}_~0?++M}ACU}I6N>wXzvcen7MCiV{qvI#zjo(q?>Xd(9;u>7t{9Oj zMm9Vne}3}kOkVa^8(VBJY?uWte8sT{4Rpl`i~}=#OQJ}d0`Y&qBl*jdgmRdUKG|x7 zJulV9AKyMXamv39m8jOj&|@$uumek8T0tL>?LyU7Ks>;og!;P}VZw)oE@Y6z_(@U! 
z)X}Mlsq;tAOa+qXPAx4#?(H(qMEqYIKa{bN!RN^Lt5ZxHUF>Ek6=oE^;QU)BbPZ2x zBMs9DJZzw$tT5TTWF;p57ZVM(^$M89UkNQj$Df2&S<7M>K;N>dsRFhL)jfv&(BF_rjY7)NXoh&` z8fuK*R2;C#nMj^GNh@U|xf}UqZ9r(42sPa?*9m zY;+R(3`LV)WtnT;zru{wsUds_`hKugfhuTa{USv{WvJxD+3DG%Cr(a|O$Sd;9X@(y z_VgRcuUHO1AWahK4+|r@OfJ?n%}9f4KS6}JC5$|1rW!8LbkPEMs0CgIz0=+j_bc`1 zA@vIp*h5QX2voNWcNmyOwbrN9Fm1>4Cr=-rIvr#Ho;rPUB6wnI_Q=UeB|E+xy1tnF zx|R!#;j37_Ndg&0J+5KZiN{yay=Y+g?_B8 zC?&K=v<2on44NP*4SOMS4Qn6~rx$PV!u&GH^Qt95`6?`e^tu*SWv<(kc7GswfCx?2 zhcAr`YF2MrWBn)D@<1}LTa3kni_ncr=5-oQ@8k_AsG>zM2$)U6q)#V-j0GeTk0r01 z0rVKXnEBeW0^fA{fJPNacvbkD*Yj5e#?`6X64wOW8C4LCTqPSbs3+?Yv=CMqD6J)L z&j6Z8cs_hRf-zw%7z`n)gdpLD`A~cwdTr*!5Af%SdbmWGlgLCiY8dj?5&<_r&pCuZ z7*OG1^_MUxI^%_$;z=0FHSGH`oS4?BX+an`;Q*T*6DLocIyyb2*46-7T_i2)f|BDr znT+W&>&GIGVwEo@y{7OoWY@1_w*#~`e?uwL{H{P%fHbI=RxV!wW=Td)MCQdMRwLFZ zsuCiabvwOHv*E>D)nTX>wj*O&fBN*6gm8a=PyXW7?D`yp;gdcAbJ-m z9h4jRD>p8Jwt-56hPX5f*9OUcfi0lAxO}&YMlMjkr0l@#sH#*h3?i>!b`Yq*<-oJZ z3E$WiGD;Yu4+OD>h2t=0O#oa66}NQV9K(bpPK06?L#7u)(X`VcR2m{eJse#^0hBQT ztt%9C>AN-f(E2Uv3n7)0sgktDBX7{)dNzcNpxjQ0nDwb0p{RBU0T-F|Z35xK8Qtdd zi81V6K*7VxGy08mM2t*(uJ!gXGqKAz0?Cb8?A~cnLQdN722lZ23FEV9B&J}ZE$tey z!02bQMN}k6?W|SHSPZnj3-IR9h$OK10KckMY1xQX#{>3aU2JDU8Ks*wVDouwdivyf zSS&v~b9`ztIDK*g`d_d(b*%xBX5kKJ~|mZel9pM2CL?y;2%s+u=S;z_LXEY z3-~>JaRnPI*i!<=#8Qt6FR6qynX>{+tf0xM@w116lV?wy3Z5F9JwiyUB|eXVYW#@R zTY~@R3Ld|0dzZv>j6lTm8{mVLrU}*7)71lAV)XxQO#de1K%)IS7lS-uHG#!w% z_si54n4-61Mk&&Cl$r96sl5KLaF!7#YxP&I|gf)xlqQT!1K5{b|JeVe*V z!zAa_JsR^)4t)0R%%Y(GXvaS70ha2}(%SU~1H22p~*?j5vo?4vmu5v{zLfZ4!x8t&vrvXUa%%J>m$(G4*q6K#*ZC z&p0?xa8h;B!Kq>#9A@ZD@iNv}&}83fXG!VLJq31 zQV?DQ=7gnYwmB3@F+_2v(*ZCOcXDQm?aGy0y~clxXyP%7>mvIjTB|7@4KB4Q63a@p z<`LHjE~Do_a4D4=>AV^my)M#ZBWH>Y+JA~c>y_~uV1p=fHf{-KciGoo5x`wp6Q$tJB(SSxz?-H zQ2XjkC~X)Tdk7_qny6q~LZyKA@!+)J3I~w>icg=&8vRX(XHlsjG;C1JW_QPxI-OsU z9&D27O1{>V!o?-UqtT)Uw((Ju)$1jybc4C!(&z@IT%TM!A~1ze#g~Q|i85g$nQ62t zeV?CbP&{mi1c^YvWP^DkXXs*lDGGi!s`BPCdJQcsDn2VRK_m{Y_=kA-J*vqe?@k1( 
z)EYiot#Ee+9c-xBR>g@%i|jy(L2O)M4l*wO2P%ing@VxXJiM_7#BmKXP){)wH#(J2 z2{VvJX6rB0O63amh4i+E`KHBdtcGkU76}c;J$PJD)l>DS%26!{rt?md6`9uUb6DZy zbp>o`hEaz4rtd|+9sSN?smti z-NTZ5_+hg=G9!)5Y`9ORg&dUJ2Os(5vDc)r*EV3Ikn*aOoRzY3KynT|5ahvQ(%`WT z=W&*EwdAT!x!PFz8p&OwDyveg!_wg44fm0>@|*AbWdDfdA4&NSJ>ffKenR3WHeoBy z+aP%x?(LSF`lY6R=)g8WwRD4Ntk&>yb&pitBUevL)zc|w#b)~+SnC*tsR@TaKjmzJ z;wdTG@{)^}UA=3r-ls&VG)bN&*)t+}MjpPl=GhOEAX|1@Y2Wj#EHd_y1!Es>!$w^S z#w<$8<&qAmq(d(0l}dX5tl}?v)`w3^C8uGn1PLn-*^rXVk7U98h?DbG=&=v4l^kYq z9Au#+hoa<^mOU%ve7&qpduhefiuP2w|9&x98|mI=Z3GIv1&vZcqg>E074)YH_9E+| zQrX)qd7EYLUdg-nVQ9^JzyK~)GWe0V?A^k53aK*3ZJ%aQMV#Eofx8ab+ah^e9(xBr zDD8WqN3xfWK%uv^<m~}|YhL`Ml>Pp?RDebXLz8POTFWA@2R!kQ#8(9@3>O1Tk4&9GJHrLJ}wO#qXvWe>C_=} zyo_&-PwHQfy$7C77+>hwr-T?=BzMbWciTq;UWrGA(!B5Z@X_h}(lP@+!@ThRfu7G_00H@GL|Y*X`IhYenRk0pBo zYo5SneUn_jORC=`*YB6=_sjLirTXJ{-DHrcr0xDbxpP$N99{1`@Mv(Y^XR6pU-pei z7!C|wp#bzNEYQ`zm(VjAk?Kbt9{;g#qkLvZ?7ySf7~I8Wa#5pH)F>AXtQ8G>=skee zFKw57U$hd{ytx zyfcFqtZe->i|jpB7Vt}w=Q&*U$lV-mB)xa&v9}4iRgXN~dyuAdgXC`5 zaPQe18#ll9vjs*Qkeez>rR8hGOB0+ z2kXQPnGLRahuKI|oA%(E@1T)GR#6KY3Ioc`yheu=Q6k5R?tJH zFElF~>caB=0lD+A)OmQ#b(m2ksk$cmB}dZm!=U6FlzpR;Z}j2S$G&42vV7OGEUvQiX>C7jMwV6s6PH$-_<0l+uB^xE zjY;6;KwG)|lFPs0>NN0=y7tQ6W0LpShIg8kMID#&^=?+w$Q6F6!Y^0sUjq)NPxW+t zzf`ebt~f7MoZqN;0~CeqZIUqV?e@|D)&n+OcdjV2l4o|qbB-+oYMb;+<^4bW6?y24 zG<0UI{ETWr4Oz54g;5l(1*WCe=~T-JqI^z*fg1|`R27h` z_Dfa!?;J-frW%R*0x?A+#8$L-bMK*t-5|e?*lMH8rVh9*iArt3g7HcOsn*)zHBnMD8k zFd_CzC3`nYMxP#;riDYrcT;J@vJV&*xQ`KJIlz6NW;xM%#oSTb3ER#TlzGatYkOX- z9|>0AE2HmJJf89$LhtL#f66zM)`7!P#qdT2k&7p6ui0qHCamz-ck~G@w5Qf8PhpwY zk?1Lar4FF<$Q!<+Y15%^&DEDSriLW%knBAuc@M68$7SycSch5no&;hf1f^P+=?B`U zsizqRd{SQhUCo&IdikJSen2WeuwFhUmrqOO)9d9kca9j%CVK|gJk&EfAv^HYSEDWy z`P|R}os!yc_kZfh$*X);!MW=pa})f8PJO#iT>Mz&>*Z?myYVnD<=eopq)VDij1$$iT{Z@1@7j=c+g?P}vy*c-?(yHIcNYL=uh_Nw!tD ztyhgs!^I3_cB(*wK<(olX7*Y%Bb*Fm#=_ByH5h$#$eV6v_?1%7g~{>v0njb(Sw2_U z_B`8Bng7#@*1HZ2_q&2z)*+R3$Ylp4{CN&M>Rt0peuDL*XtSaf>qs7Ui-?bu?#d@R 
zIbbj9!x%2AW7LDXOw-QfQx_24GoM<9_Z7={mhB*$jE=ngpMO$|l79Y44Lst+P5!l( z@odMR*S3u3JGM!f-v(RiHxwuAAY3GBmiW_52ft zHs|NhTQ3796SIIv9U30Rn1UfhSd}IXzqsw!^Hv?ejc(w^Z1`s{)azxNZyhpdL4^dT zMP4?`ylg@qwXN8?nw?go20dRPFRfS54@m?FhTasc2hqsKPp?N33^r(r+k||UYfXk+ z^ASVM%atJq%lpa}n$^x39i>+|?Ym9-hvS!fJfCnpUrbf(+>_pn`raj+Ra*^ zL%bb7!Mkc)mOz^F)};l33)P=>_<~g}&ZW&#!!}o%pLJ_nj$F-=bUcb^iG^lDAd`Fd)Ybp$1Ln0dg&nDI^oz>OEl~K~xDSm3$!)5L4RM z$W1$~gRts|X{hwVg$wG9Ohz<11T*ha;YWGc3vbs?#z11k`3}CXiy!Fi^BZZJ_}(t^ z+)dJ8Ab@n@2=Wn^_&s!g1%x2_1_o7$(-&g&4yNU#M&1sE5zG#oOz7eO?@L$&s@97i7V%k)2ojZ)H7|{eM=mcyNCpEl0Z}zNwuQKoD%eHRzc>wpfF2}dLf4j5 zIjK|Z9xx_9a|QB(Bz6@xfKqS2V`Cl)1(7H+BRX8S!)wUovQckboOVHncu((eAH?O1 z>|!vJ%EQ!u+}{NPUB6Ttc39O}l~_uk&NyGBTj+V5QD0PLu~dO;l6(poS+Npf(;q_i zkTD|FCY|brjMWWQI5%Y6qw+##YIG#2Trrr$U((mwykvEO;^AHj-1(>%t7G@4L6O?s;OL!RfXC1GXV0J9!1F=;x zL{h~3FGE>GZI&(wQ8Lkaz{Zkp!$3^V5K?W|X%d>&h$*upXl3NbT7r+?z(9$q)y1{s zkPBqp00gH_1a)MqC;=EJ{YFH}3t|Gx5TQc_V%V!ez!{?%>MCjNGP9J}(I^m)rc+q# z#@@ss_WU3KqF0uKo$%T>dAU1wF-X?8; z`R&aymLR!R5(7%Z%2c;*667`;mESjtk)UojfjBN0pzc63YP-%y0ZBZu4PsTlEDaH; zkV2@Lu*P<%b{4)La(%eV4;}uJf(?*Z#@ouK9rFtX&3sI|kOU2%(sL#WnGnr$C~V-EJi%Hz91XKekE&OKjD{bQpogI!zZh)42* z_@gR6gdl&FdU3!ewa{LAJf#QGhf1VPm`y zS`BbxCS+?IbVH|-&DQQT9?BDn`&aZJE7q#4nZZ|4DorcdW@Q;-@#m^mxEVBsIt(g) zQ(U-}bwy?D#1&hF0fcK?An777$sv{l32k7m(vJx{i2oiQ$@+8=2MZCW24b;X#sdbo zPuq->APFbb=7CE8x(|0L8OOhL9{p$C^2oF_GQIAd2A|Yq@sCU1apGjQfy?6UNO^jn zbnjw0sRt{>qJLO~bF_yOkV6O*r@S4ASl1}m_DQvUa_v#6_UN6NC;qPcUrmjkU-!SB za<@J$>3&eOR?-Llw92l=$sb%P_?S`#>P>hl&#@=vOnS0Us_v7kk4e?X*2|CoIJ8!N zQk6@n8zfL^Q~!1-b07;lMegyvJ@j6BxHm7(O`l+T72MGF7(yUgSjl zakA1bJBKCb@MGuRpMUZtJBG#2KXKz3CvncdXvja|aQsD=1JA$A%|DUn_{(Ako=O2t zd{ycdbk#OaI_)Lt|M>+xGF=`*{PYEiwd1$#S{LH8?Fgq&(~8FpPSsXbFTPRANKO%i zjV=yMFakQU9|#Uh&pD)JZiZ*KUyf4=#lui^euX6Q8KyGomTXQ#Oc`VS_oz#pI4!rh zr@Dv)u3M>SmMiv36?-3^Sg$yCr(m;Vh)m;?5Y@{~P%np zkGC;Z&B_}tE7SEq!04r)_#4QdXU=Vpg~n~4>;vpis!3a-Sr1!i(_Q8pEKe#-HUFVjRs^Nisp&JpiIX#jjvfx47(1oZA7RQ*W4gpYQ$0EX3HRs((-Bad 
zOwR`z2GESHIity=Pe8Xf&bIHQZ>z;DQ1OPQp`kcJxPUHsY*N#ZU&UU9(Lo-0lHI{R8MjrEI%*kYvF@izP43E3*u#qZsD!^H=t!SP9Hq|zD$teIBrVA{m`Af4?qg|zB!F^%Zp}6HGxzIH z-R-h_OmdGsYTj@k{>W9V!+LLN!!@|E_jsC6ai`? i3?@dK&o~Sv|Q;PQ#@}g57}&$0J@ZG z@Wq>J9OzDT68F3iy_9Krwvq?Pd?i|j1=f;#Ze@!Lbd+^mI=|iVqtiIS&K8Dj5Cpi% z<9F?gxvC2nn%N1JvWv=gq@z!(TCF6l7;0UCL-G)7tqut zu>i$EMM7mbln5Co`RY|#JN#$|s?B34k}a!oa*-!xAf7;mklJ4+9-=C%zlhv-&TD9t zFh*7ww}t|2Q+FYmewAW|IRAmB`EMgv>|@i#i2Ed0U&=KCF}yS}yH2UN^G$FNj9 z3=i6(^h&k8a_yv4JDGA;(QInG2NCyH?1MbMyNB;WuD|waNyU5R@8#X|-Mf76jZ{$& zWXtk=Km?{#*P!GXd{`}cChw4>!X4L32K_V+!Rd0#1+>x5-(?(0F-4&ci}_|{I5VcO z5I{jc!jmzD1y#&acr&IjXl1ipmZB(Q3X5ZSmZF%Yu$l{?%6I-)ijs^eENNbh zL?27hwKF%MYU5zo3}a=KvJ|^aU){ApD0Ih}HQn2XWFl;Q!7GMS9~3HtN0Uw*U=lq&nk>YT z6}Tv1F%u_cH0ePIZ0ayVS%?50L`b8WN0a5q(Bd}8(PSkoGgw{-$za>QaML0^nyf)t znv%0-h79Li0F>-YFi4ncxMq54H0kZa0W0VST_2gXiJ=y%)d!LEo4(negO!5a6>Nw> zDOU(Ctt4PiL{~~?)mgfTldo&qCFe%3ta z09MU9v3Afxb|@RN8f;T4W-asLF~X?7NnFCfQMXyNF;SJo^NbEwN-V~RQfP!TP!*>> z43uIs#PBP6VO%g3ag~9_woC5xksuz@X)#qq^Y;lXMov{`>_qYF6p5X8kVuZ=Qtke* zt`Z6BC^>3piG(u5Ve)ZMt1Cq>G`aW>6ouqR#C3Z7j2?_g{MT^BiDuDm0L=~3b~+eL z7N>2@G{65u#F>Q0E$;6f)dl;Xb9mgjv_-GGv#J)9T-qg-cBOnh%oiJrU5(7ONf&w! 
zOO3;+hJ9OENPtbuXeDz!HT9l$OO?A*6+>Iu6v1KJ)1r^MHck26>{ZifjRPBMSJc{GA9kf-bP6(gY>eBvwQbk{?yno9@aq?}u z8&#K^Yiiqe^Jv52tpbYTwjHrGsLnzTm1*dc8v5jhNvUBn#ZPT{C{Cg6kgZg8dbzTy z?Jd&1RYY;Twm`M&D&|OSlk7vt)gw~%NUCb@RtZHZw(YA?T|N%9Y)zL`(U~?K6vby7tyW!SESFBHx>K$mlB$PNReQF|DN31bxLI{oa4mkhd5_e*M{YhLHJ?Z| z&1_XtlnSV!wW`i4uB}sU9g$i`u(lbL8VBXZqf+D1 zRKu~YT8dL+J8tu<&N>dK5&luhKPvmrOaAkz_Sd)SDNdd34QM4WC(mltE!A|(H6v2Z zNUD18Rs+T1ZTo6fS0g)2BAXa;?LMh?U#ezwtBImCVyCLQn$tRbuT-};Rl9Gig`zau zMr&19E6b%vs_l_$2c_Dz=(*>t4C_wA6Y!)pBO5pW^h{W^J{qbN35vc3^9O;_Su@XjGkp ztP8by_~{UZX4UGK(PuL=vADQh20qv(?~FOG?(P309GUj^x3JqjLu^nhvc`dgJAPy| z26w`AGnV!%ra|sIa-mNvJ^{&75;I#v z;|JZe5JRj3R@=y$A^3whn1!GMw)7cn)M0Dg18~)ArM)aG)ocwD$5$5*hN+3E2IeK5 z$tdMJ;{~BjkCNilWZEIIQprw{)#Dien3n6d6Pk>%>d`;(Rb+@0nsoMBN%@`QOzTrs zhYv{Zzyl%W9)bE-!8x0uNGN+rJ6Km|mj};E`1776>3tshTkfX&yXDS(QYX|gPp@^J zX16%MhSNR$7H8siY1?a5dmM1g=$2OLgV~Gm?^m3`AQ2G3ppqY?ReB{#Zg((9CiAoq zEMUUYd@v|NDp?}}c(@c#5Qkq78e#mCIKpWw$9h6+Bo|TnViP^sG6@-Mj;*pHZManP zHbl&eB!3bC17E%aaua09t$6=r6KMB?s?<@8|dERozI35_$u93t)UL?X1sYI>Zc z#{qiKriq#E5vR#@91kT67U;wfef|=Cx>c(#n$eqYk^gVfgLY412N4?YlIsuYQAM9j zgyG-Q=OBF&-&M5@@fv*w=|TH*QGo0F9Ak2*q6gjNOAmrReg^~Y*SKdn_H5@fH`ma5 zU)bRLZ%yCrTzB+uI_f^)TOM59;Kx;;aq?;Hd^oewdi>VOyI);*9N%;_e9+$gaA~7` zCgt$mU0Qd{Y&sf0Xl#4v-e{bi;6b;s1EqyB@srU%D1>JHpGez$7fabVL?yV>HW zkM{&<-L5jC?8Bq!ylP1O%R=j_=z;QeHG zeXwg_vw}FQy$GsERdpLdZJU91b;sZ)zzg(IZ5ENfOikan**C~irbojp2=qVNz0rLn(fFYaR3lJ-+P{ERQN9CrYb;~hl%dU8D{y9hQtvBu2*~e}7C!ivY_k)S&?ESpZ zhlE{RPVtt#F59R-~ zd}t})+N$9?d!FUGv-dnL0X2p&FF+42K}@IM!)*I>7WDTs6CdY9f4Y`BsX*Nuo8xF z>>C0r?{~7uiZs@uDy&5o?6w8MRXct^(7su^P=<6CY_){JII}tuX8i3D9NSvJ3>^fZ z5XjCOY_SExUps!_g>wrPJ3R4e#pu$Hc5NJ5BMYtJ2-y+_18bq-#azr<-^gm>5`-q% zuNk^`Hty@(b?(iqH@NGzrq7+9o|3G1`?Ev1@lTbCpAE;Bmqb`(CZSEN-y|8Q4IR}c zqjnI1p*8fypzHQRD4gmk(|5`4lEj0lr# zre;qFFbDWk>Jw)*k@}0o z7p<*Le12KVU@mxRz0YLhx7LUqRnLkm4{p$4eB3UZ!BjW9+NWu)oe_Fwn zciuYrq=>(D>_f-Cr)7S*Y*;EAzIBpTirt&_9VutcW)*+O^Al$Q5oeW~?sD1PA-Owb z_kPK}f5UwcQg8)To8D^K8<4yK+1oF9O*MA!nN4q#>i6JR*1U(9*$&l8gT09C;}q4% 
z-ge0gp*l+acX?!Vp^)QySh>x2JRdn*WM{AB?0xL)|A$WpICm+?$?U>SXTeq$-v94b zAVcK;FHYRsuh&m>+kVeEVdwsE?7)PV`=9)k@c5D4HBpfDqk=5>{K#81(UJ9|jx6|N zWaolON3{Ap5*t54=Y;;B6ASrWj+$Nl=_tr*8-s0tEjczprk*tsZ5pji0|DH`WisR} zV8TX84Pc39L*~V`sof7ji}DPCDzz8|G--6$b^4@-Ks{XoIB#)u?A^5PYL;C?k_*Jp zkQ#p+f!pdyb!7E?9S+nJPxW`J-dxfaaC)`#2TPz@o7lJn4M_IWTL;!YbuVpmRIlCa zu?_t|pqC*7SiyxhGMmPR7%ihL#T`}s6{;#d=wK{P)b%ayNp5v2w+RQ>d7f_-zFn9q z-?QNye&VV8=4#4W{lrzDa@4cRE4EoBKN`9Lwl9-+aH($E7{s?N^;8?yCQsnNcGiBY z+wA`g+eLu0^Jk@^uD6fgcI*h+ARWvctj_Z`K9I3uK3kHz<C6e&yC}CLe$JmrQ*06z7ZE^(0GVWxLN!|sobh3WmWI(m8*xP>S4M1lvI66 zu0AVOpIxs$zh3tG9oN&c2DxmvRJL0#J1&)Bs|1W(-3kqPcg|0%TJ9Ws;wrtnbZwPc+xfa?{m)t#h@0jH4k(@o7RV~n7cjrKRJv-;;pKx%ptrpGCIoTf*N9#`oRr^o+{hmxfpOKIV!1{NkfGW@+S!n50+!MH0oBs0NFtRYoeyg=P}oi{#z~q2 z5>LSuXbHsws)!hKGs!0~u+I1R%#MmK96-#-oH((q*v0Dr-gMTGo__ER|`MDVb+ zkgXEpZG6J=F_4am2N)-Xrs9r;MRD#^3F`gge~VE3aq-{66BLKdhNZ>k{s~ve{#-xd z@_xd3e!_Ww!nr==T0Z3JKjdoJv*NG0J?q?_f5*B1nj4b1p%1x^pK%o*a{YhJ9r%zt z_#xN$Gp_EhxnmM{>_d+Kkn^+Tc-FALx(~U)hg|cATq_cO$PN89*Y?va?}v`=pE=I` zm1ED-@+xo=o>ta?dGOR%e(S_f^L%$-U(c(%MNAC4^8;J{I~@;ZH;P9#Y@Y;-)Xw@wL7i1zx-()eAr_C{{sRt(vJWD literal 0 HcmV?d00001 diff --git a/config/cache.py b/config/cache.py new file mode 100644 index 0000000..864cf1b --- /dev/null +++ b/config/cache.py @@ -0,0 +1,207 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from dataclasses import field +from typing import TYPE_CHECKING, Any, Literal + +from pydantic import Field, SkipValidation, field_validator +from pydantic.dataclasses import dataclass + +from vllm.config.utils import config +from vllm.logger import init_logger +from vllm.utils.mem_constants import GiB_bytes +from vllm.utils.mem_utils import get_cpu_memory + +if TYPE_CHECKING: + from vllm.config.parallel import ParallelConfig +else: + ParallelConfig = Any + +logger = init_logger(__name__) + +BlockSize = Literal[1, 8, 16, 32, 64, 128, 256] +CacheDType = Literal[ + "auto", + "bfloat16", 
+ "fp8", + "fp8_e4m3", + "fp8_e5m2", + "fp8_inc", + "fp8_ds_mla", +] +MambaDType = Literal["auto", "float32"] +PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor"] +KVOffloadingBackend = Literal["native", "lmcache"] + + +@config +@dataclass +class CacheConfig: + """Configuration for the KV cache.""" + + block_size: SkipValidation[BlockSize] = None # type: ignore + """Size of a contiguous cache block in number of tokens. On CUDA devices, + only block sizes up to 32 are supported. + + This config has no static default. If left unspecified by the user, it will + be set in `Platform.check_and_update_config()` based on the current + platform.""" + gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1) + """The fraction of GPU memory to be used for the model executor, which can + range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory + utilization. If unspecified, will use the default value of 0.9. This is a + per-instance limit, and only applies to the current vLLM instance. It does + not matter if you have another vLLM instance running on the same GPU. For + example, if you have two vLLM instances running on the same GPU, you can + set the GPU memory utilization to 0.5 for each instance.""" + swap_space: float = Field(default=4, ge=0) + """Size of the CPU swap space per GPU (in GiB).""" + cache_dtype: CacheDType = "auto" + """Data type for kv cache storage. If "auto", will use model data type. + CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports + fp8 (=fp8_e4m3). Intel Gaudi (HPU) supports fp8 (using fp8_inc). + Some models (namely DeepSeekV3.2) default to fp8, set to bfloat16 to use + bfloat16 instead, this is an invalid option for models that do not default + to fp8. + """ + is_attention_free: bool = False + """Whether the model is attention-free. 
This is primarily set in + `ModelConfig` and that value should be manually duplicated here.""" + num_gpu_blocks_override: int | None = None + """Number of GPU blocks to use. This overrides the profiled `num_gpu_blocks` + if specified. Does nothing if `None`. Used for testing preemption.""" + sliding_window: int | None = None + """Sliding window size for the KV cache. This is primarily set in + `ModelConfig` and that value should be manually duplicated here.""" + enable_prefix_caching: bool | None = None + """Whether to enable prefix caching. Enabled by default for V1.""" + prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256" + """Set the hash algorithm for prefix caching:\n + - "sha256" uses Pickle for object serialization before hashing.\n + - "sha256_cbor" provides a reproducible, cross-language compatible hash. It + serializes objects using canonical CBOR and hashes them with SHA-256.""" + cpu_offload_gb: float = Field(default=0, ge=0) + """The space in GiB to offload to CPU, per GPU. Default is 0, which means + no offloading. Intuitively, this argument can be seen as a virtual way to + increase the GPU memory size. For example, if you have one 24 GB GPU and + set this to 10, virtually you can think of it as a 34 GB GPU. Then you can + load a 13B model with BF16 weight, which requires at least 26GB GPU memory. + Note that this requires fast CPU-GPU interconnect, as part of the model is + loaded from CPU memory to GPU memory on the fly in each model forward pass. + """ + calculate_kv_scales: bool = False + """This enables dynamic calculation of `k_scale` and `v_scale` when + kv_cache_dtype is fp8. If `False`, the scales will be loaded from the model + checkpoint if available. 
Otherwise, the scales will default to 1.0.""" + cpu_kvcache_space_bytes: int | None = None + """(CPU backend only) CPU key-value cache space.""" + mamba_page_size_padded: int | None = None + """ Optional override for mamba page size; used by hybrid mamba/attention + models to ensure exact alignment with attention page size.""" + mamba_block_size: int | None = Field(default=None, gt=0) + """Size of a contiguous cache block in number of tokens for mamba cache. + Can be set only when prefix caching is enabled. + Value must be a multiple of 8 to align with causal_conv1d kernel.""" + mamba_cache_dtype: MambaDType = "auto" + """The data type to use for the Mamba cache (both the conv as well as the + ssm state). If set to 'auto', the data type will be inferred from the model + config.""" + mamba_ssm_cache_dtype: MambaDType = "auto" + """The data type to use for the Mamba cache (ssm state only, conv state will + still be controlled by mamba_cache_dtype). If set to 'auto', the data type + for the ssm state will be determined by mamba_cache_dtype.""" + + # Will be set after profiling. + num_gpu_blocks: int | None = field(default=None, init=False) + """The number of blocks to allocate for GPU memory.""" + num_cpu_blocks: int | None = field(default=None, init=False) + """The number of blocks to allocate for CPU memory.""" + + kv_sharing_fast_prefill: bool = False + """This feature is work in progress and no prefill optimization takes place + with this flag enabled currently. + + In some KV sharing setups, e.g. YOCO (https://arxiv.org/abs/2405.05254), + some layers can skip tokens corresponding to prefill. This flag enables + attention metadata for eligible layers to be overridden with metadata + necessary for implementing this optimization in some models (e.g. Gemma3n) + """ + + kv_cache_memory_bytes: int | None = None + """Size of KV Cache per GPU in bytes. By default, this is set to None + and vllm can automatically infer the kv cache size based on + gpu_memory_utilization. 
However, users may want to manually specify + the kv cache memory size. kv_cache_memory_bytes allows more fine-grain + control of how much memory gets used when compared with using + gpu_memory_utilization. Note that kv_cache_memory_bytes + (when not-None) ignores gpu_memory_utilization""" + + kv_offloading_size: float | None = None + """Size of the KV cache offloading buffer in GiB. When TP > 1, this is + the total buffer size summed across all TP ranks. By default, this is set + to None, which means no KV offloading is enabled. When set with + kv_offloading_backend, vLLM will enable KV cache offloading to CPU""" + + kv_offloading_backend: KVOffloadingBackend | None = None + """The backend to use for KV cache offloading. Supported backends include + 'native' (vLLM native CPU offloading), 'lmcache' This option must be used + together with kv_offloading_size.""" + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + factors: list[Any] = [] + factors.append(self.cache_dtype) + factors.append(self.mamba_cache_dtype) + factors.append(self.mamba_ssm_cache_dtype) + # `cpu_offload_gb` does not use `torch.compile` yet. 
+ hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() + return hash_str + + def metrics_info(self): + # convert cache_config to dict(key: str, value: str) for prometheus + # metrics info + return {key: str(value) for key, value in self.__dict__.items()} + + @field_validator("cache_dtype", mode="after") + @classmethod + def _validate_cache_dtype(cls, cache_dtype: CacheDType) -> CacheDType: + if cache_dtype.startswith("fp8"): + logger.info( + "Using fp8 data type to store kv cache. It reduces the GPU " + "memory footprint and boosts the performance. " + "Meanwhile, it may cause accuracy drop without a proper " + "scaling factor." + ) + return cache_dtype + + def verify_with_parallel_config( + self, + parallel_config: ParallelConfig, + ) -> None: + swap_space_bytes = self.swap_space * GiB_bytes + total_cpu_memory = get_cpu_memory() + # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel + # group are in the same node. However, the GPUs may span multiple nodes. + num_gpus_per_node = parallel_config.tensor_parallel_size + cpu_memory_usage = swap_space_bytes * num_gpus_per_node + + msg = ( + f"{cpu_memory_usage / GiB_bytes:.2f} GiB out of the " + f"{total_cpu_memory / GiB_bytes:.2f} GiB total CPU memory " + "is allocated for the swap space." + ) + if cpu_memory_usage > 0.7 * total_cpu_memory: + raise ValueError("Too large swap space. " + msg) + elif cpu_memory_usage > 0.4 * total_cpu_memory: + logger.warning("Possibly too large swap space. 
class CUDAGraphMode(enum.Enum):
    """Cudagraph modes selectable in CompilationConfig.

    The scalar members `NONE`, `PIECEWISE` and `FULL` are also treated as
    the concrete runtime modes for cudagraph runtime dispatching. The
    tuple-valued members bundle two scalar modes: index 0 is what
    `decode_mode()` returns and index 1 is what `mixed_mode()` returns.
    """

    NONE = 0
    PIECEWISE = 1
    FULL = 2
    # Inside the class body the names above are still plain ints, so the
    # two values below are the tuples (2, 0) and (2, 1).
    FULL_DECODE_ONLY = (FULL, NONE)
    FULL_AND_PIECEWISE = (FULL, PIECEWISE)

    def separate_routine(self) -> bool:
        """Return True for the tuple-valued (two-mode) members."""
        return isinstance(self.value, tuple)

    def decode_mode(self) -> "CUDAGraphMode":
        """Return the first mode of a pair, or `self` for a scalar member."""
        if not self.separate_routine():
            return self
        return CUDAGraphMode(self.value[0])

    def mixed_mode(self) -> "CUDAGraphMode":
        """Return the second mode of a pair, or `self` for a scalar member."""
        if not self.separate_routine():
            return self
        return CUDAGraphMode(self.value[1])

    def has_mode(self, mode: "CUDAGraphMode") -> bool:
        """Return whether the scalar `mode` is this mode or one of its parts."""
        assert not mode.separate_routine()
        if not self.separate_routine():
            return self == mode
        return mode.value in self.value

    def requires_piecewise_compilation(self) -> bool:
        """Return whether `PIECEWISE` is this mode or one of its parts."""
        return self.has_mode(CUDAGraphMode.PIECEWISE)

    def max_cudagraph_mode(self) -> "CUDAGraphMode":
        """Return the part with the largest value, or `self` if scalar."""
        if not self.separate_routine():
            return self
        largest = max(self.value)
        return CUDAGraphMode(largest)

    def has_full_cudagraphs(self) -> bool:
        """Return whether `max_cudagraph_mode()` is `FULL`."""
        strongest = self.max_cudagraph_mode()
        return strongest == CUDAGraphMode.FULL

    def has_piecewise_cudagraphs(self) -> bool:
        """Alias of `requires_piecewise_compilation()`."""
        return self.requires_piecewise_compilation()

    def decode_use_graph(self) -> bool:
        """Return whether `decode_mode()` is `FULL`."""
        return self.decode_mode() == CUDAGraphMode.FULL

    def valid_runtime_modes(self) -> bool:
        """Return whether this member is `NONE`, `PIECEWISE` or `FULL`."""
        runtime_modes = (
            CUDAGraphMode.NONE,
            CUDAGraphMode.PIECEWISE,
            CUDAGraphMode.FULL,
        )
        return self in runtime_modes

    def __str__(self) -> str:
        return self.name
+ + This is separate from general `CompilationConfig` so that inductor passes + don't all have access to full configuration - that would create a cycle as + the `PassManager` is set as a property of config.""" + + enable_fusion: bool = False + """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass.""" + enable_attn_fusion: bool = False + """Whether to enable the custom attention+quant fusion pass.""" + enable_noop: bool = False + """Whether to enable the custom no-op elimination pass.""" + enable_sequence_parallelism: bool = False + """Whether to enable sequence parallelism.""" + enable_async_tp: bool = False + """Whether to enable async TP.""" + enable_fi_allreduce_fusion: bool = False + """Whether to enable flashinfer allreduce fusion.""" + fi_allreduce_fusion_max_size_mb: float | None = None + """The threshold of the communicated tensor sizes under which + vllm should use flashinfer fused allreduce. Specified as a + float in MB. + Unspecified will fallback to default values + which are compute capability and world size dependent. + FI_ALLREDUCE_FUSION_MAX_SIZE_MB = { + 90: { + 2: 64, # 64MB + 4: 2, # 2MB + 8: 1, # 1MB + }, + 100: { + 2: 64, # 64MB + 4: 32, # 32MB + 8: 1, # 1MB + }, + }, where key is the device capability""" + enable_qk_norm_rope_fusion: bool = False + """Whether to enable the fused Q/K RMSNorm + RoPE pass.""" + + # TODO(luka) better pass enabling system. + + def flashinfer_max_size(self, world_size: int) -> int | None: + """ + Returns the max communication size in bytes for flashinfer + allreduce fusion for the given world size. Returns None if world size + is not supported by configs as it's not supported by flashinfer. 
+ """ + + MiB = 1024 * 1024 + max_size_mb = self.fi_allreduce_fusion_max_size_mb + if max_size_mb is None: + max_size_mb = self.default_fi_allreduce_fusion_max_size_mb().get(world_size) + + return int(max_size_mb * MiB) if max_size_mb is not None else None + + @staticmethod + def default_fi_allreduce_fusion_max_size_mb() -> dict[int, float]: + from vllm.compilation.collective_fusion import FI_ALLREDUCE_FUSION_MAX_SIZE_MB + from vllm.platforms import current_platform + + if not current_platform.is_cuda(): + return {} + return FI_ALLREDUCE_FUSION_MAX_SIZE_MB.get( + current_platform.get_device_capability().to_int(), {} + ) + + def uuid(self): + """ + Produces a hash unique to the pass configuration. + Any new fields that affect compilation should be added to the hash. + Any future fields that don't affect compilation should be excluded. + """ + return InductorPass.hash_dict(asdict(self)) + + def __post_init__(self) -> None: + if not self.enable_noop: + if self.enable_fusion: + logger.warning_once( + "Fusion enabled but reshape elimination disabled. " + "RMSNorm/SiluMul + quant (fp8) fusion might not work" + ) + if self.enable_attn_fusion: + logger.warning_once( + "Fusion enabled but reshape elimination disabled. " + "Attention + quant (fp8) fusion might not work" + ) + if self.enable_fi_allreduce_fusion: + logger.warning_once( + "Fusion enabled but reshape elimination disabled. " + "Allreduce + rms norm + quant (fp8) fusion might not work" + ) + if self.enable_qk_norm_rope_fusion and not current_platform.is_cuda_alike(): + logger.warning_once( + "QK Norm + RoPE fusion enabled but the current platform is not " + "CUDA or ROCm. The fusion will be disabled." + ) + self.enable_qk_norm_rope_fusion = False + + +@config +@dataclass +class CompilationConfig: + """Configuration for compilation. 
It has three parts: + + - Top-level Compilation control: + - [`mode`][vllm.config.CompilationConfig.mode] + - [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path] + - [`cache_dir`][vllm.config.CompilationConfig.cache_dir] + - [`backend`][vllm.config.CompilationConfig.backend] + - [`custom_ops`][vllm.config.CompilationConfig.custom_ops] + - [`splitting_ops`][vllm.config.CompilationConfig.splitting_ops] + - [`compile_mm_encoder`][vllm.config.CompilationConfig.compile_mm_encoder] + - CudaGraph capture: + - [`cudagraph_mode`][vllm.config.CompilationConfig.cudagraph_mode] + - [`cudagraph_capture_sizes`] + [vllm.config.CompilationConfig.cudagraph_capture_sizes] + - [`max_cudagraph_capture_size`] + [vllm.config.CompilationConfig.max_cudagraph_capture_size] + - [`cudagraph_num_of_warmups`] + [vllm.config.CompilationConfig.cudagraph_num_of_warmups] + - [`cudagraph_copy_inputs`] + [vllm.config.CompilationConfig.cudagraph_copy_inputs] + - Inductor compilation: + - [`use_inductor`][vllm.config.CompilationConfig.use_inductor] + - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes] + - [`inductor_compile_config`] + [vllm.config.CompilationConfig.inductor_compile_config] + - [`inductor_passes`][vllm.config.CompilationConfig.inductor_passes] + - custom inductor passes + + Why we have different sizes for cudagraph and inductor: + - cudagraph: a cudagraph captured for a specific size can only be used + for the same size. We need to capture all the sizes we want to use. + - inductor: a graph compiled by inductor for a general shape can be used + for different sizes. Inductor can also compile for specific sizes, + where it can have more information to optimize the graph with fully + static shapes. However, we find the general shape compilation is + sufficient for most cases. It might be beneficial to compile for + certain small batchsizes, where inductor is good at optimizing. 
+ """ + + # Top-level Compilation control + level: int | None = None + """ + Level is deprecated and will be removed in the next release, + either 0.12.0 or 0.11.2 whichever is soonest. + Please use mode. Currently all levels are mapped to mode. + """ + # Top-level Compilation control + mode: CompilationMode | None = None + """The compilation approach used for torch.compile-based compilation of the + model. + + - None: If None, we will select the default compilation mode. + For V1 engine this is 3. + - 0: NONE: No torch.compile compilation is applied, model runs in fully + eager pytorch mode. The model runs as-is. + - 1: STOCK_TORCH_COMPILE: The standard `torch.compile` compilation pipeline. + - 2: DYNAMO_TRACE_ONCE: Single Dynamo trace through the model, avoiding + recompilation by removing guards. + Requires no dynamic-shape-dependent control-flow. + - 3: VLLM_COMPILE: Custom vLLM Inductor-based backend with caching, + piecewise compilation, shape specialization, and custom passes.""" + debug_dump_path: Path | None = None + """The path to dump the debug information.""" + cache_dir: str = "" + """The directory to store the compiled graph, to accelerate Inductor + compilation. By default, it will use model-related information to generate + a cache directory.""" + compile_cache_save_format: Literal["binary", "unpacked"] = field( + default_factory=lambda: envs.VLLM_COMPILE_CACHE_SAVE_FORMAT + ) + """Format for saving torch compile cache:\n + - "binary": saves as binary file (multiprocess safe)\n + - "unpacked": saves as directory structure for inspection/debugging + (NOT multiprocess safe)\n + Defaults to `VLLM_COMPILE_CACHE_SAVE_FORMAT` if not specified. + """ + backend: str = "" + """The backend for compilation. It needs to be a string: + + - "" (empty string): use the default backend ("inductor" on CUDA-alike + platforms). + - "eager"/"openxla"/...: use the specified backend registered in PyTorch. 
+ - "full.module.name": a qualified name which can be used to import the + + backend function. + We use string to avoid serialization issues when using compilation in a + distributed setting. When the compilation mode is 1 or 2, the backend is + used for the compilation directly (it sees the whole graph). When the + compilation mode is 3, the backend is used for the piecewise compilation + (it sees a part of the graph). The backend can not be custom for compilation + mode 3, i.e. the backend must be either eager or inductor. Furthermore, + compilation is only piecewise if splitting ops is set accordingly and + use_inductor_graph_partition is off. Note that the default options for + splitting ops are sufficient for piecewise compilation. + """ + custom_ops: list[str] = field(default_factory=list) + """Fine-grained control over which custom ops to enable/disable. Use 'all' + to enable all, 'none' to disable all. Also specify a list of custom op + names to enable (prefixed with a '+'), or disable (prefixed with a '-'). + Examples: + + - 'all,-op1' to enable all except op1 + - 'none,+op1,+op2' to enable only op1 and op2 + + By default, all custom ops are enabled when running without Inductor and + disabled when running with Inductor: mode>=VLLM_COMPILE and use_inductor=True. + Inductor generates (fused) Triton kernels for disabled custom ops.""" + splitting_ops: list[str] | None = None + """A list of ops to exclude from cudagraphs, used in piecewise compilation. + + The behavior depends on use_inductor_graph_partition: + + - When use_inductor_graph_partition=False (default): + These ops are used for Dynamo FX-level graph splitting. The graph is + split at these ops before Inductor compilation, creating separate + subgraphs for cudagraph capture. + + - When use_inductor_graph_partition=True: + These ops are used to register Inductor partition rules. 
The graph + partitioning happens at Inductor codegen time after all passes and + fusions are finished, allowing compilation and custom passes to operate + on the full graph while still excluding these ops from cudagraphs. + + If None, defaults to attention ops for piecewise cudagraphs. + If empty list [], no ops are excluded (suitable for full cudagraphs).""" + compile_mm_encoder: bool = False + """Whether or not to compile the multimodal encoder. + Currently, this only works for `Qwen2_5_vl` on selected platforms. + Disabled by default until more models are supported/tested to work.""" + + # Inductor capture + use_inductor: bool | None = None + """ + Whether to use inductor compilation. + + This flag is deprecated and will be removed in the next release 0.12.0. + Please use the 'backend' option instead. + + - False: inductor compilation is not used. graph runs in eager + (custom_ops enabled by default). + - True: inductor compilation is used (custom_ops disabled by default). + One graph for symbolic shape and one graph per size in compile_sizes + are compiled using configurations in inductor_compile_config. + + This setting is ignored if mode512) that would + greatly increase startup time with limited performance benefit. + """ + local_cache_dir: str = field(default=None, init=False) # type: ignore + """local cache dir for each rank""" + bs_to_padded_graph_size: list[int] = field( + default=None, # type: ignore + init=False, + ) + """optimization: + Intuitively, bs_to_padded_graph_size should be dict[int, int]. 
def compute_hash(self) -> str:
    """Return a SHA-256 hex digest of the graph-affecting options.

    WARNING: Whenever a new field is added to this config, include it in
    the factor list below if it affects the computation graph.

    The digest identifies the settings that affect the structure of the
    computation graph from input ids/embeddings to the final hidden
    states, excluding anything before the former and after the latter.
    """
    # Order matters: the digest is taken over str() of this list.
    graph_factors: list[Any] = [
        self.mode,
        self.backend,
        self.custom_ops,
        self.splitting_ops,
        self.use_inductor,
        self.use_inductor_graph_partition,
        self.inductor_compile_config,
        self.inductor_passes,
        self.pass_config.uuid(),
        self.compile_cache_save_format,
    ]
    digest = hashlib.sha256(str(graph_factors).encode())
    return digest.hexdigest()
def __post_init__(self) -> None:
    """Normalize and validate the compilation options after construction.

    In order, this:

    - warns about the deprecated `level` and copies it into `mode` when
      `mode` is unset;
    - checks that `custom_ops` holds at most one of 'none' / 'all';
    - defaults `enable_auto_functionalized_v2` to False in
      `inductor_compile_config` on torch >= 2.6;
    - copies every entry of `inductor_passes` into
      `inductor_compile_config`, importing passes given as dotted names
      and wrapping anything that is not an `InductorPass` in
      `CallableInductorPass`;
    - appends '+rotary_embedding' to `custom_ops` when QK norm + RoPE
      fusion is enabled;
    - enables the Inductor combo-kernel options on torch >= 2.9.0.dev
      unless either key is already configured;
    - validates `use_inductor_graph_partition`, the `custom_ops` syntax
      and the backend allowed for `CompilationMode.VLLM_COMPILE`;
    - maps the deprecated `use_inductor` flag onto `backend` and falls
      back to the platform's default backend when none is given.

    Raises:
        ValueError: if `use_inductor_graph_partition` is set on an
            unsupported torch version, a custom op has invalid syntax, or
            the backend is not allowed for piecewise compilation.
        AssertionError: if both/duplicate 'none'/'all' are given or an
            inductor pass is neither a string nor callable.
    """
    import importlib

    if self.level is not None:
        # Adjacent literals are concatenated verbatim, so every fragment
        # must carry its own trailing space.
        logger.warning(
            "Level is deprecated and will be removed in the next release, "
            "either 0.12.0 or 0.11.2 whichever is soonest. "
            "Use mode instead. "
            "If both level and mode are given, "
            "only mode will be used."
        )
        if self.mode is None:
            self.mode = self.level

    count_none = self.custom_ops.count("none")
    count_all = self.custom_ops.count("all")
    assert count_none + count_all <= 1, "Can only specify 'none' or 'all'"

    # TODO(zou3519/luka): There are 2 issues with auto-functionalization V2:
    # 1. A bug in PyTorch, fixed in 2.7:
    #    https://github.com/pytorch/pytorch/issues/147924
    # 2. Custom passes (fusion) rely on auto-functionalization V1 and don't
    #    work with V2. Addressing this will take extra engineering effort
    #    and it is not yet a priority. RFC here:
    #    https://github.com/vllm-project/vllm/issues/14703

    if is_torch_equal_or_newer("2.6"):
        KEY = "enable_auto_functionalized_v2"
        if KEY not in self.inductor_compile_config:
            self.inductor_compile_config[KEY] = False

    for k, v in self.inductor_passes.items():
        if not isinstance(v, str):
            assert callable(v), f"pass {k} should be callable or a qualified name"
            self.inductor_compile_config[k] = (
                v if isinstance(v, InductorPass) else CallableInductorPass(v)
            )
            continue

        # Resolve the pass from its qualified name. `__import__("a.b")`
        # returns the top-level package `a`, so a pass living in a
        # submodule could not be found through it; `import_module` returns
        # the named (sub)module itself.
        module_name, _, func_name = v.rpartition(".")
        func = getattr(importlib.import_module(module_name), func_name)
        self.inductor_compile_config[k] = (
            func if isinstance(func, InductorPass) else CallableInductorPass(func)
        )

    if self.pass_config.enable_qk_norm_rope_fusion:
        # TODO(zhuhaoran): support rope native forward match and remove this.
        # Linked issue: https://github.com/vllm-project/vllm/issues/28042
        self.custom_ops.append("+rotary_embedding")

    if (
        is_torch_equal_or_newer("2.9.0.dev")
        and "combo_kernels" not in self.inductor_compile_config
        and "benchmark_combo_kernel" not in self.inductor_compile_config
    ):
        # use horizontal fusion, which is useful for fusing qk-norm and
        # qk-rope when query and key have different shapes.
        self.inductor_compile_config["combo_kernels"] = True
        self.inductor_compile_config["benchmark_combo_kernel"] = True

    if self.use_inductor_graph_partition and not is_torch_equal_or_newer(
        "2.9.0.dev"
    ):
        raise ValueError(
            "use_inductor_graph_partition is only "
            "supported with torch>=2.9.0.dev. Set "
            "use_inductor_graph_partition=False instead."
        )

    for op in self.custom_ops:
        if op[0] not in {"+", "-"} and op not in {"all", "none"}:
            raise ValueError(
                f"Invalid syntax '{op}' for custom op, "
                "must be 'all', 'none', '+op' or '-op' "
                "(where 'op' is the registered op name)"
            )

    # Currently only the eager and inductor backends are supported
    # for piecewise compilation. Custom backends are not supported for
    # piecewise compilation. Update when more backends are supported.
    if self.mode == CompilationMode.VLLM_COMPILE and self.backend not in [
        "",
        "eager",
        "inductor",
    ]:
        raise ValueError(
            f"Invalid backend for piecewise compilation: {self.backend}"
        )

    if self.use_inductor is not None:
        logger.warning_once(
            "The 'use_inductor' flag is deprecated and will be "
            "removed in the next release (v0.12.0). "
            "Please use the 'backend' option instead.",
        )
        # The deprecated flag overrides whatever `backend` was given.
        self.backend = "inductor" if self.use_inductor else "eager"

    if self.backend == "":
        self.backend = current_platform.simple_compile_backend
def set_splitting_ops_for_v1(self):
    """Fill in `splitting_ops` (and adjust `cudagraph_mode`) for VLLM_COMPILE.

    Must only be called when `mode` is `CompilationMode.VLLM_COMPILE`.

    - With `use_inductor_graph_partition`, or else with attention fusion
      enabled, the work is delegated to the dedicated helper.
    - Otherwise an unset `splitting_ops` defaults to a copy of the
      attention ops, while an explicitly empty one downgrades
      `cudagraph_mode`: PIECEWISE becomes NONE and FULL_AND_PIECEWISE
      becomes FULL.
    """
    # NOTE: this function needs to be called only when mode is
    # CompilationMode.VLLM_COMPILE
    assert self.mode == CompilationMode.VLLM_COMPILE, (
        "set_splitting_ops_for_v1 should only be called when "
        "mode is CompilationMode.VLLM_COMPILE"
    )

    if self.use_inductor_graph_partition:
        self.set_splitting_ops_for_inductor_graph_partition()
        return

    if self.pass_config.enable_attn_fusion:
        # here use_inductor_graph_partition is False
        self.set_splitting_ops_for_attn_fusion()
        return

    if self.splitting_ops is None:
        # NOTE: When using full cudagraph, instead of setting an empty
        # list and capture the full cudagraph inside the flattened fx
        # graph, we keep the piecewise fx graph structure but capture
        # the full cudagraph outside the fx graph. This reduces some
        # cpu overhead when the runtime batch_size is not cudagraph
        # captured. see https://github.com/vllm-project/vllm/pull/20059
        # for details. Make a copy to avoid mutating the class-level
        # list via reference.
        self.splitting_ops = list(self._attention_ops)
    elif len(self.splitting_ops) == 0:
        logger.warning_once("Using piecewise compilation with empty splitting_ops")
        if self.cudagraph_mode == CUDAGraphMode.PIECEWISE:
            logger.warning_once(
                "Piecewise compilation with empty splitting_ops does not "
                "contain piecewise cudagraph. Setting cudagraph_"
                "mode to NONE. Hint: If you are using attention backends "
                "that support cudagraph, consider manually setting "
                "cudagraph_mode to FULL or FULL_DECODE_ONLY to enable "
                "full cudagraphs."
            )
            self.cudagraph_mode = CUDAGraphMode.NONE
        elif self.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
            logger.warning_once(
                "Piecewise compilation with empty splitting_ops does not "
                "contain piecewise cudagraph. Setting cudagraph_mode "
                "to FULL."
            )
            self.cudagraph_mode = CUDAGraphMode.FULL
        # Store a fresh empty list whatever empty sequence was passed in.
        self.splitting_ops = []
+ """ + + if len(self.enabled_custom_ops) + len(self.disabled_custom_ops) == 0: + logger.debug("No custom ops found in model.") + return + + logger.debug("enabled custom ops: %s", self.enabled_custom_ops) + logger.debug("disabled custom ops: %s", self.disabled_custom_ops) + + all_ops_in_model = self.enabled_custom_ops | self.disabled_custom_ops + for op in self.custom_ops: + if op in {"all", "none"}: + continue + + assert op[0] in {"+", "-"}, ( + "Invalid custom op syntax (should be checked during init)" + ) + + # check if op name exists in model + op_name = op[1:] + if op_name not in all_ops_in_model: + from vllm.model_executor.custom_op import CustomOp + + # Does op exist at all or is it just not present in this model? + # Note: Only imported op classes appear in the registry. + missing_str = ( + "doesn't exist (or wasn't imported/registered)" + if op_name not in CustomOp.op_registry + else "not present in model" + ) + + enable_str = "enabling" if op[0] == "+" else "disabling" + logger.warning_once( + "Op '%s' %s, %s with '%s' has no effect", + op_name, + missing_str, + enable_str, + op, + ) + + def adjust_cudagraph_sizes_for_spec_decode( + self, uniform_decode_query_len: int, tensor_parallel_size: int + ): + multiple_of = uniform_decode_query_len + if tensor_parallel_size > 1 and self.pass_config.enable_sequence_parallelism: + multiple_of = max(uniform_decode_query_len, tensor_parallel_size) + if ( + multiple_of % uniform_decode_query_len != 0 + or multiple_of % tensor_parallel_size != 0 + ): + raise ValueError( + f"Can't determine cudagraph shapes that are both a " + f"multiple of {uniform_decode_query_len} " + f"(num_speculative_tokens + 1) required by spec-decode " + f"and {tensor_parallel_size} (tensor_parallel_size) " + f"required by sequence parallelism please adjust " + f"num_speculative_tokens or disable sequence parallelism" + ) + + if not self.cudagraph_capture_sizes or multiple_of <= 1: + return + + assert self.max_cudagraph_capture_size is not None + 
def compute_bs_to_padded_graph_size(self):
    """Rebuild `bs_to_padded_graph_size` from the cudagraph capture sizes.

    The result is a list indexed by batch size, with one entry for every
    value in `[0, max_cudagraph_capture_size]`. Batch sizes that equal 0
    or a capture size map to themselves; every other batch size maps to
    the next capture size above it.
    """
    capture_sizes = self.cudagraph_capture_sizes
    max_size = self.max_cudagraph_capture_size

    # Publish the table first and fill it in place afterwards.
    self.bs_to_padded_graph_size = table = [0] * (max_size + 1)

    # Consecutive (lower, upper) bounds: each half-open interval
    # [lower, upper) is handled in one pass of the inner loop.
    lower_bounds = [0] + capture_sizes
    upper_bounds = capture_sizes + [max_size + 1]
    for lower, upper in zip(lower_bounds, upper_bounds):
        for bs in range(lower, upper):
            table[bs] = lower if bs == lower else upper
def __post_init__(self):
    """Derive `device_type` from `device`, then normalize `device`.

    With `device == "auto"` the type is read from the current platform.
    Otherwise it is taken from the given string, or from the `.type` of a
    given `torch.device`. Afterwards `device` is replaced by None for
    "tpu" and by `torch.device(device_type)` for everything else.

    Raises:
        RuntimeError: if `device` is "auto" and the platform reports an
            empty device type.
    """
    requested = self.device

    if requested == "auto":
        # Automated device type detection
        from vllm.platforms import current_platform

        detected = current_platform.device_type
        self.device_type = detected
        if not detected:
            raise RuntimeError(
                "Failed to infer device type, please set "
                "the environment variable `VLLM_LOGGING_LEVEL=DEBUG` "
                "to turn on verbose logging to help debug the issue."
            )
    elif isinstance(requested, str):
        # Device type is assigned explicitly
        self.device_type = requested
    elif isinstance(requested, torch.device):
        self.device_type = requested.type
    # NOTE(review): any other value (e.g. None) leaves `device_type` unset
    # here, so the lookup below would presumably fail - confirm whether
    # such input is expected.

    # Some device types require processing inputs on CPU
    if self.device_type in ("tpu",):
        self.device = None
    else:
        # Set device with device type
        self.device = torch.device(self.device_type)
For + PyNcclConnector, this should be 2.""" + + ec_ip: str = "127.0.0.1" + """The EC connector ip, used to build distributed connection.""" + + ec_port: int = 14579 + """The EC connector port, used to build distributed connection.""" + + ec_connector_extra_config: dict[str, Any] = field(default_factory=dict) + """any extra config that the connector may need.""" + + ec_connector_module_path: str | None = None + """The Python module path to dynamically load the EC connector from. + Only supported in V1.""" + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: list[Any] = [] + hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() + return hash_str + + def __post_init__(self) -> None: + if self.engine_id is None: + self.engine_id = str(uuid.uuid4()) + + if self.ec_role is not None and self.ec_role not in get_args(ECRole): + raise ValueError( + f"Unsupported ec_role: {self.ec_role}. 
" + f"Supported roles are {get_args(ECRole)}" + ) + + if self.ec_connector is not None and self.ec_role is None: + raise ValueError( + "Please specify ec_role when ec_connector " + f"is set, supported roles are {get_args(ECRole)}" + ) + + @property + def is_ec_transfer_instance(self) -> bool: + return self.ec_connector is not None and self.ec_role in get_args(ECRole) + + @property + def is_ec_producer(self) -> bool: + return self.ec_connector is not None and self.ec_role in get_args(ECProducer) + + @property + def is_ec_consumer(self) -> bool: + return self.ec_connector is not None and self.ec_role in get_args(ECConsumer) + + def get_from_extra_config(self, key, default) -> Any: + return self.ec_connector_extra_config.get(key, default) diff --git a/config/kv_events.py b/config/kv_events.py new file mode 100644 index 0000000..ce46cc0 --- /dev/null +++ b/config/kv_events.py @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +from typing import Literal + +from pydantic import Field +from pydantic.dataclasses import dataclass + +from vllm.config.utils import config + + +@config +@dataclass +class KVEventsConfig: + """Configuration for KV event publishing.""" + + enable_kv_cache_events: bool = False + """If True, enable KV cache events for tracking block storage and removal. + Events can be published externally by zmq using the event publisher config. + """ + + publisher: Literal["null", "zmq"] = Field(default=None) + """The publisher to use for publishing kv events. Can be "null", "zmq". + """ + + endpoint: str = "tcp://*:5557" + """The zmq endpoint to use for publishing kv events. + """ + + replay_endpoint: str | None = None + """The zmq endpoint to use for replaying kv events. + """ + + buffer_steps: int = 10_000 + """The number of steps to cache for replay endpoint. Will only save + events from the last N steps for the replay endpoint. 
+ """ + + hwm: int = 100_000 + """The zmq high water mark for the event publisher. After queueing N events, + events will start dropping if the consumer is not keeping up. + """ + + max_queue_size: int = 100_000 + """The maximum number of events to queue while waiting for publishing. + """ + + topic: str = "" + """The topic to use for the event publisher. Consumers can subscribe to + this topic to receive events. + """ + + def __post_init__(self): + if self.publisher is None: + self.publisher = "zmq" if self.enable_kv_cache_events else "null" diff --git a/config/kv_transfer.py b/config/kv_transfer.py new file mode 100644 index 0000000..dfd7ef6 --- /dev/null +++ b/config/kv_transfer.py @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +import uuid +from dataclasses import field +from typing import Any, Literal, get_args + +from pydantic.dataclasses import dataclass + +from vllm.config.utils import config + +KVProducer = Literal["kv_producer", "kv_both"] +KVConsumer = Literal["kv_consumer", "kv_both"] +KVRole = Literal[KVProducer, KVConsumer] + + +@config +@dataclass +class KVTransferConfig: + """Configuration for distributed KV cache transfer.""" + + kv_connector: str | None = None + """The KV connector for vLLM to transmit KV caches between vLLM instances. + """ + + engine_id: str | None = None + """The engine id for KV transfers.""" + + kv_buffer_device: str = "cuda" + """The device used by kv connector to buffer the KV cache. Choices are + 'cuda' and 'cpu'.""" + + kv_buffer_size: float = 1e9 + """The buffer size for TorchDistributedConnector. Measured in number of + bytes. Recommended value: 1e9 (about 1GB).""" + + kv_role: KVRole | None = None + """Whether this vLLM instance produces, consumes KV cache, or both. Choices + are 'kv_producer', 'kv_consumer', and 'kv_both'.""" + + kv_rank: int | None = None + """The rank of this vLLM instance in the KV cache transfer. 
Typical value: + 0 for prefill instance, 1 for decode instance. + Currently only 1P1D is supported.""" + + kv_parallel_size: int = 1 + """The number of parallel instances for KV cache transfer. For + P2pNcclConnector, this should be 2.""" + + kv_ip: str = "127.0.0.1" + """The KV connector ip, used to build distributed connection.""" + + kv_port: int = 14579 + """The KV connector port, used to build distributed connection.""" + + kv_connector_extra_config: dict[str, Any] = field(default_factory=dict) + """any extra config that the connector may need.""" + + kv_connector_module_path: str | None = None + """The Python module path to dynamically load the KV connector from. + Only supported in V1.""" + + enable_permute_local_kv: bool = False + """Experiment feature flag to enable HND to NHD KV Transfer""" + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: list[Any] = [] + hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() + return hash_str + + def __post_init__(self) -> None: + if self.engine_id is None: + self.engine_id = str(uuid.uuid4()) + + if self.kv_role is not None and self.kv_role not in get_args(KVRole): + raise ValueError( + f"Unsupported kv_role: {self.kv_role}. 
" + f"Supported roles are {get_args(KVRole)}" + ) + + if self.kv_connector is not None and self.kv_role is None: + raise ValueError( + "Please specify kv_role when kv_connector " + f"is set, supported roles are {get_args(KVRole)}" + ) + + @property + def is_kv_transfer_instance(self) -> bool: + return self.kv_connector is not None and self.kv_role in get_args(KVRole) + + @property + def is_kv_producer(self) -> bool: + return self.kv_connector is not None and self.kv_role in get_args(KVProducer) + + @property + def is_kv_consumer(self) -> bool: + return self.kv_connector is not None and self.kv_role in get_args(KVConsumer) + + def get_from_extra_config(self, key, default) -> Any: + return self.kv_connector_extra_config.get(key, default) diff --git a/config/load.py b/config/load.py new file mode 100644 index 0000000..e424f8c --- /dev/null +++ b/config/load.py @@ -0,0 +1,124 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from typing import TYPE_CHECKING, Any + +from pydantic import Field, field_validator +from pydantic.dataclasses import dataclass + +from vllm.config.utils import config +from vllm.logger import init_logger + +if TYPE_CHECKING: + from vllm.model_executor.model_loader import LoadFormats + from vllm.model_executor.model_loader.tensorizer import TensorizerConfig +else: + LoadFormats = Any + TensorizerConfig = Any + +logger = init_logger(__name__) + + +@config +@dataclass +class LoadConfig: + """Configuration for loading the model weights.""" + + load_format: str | LoadFormats = "auto" + """The format of the model weights to load:\n + - "auto" will try to load the weights in the safetensors format and fall + back to the pytorch bin format if safetensors format is not available.\n + - "pt" will load the weights in the pytorch bin format.\n + - "safetensors" will load the weights in the safetensors format.\n + - "npcache" will load the weights in pytorch format and store a 
numpy cache + to speed up the loading.\n + - "dummy" will initialize the weights with random values, which is mainly + for profiling.\n + - "tensorizer" will use CoreWeave's tensorizer library for fast weight + loading. See the Tensorize vLLM Model script in the Examples section for + more information.\n + - "runai_streamer" will load the Safetensors weights using Run:ai Model + Streamer.\n + - "runai_streamer_sharded" will load weights from pre-sharded checkpoint + files using Run:ai Model Streamer.\n + - "bitsandbytes" will load the weights using bitsandbytes quantization.\n + - "sharded_state" will load weights from pre-sharded checkpoint files, + supporting efficient loading of tensor-parallel models.\n + - "gguf" will load weights from GGUF format files (details specified in + https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n + - "mistral" will load weights from consolidated safetensors files used by + Mistral models. + - Other custom values can be supported via plugins.""" + download_dir: str | None = None + """Directory to download and load the weights, default to the default + cache directory of Hugging Face.""" + safetensors_load_strategy: str = "lazy" + """Specifies the loading strategy for safetensors weights. + - "lazy" (default): Weights are memory-mapped from the file. This enables + on-demand loading and is highly efficient for models on local storage. + - "eager": The entire file is read into CPU memory upfront before loading. + This is recommended for models on network filesystems (e.g., Lustre, NFS) + as it avoids inefficient random reads, significantly speeding up model + initialization. However, it uses more CPU RAM. + - "torchao": Weights are loaded in upfront and then reconstructed + into torchao tensor subclasses. This is used when the checkpoint + was quantized using torchao and saved using safetensors. 
+ Needs torchao >= 0.14.0 + """ + model_loader_extra_config: dict | TensorizerConfig = Field(default_factory=dict) + """Extra config for model loader. This will be passed to the model loader + corresponding to the chosen load_format.""" + device: str | None = None + """Device to which model weights will be loaded, default to + device_config.device""" + ignore_patterns: list[str] | str = Field(default_factory=lambda: ["original/**/*"]) + """The list of patterns to ignore when loading the model. Default to + "original/**/*" to avoid repeated loading of llama's checkpoints.""" + use_tqdm_on_load: bool = True + """Whether to enable tqdm for showing progress bar when loading model + weights.""" + pt_load_map_location: str | dict[str, str] = "cpu" + """ + pt_load_map_location: the map location for loading pytorch checkpoint, to + support loading checkpoints can only be loaded on certain devices like + "cuda", this is equivalent to {"": "cuda"}. Another supported format is + mapping from different devices like from GPU 1 to GPU 0: + {"cuda:1": "cuda:0"}. Note that when passed from command line, the strings + in dictionary needs to be double quoted for json parsing. For more details, + see original doc for `map_location` in https://pytorch.org/docs/stable/generated/torch.load.html + """ + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. 
+ factors: list[Any] = [] + hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() + return hash_str + + @field_validator("load_format", mode="after") + def _lowercase_load_format(cls, load_format: str) -> str: + return load_format.lower() + + @field_validator("ignore_patterns", mode="after") + def _validate_ignore_patterns( + cls, ignore_patterns: list[str] | str + ) -> list[str] | str: + if ignore_patterns != ["original/**/*"] and len(ignore_patterns) > 0: + logger.info( + "Ignoring the following patterns when downloading weights: %s", + ignore_patterns, + ) + + return ignore_patterns diff --git a/config/lora.py b/config/lora.py new file mode 100644 index 0000000..84e92ee --- /dev/null +++ b/config/lora.py @@ -0,0 +1,112 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from typing import TYPE_CHECKING, Any, ClassVar, Literal + +import torch +from pydantic import ConfigDict, Field, model_validator +from pydantic.dataclasses import dataclass +from typing_extensions import Self + +from vllm.config.utils import config +from vllm.logger import init_logger +from vllm.platforms import current_platform + +if TYPE_CHECKING: + from vllm.config import ModelConfig + from vllm.config.cache import CacheConfig +else: + ModelConfig = Any + CacheConfig = Any + +logger = init_logger(__name__) + +LoRADType = Literal["auto", "float16", "bfloat16"] +MaxLoRARanks = Literal[1, 8, 16, 32, 64, 128, 256, 320, 512] +LoRAExtraVocabSize = Literal[256, 512] + + +@config +@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) +class LoRAConfig: + """Configuration for LoRA.""" + + max_lora_rank: MaxLoRARanks = 16 + """Max LoRA rank.""" + max_loras: int = Field(default=1, ge=1) + """Max number of LoRAs in a single batch.""" + fully_sharded_loras: bool = False + """By default, only half of the LoRA computation is sharded with tensor + parallelism. 
Enabling this will use the fully sharded layers. At high + sequence length, max rank or tensor parallel size, this is likely faster. + """ + max_cpu_loras: int | None = None + """Maximum number of LoRAs to store in CPU memory. Must be >= than + `max_loras`.""" + lora_dtype: torch.dtype | LoRADType = "auto" + """Data type for LoRA. If auto, will default to base model dtype.""" + lora_extra_vocab_size: LoRAExtraVocabSize = Field( + default=256, + deprecated=( + "`lora_extra_vocab_size` is deprecated and will be removed " + "in v0.12.0. Additional vocabulary support for " + "LoRA adapters is being phased out." + ), + ) + """(Deprecated) Maximum size of extra vocabulary that can be present in a + LoRA adapter. Will be removed in v0.12.0.""" + lora_vocab_padding_size: ClassVar[int] = ( + current_platform.get_lora_vocab_padding_size() + ) + default_mm_loras: dict[str, str] | None = None + """Dictionary mapping specific modalities to LoRA model paths; this field + is only applicable to multimodal models and should be leveraged when a + model always expects a LoRA to be active when a given modality is present. + Note that currently, if a request provides multiple additional + modalities, each of which have their own LoRA, we do NOT apply + default_mm_loras because we currently only support one lora adapter + per prompt. When run in offline mode, the lora IDs for n modalities + will be automatically assigned to 1-n with the names of the modalities + in alphabetic order.""" + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. 
+ """ + factors: list[Any] = [] + factors.append(self.max_lora_rank) + factors.append(self.max_loras) + factors.append(self.fully_sharded_loras) + factors.append(self.lora_dtype) + factors.append(self.lora_extra_vocab_size) + factors.append(self.lora_vocab_padding_size) + + hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() + return hash_str + + @model_validator(mode="after") + def _validate_lora_config(self) -> Self: + if self.max_cpu_loras is None: + self.max_cpu_loras = self.max_loras + elif self.max_cpu_loras < self.max_loras: + raise ValueError( + f"max_cpu_loras ({self.max_cpu_loras}) must be >= " + f"max_loras ({self.max_loras})" + ) + + return self + + def verify_with_model_config(self, model_config: ModelConfig): + if self.lora_dtype in (None, "auto"): + self.lora_dtype = model_config.dtype + elif isinstance(self.lora_dtype, str): + self.lora_dtype = getattr(torch, self.lora_dtype) diff --git a/config/model.py b/config/model.py new file mode 100644 index 0000000..784ed44 --- /dev/null +++ b/config/model.py @@ -0,0 +1,2172 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +import json +import warnings +from collections.abc import Callable +from dataclasses import InitVar, field +from importlib.util import find_spec +from typing import TYPE_CHECKING, Any, Literal, cast, get_args + +import torch +from pydantic import ConfigDict, SkipValidation, field_validator, model_validator +from pydantic.dataclasses import dataclass +from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE + +import vllm.envs as envs +from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MultiModalConfig +from vllm.config.pooler import PoolerConfig +from vllm.config.scheduler import RunnerType +from vllm.config.utils import assert_hashable, config, getattr_iter +from vllm.logger import init_logger +from vllm.platforms import current_platform +from 
vllm.transformers_utils.config import ( + ConfigFormat, + get_config, + get_hf_image_processor_config, + get_hf_text_config, + get_pooling_config, + get_sentence_transformer_tokenizer_config, + is_encoder_decoder, + try_get_dense_modules, + try_get_generation_config, + try_get_safetensors_metadata, + try_get_tokenizer_config, + uses_mrope, +) +from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri +from vllm.transformers_utils.utils import maybe_model_redirect +from vllm.utils.import_utils import LazyLoader +from vllm.utils.torch_utils import common_broadcastable_dtype + +if TYPE_CHECKING: + from transformers import PretrainedConfig + + import vllm.model_executor.layers.quantization as me_quant + import vllm.model_executor.models as me_models + from vllm.attention.backends.registry import AttentionBackendEnum + from vllm.config.load import LoadConfig + from vllm.config.parallel import ParallelConfig + from vllm.model_executor.layers.quantization import QuantizationMethods + from vllm.v1.sample.logits_processor import LogitsProcessor +else: + PretrainedConfig = Any + + AttentionBackendEnum = Any + me_quant = LazyLoader( + "model_executor", globals(), "vllm.model_executor.layers.quantization" + ) + me_models = LazyLoader("model_executor", globals(), "vllm.model_executor.models") + LoadConfig = Any + ParallelConfig = Any + QuantizationMethods = Any + LogitsProcessor = Any + +logger = init_logger(__name__) + +RunnerOption = Literal["auto", RunnerType] +ConvertType = Literal["none", "embed", "classify", "reward"] +ConvertOption = Literal["auto", ConvertType] +TaskOption = Literal[ + "auto", + "generate", + "embedding", + "embed", + "classify", + "score", + "reward", + "transcription", + "draft", +] +TokenizerMode = Literal["auto", "slow", "mistral", "custom"] +ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] +LogprobsMode = Literal[ + "raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs" +] 
+HfOverrides = dict[str, Any] | Callable[[PretrainedConfig], PretrainedConfig] +ModelImpl = Literal["auto", "vllm", "transformers", "terratorch"] +LayerBlockType = Literal["attention", "linear_attention", "mamba"] + +_RUNNER_TASKS: dict[RunnerType, list[TaskOption]] = { + "generate": ["generate", "transcription"], + "pooling": ["embedding", "embed", "classify", "score", "reward"], + "draft": ["draft"], +} + +_RUNNER_CONVERTS: dict[RunnerType, list[ConvertType]] = { + "generate": [], + "pooling": ["embed", "classify", "reward"], + "draft": [], +} + + +@config +@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) +class ModelConfig: + """Configuration for the model.""" + + model: str = "Qwen/Qwen3-0.6B" + """Name or path of the Hugging Face model to use. It is also used as the + content for `model_name` tag in metrics output when `served_model_name` is + not specified.""" + runner: RunnerOption = "auto" + """The type of model runner to use. Each vLLM instance only supports one + model runner, even if the same model can be used for multiple types.""" + convert: ConvertOption = "auto" + """Convert the model using adapters defined in + [vllm.model_executor.models.adapters][]. The most common use case is to + adapt a text generation model to be used for pooling tasks.""" + task: TaskOption | None = None + """[DEPRECATED] The task to use the model for. If the model supports more + than one model runner, this is used to select which model runner to run. + + Note that the model may support other tasks using the same model runner. + """ + tokenizer: SkipValidation[str] = None # type: ignore + """Name or path of the Hugging Face tokenizer to use. 
If unspecified, model + name or path will be used.""" + tokenizer_mode: TokenizerMode = "auto" + """Tokenizer mode:\n + - "auto" will use the fast tokenizer if available.\n + - "slow" will always use the slow tokenizer.\n + - "mistral" will always use the tokenizer from `mistral_common`.\n + - "custom" will use --tokenizer to select the preregistered tokenizer.""" + trust_remote_code: bool = False + """Trust remote code (e.g., from HuggingFace) when downloading the model + and tokenizer.""" + dtype: ModelDType | torch.dtype = "auto" + """Data type for model weights and activations:\n + - "auto" will use FP16 precision for FP32 and FP16 models, and BF16 + precision for BF16 models.\n + - "half" for FP16. Recommended for AWQ quantization.\n + - "float16" is the same as "half".\n + - "bfloat16" for a balance between precision and range.\n + - "float" is shorthand for FP32 precision.\n + - "float32" for FP32 precision.""" + seed: int | None = None + """Random seed for reproducibility. Initialized to None in V0, but + initialized to 0 in V1.""" + hf_config: PretrainedConfig = field(init=False) + """The Hugging Face config of the model.""" + hf_text_config: PretrainedConfig = field(init=False) + """The Hugging Face config of the text model (same as hf_config for text models).""" + hf_config_path: str | None = None + """Name or path of the Hugging Face config to use. If unspecified, model + name or path will be used.""" + allowed_local_media_path: str = "" + """Allowing API requests to read local images or videos from directories + specified by the server file system. This is a security risk. Should only + be enabled in trusted environments.""" + allowed_media_domains: list[str] | None = None + """If set, only media URLs that belong to this domain can be used for + multi-modal inputs. """ + revision: str | None = None + """The specific model version to use. It can be a branch name, a tag name, + or a commit id. 
If unspecified, will use the default version.""" + code_revision: str | None = None + """The specific revision to use for the model code on the Hugging Face Hub. + It can be a branch name, a tag name, or a commit id. If unspecified, will + use the default version.""" + tokenizer_revision: str | None = None + """The specific revision to use for the tokenizer on the Hugging Face Hub. + It can be a branch name, a tag name, or a commit id. If unspecified, will + use the default version.""" + max_model_len: SkipValidation[int] = None # type: ignore + """Model context length (prompt and output). If unspecified, will be + automatically derived from the model config. + + When passing via `--max-model-len`, supports k/m/g/K/M/G in human-readable + format. Examples:\n + - 1k -> 1000\n + - 1K -> 1024\n + - 25.6k -> 25,600""" + spec_target_max_model_len: int | None = None + """Specify the maximum length for spec decoding draft models.""" + quantization: SkipValidation[QuantizationMethods | None] = None + """Method used to quantize the weights. If `None`, we first check the + `quantization_config` attribute in the model config file. If that is + `None`, we assume the model weights are not quantized and use `dtype` to + determine the data type of the weights.""" + enforce_eager: bool = False + """Whether to always use eager-mode PyTorch. If True, we will disable CUDA + graph and always execute the model in eager mode. If False, we will use + CUDA graph and eager execution in hybrid for maximal performance and + flexibility.""" + max_logprobs: int = 20 + """Maximum number of log probabilities to return when `logprobs` is + specified in `SamplingParams`. The default value comes the default for the + OpenAI Chat Completions API. -1 means no cap, i.e. all (output_length * + vocab_size) logprobs are allowed to be returned and it may cause OOM.""" + logprobs_mode: LogprobsMode = "raw_logprobs" + """Indicates the content returned in the logprobs and prompt_logprobs. 
+ Supported mode: + 1) raw_logprobs, 2) processed_logprobs, 3) raw_logits, 4) processed_logits. + Raw means the values before applying any logit processors, like bad words. + Processed means the values after applying all processors, including + temperature and top_k/top_p. + """ + disable_sliding_window: bool = False + """Whether to disable sliding window. If True, we will disable the sliding + window functionality of the model, capping to sliding window size. If the + model does not support sliding window, this argument is ignored.""" + disable_cascade_attn: bool = False + """Disable cascade attention for V1. While cascade attention does not + change the mathematical correctness, disabling it could be useful for + preventing potential numerical issues. Note that even if this is set to + False, cascade attention will be only used when the heuristic tells that + it's beneficial.""" + skip_tokenizer_init: bool = False + """Skip initialization of tokenizer and detokenizer. Expects valid + `prompt_token_ids` and `None` for prompt from the input. The generated + output will contain token ids.""" + enable_prompt_embeds: bool = False + """If `True`, enables passing text embeddings as inputs via the + `prompt_embeds` key. + + WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed. + Only enable this flag for trusted users!""" + served_model_name: str | list[str] | None = None + """The model name(s) used in the API. If multiple names are provided, the + server will respond to any of the provided names. The model name in the + model field of a response will be the first name in this list. If not + specified, the model name will be the same as the `--model` argument. 
Noted + that this name(s) will also be used in `model_name` tag content of + prometheus metrics, if multiple names provided, metrics tag will take the + first one.""" + config_format: str | ConfigFormat = "auto" + """The format of the model config to load:\n + - "auto" will try to load the config in hf format if available else it + will try to load in mistral format.\n + - "hf" will load the config in hf format.\n + - "mistral" will load the config in mistral format.""" + hf_token: bool | str | None = None + """The token to use as HTTP bearer authorization for remote files . If + `True`, will use the token generated when running `huggingface-cli login` + (stored in `~/.huggingface`).""" + hf_overrides: HfOverrides = field(default_factory=dict) + """If a dictionary, contains arguments to be forwarded to the Hugging Face + config. If a callable, it is called to update the HuggingFace config.""" + logits_processor_pattern: str | None = None + """Optional regex pattern specifying valid logits processor qualified names + that can be passed with the `logits_processors` extra completion argument. + Defaults to `None`, which allows no processors.""" + generation_config: str = "auto" + """The folder path to the generation config. Defaults to `"auto"`, the + generation config will be loaded from model path. If set to `"vllm"`, no + generation config is loaded, vLLM defaults will be used. If set to a folder + path, the generation config will be loaded from the specified folder path. + If `max_new_tokens` is specified in generation config, then it sets a + server-wide limit on the number of output tokens for all requests.""" + override_generation_config: dict[str, Any] = field(default_factory=dict) + """Overrides or sets generation config. e.g. `{"temperature": 0.5}`. If + used with `--generation-config auto`, the override parameters will be + merged with the default config from the model. 
If used with + `--generation-config vllm`, only the override parameters are used.""" + enable_sleep_mode: bool = False + """Enable sleep mode for the engine (only cuda and + hip platforms are supported).""" + model_impl: str | ModelImpl = "auto" + """Which implementation of the model to use:\n + - "auto" will try to use the vLLM implementation, if it exists, and fall + back to the Transformers implementation if no vLLM implementation is + available.\n + - "vllm" will use the vLLM model implementation.\n + - "transformers" will use the Transformers model implementation.\n + - "terratorch" will use the TerraTorch model implementation. + """ + override_attention_dtype: str | None = None + """Override dtype for attention""" + logits_processors: list[str | type[LogitsProcessor]] | None = None + """One or more logits processors' fully-qualified class names or class + definitions""" + io_processor_plugin: str | None = None + """IOProcessor plugin name to load at model startup""" + + # Pooler config + pooler_config: PoolerConfig | None = None + """Pooler config which controls the behaviour of output pooling in pooling + models.""" + override_pooler_config: dict | PoolerConfig | None = None + """[DEPRECATED] Use `pooler_config` instead. This field will be removed in + v0.12.0 or v1.0.0, whichever is sooner.""" + + # Multimodal config and init vars + multimodal_config: MultiModalConfig | None = None + """Configuration for multimodal model. 
If `None`, this will be inferred + from the architecture of `self.model`.""" + limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None + enable_mm_embeds: InitVar[bool | None] = None + media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None + mm_processor_kwargs: InitVar[dict[str, Any] | None] = None + mm_processor_cache_gb: InitVar[float | None] = None + mm_processor_cache_type: InitVar[MMCacheType | None] = None + mm_shm_cache_max_object_size_mb: InitVar[int | None] = None + mm_encoder_tp_mode: InitVar[MMEncoderTPMode | None] = None + mm_encoder_attn_backend: InitVar[AttentionBackendEnum | str | None] = None + interleave_mm_strings: InitVar[bool | None] = None + skip_mm_profiling: InitVar[bool | None] = None + video_pruning_rate: InitVar[float | None] = None + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + factors: list[Any] = [] + factors.append(self.model) + factors.append(self.dtype) + factors.append(self.quantization) + factors.append(self.revision) + factors.append(self.code_revision) + factors.append(self.max_model_len) + factors.append(self.max_logprobs) + factors.append(self.disable_sliding_window) + factors.append(self.trust_remote_code) + factors.append(self.generation_config) + factors.append(self.model_impl) + factors.append(self.override_generation_config) + factors.append(self.video_pruning_rate) + factors.append(self.enable_prompt_embeds) + + # hf_config can control how the model looks! 
+ try: + hf_config_json = self.hf_config.to_json_string(use_diff=False) + except TypeError: + from transformers import PretrainedConfig + + from vllm.utils.jsontree import json_map_leaves + + # Handle nested HF configs with unserializable values gracefully + hf_config_json = ( + json.dumps( + json_map_leaves( + lambda v: v.to_dict() + if isinstance(v, PretrainedConfig) + else str(v), + self.hf_config.to_dict(), + ), + indent=2, + sort_keys=True, + ) + + "\n" + ) + + factors.append(hf_config_json) + + str_factors = str(factors) + assert_hashable(str_factors) + return hashlib.sha256(str(factors).encode()).hexdigest() + + def _update_nested( + self, + target: PretrainedConfig | dict[str, Any], + updates: dict[str, Any], + ) -> None: + """Recursively updates a config or dict with nested updates.""" + for key, value in updates.items(): + if isinstance(value, dict): + # Get the nested target + if isinstance(target, dict): + nested_target = target.get(key) + else: + nested_target = getattr(target, key, None) + + # If nested target exists and can be updated recursively + if nested_target is not None and ( + isinstance(nested_target, dict) + or hasattr(nested_target, "__dict__") + ): + self._update_nested(nested_target, value) + continue + + # Set the value (base case) + if isinstance(target, dict): + target[key] = value + else: + setattr(target, key, value) + + def _apply_dict_overrides( + self, + config: PretrainedConfig, + overrides: dict[str, Any], + ) -> None: + """Apply dict overrides, handling both nested configs and dict values.""" + from transformers import PretrainedConfig + + for key, value in overrides.items(): + attr = getattr(config, key, None) + if attr is not None and isinstance(attr, PretrainedConfig): + # It's a nested config - recursively update it + self._update_nested(attr, value) + else: + # It's a dict-valued parameter - set it directly + setattr(config, key, value) + + def __post_init__( + self, + # Multimodal config init vars + limit_mm_per_prompt: 
dict[str, int] | None, + enable_mm_embeds: bool | None, + media_io_kwargs: dict[str, dict[str, Any]] | None, + mm_processor_kwargs: dict[str, Any] | None, + mm_processor_cache_gb: float | None, + mm_processor_cache_type: MMCacheType | None, + mm_shm_cache_max_object_size_mb: int | None, + mm_encoder_tp_mode: MMEncoderTPMode | None, + mm_encoder_attn_backend: AttentionBackendEnum | str | None, + interleave_mm_strings: bool | None, + skip_mm_profiling: bool | None, + video_pruning_rate: float | None, + ) -> None: + # Set the default seed to 0 in V1. + # NOTE(woosuk): In V1, we use separate processes for workers (unless + # VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here + # doesn't affect the user process. However, without a consistent seed, + # different tensor parallel workers would sample different tokens, + # leading to inconsistent results. + if self.seed is None: + self.seed = 0 + if not envs.VLLM_ENABLE_V1_MULTIPROCESSING: + logger.warning( + "The global random seed is set to %d. Since " + "VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may " + "affect the random state of the Python process that " + "launched vLLM.", + self.seed, + ) + + # Keep set served_model_name before maybe_model_redirect(self.model) + self.served_model_name = get_served_model_name( + self.model, self.served_model_name + ) + self.model = maybe_model_redirect(self.model) + # The tokenizer is consistent with the model by default. 
+ if self.tokenizer is None: + self.tokenizer = self.model + if self.tokenizer_revision is None: + self.tokenizer_revision = self.revision + self.tokenizer = maybe_model_redirect(self.tokenizer) + + if isinstance(self.hf_config_path, str): + self.hf_config_path = maybe_model_redirect(self.hf_config_path) + + if callable(self.hf_overrides): + hf_overrides_kw = {} + hf_overrides_fn = self.hf_overrides + dict_overrides: dict[str, Any] = {} + else: + # Separate dict overrides from flat ones + # We'll determine how to apply dict overrides after loading the config + hf_overrides_kw = {} + dict_overrides = {} + for key, value in self.hf_overrides.items(): + if isinstance(value, dict): + dict_overrides[key] = value + else: + hf_overrides_kw[key] = value + hf_overrides_fn = None + + self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer) + + if ( + (backend := envs.VLLM_ATTENTION_BACKEND) + and backend == "FLASHINFER" + and find_spec("flashinfer") is None + ): + raise ValueError( + "VLLM_ATTENTION_BACKEND is set to FLASHINFER, but flashinfer " + "module was not found. See " + "https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile " # noqa: E501 + "for instructions on how to install it." 
+ ) + + from vllm.platforms import current_platform + + if self.override_attention_dtype is not None and not current_platform.is_rocm(): + warnings.warn( + "override-attention-dtype is set but not using ROCm platform", + stacklevel=2, + ) + + if self.enable_sleep_mode and not current_platform.is_sleep_mode_available(): + raise ValueError("Sleep mode is not supported on current platform.") + + hf_config = get_config( + self.hf_config_path or self.model, + self.trust_remote_code, + self.revision, + self.code_revision, + self.config_format, + hf_overrides_kw=hf_overrides_kw, + hf_overrides_fn=hf_overrides_fn, + ) + + self.hf_config = hf_config + if dict_overrides: + self._apply_dict_overrides(hf_config, dict_overrides) + self.hf_text_config = get_hf_text_config(self.hf_config) + self.attention_chunk_size = getattr( + self.hf_text_config, "attention_chunk_size", None + ) + self.encoder_config = self._get_encoder_config() + self.hf_image_processor_config = get_hf_image_processor_config( + self.model, hf_token=self.hf_token, revision=self.revision + ) + + architectures = self.architectures + registry = self.registry + is_generative_model = registry.is_text_generation_model(architectures, self) + is_pooling_model = registry.is_pooling_model(architectures, self) + + def _task_to_convert(task: TaskOption) -> ConvertType: + if task == "embedding" or task == "embed": + return "embed" + if task == "classify": + return "classify" + if task == "reward": + return "reward" + if task == "score": + new_task = self._get_default_pooling_task(architectures) + return "classify" if new_task == "classify" else "embed" + + return "none" + + if self.task is not None: + runner: RunnerOption = "auto" + convert: ConvertOption = "auto" + msg_prefix = ( + "The 'task' option has been deprecated and will be " + "removed in v0.13.0 or v1.0, whichever comes first." + ) + msg_hint = "Please remove this option." 
+ + is_generative_task = self.task in _RUNNER_TASKS["generate"] + is_pooling_task = self.task in _RUNNER_TASKS["pooling"] + + if is_generative_model and is_pooling_model: + if is_generative_task: + runner = "generate" + convert = "auto" + msg_hint = ( + "Please replace this option with `--runner " + "generate` to continue using this model " + "as a generative model." + ) + elif is_pooling_task: + runner = "pooling" + convert = "auto" + msg_hint = ( + "Please replace this option with `--runner " + "pooling` to continue using this model " + "as a pooling model." + ) + else: # task == "auto" + pass + elif is_generative_model or is_pooling_model: + if is_generative_task: + runner = "generate" + convert = "auto" + msg_hint = "Please remove this option" + elif is_pooling_task: + runner = "pooling" + convert = _task_to_convert(self.task) + msg_hint = ( + "Please replace this option with `--convert " + f"{convert}` to continue using this model " + "as a pooling model." + ) + else: # task == "auto" + pass + else: + debug_info = { + "architectures": architectures, + "is_generative_model": is_generative_model, + "is_pooling_model": is_pooling_model, + } + raise AssertionError( + "The model should be a generative or " + "pooling model when task is set to " + f"{self.task!r}. 
Found: {debug_info}" + ) + + self.runner = runner + self.convert = convert + + msg = f"{msg_prefix} {msg_hint}" + warnings.warn(msg, DeprecationWarning, stacklevel=2) + + self.runner_type = self._get_runner_type(architectures, self.runner) + self.convert_type = self._get_convert_type( + architectures, self.runner_type, self.convert + ) + + if self.runner_type == "generate" and not is_generative_model: + generate_converts = _RUNNER_CONVERTS["generate"] + if self.convert_type not in generate_converts: + # Currently we don't have any converters for generative models + raise ValueError("This model does not support `--runner generate`.") + if self.runner_type == "pooling" and not is_pooling_model: + pooling_converts = _RUNNER_CONVERTS["pooling"] + if self.convert_type not in pooling_converts: + convert_option = "<" + "|".join(pooling_converts) + ">" + raise ValueError( + "This model does not support `--runner pooling`. " + f"You can pass `--convert {convert_option} to adapt " + "it into a pooling model." + ) + + # Note: Initialize these attributes early because transformers fallback + # may fail to load dynamic modules in child processes + model_info, arch = registry.inspect_model_cls(architectures, self) + self._model_info = model_info + self._architecture = arch + logger.info("Resolved architecture: %s", arch) + + # Init pooler config if needed + if self.runner_type == "pooling": + if self.override_pooler_config is not None: + logger.warning_once( + "`override_pooler_config` is deprecated and will be " + "removed in v0.12.0 or v1.0.0, whichever is sooner. " + "Please use `pooler_config` instead." 
+ ) + + if isinstance(self.override_pooler_config, dict): + self.pooler_config = PoolerConfig(**self.override_pooler_config) + else: + self.pooler_config = self.override_pooler_config + + if self.pooler_config is None: + self.pooler_config = PoolerConfig() + + base_config = get_pooling_config(self.model, self.revision) + if base_config is not None: + # Only set values that are not overridden by the user + for k, v in base_config.items(): + if getattr(self.pooler_config, k) is None: + setattr(self.pooler_config, k, v) + + default_pooling_type = self._model_info.default_pooling_type + if self.pooler_config.pooling_type is None: + self.pooler_config.pooling_type = default_pooling_type + + self.dtype: torch.dtype = _get_and_verify_dtype( + self.model, + self.hf_config, + self.dtype, + is_pooling_model=self.runner_type == "pooling", + revision=self.revision, + ) + + self.original_max_model_len = self.max_model_len + self.max_model_len = self.get_and_verify_max_len(self.max_model_len) + # Init multimodal config if needed + if self._model_info.supports_multimodal: + if ( + mm_encoder_tp_mode == "data" + and not self._model_info.supports_multimodal_encoder_tp_data + ): + logger.warning_once( + "This model does not support `--mm-encoder-tp-mode data`. " + "Falling back to `--mm-encoder-tp-mode weights`." 
+ ) + mm_encoder_tp_mode = "weights" + + mm_config_kwargs = dict( + limit_per_prompt=limit_mm_per_prompt, + enable_mm_embeds=enable_mm_embeds, + media_io_kwargs=media_io_kwargs, + mm_processor_kwargs=mm_processor_kwargs, + mm_processor_cache_gb=mm_processor_cache_gb, + mm_processor_cache_type=mm_processor_cache_type, + mm_shm_cache_max_object_size_mb=mm_shm_cache_max_object_size_mb, + mm_encoder_tp_mode=mm_encoder_tp_mode, + mm_encoder_attn_backend=mm_encoder_attn_backend, + interleave_mm_strings=interleave_mm_strings, + skip_mm_profiling=skip_mm_profiling, + video_pruning_rate=video_pruning_rate, + ) + + mm_config_kwargs = { + k: v for k, v in mm_config_kwargs.items() if v is not None + } + + self.multimodal_config = MultiModalConfig(**mm_config_kwargs) + + if self.disable_sliding_window: + # Set after get_and_verify_max_len to ensure that max_model_len + # can be correctly capped to sliding window size + self.hf_text_config.sliding_window = None + + if not self.skip_tokenizer_init: + self._verify_tokenizer_mode() + + # Avoid running try_verify_and_update_config multiple times + self.config_updated = False + + self._verify_quantization() + self._verify_cuda_graph() + import os + enforce_cuda_graph = os.environ.get("VLLM_ENFORCE_CUDA_GRAPH",None) + if enforce_cuda_graph is not None and enforce_cuda_graph in ["1", "y", "Y"]: + self.enforce_eager = False + else: + self.enforce_eager = True + logger.warning_once( + "Please export VLLM_ENFORCE_CUDA_GRAPH=1 to enable cuda graph. 
" + "For now, cuda graph is not used and --enforce-eager is disabled ," + "we are trying to use cuda graph as the default mode") + self._verify_bnb_config() + + @field_validator("quantization", mode="before") + @classmethod + def validate_quantization_before(cls, value: Any) -> Any: + if isinstance(value, str): + return value.lower() + return value + + @model_validator(mode="after") + def validate_model_config_after(self: "ModelConfig") -> "ModelConfig": + if not isinstance(self.tokenizer, str): + raise ValueError("tokenizer must be a string after __post_init__.") + if not isinstance(self.max_model_len, int): + raise ValueError("max_model_len must be an integer after __post_init__.") + return self + + def _get_transformers_backend_cls(self) -> str: + """Determine which Transformers modeling backend class will be used if + `model_impl` is set to `transformers` or `auto`.""" + cls = "Transformers" + # If 'hf_config != hf_text_config' it's a nested config, i.e. multimodal + cls += "MultiModal" if self.hf_config != self.hf_text_config else "" + cls += "MoE" if self.get_num_experts() > 1 else "" + # Check if the architecture we're wrapping has defaults + runner = None + task = None + if defaults := try_match_architecture_defaults(self.architectures[0]): + _, (runner, task) = defaults + # User specified value take precedence + if self.runner != "auto": + runner = self.runner + # Only consider Transformers modeling backend pooling classes if we're wrapping + # an architecture that defaults to pooling. Otherwise, we return the LM class + # and use adapters. 
+ if runner == "pooling" and task in {"embed", "classify"}: + if task == "embed": + cls += "EmbeddingModel" + elif task == "classify": + cls += "ForSequenceClassification" + else: + cls += "ForCausalLM" + return cls + + def using_transformers_backend(self) -> bool: + """Check if the model is using the Transformers modeling backend class.""" + used_cls = self._model_info.architecture + transformers_backend_cls = self._get_transformers_backend_cls() + return used_cls == transformers_backend_cls + + @property + def registry(self): + return me_models.ModelRegistry + + @property + def architectures(self) -> list[str]: + return getattr(self.hf_config, "architectures", []) + + @property + def architecture(self) -> str: + """The architecture vllm actually used.""" + return self._architecture + + def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> None: + """Pull model/tokenizer from Object Storage to temporary + directory when needed. + + Args: + model: Model name or path + tokenizer: Tokenizer name or path + """ + + if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)): + return + + if is_runai_obj_uri(model): + object_storage_model = ObjectStorageModel(url=model) + object_storage_model.pull_files( + model, allow_pattern=["*.model", "*.py", "*.json"] + ) + self.model_weights = model + self.model = object_storage_model.dir + + # If tokenizer is same as model, download to same directory + if model == tokenizer: + object_storage_model.pull_files( + model, + ignore_pattern=[ + "*.pt", + "*.safetensors", + "*.bin", + "*.tensors", + "*.pth", + ], + ) + self.tokenizer = object_storage_model.dir + return + + # Only download tokenizer if needed and not already handled + if is_runai_obj_uri(tokenizer): + object_storage_tokenizer = ObjectStorageModel(url=tokenizer) + object_storage_tokenizer.pull_files( + model, + ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"], + ) + self.tokenizer = object_storage_tokenizer.dir + + def 
_get_encoder_config(self): + return get_sentence_transformer_tokenizer_config(self.model, self.revision) + + def _verify_tokenizer_mode(self) -> None: + tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower()) + if tokenizer_mode not in get_args(TokenizerMode): + raise ValueError( + f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be " + f"one of {get_args(TokenizerMode)}." + ) + self.tokenizer_mode = tokenizer_mode + + def _get_default_runner_type( + self, + architectures: list[str], + ) -> RunnerType: + registry = self.registry + + # Some Sentence Transformers models use *ForCausalLM archs + if get_pooling_config(self.model, self.revision): + return "pooling" + + for arch in architectures: + if arch in registry.get_supported_archs(): + if registry.is_pooling_model(architectures, self): + return "pooling" + if registry.is_text_generation_model(architectures, self): + return "generate" + + match = try_match_architecture_defaults(arch) + if match: + _, (runner_type, _) = match + return runner_type + + return "generate" + + def _get_runner_type( + self, + architectures: list[str], + runner: RunnerOption, + ) -> RunnerType: + if runner != "auto": + return runner + + runner_type = self._get_default_runner_type(architectures) + + # Don't log the most common case + if runner_type != "generate": + logger.info( + "Resolved `--runner auto` to `--runner %s`. 
" + "Pass the value explicitly to silence this message.", + runner_type, + ) + + return runner_type + + def _get_default_convert_type( + self, + architectures: list[str], + runner_type: RunnerType, + ) -> ConvertType: + registry = self.registry + + for arch in architectures: + if arch in registry.get_supported_archs(): + if runner_type == "generate" and registry.is_text_generation_model( + architectures, self + ): + return "none" + if runner_type == "pooling" and registry.is_pooling_model( + architectures, self + ): + return "none" + + match = try_match_architecture_defaults(arch, runner_type=runner_type) + if match: + _, (_, convert_type) = match + return convert_type + + # This is to handle Sentence Transformers models that use *ForCausalLM + # and also multi-modal pooling models which are not defined as + # Sentence Transformers models + if runner_type == "pooling": + return "embed" + + return "none" + + def _get_convert_type( + self, + architectures: list[str], + runner_type: RunnerType, + convert: ConvertOption, + ) -> ConvertType: + if convert != "auto": + return convert + + convert_type = self._get_default_convert_type(architectures, runner_type) + + # Don't log the most common case + if convert_type != "none": + logger.info( + "Resolved `--convert auto` to `--convert %s`. 
" + "Pass the value explicitly to silence this message.", + convert_type, + ) + + return convert_type + + def _get_default_pooling_task( + self, + architectures: list[str], + ) -> Literal["embed", "classify", "reward"]: + if self.registry.is_cross_encoder_model(architectures, self): + return "classify" + + for arch in architectures: + match = try_match_architecture_defaults(arch, runner_type="pooling") + if match: + _, (_, convert_type) = match + assert convert_type != "none" + return convert_type + + return "embed" + + def _parse_quant_hf_config(self, hf_config: PretrainedConfig): + quant_cfg = getattr(hf_config, "quantization_config", None) + if quant_cfg is None: + # compressed-tensors uses a "compression_config" key + quant_cfg = getattr(hf_config, "compression_config", None) + + else: + # Set quant_method for ModelOpt models. + producer_name = quant_cfg.get("producer", {}).get("name") + if producer_name == "modelopt": + quant_algo = quant_cfg.get("quantization", {}).get("quant_algo") + if quant_algo == "FP8": + quant_cfg["quant_method"] = "modelopt" + elif quant_algo == "NVFP4": + quant_cfg["quant_method"] = "modelopt_fp4" + elif quant_algo is not None: + raise ValueError(f"Unknown ModelOpt quant algo: {quant_algo}") + + return quant_cfg + + def _verify_quantization(self) -> None: + supported_quantization = me_quant.QUANTIZATION_METHODS + if self.quantization is not None: + self.quantization = cast(me_quant.QuantizationMethods, self.quantization) + + # Parse quantization method from the HF model config, if available. + quant_cfg = self._parse_quant_hf_config(self.hf_config) + if quant_cfg is None and ( + text_config := getattr(self.hf_config, "text_config", None) + ): + # Check the text config as well for multi-modal models. 
+ quant_cfg = self._parse_quant_hf_config(text_config) + + if quant_cfg is not None: + # Use the community standard 'quant_method' + quant_method = quant_cfg.get("quant_method", "").lower() + + # Normalize library names + quant_method = quant_method.replace( + "compressed_tensors", "compressed-tensors" + ) + + quant_cfg["quant_method"] = quant_method + + # Quantization methods which are overrides (i.e. they have a + # `override_quantization_method` method) must be checked in order + # of preference (this is particularly important for GPTQ). + overrides = [ + "bitblas", + "gptq_marlin_24", + "gptq_marlin", + "gptq_bitblas", + "awq_marlin", + "ipex", + "moe_wna16", + "modelopt", + "modelopt_fp4", + "petit_nvfp4", + # Ensure heavy backends are probed last to avoid unnecessary + # imports during override detection (e.g., MXFP4 imports Triton) + "mxfp4", + ] + quantization_methods = [ + q for q in supported_quantization if q not in overrides + ] + # Any custom overrides will be in quantization_methods so we place + # them at the start of the list so custom overrides have preference + # over the built-in ones. + quantization_methods = quantization_methods + overrides + + # Detect which checkpoint is it + for name in quantization_methods: + method = me_quant.get_quantization_config(name) + quantization_override = method.override_quantization_method( + quant_cfg, self.quantization + ) + if quantization_override is not None: + # Raise error if the override is not custom (custom would + # be in QUANTIZATION_METHODS but not QuantizationMethods) + # and hasn't been added to the overrides list. + if ( + name in get_args(me_quant.QuantizationMethods) + and name not in overrides + ): + raise ValueError( + f"Quantization method {name} is an override but " + "is has not been added to the `overrides` list " + "above. This is necessary to ensure that the " + "overrides are checked in order of preference." 
+ ) + quant_method = quantization_override + self.quantization = quantization_override + break + + quant_method = quant_method if quant_method != "" else None + # Verify quantization configurations. + if self.quantization is None: + self.quantization = quant_method + elif self.quantization != quant_method: + raise ValueError( + "Quantization method specified in the model config " + f"({quant_method}) does not match the quantization " + f"method specified in the `quantization` argument " + f"({self.quantization})." + ) + + if self.quantization is not None: + if self.quantization not in supported_quantization: + raise ValueError( + f"Unknown quantization method: {self.quantization}. Must " + f"be one of {supported_quantization}." + ) + from vllm.platforms import current_platform + + current_platform.verify_quantization(self.quantization) + + def _verify_cuda_graph(self) -> None: + # CUDAGraph capture not supported for encoder-decoder models on ROCm + unsupported_rocm = self.is_encoder_decoder + if unsupported_rocm and not self.enforce_eager and current_platform.is_rocm(): + logger.warning( + "CUDA graph is not supported for %s on ROCm yet, fallback " + "to eager mode.", + self.hf_config.model_type, + ) + self.enforce_eager = True + + def _verify_bnb_config(self) -> None: + """ + The current version of bitsandbytes (0.46.1) with 8-bit models does not + yet support CUDA graph. + # TODO Remove this when bitsandbytes supports. + """ + is_bitsandbytes = self.quantization == "bitsandbytes" + has_quantization_config = ( + getattr(self.hf_config, "quantization_config", None) is not None + ) + is_8bit = ( + self.hf_config.quantization_config.get("load_in_8bit", False) + if has_quantization_config + else False + ) + if all( + [ + is_bitsandbytes, + has_quantization_config, + is_8bit, + not self.enforce_eager, + ] + ): + logger.warning( + "CUDA graph is not supported on BitsAndBytes 8bit yet, " + "fallback to the eager mode." 
+ ) + + self.enforce_eager = True + + def _verify_with_expert_parallelism(self) -> None: + num_experts = self.get_num_experts() + if num_experts < 1: + raise ValueError( + "Number of experts in the model must be greater than 0 " + "when expert parallelism is enabled." + ) + + def verify_dual_chunk_attention_config( + self, + load_config: LoadConfig, + ) -> None: + if hasattr(self.hf_config, "dual_chunk_attention_config"): + # Try loading the sparse attention config + from vllm.model_executor.model_loader.weight_utils import ( + get_sparse_attention_config, + ) + + sparse_attn_config = get_sparse_attention_config(self, load_config) + if sparse_attn_config: + self.hf_config.dual_chunk_attention_config[ + "sparse_attention_config" + ] = sparse_attn_config + if ( + "sparse_attention_enabled" + not in self.hf_config.dual_chunk_attention_config + ): + self.hf_config.dual_chunk_attention_config[ + "sparse_attention_enabled" + ] = True + + def verify_with_parallel_config( + self, + parallel_config: ParallelConfig, + ) -> None: + if parallel_config.distributed_executor_backend == "external_launcher": + assert self.seed is not None, ( + "Seed must be set when using external launcher backend to " + "make sure sampling results are the same across workers." + ) + + total_num_attention_heads = getattr( + self.hf_text_config, "num_attention_heads", 0 + ) + tensor_parallel_size = parallel_config.tensor_parallel_size + if total_num_attention_heads % tensor_parallel_size != 0: + raise ValueError( + f"Total number of attention heads ({total_num_attention_heads})" + " must be divisible by tensor parallel size " + f"({tensor_parallel_size})." 
+ ) + + if parallel_config.enable_expert_parallel: + self._verify_with_expert_parallelism() + + pipeline_parallel_size = parallel_config.pipeline_parallel_size + if pipeline_parallel_size > 1 and not self.registry.is_pp_supported_model( + self.architectures, self + ): + raise NotImplementedError( + "Pipeline parallelism is not supported for this model. " + "Supported models implement the `SupportsPP` interface." + ) + + decode_context_parallel_size = parallel_config.decode_context_parallel_size + if decode_context_parallel_size > 1 and not self.use_mla: + total_num_kv_heads = self.get_total_num_kv_heads() + assert tensor_parallel_size > total_num_kv_heads, ( + f"tensor parallel size {tensor_parallel_size} must be greater " + f"than total num kv heads {total_num_kv_heads} when enable " + f"decode context parallel for GQA/MQA" + ) + + max_dcp_size = tensor_parallel_size // total_num_kv_heads + assert decode_context_parallel_size <= max_dcp_size, ( + f"decode context parallel size must less than or equal to " + f"(tensor parallel size {tensor_parallel_size} // total " + f"num kv heads {total_num_kv_heads}) = {max_dcp_size}, " + f"but got {decode_context_parallel_size}" + ) + + num_q_per_kv = total_num_attention_heads // total_num_kv_heads + assert num_q_per_kv % decode_context_parallel_size == 0, ( + f"Total number of q per kv attn heads ({num_q_per_kv})" + " must be divisible by dcp world size when enable " + "decode context parallel for GQA " + f"({parallel_config.decode_context_parallel_size})." 
+ ) + + def get_sliding_window(self) -> int | None: + """Get the sliding window size from the HF text config if present.""" + return getattr(self.hf_text_config, "sliding_window", None) + + def get_vocab_size(self) -> int: + return getattr(self.hf_text_config, "vocab_size", 0) + + def get_hidden_size(self) -> int: + return getattr(self.hf_text_config, "hidden_size", 0) + + @property + def is_deepseek_mla(self) -> bool: + if not hasattr(self.hf_text_config, "model_type"): + return False + elif self.hf_text_config.model_type in ( + "deepseek_v2", + "deepseek_v3", + "deepseek_v32", + "deepseek_mtp", + "kimi_k2", + "kimi_linear", + "longcat_flash", + "pangu_ultra_moe", + "pangu_ultra_moe_mtp", + ): + return self.hf_text_config.kv_lora_rank is not None + elif self.hf_text_config.model_type == "eagle": + # if the model is an EAGLE module, check for the + # underlying architecture + return ( + self.hf_text_config.model.model_type + in ("deepseek_v2", "deepseek_v3", "deepseek_v32") + and self.hf_text_config.kv_lora_rank is not None + ) + return False + + def get_head_size(self) -> int: + # TODO remove hard code + if self.is_deepseek_mla: + qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim", 0) + if self.use_mla: + return self.hf_text_config.kv_lora_rank + qk_rope_head_dim + else: + qk_nope_head_dim = getattr(self.hf_text_config, "qk_nope_head_dim", 0) + if qk_rope_head_dim and qk_nope_head_dim: + return qk_rope_head_dim + qk_nope_head_dim + + if hasattr(self.hf_text_config, "model_type") and ( + self.hf_text_config.model_type == "zamba2" + ): + return self.hf_text_config.attention_head_dim + + if self.is_attention_free: + return 0 + + # NOTE: Some configs may set head_dim=None in the config + if getattr(self.hf_text_config, "head_dim", None) is not None: + return self.hf_text_config.head_dim + + # NOTE: Some models (such as PLaMo2.1) use `hidden_size_per_head` + if getattr(self.hf_text_config, "hidden_size_per_head", None) is not None: + return 
self.hf_text_config.hidden_size_per_head + + # FIXME(woosuk): This may not be true for all models. + return ( + self.hf_text_config.hidden_size // self.hf_text_config.num_attention_heads + ) + + def get_total_num_kv_heads(self) -> int: + """Returns the total number of KV heads.""" + # For GPTBigCode & Falcon: + # NOTE: for falcon, when new_decoder_architecture is True, the + # multi_query flag is ignored and we use n_head_kv for the number of + # KV heads. + falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"] + new_decoder_arch_falcon = ( + self.hf_config.model_type in falcon_model_types + and getattr(self.hf_config, "new_decoder_architecture", False) + ) + if not new_decoder_arch_falcon and getattr( + self.hf_text_config, "multi_query", False + ): + # Multi-query attention, only one KV head. + # Currently, tensor parallelism is not supported in this case. + return 1 + + # For DBRX and MPT + if self.hf_config.model_type == "mpt": + if "kv_n_heads" in self.hf_config.attn_config: + return self.hf_config.attn_config["kv_n_heads"] + return self.hf_config.num_attention_heads + if self.hf_config.model_type == "dbrx": + return getattr( + self.hf_config.attn_config, + "kv_n_heads", + self.hf_config.num_attention_heads, + ) + + if self.hf_config.model_type == "nemotron-nas": + for block in self.hf_config.block_configs: + if not block.attention.no_op: + return ( + self.hf_config.num_attention_heads + // block.attention.n_heads_in_group + ) + + raise RuntimeError("Couldn't determine number of kv heads") + + if self.is_attention_free: + return 0 + + attributes = [ + # For Falcon: + "n_head_kv", + "num_kv_heads", + # For LLaMA-2: + "num_key_value_heads", + # For ChatGLM: + "multi_query_group_num", + ] + for attr in attributes: + num_kv_heads = getattr(self.hf_text_config, attr, None) + if num_kv_heads is not None: + return num_kv_heads + + # For non-grouped-query attention models, the number of KV heads is + # equal to the number of attention heads. 
+ return self.hf_text_config.num_attention_heads + + def get_num_kv_heads(self, parallel_config: ParallelConfig) -> int: + """Returns the number of KV heads per GPU.""" + if self.use_mla: + # When using MLA during decode it becomes MQA + return 1 + + total_num_kv_heads = self.get_total_num_kv_heads() + # If tensor parallelism is used, we divide the number of KV heads by + # the tensor parallel size. We will replicate the KV heads in the + # case where the number of KV heads is smaller than the tensor + # parallel size so each GPU has at least one KV head. + return max(1, total_num_kv_heads // parallel_config.tensor_parallel_size) + + def get_num_attention_heads(self, parallel_config: ParallelConfig) -> int: + num_heads = getattr(self.hf_text_config, "num_attention_heads", 0) + return num_heads // parallel_config.tensor_parallel_size + + def get_num_experts(self) -> int: + """Returns the number of experts in the model.""" + num_expert_names = [ + "num_experts", # Jamba + "moe_num_experts", # Dbrx + "n_routed_experts", # DeepSeek + "num_local_experts", # Mixtral + ] + num_experts = getattr_iter(self.hf_text_config, num_expert_names, 0) + if isinstance(num_experts, list): + # Ernie VL's remote code uses list[int]... + # The values are always the same so we just take the first one. 
+ return num_experts[0] + # Coerce to 0 if explicitly set to None + return num_experts or 0 + + def get_layers_start_end_indices( + self, parallel_config: ParallelConfig + ) -> tuple[int, int]: + from vllm.distributed.utils import get_pp_indices + + if ( + self.hf_text_config.model_type == "deepseek_mtp" + or self.hf_config.model_type == "mimo_mtp" + or self.hf_config.model_type == "glm4_moe_mtp" + or self.hf_config.model_type == "ernie_mtp" + or self.hf_config.model_type == "qwen3_next_mtp" + or self.hf_config.model_type == "pangu_ultra_moe_mtp" + ): + total_num_hidden_layers = getattr( + self.hf_text_config, "num_nextn_predict_layers", 0 + ) + elif self.hf_config.model_type == "longcat_flash_mtp": + total_num_hidden_layers = getattr( + self.hf_text_config, "num_nextn_predict_layers", 1 + ) + else: + total_num_hidden_layers = getattr( + self.hf_text_config, "num_hidden_layers", 0 + ) + # the layout order is: DP x PP x TP + pp_rank = ( + parallel_config.rank // parallel_config.tensor_parallel_size + ) % parallel_config.pipeline_parallel_size + pp_size = parallel_config.pipeline_parallel_size + start, end = get_pp_indices(total_num_hidden_layers, pp_rank, pp_size) + return start, end + + def get_num_layers(self, parallel_config: ParallelConfig) -> int: + start, end = self.get_layers_start_end_indices(parallel_config) + return end - start + + def get_num_layers_by_block_type( + self, + parallel_config: ParallelConfig, + block_type: LayerBlockType = "attention", + ) -> int: + # This function relies on 'layers_block_type' in hf_config, + # for w/o this attribute, we will need to have workarounds like so + attn_block_type = block_type == "attention" + is_transformer = ( + not self.is_hybrid and not self.has_noops and not self.is_attention_free + ) + start, end = self.get_layers_start_end_indices(parallel_config) + + if is_transformer: + # Handle the basic case first + return end - start if attn_block_type else 0 + elif self.is_attention_free: + # Attention free + # Note 
that this code assumes there + # is only one type of attention-free block type. + return 0 if attn_block_type else end - start + elif self.has_noops: + block_configs = self.hf_config.block_configs + return sum(not bc.attention.no_op for bc in block_configs[start:end]) + else: + # Hybrid model Jamba + layers_block_type_value = getattr( + self.hf_text_config, "layers_block_type", None + ) + if layers_block_type_value is not None: + if hasattr(self.hf_text_config, "model_type") and ( + self.hf_text_config.model_type == "zamba2" + ): + if attn_block_type: + return sum( + t == "hybrid" for t in layers_block_type_value[start:end] + ) + else: + return self.get_num_layers(parallel_config) + return sum(t == block_type for t in layers_block_type_value[start:end]) + + # Hybrid model Minimax + attn_type_list = getattr(self.hf_config, "attn_type_list", None) + if attn_type_list: + return sum(t == 1 for t in attn_type_list[start:end]) + + # Hybrid model Qwen3Next + layer_types_value = getattr(self.hf_config, "layer_types", None) + if layer_types_value is not None: + if block_type == "attention": + return sum( + t == "full_attention" for t in layer_types_value[start:end] + ) + elif block_type == "linear_attention": + return sum( + t == "linear_attention" for t in layer_types_value[start:end] + ) + else: + return sum(t == block_type for t in layer_types_value[start:end]) + + if ( + layers_block_type_value is None + and attn_type_list is None + and layer_types_value is None + ): + raise ValueError( + "The model is an hybrid without a layers_block_type or an " + "attn_type_list, or a layer_types in the hf_config, " + f"cannot determine the num of {block_type} layers" + ) + + def get_mamba_chunk_size(self) -> int | None: + """ + Returns the mamba chunk size if it exists + """ + # used by e.g. Bamba, FalconH1, Granite, PLaMo2 + chunk_size = getattr(self.hf_text_config, "mamba_chunk_size", None) + if chunk_size is None: + # used by e.g. 
Mamba2, NemotronH, Zamba + chunk_size = getattr(self.hf_text_config, "chunk_size", None) + + # Since Mamba1 does not have a chunk notion + # we use a default chunk size of 1024. + if chunk_size is None: + chunk_size = 2048 + + return chunk_size + + def get_multimodal_config(self) -> MultiModalConfig: + """ + Get the multimodal configuration of the model. + + Raises: + ValueError: If the model is not multimodal. + """ + if self.multimodal_config is None: + raise ValueError("The model is not multimodal.") + + return self.multimodal_config + + def try_get_generation_config(self) -> dict[str, Any]: + """ + This method attempts to retrieve the non-default values of the + generation config for this model. + + The generation config can contain information about special tokens, as + well as sampling parameters. Which is why this method exists separately + to `get_diff_sampling_param`. + + Returns: + A dictionary containing the non-default generation config. + """ + if self.generation_config in {"auto", "vllm"}: + config = try_get_generation_config( + self.hf_config_path or self.model, + trust_remote_code=self.trust_remote_code, + revision=self.revision, + config_format=self.config_format, + ) + else: + config = try_get_generation_config( + self.generation_config, + trust_remote_code=self.trust_remote_code, + config_format=self.config_format, + ) + + if config is None: + return {} + + return config.to_diff_dict() + + def get_diff_sampling_param(self) -> dict[str, Any]: + """ + This method returns a dictionary containing the non-default sampling + parameters with `override_generation_config` applied. + + The default sampling parameters are: + + - vLLM's neutral defaults if `self.generation_config="vllm"` + - the model's defaults if `self.generation_config="auto"` + - as defined in `generation_config.json` if + `self.generation_config="path/to/generation_config/dir"` + + Returns: + A dictionary containing the non-default sampling parameters. 
+ """ + if self.generation_config == "vllm": + config = {} + else: + config = self.try_get_generation_config() + + # Overriding with given generation config + config.update(self.override_generation_config) + + available_params = [ + "repetition_penalty", + "temperature", + "top_k", + "top_p", + "min_p", + "max_new_tokens", + ] + if any(p in config for p in available_params): + diff_sampling_param = { + p: config.get(p) for p in available_params if config.get(p) is not None + } + # Huggingface definition of max_new_tokens is equivalent + # to vLLM's max_tokens + if "max_new_tokens" in diff_sampling_param: + diff_sampling_param["max_tokens"] = diff_sampling_param.pop( + "max_new_tokens" + ) + else: + diff_sampling_param = {} + + if diff_sampling_param: + logger.warning_once( + "Default sampling parameters have been overridden by the " + "model's Hugging Face generation config recommended from the " + "model creator. If this is not intended, please relaunch " + "vLLM instance with `--generation-config vllm`." 
+ ) + return diff_sampling_param + + @property + def is_encoder_decoder(self) -> bool: + """Extract the HF encoder/decoder model flag.""" + return is_encoder_decoder(self.hf_config) + + @property + def uses_alibi(self) -> bool: + cfg = self.hf_text_config + + return ( + getattr(cfg, "alibi", False) # Falcon + or "BloomForCausalLM" in self.architectures # Bloom + or getattr(cfg, "position_encoding_type", "") == "alibi" # codellm_1b_alibi + or ( + hasattr(cfg, "attn_config") # MPT + and ( + ( + isinstance(cfg.attn_config, dict) + and cfg.attn_config.get("alibi", False) + ) + or ( + not isinstance(cfg.attn_config, dict) + and getattr(cfg.attn_config, "alibi", False) + ) + ) + ) + ) + + @property + def uses_mrope(self) -> bool: + return uses_mrope(self.hf_config) + + @property + def is_multimodal_model(self) -> bool: + return self.multimodal_config is not None + + @property + def is_multimodal_raw_input_only_model(self) -> bool: + return self._model_info.supports_multimodal_raw_input_only + + @property + def is_cross_encoder(self) -> bool: + return ( + self._model_info.supports_cross_encoding or self.convert_type == "classify" + ) + + @property + def is_pp_supported(self) -> bool: + return self._model_info.supports_pp + + @property + def is_attention_free(self) -> bool: + return self._model_info.is_attention_free + + @property + def is_hybrid(self) -> bool: + # Handle granite-4.0-micro case which uses hybrid config but does not + # actually contain any non-attention layers. 
+ layer_types = getattr(self.hf_config, "layer_types", None) + if layer_types is not None and all( + layer == "attention" for layer in layer_types + ): + return False + return self._model_info.is_hybrid + + @property + def has_noops(self) -> bool: + return self._model_info.has_noops + + @property + def has_inner_state(self): + return self._model_info.has_inner_state + + @property + def supports_mamba_prefix_caching(self) -> bool: + return self._model_info.supports_mamba_prefix_caching + + @property + def use_mla(self) -> bool: + return self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE + + @property + def is_matryoshka(self) -> bool: + return bool(getattr(self.hf_config, "matryoshka_dimensions", None)) or getattr( + self.hf_config, "is_matryoshka", False + ) + + @property + def matryoshka_dimensions(self): + return getattr(self.hf_config, "matryoshka_dimensions", None) + + @property + def use_pad_token(self) -> bool: + # cross_encoder models defaults to using pad_token. + # `llm as reranker` models defaults to not using pad_token. + return getattr(self.hf_config, "use_pad_token", True) + + @property + def head_dtype(self) -> torch.dtype: + """ + "head" refers to the last Linear layer(s) of an LLM, + such as the lm_head in a generation model, + or the score or classifier in a classification model. + + `head_dtype` currently only supports pooling models.\n + - The pooling model defaults to using fp32 head, + you can use --hf-overrides '{"head_dtype": "model"}' to disable it. + """ + + head_dtype = _get_head_dtype( + config=self.hf_config, dtype=self.dtype, runner_type=self.runner_type + ) + + if self.runner_type != "pooling" and head_dtype != self.dtype: + logger.warning_once( + "`head_dtype` currently only supports pooling models." 
+ "fallback to model dtype [%s].", + self.dtype, + ) + return self.dtype + + if head_dtype not in current_platform.supported_dtypes: + logger.warning_once( + "The current platform does not support [%s] head dtype, " + "fallback to model dtype [%s].", + head_dtype, + self.dtype, + ) + return self.dtype + + logger.debug_once("head dtype: %s", head_dtype) + return head_dtype + + @property + def hidden_size(self): + if hasattr(self.hf_config, "hidden_size"): + return self.hf_config.hidden_size + text_config = self.hf_config.get_text_config() + return text_config.hidden_size + + @property + def embedding_size(self): + dense_modules = try_get_dense_modules(self.model, revision=self.revision) + if dense_modules is not None: + return dense_modules[-1]["out_features"] + return self.hidden_size + + def get_and_verify_max_len(self, max_model_len: int): + # Consider max_model_len in tokenizer_config only when + # pooling models use absolute position_embedding. + tokenizer_config = None + if ( + self.runner_type == "pooling" + and getattr(self.hf_config, "position_embedding_type", "") == "absolute" + ): + tokenizer_config = try_get_tokenizer_config( + self.tokenizer, + trust_remote_code=self.trust_remote_code, + revision=self.tokenizer_revision, + ) + max_model_len = _get_and_verify_max_len( + hf_config=self.hf_text_config, + tokenizer_config=tokenizer_config, + max_model_len=max_model_len, + disable_sliding_window=self.disable_sliding_window, + sliding_window=self.get_sliding_window(), + spec_target_max_model_len=self.spec_target_max_model_len, + encoder_config=self.encoder_config, + ) + logger.info("Using max model len %s", max_model_len) + return max_model_len + + +def get_served_model_name(model: str, served_model_name: str | list[str] | None): + """ + If the input is a non-empty list, the first model_name in + `served_model_name` is taken. + If the input is a non-empty string, it is used directly. 
+ For cases where the input is either an empty string or an + empty list, the fallback is to use `self.model`. + """ + if not served_model_name: + return model + if isinstance(served_model_name, list): + return served_model_name[0] + return served_model_name + + +# Some model suffixes are based on auto classes from Transformers: +# https://huggingface.co/docs/transformers/en/model_doc/auto +# NOTE: Items higher on this list priority over lower ones +_SUFFIX_TO_DEFAULTS: list[tuple[str, tuple[RunnerType, ConvertType]]] = [ + ("ForCausalLM", ("generate", "none")), + ("ForConditionalGeneration", ("generate", "none")), + ("ChatModel", ("generate", "none")), + ("LMHeadModel", ("generate", "none")), + ("ForTextEncoding", ("pooling", "embed")), + ("EmbeddingModel", ("pooling", "embed")), + ("ForSequenceClassification", ("pooling", "classify")), + ("ForAudioClassification", ("pooling", "classify")), + ("ForImageClassification", ("pooling", "classify")), + ("ForVideoClassification", ("pooling", "classify")), + ("ClassificationModel", ("pooling", "classify")), + ("ForRewardModeling", ("pooling", "reward")), + ("RewardModel", ("pooling", "reward")), + # Let other `*Model`s take priority + ("Model", ("pooling", "embed")), +] + + +def iter_architecture_defaults(): + yield from _SUFFIX_TO_DEFAULTS + + +def try_match_architecture_defaults( + architecture: str, + *, + runner_type: RunnerType | None = None, + convert_type: ConvertType | None = None, +) -> tuple[str, tuple[RunnerType, ConvertType]] | None: + for suffix, ( + default_runner_type, + default_convert_type, + ) in iter_architecture_defaults(): + if ( + (runner_type is None or runner_type == default_runner_type) + and (convert_type is None or convert_type == default_convert_type) + and architecture.endswith(suffix) + ): + return suffix, (default_runner_type, default_convert_type) + + return None + + +_STR_DTYPE_TO_TORCH_DTYPE = { + "half": torch.float16, + "float16": torch.float16, + "float": torch.float32, + "float32": 
torch.float32, + "bfloat16": torch.bfloat16, +} + +# model_type -> reason +_FLOAT16_NOT_SUPPORTED_MODELS = { + "gemma2": "Numerical instability. Please use bfloat16 or float32 instead.", + "gemma3": "Numerical instability. Please use bfloat16 or float32 instead.", + "gemma3_text": "Numerical instability. Please use bfloat16 or float32 instead.", + "plamo2": "Numerical instability. Please use bfloat16 or float32 instead.", + "glm4": "Numerical instability. Please use bfloat16 or float32 instead.", +} + + +def _is_valid_dtype(model_type: str, dtype: torch.dtype): + if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16: # noqa: E501, SIM103 + return False + + return True + + +def _check_valid_dtype(model_type: str, dtype: torch.dtype): + if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16: + reason = _FLOAT16_NOT_SUPPORTED_MODELS[model_type] + raise ValueError( + f"The model type {model_type!r} does not support float16. Reason: {reason}" + ) + + return True + + +def _find_dtype( + model_id: str, + config: PretrainedConfig, + *, + revision: str | None, +): + # NOTE: getattr(config, "dtype", torch.float32) is not correct + # because config.dtype can be None. 
+ config_dtype = getattr(config, "dtype", None) + + # Fallbacks for multi-modal models if the root config + # does not define dtype + if config_dtype is None: + config_dtype = getattr(config.get_text_config(), "dtype", None) + if config_dtype is None and hasattr(config, "vision_config"): + config_dtype = getattr(config.vision_config, "dtype", None) + if config_dtype is None and hasattr(config, "encoder_config"): + config_dtype = getattr(config.encoder_config, "dtype", None) + + # Try to read the dtype of the weights if they are in safetensors format + if config_dtype is None: + repo_mt = try_get_safetensors_metadata(model_id, revision=revision) + + if repo_mt and (files_mt := repo_mt.files_metadata): + param_dtypes: set[torch.dtype] = { + _SAFETENSORS_TO_TORCH_DTYPE[dtype_str] + for file_mt in files_mt.values() + for dtype_str in file_mt.parameter_count + if dtype_str in _SAFETENSORS_TO_TORCH_DTYPE + } + + if param_dtypes: + return common_broadcastable_dtype(param_dtypes) + + if config_dtype is None: + config_dtype = torch.float32 + + return config_dtype + + +def _resolve_auto_dtype( + model_type: str, + config_dtype: torch.dtype, + *, + is_pooling_model: bool, +): + from vllm.platforms import current_platform + + supported_dtypes = [ + dtype + for dtype in current_platform.supported_dtypes + if _is_valid_dtype(model_type, dtype) + ] + + if is_pooling_model and torch.float16 in supported_dtypes: + preferred_dtype = torch.float16 + else: + preferred_dtype = supported_dtypes[0] + + # Downcast for float32 models + if config_dtype == torch.float32: + config_dtype = preferred_dtype + + if config_dtype in supported_dtypes: + return config_dtype + + # Ensure device compatibility + device_name = current_platform.get_device_name() + device_capability = current_platform.get_device_capability() + + if device_capability is None: + device_str = f"{device_name!r}" + else: + version_str = device_capability.as_version_str() + device_str = f"{device_name!r} (with compute capability 
{version_str})" + + logger.warning( + "Your device %s doesn't support %s. Falling back to %s for compatibility.", + device_str, + config_dtype, + preferred_dtype, + ) + + return preferred_dtype + + +def _get_and_verify_dtype( + model_id: str, + config: PretrainedConfig, + dtype: str | torch.dtype, + *, + is_pooling_model: bool, + revision: str | None = None, +) -> torch.dtype: + config_dtype = _find_dtype(model_id, config, revision=revision) + model_type = config.model_type + + if isinstance(dtype, str): + dtype = dtype.lower() + if dtype == "auto": + # Set default dtype from model config + torch_dtype = _resolve_auto_dtype( + model_type, + config_dtype, + is_pooling_model=is_pooling_model, + ) + else: + if dtype not in _STR_DTYPE_TO_TORCH_DTYPE: + raise ValueError(f"Unknown dtype: {dtype!r}") + torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype] + elif isinstance(dtype, torch.dtype): + torch_dtype = dtype + else: + raise ValueError(f"Unknown dtype: {dtype}") + + _check_valid_dtype(model_type, torch_dtype) + + if torch_dtype != config_dtype: + if torch_dtype == torch.float32: + # Upcasting to float32 is allowed. + logger.info("Upcasting %s to %s.", config_dtype, torch_dtype) + elif config_dtype == torch.float32: + # Downcasting from float32 to float16 or bfloat16 is allowed. + logger.info("Downcasting %s to %s.", config_dtype, torch_dtype) + else: + # Casting between float16 and bfloat16 is allowed with a warning. 
+ logger.warning("Casting %s to %s.", config_dtype, torch_dtype) + + return torch_dtype + + +def _get_head_dtype( + config: PretrainedConfig, dtype: torch.dtype, runner_type: str +) -> torch.dtype: + head_dtype: str | torch.dtype | None = getattr(config, "head_dtype", None) + + if head_dtype == "model": + return dtype + elif isinstance(head_dtype, str): + head_dtype = head_dtype.lower() + if head_dtype not in _STR_DTYPE_TO_TORCH_DTYPE: + raise ValueError(f"Unknown dtype: {head_dtype!r}") + return _STR_DTYPE_TO_TORCH_DTYPE[head_dtype] + elif isinstance(head_dtype, torch.dtype): + return head_dtype + elif head_dtype is None: + if torch.float32 not in current_platform.supported_dtypes: + return dtype + if runner_type == "pooling": + return torch.float32 + return dtype + else: + raise ValueError(f"Unknown dtype: {head_dtype}") + + +def _get_and_verify_max_len( + hf_config: PretrainedConfig, + tokenizer_config: dict | None, + max_model_len: int | None, + disable_sliding_window: bool, + sliding_window: int | None, + spec_target_max_model_len: int | None = None, + encoder_config: Any | None = None, +) -> int: + """Get and verify the model's maximum length.""" + derived_max_model_len = float("inf") + possible_keys = [ + # OPT + "max_position_embeddings", + # GPT-2 + "n_positions", + # MPT + "max_seq_len", + # ChatGLM2 + "seq_length", + # Command-R + "model_max_length", + # Whisper + "max_target_positions", + # Others + "max_sequence_length", + "max_seq_length", + "seq_len", + ] + # Choose the smallest "max_length" from the possible keys + max_len_key = None + for key in possible_keys: + max_len = getattr(hf_config, key, None) + if max_len is not None: + max_len_key = key if max_len < derived_max_model_len else max_len_key + derived_max_model_len = min(derived_max_model_len, max_len) + # For Command-R / Cohere, Cohere2 / Aya Vision models + if tmp_max_len := getattr(hf_config, "model_max_length", None): + max_len_key = "model_max_length" + derived_max_model_len = 
tmp_max_len + + # If sliding window is manually disabled, max_length should be less + # than the sliding window length in the model config. + if ( + disable_sliding_window + and sliding_window is not None + and sliding_window < derived_max_model_len + ): + max_len_key = "sliding_window" + derived_max_model_len = sliding_window + + # Consider model_max_length in tokenizer_config + if tokenizer_config: + tokenizer_model_max_length = tokenizer_config.get( + "model_max_length", derived_max_model_len + ) + derived_max_model_len = min(derived_max_model_len, tokenizer_model_max_length) + + # If none of the keys were found in the config, use a default and + # log a warning. + if derived_max_model_len == float("inf"): + if max_model_len is not None: + # If max_model_len is specified, we use it. + return max_model_len + + if spec_target_max_model_len is not None: + # If this is a speculative draft model, we use the max model len + # from the target model. + return spec_target_max_model_len + + default_max_len = 2048 + logger.warning( + "The model's config.json does not contain any of the following " + "keys to determine the original maximum length of the model: " + "%s. Assuming the model's maximum length is %d.", + possible_keys, + default_max_len, + ) + derived_max_model_len = default_max_len + + rope_scaling = getattr(hf_config, "rope_scaling", None) + # NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE + # scaling, so we skip applying the scaling factor again. + if rope_scaling is not None and "gemma3" not in hf_config.model_type: + # No need to consider "type" key because of patch_rope_scaling when + # loading HF config + rope_type = rope_scaling["rope_type"] + + if rope_type not in ("su", "longrope", "llama3"): + if disable_sliding_window: + # TODO(robertgshaw): Find a model that supports rope_scaling + # with sliding window to see if this case should be allowed. 
+ raise NotImplementedError( + "Disabling sliding window is not supported for models " + "with rope_scaling. Please raise an issue so we can " + "investigate." + ) + + # NOTE: rope_type == "default" does not define factor + # https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py + scaling_factor = rope_scaling.get("factor", 1.0) + + if rope_type == "yarn": + derived_max_model_len = rope_scaling["original_max_position_embeddings"] + derived_max_model_len *= scaling_factor + + if encoder_config and "max_seq_length" in encoder_config: + derived_max_model_len = encoder_config["max_seq_length"] + + # If the user didn't specify `max_model_len`, then use that derived from + # the model config as a default value. + if max_model_len is None: + # For LongRoPE, default to original_max_position_embeddings to avoid + # performance degradation for shorter sequences + if rope_scaling is not None and rope_scaling["rope_type"] == "longrope": + max_model_len = int( + getattr( + hf_config, "original_max_position_embeddings", derived_max_model_len + ) + ) + else: + max_model_len = int(derived_max_model_len) + max_model_len = current_platform.check_max_model_len(max_model_len) + + # If the user specified a max length, make sure it is smaller than the + # derived length from the HF model config. + elif max_model_len > derived_max_model_len: + # Some models might have a separate key for specifying model_max_length + # that will be bigger than derived_max_model_len. We compare user input + # with model_max_length and allow this override when it's smaller. + model_max_length = getattr(hf_config, "model_max_length", None) + if model_max_length is not None and max_model_len <= model_max_length: + if disable_sliding_window: + # TODO(robertgshaw): Find a model that has model_max_length + # with sliding window to see if this case should be allowed. 
+ raise NotImplementedError( + "Disabling sliding window is not supported for models " + "model_max_length in the config. Please raise an issue " + "so we can investigate." + ) + else: + msg = ( + f"User-specified max_model_len ({max_model_len}) is greater " + f"than the derived max_model_len ({max_len_key}=" + f"{derived_max_model_len} or model_max_length=" + f"{model_max_length} in model's config.json)." + ) + warning = ( + "VLLM_ALLOW_LONG_MAX_MODEL_LEN must be used with extreme " + "caution. If the model uses relative position encoding (RoPE), " + "positions exceeding derived_max_model_len lead to nan. If the " + "model uses absolute position encoding, positions exceeding " + "derived_max_model_len will cause a CUDA array out-of-bounds " + "error." + ) + if envs.VLLM_ALLOW_LONG_MAX_MODEL_LEN: + logger.warning_once("%s %s", msg, warning) + else: + raise ValueError( + f"{msg} To allow overriding this maximum, set " + f"the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1. {warning}" + ) + return int(max_model_len) diff --git a/config/multimodal.py b/config/multimodal.py new file mode 100644 index 0000000..9f62b35 --- /dev/null +++ b/config/multimodal.py @@ -0,0 +1,248 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from collections.abc import Mapping +from typing import TYPE_CHECKING, Any, Literal, TypeAlias + +from pydantic import ConfigDict, Field, field_validator, model_validator +from pydantic.dataclasses import dataclass + +from vllm.config.utils import config + +if TYPE_CHECKING: + from vllm.attention.backends.registry import AttentionBackendEnum +else: + AttentionBackendEnum = Any + + +@dataclass +class BaseDummyOptions: + """Base options for generating dummy data during profiling.""" + + count: int = Field(999, ge=0) + + +@dataclass(config=ConfigDict(extra="forbid")) +class VideoDummyOptions(BaseDummyOptions): + """Options for generating dummy video data during profiling.""" + + 
num_frames: int | None = Field(None, gt=0) + width: int | None = Field(None, gt=0) + height: int | None = Field(None, gt=0) + + +@dataclass(config=ConfigDict(extra="forbid")) +class ImageDummyOptions(BaseDummyOptions): + """Options for generating dummy image data during profiling.""" + + width: int | None = Field(None, gt=0) + height: int | None = Field(None, gt=0) + + +@dataclass(config=ConfigDict(extra="forbid")) +class AudioDummyOptions(BaseDummyOptions): + """Options for generating dummy audio data during profiling.""" + + length: int | None = Field(None, gt=0) + + +MMEncoderTPMode = Literal["weights", "data"] +MMCacheType = Literal["shm", "lru"] +DummyOptions: TypeAlias = ( + BaseDummyOptions | VideoDummyOptions | ImageDummyOptions | AudioDummyOptions +) + + +@config +@dataclass +class MultiModalConfig: + """Controls the behavior of multimodal models.""" + + limit_per_prompt: dict[str, DummyOptions] = Field(default_factory=dict) + """The maximum number of input items and options allowed per + prompt for each modality. + Defaults to 999 for each modality. + + Legacy format (count only): + {"image": 16, "video": 2} + + Configurable format (with options): + {"video": {"count": 1, "num_frames": 32, "width": 512, "height": 512}, + "image": {"count": 5, "width": 512, "height": 512}} + + Mixed format (combining both): + {"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512, + "height": 512}} + """ + enable_mm_embeds: bool = False + """If `True`, enables passing multimodal embeddings: + for `LLM` class, this refers to tensor inputs under `multi_modal_data`; + for the OpenAI-compatible server, this refers to chat messages with content + `"type": "*_embeds"`. + + WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed. + Only enable this flag for trusted users!""" + media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict) + """Additional args passed to process media inputs, keyed by modalities. 
+ For example, to set num_frames for video, set + `--media-io-kwargs '{"video": {"num_frames": 40} }'`""" + mm_processor_kwargs: dict[str, object] | None = None + """Arguments to be forwarded to the model's processor for multi-modal data, + e.g., image processor. Overrides for the multi-modal processor obtained + from `transformers.AutoProcessor.from_pretrained`. + + The available overrides depend on the model that is being run. + + For example, for Phi-3-Vision: + `{"num_crops": 4}`.""" + mm_processor_cache_gb: float = Field(default=4, ge=0) + """The size (in GiB) of the multi-modal processor cache, which is used to + avoid re-processing past multi-modal inputs. + + This cache is duplicated for each API process and engine core process, + resulting in a total memory usage of + `mm_processor_cache_gb * (api_server_count + data_parallel_size)`. + + Set to `0` to disable this cache completely (not recommended).""" + mm_processor_cache_type: MMCacheType = "lru" + """Type of cache to use for the multi-modal preprocessor/mapper. If `shm`, + use shared memory FIFO cache. If `lru`, use mirrored LRU cache.""" + mm_shm_cache_max_object_size_mb: int = Field(default=128, ge=0) + """Size limit (in MiB) for each object stored in the multi-modal processor + shared memory cache. Only effective when `mm_processor_cache_type` is + `"shm"`.""" + mm_encoder_tp_mode: MMEncoderTPMode = "weights" + """Indicates how to optimize multi-modal encoder inference using tensor + parallelism (TP). + + - `"weights"`: Within the same vLLM engine, split the weights of + each layer across TP ranks. (default TP behavior)\n + - `"data"`: Within the same vLLM engine, split the batched input data + across TP ranks to process the data in parallel, while hosting + the full weights on each TP rank. + This batch-level DP is not to be confused with API request-level + DP (which is controlled by `--data-parallel-size`). 
+ This is only supported on a per-model basis and falls back to + `"weights"` if the encoder does not support DP.""" + mm_encoder_attn_backend: AttentionBackendEnum | None = None + """Optional override for the multi-modal encoder attention backend when + using vision transformers. Accepts any value from + `vllm.attention.backends.registry.AttentionBackendEnum` (e.g. `FLASH_ATTN`).""" + interleave_mm_strings: bool = False + """Enable fully interleaved support for multimodal prompts, while using + --chat-template-content-format=string.""" + skip_mm_profiling: bool = False + """When enabled, skips multimodal memory profiling and only profiles with + language backbone model during engine initialization. + + This reduces engine startup time but shifts the responsibility to users for + estimating the peak memory usage of the activation of multimodal encoder and + embedding cache.""" + video_pruning_rate: float | None = Field(default=None, ge=0.0, lt=1.0) + """Sets pruning rate for video pruning via Efficient Video Sampling. + Value sits in range [0;1) and determines fraction of media tokens + from each video to be pruned. 
+ """ + + @field_validator("limit_per_prompt", mode="before") + @classmethod + def _validate_limit_per_prompt( + cls, value: dict[str, int | dict[str, int]] + ) -> dict[str, DummyOptions]: + for k, v in value.items(): + # Handle legacy format where only count is specified + if isinstance(v, int): + v = {"count": v} + # Convert to the appropriate DummyOptions subclass + if k == "video": + value[k] = VideoDummyOptions(**v) + elif k == "image": + value[k] = ImageDummyOptions(**v) + elif k == "audio": + value[k] = AudioDummyOptions(**v) + else: + value[k] = BaseDummyOptions(**v) + return value + + @field_validator("mm_encoder_attn_backend", mode="before") + @classmethod + def _validate_mm_encoder_attn_backend( + cls, value: str | AttentionBackendEnum | None + ) -> AttentionBackendEnum | None: + # We need to import the real type here (deferred to avoid circular import). + from vllm.attention.backends.registry import AttentionBackendEnum + + if value is None or isinstance(value, AttentionBackendEnum): + return value + + assert isinstance(value, str), ( + "mm_encoder_attn_backend must be a string or an AttentionBackendEnum." + ) + return AttentionBackendEnum[value.upper()] + + @model_validator(mode="after") + def _validate_multimodal_config(self): + if self.mm_processor_cache_type != "shm" and ( + self.mm_shm_cache_max_object_size_mb + != MultiModalConfig.mm_shm_cache_max_object_size_mb + ): + raise ValueError( + "'mm_shm_cache_max_object_size_mb' should only be set when " + "'mm_processor_cache_type' is 'shm'." + ) + return self + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. 
def get_limit_per_prompt(self, modality: str) -> int:
    """
    Return the maximum number of input items allowed per prompt
    for the given modality (backward compatible).

    The limit is the `count` of the entry stored under `modality` in
    `self.limit_per_prompt`.
    """
    options = self.limit_per_prompt.get(modality)
    # Unspecified modality is set to 999 by default
    return 999 if options is None else options.count
def is_multimodal_pruning_enabled(self):
    """Report whether `video_pruning_rate` is set to a value greater than 0."""
    rate = self.video_pruning_rate
    if rate is None:
        # No pruning rate configured at all.
        return False
    return rate > 0
def compute_hash(self) -> str:
    """
    WARNING: Whenever a new field is added to this config,
    ensure that it is included in the factors list if
    it affects the computation graph.

    Return a hash identifying every setting of this config that affects
    the structure of the computation graph from input ids/embeddings to
    the final hidden states, excluding anything before input
    ids/embeddings and after the final hidden states.
    """
    # This config does not affect the computation graph, so the factor
    # list is intentionally empty and the digest is constant.
    no_factors: list[Any] = []
    digest = hashlib.md5(usedforsecurity=False)
    digest.update(str(no_factors).encode())
    return digest.hexdigest()
@field_validator("collect_detailed_traces")
@classmethod
def _validate_collect_detailed_traces(
    cls, value: list[DetailedTraceModules] | None
) -> list[DetailedTraceModules] | None:
    """Expand the legacy comma-separated form of `collect_detailed_traces`.

    A single-element list whose only item contains a comma is split on
    the commas; every other value is returned unchanged.
    """
    if value is None or len(value) != 1:
        return value

    only_item = value[0]
    if "," not in only_item:
        return value

    # Legacy input: one comma-separated string instead of a list of strings.
    return cast(list[DetailedTraceModules], only_item.split(","))
@config
@dataclass
class EPLBConfig:
    """Configuration for Expert Parallel Load Balancing (EPLB)."""

    window_size: int = 1000
    """Window size for expert load recording."""
    step_interval: int = 3000
    """
    Interval for rearranging experts in expert parallelism.

    Note that if this is greater than the EPLB window size, only the metrics
    of the last `window_size` steps will be used for rearranging experts.
    """

    num_redundant_experts: int = Field(default=0, ge=0)
    """Number of redundant experts to use for expert parallelism."""

    log_balancedness: bool = False
    """
    Log the balancedness each step of expert parallelism.
    This is turned off by default since it will cause communication overhead.
    """
Enables running an AsyncLLM + and API server on a "per-node" basis where vLLM load balances + between local data parallel ranks, but an external LB balances + between vLLM nodes/replicas. Set explicitly in conjunction with + --data-parallel-start-rank.""" + enable_expert_parallel: bool = False + """Use expert parallelism instead of tensor parallelism for MoE layers.""" + enable_eplb: bool = False + """Enable expert parallelism load balancing for MoE layers.""" + eplb_config: EPLBConfig = Field(default_factory=EPLBConfig) + """Expert parallelism configuration.""" + expert_placement_strategy: ExpertPlacementStrategy = "linear" + """The expert placement strategy for MoE layers:\n + - "linear": Experts are placed in a contiguous manner. For example, with 4 + experts and 2 ranks, rank 0 will have experts [0, 1] and rank 1 will have + experts [2, 3].\n + - "round_robin": Experts are placed in a round-robin manner. For example, + with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1 + will have experts [1, 3]. This strategy can help improve load balancing + for grouped expert models with no redundant experts.""" + all2all_backend: ( + Literal[ + "naive", + "pplx", + "deepep_high_throughput", + "deepep_low_latency", + "allgather_reducescatter", + "flashinfer_all2allv", + ] + | None + ) = None + """All2All backend for MoE expert parallel communication. If not set, uses + the value from VLLM_ALL2ALL_BACKEND environment variable. Available options: + - "naive": Naive all2all implementation using broadcasts + - "allgather_reducescatter": All2all based on allgather and reducescatter + - "pplx": Use pplx kernels + - "deepep_high_throughput": Use deepep high-throughput kernels + - "deepep_low_latency": Use deepep low-latency kernels + - "flashinfer_all2allv": Use flashinfer alltoallv kernels for mnnvl""" + num_redundant_experts: int | None = None + """`num_redundant_experts` is deprecated and has been replaced with + `eplb_config.num_redundant_experts`. 
This will be removed in v0.12.0. + Please use `eplb_config.num_redundant_experts` instead.""" + eplb_window_size: int | None = None + """`eplb_window_size` is deprecated and has been replaced with + `eplb_config.window_size`. This will be removed in v0.12.0. + Please use `eplb_config.window_size` instead.""" + eplb_step_interval: int | None = None + """`eplb_step_interval` is deprecated and has been replaced with + `eplb_config.step_interval`. This will be removed in v0.12.0. + Please use `eplb_config.step_interval` instead.""" + eplb_log_balancedness: bool | None = None + """`eplb_log_balancedness` is deprecated and has been replaced with + `eplb_config.log_balancedness`. This will be removed in v0.12.0. + Please use `eplb_config.log_balancedness` instead.""" + + max_parallel_loading_workers: int | None = None + """Maximum number of parallel loading workers when loading model + sequentially in multiple batches. To avoid RAM OOM when using tensor + parallel and large models.""" + + disable_custom_all_reduce: bool = False + """Disable the custom all-reduce kernel and fall back to NCCL.""" + + enable_dbo: bool = False + """Enable dual batch overlap for the model executor.""" + + dbo_decode_token_threshold: int = 32 + """The threshold for dual batch overlap for batches only containing decodes. + If the number of tokens in the request is greater than this threshold, + microbatching will be used. Otherwise, the request will be processed in a + single batch.""" + dbo_prefill_token_threshold: int = 512 # TODO(lucas): tune + """The threshold for dual batch overlap for batches that contain one or more + prefills. If the number of tokens in the request is greater than this + threshold, microbatching will be used. 
Otherwise, the request will be + processed in a single batch.""" + + disable_nccl_for_dp_synchronization: bool = False + """Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py + to use Gloo instead of NCCL for its all reduce""" + + ray_workers_use_nsight: bool = False + """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler.""" + + ray_runtime_env: RuntimeEnv | None = None + """Ray runtime environment to pass to distributed workers.""" + + placement_group: PlacementGroup | None = None + """ray distributed model workers placement group.""" + + distributed_executor_backend: ( + str | DistributedExecutorBackend | type[Executor] | None + ) = None + """Backend to use for distributed model + workers, either "ray" or "mp" (multiprocessing). If the product + of pipeline_parallel_size and tensor_parallel_size is less than + or equal to the number of GPUs available, "mp" will be used to + keep processing on a single host. Otherwise, this will default + to "ray" if Ray is installed and fail otherwise. Note that tpu + only support Ray for distributed inference.""" + + worker_cls: str = "auto" + """The full name of the worker class to use. If "auto", the worker class + will be determined based on the platform.""" + sd_worker_cls: str = "auto" + """The full name of the worker class to use for speculative decoding. + If "auto", the worker class will be determined based on the platform.""" + worker_extension_cls: str = "" + """The full name of the worker extension class to use. The worker extension + class is dynamically inherited by the worker class. 
This is used to inject + new attributes and methods to the worker class for use in collective_rpc + calls.""" + master_addr: str = "127.0.0.1" + """distributed master address for multi-node distributed + inference when distributed_executor_backend is mp.""" + master_port: int = 29501 + """distributed master port for multi-node distributed + inference when distributed_executor_backend is mp.""" + node_rank: int = 0 + """distributed node rank for multi-node distributed + inference when distributed_executor_backend is mp.""" + nnodes: int = 1 + """num of nodes for multi-node distributed + inference when distributed_executor_backend is mp.""" + + world_size: int = Field(init=False) + """world_size is TPxPP, it affects the number of workers we create.""" + + rank: int = 0 + """Global rank in distributed setup.""" + + _data_parallel_master_port_list: list[int] = Field(default_factory=list) + """List of open port auto-queried for data parallel messaging. + Set to be private as it's not intended to be configured by users. + """ + + decode_context_parallel_size: int = 1 + """Number of decode context parallel groups, because the world size does + not change by dcp, it simply reuse the GPUs of TP group, and tp_size + needs to be divisible by dcp_size.""" + + dcp_kv_cache_interleave_size: int = 1 + """Interleave size of kv_cache storage while using dcp or cp > 1, + store interleave_size tokens on (d)cp i, + then store next interleave_size tokens on (d)cp i+1. + Interleave_size=1: token-level align, token i is stored on rank i % (d)cp_size. + Interleave_size=block_size: block-level align, first fill the block on first rank, + token is stored on rank i+1 block j after rank i block j is full. + Block_size should be greater than or equal to dcp_kv_cache_interleave_size. + Block_size should be divisible by dcp_kv_cache_interleave_size. + """ + + _api_process_count: int = Field(default=1, gt=0) + """ + The number of API processes initialized. 
+ + Note: + This is an internal config that is only valid for and + should only be set by API server scale-out. + """ + + _api_process_rank: int = Field(default=0, ge=-1) + """ + The rank of this API process, or `-1` for engine core processes + under API server scale-out. + + Note: + This is an internal config that is only valid for and + should only be set by API server scale-out. + """ + + @model_validator(mode="after") + def _validate_parallel_config(self) -> Self: + if self._api_process_rank >= self._api_process_count: + raise ValueError( + "Invalid value of `_api_process_rank`. " + f"Expected to be `-1` or `[0, {self._api_process_count})`, " + f"but found: {self._api_process_rank}" + ) + + if self.data_parallel_size_local > self.data_parallel_size: + raise ValueError( + f"data_parallel_size_local ({self.data_parallel_size_local}) " + f"must be <= data_parallel_size ({self.data_parallel_size})" + ) + + if self.data_parallel_size <= 1 and self.data_parallel_external_lb: + raise ValueError( + "data_parallel_external_lb can only be set when data_parallel_size > 1" + ) + + if self.enable_eplb: + if not current_platform.is_cuda_alike(): + raise ValueError( + "Expert parallelism load balancing is only supported on " + "CUDA devices or ROCm devices now." + ) + if not self.enable_expert_parallel: + raise ValueError("enable_expert_parallel must be True to use EPLB.") + if self.tensor_parallel_size * self.data_parallel_size <= 1: + raise ValueError( + "EPLB requires tensor_parallel_size or data_parallel_size " + f"to be greater than 1, but got " + f"TP={self.tensor_parallel_size},DP={self.data_parallel_size}." + ) + else: + if self.eplb_config.num_redundant_experts != 0: + raise ValueError( + "num_redundant_experts is set to " + f"{self.eplb_config.num_redundant_experts} but EPLB is not " + "enabled. Either enable EPLB or unset " + "num_redundant_experts." 
def get_next_dp_init_port(self) -> int:
    """
    Hand out a fresh port for initializing a data-parallel process group.

    Process groups related to data parallelism may be initialized in
    several processes (e.g. both in the worker and in the engine). To
    avoid port conflicts, each call pops a port from the prepared port
    list. Once that list is empty, the current
    `data_parallel_master_port` is returned and then incremented.
    """
    prepared_ports = self._data_parallel_master_port_list
    if prepared_ports:
        return prepared_ports.pop()

    # Prepared list exhausted: fall back to the rolling master port.
    port = self.data_parallel_master_port
    self.data_parallel_master_port = port + 1
    return port
# The all_reduce at the end of attention (during o_proj) replicates the
# inputs across every rank of the tensor parallel group. With
# expert-parallelism and DeepEP All2All ops, those replicated tokens cause
# useless duplicate computation and communication.
#
# In that case the input to the experts should be sequence parallel to
# avoid the excess work.
#
# Not needed for pplx-kernels as it can handle duplicate input tokens.
@property
def use_sequence_parallel_moe(self) -> bool:
    """Whether the MoE expert input should be sequence parallel."""
    sequence_parallel_backends = (
        "allgather_reducescatter",
        "naive",
        "deepep_high_throughput",
        "deepep_low_latency",
    )
    if self.all2all_backend not in sequence_parallel_backends:
        return False
    return (
        self.enable_expert_parallel
        and self.tensor_parallel_size > 1
        and self.data_parallel_size > 1
    )
def compute_hash(self):
    """
    Return a hash identifying every setting of this config that affects
    the structure of the computation graph from input ids/embeddings to
    the final hidden states, excluding anything before input
    ids/embeddings and after the final hidden states.

    This hash is also used for DP worker configuration validation
    to prevent hangs from mismatched collective communication patterns.
    """
    factors: list[Any] = [
        self.pipeline_parallel_size,
        self.tensor_parallel_size,
        self.enable_expert_parallel,
        self.data_parallel_size,
        self.all2all_backend,
        self.enable_eplb,
    ]
    if self.enable_eplb:
        # EPLB settings only contribute to the hash when EPLB is enabled.
        eplb = self.eplb_config
        factors += [
            eplb.log_balancedness,
            eplb.window_size,
            eplb.step_interval,
            eplb.num_redundant_experts,
        ]
    return hashlib.sha256(str(factors).encode()).hexdigest()
Please use the " + "--all2all-backend command-line argument instead." + ) + + # Forward deprecated fields to their new location + if self.num_redundant_experts is not None: + self.eplb_config.num_redundant_experts = self.num_redundant_experts + logger.warning_once( + "num_redundant_experts is deprecated and has been replaced " + "with eplb_config.num_redundant_experts. This will be removed " + "in v0.12.0. Changing this field after initialization will " + "have no effect." + ) + if self.eplb_window_size is not None: + self.eplb_config.window_size = self.eplb_window_size + logger.warning_once( + "eplb_window_size is deprecated and has been replaced " + "with eplb_config.window_size. This will be removed " + "in v0.12.0. Changing this field after initialization will " + "have no effect." + ) + if self.eplb_step_interval is not None: + self.eplb_config.step_interval = self.eplb_step_interval + logger.warning_once( + "eplb_step_interval is deprecated and has been replaced " + "with eplb_config.step_interval. This will be removed " + "in v0.12.0. Changing this field after initialization will " + "have no effect." + ) + if self.eplb_log_balancedness is not None: + self.eplb_config.log_balancedness = self.eplb_log_balancedness + logger.warning_once( + "eplb_log_balancedness is deprecated and has been replaced " + "with eplb_config.log_balancedness. This will be removed " + "in v0.12.0. Changing this field after initialization will " + "have no effect." + ) + + # Continue with the rest of the initialization + self.world_size = self.pipeline_parallel_size * self.tensor_parallel_size + + if self.distributed_executor_backend == "external_launcher": + logger.info("Using external launcher for distributed inference.") + self.world_size *= self.data_parallel_size + + if self.data_parallel_size > 1 or self.data_parallel_size_local == 0: + # Data parallel was specified in the engine args. 
+ if self.distributed_executor_backend == "external_launcher": + # For external launcher, + # we need to set the data parallel rank automatically + self.data_parallel_rank = int(os.environ["RANK"]) // ( + self.world_size // self.data_parallel_size + ) + logger.info( + "Set data_parallel_rank to %d automatically.", + self.data_parallel_rank, + ) + if not self._data_parallel_master_port_list: + self._data_parallel_master_port_list = get_open_ports_list(5) + self.data_parallel_master_port = self._data_parallel_master_port_list.pop() + + if not (0 <= self.data_parallel_rank < self.data_parallel_size): + raise ValueError( + f"data_parallel_rank ({self.data_parallel_rank})" + f" must be in the range [0, {self.data_parallel_size})" + ) + else: + # Otherwise fall back to env vars (e.g. for offline SPMD case). + self.data_parallel_size = envs.VLLM_DP_SIZE + self.data_parallel_rank = envs.VLLM_DP_RANK + self.data_parallel_rank_local = envs.VLLM_DP_RANK_LOCAL + self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP + self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT + + if self.distributed_executor_backend == "external_launcher": + os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" + logger.info("Disabling V1 multiprocessing for external launcher.") + + if self.distributed_executor_backend is None and self.world_size > 1: + # We use multiprocessing by default if world_size fits on the + # current node and we aren't in a ray placement group. 
+ + from vllm.v1.executor import ray_utils + + backend: DistributedExecutorBackend = "mp" + ray_found = ray_utils.ray_is_available() + if current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD: + backend = "uni" + elif current_platform.is_cuda() and self.nnodes > 1: + backend = "mp" + elif ( + current_platform.is_cuda() + and cuda_device_count_stateless() < self.world_size + ): + gpu_count = cuda_device_count_stateless() + raise ValueError( + f"Tensor parallel size ({self.world_size}) cannot be " + f"larger than the number of available GPUs ({gpu_count})." + ) + elif self.data_parallel_backend == "ray": + logger.info( + "Using ray distributed inference because " + "data_parallel_backend is ray" + ) + backend = "ray" + elif ray_found: + if self.placement_group: + backend = "ray" + else: + from ray import is_initialized as ray_is_initialized + + if ray_is_initialized(): + from ray.util import get_current_placement_group + + if get_current_placement_group(): + backend = "ray" + self.distributed_executor_backend = backend + logger.debug("Defaulting to use %s for distributed inference", backend) + + if self.distributed_executor_backend is None and self.world_size == 1: + self.distributed_executor_backend = "uni" + + if self.max_parallel_loading_workers is not None: + logger.warning( + "max_parallel_loading_workers is currently " + "not supported and will be ignored." + ) + if self.distributed_executor_backend != "mp" and self.nnodes > 1: + raise ValueError( + "nnodes > 1 can only be set when distributed exectuor backend is mp." 
@property
def use_ray(self) -> bool:
    """Whether the configured distributed executor backend uses Ray.

    True when the backend is the string "ray". When the backend is a
    class, the result is that class's `uses_ray` attribute (False if the
    attribute is missing). False otherwise.
    """
    backend = self.distributed_executor_backend
    named_ray = backend == "ray"
    if named_ray:
        return named_ray
    if not isinstance(backend, type):
        return False
    # Custom executor class: it declares Ray usage via `uses_ray`.
    return getattr(backend, "uses_ray", False)
+ ) + + return self diff --git a/config/pooler.py b/config/pooler.py new file mode 100644 index 0000000..6bece8d --- /dev/null +++ b/config/pooler.py @@ -0,0 +1,122 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from typing import Any + +from pydantic.dataclasses import dataclass + +from vllm.config.utils import config +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +@config +@dataclass +class PoolerConfig: + """Controls the behavior of output pooling in pooling models.""" + + pooling_type: str | None = None + """ + The pooling method of the pooling model. This should be a key in + [`vllm.model_executor.layers.pooler.PoolingType`][]. + """ + + ## for embeddings models + normalize: bool | None = None + """ + Whether to normalize the embeddings outputs. Defaults to True. + """ + dimensions: int | None = None + """ + Reduce the dimensions of embeddings if model + support matryoshka representation. Defaults to None. + """ + enable_chunked_processing: bool | None = None + """ + Whether to enable chunked processing for long inputs that exceed the model's + maximum position embeddings. When enabled, long inputs will be split into + chunks, processed separately, and then aggregated using weighted averaging. + This allows embedding models to handle arbitrarily long text without CUDA + errors. Defaults to False. + """ + max_embed_len: int | None = None + """ + Maximum input length allowed for embedding generation. When set, allows + inputs longer than max_embed_len to be accepted for embedding models. + When an input exceeds max_embed_len, it will be handled according to + the original max_model_len validation logic. + Defaults to None (i.e. set to max_model_len). + """ + + ## for classification models + softmax: float | None = None + """ + softmax will be deprecated, please use use_activation instead. 
def get_use_activation(o: object):
    """Resolve the effective `use_activation` setting for *o*.

    The deprecated `softmax` and `activation` attributes are consulted
    first, in that order. The first one that is present and not None wins
    and triggers a one-time deprecation warning. Otherwise the object's own
    `use_activation` attribute is returned.

    Args:
        o: Any object that may carry `softmax`, `activation` and/or
            `use_activation` attributes.

    Returns:
        The value of the first attribute that is set, or None when none of
        the three is set.
    """
    # The walrus target must be parenthesised: `:=` binds more loosely than
    # `is not`, so without the parentheses the name would hold the result of
    # the comparison (always True inside the branch) instead of the
    # attribute value, turning an explicit False into True.
    if (softmax := getattr(o, "softmax", None)) is not None:
        logger.warning_once(
            "softmax will be deprecated, please use use_activation instead."
        )
        return softmax

    if (activation := getattr(o, "activation", None)) is not None:
        logger.warning_once(
            "activation will be deprecated, please use use_activation instead."
        )
        return activation

    return getattr(o, "use_activation", None)
+ """ + + max_num_partial_prefills: int = Field(default=1, ge=1) + """For chunked prefill, the maximum number of sequences that can be + partially prefilled concurrently.""" + + max_long_partial_prefills: int = Field(default=1, ge=1) + """For chunked prefill, the maximum number of prompts longer than + long_prefill_token_threshold that will be prefilled concurrently. Setting + this less than max_num_partial_prefills will allow shorter prompts to jump + the queue in front of longer prompts in some cases, improving latency.""" + + long_prefill_token_threshold: int = 0 + """For chunked prefill, a request is considered long if the prompt is + longer than this number of tokens.""" + + num_lookahead_slots: int = Field(default=0, ge=0) + """The number of slots to allocate per sequence per + step, beyond the known token ids. This is used in speculative + decoding to store KV activations of tokens which may or may not be + accepted. + + NOTE: This will be replaced by speculative config in the future; it is + present to enable correctness tests until then.""" + + enable_chunked_prefill: bool = True + """If True, prefill requests can be chunked based + on the remaining `max_num_batched_tokens`. + + The default value here is mainly for convenience when testing. + In real usage, this should be set in `EngineArgs.create_engine_config`. + """ + + is_multimodal_model: bool = False + """True if the model is multimodal.""" + + max_model_len: InitVar[int] = 8192 + """Maximum length of a sequence (including prompt and generated text). + + Note: This is stored in the ModelConfig, and is used only here to + provide fallbacks and validate other attributes.""" + + is_encoder_decoder: InitVar[bool] = False + """True if the model is an encoder-decoder model. + + Note: This is stored in the ModelConfig, and is used only here to + disable chunked prefill and prefix caching for encoder-decoder models. + """ + + # TODO (ywang96): Make this configurable. 
+ max_num_encoder_input_tokens: int = Field(init=False) + """Multimodal encoder compute budget, only used in V1. + + NOTE: This is not currently configurable. It will be overridden by + max_num_batched_tokens in case max multimodal embedding size is larger.""" + + # TODO (ywang96): Make this configurable. + encoder_cache_size: int = Field(init=False) + """Multimodal encoder cache size, only used in V1. + + NOTE: This is not currently configurable. It will be overridden by + max_num_batched_tokens in case max multimodal embedding size is larger.""" + + policy: SchedulerPolicy = "fcfs" + """The scheduling policy to use:\n + - "fcfs" means first come first served, i.e. requests are handled in order + of arrival.\n + - "priority" means requests are handled based on given priority (lower + value means earlier handling) and time of arrival deciding any ties).""" + + disable_chunked_mm_input: bool = False + """If set to true and chunked prefill is enabled, we do not want to + partially schedule a multimodal item. Only used in V1 + This ensures that if a request has a mixed prompt + (like text tokens TTTT followed by image tokens IIIIIIIIII) where only + some image tokens can be scheduled (like TTTTIIIII, leaving IIIII), + it will be scheduled as TTTT in one step and IIIIIIIIII in the next.""" + + # scheduler class or path. "vllm.v1.core.sched.scheduler.Scheduler" + # (default) or "mod.custom_class". + scheduler_cls: str | type[object] = Field(default=None) + """The scheduler class to use. "vllm.v1.core.sched.scheduler.Scheduler" is + the default scheduler. Can be a class directly or the path to a class of + form "mod.custom_class".""" + + disable_hybrid_kv_cache_manager: bool = False + """If set to True, KV cache manager will allocate the same size of KV cache + for all attention layers even if there are multiple type of attention layers + like full attention and sliding window attention. 
def get_scheduler_cls(self) -> type["SchedulerInterface"]:
    """Return the scheduler class selected by this config.

    When `scheduler_cls` is unset, the built-in `AsyncScheduler` is used if
    `async_scheduling` is truthy and the built-in `Scheduler` otherwise.
    A custom `scheduler_cls` is returned as-is when it is already a class,
    or resolved from its qualified name when it is a string. A one-time
    warning is logged for custom schedulers.
    """
    custom_cls = self.scheduler_cls

    if custom_cls is not None:
        # This warning can be removed once the Scheduler interface is
        # finalized and we can maintain support for scheduler classes that
        # implement it
        logger.warning_once(
            "Using custom scheduler class %s. This scheduler interface is "
            "not public and compatibility may not be maintained.",
            custom_cls,
        )
        if isinstance(custom_cls, str):
            return resolve_obj_by_qualname(custom_cls)
        return cast(type["SchedulerInterface"], custom_cls)

    # Built-in schedulers are imported lazily, only for the branch taken.
    if not self.async_scheduling:
        from vllm.v1.core.sched.scheduler import Scheduler

        return Scheduler

    from vllm.v1.core.sched.async_scheduler import AsyncScheduler

    return AsyncScheduler
def __post_init__(self, max_model_len: int, is_encoder_decoder: bool) -> None:
    """Derive dependent scheduler settings, then validate them.

    Args:
        max_model_len: Init-only value used as a fallback source for
            `long_prefill_token_threshold` and for validation.
        is_encoder_decoder: Init-only flag; when true, chunked prefill is
            switched off before anything else is derived.
    """
    if is_encoder_decoder:
        # Chunked prefill should be disabled for encoder-decoder models.
        self.disable_chunked_mm_input = True
        self.enable_chunked_prefill = False
        self.long_prefill_token_threshold = 0
        logger.info(
            "Encoder-decoder models do not support chunked prefill nor"
            " prefix caching; disabling both."
        )

    # Both encoder budgets mirror the per-step token budget.
    token_budget = self.max_num_batched_tokens
    self.max_num_encoder_input_tokens = token_budget
    self.encoder_cache_size = token_budget

    if self.enable_chunked_prefill:
        logger.info(
            "Chunked prefill is enabled with max_num_batched_tokens=%d.",
            self.max_num_batched_tokens,
        )

    concurrent_partial_prefills = self.max_num_partial_prefills > 1
    if concurrent_partial_prefills:
        # An unset (0) threshold falls back to 4% of the model length.
        if self.long_prefill_token_threshold == 0:
            self.long_prefill_token_threshold = int(max_model_len * 0.04)

        logger.info(
            "Concurrent partial prefills enabled with "
            "max_num_partial_prefills=%d, max_long_partial_prefills=%d, "
            "long_prefill_token_threshold=%d",
            self.max_num_partial_prefills,
            self.max_long_partial_prefills,
            self.long_prefill_token_threshold,
        )

    self.verify_max_model_len(max_model_len)
+ ) + def chunked_prefill_enabled(self) -> bool: + return self.enable_chunked_prefill + + @chunked_prefill_enabled.setter + def chunked_prefill_enabled(self, value: bool): + self.enable_chunked_prefill = value + + def verify_max_model_len(self, max_model_len: int) -> Self: + if ( + self.max_num_batched_tokens < max_model_len + and not self.enable_chunked_prefill + ): + raise ValueError( + f"max_num_batched_tokens ({self.max_num_batched_tokens}) is " + f"smaller than max_model_len ({max_model_len}). " + "This effectively limits the maximum sequence length to " + "max_num_batched_tokens and makes vLLM reject longer " + "sequences. Please increase max_num_batched_tokens or " + "decrease max_model_len." + ) + + if self.max_num_batched_tokens < self.max_num_seqs: + raise ValueError( + f"max_num_batched_tokens ({self.max_num_batched_tokens}) must " + "be greater than or equal to max_num_seqs " + f"({self.max_num_seqs})." + ) + + if self.max_num_batched_tokens > self.max_num_seqs * max_model_len: + logger.warning( + "max_num_batched_tokens (%d) exceeds max_num_seqs " + "* max_model_len (%d). This may lead to unexpected behavior.", + self.max_num_batched_tokens, + self.max_num_seqs * max_model_len, + ) + + if self.max_num_partial_prefills > 1: + if not self.enable_chunked_prefill: + raise ValueError( + "Chunked prefill must be enabled to set " + "max_num_partial_prefills > 1." + ) + + if self.long_prefill_token_threshold > max_model_len: + raise ValueError( + "long_prefill_token_threshold " + f"({self.long_prefill_token_threshold}) cannot be greater " + f"than the max_model_len ({max_model_len})." + ) + + if self.max_long_partial_prefills > self.max_num_partial_prefills: + raise ValueError( + f"{self.max_long_partial_prefills=} must be less than or equal to " + f"{self.max_num_partial_prefills=}." 
+ ) + + return self diff --git a/config/speculative.py b/config/speculative.py new file mode 100644 index 0000000..13a8632 --- /dev/null +++ b/config/speculative.py @@ -0,0 +1,654 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import ast +import hashlib +from typing import TYPE_CHECKING, Any, Literal, get_args + +from pydantic import Field, SkipValidation, model_validator +from pydantic.dataclasses import dataclass +from typing_extensions import Self + +from vllm.config.parallel import ParallelConfig +from vllm.config.utils import config +from vllm.logger import init_logger +from vllm.utils.import_utils import LazyLoader, has_arctic_inference + +if TYPE_CHECKING: + from transformers import PretrainedConfig + + import vllm.model_executor.layers.quantization as me_quant + from vllm.config import ModelConfig +else: + PretrainedConfig = Any + ModelConfig = Any + + me_quant = LazyLoader( + "model_executor", globals(), "vllm.model_executor.layers.quantization" + ) + +logger = init_logger(__name__) + +MTPModelTypes = Literal[ + "deepseek_mtp", + "mimo_mtp", + "glm4_moe_mtp", + "ernie_mtp", + "qwen3_next_mtp", + "longcat_flash_mtp", + "mtp", + "pangu_ultra_moe_mtp", +] +EagleModelTypes = Literal["eagle", "eagle3", MTPModelTypes] +SpeculativeMethod = Literal[ + "ngram", + "medusa", + "mlp_speculator", + "draft_model", + "suffix", + EagleModelTypes, +] + + +@config +@dataclass +class SpeculativeConfig: + """Configuration for speculative decoding.""" + + enforce_eager: bool | None = None + """Override the default enforce_eager from model_config""" + # General speculative decoding control + num_speculative_tokens: int = Field(default=None, gt=0) + """The number of speculative tokens, if provided. 
It will default to the + number in the draft model config if present, otherwise, it is required.""" + model: str | None = None + """The name of the draft model, eagle head, or additional weights, if + provided.""" + method: SpeculativeMethod | None = None + """The name of the speculative method to use. If users provide and set the + `model` param, the speculative method type will be detected automatically + if possible, if `model` param is not provided, the method name must be + provided. + + If using `ngram` method, the related configuration `prompt_lookup_max` and + `prompt_lookup_min` should be considered.""" + draft_tensor_parallel_size: int | None = Field(default=None, ge=1) + """The degree of the tensor parallelism for the draft model. Can only be 1 + or the same as the target model's tensor parallel size.""" + + # Draft model configuration + quantization: me_quant.QuantizationMethods | None = None + """Quantization method that was used to quantize the draft model weights. + If `None`, we assume the model weights are not quantized. Note that it only + takes effect when using the draft model-based speculative method.""" + max_model_len: int | None = Field(default=None, ge=1) + """The maximum model length of the draft model. Used when testing the + ability to skip speculation for some sequences.""" + revision: str | None = None + """The specific model version to use for the draft model. It can be a + branch name, a tag name, or a commit id. If unspecified, will use the + default version.""" + code_revision: str | None = None + """The specific revision to use for the draft model code on Hugging Face + Hub. It can be a branch name, a tag name, or a commit id. 
If unspecified, + will use the default version.""" + + # Advanced control + disable_by_batch_size: int | None = Field(default=None, ge=2) + """Disable speculative decoding for new incoming requests when the number + of enqueued requests is larger than this value, if provided.""" + disable_padded_drafter_batch: bool = False + """Disable input padding for speculative decoding. If set to True, + speculative input batches can contain sequences of different lengths, + which may only be supported by certain attention backends. This currently + only affects the EAGLE method of speculation.""" + + # Ngram proposer configuration + prompt_lookup_max: int | None = Field(default=None, ge=1) + """Maximum size of ngram token window when using Ngram proposer, required + when method is set to ngram.""" + prompt_lookup_min: int | None = Field(default=None, ge=1) + """Minimum size of ngram token window when using Ngram proposer, if + provided. Defaults to 1.""" + + speculative_token_tree: str | None = None + """Specifies the tree structure for speculative token generation. + """ + # required configuration params passed from engine + target_model_config: SkipValidation[ModelConfig] = None # type: ignore + """The configuration of the target model.""" + target_parallel_config: SkipValidation[ParallelConfig] = None # type: ignore + """The parallel configuration for the target model.""" + + # params generated in the post-init stage + draft_model_config: SkipValidation[ModelConfig] = None # type: ignore + """The configuration of the draft model initialized internal.""" + draft_parallel_config: SkipValidation[ParallelConfig] = None # type: ignore + """The parallel configuration for the draft model initialized internal.""" + + # Suffix decoding configuration + suffix_decoding_max_tree_depth: int = 24 + """The maximum depth of the suffix decoding global and prompt trees. 
The + tree depth limits the sum of the prefix match and speculation lengths.""" + + suffix_decoding_max_cached_requests: int = 10000 + """The maximum number of requests to cache in the global suffix tree. If + exceeded, will trigger eviction in FIFO order. If set to 0, the global + suffix tree is disabled and past responses are not cached (prompt trees + are still used).""" + + suffix_decoding_max_spec_factor: float = 1.0 + """The maximum spec factor for suffix decoding. The spec factor controls + speculation lengths based on the prefix match length: max_spec_tokens = + max_spec_factor * prefix_match_length.""" + + suffix_decoding_min_token_prob: float = 0.1 + """The minimum token probability for suffix decoding. Will only speculate + tokens with estimated probability (based on frequency counts) greater than + or equal to this value.""" + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + factors: list[Any] = [] + # Eagle3 affects the computation graph because it returns intermediate + # hidden states in addition to the final hidden state. 
@staticmethod
def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
    """Rewrite *hf_config* in place so that it describes an MTP draft model.

    Known base model types / architectures are remapped to their `*_mtp`
    counterparts, and `n_predict` plus the matching `architectures` entry
    are written into the config. Configs that match none of the rules are
    returned untouched.

    Args:
        hf_config: The config to rewrite. It is mutated.

    Returns:
        The same `hf_config` object.
    """
    if hf_config.model_type in ("deepseek_v3", "deepseek_v32"):
        hf_config.model_type = "deepseek_mtp"
    if hf_config.model_type == "deepseek_mtp":
        n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
        hf_config.update(
            {"n_predict": n_predict, "architectures": ["DeepSeekMTPModel"]}
        )

    # Compare with `==`: `x in ("pangu_ultra_moe")` is a substring test
    # against a plain string (the parentheses do not make a tuple), so
    # values such as "" or "moe" would wrongly match.
    if hf_config.model_type == "pangu_ultra_moe":
        hf_config.model_type = "pangu_ultra_moe_mtp"
    if hf_config.model_type == "pangu_ultra_moe_mtp":
        n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
        hf_config.update(
            {"n_predict": n_predict, "architectures": ["OpenPanguMTPModel"]}
        )

    # The architecture checks below read `architectures` after any update
    # made above, so the statement order matters.
    if hf_config.architectures[0] == "MiMoForCausalLM":
        hf_config.model_type = "mimo_mtp"
        n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
        hf_config.update(
            {
                "num_hidden_layers": 0,
                "n_predict": n_predict,
                "architectures": ["MiMoMTPModel"],
            }
        )

    if hf_config.architectures[0] == "Glm4MoeForCausalLM":
        hf_config.model_type = "glm4_moe_mtp"
        n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
        hf_config.update(
            {
                "num_hidden_layers": 0,
                "n_predict": n_predict,
                "architectures": ["Glm4MoeMTPModel"],
            }
        )

    if hf_config.model_type == "ernie4_5_moe":
        hf_config.model_type = "ernie_mtp"
    if hf_config.model_type == "ernie_mtp":
        n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
        hf_config.update(
            {"n_predict": n_predict, "architectures": ["ErnieMTPModel"]}
        )

    if hf_config.model_type == "qwen3_next":
        hf_config.model_type = "qwen3_next_mtp"
    if hf_config.model_type == "qwen3_next_mtp":
        n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
        hf_config.update(
            {"n_predict": n_predict, "architectures": ["Qwen3NextMTP"]}
        )

    if hf_config.model_type == "longcat_flash":
        hf_config.model_type = "longcat_flash_mtp"
        # Unlike the other families, a missing layer count defaults to 1.
        n_predict = getattr(hf_config, "num_nextn_predict_layers", 1)
        hf_config.update(
            {"n_predict": n_predict, "architectures": ["LongCatFlashMTPModel"]}
        )

    return hf_config
+ ) + + # Automatically configure the method for ngram when "model" is used + # instead of "method" + if self.method is None and ( + self.model is not None and self.model in ("ngram", "[ngram]") + ): + self.method = "ngram" + + if self.method in ("ngram", "[ngram]"): + # Unified to "ngram" internally + self.method = "ngram" + # Set default values if not provided + if self.prompt_lookup_min is None and self.prompt_lookup_max is None: + # TODO(woosuk): Tune these values. They are arbitrarily chosen. + self.prompt_lookup_min = 5 + self.prompt_lookup_max = 5 + elif self.prompt_lookup_min is None: + if self.prompt_lookup_max is None: + raise ValueError( + "Either prompt_lookup_max or prompt_lookup_min must be " + "provided when using the ngram method." + ) + self.prompt_lookup_min = self.prompt_lookup_max + elif self.prompt_lookup_max is None: + if self.prompt_lookup_min is None: + raise ValueError( + "Either prompt_lookup_max or prompt_lookup_min must be " + "provided when using the ngram method." + ) + self.prompt_lookup_max = self.prompt_lookup_min + + # Validate values + if self.prompt_lookup_min > self.prompt_lookup_max: + raise ValueError( + f"prompt_lookup_min={self.prompt_lookup_min} must " + f"be <= prompt_lookup_max={self.prompt_lookup_max}" + ) + + # TODO: current we still need extract vocab_size from target model + # config, in future, we may try refactor it out, and set + # draft related config as None here. + self.draft_model_config = self.target_model_config + self.draft_parallel_config = self.target_parallel_config + elif self.method == "suffix": + self._validate_suffix_decoding() + else: + self.prompt_lookup_max = 0 + self.prompt_lookup_min = 0 + + if self.model is not None: + # TODO: Move this import to the top once `ModelConfig` + # lives in `vllm.config.model`. 
+ from vllm.config import ModelConfig + + self.draft_model_config = ModelConfig( + model=self.model, + runner="draft", + tokenizer=self.target_model_config.tokenizer, + tokenizer_mode=self.target_model_config.tokenizer_mode, + trust_remote_code=self.target_model_config.trust_remote_code, + allowed_local_media_path=self.target_model_config.allowed_local_media_path, + allowed_media_domains=self.target_model_config.allowed_media_domains, + dtype=self.target_model_config.dtype, + seed=self.target_model_config.seed, + revision=self.revision, + code_revision=self.code_revision, + tokenizer_revision=self.target_model_config.tokenizer_revision, + spec_target_max_model_len=self.target_model_config.max_model_len, + quantization=self.quantization, + enforce_eager=self.target_model_config.enforce_eager, + max_logprobs=self.target_model_config.max_logprobs, + hf_overrides=SpeculativeConfig.hf_config_override, + ) + + # Automatically detect the method + if self.method in ("eagle", "eagle3"): + pass + # examples: + # yuhuili/EAGLE-LLaMA3-Instruct-8B + # yuhuili/EAGLE3-LLaMA3.1-Instruct-8B + # AngelSlim/Qwen3-8B_eagle3 + elif "eagle-" in self.draft_model_config.model.lower(): + self.method = "eagle" + elif "eagle3" in self.draft_model_config.model.lower(): + self.method = "eagle3" + elif self.draft_model_config.hf_config.model_type == "medusa": + self.method = "medusa" + elif self.draft_model_config.hf_config.model_type == "mlp_speculator": + self.method = "mlp_speculator" + elif self.draft_model_config.hf_config.model_type in get_args( + MTPModelTypes + ): + self.method = "mtp" + if self.num_speculative_tokens > 1: + logger.warning( + "Enabling num_speculative_tokens > 1 will run" + "multiple times of forward on same MTP layer" + ",which may result in lower acceptance rate" + ) + elif self.draft_model_config.hf_config.model_type in ( + "longcat_flash_mtp" + ): + self.method = "longcat_flash_mtp" + if self.num_speculative_tokens > 1: + logger.warning( + "LongCat MTP models only 
have " + "one layer. Might need some code changes " + "to support multiple layers." + ) + else: + self.method = "draft_model" + raise NotImplementedError( + "Speculative decoding with draft model is not " + "supported yet. Please consider using other " + "speculative decoding methods such as ngram, medusa, " + "eagle, or mtp." + ) + + # Replace hf_config for EAGLE draft_model + if self.method in ("eagle", "eagle3"): + from vllm.transformers_utils.configs import SpeculatorsConfig + from vllm.transformers_utils.configs.eagle import EAGLEConfig + + if isinstance( + self.draft_model_config.hf_config, + (EAGLEConfig, SpeculatorsConfig), + ): + pass + else: + eagle_config = EAGLEConfig( + self.draft_model_config.hf_config, + method=self.method, + model_type="eagle", + ) + self.draft_model_config.hf_config = eagle_config + + if self.num_speculative_tokens is not None and hasattr( + self.draft_model_config.hf_config, "num_lookahead_tokens" + ): + self.draft_model_config.hf_config.num_lookahead_tokens = ( + self.num_speculative_tokens + ) + + n_predict = getattr( + self.draft_model_config.hf_config, "n_predict", None + ) + if n_predict is not None: + if self.num_speculative_tokens is None: + # Default to max value defined in draft model config. + self.num_speculative_tokens = n_predict + elif ( + self.num_speculative_tokens > n_predict + and self.num_speculative_tokens % n_predict != 0 + ): + # Ensure divisibility for MTP module reuse. + raise ValueError( + f"num_speculative_tokens:{self.num_speculative_tokens}" + f" must be divisible by {n_predict=}" + ) + + if self.speculative_token_tree is None: + # Generate chain of tokens. + self.speculative_token_tree = str( + [(i + 1) * (0,) for i in range(self.num_speculative_tokens)] + ) + else: + # Sort the token tree breadth-first. 
+ tree_choices = ast.literal_eval(self.speculative_token_tree) + self.speculative_token_tree = str( + sorted(tree_choices, key=lambda t: (len(t), t)) + ) + + self.draft_tensor_parallel_size = ( + SpeculativeConfig._verify_and_get_draft_tp( + self.target_parallel_config, + self.draft_tensor_parallel_size, + self.draft_model_config.hf_config, + ) + ) + + self.draft_model_config.max_model_len = ( + SpeculativeConfig._maybe_override_draft_max_model_len( + self.max_model_len, + self.draft_model_config.max_model_len, + self.target_model_config.max_model_len, + ) + ) + + self.draft_parallel_config = ( + SpeculativeConfig.create_draft_parallel_config( + self.target_parallel_config, self.draft_tensor_parallel_size + ) + ) + return self + + def _validate_suffix_decoding(self): + if not has_arctic_inference(): + raise ImportError( + "Arctic Inference is required for suffix decoding. " + "Install via `pip install arctic-inference==0.1.1`." + ) + if self.num_speculative_tokens is None: + # Suffix decoding decides the actual number of speculative tokens + # dynamically and treats num_speculative_tokens as a maximum limit. 
+ self.num_speculative_tokens = self.suffix_decoding_max_tree_depth + logger.warning( + "Defaulted num_speculative_tokens to %s for suffix decoding.", + self.num_speculative_tokens, + ) + # Validate values + if self.suffix_decoding_max_tree_depth < 1: + raise ValueError( + f"suffix_decoding_max_tree_depth=" + f"{self.suffix_decoding_max_tree_depth} must be >= 1" + ) + if self.suffix_decoding_max_cached_requests < 0: + raise ValueError( + f"suffix_decoding_max_cached_requests=" + f"{self.suffix_decoding_max_cached_requests} must be >= 0" + ) + if self.suffix_decoding_max_spec_factor < 0: + raise ValueError( + f"suffix_decoding_max_spec_factor=" + f"{self.suffix_decoding_max_spec_factor} must be >= 0" + ) + if not 0 <= self.suffix_decoding_min_token_prob <= 1: + raise ValueError( + f"suffix_decoding_min_token_prob=" + f"{self.suffix_decoding_min_token_prob} must be in [0, 1]" + ) + + @staticmethod + def _maybe_override_draft_max_model_len( + speculative_max_model_len: int | None, + draft_max_model_len: int, + target_max_model_len: int, + ) -> int: + """Determine the max sequence len for the draft model. This is usually + the draft_max_model_len, but may be the target_max_model_len if it is + less than the draft_max_model_len, or may be speculative_max_model_len + if it is specified. + + This is necessary so that sequences do not exceed the capacity of the + draft model or the target model. + + speculative_max_model_len is mainly used for testing that sequences can + skip speculation. 
+ """ + + if speculative_max_model_len is not None: + if speculative_max_model_len > draft_max_model_len: + raise ValueError( + f"{speculative_max_model_len=} cannot be " + f"larger than {draft_max_model_len=}" + ) + + if speculative_max_model_len > target_max_model_len: + raise ValueError( + f"{speculative_max_model_len=} cannot be " + f"larger than {target_max_model_len=}" + ) + + return speculative_max_model_len + + return min( + draft_max_model_len, + target_max_model_len, + ) + + @staticmethod + def _verify_and_get_draft_tp( + target_parallel_config: ParallelConfig, + speculative_draft_tensor_parallel_size: int | None, + draft_hf_config: PretrainedConfig, + ) -> int: + """ + Verifies and adjusts the tensor parallel size for a draft model + specified using speculative_draft_tensor_parallel_size. + """ + # If speculative_draft_tensor_parallel_size is unset then set it + # appropriately else verify that it is set correctly. + if speculative_draft_tensor_parallel_size is None: + if draft_hf_config.model_type == "mlp_speculator": + speculative_draft_tensor_parallel_size = 1 + if target_parallel_config.tensor_parallel_size > 1: + logger.warning( + "%s cannot currently be run with tp>1; " + "setting speculative_draft_tensor_parallel_size=1", + draft_hf_config.model_type, + ) + else: + speculative_draft_tensor_parallel_size = ( + target_parallel_config.tensor_parallel_size + ) + elif speculative_draft_tensor_parallel_size not in ( + 1, + target_parallel_config.tensor_parallel_size, + ): + raise ValueError( + f"{speculative_draft_tensor_parallel_size=} cannot be " + f"other value than 1 or target model tensor_parallel_size" + ) + return speculative_draft_tensor_parallel_size + + @staticmethod + def create_draft_parallel_config( + target_parallel_config: ParallelConfig, + speculative_draft_tensor_parallel_size: int, + ) -> ParallelConfig: + """Create a parallel config for use by the draft worker. 
+ + This is mostly a copy of the target parallel config, except the tp_size. + """ + draft_parallel_config = ParallelConfig( + pipeline_parallel_size=target_parallel_config.pipeline_parallel_size, + tensor_parallel_size=speculative_draft_tensor_parallel_size, + distributed_executor_backend=target_parallel_config.distributed_executor_backend, + max_parallel_loading_workers=target_parallel_config.max_parallel_loading_workers, + disable_custom_all_reduce=target_parallel_config.disable_custom_all_reduce, + ray_workers_use_nsight=target_parallel_config.ray_workers_use_nsight, + placement_group=target_parallel_config.placement_group, + ) + + return draft_parallel_config + + @model_validator(mode="after") + def _verify_args(self) -> Self: + if self.num_speculative_tokens is None: + raise ValueError( + "num_speculative_tokens must be provided with " + "speculative model unless the draft model config contains an " + "n_predict parameter." + ) + + if self.num_speculative_tokens <= 0: + raise ValueError( + "Expected num_speculative_tokens to be greater " + f"than zero ({self.num_speculative_tokens})." + ) + + if self.draft_model_config: + self.draft_model_config.verify_with_parallel_config( + self.draft_parallel_config + ) + + if self.disable_by_batch_size is not None and self.disable_by_batch_size < 2: + raise ValueError( + "Expect the batch size threshold of disabling " + "speculative decoding is > 1, but got " + f"{self.disable_by_batch_size=}" + ) + + eagle3_target_supported = ["llama", "qwen", "minicpm", "gpt_oss"] + if ( + self.method == "eagle3" + and self.target_model_config + and not any( + supported_model in self.target_model_config.hf_text_config.model_type + for supported_model in eagle3_target_supported + ) + ): + raise ValueError( + f"Eagle3 is only supported for {eagle3_target_supported} models. 
" # noqa: E501 + f"Got {self.target_model_config.hf_text_config.model_type=}" + ) + + return self + + @property + def num_lookahead_slots(self) -> int: + """The number of additional slots the scheduler should allocate per + step, in addition to the slots allocated for each known token. + + This is equal to the number of speculative tokens, as each speculative + token must be scored. + """ + return self.num_speculative_tokens + + def use_eagle(self) -> bool: + return self.method in ("eagle", "eagle3", "mtp") + + def __repr__(self) -> str: + method = self.method + model = None if method in ("ngram", "suffix") else self.draft_model_config.model + num_spec_tokens = self.num_speculative_tokens + return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})" diff --git a/config/speech_to_text.py b/config/speech_to_text.py new file mode 100644 index 0000000..3eafff1 --- /dev/null +++ b/config/speech_to_text.py @@ -0,0 +1,38 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +from pydantic.dataclasses import dataclass + +from vllm.config.utils import config + + +@config +@dataclass +class SpeechToTextConfig: + """Configuration for speech-to-text models.""" + + sample_rate: float = 16_000 + """Sample rate (Hz) to resample input audio to. Most speech models expect + 16kHz audio input. The input audio will be automatically resampled to this + rate before processing.""" + + max_audio_clip_s: int = 30 + """Maximum duration in seconds for a single audio clip without chunking. + Audio longer than this will be split into smaller chunks if + `allow_audio_chunking` evaluates to True, otherwise it will be rejected.""" + + overlap_chunk_second: int = 1 + """Overlap duration in seconds between consecutive audio chunks when + splitting long audio. 
This helps maintain context across chunk boundaries + and improves transcription quality at split points.""" + + min_energy_split_window_size: int | None = 1600 + """Window size in samples for finding low-energy (quiet) regions to split + audio chunks. The algorithm looks for the quietest moment within this + window to minimize cutting through speech. Default 1600 samples ≈ 100ms + at 16kHz. If None, no chunking will be done.""" + + @property + def allow_audio_chunking(self) -> bool: + return self.min_energy_split_window_size is not None diff --git a/config/structured_outputs.py b/config/structured_outputs.py new file mode 100644 index 0000000..9530d3d --- /dev/null +++ b/config/structured_outputs.py @@ -0,0 +1,92 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from typing import Any, Literal + +from pydantic import model_validator +from pydantic.dataclasses import dataclass +from typing_extensions import Self + +from vllm.config.utils import config + +StructuredOutputsBackend = Literal[ + "auto", "xgrammar", "guidance", "outlines", "lm-format-enforcer" +] + + +@config +@dataclass +class StructuredOutputsConfig: + """Dataclass which contains structured outputs config for the engine.""" + + backend: StructuredOutputsBackend = "auto" + """Which engine will be used for structured outputs (e.g. JSON schema, + regex, etc) by default. With "auto", we will make opinionated choices + based on request contents and what the backend libraries currently support, + so the behavior is subject to change in each release.""" + disable_fallback: bool = False + """If `True`, vLLM will not fallback to a different backend on error.""" + disable_any_whitespace: bool = False + """If `True`, the model will not generate any whitespace during structured + outputs. 
This is only supported for xgrammar and guidance backends.""" + disable_additional_properties: bool = False + """If `True`, the `guidance` backend will not use `additionalProperties` + in the JSON schema. This is only supported for the `guidance` backend and + is used to better align its behaviour with `outlines` and `xgrammar`.""" + reasoning_parser: str = "" + """Select the reasoning parser depending on the model that you're using. + This is used to parse the reasoning content into OpenAI API format.""" + reasoning_parser_plugin: str = "" + """Path to a reasoning parser plugin that can be dynamically + loaded and registered.""" + enable_in_reasoning: bool = False + """Whether to use structured input for reasoning.""" + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph.
+ factors: list[Any] = [] + hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() + return hash_str + + @model_validator(mode="after") + def _validate_structured_output_config(self) -> Self: + # Import here to avoid circular import + from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager + + if self.reasoning_parser_plugin and len(self.reasoning_parser_plugin) > 3: + ReasoningParserManager.import_reasoning_parser(self.reasoning_parser_plugin) + + valid_reasoning_parsers = ReasoningParserManager.list_registered() + if ( + self.reasoning_parser != "" + and self.reasoning_parser not in valid_reasoning_parsers + ): + raise ValueError( + f"invalid reasoning parser: {self.reasoning_parser} " + f"(chose from {{ {','.join(valid_reasoning_parsers)} }})" + ) + + if self.disable_any_whitespace and self.backend not in ("xgrammar", "guidance"): + raise ValueError( + "disable_any_whitespace is only supported for " + "xgrammar and guidance backends." + ) + if self.disable_additional_properties and self.backend != "guidance": + raise ValueError( + "disable_additional_properties is only supported " + "for the guidance backend." 
+ ) + return self diff --git a/config/utils.py b/config/utils.py new file mode 100644 index 0000000..7e0878d --- /dev/null +++ b/config/utils.py @@ -0,0 +1,178 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Utility functions for vLLM config dataclasses.""" + +import ast +import inspect +import textwrap +from collections.abc import Iterable +from dataclasses import MISSING, Field, field, fields, is_dataclass, replace +from itertools import pairwise +from typing import TYPE_CHECKING, Any, Protocol, TypeVar + +import regex as re +from pydantic.fields import FieldInfo +from typing_extensions import runtime_checkable + +if TYPE_CHECKING: + from _typeshed import DataclassInstance +else: + DataclassInstance = Any + +ConfigType = type[DataclassInstance] +ConfigT = TypeVar("ConfigT", bound=ConfigType) + + +def config(cls: ConfigT) -> ConfigT: + """ + A decorator that ensures all fields in a dataclass have default values + and that each field has a docstring. + + If a `ConfigT` is used as a CLI argument itself, the `type` keyword argument + provided by `get_kwargs` will be + `pydantic.TypeAdapter(ConfigT).validate_json(cli_arg)` which treats the + `cli_arg` as a JSON string which gets validated by `pydantic`. + + Config validation is performed by the tools/pre_commit/validate_config.py + script, which is invoked during the pre-commit checks. + """ + return cls + + +def get_field(cls: ConfigType, name: str) -> Field: + """Get the default factory field of a dataclass by name. 
Used for getting + default factory fields in `EngineArgs`.""" + if not is_dataclass(cls): + raise TypeError("The given class is not a dataclass.") + cls_fields = {f.name: f for f in fields(cls)} + if name not in cls_fields: + raise ValueError(f"Field '{name}' not found in {cls.__name__}.") + named_field: Field = cls_fields[name] + if (default_factory := named_field.default_factory) is not MISSING: + return field(default_factory=default_factory) + if (default := named_field.default) is not MISSING: + if isinstance(default, FieldInfo): + # Handle pydantic.Field defaults + if default.default_factory is not None: + return field(default_factory=default.default_factory) + else: + default = default.default + return field(default=default) + + raise ValueError( + f"{cls.__name__}.{name} must have a default value or default factory." + ) + + +def getattr_iter(object: object, names: Iterable[str], default: Any) -> Any: + """ + A helper function that retrieves an attribute from an object which may + have multiple possible names. This is useful when fetching attributes from + arbitrary `transformers.PretrainedConfig` instances. + """ + for name in names: + if hasattr(object, name): + return getattr(object, name) + return default + + +def contains_object_print(text: str) -> bool: + """ + Check if the text looks like a printed Python object, e.g. + contains any substring matching the pattern: "at 0xFFFFFFF>" + We match against 0x followed by 2-16 hex chars (there's + a max of 16 on a 64-bit system). + + Args: + text (str): The text to check + + Returns: + result (bool): `True` if a match is found, `False` otherwise. + """ + pattern = r"at 0x[a-fA-F0-9]{2,16}>" + match = re.search(pattern, text) + return match is not None + + +def assert_hashable(text: str) -> bool: + if not contains_object_print(text): + return True + raise AssertionError( + f"vLLM tried to hash some configs that may have Python objects ids " + f"in them. This is a bug, please file an issue. 
" + f"Text being hashed: {text}" + ) + + +def get_attr_docs(cls: type[Any]) -> dict[str, str]: + """ + Get any docstrings placed after attribute assignments in a class body. + + https://davidism.com/mit-license/ + """ + + cls_node = ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0] + + if not isinstance(cls_node, ast.ClassDef): + raise TypeError("Given object was not a class.") + + out = {} + + # Consider each pair of nodes. + for a, b in pairwise(cls_node.body): + # Must be an assignment then a constant string. + if ( + not isinstance(a, (ast.Assign, ast.AnnAssign)) + or not isinstance(b, ast.Expr) + or not isinstance(b.value, ast.Constant) + or not isinstance(b.value.value, str) + ): + continue + + doc = inspect.cleandoc(b.value.value) + + # An assignment can have multiple targets (a = b = v), but an + # annotated assignment only has one target. + targets = a.targets if isinstance(a, ast.Assign) else [a.target] + + for target in targets: + # Must be assigning to a plain name. + if not isinstance(target, ast.Name): + continue + + out[target.id] = doc + + return out + + +def is_init_field(cls: ConfigType, name: str) -> bool: + return next(f for f in fields(cls) if f.name == name).init + + +@runtime_checkable +class SupportsHash(Protocol): + def compute_hash(self) -> str: ... + + +class SupportsMetricsInfo(Protocol): + def metrics_info(self) -> dict[str, str]: ... 
+ + +def update_config(config: ConfigT, overrides: dict[str, Any]) -> ConfigT: + processed_overrides = {} + for field_name, value in overrides.items(): + assert hasattr(config, field_name), ( + f"{type(config)} has no field `{field_name}`" + ) + current_value = getattr(config, field_name) + if is_dataclass(current_value) and not is_dataclass(value): + assert isinstance(value, dict), ( + f"Overrides to {type(config)}.{field_name} must be a dict" + f" or {type(current_value)}, but got {type(value)}" + ) + value = update_config( + current_value, # type: ignore[type-var] + value, + ) + processed_overrides[field_name] = value + return replace(config, **processed_overrides) diff --git a/config/vllm.py b/config/vllm.py new file mode 100644 index 0000000..672b004 --- /dev/null +++ b/config/vllm.py @@ -0,0 +1,1166 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import copy +import getpass +import hashlib +import json +import os +import tempfile +import threading +import time +from contextlib import contextmanager +from dataclasses import replace +from datetime import datetime +from functools import lru_cache +from pathlib import Path +from typing import TYPE_CHECKING, Any, TypeVar, get_args + +import torch +from pydantic import ConfigDict, Field, model_validator +from pydantic.dataclasses import dataclass + +import vllm.envs as envs +from vllm.config.speculative import EagleModelTypes +from vllm.logger import enable_trace_function_call, init_logger +from vllm.transformers_utils.runai_utils import is_runai_obj_uri +from vllm.utils import random_uuid + +from .cache import CacheConfig +from .compilation import CompilationConfig, CompilationMode, CUDAGraphMode +from .device import DeviceConfig +from .ec_transfer import ECTransferConfig +from .kv_events import KVEventsConfig +from .kv_transfer import KVTransferConfig +from .load import LoadConfig +from .lora import LoRAConfig +from .model import ModelConfig +from 
.observability import ObservabilityConfig +from .parallel import ParallelConfig +from .scheduler import SchedulerConfig +from .speculative import SpeculativeConfig +from .structured_outputs import StructuredOutputsConfig +from .utils import SupportsHash, config + +if TYPE_CHECKING: + from transformers import PretrainedConfig + + from vllm.model_executor.layers.quantization.base_config import QuantizationConfig + from vllm.v1.kv_cache_interface import KVCacheConfig +else: + PretrainedConfig = Any + + QuantizationConfig = Any + + KVCacheConfig = Any + +logger = init_logger(__name__) + + +@config +@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) +class VllmConfig: + """Dataclass which contains all vllm-related configuration. This + simplifies passing around the distinct configurations in the codebase. + """ + + # TODO: use default_factory once default constructing ModelConfig doesn't + # try to download a model + model_config: ModelConfig = Field(default=None) + """Model configuration.""" + cache_config: CacheConfig = Field(default_factory=CacheConfig) + """Cache configuration.""" + parallel_config: ParallelConfig = Field(default_factory=ParallelConfig) + """Parallel configuration.""" + scheduler_config: SchedulerConfig = Field(default_factory=SchedulerConfig) + """Scheduler configuration.""" + device_config: DeviceConfig = Field(default_factory=DeviceConfig) + """Device configuration.""" + load_config: LoadConfig = Field(default_factory=LoadConfig) + """Load configuration.""" + lora_config: LoRAConfig | None = None + """LoRA configuration.""" + speculative_config: SpeculativeConfig | None = None + """Speculative decoding configuration.""" + structured_outputs_config: StructuredOutputsConfig = Field( + default_factory=StructuredOutputsConfig + ) + """Structured outputs configuration.""" + observability_config: ObservabilityConfig = Field( + default_factory=ObservabilityConfig + ) + """Observability configuration.""" + quant_config: QuantizationConfig | None 
= None + """Quantization configuration.""" + compilation_config: CompilationConfig = Field(default_factory=CompilationConfig) + """`torch.compile` and cudagraph capture configuration for the model. + + As a shorthand, one can append compilation arguments via + -O.parameter=argument such as `-O.mode=3` (same as `-O='{"mode":3}'`). + + You can specify the full compilation config like so: + `{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}` + """ + kv_transfer_config: KVTransferConfig | None = None + """The configurations for distributed KV cache transfer.""" + kv_events_config: KVEventsConfig | None = None + """The configurations for event publishing.""" + ec_transfer_config: ECTransferConfig | None = None + """The configurations for distributed EC cache transfer.""" + # some opaque config, only used to provide additional information + # for the hash computation, mainly used for testing, debugging or out of + # tree config registration. + additional_config: dict | SupportsHash = Field(default_factory=dict) + """Additional config for specified platform. Different platforms may + support different configs. Make sure the configs are valid for the platform + you are using. Contents must be hashable.""" + instance_id: str = "" + """The ID of the vLLM instance.""" + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states.
+ """ + factors: list[Any] = [] + + # summarize vllm config + vllm_factors: list[Any] = [] + from vllm import __version__ + + vllm_factors.append(__version__) + if self.model_config: + vllm_factors.append(self.model_config.compute_hash()) + else: + vllm_factors.append("None") + if self.cache_config: + vllm_factors.append(self.cache_config.compute_hash()) + else: + vllm_factors.append("None") + if self.parallel_config: + vllm_factors.append(self.parallel_config.compute_hash()) + else: + vllm_factors.append("None") + if self.scheduler_config: + vllm_factors.append(self.scheduler_config.compute_hash()) + else: + vllm_factors.append("None") + if self.device_config: + vllm_factors.append(self.device_config.compute_hash()) + else: + vllm_factors.append("None") + if self.load_config: + vllm_factors.append(self.load_config.compute_hash()) + else: + vllm_factors.append("None") + if self.lora_config: + vllm_factors.append(self.lora_config.compute_hash()) + # LoRA creates static buffers based on max_num_batched_tokens. + # The tensor sizes and strides get captured in the torch.compile + # graph explicitly. 
+ vllm_factors.append(str(self.scheduler_config.max_num_batched_tokens)) + else: + vllm_factors.append("None") + if self.speculative_config: + vllm_factors.append(self.speculative_config.compute_hash()) + else: + vllm_factors.append("None") + if self.structured_outputs_config: + vllm_factors.append(self.structured_outputs_config.compute_hash()) + else: + vllm_factors.append("None") + vllm_factors.append(self.observability_config.compute_hash()) + if self.quant_config: + pass # should be captured by model_config.quantization + if self.compilation_config: + vllm_factors.append(self.compilation_config.compute_hash()) + else: + vllm_factors.append("None") + if self.kv_transfer_config: + vllm_factors.append(self.kv_transfer_config.compute_hash()) + else: + vllm_factors.append("None") + if self.ec_transfer_config: + vllm_factors.append(self.ec_transfer_config.compute_hash()) + else: + vllm_factors.append("None") + if self.additional_config: + if isinstance(additional_config := self.additional_config, dict): + additional_config_hash = hashlib.md5( + json.dumps(additional_config, sort_keys=True).encode(), + usedforsecurity=False, + ).hexdigest() + else: + additional_config_hash = additional_config.compute_hash() + vllm_factors.append(additional_config_hash) + else: + vllm_factors.append("None") + factors.append(vllm_factors) + + hash_str = hashlib.md5( + str(factors).encode(), usedforsecurity=False + ).hexdigest()[:10] + return hash_str + + def pad_for_cudagraph(self, batch_size: int) -> int: + # if batch_size > self.compilation_config.max_cudagraph_capture_size, + # it should raise an IndexError. 
+ # the caller should make sure the batch_size is within the range, + # i.e., batch_size <= self.compilation_config.max_cudagraph_capture_size + return self.compilation_config.bs_to_padded_graph_size[batch_size] + + def enable_trace_function_call_for_thread(self) -> None: + """ + Set up function tracing for the current thread, + if enabled via the `VLLM_TRACE_FUNCTION` environment variable. + """ + if envs.VLLM_TRACE_FUNCTION: + tmp_dir = tempfile.gettempdir() + # add username to tmp_dir to avoid permission issues + tmp_dir = os.path.join(tmp_dir, getpass.getuser()) + filename = ( + f"VLLM_TRACE_FUNCTION_for_process_{os.getpid()}" + f"_thread_{threading.get_ident()}_at_{datetime.now()}.log" + ).replace(" ", "_") + log_path = os.path.join( + tmp_dir, + "vllm", + f"vllm-instance-{self.instance_id}", + filename, + ) + os.makedirs(os.path.dirname(log_path), exist_ok=True) + enable_trace_function_call(log_path) + + @staticmethod + def _get_quantization_config( + model_config: ModelConfig, load_config: LoadConfig + ) -> QuantizationConfig | None: + """Get the quantization config.""" + from vllm.platforms import current_platform + + if model_config.quantization is not None: + from vllm.model_executor.model_loader.weight_utils import get_quant_config + + quant_config = get_quant_config(model_config, load_config) + capability_tuple = current_platform.get_device_capability() + + if capability_tuple is not None: + capability = capability_tuple.to_int() + if capability < quant_config.get_min_capability(): + raise ValueError( + f"The quantization method {model_config.quantization} " + "is not supported for the current GPU. Minimum " + f"capability: {quant_config.get_min_capability()}. " + f"Current capability: {capability}." + ) + supported_dtypes = quant_config.get_supported_act_dtypes() + if model_config.dtype not in supported_dtypes: + raise ValueError( + f"{model_config.dtype} is not supported for quantization " + f"method {model_config.quantization}. 
Supported dtypes: " + f"{supported_dtypes}" + ) + quant_config.maybe_update_config(model_config.model) + return quant_config + return None + + @staticmethod + def get_quantization_config( + model_config: ModelConfig, load_config: LoadConfig + ) -> QuantizationConfig | None: + import copy + + # For some reason, the _ version of this modifies the model_config + # object, so using deepcopy to avoid this problem. + return VllmConfig._get_quantization_config( + copy.deepcopy(model_config), load_config + ) + + def with_hf_config( + self, + hf_config: PretrainedConfig, + architectures: list[str] | None = None, + ) -> "VllmConfig": + if architectures is not None: + hf_config = copy.deepcopy(hf_config) + hf_config.architectures = architectures + + model_config = copy.deepcopy(self.model_config) + model_config.hf_config = hf_config + + return replace(self, model_config=model_config) + + def _post_init_kv_transfer_config(self) -> None: + """Update KVTransferConfig based on top-level configs in VllmConfig. + + Right now, this function reads the offloading settings from + CacheConfig and configures the KVTransferConfig accordingly. + """ + if (kv_offloading_backend := self.cache_config.kv_offloading_backend) is None: + return + + # If no KVTransferConfig is provided, create a default one. + if self.kv_transfer_config is None: + self.kv_transfer_config = KVTransferConfig() + + if (kv_offloading_size := self.cache_config.kv_offloading_size) is None: + raise ValueError( + "You must set kv_offloading_size when kv_offloading_backend is set." 
+ ) + num_kv_ranks = ( + self.parallel_config.tensor_parallel_size + * self.parallel_config.pipeline_parallel_size + ) + + if kv_offloading_backend == "native": + self.kv_transfer_config.kv_connector = "OffloadingConnector" + kv_bytes_per_rank = kv_offloading_size * (1 << 30) / num_kv_ranks + + # NOTE(ApostaC): the actual calculation for num_cpu_blocks should be + # done after the model's KV cache is initialized + self.kv_transfer_config.kv_connector_extra_config.update( + {"kv_bytes_per_rank": kv_bytes_per_rank, "num_cpu_blocks": 0} + ) + elif kv_offloading_backend == "lmcache": + self.kv_transfer_config.kv_connector = "LMCacheConnectorV1" + kv_gb_per_rank = kv_offloading_size / num_kv_ranks + self.kv_transfer_config.kv_connector_extra_config = { + "lmcache.local_cpu": True, + "lmcache.max_local_cpu_size": kv_gb_per_rank, + } + + # This is the same for all backends + self.kv_transfer_config.kv_role = "kv_both" + + def __post_init__(self): + """Verify configs are valid & consistent with each other.""" + + # To give each torch profile run a unique instance name. + self.instance_id = f"{time.time_ns()}" + + self.try_verify_and_update_config() + + if self.model_config is not None: + self.model_config.verify_with_parallel_config(self.parallel_config) + self.model_config.verify_dual_chunk_attention_config(self.load_config) + + self.cache_config.verify_with_parallel_config(self.parallel_config) + + if self.lora_config is not None: + self.lora_config.verify_with_model_config(self.model_config) + + if self.quant_config is None and self.model_config is not None: + self.quant_config = VllmConfig._get_quantization_config( + self.model_config, self.load_config + ) + + executor_backend = self.parallel_config.distributed_executor_backend + executor_supports_async_sched = executor_backend in ( + "mp", + "uni", + "external_launcher", + ) + + if self.scheduler_config.async_scheduling: + # Async scheduling explicitly enabled, hard fail any incompatibilities. 
+ if self.parallel_config.pipeline_parallel_size > 1: + raise ValueError( + "Async scheduling is not yet compatible with " + "pipeline_parallel_size > 1." + ) + # Currently, async scheduling only support eagle speculative + # decoding. + if self.speculative_config is not None: + if self.speculative_config.method not in get_args(EagleModelTypes): + raise ValueError( + "Currently, async scheduling is only supported " + "with EAGLE/MTP kind of speculative decoding" + ) + if self.speculative_config.disable_padded_drafter_batch: + raise ValueError( + "async scheduling for EAGLE/MTP kind of speculative " + "decoding is enabled, but disable_padded_drafter_batch=True " + "disable_padded_drafter_batch=True is not supported for " + "this situation now. please set " + "disable_padded_drafter_batch=Fasle" + ) + if not executor_supports_async_sched: + raise ValueError( + "Currently, async scheduling only supports `mp`, `uni`, or " + "`external_launcher` distributed executor backend, but you chose " + f"`{executor_backend}`." + ) + elif self.scheduler_config.async_scheduling is None: + # Enable async scheduling unless there is an incompatible option. + # NOTE: we won't reach here until async scheduling is enabled by default. + if ( + self.parallel_config.pipeline_parallel_size > 1 + or self.speculative_config is not None + ): + logger.warning( + "Async scheduling is not yet supported with speculative decoding " + " or pipeline_parallel_size > 1 and will be disabled." 
+ ) + self.scheduler_config.async_scheduling = False + elif not executor_supports_async_sched: + logger.warning( + "Async scheduling will be disabled because it is not supported " + "with the `%s` distributed executor backend (only `mp`, `uni`, and " + "`external_launcher` are supported).", + executor_backend, + ) + self.scheduler_config.async_scheduling = False + else: + self.scheduler_config.async_scheduling = True + + from vllm.platforms import current_platform + + if ( + self.model_config is not None + and self.scheduler_config.enable_chunked_prefill + and self.model_config.dtype == torch.float32 + and current_platform.get_device_capability() == (7, 5) + ): + logger.warning_once( + "Turing devices tensor cores do not support float32 matmul. " + "To workaround this limitation, vLLM will set 'ieee' input " + "precision for chunked prefill triton kernels." + ) + + # If the user does not explicitly set a compilation mode, then + # we use the default mode. The default mode depends on other + # settings (see the below code). + if self.compilation_config.mode is None: + if self.model_config is not None and not self.model_config.enforce_eager: + self.compilation_config.mode = CompilationMode.VLLM_COMPILE + else: + self.compilation_config.mode = CompilationMode.NONE + + # If user does not set custom ops via none or all set it here based on + # compilation mode and backend. + if all(s not in self.compilation_config.custom_ops for s in ("all", "none")): + if ( + self.compilation_config.backend == "inductor" + and self.compilation_config.mode != CompilationMode.NONE + ): + self.compilation_config.custom_ops.append("none") + else: + self.compilation_config.custom_ops.append("all") + + # async tp is built on top of sequence parallelism + # and requires it to be enabled. 
+ if self.compilation_config.pass_config.enable_async_tp: + self.compilation_config.pass_config.enable_sequence_parallelism = True + + if current_platform.support_static_graph_mode(): + # if cudagraph_mode is not explicitly set by users, set default + # value + if self.compilation_config.cudagraph_mode is None: + if self.compilation_config.mode == CompilationMode.VLLM_COMPILE: + # default to full and piecewise for most models + self.compilation_config.cudagraph_mode = ( + CUDAGraphMode.FULL_AND_PIECEWISE + ) + else: + self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE + + # if cudagraph_mode has full cudagraphs, we need to check support + if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): + # decode context parallel does not support full cudagraphs + if self.parallel_config.decode_context_parallel_size > 1: + logger.warning_once( + "Decode context parallel (DCP) is enabled, which is " + "incompatible with full CUDA graphs. " + "Overriding cudagraph_mode to PIECEWISE." + ) + self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE + elif self.model_config is not None: + if self.model_config.pooler_config is not None: + logger.warning_once( + "Pooling models do not support full cudagraphs. " + "Overriding cudagraph_mode to PIECEWISE." + ) + self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE + elif self.model_config.is_encoder_decoder: + logger.warning_once( + "Encoder-decoder models do not support full cudagraphs. " + "Overriding cudagraph_mode to PIECEWISE." 
+ ) + self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE + + # disable cudagraph when enforce eager execution + if self.model_config is not None and self.model_config.enforce_eager: + logger.info("Cudagraph is disabled under eager mode") + self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE + # override related settings when enforce eager + self.compilation_config.max_cudagraph_capture_size = 0 + self.compilation_config.cudagraph_capture_sizes = [] + else: + self.compilation_config.cudagraph_num_of_warmups = 1 + + self._set_cudagraph_sizes() + else: + self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE + + if self.cache_config.kv_sharing_fast_prefill: + if ( + self.speculative_config is not None + and self.speculative_config.use_eagle() + ): + raise ValueError( + "Fast prefill optimization for KV sharing is not " + "compatible with EAGLE as EAGLE requires correct logits " + "for all tokens while fast prefill gives incorrect logits " + "for prompt tokens." + ) + + logger.warning_once( + "--kv-sharing-fast-prefill requires changes on model side for " + "correctness and to realize prefill savings. " + ) + + disable_chunked_prefill_reasons: list[str] = [] + + if self.model_config: + if self.model_config.pooler_config: + pooling_type = self.model_config.pooler_config.pooling_type + if pooling_type is None or pooling_type.lower() != "last": + disable_chunked_prefill_reasons.append( + 'Only "last" pooling supports chunked ' + "prefill and prefix caching; disabling both." + ) + if not getattr(self.model_config.hf_config, "is_causal", True): + disable_chunked_prefill_reasons.append( + "Only models using causal attention support chunked " + "prefill and prefix caching; disabling both." 
+ ) + elif self.model_config.is_encoder_decoder: + from vllm.multimodal import MULTIMODAL_REGISTRY + + self.scheduler_config.max_num_encoder_input_tokens = ( + MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config) + ) + logger.debug( + "Encoder-decoder model detected: setting " + "`max_num_encoder_input_tokens` to encoder length (%s)", + self.scheduler_config.max_num_encoder_input_tokens, + ) + if ( + self.model_config.architecture == "WhisperForConditionalGeneration" + and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn" + ): + logger.warning( + "Whisper is known to have issues with " + "forked workers. If startup is hanging, " + "try setting 'VLLM_WORKER_MULTIPROC_METHOD' " + "to 'spawn'." + ) + + # Final off-switch for CP/APC: + # Disable for (a) collected blockers, (b) encoder–decoder, or + # (c) explicit CP=False when APC wasn't requested. + # Do NOT disable merely because the resolved CP flag is False. + apc_requested = ( + self.cache_config is not None and self.cache_config.enable_prefix_caching + ) + if ( + disable_chunked_prefill_reasons + or (self.model_config is not None and self.model_config.is_encoder_decoder) + or ( + self.scheduler_config.enable_chunked_prefill is False + and not apc_requested + ) + ): + for reason in disable_chunked_prefill_reasons: + logger.info(reason) + self.scheduler_config.enable_chunked_prefill = False + self.scheduler_config.long_prefill_token_threshold = 0 + + if self.cache_config is not None: + self.cache_config.enable_prefix_caching = False + + if ( + self.kv_events_config is not None + and self.kv_events_config.enable_kv_cache_events + and not self.cache_config.enable_prefix_caching + ): + logger.warning( + "KV cache events are on, but prefix caching is not enabled." + "Use --enable-prefix-caching to enable." 
+ ) + if ( + self.kv_events_config is not None + and self.kv_events_config.publisher != "null" + and not self.kv_events_config.enable_kv_cache_events + ): + logger.warning( + "KV cache events are disabled," + "but the scheduler is configured to publish them." + "Modify KVEventsConfig.enable_kv_cache_events" + "to True to enable." + ) + current_platform.check_and_update_config(self) + + # If DCP, ensure the block size is right. + if self.parallel_config.decode_context_parallel_size > 1: + assert ( + self.parallel_config.dcp_kv_cache_interleave_size + <= self.cache_config.block_size + and self.cache_config.block_size + % self.parallel_config.dcp_kv_cache_interleave_size + == 0 + ), ( + f"Block_size({self.cache_config.block_size}) should be greater " + "than or equal to and divisible by dcp_kv_cache_interleave_size " + f"({self.parallel_config.dcp_kv_cache_interleave_size})." + ) + + assert ( + self.parallel_config.dcp_kv_cache_interleave_size == 1 + or self.speculative_config is None + ), "MTP with dcp_kv_cache_interleave_size > 1 is not supported now." + + # Do this after all the updates to compilation_config.mode + if self.compilation_config.mode == CompilationMode.VLLM_COMPILE: + self.compilation_config.set_splitting_ops_for_v1() + + if self.compilation_config.pass_config.enable_sequence_parallelism: + # With pipeline parallelism or dynamo partitioning, + # native rms norm tracing errors due to incorrect residual shape. + # Use custom rms norm to unblock. In the future, + # the pass will operate on higher-level IR to avoid the issue. 
+ # TODO: https://github.com/vllm-project/vllm/issues/27894 + is_fullgraph = ( + self.compilation_config.use_inductor_graph_partition + or len(self.compilation_config.splitting_ops) == 0 + ) + if self.parallel_config.pipeline_parallel_size > 1 or not is_fullgraph: + if "-rms_norm" not in self.compilation_config.custom_ops: + self.compilation_config.custom_ops.append("+rms_norm") + else: + regime = ( + "Dynamo partition" + if not is_fullgraph + else "pipeline parallelism" + ) + logger.warning_once( + "Sequence parallelism not supported with" + "native rms_norm when using %s, " + "this will likely lead to an error.", + regime, + ) + + # final check of cudagraph mode after all possible updates + if current_platform.is_cuda_alike(): + if ( + self.compilation_config.cudagraph_mode.has_full_cudagraphs() + and self.model_config is not None + and not self.model_config.disable_cascade_attn + and not self.compilation_config.cudagraph_mode.has_piecewise_cudagraphs() # noqa: E501 + ): + logger.warning_once( + "No piecewise cudagraph for executing cascade attention." + " Will fall back to eager execution if a batch runs " + "into cascade attentions" + ) + + if self.compilation_config.cudagraph_mode.requires_piecewise_compilation(): + assert self.compilation_config.mode == CompilationMode.VLLM_COMPILE, ( + "Compilation mode should be CompilationMode.VLLM_COMPILE " + "when cudagraph_mode piecewise cudagraphs is used, " + f"cudagraph_mode={self.compilation_config.cudagraph_mode}" + ) + + if self.parallel_config.enable_dbo: + a2a_backend = self.parallel_config.all2all_backend + assert a2a_backend in ["deepep_low_latency", "deepep_high_throughput"], ( + "Microbatching currently only supports the deepep_low_latency and " + f"deepep_high_throughput all2all backend. {a2a_backend} is not " + "supported. To fix use --all2all-backend=deepep_low_latency or " + "--all2all-backend=deepep_high_throughput and install the DeepEP" + " kernels." 
+ ) + + if not self.model_config.disable_cascade_attn: + self.model_config.disable_cascade_attn = True + logger.warning_once("Disabling cascade attention when DBO is enabled.") + + if not self.instance_id: + self.instance_id = random_uuid()[:5] + + if not self.scheduler_config.disable_hybrid_kv_cache_manager: + # logger should only print warning message for hybrid models. As we + # can't know whether the model is hybrid or not now, so we don't log + # warning message here and will log it later. + if not current_platform.support_hybrid_kv_cache(): + # Hybrid KV cache manager is not supported on non-GPU platforms. + self.scheduler_config.disable_hybrid_kv_cache_manager = True + if self.kv_transfer_config is not None: + # NOTE(Kuntai): turn HMA off for connector for now. + # TODO(Kuntai): have a more elegent solution to check and + # turn off HMA for connector that does not support HMA. + logger.warning( + "Turning off hybrid kv cache manager because " + "`--kv-transfer-config` is set. This will reduce the " + "performance of vLLM on LLMs with sliding window attention " + "or Mamba attention. If you are a developer of kv connector" + ", please consider supporting hybrid kv cache manager for " + "your connector by making sure your connector is a subclass" + " of `SupportsHMA` defined in kv_connector/v1/base.py." + ) + self.scheduler_config.disable_hybrid_kv_cache_manager = True + if self.kv_events_config is not None: + # Hybrid KV cache manager is not compatible with KV events. + self.scheduler_config.disable_hybrid_kv_cache_manager = True + if ( + self.model_config is not None + and self.model_config.attention_chunk_size is not None + ): + if ( + self.speculative_config is not None + and self.speculative_config.use_eagle() + ): + # Hybrid KV cache manager is not yet supported with chunked + # local attention + eagle. 
+ self.scheduler_config.disable_hybrid_kv_cache_manager = True + elif not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: + logger.warning( + "There is a latency regression when using chunked local" + " attention with the hybrid KV cache manager. Disabling" + " it, by default. To enable it, set the environment " + "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1." + ) + # Hybrid KV cache manager is not yet supported with chunked + # local attention. + self.scheduler_config.disable_hybrid_kv_cache_manager = True + + if self.compilation_config.debug_dump_path: + self.compilation_config.debug_dump_path = ( + self.compilation_config.debug_dump_path.absolute().expanduser() + ) + if envs.VLLM_DEBUG_DUMP_PATH is not None: + env_path = Path(envs.VLLM_DEBUG_DUMP_PATH).absolute().expanduser() + if self.compilation_config.debug_dump_path: + logger.warning( + "Config-specified debug dump path is overridden" + " by VLLM_DEBUG_DUMP_PATH to %s", + env_path, + ) + self.compilation_config.debug_dump_path = env_path + + def has_blocked_weights(): + if self.quant_config is not None: + if hasattr(self.quant_config, "weight_block_size"): + return self.quant_config.weight_block_size is not None + elif hasattr(self.quant_config, "has_blocked_weights"): + return self.quant_config.has_blocked_weights() + return False + + # Enable quant_fp8 CUDA ops (TODO disable in follow up) + # On H100 the CUDA kernel is faster than + # native implementation + # https://github.com/vllm-project/vllm/issues/25094 + if has_blocked_weights(): + custom_ops = self.compilation_config.custom_ops + if "-quant_fp8" not in custom_ops: + custom_ops.append("+quant_fp8") + + # Handle the KV connector configs + self._post_init_kv_transfer_config() + + def update_sizes_for_sequence_parallelism(self, possible_sizes: list) -> list: + # remove the sizes that not multiple of tp_size when + # enable sequence parallelism + removed_sizes = [ + size + for size in possible_sizes + if size % 
self.parallel_config.tensor_parallel_size != 0 + ] + if removed_sizes: + logger.warning( + "Batch sizes %s are removed because they are not " + "multiple of tp_size %d when " + "sequence parallelism is enabled", + removed_sizes, + self.parallel_config.tensor_parallel_size, + ) + + return [ + size + for size in possible_sizes + if size % self.parallel_config.tensor_parallel_size == 0 + ] + + def _set_cudagraph_sizes(self): + """ + vLLM defines the default candidate list of batch sizes for CUDA graph + capture as: + + ```python + max_graph_size = min(max_num_seqs * 2, 512) + # 1, 2, 4, then multiples of 8 up to 256 and then multiples of 16 + # up to max_graph_size + cuda_graph_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list( + range(256, max_graph_size + 1, 16)) + + In the end, `vllm_config.compilation_config.cudagraph_capture_sizes` + will be the final sizes to capture cudagraph (in ascending order). + + These sizes are used to capture and reuse CUDA graphs for + performance-critical paths (e.g., decoding). Capturing enables + significantly faster kernel dispatch by avoiding Python overhead. The + list is then filtered based on `max_num_batched_tokens` (e.g., 8192 on + most GPUs), which controls the total allowed number of tokens in a + batch. Since each sequence may have a variable number of tokens, the + maximum usable batch size will depend on actual sequence lengths. + + Example: + With `max_num_batched_tokens = 8192`, and typical sequences + averaging ~32 tokens, most practical batch sizes fall below 256. + However, the system will still allow capture sizes up to 512 if + shape and memory permit. + + Note: + If users explicitly specify cudagraph capture sizes in the + compilation config, those will override this default logic. + At runtime: + + - If batch size <= one of the `cudagraph_capture_sizes`, the closest + padded CUDA graph will be used. + - If batch size > largest `cudagraph_capture_sizes`, cudagraph will + not be used. 
+ """ + + if ( + self.model_config is not None + and not self.model_config.enforce_eager + and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + ): + # determine the initial max_cudagraph_capture_size + max_cudagraph_capture_size = ( + self.compilation_config.max_cudagraph_capture_size + ) + if max_cudagraph_capture_size is None: + max_cudagraph_capture_size = min( + self.scheduler_config.max_num_seqs * 2, 512 + ) + max_num_tokens = self.scheduler_config.max_num_batched_tokens + max_cudagraph_capture_size = min(max_num_tokens, max_cudagraph_capture_size) + + assert max_cudagraph_capture_size >= 1, ( + "Maximum cudagraph size should be greater than or equal to 1 " + "when using cuda graph." + ) + + # determine the cudagraph_capture_sizes + if self.compilation_config.cudagraph_capture_sizes is not None: + assert len(self.compilation_config.cudagraph_capture_sizes) > 0, ( + "cudagraph_capture_sizes should contain at least one element " + "when using cuda graph." + ) + # de-duplicate the sizes provided by the config + dedup_sizes = list(set(self.compilation_config.cudagraph_capture_sizes)) + cudagraph_capture_sizes = [ + i for i in dedup_sizes if i <= max_num_tokens + ] + # sort to make sure the sizes are in ascending order + cudagraph_capture_sizes.sort() + else: + cudagraph_capture_sizes = [ + i for i in [1, 2, 4] if i <= max_cudagraph_capture_size + ] + if max_cudagraph_capture_size >= 8: + # Step size 8 for small batch sizes, up to 256(not included) + cudagraph_capture_sizes += list( + range(8, min(max_cudagraph_capture_size + 1, 256), 8) + ) + if max_cudagraph_capture_size >= 256: + # Step size 16 for larger batch sizes + cudagraph_capture_sizes += list( + range(256, max_cudagraph_capture_size + 1, 16) + ) + + if ( + self.parallel_config.tensor_parallel_size > 1 + and self.compilation_config.pass_config.enable_sequence_parallelism + ): + cudagraph_capture_sizes = self.update_sizes_for_sequence_parallelism( + cudagraph_capture_sizes + ) + + # 
user-specific compilation_config.max_cudagraph_capture_size get + # truncated to valid_max_size when they are inconsistent. + valid_max_size = ( + cudagraph_capture_sizes[-1] if cudagraph_capture_sizes else 0 + ) + if ( + self.compilation_config.max_cudagraph_capture_size is not None + and self.compilation_config.max_cudagraph_capture_size != valid_max_size + ): + # raise error only when both two flags are user-specified + # and they are inconsistent with each other + if self.compilation_config.cudagraph_capture_sizes is not None: + raise ValueError( + "customized max_cudagraph_capture_size" + f"(={self.compilation_config.max_cudagraph_capture_size}) " + "should be consistent with the max value of " + f"cudagraph_capture_sizes(={valid_max_size})" + ) + + logger.warning( + "Truncating max_cudagraph_capture_size to %d", + valid_max_size, + ) + # always set the final max_cudagraph_capture_size + self.compilation_config.max_cudagraph_capture_size = valid_max_size + + if self.compilation_config.cudagraph_capture_sizes is not None and len( + cudagraph_capture_sizes + ) < len(self.compilation_config.cudagraph_capture_sizes): + # If users have specified capture sizes, we only need to + # compare the lens before and after modification since the modified + # list is only the subset of the original list. + logger.warning( + ( + "cudagraph_capture_sizes specified in compilation_config" + " %s is overridden by config %s" + ), + self.compilation_config.cudagraph_capture_sizes, + cudagraph_capture_sizes, + ) + # always write back the final sizes + self.compilation_config.cudagraph_capture_sizes = cudagraph_capture_sizes + + else: + # no cudagraph in use + self.compilation_config.max_cudagraph_capture_size = 0 + self.compilation_config.cudagraph_capture_sizes = [] + + # complete the remaining process. 
+ self.compilation_config.post_init_cudagraph_sizes() + + def recalculate_max_model_len(self, max_model_len: int): + # Can only be called in try_verify_and_update_config + model_config = self.model_config + max_model_len = model_config.get_and_verify_max_len(max_model_len) + self.model_config.max_model_len = max_model_len + + def try_verify_and_update_config(self): + if self.model_config is None: + return + + # Avoid running try_verify_and_update_config multiple times + if getattr(self.model_config, "config_updated", False): + return + self.model_config.config_updated = True + + architecture = self.model_config.architecture + if architecture is None: + return + + from vllm.model_executor.models.config import ( + MODELS_CONFIG_MAP, + HybridAttentionMambaModelConfig, + ) + + cls = MODELS_CONFIG_MAP.get(architecture, None) + if cls is not None: + cls.verify_and_update_config(self) + + if self.model_config.is_hybrid: + HybridAttentionMambaModelConfig.verify_and_update_config(self) + + if self.model_config.convert_type == "classify": + # Maybe convert ForCausalLM into ForSequenceClassification model. + from vllm.model_executor.models.adapters import SequenceClassificationConfig + + SequenceClassificationConfig.verify_and_update_config(self) + + if hasattr(self.model_config, "model_weights") and is_runai_obj_uri( + self.model_config.model_weights + ): + if self.load_config.load_format == "auto": + logger.info( + "Detected Run:ai model config. " + "Overriding `load_format` to 'runai_streamer'" + ) + self.load_config.load_format = "runai_streamer" + elif self.load_config.load_format not in ( + "runai_streamer", + "runai_streamer_sharded", + ): + raise ValueError( + f"To load a model from S3, 'load_format' " + f"must be 'runai_streamer' or 'runai_streamer_sharded', " + f"but got '{self.load_config.load_format}'. 
" + f"Model: {self.model_config.model}" + ) + + def compile_debug_dump_path(self) -> Path | None: + """Returns a rank-aware path for dumping + torch.compile debug information. + """ + if self.compilation_config.debug_dump_path is None: + return None + tp_rank = self.parallel_config.rank + dp_rank = self.parallel_config.data_parallel_rank + data_parallel_size = self.parallel_config.data_parallel_size + append_path = ( + f"rank_{tp_rank}" + if data_parallel_size == 1 + else f"rank_{tp_rank}_dp_{dp_rank}" + ) + path = self.compilation_config.debug_dump_path / append_path + return path + + def __str__(self): + return ( + f"model={self.model_config.model!r}, " + f"speculative_config={self.speculative_config!r}, " + f"tokenizer={self.model_config.tokenizer!r}, " + f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, " + f"tokenizer_mode={self.model_config.tokenizer_mode}, " + f"revision={self.model_config.revision}, " + f"tokenizer_revision={self.model_config.tokenizer_revision}, " + f"trust_remote_code={self.model_config.trust_remote_code}, " + f"dtype={self.model_config.dtype}, " + f"max_seq_len={self.model_config.max_model_len}, " + f"download_dir={self.load_config.download_dir!r}, " + f"load_format={self.load_config.load_format}, " + f"tensor_parallel_size={self.parallel_config.tensor_parallel_size}, " # noqa + f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, " # noqa + f"data_parallel_size={self.parallel_config.data_parallel_size}, " # noqa + f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, " # noqa + f"quantization={self.model_config.quantization}, " + f"enforce_eager={self.model_config.enforce_eager}, " + f"kv_cache_dtype={self.cache_config.cache_dtype}, " + f"device_config={self.device_config.device}, " + f"structured_outputs_config={self.structured_outputs_config!r}, " + f"observability_config={self.observability_config!r}, " + f"seed={self.model_config.seed}, " + 
f"served_model_name={self.model_config.served_model_name}, " + f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, " + f"enable_chunked_prefill={self.scheduler_config.enable_chunked_prefill}, " # noqa + f"pooler_config={self.model_config.pooler_config!r}, " + f"compilation_config={self.compilation_config!r}" + ) + + @model_validator(mode="after") + def validate_mamba_block_size(self) -> "VllmConfig": + if self.model_config is None: + return self + mamba_block_size_is_set = ( + self.cache_config.mamba_block_size is not None + and self.cache_config.mamba_block_size != self.model_config.max_model_len + ) + if mamba_block_size_is_set and not self.cache_config.enable_prefix_caching: + raise ValueError( + "--mamba-block-size can only be set with --enable-prefix-caching" + ) + return self + + +_current_vllm_config: VllmConfig | None = None +_current_prefix: str | None = None + + +@contextmanager +def set_current_vllm_config( + vllm_config: VllmConfig, check_compile=False, prefix: str | None = None +): + """ + Temporarily set the current vLLM config. + Used during model initialization. + We save the current vLLM config in a global variable, + so that all modules can access it, e.g. custom ops + can access the vLLM config to determine how to dispatch. + """ + global _current_vllm_config, _current_prefix + old_vllm_config = _current_vllm_config + old_prefix = _current_prefix + from vllm.compilation.counter import compilation_counter + + num_models_seen = compilation_counter.num_models_seen + try: + _current_vllm_config = vllm_config + _current_prefix = prefix + yield + except Exception: + raise + else: + if check_compile: + vllm_config.compilation_config.custom_op_log_check() + + if ( + check_compile + and vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE + and compilation_counter.num_models_seen == num_models_seen + ): + # If the model supports compilation, + # compilation_counter.num_models_seen should be increased + # by at least 1. 
+ # If it is not increased, it means the model does not support + # compilation (does not have @support_torch_compile decorator). + logger.warning( + "`torch.compile` is turned on, but the model %s" + " does not support it. Please open an issue on GitHub" + " if you want it to be supported.", + vllm_config.model_config.model, + ) + finally: + _current_vllm_config = old_vllm_config + _current_prefix = old_prefix + # Clear the compilation config cache when context changes + get_cached_compilation_config.cache_clear() + + +@lru_cache(maxsize=1) +def get_cached_compilation_config(): + """Cache config to avoid repeated calls to get_current_vllm_config()""" + return get_current_vllm_config().compilation_config + + +def get_current_vllm_config() -> VllmConfig: + if _current_vllm_config is None: + # in ci, usually when we test custom ops/modules directly, + # we don't set the vllm config. In that case, we set a default + # config. + logger.warning("Current vLLM config is not set.") + return VllmConfig() + return _current_vllm_config + + +T = TypeVar("T") + + +def get_layers_from_vllm_config( + vllm_config: VllmConfig, + layer_type: type[T], + layer_names: list[str] | None = None, +) -> dict[str, T]: + """ + Get layers from the vLLM config. + + Args: + vllm_config: The vLLM config. + layer_type: The type of the layer to get. + layer_names: The names of the layers to get. If None, return all layers. 
+ """ + + if layer_names is None: + layer_names = list(vllm_config.compilation_config.static_forward_context.keys()) + + forward_context = vllm_config.compilation_config.static_forward_context + + return { + layer_name: forward_context[layer_name] + for layer_name in layer_names + if isinstance(forward_context[layer_name], layer_type) + } diff --git a/connections.py b/connections.py new file mode 100644 index 0000000..31b0d5e --- /dev/null +++ b/connections.py @@ -0,0 +1,189 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Mapping, MutableMapping +from pathlib import Path +from urllib.parse import urlparse + +import aiohttp +import requests + +from vllm.version import __version__ as VLLM_VERSION + + +class HTTPConnection: + """Helper class to send HTTP requests.""" + + def __init__(self, *, reuse_client: bool = True) -> None: + super().__init__() + + self.reuse_client = reuse_client + + self._sync_client: requests.Session | None = None + self._async_client: aiohttp.ClientSession | None = None + + def get_sync_client(self) -> requests.Session: + if self._sync_client is None or not self.reuse_client: + self._sync_client = requests.Session() + + return self._sync_client + + # NOTE: We intentionally use an async function even though it is not + # required, so that the client is only accessible inside async event loop + async def get_async_client(self) -> aiohttp.ClientSession: + if self._async_client is None or not self.reuse_client: + self._async_client = aiohttp.ClientSession(trust_env=True) + + return self._async_client + + def _validate_http_url(self, url: str): + parsed_url = urlparse(url) + + if parsed_url.scheme not in ("http", "https"): + raise ValueError( + "Invalid HTTP URL: A valid HTTP URL must have scheme 'http' or 'https'." 
+ ) + + def _headers(self, **extras: str) -> MutableMapping[str, str]: + return {"User-Agent": f"vLLM/{VLLM_VERSION}", **extras} + + def get_response( + self, + url: str, + *, + stream: bool = False, + timeout: float | None = None, + extra_headers: Mapping[str, str] | None = None, + allow_redirects: bool = True, + ): + self._validate_http_url(url) + + client = self.get_sync_client() + extra_headers = extra_headers or {} + + return client.get( + url, + headers=self._headers(**extra_headers), + stream=stream, + timeout=timeout, + allow_redirects=allow_redirects, + ) + + async def get_async_response( + self, + url: str, + *, + timeout: float | None = None, + extra_headers: Mapping[str, str] | None = None, + allow_redirects: bool = True, + ): + self._validate_http_url(url) + + client = await self.get_async_client() + extra_headers = extra_headers or {} + + return client.get( + url, + headers=self._headers(**extra_headers), + timeout=timeout, + allow_redirects=allow_redirects, + ) + + def get_bytes( + self, url: str, *, timeout: float | None = None, allow_redirects: bool = True + ) -> bytes: + with self.get_response( + url, timeout=timeout, allow_redirects=allow_redirects + ) as r: + r.raise_for_status() + + return r.content + + async def async_get_bytes( + self, + url: str, + *, + timeout: float | None = None, + allow_redirects: bool = True, + ) -> bytes: + async with await self.get_async_response( + url, timeout=timeout, allow_redirects=allow_redirects + ) as r: + r.raise_for_status() + + return await r.read() + + def get_text(self, url: str, *, timeout: float | None = None) -> str: + with self.get_response(url, timeout=timeout) as r: + r.raise_for_status() + + return r.text + + async def async_get_text( + self, + url: str, + *, + timeout: float | None = None, + ) -> str: + async with await self.get_async_response(url, timeout=timeout) as r: + r.raise_for_status() + + return await r.text() + + def get_json(self, url: str, *, timeout: float | None = None) -> str: + 
with self.get_response(url, timeout=timeout) as r: + r.raise_for_status() + + return r.json() + + async def async_get_json( + self, + url: str, + *, + timeout: float | None = None, + ) -> str: + async with await self.get_async_response(url, timeout=timeout) as r: + r.raise_for_status() + + return await r.json() + + def download_file( + self, + url: str, + save_path: Path, + *, + timeout: float | None = None, + chunk_size: int = 128, + ) -> Path: + with self.get_response(url, timeout=timeout) as r: + r.raise_for_status() + + with save_path.open("wb") as f: + for chunk in r.iter_content(chunk_size): + f.write(chunk) + + return save_path + + async def async_download_file( + self, + url: str, + save_path: Path, + *, + timeout: float | None = None, + chunk_size: int = 128, + ) -> Path: + async with await self.get_async_response(url, timeout=timeout) as r: + r.raise_for_status() + + with save_path.open("wb") as f: + async for chunk in r.content.iter_chunked(chunk_size): + f.write(chunk) + + return save_path + + +global_http_connection = HTTPConnection() +""" +The global [`HTTPConnection`][vllm.connections.HTTPConnection] instance used +by vLLM. 
+""" diff --git a/device_allocator/__init__.py b/device_allocator/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/device_allocator/__pycache__/__init__.cpython-312.pyc b/device_allocator/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b33ad1ceeb18985242058556120f2de4e19e10d GIT binary patch literal 166 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJVx$2kX7U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?SVD`Ewj$_T{8AjU^#Mn=XWW*`dy5FIHF literal 0 HcmV?d00001 diff --git a/device_allocator/__pycache__/cumem.cpython-312.pyc b/device_allocator/__pycache__/cumem.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0a77204c7998f4d9406958d79353292274b56b5 GIT binary patch literal 13260 zcmb_iYit|Wm7d}IC6RhREXkvXCDWFr`0YBjvwp;}<3veo=cR6!p*bUo3SXHS+7?T- zYMNqY)SD>ShvjYBD!>A()?L6qgZb7zK> zXeMony%O)-_w(FyzH{#VkA?;x2bXYsKmN`G9QSLASk7$*Hkxf5casx2kxy`w#+RSu zS==^h!`GIuCmoXxp7QJoXVNw4Vqr(Zo%Bq4SeQ?EC%wpbCVWZ%q@Sg^5`kpHWCPFH zILR+H-sX)SCxfC}3WVHVl+v2l-ZV&lu47`%`ZvX!5g!m+B!|>H%}c#gHtIucr8rV-sAHtKCGYH6*6bE< z_S}{zi_AWHf7+BwY(r1OlIu_D3%$C;_Lm)%8H}*mbzpX#*6b#C&hdksv~7r!gIH63 zvK7BJ{MzyB#IFm#?&r80Z(QZB*`MRC^5L$JXkZgzUiX}iCKA!h2?^oGSUROj*VJS* z6`hu3q}WfU=J4^yQ*kwtNKYH79dRX+iKik-DVdh%BGD_+IBP<8$ tQ!$em*yLUC z?f7k+Kq^R)pXb#|zG$;H{2(`P%iGRy-vCk1+w)xBeuY!`8=kyyyVdTzBhTj@BHzu;I%^6cYAx%$Gw+R)3<4$Y$1mf0;aIl~EcVWm~z#S@b7JS#M=NQo)b$tceYL?6+3N)f1x zvHJq$2dN$B~gpM@1xDp4(h~ zOn62TV$qb4RV1Xx6=OJ<=Ttl;8YQFZjM*t`IBKm$Q#4 zX_>e~uI;ImbS)#rRE&eBor)%V^K9NkH@muqOr`J-a_k!<4bFe5zyk#$=&FLe#+bM7PxXF*WAAF=(@L|(764s zS6DguL2KLMnNqN0`M`U@?WNxSCEqVYZDlvt*8Z8D^Yxc~oTs_q?z$IjDZ99aj$)uk z3-l~sTnqFsJY4d(ElEpL1%GeRKdAW!*Zeyc&{oT2S=NFB3+GDymZHB; z^Y<-hQDP;k`F9rm`!)amJLm5DkC!={(^c|qU-2yGR(tPUyE9wxoZPS@v4Bh$7YHp} z_^cm8DR_n7D21b>MPjlPRk2oB7SONW zX(ne=h_O^zN=Q+Km}0}}tQh^;rN@kA(}QePtE+q|hJGu~VnrLo_-}F&N3wR(_7o@C 
zB`~35NR^xj*(fC1CtZ?zmIhwqL_5h(lygwIIeXZtH=Z>16*P7xsz!C!3Z!CEB|=t^X$O_dLl}YX1Ubi@=5kdA4Pn<^9NdipmgoUf|XpU4?CY9c@qD>{<-KSE zJ@?`x@1r(1_1knG9mxk#-rs}Y$>RZ^y8Ns3HmQ?Zx{ zWzYu4ktAcwl1E_cvg9{ONVQUD)=IUHLjj)Qz4NJ2l;EeZIU=i>x*32KB0S) zb2vmqow^%A8gbYqZ$lM%H$2@D!vWG=hPcq%Pd|0$|O+>D}OyL^bM z9j0pDlyucNW(su)UTAD;2bf?aV>M7ci1I@y^$q-#ufSX2K6P=|YPH z-SoZYD~9^DP=7IWL<=2R_k@;qFVBIYG`SYe6F+GyI9d%_s4N_sqGcMyZ=)God;wJV zc?&*o`~O++ylu9!%$YdyU+hA4E4YF)rQxxRiV{olbxOnddw$>j8*$f{u|4a>9=4 zmdk|Yb&r*$5Y1xS&3*4Ps}bU)nh_GQ)n?^~QF(}y?rnUv`+9fL+pT%wk7(YJRnPn0 zy`{$XWpAOe|2`M}Yq$I4gZ8)fc`5v$^OVE=Hs?k- zh7uKL!Ox9fAZfE~06Mpewn>Kw*&y0!;Y0^X2E?2clRRRZ=-#9Qc%fHulRnWa`9+s# zgD&HPMiCJGQiJG{{D=n-Zxq|b1}P}H#YRJOk=!IzL>u#sdcl{SlOdD~NllWQ*!h;+ zCge6tEy!<1E36)Lx|Wxnw^5QDG!2*3fV5Wh*EVMhcj%3$v)J^?xsHxt7a15kDJVdz z2}w<-gjgb~5GstSfRChWC_>c2leD)Cdl91$3}L1ic2d7Lt z3FIcml~`0JaCvn`qQa5^6fJZlC<#dd^7XOg^C=}R64(Ult)a7>5U^nb`a(4t6+p<= zs02g4po@4S!P>znOhQM;Y>XA0Od~~Y0N9zR0iYF#U;(WQMw~T3(F_&`)q!_uJf==& z4)htC%vFNN7{>E#4$p5L6CDdyQ30rpPsOPvG)hCnBC$@Y3xz&TKlzlo^v39@Pgzn_ z3)7oBU^P%(SYuDN4(Mw11u2rv)C`GjG?2fLk%3XuStT)7-7GdDb95GpCnQ7jf8TZifq0l*ZlrsWri z>Mg~~nDT_~u?C>KMQJLEVd;J|5CIX(9`KwkJeB0M9lwp2;6X0KH=N>zip^Vm>Lvb* zW#&SQO06(w)v4BqRM0>*tsbBP9J1y9o-I3xt*wkuz+!G9IddN>g$<3D)}}%vIQYQd z*%-DLhfW6FB}~1RwN4Cx?WjRDLM z<{bGEEQm~IEqM=l47HPx&;x|QtX^{4w$5~HbLQWmxI*A}f%|PE7usI%oGJ*X);y;c zzrF77D*6XB|3JaNYkl~@+VG(p6L$lH>%rb)a99ft7d*rF19QRzY{P$pz~&8~t!xr9 zOjpn1!!WE_cAgjQdHxE|h-H%vCeG!#NQEF&qB`d4!7`Mr;AmD4BGzzuuAXX#@)i3L zN|0Vt<50qHCCzbR=Y-zz;IdBM>uVM3zp%<`5?i!sG=o7jTt8GHOhClBvFeDiasBtVz=l zE7{9qv2>D*4#fH@-?*Y=VOH9E;P8=ydzt8_YND$~g^F(K{krwI+*?>g}JC z-ey(U!glr6ddxU%(;Z~^g7%V5O({e$77&G6&CH>ZTt^MzTN$@dT1mr$ju~ts%=zlK zu79i0clgcbwcxQuM=2P-)2amzmN|#lSGL(2hBxd86_66(9z4G2xa;Xz7siUhQB62X zvCh)|gY*^39&TiJarl%re2U_OrJ8AGkme?9F{w4pyc^;f@#yzRExGSqoT*qNKH z@Wk^(8YxJ-Du=7eI+G$u+7abcDlxQYk~6Vf1Mk&EV`CrDjB_VWR?6&9V2T8_G9pHV zosFm{0uLzy;!|=uS<6X3qD7ntI}Jk!nHS_s@N}19F4vu+bU8Z>)%NR@MVq60GIJ3a zo}}a!%~pN}dAZ?QnyOFCKSf#PS$MGAdAQacD__-GMixEm;lrgs@3LClKCW#ahibda 
zx9BK(dNfbZ^0S&}7)r4>_-f#KpcHtF3P!bnPzr<$+g&Np`l;6miVUC-qb1#*QRVuy zbQgVtmbh7bYh68yJnRyd0t?GQJ!<+6u4VGJO~Uq%(N$eF*`zhwY@l{jMCDaltFL53 zggbIG6=JGGfr)hb1u!)xXa7VRBz99z8yK6&9X-qP{v5iKpT`8lZiCqDnRJF(>f|U= z<;&y^kT*z$OxhC>)*WCB{APV7E?>*2NP16;7Uo>1K1;N^IV(wpb0<#TjmujBd=$wp*d%i&Wz(6lm0Xt+8lWR3bgiRdhPtfxT`G29V zp9WBqNz5_Zz-HCHGUB7S+KMKvX4qLgDqNajhKtI*pDAxQjgG1OX1kTPwv1*Tt@9tvHA+6_7VcU`GkF7U!EC>E6cDG?ysb}Z%LyM37 zCfKz+`ZMR<;8?*kRtS#WYwuX}mpVEZ1NXXmi(PxPu06%B{aV-lUvwS3etz-vQfsNL z>*m*A`}*S9QlMk`&`STUuV{gx#S3&{dF}eOe~EuKUoZ1BBH4*Zrg)bkYrL=kkaa z=r092mk(=!ffC}!w1$yYMD~5!;P(1HZRWiGSAExgHv;RP1_~5CeVV85eUD%WR0j#t zEpoJ?8KkhnUk1%tJPy)q*_37^4f*c`^Y(h!4q7&D$Zm8Xw}R@hv!hZNCsOpzgaRo1IuX*u7q@83I{s~pC=y}qXk#0XIiZ=cI%l1DfEU} zwZfAPrd!|iOFYGBh$NQaQA9sOo88!%lZ})K;O5AuIL~mT80=0*}7`4}6@e1eI zip7D!D<~a3m=2g3HjRkGimQGrg!1)tLvFSX61RH7>ohxXc72N;d)6WH#^ot+CwnG1 zTfyE|zf~TCH)#%VPDp#=SO(4)Z7=M^X7e`!=%pS{aT+hsZe#t(&AZeNBLu9w86%H6 zIPjlMGp~I&M~?-!zAu~OjrV1Aof~#WM>g}Nlhy^4c~d9T*~Dc8LpO=itHPyB_$Y~$rvB&++U6@R^QwnVSCu7IIFfMzQn`(*Le)*&g+QEn z-9UF{m0?hm78J$Q%^Mm@HBhMDSQ$EF5xU#78HEE?peLuWT@2yLPf_S$^2mZCPm=dE zJl&(D@S1KWt;(d6vqvM)@R{L=4v+3mCq#A;LTF3(S0|z9zG{SVvCWUj3}zFox!BD0 z)i9l!{1%Xb@}KC4U8aNo1cHkXt~U$C=25MAbn!I7tijinyTQSNXRr_)yw}`bY#z{> z2NvC>ZGFsax!$mwS(gA@Mpj#k;Zs`p)Tee^qwiir6L8UIrU8Nai4t>pk-qZNA(YrTix^!@xR z+L6cCdMB37uD9-3nR>r$ACvb7We8Jx>15~W0RL9^ zfzzD*?ch%G4||dF4(B}Wb-&|uBfRB75b+T^ej8Wd8LwDzP7$tk^=w}jw!oQ(M$L48 zLQ8d#lKEbh@ero_^EbBPMDz64Cii%SBp)%adKfAfAf_%$GLg7pzO0aAjrzFtpjI^~ zScooRB8DI&5D#-EQJwBpvdOwAUm-X{C%}|sO?2sFne4L)E=zMY7hvpKXiJHQh`{I= zFl$oxwnZ;QSA7vz#pi9y?JNAPE+F9R=gUqA9LT3oSHaWC1W(w;?g))j$DU}l%RA76 zJWL)L40LxSBI4<5#JJcZD}ekcdEmyl*n%Ra#$$%TS$>?-=}D0c*32>ctZs+SWt`45 zrF{z?J?6ws-FGQYbRvJByjRHEL*5v9WJ)oH^$0>Q;-`>q^F8ikTf5U!et;VoUN~21 z>|1j{E(;46mRi;v!aZlxCpNEhWW&z6n#&aWq&?_-s65c<48GZ2=J0vfu|c1*(BRy* z+EV85xzoKtpO5<j*~}E+YEt03tU50 zy62ejxbuYkL*(L6amwGpFWY&Z-)QIfmJc{D{y*e~KIC?M$hH5P3;Y*1_#t=vSKij8 zXV<)a3$C)OlMk(2Ts`o{nhI?5~>;Q6tY*)oUEol6_^DSLQ+? 
str | None: + """ + According to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html, + the file `/proc/self/maps` contains the memory maps of the process, which includes the + shared libraries loaded by the process. We can use this file to find the path of + a loaded library. + """ # noqa + found_line = None + with open("/proc/self/maps") as f: + for line in f: + if lib_name in line: + found_line = line + break + if found_line is None: + # the library is not loaded in the current process + return None + # if lib_name is libcudart, we need to match a line with: + # address /path/to/libcudart-hash.so.11.0 + start = found_line.index("/") + path = found_line[start:].strip() + filename = path.split("/")[-1] + assert filename.rpartition(".so")[0].startswith(lib_name), ( + f"Unexpected filename: {filename} for library {lib_name}" + ) + return path + + +cumem_available = False +try: + from vllm.cumem_allocator import ( + init_module, + python_create_and_map, + python_unmap_and_release, + ) + from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary + + lib_name = find_loaded_library("cumem_allocator") + libcudart = CudaRTLibrary() + cumem_available = True +except ModuleNotFoundError: + # only cuda and rocm platforms support cumem allocator + init_module = None + python_create_and_map = None + python_unmap_and_release = None + CudaRTLibrary = None + lib_name = None + libcudart = None + +# py_device, py_alignedSize, py_d_mem, py_p_memHandle +HandleType = tuple[int, int, int, int] + + +@dataclasses.dataclass +class AllocationData: + handle: HandleType + tag: str + cpu_backup_tensor: torch.Tensor | None = None + + +def create_and_map(allocation_handle: HandleType) -> None: + python_create_and_map(*allocation_handle) + + +def unmap_and_release(allocation_handle: HandleType) -> None: + python_unmap_and_release(*allocation_handle) + + +def get_pluggable_allocator( + python_malloc_fn: Callable[[int], int], python_free_func: 
Callable[[int, int], None] +) -> torch.cuda.memory.CUDAPluggableAllocator: + init_module(python_malloc_fn, python_free_func) + new_alloc = torch.cuda.memory.CUDAPluggableAllocator( + lib_name, "my_malloc", "my_free" + ) + return new_alloc + + +@contextmanager +def use_memory_pool_with_allocator( + python_malloc_fn: Callable[[int], int], python_free_func: Callable[[int, int], None] +) -> None: + new_alloc = get_pluggable_allocator(python_malloc_fn, python_free_func) + mem_pool = torch.cuda.memory.MemPool(new_alloc._allocator) + with torch.cuda.memory.use_mem_pool(mem_pool): + yield mem_pool, new_alloc + + +class CuMemAllocator: + """ + A singleton class that manages a memory pool for CUDA tensors. + The memory in this pool can be offloaded or discarded when the + allocator sleeps. + + Inside the `use_memory_pool(tag)` context, all tensors created will + be allocated in the memory pool, and has the same tag as the + tag passed to the context. + + When we call `sleep`, all tensors with the specified tag will be + offloaded to CPU memory, and the rest of the tensors will be discarded. + When we call `wake_up`, all tensors that are previously offloaded + will be loaded back to GPU memory, and the rest of the tensors will + have empty memory. + + Why it needs to be a singleton? + When allocated tensors are garbage collected, PyTorch will call + the free callback, which will call the `python_free_callback` method. + The C-extension uses a global variable to store the function of an + instance of this class. If we create multiple instances of this class, + the global variable will be overwritten and the free callback will + not work as expected. + """ + + instance: "CuMemAllocator" = None + default_tag: str = "default" + + @staticmethod + def get_instance() -> "CuMemAllocator": + """ + CuMemAllocator is a singleton class. + We cannot call the constructor directly. + Call this method to get the instance. 
+ """ + assert cumem_available, "cumem allocator is not available" + if CuMemAllocator.instance is None: + CuMemAllocator.instance = CuMemAllocator() + return CuMemAllocator.instance + + def __init__(self): + conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "") + assert "expandable_segments:True" not in conf, ( + "Expandable segments are not compatible with memory pool. " + "Please track https://github.com/pytorch/pytorch/issues/147851 " + "for the latest updates." + ) + + self.pointer_to_data: dict[int, AllocationData] = {} + self.current_tag: str = CuMemAllocator.default_tag + self.allocator_and_pools: dict[str, Any] = {} + # Creating strong references to the two callbacks here to prevent + # these ephemeral bound-method objects being garbage collected. + # See discussions in https://github.com/vllm-project/vllm/pull/22724 + self.python_malloc_callback = self._python_malloc_callback + self.python_free_callback = self._python_free_callback + + def _python_malloc_callback(self, allocation_handle: HandleType) -> None: + """ + Internal method to store the allocation data + when memory is allocated in the memory pool.""" + py_d_mem = allocation_handle[2] + self.pointer_to_data[py_d_mem] = AllocationData( + allocation_handle, self.current_tag + ) + logger.debug( + "Allocated %s bytes for %s with address %s from cumem allocator", + allocation_handle[1], + self.current_tag, + py_d_mem, + ) + return + + def _python_free_callback(self, ptr: int) -> HandleType: + """ + Internal method to look up the allocation data + when memory is freed in the memory pool.""" + data = self.pointer_to_data.pop(ptr) + if data.cpu_backup_tensor is not None: + data.cpu_backup_tensor = None + logger.debug( + "Freed %s bytes for %s with address %s from cumem allocator", + data.handle[1], + data.tag, + ptr, + ) + return data.handle + + def sleep(self, offload_tags: tuple[str, ...] | str | None = None) -> None: + """ + Put the allocator in sleep mode. 
+ All data in the memory allocation with the specified tag will be + offloaded to CPU memory, and others will be discarded. + + :param offload_tags: The tags of the memory allocation that will be + offloaded. The rest of the memory allocation will be discarded. + """ + if offload_tags is None: + # by default, allocated tensors are offloaded + # when the allocator sleeps + offload_tags = (CuMemAllocator.default_tag,) + elif isinstance(offload_tags, str): + offload_tags = (offload_tags,) + + assert isinstance(offload_tags, tuple) + + total_bytes = 0 + backup_bytes = 0 + + for ptr, data in self.pointer_to_data.items(): + handle = data.handle + total_bytes += handle[1] + if data.tag in offload_tags: + backup_bytes += handle[1] + size_in_bytes = handle[1] + cpu_backup_tensor = torch.empty( + size_in_bytes, + dtype=torch.uint8, + device="cpu", + pin_memory=is_pin_memory_available(), + ) + cpu_ptr = cpu_backup_tensor.data_ptr() + libcudart.cudaMemcpy(cpu_ptr, ptr, size_in_bytes) + data.cpu_backup_tensor = cpu_backup_tensor + unmap_and_release(handle) + + logger.info( + "CuMemAllocator: sleep freed %.2f GiB memory in total, of which " + "%.2f GiB is backed up in CPU and the rest %.2f GiB is discarded " + "directly.", + total_bytes / 1024**3, + backup_bytes / 1024**3, + (total_bytes - backup_bytes) / 1024**3, + ) + + gc.collect() + torch.cuda.empty_cache() + + def wake_up(self, tags: list[str] | None = None) -> None: + """ + Wake up the allocator from sleep mode. + All data that is previously offloaded will be loaded back to GPU + memory, and the rest of the data will have empty memory. + + :param tags: The tags of the memory allocation that will be loaded + back to GPU memory. If None, all memory allocation will be loaded + back to GPU memory. 
+ """ + for ptr, data in self.pointer_to_data.items(): + if tags is None or data.tag in tags: + handle = data.handle + create_and_map(handle) + if data.cpu_backup_tensor is not None: + cpu_backup_tensor = data.cpu_backup_tensor + if cpu_backup_tensor is not None: + size_in_bytes = ( + cpu_backup_tensor.numel() * cpu_backup_tensor.element_size() + ) + cpu_ptr = cpu_backup_tensor.data_ptr() + libcudart.cudaMemcpy(ptr, cpu_ptr, size_in_bytes) + data.cpu_backup_tensor = None + + @contextmanager + def use_memory_pool(self, tag: str | None = None): + """ + A context manager to use the memory pool. + All memory allocation created inside the context will be allocated + in the memory pool, and has the specified tag. + + :param tag: The tag of the memory allocation. If None, the default tag + will be used. + """ + if tag is None: + tag = CuMemAllocator.default_tag + + assert isinstance(tag, str) + + old_tag = self.current_tag + self.current_tag = tag + with use_memory_pool_with_allocator( + self.python_malloc_callback, self.python_free_callback + ) as data: + # start to hit another PyTorch bug in PyTorch 2.6, + # possibly because of gc-related issue w.r.t. the allocator and + # the memory pool. + # to avoid the issue, we keep a reference of the data. + # see https://github.com/pytorch/pytorch/issues/146431 . + self.allocator_and_pools[tag] = data + yield + # PyTorch's bug, calling torch.cuda.empty_cache() will error + # when using pluggable allocator, see + # https://github.com/pytorch/pytorch/issues/145168 . + # if we have some memory allocated and then freed, + # the memory will not be released, e.g. in online quantization, + # where the model is created in higher precision, and then + # quantized in lower precision. + # Find all unused allocations and manually release them. + # TODO: we should expose `empty_cache` method in the memory pool. + # TODO: ask for help from PyTorch team to expose this method. 
+ allocations = data[0].snapshot() + for allocation in allocations: + if allocation["allocated_size"] == 0: + handle = self._python_free_callback(allocation["address"]) + unmap_and_release(handle) + self.current_tag = old_tag + + def get_current_usage(self) -> int: + """ + Get the total number of bytes allocated in the memory pool. + """ + sum_bytes: int = 0 + for ptr, data in self.pointer_to_data.items(): + handle = data.handle + sum_bytes += handle[1] + return sum_bytes diff --git a/distributed/__init__.py b/distributed/__init__.py new file mode 100644 index 0000000..e911b2a --- /dev/null +++ b/distributed/__init__.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from .communication_op import * +from .parallel_state import * +from .utils import * diff --git a/distributed/__pycache__/__init__.cpython-312.pyc b/distributed/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be1943ae0d5de23cb904aeef8d2a68e56bafe347 GIT binary patch literal 250 zcmX@j%ge<81UW3-nfrnCV-N=hn4pZ$NzqVpTF}GQVU5 zD%51W#i-?{$$U#7IX^eIG%qtbu_QA;FFwEE7GFVPQDRO`YEFD{Nn%OrE!NVK%$(wt z44*+3{c_MR$t}h$xPBOs4U6I&(kl<$;s7E$t*4@ z%1kOPNlnp@kI&4@EQycTE2zB1VUwGmQks)$R|N7D$T7tNK;i>4BO~J%4hBYrI}EaS K8AOUWfPw&9kwgFh literal 0 HcmV?d00001 diff --git a/distributed/__pycache__/communication_op.cpython-312.pyc b/distributed/__pycache__/communication_op.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c8ec37d0989125c1a44f635750a51854dfcf498 GIT binary patch literal 2312 zcmcIk&2Jk;6rb4-$MM){Oqw*Lk-CDYvXwd}apDFksuVzgpvr*&2P@Lb+1(}^*7j<4 zT;a%vB2_}D2YM@<=+RLQ{%6|C;X=?Vg-Iq9E{KdWFiwgsDuj`b6awxf+R*6M%h#xrKA>A zSW_)&C*Doc8_me+^S2ap=&n*28b;>mnXyqTdS-mY9KNd-hR+cngZLQ4C(o=;p#Dqt zU`o3~2;GXBamYj<(<^4eid*>f6I)n#ORoYrL8LiiMFEEk5S^$=>9J!DTN?ZFYPq&{ z4Dj=x_?E?V=IUFtQg!`^U3RNX_aoF|Ra!QH%WU*Y-n?9lkmsb+jM) 
ztwXil?C3#hr6&ez+Z7>_hMFL+=%)1S)badgGw(px?-)dpC~AOpDAs z{Pd7qYbw`HU;+8EX_pL%o1rK$3JCLH{BsEFct6b2197Ve_|bEK7$n$}1c)>N8}fJ& zO{4sYz6^yRk|x28e;#Mt9rP@U;*&ib8taE-zNySp4%Ghu!pcX2MVWQDJN93J7zvo9 zdqEO9%8%2J!c0KTK`so0U4*bn9y)+Tszp+biG#62veZ8t`sVoFIkT92N8_btIYMjoI+mH)D0UgLr|8wfjaR>*b+mk%0j@A)_J{m}OerLEl z(;MsP#fP?*R(+LKsIl#nzNX=i*q&~eZDto8d)G39#x%Z3D@D^NdeBWpJJb2eplxwZ z&?!*;#G)K1p|0|kc~F2@1jUm_WV%JBkH~C`%>GO=?F&=Q=+rMkhsm}`_JBNsm$lkd zRt1h}eEb&()ZxHT4OCiM6F=z@Kf-FRUEZSn7U?u7KPk>L_rh_Ww>Xa#;VFqe8~Ioq zLbN(xw#!UBcgmSg`9=E*Kb|`qBBpIpJm}j;^Mj$_id+$CF>*{OV>$*jHW?rSDe0U1_}Dx}0fFWLn9&gNv=?QYUbD``u^45QzK@##IA> literal 0 HcmV?d00001 diff --git a/distributed/__pycache__/kv_events.cpython-312.pyc b/distributed/__pycache__/kv_events.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..501066e206b0caa3ed37c152b163617b04e4ad49 GIT binary patch literal 16650 zcmb7rdvF}bncvL57rWS9U>EP#;6Z{*5+EtQL{Q?JlqgaVC4%w^C*69vGXNIcmu3be zfh=jtrp|!PgkUZgg0U}z&#p36VwIVyRE3vRCAO3DId_$0SAY)dF;!v8N+rr4DNumw zRQ4a2-`BGjbrpw@uzffF=i52%H}qlzcQLV&<2mj& zCvqa6;D+@lKg_dt%diDcOTwD84cn6TVLMOx)`TPJ9Cos_E#XSKhuth~PgEs6!ycA) zB)rM$;cAw4Cb*<;*vHbYgg+S=4zM(zs2Q$d?bHs};^|HVlcC`d%d1M%CF_UlS=y6m zNHz{P@|=Z}>ZPU+(O(L~%@*z@PV`>q#A>N#e2)$-k3}DT{`-Ls9XfQlmDQ+0joRN= zL-gGbe&{ULX%j=k?W|rM>eWkaA6kn!9Sqt4Xk!^vV9+K&o6De`4B7%{tJG>@=wi?| zK-(po3GHUk4nPH|+l2NoXeXduWzY=_+6`#Whn8Z$Hcs;44WH4=VA!Hr4<9+I`J(3) zRgT8gq@<3e#W1hgMd{rM2?^KHXd)3kpP;lOs)+HJO4+gWM2aO|pO9EN>*3TSGJL1r zJbqD1smjrGY9u}ih$o(ktC2){bX1b@T66reD#@v6;z%MLyYO;U8H?%t(&OUfaVsz2 z{Wd3YL~6s9bDU(AY$8uk$u2pX=xGxz^e$S5osvtmPGLy^vk`348n$aytS3jJYHUoi zsEX#0SkDHvS~03dBblg-p-G8|98FyayERuNl8Pp!NJOiOM3QN7B0*_yB=YV=G*Qg4 zk0jDjRr5z8(Nrp}M%8#arJ$585yu45Zjx&eY1Vj3{eW9RlKbu9ZE7;JEfdf5sZn`U zQu{7X?A`bLg}u_YMEv}=%p_(!we4aek=!N{lKA-vRT8&dxERs9+n)7i@Kf4dI-ve$F~4-lU0uIB1$9Ci3H}wG8$X62E`4;tptFfv<4zSY@M_KmG{(}N29SZiB0d+<|9!>5@Ly{ zq6i~tS-_kMr`{A|RH9Fo5>bpsD9(J?CI_ikA&Tk{Y0gMQOvfS-xtTz$%dM2^G@>3P z6xwXpxzBx^eYakQvVp9PO0`qefk?Be(NWF%(wTE>hGUY{7|N7}qmvXPBzuvtKv0Z` 
z?4p$snzcx1cAcmshh!J+EXQG%bg*})`R=55Op{CVFbY4bre#UgyyuDBBV(X{>U4awCnunxY1ZXhfP7 zgtUH2H&QM|JMdSio9}al+WK6>rhIL9!4@twY`V8A->_}Lwyod~&K{kaT(C730>N*+ zR;Uil?wT21ur(CC{@I?HR~Bvc`Z^H0%9GuXoZmayr72{SjVZHfUN&WKNXhg?NtAnm zVo=_SIPCn2LHD81`tfcf7)1m4D>O^%#=CATRCLW)%zfZRPH8M)pB61}IH7Ye-s*fBWap;nWBUED|STJxq{85Pb9(DG;UJuX5^F_*Qr*rGdgup%qZBYQ?Ri; z+0JB)Pp(1r!DkQ+qX;%ltZ$5b)^&O(s>>7|#ow4ju$-2;Y`$houKz&3=0M(iaM5;9 z9|sj(GY%Rjg_S`>rEw555#6E%4BIN&Mr@+}I`jw0Avz>y5#qdV`;b+IEO#NNN_3S$ z+>^GjNAt0fo|!lg{yQeg(U&0PPRB>b)Jqb@LdM8ptf@3D!g-Pkk|JD+t78I*1Hljn zaV1$LASe@=Oj=g^tJvGg)Y?*lBB>J@Wvd{8&k4uQ2qZ)Vc_O620D&{0cd1V} zUfy<0mVibPqUaip1t`tYvo`w#bDmIYLQN+jzG7h4lW3F)GqV$HR@5;mF0)au6_rI~ zLt}BENhH#j6d?vFIFXcOMZmyFip5i-6Ic;rP$P0W3HZo}L|o69E=3>5&{$bd1t66r zB?E+UC>^>)5k=smT2N1-Y_XI$W>P>81qI7UqFQF%q6uTO5KoCxMnaTQF~0N&)mGD) zH1rS}t|%yD>4}7x>Q#l51ObKY=q2LGlA`o8RJtH{$XI7NH5e;uPmCZbP5X0&Wc_U} z%V)U=%b;kI`+y&Wq7#f%&@wEVO_35K47WmP1VL`agOm@ZCqP?YS{5a_NZUm1CzF|kl*DZ5 zDot^EgwY95Qc9I1&|e=Iouc-oY34;K5x*$OY%1Ao5sqXURFqD|V>)(PvrSU}X#ZB> zVw|WF3MJJ|O(f5QRKof+Q^i5Xx<9bU#Hks_S~EA!A{|fFubY}8{4La4p~-wt*C1jB z;|7QYwpL}6(QtNbLKSI`5(SyNq`%k?p7lc^EXL|6#R^ZYj*$^XVVM{^TayoXnfikf zk=ZdAu39e|x~L>^Zh1dN)HhwpJ3y%;6s=>#y9q|oOZY2i5MAe|KF$BNZ}FZ+=C+``*8GT0MJ>C{9`tYP3Z5DF^- zqIC=$iqdtMej*ekJnaxa4y|@@B9U0TP*Z(_>ArMEh_4|sVN+296)ufQV6Sn7%nUKy zzjm`x+vN+$$K-pbPxKAiyTM(3W+3aBA1Vi`1evvd1HsEu@X_R zRhnw4(3U|H2~?0@rikga$B+s;DpJBTSN#Olrf39z#gFJZ_bc1p6&r8&uR0LvsHw!7 zMI)X-VdeiI`gYl}or4_@$!i(5kr}1yxuR{HFs}<(REOxi?=me+mzyj7#AdNda*H0R zO7!CICQSr(wp;Y!@5es?XpQI|2~2Y%yjXj^de|cdB`<^7#1P1(T5J*P@buwXkEb8c z20R0JHsV<$)iV4#ZfdQ@XlcYDHkIiD!D;R?KO8#CgAmKN_u!y{b zGv|(Md*iipL&x7JZG2q|h^8Q;`;t&T!PjIdlZZ|V=O;*J75U`ZO!N}ud$c5A)iDTN zk%ZnDWD2kwWCqs9WL{JN)0>cr2+n|+LWF_&=`ys>2<#QoDwBrYIAAhV^i_z8BIKdc zFPwvw^>sCt8Q8XM^T6)iyZ3&b+C2|J2X<@ZjHMOT%rx;pJg1}>R2gEn`p6P#zv)N*4AsJT`h*(4A zIzNdCOfiwp1URf>(iheX2qB88E+Koq@BAcGn)C6|KB~+XOO@bL&_|^&R76pwj8dAw zK_+TQ{AVD#%pPFgE=W?QY-Bo;u}jHP-C;>izxui`20gOx5_Af=FBz3DusXCZ3d}jF 
zIHk$xWjG12&Ji-%N+@29UXCXxl0}I}-H@0Ef-M}QEKnSj8lpJ82T)Psu@ZtIdZp4u zx+>Mzi;wDicKM(dIQ#Osp<}OoZ7?!)^3~&@@6Y}L;WHNsXjV+x5Hu2p5+6;$QqeUV zmq7uVe;rA{SY@5lXH&CcS~TCBmow(xmgf7yqWE@H5a=hpm?7(M^8}+srIi;sDN~{o>)+`t>)Fw-jg{*U& zOviOWw3f}aIftSRHA$;sK{Xw+})gk(ZWP1z^Zk-q1LrrORLY-|`z z!&E7uV3LHGs-8-|+h;=ILv!f9K+PdWvHeq;jU0^OF3k?JQ<8P9kf@Ytg>XVL#-TGA znf44eAS_2o_x9L{H7f9>>HAi*qBYG453DL(RyCJld1|&uOudX49YJAz*TeD@ccjHSE?+2ihg_ z>lD2~(OHUyC^|^^Ibdht-Bun$ztp2Y1^W!=}|-L_s{)k=Y9M9K)$UX z{A8H< za}S<>IP@Rh`Kxypnoa@kD+oPH!oIw)ui$N&8@l_>op)BOmd+|_xD8f7T_Z!~@-DfW z^RDJaS1a0VYX6>-a28rS@1DMM`d-6A>kigaRA_3WoJTI-P0x%c7uvb#+VyFJu+-3- zZ|E%q8H@v9iuD^DU~)-<#Gu--ZD59 zJY&!uxg}zWG}IVq@b99A=6wSPAn~MhT$a-^lc+NBG5Jp@)0&x3<;uoM)6>LNr0#!> znhNnD*e3qq%`4xz@~E~w*RgA%c6ZLZyWpu=^7Q0AJvq@R=gEnwSMQc%WeZ>$RoTD}Yyq zuLAt5@l9H~xl7z->s#C_fy0{2TAxr+fHRk23=XiAFp(h#WqLx6NlO2T!J+^4 zFYZwItAkSmUyI5l(FTOq$rupe2E|zzj(mg-1uzMG#2?10bOw=5+%o-W*dj4DUYzqjIYULO5g~CdEhP+yqesn=t6|`Tp_j$IjM`u zBY3GKCJoQWzeB##hh@9QJ#J(orHo1Sp9Vs=-M8F-=9#t^8e6BUf8}fW zQRv4_KWO@4%e{$w=dK6p(%#egy{A9wJk8|Xj=ZpAN!Xhg_AUtfa;^KOU;UN8^%0O% z-Kv_~`Oo}MoSeV4B8G*|(@&~6Z_^`h?ajfN!9w74ArM*$bmjw{OM%{epm!kI^KfiH*&fB)=ePP+* zaMuveo}8J?`Fntg#&z3&%Rm3_tbY->k(Zsx=KQ^s=e^}!3UuWIUH3Y3fv&~C*2mt! 
zm($07tB^tQpLQJ%oUn7huv<B`M=FN zO3VL8Ez=}nT(Z<{lo+oorHxc&45c=ZU%|M2ZwT-| zQ0kY_37w;^3r0(z6VoIH?h2g$88cL&cLq)vUpx*l98>9&xJD8O_nHD@Wxf~PiAKu% zn5x3-vkiACVEFWt!3D<>t`#W#@$@zcmn(Mc>)+nLeH;8NxZe^@k<$#EP3a#~lZkHK zlIT;SBa%872WeM{DrwZ$ge4bvc@C<2A zvY}uW$q9gDT12giSt$w~t7$fBU2_(Fa`FX+M0dF{CM+KYq}Oa!A58OJi6x^B*ZoT(M zi@`l>A%DCW-1oT10)4l9bL!ozcdmXM=r7dN-EP0to~zkXWRN-op6~f_{}1{X13MVY z)!8lSxGf!b_ubjI)UqYtvgQ8KpS<$HD+?`q(eu-581bCA)V?#{zVl=6u4Oyt?fX57 zZ!0W_a(~nxO8ka$1o9^K%L2%|g7S0C@(KrAg&T(gyFMuGfLCCMm0a&ei=ySiF^-$% z8h^cmSs>yhJIJxr*ubRw#oKCFP1qff>m|32C3nt_h{wj~YH0Y^IyrD`1 zv#E}c3+vF}6|LC_1M1!`t5e>a-Cx-oS9xz#I<{n?;zdtW;FtP+<0tETO$NY`XzaqM zjJu6E)G40rj74#c1dJbtFt}YoCl<^XS3KAmWa|1|3*6g^45xbzv9`Zpjn?U|HrAT|% zY`ET#z}-Yek}t+%lHuTUAHN)v*x8P58n`9nU9;jkqGo4D7n=PzGa_s$_zdL~Ax#rf zmzpkm?0j@;6G`IAU>x^LR5?RsqEyBa&5*GG-(6I>fctzYN&XgPzK;n060#{YUV@oI zrysbRCc!(IiB94wCF+oykhugE;sH$d7Ojknc%SO7Ao$FwD|!LHKtbh)%-s02uJv~0 zR%EGeYrbylLLK?gsvet~dCt`X{rs`7_Ga=s$vJhYV|%`1`%=fAe8--Jj^`ilSm=0x z3hS!+y<%mNxItZ{Gj#tzSBG;WLYY*GbGW zYoa?J=$=1&uXlcMF|ZSr4BKW)KCopm(6_vU!?2P!DvkN?I$rp4wTf%({G8)!4n7Js zFNJ#Yp`P4^frp2eLI*wy9r)d{6*<3K4RE2R+xu_rN5SeQEzk&Xb<^)wo4HWi=bW`# z_%u+z6cF+OA=i26r=5#|W2S9JrqJ(}ohb8L=HdFeEpWoY{k*RI#7^$#dyaIzRL%Xu z;Xbk5`U`*ki7nP&Y_U^%XYi#e>tDNhimTl(d#rz5zvX3z^_LDi(rZj3^5inp=-+|} zo@E-D(iw%BZ z>&a13V{nvukadof@z5JD*hoJx@s<+LY@wnnO0<__g_`f2+H-P5h#IHw#sM;%ujKN8 z%8tWkFq-1AeCbTNfABL}z^Uewsf#cX#B~T5HzxM0yQYa7hdnya^vP1}Vu`6~RvhXu zHYxupl_$TKRz*koI*(%fPW~Zf{Uv2Nj}IP;oIU=QaxIDLVHvye!`XWU8|Q|69x8n?du+e?(b0TTJGabM?a8 zVDbmz0}Fw^Y5OCOZ+hap19LUsIaqXt7%p3g1>e)zToeGsR5;)aUg1;P}e?hoj?2Iw}0^V zgU$!%7B=jEcs3V0{Dsw0Q@iZpnl>zl%)^52ysLZutq0ZzL;vROfA#iHcjkIt`q=fd zuD+LBrL^^EZCCz1BHG%rJZWxm7mXxSwQs$!rT5Qa#BF%Z+|2B}fg=B1JpinsQ<%5R%= z(JhlM(*|bO`@6zxsy=<+QE%?)zLt#{V>6Mw!R}hF>-w4w8y8arHciE8!-bNkn>LZ^ zLlUU_3a&N8FAKW1B8g@Zro(9teQ=r;2cr0tAePbifx*v+Vy3nme8rf#0U52kkh_D}#J@y^YuuAAF3^~3>3dM~QOk3M zV9WGtk3GTJ%Xv>np}u`~cuvW6@6L7Y$p!c3yn73t!0gL#CH%_YFn4tR`8%&Y?hOC9 
z{Riz2tP7pHW_^X$p83dpD%ZMu*8QlpW8QVI{b6T5ygwIyF(({YXgvrerllv>yD#6e zZ`M_4?w)IzKRiD+Kb&jWo(pVetv~YN2=&y0?{v;}nsEU5d%WeuHy97V82oJhPkBxz zPN#7)3}Y@_SMfI8X=KS{Oz0m(5TlkKP(<8Z?xTpV6s*5+Nq!cYbWWKVujU1l#{Fph zizu098t2^l*>OxtBqEV8FHZu>Ts4PuzxXMtO%dtF%HJdU@_pRWth2k8`?-#;oGmmf zE!a8>{+gS~ndIEzxruv2)5!(jbJt#4uHu?{3mZ2VTDuCt=0f|1LS4g?nqGU`OugzYwa-hk+du30zU08XAud|=z(Q8NTvZdYrDqpCr zo3q{8yliDJ&+XO|kKNvBb*A}qJ3;9xY3(X2tqWS3i5UK7Fl_M^60W7Yq{i1>-?z{w z01|!8Ws%(a?HxVF#Xt6yh~yD%_&~s3UV-A=IXD#?dW z3*0E@xo$T!yJ7#NGhD6J9VffQJnKDy@8EDD3m>JbV4PHy}%+^nKQ(3>^!` z78TO3_r^#*8NrlSR0{Hn+(AZO*%R`}Uy?C6jfLD2Eq`IU8sJ7EKX_RUBbj9%!Qp_~ zIymJqEg_+ID%6YX_~iVg+l$myT(%d(7Wr>bQ*)h?CYk98&SZ@x6gCL?Z;`d;WJjwt zs+R6tOl_(-Kk&2ajg(5hMJ#3qa6%LD_-A+Ce(~0eI6Qe&+dTKj_d@p^x&9-$mZJ-` z$8z3dh4!v#>&@z!>RIt4R|}&T%>jOjj|LfUTr;U;88Rh6Knpaf@3r7Y0D$X*><7GD zeJB=ywoVE|RY@=#Gyq`@`p`p3yN6H8I$lwq|BwU%0`osrDp95E8tv}-b34fM@@5J=$z)w+ax2cUP{q5*$6&AWFo%!5#m(Sxzq(5 zJVGg~U$^A%$oo6y_k83Je;VAd5bVu)dq1sjoqO}oQq!Kt{^pzCnEA%s zKU(nPUQWRkyy>6u=Nh*!x^}=H;6B9{n%nO7-RZk$%QyE;pLzlZdQ0AaCf9!`=Rfn) zQH_uzF7{l+^o7-3|B3cd6GZMc+cC zS)=D;GI0;~nFY}lzWQaVx1wLA^Or&TsW8o&<{_Pu&6lL;RP4q{xU8nr;GQJNWZM0> z$Tvw_N&oiupA+~zf$Qjl-hQ2;`^|58*>r0jmZ#r(lWDVSEevvT2P()C71#IP1TGLL z`lp?2#o5We<1E_{y7tmZwX0q|e(dDAS0niFm3`mIK6+$d8nQ3<^z~<(k`$r4z_G^w+hl}4_uz9{c_}(B+!11kV!Rh_N4d80Mli$5Q z2Ui5X@Bj`Y(HFc{wd!#24YONUIXqTtEl9tx%HgqE#q;jj>?()Hs*C3j@v{T;a)@7b vwelNh&#rQKtR8Uk{<-#54v*E)*RA}k{G4xo~&JY}FVrVmOM7>PbbmEMaH<`GLO{LpCoE>JOeQ*3`b`~<_+1M{{ zvcLaVM^`rrAayv;`!+?Q`m3t%{D0T~{{EfIW#@1OO!oq8iSV6+<{yJr7;DqqqYGX3!8(6 zQTu>>)G^>-&z7Kb)HUE@;oM-}sC&T8!q%W?G=CtUg>Aut(ZYek(V~GO_G}LpkCqIS zuy9VWbfA=#TPFXOk5&xk7`V~Ofhrcm5iA?69;hCz8K_~;&fuz1?|@fMT{~JgP{+cq zVEt&rKm!Zs1sg}32AWvd9c&(L8E9c)Pq1~gZJ>>X^Mk8L+XvcNxFEP@v}2%Sv~!@7 zJ?8}1j#dnG$>DXQ-2>|lT)>sX?dSNyuW)?Pn>iAE-;}{Tupx&FIoXX|6p`3EO;>!X(fvx!494M6D&|AFGO-bLRSVJbh{41t` zZNuEac2?33>3v`)U$Kqytqg2N-kTAx3h{O!-fkApc7_WS@zt7~SUr%ghNX-0s}O$= z@71pXU(528LJq{yLh%&%YZ z>jr%NCI)2I#Y<^71`cAB>W`)>e$!XX1Bd>94S~&oz+tQ~oyvM6wBxCR-xBBz^u5*d 
zrcnZ`fn#sC2KEN_EqS{wus`t3lDFFfhXaQ)-(rp1fi(gS_7oU&aCw70KIPMWZ9J?j|D)N}%1-v=WhjG8ae~FJUC{3V77Xh0u^XCIK zfhM-fT|jHgFd`4Kozuc!M4U;yA*fCwrztr+h45>dbX1m!e+_S=lR2$l@&9um5a$mc zKH}wv!x3Tl^h6}Ud&3cbB+y|$5b;L*=K^7G=zKu%M$QJjgQ2mA5DI!jL+nwTCNOq> zSO|@c2F4=ZAt5yCeRlFhNEkfZVLyR{k;(DlL4VMDAtam|3WhFthr>N~`x-Ca1--8f zj}1qBn#_Ef%wF+ELSBk6><Xaz@}3bw6XW3yYK4Dn5}@Hxu>dMLcE)>QIC6GLO~a9i zp`mo6`2YYOn#`&NYp(o><*-bt3h0zvG$2_5J#BWHc$uKH9~kmZh9cemFyfJ9uJ`T0lNyM;qYiX%XQE{<_(`49@lG<3U_~GI5Y-e@_-{? zGcicdc8taV!TF~&^ZHv2Df^gzG{8qD#)APoS$om3{?iEOr1EHd1};WM{bT+!0U_lW zn+OKwcZh1^{Sp6Q&>s%dLu44a1cDJi9$ewGej&j7MgyZEVG=nRcaKe`tVhSG2L2#o zn8*E*!L#_1yT?CxE-=QYoX-lO!9X~?pT+@V*!jS?5E#VF;qg?`7dSsW80Za+j!ukW z=tn}r9)FlBTzs5nR}d+)GH|eI5Dc9`)e)~$2!uny^8sJz^oZ~Dr0>-Ue~?;^PnBl^ zne&CE91TRyhImx5oF7I-B0eE-29r7<_y#A!ksvG;Sd0Bo{D_ShW$Q@CYOPGT#R*yn!|qEHB1X;eNqCezw;?9>Ij`m3k*- zLFWeiqy4S9DXY&%ZSwh2b|1}zi6DiYKDD)YG6QylXH%xZ3ErQw9A{}$ZlBLTHWmUX z(L-UM?+*8vs`1NxosrRT8i{KnegR-wb8%wRmd)oj1v-Pnr#r_d0ivkczc~Y!%Z9T zmlNeiw6Q2AE@g}D<6geXaW7#dpEgdLB1#`}QB%Yu#pJjVixNuzDu|Wv`Kak>dw1F# zHH^5@c_^W%G2&LkX5{Blp3=W~TbO=QLP{$X*>g-NXYSo$ItLbQxUuupk$b#-`m zdxMxxwD+}~3rx0q&-;TDfmTEs7Q!7~ET5Dcdx_o6>z6Zb_o5s;3xV;Ve=xwa+$cxy zX_OQRdogXoH_Z>QG7M< zos&0Ty7AHjN86&EE2+Hx(zTZo#Z6*y)6Bl4vpnJSicW7bzvx?IbEoD{e(%LQFWznZ ze%rfkf7~7~*&p*f^N}&fmA7c&TxAJojp(eIop|5b_((}OS3fr~SM|QLW3ikoZCkA1 zoaM8tV~)m{sgcduRH2XU-qPAL7@EMYA*4*S5@pOrnmueb4k2)zi*gbCO4oUVVB-zx z=O{;OUX=Sb$LDC@5_T$I=?#wpCDLE~T!OKrf|G)+bK2O=0g*fw6B z2zaSd-XUNuG`d)=rQxvmqJI=Spfoytv>`|dSeshvBI9+f-ZeYC2=uJ2V*_aANL^AS z0jmg?GW7hE)|*y?uz`MZgpKqzcPwy$suQL-%4qOpXDe*Q*FpS+`*6cpaB^1XgTf{; zw<+1s_`q5@+b3FUlljG0Em!>y%2vYuW(KJ!lH(F%y&8UuM^jPH=1WwbQFceENMDB4falDbzA(7DAv;B^2RKQRatXd>cc z%O_&eG9&D#8;!3pwOU=8?CCVZVSKniF;3!kiAy?)zf&qYyfM?>xqb6J^X&^`vAVr+ z)83fF`_Nf<^`Pin6?fKN+MCREUYdTGS2nwGu5GSr{`0Z&ZSlPAG3)lE!;^3{iH@eZ z^KnN<%+$fUCZikF!Pky5pMLNi=Y*X|fO*E2b9-QPJTi%;`5chibbB+Vojcu9-w=p2 z;V3frBL2d(#bP{BWh+HTqNvKb24oJPGV%Zs4kDZ?S4Kq`Xf>g?>gc#* 
zmY_vbUwDpg>2X4h!#Givtvq>T$$1DgNWP4}@aJ&DqGjQnC7;E3X%ii7^BWhsVX*aD1gpnSj$K{qNF5j^<1uZPXNHhpm3soA{G=n2#(Rc9TA=k3>;0G#SGHc>dUbN% z@;&Dr=iO7^fBD^)A2<%hOovwJmtXI>Agz*T{6G#g^FTO3z{!FnTN|aA!fAZ}ZHn_5 zk9KKwoDatx-7!=53Q+xwM?0GOs~+vb^MIE?@Dg4o$Ynv7t;cddu=SWnq?;7654TI) ziet1wEbe}~F}g2h*qt)$N#*YLVdV-2z>bguHf5_EPkBPZ@ELl|R!3U#S7{!6{vld4 zNBxth16WDftV=nuhK^&Cm7YwehyCG{6CWpnc*hE|(JCv^vUy+&VIqbjKL6n0#OR=w z98?L|OIU%hhvck8xblW`fQ~SPSUPJR4|JshDE0E+&TT^Kru6#88}(yj-!bWzfiLFY zF)OPoLu(*g$i8?BpQ{kdBamZfUunc?Y|9f{oLy(@r>%3uq10!@sf5zM$o*5ah)CwB zF=~t2qmHQaO_>EW4GDDAHIkPuMG2kZqE@Njeh&zzZqG4P^P1sbTE4+eyCV5Y$`RtR zWL;4gX0m;j^Tb`N>w+2tJt~7Hf4IeQ~B*`GWk`? z(8|cEpZ-rO-g{sSMD;MA8B3Ext0QF|n-~oQg%?q03D?OT3eqedOWBEyIeca!G!d4O zL}^}vXE~ZOogWTdNLhv^#s)*dRB_gpDU%M43x1VoZ*>a+Dr1OlXXtj8Zo_mNq1!pS zG2|&op;5Yx(Je%`aokddUZQ`(VPg9*#43H0_ypjp0dU}|4q?3jx=G6js_T!8PN;~z zvM#YDlhD`O_zU-g%m5Bn$64K9oqm1#%U_($zn@zHTJXyD%iCuQXX|1PP>8onZ9?mBLie6qH0R_$V|Z*d73M8+ zCU6eyTw%ra&TE}>wR6Yf1ucn!Zn2;{Ua;YEZqnwtQhmAl>#JsW-?vqR4p_`FxXP2| z6*o(7l+M~>wd)o(y|ex8?F*gv@)wN;S0A!9y6uac(Psb1&e`%3)-usrmbBWg*e=_y z)?Pgpx0WTW^`f;tZf(Ttq}9#-JCoL;q%}WjEqR=8vso_fUM%8l?is@s%VkT<+Huu6 zdwlkE%u|c-T-WVQw>Hgp-Rik-?YOl2q1}_PH;MKpmK$^P1+H80xEInS8rK~G9DWi44Sr< zXl~T1E*)~&3aMj~MzJw89!^=qlcS@=dr#Fpd8o9yKK0NIQ(e-d6(MQBwmtoTg_w@x z{saEP&)|mby@msGc+2H22}iZ)s7^R`iH=>Yd({4x`l%UR#X4To{gru6%OaO&wy<7e zgew@xxuJ1J=d%D!?UapXiEkKeA4aY)ERI!V5Oq*5WmSG$fOTKX!siVG#(*hc<{7go zUU! 
zZw{DvSHK={Fga#kz{$Ju=Yh^IVAMQq6M_p35}P#dt2)JeLGY`9pj~piHhw4qy2dv$D4jl=D@AiV`UL4n=c$@MylhG@V7OeV7QPnntNDa!Nn>gi>$xM#<$&JQg#-02 z>r<*-z0&70EG>KkYUTaJZE8g7T8xDLq7loC=bBLhWyvzxmDGuIL3De+Au$L`#2!wD>1KG znB}9E2<a#D!%5Om_9efY8Jvy=SY}JQ3pKhbnf7_61E&BZsza1%vme0EQ9eB@Yy~HEy zD+M09{R_RPoyfhL--SA@=XWC?!Uwai5yBa=F6f<*Zq+8#>jdAsOiTKfdA}DsKy_fV z{#X6Cvfinc--r5cVWmpB=(o!ReJ=Ar&!nyOq(GaK{}^ygJ0StGo{Cp1jrU%F*cv=6 zCV25nVi%$d#wC$|Ko-gACIzSaG;z5}e4n-SDaFenPj!(!zrz9?yp88XT@?*LPhy?ORU-g|9LVkV_ zya!V6kc1(utW-JE63EaZl0lp*C4PlohSl@|@~kklMWAsImZUhhA>pbdBe0tBTKQoDN9MBoKFsMT$)LdmXR2^2<6BwJr?YUO4uv0(lQ$4 zt)>Nu&iT}OMpVhW9!Wh?oX$%L@i{!Fus3+)dBb!r&rO4jpSDE}&@FfidIk0;)U;H| zEBy>8<1Lh0YAL+P0@ zXte-l3aZSZuAGl3B6dY;l~B~Kq`-HTRS~t1G%7LDzxuV)q*rwxqK;{6)H!W?;}4;e zV7ruakOSk%D*xJ0lFHfu{lxjy9wRnXuw=A!8fFe)zjUR`RzeE0D%3>O@i4MZ`I`Ra zt`TadLprV!dg{DwE989wd4mC#eW`i3tz2)U-khD5!R)v64_;4NTbs>-pzXyfs(cq?MUbI@kO0)faj5{tnsM8 z^P)k3;Z5YI5<|1*kW4V^8abBr0*`!wvSOiB6^fHwYhi!N<{t}5_)>M$_`{YgB_jm0-+q>8JrLVsL7Hjw)ISzE{;$12sFS`M(m7?b=hhXeiJX4CXVn83hluSxj2tV`_^-`ZT*zvc5y$Y z4Pjr(!;}HTXTedC)F4||GnEXMkf|L=tQUty0WM`F1rpjwh09dN*KtebguL@JsTD(2x)6fY7UxcX!fG z5q^s@zd|=!ab;ymri|y)q7%NsU^vyyO4dN4LkCX?cMvpH%tN?Ed0(Zx%Lk#z7=bVZ z3eP?no01xyDj?iKRlM{81q!n2qhnLKXZ>NYbp_#d$|T3vn<`_mfepzi!qDAlkT3v3_76 zcWW9?;9)Hd6Fxx*s`io?X3*Yb1Hd;`o{gS%D4#YMz$Ckdl?M#n0x}(j zrs_AW(AA3dM590i@h^@A69x3mF;3>d!llqSh+Fj zDUPk$ylCXAHa+HYs`6$IgHKytlPGHy%UTm<-C|ky!rpruV`bg(vV$`RlJ3GQlb0uF z_uhBc{^ZEOcN-FQTgAGqcMm_PI}m$?ziRrHJ6TkZMR>%_2NR& zJLPYe#~Zg^H9hduB^%a)f19jryOy_D$(7VD)^MJhn7cJ*ZN)w5_Qu@pF>5>SkII|T z#JptH8iGgp|7q=hQ%mYwmI-9O`UTJ1kP^AL>L z1D6k6IehtWtfC$1_8aUoruVIqDqw~Jj1?oD;rFyzT$jvLxlzNk3Ha7Bsw#Ra6}%DR zMkY9W%lxK{r9&8H>1R8DFo(>A-=o{_;|5Nz0N zS7|;I7xWVMa|`P;cV)u8L3D3OIz3nVFZU;%g-NF?SyrE{-;yk@UUYGujgPrp4@4xn zHoGKoq7_zJSsUbXGBWjL_E>mo z!*X4=VzS?+9^8TAGJ3FAboXZUU=^*mvHDG7*=FiU2j_vXi?dKqT9lqdUh0KKUg}9N z0w9mM5%p6#z_Z>!59Ca3KE@Q_CC)=IbQ<hjcV?d9p1wMrVMpQdq}R&E@uP?l+% zfC>qJMYkX0rfrvv5=qUfSkf%@+iks8)m{EV%qkfyWN6E(UA7)&E!SoGl*2jT^XNvy 
z)KP3_XDYQ3hbpWoJRdH;=m(cKoawk>Tr zt+#B=eF=f(x_#Q9-|@^Gg_f(VD++%OQ+NmMLu`7)Phhir`^pnR#+ors{G=WngmIuW zXb58t2#g52W0WX{=a8o~>5bBE$LI)p8>ZH%t0&Y6FD^S#)NlV9V0%nV6gFVWs}p5y zVp-daD_LBA-FMA5V<$|ImW1u|T|g!m>|#Za=-wK$ZvDpusC@}QcYF+hl0_P<+?GI8 zAUjs!CrcrR1m~AqV$^Sc0>}v@{bqnj6`vs^FOWFQ$_o?-Gs4SIjHXS#9)FK}u;Luz z3^eErm)atDC0%;F3BAoETm70g-a0CuU7>38` z*(|-jE0X5h>BfkvJkGGh$jX)Vyf z$YDxy`A|zELxZd-+&&C8rj|OzZkC@h-*au6kf{G=Sd8Q zF6?d>z(4;SQX9B83N9Ky$6YXFzn|o!_y5v(!O&`yaA@#WJiXuz^r|A2kkFdP_!WyQAAykY)TXDS^%>7W%P0^YQ5KA$&`$Z}4*Df38v!=rVRtHXbR@$@AoQea zB|Ws}N=w!ML77lxQXWb5lI6l?id4XVrf&|)NlqE2NrN?wE(t4iWlS`k4Kd^U4{2Kf zXNdbR+}~Z|9>Mg%(ekjk;`+d~0m;-VUfeRX?_tBHyOwyvPSIKWsJJar+$9!w&FuTg z%2hRg8&(fxKhMdr*%z&x6ZQo4B1{P!oevAEXZb|UMzLmNZ1Xb@3J=7b2V#W>k|h<1 zk`A$?BT=$ZEZI1-Kj|)sx$BbkZMS#c+8M83AM<#TZ)w%_k!vG~l1{OtGhWg)<48J6 zW}EIiyh&$)Y|QoA1Lx`wTm@GKE)OJJHKMC#_Wb*@tLC&K|@&5(bPlpwM1 zu8~v-y`*-;jIvJ$z^Dv(F9f{)!3YeIVIxlaGhugX9-4Hj)DhBS$Gz_GUUcYd9I3mC)IX=7fnZO?fp zB(wE1Wateku4;hB{2Zh+BTQF{Lm9S{mjb3x#!z5A;80!FMCp7qC1}8R0FdB$xIqYw zwW~5~wVh#rFav<26Ow5(`8yC8nT3X_W%$t*7Zl4^S@s(yC=kSshgd%cFZd^6oeGt+ zWgDR82VU%&(9jxYq9qA}*`l=tV-55faVT4@l0nX0gr#-*b%da~3Z5m?c$3)EhbTpk zv|`fI6(#~jUjA4#9fA&^1=US&ni{J7@j(9jlUsPR! 
z0ZC4{n?-kX!o5y(uUps{clX4sJq%m|@zDkeM#@DSF@xDl$ay)KO&g=Y%rpzCVMLpU zoR@;-I6!a=!k+dVB)&o+u)h`JoR&bmkvuy!`=U!Cq6L!X7z_XY;?dlTG2Ir{uc*n*Fz& zGt+a%2kv!qL$^n6jr@KP!9@;uN;0o?rVQMeK!4e!rlNzit+@PDJQjEPTh2F=VE!fJ;H8fcM@z=byV&PhyHLq*Qn%5;3 zb}cmBt&bP(h&gviLzC7y<@&Xm$gpW*&H`kcqVvKpP{phs`Fp&Y@;;$Q&Ua|$FATv| z%6GZ>yGe9|h`wjMcQWoi6tl|a#M=4$YqBx0A^URz3snvqGvg}=G=8VrmOw?us>yOI z)6O1FW9C!D|ti95PVC2IQ{q5i?>ayH+0IPTS2G$^kx++cBUujIT zi&t5L40%JnvoBpUCMLX1ZyK$}sehr;I{ima?Cl}464WHD^*|hfaZO}g22KTf6iUAI zn$cACk3K$H;pI`9UFFVoIM*&RFKc zA{q+WBQeL*Btgm|ZFGvXm>~!MlcF$Ok0NJDivy?Tx#4#jLyJ5fT;#GjJ<)osmfga*BnLvoD6?OAb#6z0YuX zqMw2{K50bhnZF5LX$G_&?}qVdz`W1CqRe(;86J*F8FOH#f|C?c6;$b%x0EfdjP*5o$rR&ATT`Ho6u592^`)#bS35I}=%oTwHOqrEsBa@53~DRq zDZF}uhV^}SL(IB6*3xsg`)=diS7S}P<9Kv8kf#tmMgx(`2}jp`N7qBA`^u5aM`oL# z_Hy&!je~RH8%L8hb%~lzv8MCq<{VeoVjkxynQ?uT%elY;CGT*trXoiL2!!5G8gLP08f>0i08WP3Bu}Lg!N)&Dq3pd?0#tRwWj5bEH6`m`+?`VLMnlkabMQ1lW-(5A& zuD{uHqi1gL#?FO?cRJqgpozbpCO$d7D=M8i_^7Dt>c*MA?6jVu*@l~KH`?Zo-{^!w zV$It%nJLR=4c9i!$X?Z~=l}`0eSm;IO7>Uy@~c`vA}UIzx5+Mpoaw81DN|Qx(*^fT z#>St8jZ})-lNj+c7E*bta9e)2Ub_YS}t146V^`A+L?4_jITu396Sn6v(^4d3gz({ncfU$3S183%G9=PJ@dR3lnzW?x${ zy<>mdez*H=_r1uEzVHL6IMzI{p85!C$(a$Sv4OU|-FEl*+nw(f{kY~YYFNY<9=Qvp zWlFoCsLK?&&|}NgGO{`ip}EAfCL_bNxdN7Sef(vL*G_g-$|U7R_l$)hM>9=zc0pZV z%Cw;BA)}Jgw7x7#MTCL=uDHlT>!>hYS0$t@dYanmEMplh(?a2CTSiOXCvO>av!aYK zz_t~uQfGKgRkgKrmp7B1o|Te%gw}$rOZleAhNq#m1=Jpk-!bU!4WF09?-FxKyZ*65 z78rv`Lt>}#^{L&;YNsW^A>3E#@h7fv5+g}n;TZZdwM(uQ@0BY<>jn^irn#-^aio1_ z(SqdF#OZ+Vsm!%eASG0xi13|1ppczH>XHXbGdR5#3h8A&ZYgsR7T{Pl;VNr3_Q)h*$Ay@a&82Gd5<{bklLefp`gLmFTRBJ8K>mlwMzbZFRD+?4w*$xgA^vCseQ` zkKBopcCn-#{Yk)&DY=f;sXE9*xO^swUnfvwC)lQq;hunzPnixv8v+KX4KnV zd?k1}IJ-Sj*&$YT#4Fb(DmROjo8y%|@w}}wMgV3;GWn4Fj_0X<$Ip;5&7it?px;T` zjqt_CRiO2s=#WaA5{c+iCX(RB#Mj{$WO0sl7E>r5=z2W`-rUD z9gLBbJ|)h@RfDhz&iySK$N7e|@xCI;4;O-iV$vv~s#;vkfLm%h49FwJauA$DWhr4t z0$8SVQ4@91Qepx3gK{WrV&rYgK!*)jqq#8n7h&*ki`qtr$&hs^Fy<|c_hNXX6!CO$ 
zM{q~glzRBO-a-ghVcbZKnoJGIII4-yeaosSK}_4nMiJYqMp6SKb!t!z2+fg3l7%Cd2|4zQxHKr2i;qK=W)bUY=b^c7+kA-1HM zs51_D80@FV!L(&qI&{Jswe)dU4KJ^ekCt%4TcfJ95_Lt|)N=vzb7xeXb-aps@Ksscyl_Y=&+k3Ck? zJ#f|VMk{=J%Kd?-wVz}Zw>qYV^_Vx3pG(a_9-j#dhjbXzSk*79_FjZPA4uMyo2n_Q z-FpVwtI&01_y@;az%1at#S@xz~jIFw`Z6D&sl{wUi|zVrhm7UAU!OQd1)?F&*3r8=Vkcez%2u-lC|ga*wS91JidvU;ilv>2 z((U(4w?C|HxLtj#dj5Q}d(%5TZ};3Ce0yhdLr-$^wvTd+bvtfYfI8MSCu-M;wd)eK z+r-*!nFVZkZ$0v_+m8I}N@gwK@#WVfJPo3!VeZuXp7zC7uCnQI8&?ajrAbef1myQU z^+;3Ou?SCFp30cDV)2lHtFBE{b%|A73k`RBV^v-8s$DZjk`+}mhZqDN)~vo}ziPaC zl7Jid?&sexT>r4T?&ga(UQCu%lN@Zm@q4T9tiBuge(>EO*a9^rSNEYVWi{8M*P_4m zh57vZrEQN^cg;HHdckW*mU)w<)z_!LIsHLxQ%2jW>XP1u+xA=b`R-fpWZl|iW9y=g zHP2jEal-;pHVo7&yT!`xMCI=LmAfA{w%mUC*2@bUlbg1_Gxhe=z52JmnB1}>xosD~ z*R;oQV;|txv^vqaQEc3pXxt?>?n+kG&6~bwzhhtMzT>_-@m>V5XxeQ60GldisnW%D ziJ}&?^@aSJTu5G-86Q1Fg#;Auu10SaEM)q9~C$|94(;@j8R;!0!!?n?+ zDx}fLOSVtc#pKCmkq$mV%9ZeZPy)~@FlXWUw7z9s!g}PSBSnOEipmm!WE2Z5T}(2n z6I;(+7xqt8(bs@u)_}L$(o6{){i1SR72KZD5V;Z604z-*WYG{Rdm`(?<66!e4JLRIpQb!i3_t2ppsIPNI`U#tX=bPwQE_bGBz7fdhVx>)v3BQ8yW!4g z1>9QJ9WlI736=Gn2?HDuf!VNfny8t^nv`!2XQ?@ zG4cjRo_$5lyiJK4wGiRPYEHO9)+IcNdZA4|>xvRZt%Hrrl$kY+m4nup(;xRE>$5V& zBYgKmZX;c~G{DN`=4O=pP&0Dm@u=iYzk2S{p3tt0{DG~yOqgwSCj6jr+BxmGz_mL3 zRVE1H!2#0MpDdzA&>0$k)2Ej zBkS|Se1K1Tnlvm)@g`&ouZ(=sO=w3YhEYBp z9SWbBmWD%fciL#bsnB^~4(GbVmtVX{ta2QhAf?@$S|j z^(11FymwB@b&;dN=YMD-$kXwtndN2oK^nM9YfxXNA`yTj2xJ7>1W=D<6ch4(~Ie3juU3JL6D>$>{*{46{*`Q4y+z0zRfPi33-qBf)S+ z>m)lKk&a2l*prkb+1QcBTMM~H%^bSYzzPrZutiKa3O-9W$CqDx63DV5+Z%`yg6aE_tF)HsM zLMac;$c*zCiCz?%DLHKc61iALkF?JSw1EhJh+E1yH9Rhnkay@YS3VC$A}EU~!Ph7* zqa|qe78P`7r&6r%(8C$jKMr~~Sk;L{F#O*K4 zxF6-$#`0S)?~ZluOmrR*JC6Xhx0Qje%P(F+UOsZ<&1_{FK#gKyul9Eu%eMgIGWfyaIJ}mLZYInvv+IdsV(|+G`YN6zvs<*5DsODa$*!5iOs@dKpf$)bumlR_0Pe1S4<{yCSEUjTBNjK5Zkg{u>VTkaQbc?jFI&t3c6?0Gmw zyWMlEXMXV3&SYcT|1js2w1X1N6x!hQw}?VCz88oUwf%L`iN)<))t1E_oTum)kM|n5 z($5*fM9TbdOWnaz%b!=7aL-W1B8r63YZbp*dd)CNM?Xr`T82&BZMy=t5A^orCFu&QMmvF=Ol+@Zf9X zVuqVK{P(B|WSYnlb%Hu%Cxd|Vo};0TR!ki=X{aMt 
z)ClUxs4ooxoriv=3)GQo+7)#PwWt}fFtaY42IbVHu;A5S$s@{iYFXQ8^US)QNHr0U zHS0qAIbEtoQmx3KF2>I3k3T)C3slnvr6^r_%F5+hv{chr=M$)=kKeA0{Lv>3Py$h@ zJf+^4*G}H}mP3I)gFE3SS=SfSCwgJjDdP#W*j1cICXX&WU56aO2BirnqsNXxiqFu; z42zsJnwO>kKE4TqitDAZ5{c*bPxRP7(PRHakNtm&9!uLcFnSE0!zEQtfl*`3YQew^ z<31Tt#?*x*3XzGm1TPi9D6{3naA{IZIsY4=XB@iyD$eRUpiY8|V9CMQeY?8H#s6Qg_pN!};yJ`BiY#dUv;2wtdme<*)kC$(6N% zh$(JP6p`}SeD^}$gQA^qH4W;BeJ-j?);8a+x>a@cV6wXY=I3vGe*Wm)WAW-;_g0D3 z&t2X3NESV&y>c@pv`o<>m9QFQQgO*rsbg8yP1_CITvM{Cjr?>h6y6FYTh@R~D(_^v zYKjZ)4zUatpm*1QzvtbadxP)pj2(MER@U*L?1i5c);_9UlNNdIMrpE?lS%$+8d$9l z#;Z4>y7jGB9T3oZypQTzBut+I!8zG- zDI}qRx2@H9+~WsV35-xu1vo2&#uUCQJD4ZU@5)9`_|RB0aZ3Z zPD|M`M_62r!*+UDM+X$1a(M=SY`*>3JQ> z679j?;nZf5E`Zyva5({Ei6@z9{*5k09K}bt7rwjtB@=M1LnvbCDaNtHUt_$-OygLJ zGF7|;j>Ja$Zv;eA=U`R=%X)&bIull}X!RznjiR-2ZfJh+ed{_l@g!eDO~@gGxdE2O z$A+#ty_nz>8Dw9}L09Vw1}>p+w?$;SlIt;m^K9@=&pmIZ3s$yU4QZl6D~umKJ^+&f}c*&zqYP%mUE!IRvfE$2$% zbpZ>POg^hA5fC=&zt^+Q14BPoNvp>ULQcwO&F`CXT@Bn=YXl4)>C$5k=^D5{Ax|C* zBcGBfF^}@p2Q~2e>Q7XzPEP5zM7dW;%~l7ZuUF{5U&$N{B4lo)K&`MEc-_j{XH#BG za+92}6HoA1^9}q+<5R^DQ^jQIB!nh2&n8JPs%$My<;t}_lfz>8@+UD0MR&NtmZ+qXfUBiXl zBziX8H6=D55H}xq;5i5-9XcDL8nfccj>|h@j>argS4jo_fwLC}LtM?%Odc31s(kTw z(b@jM*}=XlM!I!PGkxznR{@)FN`App=Ft!z2xd6`ROitNG;i5?M9fQk&%X4$(500s zfE1`bm%}`CxccJ&82URJXJ53P4&}h<7n>q7psk8uPG$Ap56B76 z(Cq*L6`%l66|f#6ur58_M#;lXfZM?Um*MHFlgWG}I$^z7xIXEro}G?$kXm4xSSW%9 zV63n+=IoU4QW|i|MYW*k$ER>x7B(Mi@TIkez=?)7cm`B*G3-tcj|CL;fB}WUgKD9I z;|PIc>0%%!CS`~)iT1FcY5gW?q0ucCb}t->^*$djd?Dt1;ZdPC=Jc}mx905YNAWbM z`%hq;Q(&J|NdqIv<#+w~B?_H>DGqyp>Mey^b7Tk6Ch4~yIrjJW-!bfK-6{~@Tp%SV zfjH!;T%V5*4f=d3BhX>NOYaBhM(eylvH*d0r<4&ZsNkZI>2xR*6#kUH{D^LUNiVGs z%tI6$nG|C5?mpcf(Cx43mZIB#q#I#5l1J+QLLr9NGCAyFdOSq8({%IG?eFMzl5T%b zx8ro<0D4Ksm+Ad7t8pgM_0k6>>uslygCdhOl#%BQF%}4`OPP*CC@ZP_GHwDb?5Uiw zv6SUVh@XJ3eOf|N=E%f2!Zhv$>TRKx@~fj;J>9k`H!3^)V>IhGxDRu3%~l*=@5uY| z0j#rEjhDB?O+}!MzkC$Bl=($pJ_@%8aTCORm2p#LvY;kzs(Do2xzHe%Z;YEZCW}gE z8?GJ1OT_M6pR8^C(EY5zY=2Z-KHD%GzR`Nkw`ispq&8W$n9G7zuAzCp^w#!88+(NQ 
zYSW^F1)W?=`=W~l^AMl1e$4V)9LnMP49^%I8=c6jhFiPgvC)B01!u|oFek@c4SdX! zw@4w-*wVWi^<9nn4u=TI@!HjQtJQa_<##6YHu)VwSR5$7CGUzp%&9cD#JcyrS0r{H z`5A`?N>sE+uNQkt%p2#5i9%oG5L{?{r~U2r#}r&#Ycy}2-}}A8cMike*1YxZ$?w1P z?n{p;_@O<|yj4ks;6g8*Ex+|%@sFy0Q1zICi(aF7mm!^cm*H-ha@l3LH}RwB52BRK zE(7|as5n{I^r6jae#Y>~Q+(Zd%{d#H+k5-)t;6D~F45DqXr@o-mAw4LTo#0_Skcv9 z9K5k!bgz2MQhr#JXWj`e$4j?fqKfRiYh*w7PX6enAE3r~vslOV8;%)1G@8wa4N1us zcu{(2ZI=T*JMMntnxsbl#XDJlO5|mbi8CRSJG6e zpPjfFy%C)sMCZgR;KzO4!U?f>%c7Yg$eqN3=%lJyUUb(j+SoHrLu;Iih-Ir6SzZ8> zJ#S|4>IPCopR0%7LM*RQv^C8aiMBP5T`b8SBUia9xn+BD!xmP7BK-=mvWr;RSJz)2 zSj}Aqm0MN6NO7n= z?wJWvs+c`_^Q9Xv;lMf3*7n#z->SH+Gz~1~T9^YY(yv^HkIV>5xShrK{Ly2F`%-S- zf&Tuz$9#Ry$}b$g{-b?+ao*+0{u6`P3)G9YUxu-e09MOMyF@;q42pCZmWpZ;NbGX@ zr)ngPGMN)3`GMEi!$3vqb4&N3l$%bhWGN;2n*dH9*G32kiOMf=4;}e2M|I4!Gq!4H z9Ct@G-ds8&VOnYp5ctIPZXO;R12>N!msR3cXuyC@1InlYEDIk$p3xFIHM)~pLI?}B zq$1{MikY5^wLBNc-O)sEOp@}yq@VS5L=?V_TL#;Th;!B>4G5q|kgaq@IrzT! z$OIU1z!0mF#T8fgCkv~Sb*)KnW3sT~BZtLl!TDll3+t&9QZ2qk1^pIogIZk>((qrV zebhQK;E$TXiIvl^(bj6wZ5Jzm8Yi4aEi)of(}eVQszsP`L`!iA^$4cQ*)f=mKvPK_ zc~}OJ-!5u3MMSGT)aoL5q?g*fDcRJE@JFs(YId&N?3B@W;@LqYP_IusN;@P*t(R5K ztI%Nz-&Jq64WY(aSYOIXgEcZPx9KcmoTN69HVX~VyqTRb)8Sav;kfDW%uYOCI>@?j z>T9pCgiP~PB4D%N^NJT|p+d6~s+*vHgMOS=#F)=sDWP2AE9^uDoSTA}h&KuaP3VaZ zf!g$*U)#~OuEYCm5FF@$#3k3(mAK*(Z&@FlC1`NLzn4dVxujvhml zA?j#lO?Af{l`+%FSoO&`?vBcb4)-O=-7SXfv(Mtz_ZhXTiT6jKWGid&t~6f9b=twk zE0$HZCJO>$5M7@+Ui)HIeQ{HtcD$ngdq1Q4w*+7)D&UO2r)U8QGO~vlYe7J1!31Kw zwxk97VpaR%rhVEL=#TP`yCtt3A07_`!4CKs!zr{}BgU0RTtJLVOB&H1tLl%N`n8Ri zy3A(3Qi)Hna42Q!E*ub*EVF8)lYQD%gW8(irL{DpU z3W9LfOxqkToEMh$JeiYyF4(f?h2?0^3-fO`%EuhNl5sc|v#Ic>S54`tblJ>9Hf0Fo zEUwUmY{tSCSu$L()3y&8s&x1l8G@5MuJ>f$ZY)wb2ahD8bZ!nt8P0GYCu0l5hS(09 z-~?f7VSQ4SKuS$*B209tw znM_tX^oQhHuk5F4^-5=z^-9~sD~k6X*~SKMQDNrijO8{4u~;~aLzld48D_>CYGveO z&pN!l(y?d3NoD}T>{@6W%E$@avw(k8YIV9jau(DsGJ52PTHYXIY!iL5Lp+ zvcV+-lvRdh9YU1^uMz{pkgR+ry+J>Y^g0Wmz=q|tfAAb_kN^(~q$Xt6MH0t@{%XXO 
z?1zUXGAxxim!wQA9kR#JBgWFCB@ZlFii|Vzq;VFe39azbSuDDJgEj#_0_)OCw>-FG*}>bm1~d*XS$m-auj6@R@ZVFLrAaqi6fwvK=0{G+_Y+QZ`7 z!|y%+{@RntyyCC9a3EaQDMP~65wmq9)}D;5J^4%NK|T>~w{efl@B?$j9)s}*tIWGM zn19ebgyi5w3E?F16VnY{N7|~C`pr9jWe(Kf|!yxNo5F}|s)}{92 zvd~ELX>>;mEINqFeiMV0E^Pm_D6nh9D+e?`BlVgGIjGbj{QH`+X#L2#^hYKvQHZmY ztKQ>d-2^)82V2;;QJ$6G~dGO(}(yS~7QnHs$F{2Hp;T zt}EA@#6(#a{JFwY0pM?j4n1Z9T~Hb@@=mgE7fQj~aj}pm(jk3?1vE+KvMyLokPq{9 z%IizF(zXR@+w3RQZ;O&T{j0Q3fv)lkOO~xlENJk9KciTc_{;U@#0qOk>C>n1*Nl}% zND9oZbbCj3$*(j&N?rXmnh`;(Xx7E&MTif^uJp(l+o;KXkMceJ`^jNR{mDj;TY)Z= zpY+#XthCveyjFeni`ErZvRAQU>eLzD6;ulj#L!s@6_~-pgX>D3?BSsnyM#-V(y57+ zHweEK@B#r5E39)|Q72o^^QKU3cy(}^@NGH|p?dgtaKpnxUWB-F?25W02b8b^iIIcK zoAfWvit+IIe8F3V%5LUwn&i^GLr;?^2lo6Z>w*+APUlU#VA)^PKXpK2UXm&6_C{ck zuwbQMb`%RcBAcF;*^ak2f~f`c3Tkr*GST!ML#mVrKpBhm(aD!|R`$^F8D=D$%EzJQ zOe+%yzcVCFnA*LYADLh{pTuKk?ku&HS)q8s+54WII<`y{?MoxDT%4tf)uXkx*p92#s#GK#*8*TU;qznKQWFM+R}@9FHETH61n=$Iujkc=b$)VgMoeNUAVhCso`7mAi4agrp3^Odb&?eG_liB@oJ# zs$$^C=&1Bm6mFx$w}66$FJOma;svwgtA}4doU|TKS__g^C%dmoTHO#am@Pl|aOU-Y z<7$Yznl9~2=DK3J<+DSHn)PDM`b5oEv1aRC|NWYs$qH|xVy#%QcE+7_RV7?4qN`;l zC+V(9xVuC+Ry!!Vcibxy-Fs(D$>N4Y@fL`gXYA~Io9J$vF+Hr?e)sgfhIrk6(Yor9 z724Gm>~!xs(OD;1s^2x-hubqc@mqR_^T&6_dQaVVo_drn9MVnCs-#B#=yerPjPPp7nI!mA$m~_^EWHa0B#Fo!bI7nwP;aDv?RzGm8 zp}3CAaH0FW5*I1zlg>iG52qV=I8QU3B+9|HIXO9RlAWBdgEQQ;2f7WSd&9z+2kspY zods9=FZU-s&n7*kADNA=Jj8=NMPY+j*pMjPC>CyfP`Fw0Am`a4JB{0nc9&Mq?AIK; z&@NirlSP%uvd&~_Z8EtZYwIZWSxH#w)kSi+995J7>%f^K0iC=PMSTkLPcn zu_PUUO0DRqh4TT?(K3%yjM~*Pk*L@tR&2U_=!1&J?+zuJHi}If;}x4`4#_}{74QGc z%J;1&KXA+b_`bV0(Xc^m*pO)G6C3*C4g2o9_sO1w56I33$RCynO&gkz29ShHZjF-pn0u0-87u?~l)tb**&+c@V-H0~E0_di(m3?N&X_cyDa zxw`)&Be#8zA-3n_gKekcp6A7FrxsqlS{j3U;(TNp&tDf~?~|*Zfsu@>>_ZL-(Iu-k z!*#+#&#JhmHsNU(J?-c&C|4vaSHoZhJ-;gN^3jh9xK)jpk1p17*6v^Y4DKFYF#H0& zUzrysLhmm(7W8{8|K95EcR2rjsTtwFa?m|hA)A0@Ne89%cR6k_`&uW7c4$Y4&|<5T zLxw)3O6_mUruJpJ1=Un9Jq39Mn+i+*vC?t%(xZ&*OGyukM*XCx60N7$t)_+4_M#!& z1$uJjJRIt0HK4#@|q>NutGHF9Ehon`hRghXKQn 
zGnQiM(svB6l&j|;o;SlAGx#qGH4Tq$RZ4EJXP=>E)APiJ95{sv8L3s$rhma)m+J># zZg%p-CS%~>kT4IzE70e_!{I@An*0!`)Opmd1MQ}Cc}fU0X)bSl%cfAN{=5k280lgy zvnxFhYR0wCKr|!2r z%2@Rmr+Eyl-T6;!x8aHHw)~p5yFkW@M*c8j5ZV@i*Vk&WqZN+>;Fbib-Apzc={_Ih96ly@f>T9{-CAO8BDHq=_; zcL1lh6Fz=GA&B*5;c)fEqU{#W;4s*+NVtbRQp)A`{T?q4uQ(0yOlXK~DPb@f29e2x zY?6;Qw$rN_QDzEx2zO>_E6J6xhvmyqQc1G?Czm2qZ^+P(<8D!9(1`HMr=dt7MjaiU z7$g5}bR4amK!(iWKm=an!qDBh5P-4|Jl?^UF%*H{He|I~YCo(fg&j$)B-JJf-0dvcdcC-;rRn$z7)Y*1To4EdLD&z0 z^THZI(pZ*4GJ5r{R}3&&X<0r6A!daTZJk%iQ>s#Sr1a2OIkR*;>iHMXtc7>+u9w>p zT8B_K3#~_J0}E|LXw%DTH$HtO(wS4Xl;K8}VHZN(_}hR#Ipcox4nU2Aj%kp>cOe9l z@UjQFFTryV+#C#MS!zD&KL;TvNxJ>~2=uuCRN_-eHp%4l4!z&kudLxrMBE+W@?5p7KuMGQ#U$ov>}7$#wE>c%9Xt7#h1R_G$hOF2yc@CjRscKSO;&DsXCSL{`)z3IiQ|qKLx2%$k z4lC(IDkm}yDe1Q;KjIKg^*p5lmriCWBUH7f)@P17b$ZDBEO6CylKzxX`k#Vl0Di^z zxbR=8CT+{8T81#bPn{x|Gfv@{hJ_}x4so9L@R>2J0#FF-dsgbo z#&Em$SxxwaCcIY-FZIvyyXYk0pHu4`#1@iQB#F244ZIeC3k3Aw zBnj*24by_0q#P_0W3Y_kQI05l7>myLtyG?Orz^E`$AkW zUkENKnIrvHr!Y_1-=W*Tpc`W)F(;3VpGh^5Ds~H@l#Tohv60I7L@C2X<|&Tmu5w9i zBBp{W9U+iKC<=d!j8*}u2%nI6MBA2fJz}OcG4MW0DwTtQHi{)1<0YFf9en8UC>)YT z(a{)pG+*k2(EWbisqYjg%G<>9wnX_x*zx?UFUB`Llh|}p+;lQte(KWxN9E8_F8Pn9 z?o4hI98DyuyT$76MD;$gdLI}=aNVC@!fZce-EeGQThY|(H#DZh7<0s+;Cs!!%;j*QbF;(YF zeN-B2SbwaHDZ*`iWX(@lD@AMN?C#lD5=bz7y z7pzVcbc+Su3wvUlKTd<6rpSGa+j~HY0q17QBD7Zr4=lM3X|>!-CR&v+2)wmiF6CKiXE>?>7C#=F7l2N@g_>43xZyU6iT~f54+Q3yd=MQ#PZ~}m0ij^U9}7ogxUs{w~LOvikgrY2;j`T zCJn8YsrR1=1fg6@J2x@MrBzRN7l!Bu`0CjJ{G0%tAmLOxf$R;N5D!^1k0IY_tc`Fw zWp7QtCDt6aRAdKN!{mLKsc12Q6GIiw`-2lCa$@B!Qz=TVsfdJQ^iKiH8VjR73WB@5 zL<&lP6A?n7Cm@EQ18J~rQpJEchax>*+Vk|ukF+qNrFHvMi}!rE1I|LgU*vmMAXv6f zy`TW%(^n2kw&_}IpK8mfgHlY^9wrGs;pHGLWBMbE>hy-5`hQ!y65uwj^Sq1uA^`#* z0r3P$kOU+WBqdUobx`IZJC=1tViG%JhoJ~smdrz0P*KE?5#{mJWUY>^u0WeGOvTJpnYJ_S3?$m4)J~7@f4hra!jNbufyB3Oci+A5 z_}~A&|Nkp;xT82%(zP&K1JmJ;(=Gyo?$UyclgB)3b-G|p1#+#dR?D3!xBU9DTt&%W zq4oGD$ONL&e$HL8?s~fHj4V1gF59Y8uKJWKl7dD9oJr6r2E^p-?wii$q_b6QKX~1F zFvFS57MkX^EqmIMo~@!ACk*`kkiu>fz0Dc3!QQ%RL=e$=khIlSzi?p5-z52)ruL_t 
zL9u+>btl3rNmM~{U36C=eyBACS#>XjJ`MIQEWYI2CPS1-8NnKu98GR)g8L;rdfeH$ zVjWkPLgf6w!&D|K@PCzQh~p3|>(7TpS`&*VSN21!tbg5Dwrs4Dv8;c+SZ8=lM}~gz zs<}h5_6B)Bw01YljQ(bI^s{a%5BOf8KaSaJbOf`=+8y-J@uNaNbVVg#iX6Ksrqzk& zjQ*kF(S(dZmN&iq$42nr_<&9o`{3}PaE#8}M+Q&dGg&rU?H}EPREIX>FBIt+96Y(_ zP!!uPOl+hZ3hK~FC;MG%#gc%=bId-7nD9WBat2(Ix|9ws$z;=+^EwS25{+l7oNUgivuyF+EZH_Mv@hE3U$#1ahevPP z?guSp3$Xq?p!Nm2(msiG<$8GZT{+G(b00Qz)a>yHJtHx*@m<|d4Fd5%i-0_(>(`Q; zDO6rebQ463G#jwC$sPjZg2s?u@;Zm=DjL6-_Z#leA5h z*;+F^X)x8|Z}mwyiUa|C@=wSS*Nhwbxv}g}RabC;F1db#9m;Uh`0c#JnbccbcW*oCz-*9Q$t`xfziZv)-)8bMiZ)D4$QhJ>w}Dqpn*D2 zr1;=5VGunR#=21q@H>dz)h}S(P*!p!Cync;XZR_CE}_UnIeN&VSX}jW9vB6@`}jyd zIBBdP*q1f8Tk zQ{KwIY>IGyZf@o0EivQwjIDhB@wO46^GW9AJ5a=!G_+^646;BnhuHKLrJ*Knq@pOmE5Bd%8`P;5d{Ce ziB@$@#45;Rt)$hJ{SYtf&qqqGiD%G=Rm4O2<=?vTj2gL$c=se-*W_o?bl@SL<74M% zCX#nNa)hL*u%s~oTPA@tG751~B6AeYaP)c@q>M!-WxDCpF?xn^O|!IK?&mB~qUUU& zgzJc3NT0PIMV;c_w{bQnC_ln#oi9i^3a7f?2Z$)oBiFLcMd+{dq#$mp=PsevL_F7h zkKXYkL+aEA}r{-6E>X#F$)`^>yz*J*uoF2W-IjanxWZ#61 zDQ#O_Pv5DFoNA=O+CgS8SE(OHH)8vAC|Y3=kv7p6>O#aR?$_1=f`k>$6fW9=se&Tl z4ZS5(k1ShVOV*%d4K7(5Bx}Q>HM(qrnVAg|abue$Tl1T?RwBhWvaHjt#8aon;-jZW zzyv=tfE=IsE>1cKUtAl5Pc%Rk+-ES5=oHo*yg=*QrOOdxgndwmq7 znJrg7873`F(zh{W9kxvZa}KKr`DBxf6UK8^$XOY+Sau3abbiiH4UwoC%OdjgA&>GQ z4b-@Dr>NyLv{4vZ%1|RsBBeSfTinKt-%)DJbB_T-uM+xEMeCaIO#~X>!On?eeciD> zm>e7H1D!v1rW+>Z1|v~y7ioW~0>!}$3mrNmpDTCl+8k?-7Nrfx2GX`uBTs@JhuSh4 z{;Y?kjpztzo7yc%(h#SQu#3EaJcU1llXkM4`UWT~S$jI`-an>yp4_se3p>XYfx0~q z_Z0*>wGrmvKtJG{@g7blCtDPmG*82@UU5#gQ-fVG4vy-tee!MtB$D z+i64)=oplv$@1JI=N@@>|4mcrqN#k@RJ&-Zd)w-J$L3wZOvdJ@WrW##Ztt`EZkU=^ zyfsVSnBjIEd4^^5KZv|?WP)Xb-rTy>JG?oC$% zhL0!uo14xJ&<~EF$|w{Ay>HQp0X_O4L4EJF=2%x1_j*-jH)ni(HxCzQfdHq=1_;@y z(;9`CG)mZr`v5-uh?~%DfmP&+@EzUamtD?~g7V9U0o3?7>%`42S88gQHc8V)&W~j6 zn8b`;`w-0=?3d53&rzHy!SJ}sW=(^_THG|~udn6!_| z`N486mlu_2)GP<-?7wEmuR4x_*v!t0C$hL3Et9wr8iS04OwyG?2soTyQK+I1|f`qlD~Isr(yvEu;Y1*A2f*NGf zaJq#RW{rbdWHC=ZVF&U|`C^k+AWHF1YlTnp zy-Oj(OovQ@S!=tGrY963&0T7Vw5zsmqE9R3q3@}&0M$|n)$6o$RvJ-DT|d*HUwF`} 
zc|vJr_`ODzcfu-oP|Hp&f7PeuklQlr)H%FPox9Z>v@7~A&9BPn#i%z=I!AWtW}_Z0 ztm>OEXZwOP->lZdJ%P<1Y)a)HSLO+qu2&X74z*w$g!82HJQUc%CfY-4KH61n5#Xj( z$wjPzkX$U3>cy(dL~c2;dLxDfI@B$-D%V zOOwUmCrMP1WWto=*Z#3FqVQm4SVnkJS}Ylv`huAa2+j?MA~=j-T0jOUE(KN|$ibrW zSO2WTgPlu-9#3hcayY8xyEs%jmF|$@eIn1Z~BJ`;Q$T z05u3zD#r#7^!~>bo<$95i;O9h?ID;jjR;|K2xB%L9fo0F7-eNO1@AfrVB+X0VPLQi z5F-eb3V3h~wC12NfZ%I>Rk&Lv^5&ANl!EQykf8}I>_pn~G^Fm1B@#5&RLym^okRV@ zN8|k+<8^r&Ahwf9yCyo|k%K))9O!Ug3tOrr=pf?eCF z+ky3phGip2B$|j>)}`x(WaHbURJ@I6Y5=~t$Tr3;P< zqy6y#h++1P9z7utotd`p`O3f`%r1|Pu>C^X3_B~M5bqoqA_!Oql^U=yh`v58?4$4w zY+*(deRP^9{59UfY|{#r_E+R+)%ary>&w$_>4G&~2}-G-8fGRaSlg$0GPZjRM;EaE z4ZR?Q;}gD3g|HJhP`>{W?(b^K37LMh`wRbrDEsJxp8?s=gbN{OR`d0tMQiP{+rQ)v zN$${+n^~kdoOHMR)t;rcJyILo{ZiZhrMAOT+u`f(!#ToUhaqmINyy*VU_k~N$pZei&WBb#dLMcV#!0Z z##GC$S>q+^MXMNWe5rPBI9b{#dYYDf6>~?w)wt+uUe*jJ?U4LCQX9%&4!#(ibIhCP zpOzxquhw9rR=iWTorE$3mi%E@c)EP#)kj}>bYX|+55rtjsugVM4#9Fr@j0cMwkusM?e~~nCJR>{y3D_L-^HGlvWlg$O;XvW`JRQR#j;Jw zvaZ=4e2g8Lh5grkovD^q(cds{0*2tuZmw|Stp|Bv%Tg})g@fo1v9>vAiwmY?sR0X1_Gu zIa~BR)bF)P&UVq%ey5X13qhE$#QVXWEocF$Ao}5Z51wbH?lyhxO3}aUKSzyC~2JUAq-S-_RHU6R@ zaG=2ai;f8V|4?tki+?H$9&9xJONZ;=Hq*5N^MQcvqj0j60E?)?DTd}}2l`4IDp}~7GQt|PCE6H*S_lYwFwJGB zk0%CY%8V#Y`KRQPO;*Q<&U0J@!VlyV#0iv-s1SO+j820xGp1B}5q(BSgD~dzx9Qn! 
z^ov{$I9W82i?3@=;r|eFo)*b3A|B9OrEaqy^O(y^DyMxA^=^=S4Wg?7&$MN)S#mco zx`C()ta(hUVbZ;6{!5a(MYOemhv&Zy?ce@ixqX0Lnf)ImNJ{XT;?5v9R|)3_AvmuE z?VVxcYqdPw{3Ev!Ma&twzI%_{IODwhp5qtr5Djpujl+EUyuO1H(ofA7=7YyoKm|g2-{r}@n3XD#ckL{KIOo%Ah!C@Lm zl)lHhUN2CLtnWRK+U54WPgA%1201U0!+Ig>kJS{ISZDHK4AF=zOMOq|LD>JCdg3&~ zkx4O6PERDI1M|G(i;Ax3O^txlI%CFWw6`z&O1?gJ>FmX`NngW!tK@4GU5&q4b_cPN z(aWH!yr4<)H6?w`3kcG7-PMNm%-;SX>#JfggXmytUC#4*oR>VS4tNk*lX8>% z2yqVb@R;c&NM2b4{tST5q(z;0qLc6gTuqx7_R|bKffKO@?@3#iiFjqOjZXUC`13Bv zMH?8`0Vh86mUws?oElKWDb=&i9ADP4t+Y7R7AZ=aJ_CZu^w)!ARt8AG%_$S zM0P#d=gc|;c%+*H!a)jp08VsLI7I&cBxi}7S1F$&@;yLFIaQ_;$Pj@hhqPPwB-Vd) zKrm4ZnmYtEKVs0ncwh)0elQ_`eFx)Bcrr+eGX)#nx^s9h0?ibV`#N(HlmxQ|#jGo( z3n1${MjWBo@aVwPj3dA%@^q1W`2rFIkkwi`bsF@Jxv`bNDB`dfR=v*-!V8L0C*l|L4j`hjmX_Za~b#F%o#|TlW zy!O*;IWIXfYdhs7oq!9N@;IoJ`U;?N_-HX*wFFx3(;k+#s#kUNWCF^c6eX*;5zbK> zq^(5=lhZ(*=`cA>bo(gz7$>QXZt1W>U<-1He8hzn=tw}ojLtE(j&P8CpCPA@oTKE# z$*Cvj2&H$He9w}@#tB>bnK=-KyLf_v^pQjBoveWBN%ApBB3TiTg$xP*mS$odcTonz zCT1OsTA-*2a!!%MWE#$pua1iH@8p0*4&DFA;c*h+B z&sRdsfhUV!1}DTP_{v*0&RZ^)H(rlDoGjWqRhY2^_{A!rk~4HUeEHO?6R%8wB_lf9By;L_ki)1Vg-Z!@;!{Krn#-(t9WwI{5ho7=e`(cl1+g#UWn;6|8K`gQy#>aPwg}Wqs zN0RGaweWmf#%AW5XHPL>KpeaaK3FB>gCmWM)G9D}6mW@e%Ilnd~zqXW|Ut2a7!u&q@g}LgwRHQK#j-<*$ zx5`?1cgpH|ZsOcT1~_E*isA=0*^L#n(0AeSna49$y0fWwcDi$L9$&^uJ{K1#%@mNY zkgJbo+~gyZcbOvcc{yKTHgPeU@zEU&!jIzTlX1*a4*BHjkWa1-`LycLT~2lAQBHN}QBHN}kz5_}Y1P5qE%3`sC5y(u+opa!lHD3auwQ>B$t&gmm$KP<^rW-JEYD{hFw{uU4J zr-3gMYc|tOS;k>UOUs51Ge+{))X7)s?zmqu7q6Oe&4jp3EvbrX^tW3DZiK_!5ue*~ z@%Tm9p2htNx{(=I`Z_S7M?Tm!jAZQObD$f|#xFWDPP%h(l{Fbk17F5cvg#&Zm=sMvsg!Pr${Ji`%@8x1$zZtU;Q7*3Cwv+5A*1TLDU)a^L3gxN zB)Kp}_K1N7itIst;Z~6gtyI;n$szZCO^zt?y$VEa`0-0;@MH`gzGSYBxc9d?cxhsm zAKt3x9L2MbChe6|mX-35$ZeocW0fvNc6`bVxI+&!) 
z7|FBZD9D(}gO;zX&RCfjD&71UJM%i|dpMcb#T5q7Lv)#j)ySZ}ytFt@p8%DwTwA zW$ECmONQ>WROn7lgnV6mA-^5wAtT?9{Gj>!&9JJ-Z@*n*z?2X9zEY7ZLORrR#puHu zx8LS)RT>k%+bssZ?l!(ngq(VEq8W#g56l^1H<1tAc9rt(+uN=D=39P0|5<)MzRJ-x z(}LBh8;0Z_R}AI6e-80+$#nSP@1?Z- bnOK+)&h8Uyw@H4;9^&T8k@x7rNQD0fIFyU2qdwcD~(eNv$6v|48OXHf|eZkXg%vXFOocLST&t24fh0b{H{9N2uGj?A8z6 zEn|D+V7915WJtzr)s{UETVtLO#Z<+QQmIXYukO#e=iYnnx!*bW+<&gAu_O2-=JW9VW`ur&3#HFe%)Iz1jnD!TkU&W& zLIo%a=V^(Km;xq}H%Ux@fjX09j#vVgVx2W&3DFdaumM}K#vXBy8fTz})G*SPh%4YC zd9zd-sSDJRyhW;yGz1zV?tnYu33w<(Be70wd_?Ip3T&m(2S~8qLINvV#@{JTxbpkR zRHWQ4*hFUBS!pTdjCu)nv1a_e5=G@#aJ)H>5sTow#RR;6hH-0P+@?RnxLaV{X4oN0 zaEZ+$v`{O0A*~aeM(9yfdCztV^|w%7ZTq!}NnX-d z#Y9A9>`&Gy#eW7eq2ie!lVKpmua$BeuT1q3G~;d(#Zhr zFY{W%wS0yu9Nvm+`6RF5Y5+NJ#kG7=RYz%FM(5F3DoKE|lGO#o*o=dL1m2Q^!D`oX zfO&2yt)ucgPu*morM6O2%H4EAJ=W`W-yB9Ba|LNOLxa{Tg}V%#_}>4Y3;lS!dr(m)07k(d`M|EvUn{KmPLgh zm4m`VTBD|hXAZq$VL4?imXHK)EI280*M7iF{D2Ea1#XhVpe`z$8Uu746J@`TlG|Xm zKD&$oLu0QM0bQfjn8r?&O6+7QM^7nQO(ZB!==dIvj%t=`x@oXX6n`hluaB0o8$r-s z!4So9vKLougJh=77?!Jz#$KRPu7bhW%UKd zVwRpY%`(t#WQEFTUFkVqMw3ci3Abj=YWeIW%>sBqJHTpvO_yxdtfjg=Ny9(W46s>8 zWnD)_XoU4tEFe(X0#xQAn39&#c-VV0q+ycQO=-+Ywu*)_vQjrcubQbLg@FugoAudH zF7`L*X)G(p26C_@ZB=VDmX$SW-;{<;Iw~~N8%fPY)cgFe#G;B2uc+j@tLAeDq zK;DIu-8kvMi4P~8IN1k@&#F0N32-)5;s7g}MfZI*MhL1w%@m3!bP%nV#ZhoiMVXJ6 zu6P=Qf!F8M5fMAVn&TQDmt*6)GwqnDDp25(V^3OV|N2QOm>2lYR8?B@3{cK4Bt^XNB;mnPb6Pq6@m?&e}+cI!Kt ztvwm%yT9x$w!%2JnmpT*Wm}f)>61COmw3&uDL->~zBqd4XvWd8;@P=4^VN+nZ={pU zj!etpZ%*Vq=Tn1FU8vo$;JW8pa-=67pU(O}$kd)J?A-OzY--$+8hpv3#+E9IEZ3Le z`ZKlAA0~w@M73>ijU~d#I`m;2&!IrN7NFM`iCVk*$yoMVEINcYgHRck{WlHa+TktIcZ5+M@?%i61E~0l5$So zu?a7*TuEjvFCs}B@M|oyDQ6Tq$zDK7HfbB7A+Q2>{{MmxB1a}6906Rx(_M#G8a1d{ z^dQSR4AIPLOb(3!WCc~(13kh0!>*cU4o3mr*7E`X4kMKY{+<3issFMVq)3gUh>IBH z6gzM#!;$XtY1WaLd>z68Gfjq=RBXKhJ(U6^a|n*xZePCr{cqdfpLZ{Kf98EWkg^oo z_r5{ZN?rZJwtL$acQ3ggG~aJtwmhjjkg~i@?aNx^-oDUxuWxZ=X(T-YkoLGI*Kj1o zKI=c4??3Zx|CzfdzBqN~)coY)#DLPElMC(mC?1f^mN9gS+1 zGo($%Czk{_h$}5)u$KoQ(Q1k@m5?Y&JP*G84yeU&j{ZWo3)tV0ae`G$!D#=-Tj)9M 
zGVfmMdC+&iZxumy`Q+nIvhRKHdz^pKR%<@E+TLP*?=k&Pwtuv}K#={zajMb0n=FPf z)67zET9oy;jzy0at#KyHK(+`Yt_@zOW#}q70D7pd63!bp03A3a<;x$iEPilxqDpW8 z*L$ppGeHiO+zM1P9mnEvDGYJ55Q)NeVJnWquyCmY6^0uRT%Wkz ziq9%zxR9|wEAPPxzOcxCNZ{O&P_AYc#H)$XhqOF|oA5NMZ$cUetkp1%3G@0`#ACo? z6{Dl_dFXzuFoitQ;v###=+>ujX5vI>n(QnNL zkqVyRfr3{AmViKyRwZ9zH;SW_6#ZkM{VDt@??Y0h=vDHqZC|>S9(Yo_f9_OSy-QMc zXBf8*NMwHGOycE-F!+@|oF`JoXB6(VhL+pNm%4!D6VL@<8w09*66Yz+qC+*o*OHxs zunIqNEh-0q1Rp8|8|FFm%u$zjc(V>~-my39*qdSYlF4djU?EDx4y*x8JWDcDd=I%z z-KODX)0Co96g;%hw`_k))94Ncu;S_{G)?)Kncvy5-90ac60#DW6yXgZG$w{7xFC0K z`XaXY92t&;f`Wi?(;TF7EG~zGs#rm{s}Ld*;0=LrgD4Sq=Ip?wp%eVYPtHMdmjC$t zv2*864S&E7UK;)p7XcoCiGDjFKPHbVhe!@jg$#EH;1b66N2C5T#C~D;gkeJ>_zIW` zfA|sT5O*xfT^>3{UMffp7rQzxhEy(rCj?GHd5)yt@S4XWY$Y%rzjz0P@^cUvhE4q& z$D&;KJt~$5&|h9a=uf|A=eIf}4>NpqFG!giT_aDfLNm6oJ=7EvflF-R?2GEiQFC#2FG9u*Y>RtHpM zb(UExC=Hvmf)k4>yzkaA`Xkaz_r+M88xlcp@V+|TgY7xt7F(EvRc#Yv;{m&Rg5U)cvSF1vnrw*R8?7HB)B+hTd>a~ z@S(A=B*gSXWG?(dqbJl-sQ>1f0J_$ueiO7gG&Qx zf6m?aZ2O*h%XhV&)fTkxEcLPt`MT0S&UYQpb{&7xHSnvwC$i4=)Q5}C@0{Mnp>*f+ z$W!MJpPP_xkRo^lwqW=n+%9D7iR&&ZUbYV8f35qFnxjevvINb)z$7@5{hxE_5AFu@ zk>{Pr?4CdIl-Z$kZPgs$-bkQC;OQtbK8>0M_6_z+_hk>`8ny%z@+n+wC>8y9OZ9hi z&0fjN*n8D>5Go#Qj=$;&(67WJ@NQXE1J938Mce_czRVe$I`Ras2W}krY0U9&&LIeexy2ES|*S5C%mgp6&S{8u=^ zW`f92p1?UAh1IPkdstV{Bl0xV;nS@2Lb__ADC+kXM74j9yuU%t?~&)%s3n71evS5I z(4Ozn-hV^8{)1*H`Z>ah6WKlUm!H^n&RJesu29tO#h$PFzU+I6aDFvOTc|qlNWM6D z=OB)steSASt^L8C`+G?Knc05(^3BV)`J4O;GqkPN{|`EN){Wq)8o{$}1kdw&#O-@# zYs&2G&#;FJww6rmu?%~>P^wwAGG@yQlZB%*sOjJ9Y09^1Z=t;NiG`_qQwull-B?9X POoz$OF0`vfs zM4Gg@UZ+bXP9-(zif)oMeVlVl9lN#C?ddAbwpG$JRX5ozAkrZ^q0cH^_q6$sv9xFN z(LegTZ$1Fhpd+{G9P-V>%zO9Uci(;Y-Fx4C^W*${7l$h%d?ozaDUSPl`e9riJ@eo> z3&-8yBu?TZ+@N;n2YL3}GH79U>!6k0g+YPcZG$#;w-4HJw?y(r9fJ;@>RKbtQP-f0 z<%NiQ)HCQ|d0QlZv|zA+<@reAU?IxdBi_-X!J^UP!D9A2FH$n<8}yBq4wjCV4VI0T z50J`kg%4JvX5(In5AyvF)8*Gy*QMwBM z)%Y)$x5=$?>ySk+QS+KG*e-J;e`LVU+%4QuPO5#4lj`2Z*wgi{cH;(jNcD25Bm&+h z{5QxRz%3ebNsWNkAvMW6rDi5x2T>|N-QMD<42X)_?YTA({*Yf 
z5mRKppynM8hAzlaNp+u6NwBWnet2$iv}yLRJD} z8pOabfkaC#1yJP4i?Q)Ie!0WZa3T?Zes0RQ&T_QdGe_ObBTwnR_?b+%m|-*;gDg?;k& zNci0Lv5Ca_SakcvNMv-o1bQpsbK?nF+CH8LN8;^c6G}B;uw)xD@hT)U+=qE(D-PGy zf-40dI4VAJc&@sxxUO}4HD9U6>yXZs$a0(w|BPIRk-5Q5aZ~)H9rKH)c^EH04*4^p z&u}VkTvTp7cfZO{SpCA}-kxZDT#?1Z`Cvkfj*rSp5CbfR4@gLugWQM!%_Fa1^F-|lpGf&nq#Z@q9(!Q5 zwAvrg^OU>glhh+NV$^Fw2FzwsDw{sMMpd9T!1?*f=l-wgBL<_AsK|+NCF&A!G4e5b z#-ifIU}Riw_w%fGN+WeIn-F%QfFvszQSLE%NiuAPduZou6?4w{$CiZcX}53AopLuO zg+|s()t&(LVv1S-LI=izN+O&H$D(nyJQRy2!o%a3+kyBPNE!&8ACF#$t9A{!kU7WD zhRIz20Y8Yja|RU&nV+^n>ZT0Ad7M^bY5fwCuVAxKnTyAS2!w;%A+x5Lhs*c#*bOO z>l7mpQH1Xq*3USDsg!`gn7&rfCuB|$Jfo4=h5rYSBRRqGj7Prc0~hd9{HO8CpYb<% zo_oDafc*WJ)=T_Np2UMI(-$msmi8U0O=HgN$>4l^SdPk<$CM`~+cgH(NEl#a8K7~5 z#29UVJQ529Bk?EN&GOF!NhV+a7x(KKF6rz1&KDMXu069{+MFtFUM_9<&KG_YCxP&4 zRSoaAEA7a#DW?kY@FavjTjn79iG}cMd@LfPn4gNZQnD3^>M#&c^Nn19Rd?%X5`p88 z(400xny9P(l%6O=+{5^fe;&yUx7xsY%dSpdnf%JsTrgEwKl9XzuWD}Zn~%Nm*f$O> zRHUkR-s)TQb`Xd3@4L&A zLYXEsiim&Dhb2g45OP9R;%eU5SU?FzFR%=CWQ~BJWtN!W!JlJ@vKQ+S8nSiQv~|k* z$CFNO%9=|oNS1eX*6QZ23L(od*10d{gO#$x!jxc?xo&yAP~fI*c#AUHHDxp2PFXJO zqeV60G~P)$%YDNlL5Lu=f6J`7{sq;LCnil4zL_k+oG1E;CJ6a!rChA#2D&T=aUez$J>c_Bn2()W~J-k{kL$dFHo~lRUYFtI;~oWm7}K z^snNK(U^8mxq%f=;+`;`_TaZ?$TENt=J7u*T;dwJgsG@Caf&!?&!V(-#1!%dK3F)c z27Kc_Wq;Ko@zeH~?U(pV+-2+Y+$COb!EuT6=Ldg;AvmKzb%}}cML8OofWS8tGAsbZ z?_w~d#NzQRajlC=$X{BrGXh;NpG(8&D<2BB7C?J=5#!u1K9BmVwmc(;%C^#OMvCcuVLkzJs1MQ=?nlciU zB&Hq&BhcKGQ4J}kv0ReHAW#b?67uL+f)EDr&IKbu7$^WUibYV)Q0Od#1<45U332CH zR(~xiKGE&4w!oa-AwDi%79T$(8UVHt091~C7IHDN=mSU%k3y0`Sv3h2*C;gOgotT(o<~N zF!&i3pe&9@b(S{Kkfnp-7n6n$y})8u&1*W*Eb88<130o$x2&g`;wl%6?Xv@PS$O|=idH_(r1!~2a=u7B%gUUS^N3L;^&f{=kC|< zxOFsD|5(ydv$FfZ?UsMjw^-WsoxX(^@0GRQ>YH^XOS`DZzEu6eq#J+LNLNa$<~ovf z?YD|jbvu7ny6YOB_Eybx&h7kK!ThF_xAE%*t5&XT_eY-Mt0%6UShZM+o?NwaMZO!3 z>y8vL&|<~zMepv1PFsG#!ve0P?ApkD#e7S$taY(?>&%I?w|Z`5p*&UBk@9xT9J}u+ zpQ}tV>+m-p@8PPR=6}6XSotw$b(W)pX>a?ocTdW@C%N~jMei|~%q0c)OUhPlmi+SH zty)p&e}C-b%DVaZHjK#EOSzqA#)*rT1_gJj| 
z*VI_MW1qn;6R#H_pIg7=*7>PqThpJ9M>b)^scMvW*B$|ix>v^j|``H9h`zstyz+VyH75WOXV`TT*ICeGxVU4;o*f7 zwn8eBE2U!ePJmNZ7_5?fcv=njs{r?FC7zb!X$>pI=xDyN!}Tf**0MKca-CdBuu;Ae zPwOQS?pU-bRl_|i%A1@VoV}Z*8o5lW1!N2U!3UB9|8@8;lenP*ss1&a;V~F&n6UX9 z)Y5@OFd;|a1I>1YPX4%4l!A$12#b20w3PF)2oxwRs)-;y$Jz)tX`88)kWtenUWzFZ zNz++i^kSK3D>ZY3wu}9wBI=4007Bgv9gl`XXgmY;oSe8MLru|rsF42*!V!4PsMT#_ zP?eabq*ukF-X?|=nSk_lb@gsj;3coK2?$E_RTnQOZ zm=8}aJEz2gQiu#`R>de@m3h9tqMC>f;&Nn2`8yI_?l+%&u4tw~2~vaoe!`|ew3{xa~Lz|!`f zMe#^#d(XnA`J(yGw|vR%Jt^_Xtgz&$eY8Rcp17U(kFR|9m8Apyi>;?p2l{VyE_5se zZ|+VW=ufqtniW29wEX5{8@K5w{~J|2@*B1FRGbX!zi=I{5WZjII$SIKz~(yKAgESM zEBLMAi8XEua=$RQ#eRb^b*YZ$=4D77Y?|?K62Ot3n*pboq5Tc+TLK&$T&|so5py@f z1P=2xW8jXkZOF_!%E2$H61i4agAzlg z%L4l*g5NgTJOGgbiK+`PlDa``3dCV3GI1o56l?#jgNczuY`e;DXVyZW->JHp)RF@v z!8E=N03)cdazeF_g+mu2aJfmagW}40dSMTajiEKA20#3QGJ=~Dp=6ZaS}({G%y~z8 zrKVD9tz!Hq>vFRJ6#&G^F@fy_hiBPQk#bZlIjYj$in)?G+uYN0iRJp;srucw z22%A0l641@1&_^I)7qvCZP8qdfAh$jy>Ij`lq}Y4TXMAH)v}{D<*1ztEjq9>k#e-) zP8B>?@TNNDsGb}6cGEllxBPE!P1dx3;MlJ1Ok^`5u_t3uY@}`|JC&C($SAn}l9;dM z1<258e%g|8$mf!@#^A!w1UAggz!$nf=e>AG6KCBuV*1!JFEY$adndUt%;nKBFyXl< zKS7f%1JKMV{OgkwmI;gBI@#Gn%N{nNXmMwoRj??r=EL8PUs3Vw25aM4QC}prEvr6A zQDQL0SjsudF-CM@-&Ib?0WcPPpD{RlhZ5M&B&3v!NVLsaSUKQO8YqMBlo#=1a#L=W z%i{Tei(>I05^!BX@zuzeBll}Jz4^i$FDz_&BXFf}w(DBuiY8~j^3vS%slv?*zGZ(; z%HMOZuqW;H-EduZEqj|&-sbtxlDCzfc&>Yvy-g`^(+A$o_r0a7g&gWfz8cYdyH>38 zN*mf+vwcW|$Jp$zk;&cw%f%|WnIo5UgT&-=?Y!zz)P=^@U7CB4q^K$P^yx57tdO(D z267LMMZYlFd|D34@WEt>ry(UaDh9=PY+MP!2S>6=i2)B~jFJ*0;DtDDuz{fg#WhB- z!Zg*^Sz1^P7MP+cB!AWnzUp9_(WeL<V`>yoO zJvT4C6Mic^|Akxpt)}IjM^ihG-nFK7_9i<{Cae1vJ^e7!3SF~S*o4mfT*9WXc-Ex} z6&hXRGS@vjM8k#(nT58Fl^dXXc3mAtW|C&Dih|KV=5DTsk18^(nVq~`>sCgjWUq&p z`CRL*q^*ON0D7ri=u0k3(p>e7a^zM{5F@Z_B!^XL^gnwooZ<(vb%Y$&X10!y)z31c zrOX|e=AsyxQR}WxI>4s|)Xv%Z1~WxqGv!Bz4SoMp=L%ch;fRjGN>bSD(Tx>aa*tv2 z)Nn3_A~MA9_?Q@kvsu9&YeXw8o)2O=Vgk^#l*ri9^@hy;=`)&_8esz1h#U=*T|@yF z@itLzA8v0G=>_SmVfZ~KnTnCAJ6V{qPiUrZ&zjcx+5{%itPD{?B3@A_q4};PC}FF> zIORykU>daYGUZ-D;wF{0tr>2sTiz*A)7%1ZEitw)fzz 
z@?Je~<-nZvp1T@CVcAoY^3*JO>e9Z(`L6jr3;aUUa%)ejwI}I6k}N(t+m$Ujc75Ca zj^{1U!oZ@~vEbKnUv@CR_JZ&F%TBvOAmEL7feacfmU-E4xXWqV$6n8FpcBMYmNA9eZ}JK;Umw`97kAC{V@4oj+*k1|n^V<%4Itu| zz>0BZ2UyMFA;FQ>h=nr1RTu;leaS6TKpq`VFDyB50NKX&t2(%bNXcklh``sM0vsp@SD z(rrFjy=}4j;5A34=5rr-+gJ0s;w`HMoVOOfR!yzR(<9n)Yc+r5Am+lBb53Pa3zAZ{ zO4%xM4u&#hKGrk~(?)XHbJ%)(&zf1Aq9tfEM>$EzE;o&>XJ%XG<;IY$q5{L;13Is} za@un6pJlLkG`nuvmf5^D`ic!$a~+(r8&eNh=2%}YU}7uZd&IW#rt+q2%+%nSS>esB zhms3U8}~N^$@499a~BSva--Y1TQ_inSa=*_!7=5SvR~w2C^$^oIv5?D8`3e?>S}s= zAt>x46r{KAnsOK|V;Vn!jrGq)FIS?`0BF!0o9Len2LJl^q=MheKkg0toy!;C&gP%o zT#0$J4kx#8@%7qPTlJ3YZgs})b_dIva^7_d&!7W>-U5PrQ z)W{|ym$^&%Q?4u-oaHvcZ@>3f#5awHEG#0oPv~d=alYMJ0A_9APErf{0)s7Zid#C!(SA zh)|$_7}E$su#Ah@3P@n>Am&6IMPPn-+t~Raj2}2TVT>U*1N9MsNpaXz)!bcz`NZDB zcG6oip`DXq9U!iYyE$3lRkSW<|o%80o6zLGh#>phFK} ze;N)%8YDuR;mpQG8^kS{BL1x5J{=5Wz%!x5)Wm7*!~rM^j}bQ@tNA8H zn2)+RQ(b!@OYseYZSr)CCTnwBdHU=BiRR+JfsqcUM0=j|Dc;!k>x4yD-}0XDzU!vz zUC;c>$=bcwTp6qG;H?g3(d~gld%1Rds&@M=;r6Cv?e@jm$FI3oy&1V2P!`CMzNuxb9%y~6gDvW8^yq1(fELbtzo zcT2KyV6p5>(t8GHC(5d|4a}dFZbkG>i7#2?Lfv;_~gez-%*u;5Dk|j-xh0P1X`-L|P z?-lM_@m7C5|KT34us9p%#e%)7M|iHfd46!dm)yL6>Hm)ZR&}!C&}=@@qIV4~4lOh- zdA6@Q4Brs{Fa|-xx5$$Xo`vvF7-*$w5Cd08EG2k56-tO1aI*R@nqSqPa)0bWnXK` z*SgU0fp1$jX_^;lE93q8oAvh!_iLot$4HYwd9h&M!^gSe@+@ZpD3X5$F;ZFk9t)0J zRku91J8uVX4jQ81uL|Kq`1QkS3+F`y z6c4suDXvS_Ke<@^xjVt6=O`kc=;7}k`t$;5Z7ja~0yppjhR9 z_f@Qvvup!G@o)5A?@ey%y7T!Z-^mqU{f(3V?BwqdOI6hHJ7NyjX0HaaYq7Zd?xv*Y z1OwR%ko!L~$O_I^i*A58E8a5N0!r5Ixm|Im^-j&*r|-UaHZ>Z_5)mVl zk)`J3e(vt>w!Tu!U+>-2UvBwv#imnM%a0#-_LfrdKkl;7(-xySZ576+v+@#GHM-|Vnae!sJ?*!ol6)0c1kX}*o}rOtkz?Wda@l={W~ zLfhXK+K~U-^5#>V?QfstktVIpr`l}EtvpgyIFFD9RTzsY2{oTZ%NWNUL)k8MbIa>r zFwf%@95>`KoVHKdm4BY`L6QJpf03J5yK-%hWxdViMA8+=X*fV~Ifk-o!)dkt%6gk) zD-35RAg1Ypvm$SV#QwUA&aO@L9vz*Llu%) zoG&SdXW2QN_Dtx$)Xu_K^z~0W#d?vg`446yu!%I37hzJVM%mYLMdYE(CGh z7~vx{rU{A$569y;1(~r79*w*jJbf%_U=Y-I9!jtVb4YL1M1!@?hMWQj*+{n2?$xC* z`PkWcIG}enr($&0jB_F0(t_7bz=#vn`ho)l2tODmMbwBrY!^RIW5{}GVAN(BW9!Dl 
zgh&lghc>{bU5g?Fr*Ftt#(*)VG2wP;oX5uPvo(;-mYIz=W{T)>J-nKyM^_y2=c!JT z$bo1OXNcd#q-LtL>colhQ3Mytil!esK{-U9pBulZdbQ^)79B^>h9vA|WFp+yp50l3 za}J5$qflPW>pT)T*4J}Jbr1A+od^t^Io;EF5{k8)pg_sk7-HoI`g;TP5`{bi{ejax zIOg5i-F;dW&LLqx$J35d78fc>~izo zRP)}yvM=vHk=lRa?$b-2e)hI5<*Az&=B2r*B~KfBBBngzlBXeE(Y##Ima1r5tZ2V- z?0$_1a1}MP$3F5_q|2((MWyNTP3f}66<_WA=#p=Dx~w`~yZK?EqsTRDBh{%{FH&5d zE)mmBk3DqTy{=Wx?$p8({QNMwg6oOT;cP_^nQ$9A=x7r_&MT-SkSN>S8YSuDSw>gNN7Q6GGJ+2#)Pn9rQ|#%`zh&0f=Y-MNhlZS0Vx{F zB}&*{^9!Pp~rb>Rv)st6F&UIrn z7X|TtV@py%)Y+oYf_GaE%)hkUvOm?b|E_PTr7tP?u1Ryd=B;!4uSXY!zWXivKlOdO z0msa%=DHSz#&pNwTh+@Qhf^JglLMb$>Ub_G5IYSlG|oTwR>7k199r`qTyQUMIgr|N z;BNEMmQyoFXP>#Y2a$F1Tve*1DV5*6D4hCO5NuELs~)>;$6Ob<#I|ET^nh+3d!4qj z)iS%SM0;K`-$l<$R=r+Z&3wsx;;pJx4%u6c>~_2H0sUHSa@lI<@d&qtriJ)T{{zbY zs-oLsdzPOYf55TKYOlcAi&s6|mbP@Q_^bR~Hdnf!Xw{1QecWxh1OLL})jY~MxYCMM zC*@pRL-VSeavrXtdX?&7Q_{8R<3i*fSON1_7UZn{1Le`&fyxzk zD&?8jIWvlRR^N-Jgb{Pc%xE3q16ezBP-IPYG*iM*WT7q#xsJO+9mO$RgOXH=6w*oU zbbu+PStyElD(ibJnyvQ(L}5p^wj}9 zo@Uazvwg2B5ER7m!&$=su~inoz1i&@MHz$y{3&{-T0>*wSQP1Wj6zzrn%B=B@djo0@tkNA3U6`oBxHcDeds#5LWB0;fGdBseR@o zoHIUgxnygqWb1Otj#SBxnckKB;;S!Qd10=3F<+cHvXWQF8o%#wEj#K{j{14el4Dz1 zC|VYFq=X$y!p{4GeOahT2^Di?^EC_8OTwYFJAdYwCRlQ5B|k#HLb?fsEXR>9!z9Sw zbrOVTu(8L4{eN9%VD6X(8;;hSkg}fIGhgY*M6ly{HyLqu1i$F=z?iu&0P=L+v<2Zh zc_VIy)0!j`TYy<+!S1~krlD!T0&bp(r3LKBqP9Mrn7>xi3t)Fn686MaUSF z*wOPD&0=2;G1Xb-r%~RNcOlF3FhI>69(!;Zf|yO*&Oe+rx}hb?PIGG-p|n=GATS3< zgmlA$4kj^!gCQ1=VP_(SNueo0VtgE81v@)uVLxc6jL&8aLD-9CupWHH6vGd@3>GH? 
zg?U3}F$<3spNd_=8D6CgL9-h)VD2-VCB(=M#ED+QjR7+bP*anvCG28oXS%bS(QLj( z@l1br|3RRAff5OHElKiWde5VP)(pa9G`gTPj*o#5=VO-` zyMiD%=SD}gCk!7*rpx3MAd@*NV`Q*IhNEk`cz}5uI(C4pcu8Xv#OeuO0)31CGnYJ4RL`>E6*Sk9obfergsfI%~cZx+2e7|MGv6R--( z=$e$}?7<`Q0p&!S0XhfL88Z^xM&;S81`KHgd(1}1*-k2JDRWFvloO$LbJj9&tRGaz zVDBuDG$&{R1S6M%6Zm38Fg~GS3N~UM5u7!b71C80C>j7pbNYHk?jGnfuYX*D$XG*9 zy^T-nJ{w_^)NH(1yV{J&q9Nw7#%AW^Hk}~4k78r;vQDfYPW^)fph6osbr>g-n0p92 z*pL+%X2rzHU;u^~eK5jcf;DqdV}Q2bLoASA1EZ0jNQ04IgJ3q(ngjqNm>=v|(@_S^ zm`fPjTo8>8X!BLMi*?b|EwSH7YH0u-*96st|AkbFT-qAWVCB%^pn#G}uaJ{nq2*o` z@UfaEuhchwHvL)l{7tUB-5Ck@72yHYRaenl|fQm1JDyo z`eKP=ByGtrEKAyEEa`W^U?a^)|CFI_bCb%cZTM!?P}UIOs!KxM{i4#DeFt-``Z-d5HV|p^C#5)*qDf zNR@kdyypBbDV6m9fIJ`@n6E_$`J*!Hu8e^eEnrHYWz;&&GvgCSi{1rE>4F+(MqvX^ z4zn{;k{wvol#*_oP*h?QYn3a__lxOwF5?9MY@@x!uT3ek-}V_(swfaYKc0|cm!bjP zc%cJlWTdGMKj1jgG5!~h%DD!d)%{4QojdpD=o_OS2yM*d@bgFf zdz9}HK2?;&DLIP-r#2_z+Np`J(oc5?+bpz1lOac=s}^QGX%C(YDP$(74*1?=5(v!Q zifd*+L#nkaHI!9!nLQI0T- z*>CNWDGD7GQY+XK?JJ#89P-D8MS%Tc0!}T_ftTY;e$F|6&gHW|*UvdO{yyYtKjfM|l#++EgW<9f?1i4 z(4}nE51jO9|2DoUU0$_nrF-)h?Y?U-yWh9DW)ZkByZ?^^4{Uh9dKO|Of0-{!@n!#L zIW6#TQ&S2@o~3+Mdobc?<_prrwX0U#?|VyDZMXx4BHwBr torch.Tensor: + """All-reduce the input tensor across model parallel group.""" + return get_tp_group().all_reduce(input_) + + +def tensor_model_parallel_all_gather( + input_: torch.Tensor, dim: int = -1 +) -> torch.Tensor: + """All-gather the input tensor across model parallel group.""" + return get_tp_group().all_gather(input_, dim) + + +def tensor_model_parallel_reduce_scatter( + input_: torch.Tensor, dim: int = -1 +) -> torch.Tensor: + """Reduce-Scatter the input tensor across model parallel group.""" + return get_tp_group().reduce_scatter(input_, dim) + + +def tensor_model_parallel_gather( + input_: torch.Tensor, dst: int = 0, dim: int = -1 +) -> torch.Tensor | None: + """Gather the input tensor across model parallel group.""" + return get_tp_group().gather(input_, dst, dim) + + +def broadcast_tensor_dict( + tensor_dict: dict[Any, 
torch.Tensor | Any] | None = None, src: int = 0 +): + if not torch.distributed.is_initialized(): + return tensor_dict + return get_tp_group().broadcast_tensor_dict(tensor_dict, src) diff --git a/distributed/device_communicators/__init__.py b/distributed/device_communicators/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/distributed/device_communicators/__pycache__/__init__.cpython-312.pyc b/distributed/device_communicators/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..051ae84f78d5b66114fdd8a0d03902b4368ae482 GIT binary patch literal 182 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJVh3c2&7U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?D`ExO!U)90AjU^#Mn=XWW*`dy DV_+|{ literal 0 HcmV?d00001 diff --git a/distributed/device_communicators/__pycache__/all2all.cpython-312.pyc b/distributed/device_communicators/__pycache__/all2all.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9eb8f3fdba15b78c6689f0abf8d1934636a49ae1 GIT binary patch literal 20137 zcmd^nX>=P`mR=QhfB*rm0x2%VMV26olqD~+tj$^|QL;qKvh21Q&!8bxQ3QpBE&!H@ z27A=qb^^8CBdODKBu`Ib^_)&xikyt5v$Xxo@=S8#Y$t>yH{gn#QJ?W7an8v(v`H`V z{7An0YQe&ytnTSE`H_h1OF)qVGV_rCk^AACL!hbwM-HTLQbj{8eW=$BoKEPRva zxC@-fiF}+(@Z&sBaZB7XZee$8!ZvQ#@*D~0xRb@LaaY1U?oN2dJuGdDHzd5{UKY2< zeTl~LMizI({fWSMfW@8hrbP32bE0Lu1!QGl%0$Y> zl`F8^Cge6(<@T}M7UZ^8<@U4OHsrQT{YJk7EO!NRgH^eMEVl!>omD-onz4jdeoO-j z@rre4a%L=KQM|{cOjw)_AD2_vX*DHjDb#AA(utIOG9ruNXeybJPGykY5KG1~;dtu! 
zaY;t9eKL{`PsAhX$yjnilEaaBd?WtQdjpk-5_*)T6&EF@r{kwSrny3C^G;O6-=Rn{ zf}VCo(vo7|9f?j#=+t>AGCdtj9;e40N+w^5?@lErVw7~Txk}U^3{HqDK&31Vd=nN@l?;t_HoZ$pL(}ZJH!UGZxFpNbK~9ucZwhPJ?Nu8o8*4s5peAkl2j(d64P-hAtf`BOe~cYQWFBcGkt3zno18w1RN@EOZ0Td}@q;v-4Aw-fQYbi>{vd51jM}2tIQuU2#71H0rAc8;OFK`*7 z@QfwT{Qx{D&tKJ6Fwf0!SNO3j7R8>nx$o>(u& z(iu5+ESr(U^`i7rEGl7<>GPP9)9ba_9iE<%SD^P?m&qp@)`vHM8VdmgU!LY}yL@My zMOW}*$NR2<1>|uGe@a;q&4x3n6H+oAjzr}YMxCAx<6V^27^2dPSt*GjOh;rqSBfhR zS;}PPWYk9OwabY+n@TSx7!#>Y8cpC-^^|MX^+^@^6KgmwZ8^Ywm(S=dpPO>%-B#{- zi^#|v{kbc-dQ7zHJx*9qgNYTrOuZx5WO=`7(^=-MEpL^*d2Xt)wjHIC4alb%th;9I zTASv&0wh_R)}{Hgj=Wv8(_70H(VleXEl68)7%?s4S(auoSKgL)=3TFlnB!itsEd4x zK6l+!O`;iU%C5XapOHP>?`%fT?!3G1x$e5>y4B}qR_cA|0&l8Y&#BzaMwfSA)xI^) zy|%^1VuBA$~eIWC5&lHx;VRni_!P0xfCJL*A{5QQo3 zV{$4YMkDD=$RpD$%e@o`6!alb9LKT~kXP~$<*cV1dm2kNqgbV+s90m-sWcH7F+lZ_ zyQq?%@xpK-8_$5>r8Bt!vsGDFiQXyAW3`;-7QI}c<=ln~o6c`~HGlEN8-8KVCCvGS z+fD5kR-Rv3_6=S-QuM94-1jH){X~Qkos@pBW zi=Dr087%BBcXVCi|1fxd=iKV;Z>6t4_4di)>V1Wga&zeNK(Tqau=}<*`07~6J6QA% zUV8TO$hCpX`+xr8<%7jlo8NlyeeaHWC)c`a-opjj3qHArCeb$;6GbT*#)N~DD?Y%F z44?xrMJ$u9;W+dy8OOPe$OR57u^p=v@%qb*|0H>lpS4vujzQ@roCmAiewuNpyxolV z&BtDG8zVBWGT~Nl?D)e zLdrzMNG77Rm~e9WQB(^#RZ<5h;N-_BV5IH@qi1Vt8)XRy(!|MY=-LHwG#$xAC(Xr- zUb>wczl`8C_lcKVznecdcwy-LP^o$CjpnuQ@dfAY_R!7t`%CScitU>U4P}4lod3Q# z*L?`f{X?bxoyGp0*H_={KQQO&y%X#z1=kgW>&o7axs@B|9vWS+S~@%h-@JqC=w7xc z(0ws6=f8i>bw4AfD3OPLk+5Rq@&Fba$i>2#y%XN=oKWX)Rdkz8zfn?^1cMv|_o z&b6qSfMO(R&)cSaOY7(DRU~E=Et>eSKzx9(M7~OVAg!@+zuu2Q+)a8;VEvZlxK+)e4_Nhv_#EfIJ73cnXXAAmL)Nl4+a;t7WnDKQ(T zxHlYrF&l|%CC+eIOhv@(2a{siIHA z#8C;KHH_gC&|t%VoDE+l7{vls@D5U6zRZ1MvD;mXjl6wu-qCJvzta#nyYrQjc=?-T#TZ$-eH+imnU!^Z9u@;%FP} zW+^A7)V36(wuS83{F(g4XWn?>^%v&+p`v^3LIahp;r8=m{3lkwy}R7qPo?uLzH@}# z@3fQB6 z7wMm(8~S4d(;vOy91T$Cj~_|ztWoK6Sr3VroymLE9ED>r0*FE?$=ZP}3>}X^U6h42 zy7&o^#wVz}W+)AP6R<&8qb>f>Z$tTG%T8{?CzFE+biGa`U8aVuT>7Sl78a*=grnJl zlWXO$+>ZitbYo%Ss-jOB%-l>wRYZ?%idJ~trMIm@G*;P@A&Ji}y*%m87&?}oM|w-% zVJ5gp+?Z=uB2Q%%4G--Np`<>MLc%%6VUEe$-?Zo$=>@#qEI(VpK-Sz7?9&t3>{CB| 
zjbKo}el=JWCgfCtYz2ZiEs&f^t3*IOFgXTI-YUit0Qb-v(jiwxmMJ}C--=FVlP4HD zGSp&C;b6i|CUy*fik@KyP}P1pM0pJ)wp99j$sBj7BJb}|We+L)YU?G?i!sog8CB_2 zpF$Vue~19;Lo>Hx?Q#Gsu(A|*tQdIgW?(DilD`#dg{oZqYW0>Mz3`*o{nuT8-c#_P z;M{nrZK#O<{^4swrHv!SjU(41#f_sk{G)TO(K-KUdG&_U>TSi<+seMqi-+dCYs-PA za`Orx`8C@YINKUe(KT4`otq_8e-}7rp!c6+@YVJLo!-n)`>Zu@)u5I2uJm152bO!L zZ=ig0><-GbRVY&jM7iAtJsx33EQ9IP(Iqpe+>H+9=P?B#M}_?47m!g!V}6yZg*C4B z66(^rQh5kLsR}vCVI7|o*J{YS7|G74#o#zuI+@X!2$Wqq?@BulV7p3B!Ln({>NBtq z`~b4GqIU5dtO3+#-6g*#T10EymaxxS5{_AG!Z~Z>xlBN>Hr1lXDtFO_zD#V6$ZfCG z(qp=;Wg{?YrKy=t3v4U;yCfY*S@hJby<%^i>eTzK-1F7t#EI%IjEmN3-E~!)PtD$I zTIWrBW%OS&C%`Qgt0YEm1CIHPW+_9gmRX2AEU*=wVe;w<_Jqm(r67DM6cW(A$_fHH-}z z*!06OQEAhQ3Gh_1y?QonaOAe^M;MF50h5HWXP+KDG;)Y-wNyXwF$S@=t)k5vGR6u7 zs*r@S*P522ik)pF=Gxfe(mQ!({~lqRuqqvDmdT(h6Ol5`thi&?Quj1|{C&zg;is`^?nmz}QG1eP)%LlYplmi{GzFR>rvU-4RQp0m0=^A_H> zX2FVBA-Cw_+>P2k%`Ml!od*Be?cd-2g~R4+{DeaQOR=xz?7=e!%Yj4XKB1}|(rzZph}?Vnh=ru%LMR{yGPs-!aRM9X$J2_9y3+JW&4S~hu2d?_b(;`TrTEYAl=Kq_ zPILbSG~#b5`GuliDEWth-OJwQ^6LA4_U%9Zc45!iLuU?^yaPq=z=GWx@Gxq-vt`FS z+uq(*+H$1034b6GzABNu~cTiz9}q;BeI;5uh^f=b|R+_xHuJQbPR9K*I#jvWsf6mpXQXGv)zrJpS@+@jzQNKn7M%R1^4K$mK zL<7w?(1kt_2Uj%^O?rR3}UHY}lFEnzUojD0)J1sJvXU z$1+kP%@hc>K4oT)c?HuYCn!iF2&n|B@L|QFE;d$`y=Tz`8zA2QnZ*J|?5Z&0Tiy*Hc)Q;B_RRY@Z|4_oRG#wy(-9^~q`4}LnzJM5nMAk&5% z_I|_sig=YaL@8ApqS$iU5S2A++K|=WZxPTCOehBk!w=bi?8ugcD7fKmJz7tIk} z3m+l)|3n)K?t!<}$YIE%I?t-FM(;%@QdsCNsg)Aw+lJ2BC89;xuBU*-tz)^@v_b#BLF2U#AO4sWEEsO^R#iBcp-z=w zIg{B$I1LuHjBUAZxesXI%KwS3(_0WQT-o{Btbr@nsg`J7r3lmjMi?9+w%*P6-iX>{`MQRhS} zbvR@BIn~j||3)MEGzFxn)@&%>+oQjWW+u2oSawP@bit#0`FqBL>0#9uTUzu+#(v;k z3}(weOcPlSde{N0DxgUh!`HUV?Q?a%grbi)e!rr&F}UuH=HNr+dP}+&VpMYpVfUTP~$t z{fwhWYx$~7qOfKa(f4BCY!H!q@ihGU&8A-SXB+ImPan0i;nYtKkoV6z5 zxQnsaFcyoxGoaoxx|Ao-ZQ}VyyG|nf_uh0|)#L`gsB`RN4%8ykS7Q1ebG+Gm#V$`F zNAVGlhRbpI*bF%*HBwR*6A>*PfN3WjhX$C8&d~n31kj2aFHXa~89q$3QxtM4HVH^p z>3sI!!9(FaBO`}L4u^L=vv=>vk?_;|$4A14cB%T~y6Os}`}d88kB%OB@|k_3ho3nb z-uvW{-6P@sV@Hqd439mzXGA_v!-K6Onqk%q4av&?fZl`b4e})fwJcC+4JRTeB;6}J 
ztm>q>`&68N;X`>()nHZi@|C^JCdB4h2koDqYF+#hV8*j(GszP^>u2MsRIz`+sq!X z_XY1&^0m~|rQC%1M--$}(=acRTeScKM3y|Z)58Lo7xTTA;prR0UK5qUJzJK@LJpXR z9Qaw2cedt{FTfWM{?5Z+Y5b3Nx0ts#YY2OuOJ0M?fP<#$;xGRrq$+wA13dCeRKu~0 zxmsd@u^lbK5AtOy@ToqS8db{QM_!uNDM@f`SBqf_O@!BT;Nc3g2oaG^qA6M1n$JD* z>bEXNZuol*aI}emqd9*slzI0G=6SZE7~F6(_+Y_z+tqm1d&WE0HgtLRmTOzNZ&j&p zW3g{zsqgV(-{WvjySV9%?XPc#b6P(&o-WE)O<@fy{|d$g}lov ztUpDUGRZBKsE7#QlAFxPG%)k+I^s|=1vm@Dr8go{K`qR!sLk%VBwl_H-dK4 ziSx;sw9-U*unbkQRNoteYvWA~Wb`kPU@O%Ed$psoi)Xg>;7d;kiG}dux6iuZ?NZ$< z`l8EBDi9XQTj6D3yJ}Nwku&B!;ut6S_2*r+I41IYm`6qr#4nehMBy>U*HIys$R-3# z&M~-6;rxy|-$F##bC~Vy7<~ysG?El_{fPNBkYr#EHbNqs&fo|Swtpxu4QE2(rARz0 zRXlH&d^2?bs?P+vg-A+kM~EbB!l>h@G_U@?YRk%yM-B^YSEt_DhuYLq!DGUh@uC`G z=n{@#TfFvpHPo_9Q%ngSPm@q-%K+q<{9}C4kWHbv1>iar)~8NCaic>+*c_SbyW*yr zY(A9+ifSV~D*qAn?m;%PYnZr^w{A93{_TaESz#JLZq>U`y4fp5N^q*5e zL@xhJ3TS&q?VaJqOO#}rRUX8Mj?_!t<|2`o)eArcoYjwh3;%mRu#%ez6f`mt3(HJM1{DYOTPs9KAtI>b)81>`R zmxLLGXV}QpjG}mH;lel<#=%8Vw`VPRk?Km{p;saL!Y+V6z|h0pP|ZI>O?8phLK~T! zpv_UuH7cQCH$7?!kvdef%I>){xO(1>JCJEt@4S;@E^bBVyqjVku3MOIpqQ6ReH3ft zJdK6uxd#iAXL=U=$oX`P_YxA~?M)Y>m-f6q^#zBhiiYUnHE2i|cMHFZl4>;c3`_^( zR*4XhWh=^9%t|qK0@sEUxDI$a+e8cg?2-*mJq~7kaWdnJ3!$6&6<2VON9ov0M~nB< z(PAJWw%4lZC%LVR!Zm!foS#qSWO6M8s(267;CF*ryoF)^RxH~3beUoTvS^!m+alMD zK@i#C!Vz>>Wx@ta62Z#C2J?Yk+%}fmQ}yDy;--50n>eg!&L!({_&Rz*sI<&ITICc% z=WZ)Sq)*Z&0~3c!Jwo<|+%k=*;_XqzONOX`4w0y5$yKKlQSo7%nG^ze+TvkBRM51N zYNw0-y~`1V0WZ(eTKNW=GaLD;TmIGaoW<=adyhbeb9?6et09)EeJ?Z(uc;(jS0`H-@CTUAsx}|m^{?^krf<$f&VR#5mfH}P z`5Dp*wQq_R#--aCu-$rh0b~K9?+=hF3};{-gN5Eho2bvTe544K3?6m45b@>TNt5FZ&KA$m!4O--$PG^ z2ReC`gadk8%vY5+7tlBtZr0007gBnndgfOfnMOo(*UtP5AM)f{s}>8~qfX#FT~r#& z^#dl+S@lF@dRhM|{Ne!){joE8WC_6U95n9+%1$MASDgrtp+pE-6)y}niSV&V6lb|b zB`^^=CdZ;UxP4qw568A+2|@RvL;CQ5iX^k~h#bqz9L+sgSvtn@CEqV?0WxSokQB~Z z>zHd7d2O}Ie~#xW&M17_lG*91WlU2swFEuC|=}`MHaIb!-ec0?!`*4DO&4CWO zokU8aL%!-frj>vZ55xFMXOTMeTGO(GbL&;LmEe*r zt&w_ojd7rVLr=01uhD(2i6^Oofy&tM6gMgl zZtimTTFUQ)nU`_Gk65*$l$4cD{DLm{lleK1cSa4HA0;5lpaglX4 
z*E^C-XJJ=0=#_wZg$TsG>Oo&?9Vx<|$|NQuLRxe3`xG!zU5l7vi)O`$;x(347AoH; z3aWhPl~q_7csAMlRZ(gTg8htzw$gJumZ#Qx>fqlJA^#4ZjXeQB3D|NAGyd;5vjcY5 zhF(gNuUao`u$n9d>AbPy^&RE5jtkG9fBtsQz#CJqPnCKeF7`Z3{)%0TN!aj(Eoj8F5Q^E%gU_Sk4yAIL{Nm}TM7+#q2^&! zT~?-VHK)MutT(5Z@ASLz65oW3`X-%?;==TT>|!ZnWw4ZiBpLeCD?+DH{|#`JU5P=Q zvYq8K2GyIXp}?NJu5yw~7S77^-{xQ2qux*QCppzUYXBv`*~s)BuqK*!llTy_)0I^* zmRq|w5{pAug?qCmU4#lwRYDk@Erlw&eUal+j)2}vaX zCzOU%Uj>DiY6#*#qA1r_c}P90)`p=FHITLjKJ9=TZOPkp!`lV#+fravF|cYbuz|&g zih-fIz~*_I+ZO_(oNHCX4d9-kp>wudfmP)P9-{jYHoeZ-e&j6=tT|_c@7MeauIaIP z`0Q>(;OqW!5x%_LpK`qKFn?Qh5S#0L;=QLzfhUWBC;xihN(Fz7k=(u4uhJn5`5)VM zdboEz?wt=>-)(8%Ic$A5WZ(G+_wKNravro&{E?nrt=45UReEj39}Y*7aBOEF0zYq~ zo}^*?=mV6?WRRC>GR?Ktcd3MeS7;&+AoAraIN}SJF=(L;jo;f3pXBd7xoB&EBSB{f z8f@!|c{}cSMSt_WlVb2yS-Idw?9*rYM!WE-z}s6tcLeMqRV$^=L-#2T{IUpra^eZh z!Hw+2_(_p!9D?6YsDKhVg_??+VHMhbQS}xgVWb4<04#noYsrrR$)pXc-s}EK zzjUM^Au|}(R+p>dEQYO=-ukkPoB>udWQ0tROWlh}P>c{UJNU%B{+K5S+gM?C2=-Z3 zc1zAO%uJ}bwyLV-cKLrH58s?ilfe>%%k!T(IDW-na^4R);jg&hUvQf~&g#pt3R}DfCHqRe%i#-cJ8^G?w-?*MaL-5`{p{=EpilDeAe;+ zk{$Oia=0%}@cVhbmnA8(_?)F5rM(*#IouZy@Q?88=DHtVvjzRjE{UsghN_zt&y#M=N!kDCv)xK*Hn&OOEQ5HvdTNDqZiNJ?Fl8 zGXth^b}Q|Lx!?EP^Ugi@+;h&oAKL9j1VYI0{Xok>LjH&aEts>J`>*H;xk)Y&p72bF zc+`~fFtn`m=xD0<=xJ*37-(wr7-?$qm>|`K%wg8UGB{QrvV@B~MKo^+S;IDujpmt< z-D9U?9N}V*Gh0^@F7=dU^X1_RPX)~zLzQ8d$3^p|P*u3vQ%&>cP))ejQ%mz~s4iUZ zsi%2Ms3F|wX=I3w2-O1j4x@gXr-?7}H1k%WdCo`(Z&OpDnYYjB2!YgrDPR1yfwyP- zXsI(>)>7~qr3mm_9x{VMq4lAY)P857ZHY*r|Q;*gLgS*s)AFX>QTcvTrcY((} z$2`Y9y)a*wa7;M<4y-%|&k3k`Lg?l9X*E5>b5b}Nd|7QTj47^(!WuO%oY3acQs7Dk z|1_8knPp~~J>(FPm>C^IE)xd+W|^z{%VdUWZ~p=TjkGfgJ1V>s7b1QE^5&Qr<>P)C zviehzSxD^tao%@vY$z};`b4a-1|k938;VX&3L+F;S3;q%Hz0Y(eX@Vb8;D%-i2+{( zA=JfVyiXRqLgY$7j7GvjME26w@sJ=vd#xWpdwJnXz%O|H(Ks|qG7NxLDL{ymeDQaH z;ujo7NV+c!5VadbJ0zk5D@+)%7~%G7*UjmB-3R9khYq?AicYMar`5DGt!URPj&LC2 zor#Je-YW&>1f@ux5(Q~08sa6T))$)b%}U<5BzS$}(JMk>$HtNV{vq$^<>6uP@W8P5 z)Xl&niU)P;G%oEG~OxAtFUZ#Uu)HT#Wd$3;;+G0_;J5_x}wt zH*xWFArfYiuv9XL4mD^@B{H*m9ZBf4vIOM>k@5R!;0?G%rU5?oET}hPebA!zDvT3E 
zW1OTup?}SpV1iU|7xp0}p`ZRPg=yM0o0-mbnvhY_&W?!qS-76S1=4IdEXZQO?+pv# zs5lFY?}O#$&xmEvNi2tGOsv5ECgmuyj++li(0@EG3%onW3*m4)67c)vs3^H(vk|{P z+} zDf>hi-FY>BtoQizF`+9I81IVB%2Uxu7b4eHfLzyZ$aMi$UQvK8BY5L-AcWW|4Zuxt zU@wX`5(H_1_^TboCctA5eiBNSdGcqS&S-kT61H*O%w>wpKez5(x3)nhV=h@QKa%P` zoh~2v$UFcwo6fFVwzTtG8_rW{=c(W9Uvm!64`$5Hl(}xDZlkF?-PC>C@=?>NOmSJp zTCrYrEOq=+y6U;K^|_Scx%)61&_ms)lIT~;>HfPCkOHtm*?lPabG!$=k-v~n?|&DT z_$EmhUbD;DgAN2dlQ0Arjf;S5u_p~0mnMw~a+)kLFTjpXni9rfA?$45geeaJdT1&_ zwiOdzr}aphp~r5#Z07aQD?2|!pqI4(KsKvkMcYw^><9^SZXOnPL-CGgC?;5~2f)w; zOCIf^&5^VuEMjbTD7@jDLIIlMuAw1_TW3e$b*N_Y4g%%7h&?TzTX4F<@EdS_N-PS{M>x3?LnLrIJM(Ln&RK1z3=17&E{d z*PI42OWMsNN_qdK)2F=84UP^zH8kKoJ@DLM|G?-M1a_^6l%`UK%KkI$#UgTni0rA@ zk&8zK`7}L8lp-t+Q(=+K3tPK(!UI=2V?%Tf;=pQpY}!gkf7*6 zbc=|$V)DzgF+qaV>!0$8If#DP=0?#MnS{gX10Qc`ArrIAY_1 z;_yc!Qaqe1+u-Plhh()?Y{4OVkU? zuJ3ov_ix%O7nlrd-C(QJZ1r-}I$M`1t+?sF;a(|UE8V+bhPLaDMaR<23bWF^&bHCM zO=%W-?#YytZQXq; z)GT-Z@H`KXv#9CR)DzjGBp4rp3H1uhC^_H1bHf^!QQnA5SrP-?Gx^;Fh zMYcB0)_%g)f7wnPF0mPDm6eI|q`bTI8~@v+ACUbUNk3*c>8I_LB<$fNYeCtzs|3{c zT++hplPpN5-6UM0{9?s70oE>3?o|tK;Emg)>DNd)*yCS|H#O@!&)kv=pX~syzH`F1U)(s76wEyDvF)h zdp|}8Fmhwmg%OzML_CDiVT_JI)NT{|FyD<)4@O5ZLZKvr=qCyj6i;B`35yr7C_CM(`(3l;wA=W*P6py_ zW(4ZNsQBFffC?!YvT6uId1pxK@}@;FXFTM4f%TYG@6agNP0-}Tk>N4z#>cdqpdAmc z9YID$q9Fn5X2O^-C3F+UcXgUo3o795)Xw{s&IPhlPHGScT@0atE78|yv|ILq(#oDcR-z-r4M3_Cj?dWWKq(ZFa>i8 z-lKRXZI5ZUCu|8j3PG`5uF=NtCf;}hsEF2rW5N<_)cWUB5Xo7NV#u|z#IAi3wt_wr zpjKMItTg#XfkBHAaQ-R5KOKlna)Al%!r=>?kE%jkK;p7iCg=A>xN-3G zL?Z$>K1)Zx2u7rT%8d$|8=5tLgF4n569lmhjG+z=PXG27-Q3gB8R3c`c5qS@3|SDQ z5``j90aP&nEnGM-IVIyP<6?ALh;(qEo^k)HStvx+B_MGxj)`&MMGhv7Jo;Ucs0?2z z)9>eEQSgg^sph8RM+JeKlI56mqN{5%AWy}|-Tr8pnysCn`vnERtlF>vNs0?n*WqJ_ zk0Yvdur_caD#pVS+TB73)JxF~t{Of7gaSx1Sid4b>70zk=?^NiW_chQ>Exq=)Cu@? 
zN`esXl%t*C{g}p?0ZeGN>+ta-hmUr59Xd$o8=DeD0VWeVxEVm(cSYdjCK2Y+( zcGTLDYJONj@V~%PW`P7ENLr*8bo5C`ZdegmdbF?r1cj(UfEm2bmD11?b^P6@Ld z5R&$nd|`o8y<40lOrl#;-I0ToI)QL3D$3viMLNO$z(GJa4K@;2l|6s$%Q|oroiTD^3ScZA8au$3^pN$LaS3%yiiuge4@H)(5@jtR|}{HkfWUM zN;JT$3kdxS*99$+@H7AZ%3*E0j{`t&XD@;aYhP#6~L*#|Ul&&Ie-y67_t#&$K%g1Nguts@*HPa7?j+ z(-A!0RK6(2XQ^wzd_kR-+Urzb2224%Suq5`6QHod)qw1e^5Aip3&gVS3Dpg+m}h)q z1n94rd@-;F_;v?%KdAb=>Q#`!u^pPdVvKo#(=h?2L?1P}A{0H=>9L0KDa^Qpiq&>O zqw4lfJWMsI>;-ld!Bb3r4nOGv)$t#kW=I{kQQMua?M@l09uyI){knV6z3f^mYMSrQ zR8-x(eB<()-Q<~O^%{1Be7TA zClTo!9#o`<9>m$l9>d^G|(0 z?Ys5tM|+N@3`bMeqnYa3`7@j4u4T(g?O%Q(v(Sd4bZmW|lUt`9E`FSmYdn0D|~!pt^V&qHU%Ds@K#s7y?b|Cdr!yuQ~Gh1o;to zSQE^T$dC1?H;xwgaf1e^*b#CDfKJZ&YIAjZ8O;$IcDqeT8WM*2CSJdrZ-O_xLFAk_ zBB6gLt2Um_`8Tziq(R2J9d#D8kamRKpcmk^o0@G*7=Q|Jc4-3up>MM zgIyHxL{1Ur&F`|>JaB1e;fq4{up?&0|p23D4(?XHApuw=jh?9@Oq3O}F zc^sA)>8#wW^$50VxqQl7p?B7VWJrJ{%`!S9XxMF?a;wA2?YVyW5eKxo1Ph*%EL?5O zKdQ!yC0Qh+!-9q&`YLt=jV?VTX_e7=K|>Hd7&}73g2qX(dq*`CCvu#Xw8`k!pdpA( zj2%HnX9o>I^mFV8GWtqrnA9a~u%hClfA&uO*0W63);8o{>~ z7kulGHZGqgiZwbWojb39cb4;OgA{tzso@MJB)als zRLx`j^;9+Qq~fb)Ux74zRDyz^? zc`c`XJFG^|bDD6xuH%`c;}ypYqj?c`UiiZei(d6F?$W~U-g51|zfzyfdMYdCTKdrz zNAoPJq*Bv6Ymj3Vs^}wgy3Rg@>F{S|IsFzqQ;xvzK5pA{gu#17@&ZWyB$K~e1m{L} z%nzBH47d(T(0}^R`WfaI48B>=!<{i`a5}i>r|w4-D4NsJa&l6L2v=ib-`u{PrzIlo zK8Y8}A*s)u>sSSoNq8&$pX4v|B(?ACuN=1yy{lh6^-I&Q9RDRD%CA?r7^QkAJUo59 zpd)+djP9Tmjf??qsHnl!Omtdx-ZDxtYJ2M2fJreLkIl6rf@rsZdXBUz@MX~R9F^8q z3BJ!rUlG8qkuL`C2Lx|4F2mK-+!NHf3$_$0d!T*KOo8u!D$klC4yLsrqEU`%*52SI z#AukCGoW2Er#n%k=tQ_^nxT&)0Buts!h9pC0gDJb!*nVZ**@ z0UQpJVum;ZHkQ)($ol}WWx#L{Lh3saaS~w`(be?AW!$u4qNJC^XR*N$h)hHkLkQm& zphyyN85LF*!m)_}-13! 
z->jq*30DPwTt*#VVZB}i>Gdj3>b8I<0rb5GYU`GPt9~JdHmZn6f?|?nUJ&70j~A#m zZvY!M0%%5kJZSpBLqJrRTrrRPL@@wwY|8WM9i?=^n*_xiiHFgnrF!XO_?46>M#@Ku zKEO+POF*%z{6lfYcRct)g+7~5SQTM`R~SKICdB6vQ=K;ezo5d15-LsGjR3WzngZoJ zw*_u9aT&(In>-@@9CqhCd0-=KeahUnS=_R!Ux}|iwJNO&E8cW*Pm1liS5duD(Uz`g zTdUYVKXT8}zUDYEe|EFBF=swx%(W@=o=ju&Pp-aoHNEfbuZ_1Sf9+V?cXqAuY>KVB z2e!qr#beh`E}qU6~mgd^_JmdL(d(Dc zSc0$S$4hIx8`VG(BErp{@LkwUAM|^`PbS{thGFm zYCH)jEm)R%A5;=Y<%Yd2ZEst*?_ZtXtl@sr@m9yG5aaTbl*s7-}$wg3k&Bq zT}?M%x$(+oN%_r&8x23JN;&u6I`BUKK(DKEEIhr%vi9nXvueZHo_4mcI}bc?lG4hh z>5a<0>B_yUhV_!pTP2%tm0J5&?aH%1YFILTr6;BCuNsNtz=LK|Q@?x}CO!H|^@+v7 zg{PKuOQ$k>S{4SDE-sy3)-4}iZdvO52&^qVsd?g_y=19nS%1yD!fe`{*9R8|mxSeK zS6=$q)($Wo^`Dz7K4o2NZ1o14y$jwnP%A6tJbK%(Zur)w#lFB_t66$!(^-Dga>KIR z^rmgW2xE)N@76V?4Aq-fZl!;vXH~a)WVLmrF>O7RG93ELgMFmD{!heYufA8=42F~I zVCG=&>dUuVZqMGHe!nc^YP|XCjaN5ZooQF+nyV{Sd2q|9t2lgLZ*Y~umc>v~ z{oh+gfbj4u2Yh|Hr1K%AK-#|l;a7VB_XY36uZjSVg_?(w4|w+9_nj&pV)Va1=6vR; z?mcrQiW2J!ZNyP-cXE@D$Cf=w{$;9ZcN*qa`+L6t$J&~*P*$N z@kJN9SG_YL9$%vRh{H+;s@@FIjG*?@nlMPvkXMzoQPCap%?dDHeZrJ`YDnLLsSgp| z>LVY`HFw_$ja1D z8}1V`quVMkW}atOr7eQ#Z(85$_-%)pPmMl*pH^UO%$QN|N(naPX!<6QrVBu~Sq!9~SCSr0d@UQ*-eKRdSr6-4a zQlw%@UhZEhUmm>iDmXoo=NvSv(_`T);63rw9XGM%q}1J)Ly?F zSgl^OADKV%DTw^(4<#XtkN^Mx literal 0 HcmV?d00001 diff --git a/distributed/device_communicators/__pycache__/base_device_communicator.cpython-312.pyc b/distributed/device_communicators/__pycache__/base_device_communicator.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3113f3cf1b44c414bc9b362700dfb83450322e3c GIT binary patch literal 14463 zcmb_jTW}lKdEUkD;tqlo3En}1ltda9Mar^uu`JS(MahdA(>I`a3=QFGIkeHzaYLMosK@(43qTmxu5VMnvpH;X6!O&?Tq&3H0NGStX;VYSgTy3cp1?v=yc%sT=fwpA&Hk#T*FX{E97cS`>nOI5whi zFTWTMol&?z1b%MN3M&bbVF@SneIfKDqGLT{;jyiWpg1BVww@c`_4MvDyM&%dc&KM= zGI2T{>zRl|qCI?AN{Hd1@r1zl@WMnmBm_e7XmmUl4h0i&QR*2AN~BrC}BbZdDRP$4x!bdkEdrEg`+zBK!7$6Zg$Okg^&;^|1U-?6VkEfvC#(}D#O zWIb8W=Qp{c$B*lhIa4z6KjelwC6>1~(1SQdD;q1H) 
zw9v5g=a|>2v$UTXC}B8Uh9}`~^>K)fLc|5X8c_a1*3HkW#s%l!S zfrQ_zaEbA;h@hCIxCp$aaN&dyl|(00iZ+N82Id5AufVkV%voStNnwY_VqH%sa zf@xxXx9pjcZmLVw({B=>bm? zhSepA3ZV8XB`^ZZ)5wV=Wx^EC0LXEOzJq2P#aHs2U>QYv$^vg*que6chRwY75;th) zZ2~iF;_a8%K?m;;7~Yu#TA;)QCC+um#xYpF0>*IhZlI)M(&?{KHtdf?w!`1CU@SNy zh|d8ODXbWbol)#(<6?vlNa3Uq%1aDFE-nS~_Ub8uA#fn5fc_mktxBmWQIiB>HIU(Kdyp1#Rw_@vF@M9Oron56vdfQSnZsN%#E z$d3g^P`IkOgqEug#{!AdLO{X;5R3CdASi)6c1ZDY5kw8{CG4{;G&Y{^-ilJGyGSZ^ zj|IhGBqBrtaQ+j5h}b7?!3cB{O0nt_TlGm@I4nq}{E!xCf+gZ(*ue$40`n*=PNi7! z6G)p*Y*T^RqDBl5pxDAOm>$0&4^v_{Qc+4z89I@3ooxDJHr6F2o_Xn;5M89Q4ecN% zD?T@~Hv4@FqYA2F^ZON0==x>-CN+_>QuWO;_--xJSIMm5$>;uD=ePCs7#^gz%q=;!JLA` zH>j-L_0HjJW%WD9AFy@Y3Hrfy3)h}&q&l|Tx0$*2dv(n-sp(XXgLKYJ)q6?Kf;lVI zx^WfT-HUH!)1wT16&DG3?=KJSHF<)~Z^6m4lUO#p1R1Wg(zo^0D_Eji$kyRk{ zgn$%1IP$60p8#7FG#H*9WNZ}v$G7pQ%di4Bib_#g;7zD^fF{G{Q=q#HTX_rU7gpX1 zeGDWo=q`rkpqHRNW9Qq~?CB63YxXwtPMFn6Xx_Ni2!hLaZd9?!m*5O8Xyw3?{5P#Z z@0X1Q>wtPz#aI3wH}rPH4s2TMYe4gRWNLw$N*J#a#@oE+cvUdoI%}-12aL1FT6YyLE(ALQ}8_&Rd*C=lh8i5;|jBAZchfT;=Kyxe5+`R6r z>gdKN#mxJlzjxB)e@vXVoCWH}@psepPtq`XYg;0DXhC_l(lmOehl~;T{aZ;FrcicN5PWPXB#!Lq^MDSr3xg2 zoo8UFX|@#5gZ>s0c+(WeQ&VQ1;n_%Kv}y`GqhNhmG)ktdDeGVMq^zT;5tIc(q~gx> zgA<5bqlW>8!QAE)JArncN1FiXM|-9$oTEkXO{F+!S*um0%#hRfi^oxPYZa_R%Yeeb z5j~{Zh_q6NT776-6v03Upos@w#@KLpa!>(M6wS05-4WuD2$0=tRqC3RSG3MM~J8-s6HcP@MLShA^z%y=bO+-s)gmhNaZT1 zI`8e;CqLk>JFhtxUS07$2{xi5Z;xi1A48+EA=?Cg+dJ0!FE=jldKxmGCuGkPi~beQ zUKq-Lx2g4N?_BTu&#EJ|ZO*hk1=i?5j3HV#Hh{w75xY$N;B{eueUz2EdQc z5r*_LgzXhOvXm(B;~_yrcF@e~IHZd$Kv9B%WZN2>UKr#Z?AHVlSUgteW%s-8S@S%f z>DVQA?E2KYJL~qORh?X6_;9quKaV~z^_%ljh^Nm$N~}(S<9SqlCqZVSkR!NYD!!)!3|9K;ogmzPGLYF&6-h1?98Tsm6(V zrw%ZbGVzJsu1&xlq$$f|%1A(Su?L|I113P5xac-J58WD|0 zak0vR@!H|K;fiU#JorLx6nUU5@Nj*gaU^MDGy?-zZ)#j)mq4E&nxwj0I472X&J8JT z--@XKp0}ueL=oMzH%to0Cnm=P0)Hog*F`8*tU+lq77E12#J3>puMiVh%A)g^$N?fb z6i$eaB__qQP)E8eF2u~dtcgYyHhgZFS8DMnj08c46v0E7hzAnHh$d%CJ#iLMkWL53 z1jQ1KPY7@b{ALxG@yMxD+f`UV0u%@Mo)9BQ#+A>ggGzW}ihx-?%uzkL5dTKs#{lc{i 
za^245+TBZgZUt`*E%nK@htrPV)O)Ww<{VjP-R#8G)Ld%u(8tFL_p{k2cKpu5xOP3XQ?4ql!Q#ZNlbM6B$p>Hi z)cLi0^_w>jlz#3yB|JxN%{*d*7`+a`&;7`hm17tJeh=t5@oG6qRJ1wO10e%(Al? zTtSyFynA83apAzVR@uF6xnlc$hIT!9ucCgodiJfky4i;1ijDUF&QK7E75OStrSVsZD>OEO1+aVC=O>)@=qc@zHty~Tcv5H*Z}^gR_wsZnTA z9#lX)1A2}K(D@2pd#(eY4mgw{(>c?bMrdEXep>`>$s40->Ik?JQl|Ie3J7`>8ytCJ zGys2}`T?t@-v*H2*&^UyhbkTTb)YVMCC?VXW>hx@3iaB2c%^Ct<2*}*1(SqB2K2x> zNv0zdP`7?-_;!RgfJEbVaQmMZFXn`93K)q4P=HyZYceP`0Cf-mns~7!bn#;(EJk%T zy08{03%OP{FfB?%f@SniSZ5WA5rw~S9ga9`jifmxtCQkbZ zO!|n5FHS&56`oxxCL>0RlNeoqNU?*unh1}K$HzfCCmmINA0*eiV8hcD{E)$)AsK=(^K=z9ryP6;p?N}*|QZ*vj=8(Ec^PFCVqPUlk@*FwG>}=4y-azw+w9nn%zd|kC5Lo^B2>90T=Jp zD|K5lb-U%d-79r_($0JCri{BycDKz#v}rymyPsHm>9%|4vUTUOduP_umhp7Tp3VhY z_B_5YDtmUP?b-VFD^v5SOy|>b=hI7VOYO^@y>eY|+NsNp`gv!@w_WyaU)q~_Wx_F7QuNXSv*PZ` zxOd6!T`TUVm#t5$FqzkqU`0_?%IljIkTAG-BDXZ+UqNLVRJ%*}XCm40Nc%YPA2?M0 zn#NpwO~dd~qC~9VIm7e3I=>!*qXN1PIs#@hUkS58ode89h5FGko8g%}W+T?GLlwd7 zHJ-|(Ohg`aC3L(_v6?#9PhI9hrV%ic-T(xf;))zDI^MyGimzVE0;v37 zVcwgTf;j+`p_*70@+;j?w2C0v%LB~K+_dT{yCr3L1!UnA`3-@~CIn8Vvy|VK{72%y zM4%fue)8ZR-`4~mxL(Jj=q?Q=hy$|(-oug)Je=rg#f$sAd)5~WiE&AFh2l`45RoRt z*P^;N3p2b4N9Ii*jDv%a_d`Bx=ZkBOSSb;Xsa{tsFLW2*g|8If!wAk6HE|s;XDc*0N;{oL`3ibov}O-=b&xUy9W5E{qH~%sgDScTqRZ8 zsEUt2Rb-gg5*bi|YQF67&R5@dY`WX~+^s!#dIwe-kITISOZ4}5WSX|fOO;sN8t`9(Z1k-gCHh^WgH)caP4Rv-KOZ&AzKg=8nuubH}qS?U|MyxuqxDw)uB9 z*46pYNx3#;oL<@K%{Vv7&P`e8hAT(sPhKCqHn_-K3oP|#_8yb>9s~K{+6?l+WrwT9 zQsSE1t{$E{Jbz|Uy3_nj`WVy`WgmKY+d!PnG(8`GCI0O$}I5==-^TqR0k&m zh}0BDcn=6K0h7C5fy)NT7t}N`xb?uJg}2JZ2MO*opv@4z357?ZzO%u}0b*MFEg~KX z^_bv+P#GN_jmG!__rqC3VJrRsBjov#D|PWIlq8$h;Ab4^3+#f1VRl zb(!kTa`k3VHmdy#Cx0Bc5s)|RO&_>xb!kdNi)?L~odC-M43&steP@3U)nsRxHFuUy!XB)%v&{OhbYT}w&>Qu}vT)O+?G|{Y z3fy6^}wTPF`CP z0PZH-K-6{cH~}R@=E!$ryoMunv8;@(tvo< z2($w4vz4;#qu1w^-&EDkG)*_no?P%OCO$s@(fK=72h!$8;GjF!BL$b{SAEw(-ccl5 z*5Ld6yABt8k@x~G2-^D!2M;}kNQuu_=hTPL0H5v}p^rpQ((5Z|plE@qPZTY<69K=f z$;h{CgiJh9-=v2BLk}kJfG~lYwj0zm2Igb>sW&#{XVw)#_@sQ$40m{fTmw&#>AWIh z0!4&P?$;HO6QTguD&_aJz-DN<1YOfh2NDDr++Cj@#}I5TtNs 
zntWHc&TajE&njoDv;V5mf5i-iRdtz4AG}GD%6)e=kRHNqxrWBM&^zBuwz*{cyWsKgzT9#1!CFq$P}?zMQMUECRWDqxxr< z!U+GQ6nxPo>b{BNthpMhb4#{sTeiu2-_gcBd#}1R$H6b?svxqQ)6Ut1T)8o4!6GYF zw{iZYTVy!mI%tC-C7(^QQo+uor*)vNTVv(6v9{^GS4<<=+U$|tmkdlgl={G+|fnkxt1 zot%A2cDLrp>peSVcVBLPw|SPGef8?8xl?le)mvWtNp~&<`r^zesc3QF?aA)xC3ja}=f*LW})3M%DDK z{#8;2R@L+LQ5fIO)n;oNW{yk)37GEex_d3RgKjemV7SN?qk; zx1X#s)-_Ux=>Z_q0-x+{$34iA^!|h7H&50>4(e6LL{nO%#U7KNeH% z-74}(wD=`GQR>vk6nMzso%A^nR8c`JX++FoQ(RL28>k8tsDfP}&c1Ac-3N}PEQwhK5YY4?Z*=6M?Y`{( z{}~QRF=NB~kR#~KKmVNjIsffD{12zoPC<$???v{sP}CprMNe#zSotXsQ#xoC^foF~~BkUkccudqXVjZ*+nvHTJwm}=AEm8Z3W6%M#Rp6rhh;z_M zQzq&JCD?9Kf?aezFl&7dx(L?+T>gQn$a9a<0q58FG7z9u^O4vnJos}`JS@uc2`QdR zz`G?8315whK-Gj(k|f5G!9+Ba9EwXLz;pB^Qb(llm1D8g$k#Nq1Uyf_cr6kZ`{E-b zsaPZ&O2(z5Az2J-y=pmGBp_G550(B77DJ>4X@MFv*(m%I>I9Q$7R(|ute32)Ezo0D ziV_$op;4@88R9b35G`14ItQ(S^(HmQK`ZqF2QAc9R4tN%O|--E*agQkzLR!`IDx;( z4myN}XSVa4Hij&M6KeBy@^xmNJ%Fkg>nbs+R)&)b_m1c%5RsEoBrc?)qJ+z>He42yLEP~W6pkdta55!{ z5}Y1{g~eUpNl=okwi}-lIVh#VLV;$r1rkldv&ka z6OCNzNsK11#A7|zqS29_in;U@&t$klf5<&JlS)o^VpM8`p2nK0&Uv@KPAK;B1Z01C zmwM`=Y_18b!Zl4d&m5mUJ->C4JHFK7&$V<`?W;+a23o{tlnr*t=NsQoh!gtKFp3ZklBlYrE!Ja@$WR+fV$SS=@f+ ztG4ZP>lfPgX8OP2eFc`Oa)V@elC;~*VE&wq_rKw@$R;A&h z4zFZ`xKYC;0V;oaSUSWsh$JWi)4(UR$`Yv$n58BtsoF!)XiyS`R9GZC7%)kGf^5uNy~O1PI^es!5wdrwJjds_CRpMe*P(D7GsoxF=XUfdJNmL* z$0FCCcQ<#g~6k%jTWXr8q&<4l@1S{z+tIHhg51J@7BKnh8LP^L(KRMa{ zq!^Rql0PE*<1xaP!O;0Nb>k0(rMN7Ynt~%vib$c@Rk^#w)_Qmo&hDGo4z?YQheJ_6 zmh;Dl2uBd*WF!_!MxYzym#dFCPsXl=q7lIl2j9PWY;C{~j@0Vmq5k#DS3(KVFTsHV z-^?Th;3=V)1!!ptK6ussc3g@IK{+xe5*8?DAXO)J2$rH~(ph)~bArpO+QpGXax@r? 
zTopA%V7VNE2}mjyx+)W0pqjy73vfgs5INPl#y+JkY=jP~4y_tA1QP6wr;?~Ow6&Ev zC_}aaGEIZ*0M3}FT7zg!pmveLV+HL|sG&UGso-QVW6#%krtFjU5BSXZjCs+$V`k|7 z@V()Wqd*oYlg*xYwM_R-@6P%U&0qWM`lr|bWqdxK zaf}77sxRNVyC?6QoVhwDFZ%Xp&OnPbty#_oHCoqY&g9)bc=_u$EmX@nnpn_(v-tXL zOt@0@mvrJ&STWFvCV?($9Mn`rEdeW9O1gF;8VAc%wpJ{qmjR6g-ZVNrAhVU4xL9rs zG~p0xIg>znsWIS=GfDI&NE*g{uS#ju&IaB)a1Fe?hh{xg83&K@g*6mQ455LBddTSQ zfgxmv(N(BQ#`DBE;_P(o38P_$LElyqz{X2)Zg@sZa^14ODy4TR^4bC@KM>jP}CUvJv~RCqU5+S(yu)QA#QDaSsn;i8|KMTg={^8A|VYj@Lk(sRducjh-|o|;X~_KXv%mxa1V;d=6X^YotC zhFoBu5L`3w(nk>;@*~Z?Ra9QojaFZP1E($Z{1ll-JErGJYh}F z^($tE49>3U2jVcP4o$lTg-AHLN+qD65;tid5YKXxjKm(pHU0RWr#_+4$dF@EBa1Pi zcos;1;5GW8ap?$VXoqC%u%zDm;bqKUt=_8}5wKn>WT5C_W(_$vEJIt)y$u#)u4O*F z;69q=j%t%N%TjoilZ^e5N%jKq|C^*PKj$f_4=R@?DT(20t0t+rz|t`&w95ZI0gP`? zue$nGtcKwpOpmLcUj2f5cb3~tCKoUd45&B^H0)phFC;MNCxH&sVStB$cQB~hgT+V= zXg(NxD;0`rUnB;K5}_u7tePPJBvBCY(^bm_;^3%ekY@?)u5=hOmBC{p;R;IqnBkmM zXE9(4i=YOAFp71+mC=YC*6g+ei(-b3p$u>n^_QDCyG>j3o!jy)txp_ows*O{t-!() z%vr5xil5}ClS)lT!HO)7TGKIeLGc6%Hhi^Hp5{Mcop0JrtOs5X9j2d{+t}8;r)4TS z87&Yx(5XH5f&jv|Wt;1kZ^AdtC^p{;3ydd6X)oJ4(|F%^&sU&;oa-ZxM~(mP``ovJ zoI)?l_R-6Y&C?yzDU|xzu+p|Y>)oz2?kKSM4Myv#Em#T3QFhmgjgT&e-LvehDX{Q_ zB6e3Me0%rQfyo0i9ZLO%tZRc}+xTZId?Z^*)+P*VQ%bm4m{zX9VwDjsffmg}pw|JP zVF1x$fo6cV5R}UzT8+>Q!7ADWmY`f_fdxc}10=~#kgp!!{)uZGH-sJ?m%O$#vTM}z3@GcySHao<-Y60(p_ZO>M#JlQ7*+7HT z25YYJmEgcC^-*%0y2-Ybp{rpO!m6aXK;!5vp{!LUaFR?UFof{%BQur)KnDejB;#Pm z$Tfmcjwyz67z{GyK&TwVEC?=`+Fpljko4Pxc=hxS-NqD*d`jUmUY` zg%c^jtO9n`6qi-27`qmc;xPgUN#~(yX(wi9AXAy)cqFFU2@XNj94H_HXF(H)d#RQy zp_l+LvsJS;SY54;!~l|tg~eb*0Ngn;6cHs2ZelQEfj|Vo%pi$g)CNS)G)B3!j0*s^ z3X`VR2xipsZFH*y8_7^Z{ur`%sin34yTf;ev)~Y{-Jj)J@}Bmbr(5xKXB_!;yR!b> zbKjf0J~xtG+m9c8n-NE9UhdkS>pG-#9m;j}Kkn*Z@ElXR`e!culRUla=cC!K{){8* zIhJ47KC|?2-&?cZa~bA+_p-;E z^K4c;n-@G=GR)5$fBnWrtvRkm3ZGl*j@B{KRv2;^^|zi7y1=u!qJ<03KletbeYkXb z^d?DgVC3pkMvlIbn-AyEAS5ty_jUJuoURzT0q{FqUml%KJ);)`utJw0fUEV&l#yOA zTpxq5fP^CkE2R5ctNQs4XM@=AD&(Mp@SOu~>THITdTTD{VCVr~e2Mub6=9v5D^l&w;L)ue^2eQ8~^T|o6*VV4L@ 
z)@-m3T44MEl$H-d276GmY7aCxs^-?IiBoyL7DC-6zE&5YPKEEB{lQ${0^j>58*8&` z0hUFJ)>q-}C1Vl$?zacrCSAGHa0jNTsc#)+SMewpF$%KC0Y+O-ls2h+pc#0@rPM6^ zes~)T7_6i*;PKB;TyBR9R+eA2vg8({|Ng0ar{)^xEnjdiYkvO`D7#7tlpIVGIFxX- zOGfzlh?{PD#?7m0;7(K}-AyaiU@n7Wm}dH^i^suzw3KnUv?WSrP%b z_biSWZZX(lyw0miY0LARN4+*pz5fbBm2WPL?hTS9l+5S`7Vt0ja_Nq>VkU@8RHcU{ z_yTSO7{gIp%Ok63%UjhDybhyrtV+WeRa7N#EBsX~cmPjgUN!~5K(ADC1vHNncalScY((c^%LX4UTmfNN43R5U}_a6qG-Q z4CK;IH8f2-a!s3+rp*gYTNdiKX1FB?@VDl?orNVeEx@VEw@>Qj$5m30m z?D}7C{bcLh*rT0`+=*p(y{0yf!Bkeor6ra7TM)nR+9K+L4>2nM_yX<4D@eFNKxm*c z{xtO24aG}WN-v@s5JVRU*bOl%1o@;_Yx&0-X^SqZ6}*!8;+3dudX@SKUGdeZM(M{F z{ZsphdrLS?Q+#J2(1 z_d~}7o-BdEq^0Y~{#(d2`6K{5ULZ(m%usq&GYF-o7wDvnH!o-^^`x216TmeT%F2@;Q$%+PpAXEDQQr>u)LSQb5LDS;V*UeOJa&S8cCNEQ7038i(4eh0y?v<%+l{h$p1 zXf@f{tuw(D3Lb?fC+nG(S15QC{L~Tp7yzecc5lAXJN3ro8wEmd*)!juY&l3?mTk@z z777&Vs0L4dL*R+S$=a9Q4Y$%0=>iKhs4$Ordc)n$JDp17rrGsMV|T%dd=7z88&ZJM z`=$lO-Cn@T1s-8iCsG@!ja#1BEbL)=xuy{z-vWy-&}2;`>A*@}0R}JDCuN~MXuv@} z)zq^M&w^*+6SRcrEK(Xci`14nwuNA`@Pq<}*z8j)eALzzjd!3%`Vl--3tWA5e8y{N{-4pz6V&B2}`xQ^`nF)+E(MKI%Vxl(ipOHLYfLgMAgHp%DDA2ID%l z=@R_zF=+66CJDnK?NH%RrS~xFzzk1>${o;t$#qEjM|gz;0mlNef|;i26^^3qUs9ew uQ2dwFmOoN~FR86xQrrGW9e8T_F-^lASm#p;i9a7T2Wh(H8wzvM`2PVntE8p? literal 0 HcmV?d00001 diff --git a/distributed/device_communicators/__pycache__/cuda_communicator.cpython-312.pyc b/distributed/device_communicators/__pycache__/cuda_communicator.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..265f5fb529c07fa89fcc34d139ca45e731b03c01 GIT binary patch literal 14466 zcmc&bYfxKPdiUxrbU{J_0p=whMhGxCwu2qOc5J|j9l-DGh9*&D-HVZ|2f30+z#_G` z=|rUI1UI_{Pj<(+-8S;Xon@y#7SD7VcczbY+G%wGkCeOdluWZTyZ`Xona%EWrv1Ki zbtN7bC)wSZ9$V+0d+v9>^ZL&BIv@X5SXe+o3TXbs_rhL^`fq%ZlQvs;_}5UFrC5rk z1Jp1+RRKDv8dlL#ojRx)){wF~pbhGVb)>8b=!1q~11W0*#$ds40V(SOrXVxSkg`5d z7%Untg0g`%2FyXru!Ypqf#RTb*p@Aq440@Vj^Ro_q@@uJ+f~#v6kG5n#hN(Fhw5x? 
z*#zxket|DTE?Q(x@nJ6)i9ExHV`K1M6B!N10<1ga^#-MWP&^hvA~U&+874MewsPLfT1n&^XLbxVez$LV#o8v&k3oMcsk$#fuyduNH5N z=ebbSJr?jpN5XvY7dUwMVDIC`eO|6N91O-nKCdSl=8t+JoL8EJl$)|I4_{GH)LXFj z95qa{)Ue7(;Xk&DRdH%o&1w8(_trP1V3#KKkZS-!Drq_0h#^6Z(5&vw!eKqDf0G(E zu+^*q+FLia#W&W-6^v+Lb+yAL=&R;&aIpRiS5SeiNM9p5mVtJKY%N>34fZ0gAUiHI zO9Q+G;ESN2TDBNk)F_O{3iS?!dRrcoa$h3`wgj*jv!yU&D_J?mcC&$bD6R(1jjSiE zio$H|fZqnAuVc%U+q31{)K_pNteGoS{;FI%qQcR@tad=J_Q&=rVON|Hta&B0FH2Cb z)5GOwDObH{>5Z|T^%OZI8ogsN_eDHAqK+jLD>BkETWbR9eKp3pL!Kb#m01Puk`8`& zBp*HtrSH+RR1|ATnzqKNKLyx09aqKGam|MssU8kG9oLN0yp~l>;ZGMTXVt7GpbDy| z^fVRMPidouED}nI=%;jXZJcsr-z3F#@CL^~N>PhkQX+VNsr)tnWVI~yAsjwTtZtkJ zZ1SO`U}X9L6D*uU=B8#1fuf*!%1Fw_fF)Qwg^W(k76hz8+ms2)6tpgpaKYRue~Vns zKUveIK8oUxs;IaTAlmX@*2>|IKLsaY=PpE7gJ zj(3Org^a#(!&{a77T59nquuhGzeg_RpSk`UU}mP@dS>|^wfwt)W&0Jp0{s}iNk0I4 zAU|HY#8NT(b(${&+Ee`QKO{rupA2=oP!>k88@Ux+w?a?nq2NwQ!?&kWn6qOeHYCoie7-ueZT$$zd)A z%uQ2;1asjx!2IvqV7BHk+W@n33RDOfbI~`zd}ABTB{|HcfVp|fOfZ|j0cL(1%=R4S zGQjMb0#ygbZ21P5`?tYdp2J)Lm|Lcb3FhK&fVn*SEqOQD9m>7DPyIk8@BVk_=(lBf z{~ySu{FAK&94GSca>@UqT*^QFFXzj0iLJU_EuSQsnt*?g^qH_nsUabg4!936Xw5 zq$i-Yxi=PxhJ#1eDSy^e6GNaGt`~v;Sadq(^IihDGqRFsI}57gAQxN@$QyA`Dp5=1 zC0~SZY94tODT|hzx|vlMMQzCA8%LefHiQM}U=RETJs}Thk7Ai85V+`xj)EpCDW?%o z6G1NpcOa!=rL2c7DAkuOeOJ z6(i+5t?f=n!U-V=y%{X8G$MV(~CiAEsnusa;`a-z}g_5wvm+-{^dRC(QQmrc|} zxWLG|d72R)>Y6+%VEi7;km$&6lKtUvoyoe95pFvm9=T@(+d8(SE5CWUBN`m*81s$o zih6jUx?Qiw_CIyt(tfTZ;JeT{Dx!=<(|W;BK36e+EIF{)x?(t%F_~vB3#R(jhUOIG$ka6n zOwG@j)2j_V@cL=p&UD=oq3%eEsab72kkG6c>N55F@LBtLb?uGe>%%u*zW#Em?daWo z%hk`OPPtRnZv49A3!TbX@P&>to}<4|WsBdUS3A4#YdLiK?&0afbElU~HJ{Wzk+JWX z4P6Um$|`3sU%UKBt9BG6`c{~#M+VB$x?*no)Y6*lOSkn1Z9OZNgR32T6PkCJO#2@4 zc^ed+E&p?fgh{k+iqd}{ymkBqu1K=#hSBMW75q|Hvj>|8Os z9x#-neWC9sLq8t+c;A0^|7-V3?YCw$-(^-!=CrAD*;JV^+tTI+!Q3$a+LF2bzU9DY z&aTCMA04`LXxVvUMsv^7m?Hf@laFP8ebSJdz9NdT`ilqQv}QBc1NnAXYpM6;A!FD=~T~|RNdKS`?-|m+~XDahN_p&j89*qQ!FdSm|BC7a^&K$X$r$XLkiqMeCa!@o zXto)H#*d6+Q_A-MW6=0>W=tL|53ywoIkPYiJ*ACnuQtcEeiQ(kQXUiZrS+pQ*p%`R 
zTl!MiZ!*0(UFrw|MB7tL(WT=DuukU-fqLwQsl}qwpm7vDb`Zf6 zEeT_O5G&E}nXKF*CG$OA0f0yZ4)7IfO-C^n;ManwGG%hEw4KPnE8%5n#wjq)r2Uav zwX5KfjobYojM` zn<=SGmozSwG-fPi5;shauUI-)trcl&i(qX@b}n1@00x_V=5UIspE(Sr#EFd6zCDs= z!P1Mmbb=q}8aGm(LbH#NgW37Gc z2(kj(vt+8x*eVi(M5I+OnW{f!EbpG2J~?+R*_dwW7FxRRviF$hVB!{gqVSQLDg-%D zEilz-rdeQMpbOFT{%3{#&!(8>73O%xT$(84hj5F~wiMN@FX+v&bEv_MBq9I|@NGo^ zFjUIFUT{DoOvvP9MhFZ5Ab1;iGig5xmQ6|a?Mwn|2ZloXa~{b&Ab37i@`3cDA-O5Z zIFPX^i4B%Ut{PWQ!mK2Q6p`5U zpjP2>AvdZJn$A6@oaPg z>`D}|=lD~Qy9x=@lX&1OP$BAr;c*U#N-Xe(Ls8$wSU45|b9B@L?am;8mJ5zWCwMQE zB?IV1tSyx|ojYsIyTP`Ogn7b?Tt>ov2)**JU^C>iq)U;JSds({U5>^=mxzHSGQgye zTalhco%9w#8p_!Wgj0z{8+)Ft-NLjs0wB#regoMRDpOi9>%Qhr6o5LLwl@j(ri3YD z-jOmlr3_7wXG}#i#c%IlWy~|3vt8G^-s=&V+C=Y~fhw<=edF32^PZ$_xolUu?0`^q zV7aU-!K|7q)8;zCTsIF{u09s<7wwp0w}I^i}fkzL80tmg897IK3jRMGQ*V5Gik>j!LettC*3_HbPoYLwH73F zj|wPbVcJkH80zOclWnL~4haW`QZKx8&+sx~DDS>8{l=VU-nMMsnKthh%)6J(ds2oy zgnz)Mf*~96u9y8e{qgO`F%Qi)%mWVb{|ocbfp&N>YJx2#DXy0F%D4t~m@LcC`574^ z|CAU^6h$UU`L^p?BC}Lr0?3-C^dVRLF_`0;Ev)PD(8#*su8Z9)SzQ!1rAMxtN>Y^?^Q0rMUPHq{Gf>$ ze@8Y3S{y74Mc5dB0ZO-MJ_HY!InPTZ*1{qf2bri2a3Ow#lp?X9Sjc;TwfI8P4Ig1; z{6z%LF*4D-kx`8ELHO<}*v!4K<8|Ho@D=SW{t{*Z$RY)h<=B)%(x~vJaMJM=%xV0e zu=#0OtpYRu_spM6mXxXLQ_z-T?{&@ZShBUInAVi7HDjwy+ja`Jo%0^S)_l8nLHpr} zC0l2T=}g%=GZi&+(RA$-0{pMoGks!p18}lSusyk0ykzT2F$emogpVS2ur>FKhf zM$JZ|qK@aHF+Q|Gf6B4IX4r&T2uJ|aunHIe?91lFs!)eeW`=m|K`Ww^OliI&n$wk( z3}|9FAqj0tadJftI29OK{zy*HQIzi_DX}djJ?wYw(WRa|a2EXB;M{gd3+#x79pL15 zuv{eS3wffxaL9ot(BT8;xZ^?~?7f8U@Yg&Ol7R+FLIj2^ohvkEc%oK-6Eeh#T5K61 z%7k=qUjhKQg+Qx@k6|_rSp>NfoI2^aO~A`!>6YV$_Im;u7QkaN^{Ka|K6(;p z{-^_JCqU0>Aq!^k!_QATr}>qR&%G+Oymzhzkw4 z1afMLl5q3q1a^K>@!z>mpkn74&db52)jC2R;ln{f%8_u4_i}`a+w+phWm-qLs31#H z1&<&x%CmILbKY?t*@Z;Kgn+dY0bj;iBnP-$g$(IAf;&e9h(ye9z^ei=Bhp_Y_$*{# zhZ`v4)AZx%IjdC8olBN3M1OMm$Cp=%dlR}mm4k75eERsz$Q*maf8C$7T@NRt>5iv` zj;B|cBT7cDlM0F|SgTvO_5 zy>h*hQX*TV*+>x_D@k}ydy5`&X(UsLI8Khi7g3KKjt|2s8;C0(J^nq@Jt>eT#Za^_KcA{ac1vYFY($^?o2bQxr!|61(m$ 
zd|J^@@U{W!6;}Ff{j~lnGp$$RH$1W2r`1>O(VQnBp@zUc^_zxibsr_KysW(Qpz6L2 z5lE~inv;b}l8@EqKE@xK?AUS)(3O}SVtWth1D-(SP0fwt{&aHZ0)4CbuU@*-|LX{j|BtMtwB%VpwS3EE2%1O_{~2(7PeV9Oj>e%r zo8;9*AJ)~;byp*~>xpT@I%80g?x(dxuF=YrFE3)G=>*Dag>xgde;*?SAA(qFdJ2D-b7S~%M3>Kn1PA`CNsXiKphJ?sNP@&1j~ytC>GB*B)NsY^uA*P{BM6Iwd>hb$?=D308EwjgEQaJF#qKHlXJnOf3bMcnhXgwJ=aXD zwXS5{^{M2fQ2SJ>rhC!x!JR*`>5~^xV%-> zMTtSwElOeB@HK653KnOwaAExJ`IN=EVnOeCNnHx0o0=-FN?V!)OVj++!U)`LXKW6r zCjyTU0e=Nn-c<;(N8#x4;8=hQLYz9slHiARM*9(f64gKel2222j<_pDO#lLHM71vz zg^h>pfzTR^kT8+hCk(XV!D#f1jiFIulq#XP)xn9ph<_LRkZDFnJSGuIjy_&-yz+{} z;duIKLLdCw&}OpgaRdSJw^Zc60PKH2;bSOpP1e1A;8W|)RMT_I)`1i=u)R|=c{$zD zyVTK}GW4d*=*1+4^$%^!$Xp>?sXLk}DNpo2&{&OiKa714zZqYnp!j!v|7g2&{9}#K zGw>^XCzX$yDaJlC@_p_`;Cf)i)cT2S=ln>LTej^>G5c2S<={y**wTht!B7h^5I==0 zkX~ZK*Fd-gI1&~b7SFHL_TFRqU{6bStRif+V5o*TiB<5x8g`H^6$?jwEX#$Y7^et0 zap2TJG!_K0_@WW91pjJ*gFtQAB)1$7wL$O6S@Xybw$nAUPD@1_-=F1E&^(uj<&?X^ zd=a?N6phj&t{Q>sV~GM)Jkbs@#%IhE^^St#NEpNWfQA1)W{y?F;O@( zg@>$qE|@a6rVOoQg*W&r-< zCcG%<(%HtxaAmpxgj_~$T^YQ|YZJ=b(&hVv@_n!}smcymmx_W!Az6}?p^B`AOFc9s zLI~zB{uh!<#s4Ls93mUazX?SbeauzCKL;hT!0ir(*;oK#r9{TAkc{$Lg++c&8fLdaMWEl$Raukml-Ze%39@gp0H_=dZwr1LPXDX{7 zm^fNn1RlLLEf!!k;MRe;#>_%!7P;1YWRLq{Zc4Cc` zooydnzIjT(y)+6Q1w? 
zVBqEeIyo&*tZCs3EJjiBnjT9KS5-38J3TSCPcYX$#Aa)wG}YAlz*wMdTP>?X!Mdh} zGPEftVs=fBZw9KiVGUu{3aA2eBKB^4IzD&)#*5cqOqpGRvE`u&zwMxo(?c*Hy|y}2 zU5{VaNcl)Vd1lA|D=mDNBpq(3t6Y*@#Ux%~eP}2lLD5L!9Wfx3?*Q!V32|wzh(?(Hf|EIfo2aQBw>=GiHu1KLFCaU6^lrX{C8v#DMYgT0{qdK zTOp*6M@tac{_FThXB@v_(&$nl&K4eR8gfecXR$<%z#n622(v-RL_?2sdJpk`3$Ji0 zs0f~%H8oAs4+|)|@IKY@GpZaS@n{tU<ohM!W#PmCoq-(EJ> VT+uz!#WZwvs%+O66c$Kd{|D;OPeA|x literal 0 HcmV?d00001 diff --git a/distributed/device_communicators/__pycache__/cuda_wrapper.cpython-312.pyc b/distributed/device_communicators/__pycache__/cuda_wrapper.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d324ba3e6fa7e731fceed0ff67c20185f10599bd GIT binary patch literal 9500 zcmcgyYfKwimacMD8RG{A48}M-OkN##0!~5#c|d47guG~y3F-7UGd(qqs{k{8ZB;>n z$646vo)wwSZWC!Gh!GNB#Y`@6dSc5!Q`90Q zP@|MU3x*IqO2d~48N$pcLt_~mV#C}hN79CnanwlYD#E5w6Di|Dm0|O!nWT*&OV~PU zr6~g?nnl|q;EiFlN~9-Sv~*!STD?Pgje$B(2^Akuf(a=07PisSj~MMc>JS-1TM4x0 zM+UuCZ!lUXSj2k45U-%9F@s?JfE{%bY8y~j<)}v+W(;0?{1?|Jg0g2U7!p13^?PDd zlIXcQqfA62o?DVX7851USXA;T6QU)t5bbMTt zAX^=nk|gMdkA?gSh!YOzx*{J&&ukrr#3Ch9U=7r$L7+uu(o{&Or9rb`M`MO@dX$@C zysTP_in$mIyd{P&fM_9+S5#9lqKHz&ANqO@S`B-Rs)^?#{;~+6={3fHG7}SJ)fnJ!&L|?Z84HRbLFQMfE$sXc@A{N*%ohvB+7-VvE-LM}r@Bv` z_R!GK?hO0o|J&FfR#5u1@v zQW7;oLUuv)0hOuQwRGd*dk@}Q8A&xCPF5dD+K>L}1*tYiT3(Gup?X`5Z&M;8vI5-$ zr6LC@14$W0t`SREfx-L?>*Z9_pgz4SiwaRIBvD3%s3s7WXlB@3Az6T2vSMU|NM%4y z)z0(&NF=KGiE;3}#9|d2LM7B7DH*Z@Q}wy14HB{wq90LTnQiZ1%-E{mAIenKE_B`v zK4lxUCP=LsB~h7@A_0A2LlH$8;SUXG>x>2^^c`Bs*9(Rm2pp&G7!rm7>OBCZJ4}K~ zFw>Mw-!lo!9h#;hG)oCA{>v>(no7_zJ;Tm2lR2}*-t-`FIlgz;1e1VvZfY_IYB?hK zKGM^7xCA!|Fi;w{`3Odwg*(QCak8?MiUKgo^(UB+ZW4805)xF7Mz&p|gvx~RV*sTD z^)dBVY;GJsq!Zi-wTC(avfl#ieV3x)-@WPE%)8Vr+H3K9L3L&ig#JJvDha{JI6{fP zFf*YjG5M6w7xqWGJEGD!SbSvaHfGyn{&7*p)Nx--iU#;tP~gMyntuT1Ej62|@R?*DCF{ro`2-mrA+3wvwE)wICkk1pN4 zv><xpmpJ6klmxyS+AAUkd-8kdns zREA5*C@OZMgI>z$V~4((qP|V9;8|uw<#~#hakhwS zZ+oB)kr^t-`?_ z_#N7ze7k)+wD;`TSCeRkF?Rq>o8S|zuaN67(CnUZc{^3xS@6lOT_4gMY}JM?+K8wO 
z5cf^3$E=nfnTZ4@q-Z1<7ghUq@ikEv70n$zFDmCGDJqQsoJYo0GZMW8esnaTnlL>m ziK1RAhOw1KA%+978P$Ol;Lrk-yicpv>4jHdx7aSVP6T7yL#%_`%OkAHmuds7fqcpM zDxpXpUIiq{=t!A1I6aADtA*;)p*gg&qLF#c61wx!5sZ#v)QQnCjE-Z}g;6&~Conn* zky?+N7OzBe`w<`T2PQ|oB0}6T;G0^F-S|LZlKM#ok{Qw3W3jD%WdRaqK+uJE)(!VKkQht*2(%y8j@M`wYgKPpB*{k}t=512}o*Jk?-?c(VF|+mm;l74WG6XB2epZww8+#h-n9p#R!+e&h;&wg37B zZ!PitR91{k%c`Rch2(=_Ndp+6H>0v=2Zn}J2F4+s!+b3`yHN?8T|R$FpjN;U*{>*) zgeya>2+BNo@?j0%aG8TFx#cY3Nl-$Ik9OQ{ON@xar!r;X3iubz*_ttlO2Tb1h!Df8qBHuWdB=QVqVR4M$!u zRQ>Lay5p(3<7>ia!B2x<)V=XlWAnqt2aQ{mTz%~VowZWUZ4ZYY3@y-qS($+(+$BKL zoY}i?!MLC-&fK5*qPi{Xpc+qR>!{QHG>)<(W$jqGm9lnk0{8mvRDJh)MXLUddG^oE z{~y{`h0n z?f>kBb7wEd-NklNSogQkN?MYgDvK(dAE-+M%^k1YU90$?TB}gp7ttDs&Ql&wl2_)nd6^V{_B}8=HK+2j0QAmFOg%)}!RY-0x zrztQH!!YqTRt?g(ft-k*6PbsCB9U=63R3uxkCR=hJ08nq*-S^D&#{a8m#6v{UqyUbuKrpO{H}51E@o0aJ zO*{}4iPW7FsVn8~PP8m9eNchBEFRNBN348|a6+d1#)1WLR9_I^Nx6^yyz8@` zPkWvi)1BvkQ%8aT`QM)^u2xbu{HZyyi^1Pdqc7)Xa&S_J^k6-Yo$4a7uuMhPjY8tE*4~+u{ET z?GlDW{x)IT6g0#mXDA{ltD!pzDupl#LnPy1V8_!T0;$VRH#y|}5*zG;s7UO>DPHSZ zKmKH2+I``f>7ph&BghIo07!u%vC*!{Y8r(Eyzeu3sEeMVT4*xYj9@a$PCBV0ddR^o{P$UA1*)T62Fl!+Cf3k_E*45 z#-#`MqDHE@C24BR*gf;?4hv$fbDdd#`^iAsees#;646)D3dkGBkAY<&<`m%z>aipc ze9`q$Sf%4j{;*f>5c=Q47d()AIqgL%znc=ck%CyI`&Vdes&RrmxfMBAtnEJ=dmLH~t#_q6-h48cwqAb07%I&$ z4b{#Z#7qO9@(TVNs3B%(C;;DA0Xw`Cm3!6M5!i{T7Vs_y>`cyd(WGb=Wj$&MkojJH+2U|5(K z^Y+Ae>0y9jJSq%(_mn&ylE*;Rf`SpxomaF?jSIj54jyp>d;p%V`IuU9^~%NJ>*uaX z6Tk<{aSMlM-YRUWE$z#&qKV1k>+KXYmrx+-TNn*sD;95$B=o7t>!gHt4dm!5-M|zv zbo8H91{3ZC^+LwI&blTLTwO%^7Vtj00 zwXZvW+4PI1C)2;4{d_k0+B-0D(98tQay>Xc;_vcHV-KSbqASynXIE#RR3)ufw%KQ_ zZG|IAHg%+JzBT7(t)I3gt^I{`y9?>^BTja_@nrw6+dpqlTCWo7>^b^1w=u~&vo~nU zeIV1;o!NCT({d!^Y|GR&V+{XWJ2Rf+&+9F_fUqIkLbdynY~#|gWoCI~+4SH{n(cs7 zc=yR9>ss2oBrgxFI938Hu1|)Otw&St&NO@CIornd(?BnLV&0-KogJhZu6Ow~9O${; zwPRcO$(m{IJiTJcQutkGw#YYYq`AK3vsl=-GJs`$TTs+bV|+SmWH{%NjBK3Rv>_`e zvO2Xi{c!fdEH*raJ-`QfU80wrKW+V_70WNt>kjSvWJLS^<~kByqO(ShyG&;sjf;Kv z`?3tlczussS6lT=|A3bGs?xT_VRg2Oq&@F*@H>XUgj$Kecj&-- 
zBk2;-SV&&`?T3VZo?1mpa=-nM$!!op0@bLK{Q$_MI7YMBpc1alrvmWiFKU|*UNmYe z7au6Kn<%^})5h+=CZ-=8SM#Vz2)wbT>3-$$-b(biZk?>V=V}JC~drjowtF_XPzR vurY(>8v{eTV76)376aKV$y!}>_43Hdu}?;~C`f+O^|pa_{yT*+8R>rlJFXsU literal 0 HcmV?d00001 diff --git a/distributed/device_communicators/__pycache__/custom_all_reduce.cpython-312.pyc b/distributed/device_communicators/__pycache__/custom_all_reduce.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea8468af13dcbf964608eb8ac094c5d5a2c79652 GIT binary patch literal 14661 zcmb_jYit`=cAnu&!-qslq~5R5+q5k?vFzBAUy=22WIKvu%So2ql%Y5ynKECQ8QPYX zLb3~EOC>HWZMT(zW*fP1VHtK21z2Dmv}n9Vfo@wA6+<~xrb=O;DbVznV!Lfp^heLR z!{JagV{hBNkY?`Od+xcfbI(2JyO)1gTx_8r1P$*7dUjLPKjViQOqs~TE}EhiD30Rj zAQhrLG>vgxP#4mB^dzni8axI_(+7r<|!j_Cg=>gJT4M92g^eho(dAT1S>;To+_F$Q#y(-o`hEChg+Eh zptpKzfXc!vn@t6>aMd91(^~Xdh67oi(qFX1|_Mbg7`qGQuzLO`trw0el z_78er?0e07X4o@$Ms`g^W8UcQsMqI57GD23-hUo=I0Ip?bdL9mz7X#XM>yW=6OnZw z9FV-h$b`}ahd(9=d|2{EgFb0IB7}gX)*s`1UXH&I@bg}OBnG*nUYwLWac%Pd)rUl;N>JNtr>Xs5q@ozyS67>l8Jki|gXl z`E3*>&{xb-mdB)l@Y(VZ_DQNJQj_K_S>%Ju@}?9R* zeb5SVS4K>I!Jt>*xtO2ticZOl7aIj4Db!+(@GpAc@iw(lTsGJHS#d*xUbouj4Da-; z+guw~d)nIaskLRpR(AE|?8(*YGi$cd4QJIt?|g6CxhLt|vod<;*_89pr?x{I_LilA z<)(Y~ou50aHtkeN5l1zd(GCnTyl5B(J9YRGwRx9& zsDY^{RE&`fhQWsY^0- zDW+k`nQrP%Hg%_(_TJH__rH+b|3Yg2@nqBSBy${!U+tOgdH2AgGhOXYR=ZQxJCfEN zU+Zb}&P~1EXcFq68bADJgF0vre+(^Mpm@qdbCgFn0o&9c(;mGUHVCeU)AI(-z#Dnf zG)xpD9P_D}@%6!d69j%5a+p;xydQ00a=^32vLq1q&25_M-@&^9nByRj4!rpdCr zg7A5MA1sri;~-?}F#m{_%2hh4QjNuF&UCv--Hg2U_9G;ShKz1oWzz0|AH zP99Wa+Iz+vHw$j*h?*kxWn)l}`x6|1AG)}?;WSi*HV90%1h(lIrr_Dn6I zzGp0POI$bkf=Z{ou~#$JocTVk<*Bi_b%K_(xy`m;&4+EMIJe7B7Le+B0nh ztvF+k+b3UAd1!BStf6)n)b5zE=Sb%%jZTfJ^osoF(4JN)w0GR0j@wKr=px7Y0kDj# z;-yd3sikT;qBpcvHLCGeW7+mJORuVFphZop)IGJXvfR4JK9}9uV3fb7^3>kaZ>jOg z@2fHGt+pqwpN2%(Q*^hTs!V~d$o8c$@3=lr!+wx)$bdepM{*<$O}@OqxpE|%%gcmM zsI7ob)ubJ2>Q+lZ8z}Fd?AMkZDQ<$itS*X|!dF)Fzw1;4F*sWC_$SL50n zcfrY^_N~TlWh@P-b-$XHQ>&n+X>Z^Y)xOjiq`slXwYNIzD0AqF`gJqK7vV`__KTFe 
z-1jkvh6UU?68eM*u{ZmLNyMn?C9Y>w0T+}cR*JCTx}Rqw;VyP~oE?dT`JHS)VlM`Q zK{h7xSY!()A_V0CJ9#&~uN z@CV=!VaJ=%;qCoFFyi-tw+R@;OtaR4VTlchq$0=-5=UF$#b_if2F3!xfHZ|HLca4n zE5-z#<$eBhtUsc;ve_m87p2V+*2jXQ`(_M$cLIg%Bu+hW-%?Oy17WBaijfh|^6^0} zBnN5)1cpTfMr-cS==2N9h_Jv=v5&C>;BfoKf;`8L@qQmpMr51~1w;{11jX0Rp$BhW z@Cj2R)BExVS3DPq1(6BslO*UWFof>=pb_EDk^cUZN?W@GT-%5ViIJE9SPu^B^od`q zU8Kibq=6NqzOcyp{6a(&Nq5Dro3zjgReXh7Z2IOC8FMhs*v2pnx)_ThJ_4;KAQOu& zX)oKxb+Km(s8Wo^& z{!%m;@CT&eRM)hTh?-3Al=c2-3=lXYCrw3pp$qyUz-CKLUw>j})Y%J20QiI>5(u5z zqez$yML;&BbI?`kVuU@_Kg@SV!oex_*r~I>;eOouwOTm^v(Kvd9~<=f&x_a=1Krm6@WCJ2svK7EB?-=xl4|8(S(M-gED?rxcx-%nfQASc^ z2FY;^5ZFvE!A}k46wpYME4*jUo;vl?>CwRf@2fAJJ~@ytF*35in17`2`-JV6S)%6ddOr zi;a&1Gd`6B~nxY zEA9D>C?TkIcD`i;@{{cXKLMTRg-ko#WwJr!gX2U7lyOI(Kz4{zp^!JkhrE24sMj1^ zwDj@$D1R&oX9K#919buB7<d8b#TXAr5e&ToyeyB?2@WR3|yW2`^&H()4J`@7 z8dI}gRhKYaDgGCG)gu#C?7Vtn_Qa-+E`9EiUSC|Y$w1Qm@*1G9d}(*4?Ttx$9z9Sgk{}Y zzF7Lsk^8o?`_}UJz3KY?B>Y+X5g#V3b)_Tprim)8-EcU6`cm4}n{@T290wj*P0*DR z=*pMY^11Q%{cG0djq;j>@6UgKX>hrHxoWj}cdER5c4(uxa?x~MzgFC;X1Axx-HD+u z?bVCz*V}IFytZ@AzH61)wQAqBUV-Cg-Kg7{t~;8nJDRE+SY@iewAH0;tw~$!nyvkF z2fOZQT6J{ZZ{3@29Z0qgq*{koZB6U;*0pW@NqhfBZNr9Z*XlEaN!QTb>7?uQx~uM? 
zQD0X!$H4fpt?9=7$;SPwOx@?!#*OOw_x;yfRyz))JNl9xeRo?@9Yd*xW2x#FW{-af znfzMokyLA6s^Ms=x<7IJetpNX=|kI1Te5!dDpRvjTCq?!U$@%$>}tdAKX3cQ{jvL3 zgKMRy6Q(bRPWobCs6QhhDu~dSwvN_Qj?}=k>uG!`Fsa_|I$ySBGBR)Kg{` zOmw9MB*k2O)jn%qv@X4}vga4QANAg24uAga!BoSck7}3u7T^5f!h08fc4?({m8o5A zIP}OuIoPzlGimS4j^9Sj_H@mmWX++~!!M_5PABX*yYxGUukFg2&O!Q--T-rFfw{{t z=}6i;*6i+|Z@aVSZq3^6lR4BQIn?*7>QSsgfNL7k)w`3`yEkn1t1r&J`0h)K=aRPe zW&REa9jfVmNa<=UK#G;rvFX}9$=W?DuiZVnTDvDzdjgk8)gv2K)A&K{d$sGf+Ut97 z9JqF1Mf$}LKKj8Uy;{^nRX1UE8R1>8X@xKQdLzj58dJC7s!Y4uldkr4n*#-Pao~fI z_ePcnR$9}$hmyO8VCc)S31%EuTbij&GPOW6{NC`Al2n|F3?E{8t zJS?IP|68W(A3>-S8?Ut~GROcW z>@?KFZ$P4Q4|xko7|2(~XND5la4Umwd>=T>(13p8$p8t34|pZCq)5;(Q_Wk}&??EU z(iOtyp)I)0=7rl?6jp5&;8yw!ZUya{(Q|O$VL}J^6a)B_VFnPAxNbrhHvm60LGuEF zm>Q;^)1s<8Liqf3TQp;o@KlwAxKV58>pLiljG+lIU9InRDy{a0)+)$rQQhWsBc~L--P+CKS zJE0VT6KSQ4DB^<7v2}@|OKO>J&)0#4)#zuyKzs)D2;4BJu1i-xm#lsc_U@^Vrc%{? zvn2`BoN0q8!6WEZSI?h!-?{K-2iBNlU}@}l=I4g=GyTbD`hV5>58MA{``R;S*S5cs za=x0}{>swJbEAuUesXek`zr}%)%ogrW82b&)rPJFGv`{Up09@8d&5!jQz>29oviG> z)B4HwkGHQ?4yGJK3FAg3yHx&JWykEXL|;PMusPDUcCbd)9aW2^^VTKT^5}=2o1VMI z|Iq8e?9o$I+n_Pd%AZQ_4_tS?H$4B=N0(CDp8x35O4D-b^0T*GE0>bno==taC5kr6 zpvtFHG1fHGoMf88;_kWDvplhK@g8&J+Xv;8>oxk@2M)^JM}I4Cd!D!&zwERQb{hW5 zY#ZEd`K$f5!F?8T=)dJM59)ZL=l=;Jf6C$A#bMNQ)VlNUGxo-&HxaVjC$@szC4g`o4W1kX@YwA(l` z^pyjc5^#)wJ?Lhpr= z&K;|^9qV=%xm*E&mp&4cWQ)4qcux2o>|6pKS;0V^g;PTW18(%8VCdd1&r!HYkr%Wn zgsIjGZBkYsCbJvAR;2J0D5!bfcNRZiVU+bn4OXFPQcjw0Lg+?nYk=w$(pTz#A}0%HZ5Y3I~9x9CyJn=5onGA zoEVJ&c)|-v;{)W_$4zNWPW5+cT)Nno5b3!X5E0-HN3q0sKmhd4ABhHt0ZTyPsKDox z$}&@oUd^Uez+#4ATC#-zIj}#R=av16Kub!mlnn|GrWBZktPX>=Iy6xd4M2pdU(xyR z!q;?7zRt(pL}C059){*G{+**zfAbMYWKInO` zXUV_Zy+W@%yJp)9j-SJsuskZFiivv$rtyM#-n`hoq+c3Mx9v@~?Oox15%?&OYtA>NvJ)IX(9R}Oj4TU;)1omcy8xgi6!w5D0Jz@zotlyOR!Ef>w8fCoXUHut2tzp* zaRGhzq#I*`6U4YUk0E@Llfdk`J1m{7hAkx8S}V_>m0Fme;dTx`w{iMl){TStQ>KsY zJ2eaf4K5hiAr}H1NOT4kXJMh_m`q(Gqj2*Kb)_uKv@#rfd|0K9LBG}Lqvs?kv6&1pFxka@-^L*FME%XSYejM2yg)r 
zBpMYI@{%FV?oG0LSNJvdAUIy2r_9l=s(!O`izHhy2Zb7XW09-)GA@fDt7zEYx0<_}S*0JD(W$Gi> zF$;6cUQ5y{f;tc{$qTvMrofb+f{l74bM8gdlPY0jP*722MR0ZyjfjATj|+Sr(k*z9 z43~gF6dY=!T=(7R4?@Z5^2Y=Tlj~7y43XBX?Y%RDWE(YHL}u_2*=zTlXYe_uSd@NzcbU zka^EGxM`)DcO+&Wmm3^gqOe*a^0Z5_===!i{K0zsJ$MQE{VnnW)E)GCABQV48Ex2S zwD6(NjP0sr*7CbHMVS`ksUD!&UUDB*cn)a%xjGWf0n$*swfl5zK@~{i1M6oDPZ4VR zcJ`_`c+{HdP(Z`NEeMU)1pYrAHxgU5D5qQ@@h$>S8_Q;TMN<>t32O*Nyhv>`T!eo< zNz(TuhbW&$uYvvmdy*`-DVc7`UXG$sDX+zaC=xcLL61M zSy>HDpuge7J^#qZ3yz;xapeTf?>q_>$)&hsXHGj5KT^Q!C$h;Gjp9L^JpwOs8l#j4 zXYvmqUvXMd4CJDWjU&V{9oM?DVWZGHXv1%~gDvJ*9}PmG4bhHZsQKe_PJHHSpB(^y@Xd9*V{YJ`Dewjt?DO`eqI-^Q z_m%i{eVXk~vfV3vDfYR0j(wYEs=NaYm#T!FEE?gD)UnvIR1A9u33X72BHYjp{B0wZZ_z9-!l_qy$Dyq#b4iO=l zZqA>i$JOp3oFcpw1lz6+9GS$y4Fovn4=u&koSlh*4Qol_2v~k;TXWLZytIFL;M(DP zw%y=b+iJA)i`N^9a4|U4^U5}__sy6usN@I( z*lbh|3iL6EF=Bv_ivVZ`2srM7hz|rtWYZ`)Y8B96kqu*!NKi3Ft|A3dp}&tgHvBDabeMAB`PrQ~5TiN0Z=Os=nWHe=UAJ;XLK_MooR_&mMY zUTiF18oY7x+R29$L^oNg=g{iQuRPF~8lB)qE*zXcxJlyft`F;O)+zDhBM&icv)y4l zxD>k)zZT!5AbQ96iS1+CT`9T$#6$dk&}uRc(GMIn@HJ7UGI+FaUuWCCE-Euttyk7; z>fsF?S>%%T=1n7f0EM%1(KWwg6G`Bh$5N8;&sIM$!xuKaY*UBn+p^4R8^11MjNR*H z6?5Xeds7c_jUjvhLwniW!0hBEk`aav%n&2YzScoZ8G9UdcdKm4oDSek8=Po3NuRSDB(|pCkD>WxLdK z=$v~>VwdI*d1F!_D3V(OG<#Vs4%nfgwHce*QG1t9G z!E33Xyp{(Z;@4)Cfi@}Y9a?zIvvRU_PMn*V;#kbbw( zb-Sz57MonIK8UZreQoP1Z+PxTquZ11FlcOhp5I_@B!#;XOU@dMZ@Uh+oC#u4!w-UO z4{cpr*1NsLuq-!lrDgROUyh@mJ7<7LGzH>g^7Q=Tp89!di2M^>a2bohp{R|OSNDFRPSRAC2O2UK9UMtG1ZkWOOU;nG6ismya{8I}&` zK>+je+!qf&efXrjd{kb3T&^A(Rq+~l{$KWy+vfsH{}OlfyplUQahIr|K$?llY-yip zExe{_YY6~&O*f&rI+Mi!JB~cHec17lBFD+^sF{CK?kL!zo&3T~xW1?P<)vgtS%(Ti z>xvSrsHx%tOh6I(FF6aB(%Q6T8!m3R(cz9vD_JX~H53cD(#ZUrMM%apM@5~2t(m8P z@sxfXd$FQX0|b<{vhd}V!Moqw`|94;?;Xvp>>0lqGf#}ABV%ds;JeBX@BeiF$NRq; zH=mX-?3J@jYEI~dA}MU~o-@t`{0uXf<`aX*)*ZFt0m->9fkP~VI7wSu?|QE6FPuU- z^}LQdiXhT1+$*It)@oCD{Q}$PmX%CfR`!emp0TVCW82F-Vixu#1vijl4p5qCaS737 z6jc=1AL*?S$UGHTmgqK!UGk4YwTs6()h-X-7!sIH;)1p?aE1h?<7ui@25%k1n##B9 z>2$by^BJny5%~9-k(|Xhu3evaY?9K1A~4)!{>AfM$2X{+K0N~So6O^5Rgb0XMfDfU 
z1a@N=dpFLLdV;-z1T%ZFs`u)6jbWO4bK Peuh>Ci_Zy6X$b!TWVM!n literal 0 HcmV?d00001 diff --git a/distributed/device_communicators/__pycache__/pynccl.cpython-312.pyc b/distributed/device_communicators/__pycache__/pynccl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a1b0fdea3919b07e979ec01418c88c0c1eeca27c GIT binary patch literal 17305 zcmeG@TW}lKb-Mr-#A87~;tQle2qZ;GASphqm#wEQk(Mdblp{)N#JWWx>{0@R2fe$H zOcqSk)SZxyI+7+$Mb$|~&A20~@{Ht}&e)mB6Li*KF&S&bPK_#GI$;_$)0j5J>4as}l3+#|lBeU=MAc{&Nt@%ggniVWaEv+<&QWKA9c2@) zQCFgRv^wD)brTp%+>=-{x`w2g_}WCxXbnkQ;0y`>Pb^xNR!cHcik$=dUp{95l)7i$W^Gi{c?6l^&OEM|mzC;g5_5t7PZ! z!Gl9X(NjWLn2?>P(qm)15RxXwd6|u*x$uZ2@Zm&Ak{!uNBz`cJz_bU`{b4D54B!Tm z>4dxnO91PUajl5OqT|Wv%V~a)3(|7kh!mFiI1HkEEKuQ$CZkd)o;rP+7a&_5Necp> zltLou`q@ZnzGPku#Af(g`e%r~PJP`1yFtUsQwjPt3#=B+nd7E}d5Wef$wpd0B+wHS zX9-(4Mk0HRXU!`NXtrbDXO;aQQbOd6?BEmQ(nKg8J77qKd3Gn?6OW$i8K02Oq>?@7;_*Ze2iqk?Po*WE>*4rw(FhN_nntc|GfYMd zL-gAV)Gu7EbGzSl^(;2}^Nqo5V{pDJ*SK@~@M6>Yd{alZsbhXmuIb6?=Rahq>fWN2 zve)Eo-mJ|#>$`8;Sg3Ew*LP*>yXHsVS(mFHn06N2O$)ZB51jzK;OhBEbV4t$bw1N- z{!uIQOlQ?H(nLNWq}c_DtE(o%5lX0o20@ch4z68y3Rcbtx3@3P!7)vK3vK>*A zy$2b-18J(ejHbGcXbOml5C`-I^Z-<*&~iO#oR*?-v72y6h@^!zIJCwP9uJ|BlP|mw zI`ZO3=;*+q!I5JFM+f@J3d`1GM7#jk_yC-=5&W16V1&Fv*yE3YKb}?=f88S}Rp@{Q zxJ4ol(SKf`J}{XW>jzefd4`7A)>5!l7Xlj=EX_rWm09<__8)Y8x2s4&@*lVV)1JTI z^BYVSJv7s;RzXs!TB0EN5HR@*p5B6IYhms7-&9!`tIz^HMc_{h_EKu;O^|q3DV`do zIcn5or?~Z;i8u3>NsvXHnP+$_rueEDPO~Bq7z=OZXiSY+Ip$Sn)XveoBZib&5jc9x z%vk}Gx1s{FT-6HYHr~P6jqAp2oCErEkuf`sVK~;fe9XeRpl$W<(w3{{-8|C>qlOu~ z0gGqF`I@*-I4%z63s^mXbgCq)4Cf;~G(*0T^Fu3C_X6_T7bk|{*g!Q+ zJxYdU<}^|ItcpyKSvZSO*$a(_r85y#wMD{1eSj!eqd>FeLN`N8+Uc*JxZ>9&?bME8 z%ZhNjlow5hsjtyxu$o$?H^|Vc9wBMtiD{~u4ripao!trIqUtcHerk`)daKr3t0}#= z3>DHwQ#L|W04l<=HrmLNtr^;g`^)hHL~R8!$O43hB|}3F4?_|%lo328#)H9RbgB|+g&i_UHcNTIQ8hYqW?`j=xfrq`+O znACYqnUd|gy^}FXLn^F>vF_0rGiA7zVcMZ$iW!VU#jeOyFd3;(y9zJqEr-=I{Tq8; zPpc_?3~1@Cz$0H$q4h67YFxdn)PQyY)`06(&Vum=RSeQ`HKoy%y=)CnsI~f+1Do$n 
zm%1iX4(RKoUaO`u4t3XHj`rB^tL6Gv?T@Cu;@}wkr{V-NoN1$^4Jzyg&~U-Y@CX@u z5?Y+66sxn+8l7O2@?drNAqWfy=)@#EBb*j@X*nz+2llRjbcPR%D}EcLC?KT*=TpMj zKq}cC7#s@>r;>bEASwmUN8|B8TI8|GGE9h#35uM@#r&4J|k_wEqqC08V9}`Nwl zc1WNEqIa!!^|2w87DJnSsbv6_9=+Q^DG(J2dN7dVys$|O2)sC+N{Z1_(Rfsvz}^#J zX$8czzz6tnL4PS?5xEXr!70^tCtUf}eCuF&LWri#+_=Ok|TmwA# z{mh~e5w<{EFoUxS2xS}A;fBf<5?5m$$TZG{L})BK^fVcYY~hpVM7ib!Od`}j zIP%Pap@GngCkZZl_*^&|51)$jvJD1;2~I#gQ?}qNW&6M@5q?~Xrjm-w@enT^4WB>3 z3nCT~mt*K$Dd)x(0HFFHWxKL5p(rO?6?da-k0!@bp%lzst}cO>kO@xcRWWiM2PPz5 zF?3~19GAxy1rMB%49DdLoNS3nLu9YWQi!tsl#mK@kuX48r4(sp=fPFV3*ZHnU0Q7@ zbt)#C0Uy~B*f2?uEx1XFmlaj{LD-0+WPOqL1XRytW|9|DqJZZ#`lphSGeRl}s~vR8 z7LkvSk(DMWg&u^lmi&MWZV`?SrUX7P7#$szEtr$7io+Es06e?IJA=Gj3X1V=3lXXa zg(la1oR+)QO4Mh>Je>L$sP|hbPt$Ds{gxf~-8&a+8uB$8vo#y<*KA&3H!sv|CM9jz znzp%tTusND+@1KXc)ojowtN4*n*Bw}ZzV5o_n&WfcuIc`Z zgNy#wyuUB&@4MTc^FKM=|K~>@c$=>~t~s!>;BU+OyR-go$Q*f4*EDUp=l~}b>$$Xl zX8&yXo^$;h+wyI@@3rkNG`9lGaKRUV&kF^1ZP8R^uUYiA%?;(eeTb{*p_=@Fpt|5~ zhL7ukFK~VE+F;(dGwa)#^F1|fxo>MMINg_?nt5uLer?}_=9cRxubs>{@5naq$Th>z zE)FlUYxArx%lZnQ&GQ@X1aAfN9S5=<2XY<#KRHx1Q?BL@aKi69D1XQ6e~2x9{pF z4A1VEOV5Al#w&Nz@3MOfUOz70>+b2n#dRCzPX2sd&y}jn^yO`ZEq#|QSF5j7=RKRU zo=x-gJx|9&Gvz~<9;{(=Z??JjZtdItxBNf+(n51@uK5s|j1T0H8{cTZ*>$7qzPo3^*0bR5 zDb)L5QJt{A)isw!XGZfbf7a!nJ@>9_V`0PQn}at7^BeYMH|#5TYOlJlxaVqb`fvDQ zM2&AX0&TQc;|@XN`VG0dcIctTdv*7f-Lo&x*Wa({o$fEdS{#`x2s%ue|qOZ{z(}lV+lfl5$FEcJo z8OsHSq`J^370VPDqJ{>&t8y+BCFuqy9n%a`9Zto0yJU2g!JAeC4|B#1s0b=n3H`?; z7&+>ot!99$aJn%jty9Yk+m;#JG4z&J1PQ5%1TeJ%rDWMho$qaM&R_s_7Op&QnTJ2x zBAo$U0FSR_$_Xk65xBjBI|fu^N{VCPqvoyvTs#o1hum)jl%oN`Q^`PZjc@?4D@O-f zxWtQ24jlmvK-q;IJrl>b%DKQ{#A7suQ5YiiiY1^p@X4t+*=%qE>5lZj#AXj6xFZtlN-Q0a+_dR!akuuvIh3eh!^#9=b?>_&|-XcX0(jA44t@(~! 
z*^XTc!QDkO?b^F!2CxMHcTjBog0oq_6$$1Vw@n{@;9N7kZ?Ue82%&m2;?Z|pN;NZ%!Td=^9_jYEzojLE89NS%B>rh7J*ycsnmuEL+*-i8IJMLTV z9Q))V+ni@NXW7m3YjbQDO0&V4!9~_PTc2<2%r^iseY+%{@d8;m0}X=JW$cWa<8mfG1m+Mg5GEGlSew8I7gR`Ml)^~kS798oL5sqt zFJl!>Phn9Dq)=B>6h(5QdYT|#kx`ZEx1pW*4-f&P6738b70z5>RA1iRnsv9%aW~K2 zID60Cw-Tf7$an0`cI;gU?n6d>nl5o_AK+luK7uFNymw0$_|MyuV|$S&U3qVJ*4v%) z_U72W0_#9kC~7LgqgV_c9cZkjV+B{@t&%Iz zI#mTOa|ZrZO#2lD26}-yl8hB*iFd0Nq0FUkiM3W#LRM{GrZ?cqfl!=cF5b4FwrR9O?FN`iMdWpg zWmt$pUWj3Y>SwT~LehX2;!GIc5t+qwLXkR(iH)~W@D@mrgrm?*(I*b5#_}f;MT)_H z2oVk>xuCsLFr{&9s6fc9I1cWdP-<*Uf3OgslgQGb3YeW=&D9A`q zq?r@-3PoIvQ&Z~I*Q?n^&BoZUXW;5{-uSQ2S@wk^xH zEqd0?4&{A)Szlkyw>9V4h6P~qc4vLvIbUzi(^v2`qCJ-Lv@Ci8c~587(>Xts@7kB` z+L!a}2m1;QHv1L(qQ^ITBH!GTZSKi=dO=xnRfDqPYMecsWjD?Y6&t9QzM_}1yA+kl zpSAhlwFQ)$c}*a}vO3|9h|mA}BI+zFiO(;N&O(1)I*X)tvnnHiArCSEe7&o zImkm45e0}oe-WXYA#GV(8)`U@Q+XQm?D{ObKF78gsXVJNMLs}NL}A@>tpQbV!n%L= zMbvy4L(PW){RPk5@_E2CFc_H}>KPCE3koU{mgz4)D!U3b!D(*I*ie7@GZSYSGody$ zWnZB;0|TZ^N3yTR>rn>3Dz6G$m1&xiG8c^1IX=cqV)_(b67BhE2CwZ6#6g*#goy?8 zv8gFd-?1qr;3vVAMt#S6@yA5pG1}2KwY~mTXG`=Qrc&SO4z4v4Kf(kcP&6CzWV*~v zva*F`Ac2HG20THBp-Q94A|ZpFO=0vEh~PcznKZcf5nV}vfsdlYxR=YIbCa!)l~d%-TEbH1*eXA5Ru$orni`ku)7w>A3!b{G&MVGux{P#|MNdoK)0Opf%^w9#Wkn5U zjU6RbV_p|iBtj?eEX>=3A7gnkmz*iFg z6CM>CxlAbL4S0QcT%S}@5ry^V_RcfkYmKsRVbFtSWD@DdS5XW+zB``4XwMjmx&Yv4)O zmb+w&fbO zPe1po#%02@Xm4sX=KJaQ|)^7W!a3!79c1RF7&PKHB;{cs!Xpf3)<& zOvSx~tg0J7P?apeuXDVNR;Tm)VSCWJW{|eIjN(ar#ihc)6{~96@M+Tx&F^IiJ%LR}3 z&QEr(a%~a#E4aR!0B^&Dh&n=*2x(3PjHp+ zhka$M1eX?lcQ?Epd@K0QiJZHC!PZaKM)?8fIvJiC;x8UU@ax=AD3RjQaZIzJ(97v? 
zT&WRo*~qI^;TWclV?>^;bYm)l5w5Om9y$Jk5X3CH*#rrr7cjbj(Vt@Ub&S4&(It%V z5EZ_K(N&DTjnN!NZ$NZQ(JGbWwhK!!+68}N9wz!K1y4jaY<_4zNi%g1-0K$nTeI%1 zMFz8Q#l5EXa^#8~d=x|1hO(Z`#VS&6qZ*oK`>({7NULJT1Q-ep>#t_6WPooUI@dEz z58z?jo-2Eb45R@Myt*5@GE}U>oQ-nT744W}DTn)#Z^pOe!W^iCvp(q%0%heKPkW9mS8D_2`te~>O~g*mmqq8)P%#NmY0!^5*F0Ca#>s1tWOxK4Q&yoNk&D*xTB>@52cuCN11gU~3U zLOc1PExvsg_Ct-l?GycK_q-4uhX+*TF|?vo;;AQ~lT&%GL41f#M6CRxw`?Tc$}f6R z@0OjUSLOM$fCrs`(o1>Y^*5Np!$R1MQ3pnZzHecw1tVl)*|tYv$bG_H$O6ezA~Fnk z5omgerD)eLsldNb+y0&MerU4MrVlBI9%A&tT8e(^msHy?slI=;*Ixc|&hEcpec)?Z Ypc>z^Hqq7(cRk0@1N3_o`H&v}8}%a~*Z=?k literal 0 HcmV?d00001 diff --git a/distributed/device_communicators/__pycache__/pynccl_allocator.cpython-312.pyc b/distributed/device_communicators/__pycache__/pynccl_allocator.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..42cda9a529167da12b34165c31431e2aba8104d9 GIT binary patch literal 7588 zcmb6;TWlLwc6a!GiljtJvTVsCQ!na4$?quMIBQFO$+GIK?KIXlosKvoiS+Q1J3~u! zr9uu2ROz~i|B_(0-K0Np;6>L33bX|Zw8&2jv>#-ofb6V{O_A-#{wb1+Ew(>B=MIOI zX(eqh#5?zK&bjA4&Qt&5_j?Ew#rC7*d^;guW5+DEYGXIW5%K|%h{P#m*7$O>9P3+V zE%3D{*0gQb#u;<=Sv!nb6-U}R>ttg##g%r?x>?(B{^M{@o zlSJ%S3`5%zkZmCJOrzWcadp`3PDe%BH8PbA<`R)24~W$I#A1+`9V~FMwqIjsCdTR3bD08soOfla zCbJo6wocrdJa=8mFD!_2ihNE{vI$YkQkd~7Sy2*_86}yKp=*;fcUAZ{UoXrg63Rq2 zoz7>HHQ;0>sR>GUVL?XRU?NYcoY91wB5Lzlnuf8q`D8{CGJp$26G~o^g`B7@s^Sqi zbx*W9#(xhX;)`cJ^0Fq=41Z+e2>*v3&)sZN8sr(ah)e?I{K9~myeA79kDp;anbCMj zzMD)y+W=QJDvRkD;PcgyoJP+;2kHDkR`ZGm*fOR3Yi#J6oMtC5&N{ zISdhEhaWq~KZgdKV{Oh9cIwb-C%VQJxO#7rynURIRc?ioEC&D^`Z4HRRxFYg`zs`c z)EkOPH-$&vP|UeI1F#+L;Q&WCQn1;`!;>}wVr-wes5|E1?cY&#rzlBOR@Iv^8#@6# zA|%AbBHT_cn^gogBj(h_tfmL@8Lg0mVHM6&T#$8_Bx<67@5)NUu)gk8<%Kkes%|$( z<7x2lx zzKR^VKVxtJ{aQiG#yEHvsO^0$`w5+SnW`16grz{-M5|qS3b)WuO!!r>eyg13HK%Li z77h2x$_l47nQe)atP7kLH2W#MxP~Ix@JUJb6|!Rand3=ybx;Wr6L5w6m`KhQ4rB zyeBB8{o=@N@Bvn>e`GCGzf0T_S+?FLOI*x1b5n16QC-|~j3_vw+b1uazjf(i%%$5h zVp`U{iCj)lvw513Ro%Um)E0$AUJ`Xbm>DVxiZri?3m{iu8RxR9tOw!c8;VOvXQjL% 
z>w&p^Qjr8HN#z6xQ$cSrP+u@+IyZXH8dtM=pw1Vw4^>8M_Tt{x7nwHYwJcxNraY=& z-cWCRP3y+_8>Z?r=a!O+!q3S(C{v~b93zTN*t9-8-Yo;hOx)Tl)Pn!m?=hEbKZXV2s?g3kY9Ux2b!BYp$ zN9sq%%#LwuhOKI5Y)=2qrAt?4E(y~Y-n=kPQOoMhLhZzaB~S%9nd+TE7 z37KtPcNiXv?pzWn?nSqPd5YO|XEv|lP3x@H(b4}qKO6NSyeQx)r)Pg zqxAa#H3>hp2o;*FgJo~DrBe7*rSnMTP_)7y`_GWu z?Rg$1T_-Am*7fA0cZvr_i|u1itwrC7im$WcJGASxb$Ni|a}Q}9C`XQ#B1bnO$KUh( z*OoWDLp*km!PSHVhnfHS+l{kf0$gO9i$UHnbJ<@S6-p!)A^JAnRH1+SlcbZ+dt zhkWjFpFeB;yruj6xb^dKJG2upS1YJs@VEOAs=YqU+IZc{s>ijO6*W~2C@y>`tlW=k zHD#J&p@%tRd{Z^0Ss$-so9af~<3Zi18WpGRMqh)O6!vi0z0uP^N4J9lNG$3$rb3+H zHeLs-OHl_it6><=5WH^V>T6^!0z?Xa>QSiR=AFT!v;E8N^INX2O&7oAjTF7zTkV}c z^?c+hw-1)u2Y>eJM*FGtD{v7-Tli&*gxZoFA)Wz7RC{b9Jr84zJhci5MHVypTmxg5 z0anFd)c^*=_pkgvmqTBmak;}UE?1LBQ$*<$AT8s%xB)P{2Rci()|VGHpTO=gfPDbb z5=M)&7B|=|iIc68MYhe`Bg8c1SAfvFF^c0;F>!j zsoE1ZmusMR383BPoVk)j@lJc3Y`nT>Mr0`WX6!Y1CHx0>R!rg)WuMnZe z4Y;=Zco}q5La3S$YUq)(l0u!(P|O_xMomN+e%-S{YG^T7DZuHzK)nW+;;SHgKB?%Q zPPO^+X`1`n%fp_N<6owc>V0`^&rWJQ{V!Ddzmv+IJ;$swzsBcw&vh*RN_;FXj>W=; z!|z$jQgBBw^kqhtjsX}&1w--O>h_#SRhdp;+bamwh+h!s41!$4VNX>Sz^LioOkPn; z+3?pCpODa&nJ;8`(+M#Lk+7^gjM$A%A{k#^m1_trSTm!4BmRcS1hLS(HkmU9e$1-d zR9Ts4>@dbul((7zbqSCvfV_fmuZ!8+Mj?)y^X~$dYKMbbBimq^B0I$HpXRn&;u|ev z#lYB(6}w*rI?93GQlR&7s1z97vE$ft2MPC-TZT$4LmMq4#lXmplMTB_uzMqL*S3T7#&EW~($Td(`KMREIy}gLx4Qd2 z?f9gl-2F%20COPijb+U_1=%|DrN0Z3xF1vs*J&t{ z<<9S+T>*US3;6r!KYFHa%-M>zI)+x$EBooYR83~;%L9-5=)U?2^%w&n_t9smT69#W zY4QWQ9?l9v@_R>La?h#!P9FSZl{YL5dd}di<3kn|kX9-A^S3~tCz!Wi6-)kD{MhOE zaXw4=#pJ@GOwpOI(vf&bpcxJ}zJHbksYU|#hF=JB5ec1qbm#LK@cVf-%~z#1=G`Ov zJ|~Lj4(Va6n7*(G6M7r-*dfg#oimbx^35#sa)5b4--HVMBbbIHogl^<67` zvF{+kurXT>43`4KPwig>PFA7bs6w3tC||Jb<-hRqTP@wimJtB(H&udX0l?n`{pT*? 
zZ+`#sdzZ_;;gWCo@y(zA@UtKOeSFh*rQ-6HT^%J?N7>a=a`kMw`hcBvuOU=>A2>c> zR9CzI{9QuDf#KR;SZb~S(5MzLaTW-Ikt!50^0kzK`S4b4E4DSu6af*mrD_J*fXCW| z6Cv2&Pf!fpx-dc33Vgp+`-)xSFuJcBuDcxMpaT0-Nnt{*p_mll%kp7$@v75IVYaTJ zn0Tf!=E%b-8=e`ary2?!-II*i$EipiY(Ri6P&)MTXgw)W=d%C_B`~-Y{TAJ$WkFt< zeT@YT!C{XcY?8c;Hi_9<-DRfQD6(VN)mDMZ3y|oPDJ0(JM6iyStyA@ObC~5kg(b*` zXO{#jF6mYTU_yxUNB;mB=raUx4oGE85u7m6>Ngi5uPyDBN0i-|g9)MTl9v+#X8m+0 zw(g3G?qEZ6y4{Fo@RU@1zF=vguyBRS(EoNjYwYFDKgQLEBf)} z#|uyIZo1BHfe{Tx*X@w~D+c(Yi-)?x_kP;+Nz(=ofo|tsK`*+7K)iq1HoSfb0@X)_ zQlNV?_WBor*DE0JLB15^%fazdaJ&*e_~DgOxbLy^sjoEf+Hgyr>v66yp`$Hiu!~+t6g{jvZk`2t=oG7OVU4Qz1i49uUZI zb{zi3*=|4eRiuC04qrIKmbP6S**Oukhj)8I_S+oeiQ&eEbcYCGOH!jZksgMRkk2mU zz^S6BEeyTw&WVXTBD|Lc!v_qpFg1=*YTU>x8Of3Z#&}*!Dry`PWB9+84E{&R$iUbz zS8pU-hOrsnleUlVm$rv2J!*R6RJ;tu$gEIQQF;>_G5;5&8cCr58#H)e$X?t>7OQcr zAh}Lhv`txLc^fD_DEq0u+w4)u!exXBXyEHlkgy7c=9zjy33S8 z<7&`FFb(|8hjC4LDC!Ud` z|3Mw<*{&G+aAIN*BpNwxF0A6`!-2`g#^s*=q4HajpHnLo7*8+vyJ`_E=Aw( literal 0 HcmV?d00001 diff --git a/distributed/device_communicators/__pycache__/pynccl_wrapper.cpython-312.pyc b/distributed/device_communicators/__pycache__/pynccl_wrapper.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..26e4c7bbd63643e4b3cb5db024dcf9f3dee2df2f GIT binary patch literal 15911 zcmeG@TWnj$mG_dDZ;CI`6!kQ1OSWmtvMo9CTdrdHAzPLbznZd1S(^7srp$*rB;`n< z8#PG*xyi02G=P=e7Lj+m8wKv7%5H%e+hSKOvQ4lHEJ!nfh`SEpqS%k@7ah6CqW#%( z=HAPfD7#4ml`E=3WiW^)t-xu%J8|8Cdx_%P==ksb0~Gb4I>O+`!}p_;{@wgTWA`mFrf=NPzXu>}h4kU(SBmy}5a45?AqrqU<9}Zn2 z0WuMk`$IEXg3Jnv_a-ASxKey7o)y%i`cgs(=2erh&W2SZtWPtc_Hv>bIbVo|zLOOC zc~KvVCIk`Kt%BSMWf#ziSfxW!e+n!JIO#;uF(b)6JtVL)CT>R zCK3X)84d|yKJI^;S;5!-;hQ@Xk+Gd)p|Kr_02vVyJDyMO-M{b3USTH;Y3JBP;&Lpy zb37c5?Bql71PNVACIo&bFN}wRfUR2 z3a0kyc+RKiH9iBc^%;4c&jf#FUhk`cKMQa0S>eyd8+~@(1WHuPn+0bzh2d)i*D%Lh zUf~K$t8T*TwTU(yTOTZ)^B~V-(PV_+BK`#R@)HW-Y=6RGf}Q?Ei-`^soa#>un4p0X zH9XFUs3k%oNuohn6`+P;ur;z-u>kfalVTHyrO#+7os7G(6{KrUF{9Y7;yG8|8hcam 
z6k1nViWRM?VJvT6Xl*XXW2vIGWi>^iwQY4>MQg`uib8AW>bi>7uGJKU*0>xio{H8D z71dBIv~F5mSJApT#L&Bcw@Egn=3n3Thzj>J%vkNB~4k_nra$WF zOo_%}5{vlhcqSVw6|SRLc?KK456KjB-(-9J*v(V3r|z0sa<#6Thh`6@&Fga%dwTZt zUDJBR9-cj%HhVrl_RQ>=yQU4u;mGWfw0Wb%;qzbzX9w?^HUfv`+D)0-O=I$)9f|@}1GbExaHbzd(esklJVj!>s6NyCj6`rI@H~|I2jx;I3hv{@ z6H=*ne1z-;eqI&X2N^jINvUb0$}-usYQdBkwFox(J|t+=YF>BWY?^JlYidU0b8z-x z+T4;u?6Q4B#=b!|N{X>tle#*ORak10wl6%nuqR!=D{I-E*6)^0Qca~vf};^bPGf>b zm<(a^G$fMY0lP6Y7AH?47R5`BVp7de9K#As1o(@84H7UE)F@DEL7`%T27s!)Nz*xZ zGS5J^Q2QxnvdRM_sXVeQQ+YKY@L56gS5)~6jGz`co;?IuK?9JALRvwC+6!ehmM1u` zM${iqMuUk^EGly7Rf{?z#HkOb2ag$n_@Gx;AtzA{&a!Cr`vcKvED^xgalfCSrzUb? z^rR~c=Xt0hn55t@j<)ps%zb0c%l)|;+sgwvi({tidgw#0P8uZ=mW@(zW>$WrDIX=A zAHh`?xX~(3oS+?Hm2)1YQwdtgxzQ>v8(^zAYl?$X!AJcFK87{gaF1wXRN7SUgk8g%PNc&+pRUf;f|jonoZy1EU*UZ9Qkk!Xay^tApso>U8vt&ibSl0P;AVggfNui0g>M$t z(=x}mK-t5u7aDmFxHlUhuM>>JdVV9|H}IPT6WU++G#0)CXxb=^(zl%z@3Zi`fM#=9|97vV=>(cB ztKL=FcN^aYbX)1zaoilg2g+W)hkpX{ZIE|EzMa9z<Yn5gw05lw?fj}8n-0VC0YF2M;QK@UdfAixqG5iK(AqCn!f?O5eRq*5(_GR~5%3FJth(JDcb6suT@YpRs0 zgj`s| zjV#N8JSLb;Orp@wR)*}lrcM+*n$g;dmp~B@5(x}T(r47EK*d~HPm1klnUwk^i1A)i ze^-5#y~;eV{swcEh4*@tHt^nOfU2sTdDS>WP9`=gw4<#GRLseSl0I)K;uVNjSLY3D zRrZ5HAG`{6wE(CxCIWq*VP9)`iBZPkxZg?0}>9<&z)?vc>QrQz%w2op#X$T zJQoKB&iGLT{&PeAr+UwyM7fN`AUuKqjsQCdLMR^zTD1;1D_()AAYn)*Up^1C@rlT# zSQvVT!b3p~$C6QyUX*ewlQatXe9R+6i`}?z?wH(_Z=5;fKXdGiM;StL3@$wcoB|@v z*SNH^U>|!9dpbKh_jc^+>hRVPj8l_mF^OVgheYI#_6-aWJf4bb5RPa#_IywnqX#0b zbUz|$1dQv6dPTsZ1E!t^D*eOq;VCZa;CcZz96~N2>x-Ct9hq8s0ZA_w=q5_DBFz2v0B0C&M z%kGgLDf9@JDfGg}Kj|#lo;wum9$5JCaY&|^oTL7(W7mhx9ognx)4DuoH#l=v*N?6& zxAbJ-zqRL{tLf&I*(>u~vaW42>fGV4y>42z?atVCFK)@$4&1BTxLmg{Q@8I8aWy}WWJ+tG+!1aN= zN^9@QTNu0hp3S{%+mNws$l5kNtTonH@-C*h}(70RA^kZ`1M3n%}glAjSKYcru(=9d1KANWFwp0NjN8RoDvws(g5s?jVIdepI%f zrF48DK!1+$st3t)P&uhdCWd$H1Kp+zPRk3y7%xEayHH79hNLVkM4Pmft~j)K<^JGh zTzj|~z}mC!noit!;ra{d#+@1Ku0>bYx^LNfBx60Yq{~`QruEWATA5qKK}$N|0RYe~ z;@K+@sv_%Q;J~@^DRpZSg+cS+;*{S=Owb{t4>f6~mWi4G2&}2797OZkWHb?q$Ps^a zar~lzQu@Jj!M3X^^2=^q@-xto^VJ2UQ` 
zi`uMv?{xn?N6UQELTKTe>Gk`wjst1)fxLz>OJS2>>2M3);v%rSS>h#DyJ}N*#cM1G z4I(U+4_!qCzpYp+t)ij_`e;8=@h6oWJO#E|+y+*gYLI*jUgOnDCR!5&>y;qa0VSa49@K|eWTUAAq` z*f!6{v$pMkb=1w&&9QTBb9M9VeA|3o+UiZ~y{l)oplj36m(rP4=8)=Dl{uz*g$AtR zk@X7s(&zK*m83&{XbsXgK~_{As#4@Ntdh*OXn>C#;42J&h%Z$gf~_YTtB7AYsDBsb z-wjkCf2Ze%gVTnBnsh5_@G_2^@8No}=*ud3QX zQ+XdMZ1G@{Qo*EfgQNyVHR(O3oE>&|?(^WcD{=6K33a0t@wR(Lz(=0cO4Z=@sl!5) zUc*Q;O1)QIM2JLU6ZLg0FyT$Dc_8og(>&bduQ5%N$Nez zp_`6s6ffh9e-pUDjGHXeiC6c}sqdN^?>U-sj)T8Ey2SmW|IWds;k50{idt1`$up`N z%R?=*ZUZikdwRcjG57cy^zBlXWs@-ag%T!x|uJa;~8+cf z7pUg#@Gr(Sai2#%UBdTxNG1u1c3f+RngSFt(TPG?*LE!9+S5) zS-|8tCfH^2HY9QEFWq_|k#-+sHd$XO?LrXpCW;y_-KH2g;TUSrJ;<52N;00aX3Sf2 zHuv1t`C|+F7SG;s-WjLz{XNQ8Hhd%&O9b&msywK-R8Bqsz6?2~^ta`CsFc(tcmqK_ z@fsc@qJKcjqMWe5ikxIYA+K_e7M&_7Jo{6W(jFiHDR~M~YLlg8&Y0WErIc8@SRtpc zW+m}BYLpN8DI`@^2NJCIB&9*if#+Z2*t>wPY=yKdTDTn!tL1~QTq41O0{~2L_o0ge zBMX22;<7vE;7U5vv*W#kPI>#Cen3%7^xYXm0VQa+-x zI0A%rKGI7c^;HiUeEH)URmMZ^08}~}teJFI`aR0sX9JF5I4}|q;w&kT5dgXvX8@?`!)v~X51(SuT`W1f$!?JHTFg@# zWsXzaZMjbh1~0KBFa8Qpf)`Ie8Eaa0Z zl@#`(>|`ok7qzY}l;WlORus!dU{GQxjwo@5o{am*61Oy*b)Wf2KS+n{Wd|YR(FzR) zy&V-Fw9#+Vq+p{y=nul@$o?^D($CUByFURRPY(4Do z0xnBB4V549(Tg)oUWUK;KZ4GDpLw8V4ED6WZGQZ%7jM0Ir|FI*t3UBuy)A9uFn?&V zV{!Al9q)83x!!O3Ra3TeFui*ytAC1CZ=4@k>{;x7x96RnrS@#sP`dM}tp4c-MqrnN zk3r4uTutNj#T(zc{;j!F*_tg2uAeo%-L#lmy7>OHzj`)pKD(lWlhgwjC2s_B2=_sB zF9*k3>#Q|rYoUdPoUJKu)z?^7bY`I5OsU;#QPUG~&PE&A4cMp#8f|^VD=2A=4oOWw z+Db{+uf@NjH*g+;p}G|6V#iDSQDhE>%+EtEaj!0!(iaq^gS}7Nab6BAwrBN63bW|Q z>i0ll2Cp7SR^RzRPrK4edk%vtk@}4Gre)SLmx8b=P6b35@+K{c#|Yv9haX*J4_?UXFXm5chE)yPH0R2n*DyPFrn&mLJ@e}MbMyK)da_&xT%+yXpXS_it#k4D zzJ=O_;DYCT>NNEW>+%d_i-%S)%R3Aj zXRe_+uf}{^x0Ltv()_-mc0~jDysK7YyzgkvYaq{Snc9}T4k0~btIZn_GBUNUya^#Q zQ)A25AY@^z^?55oHpbqZwvAeV=RkTf)eq@f`M zJ#yr05RzsWA!&9Il4chn53A{!KZT37XK@d%*`5_GtJ#mq-n`bJapjyXxWrt!&AX+1 z{~;;wAE5bt!#Z4guDpxY46zG7WHrPtQ9T%9(`PR}qzGyAM^;a_8O*s;fBA>Kbm zH)Ohy#-AG+eO__5wwKx3N{Zq!Ab={2@ufI5DeyC;Hu9msBuYY!|L5 zmsFXvIB&$R$EHm8?REGAhbhZv+E#Yo0|t`(Aj<%g@b&PN=`*d9?R>yMlJ8(trq5Jr 
n)(uvVb;B;r~opPsGJn^fP^(Xsv?u=Xo# literal 0 HcmV?d00001 diff --git a/distributed/device_communicators/__pycache__/quick_all_reduce.cpython-312.pyc b/distributed/device_communicators/__pycache__/quick_all_reduce.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9a36038a2a4951ed56b549454b4d27886925cf0d GIT binary patch literal 13622 zcmbU|ZEzdMb$7r4#1{#Y0Qe#C<4B66AW@_~C|e>$N&J#TnW99Al4DLB6vQ1!kochP z4wQ(3wd{Dtq+?fBl15bHR8;GZsj)kz{xtGTJFzpJN!tDa1iD~u=ow|=OzTd6u%bF~ z(m#D~4+nrSXgghjyL-ED-@bkOcK5xvtA8ylG*J-z%=dh+?V+gO;D;J?sm$gsnxbw~ z9L3RoDnO6YH0Cva&8UW?+5j`EBV}5@HlQEXr|Js=hEW5_Gk#;hG-@V!o!=5D94#by zy}u}69kr5tfxkFVGFn3N27hV5Hfkezqu(BIj5=saL-8ei*?T|>hS72(mY65^E8p4i zd#Mj-ZyK%SEWBp&ne2Kc@6~XHuQH=m8tOE~6}?JvR-S%OtJIBFbH$@Iq^1OFO5f9@ z%4=se4HUm)7bW0`&7+ksP;VT#MqSswKwYDqw$BmFNy|)sFdP_yjOBt5^6;W~S_p-w z;oI!;gheS7aD}Et_$(jiB`6UDJ}9|n{QiK;6AF5L<4|Su1zpkv?-Jbs-W3dSyvvO( z%)X#ca`{8!=BOg+38^F8Rj8p|B_yz~l(-abDI3$7Us7 zl#5aYu8`L&@{();HU@CEL|=rLbui7m-$@HdkBDqz|L|g)6qs(C_D%1T+ycz+zU$#< zj~#@eQ5rHN3m4d=Rz^NEt+8w*Q3*Os2HKp-6SdE7vT*mgDS^Gvzie!olL zxv+NsXp&*?@B zX7$bjxnvkec99I~B0ug6@B*%|&k@h(B!}YlIZEb`(;ypMt{^T0muzym0wFH!$GpYm zx*B%-Q#G1X7i4B&XygcqpCNI(%nY3$>c3A3Wq?VrV^pC=4UiFcLi8%NW;EUCSuJnGY@CI&y-jZi#o4!M&2Wxa^`myK4Ek_zZILQCZ; zkc6BjAxd-l_X|`NLG5qLm~flP37kQNB)rIs0w**l0N3-ZmG_M*hB<8(7iw}E!{^tw zXQEo*X_|jg2EIyEJW+--;gKoOv)U*lr6qo{NrlLyGqli_B?I}!Z3Y}pwZ&IsQ^%*O zWrS0E)wWTM)S~89?hBPtn_8AnHRLQ&&5x;{q~&l<6Ks^SF~T^NkjCivYK+k=Tw5@; zs+jW8p<)*`XQ@}{s0R3rrwj?wK{YqouI4i7oGz*poD#^U5~QQ)T$&qE-DIa)nn}6B zG(Yi(g`j@V>4Q~S7`v2Oijq#KIdui7E!1>QAJs=SlP6WUOp426t{{tdJ()T+7cCg4 zr3|0a^sDu-0~ft-RcC+BAf+h-iLTE;sX3LZKI*2?qZy|>nioOR=8RuW1*iXGsJNwh zV=qI^nE-2M&GxAjWYVZ9YS?z&rS@A;q)kApIYle%OLOL^dGd4yi<(pU553!=cgvhv z8c^#d&#Jjhs=_Psl?6MPfl+f&i^?~U!sc{oUJi4`Q!rMc9!55#LT1t}dmfyhxq&ND zd%~3rIEHf}GKL01cr-N3;hgar#3X0FMmbB}31FXzO@&@!PqCmRD>^beEre!#951rL zP;eilk`VStz8Ri%dpw}L3vS653bO7X$NI!H7Wa(X=Xa0!c@~T)w)b*hH-Wf#zBj<0 
z8#uIoKP&Qrj~81_=_aX;iH@Gty`P_fKE|PcP)@in~;7@nRE36jVTA8BESpJ^df4Y#LJz%f5dfoVz)3Z zcBCOtT!{A5LAnDeLAzWj!Vz8z2~%t+2>kc5L!ltw!ullkn$PcN!y=DOwqQbR9Pf39 z{SpdX233YJ3J4nEN{@=pgAWaAm??HyD`-%3ndblr%dumCO@oTkX|A3-VM zIfajtHMjx5!%0bl9npb(}Oi%?o&r6Kf>m`UYgE^Wl>~K7%1?>nsOC zAT!bg5Gh>?u@`y=_u%1Sa@7E zkVyt0nehc-Q$d!=40>W@GdknO+@2{u$jJpgsSJZNDQoe3!4pj`xaJmuAT+W8!Er9- zBFqu)C|)6kl$tUn0-maJnd{Q!3m48`9O>_KJ%9e#1Wa|X4*sLER-3M_j?slB!Yl?pU{il*ok9u21W`CG3B5XWmC%a<>N%T zbb4Y|1n-w$bzk~8;Q}_LT`)a_ZWkFZEy=bo!-`@d0F%u`jfm3+9Y};+SEPIxJx|@L z72T><9D|19eb9?=l*VXUvkY!(A&pr(0avkY)z-8Wept6^JGN@=Qm_v#3?cUR40b?!b8ca7sr}uK zdmXFR)_G>#QJHk?jyrZQOCLn;M^+sj^QJX>1<+tz+p!B0>zdiISn~Rb$ClE^X2;v# zNmlp7;b-o_fu(vFpdT0SSTDBya42a%8n+)!6n8u|>R?0`7|~~D$6MaFJ*(#Wb;pj| z-@f(jrT*on<%(GS!Gz<`!m0Jb@;haCUZmnEwdZ+Q-mU}I$*0z|T zEoN<7E2{#E%%9aZBx{evYmdYXRlhRVu2)vS?Rlpm*4&wF?v6Kif7Fm@K9#6Bov1vs zaP~8(%uN`~eqMvH1LNcj~b&Ixl`rjS6H?YG0 zo8{P=WABEJGTJvNfP8A8jDefK5kpnkfBaiqlGIAH!GUoU~LWEYg1ZUt(=eF9SXrlT%*oZw_yCa35*a^ax)<$>PkTo?QMF8B7sNB7Xkz-af#!1$?Mo+2#9RTvLz;O#@-D}EU08eNBvQLgdJ?psRPe| zF)|$n`<0k^pvD6wV7!C6gL?6)3jhGAFMu8eo(07g^u+*bXszH4kXTg8TDKW)Z}+7U z7k1$qIqyPDUHcIidfUP9@U|*iy7)pU%%-deqWJ<=H&|rXi5|&{0Z`Y#a3<>Sfpa}A z?6nCW+GwB+`*}B^k{1N9jN!-e(IXd3xJ9t61i_6>P(Fk+q&zj#vO)E12)Jan9nrb&ZxL17)$_tg|0UPC z9$5o_TDXFji-=mNFw1pS5PGl^qO3BmX;90}K$kinpuY+{+@rh12{2?i*I3x=1@@zl z)~OSoM+!^=xy*$7c4(S)pRW)<4Im=kjT7ZmxsJR7pOIZTN@-rz`WQuE4l*PreN1&Z4`tOEDQ_}TR%Yag7 z0;O=hrftO(ujxc8cCMH0OO&X;&M=#zO!1~PIAeL{qc(ZE6uCL9gpp` zNqbA&-jcK*jN1>+_pe#)Z(qLq;++@Y@Bd)v{!px{Gj8o%FD|=nxn%)m$norBd*#yt zrlb`5EGd5s(7L-tcZwcao0i*?drv;vdlG@ox6B(Ft)tK?G@6O(vUD@^1 zwg+t=btdfRi7LJ4aBNTaoz8y}fAP}CFC|WY3&<h6-9;+)%e_XjrQN%i+yJa&Npv zAH!)U`dBf{;A;$MWGStKqVj2E>V7ZMv$T=Qx>3oaljVEvkhy7_qa`!+O#n$!N7b{5_SV0w*CKb$cRog!WmF!DUXZm@fYz&#}6eWmyM(RL*{yoH}Q2HX)g zn;`lN`ZVjs5`VoDbrl~hf4Cr-#2fYb6N}&-Vr&iXq?w7TI9n5%7 zkoXKlM9H!@6pPV#Y@84B*QbT!k!NyNXFfMW>oMYj6OXs1+xKJ=EjIPt@m9<=>lL|y5-MJ04(|0(c2D`yVJgQGOy3#Qzmd7Pf;d2yN)v%LVG zf}qPo*%cYIu4M=MB2pGLoLg{cN#M4Y3!oUcQgCDhXJupmbq_y{uGq|$tZrdCY$eLp 
zv`0u`j!g4H6i1})UniCQ>oAl>8omLC9%JJfh=3RC*4m`CIc{xUwK~^q6&soYW97QN z^7h$VXOs5!xV?R4IAMQwzJI;EW-+`JSh<)eKQe!2t+r{oAXc+~{`|VFEP?EVC}Hb~ zS$e)$w^lp>bGkCM-^F&G`{mVD%ka9b;(uXBGgkgV#Ot-iZf$oJ^>LN4N2mR`v8;#E z{+!Z5ev4$KS86G&(63vk`E_1(#lYG_pAiX@Aep%VTBDto6W&kXS zg)41eWtzebF5HW6gDLBr1MIz^ebYA*k1w%7t1r#3QSA_o{mz2rc|>jLFWWR){9 zX5+ZiA48Ms>j?F4l#)NSCd!HZV|TTkIMdI7a?cmN~>Ae9T@EJo=DXCMb^Q3%R} z4ve}n+KV718UL0mG&TvYE7=OIU>O8c%|s`o9M$?1nqF)Nng-$&$)d!;DZ+gf>b@l0 z`ILbF0>IScD{ugmI+7*z@sfJ*&m6sXG*Pl|UZ2-LV=I5FX|W?-+BjdZj%P?s+)%T4 zZCSs<{dDrd7KOB4j;Ii;4E+ch)6BW zBz)usxWR|LG2eNUf9AnbR9d8^ID=6=p#UHH5OacBOC%SpWKI)-lZ(^hn+n94eI}`^ z1<$(t)Pc>L_XUJdoH>v3z5al9aI-#{=e@@iSZdd4;V|65B7{AMeE7r{5tF!m=LD}t z@e53*@9Q%F`Q8yB^whWU!J(6h97>CJ?*#9eVtroaQY&@;1NR(h$8{R=l;8vu$|XV( z(Kd-TDVJoqllw+qxX^z|KsSkiqa`l_1rO#>V-|cEq527r=7ul2&J7F^g_vl%!Z?=e zz)lSFsl`E_Gs*_Aj));b`rEoR*xoY#S$Eoo*SnOQtv&fPjU-~ZTRUu2RMyW{Y)?1odL z@CvCd&t1ETFnhd*6y3VcA*|a=B*NFWOxdZFquc=1If&H znvi!S`A&d*uZI9fo)h4`RKK_`Kw|00feVV+3z7+zYgpPB^6rDT#va(f6$_IH0A38k z-6DCT3fncxoNVF))6y)s>%nMD@87sTD{#vGjj|`ag%JuXylF!-iijU!Xi6tjwqve@&vwER=k%rrn#o?vWJ6D#5mrL(mSsq>~y?m#dC)a6{uJTERQMZ4itW0NG zZp0hAO$xGbD|nV3qMuMkoo%WA-NAc+0kRJpf7bFLKtT3MnNC-?Sw!m`${gX$I9a^B z4F?1iTV-D?As4%?c;%dW{4blcJ`|M}rA0QP2dNd^jEcvO!EXU~Xl0oa4^wN(eTv6~ zTuWMHVQZas3#e%OmHz^wTwyEilzVXj4-&bEv{U~NiKv*DWu!1H`TSxlq1i?99czs3M(=CqUp~Le zIAG7Yac0B9(E7#xyMuQIHz~+IEz;42i_B9Bl8s_YUvfk9JH4c(9Zx8Xzu0J_3?-lp k&<^k~y>bSOM%wWQ5p}7LO|~8#^>bZ8k5&8gA{yfV1F$w4w*UYD literal 0 HcmV?d00001 diff --git a/distributed/device_communicators/__pycache__/ray_communicator.cpython-312.pyc b/distributed/device_communicators/__pycache__/ray_communicator.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bec28142da4542a71cbd33582220ce8e26f8e374 GIT binary patch literal 11299 zcmc&aTWlLwcEiW;EzuGsQKDpftd~Vurq+oO$6CjZ*UCe#BYXYWBy3Y=%o$0fNe**o z=vZ8;aRTo`?xt{D6tIF|yFxz-2Tp-}?8pAK1q$?|KtYCXsZJf#i}g?XMX!>Nra;ki z?wuKuA_b(|0v%g3_j&F)_n!CVKXi5ZICwJ7pQZj{jN^WT74zYZ!uDT6;T|V(k}bo< 
z@oUf6vi7*$#@ZZl2kmplo$z&J_^d1LqJ7ScJL`#ivfj9t*7=Mt>yP_s*_8=oyW(B6 z?9Oy&gYh6Odop{nJ@Fn|_GWssp?Ju~**UpK?t5Tk7{&YT+$)^qyTwU<+4I0*v<|G= zqJgilvKY1Lj?=kS_yjMCt7lfkTu#osM2JG7c0F)L$z}_=R8rIwqW4`^NOI+}oKqEY zI+IaoD@^oUl%+yaKA(s3z`T4tm6V(3{XkS@=m{*#S|XoMEEA=GBmJpdN=syvMK{OEyyQ3m1HGIS zm7KCm;$^qwf-mir+_FdV3_-u-mA&ac17g!BJ@FtmlGL3t2bhL-Gz94_4R@e+T3dSYZ@s-uE5Hj=|XdvLKD$;vN{F;|HKV(NrIm4oQ=+{uod- z4ip}Q@`OAIH8<3zZaJ-)2UqRU8NCO1^1_8?ZYutjPk^VnNzfGGsw@;#SrV2MBIHwf zIg`rCLS7_dCIe+PE2MG)z`}X?R0amVLd5(^jKaFSl2Qfu|1LDCf(ZPXlLT!=7OtN= z_nMHnZ~+Db&z?~fBBgRD3Nc?}VlpGDbb3zFU=}d1bXXOaWCnXt=MzZ~BHj0{LNZc9 zO%-L`jV+0kq&tb2yQX)ENr0IE(UUT=TI?s{YK%6;PGiTbv|aZn3zBF^hduW}UxP;XyVa?DJ99MEe&2Dy1f`1c1( zyyi7Wng@367{|dJzu7K1n&vp(FKDI^bLnn#Oyg@pQQc=b$zp;JY+8>w#vGCEHCq~A zz?|x4Y$Ejz=dQL=VCCO)8jc^GX`w`!yNYT>i9H|i2+;YcX| zAR{k;7EFm5U<*M+4sGRjU?@=&l48zcaNu?19(L}s@NP<5v0y2{w4?)@3zVDGU4_G7 zXGG^TSytyPfY{RD+d1Jf2t}@ty$Y)+ODJAiDw)cQS}M0}tiiY*=iy`U`J8ZC0EFKu z$W6F|pf1eE03;K>bU_92LE>^C!`W8C7&L(jr6I+ZSrSfPSP(!&P=ZkeQ4O?WfkMV` z#}Z<$;i62$goPzxQOU_O2s}quNTSjpCOolaHxLtx?wrPe$sG=knmJ)gjxEP1XdKc8 zNg}E1VEtOrrFfaz0JjeG7hQ;U2!@(yoztWnaK@P{^QQ%7NO4c4lnRHGTxr^U&=`UO z+D=p05wxLZ?50}C=M|#u+|pgu&z$i3icHxM)`d|!mSi}ot;pk^nrwh4*kR4Z;`FXd zng?Y~E6z1-hromah|2Cz2TjL?Y_eovNH!B0-o> z_TUS3D?xQiLij?TNJX8G=K;^RZ_R4i{47}0!RP4X>MA{U|pw zx$b=E8QeNBvF`lPx3%vOR)_z^KmMq9qC9!A+Iy)SxU}UTDEmh@fBef_^|_b7IQa6T z-jS_6=e8a8PkPbssml zcis6XfjST4Xm5A~>)SvAm$a-cR4`_sw)r2y>fS@Ug+q2cH|;6z5~txf`lOv^sqr-} zQQFmLGfQi>k}d5qYmILjS#ZZ=!dbIR_BBVtUYG1?mZ!PkdxJ2{9gy~RKCF(o1hSEvyHi<;EH*zK?fU?iyK0^i3_Si%xfvdlJv ze}D!`Kz-I^GN`FgSCm3VLa&HgYe`UYNr*KH1YHqSE7Yd~&qT>JjLqU#0^nHW3)<4* z=Zhm(a`KHlpaszYGp!5rLPpLlYb$dCM5Mf=t>$Hd90fIpl*)dEoE{yZDwqsISr4Jk zCWV}q%A!9GD*lG1cbTCqg{pfEwG95gV@1B9d#|o)vPe+nyX0I_k!0Ng%j!J%`pFgD zhZ}IUu(Tu-oiC(v+Vi?yQ*;;8tGW-Ilf<%!CRXpUASX11*&iM@3j%74Wpzg~U(k8* z*Ff32H3gSd5e_aH4G?5Wf~TzqnB7Q0c%l(KaQe*U^A{5f^NDjWE$ZD?ADbVv>KaW1 zKwM9ektJ%k(5C2K6%2+3^&7fd&SA$5-OuJRARZHl8i4U8r{I^MaFG4*Djw=!6?QXo 
zZ^8uiEF2G*2_LBUJ9D?^%Kp7uq5k!SCxgSa!P&~-Y<2KRZSYiO@YH(OR(QM?j#a|3 zTKJht_?c?>=z5^;;szt@fk#85nnjJJ`P#z`3$@5}B{F^gW8_}(Zm|}gsf1_lC;sL@IXqJh&r^g($I-&= zd(=0yG0uSgbKjx$`K`f`jZ$Us(E7qwcz8p+dt>uNWngywm95C+=IZC+a^!d|@d8rzCr4~6~iJY%Sez^X7Tj9}KI9dru?;os&pILu(Yxddo`C4GK5*Xcl z{)@oOQ$N=~Loj=xyNIk6NG@sURcK-lwOnY7?s2Q!r?$mUxu~6S6mo!y!;Qn#eU>=N z#gX$ z<3BUv9&5*{ZP0rFK}Fx|Cbty~)qGVi~HIrlgXF`!QgvR0;< zpy6(l`2%CUbE9ES8e#2tER}69*+C(9F#Xp#671YlqM0IaE36=1a+GYJfPwybQ_vpe zi*H&={37K78XYZz7SAz-c~GUg8T@?Ix=cYeDR_wRl8UAHA=+qYVglI$u6X`5<~JA& zQ~hHAfKsj*p`oGZS46NKS7lgPf?#bIMPGC=fJK2uf=e`!1BstTzznOg{3EZs!L1iH zjgT2Yg&f8gB1Mq7q7KH(i`ei2yj0{nQ$6V(s`bGtb@E)zz7KF3dJi`_&(`Tux97SdInKB{|8o1V6&&H{jJ`3CQn5OL3rWg&0o# za~ypfULf@X`lKiR_{5{$1Le_$YVX-{;OygIh-$pxMAn&@8OC3Xv!Y<3YW*P{<)|Nl};G;Rbs$a?RDqJeW>Sr~HH2XWJ!nva1$!T>|qiqK9hn$VE9RV{E!Z zZLvs{vK>t@H_J=Aj5dqPk_LiKlgt>M3Wl0vd0=8J#zH(yUgB*LjdMXrX!vU(CgxTR z*^LRO=>=Bv1uY0I00*HWVhZoRKnXySQ^vbnK}k{#B3Yn}Of-?Rie}1-l!CnA45ba} zX<&rvv>B^r+S)D%D0?!Iq(w6)&f-Hx7~S87P@*(b7G~3E@jFiYE=s`MVm!dsMJ<70 z3tLsh60_CtNrJ9|9l{MU;J{@d4Ru>VAq{5M8dAbwXV~o{J@%BD^4*bvAaP)S%qFNL z8TIVYIOGU)>ds4eMd*aJud_t4zlranZswT&0gR*yf$Ajk7M+N9iv$%_a~78c)FQNY z(o?t!1}N%vc!8dp{tkNTQIL9S8xz%_uoRyL%~m+r5wE7k7f>yEDj zoHtzc4Awm370>uX&*Wo&aHj^`3ihu1n3D6L#)K^PlI}sAu+-xDH~>HV+s3G_Ih<^c7bK{Lq%O6_z}G2Wf7rtWWWJ|0Fp z_Lu{qCx+R6_&HU}&lXl8)3kUQLQnzc?hHZQhD%o*YTs0&>px(g>W3HTJ4pG_>zQ^O zpr7G@>;p+I8?353bID|eEI>=!!o3*sX%Jwlcwd^xiCMWg-i|~2sQ&_JM%HU9*x!zb zwHebCkwzk^6Z@JBYG%Rgy0cJ##50fYqiA@N8MQfNA6%AmG693vaysi3I#mP0^@%Rbj9L66FMWMt9tZsGvSS>sq5 z#TWjlguVu#p#CT9@=v&@cAodZEhMCB=PUlHs&lH|%N_dO)8H|F@8iC3orf>5b#H&& zjU^8Und|x6>ADwdm}@SNJX;ALt^2Va;0A{8#qY*z{gajc$@_hk{=@Yy?C9pkr?wH~ zZ;4IdkJjgHcD`#vh0HhKwW)36r|z-wW1C0IQ>Q9pFK%;Csk^%Q^R|uTFH#?;YWtq8 z?0dG(Vau-^+w@x(oP1cYpP3VMAJzk_ku@vBZgWFwLI%nhg7j2Jw zg*uM~m=`**?WV;6Zcw;ax?6hcb(ur3K!?=bwCLfw`nPF!eXwctJOk+B=NqG;XpV;B z9?sSM)NbSFwjEG{Y#%=daNF9pT@HSv?z8a&n`dwf2XIgi4j;K6`Ykk!;QSthG(x3` zazivs)_@^&7u-)l;+3*JV`6HBx(WP#Oumtn^LYEE(uh`{V2!|$a8s4j#7ry+*>6a* 
z#F~?%1Vcy4-FlA^Xf|&48JnNP3;$Ttg@3Hmg@5cS{OB{$p@)g@{x%@yPxasi3X~E1 z`GcR1V!se2KBRbnwiF=iqQ)qG30h`0_IUD6PB7hal6(w9L7F&q9$s~a z&1Tznakjvh+`w-*-@kHDnEp5J@|WD?H%`0F{tfgU|B@T~k~{KiZ{J66RK5Fdy1sU0 P9k!8;z&9Lz>EZl0n`ID3 literal 0 HcmV?d00001 diff --git a/distributed/device_communicators/__pycache__/shm_broadcast.cpython-312.pyc b/distributed/device_communicators/__pycache__/shm_broadcast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ef27ed7cdd8e3b46c27bf7950bf136b560d4ce8 GIT binary patch literal 31362 zcmd^oX>eTEmEP-p1-h~Cz{5tM0T2YZN+d;!3qX++L5id#3$om5q8~tjy+E%UqyRNW zv`5K+GHpS#2^`BMXvP&o5hp_@o(w&iifPMMR5PheKLT>M+fbQP;~%3eRiH=@$DUM@ z@7%X_Hyfffm84RYN8-Nw?sD#a&bjBDd;717ikuvtkol|Q1LryJZ|OsQtcvH3+sJWm za6v9;2yw&m+c0ck-^O7h-o}tAY#ufnD9#kJgssEYux;4JzRe+f*fH#2eoM#@b`Cq4 z-x_j--NSC?w}pzr#lyv6&#))#9rm&?d#EH_I$X;9hEUmX87r?mTrpg!gjEe!A1K*47Y?^hg;dV zJJc3#A8u#D}?JE5}ywS)B z+{7345Y;Dans{FI>+hiFs`p*=FFnKk!4hFpuoQn~!sbzHu>2MC@D{*p7q-4@P||D* zRAID&zAU?dcX#WI%B zaUm2$s-oE0fG7n0VIdq9rxD;jF+OrGB+{>i8q&s1oiGvOByKjKGK&t@vdWB&7j(0C9*=ePR< z!JvrZ&;Zw|@fjf~w?1P!Gd}i?A>%kR9T%cvP!KZ~WSSB(<``0CtfCN~5+ftZ9AQIa zhD)P*rxVT_T!NcWW(SK^oP;50xN20#?G)!T4T<%L6C2>5683o1Pi4k@MzM*)c{njD zhUb4hyeS@@+%!2pxiKCP(YlQsQgO`-8KnxiE{>m> ziVML_LE-%Ph~OWIhQm{l@sU70D#kX&&W8PG#AqN$!`M4HopGxL`_Xc-9R+Trk}Kd` zPwpY?jVB8IO1o~duOAe;7E33sT){V6vG1~Wk_(}07Ckv zi8Xj;>rj*roDYnL0%t-3FHb{Wo`$?uDgI0-I&zL5jZQ^^{CI?)`CUFHU}6Vjz5MC3 z<1s!yJ&D$YLeu=&z$8XJ#$PxqMEF=VEbx=^>=9x-ra0mwW8;xAJ}UCTD0x_#_;^^@ z2!OGvFoj)+is$%Fp?9p8M^{ava`EwKq=&yCjE|j-Z#s9L9|>R<@-IvUBJuGV7VGPM z2ZJVdpp`LdKQFey&6I|skum>;046CF;t$8h#4bc^rF3<0E^=#TgQGZITs8OX8fU8U zt(nXYI97{Q+<^EI{ArKcv3IHw^ak)4VR>M9!zc(+Mzd?~5z)nB&>t9ykDnipPtVk7Kzs9|_5wuA0teH> z?v|^qRu!Qf%~@8iTLT_c6?%}SnYal~c4f>lG-#$+uK|7Ur$jb57r9k? 
z+1H9#!9zYv#_sn=0_bdi#_9Kmqrs^V`Q3g$1}r4Ui2YQ|7ILTqSXMS-Vh??ivj=}M zH=I|vPmBgj(H*P7QYj~-=$wI}#?y0$8j(FP7#4fbD42sL!3=_6e#J6u30g2v)}R%x zZQAB@WLUdSg#=-ejalIP=pZML=P>3lu*)RP=kbxVn7!u(o@NwQfjTbyq#%w*gE4+Q z&SNPAh1j^r7E?UR1E)`o2n96j%^gB#~(REY^Us zur5TWLP0(x1cE%kfjG#(4xD(BkDnF9a3I8=MWh&whGjqjl=;Gx5R3ByQ4I7q79|#u z5bNQE*rYHr&O&H;vTB8=LhJ|}3fUWpIj2E!ZC<;rGDuTKR;)U@T zRyEK-G$NoDB2_LBXVCil`r3J_+*AzIuGh%|Ph+LV#=|HCwTeYUQ>@v9HPjA5uY^Lh z@JCtYBO}vtW2iKNIyDkykhCrz1^Ni^BZ3$YpdRu7@X=A8mUu5qnsJ?(icNzsMJN5S zjGG~V7>%$$HbP55eYB-<5+QGJ`qG%7n7l__E(y-B#U_AI{TjrC5nr^r*`qEhOU6t! z%Q(VO3~v;eKT`w-Ag--#@nNJFcf#?R#6I$C0WpG&sVJi}tSm-w{WDdx7WFyXt42^e zV$E={C>xsJ`qGszEjMhOGq2cd({&r@U7fD)oHKuDulW=_u^svUDV?D<*d*=-1o07a zXc_0@6Sn$i8uOZvht-}&##jlQ+?IT3uf4zK)JgPr=Q^CvTJu>04c>${T*#_C2vwPT zH3;t+{G9{iD)L#=#GaKv@gg_dG~~v-t6*Kiwecb~PMuOZ2Dat*9ZiLdg-u>IMHq$# zPeSt~;~1F|MMl~b&O}C{gji?F^7@x>HgV)J8WMDBdG#AZuMaJkx6L`yuJQ$Qs=7<6 z?pivX+W3fs|J9G&bUm`__P%!H(vkUis;X0}>ip2{`-QveYsc6K4Ec;>fP}~X=-3}z zq33&heRh#(tGJCEnnxL%-ye*QfB;xVL(xE7Bxr;2=^@-D9>AhpMSmYVOvW*~z_Nc>fCxjrKEI7De5r~2bM8Om^uFpV06u2;nQ{=+dh=aNuK>nNhFfqYpnlniE=XuktMlhtQ!*rNwI$3KzaBIL z8u1nEi4zK7&qS%})8CYKqQDxwYEc<`z|!K8)~n}LeG}!XPk$pNu^xF(&{WH9<7O@K zDm7w)SgO1yVTo6&K@+vAPk&#vs&$*7tw!DxH0qclh$$L0si}b@OhH>t-_X8YUM zP1Tg!I0NV4KE3A>=BrM%q|eYtyi!ost8mW~hKw1Ha9=TCncY|aR<0k6D5{N+oZxW4wy&^v7!gwgeKQBaB zaoR0{A10F3l9T_<*f@?u731(n~WDAJVDw0){!t)VW8S$x`$xvWA zIu+lk=b$)<=TRNmdZl z`K_mPtSW3P9!U6ZVU= zX!-ybd%@8F_E$PS$?K`l@|m9|YoL|d`l)dAm|UyA+|PdXQ=S8Injs}ngOy*1He$?E zFg8C>ABWTEsu;-*38!g;LVX_t)(wTC7h*g0)SZNN_N zR3I^=Pk?CXnOIBiZS1@WH?x@WQP!LX7Pl3};X&=;=GDGXaoE%=<1d$1sDHH4W`bQl zE(Rj8akdvHTBy`DYh<1#5ne!vz+!>851+CqkWm~6JhX->olbJSEj~soDT+ynvqqB& zA~_)l8{7N`g8JGxPNXz28mBnSASD@!9gm%bT!iXJ)STj29pp7ESDV!%9DvkNi>BE| zk$M3q@g;~cnPwMG%y*88kfiic?fTISf=1nqI#C9qv`XnJ_-VBvO4fe1_ zGgSI04;?=}ATJoED9UA6T#tR^&1<|oT58pQ3QgY0v|+dQ!GxL9oQxX zPq9f6VbdXZHYoLy**Y2jWi2s6fAMH|e598@JSy{8jG>g-G!eo&5onNAnXMkiD~?fB zpu-tEE0?~7p%{!JIK2P_^6Kbq?$K}$P4F;D$ytZ88jeE&C5DCIIH6?LiY?&sm^@R# 
zu8=cB_5wPk&5MImjCCGC*T|yx=V@E|JoV-fvAZfsRC#MKkCaph_VQ0tArQ(z!9~Su zAe97s^vndxqPjC_VwQdYXcKCLb4(IA9+x{ZKMnPZ1`<3nYa!K*iT^NYFxocStHlDgq{!XEJh>WrnUP zma(3@K<{jUPsYAG6bPRQ2KLN!=Exm1xnS>Z(9=LDwg(CbL9tg?83x9OfFZWQ!MGQK=N3o_MGIBQKBrL1|wM=3DK18b0P{=RRo}Be`rettD zBn(C4gQS!i5XGqIv&y1Vs=HoU)}pN1$Y5hs17a2w!739<%~TfdIvSwZ7)BksqDcc+ z(>!OsT$FBTT&%wGMOm`i)JyOBbZr}=Jh$B5l$)2_{NmPB>oy7h-P_WTqaD9={I zJ>#|k-t?n#!@w@nk1F?Z13ktcSJKCidCP#$_TwJMz(bZFZ!=QJ4g>igGLe6mWzcNP z(YnzTH{tJ&2!|rB%P1fLLn!6Wdl0J0tMsTsQm9e$DMhblVM{7&S`q{{H}hrV z87p+8!H_Uho;w@m(D#uowi(Vv4%6(lW0#I$X34WnnPTj_QF1q?+^v$kbLRdEV{mibx?-$P?`F#z8GEi2C`TaB1xsB6f{u6S> z>fy)@9J(|_?K*nt=vR-eSxnRvQ=z74gS?2W*$RLJRNkssOx$01m?t#RB{D0I+CT$Y z4-^<~v2Gn$4i$p_)?rNpqfc`_;}AoEP?g%w24aDDTtSJJkx(=yh+ja~jCCpkN_g(R zUL(!bxHqIdWv{*b>dR?wdCJ=& zd0SFmpXBv@=i(&7jvw z#5`*xfpx(Y6btcnp+#6|OaMm~(3%tG-#5NtV75xIxl+fUlq~lNAQchgADb>1WEHqs zd}xjsz-HN3VfumFLH-KMzDC;*YHjfU z|1z4SM-)$!qm8B*#%KPhxoLD%u4Se+cXegQ{S9T#9lbWm+qU@9wbQ9>ha~*(I3#%w zX`|OERd!0nou6a$#22W#%pk*^uTC?8AIg|uQzR?W7^W2{`2;zHs>MliV&vr6Rve-* zawu2q%Wz)d7!AMe3K%Rkw@NG4EaU-mQroa*V_uNx>NN-RIyq12{J~2TcUbCA24N)Q z{KRX9<2GA*0LWUtW;9zW7Y~5CTPp9E)Brtd$)rc!rnC&pusV&PI@+URcMDW0I!JivEnDHv%WEjUxQ|7fxMqESvY<+P?GRvH5KaWeX#VTb4}8nr^AAXDKd~ZCN(&Om}oA z&D9I7i{>j`%jWKMXHU{xv#@2c{L0Q{b5FXVC220753ZP7~iFIZ1{TtOaVrokmcALw+=(CJ!obh_4{O|U_KYllrJX>%PIS6h&SQIIoBck6_H zmm78o?qEaEEfgsbjKQK;EGiWrE*9KM>5Y1tV6otlQwKeQ7q+S1tQ;kI&`O|VF8vHJ zN|CM%buAOhC%Vuzc~4g9Hlb3sht+CXg)%E%F$`A=4%Smm!AhYfScRUc7QE<-+90hY z8?BB}BXjM+nyfn4$@$R7JmnPXX$8SvR>_Z^vj^+4(l?OZYsP&Hx*^#9JV83HA$d;f z8%L*NTtp_IfTlgxw3SxzuGH`J+VPub1Ye&3Z4V%!^ z!)gd&!h}Q5zfSqhdV1A|TnfeoS4AcWT3aKqGSBXZAUEq^AzCY$LSIWCw9GmaHl=6( zBw>v?|t@W>Q{V+>Qnm$X|!x zx;EiR6eXNjUGFNZ9HkO#%bo|!bn3<3YFhoR&wlL0iiVgZ@AOR3ekeGY&^pSu$3Fg_ zcm~B|$dxfXn=w2$V~vkY?%cE~W7z4l$Y_m-Y91k-=|2DX5nrLtuMrd*2nrpqkjH`XGWRo6>>A_9A?yBAbqB zLe#s6iZuZHLK0ZXxf_$N_SNS0ZwX67%NuqrH}8hj0n--|@}aZnyzg>JazRYhWabE+Zbl3&D8zgr_%H1isJFnJXi?6tMueLt4+`2RA=2zXN zuRVV0@r9;j*Pa#kUbgv{Q@2R&mSpRW75788n)tbcAG+&r)izQw>DCTpO!JWbp@ei( 
zGrj9?)$vK#i@ItTp1A31ylv&m8&?}zzww26XBw4ie7o_x=H=#XOa_@HVemE>2?H{NPy=CF;cyzh@(Ean%L_35Vm^yck<*8QFCw}4mE>{ z)*;d%k&wn7sPPr@lFng+zq=-0|GT;>EFc*uJa z+V&FpU1}s)C!=4h8Ed7wJpt=k$d)gzkt(&X070`rOc*D46}JAKXx06yFW#p5f@Yn3 zp*;oIomq1ntgrk`bgMr74YHmMqi`KI>giPZjhQw|42s_WpMA`ZTzYg32%u#%gnm=gEnG27q;rjb?&xJy5d%#?v@$=b-24y}f!_9aP z$R|iNW=j0$s1f6(e9hu-!cQjQ5#fTWC-{AeW{eG^^Ng=LL}6bi=Nx6QfJYHt6c;Gq z4RWrK^BOsX5X5;nnG&7d@za^R$e2iDE%OGyO}^iuWW+Pj2^aDA=*uib&c`z4Pl0dq zpL%>?;DrC_#|DP{hYvnJ!0gH8^GJU{xrx0HUnhrH2gXEz7bH^k^6p z<`wszbS(uTBM=NNJ-gz5I4ga-OQF?wtz5e=)%M8Ewnu)q zc-hNC8&On(!q+-EZ~2_-ZU=YJaLn-DuDtNIX0EPDZjH>jq}w}`%KMbcvv*T3Oc{beiY*>eJp*C)pYPT0|qpXD6Ah|azzO>@r za?4XT=VH7;4qHGZmj3RX0AINtTOj`XEGbAqeTa&j4|2}|wc=pp&3Tqi0nq%T&*m1$WNk639IUX4aO{u1KPIYf2Z$AM`=(&3FU#MjXBu)4Yku^LgjQV)tztI?{qo*d_hRo=S+}q?Ro5%k z_5P${)4YM{<(egLbIQ9x@^1Lh+X=mBWgnCyP}q4|Bu~rY6Tl7lE`fc8o^bKS6>lFf znG9}|RJZ9T6@3agS=ZVjc{@Jz`UqUV0*)~IV&JByLjk8JT(W=Y?O(HTDBjK$m!^tZ zZWguNDyvGBwMk`dsWP8bhW%#u+`)8N)xtwJ%UY9A{Fk*dRblJRqE?o)T`Fr&m32vF zUCU)X`AGr1xQ$_^jExQ|!caeFe?a2dR{Y(09Ej@ERJ}3V5mL7!DLr2?=nKCPQc=^* zVoA)J9P23B3oUyryaKxi6^-D8jyX$UYYyTHj#^HQody3JG?64^)|#-2uOuJ>D~Qwz zEWdL*FeT6F=W5gzKoCnPoIRh`Qcd)1__7P)XIZ#x)kVZ#F(hnpO(2{wk&2uNpgXco z6j9al9$gHq=T_URzG29KBkD8Yg{iND?4Qm*UcQ0d4f=)TW$W}uLtdeB%1<8$d&2(v zrb3GNxF(t=f#hfHf7BnPsrhEPQS$qMUY`H(b6z1_wU#&`cvUe3tPkoxvfq#22Q z3iXx9c#|3`AFcg5`C{apg9D}TR9H4MqAhL4O^4aBl$mi{hG!h^Nn9GT|H;9@fs_8H z_njQVm;dn4>4B3^?R#7#4mo2UkHAQ~i(pQY^C~$<-~em)k^fKNK(IKL~DaYS&v zf*2ug`*{|}m1&+BEm!?Z9-l%)M zE>+@_N_y!g4cj1tYU*8jL2BxscmK4kY4OOl zHj)daHV;ah2XD3w-Yw=T_}@JSJm1v%R^&=#xv}?Jt<<<{{!qHAF+kh*rMr_QyViDd8#?Aaiw0Qf+(Lt%ef`-~g-@!`D_^nedMsVr@Y`enQn?!vi*9A)Uxv@HM{2b-STzKA79*e6Q+9>u68#FiUu2eeyVM&)VB57 z3sT!d*AIUG==YAMcAu2+zxCw2c?A+WZ^gam405gRJ!(K%$%a`t@xS=PCPD z>>$SUEBm~IZH6C|?k^dvG`-jAz{h(%UGV>?(lY3@{HSi<27J74posUKmccT|`yLyG zAiBx+ejSCwvX;)Ng)&as{Lw*xxG<8X&D zE2NMoAWzYlu!(bWO(VMEgeTYou@aUMJ)f0mC(KNX z>PQeNgUiU2RVPy*Wv1AcghSU`CM;RxtX3|K;|hw+G~%G=R_mv}LC!1+Il~y;N6uE) 
zLGaCLx%wL^2@T~vYB^vgoC#-PA_h|PspF{-(*n>xl?b)uRRy2cSxpFXaMaNFW>lFAsa)#l=TyS*Sq+Gz^RtXSgo2kj!Wn@SFHIXKwPmDCJ6|*fv z+f7ef(ms&fFt7~Q)Asr3Q@Xx;>0qj7ztpq;#uHM{pj3Zo-jPM4s#@QkUMq(|b*kJa zmHSfVy;6DaN_pR{ss?DJmPY?1{7(346!d6m*K*^IWc7~EM!KN(so#(W^=jFF;~A-C z=w|iMFFphHQmMk)AP7b61F07DTzelifXtaw{C0^alEv?oyt^JumDnz=UGlc4yxS!2 zwiWLVqEpYPbc%|X=@c{~jG*Szscl53HV$@}-rMUK+-CTZy&e7^ZL|#5S$?#6ACHgs zODW?0I?G_2<9*&nAsv>%KHK{}6rOXR4ACUo3f%d7NOOM6@mtRIBs%$9rJ!-l@G4l@ zSDis~9C_s@Xo+hSZ`3^Jm~+B{$@+?W))c3b*i*>xJYk4ya+*TeEn&K9Q*Br-Ib{1b z;FLm?$<8qQUtec_MY!%kv$DzV1GV+RVih{_I1-jR z?vevrlWy8z>gjds5m+8sf*$I#S-TDy7BL=Lefj%^X~g9Na=6bs^YGJjXp+u&g>y_H z*lh-|Mu2<>H&-d51|A|O^*%Y@O!a-+A>~p9CZG72W;H|4B2n@jK`Jd2Vt1Iy=|Sb?>bfO&H}vsk)of63FwG7DVmu1@ zV#cEwysGtLEC6E$h6d@HyOHSR^v8rDKPDt6z7NOel{dD8(Znyq$ynJrKmjHyxFs!? z@jiC=&|?FqPWw-sJbwE4f#Z+MVngagSx0o3e1A^PJY_E?A);SCfQNmmNNkZVQ&jkZ zj0q+a^j}$*lIj!>PH3l_fLsR@@mE$Cj67!do_OIaN@&}JTDvyax>G+xQLc?LF zj-SP({1vSJ)$QueB)@anzH{#7bl1kEiaA%hvTDw?X0|(O-VCjAcwLSEJ-zOhbG7_i z^;hcCZqJ(=Qnme3ZT}5A23G5wbItF&RE+&2&H}j_C0FBO$xT;F+Ee@Hsl~dbC*E#c zZs?Odeb)~D+0pMDO>I3cZ9RU|b36@Jp`t^o=vcC?RP)!o>Z^!9z|Jq%;+%G$yi3BurNTia@zobT-Y#w>}M?-7p$u- z?Te#-)V*N4)!w}nczbxkx!V88^|2c-EcYK?Ff9x&?o3wqrjMP1|H!o?H>#wqk0rYv zTiPI1AGz(~T6^y~NMZ`p8?>`Yay2bB-*mNR>25Yk?u|?5Zdj7;jUT!Pp(S#+-RnS> zWJTYv?|BtZjL5_fo2&z^ruXXic?UL`e%xvq=&}8HlV#u`+kIP1M$T!;tVhblKLvuN zRCy1q9B?2-C9_1(Kp+fMxU2$zB+?Q)7lip=mI*61oe67}w9+<4WI`kM37blGKw^4H z`i1SHnJ~_*GaCv>hcgR`D*@9&mWm833n&plXq?&QLK)O&=5(XjLaaf;F1H46W6)B| z>GvAVx`Vi7F{s_aWZ+)5;8?%5i7m8sMIRPm7Jyd?Pf+{ZFS}0>o1w`^<#2MvmIuNv zz%FFNP5^dXV;`VdS189<4EK@0QH9LeXLg>%CIFWf;>v)}OXqa7C`!}-!h=J=kA!n@ zqFPbllo@K};hSSMUN4GRdCE&>)HM#0)tl>2Y`#!ARzvkc?d+$2A zy5_ffuJqu1PygG$as8za`ajrw{pFR~JF5;>wzm zO%E@Z!j^C6y;kMa?5~+I*W$kI2X>j>+hsXuwlVXE96b%O7z|rog$LNma1ol1NRe(z zPgfq35JsKdWVuxka~W|lva$kVR(%Tsp$<2p;UW~GJVDb{O@QkGV7Y)Z0-hSZr zF%3Jwe#AO7{&%Q34xrOC92EZ=uJ{og9O{nal`+#5p={@$DJDf>7ImkYF5aTF7TF?1 z7EvqQC2ss>Q$J&8*}w+n*=4ZUnUWlGO-YZ_oIDAzAk__R3;x$nf4BA8*bVb{BFmcx 
zAW!yGkWt;@&bgOXJiQpxq`Q&XJZzF`H(h%&S-WYucGsL|&OGX^}nq;QyXNVU67LIbG zq`Zm==c-HBtK9?bm+x-KQ?BMmKTRFC0RZT-Y7ePaL7VYikugq=Wn410lkJA>?2;E~6$;;l_}iGBIhKl4ANqLW2E2OdOf?-{t ztBYcZOeUN!8e+J`Z%ot1(MfQhHPC^goT}LD07euSsPh3ndwH?D_X4WT0Y65!)XNv$ z1^5_79Jdk6Ga>6j5@y7J!NK0KE^=wDBt0RM7U2Wx9Tc};yda1WKNN| z0GE(aCV`CxU@yRN}J+NvY$K18(?;{^V#f0*2B}G z&}Zf2s?R-_J0GY=vJ<0gq0xo1jjbo@4%)tBW~vjSN1K~e1-3>Qd>S2fhIX=b7l}qT z3b@*QS_37IjS2}@V-W~?bQJ+aAddm5$lH3hnI|be?ibFOg0XnU!cYVl)n^e35yDh3 zka26jr9g^iGfiUo+�z{7r)P7NstxliT<~fe_A`V2a|GDcIstNSXMrD4~lSI*Y^3 zpg=zGU&yc3L;SDw`QPAVoXYtcx({cG0vvQ>6lzWfl5_7 zvW-2JHSK5ZL_G-yDhG*Ywq_4h?`O>Q9MND>cUM*{{9~Gt%-m z>F=zR_Q-bdZUw~autDMqDR;qPl&Fy68nrH*_FaWjSKqVPm*XdgoMG22epbjxN9e%4 zfG<&aBq%ghzT*c8^Ovi1%08h9y4A8l>CF&bt2tefFT^!FpoDgR@Imf1_iIHXrZH~B z_#ArTi%37~o-Ilg#SE7n3HR&9%Wg9_Tg<|2K?_qIdvqJ1M6p_TCC-~DCYU3*U|fcX zBav?KO0d7mfmkx@)_3g*Pr{ohVLX^sJuW!wO1LJP_1369mD`yu&DtGt1?Ejd))=>_ zx%GFVbj$z&J+mhRr==-X3|u6N(IUxX-jgVWOi1ZZr266m%5NjiaJK9LwJ0lGheVky zp_wg>loqZHj^=8$L0?p>l`S|EO0#D|!U9gxHPNBw(BEp`f|GQe!uT1u5z}nZtowrO zH=lhnw`xK=JB97`znY~rYUD~m2)M0r_A@|@T5<|^e*prRRYRUN z&+c_#cUXdFIX@m0a0TxaZmwju-}2<6$wS6k_W?a|NS$8Fq*o3|6Z1)eRO0k8YYY6p~!hjI_)zze;(U=+}#SvygP}WIzP^z%6Ac3G?%R5O~ItzeDLA z0=9N^{6JuOP?=A!&=8Ztn)_-0g8OsSUq+)~#;*Iy^ZE1tPEae=xthBa6uw>Ntf?s( zuk6>Gl5xwsL;21gR@;n~YCaVce??XQV}fspWjy*o$eohc>1QZ0QNI}JC}`#8KeCdr zW?e3RW<&OzS)begDj0~^UxM!ge-8^6cjf$zYz^JEXcD;Iiv>U3&ahP2F2nSE?Y8YifRL_R6fBJ zbK9aH$H=c6q~?d`i+|ex=yLm$-)UOfcVpiNb;}1&|6$--Q_|a#Y=06(r5oDQt^Mha zhtq9a(oH^`!)_>@cR*)Q*`#l823I{b3oUPkRy^ISIH}?7lDu8%in{s7aGv}nsb$xV z&3|t)mXyNywWLhBc&&1&?nCdkUugN#)%Ek|=goR{?2UAqRF;(M}YG9Ci&9z*!d)~TQ-Il7}Ayx0VZoT1Lu0A?<`2IZ)4%Ywb&LdpwVZ*OJ zv2q)pGQ^1Yde7uMRAhax*i3GZ^H7=jy&Yz_FccrhU>iQBZN`v%Gni>;Xb?hMVkKc$ zz}~o!wlV*PD7LR98M=J)6WwkgU$wxt$8MVE<0G1Z;5GRB@U4D=j7PQ`7${_ zBu5h(XH57($Bgxqd{+cjPS!L1XYw)pa*=%hNDi$D(Stdgf6)}f4CD|lih0l=+)C!; z`nEzN!uO?X>v59*cG(jK%M%8SSwr(;*_FdfW~shs%^?RC(`{uQ+3%(6%1UH^Df|m< zuOGRiWFE!O$ki>Jzr*1L*w)fDW4EPzar+vF*R`Qf=tY;7$>E<28=5R_Yx@nj#q73m 
z3$8VLa_RKjU;KoFSGn2@zBT-`B(6QPa?S1Orq*;Ff7`L$QhBSSa-sb7`ZWvuh{Kop zb@!T$zUa!+PbiH71HRjn2A}1S0k`=beEab`&|@4jtPLBoAl6#SG>GtLL8LE&c*jA$ zhgjV$maX%rXvdFhKP_9A+OM^})5E;iw|qh&N)yRfY_>Ggw`F(Q zUA<<)8;~n;JuCT~Tv^o`#UZuRGxr+{@kMOdlb&wL(ep_W!nJTah@?6BrZi99t;`|`6hx9F$iJV-?wDt@J!7RGwZx$Y z2c=*$+eOKW@WY+>jR^cwXBd-SzL1!SrAioQn@t8$3A&d}{_Rhh(BkKMm32W9e);S& zN$%P0E%F^LwAs%%7)hn0$>P7D@5=i~EPh1~J;2r!+0kWOEQNfvi%81uOf?H(>3SnV zoPJc(FXyU4v`FZNT?oT&cfsYYbZ<4T+7F5(1Yq{H85;;W{rrurgJah+%hF%*=Tw4) z343Mpnyr+H+2c-;uZx@>avmb*5IGmfVT;{IK2j0M^w7Rr{*lN%B7R<&!+$PD?wZM9 zFx;_nhLWFgML*+QA93wJ=Q=;)d>?U5A8~a*=ei}X8~(rL+#hlL&p78tT-QCL&0xI8 z!TDt=XQ=r(H}DbHde3Mv81Ha!ZWrU);g8Jiw^}<=t&d5qk0m+&R!PfzjiPAY^6fou z@45cuO38tXFsCsX`WH-V9A1lC*lTGUaEGBE*A_b~=D)D)XuN1$v)T;S#nv?xXfvgZw!~_NYw#^^tadyw z9IyyhK@P}u@1k#X4cA+R!-C{{U64xOwQ6|{EUyfC<+Al%o0i+iax0L#PHr@Eo8%If z??-;6-om@QiZI;F-c{jUwf=6nWy%t)`6ZP&9<=6c{RakeW#O~QloF1l#^uylLPEBC zARLQ@&&K4O`$$SwC>6Bi$|8w)O1_*L568pjWCb}MDVz#NV&P;G501!KI2uQyG&vSl zWNBnv9#1G!$gm$vL@wah)*q)w$3!?48AHMwjYm@>vBWvF7STG*e%#u_>R%yoLy(1G zizEzNCCjkQBX|U$FP{B!}!AbxO|H9m6ik^|~>-J%fRD!1x z$&1o%**j{-RNIGrQmO34vmMW6crL|6)B7^njy!7}zKJ&#cvCJ{V7ief`Cqpi)4C2l z2uPK(ALT30S@CUFN2)?AygzOkY1WQYsz$G>r>w!cT-{JKel8||MTufghsMTF;_<-b z=qM)m$EAW$BI4>7ldMqBC043i9G{G(q7yN>N0GyltcVI;i)UFaF*=SH<8nL|PDK-O zagwTveDYaid;vn@sj+BMWXmK@D2ajEMl? 
z1PaTNK3S2m$@m2^8ppV0J`zRmw$|Lx7EO+fDzZE>8gCOvC*u*C z<)oNOh?0yI9FJm(Fj;Z!3w+FHW!9RA!A~Y-No)EbbxB(_-VxeM)b&2`%*)$*&Q7Id zF)E$blga3`eEMZ<$;ohRga&r{44=QF@{~A`h)3j!R8pV6@kBh4N?>kAqS8n*6;@Kc za$M>YPhn4AV8d0y@pD+xQ4uXhFUopPjtug(?O|&=!ux05!o=8A5|ba(7l^lr*~B2V zG4@Yl(y%-c%x-uhtRyDm5}zNy0aiT^x9FW#=U8}pN<0@<&H|xeiDNN927&U~Fy;<0 zDCUV3rRi=2QP;S_5nInaVR3k!%8%P8DZ_}&*GLoafoda@fa2>tcd*_D)0$b z@XVPr?3eFCY`tl@Pu#M3vq#WnS^$uNIAq`|S*DrqI-5x8`xU4u6^+FRa{$)=*^QWuw4Z?vK?XLAI389m zVE52q0LTmr%Zi3AUOsg4;K4y1WD0-9(*#;yY^~Nzf6=1`TL7V=?EHhVSOr~Y^9`bb zbz0IO{gGWQ>~?WF4^H{-eTh`Tu6#5!ZR-=il1FfPj3RasPLIYX34++Emj}Q+WKwU2 z2uB;G!ff%hgHOo=C_!Alg?}z{!WT_DT*d8$=wNG5>;cY>O^MsKa||kmFNULxkQBoy z5tyZ0OePABhdfH#%5lVYGaQj$;?pMJ`|B(@1HmoZi$PQa+($3`_CE2TqJY47Ivkf` zw0=F}iOCe5t3wmv2tNn)BgJfu3(iH2JfX9$aGAILbDa^k?5C6|8Z`(Yow}kk3Pe8P z8623r^NC1gav}=)pAM;!xO_?6OG__qV&^O7j2(T*!~!65>rQ0t*dgxIs1jDK3x_b@ zGd=snli?^)h#ZBp73dS&jwcA95|ih~#3W6=PT4UkkW(^ktkbk0y!VXW6^B6YrlxpT zdq5r#Nv_d@ec}m%-U8BBBWFRKCMIK`W{4;kyY$f$#iaF8d{(2=4Ro8&(MVV`;Pfo=nV+yFcX-?da5zG-aJ5@vzp(|%W<*@ttl^~uOwKH0-_gykOV)GcmCrtdX)pzmgaY)N;;qORLD+ND%%Q0OZHd4EjJyxAaU9Q8upaVqY{WTxJUlMv zoDp;Y+>Tx2XX(d@|B0zzeGUx&4PnMIBaBc!yJl>7(u4!xYw4XP7QmAgPntMDduOye zBb+zVdRu~kT%D87XA09uqa|HX0zn9#s8{S_PvEHwo{TowJCFWu#-{W}sU;?iq1;rh z{_cE0Pvw6lJNlDO?1YpR zKa$fyE?VF`KK4KeJ18-!MC1_?23?R9WSom8flwkSUUaqKX_dZn1AVz^Iet@s(8wNE z&Lwls3zz6ONnaJkUybyp#wU6wq7yx!dCtkHp39R@?%H|bNx3%`J=;4m1%Xq%_hKwI z-Yel#O7!d`SoU5?z8FP6@;rY+N%khk#zzuo&tr!I7bFxgexZq}oO=%l`m<7a-}LrJ zidRA-BTUjX652yTfmm`M61>(o(aj`{rs-JCdMk80Xj_1^vo8w2||GZ{-r4oae~k$ngU&ojN#_b0Ya{|5tO~p(Dcw z^-R~ZgCq2ut0+WiBRDa0{-PXqZ06P#p&BA2bLE9uRL`v=R)w}^&c{c>F}eatlc3Nh z$T_*>E$5KrvyOZY;D6rd#3G4we87U(tVq? 
zt!#GiBVY6V3c+8St=Ro>MfX2CZ2q$QZehc=Oviq;WB=__cRHS#v)`#`Ltg`pncARQ z8_d*()Y{NeduHolb?e~|YY(R@4y(0?A5>G_fZ!>A(|666^&QCiDl)!i)z_Tytyg{P z*<-8fYyC%;-Ba?Q4y9KF+lqw{y7=|$I229CZX9T6 zO1tzrki{1tp7dnbOR;xGZT4gwhf}toeY$2KM#3W!92SUQL(?6MTr7|$lF&0^l;!k5 zh#}zPL6<^UQ|ZDDqH-#v(EKSKblX6;PP+BtRZdym^lAGsUZY89;#%@l*k1te|=DFa$( zXk9coR1W+)>w-GQT%&RAQW_r7mO*^zB?TeB##1QoDO}Ojz&uyP?x#JBMY=L&w z*M7&hAi@pa51_Shi&1$Ca7dHzYhr_ z7S9`Vj|}ZHpDW2CS>LniM8l8(IBtaG7*udd1ej`HBW8~AZXp~8(NroXpSo$uIg`PI&D^KyzEE+~ZCJz07~0+Jq7aT5B@pEoynog8lnt z@02aMZ&#(uTb9cY0e<hJl9hwA zHKrz{>_$7Mls&kUstn!Fm5^MAPl!GjPvLd(Tu_5sQTlOnp_lnns!e3`Yr-$e{cm2q zc6C0w7`W9wdv&>df7-pDO#(Zv&4>S=G2AbVHvthB2Jt<@r#5}x0G~brqfXl z=--;leBVxEFJH0FSZ6Gp$uQg7)B|PXmueL|Lb5-)#cAx4E7rKryv?Hi7_3ZwIMbk! zuGon2dSq|CDX(5$a_DRY=8aAfW4D@r-Mm#cPs9!pC9vi~ZrPeFJnCP?+lq(Zo+pBS z%{61kf5Q2yHQ9KinQS1s!tCpoU+YVc{0{4bF-Q5n7=18@EfXfqd|r@i!fMj|(zeB0 zllHGT%-g!+oDnWt%xVChF;@|5(Q2Elk^!ZU=o8?@V6A5yGf*4@FFU9IphTB@^y`Av zu&$#!qp=5PASydUY{nT)vs z%W9~yIlZF`!?X@3IlVL8FoedSAWo*lv$A-O$w7#O0)E~6OFG1+k9`3VUbmQ1rl3oX zF>$LdyXp!Ore^w$$q6q03zl;7YCT%!gcPH7iLT0d3nYiiek#yTH^v*;Fj{uBF~Uq~ z#S2hsL>6-%sX`O#fUOzmHJC=3FQQ-RtF?KQajsQaq)rQi&NW)AOL5#dDGX zfMydHKRbQ;3{tJ>4UZ-~3d{W&T1hTr9vN3vQMH!lE3cdj|Mumydq=u%$Fh6J>}61a zbM_m)>ps;DF6&-HOQxY$ZRpjx8nxkY+8xMx8|Ed|+qQ5j)6u7P^nK{^=PSj7#huW`jKi0zq{ZEDN5G@eR?rk=%=+PG~=%Ir9% z;$P#jw7V|rZJbYKTDGe#+duSfzw2*Wv0B#kt=epr#2uBEQ7fAnL$~4-Dr+b}K6~xi`SxXB3rH^y=;5N`RX=FH*{rrad8_mG;7^aee@yLrF719E_4XaG zWcEIh8PO)S7A9qBdEYp#0Z)Ve|^k!fA?n#iS*uDJ@@BMfeGs+i*L$BYUic4(t>ri{~Y#;Koy zPY73BGcKHAj#2BV^;^~(7G~82!Jp3cmkJ8IhU7EwNO3oUUIBH_XS3Hj=sd|zq9KtPhar{z}*D*}D zKL)!Pnuvcu6UqOG+usZS1QOj}H$RwZ+@j*2f6J^3+Y%ri*r*0JW&$BK5L&!+tM>Nx zpYC~okGktbI`G_VX%>6!^tIFTd(**Pi`##&@8-UAL*H`g?pZ5wLsza{nLo|5Zw>zB z*pH8;n~p4(KQn7b%hh$6stsz@hD_B)wQA$ylS`+5^vVxkQ8)G9KKwK3{p0DXp;^x_ zd^K}ZcYLi2FWvJu%%5FopC3;9J3sVwK5z)7ZCn)bzZrk=^#hv+Dl8vt^$gf;AMExZ z{TFt}K&k65JT9bjC3)kLoHNeYfC$KG<04E=&#V6v+vSTA*(n1o&l@5IqeUi$FavFX zvk7UcGjA5!U?v*^ThjU(k-a9H?}}Bjvu$Tm4>f%GKIy58!kk6v1h88#U<^pNnwK;0 
z3t77{!m3+JCoXfSN_pFZ0bA>Q?Ho_U{E z_Jek8vHf5&U#Mk_HxGTJvr+avW~EH(OoJr{xnD4oU<^!HDf2ICwT*Vo$9Tm( zv-%OIS&7o&b!3zq1L+Ewybx#-r!PQW!X6Z^FeCmKvjr7$lghqWXCFa&&xhyr`LidCIem)2(JAZN_Qw&n83D&l6yf&Rplc z*zDme1uepExj;Y9b@$Me!TzTx9}7si$C-#+IRrrGtxL$%*j#2vMPvB2Ti02`jc8uk zL^mdKZ>JPv61yn%BHf7n%gf?(4rb-55Eck}I9c`;D)Cj^a+O6RIm-+Pb6%P%Nt2i| z67CdLpf9Y^<}uTPBsI;}if)=Hm6~9z;A50)dYNCLesT~;D%ikwL1zOGq+1RwyARA> zz3VO`-4s}?(%no=Q>2%=?{Roapyz5?pJ{$VZGIx{Zn)=NH@hzjDy({=TD@`hP`0La z)|)N!XUaO%vW~^7A2i);`mk)rt=&JXS>AmjTT_2y{QCI9V5WVyihnhGW)ElARo`g2 z-m*}8XI)V3Ij|VL75KQaX#kZe*>)FNj%*K8d z{~8Zuo7*zY+tudnpV}-wgb~Fzh3iXDsQ!k$L-_KxnDz z&X$AOs+NVFi~aBHR;xnSj}kRn|DaRwZ%nTXu82al_@EUJkl2@37EXuCr{m7<6Zb=! zE>N5ATE?OlPr28LDfbIdnR#1PZ)*Xg%?h`D=xd{S*{$NAZ}&&OHkK`Z=o4v{cd7X2 z+x3x8WGv@)wP`z8&U@abbSvYgD+B4e?sVv><;woFum2a7fx8ve>6##j?cFzbFIPPI zpQ>;DM*6_1}L~+n9_r1Wi#DK zuchpy*A7N`bHU6Y%ScVawp%$)C5Q&+dy}YX<$1ah&8WOeH#!};guIbbMBuR-i4T+i zfZOZBr`A&zhkFGWrcun+Hh$^}IZE!;H?27EizP1i&kbHXzv7}Nx3I2ymEM1L#$tDr ztomGz*3bNQNB4@)>Nsp!uwlONyXaVDzbkI5<0;ER2jxCxSsb9;rz|V(kYnpY<%)ox z#p=c6(!k>7n=PyKxN^oKIIC8yrH-zJ^`8m&S+(gI*uBkdpSq4%9A)>a8s{Smed(sq zl0DtDU9Eaz#X3YcuD~9ft_~+JNYr5~Dw|p9C1|s<1xO56;m1Z;VNh8}N z3wf)^4*0M*Wfy!{xW9^BvK4r`xKYk7dr*#iWgG=>N~WAaSI&Ea*<%dJiX`V87sIhh znE^|Y%mOq8XQN19TBxe|x+Q3tZbD_)*VFk1(IB_)ptFquMF_g6N{9)M=#DP3#O(ic z-OG8jSfqRLK*7gbo31YCTgb>hjNYsgVfLDEAFJP%cDHhIi=E`1U!vdbgOCR5J!hRQ zR3Y(kgp?S*q<3~=ZD+mk!_k|Vt}X0t!HZ+)u#!Mk*3IKVuX2VS!gM=JH#^;k7+22G zjdq+eO1C)O*oILkwPvqGDUWW4@PXt>BwiP?j>;9Q)zLPeB$#enm|PrMTE8@TE0XT% zS33`UMwuGOA_V{>Xr_Wt^W$eRnJ@7v<#U*%j}*bAPT8*>!<#%zN*4HP*$CF)-35;> zf=b4(00+o!Ji84TwZk{d1Fx?V@&f~8I`QnCas}Nv-?Pbcpua8*5lEUVvHuSHd&Lnyn5x$E&8#J7EGdH%HdDU6XPAUsv3&6n7xXkg zA_ur($BcEk6MQ` zIX;ohRXly<@Y4r}PK`Ww^2DhV11FB@f?tBnoD0=PU?G(AP`ixa%2|?}@hInq3pO_Q zdCXNBrwIAn^2~?j!0bGjt|{7BM#a;#vp&L?UlX$anpxLfZ}pt~9r^9pTd~E#O!uJL zJ(zAfc*lF_ZlH1gg>QQ^fo?U>J$ocu6}WNi`msz^r&`sy=vb}_%^v)?vwOLw=bg!g z)_;?n-|~AG7ba&9r)zrdRtM&5fA^JZhkj92Pjwcm|D-0fvznS-)(NE9%X$E-y7w;1C07wO2D^(Hmaze8%;NF`IBwwC!bl~ 
zb~IgnH0?h6zdoxKYC}od*Y7v^2X@<)7mzGE<_WvAJ->tP`TTx2Q!vaJu=xt=U=P_K zW3|lKyucMukpNax24*mm*qTYTE`U2!4xNKkRI)(bU>_7ti)hTQDnu5YJ<2-Me zA)wFNHRI8t7g8}QP5a*UMw^`^hhG$j7zz)lz@0;MR+&ne{y+ELM8ao^0Uzpp0WL63 zV12#>fz5aA=d80KvF0;UAX9^y?Awr>e%GeYJIGi&Kd-0jUVtM4b0UVIi*HkrHn7+^ zUD`=v5`B9HeabcTDCdYJAk!*9CTA$u8J>WZv7~&BJ!q#Zp&8{hx)F7kD|wNqZB7lB z@MD!gTlps4NI;>yLB-0AEiN%5s+lLz)q?Ng_o^Ai*yt zmQaX>oDLLWj{+(+LHJ`gZj7wn(Ea^a-g#vyxZJTvb+^rW=61|KnXO;H@Z!SZg{RZ? zJ+tNayj9_9e9H+$rN)VIz#vrXNLFWsuU{i3?-Xu9dx zTuHW~@t5Xx%j+cWiB)%vZO`rT^%?hi|M|G$3kM(_3BjK53ucP&;f`+M*;AOD&cU;9qS z+o88YAC`9A^#^WrU+@QQ$+rGfkO!^tRWxrnwWOiI`4i@4=! z`xgY6A~slvcw;FmOF59%q9+j5&+=Tdo2Y&~+u1Yde2VdaXD2;RIly<;z$97Ym$)au z)66?6R}Mj^;UfT+GUtiIYJ!5(FKue*IFlTq$|vD)4JH zc&^r-YryM4y-r`kn_9f7Wp4s<0Hq7xI;A?it&{5Uy$1TG96;2MM?P2e3H+zO`ds50g?JHEg?|FVp@gttKSbx`DEP>*92&CM z98<*P6#Qn!VGeL6N+3SeS;1L@XMaVx8boLr74ch~Z$ssF`R84Z%Cq=r&B4rZf!9Z@ev!&oXK5(fn&IRqz1^1sxbY8m7l)PMv|wGRUXgCNZNC_xur z91Rq(>Z6mfnAX3O#xf6R%Z%m<0Ez|IfM zAw0Y<<76dp!di0RB^_S^X7l9F1%XGmcpgTLTGI-}<G!8|-2p%?3{lzS4vvGn0X#4MXnA{Gg3qq77nD)?7*x^F;bRvm~>5t0`Fd9mm=x_S~0E7$H0hj z;C#&-f}K39P98nc_q74@(dB!=BS#dQS>*ZVJ~Evs_zcC)AxjF@KF?3)+fqm^Zj0;7c$V)j1= zgeOTn0i%MAg4oNfv9on>w}q>2!v=Z`d|cD|yW_}!(=N|ocF#ezE|{s?rq*qvviIuA ztrxt0`J4NGd*8iXyWyOBT=UVr*SlqQ@D4WzE+3$JZ%PL76B7;E{>hpbNIiRxmlX={h>Ax&JEPY6gay4ssCCGx(tgcqX$d1y(*R z_!-eMq2mxxx1d!H#7uop)1|yew?CrWM!GQrn_i@n)C+FY!kt*xnzIxGAM<1VdxAVV zZ(w%Jp@x}1G{UE;D>A6_4(gC zcI{ZY4$j{PSG-IZLRBRkOC`@T~I973ZBjtoC%snEP z`LHzzsr=z+$%~5(H4#@(SV^SZ0Dsn^(nfEL(!G!S7KE2j37xJdk#K=3#!z@b$n3%x zAoS&8W_X2BibTlHHyZ8qbwlJVSqd!V4H)~j@LfZ71VO0v_;j#P21=95C7|rIC?<#D zX!D1dQRQ{IjneJgbo-ZdBaFvHD!SnnEtaxGw=TNsl{2%e zLiEy!sS$E#Wqi`~rpMyK;=-R2Tz(7R07k431}sM{5NfuQ2ykKW`zPKxu{5T(?}N7{ zsZFHsKu^+^-LP@?$lUbYg}E3)0o*OEowt8Sx>LHIY@5p8DqrkU16v{LY;Kz?x!c&X zQ1a~)bDr$}BXgel;6ijM2=kwG#U7G#zNA)cxbFGX1uL!lB}{Jk^na`Rne^B7Kl7!V zURd_OnD)KM!FdS%E7EM(k-@<@S;`&5ltX9^p=TM&$k~WY)|`r||3$_e)zLVjZjf91L1!|AP$T{zP^ z4&eVJp(o%KSi80Ksx7_TQ)%hIEz5FC->v7>mVM+x6j0rPc|Ux1*qoYA=L{P9a?`0} 
zWcZ?*;tMl;KZ9_$H1&F2m=9PbD=-3K^v6y&Clf;2OxrlZ9z;{j#tLB|x+zVYC+E{h z3dVRO30QQ3Ay+W_S|M7~FFgANbv!ityU;dv54Ys&RBJmkwVT!2%}aI5wYz71h}4m( zEy!(zi`ka`<=O+Yp6jjGe0(wJK+!KlYxfJ}e5u7SbqOU7!%SaCuEqLJ;QIB`uvufv zXmZrgFexU}-kTiiGdAX-U|Ns8(WSWh81Xw}-7CD&2SiMZ-gyX?q2gp5`wJ{6!(vGj z+uP7DiQ1un6O1H$6%V;;P3|zFyS%Vqs+>$^2RJ**=@i#&G;$s$H-m~OnRDyOoR=t7 zpis&vHrwEquOqy3(_N3OiYQ@G+J*dhcDrduZiuO3o97CDSNC2z|ERb4zT@>KAYndJ}II#hWO)EyZ zjl4!e)JD zW4_In={?3s4{Ij`AC1n{LUO@oW)v}^A<}^Gt$^pktT4d*5x%7x577)MCveO83N5hL zfMW>`7Jc429}A87g6nDGHlt-WajhK`kzz5airZ9o)2wT*k@<7BL8G~_=Z<@Gw!CKE zS1_vz0ZOEBVrZwcsr@H3{5W7z%JW1XFp&lNWow~+853?tTWhZKt~D8WoH2;8S5Skj z6RtSOt=&xOGQ&eEIO&jKZjPpM6&9|cv7mpp85jCTemf>O4sRUaHFW0`B@N|F@S3R} z-ylmFpt-M5#5A+I)h6RMqP=MzW+hj=kEmVyaBa!3AebqUT&QP)qglWN+68Ma$hY7g z3bMK6T}@sEdqPB^mt6X}!I()Sl3l)fk>!Sa#xwc|;$VhRSqv35=4~(BHJfz>R&IaKW03b`@Rhz1bX_FqSh9t|q0WayQ~i*?>2!$=_BLhL%OG@vd*Bm|ul4>qYG zI-@Hjl3^;MJ`sNo0WL?}h?xGyLa~gW%f+dMeqB(+Jwl%F{1pYG*qsytG1v<1JjP9DBC$Lc0RsB zXgV;Y3l5Dgu?6p%Zsp=os3R{&;B-TAuP^Jp){3~aB3cNpa(leZF|b`oC20@RQtKD@1+oDSABx3ReC(sdCOWsig~J!;R{$^kJKMAEU77Zs zYWvRg`dznbS8SGb2P_349efVY2@5N;U2Wf_|C4rDj=W;X3sH|@@B8dNt8-hO_0)8W}8>6*@YJ8{x! zUl$`3N&1tmXqX>fY`N8S`v?pJ;o9O5%C;he0|k39&TR}u>|FGvo1Z{Hhq7yBb4i3n z%Wep2;T>YCdws#t{!jN?A(s3MQS2}R3Uo)Zs5`#qdo*IR(frOUD=sShs|P-OJYBlw zS4onZy}ze&MV%Sokh0K4?6L1sD^kMS~Jc^eKgoSR*LAlHlFK@!^Bs%lAmg8PrI+7 ztajcxG+V&xL6UtjMGg<-(HE7(&YY!hx~&sByd>OC;YkZ){OBlxWm8CDpxS60M?+K! 
zLF&bj5yL)te8n6&7{2;r^j`Thx)F5g+5jSp^Oz;)1J*Gtf|VP1`H))RPm%dK7zPCL ztAt@bMp4L5O4pg})Z^HxjrYnM=KHUGZK3Sp-Qrytr$re<`-~D$vBBWhd^{ zJAoK#{fPCl&X28GS$kvC^-URnQ1u5FgG+-yI`P93>Fo#88xJk}55r;(cD+@Za#1Z8 z)8*aFF>%9k^#+(7XR1PQ(_5TM?|SCe_Mhzk@&0u0(K}Vg9@vDM&UEz#G#zNXarJku zrYkz;u0B{#h+qS=zI#y4OZ@6VT|VbPQCuiWCq;finvK_mLfO@4!Cz_8aJzro5|S}d~7x^zuTd<>{xEu36W|h0C{Q` zO9a(G@V?hckw@Wb$@pF7aA>}6AK){EE@@`Clw?DBT0W08&DZT;BV=BIsQpW<6r=FV z(4jRKzGH&3-m@00tL^x-H~_kf8Lnuw7R+MsH;nj{Qjp@J1!c(P9ejvc007%;r1U1b z(YKTZy8Qv&@{61=ZsjhV)JT!6!q0;wLp+2aiTumcy^me-hbw%IR{tw#8J5Iu&7U@1 z{p78hJMO3M*7PpHXW$ui%QNYkqqE+-<(smNZJEZcYU9>yO=G5JgIcp88-S3#nFpvb z5^EviX?&pQq@_K=b~16pp9c{nnw6~e6hDLfy{H~8CKxf>-|W=I`AeLl>0T!(BgKJW z!Qj4O5Xpm@t&k?OGyVh>XnoS=vpLfbCYA*oLuGiHc|jAMhTPrUZIIqevDhrrzxBg;VsRW5wr0Rs<*>S@Pn*Qsv^)?<7{~aiwC5 zqhk(c?f6;P#(ox`SgKr#ECp`vU!~lYQA@R>avtySvxr9UvlLmSpU*;W_<>gmPXCJ4 z>wwR}h7}ur?|C{ubKsYJU-_GrdY5A_jQ~GO8`;n8s{8cB!HSjyU`0k0Ia~N_L?Il( zTy=Bh`9E`(4ACpv8qB9XS3*%lV9bZm_{{M==VeaGWStJ}Gzq1cj#B|c4*yIKJ(sTu z;?JWRHmu1F<*j98#Y^A&9#wY2ND${SgTh=X{<_UDPW)Sb36h}0j#)zQ?yth%D0sLTv-2e zA@qsR@rls-iO}$g5d1{g`H4{TiO}-5g75Ey^?&23n)~Xqr};JK1LwHiQue7p_xoo& z9!uNY`uhTY?gw^SD(80J7x1%k5pKNR2Udrrbw$7pOK0`2(Bp%Ot2WEt&jh-&9{yj3 CBu)JQ literal 0 HcmV?d00001 diff --git a/distributed/device_communicators/__pycache__/symm_mem.cpython-312.pyc b/distributed/device_communicators/__pycache__/symm_mem.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa0edef78952504642ea621646c15e60f878a713 GIT binary patch literal 6798 zcmcf`TWlN0aqoCX9eJcEQKD>1ltjt0Ejl)3RjyOZQW7;nghi<*+=u$ zy`vp5bn5{AklGZMkPpddiYRS?*l~cQUroQ-^rt}cBSr#998}ck<0JiIVjB(8k9PJ> z%cEtsPW#aXwL7=7GdnvwJ3F)b*z0v8C`s;zYD)m2Ka&qyxJ}^By8tX98Ocl%MXi=e zu~7#@U^dC5xF~1BjwGLQMx7?iB^y#gR4`#a=}NhyZWDGU#gr%NVUWw>b)#_&9gGjT=Jp2C@27QQ=Ay*M=`O^r`UW0RB8$?>CSo*b8^#$J$49gB{i zf&rqMR&^S-{W1DUmE&bLz3V=Xt#oXaW8gn?EY$GQs}@zyMiM(H zZ@aar8B{~vZjYc4Xt}!D0t;WRDu(QUkACSr6iW0`$ZiWL(Pe*8i0T*kRCQLfcf090 zyQO<~nJ@5H9=3lRr$D^#6gWWs>iLvC 
z@Kt53KSe=$DzI?Bmz{-kv=ix%(Q$A|hc7qO*v%Nl*sbi+?UU6&Wua?h(2Ec%TyFSY z!|Y~ziK$-p-V>+ukJh=$LyC^ocnB_z44w_CS}2{-L$Ql7H5r>uDv>-N&&jdj z{Bw2bts5v5k7Z-iYEsor2D$WwbmmfeUnpL4yVc=*wl2$~=A~B4a+f8Omdj={SXbnS zGJdu$0*R+*bwvxanD~5ym-X4K zlD}S;i}t`CKzyafl4?GttC@6Y2DU(#wb7a)#Nh+jd)DcLz%3CO(C^V3@i8RG%^wHGXz=H zh#AR-pDa@XE0e0Wg-Il0G(hM`Ow4dvA*IoLzEOhZ>iLw z>Db^*LN_c=+Kh#S>6v6ErVk#%#3f-8YZx2|p&O0oVALgI17>VwxNG15!C4$6+m?;S zm?l94rNJ&pWV+@DhF}u083NsTgHzKp8KcpZQ=%(2cqNt9XAMEsl@zf*!#SOsnNhIe zhUi#U@)t8X&2XDRos^EH6vJ~emnN~?IK~-lcs3OxWr!xcRrO0*9b*zVU=o~#1%uO+ zwDfv9z323`WsN*cRF>U!EDzQ`v#6n%%F*FeqgIE4fDxf8)ZV z%0N<`9>~t>=QHU666_BUePVSw2WN4Bs-)(qYXfxUku0oQf}8*n1}Pt`V_p%8B~g#| z9WcWysM3UXbk1?Fd;jX|DSP~X@pYGcdy2k2>%Q=Y$A9g_{E5|$)9aoyfZGrQ3o~!V z*TvqAK>N~5i!ZH=-`IU)`)cpk)&fKG6C2)b%g&!W*1dfcy?ZSXo}1VZyw}|G?vl`1 z6grn@)`eXJ|MKF?Fsr-V7KBDD2inQkMtfJOeSfii{|)*5#LdK7`(fzyZ3K3e0!NC0 zBWr=tId@qMtcsnRvQ=27dGM~o;ccu4$lFrxd8E`cTlvBj)`hOJ=qrgmMX_f+ zJW&)UYN>8FwU(N87n^q9ZQy|1O~^&xwa4clU+;bVH(cq7=Zo;`JzEsdZXsLpE&6Pt z+hTB4>@NrR&mDbzii|uyf4toEbh)W@$-C&i$AfHl-KcZ;!@kn+vqks?pCdTn@tK3| zY$Q|_XQQiSqrJb>9x1j*HYrzH-)Volz0`BC2*0L-BMR#%Z#Ru$L+o7_`^v%4+|eIRRUF8(>x;WRs44h4Y5+-rpQpR%>8`MB^MSig z)ZDr-_J;GWi-)CpK#%{Z_+Z$V8i|j7H~3BGGo$GDqpsrvj*p$)$M-ru-pd0_yi7Ju zTp=waRxxv>-vY2iA{~mUX)!a)6qrOU-6j^s*Xh0RC>&XfrkY>b>}bpY9EpG2QYbbL zl$Y58E3uFMu4UP?$S0fl1bF`ui}CTL?@%rNO6jDJ5xml%$d( zDLe5~co$&84t#)&BEcWHuyoc$X+$gFIWUPSjTY^ECI^l+rzushm)}$GxYrq&0}k5r zP+dX9+DCsdT5j2178|b}ofbDz*&G z31zWmAvQm<45kz$<9yqe``+4reLuL2RbkH^;Edog!jZ}8cobm6i4{GE)76b8uPDTW znj1XQta`tRj6JHEhE_GY9eTp(`{}?^%82J zsV31jja-$EN(t{i>}3d46V=4i9{rlPe*a}hdKX}-iLgxpz4gQVYRYYoHPb5ub35?2 z?I(r0N%eM{ib9DF3fV2pPWEU^dAR=I!JvLJQzv<#sR@J9(f>hea)oUv|6i#KY+Y;l zA2nCc#$bltElY$?YHze<;B1)A=F@*dnJ^q$E*19UBQO+`ml4Bl21HU)y`W(7M1Vh+Tid~3CjevCaTPG)vTJMk2)S1cC$KZ7m_Ys^Ulhq7?h8spJeFKwX z$)to8IS21zRzO5DYNJ)Zl###%>9u%+*UrbXN|>|#oaHAx#GO%&8adx2RI)st+nylb zRnO7c`0oRuHU{nlA{!@yE8YxdZaCI``^qibz|qYe+3;^G`Fo50-j$&>|DL%g%Yx^c zI4>?tEyS19MfHsft3q!{=r0QWH(K7`eslW=y|;qv!XaP>ccaIQD7uPb*Q&Vpc1L%q 
zBU0>$%uho2(HtO``pUu5&e7t|(GTU)W2cMo+j(ZS=?q+<-W{I_$g_RJ(|qmN{IQa! ztLW(}d4`IfA#&l1i{kRsa{Mjzy87b_t4(`vjDK+Om&3mpE)6|h9D4emgN3UVA{_sJ zPkWcpg77ZSHP^grp>OGt#YdJ?D~@+uZ@YdbuKM?t{DVdR;0HT@8U96h%|Eg(jFx@= zIX5QBeV9FQBHT#AN)jqdlHr!5R7TDv3G9)i@8)7j3xmgDI3^w%6Muw>S269+${@Ju zOeTrH4xNVcw35~`*s{<`=(&l^xy-%B#6O!AVW7PS)m8K--}Z8|e_?FF`5Eu%bgptc zDy?YGBjx^mpLtI*e9vuPM};Q`xXcw@!Mi++wiZIx5Dr+Uuh~pbA~jLGA=Z50Zn9pYHscTp_iG16>V^9e z+t#I=t|m1rD-um&E2+aI&@uL!J+N6yQc_+~;yDl`l8ntliWS-N-vcwHF&a$*`riZ1 zN*H%OAb!LQI?Q(rl3~I-hSB^$7v4jbMeZs?JeDFa31)VMr${&XkASt$CkaIC%sS`7 zh-F)k;g_I~!~~iQRmH(D%pCzS-anwwAJE==9LuoxoQUbW$8JLVDjd&r|B-uQm3!j$ zj?PuC4Gx<-u<*i~tLuvM@6IunY55$H`iqJe2>>%pcX>zGCtMrh-uU)h#{~3| N&@kqAeALW9{V(Sf%_sl> literal 0 HcmV?d00001 diff --git a/distributed/device_communicators/__pycache__/tpu_communicator.cpython-312.pyc b/distributed/device_communicators/__pycache__/tpu_communicator.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b0df31ff1fe2a8b1beaed90dc8f86247fc7e8627 GIT binary patch literal 4209 zcmai1Z)_9E6`!@&_WED`mc#**7($XWNgVzv^zO(({y-8SAOfd$RkU)vlVqLUwQqKv zBatI<71AZWB50)wkSdXWDiwU7pL+e&_ERfWs_Z6GqG_UDb?||2L2&A+U)ndj-q?+D zFqUWDym@ct&D%G>H}ltUIDnw2{1?id8ic-P9jACo#A}^H=suE>%&ADUTTbMl?^0cw zTXb`b=T>>mBYG^FSG}4~^jWk=6*Rx-w`i{#(1KzRXrC;oAuTM1Ipjjv^Q5#?I_fN5 zZqvPS`Ii;)~fzA%(1gLmXHo> zzk9?zR@n2 zcOJ=Aw~<_p!%y7ys$we+5Dra+ z(#Y7kp{qk<14A#F<6xv=5YH+pJdn|}oUWvjW`^`94NTo6IX8dO%;qFTpTw|g3di(} z8Jo=Hbh#^*GZcL)Hhb~nr4A!@Et{*Y(iqSZTGHb(s^aJA(W>XofeLPK_2;s)XKe3HYGp44#4IiqX?Uih=fdcEvI=oj35 zWHPQ*mO^=S2W0bH-j#Rf`6s;1=TIJ{OLR%pY8E`eXY=hS3tm%jSmd&~?eTQj;g(yO zE1x#A+Tri!ZRgA__$uV;9F20jEhpp!*)=nW5OL*&1z+Bq_szqe7yNlo-aij^F9h@( z@378YVcqwfHPQ_ZRc_NwW!j;(EeYm>GS9HcaE+oLbLQJ;+UMw(+wGYuY&Z8i<9Ux` zrMZoQ>6kNGZe`Dh(5cYdM!I+Cc`dzGXQuQahbp)EK;HiZyvquQ%xzqK8+w9Vr>%jI zv%@@+w!J%t;=Zx@Lpy>$#_&r?stg}upDN~b?CRMc61Mbw>pMj+T_G$qY3;zp%hv}j zjJtMr{DL$xNNXA3eLONg(tmMC8WhQQDw;hi>Qx5Cm2M%Ho{UK05L;Y{qbQ%xmnrw!`5Nod60V|g=DhRBMZR7r^GdP^7_ro4gGNx}e;)Ut{lLQ>Bv 
zb_mE`sHmS=W$)3lxV*n9TBBxC;KK4kG+=Wqr&^lR8N)QFUy@R4(l8{+U=EB`lw1F~ z(PL`ao~)AXHj@P0*nKnC`~6chy|_nJCVH}S=5$8yVX3c2R=^v|M9#!=PYEPb6$#Af zVc(;YlgQ4I2rM`s*%43)Cl1#&Rx&Y2fP6 z;PrtaY5e*nTf{{eK#>zr8O)qmj2vT(Q&7$C-5DI^nSR6+TA+gH@S>W!y9-MTzr6bB z#={$r&wY8~4rn74l>j&D_V;Az$q-(T0R*D;y%i|WX{SSzEQYMJ4gw6 zox@9xOjDZ2CyN|+cjl^IR-BU?(8j}P# z>U}UE?jpU=*q*Y7;_k9zEE$8D3&+!T&uQrF?jBn`3FAf^RJYK2b=|#*pUf46!NT6b z6=863Zlk)Ex$D8lkLy>fy9+|M<)pZKY>dJUl(1A34>Lq&R~ta&fFx-dIS0@u(U2s4 zl1r-g2w^r+kABv|bza1G|>GxpBd z4TZ3ZS#K;t_3dr6C7_PZ^{yl9`}V&Ip75Ods=j5*11+4a+J^g~pM@Tn#hTVFA7crq zv32=cu_3^XKTFqW@;3HzZzs%1(hn7_wY-q1KtkI)K?aZs`)xp2*ioBfCojSd zC#|w%oacc-*o}|Y{Xa7S5K7<@jc8;7-UHGuFiY4SomOX)1VRj}r3VR(R@fxrn>dvN zQ>BSyY6j~vVRs?&5v!uCVpkqO6N6`VF!_JcA`Fd}yD9J|aSwg{|=vsx3Fiajyl$ z1-?e1O?3DL>e)oCo2YXW9eII1eC0)+=nB7ojqfh<-D~`4kstjFF7o4BZWrge_VaD{3<`Z?rSdi8rOWyMPKu(FZK^1@=t-C+D)N- xQ|S7}j$P{{WFI6VdL`xBy*s&a2@lWkIO<=1*oS-h_^g{8D7A5{= z?iqP#t5;za;QyugneQeQ(l1W?v5Wk}o_xvV43 zb4WWfTvmt+jOH_9R*FkN3#yoLX63ld5eLyE&2@vbw8h;H@;p(cD@1i_@(sSl^~`fo z`5T-}Mmf{_63r(yU4NeDiv^f_(z&#uWbzj-X!ILY1iapp+FUxRoycdi#aud>F!J<= z30+HC3a#GLnymj8HvIygK_hWaC2@y~;Gf!~Iy7G8HDQKDVe3``Pg` z7fFf(?|pG6_%F-^?%#v6ltY!T2yqwa-=VtN%cyS6t$N`103DXal&pHMh;fhF`#?QD z)S+@IN$mhhFWBr;{SUAicmTgs^MM~7?N{4#Q;wbZq_WxtR{amPYH4pDDM{^y{eeYt ziHmohCDEYiKUXMjJ<>_jl`IsM3z!I|q_QJ7z1G;`df@oqENV(Fk=2rR&f-Bx@WcFC z-w&t%Yi@-Y$YpIwDwDTCw#=0sWxg!j5G+1N%4DWRw?x@M=YfyMZWumld0la#UtzodWPvuo1BW0ZnBGy(}1nD+eXxAI)b=j_L z@p6aF+ZovNGLz7CMbQB}NQAY^|9(AY zWDBuEy6}XNpcgdbiA%-tXAaJeYq3mvDpr^`rt`VjTqcu^scGGy>8TK*Qg?>L0B82dfu9^N)QQ=w0bp?r1u=9nW)XJ;BwU;d;;TJ*Uvw)pV2L*frsc z!0!9E+zA}~YUt3d{huEBPRO8whoys2Oa)^(J;Oum%8SN%)=id0os z_l(!~jNg_IHvBsleKdrc2B9-~73}kBI-A^zHXDd_5=^ha!_90wJZx&U8?Dhp%6!ss z$uZjxO1mE}I}7^saE{~1i-Li z%a++3KNf8r?CsFAxnfgg$1EcAb7gU+&*p88DqrU9WLywI26Jp%D+?-|W}gp^A9363^)qlL4;wZ^D(vgCUqH*<#LY1@CC*S=^<)^i-p}V4^b9kK>!X1lenjIt%uF0V$Pr{Fa zPJaj{48%-cr^)2j632qZ`TGBe+IS_}3VpB@8{l?JD_%kXwE>g&0iQ;oi}IA^lWU6R 
z6KXO6uS$=_#_Fch}}0U2N+_tP*b?c9F$8!BGF4MFHE;Ow$gKX;sy7if$wfO^2+7oP%_RyF}X1&7QQb=o;KYb4g7pBq$`jmbnkwH{z25 zQvmH`2FC5}vg1VpVB2=B9iUp*S36?op>!9x1pxN4Bjq?tKH@NXh)AXk@4G_6NKQuu ztC|h~m%`PR(1XzFcoJALR>Tg&?ECi#Tg7SIa0)u87cU9)SXYLcKUR}t&^LxgR)-GP zhYqg}ovaU?Y%=qaHpC|=QKz$mP;%?EHwA!(zCFo)VDpCT6@pIqK--#kbFNd!O&`O zv>qH?4eqZ8_d{T612G5-7CMWFU0hLqa?(W45Q@L~540#}!7_I$+J*OA{M1nt)2%4k zyjsj4?NyXFi;0Z2LeXg&#SQ@+s@0(EmS{?^FvMrq8IuPvrca?1)`%%g*0AA*(du8^iohDI8rKWGeu?|OQ~ ziM8%sO%X)7MY0 z6Ch#b$T99N9}>fj-hq|Oa;C}X=%_W0A7bO0tK({DDO3^auF$#&!gnV)@iGq7=b-BgSwZ70wdAZyafd*fLVK#H(OeVVqnpqB)i+b5qYp|tsEp!hO2~#{-JqBHq=QwWNO*r2j61hY6eoY3yCKF$|daj-O-1W$1>7Mjsjtkd%N53UV HFopjGyjuQ0 literal 0 HcmV?d00001 diff --git a/distributed/device_communicators/all2all.py b/distributed/device_communicators/all2all.py new file mode 100644 index 0000000..9ca1139 --- /dev/null +++ b/distributed/device_communicators/all2all.py @@ -0,0 +1,490 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any + +import torch +import torch.distributed as dist + +import vllm.envs as envs +from vllm.distributed import get_dp_group, get_ep_group +from vllm.forward_context import get_forward_context +from vllm.logger import init_logger +from vllm.utils.flashinfer import has_flashinfer_all2all +from vllm.utils.import_utils import has_deep_ep, has_pplx + +from .base_device_communicator import All2AllManagerBase, Cache + +if has_flashinfer_all2all(): + from flashinfer.comm import Mapping # type: ignore[import-not-found] + from flashinfer.comm.mnnvl import MnnvlConfig # type: ignore[import-not-found] + from flashinfer.comm.trtllm_alltoall import ( + MnnvlMoe, # type: ignore[import-not-found] + ) + +logger = init_logger(__name__) + + +class NaiveAll2AllManager(All2AllManagerBase): + """ + A naive implementation of all2all communication. + It uses all-reduce under the hood, which is not + efficient at all. The main purpose is for testing and + debugging. 
+ """ + + def __init__(self, cpu_group): + super().__init__(cpu_group) + + def naive_multicast( + self, + x: torch.Tensor, + cu_tokens_across_sp_cpu: torch.Tensor, + is_sequence_parallel: bool, + ) -> torch.Tensor: + assert len(x.shape) == 2 + buffer = torch.empty( + (cu_tokens_across_sp_cpu[-1], x.size(1)), device=x.device, dtype=x.dtype + ) + + rank = self.rank if is_sequence_parallel else self.dp_rank + world_size = self.world_size if is_sequence_parallel else self.dp_world_size + + start = 0 if rank == 0 else cu_tokens_across_sp_cpu[rank - 1] + end = cu_tokens_across_sp_cpu[rank] + buffer[start:end, :].copy_(x) + for idx in range(world_size): + start = 0 if idx == 0 else cu_tokens_across_sp_cpu[idx - 1] + end = cu_tokens_across_sp_cpu[idx] + get_ep_group().broadcast(buffer[start:end, :], idx) + + return buffer + + def dispatch( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + is_sequence_parallel: bool = False, + ) -> tuple[torch.Tensor, torch.Tensor]: + sp_size = self.tp_group.world_size if is_sequence_parallel else 1 + dp_metadata = get_forward_context().dp_metadata + assert dp_metadata is not None + cu_tokens_across_sp_cpu = dp_metadata.cu_tokens_across_sp(sp_size) + + hidden_states = self.naive_multicast( + hidden_states, cu_tokens_across_sp_cpu, is_sequence_parallel + ) + router_logits = self.naive_multicast( + router_logits, cu_tokens_across_sp_cpu, is_sequence_parallel + ) + return hidden_states, router_logits + + def combine( + self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False + ) -> torch.Tensor: + ep_rank = self.rank if is_sequence_parallel else self.dp_rank + + dp_metadata = get_forward_context().dp_metadata + assert dp_metadata is not None + sp_size = self.tp_group.world_size if is_sequence_parallel else 1 + cu_tokens_across_sp_cpu = dp_metadata.cu_tokens_across_sp(sp_size) + + start = 0 if ep_rank == 0 else cu_tokens_across_sp_cpu[ep_rank - 1] + end = cu_tokens_across_sp_cpu[ep_rank] + + all_hidden_states 
= get_ep_group().all_reduce(hidden_states) + hidden_states = all_hidden_states[start:end, :] + return hidden_states + + def destroy(self): + pass + + +class AgRsAll2AllManager(All2AllManagerBase): + """ + An implementation of all2all communication based on + all-gather (dispatch) and reduce-scatter (combine). + """ + + def __init__(self, cpu_group): + super().__init__(cpu_group) + + def dispatch( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + is_sequence_parallel: bool = False, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Gather hidden_states and router_logits from all dp ranks. + """ + dp_metadata = get_forward_context().dp_metadata + assert dp_metadata is not None + sizes = dp_metadata.get_chunk_sizes_across_dp_rank() + assert sizes is not None + + dist_group = get_ep_group() if is_sequence_parallel else get_dp_group() + assert sizes[dist_group.rank_in_group] == hidden_states.shape[0] + hidden_states, router_logits = dist_group.all_gatherv( + [hidden_states, router_logits], + dim=0, + sizes=sizes, + ) + return hidden_states, router_logits + + def combine( + self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False + ) -> torch.Tensor: + """ + Reduce-scatter hidden_states across all dp ranks. + """ + dp_metadata = get_forward_context().dp_metadata + assert dp_metadata is not None + sizes = dp_metadata.get_chunk_sizes_across_dp_rank() + assert sizes is not None + + dist_group = get_ep_group() if is_sequence_parallel else get_dp_group() + hidden_states = dist_group.reduce_scatterv(hidden_states, dim=0, sizes=sizes) + return hidden_states + + def destroy(self): + pass + + +class PPLXAll2AllManager(All2AllManagerBase): + """ + All2All communication based on PPLX kernels. + """ + + def __init__(self, cpu_group): + assert has_pplx(), ( + "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md" + " to install pplx_kernels." 
class DeepEPAll2AllManagerBase(All2AllManagerBase):
    """
    Common base for the DeepEP-backed all2all managers.

    It checks that DeepEP is available, creates the handle cache and
    records the SM budget. Subclasses provide ``get_handle``; ``dispatch``
    and ``combine`` are not implemented at this level.
    """

    def __init__(self, cpu_group):
        missing_kernels_msg = (
            "DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
            " to install DeepEP kernels."
        )  # noqa
        assert has_deep_ep(), missing_kernels_msg
        super().__init__(cpu_group)

        # Handles are cached per buffer configuration.
        self.handle_cache = Cache()

        # DeepEP's own default; kept until profiling suggests a better value.
        self.num_sms = 20

    def get_handle(self, kwargs):
        raise NotImplementedError

    def dispatch(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
        is_sequence_parallel: bool = False,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        raise NotImplementedError

    def combine(
        self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
    ) -> torch.Tensor:
        raise NotImplementedError

    def destroy(self):
        pass
def set_num_sms(self, num_sms: int):
    """Set the number of SMs DeepEP may use, capped at ``self.num_sms``."""
    import deep_ep  # type: ignore[import-not-found]

    # The buffers were sized for the SM count the kernels were created
    # with, so the value can be lowered but never raised above it.
    capped_sms = min(num_sms, self.num_sms)
    deep_ep.Buffer.set_num_sms(capped_sms)
def get_handle(self, kwargs):
    """
    Return the cached ``deep_ep.Buffer`` for this configuration, creating
    it on first use. ``kwargs`` must match the parameters of
    ``_make_all2all_kwargs``.
    """
    import deep_ep  # type: ignore[import-not-found]

    ll_buffer_args = self._make_all2all_kwargs(**kwargs)
    logger.debug("DeepEP all2all args %s", ll_buffer_args)
    buffer: deep_ep.Buffer = self.handle_cache.get_or_create(
        ll_buffer_args, deep_ep.Buffer
    )
    return buffer
def ensure_alltoall_workspace_initialized(self):
    """Initialize the all2all workspace on first use.

    Returns:
        ``False`` when the flashinfer all2all module is unavailable or when
        ``self.world_size <= 1``; otherwise the value of
        ``self.initialized`` after attempting initialization.
    """
    if not has_flashinfer_all2all():
        return False

    if self.world_size <= 1:
        return False

    if not self.initialized:
        self.initialize(
            world_size=self.world_size,
            rank=self.rank,
            # Fix: ``device_count`` has to be called. Passing the bare
            # function handed a callable to ``initialize`` where an int
            # (``gpus_per_node``) is expected.
            gpus_per_node=torch.cuda.device_count(),
        )
    return self.initialized
def should_nccl_symm_mem_allreduce(world_size: int, input_tensor: torch.Tensor) -> bool:
    """Decide whether all-reduce should use NCCL symmetric memory.

    Returns False in batch-invariant mode, when symmetric memory is not
    enabled, or when ``world_size`` is below the configured minimum.
    Otherwise returns True if the tensor reaches the byte threshold
    configured for this world size, or if ``world_size`` exceeds the
    "always use" limit.
    """
    from vllm.distributed.device_communicators.pynccl_allocator import (
        is_symmetric_memory_enabled,
    )

    # Same short-circuit order as separate guards: batch-invariance first.
    if vllm_is_batch_invariant() or not is_symmetric_memory_enabled():
        return False

    config = NCCL_SYMM_MEM_ALL_REDUCE_CONFIG
    if world_size < config["min_world_size"]:
        return False

    size_threshold = config["thresholds"].get(world_size)
    reaches_threshold = (
        size_threshold is not None and input_tensor.nbytes >= size_threshold
    )
    if reaches_threshold:
        return True
    return world_size > config["always_use_above_world_size"]
def can_actually_p2p(
    batch_src: Sequence[int],
    batch_tgt: Sequence[int],
) -> Sequence[bool]:
    """
    Test real peer-to-peer access for each (src, tgt) pair in the batch.

    `torch.cuda.can_device_access_peer(src, tgt)` can return `True` on a
    broken driver even though P2P does not work, see
    https://github.com/vllm-project/vllm/issues/2728 and
    https://forums.developer.nvidia.com/t/direct-gpu-gpu-communication-does-not-seem-to-work-properly/283264/10
    so an actual access is performed instead.

    The check combines P2P with CUDA IPC: the producer process allocates
    memory on GPU src and publishes an IPC handle; the consumer process
    opens that handle while on GPU tgt and writes to it. Both processes
    then verify the contents and report a result.

    Process creation is the expensive part, so one producer and one
    consumer process handle every pair in the batch, resetting the device
    after each test (a reset is not available through PyTorch).

    Returns one bool per pair; a pair on which the two processes disagree
    is reported as False.
    """  # noqa
    # The children get the same CUDA_VISIBLE_DEVICES so that they see the
    # same set of GPUs as this process.
    visible_devices = envs.CUDA_VISIBLE_DEVICES

    # The worker processes must be started with the spawn method.
    ctx = mp.get_context("spawn")
    producer_queue = ctx.Queue()
    consumer_queue = ctx.Queue()
    result_queue = ctx.Queue()
    shared_args = (producer_queue, consumer_queue, result_queue, visible_devices)

    p_src = ctx.Process(target=producer, args=(batch_src, *shared_args))
    p_tgt = ctx.Process(target=consumer, args=(batch_tgt, *shared_args))
    p_src.start()
    p_tgt.start()
    p_src.join()
    p_tgt.join()
    assert p_src.exitcode == 0 and p_tgt.exitcode == 0

    outcomes: list[bool] = []
    for src, tgt in zip(batch_src, batch_tgt):
        # Two entries per pair are read from the queue, one from each process.
        first = result_queue.get()
        second = result_queue.get()
        if first == second:
            outcomes.append(first)
            continue
        logger.warning(
            "Two processes do not agree on the P2P access"
            " status on %d -> %d, treat as disabled.",
            src,
            tgt,
        )
        outcomes.append(False)
    return outcomes
def gpu_p2p_access_check(src: int, tgt: int) -> bool:
    """Check if GPU src can access GPU tgt.

    The result is looked up in a table keyed by ``"{src}->{tgt}"``. The
    table is held in the module-level ``_gpu_p2p_access_cache`` once loaded,
    and is persisted as a JSON file under ``envs.VLLM_CACHE_ROOT`` whose
    name includes the CUDA_VISIBLE_DEVICES value.

    If the file does not exist, it is generated by the local master
    (``local_rank == 0``), or by any caller when torch.distributed is not
    initialized, by running this file as a subprocess that tests all
    device pairs.

    Raises:
        RuntimeError: if the batch-testing subprocess exits with a
            non-zero return code.
        KeyError: if the requested pair is not present in the table.
    """

    # if the cache variable is already calculated,
    # read from the cache instead of checking it again
    global _gpu_p2p_access_cache
    if _gpu_p2p_access_cache is not None:
        return _gpu_p2p_access_cache[f"{src}->{tgt}"]

    is_distributed = dist.is_initialized()

    num_dev = cuda_device_count_stateless()
    cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
    if cuda_visible_devices is None:
        # no explicit setting: name the cache after all local device ids
        cuda_visible_devices = ",".join(str(i) for i in range(num_dev))

    path = os.path.join(
        envs.VLLM_CACHE_ROOT, f"gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
    )
    os.makedirs(os.path.dirname(path), exist_ok=True)
    from vllm.distributed.parallel_state import get_world_group

    if (not is_distributed or get_world_group().local_rank == 0) and (
        not os.path.exists(path)
    ):
        # only the local master process (with local_rank == 0) can
        # enter this block to calculate the cache
        logger.info("generating GPU P2P access cache in %s", path)
        cache: dict[str, bool] = {}
        ids = list(range(num_dev))
        # batch of all pairs of GPUs
        batch_src, batch_tgt = zip(*list(product(ids, ids)))
        # NOTE: we use `subprocess` rather than `multiprocessing` here
        # because the caller might not have `if __name__ == "__main__":`,
        # in that case we cannot use spawn method in multiprocessing.
        # However, `can_actually_p2p` requires spawn method.
        # The fix is, we use `subprocess` to call the function,
        # where we have `if __name__ == "__main__":` in this file.

        # use a temporary file to store the result
        # we don't use the output of the subprocess directly,
        # because the subprocess might produce logging output
        with tempfile.NamedTemporaryFile() as output_file:
            # the subprocess reads this pickled tuple from stdin and writes
            # its pickled result list to the temporary file
            input_bytes = pickle.dumps((batch_src, batch_tgt, output_file.name))
            returned = subprocess.run(
                [sys.executable, __file__], input=input_bytes, capture_output=True
            )
            # check if the subprocess is successful
            try:
                returned.check_returncode()
            except Exception as e:
                # wrap raised exception to provide more information
                raise RuntimeError(
                    f"Error happened when batch testing "
                    f"peer-to-peer access from {batch_src} to {batch_tgt}:\n"
                    f"{returned.stderr.decode()}"
                ) from e
            with open(output_file.name, "rb") as f:
                result = pickle.load(f)
        for _i, _j, r in zip(batch_src, batch_tgt, result):
            cache[f"{_i}->{_j}"] = r
        with open(path, "w") as f:
            json.dump(cache, f, indent=4)
    if is_distributed:
        # wait for the process that generates the file before reading it
        get_world_group().barrier()
    logger.info("reading GPU P2P access cache from %s", path)
    with open(path) as f:
        cache = json.load(f)
    _gpu_p2p_access_cache = cache
    return _gpu_p2p_access_cache[f"{src}->{tgt}"]
class Cache:
    """Thread-safe cache of objects keyed by their constructor kwargs.

    Entries are stored in a ``WeakValueDictionary``, so the cache itself
    does not keep an object alive.
    """

    def __init__(self):
        self._cache: WeakValueDictionary = WeakValueDictionary()
        self._lock = threading.RLock()  # Reentrant lock for thread safety

    def get_or_create(self, kwargs, func):
        """Return the object cached for ``kwargs``, building it with
        ``func(**kwargs)`` if there is no live entry."""
        # Sorting the items makes the key independent of insertion order.
        cache_key = tuple(sorted(kwargs.items()))

        with self._lock:
            cached = self._cache.get(cache_key)
            if cached is not None:
                return cached
            created = func(**kwargs)
            self._cache[cache_key] = created
            return created
+ raise NotImplementedError + + def dispatch( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + is_sequence_parallel: bool = False, + ): + raise NotImplementedError + + def set_num_sms(self, num_sms: int): + pass + + def max_sms_used(self) -> int | None: + return None # None means it could use the whole GPU + + def combine(self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False): + raise NotImplementedError + + def destroy(self): + pass + + +class DeviceCommunicatorBase: + """ + Base class for device-specific communicator. + It can use the `cpu_group` to initialize the communicator. + If the device has PyTorch integration (PyTorch can recognize its + communication backend), the `device_group` will also be given. + """ + + def __init__( + self, + cpu_group: ProcessGroup, + device: torch.device | None = None, + device_group: ProcessGroup | None = None, + unique_name: str = "", + ): + self.device = device or torch.device("cpu") + self.cpu_group = cpu_group + self.device_group = device_group + self.unique_name = unique_name + self.rank = dist.get_rank(cpu_group) + self.world_size = dist.get_world_size(cpu_group) + self.ranks = dist.get_process_group_ranks(cpu_group) + self.global_rank = dist.get_rank() + self.global_world_size = dist.get_world_size() + self.rank_in_group = dist.get_group_rank(self.cpu_group, self.global_rank) + + use_ep = False + all2all_backend = None + from vllm.config import get_current_vllm_config + + config = get_current_vllm_config() + if config is not None: + # as long as we use data parallel (coupled data parallel + # where all data parallel ranks execute forward together), + # we initialize the all2all manager used in expert parallel. 
+ use_ep = config.parallel_config.data_parallel_size > 1 + all2all_backend = config.parallel_config.all2all_backend + + self.is_ep_communicator = "ep" in unique_name + self.use_all2all = self.is_ep_communicator and use_ep + self.all2all_backend = all2all_backend + self.all2all_manager: All2AllManagerBase | None = None + + def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: + dist.all_reduce(input_, group=self.device_group) + return input_ + + def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: + if dim < 0: + # Convert negative dim to positive. + dim += input_.dim() + input_size = input_.size() + # NOTE: we have to use concat-style all-gather here, + # stack-style all-gather has compatibility issues with + # torch.compile . see https://github.com/pytorch/pytorch/issues/138795 + output_size = (input_size[0] * self.world_size,) + input_size[1:] + # Allocate output tensor. + output_tensor = torch.empty( + output_size, dtype=input_.dtype, device=input_.device + ) + # All-gather. + if self.use_vllm_comm: + ixfd.all_gather_into_tensor(output_tensor, + input_, + group=self.device_group, + async_op=True) + else: + dist.all_gather_into_tensor(output_tensor, input_, group=self.device_group) + # Reshape + output_tensor = output_tensor.reshape((self.world_size,) + input_size) + output_tensor = output_tensor.movedim(0, dim) + output_tensor = output_tensor.reshape( + input_size[:dim] + + (self.world_size * input_size[dim],) + + input_size[dim + 1 :] + ) + return output_tensor + + def all_gatherv( + self, + input_: torch.Tensor | list[torch.Tensor], + dim: int = 0, + sizes: list[int] | None = None, + ) -> torch.Tensor | list[torch.Tensor]: + raise NotImplementedError + + def reduce_scatter(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: + world_size = self.world_size + # Bypass the function if we are using only 1 GPU. 
+ if world_size == 1: + return input_ + assert -input_.dim() <= dim < input_.dim(), ( + f"Invalid dim ({dim}) for input tensor with shape {input_.size()}" + ) + + if dim < 0: + # Convert negative dim to positive. + dim += input_.dim() + + # Note: This will produce an incorrect answer if we don't make + # the input_tensor contiguous. Possible bug in reduce_scatter_tensor? + input_tensor = input_.movedim(0, dim).contiguous() + + assert input_tensor.shape[0] % world_size == 0 + chunk_size = input_tensor.shape[0] // world_size + output_shape = (chunk_size,) + input_tensor.shape[1:] + + output_tensor = torch.empty( + output_shape, dtype=input_tensor.dtype, device=input_tensor.device + ) + + # Perform reduce-scatter operation + torch.distributed.reduce_scatter_tensor( + output_tensor, input_tensor, group=self.device_group + ) + + # Reshape before returning + return output_tensor.movedim(0, dim).contiguous() + + def reduce_scatterv( + self, input_: torch.Tensor, dim: int = -1, sizes: list[int] | None = None + ) -> torch.Tensor: + raise NotImplementedError + + def gather( + self, input_: torch.Tensor, dst: int = 0, dim: int = -1 + ) -> torch.Tensor | None: + """ + NOTE: We assume that the input tensor is on the same device across + all the ranks. + NOTE: `dst` is the local rank of the destination rank. + """ + world_size = self.world_size + assert -input_.dim() <= dim < input_.dim(), ( + f"Invalid dim ({dim}) for input tensor with shape {input_.size()}" + ) + if dim < 0: + # Convert negative dim to positive. + dim += input_.dim() + + # Allocate output tensor. + if self.rank_in_group == dst: + gather_list = [torch.empty_like(input_) for _ in range(world_size)] + else: + gather_list = None + # Gather. 
+ if self.use_vllm_comm: + ixfd.gather(input_, + gather_list, + dst=self.ranks[dst], + group=self.device_group, + async_op=True) + else: + torch.distributed.gather( + input_, gather_list, dst=self.ranks[dst], group=self.device_group + ) + if self.rank_in_group == dst: + output_tensor = torch.cat(gather_list, dim=dim) + else: + output_tensor = None + return output_tensor + + def send(self, tensor: torch.Tensor, dst: int | None = None) -> None: + """Sends a tensor to the destination rank in a blocking way""" + """NOTE: `dst` is the local rank of the destination rank.""" + if dst is None: + dst = (self.rank_in_group + 1) % self.world_size + torch.distributed.send(tensor, self.ranks[dst], self.device_group) + + def recv( + self, size: torch.Size, dtype: torch.dtype, src: int | None = None + ) -> torch.Tensor: + """Receives a tensor from the source rank.""" + """NOTE: `src` is the local rank of the source rank.""" + if src is None: + src = (self.rank_in_group - 1) % self.world_size + + tensor = torch.empty(size, dtype=dtype, device=self.device) + torch.distributed.recv(tensor, self.ranks[src], self.device_group) + return tensor + + def destroy(self): + pass + + def prepare_communication_buffer_for_model(self, model: torch.nn.Module) -> None: + """ + Prepare the communication buffer for the model. + """ + if not self.is_ep_communicator: + return + + moe_modules = [ + module + for module in model.modules() + # TODO(bnell): Should use isinstance but can't. Maybe search for + # presence of quant_method.maybe_init_modular_kernel? + if ( + module.__class__.__name__ == "FusedMoE" + or module.__class__.__name__ == "SharedFusedMoE" + ) + ] + for module in moe_modules: + module.maybe_init_modular_kernel() + + def dispatch( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + is_sequence_parallel: bool = False, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Dispatch the hidden states and router logits to the appropriate device. 
+ This is a no-op in the base class. + """ + return hidden_states, router_logits + + def combine( + self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False + ) -> torch.Tensor: + """ + Combine the hidden states and router logits from the appropriate device. + This is a no-op in the base class. + """ + return hidden_states diff --git a/distributed/device_communicators/cpu_communicator.py b/distributed/device_communicators/cpu_communicator.py new file mode 100644 index 0000000..fdfb74d --- /dev/null +++ b/distributed/device_communicators/cpu_communicator.py @@ -0,0 +1,209 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os +from typing import Any + +import torch +from torch.distributed import ProcessGroup + +from vllm.distributed.utils import pickle +from vllm.platforms import current_platform +from vllm.platforms.interface import CpuArchEnum + +from .base_device_communicator import DeviceCommunicatorBase + + +class CpuCommunicator(DeviceCommunicatorBase): + def __init__( + self, + cpu_group: ProcessGroup, + device: torch.device | None = None, + device_group: ProcessGroup | None = None, + unique_name: str = "", + ): + super().__init__(cpu_group, device, device_group, unique_name) + self.dist_module = torch.distributed + + if ( + (current_platform.get_cpu_architecture() == CpuArchEnum.X86) + and hasattr(torch.ops._C, "init_shm_manager") + and (unique_name.startswith("tp") or unique_name.startswith("pp")) + ): + self.dist_module = _CPUSHMDistributed(self) + + def all_reduce(self, input_): + self.dist_module.all_reduce(input_, group=self.device_group) + return input_ + + def gather( + self, input_: torch.Tensor, dst: int = 0, dim: int = -1 + ) -> torch.Tensor | None: + """ + NOTE: We assume that the input tensor is on the same device across + all the ranks. + NOTE: `dst` is the local rank of the destination rank. 
def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
    """
    All-gather `input_` from every rank through `self.dist_module` and
    concatenate the per-rank tensors along `dim`.
    """
    if dim < 0:
        # Convert negative dim to positive.
        dim += input_.dim()
    shape = input_.size()
    num_ranks = self.world_size
    # NOTE: we have to use concat-style all-gather here,
    # stack-style all-gather has compatibility issues with
    # torch.compile . see https://github.com/pytorch/pytorch/issues/138795
    gathered = torch.empty(
        (shape[0] * num_ranks,) + shape[1:],
        dtype=input_.dtype,
        device=input_.device,
    )
    self.dist_module.all_gather_into_tensor(
        gathered, input_, group=self.device_group
    )

    # Split off the rank axis, move it next to `dim`, then merge both axes.
    per_rank = gathered.reshape((num_ranks,) + shape).movedim(0, dim)
    merged_shape = shape[:dim] + (num_ranks * shape[dim],) + shape[dim + 1 :]
    return per_rank.reshape(merged_shape)
+ torch.ops._C.shm_gather( + self.handle, + input, + gather_list, + torch.distributed.get_group_rank(group, dst), + ) + + def all_gather_into_tensor( + self, + output: torch.Tensor, + input: torch.Tensor, + group: ProcessGroup | None = None, + ) -> None: + torch.ops._C.shm_all_gather(self.handle, input, output) + + def send_tensor_dict( + self, + tensor_dict: dict[str, torch.Tensor | Any], + dst: int, + ) -> None: + key_list = list(tensor_dict.keys()) + value_list = list(tensor_dict.values()) + size_list = [] + for v in value_list: + if not isinstance(v, torch.Tensor): + raise RuntimeError("CpuCommunicator only supports sending tensors.") + size_list.append(v.size()) + key_size_tensor = torch.frombuffer( + pickle.dumps([key_list, size_list]), dtype=torch.uint8 + ) + value_list.append(key_size_tensor) + + torch.ops._C.shm_send_tensor_list(self.handle, value_list, dst) + + return None + + def recv_tensor_dict( + self, + src: int, + ) -> dict[str, torch.Tensor | Any]: + tensor_list = torch.ops._C.shm_recv_tensor_list(self.handle, src) + + value_list: list[torch.Tensor] = tensor_list[:-1] + key_size_tensor = tensor_list[-1] + + key_size = pickle.loads(key_size_tensor.numpy().tobytes()) + key_list = key_size[0] + size_list = key_size[1] + assert len(key_list) == len(size_list) + assert len(key_list) == len(value_list) + + tensor_dict: dict[str, torch.Tensor] = {} + for key, size, t in zip(key_list, size_list, value_list): + tensor_dict[key] = t.view(size) + return tensor_dict diff --git a/distributed/device_communicators/cuda_communicator.py b/distributed/device_communicators/cuda_communicator.py new file mode 100644 index 0000000..9c04664 --- /dev/null +++ b/distributed/device_communicators/cuda_communicator.py @@ -0,0 +1,333 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import torch +from torch.distributed import ProcessGroup + +import vllm.envs as envs +from 
class CudaCommunicator(DeviceCommunicatorBase):
    """Device communicator that picks among several all-reduce backends.

    ``all_reduce`` tries, in order: the NCCL symmetric-memory op, quick
    all-reduce, custom all-reduce, the torch symmetric-memory communicator,
    and finally ``ixformer.distributed`` or ``torch.distributed``.

    Setting the environment variable ``VLLM_FORCE_NCCL_COMM`` to ``1``, ``Y``
    or ``y`` clears ``use_vllm_comm`` and routes the final fallback (and
    ``reduce_scatter``/``send``/``recv``) through ``torch.distributed``.
    """

    # Maps each supported ``all2all_backend`` value to the name of the manager
    # class exported by the sibling ``all2all`` module.
    _ALL2ALL_MANAGER_NAMES = {
        "naive": "NaiveAll2AllManager",
        "allgather_reducescatter": "AgRsAll2AllManager",
        "pplx": "PPLXAll2AllManager",
        "deepep_high_throughput": "DeepEPHTAll2AllManager",
        "deepep_low_latency": "DeepEPLLAll2AllManager",
        "flashinfer_all2allv": "FlashInferAllToAllManager",
    }

    def __init__(
        self,
        cpu_group: ProcessGroup,
        device: torch.device | None = None,
        device_group: ProcessGroup | None = None,
        unique_name: str = "",
    ):
        """Create the backend communicators this group may use.

        Raises:
            ValueError: if all2all is enabled with an unknown backend name.
        """
        super().__init__(cpu_group, device, device_group, unique_name)
        if "tp" not in unique_name:
            # custom allreduce or torch symm mem can be used only by tp
            use_custom_allreduce = False
            use_torch_symm_mem = False
        else:
            from vllm.distributed.parallel_state import _ENABLE_CUSTOM_ALL_REDUCE

            use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE
            use_torch_symm_mem = envs.VLLM_ALLREDUCE_USE_SYMM_MEM

        self.use_custom_allreduce = use_custom_allreduce
        self.use_torch_symm_mem = use_torch_symm_mem

        # True unless the user forces the torch.distributed path.
        self.use_vllm_comm = os.environ.get("VLLM_FORCE_NCCL_COMM", None) not in [
            "1",
            "Y",
            "y",
        ]

        # lazy import to avoid documentation build error
        from vllm.distributed.device_communicators.custom_all_reduce import (
            CustomAllreduce,
        )
        from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
        from vllm.distributed.device_communicators.quick_all_reduce import (
            QuickAllReduce,
        )
        from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator

        self.pynccl_comm: PyNcclCommunicator | None = None
        if self.world_size > 1:
            self.pynccl_comm = PyNcclCommunicator(
                group=self.cpu_group,
                device=self.device,
            )
            if is_symmetric_memory_enabled():
                register_nccl_symmetric_ops(self.pynccl_comm)

        self.ca_comm: CustomAllreduce | None = None
        self.qr_comm: QuickAllReduce | None = None
        self.symm_mem_comm: SymmMemCommunicator | None = None
        if use_torch_symm_mem and current_platform.is_cuda():
            self.symm_mem_comm = SymmMemCommunicator(
                group=self.cpu_group,
                device=self.device,
            )

        if use_custom_allreduce and self.world_size > 1:
            # Initialize a custom fast all-reduce implementation.
            self.ca_comm = CustomAllreduce(
                group=self.cpu_group,
                device=self.device,
                symm_mem_enabled=(
                    self.symm_mem_comm is not None and not self.symm_mem_comm.disabled
                ),
            )

            if current_platform.is_rocm():
                # Initialize a custom quick all-reduce implementation for AMD.
                # Quick reduce is designed as a complement to custom allreduce.
                # Based on quickreduce (https://github.com/mk1-project/quickreduce).
                # If it's a rocm, 'use_custom_allreduce==True' means it must
                # currently be an MI300 series.
                self.qr_comm = QuickAllReduce(group=self.cpu_group, device=self.device)

        if self.use_all2all:
            manager_name = self._ALL2ALL_MANAGER_NAMES.get(self.all2all_backend)
            if manager_name is None:
                raise ValueError(f"Unknown all2all backend: {self.all2all_backend}")

            # Imported lazily, only when an all2all manager is needed.
            from . import all2all as all2all_module

            manager_cls = getattr(all2all_module, manager_name)
            self.all2all_manager = manager_cls(self.cpu_group)

            logger.info_once(
                "Using %s all2all manager.",
                self.all2all_manager.__class__.__name__,
                scope="global",
            )

    def all_reduce(self, input_):
        """All-reduce ``input_`` with the first applicable backend.

        The specialised backends return a new tensor. The final fallback
        reduces ``input_`` in place and returns it.
        """
        # since currently we perform copy input -> symm_input -> out-of-place AR
        # return symm_output, we don't need to check if input is symmetric
        if self.pynccl_comm is not None and should_nccl_symm_mem_allreduce(
            self.pynccl_comm.world_size, input_
        ):
            out = torch.ops.vllm.all_reduce_symmetric_with_copy(input_)
            if out is not None:
                return out
        # always try quick reduce first, then custom allreduce,
        # and then pynccl. (quick reduce just for ROCM MI3*)
        qr_comm = self.qr_comm
        if (
            qr_comm is not None
            and not qr_comm.disabled
            and qr_comm.should_quick_allreduce(input_)
        ):
            out = qr_comm.quick_all_reduce(input_)
            assert out is not None
            return out
        ca_comm = self.ca_comm
        if (
            ca_comm is not None
            and not ca_comm.disabled
            and ca_comm.should_custom_ar(input_)
        ):
            out = ca_comm.custom_all_reduce(input_)
            assert out is not None
            return out
        symm_mem_comm = self.symm_mem_comm
        if symm_mem_comm is not None and symm_mem_comm.should_use_symm_mem(input_):
            out = symm_mem_comm.all_reduce(input_)
            assert out is not None
            return out
        if self.world_size == 1:
            return input_

        if self.use_vllm_comm:
            # NOTE(review): async_op=True and the returned handle is not
            # waited on here; presumably ixformer orders the collective on the
            # current stream -- confirm against the ixformer documentation.
            ixfd.all_reduce(input_, group=self.device_group, async_op=True)
        else:
            torch.distributed.all_reduce(input_, group=self.device_group)
        return input_

    def reduce_scatter(self, input_: torch.Tensor, dim: int = -1):
        """Reduce-scatter ``input_`` along ``dim`` in equal chunks.

        The size of ``input_`` along ``dim`` must be divisible by the world
        size. Honours ``use_vllm_comm`` the same way ``all_reduce``, ``send``
        and ``recv`` do. pynccl is not used here, so no pynccl communicator is
        required.
        """
        world_size = self.world_size
        if dim < 0:
            # Convert negative dim to positive.
            dim += input_.dim()

        # Note: This will produce an incorrect answer if we don't make
        # the input_tensor contiguous. Possible bug in reduce_scatter_tensor?
        input_tensor = input_.movedim(0, dim).contiguous()

        assert input_tensor.shape[0] % world_size == 0
        chunk_size = input_tensor.shape[0] // world_size
        output_shape = (chunk_size,) + input_tensor.shape[1:]

        output = torch.empty(
            output_shape, dtype=input_tensor.dtype, device=input_tensor.device
        )

        # Perform reduce-scatter operation
        if self.use_vllm_comm:
            # NOTE(review): same un-awaited async_op=True pattern as all_reduce.
            ixfd.reduce_scatter_tensor(
                output, input_tensor, group=self.device_group, async_op=True
            )
        else:
            torch.distributed.reduce_scatter_tensor(
                output, input_tensor, group=self.device_group
            )

        # Reshape before returning
        return output.movedim(0, dim).contiguous()

    def reduce_scatterv(
        self, input_: torch.Tensor, dim: int = -1, sizes: list[int] | None = None
    ):
        """Reduce-scatter through pynccl, optionally with per-rank ``sizes``.

        When ``sizes`` is given it must have one entry per rank and sum to the
        size of ``input_`` along ``dim``; this rank keeps
        ``sizes[self.rank_in_group]`` rows. Otherwise chunks are equal.
        """
        world_size = self.world_size
        pynccl_comm = self.pynccl_comm
        assert pynccl_comm is not None
        if dim < 0:
            # Convert negative dim to positive.
            dim += input_.dim()

        # Note: This will produce an incorrect answer if we don't make
        # the input_tensor contiguous. Possible bug in reduce_scatter_tensor?
        input_tensor = input_.movedim(0, dim).contiguous()

        if sizes is not None:
            assert len(sizes) == world_size
            assert input_tensor.shape[0] == sum(sizes)
            chunk_size = sizes[self.rank_in_group]
        else:
            assert input_tensor.shape[0] % world_size == 0
            chunk_size = input_tensor.shape[0] // world_size
        output_shape = (chunk_size,) + input_tensor.shape[1:]

        output = torch.empty(
            output_shape, dtype=input_tensor.dtype, device=input_tensor.device
        )

        if sizes is not None:
            pynccl_comm.reduce_scatterv(output, input_tensor, sizes=sizes)
        else:
            pynccl_comm.reduce_scatter(output, input_tensor)

        # Reshape before returning
        return output.movedim(0, dim).contiguous()

    def send(self, tensor: torch.Tensor, dst: int | None = None) -> None:
        """Sends a tensor to the destination rank in a blocking way.

        NOTE: `dst` is the local rank of the destination rank. When omitted,
        the next rank in the group (wrapping around) is used.
        """
        if dst is None:
            dst = (self.rank_in_group + 1) % self.world_size
        if self.use_vllm_comm:
            ixfd.send(tensor, self.ranks[dst], self.device_group)
        else:
            torch.distributed.send(tensor, self.ranks[dst], self.device_group)

    def recv(
        self, size: torch.Size, dtype: torch.dtype, src: int | None = None
    ) -> torch.Tensor:
        """Receives a tensor from the source rank.

        NOTE: `src` is the local rank of the source rank. When omitted, the
        previous rank in the group (wrapping around) is used. The tensor is
        allocated on ``self.device`` with the given ``size`` and ``dtype``.
        """
        if src is None:
            src = (self.rank_in_group - 1) % self.world_size

        tensor = torch.empty(size, dtype=dtype, device=self.device)
        if self.use_vllm_comm:
            ixfd.recv(tensor, self.ranks[src], self.device_group)
        else:
            torch.distributed.recv(tensor, self.ranks[src], self.device_group)
        return tensor

    def destroy(self):
        """Drop the pynccl/custom all-reduce references and destroy all2all."""
        self.pynccl_comm = None
        self.ca_comm = None
        if self.all2all_manager is not None:
            self.all2all_manager.destroy()
            self.all2all_manager = None

    def all_gatherv(
        self,
        input_: torch.Tensor | list[torch.Tensor],
        dim: int = 0,
        sizes: list[int] | None = None,
    ):
        """All-gather one tensor, or a list of tensors, along dim 0 via pynccl.

        Args:
            input_: a tensor, or a list of tensors gathered in one NCCL group.
            dim: must be 0.
            sizes: optional per-rank sizes along dim 0; ignored when all
                entries are equal.

        Returns:
            One output tensor for a tensor input, otherwise a list of outputs.

        Raises:
            NotImplementedError: if ``dim`` is not 0.
        """
        if dim != 0:
            raise NotImplementedError("only dim 0 all-gatherv is supported")
        world_size = self.world_size
        pynccl_comm = self.pynccl_comm
        assert pynccl_comm is not None and not pynccl_comm.disabled

        # 'sizes' is not needed if all inputs in the same group have the same
        # shape
        if sizes is not None and all(s == sizes[0] for s in sizes):
            sizes = None

        def _all_gather_single(input_: torch.Tensor, sizes: list[int] | None = None):
            input_size = input_.size()
            if sizes is not None:
                assert len(sizes) == world_size
                assert input_.shape[dim] == sizes[self.rank_in_group], (
                    f"{input_.shape[dim]} != {sizes[self.rank_in_group]}"
                )
                output_size = (sum(sizes),) + input_size[1:]
            else:
                output_size = (input_size[0] * world_size,) + input_size[1:]
            # Allocate output tensor.
            output_tensor = torch.empty(
                output_size, dtype=input_.dtype, device=input_.device
            )
            if sizes is not None:
                pynccl_comm.all_gatherv(output_tensor, input_, sizes=sizes)
            else:
                pynccl_comm.all_gather(output_tensor, input_)
            return output_tensor

        if isinstance(input_, torch.Tensor):
            return _all_gather_single(input_, sizes)

        output_list = []
        pynccl_comm.group_start()
        try:
            for inp in input_:
                output_list.append(_all_gather_single(inp, sizes=sizes))
        finally:
            # Always close the group, even if one of the gathers raised.
            pynccl_comm.group_end()

        return output_list

    def dispatch(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
        is_sequence_parallel: bool = False,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Forward to the all2all manager's ``dispatch``; it must exist."""
        assert self.all2all_manager is not None
        hidden_states, router_logits = self.all2all_manager.dispatch(
            hidden_states, router_logits, is_sequence_parallel
        )
        return hidden_states, router_logits

    def combine(
        self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
    ) -> torch.Tensor:
        """Forward to the all2all manager's ``combine``; it must exist."""
        assert self.all2all_manager is not None
        hidden_states = self.all2all_manager.combine(
            hidden_states, is_sequence_parallel
        )
        return hidden_states
+""" + +import ctypes +from dataclasses import dataclass +from typing import Any + +# this line makes it possible to directly load `libcudart.so` using `ctypes` +import torch # noqa + +import vllm.envs as envs +from vllm.logger import init_logger +from vllm.platforms import current_platform + +logger = init_logger(__name__) + +# === export types and functions from cudart to Python === +# for the original cudart definition, please check +# https://docs.nvidia.com/cuda/cuda-runtime-api/index.html + +cudaError_t = ctypes.c_int +cudaMemcpyKind = ctypes.c_int + + +class cudaIpcMemHandle_t(ctypes.Structure): + _fields_ = [("internal", ctypes.c_byte * 128)] + + +@dataclass +class Function: + name: str + restype: Any + argtypes: list[Any] + + +def find_loaded_library(lib_name) -> str | None: + """ + According to according to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html, + the file `/proc/self/maps` contains the memory maps of the process, which includes the + shared libraries loaded by the process. We can use this file to find the path of the + a loaded library. 
+ """ # noqa + found = False + with open("/proc/self/maps") as f: + for line in f: + if lib_name in line: + found = True + break + if not found: + # the library is not loaded in the current process + return None + # if lib_name is libcudart, we need to match a line with: + # address /path/to/libcudart-hash.so.11.0 + start = line.index("/") + path = line[start:].strip() + filename = path.split("/")[-1] + assert filename.rpartition(".so")[0].startswith(lib_name), ( + f"Unexpected filename: {filename} for library {lib_name}" + ) + return path + + +class CudaRTLibrary: + exported_functions = [ + # ​cudaError_t cudaSetDevice ( int device ) + Function("cudaSetDevice", cudaError_t, [ctypes.c_int]), + # cudaError_t cudaDeviceSynchronize ( void ) + Function("cudaDeviceSynchronize", cudaError_t, []), + # ​cudaError_t cudaDeviceReset ( void ) + Function("cudaDeviceReset", cudaError_t, []), + # const char* cudaGetErrorString ( cudaError_t error ) + Function("cudaGetErrorString", ctypes.c_char_p, [cudaError_t]), + # ​cudaError_t cudaMalloc ( void** devPtr, size_t size ) + Function( + "cudaMalloc", + cudaError_t, + [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t], + ), + # ​cudaError_t cudaFree ( void* devPtr ) + Function("cudaFree", cudaError_t, [ctypes.c_void_p]), + # ​cudaError_t cudaMemset ( void* devPtr, int value, size_t count ) + Function( + "cudaMemset", cudaError_t, [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t] + ), + # ​cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ) # noqa + Function( + "cudaMemcpy", + cudaError_t, + [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, cudaMemcpyKind], + ), + # cudaError_t cudaIpcGetMemHandle ( cudaIpcMemHandle_t* handle, void* devPtr ) # noqa + Function( + "cudaIpcGetMemHandle", + cudaError_t, + [ctypes.POINTER(cudaIpcMemHandle_t), ctypes.c_void_p], + ), + # ​cudaError_t cudaIpcOpenMemHandle ( void** devPtr, cudaIpcMemHandle_t handle, unsigned int flags ) # noqa + Function( + 
"cudaIpcOpenMemHandle", + cudaError_t, + [ctypes.POINTER(ctypes.c_void_p), cudaIpcMemHandle_t, ctypes.c_uint], + ), + ] + + # https://rocm.docs.amd.com/projects/HIPIFY/en/latest/tables/CUDA_Runtime_API_functions_supported_by_HIP.html # noqa + cuda_to_hip_mapping = { + "cudaSetDevice": "hipSetDevice", + "cudaDeviceSynchronize": "hipDeviceSynchronize", + "cudaDeviceReset": "hipDeviceReset", + "cudaGetErrorString": "hipGetErrorString", + "cudaMalloc": "hipMalloc", + "cudaFree": "hipFree", + "cudaMemset": "hipMemset", + "cudaMemcpy": "hipMemcpy", + "cudaIpcGetMemHandle": "hipIpcGetMemHandle", + "cudaIpcOpenMemHandle": "hipIpcOpenMemHandle", + } + + # class attribute to store the mapping from the path to the library + # to avoid loading the same library multiple times + path_to_library_cache: dict[str, Any] = {} + + # class attribute to store the mapping from library path + # to the corresponding dictionary + path_to_dict_mapping: dict[str, dict[str, Any]] = {} + + def __init__(self, so_file: str | None = None): + if so_file is None: + so_file = find_loaded_library("libcudart") + if so_file is None: + # libcudart is not loaded in the current process, try hip + so_file = find_loaded_library("libamdhip64") + # should be safe to assume now that we are using ROCm + # as the following assertion should error out if the + # libhiprtc library is also not loaded + if so_file is None: + so_file = envs.VLLM_CUDART_SO_PATH # fallback to env var + assert so_file is not None, ( + "libcudart is not loaded in the current process, " + "try setting VLLM_CUDART_SO_PATH" + ) + if so_file not in CudaRTLibrary.path_to_library_cache: + lib = ctypes.CDLL(so_file) + CudaRTLibrary.path_to_library_cache[so_file] = lib + self.lib = CudaRTLibrary.path_to_library_cache[so_file] + + if so_file not in CudaRTLibrary.path_to_dict_mapping: + _funcs = {} + for func in CudaRTLibrary.exported_functions: + f = getattr( + self.lib, + CudaRTLibrary.cuda_to_hip_mapping[func.name] + if current_platform.is_rocm() 
+ else func.name, + ) + f.restype = func.restype + f.argtypes = func.argtypes + _funcs[func.name] = f + CudaRTLibrary.path_to_dict_mapping[so_file] = _funcs + self.funcs = CudaRTLibrary.path_to_dict_mapping[so_file] + + def CUDART_CHECK(self, result: cudaError_t) -> None: + if result != 0: + error_str = self.cudaGetErrorString(result) + raise RuntimeError(f"CUDART error: {error_str}") + + def cudaGetErrorString(self, error: cudaError_t) -> str: + return self.funcs["cudaGetErrorString"](error).decode("utf-8") + + def cudaSetDevice(self, device: int) -> None: + self.CUDART_CHECK(self.funcs["cudaSetDevice"](device)) + + def cudaDeviceSynchronize(self) -> None: + self.CUDART_CHECK(self.funcs["cudaDeviceSynchronize"]()) + + def cudaDeviceReset(self) -> None: + self.CUDART_CHECK(self.funcs["cudaDeviceReset"]()) + + def cudaMalloc(self, size: int) -> ctypes.c_void_p: + devPtr = ctypes.c_void_p() + self.CUDART_CHECK(self.funcs["cudaMalloc"](ctypes.byref(devPtr), size)) + return devPtr + + def cudaFree(self, devPtr: ctypes.c_void_p) -> None: + self.CUDART_CHECK(self.funcs["cudaFree"](devPtr)) + + def cudaMemset(self, devPtr: ctypes.c_void_p, value: int, count: int) -> None: + self.CUDART_CHECK(self.funcs["cudaMemset"](devPtr, value, count)) + + def cudaMemcpy( + self, dst: ctypes.c_void_p, src: ctypes.c_void_p, count: int + ) -> None: + cudaMemcpyDefault = 4 + kind = cudaMemcpyDefault + self.CUDART_CHECK(self.funcs["cudaMemcpy"](dst, src, count, kind)) + + def cudaIpcGetMemHandle(self, devPtr: ctypes.c_void_p) -> cudaIpcMemHandle_t: + handle = cudaIpcMemHandle_t() + self.CUDART_CHECK( + self.funcs["cudaIpcGetMemHandle"](ctypes.byref(handle), devPtr) + ) + return handle + + def cudaIpcOpenMemHandle(self, handle: cudaIpcMemHandle_t) -> ctypes.c_void_p: + cudaIpcMemLazyEnablePeerAccess = 1 + devPtr = ctypes.c_void_p() + self.CUDART_CHECK( + self.funcs["cudaIpcOpenMemHandle"]( + ctypes.byref(devPtr), handle, cudaIpcMemLazyEnablePeerAccess + ) + ) + return devPtr diff --git 
a/distributed/device_communicators/custom_all_reduce.py b/distributed/device_communicators/custom_all_reduce.py new file mode 100644 index 0000000..0259180 --- /dev/null +++ b/distributed/device_communicators/custom_all_reduce.py @@ -0,0 +1,326 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from contextlib import contextmanager +from typing import cast + +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +import vllm.envs as envs +from vllm import _custom_ops as ops +from vllm.distributed.device_communicators.all_reduce_utils import ( + CUSTOM_ALL_REDUCE_MAX_SIZES, + gpu_p2p_access_check, +) +from vllm.distributed.parallel_state import in_the_same_node_as +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils.torch_utils import cuda_device_count_stateless + +try: + ops.meta_size() + custom_ar = True +except Exception: + # For CPUs + custom_ar = False + +logger = init_logger(__name__) + + +def _can_p2p(rank: int, world_size: int) -> bool: + for i in range(world_size): + if i == rank: + continue + if envs.VLLM_SKIP_P2P_CHECK: + logger.debug("Skipping P2P check and trusting the driver's P2P report.") + return torch.cuda.can_device_access_peer(rank, i) + if not gpu_p2p_access_check(rank, i): + return False + return True + + +def is_weak_contiguous(inp: torch.Tensor): + return inp.is_contiguous() or ( + inp.storage().nbytes() - inp.storage_offset() * inp.element_size() + == inp.numel() * inp.element_size() + ) + + +class CustomAllreduce: + _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8] + + # max_size: max supported allreduce size + def __init__( + self, + group: ProcessGroup, + device: int | str | torch.device, + max_size=8192 * 1024, + symm_mem_enabled=False, + ) -> None: + """ + Args: + group: the process group to work on. If None, it will use the + default process group. + device: the device to bind the CustomAllreduce to. 
If None, + it will be bound to f"cuda:{local_rank}". + It is the caller's responsibility to make sure each communicator + is bind to a unique device, and all communicators in this group + are in the same node. + """ + self._IS_CAPTURING = False + self.disabled = True + + if not custom_ar: + # disable because of missing custom allreduce library + # e.g. in a non-GPU environment + logger.info( + "Custom allreduce is disabled because " + "of missing custom allreduce library" + ) + return + + self.group = group + + assert dist.get_backend(group) != dist.Backend.NCCL, ( + "CustomAllreduce should be attached to a non-NCCL group." + ) + + if not all(in_the_same_node_as(group, source_rank=0)): + # No need to initialize custom allreduce for multi-node case. + logger.warning( + "Custom allreduce is disabled because this process group" + " spans across nodes." + ) + return + + rank = dist.get_rank(group=self.group) + self.rank = rank + world_size = dist.get_world_size(group=self.group) + if world_size == 1: + # No need to initialize custom allreduce for single GPU case. + return + + if world_size not in CustomAllreduce._SUPPORTED_WORLD_SIZES: + logger.warning( + "Custom allreduce is disabled due to an unsupported world" + " size: %d. Supported world sizes: %s. 
To silence this " + "warning, specify disable_custom_all_reduce=True explicitly.", + world_size, + str(CustomAllreduce._SUPPORTED_WORLD_SIZES), + ) + return + + if isinstance(device, int): + device = torch.device(f"cuda:{device}") + elif isinstance(device, str): + device = torch.device(device) + # now `device` is a `torch.device` object + assert isinstance(device, torch.device) + self.device = device + device_capability = current_platform.get_device_capability() + if ( + current_platform.is_cuda() + and symm_mem_enabled + and device_capability is not None + ): + device_capability_str = device_capability.as_version_str() + if device_capability_str in CUSTOM_ALL_REDUCE_MAX_SIZES: + max_size = min( + CUSTOM_ALL_REDUCE_MAX_SIZES[device_capability_str][world_size], + max_size, + ) + cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES + if cuda_visible_devices: + device_ids = list(map(int, cuda_visible_devices.split(","))) + else: + device_ids = list(range(cuda_device_count_stateless())) + + physical_device_id = device_ids[device.index] + tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu") + gather_list = [ + torch.tensor([0], dtype=torch.int, device="cpu") for _ in range(world_size) + ] + dist.all_gather(gather_list, tensor, group=self.group) + physical_device_ids = [t.item() for t in gather_list] + + # test nvlink first, this will filter out most of the cases + # where custom allreduce is not supported + # this checks hardware and driver support for NVLink + assert current_platform.is_cuda_alike() + fully_connected = current_platform.is_fully_connected(physical_device_ids) + if world_size > 2 and not fully_connected: + logger.warning( + "Custom allreduce is disabled because it's not supported on" + " more than two PCIe-only GPUs. To silence this warning, " + "specify disable_custom_all_reduce=True explicitly." 
+ ) + return + # test P2P capability, this checks software/cudaruntime support + # this is expensive to compute at the first time + # then we cache the result + # On AMD GPU, p2p is always enabled between XGMI connected GPUs + if not current_platform.is_rocm() and not _can_p2p(rank, world_size): + logger.warning( + "Custom allreduce is disabled because your platform lacks " + "GPU P2P capability or P2P test failed. To silence this " + "warning, specify disable_custom_all_reduce=True explicitly." + ) + return + + self.disabled = False + # Buffers memory are owned by this Python class and passed to C++. + # Metadata composes of two parts: metadata for synchronization and a + # temporary buffer for storing intermediate allreduce results. + self.meta_ptrs = self.create_shared_buffer( + ops.meta_size() + max_size, group=group, uncached=True + ) + # This is a pre-registered IPC buffer. In eager mode, input tensors + # are first copied into this buffer before allreduce is performed + self.buffer_ptrs = self.create_shared_buffer(max_size, group=group) + # This is a buffer for storing the tuples of pointers pointing to + # IPC buffers from all ranks. Each registered tuple has size of + # 8*world_size bytes where world_size is at most 8. Allocating 8MB + # is enough for 131072 such tuples. The largest model I've seen only + # needs less than 10000 of registered tuples. + self.rank_data = torch.empty( + 8 * 1024 * 1024, dtype=torch.uint8, device=self.device + ) + self.max_size = max_size + self.rank = rank + self.world_size = world_size + self.fully_connected = fully_connected + self._ptr = ops.init_custom_ar( + self.meta_ptrs, self.rank_data, rank, self.fully_connected + ) + ops.register_buffer(self._ptr, self.buffer_ptrs) + + @contextmanager + def capture(self): + """ + The main responsibility of this context manager is the + `register_graph_buffers` call at the end of the context. + It records all the buffer addresses used in the CUDA graph. 
+ """ + try: + self._IS_CAPTURING = True + yield + finally: + self._IS_CAPTURING = False + if not self.disabled: + self.register_graph_buffers() + + def register_graph_buffers(self): + handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr) + logger.info("Registering %d cuda graph addresses", len(offset)) + # We cannot directly use `dist.all_gather_object` here + # because it is incompatible with `gloo` backend under inference mode. + # see https://github.com/pytorch/pytorch/issues/126032 for details. + all_data: list[list[list[int] | None]] + all_data = [[None, None] for _ in range(dist.get_world_size(group=self.group))] + all_data[self.rank] = [handle, offset] + ranks = sorted(dist.get_process_group_ranks(group=self.group)) + for i, rank in enumerate(ranks): + dist.broadcast_object_list( + all_data[i], src=rank, group=self.group, device="cpu" + ) + # Unpack list of tuples to tuple of lists. + handles = cast(list[list[int]], [d[0] for d in all_data]) + offsets = cast(list[list[int]], [d[1] for d in all_data]) + ops.register_graph_buffers(self._ptr, handles, offsets) + + def should_custom_ar(self, inp: torch.Tensor): + if self.disabled: + return False + inp_size = inp.numel() * inp.element_size() + # custom allreduce requires input byte size to be multiples of 16 + if inp_size % 16 != 0: + return False + if not is_weak_contiguous(inp): + return False + # for 4 or more non NVLink-capable GPUs, custom allreduce provides + # little performance improvement over NCCL. + if self.world_size == 2 or self.fully_connected: + return inp_size < self.max_size + return False + + def all_reduce( + self, inp: torch.Tensor, *, out: torch.Tensor = None, registered: bool = False + ): + """Performs an out-of-place all reduce. + + If registered is True, this assumes inp's pointer is already + IPC-registered. Otherwise, inp is first copied into a pre-registered + buffer. 
+ """ + if out is None: + out = torch.empty_like(inp) + if registered: + ops.all_reduce(self._ptr, inp, out, 0, 0) + else: + ops.all_reduce( + self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size + ) + return out + + def custom_all_reduce(self, input: torch.Tensor) -> torch.Tensor | None: + """The main allreduce API that provides support for cuda graph.""" + # When custom allreduce is disabled, this will be None. + if self.disabled or not self.should_custom_ar(input): + return None + if self._IS_CAPTURING: + if torch.cuda.is_current_stream_capturing(): + return self.all_reduce(input, registered=True) + else: + # If warm up, mimic the allocation pattern since custom + # allreduce is out-of-place. + return torch.empty_like(input) + else: + # Note: outside of cuda graph context, custom allreduce incurs a + # cost of cudaMemcpy, which should be small (<=1% of overall + # latency) compared to the performance gain of using custom kernels + return self.all_reduce(input, registered=False) + + def close(self): + if not self.disabled and self._ptr: + if ops is not None: + ops.dispose(self._ptr) + self._ptr = 0 + self.free_shared_buffer(self.meta_ptrs, rank=self.rank) + self.free_shared_buffer(self.buffer_ptrs, rank=self.rank) + + def __del__(self): + self.close() + + @staticmethod + def create_shared_buffer( + size_in_bytes: int, + group: ProcessGroup | None = None, + uncached: bool | None = False, + ) -> list[int]: + pointer, handle = ops.allocate_shared_buffer_and_handle(size_in_bytes) + + world_size = dist.get_world_size(group=group) + rank = dist.get_rank(group=group) + handles = [None] * world_size + dist.all_gather_object(handles, handle, group=group) + + pointers: list[int] = [] + for i, h in enumerate(handles): + if i == rank: + pointers.append(pointer) # type: ignore + else: + pointers.append(ops.open_mem_handle(h)) + return pointers + + @staticmethod + def free_shared_buffer( + pointers: list[int], + group: ProcessGroup | None = None, + rank: int | None 
= None, + ) -> None: + if rank is None: + rank = dist.get_rank(group=group) + if ops is not None: + ops.free_shared_buffer(pointers[rank]) diff --git a/distributed/device_communicators/mnnvl_compat.py b/distributed/device_communicators/mnnvl_compat.py new file mode 100644 index 0000000..61aee2d --- /dev/null +++ b/distributed/device_communicators/mnnvl_compat.py @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import torch.distributed as dist +from flashinfer.comm.mnnvl import CommBackend as CommBackend + +from vllm.utils.flashinfer import has_flashinfer_all2all + +assert has_flashinfer_all2all(), "Flashinfer alltoallv module cannot be found" + + +class CustomCommunicator(CommBackend): + def __init__(self, group): + self._group = group + + def Get_rank(self) -> int: + return self._group.rank() + + def Get_size(self) -> int: + return self._group.size() + + def allgather(self, data: int): + gathered = [None] * self.Get_size() + dist.all_gather_object(gathered, data, group=self._group) + return gathered + + def Split(self, color: int, key: int) -> "CustomCommunicator": + return self diff --git a/distributed/device_communicators/pynccl.py b/distributed/device_communicators/pynccl.py new file mode 100644 index 0000000..2fc35e8 --- /dev/null +++ b/distributed/device_communicators/pynccl.py @@ -0,0 +1,386 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +# ===================== import region ===================== +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup, ReduceOp + +import vllm.envs as envs +from vllm.distributed.device_communicators.pynccl_wrapper import ( + NCCLLibrary, + buffer_type, + cudaStream_t, + ncclComm_t, + ncclDataTypeEnum, + ncclRedOpTypeEnum, + ncclUniqueId, +) +from vllm.distributed.utils import StatelessProcessGroup +from vllm.logger import init_logger +from 
vllm.utils.torch_utils import current_stream + +logger = init_logger(__name__) + +_NCCL_SYMM_OPS_REGISTERED = False + + +def register_nccl_symmetric_ops(pynccl_comm): + from vllm.distributed.device_communicators.pynccl_allocator import ( + nccl_symm_mem_context, + ) + from vllm.utils.torch_utils import direct_register_custom_op + + global _NCCL_SYMM_OPS_REGISTERED + if _NCCL_SYMM_OPS_REGISTERED: + return + _NCCL_SYMM_OPS_REGISTERED = True + + def all_reduce_symmetric_with_copy_impl(input_tensor: torch.Tensor) -> torch.Tensor: + with nccl_symm_mem_context(pynccl_comm): + symm_input = torch.empty_like(input_tensor) + symm_output = torch.empty_like(input_tensor) + symm_input.copy_(input_tensor) + symm_output = pynccl_comm.all_reduce(symm_input, symm_output) + return symm_output + + def all_reduce_symmetric_with_copy_fake(input_tensor: torch.Tensor) -> torch.Tensor: + return torch.empty_like(input_tensor) + + direct_register_custom_op( + op_name="all_reduce_symmetric_with_copy", + op_func=all_reduce_symmetric_with_copy_impl, + fake_impl=all_reduce_symmetric_with_copy_fake, + ) + + +class PyNcclCommunicator: + def __init__( + self, + group: ProcessGroup | StatelessProcessGroup, + device: int | str | torch.device, + library_path: str | None = None, + ): + """ + Args: + group: the process group to work on. If None, it will use the + default process group. + device: the device to bind the PyNcclCommunicator to. If None, + it will be bound to f"cuda:{local_rank}". + library_path: the path to the NCCL library. If None, it will + use the default library path. + It is the caller's responsibility to make sure each communicator + is bind to a unique device. + """ + if not isinstance(group, StatelessProcessGroup): + assert dist.is_initialized() + assert dist.get_backend(group) != dist.Backend.NCCL, ( + "PyNcclCommunicator should be attached to a non-NCCL group." 
+ ) + # note: this rank is the rank in the group + self.rank = dist.get_rank(group) + self.world_size = dist.get_world_size(group) + else: + self.rank = group.rank + self.world_size = group.world_size + + self.group = group + + # if world_size == 1, no need to create communicator + if self.world_size == 1 or envs.VLLM_DISABLE_PYNCCL: + self.available = False + self.disabled = True + return + try: + self.nccl = NCCLLibrary(library_path) + except Exception: + # disable because of missing NCCL library + # e.g. in a non-GPU environment + self.available = False + self.disabled = True + return + + self.available = True + self.disabled = False + + self.nccl_version = self.nccl.ncclGetRawVersion() + if self.rank == 0: + # get the unique id from NCCL + self.unique_id = self.nccl.ncclGetUniqueId() + logger.info_once( + "vLLM is using nccl==%s", self.nccl.ncclGetVersion(), scope="local" + ) + else: + # construct an empty unique id + self.unique_id = ncclUniqueId() + + if not isinstance(group, StatelessProcessGroup): + tensor = torch.ByteTensor(list(self.unique_id.internal)) + ranks = dist.get_process_group_ranks(group) + # arg `src` in `broadcast` is the global rank + dist.broadcast(tensor, src=ranks[0], group=group) + byte_list = tensor.tolist() + for i, byte in enumerate(byte_list): + self.unique_id.internal[i] = byte + else: + self.unique_id = group.broadcast_obj(self.unique_id, src=0) + if isinstance(device, int): + device = torch.device(f"cuda:{device}") + elif isinstance(device, str): + device = torch.device(device) + # now `device` is a `torch.device` object + assert isinstance(device, torch.device) + self.device = device + # nccl communicator and stream will use this device + # `torch.cuda.device` is a context manager that changes the + # current cuda device to the specified one + with torch.cuda.device(device): + self.comm: ncclComm_t = self.nccl.ncclCommInitRank( + self.world_size, self.unique_id, self.rank + ) + + stream = current_stream() + # A small all_reduce 
for warmup. + data = torch.zeros(1, device=device) + self.all_reduce(data) + stream.synchronize() + del data + + def all_reduce( + self, + in_tensor: torch.Tensor, + out_tensor: torch.Tensor = None, + op: ReduceOp = ReduceOp.SUM, + stream=None, + ) -> torch.Tensor: + if self.disabled: + return None + # nccl communicator created on a specific device + # will only work on tensors on the same device + # otherwise it will cause "illegal memory access" + assert in_tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {in_tensor.device}" + ) + + if out_tensor is None: + out_tensor = torch.empty_like(in_tensor) + + if stream is None: + stream = current_stream() + self.nccl.ncclAllReduce( + buffer_type(in_tensor.data_ptr()), + buffer_type(out_tensor.data_ptr()), + in_tensor.numel(), + ncclDataTypeEnum.from_torch(in_tensor.dtype), + ncclRedOpTypeEnum.from_torch(op), + self.comm, + cudaStream_t(stream.cuda_stream), + ) + return out_tensor + + def all_gather( + self, output_tensor: torch.Tensor, input_tensor: torch.Tensor, stream=None + ): + if self.disabled: + return + # nccl communicator created on a specific device + # will only work on tensors on the same device + # otherwise it will cause "illegal memory access" + assert input_tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {input_tensor.device}" + ) + if stream is None: + stream = current_stream() + self.nccl.ncclAllGather( + buffer_type(input_tensor.data_ptr()), + buffer_type(output_tensor.data_ptr()), + input_tensor.numel(), + ncclDataTypeEnum.from_torch(input_tensor.dtype), + self.comm, + cudaStream_t(stream.cuda_stream), + ) + + def all_gatherv( + self, + output_tensor: torch.Tensor, + input_tensor: torch.Tensor, + sizes: list[int], + stream=None, + ): + if self.disabled: + return + # nccl communicator created on a specific device + # will only work on tensors on 
the same device + # otherwise it will cause "illegal memory access" + assert input_tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {input_tensor.device}" + ) + if stream is None: + stream = current_stream() + assert output_tensor.shape[0] == sum(sizes) + split_offset = 0 + self.nccl.ncclGroupStart() + for root, split_size in enumerate(sizes): + dst_slice = output_tensor[split_offset : split_offset + split_size] + self.nccl.ncclBroadcast( + buffer_type(input_tensor.data_ptr()), + buffer_type(dst_slice.data_ptr()), + dst_slice.numel(), + ncclDataTypeEnum.from_torch(input_tensor.dtype), + root, + self.comm, + cudaStream_t(stream.cuda_stream), + ) + split_offset += split_size + self.nccl.ncclGroupEnd() + + def reduce_scatter( + self, + output_tensor: torch.Tensor, + input_tensor: torch.Tensor, + op: ReduceOp = ReduceOp.SUM, + stream=None, + ): + if self.disabled: + return + # nccl communicator created on a specific device + # will only work on tensors on the same device + # otherwise it will cause "illegal memory access" + assert input_tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {input_tensor.device}" + ) + if stream is None: + stream = current_stream() + self.nccl.ncclReduceScatter( + buffer_type(input_tensor.data_ptr()), + buffer_type(output_tensor.data_ptr()), + output_tensor.numel(), + ncclDataTypeEnum.from_torch(input_tensor.dtype), + ncclRedOpTypeEnum.from_torch(op), + self.comm, + cudaStream_t(stream.cuda_stream), + ) + + def reduce_scatterv( + self, + output_tensor: torch.Tensor, + input_tensor: torch.Tensor, + sizes: list[int], + op: ReduceOp = ReduceOp.SUM, + stream=None, + ): + if self.disabled: + return + # nccl communicator created on a specific device + # will only work on tensors on the same device + # otherwise it will cause "illegal memory access" + assert input_tensor.device == 
self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {input_tensor.device}" + ) + if stream is None: + stream = current_stream() + + split_offset = 0 + self.nccl.ncclGroupStart() + for root, split_size in enumerate(sizes): + chunk = input_tensor[split_offset : split_offset + split_size, ...] + self.nccl.ncclReduce( + buffer_type(chunk.data_ptr()), + buffer_type(output_tensor.data_ptr()), + chunk.numel(), + ncclDataTypeEnum.from_torch(input_tensor.dtype), + ncclRedOpTypeEnum.from_torch(op), + root, + self.comm, + cudaStream_t(stream.cuda_stream), + ) + split_offset += split_size + self.nccl.ncclGroupEnd() + + def send(self, tensor: torch.Tensor, dst: int, stream=None): + if self.disabled: + return + assert tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {tensor.device}" + ) + if stream is None: + stream = current_stream() + self.nccl.ncclSend( + buffer_type(tensor.data_ptr()), + tensor.numel(), + ncclDataTypeEnum.from_torch(tensor.dtype), + dst, + self.comm, + cudaStream_t(stream.cuda_stream), + ) + + def recv(self, tensor: torch.Tensor, src: int, stream=None): + if self.disabled: + return + assert tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {tensor.device}" + ) + if stream is None: + stream = current_stream() + self.nccl.ncclRecv( + buffer_type(tensor.data_ptr()), + tensor.numel(), + ncclDataTypeEnum.from_torch(tensor.dtype), + src, + self.comm, + cudaStream_t(stream.cuda_stream), + ) + + def broadcast(self, tensor: torch.Tensor, src: int, stream=None): + if self.disabled: + return + assert tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {tensor.device}" + ) + if stream is None: + stream = current_stream() + if src == self.rank: + sendbuff = 
buffer_type(tensor.data_ptr()) + # NCCL requires the sender also to have a receive buffer + recvbuff = buffer_type(tensor.data_ptr()) + else: + sendbuff = buffer_type() + recvbuff = buffer_type(tensor.data_ptr()) + self.nccl.ncclBroadcast( + sendbuff, + recvbuff, + tensor.numel(), + ncclDataTypeEnum.from_torch(tensor.dtype), + src, + self.comm, + cudaStream_t(stream.cuda_stream), + ) + + def group_start(self): + self.nccl.ncclGroupStart() + + def group_end(self): + self.nccl.ncclGroupEnd() + + def register_comm_window(self, tensor: torch.Tensor): + return self.nccl.ncclCommWindowRegister( + self.comm, + buffer_type(tensor.data_ptr()), + tensor.numel() * tensor.element_size(), + 1, + ) + + def register_comm_window_raw(self, ptr: int, size: int): + return self.nccl.ncclCommWindowRegister(self.comm, buffer_type(ptr), size, 1) + + def deregister_comm_window(self, window): + return self.nccl.ncclCommWindowDeregister(self.comm, window) diff --git a/distributed/device_communicators/pynccl_allocator.py b/distributed/device_communicators/pynccl_allocator.py new file mode 100644 index 0000000..401b800 --- /dev/null +++ b/distributed/device_communicators/pynccl_allocator.py @@ -0,0 +1,191 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import atexit +import contextlib +import tempfile +from typing import Any + +import torch +from packaging import version +from torch.cuda.memory import CUDAPluggableAllocator +from torch.utils.cpp_extension import load_inline + +from vllm import envs +from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils.nccl import find_nccl_include_paths + +logger = init_logger(__name__) + +nccl_allocator_source = """ +#include +extern "C" { + +void* nccl_alloc_plug(size_t size, int device, void* stream) { + void* ptr; + ncclResult_t err = ncclMemAlloc(&ptr, size); + return 
ptr; + +} + +void nccl_free_plug(void* ptr, size_t size, int device, void* stream) { + ncclResult_t err = ncclMemFree(ptr); +} + +} +""" + +_allocator = None +_allocator_wrapper = None +_mem_pool = None +_registered_base_addrs = set() +_graph_pool_id = None +_nccl_allocator_failed_to_compile = False +_cached_pool_snapshot = None + + +def is_symmetric_memory_enabled(): + global _nccl_allocator_failed_to_compile + return envs.VLLM_USE_NCCL_SYMM_MEM and not _nccl_allocator_failed_to_compile + + +def is_symmetric_memory_tensor(tensor: torch.Tensor): + if not is_symmetric_memory_enabled() or _cached_pool_snapshot is None: + return False + for segment in _cached_pool_snapshot: + for block in segment["blocks"]: + if block["address"] == tensor.untyped_storage().data_ptr(): + return True + return False + + +def set_graph_pool_id(graph_pool_id): + global _graph_pool_id + _graph_pool_id = graph_pool_id + + +def compile_nccl_allocator(): + global _allocator, _allocator_wrapper, _nccl_allocator_failed_to_compile + if not current_platform.is_cuda(): + _nccl_allocator_failed_to_compile = True + return + try: + out_dir = tempfile.gettempdir() + nccl_allocator_libname = "nccl_allocator" + nccl_include_paths = find_nccl_include_paths() + load_inline( + name=nccl_allocator_libname, + cpp_sources=nccl_allocator_source, + with_cuda=True, + extra_ldflags=["-lnccl"], + verbose=envs.VLLM_LOGGING_LEVEL == "DEBUG", + is_python_module=False, + build_directory=out_dir, + extra_include_paths=nccl_include_paths, + ) + _allocator_wrapper = CUDAPluggableAllocator( + f"{out_dir}/{nccl_allocator_libname}.so", + "nccl_alloc_plug", + "nccl_free_plug", + ) + _allocator = _allocator_wrapper.allocator() + except Exception as e: + _nccl_allocator_failed_to_compile = True + logger.warning( + "Failed to compile NCCL memory allocator. " + "Symmetric memory will be disabled. " + "This is expected if NCCL headers are not available. 
" + "optionally set VLLM_NCCL_INCLUDE_PATH to point to a directory " + "containing the NCCL header. " + "Error: %s", + str(e), + ) + + +def get_nccl_mem_pool(): + global _mem_pool, _nccl_allocator_failed_to_compile + if _mem_pool is None and not _nccl_allocator_failed_to_compile: + compile_nccl_allocator() + if _allocator is not None: + _mem_pool = torch.cuda.MemPool(_allocator) + return _mem_pool + + +def _cleanup_nccl_mem_pool(): + global _mem_pool + _mem_pool = None + + +def _cleanup_nccl_allocator_wrapper(): + global _allocator_wrapper + _allocator_wrapper = None + + +atexit.register(_cleanup_nccl_mem_pool) +atexit.register(_cleanup_nccl_allocator_wrapper) + + +class nccl_symm_mem_context: + def __init__( + self, + pynccl_comm: PyNcclCommunicator, + disabled: bool = False, + ): + self.disabled = ( + disabled + or not is_symmetric_memory_enabled() + or pynccl_comm.world_size == 1 + or not current_platform.is_cuda() + or get_nccl_mem_pool() is None + or version.parse(torch.__version__) < version.parse("2.8.0.a0") + ) + if self.disabled: + self.pynccl_comm: PyNcclCommunicator | None = None + self._mem_pool_ctx: contextlib.AbstractContextManager[Any] = ( + contextlib.nullcontext() + ) + self.is_graph_capture = None + self.device = None + else: + self.pynccl_comm = pynccl_comm + self._mem_pool_ctx = torch.cuda.use_mem_pool(get_nccl_mem_pool()) + self.is_graph_capture = torch.cuda.is_current_stream_capturing() + self.device = torch.cuda.current_device() + + def __enter__(self): + if self.disabled: + return self + assert self.pynccl_comm is not None, ( + "Symmetric memory requires pynccl to be initalized" + ) + assert self.pynccl_comm.nccl_version >= 22703, ( + "NCCL version 2.27.3 or higher is required for NCCL symmetric memory" + ) + if self.is_graph_capture: + assert _graph_pool_id is not None, ( + "graph_pool_id is not set under graph capture" + ) + # Pause graph memory pool to use symmetric memory with cuda graph + torch._C._cuda_endAllocateToPool(self.device, 
_graph_pool_id) + self._mem_pool_ctx.__enter__() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.disabled: + return + global _cached_pool_snapshot + global _registered_base_addrs + self._mem_pool_ctx.__exit__(exc_type, exc_val, exc_tb) + _pool = get_nccl_mem_pool() + assert _pool is not None + _cached_pool_snapshot = _pool.snapshot() + assert self.pynccl_comm is not None + for segment in _cached_pool_snapshot: + if segment["address"] not in _registered_base_addrs: + self.pynccl_comm.register_comm_window_raw( + segment["address"], segment["total_size"] + ) + _registered_base_addrs.add(segment["address"]) + if self.is_graph_capture: + torch._C._cuda_beginAllocateCurrentThreadToPool(self.device, _graph_pool_id) diff --git a/distributed/device_communicators/pynccl_wrapper.py b/distributed/device_communicators/pynccl_wrapper.py new file mode 100644 index 0000000..b2433d5 --- /dev/null +++ b/distributed/device_communicators/pynccl_wrapper.py @@ -0,0 +1,564 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# This file is a pure Python wrapper for the NCCL library. +# The main purpose is to use NCCL combined with CUDA graph. +# Before writing this script, we tried the following approach: +# 1. We tried to use `cupy`, it calls NCCL correctly, but `cupy` itself +# often gets stuck when initializing the NCCL communicator. +# 2. We tried to use `torch.distributed`, but `torch.distributed.all_reduce` +# contains many other potential cuda APIs, that are not allowed during +# capturing the CUDA graph. For further details, please check +# https://discuss.pytorch.org/t/pytorch-cudagraph-with-nccl-operation-failed/ . +# +# Another rejected idea is to write a C/C++ binding for NCCL. It is usually +# doable, but we often encounter issues related with nccl versions, and need +# to switch between different versions of NCCL. See +# https://github.com/NVIDIA/nccl/issues/1234 for more details. 
+# A C/C++ binding is not flexible enough to handle this. It requires +# recompilation of the code every time we want to switch between different +# versions. This current implementation, with a **pure** Python wrapper, is +# more flexible. We can easily switch between different versions of NCCL by +# changing the environment variable `VLLM_NCCL_SO_PATH`, or the `so_file` +# variable in the code. + +import ctypes +import platform +from dataclasses import dataclass +from typing import Any + +import torch +from torch.distributed import ReduceOp + +from vllm import envs +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils.nccl import find_nccl_library + +logger = init_logger(__name__) + +# === export types and functions from nccl to Python === +# for the original nccl definition, please check +# https://github.com/NVIDIA/nccl/blob/master/src/nccl.h.in + +ncclResult_t = ctypes.c_int +ncclComm_t = ctypes.c_void_p +ncclWindow_t = ctypes.c_void_p + + +class ncclUniqueId(ctypes.Structure): + _fields_ = [("internal", ctypes.c_byte * 128)] + + +cudaStream_t = ctypes.c_void_p +buffer_type = ctypes.c_void_p + +ncclDataType_t = ctypes.c_int + + +class ncclDataTypeEnum: + ncclInt8 = 0 + ncclChar = 0 + ncclUint8 = 1 + ncclInt32 = 2 + ncclInt = 2 + ncclUint32 = 3 + ncclInt64 = 4 + ncclUint64 = 5 + ncclFloat16 = 6 + ncclHalf = 6 + ncclFloat32 = 7 + ncclFloat = 7 + ncclFloat64 = 8 + ncclDouble = 8 + ncclBfloat16 = 9 + ncclNumTypes = 10 + + @classmethod + def from_torch(cls, dtype: torch.dtype) -> int: + if dtype == torch.int8: + return cls.ncclInt8 + if dtype == torch.uint8: + return cls.ncclUint8 + if dtype == torch.int32: + return cls.ncclInt32 + if dtype == torch.int64: + return cls.ncclInt64 + if dtype == torch.float16: + return cls.ncclFloat16 + if dtype == torch.float32: + return cls.ncclFloat32 + if dtype == torch.float64: + return cls.ncclFloat64 + if dtype == torch.bfloat16: + return cls.ncclBfloat16 + raise 
ValueError(f"Unsupported dtype: {dtype}") + + +ncclRedOp_t = ctypes.c_int + + +class ncclRedOpTypeEnum: + ncclSum = 0 + ncclProd = 1 + ncclMax = 2 + ncclMin = 3 + ncclAvg = 4 + ncclNumOps = 5 + + @classmethod + def from_torch(cls, op: ReduceOp) -> int: + if op == ReduceOp.SUM: + return cls.ncclSum + if op == ReduceOp.PRODUCT: + return cls.ncclProd + if op == ReduceOp.MAX: + return cls.ncclMax + if op == ReduceOp.MIN: + return cls.ncclMin + if op == ReduceOp.AVG: + return cls.ncclAvg + raise ValueError(f"Unsupported op: {op}") + + +@dataclass +class Function: + name: str + restype: Any + argtypes: list[Any] + + +class NCCLLibrary: + exported_functions = [ + # const char* ncclGetErrorString(ncclResult_t result) + Function("ncclGetErrorString", ctypes.c_char_p, [ncclResult_t]), + # ncclResult_t ncclGetVersion(int *version); + Function("ncclGetVersion", ncclResult_t, [ctypes.POINTER(ctypes.c_int)]), + # ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); + Function("ncclGetUniqueId", ncclResult_t, [ctypes.POINTER(ncclUniqueId)]), + # ncclResult_t ncclCommInitRank( + # ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); + # note that ncclComm_t is a pointer type, so the first argument + # is a pointer to a pointer + Function( + "ncclCommInitRank", + ncclResult_t, + [ctypes.POINTER(ncclComm_t), ctypes.c_int, ncclUniqueId, ctypes.c_int], + ), + # ncclResult_t ncclAllReduce( + # const void* sendbuff, void* recvbuff, size_t count, + # ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + # cudaStream_t stream); + # note that cudaStream_t is a pointer type, so the last argument + # is a pointer + Function( + "ncclAllReduce", + ncclResult_t, + [ + buffer_type, + buffer_type, + ctypes.c_size_t, + ncclDataType_t, + ncclRedOp_t, + ncclComm_t, + cudaStream_t, + ], + ), + # ncclResult_t ncclReduce( + # const void* sendbuff, void* recvbuff, size_t count, + # ncclDataType_t datatype, ncclRedOp_t op, int root, + # ncclComm_t comm, cudaStream_t stream); + # note that 
cudaStream_t is a pointer type, so the last argument + # is a pointer + Function( + "ncclReduce", + ncclResult_t, + [ + buffer_type, + buffer_type, + ctypes.c_size_t, + ncclDataType_t, + ncclRedOp_t, + ctypes.c_int, + ncclComm_t, + cudaStream_t, + ], + ), + # ncclResult_t ncclAllGather( + # const void* sendbuff, void* recvbuff, size_t count, + # ncclDataType_t datatype, ncclComm_t comm, + # cudaStream_t stream); + # note that cudaStream_t is a pointer type, so the last argument + # is a pointer + Function( + "ncclAllGather", + ncclResult_t, + [ + buffer_type, + buffer_type, + ctypes.c_size_t, + ncclDataType_t, + ncclComm_t, + cudaStream_t, + ], + ), + # ncclResult_t ncclReduceScatter( + # const void* sendbuff, void* recvbuff, size_t count, + # ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + # cudaStream_t stream); + # note that cudaStream_t is a pointer type, so the last argument + # is a pointer + Function( + "ncclReduceScatter", + ncclResult_t, + [ + buffer_type, + buffer_type, + ctypes.c_size_t, + ncclDataType_t, + ncclRedOp_t, + ncclComm_t, + cudaStream_t, + ], + ), + # ncclResult_t ncclSend( + # const void* sendbuff, size_t count, ncclDataType_t datatype, + # int dest, ncclComm_t comm, cudaStream_t stream); + Function( + "ncclSend", + ncclResult_t, + [ + buffer_type, + ctypes.c_size_t, + ncclDataType_t, + ctypes.c_int, + ncclComm_t, + cudaStream_t, + ], + ), + # ncclResult_t ncclRecv( + # void* recvbuff, size_t count, ncclDataType_t datatype, + # int src, ncclComm_t comm, cudaStream_t stream); + Function( + "ncclRecv", + ncclResult_t, + [ + buffer_type, + ctypes.c_size_t, + ncclDataType_t, + ctypes.c_int, + ncclComm_t, + cudaStream_t, + ], + ), + # ncclResult_t ncclBroadcast( + # const void* sendbuff, void* recvbuff, size_t count, + # ncclDataType_t datatype, int root, ncclComm_t comm, + # cudaStream_t stream); + Function( + "ncclBroadcast", + ncclResult_t, + [ + buffer_type, + buffer_type, + ctypes.c_size_t, + ncclDataType_t, + ctypes.c_int, + 
ncclComm_t, + cudaStream_t, + ], + ), + # be cautious! this is a collective call, it will block until all + # processes in the communicator have called this function. + # because Python object destruction can happen in random order, + # it is better not to call it at all. + # ncclResult_t ncclCommDestroy(ncclComm_t comm); + Function("ncclCommDestroy", ncclResult_t, [ncclComm_t]), + # ncclResult_t ncclGroupStart(); + Function("ncclGroupStart", ncclResult_t, []), + # ncclResult_t ncclGroupEnd(); + Function("ncclGroupEnd", ncclResult_t, []), + # ncclResult_t ncclCommWindowRegister( + # ncclComm_t comm, void* buff, size_t size, + # ncclWindow_t* win, int winFlags); + Function( + "ncclCommWindowRegister", + ncclResult_t, + [ + ncclComm_t, + buffer_type, + ctypes.c_size_t, + ctypes.POINTER(ncclWindow_t), + ctypes.c_int, + ], + ), + # ncclResult_t ncclCommWindowDeregister( + # ncclComm_t comm, ncclWindow_t win); + Function("ncclCommWindowDeregister", ncclResult_t, [ncclComm_t, ncclWindow_t]), + ] + + # class attribute to store the mapping from the path to the library + # to avoid loading the same library multiple times + path_to_library_cache: dict[str, Any] = {} + + # class attribute to store the mapping from library path + # to the corresponding dictionary + path_to_dict_mapping: dict[str, dict[str, Any]] = {} + + def __init__(self, so_file: str | None = None): + so_file = so_file or find_nccl_library() + + try: + if so_file not in NCCLLibrary.path_to_dict_mapping: + lib = ctypes.CDLL(so_file) + NCCLLibrary.path_to_library_cache[so_file] = lib + self.lib = NCCLLibrary.path_to_library_cache[so_file] + except Exception as e: + logger.error( + "Failed to load NCCL library from %s. " + "It is expected if you are not running on NVIDIA/AMD GPUs." + "Otherwise, the nccl library might not exist, be corrupted " + "or it does not support the current platform %s. 
" + "If you already have the library, please set the " + "environment variable VLLM_NCCL_SO_PATH" + " to point to the correct nccl library path.", + so_file, + platform.platform(), + ) + raise e + + if so_file not in NCCLLibrary.path_to_dict_mapping: + _funcs: dict[str, Any] = {} + for func in NCCLLibrary.exported_functions: + try: + f = getattr(self.lib, func.name) + f.restype = func.restype + f.argtypes = func.argtypes + _funcs[func.name] = f + except AttributeError: + if func.name in [ + "ncclCommWindowRegister", + "ncclCommWindowDeregister", + ]: + if envs.VLLM_USE_NCCL_SYMM_MEM: + logger.warning_once( + "The symbol %s is not found in the NCCL " + "library %s. To enable VLLM_USE_NCCL_SYMM_MEM " + " please update your NCCL version to >= " + "2.27.03.", + func.name, + so_file, + ) + if current_platform.is_rocm(): + # Having an exception here on ROCm platform is + # not allowed during graph capturing + continue + raise + NCCLLibrary.path_to_dict_mapping[so_file] = _funcs + self._funcs = NCCLLibrary.path_to_dict_mapping[so_file] + + def ncclGetErrorString(self, result: ncclResult_t) -> str: + return self._funcs["ncclGetErrorString"](result).decode("utf-8") + + def NCCL_CHECK(self, result: ncclResult_t) -> None: + if result != 0: + error_str = self.ncclGetErrorString(result) + raise RuntimeError(f"NCCL error: {error_str}") + + def ncclGetRawVersion(self) -> int: + version = ctypes.c_int() + self.NCCL_CHECK(self._funcs["ncclGetVersion"](ctypes.byref(version))) + # something like 21903 + return version.value + + def ncclGetVersion(self) -> str: + version_str = str(self.ncclGetRawVersion()) + # something like 21903 --> "2.19.3" + major = version_str[0].lstrip("0") + minor = version_str[1:3].lstrip("0") + patch = version_str[3:].lstrip("0") + return f"{major}.{minor}.{patch}" + + def ncclGetUniqueId(self) -> ncclUniqueId: + unique_id = ncclUniqueId() + self.NCCL_CHECK(self._funcs["ncclGetUniqueId"](ctypes.byref(unique_id))) + return unique_id + + def 
unique_id_from_bytes(self, data: bytes) -> ncclUniqueId: + if len(data) != 128: + raise ValueError( + f"Expected 128 bytes for ncclUniqueId, got {len(data)} bytes" + ) + unique_id = ncclUniqueId() + ctypes.memmove(ctypes.addressof(unique_id.internal), data, 128) + return unique_id + + def ncclCommInitRank( + self, world_size: int, unique_id: ncclUniqueId, rank: int + ) -> ncclComm_t: + comm = ncclComm_t() + self.NCCL_CHECK( + self._funcs["ncclCommInitRank"]( + ctypes.byref(comm), world_size, unique_id, rank + ) + ) + return comm + + def ncclAllReduce( + self, + sendbuff: buffer_type, + recvbuff: buffer_type, + count: int, + datatype: int, + op: int, + comm: ncclComm_t, + stream: cudaStream_t, + ) -> None: + # `datatype` actually should be `ncclDataType_t` + # and `op` should be `ncclRedOp_t` + # both are aliases of `ctypes.c_int` + # when we pass int to a function, it will be converted to `ctypes.c_int` + # by ctypes automatically + self.NCCL_CHECK( + self._funcs["ncclAllReduce"]( + sendbuff, recvbuff, count, datatype, op, comm, stream + ) + ) + + def ncclReduce( + self, + sendbuff: buffer_type, + recvbuff: buffer_type, + count: int, + datatype: int, + op: int, + root: int, + comm: ncclComm_t, + stream: cudaStream_t, + ) -> None: + # `datatype` actually should be `ncclDataType_t` + # and `op` should be `ncclRedOp_t` + # both are aliases of `ctypes.c_int` + # when we pass int to a function, it will be converted to `ctypes.c_int` + # by ctypes automatically + self.NCCL_CHECK( + self._funcs["ncclReduce"]( + sendbuff, recvbuff, count, datatype, op, root, comm, stream + ) + ) + + def ncclReduceScatter( + self, + sendbuff: buffer_type, + recvbuff: buffer_type, + count: int, + datatype: int, + op: int, + comm: ncclComm_t, + stream: cudaStream_t, + ) -> None: + # `datatype` actually should be `ncclDataType_t` + # and `op` should be `ncclRedOp_t` + # both are aliases of `ctypes.c_int` + # when we pass int to a function, it will be converted to `ctypes.c_int` + # by ctypes 
automatically + self.NCCL_CHECK( + self._funcs["ncclReduceScatter"]( + sendbuff, recvbuff, count, datatype, op, comm, stream + ) + ) + + def ncclAllGather( + self, + sendbuff: buffer_type, + recvbuff: buffer_type, + count: int, + datatype: int, + comm: ncclComm_t, + stream: cudaStream_t, + ) -> None: + # `datatype` actually should be `ncclDataType_t` + # which is an aliases of `ctypes.c_int` + # when we pass int to a function, it will be converted to `ctypes.c_int` + # by ctypes automatically + self.NCCL_CHECK( + self._funcs["ncclAllGather"]( + sendbuff, recvbuff, count, datatype, comm, stream + ) + ) + + def ncclSend( + self, + sendbuff: buffer_type, + count: int, + datatype: int, + dest: int, + comm: ncclComm_t, + stream: cudaStream_t, + ) -> None: + self.NCCL_CHECK( + self._funcs["ncclSend"](sendbuff, count, datatype, dest, comm, stream) + ) + + def ncclRecv( + self, + recvbuff: buffer_type, + count: int, + datatype: int, + src: int, + comm: ncclComm_t, + stream: cudaStream_t, + ) -> None: + self.NCCL_CHECK( + self._funcs["ncclRecv"](recvbuff, count, datatype, src, comm, stream) + ) + + def ncclBroadcast( + self, + sendbuff: buffer_type, + recvbuff: buffer_type, + count: int, + datatype: int, + root: int, + comm: ncclComm_t, + stream: cudaStream_t, + ) -> None: + self.NCCL_CHECK( + self._funcs["ncclBroadcast"]( + sendbuff, recvbuff, count, datatype, root, comm, stream + ) + ) + + def ncclCommDestroy(self, comm: ncclComm_t) -> None: + self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm)) + + def ncclGroupStart(self) -> None: + self.NCCL_CHECK(self._funcs["ncclGroupStart"]()) + + def ncclGroupEnd(self) -> None: + self.NCCL_CHECK(self._funcs["ncclGroupEnd"]()) + + def ncclCommWindowRegister( + self, comm: ncclComm_t, buff: buffer_type, size: int, win_flags: int + ) -> ncclWindow_t: + window = ncclWindow_t() + self.NCCL_CHECK( + self._funcs["ncclCommWindowRegister"]( + comm, buff, size, ctypes.byref(window), win_flags + ) + ) + return window + + def 
ncclCommWindowDeregister(self, comm: ncclComm_t, window: ncclWindow_t) -> None: + self.NCCL_CHECK(self._funcs["ncclCommWindowDeregister"](comm, window)) + + +__all__ = [ + "NCCLLibrary", + "ncclDataTypeEnum", + "ncclRedOpTypeEnum", + "ncclUniqueId", + "ncclComm_t", + "cudaStream_t", + "buffer_type", +] diff --git a/distributed/device_communicators/quick_all_reduce.py b/distributed/device_communicators/quick_all_reduce.py new file mode 100644 index 0000000..9c77658 --- /dev/null +++ b/distributed/device_communicators/quick_all_reduce.py @@ -0,0 +1,290 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from enum import Enum + +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +import vllm.envs as envs +from vllm import _custom_ops as ops +from vllm.config import get_current_vllm_config +from vllm.distributed.parallel_state import in_the_same_node_as +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils.torch_utils import cuda_device_count_stateless + +logger = init_logger(__name__) + +try: + ops.qr_max_size() + quick_ar = True +except Exception: + # For CPUs and CUDA + quick_ar = False + + +def is_weak_contiguous(inp: torch.Tensor): + return inp.is_contiguous() or ( + inp.storage().nbytes() - inp.storage_offset() * inp.element_size() + == inp.numel() * inp.element_size() + ) + + +class QuickReduceRegime(Enum): + FP = 0 + INT8 = 1 + INT6 = 2 + INT4 = 3 + NONE = 4 + + +MB = 1024 * 1024 + + +class QuickAllReduce: + _SUPPORTED_WORLD_SIZES = [2, 4, 8] + _SUPPORTED_DTYPES = [torch.float16, torch.bfloat16] + # The following data is based on kernel tests. + # In this order [FP, INT8, INT6, INT4]. 
+ _QR_MIN_SIZE = { + (torch.float16, 2): [1 * MB, 2 * MB, 2 * MB, 1 * MB], + (torch.float16, 4): [1 * MB, 16 * MB, 4 * MB, 2 * MB], + (torch.float16, 8): [16 * MB, 4 * MB, 4 * MB, 2 * MB], + (torch.bfloat16, 2): [2 * MB, 8 * MB, 8 * MB, 8 * MB], + (torch.bfloat16, 4): [8 * MB, 64 * MB, 64 * MB, 16 * MB], + (torch.bfloat16, 8): [16 * MB, 2048 * MB, 2048 * MB, 2048 * MB], + } + + def __init__(self, group: ProcessGroup, device: int | str | torch.device) -> None: + """ + Custom allreduce provides non-destructive acceleration and is + available for CUDA and ROCm MI300 series. + + Custom quick allreduce leverages quantization for further + acceleration on ROCm. It currently supports Q8, Q6, and Q4 + quantization formats and FP(float16, bfloat16). + + Quick allreduce is designed as a complement to custom allreduce. + Its initialization requires even stricter conditions. + + Only the ROCm MI300 series is supported for quick allreduce at + this time. + + Args: + group: the process group to work on. If None, it will use the + default process group. + device: the device to bind the CustomAllreduce to. If None, + it will be bound to f"cuda:{local_rank}". + It is the caller's responsibility to make sure each communicator + is bind to a unique device, and all communicators in this group + are in the same node. + """ + self.disabled = True + if not self._rocm_arch_available(): + logger.debug( + "Custom quick allreduce is only supported on ROCm MI300 series." + ) + return + + if not quick_ar: + # disable because of missing quick reduce library + # e.g. in a cuda environment + logger.info( + "Custom quick allreduce is disabled because " + "of missing custom quick allreduce library" + ) + return + + self.group = group + assert dist.get_backend(group) != dist.Backend.NCCL, ( + "Custom quick allreduce should be attached to a non-NCCL group." + ) + if not all(in_the_same_node_as(group, source_rank=0)): + # No need to initialize custom quick allreduce for + # multi-node case. 
+ logger.warning( + "Custom quick allreduce is disabled because this " + "process group spans across nodes." + ) + return + rank = dist.get_rank(group=self.group) + world_size = dist.get_world_size(group=self.group) + self.rank = rank + self.world_size = world_size + if world_size == 1: + # No need to initialize QuickReduce for single GPU case. + return + + if world_size not in QuickAllReduce._SUPPORTED_WORLD_SIZES: + logger.warning( + "Custom quick allreduce is disabled due to an " + "unsupported world size: %d. Supported world sizes: %s.", + world_size, + str(QuickAllReduce._SUPPORTED_WORLD_SIZES), + ) + return + + if isinstance(device, int): + device = torch.device(f"cuda:{device}") + elif isinstance(device, str): + device = torch.device(device) + assert isinstance(device, torch.device) + self.device = device + + cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES + if cuda_visible_devices: + device_ids = list(map(int, cuda_visible_devices.split(","))) + else: + device_ids = list(range(cuda_device_count_stateless())) + physical_device_id = device_ids[device.index] + tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu") + gather_list = [ + torch.tensor([0], dtype=torch.int, device="cpu") + for _ in range(self.world_size) + ] + dist.all_gather(gather_list, tensor, group=self.group) + physical_device_ids = [t.item() for t in gather_list] + + # test nvlink first, this will filter out most of the cases + # where custom quick allreduce is not supported + # this checks hardware and driver support for NVLink + assert current_platform.is_cuda_alike() + self.fully_connected = current_platform.is_fully_connected(physical_device_ids) + if self.world_size > 2 and not self.fully_connected: + logger.debug( + "Custom quick allreduce is disabled because it's not supported " + "on more than two PCIe-only GPUs. 
def init_quick_all_reduce(self):
    """Read the quick-allreduce env settings and bring up the native state.

    Returns early, without touching ``self.disabled``, when the configured
    quantization level is unknown or ``"NONE"``, or when the model dtype
    found on the current vLLM config is not one of ``_SUPPORTED_DTYPES``.
    Otherwise it creates the native handle with ``ops.init_custom_qr``,
    exchanges IPC handles through ``create_shared_buffer`` and only then
    sets ``self.disabled = False``.
    """
    # On ROCm, bfloat16 kernels are slower than fp16
    # due to slower math operations.
    # If the environment variable is set to 1, we convert input to fp16.
    self.use_fp16_kernels = envs.VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16
    regime_str = envs.VLLM_ROCM_QUICK_REDUCE_QUANTIZATION
    if regime_str not in QuickReduceRegime.__members__:
        # Values go in as lazy %-args. Passing a pre-built string as an
        # extra positional argument to a message without placeholders
        # makes the logging module report a formatting error instead of
        # emitting the text.
        logger.warning(
            "Custom quick allreduce: Invalid quantization level: %s. "
            "Supported levels: %s",
            regime_str,
            list(QuickReduceRegime.__members__.keys()),
        )
        return

    if regime_str == "NONE":
        logger.debug(
            "Custom quick allreduce is disabled based "
            "on env variable "
            "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION='NONE'"
        )
        return
    self.qr_quant_level = QuickReduceRegime[regime_str]
    vllm_config = get_current_vllm_config()
    if (
        vllm_config is not None
        and hasattr(vllm_config, "model_config")
        and hasattr(vllm_config.model_config, "dtype")
    ):
        # The dtype checks live inside this branch: ``dtype`` is only
        # known when a model config is available.
        dtype = vllm_config.model_config.dtype
        if dtype not in self._SUPPORTED_DTYPES:
            logger.debug(
                "Custom quick allreduce disabled: only supports "
                "float16 and bfloat16, but get %s.",
                dtype,
            )
            return

        if dtype == torch.bfloat16 and self.use_fp16_kernels:
            logger.info(
                "Custom quick allreduce: BF16 inputs will be converted "
                "to FP16 to improve performance. set "
                "envs.VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16=0 "
                "to turn off."
            )

    # VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB is specified in MB
    qr_max_size = envs.VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB
    if qr_max_size is not None:
        if qr_max_size < 1:
            logger.info(
                "You should not set a max_size smaller than 1MB, which can "
                "lead to error or degradation to custom allreduce or rccl."
            )
        # Convert to bytes only when a value was given; None is forwarded
        # unchanged to init_custom_qr.
        qr_max_size = qr_max_size * MB
    self._ptr = ops.init_custom_qr(self.rank, self.world_size, qr_max_size)
    self.qr_max_size = qr_max_size if qr_max_size is not None else ops.qr_max_size()
    self.create_shared_buffer()
    self.disabled = False
def close(self):
    """Destroy the native quick-allreduce state, if any, and disable this object.

    Does nothing when the instance is already disabled or holds no non-zero
    ``_ptr``; after a successful call ``_ptr`` is 0 and ``disabled`` is
    True, so repeated calls are no-ops.
    """
    if self.disabled:
        return
    if not getattr(self, "_ptr", None):
        return
    # ``ops`` is checked against None before use; this method is also
    # reached from __del__.
    if ops is not None:
        ops.qr_destroy(self._ptr)
    self._ptr = 0
    self.disabled = True
def _build_actor_rank_mapping(self):
    """
    Populate ``self._actor_id_to_rank`` through an all-gather of actor IDs.

    Every participant contributes the hex string of its own Ray actor ID;
    the ID found at position ``r`` of the gathered result is recorded as
    belonging to rank ``r``. This should be called once during
    initialization.

    Returns:
        An empty dict when no device communicator is attached (nothing is
        stored on ``self`` in that case); otherwise None.
    """
    if self._comm is None:
        return {}

    current_actor = ray.get_runtime_context().current_actor
    actor_id_str = current_actor._actor_id.hex()

    # Ray actor IDs are 32-character hex strings (128 bits)
    ACTOR_ID_LEN = 32
    actor_id_bytes = bytearray(actor_id_str.encode("utf-8"))
    assert len(actor_id_bytes) == ACTOR_ID_LEN, (
        f"Unexpected actor ID length: {len(actor_id_bytes)}"
    )

    actor_id_tensor = torch.frombuffer(actor_id_bytes, dtype=torch.uint8).to(
        self._comm.device
    )

    # All-gather full actor IDs from all actors
    gathered_ids = self._comm.all_gather(actor_id_tensor, dim=0)

    # Copy the gathered tensor to the host once, instead of once per rank,
    # and slice the resulting bytes.
    # NOTE(review): assumes all_gather(dim=0) returns the 1-D concatenation
    # of the per-rank tensors, as the per-rank slicing always did.
    gathered_bytes = gathered_ids.cpu().numpy().tobytes()

    # Build mapping: actor_id -> device_comm_rank
    self._actor_id_to_rank = {
        gathered_bytes[rank * ACTOR_ID_LEN : (rank + 1) * ACTOR_ID_LEN].decode(
            "utf-8"
        ): rank
        for rank in range(self._world_size)
    }
def recv(
    self,
    shape: tuple[int, ...],
    dtype: "torch.dtype",
    peer_rank: int,
    allocator: TorchTensorAllocator,
) -> "torch.Tensor":
    """
    Receive a torch.Tensor from a peer and synchronize the current stream.

    After this call returns, the receive buffer is safe to read from
    any stream. A RayChannelError will be raised if an error occurred
    (e.g., remote actor died), and the buffer is not safe to read.

    Args:
        shape: The shape of the tensor to receive.
        dtype: The dtype of the tensor to receive.
        peer_rank: The rank of the actor to receive from.
        allocator: The allocator to use to create the received tensor.
            This is ignored for this implementation.

    Returns:
        The tensor returned by the wrapped device communicator's ``recv``.

    Raises:
        RayChannelError: If this communicator is already marked closed on
            entry, or is found closed after the stream synchronization.
    """
    # Refuse to start a receive on a communicator that was destroyed.
    if self._closed:
        raise RayChannelError("RayPPCommunicator has been destroyed.")

    assert self._comm is not None
    size = torch.Size(shape)
    buf = self._comm.recv(size, dtype, src=peer_rank)

    # Buffer values are undefined if NCCL ops are aborted. Therefore, we
    # need to synchronize here and check that the channel is still
    # open to ensure that the receive buffer is valid.
    # TODO(swang): Avoid CUDA synchronization.
    current_stream().synchronize()

    # Second check, deliberately after the synchronize: the flag may have
    # been set while the receive was in flight.
    if self._closed:
        raise RayChannelError("RayPPCommunicator has been destroyed.")
    return buf
def long_wait_time_msg(threshold: int) -> str:
    """Return the log text for a ring-buffer wait longer than ``threshold`` seconds."""
    message = (
        "No available shared memory broadcast block found "
        f"in {threshold} seconds. This typically happens "
        "when some processes are hanging or doing some "
        "time-consuming work (e.g. compilation, "
        "weight/kv cache quantization)."
    )
    return message
+ """ + + def __init__(self, busy_loop_s: float = 3.0, wait_sleep_s: float = 0.1): + self.last_activity = time.monotonic() + self.busy_loop_s = busy_loop_s + self.wait_sleep_s = wait_sleep_s + + def record_activity(self): + self.last_activity = time.monotonic() + + def spin(self): + curr_time = time.monotonic() + if curr_time >= self.last_activity + self.busy_loop_s: + time.sleep(self.wait_sleep_s) + else: + sched_yield() + + +class ShmRingBuffer: + def __init__( + self, + n_reader: int, + max_chunk_bytes: int, + max_chunks: int, + name: str | None = None, + ): + """ + A shared memory ring buffer implementation for broadcast communication. + Essentially, it is a queue where only one will `enqueue` and multiple + will `dequeue`. The max size of each item, together with the max number + of items that can be stored in the buffer are known in advance. + In this case, we don't need to synchronize the access to + the buffer. + + Buffer memory layout: + data metadata + | | + | (current_idx) | (current_idx) + v v + +-------------------------------+----------------------------------------+ + | chunk0 | chunk1 | ... | chunk | metadata0 | metadata1 | ... | metadata | + +-------------------------------+----------------------------------------+ + | max_chunks x max_chunk_bytes | max_chunks x (1 + n_reader) bytes | + + metadata memory layout: each byte is a flag, the first byte is the written + flag, and the rest are reader flags. The flags are set to 0 by default. + +--------------+--------------+--------------+-----+--------------+ + | written_flag | reader0_flag | reader1_flag | ... 
| readerN_flag | + +--------------+--------------+--------------+-----+--------------+ + + The state of metadata is as follows: + + (case 1) 0???...???: the block is not written yet, cannot read, can write + (case 2) 1000...000: the block is just written, can read, cannot write + (case 3) 1???...???: the block is written and read by some readers, can read if not read, cannot write + (case 4) 1111...111: the block is written and read by all readers, cannot read, can write + + State transition for readers: + + When a reader finds a block that it can read (case 2 or 3), it can yield the block for caller to read. + Only after the caller finishes reading the block, the reader can mark the block as read. + Readers only mark the block as read (from 0 to 1), the writer marks the block as ready to read (from 1 to 0). + + State transition for writer: + + When the writer writes to a block (case 1 or 4), it first resets the written flag to 0, converting either case + to case 1. Then it can yield the block for caller to write. After the caller finishes writing the block, the writer + can reset the reader flags to 0, and mark the block as written (from 0 to 1). + NOTE: the order is important here, first reset the reader flags (so that we are still in case 1), then mark the block as written. The state transition is atomic. If we do it in the reverse order, it will go through case 3 and then back to case 2, and readers might read the intermediate case 3, which is not correct. + + During creation, `name` is None and the buffer is created. We can pass the + created object to other processes by pickling it. The other processes will + get the name of the shared memory and open it, so that they can access the + same shared memory buffer. 
+ """ # noqa + self.n_reader = n_reader + self.metadata_size = 1 + n_reader + self.max_chunk_bytes = max_chunk_bytes + self.max_chunks = max_chunks + self.total_bytes_of_buffer = ( + self.max_chunk_bytes + self.metadata_size + ) * self.max_chunks + self.data_offset = 0 + self.metadata_offset = self.max_chunk_bytes * self.max_chunks + + if name is None: + # we are creating a buffer + self.is_creator = True + self.shared_memory = shared_memory.SharedMemory( + create=True, size=self.total_bytes_of_buffer + ) + # initialize the metadata section to 0 + with self.shared_memory.buf[self.metadata_offset :] as metadata_buffer: + torch.frombuffer(metadata_buffer, dtype=torch.uint8).fill_(0) + else: + # we are opening an existing buffer + self.is_creator = False + # fix to https://stackoverflow.com/q/62748654/9191338 + # Python incorrectly tracks shared memory even if it is not + # created by the process. The following patch is a workaround. + with patch( + "multiprocessing.resource_tracker.register", + lambda *args, **kwargs: None, + ): + try: + self.shared_memory = shared_memory.SharedMemory(name=name) + # See https://docs.python.org/3/library/multiprocessing.shared_memory.html # noqa + # Some platforms allocate memory based on page size, + # so the shared memory block size may be larger or equal + # to the requested size. The size parameter is ignored + # when attaching to an existing block. 
+ assert self.shared_memory.size >= self.total_bytes_of_buffer + except FileNotFoundError: + # we might deserialize the object in a different node + # in this case, this object is not used, + # and we should suppress the error + pass + + def handle(self): + return ( + self.n_reader, + self.max_chunk_bytes, + self.max_chunks, + self.shared_memory.name, + ) + + def __reduce__(self): + return ( + self.__class__, + self.handle(), + ) + + def __del__(self): + if hasattr(self, "shared_memory"): + self.shared_memory.close() + if self.is_creator: + self.shared_memory.unlink() + + @contextmanager + def get_data(self, current_idx: int): + start = self.data_offset + current_idx * self.max_chunk_bytes + end = start + self.max_chunk_bytes + with self.shared_memory.buf[start:end] as buf: + yield buf + + @contextmanager + def get_metadata(self, current_idx: int): + start = self.metadata_offset + current_idx * self.metadata_size + end = start + self.metadata_size + with self.shared_memory.buf[start:end] as buf: + yield buf + + +@dataclass +class Handle: + local_reader_ranks: list[int] = field(default_factory=list) + + buffer_handle: tuple[int, int, int, str] | None = None + local_subscribe_addr: str | None = None + remote_subscribe_addr: str | None = None + remote_addr_ipv6: bool = False + + +class MessageQueue: + def __init__( + self, + n_reader, # number of all readers + n_local_reader, # number of local readers through shared memory + local_reader_ranks: list[int] | None = None, + # Default of 24MiB chosen to be large enough to accommodate grammar + # bitmask tensors for large batches (1024 requests). 
+ max_chunk_bytes: int = 1024 * 1024 * 24, + max_chunks: int = 10, + connect_ip: str | None = None, + ): + if local_reader_ranks is None: + local_reader_ranks = list(range(n_local_reader)) + else: + assert len(local_reader_ranks) == n_local_reader + self.n_local_reader = n_local_reader + n_remote_reader = n_reader - n_local_reader + self.n_remote_reader = n_remote_reader + + context = Context() + + if n_local_reader > 0: + # for local readers, we will: + # 1. create a shared memory ring buffer to communicate small data + # 2. create a publish-subscribe socket to communicate large data + self.buffer = ShmRingBuffer(n_local_reader, max_chunk_bytes, max_chunks) + + # XPUB is very similar to PUB, + # except that it can receive subscription messages + # to confirm the number of subscribers + self.local_socket = context.socket(XPUB) + # set the verbose option so that we can receive every subscription + # message. otherwise, we will only receive the first subscription + # see http://api.zeromq.org/3-3:zmq-setsockopt for more details + self.local_socket.setsockopt(XPUB_VERBOSE, True) + local_subscribe_addr = get_open_zmq_ipc_path() + logger.debug("Binding to %s", local_subscribe_addr) + self.local_socket.bind(local_subscribe_addr) + + self.current_idx = 0 + else: + self.buffer = None # type: ignore + local_subscribe_addr = None + self.local_socket = None + self.current_idx = -1 + + remote_addr_ipv6 = False + if n_remote_reader > 0: + # for remote readers, we will: + # create a publish-subscribe socket to communicate large data + if not connect_ip: + connect_ip = get_ip() + self.remote_socket = context.socket(XPUB) + self.remote_socket.setsockopt(XPUB_VERBOSE, True) + remote_subscribe_port = get_open_port() + if is_valid_ipv6_address(connect_ip): + self.remote_socket.setsockopt(IPV6, 1) + remote_addr_ipv6 = True + connect_ip = f"[{connect_ip}]" + socket_addr = f"tcp://{connect_ip}:{remote_subscribe_port}" + self.remote_socket.bind(socket_addr) + remote_subscribe_addr = 
f"tcp://{connect_ip}:{remote_subscribe_port}" + else: + remote_subscribe_addr = None + self.remote_socket = None + + self._is_writer = True + self._is_local_reader = False + self.local_reader_rank = -1 + # rank does not matter for remote readers + self._is_remote_reader = False + self._read_spin_timer = SpinTimer() + + self.handle = Handle( + local_reader_ranks=local_reader_ranks, + buffer_handle=self.buffer.handle() if self.buffer is not None else None, + local_subscribe_addr=local_subscribe_addr, + remote_subscribe_addr=remote_subscribe_addr, + remote_addr_ipv6=remote_addr_ipv6, + ) + + logger.debug("vLLM message queue communication handle: %s", self.handle) + + def export_handle(self) -> Handle: + return self.handle + + @staticmethod + def create_from_handle(handle: Handle, rank) -> "MessageQueue": + self = MessageQueue.__new__(MessageQueue) + self.handle = handle + self._is_writer = False + + context = Context() + + if rank in handle.local_reader_ranks: + assert handle.buffer_handle is not None + self.buffer = ShmRingBuffer(*handle.buffer_handle) + self.current_idx = 0 + self.local_reader_rank = handle.local_reader_ranks.index(rank) + self._is_local_reader = True + self._is_remote_reader = False + + self.local_socket = context.socket(SUB) + self.local_socket.setsockopt_string(SUBSCRIBE, "") + socket_addr = handle.local_subscribe_addr + logger.debug("Connecting to %s", socket_addr) + self.local_socket.connect(socket_addr) + + self.remote_socket = None + + self._read_spin_timer = ( + SpinSleepTimer() if envs.VLLM_SLEEP_WHEN_IDLE else SpinTimer() + ) + else: + self.buffer = None # type: ignore + self.current_idx = -1 + self.local_reader_rank = -1 + self._is_local_reader = False + self._is_remote_reader = True + + self.local_socket = None + + self.remote_socket = context.socket(SUB) + self.remote_socket.setsockopt_string(SUBSCRIBE, "") + if handle.remote_addr_ipv6: + self.remote_socket.setsockopt(IPV6, 1) + socket_addr = handle.remote_subscribe_addr + 
def wait_until_ready(self):
    """Handshake between the writer and its readers.

    This is a collective operation: all processes (the writer and every
    reader) should call it. The writer receives one message per reader on
    each channel and then publishes ``b"READY"`` on that channel; a reader
    blocks until it receives that message.
    """
    if self._is_writer:
        # Local channel first, then remote, each handled the same way.
        channels = (
            (self.n_local_reader, self.local_socket),
            (self.n_remote_reader, self.remote_socket),
        )
        for reader_count, sock in channels:
            # wait for subscription messages from all readers of this channel
            for _ in range(reader_count):
                sock.recv()
            if reader_count > 0:
                # send a message to all readers of this channel
                # to make sure the publish channel is working
                sock.send(b"READY")
        return

    if self._is_local_reader:
        sock = self.local_socket
    elif self._is_remote_reader:
        sock = self.remote_socket
    else:
        return

    # wait for the writer to send a message
    greeting = sock.recv()
    assert greeting == b"READY"
@contextmanager
def acquire_read(
    self,
    timeout: float | None = None,
    cancel: Event | None = None,
    indefinite: bool = False,
):
    """Context manager yielding the next unread chunk for this local reader.

    Polls the metadata of slot ``self.current_idx`` until its written flag
    is set and this reader's own flag is still clear, then yields the
    slot's data view. When the caller's ``with`` body finishes, this
    reader's flag is set, ``current_idx`` advances (wrapping at
    ``max_chunks``) and activity is recorded on the spin timer.

    Args:
        timeout: Maximum number of seconds to wait, measured from entry.
            None means no limit.
        cancel: Optional event; if it is set while waiting, the wait is
            aborted.
        indefinite: When True, the periodic long-wait log message is
            suppressed.

    Raises:
        RuntimeError: If ``cancel`` is set while waiting.
        TimeoutError: If the wait exceeds ``timeout``.
    """
    assert self._is_local_reader, "Only readers can acquire read"
    start_time = time.monotonic()
    # Number of the next long-wait log message; one is emitted each time
    # the elapsed time passes another multiple of the warning interval.
    n_warning = 1
    while True:
        with self.buffer.get_metadata(self.current_idx) as metadata_buffer:
            # Byte 0 is the written flag; byte (rank + 1) is this reader's flag.
            read_flag = metadata_buffer[self.local_reader_rank + 1]
            written_flag = metadata_buffer[0]
            if not written_flag or read_flag:
                # this block is either
                # (1) not written
                # (2) already read by this reader

                # for readers, `self.current_idx` is the next block to read
                # if this block is not ready,
                # we need to wait until it is written

                # Release the processor to other threads
                self._read_spin_timer.spin()

                if cancel is not None and cancel.is_set():
                    raise RuntimeError("cancelled")

                # if we time out, raise an exception
                elapsed = time.monotonic() - start_time
                if timeout is not None and elapsed > timeout:
                    raise TimeoutError

                # if we wait for a long time, log a message
                if not indefinite and (
                    elapsed > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning
                ):
                    logger.info(
                        long_wait_time_msg(VLLM_RINGBUFFER_WARNING_INTERVAL)
                    )
                    n_warning += 1

                continue
            # found a block that is not read by this reader
            # let caller read from the buffer
            with self.buffer.get_data(self.current_idx) as buf:
                yield buf

            # caller has read from the buffer
            # set the read flag
            metadata_buffer[self.local_reader_rank + 1] = 1
            self.current_idx = (self.current_idx + 1) % self.buffer.max_chunks

            self._read_spin_timer.record_activity()
            break
+ buf_offset = offset + 4 + buf[offset:buf_offset] = to_bytes_big(buf_len, 4) + buf[buf_offset : (offset := buf_offset + buf_len)] = buffer + + if self.n_remote_reader > 0: + self.remote_socket.send_multipart(all_buffers, copy=False) + + def dequeue( + self, + timeout: float | None = None, + cancel: Event | None = None, + indefinite: bool = False, + ): + """Read from message queue with optional timeout (in seconds)""" + if self._is_local_reader: + with self.acquire_read(timeout, cancel, indefinite) as buf: + overflow = buf[0] == 1 + if not overflow: + offset = 3 + buf_count = from_bytes_big(buf[1:offset]) + all_buffers = [] + for i in range(buf_count): + buf_offset = offset + 4 + buf_len = from_bytes_big(buf[offset:buf_offset]) + offset = buf_offset + buf_len + all_buffers.append(buf[buf_offset:offset]) + obj = pickle.loads(all_buffers[0], buffers=all_buffers[1:]) + if overflow: + obj = MessageQueue.recv(self.local_socket, timeout) + elif self._is_remote_reader: + obj = MessageQueue.recv(self.remote_socket, timeout) + else: + raise RuntimeError("Only readers can dequeue") + return obj + + @staticmethod + def recv(socket: zmq.Socket, timeout: float | None) -> Any: + timeout_ms = None if timeout is None else int(timeout * 1000) + if not socket.poll(timeout=timeout_ms): + raise TimeoutError + recv, *recv_oob = socket.recv_multipart(copy=False) + return pickle.loads(recv, buffers=recv_oob) + + def broadcast_object(self, obj=None): + if self._is_writer: + self.enqueue(obj) + return obj + return self.dequeue() + + @staticmethod + def create_from_process_group_single_reader( + pg: ProcessGroup, + max_chunk_bytes, + max_chunks, + reader_rank: int = 0, + blocking: bool = False, + ) -> tuple["MessageQueue", list[Handle]]: + """ + Creates a MessageQueue for a process group with a single reader. + + This method is designed for scenarios where only one process (the reader) + will consume messages, and all other processes are writers. 
@staticmethod
def create_from_process_group(
    pg: ProcessGroup | StatelessProcessGroup,
    max_chunk_bytes,
    max_chunks,
    writer_rank: int = 0,
    external_writer_handle=None,
    blocking: bool = True,
) -> "MessageQueue":
    """
    Build the calling process's end of a one-writer / many-readers queue.

    The writer either wraps ``external_writer_handle`` or creates a fresh
    ``MessageQueue`` sized for every other rank of the group, exports its
    handle and broadcasts it; every other rank receives the handle and
    attaches to it with ``MessageQueue.create_from_handle``.

    Args:
        pg (ProcessGroup | StatelessProcessGroup): The process group to
            build the queue for.
        max_chunk_bytes (int): Maximum size in bytes for each chunk in the
            buffer.
        max_chunks (int): Maximum number of chunks in the buffer.
        writer_rank (int, optional): Rank of the writer inside ``pg``; it is
            compared with the caller's group rank and used to index the
            group's list of global ranks. Defaults to 0.
        external_writer_handle (Handle, optional): Handle of an existing
            message queue. When given, the writer attaches to it instead of
            creating a new queue. Defaults to None.
        blocking (bool, optional): If True, call ``wait_until_ready()`` on
            the queue before returning. Defaults to True.

    Returns:
        MessageQueue: The MessageQueue instance for the calling process.
    """
    uses_torch_pg = isinstance(pg, ProcessGroup)
    if uses_torch_pg:
        group_rank = dist.get_rank(pg)
        group_world_size = dist.get_world_size(pg)
        global_ranks = dist.get_process_group_ranks(pg)
    else:
        group_rank = pg.rank
        group_world_size = pg.world_size
        global_ranks = list(range(pg.world_size))
    from vllm.distributed.parallel_state import in_the_same_node_as

    status = in_the_same_node_as(pg, source_rank=writer_rank)

    if group_rank != writer_rank:
        # Reader side: receive the writer's handle, then attach to it.
        if uses_torch_pg:
            received = [None]
            dist.broadcast_object_list(
                received, src=global_ranks[writer_rank], group=pg
            )
            handle = received[0]  # type: ignore
        else:
            handle = pg.broadcast_obj(None, writer_rank)
        queue = MessageQueue.create_from_handle(handle, group_rank)
    else:
        # Writer side: obtain a queue, then publish its handle to the group.
        if external_writer_handle is not None:
            queue = MessageQueue.create_from_handle(
                external_writer_handle, group_rank
            )
        else:
            colocated = [idx for idx, same in enumerate(status) if same]
            queue = MessageQueue(
                n_reader=group_world_size - 1,
                n_local_reader=len(colocated) - 1,
                local_reader_ranks=[r for r in colocated if r != writer_rank],
                max_chunk_bytes=max_chunk_bytes,
                max_chunks=max_chunks,
            )
        handle = queue.export_handle()
        if uses_torch_pg:
            dist.broadcast_object_list(
                [handle], src=global_ranks[writer_rank], group=pg
            )
        else:
            pg.broadcast_obj(handle, writer_rank)

    if blocking:
        queue.wait_until_ready()
    return queue
] + ^start=end(0) + + After allocating 20 bytes (id=0): + [id:0|size:20|data........][...................................] + ^start(0) ^end(28) + + After allocating 30 bytes (id=1): + [id:0|size:20|data........][id:1|size:30|data..............][..] + ^start(0) ^end(66) + ``` + + Scenario 2: Memory Reclamation + ``` + Before freeing (both buffers still in use): + [id:0|size:20|data........][id:1|size:30|data..............][..] + ^start(0) ^end(66) + + After id:0 is marked free by readers: + [FREED.................... ][id:1|size:30|data..............][..] + ^start(28) ^end(66) + + After both are freed: + [FREED..............................................][..] + ^start=end(66) + ``` + + Scenario 3: Wraparound Allocation (continuing from Scenario 2) + ``` + Starting from after memory reclamation in Scenario 2: + [FREED..............................................][..] + ^start=end(66) + + Allocate 40 bytes (id=2) - only 34 bytes available at end, so wraparound: + [id:2|size:40|data........................][FREED.............][..] + ^end(148) ^start(66) + ``` + + Scenario 4: Error Handling - Out of Space + ``` + Starting from after wraparound allocation in Scenario 3: + [id:2|size:40|data........................][FREED.............][..] 
+ ^end(148) ^start(66) + + Trying to allocate 20 more bytes: + occupied_size_new = end + size - start = 148 + 28 - 66 > buffer_size(100) + -> Raises MemoryError: "Not enough space in the data buffer" + ``` + + Thread Safety: + - Single writer: Only one process/thread should write (allocate_buf) + - Multiple readers: Multiple processes/threads can read (access_buf) + - Reader synchronization handled by is_free_fn callback + - Writer handles garbage collection (free_buf) based on reader feedback + + Memory Layout per Buffer Chunk: + `[4-byte monotonic_id][4-byte chunk_size][actual_data...]` + ^metadata_start ^data_start + + The monotonic_id ensures data integrity - readers can verify they're + accessing the correct data even after buffer wraparound or reuse. + """ + + def __init__( + self, + data_buffer_size: int, + name: str | None = None, + create: bool = False, + ): + self.data_buffer_size = data_buffer_size + self.is_writer = create + + self.ID_NBYTES = 4 + self.ID_MAX = 2**31 # exclusive, so 2**31 - 1 is the max value + self.SIZE_NBYTES = 4 + # 4 bytes for id, 4 bytes for buffer size + self.MD_SIZE = self.ID_NBYTES + self.SIZE_NBYTES + self.monotonic_id_end = 0 + self.monotonic_id_start = 0 + self.data_buffer_start = 0 + self.data_buffer_end = 0 + + if create: + # we are creating a buffer + self.metadata: dict[int, int] = {} # monotonic_id -> start address + self.shared_memory = shared_memory.SharedMemory( + create=True, size=self.data_buffer_size, name=name + ) + else: + # we are opening an existing buffer + # fix to https://stackoverflow.com/q/62748654/9191338 + # Python incorrectly tracks shared memory even if it is not + # created by the process. The following patch is a workaround. 
def allocate_buf(self, size: int) -> tuple[int, int]:
    """
    Reserve a chunk of `MD_SIZE` + `size` bytes in the shared memory.

    Chunk layout:
    `[4-byte monotonic_id][4-byte size][buffer data...]`
    where the stored size is the full chunk size, header included.

    Args:
        size: Number of payload bytes requested; must be positive.

    Returns:
        `(address, monotonic_id)` of the new chunk. The address is the
        running (un-wrapped) end offset at which the chunk starts.

    Raises:
        MemoryError: the chunk does not fit between the current end and the
            current start of the occupied region.
    """
    assert self.is_writer, "Only the writer can allocate buffers."
    assert size > 0, "Size must be greater than 0"
    capacity = self.data_buffer_size
    chunk_size = size + self.MD_SIZE  # the header travels with the payload

    # A chunk is never split across the physical end of the buffer: when the
    # tail does not have enough contiguous room, skip to the start of the
    # next lap instead.
    laps, tail_offset = divmod(self.data_buffer_end, capacity)
    if tail_offset + chunk_size > capacity:
        chunk_start = (laps + 1) * capacity
    else:
        chunk_start = self.data_buffer_end

    # The occupied span (including any skipped tail) may not exceed one lap.
    if chunk_start + chunk_size - self.data_buffer_start > capacity:
        raise MemoryError(
            "Not enough space in the data buffer, "
            "try calling free_buf() to free up space"
        )
    self.data_buffer_end = chunk_start

    # Header: monotonic id first, then the chunk size.
    header_at = chunk_start % capacity
    id_stop = header_at + self.ID_NBYTES
    shm = self.shared_memory.buf
    shm[header_at:id_stop] = self.int2byte(self.monotonic_id_end)
    shm[id_stop : header_at + self.MD_SIZE] = self.int2byte(chunk_size)

    # Remember where this id lives, then advance both cursors.
    allocated_id = self.monotonic_id_end
    self.metadata[allocated_id % self.ID_MAX] = chunk_start
    self.data_buffer_end = chunk_start + chunk_size
    self.monotonic_id_end = (allocated_id + 1) % self.ID_MAX
    return chunk_start, allocated_id
def free_buf(
    self,
    is_free_fn: Callable[[int, memoryview], bool],
    nbytes: int | None = None,
) -> Iterable[int]:
    """
    Release the oldest chunks for as long as `is_free_fn` allows it.

    Nothing is written to shared memory; only the writer-side bookkeeping
    (`metadata`, `monotonic_id_start`, `data_buffer_start`/`end`) changes.
    Chunks are visited oldest first and the walk stops at the first chunk
    `is_free_fn` refuses, or once at least `nbytes` bytes were released.

    If freed memory spreads across the end and start of the ring buffer,
    the freed memory is in two segments, so there still might not be a
    contiguous space of `nbytes` available afterwards.

    Args:
        is_free_fn: Called with `(monotonic_id, data_view)` for the oldest
            chunk; returns True when that chunk may be released.
        nbytes (int, optional): Number of bytes to try to release. If None,
            the size of the whole ring buffer is used.

    Returns:
        The monotonic ids that were released, in order (two chained ranges
        when the id counter wrapped around `ID_MAX`).
    """
    assert self.is_writer, "Only the writer can free buffers."
    logger.debug(
        "Freeing up space in the ring buffer, "
        "monotonic_id_start: %d, monotonic_id_end: %d",
        self.monotonic_id_start,
        self.monotonic_id_end,
    )
    first_id = self.monotonic_id_start
    budget = self.data_buffer_size if nbytes is None else nbytes
    reclaimed = 0
    while self.monotonic_id_start in self.metadata and reclaimed < budget:
        oldest_addr = self.metadata[self.monotonic_id_start]
        with self.access_buf(oldest_addr) as (payload, header):
            if not is_free_fn(self.monotonic_id_start, payload):
                # the oldest chunk is still in use, nothing more can go
                break

            # drop the chunk and step to the next id
            del self.metadata[self.monotonic_id_start]
            self.monotonic_id_start = (self.monotonic_id_start + 1) % self.ID_MAX
            if self.monotonic_id_start in self.metadata:
                # move the start forward to the next live chunk; the modulo
                # keeps the distance positive across a lap boundary
                next_addr = self.metadata[self.monotonic_id_start]
                self.data_buffer_start += (
                    next_addr - self.data_buffer_start
                ) % self.data_buffer_size
            else:
                # nothing left allocated: rewind both cursors
                self.data_buffer_start = self.data_buffer_end = 0
            reclaimed += header[1]

    logger.debug(
        "Freed %d bytes from the ring buffer, "
        "monotonic_id_start: %d, monotonic_id_end: %d",
        reclaimed,
        self.monotonic_id_start,
        self.monotonic_id_end,
    )

    # once the start passed a full lap, shift both cursors back by one lap
    if self.data_buffer_start >= self.data_buffer_size:
        self.data_buffer_start -= self.data_buffer_size
        self.data_buffer_end -= self.data_buffer_size

    last_id = self.monotonic_id_start
    if last_id >= first_id:
        return range(first_id, last_id)
    # the id counter wrapped around ID_MAX while freeing
    return chain(range(first_id, self.ID_MAX), range(0, last_id))
def deserialize(self, data_view: memoryview) -> Any:
    """Rebuild an object from ``[pickled metadata][serialized payload]``.

    The metadata is the pickled tuple ``(type_name, nbytes, len_arr)``
    produced by ``serialize``; the payload is the last ``nbytes`` bytes of
    ``data_view``.  For tensor and multimodal items the payload is cut into
    frames of the lengths listed in ``len_arr`` and handed to the matching
    msgpack decoder; for the ``bytes`` type it is unpickled.

    Args:
        data_view: View over the metadata immediately followed by the
            payload, with nothing after the payload.

    Returns:
        The decoded object.

    Raises:
        ValueError: ``type_name`` is not one of the supported types.
    """
    # pickle.loads does not read past the end of a pickled object within a
    # larger buffer, so the metadata size does not have to be stored.
    # NOTE(review): pickle is only safe here because the buffer is expected
    # to be written by the trusted writer process - confirm before exposing
    # this to any other producer.
    type_name, nbytes, len_arr = pickle.loads(data_view)
    # Slice from an explicit start offset: `data_view[-nbytes:]` would
    # return the WHOLE view (metadata included) when nbytes == 0.
    payload = data_view[len(data_view) - nbytes :]

    if type_name == torch.Tensor.__name__:
        decoder = self.tensor_decoder
    elif type_name == self._mm_kwargs_item_cls.__name__:
        decoder = self.mm_decoder
    elif type_name == bytes.__name__:
        return pickle.loads(payload)
    else:
        raise ValueError(f"Unsupported object type '{type_name}' in metadata")

    # Both msgpack-encoded types share the same framing: consecutive
    # frames whose lengths are listed in len_arr.
    frames = []
    frame_start = 0
    for length in len_arr:
        frames.append(payload[frame_start : frame_start + length])
        frame_start += length
    return decoder.decode(frames)
+ + Architecture: + - Single writer process can put(key, value) objects + - Multiple reader processes can get(address, monotonic_id) objects + - Built on SingleWriterShmRingBuffer for efficient shared memory management + - Thread-safe operations with reader synchronization via locks + + Key Features: + - FIFO Eviction: Oldest objects are evicted first when memory is full + - Reference Counting: Objects are only freed when no readers are + accessing them + - Duplicate Key Handling: Existing keys are not overwritten, just + re-referenced + - Customized Serialization: By default uses Msgpack for efficient + serialization of Python objects, but can be extended for custom types + - Cross-Process Safety: Uses shared memory with proper synchronization + - Automatic Cleanup: Garbage collection happens transparently during + allocation + + Memory Layout per Object: + `[4-byte reference_count][metadata_size][serialized_object_data]` + + Thread Safety: + - Writer operations (put, clear) are single-threaded by design + - Reader operations (get) are thread-safe with lock-based reference + counting + - Memory reclamation is handled exclusively by the writer process + """ + + def __init__( + self, + max_object_size: int, + n_readers: int, + ring_buffer: SingleWriterShmRingBuffer, + serde_class: type[ObjectSerde] = MsgpackSerde, + reader_lock: LockType | None = None, + ): + """ + Initialize the object storage. + + Args: + max_object_size: Maximum size for a single object in bytes. + n_readers: Number of reader processes that can access the storage. + ring_buffer: The shared memory ring buffer for storing objects. + serde_class: Serializer/deserializer for objects. + reader_lock: Optional lock for synchronizing reader access. + Raises: + ValueError: If reader_lock is None for readers. 
+ """ + + self.max_object_size = max_object_size + self.n_readers = n_readers + self.serde_class = serde_class + self.ser_de = serde_class() + self.ring_buffer = ring_buffer + self.is_writer = self.ring_buffer.is_writer + + self.flag_bytes = 4 # for in-use flag + + if self.is_writer: + # Key-value mapping: key -> (address, monotonic_id) + self.key_index: dict[str, tuple[int, int]] = {} + # Reverse mapping: monotonic_id -> key + self.id_index: dict[int, str] = {} + # Writer flag to track in-use status: monotonic_id -> count + self.writer_flag: dict[int, int] = {} + else: + if reader_lock is None: + raise ValueError("Lock must be provided for readers.") + + self._reader_lock = reader_lock + + def clear(self) -> None: + """Clear the object storage.""" + if self.is_writer: + self.ring_buffer.clear() + self.key_index.clear() + self.id_index.clear() + self.writer_flag.clear() + logger.debug("Object storage cleared and reinitialized.") + + def copy_to_buffer( + self, + data: bytes | list[bytes], + data_bytes: int, + metadata: bytes, + md_bytes: int, + data_view: memoryview, + ) -> None: + data_view[self.flag_bytes : self.flag_bytes + md_bytes] = metadata + if isinstance(data, bytes): + data_view[-data_bytes:] = data + elif isinstance(data, list): + start_idx = self.flag_bytes + md_bytes + for item_bytes in data: + item_size = len(item_bytes) + data_view[start_idx : start_idx + item_size] = item_bytes + start_idx += item_size + else: + raise ValueError(f"Unsupported data type for serialization: {type(data)}") + + def increment_writer_flag(self, id: int) -> None: + """Set the in-use flag for the writer.""" + self.writer_flag[id] = self.writer_flag.get(id, 0) + 1 + + def increment_reader_flag(self, data_view: memoryview) -> None: + """Set the in-use flag for the reader.""" + # >0 for in-use flag + reader_count = self.ring_buffer.byte2int(data_view) + data_view[:] = self.ring_buffer.int2byte(reader_count + 1) + + def free_unused(self) -> None: + """Free unused buffers in the 
def put(self, key: str, value: Any) -> tuple[int, int]:
    """
    Serialize ``value`` and store it under ``key`` in the ring buffer.

    The object occupies ``flag_bytes`` + metadata + payload bytes. If the
    first allocation fails with MemoryError, ``free_unused()`` is called
    once and the allocation is retried.

    Args:
        key: String key to identify the object
        value: Any serializable Python object

    Returns:
        ``(address, monotonic_id)`` of the stored object.

    Raises:
        MemoryError: If there's not enough space in the buffer, even after
            freeing unused objects
        ValueError: If the serialized object is too large
        ValueError: If the key already exists in the storage
    """
    if key in self.key_index:
        raise ValueError(f"Key '{key}' already exists in the storage.")

    payload, payload_len, meta, meta_len = self.ser_de.serialize(value)
    needed = self.flag_bytes + payload_len + meta_len

    # Sanity check against the configured per-object limit.
    if needed > self.max_object_size:
        raise ValueError(
            f"Serialized object size ({needed} bytes) exceeds "
            f"max object size ({self.max_object_size} bytes)"
        )

    ring = self.ring_buffer
    try:
        slot = ring.allocate_buf(needed)
    except MemoryError:
        # Reclaim what readers are done with, then try exactly once more.
        self.free_unused()
        slot = ring.allocate_buf(needed)
    address, monotonic_id = slot

    with ring.access_buf(address) as (view, _header):
        # Start with a zeroed reader counter in the flag prefix.
        view[: self.flag_bytes] = ring.int2byte(0)
        self.copy_to_buffer(payload, payload_len, meta, meta_len, view)
    self.increment_writer_flag(monotonic_id)

    # Register the object in both lookup tables.
    self.key_index[key] = (address, monotonic_id)
    self.id_index[monotonic_id] = key
    return address, monotonic_id
def default_is_free_check(self, id: int, buf: memoryview) -> bool:
    """
    Default `is_free_fn`: report whether chunk `id` may be reclaimed.

    The leading `self.flag_bytes` bytes of `buf` hold a little-endian signed
    counter that readers bump through `increment_reader_flag`. The chunk is
    considered free once that counter has reached
    `self.writer_flag[id] * self.n_readers`, i.e. one read per reader for
    every time the writer referenced the object.

    Args:
        id: Monotonic id of the chunk being checked.
        buf: View over the chunk's data, starting at the counter.

    Returns:
        True when the reader counter is at least the expected read count.

    Raises:
        KeyError: `id` has no entry in `self.writer_flag`.
    """
    # Use the same prefix width the rest of the class uses for the counter
    # instead of repeating the literal 4.
    reader_count = int.from_bytes(buf[: self.flag_bytes], "little", signed=True)
    writer_count = self.writer_flag[id]
    return reader_count >= writer_count * self.n_readers
"communicator is not available." + ) + return + self.device_capability = capability.as_version_str() + if self.device_capability not in SYMM_MEM_ALL_REDUCE_MAX_SIZES: + logger.warning( + "SymmMemCommunicator: Device capability %s not supported, " + "communicator is not available.", + self.device_capability, + ) + return + if self.world_size not in SYMM_MEM_ALL_REDUCE_MAX_SIZES[self.device_capability]: + logger.warning( + "SymmMemCommunicator: World size %d not supported, " + "communicator is not available.", + self.world_size, + ) + return + # Use override max_size if provided, otherwise use default + if max_size_override is not None: + self.max_size = max_size_override + logger.info( + "SymmMemCommunicator: Using override max_size: %s bytes", + self.max_size, + ) + else: + self.max_size = SYMM_MEM_ALL_REDUCE_MAX_SIZES[self.device_capability][ + self.world_size + ] + try: + self.buffer = torch_symm_mem.empty( + self.max_size // self.dtype.itemsize, + device=self.device, + dtype=self.dtype, + ) + handle = torch_symm_mem.rendezvous(self.buffer, self.group.group_name) + except RuntimeError as e: + logger.warning_once( + "SymmMemCommunicator: symmetric memory initialization failed: %s " + "Communicator is not available. To suppress this warning set " + "VLLM_ALLREDUCE_USE_SYMM_MEM=0", + str(e), + ) + return + if handle.multicast_ptr == 0: + logger.warning( + "SymmMemCommunicator: symmetric memory " + "multicast operations are not supported." 
+ ) + return + self.force_multimem = force_multimem + self.disabled = False + if vllm_is_batch_invariant(): + self.disabled = True + + def should_use_symm_mem(self, inp: torch.Tensor): + if self.disabled: + return False + if inp.dtype != self.dtype: + return False + inp_size = inp.numel() * inp.element_size() + if inp_size % 4 != 0: + return False + return inp_size < self.max_size + + def all_reduce( + self, inp: torch.Tensor, *, out: torch.Tensor | None = None + ) -> torch.Tensor | None: + if not self.should_use_symm_mem(inp): + return None + if out is None: + out = torch.empty_like(inp) + self.buffer[: inp.numel()].copy_(inp.view(-1)) + + # Determine which algorithm to use + use_multimem = False + if self.force_multimem is not None: + # Test override: use forced setting + use_multimem = self.force_multimem + else: + # Normal logic: use multimem for supported world sizes + use_multimem = ( + self.world_size in self._WORLD_SIZES_MULTIMEM[self.device_capability] + ) + + if use_multimem: + torch.ops.symm_mem.multimem_all_reduce_( + self.buffer[: inp.numel()], "sum", self.group.group_name + ) + else: + torch.ops.symm_mem.two_shot_all_reduce_( + self.buffer[: inp.numel()], "sum", self.group.group_name + ) + out.copy_(self.buffer[: inp.numel()].view(out.shape)) + return out diff --git a/distributed/device_communicators/tpu_communicator.py b/distributed/device_communicators/tpu_communicator.py new file mode 100644 index 0000000..a7724a8 --- /dev/null +++ b/distributed/device_communicators/tpu_communicator.py @@ -0,0 +1,107 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os + +import torch +from torch.distributed import ProcessGroup + +from vllm.config import get_current_vllm_config +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.platforms.tpu import USE_TPU_INFERENCE + +from .base_device_communicator import DeviceCommunicatorBase + +USE_RAY = 
parallel_config = ( + get_current_vllm_config().parallel_config.distributed_executor_backend == "ray" +) + +logger = init_logger(__name__) + +if not USE_TPU_INFERENCE: + logger.info("tpu_inference not found, using vLLM's TpuCommunicator") + if current_platform.is_tpu(): + import torch_xla + import torch_xla.core.xla_model as xm + import torch_xla.runtime as xr + from torch_xla._internal import pjrt + from torch_xla.distributed.xla_multiprocessing import ( + create_optimized_replica_groups, + ) + + if USE_RAY: + from vllm.v1.executor import ray_utils + + +class TpuCommunicator(DeviceCommunicatorBase): + def __init__( + self, + cpu_group: ProcessGroup, + device: torch.device | None = None, + device_group: ProcessGroup | None = None, + unique_name: str = "", + ): + super().__init__(cpu_group, device, device_group, unique_name) + + # NOTE(woosuk): When using TP > 1 on TPUs, every TPU on the same node + # must be used together. Therefore, the local rank and world size can + # be simply calculated as follows. + global_rank = self.global_rank + global_world_size = self.global_world_size + + if USE_RAY: + logger.info("TpuCommunicator initialized with RAY") + # Calculate how many TPU nodes are in the current deployment. This + # is the Ray placement group if it is deployed with Ray. Default + # to the number of TPU nodes in the Ray cluster. The number of TPU + # nodes is computed by the total number of TPUs divided by the + # number of TPU accelerators per node, to account for clusters + # with both CPUs and TPUs. 
+ num_nodes = ray_utils.get_num_tpu_nodes() + num_nodes_in_pg = ray_utils.get_num_nodes_in_placement_group() + if num_nodes_in_pg > 0: + num_nodes = num_nodes_in_pg + + local_world_size = global_world_size // num_nodes + local_rank = global_rank % local_world_size + else: + logger.info("TpuCommunicator initialized with MP") + # Sanity: Verify we run on a single host + num_hosts = torch_xla.tpu.num_tpu_workers() + assert num_hosts == 1 + + # Get the current number of TPUs (we have locally) + local_world_size = torch_xla.tpu.num_available_chips() + + # Get current rank + local_rank = global_rank % local_world_size + + # Ensure environment variables are set for multihost deployments. + # On GKE, this is needed for libtpu and TPU driver to know which TPU + # chip is actually visible. Otherwise the TPU driver will fail to + # initialize because the number of devices would be different from + # the number of visible worker addresses. + os.environ["CLOUD_TPU_TASK_ID"] = str(global_rank) + os.environ["TPU_VISIBLE_CHIPS"] = str(local_rank) + + pjrt.initialize_multiprocess(local_rank, local_world_size) + xr._init_world_size_ordinal() + self.groups = create_optimized_replica_groups() + + def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: + # TODO: Remove the groups specification after XLA compiler can support + # auto-reordering the ring order for all-reduce. + return xm.all_reduce(xm.REDUCE_SUM, input_, groups=self.groups) + + def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: + assert dim == -1, "TPUs only support dim=-1 for all-gather." 
+ return xm.all_gather(input_, dim=dim) + + +if USE_TPU_INFERENCE: + from tpu_inference.distributed.device_communicators import ( + TpuCommunicator as TpuInferenceCommunicator, + ) + + TpuCommunicator = TpuInferenceCommunicator # type: ignore diff --git a/distributed/device_communicators/xpu_communicator.py b/distributed/device_communicators/xpu_communicator.py new file mode 100644 index 0000000..ad61fdf --- /dev/null +++ b/distributed/device_communicators/xpu_communicator.py @@ -0,0 +1,95 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +from vllm.logger import init_logger + +from .base_device_communicator import DeviceCommunicatorBase + +logger = init_logger(__name__) + + +class XpuCommunicator(DeviceCommunicatorBase): + def __init__( + self, + cpu_group: ProcessGroup, + device: torch.device | None = None, + device_group: ProcessGroup | None = None, + unique_name: str = "", + ): + super().__init__(cpu_group, device, device_group, unique_name) + if self.use_all2all: + if self.all2all_backend != "naive": + logger.warning( + "`%s` all2all manager is not supported on XPU. " + "Falling back to `naive` all2all manager for XPU.", + self.all2all_backend, + ) + self.all2all_backend = "naive" + if self.all2all_backend == "naive": + from .all2all import NaiveAll2AllManager + + self.all2all_manager = NaiveAll2AllManager(self.cpu_group) + logger.info("Using naive all2all manager.") + + def all_reduce(self, input_) -> torch.Tensor: + dist.all_reduce(input_, group=self.device_group) + return input_ + + def gather( + self, input_: torch.Tensor, dst: int = 0, dim: int = -1 + ) -> torch.Tensor | None: + assert -input_.dim() <= dim < input_.dim(), ( + f"Invalid dim ({dim}) for input tensor with shape {input_.size()}" + ) + if dim < 0: + # Convert negative dim to positive. 
+ dim += input_.dim() + # For xpu path, gather doesn't work properly together with ray + # cluster so we use all_gather instead for now. + input_size = input_.size() + # Allocate output tensor. + output_tensor = torch.empty( + (self.world_size,) + input_size, dtype=input_.dtype, device=input_.device + ) + # All-gather. + dist.all_gather_into_tensor(output_tensor, input_, group=self.device_group) + if self.rank_in_group == dst: + # Reshape + output_tensor = output_tensor.movedim(0, dim) + output_tensor = output_tensor.reshape( + input_size[:dim] + + (self.world_size * input_size[dim],) + + input_size[dim + 1 :] + ) + else: + output_tensor = None + return output_tensor + + def broadcast(self, input_: torch.Tensor, src: int = 0) -> None: + dist.broadcast(input_, src=src, group=self.device_group) + + def dispatch( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + is_sequence_parallel: bool = False, + ) -> tuple[torch.Tensor, torch.Tensor]: + assert self.all2all_manager is not None + hidden_states, router_logits = self.all2all_manager.dispatch( + hidden_states, router_logits, is_sequence_parallel + ) + return hidden_states, router_logits + + def combine( + self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False + ) -> torch.Tensor: + assert self.all2all_manager is not None + hidden_states = self.all2all_manager.combine( + hidden_states, is_sequence_parallel + ) + return hidden_states diff --git a/distributed/ec_transfer/__init__.py b/distributed/ec_transfer/__init__.py new file mode 100644 index 0000000..0decfd1 --- /dev/null +++ b/distributed/ec_transfer/__init__.py @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm.distributed.ec_transfer.ec_transfer_state import ( + ensure_ec_transfer_initialized, + get_ec_transfer, + has_ec_transfer, +) + +__all__ = [ + "get_ec_transfer", + "ensure_ec_transfer_initialized", + "has_ec_transfer", +] diff --git 
a/distributed/ec_transfer/__pycache__/__init__.cpython-312.pyc b/distributed/ec_transfer/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b61decc06ae4ef6d3eecd142562fe5cd4e98ed0 GIT binary patch literal 377 zcmYk2Jx&8L5QWD-Q34@KM~P^+$hr(jh!Y?VutKsNY!VB5v$DO2rK9Em^mLqs8@Q#S z15zMS;SC^pil;Xo`+K%uCX-!AW$1&xKmf0O^I!PSYz~uo1PKxuu&6>7SJ;w@AXtLL zPpG0DDd=0jy&&;G)xLF#t4ki7sQs04T-Q2iVf3Anak^9?AI9lQ_nxxW)_4?7n`(M<;eITcb|hIKk6Dg5OW_P7p$0BZ}|` PB>CKy!=a@qGyyJvKt&}AP%61l3Kh9nZoExh96MTX5UDss zh#MDf<;Z~xf`7s#fkTi55(qBbB87*2_laRg*ZPt2Bt z&=>wOByk}0DG1x>HZqWbEmXs0jJYgWLQO1-HK{E5-=Y<%$z>TMf~gF4$%@vLvI4e< z9wVX=*@_}W;)Z+?;u%pQt_z0pSSlw#PJ$c*Ib|e_IP4?MNG|z}7m&kVYUS>Yl2*J{ zDlRUlS95}wE)|Q8ZIcRf=w;m{yA;cgMc~j_$mfdAJ)5A7-?6M3oLx7osyD8cidu0= zRZGQ{rDbjYYDr!3WJ*|r+7+IANCF7F8~tfu+lUSIs1>~qJ$W_weuV#8ZlRo@ZsiQq z)vHxXsyZV^wp7ek^vVXwnr_y1Sk|;nX6lx?MU0o2DiG!+>hoZ~Zx>jtUZ|V(S*Ft} zVY3e!XV0Izca{_^bFENsvJJ;B+=sgqAOoZ3S_9%2NJV2*x7~F@!?NZwNT07ay)=Z< zLS8ft$@al=j{s|-4~a}WG1Hb-+b37w0Z+_ujg+A&F#o$HV_*rl#6S%Um&YnifQ2SG z&0b7%?eE!@Qawruh- z?jbtMcvPQSQw#ta&?Xo27Zq?VgAS1CNt9za>Wzg)r44L{%v}xCOdFhqx{ae3#~`8D`8P(I$^ z{-^){DaEO`ESXS)*Q#89Myie_Tpc;SaXbq zMds)+aN(_YuL0|c7~`KZ!pD9|A{Krq2;an?#nF$_)TjKJHX47bJ!hQ*C*GkmJz2mL Z&nEf^wq6|L6VLYd5p3U;Q5^Ss_y>zeZzBKz literal 0 HcmV?d00001 diff --git a/distributed/ec_transfer/ec_connector/__init__.py b/distributed/ec_transfer/ec_connector/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/distributed/ec_transfer/ec_connector/__pycache__/__init__.cpython-312.pyc b/distributed/ec_transfer/ec_connector/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4c53d490ef02f8fc0c6138bc4db10344458aea7 GIT binary patch literal 186 zcmX@j%ge<81UW3-nIQTxh(HIQS%4zb87dhx8U0o=6fpsLpFwJVMe3L27U&mb7U-5F z7Nw__=vI`PTUunBr|RcqCg~ScmSp7T>6hi?WHa^HWN5QtgUZfi^J$aWRPTk(rT^v4|PS F0suGjF%bX& literal 0 HcmV?d00001 diff --git 
a/distributed/ec_transfer/ec_connector/__pycache__/base.cpython-312.pyc b/distributed/ec_transfer/ec_connector/__pycache__/base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d454973ec3a9a07410085cc1f8fd50653e77036 GIT binary patch literal 9812 zcmdT~O>i8?b)Nmj?&4=b03<;Wl$s(VVkHoeL`#+#MK%RNGA$93K}bpv%frD=0~m64 z2kMyxh%Ht^yLg}~sS522nNz4-5|u8##m7``spJ5Z;v#CR6w`5Xa8(X#g1O|B?{&}g z%r1b?mC_|!g_)l2*WIt*|9g#p>hCWZ__^6Xb3T6BFn&!R(JvP^cK;QPyM}En8g|M~ zyQ#$#-WfMt%PeMU*~M&%`m=7XmS4=*dKP=+ciJs17SNw_i?z~XN%rO4-df*cpKSNI z{k4I`ft11Xq7QHdw_F=s9F${4_fTzUaVTY^4PN5IU!)W!iz8{{ZNo0zGVETSy82v% zX#D;n6G9%5kbQvc=Na95RQ3*_x18u5-Aq>ot-r(6((KHPSFiJG;E6XZpR*a)@_n}K z3HGMr2f|ru1l(q`^{Quc!Dg)LDre7@-hb(R-B*=;Rj*cS)SapoI9{DOb@t(lrP2a` zt8&3L;jKG1_gR3hWzTiJ4X3`s)`U}Y0%x83=SvK~%UIrSxLlm|0ezl{M%|~yXqF$1 zO|m7YZu_jp1IxAoYm#lOI@MLVfa9}<4}_OC<(v&qTniUtfyZ3WvZsA(op&NXC8yH< z*rIyPlqd2{ejOB;sIaqahPtr}2OgqxF7uX|;GZrq$9k(PQKZ;2l^NV8!Q^cQ?RQDy> zeNDR=xn$r?FAJ|G*P)HHD6%EUqRk*l)EzN8<`YOI0+ZNjaz#(sS3!I|yVxjHII$O^~I{Rm3b zaUhkDhiE(^uRn=a&=7TBV?axV&=EOtR4ZVsB%M&EHJ6j(NyM4Y>o!g`EqHYu@2S#n zNa^P)=~m{#8#Ao|YYD1qRRa)e)w9vvzwk%z&zdveot=5-;@sPSs&7_OtzO_wR9ji$ z0-t5AZSOULwMKxhLkW4*ouv<5w+7ZMJ1YR`nOD+Yjct%&gyzcF9YOD1gBviI#$wt| zEoSWWVs#x9}p-yci|wYBLrXYDKm11vgwz46U&y?X7NeA;!Erq?z> zmiqKMjxlY=R&APBO^~hbFLOcdYGeqfN&2SNHbpd4h~Z9Z!0={T!wE`U3XND(X~Bh%lPv02<_xwgHn5y6@nC~PfuS5yqBJ8r z?1@#knbOW=w^E|HN>-E!c|uVf!lTt=nzmOpO+h?VCp$u|(dh9K8h#%hw~XJWGr2;$ zWaNqpkYGAUP(TO$cK;a+PLjY*+Zmo&&f3{q@S{9u=kS%b^S82Js2Rp$UiS8&wo`>xtm@~?I zhBPZ-}{uF3&0_#5=u6~5~NQyAEyDjn3 zm^=l@lbgi~>riBh8Ct!hS)@L&5Q3tsR#rg4tCoKgEMWs~d)?hi|0t71S{h}$L}AF@85ql61*6cc8UT!W77yospZd3eG%^Y<5VR**L*2q55m*1 zTEh>ZZm__C42lw(OGmHtw1#85VMdM$b}K=WR?Z}Hw1)Ps-x_cbXH-puM|Pvi1ty|H zf0aJ8+y{CvC`~J7*A64CW9Fr6kQN2 zVIDwR(r+*6*o5s=gUa~>Xc32qbC4?JcSa>Gg-cAK?GvdZjX6qL#6HT~Jk!nT0|38I zE0RCFX|zX-)34ml{vzagha`5f*WSb4=A9XrTjIbIY5IIUtdt3c+^n0EdZRh<0*&4iXXb;?IOIh0spUPbpRhn0eh$4sQCQ7HFhncbi4M@>pP_Nu~OK|KJF6Vac+ 
zLkjBP$b<5k?edwe^0}?TIXrjD!(mV1j51k8iiRme^E(U`R#05YqH)(C)71P#3(Z9& z7mnpRO%5XvriziW`>|f`RdT)Ac}BVon_|KIAdli<)j^`7_nDIcYNXaAARTI57JLOU zK9X&9g67lRve^Y7Xaz)IiaN!zB0+z+xmslp8F~S~8XHY#z9ycLfQtc61AhlDuw3NdR=r~`FLJ*WzWW`07LK40qlF*9* zCH|2kP!|4*xncohVb*FUuPt$#Y_xwx20wHXa8wKS6nhH>es(4<84$#k4$56&l!jcv zYO9vNiZ6H(AH^o*wy5h|xwe6P&A*a3?LP9M=%P5G?EfDl-QT|SMw)2KNXLVaHBSYLV-rHLY0+H>IQO)gb>pT3U%#|E`U-} zc3pOjZ=&{rjw*so-}C*(#)fDTBbTdE5kQcaZ!Iz$(3yzY}}Iwu8J)l>FP*E*r9MNk7& zVk}q6h_w<2AmE9=3gNM%5eAZ4D0fjsj$enq*D*^g*8fL=3c122Al#7V1(2GAlPn?8 zEf!=Z1$SfBL%mjt0;%y5Cut`#+p%zc{c@U(5krBU-hfkJ}_ENb>FJTdx-#u}2 zV(<JDCPydh)AAZ^ zk0Kw!U{ta}(7?L}<&P?nYUP-|#3OB3$>df!s^=5yjwM+(D}!WD_!SiwBPd2cnB~cW zo8B10P4zn}F{6+j;vZKZ6T-Z2q;YT|QJ+ZG5m$v*_Zq0dhea>FKe2A2vkti?___mN zgh49BgXS%W28#ETY=p3hN@yi^0<>DtKmn5^cOt4?g6%|4;7Zj{S;w3XQvm9$#6+d} zL|BbHNel|eD1!R5v*cu$n7f|j5wm25u6-U{i`8*Da&lzXCG|sJIw3#g2-y<4jKCY! z-UKeF3v|XC7a*t|@kU>=BHSGBHp$7+zoxm`8V`BvHSt)?sn*by-~&JXwc+Yp9L%&SUom~2Je-qKZE7d=TPFEAHT7+dl@mo6LN z^(iJ?r+NliD=A7qT$wL}E^}#xpWPW8G|t z$9|q*%2V1!=8zlm>w6qzm;Gt}R3rR>yzEI9TEqJs2wiw#0}+I43nH5>ljHE_=xbny zqDe%RN*GTNRm6Ach=e*|l!YEW=x(aHCzCq}@*fj)8`N|g)Yi~cHtvZ6eKsl3Z!{cK zC&=4rPo5}q$RG;BgXn%OA%8N==&o?gy|=&+^*PI%up``s<>YFl^#75-fSy8Q!$0{7}P%PJ%`H*2UmZ z?B5|azzc%6inSw4M8mFhv3a^%0rrFXGQrZZn5FuBzavYV|L|n3kX~e8KOH&8|CN~C zD6yp2c91i%hsG+Rq+TW7GaBkN`g7exJ{oF;ULhpS-jw_Rz<*1S8rq|h&L`^1SW;JJ zQ~|H+CKrZHJ5F4btEA4MyXjp`rwro@ zgbbAPNz-oY<%zOk^$}4!)mG?!*$%H<6YC@JBlL9CL88*JO46lHbolDG6x+mG-TWF1 zan}J$Gij-*l0VfA@t+9}9We%vOJ|L$!vy;0D$jJ?o~f(CGZIfhz-_#Rqw51v_0dBH@;i{QTShzGle?U{|8Co$ z_s`3_^4%UX@&oO3DmQk=dt~6Xn~6GbAF75<-dmudlX~c6d!UqiW$XNQgWiY5{zo~yX-@^$ekN+|KV#?HZyY#tFgLctPTimR ztkBM&{jfN2+rM+})9d$6fBL8QGt~dL-}tP&Ri551zOb7EM0-+hHULBpjkR-lgBqoQ z+xDkpk9zPGqDB2oLiXhPe^w;Y_W%8|A!2z@vzn{H?DepnoTl*2oOL90Dp z$d$KVy|`=O-5%-99p5?1+8MkF*Wk{=y;Jx6dlNtT;eC7i@N-+`=eCQ_Kh9xTO0Z;g z<(Qy*#8#Hq8?{!(TB-^P52UoU^1&t*P6XYVDY5y8VC5y>l=|lZK|yTm7~OGAt-m-` zMLB0m`mHJD^rZy0%F(#$^rbY0>%eg8%R&tos-O&nrGB;YYQI)?7Kkbxlh@HwXgdYD zRd@~l9{)D*x>x~rkYmIDAs+2aDwX 
zuQG#QWuN^jd*)Zgkvo6*W$~$-`N#QOs(i=3>-@xdY@qpgFrO;kJN4MW>+#Tybn5t> M>VM&$GNowxKl5pRcK`qY literal 0 HcmV?d00001 diff --git a/distributed/ec_transfer/ec_connector/__pycache__/factory.cpython-312.pyc b/distributed/ec_transfer/ec_connector/__pycache__/factory.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a4a3d3f1495da1fc846152740c17e2f7a084830 GIT binary patch literal 3522 zcma)8O>7&-6`uX!|B{p?OQkGITFH`arZSl}j{GA#4n)X5X(ii{9Hi`Fv*L~s+4C@Ep|zUC4_W zk?ErD$$K;2yf5QpZIABH2QmQ;VUhS@tXB`_Lzz%s%19jI5f0$+W6s$n6XDTKr1{=M zn*TB1vPE%peXMngu1X7NK`roDa7-L3af#4BX=5S5+5WVu>*}(OK@Tte={c3Eqy*rPIYi0cXu3nNTwTQkru59Emxx+0Rxp97E4fuW z_?E8c9X;E1Dimlnj(!cw4#FtIX(+?rM>&)cu!ud_yNa+6`v;+ah3`ll(73B$!$FP5 z9!-D`Mr$6;yW$6rh>u$fXF@Q-Pj8PU&4c|bUMMU}RQVNN z6W^1z#EN+Q4i)Cgk|*IUUw@O`3KLnCv+lFYKg*eGV9?ca>AYT4wOnCUcCIU{1x;p` zmFZ=ZC3)mxsK2{Vim=tV0#_kaL-^h!WgFLa+n5{6>+OdT!YcJF4Ow-S(|wQ#DNjgS?K3 zL}3FG0bx*w%iG&uznwDk8>x-l#(7gEFyZ{qH!ohk^plG?1uLdDO6FRzka`!;ozf^8 za?6`0)>1gDm@NiSz3s`=iUXhIM#=V4PhwJj^+j|gT~~ExCa>zntg0KYC7o$9JtP(X z*+z%`BM1Fcwf?F7AJ_XYS8p8lo!Xt=yHf8l6oG! 
zvUBZM*J>g8us6Q@i~S4xXRMcRS;M#My>D338%>c9q&^itRcf*G|3;uU{b+QuDts7z z7=EPv)jvciz-=%179tFxk;E1W#G@>II=8fW_D8T#jvdFaEukW8hq^iDg6je0WE)=Y zq6k&m?AsDMj9YiZS!W0`k5{Gcc7wc%;Yf2t80wnCGDH*0vY-n(Q; zOO4pjK`d2^rL31{>#=k-eCUsS7>L;Dv%Qf03pZr zRalyI6ClO*BE@|>FSu(UL^Hy6qzm_Ol26cQ?a=ywGuwzh7Q>>;lP!-)V+Si0PY1Qm z?g^nR-=yeV!(9k;vc__$l{kjx(r%{%=|YjgdZAdr<@o)AvAIFxIEHl7&Pz0aU6BE+ zQg@JLD0RYJtlQp-`z%8fnrp~OD*8brybd-QA|ve$7>LPP7-08zjNOU8=XtY1{S5Gh z!U854hj#hQF`zw$Mb}`g@ev5XZ4jNBIvBiE8@yDFG(v+`XpF)3O1*c+l4cs=p-64$_|L0iV}K>QLlUZ35*T{R!2cVm0wd)of> z+L;+^e5N*d)jD~#7QI#%XPd$?wvhLk~My*Ez@}6MBngdKb z>G|%Ho_Ny(H6*O)NYh6(KMD>$3xL)dPi<3deAF}C7<;8DK>Z{Tt`=*7k!K!guz^o` z(43plOW3iw^x~QdKSPV~KdG+bw2SKUEd8BGcKuEyyCN~w+n7rlYz4(|mgsaOPB4Jm z(QHwNjL2M_RF|{tF>EjJ2C@O6fyGXK``wn|_n$o?dm%f-W;$`7_~=@W1&&8EGL42T zEi=H7)3nK+kG=7$v`I6#^IPG+>aglHLjUUNub*)jL{s26?#Pe06JMZ-FOd9|SK#=>Px# literal 0 HcmV?d00001 diff --git a/distributed/ec_transfer/ec_connector/__pycache__/shared_storage_connector.cpython-312.pyc b/distributed/ec_transfer/ec_connector/__pycache__/shared_storage_connector.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f7f4f4787e4ba969a9b8db2305a28348174731c0 GIT binary patch literal 9582 zcmdT~TWlLwdY<8RB$1*-Nv5R8*7zlSU(jBvIX3tK>gT7QS_lN;z};rfdgo<=|lUZtej@ir+)vL z3n`k)_Mv^~0eI%jxnKVO`@i!K|2Y_JN)jW2~9S->N2j$dV>1eQR{k6k|w2k+aAa315V~*yu`&^Z*j6) z_DK908O*lm!?E>T;@Ix^{mc4>e~pt zwkEu`jVzqK%kZhPo>#MM0a3va;_+-EBgf-LYdoIG(E%YHh{wO1PbgN28}l_n@pvMe z&FKj}oy%&d<4$LFBcM@65_6f)Nop6y_=Hot@leAQv0b957e8$uqCe#}gB?G-S`CGN zIJN2R{GE&Up4oPDzVH{6GMiF^<}z^{C5)UJ92&9xRp%Urd$zvTHS$aaJYGSk_5>7|*m*_}q>0p_-V zXJc5P8V#Yy+gQVMOPqQD#Zebqhv7>sF3MR6uLqH<9!6vdxVfmdge1kSy(%30gX^lR z9i^^FGO#oua6h;btOx^TVPN&`C&FR2%BiTJ?x9g56p_Rj9))y;Nl`y#9-?T7B9iku zK^vr3ifAz!6++P3!#_T;qO5!Rf9K)7FKqdE?~#Wmo^g1XEvbGIKPULD)b?K@@n?0C zCh?LBQsqv0N?eMEWCda#sYmid!n}A2c>1Iksa5jIepw)iOnIa>D}n-M`?0f_O2*il@kZgl0v)=&GS1 z&!tHZ97!zl3%32V8{&D_U37o{TG72g(%cYeFKui^Pc$(W^njgOAOvd&MR!eVzrLn~ 
zc`q<)zq1Q_OphJ8jvz9+8en)aq0bjQg!F|z6H!D9p+zbZ z)0%i#E4*BX>elXynY^Zpv$Ckkx|qtTKqLo*$f`zL46l}7lvO6;Mz^hFT0ASm3@EvT zq#5Do`!#xCYT_w1mx;@FU`edujG$vVXkLcTG{c5hl4tXCu2ii;LHUj&e z`M6-${i`ciR|hu&(WikhrPun#A4!!H7t1FuJ_%gxeC}q z2Vvx)CQZck9QXc@ph%vgXTebndt6e=Tb#;&7>1}fQ@n|p^$xqI1+r}oVS#q5 zAru`ok=z7u3xv`Q!PzZe(Ff3IItQX{b$dfyH3X<{)0XyL^W29Yd9Z6yZ{h?{sFehg z!Ze7NE|RLGvvZ<8FN?_pIHyS3VZPezN(6PuiMOs^y>?U+!3lGcteTZaTb=G^=Aq3d zv0SuLl&A3ZIe6{!rY zFrhAq84w8C{!Cuc)8LZ{Wk;vfF%cP&Xt}J3L(UYV=+_?8IyjvTq1ASdTANch=R<>I zr`XtvcvD`wol_;ObuOQgv%2Q&9xR}FfvI%`oG*x%vSO_WOjf1)Nmm8IK!!2RyG9U?IdW0Bh&e0Qr0vkE)8oT%%?TOK172?Bu=pHjLo<_yGOOAwR9FU z$tGn+XVh1q9EKmqmBp?yj>fJbMlX8E%4$NF$=22xUUpbU3tJXd zG(5??lu%Eg+o+#GzY*MV(uU96VMDN2%2=Qw*u$u=p{+(#iH-xoWQLv3?O$PdSIh8R z-8ZPue?YX%ZSCWtBbA|Z<)L%yL+4Av(@@{)(8i&28==WHVR9`rSq%-Wg@)FIA;i_T zuC=xo$OcTUOl`T`o#AJ0Z?JtU028qp8LUKxHX=i%3)Rq``yZ@)@NlFY8mo5nRwFM| zBID)A_@lS~F82f@a2Nfk<~l0H}Lv zAgt4Mn1uz0OW8Rbyd_0ftuDaYlMr@*b>f!X1W%d1n9&2db438bC2N2wp zRiL|)1JK$e&FKrnn9R;D5uu(5+-?EEB8Zf%X3@&g&;O4&X+gg@1k46b&cH-4{K=)* z4%hJ6#N*rZIn6ehK(<+#K%BN&s$nK6`~MdFG88EsnkF$Y&9r8MXDY8S+_vWryljCl zNsGg>N_oJdN?2j|Bv{p@2nWl;!G{+< zerH2C@l20CTneM20sa!?2VvrPo%jtsbiMpDg#EeC)5Bx?DbWdAERe$^R7#vbBrLuS;Bi%Q8HK5%Ed+H9 zq3Etrlsfb|2J~q@nLVM(NZN7zaQuEh~~nw=x2$#yzje;aNq?9RtiL3Rz}0n$z>`w4wmt z$hX8gVFGR$`Z^LrkU@b>E0+br&azd}^yz}&pc<7q8uUECHcnC;nznDb*0&uKah-ai zDL|qPz2-DT5jPO>!7}YJylGv|Xr_0Ud3?K2WO$Q`oN&BU?^BkCE+(cSe;4nLQ_9k& zZnmg~%T-}yS50bG{Q>nGN3_g6>*YF*uZLbLd8)&smEno<@I+soW{%7VA*a4@vo?(!-$r-%?9@L zG}L+j_R8)1-&^_K>il}>=$dep?WGC1!K4b43m%wL;Mv(>pJ`;@rgT(dBs6^d?&|7G3vsOZK?xC{U2=$uJOm@=DdAbhwzp9UAI9g>4O zHdR*AxE0k9<5Ehzv121{kl$%W&CBqf$f*P<1)MH8b1AK|&$~;^FU7V_a;Ac!ysDvR zzy(!KrByH^_;zFvA!iS#43_gYlOf2HLo3M!z7Ol#N49`L?omIyvDbNQ?jH}s1 zhe^&H6}6fxCXa?jr$u045}p1!p9eeHn%AT7N4x6x?v4Hiy=&xPftGi3dwL(d{gbyJ zy!Vs$9v-S37%v|fe>At=^LnZEU)y@02HNjmS-G;>`ti(@z{}N8AFSr{y6eamE&2aS zcO^&CATzGQ=gvXZ%S;~jfPIj#j6j`B2nVR(6m5%T$xcmbCbJrGRd&{ka2sazI{7fQ z9k;_|BCpBL(C}V5&ZvfiVKQg=I&TwmPQx!}NuSexXs4WhgJkqZv^J+gpG~(IJdM^Z 
z2lA7_Zr7wtrK`;jJW`Oc6ORbQz7CIStJ&dN`xSKrf&zKjYUlR!S9-?FJ@6r|_k5$& zY9mr{rMP-|ZD?XWbZ$*J$Ako*l-TEoC_fYJZurEh5#C}>+G{Qu|Xrmhy`)q}K z7XiNE)0fCIt&)~9kAeAlbQA>1`8+yGA52H<{-7~h_sNaYH8w@Af0b?_=5f$5S{<6- z+En`RY4ouUZ@q+3RC$yUqEV9!v7$Z=Tt5EH7Kdotjn^&q8b<1w%g-NKJ9>JX!_z85>RI=DE`E|< N{oqRwcO=+R{5MY;Lec;L literal 0 HcmV?d00001 diff --git a/distributed/ec_transfer/ec_connector/base.py b/distributed/ec_transfer/ec_connector/base.py new file mode 100644 index 0000000..2b7b14d --- /dev/null +++ b/distributed/ec_transfer/ec_connector/base.py @@ -0,0 +1,247 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +ECConnectorBase Class for Distributed Encoder Cache & +P2P Encoder cache communication in V1 + +The class provides the following primitives: + Scheduler-side: runs in the scheduler, binds metadata, which + is used by the worker-side to load/save Encoder cache. + check_caches_exist() - Check whether Encoder cache of requests exist + update_state_after_alloc() - update ECConnector state after + allocate. This will decide to load the cache or not + request_finished() - called when a request is finished, + free the cache with the requests + + Worker-side: runs in each worker, loads/saves Encoder Cache to/from + the Connector based on the metadata. + start_load_ec() - starts loading all ECs (maybe async) + wait_for_save() - blocks until all saves are done + + get_finished() - called with ids of finished requests, returns + ids of requests that have completed async sending/recving. 
+""" + +import enum +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any + +import torch + +from vllm.logger import init_logger +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.outputs import ECConnectorOutput + +if TYPE_CHECKING: + from vllm.config import VllmConfig + from vllm.v1.request import Request + +logger = init_logger(__name__) + + +class ECConnectorRole(enum.Enum): + # Connector running in the scheduler process + SCHEDULER = 0 + + # Connector running in the worker process + WORKER = 1 + + +class ECConnectorMetadata(ABC): # noqa: B024 + """ + Abstract Metadata used to communicate between the + Scheduler ECConnector and Worker ECConnector. + """ + + pass + + +class ECConnectorBase(ABC): + def __init__(self, vllm_config: "VllmConfig", role: ECConnectorRole): + self._connector_metadata: ECConnectorMetadata | None = None + self._vllm_config = vllm_config + self._role = role + if vllm_config.ec_transfer_config is not None: + self._is_producer = vllm_config.ec_transfer_config.is_ec_producer + else: + raise ValueError("ec_transfer_config must be set for ECConnectorBase") + + @property + def role(self) -> ECConnectorRole: + return self._role + + @property + def is_producer(self) -> bool: + return self._is_producer + + # ============================== + # Worker-side methods + # ============================== + + def bind_connector_metadata(self, connector_metadata: ECConnectorMetadata) -> None: + """Set the connector metadata from the scheduler. + + This function should be called by the model runner every time + before the model execution. The metadata will be used for runtime + EC cache loading. + + Args: + connector_metadata (dict): the connector metadata. + """ + self._connector_metadata = connector_metadata + + def clear_connector_metadata(self) -> None: + """Clear the connector metadata. + + This function should be called by the model runner every time + after the model execution. 
+ """ + self._connector_metadata = None + + def _get_connector_metadata(self) -> ECConnectorMetadata: + """Get the connector metadata. + + This function should only be called inside the connector. + + Returns: + ConnectorMetadata: the connector metadata. + """ + + # Should only be called while set to valid metadata. + assert self._connector_metadata is not None + return self._connector_metadata + + def register_caches( + self, + ec_caches: dict[str, torch.Tensor], + ): + """ + Initialize with the EC caches. + Args: + ec_caches: dictionary of encoder cache + """ + # TODO: Implement this later for P2P feature + return + + @abstractmethod + def start_load_caches( + self, encoder_cache: dict[str, torch.Tensor], **kwargs + ) -> None: + """ + Start loading the cache from the connector into vLLM's encoder cache. + + This method loads the encoder cache based on metadata provided by the scheduler. + It is called before `_gather_mm_embeddings` for the EC Connector. For EC, + the `encoder_cache` and `mm_hash` are stored in `kwargs`. + + Args: + encoder_cache (dict[str, torch.Tensor]): A dictionary mapping multimodal + data hashes (`mm_hash`) to encoder cache tensors. + kwargs (dict): Additional keyword arguments for the connector. + """ + pass + + @abstractmethod + def save_caches( + self, encoder_cache: dict[str, torch.Tensor], mm_hash: str, **kwargs + ) -> None: + """ + Save the encoder cache to the connector. + + This method saves the encoder cache from the worker's local storage + to shared storage or another external connector. + + Args: + encoder_cache (dict[str, torch.Tensor]): A dictionary mapping multimodal + data hashes (`mm_hash`) to encoder cache tensors. + mm_hash (str): The hash of the multimodal data whose cache is being saved. + kwargs (dict): Additional keyword arguments for the connector. 
+ """ + pass + + def get_finished( + self, finished_req_ids: set[str] + ) -> tuple[set[str] | None, set[str] | None]: + """ + Notifies worker-side connector ids of requests that have + finished generating tokens on the worker. + The scheduler process (via the Executors) will use this output + to track which workers are done. + + Returns: + ids of requests that have finished asynchronous transfer + (requests that previously returned True from request_finished()), + tuple of (sending/saving ids, recving/loading ids). + The finished saves/sends req ids must belong to a set provided in a + call to this method (this call or a prior one). + """ + return None, None + + # ============================== + # Scheduler-side methods + # ============================== + + @abstractmethod + def has_caches( + self, + request: "Request", + ) -> list[bool]: + """ + Check if encoder cache exists for each mm data of requests + + Args: + request (Request): the request object. + + Returns: + A list bool where ith value is True if cache exist for + ith mm_data of requests + """ + pass + + @abstractmethod + def update_state_after_alloc(self, request: "Request", index: int): + """ + Update ECConnector state to decide allocate cache for requests + + Args: + request (Request): the request object. + """ + pass + + @abstractmethod + def build_connector_meta( + self, scheduler_output: SchedulerOutput + ) -> ECConnectorMetadata: + """ + Build the connector metadata for this step. + + This function should NOT modify fields in the scheduler_output. + Also, calling this function will reset the state of the connector. + + Args: + scheduler_output (SchedulerOutput): the scheduler output object. + """ + pass + + def update_connector_output(self, connector_output: ECConnectorOutput): + """ + Update ECConnector state from worker-side connectors output. + + Args: + connector_output (ECConnectorOutput): the worker-side + connectors output. 
+ """ + return + + def request_finished( + self, request: "Request" + ) -> tuple[bool, dict[str, Any] | None]: + """ + Called when a request has finished, before its encoder cache is freed. + + Returns: + True if the request is being saved/sent asynchronously and cached + should not be freed until the request_id is returned from + get_finished(). + """ + return False, None diff --git a/distributed/ec_transfer/ec_connector/factory.py b/distributed/ec_transfer/ec_connector/factory.py new file mode 100644 index 0000000..bfdf51d --- /dev/null +++ b/distributed/ec_transfer/ec_connector/factory.py @@ -0,0 +1,88 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import importlib +from collections.abc import Callable +from typing import TYPE_CHECKING + +# yapf: disable +from vllm.distributed.ec_transfer.ec_connector.base import ( + ECConnectorBase, + ECConnectorRole, +) +from vllm.logger import init_logger + +# yapf: enable + +if TYPE_CHECKING: + from vllm.config import ECTransferConfig, VllmConfig + +logger = init_logger(__name__) + + +class ECConnectorFactory: + _registry: dict[str, Callable[[], type[ECConnectorBase]]] = {} + + @classmethod + def register_connector(cls, name: str, module_path: str, class_name: str) -> None: + """Register a connector with a lazy-loading module and class name.""" + if name in cls._registry: + raise ValueError(f"Connector '{name}' is already registered.") + + def loader() -> type[ECConnectorBase]: + module = importlib.import_module(module_path) + return getattr(module, class_name) + + cls._registry[name] = loader + + @classmethod + def create_connector( + cls, + config: "VllmConfig", + role: ECConnectorRole, + ) -> ECConnectorBase: + ec_transfer_config = config.ec_transfer_config + if ec_transfer_config is None: + raise ValueError("ec_transfer_config must be set to create a connector") + connector_cls = cls.get_connector_class(ec_transfer_config) + logger.info( + "Creating 
connector with name: %s and engine_id: %s", + connector_cls.__name__, + ec_transfer_config.engine_id, + ) + # Connector is explicitly separated into two roles. + # Scheduler connector: + # - Co-locate with scheduler process + # - Should only be used inside the Scheduler class + # Worker connector: + # - Co-locate with worker process + return connector_cls(config, role) + + @classmethod + def get_connector_class( + cls, ec_transfer_config: "ECTransferConfig" + ) -> type[ECConnectorBase]: + """Get the connector class by name.""" + connector_name = ec_transfer_config.ec_connector + if connector_name is None: + raise ValueError("EC connect must not be None") + elif connector_name in cls._registry: + connector_cls = cls._registry[connector_name]() + else: + connector_module_path = ec_transfer_config.ec_connector_module_path + if connector_module_path is None: + raise ValueError(f"Unsupported connector type: {connector_name}") + connector_module = importlib.import_module(connector_module_path) + connector_cls = getattr(connector_module, connector_name) + return connector_cls + + +# Register various connectors here. +# The registration should not be done in each individual file, as we want to +# only load the files corresponding to the current connector. 
+ +ECConnectorFactory.register_connector( + "ECSharedStorageConnector", + "vllm.distributed.ec_transfer.ec_connector.shared_storage_connector", + "ECSharedStorageConnector", +) diff --git a/distributed/ec_transfer/ec_connector/shared_storage_connector.py b/distributed/ec_transfer/ec_connector/shared_storage_connector.py new file mode 100644 index 0000000..c838814 --- /dev/null +++ b/distributed/ec_transfer/ec_connector/shared_storage_connector.py @@ -0,0 +1,201 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os +from dataclasses import dataclass +from typing import TYPE_CHECKING + +import safetensors + +from vllm.config import VllmConfig +from vllm.distributed.ec_transfer.ec_connector.base import ( + ECConnectorBase, + ECConnectorMetadata, + ECConnectorRole, +) +from vllm.logger import init_logger +from vllm.v1.core.sched.output import SchedulerOutput + +if TYPE_CHECKING: + from vllm.v1.request import Request + +logger = init_logger(__name__) + + +@dataclass +class MMMeta: + mm_hash: str + num_token: int + + @staticmethod + def make_meta(mm_hash, num_token) -> "MMMeta": + return MMMeta(mm_hash=mm_hash, num_token=num_token) + + +@dataclass +class ECSharedStorageConnectorMetadata(ECConnectorMetadata): + mm_datas: list[MMMeta] + + def __init__(self): + self.mm_datas = [] + + def add_mm_data(self, mm_data: MMMeta): + self.mm_datas.append(mm_data) + + +class ECSharedStorageConnector(ECConnectorBase): + # NOTE: This is Simple debug implementation of the EC connector. + # It save / load the EC cache to / from the disk. 
+
+    def __init__(self, vllm_config: "VllmConfig", role: ECConnectorRole):
+        super().__init__(vllm_config=vllm_config, role=role)
+        # mm_hash -> encoder-token count, queued until build_connector_meta()
+        self._mm_datas_need_loads: dict[str, int] = {}
+        transfer_config = vllm_config.ec_transfer_config
+        if transfer_config is None:
+            raise ValueError("ec_transfer_config must be set for ECConnectorBase")
+        # Root directory holding one sub-folder per mm_hash; "/tmp" by default.
+        self._storage_path = transfer_config.get_from_extra_config(
+            "shared_storage_path", "/tmp"
+        )
+        logger.debug(transfer_config)
+        logger.debug("Shared storage path is %s", self._storage_path)
+
+    def start_load_caches(self, encoder_cache, **kwargs) -> None:
+        """
+        Start loading the cache from the connector into vLLM's encoder cache.
+
+        Each metadata entry whose hash is not yet a key of ``encoder_cache`` is
+        read from its safetensors file, moved to CUDA and inserted in place.
+        It is called before `_gather_mm_embeddings` for the EC Connector.
+
+        Args:
+            encoder_cache (dict[str, torch.Tensor]): A dictionary mapping multimodal
+                data hashes (`mm_hash`) to encoder cache tensors.
+            kwargs (dict): Additional keyword arguments for the connector.
+        """
+        assert encoder_cache is not None
+        metadata = self._get_connector_metadata()
+        # Test for missing metadata BEFORE the isinstance assertion; in the
+        # opposite order the assertion fires first and this branch is dead.
+        if metadata is None:
+            logger.warning(
+                "In connector.start_load_caches, "
+                "but the connector metadata is None"
+            )
+            return
+        assert isinstance(metadata, ECSharedStorageConnectorMetadata)
+        for mm_data in metadata.mm_datas:
+            if mm_data.mm_hash in encoder_cache:
+                continue  # already cached locally
+            # Loading must not create directories in the shared storage.
+            filename = self._generate_filename_debug(
+                mm_data.mm_hash, create_folder=False
+            )
+            ec_cache = safetensors.torch.load_file(filename)["ec_cache"].cuda()
+            encoder_cache[mm_data.mm_hash] = ec_cache
+            logger.debug("Success load encoder cache for hash %s", mm_data.mm_hash)
+
+    def save_caches(self, encoder_cache, mm_hash, **kwargs) -> None:
+        """
+        Save the encoder cache to the connector.
+
+        The tensor stored under ``mm_hash`` is detached, copied to the CPU and
+        written to that hash's safetensors file. Non-producers do nothing.
+
+        Args:
+            encoder_cache (dict[str, torch.Tensor]): A dictionary mapping multimodal
+                data hashes (`mm_hash`) to encoder cache tensors.
+            mm_hash (str): The hash of the multimodal data whose cache is being saved.
+            kwargs (dict): Additional keyword arguments for the connector.
+        """
+        # Only producer instances write caches; every other role returns here.
+        if not self.is_producer:
+            return
+        filename = self._generate_filename_debug(mm_hash)
+        ec_cache = encoder_cache[mm_hash]
+        tensors = {"ec_cache": ec_cache.detach().cpu()}
+        safetensors.torch.save_file(tensors, filename)
+        logger.debug("Save cache successful for mm_hash %s", mm_hash)
+
+    def has_caches(
+        self,
+        request: "Request",
+    ) -> list[bool]:
+        """
+        Check whether a cache exists externally for each mm_data of the request.
+
+        Args:
+            request (Request): the request object.
+
+        Returns:
+            One bool per entry of ``request.mm_features``: True if it is cached.
+        """
+        return [
+            self._found_match_for_mm_data(feature.identifier)
+            for feature in request.mm_features
+        ]
+
+    def update_state_after_alloc(
+        self,
+        request: "Request",
+        index: int,
+    ) -> None:
+        """
+        Queue the mm item at ``index`` so the next metadata build includes it.
+        """
+        mm_hash = request.mm_features[index].identifier
+        num_encoder_token = request.get_num_encoder_tokens(index)
+        # Keyed by mm_hash: recording the same item again overwrites its entry.
+        self._mm_datas_need_loads[mm_hash] = num_encoder_token
+
+    def build_connector_meta(
+        self,
+        scheduler_output: SchedulerOutput,
+    ) -> ECConnectorMetadata:
+        """Build the connector metadata for this step.
+
+        ``scheduler_output`` is neither read nor modified. The queued loads are
+        moved into the returned metadata, which resets the connector state.
+
+        Args:
+            scheduler_output (SchedulerOutput): the scheduler output object.
+        """
+        meta = ECSharedStorageConnectorMetadata()
+        for mm_hash, num_encoder_token in self._mm_datas_need_loads.items():
+            meta.add_mm_data(MMMeta.make_meta(mm_hash, num_encoder_token))
+        self._mm_datas_need_loads.clear()
+        return meta
+
+    # ==============================
+    # Helper functions
+    # ==============================
+
+    def _found_match_for_mm_data(self, mm_hash) -> bool:
+        """Return True if a cache file exists for ``mm_hash`` (read-only probe)."""
+        filename = self._generate_filename_debug(mm_hash, create_folder=False)
+        return os.path.exists(filename)
+
+    def _generate_foldername_debug(
+        self,
+        mm_hash: str,
+        create_folder: bool = True,
+    ) -> str:
+        """
+        Return the folder in which the cache for this mm_hash lives.
+        If `create_folder` is True (default) the directory is created
+        recursively; an already existing directory is not an error.
+        """
+        foldername = os.path.join(self._storage_path, mm_hash)
+        if create_folder:
+            os.makedirs(foldername, exist_ok=True)
+        return foldername
+
+    def _generate_filename_debug(
+        self, mm_hash: str, create_folder: bool = True
+    ) -> str:
+        """
+        Return the safetensors path for ``mm_hash``; the parent directory
+        is created first unless ``create_folder`` is False (pure lookups).
+        """
+        foldername = self._generate_foldername_debug(mm_hash, create_folder)
+        return os.path.join(foldername, "encoder_cache.safetensors")
diff --git a/distributed/ec_transfer/ec_transfer_state.py b/distributed/ec_transfer/ec_transfer_state.py
new file mode 100644
index 0000000..ef3c978
--- /dev/null
+++ b/distributed/ec_transfer/ec_transfer_state.py
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import TYPE_CHECKING
+
+from vllm.distributed.ec_transfer.ec_connector.base import (
+    ECConnectorBase,
+    ECConnectorRole,
+)
+from vllm.distributed.ec_transfer.ec_connector.factory import ECConnectorFactory
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+
+_EC_CONNECTOR_AGENT: ECConnectorBase | None = None
+
+
+def get_ec_transfer() -> ECConnectorBase:
+    assert _EC_CONNECTOR_AGENT is not None, "disaggregated EC cache is not initialized"
+    return _EC_CONNECTOR_AGENT
+
+
+def has_ec_transfer() -> bool:
+    return _EC_CONNECTOR_AGENT is not None
+
+
+def ensure_ec_transfer_initialized(vllm_config: "VllmConfig") -> None:
+    """Create the module-level worker EC connector once, if configured.
+
+    Does nothing when ``ec_transfer_config`` is None, when the config is not
+    an EC transfer instance, or when the connector has already been created.
+    """
+    global _EC_CONNECTOR_AGENT
+
+    ec_transfer_config = vllm_config.ec_transfer_config
+    if ec_transfer_config is None:
+        return
+    if not ec_transfer_config.is_ec_transfer_instance:
+        return
+    if _EC_CONNECTOR_AGENT is None:
+        _EC_CONNECTOR_AGENT = ECConnectorFactory.create_connector(
+            config=vllm_config, role=ECConnectorRole.WORKER
+        )
diff --git a/distributed/eplb/__init__.py b/distributed/eplb/__init__.py
new file mode 100644
index 0000000..4cd51dd
--- /dev/null
+++ b/distributed/eplb/__init__.py
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Expert parallelism load balancer (EPLB).
+""" + +from .eplb_state import * +from .rebalance_algo import * diff --git a/distributed/eplb/__pycache__/__init__.cpython-312.pyc b/distributed/eplb/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..74f26a5cf060240d260dd5a8e3f74fbb95807397 GIT binary patch literal 291 zcmXw!u}Z{15QcY>pdoM;mZFgIa&WihDTsxTPULEvWplfni<9iK*~rCL@Bw@V3!lW= zCSYY{r<~PQ?&A3uGv5r%&p)rDQAWHlc~oz{e}v&*(GE7p9-fFGH$;Gl5MCnqqbN`x z7sP49k(m#rt5Bgc6v9L6jen{?{0NN?-6S%RO_s& xIwu7UzLYW5D96~$^h0`HYtf;+u;KTRrakoKD+Yk?y$F(@l)%wDIkBl<{{kAmQBD8= literal 0 HcmV?d00001 diff --git a/distributed/eplb/__pycache__/eplb_state.cpython-312.pyc b/distributed/eplb/__pycache__/eplb_state.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b39d3ba9bb933e44a7564f3e5fa1a28fc3a8e77 GIT binary patch literal 27954 zcmcJ2dvF^^dgtIp;xQyZ5+uMkNP-XW0g9BUHziRL^`InLvgF57c4&wgQlLP99)Pw* zgKO``TcIwhn2awGeNJlWBvqEm%2`{NyLB#i6*-r?xRQS`qU^!qxV7BMCb_Nq<0wms zvs-s{_xpM<07#BxJIS@>Mo+){>+Y|=S2zEvsL0B}g^j-+nxEsizork>6=VVr3v?WJ zg9~yaTu>L(hjk-5{2IdgDZ_|CM{&lmamqAeVqtx_V59(Xrf}hudBi+r8L_bVg0OYU zHe#FNM|k#K7%rMB9x0x(kJzUiBMug44wp=oj+C;nC0sV;9C5O+HC#SbF;aoBEy#ym zQ|=Kr3m1her>aJ(bex_OoI>@7=qI_68o{LJj&s4{x45AFL%kCFq4L9x)Uq@O(n{2{ zk-A{1U=~Ve3pp-WCe#Za!85L}M2Vmip@LxfTgH)wt9oBW;73M|vktyCEr@Z?bU+M* z!$LR|oAQLCfuQG7ARLH{38JTM@a*ZsKF^d87eiw)PaqQX#NvUt5c6BDO`hZ7Xe<^G zuUf5Lp7!?B(TUJlAnZ{pZExS}8HjiUrG@zAK-?3GQA=@8blelaEO@4(K_T24LyJne zSX`VLi_eIH-|F$4jC(Fe!$CCZi3zO8t3qhva$M;HY7&L%Fq(@CK~G>zL_a)JGvRn> zIxKjmFJFyOPf(50JbFtVd|C+3M1p}xJgJ)k%`|f zm`K0i@AO0?1Yc#IGXR+iT=iTI0Pfj&!eL(5!4k7jWeLdGkuM9 zvK6@zdaM@KeRUmVx~wdM(@|QZTCmEl8+C~4qTLKLEM9bGY(~WVsa?T-IEM8iycU?k z65Q)KDhSi(1>s89(>)kv%+uSqa}xm2`B+q$`7_Z$PdIQ@5Upxf6%^r61i)1ivDlOy z5Q^mX58J!go`M+jdrpqig2YPnZ11&-+0~xe1f3f?^=c>_R^U%v>p%%lzo&Qi?#)n$ zVWnE*(Rd*2$9@iaUcD?t0RHK4^eWpWuL`Z%ictEA9j6^dlV!~gSnV@g2w&hK%~O?F?b2BhkosXd+#FHiI0MZ=WH}PYCf* zVS03e6)mQ(2nvjjMQ0-MbXh1eioTE1E>+?KF*II&p3r6(E#^0j1RJgLQS6NgLByxy 
zOz1VjflrMOGRztKIUNQ_C=rDWRvyjFCnB*!Gg~hcIht7`qoK%jI4~xR5g=4&$OIn# zHw12Q0*3>T8__?5qi@6@7$?~gs%BPxg@T}dKY|wSvFV1dNH=~(y6G#@3o_}2=u07m zaljP>%@n$7@mbTwgVW(lY{nRfXN%~_Qco;@+NG|J(Rh@=Ob5g0RA4&oQa=!%%|i+0 zYC`6S$xN(60Xyml1V^W_iqfU|F|URq!RV`LD?8OjBbfcPiG_!Kmb7`4#f*-ot)rue zoe2vF^P{6L&jiAh7hi>=qrvDH0;YIW9J`z@xB$!^71Q?7(Lf}E4M?X;4Cw|OlwvW4 ze~WtvpZn$U?)cPn_jG8wD;^NB9J^kd*|lf)m0d!2ICQCd`fB`gG}8S_I6T!I454i3 z5*;Yr*us~XH_A?Q|MXR{47D+D2Ldqx-uJkl8eMC(jo*La-4|AjzIA=E$-G|2)iq@! zDiGo4Y8t-3|K0s7#@2Ox5zFDKxe;CruNWKG^(I8{TxH$&`FHshqjz1;Bch5casKL4 zDO>e@W3}j2+ug8Six5lh;Q}b1+;XMG8q^EMpg}NA@=BI`f7}=}g32ir%t4c22^Qf0 zLcuCnhzbG)WPZz-rJ!s$S^8QbdHtSK!c`)MvqUf`K00fNP`FGqlgE>`t28;V4?Bkn6q;&GMdGv= zi6fxzhY@%;xP&gjjUuGDgdSfRPJ-h?9KJFf1flaF<;~WW`36g%xXia{)3?e=aaTE? zepsaHe1&2gencO)PeeW zB@o8ZY>Wxv@mz;uG(bR8Y^L4;3A3|ybhb1*Q+_q>H;^Ot47@kFRK0)Rc;8%;s;HSa zUbm%o?V-;~ATkRNMApSpfyNB@H0M6e_$=kRw^8O6HFY=2erE1w7%p8JnVA|@$a(b) zPnWYel~l_{7l;C=fXG;O#?Ui@KAY`V5V*m`vvV8I&SQf6FcYi(&gm2SymXx!f0TRM zFs7g2#`G_OSDZ5>3<+))EiCGq^1HN>W)ks!fA?WqbkkRLz`Q3VH3KR}HdsgG$B zW3YTiu@{psZiknyn+Qii66T1Pjl)grLNVIVRE@F$Ixo?5u}uKBYV^s)XLsf{$u}wY zZ_#Az9K1KVM>Sl#|F)3aacsqZe8qPnX>LiGTUI-JlD)_8cb<4?(05pW)Olk0<@tf6 zqj80AOm?1lK#>nPy~T>$>jj*{pEP&<;)_DWveEoLzq251i3qO}86|`_Y!C-pWgFx^ zwoU#9NpO?-8wuISeVExQLEV+39JinopNeM@UL2c3aoS}kEoyr1S0yXddevVox|%kY z7V(nMX~kaI1(*g=f~dYt4ka{VS&369K``0mP(mY?mFPu@(oNa%i4q#ItVAVBI5kUX z#8L_Iy|_!wAJk9MIljrotJR2vUWFlA(IE8@t1))OYK#xD8e>4LMnCg8mPw$gqk_benM;AjP)R>;-}wv*jI%8rbNGgCdRbZNNZ4lU9zm&&OrEN3wfXYz@l;E4JoUr~7xD zOF_A=TdM1p>vl?YJ6GyCsJi@ zxy&z>`S1AT-g8p#xqD^jSX`G>)^(>+?s-b;dFo!-Q!K7iD(k#sk++?Zww<|Gc80~Z zOJ(ghL$d#*=IebpyT4%(`7Pyg5>gJhv(OxNkFVQmq!h$ zg8aFHxJ`}I#C!$ll?Lv+t;^O>7}U?1wOdJ;uV6vPTti&Ypq=AAE@(`c-{XEa%PfCa zmjFXOi%!j1^7^KQX*Exa1eoFA<9nSUVdt!Qb!*^7^i~6lkAs=ff<`VS`*LAm7?84Tg%Or#gC}5>KNyoiIT}dH8%H~O)pQBC!EC8 z2hEJ~t__;zobh97&dHN%DEAxW6XoyegGCAFdwSsh%(;gX?v$Dbf`ZaSS;Cnp|1fjb zy#RbLS25?Bb0=I{@C_C)Oj((z1Rm0Wzmm>QBnHJ*4N{y-E+^@O4E+wBglfSfEa*&hM78S 
z5L9{s;Ry(_;+LoVR<)}BU-BrdR@Q$@mo_mr=9sTnJWlPml1G~&$I^&&{-+D#O#P#D z+&DfH4vU8=-ZU$~+5u9bXk*l#h4;(m7zTr=j3$B8fyqH$|6b z#+DCrb5`5T)0}4Xl9*Ap%t#F#gG@|aWoa{u#zIb4lc4M|!%-5|PHa|!R#Hhdaib!% zlt!7YlnpmVz*M*R)?Y>ZXn|pZBDI?>%W=rr$iGDuu|L9Ae3M(R;4DS5xkWO!EYI9G zcm0enm-)jIe>kObye!ZmIXWJi3`NE32F}v_i07KxKInYEGsU;1%3L@0E$(|{GBiPs zSifG(H8dy9wJEz(wzo_6_N4#FJ^PV$&S0^oioMIv$jyhO=0i#Ez&fWptm{g39Zq&0 zSvTm426Yb&$dp88GuPJt@yk-{p`_Wn+SvL*`}^&wqEfl2MJj4ZwjE9KgDJjPZMe|L z8h&Ku8d^TEy>H9q%yw?I)ODkMvHek@u>pOoMfXcx3p28-M{@OiUeY^1kgDpr(;!vt zm_L8baxkm`VBG6g zuB}sU-6yr~lUt8Vt;glo^HS@1QnPqhp_#t;?!`5SQ+9Mpj?TP+LNwAYIoj6?jYY+n zhB9}4)pm0os;d(BoW*?g<6rgy$AyLu&8?>s~>&2m|XRMxSvH$KVXTQ?cD6@S4Q zoW)psShp->=A11`f!i}5y?*QUdu4l*{N7~Q-nD9vT-_&C_syS9Ro2Os+oZ~E^Fyod zmKAr~{K=H7{>I$m+&gc~pIEcImVA=EY5C=Q_O?__?Sf_1Q6W2;BuCS_UhisHbJZ_g zS*~3^xx9bfq%ZGzWH6SLKC0kK8dDDZt4TSkQjW@$!~M`!P*VKB0+6K~=sy1Wus9m~ z<%Yvj!{NKVQp2$%UxxuEeTQVvqN_FNY0)+jgqr3xqC=*4kaC@QWb8j+K$7zhn%saILWtSvRe=9h-q+e{_gps_xbjp?K#fMQx&YPx)kr0 z`8o-M=1U&h&3mx`WOJotu3YMrtF}v3+gGak?wNNg3O;ivHX59nn!1{{A~Y>V$3tNu zZ2=l(hgEEh2o2TI$qDj?dw2+RG^9*=u?#ephNLN|yPcJT>$t0s*ob?OGb?jaE95vW z>61!}L2^+8I^CEsYDuG>q|eZC*Xtm)0H-Ae(XK7mo*Giym@~yyqYX}t(~`?Tx<$t} za|zQoAB)Nt9);HCMVy*102XtF7+c{anXqhfh}DoJ1&zd&Bnl?!=-uShz6VVz<*lWs z2%2sesPy|}R%el`O05M_IJ2fK#F$r}IJwv8U` z%=JVKU3md>SqO=6zT|~k(vUr6TgpnX9y42WO87R!e*m$;7@KWZu8x?0FFERF8h`}reTFFRsX5umoGN+=rDWDEc{UwuNg5$;} zFU^pl_r}p?Pf@MGB9T?2@PTYrvL>K8Ymp`onQO&1bm9UUSLZ7LOAlBAdtx}lG0GWC zvvRdS;h;D6r_1wF;FSqjm68SiOM3wz6n+K5feB?*sAZWw^OmB-Lv1Lk!k*o~$TX{6 zG-C!Fj6mv@oyX278)gdofOR;Qh`1-tkUqCo`P>=3u9Ke}r`rId+UQS&XkXZON`e#+i$U z&>mxc6}iSJtX*SQBV#!uTP9!-FCkmnFdLdq7s7gmi5JGGq`=}aU1`%e00+(Oln{uB z0ZL<16cdC5V^<(di83o1BSBOgr&2~-z=$e>3_5!uUVubJU_7`;XpIR!n2f_`R|E!c zQ0$xJeFt9J!5DQ06Gaq}F3(6IXbPEyup%*-p&~?Oq?uM)nRO;0px&fQa=;u_WIpMV z95~ge;!H6m2U!1HDcBrmp`YgSS4WH_VCqiG{3^V-k&xIdGo7X_A^7S0H-Khi6QHHu z^LBFe5a`d9^WMePr`0{4i*(o_YQ|>m|?U&sCn=eDfd}sF$`+v~??+)HQ z4Bo7~;>!ZAqH^*4($fp)-?{jc@~XvW-g$1_#+7)%=QZzEI()Yz>D|BLJtBLLOWxz3 
z4kW$DSG>bXM?=cHO9q?g-FH`)^zK{n9+tf)CGSbudq(n}Nj?pp`^<{>0x~`-;JSBw zbmi8SR5|$mJxS+YG~QkMcct#dmUr5qsV=F+tj+O=M(okye(Yr7INB`37&7phs zJ3k#;;MdBkZ}czrzjJUouu|5#>a4nPYVp*%URSwq&0QnA6@@s4si@T(Q*Qk8{)63I zUa{T>usj&jaSnG%YaZ5|UfJ0P21|DClbrijoc)-Gk_OqaU2<%{b5!1OR>D8WIrNWW zH_n&^GY0dJyT`ZP9DoSnGe|4!WwO0PvUl8!%iX7>?o;>dr&8V97L4~DoygAQ zH22IcHyb2#$7e9`ve{*u4=nM`3v$;5sq4Z$+l3TNCW*Vstf$ZT8bFDEMyD`McHuQ) zjD&Ns%fR~Jm&&&4@nd5dT?5v0o_?O^H`;lrG=UaxG-pbfp2Q!`;j$kj`MB>uR4CEx z;^}xseh$vc%v#>yJ&6;7#w5!LX>mYmPv;8bSq-`t??U={Q6Dsdem1FG4`~4w^l#pn zfF?enk418}YFd0w&e{re7)cm6xeqh6)r2l@v?R{kC3HodBIg~c3PZD!kRN! zRYyPR*<2F3D*bJ<%5`C+nth&Z+FDMnlktSVjTyA%!Nt2ZKWc8#h_!uf9j$gw4T>1L z9}T`_a+_xBS~)d12h2rgN5B?=pX}ROSI~lXcW#ZJB;oTW7tC>ZWbL%cY4=}UHo3+}H;VpAdxM`G$U#I6VgK;z+<@b@Ode5Fa=+k*GrL{~3Ci_%iEq@>BquCtK89&V z;-s=-?wjvR)CJ0d9JpxMFEd$z4L#IcrE~Uw0u-U~+=xA{_hDrEm^ooNPrn(hJ?xiA zpy4M>&_XYS1&QhQ^=`a)}NV8hPjGx74`YZ?KrzH`uq>H~b5&Y{(;n zxwRd?sJ09ZdkO3&G_5A`wDFpIAyJ{dt~B-aCR_qC=A>Pq9(31FY45Sg$KpF!K|D5% z!Mp=G0_xauzmI80}eJ+BnsbD>~eC@`2R%>SLN+q4M~2mgi*t#poYs5Rf$4oBW6&>205V4 z=M1>;X;oR?r`6lhd9oqM@y19hN7n5SkTBU(<|aS!?YPIUme!OKW~5wNGN>mC+;YMf zwfap0HP6=*pl!XN6oHh7$<|=3|BAq_TrtTa>|!!K5`uVjLJ=mC9FvI>H}+=}3#K)? 
zOcn;P45yZblsY7cIR)IWkB7##+(l>`96amW&<&EVZtR5y-{)x~xy?m(BigTAh}C(yLUxX3P~%yB@n~KZ<=&KAV|+qptI=H#vaf>PNP9~fm3<8M5#d) zQ%(g?jAXw-F))r>30pwWrnH4OHSkEn7%YgWJ(8K?5f!>Lk$0DZ8Y2miYJ=`+gJF7{ z?xuC3bMZ~%q?{15F>CcLt$wWmD$>5$N_AHdFcgUuES_S1zkk-!mRa7uG{kLV(=*~z z*j%^tX_NBSfrb5y#>6ouGsF#vw~$UoKDY^od+CTLiIe!N@WwI}CW4Bd6Eq+9kXWu$ zoQ_%5zX%QJoGyVk2tYR_VFJYUe&>7i{z5SViSB^gVX!B!LS8JDFlcZ zk&^1QHp~7A0ry0v#RIb~IlGdqpwYjdy%G^S;Lqk8LGc(#^aTv8f-(QB&69L?)~h7pN4GN+NMv zis+PV_|mq#8~tg^B@xSg3>z?Q!Q?6CSc-`CIV#pg9jh}E+#KLd!K)9%~mJbTGo6WNxtPrWsPLOgQq1{%iT-POv&<& zq_gvn27h?!2d6$YJu>L)N*An;Ah3i%OM~Qq*pke?QqH*UWZaE;e9Qd?>#GIXv zI6ZC(nVGmbC!v;^gnG5B_oIuqE~cDr*||+}ZcCA*v|Vzx%g#NLb5Gvq0m*qFRn;z6 z^-ER#a@7&3>PV`(L#{p`RUeS62c_!4N45fgX%ePI#au~;BJobSt5})CQq^I(>a%HGkyqj37>6B}FrJ7#3WwBh{RNl~ZLEtejB( z%DY!m_1;Hzv!`O=1kA05Rz>5%RTpO-c)Vf z{L$;f*fug>FY)!T-72evwqT)S>BVKsQUbPHC0(%HfHK-=*stFqAPb~bQ-jb zm34bcsyT&aQ&~k*-29nyKNJb1;mOsPa&O1h$FeNoTCh->k4eqPU~1M3qp`*{XG_Y} zD!UF!u7k2`P;w34y(GC#QhT@~XfOZ?=qZS00Y%N_TbE`3faD*L{ezN!Fy-uCb62tr zct~;|lHJE7H`VMON>z0}Fz70~)=fsVYvrnIao@1}7);a5-PrCOFoYwe2|-A99FQCb zP!;-!))Z~ZPRX%TcI=lN`;-08e(rc~J*LA{Ja|RN@$OVvZL-Xlg!x(-tkE{%Ojot+ zYLi@TE3WqW6B(SDa_sz3L&sv#oif0FX|LpHhD94J)vTPu$B&CmNa89{oPMC%$O%$KqzR$%0HlSDpl za@jHHpzy$p6?29>i zcA=&K25siNv1p0L3lfF6s+JL}Lij+G&nBlWsLh*AOWtL73vz3Stf6ekv;nmituyAd z)={D$Vg6_L$eK5zuM2gncF#9KU9YK_$_lx$0u7outRH5W873}gLgj>E9518z%(LgR z%-GYym=J=&3yBmIDp0XQA$E~ou450fG5&m9j83s^Q=rxH$|ez_IYw0Bp_Tz;#(uHu zD&qV2{5iq?mQMUV{P-LSQ=wQn2vgJXt0Hj(;z#7&A&;m>c6WYCxhL`)6v-F?5{EH* zU!=XxtSiJnLda)h#Ig7%6iWniy6Lg^r#M}{qm#p)>SeJCQDt}+$5QsmWS?KmUo_zcAiLHcqVCXx^I3qRbGE%ba8a0yag0M zohNCoPWiVb%^j;H6$`U+N#p#$DxNWWeew0>p4)>Tow{}EuIaw(7~yE>$d!YaY>;o7 zKQ_y-Xx=@PeD*omFyH5&Uvs!`*ca_9js`mVhjjD9$(rtD&q*9fxH(Y$es!v(KH0D% z>F8UnsJ?M!@d_Dpzp?lRkbEO(sXCByMXsdFcgHRF9FuyEea0VOZR`5TcFV>J?nyfK zZYk(P!DcE5>LJ%TGJLM6lPfx;ijGuOHLR_R+8%MrQBvfQ`4$OeL+hUV{N5DbE%Q4i zerIym+0XfN>)l*w)A}~f(F7BTr2Tl(d|Wwrvgii$2Yx|0cy=H*|KQQmz~hiXcExES zj}3>6J}r 
z>~0n9*w-n8xrW^WVIJlZg(__DsFZn3AHri%FQS)ZLy<7QqrYBggrx$$jIi~#j2p5K zN}2JXRKf6L(H&46wITi&@;2c%GM-luoB^UT@z3#@w!Rt_fpXZ*wN^_1l)N$WeuX>| zKru{+M~o73``ls$MriRReMe;93CVY2#W%Flj0-M}s}+s! zMC6KYsiJ${#_m^nB;JEJn1HY>13KWsWbGl4oAtf8UR1Y3uG=fs?ftk%s_RG68sBoW zTH<#kcMVDW5Qun-TS2n-&3i!c7j3(vllWel-!1XP3-J4r2cDPs=TmkEi1eZo(CE0F zp{I~m=9JwBLcFM*>UDx^8q^KxKJB=F^w|~HbJEdg?_6B6EnmdjBgv!B&htsvb8CD( za2GCWfD3k5*J;qd_`<-spV!3*bNso{dbHe_wkei#Og6RQ)hHq{=wN;Lw*dLC>GHAI zwJdpwH=^1w^WGf!a}7_3)2__D8Kq(ht@{-qxG<~ablf%Fc`i;1m^nOFiI}5Vn+bI< z#R~B{he@sTySiM7dEK}!Sn!tlv84)gr6|wa`piRbzzpo&DO#JG9G-o1V3!(z-JWQ- zM5Bg%PG&41w&?_5PHj_b^tcy75N3|BKt41;(8J~mmRAPY1++7)QJ{r~QLYgGnH8v$ zDC~Rspe1Peo?dx^QiU6cxLhubWv=jgC-hhaS*)W6*0JW@NFE4>yA-*l2ajB5g#Q^c z%gsa?ja&RPc-qqK!$`?Ww;^99>9*F59u-!Qd}CE*B;S=e$@kBxonatj8Ktzm-gMNULIKJxzV@S_s*`Rz;ErpRh^N0*VgN1 zU4V)w1BC1~>)xaO4dXVV6HKaq34!0zWk=qU898D%yC6q<=9*qP(Iz)1<6Fs*2bVU$ zMb9(>2IZ(=z<~wx7;-}eLEsTTC+{A-%p*;2D&o#>4ki)#`4w(_yd>i+=rvHy)c{|M^aOs`hG zzGAChdikEsv&vU2fKFcg_U#uw{1v(Dgw%E7GyY_%tPa%ds=ZXA^|n&lci-NZG@k{9 zxdPwbhm@64Kp5VA`|vtF`)GX<@a#99F#E1uUp}doTBr6&PCG>~(8qNqEjfHMNDCWmi`+_Y<(Zl;_&n z^#HnqL6l9Nit}hxh(%iQbi#DzrMiurF{=yeGl8!2=ULL{(5|l_@16j$sLDnL@LrqvYlzvfv^U-}aMQkj8hLR0+{$6$6sJ61+0!F=dSuUT$+P=& z4~*AwyR=H?ao@TDX6bU{5vlQr+;~!IJh{?%>NEZ{D3*qOYh*CpM7ME=SM1x9=IxAa z@)?GQhkb@)$By}o$1WgPBa*^KByo#aOI{s$M2sj@ua81K@X|a1H8ch#(&cDS{7Z_b z6{NfxLDar@oV-HvJn+&+(j$xCW_!|X`JWku#gZP=cCCWlz zWzb@pSJrt2IkZv&dDrlNY!JBME$)F{kH@^tb*4Qxahn-GcTIo6f5Jbc;QApQR{&XT ziOHX;sb4qXcdfp0-GpE4C~N6L)3w?!3i0*Opypd|X*884w;y@P;dlM4j@!N~wWmMj z?|o1-t~XiN?Dgv=3ZT5JaUH@h1kD_DNT=UDeRGFY)4h&Y#t>)ayuJ_izrX*ES8Ce1 zZewvg=c-=XzHC_9y%<@JgB^az>Ri8I;;QOX)t*#U{R7LW&Q!ZrTCr}T09q-pTrXrn zT+(*0vt(3mEq-WYvBwO!&W?qW>7?xqy~(+Tj3x@8cvancAq$#u(R{g?UMG-hyV0xK zo$D5sVBOrSbyf)k;#T|HHP>pEjFP41VG+xFxPZZ_(R4z$WLgR?_t6lgx@~v7cV>Q= z_(4MIJ({dLD!B*00QgSe<$`SX^%Izil68Hv34)H!bpw9aEX64^KmW|zFI;;;GFL5K zkjxF55$6}0=AXY-yyTS3wa`FV>K_=4)`BlgsDoNiYF!`JRhsH=?zm(6A^!tj>gZqR z@O{^Ncjhk=pCqKi7ar30`cAWH$Fg%-_#pg#SZdh*h{LzC!f0Kk_0toIigpLSHW2Aq 
z{UY? zpl2^IQ zy05#9-1TR1L>O@hn1&gVn0pcQEoe@V}zuUF6v) z*&=|?#^t!U0E*{tDEbt6lq+p#e?9|OB101ql8>f&diGN>5C{eFdW&M8rD)~txj(Ei z9u7=E6(0p{B`S%ez|W>D1de)`DLCvuP^08E5x)N!0*sOY$Raim^j}N-Fhuo*}m?D?kQ<>DJM{)a>V>01Nj)<;uhE&J28LY z(il5YU`?X_g%rHAN?S;wK1&qyt2|Q^W%7v@N!#cTOZb&1IF&Pq&aiYni_R+cr!?(O_pwE*%gvOu}s5VKE)ox4}lO2Xz<5R zV(hPEq>FQZIfcE@&MxP#O2KjyqY<{JK*aY}C0^W@pmXDVc%a@r6@tA+~g6;k8Box(VN?tibZ95Fyt0{|6_f8ms^S literal 0 HcmV?d00001 diff --git a/distributed/eplb/__pycache__/rebalance_algo.cpython-312.pyc b/distributed/eplb/__pycache__/rebalance_algo.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1426b43a1af29b81b146fe5962d4d9a3327a37c4 GIT binary patch literal 11139 zcmd5?Yit`=cAg=Jui;ao-j?LCCCj2MOS0p~+Ho8^u;Q2ecD$}1Q7CaFQQ|8zBik~D zS!Opyq$GgoHmzv1KolTA6xcu(|VZ}}b)IbBY`BNyBf$a82+jH(b zM9Gxn6uUq#z?nPuo_o%Hp7Wh^`Hx;NOF@b;A4Ywx6!ow8VI;qvS$;xO)HD^Lu2K;? zVu{gLE#z&zYK>TV>xk26u=0*Dye(qm?d5CBmkwI$fx)0XVtR6Yv+E09t%G={5*eP*`A0OWV_at?faw3_JeFT5*VVdy2mY{itrIwX13>ciWej< z6&Av=7$1v@aW0k&N4WlQESwnN1+Km4T<>ckZZIiuH+y?ece3oI;i$;Plaa9)&qd>@ z7$4^olE_KJJU5UOcuwHMf)Gv&k#a5^8%hdMX*f<=u3O6i&u}E1lK2QWC?w-UX!~kom5b1^}0>gLvTj5 z-{PY~!;;ek<6wF|s7CL{(dE60Swqhp2NvWKCl_x};lmWWlV zGD^zL;RCxdgK5_vm>(-`FeJ+sW7dySH)PmZ?ucWEhMfhwBkhUMM*f3bz2u!DGTd5rFYH-*cAmrC0bGb1ZvCnvFIq?*}e(-h7h**@rSF{i86K`kwa5F4--6 z2Q3G&MyHV3l``2Q`)*rq6QdzJv~uKf2X@|ob--+A&_pbyH6IN#)JN|yRLP#>pz)d{ zyNvOSw5+LuPnj`lSM6P{3?*w&{s`}Y5AbRXXpi}rV=#--t8qU&ZX2BtGAN!l64YdR>IbjwXgW55|%M;h1=+(#1(f_=L>u@kZykNtQ3s<`t)u*Ss8Y3w!(kYl8!AgtJIGrksTQwFQhwZNAwFS^Rs82p_OVv}l z)l`>uBKn{gENoBME+Xc=EBhEf{iSWNfR3sjLZY8d)kegr3>FG#(S%BPh?s+8SDFqV z7SO7xfwgNS;QqrZ@jqZ^{ysx3HB-&4pIpCpU11tBC$ntPU7d3&?oBi2AG_P~Ok3XF zR;=M>PAD}yGH0K7gHtyqZ)9G7;;6}OdhFP+Sluvv{qFUmr*i7lr_NuXw$z03Z&XCbgz32ZI|S~Dl0VP(~H!`+75pX5%?UYM`w&Yb+V zs%HAw-DA0{vkm#i-E;lFv3%qF+Bsi!B6E7Nrfz!Z?ojT|?16mq-iMaoZ2qSG>-PDY zQ<<~C$rsEX_^r2jf-cr=$d2X4mD+abKuvA-Vs8K4z6th8^~M?c;~Nt_-v(`h0$p`lo+qO-d&TY!Q zk!x2xEyea-bGw!HpG}<29m$R5jw_xmMNeI>s^Dp!3C$jv9h*I_Y~G`I_CP_=Q1J 
z`%(A_VDk=Wxilj)070*t=DIxaEQNIC;3i!HC{YcpNwV#M#ehR2s0r328Q_!&BkU!n z+_Q!dJ^_Bch%_DG0{B}2%H_Z)fVmgwyTQ#_!mZpqSYCh^t`Q`Fg(jdy;BSl{0NOFA zWo1fs-hsI)2W=w^z*XHUjhHZ0<4btE0Y-uU9kKtZMP@bV1pp9$nC6?yj`DRT#Imk} zSSJ`twgWVC4q7i#AxHR25KS!V1#r%y17V5h-XdBxIrJ8Q6!bd@s3C>0P2U2~kr1xJ zH)yBhU~l7N8jK*`sSaHXdUPH^yGP?-C?YQkS5Utvhiv!tTKuI-w~j=!WXWeIf9Xu|y=C zkTfIffN8;bR2dwnYKJTg5RE&cf`rh8Oe)Mqh!WA)n%?Bbi&pstvgzrD{v&bkSXxJM!4wlxLdq?xvz)7#DrPsn}#J zx9`E0FSmcO{o&xeuV=y6tN3~!ozI^;U+`VX99wh;re2x1eqH+-?-;pKVv z(ah0BZ{^h0$*VcKq_wvv^SZ%v=Vv{ipZfIFL;Jk%Rp8>S%nSXtXNaa%;3Gg zg~pvqJ7PL`7L|%jeC{qeG|+F z=d<+0rAfBv^iAB#*M;)_P{FxvHmo?i=AGNn^lSke=Lu$y<^T(Dd4>}Zy{c8Z86kVr zoIMCHP+GVnk&N)7>O`uc)R;J6YEq!UT|}(S2t+8s-kb^%)vyKd($)izjkT=MU1bwjK zPhsJRO*;n$W8Js~Dy?{u*F8OAFA)`>H8wQeGYDao9$Xd3SZ&84p#2PDFboq>mc*qm zWuRFmPIEO$=aC+1iHA$~(b8el%dJFu8b@Dc{8O+Lu0o$nm# z?BLE^?u{l!Lx`IO!y>>MlgQBGrLvBldjlX@d`y(Meuxr5E+$1GI*tH_fH%^Mx7LS4 z+Kfp_b86$^6xU9?{LV{!0(|w*N=ym6d9B>K7%=JCg)vqPB$!11ClLlC9U^paS5{Wv zs2fO*C8VF;d!xC|TH{Ob7I^u*5EsyFu8JWc3`75Y*#bIsnv!(Q3Yr(T_Sz&<_YKVp zf!&c(8?2Lp?kMF^&Mzr{9bg7|B~xg*P&VlIGJaTzM`zqGnq0unP$Olmk1-NMPtlZ2 zgGXUKqk&SKmynAwEX}}a!EkUOmV#Q!M0k9#9a1!#WHJZ37 z;!JX!1`z?J!jTgBJ0-y|eYN_Aqr4D?pcE>GmEf1&APY>y8}P_b0N@sU&5Ex%Cl!3H z6V{^BJ>{A76g}ReC$Q+LnChME%~gF;f3JRK=U+8Gc;m}!UtIgD>YEK;ZRdm#DuU=e0z-N0j!%4UdP-9v3)2E+C)&9n^FE2LCT>f4ZFXN@(}z9 z;D1uWaeQorhZ@8)yJnZGEYW3sB!X8O!$ggt2H~(8hnYk+0X_ zuuD#Cm4XN4NEUJ~lGB-DA(4!Q$W?d^+KG69f+}s@q;0